summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /net
Initial import
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/Kconfig61
-rw-r--r--net/6lowpan/Makefile12
-rw-r--r--net/6lowpan/iphc.c628
-rw-r--r--net/6lowpan/nhc.c241
-rw-r--r--net/6lowpan/nhc.h146
-rw-r--r--net/6lowpan/nhc_dest.c28
-rw-r--r--net/6lowpan/nhc_fragment.c27
-rw-r--r--net/6lowpan/nhc_hop.c27
-rw-r--r--net/6lowpan/nhc_ipv6.c27
-rw-r--r--net/6lowpan/nhc_mobility.c27
-rw-r--r--net/6lowpan/nhc_routing.c27
-rw-r--r--net/6lowpan/nhc_udp.c157
-rw-r--r--net/802/Kconfig10
-rw-r--r--net/802/Makefile14
-rw-r--r--net/802/fc.c110
-rw-r--r--net/802/fddi.c189
-rw-r--r--net/802/garp.c638
-rw-r--r--net/802/hippi.c207
-rw-r--r--net/802/mrp.c926
-rw-r--r--net/802/p8022.c66
-rw-r--r--net/802/p8023.c64
-rw-r--r--net/802/psnap.c167
-rw-r--r--net/802/stp.c104
-rw-r--r--net/8021q/Kconfig40
-rw-r--r--net/8021q/Makefile11
-rw-r--r--net/8021q/vlan.c704
-rw-r--r--net/8021q/vlan.h173
-rw-r--r--net/8021q/vlan_core.c362
-rw-r--r--net/8021q/vlan_dev.c804
-rw-r--r--net/8021q/vlan_gvrp.c70
-rw-r--r--net/8021q/vlan_mvrp.c76
-rw-r--r--net/8021q/vlan_netlink.c273
-rw-r--r--net/8021q/vlanproc.c326
-rw-r--r--net/8021q/vlanproc.h20
-rw-r--r--net/9p/Kconfig36
-rw-r--r--net/9p/Makefile18
-rw-r--r--net/9p/client.c2263
-rw-r--r--net/9p/error.c247
-rw-r--r--net/9p/mod.c201
-rw-r--r--net/9p/protocol.c628
-rw-r--r--net/9p/protocol.h34
-rw-r--r--net/9p/trans_common.c29
-rw-r--r--net/9p/trans_common.h15
-rw-r--r--net/9p/trans_fd.c1117
-rw-r--r--net/9p/trans_rdma.c801
-rw-r--r--net/9p/trans_virtio.c776
-rw-r--r--net/9p/util.c141
-rw-r--r--net/Kconfig379
-rw-r--r--net/Makefile76
-rw-r--r--net/appletalk/Makefile9
-rw-r--r--net/appletalk/aarp.c1065
-rw-r--r--net/appletalk/atalk_proc.c303
-rw-r--r--net/appletalk/ddp.c1965
-rw-r--r--net/appletalk/dev.c45
-rw-r--r--net/appletalk/sysctl_net_atalk.c55
-rw-r--r--net/atm/Kconfig73
-rw-r--r--net/atm/Makefile15
-rw-r--r--net/atm/addr.c161
-rw-r--r--net/atm/addr.h20
-rw-r--r--net/atm/atm_misc.c101
-rw-r--r--net/atm/atm_sysfs.c190
-rw-r--r--net/atm/br2684.c886
-rw-r--r--net/atm/clip.c939
-rw-r--r--net/atm/common.c901
-rw-r--r--net/atm/common.h55
-rw-r--r--net/atm/ioctl.c369
-rw-r--r--net/atm/lec.c2285
-rw-r--r--net/atm/lec.h154
-rw-r--r--net/atm/lec_arpc.h96
-rw-r--r--net/atm/mpc.c1534
-rw-r--r--net/atm/mpc.h64
-rw-r--r--net/atm/mpoa_caches.c569
-rw-r--r--net/atm/mpoa_caches.h96
-rw-r--r--net/atm/mpoa_proc.c312
-rw-r--r--net/atm/pppoatm.c500
-rw-r--r--net/atm/proc.c497
-rw-r--r--net/atm/protocols.h13
-rw-r--r--net/atm/pvc.c162
-rw-r--r--net/atm/raw.c85
-rw-r--r--net/atm/resources.c463
-rw-r--r--net/atm/resources.h47
-rw-r--r--net/atm/signaling.c244
-rw-r--r--net/atm/signaling.h30
-rw-r--r--net/atm/svc.c690
-rw-r--r--net/ax25/Kconfig121
-rw-r--r--net/ax25/Makefile11
-rw-r--r--net/ax25/TODO20
-rw-r--r--net/ax25/af_ax25.c2022
-rw-r--r--net/ax25/ax25_addr.c307
-rw-r--r--net/ax25/ax25_dev.c198
-rw-r--r--net/ax25/ax25_ds_in.c302
-rw-r--r--net/ax25/ax25_ds_subr.c208
-rw-r--r--net/ax25/ax25_ds_timer.c236
-rw-r--r--net/ax25/ax25_iface.c217
-rw-r--r--net/ax25/ax25_in.c455
-rw-r--r--net/ax25/ax25_ip.c238
-rw-r--r--net/ax25/ax25_out.c398
-rw-r--r--net/ax25/ax25_route.c505
-rw-r--r--net/ax25/ax25_std_in.c446
-rw-r--r--net/ax25/ax25_std_subr.c86
-rw-r--r--net/ax25/ax25_std_timer.c175
-rw-r--r--net/ax25/ax25_subr.c289
-rw-r--r--net/ax25/ax25_timer.c226
-rw-r--r--net/ax25/ax25_uid.c222
-rw-r--r--net/ax25/sysctl_net_ax25.c184
-rw-r--r--net/batman-adv/Kconfig70
-rw-r--r--net/batman-adv/Makefile39
-rw-r--r--net/batman-adv/bat_algo.h23
-rw-r--r--net/batman-adv/bat_iv_ogm.c1979
-rw-r--r--net/batman-adv/bitarray.c92
-rw-r--r--net/batman-adv/bitarray.h51
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c1722
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h108
-rw-r--r--net/batman-adv/debugfs.c561
-rw-r--r--net/batman-adv/debugfs.h30
-rw-r--r--net/batman-adv/distributed-arp-table.c1204
-rw-r--r--net/batman-adv/distributed-arp-table.h171
-rw-r--r--net/batman-adv/fragmentation.c497
-rw-r--r--net/batman-adv/fragmentation.h47
-rw-r--r--net/batman-adv/gateway_client.c881
-rw-r--r--net/batman-adv/gateway_client.h40
-rw-r--r--net/batman-adv/gateway_common.c241
-rw-r--r--net/batman-adv/gateway_common.h47
-rw-r--r--net/batman-adv/hard-interface.c708
-rw-r--r--net/batman-adv/hard-interface.h97
-rw-r--r--net/batman-adv/hash.c76
-rw-r--r--net/batman-adv/hash.h187
-rw-r--r--net/batman-adv/icmp_socket.c385
-rw-r--r--net/batman-adv/icmp_socket.h28
-rw-r--r--net/batman-adv/main.c1265
-rw-r--r--net/batman-adv/main.h389
-rw-r--r--net/batman-adv/multicast.c750
-rw-r--r--net/batman-adv/multicast.h77
-rw-r--r--net/batman-adv/network-coding.c1936
-rw-r--r--net/batman-adv/network-coding.h123
-rw-r--r--net/batman-adv/originator.c1177
-rw-r--r--net/batman-adv/originator.h125
-rw-r--r--net/batman-adv/packet.h553
-rw-r--r--net/batman-adv/routing.c1086
-rw-r--r--net/batman-adv/routing.h51
-rw-r--r--net/batman-adv/send.c645
-rw-r--r--net/batman-adv/send.h98
-rw-r--r--net/batman-adv/soft-interface.c1107
-rw-r--r--net/batman-adv/soft-interface.h34
-rw-r--r--net/batman-adv/sysfs.c935
-rw-r--r--net/batman-adv/sysfs.h50
-rw-r--r--net/batman-adv/translation-table.c3710
-rw-r--r--net/batman-adv/translation-table.h56
-rw-r--r--net/batman-adv/types.h1257
-rw-r--r--net/bluetooth/6lowpan.c1469
-rw-r--r--net/bluetooth/Kconfig102
-rw-r--r--net/bluetooth/Makefile21
-rw-r--r--net/bluetooth/a2mp.c1025
-rw-r--r--net/bluetooth/a2mp.h142
-rw-r--r--net/bluetooth/af_bluetooth.c795
-rw-r--r--net/bluetooth/amp.c474
-rw-r--r--net/bluetooth/amp.h54
-rw-r--r--net/bluetooth/bnep/Kconfig24
-rw-r--r--net/bluetooth/bnep/Makefile7
-rw-r--r--net/bluetooth/bnep/bnep.h183
-rw-r--r--net/bluetooth/bnep/core.c769
-rw-r--r--net/bluetooth/bnep/netdev.c229
-rw-r--r--net/bluetooth/bnep/sock.c265
-rw-r--r--net/bluetooth/cmtp/Kconfig11
-rw-r--r--net/bluetooth/cmtp/Makefile7
-rw-r--r--net/bluetooth/cmtp/capi.c612
-rw-r--r--net/bluetooth/cmtp/cmtp.h129
-rw-r--r--net/bluetooth/cmtp/core.c512
-rw-r--r--net/bluetooth/cmtp/sock.c269
-rw-r--r--net/bluetooth/ecc.c816
-rw-r--r--net/bluetooth/ecc.h54
-rw-r--r--net/bluetooth/hci_conn.c1383
-rw-r--r--net/bluetooth/hci_core.c4241
-rw-r--r--net/bluetooth/hci_debugfs.c1142
-rw-r--r--net/bluetooth/hci_debugfs.h48
-rw-r--r--net/bluetooth/hci_event.c5404
-rw-r--r--net/bluetooth/hci_request.c568
-rw-r--r--net/bluetooth/hci_request.h57
-rw-r--r--net/bluetooth/hci_sock.c1452
-rw-r--r--net/bluetooth/hci_sysfs.c214
-rw-r--r--net/bluetooth/hidp/Kconfig12
-rw-r--r--net/bluetooth/hidp/Makefile7
-rw-r--r--net/bluetooth/hidp/core.c1446
-rw-r--r--net/bluetooth/hidp/hidp.h192
-rw-r--r--net/bluetooth/hidp/sock.c299
-rw-r--r--net/bluetooth/l2cap_core.c7622
-rw-r--r--net/bluetooth/l2cap_sock.c1650
-rw-r--r--net/bluetooth/lib.c168
-rw-r--r--net/bluetooth/mgmt.c8415
-rw-r--r--net/bluetooth/mgmt_util.c210
-rw-r--r--net/bluetooth/mgmt_util.h53
-rw-r--r--net/bluetooth/rfcomm/Kconfig18
-rw-r--r--net/bluetooth/rfcomm/Makefile8
-rw-r--r--net/bluetooth/rfcomm/core.c2277
-rw-r--r--net/bluetooth/rfcomm/sock.c1104
-rw-r--r--net/bluetooth/rfcomm/tty.c1174
-rw-r--r--net/bluetooth/sco.c1248
-rw-r--r--net/bluetooth/selftest.c271
-rw-r--r--net/bluetooth/selftest.h45
-rw-r--r--net/bluetooth/smp.c3670
-rw-r--r--net/bluetooth/smp.h209
-rw-r--r--net/bridge/Kconfig62
-rw-r--r--net/bridge/Makefile21
-rw-r--r--net/bridge/br.c274
-rw-r--r--net/bridge/br_device.c399
-rw-r--r--net/bridge/br_fdb.c1050
-rw-r--r--net/bridge/br_forward.c292
-rw-r--r--net/bridge/br_if.c586
-rw-r--r--net/bridge/br_input.c315
-rw-r--r--net/bridge/br_ioctl.c394
-rw-r--r--net/bridge/br_mdb.c499
-rw-r--r--net/bridge/br_multicast.c2301
-rw-r--r--net/bridge/br_netfilter.c1140
-rw-r--r--net/bridge/br_netlink.c888
-rw-r--r--net/bridge/br_nf_core.c95
-rw-r--r--net/bridge/br_private.h851
-rw-r--r--net/bridge/br_private_stp.h69
-rw-r--r--net/bridge/br_stp.c582
-rw-r--r--net/bridge/br_stp_bpdu.c242
-rw-r--r--net/bridge/br_stp_if.c316
-rw-r--r--net/bridge/br_stp_timer.c176
-rw-r--r--net/bridge/br_sysfs_br.c889
-rw-r--r--net/bridge/br_sysfs_if.c316
-rw-r--r--net/bridge/br_vlan.c736
-rw-r--r--net/bridge/netfilter/Kconfig227
-rw-r--r--net/bridge/netfilter/Makefile40
-rw-r--r--net/bridge/netfilter/ebt_802_3.c72
-rw-r--r--net/bridge/netfilter/ebt_among.c229
-rw-r--r--net/bridge/netfilter/ebt_arp.c140
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c98
-rw-r--r--net/bridge/netfilter/ebt_dnat.c74
-rw-r--r--net/bridge/netfilter/ebt_ip.c130
-rw-r--r--net/bridge/netfilter/ebt_ip6.c158
-rw-r--r--net/bridge/netfilter/ebt_limit.c127
-rw-r--r--net/bridge/netfilter/ebt_log.c225
-rw-r--r--net/bridge/netfilter/ebt_mark.c110
-rw-r--r--net/bridge/netfilter/ebt_mark_m.c98
-rw-r--r--net/bridge/netfilter/ebt_nflog.c73
-rw-r--r--net/bridge/netfilter/ebt_pkttype.c56
-rw-r--r--net/bridge/netfilter/ebt_redirect.c80
-rw-r--r--net/bridge/netfilter/ebt_snat.c87
-rw-r--r--net/bridge/netfilter/ebt_stp.c197
-rw-r--r--net/bridge/netfilter/ebt_vlan.c180
-rw-r--r--net/bridge/netfilter/ebtable_broute.c100
-rw-r--r--net/bridge/netfilter/ebtable_filter.c136
-rw-r--r--net/bridge/netfilter/ebtable_nat.c136
-rw-r--r--net/bridge/netfilter/ebtables.c2432
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c96
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c190
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c133
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c418
-rw-r--r--net/caif/Kconfig53
-rw-r--r--net/caif/Makefile15
-rw-r--r--net/caif/caif_dev.c574
-rw-r--r--net/caif/caif_socket.c1124
-rw-r--r--net/caif/caif_usb.c203
-rw-r--r--net/caif/cfcnfg.c611
-rw-r--r--net/caif/cfctrl.c641
-rw-r--r--net/caif/cfdbgl.c55
-rw-r--r--net/caif/cfdgml.c114
-rw-r--r--net/caif/cffrml.c197
-rw-r--r--net/caif/cfmuxl.c267
-rw-r--r--net/caif/cfpkt_skbuff.c393
-rw-r--r--net/caif/cfrfml.c302
-rw-r--r--net/caif/cfserl.c188
-rw-r--r--net/caif/cfsrvl.c221
-rw-r--r--net/caif/cfutill.c104
-rw-r--r--net/caif/cfveil.c101
-rw-r--r--net/caif/cfvidl.c65
-rw-r--r--net/caif/chnl_net.c552
-rw-r--r--net/can/Kconfig56
-rw-r--r--net/can/Makefile15
-rw-r--r--net/can/af_can.c965
-rw-r--r--net/can/af_can.h126
-rw-r--r--net/can/bcm.c1634
-rw-r--r--net/can/gw.c998
-rw-r--r--net/can/proc.c580
-rw-r--r--net/can/raw.c875
-rw-r--r--net/ceph/Kconfig44
-rw-r--r--net/ceph/Makefile15
-rw-r--r--net/ceph/armor.c105
-rw-r--r--net/ceph/auth.c340
-rw-r--r--net/ceph/auth_none.c137
-rw-r--r--net/ceph/auth_none.h29
-rw-r--r--net/ceph/auth_x.c782
-rw-r--r--net/ceph/auth_x.h52
-rw-r--r--net/ceph/auth_x_protocol.h90
-rw-r--r--net/ceph/buffer.c58
-rw-r--r--net/ceph/ceph_common.c725
-rw-r--r--net/ceph/ceph_fs.c78
-rw-r--r--net/ceph/ceph_hash.c121
-rw-r--r--net/ceph/ceph_strings.c44
-rw-r--r--net/ceph/crush/crush.c143
-rw-r--r--net/ceph/crush/crush_ln_table.h166
-rw-r--r--net/ceph/crush/hash.c149
-rw-r--r--net/ceph/crush/mapper.c927
-rw-r--r--net/ceph/crypto.c583
-rw-r--r--net/ceph/crypto.h51
-rw-r--r--net/ceph/debugfs.c309
-rw-r--r--net/ceph/messenger.c3395
-rw-r--r--net/ceph/mon_client.c1121
-rw-r--r--net/ceph/msgpool.c83
-rw-r--r--net/ceph/osd_client.c3008
-rw-r--r--net/ceph/osdmap.c1754
-rw-r--r--net/ceph/pagelist.c150
-rw-r--r--net/ceph/pagevec.c202
-rw-r--r--net/ceph/snapshot.c78
-rw-r--r--net/compat.c851
-rw-r--r--net/core/Makefile25
-rw-r--r--net/core/datagram.c753
-rw-r--r--net/core/dev.c7523
-rw-r--r--net/core/dev_addr_lists.c851
-rw-r--r--net/core/dev_ioctl.c567
-rw-r--r--net/core/drop_monitor.c437
-rw-r--r--net/core/dst.c397
-rw-r--r--net/core/ethtool.c1994
-rw-r--r--net/core/fib_rules.c799
-rw-r--r--net/core/filter.c1553
-rw-r--r--net/core/flow.c511
-rw-r--r--net/core/flow_dissector.c490
-rw-r--r--net/core/gen_estimator.c328
-rw-r--r--net/core/gen_stats.c364
-rw-r--r--net/core/link_watch.c253
-rw-r--r--net/core/neighbour.c3172
-rw-r--r--net/core/net-procfs.c423
-rw-r--r--net/core/net-sysfs.c1556
-rw-r--r--net/core/net-sysfs.h11
-rw-r--r--net/core/net-traces.c37
-rw-r--r--net/core/net_namespace.c965
-rw-r--r--net/core/netclassid_cgroup.c111
-rw-r--r--net/core/netevent.c70
-rw-r--r--net/core/netpoll.c852
-rw-r--r--net/core/netprio_cgroup.c288
-rw-r--r--net/core/pktgen.c3862
-rw-r--r--net/core/ptp_classifier.c193
-rw-r--r--net/core/request_sock.c206
-rw-r--r--net/core/rtnetlink.c3343
-rw-r--r--net/core/scm.c340
-rw-r--r--net/core/secure_seq.c273
-rw-r--r--net/core/skbuff.c4430
-rw-r--r--net/core/sock.c2979
-rw-r--r--net/core/sock_diag.c248
-rw-r--r--net/core/stream.c208
-rw-r--r--net/core/sysctl_net_core.c464
-rw-r--r--net/core/timestamping.c80
-rw-r--r--net/core/tso.c78
-rw-r--r--net/core/utils.c384
-rw-r--r--net/dcb/Kconfig22
-rw-r--r--net/dcb/Makefile1
-rw-r--r--net/dcb/dcbevent.c41
-rw-r--r--net/dcb/dcbnl.c1968
-rw-r--r--net/dccp/Kconfig62
-rw-r--r--net/dccp/Makefile28
-rw-r--r--net/dccp/ackvec.c409
-rw-r--r--net/dccp/ackvec.h138
-rw-r--r--net/dccp/ccid.c223
-rw-r--r--net/dccp/ccid.h265
-rw-r--r--net/dccp/ccids/Kconfig54
-rw-r--r--net/dccp/ccids/ccid2.c784
-rw-r--r--net/dccp/ccids/ccid2.h133
-rw-r--r--net/dccp/ccids/ccid3.c870
-rw-r--r--net/dccp/ccids/ccid3.h160
-rw-r--r--net/dccp/ccids/lib/loss_interval.c185
-rw-r--r--net/dccp/ccids/lib/loss_interval.h73
-rw-r--r--net/dccp/ccids/lib/packet_history.c449
-rw-r--r--net/dccp/ccids/lib/packet_history.h155
-rw-r--r--net/dccp/ccids/lib/tfrc.c45
-rw-r--r--net/dccp/ccids/lib/tfrc.h77
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c705
-rw-r--r--net/dccp/dccp.h499
-rw-r--r--net/dccp/diag.c87
-rw-r--r--net/dccp/feat.c1561
-rw-r--r--net/dccp/feat.h137
-rw-r--r--net/dccp/input.c732
-rw-r--r--net/dccp/ipv4.c1091
-rw-r--r--net/dccp/ipv6.c1147
-rw-r--r--net/dccp/ipv6.h34
-rw-r--r--net/dccp/minisocks.c261
-rw-r--r--net/dccp/options.c609
-rw-r--r--net/dccp/output.c698
-rw-r--r--net/dccp/probe.c202
-rw-r--r--net/dccp/proto.c1253
-rw-r--r--net/dccp/qpolicy.c137
-rw-r--r--net/dccp/sysctl.c118
-rw-r--r--net/dccp/timer.c271
-rw-r--r--net/decnet/Kconfig43
-rw-r--r--net/decnet/Makefile10
-rw-r--r--net/decnet/README8
-rw-r--r--net/decnet/TODO41
-rw-r--r--net/decnet/af_decnet.c2416
-rw-r--r--net/decnet/dn_dev.c1446
-rw-r--r--net/decnet/dn_fib.c791
-rw-r--r--net/decnet/dn_neigh.c616
-rw-r--r--net/decnet/dn_nsp_in.c917
-rw-r--r--net/decnet/dn_nsp_out.c718
-rw-r--r--net/decnet/dn_route.c1939
-rw-r--r--net/decnet/dn_rules.c257
-rw-r--r--net/decnet/dn_table.c926
-rw-r--r--net/decnet/dn_timer.c103
-rw-r--r--net/decnet/netfilter/Kconfig16
-rw-r--r--net/decnet/netfilter/Makefile6
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c159
-rw-r--r--net/decnet/sysctl_net_decnet.c372
-rw-r--r--net/dns_resolver/Kconfig27
-rw-r--r--net/dns_resolver/Makefile7
-rw-r--r--net/dns_resolver/dns_key.c319
-rw-r--r--net/dns_resolver/dns_query.c167
-rw-r--r--net/dns_resolver/internal.h43
-rw-r--r--net/dsa/Kconfig41
-rw-r--r--net/dsa/Makefile9
-rw-r--r--net/dsa/dsa.c939
-rw-r--r--net/dsa/dsa_priv.h78
-rw-r--r--net/dsa/slave.c966
-rw-r--r--net/dsa/tag_brcm.c171
-rw-r--r--net/dsa/tag_dsa.c189
-rw-r--r--net/dsa/tag_edsa.c208
-rw-r--r--net/dsa/tag_trailer.c117
-rw-r--r--net/ethernet/Makefile5
-rw-r--r--net/ethernet/eth.c484
-rw-r--r--net/hsr/Kconfig27
-rw-r--r--net/hsr/Makefile8
-rw-r--r--net/hsr/hsr_device.c503
-rw-r--r--net/hsr/hsr_device.h25
-rw-r--r--net/hsr/hsr_forward.c368
-rw-r--r--net/hsr/hsr_forward.h20
-rw-r--r--net/hsr/hsr_framereg.c490
-rw-r--r--net/hsr/hsr_framereg.h54
-rw-r--r--net/hsr/hsr_main.c136
-rw-r--r--net/hsr/hsr_main.h183
-rw-r--r--net/hsr/hsr_netlink.c493
-rw-r--r--net/hsr/hsr_netlink.h31
-rw-r--r--net/hsr/hsr_slave.c200
-rw-r--r--net/hsr/hsr_slave.h38
-rw-r--r--net/ieee802154/6lowpan/6lowpan_i.h72
-rw-r--r--net/ieee802154/6lowpan/Kconfig5
-rw-r--r--net/ieee802154/6lowpan/Makefile3
-rw-r--r--net/ieee802154/6lowpan/core.c306
-rw-r--r--net/ieee802154/6lowpan/reassembly.c585
-rw-r--r--net/ieee802154/6lowpan/rx.c171
-rw-r--r--net/ieee802154/6lowpan/tx.c271
-rw-r--r--net/ieee802154/Kconfig25
-rw-r--r--net/ieee802154/Makefile11
-rw-r--r--net/ieee802154/core.c325
-rw-r--r--net/ieee802154/core.h46
-rw-r--r--net/ieee802154/header_ops.c326
-rw-r--r--net/ieee802154/ieee802154.h86
-rw-r--r--net/ieee802154/netlink.c152
-rw-r--r--net/ieee802154/nl-mac.c1349
-rw-r--r--net/ieee802154/nl-phy.c349
-rw-r--r--net/ieee802154/nl802154.c999
-rw-r--r--net/ieee802154/nl802154.h7
-rw-r--r--net/ieee802154/nl_policy.c78
-rw-r--r--net/ieee802154/rdev-ops.h155
-rw-r--r--net/ieee802154/socket.c1128
-rw-r--r--net/ieee802154/sysfs.c79
-rw-r--r--net/ieee802154/sysfs.h9
-rw-r--r--net/ieee802154/trace.c7
-rw-r--r--net/ieee802154/trace.h247
-rw-r--r--net/ipv4/Kconfig696
-rw-r--r--net/ipv4/Makefile62
-rw-r--r--net/ipv4/af_inet.c1838
-rw-r--r--net/ipv4/ah4.c589
-rw-r--r--net/ipv4/arp.c1375
-rw-r--r--net/ipv4/cipso_ipv4.c2357
-rw-r--r--net/ipv4/datagram.c123
-rw-r--r--net/ipv4/devinet.c2386
-rw-r--r--net/ipv4/esp4.c731
-rw-r--r--net/ipv4/fib_frontend.c1251
-rw-r--r--net/ipv4/fib_lookup.h52
-rw-r--r--net/ipv4/fib_rules.c369
-rw-r--r--net/ipv4/fib_semantics.c1328
-rw-r--r--net/ipv4/fib_trie.c2659
-rw-r--r--net/ipv4/fou.c999
-rw-r--r--net/ipv4/geneve.c453
-rw-r--r--net/ipv4/gre_demux.c367
-rw-r--r--net/ipv4/gre_offload.c268
-rw-r--r--net/ipv4/icmp.c1218
-rw-r--r--net/ipv4/igmp.c2800
-rw-r--r--net/ipv4/inet_connection_sock.c978
-rw-r--r--net/ipv4/inet_diag.c1165
-rw-r--r--net/ipv4/inet_fragment.c463
-rw-r--r--net/ipv4/inet_hashtables.c618
-rw-r--r--net/ipv4/inet_lro.c374
-rw-r--r--net/ipv4/inet_timewait_sock.c328
-rw-r--r--net/ipv4/inetpeer.c558
-rw-r--r--net/ipv4/ip_forward.c159
-rw-r--r--net/ipv4/ip_fragment.c871
-rw-r--r--net/ipv4/ip_gre.c926
-rw-r--r--net/ipv4/ip_input.c467
-rw-r--r--net/ipv4/ip_options.c663
-rw-r--r--net/ipv4/ip_output.c1591
-rw-r--r--net/ipv4/ip_sockglue.c1544
-rw-r--r--net/ipv4/ip_tunnel.c1191
-rw-r--r--net/ipv4/ip_tunnel_core.c206
-rw-r--r--net/ipv4/ip_vti.c599
-rw-r--r--net/ipv4/ipcomp.c204
-rw-r--r--net/ipv4/ipconfig.c1693
-rw-r--r--net/ipv4/ipip.c569
-rw-r--r--net/ipv4/ipmr.c2792
-rw-r--r--net/ipv4/netfilter.c207
-rw-r--r--net/ipv4/netfilter/Kconfig406
-rw-r--r--net/ipv4/netfilter/Makefile72
-rw-r--r--net/ipv4/netfilter/arp_tables.c1926
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c91
-rw-r--r--net/ipv4/netfilter/arptable_filter.c91
-rw-r--r--net/ipv4/netfilter/ip_tables.c2282
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c794
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c138
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c91
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c113
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c480
-rw-r--r--net/ipv4/netfilter/ipt_ah.c91
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c144
-rw-r--r--net/ipv4/netfilter/iptable_filter.c109
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c147
-rw-r--r--net/ipv4/netfilter/iptable_nat.c154
-rw-r--r--net/ipv4/netfilter/iptable_raw.c89
-rw-r--r--net/ipv4/netfilter/iptable_security.c109
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c542
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c469
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c428
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c129
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c161
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c397
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c631
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c481
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c153
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c311
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c149
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c83
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c1313
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c192
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c102
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c125
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c107
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c88
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c76
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c76
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c76
-rw-r--r--net/ipv4/ping.c1223
-rw-r--r--net/ipv4/proc.c540
-rw-r--r--net/ipv4/protocol.c79
-rw-r--r--net/ipv4/raw.c1100
-rw-r--r--net/ipv4/route.c2800
-rw-r--r--net/ipv4/syncookies.c401
-rw-r--r--net/ipv4/sysctl_net_ipv4.c970
-rw-r--r--net/ipv4/tcp.c3225
-rw-r--r--net/ipv4/tcp_bic.c239
-rw-r--r--net/ipv4/tcp_cong.c441
-rw-r--r--net/ipv4/tcp_cubic.c504
-rw-r--r--net/ipv4/tcp_dctcp.c345
-rw-r--r--net/ipv4/tcp_diag.c68
-rw-r--r--net/ipv4/tcp_fastopen.c313
-rw-r--r--net/ipv4/tcp_highspeed.c185
-rw-r--r--net/ipv4/tcp_htcp.c317
-rw-r--r--net/ipv4/tcp_hybla.c192
-rw-r--r--net/ipv4/tcp_illinois.c355
-rw-r--r--net/ipv4/tcp_input.c6299
-rw-r--r--net/ipv4/tcp_ipv4.c2468
-rw-r--r--net/ipv4/tcp_lp.c342
-rw-r--r--net/ipv4/tcp_memcontrol.c233
-rw-r--r--net/ipv4/tcp_metrics.c1198
-rw-r--r--net/ipv4/tcp_minisocks.c830
-rw-r--r--net/ipv4/tcp_offload.c327
-rw-r--r--net/ipv4/tcp_output.c3525
-rw-r--r--net/ipv4/tcp_probe.c300
-rw-r--r--net/ipv4/tcp_scalable.c62
-rw-r--r--net/ipv4/tcp_timer.c652
-rw-r--r--net/ipv4/tcp_vegas.c337
-rw-r--r--net/ipv4/tcp_vegas.h25
-rw-r--r--net/ipv4/tcp_veno.c232
-rw-r--r--net/ipv4/tcp_westwood.c305
-rw-r--r--net/ipv4/tcp_yeah.c255
-rw-r--r--net/ipv4/tunnel4.c192
-rw-r--r--net/ipv4/udp.c2555
-rw-r--r--net/ipv4/udp_diag.c222
-rw-r--r--net/ipv4/udp_impl.h34
-rw-r--r--net/ipv4/udp_offload.c442
-rw-r--r--net/ipv4/udp_tunnel.c108
-rw-r--r--net/ipv4/udplite.c141
-rw-r--r--net/ipv4/xfrm4_input.c158
-rw-r--r--net/ipv4/xfrm4_mode_beet.c156
-rw-r--r--net/ipv4/xfrm4_mode_transport.c80
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c125
-rw-r--r--net/ipv4/xfrm4_output.c111
-rw-r--r--net/ipv4/xfrm4_policy.c332
-rw-r--r--net/ipv4/xfrm4_protocol.c301
-rw-r--r--net/ipv4/xfrm4_state.c100
-rw-r--r--net/ipv4/xfrm4_tunnel.c117
-rw-r--r--net/ipv6/Kconfig261
-rw-r--r--net/ipv6/Makefile51
-rw-r--r--net/ipv6/addrconf.c5859
-rw-r--r--net/ipv6/addrconf_core.c164
-rw-r--r--net/ipv6/addrlabel.c597
-rw-r--r--net/ipv6/af_inet6.c1030
-rw-r--r--net/ipv6/ah6.c805
-rw-r--r--net/ipv6/anycast.c556
-rw-r--r--net/ipv6/datagram.c971
-rw-r--r--net/ipv6/esp6.c673
-rw-r--r--net/ipv6/exthdrs.c874
-rw-r--r--net/ipv6/exthdrs_core.c281
-rw-r--r--net/ipv6/exthdrs_offload.c41
-rw-r--r--net/ipv6/fib6_rules.c337
-rw-r--r--net/ipv6/icmp.c1018
-rw-r--r--net/ipv6/inet6_connection_sock.c261
-rw-r--r--net/ipv6/inet6_hashtables.c275
-rw-r--r--net/ipv6/ip6_checksum.c124
-rw-r--r--net/ipv6/ip6_fib.c2060
-rw-r--r--net/ipv6/ip6_flowlabel.c881
-rw-r--r--net/ipv6/ip6_gre.c1726
-rw-r--r--net/ipv6/ip6_icmp.c47
-rw-r--r--net/ipv6/ip6_input.c363
-rw-r--r--net/ipv6/ip6_offload.c317
-rw-r--r--net/ipv6/ip6_offload.h18
-rw-r--r--net/ipv6/ip6_output.c1762
-rw-r--r--net/ipv6/ip6_tunnel.c1946
-rw-r--r--net/ipv6/ip6_udp_tunnel.c106
-rw-r--r--net/ipv6/ip6_vti.c1194
-rw-r--r--net/ipv6/ip6mr.c2512
-rw-r--r--net/ipv6/ipcomp6.c230
-rw-r--r--net/ipv6/ipv6_sockglue.c1384
-rw-r--r--net/ipv6/mcast.c2951
-rw-r--r--net/ipv6/mip6.c527
-rw-r--r--net/ipv6/ndisc.c1819
-rw-r--r--net/ipv6/netfilter.c219
-rw-r--r--net/ipv6/netfilter/Kconfig323
-rw-r--r--net/ipv6/netfilter/Makefile55
-rw-r--r--net/ipv6/netfilter/ip6_tables.c2296
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c71
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c153
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c119
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c503
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c121
-rw-r--r--net/ipv6/netfilter/ip6t_eui64.c74
-rw-r--r--net/ipv6/netfilter/ip6t_frag.c136
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c215
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c153
-rw-r--r--net/ipv6/netfilter/ip6t_mh.c94
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c140
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c225
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c105
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c145
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c156
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c85
-rw-r--r--net/ipv6/netfilter/ip6table_security.c101
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c471
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c396
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c697
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c138
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c429
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c487
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c120
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c90
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c248
-rw-r--r--net/ipv6/netfilter/nf_tables_ipv6.c124
-rw-r--r--net/ipv6/netfilter/nft_chain_nat_ipv6.c105
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c86
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c76
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c76
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c77
-rw-r--r--net/ipv6/output_core.c175
-rw-r--r--net/ipv6/ping.c275
-rw-r--r--net/ipv6/proc.c346
-rw-r--r--net/ipv6/protocol.c74
-rw-r--r--net/ipv6/raw.c1339
-rw-r--r--net/ipv6/reassembly.c772
-rw-r--r--net/ipv6/route.c3311
-rw-r--r--net/ipv6/sit.c1922
-rw-r--r--net/ipv6/syncookies.c273
-rw-r--r--net/ipv6/sysctl_net_ipv6.c200
-rw-r--r--net/ipv6/tcp_ipv6.c1965
-rw-r--r--net/ipv6/tcpv6_offload.c78
-rw-r--r--net/ipv6/tunnel6.c183
-rw-r--r--net/ipv6/udp.c1579
-rw-r--r--net/ipv6/udp_impl.h37
-rw-r--r--net/ipv6/udp_offload.c184
-rw-r--r--net/ipv6/udplite.c139
-rw-r--r--net/ipv6/xfrm6_input.c145
-rw-r--r--net/ipv6/xfrm6_mode_beet.c131
-rw-r--r--net/ipv6/xfrm6_mode_ro.c83
-rw-r--r--net/ipv6/xfrm6_mode_transport.c85
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c126
-rw-r--r--net/ipv6/xfrm6_output.c174
-rw-r--r--net/ipv6/xfrm6_policy.c426
-rw-r--r--net/ipv6/xfrm6_protocol.c279
-rw-r--r--net/ipv6/xfrm6_state.c198
-rw-r--r--net/ipv6/xfrm6_tunnel.c397
-rw-r--r--net/ipx/Kconfig60
-rw-r--r--net/ipx/Makefile8
-rw-r--r--net/ipx/af_ipx.c2084
-rw-r--r--net/ipx/ipx_proc.c340
-rw-r--r--net/ipx/ipx_route.c292
-rw-r--r--net/ipx/pe2.c35
-rw-r--r--net/ipx/sysctl_net_ipx.c39
-rw-r--r--net/irda/Kconfig96
-rw-r--r--net/irda/Makefile15
-rw-r--r--net/irda/af_irda.c2716
-rw-r--r--net/irda/discovery.c417
-rw-r--r--net/irda/ircomm/Kconfig12
-rw-r--r--net/irda/ircomm/Makefile8
-rw-r--r--net/irda/ircomm/ircomm_core.c563
-rw-r--r--net/irda/ircomm/ircomm_event.c246
-rw-r--r--net/irda/ircomm/ircomm_lmp.c350
-rw-r--r--net/irda/ircomm/ircomm_param.c502
-rw-r--r--net/irda/ircomm/ircomm_ttp.c350
-rw-r--r--net/irda/ircomm/ircomm_tty.c1362
-rw-r--r--net/irda/ircomm/ircomm_tty_attach.c987
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c413
-rw-r--r--net/irda/irda_device.c316
-rw-r--r--net/irda/iriap.c1081
-rw-r--r--net/irda/iriap_event.c496
-rw-r--r--net/irda/irias_object.c555
-rw-r--r--net/irda/irlan/Kconfig14
-rw-r--r--net/irda/irlan/Makefile7
-rw-r--r--net/irda/irlan/irlan_client.c559
-rw-r--r--net/irda/irlan/irlan_client_event.c511
-rw-r--r--net/irda/irlan/irlan_common.c1176
-rw-r--r--net/irda/irlan/irlan_eth.c340
-rw-r--r--net/irda/irlan/irlan_event.c60
-rw-r--r--net/irda/irlan/irlan_filter.c240
-rw-r--r--net/irda/irlan/irlan_provider.c408
-rw-r--r--net/irda/irlan/irlan_provider_event.c233
-rw-r--r--net/irda/irlap.c1207
-rw-r--r--net/irda/irlap_event.c2316
-rw-r--r--net/irda/irlap_frame.c1411
-rw-r--r--net/irda/irlmp.c1996
-rw-r--r--net/irda/irlmp_event.c886
-rw-r--r--net/irda/irlmp_frame.c476
-rw-r--r--net/irda/irmod.c199
-rw-r--r--net/irda/irnet/Kconfig13
-rw-r--r--net/irda/irnet/Makefile7
-rw-r--r--net/irda/irnet/irnet.h523
-rw-r--r--net/irda/irnet/irnet_irda.c1885
-rw-r--r--net/irda/irnet/irnet_irda.h178
-rw-r--r--net/irda/irnet/irnet_ppp.c1188
-rw-r--r--net/irda/irnet/irnet_ppp.h119
-rw-r--r--net/irda/irnetlink.c158
-rw-r--r--net/irda/irproc.c97
-rw-r--r--net/irda/irqueue.c913
-rw-r--r--net/irda/irsysctl.c258
-rw-r--r--net/irda/irttp.c1891
-rw-r--r--net/irda/parameters.c584
-rw-r--r--net/irda/qos.c771
-rw-r--r--net/irda/timer.c231
-rw-r--r--net/irda/wrapper.c492
-rw-r--r--net/iucv/Kconfig17
-rw-r--r--net/iucv/Makefile6
-rw-r--r--net/iucv/af_iucv.c2463
-rw-r--r--net/iucv/iucv.c2119
-rw-r--r--net/key/Makefile5
-rw-r--r--net/key/af_key.c3871
-rw-r--r--net/l2tp/Kconfig109
-rw-r--r--net/l2tp/Makefile15
-rw-r--r--net/l2tp/l2tp_core.c1901
-rw-r--r--net/l2tp/l2tp_core.h324
-rw-r--r--net/l2tp/l2tp_debugfs.c352
-rw-r--r--net/l2tp/l2tp_eth.c360
-rw-r--r--net/l2tp/l2tp_ip.c657
-rw-r--r--net/l2tp/l2tp_ip6.c803
-rw-r--r--net/l2tp/l2tp_netlink.c999
-rw-r--r--net/l2tp/l2tp_ppp.c1865
-rw-r--r--net/lapb/Kconfig21
-rw-r--r--net/lapb/Makefile7
-rw-r--r--net/lapb/lapb_iface.c441
-rw-r--r--net/lapb/lapb_in.c562
-rw-r--r--net/lapb/lapb_out.c211
-rw-r--r--net/lapb/lapb_subr.c308
-rw-r--r--net/lapb/lapb_timer.c179
-rw-r--r--net/llc/Kconfig10
-rw-r--r--net/llc/Makefile25
-rw-r--r--net/llc/af_llc.c1250
-rw-r--r--net/llc/llc_c_ac.c1444
-rw-r--r--net/llc/llc_c_ev.c748
-rw-r--r--net/llc/llc_c_st.c4946
-rw-r--r--net/llc/llc_conn.c1016
-rw-r--r--net/llc/llc_core.c168
-rw-r--r--net/llc/llc_if.c154
-rw-r--r--net/llc/llc_input.c226
-rw-r--r--net/llc/llc_output.c79
-rw-r--r--net/llc/llc_pdu.c372
-rw-r--r--net/llc/llc_proc.c277
-rw-r--r--net/llc/llc_s_ac.c208
-rw-r--r--net/llc/llc_s_ev.c115
-rw-r--r--net/llc/llc_s_st.c183
-rw-r--r--net/llc/llc_sap.c439
-rw-r--r--net/llc/llc_station.c125
-rw-r--r--net/llc/sysctl_net_llc.c78
-rw-r--r--net/mac80211/Kconfig307
-rw-r--r--net/mac80211/Makefile63
-rw-r--r--net/mac80211/aes_ccm.c104
-rw-r--r--net/mac80211/aes_ccm.h26
-rw-r--r--net/mac80211/aes_cmac.c164
-rw-r--r--net/mac80211/aes_cmac.h22
-rw-r--r--net/mac80211/aes_gcm.c99
-rw-r--r--net/mac80211/aes_gcm.h22
-rw-r--r--net/mac80211/aes_gmac.c84
-rw-r--r--net/mac80211/aes_gmac.h20
-rw-r--r--net/mac80211/agg-rx.c426
-rw-r--r--net/mac80211/agg-tx.c989
-rw-r--r--net/mac80211/cfg.c3782
-rw-r--r--net/mac80211/cfg.h9
-rw-r--r--net/mac80211/chan.c1760
-rw-r--r--net/mac80211/debug.h200
-rw-r--r--net/mac80211/debugfs.c310
-rw-r--r--net/mac80211/debugfs.h16
-rw-r--r--net/mac80211/debugfs_key.c416
-rw-r--r--net/mac80211/debugfs_key.h33
-rw-r--r--net/mac80211/debugfs_netdev.c743
-rw-r--r--net/mac80211/debugfs_netdev.h24
-rw-r--r--net/mac80211/debugfs_sta.c480
-rw-r--r--net/mac80211/debugfs_sta.h14
-rw-r--r--net/mac80211/driver-ops.h1382
-rw-r--r--net/mac80211/ethtool.c244
-rw-r--r--net/mac80211/event.c27
-rw-r--r--net/mac80211/ht.c522
-rw-r--r--net/mac80211/ibss.c1861
-rw-r--r--net/mac80211/ieee80211_i.h2074
-rw-r--r--net/mac80211/iface.c1912
-rw-r--r--net/mac80211/key.c1169
-rw-r--r--net/mac80211/key.h171
-rw-r--r--net/mac80211/led.c304
-rw-r--r--net/mac80211/led.h73
-rw-r--r--net/mac80211/main.c1266
-rw-r--r--net/mac80211/mesh.c1345
-rw-r--r--net/mac80211/mesh.h376
-rw-r--r--net/mac80211/mesh_hwmp.c1224
-rw-r--r--net/mac80211/mesh_pathtbl.c1142
-rw-r--r--net/mac80211/mesh_plink.c1126
-rw-r--r--net/mac80211/mesh_ps.c605
-rw-r--r--net/mac80211/mesh_sync.c225
-rw-r--r--net/mac80211/michael.c86
-rw-r--r--net/mac80211/michael.h25
-rw-r--r--net/mac80211/mlme.c5053
-rw-r--r--net/mac80211/ocb.c250
-rw-r--r--net/mac80211/offchannel.c502
-rw-r--r--net/mac80211/pm.c179
-rw-r--r--net/mac80211/rate.c764
-rw-r--r--net/mac80211/rate.h195
-rw-r--r--net/mac80211/rc80211_minstrel.c746
-rw-r--r--net/mac80211/rc80211_minstrel.h170
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c234
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c1383
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h127
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c321
-rw-r--r--net/mac80211/rx.c3671
-rw-r--r--net/mac80211/scan.c1211
-rw-r--r--net/mac80211/spectmgmt.c243
-rw-r--r--net/mac80211/sta_info.c2007
-rw-r--r--net/mac80211/sta_info.h641
-rw-r--r--net/mac80211/status.c929
-rw-r--r--net/mac80211/tdls.c1734
-rw-r--r--net/mac80211/tkip.c314
-rw-r--r--net/mac80211/tkip.h35
-rw-r--r--net/mac80211/trace.c76
-rw-r--r--net/mac80211/trace.h2352
-rw-r--r--net/mac80211/trace_msg.h53
-rw-r--r--net/mac80211/tx.c3385
-rw-r--r--net/mac80211/util.c3339
-rw-r--r--net/mac80211/vht.c424
-rw-r--r--net/mac80211/wep.c339
-rw-r--r--net/mac80211/wep.h34
-rw-r--r--net/mac80211/wme.c270
-rw-r--r--net/mac80211/wme.h24
-rw-r--r--net/mac80211/wpa.c1238
-rw-r--r--net/mac80211/wpa.h55
-rw-r--r--net/mac802154/Kconfig20
-rw-r--r--net/mac802154/Makefile5
-rw-r--r--net/mac802154/cfg.c231
-rw-r--r--net/mac802154/cfg.h9
-rw-r--r--net/mac802154/driver-ops.h223
-rw-r--r--net/mac802154/ieee802154_i.h189
-rw-r--r--net/mac802154/iface.c651
-rw-r--r--net/mac802154/llsec.c1056
-rw-r--r--net/mac802154/llsec.h108
-rw-r--r--net/mac802154/mac_cmd.c162
-rw-r--r--net/mac802154/main.c220
-rw-r--r--net/mac802154/mib.c286
-rw-r--r--net/mac802154/rx.c302
-rw-r--r--net/mac802154/tx.c149
-rw-r--r--net/mac802154/util.c95
-rw-r--r--net/mpls/Kconfig30
-rw-r--r--net/mpls/Makefile7
-rw-r--r--net/mpls/af_mpls.c1153
-rw-r--r--net/mpls/internal.h56
-rw-r--r--net/mpls/mpls_gso.c98
-rw-r--r--net/netfilter/Kconfig1447
-rw-r--r--net/netfilter/Makefile177
-rw-r--r--net/netfilter/core.c317
-rw-r--r--net/netfilter/ipset/Kconfig168
-rw-r--r--net/netfilter/ipset/Makefile29
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h293
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c384
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c421
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c318
-rw-r--r--net/netfilter/ipset/ip_set_core.c2042
-rw-r--r--net/netfilter/ipset/ip_set_getport.c174
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h1167
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c325
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c331
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c400
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c412
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c571
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c173
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c407
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c637
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c494
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c519
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c601
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c702
-rw-r--r--net/netfilter/ipset/pfxlen.c313
-rw-r--r--net/netfilter/ipvs/Kconfig291
-rw-r--r--net/netfilter/ipvs/Makefile41
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c627
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c1418
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c2136
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c3956
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c276
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c209
-rw-r--r--net/netfilter/ipvs/ip_vs_fo.c79
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c503
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c633
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c818
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c93
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c299
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c143
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c110
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c171
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c409
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c165
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c591
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c712
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c499
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c130
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c254
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c144
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c385
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c1975
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c116
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c270
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1380
-rw-r--r--net/netfilter/nf_conntrack_acct.c135
-rw-r--r--net/netfilter/nf_conntrack_amanda.c237
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c82
-rw-r--r--net/netfilter/nf_conntrack_core.c1821
-rw-r--r--net/netfilter/nf_conntrack_ecache.c337
-rw-r--r--net/netfilter/nf_conntrack_expect.c659
-rw-r--r--net/netfilter/nf_conntrack_extend.c191
-rw-r--r--net/netfilter/nf_conntrack_ftp.c646
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c888
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c1906
-rw-r--r--net/netfilter/nf_conntrack_h323_types.c1922
-rw-r--r--net/netfilter/nf_conntrack_helper.c502
-rw-r--r--net/netfilter/nf_conntrack_irc.c292
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c73
-rw-r--r--net/netfilter/nf_conntrack_labels.c108
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c71
-rw-r--r--net/netfilter/nf_conntrack_netlink.c3288
-rw-r--r--net/netfilter/nf_conntrack_pptp.c619
-rw-r--r--net/netfilter/nf_conntrack_proto.c523
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c1006
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c239
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c447
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c928
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1739
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c368
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c405
-rw-r--r--net/netfilter/nf_conntrack_sane.c237
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c243
-rw-r--r--net/netfilter/nf_conntrack_sip.c1680
-rw-r--r--net/netfilter/nf_conntrack_snmp.c78
-rw-r--r--net/netfilter/nf_conntrack_standalone.c613
-rw-r--r--net/netfilter/nf_conntrack_tftp.c153
-rw-r--r--net/netfilter/nf_conntrack_timeout.c51
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c114
-rw-r--r--net/netfilter/nf_internals.h27
-rw-r--r--net/netfilter/nf_log.c534
-rw-r--r--net/netfilter/nf_log_common.c188
-rw-r--r--net/netfilter/nf_nat_amanda.c90
-rw-r--r--net/netfilter/nf_nat_core.c899
-rw-r--r--net/netfilter/nf_nat_ftp.c146
-rw-r--r--net/netfilter/nf_nat_helper.c212
-rw-r--r--net/netfilter/nf_nat_irc.c119
-rw-r--r--net/netfilter/nf_nat_proto_common.c114
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c116
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c96
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c85
-rw-r--r--net/netfilter/nf_nat_proto_udp.c76
-rw-r--r--net/netfilter/nf_nat_proto_udplite.c106
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/netfilter/nf_nat_redirect.c127
-rw-r--r--net/netfilter/nf_nat_sip.c653
-rw-r--r--net/netfilter/nf_nat_tftp.c52
-rw-r--r--net/netfilter/nf_queue.c230
-rw-r--r--net/netfilter/nf_sockopt.c165
-rw-r--r--net/netfilter/nf_synproxy_core.c434
-rw-r--r--net/netfilter/nf_tables_api.c4568
-rw-r--r--net/netfilter/nf_tables_core.c274
-rw-r--r--net/netfilter/nf_tables_inet.c104
-rw-r--r--net/netfilter/nfnetlink.c540
-rw-r--r--net/netfilter/nfnetlink_acct.c512
-rw-r--r--net/netfilter/nfnetlink_cthelper.c683
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c585
-rw-r--r--net/netfilter/nfnetlink_log.c1125
-rw-r--r--net/netfilter/nfnetlink_queue_core.c1362
-rw-r--r--net/netfilter/nfnetlink_queue_ct.c113
-rw-r--r--net/netfilter/nft_bitwise.c143
-rw-r--r--net/netfilter/nft_byteorder.c165
-rw-r--r--net/netfilter/nft_cmp.c227
-rw-r--r--net/netfilter/nft_compat.c814
-rw-r--r--net/netfilter/nft_counter.c114
-rw-r--r--net/netfilter/nft_ct.c461
-rw-r--r--net/netfilter/nft_dynset.c265
-rw-r--r--net/netfilter/nft_exthdr.c128
-rw-r--r--net/netfilter/nft_hash.c394
-rw-r--r--net/netfilter/nft_immediate.c131
-rw-r--r--net/netfilter/nft_limit.c120
-rw-r--r--net/netfilter/nft_log.c208
-rw-r--r--net/netfilter/nft_lookup.c155
-rw-r--r--net/netfilter/nft_masq.c79
-rw-r--r--net/netfilter/nft_meta.c388
-rw-r--r--net/netfilter/nft_nat.c293
-rw-r--r--net/netfilter/nft_payload.c157
-rw-r--r--net/netfilter/nft_queue.c132
-rw-r--r--net/netfilter/nft_rbtree.c280
-rw-r--r--net/netfilter/nft_redir.c111
-rw-r--r--net/netfilter/nft_reject.c111
-rw-r--r--net/netfilter/nft_reject_inet.c154
-rw-r--r--net/netfilter/x_tables.c1360
-rw-r--r--net/netfilter/xt_AUDIT.c231
-rw-r--r--net/netfilter/xt_CHECKSUM.c70
-rw-r--r--net/netfilter/xt_CLASSIFY.c73
-rw-r--r--net/netfilter/xt_CONNSECMARK.c143
-rw-r--r--net/netfilter/xt_CT.c444
-rw-r--r--net/netfilter/xt_DSCP.c166
-rw-r--r--net/netfilter/xt_HL.c169
-rw-r--r--net/netfilter/xt_HMARK.c372
-rw-r--r--net/netfilter/xt_IDLETIMER.c315
-rw-r--r--net/netfilter/xt_LED.c217
-rw-r--r--net/netfilter/xt_LOG.c113
-rw-r--r--net/netfilter/xt_NETMAP.c165
-rw-r--r--net/netfilter/xt_NFLOG.c73
-rw-r--r--net/netfilter/xt_NFQUEUE.c160
-rw-r--r--net/netfilter/xt_RATEEST.c194
-rw-r--r--net/netfilter/xt_REDIRECT.c113
-rw-r--r--net/netfilter/xt_SECMARK.c147
-rw-r--r--net/netfilter/xt_TCPMSS.c344
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c158
-rw-r--r--net/netfilter/xt_TEE.c309
-rw-r--r--net/netfilter/xt_TPROXY.c605
-rw-r--r--net/netfilter/xt_TRACE.c40
-rw-r--r--net/netfilter/xt_addrtype.c248
-rw-r--r--net/netfilter/xt_bpf.c74
-rw-r--r--net/netfilter/xt_cgroup.c72
-rw-r--r--net/netfilter/xt_cluster.c179
-rw-r--r--net/netfilter/xt_comment.c45
-rw-r--r--net/netfilter/xt_connbytes.c157
-rw-r--r--net/netfilter/xt_connlabel.c99
-rw-r--r--net/netfilter/xt_connlimit.c487
-rw-r--r--net/netfilter/xt_connmark.c166
-rw-r--r--net/netfilter/xt_conntrack.c333
-rw-r--r--net/netfilter/xt_cpu.c65
-rw-r--r--net/netfilter/xt_dccp.c188
-rw-r--r--net/netfilter/xt_devgroup.c82
-rw-r--r--net/netfilter/xt_dscp.c115
-rw-r--r--net/netfilter/xt_ecn.c179
-rw-r--r--net/netfilter/xt_esp.c107
-rw-r--r--net/netfilter/xt_hashlimit.c967
-rw-r--r--net/netfilter/xt_helper.c99
-rw-r--r--net/netfilter/xt_hl.c96
-rw-r--r--net/netfilter/xt_ipcomp.c111
-rw-r--r--net/netfilter/xt_iprange.c140
-rw-r--r--net/netfilter/xt_ipvs.c188
-rw-r--r--net/netfilter/xt_l2tp.c354
-rw-r--r--net/netfilter/xt_length.c70
-rw-r--r--net/netfilter/xt_limit.c210
-rw-r--r--net/netfilter/xt_mac.c66
-rw-r--r--net/netfilter/xt_mark.c84
-rw-r--r--net/netfilter/xt_multiport.c165
-rw-r--r--net/netfilter/xt_nat.c170
-rw-r--r--net/netfilter/xt_nfacct.c79
-rw-r--r--net/netfilter/xt_osf.c428
-rw-r--r--net/netfilter/xt_owner.c100
-rw-r--r--net/netfilter/xt_physdev.c140
-rw-r--r--net/netfilter/xt_pkttype.c65
-rw-r--r--net/netfilter/xt_policy.c188
-rw-r--r--net/netfilter/xt_quota.c90
-rw-r--r--net/netfilter/xt_rateest.c157
-rw-r--r--net/netfilter/xt_realm.c54
-rw-r--r--net/netfilter/xt_recent.c769
-rw-r--r--net/netfilter/xt_repldata.h47
-rw-r--r--net/netfilter/xt_sctp.c198
-rw-r--r--net/netfilter/xt_set.c741
-rw-r--r--net/netfilter/xt_socket.c513
-rw-r--r--net/netfilter/xt_state.c79
-rw-r--r--net/netfilter/xt_statistic.c101
-rw-r--r--net/netfilter/xt_string.c94
-rw-r--r--net/netfilter/xt_tcpmss.c110
-rw-r--r--net/netfilter/xt_tcpudp.c234
-rw-r--r--net/netfilter/xt_time.c291
-rw-r--r--net/netfilter/xt_u32.c123
-rw-r--r--net/netlabel/Kconfig17
-rw-r--r--net/netlabel/Makefile15
-rw-r--r--net/netlabel/netlabel_addrlist.c383
-rw-r--r--net/netlabel/netlabel_addrlist.h208
-rw-r--r--net/netlabel/netlabel_cipso_v4.c786
-rw-r--r--net/netlabel/netlabel_cipso_v4.h169
-rw-r--r--net/netlabel/netlabel_domainhash.c790
-rw-r--r--net/netlabel/netlabel_domainhash.h109
-rw-r--r--net/netlabel/netlabel_kapi.c1211
-rw-r--r--net/netlabel/netlabel_mgmt.c778
-rw-r--r--net/netlabel/netlabel_mgmt.h222
-rw-r--r--net/netlabel/netlabel_unlabeled.c1548
-rw-r--r--net/netlabel/netlabel_unlabeled.h245
-rw-r--r--net/netlabel/netlabel_user.c119
-rw-r--r--net/netlabel/netlabel_user.h65
-rw-r--r--net/netlink/Kconfig19
-rw-r--r--net/netlink/Makefile8
-rw-r--r--net/netlink/af_netlink.c3184
-rw-r--r--net/netlink/af_netlink.h79
-rw-r--r--net/netlink/diag.c238
-rw-r--r--net/netlink/genetlink.c1154
-rw-r--r--net/netrom/Makefile9
-rw-r--r--net/netrom/af_netrom.c1511
-rw-r--r--net/netrom/nr_dev.c181
-rw-r--r--net/netrom/nr_in.c305
-rw-r--r--net/netrom/nr_loopback.c77
-rw-r--r--net/netrom/nr_out.c273
-rw-r--r--net/netrom/nr_route.c1030
-rw-r--r--net/netrom/nr_subr.c281
-rw-r--r--net/netrom/nr_timer.c248
-rw-r--r--net/netrom/sysctl_net_netrom.c157
-rw-r--r--net/nfc/Kconfig34
-rw-r--r--net/nfc/Makefile13
-rw-r--r--net/nfc/af_nfc.c97
-rw-r--r--net/nfc/core.c1233
-rw-r--r--net/nfc/digital.h180
-rw-r--r--net/nfc/digital_core.c845
-rw-r--r--net/nfc/digital_dep.c1542
-rw-r--r--net/nfc/digital_technology.c1315
-rw-r--r--net/nfc/hci/Kconfig17
-rw-r--r--net/nfc/hci/Makefile8
-rw-r--r--net/nfc/hci/command.c372
-rw-r--r--net/nfc/hci/core.c1095
-rw-r--r--net/nfc/hci/hci.h132
-rw-r--r--net/nfc/hci/hcp.c150
-rw-r--r--net/nfc/hci/llc.c166
-rw-r--r--net/nfc/hci/llc.h67
-rw-r--r--net/nfc/hci/llc_nop.c97
-rw-r--r--net/nfc/hci/llc_shdlc.c854
-rw-r--r--net/nfc/llcp.h266
-rw-r--r--net/nfc/llcp_commands.c799
-rw-r--r--net/nfc/llcp_core.c1637
-rw-r--r--net/nfc/llcp_sock.c1034
-rw-r--r--net/nfc/nci/Kconfig21
-rw-r--r--net/nfc/nci/Makefile9
-rw-r--r--net/nfc/nci/core.c1281
-rw-r--r--net/nfc/nci/data.c298
-rw-r--r--net/nfc/nci/hci.c694
-rw-r--r--net/nfc/nci/lib.c85
-rw-r--r--net/nfc/nci/ntf.c800
-rw-r--r--net/nfc/nci/rsp.c355
-rw-r--r--net/nfc/nci/spi.c322
-rw-r--r--net/nfc/netlink.c1689
-rw-r--r--net/nfc/nfc.h158
-rw-r--r--net/nfc/rawsock.c431
-rw-r--r--net/openvswitch/Kconfig67
-rw-r--r--net/openvswitch/Makefile20
-rw-r--r--net/openvswitch/actions.c995
-rw-r--r--net/openvswitch/datapath.c2333
-rw-r--r--net/openvswitch/datapath.h205
-rw-r--r--net/openvswitch/dp_notify.c102
-rw-r--r--net/openvswitch/flow.c730
-rw-r--r--net/openvswitch/flow.h284
-rw-r--r--net/openvswitch/flow_netlink.c2309
-rw-r--r--net/openvswitch/flow_netlink.h73
-rw-r--r--net/openvswitch/flow_table.c778
-rw-r--r--net/openvswitch/flow_table.h90
-rw-r--r--net/openvswitch/vport-geneve.c274
-rw-r--r--net/openvswitch/vport-gre.c313
-rw-r--r--net/openvswitch/vport-internal_dev.c288
-rw-r--r--net/openvswitch/vport-internal_dev.h30
-rw-r--r--net/openvswitch/vport-netdev.c246
-rw-r--r--net/openvswitch/vport-netdev.h47
-rw-r--r--net/openvswitch/vport-vxlan.c322
-rw-r--r--net/openvswitch/vport-vxlan.h11
-rw-r--r--net/openvswitch/vport.c627
-rw-r--r--net/openvswitch/vport.h259
-rw-r--r--net/packet/Kconfig24
-rw-r--r--net/packet/Makefile7
-rw-r--r--net/packet/af_packet.c4176
-rw-r--r--net/packet/diag.c265
-rw-r--r--net/packet/internal.h125
-rw-r--r--net/phonet/Kconfig16
-rw-r--r--net/phonet/Makefile11
-rw-r--r--net/phonet/af_phonet.c549
-rw-r--r--net/phonet/datagram.c212
-rw-r--r--net/phonet/pep-gprs.c327
-rw-r--r--net/phonet/pep.c1373
-rw-r--r--net/phonet/pn_dev.c431
-rw-r--r--net/phonet/pn_netlink.c310
-rw-r--r--net/phonet/socket.c823
-rw-r--r--net/phonet/sysctl.c110
-rw-r--r--net/rds/Kconfig28
-rw-r--r--net/rds/Makefile19
-rw-r--r--net/rds/af_rds.c599
-rw-r--r--net/rds/bind.c203
-rw-r--r--net/rds/cong.c420
-rw-r--r--net/rds/connection.c591
-rw-r--r--net/rds/ib.c435
-rw-r--r--net/rds/ib.h372
-rw-r--r--net/rds/ib_cm.c838
-rw-r--r--net/rds/ib_rdma.c784
-rw-r--r--net/rds/ib_recv.c1064
-rw-r--r--net/rds/ib_ring.c168
-rw-r--r--net/rds/ib_send.c1020
-rw-r--r--net/rds/ib_stats.c97
-rw-r--r--net/rds/ib_sysctl.c121
-rw-r--r--net/rds/info.c243
-rw-r--r--net/rds/info.h30
-rw-r--r--net/rds/iw.c329
-rw-r--r--net/rds/iw.h395
-rw-r--r--net/rds/iw_cm.c765
-rw-r--r--net/rds/iw_rdma.c875
-rw-r--r--net/rds/iw_recv.c904
-rw-r--r--net/rds/iw_ring.c169
-rw-r--r--net/rds/iw_send.c974
-rw-r--r--net/rds/iw_stats.c95
-rw-r--r--net/rds/iw_sysctl.c123
-rw-r--r--net/rds/loop.c194
-rw-r--r--net/rds/loop.h9
-rw-r--r--net/rds/message.c366
-rw-r--r--net/rds/page.c215
-rw-r--r--net/rds/rdma.c859
-rw-r--r--net/rds/rdma_transport.c251
-rw-r--r--net/rds/rdma_transport.h24
-rw-r--r--net/rds/rds.h809
-rw-r--r--net/rds/recv.c549
-rw-r--r--net/rds/send.c1165
-rw-r--r--net/rds/stats.c152
-rw-r--r--net/rds/sysctl.c109
-rw-r--r--net/rds/tcp.c326
-rw-r--r--net/rds/tcp.h87
-rw-r--r--net/rds/tcp_connect.c156
-rw-r--r--net/rds/tcp_listen.c246
-rw-r--r--net/rds/tcp_recv.c336
-rw-r--r--net/rds/tcp_send.c215
-rw-r--r--net/rds/tcp_stats.c74
-rw-r--r--net/rds/threads.c222
-rw-r--r--net/rds/transport.c137
-rw-r--r--net/rfkill/Kconfig44
-rw-r--r--net/rfkill/Makefile9
-rw-r--r--net/rfkill/core.c1295
-rw-r--r--net/rfkill/input.c348
-rw-r--r--net/rfkill/rfkill-gpio.c195
-rw-r--r--net/rfkill/rfkill-regulator.c154
-rw-r--r--net/rfkill/rfkill.h27
-rw-r--r--net/rose/Makefile9
-rw-r--r--net/rose/af_rose.c1636
-rw-r--r--net/rose/rose_dev.c144
-rw-r--r--net/rose/rose_in.c294
-rw-r--r--net/rose/rose_link.c292
-rw-r--r--net/rose/rose_loopback.c124
-rw-r--r--net/rose/rose_out.c125
-rw-r--r--net/rose/rose_route.c1366
-rw-r--r--net/rose/rose_subr.c556
-rw-r--r--net/rose/rose_timer.c216
-rw-r--r--net/rose/sysctl_net_rose.c129
-rw-r--r--net/rxrpc/Kconfig44
-rw-r--r--net/rxrpc/Makefile28
-rw-r--r--net/rxrpc/af_rxrpc.c898
-rw-r--r--net/rxrpc/ar-accept.c510
-rw-r--r--net/rxrpc/ar-ack.c1356
-rw-r--r--net/rxrpc/ar-call.c1019
-rw-r--r--net/rxrpc/ar-connection.c928
-rw-r--r--net/rxrpc/ar-connevent.c405
-rw-r--r--net/rxrpc/ar-error.c241
-rw-r--r--net/rxrpc/ar-input.c788
-rw-r--r--net/rxrpc/ar-internal.h812
-rw-r--r--net/rxrpc/ar-key.c1247
-rw-r--r--net/rxrpc/ar-local.c408
-rw-r--r--net/rxrpc/ar-output.c711
-rw-r--r--net/rxrpc/ar-peer.c303
-rw-r--r--net/rxrpc/ar-proc.c192
-rw-r--r--net/rxrpc/ar-recvmsg.c424
-rw-r--r--net/rxrpc/ar-security.c264
-rw-r--r--net/rxrpc/ar-skbuff.c137
-rw-r--r--net/rxrpc/ar-transport.c283
-rw-r--r--net/rxrpc/rxkad.c1161
-rw-r--r--net/rxrpc/sysctl.c146
-rw-r--r--net/sched/Kconfig736
-rw-r--r--net/sched/Makefile66
-rw-r--r--net/sched/act_api.c1094
-rw-r--r--net/sched/act_bpf.c362
-rw-r--r--net/sched/act_connmark.c190
-rw-r--r--net/sched/act_csum.c584
-rw-r--r--net/sched/act_gact.c209
-rw-r--r--net/sched/act_ipt.c311
-rw-r--r--net/sched/act_mirred.c267
-rw-r--r--net/sched/act_nat.c306
-rw-r--r--net/sched/act_pedit.c243
-rw-r--r--net/sched/act_police.c372
-rw-r--r--net/sched/act_simple.c192
-rw-r--r--net/sched/act_skbedit.c199
-rw-r--r--net/sched/act_vlan.c207
-rw-r--r--net/sched/cls_api.c634
-rw-r--r--net/sched/cls_basic.c312
-rw-r--r--net/sched/cls_bpf.c495
-rw-r--r--net/sched/cls_cgroup.c233
-rw-r--r--net/sched/cls_flow.c694
-rw-r--r--net/sched/cls_fw.c438
-rw-r--r--net/sched/cls_route.c674
-rw-r--r--net/sched/cls_rsvp.c28
-rw-r--r--net/sched/cls_rsvp.h732
-rw-r--r--net/sched/cls_rsvp6.c28
-rw-r--r--net/sched/cls_tcindex.c578
-rw-r--r--net/sched/cls_u32.c1075
-rw-r--r--net/sched/em_canid.c233
-rw-r--r--net/sched/em_cmp.c99
-rw-r--r--net/sched/em_ipset.c135
-rw-r--r--net/sched/em_meta.c966
-rw-r--r--net/sched/em_nbyte.c80
-rw-r--r--net/sched/em_text.c157
-rw-r--r--net/sched/em_u32.c64
-rw-r--r--net/sched/ematch.c549
-rw-r--r--net/sched/sch_api.c1970
-rw-r--r--net/sched/sch_atm.c694
-rw-r--r--net/sched/sch_blackhole.c53
-rw-r--r--net/sched/sch_cbq.c2062
-rw-r--r--net/sched/sch_choke.c645
-rw-r--r--net/sched/sch_codel.c276
-rw-r--r--net/sched/sch_drr.c529
-rw-r--r--net/sched/sch_dsmark.c514
-rw-r--r--net/sched/sch_fifo.c180
-rw-r--r--net/sched/sch_fq.c873
-rw-r--r--net/sched/sch_fq_codel.c624
-rw-r--r--net/sched/sch_generic.c990
-rw-r--r--net/sched/sch_gred.c630
-rw-r--r--net/sched/sch_hfsc.c1754
-rw-r--r--net/sched/sch_hhf.c740
-rw-r--r--net/sched/sch_htb.c1630
-rw-r--r--net/sched/sch_ingress.c153
-rw-r--r--net/sched/sch_mq.c246
-rw-r--r--net/sched/sch_mqprio.c428
-rw-r--r--net/sched/sch_multiq.c444
-rw-r--r--net/sched/sch_netem.c1116
-rw-r--r--net/sched/sch_pie.c566
-rw-r--r--net/sched/sch_plug.c233
-rw-r--r--net/sched/sch_prio.c408
-rw-r--r--net/sched/sch_qfq.c1587
-rw-r--r--net/sched/sch_red.c391
-rw-r--r--net/sched/sch_sfb.c728
-rw-r--r--net/sched/sch_sfq.c939
-rw-r--r--net/sched/sch_tbf.c579
-rw-r--r--net/sched/sch_teql.c528
-rw-r--r--net/sctp/Kconfig103
-rw-r--r--net/sctp/Makefile21
-rw-r--r--net/sctp/associola.c1691
-rw-r--r--net/sctp/auth.c953
-rw-r--r--net/sctp/bind_addr.c552
-rw-r--r--net/sctp/chunk.c365
-rw-r--r--net/sctp/debug.c172
-rw-r--r--net/sctp/endpointola.c501
-rw-r--r--net/sctp/input.c1141
-rw-r--r--net/sctp/inqueue.c214
-rw-r--r--net/sctp/ipv6.c1076
-rw-r--r--net/sctp/objcnt.c142
-rw-r--r--net/sctp/output.c788
-rw-r--r--net/sctp/outqueue.c1787
-rw-r--r--net/sctp/primitive.c213
-rw-r--r--net/sctp/probe.c243
-rw-r--r--net/sctp/proc.c555
-rw-r--r--net/sctp/protocol.c1550
-rw-r--r--net/sctp/sm_make_chunk.c3501
-rw-r--r--net/sctp/sm_sideeffect.c1751
-rw-r--r--net/sctp/sm_statefuns.c6301
-rw-r--r--net/sctp/sm_statetable.c933
-rw-r--r--net/sctp/socket.c7419
-rw-r--r--net/sctp/ssnmap.c125
-rw-r--r--net/sctp/sysctl.c501
-rw-r--r--net/sctp/transport.c656
-rw-r--r--net/sctp/tsnmap.c379
-rw-r--r--net/sctp/ulpevent.c1065
-rw-r--r--net/sctp/ulpqueue.c1144
-rw-r--r--net/socket.c3304
-rw-r--r--net/sunrpc/Kconfig75
-rw-r--r--net/sunrpc/Makefile20
-rw-r--r--net/sunrpc/addr.c357
-rw-r--r--net/sunrpc/auth.c891
-rw-r--r--net/sunrpc/auth_generic.c290
-rw-r--r--net/sunrpc/auth_gss/Makefile14
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2097
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c234
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c988
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c327
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c788
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c229
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c166
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c226
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c626
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c481
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c384
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.h48
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c848
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.h267
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c1862
-rw-r--r--net/sunrpc/auth_null.c144
-rw-r--r--net/sunrpc/auth_unix.c247
-rw-r--r--net/sunrpc/backchannel_rqst.c325
-rw-r--r--net/sunrpc/bc_svc.c63
-rw-r--r--net/sunrpc/cache.c1826
-rw-r--r--net/sunrpc/clnt.c2478
-rw-r--r--net/sunrpc/debugfs.c298
-rw-r--r--net/sunrpc/netns.h42
-rw-r--r--net/sunrpc/rpc_pipe.c1527
-rw-r--r--net/sunrpc/rpcb_clnt.c1151
-rw-r--r--net/sunrpc/sched.c1141
-rw-r--r--net/sunrpc/socklib.c187
-rw-r--r--net/sunrpc/stats.c305
-rw-r--r--net/sunrpc/sunrpc.h66
-rw-r--r--net/sunrpc/sunrpc_syms.c136
-rw-r--r--net/sunrpc/svc.c1405
-rw-r--r--net/sunrpc/svc_xprt.c1371
-rw-r--r--net/sunrpc/svcauth.c166
-rw-r--r--net/sunrpc/svcauth_unix.c914
-rw-r--r--net/sunrpc/svcsock.c1672
-rw-r--r--net/sunrpc/sysctl.c185
-rw-r--r--net/sunrpc/timer.c122
-rw-r--r--net/sunrpc/xdr.c1515
-rw-r--r--net/sunrpc/xprt.c1416
-rw-r--r--net/sunrpc/xprtrdma/Makefile9
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c208
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c353
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c94
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c889
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c302
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c370
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c672
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c560
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c1366
-rw-r--r--net/sunrpc/xprtrdma/transport.c758
-rw-r--r--net/sunrpc/xprtrdma/verbs.c1672
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h496
-rw-r--r--net/sunrpc/xprtsock.c3034
-rw-r--r--net/switchdev/Kconfig13
-rw-r--r--net/switchdev/Makefile5
-rw-r--r--net/switchdev/switchdev.c404
-rw-r--r--net/sysctl_net.c114
-rw-r--r--net/tipc/Kconfig36
-rw-r--r--net/tipc/Makefile15
-rw-r--r--net/tipc/addr.c153
-rw-r--r--net/tipc/addr.h68
-rw-r--r--net/tipc/bcast.c986
-rw-r--r--net/tipc/bcast.h136
-rw-r--r--net/tipc/bearer.c1025
-rw-r--r--net/tipc/bearer.h221
-rw-r--r--net/tipc/core.c169
-rw-r--r--net/tipc/core.h116
-rw-r--r--net/tipc/discover.c418
-rw-r--r--net/tipc/discover.h51
-rw-r--r--net/tipc/eth_media.c99
-rw-r--r--net/tipc/ib_media.c101
-rw-r--r--net/tipc/link.c2272
-rw-r--r--net/tipc/link.h311
-rw-r--r--net/tipc/msg.c574
-rw-r--r--net/tipc/msg.h873
-rw-r--r--net/tipc/name_distr.c437
-rw-r--r--net/tipc/name_distr.h79
-rw-r--r--net/tipc/name_table.c1070
-rw-r--r--net/tipc/name_table.h133
-rw-r--r--net/tipc/net.c254
-rw-r--r--net/tipc/net.h49
-rw-r--r--net/tipc/netlink.c178
-rw-r--r--net/tipc/netlink.h53
-rw-r--r--net/tipc/netlink_compat.c1084
-rw-r--r--net/tipc/node.c621
-rw-r--r--net/tipc/node.h187
-rw-r--r--net/tipc/server.c634
-rw-r--r--net/tipc/server.h97
-rw-r--r--net/tipc/socket.c2837
-rw-r--r--net/tipc/socket.h56
-rw-r--r--net/tipc/subscr.c377
-rw-r--r--net/tipc/subscr.h83
-rw-r--r--net/tipc/sysctl.c71
-rw-r--r--net/tipc/udp_media.c448
-rw-r--r--net/unix/Kconfig28
-rw-r--r--net/unix/Makefile11
-rw-r--r--net/unix/af_unix.c2467
-rw-r--r--net/unix/diag.c328
-rw-r--r--net/unix/garbage.c372
-rw-r--r--net/unix/sysctl_net_unix.c61
-rw-r--r--net/vmw_vsock/Kconfig28
-rw-r--r--net/vmw_vsock/Makefile7
-rw-r--r--net/vmw_vsock/af_vsock.c1999
-rw-r--r--net/vmw_vsock/vmci_transport.c2169
-rw-r--r--net/vmw_vsock/vmci_transport.h142
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c680
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h83
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c438
-rw-r--r--net/vmw_vsock/vsock_addr.c75
-rw-r--r--net/wimax/Kconfig39
-rw-r--r--net/wimax/Makefile14
-rw-r--r--net/wimax/debug-levels.h43
-rw-r--r--net/wimax/debugfs.c80
-rw-r--r--net/wimax/id-table.c145
-rw-r--r--net/wimax/op-msg.c407
-rw-r--r--net/wimax/op-reset.c123
-rw-r--r--net/wimax/op-rfkill.c447
-rw-r--r--net/wimax/op-state-get.c65
-rw-r--r--net/wimax/stack.c632
-rw-r--r--net/wimax/wimax-internal.h103
-rw-r--r--net/wireless/.gitignore1
-rw-r--r--net/wireless/Kconfig219
-rw-r--r--net/wireless/Makefile25
-rw-r--r--net/wireless/ap.c51
-rw-r--r--net/wireless/chan.c916
-rw-r--r--net/wireless/core.c1224
-rw-r--r--net/wireless/core.h492
-rw-r--r--net/wireless/db.txt17
-rw-r--r--net/wireless/debugfs.c117
-rw-r--r--net/wireless/debugfs.h11
-rw-r--r--net/wireless/ethtool.c24
-rw-r--r--net/wireless/genregdb.awk158
-rw-r--r--net/wireless/ibss.c542
-rw-r--r--net/wireless/lib80211.c257
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c479
-rw-r--r--net/wireless/lib80211_crypt_tkip.c762
-rw-r--r--net/wireless/lib80211_crypt_wep.c289
-rw-r--r--net/wireless/mesh.c279
-rw-r--r--net/wireless/mlme.c785
-rw-r--r--net/wireless/nl80211.c12984
-rw-r--r--net/wireless/nl80211.h97
-rw-r--r--net/wireless/ocb.c88
-rw-r--r--net/wireless/radiotap.c369
-rw-r--r--net/wireless/rdev-ops.h1021
-rw-r--r--net/wireless/reg.c3205
-rw-r--r--net/wireless/reg.h146
-rw-r--r--net/wireless/regdb.h23
-rw-r--r--net/wireless/scan.c1679
-rw-r--r--net/wireless/sme.c1051
-rw-r--r--net/wireless/sysfs.c170
-rw-r--r--net/wireless/sysfs.h9
-rw-r--r--net/wireless/trace.c7
-rw-r--r--net/wireless/trace.h2824
-rw-r--r--net/wireless/util.c1814
-rw-r--r--net/wireless/wext-compat.c1531
-rw-r--r--net/wireless/wext-compat.h63
-rw-r--r--net/wireless/wext-core.c1088
-rw-r--r--net/wireless/wext-priv.c249
-rw-r--r--net/wireless/wext-proc.c156
-rw-r--r--net/wireless/wext-sme.c414
-rw-r--r--net/wireless/wext-spy.c232
-rw-r--r--net/x25/Kconfig35
-rw-r--r--net/x25/Makefile10
-rw-r--r--net/x25/af_x25.c1850
-rw-r--r--net/x25/sysctl_net_x25.c84
-rw-r--r--net/x25/x25_dev.c232
-rw-r--r--net/x25/x25_facilities.c354
-rw-r--r--net/x25/x25_forward.c170
-rw-r--r--net/x25/x25_in.c419
-rw-r--r--net/x25/x25_link.c411
-rw-r--r--net/x25/x25_out.c231
-rw-r--r--net/x25/x25_proc.c248
-rw-r--r--net/x25/x25_route.c226
-rw-r--r--net/x25/x25_subr.c390
-rw-r--r--net/x25/x25_timer.c174
-rw-r--r--net/xfrm/Kconfig85
-rw-r--r--net/xfrm/Makefile11
-rw-r--r--net/xfrm/xfrm_algo.c800
-rw-r--r--net/xfrm/xfrm_hash.c39
-rw-r--r--net/xfrm/xfrm_hash.h192
-rw-r--r--net/xfrm/xfrm_input.c396
-rw-r--r--net/xfrm/xfrm_ipcomp.c386
-rw-r--r--net/xfrm/xfrm_output.c236
-rw-r--r--net/xfrm/xfrm_policy.c3397
-rw-r--r--net/xfrm/xfrm_proc.c86
-rw-r--r--net/xfrm/xfrm_replay.c605
-rw-r--r--net/xfrm/xfrm_state.c2294
-rw-r--r--net/xfrm/xfrm_sysctl.c86
-rw-r--r--net/xfrm/xfrm_user.c3146
1579 files changed, 890937 insertions, 0 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
new file mode 100644
index 000000000..7fa0f382e
--- /dev/null
+++ b/net/6lowpan/Kconfig
@@ -0,0 +1,61 @@
+menuconfig 6LOWPAN
+ tristate "6LoWPAN Support"
+ depends on IPV6
+ ---help---
+ This enables IPv6 over Low power Wireless Personal Area Network -
+ "6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks.
+
+menuconfig 6LOWPAN_NHC
+ tristate "Next Header Compression Support"
+ depends on 6LOWPAN
+ default y
+ ---help---
+ Support for next header compression.
+
+if 6LOWPAN_NHC
+
+config 6LOWPAN_NHC_DEST
+ tristate "Destination Options Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Destination Options Header compression according to
+ RFC6282.
+
+config 6LOWPAN_NHC_FRAGMENT
+ tristate "Fragment Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Fragment Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_HOP
+ tristate "Hop-by-Hop Options Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to
+ RFC6282.
+
+config 6LOWPAN_NHC_IPV6
+ tristate "IPv6 Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_MOBILITY
+ tristate "Mobility Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Mobility Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_ROUTING
+ tristate "Routing Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 Routing Header compression according to RFC6282.
+
+config 6LOWPAN_NHC_UDP
+ tristate "UDP Header Support"
+ default y
+ ---help---
+ 6LoWPAN IPv6 UDP Header compression according to RFC6282.
+
+endif
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
new file mode 100644
index 000000000..eb8baa72a
--- /dev/null
+++ b/net/6lowpan/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_6LOWPAN) += 6lowpan.o
+
+6lowpan-y := iphc.o nhc.o
+
+#rfc6282 nhcs
+obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o
+obj-$(CONFIG_6LOWPAN_NHC_FRAGMENT) += nhc_fragment.o
+obj-$(CONFIG_6LOWPAN_NHC_HOP) += nhc_hop.o
+obj-$(CONFIG_6LOWPAN_NHC_IPV6) += nhc_ipv6.o
+obj-$(CONFIG_6LOWPAN_NHC_MOBILITY) += nhc_mobility.o
+obj-$(CONFIG_6LOWPAN_NHC_ROUTING) += nhc_routing.o
+obj-$(CONFIG_6LOWPAN_NHC_UDP) += nhc_udp.o
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
new file mode 100644
index 000000000..94a375c04
--- /dev/null
+++ b/net/6lowpan/iphc.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011, Siemens AG
+ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+/* Based on patches from Jon Smirl <jonsmirl@gmail.com>
+ * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* Jon's code is based on 6lowpan implementation for Contiki which is:
+ * Copyright (c) 2008, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/bitops.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/6lowpan.h>
+#include <net/ipv6.h>
+#include <net/af_ieee802154.h>
+
+#include "nhc.h"
+
+/* Uncompress address function for source and
+ * destination address(non-multicast).
+ *
+ * address_mode is sam value or dam value.
+ */
+static int uncompress_addr(struct sk_buff *skb,
+ struct in6_addr *ipaddr, const u8 address_mode,
+ const u8 *lladdr, const u8 addr_type,
+ const u8 addr_len)
+{
+ bool fail;
+
+ switch (address_mode) {
+ case LOWPAN_IPHC_ADDR_00:
+ /* for global link addresses */
+ fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+ break;
+ case LOWPAN_IPHC_ADDR_01:
+ /* fe:80::XXXX:XXXX:XXXX:XXXX */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+ break;
+ case LOWPAN_IPHC_ADDR_02:
+ /* fe:80::ff:fe00:XXXX */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+ break;
+ case LOWPAN_IPHC_ADDR_03:
+ fail = false;
+ switch (addr_type) {
+ case IEEE802154_ADDR_LONG:
+ /* fe:80::XXXX:XXXX:XXXX:XXXX
+ * \_________________/
+ * hwaddr
+ */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ memcpy(&ipaddr->s6_addr[8], lladdr, addr_len);
+ /* second bit-flip (Universe/Local)
+ * is done according RFC2464
+ */
+ ipaddr->s6_addr[8] ^= 0x02;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ /* fe:80::ff:fe00:XXXX
+ * \__/
+ * short_addr
+ *
+ * Universe/Local bit is zero.
+ */
+ ipaddr->s6_addr[0] = 0xFE;
+ ipaddr->s6_addr[1] = 0x80;
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ ipaddr->s6_addr16[7] = htons(*((u16 *)lladdr));
+ break;
+ default:
+ pr_debug("Invalid addr_type set\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ pr_debug("Invalid address mode value: 0x%x\n", address_mode);
+ return -EINVAL;
+ }
+
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
+ raw_dump_inline(NULL, "Reconstructed ipv6 addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+/* Uncompress address function for source context
+ * based address(non-multicast).
+ */
+static int uncompress_context_based_src_addr(struct sk_buff *skb,
+ struct in6_addr *ipaddr,
+ const u8 sam)
+{
+ switch (sam) {
+ case LOWPAN_IPHC_ADDR_00:
+ /* unspec address ::
+ * Do nothing, address is already ::
+ */
+ break;
+ case LOWPAN_IPHC_ADDR_01:
+ /* TODO */
+ case LOWPAN_IPHC_ADDR_02:
+ /* TODO */
+ case LOWPAN_IPHC_ADDR_03:
+ /* TODO */
+ netdev_warn(skb->dev, "SAM value 0x%x not supported\n", sam);
+ return -EINVAL;
+ default:
+ pr_debug("Invalid sam value: 0x%x\n", sam);
+ return -EINVAL;
+ }
+
+ raw_dump_inline(NULL,
+ "Reconstructed context based ipv6 src addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+/* Uncompress function for multicast destination address,
+ * when M bit is set.
+ */
+static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
+ struct in6_addr *ipaddr,
+ const u8 dam)
+{
+ bool fail;
+
+ switch (dam) {
+ case LOWPAN_IPHC_DAM_00:
+ /* 00: 128 bits. The full address
+ * is carried in-line.
+ */
+ fail = lowpan_fetch_skb(skb, ipaddr->s6_addr, 16);
+ break;
+ case LOWPAN_IPHC_DAM_01:
+ /* 01: 48 bits. The address takes
+ * the form ffXX::00XX:XXXX:XXXX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[11], 5);
+ break;
+ case LOWPAN_IPHC_DAM_10:
+ /* 10: 32 bits. The address takes
+ * the form ffXX::00XX:XXXX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 1);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[13], 3);
+ break;
+ case LOWPAN_IPHC_DAM_11:
+ /* 11: 8 bits. The address takes
+ * the form ff02::00XX.
+ */
+ ipaddr->s6_addr[0] = 0xFF;
+ ipaddr->s6_addr[1] = 0x02;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[15], 1);
+ break;
+ default:
+ pr_debug("DAM value has a wrong value: 0x%x\n", dam);
+ return -EINVAL;
+ }
+
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
+ raw_dump_inline(NULL, "Reconstructed ipv6 multicast addr is",
+ ipaddr->s6_addr, 16);
+
+ return 0;
+}
+
+/* TTL uncompression values */
+static const u8 lowpan_ttl_values[] = { 0, 1, 64, 255 };
+
+int
+lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev,
+ const u8 *saddr, const u8 saddr_type,
+ const u8 saddr_len, const u8 *daddr,
+ const u8 daddr_type, const u8 daddr_len,
+ u8 iphc0, u8 iphc1)
+{
+ struct ipv6hdr hdr = {};
+ u8 tmp, num_context = 0;
+ int err;
+
+ raw_dump_table(__func__, "raw skb data dump uncompressed",
+ skb->data, skb->len);
+
+ /* another if the CID flag is set */
+ if (iphc1 & LOWPAN_IPHC_CID) {
+ pr_debug("CID flag is set, increase header with one\n");
+ if (lowpan_fetch_skb(skb, &num_context, sizeof(num_context)))
+ return -EINVAL;
+ }
+
+ hdr.version = 6;
+
+ /* Traffic Class and Flow Label */
+ switch ((iphc0 & LOWPAN_IPHC_TF) >> 3) {
+ /* Traffic Class and FLow Label carried in-line
+ * ECN + DSCP + 4-bit Pad + Flow Label (4 bytes)
+ */
+ case 0: /* 00b */
+ if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ return -EINVAL;
+
+ memcpy(&hdr.flow_lbl, &skb->data[0], 3);
+ skb_pull(skb, 3);
+ hdr.priority = ((tmp >> 2) & 0x0f);
+ hdr.flow_lbl[0] = ((tmp >> 2) & 0x30) | (tmp << 6) |
+ (hdr.flow_lbl[0] & 0x0f);
+ break;
+ /* Traffic class carried in-line
+ * ECN + DSCP (1 byte), Flow Label is elided
+ */
+ case 2: /* 10b */
+ if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ return -EINVAL;
+
+ hdr.priority = ((tmp >> 2) & 0x0f);
+ hdr.flow_lbl[0] = ((tmp << 6) & 0xC0) | ((tmp >> 2) & 0x30);
+ break;
+ /* Flow Label carried in-line
+ * ECN + 2-bit Pad + Flow Label (3 bytes), DSCP is elided
+ */
+ case 1: /* 01b */
+ if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp)))
+ return -EINVAL;
+
+ hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30);
+ memcpy(&hdr.flow_lbl[1], &skb->data[0], 2);
+ skb_pull(skb, 2);
+ break;
+ /* Traffic Class and Flow Label are elided */
+ case 3: /* 11b */
+ break;
+ default:
+ break;
+ }
+
+ /* Next Header */
+ if ((iphc0 & LOWPAN_IPHC_NH_C) == 0) {
+ /* Next header is carried inline */
+ if (lowpan_fetch_skb(skb, &hdr.nexthdr, sizeof(hdr.nexthdr)))
+ return -EINVAL;
+
+ pr_debug("NH flag is set, next header carried inline: %02x\n",
+ hdr.nexthdr);
+ }
+
+ /* Hop Limit */
+ if ((iphc0 & 0x03) != LOWPAN_IPHC_TTL_I) {
+ hdr.hop_limit = lowpan_ttl_values[iphc0 & 0x03];
+ } else {
+ if (lowpan_fetch_skb(skb, &hdr.hop_limit,
+ sizeof(hdr.hop_limit)))
+ return -EINVAL;
+ }
+
+ /* Extract SAM to the tmp variable */
+ tmp = ((iphc1 & LOWPAN_IPHC_SAM) >> LOWPAN_IPHC_SAM_BIT) & 0x03;
+
+ if (iphc1 & LOWPAN_IPHC_SAC) {
+ /* Source address context based uncompression */
+ pr_debug("SAC bit is set. Handle context based source address.\n");
+ err = uncompress_context_based_src_addr(skb, &hdr.saddr, tmp);
+ } else {
+ /* Source address uncompression */
+ pr_debug("source address stateless compression\n");
+ err = uncompress_addr(skb, &hdr.saddr, tmp, saddr,
+ saddr_type, saddr_len);
+ }
+
+ /* Check on error of previous branch */
+ if (err)
+ return -EINVAL;
+
+ /* Extract DAM to the tmp variable */
+ tmp = ((iphc1 & LOWPAN_IPHC_DAM_11) >> LOWPAN_IPHC_DAM_BIT) & 0x03;
+
+ /* check for Multicast Compression */
+ if (iphc1 & LOWPAN_IPHC_M) {
+ if (iphc1 & LOWPAN_IPHC_DAC) {
+ pr_debug("dest: context-based mcast compression\n");
+ /* TODO: implement this */
+ } else {
+ err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
+ tmp);
+
+ if (err)
+ return -EINVAL;
+ }
+ } else {
+ err = uncompress_addr(skb, &hdr.daddr, tmp, daddr,
+ daddr_type, daddr_len);
+ pr_debug("dest: stateless compression mode %d dest %pI6c\n",
+ tmp, &hdr.daddr);
+ if (err)
+ return -EINVAL;
+ }
+
+ /* Next header data uncompression */
+ if (iphc0 & LOWPAN_IPHC_NH_C) {
+ err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
+ if (err < 0)
+ return err;
+ } else {
+ err = skb_cow(skb, sizeof(hdr));
+ if (unlikely(err))
+ return err;
+ }
+
+ hdr.payload_len = htons(skb->len);
+
+ pr_debug("skb headroom size = %d, data length = %d\n",
+ skb_headroom(skb), skb->len);
+
+ pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n\t"
+ "nexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n",
+ hdr.version, ntohs(hdr.payload_len), hdr.nexthdr,
+ hdr.hop_limit, &hdr.daddr);
+
+ skb_push(skb, sizeof(hdr));
+ skb_reset_network_header(skb);
+ skb_copy_to_linear_data(skb, &hdr, sizeof(hdr));
+
+ raw_dump_table(__func__, "raw header dump", (u8 *)&hdr, sizeof(hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lowpan_header_decompress);
+
+static u8 lowpan_compress_addr_64(u8 **hc_ptr, u8 shift,
+ const struct in6_addr *ipaddr,
+ const unsigned char *lladdr)
+{
+ u8 val = 0;
+
+ if (is_addr_mac_addr_based(ipaddr, lladdr)) {
+ val = 3; /* 0-bits */
+ pr_debug("address compression 0 bits\n");
+ } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) {
+ /* compress IID to 16 bits xxxx::XXXX */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2);
+ val = 2; /* 16-bits */
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)",
+ *hc_ptr - 2, 2);
+ } else {
+ /* do not compress IID => xxxx::IID */
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8);
+ val = 1; /* 64-bits */
+ raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)",
+ *hc_ptr - 8, 8);
+ }
+
+ return rol8(val, shift);
+}
+
+int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len)
+{
+ u8 tmp, iphc0, iphc1, *hc_ptr;
+ struct ipv6hdr *hdr;
+ u8 head[100] = {};
+ int ret, addr_type;
+
+ if (type != ETH_P_IPV6)
+ return -EINVAL;
+
+ hdr = ipv6_hdr(skb);
+ hc_ptr = head + 2;
+
+ pr_debug("IPv6 header dump:\n\tversion = %d\n\tlength = %d\n"
+ "\tnexthdr = 0x%02x\n\thop_lim = %d\n\tdest = %pI6c\n",
+ hdr->version, ntohs(hdr->payload_len), hdr->nexthdr,
+ hdr->hop_limit, &hdr->daddr);
+
+ raw_dump_table(__func__, "raw skb network header dump",
+ skb_network_header(skb), sizeof(struct ipv6hdr));
+
+ /* As we copy some bit-length fields, in the IPHC encoding bytes,
+ * we sometimes use |=
+ * If the field is 0, and the current bit value in memory is 1,
+ * this does not work. We therefore reset the IPHC encoding here
+ */
+ iphc0 = LOWPAN_DISPATCH_IPHC;
+ iphc1 = 0;
+
+ /* TODO: context lookup */
+
+ raw_dump_inline(__func__, "saddr",
+ (unsigned char *)_saddr, IEEE802154_ADDR_LEN);
+ raw_dump_inline(__func__, "daddr",
+ (unsigned char *)_daddr, IEEE802154_ADDR_LEN);
+
+ raw_dump_table(__func__, "sending raw skb network uncompressed packet",
+ skb->data, skb->len);
+
+ /* Traffic class, flow label
+ * If flow label is 0, compress it. If traffic class is 0, compress it
+ * We have to process both in the same time as the offset of traffic
+ * class depends on the presence of version and flow label
+ */
+
+ /* hc format of TC is ECN | DSCP , original one is DSCP | ECN */
+ tmp = (hdr->priority << 4) | (hdr->flow_lbl[0] >> 4);
+ tmp = ((tmp & 0x03) << 6) | (tmp >> 2);
+
+ if (((hdr->flow_lbl[0] & 0x0F) == 0) &&
+ (hdr->flow_lbl[1] == 0) && (hdr->flow_lbl[2] == 0)) {
+ /* flow label can be compressed */
+ iphc0 |= LOWPAN_IPHC_FL_C;
+ if ((hdr->priority == 0) &&
+ ((hdr->flow_lbl[0] & 0xF0) == 0)) {
+ /* compress (elide) all */
+ iphc0 |= LOWPAN_IPHC_TC_C;
+ } else {
+ /* compress only the flow label */
+ *hc_ptr = tmp;
+ hc_ptr += 1;
+ }
+ } else {
+ /* Flow label cannot be compressed */
+ if ((hdr->priority == 0) &&
+ ((hdr->flow_lbl[0] & 0xF0) == 0)) {
+ /* compress only traffic class */
+ iphc0 |= LOWPAN_IPHC_TC_C;
+ *hc_ptr = (tmp & 0xc0) | (hdr->flow_lbl[0] & 0x0F);
+ memcpy(hc_ptr + 1, &hdr->flow_lbl[1], 2);
+ hc_ptr += 3;
+ } else {
+ /* compress nothing */
+ memcpy(hc_ptr, hdr, 4);
+ /* replace the top byte with new ECN | DSCP format */
+ *hc_ptr = tmp;
+ hc_ptr += 4;
+ }
+ }
+
+ /* NOTE: payload length is always compressed */
+
+ /* Check if we provide the nhc format for nexthdr and compression
+ * functionality. If not nexthdr is handled inline and not compressed.
+ */
+ ret = lowpan_nhc_check_compression(skb, hdr, &hc_ptr, &iphc0);
+ if (ret < 0)
+ return ret;
+
+ /* Hop limit
+ * if 1: compress, encoding is 01
+ * if 64: compress, encoding is 10
+ * if 255: compress, encoding is 11
+ * else do not compress
+ */
+ switch (hdr->hop_limit) {
+ case 1:
+ iphc0 |= LOWPAN_IPHC_TTL_1;
+ break;
+ case 64:
+ iphc0 |= LOWPAN_IPHC_TTL_64;
+ break;
+ case 255:
+ iphc0 |= LOWPAN_IPHC_TTL_255;
+ break;
+ default:
+ lowpan_push_hc_data(&hc_ptr, &hdr->hop_limit,
+ sizeof(hdr->hop_limit));
+ }
+
+ addr_type = ipv6_addr_type(&hdr->saddr);
+ /* source address compression */
+ if (addr_type == IPV6_ADDR_ANY) {
+ pr_debug("source address is unspecified, setting SAC\n");
+ iphc1 |= LOWPAN_IPHC_SAC;
+ } else {
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ LOWPAN_IPHC_SAM_BIT,
+ &hdr->saddr, _saddr);
+ pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
+ &hdr->saddr, iphc1);
+ } else {
+ pr_debug("send the full source address\n");
+ lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16);
+ }
+ }
+
+ addr_type = ipv6_addr_type(&hdr->daddr);
+ /* destination address compression */
+ if (addr_type & IPV6_ADDR_MULTICAST) {
+ pr_debug("destination address is multicast: ");
+ iphc1 |= LOWPAN_IPHC_M;
+ if (lowpan_is_mcast_addr_compressable8(&hdr->daddr)) {
+ pr_debug("compressed to 1 octet\n");
+ iphc1 |= LOWPAN_IPHC_DAM_11;
+ /* use last byte */
+ lowpan_push_hc_data(&hc_ptr,
+ &hdr->daddr.s6_addr[15], 1);
+ } else if (lowpan_is_mcast_addr_compressable32(&hdr->daddr)) {
+ pr_debug("compressed to 4 octets\n");
+ iphc1 |= LOWPAN_IPHC_DAM_10;
+ /* second byte + the last three */
+ lowpan_push_hc_data(&hc_ptr,
+ &hdr->daddr.s6_addr[1], 1);
+ lowpan_push_hc_data(&hc_ptr,
+ &hdr->daddr.s6_addr[13], 3);
+ } else if (lowpan_is_mcast_addr_compressable48(&hdr->daddr)) {
+ pr_debug("compressed to 6 octets\n");
+ iphc1 |= LOWPAN_IPHC_DAM_01;
+ /* second byte + the last five */
+ lowpan_push_hc_data(&hc_ptr,
+ &hdr->daddr.s6_addr[1], 1);
+ lowpan_push_hc_data(&hc_ptr,
+ &hdr->daddr.s6_addr[11], 5);
+ } else {
+ pr_debug("using full address\n");
+ iphc1 |= LOWPAN_IPHC_DAM_00;
+ lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
+ }
+ } else {
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ /* TODO: context lookup */
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ LOWPAN_IPHC_DAM_BIT, &hdr->daddr, _daddr);
+ pr_debug("dest address unicast link-local %pI6c "
+ "iphc1 0x%02x\n", &hdr->daddr, iphc1);
+ } else {
+ pr_debug("dest address unicast %pI6c\n", &hdr->daddr);
+ lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
+ }
+ }
+
+ /* next header compression */
+ if (iphc0 & LOWPAN_IPHC_NH_C) {
+ ret = lowpan_nhc_do_compression(skb, hdr, &hc_ptr);
+ if (ret < 0)
+ return ret;
+ }
+
+ head[0] = iphc0;
+ head[1] = iphc1;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+ memcpy(skb_push(skb, hc_ptr - head), head, hc_ptr - head);
+ skb_reset_network_header(skb);
+
+ pr_debug("header len %d skb %u\n", (int)(hc_ptr - head), skb->len);
+
+ raw_dump_table(__func__, "raw skb data dump compressed",
+ skb->data, skb->len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(lowpan_header_compress);
+
+static int __init lowpan_module_init(void)
+{
+ request_module_nowait("nhc_dest");
+ request_module_nowait("nhc_fragment");
+ request_module_nowait("nhc_hop");
+ request_module_nowait("nhc_ipv6");
+ request_module_nowait("nhc_mobility");
+ request_module_nowait("nhc_routing");
+ request_module_nowait("nhc_udp");
+
+ return 0;
+}
+module_init(lowpan_module_init);
+
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
new file mode 100644
index 000000000..fd20fc51a
--- /dev/null
+++ b/net/6lowpan/nhc.c
@@ -0,0 +1,241 @@
+/*
+ * 6LoWPAN next header compression
+ *
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+
+#include <net/ipv6.h>
+
+#include "nhc.h"
+
+static struct rb_root rb_root = RB_ROOT;
+static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX];
+static DEFINE_SPINLOCK(lowpan_nhc_lock);
+
+static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
+{
+ struct rb_node **new = &rb_root.rb_node, *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new) {
+ struct lowpan_nhc *this = container_of(*new, struct lowpan_nhc,
+ node);
+ int result, len_dif, len;
+
+ len_dif = nhc->idlen - this->idlen;
+
+ if (nhc->idlen < this->idlen)
+ len = nhc->idlen;
+ else
+ len = this->idlen;
+
+ result = memcmp(nhc->id, this->id, len);
+ if (!result)
+ result = len_dif;
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&nhc->node, parent, new);
+ rb_insert_color(&nhc->node, &rb_root);
+
+ return 0;
+}
+
+static void lowpan_nhc_remove(struct lowpan_nhc *nhc)
+{
+ rb_erase(&nhc->node, &rb_root);
+}
+
+static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
+{
+ struct rb_node *node = rb_root.rb_node;
+ const u8 *nhcid_skb_ptr = skb->data;
+
+ while (node) {
+ struct lowpan_nhc *nhc = container_of(node, struct lowpan_nhc,
+ node);
+ u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
+ int result, i;
+
+ if (nhcid_skb_ptr + nhc->idlen > skb->data + skb->len)
+ return NULL;
+
+ /* copy and mask afterwards the nhid value from skb */
+ memcpy(nhcid_skb_ptr_masked, nhcid_skb_ptr, nhc->idlen);
+ for (i = 0; i < nhc->idlen; i++)
+ nhcid_skb_ptr_masked[i] &= nhc->idmask[i];
+
+ result = memcmp(nhcid_skb_ptr_masked, nhc->id, nhc->idlen);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return nhc;
+ }
+
+ return NULL;
+}
+
+int lowpan_nhc_check_compression(struct sk_buff *skb,
+ const struct ipv6hdr *hdr, u8 **hc_ptr,
+ u8 *iphc0)
+{
+ struct lowpan_nhc *nhc;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nexthdr_nhcs[hdr->nexthdr];
+ if (nhc && nhc->compress)
+ *iphc0 |= LOWPAN_IPHC_NH_C;
+ else
+ lowpan_push_hc_data(hc_ptr, &hdr->nexthdr,
+ sizeof(hdr->nexthdr));
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return 0;
+}
+
+int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ u8 **hc_ptr)
+{
+ int ret;
+ struct lowpan_nhc *nhc;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nexthdr_nhcs[hdr->nexthdr];
+ /* check if the nhc module was removed in unlocked part.
+ * TODO: this is a workaround we should prevent unloading
+ * of nhc modules while unlocked part, this will always drop
+ * the lowpan packet but it's very unlikely.
+ *
+ * Solution isn't easy because we need to decide at
+ * lowpan_nhc_check_compression if we do a compression or not.
+ * Because the inline data which is added to skb, we can't move this
+ * handling.
+ */
+ if (unlikely(!nhc || !nhc->compress)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* In the case of RAW sockets the transport header is not set by
+ * the ip6 stack so we must set it ourselves
+ */
+ if (skb->transport_header == skb->network_header)
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ ret = nhc->compress(skb, hc_ptr);
+ if (ret < 0)
+ goto out;
+
+ /* skip the transport header */
+ skb_pull(skb, nhc->nexthdrlen);
+
+out:
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return ret;
+}
+
+int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev,
+ struct ipv6hdr *hdr)
+{
+ struct lowpan_nhc *nhc;
+ int ret;
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ nhc = lowpan_nhc_by_nhcid(skb);
+ if (nhc) {
+ if (nhc->uncompress) {
+ ret = nhc->uncompress(skb, sizeof(struct ipv6hdr) +
+ nhc->nexthdrlen);
+ if (ret < 0) {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ return ret;
+ }
+ } else {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ netdev_warn(dev, "received nhc id for %s which is not implemented.\n",
+ nhc->name);
+ return -ENOTSUPP;
+ }
+ } else {
+ spin_unlock_bh(&lowpan_nhc_lock);
+ netdev_warn(dev, "received unknown nhc id which was not found.\n");
+ return -ENOENT;
+ }
+
+ hdr->nexthdr = nhc->nexthdr;
+ skb_reset_transport_header(skb);
+ raw_dump_table(__func__, "raw transport header dump",
+ skb_transport_header(skb), nhc->nexthdrlen);
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ return 0;
+}
+
+int lowpan_nhc_add(struct lowpan_nhc *nhc)
+{
+ int ret;
+
+ if (!nhc->idlen || !nhc->idsetup)
+ return -EINVAL;
+
+ WARN_ONCE(nhc->idlen > LOWPAN_NHC_MAX_ID_LEN,
+ "LOWPAN_NHC_MAX_ID_LEN should be updated to %zd.\n",
+ nhc->idlen);
+
+ nhc->idsetup(nhc);
+
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ if (lowpan_nexthdr_nhcs[nhc->nexthdr]) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = lowpan_nhc_insert(nhc);
+ if (ret < 0)
+ goto out;
+
+ lowpan_nexthdr_nhcs[nhc->nexthdr] = nhc;
+out:
+ spin_unlock_bh(&lowpan_nhc_lock);
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_nhc_add);
+
+void lowpan_nhc_del(struct lowpan_nhc *nhc)
+{
+ spin_lock_bh(&lowpan_nhc_lock);
+
+ lowpan_nhc_remove(nhc);
+ lowpan_nexthdr_nhcs[nhc->nexthdr] = NULL;
+
+ spin_unlock_bh(&lowpan_nhc_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(lowpan_nhc_del);
diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h
new file mode 100644
index 000000000..ed44938eb
--- /dev/null
+++ b/net/6lowpan/nhc.h
@@ -0,0 +1,146 @@
+#ifndef __6LOWPAN_NHC_H
+#define __6LOWPAN_NHC_H
+
+#include <linux/skbuff.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+#include <net/6lowpan.h>
+#include <net/ipv6.h>
+
+#define LOWPAN_NHC_MAX_ID_LEN 1
+
+/**
+ * LOWPAN_NHC - helper macro to generate nh id fields and lowpan_nhc struct
+ *
+ * @__nhc: variable name of the lowpan_nhc struct.
+ * @_name: const char * of common header compression name.
+ * @_nexthdr: ipv6 nexthdr field for the header compression.
+ * @_nexthdrlen: ipv6 nexthdr len for the reserved space.
+ * @_idsetup: callback to setup id and mask values.
+ * @_idlen: len for the next header id and mask, should be always the same.
+ * @_uncompress: callback for uncompression call.
+ * @_compress: callback for compression call.
+ */
+#define LOWPAN_NHC(__nhc, _name, _nexthdr, \
+ _hdrlen, _idsetup, _idlen, \
+ _uncompress, _compress) \
+static u8 __nhc##_val[_idlen]; \
+static u8 __nhc##_mask[_idlen]; \
+static struct lowpan_nhc __nhc = { \
+ .name = _name, \
+ .nexthdr = _nexthdr, \
+ .nexthdrlen = _hdrlen, \
+ .id = __nhc##_val, \
+ .idmask = __nhc##_mask, \
+ .idlen = _idlen, \
+ .idsetup = _idsetup, \
+ .uncompress = _uncompress, \
+ .compress = _compress, \
+}
+
+#define module_lowpan_nhc(__nhc) \
+static int __init __nhc##_init(void) \
+{ \
+ return lowpan_nhc_add(&(__nhc)); \
+} \
+module_init(__nhc##_init); \
+static void __exit __nhc##_exit(void) \
+{ \
+ lowpan_nhc_del(&(__nhc)); \
+} \
+module_exit(__nhc##_exit);
+
+/**
+ * struct lowpan_nhc - hold 6lowpan next hdr compression ifnformation
+ *
+ * @node: holder for the rbtree.
+ * @name: name of the specific next header compression
+ * @nexthdr: next header value of the protocol which should be compressed.
+ * @nexthdrlen: ipv6 nexthdr len for the reserved space.
+ * @id: array for nhc id. Note this need to be in network byteorder.
+ * @mask: array for nhc id mask. Note this need to be in network byteorder.
+ * @len: the length of the next header id and mask.
+ * @setup: callback to setup fill the next header id value and mask.
+ * @compress: callback to do the header compression.
+ * @uncompress: callback to do the header uncompression.
+ */
+struct lowpan_nhc {
+ struct rb_node node;
+ const char *name;
+ const u8 nexthdr;
+ const size_t nexthdrlen;
+ u8 *id;
+ u8 *idmask;
+ const size_t idlen;
+
+ void (*idsetup)(struct lowpan_nhc *nhc);
+ int (*uncompress)(struct sk_buff *skb, size_t needed);
+ int (*compress)(struct sk_buff *skb, u8 **hc_ptr);
+};
+
+/**
+ * lowpan_nhc_by_nexthdr - return the 6lowpan nhc by ipv6 nexthdr.
+ *
+ * @nexthdr: ipv6 nexthdr value.
+ */
+struct lowpan_nhc *lowpan_nhc_by_nexthdr(u8 nexthdr);
+
+/**
+ * lowpan_nhc_check_compression - checks if we support compression format. If
+ * we support the nhc by nexthdr field, the 6LoWPAN iphc NHC bit will be
+ * set. If we don't support nexthdr will be added as inline data to the
+ * 6LoWPAN header.
+ *
+ * @skb: skb of 6LoWPAN header to read nhc and replace header.
+ * @hdr: ipv6hdr to check the nexthdr value
+ * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of
+ * replaced header.
+ * @iphc0: iphc0 pointer to set the 6LoWPAN NHC bit
+ */
+int lowpan_nhc_check_compression(struct sk_buff *skb,
+ const struct ipv6hdr *hdr, u8 **hc_ptr,
+ u8 *iphc0);
+
+/**
+ * lowpan_nhc_do_compression - calling compress callback for nhc
+ *
+ * @skb: skb of 6LoWPAN header to read nhc and replace header.
+ * @hdr: ipv6hdr to set the nexthdr value
+ * @hc_ptr: pointer for 6LoWPAN header which should increment at the end of
+ * replaced header.
+ */
+int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ u8 **hc_ptr);
+
+/**
+ * lowpan_nhc_do_uncompression - calling uncompress callback for nhc
+ *
+ * @nhc: 6LoWPAN nhc context, get by lowpan_nhc_by_ functions.
+ * @skb: skb of 6LoWPAN header, skb->data should be pointed to nhc id value.
+ * @dev: netdevice for print logging information.
+ * @hdr: ipv6hdr for setting nexthdr value.
+ */
+int lowpan_nhc_do_uncompression(struct sk_buff *skb, struct net_device *dev,
+ struct ipv6hdr *hdr);
+
+/**
+ * lowpan_nhc_add - register a next header compression to framework
+ *
+ * @nhc: nhc which should be add.
+ */
+int lowpan_nhc_add(struct lowpan_nhc *nhc);
+
+/**
+ * lowpan_nhc_del - delete a next header compression from framework
+ *
+ * @nhc: nhc which should be delete.
+ */
+void lowpan_nhc_del(struct lowpan_nhc *nhc);
+
+/**
+ * lowpan_nhc_init - adding all default nhcs
+ */
+void lowpan_nhc_init(void);
+
+#endif /* __6LOWPAN_NHC_H */
diff --git a/net/6lowpan/nhc_dest.c b/net/6lowpan/nhc_dest.c
new file mode 100644
index 000000000..0b292c964
--- /dev/null
+++ b/net/6lowpan/nhc_dest.c
@@ -0,0 +1,28 @@
+/*
+ * 6LoWPAN IPv6 Destination Options Header compression according to
+ * RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_DEST_IDLEN 1
+#define LOWPAN_NHC_DEST_ID_0 0xe6
+#define LOWPAN_NHC_DEST_MASK_0 0xfe
+
+static void dest_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_DEST_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_DEST_MASK_0;
+}
+
+LOWPAN_NHC(nhc_dest, "RFC6282 Destination Options", NEXTHDR_DEST, 0,
+ dest_nhid_setup, LOWPAN_NHC_DEST_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_dest);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Destination Options compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_fragment.c b/net/6lowpan/nhc_fragment.c
new file mode 100644
index 000000000..473dbc58e
--- /dev/null
+++ b/net/6lowpan/nhc_fragment.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN IPv6 Fragment Header compression according to RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_FRAGMENT_IDLEN 1
+#define LOWPAN_NHC_FRAGMENT_ID_0 0xe4
+#define LOWPAN_NHC_FRAGMENT_MASK_0 0xfe
+
+static void fragment_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_FRAGMENT_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_FRAGMENT_MASK_0;
+}
+
+LOWPAN_NHC(nhc_fragment, "RFC6282 Fragment", NEXTHDR_FRAGMENT, 0,
+ fragment_nhid_setup, LOWPAN_NHC_FRAGMENT_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_fragment);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Fragment compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_hop.c b/net/6lowpan/nhc_hop.c
new file mode 100644
index 000000000..1eb66be16
--- /dev/null
+++ b/net/6lowpan/nhc_hop.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_HOP_IDLEN 1
+#define LOWPAN_NHC_HOP_ID_0 0xe0
+#define LOWPAN_NHC_HOP_MASK_0 0xfe
+
+static void hop_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_HOP_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_HOP_MASK_0;
+}
+
+LOWPAN_NHC(nhc_hop, "RFC6282 Hop-by-Hop Options", NEXTHDR_HOP, 0,
+ hop_nhid_setup, LOWPAN_NHC_HOP_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_hop);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Hop-by-Hop Options compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ipv6.c b/net/6lowpan/nhc_ipv6.c
new file mode 100644
index 000000000..2313d1600
--- /dev/null
+++ b/net/6lowpan/nhc_ipv6.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN IPv6 Header compression according to RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_IPV6_IDLEN 1
+#define LOWPAN_NHC_IPV6_ID_0 0xee
+#define LOWPAN_NHC_IPV6_MASK_0 0xfe
+
+static void ipv6_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_IPV6_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_IPV6_MASK_0;
+}
+
+LOWPAN_NHC(nhc_ipv6, "RFC6282 IPv6", NEXTHDR_IPV6, 0, ipv6_nhid_setup,
+ LOWPAN_NHC_IPV6_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_ipv6);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 IPv6 compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_mobility.c b/net/6lowpan/nhc_mobility.c
new file mode 100644
index 000000000..60d3f3886
--- /dev/null
+++ b/net/6lowpan/nhc_mobility.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN IPv6 Mobility Header compression according to RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_MOBILITY_IDLEN 1
+#define LOWPAN_NHC_MOBILITY_ID_0 0xe8
+#define LOWPAN_NHC_MOBILITY_MASK_0 0xfe
+
+static void mobility_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_MOBILITY_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_MOBILITY_MASK_0;
+}
+
+LOWPAN_NHC(nhc_mobility, "RFC6282 Mobility", NEXTHDR_MOBILITY, 0,
+ mobility_nhid_setup, LOWPAN_NHC_MOBILITY_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_mobility);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Mobility compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_routing.c b/net/6lowpan/nhc_routing.c
new file mode 100644
index 000000000..c393280f1
--- /dev/null
+++ b/net/6lowpan/nhc_routing.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN IPv6 Routing Header compression according to RFC6282
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_ROUTING_IDLEN 1
+#define LOWPAN_NHC_ROUTING_ID_0 0xe2
+#define LOWPAN_NHC_ROUTING_MASK_0 0xfe
+
+static void routing_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_ROUTING_ID_0;
+ nhc->idmask[0] = LOWPAN_NHC_ROUTING_MASK_0;
+}
+
+LOWPAN_NHC(nhc_routing, "RFC6282 Routing", NEXTHDR_ROUTING, 0,
+ routing_nhid_setup, LOWPAN_NHC_ROUTING_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(nhc_routing);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Routing compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c
new file mode 100644
index 000000000..c6bcaeb42
--- /dev/null
+++ b/net/6lowpan/nhc_udp.c
@@ -0,0 +1,157 @@
+/*
+ * 6LoWPAN IPv6 UDP compression according to RFC6282
+ *
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Orignal written by:
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ * Jon Smirl <jonsmirl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_NHC_UDP_IDLEN 1
+
+static int udp_uncompress(struct sk_buff *skb, size_t needed)
+{
+ u8 tmp = 0, val = 0;
+ struct udphdr uh;
+ bool fail;
+ int err;
+
+ fail = lowpan_fetch_skb(skb, &tmp, sizeof(tmp));
+
+ pr_debug("UDP header uncompression\n");
+ switch (tmp & LOWPAN_NHC_UDP_CS_P_11) {
+ case LOWPAN_NHC_UDP_CS_P_00:
+ fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source));
+ fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest));
+ break;
+ case LOWPAN_NHC_UDP_CS_P_01:
+ fail |= lowpan_fetch_skb(skb, &uh.source, sizeof(uh.source));
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.dest = htons(val + LOWPAN_NHC_UDP_8BIT_PORT);
+ break;
+ case LOWPAN_NHC_UDP_CS_P_10:
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.source = htons(val + LOWPAN_NHC_UDP_8BIT_PORT);
+ fail |= lowpan_fetch_skb(skb, &uh.dest, sizeof(uh.dest));
+ break;
+ case LOWPAN_NHC_UDP_CS_P_11:
+ fail |= lowpan_fetch_skb(skb, &val, sizeof(val));
+ uh.source = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val >> 4));
+ uh.dest = htons(LOWPAN_NHC_UDP_4BIT_PORT + (val & 0x0f));
+ break;
+ default:
+ BUG();
+ }
+
+ pr_debug("uncompressed UDP ports: src = %d, dst = %d\n",
+ ntohs(uh.source), ntohs(uh.dest));
+
+ /* checksum */
+ if (tmp & LOWPAN_NHC_UDP_CS_C) {
+ pr_debug_ratelimited("checksum elided currently not supported\n");
+ fail = true;
+ } else {
+ fail |= lowpan_fetch_skb(skb, &uh.check, sizeof(uh.check));
+ }
+
+ if (fail)
+ return -EINVAL;
+
+ /* UDP length needs to be infered from the lower layers
+ * here, we obtain the hint from the remaining size of the
+ * frame
+ */
+ uh.len = htons(skb->len + sizeof(struct udphdr));
+ pr_debug("uncompressed UDP length: src = %d", ntohs(uh.len));
+
+ /* replace the compressed UDP head by the uncompressed UDP
+ * header
+ */
+ err = skb_cow(skb, needed);
+ if (unlikely(err))
+ return err;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_copy_to_linear_data(skb, &uh, sizeof(struct udphdr));
+
+ return 0;
+}
+
+static int udp_compress(struct sk_buff *skb, u8 **hc_ptr)
+{
+ const struct udphdr *uh = udp_hdr(skb);
+ u8 tmp;
+
+ if (((ntohs(uh->source) & LOWPAN_NHC_UDP_4BIT_MASK) ==
+ LOWPAN_NHC_UDP_4BIT_PORT) &&
+ ((ntohs(uh->dest) & LOWPAN_NHC_UDP_4BIT_MASK) ==
+ LOWPAN_NHC_UDP_4BIT_PORT)) {
+ pr_debug("UDP header: both ports compression to 4 bits\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_11;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source and destination port */
+ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_4BIT_PORT +
+ ((ntohs(uh->source) - LOWPAN_NHC_UDP_4BIT_PORT) << 4);
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ } else if ((ntohs(uh->dest) & LOWPAN_NHC_UDP_8BIT_MASK) ==
+ LOWPAN_NHC_UDP_8BIT_PORT) {
+ pr_debug("UDP header: remove 8 bits of dest\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_01;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source));
+ /* destination port */
+ tmp = ntohs(uh->dest) - LOWPAN_NHC_UDP_8BIT_PORT;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ } else if ((ntohs(uh->source) & LOWPAN_NHC_UDP_8BIT_MASK) ==
+ LOWPAN_NHC_UDP_8BIT_PORT) {
+ pr_debug("UDP header: remove 8 bits of source\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_10;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ tmp = ntohs(uh->source) - LOWPAN_NHC_UDP_8BIT_PORT;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* destination port */
+ lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest));
+ } else {
+ pr_debug("UDP header: can't compress\n");
+ /* compression value */
+ tmp = LOWPAN_NHC_UDP_CS_P_00;
+ lowpan_push_hc_data(hc_ptr, &tmp, sizeof(tmp));
+ /* source port */
+ lowpan_push_hc_data(hc_ptr, &uh->source, sizeof(uh->source));
+ /* destination port */
+ lowpan_push_hc_data(hc_ptr, &uh->dest, sizeof(uh->dest));
+ }
+
+ /* checksum is always inline */
+ lowpan_push_hc_data(hc_ptr, &uh->check, sizeof(uh->check));
+
+ return 0;
+}
+
+static void udp_nhid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_NHC_UDP_ID;
+ nhc->idmask[0] = LOWPAN_NHC_UDP_MASK;
+}
+
+LOWPAN_NHC(nhc_udp, "RFC6282 UDP", NEXTHDR_UDP, sizeof(struct udphdr),
+ udp_nhid_setup, LOWPAN_NHC_UDP_IDLEN, udp_uncompress, udp_compress);
+
+module_lowpan_nhc(nhc_udp);
+MODULE_DESCRIPTION("6LoWPAN next header RFC6282 UDP compression");
+MODULE_LICENSE("GPL");
diff --git a/net/802/Kconfig b/net/802/Kconfig
new file mode 100644
index 000000000..80d4bf789
--- /dev/null
+++ b/net/802/Kconfig
@@ -0,0 +1,10 @@
+config STP
+ tristate
+ select LLC
+
+config GARP
+ tristate
+ select STP
+
+config MRP
+ tristate
diff --git a/net/802/Makefile b/net/802/Makefile
new file mode 100644
index 000000000..37e654d66
--- /dev/null
+++ b/net/802/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the Linux 802.x protocol layers.
+#
+
+# Check the p8022 selections against net/core/Makefile.
+obj-$(CONFIG_LLC) += p8022.o psnap.o
+obj-$(CONFIG_NET_FC) += fc.o
+obj-$(CONFIG_FDDI) += fddi.o
+obj-$(CONFIG_HIPPI) += hippi.o
+obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o
+obj-$(CONFIG_ATALK) += p8022.o psnap.o
+obj-$(CONFIG_STP) += stp.o
+obj-$(CONFIG_GARP) += garp.o
+obj-$(CONFIG_MRP) += mrp.o
diff --git a/net/802/fc.c b/net/802/fc.c
new file mode 100644
index 000000000..7b9219022
--- /dev/null
+++ b/net/802/fc.c
@@ -0,0 +1,110 @@
+/*
+ * NET3: Fibre Channel device handling subroutines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Vineet Abraham <vma@iol.unh.edu>
+ * v 1.0 03/22/99
+ */
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/fcdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <net/arp.h>
+
+/*
+ * Put the headers on a Fibre Channel packet.
+ */
+
+static int fc_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct fch_hdr *fch;
+ int hdr_len;
+
+ /*
+ * Add the 802.2 SNAP header if IP as the IPv4 code calls
+ * dev->hard_header directly.
+ */
+ if (type == ETH_P_IP || type == ETH_P_ARP)
+ {
+ struct fcllc *fcllc;
+
+ hdr_len = sizeof(struct fch_hdr) + sizeof(struct fcllc);
+ fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+ fcllc = (struct fcllc *)(fch+1);
+ fcllc->dsap = fcllc->ssap = EXTENDED_SAP;
+ fcllc->llc = UI_CMD;
+ fcllc->protid[0] = fcllc->protid[1] = fcllc->protid[2] = 0x00;
+ fcllc->ethertype = htons(type);
+ }
+ else
+ {
+ hdr_len = sizeof(struct fch_hdr);
+ fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+ }
+
+ if(saddr)
+ memcpy(fch->saddr,saddr,dev->addr_len);
+ else
+ memcpy(fch->saddr,dev->dev_addr,dev->addr_len);
+
+ if(daddr)
+ {
+ memcpy(fch->daddr,daddr,dev->addr_len);
+ return hdr_len;
+ }
+ return -hdr_len;
+}
+
+static const struct header_ops fc_header_ops = {
+ .create = fc_header,
+};
+
+static void fc_setup(struct net_device *dev)
+{
+ dev->header_ops = &fc_header_ops;
+ dev->type = ARPHRD_IEEE802;
+ dev->hard_header_len = FC_HLEN;
+ dev->mtu = 2024;
+ dev->addr_len = FC_ALEN;
+ dev->tx_queue_len = 100; /* Long queues on fc */
+ dev->flags = IFF_BROADCAST;
+
+ memset(dev->broadcast, 0xFF, FC_ALEN);
+}
+
+/**
+ * alloc_fcdev - Register fibre channel device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this fibre channel device
+ *
+ * Fill in the fields of the device structure with fibre channel-generic values.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+struct net_device *alloc_fcdev(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "fc%d", NET_NAME_UNKNOWN, fc_setup);
+}
+EXPORT_SYMBOL(alloc_fcdev);
diff --git a/net/802/fddi.c b/net/802/fddi.c
new file mode 100644
index 000000000..7d3a0af95
--- /dev/null
+++ b/net/802/fddi.c
@@ -0,0 +1,189 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * FDDI-type device handling.
+ *
+ * Version: @(#)fddi.c 1.0.0 08/12/96
+ *
+ * Authors: Lawrence V. Stefani, <stefani@lkg.dec.com>
+ *
+ * fddi.c is based on previous eth.c and tr.c work by
+ * Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Florian La Roche, <rzsfl@rz.uni-sb.de>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes
+ * Alan Cox : New arp/rebuild header
+ * Maciej W. Rozycki : IPv6 support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <net/arp.h>
+#include <net/sock.h>
+
+/*
+ * Create the FDDI MAC header for an arbitrary protocol layer
+ *
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
+ */
+
+static int fddi_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ int hl = FDDI_K_SNAP_HLEN;
+ struct fddihdr *fddi;
+
+ if(type != ETH_P_IP && type != ETH_P_IPV6 && type != ETH_P_ARP)
+ hl=FDDI_K_8022_HLEN-3;
+ fddi = (struct fddihdr *)skb_push(skb, hl);
+ fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF;
+ if(type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP)
+ {
+ fddi->hdr.llc_snap.dsap = FDDI_EXTENDED_SAP;
+ fddi->hdr.llc_snap.ssap = FDDI_EXTENDED_SAP;
+ fddi->hdr.llc_snap.ctrl = FDDI_UI_CMD;
+ fddi->hdr.llc_snap.oui[0] = 0x00;
+ fddi->hdr.llc_snap.oui[1] = 0x00;
+ fddi->hdr.llc_snap.oui[2] = 0x00;
+ fddi->hdr.llc_snap.ethertype = htons(type);
+ }
+
+ /* Set the source and destination hardware addresses */
+
+ if (saddr != NULL)
+ memcpy(fddi->saddr, saddr, dev->addr_len);
+ else
+ memcpy(fddi->saddr, dev->dev_addr, dev->addr_len);
+
+ if (daddr != NULL)
+ {
+ memcpy(fddi->daddr, daddr, dev->addr_len);
+ return hl;
+ }
+
+ return -hl;
+}
+
+/*
+ * Determine the packet's protocol ID and fill in skb fields.
+ * This routine is called before an incoming packet is passed
+ * up. It's used to fill in specific skb fields and to set
+ * the proper pointer to the start of packet data (skb->data).
+ */
+
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+ struct fddihdr *fddi = (struct fddihdr *)skb->data;
+ __be16 type;
+
+ /*
+ * Set mac.raw field to point to FC byte, set data field to point
+ * to start of packet data. Assume 802.2 SNAP frames for now.
+ */
+
+ skb->dev = dev;
+ skb_reset_mac_header(skb); /* point to frame control (FC) */
+
+ if(fddi->hdr.llc_8022_1.dsap==0xe0)
+ {
+ skb_pull(skb, FDDI_K_8022_HLEN-3);
+ type = htons(ETH_P_802_2);
+ }
+ else
+ {
+ skb_pull(skb, FDDI_K_SNAP_HLEN); /* adjust for 21 byte header */
+ type=fddi->hdr.llc_snap.ethertype;
+ }
+
+ /* Set packet type based on destination address and flag settings */
+
+ if (*fddi->daddr & 0x01)
+ {
+ if (memcmp(fddi->daddr, dev->broadcast, FDDI_K_ALEN) == 0)
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+ }
+
+ else if (dev->flags & IFF_PROMISC)
+ {
+ if (memcmp(fddi->daddr, dev->dev_addr, FDDI_K_ALEN))
+ skb->pkt_type = PACKET_OTHERHOST;
+ }
+
+ /* Assume 802.2 SNAP frames, for now */
+
+ return type;
+}
+
+EXPORT_SYMBOL(fddi_type_trans);
+
+int fddi_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL(fddi_change_mtu);
+
+static const struct header_ops fddi_header_ops = {
+ .create = fddi_header,
+};
+
+
+static void fddi_setup(struct net_device *dev)
+{
+ dev->header_ops = &fddi_header_ops;
+ dev->type = ARPHRD_FDDI;
+ dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */
+ dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */
+ dev->addr_len = FDDI_K_ALEN;
+ dev->tx_queue_len = 100; /* Long queues on FDDI */
+ dev->flags = IFF_BROADCAST | IFF_MULTICAST;
+
+ memset(dev->broadcast, 0xFF, FDDI_K_ALEN);
+}
+
+/**
+ * alloc_fddidev - Register FDDI device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this FDDI device
+ *
+ * Fill in the fields of the device structure with FDDI-generic values.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+struct net_device *alloc_fddidev(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "fddi%d", NET_NAME_UNKNOWN,
+ fddi_setup);
+}
+EXPORT_SYMBOL(alloc_fddidev);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/garp.c b/net/802/garp.c
new file mode 100644
index 000000000..b38ee6dcb
--- /dev/null
+++ b/net/802/garp.c
@@ -0,0 +1,638 @@
+/*
+ * IEEE 802.1D Generic Attribute Registration Protocol (GARP)
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/garp.h>
+#include <asm/unaligned.h>
+
+static unsigned int garp_join_time __read_mostly = 200;
+module_param(garp_join_time, uint, 0644);
+MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)");
+MODULE_LICENSE("GPL");
+
+static const struct garp_state_trans {
+ u8 state;
+ u8 action;
+} garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] = {
+ [GARP_APPLICANT_VA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_AA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_QA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_LA },
+ },
+ [GARP_APPLICANT_LA] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_VO,
+ .action = GARP_ACTION_S_LEAVE_EMPTY },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_LA },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VA },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_VP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_AA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_VO },
+ },
+ [GARP_APPLICANT_AP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_QA,
+ .action = GARP_ACTION_S_JOIN_IN },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_AO },
+ },
+ [GARP_APPLICANT_QP] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_QO },
+ },
+ [GARP_APPLICANT_VO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_AO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_VP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_AO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_AP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+ [GARP_APPLICANT_QO] = {
+ [GARP_EVENT_TRANSMIT_PDU] = { .state = GARP_APPLICANT_INVALID },
+ [GARP_EVENT_R_JOIN_IN] = { .state = GARP_APPLICANT_QO },
+ [GARP_EVENT_R_JOIN_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_IN] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_R_LEAVE_EMPTY] = { .state = GARP_APPLICANT_VO },
+ [GARP_EVENT_REQ_JOIN] = { .state = GARP_APPLICANT_QP },
+ [GARP_EVENT_REQ_LEAVE] = { .state = GARP_APPLICANT_INVALID },
+ },
+};
+
+static int garp_attr_cmp(const struct garp_attr *attr,
+ const void *data, u8 len, u8 type)
+{
+ if (attr->type != type)
+ return attr->type - type;
+ if (attr->dlen != len)
+ return attr->dlen - len;
+ return memcmp(attr->data, data, len);
+}
+
+static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app,
+ const void *data, u8 len, u8 type)
+{
+ struct rb_node *parent = app->gid.rb_node;
+ struct garp_attr *attr;
+ int d;
+
+ while (parent) {
+ attr = rb_entry(parent, struct garp_attr, node);
+ d = garp_attr_cmp(attr, data, len, type);
+ if (d > 0)
+ parent = parent->rb_left;
+ else if (d < 0)
+ parent = parent->rb_right;
+ else
+ return attr;
+ }
+ return NULL;
+}
+
+static struct garp_attr *garp_attr_create(struct garp_applicant *app,
+ const void *data, u8 len, u8 type)
+{
+ struct rb_node *parent = NULL, **p = &app->gid.rb_node;
+ struct garp_attr *attr;
+ int d;
+
+ while (*p) {
+ parent = *p;
+ attr = rb_entry(parent, struct garp_attr, node);
+ d = garp_attr_cmp(attr, data, len, type);
+ if (d > 0)
+ p = &parent->rb_left;
+ else if (d < 0)
+ p = &parent->rb_right;
+ else {
+ /* The attribute already exists; re-use it. */
+ return attr;
+ }
+ }
+ attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC);
+ if (!attr)
+ return attr;
+ attr->state = GARP_APPLICANT_VO;
+ attr->type = type;
+ attr->dlen = len;
+ memcpy(attr->data, data, len);
+
+ rb_link_node(&attr->node, parent, p);
+ rb_insert_color(&attr->node, &app->gid);
+ return attr;
+}
+
+static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr)
+{
+ rb_erase(&attr->node, &app->gid);
+ kfree(attr);
+}
+
+static int garp_pdu_init(struct garp_applicant *app)
+{
+ struct sk_buff *skb;
+ struct garp_pdu_hdr *gp;
+
+#define LLC_RESERVE sizeof(struct llc_pdu_un)
+ skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev),
+ GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ skb->dev = app->dev;
+ skb->protocol = htons(ETH_P_802_2);
+ skb_reserve(skb, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE);
+
+ gp = (struct garp_pdu_hdr *)__skb_put(skb, sizeof(*gp));
+ put_unaligned(htons(GARP_PROTOCOL_ID), &gp->protocol);
+
+ app->pdu = skb;
+ return 0;
+}
+
+static int garp_pdu_append_end_mark(struct garp_applicant *app)
+{
+ if (skb_tailroom(app->pdu) < sizeof(u8))
+ return -1;
+ *(u8 *)__skb_put(app->pdu, sizeof(u8)) = GARP_END_MARK;
+ return 0;
+}
+
+static void garp_pdu_queue(struct garp_applicant *app)
+{
+ if (!app->pdu)
+ return;
+
+ garp_pdu_append_end_mark(app);
+ garp_pdu_append_end_mark(app);
+
+ llc_pdu_header_init(app->pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
+ LLC_SAP_BSPAN, LLC_PDU_CMD);
+ llc_pdu_init_as_ui_cmd(app->pdu);
+ llc_mac_hdr_init(app->pdu, app->dev->dev_addr,
+ app->app->proto.group_address);
+
+ skb_queue_tail(&app->queue, app->pdu);
+ app->pdu = NULL;
+}
+
+static void garp_queue_xmit(struct garp_applicant *app)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&app->queue)))
+ dev_queue_xmit(skb);
+}
+
+static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype)
+{
+ struct garp_msg_hdr *gm;
+
+ if (skb_tailroom(app->pdu) < sizeof(*gm))
+ return -1;
+ gm = (struct garp_msg_hdr *)__skb_put(app->pdu, sizeof(*gm));
+ gm->attrtype = attrtype;
+ garp_cb(app->pdu)->cur_type = attrtype;
+ return 0;
+}
+
+static int garp_pdu_append_attr(struct garp_applicant *app,
+ const struct garp_attr *attr,
+ enum garp_attr_event event)
+{
+ struct garp_attr_hdr *ga;
+ unsigned int len;
+ int err;
+again:
+ if (!app->pdu) {
+ err = garp_pdu_init(app);
+ if (err < 0)
+ return err;
+ }
+
+ if (garp_cb(app->pdu)->cur_type != attr->type) {
+ if (garp_cb(app->pdu)->cur_type &&
+ garp_pdu_append_end_mark(app) < 0)
+ goto queue;
+ if (garp_pdu_append_msg(app, attr->type) < 0)
+ goto queue;
+ }
+
+ len = sizeof(*ga) + attr->dlen;
+ if (skb_tailroom(app->pdu) < len)
+ goto queue;
+ ga = (struct garp_attr_hdr *)__skb_put(app->pdu, len);
+ ga->len = len;
+ ga->event = event;
+ memcpy(ga->data, attr->data, attr->dlen);
+ return 0;
+
+queue:
+ garp_pdu_queue(app);
+ goto again;
+}
+
+static void garp_attr_event(struct garp_applicant *app,
+ struct garp_attr *attr, enum garp_event event)
+{
+ enum garp_applicant_state state;
+
+ state = garp_applicant_state_table[attr->state][event].state;
+ if (state == GARP_APPLICANT_INVALID)
+ return;
+
+ switch (garp_applicant_state_table[attr->state][event].action) {
+ case GARP_ACTION_NONE:
+ break;
+ case GARP_ACTION_S_JOIN_IN:
+ /* When appending the attribute fails, don't update state in
+ * order to retry on next TRANSMIT_PDU event. */
+ if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0)
+ return;
+ break;
+ case GARP_ACTION_S_LEAVE_EMPTY:
+ garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY);
+ /* As a pure applicant, sending a leave message implies that
+ * the attribute was unregistered and can be destroyed. */
+ garp_attr_destroy(app, attr);
+ return;
+ default:
+ WARN_ON(1);
+ }
+
+ attr->state = state;
+}
+
+int garp_request_join(const struct net_device *dev,
+ const struct garp_application *appl,
+ const void *data, u8 len, u8 type)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+ struct garp_attr *attr;
+
+ spin_lock_bh(&app->lock);
+ attr = garp_attr_create(app, data, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return -ENOMEM;
+ }
+ garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN);
+ spin_unlock_bh(&app->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(garp_request_join);
+
+void garp_request_leave(const struct net_device *dev,
+ const struct garp_application *appl,
+ const void *data, u8 len, u8 type)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+ struct garp_attr *attr;
+
+ spin_lock_bh(&app->lock);
+ attr = garp_attr_lookup(app, data, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return;
+ }
+ garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE);
+ spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(garp_request_leave);
+
+static void garp_gid_event(struct garp_applicant *app, enum garp_event event)
+{
+ struct rb_node *node, *next;
+ struct garp_attr *attr;
+
+ for (node = rb_first(&app->gid);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct garp_attr, node);
+ garp_attr_event(app, attr, event);
+ }
+}
+
+static void garp_join_timer_arm(struct garp_applicant *app)
+{
+ unsigned long delay;
+
+ delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32;
+ mod_timer(&app->join_timer, jiffies + delay);
+}
+
+static void garp_join_timer(unsigned long data)
+{
+ struct garp_applicant *app = (struct garp_applicant *)data;
+
+ spin_lock(&app->lock);
+ garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+ garp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ garp_queue_xmit(app);
+ garp_join_timer_arm(app);
+}
+
+static int garp_pdu_parse_end_mark(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, sizeof(u8)))
+ return -1;
+ if (*skb->data == GARP_END_MARK) {
+ skb_pull(skb, sizeof(u8));
+ return -1;
+ }
+ return 0;
+}
+
+static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
+ u8 attrtype)
+{
+ const struct garp_attr_hdr *ga;
+ struct garp_attr *attr;
+ enum garp_event event;
+ unsigned int dlen;
+
+ if (!pskb_may_pull(skb, sizeof(*ga)))
+ return -1;
+ ga = (struct garp_attr_hdr *)skb->data;
+ if (ga->len < sizeof(*ga))
+ return -1;
+
+ if (!pskb_may_pull(skb, ga->len))
+ return -1;
+ skb_pull(skb, ga->len);
+ dlen = sizeof(*ga) - ga->len;
+
+ if (attrtype > app->app->maxattr)
+ return 0;
+
+ switch (ga->event) {
+ case GARP_LEAVE_ALL:
+ if (dlen != 0)
+ return -1;
+ garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY);
+ return 0;
+ case GARP_JOIN_EMPTY:
+ event = GARP_EVENT_R_JOIN_EMPTY;
+ break;
+ case GARP_JOIN_IN:
+ event = GARP_EVENT_R_JOIN_IN;
+ break;
+ case GARP_LEAVE_EMPTY:
+ event = GARP_EVENT_R_LEAVE_EMPTY;
+ break;
+ case GARP_EMPTY:
+ event = GARP_EVENT_R_EMPTY;
+ break;
+ default:
+ return 0;
+ }
+
+ if (dlen == 0)
+ return -1;
+ attr = garp_attr_lookup(app, ga->data, dlen, attrtype);
+ if (attr == NULL)
+ return 0;
+ garp_attr_event(app, attr, event);
+ return 0;
+}
+
+static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb)
+{
+ const struct garp_msg_hdr *gm;
+
+ if (!pskb_may_pull(skb, sizeof(*gm)))
+ return -1;
+ gm = (struct garp_msg_hdr *)skb->data;
+ if (gm->attrtype == 0)
+ return -1;
+ skb_pull(skb, sizeof(*gm));
+
+ while (skb->len > 0) {
+ if (garp_pdu_parse_attr(app, skb, gm->attrtype) < 0)
+ return -1;
+ if (garp_pdu_parse_end_mark(skb) < 0)
+ break;
+ }
+ return 0;
+}
+
+static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct garp_application *appl = proto->data;
+ struct garp_port *port;
+ struct garp_applicant *app;
+ const struct garp_pdu_hdr *gp;
+
+ port = rcu_dereference(dev->garp_port);
+ if (!port)
+ goto err;
+ app = rcu_dereference(port->applicants[appl->type]);
+ if (!app)
+ goto err;
+
+ if (!pskb_may_pull(skb, sizeof(*gp)))
+ goto err;
+ gp = (struct garp_pdu_hdr *)skb->data;
+ if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID))
+ goto err;
+ skb_pull(skb, sizeof(*gp));
+
+ spin_lock(&app->lock);
+ while (skb->len > 0) {
+ if (garp_pdu_parse_msg(app, skb) < 0)
+ break;
+ if (garp_pdu_parse_end_mark(skb) < 0)
+ break;
+ }
+ spin_unlock(&app->lock);
+err:
+ kfree_skb(skb);
+}
+
+static int garp_init_port(struct net_device *dev)
+{
+ struct garp_port *port;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+ rcu_assign_pointer(dev->garp_port, port);
+ return 0;
+}
+
+static void garp_release_port(struct net_device *dev)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ unsigned int i;
+
+ for (i = 0; i <= GARP_APPLICATION_MAX; i++) {
+ if (rtnl_dereference(port->applicants[i]))
+ return;
+ }
+ RCU_INIT_POINTER(dev->garp_port, NULL);
+ kfree_rcu(port, rcu);
+}
+
+int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
+{
+ struct garp_applicant *app;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!rtnl_dereference(dev->garp_port)) {
+ err = garp_init_port(dev);
+ if (err < 0)
+ goto err1;
+ }
+
+ err = -ENOMEM;
+ app = kzalloc(sizeof(*app), GFP_KERNEL);
+ if (!app)
+ goto err2;
+
+ err = dev_mc_add(dev, appl->proto.group_address);
+ if (err < 0)
+ goto err3;
+
+ app->dev = dev;
+ app->app = appl;
+ app->gid = RB_ROOT;
+ spin_lock_init(&app->lock);
+ skb_queue_head_init(&app->queue);
+ rcu_assign_pointer(dev->garp_port->applicants[appl->type], app);
+ setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app);
+ garp_join_timer_arm(app);
+ return 0;
+
+err3:
+ kfree(app);
+err2:
+ garp_release_port(dev);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(garp_init_applicant);
+
+void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl)
+{
+ struct garp_port *port = rtnl_dereference(dev->garp_port);
+ struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+
+ ASSERT_RTNL();
+
+ RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+
+ /* Delete timer and generate a final TRANSMIT_PDU event to flush out
+ * all pending messages before the applicant is gone. */
+ del_timer_sync(&app->join_timer);
+
+ spin_lock_bh(&app->lock);
+ garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+ garp_pdu_queue(app);
+ spin_unlock_bh(&app->lock);
+
+ garp_queue_xmit(app);
+
+ dev_mc_del(dev, appl->proto.group_address);
+ kfree_rcu(app, rcu);
+ garp_release_port(dev);
+}
+EXPORT_SYMBOL_GPL(garp_uninit_applicant);
+
+int garp_register_application(struct garp_application *appl)
+{
+ appl->proto.rcv = garp_pdu_rcv;
+ appl->proto.data = appl;
+ return stp_proto_register(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_register_application);
+
+void garp_unregister_application(struct garp_application *appl)
+{
+ stp_proto_unregister(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_unregister_application);
diff --git a/net/802/hippi.c b/net/802/hippi.c
new file mode 100644
index 000000000..ade1a52cd
--- /dev/null
+++ b/net/802/hippi.c
@@ -0,0 +1,207 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * HIPPI-type device handling.
+ *
+ * Version: @(#)hippi.c 1.0.0 05/29/97
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Florian La Roche, <rzsfl@rz.uni-sb.de>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Jes Sorensen, <Jes.Sorensen@cern.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/hippidevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+
+/*
+ * Create the HIPPI MAC header for an arbitrary protocol layer
+ *
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
+ */
+
+static int hippi_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+ struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
+
+ if (!len){
+ len = skb->len - HIPPI_HLEN;
+ printk("hippi_header(): length not supplied\n");
+ }
+
+ /*
+ * Due to the stupidity of the little endian byte-order we
+ * have to set the fp field this way.
+ */
+ hip->fp.fixed = htonl(0x04800018);
+ hip->fp.d2_size = htonl(len + 8);
+ hip->le.fc = 0;
+ hip->le.double_wide = 0; /* only HIPPI 800 for the time being */
+ hip->le.message_type = 0; /* Data PDU */
+
+ hip->le.dest_addr_type = 2; /* 12 bit SC address */
+ hip->le.src_addr_type = 2; /* 12 bit SC address */
+
+ memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
+ memset(&hip->le.reserved, 0, 16);
+
+ hip->snap.dsap = HIPPI_EXTENDED_SAP;
+ hip->snap.ssap = HIPPI_EXTENDED_SAP;
+ hip->snap.ctrl = HIPPI_UI_CMD;
+ hip->snap.oui[0] = 0x00;
+ hip->snap.oui[1] = 0x00;
+ hip->snap.oui[2] = 0x00;
+ hip->snap.ethertype = htons(type);
+
+ if (daddr)
+ {
+ memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
+ memcpy(&hcb->ifield, daddr + 2, 4);
+ return HIPPI_HLEN;
+ }
+ hcb->ifield = 0;
+ return -((int)HIPPI_HLEN);
+}
+
+
+/*
+ * Determine the packet's protocol ID.
+ */
+
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+ struct hippi_hdr *hip;
+
+ /*
+ * This is actually wrong ... question is if we really should
+ * set the raw address here.
+ */
+ skb->dev = dev;
+ skb_reset_mac_header(skb);
+ hip = (struct hippi_hdr *)skb_mac_header(skb);
+ skb_pull(skb, HIPPI_HLEN);
+
+ /*
+ * No fancy promisc stuff here now.
+ */
+
+ return hip->snap.ethertype;
+}
+
+EXPORT_SYMBOL(hippi_type_trans);
+
+int hippi_change_mtu(struct net_device *dev, int new_mtu)
+{
+ /*
+ * HIPPI's got these nice large MTUs.
+ */
+ if ((new_mtu < 68) || (new_mtu > 65280))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL(hippi_change_mtu);
+
+/*
+ * For HIPPI we will actually use the lower 4 bytes of the hardware
+ * address as the I-FIELD rather than the actual hardware address.
+ */
+int hippi_mac_addr(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+ if (netif_running(dev))
+ return -EBUSY;
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ return 0;
+}
+EXPORT_SYMBOL(hippi_mac_addr);
+
+int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
+{
+ /* Never send broadcast/multicast ARP messages */
+ NEIGH_VAR_INIT(p, MCAST_PROBES, 0);
+
+ /* In IPv6 unicast probes are valid even on NBMA,
+ * because they are encapsulated in normal IPv6 protocol.
+ * Should be a generic flag.
+ */
+ if (p->tbl->family != AF_INET6)
+ NEIGH_VAR_INIT(p, UCAST_PROBES, 0);
+ return 0;
+}
+EXPORT_SYMBOL(hippi_neigh_setup_dev);
+
+static const struct header_ops hippi_header_ops = {
+ .create = hippi_header,
+};
+
+
+static void hippi_setup(struct net_device *dev)
+{
+ dev->header_ops = &hippi_header_ops;
+
+ /*
+ * We don't support HIPPI `ARP' for the time being, and probably
+ * never will unless someone else implements it. However we
+ * still need a fake ARPHRD to make ifconfig and friends play ball.
+ */
+ dev->type = ARPHRD_HIPPI;
+ dev->hard_header_len = HIPPI_HLEN;
+ dev->mtu = 65280;
+ dev->addr_len = HIPPI_ALEN;
+ dev->tx_queue_len = 25 /* 5 */;
+ memset(dev->broadcast, 0xFF, HIPPI_ALEN);
+
+
+ /*
+ * HIPPI doesn't support broadcast+multicast and we only use
+ * static ARP tables. ARP is disabled by hippi_neigh_setup_dev.
+ */
+ dev->flags = 0;
+}
+
+/**
+ * alloc_hippi_dev - Register HIPPI device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this HIPPI device
+ *
+ * Fill in the fields of the device structure with HIPPI-generic values.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+
+struct net_device *alloc_hippi_dev(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN,
+ hippi_setup);
+}
+
+EXPORT_SYMBOL(alloc_hippi_dev);
diff --git a/net/802/mrp.c b/net/802/mrp.c
new file mode 100644
index 000000000..72db2785e
--- /dev/null
+++ b/net/802/mrp.c
@@ -0,0 +1,926 @@
+/*
+ * IEEE 802.1Q Multiple Registration Protocol (MRP)
+ *
+ * Copyright (c) 2012 Massachusetts Institute of Technology
+ *
+ * Adapted from code in net/802/garp.c
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/mrp.h>
+#include <asm/unaligned.h>
+
+static unsigned int mrp_join_time __read_mostly = 200;
+module_param(mrp_join_time, uint, 0644);
+MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)");
+
+static unsigned int mrp_periodic_time __read_mostly = 1000;
+module_param(mrp_periodic_time, uint, 0644);
+MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)");
+
+MODULE_LICENSE("GPL");
+
+static const u8
+mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = {
+ [MRP_APPLICANT_VO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO,
+ },
+ [MRP_APPLICANT_VP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP,
+ },
+ [MRP_APPLICANT_VN] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VN,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN,
+ },
+ [MRP_APPLICANT_AN] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_AN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AN,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VN,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VN,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN,
+ },
+ [MRP_APPLICANT_AA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA,
+ },
+ [MRP_APPLICANT_QA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA,
+ },
+ [MRP_APPLICANT_LA] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AA,
+ [MRP_EVENT_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_TX] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_LA,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_LA,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA,
+ },
+ [MRP_APPLICANT_AO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_AO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO,
+ },
+ [MRP_APPLICANT_QO] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_QO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QO,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AO,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VO,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VO,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO,
+ },
+ [MRP_APPLICANT_AP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_AO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QA,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP,
+ },
+ [MRP_APPLICANT_QP] = {
+ [MRP_EVENT_NEW] = MRP_APPLICANT_VN,
+ [MRP_EVENT_JOIN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_LV] = MRP_APPLICANT_QO,
+ [MRP_EVENT_TX] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_IN] = MRP_APPLICANT_QP,
+ [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_MT] = MRP_APPLICANT_AP,
+ [MRP_EVENT_R_LV] = MRP_APPLICANT_VP,
+ [MRP_EVENT_R_LA] = MRP_APPLICANT_VP,
+ [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP,
+ [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP,
+ },
+};
+
+static const u8
+mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = {
+ [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW,
+ [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW,
+ [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL,
+ [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV,
+ [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL,
+ [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN,
+ [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL,
+};
+
+static void mrp_attrvalue_inc(void *value, u8 len)
+{
+ u8 *v = (u8 *)value;
+
+ /* Add 1 to the last byte. If it becomes zero,
+ * go to the previous byte and repeat.
+ */
+ while (len > 0 && !++v[--len])
+ ;
+}
+
+static int mrp_attr_cmp(const struct mrp_attr *attr,
+ const void *value, u8 len, u8 type)
+{
+ if (attr->type != type)
+ return attr->type - type;
+ if (attr->len != len)
+ return attr->len - len;
+ return memcmp(attr->value, value, len);
+}
+
+static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app,
+ const void *value, u8 len, u8 type)
+{
+ struct rb_node *parent = app->mad.rb_node;
+ struct mrp_attr *attr;
+ int d;
+
+ while (parent) {
+ attr = rb_entry(parent, struct mrp_attr, node);
+ d = mrp_attr_cmp(attr, value, len, type);
+ if (d > 0)
+ parent = parent->rb_left;
+ else if (d < 0)
+ parent = parent->rb_right;
+ else
+ return attr;
+ }
+ return NULL;
+}
+
+static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app,
+ const void *value, u8 len, u8 type)
+{
+ struct rb_node *parent = NULL, **p = &app->mad.rb_node;
+ struct mrp_attr *attr;
+ int d;
+
+ while (*p) {
+ parent = *p;
+ attr = rb_entry(parent, struct mrp_attr, node);
+ d = mrp_attr_cmp(attr, value, len, type);
+ if (d > 0)
+ p = &parent->rb_left;
+ else if (d < 0)
+ p = &parent->rb_right;
+ else {
+ /* The attribute already exists; re-use it. */
+ return attr;
+ }
+ }
+ attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC);
+ if (!attr)
+ return attr;
+ attr->state = MRP_APPLICANT_VO;
+ attr->type = type;
+ attr->len = len;
+ memcpy(attr->value, value, len);
+
+ rb_link_node(&attr->node, parent, p);
+ rb_insert_color(&attr->node, &app->mad);
+ return attr;
+}
+
+static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
+{
+ rb_erase(&attr->node, &app->mad);
+ kfree(attr);
+}
+
+static int mrp_pdu_init(struct mrp_applicant *app)
+{
+ struct sk_buff *skb;
+ struct mrp_pdu_hdr *ph;
+
+ skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev),
+ GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ skb->dev = app->dev;
+ skb->protocol = app->app->pkttype.type;
+ skb_reserve(skb, LL_RESERVED_SPACE(app->dev));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ ph = (struct mrp_pdu_hdr *)__skb_put(skb, sizeof(*ph));
+ ph->version = app->app->version;
+
+ app->pdu = skb;
+ return 0;
+}
+
+static int mrp_pdu_append_end_mark(struct mrp_applicant *app)
+{
+ __be16 *endmark;
+
+ if (skb_tailroom(app->pdu) < sizeof(*endmark))
+ return -1;
+ endmark = (__be16 *)__skb_put(app->pdu, sizeof(*endmark));
+ put_unaligned(MRP_END_MARK, endmark);
+ return 0;
+}
+
+static void mrp_pdu_queue(struct mrp_applicant *app)
+{
+ if (!app->pdu)
+ return;
+
+ if (mrp_cb(app->pdu)->mh)
+ mrp_pdu_append_end_mark(app);
+ mrp_pdu_append_end_mark(app);
+
+ dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type),
+ app->app->group_address, app->dev->dev_addr,
+ app->pdu->len);
+
+ skb_queue_tail(&app->queue, app->pdu);
+ app->pdu = NULL;
+}
+
+static void mrp_queue_xmit(struct mrp_applicant *app)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&app->queue)))
+ dev_queue_xmit(skb);
+}
+
+static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app,
+ u8 attrtype, u8 attrlen)
+{
+ struct mrp_msg_hdr *mh;
+
+ if (mrp_cb(app->pdu)->mh) {
+ if (mrp_pdu_append_end_mark(app) < 0)
+ return -1;
+ mrp_cb(app->pdu)->mh = NULL;
+ mrp_cb(app->pdu)->vah = NULL;
+ }
+
+ if (skb_tailroom(app->pdu) < sizeof(*mh))
+ return -1;
+ mh = (struct mrp_msg_hdr *)__skb_put(app->pdu, sizeof(*mh));
+ mh->attrtype = attrtype;
+ mh->attrlen = attrlen;
+ mrp_cb(app->pdu)->mh = mh;
+ return 0;
+}
+
+static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app,
+ const void *firstattrvalue, u8 attrlen)
+{
+ struct mrp_vecattr_hdr *vah;
+
+ if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen)
+ return -1;
+ vah = (struct mrp_vecattr_hdr *)__skb_put(app->pdu,
+ sizeof(*vah) + attrlen);
+ put_unaligned(0, &vah->lenflags);
+ memcpy(vah->firstattrvalue, firstattrvalue, attrlen);
+ mrp_cb(app->pdu)->vah = vah;
+ memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen);
+ return 0;
+}
+
+static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app,
+ const struct mrp_attr *attr,
+ enum mrp_vecattr_event vaevent)
+{
+ u16 len, pos;
+ u8 *vaevents;
+ int err;
+again:
+ if (!app->pdu) {
+ err = mrp_pdu_init(app);
+ if (err < 0)
+ return err;
+ }
+
+ /* If there is no Message header in the PDU, or the Message header is
+ * for a different attribute type, add an EndMark (if necessary) and a
+ * new Message header to the PDU.
+ */
+ if (!mrp_cb(app->pdu)->mh ||
+ mrp_cb(app->pdu)->mh->attrtype != attr->type ||
+ mrp_cb(app->pdu)->mh->attrlen != attr->len) {
+ if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0)
+ goto queue;
+ }
+
+ /* If there is no VectorAttribute header for this Message in the PDU,
+ * or this attribute's value does not sequentially follow the previous
+ * attribute's value, add a new VectorAttribute header to the PDU.
+ */
+ if (!mrp_cb(app->pdu)->vah ||
+ memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) {
+ if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0)
+ goto queue;
+ }
+
+ len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags));
+ pos = len % 3;
+
+ /* Events are packed into Vectors in the PDU, three to a byte. Add a
+ * byte to the end of the Vector if necessary.
+ */
+ if (!pos) {
+ if (skb_tailroom(app->pdu) < sizeof(u8))
+ goto queue;
+ vaevents = (u8 *)__skb_put(app->pdu, sizeof(u8));
+ } else {
+ vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8));
+ }
+
+ switch (pos) {
+ case 0:
+ *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ break;
+ case 1:
+ *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX;
+ break;
+ case 2:
+ *vaevents += vaevent;
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ /* Increment the length of the VectorAttribute in the PDU, as well as
+ * the value of the next attribute that would continue its Vector.
+ */
+ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags);
+ mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len);
+
+ return 0;
+
+queue:
+ mrp_pdu_queue(app);
+ goto again;
+}
+
+static void mrp_attr_event(struct mrp_applicant *app,
+ struct mrp_attr *attr, enum mrp_event event)
+{
+ enum mrp_applicant_state state;
+
+ state = mrp_applicant_state_table[attr->state][event];
+ if (state == MRP_APPLICANT_INVALID) {
+ WARN_ON(1);
+ return;
+ }
+
+ if (event == MRP_EVENT_TX) {
+ /* When appending the attribute fails, don't update its state
+ * in order to retry at the next TX event.
+ */
+
+ switch (mrp_tx_action_table[attr->state]) {
+ case MRP_TX_ACTION_NONE:
+ case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL:
+ case MRP_TX_ACTION_S_IN_OPTIONAL:
+ break;
+ case MRP_TX_ACTION_S_NEW:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_NEW) < 0)
+ return;
+ break;
+ case MRP_TX_ACTION_S_JOIN_IN:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0)
+ return;
+ break;
+ case MRP_TX_ACTION_S_LV:
+ if (mrp_pdu_append_vecattr_event(
+ app, attr, MRP_VECATTR_EVENT_LV) < 0)
+ return;
+ /* As a pure applicant, sending a leave message
+ * implies that the attribute was unregistered and
+ * can be destroyed.
+ */
+ mrp_attr_destroy(app, attr);
+ return;
+ default:
+ WARN_ON(1);
+ }
+ }
+
+ attr->state = state;
+}
+
+int mrp_request_join(const struct net_device *dev,
+ const struct mrp_application *appl,
+ const void *value, u8 len, u8 type)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+ struct mrp_attr *attr;
+
+ if (sizeof(struct mrp_skb_cb) + len >
+ FIELD_SIZEOF(struct sk_buff, cb))
+ return -ENOMEM;
+
+ spin_lock_bh(&app->lock);
+ attr = mrp_attr_create(app, value, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return -ENOMEM;
+ }
+ mrp_attr_event(app, attr, MRP_EVENT_JOIN);
+ spin_unlock_bh(&app->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mrp_request_join);
+
+void mrp_request_leave(const struct net_device *dev,
+ const struct mrp_application *appl,
+ const void *value, u8 len, u8 type)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+ struct mrp_attr *attr;
+
+ if (sizeof(struct mrp_skb_cb) + len >
+ FIELD_SIZEOF(struct sk_buff, cb))
+ return;
+
+ spin_lock_bh(&app->lock);
+ attr = mrp_attr_lookup(app, value, len, type);
+ if (!attr) {
+ spin_unlock_bh(&app->lock);
+ return;
+ }
+ mrp_attr_event(app, attr, MRP_EVENT_LV);
+ spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(mrp_request_leave);
+
+static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event)
+{
+ struct rb_node *node, *next;
+ struct mrp_attr *attr;
+
+ for (node = rb_first(&app->mad);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct mrp_attr, node);
+ mrp_attr_event(app, attr, event);
+ }
+}
+
+static void mrp_join_timer_arm(struct mrp_applicant *app)
+{
+ unsigned long delay;
+
+ delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32;
+ mod_timer(&app->join_timer, jiffies + delay);
+}
+
+static void mrp_join_timer(unsigned long data)
+{
+ struct mrp_applicant *app = (struct mrp_applicant *)data;
+
+ spin_lock(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_TX);
+ mrp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ mrp_queue_xmit(app);
+ mrp_join_timer_arm(app);
+}
+
+static void mrp_periodic_timer_arm(struct mrp_applicant *app)
+{
+ mod_timer(&app->periodic_timer,
+ jiffies + msecs_to_jiffies(mrp_periodic_time));
+}
+
+static void mrp_periodic_timer(unsigned long data)
+{
+ struct mrp_applicant *app = (struct mrp_applicant *)data;
+
+ spin_lock(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_PERIODIC);
+ mrp_pdu_queue(app);
+ spin_unlock(&app->lock);
+
+ mrp_periodic_timer_arm(app);
+}
+
+static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset)
+{
+ __be16 endmark;
+
+ if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0)
+ return -1;
+ if (endmark == MRP_END_MARK) {
+ *offset += sizeof(endmark);
+ return -1;
+ }
+ return 0;
+}
+
+static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app,
+ struct sk_buff *skb,
+ enum mrp_vecattr_event vaevent)
+{
+ struct mrp_attr *attr;
+ enum mrp_event event;
+
+ attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen,
+ mrp_cb(skb)->mh->attrtype);
+ if (attr == NULL)
+ return;
+
+ switch (vaevent) {
+ case MRP_VECATTR_EVENT_NEW:
+ event = MRP_EVENT_R_NEW;
+ break;
+ case MRP_VECATTR_EVENT_JOIN_IN:
+ event = MRP_EVENT_R_JOIN_IN;
+ break;
+ case MRP_VECATTR_EVENT_IN:
+ event = MRP_EVENT_R_IN;
+ break;
+ case MRP_VECATTR_EVENT_JOIN_MT:
+ event = MRP_EVENT_R_JOIN_MT;
+ break;
+ case MRP_VECATTR_EVENT_MT:
+ event = MRP_EVENT_R_MT;
+ break;
+ case MRP_VECATTR_EVENT_LV:
+ event = MRP_EVENT_R_LV;
+ break;
+ default:
+ return;
+ }
+
+ mrp_attr_event(app, attr, event);
+}
+
+static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
+ struct sk_buff *skb, int *offset)
+{
+ struct mrp_vecattr_hdr _vah;
+ u16 valen;
+ u8 vaevents, vaevent;
+
+ mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah),
+ &_vah);
+ if (!mrp_cb(skb)->vah)
+ return -1;
+ *offset += sizeof(_vah);
+
+ if (get_unaligned(&mrp_cb(skb)->vah->lenflags) &
+ MRP_VECATTR_HDR_FLAG_LA)
+ mrp_mad_event(app, MRP_EVENT_R_LA);
+ valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) &
+ MRP_VECATTR_HDR_LEN_MASK);
+
+ /* The VectorAttribute structure in a PDU carries event information
+ * about one or more attributes having consecutive values. Only the
+ * value for the first attribute is contained in the structure. So
+ * we make a copy of that value, and then increment it each time we
+ * advance to the next event in its Vector.
+ */
+ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen >
+ FIELD_SIZEOF(struct sk_buff, cb))
+ return -1;
+ if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen) < 0)
+ return -1;
+ *offset += mrp_cb(skb)->mh->attrlen;
+
+ /* In a VectorAttribute, the Vector contains events which are packed
+ * three to a byte. We process one byte of the Vector at a time.
+ */
+ while (valen > 0) {
+ if (skb_copy_bits(skb, *offset, &vaevents,
+ sizeof(vaevents)) < 0)
+ return -1;
+ *offset += sizeof(vaevents);
+
+ /* Extract and process the first event. */
+ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ if (vaevent >= __MRP_VECATTR_EVENT_MAX) {
+ /* The byte is malformed; stop processing. */
+ return -1;
+ }
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+
+ /* If present, extract and process the second event. */
+ if (!--valen)
+ break;
+ mrp_attrvalue_inc(mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen);
+ vaevents %= (__MRP_VECATTR_EVENT_MAX *
+ __MRP_VECATTR_EVENT_MAX);
+ vaevent = vaevents / __MRP_VECATTR_EVENT_MAX;
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+
+ /* If present, extract and process the third event. */
+ if (!--valen)
+ break;
+ mrp_attrvalue_inc(mrp_cb(skb)->attrvalue,
+ mrp_cb(skb)->mh->attrlen);
+ vaevents %= __MRP_VECATTR_EVENT_MAX;
+ vaevent = vaevents;
+ mrp_pdu_parse_vecattr_event(app, skb, vaevent);
+ }
+ return 0;
+}
+
+static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb,
+ int *offset)
+{
+ struct mrp_msg_hdr _mh;
+
+ mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh);
+ if (!mrp_cb(skb)->mh)
+ return -1;
+ *offset += sizeof(_mh);
+
+ if (mrp_cb(skb)->mh->attrtype == 0 ||
+ mrp_cb(skb)->mh->attrtype > app->app->maxattr ||
+ mrp_cb(skb)->mh->attrlen == 0)
+ return -1;
+
+ while (skb->len > *offset) {
+ if (mrp_pdu_parse_end_mark(skb, offset) < 0)
+ break;
+ if (mrp_pdu_parse_vecattr(app, skb, offset) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int mrp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct mrp_application *appl = container_of(pt, struct mrp_application,
+ pkttype);
+ struct mrp_port *port;
+ struct mrp_applicant *app;
+ struct mrp_pdu_hdr _ph;
+ const struct mrp_pdu_hdr *ph;
+ int offset = skb_network_offset(skb);
+
+ /* If the interface is in promiscuous mode, drop the packet if
+ * it was unicast to another host.
+ */
+ if (unlikely(skb->pkt_type == PACKET_OTHERHOST))
+ goto out;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto out;
+ port = rcu_dereference(dev->mrp_port);
+ if (unlikely(!port))
+ goto out;
+ app = rcu_dereference(port->applicants[appl->type]);
+ if (unlikely(!app))
+ goto out;
+
+ ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph);
+ if (!ph)
+ goto out;
+ offset += sizeof(_ph);
+
+ if (ph->version != app->app->version)
+ goto out;
+
+ spin_lock(&app->lock);
+ while (skb->len > offset) {
+ if (mrp_pdu_parse_end_mark(skb, &offset) < 0)
+ break;
+ if (mrp_pdu_parse_msg(app, skb, &offset) < 0)
+ break;
+ }
+ spin_unlock(&app->lock);
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int mrp_init_port(struct net_device *dev)
+{
+ struct mrp_port *port;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+ rcu_assign_pointer(dev->mrp_port, port);
+ return 0;
+}
+
+static void mrp_release_port(struct net_device *dev)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ unsigned int i;
+
+ for (i = 0; i <= MRP_APPLICATION_MAX; i++) {
+ if (rtnl_dereference(port->applicants[i]))
+ return;
+ }
+ RCU_INIT_POINTER(dev->mrp_port, NULL);
+ kfree_rcu(port, rcu);
+}
+
+int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
+{
+ struct mrp_applicant *app;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!rtnl_dereference(dev->mrp_port)) {
+ err = mrp_init_port(dev);
+ if (err < 0)
+ goto err1;
+ }
+
+ err = -ENOMEM;
+ app = kzalloc(sizeof(*app), GFP_KERNEL);
+ if (!app)
+ goto err2;
+
+ err = dev_mc_add(dev, appl->group_address);
+ if (err < 0)
+ goto err3;
+
+ app->dev = dev;
+ app->app = appl;
+ app->mad = RB_ROOT;
+ spin_lock_init(&app->lock);
+ skb_queue_head_init(&app->queue);
+ rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
+ setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app);
+ mrp_join_timer_arm(app);
+ setup_timer(&app->periodic_timer, mrp_periodic_timer,
+ (unsigned long)app);
+ mrp_periodic_timer_arm(app);
+ return 0;
+
+err3:
+ kfree(app);
+err2:
+ mrp_release_port(dev);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(mrp_init_applicant);
+
+void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
+{
+ struct mrp_port *port = rtnl_dereference(dev->mrp_port);
+ struct mrp_applicant *app = rtnl_dereference(
+ port->applicants[appl->type]);
+
+ ASSERT_RTNL();
+
+ RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+
+ /* Delete timer and generate a final TX event to flush out
+ * all pending messages before the applicant is gone.
+ */
+ del_timer_sync(&app->join_timer);
+ del_timer_sync(&app->periodic_timer);
+
+ spin_lock_bh(&app->lock);
+ mrp_mad_event(app, MRP_EVENT_TX);
+ mrp_pdu_queue(app);
+ spin_unlock_bh(&app->lock);
+
+ mrp_queue_xmit(app);
+
+ dev_mc_del(dev, appl->group_address);
+ kfree_rcu(app, rcu);
+ mrp_release_port(dev);
+}
+EXPORT_SYMBOL_GPL(mrp_uninit_applicant);
+
+int mrp_register_application(struct mrp_application *appl)
+{
+ appl->pkttype.func = mrp_rcv;
+ dev_add_pack(&appl->pkttype);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mrp_register_application);
+
+void mrp_unregister_application(struct mrp_application *appl)
+{
+ dev_remove_pack(&appl->pkttype);
+}
+EXPORT_SYMBOL_GPL(mrp_unregister_application);
diff --git a/net/802/p8022.c b/net/802/p8022.c
new file mode 100644
index 000000000..0bda8de7d
--- /dev/null
+++ b/net/802/p8022.c
@@ -0,0 +1,66 @@
+/*
+ * NET3: Support for 802.2 demultiplexing off Ethernet
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Demultiplex 802.2 encoded protocols. We match the entry by the
+ * SSAP/DSAP pair and then deliver to the registered datalink that
+ * matches. The control byte is ignored and handling of such items
+ * is up to the routine passed the frame.
+ *
+ * Unlike the 802.3 datalink we have a list of 802.2 entries as
+ * there are multiple protocols to demux. The list is currently
+ * short (3 or 4 entries at most). The current demux assumes this.
+ */
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <net/llc.h>
+#include <net/p8022.h>
+
+static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
+ unsigned char *dest)
+{
+ llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
+ return 0;
+}
+
+struct datalink_proto *register_8022_client(unsigned char type,
+ int (*func)(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *pt,
+ struct net_device *orig_dev))
+{
+ struct datalink_proto *proto;
+
+ proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
+ if (proto) {
+ proto->type[0] = type;
+ proto->header_length = 3;
+ proto->request = p8022_request;
+ proto->sap = llc_sap_open(type, func);
+ if (!proto->sap) {
+ kfree(proto);
+ proto = NULL;
+ }
+ }
+ return proto;
+}
+
+void unregister_8022_client(struct datalink_proto *proto)
+{
+ llc_sap_put(proto->sap);
+ kfree(proto);
+}
+
+EXPORT_SYMBOL(register_8022_client);
+EXPORT_SYMBOL(unregister_8022_client);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/p8023.c b/net/802/p8023.c
new file mode 100644
index 000000000..1256a40da
--- /dev/null
+++ b/net/802/p8023.c
@@ -0,0 +1,64 @@
+/*
+ * NET3: 802.3 data link hooks used for IPX 802.3
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 802.3 isn't really a protocol data link layer. Some old IPX stuff
+ * uses it however. Note that there is only one 802.3 protocol layer
+ * in the system. We don't currently support different protocols
+ * running raw 802.3 on different devices. Thankfully nobody else
+ * has done anything like the old IPX.
+ */
+
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/datalink.h>
+#include <net/p8022.h>
+
+/*
+ * Place an 802.3 header on a packet. The driver will do the mac
+ * addresses, we just need to give it the buffer length.
+ */
+static int p8023_request(struct datalink_proto *dl,
+ struct sk_buff *skb, unsigned char *dest_node)
+{
+ struct net_device *dev = skb->dev;
+
+ dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len);
+ return dev_queue_xmit(skb);
+}
+
+/*
+ * Create an 802.3 client. Note there can be only one 802.3 client
+ */
+struct datalink_proto *make_8023_client(void)
+{
+ struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
+
+ if (proto) {
+ proto->header_length = 0;
+ proto->request = p8023_request;
+ }
+ return proto;
+}
+
+/*
+ * Destroy the 802.3 client.
+ */
+void destroy_8023_client(struct datalink_proto *dl)
+{
+ kfree(dl);
+}
+
+EXPORT_SYMBOL(destroy_8023_client);
+EXPORT_SYMBOL(make_8023_client);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/psnap.c b/net/802/psnap.c
new file mode 100644
index 000000000..db6baf7cf
--- /dev/null
+++ b/net/802/psnap.c
@@ -0,0 +1,167 @@
+/*
+ * SNAP data link layer. Derived from 802.2
+ *
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>,
+ * from the 802.2 layer by Greg Page.
+ * Merged in additions from Greg Page's psnap.c.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/llc.h>
+#include <net/psnap.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+
+static LIST_HEAD(snap_list);
+static DEFINE_SPINLOCK(snap_lock);
+static struct llc_sap *snap_sap;
+
+/*
+ * Find a snap client by matching the 5 bytes.
+ */
+static struct datalink_proto *find_snap_client(const unsigned char *desc)
+{
+ struct datalink_proto *proto = NULL, *p;
+
+ list_for_each_entry_rcu(p, &snap_list, node) {
+ if (!memcmp(p->type, desc, 5)) {
+ proto = p;
+ break;
+ }
+ }
+ return proto;
+}
+
+/*
+ * A SNAP packet has arrived
+ */
+static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ int rc = 1;
+ struct datalink_proto *proto;
+ static struct packet_type snap_packet_type = {
+ .type = cpu_to_be16(ETH_P_SNAP),
+ };
+
+ if (unlikely(!pskb_may_pull(skb, 5)))
+ goto drop;
+
+ rcu_read_lock();
+ proto = find_snap_client(skb_transport_header(skb));
+ if (proto) {
+ /* Pass the frame on. */
+ skb->transport_header += 5;
+ skb_pull_rcsum(skb, 5);
+ rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
+ }
+ rcu_read_unlock();
+
+ if (unlikely(!proto))
+ goto drop;
+
+out:
+ return rc;
+
+drop:
+ kfree_skb(skb);
+ goto out;
+}
+
+/*
+ * Put a SNAP header on a frame and pass to 802.2
+ */
+static int snap_request(struct datalink_proto *dl,
+ struct sk_buff *skb, u8 *dest)
+{
+ memcpy(skb_push(skb, 5), dl->type, 5);
+ llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);
+ return 0;
+}
+
+/*
+ * Set up the SNAP layer
+ */
+EXPORT_SYMBOL(register_snap_client);
+EXPORT_SYMBOL(unregister_snap_client);
+
+static const char snap_err_msg[] __initconst =
+ KERN_CRIT "SNAP - unable to register with 802.2\n";
+
+static int __init snap_init(void)
+{
+ snap_sap = llc_sap_open(0xAA, snap_rcv);
+ if (!snap_sap) {
+ printk(snap_err_msg);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+module_init(snap_init);
+
+static void __exit snap_exit(void)
+{
+ llc_sap_put(snap_sap);
+}
+
+module_exit(snap_exit);
+
+
+/*
+ * Register SNAP clients. We don't yet use this for IP.
+ */
+struct datalink_proto *register_snap_client(const unsigned char *desc,
+ int (*rcvfunc)(struct sk_buff *,
+ struct net_device *,
+ struct packet_type *,
+ struct net_device *))
+{
+ struct datalink_proto *proto = NULL;
+
+ spin_lock_bh(&snap_lock);
+
+ if (find_snap_client(desc))
+ goto out;
+
+ proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
+ if (proto) {
+ memcpy(proto->type, desc, 5);
+ proto->rcvfunc = rcvfunc;
+ proto->header_length = 5 + 3; /* snap + 802.2 */
+ proto->request = snap_request;
+ list_add_rcu(&proto->node, &snap_list);
+ }
+out:
+ spin_unlock_bh(&snap_lock);
+
+ return proto;
+}
+
+/*
+ * Unregister SNAP clients. Protocols no longer want to play with us ...
+ */
+void unregister_snap_client(struct datalink_proto *proto)
+{
+ spin_lock_bh(&snap_lock);
+ list_del_rcu(&proto->node);
+ spin_unlock_bh(&snap_lock);
+
+ synchronize_net();
+
+ kfree(proto);
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/stp.c b/net/802/stp.c
new file mode 100644
index 000000000..2c40ba0ec
--- /dev/null
+++ b/net/802/stp.c
@@ -0,0 +1,104 @@
+/*
+ * STP SAP demux
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/stp.h>
+
+/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */
+#define GARP_ADDR_MIN 0x20
+#define GARP_ADDR_MAX 0x2F
+#define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN)
+
+static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
+static const struct stp_proto __rcu *stp_proto __read_mostly;
+
+static struct llc_sap *sap __read_mostly;
+static unsigned int sap_registered;
+static DEFINE_MUTEX(stp_proto_mutex);
+
+/* Called under rcu_read_lock from LLC */
+static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct ethhdr *eh = eth_hdr(skb);
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ const struct stp_proto *proto;
+
+ if (pdu->ssap != LLC_SAP_BSPAN ||
+ pdu->dsap != LLC_SAP_BSPAN ||
+ pdu->ctrl_1 != LLC_PDU_TYPE_U)
+ goto err;
+
+ if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
+ proto = rcu_dereference(garp_protos[eh->h_dest[5] -
+ GARP_ADDR_MIN]);
+ if (proto &&
+ !ether_addr_equal(eh->h_dest, proto->group_address))
+ goto err;
+ } else
+ proto = rcu_dereference(stp_proto);
+
+ if (!proto)
+ goto err;
+
+ proto->rcv(proto, skb, dev);
+ return 0;
+
+err:
+ kfree_skb(skb);
+ return 0;
+}
+
+int stp_proto_register(const struct stp_proto *proto)
+{
+ int err = 0;
+
+ mutex_lock(&stp_proto_mutex);
+ if (sap_registered++ == 0) {
+ sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
+ if (!sap) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (is_zero_ether_addr(proto->group_address))
+ rcu_assign_pointer(stp_proto, proto);
+ else
+ rcu_assign_pointer(garp_protos[proto->group_address[5] -
+ GARP_ADDR_MIN], proto);
+out:
+ mutex_unlock(&stp_proto_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(stp_proto_register);
+
+void stp_proto_unregister(const struct stp_proto *proto)
+{
+ mutex_lock(&stp_proto_mutex);
+ if (is_zero_ether_addr(proto->group_address))
+ RCU_INIT_POINTER(stp_proto, NULL);
+ else
+ RCU_INIT_POINTER(garp_protos[proto->group_address[5] -
+ GARP_ADDR_MIN], NULL);
+ synchronize_rcu();
+
+ if (--sap_registered == 0)
+ llc_sap_put(sap);
+ mutex_unlock(&stp_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(stp_proto_unregister);
+
+MODULE_LICENSE("GPL");
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 000000000..423201809
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,40 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+ tristate "802.1Q/802.1ad VLAN Support"
+ ---help---
+ Select this and you will be able to create 802.1Q VLAN interfaces
+ on your Ethernet interfaces. 802.1Q VLAN supports almost
+ everything a regular Ethernet interface does, including
+ firewalling, bridging, and of course IP traffic. You will need
+ the 'ip' utility in order to effectively use VLANs.
+ See the VLAN web page for more information:
+ <http://www.candelatech.com/~greear/vlan.html>
+
+ To compile this code as a module, choose M here: the module
+ will be called 8021q.
+
+ If unsure, say N.
+
+config VLAN_8021Q_GVRP
+ bool "GVRP (GARP VLAN Registration Protocol) support"
+ depends on VLAN_8021Q
+ select GARP
+ help
+ Select this to enable GVRP end-system support. GVRP is used for
+ automatic propagation of registered VLANs to switches.
+
+ If unsure, say N.
+
+config VLAN_8021Q_MVRP
+ bool "MVRP (Multiple VLAN Registration Protocol) support"
+ depends on VLAN_8021Q
+ select MRP
+ help
+ Select this to enable MVRP end-system support. MVRP is used for
+ automatic propagation of registered VLANs to switches; it
+ supersedes GVRP and is not backwards-compatible.
+
+ If unsure, say N.
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
new file mode 100644
index 000000000..7bc8db08d
--- /dev/null
+++ b/net/8021q/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Linux VLAN layer.
+#
+obj-$(subst m,y,$(CONFIG_VLAN_8021Q)) += vlan_core.o
+obj-$(CONFIG_VLAN_8021Q) += 8021q.o
+
+8021q-y := vlan.o vlan_dev.o vlan_netlink.o
+8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o
+8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o
+8021q-$(CONFIG_PROC_FS) += vlanproc.o
+
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
new file mode 100644
index 000000000..59555f0f8
--- /dev/null
+++ b/net/8021q/vlan.c
@@ -0,0 +1,704 @@
+/*
+ * INET 802.1Q VLAN
+ * Ethernet-type device handling.
+ *
+ * Authors: Ben Greear <greearb@candelatech.com>
+ * Please send support related email to: netdev@vger.kernel.org
+ * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
+ *
+ * Fixes:
+ * Fix for packet capture - Nick Eggleston <nick@dccinc.com>;
+ * Add HW acceleration hooks - David S. Miller <davem@redhat.com>;
+ * Correct all the locking - David S. Miller <davem@redhat.com>;
+ * Use hash table for VLAN groups - David S. Miller <davem@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <net/p8022.h>
+#include <net/arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <asm/uaccess.h>
+
+#include <linux/if_vlan.h>
+#include "vlan.h"
+#include "vlanproc.h"
+
+#define DRV_VERSION "1.8"
+
+/* Global VLAN variables */
+
+int vlan_net_id __read_mostly;
+
+const char vlan_fullname[] = "802.1Q VLAN Support";
+const char vlan_version[] = DRV_VERSION;
+
+/* End of global variables definitions. */
+
+static int vlan_group_prealloc_vid(struct vlan_group *vg,
+ __be16 vlan_proto, u16 vlan_id)
+{
+ struct net_device **array;
+ unsigned int pidx, vidx;
+ unsigned int size;
+
+ ASSERT_RTNL();
+
+ pidx = vlan_proto_idx(vlan_proto);
+ vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
+ array = vg->vlan_devices_arrays[pidx][vidx];
+ if (array != NULL)
+ return 0;
+
+ size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
+ array = kzalloc(size, GFP_KERNEL);
+ if (array == NULL)
+ return -ENOBUFS;
+
+ vg->vlan_devices_arrays[pidx][vidx] = array;
+ return 0;
+}
+
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ struct vlan_info *vlan_info;
+ struct vlan_group *grp;
+ u16 vlan_id = vlan->vlan_id;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(real_dev->vlan_info);
+ BUG_ON(!vlan_info);
+
+ grp = &vlan_info->grp;
+
+ grp->nr_vlan_devs--;
+
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_leave(dev);
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_leave(dev);
+
+ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL);
+
+ netdev_upper_dev_unlink(real_dev, dev);
+ /* Because unregister_netdevice_queue() makes sure at least one rcu
+ * grace period is respected before device freeing,
+ * we dont need to call synchronize_net() here.
+ */
+ unregister_netdevice_queue(dev, head);
+
+ if (grp->nr_vlan_devs == 0) {
+ vlan_mvrp_uninit_applicant(real_dev);
+ vlan_gvrp_uninit_applicant(real_dev);
+ }
+
+ /* Take it out of our own structures, but be sure to interlock with
+ * HW accelerating devices or SW vlan input packet processing if
+ * VLAN is not 0 (leave it there for 802.1p).
+ */
+ if (vlan_id)
+ vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
+
+ /* Get rid of the vlan's reference to real_dev */
+ dev_put(real_dev);
+}
+
+int vlan_check_real_dev(struct net_device *real_dev,
+ __be16 protocol, u16 vlan_id)
+{
+ const char *name = real_dev->name;
+
+ if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
+ pr_info("VLANs not supported on %s\n", name);
+ return -EOPNOTSUPP;
+ }
+
+ if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL)
+ return -EEXIST;
+
+ return 0;
+}
+
+int register_vlan_dev(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ u16 vlan_id = vlan->vlan_id;
+ struct vlan_info *vlan_info;
+ struct vlan_group *grp;
+ int err;
+
+ err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id);
+ if (err)
+ return err;
+
+ vlan_info = rtnl_dereference(real_dev->vlan_info);
+ /* vlan_info should be there now. vlan_vid_add took care of it */
+ BUG_ON(!vlan_info);
+
+ grp = &vlan_info->grp;
+ if (grp->nr_vlan_devs == 0) {
+ err = vlan_gvrp_init_applicant(real_dev);
+ if (err < 0)
+ goto out_vid_del;
+ err = vlan_mvrp_init_applicant(real_dev);
+ if (err < 0)
+ goto out_uninit_gvrp;
+ }
+
+ err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id);
+ if (err < 0)
+ goto out_uninit_mvrp;
+
+ vlan->nest_level = dev_get_nest_level(real_dev, is_vlan_dev) + 1;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out_uninit_mvrp;
+
+ err = netdev_upper_dev_link(real_dev, dev);
+ if (err)
+ goto out_unregister_netdev;
+
+ /* Account for reference in struct vlan_dev_priv */
+ dev_hold(real_dev);
+
+ netif_stacked_transfer_operstate(real_dev, dev);
+ linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
+
+ /* So, got the sucker initialized, now lets place
+ * it into our local structure.
+ */
+ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev);
+ grp->nr_vlan_devs++;
+
+ return 0;
+
+out_unregister_netdev:
+ unregister_netdevice(dev);
+out_uninit_mvrp:
+ if (grp->nr_vlan_devs == 0)
+ vlan_mvrp_uninit_applicant(real_dev);
+out_uninit_gvrp:
+ if (grp->nr_vlan_devs == 0)
+ vlan_gvrp_uninit_applicant(real_dev);
+out_vid_del:
+ vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
+ return err;
+}
+
+/* Attach a VLAN device to a mac address (ie Ethernet Card).
+ * Returns 0 if the device was created or a negative error code otherwise.
+ */
+static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
+{
+ struct net_device *new_dev;
+ struct vlan_dev_priv *vlan;
+ struct net *net = dev_net(real_dev);
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+ char name[IFNAMSIZ];
+ int err;
+
+ if (vlan_id >= VLAN_VID_MASK)
+ return -ERANGE;
+
+ err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id);
+ if (err < 0)
+ return err;
+
+ /* Gotta set up the fields for the device. */
+ switch (vn->name_type) {
+ case VLAN_NAME_TYPE_RAW_PLUS_VID:
+ /* name will look like: eth1.0005 */
+ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id);
+ break;
+ case VLAN_NAME_TYPE_PLUS_VID_NO_PAD:
+ /* Put our vlan.VID in the name.
+ * Name will look like: vlan5
+ */
+ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id);
+ break;
+ case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD:
+ /* Put our vlan.VID in the name.
+ * Name will look like: eth0.5
+ */
+ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id);
+ break;
+ case VLAN_NAME_TYPE_PLUS_VID:
+ /* Put our vlan.VID in the name.
+ * Name will look like: vlan0005
+ */
+ default:
+ snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
+ }
+
+ new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name,
+ NET_NAME_UNKNOWN, vlan_setup);
+
+ if (new_dev == NULL)
+ return -ENOBUFS;
+
+ dev_net_set(new_dev, net);
+ /* need 4 bytes for extra VLAN header info,
+ * hope the underlying device can handle it.
+ */
+ new_dev->mtu = real_dev->mtu;
+ new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT);
+
+ vlan = vlan_dev_priv(new_dev);
+ vlan->vlan_proto = htons(ETH_P_8021Q);
+ vlan->vlan_id = vlan_id;
+ vlan->real_dev = real_dev;
+ vlan->dent = NULL;
+ vlan->flags = VLAN_FLAG_REORDER_HDR;
+
+ new_dev->rtnl_link_ops = &vlan_link_ops;
+ err = register_vlan_dev(new_dev);
+ if (err < 0)
+ goto out_free_newdev;
+
+ return 0;
+
+out_free_newdev:
+ free_netdev(new_dev);
+ return err;
+}
+
+static void vlan_sync_address(struct net_device *dev,
+ struct net_device *vlandev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+
+ /* May be called without an actual change */
+ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
+ return;
+
+ /* vlan address was different from the old address and is equal to
+ * the new address */
+ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+ ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
+ dev_uc_del(dev, vlandev->dev_addr);
+
+ /* vlan address was equal to the old address and is different from
+ * the new address */
+ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+ !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
+ dev_uc_add(dev, vlandev->dev_addr);
+
+ ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
+}
+
+static void vlan_transfer_features(struct net_device *dev,
+ struct net_device *vlandev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+
+ vlandev->gso_max_size = dev->gso_max_size;
+
+ if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
+ vlandev->hard_header_len = dev->hard_header_len;
+ else
+ vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN;
+
+#if IS_ENABLED(CONFIG_FCOE)
+ vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
+#endif
+
+ netdev_update_features(vlandev);
+}
+
+static int __vlan_device_event(struct net_device *dev, unsigned long event)
+{
+ int err = 0;
+
+ switch (event) {
+ case NETDEV_CHANGENAME:
+ vlan_proc_rem_dev(dev);
+ err = vlan_proc_add_dev(dev);
+ break;
+ case NETDEV_REGISTER:
+ err = vlan_proc_add_dev(dev);
+ break;
+ case NETDEV_UNREGISTER:
+ vlan_proc_rem_dev(dev);
+ break;
+ }
+
+ return err;
+}
+
+static int vlan_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct vlan_group *grp;
+ struct vlan_info *vlan_info;
+ int i, flgs;
+ struct net_device *vlandev;
+ struct vlan_dev_priv *vlan;
+ bool last = false;
+ LIST_HEAD(list);
+
+ if (is_vlan_dev(dev)) {
+ int err = __vlan_device_event(dev, event);
+
+ if (err)
+ return notifier_from_errno(err);
+ }
+
+ if ((event == NETDEV_UP) &&
+ (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) {
+ pr_info("adding VLAN 0 to HW filter on device %s\n",
+ dev->name);
+ vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
+ }
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ goto out;
+ grp = &vlan_info->grp;
+
+ /* It is OK that we do not hold the group lock right now,
+ * as we run under the RTNL lock.
+ */
+
+ switch (event) {
+ case NETDEV_CHANGE:
+ /* Propagate real device state to vlan devices */
+ vlan_group_for_each_dev(grp, i, vlandev)
+ netif_stacked_transfer_operstate(dev, vlandev);
+ break;
+
+ case NETDEV_CHANGEADDR:
+ /* Adjust unicast filters on underlying device */
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ flgs = vlandev->flags;
+ if (!(flgs & IFF_UP))
+ continue;
+
+ vlan_sync_address(dev, vlandev);
+ }
+ break;
+
+ case NETDEV_CHANGEMTU:
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ if (vlandev->mtu <= dev->mtu)
+ continue;
+
+ dev_set_mtu(vlandev, dev->mtu);
+ }
+ break;
+
+ case NETDEV_FEAT_CHANGE:
+ /* Propagate device features to underlying device */
+ vlan_group_for_each_dev(grp, i, vlandev)
+ vlan_transfer_features(dev, vlandev);
+ break;
+
+ case NETDEV_DOWN: {
+ struct net_device *tmp;
+ LIST_HEAD(close_list);
+
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
+ vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
+
+ /* Put all VLANs for this dev in the down state too. */
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ flgs = vlandev->flags;
+ if (!(flgs & IFF_UP))
+ continue;
+
+ vlan = vlan_dev_priv(vlandev);
+ if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ list_add(&vlandev->close_list, &close_list);
+ }
+
+ dev_close_many(&close_list, false);
+
+ list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
+ netif_stacked_transfer_operstate(dev, vlandev);
+ list_del_init(&vlandev->close_list);
+ }
+ list_del(&close_list);
+ break;
+ }
+ case NETDEV_UP:
+ /* Put all VLANs for this dev in the up state too. */
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ flgs = dev_get_flags(vlandev);
+ if (flgs & IFF_UP)
+ continue;
+
+ vlan = vlan_dev_priv(vlandev);
+ if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ dev_change_flags(vlandev, flgs | IFF_UP);
+ netif_stacked_transfer_operstate(dev, vlandev);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ /* twiddle thumbs on netns device moves */
+ if (dev->reg_state != NETREG_UNREGISTERING)
+ break;
+
+ vlan_group_for_each_dev(grp, i, vlandev) {
+ /* removal of last vid destroys vlan_info, abort
+ * afterwards */
+ if (vlan_info->nr_vids == 1)
+ last = true;
+
+ unregister_vlan_dev(vlandev, &list);
+ if (last)
+ break;
+ }
+ unregister_netdevice_many(&list);
+ break;
+
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* Forbid underlaying device to change its type. */
+ if (vlan_uses_dev(dev))
+ return NOTIFY_BAD;
+ break;
+
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_RESEND_IGMP:
+ /* Propagate to vlan devices */
+ vlan_group_for_each_dev(grp, i, vlandev)
+ call_netdevice_notifiers(event, vlandev);
+ break;
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vlan_notifier_block __read_mostly = {
+ .notifier_call = vlan_device_event,
+};
+
+/*
+ * VLAN IOCTL handler.
+ * o execute requested action or pass command to the device driver
+ * arg is really a struct vlan_ioctl_args __user *.
+ */
+static int vlan_ioctl_handler(struct net *net, void __user *arg)
+{
+ int err;
+ struct vlan_ioctl_args args;
+ struct net_device *dev = NULL;
+
+ if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
+ return -EFAULT;
+
+ /* Null terminate this sucker, just in case. */
+ args.device1[23] = 0;
+ args.u.device2[23] = 0;
+
+ rtnl_lock();
+
+ switch (args.cmd) {
+ case SET_VLAN_INGRESS_PRIORITY_CMD:
+ case SET_VLAN_EGRESS_PRIORITY_CMD:
+ case SET_VLAN_FLAG_CMD:
+ case ADD_VLAN_CMD:
+ case DEL_VLAN_CMD:
+ case GET_VLAN_REALDEV_NAME_CMD:
+ case GET_VLAN_VID_CMD:
+ err = -ENODEV;
+ dev = __dev_get_by_name(net, args.device1);
+ if (!dev)
+ goto out;
+
+ err = -EINVAL;
+ if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev))
+ goto out;
+ }
+
+ switch (args.cmd) {
+ case SET_VLAN_INGRESS_PRIORITY_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ vlan_dev_set_ingress_priority(dev,
+ args.u.skb_priority,
+ args.vlan_qos);
+ err = 0;
+ break;
+
+ case SET_VLAN_EGRESS_PRIORITY_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = vlan_dev_set_egress_priority(dev,
+ args.u.skb_priority,
+ args.vlan_qos);
+ break;
+
+ case SET_VLAN_FLAG_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = vlan_dev_change_flags(dev,
+ args.vlan_qos ? args.u.flag : 0,
+ args.u.flag);
+ break;
+
+ case SET_VLAN_NAME_TYPE_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ if ((args.u.name_type >= 0) &&
+ (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+ struct vlan_net *vn;
+
+ vn = net_generic(net, vlan_net_id);
+ vn->name_type = args.u.name_type;
+ err = 0;
+ } else {
+ err = -EINVAL;
+ }
+ break;
+
+ case ADD_VLAN_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = register_vlan_device(dev, args.u.VID);
+ break;
+
+ case DEL_VLAN_CMD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ unregister_vlan_dev(dev, NULL);
+ err = 0;
+ break;
+
+ case GET_VLAN_REALDEV_NAME_CMD:
+ err = 0;
+ vlan_dev_get_realdev_name(dev, args.u.device2);
+ if (copy_to_user(arg, &args,
+ sizeof(struct vlan_ioctl_args)))
+ err = -EFAULT;
+ break;
+
+ case GET_VLAN_VID_CMD:
+ err = 0;
+ args.u.VID = vlan_dev_vlan_id(dev);
+ if (copy_to_user(arg, &args,
+ sizeof(struct vlan_ioctl_args)))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+out:
+ rtnl_unlock();
+ return err;
+}
+
+static int __net_init vlan_init_net(struct net *net)
+{
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+ int err;
+
+ vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;
+
+ err = vlan_proc_init(net);
+
+ return err;
+}
+
+static void __net_exit vlan_exit_net(struct net *net)
+{
+ vlan_proc_cleanup(net);
+}
+
+static struct pernet_operations vlan_net_ops = {
+ .init = vlan_init_net,
+ .exit = vlan_exit_net,
+ .id = &vlan_net_id,
+ .size = sizeof(struct vlan_net),
+};
+
+static int __init vlan_proto_init(void)
+{
+ int err;
+
+ pr_info("%s v%s\n", vlan_fullname, vlan_version);
+
+ err = register_pernet_subsys(&vlan_net_ops);
+ if (err < 0)
+ goto err0;
+
+ err = register_netdevice_notifier(&vlan_notifier_block);
+ if (err < 0)
+ goto err2;
+
+ err = vlan_gvrp_init();
+ if (err < 0)
+ goto err3;
+
+ err = vlan_mvrp_init();
+ if (err < 0)
+ goto err4;
+
+ err = vlan_netlink_init();
+ if (err < 0)
+ goto err5;
+
+ vlan_ioctl_set(vlan_ioctl_handler);
+ return 0;
+
+err5:
+ vlan_mvrp_uninit();
+err4:
+ vlan_gvrp_uninit();
+err3:
+ unregister_netdevice_notifier(&vlan_notifier_block);
+err2:
+ unregister_pernet_subsys(&vlan_net_ops);
+err0:
+ return err;
+}
+
+static void __exit vlan_cleanup_module(void)
+{
+ vlan_ioctl_set(NULL);
+ vlan_netlink_fini();
+
+ unregister_netdevice_notifier(&vlan_notifier_block);
+
+ unregister_pernet_subsys(&vlan_net_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ vlan_mvrp_uninit();
+ vlan_gvrp_uninit();
+}
+
+module_init(vlan_proto_init);
+module_exit(vlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
new file mode 100644
index 000000000..9d010a09a
--- /dev/null
+++ b/net/8021q/vlan.h
@@ -0,0 +1,173 @@
+#ifndef __BEN_VLAN_802_1Q_INC__
+#define __BEN_VLAN_802_1Q_INC__
+
+#include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/list.h>
+
+/* if this changes, algorithm will have to be reworked because this
+ * depends on completely exhausting the VLAN identifier space. Thus
+ * it gives constant time look-up, but in many cases it wastes memory.
+ */
+#define VLAN_GROUP_ARRAY_SPLIT_PARTS 8
+#define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)
+
+enum vlan_protos {
+ VLAN_PROTO_8021Q = 0,
+ VLAN_PROTO_8021AD,
+ VLAN_PROTO_NUM,
+};
+
+struct vlan_group {
+ unsigned int nr_vlan_devs;
+ struct hlist_node hlist; /* linked list */
+ struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
+ [VLAN_GROUP_ARRAY_SPLIT_PARTS];
+};
+
+struct vlan_info {
+ struct net_device *real_dev; /* The ethernet(like) device
+ * the vlan is attached to.
+ */
+ struct vlan_group grp;
+ struct list_head vid_list;
+ unsigned int nr_vids;
+ struct rcu_head rcu;
+};
+
+static inline unsigned int vlan_proto_idx(__be16 proto)
+{
+ switch (proto) {
+ case htons(ETH_P_8021Q):
+ return VLAN_PROTO_8021Q;
+ case htons(ETH_P_8021AD):
+ return VLAN_PROTO_8021AD;
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
+ unsigned int pidx,
+ u16 vlan_id)
+{
+ struct net_device **array;
+
+ array = vg->vlan_devices_arrays[pidx]
+ [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+ return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
+}
+
+static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
+ __be16 vlan_proto,
+ u16 vlan_id)
+{
+ return __vlan_group_get_device(vg, vlan_proto_idx(vlan_proto), vlan_id);
+}
+
+static inline void vlan_group_set_device(struct vlan_group *vg,
+ __be16 vlan_proto, u16 vlan_id,
+ struct net_device *dev)
+{
+ struct net_device **array;
+ if (!vg)
+ return;
+ array = vg->vlan_devices_arrays[vlan_proto_idx(vlan_proto)]
+ [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+ array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
+}
+
+/* Must be invoked with rcu_read_lock or with RTNL. */
+static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
+ __be16 vlan_proto, u16 vlan_id)
+{
+ struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info);
+
+ if (vlan_info)
+ return vlan_group_get_device(&vlan_info->grp,
+ vlan_proto, vlan_id);
+
+ return NULL;
+}
+
+#define vlan_group_for_each_dev(grp, i, dev) \
+ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \
+ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
+ (i) % VLAN_N_VID)))
+
+/* found in vlan_dev.c */
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio);
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio);
+int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
+
+int vlan_check_real_dev(struct net_device *real_dev,
+ __be16 protocol, u16 vlan_id);
+void vlan_setup(struct net_device *dev);
+int register_vlan_dev(struct net_device *dev);
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+
+static inline u32 vlan_get_ingress_priority(struct net_device *dev,
+ u16 vlan_tci)
+{
+ struct vlan_dev_priv *vip = vlan_dev_priv(dev);
+
+ return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7];
+}
+
+#ifdef CONFIG_VLAN_8021Q_GVRP
+int vlan_gvrp_request_join(const struct net_device *dev);
+void vlan_gvrp_request_leave(const struct net_device *dev);
+int vlan_gvrp_init_applicant(struct net_device *dev);
+void vlan_gvrp_uninit_applicant(struct net_device *dev);
+int vlan_gvrp_init(void);
+void vlan_gvrp_uninit(void);
+#else
+static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
+static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
+static inline int vlan_gvrp_init(void) { return 0; }
+static inline void vlan_gvrp_uninit(void) {}
+#endif
+
+#ifdef CONFIG_VLAN_8021Q_MVRP
+int vlan_mvrp_request_join(const struct net_device *dev);
+void vlan_mvrp_request_leave(const struct net_device *dev);
+int vlan_mvrp_init_applicant(struct net_device *dev);
+void vlan_mvrp_uninit_applicant(struct net_device *dev);
+int vlan_mvrp_init(void);
+void vlan_mvrp_uninit(void);
+#else
+static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; }
+static inline void vlan_mvrp_request_leave(const struct net_device *dev) {}
+static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; }
+static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {}
+static inline int vlan_mvrp_init(void) { return 0; }
+static inline void vlan_mvrp_uninit(void) {}
+#endif
+
+extern const char vlan_fullname[];
+extern const char vlan_version[];
+int vlan_netlink_init(void);
+void vlan_netlink_fini(void);
+
+extern struct rtnl_link_ops vlan_link_ops;
+
+extern int vlan_net_id;
+
+struct proc_dir_entry;
+
+struct vlan_net {
+ /* /proc/net/vlan */
+ struct proc_dir_entry *proc_vlan_dir;
+ /* /proc/net/vlan/config */
+ struct proc_dir_entry *proc_vlan_conf;
+ /* Determines interface naming scheme. */
+ unsigned short name_type;
+};
+
+#endif /* !(__BEN_VLAN_802_1Q_INC__) */
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
new file mode 100644
index 000000000..61bf2a06e
--- /dev/null
+++ b/net/8021q/vlan_core.c
@@ -0,0 +1,362 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
+#include <linux/export.h>
+#include "vlan.h"
+
+bool vlan_do_receive(struct sk_buff **skbp)
+{
+ struct sk_buff *skb = *skbp;
+ __be16 vlan_proto = skb->vlan_proto;
+ u16 vlan_id = skb_vlan_tag_get_id(skb);
+ struct net_device *vlan_dev;
+ struct vlan_pcpu_stats *rx_stats;
+
+ vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id);
+ if (!vlan_dev)
+ return false;
+
+ skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return false;
+
+ skb->dev = vlan_dev;
+ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
+ /* Our lower layer thinks this is not local, let's make sure.
+ * This allows the VLAN to have a different MAC than the
+ * underlying device, and still route correctly. */
+ if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr))
+ skb->pkt_type = PACKET_HOST;
+ }
+
+ if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+ unsigned int offset = skb->data - skb_mac_header(skb);
+
+ /*
+ * vlan_insert_tag expect skb->data pointing to mac header.
+ * So change skb->data before calling it and change back to
+ * original position later
+ */
+ skb_push(skb, offset);
+ skb = *skbp = vlan_insert_tag(skb, skb->vlan_proto,
+ skb->vlan_tci);
+ if (!skb)
+ return false;
+ skb_pull(skb, offset + VLAN_HLEN);
+ skb_reset_mac_len(skb);
+ }
+
+ skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
+ skb->vlan_tci = 0;
+
+ rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
+
+ u64_stats_update_begin(&rx_stats->syncp);
+ rx_stats->rx_packets++;
+ rx_stats->rx_bytes += skb->len;
+ if (skb->pkt_type == PACKET_MULTICAST)
+ rx_stats->rx_multicast++;
+ u64_stats_update_end(&rx_stats->syncp);
+
+ return true;
+}
+
+/* Must be invoked with rcu_read_lock. */
+struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
+ __be16 vlan_proto, u16 vlan_id)
+{
+ struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info);
+
+ if (vlan_info) {
+ return vlan_group_get_device(&vlan_info->grp,
+ vlan_proto, vlan_id);
+ } else {
+ /*
+ * Lower devices of master uppers (bonding, team) do not have
+ * grp assigned to themselves. Grp is assigned to upper device
+ * instead.
+ */
+ struct net_device *upper_dev;
+
+ upper_dev = netdev_master_upper_dev_get_rcu(dev);
+ if (upper_dev)
+ return __vlan_find_dev_deep_rcu(upper_dev,
+ vlan_proto, vlan_id);
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(__vlan_find_dev_deep_rcu);
+
+struct net_device *vlan_dev_real_dev(const struct net_device *dev)
+{
+ struct net_device *ret = vlan_dev_priv(dev)->real_dev;
+
+ while (is_vlan_dev(ret))
+ ret = vlan_dev_priv(ret)->real_dev;
+
+ return ret;
+}
+EXPORT_SYMBOL(vlan_dev_real_dev);
+
+u16 vlan_dev_vlan_id(const struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->vlan_id;
+}
+EXPORT_SYMBOL(vlan_dev_vlan_id);
+
+__be16 vlan_dev_vlan_proto(const struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->vlan_proto;
+}
+EXPORT_SYMBOL(vlan_dev_vlan_proto);
+
+/*
+ * vlan info and vid list
+ */
+
+static void vlan_group_free(struct vlan_group *grp)
+{
+ int i, j;
+
+ for (i = 0; i < VLAN_PROTO_NUM; i++)
+ for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++)
+ kfree(grp->vlan_devices_arrays[i][j]);
+}
+
+static void vlan_info_free(struct vlan_info *vlan_info)
+{
+ vlan_group_free(&vlan_info->grp);
+ kfree(vlan_info);
+}
+
+static void vlan_info_rcu_free(struct rcu_head *rcu)
+{
+ vlan_info_free(container_of(rcu, struct vlan_info, rcu));
+}
+
+static struct vlan_info *vlan_info_alloc(struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+
+ vlan_info = kzalloc(sizeof(struct vlan_info), GFP_KERNEL);
+ if (!vlan_info)
+ return NULL;
+
+ vlan_info->real_dev = dev;
+ INIT_LIST_HEAD(&vlan_info->vid_list);
+ return vlan_info;
+}
+
+struct vlan_vid_info {
+ struct list_head list;
+ __be16 proto;
+ u16 vid;
+ int refcount;
+};
+
+static bool vlan_hw_filter_capable(const struct net_device *dev,
+ const struct vlan_vid_info *vid_info)
+{
+ if (vid_info->proto == htons(ETH_P_8021Q) &&
+ dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
+ return true;
+ if (vid_info->proto == htons(ETH_P_8021AD) &&
+ dev->features & NETIF_F_HW_VLAN_STAG_FILTER)
+ return true;
+ return false;
+}
+
+static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info,
+ __be16 proto, u16 vid)
+{
+ struct vlan_vid_info *vid_info;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (vid_info->proto == proto && vid_info->vid == vid)
+ return vid_info;
+ }
+ return NULL;
+}
+
+static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
+{
+ struct vlan_vid_info *vid_info;
+
+ vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL);
+ if (!vid_info)
+ return NULL;
+ vid_info->proto = proto;
+ vid_info->vid = vid;
+
+ return vid_info;
+}
+
+static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
+ struct vlan_vid_info **pvid_info)
+{
+ struct net_device *dev = vlan_info->real_dev;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct vlan_vid_info *vid_info;
+ int err;
+
+ vid_info = vlan_vid_info_alloc(proto, vid);
+ if (!vid_info)
+ return -ENOMEM;
+
+ if (vlan_hw_filter_capable(dev, vid_info)) {
+ err = ops->ndo_vlan_rx_add_vid(dev, proto, vid);
+ if (err) {
+ kfree(vid_info);
+ return err;
+ }
+ }
+ list_add(&vid_info->list, &vlan_info->vid_list);
+ vlan_info->nr_vids++;
+ *pvid_info = vid_info;
+ return 0;
+}
+
+int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
+{
+ struct vlan_info *vlan_info;
+ struct vlan_vid_info *vid_info;
+ bool vlan_info_created = false;
+ int err;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info) {
+ vlan_info = vlan_info_alloc(dev);
+ if (!vlan_info)
+ return -ENOMEM;
+ vlan_info_created = true;
+ }
+ vid_info = vlan_vid_info_get(vlan_info, proto, vid);
+ if (!vid_info) {
+ err = __vlan_vid_add(vlan_info, proto, vid, &vid_info);
+ if (err)
+ goto out_free_vlan_info;
+ }
+ vid_info->refcount++;
+
+ if (vlan_info_created)
+ rcu_assign_pointer(dev->vlan_info, vlan_info);
+
+ return 0;
+
+out_free_vlan_info:
+ if (vlan_info_created)
+ kfree(vlan_info);
+ return err;
+}
+EXPORT_SYMBOL(vlan_vid_add);
+
+static void __vlan_vid_del(struct vlan_info *vlan_info,
+ struct vlan_vid_info *vid_info)
+{
+ struct net_device *dev = vlan_info->real_dev;
+ const struct net_device_ops *ops = dev->netdev_ops;
+ __be16 proto = vid_info->proto;
+ u16 vid = vid_info->vid;
+ int err;
+
+ if (vlan_hw_filter_capable(dev, vid_info)) {
+ err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
+ if (err) {
+ pr_warn("failed to kill vid %04x/%d for device %s\n",
+ proto, vid, dev->name);
+ }
+ }
+ list_del(&vid_info->list);
+ kfree(vid_info);
+ vlan_info->nr_vids--;
+}
+
+void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
+{
+ struct vlan_info *vlan_info;
+ struct vlan_vid_info *vid_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return;
+
+ vid_info = vlan_vid_info_get(vlan_info, proto, vid);
+ if (!vid_info)
+ return;
+ vid_info->refcount--;
+ if (vid_info->refcount == 0) {
+ __vlan_vid_del(vlan_info, vid_info);
+ if (vlan_info->nr_vids == 0) {
+ RCU_INIT_POINTER(dev->vlan_info, NULL);
+ call_rcu(&vlan_info->rcu, vlan_info_rcu_free);
+ }
+ }
+}
+EXPORT_SYMBOL(vlan_vid_del);
+
+int vlan_vids_add_by_dev(struct net_device *dev,
+ const struct net_device *by_dev)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ int err;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(by_dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ err = vlan_vid_add(dev, vid_info->proto, vid_info->vid);
+ if (err)
+ goto unwind;
+ }
+ return 0;
+
+unwind:
+ list_for_each_entry_continue_reverse(vid_info,
+ &vlan_info->vid_list,
+ list) {
+ vlan_vid_del(dev, vid_info->proto, vid_info->vid);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(vlan_vids_add_by_dev);
+
+void vlan_vids_del_by_dev(struct net_device *dev,
+ const struct net_device *by_dev)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(by_dev->vlan_info);
+ if (!vlan_info)
+ return;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list)
+ vlan_vid_del(dev, vid_info->proto, vid_info->vid);
+}
+EXPORT_SYMBOL(vlan_vids_del_by_dev);
+
+bool vlan_uses_dev(const struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return false;
+ return vlan_info->grp.nr_vlan_devs ? true : false;
+}
+EXPORT_SYMBOL(vlan_uses_dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
new file mode 100644
index 000000000..01d7ba840
--- /dev/null
+++ b/net/8021q/vlan_dev.c
@@ -0,0 +1,804 @@
+/* -*- linux-c -*-
+ * INET 802.1Q VLAN
+ * Ethernet-type device handling.
+ *
+ * Authors: Ben Greear <greearb@candelatech.com>
+ * Please send support related email to: netdev@vger.kernel.org
+ * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
+ *
+ * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com>
+ * - reset skb->pkt_type on incoming packets when MAC was changed
+ * - see that changed MAC is saddr for outgoing packets
+ * Oct 20, 2001: Ard van Breeman:
+ * - Fix MC-list, finally.
+ * - Flush MC-list on VLAN destroy.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <net/arp.h>
+
+#include "vlan.h"
+#include "vlanproc.h"
+#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
+
+/*
+ * Create the VLAN header for an arbitrary protocol layer
+ *
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
+ *
+ * This is called when the SKB is moving down the stack towards the
+ * physical devices.
+ */
+static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr,
+ unsigned int len)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_hdr *vhdr;
+ unsigned int vhdrlen = 0;
+ u16 vlan_tci = 0;
+ int rc;
+
+ if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) {
+ vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN);
+
+ vlan_tci = vlan->vlan_id;
+ vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
+ vhdr->h_vlan_TCI = htons(vlan_tci);
+
+ /*
+ * Set the protocol type. For a packet of type ETH_P_802_3/2 we
+ * put the length in here instead.
+ */
+ if (type != ETH_P_802_3 && type != ETH_P_802_2)
+ vhdr->h_vlan_encapsulated_proto = htons(type);
+ else
+ vhdr->h_vlan_encapsulated_proto = htons(len);
+
+ skb->protocol = vlan->vlan_proto;
+ type = ntohs(vlan->vlan_proto);
+ vhdrlen = VLAN_HLEN;
+ }
+
+ /* Before delegating work to the lower layer, enter our MAC-address */
+ if (saddr == NULL)
+ saddr = dev->dev_addr;
+
+ /* Now make the underlying real hard header */
+ dev = vlan->real_dev;
+ rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen);
+ if (rc > 0)
+ rc += vhdrlen;
+ return rc;
+}
+
+static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ if (vlan->netpoll)
+ netpoll_send_skb(vlan->netpoll, skb);
+#else
+ BUG();
+#endif
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
+ unsigned int len;
+ int ret;
+
+ /* Handle non-VLAN frames if they are sent to us, for example by DHCP.
+ *
+ * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
+ * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
+ */
+ if (veth->h_vlan_proto != vlan->vlan_proto ||
+ vlan->flags & VLAN_FLAG_REORDER_HDR) {
+ u16 vlan_tci;
+ vlan_tci = vlan->vlan_id;
+ vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
+ __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci);
+ }
+
+ skb->dev = vlan->real_dev;
+ len = skb->len;
+ if (unlikely(netpoll_tx_running(dev)))
+ return vlan_netpoll_send_skb(vlan, skb);
+
+ ret = dev_queue_xmit(skb);
+
+ if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+ struct vlan_pcpu_stats *stats;
+
+ stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_packets++;
+ stats->tx_bytes += len;
+ u64_stats_update_end(&stats->syncp);
+ } else {
+ this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped);
+ }
+
+ return ret;
+}
+
+static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+ /* TODO: gotta make sure the underlying layer can handle it,
+ * maybe an IFF_VLAN_CAPABLE flag for devices?
+ */
+ if (vlan_dev_priv(dev)->real_dev->mtu < new_mtu)
+ return -ERANGE;
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio)
+ vlan->nr_ingress_mappings--;
+ else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio)
+ vlan->nr_ingress_mappings++;
+
+ vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio;
+}
+
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+ u32 skb_prio, u16 vlan_prio)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_priority_tci_mapping *mp = NULL;
+ struct vlan_priority_tci_mapping *np;
+ u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
+
+ /* See if a priority mapping exists.. */
+ mp = vlan->egress_priority_map[skb_prio & 0xF];
+ while (mp) {
+ if (mp->priority == skb_prio) {
+ if (mp->vlan_qos && !vlan_qos)
+ vlan->nr_egress_mappings--;
+ else if (!mp->vlan_qos && vlan_qos)
+ vlan->nr_egress_mappings++;
+ mp->vlan_qos = vlan_qos;
+ return 0;
+ }
+ mp = mp->next;
+ }
+
+ /* Create a new mapping then. */
+ mp = vlan->egress_priority_map[skb_prio & 0xF];
+ np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL);
+ if (!np)
+ return -ENOBUFS;
+
+ np->next = mp;
+ np->priority = skb_prio;
+ np->vlan_qos = vlan_qos;
+ /* Before inserting this element in hash table, make sure all its fields
+ * are committed to memory.
+ * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask()
+ */
+ smp_wmb();
+ vlan->egress_priority_map[skb_prio & 0xF] = np;
+ if (vlan_qos)
+ vlan->nr_egress_mappings++;
+ return 0;
+}
+
+/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
+int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ u32 old_flags = vlan->flags;
+
+ if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ return -EINVAL;
+
+ vlan->flags = (old_flags & ~mask) | (flags & mask);
+
+ if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) {
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_join(dev);
+ else
+ vlan_gvrp_request_leave(dev);
+ }
+
+ if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) {
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_join(dev);
+ else
+ vlan_mvrp_request_leave(dev);
+ }
+ return 0;
+}
+
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
+{
+ strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
+}
+
+static int vlan_dev_open(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ int err;
+
+ if (!(real_dev->flags & IFF_UP) &&
+ !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+ return -ENETDOWN;
+
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+ err = dev_uc_add(real_dev, dev->dev_addr);
+ if (err < 0)
+ goto out;
+ }
+
+ if (dev->flags & IFF_ALLMULTI) {
+ err = dev_set_allmulti(real_dev, 1);
+ if (err < 0)
+ goto del_unicast;
+ }
+ if (dev->flags & IFF_PROMISC) {
+ err = dev_set_promiscuity(real_dev, 1);
+ if (err < 0)
+ goto clear_allmulti;
+ }
+
+ ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr);
+
+ if (vlan->flags & VLAN_FLAG_GVRP)
+ vlan_gvrp_request_join(dev);
+
+ if (vlan->flags & VLAN_FLAG_MVRP)
+ vlan_mvrp_request_join(dev);
+
+ if (netif_carrier_ok(real_dev))
+ netif_carrier_on(dev);
+ return 0;
+
+clear_allmulti:
+ if (dev->flags & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, -1);
+del_unicast:
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
+ dev_uc_del(real_dev, dev->dev_addr);
+out:
+ netif_carrier_off(dev);
+ return err;
+}
+
+static int vlan_dev_stop(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+
+ dev_mc_unsync(real_dev, dev);
+ dev_uc_unsync(real_dev, dev);
+ if (dev->flags & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, -1);
+ if (dev->flags & IFF_PROMISC)
+ dev_set_promiscuity(real_dev, -1);
+
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
+ dev_uc_del(real_dev, dev->dev_addr);
+
+ netif_carrier_off(dev);
+ return 0;
+}
+
+static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ struct sockaddr *addr = p;
+ int err;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ if (!(dev->flags & IFF_UP))
+ goto out;
+
+ if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) {
+ err = dev_uc_add(real_dev, addr->sa_data);
+ if (err < 0)
+ return err;
+ }
+
+ if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
+ dev_uc_del(real_dev, dev->dev_addr);
+
+out:
+ ether_addr_copy(dev->dev_addr, addr->sa_data);
+ return 0;
+}
+
+static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ struct ifreq ifrr;
+ int err = -EOPNOTSUPP;
+
+ strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ);
+ ifrr.ifr_ifru = ifr->ifr_ifru;
+
+ switch (cmd) {
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSMIIREG:
+ case SIOCSHWTSTAMP:
+ case SIOCGHWTSTAMP:
+ if (netif_device_present(real_dev) && ops->ndo_do_ioctl)
+ err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd);
+ break;
+ }
+
+ if (!err)
+ ifr->ifr_ifru = ifrr.ifr_ifru;
+
+ return err;
+}
+
+static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int err = 0;
+
+ if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
+ err = ops->ndo_neigh_setup(real_dev, pa);
+
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_FCOE)
+static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = 0;
+
+ if (ops->ndo_fcoe_ddp_setup)
+ rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc);
+
+ return rc;
+}
+
+static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int len = 0;
+
+ if (ops->ndo_fcoe_ddp_done)
+ len = ops->ndo_fcoe_ddp_done(real_dev, xid);
+
+ return len;
+}
+
+static int vlan_dev_fcoe_enable(struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
+
+ if (ops->ndo_fcoe_enable)
+ rc = ops->ndo_fcoe_enable(real_dev);
+ return rc;
+}
+
+static int vlan_dev_fcoe_disable(struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
+
+ if (ops->ndo_fcoe_disable)
+ rc = ops->ndo_fcoe_disable(real_dev);
+ return rc;
+}
+
+static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = -EINVAL;
+
+ if (ops->ndo_fcoe_get_wwn)
+ rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
+ return rc;
+}
+
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device_ops *ops = real_dev->netdev_ops;
+ int rc = 0;
+
+ if (ops->ndo_fcoe_ddp_target)
+ rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
+
+ return rc;
+}
+#endif
+
+static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ if (dev->flags & IFF_UP) {
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
+ }
+}
+
+static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
+{
+ dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
+ dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
+}
+
+/*
+ * vlan network devices have devices nesting below it, and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key vlan_netdev_xmit_lock_key;
+static struct lock_class_key vlan_netdev_addr_lock_key;
+
+static void vlan_dev_set_lockdep_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_subclass)
+{
+ lockdep_set_class_and_subclass(&txq->_xmit_lock,
+ &vlan_netdev_xmit_lock_key,
+ *(int *)_subclass);
+}
+
+static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass)
+{
+ lockdep_set_class_and_subclass(&dev->addr_list_lock,
+ &vlan_netdev_addr_lock_key,
+ subclass);
+ netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass);
+}
+
+static int vlan_dev_get_lock_subclass(struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->nest_level;
+}
+
+static const struct header_ops vlan_header_ops = {
+ .create = vlan_dev_hard_header,
+ .parse = eth_header_parse,
+};
+
+static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr,
+ unsigned int len)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+
+ if (saddr == NULL)
+ saddr = dev->dev_addr;
+
+ return dev_hard_header(skb, real_dev, type, daddr, saddr, len);
+}
+
+static const struct header_ops vlan_passthru_header_ops = {
+ .create = vlan_passthru_hard_header,
+ .parse = eth_header_parse,
+};
+
+static struct device_type vlan_type = {
+ .name = "vlan",
+};
+
+static const struct net_device_ops vlan_netdev_ops;
+
+static int vlan_dev_init(struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ netif_carrier_off(dev);
+
+ /* IFF_BROADCAST|IFF_MULTICAST; ??? */
+ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+ IFF_MASTER | IFF_SLAVE);
+ dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) |
+ (1<<__LINK_STATE_DORMANT))) |
+ (1<<__LINK_STATE_PRESENT);
+
+ dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+ NETIF_F_ALL_FCOE;
+
+ dev->features |= real_dev->vlan_features | NETIF_F_LLTX |
+ NETIF_F_GSO_SOFTWARE;
+ dev->gso_max_size = real_dev->gso_max_size;
+ if (dev->features & NETIF_F_VLAN_FEATURES)
+ netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
+
+ dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
+
+ /* ipv6 shared card related stuff */
+ dev->dev_id = real_dev->dev_id;
+
+ if (is_zero_ether_addr(dev->dev_addr))
+ eth_hw_addr_inherit(dev, real_dev);
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
+
+#if IS_ENABLED(CONFIG_FCOE)
+ dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid;
+#endif
+
+ dev->needed_headroom = real_dev->needed_headroom;
+ if (vlan_hw_offload_capable(real_dev->features,
+ vlan_dev_priv(dev)->vlan_proto)) {
+ dev->header_ops = &vlan_passthru_header_ops;
+ dev->hard_header_len = real_dev->hard_header_len;
+ } else {
+ dev->header_ops = &vlan_header_ops;
+ dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
+ }
+
+ dev->netdev_ops = &vlan_netdev_ops;
+
+ SET_NETDEV_DEVTYPE(dev, &vlan_type);
+
+ vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
+
+ vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
+ if (!vlan_dev_priv(dev)->vlan_pcpu_stats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void vlan_dev_uninit(struct net_device *dev)
+{
+ struct vlan_priority_tci_mapping *pm;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+ while ((pm = vlan->egress_priority_map[i]) != NULL) {
+ vlan->egress_priority_map[i] = pm->next;
+ kfree(pm);
+ }
+ }
+}
+
+static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ netdev_features_t old_features = features;
+
+ features = netdev_intersect_features(features, real_dev->vlan_features);
+ features |= NETIF_F_RXCSUM;
+ features = netdev_intersect_features(features, real_dev->features);
+
+ features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE);
+ features |= NETIF_F_LLTX;
+
+ return features;
+}
+
+static int vlan_ethtool_get_settings(struct net_device *dev,
+ struct ethtool_cmd *cmd)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ return __ethtool_get_settings(vlan->real_dev, cmd);
+}
+
+static void vlan_ethtool_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, vlan_fullname, sizeof(info->driver));
+ strlcpy(info->version, vlan_version, sizeof(info->version));
+ strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+}
+
+static int vlan_ethtool_get_ts_info(struct net_device *dev,
+ struct ethtool_ts_info *info)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
+
+ if (ops->get_ts_info) {
+ return ops->get_ts_info(vlan->real_dev, info);
+ } else {
+ info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ }
+
+ return 0;
+}
+
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ struct vlan_pcpu_stats *p;
+ u32 rx_errors = 0, tx_dropped = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes;
+ unsigned int start;
+
+ p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
+ do {
+ start = u64_stats_fetch_begin_irq(&p->syncp);
+ rxpackets = p->rx_packets;
+ rxbytes = p->rx_bytes;
+ rxmulticast = p->rx_multicast;
+ txpackets = p->tx_packets;
+ txbytes = p->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+
+ stats->rx_packets += rxpackets;
+ stats->rx_bytes += rxbytes;
+ stats->multicast += rxmulticast;
+ stats->tx_packets += txpackets;
+ stats->tx_bytes += txbytes;
+ /* rx_errors & tx_dropped are u32 */
+ rx_errors += p->rx_errors;
+ tx_dropped += p->tx_dropped;
+ }
+ stats->rx_errors = rx_errors;
+ stats->tx_dropped = tx_dropped;
+
+ return stats;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void vlan_dev_poll_controller(struct net_device *dev)
+{
+ return;
+}
+
+static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
+ struct netpoll *netpoll;
+ int err = 0;
+
+ netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!netpoll)
+ goto out;
+
+ err = __netpoll_setup(netpoll, real_dev);
+ if (err) {
+ kfree(netpoll);
+ goto out;
+ }
+
+ vlan->netpoll = netpoll;
+
+out:
+ return err;
+}
+
+static void vlan_dev_netpoll_cleanup(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan= vlan_dev_priv(dev);
+ struct netpoll *netpoll = vlan->netpoll;
+
+ if (!netpoll)
+ return;
+
+ vlan->netpoll = NULL;
+
+ __netpoll_free_async(netpoll);
+}
+#endif /* CONFIG_NET_POLL_CONTROLLER */
+
+static int vlan_dev_get_iflink(const struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ return real_dev->ifindex;
+}
+
+static const struct ethtool_ops vlan_ethtool_ops = {
+ .get_settings = vlan_ethtool_get_settings,
+ .get_drvinfo = vlan_ethtool_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_ts_info = vlan_ethtool_get_ts_info,
+};
+
+static const struct net_device_ops vlan_netdev_ops = {
+ .ndo_change_mtu = vlan_dev_change_mtu,
+ .ndo_init = vlan_dev_init,
+ .ndo_uninit = vlan_dev_uninit,
+ .ndo_open = vlan_dev_open,
+ .ndo_stop = vlan_dev_stop,
+ .ndo_start_xmit = vlan_dev_hard_start_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = vlan_dev_set_mac_address,
+ .ndo_set_rx_mode = vlan_dev_set_rx_mode,
+ .ndo_change_rx_flags = vlan_dev_change_rx_flags,
+ .ndo_do_ioctl = vlan_dev_ioctl,
+ .ndo_neigh_setup = vlan_dev_neigh_setup,
+ .ndo_get_stats64 = vlan_dev_get_stats64,
+#if IS_ENABLED(CONFIG_FCOE)
+ .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup,
+ .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done,
+ .ndo_fcoe_enable = vlan_dev_fcoe_enable,
+ .ndo_fcoe_disable = vlan_dev_fcoe_disable,
+ .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+ .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target,
+#endif
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = vlan_dev_poll_controller,
+ .ndo_netpoll_setup = vlan_dev_netpoll_setup,
+ .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
+#endif
+ .ndo_fix_features = vlan_dev_fix_features,
+ .ndo_get_lock_subclass = vlan_dev_get_lock_subclass,
+ .ndo_get_iflink = vlan_dev_get_iflink,
+};
+
+static void vlan_dev_free(struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ free_percpu(vlan->vlan_pcpu_stats);
+ vlan->vlan_pcpu_stats = NULL;
+ free_netdev(dev);
+}
+
+void vlan_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->priv_flags |= IFF_802_1Q_VLAN;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netif_keep_dst(dev);
+ dev->tx_queue_len = 0;
+
+ dev->netdev_ops = &vlan_netdev_ops;
+ dev->destructor = vlan_dev_free;
+ dev->ethtool_ops = &vlan_ethtool_ops;
+
+ eth_zero_addr(dev->broadcast);
+}
diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c
new file mode 100644
index 000000000..66a80320b
--- /dev/null
+++ b/net/8021q/vlan_gvrp.c
@@ -0,0 +1,70 @@
+/*
+ * IEEE 802.1Q GARP VLAN Registration Protocol (GVRP)
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/if_vlan.h>
+#include <net/garp.h>
+#include "vlan.h"
+
+#define GARP_GVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 }
+
+enum gvrp_attributes {
+ GVRP_ATTR_INVALID,
+ GVRP_ATTR_VID,
+ __GVRP_ATTR_MAX
+};
+#define GVRP_ATTR_MAX (__GVRP_ATTR_MAX - 1)
+
+static struct garp_application vlan_gvrp_app __read_mostly = {
+ .proto.group_address = GARP_GVRP_ADDRESS,
+ .maxattr = GVRP_ATTR_MAX,
+ .type = GARP_APPLICATION_GVRP,
+};
+
+int vlan_gvrp_request_join(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return 0;
+ return garp_request_join(vlan->real_dev, &vlan_gvrp_app,
+ &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID);
+}
+
+void vlan_gvrp_request_leave(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return;
+ garp_request_leave(vlan->real_dev, &vlan_gvrp_app,
+ &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID);
+}
+
+int vlan_gvrp_init_applicant(struct net_device *dev)
+{
+ return garp_init_applicant(dev, &vlan_gvrp_app);
+}
+
+void vlan_gvrp_uninit_applicant(struct net_device *dev)
+{
+ garp_uninit_applicant(dev, &vlan_gvrp_app);
+}
+
+int __init vlan_gvrp_init(void)
+{
+ return garp_register_application(&vlan_gvrp_app);
+}
+
+void vlan_gvrp_uninit(void)
+{
+ garp_unregister_application(&vlan_gvrp_app);
+}
diff --git a/net/8021q/vlan_mvrp.c b/net/8021q/vlan_mvrp.c
new file mode 100644
index 000000000..e0fe09180
--- /dev/null
+++ b/net/8021q/vlan_mvrp.c
@@ -0,0 +1,76 @@
+/*
+ * IEEE 802.1Q Multiple VLAN Registration Protocol (MVRP)
+ *
+ * Copyright (c) 2012 Massachusetts Institute of Technology
+ *
+ * Adapted from code in net/8021q/vlan_gvrp.c
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/mrp.h>
+#include "vlan.h"
+
+#define MRP_MVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 }
+
+enum mvrp_attributes {
+ MVRP_ATTR_INVALID,
+ MVRP_ATTR_VID,
+ __MVRP_ATTR_MAX
+};
+#define MVRP_ATTR_MAX (__MVRP_ATTR_MAX - 1)
+
+static struct mrp_application vlan_mrp_app __read_mostly = {
+ .type = MRP_APPLICATION_MVRP,
+ .maxattr = MVRP_ATTR_MAX,
+ .pkttype.type = htons(ETH_P_MVRP),
+ .group_address = MRP_MVRP_ADDRESS,
+ .version = 0,
+};
+
+int vlan_mvrp_request_join(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return 0;
+ return mrp_request_join(vlan->real_dev, &vlan_mrp_app,
+ &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID);
+}
+
+void vlan_mvrp_request_leave(const struct net_device *dev)
+{
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ __be16 vlan_id = htons(vlan->vlan_id);
+
+ if (vlan->vlan_proto != htons(ETH_P_8021Q))
+ return;
+ mrp_request_leave(vlan->real_dev, &vlan_mrp_app,
+ &vlan_id, sizeof(vlan_id), MVRP_ATTR_VID);
+}
+
+int vlan_mvrp_init_applicant(struct net_device *dev)
+{
+ return mrp_init_applicant(dev, &vlan_mrp_app);
+}
+
+void vlan_mvrp_uninit_applicant(struct net_device *dev)
+{
+ mrp_uninit_applicant(dev, &vlan_mrp_app);
+}
+
+int __init vlan_mvrp_init(void)
+{
+ return mrp_register_application(&vlan_mrp_app);
+}
+
+void vlan_mvrp_uninit(void)
+{
+ mrp_unregister_application(&vlan_mrp_app);
+}
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
new file mode 100644
index 000000000..c92b52f37
--- /dev/null
+++ b/net/8021q/vlan_netlink.c
@@ -0,0 +1,273 @@
+/*
+ * VLAN netlink control interface
+ *
+ * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include "vlan.h"
+
+
+static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = {
+ [IFLA_VLAN_ID] = { .type = NLA_U16 },
+ [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) },
+ [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED },
+ [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED },
+ [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 },
+};
+
+static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+ [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+};
+
+
+static inline int vlan_validate_qos_map(struct nlattr *attr)
+{
+ if (!attr)
+ return 0;
+ return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy);
+}
+
+static int vlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ifla_vlan_flags *flags;
+ u16 id;
+ int err;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ return -EINVAL;
+
+ if (data[IFLA_VLAN_PROTOCOL]) {
+ switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ }
+
+ if (data[IFLA_VLAN_ID]) {
+ id = nla_get_u16(data[IFLA_VLAN_ID]);
+ if (id >= VLAN_VID_MASK)
+ return -ERANGE;
+ }
+ if (data[IFLA_VLAN_FLAGS]) {
+ flags = nla_data(data[IFLA_VLAN_FLAGS]);
+ if ((flags->flags & flags->mask) &
+ ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ return -EINVAL;
+ }
+
+ err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
+ if (err < 0)
+ return err;
+ err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static int vlan_changelink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ifla_vlan_flags *flags;
+ struct ifla_vlan_qos_mapping *m;
+ struct nlattr *attr;
+ int rem;
+
+ if (data[IFLA_VLAN_FLAGS]) {
+ flags = nla_data(data[IFLA_VLAN_FLAGS]);
+ vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ }
+ if (data[IFLA_VLAN_INGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
+ m = nla_data(attr);
+ vlan_dev_set_ingress_priority(dev, m->to, m->from);
+ }
+ }
+ if (data[IFLA_VLAN_EGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
+ m = nla_data(attr);
+ vlan_dev_set_egress_priority(dev, m->from, m->to);
+ }
+ }
+ return 0;
+}
+
+static int vlan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev;
+ __be16 proto;
+ int err;
+
+ if (!data[IFLA_VLAN_ID])
+ return -EINVAL;
+
+ if (!tb[IFLA_LINK])
+ return -EINVAL;
+ real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+ if (!real_dev)
+ return -ENODEV;
+
+ if (data[IFLA_VLAN_PROTOCOL])
+ proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
+ else
+ proto = htons(ETH_P_8021Q);
+
+ vlan->vlan_proto = proto;
+ vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]);
+ vlan->real_dev = real_dev;
+ vlan->flags = VLAN_FLAG_REORDER_HDR;
+
+ err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);
+ if (err < 0)
+ return err;
+
+ if (!tb[IFLA_MTU])
+ dev->mtu = real_dev->mtu;
+ else if (dev->mtu > real_dev->mtu)
+ return -EINVAL;
+
+ err = vlan_changelink(dev, tb, data);
+ if (err < 0)
+ return err;
+
+ return register_vlan_dev(dev);
+}
+
+static inline size_t vlan_qos_map_size(unsigned int n)
+{
+ if (n == 0)
+ return 0;
+ /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */
+ return nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n;
+}
+
+static size_t vlan_get_size(const struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+
+ return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */
+ nla_total_size(2) + /* IFLA_VLAN_ID */
+ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */
+ vlan_qos_map_size(vlan->nr_ingress_mappings) +
+ vlan_qos_map_size(vlan->nr_egress_mappings);
+}
+
+static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct vlan_priority_tci_mapping *pm;
+ struct ifla_vlan_flags f;
+ struct ifla_vlan_qos_mapping m;
+ struct nlattr *nest;
+ unsigned int i;
+
+ if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) ||
+ nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id))
+ goto nla_put_failure;
+ if (vlan->flags) {
+ f.flags = vlan->flags;
+ f.mask = ~0;
+ if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f))
+ goto nla_put_failure;
+ }
+ if (vlan->nr_ingress_mappings) {
+ nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) {
+ if (!vlan->ingress_priority_map[i])
+ continue;
+
+ m.from = i;
+ m.to = vlan->ingress_priority_map[i];
+ if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
+ sizeof(m), &m))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+
+ if (vlan->nr_egress_mappings) {
+ nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+ for (pm = vlan->egress_priority_map[i]; pm;
+ pm = pm->next) {
+ if (!pm->vlan_qos)
+ continue;
+
+ m.from = pm->priority;
+ m.to = (pm->vlan_qos >> 13) & 0x7;
+ if (nla_put(skb, IFLA_VLAN_QOS_MAPPING,
+ sizeof(m), &m))
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct net *vlan_get_link_net(const struct net_device *dev)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ return dev_net(real_dev);
+}
+
+struct rtnl_link_ops vlan_link_ops __read_mostly = {
+ .kind = "vlan",
+ .maxtype = IFLA_VLAN_MAX,
+ .policy = vlan_policy,
+ .priv_size = sizeof(struct vlan_dev_priv),
+ .setup = vlan_setup,
+ .validate = vlan_validate,
+ .newlink = vlan_newlink,
+ .changelink = vlan_changelink,
+ .dellink = unregister_vlan_dev,
+ .get_size = vlan_get_size,
+ .fill_info = vlan_fill_info,
+ .get_link_net = vlan_get_link_net,
+};
+
+int __init vlan_netlink_init(void)
+{
+ return rtnl_link_register(&vlan_link_ops);
+}
+
+void __exit vlan_netlink_fini(void)
+{
+ rtnl_link_unregister(&vlan_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("vlan");
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
new file mode 100644
index 000000000..ae63cf72a
--- /dev/null
+++ b/net/8021q/vlanproc.c
@@ -0,0 +1,326 @@
+/******************************************************************************
+ * vlanproc.c VLAN Module. /proc filesystem interface.
+ *
+ * This module is completely hardware-independent and provides
+ * access to the router using Linux /proc filesystem.
+ *
+ * Author: Ben Greear, <greearb@candelatech.com> coppied from wanproc.c
+ * by: Gene Kozin <genek@compuserve.com>
+ *
+ * Copyright: (c) 1998 Ben Greear
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * ============================================================================
+ * Jan 20, 1998 Ben Greear Initial Version
+ *****************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "vlanproc.h"
+#include "vlan.h"
+
+/****** Function Prototypes *************************************************/
+
+/* Methods for preparing data for reading proc entries */
+static int vlan_seq_show(struct seq_file *seq, void *v);
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos);
+static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+static void vlan_seq_stop(struct seq_file *seq, void *);
+static int vlandev_seq_show(struct seq_file *seq, void *v);
+
+/*
+ * Global Data
+ */
+
+
+/*
+ * Names of the proc directory entries
+ */
+
+static const char name_root[] = "vlan";
+static const char name_conf[] = "config";
+
+/*
+ * Structures for interfacing with the /proc filesystem.
+ * VLAN creates its own directory /proc/net/vlan with the following
+ * entries:
+ * config device status/configuration
+ * <device> entry for each device
+ */
+
+/*
+ * Generic /proc/net/vlan/<file> file and inode operations
+ */
+
+static const struct seq_operations vlan_seq_ops = {
+ .start = vlan_seq_start,
+ .next = vlan_seq_next,
+ .stop = vlan_seq_stop,
+ .show = vlan_seq_show,
+};
+
+static int vlan_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &vlan_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations vlan_fops = {
+ .owner = THIS_MODULE,
+ .open = vlan_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/*
+ * /proc/net/vlan/<device> file and inode operations
+ */
+
+static int vlandev_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vlandev_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations vlandev_fops = {
+ .owner = THIS_MODULE,
+ .open = vlandev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Proc filesystem directory entries.
+ */
+
+/* Strings */
+static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = {
+ [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID",
+ [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD",
+ [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD",
+ [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID",
+};
+/*
+ * Interface functions
+ */
+
+/*
+ * Clean up /proc/net/vlan entries
+ */
+
+void vlan_proc_cleanup(struct net *net)
+{
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+ if (vn->proc_vlan_conf)
+ remove_proc_entry(name_conf, vn->proc_vlan_dir);
+
+ if (vn->proc_vlan_dir)
+ remove_proc_entry(name_root, net->proc_net);
+
+ /* Dynamically added entries should be cleaned up as their vlan_device
+ * is removed, so we should not have to take care of it here...
+ */
+}
+
+/*
+ * Create /proc/net/vlan entries
+ */
+
+int __net_init vlan_proc_init(struct net *net)
+{
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+ vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net);
+ if (!vn->proc_vlan_dir)
+ goto err;
+
+ vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
+ vn->proc_vlan_dir, &vlan_fops);
+ if (!vn->proc_vlan_conf)
+ goto err;
+ return 0;
+
+err:
+ pr_err("can't create entry in proc filesystem!\n");
+ vlan_proc_cleanup(net);
+ return -ENOBUFS;
+}
+
+/*
+ * Add directory entry for VLAN device.
+ */
+
+int vlan_proc_add_dev(struct net_device *vlandev)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+ struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
+
+ if (!strcmp(vlandev->name, name_conf))
+ return -EINVAL;
+ vlan->dent =
+ proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
+ vn->proc_vlan_dir, &vlandev_fops, vlandev);
+ if (!vlan->dent)
+ return -ENOBUFS;
+ return 0;
+}
+
+/*
+ * Delete directory entry for VLAN device.
+ */
+int vlan_proc_rem_dev(struct net_device *vlandev)
+{
+ /** NOTE: This will consume the memory pointed to by dent, it seems. */
+ proc_remove(vlan_dev_priv(vlandev)->dent);
+ vlan_dev_priv(vlandev)->dent = NULL;
+ return 0;
+}
+
+/****** Proc filesystem entry points ****************************************/
+
+/*
+ * The following few functions build the content of /proc/net/vlan/config
+ */
+
+/* start read of /proc/net/vlan/config */
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct net_device *dev;
+ struct net *net = seq_file_net(seq);
+ loff_t i = 1;
+
+ rcu_read_lock();
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for_each_netdev_rcu(net, dev) {
+ if (!is_vlan_dev(dev))
+ continue;
+
+ if (i++ == *pos)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net_device *dev;
+ struct net *net = seq_file_net(seq);
+
+ ++*pos;
+
+ dev = v;
+ if (v == SEQ_START_TOKEN)
+ dev = net_device_entry(&net->dev_base_head);
+
+ for_each_netdev_continue_rcu(net, dev) {
+ if (!is_vlan_dev(dev))
+ continue;
+
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void vlan_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int vlan_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_net(seq);
+ struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+ if (v == SEQ_START_TOKEN) {
+ const char *nmtype = NULL;
+
+ seq_puts(seq, "VLAN Dev name | VLAN ID\n");
+
+ if (vn->name_type < ARRAY_SIZE(vlan_name_type_str))
+ nmtype = vlan_name_type_str[vn->name_type];
+
+ seq_printf(seq, "Name-Type: %s\n",
+ nmtype ? nmtype : "UNKNOWN");
+ } else {
+ const struct net_device *vlandev = v;
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+
+ seq_printf(seq, "%-15s| %d | %s\n", vlandev->name,
+ vlan->vlan_id, vlan->real_dev->name);
+ }
+ return 0;
+}
+
+static int vlandev_seq_show(struct seq_file *seq, void *offset)
+{
+ struct net_device *vlandev = (struct net_device *) seq->private;
+ const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats;
+ static const char fmt64[] = "%30s %12llu\n";
+ int i;
+
+ if (!is_vlan_dev(vlandev))
+ return 0;
+
+ stats = dev_get_stats(vlandev, &temp);
+ seq_printf(seq,
+ "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n",
+ vlandev->name, vlan->vlan_id,
+ (int)(vlan->flags & 1), vlandev->priv_flags);
+
+ seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
+ seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
+ seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast);
+ seq_puts(seq, "\n");
+ seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets);
+ seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes);
+ seq_printf(seq, "Device: %s", vlan->real_dev->name);
+ /* now show all PRIORITY mappings relating to this VLAN */
+ seq_printf(seq, "\nINGRESS priority mappings: "
+ "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n",
+ vlan->ingress_priority_map[0],
+ vlan->ingress_priority_map[1],
+ vlan->ingress_priority_map[2],
+ vlan->ingress_priority_map[3],
+ vlan->ingress_priority_map[4],
+ vlan->ingress_priority_map[5],
+ vlan->ingress_priority_map[6],
+ vlan->ingress_priority_map[7]);
+
+ seq_printf(seq, " EGRESS priority mappings: ");
+ for (i = 0; i < 16; i++) {
+ const struct vlan_priority_tci_mapping *mp
+ = vlan->egress_priority_map[i];
+ while (mp) {
+ seq_printf(seq, "%u:%hu ",
+ mp->priority, ((mp->vlan_qos >> 13) & 0x7));
+ mp = mp->next;
+ }
+ }
+ seq_puts(seq, "\n");
+
+ return 0;
+}
diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h
new file mode 100644
index 000000000..063f60a3d
--- /dev/null
+++ b/net/8021q/vlanproc.h
@@ -0,0 +1,20 @@
+#ifndef __BEN_VLAN_PROC_INC__
+#define __BEN_VLAN_PROC_INC__
+
+#ifdef CONFIG_PROC_FS
+struct net;
+
+int vlan_proc_init(struct net *net);
+int vlan_proc_rem_dev(struct net_device *vlandev);
+int vlan_proc_add_dev(struct net_device *vlandev);
+void vlan_proc_cleanup(struct net *net);
+
+#else /* No CONFIG_PROC_FS */
+
+#define vlan_proc_init(net) (0)
+#define vlan_proc_cleanup(net) do {} while (0)
+#define vlan_proc_add_dev(dev) ({(void)(dev), 0; })
+#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; })
+#endif
+
+#endif /* !(__BEN_VLAN_PROC_INC__) */
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
new file mode 100644
index 000000000..a75174a33
--- /dev/null
+++ b/net/9p/Kconfig
@@ -0,0 +1,36 @@
+#
+# 9P protocol configuration
+#
+
+menuconfig NET_9P
+ depends on NET
+ tristate "Plan 9 Resource Sharing Support (9P2000)"
+ help
+ If you say Y here, you will get experimental support for
+ Plan 9 resource sharing via the 9P2000 protocol.
+
+ See <http://v9fs.sf.net> for more information.
+
+ If unsure, say N.
+
+if NET_9P
+
+config NET_9P_VIRTIO
+ depends on VIRTIO
+ tristate "9P Virtio Transport"
+ help
+ This builds support for a transports between
+ guest partitions and a host partition.
+
+config NET_9P_RDMA
+ depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
+ tristate "9P RDMA Transport (Experimental)"
+ help
+ This builds support for an RDMA transport.
+
+config NET_9P_DEBUG
+ bool "Debug information"
+ help
+ Say Y if you want the 9P subsystem to log debug information.
+
+endif
diff --git a/net/9p/Makefile b/net/9p/Makefile
new file mode 100644
index 000000000..a0874cc1f
--- /dev/null
+++ b/net/9p/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
+obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
+
+9pnet-objs := \
+ mod.o \
+ client.o \
+ error.o \
+ util.o \
+ protocol.o \
+ trans_fd.o \
+ trans_common.o \
+
+9pnet_virtio-objs := \
+ trans_virtio.o \
+
+9pnet_rdma-objs := \
+ trans_rdma.o \
diff --git a/net/9p/client.c b/net/9p/client.c
new file mode 100644
index 000000000..81925b923
--- /dev/null
+++ b/net/9p/client.c
@@ -0,0 +1,2263 @@
+/*
+ * net/9p/clnt.c
+ *
+ * 9P Client
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include "protocol.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/9p.h>
+
+/*
+ * Client Option Parsing (code inspired by NFS code)
+ * - a little lazy - parse all client options
+ */
+
+enum {
+ Opt_msize,
+ Opt_trans,
+ Opt_legacy,
+ Opt_version,
+ Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_msize, "msize=%u"},
+ {Opt_legacy, "noextend"},
+ {Opt_trans, "trans=%s"},
+ {Opt_version, "version=%s"},
+ {Opt_err, NULL},
+};
+
+inline int p9_is_proto_dotl(struct p9_client *clnt)
+{
+ return clnt->proto_version == p9_proto_2000L;
+}
+EXPORT_SYMBOL(p9_is_proto_dotl);
+
+inline int p9_is_proto_dotu(struct p9_client *clnt)
+{
+ return clnt->proto_version == p9_proto_2000u;
+}
+EXPORT_SYMBOL(p9_is_proto_dotu);
+
+/*
+ * Some error codes are taken directly from the server replies,
+ * make sure they are valid.
+ */
+static int safe_errno(int err)
+{
+ if ((err > 0) || (err < -MAX_ERRNO)) {
+ p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err);
+ return -EPROTO;
+ }
+ return err;
+}
+
+
+/* Interpret mount option for protocol version */
+static int get_protocol_version(char *s)
+{
+ int version = -EINVAL;
+
+ if (!strcmp(s, "9p2000")) {
+ version = p9_proto_legacy;
+ p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n");
+ } else if (!strcmp(s, "9p2000.u")) {
+ version = p9_proto_2000u;
+ p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
+ } else if (!strcmp(s, "9p2000.L")) {
+ version = p9_proto_2000L;
+ p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
+ } else
+ pr_info("Unknown protocol version %s\n", s);
+
+ return version;
+}
+
+/**
+ * parse_options - parse mount options into client structure
+ * @opts: options string passed from mount
+ * @clnt: existing v9fs client information
+ *
+ * Return 0 upon success, -ERRNO upon failure
+ */
+
+static int parse_opts(char *opts, struct p9_client *clnt)
+{
+ char *options, *tmp_options;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ char *s;
+ int ret = 0;
+
+ clnt->proto_version = p9_proto_2000L;
+ clnt->msize = 8192;
+
+ if (!opts)
+ return 0;
+
+ tmp_options = kstrdup(opts, GFP_KERNEL);
+ if (!tmp_options) {
+ p9_debug(P9_DEBUG_ERROR,
+ "failed to allocate copy of option string\n");
+ return -ENOMEM;
+ }
+ options = tmp_options;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token, r;
+ if (!*p)
+ continue;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_msize:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ clnt->msize = option;
+ break;
+ case Opt_trans:
+ s = match_strdup(&args[0]);
+ if (!s) {
+ ret = -ENOMEM;
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of trans arg\n");
+ goto free_and_return;
+ }
+ clnt->trans_mod = v9fs_get_trans_by_name(s);
+ if (clnt->trans_mod == NULL) {
+ pr_info("Could not find request transport: %s\n",
+ s);
+ ret = -EINVAL;
+ kfree(s);
+ goto free_and_return;
+ }
+ kfree(s);
+ break;
+ case Opt_legacy:
+ clnt->proto_version = p9_proto_legacy;
+ break;
+ case Opt_version:
+ s = match_strdup(&args[0]);
+ if (!s) {
+ ret = -ENOMEM;
+ p9_debug(P9_DEBUG_ERROR,
+ "problem allocating copy of version arg\n");
+ goto free_and_return;
+ }
+ ret = get_protocol_version(s);
+ if (ret == -EINVAL) {
+ kfree(s);
+ goto free_and_return;
+ }
+ kfree(s);
+ clnt->proto_version = ret;
+ break;
+ default:
+ continue;
+ }
+ }
+
+free_and_return:
+ kfree(tmp_options);
+ return ret;
+}
+
+static struct p9_fcall *p9_fcall_alloc(int alloc_msize)
+{
+ struct p9_fcall *fc;
+ fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS);
+ if (!fc)
+ return NULL;
+ fc->capacity = alloc_msize;
+ fc->sdata = (char *) fc + sizeof(struct p9_fcall);
+ return fc;
+}
+
+/**
+ * p9_tag_alloc - lookup/allocate a request by tag
+ * @c: client session to lookup tag within
+ * @tag: numeric id for transaction
+ *
+ * this is a simple array lookup, but will grow the
+ * request_slots as necessary to accommodate transaction
+ * ids which did not previously have a slot.
+ *
+ * this code relies on the client spinlock to manage locks, its
+ * possible we should switch to something else, but I'd rather
+ * stick with something low-overhead for the common case.
+ *
+ */
+
+static struct p9_req_t *
+p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
+{
+ unsigned long flags;
+ int row, col;
+ struct p9_req_t *req;
+ int alloc_msize = min(c->msize, max_size);
+
+ /* This looks up the original request by tag so we know which
+ * buffer to read the data into */
+ tag++;
+
+ if (tag >= c->max_tag) {
+ spin_lock_irqsave(&c->lock, flags);
+ /* check again since original check was outside of lock */
+ while (tag >= c->max_tag) {
+ row = (tag / P9_ROW_MAXTAG);
+ c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
+ sizeof(struct p9_req_t), GFP_ATOMIC);
+
+ if (!c->reqs[row]) {
+ pr_err("Couldn't grow tag array\n");
+ spin_unlock_irqrestore(&c->lock, flags);
+ return ERR_PTR(-ENOMEM);
+ }
+ for (col = 0; col < P9_ROW_MAXTAG; col++) {
+ c->reqs[row][col].status = REQ_STATUS_IDLE;
+ c->reqs[row][col].tc = NULL;
+ }
+ c->max_tag += P9_ROW_MAXTAG;
+ }
+ spin_unlock_irqrestore(&c->lock, flags);
+ }
+ row = tag / P9_ROW_MAXTAG;
+ col = tag % P9_ROW_MAXTAG;
+
+ req = &c->reqs[row][col];
+ if (!req->wq) {
+ req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS);
+ if (!req->wq)
+ goto grow_failed;
+ init_waitqueue_head(req->wq);
+ }
+
+ if (!req->tc)
+ req->tc = p9_fcall_alloc(alloc_msize);
+ if (!req->rc)
+ req->rc = p9_fcall_alloc(alloc_msize);
+ if (!req->tc || !req->rc)
+ goto grow_failed;
+
+ p9pdu_reset(req->tc);
+ p9pdu_reset(req->rc);
+
+ req->tc->tag = tag-1;
+ req->status = REQ_STATUS_ALLOC;
+
+ return req;
+
+grow_failed:
+ pr_err("Couldn't grow tag array\n");
+ kfree(req->tc);
+ kfree(req->rc);
+ kfree(req->wq);
+ req->tc = req->rc = NULL;
+ req->wq = NULL;
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * p9_tag_lookup - lookup a request by tag
+ * @c: client session to lookup tag within
+ * @tag: numeric id for transaction
+ *
+ */
+
+struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
+{
+ int row, col;
+
+ /* This looks up the original request by tag so we know which
+ * buffer to read the data into */
+ tag++;
+
+ if(tag >= c->max_tag)
+ return NULL;
+
+ row = tag / P9_ROW_MAXTAG;
+ col = tag % P9_ROW_MAXTAG;
+
+ return &c->reqs[row][col];
+}
+EXPORT_SYMBOL(p9_tag_lookup);
+
+/**
+ * p9_tag_init - setup tags structure and contents
+ * @c: v9fs client struct
+ *
+ * This initializes the tags structure for each client instance.
+ *
+ */
+
+static int p9_tag_init(struct p9_client *c)
+{
+ int err = 0;
+
+ c->tagpool = p9_idpool_create();
+ if (IS_ERR(c->tagpool)) {
+ err = PTR_ERR(c->tagpool);
+ goto error;
+ }
+ err = p9_idpool_get(c->tagpool); /* reserve tag 0 */
+ if (err < 0) {
+ p9_idpool_destroy(c->tagpool);
+ goto error;
+ }
+ c->max_tag = 0;
+error:
+ return err;
+}
+
+/**
+ * p9_tag_cleanup - cleans up tags structure and reclaims resources
+ * @c: v9fs client struct
+ *
+ * This frees resources associated with the tags structure
+ *
+ */
+static void p9_tag_cleanup(struct p9_client *c)
+{
+ int row, col;
+
+ /* check to insure all requests are idle */
+ for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
+ for (col = 0; col < P9_ROW_MAXTAG; col++) {
+ if (c->reqs[row][col].status != REQ_STATUS_IDLE) {
+ p9_debug(P9_DEBUG_MUX,
+ "Attempting to cleanup non-free tag %d,%d\n",
+ row, col);
+ /* TODO: delay execution of cleanup */
+ return;
+ }
+ }
+ }
+
+ if (c->tagpool) {
+ p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
+ p9_idpool_destroy(c->tagpool);
+ }
+
+ /* free requests associated with tags */
+ for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
+ for (col = 0; col < P9_ROW_MAXTAG; col++) {
+ kfree(c->reqs[row][col].wq);
+ kfree(c->reqs[row][col].tc);
+ kfree(c->reqs[row][col].rc);
+ }
+ kfree(c->reqs[row]);
+ }
+ c->max_tag = 0;
+}
+
+/**
+ * p9_free_req - free a request and clean-up as necessary
+ * c: client state
+ * r: request to release
+ *
+ */
+
+static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
+{
+ int tag = r->tc->tag;
+ p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
+
+ r->status = REQ_STATUS_IDLE;
+ if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
+ p9_idpool_put(tag, c->tagpool);
+}
+
+/**
+ * p9_client_cb - call back from transport to client
+ * c: client state
+ * req: request received
+ *
+ */
+void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
+{
+ p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag);
+
+ /*
+ * This barrier is needed to make sure any change made to req before
+ * the other thread wakes up will indeed be seen by the waiting side.
+ */
+ smp_wmb();
+ req->status = status;
+
+ wake_up(req->wq);
+ p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
+}
+EXPORT_SYMBOL(p9_client_cb);
+
+/**
+ * p9_parse_header - parse header arguments out of a packet
+ * @pdu: packet to parse
+ * @size: size of packet
+ * @type: type of request
+ * @tag: tag of packet
+ * @rewind: set if we need to rewind offset afterwards
+ */
+
+int
+p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
+ int rewind)
+{
+ int8_t r_type;
+ int16_t r_tag;
+ int32_t r_size;
+ int offset = pdu->offset;
+ int err;
+
+ pdu->offset = 0;
+ if (pdu->size == 0)
+ pdu->size = 7;
+
+ err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
+ if (err)
+ goto rewind_and_exit;
+
+ pdu->size = r_size;
+ pdu->id = r_type;
+ pdu->tag = r_tag;
+
+ p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
+ pdu->size, pdu->id, pdu->tag);
+
+ if (type)
+ *type = r_type;
+ if (tag)
+ *tag = r_tag;
+ if (size)
+ *size = r_size;
+
+
+rewind_and_exit:
+ if (rewind)
+ pdu->offset = offset;
+ return err;
+}
+EXPORT_SYMBOL(p9_parse_header);
+
+/**
+ * p9_check_errors - check 9p packet for error return and process it
+ * @c: current client instance
+ * @req: request to parse and check for error conditions
+ *
+ * returns error code if one is discovered, otherwise returns 0
+ *
+ * this will have to be more complicated if we have multiple
+ * error packet types
+ */
+
+static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
+{
+ int8_t type;
+ int err;
+ int ecode;
+
+ err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
+ /*
+ * dump the response from server
+ * This should be after check errors which poplulate pdu_fcall.
+ */
+ trace_9p_protocol_dump(c, req->rc);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
+ return err;
+ }
+ if (type != P9_RERROR && type != P9_RLERROR)
+ return 0;
+
+ if (!p9_is_proto_dotl(c)) {
+ char *ename;
+ err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+ &ename, &ecode);
+ if (err)
+ goto out_err;
+
+ if (p9_is_proto_dotu(c))
+ err = -ecode;
+
+ if (!err || !IS_ERR_VALUE(err)) {
+ err = p9_errstr2errno(ename, strlen(ename));
+
+ p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
+ -ecode, ename);
+ }
+ kfree(ename);
+ } else {
+ err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
+ err = -ecode;
+
+ p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+ }
+
+ return err;
+
+out_err:
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
+ return err;
+}
+
+/**
+ * p9_check_zc_errors - check 9p packet for error return and process it
+ * @c: current client instance
+ * @req: request to parse and check for error conditions
+ * @in_hdrlen: Size of response protocol buffer.
+ *
+ * returns error code if one is discovered, otherwise returns 0
+ *
+ * this will have to be more complicated if we have multiple
+ * error packet types
+ */
+
+static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
+ struct iov_iter *uidata, int in_hdrlen)
+{
+ int err;
+ int ecode;
+ int8_t type;
+ char *ename = NULL;
+
+ err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
+ /*
+ * dump the response from server
+ * This should be after parse_header which poplulate pdu_fcall.
+ */
+ trace_9p_protocol_dump(c, req->rc);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
+ return err;
+ }
+
+ if (type != P9_RERROR && type != P9_RLERROR)
+ return 0;
+
+ if (!p9_is_proto_dotl(c)) {
+ /* Error is reported in string format */
+ int len;
+ /* 7 = header size for RERROR; */
+ int inline_len = in_hdrlen - 7;
+
+ len = req->rc->size - req->rc->offset;
+ if (len > (P9_ZC_HDR_SZ - 7)) {
+ err = -EFAULT;
+ goto out_err;
+ }
+
+ ename = &req->rc->sdata[req->rc->offset];
+ if (len > inline_len) {
+ /* We have error in external buffer */
+ err = copy_from_iter(ename + inline_len,
+ len - inline_len, uidata);
+ if (err != len - inline_len) {
+ err = -EFAULT;
+ goto out_err;
+ }
+ }
+ ename = NULL;
+ err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+ &ename, &ecode);
+ if (err)
+ goto out_err;
+
+ if (p9_is_proto_dotu(c))
+ err = -ecode;
+
+ if (!err || !IS_ERR_VALUE(err)) {
+ err = p9_errstr2errno(ename, strlen(ename));
+
+ p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
+ -ecode, ename);
+ }
+ kfree(ename);
+ } else {
+ err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
+ err = -ecode;
+
+ p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+ }
+ return err;
+
+out_err:
+ p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+ return err;
+}
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
+
+/**
+ * p9_client_flush - flush (cancel) a request
+ * @c: client state
+ * @oldreq: request to cancel
+ *
+ * This sents a flush for a particular request and links
+ * the flush request to the original request. The current
+ * code only supports a single flush request although the protocol
+ * allows for multiple flush requests to be sent for a single request.
+ *
+ */
+
+static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
+{
+ struct p9_req_t *req;
+ int16_t oldtag;
+ int err;
+
+ err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1);
+ if (err)
+ return err;
+
+ p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag);
+
+ req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /*
+ * if we haven't received a response for oldreq,
+ * remove it from the list
+ */
+ if (oldreq->status == REQ_STATUS_SENT)
+ if (c->trans_mod->cancelled)
+ c->trans_mod->cancelled(c, oldreq);
+
+ p9_free_req(c, req);
+ return 0;
+}
+
+static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
+ int8_t type, int req_size,
+ const char *fmt, va_list ap)
+{
+ int tag, err;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+ /* we allow for any status other than disconnected */
+ if (c->status == Disconnected)
+ return ERR_PTR(-EIO);
+
+ /* if status is begin_disconnected we allow only clunk request */
+ if ((c->status == BeginDisconnect) && (type != P9_TCLUNK))
+ return ERR_PTR(-EIO);
+
+ tag = P9_NOTAG;
+ if (type != P9_TVERSION) {
+ tag = p9_idpool_get(c->tagpool);
+ if (tag < 0)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ req = p9_tag_alloc(c, tag, req_size);
+ if (IS_ERR(req))
+ return req;
+
+ /* marshall the data */
+ p9pdu_prepare(req->tc, tag, type);
+ err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
+ if (err)
+ goto reterr;
+ p9pdu_finalize(c, req->tc);
+ trace_9p_client_req(c, type, tag);
+ return req;
+reterr:
+ p9_free_req(c, req);
+ return ERR_PTR(err);
+}
+
+/**
+ * p9_client_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_free_req)
+ */
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+ va_list ap;
+ int sigpending, err;
+ unsigned long flags;
+ struct p9_req_t *req;
+
+ va_start(ap, fmt);
+ req = p9_client_prepare_req(c, type, c->msize, fmt, ap);
+ va_end(ap);
+ if (IS_ERR(req))
+ return req;
+
+ if (signal_pending(current)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ } else
+ sigpending = 0;
+
+ err = c->trans_mod->request(c, req);
+ if (err < 0) {
+ if (err != -ERESTARTSYS && err != -EFAULT)
+ c->status = Disconnected;
+ goto reterr;
+ }
+again:
+ /* Wait for the response */
+ err = wait_event_interruptible(*req->wq,
+ req->status >= REQ_STATUS_RCVD);
+
+ /*
+ * Make sure our req is coherent with regard to updates in other
+ * threads - echoes to wmb() in the callback
+ */
+ smp_rmb();
+
+ if ((err == -ERESTARTSYS) && (c->status == Connected)
+ && (type == P9_TFLUSH)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ goto again;
+ }
+
+ if (req->status == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ }
+ if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+ p9_debug(P9_DEBUG_MUX, "flushing\n");
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+
+ if (c->trans_mod->cancel(c, req))
+ p9_client_flush(c, req);
+
+ /* if we received the response anyway, don't signal error */
+ if (req->status == REQ_STATUS_RCVD)
+ err = 0;
+ }
+ if (sigpending) {
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+ if (err < 0)
+ goto reterr;
+
+ err = p9_check_errors(c, req);
+ trace_9p_client_res(c, type, req->rc->tag, err);
+ if (!err)
+ return req;
+reterr:
+ p9_free_req(c, req);
+ return ERR_PTR(safe_errno(err));
+}
+
+/**
+ * p9_client_zc_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @uidata: destination for zero copy read
+ * @uodata: source for zero copy write
+ * @inlen: read buffer size
+ * @olen: write buffer size
+ * @hdrlen: reader header size, This is the size of response protocol data
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_free_req)
+ */
+static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
+ struct iov_iter *uidata,
+ struct iov_iter *uodata,
+ int inlen, int olen, int in_hdrlen,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int sigpending, err;
+ unsigned long flags;
+ struct p9_req_t *req;
+
+ va_start(ap, fmt);
+ /*
+ * We allocate a inline protocol data of only 4k bytes.
+ * The actual content is passed in zero-copy fashion.
+ */
+ req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap);
+ va_end(ap);
+ if (IS_ERR(req))
+ return req;
+
+ if (signal_pending(current)) {
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+ } else
+ sigpending = 0;
+
+ err = c->trans_mod->zc_request(c, req, uidata, uodata,
+ inlen, olen, in_hdrlen);
+ if (err < 0) {
+ if (err == -EIO)
+ c->status = Disconnected;
+ if (err != -ERESTARTSYS)
+ goto reterr;
+ }
+ if (req->status == REQ_STATUS_ERROR) {
+ p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+ err = req->t_err;
+ }
+ if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+ p9_debug(P9_DEBUG_MUX, "flushing\n");
+ sigpending = 1;
+ clear_thread_flag(TIF_SIGPENDING);
+
+ if (c->trans_mod->cancel(c, req))
+ p9_client_flush(c, req);
+
+ /* if we received the response anyway, don't signal error */
+ if (req->status == REQ_STATUS_RCVD)
+ err = 0;
+ }
+ if (sigpending) {
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ }
+ if (err < 0)
+ goto reterr;
+
+ err = p9_check_zc_errors(c, req, uidata, in_hdrlen);
+ trace_9p_client_res(c, type, req->rc->tag, err);
+ if (!err)
+ return req;
+reterr:
+ p9_free_req(c, req);
+ return ERR_PTR(safe_errno(err));
+}
+
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+ int ret;
+ struct p9_fid *fid;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
+ fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
+ if (!fid)
+ return ERR_PTR(-ENOMEM);
+
+ ret = p9_idpool_get(clnt->fidpool);
+ if (ret < 0) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ fid->fid = ret;
+
+ memset(&fid->qid, 0, sizeof(struct p9_qid));
+ fid->mode = -1;
+ fid->uid = current_fsuid();
+ fid->clnt = clnt;
+ fid->rdir = NULL;
+ spin_lock_irqsave(&clnt->lock, flags);
+ list_add(&fid->flist, &clnt->fidlist);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+
+ return fid;
+
+error:
+ kfree(fid);
+ return ERR_PTR(ret);
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+ struct p9_client *clnt;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+ clnt = fid->clnt;
+ p9_idpool_put(fid->fid, clnt->fidpool);
+ spin_lock_irqsave(&clnt->lock, flags);
+ list_del(&fid->flist);
+ spin_unlock_irqrestore(&clnt->lock, flags);
+ kfree(fid->rdir);
+ kfree(fid);
+}
+
+static int p9_client_version(struct p9_client *c)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ char *version;
+ int msize;
+
+ p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+ c->msize, c->proto_version);
+
+ switch (c->proto_version) {
+ case p9_proto_2000L:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000.L");
+ break;
+ case p9_proto_2000u:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000.u");
+ break;
+ case p9_proto_legacy:
+ req = p9_client_rpc(c, P9_TVERSION, "ds",
+ c->msize, "9P2000");
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version);
+ if (err) {
+ p9_debug(P9_DEBUG_9P, "version error %d\n", err);
+ trace_9p_protocol_dump(c, req->rc);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
+ if (!strncmp(version, "9P2000.L", 8))
+ c->proto_version = p9_proto_2000L;
+ else if (!strncmp(version, "9P2000.u", 8))
+ c->proto_version = p9_proto_2000u;
+ else if (!strncmp(version, "9P2000", 6))
+ c->proto_version = p9_proto_legacy;
+ else {
+ err = -EREMOTEIO;
+ goto error;
+ }
+
+ if (msize < c->msize)
+ c->msize = msize;
+
+error:
+ kfree(version);
+ p9_free_req(c, req);
+
+ return err;
+}
+
+struct p9_client *p9_client_create(const char *dev_name, char *options)
+{
+ int err;
+ struct p9_client *clnt;
+ char *client_id;
+
+ err = 0;
+ clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL);
+ if (!clnt)
+ return ERR_PTR(-ENOMEM);
+
+ clnt->trans_mod = NULL;
+ clnt->trans = NULL;
+
+ client_id = utsname()->nodename;
+ memcpy(clnt->name, client_id, strlen(client_id) + 1);
+
+ spin_lock_init(&clnt->lock);
+ INIT_LIST_HEAD(&clnt->fidlist);
+
+ err = p9_tag_init(clnt);
+ if (err < 0)
+ goto free_client;
+
+ err = parse_opts(options, clnt);
+ if (err < 0)
+ goto destroy_tagpool;
+
+ if (!clnt->trans_mod)
+ clnt->trans_mod = v9fs_get_default_trans();
+
+ if (clnt->trans_mod == NULL) {
+ err = -EPROTONOSUPPORT;
+ p9_debug(P9_DEBUG_ERROR,
+ "No transport defined or default transport\n");
+ goto destroy_tagpool;
+ }
+
+ clnt->fidpool = p9_idpool_create();
+ if (IS_ERR(clnt->fidpool)) {
+ err = PTR_ERR(clnt->fidpool);
+ goto put_trans;
+ }
+
+ p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
+ clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
+
+ err = clnt->trans_mod->create(clnt, dev_name, options);
+ if (err)
+ goto destroy_fidpool;
+
+ if (clnt->msize > clnt->trans_mod->maxsize)
+ clnt->msize = clnt->trans_mod->maxsize;
+
+ err = p9_client_version(clnt);
+ if (err)
+ goto close_trans;
+
+ return clnt;
+
+close_trans:
+ clnt->trans_mod->close(clnt);
+destroy_fidpool:
+ p9_idpool_destroy(clnt->fidpool);
+put_trans:
+ v9fs_put_trans(clnt->trans_mod);
+destroy_tagpool:
+ p9_idpool_destroy(clnt->tagpool);
+free_client:
+ kfree(clnt);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_create);
+
+void p9_client_destroy(struct p9_client *clnt)
+{
+ struct p9_fid *fid, *fidptr;
+
+ p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt);
+
+ if (clnt->trans_mod)
+ clnt->trans_mod->close(clnt);
+
+ v9fs_put_trans(clnt->trans_mod);
+
+ list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) {
+ pr_info("Found fid %d not clunked\n", fid->fid);
+ p9_fid_destroy(fid);
+ }
+
+ if (clnt->fidpool)
+ p9_idpool_destroy(clnt->fidpool);
+
+ p9_tag_cleanup(clnt);
+
+ kfree(clnt);
+}
+EXPORT_SYMBOL(p9_client_destroy);
+
+void p9_client_disconnect(struct p9_client *clnt)
+{
+ p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+ clnt->status = Disconnected;
+}
+EXPORT_SYMBOL(p9_client_disconnect);
+
+void p9_client_begin_disconnect(struct p9_client *clnt)
+{
+ p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt);
+ clnt->status = BeginDisconnect;
+}
+EXPORT_SYMBOL(p9_client_begin_disconnect);
+
+struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
+ char *uname, kuid_t n_uname, char *aname)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_fid *fid;
+ struct p9_qid qid;
+
+
+ p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
+ afid ? afid->fid : -1, uname, aname);
+ fid = p9_fid_create(clnt);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ fid = NULL;
+ goto error;
+ }
+ fid->uid = n_uname;
+
+ req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid,
+ afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
+ qid.type, (unsigned long long)qid.path, qid.version);
+
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+
+ p9_free_req(clnt, req);
+ return fid;
+
+error:
+ if (fid)
+ p9_fid_destroy(fid);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_attach);
+
+struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
+ char **wnames, int clone)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_fid *fid;
+ struct p9_qid *wqids;
+ struct p9_req_t *req;
+ uint16_t nwqids, count;
+
+ err = 0;
+ wqids = NULL;
+ clnt = oldfid->clnt;
+ if (clone) {
+ fid = p9_fid_create(clnt);
+ if (IS_ERR(fid)) {
+ err = PTR_ERR(fid);
+ fid = NULL;
+ goto error;
+ }
+
+ fid->uid = oldfid->uid;
+ } else
+ fid = oldfid;
+
+
+ p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n",
+ oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
+
+ req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
+ nwname, wnames);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto clunk_fid;
+ }
+ p9_free_req(clnt, req);
+
+ p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids);
+
+ if (nwqids != nwname) {
+ err = -ENOENT;
+ goto clunk_fid;
+ }
+
+ for (count = 0; count < nwqids; count++)
+ p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n",
+ count, wqids[count].type,
+ (unsigned long long)wqids[count].path,
+ wqids[count].version);
+
+ if (nwname)
+ memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
+ else
+ fid->qid = oldfid->qid;
+
+ kfree(wqids);
+ return fid;
+
+clunk_fid:
+ kfree(wqids);
+ p9_client_clunk(fid);
+ fid = NULL;
+
+error:
+ if (fid && (fid != oldfid))
+ p9_fid_destroy(fid);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_walk);
+
+int p9_client_open(struct p9_fid *fid, int mode)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ struct p9_qid qid;
+ int iounit;
+
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
+ p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
+ err = 0;
+
+ if (fid->mode != -1)
+ return -EINVAL;
+
+ if (p9_is_proto_dotl(clnt))
+ req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode);
+ else
+ req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
+ p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type,
+ (unsigned long long)qid.path, qid.version, iounit);
+
+ fid->mode = mode;
+ fid->iounit = iounit;
+
+free_and_error:
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_open);
+
+int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+ kgid_t gid, struct p9_qid *qid)
+{
+ int err = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ int iounit;
+
+ p9_debug(P9_DEBUG_9P,
+ ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+ ofid->fid, name, flags, mode,
+ from_kgid(&init_user_ns, gid));
+ clnt = ofid->clnt;
+
+ if (ofid->mode != -1)
+ return -EINVAL;
+
+ req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags,
+ mode, gid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
+ qid->type,
+ (unsigned long long)qid->path,
+ qid->version, iounit);
+
+ ofid->mode = mode;
+ ofid->iounit = iounit;
+
+free_and_error:
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_create_dotl);
+
+int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
+ char *extension)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ struct p9_qid qid;
+ int iounit;
+
+ p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n",
+ fid->fid, name, perm, mode);
+ err = 0;
+ clnt = fid->clnt;
+
+ if (fid->mode != -1)
+ return -EINVAL;
+
+ req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
+ mode, extension);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n",
+ qid.type,
+ (unsigned long long)qid.path,
+ qid.version, iounit);
+
+ fid->mode = mode;
+ fid->iounit = iounit;
+
+free_and_error:
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_fcreate);
+
+int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, kgid_t gid,
+ struct p9_qid *qid)
+{
+ int err = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n",
+ dfid->fid, name, symtgt);
+ clnt = dfid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt,
+ gid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
+ qid->type, (unsigned long long)qid->path, qid->version);
+
+free_and_error:
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_symlink);
+
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+{
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
+ dfid->fid, oldfid->fid, newname);
+ clnt = dfid->clnt;
+ req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
+ newname);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ p9_debug(P9_DEBUG_9P, "<<< RLINK\n");
+ p9_free_req(clnt, req);
+ return 0;
+}
+EXPORT_SYMBOL(p9_client_link);
+
+int p9_client_fsync(struct p9_fid *fid, int datasync)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+ fid->fid, datasync);
+ err = 0;
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+ p9_free_req(clnt, req);
+
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
+int p9_client_clunk(struct p9_fid *fid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ int retries = 0;
+
+ if (!fid) {
+ pr_warn("%s (%d): Trying to clunk with NULL fid\n",
+ __func__, task_pid_nr(current));
+ dump_stack();
+ return 0;
+ }
+
+again:
+ p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid,
+ retries);
+ err = 0;
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid);
+
+ p9_free_req(clnt, req);
+error:
+ /*
+ * Fid is not valid even after a failed clunk
+ * If interrupted, retry once then give up and
+ * leak fid until umount.
+ */
+ if (err == -ERESTARTSYS) {
+ if (retries++ == 0)
+ goto again;
+ } else
+ p9_fid_destroy(fid);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_clunk);
+
+int p9_client_remove(struct p9_fid *fid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid);
+ err = 0;
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
+
+ p9_free_req(clnt, req);
+error:
+ if (err == -ERESTARTSYS)
+ p9_client_clunk(fid);
+ else
+ p9_fid_destroy(fid);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_remove);
+
+int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
+{
+ int err = 0;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n",
+ dfid->fid, name, flags);
+
+ clnt = dfid->clnt;
+ req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name);
+
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_unlinkat);
+
+int
+p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int total = 0;
+
+ p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
+ fid->fid, (unsigned long long) offset, (int)iov_iter_count(to));
+
+ while (iov_iter_count(to)) {
+ int count = iov_iter_count(to);
+ int rsize, non_zc = 0;
+ char *dataptr;
+
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /*
+ * response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
+ 0, 11, "dqd", fid->fid,
+ offset, rsize);
+ } else {
+ non_zc = 1;
+ req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+ rsize);
+ }
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ break;
+ }
+
+ *err = p9pdu_readf(req->rc, clnt->proto_version,
+ "D", &count, &dataptr);
+ if (*err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ break;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+ if (!count) {
+ p9_free_req(clnt, req);
+ break;
+ }
+
+ if (non_zc) {
+ int n = copy_to_iter(dataptr, count, to);
+ total += n;
+ offset += n;
+ if (n != count) {
+ *err = -EFAULT;
+ p9_free_req(clnt, req);
+ break;
+ }
+ } else {
+ iov_iter_advance(to, count);
+ total += count;
+ offset += count;
+ }
+ p9_free_req(clnt, req);
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_read);
+
+int
+p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int total = 0;
+
+ p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n",
+ fid->fid, (unsigned long long) offset,
+ iov_iter_count(from));
+
+ while (iov_iter_count(from)) {
+ int count = iov_iter_count(from);
+ int rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
+ rsize, P9_ZC_HDR_SZ, "dqd",
+ fid->fid, offset, rsize);
+ } else {
+ req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+ offset, rsize, from);
+ }
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ break;
+ }
+
+ *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
+ if (*err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ break;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+
+ p9_free_req(clnt, req);
+ iov_iter_advance(from, count);
+ total += count;
+ offset += count;
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_write);
+
+struct p9_wstat *p9_client_stat(struct p9_fid *fid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL);
+ struct p9_req_t *req;
+ u16 ignored;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid);
+
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ err = 0;
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P,
+ "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ "<<< uid=%d gid=%d n_muid=%d\n",
+ ret->size, ret->type, ret->dev, ret->qid.type,
+ (unsigned long long)ret->qid.path, ret->qid.version, ret->mode,
+ ret->atime, ret->mtime, (unsigned long long)ret->length,
+ ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
+ from_kuid(&init_user_ns, ret->n_uid),
+ from_kgid(&init_user_ns, ret->n_gid),
+ from_kuid(&init_user_ns, ret->n_muid));
+
+ p9_free_req(clnt, req);
+ return ret;
+
+error:
+ kfree(ret);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_stat);
+
+struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
+ u64 request_mask)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl),
+ GFP_KERNEL);
+ struct p9_req_t *req;
+
+ p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
+ fid->fid, request_mask);
+
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ err = 0;
+ clnt = fid->clnt;
+
+ req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P,
+ "<<< RGETATTR st_result_mask=%lld\n"
+ "<<< qid=%x.%llx.%x\n"
+ "<<< st_mode=%8.8x st_nlink=%llu\n"
+ "<<< st_uid=%d st_gid=%d\n"
+ "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+ "<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+ "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+ "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+ "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+ "<<< st_gen=%lld st_data_version=%lld",
+ ret->st_result_mask, ret->qid.type, ret->qid.path,
+ ret->qid.version, ret->st_mode, ret->st_nlink,
+ from_kuid(&init_user_ns, ret->st_uid),
+ from_kgid(&init_user_ns, ret->st_gid),
+ ret->st_rdev, ret->st_size, ret->st_blksize,
+ ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec,
+ ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec,
+ ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec,
+ ret->st_gen, ret->st_data_version);
+
+ p9_free_req(clnt, req);
+ return ret;
+
+error:
+ kfree(ret);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_getattr_dotl);
+
+static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
+{
+ int ret;
+
+ /* NOTE: size shouldn't include its own length */
+ /* size[2] type[2] dev[4] qid[13] */
+ /* mode[4] atime[4] mtime[4] length[8]*/
+ /* name[s] uid[s] gid[s] muid[s] */
+ ret = 2+4+13+4+4+4+8+2+2+2+2;
+
+ if (wst->name)
+ ret += strlen(wst->name);
+ if (wst->uid)
+ ret += strlen(wst->uid);
+ if (wst->gid)
+ ret += strlen(wst->gid);
+ if (wst->muid)
+ ret += strlen(wst->muid);
+
+ if ((proto_version == p9_proto_2000u) ||
+ (proto_version == p9_proto_2000L)) {
+ ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+ if (wst->extension)
+ ret += strlen(wst->extension);
+ }
+
+ return ret;
+}
+
+int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ err = 0;
+ clnt = fid->clnt;
+ wst->size = p9_client_statsize(wst, clnt->proto_version);
+ p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid);
+ p9_debug(P9_DEBUG_9P,
+ " sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ " name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ " uid=%d gid=%d n_muid=%d\n",
+ wst->size, wst->type, wst->dev, wst->qid.type,
+ (unsigned long long)wst->qid.path, wst->qid.version, wst->mode,
+ wst->atime, wst->mtime, (unsigned long long)wst->length,
+ wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
+ from_kuid(&init_user_ns, wst->n_uid),
+ from_kgid(&init_user_ns, wst->n_gid),
+ from_kuid(&init_user_ns, wst->n_muid));
+
+ req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid);
+
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_wstat);
+
+int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
+ p9_debug(P9_DEBUG_9P,
+ " valid=%x mode=%x uid=%d gid=%d size=%lld\n"
+ " atime_sec=%lld atime_nsec=%lld\n"
+ " mtime_sec=%lld mtime_nsec=%lld\n",
+ p9attr->valid, p9attr->mode,
+ from_kuid(&init_user_ns, p9attr->uid),
+ from_kgid(&init_user_ns, p9attr->gid),
+ p9attr->size, p9attr->atime_sec, p9attr->atime_nsec,
+ p9attr->mtime_sec, p9attr->mtime_nsec);
+
+ req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
+
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_setattr);
+
+int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ err = 0;
+ clnt = fid->clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid);
+
+ req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
+ &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
+ &sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld "
+ "blocks %llu bfree %llu bavail %llu files %llu ffree %llu "
+ "fsid %llu namelen %ld\n",
+ fid->fid, (long unsigned int)sb->type, (long int)sb->bsize,
+ sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree,
+ sb->fsid, (long int)sb->namelen);
+
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_statfs);
+
+int p9_client_rename(struct p9_fid *fid,
+ struct p9_fid *newdirfid, const char *name)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ err = 0;
+ clnt = fid->clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n",
+ fid->fid, newdirfid->fid, name);
+
+ req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid,
+ newdirfid->fid, name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid);
+
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_rename);
+
+int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
+ struct p9_fid *newdirfid, const char *new_name)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ err = 0;
+ clnt = olddirfid->clnt;
+
+ p9_debug(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s"
+ " newdirfid %d new name %s\n", olddirfid->fid, old_name,
+ newdirfid->fid, new_name);
+
+ req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid,
+ old_name, newdirfid->fid, new_name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n",
+ newdirfid->fid, new_name);
+
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_renameat);
+
+/*
+ * An xattrwalk without @attr_name gives the fid for the lisxattr namespace
+ */
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+ const char *attr_name, u64 *attr_size)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+ struct p9_fid *attr_fid;
+
+ err = 0;
+ clnt = file_fid->clnt;
+ attr_fid = p9_fid_create(clnt);
+ if (IS_ERR(attr_fid)) {
+ err = PTR_ERR(attr_fid);
+ attr_fid = NULL;
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P,
+ ">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n",
+ file_fid->fid, attr_fid->fid, attr_name);
+
+ req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
+ file_fid->fid, attr_fid->fid, attr_name);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ p9_free_req(clnt, req);
+ goto clunk_fid;
+ }
+ p9_free_req(clnt, req);
+ p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n",
+ attr_fid->fid, *attr_size);
+ return attr_fid;
+clunk_fid:
+ p9_client_clunk(attr_fid);
+ attr_fid = NULL;
+error:
+ if (attr_fid && (attr_fid != file_fid))
+ p9_fid_destroy(attr_fid);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
+
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+ u64 attr_size, int flags)
+{
+ int err;
+ struct p9_req_t *req;
+ struct p9_client *clnt;
+
+ p9_debug(P9_DEBUG_9P,
+ ">>> TXATTRCREATE fid %d name %s size %lld flag %d\n",
+ fid->fid, name, (long long)attr_size, flags);
+ err = 0;
+ clnt = fid->clnt;
+ req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
+ fid->fid, name, attr_size, flags);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
+
+int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
+{
+ int err, rsize, non_zc = 0;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+ char *dataptr;
+ struct kvec kv = {.iov_base = data, .iov_len = count};
+ struct iov_iter to;
+
+ iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count);
+
+ p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
+ fid->fid, (unsigned long long) offset, count);
+
+ err = 0;
+ clnt = fid->clnt;
+
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ)
+ rsize = clnt->msize - P9_READDIRHDRSZ;
+
+ if (count < rsize)
+ rsize = count;
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /*
+ * response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0,
+ 11, "dqd", fid->fid, offset, rsize);
+ } else {
+ non_zc = 1;
+ req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid,
+ offset, rsize);
+ }
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto error;
+ }
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto free_and_error;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+
+ if (non_zc)
+ memmove(data, dataptr, count);
+
+ p9_free_req(clnt, req);
+ return count;
+
+free_and_error:
+ p9_free_req(clnt, req);
+error:
+ return err;
+}
+EXPORT_SYMBOL(p9_client_readdir);
+
+int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+ dev_t rdev, kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d "
+ "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+ req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode,
+ MAJOR(rdev), MINOR(rdev), gid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type,
+ (unsigned long long)qid->path, qid->version);
+
+error:
+ p9_free_req(clnt, req);
+ return err;
+
+}
+EXPORT_SYMBOL(p9_client_mknod_dotl);
+
+int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+ kgid_t gid, struct p9_qid *qid)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
+ fid->fid, name, mode, from_kgid(&init_user_ns, gid));
+ req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", fid->fid, name, mode,
+ gid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
+ (unsigned long long)qid->path, qid->version);
+
+error:
+ p9_free_req(clnt, req);
+ return err;
+
+}
+EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
+ "start %lld length %lld proc_id %d client_id %s\n",
+ fid->fid, flock->type, flock->flags, flock->start,
+ flock->length, flock->proc_id, flock->client_id);
+
+ req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+ flock->flags, flock->start, flock->length,
+ flock->proc_id, flock->client_id);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+error:
+ p9_free_req(clnt, req);
+ return err;
+
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
+
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
+ "length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
+ glock->start, glock->length, glock->proc_id, glock->client_id);
+
+ req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type,
+ glock->start, glock->length, glock->proc_id, glock->client_id);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
+ &glock->start, &glock->length, &glock->proc_id,
+ &glock->client_id);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
+ "proc_id %d client_id %s\n", glock->type, glock->start,
+ glock->length, glock->proc_id, glock->client_id);
+error:
+ p9_free_req(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+ int err;
+ struct p9_client *clnt;
+ struct p9_req_t *req;
+
+ err = 0;
+ clnt = fid->clnt;
+ p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+ req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
+ if (err) {
+ trace_9p_protocol_dump(clnt, req->rc);
+ goto error;
+ }
+ p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+error:
+ p9_free_req(clnt, req);
+ return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
diff --git a/net/9p/error.c b/net/9p/error.c
new file mode 100644
index 000000000..126fd0dce
--- /dev/null
+++ b/net/9p/error.c
@@ -0,0 +1,247 @@
+/*
+ * linux/fs/9p/error.c
+ *
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers. These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/errno.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct errormap - map string errors from Plan 9 to Linux numeric ids
+ * @name: string sent over 9P
+ * @val: numeric id most closely representing @name
+ * @namelen: length of string
+ * @list: hash-table list for string lookup
+ */
+struct errormap {
+ char *name;
+ int val;
+
+ int namelen;
+ struct hlist_node list;
+};
+
+#define ERRHASHSZ 32
+static struct hlist_head hash_errmap[ERRHASHSZ];
+
+/* FixMe - reduce to a reasonable size */
+static struct errormap errmap[] = {
+ {"Operation not permitted", EPERM},
+ {"wstat prohibited", EPERM},
+ {"No such file or directory", ENOENT},
+ {"directory entry not found", ENOENT},
+ {"file not found", ENOENT},
+ {"Interrupted system call", EINTR},
+ {"Input/output error", EIO},
+ {"No such device or address", ENXIO},
+ {"Argument list too long", E2BIG},
+ {"Bad file descriptor", EBADF},
+ {"Resource temporarily unavailable", EAGAIN},
+ {"Cannot allocate memory", ENOMEM},
+ {"Permission denied", EACCES},
+ {"Bad address", EFAULT},
+ {"Block device required", ENOTBLK},
+ {"Device or resource busy", EBUSY},
+ {"File exists", EEXIST},
+ {"Invalid cross-device link", EXDEV},
+ {"No such device", ENODEV},
+ {"Not a directory", ENOTDIR},
+ {"Is a directory", EISDIR},
+ {"Invalid argument", EINVAL},
+ {"Too many open files in system", ENFILE},
+ {"Too many open files", EMFILE},
+ {"Text file busy", ETXTBSY},
+ {"File too large", EFBIG},
+ {"No space left on device", ENOSPC},
+ {"Illegal seek", ESPIPE},
+ {"Read-only file system", EROFS},
+ {"Too many links", EMLINK},
+ {"Broken pipe", EPIPE},
+ {"Numerical argument out of domain", EDOM},
+ {"Numerical result out of range", ERANGE},
+ {"Resource deadlock avoided", EDEADLK},
+ {"File name too long", ENAMETOOLONG},
+ {"No locks available", ENOLCK},
+ {"Function not implemented", ENOSYS},
+ {"Directory not empty", ENOTEMPTY},
+ {"Too many levels of symbolic links", ELOOP},
+ {"No message of desired type", ENOMSG},
+ {"Identifier removed", EIDRM},
+ {"No data available", ENODATA},
+ {"Machine is not on the network", ENONET},
+ {"Package not installed", ENOPKG},
+ {"Object is remote", EREMOTE},
+ {"Link has been severed", ENOLINK},
+ {"Communication error on send", ECOMM},
+ {"Protocol error", EPROTO},
+ {"Bad message", EBADMSG},
+ {"File descriptor in bad state", EBADFD},
+ {"Streams pipe error", ESTRPIPE},
+ {"Too many users", EUSERS},
+ {"Socket operation on non-socket", ENOTSOCK},
+ {"Message too long", EMSGSIZE},
+ {"Protocol not available", ENOPROTOOPT},
+ {"Protocol not supported", EPROTONOSUPPORT},
+ {"Socket type not supported", ESOCKTNOSUPPORT},
+ {"Operation not supported", EOPNOTSUPP},
+ {"Protocol family not supported", EPFNOSUPPORT},
+ {"Network is down", ENETDOWN},
+ {"Network is unreachable", ENETUNREACH},
+ {"Network dropped connection on reset", ENETRESET},
+ {"Software caused connection abort", ECONNABORTED},
+ {"Connection reset by peer", ECONNRESET},
+ {"No buffer space available", ENOBUFS},
+ {"Transport endpoint is already connected", EISCONN},
+ {"Transport endpoint is not connected", ENOTCONN},
+ {"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+ {"Connection timed out", ETIMEDOUT},
+ {"Connection refused", ECONNREFUSED},
+ {"Host is down", EHOSTDOWN},
+ {"No route to host", EHOSTUNREACH},
+ {"Operation already in progress", EALREADY},
+ {"Operation now in progress", EINPROGRESS},
+ {"Is a named type file", EISNAM},
+ {"Remote I/O error", EREMOTEIO},
+ {"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+ {"fid unknown or out of range", EBADF},
+ {"permission denied", EACCES},
+ {"file does not exist", ENOENT},
+ {"authentication failed", ECONNREFUSED},
+ {"bad offset in directory read", ESPIPE},
+ {"bad use of fid", EBADF},
+ {"wstat can't convert between files and directories", EPERM},
+ {"directory is not empty", ENOTEMPTY},
+ {"file exists", EEXIST},
+ {"file already exists", EEXIST},
+ {"file or directory already exists", EEXIST},
+ {"fid already in use", EBADF},
+ {"file in use", ETXTBSY},
+ {"i/o error", EIO},
+ {"file already open for I/O", ETXTBSY},
+ {"illegal mode", EINVAL},
+ {"illegal name", ENAMETOOLONG},
+ {"not a directory", ENOTDIR},
+ {"not a member of proposed group", EPERM},
+ {"not owner", EACCES},
+ {"only owner can change group in wstat", EACCES},
+ {"read only file system", EROFS},
+ {"no access to special file", EPERM},
+ {"i/o count too large", EIO},
+ {"unknown group", EINVAL},
+ {"unknown user", EINVAL},
+ {"bogus wstat buffer", EPROTO},
+ {"exclusive use file already open", EAGAIN},
+ {"corrupted directory entry", EIO},
+ {"corrupted file entry", EIO},
+ {"corrupted block label", EIO},
+ {"corrupted meta data", EIO},
+ {"illegal offset", EINVAL},
+ {"illegal path element", ENOENT},
+ {"root of file system is corrupted", EIO},
+ {"corrupted super block", EIO},
+ {"protocol botch", EPROTO},
+ {"file system is full", ENOSPC},
+ {"file is in use", EAGAIN},
+ {"directory entry is not allocated", ENOENT},
+ {"file is read only", EROFS},
+ {"file has been removed", EIDRM},
+ {"only support truncation to zero length", EPERM},
+ {"cannot remove root", EPERM},
+ {"file too big", EFBIG},
+ {"venti i/o error", EIO},
+ /* these are not errors */
+ {"u9fs rhostsauth: no authentication required", 0},
+ {"u9fs authnone: no authentication required", 0},
+ {NULL, -1}
+};
+
+/**
+ * p9_error_init - preload mappings into hash list
+ *
+ */
+
+int p9_error_init(void)
+{
+ struct errormap *c;
+ int bucket;
+
+ /* initialize hash table */
+ for (bucket = 0; bucket < ERRHASHSZ; bucket++)
+ INIT_HLIST_HEAD(&hash_errmap[bucket]);
+
+ /* load initial error map into hash table */
+ for (c = errmap; c->name != NULL; c++) {
+ c->namelen = strlen(c->name);
+ bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ;
+ INIT_HLIST_NODE(&c->list);
+ hlist_add_head(&c->list, &hash_errmap[bucket]);
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(p9_error_init);
+
+/**
+ * errstr2errno - convert error string to error number
+ * @errstr: error string
+ * @len: length of error string
+ *
+ */
+
+int p9_errstr2errno(char *errstr, int len)
+{
+ int errno;
+ struct errormap *c;
+ int bucket;
+
+ errno = 0;
+ c = NULL;
+ bucket = jhash(errstr, len, 0) % ERRHASHSZ;
+ hlist_for_each_entry(c, &hash_errmap[bucket], list) {
+ if (c->namelen == len && !memcmp(c->name, errstr, len)) {
+ errno = c->val;
+ break;
+ }
+ }
+
+ if (errno == 0) {
+ /* TODO: if error isn't found, add it dynamically */
+ errstr[len] = 0;
+ pr_err("%s: server reported unknown error %s\n",
+ __func__, errstr);
+ errno = ESERVERFAULT;
+ }
+
+ return -errno;
+}
+EXPORT_SYMBOL(p9_errstr2errno);
diff --git a/net/9p/mod.c b/net/9p/mod.c
new file mode 100644
index 000000000..6ab36aea7
--- /dev/null
+++ b/net/9p/mod.c
@@ -0,0 +1,201 @@
+/*
+ * net/9p/9p.c
+ *
+ * 9P entry point
+ *
+ * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <net/9p/9p.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_NET_9P_DEBUG
+unsigned int p9_debug_level = 0; /* feature-rific global debug level */
+EXPORT_SYMBOL(p9_debug_level);
+module_param_named(debug, p9_debug_level, uint, 0);
+MODULE_PARM_DESC(debug, "9P debugging level");
+
+void _p9_debug(enum p9_debug_flags level, const char *func,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if ((p9_debug_level & level) != level)
+ return;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (level == P9_DEBUG_9P)
+ pr_notice("(%8.8d) %pV", task_pid_nr(current), &vaf);
+ else
+ pr_notice("-- %s (%d): %pV", func, task_pid_nr(current), &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(_p9_debug);
+#endif
+
+/*
+ * Dynamic Transport Registration Routines
+ *
+ */
+
+static DEFINE_SPINLOCK(v9fs_trans_lock);
+static LIST_HEAD(v9fs_trans_list);
+
+/**
+ * v9fs_register_trans - register a new transport with 9p
+ * @m: structure describing the transport module and entry points
+ *
+ */
+void v9fs_register_trans(struct p9_trans_module *m)
+{
+ spin_lock(&v9fs_trans_lock);
+ list_add_tail(&m->list, &v9fs_trans_list);
+ spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_register_trans);
+
+/**
+ * v9fs_unregister_trans - unregister a 9p transport
+ * @m: the transport to remove
+ *
+ */
+void v9fs_unregister_trans(struct p9_trans_module *m)
+{
+ spin_lock(&v9fs_trans_lock);
+ list_del_init(&m->list);
+ spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_unregister_trans);
+
+/**
+ * v9fs_get_trans_by_name - get transport with the matching name
+ * @name: string identifying transport
+ *
+ */
+struct p9_trans_module *v9fs_get_trans_by_name(char *s)
+{
+ struct p9_trans_module *t, *found = NULL;
+
+ spin_lock(&v9fs_trans_lock);
+
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (strcmp(t->name, s) == 0 &&
+ try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ spin_unlock(&v9fs_trans_lock);
+ return found;
+}
+EXPORT_SYMBOL(v9fs_get_trans_by_name);
+
+/**
+ * v9fs_get_default_trans - get the default transport
+ *
+ */
+
+struct p9_trans_module *v9fs_get_default_trans(void)
+{
+ struct p9_trans_module *t, *found = NULL;
+
+ spin_lock(&v9fs_trans_lock);
+
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (t->def && try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ if (!found)
+ list_for_each_entry(t, &v9fs_trans_list, list)
+ if (try_module_get(t->owner)) {
+ found = t;
+ break;
+ }
+
+ spin_unlock(&v9fs_trans_lock);
+ return found;
+}
+EXPORT_SYMBOL(v9fs_get_default_trans);
+
+/**
+ * v9fs_put_trans - put trans
+ * @m: transport to put
+ *
+ */
+void v9fs_put_trans(struct p9_trans_module *m)
+{
+ if (m)
+ module_put(m->owner);
+}
+
+/**
+ * init_p9 - Initialize module
+ *
+ */
+static int __init init_p9(void)
+{
+ int ret = 0;
+
+ p9_error_init();
+ pr_info("Installing 9P2000 support\n");
+ p9_trans_fd_init();
+
+ return ret;
+}
+
+/**
+ * exit_p9 - shutdown module
+ *
+ */
+
+static void __exit exit_p9(void)
+{
+ pr_info("Unloading 9P2000 support\n");
+
+ p9_trans_fd_exit();
+}
+
+module_init(init_p9)
+module_exit(exit_p9)
+
+MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
new file mode 100644
index 000000000..16d287565
--- /dev/null
+++ b/net/9p/protocol.c
@@ -0,0 +1,628 @@
+/*
+ * net/9p/protocol.c
+ *
+ * 9P Protocol Support Code
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ * Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2008 by IBM, Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "protocol.h"
+
+#include <trace/events/9p.h>
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+
+void p9stat_free(struct p9_wstat *stbuf)
+{
+ kfree(stbuf->name);
+ kfree(stbuf->uid);
+ kfree(stbuf->gid);
+ kfree(stbuf->muid);
+ kfree(stbuf->extension);
+}
+EXPORT_SYMBOL(p9stat_free);
+
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
+{
+ size_t len = min(pdu->size - pdu->offset, size);
+ memcpy(data, &pdu->sdata[pdu->offset], len);
+ pdu->offset += len;
+ return size - len;
+}
+
+static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size)
+{
+ size_t len = min(pdu->capacity - pdu->size, size);
+ memcpy(&pdu->sdata[pdu->size], data, len);
+ pdu->size += len;
+ return size - len;
+}
+
+static size_t
+pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size)
+{
+ size_t len = min(pdu->capacity - pdu->size, size);
+ struct iov_iter i = *from;
+ if (copy_from_iter(&pdu->sdata[pdu->size], len, &i) != len)
+ len = 0;
+
+ pdu->size += len;
+ return size - len;
+}
+
+/*
+ b - int8_t
+ w - int16_t
+ d - int32_t
+ q - int64_t
+ s - string
+ u - numeric uid
+ g - numeric gid
+ S - stat
+ Q - qid
+ D - data blob (int32_t size followed by void *, results are not freed)
+ T - array of strings (int16_t count, followed by strings)
+ R - array of qids (int16_t count, followed by qids)
+ A - stat for 9p2000.L (p9_stat_dotl)
+ ? - if optional = 1, continue parsing
+*/
+
+static int
+p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
+{
+ const char *ptr;
+ int errcode = 0;
+
+ for (ptr = fmt; *ptr; ptr++) {
+ switch (*ptr) {
+ case 'b':{
+ int8_t *val = va_arg(ap, int8_t *);
+ if (pdu_read(pdu, val, sizeof(*val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ }
+ break;
+ case 'w':{
+ int16_t *val = va_arg(ap, int16_t *);
+ __le16 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le16_to_cpu(le_val);
+ }
+ break;
+ case 'd':{
+ int32_t *val = va_arg(ap, int32_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le32_to_cpu(le_val);
+ }
+ break;
+ case 'q':{
+ int64_t *val = va_arg(ap, int64_t *);
+ __le64 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *val = le64_to_cpu(le_val);
+ }
+ break;
+ case 's':{
+ char **sptr = va_arg(ap, char **);
+ uint16_t len;
+
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", &len);
+ if (errcode)
+ break;
+
+ *sptr = kmalloc(len + 1, GFP_NOFS);
+ if (*sptr == NULL) {
+ errcode = -EFAULT;
+ break;
+ }
+ if (pdu_read(pdu, *sptr, len)) {
+ errcode = -EFAULT;
+ kfree(*sptr);
+ *sptr = NULL;
+ } else
+ (*sptr)[len] = 0;
+ }
+ break;
+ case 'u': {
+ kuid_t *uid = va_arg(ap, kuid_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *uid = make_kuid(&init_user_ns,
+ le32_to_cpu(le_val));
+ } break;
+ case 'g': {
+ kgid_t *gid = va_arg(ap, kgid_t *);
+ __le32 le_val;
+ if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+ errcode = -EFAULT;
+ break;
+ }
+ *gid = make_kgid(&init_user_ns,
+ le32_to_cpu(le_val));
+ } break;
+ case 'Q':{
+ struct p9_qid *qid =
+ va_arg(ap, struct p9_qid *);
+
+ errcode = p9pdu_readf(pdu, proto_version, "bdq",
+ &qid->type, &qid->version,
+ &qid->path);
+ }
+ break;
+ case 'S':{
+ struct p9_wstat *stbuf =
+ va_arg(ap, struct p9_wstat *);
+
+ memset(stbuf, 0, sizeof(struct p9_wstat));
+ stbuf->n_uid = stbuf->n_muid = INVALID_UID;
+ stbuf->n_gid = INVALID_GID;
+
+ errcode =
+ p9pdu_readf(pdu, proto_version,
+ "wwdQdddqssss?sugu",
+ &stbuf->size, &stbuf->type,
+ &stbuf->dev, &stbuf->qid,
+ &stbuf->mode, &stbuf->atime,
+ &stbuf->mtime, &stbuf->length,
+ &stbuf->name, &stbuf->uid,
+ &stbuf->gid, &stbuf->muid,
+ &stbuf->extension,
+ &stbuf->n_uid, &stbuf->n_gid,
+ &stbuf->n_muid);
+ if (errcode)
+ p9stat_free(stbuf);
+ }
+ break;
+ case 'D':{
+ uint32_t *count = va_arg(ap, uint32_t *);
+ void **data = va_arg(ap, void **);
+
+ errcode =
+ p9pdu_readf(pdu, proto_version, "d", count);
+ if (!errcode) {
+ *count =
+ min_t(uint32_t, *count,
+ pdu->size - pdu->offset);
+ *data = &pdu->sdata[pdu->offset];
+ }
+ }
+ break;
+ case 'T':{
+ uint16_t *nwname = va_arg(ap, uint16_t *);
+ char ***wnames = va_arg(ap, char ***);
+
+ errcode = p9pdu_readf(pdu, proto_version,
+ "w", nwname);
+ if (!errcode) {
+ *wnames =
+ kmalloc(sizeof(char *) * *nwname,
+ GFP_NOFS);
+ if (!*wnames)
+ errcode = -ENOMEM;
+ }
+
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < *nwname; i++) {
+ errcode =
+ p9pdu_readf(pdu,
+ proto_version,
+ "s",
+ &(*wnames)[i]);
+ if (errcode)
+ break;
+ }
+ }
+
+ if (errcode) {
+ if (*wnames) {
+ int i;
+
+ for (i = 0; i < *nwname; i++)
+ kfree((*wnames)[i]);
+ }
+ kfree(*wnames);
+ *wnames = NULL;
+ }
+ }
+ break;
+ case 'R':{
+ uint16_t *nwqid = va_arg(ap, uint16_t *);
+ struct p9_qid **wqids =
+ va_arg(ap, struct p9_qid **);
+
+ *wqids = NULL;
+
+ errcode =
+ p9pdu_readf(pdu, proto_version, "w", nwqid);
+ if (!errcode) {
+ *wqids =
+ kmalloc(*nwqid *
+ sizeof(struct p9_qid),
+ GFP_NOFS);
+ if (*wqids == NULL)
+ errcode = -ENOMEM;
+ }
+
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < *nwqid; i++) {
+ errcode =
+ p9pdu_readf(pdu,
+ proto_version,
+ "Q",
+ &(*wqids)[i]);
+ if (errcode)
+ break;
+ }
+ }
+
+ if (errcode) {
+ kfree(*wqids);
+ *wqids = NULL;
+ }
+ }
+ break;
+ case 'A': {
+ struct p9_stat_dotl *stbuf =
+ va_arg(ap, struct p9_stat_dotl *);
+
+ memset(stbuf, 0, sizeof(struct p9_stat_dotl));
+ errcode =
+ p9pdu_readf(pdu, proto_version,
+ "qQdugqqqqqqqqqqqqqqq",
+ &stbuf->st_result_mask,
+ &stbuf->qid,
+ &stbuf->st_mode,
+ &stbuf->st_uid, &stbuf->st_gid,
+ &stbuf->st_nlink,
+ &stbuf->st_rdev, &stbuf->st_size,
+ &stbuf->st_blksize, &stbuf->st_blocks,
+ &stbuf->st_atime_sec,
+ &stbuf->st_atime_nsec,
+ &stbuf->st_mtime_sec,
+ &stbuf->st_mtime_nsec,
+ &stbuf->st_ctime_sec,
+ &stbuf->st_ctime_nsec,
+ &stbuf->st_btime_sec,
+ &stbuf->st_btime_nsec,
+ &stbuf->st_gen,
+ &stbuf->st_data_version);
+ }
+ break;
+ case '?':
+ if ((proto_version != p9_proto_2000u) &&
+ (proto_version != p9_proto_2000L))
+ return 0;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (errcode)
+ break;
+ }
+
+ return errcode;
+}
+
+int
+p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap)
+{
+ const char *ptr;
+ int errcode = 0;
+
+ for (ptr = fmt; *ptr; ptr++) {
+ switch (*ptr) {
+ case 'b':{
+ int8_t val = va_arg(ap, int);
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'w':{
+ __le16 val = cpu_to_le16(va_arg(ap, int));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'd':{
+ __le32 val = cpu_to_le32(va_arg(ap, int32_t));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'q':{
+ __le64 val = cpu_to_le64(va_arg(ap, int64_t));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ }
+ break;
+ case 's':{
+ const char *sptr = va_arg(ap, const char *);
+ uint16_t len = 0;
+ if (sptr)
+ len = min_t(size_t, strlen(sptr),
+ USHRT_MAX);
+
+ errcode = p9pdu_writef(pdu, proto_version,
+ "w", len);
+ if (!errcode && pdu_write(pdu, sptr, len))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'u': {
+ kuid_t uid = va_arg(ap, kuid_t);
+ __le32 val = cpu_to_le32(
+ from_kuid(&init_user_ns, uid));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ } break;
+ case 'g': {
+ kgid_t gid = va_arg(ap, kgid_t);
+ __le32 val = cpu_to_le32(
+ from_kgid(&init_user_ns, gid));
+ if (pdu_write(pdu, &val, sizeof(val)))
+ errcode = -EFAULT;
+ } break;
+ case 'Q':{
+ const struct p9_qid *qid =
+ va_arg(ap, const struct p9_qid *);
+ errcode =
+ p9pdu_writef(pdu, proto_version, "bdq",
+ qid->type, qid->version,
+ qid->path);
+ } break;
+ case 'S':{
+ const struct p9_wstat *stbuf =
+ va_arg(ap, const struct p9_wstat *);
+ errcode =
+ p9pdu_writef(pdu, proto_version,
+ "wwdQdddqssss?sugu",
+ stbuf->size, stbuf->type,
+ stbuf->dev, &stbuf->qid,
+ stbuf->mode, stbuf->atime,
+ stbuf->mtime, stbuf->length,
+ stbuf->name, stbuf->uid,
+ stbuf->gid, stbuf->muid,
+ stbuf->extension, stbuf->n_uid,
+ stbuf->n_gid, stbuf->n_muid);
+ } break;
+ case 'V':{
+ uint32_t count = va_arg(ap, uint32_t);
+ struct iov_iter *from =
+ va_arg(ap, struct iov_iter *);
+ errcode = p9pdu_writef(pdu, proto_version, "d",
+ count);
+ if (!errcode && pdu_write_u(pdu, from, count))
+ errcode = -EFAULT;
+ }
+ break;
+ case 'T':{
+ uint16_t nwname = va_arg(ap, int);
+ const char **wnames = va_arg(ap, const char **);
+
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwname);
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < nwname; i++) {
+ errcode =
+ p9pdu_writef(pdu,
+ proto_version,
+ "s",
+ wnames[i]);
+ if (errcode)
+ break;
+ }
+ }
+ }
+ break;
+ case 'R':{
+ uint16_t nwqid = va_arg(ap, int);
+ struct p9_qid *wqids =
+ va_arg(ap, struct p9_qid *);
+
+ errcode = p9pdu_writef(pdu, proto_version, "w",
+ nwqid);
+ if (!errcode) {
+ int i;
+
+ for (i = 0; i < nwqid; i++) {
+ errcode =
+ p9pdu_writef(pdu,
+ proto_version,
+ "Q",
+ &wqids[i]);
+ if (errcode)
+ break;
+ }
+ }
+ }
+ break;
+ case 'I':{
+ struct p9_iattr_dotl *p9attr = va_arg(ap,
+ struct p9_iattr_dotl *);
+
+ errcode = p9pdu_writef(pdu, proto_version,
+ "ddugqqqqq",
+ p9attr->valid,
+ p9attr->mode,
+ p9attr->uid,
+ p9attr->gid,
+ p9attr->size,
+ p9attr->atime_sec,
+ p9attr->atime_nsec,
+ p9attr->mtime_sec,
+ p9attr->mtime_nsec);
+ }
+ break;
+ case '?':
+ if ((proto_version != p9_proto_2000u) &&
+ (proto_version != p9_proto_2000L))
+ return 0;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (errcode)
+ break;
+ }
+
+ return errcode;
+}
+
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = p9pdu_vreadf(pdu, proto_version, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = p9pdu_vwritef(pdu, proto_version, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+
+int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st)
+{
+ struct p9_fcall fake_pdu;
+ int ret;
+
+ fake_pdu.size = len;
+ fake_pdu.capacity = len;
+ fake_pdu.sdata = buf;
+ fake_pdu.offset = 0;
+
+ ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "S", st);
+ if (ret) {
+ p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret);
+ trace_9p_protocol_dump(clnt, &fake_pdu);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(p9stat_read);
+
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type)
+{
+ pdu->id = type;
+ return p9pdu_writef(pdu, 0, "dbw", 0, type, tag);
+}
+
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu)
+{
+ int size = pdu->size;
+ int err;
+
+ pdu->size = 0;
+ err = p9pdu_writef(pdu, 0, "d", size);
+ pdu->size = size;
+
+ trace_9p_protocol_dump(clnt, pdu);
+ p9_debug(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n",
+ pdu->size, pdu->id, pdu->tag);
+
+ return err;
+}
+
+void p9pdu_reset(struct p9_fcall *pdu)
+{
+ pdu->offset = 0;
+ pdu->size = 0;
+}
+
+int p9dirent_read(struct p9_client *clnt, char *buf, int len,
+ struct p9_dirent *dirent)
+{
+ struct p9_fcall fake_pdu;
+ int ret;
+ char *nameptr;
+
+ fake_pdu.size = len;
+ fake_pdu.capacity = len;
+ fake_pdu.sdata = buf;
+ fake_pdu.offset = 0;
+
+ ret = p9pdu_readf(&fake_pdu, clnt->proto_version, "Qqbs", &dirent->qid,
+ &dirent->d_off, &dirent->d_type, &nameptr);
+ if (ret) {
+ p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
+ trace_9p_protocol_dump(clnt, &fake_pdu);
+ goto out;
+ }
+
+ strcpy(dirent->d_name, nameptr);
+ kfree(nameptr);
+
+out:
+ return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
new file mode 100644
index 000000000..2cc525fa4
--- /dev/null
+++ b/net/9p/protocol.h
@@ -0,0 +1,34 @@
+/*
+ * net/9p/protocol.h
+ *
+ * 9P Protocol Support Code
+ *
+ * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ * Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2008 by IBM, Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+ va_list ap);
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
+int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu);
+void p9pdu_reset(struct p9_fcall *pdu);
+size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size);
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
new file mode 100644
index 000000000..38aa6345b
--- /dev/null
+++ b/net/9p/trans_common.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+
+/**
+ * p9_release_req_pages - Release pages after the transaction.
+ */
+void p9_release_pages(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ put_page(pages[i]);
+}
+EXPORT_SYMBOL(p9_release_pages);
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
new file mode 100644
index 000000000..c43babb3f
--- /dev/null
+++ b/net/9p/trans_common.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+void p9_release_pages(struct page **, int);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
new file mode 100644
index 000000000..bced8c074
--- /dev/null
+++ b/net/9p/trans_fd.c
@@ -0,0 +1,1117 @@
+/*
+ * linux/fs/9p/trans_fd.c
+ *
+ * Fd transport layer. Includes deprecated socket layer.
+ *
+ * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#include <linux/syscalls.h> /* killme */
+
+#define P9_PORT 564
+#define MAX_SOCK_BUF (64*1024)
+#define MAXPOLLWADDR 2
+
+/**
+ * struct p9_fd_opts - per-transport options
+ * @rfd: file descriptor for reading (trans=fd)
+ * @wfd: file descriptor for writing (trans=fd)
+ * @port: port to connect to (trans=tcp)
+ *
+ */
+
+struct p9_fd_opts {
+ int rfd;
+ int wfd;
+ u16 port;
+ int privport;
+};
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ * - a little lazy - parse all fd-transport options
+ */
+
+enum {
+ /* Options that take integer arguments */
+ Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
+ /* Options that take no arguments */
+ Opt_privport,
+};
+
+static const match_table_t tokens = {
+ {Opt_port, "port=%u"},
+ {Opt_rfdno, "rfdno=%u"},
+ {Opt_wfdno, "wfdno=%u"},
+ {Opt_privport, "privport"},
+ {Opt_err, NULL},
+};
+
+enum {
+ Rworksched = 1, /* read work scheduled or running */
+ Rpending = 2, /* can read */
+ Wworksched = 4, /* write work scheduled or running */
+ Wpending = 8, /* can write */
+};
+
+struct p9_poll_wait {
+ struct p9_conn *conn;
+ wait_queue_t wait;
+ wait_queue_head_t *wait_addr;
+};
+
+/**
+ * struct p9_conn - fd mux connection state information
+ * @mux_list: list link for mux to manage multiple connections (?)
+ * @client: reference to client instance for this connection
+ * @err: error state
+ * @req_list: accounting for requests which have been sent
+ * @unsent_req_list: accounting for requests that haven't been sent
+ * @req: current request being processed (if any)
+ * @tmp_buf: temporary buffer to read in header
+ * @rsize: amount to read for current frame
+ * @rpos: read position in current frame
+ * @rbuf: current read buffer
+ * @wpos: write position for current frame
+ * @wsize: amount of data to write for current frame
+ * @wbuf: current write buffer
+ * @poll_pending_link: pending links to be polled per conn
+ * @poll_wait: array of wait_q's for various worker threads
+ * @pt: poll state
+ * @rq: current read work
+ * @wq: current write work
+ * @wsched: ????
+ *
+ */
+
+struct p9_conn {
+ struct list_head mux_list;
+ struct p9_client *client;
+ int err;
+ struct list_head req_list;
+ struct list_head unsent_req_list;
+ struct p9_req_t *req;
+ char tmp_buf[7];
+ int rsize;
+ int rpos;
+ char *rbuf;
+ int wpos;
+ int wsize;
+ char *wbuf;
+ struct list_head poll_pending_link;
+ struct p9_poll_wait poll_wait[MAXPOLLWADDR];
+ poll_table pt;
+ struct work_struct rq;
+ struct work_struct wq;
+ unsigned long wsched;
+};
+
+/**
+ * struct p9_trans_fd - transport state
+ * @rd: reference to file to read from
+ * @wr: reference of file to write to
+ * @conn: connection state reference
+ *
+ */
+
+struct p9_trans_fd {
+ struct file *rd;
+ struct file *wr;
+ struct p9_conn conn;
+};
+
+static void p9_poll_workfn(struct work_struct *work);
+
+static DEFINE_SPINLOCK(p9_poll_lock);
+static LIST_HEAD(p9_poll_pending_list);
+static DECLARE_WORK(p9_poll_work, p9_poll_workfn);
+
+static unsigned int p9_ipport_resv_min = P9_DEF_MIN_RESVPORT;
+static unsigned int p9_ipport_resv_max = P9_DEF_MAX_RESVPORT;
+
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+ struct p9_poll_wait *pwait = &m->poll_wait[i];
+
+ if (pwait->wait_addr) {
+ remove_wait_queue(pwait->wait_addr, &pwait->wait);
+ pwait->wait_addr = NULL;
+ }
+ }
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ list_del_init(&m->poll_pending_link);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+}
+
+/**
+ * p9_conn_cancel - cancel all pending requests with error
+ * @m: mux data
+ * @err: error code
+ *
+ */
+
+static void p9_conn_cancel(struct p9_conn *m, int err)
+{
+ struct p9_req_t *req, *rtmp;
+ unsigned long flags;
+ LIST_HEAD(cancel_list);
+
+ p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
+
+ spin_lock_irqsave(&m->client->lock, flags);
+
+ if (m->err) {
+ spin_unlock_irqrestore(&m->client->lock, flags);
+ return;
+ }
+
+ m->err = err;
+
+ list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
+ list_move(&req->req_list, &cancel_list);
+ }
+ list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
+ list_move(&req->req_list, &cancel_list);
+ }
+ spin_unlock_irqrestore(&m->client->lock, flags);
+
+ list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
+ p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
+ list_del(&req->req_list);
+ if (!req->t_err)
+ req->t_err = err;
+ p9_client_cb(m->client, req, REQ_STATUS_ERROR);
+ }
+}
+
+static int
+p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
+{
+ int ret, n;
+ struct p9_trans_fd *ts = NULL;
+
+ if (client && client->status == Connected)
+ ts = client->trans;
+
+ if (!ts)
+ return -EREMOTEIO;
+
+ if (!ts->rd->f_op->poll)
+ return -EIO;
+
+ if (!ts->wr->f_op->poll)
+ return -EIO;
+
+ ret = ts->rd->f_op->poll(ts->rd, pt);
+ if (ret < 0)
+ return ret;
+
+ if (ts->rd != ts->wr) {
+ n = ts->wr->f_op->poll(ts->wr, pt);
+ if (n < 0)
+ return n;
+ ret = (ret & ~POLLOUT) | (n & ~POLLIN);
+ }
+
+ return ret;
+}
+
+/**
+ * p9_fd_read- read from a fd
+ * @client: client instance
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ */
+
+static int p9_fd_read(struct p9_client *client, void *v, int len)
+{
+ int ret;
+ struct p9_trans_fd *ts = NULL;
+
+ if (client && client->status != Disconnected)
+ ts = client->trans;
+
+ if (!ts)
+ return -EREMOTEIO;
+
+ if (!(ts->rd->f_flags & O_NONBLOCK))
+ p9_debug(P9_DEBUG_ERROR, "blocking read ...\n");
+
+ ret = kernel_read(ts->rd, ts->rd->f_pos, v, len);
+ if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+ client->status = Disconnected;
+ return ret;
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ * @work: container of work to be done
+ *
+ */
+
+static void p9_read_work(struct work_struct *work)
+{
+ int n, err;
+ struct p9_conn *m;
+ int status = REQ_STATUS_ERROR;
+
+ m = container_of(work, struct p9_conn, rq);
+
+ if (m->err < 0)
+ return;
+
+ p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos);
+
+ if (!m->rbuf) {
+ m->rbuf = m->tmp_buf;
+ m->rpos = 0;
+ m->rsize = 7; /* start by reading header */
+ }
+
+ clear_bit(Rpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n",
+ m, m->rpos, m->rsize, m->rsize-m->rpos);
+ err = p9_fd_read(m->client, m->rbuf + m->rpos,
+ m->rsize - m->rpos);
+ p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
+ if (err == -EAGAIN) {
+ goto end_clear;
+ }
+
+ if (err <= 0)
+ goto error;
+
+ m->rpos += err;
+
+ if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */
+ u16 tag;
+ p9_debug(P9_DEBUG_TRANS, "got new header\n");
+
+ n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */
+ if (n >= m->client->msize) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d\n", n);
+ err = -EIO;
+ goto error;
+ }
+
+ tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
+ p9_debug(P9_DEBUG_TRANS,
+ "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
+
+ m->req = p9_tag_lookup(m->client, tag);
+ if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
+ p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
+ tag);
+ err = -EIO;
+ goto error;
+ }
+
+ if (m->req->rc == NULL) {
+ m->req->rc = kmalloc(sizeof(struct p9_fcall) +
+ m->client->msize, GFP_NOFS);
+ if (!m->req->rc) {
+ m->req = NULL;
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+ m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall);
+ memcpy(m->rbuf, m->tmp_buf, m->rsize);
+ m->rsize = n;
+ }
+
+ /* not an else because some packets (like clunk) have no payload */
+ if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */
+ p9_debug(P9_DEBUG_TRANS, "got new packet\n");
+ spin_lock(&m->client->lock);
+ if (m->req->status != REQ_STATUS_ERROR)
+ status = REQ_STATUS_RCVD;
+ list_del(&m->req->req_list);
+ spin_unlock(&m->client->lock);
+ p9_client_cb(m->client, m->req, status);
+ m->rbuf = NULL;
+ m->rpos = 0;
+ m->rsize = 0;
+ m->req = NULL;
+ }
+
+end_clear:
+ clear_bit(Rworksched, &m->wsched);
+
+ if (!list_empty(&m->req_list)) {
+ if (test_and_clear_bit(Rpending, &m->wsched))
+ n = POLLIN;
+ else
+ n = p9_fd_poll(m->client, NULL);
+
+ if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+ schedule_work(&m->rq);
+ }
+ }
+
+ return;
+error:
+ p9_conn_cancel(m, err);
+ clear_bit(Rworksched, &m->wsched);
+}
+
+/**
+ * p9_fd_write - write to a socket
+ * @client: client instance
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ */
+
+static int p9_fd_write(struct p9_client *client, void *v, int len)
+{
+ int ret;
+ mm_segment_t oldfs;
+ struct p9_trans_fd *ts = NULL;
+
+ if (client && client->status != Disconnected)
+ ts = client->trans;
+
+ if (!ts)
+ return -EREMOTEIO;
+
+ if (!(ts->wr->f_flags & O_NONBLOCK))
+ p9_debug(P9_DEBUG_ERROR, "blocking write ...\n");
+
+ oldfs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
+ set_fs(oldfs);
+
+ if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+ client->status = Disconnected;
+ return ret;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ * @work: container for work to be done
+ *
+ */
+
+static void p9_write_work(struct work_struct *work)
+{
+ int n, err;
+ struct p9_conn *m;
+ struct p9_req_t *req;
+
+ m = container_of(work, struct p9_conn, wq);
+
+ if (m->err < 0) {
+ clear_bit(Wworksched, &m->wsched);
+ return;
+ }
+
+ if (!m->wsize) {
+ spin_lock(&m->client->lock);
+ if (list_empty(&m->unsent_req_list)) {
+ clear_bit(Wworksched, &m->wsched);
+ spin_unlock(&m->client->lock);
+ return;
+ }
+
+ req = list_entry(m->unsent_req_list.next, struct p9_req_t,
+ req_list);
+ req->status = REQ_STATUS_SENT;
+ p9_debug(P9_DEBUG_TRANS, "move req %p\n", req);
+ list_move_tail(&req->req_list, &m->req_list);
+
+ m->wbuf = req->tc->sdata;
+ m->wsize = req->tc->size;
+ m->wpos = 0;
+ spin_unlock(&m->client->lock);
+ }
+
+ p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
+ m, m->wpos, m->wsize);
+ clear_bit(Wpending, &m->wsched);
+ err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos);
+ p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err);
+ if (err == -EAGAIN)
+ goto end_clear;
+
+
+ if (err < 0)
+ goto error;
+ else if (err == 0) {
+ err = -EREMOTEIO;
+ goto error;
+ }
+
+ m->wpos += err;
+ if (m->wpos == m->wsize)
+ m->wpos = m->wsize = 0;
+
+end_clear:
+ clear_bit(Wworksched, &m->wsched);
+
+ if (m->wsize || !list_empty(&m->unsent_req_list)) {
+ if (test_and_clear_bit(Wpending, &m->wsched))
+ n = POLLOUT;
+ else
+ n = p9_fd_poll(m->client, NULL);
+
+ if ((n & POLLOUT) &&
+ !test_and_set_bit(Wworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+ schedule_work(&m->wq);
+ }
+ }
+
+ return;
+
+error:
+ p9_conn_cancel(m, err);
+ clear_bit(Wworksched, &m->wsched);
+}
+
+static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct p9_poll_wait *pwait =
+ container_of(wait, struct p9_poll_wait, wait);
+ struct p9_conn *m = pwait->conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ if (list_empty(&m->poll_pending_link))
+ list_add_tail(&m->poll_pending_link, &p9_poll_pending_list);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ schedule_work(&p9_poll_work);
+ return 1;
+}
+
+/**
+ * p9_pollwait - add poll task to the wait queue
+ * @filp: file pointer being polled
+ * @wait_address: wait_q to block on
+ * @p: poll state
+ *
+ * called by files poll operation to add v9fs-poll task to files wait queue
+ */
+
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+ struct p9_conn *m = container_of(p, struct p9_conn, pt);
+ struct p9_poll_wait *pwait = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+ if (m->poll_wait[i].wait_addr == NULL) {
+ pwait = &m->poll_wait[i];
+ break;
+ }
+ }
+
+ if (!pwait) {
+ p9_debug(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+ return;
+ }
+
+ pwait->conn = m;
+ pwait->wait_addr = wait_address;
+ init_waitqueue_func_entry(&pwait->wait, p9_pollwake);
+ add_wait_queue(wait_address, &pwait->wait);
+}
+
+/**
+ * p9_conn_create - initialize the per-session mux data
+ * @client: client instance
+ *
+ * Note: Creates the polling task if this is the first session.
+ */
+
+static void p9_conn_create(struct p9_client *client)
+{
+ int n;
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p msize %d\n", client, client->msize);
+
+ INIT_LIST_HEAD(&m->mux_list);
+ m->client = client;
+
+ INIT_LIST_HEAD(&m->req_list);
+ INIT_LIST_HEAD(&m->unsent_req_list);
+ INIT_WORK(&m->rq, p9_read_work);
+ INIT_WORK(&m->wq, p9_write_work);
+ INIT_LIST_HEAD(&m->poll_pending_link);
+ init_poll_funcptr(&m->pt, p9_pollwait);
+
+ n = p9_fd_poll(client, &m->pt);
+ if (n & POLLIN) {
+ p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+ set_bit(Rpending, &m->wsched);
+ }
+
+ if (n & POLLOUT) {
+ p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+ set_bit(Wpending, &m->wsched);
+ }
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ * @m: connection to poll
+ *
+ */
+
+static void p9_poll_mux(struct p9_conn *m)
+{
+ int n;
+
+ if (m->err < 0)
+ return;
+
+ n = p9_fd_poll(m->client, NULL);
+ if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
+ p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n);
+ if (n >= 0)
+ n = -ECONNRESET;
+ p9_conn_cancel(m, n);
+ }
+
+ if (n & POLLIN) {
+ set_bit(Rpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
+ if (!test_and_set_bit(Rworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
+ schedule_work(&m->rq);
+ }
+ }
+
+ if (n & POLLOUT) {
+ set_bit(Wpending, &m->wsched);
+ p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
+ if ((m->wsize || !list_empty(&m->unsent_req_list)) &&
+ !test_and_set_bit(Wworksched, &m->wsched)) {
+ p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
+ schedule_work(&m->wq);
+ }
+ }
+}
+
+/**
+ * p9_fd_request - send 9P request
+ * The function can sleep until the request is scheduled for sending.
+ * The function can be interrupted. Return from the function is not
+ * a guarantee that the request is sent successfully.
+ *
+ * @client: client instance
+ * @req: request to be sent
+ *
+ */
+
+static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
+{
+ int n;
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
+ p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n",
+ m, current, req->tc, req->tc->id);
+ if (m->err < 0)
+ return m->err;
+
+ spin_lock(&client->lock);
+ req->status = REQ_STATUS_UNSENT;
+ list_add_tail(&req->req_list, &m->unsent_req_list);
+ spin_unlock(&client->lock);
+
+ if (test_and_clear_bit(Wpending, &m->wsched))
+ n = POLLOUT;
+ else
+ n = p9_fd_poll(m->client, NULL);
+
+ if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
+ schedule_work(&m->wq);
+
+ return 0;
+}
+
+static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ int ret = 1;
+
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ spin_lock(&client->lock);
+
+ if (req->status == REQ_STATUS_UNSENT) {
+ list_del(&req->req_list);
+ req->status = REQ_STATUS_FLSHD;
+ ret = 0;
+ }
+ spin_unlock(&client->lock);
+
+ return ret;
+}
+
+static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+ /* we haven't received a response for oldreq,
+ * remove it from the list.
+ */
+ spin_lock(&client->lock);
+ list_del(&req->req_list);
+ spin_unlock(&client->lock);
+
+ return 0;
+}
+
+/**
+ * parse_opts - parse mount options into p9_fd_opts structure
+ * @params: options string passed from mount
+ * @opts: fd transport-specific structure to parse options into
+ *
+ * Returns 0 upon success, -ERRNO upon failure
+ */
+
+static int parse_opts(char *params, struct p9_fd_opts *opts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ char *options, *tmp_options;
+
+ opts->port = P9_PORT;
+ opts->rfd = ~0;
+ opts->wfd = ~0;
+ opts->privport = 0;
+
+ if (!params)
+ return 0;
+
+ tmp_options = kstrdup(params, GFP_KERNEL);
+ if (!tmp_options) {
+ p9_debug(P9_DEBUG_ERROR,
+ "failed to allocate copy of option string\n");
+ return -ENOMEM;
+ }
+ options = tmp_options;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ int r;
+ if (!*p)
+ continue;
+ token = match_token(p, tokens, args);
+ if ((token != Opt_err) && (token != Opt_privport)) {
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ continue;
+ }
+ }
+ switch (token) {
+ case Opt_port:
+ opts->port = option;
+ break;
+ case Opt_rfdno:
+ opts->rfd = option;
+ break;
+ case Opt_wfdno:
+ opts->wfd = option;
+ break;
+ case Opt_privport:
+ opts->privport = 1;
+ break;
+ default:
+ continue;
+ }
+ }
+
+ kfree(tmp_options);
+ return 0;
+}
+
+static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
+{
+ struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
+ GFP_KERNEL);
+ if (!ts)
+ return -ENOMEM;
+
+ ts->rd = fget(rfd);
+ ts->wr = fget(wfd);
+ if (!ts->rd || !ts->wr) {
+ if (ts->rd)
+ fput(ts->rd);
+ if (ts->wr)
+ fput(ts->wr);
+ kfree(ts);
+ return -EIO;
+ }
+
+ client->trans = ts;
+ client->status = Connected;
+
+ return 0;
+}
+
+static int p9_socket_open(struct p9_client *client, struct socket *csocket)
+{
+ struct p9_trans_fd *p;
+ struct file *file;
+
+ p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ csocket->sk->sk_allocation = GFP_NOIO;
+ file = sock_alloc_file(csocket, 0, NULL);
+ if (IS_ERR(file)) {
+ pr_err("%s (%d): failed to map fd\n",
+ __func__, task_pid_nr(current));
+ sock_release(csocket);
+ kfree(p);
+ return PTR_ERR(file);
+ }
+
+ get_file(file);
+ p->wr = p->rd = file;
+ client->trans = p;
+ client->status = Connected;
+
+ p->rd->f_flags |= O_NONBLOCK;
+
+ p9_conn_create(client);
+ return 0;
+}
+
+/**
+ * p9_mux_destroy - cancels all pending requests of mux
+ * @m: mux to destroy
+ *
+ */
+
+static void p9_conn_destroy(struct p9_conn *m)
+{
+ p9_debug(P9_DEBUG_TRANS, "mux %p prev %p next %p\n",
+ m, m->mux_list.prev, m->mux_list.next);
+
+ p9_mux_poll_stop(m);
+ cancel_work_sync(&m->rq);
+ cancel_work_sync(&m->wq);
+
+ p9_conn_cancel(m, -ECONNRESET);
+
+ m->client = NULL;
+}
+
+/**
+ * p9_fd_close - shutdown file descriptor transport
+ * @client: client instance
+ *
+ */
+
+static void p9_fd_close(struct p9_client *client)
+{
+ struct p9_trans_fd *ts;
+
+ if (!client)
+ return;
+
+ ts = client->trans;
+ if (!ts)
+ return;
+
+ client->status = Disconnected;
+
+ p9_conn_destroy(&ts->conn);
+
+ if (ts->rd)
+ fput(ts->rd);
+ if (ts->wr)
+ fput(ts->wr);
+
+ kfree(ts);
+}
+
+/*
+ * stolen from NFS - maybe should be made a generic function?
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+ int rc, count, in[4];
+
+ rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+ if (rc != 4)
+ return -EINVAL;
+ for (count = 0; count < 4; count++) {
+ if (in[count] > 255)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int p9_bind_privport(struct socket *sock)
+{
+ struct sockaddr_in cl;
+ int port, err = -EINVAL;
+
+ memset(&cl, 0, sizeof(cl));
+ cl.sin_family = AF_INET;
+ cl.sin_addr.s_addr = INADDR_ANY;
+ for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) {
+ cl.sin_port = htons((ushort)port);
+ err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl));
+ if (err != -EADDRINUSE)
+ break;
+ }
+ return err;
+}
+
+
+static int
+p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
+{
+ int err;
+ struct socket *csocket;
+ struct sockaddr_in sin_server;
+ struct p9_fd_opts opts;
+
+ err = parse_opts(args, &opts);
+ if (err < 0)
+ return err;
+
+ if (valid_ipaddr4(addr) < 0)
+ return -EINVAL;
+
+ csocket = NULL;
+
+ sin_server.sin_family = AF_INET;
+ sin_server.sin_addr.s_addr = in_aton(addr);
+ sin_server.sin_port = htons(opts.port);
+ err = __sock_create(current->nsproxy->net_ns, PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
+ if (err) {
+ pr_err("%s (%d): problem creating socket\n",
+ __func__, task_pid_nr(current));
+ return err;
+ }
+
+ if (opts.privport) {
+ err = p9_bind_privport(csocket);
+ if (err < 0) {
+ pr_err("%s (%d): problem binding to privport\n",
+ __func__, task_pid_nr(current));
+ sock_release(csocket);
+ return err;
+ }
+ }
+
+ err = csocket->ops->connect(csocket,
+ (struct sockaddr *)&sin_server,
+ sizeof(struct sockaddr_in), 0);
+ if (err < 0) {
+ pr_err("%s (%d): problem connecting socket to %s\n",
+ __func__, task_pid_nr(current), addr);
+ sock_release(csocket);
+ return err;
+ }
+
+ return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
+{
+ int err;
+ struct socket *csocket;
+ struct sockaddr_un sun_server;
+
+ csocket = NULL;
+
+ if (strlen(addr) >= UNIX_PATH_MAX) {
+ pr_err("%s (%d): address too long: %s\n",
+ __func__, task_pid_nr(current), addr);
+ return -ENAMETOOLONG;
+ }
+
+ sun_server.sun_family = PF_UNIX;
+ strcpy(sun_server.sun_path, addr);
+ err = __sock_create(current->nsproxy->net_ns, PF_UNIX,
+ SOCK_STREAM, 0, &csocket, 1);
+ if (err < 0) {
+ pr_err("%s (%d): problem creating socket\n",
+ __func__, task_pid_nr(current));
+
+ return err;
+ }
+ err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
+ sizeof(struct sockaddr_un) - 1, 0);
+ if (err < 0) {
+ pr_err("%s (%d): problem connecting socket: %s: %d\n",
+ __func__, task_pid_nr(current), addr, err);
+ sock_release(csocket);
+ return err;
+ }
+
+ return p9_socket_open(client, csocket);
+}
+
+static int
+p9_fd_create(struct p9_client *client, const char *addr, char *args)
+{
+ int err;
+ struct p9_fd_opts opts;
+
+ parse_opts(args, &opts);
+
+ if (opts.rfd == ~0 || opts.wfd == ~0) {
+ pr_err("Insufficient options for proto=fd\n");
+ return -ENOPROTOOPT;
+ }
+
+ err = p9_fd_open(client, opts.rfd, opts.wfd);
+ if (err < 0)
+ return err;
+
+ p9_conn_create(client);
+
+ return 0;
+}
+
+static struct p9_trans_module p9_tcp_trans = {
+ .name = "tcp",
+ .maxsize = MAX_SOCK_BUF,
+ .def = 0,
+ .create = p9_fd_create_tcp,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .owner = THIS_MODULE,
+};
+
+static struct p9_trans_module p9_unix_trans = {
+ .name = "unix",
+ .maxsize = MAX_SOCK_BUF,
+ .def = 0,
+ .create = p9_fd_create_unix,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .owner = THIS_MODULE,
+};
+
+static struct p9_trans_module p9_fd_trans = {
+ .name = "fd",
+ .maxsize = MAX_SOCK_BUF,
+ .def = 0,
+ .create = p9_fd_create,
+ .close = p9_fd_close,
+ .request = p9_fd_request,
+ .cancel = p9_fd_cancel,
+ .cancelled = p9_fd_cancelled,
+ .owner = THIS_MODULE,
+};
+
+/**
+ * p9_poll_proc - poll worker thread
+ * @a: thread state and arguments
+ *
+ * polls all v9fs transports for new events and queues the appropriate
+ * work to the work queue
+ *
+ */
+
+static void p9_poll_workfn(struct work_struct *work)
+{
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_TRANS, "start %p\n", current);
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ while (!list_empty(&p9_poll_pending_list)) {
+ struct p9_conn *conn = list_first_entry(&p9_poll_pending_list,
+ struct p9_conn,
+ poll_pending_link);
+ list_del_init(&conn->poll_pending_link);
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ p9_poll_mux(conn);
+
+ spin_lock_irqsave(&p9_poll_lock, flags);
+ }
+ spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ p9_debug(P9_DEBUG_TRANS, "finish\n");
+}
+
+int p9_trans_fd_init(void)
+{
+ v9fs_register_trans(&p9_tcp_trans);
+ v9fs_register_trans(&p9_unix_trans);
+ v9fs_register_trans(&p9_fd_trans);
+
+ return 0;
+}
+
+void p9_trans_fd_exit(void)
+{
+ flush_work(&p9_poll_work);
+ v9fs_unregister_trans(&p9_tcp_trans);
+ v9fs_unregister_trans(&p9_unix_trans);
+ v9fs_unregister_trans(&p9_fd_trans);
+}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
new file mode 100644
index 000000000..3533d2a53
--- /dev/null
+++ b/net/9p/trans_rdma.c
@@ -0,0 +1,801 @@
+/*
+ * linux/fs/9p/trans_rdma.c
+ *
+ * RDMA transport layer based on the trans_fd.c implementation.
+ *
+ * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
+ * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define P9_PORT 5640
+#define P9_RDMA_SQ_DEPTH 32
+#define P9_RDMA_RQ_DEPTH 32
+#define P9_RDMA_SEND_SGE 4
+#define P9_RDMA_RECV_SGE 4
+#define P9_RDMA_IRD 0
+#define P9_RDMA_ORD 0
+#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
+#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */
+
+/**
+ * struct p9_trans_rdma - RDMA transport instance
+ *
+ * @state: tracks the transport state machine for connection setup and tear down
+ * @cm_id: The RDMA CM ID
+ * @pd: Protection Domain pointer
+ * @qp: Queue Pair pointer
+ * @cq: Completion Queue pointer
+ * @dm_mr: DMA Memory Region pointer
+ * @lkey: The local access only memory region key
+ * @timeout: Number of uSecs to wait for connection management events
+ * @sq_depth: The depth of the Send Queue
+ * @sq_sem: Semaphore for the SQ
+ * @rq_depth: The depth of the Receive Queue.
+ * @rq_sem: Semaphore for the RQ
+ * @excess_rc : Amount of posted Receive Contexts without a pending request.
+ * See rdma_request()
+ * @addr: The remote peer's address
+ * @req_lock: Protects the active request list
+ * @cm_done: Completion event for connection management tracking
+ */
+struct p9_trans_rdma {
+ enum {
+ P9_RDMA_INIT,
+ P9_RDMA_ADDR_RESOLVED,
+ P9_RDMA_ROUTE_RESOLVED,
+ P9_RDMA_CONNECTED,
+ P9_RDMA_FLUSHING,
+ P9_RDMA_CLOSING,
+ P9_RDMA_CLOSED,
+ } state;
+ struct rdma_cm_id *cm_id;
+ struct ib_pd *pd;
+ struct ib_qp *qp;
+ struct ib_cq *cq;
+ struct ib_mr *dma_mr;
+ u32 lkey;
+ long timeout;
+ int sq_depth;
+ struct semaphore sq_sem;
+ int rq_depth;
+ struct semaphore rq_sem;
+ atomic_t excess_rc;
+ struct sockaddr_in addr;
+ spinlock_t req_lock;
+
+ struct completion cm_done;
+};
+
+/**
+ * p9_rdma_context - Keeps track of in-process WR
+ *
+ * @wc_op: The original WR op for when the CQE completes in error.
+ * @busa: Bus address to unmap when the WR completes
+ * @req: Keeps track of requests (send)
+ * @rc: Keepts track of replies (receive)
+ */
+struct p9_rdma_req;
+struct p9_rdma_context {
+ enum ib_wc_opcode wc_op;
+ dma_addr_t busa;
+ union {
+ struct p9_req_t *req;
+ struct p9_fcall *rc;
+ };
+};
+
+/**
+ * p9_rdma_opts - Collection of mount options
+ * @port: port of connection
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+ short port;
+ int sq_depth;
+ int rq_depth;
+ long timeout;
+ int privport;
+};
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ */
+enum {
+ /* Options that take integer arguments */
+ Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+ /* Options that take no argument */
+ Opt_privport,
+ Opt_err,
+};
+
+static match_table_t tokens = {
+ {Opt_port, "port=%u"},
+ {Opt_sq_depth, "sq=%u"},
+ {Opt_rq_depth, "rq=%u"},
+ {Opt_timeout, "timeout=%u"},
+ {Opt_privport, "privport"},
+ {Opt_err, NULL},
+};
+
+/**
+ * parse_opts - parse mount options into rdma options structure
+ * @params: options string passed from mount
+ * @opts: rdma transport-specific structure to parse options into
+ *
+ * Returns 0 upon success, -ERRNO upon failure
+ */
+static int parse_opts(char *params, struct p9_rdma_opts *opts)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ char *options, *tmp_options;
+
+ opts->port = P9_PORT;
+ opts->sq_depth = P9_RDMA_SQ_DEPTH;
+ opts->rq_depth = P9_RDMA_RQ_DEPTH;
+ opts->timeout = P9_RDMA_TIMEOUT;
+ opts->privport = 0;
+
+ if (!params)
+ return 0;
+
+ tmp_options = kstrdup(params, GFP_KERNEL);
+ if (!tmp_options) {
+ p9_debug(P9_DEBUG_ERROR,
+ "failed to allocate copy of option string\n");
+ return -ENOMEM;
+ }
+ options = tmp_options;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ int r;
+ if (!*p)
+ continue;
+ token = match_token(p, tokens, args);
+ if ((token != Opt_err) && (token != Opt_privport)) {
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ continue;
+ }
+ }
+ switch (token) {
+ case Opt_port:
+ opts->port = option;
+ break;
+ case Opt_sq_depth:
+ opts->sq_depth = option;
+ break;
+ case Opt_rq_depth:
+ opts->rq_depth = option;
+ break;
+ case Opt_timeout:
+ opts->timeout = option;
+ break;
+ case Opt_privport:
+ opts->privport = 1;
+ break;
+ default:
+ continue;
+ }
+ }
+ /* RQ must be at least as large as the SQ */
+ opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
+ kfree(tmp_options);
+ return 0;
+}
+
+static int
+p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+ struct p9_client *c = id->context;
+ struct p9_trans_rdma *rdma = c->trans;
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ BUG_ON(rdma->state != P9_RDMA_INIT);
+ rdma->state = P9_RDMA_ADDR_RESOLVED;
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
+ rdma->state = P9_RDMA_ROUTE_RESOLVED;
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
+ rdma->state = P9_RDMA_CONNECTED;
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ if (rdma)
+ rdma->state = P9_RDMA_CLOSED;
+ if (c)
+ c->status = Disconnected;
+ break;
+
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ break;
+
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ case RDMA_CM_EVENT_CONNECT_RESPONSE:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ c->status = Disconnected;
+ rdma_disconnect(rdma->cm_id);
+ break;
+ default:
+ BUG();
+ }
+ complete(&rdma->cm_done);
+ return 0;
+}
+
+static void
+handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
+ struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+{
+ struct p9_req_t *req;
+ int err = 0;
+ int16_t tag;
+
+ req = NULL;
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
+ DMA_FROM_DEVICE);
+
+ if (status != IB_WC_SUCCESS)
+ goto err_out;
+
+ err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
+ if (err)
+ goto err_out;
+
+ req = p9_tag_lookup(client, tag);
+ if (!req)
+ goto err_out;
+
+ /* Check that we have not yet received a reply for this request.
+ */
+ if (unlikely(req->rc)) {
+ pr_err("Duplicate reply for request %d", tag);
+ goto err_out;
+ }
+
+ req->rc = c->rc;
+ p9_client_cb(client, req, REQ_STATUS_RCVD);
+
+ return;
+
+ err_out:
+ p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status);
+ rdma->state = P9_RDMA_FLUSHING;
+ client->status = Disconnected;
+}
+
+static void
+handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
+ struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+{
+ ib_dma_unmap_single(rdma->cm_id->device,
+ c->busa, c->req->tc->size,
+ DMA_TO_DEVICE);
+}
+
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+ p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
+ event->event, context);
+}
+
+static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct p9_client *client = cq_context;
+ struct p9_trans_rdma *rdma = client->trans;
+ int ret;
+ struct ib_wc wc;
+
+ ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
+ while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
+ struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;
+
+ switch (c->wc_op) {
+ case IB_WC_RECV:
+ handle_recv(client, rdma, c, wc.status, wc.byte_len);
+ up(&rdma->rq_sem);
+ break;
+
+ case IB_WC_SEND:
+ handle_send(client, rdma, c, wc.status, wc.byte_len);
+ up(&rdma->sq_sem);
+ break;
+
+ default:
+ pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n",
+ c->wc_op, wc.opcode, wc.status);
+ break;
+ }
+ kfree(c);
+ }
+}
+
+static void cq_event_handler(struct ib_event *e, void *v)
+{
+ p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
+}
+
+static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
+{
+ if (!rdma)
+ return;
+
+ if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
+ ib_dereg_mr(rdma->dma_mr);
+
+ if (rdma->qp && !IS_ERR(rdma->qp))
+ ib_destroy_qp(rdma->qp);
+
+ if (rdma->pd && !IS_ERR(rdma->pd))
+ ib_dealloc_pd(rdma->pd);
+
+ if (rdma->cq && !IS_ERR(rdma->cq))
+ ib_destroy_cq(rdma->cq);
+
+ if (rdma->cm_id && !IS_ERR(rdma->cm_id))
+ rdma_destroy_id(rdma->cm_id);
+
+ kfree(rdma);
+}
+
+static int
+post_recv(struct p9_client *client, struct p9_rdma_context *c)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ struct ib_recv_wr wr, *bad_wr;
+ struct ib_sge sge;
+
+ c->busa = ib_dma_map_single(rdma->cm_id->device,
+ c->rc->sdata, client->msize,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
+ goto error;
+
+ sge.addr = c->busa;
+ sge.length = client->msize;
+ sge.lkey = rdma->lkey;
+
+ wr.next = NULL;
+ c->wc_op = IB_WC_RECV;
+ wr.wr_id = (unsigned long) c;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+ return ib_post_recv(rdma->qp, &wr, &bad_wr);
+
+ error:
+ p9_debug(P9_DEBUG_ERROR, "EIO\n");
+ return -EIO;
+}
+
+static int rdma_request(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ struct ib_send_wr wr, *bad_wr;
+ struct ib_sge sge;
+ int err = 0;
+ unsigned long flags;
+ struct p9_rdma_context *c = NULL;
+ struct p9_rdma_context *rpl_context = NULL;
+
+ /* When an error occurs between posting the recv and the send,
+ * there will be a receive context posted without a pending request.
+ * Since there is no way to "un-post" it, we remember it and skip
+ * post_recv() for the next request.
+ * So here,
+ * see if we are this `next request' and need to absorb an excess rc.
+ * If yes, then drop and free our own, and do not recv_post().
+ **/
+ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
+ if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
+ /* Got one ! */
+ kfree(req->rc);
+ req->rc = NULL;
+ goto dont_need_post_recv;
+ } else {
+ /* We raced and lost. */
+ atomic_inc(&rdma->excess_rc);
+ }
+ }
+
+ /* Allocate an fcall for the reply */
+ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
+ if (!rpl_context) {
+ err = -ENOMEM;
+ goto recv_error;
+ }
+ rpl_context->rc = req->rc;
+
+ /*
+ * Post a receive buffer for this request. We need to ensure
+ * there is a reply buffer available for every outstanding
+ * request. A flushed request can result in no reply for an
+ * outstanding request, so we must keep a count to avoid
+ * overflowing the RQ.
+ */
+ if (down_interruptible(&rdma->rq_sem)) {
+ err = -EINTR;
+ goto recv_error;
+ }
+
+ err = post_recv(client, rpl_context);
+ if (err) {
+ p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n");
+ goto recv_error;
+ }
+ /* remove posted receive buffer from request structure */
+ req->rc = NULL;
+
+dont_need_post_recv:
+ /* Post the request */
+ c = kmalloc(sizeof *c, GFP_NOFS);
+ if (!c) {
+ err = -ENOMEM;
+ goto send_error;
+ }
+ c->req = req;
+
+ c->busa = ib_dma_map_single(rdma->cm_id->device,
+ c->req->tc->sdata, c->req->tc->size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+ err = -EIO;
+ goto send_error;
+ }
+
+ sge.addr = c->busa;
+ sge.length = c->req->tc->size;
+ sge.lkey = rdma->lkey;
+
+ wr.next = NULL;
+ c->wc_op = IB_WC_SEND;
+ wr.wr_id = (unsigned long) c;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+
+ if (down_interruptible(&rdma->sq_sem)) {
+ err = -EINTR;
+ goto send_error;
+ }
+
+ /* Mark request as `sent' *before* we actually send it,
+ * because doing if after could erase the REQ_STATUS_RCVD
+ * status in case of a very fast reply.
+ */
+ req->status = REQ_STATUS_SENT;
+ err = ib_post_send(rdma->qp, &wr, &bad_wr);
+ if (err)
+ goto send_error;
+
+ /* Success */
+ return 0;
+
+ /* Handle errors that happened during or while preparing the send: */
+ send_error:
+ req->status = REQ_STATUS_ERROR;
+ kfree(c);
+ p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
+
+ /* Ach.
+ * We did recv_post(), but not send. We have one recv_post in excess.
+ */
+ atomic_inc(&rdma->excess_rc);
+ return err;
+
+ /* Handle errors that happened during or while preparing post_recv(): */
+ recv_error:
+ kfree(rpl_context);
+ spin_lock_irqsave(&rdma->req_lock, flags);
+ if (rdma->state < P9_RDMA_CLOSING) {
+ rdma->state = P9_RDMA_CLOSING;
+ spin_unlock_irqrestore(&rdma->req_lock, flags);
+ rdma_disconnect(rdma->cm_id);
+ } else
+ spin_unlock_irqrestore(&rdma->req_lock, flags);
+ return err;
+}
+
+static void rdma_close(struct p9_client *client)
+{
+ struct p9_trans_rdma *rdma;
+
+ if (!client)
+ return;
+
+ rdma = client->trans;
+ if (!rdma)
+ return;
+
+ client->status = Disconnected;
+ rdma_disconnect(rdma->cm_id);
+ rdma_destroy_trans(rdma);
+}
+
+/**
+ * alloc_rdma - Allocate and initialize the rdma transport structure
+ * @opts: Mount options structure
+ */
+static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
+{
+ struct p9_trans_rdma *rdma;
+
+ rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
+ if (!rdma)
+ return NULL;
+
+ rdma->sq_depth = opts->sq_depth;
+ rdma->rq_depth = opts->rq_depth;
+ rdma->timeout = opts->timeout;
+ spin_lock_init(&rdma->req_lock);
+ init_completion(&rdma->cm_done);
+ sema_init(&rdma->sq_sem, rdma->sq_depth);
+ sema_init(&rdma->rq_sem, rdma->rq_depth);
+ atomic_set(&rdma->excess_rc, 0);
+
+ return rdma;
+}
+
+static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ /* Nothing to do here.
+ * We will take care of it (if we have to) in rdma_cancelled()
+ */
+ return 1;
+}
+
+/* A request has been fully flushed without a reply.
+ * That means we have posted one buffer in excess.
+ */
+static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ struct p9_trans_rdma *rdma = client->trans;
+ atomic_inc(&rdma->excess_rc);
+ return 0;
+}
+
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+ struct sockaddr_in cl = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ };
+ int port, err = -EINVAL;
+
+ for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+ cl.sin_port = htons((ushort)port);
+ err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+ if (err != -EADDRINUSE)
+ break;
+ }
+ return err;
+}
+
+/**
+ * trans_create_rdma - Transport method for creating atransport instance
+ * @client: client instance
+ * @addr: IP address string
+ * @args: Mount options string
+ */
+static int
+rdma_create_trans(struct p9_client *client, const char *addr, char *args)
+{
+ int err;
+ struct p9_rdma_opts opts;
+ struct p9_trans_rdma *rdma;
+ struct rdma_conn_param conn_param;
+ struct ib_qp_init_attr qp_attr;
+ struct ib_device_attr devattr;
+
+ /* Parse the transport specific mount options */
+ err = parse_opts(args, &opts);
+ if (err < 0)
+ return err;
+
+ /* Create and initialize the RDMA transport structure */
+ rdma = alloc_rdma(&opts);
+ if (!rdma)
+ return -ENOMEM;
+
+ /* Create the RDMA CM ID */
+ rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(rdma->cm_id))
+ goto error;
+
+ /* Associate the client with the transport */
+ client->trans = rdma;
+
+ /* Bind to a privileged port if we need to */
+ if (opts.privport) {
+ err = p9_rdma_bind_privport(rdma);
+ if (err < 0) {
+ pr_err("%s (%d): problem binding to privport: %d\n",
+ __func__, task_pid_nr(current), -err);
+ goto error;
+ }
+ }
+
+ /* Resolve the server's address */
+ rdma->addr.sin_family = AF_INET;
+ rdma->addr.sin_addr.s_addr = in_aton(addr);
+ rdma->addr.sin_port = htons(opts.port);
+ err = rdma_resolve_addr(rdma->cm_id, NULL,
+ (struct sockaddr *)&rdma->addr,
+ rdma->timeout);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
+ goto error;
+
+ /* Resolve the route to the server */
+ err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
+ goto error;
+
+ /* Query the device attributes */
+ err = ib_query_device(rdma->cm_id->device, &devattr);
+ if (err)
+ goto error;
+
+ /* Create the Completion Queue */
+ rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
+ cq_event_handler, client,
+ opts.sq_depth + opts.rq_depth + 1, 0);
+ if (IS_ERR(rdma->cq))
+ goto error;
+ ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
+
+ /* Create the Protection Domain */
+ rdma->pd = ib_alloc_pd(rdma->cm_id->device);
+ if (IS_ERR(rdma->pd))
+ goto error;
+
+ /* Cache the DMA lkey in the transport */
+ rdma->dma_mr = NULL;
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ rdma->lkey = rdma->cm_id->device->local_dma_lkey;
+ else {
+ rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rdma->dma_mr))
+ goto error;
+ rdma->lkey = rdma->dma_mr->lkey;
+ }
+
+ /* Create the Queue Pair */
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.event_handler = qp_event_handler;
+ qp_attr.qp_context = client;
+ qp_attr.cap.max_send_wr = opts.sq_depth;
+ qp_attr.cap.max_recv_wr = opts.rq_depth;
+ qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
+ qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = rdma->cq;
+ qp_attr.recv_cq = rdma->cq;
+ err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
+ if (err)
+ goto error;
+ rdma->qp = rdma->cm_id->qp;
+
+ /* Request a connection */
+ memset(&conn_param, 0, sizeof(conn_param));
+ conn_param.private_data = NULL;
+ conn_param.private_data_len = 0;
+ conn_param.responder_resources = P9_RDMA_IRD;
+ conn_param.initiator_depth = P9_RDMA_ORD;
+ err = rdma_connect(rdma->cm_id, &conn_param);
+ if (err)
+ goto error;
+ err = wait_for_completion_interruptible(&rdma->cm_done);
+ if (err || (rdma->state != P9_RDMA_CONNECTED))
+ goto error;
+
+ client->status = Connected;
+
+ return 0;
+
+error:
+ rdma_destroy_trans(rdma);
+ return -ENOTCONN;
+}
+
+static struct p9_trans_module p9_rdma_trans = {
+ .name = "rdma",
+ .maxsize = P9_RDMA_MAXSIZE,
+ .def = 0,
+ .owner = THIS_MODULE,
+ .create = rdma_create_trans,
+ .close = rdma_close,
+ .request = rdma_request,
+ .cancel = rdma_cancel,
+ .cancelled = rdma_cancelled,
+};
+
+/**
+ * p9_trans_rdma_init - Register the 9P RDMA transport driver
+ */
+static int __init p9_trans_rdma_init(void)
+{
+ v9fs_register_trans(&p9_rdma_trans);
+ return 0;
+}
+
+static void __exit p9_trans_rdma_exit(void)
+{
+ v9fs_unregister_trans(&p9_rdma_trans);
+}
+
+module_init(p9_trans_rdma_init);
+module_exit(p9_trans_rdma_exit);
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("RDMA Transport for 9P");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
new file mode 100644
index 000000000..9dd49ca67
--- /dev/null
+++ b/net/9p/trans_virtio.c
@@ -0,0 +1,776 @@
+/*
+ * The Virtio 9p transport driver
+ *
+ * This is a block based transport driver based on the lguest block driver
+ * code.
+ *
+ * Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation
+ *
+ * Based on virtio console driver
+ * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/virtio.h>
+#include <linux/virtio_9p.h>
+#include "trans_common.h"
+
+#define VIRTQUEUE_NUM 128
+
+/* a single mutex to manage channel initialization and attachment */
+static DEFINE_MUTEX(virtio_9p_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
+static atomic_t vp_pinned = ATOMIC_INIT(0);
+
+/**
+ * struct virtio_chan - per-instance transport information
+ * @initialized: whether the channel is initialized
+ * @inuse: whether the channel is in use
+ * @lock: protects multiple elements within this structure
+ * @client: client instance
+ * @vdev: virtio dev associated with this channel
+ * @vq: virtio queue associated with this channel
+ * @sg: scatter gather list which is used to pack a request (protected?)
+ *
+ * We keep all per-channel information in a structure.
+ * This structure is allocated within the devices dev->mem space.
+ * A pointer to the structure will get put in the transport private.
+ *
+ */
+
+struct virtio_chan {
+ bool inuse;
+
+ spinlock_t lock;
+
+ struct p9_client *client;
+ struct virtio_device *vdev;
+ struct virtqueue *vq;
+ int ring_bufs_avail;
+ wait_queue_head_t *vc_wq;
+ /* This is global limit. Since we don't have a global structure,
+ * will be placing it in each channel.
+ */
+ unsigned long p9_max_pages;
+ /* Scatterlist: can be too big for stack. */
+ struct scatterlist sg[VIRTQUEUE_NUM];
+
+ int tag_len;
+ /*
+ * tag name to identify a mount Non-null terminated
+ */
+ char *tag;
+
+ struct list_head chan_list;
+};
+
+static struct list_head virtio_chan_list;
+
+/* How many bytes left in this page. */
+static unsigned int rest_of_page(void *data)
+{
+ return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
+}
+
+/**
+ * p9_virtio_close - reclaim resources of a channel
+ * @client: client instance
+ *
+ * This reclaims a channel by freeing its resources and
+ * reseting its inuse flag.
+ *
+ */
+
+static void p9_virtio_close(struct p9_client *client)
+{
+ struct virtio_chan *chan = client->trans;
+
+ mutex_lock(&virtio_9p_lock);
+ if (chan)
+ chan->inuse = false;
+ mutex_unlock(&virtio_9p_lock);
+}
+
+/**
+ * req_done - callback which signals activity from the server
+ * @vq: virtio queue activity was received on
+ *
+ * This notifies us that the server has triggered some activity
+ * on the virtio channel - most likely a response to request we
+ * sent. Figure out which requests now have responses and wake up
+ * those threads.
+ *
+ * Bugs: could do with some additional sanity checking, but appears to work.
+ *
+ */
+
+static void req_done(struct virtqueue *vq)
+{
+ struct virtio_chan *chan = vq->vdev->priv;
+ struct p9_fcall *rc;
+ unsigned int len;
+ struct p9_req_t *req;
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_TRANS, ": request done\n");
+
+ while (1) {
+ spin_lock_irqsave(&chan->lock, flags);
+ rc = virtqueue_get_buf(chan->vq, &len);
+ if (rc == NULL) {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ break;
+ }
+ chan->ring_bufs_avail = 1;
+ spin_unlock_irqrestore(&chan->lock, flags);
+ /* Wakeup if anyone waiting for VirtIO ring space. */
+ wake_up(chan->vc_wq);
+ p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc);
+ p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag);
+ req = p9_tag_lookup(chan->client, rc->tag);
+ p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
+ }
+}
+
+/**
+ * pack_sg_list - pack a scatter gather list from a linear buffer
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: maximum segment to pack data to
+ * @data: data to pack into scatter/gather list
+ * @count: amount of data to pack into the scatter/gather list
+ *
+ * sg_lists have multiple segments of various sizes. This will pack
+ * arbitrary data into an existing scatter gather list, segmenting the
+ * data as necessary within constraints.
+ *
+ */
+
+static int pack_sg_list(struct scatterlist *sg, int start,
+ int limit, char *data, int count)
+{
+ int s;
+ int index = start;
+
+ while (count) {
+ s = rest_of_page(data);
+ if (s > count)
+ s = count;
+ BUG_ON(index > limit);
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
+ sg_set_buf(&sg[index++], data, s);
+ count -= s;
+ data += s;
+ }
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
+ return index-start;
+}
+
+/* We don't currently allow canceling of virtio requests */
+static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+ return 1;
+}
+
+/**
+ * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
+ * this takes a list of pages.
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @pdata: a list of pages to add into sg.
+ * @nr_pages: number of pages to pack into the scatter/gather list
+ * @offs: amount of data in the beginning of first page _not_ to pack
+ * @count: amount of data to pack into the scatter/gather list
+ */
+static int
+pack_sg_list_p(struct scatterlist *sg, int start, int limit,
+ struct page **pdata, int nr_pages, size_t offs, int count)
+{
+ int i = 0, s;
+ int data_off = offs;
+ int index = start;
+
+ BUG_ON(nr_pages > (limit - start));
+ /*
+ * if the first page doesn't start at
+ * page boundary find the offset
+ */
+ while (nr_pages) {
+ s = PAGE_SIZE - data_off;
+ if (s > count)
+ s = count;
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
+ sg_set_page(&sg[index++], pdata[i++], s, data_off);
+ data_off = 0;
+ count -= s;
+ nr_pages--;
+ }
+
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
+ return index - start;
+}
+
+/**
+ * p9_virtio_request - issue a request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ *
+ */
+
+static int
+p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
+{
+ int err;
+ int in, out, out_sgs, in_sgs;
+ unsigned long flags;
+ struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[2];
+
+ p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
+
+ req->status = REQ_STATUS_SENT;
+req_retry:
+ spin_lock_irqsave(&chan->lock, flags);
+
+ out_sgs = in_sgs = 0;
+ /* Handle out VirtIO ring buffers */
+ out = pack_sg_list(chan->sg, 0,
+ VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
+ if (out)
+ sgs[out_sgs++] = chan->sg;
+
+ in = pack_sg_list(chan->sg, out,
+ VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
+ GFP_ATOMIC);
+ if (err < 0) {
+ if (err == -ENOSPC) {
+ chan->ring_bufs_avail = 0;
+ spin_unlock_irqrestore(&chan->lock, flags);
+ err = wait_event_interruptible(*chan->vc_wq,
+ chan->ring_bufs_avail);
+ if (err == -ERESTARTSYS)
+ return err;
+
+ p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+ goto req_retry;
+ } else {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ p9_debug(P9_DEBUG_TRANS,
+ "virtio rpc add_sgs returned failure\n");
+ return -EIO;
+ }
+ }
+ virtqueue_kick(chan->vq);
+ spin_unlock_irqrestore(&chan->lock, flags);
+
+ p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+ return 0;
+}
+
+static int p9_get_mapped_pages(struct virtio_chan *chan,
+ struct page ***pages,
+ struct iov_iter *data,
+ int count,
+ size_t *offs,
+ int *need_drop)
+{
+ int nr_pages;
+ int err;
+
+ if (!iov_iter_count(data))
+ return 0;
+
+ if (!(data->type & ITER_KVEC)) {
+ int n;
+ /*
+ * We allow only p9_max_pages pinned. We wait for the
+ * Other zc request to finish here
+ */
+ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
+ err = wait_event_interruptible(vp_wq,
+ (atomic_read(&vp_pinned) < chan->p9_max_pages));
+ if (err == -ERESTARTSYS)
+ return err;
+ }
+ n = iov_iter_get_pages_alloc(data, pages, count, offs);
+ if (n < 0)
+ return n;
+ *need_drop = 1;
+ nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
+ atomic_add(nr_pages, &vp_pinned);
+ return n;
+ } else {
+ /* kernel buffer, no need to pin pages */
+ int index;
+ size_t len;
+ void *p;
+
+ /* we'd already checked that it's non-empty */
+ while (1) {
+ len = iov_iter_single_seg_count(data);
+ if (likely(len)) {
+ p = data->kvec->iov_base + data->iov_offset;
+ break;
+ }
+ iov_iter_advance(data, 0);
+ }
+ if (len > count)
+ len = count;
+
+ nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
+ (unsigned long)p / PAGE_SIZE;
+
+ *pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ if (!*pages)
+ return -ENOMEM;
+
+ *need_drop = 0;
+ p -= (*offs = (unsigned long)p % PAGE_SIZE);
+ for (index = 0; index < nr_pages; index++) {
+ if (is_vmalloc_addr(p))
+ (*pages)[index] = vmalloc_to_page(p);
+ else
+ (*pages)[index] = kmap_to_page(p);
+ p += PAGE_SIZE;
+ }
+ return len;
+ }
+}
+
+/**
+ * p9_virtio_zc_request - issue a zero copy request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ * @uidata: user bffer that should be ued for zero copy read
+ * @uodata: user buffer that shoud be user for zero copy write
+ * @inlen: read buffer size
+ * @olen: write buffer size
+ * @hdrlen: reader header size, This is the size of response protocol data
+ *
+ */
+static int
+p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
+ struct iov_iter *uidata, struct iov_iter *uodata,
+ int inlen, int outlen, int in_hdr_len)
+{
+ int in, out, err, out_sgs, in_sgs;
+ unsigned long flags;
+ int in_nr_pages = 0, out_nr_pages = 0;
+ struct page **in_pages = NULL, **out_pages = NULL;
+ struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[4];
+ size_t offs;
+ int need_drop = 0;
+
+ p9_debug(P9_DEBUG_TRANS, "virtio request\n");
+
+ if (uodata) {
+ int n = p9_get_mapped_pages(chan, &out_pages, uodata,
+ outlen, &offs, &need_drop);
+ if (n < 0)
+ return n;
+ out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ if (n != outlen) {
+ __le32 v = cpu_to_le32(n);
+ memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
+ outlen = n;
+ }
+ } else if (uidata) {
+ int n = p9_get_mapped_pages(chan, &in_pages, uidata,
+ inlen, &offs, &need_drop);
+ if (n < 0)
+ return n;
+ in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ if (n != inlen) {
+ __le32 v = cpu_to_le32(n);
+ memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
+ inlen = n;
+ }
+ }
+ req->status = REQ_STATUS_SENT;
+req_retry_pinned:
+ spin_lock_irqsave(&chan->lock, flags);
+
+ out_sgs = in_sgs = 0;
+
+ /* out data */
+ out = pack_sg_list(chan->sg, 0,
+ VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
+
+ if (out)
+ sgs[out_sgs++] = chan->sg;
+
+ if (out_pages) {
+ sgs[out_sgs++] = chan->sg + out;
+ out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
+ out_pages, out_nr_pages, offs, outlen);
+ }
+
+ /*
+ * Take care of in data
+ * For example TREAD have 11.
+ * 11 is the read/write header = PDU Header(7) + IO Size (4).
+ * Arrange in such a way that server places header in the
+ * alloced memory and payload onto the user buffer.
+ */
+ in = pack_sg_list(chan->sg, out,
+ VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+ if (in_pages) {
+ sgs[out_sgs + in_sgs++] = chan->sg + out + in;
+ in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
+ in_pages, in_nr_pages, offs, inlen);
+ }
+
+ BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
+ GFP_ATOMIC);
+ if (err < 0) {
+ if (err == -ENOSPC) {
+ chan->ring_bufs_avail = 0;
+ spin_unlock_irqrestore(&chan->lock, flags);
+ err = wait_event_interruptible(*chan->vc_wq,
+ chan->ring_bufs_avail);
+ if (err == -ERESTARTSYS)
+ goto err_out;
+
+ p9_debug(P9_DEBUG_TRANS, "Retry virtio request\n");
+ goto req_retry_pinned;
+ } else {
+ spin_unlock_irqrestore(&chan->lock, flags);
+ p9_debug(P9_DEBUG_TRANS,
+ "virtio rpc add_sgs returned failure\n");
+ err = -EIO;
+ goto err_out;
+ }
+ }
+ virtqueue_kick(chan->vq);
+ spin_unlock_irqrestore(&chan->lock, flags);
+ p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
+ err = wait_event_interruptible(*req->wq,
+ req->status >= REQ_STATUS_RCVD);
+ /*
+ * Non kernel buffers are pinned, unpin them
+ */
+err_out:
+ if (need_drop) {
+ if (in_pages) {
+ p9_release_pages(in_pages, in_nr_pages);
+ atomic_sub(in_nr_pages, &vp_pinned);
+ }
+ if (out_pages) {
+ p9_release_pages(out_pages, out_nr_pages);
+ atomic_sub(out_nr_pages, &vp_pinned);
+ }
+ /* wakeup anybody waiting for slots to pin pages */
+ wake_up(&vp_wq);
+ }
+ kfree(in_pages);
+ kfree(out_pages);
+ return err;
+}
+
+static ssize_t p9_mount_tag_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct virtio_chan *chan;
+ struct virtio_device *vdev;
+
+ vdev = dev_to_virtio(dev);
+ chan = vdev->priv;
+
+ memcpy(buf, chan->tag, chan->tag_len);
+ buf[chan->tag_len] = 0;
+
+ return chan->tag_len + 1;
+}
+
+static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
+
+/**
+ * p9_virtio_probe - probe for existence of 9P virtio channels
+ * @vdev: virtio device to probe
+ *
+ * This probes for existing virtio channels.
+ *
+ */
+
+static int p9_virtio_probe(struct virtio_device *vdev)
+{
+ __u16 tag_len;
+ char *tag;
+ int err;
+ struct virtio_chan *chan;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
+ if (!chan) {
+ pr_err("Failed to allocate virtio 9P channel\n");
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ chan->vdev = vdev;
+
+ /* We expect one virtqueue, for requests. */
+ chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
+ if (IS_ERR(chan->vq)) {
+ err = PTR_ERR(chan->vq);
+ goto out_free_vq;
+ }
+ chan->vq->vdev->priv = chan;
+ spin_lock_init(&chan->lock);
+
+ sg_init_table(chan->sg, VIRTQUEUE_NUM);
+
+ chan->inuse = false;
+ if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) {
+ virtio_cread(vdev, struct virtio_9p_config, tag_len, &tag_len);
+ } else {
+ err = -EINVAL;
+ goto out_free_vq;
+ }
+ tag = kmalloc(tag_len, GFP_KERNEL);
+ if (!tag) {
+ err = -ENOMEM;
+ goto out_free_vq;
+ }
+
+ virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
+ tag, tag_len);
+ chan->tag = tag;
+ chan->tag_len = tag_len;
+ err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+ if (err) {
+ goto out_free_tag;
+ }
+ chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+ if (!chan->vc_wq) {
+ err = -ENOMEM;
+ goto out_free_tag;
+ }
+ init_waitqueue_head(chan->vc_wq);
+ chan->ring_bufs_avail = 1;
+ /* Ceiling limit to avoid denial of service attacks */
+ chan->p9_max_pages = nr_free_buffer_pages()/4;
+
+ virtio_device_ready(vdev);
+
+ mutex_lock(&virtio_9p_lock);
+ list_add_tail(&chan->chan_list, &virtio_chan_list);
+ mutex_unlock(&virtio_9p_lock);
+
+ /* Let udev rules use the new mount_tag attribute. */
+ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+
+ return 0;
+
+out_free_tag:
+ kfree(tag);
+out_free_vq:
+ vdev->config->del_vqs(vdev);
+ kfree(chan);
+fail:
+ return err;
+}
+
+
+/**
+ * p9_virtio_create - allocate a new virtio channel
+ * @client: client instance invoking this transport
+ * @devname: string identifying the channel to connect to (unused)
+ * @args: args passed from sys_mount() for per-transport options (unused)
+ *
+ * This sets up a transport channel for 9p communication. Right now
+ * we only match the first available channel, but eventually we couldlook up
+ * alternate channels by matching devname versus a virtio_config entry.
+ * We use a simple reference count mechanism to ensure that only a single
+ * mount has a channel open at a time.
+ *
+ */
+
+static int
+p9_virtio_create(struct p9_client *client, const char *devname, char *args)
+{
+ struct virtio_chan *chan;
+ int ret = -ENOENT;
+ int found = 0;
+
+ mutex_lock(&virtio_9p_lock);
+ list_for_each_entry(chan, &virtio_chan_list, chan_list) {
+ if (!strncmp(devname, chan->tag, chan->tag_len) &&
+ strlen(devname) == chan->tag_len) {
+ if (!chan->inuse) {
+ chan->inuse = true;
+ found = 1;
+ break;
+ }
+ ret = -EBUSY;
+ }
+ }
+ mutex_unlock(&virtio_9p_lock);
+
+ if (!found) {
+ pr_err("no channels available\n");
+ return ret;
+ }
+
+ client->trans = (void *)chan;
+ client->status = Connected;
+ chan->client = client;
+
+ return 0;
+}
+
+/**
+ * p9_virtio_remove - clean up resources associated with a virtio device
+ * @vdev: virtio device to remove
+ *
+ */
+
+static void p9_virtio_remove(struct virtio_device *vdev)
+{
+ struct virtio_chan *chan = vdev->priv;
+ unsigned long warning_time;
+
+ mutex_lock(&virtio_9p_lock);
+
+ /* Remove self from list so we don't get new users. */
+ list_del(&chan->chan_list);
+ warning_time = jiffies;
+
+ /* Wait for existing users to close. */
+ while (chan->inuse) {
+ mutex_unlock(&virtio_9p_lock);
+ msleep(250);
+ if (time_after(jiffies, warning_time + 10 * HZ)) {
+ dev_emerg(&vdev->dev,
+ "p9_virtio_remove: waiting for device in use.\n");
+ warning_time = jiffies;
+ }
+ mutex_lock(&virtio_9p_lock);
+ }
+
+ mutex_unlock(&virtio_9p_lock);
+
+ vdev->config->del_vqs(vdev);
+
+ sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE);
+ kfree(chan->tag);
+ kfree(chan->vc_wq);
+ kfree(chan);
+
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+ VIRTIO_9P_MOUNT_TAG,
+};
+
+/* The standard "struct lguest_driver": */
+static struct virtio_driver p9_virtio_drv = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = p9_virtio_probe,
+ .remove = p9_virtio_remove,
+};
+
+static struct p9_trans_module p9_virtio_trans = {
+ .name = "virtio",
+ .create = p9_virtio_create,
+ .close = p9_virtio_close,
+ .request = p9_virtio_request,
+ .zc_request = p9_virtio_zc_request,
+ .cancel = p9_virtio_cancel,
+ /*
+ * We leave one entry for input and one entry for response
+ * headers. We also skip one more entry to accomodate, address
+ * that are not at page boundary, that can result in an extra
+ * page in zero copy.
+ */
+ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
+ .def = 1,
+ .owner = THIS_MODULE,
+};
+
+/* The standard init function */
+static int __init p9_virtio_init(void)
+{
+ INIT_LIST_HEAD(&virtio_chan_list);
+
+ v9fs_register_trans(&p9_virtio_trans);
+ return register_virtio_driver(&p9_virtio_drv);
+}
+
+static void __exit p9_virtio_cleanup(void)
+{
+ unregister_virtio_driver(&p9_virtio_drv);
+ v9fs_unregister_trans(&p9_virtio_trans);
+}
+
+module_init(p9_virtio_init);
+module_exit(p9_virtio_cleanup);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Virtio 9p Transport");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/util.c b/net/9p/util.c
new file mode 100644
index 000000000..59f278e64
--- /dev/null
+++ b/net/9p/util.c
@@ -0,0 +1,141 @@
+/*
+ * net/9p/util.c
+ *
+ * This file contains some helper functions
+ *
+ * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ * Free Software Foundation
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02111-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct p9_idpool - per-connection accounting for tag idpool
+ * @lock: protects the pool
+ * @pool: idr to allocate tag id from
+ *
+ */
+
+struct p9_idpool {
+ spinlock_t lock;
+ struct idr pool;
+};
+
+/**
+ * p9_idpool_create - create a new per-connection id pool
+ *
+ */
+
+struct p9_idpool *p9_idpool_create(void)
+{
+ struct p9_idpool *p;
+
+ p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&p->lock);
+ idr_init(&p->pool);
+
+ return p;
+}
+EXPORT_SYMBOL(p9_idpool_create);
+
+/**
+ * p9_idpool_destroy - create a new per-connection id pool
+ * @p: idpool to destroy
+ */
+
+void p9_idpool_destroy(struct p9_idpool *p)
+{
+ idr_destroy(&p->pool);
+ kfree(p);
+}
+EXPORT_SYMBOL(p9_idpool_destroy);
+
+/**
+ * p9_idpool_get - allocate numeric id from pool
+ * @p: pool to allocate from
+ *
+ * Bugs: This seems to be an awful generic function, should it be in idr.c with
+ * the lock included in struct idr?
+ */
+
+int p9_idpool_get(struct p9_idpool *p)
+{
+ int i;
+ unsigned long flags;
+
+ idr_preload(GFP_NOFS);
+ spin_lock_irqsave(&p->lock, flags);
+
+ /* no need to store exactly p, we just need something non-null */
+ i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&p->lock, flags);
+ idr_preload_end();
+ if (i < 0)
+ return -1;
+
+ p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p);
+ return i;
+}
+EXPORT_SYMBOL(p9_idpool_get);
+
+/**
+ * p9_idpool_put - release numeric id from pool
+ * @id: numeric id which is being released
+ * @p: pool to release id into
+ *
+ * Bugs: This seems to be an awful generic function, should it be in idr.c with
+ * the lock included in struct idr?
+ */
+
+void p9_idpool_put(int id, struct p9_idpool *p)
+{
+ unsigned long flags;
+
+ p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p);
+
+ spin_lock_irqsave(&p->lock, flags);
+ idr_remove(&p->pool, id);
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+EXPORT_SYMBOL(p9_idpool_put);
+
+/**
+ * p9_idpool_check - check if the specified id is available
+ * @id: id to check
+ * @p: pool to check
+ */
+
+int p9_idpool_check(int id, struct p9_idpool *p)
+{
+ return idr_find(&p->pool, id) != NULL;
+}
+EXPORT_SYMBOL(p9_idpool_check);
+
diff --git a/net/Kconfig b/net/Kconfig
new file mode 100644
index 000000000..44dd5786e
--- /dev/null
+++ b/net/Kconfig
@@ -0,0 +1,379 @@
+#
+# Network configuration
+#
+
+menuconfig NET
+ bool "Networking support"
+ select NLATTR
+ select GENERIC_NET_UTILS
+ select BPF
+ ---help---
+ Unless you really know what you are doing, you should say Y here.
+ The reason is that some programs need kernel networking support even
+ when running on a stand-alone machine that isn't connected to any
+ other computer.
+
+ If you are upgrading from an older kernel, you
+ should consider updating your networking tools too because changes
+ in the kernel and the tools often go hand in hand. The tools are
+ contained in the package net-tools, the location and version number
+ of which are given in <file:Documentation/Changes>.
+
+ For a general introduction to Linux networking, it is highly
+ recommended to read the NET-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+if NET
+
+config WANT_COMPAT_NETLINK_MESSAGES
+ bool
+ help
+ This option can be selected by other options that need compat
+ netlink messages.
+
+config COMPAT_NETLINK_MESSAGES
+ def_bool y
+ depends on COMPAT
+ depends on WEXT_CORE || WANT_COMPAT_NETLINK_MESSAGES
+ help
+ This option makes it possible to send different netlink messages
+ to tasks depending on whether the task is a compat task or not. To
+ achieve this, you need to set skb_shinfo(skb)->frag_list to the
+ compat skb before sending the skb, the netlink code will sort out
+ which message to actually pass to the task.
+
+ Newly written code should NEVER need this option but do
+ compat-independent messages instead!
+
+menu "Networking options"
+
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
+source "net/iucv/Kconfig"
+
+config INET
+ bool "TCP/IP networking"
+ select CRYPTO
+ select CRYPTO_AES
+ ---help---
+ These are the protocols used on the Internet and on most local
+ Ethernets. It is highly recommended to say Y here (this will enlarge
+ your kernel by about 400 KB), since some programs (e.g. the X window
+ system) use TCP/IP even if your machine is not connected to any
+ other computer. You will get the so-called loopback device which
+ allows you to ping yourself (great fun, that!).
+
+ For an excellent introduction to Linux networking, please read the
+ Linux Networking HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ If you say Y here and also to "/proc file system support" and
+ "Sysctl support" below, you can change various aspects of the
+ behavior of the TCP/IP code by writing to the (virtual) files in
+ /proc/sys/net/ipv4/*; the options are explained in the file
+ <file:Documentation/networking/ip-sysctl.txt>.
+
+ Short answer: say Y.
+
+if INET
+source "net/ipv4/Kconfig"
+source "net/ipv6/Kconfig"
+source "net/netlabel/Kconfig"
+
+endif # if INET
+
+config NETWORK_SECMARK
+ bool "Security Marking"
+ help
+ This enables security marking of network packets, similar
+ to nfmark, but designated for security purposes.
+ If you are unsure how to answer this question, answer N.
+
+config NET_PTP_CLASSIFY
+ def_bool n
+
+config NETWORK_PHY_TIMESTAMPING
+ bool "Timestamping in PHY devices"
+ select NET_PTP_CLASSIFY
+ help
+ This allows timestamping of network packets by PHYs with
+ hardware timestamping capabilities. This option adds some
+ overhead in the transmit and receive paths.
+
+ If you are unsure how to answer this question, answer N.
+
+menuconfig NETFILTER
+ bool "Network packet filtering framework (Netfilter)"
+ ---help---
+ Netfilter is a framework for filtering and mangling network packets
+ that pass through your Linux box.
+
+ The most common use of packet filtering is to run your Linux box as
+ a firewall protecting a local network from the Internet. The type of
+ firewall provided by this kernel support is called a "packet
+ filter", which means that it can reject individual network packets
+ based on type, source, destination etc. The other kind of firewall,
+ a "proxy-based" one, is more secure but more intrusive and more
+ bothersome to set up; it inspects the network traffic much more
+ closely, modifies it and has knowledge about the higher level
+ protocols, which a packet filter lacks. Moreover, proxy-based
+ firewalls often require changes to the programs running on the local
+ clients. Proxy-based firewalls don't need support by the kernel, but
+ they are often combined with a packet filter, which only works if
+ you say Y here.
+
+ You should also say Y here if you intend to use your Linux box as
+ the gateway to the Internet for a local network of machines without
+ globally valid IP addresses. This is called "masquerading": if one
+ of the computers on your local network wants to send something to
+ the outside, your box can "masquerade" as that computer, i.e. it
+ forwards the traffic to the intended outside destination, but
+ modifies the packets to make it look like they came from the
+ firewall box itself. It works both ways: if the outside host
+ replies, the Linux box will silently forward the traffic to the
+ correct local computer. This way, the computers on your local net
+ are completely invisible to the outside world, even though they can
+ reach the outside and can receive replies. It is even possible to
+ run globally visible servers from within a masqueraded local network
+ using a mechanism called portforwarding. Masquerading is also often
+ called NAT (Network Address Translation).
+
+ Another use of Netfilter is in transparent proxying: if a machine on
+ the local network tries to connect to an outside host, your Linux
+ box can transparently forward the traffic to a local server,
+ typically a caching proxy server.
+
+ Yet another use of Netfilter is building a bridging firewall. Using
+ a bridge with Network packet filtering enabled makes iptables "see"
+ the bridged traffic. For filtering on the lower network and Ethernet
+ protocols over the bridge, use ebtables (under bridge netfilter
+ configuration).
+
+ Various modules exist for netfilter which replace the previous
+ masquerading (ipmasqadm), packet filtering (ipchains), transparent
+ proxying, and portforwarding mechanisms. Please see
+ <file:Documentation/Changes> under "iptables" for the location of
+ these packages.
+
+if NETFILTER
+
+config NETFILTER_DEBUG
+ bool "Network packet filtering debugging"
+ depends on NETFILTER
+ help
+ You can say Y here if you want to get additional messages useful in
+ debugging the netfilter code.
+
+config NETFILTER_ADVANCED
+ bool "Advanced netfilter configuration"
+ depends on NETFILTER
+ default y
+ help
+ If you say Y here you can select between all the netfilter modules.
+ If you say N the more unusual ones will not be shown and the
+ basic ones needed by most people will default to 'M'.
+
+ If unsure, say Y.
+
+config BRIDGE_NETFILTER
+ tristate "Bridged IP/ARP packets filtering"
+ depends on BRIDGE
+ depends on NETFILTER && INET
+ depends on NETFILTER_ADVANCED
+ default m
+ ---help---
+ Enabling this option will let arptables resp. iptables see bridged
+ ARP resp. IP traffic. If you want a bridging firewall, you probably
+ want this option enabled.
+ Enabling or disabling this option doesn't enable or disable
+ ebtables.
+
+ If unsure, say N.
+
+source "net/netfilter/Kconfig"
+source "net/ipv4/netfilter/Kconfig"
+source "net/ipv6/netfilter/Kconfig"
+source "net/decnet/netfilter/Kconfig"
+source "net/bridge/netfilter/Kconfig"
+
+endif
+
+source "net/dccp/Kconfig"
+source "net/sctp/Kconfig"
+source "net/rds/Kconfig"
+source "net/tipc/Kconfig"
+source "net/atm/Kconfig"
+source "net/l2tp/Kconfig"
+source "net/802/Kconfig"
+source "net/bridge/Kconfig"
+source "net/dsa/Kconfig"
+source "net/8021q/Kconfig"
+source "net/decnet/Kconfig"
+source "net/llc/Kconfig"
+source "net/ipx/Kconfig"
+source "drivers/net/appletalk/Kconfig"
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
+source "net/phonet/Kconfig"
+source "net/6lowpan/Kconfig"
+source "net/ieee802154/Kconfig"
+source "net/mac802154/Kconfig"
+source "net/sched/Kconfig"
+source "net/dcb/Kconfig"
+source "net/dns_resolver/Kconfig"
+source "net/batman-adv/Kconfig"
+source "net/openvswitch/Kconfig"
+source "net/vmw_vsock/Kconfig"
+source "net/netlink/Kconfig"
+source "net/mpls/Kconfig"
+source "net/hsr/Kconfig"
+source "net/switchdev/Kconfig"
+
+config RPS
+ bool
+ depends on SMP && SYSFS
+ default y
+
+config RFS_ACCEL
+ bool
+ depends on RPS
+ select CPU_RMAP
+ default y
+
+config XPS
+ bool
+ depends on SMP
+ default y
+
+config CGROUP_NET_PRIO
+ bool "Network priority cgroup"
+ depends on CGROUPS
+ ---help---
+ Cgroup subsystem for use in assigning processes to network priorities on
+ a per-interface basis.
+
+config CGROUP_NET_CLASSID
+ bool "Network classid cgroup"
+ depends on CGROUPS
+ ---help---
+ Cgroup subsystem for use as general purpose socket classid marker that is
+ being used in cls_cgroup and for netfilter matching.
+
+config NET_RX_BUSY_POLL
+ bool
+ default y
+
+config BQL
+ bool
+ depends on SYSFS
+ select DQL
+ default y
+
+config BPF_JIT
+ bool "enable BPF Just In Time compiler"
+ depends on HAVE_BPF_JIT
+ depends on MODULES
+ ---help---
+ Berkeley Packet Filter filtering capabilities are normally handled
+ by an interpreter. This option allows kernel to generate a native
+ code when filter is loaded in memory. This should speedup
+ packet sniffing (libpcap/tcpdump). Note : Admin should enable
+ this feature changing /proc/sys/net/core/bpf_jit_enable
+
+config NET_FLOW_LIMIT
+ bool
+ depends on RPS
+ default y
+ ---help---
+ The network stack has to drop packets when a receive processing CPU's
+ backlog reaches netdev_max_backlog. If a few out of many active flows
+ generate the vast majority of load, drop their traffic earlier to
+ maintain capacity for the other flows. This feature provides servers
+ with many clients some protection against DoS by a single (spoofed)
+ flow that greatly exceeds average workload.
+
+menu "Network testing"
+
+config NET_PKTGEN
+ tristate "Packet Generator (USE WITH CAUTION)"
+ depends on INET && PROC_FS
+ ---help---
+ This module will inject preconfigured packets, at a configurable
+ rate, out of a given interface. It is used for network interface
+ stress testing and performance analysis. If you don't understand
+ what was just said, you don't need it: say N.
+
+ Documentation on how to use the packet generator can be found
+ at <file:Documentation/networking/pktgen.txt>.
+
+ To compile this code as a module, choose M here: the
+ module will be called pktgen.
+
+config NET_TCPPROBE
+ tristate "TCP connection probing"
+ depends on INET && PROC_FS && KPROBES
+ ---help---
+ This module allows for capturing the changes to TCP connection
+ state in response to incoming packets. It is used for debugging
+ TCP congestion avoidance modules. If you don't understand
+ what was just said, you don't need it: say N.
+
+ Documentation on how to use TCP connection probing can be found
+ at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
+
+ To compile this code as a module, choose M here: the
+ module will be called tcp_probe.
+
+config NET_DROP_MONITOR
+ tristate "Network packet drop alerting service"
+ depends on INET && TRACEPOINTS
+ ---help---
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. If you don't need network drop alerts, or if you are ok
+ just checking the various proc files and other utilities for
+ drop statistics, say N here.
+
+endmenu
+
+endmenu
+
+source "net/ax25/Kconfig"
+source "net/can/Kconfig"
+source "net/irda/Kconfig"
+source "net/bluetooth/Kconfig"
+source "net/rxrpc/Kconfig"
+
+config FIB_RULES
+ bool
+
+menuconfig WIRELESS
+ bool "Wireless"
+ depends on !S390
+ default y
+
+if WIRELESS
+
+source "net/wireless/Kconfig"
+source "net/mac80211/Kconfig"
+
+endif # WIRELESS
+
+source "net/wimax/Kconfig"
+
+source "net/rfkill/Kconfig"
+source "net/9p/Kconfig"
+source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
+source "net/nfc/Kconfig"
+
+
+endif # if NET
+
+# Used by archs to tell that they support BPF_JIT
+config HAVE_BPF_JIT
+ bool
diff --git a/net/Makefile b/net/Makefile
new file mode 100644
index 000000000..3995613e5
--- /dev/null
+++ b/net/Makefile
@@ -0,0 +1,76 @@
+#
+# Makefile for the linux networking.
+#
+# 2 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-$(CONFIG_NET) := socket.o core/
+
+tmp-$(CONFIG_COMPAT) := compat.o
+obj-$(CONFIG_NET) += $(tmp-y)
+
+# LLC has to be linked before the files in net/802/
+obj-$(CONFIG_LLC) += llc/
+obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
+obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_INET) += ipv4/
+obj-$(CONFIG_XFRM) += xfrm/
+obj-$(CONFIG_UNIX) += unix/
+obj-$(CONFIG_NET) += ipv6/
+obj-$(CONFIG_PACKET) += packet/
+obj-$(CONFIG_NET_KEY) += key/
+obj-$(CONFIG_BRIDGE) += bridge/
+obj-$(CONFIG_NET_DSA) += dsa/
+obj-$(CONFIG_IPX) += ipx/
+obj-$(CONFIG_ATALK) += appletalk/
+obj-$(CONFIG_X25) += x25/
+obj-$(CONFIG_LAPB) += lapb/
+obj-$(CONFIG_NETROM) += netrom/
+obj-$(CONFIG_ROSE) += rose/
+obj-$(CONFIG_AX25) += ax25/
+obj-$(CONFIG_CAN) += can/
+obj-$(CONFIG_IRDA) += irda/
+obj-$(CONFIG_BT) += bluetooth/
+obj-$(CONFIG_SUNRPC) += sunrpc/
+obj-$(CONFIG_AF_RXRPC) += rxrpc/
+obj-$(CONFIG_ATM) += atm/
+obj-$(CONFIG_L2TP) += l2tp/
+obj-$(CONFIG_DECNET) += decnet/
+obj-$(CONFIG_PHONET) += phonet/
+ifneq ($(CONFIG_VLAN_8021Q),)
+obj-y += 8021q/
+endif
+obj-$(CONFIG_IP_DCCP) += dccp/
+obj-$(CONFIG_IP_SCTP) += sctp/
+obj-$(CONFIG_RDS) += rds/
+obj-$(CONFIG_WIRELESS) += wireless/
+obj-$(CONFIG_MAC80211) += mac80211/
+obj-$(CONFIG_TIPC) += tipc/
+obj-$(CONFIG_NETLABEL) += netlabel/
+obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_RFKILL) += rfkill/
+obj-$(CONFIG_NET_9P) += 9p/
+obj-$(CONFIG_CAIF) += caif/
+ifneq ($(CONFIG_DCB),)
+obj-y += dcb/
+endif
+obj-$(CONFIG_6LOWPAN) += 6lowpan/
+obj-$(CONFIG_IEEE802154) += ieee802154/
+obj-$(CONFIG_MAC802154) += mac802154/
+
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_SYSCTL) += sysctl_net.o
+endif
+obj-$(CONFIG_WIMAX) += wimax/
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
+obj-$(CONFIG_CEPH_LIB) += ceph/
+obj-$(CONFIG_BATMAN_ADV) += batman-adv/
+obj-$(CONFIG_NFC) += nfc/
+obj-$(CONFIG_OPENVSWITCH) += openvswitch/
+obj-$(CONFIG_VSOCKETS) += vmw_vsock/
+obj-$(CONFIG_MPLS) += mpls/
+obj-$(CONFIG_HSR) += hsr/
+ifneq ($(CONFIG_NET_SWITCHDEV),)
+obj-y += switchdev/
+endif
diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile
new file mode 100644
index 000000000..5cda56ede
--- /dev/null
+++ b/net/appletalk/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux AppleTalk layer.
+#
+
+obj-$(CONFIG_ATALK) += appletalk.o
+
+appletalk-y := aarp.o ddp.o dev.o
+appletalk-$(CONFIG_PROC_FS) += atalk_proc.o
+appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
new file mode 100644
index 000000000..8ad3ec261
--- /dev/null
+++ b/net/appletalk/aarp.c
@@ -0,0 +1,1065 @@
+/*
+ * AARP: An implementation of the AppleTalk AARP protocol for
+ * Ethernet 'ELAP'.
+ *
+ * Alan Cox <Alan.Cox@linux.org>
+ *
+ * This doesn't fit cleanly with the IP arp. Potentially we can use
+ * the generic neighbour discovery code to clean this up.
+ *
+ * FIXME:
+ * We ought to handle the retransmits with a single list and a
+ * separate fast timer for when it is needed.
+ * Use neighbour discovery code.
+ * Token Ring Support.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * References:
+ * Inside AppleTalk (2nd Ed).
+ * Fixes:
+ * Jaume Grau - flush caches on AARP_PROBE
+ * Rob Newberry - Added proxy AARP and AARP proc fs,
+ * moved probing from DDP module.
+ * Arnaldo C. Melo - don't mangle rx packets
+ *
+ */
+
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <linux/atalk.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+
+int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME;
+int sysctl_aarp_tick_time = AARP_TICK_TIME;
+int sysctl_aarp_retransmit_limit = AARP_RETRANSMIT_LIMIT;
+int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME;
+
+/* Lists of aarp entries */
+/**
+ * struct aarp_entry - AARP entry
+ * @last_sent - Last time we xmitted the aarp request
+ * @packet_queue - Queue of frames wait for resolution
+ * @status - Used for proxy AARP
+ * expires_at - Entry expiry time
+ * target_addr - DDP Address
+ * dev - Device to use
+ * hwaddr - Physical i/f address of target/router
+ * xmit_count - When this hits 10 we give up
+ * next - Next entry in chain
+ */
+struct aarp_entry {
+ /* These first two are only used for unresolved entries */
+ unsigned long last_sent;
+ struct sk_buff_head packet_queue;
+ int status;
+ unsigned long expires_at;
+ struct atalk_addr target_addr;
+ struct net_device *dev;
+ char hwaddr[ETH_ALEN];
+ unsigned short xmit_count;
+ struct aarp_entry *next;
+};
+
+/* Hashed list of resolved, unresolved and proxy entries */
+static struct aarp_entry *resolved[AARP_HASH_SIZE];
+static struct aarp_entry *unresolved[AARP_HASH_SIZE];
+static struct aarp_entry *proxies[AARP_HASH_SIZE];
+static int unresolved_count;
+
+/* One lock protects it all. */
+static DEFINE_RWLOCK(aarp_lock);
+
+/* Used to walk the list and purge/kick entries. */
+static struct timer_list aarp_timer;
+
+/*
+ * Delete an aarp queue
+ *
+ * Must run under aarp_lock.
+ */
+static void __aarp_expire(struct aarp_entry *a)
+{
+ skb_queue_purge(&a->packet_queue);
+ kfree(a);
+}
+
+/*
+ * Send an aarp queue entry request
+ *
+ * Must run under aarp_lock.
+ */
+static void __aarp_send_query(struct aarp_entry *a)
+{
+ static unsigned char aarp_eth_multicast[ETH_ALEN] =
+ { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+ struct net_device *dev = a->dev;
+ struct elapaarp *eah;
+ int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+ struct atalk_addr *sat = atalk_find_dev_addr(dev);
+
+ if (!skb)
+ return;
+
+ if (!sat) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Set up the buffer */
+ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
+ skb->protocol = htons(ETH_P_ATALK);
+ skb->dev = dev;
+ eah = aarp_hdr(skb);
+
+ /* Set up the ARP */
+ eah->hw_type = htons(AARP_HW_TYPE_ETHERNET);
+ eah->pa_type = htons(ETH_P_ATALK);
+ eah->hw_len = ETH_ALEN;
+ eah->pa_len = AARP_PA_ALEN;
+ eah->function = htons(AARP_REQUEST);
+
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
+
+ eah->pa_src_zero = 0;
+ eah->pa_src_net = sat->s_net;
+ eah->pa_src_node = sat->s_node;
+
+ eth_zero_addr(eah->hw_dst);
+
+ eah->pa_dst_zero = 0;
+ eah->pa_dst_net = a->target_addr.s_net;
+ eah->pa_dst_node = a->target_addr.s_node;
+
+ /* Send it */
+ aarp_dl->request(aarp_dl, skb, aarp_eth_multicast);
+ /* Update the sending count */
+ a->xmit_count++;
+ a->last_sent = jiffies;
+}
+
+/* This runs under aarp_lock and in softint context, so only atomic memory
+ * allocations can be used. */
+static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us,
+ struct atalk_addr *them, unsigned char *sha)
+{
+ struct elapaarp *eah;
+ int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ /* Set up the buffer */
+ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
+ skb->protocol = htons(ETH_P_ATALK);
+ skb->dev = dev;
+ eah = aarp_hdr(skb);
+
+ /* Set up the ARP */
+ eah->hw_type = htons(AARP_HW_TYPE_ETHERNET);
+ eah->pa_type = htons(ETH_P_ATALK);
+ eah->hw_len = ETH_ALEN;
+ eah->pa_len = AARP_PA_ALEN;
+ eah->function = htons(AARP_REPLY);
+
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
+
+ eah->pa_src_zero = 0;
+ eah->pa_src_net = us->s_net;
+ eah->pa_src_node = us->s_node;
+
+ if (!sha)
+ eth_zero_addr(eah->hw_dst);
+ else
+ ether_addr_copy(eah->hw_dst, sha);
+
+ eah->pa_dst_zero = 0;
+ eah->pa_dst_net = them->s_net;
+ eah->pa_dst_node = them->s_node;
+
+ /* Send it */
+ aarp_dl->request(aarp_dl, skb, sha);
+}
+
+/*
+ * Send probe frames. Called from aarp_probe_network and
+ * aarp_proxy_probe_network.
+ */
+
+static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us)
+{
+ struct elapaarp *eah;
+ int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+ static unsigned char aarp_eth_multicast[ETH_ALEN] =
+ { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+
+ if (!skb)
+ return;
+
+ /* Set up the buffer */
+ skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(*eah));
+ skb->protocol = htons(ETH_P_ATALK);
+ skb->dev = dev;
+ eah = aarp_hdr(skb);
+
+ /* Set up the ARP */
+ eah->hw_type = htons(AARP_HW_TYPE_ETHERNET);
+ eah->pa_type = htons(ETH_P_ATALK);
+ eah->hw_len = ETH_ALEN;
+ eah->pa_len = AARP_PA_ALEN;
+ eah->function = htons(AARP_PROBE);
+
+ ether_addr_copy(eah->hw_src, dev->dev_addr);
+
+ eah->pa_src_zero = 0;
+ eah->pa_src_net = us->s_net;
+ eah->pa_src_node = us->s_node;
+
+ eth_zero_addr(eah->hw_dst);
+
+ eah->pa_dst_zero = 0;
+ eah->pa_dst_net = us->s_net;
+ eah->pa_dst_node = us->s_node;
+
+ /* Send it */
+ aarp_dl->request(aarp_dl, skb, aarp_eth_multicast);
+}
+
+/*
+ * Handle an aarp timer expire
+ *
+ * Must run under the aarp_lock.
+ */
+
+static void __aarp_expire_timer(struct aarp_entry **n)
+{
+ struct aarp_entry *t;
+
+ while (*n)
+ /* Expired ? */
+ if (time_after(jiffies, (*n)->expires_at)) {
+ t = *n;
+ *n = (*n)->next;
+ __aarp_expire(t);
+ } else
+ n = &((*n)->next);
+}
+
+/*
+ * Kick all pending requests 5 times a second.
+ *
+ * Must run under the aarp_lock.
+ */
+static void __aarp_kick(struct aarp_entry **n)
+{
+ struct aarp_entry *t;
+
+ while (*n)
+ /* Expired: if this will be the 11th tx, we delete instead. */
+ if ((*n)->xmit_count >= sysctl_aarp_retransmit_limit) {
+ t = *n;
+ *n = (*n)->next;
+ __aarp_expire(t);
+ } else {
+ __aarp_send_query(*n);
+ n = &((*n)->next);
+ }
+}
+
+/*
+ * A device has gone down. Take all entries referring to the device
+ * and remove them.
+ *
+ * Must run under the aarp_lock.
+ */
+static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev)
+{
+ struct aarp_entry *t;
+
+ while (*n)
+ if ((*n)->dev == dev) {
+ t = *n;
+ *n = (*n)->next;
+ __aarp_expire(t);
+ } else
+ n = &((*n)->next);
+}
+
+/* Handle the timer event */
+static void aarp_expire_timeout(unsigned long unused)
+{
+ int ct;
+
+ write_lock_bh(&aarp_lock);
+
+ for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+ __aarp_expire_timer(&resolved[ct]);
+ __aarp_kick(&unresolved[ct]);
+ __aarp_expire_timer(&unresolved[ct]);
+ __aarp_expire_timer(&proxies[ct]);
+ }
+
+ write_unlock_bh(&aarp_lock);
+ mod_timer(&aarp_timer, jiffies +
+ (unresolved_count ? sysctl_aarp_tick_time :
+ sysctl_aarp_expiry_time));
+}
+
+/* Network device notifier chain handler. */
+static int aarp_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ int ct;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_DOWN) {
+ write_lock_bh(&aarp_lock);
+
+ for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+ __aarp_expire_device(&resolved[ct], dev);
+ __aarp_expire_device(&unresolved[ct], dev);
+ __aarp_expire_device(&proxies[ct], dev);
+ }
+
+ write_unlock_bh(&aarp_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+/* Expire all entries in a hash chain */
+static void __aarp_expire_all(struct aarp_entry **n)
+{
+ struct aarp_entry *t;
+
+ while (*n) {
+ t = *n;
+ *n = (*n)->next;
+ __aarp_expire(t);
+ }
+}
+
+/* Cleanup all hash chains -- module unloading */
+static void aarp_purge(void)
+{
+ int ct;
+
+ write_lock_bh(&aarp_lock);
+ for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+ __aarp_expire_all(&resolved[ct]);
+ __aarp_expire_all(&unresolved[ct]);
+ __aarp_expire_all(&proxies[ct]);
+ }
+ write_unlock_bh(&aarp_lock);
+}
+
+/*
+ * Create a new aarp entry. This must use GFP_ATOMIC because it
+ * runs while holding spinlocks.
+ */
+static struct aarp_entry *aarp_alloc(void)
+{
+ struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC);
+
+ if (a)
+ skb_queue_head_init(&a->packet_queue);
+ return a;
+}
+
+/*
+ * Find an entry. We might return an expired but not yet purged entry. We
+ * don't care as it will do no harm.
+ *
+ * This must run under the aarp_lock.
+ */
+static struct aarp_entry *__aarp_find_entry(struct aarp_entry *list,
+ struct net_device *dev,
+ struct atalk_addr *sat)
+{
+ while (list) {
+ if (list->target_addr.s_net == sat->s_net &&
+ list->target_addr.s_node == sat->s_node &&
+ list->dev == dev)
+ break;
+ list = list->next;
+ }
+
+ return list;
+}
+
+/* Called from the DDP code, and thus must be exported. */
+void aarp_proxy_remove(struct net_device *dev, struct atalk_addr *sa)
+{
+ int hash = sa->s_node % (AARP_HASH_SIZE - 1);
+ struct aarp_entry *a;
+
+ write_lock_bh(&aarp_lock);
+
+ a = __aarp_find_entry(proxies[hash], dev, sa);
+ if (a)
+ a->expires_at = jiffies - 1;
+
+ write_unlock_bh(&aarp_lock);
+}
+
+/* This must run under aarp_lock. */
+static struct atalk_addr *__aarp_proxy_find(struct net_device *dev,
+ struct atalk_addr *sa)
+{
+ int hash = sa->s_node % (AARP_HASH_SIZE - 1);
+ struct aarp_entry *a = __aarp_find_entry(proxies[hash], dev, sa);
+
+ return a ? sa : NULL;
+}
+
+/*
+ * Probe a Phase 1 device or a device that requires its Net:Node to
+ * be set via an ioctl.
+ */
+static void aarp_send_probe_phase1(struct atalk_iface *iface)
+{
+ struct ifreq atreq;
+ struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr;
+ const struct net_device_ops *ops = iface->dev->netdev_ops;
+
+ sa->sat_addr.s_node = iface->address.s_node;
+ sa->sat_addr.s_net = ntohs(iface->address.s_net);
+
+ /* We pass the Net:Node to the drivers/cards by a Device ioctl. */
+ if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) {
+ ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR);
+ if (iface->address.s_net != htons(sa->sat_addr.s_net) ||
+ iface->address.s_node != sa->sat_addr.s_node)
+ iface->status |= ATIF_PROBE_FAIL;
+
+ iface->address.s_net = htons(sa->sat_addr.s_net);
+ iface->address.s_node = sa->sat_addr.s_node;
+ }
+}
+
+
+void aarp_probe_network(struct atalk_iface *atif)
+{
+ if (atif->dev->type == ARPHRD_LOCALTLK ||
+ atif->dev->type == ARPHRD_PPP)
+ aarp_send_probe_phase1(atif);
+ else {
+ unsigned int count;
+
+ for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+ aarp_send_probe(atif->dev, &atif->address);
+
+ /* Defer 1/10th */
+ msleep(100);
+
+ if (atif->status & ATIF_PROBE_FAIL)
+ break;
+ }
+ }
+}
+
+int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
+{
+ int hash, retval = -EPROTONOSUPPORT;
+ struct aarp_entry *entry;
+ unsigned int count;
+
+ /*
+ * we don't currently support LocalTalk or PPP for proxy AARP;
+ * if someone wants to try and add it, have fun
+ */
+ if (atif->dev->type == ARPHRD_LOCALTLK ||
+ atif->dev->type == ARPHRD_PPP)
+ goto out;
+
+ /*
+ * create a new AARP entry with the flags set to be published --
+ * we need this one to hang around even if it's in use
+ */
+ entry = aarp_alloc();
+ retval = -ENOMEM;
+ if (!entry)
+ goto out;
+
+ entry->expires_at = -1;
+ entry->status = ATIF_PROBE;
+ entry->target_addr.s_node = sa->s_node;
+ entry->target_addr.s_net = sa->s_net;
+ entry->dev = atif->dev;
+
+ write_lock_bh(&aarp_lock);
+
+ hash = sa->s_node % (AARP_HASH_SIZE - 1);
+ entry->next = proxies[hash];
+ proxies[hash] = entry;
+
+ for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+ aarp_send_probe(atif->dev, sa);
+
+ /* Defer 1/10th */
+ write_unlock_bh(&aarp_lock);
+ msleep(100);
+ write_lock_bh(&aarp_lock);
+
+ if (entry->status & ATIF_PROBE_FAIL)
+ break;
+ }
+
+ if (entry->status & ATIF_PROBE_FAIL) {
+ entry->expires_at = jiffies - 1; /* free the entry */
+ retval = -EADDRINUSE; /* return network full */
+ } else { /* clear the probing flag */
+ entry->status &= ~ATIF_PROBE;
+ retval = 1;
+ }
+
+ write_unlock_bh(&aarp_lock);
+out:
+ return retval;
+}
+
+/* Send a DDP frame */
+int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
+ struct atalk_addr *sa, void *hwaddr)
+{
+ static char ddp_eth_multicast[ETH_ALEN] =
+ { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+ int hash;
+ struct aarp_entry *a;
+
+ skb_reset_network_header(skb);
+
+ /* Check for LocalTalk first */
+ if (dev->type == ARPHRD_LOCALTLK) {
+ struct atalk_addr *at = atalk_find_dev_addr(dev);
+ struct ddpehdr *ddp = (struct ddpehdr *)skb->data;
+ int ft = 2;
+
+ /*
+ * Compressible ?
+ *
+ * IFF: src_net == dest_net == device_net
+ * (zero matches anything)
+ */
+
+ if ((!ddp->deh_snet || at->s_net == ddp->deh_snet) &&
+ (!ddp->deh_dnet || at->s_net == ddp->deh_dnet)) {
+ skb_pull(skb, sizeof(*ddp) - 4);
+
+ /*
+ * The upper two remaining bytes are the port
+ * numbers we just happen to need. Now put the
+ * length in the lower two.
+ */
+ *((__be16 *)skb->data) = htons(skb->len);
+ ft = 1;
+ }
+ /*
+ * Nice and easy. No AARP type protocols occur here so we can
+ * just shovel it out with a 3 byte LLAP header
+ */
+
+ skb_push(skb, 3);
+ skb->data[0] = sa->s_node;
+ skb->data[1] = at->s_node;
+ skb->data[2] = ft;
+ skb->dev = dev;
+ goto sendit;
+ }
+
+ /* On a PPP link we neither compress nor aarp. */
+ if (dev->type == ARPHRD_PPP) {
+ skb->protocol = htons(ETH_P_PPPTALK);
+ skb->dev = dev;
+ goto sendit;
+ }
+
+ /* Non ELAP we cannot do. */
+ if (dev->type != ARPHRD_ETHER)
+ goto free_it;
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ATALK);
+ hash = sa->s_node % (AARP_HASH_SIZE - 1);
+
+ /* Do we have a resolved entry? */
+ if (sa->s_node == ATADDR_BCAST) {
+ /* Send it */
+ ddp_dl->request(ddp_dl, skb, ddp_eth_multicast);
+ goto sent;
+ }
+
+ write_lock_bh(&aarp_lock);
+ a = __aarp_find_entry(resolved[hash], dev, sa);
+
+ if (a) { /* Return 1 and fill in the address */
+ a->expires_at = jiffies + (sysctl_aarp_expiry_time * 10);
+ ddp_dl->request(ddp_dl, skb, a->hwaddr);
+ write_unlock_bh(&aarp_lock);
+ goto sent;
+ }
+
+ /* Do we have an unresolved entry: This is the less common path */
+ a = __aarp_find_entry(unresolved[hash], dev, sa);
+ if (a) { /* Queue onto the unresolved queue */
+ skb_queue_tail(&a->packet_queue, skb);
+ goto out_unlock;
+ }
+
+ /* Allocate a new entry */
+ a = aarp_alloc();
+ if (!a) {
+ /* Whoops slipped... good job it's an unreliable protocol 8) */
+ write_unlock_bh(&aarp_lock);
+ goto free_it;
+ }
+
+ /* Set up the queue */
+ skb_queue_tail(&a->packet_queue, skb);
+ a->expires_at = jiffies + sysctl_aarp_resolve_time;
+ a->dev = dev;
+ a->next = unresolved[hash];
+ a->target_addr = *sa;
+ a->xmit_count = 0;
+ unresolved[hash] = a;
+ unresolved_count++;
+
+ /* Send an initial request for the address */
+ __aarp_send_query(a);
+
+ /*
+ * Switch to fast timer if needed (That is if this is the first
+ * unresolved entry to get added)
+ */
+
+ if (unresolved_count == 1)
+ mod_timer(&aarp_timer, jiffies + sysctl_aarp_tick_time);
+
+ /* Now finally, it is safe to drop the lock. */
+out_unlock:
+ write_unlock_bh(&aarp_lock);
+
+ /* Tell the ddp layer we have taken over for this frame. */
+ goto sent;
+
+sendit:
+ if (skb->sk)
+ skb->priority = skb->sk->sk_priority;
+ if (dev_queue_xmit(skb))
+ goto drop;
+sent:
+ return NET_XMIT_SUCCESS;
+free_it:
+ kfree_skb(skb);
+drop:
+ return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(aarp_send_ddp);
+
+/*
+ * An entry in the aarp unresolved queue has become resolved. Send
+ * all the frames queued under it.
+ *
+ * Must run under aarp_lock.
+ */
+static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
+ int hash)
+{
+ struct sk_buff *skb;
+
+ while (*list)
+ if (*list == a) {
+ unresolved_count--;
+ *list = a->next;
+
+ /* Move into the resolved list */
+ a->next = resolved[hash];
+ resolved[hash] = a;
+
+ /* Kick frames off */
+ while ((skb = skb_dequeue(&a->packet_queue)) != NULL) {
+ a->expires_at = jiffies +
+ sysctl_aarp_expiry_time * 10;
+ ddp_dl->request(ddp_dl, skb, a->hwaddr);
+ }
+ } else
+ list = &((*list)->next);
+}
+
+/*
+ * This is called by the SNAP driver whenever we see an AARP SNAP
+ * frame. We currently only support Ethernet.
+ */
+static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct elapaarp *ea = aarp_hdr(skb);
+ int hash, ret = 0;
+ __u16 function;
+ struct aarp_entry *a;
+ struct atalk_addr sa, *ma, da;
+ struct atalk_iface *ifa;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto out0;
+
+ /* We only do Ethernet SNAP AARP. */
+ if (dev->type != ARPHRD_ETHER)
+ goto out0;
+
+ /* Frame size ok? */
+ if (!skb_pull(skb, sizeof(*ea)))
+ goto out0;
+
+ function = ntohs(ea->function);
+
+ /* Sanity check fields. */
+ if (function < AARP_REQUEST || function > AARP_PROBE ||
+ ea->hw_len != ETH_ALEN || ea->pa_len != AARP_PA_ALEN ||
+ ea->pa_src_zero || ea->pa_dst_zero)
+ goto out0;
+
+ /* Looks good. */
+ hash = ea->pa_src_node % (AARP_HASH_SIZE - 1);
+
+ /* Build an address. */
+ sa.s_node = ea->pa_src_node;
+ sa.s_net = ea->pa_src_net;
+
+ /* Process the packet. Check for replies of me. */
+ ifa = atalk_find_dev(dev);
+ if (!ifa)
+ goto out1;
+
+ if (ifa->status & ATIF_PROBE &&
+ ifa->address.s_node == ea->pa_dst_node &&
+ ifa->address.s_net == ea->pa_dst_net) {
+ ifa->status |= ATIF_PROBE_FAIL; /* Fail the probe (in use) */
+ goto out1;
+ }
+
+ /* Check for replies of proxy AARP entries */
+ da.s_node = ea->pa_dst_node;
+ da.s_net = ea->pa_dst_net;
+
+ write_lock_bh(&aarp_lock);
+ a = __aarp_find_entry(proxies[hash], dev, &da);
+
+ if (a && a->status & ATIF_PROBE) {
+ a->status |= ATIF_PROBE_FAIL;
+ /*
+ * we do not respond to probe or request packets for
+ * this address while we are probing this address
+ */
+ goto unlock;
+ }
+
+ switch (function) {
+ case AARP_REPLY:
+ if (!unresolved_count) /* Speed up */
+ break;
+
+ /* Find the entry. */
+ a = __aarp_find_entry(unresolved[hash], dev, &sa);
+ if (!a || dev != a->dev)
+ break;
+
+ /* We can fill one in - this is good. */
+ ether_addr_copy(a->hwaddr, ea->hw_src);
+ __aarp_resolved(&unresolved[hash], a, hash);
+ if (!unresolved_count)
+ mod_timer(&aarp_timer,
+ jiffies + sysctl_aarp_expiry_time);
+ break;
+
+ case AARP_REQUEST:
+ case AARP_PROBE:
+
+ /*
+ * If it is my address set ma to my address and reply.
+ * We can treat probe and request the same. Probe
+ * simply means we shouldn't cache the querying host,
+ * as in a probe they are proposing an address not
+ * using one.
+ *
+ * Support for proxy-AARP added. We check if the
+ * address is one of our proxies before we toss the
+ * packet out.
+ */
+
+ sa.s_node = ea->pa_dst_node;
+ sa.s_net = ea->pa_dst_net;
+
+ /* See if we have a matching proxy. */
+ ma = __aarp_proxy_find(dev, &sa);
+ if (!ma)
+ ma = &ifa->address;
+ else { /* We need to make a copy of the entry. */
+ da.s_node = sa.s_node;
+ da.s_net = sa.s_net;
+ ma = &da;
+ }
+
+ if (function == AARP_PROBE) {
+ /*
+ * A probe implies someone trying to get an
+ * address. So as a precaution flush any
+ * entries we have for this address.
+ */
+ a = __aarp_find_entry(resolved[sa.s_node %
+ (AARP_HASH_SIZE - 1)],
+ skb->dev, &sa);
+
+ /*
+ * Make it expire next tick - that avoids us
+ * getting into a probe/flush/learn/probe/
+ * flush/learn cycle during probing of a slow
+ * to respond host addr.
+ */
+ if (a) {
+ a->expires_at = jiffies - 1;
+ mod_timer(&aarp_timer, jiffies +
+ sysctl_aarp_tick_time);
+ }
+ }
+
+ if (sa.s_node != ma->s_node)
+ break;
+
+ if (sa.s_net && ma->s_net && sa.s_net != ma->s_net)
+ break;
+
+ sa.s_node = ea->pa_src_node;
+ sa.s_net = ea->pa_src_net;
+
+ /* aarp_my_address has found the address to use for us.
+ */
+ aarp_send_reply(dev, ma, &sa, ea->hw_src);
+ break;
+ }
+
+unlock:
+ write_unlock_bh(&aarp_lock);
+out1:
+ ret = 1;
+out0:
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct notifier_block aarp_notifier = {
+ .notifier_call = aarp_device_event,
+};
+
+static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 };
+
+void __init aarp_proto_init(void)
+{
+ aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
+ if (!aarp_dl)
+ printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
+ setup_timer(&aarp_timer, aarp_expire_timeout, 0);
+ aarp_timer.expires = jiffies + sysctl_aarp_expiry_time;
+ add_timer(&aarp_timer);
+ register_netdevice_notifier(&aarp_notifier);
+}
+
+/* Remove the AARP entries associated with a device. */
+void aarp_device_down(struct net_device *dev)
+{
+ int ct;
+
+ write_lock_bh(&aarp_lock);
+
+ for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+ __aarp_expire_device(&resolved[ct], dev);
+ __aarp_expire_device(&unresolved[ct], dev);
+ __aarp_expire_device(&proxies[ct], dev);
+ }
+
+ write_unlock_bh(&aarp_lock);
+}
+
+#ifdef CONFIG_PROC_FS
+struct aarp_iter_state {
+ int bucket;
+ struct aarp_entry **table;
+};
+
+/*
+ * Get the aarp entry that is in the chain described
+ * by the iterator.
+ * If pos is set then skip till that index.
+ * pos = 1 is the first entry
+ */
+static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos)
+{
+ int ct = iter->bucket;
+ struct aarp_entry **table = iter->table;
+ loff_t off = 0;
+ struct aarp_entry *entry;
+
+ rescan:
+ while (ct < AARP_HASH_SIZE) {
+ for (entry = table[ct]; entry; entry = entry->next) {
+ if (!pos || ++off == *pos) {
+ iter->table = table;
+ iter->bucket = ct;
+ return entry;
+ }
+ }
+ ++ct;
+ }
+
+ if (table == resolved) {
+ ct = 0;
+ table = unresolved;
+ goto rescan;
+ }
+ if (table == unresolved) {
+ ct = 0;
+ table = proxies;
+ goto rescan;
+ }
+ return NULL;
+}
+
+static void *aarp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(aarp_lock)
+{
+ struct aarp_iter_state *iter = seq->private;
+
+ read_lock_bh(&aarp_lock);
+ iter->table = resolved;
+ iter->bucket = 0;
+
+ return *pos ? iter_next(iter, pos) : SEQ_START_TOKEN;
+}
+
+static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct aarp_entry *entry = v;
+ struct aarp_iter_state *iter = seq->private;
+
+ ++*pos;
+
+ /* first line after header */
+ if (v == SEQ_START_TOKEN)
+ entry = iter_next(iter, NULL);
+
+ /* next entry in current bucket */
+ else if (entry->next)
+ entry = entry->next;
+
+ /* next bucket or table */
+ else {
+ ++iter->bucket;
+ entry = iter_next(iter, NULL);
+ }
+ return entry;
+}
+
+static void aarp_seq_stop(struct seq_file *seq, void *v)
+ __releases(aarp_lock)
+{
+ read_unlock_bh(&aarp_lock);
+}
+
+static const char *dt2str(unsigned long ticks)
+{
+ static char buf[32];
+
+ sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100) / HZ);
+
+ return buf;
+}
+
+static int aarp_seq_show(struct seq_file *seq, void *v)
+{
+ struct aarp_iter_state *iter = seq->private;
+ struct aarp_entry *entry = v;
+ unsigned long now = jiffies;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Address Interface Hardware Address"
+ " Expires LastSend Retry Status\n");
+ else {
+ seq_printf(seq, "%04X:%02X %-12s",
+ ntohs(entry->target_addr.s_net),
+ (unsigned int) entry->target_addr.s_node,
+ entry->dev ? entry->dev->name : "????");
+ seq_printf(seq, "%pM", entry->hwaddr);
+ seq_printf(seq, " %8s",
+ dt2str((long)entry->expires_at - (long)now));
+ if (iter->table == unresolved)
+ seq_printf(seq, " %8s %6hu",
+ dt2str(now - entry->last_sent),
+ entry->xmit_count);
+ else
+ seq_puts(seq, " ");
+ seq_printf(seq, " %s\n",
+ (iter->table == resolved) ? "resolved"
+ : (iter->table == unresolved) ? "unresolved"
+ : (iter->table == proxies) ? "proxies"
+ : "unknown");
+ }
+ return 0;
+}
+
+static const struct seq_operations aarp_seq_ops = {
+ .start = aarp_seq_start,
+ .next = aarp_seq_next,
+ .stop = aarp_seq_stop,
+ .show = aarp_seq_show,
+};
+
+static int aarp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &aarp_seq_ops,
+ sizeof(struct aarp_iter_state));
+}
+
+const struct file_operations atalk_seq_arp_fops = {
+ .owner = THIS_MODULE,
+ .open = aarp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+/* General module cleanup. Called from cleanup_module() in ddp.c. */
+void aarp_cleanup_module(void)
+{
+ del_timer_sync(&aarp_timer);
+ unregister_netdevice_notifier(&aarp_notifier);
+ unregister_snap_client(aarp_dl);
+ aarp_purge();
+}
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
new file mode 100644
index 000000000..af46bc49e
--- /dev/null
+++ b/net/appletalk/atalk_proc.c
@@ -0,0 +1,303 @@
+/*
+ * atalk_proc.c - proc support for Appletalk
+ *
+ * Copyright(c) Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, version 2.
+ */
+
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/atalk.h>
+#include <linux/export.h>
+
+
+static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos)
+{
+ struct atalk_iface *i;
+
+ for (i = atalk_interfaces; pos && i; i = i->next)
+ --pos;
+
+ return i;
+}
+
+static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_interfaces_lock)
+{
+ loff_t l = *pos;
+
+ read_lock_bh(&atalk_interfaces_lock);
+ return l ? atalk_get_interface_idx(--l) : SEQ_START_TOKEN;
+}
+
+static void *atalk_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct atalk_iface *i;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN) {
+ i = NULL;
+ if (atalk_interfaces)
+ i = atalk_interfaces;
+ goto out;
+ }
+ i = v;
+ i = i->next;
+out:
+ return i;
+}
+
+static void atalk_seq_interface_stop(struct seq_file *seq, void *v)
+ __releases(atalk_interfaces_lock)
+{
+ read_unlock_bh(&atalk_interfaces_lock);
+}
+
+static int atalk_seq_interface_show(struct seq_file *seq, void *v)
+{
+ struct atalk_iface *iface;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Interface Address Networks "
+ "Status\n");
+ goto out;
+ }
+
+ iface = v;
+ seq_printf(seq, "%-16s %04X:%02X %04X-%04X %d\n",
+ iface->dev->name, ntohs(iface->address.s_net),
+ iface->address.s_node, ntohs(iface->nets.nr_firstnet),
+ ntohs(iface->nets.nr_lastnet), iface->status);
+out:
+ return 0;
+}
+
+static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos)
+{
+ struct atalk_route *r;
+
+ for (r = atalk_routes; pos && r; r = r->next)
+ --pos;
+
+ return r;
+}
+
+static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_routes_lock)
+{
+ loff_t l = *pos;
+
+ read_lock_bh(&atalk_routes_lock);
+ return l ? atalk_get_route_idx(--l) : SEQ_START_TOKEN;
+}
+
+static void *atalk_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct atalk_route *r;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN) {
+ r = NULL;
+ if (atalk_routes)
+ r = atalk_routes;
+ goto out;
+ }
+ r = v;
+ r = r->next;
+out:
+ return r;
+}
+
+static void atalk_seq_route_stop(struct seq_file *seq, void *v)
+ __releases(atalk_routes_lock)
+{
+ read_unlock_bh(&atalk_routes_lock);
+}
+
+static int atalk_seq_route_show(struct seq_file *seq, void *v)
+{
+ struct atalk_route *rt;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Target Router Flags Dev\n");
+ goto out;
+ }
+
+ if (atrtr_default.dev) {
+ rt = &atrtr_default;
+ seq_printf(seq, "Default %04X:%02X %-4d %s\n",
+ ntohs(rt->gateway.s_net), rt->gateway.s_node,
+ rt->flags, rt->dev->name);
+ }
+
+ rt = v;
+ seq_printf(seq, "%04X:%02X %04X:%02X %-4d %s\n",
+ ntohs(rt->target.s_net), rt->target.s_node,
+ ntohs(rt->gateway.s_net), rt->gateway.s_node,
+ rt->flags, rt->dev->name);
+out:
+ return 0;
+}
+
+static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos)
+ __acquires(atalk_sockets_lock)
+{
+ read_lock_bh(&atalk_sockets_lock);
+ return seq_hlist_start_head(&atalk_sockets, *pos);
+}
+
+static void *atalk_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &atalk_sockets, pos);
+}
+
+static void atalk_seq_socket_stop(struct seq_file *seq, void *v)
+ __releases(atalk_sockets_lock)
+{
+ read_unlock_bh(&atalk_sockets_lock);
+}
+
+static int atalk_seq_socket_show(struct seq_file *seq, void *v)
+{
+ struct sock *s;
+ struct atalk_sock *at;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "Type Local_addr Remote_addr Tx_queue "
+ "Rx_queue St UID\n");
+ goto out;
+ }
+
+ s = sk_entry(v);
+ at = at_sk(s);
+
+ seq_printf(seq, "%02X %04X:%02X:%02X %04X:%02X:%02X %08X:%08X "
+ "%02X %u\n",
+ s->sk_type, ntohs(at->src_net), at->src_node, at->src_port,
+ ntohs(at->dest_net), at->dest_node, at->dest_port,
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_state,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)));
+out:
+ return 0;
+}
+
+static const struct seq_operations atalk_seq_interface_ops = {
+ .start = atalk_seq_interface_start,
+ .next = atalk_seq_interface_next,
+ .stop = atalk_seq_interface_stop,
+ .show = atalk_seq_interface_show,
+};
+
+static const struct seq_operations atalk_seq_route_ops = {
+ .start = atalk_seq_route_start,
+ .next = atalk_seq_route_next,
+ .stop = atalk_seq_route_stop,
+ .show = atalk_seq_route_show,
+};
+
+static const struct seq_operations atalk_seq_socket_ops = {
+ .start = atalk_seq_socket_start,
+ .next = atalk_seq_socket_next,
+ .stop = atalk_seq_socket_stop,
+ .show = atalk_seq_socket_show,
+};
+
+static int atalk_seq_interface_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &atalk_seq_interface_ops);
+}
+
+static int atalk_seq_route_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &atalk_seq_route_ops);
+}
+
+static int atalk_seq_socket_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &atalk_seq_socket_ops);
+}
+
+static const struct file_operations atalk_seq_interface_fops = {
+ .owner = THIS_MODULE,
+ .open = atalk_seq_interface_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations atalk_seq_route_fops = {
+ .owner = THIS_MODULE,
+ .open = atalk_seq_route_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations atalk_seq_socket_fops = {
+ .owner = THIS_MODULE,
+ .open = atalk_seq_socket_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *atalk_proc_dir;
+
+int __init atalk_proc_init(void)
+{
+ struct proc_dir_entry *p;
+ int rc = -ENOMEM;
+
+ atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net);
+ if (!atalk_proc_dir)
+ goto out;
+
+ p = proc_create("interface", S_IRUGO, atalk_proc_dir,
+ &atalk_seq_interface_fops);
+ if (!p)
+ goto out_interface;
+
+ p = proc_create("route", S_IRUGO, atalk_proc_dir,
+ &atalk_seq_route_fops);
+ if (!p)
+ goto out_route;
+
+ p = proc_create("socket", S_IRUGO, atalk_proc_dir,
+ &atalk_seq_socket_fops);
+ if (!p)
+ goto out_socket;
+
+ p = proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops);
+ if (!p)
+ goto out_arp;
+
+ rc = 0;
+out:
+ return rc;
+out_arp:
+ remove_proc_entry("socket", atalk_proc_dir);
+out_socket:
+ remove_proc_entry("route", atalk_proc_dir);
+out_route:
+ remove_proc_entry("interface", atalk_proc_dir);
+out_interface:
+ remove_proc_entry("atalk", init_net.proc_net);
+ goto out;
+}
+
+void __exit atalk_proc_exit(void)
+{
+ remove_proc_entry("interface", atalk_proc_dir);
+ remove_proc_entry("route", atalk_proc_dir);
+ remove_proc_entry("socket", atalk_proc_dir);
+ remove_proc_entry("arp", atalk_proc_dir);
+ remove_proc_entry("atalk", init_net.proc_net);
+}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
new file mode 100644
index 000000000..3b7ad43c7
--- /dev/null
+++ b/net/appletalk/ddp.c
@@ -0,0 +1,1965 @@
+/*
+ * DDP: An implementation of the AppleTalk DDP protocol for
+ * Ethernet 'ELAP'.
+ *
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * With more than a little assistance from
+ *
+ * Wesley Craig <netatalk@umich.edu>
+ *
+ * Fixes:
+ * Neil Horman : Added missing device ioctls
+ * Michael Callahan : Made routing work
+ * Wesley Craig : Fix probing to listen to a
+ * passed node id.
+ * Alan Cox : Added send/recvmsg support
+ * Alan Cox : Moved at. to protinfo in
+ * socket.
+ * Alan Cox : Added firewall hooks.
+ * Alan Cox : Supports new ARPHRD_LOOPBACK
+ * Christer Weinigel : Routing and /proc fixes.
+ * Bradford Johnson : LocalTalk.
+ * Tom Dyas : Module support.
+ * Alan Cox : Hooks for PPP (based on the
+ * LocalTalk hook).
+ * Alan Cox : Posix bits
+ * Alan Cox/Mike Freeman : Possible fix to NBP problems
+ * Bradford Johnson : IP-over-DDP (experimental)
+ * Jay Schulist : Moved IP-over-DDP to its own
+ * driver file. (ipddp.c & ipddp.h)
+ * Jay Schulist : Made work as module with
+ * AppleTalk drivers, cleaned it.
+ * Rob Newberry : Added proxy AARP and AARP
+ * procfs, moved probing to AARP
+ * module.
+ * Adrian Sun/
+ * Michael Zuelsdorff : fix for net.0 packets. don't
+ * allow illegal ether/tokentalk
+ * port assignment. we lose a
+ * valid localtalk port as a
+ * result.
+ * Arnaldo C. de Melo : Cleanup, in preparation for
+ * shared skb support 8)
+ * Arnaldo C. de Melo : Move proc stuff to atalk_proc.c,
+ * use seq_file
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/termios.h> /* For TIOCOUTQ/INQ */
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/route.h>
+#include <linux/atalk.h>
+#include <linux/highmem.h>
+
+struct datalink_proto *ddp_dl, *aarp_dl;
+static const struct proto_ops atalk_dgram_ops;
+
+/**************************************************************************\
+* *
+* Handlers for the socket list. *
+* *
+\**************************************************************************/
+
+HLIST_HEAD(atalk_sockets);
+DEFINE_RWLOCK(atalk_sockets_lock);
+
+static inline void __atalk_insert_socket(struct sock *sk)
+{
+ sk_add_node(sk, &atalk_sockets);
+}
+
+static inline void atalk_remove_socket(struct sock *sk)
+{
+ write_lock_bh(&atalk_sockets_lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&atalk_sockets_lock);
+}
+
+static struct sock *atalk_search_socket(struct sockaddr_at *to,
+ struct atalk_iface *atif)
+{
+ struct sock *s;
+
+ read_lock_bh(&atalk_sockets_lock);
+ sk_for_each(s, &atalk_sockets) {
+ struct atalk_sock *at = at_sk(s);
+
+ if (to->sat_port != at->src_port)
+ continue;
+
+ if (to->sat_addr.s_net == ATADDR_ANYNET &&
+ to->sat_addr.s_node == ATADDR_BCAST)
+ goto found;
+
+ if (to->sat_addr.s_net == at->src_net &&
+ (to->sat_addr.s_node == at->src_node ||
+ to->sat_addr.s_node == ATADDR_BCAST ||
+ to->sat_addr.s_node == ATADDR_ANYNODE))
+ goto found;
+
+ /* XXXX.0 -- we got a request for this router. make sure
+ * that the node is appropriately set. */
+ if (to->sat_addr.s_node == ATADDR_ANYNODE &&
+ to->sat_addr.s_net != ATADDR_ANYNET &&
+ atif->address.s_node == at->src_node) {
+ to->sat_addr.s_node = atif->address.s_node;
+ goto found;
+ }
+ }
+ s = NULL;
+found:
+ read_unlock_bh(&atalk_sockets_lock);
+ return s;
+}
+
+/**
+ * atalk_find_or_insert_socket - Try to find a socket matching ADDR
+ * @sk: socket to insert in the list if it is not there already
+ * @sat: address to search for
+ *
+ * Try to find a socket matching ADDR in the socket list, if found then return
+ * it. If not, insert SK into the socket list.
+ *
+ * This entire operation must execute atomically.
+ */
+static struct sock *atalk_find_or_insert_socket(struct sock *sk,
+ struct sockaddr_at *sat)
+{
+ struct sock *s;
+ struct atalk_sock *at;
+
+ write_lock_bh(&atalk_sockets_lock);
+ sk_for_each(s, &atalk_sockets) {
+ at = at_sk(s);
+
+ if (at->src_net == sat->sat_addr.s_net &&
+ at->src_node == sat->sat_addr.s_node &&
+ at->src_port == sat->sat_port)
+ goto found;
+ }
+ s = NULL;
+ __atalk_insert_socket(sk); /* Wheee, it's free, assign and insert. */
+found:
+ write_unlock_bh(&atalk_sockets_lock);
+ return s;
+}
+
+static void atalk_destroy_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ if (sk_has_allocations(sk)) {
+ sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
+ add_timer(&sk->sk_timer);
+ } else
+ sock_put(sk);
+}
+
+static inline void atalk_destroy_socket(struct sock *sk)
+{
+ atalk_remove_socket(sk);
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ if (sk_has_allocations(sk)) {
+ setup_timer(&sk->sk_timer, atalk_destroy_timer,
+ (unsigned long)sk);
+ sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
+ add_timer(&sk->sk_timer);
+ } else
+ sock_put(sk);
+}
+
+/**************************************************************************\
+* *
+* Routing tables for the AppleTalk socket layer. *
+* *
+\**************************************************************************/
+
+/* Anti-deadlock ordering is atalk_routes_lock --> iface_lock -DaveM */
+struct atalk_route *atalk_routes;
+DEFINE_RWLOCK(atalk_routes_lock);
+
+struct atalk_iface *atalk_interfaces;
+DEFINE_RWLOCK(atalk_interfaces_lock);
+
+/* For probing devices or in a routerless network */
+struct atalk_route atrtr_default;
+
+/* AppleTalk interface control */
+/*
+ * Drop a device. Doesn't drop any of its routes - that is the caller's
+ * problem. Called when we down the interface or delete the address.
+ */
+static void atif_drop_device(struct net_device *dev)
+{
+ struct atalk_iface **iface = &atalk_interfaces;
+ struct atalk_iface *tmp;
+
+ write_lock_bh(&atalk_interfaces_lock);
+ while ((tmp = *iface) != NULL) {
+ if (tmp->dev == dev) {
+ *iface = tmp->next;
+ dev_put(dev);
+ kfree(tmp);
+ dev->atalk_ptr = NULL;
+ } else
+ iface = &tmp->next;
+ }
+ write_unlock_bh(&atalk_interfaces_lock);
+}
+
+static struct atalk_iface *atif_add_device(struct net_device *dev,
+ struct atalk_addr *sa)
+{
+ struct atalk_iface *iface = kzalloc(sizeof(*iface), GFP_KERNEL);
+
+ if (!iface)
+ goto out;
+
+ dev_hold(dev);
+ iface->dev = dev;
+ dev->atalk_ptr = iface;
+ iface->address = *sa;
+ iface->status = 0;
+
+ write_lock_bh(&atalk_interfaces_lock);
+ iface->next = atalk_interfaces;
+ atalk_interfaces = iface;
+ write_unlock_bh(&atalk_interfaces_lock);
+out:
+ return iface;
+}
+
+/* Perform phase 2 AARP probing on our tentative address */
+static int atif_probe_device(struct atalk_iface *atif)
+{
+ int netrange = ntohs(atif->nets.nr_lastnet) -
+ ntohs(atif->nets.nr_firstnet) + 1;
+ int probe_net = ntohs(atif->address.s_net);
+ int probe_node = atif->address.s_node;
+ int netct, nodect;
+
+ /* Offset the network we start probing with */
+ if (probe_net == ATADDR_ANYNET) {
+ probe_net = ntohs(atif->nets.nr_firstnet);
+ if (netrange)
+ probe_net += jiffies % netrange;
+ }
+ if (probe_node == ATADDR_ANYNODE)
+ probe_node = jiffies & 0xFF;
+
+ /* Scan the networks */
+ atif->status |= ATIF_PROBE;
+ for (netct = 0; netct <= netrange; netct++) {
+ /* Sweep the available nodes from a given start */
+ atif->address.s_net = htons(probe_net);
+ for (nodect = 0; nodect < 256; nodect++) {
+ atif->address.s_node = (nodect + probe_node) & 0xFF;
+ if (atif->address.s_node > 0 &&
+ atif->address.s_node < 254) {
+ /* Probe a proposed address */
+ aarp_probe_network(atif);
+
+ if (!(atif->status & ATIF_PROBE_FAIL)) {
+ atif->status &= ~ATIF_PROBE;
+ return 0;
+ }
+ }
+ atif->status &= ~ATIF_PROBE_FAIL;
+ }
+ probe_net++;
+ if (probe_net > ntohs(atif->nets.nr_lastnet))
+ probe_net = ntohs(atif->nets.nr_firstnet);
+ }
+ atif->status &= ~ATIF_PROBE;
+
+ return -EADDRINUSE; /* Network is full... */
+}
+
+
+/* Perform AARP probing for a proxy address */
+static int atif_proxy_probe_device(struct atalk_iface *atif,
+ struct atalk_addr *proxy_addr)
+{
+ int netrange = ntohs(atif->nets.nr_lastnet) -
+ ntohs(atif->nets.nr_firstnet) + 1;
+ /* we probe the interface's network */
+ int probe_net = ntohs(atif->address.s_net);
+ int probe_node = ATADDR_ANYNODE; /* we'll take anything */
+ int netct, nodect;
+
+ /* Offset the network we start probing with */
+ if (probe_net == ATADDR_ANYNET) {
+ probe_net = ntohs(atif->nets.nr_firstnet);
+ if (netrange)
+ probe_net += jiffies % netrange;
+ }
+
+ if (probe_node == ATADDR_ANYNODE)
+ probe_node = jiffies & 0xFF;
+
+ /* Scan the networks */
+ for (netct = 0; netct <= netrange; netct++) {
+ /* Sweep the available nodes from a given start */
+ proxy_addr->s_net = htons(probe_net);
+ for (nodect = 0; nodect < 256; nodect++) {
+ proxy_addr->s_node = (nodect + probe_node) & 0xFF;
+ if (proxy_addr->s_node > 0 &&
+ proxy_addr->s_node < 254) {
+ /* Tell AARP to probe a proposed address */
+ int ret = aarp_proxy_probe_network(atif,
+ proxy_addr);
+
+ if (ret != -EADDRINUSE)
+ return ret;
+ }
+ }
+ probe_net++;
+ if (probe_net > ntohs(atif->nets.nr_lastnet))
+ probe_net = ntohs(atif->nets.nr_firstnet);
+ }
+
+ return -EADDRINUSE; /* Network is full... */
+}
+
+
+struct atalk_addr *atalk_find_dev_addr(struct net_device *dev)
+{
+ struct atalk_iface *iface = dev->atalk_ptr;
+ return iface ? &iface->address : NULL;
+}
+
+static struct atalk_addr *atalk_find_primary(void)
+{
+ struct atalk_iface *fiface = NULL;
+ struct atalk_addr *retval;
+ struct atalk_iface *iface;
+
+ /*
+ * Return a point-to-point interface only if
+ * there is no non-ptp interface available.
+ */
+ read_lock_bh(&atalk_interfaces_lock);
+ for (iface = atalk_interfaces; iface; iface = iface->next) {
+ if (!fiface && !(iface->dev->flags & IFF_LOOPBACK))
+ fiface = iface;
+ if (!(iface->dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+ retval = &iface->address;
+ goto out;
+ }
+ }
+
+ if (fiface)
+ retval = &fiface->address;
+ else if (atalk_interfaces)
+ retval = &atalk_interfaces->address;
+ else
+ retval = NULL;
+out:
+ read_unlock_bh(&atalk_interfaces_lock);
+ return retval;
+}
+
+/*
+ * Find a match for 'any network' - ie any of our interfaces with that
+ * node number will do just nicely.
+ */
+static struct atalk_iface *atalk_find_anynet(int node, struct net_device *dev)
+{
+ struct atalk_iface *iface = dev->atalk_ptr;
+
+ if (!iface || iface->status & ATIF_PROBE)
+ goto out_err;
+
+ if (node != ATADDR_BCAST &&
+ iface->address.s_node != node &&
+ node != ATADDR_ANYNODE)
+ goto out_err;
+out:
+ return iface;
+out_err:
+ iface = NULL;
+ goto out;
+}
+
+/* Find a match for a specific network:node pair */
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
+{
+ struct atalk_iface *iface;
+
+ read_lock_bh(&atalk_interfaces_lock);
+ for (iface = atalk_interfaces; iface; iface = iface->next) {
+ if ((node == ATADDR_BCAST ||
+ node == ATADDR_ANYNODE ||
+ iface->address.s_node == node) &&
+ iface->address.s_net == net &&
+ !(iface->status & ATIF_PROBE))
+ break;
+
+ /* XXXX.0 -- net.0 returns the iface associated with net */
+ if (node == ATADDR_ANYNODE && net != ATADDR_ANYNET &&
+ ntohs(iface->nets.nr_firstnet) <= ntohs(net) &&
+ ntohs(net) <= ntohs(iface->nets.nr_lastnet))
+ break;
+ }
+ read_unlock_bh(&atalk_interfaces_lock);
+ return iface;
+}
+
+
+/*
+ * Find a route for an AppleTalk packet. This ought to get cached in
+ * the socket (later on...). We know about host routes and the fact
+ * that a route must be direct to broadcast.
+ */
+static struct atalk_route *atrtr_find(struct atalk_addr *target)
+{
+ /*
+ * we must search through all routes unless we find a
+ * host route, because some host routes might overlap
+ * network routes
+ */
+ struct atalk_route *net_route = NULL;
+ struct atalk_route *r;
+
+ read_lock_bh(&atalk_routes_lock);
+ for (r = atalk_routes; r; r = r->next) {
+ if (!(r->flags & RTF_UP))
+ continue;
+
+ if (r->target.s_net == target->s_net) {
+ if (r->flags & RTF_HOST) {
+ /*
+ * if this host route is for the target,
+ * the we're done
+ */
+ if (r->target.s_node == target->s_node)
+ goto out;
+ } else
+ /*
+ * this route will work if there isn't a
+ * direct host route, so cache it
+ */
+ net_route = r;
+ }
+ }
+
+ /*
+ * if we found a network route but not a direct host
+ * route, then return it
+ */
+ if (net_route)
+ r = net_route;
+ else if (atrtr_default.dev)
+ r = &atrtr_default;
+ else /* No route can be found */
+ r = NULL;
+out:
+ read_unlock_bh(&atalk_routes_lock);
+ return r;
+}
+
+
+/*
+ * Given an AppleTalk network, find the device to use. This can be
+ * a simple lookup.
+ */
+struct net_device *atrtr_get_dev(struct atalk_addr *sa)
+{
+ struct atalk_route *atr = atrtr_find(sa);
+ return atr ? atr->dev : NULL;
+}
+
+/* Set up a default router */
+static void atrtr_set_default(struct net_device *dev)
+{
+ atrtr_default.dev = dev;
+ atrtr_default.flags = RTF_UP;
+ atrtr_default.gateway.s_net = htons(0);
+ atrtr_default.gateway.s_node = 0;
+}
+
+/*
+ * Add a router. Basically make sure it looks valid and stuff the
+ * entry in the list. While it uses netranges we always set them to one
+ * entry to work like netatalk.
+ */
+static int atrtr_create(struct rtentry *r, struct net_device *devhint)
+{
+ struct sockaddr_at *ta = (struct sockaddr_at *)&r->rt_dst;
+ struct sockaddr_at *ga = (struct sockaddr_at *)&r->rt_gateway;
+ struct atalk_route *rt;
+ struct atalk_iface *iface, *riface;
+ int retval = -EINVAL;
+
+ /*
+ * Fixme: Raise/Lower a routing change semaphore for these
+ * operations.
+ */
+
+ /* Validate the request */
+ if (ta->sat_family != AF_APPLETALK ||
+ (!devhint && ga->sat_family != AF_APPLETALK))
+ goto out;
+
+ /* Now walk the routing table and make our decisions */
+ write_lock_bh(&atalk_routes_lock);
+ for (rt = atalk_routes; rt; rt = rt->next) {
+ if (r->rt_flags != rt->flags)
+ continue;
+
+ if (ta->sat_addr.s_net == rt->target.s_net) {
+ if (!(rt->flags & RTF_HOST))
+ break;
+ if (ta->sat_addr.s_node == rt->target.s_node)
+ break;
+ }
+ }
+
+ if (!devhint) {
+ riface = NULL;
+
+ read_lock_bh(&atalk_interfaces_lock);
+ for (iface = atalk_interfaces; iface; iface = iface->next) {
+ if (!riface &&
+ ntohs(ga->sat_addr.s_net) >=
+ ntohs(iface->nets.nr_firstnet) &&
+ ntohs(ga->sat_addr.s_net) <=
+ ntohs(iface->nets.nr_lastnet))
+ riface = iface;
+
+ if (ga->sat_addr.s_net == iface->address.s_net &&
+ ga->sat_addr.s_node == iface->address.s_node)
+ riface = iface;
+ }
+ read_unlock_bh(&atalk_interfaces_lock);
+
+ retval = -ENETUNREACH;
+ if (!riface)
+ goto out_unlock;
+
+ devhint = riface->dev;
+ }
+
+ if (!rt) {
+ rt = kzalloc(sizeof(*rt), GFP_ATOMIC);
+
+ retval = -ENOBUFS;
+ if (!rt)
+ goto out_unlock;
+
+ rt->next = atalk_routes;
+ atalk_routes = rt;
+ }
+
+ /* Fill in the routing entry */
+ rt->target = ta->sat_addr;
+ dev_hold(devhint);
+ rt->dev = devhint;
+ rt->flags = r->rt_flags;
+ rt->gateway = ga->sat_addr;
+
+ retval = 0;
+out_unlock:
+ write_unlock_bh(&atalk_routes_lock);
+out:
+ return retval;
+}
+
+/* Delete a route. Find it and discard it */
+static int atrtr_delete(struct atalk_addr *addr)
+{
+ struct atalk_route **r = &atalk_routes;
+ int retval = 0;
+ struct atalk_route *tmp;
+
+ write_lock_bh(&atalk_routes_lock);
+ while ((tmp = *r) != NULL) {
+ if (tmp->target.s_net == addr->s_net &&
+ (!(tmp->flags&RTF_GATEWAY) ||
+ tmp->target.s_node == addr->s_node)) {
+ *r = tmp->next;
+ dev_put(tmp->dev);
+ kfree(tmp);
+ goto out;
+ }
+ r = &tmp->next;
+ }
+ retval = -ENOENT;
+out:
+ write_unlock_bh(&atalk_routes_lock);
+ return retval;
+}
+
+/*
+ * Called when a device is downed. Just throw away any routes
+ * via it.
+ */
+static void atrtr_device_down(struct net_device *dev)
+{
+ struct atalk_route **r = &atalk_routes;
+ struct atalk_route *tmp;
+
+ write_lock_bh(&atalk_routes_lock);
+ while ((tmp = *r) != NULL) {
+ if (tmp->dev == dev) {
+ *r = tmp->next;
+ dev_put(dev);
+ kfree(tmp);
+ } else
+ r = &tmp->next;
+ }
+ write_unlock_bh(&atalk_routes_lock);
+
+ if (atrtr_default.dev == dev)
+ atrtr_set_default(NULL);
+}
+
+/* Actually down the interface */
+static inline void atalk_dev_down(struct net_device *dev)
+{
+ atrtr_device_down(dev); /* Remove all routes for the device */
+ aarp_device_down(dev); /* Remove AARP entries for the device */
+ atif_drop_device(dev); /* Remove the device */
+}
+
+/*
+ * A device event has occurred. Watch for devices going down and
+ * delete our use of them (iface and route).
+ */
+static int ddp_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_DOWN)
+ /* Discard any use of this */
+ atalk_dev_down(dev);
+
+ return NOTIFY_DONE;
+}
+
+/* ioctl calls. Shouldn't even need touching */
+/* Device configuration ioctl calls */
+static int atif_ioctl(int cmd, void __user *arg)
+{
+ static char aarp_mcast[6] = { 0x09, 0x00, 0x00, 0xFF, 0xFF, 0xFF };
+ struct ifreq atreq;
+ struct atalk_netrange *nr;
+ struct sockaddr_at *sa;
+ struct net_device *dev;
+ struct atalk_iface *atif;
+ int ct;
+ int limit;
+ struct rtentry rtdef;
+ int add_route;
+
+ if (copy_from_user(&atreq, arg, sizeof(atreq)))
+ return -EFAULT;
+
+ dev = __dev_get_by_name(&init_net, atreq.ifr_name);
+ if (!dev)
+ return -ENODEV;
+
+ sa = (struct sockaddr_at *)&atreq.ifr_addr;
+ atif = atalk_find_dev(dev);
+
+ switch (cmd) {
+ case SIOCSIFADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ if (dev->type != ARPHRD_ETHER &&
+ dev->type != ARPHRD_LOOPBACK &&
+ dev->type != ARPHRD_LOCALTLK &&
+ dev->type != ARPHRD_PPP)
+ return -EPROTONOSUPPORT;
+
+ nr = (struct atalk_netrange *)&sa->sat_zero[0];
+ add_route = 1;
+
+ /*
+ * if this is a point-to-point iface, and we already
+ * have an iface for this AppleTalk address, then we
+ * should not add a route
+ */
+ if ((dev->flags & IFF_POINTOPOINT) &&
+ atalk_find_interface(sa->sat_addr.s_net,
+ sa->sat_addr.s_node)) {
+ printk(KERN_DEBUG "AppleTalk: point-to-point "
+ "interface added with "
+ "existing address\n");
+ add_route = 0;
+ }
+
+ /*
+ * Phase 1 is fine on LocalTalk but we don't do
+ * EtherTalk phase 1. Anyone wanting to add it go ahead.
+ */
+ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+ return -EPROTONOSUPPORT;
+ if (sa->sat_addr.s_node == ATADDR_BCAST ||
+ sa->sat_addr.s_node == 254)
+ return -EINVAL;
+ if (atif) {
+ /* Already setting address */
+ if (atif->status & ATIF_PROBE)
+ return -EBUSY;
+
+ atif->address.s_net = sa->sat_addr.s_net;
+ atif->address.s_node = sa->sat_addr.s_node;
+ atrtr_device_down(dev); /* Flush old routes */
+ } else {
+ atif = atif_add_device(dev, &sa->sat_addr);
+ if (!atif)
+ return -ENOMEM;
+ }
+ atif->nets = *nr;
+
+ /*
+ * Check if the chosen address is used. If so we
+ * error and atalkd will try another.
+ */
+
+ if (!(dev->flags & IFF_LOOPBACK) &&
+ !(dev->flags & IFF_POINTOPOINT) &&
+ atif_probe_device(atif) < 0) {
+ atif_drop_device(dev);
+ return -EADDRINUSE;
+ }
+
+ /* Hey it worked - add the direct routes */
+ sa = (struct sockaddr_at *)&rtdef.rt_gateway;
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_net = atif->address.s_net;
+ sa->sat_addr.s_node = atif->address.s_node;
+ sa = (struct sockaddr_at *)&rtdef.rt_dst;
+ rtdef.rt_flags = RTF_UP;
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_node = ATADDR_ANYNODE;
+ if (dev->flags & IFF_LOOPBACK ||
+ dev->flags & IFF_POINTOPOINT)
+ rtdef.rt_flags |= RTF_HOST;
+
+ /* Routerless initial state */
+ if (nr->nr_firstnet == htons(0) &&
+ nr->nr_lastnet == htons(0xFFFE)) {
+ sa->sat_addr.s_net = atif->address.s_net;
+ atrtr_create(&rtdef, dev);
+ atrtr_set_default(dev);
+ } else {
+ limit = ntohs(nr->nr_lastnet);
+ if (limit - ntohs(nr->nr_firstnet) > 4096) {
+ printk(KERN_WARNING "Too many routes/"
+ "iface.\n");
+ return -EINVAL;
+ }
+ if (add_route)
+ for (ct = ntohs(nr->nr_firstnet);
+ ct <= limit; ct++) {
+ sa->sat_addr.s_net = htons(ct);
+ atrtr_create(&rtdef, dev);
+ }
+ }
+ dev_mc_add_global(dev, aarp_mcast);
+ return 0;
+
+ case SIOCGIFADDR:
+ if (!atif)
+ return -EADDRNOTAVAIL;
+
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr = atif->address;
+ break;
+
+ case SIOCGIFBRDADDR:
+ if (!atif)
+ return -EADDRNOTAVAIL;
+
+ sa->sat_family = AF_APPLETALK;
+ sa->sat_addr.s_net = atif->address.s_net;
+ sa->sat_addr.s_node = ATADDR_BCAST;
+ break;
+
+ case SIOCATALKDIFADDR:
+ case SIOCDIFADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ atalk_dev_down(dev);
+ break;
+
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ /*
+ * for now, we only support proxy AARP on ELAP;
+ * we should be able to do it for LocalTalk, too.
+ */
+ if (dev->type != ARPHRD_ETHER)
+ return -EPROTONOSUPPORT;
+
+ /*
+ * atif points to the current interface on this network;
+ * we aren't concerned about its current status (at
+ * least for now), but it has all the settings about
+ * the network we're going to probe. Consequently, it
+ * must exist.
+ */
+ if (!atif)
+ return -EADDRNOTAVAIL;
+
+ nr = (struct atalk_netrange *)&(atif->nets);
+ /*
+ * Phase 1 is fine on Localtalk but we don't do
+ * Ethertalk phase 1. Anyone wanting to add it go ahead.
+ */
+ if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+ return -EPROTONOSUPPORT;
+
+ if (sa->sat_addr.s_node == ATADDR_BCAST ||
+ sa->sat_addr.s_node == 254)
+ return -EINVAL;
+
+ /*
+ * Check if the chosen address is used. If so we
+ * error and ATCP will try another.
+ */
+ if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0)
+ return -EADDRINUSE;
+
+ /*
+ * We now have an address on the local network, and
+ * the AARP code will defend it for us until we take it
+ * down. We don't set up any routes right now, because
+ * ATCP will install them manually via SIOCADDRT.
+ */
+ break;
+
+ case SIOCDARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sa->sat_family != AF_APPLETALK)
+ return -EINVAL;
+ if (!atif)
+ return -EADDRNOTAVAIL;
+
+ /* give to aarp module to remove proxy entry */
+ aarp_proxy_remove(atif->dev, &(sa->sat_addr));
+ return 0;
+ }
+
+ return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0;
+}
+
+/* Routing ioctl() calls */
+static int atrtr_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct rtentry rt;
+
+ if (copy_from_user(&rt, arg, sizeof(rt)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case SIOCDELRT:
+ if (rt.rt_dst.sa_family != AF_APPLETALK)
+ return -EINVAL;
+ return atrtr_delete(&((struct sockaddr_at *)
+ &rt.rt_dst)->sat_addr);
+
+ case SIOCADDRT: {
+ struct net_device *dev = NULL;
+ if (rt.rt_dev) {
+ char name[IFNAMSIZ];
+ if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ name[IFNAMSIZ-1] = '\0';
+ dev = __dev_get_by_name(&init_net, name);
+ if (!dev)
+ return -ENODEV;
+ }
+ return atrtr_create(&rt, dev);
+ }
+ }
+ return -EINVAL;
+}
+
+/**************************************************************************\
+* *
+* Handling for system calls applied via the various interfaces to an *
+* AppleTalk socket object. *
+* *
+\**************************************************************************/
+
+/*
+ * Checksum: This is 'optional'. It's quite likely also a good
+ * candidate for assembler hackery 8)
+ */
+static unsigned long atalk_sum_partial(const unsigned char *data,
+ int len, unsigned long sum)
+{
+ /* This ought to be unwrapped neatly. I'll trust gcc for now */
+ while (len--) {
+ sum += *data++;
+ sum = rol16(sum, 1);
+ }
+ return sum;
+}
+
+/* Checksum skb data -- similar to skb_checksum */
+static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
+ int len, unsigned long sum)
+{
+ int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
+
+ /* checksum stuff in header space */
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ sum = atalk_sum_partial(skb->data + offset, copy, sum);
+ if ((len -= copy) == 0)
+ return sum;
+
+ offset += copy;
+ }
+
+ /* checksum stuff in frags */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ sum = atalk_sum_partial(vaddr + frag->page_offset +
+ offset - start, copy, sum);
+ kunmap_atomic(vaddr);
+
+ if (!(len -= copy))
+ return sum;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ sum = atalk_sum_skb(frag_iter, offset - start,
+ copy, sum);
+ if ((len -= copy) == 0)
+ return sum;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ BUG_ON(len > 0);
+
+ return sum;
+}
+
+static __be16 atalk_checksum(const struct sk_buff *skb, int len)
+{
+ unsigned long sum;
+
+ /* skip header 4 bytes */
+ sum = atalk_sum_skb(skb, 4, len-4, 0);
+
+ /* Use 0xFFFF for 0. 0 itself means none */
+ return sum ? htons((unsigned short)sum) : htons(0xFFFF);
+}
+
+static struct proto ddp_proto = {
+ .name = "DDP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct atalk_sock),
+};
+
+/*
+ * Create a socket. Initialise the socket, blank the addresses
+ * set the state.
+ */
+static int atalk_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ int rc = -ESOCKTNOSUPPORT;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ /*
+ * We permit SOCK_DGRAM and RAW is an extension. It is trivial to do
+ * and gives you the full ELAP frame. Should be handy for CAP 8)
+ */
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ goto out;
+ rc = -ENOMEM;
+ sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto);
+ if (!sk)
+ goto out;
+ rc = 0;
+ sock->ops = &atalk_dgram_ops;
+ sock_init_data(sock, sk);
+
+ /* Checksums on by default */
+ sock_set_flag(sk, SOCK_ZAPPED);
+out:
+ return rc;
+}
+
+/* Free a socket. No work needed */
+static int atalk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock_hold(sk);
+ lock_sock(sk);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ atalk_destroy_socket(sk);
+
+ release_sock(sk);
+ sock_put(sk);
+ }
+ return 0;
+}
+
+/**
+ * atalk_pick_and_bind_port - Pick a source port when one is not given
+ * @sk: socket to insert into the tables
+ * @sat: address to search for
+ *
+ * Pick a source port when one is not given. If we can find a suitable free
+ * one, we insert the socket into the tables using it.
+ *
+ * This whole operation must be atomic.
+ */
+static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat)
+{
+ int retval;
+
+ write_lock_bh(&atalk_sockets_lock);
+
+ for (sat->sat_port = ATPORT_RESERVED;
+ sat->sat_port < ATPORT_LAST;
+ sat->sat_port++) {
+ struct sock *s;
+
+ sk_for_each(s, &atalk_sockets) {
+ struct atalk_sock *at = at_sk(s);
+
+ if (at->src_net == sat->sat_addr.s_net &&
+ at->src_node == sat->sat_addr.s_node &&
+ at->src_port == sat->sat_port)
+ goto try_next_port;
+ }
+
+ /* Wheee, it's free, assign and insert. */
+ __atalk_insert_socket(sk);
+ at_sk(sk)->src_port = sat->sat_port;
+ retval = 0;
+ goto out;
+
+try_next_port:;
+ }
+
+ retval = -EBUSY;
+out:
+ write_unlock_bh(&atalk_sockets_lock);
+ return retval;
+}
+
+static int atalk_autobind(struct sock *sk)
+{
+ struct atalk_sock *at = at_sk(sk);
+ struct sockaddr_at sat;
+ struct atalk_addr *ap = atalk_find_primary();
+ int n = -EADDRNOTAVAIL;
+
+ if (!ap || ap->s_net == htons(ATADDR_ANYNET))
+ goto out;
+
+ at->src_net = sat.sat_addr.s_net = ap->s_net;
+ at->src_node = sat.sat_addr.s_node = ap->s_node;
+
+ n = atalk_pick_and_bind_port(sk, &sat);
+ if (!n)
+ sock_reset_flag(sk, SOCK_ZAPPED);
+out:
+ return n;
+}
+
+/* Set the address 'our end' of the connection */
+static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_at *addr = (struct sockaddr_at *)uaddr;
+ struct sock *sk = sock->sk;
+ struct atalk_sock *at = at_sk(sk);
+ int err;
+
+ if (!sock_flag(sk, SOCK_ZAPPED) ||
+ addr_len != sizeof(struct sockaddr_at))
+ return -EINVAL;
+
+ if (addr->sat_family != AF_APPLETALK)
+ return -EAFNOSUPPORT;
+
+ lock_sock(sk);
+ if (addr->sat_addr.s_net == htons(ATADDR_ANYNET)) {
+ struct atalk_addr *ap = atalk_find_primary();
+
+ err = -EADDRNOTAVAIL;
+ if (!ap)
+ goto out;
+
+ at->src_net = addr->sat_addr.s_net = ap->s_net;
+ at->src_node = addr->sat_addr.s_node = ap->s_node;
+ } else {
+ err = -EADDRNOTAVAIL;
+ if (!atalk_find_interface(addr->sat_addr.s_net,
+ addr->sat_addr.s_node))
+ goto out;
+
+ at->src_net = addr->sat_addr.s_net;
+ at->src_node = addr->sat_addr.s_node;
+ }
+
+ if (addr->sat_port == ATADDR_ANYPORT) {
+ err = atalk_pick_and_bind_port(sk, addr);
+
+ if (err < 0)
+ goto out;
+ } else {
+ at->src_port = addr->sat_port;
+
+ err = -EADDRINUSE;
+ if (atalk_find_or_insert_socket(sk, addr))
+ goto out;
+ }
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/* Set the address we talk to */
+static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct atalk_sock *at = at_sk(sk);
+ struct sockaddr_at *addr;
+ int err;
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ if (addr_len != sizeof(*addr))
+ return -EINVAL;
+
+ addr = (struct sockaddr_at *)uaddr;
+
+ if (addr->sat_family != AF_APPLETALK)
+ return -EAFNOSUPPORT;
+
+ if (addr->sat_addr.s_node == ATADDR_BCAST &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+#if 1
+ pr_warn("atalk_connect: %s is broken and did not set SO_BROADCAST.\n",
+ current->comm);
+#else
+ return -EACCES;
+#endif
+ }
+
+ lock_sock(sk);
+ err = -EBUSY;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ if (atalk_autobind(sk) < 0)
+ goto out;
+
+ err = -ENETUNREACH;
+ if (!atrtr_get_dev(&addr->sat_addr))
+ goto out;
+
+ at->dest_port = addr->sat_port;
+ at->dest_net = addr->sat_addr.s_net;
+ at->dest_node = addr->sat_addr.s_node;
+
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Find the name of an AppleTalk socket. Just copy the right
+ * fields into the sockaddr.
+ */
+static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_at sat;
+ struct sock *sk = sock->sk;
+ struct atalk_sock *at = at_sk(sk);
+ int err;
+
+ lock_sock(sk);
+ err = -ENOBUFS;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ if (atalk_autobind(sk) < 0)
+ goto out;
+
+ *uaddr_len = sizeof(struct sockaddr_at);
+ memset(&sat, 0, sizeof(sat));
+
+ if (peer) {
+ err = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ sat.sat_addr.s_net = at->dest_net;
+ sat.sat_addr.s_node = at->dest_node;
+ sat.sat_port = at->dest_port;
+ } else {
+ sat.sat_addr.s_net = at->src_net;
+ sat.sat_addr.s_node = at->src_node;
+ sat.sat_port = at->src_port;
+ }
+
+ err = 0;
+ sat.sat_family = AF_APPLETALK;
+ memcpy(uaddr, &sat, sizeof(sat));
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
+{
+ return skb->data[12] == 22;
+}
+
+static int handle_ip_over_ddp(struct sk_buff *skb)
+{
+ struct net_device *dev = __dev_get_by_name(&init_net, "ipddp0");
+ struct net_device_stats *stats;
+
+ /* This needs to be able to handle ipddp"N" devices */
+ if (!dev) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb_pull(skb, 13);
+ skb->dev = dev;
+ skb_reset_transport_header(skb);
+
+ stats = netdev_priv(dev);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len + 13;
+ return netif_rx(skb); /* Send the SKB up to a higher place. */
+}
+#else
+/* make it easy for gcc to optimize this test out, i.e. kill the code */
+#define is_ip_over_ddp(skb) 0
+#define handle_ip_over_ddp(skb) 0
+#endif
+
+static int atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
+ struct ddpehdr *ddp, __u16 len_hops, int origlen)
+{
+ struct atalk_route *rt;
+ struct atalk_addr ta;
+
+ /*
+ * Don't route multicast, etc., packets, or packets sent to "this
+ * network"
+ */
+ if (skb->pkt_type != PACKET_HOST || !ddp->deh_dnet) {
+ /*
+ * FIXME:
+ *
+ * Can it ever happen that a packet is from a PPP iface and
+ * needs to be broadcast onto the default network?
+ */
+ if (dev->type == ARPHRD_PPP)
+ printk(KERN_DEBUG "AppleTalk: didn't forward broadcast "
+ "packet received from PPP iface\n");
+ goto free_it;
+ }
+
+ ta.s_net = ddp->deh_dnet;
+ ta.s_node = ddp->deh_dnode;
+
+ /* Route the packet */
+ rt = atrtr_find(&ta);
+ /* increment hops count */
+ len_hops += 1 << 10;
+ if (!rt || !(len_hops & (15 << 10)))
+ goto free_it;
+
+ /* FIXME: use skb->cb to be able to use shared skbs */
+
+ /*
+ * Route goes through another gateway, so set the target to the
+ * gateway instead.
+ */
+
+ if (rt->flags & RTF_GATEWAY) {
+ ta.s_net = rt->gateway.s_net;
+ ta.s_node = rt->gateway.s_node;
+ }
+
+ /* Fix up skb->len field */
+ skb_trim(skb, min_t(unsigned int, origlen,
+ (rt->dev->hard_header_len +
+ ddp_dl->header_length + (len_hops & 1023))));
+
+ /* FIXME: use skb->cb to be able to use shared skbs */
+ ddp->deh_len_hops = htons(len_hops);
+
+ /*
+ * Send the buffer onwards
+ *
+ * Now we must always be careful. If it's come from LocalTalk to
+ * EtherTalk it might not fit
+ *
+ * Order matters here: If a packet has to be copied to make a new
+ * headroom (rare hopefully) then it won't need unsharing.
+ *
+ * Note. ddp-> becomes invalid at the realloc.
+ */
+ if (skb_headroom(skb) < 22) {
+ /* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */
+ struct sk_buff *nskb = skb_realloc_headroom(skb, 32);
+ kfree_skb(skb);
+ skb = nskb;
+ } else
+ skb = skb_unshare(skb, GFP_ATOMIC);
+
+ /*
+ * If the buffer didn't vanish into the lack of space bitbucket we can
+ * send it.
+ */
+ if (skb == NULL)
+ goto drop;
+
+ if (aarp_send_ddp(rt->dev, skb, &ta, NULL) == NET_XMIT_DROP)
+ return NET_RX_DROP;
+ return NET_RX_SUCCESS;
+free_it:
+ kfree_skb(skb);
+drop:
+ return NET_RX_DROP;
+}
+
+/**
+ * atalk_rcv - Receive a packet (in skb) from device dev
+ * @skb - packet received
+ * @dev - network device where the packet comes from
+ * @pt - packet type
+ *
+ * Receive a packet (in skb) from device dev. This has come from the SNAP
+ * decoder, and on entry skb->transport_header is the DDP header, skb->len
+ * is the DDP header, skb->len is the DDP length. The physical headers
+ * have been extracted. PPP should probably pass frames marked as for this
+ * layer. [ie ARPHRD_ETHERTALK]
+ */
+static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct ddpehdr *ddp;
+ struct sock *sock;
+ struct atalk_iface *atif;
+ struct sockaddr_at tosat;
+ int origlen;
+ __u16 len_hops;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ /* Don't mangle buffer if shared */
+ if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+ goto out;
+
+ /* Size check and make sure header is contiguous */
+ if (!pskb_may_pull(skb, sizeof(*ddp)))
+ goto drop;
+
+ ddp = ddp_hdr(skb);
+
+ len_hops = ntohs(ddp->deh_len_hops);
+
+ /* Trim buffer in case of stray trailing data */
+ origlen = skb->len;
+ skb_trim(skb, min_t(unsigned int, skb->len, len_hops & 1023));
+
+ /*
+ * Size check to see if ddp->deh_len was crap
+ * (Otherwise we'll detonate most spectacularly
+ * in the middle of atalk_checksum() or recvmsg()).
+ */
+ if (skb->len < sizeof(*ddp) || skb->len < (len_hops & 1023)) {
+ pr_debug("AppleTalk: dropping corrupted frame (deh_len=%u, "
+ "skb->len=%u)\n", len_hops & 1023, skb->len);
+ goto drop;
+ }
+
+ /*
+ * Any checksums. Note we don't do htons() on this == is assumed to be
+ * valid for net byte orders all over the networking code...
+ */
+ if (ddp->deh_sum &&
+ atalk_checksum(skb, len_hops & 1023) != ddp->deh_sum)
+ /* Not a valid AppleTalk frame - dustbin time */
+ goto drop;
+
+ /* Check the packet is aimed at us */
+ if (!ddp->deh_dnet) /* Net 0 is 'this network' */
+ atif = atalk_find_anynet(ddp->deh_dnode, dev);
+ else
+ atif = atalk_find_interface(ddp->deh_dnet, ddp->deh_dnode);
+
+ if (!atif) {
+ /* Not ours, so we route the packet via the correct
+ * AppleTalk iface
+ */
+ return atalk_route_packet(skb, dev, ddp, len_hops, origlen);
+ }
+
+ /* if IP over DDP is not selected this code will be optimized out */
+ if (is_ip_over_ddp(skb))
+ return handle_ip_over_ddp(skb);
+ /*
+ * Which socket - atalk_search_socket() looks for a *full match*
+ * of the <net, node, port> tuple.
+ */
+ tosat.sat_addr.s_net = ddp->deh_dnet;
+ tosat.sat_addr.s_node = ddp->deh_dnode;
+ tosat.sat_port = ddp->deh_dport;
+
+ sock = atalk_search_socket(&tosat, atif);
+ if (!sock) /* But not one of our sockets */
+ goto drop;
+
+ /* Queue packet (standard) */
+ if (sock_queue_rcv_skb(sock, skb) < 0)
+ goto drop;
+
+ return NET_RX_SUCCESS;
+
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+
+}
+
+/*
+ * Receive a LocalTalk frame. We make some demands on the caller here.
+ * Caller must provide enough headroom on the packet to pull the short
+ * header and append a long one.
+ */
+static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (!net_eq(dev_net(dev), &init_net))
+ goto freeit;
+
+ /* Expand any short form frames */
+ if (skb_mac_header(skb)[2] == 1) {
+ struct ddpehdr *ddp;
+ /* Find our address */
+ struct atalk_addr *ap = atalk_find_dev_addr(dev);
+
+ if (!ap || skb->len < sizeof(__be16) || skb->len > 1023)
+ goto freeit;
+
+ /* Don't mangle buffer if shared */
+ if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+ return 0;
+
+ /*
+ * The push leaves us with a ddephdr not an shdr, and
+ * handily the port bytes in the right place preset.
+ */
+ ddp = (struct ddpehdr *) skb_push(skb, sizeof(*ddp) - 4);
+
+ /* Now fill in the long header */
+
+ /*
+ * These two first. The mac overlays the new source/dest
+ * network information so we MUST copy these before
+ * we write the network numbers !
+ */
+
+ ddp->deh_dnode = skb_mac_header(skb)[0]; /* From physical header */
+ ddp->deh_snode = skb_mac_header(skb)[1]; /* From physical header */
+
+ ddp->deh_dnet = ap->s_net; /* Network number */
+ ddp->deh_snet = ap->s_net;
+ ddp->deh_sum = 0; /* No checksum */
+ /*
+ * Not sure about this bit...
+ */
+ /* Non routable, so force a drop if we slip up later */
+ ddp->deh_len_hops = htons(skb->len + (DDP_MAXHOPS << 10));
+ }
+ skb_reset_transport_header(skb);
+
+ return atalk_rcv(skb, dev, pt, orig_dev);
+freeit:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct atalk_sock *at = at_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_at *, usat, msg->msg_name);
+ int flags = msg->msg_flags;
+ int loopback = 0;
+ struct sockaddr_at local_satalk, gsat;
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct ddpehdr *ddp;
+ int size;
+ struct atalk_route *rt;
+ int err;
+
+ if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ if (len > DDP_MAXSZ)
+ return -EMSGSIZE;
+
+ lock_sock(sk);
+ if (usat) {
+ err = -EBUSY;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ if (atalk_autobind(sk) < 0)
+ goto out;
+
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(*usat) ||
+ usat->sat_family != AF_APPLETALK)
+ goto out;
+
+ err = -EPERM;
+ /* netatalk didn't implement this check */
+ if (usat->sat_addr.s_node == ATADDR_BCAST &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+ goto out;
+ }
+ } else {
+ err = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ usat = &local_satalk;
+ usat->sat_family = AF_APPLETALK;
+ usat->sat_port = at->dest_port;
+ usat->sat_addr.s_node = at->dest_node;
+ usat->sat_addr.s_net = at->dest_net;
+ }
+
+ /* Build a packet */
+ SOCK_DEBUG(sk, "SK %p: Got address.\n", sk);
+
+ /* For headers */
+ size = sizeof(struct ddpehdr) + len + ddp_dl->header_length;
+
+ if (usat->sat_addr.s_net || usat->sat_addr.s_node == ATADDR_ANYNODE) {
+ rt = atrtr_find(&usat->sat_addr);
+ } else {
+ struct atalk_addr at_hint;
+
+ at_hint.s_node = 0;
+ at_hint.s_net = at->src_net;
+
+ rt = atrtr_find(&at_hint);
+ }
+ err = ENETUNREACH;
+ if (!rt)
+ goto out;
+
+ dev = rt->dev;
+
+ SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n",
+ sk, size, dev->name);
+
+ size += dev->hard_header_len;
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err);
+ lock_sock(sk);
+ if (!skb)
+ goto out;
+
+ skb_reserve(skb, ddp_dl->header_length);
+ skb_reserve(skb, dev->hard_header_len);
+ skb->dev = dev;
+
+ SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk);
+
+ ddp = (struct ddpehdr *)skb_put(skb, sizeof(struct ddpehdr));
+ ddp->deh_len_hops = htons(len + sizeof(*ddp));
+ ddp->deh_dnet = usat->sat_addr.s_net;
+ ddp->deh_snet = at->src_net;
+ ddp->deh_dnode = usat->sat_addr.s_node;
+ ddp->deh_snode = at->src_node;
+ ddp->deh_dport = usat->sat_port;
+ ddp->deh_sport = at->src_port;
+
+ SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+
+ err = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (err) {
+ kfree_skb(skb);
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (sk->sk_no_check_tx)
+ ddp->deh_sum = 0;
+ else
+ ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp));
+
+ /*
+ * Loopback broadcast packets to non gateway targets (ie routes
+ * to group we are in)
+ */
+ if (ddp->deh_dnode == ATADDR_BCAST &&
+ !(rt->flags & RTF_GATEWAY) && !(dev->flags & IFF_LOOPBACK)) {
+ struct sk_buff *skb2 = skb_copy(skb, GFP_KERNEL);
+
+ if (skb2) {
+ loopback = 1;
+ SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk);
+ /*
+ * If it fails it is queued/sent above in the aarp queue
+ */
+ aarp_send_ddp(dev, skb2, &usat->sat_addr, NULL);
+ }
+ }
+
+ if (dev->flags & IFF_LOOPBACK || loopback) {
+ SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk);
+ /* loop back */
+ skb_orphan(skb);
+ if (ddp->deh_dnode == ATADDR_BCAST) {
+ struct atalk_addr at_lo;
+
+ at_lo.s_node = 0;
+ at_lo.s_net = 0;
+
+ rt = atrtr_find(&at_lo);
+ if (!rt) {
+ kfree_skb(skb);
+ err = -ENETUNREACH;
+ goto out;
+ }
+ dev = rt->dev;
+ skb->dev = dev;
+ }
+ ddp_dl->request(ddp_dl, skb, dev->dev_addr);
+ } else {
+ SOCK_DEBUG(sk, "SK %p: send out.\n", sk);
+ if (rt->flags & RTF_GATEWAY) {
+ gsat.sat_addr = rt->gateway;
+ usat = &gsat;
+ }
+
+ /*
+ * If it fails it is queued/sent above in the aarp queue
+ */
+ aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
+ }
+ SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+
+out:
+ release_sock(sk);
+ return err ? : len;
+}
+
+static int atalk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct ddpehdr *ddp;
+ int copied = 0;
+ int offset = 0;
+ int err = 0;
+ struct sk_buff *skb;
+
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &err);
+ lock_sock(sk);
+
+ if (!skb)
+ goto out;
+
+ /* FIXME: use skb->cb to be able to use shared skbs */
+ ddp = ddp_hdr(skb);
+ copied = ntohs(ddp->deh_len_hops) & 1023;
+
+ if (sk->sk_type != SOCK_RAW) {
+ offset = sizeof(*ddp);
+ copied -= offset;
+ }
+
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+ err = skb_copy_datagram_msg(skb, offset, msg, copied);
+
+ if (!err && msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name);
+ sat->sat_family = AF_APPLETALK;
+ sat->sat_port = ddp->deh_sport;
+ sat->sat_addr.s_node = ddp->deh_snode;
+ sat->sat_addr.s_net = ddp->deh_snet;
+ msg->msg_namelen = sizeof(*sat);
+ }
+
+ skb_free_datagram(sk, skb); /* Free the datagram. */
+
+out:
+ release_sock(sk);
+ return err ? : copied;
+}
+
+
+/*
+ * AppleTalk ioctl calls.
+ */
+static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ int rc = -ENOIOCTLCMD;
+ struct sock *sk = sock->sk;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ /* Protocol layer */
+ case TIOCOUTQ: {
+ long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+
+ if (amount < 0)
+ amount = 0;
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ }
+ case TIOCINQ: {
+ /*
+ * These two are safe on a single CPU system as only
+ * user tasks fiddle here
+ */
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ long amount = 0;
+
+ if (skb)
+ amount = skb->len - sizeof(struct ddpehdr);
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ }
+ case SIOCGSTAMP:
+ rc = sock_get_timestamp(sk, argp);
+ break;
+ case SIOCGSTAMPNS:
+ rc = sock_get_timestampns(sk, argp);
+ break;
+ /* Routing */
+ case SIOCADDRT:
+ case SIOCDELRT:
+ rc = -EPERM;
+ if (capable(CAP_NET_ADMIN))
+ rc = atrtr_ioctl(cmd, argp);
+ break;
+ /* Interface */
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCATALKDIFADDR:
+ case SIOCDIFADDR:
+ case SIOCSARP: /* proxy AARP */
+ case SIOCDARP: /* proxy AARP */
+ rtnl_lock();
+ rc = atif_ioctl(cmd, argp);
+ rtnl_unlock();
+ break;
+ }
+
+ return rc;
+}
+
+
+#ifdef CONFIG_COMPAT
+static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ /*
+ * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we
+ * cannot handle it in common code. The data we access if ifreq
+ * here is compatible, so we can simply call the native
+ * handler.
+ */
+ if (cmd == SIOCATALKDIFADDR)
+ return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+
+ return -ENOIOCTLCMD;
+}
+#endif
+
+
+static const struct net_proto_family atalk_family_ops = {
+ .family = PF_APPLETALK,
+ .create = atalk_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops atalk_dgram_ops = {
+ .family = PF_APPLETALK,
+ .owner = THIS_MODULE,
+ .release = atalk_release,
+ .bind = atalk_bind,
+ .connect = atalk_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = atalk_getname,
+ .poll = datagram_poll,
+ .ioctl = atalk_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = atalk_compat_ioctl,
+#endif
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = atalk_sendmsg,
+ .recvmsg = atalk_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct notifier_block ddp_notifier = {
+ .notifier_call = ddp_device_event,
+};
+
+static struct packet_type ltalk_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_LOCALTALK),
+ .func = ltalk_rcv,
+};
+
+static struct packet_type ppptalk_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_PPPTALK),
+ .func = atalk_rcv,
+};
+
+static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B };
+
+/* Export symbols for use by drivers when AppleTalk is a module */
+EXPORT_SYMBOL(atrtr_get_dev);
+EXPORT_SYMBOL(atalk_find_dev_addr);
+
+static const char atalk_err_snap[] __initconst =
+ KERN_CRIT "Unable to register DDP with SNAP.\n";
+
+/* Called by proto.c on kernel start up */
+static int __init atalk_init(void)
+{
+ int rc = proto_register(&ddp_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ (void)sock_register(&atalk_family_ops);
+ ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
+ if (!ddp_dl)
+ printk(atalk_err_snap);
+
+ dev_add_pack(&ltalk_packet_type);
+ dev_add_pack(&ppptalk_packet_type);
+
+ register_netdevice_notifier(&ddp_notifier);
+ aarp_proto_init();
+ atalk_proc_init();
+ atalk_register_sysctl();
+out:
+ return rc;
+}
+module_init(atalk_init);
+
+/*
+ * No explicit module reference count manipulation is needed in the
+ * protocol. Socket layer sets module reference count for us
+ * and interfaces reference counting is done
+ * by the network device layer.
+ *
+ * Ergo, before the AppleTalk module can be removed, all AppleTalk
+ * sockets be closed from user space.
+ */
+static void __exit atalk_exit(void)
+{
+#ifdef CONFIG_SYSCTL
+ atalk_unregister_sysctl();
+#endif /* CONFIG_SYSCTL */
+ atalk_proc_exit();
+ aarp_cleanup_module(); /* General aarp clean-up. */
+ unregister_netdevice_notifier(&ddp_notifier);
+ dev_remove_pack(&ltalk_packet_type);
+ dev_remove_pack(&ppptalk_packet_type);
+ unregister_snap_client(ddp_dl);
+ sock_unregister(PF_APPLETALK);
+ proto_unregister(&ddp_proto);
+}
+module_exit(atalk_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>");
+MODULE_DESCRIPTION("AppleTalk 0.20\n");
+MODULE_ALIAS_NETPROTO(PF_APPLETALK);
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
new file mode 100644
index 000000000..e4158b8b9
--- /dev/null
+++ b/net/appletalk/dev.c
@@ -0,0 +1,45 @@
+/*
+ * Moved here from drivers/net/net_init.c, which is:
+ * Written 1993,1994,1995 by Donald Becker.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_ltalk.h>
+
+static void ltalk_setup(struct net_device *dev)
+{
+ /* Fill in the fields of the device structure with localtalk-generic values. */
+
+ dev->type = ARPHRD_LOCALTLK;
+ dev->hard_header_len = LTALK_HLEN;
+ dev->mtu = LTALK_MTU;
+ dev->addr_len = LTALK_ALEN;
+ dev->tx_queue_len = 10;
+
+ dev->broadcast[0] = 0xFF;
+
+ dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP;
+}
+
+/**
+ * alloc_ltalkdev - Allocates and sets up an localtalk device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this localtalk device
+ *
+ * Fill in the fields of the device structure with localtalk-generic
+ * values. Basically does everything except registering the device.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+
+struct net_device *alloc_ltalkdev(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN,
+ ltalk_setup);
+}
+EXPORT_SYMBOL(alloc_ltalkdev);
diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c
new file mode 100644
index 000000000..ebb864361
--- /dev/null
+++ b/net/appletalk/sysctl_net_atalk.c
@@ -0,0 +1,55 @@
+/*
+ * sysctl_net_atalk.c: sysctl interface to net AppleTalk subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/atalk directory entry (empty =) ). [MS]
+ * Dynamic registration, added aarp entries. (5/30/97 Chris Horn)
+ */
+
+#include <linux/sysctl.h>
+#include <net/sock.h>
+#include <linux/atalk.h>
+
+static struct ctl_table atalk_table[] = {
+ {
+ .procname = "aarp-expiry-time",
+ .data = &sysctl_aarp_expiry_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "aarp-tick-time",
+ .data = &sysctl_aarp_tick_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "aarp-retransmit-limit",
+ .data = &sysctl_aarp_retransmit_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "aarp-resolve-time",
+ .data = &sysctl_aarp_resolve_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { },
+};
+
+static struct ctl_table_header *atalk_table_header;
+
+void atalk_register_sysctl(void)
+{
+ atalk_table_header = register_net_sysctl(&init_net, "net/appletalk", atalk_table);
+}
+
+void atalk_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(atalk_table_header);
+}
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 000000000..754ea103b
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,73 @@
+#
+# Asynchronous Transfer Mode (ATM)
+#
+
+config ATM
+ tristate "Asynchronous Transfer Mode (ATM)"
+ ---help---
+ ATM is a high-speed networking technology for Local Area Networks
+ and Wide Area Networks. It uses a fixed packet size and is
+ connection oriented, allowing for the negotiation of minimum
+ bandwidth requirements.
+
+ In order to participate in an ATM network, your Linux box needs an
+ ATM networking card. If you have that, say Y here and to the driver
+ of your ATM card below.
+
+ Note that you need a set of user-space programs to actually make use
+ of ATM. See the file <file:Documentation/networking/atm.txt> for
+ further details.
+
+config ATM_CLIP
+ tristate "Classical IP over ATM"
+ depends on ATM && INET
+ help
+ Classical IP over ATM for PVCs and SVCs, supporting InARP and
+ ATMARP. If you want to communication with other IP hosts on your ATM
+ network, you will typically either say Y here or to "LAN Emulation
+ (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+ bool "Do NOT send ICMP if no neighbour"
+ depends on ATM_CLIP
+ help
+ Normally, an "ICMP host unreachable" message is sent if a neighbour
+ cannot be reached because there is no VC to it in the kernel's
+ ATMARP table. This may cause problems when ATMARP table entries are
+ briefly removed during revalidation. If you say Y here, packets to
+ such neighbours are silently discarded instead.
+
+config ATM_LANE
+ tristate "LAN Emulation (LANE) support"
+ depends on ATM
+ help
+ LAN Emulation emulates services of existing LANs across an ATM
+ network. Besides operating as a normal ATM end station client, Linux
+ LANE client can also act as an proxy client bridging packets between
+ ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+ tristate "Multi-Protocol Over ATM (MPOA) support"
+ depends on ATM && INET && ATM_LANE!=n
+ help
+ Multi-Protocol Over ATM allows ATM edge devices such as routers,
+ bridges and ATM attached hosts establish direct ATM VCs across
+ subnetwork boundaries. These shortcut connections bypass routers
+ enhancing overall network performance.
+
+config ATM_BR2684
+ tristate "RFC1483/2684 Bridged protocols"
+ depends on ATM && INET
+ help
+ ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
+ This device will act like an ethernet from the kernels point of view,
+ with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+ This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+ bool "Per-VC IP filter kludge"
+ depends on ATM_BR2684
+ help
+ This is an experimental mechanism for users who need to terminate a
+ large number of IP-only vcc's. Do not enable this unless you are sure
+ you know what you are doing.
diff --git a/net/atm/Makefile b/net/atm/Makefile
new file mode 100644
index 000000000..cc50bd1ff
--- /dev/null
+++ b/net/atm/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the ATM Protocol Families.
+#
+
+atm-y := addr.o pvc.o signaling.o svc.o ioctl.o common.o atm_misc.o raw.o resources.o atm_sysfs.o
+mpoa-objs := mpc.o mpoa_caches.o mpoa_proc.o
+
+obj-$(CONFIG_ATM) += atm.o
+obj-$(CONFIG_ATM_CLIP) += clip.o
+obj-$(CONFIG_ATM_BR2684) += br2684.o
+atm-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_ATM_LANE) += lec.o
+obj-$(CONFIG_ATM_MPOA) += mpoa.o
+obj-$(CONFIG_PPPOATM) += pppoatm.o
diff --git a/net/atm/addr.c b/net/atm/addr.c
new file mode 100644
index 000000000..dcda35c66
--- /dev/null
+++ b/net/atm/addr.c
@@ -0,0 +1,161 @@
+/* net/atm/addr.c - Local ATM address registry */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "signaling.h"
+#include "addr.h"
+
+static int check_addr(const struct sockaddr_atmsvc *addr)
+{
+ int i;
+
+ if (addr->sas_family != AF_ATMSVC)
+ return -EAFNOSUPPORT;
+ if (!*addr->sas_addr.pub)
+ return *addr->sas_addr.prv ? 0 : -EINVAL;
+ for (i = 1; i < ATM_E164_LEN + 1; i++) /* make sure it's \0-terminated */
+ if (!addr->sas_addr.pub[i])
+ return 0;
+ return -EINVAL;
+}
+
+static int identical(const struct sockaddr_atmsvc *a, const struct sockaddr_atmsvc *b)
+{
+ if (*a->sas_addr.prv)
+ if (memcmp(a->sas_addr.prv, b->sas_addr.prv, ATM_ESA_LEN))
+ return 0;
+ if (!*a->sas_addr.pub)
+ return !*b->sas_addr.pub;
+ if (!*b->sas_addr.pub)
+ return 0;
+ return !strcmp(a->sas_addr.pub, b->sas_addr.pub);
+}
+
+static void notify_sigd(const struct atm_dev *dev)
+{
+ struct sockaddr_atmpvc pvc;
+
+ pvc.sap_addr.itf = dev->number;
+ sigd_enq(NULL, as_itf_notify, NULL, &pvc, NULL);
+}
+
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t atype)
+{
+ unsigned long flags;
+ struct atm_dev_addr *this, *p;
+ struct list_head *head;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry_safe(this, p, head, entry) {
+ list_del(&this->entry);
+ kfree(this);
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (head == &dev->local)
+ notify_sigd(dev);
+}
+
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t atype)
+{
+ unsigned long flags;
+ struct atm_dev_addr *this;
+ struct list_head *head;
+ int error;
+
+ error = check_addr(addr);
+ if (error)
+ return error;
+ spin_lock_irqsave(&dev->lock, flags);
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry) {
+ if (identical(&this->addr, addr)) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -EEXIST;
+ }
+ }
+ this = kmalloc(sizeof(struct atm_dev_addr), GFP_ATOMIC);
+ if (!this) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOMEM;
+ }
+ this->addr = *addr;
+ list_add(&this->entry, head);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ if (head == &dev->local)
+ notify_sigd(dev);
+ return 0;
+}
+
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t atype)
+{
+ unsigned long flags;
+ struct atm_dev_addr *this;
+ struct list_head *head;
+ int error;
+
+ error = check_addr(addr);
+ if (error)
+ return error;
+ spin_lock_irqsave(&dev->lock, flags);
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry) {
+ if (identical(&this->addr, addr)) {
+ list_del(&this->entry);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ kfree(this);
+ if (head == &dev->local)
+ notify_sigd(dev);
+ return 0;
+ }
+ }
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOENT;
+}
+
+int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user * buf,
+ size_t size, enum atm_addr_type_t atype)
+{
+ unsigned long flags;
+ struct atm_dev_addr *this;
+ struct list_head *head;
+ int total = 0, error;
+ struct sockaddr_atmsvc *tmp_buf, *tmp_bufp;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ if (atype == ATM_ADDR_LECS)
+ head = &dev->lecs;
+ else
+ head = &dev->local;
+ list_for_each_entry(this, head, entry)
+ total += sizeof(struct sockaddr_atmsvc);
+ tmp_buf = tmp_bufp = kmalloc(total, GFP_ATOMIC);
+ if (!tmp_buf) {
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return -ENOMEM;
+ }
+ list_for_each_entry(this, head, entry)
+ memcpy(tmp_bufp++, &this->addr, sizeof(struct sockaddr_atmsvc));
+ spin_unlock_irqrestore(&dev->lock, flags);
+ error = total > size ? -E2BIG : total;
+ if (copy_to_user(buf, tmp_buf, total < size ? total : size))
+ error = -EFAULT;
+ kfree(tmp_buf);
+ return error;
+}
diff --git a/net/atm/addr.h b/net/atm/addr.h
new file mode 100644
index 000000000..6837e9e7e
--- /dev/null
+++ b/net/atm/addr.h
@@ -0,0 +1,20 @@
+/* net/atm/addr.h - Local ATM address registry */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_ADDR_H
+#define NET_ATM_ADDR_H
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t type);
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t type);
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+ enum atm_addr_type_t type);
+int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user *buf,
+ size_t size, enum atm_addr_type_t type);
+
+#endif
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
new file mode 100644
index 000000000..876fbe83e
--- /dev/null
+++ b/net/atm/atm_misc.c
@@ -0,0 +1,101 @@
+/* net/atm/atm_misc.c - Various functions for use by ATM drivers */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL ICA */
+
+#include <linux/module.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/skbuff.h>
+#include <linux/sonet.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+
+int atm_charge(struct atm_vcc *vcc, int truesize)
+{
+ atm_force_charge(vcc, truesize);
+ if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf)
+ return 1;
+ atm_return(vcc, truesize);
+ atomic_inc(&vcc->stats->rx_drop);
+ return 0;
+}
+EXPORT_SYMBOL(atm_charge);
+
+struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc, int pdu_size,
+ gfp_t gfp_flags)
+{
+ struct sock *sk = sk_atm(vcc);
+ int guess = SKB_TRUESIZE(pdu_size);
+
+ atm_force_charge(vcc, guess);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+ struct sk_buff *skb = alloc_skb(pdu_size, gfp_flags);
+
+ if (skb) {
+ atomic_add(skb->truesize-guess,
+ &sk->sk_rmem_alloc);
+ return skb;
+ }
+ }
+ atm_return(vcc, guess);
+ atomic_inc(&vcc->stats->rx_drop);
+ return NULL;
+}
+EXPORT_SYMBOL(atm_alloc_charge);
+
+
+/*
+ * atm_pcr_goal returns the positive PCR if it should be rounded up, the
+ * negative PCR if it should be rounded down, and zero if the maximum available
+ * bandwidth should be used.
+ *
+ * The rules are as follows (* = maximum, - = absent (0), x = value "x",
+ * (x+ = x or next value above x, x- = x or next value below):
+ *
+ * min max pcr result min max pcr result
+ * - - - * (UBR only) x - - x+
+ * - - * * x - * *
+ * - - z z- x - z z-
+ * - * - * x * - x+
+ * - * * * x * * *
+ * - * z z- x * z z-
+ * - y - y- x y - x+
+ * - y * y- x y * y-
+ * - y z z- x y z z-
+ *
+ * All non-error cases can be converted with the following simple set of rules:
+ *
+ * if pcr == z then z-
+ * else if min == x && pcr == - then x+
+ * else if max == y then y-
+ * else *
+ */
+
+int atm_pcr_goal(const struct atm_trafprm *tp)
+{
+ if (tp->pcr && tp->pcr != ATM_MAX_PCR)
+ return -tp->pcr;
+ if (tp->min_pcr && !tp->pcr)
+ return tp->min_pcr;
+ if (tp->max_pcr != ATM_MAX_PCR)
+ return -tp->max_pcr;
+ return 0;
+}
+EXPORT_SYMBOL(atm_pcr_goal);
+
+void sonet_copy_stats(struct k_sonet_stats *from, struct sonet_stats *to)
+{
+#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i)
+ __SONET_ITEMS
+#undef __HANDLE_ITEM
+}
+EXPORT_SYMBOL(sonet_copy_stats);
+
+void sonet_subtract_stats(struct k_sonet_stats *from, struct sonet_stats *to)
+{
+#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i)
+ __SONET_ITEMS
+#undef __HANDLE_ITEM
+}
+EXPORT_SYMBOL(sonet_subtract_stats);
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
new file mode 100644
index 000000000..350bf62b2
--- /dev/null
+++ b/net/atm/atm_sysfs.c
@@ -0,0 +1,190 @@
+/* ATM driver model support. */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/atmdev.h>
+#include "common.h"
+#include "resources.h"
+
+#define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev)
+
+static ssize_t show_type(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type);
+}
+
+static ssize_t show_address(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
+}
+
+static ssize_t show_atmaddress(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ unsigned long flags;
+ struct atm_dev *adev = to_atm_dev(cdev);
+ struct atm_dev_addr *aaddr;
+ int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin;
+ int i, j, count = 0;
+
+ spin_lock_irqsave(&adev->lock, flags);
+ list_for_each_entry(aaddr, &adev->local, entry) {
+ for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) {
+ if (j == *fmt) {
+ count += scnprintf(buf + count,
+ PAGE_SIZE - count, ".");
+ ++fmt;
+ j = 0;
+ }
+ count += scnprintf(buf + count,
+ PAGE_SIZE - count, "%02x",
+ aaddr->addr.sas_addr.prv[i]);
+ }
+ count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+ }
+ spin_unlock_irqrestore(&adev->lock, flags);
+
+ return count;
+}
+
+static ssize_t show_atmindex(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number);
+}
+
+static ssize_t show_carrier(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ adev->signal == ATM_PHY_SIG_LOST ? 0 : 1);
+}
+
+static ssize_t show_link_rate(struct device *cdev,
+ struct device_attribute *attr, char *buf)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+ int link_rate;
+
+ /* show the link rate, not the data rate */
+ switch (adev->link_rate) {
+ case ATM_OC3_PCR:
+ link_rate = 155520000;
+ break;
+ case ATM_OC12_PCR:
+ link_rate = 622080000;
+ break;
+ case ATM_25_PCR:
+ link_rate = 25600000;
+ break;
+ default:
+ link_rate = adev->link_rate * 8 * 53;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate);
+}
+
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL);
+static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL);
+static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL);
+static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
+static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL);
+
+static struct device_attribute *atm_attrs[] = {
+ &dev_attr_atmaddress,
+ &dev_attr_address,
+ &dev_attr_atmindex,
+ &dev_attr_carrier,
+ &dev_attr_type,
+ &dev_attr_link_rate,
+ NULL
+};
+
+
+static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env)
+{
+ struct atm_dev *adev;
+
+ if (!cdev)
+ return -ENODEV;
+
+ adev = to_atm_dev(cdev);
+ if (!adev)
+ return -ENODEV;
+
+ if (add_uevent_var(env, "NAME=%s%d", adev->type, adev->number))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void atm_release(struct device *cdev)
+{
+ struct atm_dev *adev = to_atm_dev(cdev);
+
+ kfree(adev);
+}
+
+static struct class atm_class = {
+ .name = "atm",
+ .dev_release = atm_release,
+ .dev_uevent = atm_uevent,
+};
+
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent)
+{
+ struct device *cdev = &adev->class_dev;
+ int i, j, err;
+
+ cdev->class = &atm_class;
+ cdev->parent = parent;
+ dev_set_drvdata(cdev, adev);
+
+ dev_set_name(cdev, "%s%d", adev->type, adev->number);
+ err = device_register(cdev);
+ if (err < 0)
+ return err;
+
+ for (i = 0; atm_attrs[i]; i++) {
+ err = device_create_file(cdev, atm_attrs[i]);
+ if (err)
+ goto err_out;
+ }
+
+ return 0;
+
+err_out:
+ for (j = 0; j < i; j++)
+ device_remove_file(cdev, atm_attrs[j]);
+ device_del(cdev);
+ return err;
+}
+
+void atm_unregister_sysfs(struct atm_dev *adev)
+{
+ struct device *cdev = &adev->class_dev;
+
+ device_del(cdev);
+}
+
+int __init atm_sysfs_init(void)
+{
+ return class_register(&atm_class);
+}
+
+void __exit atm_sysfs_exit(void)
+{
+ class_unregister(&atm_class);
+}
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
new file mode 100644
index 000000000..cc78538d1
--- /dev/null
+++ b/net/atm/br2684.c
@@ -0,0 +1,886 @@
+/*
+ * Ethernet netdevice using ATM AAL5 as underlying carrier
+ * (RFC1483 obsoleted by RFC2684) for Linux
+ *
+ * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
+ * Eric Kinzie, 2006-2007, US Naval Research Laboratory
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+
+#include <linux/atmbr2684.h>
+
+#include "common.h"
+
+static void skb_debug(const struct sk_buff *skb)
+{
+#ifdef SKB_DEBUG
+#define NUM2PRINT 50
+ print_hex_dump(KERN_DEBUG, "br2684: skb: ", DUMP_OFFSET,
+ 16, 1, skb->data, min(NUM2PRINT, skb->len), true);
+#endif
+}
+
+#define BR2684_ETHERTYPE_LEN 2
+#define BR2684_PAD_LEN 2
+
+#define LLC 0xaa, 0xaa, 0x03
+#define SNAP_BRIDGED 0x00, 0x80, 0xc2
+#define SNAP_ROUTED 0x00, 0x00, 0x00
+#define PID_ETHERNET 0x00, 0x07
+#define ETHERTYPE_IPV4 0x08, 0x00
+#define ETHERTYPE_IPV6 0x86, 0xdd
+#define PAD_BRIDGED 0x00, 0x00
+
+static const unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 };
+static const unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 };
+static const unsigned char llc_oui_pid_pad[] =
+ { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
+static const unsigned char pad[] = { PAD_BRIDGED };
+static const unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
+static const unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
+
+enum br2684_encaps {
+ e_vc = BR2684_ENCAPS_VC,
+ e_llc = BR2684_ENCAPS_LLC,
+};
+
+struct br2684_vcc {
+ struct atm_vcc *atmvcc;
+ struct net_device *device;
+ /* keep old push, pop functions for chaining */
+ void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb);
+ void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb);
+ void (*old_release_cb)(struct atm_vcc *vcc);
+ struct module *old_owner;
+ enum br2684_encaps encaps;
+ struct list_head brvccs;
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+ struct br2684_filter filter;
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+ unsigned int copies_needed, copies_failed;
+ atomic_t qspace;
+};
+
+struct br2684_dev {
+ struct net_device *net_dev;
+ struct list_head br2684_devs;
+ int number;
+ struct list_head brvccs; /* one device <=> one vcc (before xmas) */
+ int mac_was_set;
+ enum br2684_payload payload;
+};
+
+/*
+ * This lock should be held for writing any time the list of devices or
+ * their attached vcc's could be altered. It should be held for reading
+ * any time these are being queried. Note that we sometimes need to
+ * do read-locking under interrupt context, so write locking must block
+ * the current CPU's interrupts
+ */
+static DEFINE_RWLOCK(devs_lock);
+
+static LIST_HEAD(br2684_devs);
+
+static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev)
+{
+ return netdev_priv(net_dev);
+}
+
+static inline struct net_device *list_entry_brdev(const struct list_head *le)
+{
+ return list_entry(le, struct br2684_dev, br2684_devs)->net_dev;
+}
+
+static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc)
+{
+ return (struct br2684_vcc *)(atmvcc->user_back);
+}
+
+static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le)
+{
+ return list_entry(le, struct br2684_vcc, brvccs);
+}
+
+/* Caller should hold read_lock(&devs_lock) */
+static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
+{
+ struct list_head *lh;
+ struct net_device *net_dev;
+ switch (s->method) {
+ case BR2684_FIND_BYNUM:
+ list_for_each(lh, &br2684_devs) {
+ net_dev = list_entry_brdev(lh);
+ if (BRPRIV(net_dev)->number == s->spec.devnum)
+ return net_dev;
+ }
+ break;
+ case BR2684_FIND_BYIFNAME:
+ list_for_each(lh, &br2684_devs) {
+ net_dev = list_entry_brdev(lh);
+ if (!strncmp(net_dev->name, s->spec.ifname, IFNAMSIZ))
+ return net_dev;
+ }
+ break;
+ }
+ return NULL;
+}
+
+static int atm_dev_event(struct notifier_block *this, unsigned long event,
+ void *arg)
+{
+ struct atm_dev *atm_dev = arg;
+ struct list_head *lh;
+ struct net_device *net_dev;
+ struct br2684_vcc *brvcc;
+ struct atm_vcc *atm_vcc;
+ unsigned long flags;
+
+ pr_debug("event=%ld dev=%p\n", event, atm_dev);
+
+ read_lock_irqsave(&devs_lock, flags);
+ list_for_each(lh, &br2684_devs) {
+ net_dev = list_entry_brdev(lh);
+
+ list_for_each_entry(brvcc, &BRPRIV(net_dev)->brvccs, brvccs) {
+ atm_vcc = brvcc->atmvcc;
+ if (atm_vcc && brvcc->atmvcc->dev == atm_dev) {
+
+ if (atm_vcc->dev->signal == ATM_PHY_SIG_LOST)
+ netif_carrier_off(net_dev);
+ else
+ netif_carrier_on(net_dev);
+
+ }
+ }
+ }
+ read_unlock_irqrestore(&devs_lock, flags);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block atm_dev_notifier = {
+ .notifier_call = atm_dev_event,
+};
+
+/* chained vcc->pop function. Check if we should wake the netif_queue */
+static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct br2684_vcc *brvcc = BR2684_VCC(vcc);
+
+ pr_debug("(vcc %p ; net_dev %p )\n", vcc, brvcc->device);
+ brvcc->old_pop(vcc, skb);
+
+ /* If the queue space just went up from zero, wake */
+ if (atomic_inc_return(&brvcc->qspace) == 1)
+ netif_wake_queue(brvcc->device);
+}
+
+/*
+ * Send a packet out a particular vcc. Not to useful right now, but paves
+ * the way for multiple vcc's per itf. Returns true if we can send,
+ * otherwise false
+ */
+static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev,
+ struct br2684_vcc *brvcc)
+{
+ struct br2684_dev *brdev = BRPRIV(dev);
+ struct atm_vcc *atmvcc;
+ int minheadroom = (brvcc->encaps == e_llc) ?
+ ((brdev->payload == p_bridged) ?
+ sizeof(llc_oui_pid_pad) : sizeof(llc_oui_ipv4)) :
+ ((brdev->payload == p_bridged) ? BR2684_PAD_LEN : 0);
+
+ if (skb_headroom(skb) < minheadroom) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom);
+ brvcc->copies_needed++;
+ dev_kfree_skb(skb);
+ if (skb2 == NULL) {
+ brvcc->copies_failed++;
+ return 0;
+ }
+ skb = skb2;
+ }
+
+ if (brvcc->encaps == e_llc) {
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, sizeof(llc_oui_pid_pad));
+ skb_copy_to_linear_data(skb, llc_oui_pid_pad,
+ sizeof(llc_oui_pid_pad));
+ } else if (brdev->payload == p_routed) {
+ unsigned short prot = ntohs(skb->protocol);
+
+ skb_push(skb, sizeof(llc_oui_ipv4));
+ switch (prot) {
+ case ETH_P_IP:
+ skb_copy_to_linear_data(skb, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4));
+ break;
+ case ETH_P_IPV6:
+ skb_copy_to_linear_data(skb, llc_oui_ipv6,
+ sizeof(llc_oui_ipv6));
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return 0;
+ }
+ }
+ } else { /* e_vc */
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, 2);
+ memset(skb->data, 0, 2);
+ }
+ }
+ skb_debug(skb);
+
+ ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
+ atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
+ ATM_SKB(skb)->atm_options = atmvcc->atm_options;
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (atomic_dec_return(&brvcc->qspace) < 1) {
+ /* No more please! */
+ netif_stop_queue(brvcc->device);
+ /* We might have raced with br2684_pop() */
+ if (unlikely(atomic_read(&brvcc->qspace) > 0))
+ netif_wake_queue(brvcc->device);
+ }
+
+ /* If this fails immediately, the skb will be freed and br2684_pop()
+ will wake the queue if appropriate. Just return an error so that
+ the stats are updated correctly */
+ return !atmvcc->send(atmvcc, skb);
+}
+
+static void br2684_release_cb(struct atm_vcc *atmvcc)
+{
+ struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
+
+ if (atomic_read(&brvcc->qspace) > 0)
+ netif_wake_queue(brvcc->device);
+
+ if (brvcc->old_release_cb)
+ brvcc->old_release_cb(atmvcc);
+}
+
+static inline struct br2684_vcc *pick_outgoing_vcc(const struct sk_buff *skb,
+ const struct br2684_dev *brdev)
+{
+ return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */
+}
+
+static netdev_tx_t br2684_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct br2684_dev *brdev = BRPRIV(dev);
+ struct br2684_vcc *brvcc;
+ struct atm_vcc *atmvcc;
+ netdev_tx_t ret = NETDEV_TX_OK;
+
+ pr_debug("skb_dst(skb)=%p\n", skb_dst(skb));
+ read_lock(&devs_lock);
+ brvcc = pick_outgoing_vcc(skb, brdev);
+ if (brvcc == NULL) {
+ pr_debug("no vcc attached to dev %s\n", dev->name);
+ dev->stats.tx_errors++;
+ dev->stats.tx_carrier_errors++;
+ /* netif_stop_queue(dev); */
+ dev_kfree_skb(skb);
+ goto out_devs;
+ }
+ atmvcc = brvcc->atmvcc;
+
+ bh_lock_sock(sk_atm(atmvcc));
+
+ if (test_bit(ATM_VF_RELEASED, &atmvcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &atmvcc->flags) ||
+ !test_bit(ATM_VF_READY, &atmvcc->flags)) {
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ goto out;
+ }
+
+ if (sock_owned_by_user(sk_atm(atmvcc))) {
+ netif_stop_queue(brvcc->device);
+ ret = NETDEV_TX_BUSY;
+ goto out;
+ }
+
+ if (!br2684_xmit_vcc(skb, dev, brvcc)) {
+ /*
+ * We should probably use netif_*_queue() here, but that
+ * involves added complication. We need to walk before
+ * we can run.
+ *
+ * Don't free here! this pointer might be no longer valid!
+ */
+ dev->stats.tx_errors++;
+ dev->stats.tx_fifo_errors++;
+ }
+ out:
+ bh_unlock_sock(sk_atm(atmvcc));
+ out_devs:
+ read_unlock(&devs_lock);
+ return ret;
+}
+
+/*
+ * We remember when the MAC gets set, so we don't override it later with
+ * the ESI of the ATM card of the first VC
+ */
+static int br2684_mac_addr(struct net_device *dev, void *p)
+{
+ int err = eth_mac_addr(dev, p);
+ if (!err)
+ BRPRIV(dev)->mac_was_set = 1;
+ return err;
+}
+
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+/* this IOCTL is experimental. */
+static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg)
+{
+ struct br2684_vcc *brvcc;
+ struct br2684_filter_set fs;
+
+ if (copy_from_user(&fs, arg, sizeof fs))
+ return -EFAULT;
+ if (fs.ifspec.method != BR2684_FIND_BYNOTHING) {
+ /*
+ * This is really a per-vcc thing, but we can also search
+ * by device.
+ */
+ struct br2684_dev *brdev;
+ read_lock(&devs_lock);
+ brdev = BRPRIV(br2684_find_dev(&fs.ifspec));
+ if (brdev == NULL || list_empty(&brdev->brvccs) ||
+ brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */
+ brvcc = NULL;
+ else
+ brvcc = list_entry_brvcc(brdev->brvccs.next);
+ read_unlock(&devs_lock);
+ if (brvcc == NULL)
+ return -ESRCH;
+ } else
+ brvcc = BR2684_VCC(atmvcc);
+ memcpy(&brvcc->filter, &fs.filter, sizeof(brvcc->filter));
+ return 0;
+}
+
+/* Returns 1 if packet should be dropped */
+static inline int
+packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb)
+{
+ if (brvcc->filter.netmask == 0)
+ return 0; /* no filter in place */
+ if (type == htons(ETH_P_IP) &&
+ (((struct iphdr *)(skb->data))->daddr & brvcc->filter.
+ netmask) == brvcc->filter.prefix)
+ return 0;
+ if (type == htons(ETH_P_ARP))
+ return 0;
+ /*
+ * TODO: we should probably filter ARPs too.. don't want to have
+ * them returning values that don't make sense, or is that ok?
+ */
+ return 1; /* drop */
+}
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+
+static void br2684_close_vcc(struct br2684_vcc *brvcc)
+{
+ pr_debug("removing VCC %p from dev %p\n", brvcc, brvcc->device);
+ write_lock_irq(&devs_lock);
+ list_del(&brvcc->brvccs);
+ write_unlock_irq(&devs_lock);
+ brvcc->atmvcc->user_back = NULL; /* what about vcc->recvq ??? */
+ brvcc->atmvcc->release_cb = brvcc->old_release_cb;
+ brvcc->old_push(brvcc->atmvcc, NULL); /* pass on the bad news */
+ module_put(brvcc->old_owner);
+ kfree(brvcc);
+}
+
+/* when AAL5 PDU comes in: */
+static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+ struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
+ struct net_device *net_dev = brvcc->device;
+ struct br2684_dev *brdev = BRPRIV(net_dev);
+
+ pr_debug("\n");
+
+ if (unlikely(skb == NULL)) {
+ /* skb==NULL means VCC is being destroyed */
+ br2684_close_vcc(brvcc);
+ if (list_empty(&brdev->brvccs)) {
+ write_lock_irq(&devs_lock);
+ list_del(&brdev->br2684_devs);
+ write_unlock_irq(&devs_lock);
+ unregister_netdev(net_dev);
+ free_netdev(net_dev);
+ }
+ return;
+ }
+
+ skb_debug(skb);
+ atm_return(atmvcc, skb->truesize);
+ pr_debug("skb from brdev %p\n", brdev);
+ if (brvcc->encaps == e_llc) {
+
+ if (skb->len > 7 && skb->data[7] == 0x01)
+ __skb_trim(skb, skb->len - 4);
+
+ /* accept packets that have "ipv[46]" in the snap header */
+ if ((skb->len >= (sizeof(llc_oui_ipv4))) &&
+ (memcmp(skb->data, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
+ if (memcmp(skb->data + 6, ethertype_ipv6,
+ sizeof(ethertype_ipv6)) == 0)
+ skb->protocol = htons(ETH_P_IPV6);
+ else if (memcmp(skb->data + 6, ethertype_ipv4,
+ sizeof(ethertype_ipv4)) == 0)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ goto error;
+ skb_pull(skb, sizeof(llc_oui_ipv4));
+ skb_reset_network_header(skb);
+ skb->pkt_type = PACKET_HOST;
+ /*
+ * Let us waste some time for checking the encapsulation.
+ * Note, that only 7 char is checked so frames with a valid FCS
+ * are also accepted (but FCS is not checked of course).
+ */
+ } else if ((skb->len >= sizeof(llc_oui_pid_pad)) &&
+ (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
+ skb_pull(skb, sizeof(llc_oui_pid_pad));
+ skb->protocol = eth_type_trans(skb, net_dev);
+ } else
+ goto error;
+
+ } else { /* e_vc */
+ if (brdev->payload == p_routed) {
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->version == 4)
+ skb->protocol = htons(ETH_P_IP);
+ else if (iph->version == 6)
+ skb->protocol = htons(ETH_P_IPV6);
+ else
+ goto error;
+ skb->pkt_type = PACKET_HOST;
+ } else { /* p_bridged */
+ /* first 2 chars should be 0 */
+ if (memcmp(skb->data, pad, BR2684_PAD_LEN) != 0)
+ goto error;
+ skb_pull(skb, BR2684_PAD_LEN);
+ skb->protocol = eth_type_trans(skb, net_dev);
+ }
+ }
+
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+ if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb)))
+ goto dropped;
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+ skb->dev = net_dev;
+ ATM_SKB(skb)->vcc = atmvcc; /* needed ? */
+ pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol));
+ skb_debug(skb);
+ /* sigh, interface is down? */
+ if (unlikely(!(net_dev->flags & IFF_UP)))
+ goto dropped;
+ net_dev->stats.rx_packets++;
+ net_dev->stats.rx_bytes += skb->len;
+ memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+ netif_rx(skb);
+ return;
+
+dropped:
+ net_dev->stats.rx_dropped++;
+ goto free_skb;
+error:
+ net_dev->stats.rx_errors++;
+free_skb:
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Assign a vcc to a dev
+ * Note: we do not have explicit unassign, but look at _push()
+ */
+static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
+{
+ struct br2684_vcc *brvcc;
+ struct br2684_dev *brdev;
+ struct net_device *net_dev;
+ struct atm_backend_br2684 be;
+ int err;
+
+ if (copy_from_user(&be, arg, sizeof be))
+ return -EFAULT;
+ brvcc = kzalloc(sizeof(struct br2684_vcc), GFP_KERNEL);
+ if (!brvcc)
+ return -ENOMEM;
+ /*
+ * Allow two packets in the ATM queue. One actually being sent, and one
+ * for the ATM 'TX done' handler to send. It shouldn't take long to get
+ * the next one from the netdev queue, when we need it. More than that
+ * would be bufferbloat.
+ */
+ atomic_set(&brvcc->qspace, 2);
+ write_lock_irq(&devs_lock);
+ net_dev = br2684_find_dev(&be.ifspec);
+ if (net_dev == NULL) {
+ pr_err("tried to attach to non-existent device\n");
+ err = -ENXIO;
+ goto error;
+ }
+ brdev = BRPRIV(net_dev);
+ if (atmvcc->push == NULL) {
+ err = -EBADFD;
+ goto error;
+ }
+ if (!list_empty(&brdev->brvccs)) {
+ /* Only 1 VCC/dev right now */
+ err = -EEXIST;
+ goto error;
+ }
+ if (be.fcs_in != BR2684_FCSIN_NO ||
+ be.fcs_out != BR2684_FCSOUT_NO ||
+ be.fcs_auto || be.has_vpiid || be.send_padding ||
+ (be.encaps != BR2684_ENCAPS_VC &&
+ be.encaps != BR2684_ENCAPS_LLC) ||
+ be.min_size != 0) {
+ err = -EINVAL;
+ goto error;
+ }
+ pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc);
+ if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) {
+ unsigned char *esi = atmvcc->dev->esi;
+ if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5])
+ memcpy(net_dev->dev_addr, esi, net_dev->addr_len);
+ else
+ net_dev->dev_addr[2] = 1;
+ }
+ list_add(&brvcc->brvccs, &brdev->brvccs);
+ write_unlock_irq(&devs_lock);
+ brvcc->device = net_dev;
+ brvcc->atmvcc = atmvcc;
+ atmvcc->user_back = brvcc;
+ brvcc->encaps = (enum br2684_encaps)be.encaps;
+ brvcc->old_push = atmvcc->push;
+ brvcc->old_pop = atmvcc->pop;
+ brvcc->old_release_cb = atmvcc->release_cb;
+ brvcc->old_owner = atmvcc->owner;
+ barrier();
+ atmvcc->push = br2684_push;
+ atmvcc->pop = br2684_pop;
+ atmvcc->release_cb = br2684_release_cb;
+ atmvcc->owner = THIS_MODULE;
+
+ /* initialize netdev carrier state */
+ if (atmvcc->dev->signal == ATM_PHY_SIG_LOST)
+ netif_carrier_off(net_dev);
+ else
+ netif_carrier_on(net_dev);
+
+ __module_get(THIS_MODULE);
+
+ /* re-process everything received between connection setup and
+ backend setup */
+ vcc_process_recv_queue(atmvcc);
+ return 0;
+
+error:
+ write_unlock_irq(&devs_lock);
+ kfree(brvcc);
+ return err;
+}
+
+static const struct net_device_ops br2684_netdev_ops = {
+ .ndo_start_xmit = br2684_start_xmit,
+ .ndo_set_mac_address = br2684_mac_addr,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static const struct net_device_ops br2684_netdev_ops_routed = {
+ .ndo_start_xmit = br2684_start_xmit,
+ .ndo_set_mac_address = br2684_mac_addr,
+ .ndo_change_mtu = eth_change_mtu
+};
+
+static void br2684_setup(struct net_device *netdev)
+{
+ struct br2684_dev *brdev = BRPRIV(netdev);
+
+ ether_setup(netdev);
+ netdev->hard_header_len += sizeof(llc_oui_pid_pad); /* worst case */
+ brdev->net_dev = netdev;
+
+ netdev->netdev_ops = &br2684_netdev_ops;
+
+ INIT_LIST_HEAD(&brdev->brvccs);
+}
+
+static void br2684_setup_routed(struct net_device *netdev)
+{
+ struct br2684_dev *brdev = BRPRIV(netdev);
+
+ brdev->net_dev = netdev;
+ netdev->hard_header_len = sizeof(llc_oui_ipv4); /* worst case */
+ netdev->netdev_ops = &br2684_netdev_ops_routed;
+ netdev->addr_len = 0;
+ netdev->mtu = 1500;
+ netdev->type = ARPHRD_PPP;
+ netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+ netdev->tx_queue_len = 100;
+ INIT_LIST_HEAD(&brdev->brvccs);
+}
+
+static int br2684_create(void __user *arg)
+{
+ int err;
+ struct net_device *netdev;
+ struct br2684_dev *brdev;
+ struct atm_newif_br2684 ni;
+ enum br2684_payload payload;
+
+ pr_debug("\n");
+
+ if (copy_from_user(&ni, arg, sizeof ni))
+ return -EFAULT;
+
+ if (ni.media & BR2684_FLAG_ROUTED)
+ payload = p_routed;
+ else
+ payload = p_bridged;
+ ni.media &= 0xffff; /* strip flags */
+
+ if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500)
+ return -EINVAL;
+
+ netdev = alloc_netdev(sizeof(struct br2684_dev),
+ ni.ifname[0] ? ni.ifname : "nas%d",
+ NET_NAME_UNKNOWN,
+ (payload == p_routed) ? br2684_setup_routed : br2684_setup);
+ if (!netdev)
+ return -ENOMEM;
+
+ brdev = BRPRIV(netdev);
+
+ pr_debug("registered netdev %s\n", netdev->name);
+ /* open, stop, do_ioctl ? */
+ err = register_netdev(netdev);
+ if (err < 0) {
+ pr_err("register_netdev failed\n");
+ free_netdev(netdev);
+ return err;
+ }
+
+ write_lock_irq(&devs_lock);
+
+ brdev->payload = payload;
+
+ if (list_empty(&br2684_devs)) {
+ /* 1st br2684 device */
+ brdev->number = 1;
+ } else
+ brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
+ list_add_tail(&brdev->br2684_devs, &br2684_devs);
+ write_unlock_irq(&devs_lock);
+ return 0;
+}
+
+/*
+ * This handles ioctls actually performed on our vcc - we must return
+ * -ENOIOCTLCMD for any unrecognized ioctl
+ */
+static int br2684_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct atm_vcc *atmvcc = ATM_SD(sock);
+ void __user *argp = (void __user *)arg;
+ atm_backend_t b;
+
+ int err;
+ switch (cmd) {
+ case ATM_SETBACKEND:
+ case ATM_NEWBACKENDIF:
+ err = get_user(b, (atm_backend_t __user *) argp);
+ if (err)
+ return -EFAULT;
+ if (b != ATM_BACKEND_BR2684)
+ return -ENOIOCTLCMD;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (cmd == ATM_SETBACKEND) {
+ if (sock->state != SS_CONNECTED)
+ return -EINVAL;
+ return br2684_regvcc(atmvcc, argp);
+ } else {
+ return br2684_create(argp);
+ }
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+ case BR2684_SETFILT:
+ if (atmvcc->push != br2684_push)
+ return -ENOIOCTLCMD;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ err = br2684_setfilt(atmvcc, argp);
+
+ return err;
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+ }
+ return -ENOIOCTLCMD;
+}
+
+static struct atm_ioctl br2684_ioctl_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = br2684_ioctl,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *br2684_seq_start(struct seq_file *seq, loff_t * pos)
+ __acquires(devs_lock)
+{
+ read_lock(&devs_lock);
+ return seq_list_start(&br2684_devs, *pos);
+}
+
+static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos)
+{
+ return seq_list_next(v, &br2684_devs, pos);
+}
+
+static void br2684_seq_stop(struct seq_file *seq, void *v)
+ __releases(devs_lock)
+{
+ read_unlock(&devs_lock);
+}
+
+static int br2684_seq_show(struct seq_file *seq, void *v)
+{
+ const struct br2684_dev *brdev = list_entry(v, struct br2684_dev,
+ br2684_devs);
+ const struct net_device *net_dev = brdev->net_dev;
+ const struct br2684_vcc *brvcc;
+
+ seq_printf(seq, "dev %.16s: num=%d, mac=%pM (%s)\n",
+ net_dev->name,
+ brdev->number,
+ net_dev->dev_addr,
+ brdev->mac_was_set ? "set" : "auto");
+
+ list_for_each_entry(brvcc, &brdev->brvccs, brvccs) {
+ seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s"
+ ", failed copies %u/%u"
+ "\n", brvcc->atmvcc->dev->number,
+ brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
+ (brvcc->encaps == e_llc) ? "LLC" : "VC",
+ (brdev->payload == p_bridged) ? "bridged" : "routed",
+ brvcc->copies_failed, brvcc->copies_needed);
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
+#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
+ if (brvcc->filter.netmask != 0)
+ seq_printf(seq, " filter=%d.%d.%d.%d/"
+ "%d.%d.%d.%d\n", bs(prefix), bs(netmask));
+#undef bs
+#undef b1
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+ }
+ return 0;
+}
+
+static const struct seq_operations br2684_seq_ops = {
+ .start = br2684_seq_start,
+ .next = br2684_seq_next,
+ .stop = br2684_seq_stop,
+ .show = br2684_seq_show,
+};
+
+static int br2684_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &br2684_seq_ops);
+}
+
+static const struct file_operations br2684_proc_ops = {
+ .owner = THIS_MODULE,
+ .open = br2684_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+extern struct proc_dir_entry *atm_proc_root; /* from proc.c */
+#endif /* CONFIG_PROC_FS */
+
+static int __init br2684_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *p;
+ p = proc_create("br2684", 0, atm_proc_root, &br2684_proc_ops);
+ if (p == NULL)
+ return -ENOMEM;
+#endif
+ register_atm_ioctl(&br2684_ioctl_ops);
+ register_atmdevice_notifier(&atm_dev_notifier);
+ return 0;
+}
+
+static void __exit br2684_exit(void)
+{
+ struct net_device *net_dev;
+ struct br2684_dev *brdev;
+ struct br2684_vcc *brvcc;
+ deregister_atm_ioctl(&br2684_ioctl_ops);
+
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("br2684", atm_proc_root);
+#endif
+
+
+ unregister_atmdevice_notifier(&atm_dev_notifier);
+
+ while (!list_empty(&br2684_devs)) {
+ net_dev = list_entry_brdev(br2684_devs.next);
+ brdev = BRPRIV(net_dev);
+ while (!list_empty(&brdev->brvccs)) {
+ brvcc = list_entry_brvcc(brdev->brvccs.next);
+ br2684_close_vcc(brvcc);
+ }
+
+ list_del(&brdev->br2684_devs);
+ unregister_netdev(net_dev);
+ free_netdev(net_dev);
+ }
+}
+
+module_init(br2684_init);
+module_exit(br2684_exit);
+
+MODULE_AUTHOR("Marcell GAL");
+MODULE_DESCRIPTION("RFC2684 bridged protocols over ATM/AAL5");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/clip.c b/net/atm/clip.c
new file mode 100644
index 000000000..17e55dfec
--- /dev/null
+++ b/net/atm/clip.c
@@ -0,0 +1,939 @@
+/* net/atm/clip.c - RFC1577 Classical IP over ATM */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h> /* for UINT_MAX */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/if_arp.h> /* for some manifest constants */
+#include <linux/notifier.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>
+#include <linux/atmarp.h>
+#include <linux/capability.h>
+#include <linux/ip.h> /* for net/route.h */
+#include <linux/in.h> /* for struct sockaddr_in */
+#include <linux/if.h> /* for IFF_UP */
+#include <linux/inetdevice.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <net/route.h> /* for struct rtable and routing */
+#include <net/icmp.h> /* icmp_send */
+#include <net/arp.h>
+#include <linux/param.h> /* for HZ */
+#include <linux/uaccess.h>
+#include <asm/byteorder.h> /* for htons etc. */
+#include <linux/atomic.h>
+
+#include "common.h"
+#include "resources.h"
+#include <net/atmclip.h>
+
+static struct net_device *clip_devs;
+static struct atm_vcc *atmarpd;
+static struct timer_list idle_timer;
+static const struct neigh_ops clip_neigh_ops;
+
+static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip)
+{
+ struct sock *sk;
+ struct atmarp_ctrl *ctrl;
+ struct sk_buff *skb;
+
+ pr_debug("(%d)\n", type);
+ if (!atmarpd)
+ return -EUNATCH;
+ skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+ ctrl = (struct atmarp_ctrl *)skb_put(skb, sizeof(struct atmarp_ctrl));
+ ctrl->type = type;
+ ctrl->itf_num = itf;
+ ctrl->ip = ip;
+ atm_force_charge(atmarpd, skb->truesize);
+
+ sk = sk_atm(atmarpd);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ return 0;
+}
+
+static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry)
+{
+ pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh);
+ clip_vcc->entry = entry;
+ clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */
+ clip_vcc->next = entry->vccs;
+ entry->vccs = clip_vcc;
+ entry->neigh->used = jiffies;
+}
+
+static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
+{
+ struct atmarp_entry *entry = clip_vcc->entry;
+ struct clip_vcc **walk;
+
+ if (!entry) {
+ pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+ return;
+ }
+ netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
+ entry->neigh->used = jiffies;
+ for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
+ if (*walk == clip_vcc) {
+ int error;
+
+ *walk = clip_vcc->next; /* atomic */
+ clip_vcc->entry = NULL;
+ if (clip_vcc->xoff)
+ netif_wake_queue(entry->neigh->dev);
+ if (entry->vccs)
+ goto out;
+ entry->expires = jiffies - 1;
+ /* force resolution or expiration */
+ error = neigh_update(entry->neigh, NULL, NUD_NONE,
+ NEIGH_UPDATE_F_ADMIN);
+ if (error)
+ pr_crit("neigh_update failed with %d\n", error);
+ goto out;
+ }
+ pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
+out:
+ netif_tx_unlock_bh(entry->neigh->dev);
+}
+
+/* The neighbour entry n->lock is held. */
+static int neigh_check_cb(struct neighbour *n)
+{
+ struct atmarp_entry *entry = neighbour_priv(n);
+ struct clip_vcc *cv;
+
+ if (n->ops != &clip_neigh_ops)
+ return 0;
+ for (cv = entry->vccs; cv; cv = cv->next) {
+ unsigned long exp = cv->last_use + cv->idle_timeout;
+
+ if (cv->idle_timeout && time_after(jiffies, exp)) {
+ pr_debug("releasing vcc %p->%p of entry %p\n",
+ cv, cv->vcc, entry);
+ vcc_release_async(cv->vcc, -ETIMEDOUT);
+ }
+ }
+
+ if (entry->vccs || time_before(jiffies, entry->expires))
+ return 0;
+
+ if (atomic_read(&n->refcnt) > 1) {
+ struct sk_buff *skb;
+
+ pr_debug("destruction postponed with ref %d\n",
+ atomic_read(&n->refcnt));
+
+ while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
+ dev_kfree_skb(skb);
+
+ return 0;
+ }
+
+ pr_debug("expired neigh %p\n", n);
+ return 1;
+}
+
+static void idle_timer_check(unsigned long dummy)
+{
+ write_lock(&arp_tbl.lock);
+ __neigh_for_each_release(&arp_tbl, neigh_check_cb);
+ mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
+ write_unlock(&arp_tbl.lock);
+}
+
+static int clip_arp_rcv(struct sk_buff *skb)
+{
+ struct atm_vcc *vcc;
+
+ pr_debug("\n");
+ vcc = ATM_SKB(skb)->vcc;
+ if (!vcc || !atm_charge(vcc, skb->truesize)) {
+ dev_kfree_skb_any(skb);
+ return 0;
+ }
+ pr_debug("pushing to %p\n", vcc);
+ pr_debug("using %p\n", CLIP_VCC(vcc)->old_push);
+ CLIP_VCC(vcc)->old_push(vcc, skb);
+ return 0;
+}
+
+static const unsigned char llc_oui[] = {
+ 0xaa, /* DSAP: non-ISO */
+ 0xaa, /* SSAP: non-ISO */
+ 0x03, /* Ctrl: Unnumbered Information Command PDU */
+ 0x00, /* OUI: EtherType */
+ 0x00,
+ 0x00
+};
+
+static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
+
+ pr_debug("\n");
+
+ if (!clip_devs) {
+ atm_return(vcc, skb->truesize);
+ kfree_skb(skb);
+ return;
+ }
+
+ if (!skb) {
+ pr_debug("removing VCC %p\n", clip_vcc);
+ if (clip_vcc->entry)
+ unlink_clip_vcc(clip_vcc);
+ clip_vcc->old_push(vcc, NULL); /* pass on the bad news */
+ kfree(clip_vcc);
+ return;
+ }
+ atm_return(vcc, skb->truesize);
+ skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs;
+ /* clip_vcc->entry == NULL if we don't have an IP address yet */
+ if (!skb->dev) {
+ dev_kfree_skb_any(skb);
+ return;
+ }
+ ATM_SKB(skb)->vcc = vcc;
+ skb_reset_mac_header(skb);
+ if (!clip_vcc->encap ||
+ skb->len < RFC1483LLC_LEN ||
+ memcmp(skb->data, llc_oui, sizeof(llc_oui)))
+ skb->protocol = htons(ETH_P_IP);
+ else {
+ skb->protocol = ((__be16 *)skb->data)[3];
+ skb_pull(skb, RFC1483LLC_LEN);
+ if (skb->protocol == htons(ETH_P_ARP)) {
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+ clip_arp_rcv(skb);
+ return;
+ }
+ }
+ clip_vcc->last_use = jiffies;
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+ memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+ netif_rx(skb);
+}
+
+/*
+ * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that
+ * clip_pop is atomic with respect to the critical section in clip_start_xmit.
+ */
+
+static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
+ struct net_device *dev = skb->dev;
+ int old;
+ unsigned long flags;
+
+ pr_debug("(vcc %p)\n", vcc);
+ clip_vcc->old_pop(vcc, skb);
+ /* skb->dev == NULL in outbound ARP packets */
+ if (!dev)
+ return;
+ spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags);
+ if (atm_may_send(vcc, 0)) {
+ old = xchg(&clip_vcc->xoff, 0);
+ if (old)
+ netif_wake_queue(dev);
+ }
+ spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags);
+}
+
+static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ __be32 *ip = (__be32 *) neigh->primary_key;
+
+ pr_debug("(neigh %p, skb %p)\n", neigh, skb);
+ to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip);
+}
+
+static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb)
+{
+#ifndef CONFIG_ATM_CLIP_NO_ICMP
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+#endif
+ kfree_skb(skb);
+}
+
+static const struct neigh_ops clip_neigh_ops = {
+ .family = AF_INET,
+ .solicit = clip_neigh_solicit,
+ .error_report = clip_neigh_error,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
+};
+
+static int clip_constructor(struct neighbour *neigh)
+{
+ struct atmarp_entry *entry = neighbour_priv(neigh);
+
+ if (neigh->tbl->family != AF_INET)
+ return -EINVAL;
+
+ if (neigh->type != RTN_UNICAST)
+ return -EINVAL;
+
+ neigh->nud_state = NUD_NONE;
+ neigh->ops = &clip_neigh_ops;
+ neigh->output = neigh->ops->output;
+ entry->neigh = neigh;
+ entry->vccs = NULL;
+ entry->expires = jiffies - 1;
+
+ return 0;
+}
+
+/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */
+
+/*
+ * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means
+ * to allocate the neighbour entry but not to ask atmarpd for resolution. Also,
+ * don't increment the usage count. This is used to create entries in
+ * clip_setentry.
+ */
+
+static int clip_encap(struct atm_vcc *vcc, int mode)
+{
+ CLIP_VCC(vcc)->encap = mode;
+ return 0;
+}
+
+static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct clip_priv *clip_priv = PRIV(dev);
+ struct dst_entry *dst = skb_dst(skb);
+ struct atmarp_entry *entry;
+ struct neighbour *n;
+ struct atm_vcc *vcc;
+ struct rtable *rt;
+ __be32 *daddr;
+ int old;
+ unsigned long flags;
+
+ pr_debug("(skb %p)\n", skb);
+ if (!dst) {
+ pr_err("skb_dst(skb) == NULL\n");
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+ rt = (struct rtable *) dst;
+ if (rt->rt_gateway)
+ daddr = &rt->rt_gateway;
+ else
+ daddr = &ip_hdr(skb)->daddr;
+ n = dst_neigh_lookup(dst, daddr);
+ if (!n) {
+ pr_err("NO NEIGHBOUR !\n");
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+ entry = neighbour_priv(n);
+ if (!entry->vccs) {
+ if (time_after(jiffies, entry->expires)) {
+ /* should be resolved */
+ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ;
+ to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key));
+ }
+ if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS)
+ skb_queue_tail(&entry->neigh->arp_queue, skb);
+ else {
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ }
+ goto out_release_neigh;
+ }
+ pr_debug("neigh %p, vccs %p\n", entry, entry->vccs);
+ ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc;
+ pr_debug("using neighbour %p, vcc %p\n", n, vcc);
+ if (entry->vccs->encap) {
+ void *here;
+
+ here = skb_push(skb, RFC1483LLC_LEN);
+ memcpy(here, llc_oui, sizeof(llc_oui));
+ ((__be16 *) here)[3] = skb->protocol;
+ }
+ atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+ ATM_SKB(skb)->atm_options = vcc->atm_options;
+ entry->vccs->last_use = jiffies;
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev);
+ old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */
+ if (old) {
+ pr_warn("XOFF->XOFF transition\n");
+ goto out_release_neigh;
+ }
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+ vcc->send(vcc, skb);
+ if (atm_may_send(vcc, 0)) {
+ entry->vccs->xoff = 0;
+ goto out_release_neigh;
+ }
+ spin_lock_irqsave(&clip_priv->xoff_lock, flags);
+ netif_stop_queue(dev); /* XOFF -> throttle immediately */
+ barrier();
+ if (!entry->vccs->xoff)
+ netif_start_queue(dev);
+ /* Oh, we just raced with clip_pop. netif_start_queue should be
+ good enough, because nothing should really be asleep because
+ of the brief netif_stop_queue. If this isn't true or if it
+ changes, use netif_wake_queue instead. */
+ spin_unlock_irqrestore(&clip_priv->xoff_lock, flags);
+out_release_neigh:
+ neigh_release(n);
+ return NETDEV_TX_OK;
+}
+
+static int clip_mkip(struct atm_vcc *vcc, int timeout)
+{
+ struct clip_vcc *clip_vcc;
+
+ if (!vcc->push)
+ return -EBADFD;
+ clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL);
+ if (!clip_vcc)
+ return -ENOMEM;
+ pr_debug("%p vcc %p\n", clip_vcc, vcc);
+ clip_vcc->vcc = vcc;
+ vcc->user_back = clip_vcc;
+ set_bit(ATM_VF_IS_CLIP, &vcc->flags);
+ clip_vcc->entry = NULL;
+ clip_vcc->xoff = 0;
+ clip_vcc->encap = 1;
+ clip_vcc->last_use = jiffies;
+ clip_vcc->idle_timeout = timeout * HZ;
+ clip_vcc->old_push = vcc->push;
+ clip_vcc->old_pop = vcc->pop;
+ vcc->push = clip_push;
+ vcc->pop = clip_pop;
+
+ /* re-process everything received between connection setup and MKIP */
+ vcc_process_recv_queue(vcc);
+
+ return 0;
+}
+
+static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
+{
+ struct neighbour *neigh;
+ struct atmarp_entry *entry;
+ int error;
+ struct clip_vcc *clip_vcc;
+ struct rtable *rt;
+
+ if (vcc->push != clip_push) {
+ pr_warn("non-CLIP VCC\n");
+ return -EBADF;
+ }
+ clip_vcc = CLIP_VCC(vcc);
+ if (!ip) {
+ if (!clip_vcc->entry) {
+ pr_err("hiding hidden ATMARP entry\n");
+ return 0;
+ }
+ pr_debug("remove\n");
+ unlink_clip_vcc(clip_vcc);
+ return 0;
+ }
+ rt = ip_route_output(&init_net, ip, 0, 1, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1);
+ ip_rt_put(rt);
+ if (!neigh)
+ return -ENOMEM;
+ entry = neighbour_priv(neigh);
+ if (entry != clip_vcc->entry) {
+ if (!clip_vcc->entry)
+ pr_debug("add\n");
+ else {
+ pr_debug("update\n");
+ unlink_clip_vcc(clip_vcc);
+ }
+ link_vcc(clip_vcc, entry);
+ }
+ error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
+ NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ return error;
+}
+
+static const struct net_device_ops clip_netdev_ops = {
+ .ndo_start_xmit = clip_start_xmit,
+ .ndo_neigh_construct = clip_constructor,
+};
+
+static void clip_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &clip_netdev_ops;
+ dev->type = ARPHRD_ATM;
+ dev->neigh_priv_len = sizeof(struct atmarp_entry);
+ dev->hard_header_len = RFC1483LLC_LEN;
+ dev->mtu = RFC1626_MTU;
+ dev->tx_queue_len = 100; /* "normal" queue (packets) */
+ /* When using a "real" qdisc, the qdisc determines the queue */
+ /* length. tx_queue_len is only used for the default case, */
+ /* without any more elaborate queuing. 100 is a reasonable */
+ /* compromise between decent burst-tolerance and protection */
+ /* against memory hogs. */
+ netif_keep_dst(dev);
+}
+
+static int clip_create(int number)
+{
+ struct net_device *dev;
+ struct clip_priv *clip_priv;
+ int error;
+
+ if (number != -1) {
+ for (dev = clip_devs; dev; dev = PRIV(dev)->next)
+ if (PRIV(dev)->number == number)
+ return -EEXIST;
+ } else {
+ number = 0;
+ for (dev = clip_devs; dev; dev = PRIV(dev)->next)
+ if (PRIV(dev)->number >= number)
+ number = PRIV(dev)->number + 1;
+ }
+ dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN,
+ clip_setup);
+ if (!dev)
+ return -ENOMEM;
+ clip_priv = PRIV(dev);
+ sprintf(dev->name, "atm%d", number);
+ spin_lock_init(&clip_priv->xoff_lock);
+ clip_priv->number = number;
+ error = register_netdev(dev);
+ if (error) {
+ free_netdev(dev);
+ return error;
+ }
+ clip_priv->next = clip_devs;
+ clip_devs = dev;
+ pr_debug("registered (net:%s)\n", dev->name);
+ return number;
+}
+
+static int clip_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ /* ignore non-CLIP devices */
+ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ pr_debug("NETDEV_UP\n");
+ to_atmarpd(act_up, PRIV(dev)->number, 0);
+ break;
+ case NETDEV_GOING_DOWN:
+ pr_debug("NETDEV_DOWN\n");
+ to_atmarpd(act_down, PRIV(dev)->number, 0);
+ break;
+ case NETDEV_CHANGE:
+ case NETDEV_CHANGEMTU:
+ pr_debug("NETDEV_CHANGE*\n");
+ to_atmarpd(act_change, PRIV(dev)->number, 0);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int clip_inet_event(struct notifier_block *this, unsigned long event,
+ void *ifa)
+{
+ struct in_device *in_dev;
+ struct netdev_notifier_info info;
+
+ in_dev = ((struct in_ifaddr *)ifa)->ifa_dev;
+ /*
+ * Transitions are of the down-change-up type, so it's sufficient to
+ * handle the change on up.
+ */
+ if (event != NETDEV_UP)
+ return NOTIFY_DONE;
+ netdev_notifier_info_init(&info, in_dev->dev);
+ return clip_device_event(this, NETDEV_CHANGE, &info);
+}
+
+static struct notifier_block clip_dev_notifier = {
+ .notifier_call = clip_device_event,
+};
+
+
+
+static struct notifier_block clip_inet_notifier = {
+ .notifier_call = clip_inet_event,
+};
+
+
+
+static void atmarpd_close(struct atm_vcc *vcc)
+{
+ pr_debug("\n");
+
+ rtnl_lock();
+ atmarpd = NULL;
+ skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
+ rtnl_unlock();
+
+ pr_debug("(done)\n");
+ module_put(THIS_MODULE);
+}
+
+static struct atmdev_ops atmarpd_dev_ops = {
+ .close = atmarpd_close
+};
+
+
+static struct atm_dev atmarpd_dev = {
+ .ops = &atmarpd_dev_ops,
+ .type = "arpd",
+ .number = 999,
+ .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock)
+};
+
+
+static int atm_init_atmarp(struct atm_vcc *vcc)
+{
+ rtnl_lock();
+ if (atmarpd) {
+ rtnl_unlock();
+ return -EADDRINUSE;
+ }
+
+ mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
+
+ atmarpd = vcc;
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
+ /* allow replies and avoid getting closed if signaling dies */
+ vcc->dev = &atmarpd_dev;
+ vcc_insert_socket(sk_atm(vcc));
+ vcc->push = NULL;
+ vcc->pop = NULL; /* crash */
+ vcc->push_oam = NULL; /* crash */
+ rtnl_unlock();
+ return 0;
+}
+
+static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int err = 0;
+
+ switch (cmd) {
+ case SIOCMKCLIP:
+ case ATMARPD_CTRL:
+ case ATMARP_MKIP:
+ case ATMARP_SETENTRY:
+ case ATMARP_ENCAP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ switch (cmd) {
+ case SIOCMKCLIP:
+ err = clip_create(arg);
+ break;
+ case ATMARPD_CTRL:
+ err = atm_init_atmarp(vcc);
+ if (!err) {
+ sock->state = SS_CONNECTED;
+ __module_get(THIS_MODULE);
+ }
+ break;
+ case ATMARP_MKIP:
+ err = clip_mkip(vcc, arg);
+ break;
+ case ATMARP_SETENTRY:
+ err = clip_setentry(vcc, (__force __be32)arg);
+ break;
+ case ATMARP_ENCAP:
+ err = clip_encap(vcc, arg);
+ break;
+ }
+ return err;
+}
+
+static struct atm_ioctl clip_ioctl_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = clip_ioctl,
+};
+
+#ifdef CONFIG_PROC_FS
+
+static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr)
+{
+ static int code[] = { 1, 2, 10, 6, 1, 0 };
+ static int e164[] = { 1, 8, 4, 6, 1, 0 };
+
+ if (*addr->sas_addr.pub) {
+ seq_printf(seq, "%s", addr->sas_addr.pub);
+ if (*addr->sas_addr.prv)
+ seq_putc(seq, '+');
+ } else if (!*addr->sas_addr.prv) {
+ seq_printf(seq, "%s", "(none)");
+ return;
+ }
+ if (*addr->sas_addr.prv) {
+ unsigned char *prv = addr->sas_addr.prv;
+ int *fields;
+ int i, j;
+
+ fields = *prv == ATM_AFI_E164 ? e164 : code;
+ for (i = 0; fields[i]; i++) {
+ for (j = fields[i]; j; j--)
+ seq_printf(seq, "%02X", *prv++);
+ if (fields[i + 1])
+ seq_putc(seq, '.');
+ }
+ }
+}
+
+/* This means the neighbour entry has no attached VCC objects. */
+#define SEQ_NO_VCC_TOKEN ((void *) 2)
+
+static void atmarp_info(struct seq_file *seq, struct neighbour *n,
+ struct atmarp_entry *entry, struct clip_vcc *clip_vcc)
+{
+ struct net_device *dev = n->dev;
+ unsigned long exp;
+ char buf[17];
+ int svc, llc, off;
+
+ svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) ||
+ (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC));
+
+ llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap);
+
+ if (clip_vcc == SEQ_NO_VCC_TOKEN)
+ exp = entry->neigh->used;
+ else
+ exp = clip_vcc->last_use;
+
+ exp = (jiffies - exp) / HZ;
+
+ seq_printf(seq, "%-6s%-4s%-4s%5ld ",
+ dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp);
+
+ off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key);
+ while (off < 16)
+ buf[off++] = ' ';
+ buf[off] = '\0';
+ seq_printf(seq, "%s", buf);
+
+ if (clip_vcc == SEQ_NO_VCC_TOKEN) {
+ if (time_before(jiffies, entry->expires))
+ seq_printf(seq, "(resolving)\n");
+ else
+ seq_printf(seq, "(expired, ref %d)\n",
+ atomic_read(&entry->neigh->refcnt));
+ } else if (!svc) {
+ seq_printf(seq, "%d.%d.%d\n",
+ clip_vcc->vcc->dev->number,
+ clip_vcc->vcc->vpi, clip_vcc->vcc->vci);
+ } else {
+ svc_addr(seq, &clip_vcc->vcc->remote);
+ seq_putc(seq, '\n');
+ }
+}
+
+struct clip_seq_state {
+ /* This member must be first. */
+ struct neigh_seq_state ns;
+
+ /* Local to clip specific iteration. */
+ struct clip_vcc *vcc;
+};
+
+static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e,
+ struct clip_vcc *curr)
+{
+ if (!curr) {
+ curr = e->vccs;
+ if (!curr)
+ return SEQ_NO_VCC_TOKEN;
+ return curr;
+ }
+ if (curr == SEQ_NO_VCC_TOKEN)
+ return NULL;
+
+ curr = curr->next;
+
+ return curr;
+}
+
+static void *clip_seq_vcc_walk(struct clip_seq_state *state,
+ struct atmarp_entry *e, loff_t * pos)
+{
+ struct clip_vcc *vcc = state->vcc;
+
+ vcc = clip_seq_next_vcc(e, vcc);
+ if (vcc && pos != NULL) {
+ while (*pos) {
+ vcc = clip_seq_next_vcc(e, vcc);
+ if (!vcc)
+ break;
+ --(*pos);
+ }
+ }
+ state->vcc = vcc;
+
+ return vcc;
+}
+
+static void *clip_seq_sub_iter(struct neigh_seq_state *_state,
+ struct neighbour *n, loff_t * pos)
+{
+ struct clip_seq_state *state = (struct clip_seq_state *)_state;
+
+ if (n->dev->type != ARPHRD_ATM)
+ return NULL;
+
+ return clip_seq_vcc_walk(state, neighbour_priv(n), pos);
+}
+
+static void *clip_seq_start(struct seq_file *seq, loff_t * pos)
+{
+ struct clip_seq_state *state = seq->private;
+ state->ns.neigh_sub_iter = clip_seq_sub_iter;
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY);
+}
+
+static int clip_seq_show(struct seq_file *seq, void *v)
+{
+ static char atm_arp_banner[] =
+ "IPitf TypeEncp Idle IP address ATM address\n";
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, atm_arp_banner);
+ } else {
+ struct clip_seq_state *state = seq->private;
+ struct clip_vcc *vcc = state->vcc;
+ struct neighbour *n = v;
+
+ atmarp_info(seq, n, neighbour_priv(n), vcc);
+ }
+ return 0;
+}
+
+static const struct seq_operations arp_seq_ops = {
+ .start = clip_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = clip_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct clip_seq_state));
+}
+
+static const struct file_operations arp_seq_fops = {
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+ .owner = THIS_MODULE
+};
+#endif
+
+static void atm_clip_exit_noproc(void);
+
+static int __init atm_clip_init(void)
+{
+ register_atm_ioctl(&clip_ioctl_ops);
+ register_netdevice_notifier(&clip_dev_notifier);
+ register_inetaddr_notifier(&clip_inet_notifier);
+
+ setup_timer(&idle_timer, idle_timer_check, 0);
+
+#ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *p;
+
+ p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops);
+ if (!p) {
+ pr_err("Unable to initialize /proc/net/atm/arp\n");
+ atm_clip_exit_noproc();
+ return -ENOMEM;
+ }
+ }
+#endif
+
+ return 0;
+}
+
+static void atm_clip_exit_noproc(void)
+{
+ struct net_device *dev, *next;
+
+ unregister_inetaddr_notifier(&clip_inet_notifier);
+ unregister_netdevice_notifier(&clip_dev_notifier);
+
+ deregister_atm_ioctl(&clip_ioctl_ops);
+
+ /* First, stop the idle timer, so it stops banging
+ * on the table.
+ */
+ del_timer_sync(&idle_timer);
+
+ dev = clip_devs;
+ while (dev) {
+ next = PRIV(dev)->next;
+ unregister_netdev(dev);
+ free_netdev(dev);
+ dev = next;
+ }
+}
+
+static void __exit atm_clip_exit(void)
+{
+ remove_proc_entry("arp", atm_proc_root);
+
+ atm_clip_exit_noproc();
+}
+
+module_init(atm_clip_init);
+module_exit(atm_clip_exit);
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("Classical/IP over ATM interface");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/common.c b/net/atm/common.c
new file mode 100644
index 000000000..ed0466637
--- /dev/null
+++ b/net/atm/common.c
@@ -0,0 +1,901 @@
+/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/net.h> /* struct socket, struct proto_ops */
+#include <linux/atm.h> /* ATM stuff */
+#include <linux/atmdev.h>
+#include <linux/socket.h> /* SOL_SOCKET */
+#include <linux/errno.h> /* error codes */
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/time.h> /* struct timeval */
+#include <linux/skbuff.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/sock.h> /* struct sock */
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+
+#include <linux/atomic.h>
+
+#include "resources.h" /* atm_find_dev */
+#include "common.h" /* prototypes */
+#include "protocols.h" /* atm_init_<transport> */
+#include "addr.h" /* address registry */
+#include "signaling.h" /* for WAITING and sigd_attach */
+
+struct hlist_head vcc_hash[VCC_HTABLE_SIZE];
+EXPORT_SYMBOL(vcc_hash);
+
+DEFINE_RWLOCK(vcc_sklist_lock);
+EXPORT_SYMBOL(vcc_sklist_lock);
+
+static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain);
+
+static void __vcc_insert_socket(struct sock *sk)
+{
+ struct atm_vcc *vcc = atm_sk(sk);
+ struct hlist_head *head = &vcc_hash[vcc->vci & (VCC_HTABLE_SIZE - 1)];
+ sk->sk_hash = vcc->vci & (VCC_HTABLE_SIZE - 1);
+ sk_add_node(sk, head);
+}
+
+void vcc_insert_socket(struct sock *sk)
+{
+ write_lock_irq(&vcc_sklist_lock);
+ __vcc_insert_socket(sk);
+ write_unlock_irq(&vcc_sklist_lock);
+}
+EXPORT_SYMBOL(vcc_insert_socket);
+
+static void vcc_remove_socket(struct sock *sk)
+{
+ write_lock_irq(&vcc_sklist_lock);
+ sk_del_node_init(sk);
+ write_unlock_irq(&vcc_sklist_lock);
+}
+
+static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size)
+{
+ struct sk_buff *skb;
+ struct sock *sk = sk_atm(vcc);
+
+ if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) {
+ pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
+ sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
+ return NULL;
+ }
+ while (!(skb = alloc_skb(size, GFP_KERNEL)))
+ schedule();
+ pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ return skb;
+}
+
+static void vcc_sock_destruct(struct sock *sk)
+{
+ if (atomic_read(&sk->sk_rmem_alloc))
+ printk(KERN_DEBUG "%s: rmem leakage (%d bytes) detected.\n",
+ __func__, atomic_read(&sk->sk_rmem_alloc));
+
+ if (atomic_read(&sk->sk_wmem_alloc))
+ printk(KERN_DEBUG "%s: wmem leakage (%d bytes) detected.\n",
+ __func__, atomic_read(&sk->sk_wmem_alloc));
+}
+
+static void vcc_def_wakeup(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up(&wq->wait);
+ rcu_read_unlock();
+}
+
+static inline int vcc_writable(struct sock *sk)
+{
+ struct atm_vcc *vcc = atm_sk(sk);
+
+ return (vcc->qos.txtp.max_sdu +
+ atomic_read(&sk->sk_wmem_alloc)) <= sk->sk_sndbuf;
+}
+
+static void vcc_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+
+ if (vcc_writable(sk)) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
+
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+
+ rcu_read_unlock();
+}
+
+static void vcc_release_cb(struct sock *sk)
+{
+ struct atm_vcc *vcc = atm_sk(sk);
+
+ if (vcc->release_cb)
+ vcc->release_cb(vcc);
+}
+
+static struct proto vcc_proto = {
+ .name = "VCC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct atm_vcc),
+ .release_cb = vcc_release_cb,
+};
+
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
+{
+ struct sock *sk;
+ struct atm_vcc *vcc;
+
+ sock->sk = NULL;
+ if (sock->type == SOCK_STREAM)
+ return -EINVAL;
+ sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto);
+ if (!sk)
+ return -ENOMEM;
+ sock_init_data(sock, sk);
+ sk->sk_state_change = vcc_def_wakeup;
+ sk->sk_write_space = vcc_write_space;
+
+ vcc = atm_sk(sk);
+ vcc->dev = NULL;
+ memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
+ memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
+ vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
+ atomic_set(&sk->sk_wmem_alloc, 1);
+ atomic_set(&sk->sk_rmem_alloc, 0);
+ vcc->push = NULL;
+ vcc->pop = NULL;
+ vcc->owner = NULL;
+ vcc->push_oam = NULL;
+ vcc->release_cb = NULL;
+ vcc->vpi = vcc->vci = 0; /* no VCI/VPI yet */
+ vcc->atm_options = vcc->aal_options = 0;
+ sk->sk_destruct = vcc_sock_destruct;
+ return 0;
+}
+
+static void vcc_destroy_socket(struct sock *sk)
+{
+ struct atm_vcc *vcc = atm_sk(sk);
+ struct sk_buff *skb;
+
+ set_bit(ATM_VF_CLOSE, &vcc->flags);
+ clear_bit(ATM_VF_READY, &vcc->flags);
+ if (vcc->dev) {
+ if (vcc->dev->ops->close)
+ vcc->dev->ops->close(vcc);
+ if (vcc->push)
+ vcc->push(vcc, NULL); /* atmarpd has no push */
+ module_put(vcc->owner);
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ atm_return(vcc, skb->truesize);
+ kfree_skb(skb);
+ }
+
+ module_put(vcc->dev->ops->owner);
+ atm_dev_put(vcc->dev);
+ }
+
+ vcc_remove_socket(sk);
+}
+
+int vcc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ lock_sock(sk);
+ vcc_destroy_socket(sock->sk);
+ release_sock(sk);
+ sock_put(sk);
+ }
+
+ return 0;
+}
+
+void vcc_release_async(struct atm_vcc *vcc, int reply)
+{
+ struct sock *sk = sk_atm(vcc);
+
+ set_bit(ATM_VF_CLOSE, &vcc->flags);
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sk->sk_err = -reply;
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ sk->sk_state_change(sk);
+}
+EXPORT_SYMBOL(vcc_release_async);
+
+void vcc_process_recv_queue(struct atm_vcc *vcc)
+{
+ struct sk_buff_head queue, *rq;
+ struct sk_buff *skb, *tmp;
+ unsigned long flags;
+
+ __skb_queue_head_init(&queue);
+ rq = &sk_atm(vcc)->sk_receive_queue;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ skb_queue_splice_init(rq, &queue);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ skb_queue_walk_safe(&queue, skb, tmp) {
+ __skb_unlink(skb, &queue);
+ vcc->push(vcc, skb);
+ }
+}
+EXPORT_SYMBOL(vcc_process_recv_queue);
+
+void atm_dev_signal_change(struct atm_dev *dev, char signal)
+{
+ pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n",
+ __func__, signal, dev, dev->number, dev->signal);
+
+ /* atm driver sending invalid signal */
+ WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND);
+
+ if (dev->signal == signal)
+ return; /* no change */
+
+ dev->signal = signal;
+
+ atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev);
+}
+EXPORT_SYMBOL(atm_dev_signal_change);
+
+void atm_dev_release_vccs(struct atm_dev *dev)
+{
+ int i;
+
+ write_lock_irq(&vcc_sklist_lock);
+ for (i = 0; i < VCC_HTABLE_SIZE; i++) {
+ struct hlist_head *head = &vcc_hash[i];
+ struct hlist_node *tmp;
+ struct sock *s;
+ struct atm_vcc *vcc;
+
+ sk_for_each_safe(s, tmp, head) {
+ vcc = atm_sk(s);
+ if (vcc->dev == dev) {
+ vcc_release_async(vcc, -EPIPE);
+ sk_del_node_init(s);
+ }
+ }
+ }
+ write_unlock_irq(&vcc_sklist_lock);
+}
+EXPORT_SYMBOL(atm_dev_release_vccs);
+
+static int adjust_tp(struct atm_trafprm *tp, unsigned char aal)
+{
+ int max_sdu;
+
+ if (!tp->traffic_class)
+ return 0;
+ switch (aal) {
+ case ATM_AAL0:
+ max_sdu = ATM_CELL_SIZE-1;
+ break;
+ case ATM_AAL34:
+ max_sdu = ATM_MAX_AAL34_PDU;
+ break;
+ default:
+ pr_warn("AAL problems ... (%d)\n", aal);
+ /* fall through */
+ case ATM_AAL5:
+ max_sdu = ATM_MAX_AAL5_PDU;
+ }
+ if (!tp->max_sdu)
+ tp->max_sdu = max_sdu;
+ else if (tp->max_sdu > max_sdu)
+ return -EINVAL;
+ if (!tp->max_cdv)
+ tp->max_cdv = ATM_MAX_CDV;
+ return 0;
+}
+
+static int check_ci(const struct atm_vcc *vcc, short vpi, int vci)
+{
+ struct hlist_head *head = &vcc_hash[vci & (VCC_HTABLE_SIZE - 1)];
+ struct sock *s;
+ struct atm_vcc *walk;
+
+ sk_for_each(s, head) {
+ walk = atm_sk(s);
+ if (walk->dev != vcc->dev)
+ continue;
+ if (test_bit(ATM_VF_ADDR, &walk->flags) && walk->vpi == vpi &&
+ walk->vci == vci && ((walk->qos.txtp.traffic_class !=
+ ATM_NONE && vcc->qos.txtp.traffic_class != ATM_NONE) ||
+ (walk->qos.rxtp.traffic_class != ATM_NONE &&
+ vcc->qos.rxtp.traffic_class != ATM_NONE)))
+ return -EADDRINUSE;
+ }
+
+ /* allow VCCs with same VPI/VCI iff they don't collide on
+ TX/RX (but we may refuse such sharing for other reasons,
+ e.g. if protocol requires to have both channels) */
+
+ return 0;
+}
+
+static int find_ci(const struct atm_vcc *vcc, short *vpi, int *vci)
+{
+ static short p; /* poor man's per-device cache */
+ static int c;
+ short old_p;
+ int old_c;
+ int err;
+
+ if (*vpi != ATM_VPI_ANY && *vci != ATM_VCI_ANY) {
+ err = check_ci(vcc, *vpi, *vci);
+ return err;
+ }
+ /* last scan may have left values out of bounds for current device */
+ if (*vpi != ATM_VPI_ANY)
+ p = *vpi;
+ else if (p >= 1 << vcc->dev->ci_range.vpi_bits)
+ p = 0;
+ if (*vci != ATM_VCI_ANY)
+ c = *vci;
+ else if (c < ATM_NOT_RSV_VCI || c >= 1 << vcc->dev->ci_range.vci_bits)
+ c = ATM_NOT_RSV_VCI;
+ old_p = p;
+ old_c = c;
+ do {
+ if (!check_ci(vcc, p, c)) {
+ *vpi = p;
+ *vci = c;
+ return 0;
+ }
+ if (*vci == ATM_VCI_ANY) {
+ c++;
+ if (c >= 1 << vcc->dev->ci_range.vci_bits)
+ c = ATM_NOT_RSV_VCI;
+ }
+ if ((c == ATM_NOT_RSV_VCI || *vci != ATM_VCI_ANY) &&
+ *vpi == ATM_VPI_ANY) {
+ p++;
+ if (p >= 1 << vcc->dev->ci_range.vpi_bits)
+ p = 0;
+ }
+ } while (old_p != p || old_c != c);
+ return -EADDRINUSE;
+}
+
+static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
+ int vci)
+{
+ struct sock *sk = sk_atm(vcc);
+ int error;
+
+ if ((vpi != ATM_VPI_UNSPEC && vpi != ATM_VPI_ANY &&
+ vpi >> dev->ci_range.vpi_bits) || (vci != ATM_VCI_UNSPEC &&
+ vci != ATM_VCI_ANY && vci >> dev->ci_range.vci_bits))
+ return -EINVAL;
+ if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE))
+ return -EPERM;
+ error = -ENODEV;
+ if (!try_module_get(dev->ops->owner))
+ return error;
+ vcc->dev = dev;
+ write_lock_irq(&vcc_sklist_lock);
+ if (test_bit(ATM_DF_REMOVED, &dev->flags) ||
+ (error = find_ci(vcc, &vpi, &vci))) {
+ write_unlock_irq(&vcc_sklist_lock);
+ goto fail_module_put;
+ }
+ vcc->vpi = vpi;
+ vcc->vci = vci;
+ __vcc_insert_socket(sk);
+ write_unlock_irq(&vcc_sklist_lock);
+ switch (vcc->qos.aal) {
+ case ATM_AAL0:
+ error = atm_init_aal0(vcc);
+ vcc->stats = &dev->stats.aal0;
+ break;
+ case ATM_AAL34:
+ error = atm_init_aal34(vcc);
+ vcc->stats = &dev->stats.aal34;
+ break;
+ case ATM_NO_AAL:
+ /* ATM_AAL5 is also used in the "0 for default" case */
+ vcc->qos.aal = ATM_AAL5;
+ /* fall through */
+ case ATM_AAL5:
+ error = atm_init_aal5(vcc);
+ vcc->stats = &dev->stats.aal5;
+ break;
+ default:
+ error = -EPROTOTYPE;
+ }
+ if (!error)
+ error = adjust_tp(&vcc->qos.txtp, vcc->qos.aal);
+ if (!error)
+ error = adjust_tp(&vcc->qos.rxtp, vcc->qos.aal);
+ if (error)
+ goto fail;
+ pr_debug("VCC %d.%d, AAL %d\n", vpi, vci, vcc->qos.aal);
+ pr_debug(" TX: %d, PCR %d..%d, SDU %d\n",
+ vcc->qos.txtp.traffic_class,
+ vcc->qos.txtp.min_pcr,
+ vcc->qos.txtp.max_pcr,
+ vcc->qos.txtp.max_sdu);
+ pr_debug(" RX: %d, PCR %d..%d, SDU %d\n",
+ vcc->qos.rxtp.traffic_class,
+ vcc->qos.rxtp.min_pcr,
+ vcc->qos.rxtp.max_pcr,
+ vcc->qos.rxtp.max_sdu);
+
+ if (dev->ops->open) {
+ error = dev->ops->open(vcc);
+ if (error)
+ goto fail;
+ }
+ return 0;
+
+fail:
+ vcc_remove_socket(sk);
+fail_module_put:
+ module_put(dev->ops->owner);
+ /* ensure we get dev module ref count correct */
+ vcc->dev = NULL;
+ return error;
+}
+
+int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
+{
+ struct atm_dev *dev;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int error;
+
+ pr_debug("(vpi %d, vci %d)\n", vpi, vci);
+ if (sock->state == SS_CONNECTED)
+ return -EISCONN;
+ if (sock->state != SS_UNCONNECTED)
+ return -EINVAL;
+ if (!(vpi || vci))
+ return -EINVAL;
+
+ if (vpi != ATM_VPI_UNSPEC && vci != ATM_VCI_UNSPEC)
+ clear_bit(ATM_VF_PARTIAL, &vcc->flags);
+ else
+ if (test_bit(ATM_VF_PARTIAL, &vcc->flags))
+ return -EINVAL;
+ pr_debug("(TX: cl %d,bw %d-%d,sdu %d; "
+ "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n",
+ vcc->qos.txtp.traffic_class, vcc->qos.txtp.min_pcr,
+ vcc->qos.txtp.max_pcr, vcc->qos.txtp.max_sdu,
+ vcc->qos.rxtp.traffic_class, vcc->qos.rxtp.min_pcr,
+ vcc->qos.rxtp.max_pcr, vcc->qos.rxtp.max_sdu,
+ vcc->qos.aal == ATM_AAL5 ? "" :
+ vcc->qos.aal == ATM_AAL0 ? "" : " ??? code ",
+ vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal);
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
+ return -EBADFD;
+ if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
+ vcc->qos.rxtp.traffic_class == ATM_ANYCLASS)
+ return -EINVAL;
+ if (likely(itf != ATM_ITF_ANY)) {
+ dev = try_then_request_module(atm_dev_lookup(itf),
+ "atm-device-%d", itf);
+ } else {
+ dev = NULL;
+ mutex_lock(&atm_dev_mutex);
+ if (!list_empty(&atm_devs)) {
+ dev = list_entry(atm_devs.next,
+ struct atm_dev, dev_list);
+ atm_dev_hold(dev);
+ }
+ mutex_unlock(&atm_dev_mutex);
+ }
+ if (!dev)
+ return -ENODEV;
+ error = __vcc_connect(vcc, dev, vpi, vci);
+ if (error) {
+ atm_dev_put(dev);
+ return error;
+ }
+ if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC)
+ set_bit(ATM_VF_PARTIAL, &vcc->flags);
+ if (test_bit(ATM_VF_READY, &ATM_SD(sock)->flags))
+ sock->state = SS_CONNECTED;
+ return 0;
+}
+
+int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc;
+ struct sk_buff *skb;
+ int copied, error = -EINVAL;
+
+ if (sock->state != SS_CONNECTED)
+ return -ENOTCONN;
+
+ /* only handle MSG_DONTWAIT and MSG_PEEK */
+ if (flags & ~(MSG_DONTWAIT | MSG_PEEK))
+ return -EOPNOTSUPP;
+
+ vcc = ATM_SD(sock);
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags))
+ return 0;
+
+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error);
+ if (!skb)
+ return error;
+
+ copied = skb->len;
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ error = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (error)
+ return error;
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc),
+ skb->truesize);
+ atm_return(vcc, skb->truesize);
+ }
+
+ skb_free_datagram(sk, skb);
+ return copied;
+}
+
+int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
+{
+ struct sock *sk = sock->sk;
+ DEFINE_WAIT(wait);
+ struct atm_vcc *vcc;
+ struct sk_buff *skb;
+ int eff, error;
+
+ lock_sock(sk);
+ if (sock->state != SS_CONNECTED) {
+ error = -ENOTCONN;
+ goto out;
+ }
+ if (m->msg_name) {
+ error = -EISCONN;
+ goto out;
+ }
+ vcc = ATM_SD(sock);
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ error = -EPIPE;
+ send_sig(SIGPIPE, current, 0);
+ goto out;
+ }
+ if (!size) {
+ error = 0;
+ goto out;
+ }
+ if (size > vcc->qos.txtp.max_sdu) {
+ error = -EMSGSIZE;
+ goto out;
+ }
+
+ eff = (size+3) & ~3; /* align to word boundary */
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ error = 0;
+ while (!(skb = alloc_tx(vcc, eff))) {
+ if (m->msg_flags & MSG_DONTWAIT) {
+ error = -EAGAIN;
+ break;
+ }
+ schedule();
+ if (signal_pending(current)) {
+ error = -ERESTARTSYS;
+ break;
+ }
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ error = -EPIPE;
+ send_sig(SIGPIPE, current, 0);
+ break;
+ }
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (error)
+ goto out;
+ skb->dev = NULL; /* for paths shared with net_device interfaces */
+ ATM_SKB(skb)->atm_options = vcc->atm_options;
+ if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) {
+ kfree_skb(skb);
+ error = -EFAULT;
+ goto out;
+ }
+ if (eff != size)
+ memset(skb->data + size, 0, eff-size);
+ error = vcc->dev->ops->send(vcc, skb);
+ error = error ? error : size;
+out:
+ release_sock(sk);
+ return error;
+}
+
+unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc;
+ unsigned int mask;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ vcc = ATM_SD(sock);
+
+ /* exceptional events */
+ if (sk->sk_err)
+ mask = POLLERR;
+
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags))
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* writable? */
+ if (sock->state == SS_CONNECTING &&
+ test_bit(ATM_VF_WAITING, &vcc->flags))
+ return mask;
+
+ if (vcc->qos.txtp.traffic_class != ATM_NONE &&
+ vcc_writable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ return mask;
+}
+
+static int atm_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
+{
+ int error;
+
+ /*
+ * Don't let the QoS change the already connected AAL type nor the
+ * traffic class.
+ */
+ if (qos->aal != vcc->qos.aal ||
+ qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class ||
+ qos->txtp.traffic_class != vcc->qos.txtp.traffic_class)
+ return -EINVAL;
+ error = adjust_tp(&qos->txtp, qos->aal);
+ if (!error)
+ error = adjust_tp(&qos->rxtp, qos->aal);
+ if (error)
+ return error;
+ if (!vcc->dev->ops->change_qos)
+ return -EOPNOTSUPP;
+ if (sk_atm(vcc)->sk_family == AF_ATMPVC)
+ return vcc->dev->ops->change_qos(vcc, qos, ATM_MF_SET);
+ return svc_change_qos(vcc, qos);
+}
+
+static int check_tp(const struct atm_trafprm *tp)
+{
+ /* @@@ Should be merged with adjust_tp */
+ if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS)
+ return 0;
+ if (tp->traffic_class != ATM_UBR && !tp->min_pcr && !tp->pcr &&
+ !tp->max_pcr)
+ return -EINVAL;
+ if (tp->min_pcr == ATM_MAX_PCR)
+ return -EINVAL;
+ if (tp->min_pcr && tp->max_pcr && tp->max_pcr != ATM_MAX_PCR &&
+ tp->min_pcr > tp->max_pcr)
+ return -EINVAL;
+ /*
+ * We allow pcr to be outside [min_pcr,max_pcr], because later
+ * adjustment may still push it in the valid range.
+ */
+ return 0;
+}
+
+static int check_qos(const struct atm_qos *qos)
+{
+ int error;
+
+ if (!qos->txtp.traffic_class && !qos->rxtp.traffic_class)
+ return -EINVAL;
+ if (qos->txtp.traffic_class != qos->rxtp.traffic_class &&
+ qos->txtp.traffic_class && qos->rxtp.traffic_class &&
+ qos->txtp.traffic_class != ATM_ANYCLASS &&
+ qos->rxtp.traffic_class != ATM_ANYCLASS)
+ return -EINVAL;
+ error = check_tp(&qos->txtp);
+ if (error)
+ return error;
+ return check_tp(&qos->rxtp);
+}
+
+int vcc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct atm_vcc *vcc;
+ unsigned long value;
+ int error;
+
+ if (__SO_LEVEL_MATCH(optname, level) && optlen != __SO_SIZE(optname))
+ return -EINVAL;
+
+ vcc = ATM_SD(sock);
+ switch (optname) {
+ case SO_ATMQOS:
+ {
+ struct atm_qos qos;
+
+ if (copy_from_user(&qos, optval, sizeof(qos)))
+ return -EFAULT;
+ error = check_qos(&qos);
+ if (error)
+ return error;
+ if (sock->state == SS_CONNECTED)
+ return atm_change_qos(vcc, &qos);
+ if (sock->state != SS_UNCONNECTED)
+ return -EBADFD;
+ vcc->qos = qos;
+ set_bit(ATM_VF_HASQOS, &vcc->flags);
+ return 0;
+ }
+ case SO_SETCLP:
+ if (get_user(value, (unsigned long __user *)optval))
+ return -EFAULT;
+ if (value)
+ vcc->atm_options |= ATM_ATMOPT_CLP;
+ else
+ vcc->atm_options &= ~ATM_ATMOPT_CLP;
+ return 0;
+ default:
+ if (level == SOL_SOCKET)
+ return -EINVAL;
+ break;
+ }
+ if (!vcc->dev || !vcc->dev->ops->setsockopt)
+ return -EINVAL;
+ return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen);
+}
+
+int vcc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct atm_vcc *vcc;
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (__SO_LEVEL_MATCH(optname, level) && len != __SO_SIZE(optname))
+ return -EINVAL;
+
+ vcc = ATM_SD(sock);
+ switch (optname) {
+ case SO_ATMQOS:
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
+ return -EINVAL;
+ return copy_to_user(optval, &vcc->qos, sizeof(vcc->qos))
+ ? -EFAULT : 0;
+ case SO_SETCLP:
+ return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 : 0,
+ (unsigned long __user *)optval) ? -EFAULT : 0;
+ case SO_ATMPVC:
+ {
+ struct sockaddr_atmpvc pvc;
+
+ if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+ return -ENOTCONN;
+ memset(&pvc, 0, sizeof(pvc));
+ pvc.sap_family = AF_ATMPVC;
+ pvc.sap_addr.itf = vcc->dev->number;
+ pvc.sap_addr.vpi = vcc->vpi;
+ pvc.sap_addr.vci = vcc->vci;
+ return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0;
+ }
+ default:
+ if (level == SOL_SOCKET)
+ return -EINVAL;
+ break;
+ }
+ if (!vcc->dev || !vcc->dev->ops->getsockopt)
+ return -EINVAL;
+ return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
+}
+
+int register_atmdevice_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_atmdevice_notifier);
+
+void unregister_atmdevice_notifier(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier);
+
+static int __init atm_init(void)
+{
+ int error;
+
+ error = proto_register(&vcc_proto, 0);
+ if (error < 0)
+ goto out;
+ error = atmpvc_init();
+ if (error < 0) {
+ pr_err("atmpvc_init() failed with %d\n", error);
+ goto out_unregister_vcc_proto;
+ }
+ error = atmsvc_init();
+ if (error < 0) {
+ pr_err("atmsvc_init() failed with %d\n", error);
+ goto out_atmpvc_exit;
+ }
+ error = atm_proc_init();
+ if (error < 0) {
+ pr_err("atm_proc_init() failed with %d\n", error);
+ goto out_atmsvc_exit;
+ }
+ error = atm_sysfs_init();
+ if (error < 0) {
+ pr_err("atm_sysfs_init() failed with %d\n", error);
+ goto out_atmproc_exit;
+ }
+out:
+ return error;
+out_atmproc_exit:
+ atm_proc_exit();
+out_atmsvc_exit:
+ atmsvc_exit();
+out_atmpvc_exit:
+ atmsvc_exit();
+out_unregister_vcc_proto:
+ proto_unregister(&vcc_proto);
+ goto out;
+}
+
+static void __exit atm_exit(void)
+{
+ atm_proc_exit();
+ atm_sysfs_exit();
+ atmsvc_exit();
+ atmpvc_exit();
+ proto_unregister(&vcc_proto);
+}
+
+subsys_initcall(atm_init);
+
+module_exit(atm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_ATMPVC);
+MODULE_ALIAS_NETPROTO(PF_ATMSVC);
diff --git a/net/atm/common.h b/net/atm/common.h
new file mode 100644
index 000000000..4d6f5b206
--- /dev/null
+++ b/net/atm/common.h
@@ -0,0 +1,55 @@
+/* net/atm/common.h - ATM sockets (common part for PVC and SVC) */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_COMMON_H
+#define NET_ATM_COMMON_H
+
+#include <linux/net.h>
+#include <linux/poll.h> /* for poll_table */
+
+
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family);
+int vcc_release(struct socket *sock);
+int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
+int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags);
+int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len);
+unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int vcc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen);
+void vcc_process_recv_queue(struct atm_vcc *vcc);
+
+int atmpvc_init(void);
+void atmpvc_exit(void);
+int atmsvc_init(void);
+void atmsvc_exit(void);
+int atm_sysfs_init(void);
+void atm_sysfs_exit(void);
+
+#ifdef CONFIG_PROC_FS
+int atm_proc_init(void);
+void atm_proc_exit(void);
+#else
+static inline int atm_proc_init(void)
+{
+ return 0;
+}
+
+static inline void atm_proc_exit(void)
+{
+ /* nothing */
+}
+#endif /* CONFIG_PROC_FS */
+
+/* SVC */
+int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos);
+
+void atm_dev_release_vccs(struct atm_dev *dev);
+
+#endif
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
new file mode 100644
index 000000000..bbd3b6399
--- /dev/null
+++ b/net/atm/ioctl.c
@@ -0,0 +1,369 @@
+/* ATM ioctl handling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+/* 2003 John Levon <levon@movementarian.org> */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/net.h> /* struct socket, struct proto_ops */
+#include <linux/atm.h> /* ATM stuff */
+#include <linux/atmdev.h>
+#include <linux/atmclip.h> /* CLIP_*ENCAP */
+#include <linux/atmarp.h> /* manifest constants */
+#include <linux/capability.h>
+#include <linux/sonet.h> /* for ioctls */
+#include <linux/atmsvc.h>
+#include <linux/atmmpc.h>
+#include <net/atmclip.h>
+#include <linux/atmlec.h>
+#include <linux/mutex.h>
+#include <asm/ioctls.h>
+#include <net/compat.h>
+
+#include "resources.h"
+#include "signaling.h" /* for WAITING and sigd_attach */
+#include "common.h"
+
+
+static DEFINE_MUTEX(ioctl_mutex);
+static LIST_HEAD(ioctl_list);
+
+
+void register_atm_ioctl(struct atm_ioctl *ioctl)
+{
+ mutex_lock(&ioctl_mutex);
+ list_add_tail(&ioctl->list, &ioctl_list);
+ mutex_unlock(&ioctl_mutex);
+}
+EXPORT_SYMBOL(register_atm_ioctl);
+
+void deregister_atm_ioctl(struct atm_ioctl *ioctl)
+{
+ mutex_lock(&ioctl_mutex);
+ list_del(&ioctl->list);
+ mutex_unlock(&ioctl_mutex);
+}
+EXPORT_SYMBOL(deregister_atm_ioctl);
+
+static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg, int compat)
+{
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc;
+ int error;
+ struct list_head *pos;
+ void __user *argp = (void __user *)arg;
+
+ vcc = ATM_SD(sock);
+ switch (cmd) {
+ case SIOCOUTQ:
+ if (sock->state != SS_CONNECTED ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ error = -EINVAL;
+ goto done;
+ }
+ error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk),
+ (int __user *)argp) ? -EFAULT : 0;
+ goto done;
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+
+ if (sock->state != SS_CONNECTED) {
+ error = -EINVAL;
+ goto done;
+ }
+ skb = skb_peek(&sk->sk_receive_queue);
+ error = put_user(skb ? skb->len : 0,
+ (int __user *)argp) ? -EFAULT : 0;
+ goto done;
+ }
+ case SIOCGSTAMP: /* borrowed from IP */
+#ifdef CONFIG_COMPAT
+ if (compat)
+ error = compat_sock_get_timestamp(sk, argp);
+ else
+#endif
+ error = sock_get_timestamp(sk, argp);
+ goto done;
+ case SIOCGSTAMPNS: /* borrowed from IP */
+#ifdef CONFIG_COMPAT
+ if (compat)
+ error = compat_sock_get_timestampns(sk, argp);
+ else
+#endif
+ error = sock_get_timestampns(sk, argp);
+ goto done;
+ case ATM_SETSC:
+ net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n",
+ current->comm, task_pid_nr(current));
+ error = 0;
+ goto done;
+ case ATMSIGD_CTRL:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ /*
+ * The user/kernel protocol for exchanging signalling
+ * info uses kernel pointers as opaque references,
+ * so the holder of the file descriptor can scribble
+ * on the kernel... so we should make sure that we
+ * have the same privileges that /proc/kcore needs
+ */
+ if (!capable(CAP_SYS_RAWIO)) {
+ error = -EPERM;
+ goto done;
+ }
+#ifdef CONFIG_COMPAT
+ /* WTF? I don't even want to _think_ about making this
+ work for 32-bit userspace. TBH I don't really want
+ to think about it at all. dwmw2. */
+ if (compat) {
+ net_warn_ratelimited("32-bit task cannot be atmsigd\n");
+ error = -EINVAL;
+ goto done;
+ }
+#endif
+ error = sigd_attach(vcc);
+ if (!error)
+ sock->state = SS_CONNECTED;
+ goto done;
+ case ATM_SETBACKEND:
+ case ATM_NEWBACKENDIF:
+ {
+ atm_backend_t backend;
+ error = get_user(backend, (atm_backend_t __user *)argp);
+ if (error)
+ goto done;
+ switch (backend) {
+ case ATM_BACKEND_PPP:
+ request_module("pppoatm");
+ break;
+ case ATM_BACKEND_BR2684:
+ request_module("br2684");
+ break;
+ }
+ break;
+ }
+ case ATMMPC_CTRL:
+ case ATMMPC_DATA:
+ request_module("mpoa");
+ break;
+ case ATMARPD_CTRL:
+ request_module("clip");
+ break;
+ case ATMLEC_CTRL:
+ request_module("lec");
+ break;
+ }
+
+ error = -ENOIOCTLCMD;
+
+ mutex_lock(&ioctl_mutex);
+ list_for_each(pos, &ioctl_list) {
+ struct atm_ioctl *ic = list_entry(pos, struct atm_ioctl, list);
+ if (try_module_get(ic->owner)) {
+ error = ic->ioctl(sock, cmd, arg);
+ module_put(ic->owner);
+ if (error != -ENOIOCTLCMD)
+ break;
+ }
+ }
+ mutex_unlock(&ioctl_mutex);
+
+ if (error != -ENOIOCTLCMD)
+ goto done;
+
+ error = atm_dev_ioctl(cmd, argp, compat);
+
+done:
+ return error;
+}
+
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_vcc_ioctl(sock, cmd, arg, 0);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * FIXME:
+ * The compat_ioctl handling is duplicated, using both these conversion
+ * routines and the compat argument to the actual handlers. Both
+ * versions are somewhat incomplete and should be merged, e.g. by
+ * moving the ioctl number translation into the actual handlers and
+ * killing the conversion code.
+ *
+ * -arnd, November 2009
+ */
+#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct compat_atmif_sioc)
+#define ATM_GETNAMES32 _IOW('a', ATMIOC_ITF+3, struct compat_atm_iobuf)
+#define ATM_GETTYPE32 _IOW('a', ATMIOC_ITF+4, struct compat_atmif_sioc)
+#define ATM_GETESI32 _IOW('a', ATMIOC_ITF+5, struct compat_atmif_sioc)
+#define ATM_GETADDR32 _IOW('a', ATMIOC_ITF+6, struct compat_atmif_sioc)
+#define ATM_RSTADDR32 _IOW('a', ATMIOC_ITF+7, struct compat_atmif_sioc)
+#define ATM_ADDADDR32 _IOW('a', ATMIOC_ITF+8, struct compat_atmif_sioc)
+#define ATM_DELADDR32 _IOW('a', ATMIOC_ITF+9, struct compat_atmif_sioc)
+#define ATM_GETCIRANGE32 _IOW('a', ATMIOC_ITF+10, struct compat_atmif_sioc)
+#define ATM_SETCIRANGE32 _IOW('a', ATMIOC_ITF+11, struct compat_atmif_sioc)
+#define ATM_SETESI32 _IOW('a', ATMIOC_ITF+12, struct compat_atmif_sioc)
+#define ATM_SETESIF32 _IOW('a', ATMIOC_ITF+13, struct compat_atmif_sioc)
+#define ATM_GETSTAT32 _IOW('a', ATMIOC_SARCOM+0, struct compat_atmif_sioc)
+#define ATM_GETSTATZ32 _IOW('a', ATMIOC_SARCOM+1, struct compat_atmif_sioc)
+#define ATM_GETLOOP32 _IOW('a', ATMIOC_SARCOM+2, struct compat_atmif_sioc)
+#define ATM_SETLOOP32 _IOW('a', ATMIOC_SARCOM+3, struct compat_atmif_sioc)
+#define ATM_QUERYLOOP32 _IOW('a', ATMIOC_SARCOM+4, struct compat_atmif_sioc)
+
+static struct {
+ unsigned int cmd32;
+ unsigned int cmd;
+} atm_ioctl_map[] = {
+ { ATM_GETLINKRATE32, ATM_GETLINKRATE },
+ { ATM_GETNAMES32, ATM_GETNAMES },
+ { ATM_GETTYPE32, ATM_GETTYPE },
+ { ATM_GETESI32, ATM_GETESI },
+ { ATM_GETADDR32, ATM_GETADDR },
+ { ATM_RSTADDR32, ATM_RSTADDR },
+ { ATM_ADDADDR32, ATM_ADDADDR },
+ { ATM_DELADDR32, ATM_DELADDR },
+ { ATM_GETCIRANGE32, ATM_GETCIRANGE },
+ { ATM_SETCIRANGE32, ATM_SETCIRANGE },
+ { ATM_SETESI32, ATM_SETESI },
+ { ATM_SETESIF32, ATM_SETESIF },
+ { ATM_GETSTAT32, ATM_GETSTAT },
+ { ATM_GETSTATZ32, ATM_GETSTATZ },
+ { ATM_GETLOOP32, ATM_GETLOOP },
+ { ATM_SETLOOP32, ATM_SETLOOP },
+ { ATM_QUERYLOOP32, ATM_QUERYLOOP },
+};
+
+#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
+
+static int do_atm_iobuf(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct atm_iobuf __user *iobuf;
+ struct compat_atm_iobuf __user *iobuf32;
+ u32 data;
+ void __user *datap;
+ int len, err;
+
+ iobuf = compat_alloc_user_space(sizeof(*iobuf));
+ iobuf32 = compat_ptr(arg);
+
+ if (get_user(len, &iobuf32->length) ||
+ get_user(data, &iobuf32->buffer))
+ return -EFAULT;
+ datap = compat_ptr(data);
+ if (put_user(len, &iobuf->length) ||
+ put_user(datap, &iobuf->buffer))
+ return -EFAULT;
+
+ err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0);
+
+ if (!err) {
+ if (copy_in_user(&iobuf32->length, &iobuf->length,
+ sizeof(int)))
+ err = -EFAULT;
+ }
+
+ return err;
+}
+
+static int do_atmif_sioc(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct atmif_sioc __user *sioc;
+ struct compat_atmif_sioc __user *sioc32;
+ u32 data;
+ void __user *datap;
+ int err;
+
+ sioc = compat_alloc_user_space(sizeof(*sioc));
+ sioc32 = compat_ptr(arg);
+
+ if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
+ get_user(data, &sioc32->arg))
+ return -EFAULT;
+ datap = compat_ptr(data);
+ if (put_user(datap, &sioc->arg))
+ return -EFAULT;
+
+ err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0);
+
+ if (!err) {
+ if (copy_in_user(&sioc32->length, &sioc->length,
+ sizeof(int)))
+ err = -EFAULT;
+ }
+ return err;
+}
+
+static int do_atm_ioctl(struct socket *sock, unsigned int cmd32,
+ unsigned long arg)
+{
+ int i;
+ unsigned int cmd = 0;
+
+ switch (cmd32) {
+ case SONET_GETSTAT:
+ case SONET_GETSTATZ:
+ case SONET_GETDIAG:
+ case SONET_SETDIAG:
+ case SONET_CLRDIAG:
+ case SONET_SETFRAMING:
+ case SONET_GETFRAMING:
+ case SONET_GETFRSENSE:
+ return do_atmif_sioc(sock, cmd32, arg);
+ }
+
+ for (i = 0; i < NR_ATM_IOCTL; i++) {
+ if (cmd32 == atm_ioctl_map[i].cmd32) {
+ cmd = atm_ioctl_map[i].cmd;
+ break;
+ }
+ }
+ if (i == NR_ATM_IOCTL)
+ return -EINVAL;
+
+ switch (cmd) {
+ case ATM_GETNAMES:
+ return do_atm_iobuf(sock, cmd, arg);
+
+ case ATM_GETLINKRATE:
+ case ATM_GETTYPE:
+ case ATM_GETESI:
+ case ATM_GETADDR:
+ case ATM_RSTADDR:
+ case ATM_ADDADDR:
+ case ATM_DELADDR:
+ case ATM_GETCIRANGE:
+ case ATM_SETCIRANGE:
+ case ATM_SETESI:
+ case ATM_SETESIF:
+ case ATM_GETSTAT:
+ case ATM_GETSTATZ:
+ case ATM_GETLOOP:
+ case ATM_SETLOOP:
+ case ATM_QUERYLOOP:
+ return do_atmif_sioc(sock, cmd, arg);
+ }
+
+ return -EINVAL;
+}
+
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret;
+
+ ret = do_vcc_ioctl(sock, cmd, arg, 1);
+ if (ret != -ENOIOCTLCMD)
+ return ret;
+
+ return do_atm_ioctl(sock, cmd, arg);
+}
+#endif
diff --git a/net/atm/lec.c b/net/atm/lec.c
new file mode 100644
index 000000000..cd3b37989
--- /dev/null
+++ b/net/atm/lec.c
@@ -0,0 +1,2285 @@
+/*
+ * lec.c: Lan Emulation driver
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+
+/* We are ethernet device */
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <asm/byteorder.h>
+#include <linux/uaccess.h>
+#include <net/arp.h>
+#include <net/dst.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+
+/* And atm device */
+#include <linux/atmdev.h>
+#include <linux/atmlec.h>
+
+/* Proxy LEC knows about bridging */
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#include "../bridge/br_private.h"
+
+static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
+#endif
+
+/* Modular too */
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "lec.h"
+#include "lec_arpc.h"
+#include "resources.h"
+
+#define DUMP_PACKETS 0 /*
+ * 0 = None,
+ * 1 = 30 first bytes
+ * 2 = Whole packet
+ */
+
+#define LEC_UNRES_QUE_LEN 8 /*
+ * number of tx packets to queue for a
+ * single destination while waiting for SVC
+ */
+
+static int lec_open(struct net_device *dev);
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+static int lec_close(struct net_device *dev);
+static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
+ const unsigned char *mac_addr);
+static int lec_arp_remove(struct lec_priv *priv,
+ struct lec_arp_table *to_remove);
+/* LANE2 functions */
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address,
+ const u8 *tlvs, u32 sizeoftlvs);
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
+ u8 **tlvs, u32 *sizeoftlvs);
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs);
+
+static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
+ unsigned long permanent);
+static void lec_arp_check_empties(struct lec_priv *priv,
+ struct atm_vcc *vcc, struct sk_buff *skb);
+static void lec_arp_destroy(struct lec_priv *priv);
+static void lec_arp_init(struct lec_priv *priv);
+static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
+ const unsigned char *mac_to_find,
+ int is_rdesc,
+ struct lec_arp_table **ret_entry);
+static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+ const unsigned char *atm_addr,
+ unsigned long remoteflag,
+ unsigned int targetless_le_arp);
+static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id);
+static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc);
+static void lec_set_flush_tran_id(struct lec_priv *priv,
+ const unsigned char *atm_addr,
+ unsigned long tran_id);
+static void lec_vcc_added(struct lec_priv *priv,
+ const struct atmlec_ioc *ioc_data,
+ struct atm_vcc *vcc,
+ void (*old_push)(struct atm_vcc *vcc,
+ struct sk_buff *skb));
+static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc);
+
+/* must be done under lec_arp_lock */
+static inline void lec_arp_hold(struct lec_arp_table *entry)
+{
+ atomic_inc(&entry->usage);
+}
+
+static inline void lec_arp_put(struct lec_arp_table *entry)
+{
+ if (atomic_dec_and_test(&entry->usage))
+ kfree(entry);
+}
+
+static struct lane2_ops lane2_ops = {
+ lane2_resolve, /* resolve, spec 3.1.3 */
+ lane2_associate_req, /* associate_req, spec 3.1.4 */
+ NULL /* associate indicator, spec 3.1.5 */
+};
+
+static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+/* Device structures */
+static struct net_device *dev_lec[MAX_LEC_ITF];
+
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
+{
+ char *buff;
+ struct lec_priv *priv;
+
+ /*
+ * Check if this is a BPDU. If so, ask zeppelin to send
+ * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit
+ * as the Config BPDU has
+ */
+ buff = skb->data + skb->dev->hard_header_len;
+ if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) {
+ struct sock *sk;
+ struct sk_buff *skb2;
+ struct atmlec_msg *mesg;
+
+ skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+ if (skb2 == NULL)
+ return;
+ skb2->len = sizeof(struct atmlec_msg);
+ mesg = (struct atmlec_msg *)skb2->data;
+ mesg->type = l_topology_change;
+ buff += 4;
+ mesg->content.normal.flag = *buff & 0x01;
+ /* 0x01 is topology change */
+
+ priv = netdev_priv(dev);
+ atm_force_charge(priv->lecd, skb2->truesize);
+ sk = sk_atm(priv->lecd);
+ skb_queue_tail(&sk->sk_receive_queue, skb2);
+ sk->sk_data_ready(sk);
+ }
+}
+#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+
+/*
+ * Open/initialize the netdevice. This is called (in the current kernel)
+ * sometime after booting when the 'ifconfig' program is run.
+ *
+ * This routine should set everything up anew at each open, even
+ * registers that "should" only need to be set once at boot, so that
+ * there is non-reboot way to recover if something goes wrong.
+ */
+
+static int lec_open(struct net_device *dev)
+{
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+static void
+lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+
+ ATM_SKB(skb)->vcc = vcc;
+ ATM_SKB(skb)->atm_options = vcc->atm_options;
+
+ atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+ if (vcc->send(vcc, skb) < 0) {
+ dev->stats.tx_dropped++;
+ return;
+ }
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+}
+
+static void lec_tx_timeout(struct net_device *dev)
+{
+ pr_info("%s\n", dev->name);
+ dev->trans_start = jiffies;
+ netif_wake_queue(dev);
+}
+
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct sk_buff *skb2;
+ struct lec_priv *priv = netdev_priv(dev);
+ struct lecdatahdr_8023 *lec_h;
+ struct atm_vcc *vcc;
+ struct lec_arp_table *entry;
+ unsigned char *dst;
+ int min_frame_size;
+ int is_rdesc;
+
+ pr_debug("called\n");
+ if (!priv->lecd) {
+ pr_info("%s:No lecd attached\n", dev->name);
+ dev->stats.tx_errors++;
+ netif_stop_queue(dev);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
+ (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
+ (long)skb_end_pointer(skb));
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+ if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
+ lec_handle_bridge(skb, dev);
+#endif
+
+ /* Make sure we have room for lec_id */
+ if (skb_headroom(skb) < 2) {
+ pr_debug("reallocating skb\n");
+ skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
+ if (unlikely(!skb2)) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+ consume_skb(skb);
+ skb = skb2;
+ }
+ skb_push(skb, 2);
+
+ /* Put le header to place */
+ lec_h = (struct lecdatahdr_8023 *)skb->data;
+ lec_h->le_header = htons(priv->lecid);
+
+#if DUMP_PACKETS >= 2
+#define MAX_DUMP_SKB 99
+#elif DUMP_PACKETS >= 1
+#define MAX_DUMP_SKB 30
+#endif
+#if DUMP_PACKETS >= 1
+ printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n",
+ dev->name, skb->len, priv->lecid);
+ print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+ skb->data, min(skb->len, MAX_DUMP_SKB), true);
+#endif /* DUMP_PACKETS >= 1 */
+
+ /* Minimum ethernet-frame size */
+ min_frame_size = LEC_MINIMUM_8023_SIZE;
+ if (skb->len < min_frame_size) {
+ if ((skb->len + skb_tailroom(skb)) < min_frame_size) {
+ skb2 = skb_copy_expand(skb, 0,
+ min_frame_size - skb->truesize,
+ GFP_ATOMIC);
+ dev_kfree_skb(skb);
+ if (skb2 == NULL) {
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+ skb = skb2;
+ }
+ skb_put(skb, min_frame_size - skb->len);
+ }
+
+ /* Send to right vcc */
+ is_rdesc = 0;
+ dst = lec_h->h_dest;
+ entry = NULL;
+ vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry);
+ pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n",
+ dev->name, vcc, vcc ? vcc->flags : 0, entry);
+ if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) {
+ if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) {
+ pr_debug("%s:queuing packet, MAC address %pM\n",
+ dev->name, lec_h->h_dest);
+ skb_queue_tail(&entry->tx_wait, skb);
+ } else {
+ pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n",
+ dev->name, lec_h->h_dest);
+ dev->stats.tx_dropped++;
+ dev_kfree_skb(skb);
+ }
+ goto out;
+ }
+#if DUMP_PACKETS > 0
+ printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n",
+ dev->name, vcc->vpi, vcc->vci);
+#endif /* DUMP_PACKETS > 0 */
+
+ while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) {
+ pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest);
+ lec_send(vcc, skb2);
+ }
+
+ lec_send(vcc, skb);
+
+ if (!atm_may_send(vcc, 0)) {
+ struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+ vpriv->xoff = 1;
+ netif_stop_queue(dev);
+
+ /*
+ * vcc->pop() might have occurred in between, making
+ * the vcc usuable again. Since xmit is serialized,
+ * this is the only situation we have to re-test.
+ */
+
+ if (atm_may_send(vcc, 0))
+ netif_wake_queue(dev);
+ }
+
+out:
+ if (entry)
+ lec_arp_put(entry);
+ dev->trans_start = jiffies;
+ return NETDEV_TX_OK;
+}
+
+/* The inverse routine to net_open(). */
+static int lec_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct net_device *dev = (struct net_device *)vcc->proto_data;
+ struct lec_priv *priv = netdev_priv(dev);
+ struct atmlec_msg *mesg;
+ struct lec_arp_table *entry;
+ int i;
+ char *tmp; /* FIXME */
+
+ atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+ mesg = (struct atmlec_msg *)skb->data;
+ tmp = skb->data;
+ tmp += sizeof(struct atmlec_msg);
+ pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type);
+ switch (mesg->type) {
+ case l_set_mac_addr:
+ for (i = 0; i < 6; i++)
+ dev->dev_addr[i] = mesg->content.normal.mac_addr[i];
+ break;
+ case l_del_mac_addr:
+ for (i = 0; i < 6; i++)
+ dev->dev_addr[i] = 0;
+ break;
+ case l_addr_delete:
+ lec_addr_delete(priv, mesg->content.normal.atm_addr,
+ mesg->content.normal.flag);
+ break;
+ case l_topology_change:
+ priv->topology_change = mesg->content.normal.flag;
+ break;
+ case l_flush_complete:
+ lec_flush_complete(priv, mesg->content.normal.flag);
+ break;
+ case l_narp_req: /* LANE2: see 7.1.35 in the lane2 spec */
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ entry = lec_arp_find(priv, mesg->content.normal.mac_addr);
+ lec_arp_remove(priv, entry);
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+ if (mesg->content.normal.no_source_le_narp)
+ break;
+ /* FALL THROUGH */
+ case l_arp_update:
+ lec_arp_update(priv, mesg->content.normal.mac_addr,
+ mesg->content.normal.atm_addr,
+ mesg->content.normal.flag,
+ mesg->content.normal.targetless_le_arp);
+ pr_debug("in l_arp_update\n");
+ if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */
+ pr_debug("LANE2 3.1.5, got tlvs, size %d\n",
+ mesg->sizeoftlvs);
+ lane2_associate_ind(dev, mesg->content.normal.mac_addr,
+ tmp, mesg->sizeoftlvs);
+ }
+ break;
+ case l_config:
+ priv->maximum_unknown_frame_count =
+ mesg->content.config.maximum_unknown_frame_count;
+ priv->max_unknown_frame_time =
+ (mesg->content.config.max_unknown_frame_time * HZ);
+ priv->max_retry_count = mesg->content.config.max_retry_count;
+ priv->aging_time = (mesg->content.config.aging_time * HZ);
+ priv->forward_delay_time =
+ (mesg->content.config.forward_delay_time * HZ);
+ priv->arp_response_time =
+ (mesg->content.config.arp_response_time * HZ);
+ priv->flush_timeout = (mesg->content.config.flush_timeout * HZ);
+ priv->path_switching_delay =
+ (mesg->content.config.path_switching_delay * HZ);
+ priv->lane_version = mesg->content.config.lane_version;
+ /* LANE2 */
+ priv->lane2_ops = NULL;
+ if (priv->lane_version > 1)
+ priv->lane2_ops = &lane2_ops;
+ rtnl_lock();
+ if (dev_set_mtu(dev, mesg->content.config.mtu))
+ pr_info("%s: change_mtu to %d failed\n",
+ dev->name, mesg->content.config.mtu);
+ rtnl_unlock();
+ priv->is_proxy = mesg->content.config.is_proxy;
+ break;
+ case l_flush_tran_id:
+ lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr,
+ mesg->content.normal.flag);
+ break;
+ case l_set_lecid:
+ priv->lecid =
+ (unsigned short)(0xffff & mesg->content.normal.flag);
+ break;
+ case l_should_bridge:
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+ {
+ pr_debug("%s: bridge zeppelin asks about %pM\n",
+ dev->name, mesg->content.proxy.mac_addr);
+
+ if (br_fdb_test_addr_hook == NULL)
+ break;
+
+ if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) {
+ /* hit from bridge table, send LE_ARP_RESPONSE */
+ struct sk_buff *skb2;
+ struct sock *sk;
+
+ pr_debug("%s: entry found, responding to zeppelin\n",
+ dev->name);
+ skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+ if (skb2 == NULL)
+ break;
+ skb2->len = sizeof(struct atmlec_msg);
+ skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg));
+ atm_force_charge(priv->lecd, skb2->truesize);
+ sk = sk_atm(priv->lecd);
+ skb_queue_tail(&sk->sk_receive_queue, skb2);
+ sk->sk_data_ready(sk);
+ }
+ }
+#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+ break;
+ default:
+ pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
+ dev_kfree_skb(skb);
+ return -EINVAL;
+ }
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+static void lec_atm_close(struct atm_vcc *vcc)
+{
+ struct sk_buff *skb;
+ struct net_device *dev = (struct net_device *)vcc->proto_data;
+ struct lec_priv *priv = netdev_priv(dev);
+
+ priv->lecd = NULL;
+ /* Do something needful? */
+
+ netif_stop_queue(dev);
+ lec_arp_destroy(priv);
+
+ if (skb_peek(&sk_atm(vcc)->sk_receive_queue))
+ pr_info("%s closing with messages pending\n", dev->name);
+ while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) {
+ atm_return(vcc, skb->truesize);
+ dev_kfree_skb(skb);
+ }
+
+ pr_info("%s: Shut down!\n", dev->name);
+ module_put(THIS_MODULE);
+}
+
+static struct atmdev_ops lecdev_ops = {
+ .close = lec_atm_close,
+ .send = lec_atm_send
+};
+
+static struct atm_dev lecatm_dev = {
+ .ops = &lecdev_ops,
+ .type = "lec",
+ .number = 999, /* dummy device number */
+ .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock)
+};
+
+/*
+ * LANE2: new argument struct sk_buff *data contains
+ * the LE_ARP based TLVs introduced in the LANE2 spec
+ */
+static int
+send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
+ const unsigned char *mac_addr, const unsigned char *atm_addr,
+ struct sk_buff *data)
+{
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct atmlec_msg *mesg;
+
+ if (!priv || !priv->lecd)
+ return -1;
+ skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+ if (!skb)
+ return -1;
+ skb->len = sizeof(struct atmlec_msg);
+ mesg = (struct atmlec_msg *)skb->data;
+ memset(mesg, 0, sizeof(struct atmlec_msg));
+ mesg->type = type;
+ if (data != NULL)
+ mesg->sizeoftlvs = data->len;
+ if (mac_addr)
+ ether_addr_copy(mesg->content.normal.mac_addr, mac_addr);
+ else
+ mesg->content.normal.targetless_le_arp = 1;
+ if (atm_addr)
+ memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN);
+
+ atm_force_charge(priv->lecd, skb->truesize);
+ sk = sk_atm(priv->lecd);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+
+ if (data != NULL) {
+ pr_debug("about to send %d bytes of data\n", data->len);
+ atm_force_charge(priv->lecd, data->truesize);
+ skb_queue_tail(&sk->sk_receive_queue, data);
+ sk->sk_data_ready(sk);
+ }
+
+ return 0;
+}
+
+/* shamelessly stolen from drivers/net/net_init.c */
+static int lec_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if ((new_mtu < 68) || (new_mtu > 18190))
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static void lec_set_multicast_list(struct net_device *dev)
+{
+ /*
+ * by default, all multicast frames arrive over the bus.
+ * eventually support selective multicast service
+ */
+}
+
+static const struct net_device_ops lec_netdev_ops = {
+ .ndo_open = lec_open,
+ .ndo_stop = lec_close,
+ .ndo_start_xmit = lec_start_xmit,
+ .ndo_change_mtu = lec_change_mtu,
+ .ndo_tx_timeout = lec_tx_timeout,
+ .ndo_set_rx_mode = lec_set_multicast_list,
+};
+
+static const unsigned char lec_ctrl_magic[] = {
+ 0xff,
+ 0x00,
+ 0x01,
+ 0x01
+};
+
+#define LEC_DATA_DIRECT_8023 2
+#define LEC_DATA_DIRECT_8025 3
+
+static int lec_is_data_direct(struct atm_vcc *vcc)
+{
+ return ((vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8023) ||
+ (vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8025));
+}
+
+static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct net_device *dev = (struct net_device *)vcc->proto_data;
+ struct lec_priv *priv = netdev_priv(dev);
+
+#if DUMP_PACKETS > 0
+ printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n",
+ dev->name, vcc->vpi, vcc->vci);
+#endif
+ if (!skb) {
+ pr_debug("%s: null skb\n", dev->name);
+ lec_vcc_close(priv, vcc);
+ return;
+ }
+#if DUMP_PACKETS >= 2
+#define MAX_SKB_DUMP 99
+#elif DUMP_PACKETS >= 1
+#define MAX_SKB_DUMP 30
+#endif
+#if DUMP_PACKETS > 0
+ printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n",
+ dev->name, skb->len, priv->lecid);
+ print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+ skb->data, min(MAX_SKB_DUMP, skb->len), true);
+#endif /* DUMP_PACKETS > 0 */
+ if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) {
+ /* Control frame, to daemon */
+ struct sock *sk = sk_atm(vcc);
+
+ pr_debug("%s: To daemon\n", dev->name);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ } else { /* Data frame, queue to protocol handlers */
+ struct lec_arp_table *entry;
+ unsigned char *src, *dst;
+
+ atm_return(vcc, skb->truesize);
+ if (*(__be16 *) skb->data == htons(priv->lecid) ||
+ !priv->lecd || !(dev->flags & IFF_UP)) {
+ /*
+ * Probably looping back, or if lecd is missing,
+ * lecd has gone down
+ */
+ pr_debug("Ignoring frame...\n");
+ dev_kfree_skb(skb);
+ return;
+ }
+ dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest;
+
+ /*
+ * If this is a Data Direct VCC, and the VCC does not match
+ * the LE_ARP cache entry, delete the LE_ARP cache entry.
+ */
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ if (lec_is_data_direct(vcc)) {
+ src = ((struct lecdatahdr_8023 *)skb->data)->h_source;
+ entry = lec_arp_find(priv, src);
+ if (entry && entry->vcc != vcc) {
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+ if (!(dst[0] & 0x01) && /* Never filter Multi/Broadcast */
+ !priv->is_proxy && /* Proxy wants all the packets */
+ memcmp(dst, dev->dev_addr, dev->addr_len)) {
+ dev_kfree_skb(skb);
+ return;
+ }
+ if (!hlist_empty(&priv->lec_arp_empty_ones))
+ lec_arp_check_empties(priv, vcc, skb);
+ skb_pull(skb, 2); /* skip lec_id */
+ skb->protocol = eth_type_trans(skb, dev);
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
+ memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+ netif_rx(skb);
+ }
+}
+
+static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+ struct net_device *dev = skb->dev;
+
+ if (vpriv == NULL) {
+ pr_info("vpriv = NULL!?!?!?\n");
+ return;
+ }
+
+ vpriv->old_pop(vcc, skb);
+
+ if (vpriv->xoff && atm_may_send(vcc, 0)) {
+ vpriv->xoff = 0;
+ if (netif_running(dev) && netif_queue_stopped(dev))
+ netif_wake_queue(dev);
+ }
+}
+
+static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
+{
+ struct lec_vcc_priv *vpriv;
+ int bytes_left;
+ struct atmlec_ioc ioc_data;
+
+ /* Lecd must be up in this case */
+ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc));
+ if (bytes_left != 0)
+ pr_info("copy from user failed for %d bytes\n", bytes_left);
+ if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF ||
+ !dev_lec[ioc_data.dev_num])
+ return -EINVAL;
+ vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+ if (!vpriv)
+ return -ENOMEM;
+ vpriv->xoff = 0;
+ vpriv->old_pop = vcc->pop;
+ vcc->user_back = vpriv;
+ vcc->pop = lec_pop;
+ lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]),
+ &ioc_data, vcc, vcc->push);
+ vcc->proto_data = dev_lec[ioc_data.dev_num];
+ vcc->push = lec_push;
+ return 0;
+}
+
+static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
+{
+ if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg])
+ return -EINVAL;
+ vcc->proto_data = dev_lec[arg];
+ return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc);
+}
+
+/* Initialize device. */
+static int lecd_attach(struct atm_vcc *vcc, int arg)
+{
+ int i;
+ struct lec_priv *priv;
+
+ if (arg < 0)
+ i = 0;
+ else
+ i = arg;
+ if (arg >= MAX_LEC_ITF)
+ return -EINVAL;
+ if (!dev_lec[i]) {
+ int size;
+
+ size = sizeof(struct lec_priv);
+ dev_lec[i] = alloc_etherdev(size);
+ if (!dev_lec[i])
+ return -ENOMEM;
+ dev_lec[i]->netdev_ops = &lec_netdev_ops;
+ snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
+ if (register_netdev(dev_lec[i])) {
+ free_netdev(dev_lec[i]);
+ return -EINVAL;
+ }
+
+ priv = netdev_priv(dev_lec[i]);
+ } else {
+ priv = netdev_priv(dev_lec[i]);
+ if (priv->lecd)
+ return -EADDRINUSE;
+ }
+ lec_arp_init(priv);
+ priv->itfnum = i; /* LANE2 addition */
+ priv->lecd = vcc;
+ vcc->dev = &lecatm_dev;
+ vcc_insert_socket(sk_atm(vcc));
+
+ vcc->proto_data = dev_lec[i];
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
+
+ /* Set default values to these variables */
+ priv->maximum_unknown_frame_count = 1;
+ priv->max_unknown_frame_time = (1 * HZ);
+ priv->vcc_timeout_period = (1200 * HZ);
+ priv->max_retry_count = 1;
+ priv->aging_time = (300 * HZ);
+ priv->forward_delay_time = (15 * HZ);
+ priv->topology_change = 0;
+ priv->arp_response_time = (1 * HZ);
+ priv->flush_timeout = (4 * HZ);
+ priv->path_switching_delay = (6 * HZ);
+
+ if (dev_lec[i]->flags & IFF_UP)
+ netif_start_queue(dev_lec[i]);
+ __module_get(THIS_MODULE);
+ return i;
+}
+
+#ifdef CONFIG_PROC_FS
+static const char *lec_arp_get_status_string(unsigned char status)
+{
+ static const char *const lec_arp_status_string[] = {
+ "ESI_UNKNOWN ",
+ "ESI_ARP_PENDING ",
+ "ESI_VC_PENDING ",
+ "<Undefined> ",
+ "ESI_FLUSH_PENDING ",
+ "ESI_FORWARD_DIRECT"
+ };
+
+ if (status > ESI_FORWARD_DIRECT)
+ status = 3; /* ESI_UNDEFINED */
+ return lec_arp_status_string[status];
+}
+
+static void lec_info(struct seq_file *seq, struct lec_arp_table *entry)
+{
+ int i;
+
+ for (i = 0; i < ETH_ALEN; i++)
+ seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff);
+ seq_printf(seq, " ");
+ for (i = 0; i < ATM_ESA_LEN; i++)
+ seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff);
+ seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status),
+ entry->flags & 0xffff);
+ if (entry->vcc)
+ seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci);
+ else
+ seq_printf(seq, " ");
+ if (entry->recv_vcc) {
+ seq_printf(seq, " %3d %3d", entry->recv_vcc->vpi,
+ entry->recv_vcc->vci);
+ }
+ seq_putc(seq, '\n');
+}
+
+struct lec_state {
+ unsigned long flags;
+ struct lec_priv *locked;
+ struct hlist_node *node;
+ struct net_device *dev;
+ int itf;
+ int arp_table;
+ int misc_table;
+};
+
+static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
+ loff_t *l)
+{
+ struct hlist_node *e = state->node;
+
+ if (!e)
+ e = tbl->first;
+ if (e == SEQ_START_TOKEN) {
+ e = tbl->first;
+ --*l;
+ }
+
+ for (; e; e = e->next) {
+ if (--*l < 0)
+ break;
+ }
+ state->node = e;
+
+ return (*l < 0) ? state : NULL;
+}
+
+static void *lec_arp_walk(struct lec_state *state, loff_t *l,
+ struct lec_priv *priv)
+{
+ void *v = NULL;
+ int p;
+
+ for (p = state->arp_table; p < LEC_ARP_TABLE_SIZE; p++) {
+ v = lec_tbl_walk(state, &priv->lec_arp_tables[p], l);
+ if (v)
+ break;
+ }
+ state->arp_table = p;
+ return v;
+}
+
+static void *lec_misc_walk(struct lec_state *state, loff_t *l,
+ struct lec_priv *priv)
+{
+ struct hlist_head *lec_misc_tables[] = {
+ &priv->lec_arp_empty_ones,
+ &priv->lec_no_forward,
+ &priv->mcast_fwds
+ };
+ void *v = NULL;
+ int q;
+
+ for (q = state->misc_table; q < ARRAY_SIZE(lec_misc_tables); q++) {
+ v = lec_tbl_walk(state, lec_misc_tables[q], l);
+ if (v)
+ break;
+ }
+ state->misc_table = q;
+ return v;
+}
+
+static void *lec_priv_walk(struct lec_state *state, loff_t *l,
+ struct lec_priv *priv)
+{
+ if (!state->locked) {
+ state->locked = priv;
+ spin_lock_irqsave(&priv->lec_arp_lock, state->flags);
+ }
+ if (!lec_arp_walk(state, l, priv) && !lec_misc_walk(state, l, priv)) {
+ spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags);
+ state->locked = NULL;
+ /* Partial state reset for the next time we get called */
+ state->arp_table = state->misc_table = 0;
+ }
+ return state->locked;
+}
+
+static void *lec_itf_walk(struct lec_state *state, loff_t *l)
+{
+ struct net_device *dev;
+ void *v;
+
+ dev = state->dev ? state->dev : dev_lec[state->itf];
+ v = (dev && netdev_priv(dev)) ?
+ lec_priv_walk(state, l, netdev_priv(dev)) : NULL;
+ if (!v && dev) {
+ dev_put(dev);
+ /* Partial state reset for the next time we get called */
+ dev = NULL;
+ }
+ state->dev = dev;
+ return v;
+}
+
+static void *lec_get_idx(struct lec_state *state, loff_t l)
+{
+ void *v = NULL;
+
+ for (; state->itf < MAX_LEC_ITF; state->itf++) {
+ v = lec_itf_walk(state, &l);
+ if (v)
+ break;
+ }
+ return v;
+}
+
+static void *lec_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct lec_state *state = seq->private;
+
+ state->itf = 0;
+ state->dev = NULL;
+ state->locked = NULL;
+ state->arp_table = 0;
+ state->misc_table = 0;
+ state->node = SEQ_START_TOKEN;
+
+ return *pos ? lec_get_idx(state, *pos) : SEQ_START_TOKEN;
+}
+
+static void lec_seq_stop(struct seq_file *seq, void *v)
+{
+ struct lec_state *state = seq->private;
+
+ if (state->dev) {
+ spin_unlock_irqrestore(&state->locked->lec_arp_lock,
+ state->flags);
+ dev_put(state->dev);
+ }
+}
+
+static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct lec_state *state = seq->private;
+
+ v = lec_get_idx(state, 1);
+ *pos += !!PTR_ERR(v);
+ return v;
+}
+
+static int lec_seq_show(struct seq_file *seq, void *v)
+{
+ static const char lec_banner[] =
+ "Itf MAC ATM destination"
+ " Status Flags "
+ "VPI/VCI Recv VPI/VCI\n";
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, lec_banner);
+ else {
+ struct lec_state *state = seq->private;
+ struct net_device *dev = state->dev;
+ struct lec_arp_table *entry = hlist_entry(state->node,
+ struct lec_arp_table,
+ next);
+
+ seq_printf(seq, "%s ", dev->name);
+ lec_info(seq, entry);
+ }
+ return 0;
+}
+
+static const struct seq_operations lec_seq_ops = {
+ .start = lec_seq_start,
+ .next = lec_seq_next,
+ .stop = lec_seq_stop,
+ .show = lec_seq_show,
+};
+
+static int lec_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &lec_seq_ops, sizeof(struct lec_state));
+}
+
+static const struct file_operations lec_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = lec_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int err = 0;
+
+ switch (cmd) {
+ case ATMLEC_CTRL:
+ case ATMLEC_MCAST:
+ case ATMLEC_DATA:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ switch (cmd) {
+ case ATMLEC_CTRL:
+ err = lecd_attach(vcc, (int)arg);
+ if (err >= 0)
+ sock->state = SS_CONNECTED;
+ break;
+ case ATMLEC_MCAST:
+ err = lec_mcast_attach(vcc, (int)arg);
+ break;
+ case ATMLEC_DATA:
+ err = lec_vcc_attach(vcc, (void __user *)arg);
+ break;
+ }
+
+ return err;
+}
+
+static struct atm_ioctl lane_ioctl_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = lane_ioctl,
+};
+
+static int __init lane_module_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *p;
+
+ p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops);
+ if (!p) {
+ pr_err("Unable to initialize /proc/net/atm/lec\n");
+ return -ENOMEM;
+ }
+#endif
+
+ register_atm_ioctl(&lane_ioctl_ops);
+ pr_info("lec.c: initialized\n");
+ return 0;
+}
+
+static void __exit lane_module_cleanup(void)
+{
+ int i;
+
+ remove_proc_entry("lec", atm_proc_root);
+
+ deregister_atm_ioctl(&lane_ioctl_ops);
+
+ for (i = 0; i < MAX_LEC_ITF; i++) {
+ if (dev_lec[i] != NULL) {
+ unregister_netdev(dev_lec[i]);
+ free_netdev(dev_lec[i]);
+ dev_lec[i] = NULL;
+ }
+ }
+}
+
+module_init(lane_module_init);
+module_exit(lane_module_cleanup);
+
+/*
+ * LANE2: 3.1.3, LE_RESOLVE.request
+ * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
+ * If sizeoftlvs == NULL the default TLVs associated with with this
+ * lec will be used.
+ * If dst_mac == NULL, targetless LE_ARP will be sent
+ */
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
+ u8 **tlvs, u32 *sizeoftlvs)
+{
+ unsigned long flags;
+ struct lec_priv *priv = netdev_priv(dev);
+ struct lec_arp_table *table;
+ struct sk_buff *skb;
+ int retval;
+
+ if (force == 0) {
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ table = lec_arp_find(priv, dst_mac);
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ if (table == NULL)
+ return -1;
+
+ *tlvs = kmemdup(table->tlvs, table->sizeoftlvs, GFP_ATOMIC);
+ if (*tlvs == NULL)
+ return -1;
+
+ *sizeoftlvs = table->sizeoftlvs;
+
+ return 0;
+ }
+
+ if (sizeoftlvs == NULL)
+ retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL);
+
+ else {
+ skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC);
+ if (skb == NULL)
+ return -1;
+ skb->len = *sizeoftlvs;
+ skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs);
+ retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb);
+ }
+ return retval;
+}
+
+/*
+ * LANE2: 3.1.4, LE_ASSOCIATE.request
+ * Associate the *tlvs with the *lan_dst address.
+ * Will overwrite any previous association
+ * Returns 1 for success, 0 for failure (out of memory)
+ *
+ */
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs)
+{
+ int retval;
+ struct sk_buff *skb;
+ struct lec_priv *priv = netdev_priv(dev);
+
+ if (!ether_addr_equal(lan_dst, dev->dev_addr))
+ return 0; /* not our mac address */
+
+ kfree(priv->tlvs); /* NULL if there was no previous association */
+
+ priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL);
+ if (priv->tlvs == NULL)
+ return 0;
+ priv->sizeoftlvs = sizeoftlvs;
+
+ skb = alloc_skb(sizeoftlvs, GFP_ATOMIC);
+ if (skb == NULL)
+ return 0;
+ skb->len = sizeoftlvs;
+ skb_copy_to_linear_data(skb, tlvs, sizeoftlvs);
+ retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb);
+ if (retval != 0)
+ pr_info("lec.c: lane2_associate_req() failed\n");
+ /*
+ * If the previous association has changed we must
+ * somehow notify other LANE entities about the change
+ */
+ return 1;
+}
+
+/*
+ * LANE2: 3.1.5, LE_ASSOCIATE.indication
+ *
+ */
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs)
+{
+#if 0
+ int i = 0;
+#endif
+ struct lec_priv *priv = netdev_priv(dev);
+#if 0 /*
+ * Why have the TLVs in LE_ARP entries
+ * since we do not use them? When you
+ * uncomment this code, make sure the
+ * TLVs get freed when entry is killed
+ */
+ struct lec_arp_table *entry = lec_arp_find(priv, mac_addr);
+
+ if (entry == NULL)
+ return; /* should not happen */
+
+ kfree(entry->tlvs);
+
+ entry->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL);
+ if (entry->tlvs == NULL)
+ return;
+ entry->sizeoftlvs = sizeoftlvs;
+#endif
+#if 0
+ pr_info("\n");
+ pr_info("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs);
+ while (i < sizeoftlvs)
+ pr_cont("%02x ", tlvs[i++]);
+
+ pr_cont("\n");
+#endif
+
+ /* tell MPOA about the TLVs we saw */
+ if (priv->lane2_ops && priv->lane2_ops->associate_indicator) {
+ priv->lane2_ops->associate_indicator(dev, mac_addr,
+ tlvs, sizeoftlvs);
+ }
+}
+
+/*
+ * Here starts what used to lec_arpc.c
+ *
+ * lec_arpc.c was added here when making
+ * lane client modular. October 1997
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/param.h>
+#include <linux/atomic.h>
+#include <linux/inetdevice.h>
+#include <net/route.h>
+
+#if 0
+#define pr_debug(format, args...)
+/*
+ #define pr_debug printk
+*/
+#endif
+#define DEBUG_ARP_TABLE 0
+
+#define LEC_ARP_REFRESH_INTERVAL (3*HZ)
+
+static void lec_arp_check_expire(struct work_struct *work);
+static void lec_arp_expire_arp(unsigned long data);
+
+/*
+ * Arp table funcs
+ */
+
+#define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE - 1))
+
+/*
+ * Initialization of arp-cache
+ */
+static void lec_arp_init(struct lec_priv *priv)
+{
+ unsigned short i;
+
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
+ INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
+ INIT_HLIST_HEAD(&priv->lec_no_forward);
+ INIT_HLIST_HEAD(&priv->mcast_fwds);
+ spin_lock_init(&priv->lec_arp_lock);
+ INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire);
+ schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
+}
+
+static void lec_arp_clear_vccs(struct lec_arp_table *entry)
+{
+ if (entry->vcc) {
+ struct atm_vcc *vcc = entry->vcc;
+ struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+ struct net_device *dev = (struct net_device *)vcc->proto_data;
+
+ vcc->pop = vpriv->old_pop;
+ if (vpriv->xoff)
+ netif_wake_queue(dev);
+ kfree(vpriv);
+ vcc->user_back = NULL;
+ vcc->push = entry->old_push;
+ vcc_release_async(vcc, -EPIPE);
+ entry->vcc = NULL;
+ }
+ if (entry->recv_vcc) {
+ entry->recv_vcc->push = entry->old_recv_push;
+ vcc_release_async(entry->recv_vcc, -EPIPE);
+ entry->recv_vcc = NULL;
+ }
+}
+
+/*
+ * Insert entry to lec_arp_table
+ * LANE2: Add to the end of the list to satisfy 8.1.13
+ */
+static inline void
+lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry)
+{
+ struct hlist_head *tmp;
+
+ tmp = &priv->lec_arp_tables[HASH(entry->mac_addr[ETH_ALEN - 1])];
+ hlist_add_head(&entry->next, tmp);
+
+ pr_debug("Added entry:%pM\n", entry->mac_addr);
+}
+
+/*
+ * Remove entry from lec_arp_table
+ */
+static int
+lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove)
+{
+ struct lec_arp_table *entry;
+ int i, remove_vcc = 1;
+
+ if (!to_remove)
+ return -1;
+
+ hlist_del(&to_remove->next);
+ del_timer(&to_remove->timer);
+
+ /*
+ * If this is the only MAC connected to this VCC,
+ * also tear down the VCC
+ */
+ if (to_remove->status >= ESI_FLUSH_PENDING) {
+ /*
+ * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT
+ */
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
+ if (memcmp(to_remove->atm_addr,
+ entry->atm_addr, ATM_ESA_LEN) == 0) {
+ remove_vcc = 0;
+ break;
+ }
+ }
+ }
+ if (remove_vcc)
+ lec_arp_clear_vccs(to_remove);
+ }
+ skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? */
+
+ pr_debug("Removed entry:%pM\n", to_remove->mac_addr);
+ return 0;
+}
+
+#if DEBUG_ARP_TABLE
+static const char *get_status_string(unsigned char st)
+{
+ switch (st) {
+ case ESI_UNKNOWN:
+ return "ESI_UNKNOWN";
+ case ESI_ARP_PENDING:
+ return "ESI_ARP_PENDING";
+ case ESI_VC_PENDING:
+ return "ESI_VC_PENDING";
+ case ESI_FLUSH_PENDING:
+ return "ESI_FLUSH_PENDING";
+ case ESI_FORWARD_DIRECT:
+ return "ESI_FORWARD_DIRECT";
+ }
+ return "<UNKNOWN>";
+}
+
+static void dump_arp_table(struct lec_priv *priv)
+{
+ struct lec_arp_table *rulla;
+ char buf[256];
+ int i, j, offset;
+
+ pr_info("Dump %p:\n", priv);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry(rulla,
+ &priv->lec_arp_tables[i], next) {
+ offset = 0;
+ offset += sprintf(buf, "%d: %p\n", i, rulla);
+ offset += sprintf(buf + offset, "Mac: %pM",
+ rulla->mac_addr);
+ offset += sprintf(buf + offset, " Atm:");
+ for (j = 0; j < ATM_ESA_LEN; j++) {
+ offset += sprintf(buf + offset,
+ "%2.2x ",
+ rulla->atm_addr[j] & 0xff);
+ }
+ offset += sprintf(buf + offset,
+ "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
+ rulla->vcc ? rulla->vcc->vpi : 0,
+ rulla->vcc ? rulla->vcc->vci : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->
+ vpi : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->
+ vci : 0, rulla->last_used,
+ rulla->timestamp, rulla->no_tries);
+ offset +=
+ sprintf(buf + offset,
+ "Flags:%x, Packets_flooded:%x, Status: %s ",
+ rulla->flags, rulla->packets_flooded,
+ get_status_string(rulla->status));
+ pr_info("%s\n", buf);
+ }
+ }
+
+ if (!hlist_empty(&priv->lec_no_forward))
+ pr_info("No forward\n");
+ hlist_for_each_entry(rulla, &priv->lec_no_forward, next) {
+ offset = 0;
+ offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
+ offset += sprintf(buf + offset, " Atm:");
+ for (j = 0; j < ATM_ESA_LEN; j++) {
+ offset += sprintf(buf + offset, "%2.2x ",
+ rulla->atm_addr[j] & 0xff);
+ }
+ offset += sprintf(buf + offset,
+ "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
+ rulla->vcc ? rulla->vcc->vpi : 0,
+ rulla->vcc ? rulla->vcc->vci : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vpi : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vci : 0,
+ rulla->last_used,
+ rulla->timestamp, rulla->no_tries);
+ offset += sprintf(buf + offset,
+ "Flags:%x, Packets_flooded:%x, Status: %s ",
+ rulla->flags, rulla->packets_flooded,
+ get_status_string(rulla->status));
+ pr_info("%s\n", buf);
+ }
+
+ if (!hlist_empty(&priv->lec_arp_empty_ones))
+ pr_info("Empty ones\n");
+ hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) {
+ offset = 0;
+ offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
+ offset += sprintf(buf + offset, " Atm:");
+ for (j = 0; j < ATM_ESA_LEN; j++) {
+ offset += sprintf(buf + offset, "%2.2x ",
+ rulla->atm_addr[j] & 0xff);
+ }
+ offset += sprintf(buf + offset,
+ "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
+ rulla->vcc ? rulla->vcc->vpi : 0,
+ rulla->vcc ? rulla->vcc->vci : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vpi : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vci : 0,
+ rulla->last_used,
+ rulla->timestamp, rulla->no_tries);
+ offset += sprintf(buf + offset,
+ "Flags:%x, Packets_flooded:%x, Status: %s ",
+ rulla->flags, rulla->packets_flooded,
+ get_status_string(rulla->status));
+ pr_info("%s", buf);
+ }
+
+ if (!hlist_empty(&priv->mcast_fwds))
+ pr_info("Multicast Forward VCCs\n");
+ hlist_for_each_entry(rulla, &priv->mcast_fwds, next) {
+ offset = 0;
+ offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
+ offset += sprintf(buf + offset, " Atm:");
+ for (j = 0; j < ATM_ESA_LEN; j++) {
+ offset += sprintf(buf + offset, "%2.2x ",
+ rulla->atm_addr[j] & 0xff);
+ }
+ offset += sprintf(buf + offset,
+ "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
+ rulla->vcc ? rulla->vcc->vpi : 0,
+ rulla->vcc ? rulla->vcc->vci : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vpi : 0,
+ rulla->recv_vcc ? rulla->recv_vcc->vci : 0,
+ rulla->last_used,
+ rulla->timestamp, rulla->no_tries);
+ offset += sprintf(buf + offset,
+ "Flags:%x, Packets_flooded:%x, Status: %s ",
+ rulla->flags, rulla->packets_flooded,
+ get_status_string(rulla->status));
+ pr_info("%s\n", buf);
+ }
+
+}
+#else
+#define dump_arp_table(priv) do { } while (0)
+#endif
+
+/*
+ * Destruction of arp-cache
+ */
+static void lec_arp_destroy(struct lec_priv *priv)
+{
+ unsigned long flags;
+ struct hlist_node *next;
+ struct lec_arp_table *entry;
+ int i;
+
+ cancel_delayed_work_sync(&priv->lec_arp_work);
+
+ /*
+ * Remove all entries
+ */
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ }
+ INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
+ }
+
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
+ del_timer_sync(&entry->timer);
+ lec_arp_clear_vccs(entry);
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
+
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_no_forward, next) {
+ del_timer_sync(&entry->timer);
+ lec_arp_clear_vccs(entry);
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ INIT_HLIST_HEAD(&priv->lec_no_forward);
+
+ hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) {
+ /* No timer, LANEv2 7.1.20 and 2.3.5.3 */
+ lec_arp_clear_vccs(entry);
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ INIT_HLIST_HEAD(&priv->mcast_fwds);
+ priv->mcast_vcc = NULL;
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * Find entry by mac_address
+ */
+static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
+ const unsigned char *mac_addr)
+{
+ struct hlist_head *head;
+ struct lec_arp_table *entry;
+
+ pr_debug("%pM\n", mac_addr);
+
+ head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])];
+ hlist_for_each_entry(entry, head, next) {
+ if (ether_addr_equal(mac_addr, entry->mac_addr))
+ return entry;
+ }
+ return NULL;
+}
+
+static struct lec_arp_table *make_entry(struct lec_priv *priv,
+ const unsigned char *mac_addr)
+{
+ struct lec_arp_table *to_return;
+
+ to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC);
+ if (!to_return) {
+ pr_info("LEC: Arp entry kmalloc failed\n");
+ return NULL;
+ }
+ ether_addr_copy(to_return->mac_addr, mac_addr);
+ INIT_HLIST_NODE(&to_return->next);
+ setup_timer(&to_return->timer, lec_arp_expire_arp,
+ (unsigned long)to_return);
+ to_return->last_used = jiffies;
+ to_return->priv = priv;
+ skb_queue_head_init(&to_return->tx_wait);
+ atomic_set(&to_return->usage, 1);
+ return to_return;
+}
+
+/* Arp sent timer expired */
+static void lec_arp_expire_arp(unsigned long data)
+{
+ struct lec_arp_table *entry;
+
+ entry = (struct lec_arp_table *)data;
+
+ pr_debug("\n");
+ if (entry->status == ESI_ARP_PENDING) {
+ if (entry->no_tries <= entry->priv->max_retry_count) {
+ if (entry->is_rdesc)
+ send_to_lecd(entry->priv, l_rdesc_arp_xmt,
+ entry->mac_addr, NULL, NULL);
+ else
+ send_to_lecd(entry->priv, l_arp_xmt,
+ entry->mac_addr, NULL, NULL);
+ entry->no_tries++;
+ }
+ mod_timer(&entry->timer, jiffies + (1 * HZ));
+ }
+}
+
+/* Unknown/unused vcc expire, remove associated entry */
+static void lec_arp_expire_vcc(unsigned long data)
+{
+ unsigned long flags;
+ struct lec_arp_table *to_remove = (struct lec_arp_table *)data;
+ struct lec_priv *priv = to_remove->priv;
+
+ del_timer(&to_remove->timer);
+
+ pr_debug("%p %p: vpi:%d vci:%d\n",
+ to_remove, priv,
+ to_remove->vcc ? to_remove->recv_vcc->vpi : 0,
+ to_remove->vcc ? to_remove->recv_vcc->vci : 0);
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ hlist_del(&to_remove->next);
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+ lec_arp_clear_vccs(to_remove);
+ lec_arp_put(to_remove);
+}
+
+static bool __lec_arp_check_expire(struct lec_arp_table *entry,
+ unsigned long now,
+ struct lec_priv *priv)
+{
+ unsigned long time_to_check;
+
+ if ((entry->flags) & LEC_REMOTE_FLAG && priv->topology_change)
+ time_to_check = priv->forward_delay_time;
+ else
+ time_to_check = priv->aging_time;
+
+ pr_debug("About to expire: %lx - %lx > %lx\n",
+ now, entry->last_used, time_to_check);
+ if (time_after(now, entry->last_used + time_to_check) &&
+ !(entry->flags & LEC_PERMANENT_FLAG) &&
+ !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */
+ /* Remove entry */
+ pr_debug("Entry timed out\n");
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ } else {
+ /* Something else */
+ if ((entry->status == ESI_VC_PENDING ||
+ entry->status == ESI_ARP_PENDING) &&
+ time_after_eq(now, entry->timestamp +
+ priv->max_unknown_frame_time)) {
+ entry->timestamp = jiffies;
+ entry->packets_flooded = 0;
+ if (entry->status == ESI_VC_PENDING)
+ send_to_lecd(priv, l_svc_setup,
+ entry->mac_addr,
+ entry->atm_addr,
+ NULL);
+ }
+ if (entry->status == ESI_FLUSH_PENDING &&
+ time_after_eq(now, entry->timestamp +
+ priv->path_switching_delay)) {
+ lec_arp_hold(entry);
+ return true;
+ }
+ }
+
+ return false;
+}
+/*
+ * Expire entries.
+ * 1. Re-set timer
+ * 2. For each entry, delete entries that have aged past the age limit.
+ * 3. For each entry, depending on the status of the entry, perform
+ * the following maintenance.
+ * a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the
+ * tick_count is above the max_unknown_frame_time, clear
+ * the tick_count to zero and clear the packets_flooded counter
+ * to zero. This supports the packet rate limit per address
+ * while flooding unknowns.
+ * b. If the status is ESI_FLUSH_PENDING and the tick_count is greater
+ * than or equal to the path_switching_delay, change the status
+ * to ESI_FORWARD_DIRECT. This causes the flush period to end
+ * regardless of the progress of the flush protocol.
+ */
+static void lec_arp_check_expire(struct work_struct *work)
+{
+ unsigned long flags;
+ struct lec_priv *priv =
+ container_of(work, struct lec_priv, lec_arp_work.work);
+ struct hlist_node *next;
+ struct lec_arp_table *entry;
+ unsigned long now;
+ int i;
+
+ pr_debug("%p\n", priv);
+ now = jiffies;
+restart:
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ if (__lec_arp_check_expire(entry, now, priv)) {
+ struct sk_buff *skb;
+ struct atm_vcc *vcc = entry->vcc;
+
+ spin_unlock_irqrestore(&priv->lec_arp_lock,
+ flags);
+ while ((skb = skb_dequeue(&entry->tx_wait)))
+ lec_send(vcc, skb);
+ entry->last_used = jiffies;
+ entry->status = ESI_FORWARD_DIRECT;
+ lec_arp_put(entry);
+
+ goto restart;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+ schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
+}
+
+/*
+ * Try to find vcc where mac_address is attached.
+ *
+ */
+static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
+ const unsigned char *mac_to_find,
+ int is_rdesc,
+ struct lec_arp_table **ret_entry)
+{
+ unsigned long flags;
+ struct lec_arp_table *entry;
+ struct atm_vcc *found;
+
+ if (mac_to_find[0] & 0x01) {
+ switch (priv->lane_version) {
+ case 1:
+ return priv->mcast_vcc;
+ case 2: /* LANE2 wants arp for multicast addresses */
+ if (ether_addr_equal(mac_to_find, bus_mac))
+ return priv->mcast_vcc;
+ break;
+ default:
+ break;
+ }
+ }
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ entry = lec_arp_find(priv, mac_to_find);
+
+ if (entry) {
+ if (entry->status == ESI_FORWARD_DIRECT) {
+ /* Connection Ok */
+ entry->last_used = jiffies;
+ lec_arp_hold(entry);
+ *ret_entry = entry;
+ found = entry->vcc;
+ goto out;
+ }
+ /*
+ * If the LE_ARP cache entry is still pending, reset count to 0
+ * so another LE_ARP request can be made for this frame.
+ */
+ if (entry->status == ESI_ARP_PENDING)
+ entry->no_tries = 0;
+ /*
+ * Data direct VC not yet set up, check to see if the unknown
+ * frame count is greater than the limit. If the limit has
+ * not been reached, allow the caller to send packet to
+ * BUS.
+ */
+ if (entry->status != ESI_FLUSH_PENDING &&
+ entry->packets_flooded <
+ priv->maximum_unknown_frame_count) {
+ entry->packets_flooded++;
+ pr_debug("Flooding..\n");
+ found = priv->mcast_vcc;
+ goto out;
+ }
+ /*
+ * We got here because entry->status == ESI_FLUSH_PENDING
+ * or BUS flood limit was reached for an entry which is
+ * in ESI_ARP_PENDING or ESI_VC_PENDING state.
+ */
+ lec_arp_hold(entry);
+ *ret_entry = entry;
+ pr_debug("entry->status %d entry->vcc %p\n", entry->status,
+ entry->vcc);
+ found = NULL;
+ } else {
+ /* No matching entry was found */
+ entry = make_entry(priv, mac_to_find);
+ pr_debug("Making entry\n");
+ if (!entry) {
+ found = priv->mcast_vcc;
+ goto out;
+ }
+ lec_arp_add(priv, entry);
+ /* We want arp-request(s) to be sent */
+ entry->packets_flooded = 1;
+ entry->status = ESI_ARP_PENDING;
+ entry->no_tries = 1;
+ entry->last_used = entry->timestamp = jiffies;
+ entry->is_rdesc = is_rdesc;
+ if (entry->is_rdesc)
+ send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL,
+ NULL);
+ else
+ send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL);
+ entry->timer.expires = jiffies + (1 * HZ);
+ entry->timer.function = lec_arp_expire_arp;
+ add_timer(&entry->timer);
+ found = priv->mcast_vcc;
+ }
+
+out:
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ return found;
+}
+
+static int
+lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
+ unsigned long permanent)
+{
+ unsigned long flags;
+ struct hlist_node *next;
+ struct lec_arp_table *entry;
+ int i;
+
+ pr_debug("\n");
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) &&
+ (permanent ||
+ !(entry->flags & LEC_PERMANENT_FLAG))) {
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ return 0;
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ return -1;
+}
+
+/*
+ * Notifies: Response to arp_request (atm_addr != NULL)
+ */
+static void
+lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+ const unsigned char *atm_addr, unsigned long remoteflag,
+ unsigned int targetless_le_arp)
+{
+ unsigned long flags;
+ struct hlist_node *next;
+ struct lec_arp_table *entry, *tmp;
+ int i;
+
+ pr_debug("%smac:%pM\n",
+ (targetless_le_arp) ? "targetless " : "", mac_addr);
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ entry = lec_arp_find(priv, mac_addr);
+ if (entry == NULL && targetless_le_arp)
+ goto out; /*
+ * LANE2: ignore targetless LE_ARPs for which
+ * we have no entry in the cache. 7.1.30
+ */
+ if (!hlist_empty(&priv->lec_arp_empty_ones)) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
+ if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) {
+ hlist_del(&entry->next);
+ del_timer(&entry->timer);
+ tmp = lec_arp_find(priv, mac_addr);
+ if (tmp) {
+ del_timer(&tmp->timer);
+ tmp->status = ESI_FORWARD_DIRECT;
+ memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN);
+ tmp->vcc = entry->vcc;
+ tmp->old_push = entry->old_push;
+ tmp->last_used = jiffies;
+ del_timer(&entry->timer);
+ lec_arp_put(entry);
+ entry = tmp;
+ } else {
+ entry->status = ESI_FORWARD_DIRECT;
+ ether_addr_copy(entry->mac_addr,
+ mac_addr);
+ entry->last_used = jiffies;
+ lec_arp_add(priv, entry);
+ }
+ if (remoteflag)
+ entry->flags |= LEC_REMOTE_FLAG;
+ else
+ entry->flags &= ~LEC_REMOTE_FLAG;
+ pr_debug("After update\n");
+ dump_arp_table(priv);
+ goto out;
+ }
+ }
+ }
+
+ entry = lec_arp_find(priv, mac_addr);
+ if (!entry) {
+ entry = make_entry(priv, mac_addr);
+ if (!entry)
+ goto out;
+ entry->status = ESI_UNKNOWN;
+ lec_arp_add(priv, entry);
+ /* Temporary, changes before end of function */
+ }
+ memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN);
+ del_timer(&entry->timer);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry(tmp,
+ &priv->lec_arp_tables[i], next) {
+ if (entry != tmp &&
+ !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) {
+ /* Vcc to this host exists */
+ if (tmp->status > ESI_VC_PENDING) {
+ /*
+ * ESI_FLUSH_PENDING,
+ * ESI_FORWARD_DIRECT
+ */
+ entry->vcc = tmp->vcc;
+ entry->old_push = tmp->old_push;
+ }
+ entry->status = tmp->status;
+ break;
+ }
+ }
+ }
+ if (remoteflag)
+ entry->flags |= LEC_REMOTE_FLAG;
+ else
+ entry->flags &= ~LEC_REMOTE_FLAG;
+ if (entry->status == ESI_ARP_PENDING || entry->status == ESI_UNKNOWN) {
+ entry->status = ESI_VC_PENDING;
+ send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL);
+ }
+ pr_debug("After update2\n");
+ dump_arp_table(priv);
+out:
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * Notifies: Vcc setup ready
+ */
+static void
+lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
+ struct atm_vcc *vcc,
+ void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb))
+{
+ unsigned long flags;
+ struct lec_arp_table *entry;
+ int i, found_entry = 0;
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */
+ if (ioc_data->receive == 2) {
+ pr_debug("LEC_ARP: Attaching mcast forward\n");
+#if 0
+ entry = lec_arp_find(priv, bus_mac);
+ if (!entry) {
+ pr_info("LEC_ARP: Multicast entry not found!\n");
+ goto out;
+ }
+ memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+ entry->recv_vcc = vcc;
+ entry->old_recv_push = old_push;
+#endif
+ entry = make_entry(priv, bus_mac);
+ if (entry == NULL)
+ goto out;
+ del_timer(&entry->timer);
+ memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+ entry->recv_vcc = vcc;
+ entry->old_recv_push = old_push;
+ hlist_add_head(&entry->next, &priv->mcast_fwds);
+ goto out;
+ } else if (ioc_data->receive == 1) {
+ /*
+ * Vcc which we don't want to make default vcc,
+ * attach it anyway.
+ */
+ pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
+ ioc_data->atm_addr[0], ioc_data->atm_addr[1],
+ ioc_data->atm_addr[2], ioc_data->atm_addr[3],
+ ioc_data->atm_addr[4], ioc_data->atm_addr[5],
+ ioc_data->atm_addr[6], ioc_data->atm_addr[7],
+ ioc_data->atm_addr[8], ioc_data->atm_addr[9],
+ ioc_data->atm_addr[10], ioc_data->atm_addr[11],
+ ioc_data->atm_addr[12], ioc_data->atm_addr[13],
+ ioc_data->atm_addr[14], ioc_data->atm_addr[15],
+ ioc_data->atm_addr[16], ioc_data->atm_addr[17],
+ ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ entry = make_entry(priv, bus_mac);
+ if (entry == NULL)
+ goto out;
+ memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+ eth_zero_addr(entry->mac_addr);
+ entry->recv_vcc = vcc;
+ entry->old_recv_push = old_push;
+ entry->status = ESI_UNKNOWN;
+ entry->timer.expires = jiffies + priv->vcc_timeout_period;
+ entry->timer.function = lec_arp_expire_vcc;
+ hlist_add_head(&entry->next, &priv->lec_no_forward);
+ add_timer(&entry->timer);
+ dump_arp_table(priv);
+ goto out;
+ }
+ pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
+ ioc_data->atm_addr[0], ioc_data->atm_addr[1],
+ ioc_data->atm_addr[2], ioc_data->atm_addr[3],
+ ioc_data->atm_addr[4], ioc_data->atm_addr[5],
+ ioc_data->atm_addr[6], ioc_data->atm_addr[7],
+ ioc_data->atm_addr[8], ioc_data->atm_addr[9],
+ ioc_data->atm_addr[10], ioc_data->atm_addr[11],
+ ioc_data->atm_addr[12], ioc_data->atm_addr[13],
+ ioc_data->atm_addr[14], ioc_data->atm_addr[15],
+ ioc_data->atm_addr[16], ioc_data->atm_addr[17],
+ ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
+ if (memcmp
+ (ioc_data->atm_addr, entry->atm_addr,
+ ATM_ESA_LEN) == 0) {
+ pr_debug("LEC_ARP: Attaching data direct\n");
+ pr_debug("Currently -> Vcc: %d, Rvcc:%d\n",
+ entry->vcc ? entry->vcc->vci : 0,
+ entry->recv_vcc ? entry->recv_vcc->
+ vci : 0);
+ found_entry = 1;
+ del_timer(&entry->timer);
+ entry->vcc = vcc;
+ entry->old_push = old_push;
+ if (entry->status == ESI_VC_PENDING) {
+ if (priv->maximum_unknown_frame_count
+ == 0)
+ entry->status =
+ ESI_FORWARD_DIRECT;
+ else {
+ entry->timestamp = jiffies;
+ entry->status =
+ ESI_FLUSH_PENDING;
+#if 0
+ send_to_lecd(priv, l_flush_xmt,
+ NULL,
+ entry->atm_addr,
+ NULL);
+#endif
+ }
+ } else {
+ /*
+ * They were forming a connection
+ * to us, and we to them. Our
+ * ATM address is numerically lower
+ * than theirs, so we make connection
+ * we formed into default VCC (8.1.11).
+ * Connection they made gets torn
+ * down. This might confuse some
+ * clients. Can be changed if
+ * someone reports trouble...
+ */
+ ;
+ }
+ }
+ }
+ }
+ if (found_entry) {
+ pr_debug("After vcc was added\n");
+ dump_arp_table(priv);
+ goto out;
+ }
+ /*
+ * Not found, snatch address from first data packet that arrives
+ * from this vcc
+ */
+ entry = make_entry(priv, bus_mac);
+ if (!entry)
+ goto out;
+ entry->vcc = vcc;
+ entry->old_push = old_push;
+ memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+ eth_zero_addr(entry->mac_addr);
+ entry->status = ESI_UNKNOWN;
+ hlist_add_head(&entry->next, &priv->lec_arp_empty_ones);
+ entry->timer.expires = jiffies + priv->vcc_timeout_period;
+ entry->timer.function = lec_arp_expire_vcc;
+ add_timer(&entry->timer);
+ pr_debug("After vcc was added\n");
+ dump_arp_table(priv);
+out:
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id)
+{
+ unsigned long flags;
+ struct lec_arp_table *entry;
+ int i;
+
+ pr_debug("%lx\n", tran_id);
+restart:
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
+ if (entry->flush_tran_id == tran_id &&
+ entry->status == ESI_FLUSH_PENDING) {
+ struct sk_buff *skb;
+ struct atm_vcc *vcc = entry->vcc;
+
+ lec_arp_hold(entry);
+ spin_unlock_irqrestore(&priv->lec_arp_lock,
+ flags);
+ while ((skb = skb_dequeue(&entry->tx_wait)))
+ lec_send(vcc, skb);
+ entry->last_used = jiffies;
+ entry->status = ESI_FORWARD_DIRECT;
+ lec_arp_put(entry);
+ pr_debug("LEC_ARP: Flushed\n");
+ goto restart;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ dump_arp_table(priv);
+}
+
+static void
+lec_set_flush_tran_id(struct lec_priv *priv,
+ const unsigned char *atm_addr, unsigned long tran_id)
+{
+ unsigned long flags;
+ struct lec_arp_table *entry;
+ int i;
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++)
+ hlist_for_each_entry(entry,
+ &priv->lec_arp_tables[i], next) {
+ if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) {
+ entry->flush_tran_id = tran_id;
+ pr_debug("Set flush transaction id to %lx for %p\n",
+ tran_id, entry);
+ }
+ }
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc)
+{
+ unsigned long flags;
+ unsigned char mac_addr[] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ };
+ struct lec_arp_table *to_add;
+ struct lec_vcc_priv *vpriv;
+ int err = 0;
+
+ vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+ if (!vpriv)
+ return -ENOMEM;
+ vpriv->xoff = 0;
+ vpriv->old_pop = vcc->pop;
+ vcc->user_back = vpriv;
+ vcc->pop = lec_pop;
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ to_add = make_entry(priv, mac_addr);
+ if (!to_add) {
+ vcc->pop = vpriv->old_pop;
+ kfree(vpriv);
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN);
+ to_add->status = ESI_FORWARD_DIRECT;
+ to_add->flags |= LEC_PERMANENT_FLAG;
+ to_add->vcc = vcc;
+ to_add->old_push = vcc->push;
+ vcc->push = lec_push;
+ priv->mcast_vcc = vcc;
+ lec_arp_add(priv, to_add);
+out:
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ return err;
+}
+
+static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc)
+{
+ unsigned long flags;
+ struct hlist_node *next;
+ struct lec_arp_table *entry;
+ int i;
+
+ pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci);
+ dump_arp_table(priv);
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+
+ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_tables[i], next) {
+ if (vcc == entry->vcc) {
+ lec_arp_remove(priv, entry);
+ lec_arp_put(entry);
+ if (priv->mcast_vcc == vcc)
+ priv->mcast_vcc = NULL;
+ }
+ }
+ }
+
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
+ if (entry->vcc == vcc) {
+ lec_arp_clear_vccs(entry);
+ del_timer(&entry->timer);
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ }
+
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_no_forward, next) {
+ if (entry->recv_vcc == vcc) {
+ lec_arp_clear_vccs(entry);
+ del_timer(&entry->timer);
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ }
+
+ hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) {
+ if (entry->recv_vcc == vcc) {
+ lec_arp_clear_vccs(entry);
+ /* No timer, LANEv2 7.1.20 and 2.3.5.3 */
+ hlist_del(&entry->next);
+ lec_arp_put(entry);
+ }
+ }
+
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+ dump_arp_table(priv);
+}
+
+static void
+lec_arp_check_empties(struct lec_priv *priv,
+ struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct hlist_node *next;
+ struct lec_arp_table *entry, *tmp;
+ struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data;
+ unsigned char *src = hdr->h_source;
+
+ spin_lock_irqsave(&priv->lec_arp_lock, flags);
+ hlist_for_each_entry_safe(entry, next,
+ &priv->lec_arp_empty_ones, next) {
+ if (vcc == entry->vcc) {
+ del_timer(&entry->timer);
+ ether_addr_copy(entry->mac_addr, src);
+ entry->status = ESI_FORWARD_DIRECT;
+ entry->last_used = jiffies;
+ /* We might have got an entry */
+ tmp = lec_arp_find(priv, src);
+ if (tmp) {
+ lec_arp_remove(priv, tmp);
+ lec_arp_put(tmp);
+ }
+ hlist_del(&entry->next);
+ lec_arp_add(priv, entry);
+ goto out;
+ }
+ }
+ pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n");
+out:
+ spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/atm/lec.h b/net/atm/lec.h
new file mode 100644
index 000000000..4149db1b7
--- /dev/null
+++ b/net/atm/lec.h
@@ -0,0 +1,154 @@
+/*
+ * Lan Emulation client header file
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+
+#ifndef _LEC_H_
+#define _LEC_H_
+
+#include <linux/atmdev.h>
+#include <linux/netdevice.h>
+#include <linux/atmlec.h>
+
+#define LEC_HEADER_LEN 16
+
+struct lecdatahdr_8023 {
+ __be16 le_header;
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+ __be16 h_type;
+};
+
+struct lecdatahdr_8025 {
+ __be16 le_header;
+ unsigned char ac_pad;
+ unsigned char fc;
+ unsigned char h_dest[ETH_ALEN];
+ unsigned char h_source[ETH_ALEN];
+};
+
+#define LEC_MINIMUM_8023_SIZE 62
+#define LEC_MINIMUM_8025_SIZE 16
+
+/*
+ * Operations that LANE2 capable device can do. Two first functions
+ * are used to make the device do things. See spec 3.1.3 and 3.1.4.
+ *
+ * The third function is intended for the MPOA component sitting on
+ * top of the LANE device. The MPOA component assigns it's own function
+ * to (*associate_indicator)() and the LANE device will use that
+ * function to tell about TLVs it sees floating through.
+ *
+ */
+struct lane2_ops {
+ int (*resolve) (struct net_device *dev, const u8 *dst_mac, int force,
+ u8 **tlvs, u32 *sizeoftlvs);
+ int (*associate_req) (struct net_device *dev, const u8 *lan_dst,
+ const u8 *tlvs, u32 sizeoftlvs);
+ void (*associate_indicator) (struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs);
+};
+
+/*
+ * ATM LAN Emulation supports both LLC & Dix Ethernet EtherType
+ * frames.
+ *
+ * 1. Dix Ethernet EtherType frames encoded by placing EtherType
+ * field in h_type field. Data follows immediately after header.
+ * 2. LLC Data frames whose total length, including LLC field and data,
+ * but not padding required to meet the minimum data frame length,
+ * is less than ETH_P_802_3_MIN MUST be encoded by placing that length
+ * in the h_type field. The LLC field follows header immediately.
+ * 3. LLC data frames longer than this maximum MUST be encoded by placing
+ * the value 0 in the h_type field.
+ *
+ */
+
+/* Hash table size */
+#define LEC_ARP_TABLE_SIZE 16
+
+struct lec_priv {
+ unsigned short lecid; /* Lecid of this client */
+ struct hlist_head lec_arp_empty_ones;
+ /* Used for storing VCC's that don't have a MAC address attached yet */
+ struct hlist_head lec_arp_tables[LEC_ARP_TABLE_SIZE];
+ /* Actual LE ARP table */
+ struct hlist_head lec_no_forward;
+ /*
+ * Used for storing VCC's (and forward packets from) which are to
+ * age out by not using them to forward packets.
+ * This is because to some LE clients there will be 2 VCCs. Only
+ * one of them gets used.
+ */
+ struct hlist_head mcast_fwds;
+ /*
+ * With LANEv2 it is possible that BUS (or a special multicast server)
+ * establishes multiple Multicast Forward VCCs to us. This list
+ * collects all those VCCs. LANEv1 client has only one item in this
+ * list. These entries are not aged out.
+ */
+ spinlock_t lec_arp_lock;
+ struct atm_vcc *mcast_vcc; /* Default Multicast Send VCC */
+ struct atm_vcc *lecd;
+ struct delayed_work lec_arp_work; /* C10 */
+ unsigned int maximum_unknown_frame_count;
+ /*
+ * Within the period of time defined by this variable, the client will send
+ * no more than C10 frames to BUS for a given unicast destination. (C11)
+ */
+ unsigned long max_unknown_frame_time;
+ /*
+ * If no traffic has been sent in this vcc for this period of time,
+ * vcc will be torn down (C12)
+ */
+ unsigned long vcc_timeout_period;
+ /*
+ * An LE Client MUST not retry an LE_ARP_REQUEST for a
+ * given frame's LAN Destination more than maximum retry count times,
+ * after the first LEC_ARP_REQUEST (C13)
+ */
+ unsigned short max_retry_count;
+ /*
+ * Max time the client will maintain an entry in its arp cache in
+ * absence of a verification of that relationship (C17)
+ */
+ unsigned long aging_time;
+ /*
+ * Max time the client will maintain an entry in cache when
+ * topology change flag is true (C18)
+ */
+ unsigned long forward_delay_time; /* Topology change flag (C19) */
+ int topology_change;
+ /*
+ * Max time the client expects an LE_ARP_REQUEST/LE_ARP_RESPONSE
+ * cycle to take (C20)
+ */
+ unsigned long arp_response_time;
+ /*
+ * Time limit ot wait to receive an LE_FLUSH_RESPONSE after the
+ * LE_FLUSH_REQUEST has been sent before taking recover action. (C21)
+ */
+ unsigned long flush_timeout;
+ /* The time since sending a frame to the bus after which the
+ * LE Client may assume that the frame has been either discarded or
+ * delivered to the recipient (C22)
+ */
+ unsigned long path_switching_delay;
+
+ u8 *tlvs; /* LANE2: TLVs are new */
+ u32 sizeoftlvs; /* The size of the tlv array in bytes */
+ int lane_version; /* LANE2 */
+ int itfnum; /* e.g. 2 for lec2, 5 for lec5 */
+ struct lane2_ops *lane2_ops; /* can be NULL for LANE v1 */
+ int is_proxy; /* bridge between ATM and Ethernet */
+};
+
+struct lec_vcc_priv {
+ void (*old_pop) (struct atm_vcc *vcc, struct sk_buff *skb);
+ int xoff;
+};
+
+#define LEC_VCC_PRIV(vcc) ((struct lec_vcc_priv *)((vcc)->user_back))
+
+#endif /* _LEC_H_ */
diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h
new file mode 100644
index 000000000..ec67435a4
--- /dev/null
+++ b/net/atm/lec_arpc.h
@@ -0,0 +1,96 @@
+/*
+ * Lec arp cache
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+#ifndef _LEC_ARP_H_
+#define _LEC_ARP_H_
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/if_ether.h>
+#include <linux/atmlec.h>
+
+struct lec_arp_table {
+ struct hlist_node next; /* Linked entry list */
+ unsigned char atm_addr[ATM_ESA_LEN]; /* Atm address */
+ unsigned char mac_addr[ETH_ALEN]; /* Mac address */
+ int is_rdesc; /* Mac address is a route descriptor */
+ struct atm_vcc *vcc; /* Vcc this entry is attached */
+ struct atm_vcc *recv_vcc; /* Vcc we receive data from */
+
+ void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb);
+ /* Push that leads to daemon */
+
+ void (*old_recv_push) (struct atm_vcc *vcc, struct sk_buff *skb);
+ /* Push that leads to daemon */
+
+ unsigned long last_used; /* For expiry */
+ unsigned long timestamp; /* Used for various timestamping things:
+ * 1. FLUSH started
+ * (status=ESI_FLUSH_PENDING)
+ * 2. Counting to
+ * max_unknown_frame_time
+ * (status=ESI_ARP_PENDING||
+ * status=ESI_VC_PENDING)
+ */
+ unsigned char no_tries; /* No of times arp retry has been tried */
+ unsigned char status; /* Status of this entry */
+ unsigned short flags; /* Flags for this entry */
+ unsigned short packets_flooded; /* Data packets flooded */
+ unsigned long flush_tran_id; /* Transaction id in flush protocol */
+ struct timer_list timer; /* Arping timer */
+ struct lec_priv *priv; /* Pointer back */
+ u8 *tlvs;
+ u32 sizeoftlvs; /*
+ * LANE2: Each MAC address can have TLVs
+ * associated with it. sizeoftlvs tells the
+ * the length of the tlvs array
+ */
+ struct sk_buff_head tx_wait; /* wait queue for outgoing packets */
+ atomic_t usage; /* usage count */
+};
+
+/*
+ * LANE2: Template tlv struct for accessing
+ * the tlvs in the lec_arp_table->tlvs array
+ */
+struct tlv {
+ u32 type;
+ u8 length;
+ u8 value[255];
+};
+
+/* Status fields */
+#define ESI_UNKNOWN 0 /*
+ * Next packet sent to this mac address
+ * causes ARP-request to be sent
+ */
+#define ESI_ARP_PENDING 1 /*
+ * There is no ATM address associated with this
+ * 48-bit address. The LE-ARP protocol is in
+ * progress.
+ */
+#define ESI_VC_PENDING 2 /*
+ * There is a valid ATM address associated with
+ * this 48-bit address but there is no VC set
+ * up to that ATM address. The signaling
+ * protocol is in process.
+ */
+#define ESI_FLUSH_PENDING 4 /*
+ * The LEC has been notified of the FLUSH_START
+ * status and it is assumed that the flush
+ * protocol is in process.
+ */
+#define ESI_FORWARD_DIRECT 5 /*
+ * Either the Path Switching Delay (C22) has
+ * elapsed or the LEC has notified the Mapping
+ * that the flush protocol has completed. In
+ * either case, it is safe to forward packets
+ * to this address via the data direct VC.
+ */
+
+/* Flag values */
+#define LEC_REMOTE_FLAG 0x0001
+#define LEC_PERMANENT_FLAG 0x0002
+
+#endif /* _LEC_ARP_H_ */
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
new file mode 100644
index 000000000..0e982222d
--- /dev/null
+++ b/net/atm/mpc.c
@@ -0,0 +1,1534 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+
+/* We are an ethernet device */
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/uaccess.h>
+#include <asm/byteorder.h>
+#include <net/checksum.h> /* for ip_fast_csum() */
+#include <net/arp.h>
+#include <net/dst.h>
+#include <linux/proc_fs.h>
+
+/* And atm device */
+#include <linux/atmdev.h>
+#include <linux/atmlec.h>
+#include <linux/atmmpc.h>
+/* Modular too */
+#include <linux/module.h>
+
+#include "lec.h"
+#include "mpc.h"
+#include "resources.h"
+
+/*
+ * mpc.c: Implementation of MPOA client kernel part
+ */
+
+#if 0
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define dprintk_cont(format, args...) printk(KERN_CONT format, ##args)
+#else
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+ } while (0)
+#define dprintk_cont(format, args...) \
+ do { if (0) printk(KERN_CONT format, ##args); } while (0)
+#endif
+
+#if 0
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define ddprintk_cont(format, args...) printk(KERN_CONT format, ##args)
+#else
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+ } while (0)
+#define ddprintk_cont(format, args...) \
+ do { if (0) printk(KERN_CONT format, ##args); } while (0)
+#endif
+
+/* mpc_daemon -> kernel */
+static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void mps_death(struct k_message *msg, struct mpoa_client *mpc);
+static void clean_up(struct k_message *msg, struct mpoa_client *mpc,
+ int action);
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+ struct mpoa_client *mpc);
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc);
+static void set_mps_mac_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc);
+
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+ const uint8_t *router_mac,
+ const uint8_t *tlvs, uint8_t mps_macs,
+ uint8_t device_type);
+static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry);
+
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc);
+static void mpoad_close(struct atm_vcc *vcc);
+static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb);
+
+static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb);
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+ struct net_device *dev);
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+ unsigned long event, void *dev);
+static void mpc_timer_refresh(void);
+static void mpc_cache_check(unsigned long checking_time);
+
+static struct llc_snap_hdr llc_snap_mpoa_ctrl = {
+ 0xaa, 0xaa, 0x03,
+ {0x00, 0x00, 0x5e},
+ {0x00, 0x03} /* For MPOA control PDUs */
+};
+static struct llc_snap_hdr llc_snap_mpoa_data = {
+ 0xaa, 0xaa, 0x03,
+ {0x00, 0x00, 0x00},
+ {0x08, 0x00} /* This is for IP PDUs only */
+};
+static struct llc_snap_hdr llc_snap_mpoa_data_tagged = {
+ 0xaa, 0xaa, 0x03,
+ {0x00, 0x00, 0x00},
+ {0x88, 0x4c} /* This is for tagged data PDUs */
+};
+
+static struct notifier_block mpoa_notifier = {
+ mpoa_event_listener,
+ NULL,
+ 0
+};
+
+struct mpoa_client *mpcs = NULL; /* FIXME */
+static struct atm_mpoa_qos *qos_head = NULL;
+static DEFINE_TIMER(mpc_timer, NULL, 0, 0);
+
+
+static struct mpoa_client *find_mpc_by_itfnum(int itf)
+{
+ struct mpoa_client *mpc;
+
+ mpc = mpcs; /* our global linked list */
+ while (mpc != NULL) {
+ if (mpc->dev_num == itf)
+ return mpc;
+ mpc = mpc->next;
+ }
+
+ return NULL; /* not found */
+}
+
+static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc)
+{
+ struct mpoa_client *mpc;
+
+ mpc = mpcs; /* our global linked list */
+ while (mpc != NULL) {
+ if (mpc->mpoad_vcc == vcc)
+ return mpc;
+ mpc = mpc->next;
+ }
+
+ return NULL; /* not found */
+}
+
+static struct mpoa_client *find_mpc_by_lec(struct net_device *dev)
+{
+ struct mpoa_client *mpc;
+
+ mpc = mpcs; /* our global linked list */
+ while (mpc != NULL) {
+ if (mpc->dev == dev)
+ return mpc;
+ mpc = mpc->next;
+ }
+
+ return NULL; /* not found */
+}
+
+/*
+ * Functions for managing QoS list
+ */
+
+/*
+ * Overwrites the old entry or makes a new one.
+ */
+struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos)
+{
+ struct atm_mpoa_qos *entry;
+
+ entry = atm_mpoa_search_qos(dst_ip);
+ if (entry != NULL) {
+ entry->qos = *qos;
+ return entry;
+ }
+
+ entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL);
+ if (entry == NULL) {
+ pr_info("mpoa: out of memory\n");
+ return entry;
+ }
+
+ entry->ipaddr = dst_ip;
+ entry->qos = *qos;
+
+ entry->next = qos_head;
+ qos_head = entry;
+
+ return entry;
+}
+
+struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip)
+{
+ struct atm_mpoa_qos *qos;
+
+ qos = qos_head;
+ while (qos) {
+ if (qos->ipaddr == dst_ip)
+ break;
+ qos = qos->next;
+ }
+
+ return qos;
+}
+
+/*
+ * Returns 0 for failure
+ */
+int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry)
+{
+ struct atm_mpoa_qos *curr;
+
+ if (entry == NULL)
+ return 0;
+ if (entry == qos_head) {
+ qos_head = qos_head->next;
+ kfree(entry);
+ return 1;
+ }
+
+ curr = qos_head;
+ while (curr != NULL) {
+ if (curr->next == entry) {
+ curr->next = entry->next;
+ kfree(entry);
+ return 1;
+ }
+ curr = curr->next;
+ }
+
+ return 0;
+}
+
+/* this is buggered - we need locking for qos_head */
+void atm_mpoa_disp_qos(struct seq_file *m)
+{
+ struct atm_mpoa_qos *qos;
+
+ qos = qos_head;
+ seq_printf(m, "QoS entries for shortcuts:\n");
+ seq_printf(m, "IP address\n TX:max_pcr pcr min_pcr max_cdv max_sdu\n RX:max_pcr pcr min_pcr max_cdv max_sdu\n");
+
+ while (qos != NULL) {
+ seq_printf(m, "%pI4\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n",
+ &qos->ipaddr,
+ qos->qos.txtp.max_pcr,
+ qos->qos.txtp.pcr,
+ qos->qos.txtp.min_pcr,
+ qos->qos.txtp.max_cdv,
+ qos->qos.txtp.max_sdu,
+ qos->qos.rxtp.max_pcr,
+ qos->qos.rxtp.pcr,
+ qos->qos.rxtp.min_pcr,
+ qos->qos.rxtp.max_cdv,
+ qos->qos.rxtp.max_sdu);
+ qos = qos->next;
+ }
+}
+
+static struct net_device *find_lec_by_itfnum(int itf)
+{
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ sprintf(name, "lec%d", itf);
+ dev = dev_get_by_name(&init_net, name);
+
+ return dev;
+}
+
+static struct mpoa_client *alloc_mpc(void)
+{
+ struct mpoa_client *mpc;
+
+ mpc = kzalloc(sizeof(struct mpoa_client), GFP_KERNEL);
+ if (mpc == NULL)
+ return NULL;
+ rwlock_init(&mpc->ingress_lock);
+ rwlock_init(&mpc->egress_lock);
+ mpc->next = mpcs;
+ atm_mpoa_init_cache(mpc);
+
+ mpc->parameters.mpc_p1 = MPC_P1;
+ mpc->parameters.mpc_p2 = MPC_P2;
+ memset(mpc->parameters.mpc_p3, 0, sizeof(mpc->parameters.mpc_p3));
+ mpc->parameters.mpc_p4 = MPC_P4;
+ mpc->parameters.mpc_p5 = MPC_P5;
+ mpc->parameters.mpc_p6 = MPC_P6;
+
+ mpcs = mpc;
+
+ return mpc;
+}
+
+/*
+ *
+ * start_mpc() puts the MPC on line. All the packets destined
+ * to the lec underneath us are now being monitored and
+ * shortcuts will be established.
+ *
+ */
+static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
+{
+
+ dprintk("(%s)\n", mpc->dev->name);
+ if (!dev->netdev_ops)
+ pr_info("(%s) not starting\n", dev->name);
+ else {
+ mpc->old_ops = dev->netdev_ops;
+ mpc->new_ops = *mpc->old_ops;
+ mpc->new_ops.ndo_start_xmit = mpc_send_packet;
+ dev->netdev_ops = &mpc->new_ops;
+ }
+}
+
+static void stop_mpc(struct mpoa_client *mpc)
+{
+ struct net_device *dev = mpc->dev;
+ dprintk("(%s)", mpc->dev->name);
+
+ /* Lets not nullify lec device's dev->hard_start_xmit */
+ if (dev->netdev_ops != &mpc->new_ops) {
+ dprintk_cont(" mpc already stopped, not fatal\n");
+ return;
+ }
+ dprintk_cont("\n");
+
+ dev->netdev_ops = mpc->old_ops;
+ mpc->old_ops = NULL;
+
+ /* close_shortcuts(mpc); ??? FIXME */
+}
+
+static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
+
+static const char *mpoa_device_type_string(char type)
+{
+ switch (type) {
+ case NON_MPOA:
+ return "non-MPOA device";
+ case MPS:
+ return "MPS";
+ case MPC:
+ return "MPC";
+ case MPS_AND_MPC:
+ return "both MPS and MPC";
+ }
+
+ return "unspecified (non-MPOA) device";
+}
+
+/*
+ * lec device calls this via its netdev_priv(dev)->lane2_ops
+ * ->associate_indicator() when it sees a TLV in LE_ARP packet.
+ * We fill in the pointer above when we see a LANE2 lec initializing
+ * See LANE2 spec 3.1.5
+ *
+ * Quite a big and ugly function but when you look at it
+ * all it does is to try to locate and parse MPOA Device
+ * Type TLV.
+ * We give our lec a pointer to this function and when the
+ * lec sees a TLV it uses the pointer to call this function.
+ *
+ */
+static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
+ const u8 *tlvs, u32 sizeoftlvs)
+{
+ uint32_t type;
+ uint8_t length, mpoa_device_type, number_of_mps_macs;
+ const uint8_t *end_of_tlvs;
+ struct mpoa_client *mpc;
+
+ mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */
+ dprintk("(%s) received TLV(s), ", dev->name);
+ dprintk("total length of all TLVs %d\n", sizeoftlvs);
+ mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */
+ if (mpc == NULL) {
+ pr_info("(%s) no mpc\n", dev->name);
+ return;
+ }
+ end_of_tlvs = tlvs + sizeoftlvs;
+ while (end_of_tlvs - tlvs >= 5) {
+ type = ((tlvs[0] << 24) | (tlvs[1] << 16) |
+ (tlvs[2] << 8) | tlvs[3]);
+ length = tlvs[4];
+ tlvs += 5;
+ dprintk(" type 0x%x length %02x\n", type, length);
+ if (tlvs + length > end_of_tlvs) {
+ pr_info("TLV value extends past its buffer, aborting parse\n");
+ return;
+ }
+
+ if (type == 0) {
+ pr_info("mpoa: (%s) TLV type was 0, returning\n",
+ dev->name);
+ return;
+ }
+
+ if (type != TLV_MPOA_DEVICE_TYPE) {
+ tlvs += length;
+ continue; /* skip other TLVs */
+ }
+ mpoa_device_type = *tlvs++;
+ number_of_mps_macs = *tlvs++;
+ dprintk("(%s) MPOA device type '%s', ",
+ dev->name, mpoa_device_type_string(mpoa_device_type));
+ if (mpoa_device_type == MPS_AND_MPC &&
+ length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */
+ pr_info("(%s) short MPOA Device Type TLV\n",
+ dev->name);
+ continue;
+ }
+ if ((mpoa_device_type == MPS || mpoa_device_type == MPC) &&
+ length < 22 + number_of_mps_macs*ETH_ALEN) {
+ pr_info("(%s) short MPOA Device Type TLV\n", dev->name);
+ continue;
+ }
+ if (mpoa_device_type != MPS &&
+ mpoa_device_type != MPS_AND_MPC) {
+ dprintk("ignoring non-MPS device ");
+ if (mpoa_device_type == MPC)
+ tlvs += 20;
+ continue; /* we are only interested in MPSs */
+ }
+ if (number_of_mps_macs == 0 &&
+ mpoa_device_type == MPS_AND_MPC) {
+ pr_info("(%s) MPS_AND_MPC has zero MACs\n", dev->name);
+ continue; /* someone should read the spec */
+ }
+ dprintk_cont("this MPS has %d MAC addresses\n",
+ number_of_mps_macs);
+
+ /*
+ * ok, now we can go and tell our daemon
+ * the control address of MPS
+ */
+ send_set_mps_ctrl_addr(tlvs, mpc);
+
+ tlvs = copy_macs(mpc, mac_addr, tlvs,
+ number_of_mps_macs, mpoa_device_type);
+ if (tlvs == NULL)
+ return;
+ }
+ if (end_of_tlvs - tlvs != 0)
+ pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+ dev->name, end_of_tlvs - tlvs);
+}
+
+/*
+ * Store at least advertizing router's MAC address
+ * plus the possible MAC address(es) to mpc->mps_macs.
+ * For a freshly allocated MPOA client mpc->mps_macs == 0.
+ */
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+ const uint8_t *router_mac,
+ const uint8_t *tlvs, uint8_t mps_macs,
+ uint8_t device_type)
+{
+ int num_macs;
+ num_macs = (mps_macs > 1) ? mps_macs : 1;
+
+ if (mpc->number_of_mps_macs != num_macs) { /* need to reallocate? */
+ if (mpc->number_of_mps_macs != 0)
+ kfree(mpc->mps_macs);
+ mpc->number_of_mps_macs = 0;
+ mpc->mps_macs = kmalloc(num_macs * ETH_ALEN, GFP_KERNEL);
+ if (mpc->mps_macs == NULL) {
+ pr_info("(%s) out of mem\n", mpc->dev->name);
+ return NULL;
+ }
+ }
+ ether_addr_copy(mpc->mps_macs, router_mac);
+ tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20;
+ if (mps_macs > 0)
+ memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN);
+ tlvs += mps_macs*ETH_ALEN;
+ mpc->number_of_mps_macs = num_macs;
+
+ return tlvs;
+}
+
+static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
+{
+ in_cache_entry *entry;
+ struct iphdr *iph;
+ char *buff;
+ __be32 ipaddr = 0;
+
+ static struct {
+ struct llc_snap_hdr hdr;
+ __be32 tag;
+ } tagged_llc_snap_hdr = {
+ {0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c}},
+ 0
+ };
+
+ buff = skb->data + mpc->dev->hard_header_len;
+ iph = (struct iphdr *)buff;
+ ipaddr = iph->daddr;
+
+ ddprintk("(%s) ipaddr 0x%x\n",
+ mpc->dev->name, ipaddr);
+
+ entry = mpc->in_ops->get(ipaddr, mpc);
+ if (entry == NULL) {
+ entry = mpc->in_ops->add_entry(ipaddr, mpc);
+ if (entry != NULL)
+ mpc->in_ops->put(entry);
+ return 1;
+ }
+ /* threshold not exceeded or VCC not ready */
+ if (mpc->in_ops->cache_hit(entry, mpc) != OPEN) {
+ ddprintk("(%s) cache_hit: returns != OPEN\n",
+ mpc->dev->name);
+ mpc->in_ops->put(entry);
+ return 1;
+ }
+
+ ddprintk("(%s) using shortcut\n",
+ mpc->dev->name);
+ /* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */
+ if (iph->ttl <= 1) {
+ ddprintk("(%s) IP ttl = %u, using LANE\n",
+ mpc->dev->name, iph->ttl);
+ mpc->in_ops->put(entry);
+ return 1;
+ }
+ iph->ttl--;
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ if (entry->ctrl_info.tag != 0) {
+ ddprintk("(%s) adding tag 0x%x\n",
+ mpc->dev->name, entry->ctrl_info.tag);
+ tagged_llc_snap_hdr.tag = entry->ctrl_info.tag;
+ skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
+ skb_push(skb, sizeof(tagged_llc_snap_hdr));
+ /* add LLC/SNAP header */
+ skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr,
+ sizeof(tagged_llc_snap_hdr));
+ } else {
+ skb_pull(skb, ETH_HLEN); /* get rid of Eth header */
+ skb_push(skb, sizeof(struct llc_snap_hdr));
+ /* add LLC/SNAP header + tag */
+ skb_copy_to_linear_data(skb, &llc_snap_mpoa_data,
+ sizeof(struct llc_snap_hdr));
+ }
+
+ atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc);
+ ATM_SKB(skb)->atm_options = entry->shortcut->atm_options;
+ entry->shortcut->send(entry->shortcut, skb);
+ entry->packets_fwded++;
+ mpc->in_ops->put(entry);
+
+ return 0;
+}
+
+/*
+ * Probably needs some error checks and locking, not sure...
+ */
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct mpoa_client *mpc;
+ struct ethhdr *eth;
+ int i = 0;
+
+ mpc = find_mpc_by_lec(dev); /* this should NEVER fail */
+ if (mpc == NULL) {
+ pr_info("(%s) no MPC found\n", dev->name);
+ goto non_ip;
+ }
+
+ eth = (struct ethhdr *)skb->data;
+ if (eth->h_proto != htons(ETH_P_IP))
+ goto non_ip; /* Multi-Protocol Over ATM :-) */
+
+ /* Weed out funny packets (e.g., AF_PACKET or raw). */
+ if (skb->len < ETH_HLEN + sizeof(struct iphdr))
+ goto non_ip;
+ skb_set_network_header(skb, ETH_HLEN);
+ if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5)
+ goto non_ip;
+
+ while (i < mpc->number_of_mps_macs) {
+ if (ether_addr_equal(eth->h_dest, mpc->mps_macs + i * ETH_ALEN))
+ if (send_via_shortcut(skb, mpc) == 0) /* try shortcut */
+ return NETDEV_TX_OK;
+ i++;
+ }
+
+non_ip:
+ return __netdev_start_xmit(mpc->old_ops, skb, dev, false);
+}
+
+static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
+{
+ int bytes_left;
+ struct mpoa_client *mpc;
+ struct atmmpc_ioc ioc_data;
+ in_cache_entry *in_entry;
+ __be32 ipaddr;
+
+ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc));
+ if (bytes_left != 0) {
+ pr_info("mpoa:Short read (missed %d bytes) from userland\n",
+ bytes_left);
+ return -EFAULT;
+ }
+ ipaddr = ioc_data.ipaddr;
+ if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
+ return -EINVAL;
+
+ mpc = find_mpc_by_itfnum(ioc_data.dev_num);
+ if (mpc == NULL)
+ return -EINVAL;
+
+ if (ioc_data.type == MPC_SOCKET_INGRESS) {
+ in_entry = mpc->in_ops->get(ipaddr, mpc);
+ if (in_entry == NULL ||
+ in_entry->entry_state < INGRESS_RESOLVED) {
+ pr_info("(%s) did not find RESOLVED entry from ingress cache\n",
+ mpc->dev->name);
+ if (in_entry != NULL)
+ mpc->in_ops->put(in_entry);
+ return -EINVAL;
+ }
+ pr_info("(%s) attaching ingress SVC, entry = %pI4\n",
+ mpc->dev->name, &in_entry->ctrl_info.in_dst_ip);
+ in_entry->shortcut = vcc;
+ mpc->in_ops->put(in_entry);
+ } else {
+ pr_info("(%s) attaching egress SVC\n", mpc->dev->name);
+ }
+
+ vcc->proto_data = mpc->dev;
+ vcc->push = mpc_push;
+
+ return 0;
+}
+
+/*
+ *
+ */
+static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev)
+{
+ struct mpoa_client *mpc;
+ in_cache_entry *in_entry;
+ eg_cache_entry *eg_entry;
+
+ mpc = find_mpc_by_lec(dev);
+ if (mpc == NULL) {
+ pr_info("(%s) close for unknown MPC\n", dev->name);
+ return;
+ }
+
+ dprintk("(%s)\n", dev->name);
+ in_entry = mpc->in_ops->get_by_vcc(vcc, mpc);
+ if (in_entry) {
+ dprintk("(%s) ingress SVC closed ip = %pI4\n",
+ mpc->dev->name, &in_entry->ctrl_info.in_dst_ip);
+ in_entry->shortcut = NULL;
+ mpc->in_ops->put(in_entry);
+ }
+ eg_entry = mpc->eg_ops->get_by_vcc(vcc, mpc);
+ if (eg_entry) {
+ dprintk("(%s) egress SVC closed\n", mpc->dev->name);
+ eg_entry->shortcut = NULL;
+ mpc->eg_ops->put(eg_entry);
+ }
+
+ if (in_entry == NULL && eg_entry == NULL)
+ dprintk("(%s) unused vcc closed\n", dev->name);
+}
+
+static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct net_device *dev = (struct net_device *)vcc->proto_data;
+ struct sk_buff *new_skb;
+ eg_cache_entry *eg;
+ struct mpoa_client *mpc;
+ __be32 tag;
+ char *tmp;
+
+ ddprintk("(%s)\n", dev->name);
+ if (skb == NULL) {
+ dprintk("(%s) null skb, closing VCC\n", dev->name);
+ mpc_vcc_close(vcc, dev);
+ return;
+ }
+
+ skb->dev = dev;
+ if (memcmp(skb->data, &llc_snap_mpoa_ctrl,
+ sizeof(struct llc_snap_hdr)) == 0) {
+ struct sock *sk = sk_atm(vcc);
+
+ dprintk("(%s) control packet arrived\n", dev->name);
+ /* Pass control packets to daemon */
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ return;
+ }
+
+ /* data coming over the shortcut */
+ atm_return(vcc, skb->truesize);
+
+ mpc = find_mpc_by_lec(dev);
+ if (mpc == NULL) {
+ pr_info("(%s) unknown MPC\n", dev->name);
+ return;
+ }
+
+ if (memcmp(skb->data, &llc_snap_mpoa_data_tagged,
+ sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */
+ ddprintk("(%s) tagged data packet arrived\n", dev->name);
+
+ } else if (memcmp(skb->data, &llc_snap_mpoa_data,
+ sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */
+ pr_info("(%s) Unsupported non-tagged data packet arrived. Purging\n",
+ dev->name);
+ dev_kfree_skb_any(skb);
+ return;
+ } else {
+ pr_info("(%s) garbage arrived, purging\n", dev->name);
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ tmp = skb->data + sizeof(struct llc_snap_hdr);
+ tag = *(__be32 *)tmp;
+
+ eg = mpc->eg_ops->get_by_tag(tag, mpc);
+ if (eg == NULL) {
+ pr_info("mpoa: (%s) Didn't find egress cache entry, tag = %u\n",
+ dev->name, tag);
+ purge_egress_shortcut(vcc, NULL);
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ /*
+ * See if ingress MPC is using shortcut we opened as a return channel.
+ * This means we have a bi-directional vcc opened by us.
+ */
+ if (eg->shortcut == NULL) {
+ eg->shortcut = vcc;
+ pr_info("(%s) egress SVC in use\n", dev->name);
+ }
+
+ skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag));
+ /* get rid of LLC/SNAP header */
+ new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length);
+ /* LLC/SNAP is shorter than MAC header :( */
+ dev_kfree_skb_any(skb);
+ if (new_skb == NULL) {
+ mpc->eg_ops->put(eg);
+ return;
+ }
+ skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */
+ skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header,
+ eg->ctrl_info.DH_length);
+ new_skb->protocol = eth_type_trans(new_skb, dev);
+ skb_reset_network_header(new_skb);
+
+ eg->latest_ip_addr = ip_hdr(new_skb)->saddr;
+ eg->packets_rcvd++;
+ mpc->eg_ops->put(eg);
+
+ memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
+ netif_rx(new_skb);
+}
+
+static struct atmdev_ops mpc_ops = { /* only send is required */
+ .close = mpoad_close,
+ .send = msg_from_mpoad
+};
+
+static struct atm_dev mpc_dev = {
+ .ops = &mpc_ops,
+ .type = "mpc",
+ .number = 42,
+ .lock = __SPIN_LOCK_UNLOCKED(mpc_dev.lock)
+ /* members not explicitly initialised will be 0 */
+};
+
+static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg)
+{
+ struct mpoa_client *mpc;
+ struct lec_priv *priv;
+ int err;
+
+ if (mpcs == NULL) {
+ init_timer(&mpc_timer);
+ mpc_timer_refresh();
+
+ /* This lets us now how our LECs are doing */
+ err = register_netdevice_notifier(&mpoa_notifier);
+ if (err < 0) {
+ del_timer(&mpc_timer);
+ return err;
+ }
+ }
+
+ mpc = find_mpc_by_itfnum(arg);
+ if (mpc == NULL) {
+ dprintk("allocating new mpc for itf %d\n", arg);
+ mpc = alloc_mpc();
+ if (mpc == NULL)
+ return -ENOMEM;
+ mpc->dev_num = arg;
+ mpc->dev = find_lec_by_itfnum(arg);
+ /* NULL if there was no lec */
+ }
+ if (mpc->mpoad_vcc) {
+ pr_info("mpoad is already present for itf %d\n", arg);
+ return -EADDRINUSE;
+ }
+
+ if (mpc->dev) { /* check if the lec is LANE2 capable */
+ priv = netdev_priv(mpc->dev);
+ if (priv->lane_version < 2) {
+ dev_put(mpc->dev);
+ mpc->dev = NULL;
+ } else
+ priv->lane2_ops->associate_indicator = lane2_assoc_ind;
+ }
+
+ mpc->mpoad_vcc = vcc;
+ vcc->dev = &mpc_dev;
+ vcc_insert_socket(sk_atm(vcc));
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
+
+ if (mpc->dev) {
+ char empty[ATM_ESA_LEN];
+ memset(empty, 0, ATM_ESA_LEN);
+
+ start_mpc(mpc, mpc->dev);
+ /* set address if mpcd e.g. gets killed and restarted.
+ * If we do not do it now we have to wait for the next LE_ARP
+ */
+ if (memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0)
+ send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc);
+ }
+
+ __module_get(THIS_MODULE);
+ return arg;
+}
+
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc)
+{
+ struct k_message mesg;
+
+ memcpy(mpc->mps_ctrl_addr, addr, ATM_ESA_LEN);
+
+ mesg.type = SET_MPS_CTRL_ADDR;
+ memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN);
+ msg_to_mpoad(&mesg, mpc);
+}
+
+static void mpoad_close(struct atm_vcc *vcc)
+{
+ struct mpoa_client *mpc;
+ struct sk_buff *skb;
+
+ mpc = find_mpc_by_vcc(vcc);
+ if (mpc == NULL) {
+ pr_info("did not find MPC\n");
+ return;
+ }
+ if (!mpc->mpoad_vcc) {
+ pr_info("close for non-present mpoad\n");
+ return;
+ }
+
+ mpc->mpoad_vcc = NULL;
+ if (mpc->dev) {
+ struct lec_priv *priv = netdev_priv(mpc->dev);
+ priv->lane2_ops->associate_indicator = NULL;
+ stop_mpc(mpc);
+ dev_put(mpc->dev);
+ }
+
+ mpc->in_ops->destroy_cache(mpc);
+ mpc->eg_ops->destroy_cache(mpc);
+
+ while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) {
+ atm_return(vcc, skb->truesize);
+ kfree_skb(skb);
+ }
+
+ pr_info("(%s) going down\n",
+ (mpc->dev) ? mpc->dev->name : "<unknown>");
+ module_put(THIS_MODULE);
+}
+
+/*
+ *
+ */
+static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+
+ struct mpoa_client *mpc = find_mpc_by_vcc(vcc);
+ struct k_message *mesg = (struct k_message *)skb->data;
+ atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+
+ if (mpc == NULL) {
+ pr_info("no mpc found\n");
+ return 0;
+ }
+ dprintk("(%s)", mpc->dev ? mpc->dev->name : "<unknown>");
+ switch (mesg->type) {
+ case MPOA_RES_REPLY_RCVD:
+ dprintk_cont("mpoa_res_reply_rcvd\n");
+ MPOA_res_reply_rcvd(mesg, mpc);
+ break;
+ case MPOA_TRIGGER_RCVD:
+ dprintk_cont("mpoa_trigger_rcvd\n");
+ MPOA_trigger_rcvd(mesg, mpc);
+ break;
+ case INGRESS_PURGE_RCVD:
+ dprintk_cont("nhrp_purge_rcvd\n");
+ ingress_purge_rcvd(mesg, mpc);
+ break;
+ case EGRESS_PURGE_RCVD:
+ dprintk_cont("egress_purge_reply_rcvd\n");
+ egress_purge_rcvd(mesg, mpc);
+ break;
+ case MPS_DEATH:
+ dprintk_cont("mps_death\n");
+ mps_death(mesg, mpc);
+ break;
+ case CACHE_IMPOS_RCVD:
+ dprintk_cont("cache_impos_rcvd\n");
+ MPOA_cache_impos_rcvd(mesg, mpc);
+ break;
+ case SET_MPC_CTRL_ADDR:
+ dprintk_cont("set_mpc_ctrl_addr\n");
+ set_mpc_ctrl_addr_rcvd(mesg, mpc);
+ break;
+ case SET_MPS_MAC_ADDR:
+ dprintk_cont("set_mps_mac_addr\n");
+ set_mps_mac_addr_rcvd(mesg, mpc);
+ break;
+ case CLEAN_UP_AND_EXIT:
+ dprintk_cont("clean_up_and_exit\n");
+ clean_up(mesg, mpc, DIE);
+ break;
+ case RELOAD:
+ dprintk_cont("reload\n");
+ clean_up(mesg, mpc, RELOAD);
+ break;
+ case SET_MPC_PARAMS:
+ dprintk_cont("set_mpc_params\n");
+ mpc->parameters = mesg->content.params;
+ break;
+ default:
+ dprintk_cont("unknown message %d\n", mesg->type);
+ break;
+ }
+ kfree_skb(skb);
+
+ return 0;
+}
+
+/* Remember that this function may not do things that sleep */
+int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+
+ if (mpc == NULL || !mpc->mpoad_vcc) {
+ pr_info("mesg %d to a non-existent mpoad\n", mesg->type);
+ return -ENXIO;
+ }
+
+ skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+ skb_put(skb, sizeof(struct k_message));
+ skb_copy_to_linear_data(skb, mesg, sizeof(*mesg));
+ atm_force_charge(mpc->mpoad_vcc, skb->truesize);
+
+ sk = sk_atm(mpc->mpoad_vcc);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+
+ return 0;
+}
+
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct mpoa_client *mpc;
+ struct lec_priv *priv;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->name == NULL || strncmp(dev->name, "lec", 3))
+ return NOTIFY_DONE; /* we are only interested in lec:s */
+
+ switch (event) {
+ case NETDEV_REGISTER: /* a new lec device was allocated */
+ priv = netdev_priv(dev);
+ if (priv->lane_version < 2)
+ break;
+ priv->lane2_ops->associate_indicator = lane2_assoc_ind;
+ mpc = find_mpc_by_itfnum(priv->itfnum);
+ if (mpc == NULL) {
+ dprintk("allocating new mpc for %s\n", dev->name);
+ mpc = alloc_mpc();
+ if (mpc == NULL) {
+ pr_info("no new mpc");
+ break;
+ }
+ }
+ mpc->dev_num = priv->itfnum;
+ mpc->dev = dev;
+ dev_hold(dev);
+ dprintk("(%s) was initialized\n", dev->name);
+ break;
+ case NETDEV_UNREGISTER:
+ /* the lec device was deallocated */
+ mpc = find_mpc_by_lec(dev);
+ if (mpc == NULL)
+ break;
+ dprintk("device (%s) was deallocated\n", dev->name);
+ stop_mpc(mpc);
+ dev_put(mpc->dev);
+ mpc->dev = NULL;
+ break;
+ case NETDEV_UP:
+ /* the dev was ifconfig'ed up */
+ mpc = find_mpc_by_lec(dev);
+ if (mpc == NULL)
+ break;
+ if (mpc->mpoad_vcc != NULL)
+ start_mpc(mpc, dev);
+ break;
+ case NETDEV_DOWN:
+ /* the dev was ifconfig'ed down */
+ /* this means that the flow of packets from the
+ * upper layer stops
+ */
+ mpc = find_mpc_by_lec(dev);
+ if (mpc == NULL)
+ break;
+ if (mpc->mpoad_vcc != NULL)
+ stop_mpc(mpc);
+ break;
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGE:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_GOING_DOWN:
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Functions which are called after a message is received from mpcd.
+ * Msg is reused on purpose.
+ */
+
+
+static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+ __be32 dst_ip = msg->content.in_info.in_dst_ip;
+ in_cache_entry *entry;
+
+ entry = mpc->in_ops->get(dst_ip, mpc);
+ if (entry == NULL) {
+ entry = mpc->in_ops->add_entry(dst_ip, mpc);
+ entry->entry_state = INGRESS_RESOLVING;
+ msg->type = SND_MPOA_RES_RQST;
+ msg->content.in_info = entry->ctrl_info;
+ msg_to_mpoad(msg, mpc);
+ do_gettimeofday(&(entry->reply_wait));
+ mpc->in_ops->put(entry);
+ return;
+ }
+
+ if (entry->entry_state == INGRESS_INVALID) {
+ entry->entry_state = INGRESS_RESOLVING;
+ msg->type = SND_MPOA_RES_RQST;
+ msg->content.in_info = entry->ctrl_info;
+ msg_to_mpoad(msg, mpc);
+ do_gettimeofday(&(entry->reply_wait));
+ mpc->in_ops->put(entry);
+ return;
+ }
+
+ pr_info("(%s) entry already in resolving state\n",
+ (mpc->dev) ? mpc->dev->name : "<unknown>");
+ mpc->in_ops->put(entry);
+}
+
+/*
+ * Things get complicated because we have to check if there's an egress
+ * shortcut with suitable traffic parameters we could use.
+ */
+static void check_qos_and_open_shortcut(struct k_message *msg,
+ struct mpoa_client *client,
+ in_cache_entry *entry)
+{
+ __be32 dst_ip = msg->content.in_info.in_dst_ip;
+ struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip);
+ eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client);
+
+ if (eg_entry && eg_entry->shortcut) {
+ if (eg_entry->shortcut->qos.txtp.traffic_class &
+ msg->qos.txtp.traffic_class &
+ (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)) {
+ if (eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR)
+ entry->shortcut = eg_entry->shortcut;
+ else if (eg_entry->shortcut->qos.txtp.max_pcr > 0)
+ entry->shortcut = eg_entry->shortcut;
+ }
+ if (entry->shortcut) {
+ dprintk("(%s) using egress SVC to reach %pI4\n",
+ client->dev->name, &dst_ip);
+ client->eg_ops->put(eg_entry);
+ return;
+ }
+ }
+ if (eg_entry != NULL)
+ client->eg_ops->put(eg_entry);
+
+ /* No luck in the egress cache we must open an ingress SVC */
+ msg->type = OPEN_INGRESS_SVC;
+ if (qos &&
+ (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) {
+ msg->qos = qos->qos;
+ pr_info("(%s) trying to get a CBR shortcut\n",
+ client->dev->name);
+ } else
+ memset(&msg->qos, 0, sizeof(struct atm_qos));
+ msg_to_mpoad(msg, client);
+}
+
+static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+ __be32 dst_ip = msg->content.in_info.in_dst_ip;
+ in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc);
+
+ dprintk("(%s) ip %pI4\n",
+ mpc->dev->name, &dst_ip);
+ ddprintk("(%s) entry = %p",
+ mpc->dev->name, entry);
+ if (entry == NULL) {
+ pr_info("(%s) ARGH, received res. reply for an entry that doesn't exist.\n",
+ mpc->dev->name);
+ return;
+ }
+ ddprintk_cont(" entry_state = %d ", entry->entry_state);
+
+ if (entry->entry_state == INGRESS_RESOLVED) {
+ pr_info("(%s) RESOLVED entry!\n", mpc->dev->name);
+ mpc->in_ops->put(entry);
+ return;
+ }
+
+ entry->ctrl_info = msg->content.in_info;
+ do_gettimeofday(&(entry->tv));
+ do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+ entry->refresh_time = 0;
+ ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
+
+ if (entry->entry_state == INGRESS_RESOLVING &&
+ entry->shortcut != NULL) {
+ entry->entry_state = INGRESS_RESOLVED;
+ mpc->in_ops->put(entry);
+ return; /* Shortcut already open... */
+ }
+
+ if (entry->shortcut != NULL) {
+ pr_info("(%s) entry->shortcut != NULL, impossible!\n",
+ mpc->dev->name);
+ mpc->in_ops->put(entry);
+ return;
+ }
+
+ check_qos_and_open_shortcut(msg, mpc, entry);
+ entry->entry_state = INGRESS_RESOLVED;
+ mpc->in_ops->put(entry);
+
+ return;
+
+}
+
+static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+ __be32 dst_ip = msg->content.in_info.in_dst_ip;
+ __be32 mask = msg->ip_mask;
+ in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
+
+ if (entry == NULL) {
+ pr_info("(%s) purge for a non-existing entry, ip = %pI4\n",
+ mpc->dev->name, &dst_ip);
+ return;
+ }
+
+ do {
+ dprintk("(%s) removing an ingress entry, ip = %pI4\n",
+ mpc->dev->name, &dst_ip);
+ write_lock_bh(&mpc->ingress_lock);
+ mpc->in_ops->remove_entry(entry, mpc);
+ write_unlock_bh(&mpc->ingress_lock);
+ mpc->in_ops->put(entry);
+ entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
+ } while (entry != NULL);
+}
+
+static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+ __be32 cache_id = msg->content.eg_info.cache_id;
+ eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc);
+
+ if (entry == NULL) {
+ dprintk("(%s) purge for a non-existing entry\n",
+ mpc->dev->name);
+ return;
+ }
+
+ write_lock_irq(&mpc->egress_lock);
+ mpc->eg_ops->remove_entry(entry, mpc);
+ write_unlock_irq(&mpc->egress_lock);
+
+ mpc->eg_ops->put(entry);
+}
+
+static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
+{
+ struct sock *sk;
+ struct k_message *purge_msg;
+ struct sk_buff *skb;
+
+ dprintk("entering\n");
+ if (vcc == NULL) {
+ pr_info("vcc == NULL\n");
+ return;
+ }
+
+ skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC);
+ if (skb == NULL) {
+ pr_info("out of memory\n");
+ return;
+ }
+
+ skb_put(skb, sizeof(struct k_message));
+ memset(skb->data, 0, sizeof(struct k_message));
+ purge_msg = (struct k_message *)skb->data;
+ purge_msg->type = DATA_PLANE_PURGE;
+ if (entry != NULL)
+ purge_msg->content.eg_info = entry->ctrl_info;
+
+ atm_force_charge(vcc, skb->truesize);
+
+ sk = sk_atm(vcc);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ dprintk("exiting\n");
+}
+
+/*
+ * Our MPS died. Tell our daemon to send NHRP data plane purge to each
+ * of the egress shortcuts we have.
+ */
+static void mps_death(struct k_message *msg, struct mpoa_client *mpc)
+{
+ eg_cache_entry *entry;
+
+ dprintk("(%s)\n", mpc->dev->name);
+
+ if (memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)) {
+ pr_info("(%s) wrong MPS\n", mpc->dev->name);
+ return;
+ }
+
+ /* FIXME: This knows too much of the cache structure */
+ read_lock_irq(&mpc->egress_lock);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ purge_egress_shortcut(entry->shortcut, entry);
+ entry = entry->next;
+ }
+ read_unlock_irq(&mpc->egress_lock);
+
+ mpc->in_ops->destroy_cache(mpc);
+ mpc->eg_ops->destroy_cache(mpc);
+}
+
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+ struct mpoa_client *mpc)
+{
+ uint16_t holding_time;
+ eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc);
+
+ holding_time = msg->content.eg_info.holding_time;
+ dprintk("(%s) entry = %p, holding_time = %u\n",
+ mpc->dev->name, entry, holding_time);
+ if (entry == NULL && holding_time) {
+ entry = mpc->eg_ops->add_entry(msg, mpc);
+ mpc->eg_ops->put(entry);
+ return;
+ }
+ if (holding_time) {
+ mpc->eg_ops->update(entry, holding_time);
+ return;
+ }
+
+ write_lock_irq(&mpc->egress_lock);
+ mpc->eg_ops->remove_entry(entry, mpc);
+ write_unlock_irq(&mpc->egress_lock);
+
+ mpc->eg_ops->put(entry);
+}
+
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+ struct mpoa_client *mpc)
+{
+ struct lec_priv *priv;
+ int i, retval ;
+
+ uint8_t tlv[4 + 1 + 1 + 1 + ATM_ESA_LEN];
+
+ tlv[0] = 00; tlv[1] = 0xa0; tlv[2] = 0x3e; tlv[3] = 0x2a; /* type */
+ tlv[4] = 1 + 1 + ATM_ESA_LEN; /* length */
+ tlv[5] = 0x02; /* MPOA client */
+ tlv[6] = 0x00; /* number of MPS MAC addresses */
+
+ memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN); /* MPC ctrl ATM addr */
+ memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN);
+
+ dprintk("(%s) setting MPC ctrl ATM address to",
+ mpc->dev ? mpc->dev->name : "<unknown>");
+ for (i = 7; i < sizeof(tlv); i++)
+ dprintk_cont(" %02x", tlv[i]);
+ dprintk_cont("\n");
+
+ if (mpc->dev) {
+ priv = netdev_priv(mpc->dev);
+ retval = priv->lane2_ops->associate_req(mpc->dev,
+ mpc->dev->dev_addr,
+ tlv, sizeof(tlv));
+ if (retval == 0)
+ pr_info("(%s) MPOA device type TLV association failed\n",
+ mpc->dev->name);
+ retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL);
+ if (retval < 0)
+ pr_info("(%s) targetless LE_ARP request failed\n",
+ mpc->dev->name);
+ }
+}
+
+static void set_mps_mac_addr_rcvd(struct k_message *msg,
+ struct mpoa_client *client)
+{
+
+ if (client->number_of_mps_macs)
+ kfree(client->mps_macs);
+ client->number_of_mps_macs = 0;
+ client->mps_macs = kmemdup(msg->MPS_ctrl, ETH_ALEN, GFP_KERNEL);
+ if (client->mps_macs == NULL) {
+ pr_info("out of memory\n");
+ return;
+ }
+ client->number_of_mps_macs = 1;
+}
+
+/*
+ * purge egress cache and tell daemon to 'action' (DIE, RELOAD)
+ */
+static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action)
+{
+
+ eg_cache_entry *entry;
+ msg->type = SND_EGRESS_PURGE;
+
+
+ /* FIXME: This knows too much of the cache structure */
+ read_lock_irq(&mpc->egress_lock);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ msg->content.eg_info = entry->ctrl_info;
+ dprintk("cache_id %u\n", entry->ctrl_info.cache_id);
+ msg_to_mpoad(msg, mpc);
+ entry = entry->next;
+ }
+ read_unlock_irq(&mpc->egress_lock);
+
+ msg->type = action;
+ msg_to_mpoad(msg, mpc);
+}
+
+static void mpc_timer_refresh(void)
+{
+ mpc_timer.expires = jiffies + (MPC_P2 * HZ);
+ mpc_timer.data = mpc_timer.expires;
+ mpc_timer.function = mpc_cache_check;
+ add_timer(&mpc_timer);
+}
+
+static void mpc_cache_check(unsigned long checking_time)
+{
+ struct mpoa_client *mpc = mpcs;
+ static unsigned long previous_resolving_check_time;
+ static unsigned long previous_refresh_time;
+
+ while (mpc != NULL) {
+ mpc->in_ops->clear_count(mpc);
+ mpc->eg_ops->clear_expired(mpc);
+ if (checking_time - previous_resolving_check_time >
+ mpc->parameters.mpc_p4 * HZ) {
+ mpc->in_ops->check_resolving(mpc);
+ previous_resolving_check_time = checking_time;
+ }
+ if (checking_time - previous_refresh_time >
+ mpc->parameters.mpc_p5 * HZ) {
+ mpc->in_ops->refresh(mpc);
+ previous_refresh_time = checking_time;
+ }
+ mpc = mpc->next;
+ }
+ mpc_timer_refresh();
+}
+
+static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ int err = 0;
+ struct atm_vcc *vcc = ATM_SD(sock);
+
+ if (cmd != ATMMPC_CTRL && cmd != ATMMPC_DATA)
+ return -ENOIOCTLCMD;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ATMMPC_CTRL:
+ err = atm_mpoa_mpoad_attach(vcc, (int)arg);
+ if (err >= 0)
+ sock->state = SS_CONNECTED;
+ break;
+ case ATMMPC_DATA:
+ err = atm_mpoa_vcc_attach(vcc, (void __user *)arg);
+ break;
+ default:
+ break;
+ }
+ return err;
+}
+
+static struct atm_ioctl atm_ioctl_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = atm_mpoa_ioctl,
+};
+
+static __init int atm_mpoa_init(void)
+{
+ register_atm_ioctl(&atm_ioctl_ops);
+
+ if (mpc_proc_init() != 0)
+ pr_info("failed to initialize /proc/mpoa\n");
+
+ pr_info("mpc.c: initialized\n");
+
+ return 0;
+}
+
+static void __exit atm_mpoa_cleanup(void)
+{
+ struct mpoa_client *mpc, *tmp;
+ struct atm_mpoa_qos *qos, *nextqos;
+ struct lec_priv *priv;
+
+ mpc_proc_clean();
+
+ del_timer_sync(&mpc_timer);
+ unregister_netdevice_notifier(&mpoa_notifier);
+ deregister_atm_ioctl(&atm_ioctl_ops);
+
+ mpc = mpcs;
+ mpcs = NULL;
+ while (mpc != NULL) {
+ tmp = mpc->next;
+ if (mpc->dev != NULL) {
+ stop_mpc(mpc);
+ priv = netdev_priv(mpc->dev);
+ if (priv->lane2_ops != NULL)
+ priv->lane2_ops->associate_indicator = NULL;
+ }
+ ddprintk("about to clear caches\n");
+ mpc->in_ops->destroy_cache(mpc);
+ mpc->eg_ops->destroy_cache(mpc);
+ ddprintk("caches cleared\n");
+ kfree(mpc->mps_macs);
+ memset(mpc, 0, sizeof(struct mpoa_client));
+ ddprintk("about to kfree %p\n", mpc);
+ kfree(mpc);
+ ddprintk("next mpc is at %p\n", tmp);
+ mpc = tmp;
+ }
+
+ qos = qos_head;
+ qos_head = NULL;
+ while (qos != NULL) {
+ nextqos = qos->next;
+ dprintk("freeing qos entry %p\n", qos);
+ kfree(qos);
+ qos = nextqos;
+ }
+}
+
+module_init(atm_mpoa_init);
+module_exit(atm_mpoa_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
new file mode 100644
index 000000000..0919a88bb
--- /dev/null
+++ b/net/atm/mpc.h
@@ -0,0 +1,64 @@
+#ifndef _MPC_H_
+#define _MPC_H_
+
+#include <linux/types.h>
+#include <linux/atm.h>
+#include <linux/atmmpc.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include "mpoa_caches.h"
+
+/* kernel -> mpc-daemon */
+int msg_to_mpoad(struct k_message *msg, struct mpoa_client *mpc);
+
+struct mpoa_client {
+ struct mpoa_client *next;
+ struct net_device *dev; /* lec in question */
+ int dev_num; /* e.g. 2 for lec2 */
+
+ struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
+ uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
+ uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
+
+ rwlock_t ingress_lock;
+ struct in_cache_ops *in_ops; /* ingress cache operations */
+ in_cache_entry *in_cache; /* the ingress cache of this MPC */
+
+ rwlock_t egress_lock;
+ struct eg_cache_ops *eg_ops; /* egress cache operations */
+ eg_cache_entry *eg_cache; /* the egress cache of this MPC */
+
+ uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
+ int number_of_mps_macs; /* number of the above MAC addresses */
+ struct mpc_parameters parameters; /* parameters for this client */
+
+ const struct net_device_ops *old_ops;
+ struct net_device_ops new_ops;
+};
+
+
+struct atm_mpoa_qos {
+ struct atm_mpoa_qos *next;
+ __be32 ipaddr;
+ struct atm_qos qos;
+};
+
+
+/* MPOA QoS operations */
+struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos);
+struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip);
+int atm_mpoa_delete_qos(struct atm_mpoa_qos *qos);
+
+/* Display QoS entries. This is for the procfs */
+struct seq_file;
+void atm_mpoa_disp_qos(struct seq_file *m);
+
+#ifdef CONFIG_PROC_FS
+int mpc_proc_init(void);
+void mpc_proc_clean(void);
+#else
+#define mpc_proc_init() (0)
+#define mpc_proc_clean() do { } while(0)
+#endif
+
+#endif /* _MPC_H_ */
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
new file mode 100644
index 000000000..d1b2d9a03
--- /dev/null
+++ b/net/atm/mpoa_caches.c
@@ -0,0 +1,569 @@
+#include <linux/types.h>
+#include <linux/atmmpc.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include "mpoa_caches.h"
+#include "mpc.h"
+
+/*
+ * mpoa_caches.c: Implementation of ingress and egress cache
+ * handling functions
+ */
+
+#if 0
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
+#else
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
+#endif
+
+#if 0
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
+#else
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
+#endif
+
+static in_cache_entry *in_cache_get(__be32 dst_ip,
+ struct mpoa_client *client)
+{
+ in_cache_entry *entry;
+
+ read_lock_bh(&client->ingress_lock);
+ entry = client->in_cache;
+ while (entry != NULL) {
+ if (entry->ctrl_info.in_dst_ip == dst_ip) {
+ atomic_inc(&entry->use);
+ read_unlock_bh(&client->ingress_lock);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_bh(&client->ingress_lock);
+
+ return NULL;
+}
+
+static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip,
+ struct mpoa_client *client,
+ __be32 mask)
+{
+ in_cache_entry *entry;
+
+ read_lock_bh(&client->ingress_lock);
+ entry = client->in_cache;
+ while (entry != NULL) {
+ if ((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask)) {
+ atomic_inc(&entry->use);
+ read_unlock_bh(&client->ingress_lock);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_bh(&client->ingress_lock);
+
+ return NULL;
+
+}
+
+static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc,
+ struct mpoa_client *client)
+{
+ in_cache_entry *entry;
+
+ read_lock_bh(&client->ingress_lock);
+ entry = client->in_cache;
+ while (entry != NULL) {
+ if (entry->shortcut == vcc) {
+ atomic_inc(&entry->use);
+ read_unlock_bh(&client->ingress_lock);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_bh(&client->ingress_lock);
+
+ return NULL;
+}
+
+static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
+ struct mpoa_client *client)
+{
+ in_cache_entry *entry = kzalloc(sizeof(in_cache_entry), GFP_KERNEL);
+
+ if (entry == NULL) {
+ pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n");
+ return NULL;
+ }
+
+ dprintk("adding an ingress entry, ip = %pI4\n", &dst_ip);
+
+ atomic_set(&entry->use, 1);
+ dprintk("new_in_cache_entry: about to lock\n");
+ write_lock_bh(&client->ingress_lock);
+ entry->next = client->in_cache;
+ entry->prev = NULL;
+ if (client->in_cache != NULL)
+ client->in_cache->prev = entry;
+ client->in_cache = entry;
+
+ memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
+ entry->ctrl_info.in_dst_ip = dst_ip;
+ do_gettimeofday(&(entry->tv));
+ entry->retry_time = client->parameters.mpc_p4;
+ entry->count = 1;
+ entry->entry_state = INGRESS_INVALID;
+ entry->ctrl_info.holding_time = HOLDING_TIME_DEFAULT;
+ atomic_inc(&entry->use);
+
+ write_unlock_bh(&client->ingress_lock);
+ dprintk("new_in_cache_entry: unlocked\n");
+
+ return entry;
+}
+
+static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
+{
+ struct atm_mpoa_qos *qos;
+ struct k_message msg;
+
+ entry->count++;
+ if (entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL)
+ return OPEN;
+
+ if (entry->entry_state == INGRESS_REFRESHING) {
+ if (entry->count > mpc->parameters.mpc_p1) {
+ msg.type = SND_MPOA_RES_RQST;
+ msg.content.in_info = entry->ctrl_info;
+ memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN);
+ qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
+ if (qos != NULL)
+ msg.qos = qos->qos;
+ msg_to_mpoad(&msg, mpc);
+ do_gettimeofday(&(entry->reply_wait));
+ entry->entry_state = INGRESS_RESOLVING;
+ }
+ if (entry->shortcut != NULL)
+ return OPEN;
+ return CLOSED;
+ }
+
+ if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL)
+ return OPEN;
+
+ if (entry->count > mpc->parameters.mpc_p1 &&
+ entry->entry_state == INGRESS_INVALID) {
+ dprintk("(%s) threshold exceeded for ip %pI4, sending MPOA res req\n",
+ mpc->dev->name, &entry->ctrl_info.in_dst_ip);
+ entry->entry_state = INGRESS_RESOLVING;
+ msg.type = SND_MPOA_RES_RQST;
+ memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN);
+ msg.content.in_info = entry->ctrl_info;
+ qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
+ if (qos != NULL)
+ msg.qos = qos->qos;
+ msg_to_mpoad(&msg, mpc);
+ do_gettimeofday(&(entry->reply_wait));
+ }
+
+ return CLOSED;
+}
+
+static void in_cache_put(in_cache_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->use)) {
+ memset(entry, 0, sizeof(in_cache_entry));
+ kfree(entry);
+ }
+}
+
+/*
+ * This should be called with write lock on
+ */
+static void in_cache_remove_entry(in_cache_entry *entry,
+ struct mpoa_client *client)
+{
+ struct atm_vcc *vcc;
+ struct k_message msg;
+
+ vcc = entry->shortcut;
+ dprintk("removing an ingress entry, ip = %pI4\n",
+ &entry->ctrl_info.in_dst_ip);
+
+ if (entry->prev != NULL)
+ entry->prev->next = entry->next;
+ else
+ client->in_cache = entry->next;
+ if (entry->next != NULL)
+ entry->next->prev = entry->prev;
+ client->in_ops->put(entry);
+ if (client->in_cache == NULL && client->eg_cache == NULL) {
+ msg.type = STOP_KEEP_ALIVE_SM;
+ msg_to_mpoad(&msg, client);
+ }
+
+ /* Check if the egress side still uses this VCC */
+ if (vcc != NULL) {
+ eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc,
+ client);
+ if (eg_entry != NULL) {
+ client->eg_ops->put(eg_entry);
+ return;
+ }
+ vcc_release_async(vcc, -EPIPE);
+ }
+}
+
+/* Call this every MPC-p2 seconds... Not exactly correct solution,
+ but an easy one... */
+static void clear_count_and_expired(struct mpoa_client *client)
+{
+ in_cache_entry *entry, *next_entry;
+ struct timeval now;
+
+ do_gettimeofday(&now);
+
+ write_lock_bh(&client->ingress_lock);
+ entry = client->in_cache;
+ while (entry != NULL) {
+ entry->count = 0;
+ next_entry = entry->next;
+ if ((now.tv_sec - entry->tv.tv_sec)
+ > entry->ctrl_info.holding_time) {
+ dprintk("holding time expired, ip = %pI4\n",
+ &entry->ctrl_info.in_dst_ip);
+ client->in_ops->remove_entry(entry, client);
+ }
+ entry = next_entry;
+ }
+ write_unlock_bh(&client->ingress_lock);
+}
+
+/* Call this every MPC-p4 seconds. */
+static void check_resolving_entries(struct mpoa_client *client)
+{
+
+ struct atm_mpoa_qos *qos;
+ in_cache_entry *entry;
+ struct timeval now;
+ struct k_message msg;
+
+ do_gettimeofday(&now);
+
+ read_lock_bh(&client->ingress_lock);
+ entry = client->in_cache;
+ while (entry != NULL) {
+ if (entry->entry_state == INGRESS_RESOLVING) {
+ if ((now.tv_sec - entry->hold_down.tv_sec) <
+ client->parameters.mpc_p6) {
+ entry = entry->next; /* Entry in hold down */
+ continue;
+ }
+ if ((now.tv_sec - entry->reply_wait.tv_sec) >
+ entry->retry_time) {
+ entry->retry_time = MPC_C1 * (entry->retry_time);
+ /*
+ * Retry time maximum exceeded,
+ * put entry in hold down.
+ */
+ if (entry->retry_time > client->parameters.mpc_p5) {
+ do_gettimeofday(&(entry->hold_down));
+ entry->retry_time = client->parameters.mpc_p4;
+ entry = entry->next;
+ continue;
+ }
+ /* Ask daemon to send a resolution request. */
+ memset(&(entry->hold_down), 0, sizeof(struct timeval));
+ msg.type = SND_MPOA_RES_RTRY;
+ memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
+ msg.content.in_info = entry->ctrl_info;
+ qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
+ if (qos != NULL)
+ msg.qos = qos->qos;
+ msg_to_mpoad(&msg, client);
+ do_gettimeofday(&(entry->reply_wait));
+ }
+ }
+ entry = entry->next;
+ }
+ read_unlock_bh(&client->ingress_lock);
+}
+
+/* Call this every MPC-p5 seconds. */
+static void refresh_entries(struct mpoa_client *client)
+{
+ struct timeval now;
+ struct in_cache_entry *entry = client->in_cache;
+
+ ddprintk("refresh_entries\n");
+ do_gettimeofday(&now);
+
+ read_lock_bh(&client->ingress_lock);
+ while (entry != NULL) {
+ if (entry->entry_state == INGRESS_RESOLVED) {
+ if (!(entry->refresh_time))
+ entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
+ if ((now.tv_sec - entry->reply_wait.tv_sec) >
+ entry->refresh_time) {
+ dprintk("refreshing an entry.\n");
+ entry->entry_state = INGRESS_REFRESHING;
+
+ }
+ }
+ entry = entry->next;
+ }
+ read_unlock_bh(&client->ingress_lock);
+}
+
+static void in_destroy_cache(struct mpoa_client *mpc)
+{
+ write_lock_irq(&mpc->ingress_lock);
+ while (mpc->in_cache != NULL)
+ mpc->in_ops->remove_entry(mpc->in_cache, mpc);
+ write_unlock_irq(&mpc->ingress_lock);
+}
+
+static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id,
+ struct mpoa_client *mpc)
+{
+ eg_cache_entry *entry;
+
+ read_lock_irq(&mpc->egress_lock);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ if (entry->ctrl_info.cache_id == cache_id) {
+ atomic_inc(&entry->use);
+ read_unlock_irq(&mpc->egress_lock);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_irq(&mpc->egress_lock);
+
+ return NULL;
+}
+
+/* This can be called from any context since it saves CPU flags */
+static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc)
+{
+ unsigned long flags;
+ eg_cache_entry *entry;
+
+ read_lock_irqsave(&mpc->egress_lock, flags);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ if (entry->ctrl_info.tag == tag) {
+ atomic_inc(&entry->use);
+ read_unlock_irqrestore(&mpc->egress_lock, flags);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_irqrestore(&mpc->egress_lock, flags);
+
+ return NULL;
+}
+
+/* This can be called from any context since it saves CPU flags */
+static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc,
+ struct mpoa_client *mpc)
+{
+ unsigned long flags;
+ eg_cache_entry *entry;
+
+ read_lock_irqsave(&mpc->egress_lock, flags);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ if (entry->shortcut == vcc) {
+ atomic_inc(&entry->use);
+ read_unlock_irqrestore(&mpc->egress_lock, flags);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_irqrestore(&mpc->egress_lock, flags);
+
+ return NULL;
+}
+
+static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
+ struct mpoa_client *mpc)
+{
+ eg_cache_entry *entry;
+
+ read_lock_irq(&mpc->egress_lock);
+ entry = mpc->eg_cache;
+ while (entry != NULL) {
+ if (entry->latest_ip_addr == ipaddr) {
+ atomic_inc(&entry->use);
+ read_unlock_irq(&mpc->egress_lock);
+ return entry;
+ }
+ entry = entry->next;
+ }
+ read_unlock_irq(&mpc->egress_lock);
+
+ return NULL;
+}
+
+static void eg_cache_put(eg_cache_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->use)) {
+ memset(entry, 0, sizeof(eg_cache_entry));
+ kfree(entry);
+ }
+}
+
+/*
+ * This should be called with write lock on
+ */
+static void eg_cache_remove_entry(eg_cache_entry *entry,
+ struct mpoa_client *client)
+{
+ struct atm_vcc *vcc;
+ struct k_message msg;
+
+ vcc = entry->shortcut;
+ dprintk("removing an egress entry.\n");
+ if (entry->prev != NULL)
+ entry->prev->next = entry->next;
+ else
+ client->eg_cache = entry->next;
+ if (entry->next != NULL)
+ entry->next->prev = entry->prev;
+ client->eg_ops->put(entry);
+ if (client->in_cache == NULL && client->eg_cache == NULL) {
+ msg.type = STOP_KEEP_ALIVE_SM;
+ msg_to_mpoad(&msg, client);
+ }
+
+ /* Check if the ingress side still uses this VCC */
+ if (vcc != NULL) {
+ in_cache_entry *in_entry = client->in_ops->get_by_vcc(vcc, client);
+ if (in_entry != NULL) {
+ client->in_ops->put(in_entry);
+ return;
+ }
+ vcc_release_async(vcc, -EPIPE);
+ }
+}
+
+static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
+ struct mpoa_client *client)
+{
+ eg_cache_entry *entry = kzalloc(sizeof(eg_cache_entry), GFP_KERNEL);
+
+ if (entry == NULL) {
+ pr_info("out of memory\n");
+ return NULL;
+ }
+
+ dprintk("adding an egress entry, ip = %pI4, this should be our IP\n",
+ &msg->content.eg_info.eg_dst_ip);
+
+ atomic_set(&entry->use, 1);
+ dprintk("new_eg_cache_entry: about to lock\n");
+ write_lock_irq(&client->egress_lock);
+ entry->next = client->eg_cache;
+ entry->prev = NULL;
+ if (client->eg_cache != NULL)
+ client->eg_cache->prev = entry;
+ client->eg_cache = entry;
+
+ memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
+ entry->ctrl_info = msg->content.eg_info;
+ do_gettimeofday(&(entry->tv));
+ entry->entry_state = EGRESS_RESOLVED;
+ dprintk("new_eg_cache_entry cache_id %u\n",
+ ntohl(entry->ctrl_info.cache_id));
+ dprintk("mps_ip = %pI4\n", &entry->ctrl_info.mps_ip);
+ atomic_inc(&entry->use);
+
+ write_unlock_irq(&client->egress_lock);
+ dprintk("new_eg_cache_entry: unlocked\n");
+
+ return entry;
+}
+
+static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
+{
+ do_gettimeofday(&(entry->tv));
+ entry->entry_state = EGRESS_RESOLVED;
+ entry->ctrl_info.holding_time = holding_time;
+}
+
+static void clear_expired(struct mpoa_client *client)
+{
+ eg_cache_entry *entry, *next_entry;
+ struct timeval now;
+ struct k_message msg;
+
+ do_gettimeofday(&now);
+
+ write_lock_irq(&client->egress_lock);
+ entry = client->eg_cache;
+ while (entry != NULL) {
+ next_entry = entry->next;
+ if ((now.tv_sec - entry->tv.tv_sec)
+ > entry->ctrl_info.holding_time) {
+ msg.type = SND_EGRESS_PURGE;
+ msg.content.eg_info = entry->ctrl_info;
+ dprintk("egress_cache: holding time expired, cache_id = %u.\n",
+ ntohl(entry->ctrl_info.cache_id));
+ msg_to_mpoad(&msg, client);
+ client->eg_ops->remove_entry(entry, client);
+ }
+ entry = next_entry;
+ }
+ write_unlock_irq(&client->egress_lock);
+}
+
+static void eg_destroy_cache(struct mpoa_client *mpc)
+{
+ write_lock_irq(&mpc->egress_lock);
+ while (mpc->eg_cache != NULL)
+ mpc->eg_ops->remove_entry(mpc->eg_cache, mpc);
+ write_unlock_irq(&mpc->egress_lock);
+}
+
+
+static struct in_cache_ops ingress_ops = {
+ in_cache_add_entry, /* add_entry */
+ in_cache_get, /* get */
+ in_cache_get_with_mask, /* get_with_mask */
+ in_cache_get_by_vcc, /* get_by_vcc */
+ in_cache_put, /* put */
+ in_cache_remove_entry, /* remove_entry */
+ cache_hit, /* cache_hit */
+ clear_count_and_expired, /* clear_count */
+ check_resolving_entries, /* check_resolving */
+ refresh_entries, /* refresh */
+ in_destroy_cache /* destroy_cache */
+};
+
+static struct eg_cache_ops egress_ops = {
+ eg_cache_add_entry, /* add_entry */
+ eg_cache_get_by_cache_id, /* get_by_cache_id */
+ eg_cache_get_by_tag, /* get_by_tag */
+ eg_cache_get_by_vcc, /* get_by_vcc */
+ eg_cache_get_by_src_ip, /* get_by_src_ip */
+ eg_cache_put, /* put */
+ eg_cache_remove_entry, /* remove_entry */
+ update_eg_cache_entry, /* update */
+ clear_expired, /* clear_expired */
+ eg_destroy_cache /* destroy_cache */
+};
+
+
+void atm_mpoa_init_cache(struct mpoa_client *mpc)
+{
+ mpc->in_ops = &ingress_ops;
+ mpc->eg_ops = &egress_ops;
+}
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
new file mode 100644
index 000000000..8e5f78cf0
--- /dev/null
+++ b/net/atm/mpoa_caches.h
@@ -0,0 +1,96 @@
+#ifndef MPOA_CACHES_H
+#define MPOA_CACHES_H
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmmpc.h>
+
+struct mpoa_client;
+
+void atm_mpoa_init_cache(struct mpoa_client *mpc);
+
+typedef struct in_cache_entry {
+ struct in_cache_entry *next;
+ struct in_cache_entry *prev;
+ struct timeval tv;
+ struct timeval reply_wait;
+ struct timeval hold_down;
+ uint32_t packets_fwded;
+ uint16_t entry_state;
+ uint32_t retry_time;
+ uint32_t refresh_time;
+ uint32_t count;
+ struct atm_vcc *shortcut;
+ uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+ struct in_ctrl_info ctrl_info;
+ atomic_t use;
+} in_cache_entry;
+
+struct in_cache_ops{
+ in_cache_entry *(*add_entry)(__be32 dst_ip,
+ struct mpoa_client *client);
+ in_cache_entry *(*get)(__be32 dst_ip, struct mpoa_client *client);
+ in_cache_entry *(*get_with_mask)(__be32 dst_ip,
+ struct mpoa_client *client,
+ __be32 mask);
+ in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc,
+ struct mpoa_client *client);
+ void (*put)(in_cache_entry *entry);
+ void (*remove_entry)(in_cache_entry *delEntry,
+ struct mpoa_client *client );
+ int (*cache_hit)(in_cache_entry *entry,
+ struct mpoa_client *client);
+ void (*clear_count)(struct mpoa_client *client);
+ void (*check_resolving)(struct mpoa_client *client);
+ void (*refresh)(struct mpoa_client *client);
+ void (*destroy_cache)(struct mpoa_client *mpc);
+};
+
+typedef struct eg_cache_entry{
+ struct eg_cache_entry *next;
+ struct eg_cache_entry *prev;
+ struct timeval tv;
+ uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+ struct atm_vcc *shortcut;
+ uint32_t packets_rcvd;
+ uint16_t entry_state;
+ __be32 latest_ip_addr; /* The src IP address of the last packet */
+ struct eg_ctrl_info ctrl_info;
+ atomic_t use;
+} eg_cache_entry;
+
+struct eg_cache_ops{
+ eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_cache_id)(__be32 cache_id, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_tag)(__be32 cache_id, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client);
+ eg_cache_entry *(*get_by_src_ip)(__be32 ipaddr, struct mpoa_client *client);
+ void (*put)(eg_cache_entry *entry);
+ void (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client);
+ void (*update)(eg_cache_entry *entry, uint16_t holding_time);
+ void (*clear_expired)(struct mpoa_client *client);
+ void (*destroy_cache)(struct mpoa_client *mpc);
+};
+
+
+/* Ingress cache entry states */
+
+#define INGRESS_REFRESHING 3
+#define INGRESS_RESOLVED 2
+#define INGRESS_RESOLVING 1
+#define INGRESS_INVALID 0
+
+/* VCC states */
+
+#define OPEN 1
+#define CLOSED 0
+
+/* Egress cache entry states */
+
+#define EGRESS_RESOLVED 2
+#define EGRESS_PURGE 1
+#define EGRESS_INVALID 0
+
+#endif
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
new file mode 100644
index 000000000..2df34eb5d
--- /dev/null
+++ b/net/atm/mpoa_proc.c
@@ -0,0 +1,312 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#ifdef CONFIG_PROC_FS
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/atmmpc.h>
+#include <linux/atm.h>
+#include <linux/gfp.h>
+#include "mpc.h"
+#include "mpoa_caches.h"
+
+/*
+ * mpoa_proc.c: Implementation MPOA client's proc
+ * file system statistics
+ */
+
+#if 1
+#define dprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
+#else
+#define dprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
+#endif
+
+#if 0
+#define ddprintk(format, args...) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args) /* debug */
+#else
+#define ddprintk(format, args...) \
+ do { if (0) \
+ printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+ } while (0)
+#endif
+
+#define STAT_FILE_NAME "mpc" /* Our statistic file's name */
+
+extern struct mpoa_client *mpcs;
+extern struct proc_dir_entry *atm_proc_root; /* from proc.c. */
+
+static int proc_mpc_open(struct inode *inode, struct file *file);
+static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
+ size_t nbytes, loff_t *ppos);
+
+static int parse_qos(const char *buff);
+
+/*
+ * Define allowed FILE OPERATIONS
+ */
+static const struct file_operations mpc_file_operations = {
+ .owner = THIS_MODULE,
+ .open = proc_mpc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = proc_mpc_write,
+ .release = seq_release,
+};
+
+/*
+ * Returns the state of an ingress cache entry as a string
+ */
+static const char *ingress_state_string(int state)
+{
+ switch (state) {
+ case INGRESS_RESOLVING:
+ return "resolving ";
+ case INGRESS_RESOLVED:
+ return "resolved ";
+ case INGRESS_INVALID:
+ return "invalid ";
+ case INGRESS_REFRESHING:
+ return "refreshing ";
+ }
+
+ return "";
+}
+
+/*
+ * Returns the state of an egress cache entry as a string
+ */
+static const char *egress_state_string(int state)
+{
+ switch (state) {
+ case EGRESS_RESOLVED:
+ return "resolved ";
+ case EGRESS_PURGE:
+ return "purge ";
+ case EGRESS_INVALID:
+ return "invalid ";
+ }
+
+ return "";
+}
+
+/*
+ * FIXME: mpcs (and per-mpc lists) have no locking whatsoever.
+ */
+
+static void *mpc_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t l = *pos;
+ struct mpoa_client *mpc;
+
+ if (!l--)
+ return SEQ_START_TOKEN;
+ for (mpc = mpcs; mpc; mpc = mpc->next)
+ if (!l--)
+ return mpc;
+ return NULL;
+}
+
+static void *mpc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct mpoa_client *p = v;
+ (*pos)++;
+ return v == SEQ_START_TOKEN ? mpcs : p->next;
+}
+
+static void mpc_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * READING function - called when the /proc/atm/mpoa file is read from.
+ */
+static int mpc_show(struct seq_file *m, void *v)
+{
+ struct mpoa_client *mpc = v;
+ int i;
+ in_cache_entry *in_entry;
+ eg_cache_entry *eg_entry;
+ struct timeval now;
+ unsigned char ip_string[16];
+
+ if (v == SEQ_START_TOKEN) {
+ atm_mpoa_disp_qos(m);
+ return 0;
+ }
+
+ seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
+ seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n");
+ do_gettimeofday(&now);
+
+ for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
+ sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
+ seq_printf(m, "%-16s%s%-14lu%-12u",
+ ip_string,
+ ingress_state_string(in_entry->entry_state),
+ in_entry->ctrl_info.holding_time -
+ (now.tv_sec-in_entry->tv.tv_sec),
+ in_entry->packets_fwded);
+ if (in_entry->shortcut)
+ seq_printf(m, " %-3d %-3d",
+ in_entry->shortcut->vpi,
+ in_entry->shortcut->vci);
+ seq_printf(m, "\n");
+ }
+
+ seq_printf(m, "\n");
+ seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n");
+ for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
+ unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
+ for (i = 0; i < ATM_ESA_LEN; i++)
+ seq_printf(m, "%02x", p[i]);
+ seq_printf(m, "\n%-16lu%s%-14lu%-15u",
+ (unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
+ egress_state_string(eg_entry->entry_state),
+ (eg_entry->ctrl_info.holding_time -
+ (now.tv_sec-eg_entry->tv.tv_sec)),
+ eg_entry->packets_rcvd);
+
+ /* latest IP address */
+ sprintf(ip_string, "%pI4", &eg_entry->latest_ip_addr);
+ seq_printf(m, "%-16s", ip_string);
+
+ if (eg_entry->shortcut)
+ seq_printf(m, " %-3d %-3d",
+ eg_entry->shortcut->vpi,
+ eg_entry->shortcut->vci);
+ seq_printf(m, "\n");
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const struct seq_operations mpc_op = {
+ .start = mpc_start,
+ .next = mpc_next,
+ .stop = mpc_stop,
+ .show = mpc_show
+};
+
+static int proc_mpc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &mpc_op);
+}
+
+static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
+ size_t nbytes, loff_t *ppos)
+{
+ char *page, *p;
+ unsigned int len;
+
+ if (nbytes == 0)
+ return 0;
+
+ if (nbytes >= PAGE_SIZE)
+ nbytes = PAGE_SIZE-1;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ for (p = page, len = 0; len < nbytes; p++, len++) {
+ if (get_user(*p, buff++)) {
+ free_page((unsigned long)page);
+ return -EFAULT;
+ }
+ if (*p == '\0' || *p == '\n')
+ break;
+ }
+
+ *p = '\0';
+
+ if (!parse_qos(page))
+ printk("mpoa: proc_mpc_write: could not parse '%s'\n", page);
+
+ free_page((unsigned long)page);
+
+ return len;
+}
+
+static int parse_qos(const char *buff)
+{
+ /* possible lines look like this
+ * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu
+ */
+ unsigned char ip[4];
+ int tx_pcr, tx_sdu, rx_pcr, rx_sdu;
+ __be32 ipaddr;
+ struct atm_qos qos;
+
+ memset(&qos, 0, sizeof(struct atm_qos));
+
+ if (sscanf(buff, "del %hhu.%hhu.%hhu.%hhu",
+ ip, ip+1, ip+2, ip+3) == 4) {
+ ipaddr = *(__be32 *)ip;
+ return atm_mpoa_delete_qos(atm_mpoa_search_qos(ipaddr));
+ }
+
+ if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=tx",
+ ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu) == 6) {
+ rx_pcr = tx_pcr;
+ rx_sdu = tx_sdu;
+ } else if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=%d,%d",
+ ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu, &rx_pcr, &rx_sdu) != 8)
+ return 0;
+
+ ipaddr = *(__be32 *)ip;
+ qos.txtp.traffic_class = ATM_CBR;
+ qos.txtp.max_pcr = tx_pcr;
+ qos.txtp.max_sdu = tx_sdu;
+ qos.rxtp.traffic_class = ATM_CBR;
+ qos.rxtp.max_pcr = rx_pcr;
+ qos.rxtp.max_sdu = rx_sdu;
+ qos.aal = ATM_AAL5;
+ dprintk("parse_qos(): setting qos parameters to tx=%d,%d rx=%d,%d\n",
+ qos.txtp.max_pcr, qos.txtp.max_sdu,
+ qos.rxtp.max_pcr, qos.rxtp.max_sdu);
+
+ atm_mpoa_add_qos(ipaddr, &qos);
+ return 1;
+}
+
+/*
+ * INITIALIZATION function - called when module is initialized/loaded.
+ */
+int mpc_proc_init(void)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_file_operations);
+ if (!p) {
+ pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * DELETING function - called when module is removed.
+ */
+void mpc_proc_clean(void)
+{
+ remove_proc_entry(STAT_FILE_NAME, atm_proc_root);
+}
+
+#endif /* CONFIG_PROC_FS */
+
+
+
+
+
+
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
new file mode 100644
index 000000000..c4e09846d
--- /dev/null
+++ b/net/atm/pppoatm.c
@@ -0,0 +1,500 @@
+/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */
+
+/* Copyright 1999-2000 by Mitchell Blank Jr */
+/* Based on clip.c; 1995-1999 by Werner Almesberger, EPFL LRC/ICA */
+/* And on ppp_async.c; Copyright 1999 Paul Mackerras */
+/* And help from Jens Axboe */
+
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This driver provides the encapsulation and framing for sending
+ * and receiving PPP frames in ATM AAL5 PDUs.
+ */
+
+/*
+ * One shortcoming of this driver is that it does not comply with
+ * section 8 of RFC2364 - we are supposed to detect a change
+ * in encapsulation and immediately abort the connection (in order
+ * to avoid a black-hole being created if our peer loses state
+ * and changes encapsulation unilaterally. However, since the
+ * ppp_generic layer actually does the decapsulation, we need
+ * a way of notifying it when we _think_ there might be a problem)
+ * There's two cases:
+ * 1. LLC-encapsulation was missing when it was enabled. In
+ * this case, we should tell the upper layer "tear down
+ * this session if this skb looks ok to you"
+ * 2. LLC-encapsulation was present when it was disabled. Then
+ * we need to tell the upper layer "this packet may be
+ * ok, but if its in error tear down the session"
+ * These hooks are not yet available in ppp_generic
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/ppp_defs.h>
+#include <linux/ppp-ioctl.h>
+#include <linux/ppp_channel.h>
+#include <linux/atmppp.h>
+
+#include "common.h"
+
+enum pppoatm_encaps {
+ e_autodetect = PPPOATM_ENCAPS_AUTODETECT,
+ e_vc = PPPOATM_ENCAPS_VC,
+ e_llc = PPPOATM_ENCAPS_LLC,
+};
+
+struct pppoatm_vcc {
+ struct atm_vcc *atmvcc; /* VCC descriptor */
+ void (*old_push)(struct atm_vcc *, struct sk_buff *);
+ void (*old_pop)(struct atm_vcc *, struct sk_buff *);
+ void (*old_release_cb)(struct atm_vcc *);
+ struct module *old_owner;
+ /* keep old push/pop for detaching */
+ enum pppoatm_encaps encaps;
+ atomic_t inflight;
+ unsigned long blocked;
+ int flags; /* SC_COMP_PROT - compress protocol */
+ struct ppp_channel chan; /* interface to generic ppp layer */
+ struct tasklet_struct wakeup_tasklet;
+};
+
+/*
+ * We want to allow two packets in the queue. The one that's currently in
+ * flight, and *one* queued up ready for the ATM device to send immediately
+ * from its TX done IRQ. We want to be able to use atomic_inc_not_zero(), so
+ * inflight == -2 represents an empty queue, -1 one packet, and zero means
+ * there are two packets in the queue.
+ */
+#define NONE_INFLIGHT -2
+
+#define BLOCKED 0
+
+/*
+ * Header used for LLC Encapsulated PPP (4 bytes) followed by the LCP protocol
+ * ID (0xC021) used in autodetection
+ */
+static const unsigned char pppllc[6] = { 0xFE, 0xFE, 0x03, 0xCF, 0xC0, 0x21 };
+#define LLC_LEN (4)
+
+static inline struct pppoatm_vcc *atmvcc_to_pvcc(const struct atm_vcc *atmvcc)
+{
+ return (struct pppoatm_vcc *) (atmvcc->user_back);
+}
+
+static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan)
+{
+ return (struct pppoatm_vcc *) (chan->private);
+}
+
+/*
+ * We can't do this directly from our _pop handler, since the ppp code
+ * doesn't want to be called in interrupt context, so we do it from
+ * a tasklet
+ */
+static void pppoatm_wakeup_sender(unsigned long arg)
+{
+ ppp_output_wakeup((struct ppp_channel *) arg);
+}
+
+static void pppoatm_release_cb(struct atm_vcc *atmvcc)
+{
+ struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+
+ /*
+ * As in pppoatm_pop(), it's safe to clear the BLOCKED bit here because
+ * the wakeup *can't* race with pppoatm_send(). They both hold the PPP
+ * channel's ->downl lock. And the potential race with *setting* it,
+ * which leads to the double-check dance in pppoatm_may_send(), doesn't
+ * exist here. In the sock_owned_by_user() case in pppoatm_send(), we
+ * set the BLOCKED bit while the socket is still locked. We know that
+ * ->release_cb() can't be called until that's done.
+ */
+ if (test_and_clear_bit(BLOCKED, &pvcc->blocked))
+ tasklet_schedule(&pvcc->wakeup_tasklet);
+ if (pvcc->old_release_cb)
+ pvcc->old_release_cb(atmvcc);
+}
+/*
+ * This gets called every time the ATM card has finished sending our
+ * skb. The ->old_pop will take care up normal atm flow control,
+ * but we also need to wake up the device if we blocked it
+ */
+static void pppoatm_pop(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+ struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+
+ pvcc->old_pop(atmvcc, skb);
+ atomic_dec(&pvcc->inflight);
+
+ /*
+ * We always used to run the wakeup tasklet unconditionally here, for
+ * fear of race conditions where we clear the BLOCKED flag just as we
+ * refuse another packet in pppoatm_send(). This was quite inefficient.
+ *
+ * In fact it's OK. The PPP core will only ever call pppoatm_send()
+ * while holding the channel->downl lock. And ppp_output_wakeup() as
+ * called by the tasklet will *also* grab that lock. So even if another
+ * CPU is in pppoatm_send() right now, the tasklet isn't going to race
+ * with it. The wakeup *will* happen after the other CPU is safely out
+ * of pppoatm_send() again.
+ *
+ * So if the CPU in pppoatm_send() has already set the BLOCKED bit and
+ * it about to return, that's fine. We trigger a wakeup which will
+ * happen later. And if the CPU in pppoatm_send() *hasn't* set the
+ * BLOCKED bit yet, that's fine too because of the double check in
+ * pppoatm_may_send() which is commented there.
+ */
+ if (test_and_clear_bit(BLOCKED, &pvcc->blocked))
+ tasklet_schedule(&pvcc->wakeup_tasklet);
+}
+
+/*
+ * Unbind from PPP - currently we only do this when closing the socket,
+ * but we could put this into an ioctl if need be
+ */
+static void pppoatm_unassign_vcc(struct atm_vcc *atmvcc)
+{
+ struct pppoatm_vcc *pvcc;
+ pvcc = atmvcc_to_pvcc(atmvcc);
+ atmvcc->push = pvcc->old_push;
+ atmvcc->pop = pvcc->old_pop;
+ atmvcc->release_cb = pvcc->old_release_cb;
+ tasklet_kill(&pvcc->wakeup_tasklet);
+ ppp_unregister_channel(&pvcc->chan);
+ atmvcc->user_back = NULL;
+ kfree(pvcc);
+}
+
+/* Called when an AAL5 PDU comes in */
+static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+ struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+ pr_debug("\n");
+ if (skb == NULL) { /* VCC was closed */
+ struct module *module;
+
+ pr_debug("removing ATMPPP VCC %p\n", pvcc);
+ module = pvcc->old_owner;
+ pppoatm_unassign_vcc(atmvcc);
+ atmvcc->push(atmvcc, NULL); /* Pass along bad news */
+ module_put(module);
+ return;
+ }
+ atm_return(atmvcc, skb->truesize);
+ switch (pvcc->encaps) {
+ case e_llc:
+ if (skb->len < LLC_LEN ||
+ memcmp(skb->data, pppllc, LLC_LEN))
+ goto error;
+ skb_pull(skb, LLC_LEN);
+ break;
+ case e_autodetect:
+ if (pvcc->chan.ppp == NULL) { /* Not bound yet! */
+ kfree_skb(skb);
+ return;
+ }
+ if (skb->len >= sizeof(pppllc) &&
+ !memcmp(skb->data, pppllc, sizeof(pppllc))) {
+ pvcc->encaps = e_llc;
+ skb_pull(skb, LLC_LEN);
+ break;
+ }
+ if (skb->len >= (sizeof(pppllc) - LLC_LEN) &&
+ !memcmp(skb->data, &pppllc[LLC_LEN],
+ sizeof(pppllc) - LLC_LEN)) {
+ pvcc->encaps = e_vc;
+ pvcc->chan.mtu += LLC_LEN;
+ break;
+ }
+ pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n",
+ skb->data[0], skb->data[1], skb->data[2],
+ skb->data[3], skb->data[4], skb->data[5]);
+ goto error;
+ case e_vc:
+ break;
+ }
+ ppp_input(&pvcc->chan, skb);
+ return;
+
+error:
+ kfree_skb(skb);
+ ppp_input_error(&pvcc->chan, 0);
+}
+
+static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size)
+{
+ /*
+ * It's not clear that we need to bother with using atm_may_send()
+ * to check we don't exceed sk->sk_sndbuf. If userspace sets a
+ * value of sk_sndbuf which is lower than the MTU, we're going to
+ * block for ever. But the code always did that before we introduced
+ * the packet count limit, so...
+ */
+ if (atm_may_send(pvcc->atmvcc, size) &&
+ atomic_inc_not_zero_hint(&pvcc->inflight, NONE_INFLIGHT))
+ return 1;
+
+ /*
+ * We use test_and_set_bit() rather than set_bit() here because
+ * we need to ensure there's a memory barrier after it. The bit
+ * *must* be set before we do the atomic_inc() on pvcc->inflight.
+ * There's no smp_mb__after_set_bit(), so it's this or abuse
+ * smp_mb__after_atomic().
+ */
+ test_and_set_bit(BLOCKED, &pvcc->blocked);
+
+ /*
+ * We may have raced with pppoatm_pop(). If it ran for the
+ * last packet in the queue, *just* before we set the BLOCKED
+ * bit, then it might never run again and the channel could
+ * remain permanently blocked. Cope with that race by checking
+ * *again*. If it did run in that window, we'll have space on
+ * the queue now and can return success. It's harmless to leave
+ * the BLOCKED flag set, since it's only used as a trigger to
+ * run the wakeup tasklet. Another wakeup will never hurt.
+ * If pppoatm_pop() is running but hasn't got as far as making
+ * space on the queue yet, then it hasn't checked the BLOCKED
+ * flag yet either, so we're safe in that case too. It'll issue
+ * an "immediate" wakeup... where "immediate" actually involves
+ * taking the PPP channel's ->downl lock, which is held by the
+ * code path that calls pppoatm_send(), and is thus going to
+ * wait for us to finish.
+ */
+ if (atm_may_send(pvcc->atmvcc, size) &&
+ atomic_inc_not_zero(&pvcc->inflight))
+ return 1;
+
+ return 0;
+}
+/*
+ * Called by the ppp_generic.c to send a packet - returns true if packet
+ * was accepted. If we return false, then it's our job to call
+ * ppp_output_wakeup(chan) when we're feeling more up to it.
+ * Note that in the ENOMEM case (as opposed to the !atm_may_send case)
+ * we should really drop the packet, but the generic layer doesn't
+ * support this yet. We just return 'DROP_PACKET' which we actually define
+ * as success, just to be clear what we're really doing.
+ */
+#define DROP_PACKET 1
+static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ struct pppoatm_vcc *pvcc = chan_to_pvcc(chan);
+ struct atm_vcc *vcc;
+ int ret;
+
+ ATM_SKB(skb)->vcc = pvcc->atmvcc;
+ pr_debug("(skb=0x%p, vcc=0x%p)\n", skb, pvcc->atmvcc);
+ if (skb->data[0] == '\0' && (pvcc->flags & SC_COMP_PROT))
+ (void) skb_pull(skb, 1);
+
+ vcc = ATM_SKB(skb)->vcc;
+ bh_lock_sock(sk_atm(vcc));
+ if (sock_owned_by_user(sk_atm(vcc))) {
+ /*
+ * Needs to happen (and be flushed, hence test_and_) before we unlock
+ * the socket. It needs to be seen by the time our ->release_cb gets
+ * called.
+ */
+ test_and_set_bit(BLOCKED, &pvcc->blocked);
+ goto nospace;
+ }
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags)) {
+ bh_unlock_sock(sk_atm(vcc));
+ kfree_skb(skb);
+ return DROP_PACKET;
+ }
+
+ switch (pvcc->encaps) { /* LLC encapsulation needed */
+ case e_llc:
+ if (skb_headroom(skb) < LLC_LEN) {
+ struct sk_buff *n;
+ n = skb_realloc_headroom(skb, LLC_LEN);
+ if (n != NULL &&
+ !pppoatm_may_send(pvcc, n->truesize)) {
+ kfree_skb(n);
+ goto nospace;
+ }
+ consume_skb(skb);
+ skb = n;
+ if (skb == NULL) {
+ bh_unlock_sock(sk_atm(vcc));
+ return DROP_PACKET;
+ }
+ } else if (!pppoatm_may_send(pvcc, skb->truesize))
+ goto nospace;
+ memcpy(skb_push(skb, LLC_LEN), pppllc, LLC_LEN);
+ break;
+ case e_vc:
+ if (!pppoatm_may_send(pvcc, skb->truesize))
+ goto nospace;
+ break;
+ case e_autodetect:
+ bh_unlock_sock(sk_atm(vcc));
+ pr_debug("Trying to send without setting encaps!\n");
+ kfree_skb(skb);
+ return 1;
+ }
+
+ atomic_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc);
+ ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options;
+ pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n",
+ skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev);
+ ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
+ ? DROP_PACKET : 1;
+ bh_unlock_sock(sk_atm(vcc));
+ return ret;
+nospace:
+ bh_unlock_sock(sk_atm(vcc));
+ /*
+ * We don't have space to send this SKB now, but we might have
+ * already applied SC_COMP_PROT compression, so may need to undo
+ */
+ if ((pvcc->flags & SC_COMP_PROT) && skb_headroom(skb) > 0 &&
+ skb->data[-1] == '\0')
+ (void) skb_push(skb, 1);
+ return 0;
+}
+
+/* This handles ioctls sent to the /dev/ppp interface */
+static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case PPPIOCGFLAGS:
+ return put_user(chan_to_pvcc(chan)->flags, (int __user *) arg)
+ ? -EFAULT : 0;
+ case PPPIOCSFLAGS:
+ return get_user(chan_to_pvcc(chan)->flags, (int __user *) arg)
+ ? -EFAULT : 0;
+ }
+ return -ENOTTY;
+}
+
+static const struct ppp_channel_ops pppoatm_ops = {
+ .start_xmit = pppoatm_send,
+ .ioctl = pppoatm_devppp_ioctl,
+};
+
+static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
+{
+ struct atm_backend_ppp be;
+ struct pppoatm_vcc *pvcc;
+ int err;
+ /*
+ * Each PPPoATM instance has its own tasklet - this is just a
+ * prototypical one used to initialize them
+ */
+ static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0);
+ if (copy_from_user(&be, arg, sizeof be))
+ return -EFAULT;
+ if (be.encaps != PPPOATM_ENCAPS_AUTODETECT &&
+ be.encaps != PPPOATM_ENCAPS_VC && be.encaps != PPPOATM_ENCAPS_LLC)
+ return -EINVAL;
+ pvcc = kzalloc(sizeof(*pvcc), GFP_KERNEL);
+ if (pvcc == NULL)
+ return -ENOMEM;
+ pvcc->atmvcc = atmvcc;
+
+ /* Maximum is zero, so that we can use atomic_inc_not_zero() */
+ atomic_set(&pvcc->inflight, NONE_INFLIGHT);
+ pvcc->old_push = atmvcc->push;
+ pvcc->old_pop = atmvcc->pop;
+ pvcc->old_owner = atmvcc->owner;
+ pvcc->old_release_cb = atmvcc->release_cb;
+ pvcc->encaps = (enum pppoatm_encaps) be.encaps;
+ pvcc->chan.private = pvcc;
+ pvcc->chan.ops = &pppoatm_ops;
+ pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN -
+ (be.encaps == e_vc ? 0 : LLC_LEN);
+ pvcc->wakeup_tasklet = tasklet_proto;
+ pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan;
+ err = ppp_register_channel(&pvcc->chan);
+ if (err != 0) {
+ kfree(pvcc);
+ return err;
+ }
+ atmvcc->user_back = pvcc;
+ atmvcc->push = pppoatm_push;
+ atmvcc->pop = pppoatm_pop;
+ atmvcc->release_cb = pppoatm_release_cb;
+ __module_get(THIS_MODULE);
+ atmvcc->owner = THIS_MODULE;
+
+ /* re-process everything received between connection setup and
+ backend setup */
+ vcc_process_recv_queue(atmvcc);
+ return 0;
+}
+
+/*
+ * This handles ioctls actually performed on our vcc - we must return
+ * -ENOIOCTLCMD for any unrecognized ioctl
+ */
+static int pppoatm_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct atm_vcc *atmvcc = ATM_SD(sock);
+ void __user *argp = (void __user *)arg;
+
+ if (cmd != ATM_SETBACKEND && atmvcc->push != pppoatm_push)
+ return -ENOIOCTLCMD;
+ switch (cmd) {
+ case ATM_SETBACKEND: {
+ atm_backend_t b;
+ if (get_user(b, (atm_backend_t __user *) argp))
+ return -EFAULT;
+ if (b != ATM_BACKEND_PPP)
+ return -ENOIOCTLCMD;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (sock->state != SS_CONNECTED)
+ return -EINVAL;
+ return pppoatm_assign_vcc(atmvcc, argp);
+ }
+ case PPPIOCGCHAN:
+ return put_user(ppp_channel_index(&atmvcc_to_pvcc(atmvcc)->
+ chan), (int __user *) argp) ? -EFAULT : 0;
+ case PPPIOCGUNIT:
+ return put_user(ppp_unit_number(&atmvcc_to_pvcc(atmvcc)->
+ chan), (int __user *) argp) ? -EFAULT : 0;
+ }
+ return -ENOIOCTLCMD;
+}
+
+static struct atm_ioctl pppoatm_ioctl_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = pppoatm_ioctl,
+};
+
+static int __init pppoatm_init(void)
+{
+ register_atm_ioctl(&pppoatm_ioctl_ops);
+ return 0;
+}
+
+static void __exit pppoatm_exit(void)
+{
+ deregister_atm_ioctl(&pppoatm_ioctl_ops);
+}
+
+module_init(pppoatm_init);
+module_exit(pppoatm_exit);
+
+MODULE_AUTHOR("Mitchell Blank Jr <mitch@sfgoth.com>");
+MODULE_DESCRIPTION("RFC2364 PPP over ATM/AAL5");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/proc.c b/net/atm/proc.c
new file mode 100644
index 000000000..bbb6461a4
--- /dev/null
+++ b/net/atm/proc.c
@@ -0,0 +1,497 @@
+/* net/atm/proc.c - ATM /proc interface
+ *
+ * Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA
+ *
+ * seq_file api usage by romieu@fr.zoreil.com
+ *
+ * Evaluating the efficiency of the whole thing if left as an exercise to
+ * the reader.
+ */
+
+#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/errno.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/netdevice.h>
+#include <linux/atmclip.h>
+#include <linux/init.h> /* for __init */
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/atmclip.h>
+#include <linux/uaccess.h>
+#include <linux/param.h> /* for HZ */
+#include <linux/atomic.h>
+#include "resources.h"
+#include "common.h" /* atm_proc_init prototype */
+#include "signaling.h" /* to get sigd - ugly too */
+
+static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos);
+
+static const struct file_operations proc_atm_dev_ops = {
+ .owner = THIS_MODULE,
+ .read = proc_dev_atm_read,
+ .llseek = noop_llseek,
+};
+
+static void add_stats(struct seq_file *seq, const char *aal,
+ const struct k_atm_aal_stats *stats)
+{
+ seq_printf(seq, "%s ( %d %d %d %d %d )", aal,
+ atomic_read(&stats->tx), atomic_read(&stats->tx_err),
+ atomic_read(&stats->rx), atomic_read(&stats->rx_err),
+ atomic_read(&stats->rx_drop));
+}
+
+static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev)
+{
+ int i;
+
+ seq_printf(seq, "%3d %-8s", dev->number, dev->type);
+ for (i = 0; i < ESI_LEN; i++)
+ seq_printf(seq, "%02x", dev->esi[i]);
+ seq_puts(seq, " ");
+ add_stats(seq, "0", &dev->stats.aal0);
+ seq_puts(seq, " ");
+ add_stats(seq, "5", &dev->stats.aal5);
+ seq_printf(seq, "\t[%d]", atomic_read(&dev->refcnt));
+ seq_putc(seq, '\n');
+}
+
+struct vcc_state {
+ int bucket;
+ struct sock *sk;
+ int family;
+};
+
+static inline int compare_family(struct sock *sk, int family)
+{
+ return !family || (sk->sk_family == family);
+}
+
+static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l)
+{
+ struct sock *sk = *sock;
+
+ if (sk == SEQ_START_TOKEN) {
+ for (*bucket = 0; *bucket < VCC_HTABLE_SIZE; ++*bucket) {
+ struct hlist_head *head = &vcc_hash[*bucket];
+
+ sk = hlist_empty(head) ? NULL : __sk_head(head);
+ if (sk)
+ break;
+ }
+ l--;
+ }
+try_again:
+ for (; sk; sk = sk_next(sk)) {
+ l -= compare_family(sk, family);
+ if (l < 0)
+ goto out;
+ }
+ if (!sk && ++*bucket < VCC_HTABLE_SIZE) {
+ sk = sk_head(&vcc_hash[*bucket]);
+ goto try_again;
+ }
+ sk = SEQ_START_TOKEN;
+out:
+ *sock = sk;
+ return (l < 0);
+}
+
+static inline void *vcc_walk(struct vcc_state *state, loff_t l)
+{
+ return __vcc_walk(&state->sk, state->family, &state->bucket, l) ?
+ state : NULL;
+}
+
+static int __vcc_seq_open(struct inode *inode, struct file *file,
+ int family, const struct seq_operations *ops)
+{
+ struct vcc_state *state;
+
+ state = __seq_open_private(file, ops, sizeof(*state));
+ if (state == NULL)
+ return -ENOMEM;
+
+ state->family = family;
+ return 0;
+}
+
+static void *vcc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(vcc_sklist_lock)
+{
+ struct vcc_state *state = seq->private;
+ loff_t left = *pos;
+
+ read_lock(&vcc_sklist_lock);
+ state->sk = SEQ_START_TOKEN;
+ return left ? vcc_walk(state, left) : SEQ_START_TOKEN;
+}
+
+static void vcc_seq_stop(struct seq_file *seq, void *v)
+ __releases(vcc_sklist_lock)
+{
+ read_unlock(&vcc_sklist_lock);
+}
+
+static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct vcc_state *state = seq->private;
+
+ v = vcc_walk(state, 1);
+ *pos += !!PTR_ERR(v);
+ return v;
+}
+
+static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+ static const char *const class_name[] = {
+ "off", "UBR", "CBR", "VBR", "ABR"};
+ static const char *const aal_name[] = {
+ "---", "1", "2", "3/4", /* 0- 3 */
+ "???", "5", "???", "???", /* 4- 7 */
+ "???", "???", "???", "???", /* 8-11 */
+ "???", "0", "???", "???"}; /* 12-15 */
+
+ seq_printf(seq, "%3d %3d %5d %-3s %7d %-5s %7d %-6s",
+ vcc->dev->number, vcc->vpi, vcc->vci,
+ vcc->qos.aal >= ARRAY_SIZE(aal_name) ? "err" :
+ aal_name[vcc->qos.aal], vcc->qos.rxtp.min_pcr,
+ class_name[vcc->qos.rxtp.traffic_class],
+ vcc->qos.txtp.min_pcr,
+ class_name[vcc->qos.txtp.traffic_class]);
+ if (test_bit(ATM_VF_IS_CLIP, &vcc->flags)) {
+ struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
+ struct net_device *dev;
+
+ dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : NULL;
+ seq_printf(seq, "CLIP, Itf:%s, Encap:",
+ dev ? dev->name : "none?");
+ seq_printf(seq, "%s", clip_vcc->encap ? "LLC/SNAP" : "None");
+ }
+ seq_putc(seq, '\n');
+}
+
+static const char *vcc_state(struct atm_vcc *vcc)
+{
+ static const char *const map[] = { ATM_VS2TXT_MAP };
+
+ return map[ATM_VF2VS(vcc->flags)];
+}
+
+static void vcc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+ struct sock *sk = sk_atm(vcc);
+
+ seq_printf(seq, "%pK ", vcc);
+ if (!vcc->dev)
+ seq_printf(seq, "Unassigned ");
+ else
+ seq_printf(seq, "%3d %3d %5d ", vcc->dev->number, vcc->vpi,
+ vcc->vci);
+ switch (sk->sk_family) {
+ case AF_ATMPVC:
+ seq_printf(seq, "PVC");
+ break;
+ case AF_ATMSVC:
+ seq_printf(seq, "SVC");
+ break;
+ default:
+ seq_printf(seq, "%3d", sk->sk_family);
+ }
+ seq_printf(seq, " %04lx %5d %7d/%7d %7d/%7d [%d]\n",
+ vcc->flags, sk->sk_err,
+ sk_wmem_alloc_get(sk), sk->sk_sndbuf,
+ sk_rmem_alloc_get(sk), sk->sk_rcvbuf,
+ atomic_read(&sk->sk_refcnt));
+}
+
+static void svc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+ if (!vcc->dev)
+ seq_printf(seq, sizeof(void *) == 4 ?
+ "N/A@%pK%10s" : "N/A@%pK%2s", vcc, "");
+ else
+ seq_printf(seq, "%3d %3d %5d ",
+ vcc->dev->number, vcc->vpi, vcc->vci);
+ seq_printf(seq, "%-10s ", vcc_state(vcc));
+ seq_printf(seq, "%s%s", vcc->remote.sas_addr.pub,
+ *vcc->remote.sas_addr.pub && *vcc->remote.sas_addr.prv ? "+" : "");
+ if (*vcc->remote.sas_addr.prv) {
+ int i;
+
+ for (i = 0; i < ATM_ESA_LEN; i++)
+ seq_printf(seq, "%02x", vcc->remote.sas_addr.prv[i]);
+ }
+ seq_putc(seq, '\n');
+}
+
+static int atm_dev_seq_show(struct seq_file *seq, void *v)
+{
+ static char atm_dev_banner[] =
+ "Itf Type ESI/\"MAC\"addr "
+ "AAL(TX,err,RX,err,drop) ... [refcnt]\n";
+
+ if (v == &atm_devs)
+ seq_puts(seq, atm_dev_banner);
+ else {
+ struct atm_dev *dev = list_entry(v, struct atm_dev, dev_list);
+
+ atm_dev_info(seq, dev);
+ }
+ return 0;
+}
+
+static const struct seq_operations atm_dev_seq_ops = {
+ .start = atm_dev_seq_start,
+ .next = atm_dev_seq_next,
+ .stop = atm_dev_seq_stop,
+ .show = atm_dev_seq_show,
+};
+
+static int atm_dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &atm_dev_seq_ops);
+}
+
+static const struct file_operations devices_seq_fops = {
+ .open = atm_dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int pvc_seq_show(struct seq_file *seq, void *v)
+{
+ static char atm_pvc_banner[] =
+ "Itf VPI VCI AAL RX(PCR,Class) TX(PCR,Class)\n";
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, atm_pvc_banner);
+ else {
+ struct vcc_state *state = seq->private;
+ struct atm_vcc *vcc = atm_sk(state->sk);
+
+ pvc_info(seq, vcc);
+ }
+ return 0;
+}
+
+static const struct seq_operations pvc_seq_ops = {
+ .start = vcc_seq_start,
+ .next = vcc_seq_next,
+ .stop = vcc_seq_stop,
+ .show = pvc_seq_show,
+};
+
+static int pvc_seq_open(struct inode *inode, struct file *file)
+{
+ return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops);
+}
+
+static const struct file_operations pvc_seq_fops = {
+ .open = pvc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int vcc_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s",
+ "Address ", "Itf VPI VCI Fam Flags Reply "
+ "Send buffer Recv buffer [refcnt]\n");
+ } else {
+ struct vcc_state *state = seq->private;
+ struct atm_vcc *vcc = atm_sk(state->sk);
+
+ vcc_info(seq, vcc);
+ }
+ return 0;
+}
+
+static const struct seq_operations vcc_seq_ops = {
+ .start = vcc_seq_start,
+ .next = vcc_seq_next,
+ .stop = vcc_seq_stop,
+ .show = vcc_seq_show,
+};
+
+static int vcc_seq_open(struct inode *inode, struct file *file)
+{
+ return __vcc_seq_open(inode, file, 0, &vcc_seq_ops);
+}
+
+static const struct file_operations vcc_seq_fops = {
+ .open = vcc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int svc_seq_show(struct seq_file *seq, void *v)
+{
+ static const char atm_svc_banner[] =
+ "Itf VPI VCI State Remote\n";
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, atm_svc_banner);
+ else {
+ struct vcc_state *state = seq->private;
+ struct atm_vcc *vcc = atm_sk(state->sk);
+
+ svc_info(seq, vcc);
+ }
+ return 0;
+}
+
+static const struct seq_operations svc_seq_ops = {
+ .start = vcc_seq_start,
+ .next = vcc_seq_next,
+ .stop = vcc_seq_stop,
+ .show = svc_seq_show,
+};
+
+static int svc_seq_open(struct inode *inode, struct file *file)
+{
+ return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops);
+}
+
+static const struct file_operations svc_seq_fops = {
+ .open = svc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct atm_dev *dev;
+ unsigned long page;
+ int length;
+
+ if (count == 0)
+ return 0;
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ dev = PDE_DATA(file_inode(file));
+ if (!dev->ops->proc_read)
+ length = -EINVAL;
+ else {
+ length = dev->ops->proc_read(dev, pos, (char *)page);
+ if (length > count)
+ length = -EINVAL;
+ }
+ if (length >= 0) {
+ if (copy_to_user(buf, (char *)page, length))
+ length = -EFAULT;
+ (*pos)++;
+ }
+ free_page(page);
+ return length;
+}
+
+struct proc_dir_entry *atm_proc_root;
+EXPORT_SYMBOL(atm_proc_root);
+
+
+int atm_proc_dev_register(struct atm_dev *dev)
+{
+ int error;
+
+ /* No proc info */
+ if (!dev->ops->proc_read)
+ return 0;
+
+ error = -ENOMEM;
+ dev->proc_name = kasprintf(GFP_KERNEL, "%s:%d", dev->type, dev->number);
+ if (!dev->proc_name)
+ goto err_out;
+
+ dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root,
+ &proc_atm_dev_ops, dev);
+ if (!dev->proc_entry)
+ goto err_free_name;
+ return 0;
+
+err_free_name:
+ kfree(dev->proc_name);
+err_out:
+ return error;
+}
+
+void atm_proc_dev_deregister(struct atm_dev *dev)
+{
+ if (!dev->ops->proc_read)
+ return;
+
+ remove_proc_entry(dev->proc_name, atm_proc_root);
+ kfree(dev->proc_name);
+}
+
+static struct atm_proc_entry {
+ char *name;
+ const struct file_operations *proc_fops;
+ struct proc_dir_entry *dirent;
+} atm_proc_ents[] = {
+ { .name = "devices", .proc_fops = &devices_seq_fops },
+ { .name = "pvc", .proc_fops = &pvc_seq_fops },
+ { .name = "svc", .proc_fops = &svc_seq_fops },
+ { .name = "vc", .proc_fops = &vcc_seq_fops },
+ { .name = NULL, .proc_fops = NULL }
+};
+
+static void atm_proc_dirs_remove(void)
+{
+ static struct atm_proc_entry *e;
+
+ for (e = atm_proc_ents; e->name; e++) {
+ if (e->dirent)
+ remove_proc_entry(e->name, atm_proc_root);
+ }
+ remove_proc_entry("atm", init_net.proc_net);
+}
+
+int __init atm_proc_init(void)
+{
+ static struct atm_proc_entry *e;
+ int ret;
+
+ atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net);
+ if (!atm_proc_root)
+ goto err_out;
+ for (e = atm_proc_ents; e->name; e++) {
+ struct proc_dir_entry *dirent;
+
+ dirent = proc_create(e->name, S_IRUGO,
+ atm_proc_root, e->proc_fops);
+ if (!dirent)
+ goto err_out_remove;
+ e->dirent = dirent;
+ }
+ ret = 0;
+out:
+ return ret;
+
+err_out_remove:
+ atm_proc_dirs_remove();
+err_out:
+ ret = -ENOMEM;
+ goto out;
+}
+
+void atm_proc_exit(void)
+{
+ atm_proc_dirs_remove();
+}
diff --git a/net/atm/protocols.h b/net/atm/protocols.h
new file mode 100644
index 000000000..acdfc8562
--- /dev/null
+++ b/net/atm/protocols.h
@@ -0,0 +1,13 @@
+/* net/atm/protocols.h - ATM protocol handler entry points */
+
+/* Written 1995-1997 by Werner Almesberger, EPFL LRC */
+
+
+#ifndef NET_ATM_PROTOCOLS_H
+#define NET_ATM_PROTOCOLS_H
+
+int atm_init_aal0(struct atm_vcc *vcc); /* "raw" AAL0 */
+int atm_init_aal34(struct atm_vcc *vcc);/* "raw" AAL3/4 transport */
+int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */
+
+#endif
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
new file mode 100644
index 000000000..ae0324021
--- /dev/null
+++ b/net/atm/pvc.c
@@ -0,0 +1,162 @@
+/* net/atm/pvc.c - ATM PVC sockets */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#include <linux/net.h> /* struct socket, struct proto_ops */
+#include <linux/atm.h> /* ATM stuff */
+#include <linux/atmdev.h> /* ATM devices */
+#include <linux/errno.h> /* error codes */
+#include <linux/kernel.h> /* printk */
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+#include <net/sock.h> /* for sock_no_* */
+
+#include "resources.h" /* devs and vccs */
+#include "common.h" /* common for PVCs and SVCs */
+
+
+static int pvc_shutdown(struct socket *sock, int how)
+{
+ return 0;
+}
+
+static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr,
+ int sockaddr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_atmpvc *addr;
+ struct atm_vcc *vcc;
+ int error;
+
+ if (sockaddr_len != sizeof(struct sockaddr_atmpvc))
+ return -EINVAL;
+ addr = (struct sockaddr_atmpvc *)sockaddr;
+ if (addr->sap_family != AF_ATMPVC)
+ return -EAFNOSUPPORT;
+ lock_sock(sk);
+ vcc = ATM_SD(sock);
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+ error = -EBADFD;
+ goto out;
+ }
+ if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) {
+ if (vcc->vpi != ATM_VPI_UNSPEC)
+ addr->sap_addr.vpi = vcc->vpi;
+ if (vcc->vci != ATM_VCI_UNSPEC)
+ addr->sap_addr.vci = vcc->vci;
+ }
+ error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi,
+ addr->sap_addr.vci);
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
+ int sockaddr_len, int flags)
+{
+ return pvc_bind(sock, sockaddr, sockaddr_len);
+}
+
+static int pvc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int error;
+
+ lock_sock(sk);
+ error = vcc_setsockopt(sock, level, optname, optval, optlen);
+ release_sock(sk);
+ return error;
+}
+
+static int pvc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int error;
+
+ lock_sock(sk);
+ error = vcc_getsockopt(sock, level, optname, optval, optlen);
+ release_sock(sk);
+ return error;
+}
+
+static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
+ int *sockaddr_len, int peer)
+{
+ struct sockaddr_atmpvc *addr;
+ struct atm_vcc *vcc = ATM_SD(sock);
+
+ if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+ return -ENOTCONN;
+ *sockaddr_len = sizeof(struct sockaddr_atmpvc);
+ addr = (struct sockaddr_atmpvc *)sockaddr;
+ memset(addr, 0, sizeof(*addr));
+ addr->sap_family = AF_ATMPVC;
+ addr->sap_addr.itf = vcc->dev->number;
+ addr->sap_addr.vpi = vcc->vpi;
+ addr->sap_addr.vci = vcc->vci;
+ return 0;
+}
+
+static const struct proto_ops pvc_proto_ops = {
+ .family = PF_ATMPVC,
+ .owner = THIS_MODULE,
+
+ .release = vcc_release,
+ .bind = pvc_bind,
+ .connect = pvc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = pvc_getname,
+ .poll = vcc_poll,
+ .ioctl = vcc_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = vcc_compat_ioctl,
+#endif
+ .listen = sock_no_listen,
+ .shutdown = pvc_shutdown,
+ .setsockopt = pvc_setsockopt,
+ .getsockopt = pvc_getsockopt,
+ .sendmsg = vcc_sendmsg,
+ .recvmsg = vcc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+
+static int pvc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
+ sock->ops = &pvc_proto_ops;
+ return vcc_create(net, sock, protocol, PF_ATMPVC);
+}
+
+static const struct net_proto_family pvc_family_ops = {
+ .family = PF_ATMPVC,
+ .create = pvc_create,
+ .owner = THIS_MODULE,
+};
+
+
+/*
+ * Initialize the ATM PVC protocol family
+ */
+
+
+int __init atmpvc_init(void)
+{
+ return sock_register(&pvc_family_ops);
+}
+
+void atmpvc_exit(void)
+{
+ sock_unregister(PF_ATMPVC);
+}
diff --git a/net/atm/raw.c b/net/atm/raw.c
new file mode 100644
index 000000000..2e17e97a7
--- /dev/null
+++ b/net/atm/raw.c
@@ -0,0 +1,85 @@
+/* net/atm/raw.c - Raw AAL0 and AAL5 transports */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "common.h"
+#include "protocols.h"
+
+/*
+ * SKB == NULL indicates that the link is being closed
+ */
+
+static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ if (skb) {
+ struct sock *sk = sk_atm(vcc);
+
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ }
+}
+
+static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct sock *sk = sk_atm(vcc);
+
+ pr_debug("(%d) %d -= %d\n",
+ vcc->vci, sk_wmem_alloc_get(sk), skb->truesize);
+ atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ dev_kfree_skb_any(skb);
+ sk->sk_write_space(sk);
+}
+
+static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ /*
+ * Note that if vpi/vci are _ANY or _UNSPEC the below will
+ * still work
+ */
+ if (!capable(CAP_NET_ADMIN) &&
+ (((u32 *)skb->data)[0] & (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK)) !=
+ ((vcc->vpi << ATM_HDR_VPI_SHIFT) |
+ (vcc->vci << ATM_HDR_VCI_SHIFT))) {
+ kfree_skb(skb);
+ return -EADDRNOTAVAIL;
+ }
+ return vcc->dev->ops->send(vcc, skb);
+}
+
+int atm_init_aal0(struct atm_vcc *vcc)
+{
+ vcc->push = atm_push_raw;
+ vcc->pop = atm_pop_raw;
+ vcc->push_oam = NULL;
+ vcc->send = atm_send_aal0;
+ return 0;
+}
+
+int atm_init_aal34(struct atm_vcc *vcc)
+{
+ vcc->push = atm_push_raw;
+ vcc->pop = atm_pop_raw;
+ vcc->push_oam = NULL;
+ vcc->send = vcc->dev->ops->send;
+ return 0;
+}
+
+int atm_init_aal5(struct atm_vcc *vcc)
+{
+ vcc->push = atm_push_raw;
+ vcc->pop = atm_pop_raw;
+ vcc->push_oam = NULL;
+ vcc->send = vcc->dev->ops->send;
+ return 0;
+}
+EXPORT_SYMBOL(atm_init_aal5);
diff --git a/net/atm/resources.c b/net/atm/resources.c
new file mode 100644
index 000000000..0447d5d0b
--- /dev/null
+++ b/net/atm/resources.c
@@ -0,0 +1,463 @@
+/* net/atm/resources.c - Statically allocated resources */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+/* Fixes
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * 2002/01 - don't free the whole struct sock on sk->destruct time,
+ * use the default destruct function initialized by sock_init_data */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/atmdev.h>
+#include <linux/sonet.h>
+#include <linux/kernel.h> /* for barrier */
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <net/sock.h> /* for struct sock */
+
+#include "common.h"
+#include "resources.h"
+#include "addr.h"
+
+
+LIST_HEAD(atm_devs);
+DEFINE_MUTEX(atm_dev_mutex);
+
+static struct atm_dev *__alloc_atm_dev(const char *type)
+{
+ struct atm_dev *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return NULL;
+ dev->type = type;
+ dev->signal = ATM_PHY_SIG_UNKNOWN;
+ dev->link_rate = ATM_OC3_PCR;
+ spin_lock_init(&dev->lock);
+ INIT_LIST_HEAD(&dev->local);
+ INIT_LIST_HEAD(&dev->lecs);
+
+ return dev;
+}
+
+static struct atm_dev *__atm_dev_lookup(int number)
+{
+ struct atm_dev *dev;
+ struct list_head *p;
+
+ list_for_each(p, &atm_devs) {
+ dev = list_entry(p, struct atm_dev, dev_list);
+ if (dev->number == number) {
+ atm_dev_hold(dev);
+ return dev;
+ }
+ }
+ return NULL;
+}
+
+struct atm_dev *atm_dev_lookup(int number)
+{
+ struct atm_dev *dev;
+
+ mutex_lock(&atm_dev_mutex);
+ dev = __atm_dev_lookup(number);
+ mutex_unlock(&atm_dev_mutex);
+ return dev;
+}
+EXPORT_SYMBOL(atm_dev_lookup);
+
+struct atm_dev *atm_dev_register(const char *type, struct device *parent,
+ const struct atmdev_ops *ops, int number,
+ unsigned long *flags)
+{
+ struct atm_dev *dev, *inuse;
+
+ dev = __alloc_atm_dev(type);
+ if (!dev) {
+ pr_err("no space for dev %s\n", type);
+ return NULL;
+ }
+ mutex_lock(&atm_dev_mutex);
+ if (number != -1) {
+ inuse = __atm_dev_lookup(number);
+ if (inuse) {
+ atm_dev_put(inuse);
+ mutex_unlock(&atm_dev_mutex);
+ kfree(dev);
+ return NULL;
+ }
+ dev->number = number;
+ } else {
+ dev->number = 0;
+ while ((inuse = __atm_dev_lookup(dev->number))) {
+ atm_dev_put(inuse);
+ dev->number++;
+ }
+ }
+
+ dev->ops = ops;
+ if (flags)
+ dev->flags = *flags;
+ else
+ memset(&dev->flags, 0, sizeof(dev->flags));
+ memset(&dev->stats, 0, sizeof(dev->stats));
+ atomic_set(&dev->refcnt, 1);
+
+ if (atm_proc_dev_register(dev) < 0) {
+ pr_err("atm_proc_dev_register failed for dev %s\n", type);
+ goto out_fail;
+ }
+
+ if (atm_register_sysfs(dev, parent) < 0) {
+ pr_err("atm_register_sysfs failed for dev %s\n", type);
+ atm_proc_dev_deregister(dev);
+ goto out_fail;
+ }
+
+ list_add_tail(&dev->dev_list, &atm_devs);
+
+out:
+ mutex_unlock(&atm_dev_mutex);
+ return dev;
+
+out_fail:
+ kfree(dev);
+ dev = NULL;
+ goto out;
+}
+EXPORT_SYMBOL(atm_dev_register);
+
+void atm_dev_deregister(struct atm_dev *dev)
+{
+ BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags));
+ set_bit(ATM_DF_REMOVED, &dev->flags);
+
+ /*
+ * if we remove current device from atm_devs list, new device
+ * with same number can appear, such we need deregister proc,
+ * release async all vccs and remove them from vccs list too
+ */
+ mutex_lock(&atm_dev_mutex);
+ list_del(&dev->dev_list);
+ mutex_unlock(&atm_dev_mutex);
+
+ atm_dev_release_vccs(dev);
+ atm_unregister_sysfs(dev);
+ atm_proc_dev_deregister(dev);
+
+ atm_dev_put(dev);
+}
+EXPORT_SYMBOL(atm_dev_deregister);
+
+static void copy_aal_stats(struct k_atm_aal_stats *from,
+ struct atm_aal_stats *to)
+{
+#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i)
+ __AAL_STAT_ITEMS
+#undef __HANDLE_ITEM
+}
+
+static void subtract_aal_stats(struct k_atm_aal_stats *from,
+ struct atm_aal_stats *to)
+{
+#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i)
+ __AAL_STAT_ITEMS
+#undef __HANDLE_ITEM
+}
+
+static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg,
+ int zero)
+{
+ struct atm_dev_stats tmp;
+ int error = 0;
+
+ copy_aal_stats(&dev->stats.aal0, &tmp.aal0);
+ copy_aal_stats(&dev->stats.aal34, &tmp.aal34);
+ copy_aal_stats(&dev->stats.aal5, &tmp.aal5);
+ if (arg)
+ error = copy_to_user(arg, &tmp, sizeof(tmp));
+ if (zero && !error) {
+ subtract_aal_stats(&dev->stats.aal0, &tmp.aal0);
+ subtract_aal_stats(&dev->stats.aal34, &tmp.aal34);
+ subtract_aal_stats(&dev->stats.aal5, &tmp.aal5);
+ }
+ return error ? -EFAULT : 0;
+}
+
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
+{
+ void __user *buf;
+ int error, len, number, size = 0;
+ struct atm_dev *dev;
+ struct list_head *p;
+ int *tmp_buf, *tmp_p;
+ int __user *sioc_len;
+ int __user *iobuf_len;
+
+#ifndef CONFIG_COMPAT
+ compat = 0; /* Just so the compiler _knows_ */
+#endif
+
+ switch (cmd) {
+ case ATM_GETNAMES:
+ if (compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atm_iobuf __user *ciobuf = arg;
+ compat_uptr_t cbuf;
+ iobuf_len = &ciobuf->length;
+ if (get_user(cbuf, &ciobuf->buffer))
+ return -EFAULT;
+ buf = compat_ptr(cbuf);
+#endif
+ } else {
+ struct atm_iobuf __user *iobuf = arg;
+ iobuf_len = &iobuf->length;
+ if (get_user(buf, &iobuf->buffer))
+ return -EFAULT;
+ }
+ if (get_user(len, iobuf_len))
+ return -EFAULT;
+ mutex_lock(&atm_dev_mutex);
+ list_for_each(p, &atm_devs)
+ size += sizeof(int);
+ if (size > len) {
+ mutex_unlock(&atm_dev_mutex);
+ return -E2BIG;
+ }
+ tmp_buf = kmalloc(size, GFP_ATOMIC);
+ if (!tmp_buf) {
+ mutex_unlock(&atm_dev_mutex);
+ return -ENOMEM;
+ }
+ tmp_p = tmp_buf;
+ list_for_each(p, &atm_devs) {
+ dev = list_entry(p, struct atm_dev, dev_list);
+ *tmp_p++ = dev->number;
+ }
+ mutex_unlock(&atm_dev_mutex);
+ error = ((copy_to_user(buf, tmp_buf, size)) ||
+ put_user(size, iobuf_len))
+ ? -EFAULT : 0;
+ kfree(tmp_buf);
+ return error;
+ default:
+ break;
+ }
+
+ if (compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atmif_sioc __user *csioc = arg;
+ compat_uptr_t carg;
+
+ sioc_len = &csioc->length;
+ if (get_user(carg, &csioc->arg))
+ return -EFAULT;
+ buf = compat_ptr(carg);
+
+ if (get_user(len, &csioc->length))
+ return -EFAULT;
+ if (get_user(number, &csioc->number))
+ return -EFAULT;
+#endif
+ } else {
+ struct atmif_sioc __user *sioc = arg;
+
+ sioc_len = &sioc->length;
+ if (get_user(buf, &sioc->arg))
+ return -EFAULT;
+ if (get_user(len, &sioc->length))
+ return -EFAULT;
+ if (get_user(number, &sioc->number))
+ return -EFAULT;
+ }
+
+ dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d",
+ number);
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case ATM_GETTYPE:
+ size = strlen(dev->type) + 1;
+ if (copy_to_user(buf, dev->type, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_GETESI:
+ size = ESI_LEN;
+ if (copy_to_user(buf, dev->esi, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_SETESI:
+ {
+ int i;
+
+ for (i = 0; i < ESI_LEN; i++)
+ if (dev->esi[i]) {
+ error = -EEXIST;
+ goto done;
+ }
+ }
+ /* fall through */
+ case ATM_SETESIF:
+ {
+ unsigned char esi[ESI_LEN];
+
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ if (copy_from_user(esi, buf, ESI_LEN)) {
+ error = -EFAULT;
+ goto done;
+ }
+ memcpy(dev->esi, esi, ESI_LEN);
+ error = ESI_LEN;
+ goto done;
+ }
+ case ATM_GETSTATZ:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ /* fall through */
+ case ATM_GETSTAT:
+ size = sizeof(struct atm_dev_stats);
+ error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ);
+ if (error)
+ goto done;
+ break;
+ case ATM_GETCIRANGE:
+ size = sizeof(struct atm_cirange);
+ if (copy_to_user(buf, &dev->ci_range, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_GETLINKRATE:
+ size = sizeof(int);
+ if (copy_to_user(buf, &dev->link_rate, size)) {
+ error = -EFAULT;
+ goto done;
+ }
+ break;
+ case ATM_RSTADDR:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ atm_reset_addr(dev, ATM_ADDR_LOCAL);
+ break;
+ case ATM_ADDADDR:
+ case ATM_DELADDR:
+ case ATM_ADDLECSADDR:
+ case ATM_DELLECSADDR:
+ {
+ struct sockaddr_atmsvc addr;
+
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+
+ if (copy_from_user(&addr, buf, sizeof(addr))) {
+ error = -EFAULT;
+ goto done;
+ }
+ if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR)
+ error = atm_add_addr(dev, &addr,
+ (cmd == ATM_ADDADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ else
+ error = atm_del_addr(dev, &addr,
+ (cmd == ATM_DELADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ goto done;
+ }
+ case ATM_GETADDR:
+ case ATM_GETLECSADDR:
+ error = atm_get_addr(dev, buf, len,
+ (cmd == ATM_GETADDR ?
+ ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+ if (error < 0)
+ goto done;
+ size = error;
+ /* may return 0, but later on size == 0 means "don't
+ write the length" */
+ error = put_user(size, sioc_len) ? -EFAULT : 0;
+ goto done;
+ case ATM_SETLOOP:
+ if (__ATM_LM_XTRMT((int) (unsigned long) buf) &&
+ __ATM_LM_XTLOC((int) (unsigned long) buf) >
+ __ATM_LM_XTRMT((int) (unsigned long) buf)) {
+ error = -EINVAL;
+ goto done;
+ }
+ /* fall through */
+ case ATM_SETCIRANGE:
+ case SONET_GETSTATZ:
+ case SONET_SETDIAG:
+ case SONET_CLRDIAG:
+ case SONET_SETFRAMING:
+ if (!capable(CAP_NET_ADMIN)) {
+ error = -EPERM;
+ goto done;
+ }
+ /* fall through */
+ default:
+ if (compat) {
+#ifdef CONFIG_COMPAT
+ if (!dev->ops->compat_ioctl) {
+ error = -EINVAL;
+ goto done;
+ }
+ size = dev->ops->compat_ioctl(dev, cmd, buf);
+#endif
+ } else {
+ if (!dev->ops->ioctl) {
+ error = -EINVAL;
+ goto done;
+ }
+ size = dev->ops->ioctl(dev, cmd, buf);
+ }
+ if (size < 0) {
+ error = (size == -ENOIOCTLCMD ? -ENOTTY : size);
+ goto done;
+ }
+ }
+
+ if (size)
+ error = put_user(size, sioc_len) ? -EFAULT : 0;
+ else
+ error = 0;
+done:
+ atm_dev_put(dev);
+ return error;
+}
+
+void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ mutex_lock(&atm_dev_mutex);
+ return seq_list_start_head(&atm_devs, *pos);
+}
+
+void atm_dev_seq_stop(struct seq_file *seq, void *v)
+{
+ mutex_unlock(&atm_dev_mutex);
+}
+
+void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &atm_devs, pos);
+}
diff --git a/net/atm/resources.h b/net/atm/resources.h
new file mode 100644
index 000000000..521431e30
--- /dev/null
+++ b/net/atm/resources.h
@@ -0,0 +1,47 @@
+/* net/atm/resources.h - ATM-related resources */
+
+/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_RESOURCES_H
+#define NET_ATM_RESOURCES_H
+
+#include <linux/atmdev.h>
+#include <linux/mutex.h>
+
+
+extern struct list_head atm_devs;
+extern struct mutex atm_dev_mutex;
+
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat);
+
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/proc_fs.h>
+
+void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos);
+void atm_dev_seq_stop(struct seq_file *seq, void *v);
+void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+
+
+int atm_proc_dev_register(struct atm_dev *dev);
+void atm_proc_dev_deregister(struct atm_dev *dev);
+
+#else
+
+static inline int atm_proc_dev_register(struct atm_dev *dev)
+{
+ return 0;
+}
+
+static inline void atm_proc_dev_deregister(struct atm_dev *dev)
+{
+ /* nothing */
+}
+
+#endif /* CONFIG_PROC_FS */
+
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent);
+void atm_unregister_sysfs(struct atm_dev *adev);
+#endif
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
new file mode 100644
index 000000000..4fd6af473
--- /dev/null
+++ b/net/atm/signaling.c
@@ -0,0 +1,244 @@
+/* net/atm/signaling.c - ATM signaling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/errno.h> /* error codes */
+#include <linux/kernel.h> /* printk */
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/sched.h> /* jiffies and HZ */
+#include <linux/atm.h> /* ATM stuff */
+#include <linux/atmsap.h>
+#include <linux/atmsvc.h>
+#include <linux/atmdev.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+
+#include "resources.h"
+#include "signaling.h"
+
+struct atm_vcc *sigd = NULL;
+
+static void sigd_put_skb(struct sk_buff *skb)
+{
+ if (!sigd) {
+ pr_debug("atmsvc: no signaling daemon\n");
+ kfree_skb(skb);
+ return;
+ }
+ atm_force_charge(sigd, skb->truesize);
+ skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb);
+ sk_atm(sigd)->sk_data_ready(sk_atm(sigd));
+}
+
+static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
+{
+ struct sk_buff *skb;
+
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+ !test_bit(ATM_VF_READY, &vcc->flags))
+ return;
+ msg->type = as_error;
+ if (!vcc->dev->ops->change_qos)
+ msg->reply = -EOPNOTSUPP;
+ else {
+ /* should lock VCC */
+ msg->reply = vcc->dev->ops->change_qos(vcc, &msg->qos,
+ msg->reply);
+ if (!msg->reply)
+ msg->type = as_okay;
+ }
+ /*
+ * Should probably just turn around the old skb. But the, the buffer
+ * space accounting needs to follow the change too. Maybe later.
+ */
+ while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
+ schedule();
+ *(struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg)) = *msg;
+ sigd_put_skb(skb);
+}
+
+static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct atmsvc_msg *msg;
+ struct atm_vcc *session_vcc;
+ struct sock *sk;
+
+ msg = (struct atmsvc_msg *) skb->data;
+ atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+ vcc = *(struct atm_vcc **) &msg->vcc;
+ pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc);
+ sk = sk_atm(vcc);
+
+ switch (msg->type) {
+ case as_okay:
+ sk->sk_err = -msg->reply;
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ if (!*vcc->local.sas_addr.prv && !*vcc->local.sas_addr.pub) {
+ vcc->local.sas_family = AF_ATMSVC;
+ memcpy(vcc->local.sas_addr.prv,
+ msg->local.sas_addr.prv, ATM_ESA_LEN);
+ memcpy(vcc->local.sas_addr.pub,
+ msg->local.sas_addr.pub, ATM_E164_LEN + 1);
+ }
+ session_vcc = vcc->session ? vcc->session : vcc;
+ if (session_vcc->vpi || session_vcc->vci)
+ break;
+ session_vcc->itf = msg->pvc.sap_addr.itf;
+ session_vcc->vpi = msg->pvc.sap_addr.vpi;
+ session_vcc->vci = msg->pvc.sap_addr.vci;
+ if (session_vcc->vpi || session_vcc->vci)
+ session_vcc->qos = msg->qos;
+ break;
+ case as_error:
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ clear_bit(ATM_VF_READY, &vcc->flags);
+ sk->sk_err = -msg->reply;
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ break;
+ case as_indicate:
+ vcc = *(struct atm_vcc **)&msg->listen_vcc;
+ sk = sk_atm(vcc);
+ pr_debug("as_indicate!!!\n");
+ lock_sock(sk);
+ if (sk_acceptq_is_full(sk)) {
+ sigd_enq(NULL, as_reject, vcc, NULL, NULL);
+ dev_kfree_skb(skb);
+ goto as_indicate_complete;
+ }
+ sk->sk_ack_backlog++;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
+ sk->sk_state_change(sk);
+as_indicate_complete:
+ release_sock(sk);
+ return 0;
+ case as_close:
+ set_bit(ATM_VF_RELEASED, &vcc->flags);
+ vcc_release_async(vcc, msg->reply);
+ goto out;
+ case as_modify:
+ modify_qos(vcc, msg);
+ break;
+ case as_addparty:
+ case as_dropparty:
+ sk->sk_err_soft = msg->reply;
+ /* < 0 failure, otherwise ep_ref */
+ clear_bit(ATM_VF_WAITING, &vcc->flags);
+ break;
+ default:
+ pr_alert("bad message type %d\n", (int)msg->type);
+ return -EINVAL;
+ }
+ sk->sk_state_change(sk);
+out:
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+void sigd_enq2(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc, const struct atm_qos *qos,
+ int reply)
+{
+ struct sk_buff *skb;
+ struct atmsvc_msg *msg;
+ static unsigned int session = 0;
+
+ pr_debug("%d (0x%p)\n", (int)type, vcc);
+ while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
+ schedule();
+ msg = (struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg));
+ memset(msg, 0, sizeof(*msg));
+ msg->type = type;
+ *(struct atm_vcc **) &msg->vcc = vcc;
+ *(struct atm_vcc **) &msg->listen_vcc = listen_vcc;
+ msg->reply = reply;
+ if (qos)
+ msg->qos = *qos;
+ if (vcc)
+ msg->sap = vcc->sap;
+ if (svc)
+ msg->svc = *svc;
+ if (vcc)
+ msg->local = vcc->local;
+ if (pvc)
+ msg->pvc = *pvc;
+ if (vcc) {
+ if (type == as_connect && test_bit(ATM_VF_SESSION, &vcc->flags))
+ msg->session = ++session;
+ /* every new pmp connect gets the next session number */
+ }
+ sigd_put_skb(skb);
+ if (vcc)
+ set_bit(ATM_VF_REGIS, &vcc->flags);
+}
+
+void sigd_enq(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc)
+{
+ sigd_enq2(vcc, type, listen_vcc, pvc, svc, vcc ? &vcc->qos : NULL, 0);
+ /* other ISP applications may use "reply" */
+}
+
+static void purge_vcc(struct atm_vcc *vcc)
+{
+ if (sk_atm(vcc)->sk_family == PF_ATMSVC &&
+ !test_bit(ATM_VF_META, &vcc->flags)) {
+ set_bit(ATM_VF_RELEASED, &vcc->flags);
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ vcc_release_async(vcc, -EUNATCH);
+ }
+}
+
+static void sigd_close(struct atm_vcc *vcc)
+{
+ struct sock *s;
+ int i;
+
+ pr_debug("\n");
+ sigd = NULL;
+ if (skb_peek(&sk_atm(vcc)->sk_receive_queue))
+ pr_err("closing with requests pending\n");
+ skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
+
+ read_lock(&vcc_sklist_lock);
+ for (i = 0; i < VCC_HTABLE_SIZE; ++i) {
+ struct hlist_head *head = &vcc_hash[i];
+
+ sk_for_each(s, head) {
+ vcc = atm_sk(s);
+
+ purge_vcc(vcc);
+ }
+ }
+ read_unlock(&vcc_sklist_lock);
+}
+
+static struct atmdev_ops sigd_dev_ops = {
+ .close = sigd_close,
+ .send = sigd_send
+};
+
+static struct atm_dev sigd_dev = {
+ .ops = &sigd_dev_ops,
+ .type = "sig",
+ .number = 999,
+ .lock = __SPIN_LOCK_UNLOCKED(sigd_dev.lock)
+};
+
+int sigd_attach(struct atm_vcc *vcc)
+{
+ if (sigd)
+ return -EADDRINUSE;
+ pr_debug("\n");
+ sigd = vcc;
+ vcc->dev = &sigd_dev;
+ vcc_insert_socket(sk_atm(vcc));
+ set_bit(ATM_VF_META, &vcc->flags);
+ set_bit(ATM_VF_READY, &vcc->flags);
+ return 0;
+}
diff --git a/net/atm/signaling.h b/net/atm/signaling.h
new file mode 100644
index 000000000..08b2a69cc
--- /dev/null
+++ b/net/atm/signaling.h
@@ -0,0 +1,30 @@
+/* net/atm/signaling.h - ATM signaling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_SIGNALING_H
+#define NET_ATM_SIGNALING_H
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmsvc.h>
+
+
+extern struct atm_vcc *sigd; /* needed in svc_release */
+
+
+/*
+ * sigd_enq is a wrapper for sigd_enq2, covering the more common cases, and
+ * avoiding huge lists of null values.
+ */
+
+void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply);
+void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+ struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+ const struct sockaddr_atmsvc *svc);
+int sigd_attach(struct atm_vcc *vcc);
+
+#endif
diff --git a/net/atm/svc.c b/net/atm/svc.c
new file mode 100644
index 000000000..1ba23f501
--- /dev/null
+++ b/net/atm/svc.c
@@ -0,0 +1,690 @@
+/* net/atm/svc.c - ATM SVC sockets */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/net.h> /* struct socket, struct proto_ops */
+#include <linux/errno.h> /* error codes */
+#include <linux/kernel.h> /* printk */
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/sched.h> /* jiffies and HZ */
+#include <linux/fcntl.h> /* O_NONBLOCK */
+#include <linux/init.h>
+#include <linux/atm.h> /* ATM stuff */
+#include <linux/atmsap.h>
+#include <linux/atmsvc.h>
+#include <linux/atmdev.h>
+#include <linux/bitops.h>
+#include <net/sock.h> /* for sock_no_* */
+#include <linux/uaccess.h>
+#include <linux/export.h>
+
+#include "resources.h"
+#include "common.h" /* common for PVCs and SVCs */
+#include "signaling.h"
+#include "addr.h"
+
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+ int kern);
+
+/*
+ * Note: since all this is still nicely synchronized with the signaling demon,
+ * there's no need to protect sleep loops with clis. If signaling is
+ * moved into the kernel, that would change.
+ */
+
+
+static int svc_shutdown(struct socket *sock, int how)
+{
+ return 0;
+}
+
+static void svc_disconnect(struct atm_vcc *vcc)
+{
+ DEFINE_WAIT(wait);
+ struct sk_buff *skb;
+ struct sock *sk = sk_atm(vcc);
+
+ pr_debug("%p\n", vcc);
+ if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
+ sigd_enq(vcc, as_close, NULL, NULL, NULL);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd)
+ break;
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ }
+ /* beware - socket is still in use by atmsigd until the last
+ as_indicate has been answered */
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ atm_return(vcc, skb->truesize);
+ pr_debug("LISTEN REL\n");
+ sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0);
+ dev_kfree_skb(skb);
+ }
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ /* ... may retry later */
+}
+
+static int svc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc;
+
+ if (sk) {
+ vcc = ATM_SD(sock);
+ pr_debug("%p\n", vcc);
+ clear_bit(ATM_VF_READY, &vcc->flags);
+ /*
+ * VCC pointer is used as a reference,
+ * so we must not free it (thereby subjecting it to re-use)
+ * before all pending connections are closed
+ */
+ svc_disconnect(vcc);
+ vcc_release(sock);
+ }
+ return 0;
+}
+
+static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
+ int sockaddr_len)
+{
+ DEFINE_WAIT(wait);
+ struct sock *sk = sock->sk;
+ struct sockaddr_atmsvc *addr;
+ struct atm_vcc *vcc;
+ int error;
+
+ if (sockaddr_len != sizeof(struct sockaddr_atmsvc))
+ return -EINVAL;
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTED) {
+ error = -EISCONN;
+ goto out;
+ }
+ if (sock->state != SS_UNCONNECTED) {
+ error = -EINVAL;
+ goto out;
+ }
+ vcc = ATM_SD(sock);
+ addr = (struct sockaddr_atmsvc *) sockaddr;
+ if (addr->sas_family != AF_ATMSVC) {
+ error = -EAFNOSUPPORT;
+ goto out;
+ }
+ clear_bit(ATM_VF_BOUND, &vcc->flags);
+ /* failing rebind will kill old binding */
+ /* @@@ check memory (de)allocation on rebind */
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+ error = -EBADFD;
+ goto out;
+ }
+ vcc->local = *addr;
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
+ if (!sigd) {
+ error = -EUNATCH;
+ goto out;
+ }
+ if (!sk->sk_err)
+ set_bit(ATM_VF_BOUND, &vcc->flags);
+ error = -sk->sk_err;
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
+ int sockaddr_len, int flags)
+{
+ DEFINE_WAIT(wait);
+ struct sock *sk = sock->sk;
+ struct sockaddr_atmsvc *addr;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int error;
+
+ pr_debug("%p\n", vcc);
+ lock_sock(sk);
+ if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ error = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ error = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ if (test_bit(ATM_VF_WAITING, &vcc->flags)) {
+ error = -EALREADY;
+ goto out;
+ }
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_err) {
+ error = -sk->sk_err;
+ goto out;
+ }
+ break;
+ case SS_UNCONNECTED:
+ addr = (struct sockaddr_atmsvc *) sockaddr;
+ if (addr->sas_family != AF_ATMSVC) {
+ error = -EAFNOSUPPORT;
+ goto out;
+ }
+ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+ error = -EBADFD;
+ goto out;
+ }
+ if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
+ vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (!vcc->qos.txtp.traffic_class &&
+ !vcc->qos.rxtp.traffic_class) {
+ error = -EINVAL;
+ goto out;
+ }
+ vcc->remote = *addr;
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
+ if (flags & O_NONBLOCK) {
+ sock->state = SS_CONNECTING;
+ error = -EINPROGRESS;
+ goto out;
+ }
+ error = 0;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ schedule();
+ if (!signal_pending(current)) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ continue;
+ }
+ pr_debug("*ABORT*\n");
+ /*
+ * This is tricky:
+ * Kernel ---close--> Demon
+ * Kernel <--close--- Demon
+ * or
+ * Kernel ---close--> Demon
+ * Kernel <--error--- Demon
+ * or
+ * Kernel ---close--> Demon
+ * Kernel <--okay---- Demon
+ * Kernel <--close--- Demon
+ */
+ sigd_enq(vcc, as_close, NULL, NULL, NULL);
+ while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ if (!sk->sk_err)
+ while (!test_bit(ATM_VF_RELEASED, &vcc->flags) &&
+ sigd) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ clear_bit(ATM_VF_REGIS, &vcc->flags);
+ clear_bit(ATM_VF_RELEASED, &vcc->flags);
+ clear_bit(ATM_VF_CLOSE, &vcc->flags);
+ /* we're gone now but may connect later */
+ error = -EINTR;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (error)
+ goto out;
+ if (!sigd) {
+ error = -EUNATCH;
+ goto out;
+ }
+ if (sk->sk_err) {
+ error = -sk->sk_err;
+ goto out;
+ }
+ }
+
+ vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp);
+ vcc->qos.txtp.pcr = 0;
+ vcc->qos.txtp.min_pcr = 0;
+
+ error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci);
+ if (!error)
+ sock->state = SS_CONNECTED;
+ else
+ (void)svc_disconnect(vcc);
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_listen(struct socket *sock, int backlog)
+{
+ DEFINE_WAIT(wait);
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int error;
+
+ pr_debug("%p\n", vcc);
+ lock_sock(sk);
+ /* let server handle listen on unbound sockets */
+ if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (test_bit(ATM_VF_LISTEN, &vcc->flags)) {
+ error = -EADDRINUSE;
+ goto out;
+ }
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (!sigd) {
+ error = -EUNATCH;
+ goto out;
+ }
+ set_bit(ATM_VF_LISTEN, &vcc->flags);
+ vcc_insert_socket(sk);
+ sk->sk_max_ack_backlog = backlog > 0 ? backlog : ATM_BACKLOG_DEFAULT;
+ error = -sk->sk_err;
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct atmsvc_msg *msg;
+ struct atm_vcc *old_vcc = ATM_SD(sock);
+ struct atm_vcc *new_vcc;
+ int error;
+
+ lock_sock(sk);
+
+ error = svc_create(sock_net(sk), newsock, 0, 0);
+ if (error)
+ goto out;
+
+ new_vcc = ATM_SD(newsock);
+
+ pr_debug("%p -> %p\n", old_vcc, new_vcc);
+ while (1) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ while (!(skb = skb_dequeue(&sk->sk_receive_queue)) &&
+ sigd) {
+ if (test_bit(ATM_VF_RELEASED, &old_vcc->flags))
+ break;
+ if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) {
+ error = -sk->sk_err;
+ break;
+ }
+ if (flags & O_NONBLOCK) {
+ error = -EAGAIN;
+ break;
+ }
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ error = -ERESTARTSYS;
+ break;
+ }
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (error)
+ goto out;
+ if (!skb) {
+ error = -EUNATCH;
+ goto out;
+ }
+ msg = (struct atmsvc_msg *)skb->data;
+ new_vcc->qos = msg->qos;
+ set_bit(ATM_VF_HASQOS, &new_vcc->flags);
+ new_vcc->remote = msg->svc;
+ new_vcc->local = msg->local;
+ new_vcc->sap = msg->sap;
+ error = vcc_connect(newsock, msg->pvc.sap_addr.itf,
+ msg->pvc.sap_addr.vpi,
+ msg->pvc.sap_addr.vci);
+ dev_kfree_skb(skb);
+ sk->sk_ack_backlog--;
+ if (error) {
+ sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL,
+ &old_vcc->qos, error);
+ error = error == -EAGAIN ? -EBUSY : error;
+ goto out;
+ }
+ /* wait should be short, so we ignore the non-blocking flag */
+ set_bit(ATM_VF_WAITING, &new_vcc->flags);
+ sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd)
+ break;
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ }
+ finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
+ if (!sigd) {
+ error = -EUNATCH;
+ goto out;
+ }
+ if (!sk_atm(new_vcc)->sk_err)
+ break;
+ if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) {
+ error = -sk_atm(new_vcc)->sk_err;
+ goto out;
+ }
+ }
+ newsock->state = SS_CONNECTED;
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_getname(struct socket *sock, struct sockaddr *sockaddr,
+ int *sockaddr_len, int peer)
+{
+ struct sockaddr_atmsvc *addr;
+
+ *sockaddr_len = sizeof(struct sockaddr_atmsvc);
+ addr = (struct sockaddr_atmsvc *) sockaddr;
+ memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local,
+ sizeof(struct sockaddr_atmsvc));
+ return 0;
+}
+
+int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
+{
+ struct sock *sk = sk_atm(vcc);
+ DEFINE_WAIT(wait);
+
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) ||
+ test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) {
+ break;
+ }
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (!sigd)
+ return -EUNATCH;
+ return -sk->sk_err;
+}
+
+static int svc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int value, error = 0;
+
+ lock_sock(sk);
+ switch (optname) {
+ case SO_ATMSAP:
+ if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (copy_from_user(&vcc->sap, optval, optlen)) {
+ error = -EFAULT;
+ goto out;
+ }
+ set_bit(ATM_VF_HASSAP, &vcc->flags);
+ break;
+ case SO_MULTIPOINT:
+ if (level != SOL_ATM || optlen != sizeof(int)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (get_user(value, (int __user *)optval)) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (value == 1)
+ set_bit(ATM_VF_SESSION, &vcc->flags);
+ else if (value == 0)
+ clear_bit(ATM_VF_SESSION, &vcc->flags);
+ else
+ error = -EINVAL;
+ break;
+ default:
+ error = vcc_setsockopt(sock, level, optname, optval, optlen);
+ }
+
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int error = 0, len;
+
+ lock_sock(sk);
+ if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) {
+ error = vcc_getsockopt(sock, level, optname, optval, optlen);
+ goto out;
+ }
+ if (get_user(len, optlen)) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (len != sizeof(struct atm_sap)) {
+ error = -EINVAL;
+ goto out;
+ }
+ if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) {
+ error = -EFAULT;
+ goto out;
+ }
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
+ int sockaddr_len, int flags)
+{
+ DEFINE_WAIT(wait);
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int error;
+
+ lock_sock(sk);
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq(vcc, as_addparty, NULL, NULL,
+ (struct sockaddr_atmsvc *) sockaddr);
+ if (flags & O_NONBLOCK) {
+ error = -EINPROGRESS;
+ goto out;
+ }
+ pr_debug("added wait queue\n");
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ error = xchg(&sk->sk_err_soft, 0);
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_dropparty(struct socket *sock, int ep_ref)
+{
+ DEFINE_WAIT(wait);
+ struct sock *sk = sock->sk;
+ struct atm_vcc *vcc = ATM_SD(sock);
+ int error;
+
+ lock_sock(sk);
+ set_bit(ATM_VF_WAITING, &vcc->flags);
+ sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (!sigd) {
+ error = -EUNATCH;
+ goto out;
+ }
+ error = xchg(&sk->sk_err_soft, 0);
+out:
+ release_sock(sk);
+ return error;
+}
+
+static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ int error, ep_ref;
+ struct sockaddr_atmsvc sa;
+ struct atm_vcc *vcc = ATM_SD(sock);
+
+ switch (cmd) {
+ case ATM_ADDPARTY:
+ if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+ return -EINVAL;
+ if (copy_from_user(&sa, (void __user *) arg, sizeof(sa)))
+ return -EFAULT;
+ error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa),
+ 0);
+ break;
+ case ATM_DROPPARTY:
+ if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+ return -EINVAL;
+ if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int)))
+ return -EFAULT;
+ error = svc_dropparty(sock, ep_ref);
+ break;
+ default:
+ error = vcc_ioctl(sock, cmd, arg);
+ }
+
+ return error;
+}
+
+#ifdef CONFIG_COMPAT
+static int svc_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf.
+ But actually it takes a struct sockaddr_atmsvc, which doesn't need
+ compat handling. So all we have to do is fix up cmd... */
+ if (cmd == COMPAT_ATM_ADDPARTY)
+ cmd = ATM_ADDPARTY;
+
+ if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY)
+ return svc_ioctl(sock, cmd, arg);
+ else
+ return vcc_compat_ioctl(sock, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+static const struct proto_ops svc_proto_ops = {
+ .family = PF_ATMSVC,
+ .owner = THIS_MODULE,
+
+ .release = svc_release,
+ .bind = svc_bind,
+ .connect = svc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = svc_accept,
+ .getname = svc_getname,
+ .poll = vcc_poll,
+ .ioctl = svc_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = svc_compat_ioctl,
+#endif
+ .listen = svc_listen,
+ .shutdown = svc_shutdown,
+ .setsockopt = svc_setsockopt,
+ .getsockopt = svc_getsockopt,
+ .sendmsg = vcc_sendmsg,
+ .recvmsg = vcc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ int error;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ sock->ops = &svc_proto_ops;
+ error = vcc_create(net, sock, protocol, AF_ATMSVC);
+ if (error)
+ return error;
+ ATM_SD(sock)->local.sas_family = AF_ATMSVC;
+ ATM_SD(sock)->remote.sas_family = AF_ATMSVC;
+ return 0;
+}
+
+static const struct net_proto_family svc_family_ops = {
+ .family = PF_ATMSVC,
+ .create = svc_create,
+ .owner = THIS_MODULE,
+};
+
+
+/*
+ * Initialize the ATM SVC protocol family
+ */
+
+int __init atmsvc_init(void)
+{
+ return sock_register(&svc_family_ops);
+}
+
+void atmsvc_exit(void)
+{
+ sock_unregister(PF_ATMSVC);
+}
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
new file mode 100644
index 000000000..705e53ef4
--- /dev/null
+++ b/net/ax25/Kconfig
@@ -0,0 +1,121 @@
+#
+# Amateur Radio protocols and AX.25 device configuration
+#
+
+menuconfig HAMRADIO
+ depends on NET && !S390
+ bool "Amateur Radio support"
+ help
+ If you want to connect your Linux box to an amateur radio, answer Y
+ here. You want to read <http://www.tapr.org/>
+ and more specifically about AX.25 on Linux
+ <http://www.linux-ax25.org/>.
+
+ Note that the answer to this question won't directly affect the
+ kernel: saying N will just cause the configurator to skip all
+ the questions about amateur radio.
+
+comment "Packet Radio protocols"
+ depends on HAMRADIO
+
+config AX25
+ tristate "Amateur Radio AX.25 Level 2 protocol"
+ depends on HAMRADIO
+ help
+ This is the protocol used for computer communication over amateur
+ radio. It is either used by itself for point-to-point links, or to
+ carry other protocols such as tcp/ip. To use it, you need a device
+ that connects your Linux box to your amateur radio. You can either
+ use a low speed TNC (a Terminal Node Controller acts as a kind of
+ modem connecting your computer's serial port to your radio's
+ microphone input and speaker output) supporting the KISS protocol or
+ one of the various SCC cards that are supported by the generic Z8530
+ or the DMA SCC driver. Another option are the Baycom modem serial
+ and parallel port hacks or the sound card modem (supported by their
+ own drivers). If you say Y here, you also have to say Y to one of
+ those drivers.
+
+ Information about where to get supporting software for Linux amateur
+ radio as well as information about how to configure an AX.25 port is
+ contained in the AX25-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>. You might also want to
+ check out the file <file:Documentation/networking/ax25.txt> in the
+ kernel source. More information about digital amateur radio in
+ general is on the WWW at
+ <http://www.tapr.org/>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called ax25.
+
+config AX25_DAMA_SLAVE
+ bool "AX.25 DAMA Slave support"
+ default y
+ depends on AX25
+ help
+ DAMA is a mechanism to prevent collisions when doing AX.25
+ networking. A DAMA server (called "master") accepts incoming traffic
+ from clients (called "slaves") and redistributes it to other slaves.
+ If you say Y here, your Linux box will act as a DAMA slave; this is
+ transparent in that you don't have to do any special DAMA
+ configuration. Linux cannot yet act as a DAMA server. This option
+ only compiles DAMA slave support into the kernel. It still needs to
+ be enabled at runtime. For more about DAMA see
+ <http://www.linux-ax25.org>. If unsure, say Y.
+
+# placeholder until implemented
+config AX25_DAMA_MASTER
+ bool 'AX.25 DAMA Master support'
+ depends on AX25_DAMA_SLAVE && BROKEN
+ help
+ DAMA is a mechanism to prevent collisions when doing AX.25
+ networking. A DAMA server (called "master") accepts incoming traffic
+ from clients (called "slaves") and redistributes it to other slaves.
+ If you say Y here, your Linux box will act as a DAMA master; this is
+ transparent in that you don't have to do any special DAMA
+ configuration. Linux cannot yet act as a DAMA server. This option
+ only compiles DAMA slave support into the kernel. It still needs to
+ be explicitly enabled, so if unsure, say Y.
+
+config NETROM
+ tristate "Amateur Radio NET/ROM protocol"
+ depends on AX25
+ help
+ NET/ROM is a network layer protocol on top of AX.25 useful for
+ routing.
+
+ A comprehensive listing of all the software for Linux amateur radio
+ users as well as information about how to configure an AX.25 port is
+ contained in the Linux Ham Wiki, available from
+ <http://www.linux-ax25.org>. You also might want to check out the
+ file <file:Documentation/networking/ax25.txt>. More information about
+ digital amateur radio in general is on the WWW at
+ <http://www.tapr.org/>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called netrom.
+
+config ROSE
+ tristate "Amateur Radio X.25 PLP (Rose)"
+ depends on AX25
+ help
+ The Packet Layer Protocol (PLP) is a way to route packets over X.25
+ connections in general and amateur radio AX.25 connections in
+ particular, essentially an alternative to NET/ROM.
+
+ A comprehensive listing of all the software for Linux amateur radio
+ users as well as information about how to configure an AX.25 port is
+ contained in the Linux Ham Wiki, available from
+ <http://www.linux-ax25.org>. You also might want to check out the
+ file <file:Documentation/networking/ax25.txt>. More information about
+ digital amateur radio in general is on the WWW at
+ <http://www.tapr.org/>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called rose.
+
+menu "AX.25 network device drivers"
+ depends on HAMRADIO && AX25
+
+source "drivers/net/hamradio/Kconfig"
+
+endmenu
diff --git a/net/ax25/Makefile b/net/ax25/Makefile
new file mode 100644
index 000000000..43c46d2ca
--- /dev/null
+++ b/net/ax25/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Linux AX.25 layer.
+#
+
+obj-$(CONFIG_AX25) += ax25.o
+
+ax25-y := ax25_addr.o ax25_dev.o ax25_iface.o ax25_in.o ax25_ip.o ax25_out.o \
+ ax25_route.o ax25_std_in.o ax25_std_subr.o ax25_std_timer.o \
+ ax25_subr.o ax25_timer.o ax25_uid.o af_ax25.o
+ax25-$(CONFIG_AX25_DAMA_SLAVE) += ax25_ds_in.o ax25_ds_subr.o ax25_ds_timer.o
+ax25-$(CONFIG_SYSCTL) += sysctl_net_ax25.o
diff --git a/net/ax25/TODO b/net/ax25/TODO
new file mode 100644
index 000000000..69fb4e368
--- /dev/null
+++ b/net/ax25/TODO
@@ -0,0 +1,20 @@
+Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and
+listen_lock have to be bh-safe?
+
+Do the netrom and rose locks have to be bh-safe?
+
+A device might be deleted after lookup in the SIOCADDRT ioctl but before it's
+being used.
+
+Routes to a device being taken down might be deleted by ax25_rt_device_down
+but added by somebody else before the device has been deleted fully.
+
+The ax25_rt_find_route synopsys is pervert but I somehow had to deal with
+the race caused by the static variable in it's previous implementation.
+
+Implement proper socket locking in netrom and rose.
+
+Check socket locking when ax25_rcv is sending to raw sockets. In particular
+ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy.
+
+Handle XID and TEST frames properly.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
new file mode 100644
index 000000000..330c1f4a5
--- /dev/null
+++ b/net/ax25/af_ax25.c
@@ -0,0 +1,2022 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ * Copyright (C) Hans Alblas PE1AYX (hans@esrac.ele.tue.nl)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <net/net_namespace.h>
+#include <net/tcp_states.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+
+
+HLIST_HEAD(ax25_list);
+DEFINE_SPINLOCK(ax25_list_lock);
+
+static const struct proto_ops ax25_proto_ops;
+
+static void ax25_free_sock(struct sock *sk)
+{
+ ax25_cb_put(ax25_sk(sk));
+}
+
+/*
+ * Socket removal during an interrupt is now safe.
+ */
+static void ax25_cb_del(ax25_cb *ax25)
+{
+ if (!hlist_unhashed(&ax25->ax25_node)) {
+ spin_lock_bh(&ax25_list_lock);
+ hlist_del_init(&ax25->ax25_node);
+ spin_unlock_bh(&ax25_list_lock);
+ ax25_cb_put(ax25);
+ }
+}
+
+/*
+ * Kill all bound sockets on a dropped device.
+ */
+static void ax25_kill_by_device(struct net_device *dev)
+{
+ ax25_dev *ax25_dev;
+ ax25_cb *s;
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return;
+
+ spin_lock_bh(&ax25_list_lock);
+again:
+ ax25_for_each(s, &ax25_list) {
+ if (s->ax25_dev == ax25_dev) {
+ s->ax25_dev = NULL;
+ spin_unlock_bh(&ax25_list_lock);
+ ax25_disconnect(s, ENETUNREACH);
+ spin_lock_bh(&ax25_list_lock);
+
+ /* The entry could have been deleted from the
+ * list meanwhile and thus the next pointer is
+ * no longer valid. Play it safe and restart
+ * the scan. Forward progress is ensured
+ * because we set s->ax25_dev to NULL and we
+ * are never passed a NULL 'dev' argument.
+ */
+ goto again;
+ }
+ }
+ spin_unlock_bh(&ax25_list_lock);
+}
+
+/*
+ * Handle device status changes.
+ */
+static int ax25_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ /* Reject non AX.25 devices */
+ if (dev->type != ARPHRD_AX25)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ ax25_dev_device_up(dev);
+ break;
+ case NETDEV_DOWN:
+ ax25_kill_by_device(dev);
+ ax25_rt_device_down(dev);
+ ax25_dev_device_down(dev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Add a socket to the bound sockets list.
+ */
+void ax25_cb_add(ax25_cb *ax25)
+{
+ spin_lock_bh(&ax25_list_lock);
+ ax25_cb_hold(ax25);
+ hlist_add_head(&ax25->ax25_node, &ax25_list);
+ spin_unlock_bh(&ax25_list_lock);
+}
+
+/*
+ * Find a socket that wants to accept the SABM we have just
+ * received.
+ */
+struct sock *ax25_find_listener(ax25_address *addr, int digi,
+ struct net_device *dev, int type)
+{
+ ax25_cb *s;
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(s, &ax25_list) {
+ if ((s->iamdigi && !digi) || (!s->iamdigi && digi))
+ continue;
+ if (s->sk && !ax25cmp(&s->source_addr, addr) &&
+ s->sk->sk_type == type && s->sk->sk_state == TCP_LISTEN) {
+ /* If device is null we match any device */
+ if (s->ax25_dev == NULL || s->ax25_dev->dev == dev) {
+ sock_hold(s->sk);
+ spin_unlock(&ax25_list_lock);
+ return s->sk;
+ }
+ }
+ }
+ spin_unlock(&ax25_list_lock);
+
+ return NULL;
+}
+
+/*
+ * Find an AX.25 socket given both ends.
+ */
+struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr,
+ int type)
+{
+ struct sock *sk = NULL;
+ ax25_cb *s;
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(s, &ax25_list) {
+ if (s->sk && !ax25cmp(&s->source_addr, my_addr) &&
+ !ax25cmp(&s->dest_addr, dest_addr) &&
+ s->sk->sk_type == type) {
+ sk = s->sk;
+ sock_hold(sk);
+ break;
+ }
+ }
+
+ spin_unlock(&ax25_list_lock);
+
+ return sk;
+}
+
+/*
+ * Find an AX.25 control block given both ends. It will only pick up
+ * floating AX.25 control blocks or non Raw socket bound control blocks.
+ */
+ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
+ ax25_digi *digi, struct net_device *dev)
+{
+ ax25_cb *s;
+
+ spin_lock_bh(&ax25_list_lock);
+ ax25_for_each(s, &ax25_list) {
+ if (s->sk && s->sk->sk_type != SOCK_SEQPACKET)
+ continue;
+ if (s->ax25_dev == NULL)
+ continue;
+ if (ax25cmp(&s->source_addr, src_addr) == 0 && ax25cmp(&s->dest_addr, dest_addr) == 0 && s->ax25_dev->dev == dev) {
+ if (digi != NULL && digi->ndigi != 0) {
+ if (s->digipeat == NULL)
+ continue;
+ if (ax25digicmp(s->digipeat, digi) != 0)
+ continue;
+ } else {
+ if (s->digipeat != NULL && s->digipeat->ndigi != 0)
+ continue;
+ }
+ ax25_cb_hold(s);
+ spin_unlock_bh(&ax25_list_lock);
+
+ return s;
+ }
+ }
+ spin_unlock_bh(&ax25_list_lock);
+
+ return NULL;
+}
+
+EXPORT_SYMBOL(ax25_find_cb);
+
+void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto)
+{
+ ax25_cb *s;
+ struct sk_buff *copy;
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(s, &ax25_list) {
+ if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 &&
+ s->sk->sk_type == SOCK_RAW &&
+ s->sk->sk_protocol == proto &&
+ s->ax25_dev->dev == skb->dev &&
+ atomic_read(&s->sk->sk_rmem_alloc) <= s->sk->sk_rcvbuf) {
+ if ((copy = skb_clone(skb, GFP_ATOMIC)) == NULL)
+ continue;
+ if (sock_queue_rcv_skb(s->sk, copy) != 0)
+ kfree_skb(copy);
+ }
+ }
+ spin_unlock(&ax25_list_lock);
+}
+
+/*
+ * Deferred destroy.
+ */
+void ax25_destroy_socket(ax25_cb *);
+
+/*
+ * Handler for deferred kills.
+ */
+static void ax25_destroy_timer(unsigned long data)
+{
+ ax25_cb *ax25=(ax25_cb *)data;
+ struct sock *sk;
+
+ sk=ax25->sk;
+
+ bh_lock_sock(sk);
+ sock_hold(sk);
+ ax25_destroy_socket(ax25);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * This is called from user mode and the timers. Thus it protects itself
+ * against interrupt users but doesn't worry about being called during
+ * work. Once it is removed from the queue no interrupt or bottom half
+ * will touch it and we are (fairly 8-) ) safe.
+ */
+void ax25_destroy_socket(ax25_cb *ax25)
+{
+ struct sk_buff *skb;
+
+ ax25_cb_del(ax25);
+
+ ax25_stop_heartbeat(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+
+ ax25_clear_queues(ax25); /* Flush the queues */
+
+ if (ax25->sk != NULL) {
+ while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) {
+ if (skb->sk != ax25->sk) {
+ /* A pending connection */
+ ax25_cb *sax25 = ax25_sk(skb->sk);
+
+ /* Queue the unaccepted socket for death */
+ sock_orphan(skb->sk);
+
+ /* 9A4GL: hack to release unaccepted sockets */
+ skb->sk->sk_state = TCP_LISTEN;
+
+ ax25_start_heartbeat(sax25);
+ sax25->state = AX25_STATE_0;
+ }
+
+ kfree_skb(skb);
+ }
+ skb_queue_purge(&ax25->sk->sk_write_queue);
+ }
+
+ if (ax25->sk != NULL) {
+ if (sk_has_allocations(ax25->sk)) {
+ /* Defer: outstanding buffers */
+ setup_timer(&ax25->dtimer, ax25_destroy_timer,
+ (unsigned long)ax25);
+ ax25->dtimer.expires = jiffies + 2 * HZ;
+ add_timer(&ax25->dtimer);
+ } else {
+ struct sock *sk=ax25->sk;
+ ax25->sk=NULL;
+ sock_put(sk);
+ }
+ } else {
+ ax25_cb_put(ax25);
+ }
+}
+
+/*
+ * dl1bke 960311: set parameters for existing AX.25 connections,
+ * includes a KILL command to abort any connection.
+ * VERY useful for debugging ;-)
+ */
+static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
+{
+ struct ax25_ctl_struct ax25_ctl;
+ ax25_digi digi;
+ ax25_dev *ax25_dev;
+ ax25_cb *ax25;
+ unsigned int k;
+ int ret = 0;
+
+ if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)))
+ return -EFAULT;
+
+ if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL)
+ return -ENODEV;
+
+ if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
+ return -EINVAL;
+
+ if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
+ return -EINVAL;
+
+ digi.ndigi = ax25_ctl.digi_count;
+ for (k = 0; k < digi.ndigi; k++)
+ digi.calls[k] = ax25_ctl.digi_addr[k];
+
+ if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL)
+ return -ENOTCONN;
+
+ switch (ax25_ctl.cmd) {
+ case AX25_KILL:
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ if (ax25_dev->dama.slave && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE)
+ ax25_dama_off(ax25);
+#endif
+ ax25_disconnect(ax25, ENETRESET);
+ break;
+
+ case AX25_WINDOW:
+ if (ax25->modulus == AX25_MODULUS) {
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7)
+ goto einval_put;
+ } else {
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63)
+ goto einval_put;
+ }
+ ax25->window = ax25_ctl.arg;
+ break;
+
+ case AX25_T1:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->rtt = (ax25_ctl.arg * HZ) / 2;
+ ax25->t1 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_T2:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->t2 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_N2:
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31)
+ goto einval_put;
+ ax25->n2count = 0;
+ ax25->n2 = ax25_ctl.arg;
+ break;
+
+ case AX25_T3:
+ if (ax25_ctl.arg > ULONG_MAX / HZ)
+ goto einval_put;
+ ax25->t3 = ax25_ctl.arg * HZ;
+ break;
+
+ case AX25_IDLE:
+ if (ax25_ctl.arg > ULONG_MAX / (60 * HZ))
+ goto einval_put;
+
+ ax25->idle = ax25_ctl.arg * 60 * HZ;
+ break;
+
+ case AX25_PACLEN:
+ if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535)
+ goto einval_put;
+ ax25->paclen = ax25_ctl.arg;
+ break;
+
+ default:
+ goto einval_put;
+ }
+
+out_put:
+ ax25_cb_put(ax25);
+ return ret;
+
+einval_put:
+ ret = -EINVAL;
+ goto out_put;
+}
+
+static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev)
+{
+ ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2;
+ ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]);
+ ax25->t2 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T2]);
+ ax25->t3 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T3]);
+ ax25->n2 = ax25_dev->values[AX25_VALUES_N2];
+ ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+ ax25->idle = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_IDLE]);
+ ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF];
+
+ if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW];
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25_dev->values[AX25_VALUES_WINDOW];
+ }
+}
+
+/*
+ * Fill in a created AX.25 created control block with the default
+ * values for a particular device.
+ */
+void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev)
+{
+ ax25->ax25_dev = ax25_dev;
+
+ if (ax25->ax25_dev != NULL) {
+ ax25_fillin_cb_from_dev(ax25, ax25_dev);
+ return;
+ }
+
+ /*
+ * No device, use kernel / AX.25 spec default values
+ */
+ ax25->rtt = msecs_to_jiffies(AX25_DEF_T1) / 2;
+ ax25->t1 = msecs_to_jiffies(AX25_DEF_T1);
+ ax25->t2 = msecs_to_jiffies(AX25_DEF_T2);
+ ax25->t3 = msecs_to_jiffies(AX25_DEF_T3);
+ ax25->n2 = AX25_DEF_N2;
+ ax25->paclen = AX25_DEF_PACLEN;
+ ax25->idle = msecs_to_jiffies(AX25_DEF_IDLE);
+ ax25->backoff = AX25_DEF_BACKOFF;
+
+ if (AX25_DEF_AXDEFMODE) {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = AX25_DEF_EWINDOW;
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = AX25_DEF_WINDOW;
+ }
+}
+
+/*
+ * Create an empty AX.25 control block.
+ */
+ax25_cb *ax25_create_cb(void)
+{
+ ax25_cb *ax25;
+
+ if ((ax25 = kzalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL)
+ return NULL;
+
+ atomic_set(&ax25->refcount, 1);
+
+ skb_queue_head_init(&ax25->write_queue);
+ skb_queue_head_init(&ax25->frag_queue);
+ skb_queue_head_init(&ax25->ack_queue);
+ skb_queue_head_init(&ax25->reseq_queue);
+
+ ax25_setup_timers(ax25);
+
+ ax25_fillin_cb(ax25, NULL);
+
+ ax25->state = AX25_STATE_0;
+
+ return ax25;
+}
+
+/*
+ * Handling for system calls applied via the various interfaces to an
+ * AX25 socket object
+ */
+
+static int ax25_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ ax25_cb *ax25;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+ unsigned long opt;
+ int res = 0;
+
+ if (level != SOL_AX25)
+ return -ENOPROTOOPT;
+
+ if (optlen < sizeof(unsigned int))
+ return -EINVAL;
+
+ if (get_user(opt, (unsigned int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+ ax25 = ax25_sk(sk);
+
+ switch (optname) {
+ case AX25_WINDOW:
+ if (ax25->modulus == AX25_MODULUS) {
+ if (opt < 1 || opt > 7) {
+ res = -EINVAL;
+ break;
+ }
+ } else {
+ if (opt < 1 || opt > 63) {
+ res = -EINVAL;
+ break;
+ }
+ }
+ ax25->window = opt;
+ break;
+
+ case AX25_T1:
+ if (opt < 1 || opt > ULONG_MAX / HZ) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->rtt = (opt * HZ) >> 1;
+ ax25->t1 = opt * HZ;
+ break;
+
+ case AX25_T2:
+ if (opt < 1 || opt > ULONG_MAX / HZ) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->t2 = opt * HZ;
+ break;
+
+ case AX25_N2:
+ if (opt < 1 || opt > 31) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->n2 = opt;
+ break;
+
+ case AX25_T3:
+ if (opt < 1 || opt > ULONG_MAX / HZ) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->t3 = opt * HZ;
+ break;
+
+ case AX25_IDLE:
+ if (opt > ULONG_MAX / (60 * HZ)) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->idle = opt * 60 * HZ;
+ break;
+
+ case AX25_BACKOFF:
+ if (opt > 2) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->backoff = opt;
+ break;
+
+ case AX25_EXTSEQ:
+ ax25->modulus = opt ? AX25_EMODULUS : AX25_MODULUS;
+ break;
+
+ case AX25_PIDINCL:
+ ax25->pidincl = opt ? 1 : 0;
+ break;
+
+ case AX25_IAMDIGI:
+ ax25->iamdigi = opt ? 1 : 0;
+ break;
+
+ case AX25_PACLEN:
+ if (opt < 16 || opt > 65535) {
+ res = -EINVAL;
+ break;
+ }
+ ax25->paclen = opt;
+ break;
+
+ case SO_BINDTODEVICE:
+ if (optlen > IFNAMSIZ)
+ optlen = IFNAMSIZ;
+
+ if (copy_from_user(devname, optval, optlen)) {
+ res = -EFAULT;
+ break;
+ }
+
+ if (sk->sk_type == SOCK_SEQPACKET &&
+ (sock->state != SS_UNCONNECTED ||
+ sk->sk_state == TCP_LISTEN)) {
+ res = -EADDRNOTAVAIL;
+ break;
+ }
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev) {
+ res = -ENODEV;
+ break;
+ }
+
+ ax25->ax25_dev = ax25_dev_ax25dev(dev);
+ ax25_fillin_cb(ax25, ax25->ax25_dev);
+ dev_put(dev);
+ break;
+
+ default:
+ res = -ENOPROTOOPT;
+ }
+ release_sock(sk);
+
+ return res;
+}
+
+static int ax25_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ ax25_cb *ax25;
+ struct ax25_dev *ax25_dev;
+ char devname[IFNAMSIZ];
+ void *valptr;
+ int val = 0;
+ int maxlen, length;
+
+ if (level != SOL_AX25)
+ return -ENOPROTOOPT;
+
+ if (get_user(maxlen, optlen))
+ return -EFAULT;
+
+ if (maxlen < 1)
+ return -EFAULT;
+
+ valptr = (void *) &val;
+ length = min_t(unsigned int, maxlen, sizeof(int));
+
+ lock_sock(sk);
+ ax25 = ax25_sk(sk);
+
+ switch (optname) {
+ case AX25_WINDOW:
+ val = ax25->window;
+ break;
+
+ case AX25_T1:
+ val = ax25->t1 / HZ;
+ break;
+
+ case AX25_T2:
+ val = ax25->t2 / HZ;
+ break;
+
+ case AX25_N2:
+ val = ax25->n2;
+ break;
+
+ case AX25_T3:
+ val = ax25->t3 / HZ;
+ break;
+
+ case AX25_IDLE:
+ val = ax25->idle / (60 * HZ);
+ break;
+
+ case AX25_BACKOFF:
+ val = ax25->backoff;
+ break;
+
+ case AX25_EXTSEQ:
+ val = (ax25->modulus == AX25_EMODULUS);
+ break;
+
+ case AX25_PIDINCL:
+ val = ax25->pidincl;
+ break;
+
+ case AX25_IAMDIGI:
+ val = ax25->iamdigi;
+ break;
+
+ case AX25_PACLEN:
+ val = ax25->paclen;
+ break;
+
+ case SO_BINDTODEVICE:
+ ax25_dev = ax25->ax25_dev;
+
+ if (ax25_dev != NULL && ax25_dev->dev != NULL) {
+ strlcpy(devname, ax25_dev->dev->name, sizeof(devname));
+ length = strlen(devname) + 1;
+ } else {
+ *devname = '\0';
+ length = 1;
+ }
+
+ valptr = (void *) devname;
+ break;
+
+ default:
+ release_sock(sk);
+ return -ENOPROTOOPT;
+ }
+ release_sock(sk);
+
+ if (put_user(length, optlen))
+ return -EFAULT;
+
+ return copy_to_user(optval, valptr, length) ? -EFAULT : 0;
+}
+
+static int ax25_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int res = 0;
+
+ lock_sock(sk);
+ if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ goto out;
+ }
+ res = -EOPNOTSUPP;
+
+out:
+ release_sock(sk);
+
+ return res;
+}
+
+/*
+ * XXX: when creating ax25_sock we should update the .obj_size setting
+ * below.
+ */
+static struct proto ax25_proto = {
+ .name = "AX25",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sock),
+};
+
+static int ax25_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ ax25_cb *ax25;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_DGRAM:
+ if (protocol == 0 || protocol == PF_AX25)
+ protocol = AX25_P_TEXT;
+ break;
+
+ case SOCK_SEQPACKET:
+ switch (protocol) {
+ case 0:
+ case PF_AX25: /* For CLX */
+ protocol = AX25_P_TEXT;
+ break;
+ case AX25_P_SEGMENT:
+#ifdef CONFIG_INET
+ case AX25_P_ARP:
+ case AX25_P_IP:
+#endif
+#ifdef CONFIG_NETROM
+ case AX25_P_NETROM:
+#endif
+#ifdef CONFIG_ROSE
+ case AX25_P_ROSE:
+#endif
+ return -ESOCKTNOSUPPORT;
+#ifdef CONFIG_NETROM_MODULE
+ case AX25_P_NETROM:
+ if (ax25_protocol_is_registered(AX25_P_NETROM))
+ return -ESOCKTNOSUPPORT;
+ break;
+#endif
+#ifdef CONFIG_ROSE_MODULE
+ case AX25_P_ROSE:
+ if (ax25_protocol_is_registered(AX25_P_ROSE))
+ return -ESOCKTNOSUPPORT;
+#endif
+ default:
+ break;
+ }
+ break;
+
+ case SOCK_RAW:
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ ax25 = sk->sk_protinfo = ax25_create_cb();
+ if (!ax25) {
+ sk_free(sk);
+ return -ENOMEM;
+ }
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = ax25_free_sock;
+ sock->ops = &ax25_proto_ops;
+ sk->sk_protocol = protocol;
+
+ ax25->sk = sk;
+
+ return 0;
+}
+
+struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
+{
+ struct sock *sk;
+ ax25_cb *ax25, *oax25;
+
+ sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot);
+ if (sk == NULL)
+ return NULL;
+
+ if ((ax25 = ax25_create_cb()) == NULL) {
+ sk_free(sk);
+ return NULL;
+ }
+
+ switch (osk->sk_type) {
+ case SOCK_DGRAM:
+ break;
+ case SOCK_SEQPACKET:
+ break;
+ default:
+ sk_free(sk);
+ ax25_cb_put(ax25);
+ return NULL;
+ }
+
+ sock_init_data(NULL, sk);
+
+ sk->sk_type = osk->sk_type;
+ sk->sk_priority = osk->sk_priority;
+ sk->sk_protocol = osk->sk_protocol;
+ sk->sk_rcvbuf = osk->sk_rcvbuf;
+ sk->sk_sndbuf = osk->sk_sndbuf;
+ sk->sk_state = TCP_ESTABLISHED;
+ sock_copy_flags(sk, osk);
+
+ oax25 = ax25_sk(osk);
+
+ ax25->modulus = oax25->modulus;
+ ax25->backoff = oax25->backoff;
+ ax25->pidincl = oax25->pidincl;
+ ax25->iamdigi = oax25->iamdigi;
+ ax25->rtt = oax25->rtt;
+ ax25->t1 = oax25->t1;
+ ax25->t2 = oax25->t2;
+ ax25->t3 = oax25->t3;
+ ax25->n2 = oax25->n2;
+ ax25->idle = oax25->idle;
+ ax25->paclen = oax25->paclen;
+ ax25->window = oax25->window;
+
+ ax25->ax25_dev = ax25_dev;
+ ax25->source_addr = oax25->source_addr;
+
+ if (oax25->digipeat != NULL) {
+ ax25->digipeat = kmemdup(oax25->digipeat, sizeof(ax25_digi),
+ GFP_ATOMIC);
+ if (ax25->digipeat == NULL) {
+ sk_free(sk);
+ ax25_cb_put(ax25);
+ return NULL;
+ }
+ }
+
+ sk->sk_protinfo = ax25;
+ sk->sk_destruct = ax25_free_sock;
+ ax25->sk = sk;
+
+ return sk;
+}
+
+static int ax25_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ ax25_cb *ax25;
+
+ if (sk == NULL)
+ return 0;
+
+ sock_hold(sk);
+ sock_orphan(sk);
+ lock_sock(sk);
+ ax25 = ax25_sk(sk);
+
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ switch (ax25->state) {
+ case AX25_STATE_0:
+ release_sock(sk);
+ ax25_disconnect(ax25, 0);
+ lock_sock(sk);
+ ax25_destroy_socket(ax25);
+ break;
+
+ case AX25_STATE_1:
+ case AX25_STATE_2:
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ release_sock(sk);
+ ax25_disconnect(ax25, 0);
+ lock_sock(sk);
+ ax25_destroy_socket(ax25);
+ break;
+
+ case AX25_STATE_3:
+ case AX25_STATE_4:
+ ax25_clear_queues(ax25);
+ ax25->n2count = 0;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_send_control(ax25,
+ AX25_DISC,
+ AX25_POLLON,
+ AX25_COMMAND);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+ break;
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+ break;
+#endif
+ }
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ ax25->state = AX25_STATE_2;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DESTROY);
+ break;
+
+ default:
+ break;
+ }
+ } else {
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ ax25_destroy_socket(ax25);
+ }
+
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+/*
+ * We support a funny extension here so you can (as root) give any callsign
+ * digipeated via a local address as source. This hack is obsolete now
+ * that we've implemented support for SO_BINDTODEVICE. It is however small
+ * and trivially backward compatible.
+ */
+static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
+ ax25_dev *ax25_dev = NULL;
+ ax25_uid_assoc *user;
+ ax25_address call;
+ ax25_cb *ax25;
+ int err = 0;
+
+ if (addr_len != sizeof(struct sockaddr_ax25) &&
+ addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_bind(): uses old (6 digipeater) socket structure.
+ */
+ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+ (addr_len > sizeof(struct full_sockaddr_ax25)))
+ return -EINVAL;
+
+ if (addr->fsa_ax25.sax25_family != AF_AX25)
+ return -EINVAL;
+
+ user = ax25_findbyuid(current_euid());
+ if (user) {
+ call = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
+ return -EACCES;
+
+ call = addr->fsa_ax25.sax25_call;
+ }
+
+ lock_sock(sk);
+
+ ax25 = ax25_sk(sk);
+ if (!sock_flag(sk, SOCK_ZAPPED)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ax25->source_addr = call;
+
+ /*
+ * User already set interface with SO_BINDTODEVICE
+ */
+ if (ax25->ax25_dev != NULL)
+ goto done;
+
+ if (addr_len > sizeof(struct sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) {
+ if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) != 0 &&
+ (ax25_dev = ax25_addr_ax25dev(&addr->fsa_digipeater[0])) == NULL) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+ } else {
+ if ((ax25_dev = ax25_addr_ax25dev(&addr->fsa_ax25.sax25_call)) == NULL) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+ }
+
+ if (ax25_dev != NULL)
+ ax25_fillin_cb(ax25, ax25_dev);
+
+done:
+ ax25_cb_add(ax25);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+/*
+ * FIXME: nonblock behaviour looks like it may have a bug.
+ */
+static int __must_check ax25_connect(struct socket *sock,
+ struct sockaddr *uaddr, int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ ax25_cb *ax25 = ax25_sk(sk), *ax25t;
+ struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
+ ax25_digi *digi = NULL;
+ int ct = 0, err = 0;
+
+ /*
+ * some sanity checks. code further down depends on this
+ */
+
+ if (addr_len == sizeof(struct sockaddr_ax25))
+ /* support for this will go away in early 2.5.x
+ * ax25_connect(): uses obsolete socket structure
+ */
+ ;
+ else if (addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_connect(): uses old (6 digipeater) socket structure.
+ */
+ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+ (addr_len > sizeof(struct full_sockaddr_ax25)))
+ return -EINVAL;
+
+
+ if (fsa->fsa_ax25.sax25_family != AF_AX25)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ /* deal with restarts */
+ if (sock->state == SS_CONNECTING) {
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT: /* still trying */
+ err = -EINPROGRESS;
+ goto out_release;
+
+ case TCP_ESTABLISHED: /* connection established */
+ sock->state = SS_CONNECTED;
+ goto out_release;
+
+ case TCP_CLOSE: /* connection refused */
+ sock->state = SS_UNCONNECTED;
+ err = -ECONNREFUSED;
+ goto out_release;
+ }
+ }
+
+ if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) {
+ err = -EISCONN; /* No reconnect on a seqpacket socket */
+ goto out_release;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ kfree(ax25->digipeat);
+ ax25->digipeat = NULL;
+
+ /*
+ * Handle digi-peaters to be used.
+ */
+ if (addr_len > sizeof(struct sockaddr_ax25) &&
+ fsa->fsa_ax25.sax25_ndigis != 0) {
+ /* Valid number of digipeaters ? */
+ if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) {
+ err = -EINVAL;
+ goto out_release;
+ }
+
+ if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) {
+ err = -ENOBUFS;
+ goto out_release;
+ }
+
+ digi->ndigi = fsa->fsa_ax25.sax25_ndigis;
+ digi->lastrepeat = -1;
+
+ while (ct < fsa->fsa_ax25.sax25_ndigis) {
+ if ((fsa->fsa_digipeater[ct].ax25_call[6] &
+ AX25_HBIT) && ax25->iamdigi) {
+ digi->repeated[ct] = 1;
+ digi->lastrepeat = ct;
+ } else {
+ digi->repeated[ct] = 0;
+ }
+ digi->calls[ct] = fsa->fsa_digipeater[ct];
+ ct++;
+ }
+ }
+
+ /*
+ * Must bind first - autobinding in this may or may not work. If
+ * the socket is already bound, check to see if the device has
+ * been filled in, error if it hasn't.
+ */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ /* check if we can remove this feature. It is broken. */
+ printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n",
+ current->comm);
+ if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) {
+ kfree(digi);
+ goto out_release;
+ }
+
+ ax25_fillin_cb(ax25, ax25->ax25_dev);
+ ax25_cb_add(ax25);
+ } else {
+ if (ax25->ax25_dev == NULL) {
+ kfree(digi);
+ err = -EHOSTUNREACH;
+ goto out_release;
+ }
+ }
+
+ if (sk->sk_type == SOCK_SEQPACKET &&
+ (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi,
+ ax25->ax25_dev->dev))) {
+ kfree(digi);
+ err = -EADDRINUSE; /* Already such a connection */
+ ax25_cb_put(ax25t);
+ goto out_release;
+ }
+
+ ax25->dest_addr = fsa->fsa_ax25.sax25_call;
+ ax25->digipeat = digi;
+
+ /* First the easy one */
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ goto out_release;
+ }
+
+ /* Move to connecting socket, ax.25 lapb WAIT_UA.. */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_establish_data_link(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ if (ax25->ax25_dev->dama.slave)
+ ax25_ds_establish_data_link(ax25);
+ else
+ ax25_std_establish_data_link(ax25);
+ break;
+#endif
+ }
+
+ ax25->state = AX25_STATE_1;
+
+ ax25_start_heartbeat(ax25);
+
+ /* Now the loop */
+ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+ err = -EINPROGRESS;
+ goto out_release;
+ }
+
+ if (sk->sk_state == TCP_SYN_SENT) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ goto out_release;
+ }
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ /* Not in ABM, not in WAIT_UA -> failed */
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk); /* Always set at this point */
+ goto out_release;
+ }
+
+ sock->state = SS_CONNECTED;
+
+ err = 0;
+out_release:
+ release_sock(sk);
+
+ return err;
+}
+
+static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sk_buff *skb;
+ struct sock *newsk;
+ DEFINE_WAIT(wait);
+ struct sock *sk;
+ int err = 0;
+
+ if (sock->state != SS_UNCONNECTED)
+ return -EINVAL;
+
+ if ((sk = sock->sk) == NULL)
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (sk->sk_state != TCP_LISTEN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * The read queue this time is holding sockets ready to use
+ * hooked into the SABM we saved
+ */
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb)
+ break;
+
+ if (flags & O_NONBLOCK) {
+ err = -EWOULDBLOCK;
+ break;
+ }
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ goto out;
+
+ newsk = skb->sk;
+ sock_graft(newsk, newsock);
+
+ /* Now attach up the new socket */
+ kfree_skb(skb);
+ sk->sk_ack_backlog--;
+ newsock->state = SS_CONNECTED;
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
+ struct sock *sk = sock->sk;
+ unsigned char ndigi, i;
+ ax25_cb *ax25;
+ int err = 0;
+
+ memset(fsa, 0, sizeof(*fsa));
+ lock_sock(sk);
+ ax25 = ax25_sk(sk);
+
+ if (peer != 0) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ fsa->fsa_ax25.sax25_family = AF_AX25;
+ fsa->fsa_ax25.sax25_call = ax25->dest_addr;
+
+ if (ax25->digipeat != NULL) {
+ ndigi = ax25->digipeat->ndigi;
+ fsa->fsa_ax25.sax25_ndigis = ndigi;
+ for (i = 0; i < ndigi; i++)
+ fsa->fsa_digipeater[i] =
+ ax25->digipeat->calls[i];
+ }
+ } else {
+ fsa->fsa_ax25.sax25_family = AF_AX25;
+ fsa->fsa_ax25.sax25_call = ax25->source_addr;
+ fsa->fsa_ax25.sax25_ndigis = 1;
+ if (ax25->ax25_dev != NULL) {
+ memcpy(&fsa->fsa_digipeater[0],
+ ax25->ax25_dev->dev->dev_addr, AX25_ADDR_LEN);
+ } else {
+ fsa->fsa_digipeater[0] = null_ax25_address;
+ }
+ }
+ *uaddr_len = sizeof (struct full_sockaddr_ax25);
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name);
+ struct sock *sk = sock->sk;
+ struct sockaddr_ax25 sax;
+ struct sk_buff *skb;
+ ax25_digi dtmp, *dp;
+ ax25_cb *ax25;
+ size_t size;
+ int lv, err, addr_len = msg->msg_namelen;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ lock_sock(sk);
+ ax25 = ax25_sk(sk);
+
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+ goto out;
+ }
+
+ if (ax25->ax25_dev == NULL) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+
+ if (len > ax25->ax25_dev->dev->mtu) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ if (usax != NULL) {
+ if (usax->sax25_family != AF_AX25) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (addr_len == sizeof(struct sockaddr_ax25))
+ /* ax25_sendmsg(): uses obsolete socket structure */
+ ;
+ else if (addr_len != sizeof(struct full_sockaddr_ax25))
+ /* support for old structure may go away some time
+ * ax25_sendmsg(): uses old (6 digipeater)
+ * socket structure.
+ */
+ if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+ (addr_len > sizeof(struct full_sockaddr_ax25))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+
+ if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) {
+ int ct = 0;
+ struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
+
+ /* Valid number of digipeaters ? */
+ if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ dtmp.ndigi = usax->sax25_ndigis;
+
+ while (ct < usax->sax25_ndigis) {
+ dtmp.repeated[ct] = 0;
+ dtmp.calls[ct] = fsa->fsa_digipeater[ct];
+ ct++;
+ }
+
+ dtmp.lastrepeat = 0;
+ }
+
+ sax = *usax;
+ if (sk->sk_type == SOCK_SEQPACKET &&
+ ax25cmp(&ax25->dest_addr, &sax.sax25_call)) {
+ err = -EISCONN;
+ goto out;
+ }
+ if (usax->sax25_ndigis == 0)
+ dp = NULL;
+ else
+ dp = &dtmp;
+ } else {
+ /*
+ * FIXME: 1003.1g - if the socket is like this because
+ * it has become closed (not started closed) and is VC
+ * we ought to SIGPIPE, EPIPE
+ */
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+ sax.sax25_family = AF_AX25;
+ sax.sax25_call = ax25->dest_addr;
+ dp = ax25->digipeat;
+ }
+
+ /* Build a packet */
+ /* Assume the worst case */
+ size = len + ax25->ax25_dev->dev->hard_header_len;
+
+ skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto out;
+
+ skb_reserve(skb, size - len);
+
+ /* User data follows immediately after the AX.25 data */
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto out;
+ }
+
+ skb_reset_network_header(skb);
+
+ /* Add the PID if one is not supplied by the user in the skb */
+ if (!ax25->pidincl)
+ *skb_push(skb, 1) = sk->sk_protocol;
+
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ /* Connected mode sockets go via the LAPB machine */
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ kfree_skb(skb);
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ /* Shove it onto the queue and kick */
+ ax25_output(ax25, ax25->paclen, skb);
+
+ err = len;
+ goto out;
+ }
+
+ skb_push(skb, 1 + ax25_addr_size(dp));
+
+ /* Building AX.25 Header */
+
+ /* Build an AX.25 header */
+ lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call,
+ dp, AX25_COMMAND, AX25_MODULUS);
+
+ skb_set_transport_header(skb, lv);
+
+ *skb_transport_header(skb) = AX25_UI;
+
+ /* Datagram frames go straight out of the door as UI */
+ ax25_queue_xmit(skb, ax25->ax25_dev->dev);
+
+ err = len;
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int copied;
+ int err = 0;
+
+ lock_sock(sk);
+ /*
+ * This works for seqpacket too. The receiver has ordered the
+ * queue for us! We do one quick check first though
+ */
+ if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ /* Now we can treat all alike */
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto out;
+
+ if (!ax25_sk(sk)->pidincl)
+ skb_pull(skb, 1); /* Remove PID */
+
+ skb_reset_transport_header(skb);
+ copied = skb->len;
+
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ if (msg->msg_name) {
+ ax25_digi digi;
+ ax25_address src;
+ const unsigned char *mac = skb_mac_header(skb);
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name);
+
+ memset(sax, 0, sizeof(struct full_sockaddr_ax25));
+ ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL,
+ &digi, NULL, NULL);
+ sax->sax25_family = AF_AX25;
+ /* We set this correctly, even though we may not let the
+ application know the digi calls further down (because it
+ did NOT ask to know them). This could get political... **/
+ sax->sax25_ndigis = digi.ndigi;
+ sax->sax25_call = src;
+
+ if (sax->sax25_ndigis != 0) {
+ int ct;
+ struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax;
+
+ for (ct = 0; ct < digi.ndigi; ct++)
+ fsa->fsa_digipeater[ct] = digi.calls[ct];
+ }
+ msg->msg_namelen = sizeof(struct full_sockaddr_ax25);
+ }
+
+ skb_free_datagram(sk, skb);
+ err = copied;
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int ax25_shutdown(struct socket *sk, int how)
+{
+ /* FIXME - generate DM and RNR states */
+ return -EOPNOTSUPP;
+}
+
+static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ void __user *argp = (void __user *)arg;
+ int res = 0;
+
+ lock_sock(sk);
+ switch (cmd) {
+ case TIOCOUTQ: {
+ long amount;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ res = put_user(amount, (int __user *)argp);
+ break;
+ }
+
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ long amount = 0L;
+ /* These two are safe on a single CPU system as only user tasks fiddle here */
+ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+ amount = skb->len;
+ res = put_user(amount, (int __user *) argp);
+ break;
+ }
+
+ case SIOCGSTAMP:
+ res = sock_get_timestamp(sk, argp);
+ break;
+
+ case SIOCGSTAMPNS:
+ res = sock_get_timestampns(sk, argp);
+ break;
+
+ case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */
+ case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */
+ case SIOCAX25GETUID: {
+ struct sockaddr_ax25 sax25;
+ if (copy_from_user(&sax25, argp, sizeof(sax25))) {
+ res = -EFAULT;
+ break;
+ }
+ res = ax25_uid_ioctl(cmd, &sax25);
+ break;
+ }
+
+ case SIOCAX25NOUID: { /* Set the default policy (default/bar) */
+ long amount;
+ if (!capable(CAP_NET_ADMIN)) {
+ res = -EPERM;
+ break;
+ }
+ if (get_user(amount, (long __user *)argp)) {
+ res = -EFAULT;
+ break;
+ }
+ if (amount < 0 || amount > AX25_NOUID_BLOCK) {
+ res = -EINVAL;
+ break;
+ }
+ ax25_uid_policy = amount;
+ res = 0;
+ break;
+ }
+
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCAX25OPTRT:
+ if (!capable(CAP_NET_ADMIN)) {
+ res = -EPERM;
+ break;
+ }
+ res = ax25_rt_ioctl(cmd, argp);
+ break;
+
+ case SIOCAX25CTLCON:
+ if (!capable(CAP_NET_ADMIN)) {
+ res = -EPERM;
+ break;
+ }
+ res = ax25_ctl_ioctl(cmd, argp);
+ break;
+
+ case SIOCAX25GETINFO:
+ case SIOCAX25GETINFOOLD: {
+ ax25_cb *ax25 = ax25_sk(sk);
+ struct ax25_info_struct ax25_info;
+
+ ax25_info.t1 = ax25->t1 / HZ;
+ ax25_info.t2 = ax25->t2 / HZ;
+ ax25_info.t3 = ax25->t3 / HZ;
+ ax25_info.idle = ax25->idle / (60 * HZ);
+ ax25_info.n2 = ax25->n2;
+ ax25_info.t1timer = ax25_display_timer(&ax25->t1timer) / HZ;
+ ax25_info.t2timer = ax25_display_timer(&ax25->t2timer) / HZ;
+ ax25_info.t3timer = ax25_display_timer(&ax25->t3timer) / HZ;
+ ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ);
+ ax25_info.n2count = ax25->n2count;
+ ax25_info.state = ax25->state;
+ ax25_info.rcv_q = sk_rmem_alloc_get(sk);
+ ax25_info.snd_q = sk_wmem_alloc_get(sk);
+ ax25_info.vs = ax25->vs;
+ ax25_info.vr = ax25->vr;
+ ax25_info.va = ax25->va;
+ ax25_info.vs_max = ax25->vs; /* reserved */
+ ax25_info.paclen = ax25->paclen;
+ ax25_info.window = ax25->window;
+
+ /* old structure? */
+ if (cmd == SIOCAX25GETINFOOLD) {
+ static int warned = 0;
+ if (!warned) {
+ printk(KERN_INFO "%s uses old SIOCAX25GETINFO\n",
+ current->comm);
+ warned=1;
+ }
+
+ if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct_deprecated))) {
+ res = -EFAULT;
+ break;
+ }
+ } else {
+ if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct))) {
+ res = -EINVAL;
+ break;
+ }
+ }
+ res = 0;
+ break;
+ }
+
+ case SIOCAX25ADDFWD:
+ case SIOCAX25DELFWD: {
+ struct ax25_fwd_struct ax25_fwd;
+ if (!capable(CAP_NET_ADMIN)) {
+ res = -EPERM;
+ break;
+ }
+ if (copy_from_user(&ax25_fwd, argp, sizeof(ax25_fwd))) {
+ res = -EFAULT;
+ break;
+ }
+ res = ax25_fwd_ioctl(cmd, &ax25_fwd);
+ break;
+ }
+
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ res = -EINVAL;
+ break;
+
+ default:
+ res = -ENOIOCTLCMD;
+ break;
+ }
+ release_sock(sk);
+
+ return res;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *ax25_info_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_list_lock)
+{
+ spin_lock_bh(&ax25_list_lock);
+ return seq_hlist_start(&ax25_list, *pos);
+}
+
+static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &ax25_list, pos);
+}
+
+static void ax25_info_stop(struct seq_file *seq, void *v)
+ __releases(ax25_list_lock)
+{
+ spin_unlock_bh(&ax25_list_lock);
+}
+
+static int ax25_info_show(struct seq_file *seq, void *v)
+{
+ ax25_cb *ax25 = hlist_entry(v, struct ax25_cb, ax25_node);
+ char buf[11];
+ int k;
+
+
+ /*
+ * New format:
+ * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
+ */
+
+ seq_printf(seq, "%8.8lx %s %s%s ",
+ (long) ax25,
+ ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
+ ax2asc(buf, &ax25->source_addr),
+ ax25->iamdigi? "*":"");
+ seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr));
+
+ for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) {
+ seq_printf(seq, ",%s%s",
+ ax2asc(buf, &ax25->digipeat->calls[k]),
+ ax25->digipeat->repeated[k]? "*":"");
+ }
+
+ seq_printf(seq, " %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %d %d",
+ ax25->state,
+ ax25->vs, ax25->vr, ax25->va,
+ ax25_display_timer(&ax25->t1timer) / HZ, ax25->t1 / HZ,
+ ax25_display_timer(&ax25->t2timer) / HZ, ax25->t2 / HZ,
+ ax25_display_timer(&ax25->t3timer) / HZ, ax25->t3 / HZ,
+ ax25_display_timer(&ax25->idletimer) / (60 * HZ),
+ ax25->idle / (60 * HZ),
+ ax25->n2count, ax25->n2,
+ ax25->rtt / HZ,
+ ax25->window,
+ ax25->paclen);
+
+ if (ax25->sk != NULL) {
+ seq_printf(seq, " %d %d %lu\n",
+ sk_wmem_alloc_get(ax25->sk),
+ sk_rmem_alloc_get(ax25->sk),
+ sock_i_ino(ax25->sk));
+ } else {
+ seq_puts(seq, " * * *\n");
+ }
+ return 0;
+}
+
+static const struct seq_operations ax25_info_seqops = {
+ .start = ax25_info_start,
+ .next = ax25_info_next,
+ .stop = ax25_info_stop,
+ .show = ax25_info_show,
+};
+
+static int ax25_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ax25_info_seqops);
+}
+
+static const struct file_operations ax25_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ax25_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif
+
+static const struct net_proto_family ax25_family_ops = {
+ .family = PF_AX25,
+ .create = ax25_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops ax25_proto_ops = {
+ .family = PF_AX25,
+ .owner = THIS_MODULE,
+ .release = ax25_release,
+ .bind = ax25_bind,
+ .connect = ax25_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = ax25_accept,
+ .getname = ax25_getname,
+ .poll = datagram_poll,
+ .ioctl = ax25_ioctl,
+ .listen = ax25_listen,
+ .shutdown = ax25_shutdown,
+ .setsockopt = ax25_setsockopt,
+ .getsockopt = ax25_getsockopt,
+ .sendmsg = ax25_sendmsg,
+ .recvmsg = ax25_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+/*
+ * Called by socket.c on kernel start up
+ */
+static struct packet_type ax25_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_AX25),
+ .func = ax25_kiss_rcv,
+};
+
+static struct notifier_block ax25_dev_notifier = {
+ .notifier_call = ax25_device_event,
+};
+
+static int __init ax25_init(void)
+{
+ int rc = proto_register(&ax25_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ sock_register(&ax25_family_ops);
+ dev_add_pack(&ax25_packet_type);
+ register_netdevice_notifier(&ax25_dev_notifier);
+
+ proc_create("ax25_route", S_IRUGO, init_net.proc_net,
+ &ax25_route_fops);
+ proc_create("ax25", S_IRUGO, init_net.proc_net, &ax25_info_fops);
+ proc_create("ax25_calls", S_IRUGO, init_net.proc_net, &ax25_uid_fops);
+out:
+ return rc;
+}
+module_init(ax25_init);
+
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_AX25);
+
+static void __exit ax25_exit(void)
+{
+ remove_proc_entry("ax25_route", init_net.proc_net);
+ remove_proc_entry("ax25", init_net.proc_net);
+ remove_proc_entry("ax25_calls", init_net.proc_net);
+
+ unregister_netdevice_notifier(&ax25_dev_notifier);
+
+ dev_remove_pack(&ax25_packet_type);
+
+ sock_unregister(PF_AX25);
+ proto_unregister(&ax25_proto);
+
+ ax25_rt_free();
+ ax25_uid_free();
+ ax25_dev_free();
+}
+module_exit(ax25_exit);
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
new file mode 100644
index 000000000..e7c9b0ea1
--- /dev/null
+++ b/net/ax25/ax25_addr.c
@@ -0,0 +1,307 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * The default broadcast address of an interface is QST-0; the default address
+ * is LINUX-1. The null address is defined as a callsign of all spaces with
+ * an SSID of zero.
+ */
+
+const ax25_address ax25_bcast =
+ {{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+const ax25_address ax25_defaddr =
+ {{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}};
+const ax25_address null_ax25_address =
+ {{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+
+EXPORT_SYMBOL_GPL(ax25_bcast);
+EXPORT_SYMBOL_GPL(ax25_defaddr);
+EXPORT_SYMBOL(null_ax25_address);
+
+/*
+ * ax25 -> ascii conversion
+ */
+char *ax2asc(char *buf, const ax25_address *a)
+{
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ') *s++ = c;
+ }
+
+ *s++ = '-';
+
+ if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+
+}
+
+EXPORT_SYMBOL(ax2asc);
+
+/*
+ * ascii -> ax25 conversion
+ */
+void asc2ax(ax25_address *addr, const char *callsign)
+{
+ const char *s;
+ int n;
+
+ for (s = callsign, n = 0; n < 6; n++) {
+ if (*s != '\0' && *s != '-')
+ addr->ax25_call[n] = *s++;
+ else
+ addr->ax25_call[n] = ' ';
+ addr->ax25_call[n] <<= 1;
+ addr->ax25_call[n] &= 0xFE;
+ }
+
+ if (*s++ == '\0') {
+ addr->ax25_call[6] = 0x00;
+ return;
+ }
+
+ addr->ax25_call[6] = *s++ - '0';
+
+ if (*s != '\0') {
+ addr->ax25_call[6] *= 10;
+ addr->ax25_call[6] += *s++ - '0';
+ }
+
+ addr->ax25_call[6] <<= 1;
+ addr->ax25_call[6] &= 0x1E;
+}
+
+EXPORT_SYMBOL(asc2ax);
+
+/*
+ * Compare two ax.25 addresses
+ */
+int ax25cmp(const ax25_address *a, const ax25_address *b)
+{
+ int ct = 0;
+
+ while (ct < 6) {
+ if ((a->ax25_call[ct] & 0xFE) != (b->ax25_call[ct] & 0xFE)) /* Clean off repeater bits */
+ return 1;
+ ct++;
+ }
+
+ if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */
+ return 0;
+
+ return 2; /* Partial match */
+}
+
+EXPORT_SYMBOL(ax25cmp);
+
+/*
+ * Compare two AX.25 digipeater paths.
+ */
+int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2)
+{
+ int i;
+
+ if (digi1->ndigi != digi2->ndigi)
+ return 1;
+
+ if (digi1->lastrepeat != digi2->lastrepeat)
+ return 1;
+
+ for (i = 0; i < digi1->ndigi; i++)
+ if (ax25cmp(&digi1->calls[i], &digi2->calls[i]) != 0)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Given an AX.25 address pull of to, from, digi list, command/response and the start of data
+ *
+ */
+const unsigned char *ax25_addr_parse(const unsigned char *buf, int len,
+ ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags,
+ int *dama)
+{
+ int d = 0;
+
+ if (len < 14) return NULL;
+
+ if (flags != NULL) {
+ *flags = 0;
+
+ if (buf[6] & AX25_CBIT)
+ *flags = AX25_COMMAND;
+ if (buf[13] & AX25_CBIT)
+ *flags = AX25_RESPONSE;
+ }
+
+ if (dama != NULL)
+ *dama = ~buf[13] & AX25_DAMA_FLAG;
+
+ /* Copy to, from */
+ if (dest != NULL)
+ memcpy(dest, buf + 0, AX25_ADDR_LEN);
+ if (src != NULL)
+ memcpy(src, buf + 7, AX25_ADDR_LEN);
+
+ buf += 2 * AX25_ADDR_LEN;
+ len -= 2 * AX25_ADDR_LEN;
+
+ digi->lastrepeat = -1;
+ digi->ndigi = 0;
+
+ while (!(buf[-1] & AX25_EBIT)) {
+ if (d >= AX25_MAX_DIGIS)
+ return NULL;
+ if (len < AX25_ADDR_LEN)
+ return NULL;
+
+ memcpy(&digi->calls[d], buf, AX25_ADDR_LEN);
+ digi->ndigi = d + 1;
+
+ if (buf[6] & AX25_HBIT) {
+ digi->repeated[d] = 1;
+ digi->lastrepeat = d;
+ } else {
+ digi->repeated[d] = 0;
+ }
+
+ buf += AX25_ADDR_LEN;
+ len -= AX25_ADDR_LEN;
+ d++;
+ }
+
+ return buf;
+}
+
+/*
+ * Assemble an AX.25 header from the bits
+ */
+int ax25_addr_build(unsigned char *buf, const ax25_address *src,
+ const ax25_address *dest, const ax25_digi *d, int flag, int modulus)
+{
+ int len = 0;
+ int ct = 0;
+
+ memcpy(buf, dest, AX25_ADDR_LEN);
+ buf[6] &= ~(AX25_EBIT | AX25_CBIT);
+ buf[6] |= AX25_SSSID_SPARE;
+
+ if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT;
+
+ buf += AX25_ADDR_LEN;
+ len += AX25_ADDR_LEN;
+
+ memcpy(buf, src, AX25_ADDR_LEN);
+ buf[6] &= ~(AX25_EBIT | AX25_CBIT);
+ buf[6] &= ~AX25_SSSID_SPARE;
+
+ if (modulus == AX25_MODULUS)
+ buf[6] |= AX25_SSSID_SPARE;
+ else
+ buf[6] |= AX25_ESSID_SPARE;
+
+ if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT;
+
+ /*
+ * Fast path the normal digiless path
+ */
+ if (d == NULL || d->ndigi == 0) {
+ buf[6] |= AX25_EBIT;
+ return 2 * AX25_ADDR_LEN;
+ }
+
+ buf += AX25_ADDR_LEN;
+ len += AX25_ADDR_LEN;
+
+ while (ct < d->ndigi) {
+ memcpy(buf, &d->calls[ct], AX25_ADDR_LEN);
+
+ if (d->repeated[ct])
+ buf[6] |= AX25_HBIT;
+ else
+ buf[6] &= ~AX25_HBIT;
+
+ buf[6] &= ~AX25_EBIT;
+ buf[6] |= AX25_SSSID_SPARE;
+
+ buf += AX25_ADDR_LEN;
+ len += AX25_ADDR_LEN;
+ ct++;
+ }
+
+ buf[-1] |= AX25_EBIT;
+
+ return len;
+}
+
+int ax25_addr_size(const ax25_digi *dp)
+{
+ if (dp == NULL)
+ return 2 * AX25_ADDR_LEN;
+
+ return AX25_ADDR_LEN * (2 + dp->ndigi);
+}
+
+/*
+ * Reverse Digipeat List. May not pass both parameters as same struct
+ */
+void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
+{
+ int ct;
+
+ out->ndigi = in->ndigi;
+ out->lastrepeat = in->ndigi - in->lastrepeat - 2;
+
+ /* Invert the digipeaters */
+ for (ct = 0; ct < in->ndigi; ct++) {
+ out->calls[ct] = in->calls[in->ndigi - ct - 1];
+
+ if (ct <= out->lastrepeat) {
+ out->calls[ct].ax25_call[6] |= AX25_HBIT;
+ out->repeated[ct] = 1;
+ } else {
+ out->calls[ct].ax25_call[6] &= ~AX25_HBIT;
+ out->repeated[ct] = 0;
+ }
+ }
+}
+
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
new file mode 100644
index 000000000..3d106767b
--- /dev/null
+++ b/net/ax25/ax25_dev.c
@@ -0,0 +1,198 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/spinlock.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+ax25_dev *ax25_dev_list;
+DEFINE_SPINLOCK(ax25_dev_lock);
+
+ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
+{
+ ax25_dev *ax25_dev, *res = NULL;
+
+ spin_lock_bh(&ax25_dev_lock);
+ for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
+ if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) {
+ res = ax25_dev;
+ }
+ spin_unlock_bh(&ax25_dev_lock);
+
+ return res;
+}
+
+/*
+ * This is called when an interface is brought up. These are
+ * reasonable defaults.
+ */
+void ax25_dev_device_up(struct net_device *dev)
+{
+ ax25_dev *ax25_dev;
+
+ if ((ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) {
+ printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n");
+ return;
+ }
+
+ dev->ax25_ptr = ax25_dev;
+ ax25_dev->dev = dev;
+ dev_hold(dev);
+ ax25_dev->forward = NULL;
+
+ ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE;
+ ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE;
+ ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF;
+ ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE;
+ ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW;
+ ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW;
+ ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1;
+ ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2;
+ ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3;
+ ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE;
+ ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2;
+ ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN;
+ ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL;
+ ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
+
+#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
+ ax25_ds_setup_timer(ax25_dev);
+#endif
+
+ spin_lock_bh(&ax25_dev_lock);
+ ax25_dev->next = ax25_dev_list;
+ ax25_dev_list = ax25_dev;
+ spin_unlock_bh(&ax25_dev_lock);
+
+ ax25_register_dev_sysctl(ax25_dev);
+}
+
+void ax25_dev_device_down(struct net_device *dev)
+{
+ ax25_dev *s, *ax25_dev;
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return;
+
+ ax25_unregister_dev_sysctl(ax25_dev);
+
+ spin_lock_bh(&ax25_dev_lock);
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ ax25_ds_del_timer(ax25_dev);
+#endif
+
+ /*
+ * Remove any packet forwarding that points to this device.
+ */
+ for (s = ax25_dev_list; s != NULL; s = s->next)
+ if (s->forward == dev)
+ s->forward = NULL;
+
+ if ((s = ax25_dev_list) == ax25_dev) {
+ ax25_dev_list = s->next;
+ spin_unlock_bh(&ax25_dev_lock);
+ dev_put(dev);
+ kfree(ax25_dev);
+ return;
+ }
+
+ while (s != NULL && s->next != NULL) {
+ if (s->next == ax25_dev) {
+ s->next = ax25_dev->next;
+ spin_unlock_bh(&ax25_dev_lock);
+ dev_put(dev);
+ kfree(ax25_dev);
+ return;
+ }
+
+ s = s->next;
+ }
+ spin_unlock_bh(&ax25_dev_lock);
+ dev->ax25_ptr = NULL;
+}
+
+int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
+{
+ ax25_dev *ax25_dev, *fwd_dev;
+
+ if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL)
+ return -EINVAL;
+
+ switch (cmd) {
+ case SIOCAX25ADDFWD:
+ if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL)
+ return -EINVAL;
+ if (ax25_dev->forward != NULL)
+ return -EINVAL;
+ ax25_dev->forward = fwd_dev->dev;
+ break;
+
+ case SIOCAX25DELFWD:
+ if (ax25_dev->forward == NULL)
+ return -EINVAL;
+ ax25_dev->forward = NULL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+struct net_device *ax25_fwd_dev(struct net_device *dev)
+{
+ ax25_dev *ax25_dev;
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return dev;
+
+ if (ax25_dev->forward == NULL)
+ return dev;
+
+ return ax25_dev->forward;
+}
+
+/*
+ * Free all memory associated with device structures.
+ */
+void __exit ax25_dev_free(void)
+{
+ ax25_dev *s, *ax25_dev;
+
+ spin_lock_bh(&ax25_dev_lock);
+ ax25_dev = ax25_dev_list;
+ while (ax25_dev != NULL) {
+ s = ax25_dev;
+ dev_put(ax25_dev->dev);
+ ax25_dev = ax25_dev->next;
+ kfree(s);
+ }
+ ax25_dev_list = NULL;
+ spin_unlock_bh(&ax25_dev_lock);
+}
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
new file mode 100644
index 000000000..9bd31e88a
--- /dev/null
+++ b/net/ax25/ax25_ds_in.c
@@ -0,0 +1,302 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * State machine for state 1, Awaiting Connection State.
+ * The handling of the timer(s) is in file ax25_ds_timer.c.
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_ds_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+ switch (frametype) {
+ case AX25_SABM:
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_SABME:
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_UA:
+ ax25_calculate_rtt(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+ ax25->vs = 0;
+ ax25->va = 0;
+ ax25->vr = 0;
+ ax25->state = AX25_STATE_3;
+ ax25->n2count = 0;
+ if (ax25->sk != NULL) {
+ bh_lock_sock(ax25->sk);
+ ax25->sk->sk_state = TCP_ESTABLISHED;
+ /*
+ * For WAIT_SABM connections we will produce an accept
+ * ready socket here
+ */
+ if (!sock_flag(ax25->sk, SOCK_DEAD))
+ ax25->sk->sk_state_change(ax25->sk);
+ bh_unlock_sock(ax25->sk);
+ }
+ ax25_dama_on(ax25);
+
+ /* according to DK4EG's spec we are required to
+ * send a RR RESPONSE FINAL NR=0.
+ */
+
+ ax25_std_enquiry_response(ax25);
+ break;
+
+ case AX25_DM:
+ if (pf)
+ ax25_disconnect(ax25, ECONNREFUSED);
+ break;
+
+ default:
+ if (pf)
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Release State.
+ * The handling of the timer(s) is in file ax25_ds_timer.c
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_ds_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+ switch (frametype) {
+ case AX25_SABM:
+ case AX25_SABME:
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_dama_off(ax25);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_dama_off(ax25);
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_DM:
+ case AX25_UA:
+ if (pf) {
+ ax25_dama_off(ax25);
+ ax25_disconnect(ax25, 0);
+ }
+ break;
+
+ case AX25_I:
+ case AX25_REJ:
+ case AX25_RNR:
+ case AX25_RR:
+ if (pf) {
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_dama_off(ax25);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file ax25_timer.c
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_ds_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+ int queued = 0;
+
+ switch (frametype) {
+ case AX25_SABM:
+ case AX25_SABME:
+ if (frametype == AX25_SABM) {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ } else {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+ }
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+ ax25->condition = 0x00;
+ ax25->vs = 0;
+ ax25->va = 0;
+ ax25->vr = 0;
+ ax25_requeue_frames(ax25);
+ ax25_dama_on(ax25);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_dama_off(ax25);
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_DM:
+ ax25_dama_off(ax25);
+ ax25_disconnect(ax25, ECONNRESET);
+ break;
+
+ case AX25_RR:
+ case AX25_RNR:
+ if (frametype == AX25_RR)
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+ else
+ ax25->condition |= AX25_COND_PEER_RX_BUSY;
+
+ if (ax25_validate_nr(ax25, nr)) {
+ if (ax25_check_iframes_acked(ax25, nr))
+ ax25->n2count=0;
+ if (type == AX25_COMMAND && pf)
+ ax25_ds_enquiry_response(ax25);
+ } else {
+ ax25_ds_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_REJ:
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+
+ if (ax25_validate_nr(ax25, nr)) {
+ if (ax25->va != nr)
+ ax25->n2count=0;
+
+ ax25_frames_acked(ax25, nr);
+ ax25_calculate_rtt(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_requeue_frames(ax25);
+
+ if (type == AX25_COMMAND && pf)
+ ax25_ds_enquiry_response(ax25);
+ } else {
+ ax25_ds_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_I:
+ if (!ax25_validate_nr(ax25, nr)) {
+ ax25_ds_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+ }
+ if (ax25->condition & AX25_COND_PEER_RX_BUSY) {
+ ax25_frames_acked(ax25, nr);
+ ax25->n2count = 0;
+ } else {
+ if (ax25_check_iframes_acked(ax25, nr))
+ ax25->n2count = 0;
+ }
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+ if (pf) ax25_ds_enquiry_response(ax25);
+ break;
+ }
+ if (ns == ax25->vr) {
+ ax25->vr = (ax25->vr + 1) % ax25->modulus;
+ queued = ax25_rx_iframe(ax25, skb);
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25->vr = ns; /* ax25->vr - 1 */
+ ax25->condition &= ~AX25_COND_REJECT;
+ if (pf) {
+ ax25_ds_enquiry_response(ax25);
+ } else {
+ if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+ ax25->condition |= AX25_COND_ACK_PENDING;
+ ax25_start_t2timer(ax25);
+ }
+ }
+ } else {
+ if (ax25->condition & AX25_COND_REJECT) {
+ if (pf) ax25_ds_enquiry_response(ax25);
+ } else {
+ ax25->condition |= AX25_COND_REJECT;
+ ax25_ds_enquiry_response(ax25);
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ }
+ }
+ break;
+
+ case AX25_FRMR:
+ case AX25_ILLEGAL:
+ ax25_ds_establish_data_link(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+
+ default:
+ break;
+ }
+
+ return queued;
+}
+
+/*
+ * Higher level upcall for a LAPB frame
+ */
+int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+ int queued = 0, frametype, ns, nr, pf;
+
+ frametype = ax25_decode(ax25, skb, &ns, &nr, &pf);
+
+ switch (ax25->state) {
+ case AX25_STATE_1:
+ queued = ax25_ds_state1_machine(ax25, skb, frametype, pf, type);
+ break;
+ case AX25_STATE_2:
+ queued = ax25_ds_state2_machine(ax25, skb, frametype, pf, type);
+ break;
+ case AX25_STATE_3:
+ queued = ax25_ds_state3_machine(ax25, skb, frametype, ns, nr, pf, type);
+ break;
+ }
+
+ return queued;
+}
+
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
new file mode 100644
index 000000000..e05bd57b5
--- /dev/null
+++ b/net/ax25/ax25_ds_subr.c
@@ -0,0 +1,208 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/spinlock.h>
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+void ax25_ds_nr_error_recovery(ax25_cb *ax25)
+{
+ ax25_ds_establish_data_link(ax25);
+}
+
+/*
+ * dl1bke 960114: transmit I frames on DAMA poll
+ */
+void ax25_ds_enquiry_response(ax25_cb *ax25)
+{
+ ax25_cb *ax25o;
+
+ /* Please note that neither DK4EG's nor DG2FEF's
+ * DAMA spec mention the following behaviour as seen
+ * with TheFirmware:
+ *
+ * DB0ACH->DL1BKE <RR C P R0> [DAMA]
+ * DL1BKE->DB0ACH <I NR=0 NS=0>
+ * DL1BKE-7->DB0PRA-6 DB0ACH <I C S3 R5>
+ * DL1BKE->DB0ACH <RR R F R0>
+ *
+ * The Flexnet DAMA Master implementation apparently
+ * insists on the "proper" AX.25 behaviour:
+ *
+ * DB0ACH->DL1BKE <RR C P R0> [DAMA]
+ * DL1BKE->DB0ACH <RR R F R0>
+ * DL1BKE->DB0ACH <I NR=0 NS=0>
+ * DL1BKE-7->DB0PRA-6 DB0ACH <I C S3 R5>
+ *
+ * Flexnet refuses to send us *any* I frame if we send
+ * a REJ in case AX25_COND_REJECT is set. It is superfluous in
+ * this mode anyway (a RR or RNR invokes the retransmission).
+ * Is this a Flexnet bug?
+ */
+
+ ax25_std_enquiry_response(ax25);
+
+ if (!(ax25->condition & AX25_COND_PEER_RX_BUSY)) {
+ ax25_requeue_frames(ax25);
+ ax25_kick(ax25);
+ }
+
+ if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2 || skb_peek(&ax25->ack_queue) != NULL)
+ ax25_ds_t1_timeout(ax25);
+ else
+ ax25->n2count = 0;
+
+ ax25_start_t3timer(ax25);
+ ax25_ds_set_timer(ax25->ax25_dev);
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(ax25o, &ax25_list) {
+ if (ax25o == ax25)
+ continue;
+
+ if (ax25o->ax25_dev != ax25->ax25_dev)
+ continue;
+
+ if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2) {
+ ax25_ds_t1_timeout(ax25o);
+ continue;
+ }
+
+ if (!(ax25o->condition & AX25_COND_PEER_RX_BUSY) && ax25o->state == AX25_STATE_3) {
+ ax25_requeue_frames(ax25o);
+ ax25_kick(ax25o);
+ }
+
+ if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || skb_peek(&ax25o->ack_queue) != NULL)
+ ax25_ds_t1_timeout(ax25o);
+
+ /* do not start T3 for listening sockets (tnx DD8NE) */
+
+ if (ax25o->state != AX25_STATE_0)
+ ax25_start_t3timer(ax25o);
+ }
+ spin_unlock(&ax25_list_lock);
+}
+
+void ax25_ds_establish_data_link(ax25_cb *ax25)
+{
+ ax25->condition &= AX25_COND_DAMA_MODE;
+ ax25->n2count = 0;
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_start_t3timer(ax25);
+}
+
+/*
+ * :::FIXME:::
+ * This is a kludge. Not all drivers recognize kiss commands.
+ * We need a driver level request to switch duplex mode, that does
+ * either SCC changing, PI config or KISS as required. Currently
+ * this request isn't reliable.
+ */
+static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char param)
+{
+ struct sk_buff *skb;
+ unsigned char *p;
+
+ if (ax25_dev->dev == NULL)
+ return;
+
+ if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reset_network_header(skb);
+ p = skb_put(skb, 2);
+
+ *p++ = cmd;
+ *p++ = param;
+
+ skb->protocol = ax25_type_trans(skb, ax25_dev->dev);
+
+ dev_queue_xmit(skb);
+}
+
+/*
+ * A nasty problem arises if we count the number of DAMA connections
+ * wrong, especially when connections on the device already existed
+ * and our network node (or the sysop) decides to turn on DAMA Master
+ * mode. We thus flag the 'real' slave connections with
+ * ax25->dama_slave=1 and look on every disconnect if still slave
+ * connections exist.
+ */
+static int ax25_check_dama_slave(ax25_dev *ax25_dev)
+{
+ ax25_cb *ax25;
+ int res = 0;
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(ax25, &ax25_list)
+ if (ax25->ax25_dev == ax25_dev && (ax25->condition & AX25_COND_DAMA_MODE) && ax25->state > AX25_STATE_1) {
+ res = 1;
+ break;
+ }
+ spin_unlock(&ax25_list_lock);
+
+ return res;
+}
+
+static void ax25_dev_dama_on(ax25_dev *ax25_dev)
+{
+ if (ax25_dev == NULL)
+ return;
+
+ if (ax25_dev->dama.slave == 0)
+ ax25_kiss_cmd(ax25_dev, 5, 1);
+
+ ax25_dev->dama.slave = 1;
+ ax25_ds_set_timer(ax25_dev);
+}
+
+void ax25_dev_dama_off(ax25_dev *ax25_dev)
+{
+ if (ax25_dev == NULL)
+ return;
+
+ if (ax25_dev->dama.slave && !ax25_check_dama_slave(ax25_dev)) {
+ ax25_kiss_cmd(ax25_dev, 5, 0);
+ ax25_dev->dama.slave = 0;
+ ax25_ds_del_timer(ax25_dev);
+ }
+}
+
+void ax25_dama_on(ax25_cb *ax25)
+{
+ ax25_dev_dama_on(ax25->ax25_dev);
+ ax25->condition |= AX25_COND_DAMA_MODE;
+}
+
+void ax25_dama_off(ax25_cb *ax25)
+{
+ ax25->condition &= ~AX25_COND_DAMA_MODE;
+ ax25_dev_dama_off(ax25->ax25_dev);
+}
+
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
new file mode 100644
index 000000000..951cd57bb
--- /dev/null
+++ b/net/ax25/ax25_ds_timer.c
@@ -0,0 +1,236 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/tcp_states.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static void ax25_ds_timeout(unsigned long);
+
+/*
+ * Add DAMA slave timeout timer to timer list.
+ * Unlike the connection based timers the timeout function gets
+ * triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT
+ * (aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in
+ * 1/10th of a second.
+ */
+
+void ax25_ds_setup_timer(ax25_dev *ax25_dev)
+{
+ setup_timer(&ax25_dev->dama.slave_timer, ax25_ds_timeout,
+ (unsigned long)ax25_dev);
+}
+
+void ax25_ds_del_timer(ax25_dev *ax25_dev)
+{
+ if (ax25_dev)
+ del_timer(&ax25_dev->dama.slave_timer);
+}
+
+void ax25_ds_set_timer(ax25_dev *ax25_dev)
+{
+ if (ax25_dev == NULL) /* paranoia */
+ return;
+
+ ax25_dev->dama.slave_timeout =
+ msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10;
+ mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ);
+}
+
+/*
+ * DAMA Slave Timeout
+ * Silently discard all (slave) connections in case our master forgot us...
+ */
+
+static void ax25_ds_timeout(unsigned long arg)
+{
+ ax25_dev *ax25_dev = (struct ax25_dev *) arg;
+ ax25_cb *ax25;
+
+ if (ax25_dev == NULL || !ax25_dev->dama.slave)
+ return; /* Yikes! */
+
+ if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) {
+ ax25_ds_set_timer(ax25_dev);
+ return;
+ }
+
+ spin_lock(&ax25_list_lock);
+ ax25_for_each(ax25, &ax25_list) {
+ if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE))
+ continue;
+
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_disconnect(ax25, ETIMEDOUT);
+ }
+ spin_unlock(&ax25_list_lock);
+
+ ax25_dev_dama_off(ax25_dev);
+}
+
+void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
+{
+ struct sock *sk=ax25->sk;
+
+ if (sk)
+ bh_lock_sock(sk);
+
+ switch (ax25->state) {
+
+ case AX25_STATE_0:
+ /* Magic here: If we listen() and a new link dies before it
+ is accepted() it isn't 'dead' so doesn't get removed. */
+ if (!sk || sock_flag(sk, SOCK_DESTROY) ||
+ (sk->sk_state == TCP_LISTEN &&
+ sock_flag(sk, SOCK_DEAD))) {
+ if (sk) {
+ sock_hold(sk);
+ ax25_destroy_socket(ax25);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ } else
+ ax25_destroy_socket(ax25);
+ return;
+ }
+ break;
+
+ case AX25_STATE_3:
+ /*
+ * Check the state of the receive buffer.
+ */
+ if (sk != NULL) {
+ if (atomic_read(&sk->sk_rmem_alloc) <
+ (sk->sk_rcvbuf >> 1) &&
+ (ax25->condition & AX25_COND_OWN_RX_BUSY)) {
+ ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ break;
+ }
+ }
+ break;
+ }
+
+ if (sk)
+ bh_unlock_sock(sk);
+
+ ax25_start_heartbeat(ax25);
+}
+
+/* dl1bke 960114: T3 works much like the IDLE timeout, but
+ * gets reloaded with every frame for this
+ * connection.
+ */
+void ax25_ds_t3timer_expiry(ax25_cb *ax25)
+{
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_dama_off(ax25);
+ ax25_disconnect(ax25, ETIMEDOUT);
+}
+
+/* dl1bke 960228: close the connection when IDLE expires.
+ * unlike T3 this timer gets reloaded only on
+ * I frames.
+ */
+void ax25_ds_idletimer_expiry(ax25_cb *ax25)
+{
+ ax25_clear_queues(ax25);
+
+ ax25->n2count = 0;
+ ax25->state = AX25_STATE_2;
+
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ ax25_stop_t3timer(ax25);
+
+ if (ax25->sk != NULL) {
+ bh_lock_sock(ax25->sk);
+ ax25->sk->sk_state = TCP_CLOSE;
+ ax25->sk->sk_err = 0;
+ ax25->sk->sk_shutdown |= SEND_SHUTDOWN;
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ ax25->sk->sk_state_change(ax25->sk);
+ sock_set_flag(ax25->sk, SOCK_DEAD);
+ }
+ bh_unlock_sock(ax25->sk);
+ }
+}
+
+/* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC
+ * within the poll of any connected channel. Remember
+ * that we are not allowed to send anything unless we
+ * get polled by the Master.
+ *
+ * Thus we'll have to do parts of our T1 handling in
+ * ax25_enquiry_response().
+ */
+void ax25_ds_t1_timeout(ax25_cb *ax25)
+{
+ switch (ax25->state) {
+ case AX25_STATE_1:
+ if (ax25->n2count == ax25->n2) {
+ if (ax25->modulus == AX25_MODULUS) {
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ ax25->n2count = 0;
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND);
+ }
+ } else {
+ ax25->n2count++;
+ if (ax25->modulus == AX25_MODULUS)
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND);
+ else
+ ax25_send_control(ax25, AX25_SABME, AX25_POLLOFF, AX25_COMMAND);
+ }
+ break;
+
+ case AX25_STATE_2:
+ if (ax25->n2count == ax25->n2) {
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->n2count++;
+ }
+ break;
+
+ case AX25_STATE_3:
+ if (ax25->n2count == ax25->n2) {
+ ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->n2count++;
+ }
+ break;
+ }
+
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+}
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
new file mode 100644
index 000000000..7f16e8a93
--- /dev/null
+++ b/net/ax25/ax25_iface.c
@@ -0,0 +1,217 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static struct ax25_protocol *protocol_list;
+static DEFINE_RWLOCK(protocol_list_lock);
+
+static HLIST_HEAD(ax25_linkfail_list);
+static DEFINE_SPINLOCK(linkfail_lock);
+
+static struct listen_struct {
+ struct listen_struct *next;
+ ax25_address callsign;
+ struct net_device *dev;
+} *listen_list = NULL;
+static DEFINE_SPINLOCK(listen_lock);
+
+/*
+ * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT,
+ * AX25_P_IP or AX25_P_ARP ...
+ */
+void ax25_register_pid(struct ax25_protocol *ap)
+{
+ write_lock_bh(&protocol_list_lock);
+ ap->next = protocol_list;
+ protocol_list = ap;
+ write_unlock_bh(&protocol_list_lock);
+}
+
+EXPORT_SYMBOL_GPL(ax25_register_pid);
+
+void ax25_protocol_release(unsigned int pid)
+{
+ struct ax25_protocol *protocol;
+
+ write_lock_bh(&protocol_list_lock);
+ protocol = protocol_list;
+ if (protocol == NULL)
+ goto out;
+
+ if (protocol->pid == pid) {
+ protocol_list = protocol->next;
+ goto out;
+ }
+
+ while (protocol != NULL && protocol->next != NULL) {
+ if (protocol->next->pid == pid) {
+ protocol->next = protocol->next->next;
+ goto out;
+ }
+
+ protocol = protocol->next;
+ }
+out:
+ write_unlock_bh(&protocol_list_lock);
+}
+
+EXPORT_SYMBOL(ax25_protocol_release);
+
+void ax25_linkfail_register(struct ax25_linkfail *lf)
+{
+ spin_lock_bh(&linkfail_lock);
+ hlist_add_head(&lf->lf_node, &ax25_linkfail_list);
+ spin_unlock_bh(&linkfail_lock);
+}
+
+EXPORT_SYMBOL(ax25_linkfail_register);
+
+void ax25_linkfail_release(struct ax25_linkfail *lf)
+{
+ spin_lock_bh(&linkfail_lock);
+ hlist_del_init(&lf->lf_node);
+ spin_unlock_bh(&linkfail_lock);
+}
+
+EXPORT_SYMBOL(ax25_linkfail_release);
+
+int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
+{
+ struct listen_struct *listen;
+
+ if (ax25_listen_mine(callsign, dev))
+ return 0;
+
+ if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL)
+ return -ENOMEM;
+
+ listen->callsign = *callsign;
+ listen->dev = dev;
+
+ spin_lock_bh(&listen_lock);
+ listen->next = listen_list;
+ listen_list = listen;
+ spin_unlock_bh(&listen_lock);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(ax25_listen_register);
+
+void ax25_listen_release(ax25_address *callsign, struct net_device *dev)
+{
+ struct listen_struct *s, *listen;
+
+ spin_lock_bh(&listen_lock);
+ listen = listen_list;
+ if (listen == NULL) {
+ spin_unlock_bh(&listen_lock);
+ return;
+ }
+
+ if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) {
+ listen_list = listen->next;
+ spin_unlock_bh(&listen_lock);
+ kfree(listen);
+ return;
+ }
+
+ while (listen != NULL && listen->next != NULL) {
+ if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) {
+ s = listen->next;
+ listen->next = listen->next->next;
+ spin_unlock_bh(&listen_lock);
+ kfree(s);
+ return;
+ }
+
+ listen = listen->next;
+ }
+ spin_unlock_bh(&listen_lock);
+}
+
+EXPORT_SYMBOL(ax25_listen_release);
+
+int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *)
+{
+ int (*res)(struct sk_buff *, ax25_cb *) = NULL;
+ struct ax25_protocol *protocol;
+
+ read_lock(&protocol_list_lock);
+ for (protocol = protocol_list; protocol != NULL; protocol = protocol->next)
+ if (protocol->pid == pid) {
+ res = protocol->func;
+ break;
+ }
+ read_unlock(&protocol_list_lock);
+
+ return res;
+}
+
+int ax25_listen_mine(ax25_address *callsign, struct net_device *dev)
+{
+ struct listen_struct *listen;
+
+ spin_lock_bh(&listen_lock);
+ for (listen = listen_list; listen != NULL; listen = listen->next)
+ if (ax25cmp(&listen->callsign, callsign) == 0 &&
+ (listen->dev == dev || listen->dev == NULL)) {
+ spin_unlock_bh(&listen_lock);
+ return 1;
+ }
+ spin_unlock_bh(&listen_lock);
+
+ return 0;
+}
+
+void ax25_link_failed(ax25_cb *ax25, int reason)
+{
+ struct ax25_linkfail *lf;
+
+ spin_lock_bh(&linkfail_lock);
+ hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node)
+ lf->func(ax25, reason);
+ spin_unlock_bh(&linkfail_lock);
+}
+
+int ax25_protocol_is_registered(unsigned int pid)
+{
+ struct ax25_protocol *protocol;
+ int res = 0;
+
+ read_lock_bh(&protocol_list_lock);
+ for (protocol = protocol_list; protocol != NULL; protocol = protocol->next)
+ if (protocol->pid == pid) {
+ res = 1;
+ break;
+ }
+ read_unlock_bh(&protocol_list_lock);
+
+ return res;
+}
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
new file mode 100644
index 000000000..7ed8ab724
--- /dev/null
+++ b/net/ax25/ax25_in.c
@@ -0,0 +1,455 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * Given a fragment, queue it on the fragment queue and if the fragment
+ * is complete, send it back to ax25_rx_iframe.
+ */
+static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb)
+{
+ struct sk_buff *skbn, *skbo;
+
+ if (ax25->fragno != 0) {
+ if (!(*skb->data & AX25_SEG_FIRST)) {
+ if ((ax25->fragno - 1) == (*skb->data & AX25_SEG_REM)) {
+ /* Enqueue fragment */
+ ax25->fragno = *skb->data & AX25_SEG_REM;
+ skb_pull(skb, 1); /* skip fragno */
+ ax25->fraglen += skb->len;
+ skb_queue_tail(&ax25->frag_queue, skb);
+
+ /* Last fragment received ? */
+ if (ax25->fragno == 0) {
+ skbn = alloc_skb(AX25_MAX_HEADER_LEN +
+ ax25->fraglen,
+ GFP_ATOMIC);
+ if (!skbn) {
+ skb_queue_purge(&ax25->frag_queue);
+ return 1;
+ }
+
+ skb_reserve(skbn, AX25_MAX_HEADER_LEN);
+
+ skbn->dev = ax25->ax25_dev->dev;
+ skb_reset_network_header(skbn);
+ skb_reset_transport_header(skbn);
+
+ /* Copy data from the fragments */
+ while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) {
+ skb_copy_from_linear_data(skbo,
+ skb_put(skbn, skbo->len),
+ skbo->len);
+ kfree_skb(skbo);
+ }
+
+ ax25->fraglen = 0;
+
+ if (ax25_rx_iframe(ax25, skbn) == 0)
+ kfree_skb(skbn);
+ }
+
+ return 1;
+ }
+ }
+ } else {
+ /* First fragment received */
+ if (*skb->data & AX25_SEG_FIRST) {
+ skb_queue_purge(&ax25->frag_queue);
+ ax25->fragno = *skb->data & AX25_SEG_REM;
+ skb_pull(skb, 1); /* skip fragno */
+ ax25->fraglen = skb->len;
+ skb_queue_tail(&ax25->frag_queue, skb);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This is where all valid I frames are sent to, to be dispatched to
+ * whichever protocol requires them.
+ */
+int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
+{
+ int (*func)(struct sk_buff *, ax25_cb *);
+ unsigned char pid;
+ int queued = 0;
+
+ if (skb == NULL) return 0;
+
+ ax25_start_idletimer(ax25);
+
+ pid = *skb->data;
+
+ if (pid == AX25_P_IP) {
+ /* working around a TCP bug to keep additional listeners
+ * happy. TCP re-uses the buffer and destroys the original
+ * content.
+ */
+ struct sk_buff *skbn = skb_copy(skb, GFP_ATOMIC);
+ if (skbn != NULL) {
+ kfree_skb(skb);
+ skb = skbn;
+ }
+
+ skb_pull(skb, 1); /* Remove PID */
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ skb->dev = ax25->ax25_dev->dev;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = htons(ETH_P_IP);
+ netif_rx(skb);
+ return 1;
+ }
+ if (pid == AX25_P_SEGMENT) {
+ skb_pull(skb, 1); /* Remove PID */
+ return ax25_rx_fragment(ax25, skb);
+ }
+
+ if ((func = ax25_protocol_function(pid)) != NULL) {
+ skb_pull(skb, 1); /* Remove PID */
+ return (*func)(skb, ax25);
+ }
+
+ if (ax25->sk != NULL && ax25->ax25_dev->values[AX25_VALUES_CONMODE] == 2) {
+ if ((!ax25->pidincl && ax25->sk->sk_protocol == pid) ||
+ ax25->pidincl) {
+ if (sock_queue_rcv_skb(ax25->sk, skb) == 0)
+ queued = 1;
+ else
+ ax25->condition |= AX25_COND_OWN_RX_BUSY;
+ }
+ }
+
+ return queued;
+}
+
+/*
+ * Higher level upcall for a LAPB frame
+ */
+static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, int dama)
+{
+ int queued = 0;
+
+ if (ax25->state == AX25_STATE_0)
+ return 0;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ queued = ax25_std_frame_in(ax25, skb, type);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (dama || ax25->ax25_dev->dama.slave)
+ queued = ax25_ds_frame_in(ax25, skb, type);
+ else
+ queued = ax25_std_frame_in(ax25, skb, type);
+ break;
+#endif
+ }
+
+ return queued;
+}
+
+static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
+ ax25_address *dev_addr, struct packet_type *ptype)
+{
+ ax25_address src, dest, *next_digi = NULL;
+ int type = 0, mine = 0, dama;
+ struct sock *make, *sk;
+ ax25_digi dp, reverse_dp;
+ ax25_cb *ax25;
+ ax25_dev *ax25_dev;
+
+ /*
+ * Process the AX.25/LAPB frame.
+ */
+
+ skb_reset_transport_header(skb);
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ goto free;
+
+ /*
+ * Parse the address header.
+ */
+
+ if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL)
+ goto free;
+
+ /*
+ * Ours perhaps ?
+ */
+ if (dp.lastrepeat + 1 < dp.ndigi) /* Not yet digipeated completely */
+ next_digi = &dp.calls[dp.lastrepeat + 1];
+
+ /*
+ * Pull of the AX.25 headers leaving the CTRL/PID bytes
+ */
+ skb_pull(skb, ax25_addr_size(&dp));
+
+ /* For our port addresses ? */
+ if (ax25cmp(&dest, dev_addr) == 0 && dp.lastrepeat + 1 == dp.ndigi)
+ mine = 1;
+
+ /* Also match on any registered callsign from L3/4 */
+ if (!mine && ax25_listen_mine(&dest, dev) && dp.lastrepeat + 1 == dp.ndigi)
+ mine = 1;
+
+ /* UI frame - bypass LAPB processing */
+ if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) {
+ skb_set_transport_header(skb, 2); /* skip control and pid */
+
+ ax25_send_to_raw(&dest, skb, skb->data[1]);
+
+ if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0)
+ goto free;
+
+ /* Now we are pointing at the pid byte */
+ switch (skb->data[1]) {
+ case AX25_P_IP:
+ skb_pull(skb,2); /* drop PID/CTRL */
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
+ skb->dev = dev;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = htons(ETH_P_IP);
+ netif_rx(skb);
+ break;
+
+ case AX25_P_ARP:
+ skb_pull(skb,2);
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
+ skb->dev = dev;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = htons(ETH_P_ARP);
+ netif_rx(skb);
+ break;
+ case AX25_P_TEXT:
+ /* Now find a suitable dgram socket */
+ sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
+ if (sk != NULL) {
+ bh_lock_sock(sk);
+ if (atomic_read(&sk->sk_rmem_alloc) >=
+ sk->sk_rcvbuf) {
+ kfree_skb(skb);
+ } else {
+ /*
+ * Remove the control and PID.
+ */
+ skb_pull(skb, 2);
+ if (sock_queue_rcv_skb(sk, skb) != 0)
+ kfree_skb(skb);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ } else {
+ kfree_skb(skb);
+ }
+ break;
+
+ default:
+ kfree_skb(skb); /* Will scan SOCK_AX25 RAW sockets */
+ break;
+ }
+
+ return 0;
+ }
+
+ /*
+ * Is connected mode supported on this device ?
+ * If not, should we DM the incoming frame (except DMs) or
+ * silently ignore them. For now we stay quiet.
+ */
+ if (ax25_dev->values[AX25_VALUES_CONMODE] == 0)
+ goto free;
+
+ /* LAPB */
+
+ /* AX.25 state 1-4 */
+
+ ax25_digi_invert(&dp, &reverse_dp);
+
+ if ((ax25 = ax25_find_cb(&dest, &src, &reverse_dp, dev)) != NULL) {
+ /*
+ * Process the frame. If it is queued up internally it
+ * returns one otherwise we free it immediately. This
+ * routine itself wakes the user context layers so we do
+ * no further work
+ */
+ if (ax25_process_rx_frame(ax25, skb, type, dama) == 0)
+ kfree_skb(skb);
+
+ ax25_cb_put(ax25);
+ return 0;
+ }
+
+ /* AX.25 state 0 (disconnected) */
+
+ /* a) received not a SABM(E) */
+
+ if ((*skb->data & ~AX25_PF) != AX25_SABM &&
+ (*skb->data & ~AX25_PF) != AX25_SABME) {
+ /*
+ * Never reply to a DM. Also ignore any connects for
+ * addresses that are not our interfaces and not a socket.
+ */
+ if ((*skb->data & ~AX25_PF) != AX25_DM && mine)
+ ax25_return_dm(dev, &src, &dest, &dp);
+
+ goto free;
+ }
+
+ /* b) received SABM(E) */
+
+ if (dp.lastrepeat + 1 == dp.ndigi)
+ sk = ax25_find_listener(&dest, 0, dev, SOCK_SEQPACKET);
+ else
+ sk = ax25_find_listener(next_digi, 1, dev, SOCK_SEQPACKET);
+
+ if (sk != NULL) {
+ bh_lock_sock(sk);
+ if (sk_acceptq_is_full(sk) ||
+ (make = ax25_make_new(sk, ax25_dev)) == NULL) {
+ if (mine)
+ ax25_return_dm(dev, &src, &dest, &dp);
+ kfree_skb(skb);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ return 0;
+ }
+
+ ax25 = ax25_sk(make);
+ skb_set_owner_r(skb, make);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ make->sk_state = TCP_ESTABLISHED;
+
+ sk->sk_ack_backlog++;
+ bh_unlock_sock(sk);
+ } else {
+ if (!mine)
+ goto free;
+
+ if ((ax25 = ax25_create_cb()) == NULL) {
+ ax25_return_dm(dev, &src, &dest, &dp);
+ goto free;
+ }
+
+ ax25_fillin_cb(ax25, ax25_dev);
+ }
+
+ ax25->source_addr = dest;
+ ax25->dest_addr = src;
+
+ /*
+ * Sort out any digipeated paths.
+ */
+ if (dp.ndigi && !ax25->digipeat &&
+ (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
+ kfree_skb(skb);
+ ax25_destroy_socket(ax25);
+ if (sk)
+ sock_put(sk);
+ return 0;
+ }
+
+ if (dp.ndigi == 0) {
+ kfree(ax25->digipeat);
+ ax25->digipeat = NULL;
+ } else {
+ /* Reverse the source SABM's path */
+ memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi));
+ }
+
+ if ((*skb->data & ~AX25_PF) == AX25_SABME) {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW];
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25_dev->values[AX25_VALUES_WINDOW];
+ }
+
+ ax25_send_control(ax25, AX25_UA, AX25_POLLON, AX25_RESPONSE);
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ if (dama && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE)
+ ax25_dama_on(ax25);
+#endif
+
+ ax25->state = AX25_STATE_3;
+
+ ax25_cb_add(ax25);
+
+ ax25_start_heartbeat(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+
+ if (sk) {
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ sock_put(sk);
+ } else {
+free:
+ kfree_skb(skb);
+ }
+ return 0;
+}
+
+/*
+ * Receive an AX.25 frame via a SLIP interface.
+ */
+int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype, struct net_device *orig_dev)
+{
+ skb_orphan(skb);
+
+ if (!net_eq(dev_net(dev), &init_net)) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if ((*skb->data & 0x0F) != 0) {
+ kfree_skb(skb); /* Not a KISS data frame */
+ return 0;
+ }
+
+ skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */
+
+ return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype);
+}
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
new file mode 100644
index 000000000..7c646bb2c
--- /dev/null
+++ b/net/ax25/ax25_ip.c
@@ -0,0 +1,238 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+/*
+ * IP over AX.25 encapsulation.
+ */
+
+/*
+ * Shove an AX.25 UI header on an IP packet and handle ARP
+ */
+
+#ifdef CONFIG_INET
+
+static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ unsigned char *buff;
+
+ /* they sometimes come back to us... */
+ if (type == ETH_P_AX25)
+ return 0;
+
+ /* header is an AX.25 UI frame from us to them */
+ buff = skb_push(skb, AX25_HEADER_LEN);
+ *buff++ = 0x00; /* KISS DATA */
+
+ if (daddr != NULL)
+ memcpy(buff, daddr, dev->addr_len); /* Address specified */
+
+ buff[6] &= ~AX25_CBIT;
+ buff[6] &= ~AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ if (saddr != NULL)
+ memcpy(buff, saddr, dev->addr_len);
+ else
+ memcpy(buff, dev->dev_addr, dev->addr_len);
+
+ buff[6] &= ~AX25_CBIT;
+ buff[6] |= AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ *buff++ = AX25_UI; /* UI */
+
+ /* Append a suitable AX.25 PID */
+ switch (type) {
+ case ETH_P_IP:
+ *buff++ = AX25_P_IP;
+ break;
+ case ETH_P_ARP:
+ *buff++ = AX25_P_ARP;
+ break;
+ default:
+ printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type);
+ *buff++ = 0;
+ break;
+ }
+
+ if (daddr != NULL)
+ return AX25_HEADER_LEN;
+
+ return -AX25_HEADER_LEN; /* Unfinished header */
+}
+
+netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
+{
+ struct sk_buff *ourskb;
+ unsigned char *bp = skb->data;
+ ax25_route *route;
+ struct net_device *dev = NULL;
+ ax25_address *src, *dst;
+ ax25_digi *digipeat = NULL;
+ ax25_dev *ax25_dev;
+ ax25_cb *ax25;
+ char ip_mode = ' ';
+
+ dst = (ax25_address *)(bp + 1);
+ src = (ax25_address *)(bp + 8);
+
+ route = ax25_get_route(dst, NULL);
+ if (route) {
+ digipeat = route->digipeat;
+ dev = route->dev;
+ ip_mode = route->ip_mode;
+ }
+
+ if (dev == NULL)
+ dev = skb->dev;
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
+ kfree_skb(skb);
+ goto put;
+ }
+
+ if (bp[16] == AX25_P_IP) {
+ if (ip_mode == 'V' || (ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) {
+ /*
+ * We copy the buffer and release the original thereby
+ * keeping it straight
+ *
+ * Note: we report 1 back so the caller will
+ * not feed the frame direct to the physical device
+ * We don't want that to happen. (It won't be upset
+ * as we have pulled the frame from the queue by
+ * freeing it).
+ *
+ * NB: TCP modifies buffers that are still
+ * on a device queue, thus we use skb_copy()
+ * instead of using skb_clone() unless this
+ * gets fixed.
+ */
+
+ ax25_address src_c;
+ ax25_address dst_c;
+
+ if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) {
+ kfree_skb(skb);
+ goto put;
+ }
+
+ if (skb->sk != NULL)
+ skb_set_owner_w(ourskb, skb->sk);
+
+ kfree_skb(skb);
+ /* dl9sau: bugfix
+ * after kfree_skb(), dst and src which were pointer
+ * to bp which is part of skb->data would not be valid
+ * anymore hope that after skb_pull(ourskb, ..) our
+ * dsc_c and src_c will not become invalid
+ */
+ bp = ourskb->data;
+ dst_c = *(ax25_address *)(bp + 1);
+ src_c = *(ax25_address *)(bp + 8);
+
+ skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */
+ skb_reset_network_header(ourskb);
+
+ ax25=ax25_send_frame(
+ ourskb,
+ ax25_dev->values[AX25_VALUES_PACLEN],
+ &src_c,
+ &dst_c, digipeat, dev);
+ if (ax25) {
+ ax25_cb_put(ax25);
+ }
+ goto put;
+ }
+ }
+
+ bp[7] &= ~AX25_CBIT;
+ bp[7] &= ~AX25_EBIT;
+ bp[7] |= AX25_SSSID_SPARE;
+
+ bp[14] &= ~AX25_CBIT;
+ bp[14] |= AX25_EBIT;
+ bp[14] |= AX25_SSSID_SPARE;
+
+ skb_pull(skb, AX25_KISS_HEADER_LEN);
+
+ if (digipeat != NULL) {
+ if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) {
+ kfree_skb(skb);
+ goto put;
+ }
+
+ skb = ourskb;
+ }
+
+ ax25_queue_xmit(skb, dev);
+
+put:
+ if (route)
+ ax25_put_route(route);
+
+ return NETDEV_TX_OK;
+}
+
+#else /* INET */
+
+static int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ return -AX25_HEADER_LEN;
+}
+
+netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+#endif
+
+const struct header_ops ax25_header_ops = {
+ .create = ax25_hard_header,
+};
+
+EXPORT_SYMBOL(ax25_header_ops);
+EXPORT_SYMBOL(ax25_ip_xmit);
+
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
new file mode 100644
index 000000000..be2acab9b
--- /dev/null
+++ b/net/ax25/ax25_out.c
@@ -0,0 +1,398 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/spinlock.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static DEFINE_SPINLOCK(ax25_frag_lock);
+
+ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
+{
+ ax25_dev *ax25_dev;
+ ax25_cb *ax25;
+
+ /*
+ * Take the default packet length for the device if zero is
+ * specified.
+ */
+ if (paclen == 0) {
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return NULL;
+
+ paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+ }
+
+ /*
+ * Look for an existing connection.
+ */
+ if ((ax25 = ax25_find_cb(src, dest, digi, dev)) != NULL) {
+ ax25_output(ax25, paclen, skb);
+ return ax25; /* It already existed */
+ }
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return NULL;
+
+ if ((ax25 = ax25_create_cb()) == NULL)
+ return NULL;
+
+ ax25_fillin_cb(ax25, ax25_dev);
+
+ ax25->source_addr = *src;
+ ax25->dest_addr = *dest;
+
+ if (digi != NULL) {
+ ax25->digipeat = kmemdup(digi, sizeof(*digi), GFP_ATOMIC);
+ if (ax25->digipeat == NULL) {
+ ax25_cb_put(ax25);
+ return NULL;
+ }
+ }
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_establish_data_link(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (ax25_dev->dama.slave)
+ ax25_ds_establish_data_link(ax25);
+ else
+ ax25_std_establish_data_link(ax25);
+ break;
+#endif
+ }
+
+ /*
+ * There is one ref for the state machine; a caller needs
+ * one more to put it back, just like with the existing one.
+ */
+ ax25_cb_hold(ax25);
+
+ ax25_cb_add(ax25);
+
+ ax25->state = AX25_STATE_1;
+
+ ax25_start_heartbeat(ax25);
+
+ ax25_output(ax25, paclen, skb);
+
+ return ax25; /* We had to create it */
+}
+
+EXPORT_SYMBOL(ax25_send_frame);
+
+/*
+ * All outgoing AX.25 I frames pass via this routine. Therefore this is
+ * where the fragmentation of frames takes place. If fragment is set to
+ * zero then we are not allowed to do fragmentation, even if the frame
+ * is too large.
+ */
+void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb)
+{
+ struct sk_buff *skbn;
+ unsigned char *p;
+ int frontlen, len, fragno, ka9qfrag, first = 1;
+
+ if (paclen < 16) {
+ WARN_ON_ONCE(1);
+ kfree_skb(skb);
+ return;
+ }
+
+ if ((skb->len - 1) > paclen) {
+ if (*skb->data == AX25_P_TEXT) {
+ skb_pull(skb, 1); /* skip PID */
+ ka9qfrag = 0;
+ } else {
+ paclen -= 2; /* Allow for fragment control info */
+ ka9qfrag = 1;
+ }
+
+ fragno = skb->len / paclen;
+ if (skb->len % paclen == 0) fragno--;
+
+ frontlen = skb_headroom(skb); /* Address space + CTRL */
+
+ while (skb->len > 0) {
+ spin_lock_bh(&ax25_frag_lock);
+ if ((skbn = alloc_skb(paclen + 2 + frontlen, GFP_ATOMIC)) == NULL) {
+ spin_unlock_bh(&ax25_frag_lock);
+ printk(KERN_CRIT "AX.25: ax25_output - out of memory\n");
+ return;
+ }
+
+ if (skb->sk != NULL)
+ skb_set_owner_w(skbn, skb->sk);
+
+ spin_unlock_bh(&ax25_frag_lock);
+
+ len = (paclen > skb->len) ? skb->len : paclen;
+
+ if (ka9qfrag == 1) {
+ skb_reserve(skbn, frontlen + 2);
+ skb_set_network_header(skbn,
+ skb_network_offset(skb));
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+ p = skb_push(skbn, 2);
+
+ *p++ = AX25_P_SEGMENT;
+
+ *p = fragno--;
+ if (first) {
+ *p |= AX25_SEG_FIRST;
+ first = 0;
+ }
+ } else {
+ skb_reserve(skbn, frontlen + 1);
+ skb_set_network_header(skbn,
+ skb_network_offset(skb));
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+ p = skb_push(skbn, 1);
+ *p = AX25_P_TEXT;
+ }
+
+ skb_pull(skb, len);
+ skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */
+ }
+
+ kfree_skb(skb);
+ } else {
+ skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */
+ }
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_kick(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ /*
+ * A DAMA slave is _required_ to work as normal AX.25L2V2
+ * if no DAMA master is available.
+ */
+ case AX25_PROTO_DAMA_SLAVE:
+ if (!ax25->ax25_dev->dama.slave) ax25_kick(ax25);
+ break;
+#endif
+ }
+}
+
+/*
+ * This procedure is passed a buffer descriptor for an iframe. It builds
+ * the rest of the control part of the frame and then writes it out.
+ */
+static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit)
+{
+ unsigned char *frame;
+
+ if (skb == NULL)
+ return;
+
+ skb_reset_network_header(skb);
+
+ if (ax25->modulus == AX25_MODULUS) {
+ frame = skb_push(skb, 1);
+
+ *frame = AX25_I;
+ *frame |= (poll_bit) ? AX25_PF : 0;
+ *frame |= (ax25->vr << 5);
+ *frame |= (ax25->vs << 1);
+ } else {
+ frame = skb_push(skb, 2);
+
+ frame[0] = AX25_I;
+ frame[0] |= (ax25->vs << 1);
+ frame[1] = (poll_bit) ? AX25_EPF : 0;
+ frame[1] |= (ax25->vr << 1);
+ }
+
+ ax25_start_idletimer(ax25);
+
+ ax25_transmit_buffer(ax25, skb, AX25_COMMAND);
+}
+
+void ax25_kick(ax25_cb *ax25)
+{
+ struct sk_buff *skb, *skbn;
+ int last = 1;
+ unsigned short start, end, next;
+
+ if (ax25->state != AX25_STATE_3 && ax25->state != AX25_STATE_4)
+ return;
+
+ if (ax25->condition & AX25_COND_PEER_RX_BUSY)
+ return;
+
+ if (skb_peek(&ax25->write_queue) == NULL)
+ return;
+
+ start = (skb_peek(&ax25->ack_queue) == NULL) ? ax25->va : ax25->vs;
+ end = (ax25->va + ax25->window) % ax25->modulus;
+
+ if (start == end)
+ return;
+
+ /*
+ * Transmit data until either we're out of data to send or
+ * the window is full. Send a poll on the final I frame if
+ * the window is filled.
+ */
+
+ /*
+ * Dequeue the frame and copy it.
+ * Check for race with ax25_clear_queues().
+ */
+ skb = skb_dequeue(&ax25->write_queue);
+ if (!skb)
+ return;
+
+ ax25->vs = start;
+
+ do {
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skb_queue_head(&ax25->write_queue, skb);
+ break;
+ }
+
+ if (skb->sk != NULL)
+ skb_set_owner_w(skbn, skb->sk);
+
+ next = (ax25->vs + 1) % ax25->modulus;
+ last = (next == end);
+
+ /*
+ * Transmit the frame copy.
+ * bke 960114: do not set the Poll bit on the last frame
+ * in DAMA mode.
+ */
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_send_iframe(ax25, skbn, (last) ? AX25_POLLON : AX25_POLLOFF);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ ax25_send_iframe(ax25, skbn, AX25_POLLOFF);
+ break;
+#endif
+ }
+
+ ax25->vs = next;
+
+ /*
+ * Requeue the original data frame.
+ */
+ skb_queue_tail(&ax25->ack_queue, skb);
+
+ } while (!last && (skb = skb_dequeue(&ax25->write_queue)) != NULL);
+
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+
+ if (!ax25_t1timer_running(ax25)) {
+ ax25_stop_t3timer(ax25);
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ }
+}
+
+void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+ struct sk_buff *skbn;
+ unsigned char *ptr;
+ int headroom;
+
+ if (ax25->ax25_dev == NULL) {
+ ax25_disconnect(ax25, ENETUNREACH);
+ return;
+ }
+
+ headroom = ax25_addr_size(ax25->digipeat);
+
+ if (skb_headroom(skb) < headroom) {
+ if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) {
+ printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n");
+ kfree_skb(skb);
+ return;
+ }
+
+ if (skb->sk != NULL)
+ skb_set_owner_w(skbn, skb->sk);
+
+ consume_skb(skb);
+ skb = skbn;
+ }
+
+ ptr = skb_push(skb, headroom);
+
+ ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus);
+
+ ax25_queue_xmit(skb, ax25->ax25_dev->dev);
+}
+
+/*
+ * A small shim to dev_queue_xmit to add the KISS control byte, and do
+ * any packet forwarding in operation.
+ */
+void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned char *ptr;
+
+ skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev));
+
+ ptr = skb_push(skb, 1);
+ *ptr = 0x00; /* KISS */
+
+ dev_queue_xmit(skb);
+}
+
+int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
+{
+ if (ax25->vs == nr) {
+ ax25_frames_acked(ax25, nr);
+ ax25_calculate_rtt(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ return 1;
+ } else {
+ if (ax25->va != nr) {
+ ax25_frames_acked(ax25, nr);
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ return 1;
+ }
+ }
+ return 0;
+}
+
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
new file mode 100644
index 000000000..d39097737
--- /dev/null
+++ b/net/ax25/ax25_route.c
@@ -0,0 +1,505 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+static ax25_route *ax25_route_list;
+static DEFINE_RWLOCK(ax25_route_lock);
+
+void ax25_rt_device_down(struct net_device *dev)
+{
+ ax25_route *s, *t, *ax25_rt;
+
+ write_lock_bh(&ax25_route_lock);
+ ax25_rt = ax25_route_list;
+ while (ax25_rt != NULL) {
+ s = ax25_rt;
+ ax25_rt = ax25_rt->next;
+
+ if (s->dev == dev) {
+ if (ax25_route_list == s) {
+ ax25_route_list = s->next;
+ kfree(s->digipeat);
+ kfree(s);
+ } else {
+ for (t = ax25_route_list; t != NULL; t = t->next) {
+ if (t->next == s) {
+ t->next = s->next;
+ kfree(s->digipeat);
+ kfree(s);
+ break;
+ }
+ }
+ }
+ }
+ }
+ write_unlock_bh(&ax25_route_lock);
+}
+
+static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
+{
+ ax25_route *ax25_rt;
+ ax25_dev *ax25_dev;
+ int i;
+
+ if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
+ return -EINVAL;
+ if (route->digi_count > AX25_MAX_DIGIS)
+ return -EINVAL;
+
+ write_lock_bh(&ax25_route_lock);
+
+ ax25_rt = ax25_route_list;
+ while (ax25_rt != NULL) {
+ if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 &&
+ ax25_rt->dev == ax25_dev->dev) {
+ kfree(ax25_rt->digipeat);
+ ax25_rt->digipeat = NULL;
+ if (route->digi_count != 0) {
+ if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
+ write_unlock_bh(&ax25_route_lock);
+ return -ENOMEM;
+ }
+ ax25_rt->digipeat->lastrepeat = -1;
+ ax25_rt->digipeat->ndigi = route->digi_count;
+ for (i = 0; i < route->digi_count; i++) {
+ ax25_rt->digipeat->repeated[i] = 0;
+ ax25_rt->digipeat->calls[i] = route->digi_addr[i];
+ }
+ }
+ write_unlock_bh(&ax25_route_lock);
+ return 0;
+ }
+ ax25_rt = ax25_rt->next;
+ }
+
+ if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) {
+ write_unlock_bh(&ax25_route_lock);
+ return -ENOMEM;
+ }
+
+ atomic_set(&ax25_rt->refcount, 1);
+ ax25_rt->callsign = route->dest_addr;
+ ax25_rt->dev = ax25_dev->dev;
+ ax25_rt->digipeat = NULL;
+ ax25_rt->ip_mode = ' ';
+ if (route->digi_count != 0) {
+ if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
+ write_unlock_bh(&ax25_route_lock);
+ kfree(ax25_rt);
+ return -ENOMEM;
+ }
+ ax25_rt->digipeat->lastrepeat = -1;
+ ax25_rt->digipeat->ndigi = route->digi_count;
+ for (i = 0; i < route->digi_count; i++) {
+ ax25_rt->digipeat->repeated[i] = 0;
+ ax25_rt->digipeat->calls[i] = route->digi_addr[i];
+ }
+ }
+ ax25_rt->next = ax25_route_list;
+ ax25_route_list = ax25_rt;
+ write_unlock_bh(&ax25_route_lock);
+
+ return 0;
+}
+
+void __ax25_put_route(ax25_route *ax25_rt)
+{
+ kfree(ax25_rt->digipeat);
+ kfree(ax25_rt);
+}
+
+static int ax25_rt_del(struct ax25_routes_struct *route)
+{
+ ax25_route *s, *t, *ax25_rt;
+ ax25_dev *ax25_dev;
+
+ if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
+ return -EINVAL;
+
+ write_lock_bh(&ax25_route_lock);
+
+ ax25_rt = ax25_route_list;
+ while (ax25_rt != NULL) {
+ s = ax25_rt;
+ ax25_rt = ax25_rt->next;
+ if (s->dev == ax25_dev->dev &&
+ ax25cmp(&route->dest_addr, &s->callsign) == 0) {
+ if (ax25_route_list == s) {
+ ax25_route_list = s->next;
+ ax25_put_route(s);
+ } else {
+ for (t = ax25_route_list; t != NULL; t = t->next) {
+ if (t->next == s) {
+ t->next = s->next;
+ ax25_put_route(s);
+ break;
+ }
+ }
+ }
+ }
+ }
+ write_unlock_bh(&ax25_route_lock);
+
+ return 0;
+}
+
+static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
+{
+ ax25_route *ax25_rt;
+ ax25_dev *ax25_dev;
+ int err = 0;
+
+ if ((ax25_dev = ax25_addr_ax25dev(&rt_option->port_addr)) == NULL)
+ return -EINVAL;
+
+ write_lock_bh(&ax25_route_lock);
+
+ ax25_rt = ax25_route_list;
+ while (ax25_rt != NULL) {
+ if (ax25_rt->dev == ax25_dev->dev &&
+ ax25cmp(&rt_option->dest_addr, &ax25_rt->callsign) == 0) {
+ switch (rt_option->cmd) {
+ case AX25_SET_RT_IPMODE:
+ switch (rt_option->arg) {
+ case ' ':
+ case 'D':
+ case 'V':
+ ax25_rt->ip_mode = rt_option->arg;
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+ break;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+ }
+ ax25_rt = ax25_rt->next;
+ }
+
+out:
+ write_unlock_bh(&ax25_route_lock);
+ return err;
+}
+
+int ax25_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct ax25_route_opt_struct rt_option;
+ struct ax25_routes_struct route;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ if (copy_from_user(&route, arg, sizeof(route)))
+ return -EFAULT;
+ return ax25_rt_add(&route);
+
+ case SIOCDELRT:
+ if (copy_from_user(&route, arg, sizeof(route)))
+ return -EFAULT;
+ return ax25_rt_del(&route);
+
+ case SIOCAX25OPTRT:
+ if (copy_from_user(&rt_option, arg, sizeof(rt_option)))
+ return -EFAULT;
+ return ax25_rt_opt(&rt_option);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_route_lock)
+{
+ struct ax25_route *ax25_rt;
+ int i = 1;
+
+ read_lock(&ax25_route_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) {
+ if (i == *pos)
+ return ax25_rt;
+ ++i;
+ }
+
+ return NULL;
+}
+
+static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return (v == SEQ_START_TOKEN) ? ax25_route_list :
+ ((struct ax25_route *) v)->next;
+}
+
+static void ax25_rt_seq_stop(struct seq_file *seq, void *v)
+ __releases(ax25_route_lock)
+{
+ read_unlock(&ax25_route_lock);
+}
+
+static int ax25_rt_seq_show(struct seq_file *seq, void *v)
+{
+ char buf[11];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "callsign dev mode digipeaters\n");
+ else {
+ struct ax25_route *ax25_rt = v;
+ const char *callsign;
+ int i;
+
+ if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0)
+ callsign = "default";
+ else
+ callsign = ax2asc(buf, &ax25_rt->callsign);
+
+ seq_printf(seq, "%-9s %-4s",
+ callsign,
+ ax25_rt->dev ? ax25_rt->dev->name : "???");
+
+ switch (ax25_rt->ip_mode) {
+ case 'V':
+ seq_puts(seq, " vc");
+ break;
+ case 'D':
+ seq_puts(seq, " dg");
+ break;
+ default:
+ seq_puts(seq, " *");
+ break;
+ }
+
+ if (ax25_rt->digipeat != NULL)
+ for (i = 0; i < ax25_rt->digipeat->ndigi; i++)
+ seq_printf(seq, " %s",
+ ax2asc(buf, &ax25_rt->digipeat->calls[i]));
+
+ seq_puts(seq, "\n");
+ }
+ return 0;
+}
+
+static const struct seq_operations ax25_rt_seqops = {
+ .start = ax25_rt_seq_start,
+ .next = ax25_rt_seq_next,
+ .stop = ax25_rt_seq_stop,
+ .show = ax25_rt_seq_show,
+};
+
+static int ax25_rt_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ax25_rt_seqops);
+}
+
+const struct file_operations ax25_route_fops = {
+ .owner = THIS_MODULE,
+ .open = ax25_rt_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif
+
+/*
+ * Find AX.25 route
+ *
+ * Only routes with a reference count of zero can be destroyed.
+ */
+ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
+{
+ ax25_route *ax25_spe_rt = NULL;
+ ax25_route *ax25_def_rt = NULL;
+ ax25_route *ax25_rt;
+
+ read_lock(&ax25_route_lock);
+ /*
+ * Bind to the physical interface we heard them on, or the default
+ * route if none is found;
+ */
+ for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) {
+ if (dev == NULL) {
+ if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev != NULL)
+ ax25_spe_rt = ax25_rt;
+ if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev != NULL)
+ ax25_def_rt = ax25_rt;
+ } else {
+ if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev == dev)
+ ax25_spe_rt = ax25_rt;
+ if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev == dev)
+ ax25_def_rt = ax25_rt;
+ }
+ }
+
+ ax25_rt = ax25_def_rt;
+ if (ax25_spe_rt != NULL)
+ ax25_rt = ax25_spe_rt;
+
+ if (ax25_rt != NULL)
+ ax25_hold_route(ax25_rt);
+
+ read_unlock(&ax25_route_lock);
+
+ return ax25_rt;
+}
+
+/*
+ * Adjust path: If you specify a default route and want to connect
+ * a target on the digipeater path but w/o having a special route
+ * set before, the path has to be truncated from your target on.
+ */
+static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
+{
+ int k;
+
+ for (k = 0; k < digipeat->ndigi; k++) {
+ if (ax25cmp(addr, &digipeat->calls[k]) == 0)
+ break;
+ }
+
+ digipeat->ndigi = k;
+}
+
+
+/*
+ * Find which interface to use.
+ */
+int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
+{
+ ax25_uid_assoc *user;
+ ax25_route *ax25_rt;
+ int err = 0;
+
+ if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
+ return -EHOSTUNREACH;
+
+ if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) {
+ err = -EHOSTUNREACH;
+ goto put;
+ }
+
+ user = ax25_findbyuid(current_euid());
+ if (user) {
+ ax25->source_addr = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
+ err = -EPERM;
+ goto put;
+ }
+ ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
+ }
+
+ if (ax25_rt->digipeat != NULL) {
+ ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi),
+ GFP_ATOMIC);
+ if (ax25->digipeat == NULL) {
+ err = -ENOMEM;
+ goto put;
+ }
+ ax25_adjust_path(addr, ax25->digipeat);
+ }
+
+ if (ax25->sk != NULL) {
+ bh_lock_sock(ax25->sk);
+ sock_reset_flag(ax25->sk, SOCK_ZAPPED);
+ bh_unlock_sock(ax25->sk);
+ }
+
+put:
+ ax25_put_route(ax25_rt);
+
+ return err;
+}
+
+struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
+ ax25_address *dest, ax25_digi *digi)
+{
+ struct sk_buff *skbn;
+ unsigned char *bp;
+ int len;
+
+ len = digi->ndigi * AX25_ADDR_LEN;
+
+ if (skb_headroom(skb) < len) {
+ if ((skbn = skb_realloc_headroom(skb, len)) == NULL) {
+ printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n");
+ return NULL;
+ }
+
+ if (skb->sk != NULL)
+ skb_set_owner_w(skbn, skb->sk);
+
+ consume_skb(skb);
+
+ skb = skbn;
+ }
+
+ bp = skb_push(skb, len);
+
+ ax25_addr_build(bp, src, dest, digi, AX25_COMMAND, AX25_MODULUS);
+
+ return skb;
+}
+
+/*
+ * Free all memory associated with routing structures.
+ */
+void __exit ax25_rt_free(void)
+{
+ ax25_route *s, *ax25_rt = ax25_route_list;
+
+ write_lock_bh(&ax25_route_lock);
+ while (ax25_rt != NULL) {
+ s = ax25_rt;
+ ax25_rt = ax25_rt->next;
+
+ kfree(s->digipeat);
+ kfree(s);
+ }
+ write_unlock_bh(&ax25_route_lock);
+}
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
new file mode 100644
index 000000000..3fbf8f7b2
--- /dev/null
+++ b/net/ax25/ax25_std_in.c
@@ -0,0 +1,446 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ *
+ * Most of this code is based on the SDL diagrams published in the 7th ARRL
+ * Computer Networking Conference papers. The diagrams have mistakes in them,
+ * but are mostly correct. Before you modify the code could you read the SDL
+ * diagrams as the code is not obvious and probably very easy to break.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * State machine for state 1, Awaiting Connection State.
+ * The handling of the timer(s) is in file ax25_std_timer.c.
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_std_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+ switch (frametype) {
+ case AX25_SABM:
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_SABME:
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_UA:
+ if (pf) {
+ ax25_calculate_rtt(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+ ax25->vs = 0;
+ ax25->va = 0;
+ ax25->vr = 0;
+ ax25->state = AX25_STATE_3;
+ ax25->n2count = 0;
+ if (ax25->sk != NULL) {
+ bh_lock_sock(ax25->sk);
+ ax25->sk->sk_state = TCP_ESTABLISHED;
+ /* For WAIT_SABM connections we will produce an accept ready socket here */
+ if (!sock_flag(ax25->sk, SOCK_DEAD))
+ ax25->sk->sk_state_change(ax25->sk);
+ bh_unlock_sock(ax25->sk);
+ }
+ }
+ break;
+
+ case AX25_DM:
+ if (pf) {
+ if (ax25->modulus == AX25_MODULUS) {
+ ax25_disconnect(ax25, ECONNREFUSED);
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Release State.
+ * The handling of the timer(s) is in file ax25_std_timer.c
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_std_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+ switch (frametype) {
+ case AX25_SABM:
+ case AX25_SABME:
+ ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_DM:
+ case AX25_UA:
+ if (pf)
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_I:
+ case AX25_REJ:
+ case AX25_RNR:
+ case AX25_RR:
+ if (pf) ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file ax25_std_timer.c
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_std_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+ int queued = 0;
+
+ switch (frametype) {
+ case AX25_SABM:
+ case AX25_SABME:
+ if (frametype == AX25_SABM) {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ } else {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+ }
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+ ax25->condition = 0x00;
+ ax25->vs = 0;
+ ax25->va = 0;
+ ax25->vr = 0;
+ ax25_requeue_frames(ax25);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_DM:
+ ax25_disconnect(ax25, ECONNRESET);
+ break;
+
+ case AX25_RR:
+ case AX25_RNR:
+ if (frametype == AX25_RR)
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+ else
+ ax25->condition |= AX25_COND_PEER_RX_BUSY;
+ if (type == AX25_COMMAND && pf)
+ ax25_std_enquiry_response(ax25);
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_check_iframes_acked(ax25, nr);
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_REJ:
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+ if (type == AX25_COMMAND && pf)
+ ax25_std_enquiry_response(ax25);
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_frames_acked(ax25, nr);
+ ax25_calculate_rtt(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_requeue_frames(ax25);
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_I:
+ if (!ax25_validate_nr(ax25, nr)) {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+ }
+ if (ax25->condition & AX25_COND_PEER_RX_BUSY) {
+ ax25_frames_acked(ax25, nr);
+ } else {
+ ax25_check_iframes_acked(ax25, nr);
+ }
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+ if (pf) ax25_std_enquiry_response(ax25);
+ break;
+ }
+ if (ns == ax25->vr) {
+ ax25->vr = (ax25->vr + 1) % ax25->modulus;
+ queued = ax25_rx_iframe(ax25, skb);
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25->vr = ns; /* ax25->vr - 1 */
+ ax25->condition &= ~AX25_COND_REJECT;
+ if (pf) {
+ ax25_std_enquiry_response(ax25);
+ } else {
+ if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+ ax25->condition |= AX25_COND_ACK_PENDING;
+ ax25_start_t2timer(ax25);
+ }
+ }
+ } else {
+ if (ax25->condition & AX25_COND_REJECT) {
+ if (pf) ax25_std_enquiry_response(ax25);
+ } else {
+ ax25->condition |= AX25_COND_REJECT;
+ ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE);
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ }
+ }
+ break;
+
+ case AX25_FRMR:
+ case AX25_ILLEGAL:
+ ax25_std_establish_data_link(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+
+ default:
+ break;
+ }
+
+ return queued;
+}
+
+/*
+ * State machine for state 4, Timer Recovery State.
+ * The handling of the timer(s) is in file ax25_std_timer.c
+ * Handling of state 0 and connection release is in ax25.c.
+ */
+static int ax25_std_state4_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+ int queued = 0;
+
+ switch (frametype) {
+ case AX25_SABM:
+ case AX25_SABME:
+ if (frametype == AX25_SABM) {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ } else {
+ ax25->modulus = AX25_EMODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+ }
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_start_t3timer(ax25);
+ ax25_start_idletimer(ax25);
+ ax25->condition = 0x00;
+ ax25->vs = 0;
+ ax25->va = 0;
+ ax25->vr = 0;
+ ax25->state = AX25_STATE_3;
+ ax25->n2count = 0;
+ ax25_requeue_frames(ax25);
+ break;
+
+ case AX25_DISC:
+ ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+ ax25_disconnect(ax25, 0);
+ break;
+
+ case AX25_DM:
+ ax25_disconnect(ax25, ECONNRESET);
+ break;
+
+ case AX25_RR:
+ case AX25_RNR:
+ if (frametype == AX25_RR)
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+ else
+ ax25->condition |= AX25_COND_PEER_RX_BUSY;
+ if (type == AX25_RESPONSE && pf) {
+ ax25_stop_t1timer(ax25);
+ ax25->n2count = 0;
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_frames_acked(ax25, nr);
+ if (ax25->vs == ax25->va) {
+ ax25_start_t3timer(ax25);
+ ax25->state = AX25_STATE_3;
+ } else {
+ ax25_requeue_frames(ax25);
+ }
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+ }
+ if (type == AX25_COMMAND && pf)
+ ax25_std_enquiry_response(ax25);
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_frames_acked(ax25, nr);
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_REJ:
+ ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+ if (pf && type == AX25_RESPONSE) {
+ ax25_stop_t1timer(ax25);
+ ax25->n2count = 0;
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_frames_acked(ax25, nr);
+ if (ax25->vs == ax25->va) {
+ ax25_start_t3timer(ax25);
+ ax25->state = AX25_STATE_3;
+ } else {
+ ax25_requeue_frames(ax25);
+ }
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+ }
+ if (type == AX25_COMMAND && pf)
+ ax25_std_enquiry_response(ax25);
+ if (ax25_validate_nr(ax25, nr)) {
+ ax25_frames_acked(ax25, nr);
+ ax25_requeue_frames(ax25);
+ } else {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ }
+ break;
+
+ case AX25_I:
+ if (!ax25_validate_nr(ax25, nr)) {
+ ax25_std_nr_error_recovery(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+ }
+ ax25_frames_acked(ax25, nr);
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+ if (pf)
+ ax25_std_enquiry_response(ax25);
+ break;
+ }
+ if (ns == ax25->vr) {
+ ax25->vr = (ax25->vr + 1) % ax25->modulus;
+ queued = ax25_rx_iframe(ax25, skb);
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25->vr = ns; /* ax25->vr - 1 */
+ ax25->condition &= ~AX25_COND_REJECT;
+ if (pf) {
+ ax25_std_enquiry_response(ax25);
+ } else {
+ if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+ ax25->condition |= AX25_COND_ACK_PENDING;
+ ax25_start_t2timer(ax25);
+ }
+ }
+ } else {
+ if (ax25->condition & AX25_COND_REJECT) {
+ if (pf) ax25_std_enquiry_response(ax25);
+ } else {
+ ax25->condition |= AX25_COND_REJECT;
+ ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE);
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ }
+ }
+ break;
+
+ case AX25_FRMR:
+ case AX25_ILLEGAL:
+ ax25_std_establish_data_link(ax25);
+ ax25->state = AX25_STATE_1;
+ break;
+
+ default:
+ break;
+ }
+
+ return queued;
+}
+
+/*
+ * Higher level upcall for a LAPB frame
+ */
+int ax25_std_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+ int queued = 0, frametype, ns, nr, pf;
+
+ frametype = ax25_decode(ax25, skb, &ns, &nr, &pf);
+
+ switch (ax25->state) {
+ case AX25_STATE_1:
+ queued = ax25_std_state1_machine(ax25, skb, frametype, pf, type);
+ break;
+ case AX25_STATE_2:
+ queued = ax25_std_state2_machine(ax25, skb, frametype, pf, type);
+ break;
+ case AX25_STATE_3:
+ queued = ax25_std_state3_machine(ax25, skb, frametype, ns, nr, pf, type);
+ break;
+ case AX25_STATE_4:
+ queued = ax25_std_state4_machine(ax25, skb, frametype, ns, nr, pf, type);
+ break;
+ }
+
+ ax25_kick(ax25);
+
+ return queued;
+}
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
new file mode 100644
index 000000000..8b66a41e5
--- /dev/null
+++ b/net/ax25/ax25_std_subr.c
@@ -0,0 +1,86 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void ax25_std_nr_error_recovery(ax25_cb *ax25)
+{
+ ax25_std_establish_data_link(ax25);
+}
+
+void ax25_std_establish_data_link(ax25_cb *ax25)
+{
+ ax25->condition = 0x00;
+ ax25->n2count = 0;
+
+ if (ax25->modulus == AX25_MODULUS)
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+ else
+ ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND);
+
+ ax25_calculate_t1(ax25);
+ ax25_stop_idletimer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_start_t1timer(ax25);
+}
+
+void ax25_std_transmit_enquiry(ax25_cb *ax25)
+{
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_COMMAND);
+ else
+ ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_COMMAND);
+
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+}
+
+void ax25_std_enquiry_response(ax25_cb *ax25)
+{
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_RESPONSE);
+ else
+ ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_RESPONSE);
+
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+}
+
+void ax25_std_timeout_response(ax25_cb *ax25)
+{
+ if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+ ax25_send_control(ax25, AX25_RNR, AX25_POLLOFF, AX25_RESPONSE);
+ else
+ ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE);
+
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+}
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
new file mode 100644
index 000000000..004467c9e
--- /dev/null
+++ b/net/ax25/ax25_std_timer.c
@@ -0,0 +1,175 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+void ax25_std_heartbeat_expiry(ax25_cb *ax25)
+{
+ struct sock *sk = ax25->sk;
+
+ if (sk)
+ bh_lock_sock(sk);
+
+ switch (ax25->state) {
+ case AX25_STATE_0:
+ /* Magic here: If we listen() and a new link dies before it
+ is accepted() it isn't 'dead' so doesn't get removed. */
+ if (!sk || sock_flag(sk, SOCK_DESTROY) ||
+ (sk->sk_state == TCP_LISTEN &&
+ sock_flag(sk, SOCK_DEAD))) {
+ if (sk) {
+ sock_hold(sk);
+ ax25_destroy_socket(ax25);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ } else
+ ax25_destroy_socket(ax25);
+ return;
+ }
+ break;
+
+ case AX25_STATE_3:
+ case AX25_STATE_4:
+ /*
+ * Check the state of the receive buffer.
+ */
+ if (sk != NULL) {
+ if (atomic_read(&sk->sk_rmem_alloc) <
+ (sk->sk_rcvbuf >> 1) &&
+ (ax25->condition & AX25_COND_OWN_RX_BUSY)) {
+ ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE);
+ break;
+ }
+ }
+ }
+
+ if (sk)
+ bh_unlock_sock(sk);
+
+ ax25_start_heartbeat(ax25);
+}
+
+void ax25_std_t2timer_expiry(ax25_cb *ax25)
+{
+ if (ax25->condition & AX25_COND_ACK_PENDING) {
+ ax25->condition &= ~AX25_COND_ACK_PENDING;
+ ax25_std_timeout_response(ax25);
+ }
+}
+
+void ax25_std_t3timer_expiry(ax25_cb *ax25)
+{
+ ax25->n2count = 0;
+ ax25_std_transmit_enquiry(ax25);
+ ax25->state = AX25_STATE_4;
+}
+
+void ax25_std_idletimer_expiry(ax25_cb *ax25)
+{
+ ax25_clear_queues(ax25);
+
+ ax25->n2count = 0;
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25->state = AX25_STATE_2;
+
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+
+ if (ax25->sk != NULL) {
+ bh_lock_sock(ax25->sk);
+ ax25->sk->sk_state = TCP_CLOSE;
+ ax25->sk->sk_err = 0;
+ ax25->sk->sk_shutdown |= SEND_SHUTDOWN;
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ ax25->sk->sk_state_change(ax25->sk);
+ sock_set_flag(ax25->sk, SOCK_DEAD);
+ }
+ bh_unlock_sock(ax25->sk);
+ }
+}
+
+void ax25_std_t1timer_expiry(ax25_cb *ax25)
+{
+ switch (ax25->state) {
+ case AX25_STATE_1:
+ if (ax25->n2count == ax25->n2) {
+ if (ax25->modulus == AX25_MODULUS) {
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->modulus = AX25_MODULUS;
+ ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+ ax25->n2count = 0;
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+ }
+ } else {
+ ax25->n2count++;
+ if (ax25->modulus == AX25_MODULUS)
+ ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+ else
+ ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND);
+ }
+ break;
+
+ case AX25_STATE_2:
+ if (ax25->n2count == ax25->n2) {
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->n2count++;
+ ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+ }
+ break;
+
+ case AX25_STATE_3:
+ ax25->n2count = 1;
+ ax25_std_transmit_enquiry(ax25);
+ ax25->state = AX25_STATE_4;
+ break;
+
+ case AX25_STATE_4:
+ if (ax25->n2count == ax25->n2) {
+ ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+ ax25_disconnect(ax25, ETIMEDOUT);
+ return;
+ } else {
+ ax25->n2count++;
+ ax25_std_transmit_enquiry(ax25);
+ }
+ break;
+ }
+
+ ax25_calculate_t1(ax25);
+ ax25_start_t1timer(ax25);
+}
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
new file mode 100644
index 000000000..1997538a5
--- /dev/null
+++ b/net/ax25/ax25_subr.c
@@ -0,0 +1,289 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * This routine purges all the queues of frames.
+ */
+void ax25_clear_queues(ax25_cb *ax25)
+{
+ skb_queue_purge(&ax25->write_queue);
+ skb_queue_purge(&ax25->ack_queue);
+ skb_queue_purge(&ax25->reseq_queue);
+ skb_queue_purge(&ax25->frag_queue);
+}
+
+/*
+ * This routine purges the input queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+ */
+void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Remove all the ack-ed frames from the ack queue.
+ */
+ if (ax25->va != nr) {
+ while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) {
+ skb = skb_dequeue(&ax25->ack_queue);
+ kfree_skb(skb);
+ ax25->va = (ax25->va + 1) % ax25->modulus;
+ }
+ }
+}
+
+void ax25_requeue_frames(ax25_cb *ax25)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Requeue all the un-ack-ed frames on the output queue to be picked
+ * up by ax25_kick called from the timer. This arrangement handles the
+ * possibility of an empty output queue.
+ */
+ while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL)
+ skb_queue_head(&ax25->write_queue, skb);
+}
+
+/*
+ * Validate that the value of nr is between va and vs. Return true or
+ * false for testing.
+ */
+int ax25_validate_nr(ax25_cb *ax25, unsigned short nr)
+{
+ unsigned short vc = ax25->va;
+
+ while (vc != ax25->vs) {
+ if (nr == vc) return 1;
+ vc = (vc + 1) % ax25->modulus;
+ }
+
+ if (nr == ax25->vs) return 1;
+
+ return 0;
+}
+
+/*
+ * This routine is the centralised routine for parsing the control
+ * information for the different frame formats.
+ */
+int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf)
+{
+ unsigned char *frame;
+ int frametype = AX25_ILLEGAL;
+
+ frame = skb->data;
+ *ns = *nr = *pf = 0;
+
+ if (ax25->modulus == AX25_MODULUS) {
+ if ((frame[0] & AX25_S) == 0) {
+ frametype = AX25_I; /* I frame - carries NR/NS/PF */
+ *ns = (frame[0] >> 1) & 0x07;
+ *nr = (frame[0] >> 5) & 0x07;
+ *pf = frame[0] & AX25_PF;
+ } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */
+ frametype = frame[0] & 0x0F;
+ *nr = (frame[0] >> 5) & 0x07;
+ *pf = frame[0] & AX25_PF;
+ } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */
+ frametype = frame[0] & ~AX25_PF;
+ *pf = frame[0] & AX25_PF;
+ }
+ skb_pull(skb, 1);
+ } else {
+ if ((frame[0] & AX25_S) == 0) {
+ frametype = AX25_I; /* I frame - carries NR/NS/PF */
+ *ns = (frame[0] >> 1) & 0x7F;
+ *nr = (frame[1] >> 1) & 0x7F;
+ *pf = frame[1] & AX25_EPF;
+ skb_pull(skb, 2);
+ } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */
+ frametype = frame[0] & 0x0F;
+ *nr = (frame[1] >> 1) & 0x7F;
+ *pf = frame[1] & AX25_EPF;
+ skb_pull(skb, 2);
+ } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */
+ frametype = frame[0] & ~AX25_PF;
+ *pf = frame[0] & AX25_PF;
+ skb_pull(skb, 1);
+ }
+ }
+
+ return frametype;
+}
+
+/*
+ * This routine is called when the HDLC layer internally generates a
+ * command or response for the remote machine ( eg. RR, UA etc. ).
+ * Only supervisory or unnumbered frames are processed.
+ */
+void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+
+ if ((skb = alloc_skb(ax25->ax25_dev->dev->hard_header_len + 2, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len);
+
+ skb_reset_network_header(skb);
+
+ /* Assume a response - address structure for DTE */
+ if (ax25->modulus == AX25_MODULUS) {
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ *dptr |= (poll_bit) ? AX25_PF : 0;
+ if ((frametype & AX25_U) == AX25_S) /* S frames carry NR */
+ *dptr |= (ax25->vr << 5);
+ } else {
+ if ((frametype & AX25_U) == AX25_U) {
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ *dptr |= (poll_bit) ? AX25_PF : 0;
+ } else {
+ dptr = skb_put(skb, 2);
+ dptr[0] = frametype;
+ dptr[1] = (ax25->vr << 1);
+ dptr[1] |= (poll_bit) ? AX25_EPF : 0;
+ }
+ }
+
+ ax25_transmit_buffer(ax25, skb, type);
+}
+
+/*
+ * Send a 'DM' to an unknown connection attempt, or an invalid caller.
+ *
+ * Note: src here is the sender, thus it's the target of the DM
+ */
+void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi)
+{
+ struct sk_buff *skb;
+ char *dptr;
+ ax25_digi retdigi;
+
+ if (dev == NULL)
+ return;
+
+ if ((skb = alloc_skb(dev->hard_header_len + 1, GFP_ATOMIC)) == NULL)
+ return; /* Next SABM will get DM'd */
+
+ skb_reserve(skb, dev->hard_header_len);
+ skb_reset_network_header(skb);
+
+ ax25_digi_invert(digi, &retdigi);
+
+ dptr = skb_put(skb, 1);
+
+ *dptr = AX25_DM | AX25_PF;
+
+ /*
+ * Do the address ourselves
+ */
+ dptr = skb_push(skb, ax25_addr_size(digi));
+ dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS);
+
+ ax25_queue_xmit(skb, dev);
+}
+
+/*
+ * Exponential backoff for AX.25
+ */
+void ax25_calculate_t1(ax25_cb *ax25)
+{
+ int n, t = 2;
+
+ switch (ax25->backoff) {
+ case 0:
+ break;
+
+ case 1:
+ t += 2 * ax25->n2count;
+ break;
+
+ case 2:
+ for (n = 0; n < ax25->n2count; n++)
+ t *= 2;
+ if (t > 8) t = 8;
+ break;
+ }
+
+ ax25->t1 = t * ax25->rtt;
+}
+
+/*
+ * Calculate the Round Trip Time
+ */
+void ax25_calculate_rtt(ax25_cb *ax25)
+{
+ if (ax25->backoff == 0)
+ return;
+
+ if (ax25_t1timer_running(ax25) && ax25->n2count == 0)
+ ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25_display_timer(&ax25->t1timer)) / 10;
+
+ if (ax25->rtt < AX25_T1CLAMPLO)
+ ax25->rtt = AX25_T1CLAMPLO;
+
+ if (ax25->rtt > AX25_T1CLAMPHI)
+ ax25->rtt = AX25_T1CLAMPHI;
+}
+
+void ax25_disconnect(ax25_cb *ax25, int reason)
+{
+ ax25_clear_queues(ax25);
+
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+
+ ax25->state = AX25_STATE_0;
+
+ ax25_link_failed(ax25, reason);
+
+ if (ax25->sk != NULL) {
+ local_bh_disable();
+ bh_lock_sock(ax25->sk);
+ ax25->sk->sk_state = TCP_CLOSE;
+ ax25->sk->sk_err = reason;
+ ax25->sk->sk_shutdown |= SEND_SHUTDOWN;
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ ax25->sk->sk_state_change(ax25->sk);
+ sock_set_flag(ax25->sk, SOCK_DEAD);
+ }
+ bh_unlock_sock(ax25->sk);
+ local_bh_enable();
+ }
+}
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
new file mode 100644
index 000000000..c3cffa79b
--- /dev/null
+++ b/net/ax25/ax25_timer.c
@@ -0,0 +1,226 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static void ax25_heartbeat_expiry(unsigned long);
+static void ax25_t1timer_expiry(unsigned long);
+static void ax25_t2timer_expiry(unsigned long);
+static void ax25_t3timer_expiry(unsigned long);
+static void ax25_idletimer_expiry(unsigned long);
+
+void ax25_setup_timers(ax25_cb *ax25)
+{
+ setup_timer(&ax25->timer, ax25_heartbeat_expiry, (unsigned long)ax25);
+ setup_timer(&ax25->t1timer, ax25_t1timer_expiry, (unsigned long)ax25);
+ setup_timer(&ax25->t2timer, ax25_t2timer_expiry, (unsigned long)ax25);
+ setup_timer(&ax25->t3timer, ax25_t3timer_expiry, (unsigned long)ax25);
+ setup_timer(&ax25->idletimer, ax25_idletimer_expiry,
+ (unsigned long)ax25);
+}
+
+void ax25_start_heartbeat(ax25_cb *ax25)
+{
+ mod_timer(&ax25->timer, jiffies + 5 * HZ);
+}
+
+void ax25_start_t1timer(ax25_cb *ax25)
+{
+ mod_timer(&ax25->t1timer, jiffies + ax25->t1);
+}
+
+void ax25_start_t2timer(ax25_cb *ax25)
+{
+ mod_timer(&ax25->t2timer, jiffies + ax25->t2);
+}
+
+void ax25_start_t3timer(ax25_cb *ax25)
+{
+ if (ax25->t3 > 0)
+ mod_timer(&ax25->t3timer, jiffies + ax25->t3);
+ else
+ del_timer(&ax25->t3timer);
+}
+
+void ax25_start_idletimer(ax25_cb *ax25)
+{
+ if (ax25->idle > 0)
+ mod_timer(&ax25->idletimer, jiffies + ax25->idle);
+ else
+ del_timer(&ax25->idletimer);
+}
+
+void ax25_stop_heartbeat(ax25_cb *ax25)
+{
+ del_timer(&ax25->timer);
+}
+
+void ax25_stop_t1timer(ax25_cb *ax25)
+{
+ del_timer(&ax25->t1timer);
+}
+
+void ax25_stop_t2timer(ax25_cb *ax25)
+{
+ del_timer(&ax25->t2timer);
+}
+
+void ax25_stop_t3timer(ax25_cb *ax25)
+{
+ del_timer(&ax25->t3timer);
+}
+
+void ax25_stop_idletimer(ax25_cb *ax25)
+{
+ del_timer(&ax25->idletimer);
+}
+
+int ax25_t1timer_running(ax25_cb *ax25)
+{
+ return timer_pending(&ax25->t1timer);
+}
+
+unsigned long ax25_display_timer(struct timer_list *timer)
+{
+ if (!timer_pending(timer))
+ return 0;
+
+ return timer->expires - jiffies;
+}
+
+EXPORT_SYMBOL(ax25_display_timer);
+
+static void ax25_heartbeat_expiry(unsigned long param)
+{
+ int proto = AX25_PROTO_STD_SIMPLEX;
+ ax25_cb *ax25 = (ax25_cb *)param;
+
+ if (ax25->ax25_dev)
+ proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL];
+
+ switch (proto) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_heartbeat_expiry(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (ax25->ax25_dev->dama.slave)
+ ax25_ds_heartbeat_expiry(ax25);
+ else
+ ax25_std_heartbeat_expiry(ax25);
+ break;
+#endif
+ }
+}
+
+static void ax25_t1timer_expiry(unsigned long param)
+{
+ ax25_cb *ax25 = (ax25_cb *)param;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_t1timer_expiry(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (!ax25->ax25_dev->dama.slave)
+ ax25_std_t1timer_expiry(ax25);
+ break;
+#endif
+ }
+}
+
+static void ax25_t2timer_expiry(unsigned long param)
+{
+ ax25_cb *ax25 = (ax25_cb *)param;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_t2timer_expiry(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (!ax25->ax25_dev->dama.slave)
+ ax25_std_t2timer_expiry(ax25);
+ break;
+#endif
+ }
+}
+
+static void ax25_t3timer_expiry(unsigned long param)
+{
+ ax25_cb *ax25 = (ax25_cb *)param;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_t3timer_expiry(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (ax25->ax25_dev->dama.slave)
+ ax25_ds_t3timer_expiry(ax25);
+ else
+ ax25_std_t3timer_expiry(ax25);
+ break;
+#endif
+ }
+}
+
+static void ax25_idletimer_expiry(unsigned long param)
+{
+ ax25_cb *ax25 = (ax25_cb *)param;
+
+ switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+ case AX25_PROTO_STD_SIMPLEX:
+ case AX25_PROTO_STD_DUPLEX:
+ ax25_std_idletimer_expiry(ax25);
+ break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ case AX25_PROTO_DAMA_SLAVE:
+ if (ax25->ax25_dev->dama.slave)
+ ax25_ds_idletimer_expiry(ax25);
+ else
+ ax25_std_idletimer_expiry(ax25);
+ break;
+#endif
+ }
+}
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
new file mode 100644
index 000000000..71c4badbc
--- /dev/null
+++ b/net/ax25/ax25_uid.c
@@ -0,0 +1,222 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+/*
+ * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
+ */
+
+static HLIST_HEAD(ax25_uid_list);
+static DEFINE_RWLOCK(ax25_uid_lock);
+
+int ax25_uid_policy;
+
+EXPORT_SYMBOL(ax25_uid_policy);
+
+ax25_uid_assoc *ax25_findbyuid(kuid_t uid)
+{
+ ax25_uid_assoc *ax25_uid, *res = NULL;
+
+ read_lock(&ax25_uid_lock);
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
+ if (uid_eq(ax25_uid->uid, uid)) {
+ ax25_uid_hold(ax25_uid);
+ res = ax25_uid;
+ break;
+ }
+ }
+ read_unlock(&ax25_uid_lock);
+
+ return res;
+}
+
+EXPORT_SYMBOL(ax25_findbyuid);
+
+int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
+{
+ ax25_uid_assoc *ax25_uid;
+ ax25_uid_assoc *user;
+ unsigned long res;
+
+ switch (cmd) {
+ case SIOCAX25GETUID:
+ res = -ENOENT;
+ read_lock(&ax25_uid_lock);
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
+ if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
+ res = from_kuid_munged(current_user_ns(), ax25_uid->uid);
+ break;
+ }
+ }
+ read_unlock(&ax25_uid_lock);
+
+ return res;
+
+ case SIOCAX25ADDUID:
+ {
+ kuid_t sax25_kuid;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ sax25_kuid = make_kuid(current_user_ns(), sax->sax25_uid);
+ if (!uid_valid(sax25_kuid))
+ return -EINVAL;
+ user = ax25_findbyuid(sax25_kuid);
+ if (user) {
+ ax25_uid_put(user);
+ return -EEXIST;
+ }
+ if (sax->sax25_uid == 0)
+ return -EINVAL;
+ if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ atomic_set(&ax25_uid->refcount, 1);
+ ax25_uid->uid = sax25_kuid;
+ ax25_uid->call = sax->sax25_call;
+
+ write_lock(&ax25_uid_lock);
+ hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
+ write_unlock(&ax25_uid_lock);
+
+ return 0;
+ }
+ case SIOCAX25DELUID:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ax25_uid = NULL;
+ write_lock(&ax25_uid_lock);
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
+ if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
+ break;
+ }
+ if (ax25_uid == NULL) {
+ write_unlock(&ax25_uid_lock);
+ return -ENOENT;
+ }
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
+ write_unlock(&ax25_uid_lock);
+
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+
+ return -EINVAL; /*NOTREACHED */
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(ax25_uid_lock)
+{
+ read_lock(&ax25_uid_lock);
+ return seq_hlist_start_head(&ax25_uid_list, *pos);
+}
+
+static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &ax25_uid_list, pos);
+}
+
+static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
+ __releases(ax25_uid_lock)
+{
+ read_unlock(&ax25_uid_lock);
+}
+
+static int ax25_uid_seq_show(struct seq_file *seq, void *v)
+{
+ char buf[11];
+
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
+ else {
+ struct ax25_uid_assoc *pt;
+
+ pt = hlist_entry(v, struct ax25_uid_assoc, uid_node);
+ seq_printf(seq, "%6d %s\n",
+ from_kuid_munged(seq_user_ns(seq), pt->uid),
+ ax2asc(buf, &pt->call));
+ }
+ return 0;
+}
+
+static const struct seq_operations ax25_uid_seqops = {
+ .start = ax25_uid_seq_start,
+ .next = ax25_uid_seq_next,
+ .stop = ax25_uid_seq_stop,
+ .show = ax25_uid_seq_show,
+};
+
+static int ax25_uid_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ax25_uid_seqops);
+}
+
+const struct file_operations ax25_uid_fops = {
+ .owner = THIS_MODULE,
+ .open = ax25_uid_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif
+
+/*
+ * Free all memory associated with UID/Callsign structures.
+ */
+void __exit ax25_uid_free(void)
+{
+ ax25_uid_assoc *ax25_uid;
+
+ write_lock(&ax25_uid_lock);
+again:
+ ax25_uid_for_each(ax25_uid, &ax25_uid_list) {
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
+ goto again;
+ }
+ write_unlock(&ax25_uid_lock);
+}
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
new file mode 100644
index 000000000..919a5ce47
--- /dev/null
+++ b/net/ax25/sysctl_net_ax25.c
@@ -0,0 +1,184 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <net/ax25.h>
+
+static int min_ipdefmode[1], max_ipdefmode[] = {1};
+static int min_axdefmode[1], max_axdefmode[] = {1};
+static int min_backoff[1], max_backoff[] = {2};
+static int min_conmode[1], max_conmode[] = {2};
+static int min_window[] = {1}, max_window[] = {7};
+static int min_ewindow[] = {1}, max_ewindow[] = {63};
+static int min_t1[] = {1}, max_t1[] = {30000};
+static int min_t2[] = {1}, max_t2[] = {20000};
+static int min_t3[1], max_t3[] = {3600000};
+static int min_idle[1], max_idle[] = {65535000};
+static int min_n2[] = {1}, max_n2[] = {31};
+static int min_paclen[] = {1}, max_paclen[] = {512};
+static int min_proto[1], max_proto[] = { AX25_PROTO_MAX };
+#ifdef CONFIG_AX25_DAMA_SLAVE
+static int min_ds_timeout[1], max_ds_timeout[] = {65535000};
+#endif
+
+static const struct ctl_table ax25_param_table[] = {
+ {
+ .procname = "ip_default_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ipdefmode,
+ .extra2 = &max_ipdefmode
+ },
+ {
+ .procname = "ax25_default_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_axdefmode,
+ .extra2 = &max_axdefmode
+ },
+ {
+ .procname = "backoff_type",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_backoff,
+ .extra2 = &max_backoff
+ },
+ {
+ .procname = "connect_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_conmode,
+ .extra2 = &max_conmode
+ },
+ {
+ .procname = "standard_window_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_window,
+ .extra2 = &max_window
+ },
+ {
+ .procname = "extended_window_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ewindow,
+ .extra2 = &max_ewindow
+ },
+ {
+ .procname = "t1_timeout",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t1,
+ .extra2 = &max_t1
+ },
+ {
+ .procname = "t2_timeout",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t2,
+ .extra2 = &max_t2
+ },
+ {
+ .procname = "t3_timeout",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t3,
+ .extra2 = &max_t3
+ },
+ {
+ .procname = "idle_timeout",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_idle,
+ .extra2 = &max_idle
+ },
+ {
+ .procname = "maximum_retry_count",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_n2,
+ .extra2 = &max_n2
+ },
+ {
+ .procname = "maximum_packet_length",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_paclen,
+ .extra2 = &max_paclen
+ },
+ {
+ .procname = "protocol",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_proto,
+ .extra2 = &max_proto
+ },
+#ifdef CONFIG_AX25_DAMA_SLAVE
+ {
+ .procname = "dama_slave_timeout",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ds_timeout,
+ .extra2 = &max_ds_timeout
+ },
+#endif
+
+ { } /* that's all, folks! */
+};
+
+int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
+{
+ char path[sizeof("net/ax25/") + IFNAMSIZ];
+ int k;
+ struct ctl_table *table;
+
+ table = kmemdup(ax25_param_table, sizeof(ax25_param_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ for (k = 0; k < AX25_MAX_VALUES; k++)
+ table[k].data = &ax25_dev->values[k];
+
+ snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name);
+ ax25_dev->sysheader = register_net_sysctl(&init_net, path, table);
+ if (!ax25_dev->sysheader) {
+ kfree(table);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev)
+{
+ struct ctl_table_header *header = ax25_dev->sysheader;
+ struct ctl_table *table;
+
+ if (header) {
+ ax25_dev->sysheader = NULL;
+ table = header->ctl_table_arg;
+ unregister_net_sysctl_table(header);
+ kfree(table);
+ }
+}
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
new file mode 100644
index 000000000..c6fc8f756
--- /dev/null
+++ b/net/batman-adv/Kconfig
@@ -0,0 +1,70 @@
+#
+# B.A.T.M.A.N meshing protocol
+#
+
+config BATMAN_ADV
+ tristate "B.A.T.M.A.N. Advanced Meshing Protocol"
+ depends on NET
+ select CRC16
+ select LIBCRC32C
+ default n
+ help
+ B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+ a routing protocol for multi-hop ad-hoc mesh networks. The
+ networks may be wired or wireless. See
+ http://www.open-mesh.org/ for more information and user space
+ tools.
+
+config BATMAN_ADV_BLA
+ bool "Bridge Loop Avoidance"
+ depends on BATMAN_ADV && INET
+ default y
+ help
+ This option enables BLA (Bridge Loop Avoidance), a mechanism
+ to avoid Ethernet frames looping when mesh nodes are connected
+ to both the same LAN and the same mesh. If you will never use
+ more than one mesh node in the same LAN, you can safely remove
+ this feature and save some space.
+
+config BATMAN_ADV_DAT
+ bool "Distributed ARP Table"
+ depends on BATMAN_ADV && INET
+ default n
+ help
+ This option enables DAT (Distributed ARP Table), a DHT based
+ mechanism that increases ARP reliability on sparse wireless
+ mesh networks. If you think that your network does not need
+ this option you can safely remove it and save some space.
+
+config BATMAN_ADV_NC
+ bool "Network Coding"
+ depends on BATMAN_ADV
+ default n
+ help
+ This option enables network coding, a mechanism that aims to
+ increase the overall network throughput by fusing multiple
+ packets in one transmission.
+ Note that interfaces controlled by batman-adv must be manually
+ configured to have promiscuous mode enabled in order to make
+ network coding work.
+ If you think that your network does not need this feature you
+ can safely disable it and save some space.
+
+config BATMAN_ADV_MCAST
+ bool "Multicast optimisation"
+ depends on BATMAN_ADV
+ default n
+ help
+ This option enables the multicast optimisation which aims to
+ reduce the air overhead while improving the reliability of
+ multicast messages.
+
+config BATMAN_ADV_DEBUG
+ bool "B.A.T.M.A.N. debugging"
+ depends on BATMAN_ADV
+ depends on DEBUG_FS
+ help
+ This is an option for use by developers; most people should
+ say N here. This enables compilation of support for
+ outputting debugging information to the kernel log. The
+ output is controlled via the module parameter debug.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
new file mode 100644
index 000000000..eb7d8c038
--- /dev/null
+++ b/net/batman-adv/Makefile
@@ -0,0 +1,39 @@
+#
+# Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+#
+# Marek Lindner, Simon Wunderlich
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+
+obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
+batman-adv-y += bat_iv_ogm.o
+batman-adv-y += bitarray.o
+batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
+batman-adv-y += debugfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
+batman-adv-y += fragmentation.o
+batman-adv-y += gateway_client.o
+batman-adv-y += gateway_common.o
+batman-adv-y += hard-interface.o
+batman-adv-y += hash.o
+batman-adv-y += icmp_socket.o
+batman-adv-y += main.o
+batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
+batman-adv-y += originator.o
+batman-adv-y += routing.o
+batman-adv-y += send.o
+batman-adv-y += soft-interface.o
+batman-adv-y += sysfs.o
+batman-adv-y += translation-table.o
+batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
new file mode 100644
index 000000000..4e49666f8
--- /dev/null
+++ b/net/batman-adv/bat_algo.h
@@ -0,0 +1,23 @@
+/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
+#define _NET_BATMAN_ADV_BAT_ALGO_H_
+
+int batadv_iv_init(void);
+
+#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
new file mode 100644
index 000000000..00e00e09b
--- /dev/null
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -0,0 +1,1979 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "routing.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "send.h"
+#include "bat_algo.h"
+#include "network-coding.h"
+
+/**
+ * enum batadv_dup_status - duplicate status
+ * @BATADV_NO_DUP: the packet is a duplicate
+ * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for the
+ * neighbor)
+ * @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor
+ * @BATADV_PROTECTED: originator is currently protected (after reboot)
+ */
+enum batadv_dup_status {
+ BATADV_NO_DUP = 0,
+ BATADV_ORIG_DUP,
+ BATADV_NEIGH_DUP,
+ BATADV_PROTECTED,
+};
+
+/**
+ * batadv_ring_buffer_set - update the ring buffer with the given value
+ * @lq_recv: pointer to the ring buffer
+ * @lq_index: index to store the value at
+ * @value: value to store in the ring buffer
+ */
+static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index,
+ uint8_t value)
+{
+ lq_recv[*lq_index] = value;
+ *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE;
+}
+
+/**
+ * batadv_ring_buffer_set - compute the average of all non-zero values stored
+ * in the given ring buffer
+ * @lq_recv: pointer to the ring buffer
+ *
+ * Returns computed average value.
+ */
+static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[])
+{
+ const uint8_t *ptr;
+ uint16_t count = 0, i = 0, sum = 0;
+
+ ptr = lq_recv;
+
+ while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) {
+ if (*ptr != 0) {
+ count++;
+ sum += *ptr;
+ }
+
+ i++;
+ ptr++;
+ }
+
+ if (count == 0)
+ return 0;
+
+ return (uint8_t)(sum / count);
+}
+
+/**
+ * batadv_iv_ogm_orig_free - free the private resources allocated for this
+ * orig_node
+ * @orig_node: the orig_node for which the resources have to be free'd
+ */
+static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
+{
+ kfree(orig_node->bat_iv.bcast_own);
+ kfree(orig_node->bat_iv.bcast_own_sum);
+}
+
+/**
+ * batadv_iv_ogm_orig_add_if - change the private structures of the orig_node to
+ * include the new hard-interface
+ * @orig_node: the orig_node that has to be changed
+ * @max_if_num: the current amount of interfaces
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
+ int max_if_num)
+{
+ void *data_ptr;
+ size_t old_size;
+ int ret = -ENOMEM;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS;
+ data_ptr = kmalloc_array(max_if_num,
+ BATADV_NUM_WORDS * sizeof(unsigned long),
+ GFP_ATOMIC);
+ if (!data_ptr)
+ goto unlock;
+
+ memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size);
+ kfree(orig_node->bat_iv.bcast_own);
+ orig_node->bat_iv.bcast_own = data_ptr;
+
+ data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC);
+ if (!data_ptr) {
+ kfree(orig_node->bat_iv.bcast_own);
+ goto unlock;
+ }
+
+ memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
+ (max_if_num - 1) * sizeof(uint8_t));
+ kfree(orig_node->bat_iv.bcast_own_sum);
+ orig_node->bat_iv.bcast_own_sum = data_ptr;
+
+ ret = 0;
+
+unlock:
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to
+ * exclude the removed interface
+ * @orig_node: the orig_node that has to be changed
+ * @max_if_num: the current amount of interfaces
+ * @del_if_num: the index of the interface being removed
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
+ int max_if_num, int del_if_num)
+{
+ int chunk_size, ret = -ENOMEM, if_offset;
+ void *data_ptr = NULL;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ /* last interface was removed */
+ if (max_if_num == 0)
+ goto free_bcast_own;
+
+ chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS;
+ data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC);
+ if (!data_ptr)
+ goto unlock;
+
+ /* copy first part */
+ memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
+
+ /* copy second part */
+ memcpy((char *)data_ptr + del_if_num * chunk_size,
+ orig_node->bat_iv.bcast_own + ((del_if_num + 1) * chunk_size),
+ (max_if_num - del_if_num) * chunk_size);
+
+free_bcast_own:
+ kfree(orig_node->bat_iv.bcast_own);
+ orig_node->bat_iv.bcast_own = data_ptr;
+
+ if (max_if_num == 0)
+ goto free_own_sum;
+
+ data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC);
+ if (!data_ptr) {
+ kfree(orig_node->bat_iv.bcast_own);
+ goto unlock;
+ }
+
+ memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
+ del_if_num * sizeof(uint8_t));
+
+ if_offset = (del_if_num + 1) * sizeof(uint8_t);
+ memcpy((char *)data_ptr + del_if_num * sizeof(uint8_t),
+ orig_node->bat_iv.bcast_own_sum + if_offset,
+ (max_if_num - del_if_num) * sizeof(uint8_t));
+
+free_own_sum:
+ kfree(orig_node->bat_iv.bcast_own_sum);
+ orig_node->bat_iv.bcast_own_sum = data_ptr;
+
+ ret = 0;
+unlock:
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_orig_get - retrieve or create (if does not exist) an originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: mac address of the originator
+ *
+ * Returns the originator object corresponding to the passed mac address or NULL
+ * on failure.
+ * If the object does not exists it is created an initialised.
+ */
+static struct batadv_orig_node *
+batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr)
+{
+ struct batadv_orig_node *orig_node;
+ int size, hash_added;
+
+ orig_node = batadv_orig_hash_find(bat_priv, addr);
+ if (orig_node)
+ return orig_node;
+
+ orig_node = batadv_orig_node_new(bat_priv, addr);
+ if (!orig_node)
+ return NULL;
+
+ spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock);
+
+ size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS;
+ orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC);
+ if (!orig_node->bat_iv.bcast_own)
+ goto free_orig_node;
+
+ size = bat_priv->num_ifaces * sizeof(uint8_t);
+ orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC);
+ if (!orig_node->bat_iv.bcast_own_sum)
+ goto free_orig_node;
+
+ hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+ batadv_choose_orig, orig_node,
+ &orig_node->hash_entry);
+ if (hash_added != 0)
+ goto free_orig_node;
+
+ return orig_node;
+
+free_orig_node:
+ /* free twice, as batadv_orig_node_new sets refcount to 2 */
+ batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_free_ref(orig_node);
+
+ return NULL;
+}
+
+static struct batadv_neigh_node *
+batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface,
+ const uint8_t *neigh_addr,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_neigh_node *neigh_node, *tmp_neigh_node;
+
+ neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node);
+ if (!neigh_node)
+ goto out;
+
+ if (!atomic_inc_not_zero(&hard_iface->refcount)) {
+ kfree(neigh_node);
+ neigh_node = NULL;
+ goto out;
+ }
+
+ neigh_node->orig_node = orig_neigh;
+ neigh_node->if_incoming = hard_iface;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface,
+ neigh_addr);
+ if (!tmp_neigh_node) {
+ hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+ } else {
+ kfree(neigh_node);
+ batadv_hardif_free_ref(hard_iface);
+ neigh_node = tmp_neigh_node;
+ }
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ if (!tmp_neigh_node)
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Creating new neighbor %pM for orig_node %pM on interface %s\n",
+ neigh_addr, orig_node->orig,
+ hard_iface->net_dev->name);
+
+out:
+ return neigh_node;
+}
+
+static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ unsigned char *ogm_buff;
+ uint32_t random_seqno;
+ int res = -ENOMEM;
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno);
+
+ hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN;
+ ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC);
+ if (!ogm_buff)
+ goto out;
+
+ hard_iface->bat_iv.ogm_buff = ogm_buff;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ batadv_ogm_packet->packet_type = BATADV_IV_OGM;
+ batadv_ogm_packet->version = BATADV_COMPAT_VERSION;
+ batadv_ogm_packet->ttl = 2;
+ batadv_ogm_packet->flags = BATADV_NO_FLAGS;
+ batadv_ogm_packet->reserved = 0;
+ batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE;
+
+ res = 0;
+
+out:
+ return res;
+}
+
+static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ kfree(hard_iface->bat_iv.ogm_buff);
+ hard_iface->bat_iv.ogm_buff = NULL;
+}
+
+static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ ether_addr_copy(batadv_ogm_packet->orig,
+ hard_iface->net_dev->dev_addr);
+ ether_addr_copy(batadv_ogm_packet->prev_sender,
+ hard_iface->net_dev->dev_addr);
+}
+
+static void
+batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ batadv_ogm_packet->flags = BATADV_PRIMARIES_FIRST_HOP;
+ batadv_ogm_packet->ttl = BATADV_TTL;
+}
+
+/* when do we schedule our own ogm to be sent */
+static unsigned long
+batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv)
+{
+ unsigned int msecs;
+
+ msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
+ msecs += prandom_u32() % (2 * BATADV_JITTER);
+
+ return jiffies + msecs_to_jiffies(msecs);
+}
+
+/* when do we schedule a ogm packet to be sent */
+static unsigned long batadv_iv_ogm_fwd_send_time(void)
+{
+ return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2));
+}
+
+/* apply hop penalty for a normal link */
+static uint8_t batadv_hop_penalty(uint8_t tq,
+ const struct batadv_priv *bat_priv)
+{
+ int hop_penalty = atomic_read(&bat_priv->hop_penalty);
+ int new_tq;
+
+ new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty);
+ new_tq /= BATADV_TQ_MAX_VALUE;
+
+ return new_tq;
+}
+
+/* is there another aggregated packet here? */
+static int batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
+ __be16 tvlv_len)
+{
+ int next_buff_pos = 0;
+
+ next_buff_pos += buff_pos + BATADV_OGM_HLEN;
+ next_buff_pos += ntohs(tvlv_len);
+
+ return (next_buff_pos <= packet_len) &&
+ (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+}
+
+/* send a batman ogm to a given interface */
+static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ char *fwd_str;
+ uint8_t packet_num;
+ int16_t buff_pos;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct sk_buff *skb;
+ uint8_t *packet_pos;
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ return;
+
+ packet_num = 0;
+ buff_pos = 0;
+ packet_pos = forw_packet->skb->data;
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+
+ /* adjust all flags and log packets */
+ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len,
+ batadv_ogm_packet->tvlv_len)) {
+ /* we might have aggregated direct link packets with an
+ * ordinary base packet
+ */
+ if (forw_packet->direct_link_flags & BIT(packet_num) &&
+ forw_packet->if_incoming == hard_iface)
+ batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
+ else
+ batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK;
+
+ if (packet_num > 0 || !forw_packet->own)
+ fwd_str = "Forwarding";
+ else
+ fwd_str = "Sending own";
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n",
+ fwd_str, (packet_num > 0 ? "aggregated " : ""),
+ batadv_ogm_packet->orig,
+ ntohl(batadv_ogm_packet->seqno),
+ batadv_ogm_packet->tq, batadv_ogm_packet->ttl,
+ (batadv_ogm_packet->flags & BATADV_DIRECTLINK ?
+ "on" : "off"),
+ hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr);
+
+ buff_pos += BATADV_OGM_HLEN;
+ buff_pos += ntohs(batadv_ogm_packet->tvlv_len);
+ packet_num++;
+ packet_pos = forw_packet->skb->data + buff_pos;
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+ }
+
+ /* create clone because function is called more than once */
+ skb = skb_clone(forw_packet->skb, GFP_ATOMIC);
+ if (skb) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
+ skb->len + ETH_HLEN);
+ batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+ }
+}
+
+/* send a batman ogm packet */
+static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
+{
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+
+ if (!forw_packet->if_incoming) {
+ pr_err("Error - can't forward packet: incoming iface not specified\n");
+ goto out;
+ }
+
+ soft_iface = forw_packet->if_incoming->soft_iface;
+ bat_priv = netdev_priv(soft_iface);
+
+ if (WARN_ON(!forw_packet->if_outgoing))
+ goto out;
+
+ if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface))
+ goto out;
+
+ if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE)
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* only for one specific outgoing interface */
+ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing);
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+/**
+ * batadv_iv_ogm_can_aggregate - find out if an OGM can be aggregated on an
+ * existing forward packet
+ * @new_bat_ogm_packet: OGM packet to be aggregated
+ * @bat_priv: the bat priv with all the soft interface information
+ * @packet_len: (total) length of the OGM
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ * @directlink: true if this is a direct link packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @forw_packet: the forwarded packet which should be checked
+ *
+ * Returns true if new_packet can be aggregated with forw_packet
+ */
+static bool
+batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
+ struct batadv_priv *bat_priv,
+ int packet_len, unsigned long send_time,
+ bool directlink,
+ const struct batadv_hard_iface *if_incoming,
+ const struct batadv_hard_iface *if_outgoing,
+ const struct batadv_forw_packet *forw_packet)
+{
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ int aggregated_bytes = forw_packet->packet_len + packet_len;
+ struct batadv_hard_iface *primary_if = NULL;
+ bool res = false;
+ unsigned long aggregation_end_time;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data;
+ aggregation_end_time = send_time;
+ aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
+
+ /* we can aggregate the current packet to this aggregated packet
+ * if:
+ *
+ * - the send time is within our MAX_AGGREGATION_MS time
+ * - the resulting packet wont be bigger than
+ * MAX_AGGREGATION_BYTES
+ */
+ if (time_before(send_time, forw_packet->send_time) &&
+ time_after_eq(aggregation_end_time, forw_packet->send_time) &&
+ (aggregated_bytes <= BATADV_MAX_AGGREGATION_BYTES)) {
+ /* check aggregation compatibility
+ * -> direct link packets are broadcasted on
+ * their interface only
+ * -> aggregate packet if the current packet is
+ * a "global" packet as well as the base
+ * packet
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* packet is not leaving on the same interface. */
+ if (forw_packet->if_outgoing != if_outgoing)
+ goto out;
+
+ /* packets without direct link flag and high TTL
+ * are flooded through the net
+ */
+ if ((!directlink) &&
+ (!(batadv_ogm_packet->flags & BATADV_DIRECTLINK)) &&
+ (batadv_ogm_packet->ttl != 1) &&
+
+ /* own packets originating non-primary
+ * interfaces leave only that interface
+ */
+ ((!forw_packet->own) ||
+ (forw_packet->if_incoming == primary_if))) {
+ res = true;
+ goto out;
+ }
+
+ /* if the incoming packet is sent via this one
+ * interface only - we still can aggregate
+ */
+ if ((directlink) &&
+ (new_bat_ogm_packet->ttl == 1) &&
+ (forw_packet->if_incoming == if_incoming) &&
+
+ /* packets from direct neighbors or
+ * own secondary interface packets
+ * (= secondary interface packets in general)
+ */
+ (batadv_ogm_packet->flags & BATADV_DIRECTLINK ||
+ (forw_packet->own &&
+ forw_packet->if_incoming != primary_if))) {
+ res = true;
+ goto out;
+ }
+ }
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return res;
+}
+
+/**
+ * batadv_iv_ogm_aggregate_new - create a new aggregated packet and add this
+ * packet to it.
+ * @packet_buff: pointer to the OGM
+ * @packet_len: (total) length of the OGM
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ * @direct_link: whether this OGM has direct link status
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @own_packet: true if it is a self-generated ogm
+ */
+static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
+ int packet_len, unsigned long send_time,
+ bool direct_link,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ int own_packet)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_forw_packet *forw_packet_aggr;
+ unsigned char *skb_buff;
+ unsigned int skb_size;
+
+ if (!atomic_inc_not_zero(&if_incoming->refcount))
+ return;
+
+ if (!atomic_inc_not_zero(&if_outgoing->refcount))
+ goto out_free_incoming;
+
+ /* own packet should always be scheduled */
+ if (!own_packet) {
+ if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "batman packet queue full\n");
+ goto out;
+ }
+ }
+
+ forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC);
+ if (!forw_packet_aggr) {
+ if (!own_packet)
+ atomic_inc(&bat_priv->batman_queue_left);
+ goto out;
+ }
+
+ if ((atomic_read(&bat_priv->aggregated_ogms)) &&
+ (packet_len < BATADV_MAX_AGGREGATION_BYTES))
+ skb_size = BATADV_MAX_AGGREGATION_BYTES;
+ else
+ skb_size = packet_len;
+
+ skb_size += ETH_HLEN;
+
+ forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size);
+ if (!forw_packet_aggr->skb) {
+ if (!own_packet)
+ atomic_inc(&bat_priv->batman_queue_left);
+ kfree(forw_packet_aggr);
+ goto out;
+ }
+ forw_packet_aggr->skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(forw_packet_aggr->skb, ETH_HLEN);
+
+ skb_buff = skb_put(forw_packet_aggr->skb, packet_len);
+ forw_packet_aggr->packet_len = packet_len;
+ memcpy(skb_buff, packet_buff, packet_len);
+
+ forw_packet_aggr->own = own_packet;
+ forw_packet_aggr->if_incoming = if_incoming;
+ forw_packet_aggr->if_outgoing = if_outgoing;
+ forw_packet_aggr->num_packets = 0;
+ forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS;
+ forw_packet_aggr->send_time = send_time;
+
+ /* save packet direct link flag status */
+ if (direct_link)
+ forw_packet_aggr->direct_link_flags |= 1;
+
+ /* add new packet to packet list */
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* start timer for this packet */
+ INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
+ batadv_send_outstanding_bat_ogm_packet);
+ queue_delayed_work(batadv_event_workqueue,
+ &forw_packet_aggr->delayed_work,
+ send_time - jiffies);
+
+ return;
+out:
+ batadv_hardif_free_ref(if_outgoing);
+out_free_incoming:
+ batadv_hardif_free_ref(if_incoming);
+}
+
+/* aggregate a new packet into the existing ogm packet */
+static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr,
+ const unsigned char *packet_buff,
+ int packet_len, bool direct_link)
+{
+ unsigned char *skb_buff;
+ unsigned long new_direct_link_flag;
+
+ skb_buff = skb_put(forw_packet_aggr->skb, packet_len);
+ memcpy(skb_buff, packet_buff, packet_len);
+ forw_packet_aggr->packet_len += packet_len;
+ forw_packet_aggr->num_packets++;
+
+ /* save packet direct link flag status */
+ if (direct_link) {
+ new_direct_link_flag = BIT(forw_packet_aggr->num_packets);
+ forw_packet_aggr->direct_link_flags |= new_direct_link_flag;
+ }
+}
+
+/**
+ * batadv_iv_ogm_queue_add - queue up an OGM for transmission
+ * @bat_priv: the bat priv with all the soft interface information
+ * @packet_buff: pointer to the OGM
+ * @packet_len: (total) length of the OGM
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @own_packet: true if it is a self-generated ogm
+ * @send_time: timestamp (jiffies) when the packet is to be sent
+ */
+static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv,
+ unsigned char *packet_buff,
+ int packet_len,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ int own_packet, unsigned long send_time)
+{
+ /* _aggr -> pointer to the packet we want to aggregate with
+ * _pos -> pointer to the position in the queue
+ */
+ struct batadv_forw_packet *forw_packet_aggr = NULL;
+ struct batadv_forw_packet *forw_packet_pos = NULL;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ bool direct_link;
+ unsigned long max_aggregation_jiffies;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff;
+ direct_link = batadv_ogm_packet->flags & BATADV_DIRECTLINK ? 1 : 0;
+ max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
+
+ /* find position for the packet in the forward queue */
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ /* own packets are not to be aggregated */
+ if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) {
+ hlist_for_each_entry(forw_packet_pos,
+ &bat_priv->forw_bat_list, list) {
+ if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet,
+ bat_priv, packet_len,
+ send_time, direct_link,
+ if_incoming,
+ if_outgoing,
+ forw_packet_pos)) {
+ forw_packet_aggr = forw_packet_pos;
+ break;
+ }
+ }
+ }
+
+ /* nothing to aggregate with - either aggregation disabled or no
+ * suitable aggregation packet found
+ */
+ if (!forw_packet_aggr) {
+ /* the following section can run without the lock */
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* if we could not aggregate this packet with one of the others
+ * we hold it back for a while, so that it might be aggregated
+ * later on
+ */
+ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms))
+ send_time += max_aggregation_jiffies;
+
+ batadv_iv_ogm_aggregate_new(packet_buff, packet_len,
+ send_time, direct_link,
+ if_incoming, if_outgoing,
+ own_packet);
+ } else {
+ batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff,
+ packet_len, direct_link);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+ }
+}
+
+static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
+ const struct ethhdr *ethhdr,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ bool is_single_hop_neigh,
+ bool is_from_best_next_hop,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ uint16_t tvlv_len;
+
+ if (batadv_ogm_packet->ttl <= 1) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
+ return;
+ }
+
+ if (!is_from_best_next_hop) {
+ /* Mark the forwarded packet when it is not coming from our
+ * best next hop. We still need to forward the packet for our
+ * neighbor link quality detection to work in case the packet
+ * originated from a single hop neighbor. Otherwise we can
+ * simply drop the ogm.
+ */
+ if (is_single_hop_neigh)
+ batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP;
+ else
+ return;
+ }
+
+ tvlv_len = ntohs(batadv_ogm_packet->tvlv_len);
+
+ batadv_ogm_packet->ttl--;
+ ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source);
+
+ /* apply hop penalty */
+ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq,
+ bat_priv);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: tq: %i, ttl: %i\n",
+ batadv_ogm_packet->tq, batadv_ogm_packet->ttl);
+
+ /* switch of primaries first hop flag when forwarding */
+ batadv_ogm_packet->flags &= ~BATADV_PRIMARIES_FIRST_HOP;
+ if (is_single_hop_neigh)
+ batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
+ else
+ batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK;
+
+ batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet,
+ BATADV_OGM_HLEN + tvlv_len,
+ if_incoming, if_outgoing, 0,
+ batadv_iv_ogm_fwd_send_time());
+}
+
+/**
+ * batadv_iv_ogm_slide_own_bcast_window - bitshift own OGM broadcast windows for
+ * the given interface
+ * @hard_iface: the interface for which the windows have to be shifted
+ */
+static void
+batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ unsigned long *word;
+ uint32_t i;
+ size_t word_index;
+ uint8_t *w;
+ int if_num;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ word_index = hard_iface->if_num * BATADV_NUM_WORDS;
+ word = &orig_node->bat_iv.bcast_own[word_index];
+
+ batadv_bit_get_packet(bat_priv, word, 1, 0);
+ if_num = hard_iface->if_num;
+ w = &orig_node->bat_iv.bcast_own_sum[if_num];
+ *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct batadv_hard_iface *primary_if, *tmp_hard_iface;
+ int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len;
+ uint32_t seqno;
+ uint16_t tvlv_len = 0;
+ unsigned long send_time;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (hard_iface == primary_if) {
+ /* tt changes have to be committed before the tvlv data is
+ * appended as it may alter the tt tvlv container
+ */
+ batadv_tt_local_commit_changes(bat_priv);
+ tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff,
+ ogm_buff_len,
+ BATADV_OGM_HLEN);
+ }
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff);
+ batadv_ogm_packet->tvlv_len = htons(tvlv_len);
+
+ /* change sequence number to network order */
+ seqno = (uint32_t)atomic_read(&hard_iface->bat_iv.ogm_seqno);
+ batadv_ogm_packet->seqno = htonl(seqno);
+ atomic_inc(&hard_iface->bat_iv.ogm_seqno);
+
+ batadv_iv_ogm_slide_own_bcast_window(hard_iface);
+
+ send_time = batadv_iv_ogm_emit_send_time(bat_priv);
+
+ if (hard_iface != primary_if) {
+ /* OGMs from secondary interfaces are only scheduled on their
+ * respective interfaces.
+ */
+ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len,
+ hard_iface, hard_iface, 1, send_time);
+ goto out;
+ }
+
+ /* OGMs from primary interfaces are scheduled on all
+ * interfaces.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) {
+ if (tmp_hard_iface->soft_iface != hard_iface->soft_iface)
+ continue;
+ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff,
+ *ogm_buff_len, hard_iface,
+ tmp_hard_iface, 1, send_time);
+ }
+ rcu_read_unlock();
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+/**
+ * batadv_iv_ogm_orig_update - use OGM to update corresponding data in an
+ * originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the orig node who originally emitted the ogm packet
+ * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node
+ * @ethhdr: Ethernet header of the OGM
+ * @batadv_ogm_packet: the ogm packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ * @dup_status: the duplicate status of this ogm packet.
+ */
+static void
+batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_ifinfo *orig_ifinfo,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ enum batadv_dup_status dup_status)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_orig_node *orig_node_tmp;
+ int if_num;
+ uint8_t sum_orig, sum_neigh;
+ uint8_t *neigh_addr;
+ uint8_t tq_avg;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "update_originator(): Searching and updating originator entry of received packet\n");
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node,
+ &orig_node->neigh_list, list) {
+ neigh_addr = tmp_neigh_node->addr;
+ if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
+ tmp_neigh_node->if_incoming == if_incoming &&
+ atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+ if (WARN(neigh_node, "too many matching neigh_nodes"))
+ batadv_neigh_node_free_ref(neigh_node);
+ neigh_node = tmp_neigh_node;
+ continue;
+ }
+
+ if (dup_status != BATADV_NO_DUP)
+ continue;
+
+ /* only update the entry for this outgoing interface */
+ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node,
+ if_outgoing);
+ if (!neigh_ifinfo)
+ continue;
+
+ spin_lock_bh(&tmp_neigh_node->ifinfo_lock);
+ batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv,
+ &neigh_ifinfo->bat_iv.tq_index, 0);
+ tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv);
+ neigh_ifinfo->bat_iv.tq_avg = tq_avg;
+ spin_unlock_bh(&tmp_neigh_node->ifinfo_lock);
+
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ neigh_ifinfo = NULL;
+ }
+
+ if (!neigh_node) {
+ struct batadv_orig_node *orig_tmp;
+
+ orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source);
+ if (!orig_tmp)
+ goto unlock;
+
+ neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
+ ethhdr->h_source,
+ orig_node, orig_tmp);
+
+ batadv_orig_node_free_ref(orig_tmp);
+ if (!neigh_node)
+ goto unlock;
+ } else
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Updating existing last-hop neighbor of originator\n");
+
+ rcu_read_unlock();
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ neigh_node->last_seen = jiffies;
+
+ spin_lock_bh(&neigh_node->ifinfo_lock);
+ batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv,
+ &neigh_ifinfo->bat_iv.tq_index,
+ batadv_ogm_packet->tq);
+ tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv);
+ neigh_ifinfo->bat_iv.tq_avg = tq_avg;
+ spin_unlock_bh(&neigh_node->ifinfo_lock);
+
+ if (dup_status == BATADV_NO_DUP) {
+ orig_ifinfo->last_ttl = batadv_ogm_packet->ttl;
+ neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl;
+ }
+
+ /* if this neighbor already is our next hop there is nothing
+ * to change
+ */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router == neigh_node)
+ goto out;
+
+ if (router) {
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ if (!router_ifinfo)
+ goto out;
+
+ /* if this neighbor does not offer a better TQ we won't
+ * consider it
+ */
+ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg)
+ goto out;
+ }
+
+ /* if the TQ is the same and the link not more symmetric we
+ * won't consider it either
+ */
+ if (router_ifinfo &&
+ (neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg)) {
+ orig_node_tmp = router->orig_node;
+ spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
+ if_num = router->if_incoming->if_num;
+ sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
+ spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
+
+ orig_node_tmp = neigh_node->orig_node;
+ spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
+ if_num = neigh_node->if_incoming->if_num;
+ sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
+ spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
+
+ if (sum_orig >= sum_neigh)
+ goto out;
+ }
+
+ batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (neigh_node)
+ batadv_neigh_node_free_ref(neigh_node);
+ if (router)
+ batadv_neigh_node_free_ref(router);
+ if (neigh_ifinfo)
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_ifinfo);
+}
+
+/**
+ * batadv_iv_ogm_calc_tq - calculate tq for current received ogm packet
+ * @orig_node: the orig node who originally emitted the ogm packet
+ * @orig_neigh_node: the orig node struct of the neighbor who sent the packet
+ * @batadv_ogm_packet: the ogm packet
+ * @if_incoming: interface where the packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ *
+ * Returns 1 if the link can be considered bidirectional, 0 otherwise
+ */
+static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ uint8_t total_count;
+ uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
+ unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
+ int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0;
+ unsigned int combined_tq;
+ int tq_iface_penalty;
+
+ /* find corresponding one hop neighbor */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node,
+ &orig_neigh_node->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_neigh_node->addr,
+ orig_neigh_node->orig))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != if_incoming)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ neigh_node = tmp_neigh_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!neigh_node)
+ neigh_node = batadv_iv_ogm_neigh_new(if_incoming,
+ orig_neigh_node->orig,
+ orig_neigh_node,
+ orig_neigh_node);
+
+ if (!neigh_node)
+ goto out;
+
+ /* if orig_node is direct neighbor update neigh_node last_seen */
+ if (orig_node == orig_neigh_node)
+ neigh_node->last_seen = jiffies;
+
+ orig_node->last_seen = jiffies;
+
+ /* find packet count of corresponding one hop neighbor */
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ if_num = if_incoming->if_num;
+ orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num];
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (neigh_ifinfo) {
+ neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ } else {
+ neigh_rq_count = 0;
+ }
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ /* pay attention to not get a value bigger than 100 % */
+ if (orig_eq_count > neigh_rq_count)
+ total_count = neigh_rq_count;
+ else
+ total_count = orig_eq_count;
+
+ /* if we have too few packets (too less data) we set tq_own to zero
+ * if we receive too few packets it is not considered bidirectional
+ */
+ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM ||
+ neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM)
+ tq_own = 0;
+ else
+ /* neigh_node->real_packet_count is never zero as we
+ * only purge old information when getting new
+ * information
+ */
+ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count;
+
+ /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does
+ * affect the nearly-symmetric links only a little, but
+ * punishes asymmetric links more. This will give a value
+ * between 0 and TQ_MAX_VALUE
+ */
+ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count;
+ neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv;
+ neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE *
+ BATADV_TQ_LOCAL_WINDOW_SIZE *
+ BATADV_TQ_LOCAL_WINDOW_SIZE;
+ inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube;
+ inv_asym_penalty /= neigh_rq_max_cube;
+ tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty;
+
+ /* penalize if the OGM is forwarded on the same interface. WiFi
+ * interfaces and other half duplex devices suffer from throughput
+ * drops as they can't send and receive at the same time.
+ */
+ tq_iface_penalty = BATADV_TQ_MAX_VALUE;
+ if (if_outgoing && (if_incoming == if_outgoing) &&
+ batadv_is_wifi_netdev(if_outgoing->net_dev))
+ tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
+ bat_priv);
+
+ combined_tq = batadv_ogm_packet->tq *
+ tq_own *
+ tq_asym_penalty *
+ tq_iface_penalty;
+ combined_tq /= BATADV_TQ_MAX_VALUE *
+ BATADV_TQ_MAX_VALUE *
+ BATADV_TQ_MAX_VALUE;
+ batadv_ogm_packet->tq = combined_tq;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "bidirectional: orig = %-15pM neigh = %-15pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n",
+ orig_node->orig, orig_neigh_node->orig, total_count,
+ neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty,
+ batadv_ogm_packet->tq, if_incoming->net_dev->name,
+ if_outgoing ? if_outgoing->net_dev->name : "DEFAULT");
+
+ /* if link has the minimum required transmission quality
+ * consider it bidirectional
+ */
+ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT)
+ ret = 1;
+
+out:
+ if (neigh_node)
+ batadv_neigh_node_free_ref(neigh_node);
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_update_seqnos - process a batman packet for all interfaces,
+ * adjust the sequence number and find out whether it is a duplicate
+ * @ethhdr: ethernet header of the packet
+ * @batadv_ogm_packet: OGM packet to be considered
+ * @if_incoming: interface on which the OGM packet was received
+ * @if_outgoing: interface for which the retransmission should be considered
+ *
+ * Returns duplicate status as enum batadv_dup_status
+ */
+static enum batadv_dup_status
+batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
+ const struct batadv_ogm_packet *batadv_ogm_packet,
+ const struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ int is_dup;
+ int32_t seq_diff;
+ int need_update = 0;
+ int set_mark;
+ enum batadv_dup_status ret = BATADV_NO_DUP;
+ uint32_t seqno = ntohl(batadv_ogm_packet->seqno);
+ uint8_t *neigh_addr;
+ uint8_t packet_count;
+ unsigned long *bitmap;
+
+ orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig);
+ if (!orig_node)
+ return BATADV_NO_DUP;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (WARN_ON(!orig_ifinfo)) {
+ batadv_orig_node_free_ref(orig_node);
+ return 0;
+ }
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ seq_diff = seqno - orig_ifinfo->last_real_seqno;
+
+ /* signalize caller that the packet is to be dropped. */
+ if (!hlist_empty(&orig_node->neigh_list) &&
+ batadv_window_protected(bat_priv, seq_diff,
+ &orig_ifinfo->batman_seqno_reset)) {
+ ret = BATADV_PROTECTED;
+ goto out;
+ }
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node,
+ if_outgoing);
+ if (!neigh_ifinfo)
+ continue;
+
+ neigh_addr = neigh_node->addr;
+ is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits,
+ orig_ifinfo->last_real_seqno,
+ seqno);
+
+ if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
+ neigh_node->if_incoming == if_incoming) {
+ set_mark = 1;
+ if (is_dup)
+ ret = BATADV_NEIGH_DUP;
+ } else {
+ set_mark = 0;
+ if (is_dup && (ret != BATADV_NEIGH_DUP))
+ ret = BATADV_ORIG_DUP;
+ }
+
+ /* if the window moved, set the update flag. */
+ bitmap = neigh_ifinfo->bat_iv.real_bits;
+ need_update |= batadv_bit_get_packet(bat_priv, bitmap,
+ seq_diff, set_mark);
+
+ packet_count = bitmap_weight(bitmap,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ neigh_ifinfo->bat_iv.real_packet_count = packet_count;
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ }
+ rcu_read_unlock();
+
+ if (need_update) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "%s updating last_seqno: old %u, new %u\n",
+ if_outgoing ? if_outgoing->net_dev->name : "DEFAULT",
+ orig_ifinfo->last_real_seqno, seqno);
+ orig_ifinfo->last_real_seqno = seqno;
+ }
+
+out:
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ batadv_orig_node_free_ref(orig_node);
+ if (orig_ifinfo)
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ return ret;
+}
+
+/**
+ * batadv_iv_ogm_process_per_outif - process a batman iv OGM for an outgoing if
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset from skb->data to start of ogm header
+ * @orig_node: the (cached) orig node for the originator of this OGM
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void
+batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_neigh_node *router = NULL, *router_router = NULL;
+ struct batadv_orig_node *orig_neigh_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *orig_neigh_router = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ struct batadv_ogm_packet *ogm_packet;
+ enum batadv_dup_status dup_status;
+ bool is_from_best_next_hop = false;
+ bool is_single_hop_neigh = false;
+ bool sameseq, similar_ttl;
+ struct sk_buff *skb_priv;
+ struct ethhdr *ethhdr;
+ uint8_t *prev_sender;
+ int is_bidirect;
+
+ /* create a private copy of the skb, as some functions change tq value
+ * and/or flags.
+ */
+ skb_priv = skb_copy(skb, GFP_ATOMIC);
+ if (!skb_priv)
+ return;
+
+ ethhdr = eth_hdr(skb_priv);
+ ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset);
+
+ dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet,
+ if_incoming, if_outgoing);
+ if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig))
+ is_single_hop_neigh = true;
+
+ if (dup_status == BATADV_PROTECTED) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: packet within seqno protection time (sender: %pM)\n",
+ ethhdr->h_source);
+ goto out;
+ }
+
+ if (ogm_packet->tq == 0) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet with tq equal 0\n");
+ goto out;
+ }
+
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router) {
+ router_router = batadv_orig_router_get(router->orig_node,
+ if_outgoing);
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ }
+
+ if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) &&
+ (batadv_compare_eth(router->addr, ethhdr->h_source)))
+ is_from_best_next_hop = true;
+
+ prev_sender = ogm_packet->prev_sender;
+ /* avoid temporary routing loops */
+ if (router && router_router &&
+ (batadv_compare_eth(router->addr, prev_sender)) &&
+ !(batadv_compare_eth(ogm_packet->orig, prev_sender)) &&
+ (batadv_compare_eth(router->addr, router_router->addr))) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n",
+ ethhdr->h_source);
+ goto out;
+ }
+
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node);
+
+ /* if sender is a direct neighbor the sender mac equals
+ * originator mac
+ */
+ if (is_single_hop_neigh)
+ orig_neigh_node = orig_node;
+ else
+ orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
+ ethhdr->h_source);
+
+ if (!orig_neigh_node)
+ goto out;
+
+ /* Update nc_nodes of the originator */
+ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node,
+ ogm_packet, is_single_hop_neigh);
+
+ orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
+ if_outgoing);
+
+ /* drop packet if sender is not a direct neighbor and if we
+ * don't route towards it
+ */
+ if (!is_single_hop_neigh && (!orig_neigh_router)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out_neigh;
+ }
+
+ is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node,
+ ogm_packet, if_incoming,
+ if_outgoing);
+
+ /* update ranking if it is not a duplicate or has the same
+ * seqno and similar ttl as the non-duplicate
+ */
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out_neigh;
+
+ sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno);
+ similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl;
+
+ if (is_bidirect && ((dup_status == BATADV_NO_DUP) ||
+ (sameseq && similar_ttl))) {
+ batadv_iv_ogm_orig_update(bat_priv, orig_node,
+ orig_ifinfo, ethhdr,
+ ogm_packet, if_incoming,
+ if_outgoing, dup_status);
+ }
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+
+ /* only forward for specific interface, not for the default one. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ goto out_neigh;
+
+ /* is single hop (direct) neighbor */
+ if (is_single_hop_neigh) {
+ /* OGMs from secondary interfaces should only scheduled once
+ * per interface where it has been received, not multiple times
+ */
+ if ((ogm_packet->ttl <= 2) &&
+ (if_incoming != if_outgoing)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM from secondary interface and wrong outgoing interface\n");
+ goto out_neigh;
+ }
+ /* mark direct link on incoming interface */
+ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet,
+ is_single_hop_neigh,
+ is_from_best_next_hop, if_incoming,
+ if_outgoing);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: rebroadcast neighbor packet with direct link flag\n");
+ goto out_neigh;
+ }
+
+ /* multihop originator */
+ if (!is_bidirect) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: not received via bidirectional link\n");
+ goto out_neigh;
+ }
+
+ if (dup_status == BATADV_NEIGH_DUP) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: duplicate packet received\n");
+ goto out_neigh;
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding packet: rebroadcast originator packet\n");
+ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet,
+ is_single_hop_neigh, is_from_best_next_hop,
+ if_incoming, if_outgoing);
+
+out_neigh:
+ if ((orig_neigh_node) && (!is_single_hop_neigh))
+ batadv_orig_node_free_ref(orig_neigh_node);
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ if (router)
+ batadv_neigh_node_free_ref(router);
+ if (router_router)
+ batadv_neigh_node_free_ref(router_router);
+ if (orig_neigh_router)
+ batadv_neigh_node_free_ref(orig_neigh_router);
+
+ kfree_skb(skb_priv);
+}
+
+/**
+ * batadv_iv_ogm_process - process an incoming batman iv OGM
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset to the OGM which should be processed (for aggregates)
+ * @if_incoming: the interface where this packet was receved
+ */
+static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_orig_node *orig_neigh_node, *orig_node;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_ogm_packet *ogm_packet;
+ uint32_t if_incoming_seqno;
+ bool has_directlink_flag;
+ struct ethhdr *ethhdr;
+ bool is_my_oldorig = false;
+ bool is_my_addr = false;
+ bool is_my_orig = false;
+
+ ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset);
+ ethhdr = eth_hdr(skb);
+
+ /* Silently drop when the batman packet is actually not a
+ * correct packet.
+ *
+ * This might happen if a packet is padded (e.g. Ethernet has a
+ * minimum frame length of 64 byte) and the aggregation interprets
+ * it as an additional length.
+ *
+ * TODO: A more sane solution would be to have a bit in the
+ * batadv_ogm_packet to detect whether the packet is the last
+ * packet in an aggregation. Here we expect that the padding
+ * is always zero (or not 0x01)
+ */
+ if (ogm_packet->packet_type != BATADV_IV_OGM)
+ return;
+
+ /* could be changed by schedule_own_packet() */
+ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno);
+
+ if (ogm_packet->flags & BATADV_DIRECTLINK)
+ has_directlink_flag = true;
+ else
+ has_directlink_flag = false;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n",
+ ethhdr->h_source, if_incoming->net_dev->name,
+ if_incoming->net_dev->dev_addr, ogm_packet->orig,
+ ogm_packet->prev_sender, ntohl(ogm_packet->seqno),
+ ogm_packet->tq, ogm_packet->ttl,
+ ogm_packet->version, has_directlink_flag);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->soft_iface != if_incoming->soft_iface)
+ continue;
+
+ if (batadv_compare_eth(ethhdr->h_source,
+ hard_iface->net_dev->dev_addr))
+ is_my_addr = true;
+
+ if (batadv_compare_eth(ogm_packet->orig,
+ hard_iface->net_dev->dev_addr))
+ is_my_orig = true;
+
+ if (batadv_compare_eth(ogm_packet->prev_sender,
+ hard_iface->net_dev->dev_addr))
+ is_my_oldorig = true;
+ }
+ rcu_read_unlock();
+
+ if (is_my_addr) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: received my own broadcast (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ if (is_my_orig) {
+ unsigned long *word;
+ int offset;
+ int32_t bit_pos;
+ int16_t if_num;
+ uint8_t *weight;
+
+ orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
+ ethhdr->h_source);
+ if (!orig_neigh_node)
+ return;
+
+ /* neighbor has to indicate direct link and it has to
+ * come via the corresponding interface
+ * save packet seqno for bidirectional check
+ */
+ if (has_directlink_flag &&
+ batadv_compare_eth(if_incoming->net_dev->dev_addr,
+ ogm_packet->orig)) {
+ if_num = if_incoming->if_num;
+ offset = if_num * BATADV_NUM_WORDS;
+
+ spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
+ word = &orig_neigh_node->bat_iv.bcast_own[offset];
+ bit_pos = if_incoming_seqno - 2;
+ bit_pos -= ntohl(ogm_packet->seqno);
+ batadv_set_bit(word, bit_pos);
+ weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num];
+ *weight = bitmap_weight(word,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet from myself (via neighbor)\n");
+ batadv_orig_node_free_ref(orig_neigh_node);
+ return;
+ }
+
+ if (is_my_oldorig) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n",
+ ethhdr->h_source);
+ return;
+ }
+
+ orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig);
+ if (!orig_node)
+ return;
+
+ batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node,
+ if_incoming, BATADV_IF_DEFAULT);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node,
+ if_incoming, hard_iface);
+ }
+ rcu_read_unlock();
+
+ batadv_orig_node_free_ref(orig_node);
+}
+
+static int batadv_iv_ogm_receive(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_ogm_packet *ogm_packet;
+ uint8_t *packet_pos;
+ int ogm_offset;
+ bool ret;
+
+ ret = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN);
+ if (!ret)
+ return NET_RX_DROP;
+
+ /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface
+ * that does not have B.A.T.M.A.N. IV enabled ?
+ */
+ if (bat_priv->bat_algo_ops->bat_ogm_emit != batadv_iv_ogm_emit)
+ return NET_RX_DROP;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ ogm_offset = 0;
+ ogm_packet = (struct batadv_ogm_packet *)skb->data;
+
+ /* unpack the aggregated packets and process them one by one */
+ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
+ ogm_packet->tvlv_len)) {
+ batadv_iv_ogm_process(skb, ogm_offset, if_incoming);
+
+ ogm_offset += BATADV_OGM_HLEN;
+ ogm_offset += ntohs(ogm_packet->tvlv_len);
+
+ packet_pos = skb->data + ogm_offset;
+ ogm_packet = (struct batadv_ogm_packet *)packet_pos;
+ }
+
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table
+ * @orig_node: the orig_node for which the neighbors are printed
+ * @if_outgoing: outgoing interface for these entries
+ * @seq: debugfs table seq_file struct
+ *
+ * Must be called while holding an rcu lock.
+ */
+static void
+batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing,
+ struct seq_file *seq)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *n_ifinfo;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ continue;
+
+ seq_printf(seq, " %pM (%3i)",
+ neigh_node->addr,
+ n_ifinfo->bat_iv.tq_avg);
+
+ batadv_neigh_ifinfo_free_ref(n_ifinfo);
+ }
+}
+
+/**
+ * batadv_iv_ogm_orig_print - print the originator table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: debugfs table seq_file struct
+ * @if_outgoing: the outgoing interface for which this should be printed
+ */
+static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ int last_seen_msecs, last_seen_secs;
+ struct batadv_orig_node *orig_node;
+ struct batadv_neigh_ifinfo *n_ifinfo;
+ unsigned long last_seen_jiffies;
+ struct hlist_head *head;
+ int batman_count = 0;
+ uint32_t i;
+
+ seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n",
+ "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE,
+ "Nexthop", "outgoingIF", "Potential nexthops");
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ neigh_node = batadv_orig_router_get(orig_node,
+ if_outgoing);
+ if (!neigh_node)
+ continue;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
+ if_outgoing);
+ if (!n_ifinfo)
+ goto next;
+
+ if (n_ifinfo->bat_iv.tq_avg == 0)
+ goto next;
+
+ last_seen_jiffies = jiffies - orig_node->last_seen;
+ last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
+ last_seen_secs = last_seen_msecs / 1000;
+ last_seen_msecs = last_seen_msecs % 1000;
+
+ seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:",
+ orig_node->orig, last_seen_secs,
+ last_seen_msecs, n_ifinfo->bat_iv.tq_avg,
+ neigh_node->addr,
+ neigh_node->if_incoming->net_dev->name);
+
+ batadv_iv_ogm_orig_print_neigh(orig_node, if_outgoing,
+ seq);
+ seq_puts(seq, "\n");
+ batman_count++;
+
+next:
+ batadv_neigh_node_free_ref(neigh_node);
+ if (n_ifinfo)
+ batadv_neigh_ifinfo_free_ref(n_ifinfo);
+ }
+ rcu_read_unlock();
+ }
+
+ if (batman_count == 0)
+ seq_puts(seq, "No batman nodes in range ...\n");
+}
+
+/**
+ * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ *
+ * Returns a value less, equal to or greater than 0 if the metric via neigh1 is
+ * lower, the same as or higher than the metric via neigh2
+ */
+static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
+ uint8_t tq1, tq2;
+ int diff;
+
+ neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+ if (!neigh1_ifinfo || !neigh2_ifinfo) {
+ diff = 0;
+ goto out;
+ }
+
+ tq1 = neigh1_ifinfo->bat_iv.tq_avg;
+ tq2 = neigh2_ifinfo->bat_iv.tq_avg;
+ diff = tq1 - tq2;
+
+out:
+ if (neigh1_ifinfo)
+ batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+ if (neigh2_ifinfo)
+ batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+
+ return diff;
+}
+
+/**
+ * batadv_iv_ogm_neigh_is_eob - check if neigh1 is equally good or better than
+ * neigh2 from the metric prospective
+ * @neigh1: the first neighbor object of the comparison
+ * @if_outgoing1: outgoing interface for the first neighbor
+ * @neigh2: the second neighbor object of the comparison
+ * @if_outgoing2: outgoing interface for the second neighbor
+ *
+ * Returns true if the metric via neigh1 is equally good or better than
+ * the metric via neigh2, false otherwise.
+ */
+static bool
+batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo;
+ uint8_t tq1, tq2;
+ bool ret;
+
+ neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+ /* we can't say that the metric is better */
+ if (!neigh1_ifinfo || !neigh2_ifinfo) {
+ ret = false;
+ goto out;
+ }
+
+ tq1 = neigh1_ifinfo->bat_iv.tq_avg;
+ tq2 = neigh2_ifinfo->bat_iv.tq_avg;
+ ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD;
+
+out:
+ if (neigh1_ifinfo)
+ batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+ if (neigh2_ifinfo)
+ batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+
+ return ret;
+}
+
+static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
+ .name = "BATMAN_IV",
+ .bat_iface_enable = batadv_iv_ogm_iface_enable,
+ .bat_iface_disable = batadv_iv_ogm_iface_disable,
+ .bat_iface_update_mac = batadv_iv_ogm_iface_update_mac,
+ .bat_primary_iface_set = batadv_iv_ogm_primary_iface_set,
+ .bat_ogm_schedule = batadv_iv_ogm_schedule,
+ .bat_ogm_emit = batadv_iv_ogm_emit,
+ .bat_neigh_cmp = batadv_iv_ogm_neigh_cmp,
+ .bat_neigh_is_equiv_or_better = batadv_iv_ogm_neigh_is_eob,
+ .bat_orig_print = batadv_iv_ogm_orig_print,
+ .bat_orig_free = batadv_iv_ogm_orig_free,
+ .bat_orig_add_if = batadv_iv_ogm_orig_add_if,
+ .bat_orig_del_if = batadv_iv_ogm_orig_del_if,
+};
+
+int __init batadv_iv_init(void)
+{
+ int ret;
+
+ /* batman originator packet */
+ ret = batadv_recv_handler_register(BATADV_IV_OGM,
+ batadv_iv_ogm_receive);
+ if (ret < 0)
+ goto out;
+
+ ret = batadv_algo_register(&batadv_batman_iv);
+ if (ret < 0)
+ goto handler_unregister;
+
+ goto out;
+
+handler_unregister:
+ batadv_recv_handler_unregister(BATADV_IV_OGM);
+out:
+ return ret;
+}
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
new file mode 100644
index 000000000..e3da07a64
--- /dev/null
+++ b/net/batman-adv/bitarray.c
@@ -0,0 +1,92 @@
+/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "bitarray.h"
+
+#include <linux/bitops.h>
+
+/* shift the packet array by n places. */
+static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n)
+{
+ if (n <= 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return;
+
+ bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE);
+}
+
+/* receive and process one packet within the sequence number window.
+ *
+ * returns:
+ * 1 if the window was moved (either new or very old)
+ * 0 if the window was not moved/shifted.
+ */
+int batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ int32_t seq_num_diff, int set_mark)
+{
+ struct batadv_priv *bat_priv = priv;
+
+ /* sequence number is slightly older. We already got a sequence number
+ * higher than this one, so we just mark it.
+ */
+ if (seq_num_diff <= 0 && seq_num_diff > -BATADV_TQ_LOCAL_WINDOW_SIZE) {
+ if (set_mark)
+ batadv_set_bit(seq_bits, -seq_num_diff);
+ return 0;
+ }
+
+ /* sequence number is slightly newer, so we shift the window and
+ * set the mark if required
+ */
+ if (seq_num_diff > 0 && seq_num_diff < BATADV_TQ_LOCAL_WINDOW_SIZE) {
+ batadv_bitmap_shift_left(seq_bits, seq_num_diff);
+
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+ return 1;
+ }
+
+ /* sequence number is much newer, probably missed a lot of packets */
+ if (seq_num_diff >= BATADV_TQ_LOCAL_WINDOW_SIZE &&
+ seq_num_diff < BATADV_EXPECTED_SEQNO_RANGE) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "We missed a lot of packets (%i) !\n",
+ seq_num_diff - 1);
+ bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+ return 1;
+ }
+
+ /* received a much older packet. The other host either restarted
+ * or the old packet got delayed somewhere in the network. The
+ * packet should be dropped without calling this function if the
+ * seqno window is protected.
+ *
+ * seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE
+ * or
+ * seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE
+ */
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Other host probably restarted!\n");
+
+ bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ if (set_mark)
+ batadv_set_bit(seq_bits, 0);
+
+ return 1;
+}
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
new file mode 100644
index 000000000..2acaafe60
--- /dev/null
+++ b/net/batman-adv/bitarray.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_BITARRAY_H_
+#define _NET_BATMAN_ADV_BITARRAY_H_
+
+/* Returns 1 if the corresponding bit in the given seq_bits indicates true
+ * and curr_seqno is within range of last_seqno. Otherwise returns 0.
+ */
+static inline int batadv_test_bit(const unsigned long *seq_bits,
+ uint32_t last_seqno, uint32_t curr_seqno)
+{
+ int32_t diff;
+
+ diff = last_seqno - curr_seqno;
+ if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return 0;
+ return test_bit(diff, seq_bits) != 0;
+}
+
+/* turn corresponding bit on, so we can remember that we got the packet */
+static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n)
+{
+ /* if too old, just drop it */
+ if (n < 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE)
+ return;
+
+ set_bit(n, seq_bits); /* turn the position on */
+}
+
+/* receive and process one packet, returns 1 if received seq_num is considered
+ * new, 0 if old
+ */
+int batadv_bit_get_packet(void *priv, unsigned long *seq_bits,
+ int32_t seq_num_diff, int set_mark);
+
+#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
new file mode 100644
index 000000000..ac4b96ecc
--- /dev/null
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -0,0 +1,1722 @@
+/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "hash.h"
+#include "hard-interface.h"
+#include "originator.h"
+#include "bridge_loop_avoidance.h"
+#include "translation-table.h"
+#include "send.h"
+
+#include <linux/etherdevice.h>
+#include <linux/crc16.h>
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+
+static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
+
+static void batadv_bla_periodic_work(struct work_struct *work);
+static void
+batadv_bla_send_announce(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw);
+
+/* return the index of the claim */
+static inline uint32_t batadv_choose_claim(const void *data, uint32_t size)
+{
+ struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
+ uint32_t hash = 0;
+
+ hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr));
+ hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/* return the index of the backbone gateway */
+static inline uint32_t batadv_choose_backbone_gw(const void *data,
+ uint32_t size)
+{
+ const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
+ uint32_t hash = 0;
+
+ hash = batadv_hash_bytes(hash, &claim->addr, sizeof(claim->addr));
+ hash = batadv_hash_bytes(hash, &claim->vid, sizeof(claim->vid));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/* compares address and vid of two backbone gws */
+static int batadv_compare_backbone_gw(const struct hlist_node *node,
+ const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_bla_backbone_gw,
+ hash_entry);
+ const struct batadv_bla_backbone_gw *gw1 = data1, *gw2 = data2;
+
+ if (!batadv_compare_eth(gw1->orig, gw2->orig))
+ return 0;
+
+ if (gw1->vid != gw2->vid)
+ return 0;
+
+ return 1;
+}
+
+/* compares address and vid of two claims */
+static int batadv_compare_claim(const struct hlist_node *node,
+ const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_bla_claim,
+ hash_entry);
+ const struct batadv_bla_claim *cl1 = data1, *cl2 = data2;
+
+ if (!batadv_compare_eth(cl1->addr, cl2->addr))
+ return 0;
+
+ if (cl1->vid != cl2->vid)
+ return 0;
+
+ return 1;
+}
+
+/* free a backbone gw */
+static void
+batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ if (atomic_dec_and_test(&backbone_gw->refcount))
+ kfree_rcu(backbone_gw, rcu);
+}
+
+/* finally deinitialize the claim */
+static void batadv_claim_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_bla_claim *claim;
+
+ claim = container_of(rcu, struct batadv_bla_claim, rcu);
+
+ batadv_backbone_gw_free_ref(claim->backbone_gw);
+ kfree(claim);
+}
+
+/* free a claim, call claim_free_rcu if its the last reference */
+static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
+{
+ if (atomic_dec_and_test(&claim->refcount))
+ call_rcu(&claim->rcu, batadv_claim_free_rcu);
+}
+
+/**
+ * batadv_claim_hash_find
+ * @bat_priv: the bat priv with all the soft interface information
+ * @data: search data (may be local/static data)
+ *
+ * looks for a claim in the hash, and returns it if found
+ * or NULL otherwise.
+ */
+static struct batadv_bla_claim
+*batadv_claim_hash_find(struct batadv_priv *bat_priv,
+ struct batadv_bla_claim *data)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
+ struct hlist_head *head;
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_claim *claim_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_choose_claim(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ if (!batadv_compare_claim(&claim->hash_entry, data))
+ continue;
+
+ if (!atomic_inc_not_zero(&claim->refcount))
+ continue;
+
+ claim_tmp = claim;
+ break;
+ }
+ rcu_read_unlock();
+
+ return claim_tmp;
+}
+
+/**
+ * batadv_backbone_hash_find - looks for a claim in the hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the address of the originator
+ * @vid: the VLAN ID
+ *
+ * Returns claim if found or NULL otherwise.
+ */
+static struct batadv_bla_backbone_gw *
+batadv_backbone_hash_find(struct batadv_priv *bat_priv,
+ uint8_t *addr, unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw search_entry, *backbone_gw;
+ struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ ether_addr_copy(search_entry.orig, addr);
+ search_entry.vid = vid;
+
+ index = batadv_choose_backbone_gw(&search_entry, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry,
+ &search_entry))
+ continue;
+
+ if (!atomic_inc_not_zero(&backbone_gw->refcount))
+ continue;
+
+ backbone_gw_tmp = backbone_gw;
+ break;
+ }
+ rcu_read_unlock();
+
+ return backbone_gw_tmp;
+}
+
+/* delete all claims for a backbone */
+static void
+batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ struct batadv_hashtable *hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ struct batadv_bla_claim *claim;
+ int i;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+
+ hash = backbone_gw->bat_priv->bla.claim_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(claim, node_tmp,
+ head, hash_entry) {
+ if (claim->backbone_gw != backbone_gw)
+ continue;
+
+ batadv_claim_free_ref(claim);
+ hlist_del_rcu(&claim->hash_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ /* all claims gone, initialize CRC */
+ backbone_gw->crc = BATADV_BLA_CRC_INIT;
+}
+
+/**
+ * batadv_bla_send_claim - sends a claim frame according to the provided info
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mac: the mac address to be announced within the claim
+ * @vid: the VLAN ID
+ * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
+ */
+static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac,
+ unsigned short vid, int claimtype)
+{
+ struct sk_buff *skb;
+ struct ethhdr *ethhdr;
+ struct batadv_hard_iface *primary_if;
+ struct net_device *soft_iface;
+ uint8_t *hw_src;
+ struct batadv_bla_claim_dst local_claim_dest;
+ __be32 zeroip = 0;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return;
+
+ memcpy(&local_claim_dest, &bat_priv->bla.claim_dest,
+ sizeof(local_claim_dest));
+ local_claim_dest.type = claimtype;
+
+ soft_iface = primary_if->soft_iface;
+
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+ /* IP DST: 0.0.0.0 */
+ zeroip,
+ primary_if->soft_iface,
+ /* IP SRC: 0.0.0.0 */
+ zeroip,
+ /* Ethernet DST: Broadcast */
+ NULL,
+ /* Ethernet SRC/HW SRC: originator mac */
+ primary_if->net_dev->dev_addr,
+ /* HW DST: FF:43:05:XX:YY:YY
+ * with XX = claim type
+ * and YY:YY = group id
+ */
+ (uint8_t *)&local_claim_dest);
+
+ if (!skb)
+ goto out;
+
+ ethhdr = (struct ethhdr *)skb->data;
+ hw_src = (uint8_t *)ethhdr + ETH_HLEN + sizeof(struct arphdr);
+
+ /* now we pretend that the client would have sent this ... */
+ switch (claimtype) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ /* normal claim frame
+ * set Ethernet SRC to the clients mac
+ */
+ ether_addr_copy(ethhdr->h_source, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_send_claim(): CLAIM %pM on vid %d\n", mac,
+ BATADV_PRINT_VID(vid));
+ break;
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ /* unclaim frame
+ * set HW SRC to the clients mac
+ */
+ ether_addr_copy(hw_src, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_send_claim(): UNCLAIM %pM on vid %d\n", mac,
+ BATADV_PRINT_VID(vid));
+ break;
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ /* announcement frame
+ * set HW SRC to the special mac containg the crc
+ */
+ ether_addr_copy(hw_src, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_send_claim(): ANNOUNCE of %pM on vid %d\n",
+ ethhdr->h_source, BATADV_PRINT_VID(vid));
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ /* request frame
+ * set HW SRC and header destination to the receiving backbone
+ * gws mac
+ */
+ ether_addr_copy(hw_src, mac);
+ ether_addr_copy(ethhdr->h_dest, mac);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_send_claim(): REQUEST of %pM to %pM on vid %d\n",
+ ethhdr->h_source, ethhdr->h_dest,
+ BATADV_PRINT_VID(vid));
+ break;
+ }
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb = vlan_insert_tag(skb, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_type_trans(skb, soft_iface);
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN);
+ soft_iface->last_rx = jiffies;
+
+ netif_rx(skb);
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+/**
+ * batadv_bla_get_backbone_gw
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the mac address of the originator
+ * @vid: the VLAN ID
+ * @own_backbone: set if the requested backbone is local
+ *
+ * searches for the backbone gw or creates a new one if it could not
+ * be found.
+ */
+static struct batadv_bla_backbone_gw *
+batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig,
+ unsigned short vid, bool own_backbone)
+{
+ struct batadv_bla_backbone_gw *entry;
+ struct batadv_orig_node *orig_node;
+ int hash_added;
+
+ entry = batadv_backbone_hash_find(bat_priv, orig, vid);
+
+ if (entry)
+ return entry;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_get_backbone_gw(): not found (%pM, %d), creating new entry\n",
+ orig, BATADV_PRINT_VID(vid));
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return NULL;
+
+ entry->vid = vid;
+ entry->lasttime = jiffies;
+ entry->crc = BATADV_BLA_CRC_INIT;
+ entry->bat_priv = bat_priv;
+ atomic_set(&entry->request_sent, 0);
+ atomic_set(&entry->wait_periods, 0);
+ ether_addr_copy(entry->orig, orig);
+
+ /* one for the hash, one for returning */
+ atomic_set(&entry->refcount, 2);
+
+ hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
+ batadv_compare_backbone_gw,
+ batadv_choose_backbone_gw, entry,
+ &entry->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* hash failed, free the structure */
+ kfree(entry);
+ return NULL;
+ }
+
+ /* this is a gateway now, remove any TT entry on this VLAN */
+ orig_node = batadv_orig_hash_find(bat_priv, orig);
+ if (orig_node) {
+ batadv_tt_global_del_orig(bat_priv, orig_node, vid,
+ "became a backbone gateway");
+ batadv_orig_node_free_ref(orig_node);
+ }
+
+ if (own_backbone) {
+ batadv_bla_send_announce(bat_priv, entry);
+
+ /* this will be decreased in the worker thread */
+ atomic_inc(&entry->request_sent);
+ atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS);
+ atomic_inc(&bat_priv->bla.num_requests);
+ }
+
+ return entry;
+}
+
+/* update or add the own backbone gw to make sure we announce
+ * where we receive other backbone gws
+ */
+static void
+batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid, true);
+ if (unlikely(!backbone_gw))
+ return;
+
+ backbone_gw->lasttime = jiffies;
+ batadv_backbone_gw_free_ref(backbone_gw);
+}
+
+/**
+ * batadv_bla_answer_request - answer a bla request by sending own claims
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: interface where the request came on
+ * @vid: the vid where the request came on
+ *
+ * Repeat all of our own claims, and finally send an ANNOUNCE frame
+ * to allow the requester another check if the CRC is correct now.
+ */
+static void batadv_bla_answer_request(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int i;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_answer_request(): received a claim request, send all of our own claims again\n");
+
+ backbone_gw = batadv_backbone_hash_find(bat_priv,
+ primary_if->net_dev->dev_addr,
+ vid);
+ if (!backbone_gw)
+ return;
+
+ hash = bat_priv->bla.claim_hash;
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ /* only own claims are interesting */
+ if (claim->backbone_gw != backbone_gw)
+ continue;
+
+ batadv_bla_send_claim(bat_priv, claim->addr, claim->vid,
+ BATADV_CLAIM_TYPE_CLAIM);
+ }
+ rcu_read_unlock();
+ }
+
+ /* finally, send an announcement frame */
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+ batadv_backbone_gw_free_ref(backbone_gw);
+}
+
+/**
+ * batadv_bla_send_request - send a request to repeat claims
+ * @backbone_gw: the backbone gateway from whom we are out of sync
+ *
+ * When the crc is wrong, ask the backbone gateway for a full table update.
+ * After the request, it will repeat all of his own claims and finally
+ * send an announcement claim with which we can check again.
+ */
+static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ /* first, remove all old entries */
+ batadv_bla_del_backbone_claims(backbone_gw);
+
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "Sending REQUEST to %pM\n", backbone_gw->orig);
+
+ /* send request */
+ batadv_bla_send_claim(backbone_gw->bat_priv, backbone_gw->orig,
+ backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST);
+
+ /* no local broadcasts should be sent or received, for now. */
+ if (!atomic_read(&backbone_gw->request_sent)) {
+ atomic_inc(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 1);
+ }
+}
+
+/**
+ * batadv_bla_send_announce
+ * @bat_priv: the bat priv with all the soft interface information
+ * @backbone_gw: our backbone gateway which should be announced
+ *
+ * This function sends an announcement. It is called from multiple
+ * places.
+ */
+static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ uint8_t mac[ETH_ALEN];
+ __be16 crc;
+
+ memcpy(mac, batadv_announce_mac, 4);
+ crc = htons(backbone_gw->crc);
+ memcpy(&mac[4], &crc, 2);
+
+ batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid,
+ BATADV_CLAIM_TYPE_ANNOUNCE);
+}
+
+/**
+ * batadv_bla_add_claim - Adds a claim in the claim hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mac: the mac address of the claim
+ * @vid: the VLAN ID of the frame
+ * @backbone_gw: the backbone gateway which claims it
+ */
+static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
+ const uint8_t *mac, const unsigned short vid,
+ struct batadv_bla_backbone_gw *backbone_gw)
+{
+ struct batadv_bla_claim *claim;
+ struct batadv_bla_claim search_claim;
+ int hash_added;
+
+ ether_addr_copy(search_claim.addr, mac);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ /* create a new claim entry if it does not exist yet. */
+ if (!claim) {
+ claim = kzalloc(sizeof(*claim), GFP_ATOMIC);
+ if (!claim)
+ return;
+
+ ether_addr_copy(claim->addr, mac);
+ claim->vid = vid;
+ claim->lasttime = jiffies;
+ claim->backbone_gw = backbone_gw;
+
+ atomic_set(&claim->refcount, 2);
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n",
+ mac, BATADV_PRINT_VID(vid));
+ hash_added = batadv_hash_add(bat_priv->bla.claim_hash,
+ batadv_compare_claim,
+ batadv_choose_claim, claim,
+ &claim->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* only local changes happened. */
+ kfree(claim);
+ return;
+ }
+ } else {
+ claim->lasttime = jiffies;
+ if (claim->backbone_gw == backbone_gw)
+ /* no need to register a new backbone */
+ goto claim_free_ref;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_add_claim(): changing ownership for %pM, vid %d\n",
+ mac, BATADV_PRINT_VID(vid));
+
+ claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ batadv_backbone_gw_free_ref(claim->backbone_gw);
+ }
+ /* set (new) backbone gw */
+ atomic_inc(&backbone_gw->refcount);
+ claim->backbone_gw = backbone_gw;
+
+ backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ backbone_gw->lasttime = jiffies;
+
+claim_free_ref:
+ batadv_claim_free_ref(claim);
+}
+
+/* Delete a claim from the claim hash which has the
+ * given mac address and vid.
+ */
+static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
+ const uint8_t *mac, const unsigned short vid)
+{
+ struct batadv_bla_claim search_claim, *claim;
+
+ ether_addr_copy(search_claim.addr, mac);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+ if (!claim)
+ return;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_del_claim(): %pM, vid %d\n",
+ mac, BATADV_PRINT_VID(vid));
+
+ batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim,
+ batadv_choose_claim, claim);
+ batadv_claim_free_ref(claim); /* reference from the hash is gone */
+
+ claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+
+ /* don't need the reference from hash_find() anymore */
+ batadv_claim_free_ref(claim);
+}
+
+/* check for ANNOUNCE frame, return 1 if handled */
+static int batadv_handle_announce(struct batadv_priv *bat_priv,
+ uint8_t *an_addr, uint8_t *backbone_addr,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ uint16_t crc;
+
+ if (memcmp(an_addr, batadv_announce_mac, 4) != 0)
+ return 0;
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid,
+ false);
+
+ if (unlikely(!backbone_gw))
+ return 1;
+
+ /* handle as ANNOUNCE frame */
+ backbone_gw->lasttime = jiffies;
+ crc = ntohs(*((__be16 *)(&an_addr[4])));
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n",
+ BATADV_PRINT_VID(vid), backbone_gw->orig, crc);
+
+ if (backbone_gw->crc != crc) {
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n",
+ backbone_gw->orig,
+ BATADV_PRINT_VID(backbone_gw->vid),
+ backbone_gw->crc, crc);
+
+ batadv_bla_send_request(backbone_gw);
+ } else {
+ /* if we have sent a request and the crc was OK,
+ * we can allow traffic again.
+ */
+ if (atomic_read(&backbone_gw->request_sent)) {
+ atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 0);
+ }
+ }
+
+ batadv_backbone_gw_free_ref(backbone_gw);
+ return 1;
+}
+
+/* check for REQUEST frame, return 1 if handled */
+static int batadv_handle_request(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ uint8_t *backbone_addr,
+ struct ethhdr *ethhdr, unsigned short vid)
+{
+ /* check for REQUEST frame */
+ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest))
+ return 0;
+
+ /* sanity check, this should not happen on a normal switch,
+ * we ignore it in this case.
+ */
+ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr))
+ return 1;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "handle_request(): REQUEST vid %d (sent by %pM)...\n",
+ BATADV_PRINT_VID(vid), ethhdr->h_source);
+
+ batadv_bla_answer_request(bat_priv, primary_if, vid);
+ return 1;
+}
+
+/* check for UNCLAIM frame, return 1 if handled */
+static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ uint8_t *backbone_addr,
+ uint8_t *claim_addr, unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ /* unclaim in any case if it is our own */
+ if (primary_if && batadv_compare_eth(backbone_addr,
+ primary_if->net_dev->dev_addr))
+ batadv_bla_send_claim(bat_priv, claim_addr, vid,
+ BATADV_CLAIM_TYPE_UNCLAIM);
+
+ backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid);
+
+ if (!backbone_gw)
+ return 1;
+
+ /* this must be an UNCLAIM frame */
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "handle_unclaim(): UNCLAIM %pM on vid %d (sent by %pM)...\n",
+ claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig);
+
+ batadv_bla_del_claim(bat_priv, claim_addr, vid);
+ batadv_backbone_gw_free_ref(backbone_gw);
+ return 1;
+}
+
+/* check for CLAIM frame, return 1 if handled */
+static int batadv_handle_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ uint8_t *backbone_addr, uint8_t *claim_addr,
+ unsigned short vid)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ /* register the gateway if not yet available, and add the claim. */
+
+ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid,
+ false);
+
+ if (unlikely(!backbone_gw))
+ return 1;
+
+ /* this must be a CLAIM frame */
+ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw);
+ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr))
+ batadv_bla_send_claim(bat_priv, claim_addr, vid,
+ BATADV_CLAIM_TYPE_CLAIM);
+
+ /* TODO: we could call something like tt_local_del() here. */
+
+ batadv_backbone_gw_free_ref(backbone_gw);
+ return 1;
+}
+
+/**
+ * batadv_check_claim_group
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary interface of this batman interface
+ * @hw_src: the Hardware source in the ARP Header
+ * @hw_dst: the Hardware destination in the ARP Header
+ * @ethhdr: pointer to the Ethernet header of the claim frame
+ *
+ * checks if it is a claim packet and if its on the same group.
+ * This function also applies the group ID of the sender
+ * if it is in the same mesh.
+ *
+ * returns:
+ * 2 - if it is a claim packet and on the same group
+ * 1 - if is a claim packet from another group
+ * 0 - if it is not a claim packet
+ */
+static int batadv_check_claim_group(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ uint8_t *hw_src, uint8_t *hw_dst,
+ struct ethhdr *ethhdr)
+{
+ uint8_t *backbone_addr;
+ struct batadv_orig_node *orig_node;
+ struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
+
+ bla_dst = (struct batadv_bla_claim_dst *)hw_dst;
+ bla_dst_own = &bat_priv->bla.claim_dest;
+
+ /* if announcement packet, use the source,
+ * otherwise assume it is in the hw_src
+ */
+ switch (bla_dst->type) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ backbone_addr = hw_src;
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ backbone_addr = ethhdr->h_source;
+ break;
+ default:
+ return 0;
+ }
+
+ /* don't accept claim frames from ourselves */
+ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr))
+ return 0;
+
+ /* if its already the same group, it is fine. */
+ if (bla_dst->group == bla_dst_own->group)
+ return 2;
+
+ /* lets see if this originator is in our mesh */
+ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr);
+
+ /* dont accept claims from gateways which are not in
+ * the same mesh or group.
+ */
+ if (!orig_node)
+ return 1;
+
+ /* if our mesh friends mac is bigger, use it for ourselves. */
+ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) {
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "taking other backbones claim group: %#.4x\n",
+ ntohs(bla_dst->group));
+ bla_dst_own->group = bla_dst->group;
+ }
+
+ batadv_orig_node_free_ref(orig_node);
+
+ return 2;
+}
+
+/**
+ * batadv_bla_process_claim
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @skb: the frame to be checked
+ *
+ * Check if this is a claim frame, and process it accordingly.
+ *
+ * returns 1 if it was a claim frame, otherwise return 0 to
+ * tell the callee that it can use the frame on its own.
+ */
+static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct sk_buff *skb)
+{
+ struct batadv_bla_claim_dst *bla_dst, *bla_dst_own;
+ uint8_t *hw_src, *hw_dst;
+ struct vlan_hdr *vhdr, vhdr_buf;
+ struct ethhdr *ethhdr;
+ struct arphdr *arphdr;
+ unsigned short vid;
+ int vlan_depth = 0;
+ __be16 proto;
+ int headlen;
+ int ret;
+
+ vid = batadv_get_vid(skb, 0);
+ ethhdr = eth_hdr(skb);
+
+ proto = ethhdr->h_proto;
+ headlen = ETH_HLEN;
+ if (vid & BATADV_VLAN_HAS_TAG) {
+ /* Traverse the VLAN/Ethertypes.
+ *
+ * At this point it is known that the first protocol is a VLAN
+ * header, so start checking at the encapsulated protocol.
+ *
+ * The depth of the VLAN headers is recorded to drop BLA claim
+ * frames encapsulated into multiple VLAN headers (QinQ).
+ */
+ do {
+ vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN,
+ &vhdr_buf);
+ if (!vhdr)
+ return 0;
+
+ proto = vhdr->h_vlan_encapsulated_proto;
+ headlen += VLAN_HLEN;
+ vlan_depth++;
+ } while (proto == htons(ETH_P_8021Q));
+ }
+
+ if (proto != htons(ETH_P_ARP))
+ return 0; /* not a claim frame */
+
+ /* this must be a ARP frame. check if it is a claim. */
+
+ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev))))
+ return 0;
+
+ /* pskb_may_pull() may have modified the pointers, get ethhdr again */
+ ethhdr = eth_hdr(skb);
+ arphdr = (struct arphdr *)((uint8_t *)ethhdr + headlen);
+
+ /* Check whether the ARP frame carries a valid
+ * IP information
+ */
+ if (arphdr->ar_hrd != htons(ARPHRD_ETHER))
+ return 0;
+ if (arphdr->ar_pro != htons(ETH_P_IP))
+ return 0;
+ if (arphdr->ar_hln != ETH_ALEN)
+ return 0;
+ if (arphdr->ar_pln != 4)
+ return 0;
+
+ hw_src = (uint8_t *)arphdr + sizeof(struct arphdr);
+ hw_dst = hw_src + ETH_ALEN + 4;
+ bla_dst = (struct batadv_bla_claim_dst *)hw_dst;
+ bla_dst_own = &bat_priv->bla.claim_dest;
+
+ /* check if it is a claim frame in general */
+ if (memcmp(bla_dst->magic, bla_dst_own->magic,
+ sizeof(bla_dst->magic)) != 0)
+ return 0;
+
+ /* check if there is a claim frame encapsulated deeper in (QinQ) and
+ * drop that, as this is not supported by BLA but should also not be
+ * sent via the mesh.
+ */
+ if (vlan_depth > 1)
+ return 1;
+
+ /* check if it is a claim frame. */
+ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst,
+ ethhdr);
+ if (ret == 1)
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_process_claim(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n",
+ ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src,
+ hw_dst);
+
+ if (ret < 2)
+ return ret;
+
+ /* become a backbone gw ourselves on this vlan if not happened yet */
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+
+ /* check for the different types of claim frames ... */
+ switch (bla_dst->type) {
+ case BATADV_CLAIM_TYPE_CLAIM:
+ if (batadv_handle_claim(bat_priv, primary_if, hw_src,
+ ethhdr->h_source, vid))
+ return 1;
+ break;
+ case BATADV_CLAIM_TYPE_UNCLAIM:
+ if (batadv_handle_unclaim(bat_priv, primary_if,
+ ethhdr->h_source, hw_src, vid))
+ return 1;
+ break;
+
+ case BATADV_CLAIM_TYPE_ANNOUNCE:
+ if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source,
+ vid))
+ return 1;
+ break;
+ case BATADV_CLAIM_TYPE_REQUEST:
+ if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr,
+ vid))
+ return 1;
+ break;
+ }
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_process_claim(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n",
+ ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst);
+ return 1;
+}
+
+/* Check when we last heard from other nodes, and remove them in case of
+ * a time out, or clean all backbone gws if now is set.
+ */
+static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ int i;
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(backbone_gw, node_tmp,
+ head, hash_entry) {
+ if (now)
+ goto purge_now;
+ if (!batadv_has_timed_out(backbone_gw->lasttime,
+ BATADV_BLA_BACKBONE_TIMEOUT))
+ continue;
+
+ batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
+ "bla_purge_backbone_gw(): backbone gw %pM timed out\n",
+ backbone_gw->orig);
+
+purge_now:
+ /* don't wait for the pending request anymore */
+ if (atomic_read(&backbone_gw->request_sent))
+ atomic_dec(&bat_priv->bla.num_requests);
+
+ batadv_bla_del_backbone_claims(backbone_gw);
+
+ hlist_del_rcu(&backbone_gw->hash_entry);
+ batadv_backbone_gw_free_ref(backbone_gw);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_bla_purge_claims
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the selected primary interface, may be NULL if now is set
+ * @now: whether the whole hash shall be wiped now
+ *
+ * Check when we heard last time from our own claims, and remove them in case of
+ * a time out, or clean all claims if now is set
+ */
+static void batadv_bla_purge_claims(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ int now)
+{
+ struct batadv_bla_claim *claim;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ int i;
+
+ hash = bat_priv->bla.claim_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ if (now)
+ goto purge_now;
+ if (!batadv_compare_eth(claim->backbone_gw->orig,
+ primary_if->net_dev->dev_addr))
+ continue;
+ if (!batadv_has_timed_out(claim->lasttime,
+ BATADV_BLA_CLAIM_TIMEOUT))
+ continue;
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv,
+ "bla_purge_claims(): %pM, vid %d, time out\n",
+ claim->addr, claim->vid);
+
+purge_now:
+ batadv_handle_unclaim(bat_priv, primary_if,
+ claim->backbone_gw->orig,
+ claim->addr, claim->vid);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_bla_update_orig_address
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the new selected primary_if
+ * @oldif: the old primary interface, may be NULL
+ *
+ * Update the backbone gateways when the own orig address changes.
+ */
+void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct hlist_head *head;
+ struct batadv_hashtable *hash;
+ __be16 group;
+ int i;
+
+ /* reset bridge loop avoidance group id */
+ group = htons(crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN));
+ bat_priv->bla.claim_dest.group = group;
+
+ /* purge everything when bridge loop avoidance is turned off */
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ oldif = NULL;
+
+ if (!oldif) {
+ batadv_bla_purge_claims(bat_priv, NULL, 1);
+ batadv_bla_purge_backbone_gw(bat_priv, 1);
+ return;
+ }
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ /* own orig still holds the old value. */
+ if (!batadv_compare_eth(backbone_gw->orig,
+ oldif->net_dev->dev_addr))
+ continue;
+
+ ether_addr_copy(backbone_gw->orig,
+ primary_if->net_dev->dev_addr);
+ /* send an announce frame so others will ask for our
+ * claims and update their tables.
+ */
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/* periodic work to do:
+ * * purge structures when they are too old
+ * * send announcements
+ */
+static void batadv_bla_periodic_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv *bat_priv;
+ struct batadv_priv_bla *priv_bla;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_hashtable *hash;
+ struct batadv_hard_iface *primary_if;
+ int i;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ priv_bla = container_of(delayed_work, struct batadv_priv_bla, work);
+ bat_priv = container_of(priv_bla, struct batadv_priv, bla);
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ batadv_bla_purge_claims(bat_priv, primary_if, 0);
+ batadv_bla_purge_backbone_gw(bat_priv, 0);
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto out;
+
+ hash = bat_priv->bla.backbone_hash;
+ if (!hash)
+ goto out;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (!batadv_compare_eth(backbone_gw->orig,
+ primary_if->net_dev->dev_addr))
+ continue;
+
+ backbone_gw->lasttime = jiffies;
+
+ batadv_bla_send_announce(bat_priv, backbone_gw);
+
+ /* request_sent is only set after creation to avoid
+ * problems when we are not yet known as backbone gw
+ * in the backbone.
+ *
+ * We can reset this now after we waited some periods
+ * to give bridge forward delays and bla group forming
+ * some grace time.
+ */
+
+ if (atomic_read(&backbone_gw->request_sent) == 0)
+ continue;
+
+ if (!atomic_dec_and_test(&backbone_gw->wait_periods))
+ continue;
+
+ atomic_dec(&backbone_gw->bat_priv->bla.num_requests);
+ atomic_set(&backbone_gw->request_sent, 0);
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
+ msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
+}
+
+/* The hash for claim and backbone hash receive the same key because they
+ * are getting initialized by hash_new with the same key. Reinitializing
+ * them with to different keys to allow nested locking without generating
+ * lockdep warnings
+ */
+static struct lock_class_key batadv_claim_hash_lock_class_key;
+static struct lock_class_key batadv_backbone_hash_lock_class_key;
+
+/* initialize all bla structures */
+int batadv_bla_init(struct batadv_priv *bat_priv)
+{
+ int i;
+ uint8_t claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00};
+ struct batadv_hard_iface *primary_if;
+ uint16_t crc;
+ unsigned long entrytime;
+
+ spin_lock_init(&bat_priv->bla.bcast_duplist_lock);
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hash registering\n");
+
+ /* setting claim destination address */
+ memcpy(&bat_priv->bla.claim_dest.magic, claim_dest, 3);
+ bat_priv->bla.claim_dest.type = 0;
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (primary_if) {
+ crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN);
+ bat_priv->bla.claim_dest.group = htons(crc);
+ batadv_hardif_free_ref(primary_if);
+ } else {
+ bat_priv->bla.claim_dest.group = 0; /* will be set later */
+ }
+
+ /* initialize the duplicate list */
+ entrytime = jiffies - msecs_to_jiffies(BATADV_DUPLIST_TIMEOUT);
+ for (i = 0; i < BATADV_DUPLIST_SIZE; i++)
+ bat_priv->bla.bcast_duplist[i].entrytime = entrytime;
+ bat_priv->bla.bcast_duplist_curr = 0;
+
+ if (bat_priv->bla.claim_hash)
+ return 0;
+
+ bat_priv->bla.claim_hash = batadv_hash_new(128);
+ bat_priv->bla.backbone_hash = batadv_hash_new(32);
+
+ if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash)
+ return -ENOMEM;
+
+ batadv_hash_set_lock_class(bat_priv->bla.claim_hash,
+ &batadv_claim_hash_lock_class_key);
+ batadv_hash_set_lock_class(bat_priv->bla.backbone_hash,
+ &batadv_backbone_hash_lock_class_key);
+
+ batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n");
+
+ INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
+ msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
+ return 0;
+}
+
+/**
+ * batadv_bla_check_bcast_duplist
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: contains the bcast_packet to be checked
+ *
+ * check if it is on our broadcast list. Another gateway might
+ * have sent the same packet because it is connected to the same backbone,
+ * so we have to remove this duplicate.
+ *
+ * This is performed by checking the CRC, which will tell us
+ * with a good chance that it is the same packet. If it is furthermore
+ * sent by another host, drop it. We allow equal packets from
+ * the same host however as this might be intended.
+ */
+int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ int i, curr, ret = 0;
+ __be32 crc;
+ struct batadv_bcast_packet *bcast_packet;
+ struct batadv_bcast_duplist_entry *entry;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+
+ /* calculate the crc ... */
+ crc = batadv_skb_crc32(skb, (u8 *)(bcast_packet + 1));
+
+ spin_lock_bh(&bat_priv->bla.bcast_duplist_lock);
+
+ for (i = 0; i < BATADV_DUPLIST_SIZE; i++) {
+ curr = (bat_priv->bla.bcast_duplist_curr + i);
+ curr %= BATADV_DUPLIST_SIZE;
+ entry = &bat_priv->bla.bcast_duplist[curr];
+
+ /* we can stop searching if the entry is too old ;
+ * later entries will be even older
+ */
+ if (batadv_has_timed_out(entry->entrytime,
+ BATADV_DUPLIST_TIMEOUT))
+ break;
+
+ if (entry->crc != crc)
+ continue;
+
+ if (batadv_compare_eth(entry->orig, bcast_packet->orig))
+ continue;
+
+ /* this entry seems to match: same crc, not too old,
+ * and from another gw. therefore return 1 to forbid it.
+ */
+ ret = 1;
+ goto out;
+ }
+ /* not found, add a new entry (overwrite the oldest entry)
+ * and allow it, its the first occurrence.
+ */
+ curr = (bat_priv->bla.bcast_duplist_curr + BATADV_DUPLIST_SIZE - 1);
+ curr %= BATADV_DUPLIST_SIZE;
+ entry = &bat_priv->bla.bcast_duplist[curr];
+ entry->crc = crc;
+ entry->entrytime = jiffies;
+ ether_addr_copy(entry->orig, bcast_packet->orig);
+ bat_priv->bla.bcast_duplist_curr = curr;
+
+out:
+ spin_unlock_bh(&bat_priv->bla.bcast_duplist_lock);
+
+ return ret;
+}
+
+/**
+ * batadv_bla_is_backbone_gw_orig
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: originator mac address
+ * @vid: VLAN identifier
+ *
+ * Check if the originator is a gateway for the VLAN identified by vid.
+ *
+ * Returns true if orig is a backbone for this vid, false otherwise.
+ */
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
+ struct hlist_head *head;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ int i;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ return false;
+
+ if (!hash)
+ return false;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ if (batadv_compare_eth(backbone_gw->orig, orig) &&
+ backbone_gw->vid == vid) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return false;
+}
+
+/**
+ * batadv_bla_is_backbone_gw
+ * @skb: the frame to be checked
+ * @orig_node: the orig_node of the frame
+ * @hdr_size: maximum length of the frame
+ *
+ * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1
+ * if the orig_node is also a gateway on the soft interface, otherwise it
+ * returns 0.
+ */
+int batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node, int hdr_size)
+{
+ struct batadv_bla_backbone_gw *backbone_gw;
+ unsigned short vid;
+
+ if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance))
+ return 0;
+
+ /* first, find out the vid. */
+ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN))
+ return 0;
+
+ vid = batadv_get_vid(skb, hdr_size);
+
+ /* see if this originator is a backbone gw for this VLAN */
+ backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv,
+ orig_node->orig, vid);
+ if (!backbone_gw)
+ return 0;
+
+ batadv_backbone_gw_free_ref(backbone_gw);
+ return 1;
+}
+
+/* free all bla structures (for softinterface free or module unload) */
+void batadv_bla_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hard_iface *primary_if;
+
+ cancel_delayed_work_sync(&bat_priv->bla.work);
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (bat_priv->bla.claim_hash) {
+ batadv_bla_purge_claims(bat_priv, primary_if, 1);
+ batadv_hash_destroy(bat_priv->bla.claim_hash);
+ bat_priv->bla.claim_hash = NULL;
+ }
+ if (bat_priv->bla.backbone_hash) {
+ batadv_bla_purge_backbone_gw(bat_priv, 1);
+ batadv_hash_destroy(bat_priv->bla.backbone_hash);
+ bat_priv->bla.backbone_hash = NULL;
+ }
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+/**
+ * batadv_bla_rx
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the frame to be checked
+ * @vid: the VLAN ID of the frame
+ * @is_bcast: the packet came in a broadcast packet type.
+ *
+ * bla_rx avoidance checks if:
+ * * we have to race for a claim
+ * * if the frame is allowed on the LAN
+ *
+ * in these cases, the skb is further handled by this function and
+ * returns 1, otherwise it returns 0 and the caller shall further
+ * process the skb.
+ */
+int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, bool is_bcast)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_bla_claim search_claim, *claim = NULL;
+ struct batadv_hard_iface *primary_if;
+ int ret;
+
+ ethhdr = eth_hdr(skb);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto handled;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto allow;
+
+ if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
+ /* don't allow broadcasts while requests are in flight */
+ if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast)
+ goto handled;
+
+ ether_addr_copy(search_claim.addr, ethhdr->h_source);
+ search_claim.vid = vid;
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ if (!claim) {
+ /* possible optimization: race for a claim */
+ /* No claim exists yet, claim it for us!
+ */
+ batadv_handle_claim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ }
+
+ /* if it is our own claim ... */
+ if (batadv_compare_eth(claim->backbone_gw->orig,
+ primary_if->net_dev->dev_addr)) {
+ /* ... allow it in any case */
+ claim->lasttime = jiffies;
+ goto allow;
+ }
+
+ /* if it is a broadcast ... */
+ if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) {
+ /* ... drop it. the responsible gateway is in charge.
+ *
+ * We need to check is_bcast because with the gateway
+ * feature, broadcasts (like DHCP requests) may be sent
+ * using a unicast packet type.
+ */
+ goto handled;
+ } else {
+ /* seems the client considers us as its best gateway.
+ * send a claim and update the claim table
+ * immediately.
+ */
+ batadv_handle_claim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ }
+allow:
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+ ret = 0;
+ goto out;
+
+handled:
+ kfree_skb(skb);
+ ret = 1;
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (claim)
+ batadv_claim_free_ref(claim);
+ return ret;
+}
+
+/**
+ * batadv_bla_tx
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the frame to be checked
+ * @vid: the VLAN ID of the frame
+ *
+ * bla_tx checks if:
+ * * a claim was received which has to be processed
+ * * the frame is allowed on the mesh
+ *
+ * in these cases, the skb is further handled by this function and
+ * returns 1, otherwise it returns 0 and the caller shall further
+ * process the skb.
+ *
+ * This call might reallocate skb data.
+ */
+int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_bla_claim search_claim, *claim = NULL;
+ struct batadv_hard_iface *primary_if;
+ int ret = 0;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ if (!atomic_read(&bat_priv->bridge_loop_avoidance))
+ goto allow;
+
+ if (batadv_bla_process_claim(bat_priv, primary_if, skb))
+ goto handled;
+
+ ethhdr = eth_hdr(skb);
+
+ if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
+ /* don't allow broadcasts while requests are in flight */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ goto handled;
+
+ ether_addr_copy(search_claim.addr, ethhdr->h_source);
+ search_claim.vid = vid;
+
+ claim = batadv_claim_hash_find(bat_priv, &search_claim);
+
+ /* if no claim exists, allow it. */
+ if (!claim)
+ goto allow;
+
+ /* check if we are responsible. */
+ if (batadv_compare_eth(claim->backbone_gw->orig,
+ primary_if->net_dev->dev_addr)) {
+ /* if yes, the client has roamed and we have
+ * to unclaim it.
+ */
+ batadv_handle_unclaim(bat_priv, primary_if,
+ primary_if->net_dev->dev_addr,
+ ethhdr->h_source, vid);
+ goto allow;
+ }
+
+ /* check if it is a multicast/broadcast frame */
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* drop it. the responsible gateway has forwarded it into
+ * the backbone network.
+ */
+ goto handled;
+ } else {
+ /* we must allow it. at least if we are
+ * responsible for the DESTINATION.
+ */
+ goto allow;
+ }
+allow:
+ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid);
+ ret = 0;
+ goto out;
+handled:
+ ret = 1;
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (claim)
+ batadv_claim_free_ref(claim);
+ return ret;
+}
+
+int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
+ struct batadv_bla_claim *claim;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ uint32_t i;
+ bool is_own;
+ uint8_t *primary_addr;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ primary_addr = primary_if->net_dev->dev_addr;
+ seq_printf(seq,
+ "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n",
+ net_dev->name, primary_addr,
+ ntohs(bat_priv->bla.claim_dest.group));
+ seq_printf(seq, " %-17s %-5s %-17s [o] (%-6s)\n",
+ "Client", "VID", "Originator", "CRC");
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ is_own = batadv_compare_eth(claim->backbone_gw->orig,
+ primary_addr);
+ seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n",
+ claim->addr, BATADV_PRINT_VID(claim->vid),
+ claim->backbone_gw->orig,
+ (is_own ? 'x' : ' '),
+ claim->backbone_gw->crc);
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ int secs, msecs;
+ uint32_t i;
+ bool is_own;
+ uint8_t *primary_addr;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ primary_addr = primary_if->net_dev->dev_addr;
+ seq_printf(seq,
+ "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n",
+ net_dev->name, primary_addr,
+ ntohs(bat_priv->bla.claim_dest.group));
+ seq_printf(seq, " %-17s %-5s %-9s (%-6s)\n",
+ "Originator", "VID", "last seen", "CRC");
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ msecs = jiffies_to_msecs(jiffies -
+ backbone_gw->lasttime);
+ secs = msecs / 1000;
+ msecs = msecs % 1000;
+
+ is_own = batadv_compare_eth(backbone_gw->orig,
+ primary_addr);
+ if (is_own)
+ continue;
+
+ seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n",
+ backbone_gw->orig,
+ BATADV_PRINT_VID(backbone_gw->vid), secs,
+ msecs, backbone_gw->crc);
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
new file mode 100644
index 000000000..43c985d92
--- /dev/null
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -0,0 +1,108 @@
+/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_BLA_H_
+#define _NET_BATMAN_ADV_BLA_H_
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, bool is_bcast);
+int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid);
+int batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node, int hdr_size);
+int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
+ void *offset);
+bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig,
+ unsigned short vid);
+int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif);
+int batadv_bla_init(struct batadv_priv *bat_priv);
+void batadv_bla_free(struct batadv_priv *bat_priv);
+
+#define BATADV_BLA_CRC_INIT 0
+#else /* ifdef CONFIG_BATMAN_ADV_BLA */
+
+static inline int batadv_bla_rx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ bool is_bcast)
+{
+ return 0;
+}
+
+static inline int batadv_bla_tx(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ return 0;
+}
+
+static inline int batadv_bla_is_backbone_gw(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int hdr_size)
+{
+ return 0;
+}
+
+static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq,
+ void *offset)
+{
+ return 0;
+}
+
+static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
+ void *offset)
+{
+ return 0;
+}
+
+static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
+ uint8_t *orig,
+ unsigned short vid)
+{
+ return false;
+}
+
+static inline int
+batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline void
+batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if,
+ struct batadv_hard_iface *oldif)
+{
+}
+
+static inline int batadv_bla_init(struct batadv_priv *bat_priv)
+{
+ return 1;
+}
+
+static inline void batadv_bla_free(struct batadv_priv *bat_priv)
+{
+}
+
+#endif /* ifdef CONFIG_BATMAN_ADV_BLA */
+
+#endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
new file mode 100644
index 000000000..a4972874c
--- /dev/null
+++ b/net/batman-adv/debugfs.c
@@ -0,0 +1,561 @@
+/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+#include <linux/debugfs.h>
+
+#include "debugfs.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "soft-interface.h"
+#include "icmp_socket.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "network-coding.h"
+
+static struct dentry *batadv_debugfs;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
+
+static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
+
+static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log,
+ size_t idx)
+{
+ return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK];
+}
+
+static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log,
+ char c)
+{
+ char *char_addr;
+
+ char_addr = batadv_log_char_addr(debug_log, debug_log->log_end);
+ *char_addr = c;
+ debug_log->log_end++;
+
+ if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len)
+ debug_log->log_start = debug_log->log_end - batadv_log_buff_len;
+}
+
+__printf(2, 3)
+static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
+ const char *fmt, ...)
+{
+ va_list args;
+ static char debug_log_buf[256];
+ char *p;
+
+ if (!debug_log)
+ return 0;
+
+ spin_lock_bh(&debug_log->lock);
+ va_start(args, fmt);
+ vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
+ va_end(args);
+
+ for (p = debug_log_buf; *p != 0; p++)
+ batadv_emit_log_char(debug_log, *p);
+
+ spin_unlock_bh(&debug_log->lock);
+
+ wake_up(&debug_log->queue_wait);
+
+ return 0;
+}
+
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+{
+ va_list args;
+ char tmp_log_buf[256];
+
+ va_start(args, fmt);
+ vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
+ batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
+ jiffies_to_msecs(jiffies), tmp_log_buf);
+ va_end(args);
+
+ return 0;
+}
+
+static int batadv_log_open(struct inode *inode, struct file *file)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ nonseekable_open(inode, file);
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static int batadv_log_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static int batadv_log_empty(struct batadv_priv_debug_log *debug_log)
+{
+ return !(debug_log->log_start - debug_log->log_end);
+}
+
+static ssize_t batadv_log_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct batadv_priv *bat_priv = file->private_data;
+ struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
+ int error, i = 0;
+ char *char_addr;
+ char c;
+
+ if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log))
+ return -EAGAIN;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (count == 0)
+ return 0;
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ error = wait_event_interruptible(debug_log->queue_wait,
+ (!batadv_log_empty(debug_log)));
+
+ if (error)
+ return error;
+
+ spin_lock_bh(&debug_log->lock);
+
+ while ((!error) && (i < count) &&
+ (debug_log->log_start != debug_log->log_end)) {
+ char_addr = batadv_log_char_addr(debug_log,
+ debug_log->log_start);
+ c = *char_addr;
+
+ debug_log->log_start++;
+
+ spin_unlock_bh(&debug_log->lock);
+
+ error = __put_user(c, buf);
+
+ spin_lock_bh(&debug_log->lock);
+
+ buf++;
+ i++;
+ }
+
+ spin_unlock_bh(&debug_log->lock);
+
+ if (!error)
+ return i;
+
+ return error;
+}
+
+static unsigned int batadv_log_poll(struct file *file, poll_table *wait)
+{
+ struct batadv_priv *bat_priv = file->private_data;
+ struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
+
+ poll_wait(file, &debug_log->queue_wait, wait);
+
+ if (!batadv_log_empty(debug_log))
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static const struct file_operations batadv_log_fops = {
+ .open = batadv_log_open,
+ .release = batadv_log_release,
+ .read = batadv_log_read,
+ .poll = batadv_log_poll,
+ .llseek = no_llseek,
+};
+
+static int batadv_debug_log_setup(struct batadv_priv *bat_priv)
+{
+ struct dentry *d;
+
+ if (!bat_priv->debug_dir)
+ goto err;
+
+ bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
+ if (!bat_priv->debug_log)
+ goto err;
+
+ spin_lock_init(&bat_priv->debug_log->lock);
+ init_waitqueue_head(&bat_priv->debug_log->queue_wait);
+
+ d = debugfs_create_file("log", S_IFREG | S_IRUSR,
+ bat_priv->debug_dir, bat_priv,
+ &batadv_log_fops);
+ if (!d)
+ goto err;
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
+{
+ kfree(bat_priv->debug_log);
+ bat_priv->debug_log = NULL;
+}
+#else /* CONFIG_BATMAN_ADV_DEBUG */
+static int batadv_debug_log_setup(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
+{
+}
+#endif
+
+static int batadv_algorithms_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, batadv_algo_seq_print_text, NULL);
+}
+
+static int batadv_originators_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_orig_seq_print_text, net_dev);
+}
+
+/**
+ * batadv_originators_hardif_open - handles debugfs output for the
+ * originator table of an hard interface
+ * @inode: inode pointer to debugfs file
+ * @file: pointer to the seq_file
+ */
+static int batadv_originators_hardif_open(struct inode *inode,
+ struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_orig_hardif_seq_print_text, net_dev);
+}
+
+static int batadv_gateways_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_gw_client_seq_print_text, net_dev);
+}
+
+static int batadv_transtable_global_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_tt_global_seq_print_text, net_dev);
+}
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+static int batadv_bla_claim_table_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_bla_claim_table_seq_print_text,
+ net_dev);
+}
+
+static int batadv_bla_backbone_table_open(struct inode *inode,
+ struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_bla_backbone_table_seq_print_text,
+ net_dev);
+}
+
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+/**
+ * batadv_dat_cache_open - Prepare file handler for reads from dat_chache
+ * @inode: inode which was opened
+ * @file: file handle to be initialized
+ */
+static int batadv_dat_cache_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_dat_cache_seq_print_text, net_dev);
+}
+#endif
+
+static int batadv_transtable_local_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_tt_local_seq_print_text, net_dev);
+}
+
+struct batadv_debuginfo {
+ struct attribute attr;
+ const struct file_operations fops;
+};
+
+#ifdef CONFIG_BATMAN_ADV_NC
+static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
+}
+#endif
+
+#define BATADV_DEBUGINFO(_name, _mode, _open) \
+struct batadv_debuginfo batadv_debuginfo_##_name = { \
+ .attr = { .name = __stringify(_name), \
+ .mode = _mode, }, \
+ .fops = { .owner = THIS_MODULE, \
+ .open = _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ } \
+}
+
+/* the following attributes are general and therefore they will be directly
+ * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
+ */
+static BATADV_DEBUGINFO(routing_algos, S_IRUGO, batadv_algorithms_open);
+
+static struct batadv_debuginfo *batadv_general_debuginfos[] = {
+ &batadv_debuginfo_routing_algos,
+ NULL,
+};
+
+/* The following attributes are per soft interface */
+static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open);
+static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open);
+static BATADV_DEBUGINFO(transtable_global, S_IRUGO,
+ batadv_transtable_global_open);
+#ifdef CONFIG_BATMAN_ADV_BLA
+static BATADV_DEBUGINFO(bla_claim_table, S_IRUGO, batadv_bla_claim_table_open);
+static BATADV_DEBUGINFO(bla_backbone_table, S_IRUGO,
+ batadv_bla_backbone_table_open);
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+static BATADV_DEBUGINFO(dat_cache, S_IRUGO, batadv_dat_cache_open);
+#endif
+static BATADV_DEBUGINFO(transtable_local, S_IRUGO,
+ batadv_transtable_local_open);
+#ifdef CONFIG_BATMAN_ADV_NC
+static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
+#endif
+
+static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
+ &batadv_debuginfo_originators,
+ &batadv_debuginfo_gateways,
+ &batadv_debuginfo_transtable_global,
+#ifdef CONFIG_BATMAN_ADV_BLA
+ &batadv_debuginfo_bla_claim_table,
+ &batadv_debuginfo_bla_backbone_table,
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ &batadv_debuginfo_dat_cache,
+#endif
+ &batadv_debuginfo_transtable_local,
+#ifdef CONFIG_BATMAN_ADV_NC
+ &batadv_debuginfo_nc_nodes,
+#endif
+ NULL,
+};
+
+#define BATADV_HARDIF_DEBUGINFO(_name, _mode, _open) \
+struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = _mode, \
+ }, \
+ .fops = { \
+ .owner = THIS_MODULE, \
+ .open = _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ }, \
+}
+
+static BATADV_HARDIF_DEBUGINFO(originators, S_IRUGO,
+ batadv_originators_hardif_open);
+
+static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
+ &batadv_hardif_debuginfo_originators,
+ NULL,
+};
+
+void batadv_debugfs_init(void)
+{
+ struct batadv_debuginfo **bat_debug;
+ struct dentry *file;
+
+ batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL);
+ if (batadv_debugfs == ERR_PTR(-ENODEV))
+ batadv_debugfs = NULL;
+
+ if (!batadv_debugfs)
+ goto err;
+
+ for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) {
+ file = debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ batadv_debugfs, NULL,
+ &(*bat_debug)->fops);
+ if (!file) {
+ pr_err("Can't add general debugfs file: %s\n",
+ ((*bat_debug)->attr).name);
+ goto err;
+ }
+ }
+
+ return;
+err:
+ debugfs_remove_recursive(batadv_debugfs);
+ batadv_debugfs = NULL;
+}
+
+void batadv_debugfs_destroy(void)
+{
+ debugfs_remove_recursive(batadv_debugfs);
+ batadv_debugfs = NULL;
+}
+
+/**
+ * batadv_debugfs_add_hardif - creates the base directory for a hard interface
+ * in debugfs.
+ * @hard_iface: hard interface which should be added.
+ */
+int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_debuginfo **bat_debug;
+ struct dentry *file;
+
+ if (!batadv_debugfs)
+ goto out;
+
+ hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
+ batadv_debugfs);
+ if (!hard_iface->debug_dir)
+ goto out;
+
+ for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) {
+ file = debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ hard_iface->debug_dir,
+ hard_iface->net_dev,
+ &(*bat_debug)->fops);
+ if (!file)
+ goto rem_attr;
+ }
+
+ return 0;
+rem_attr:
+ debugfs_remove_recursive(hard_iface->debug_dir);
+ hard_iface->debug_dir = NULL;
+out:
+#ifdef CONFIG_DEBUG_FS
+ return -ENOMEM;
+#else
+ return 0;
+#endif /* CONFIG_DEBUG_FS */
+}
+
+/**
+ * batadv_debugfs_del_hardif - delete the base directory for a hard interface
+ * in debugfs.
+ * @hard_iface: hard interface which is deleted.
+ */
+void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
+{
+ if (batadv_debugfs) {
+ debugfs_remove_recursive(hard_iface->debug_dir);
+ hard_iface->debug_dir = NULL;
+ }
+}
+
+int batadv_debugfs_add_meshif(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_debuginfo **bat_debug;
+ struct dentry *file;
+
+ if (!batadv_debugfs)
+ goto out;
+
+ bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
+ if (!bat_priv->debug_dir)
+ goto out;
+
+ if (batadv_socket_setup(bat_priv) < 0)
+ goto rem_attr;
+
+ if (batadv_debug_log_setup(bat_priv) < 0)
+ goto rem_attr;
+
+ for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) {
+ file = debugfs_create_file(((*bat_debug)->attr).name,
+ S_IFREG | ((*bat_debug)->attr).mode,
+ bat_priv->debug_dir,
+ dev, &(*bat_debug)->fops);
+ if (!file) {
+ batadv_err(dev, "Can't add debugfs file: %s/%s\n",
+ dev->name, ((*bat_debug)->attr).name);
+ goto rem_attr;
+ }
+ }
+
+ if (batadv_nc_init_debugfs(bat_priv) < 0)
+ goto rem_attr;
+
+ return 0;
+rem_attr:
+ debugfs_remove_recursive(bat_priv->debug_dir);
+ bat_priv->debug_dir = NULL;
+out:
+#ifdef CONFIG_DEBUG_FS
+ return -ENOMEM;
+#else
+ return 0;
+#endif /* CONFIG_DEBUG_FS */
+}
+
+void batadv_debugfs_del_meshif(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+
+ batadv_debug_log_cleanup(bat_priv);
+
+ if (batadv_debugfs) {
+ debugfs_remove_recursive(bat_priv->debug_dir);
+ bat_priv->debug_dir = NULL;
+ }
+}
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
new file mode 100644
index 000000000..37c4d6ddd
--- /dev/null
+++ b/net/batman-adv/debugfs.h
@@ -0,0 +1,30 @@
+/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_DEBUGFS_H_
+#define _NET_BATMAN_ADV_DEBUGFS_H_
+
+#define BATADV_DEBUGFS_SUBDIR "batman_adv"
+
+void batadv_debugfs_init(void);
+void batadv_debugfs_destroy(void);
+int batadv_debugfs_add_meshif(struct net_device *dev);
+void batadv_debugfs_del_meshif(struct net_device *dev);
+int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
+void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
+
+#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
new file mode 100644
index 000000000..aad022dd1
--- /dev/null
+++ b/net/batman-adv/distributed-arp-table.c
@@ -0,0 +1,1204 @@
+/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <net/arp.h>
+
+#include "main.h"
+#include "hash.h"
+#include "distributed-arp-table.h"
+#include "hard-interface.h"
+#include "originator.h"
+#include "send.h"
+#include "types.h"
+#include "translation-table.h"
+
+static void batadv_dat_purge(struct work_struct *work);
+
+/**
+ * batadv_dat_start_timer - initialise the DAT periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
+{
+ INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge);
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->dat.work,
+ msecs_to_jiffies(10000));
+}
+
+/**
+ * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly
+ * free it
+ * @dat_entry: the entry to free
+ */
+static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry)
+{
+ if (atomic_dec_and_test(&dat_entry->refcount))
+ kfree_rcu(dat_entry, rcu);
+}
+
+/**
+ * batadv_dat_to_purge - check whether a dat_entry has to be purged or not
+ * @dat_entry: the entry to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise.
+ */
+static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
+{
+ return batadv_has_timed_out(dat_entry->last_update,
+ BATADV_DAT_ENTRY_TIMEOUT);
+}
+
+/**
+ * __batadv_dat_purge - delete entries from the DAT local storage
+ * @bat_priv: the bat priv with all the soft interface information
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the dat_entry as argument and has to
+ * returns a boolean value: true is the entry has to be deleted,
+ * false otherwise
+ *
+ * Loops over each entry in the DAT local storage and deletes it if and only if
+ * the to_purge function passed as argument returns true.
+ */
+static void __batadv_dat_purge(struct batadv_priv *bat_priv,
+ bool (*to_purge)(struct batadv_dat_entry *))
+{
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_dat_entry *dat_entry;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ uint32_t i;
+
+ if (!bat_priv->dat.hash)
+ return;
+
+ for (i = 0; i < bat_priv->dat.hash->size; i++) {
+ head = &bat_priv->dat.hash->table[i];
+ list_lock = &bat_priv->dat.hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(dat_entry, node_tmp, head,
+ hash_entry) {
+ /* if a helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(dat_entry))
+ continue;
+
+ hlist_del_rcu(&dat_entry->hash_entry);
+ batadv_dat_entry_free_ref(dat_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_dat_purge - periodic task that deletes old entries from the local DAT
+ * hash table
+ * @work: kernel work struct
+ */
+static void batadv_dat_purge(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_dat *priv_dat;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ priv_dat = container_of(delayed_work, struct batadv_priv_dat, work);
+ bat_priv = container_of(priv_dat, struct batadv_priv, dat);
+
+ __batadv_dat_purge(bat_priv, batadv_dat_to_purge);
+ batadv_dat_start_timer(bat_priv);
+}
+
+/**
+ * batadv_compare_dat - comparing function used in the local DAT hash table
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Returns 1 if the two entries are the same, 0 otherwise.
+ */
+static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_dat_entry,
+ hash_entry);
+
+ return memcmp(data1, data2, sizeof(__be32)) == 0 ? 1 : 0;
+}
+
+/**
+ * batadv_arp_hw_src - extract the hw_src field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Returns the value of the hw_src field in the ARP packet.
+ */
+static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
+{
+ uint8_t *addr;
+
+ addr = (uint8_t *)(skb->data + hdr_size);
+ addr += ETH_HLEN + sizeof(struct arphdr);
+
+ return addr;
+}
+
+/**
+ * batadv_arp_ip_src - extract the ip_src field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Returns the value of the ip_src field in the ARP packet.
+ */
+static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
+{
+ return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
+}
+
+/**
+ * batadv_arp_hw_dst - extract the hw_dst field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Returns the value of the hw_dst field in the ARP packet.
+ */
+static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
+{
+ return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4;
+}
+
+/**
+ * batadv_arp_ip_dst - extract the ip_dst field from an ARP packet
+ * @skb: ARP packet
+ * @hdr_size: size of the possible header before the ARP packet
+ *
+ * Returns the value of the ip_dst field in the ARP packet.
+ */
+static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
+{
+ return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4);
+}
+
+/**
+ * batadv_hash_dat - compute the hash value for an IP address
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Returns the selected index in the hash table for the given data.
+ */
+static uint32_t batadv_hash_dat(const void *data, uint32_t size)
+{
+ uint32_t hash = 0;
+ const struct batadv_dat_entry *dat = data;
+
+ hash = batadv_hash_bytes(hash, &dat->ip, sizeof(dat->ip));
+ hash = batadv_hash_bytes(hash, &dat->vid, sizeof(dat->vid));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/**
+ * batadv_dat_entry_hash_find - look for a given dat_entry in the local hash
+ * table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip: search key
+ * @vid: VLAN identifier
+ *
+ * Returns the dat_entry if found, NULL otherwise.
+ */
+static struct batadv_dat_entry *
+batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL;
+ struct batadv_hashtable *hash = bat_priv->dat.hash;
+ uint32_t index;
+
+ if (!hash)
+ return NULL;
+
+ to_find.ip = ip;
+ to_find.vid = vid;
+
+ index = batadv_hash_dat(&to_find, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+ if (dat_entry->ip != ip)
+ continue;
+
+ if (!atomic_inc_not_zero(&dat_entry->refcount))
+ continue;
+
+ dat_entry_tmp = dat_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return dat_entry_tmp;
+}
+
+/**
+ * batadv_dat_entry_add - add a new dat entry or update it if already exists
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip: ipv4 to add/edit
+ * @mac_addr: mac address to assign to the given ipv4
+ * @vid: VLAN identifier
+ */
+static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
+ uint8_t *mac_addr, unsigned short vid)
+{
+ struct batadv_dat_entry *dat_entry;
+ int hash_added;
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip, vid);
+ /* if this entry is already known, just update it */
+ if (dat_entry) {
+ if (!batadv_compare_eth(dat_entry->mac_addr, mac_addr))
+ ether_addr_copy(dat_entry->mac_addr, mac_addr);
+ dat_entry->last_update = jiffies;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Entry updated: %pI4 %pM (vid: %d)\n",
+ &dat_entry->ip, dat_entry->mac_addr,
+ BATADV_PRINT_VID(vid));
+ goto out;
+ }
+
+ dat_entry = kmalloc(sizeof(*dat_entry), GFP_ATOMIC);
+ if (!dat_entry)
+ goto out;
+
+ dat_entry->ip = ip;
+ dat_entry->vid = vid;
+ ether_addr_copy(dat_entry->mac_addr, mac_addr);
+ dat_entry->last_update = jiffies;
+ atomic_set(&dat_entry->refcount, 2);
+
+ hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
+ batadv_hash_dat, dat_entry,
+ &dat_entry->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_dat_entry_free_ref(dat_entry);
+ goto out;
+ }
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "New entry added: %pI4 %pM (vid: %d)\n",
+ &dat_entry->ip, dat_entry->mac_addr, BATADV_PRINT_VID(vid));
+
+out:
+ if (dat_entry)
+ batadv_dat_entry_free_ref(dat_entry);
+}
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+
+/**
+ * batadv_dbg_arp - print a debug message containing all the ARP packet details
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: ARP packet
+ * @type: ARP type
+ * @hdr_size: size of the possible header before the ARP packet
+ * @msg: message to print together with the debugging information
+ */
+static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ uint16_t type, int hdr_size, char *msg)
+{
+ struct batadv_unicast_4addr_packet *unicast_4addr_packet;
+ struct batadv_bcast_packet *bcast_pkt;
+ uint8_t *orig_addr;
+ __be32 ip_src, ip_dst;
+
+ if (msg)
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "%s\n", msg);
+
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]\n",
+ batadv_arp_hw_src(skb, hdr_size), &ip_src,
+ batadv_arp_hw_dst(skb, hdr_size), &ip_dst);
+
+ if (hdr_size == 0)
+ return;
+
+ unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+
+ switch (unicast_4addr_packet->u.packet_type) {
+ case BATADV_UNICAST:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a UNICAST packet\n");
+ break;
+ case BATADV_UNICAST_4ADDR:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a UNICAST_4ADDR packet (src: %pM)\n",
+ unicast_4addr_packet->src);
+ switch (unicast_4addr_packet->subtype) {
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_PUT\n");
+ break;
+ case BATADV_P_DAT_DHT_GET:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DAT_DHT_GET\n");
+ break;
+ case BATADV_P_DAT_CACHE_REPLY:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* type: DAT_CACHE_REPLY\n");
+ break;
+ case BATADV_P_DATA:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: DATA\n");
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "* type: Unknown (%u)!\n",
+ unicast_4addr_packet->u.packet_type);
+ }
+ break;
+ case BATADV_BCAST:
+ bcast_pkt = (struct batadv_bcast_packet *)unicast_4addr_packet;
+ orig_addr = bcast_pkt->orig;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within a BCAST packet (src: %pM)\n",
+ orig_addr);
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "* encapsulated within an unknown packet type (0x%x)\n",
+ unicast_4addr_packet->u.packet_type);
+ }
+}
+
+#else
+
+static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ uint16_t type, int hdr_size, char *msg)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+/**
+ * batadv_is_orig_node_eligible - check whether a node can be a DHT candidate
+ * @res: the array with the already selected candidates
+ * @select: number of already selected candidates
+ * @tmp_max: address of the currently evaluated node
+ * @max: current round max address
+ * @last_max: address of the last selected candidate
+ * @candidate: orig_node under evaluation
+ * @max_orig_node: last selected candidate
+ *
+ * Returns true if the node has been elected as next candidate or false
+ * otherwise.
+ */
+static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
+ int select, batadv_dat_addr_t tmp_max,
+ batadv_dat_addr_t max,
+ batadv_dat_addr_t last_max,
+ struct batadv_orig_node *candidate,
+ struct batadv_orig_node *max_orig_node)
+{
+ bool ret = false;
+ int j;
+
+ /* check if orig node candidate is running DAT */
+ if (!(candidate->capabilities & BATADV_ORIG_CAPA_HAS_DAT))
+ goto out;
+
+ /* Check if this node has already been selected... */
+ for (j = 0; j < select; j++)
+ if (res[j].orig_node == candidate)
+ break;
+ /* ..and possibly skip it */
+ if (j < select)
+ goto out;
+ /* sanity check: has it already been selected? This should not happen */
+ if (tmp_max > last_max)
+ goto out;
+ /* check if during this iteration an originator with a closer dht
+ * address has already been found
+ */
+ if (tmp_max < max)
+ goto out;
+ /* this is an hash collision with the temporary selected node. Choose
+ * the one with the lowest address
+ */
+ if ((tmp_max == max) && max_orig_node &&
+ (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0))
+ goto out;
+
+ ret = true;
+out:
+ return ret;
+}
+
+/**
+ * batadv_choose_next_candidate - select the next DHT candidate
+ * @bat_priv: the bat priv with all the soft interface information
+ * @cands: candidates array
+ * @select: number of candidates already present in the array
+ * @ip_key: key to look up in the DHT
+ * @last_max: pointer where the address of the selected candidate will be saved
+ */
+static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
+ struct batadv_dat_candidate *cands,
+ int select, batadv_dat_addr_t ip_key,
+ batadv_dat_addr_t *last_max)
+{
+ batadv_dat_addr_t max = 0, tmp_max = 0;
+ struct batadv_orig_node *orig_node, *max_orig_node = NULL;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ int i;
+
+ /* if no node is eligible as candidate, leave the candidate type as
+ * NOT_FOUND
+ */
+ cands[select].type = BATADV_DAT_CANDIDATE_NOT_FOUND;
+
+ /* iterate over the originator list and find the node with the closest
+ * dat_address which has not been selected yet
+ */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ /* the dht space is a ring using unsigned addresses */
+ tmp_max = BATADV_DAT_ADDR_MAX - orig_node->dat_addr +
+ ip_key;
+
+ if (!batadv_is_orig_node_eligible(cands, select,
+ tmp_max, max,
+ *last_max, orig_node,
+ max_orig_node))
+ continue;
+
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ continue;
+
+ max = tmp_max;
+ if (max_orig_node)
+ batadv_orig_node_free_ref(max_orig_node);
+ max_orig_node = orig_node;
+ }
+ rcu_read_unlock();
+ }
+ if (max_orig_node) {
+ cands[select].type = BATADV_DAT_CANDIDATE_ORIG;
+ cands[select].orig_node = max_orig_node;
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "dat_select_candidates() %d: selected %pM addr=%u dist=%u\n",
+ select, max_orig_node->orig, max_orig_node->dat_addr,
+ max);
+ }
+ *last_max = max;
+}
+
+/**
+ * batadv_dat_select_candidates - select the nodes which the DHT message has to
+ * be sent to
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ip_dst: ipv4 to look up in the DHT
+ *
+ * An originator O is selected if and only if its DHT_ID value is one of three
+ * closest values (from the LEFT, with wrap around if needed) then the hash
+ * value of the key. ip_dst is the key.
+ *
+ * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM.
+ */
+static struct batadv_dat_candidate *
+batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
+{
+ int select;
+ batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
+ struct batadv_dat_candidate *res;
+
+ if (!bat_priv->orig_hash)
+ return NULL;
+
+ res = kmalloc_array(BATADV_DAT_CANDIDATES_NUM, sizeof(*res),
+ GFP_ATOMIC);
+ if (!res)
+ return NULL;
+
+ ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst,
+ BATADV_DAT_ADDR_MAX);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "dat_select_candidates(): IP=%pI4 hash(IP)=%u\n", &ip_dst,
+ ip_key);
+
+ for (select = 0; select < BATADV_DAT_CANDIDATES_NUM; select++)
+ batadv_choose_next_candidate(bat_priv, res, select, ip_key,
+ &last_max);
+
+ return res;
+}
+
+/**
+ * batadv_dat_send_data - send a payload to the selected candidates
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: payload to send
+ * @ip: the DHT key
+ * @packet_subtype: unicast4addr packet subtype to use
+ *
+ * This function copies the skb with pskb_copy() and is sent as unicast packet
+ * to each of the selected candidates.
+ *
+ * Returns true if the packet is sent to at least one candidate, false
+ * otherwise.
+ */
+static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, __be32 ip,
+ int packet_subtype)
+{
+ int i;
+ bool ret = false;
+ int send_status;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct sk_buff *tmp_skb;
+ struct batadv_dat_candidate *cand;
+
+ cand = batadv_dat_select_candidates(bat_priv, ip);
+ if (!cand)
+ goto out;
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip);
+
+ for (i = 0; i < BATADV_DAT_CANDIDATES_NUM; i++) {
+ if (cand[i].type == BATADV_DAT_CANDIDATE_NOT_FOUND)
+ continue;
+
+ neigh_node = batadv_orig_router_get(cand[i].orig_node,
+ BATADV_IF_DEFAULT);
+ if (!neigh_node)
+ goto free_orig;
+
+ tmp_skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+ if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, tmp_skb,
+ cand[i].orig_node,
+ packet_subtype)) {
+ kfree_skb(tmp_skb);
+ goto free_neigh;
+ }
+
+ send_status = batadv_send_skb_packet(tmp_skb,
+ neigh_node->if_incoming,
+ neigh_node->addr);
+ if (send_status == NET_XMIT_SUCCESS) {
+ /* count the sent packet */
+ switch (packet_subtype) {
+ case BATADV_P_DAT_DHT_GET:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_GET_TX);
+ break;
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_PUT_TX);
+ break;
+ }
+
+ /* packet sent to a candidate: return true */
+ ret = true;
+ }
+free_neigh:
+ batadv_neigh_node_free_ref(neigh_node);
+free_orig:
+ batadv_orig_node_free_ref(cand[i].orig_node);
+ }
+
+out:
+ kfree(cand);
+ return ret;
+}
+
+/**
+ * batadv_dat_tvlv_container_update - update the dat tvlv container after dat
+ * setting change
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ char dat_mode;
+
+ dat_mode = atomic_read(&bat_priv->distributed_arp_table);
+
+ switch (dat_mode) {
+ case 0:
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+ break;
+ case 1:
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_DAT, 1,
+ NULL, 0);
+ break;
+ }
+}
+
+/**
+ * batadv_dat_status_update - update the dat tvlv container after dat
+ * setting change
+ * @net_dev: the soft interface net device
+ */
+void batadv_dat_status_update(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+
+ batadv_dat_tvlv_container_update(bat_priv);
+}
+
+/**
+ * batadv_gw_tvlv_ogm_handler_v1 - process incoming dat tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
+ orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_DAT;
+ else
+ orig->capabilities |= BATADV_ORIG_CAPA_HAS_DAT;
+}
+
+/**
+ * batadv_dat_hash_free - free the local DAT hash table
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
+{
+ if (!bat_priv->dat.hash)
+ return;
+
+ __batadv_dat_purge(bat_priv, NULL);
+
+ batadv_hash_destroy(bat_priv->dat.hash);
+
+ bat_priv->dat.hash = NULL;
+}
+
+/**
+ * batadv_dat_init - initialise the DAT internals
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+int batadv_dat_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->dat.hash)
+ return 0;
+
+ bat_priv->dat.hash = batadv_hash_new(1024);
+
+ if (!bat_priv->dat.hash)
+ return -ENOMEM;
+
+ batadv_dat_start_timer(bat_priv);
+
+ batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1,
+ NULL, BATADV_TVLV_DAT, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ batadv_dat_tvlv_container_update(bat_priv);
+ return 0;
+}
+
+/**
+ * batadv_dat_free - free the DAT internals
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_dat_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_DAT, 1);
+
+ cancel_delayed_work_sync(&bat_priv->dat.work);
+
+ batadv_dat_hash_free(bat_priv);
+}
+
+/**
+ * batadv_dat_cache_seq_print_text - print the local DAT hash table
+ * @seq: seq file to print on
+ * @offset: not used
+ */
+int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->dat.hash;
+ struct batadv_dat_entry *dat_entry;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ unsigned long last_seen_jiffies;
+ int last_seen_msecs, last_seen_secs, last_seen_mins;
+ uint32_t i;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name);
+ seq_printf(seq, " %-7s %-9s %4s %11s\n", "IPv4",
+ "MAC", "VID", "last-seen");
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+ last_seen_jiffies = jiffies - dat_entry->last_update;
+ last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
+ last_seen_mins = last_seen_msecs / 60000;
+ last_seen_msecs = last_seen_msecs % 60000;
+ last_seen_secs = last_seen_msecs / 1000;
+
+ seq_printf(seq, " * %15pI4 %14pM %4i %6i:%02i\n",
+ &dat_entry->ip, dat_entry->mac_addr,
+ BATADV_PRINT_VID(dat_entry->vid),
+ last_seen_mins, last_seen_secs);
+ }
+ rcu_read_unlock();
+ }
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+/**
+ * batadv_arp_get_type - parse an ARP packet and gets the type
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to analyse
+ * @hdr_size: size of the possible header before the ARP packet in the skb
+ *
+ * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise.
+ */
+static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ struct arphdr *arphdr;
+ struct ethhdr *ethhdr;
+ __be32 ip_src, ip_dst;
+ uint8_t *hw_src, *hw_dst;
+ uint16_t type = 0;
+
+ /* pull the ethernet header */
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN)))
+ goto out;
+
+ ethhdr = (struct ethhdr *)(skb->data + hdr_size);
+
+ if (ethhdr->h_proto != htons(ETH_P_ARP))
+ goto out;
+
+ /* pull the ARP payload */
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN +
+ arp_hdr_len(skb->dev))))
+ goto out;
+
+ arphdr = (struct arphdr *)(skb->data + hdr_size + ETH_HLEN);
+
+ /* check whether the ARP packet carries a valid IP information */
+ if (arphdr->ar_hrd != htons(ARPHRD_ETHER))
+ goto out;
+
+ if (arphdr->ar_pro != htons(ETH_P_IP))
+ goto out;
+
+ if (arphdr->ar_hln != ETH_ALEN)
+ goto out;
+
+ if (arphdr->ar_pln != 4)
+ goto out;
+
+ /* Check for bad reply/request. If the ARP message is not sane, DAT
+ * will simply ignore it
+ */
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+ if (ipv4_is_loopback(ip_src) || ipv4_is_multicast(ip_src) ||
+ ipv4_is_loopback(ip_dst) || ipv4_is_multicast(ip_dst) ||
+ ipv4_is_zeronet(ip_src) || ipv4_is_lbcast(ip_src) ||
+ ipv4_is_zeronet(ip_dst) || ipv4_is_lbcast(ip_dst))
+ goto out;
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ if (is_zero_ether_addr(hw_src) || is_multicast_ether_addr(hw_src))
+ goto out;
+
+ /* don't care about the destination MAC address in ARP requests */
+ if (arphdr->ar_op != htons(ARPOP_REQUEST)) {
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ if (is_zero_ether_addr(hw_dst) ||
+ is_multicast_ether_addr(hw_dst))
+ goto out;
+ }
+
+ type = ntohs(arphdr->ar_op);
+out:
+ return type;
+}
+
+/**
+ * batadv_dat_get_vid - extract the VLAN identifier from skb if any
+ * @skb: the buffer containing the packet to extract the VID from
+ * @hdr_size: the size of the batman-adv header encapsulating the packet
+ *
+ * If the packet embedded in the skb is vlan tagged this function returns the
+ * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ */
+static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
+{
+ unsigned short vid;
+
+ vid = batadv_get_vid(skb, *hdr_size);
+
+ /* ARP parsing functions jump forward of hdr_size + ETH_HLEN.
+ * If the header contained in the packet is a VLAN one (which is longer)
+ * hdr_size is updated so that the functions will still skip the
+ * correct amount of bytes.
+ */
+ if (vid & BATADV_VLAN_HAS_TAG)
+ *hdr_size += VLAN_HLEN;
+
+ return vid;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_arp_request - snoop the ARP request and try to
+ * answer using DAT
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to check
+ *
+ * Returns true if the message has been sent to the dht candidates, false
+ * otherwise. In case of a positive return value the message has to be enqueued
+ * to permit the fallback.
+ */
+bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ uint16_t type = 0;
+ __be32 ip_dst, ip_src;
+ uint8_t *hw_src;
+ bool ret = false;
+ struct batadv_dat_entry *dat_entry = NULL;
+ struct sk_buff *skb_new;
+ int hdr_size = 0;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ /* If the node gets an ARP_REQUEST it has to send a DHT_GET unicast
+ * message to the selected DHT candidates
+ */
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ batadv_dbg_arp(bat_priv, skb, type, hdr_size,
+ "Parsing outgoing ARP REQUEST");
+
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ if (dat_entry) {
+ /* If the ARP request is destined for a local client the local
+ * client will answer itself. DAT would only generate a
+ * duplicate packet.
+ *
+ * Moreover, if the soft-interface is enslaved into a bridge, an
+ * additional DAT answer may trigger kernel warnings about
+ * a packet coming from the wrong port.
+ */
+ if (batadv_is_my_client(bat_priv, dat_entry->mac_addr, vid)) {
+ ret = true;
+ goto out;
+ }
+
+ skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
+ bat_priv->soft_iface, ip_dst, hw_src,
+ dat_entry->mac_addr, hw_src);
+ if (!skb_new)
+ goto out;
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ skb_reset_mac_header(skb_new);
+ skb_new->protocol = eth_type_trans(skb_new,
+ bat_priv->soft_iface);
+ bat_priv->stats.rx_packets++;
+ bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
+ bat_priv->soft_iface->last_rx = jiffies;
+
+ netif_rx(skb_new);
+ batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
+ ret = true;
+ } else {
+ /* Send the request to the DHT */
+ ret = batadv_dat_send_data(bat_priv, skb, ip_dst,
+ BATADV_P_DAT_DHT_GET);
+ }
+out:
+ if (dat_entry)
+ batadv_dat_entry_free_ref(dat_entry);
+ return ret;
+}
+
+/**
+ * batadv_dat_snoop_incoming_arp_request - snoop the ARP request and try to
+ * answer using the local DAT storage
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to check
+ * @hdr_size: size of the encapsulation header
+ *
+ * Returns true if the request has been answered, false otherwise.
+ */
+bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ uint16_t type;
+ __be32 ip_src, ip_dst;
+ uint8_t *hw_src;
+ struct sk_buff *skb_new;
+ struct batadv_dat_entry *dat_entry = NULL;
+ bool ret = false;
+ unsigned short vid;
+ int err;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dbg_arp(bat_priv, skb, type, hdr_size,
+ "Parsing incoming ARP REQUEST");
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ if (!dat_entry)
+ goto out;
+
+ skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
+ bat_priv->soft_iface, ip_dst, hw_src,
+ dat_entry->mac_addr, hw_src);
+
+ if (!skb_new)
+ goto out;
+
+ /* the rest of the TX path assumes that the mac_header offset pointing
+ * to the inner Ethernet header has been set, therefore reset it now.
+ */
+ skb_reset_mac_header(skb_new);
+
+ if (vid & BATADV_VLAN_HAS_TAG)
+ skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q),
+ vid & VLAN_VID_MASK);
+
+ /* To preserve backwards compatibility, the node has choose the outgoing
+ * format based on the incoming request packet type. The assumption is
+ * that a node not using the 4addr packet format doesn't support it.
+ */
+ if (hdr_size == sizeof(struct batadv_unicast_4addr_packet))
+ err = batadv_send_skb_via_tt_4addr(bat_priv, skb_new,
+ BATADV_P_DAT_CACHE_REPLY,
+ NULL, vid);
+ else
+ err = batadv_send_skb_via_tt(bat_priv, skb_new, NULL, vid);
+
+ if (err != NET_XMIT_DROP) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_DAT_CACHED_REPLY_TX);
+ ret = true;
+ }
+out:
+ if (dat_entry)
+ batadv_dat_entry_free_ref(dat_entry);
+ if (ret)
+ kfree_skb(skb);
+ return ret;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_arp_reply - snoop the ARP reply and fill the DHT
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to check
+ */
+void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ uint16_t type;
+ __be32 ip_src, ip_dst;
+ uint8_t *hw_src, *hw_dst;
+ int hdr_size = 0;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REPLY)
+ return;
+
+ batadv_dbg_arp(bat_priv, skb, type, hdr_size,
+ "Parsing outgoing ARP REPLY");
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ /* Send the ARP reply to the candidates for both the IP addresses that
+ * the node obtained from the ARP reply
+ */
+ batadv_dat_send_data(bat_priv, skb, ip_src, BATADV_P_DAT_DHT_PUT);
+ batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_PUT);
+}
+
+/**
+ * batadv_dat_snoop_incoming_arp_reply - snoop the ARP reply and fill the local
+ * DAT storage only
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to check
+ * @hdr_size: size of the encapsulation header
+ */
+bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ uint16_t type;
+ __be32 ip_src, ip_dst;
+ uint8_t *hw_src, *hw_dst;
+ bool ret = false;
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, skb, hdr_size);
+ if (type != ARPOP_REPLY)
+ goto out;
+
+ batadv_dbg_arp(bat_priv, skb, type, hdr_size,
+ "Parsing incoming ARP REPLY");
+
+ hw_src = batadv_arp_hw_src(skb, hdr_size);
+ ip_src = batadv_arp_ip_src(skb, hdr_size);
+ hw_dst = batadv_arp_hw_dst(skb, hdr_size);
+ ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+
+ /* Update our internal cache with both the IP addresses the node got
+ * within the ARP reply
+ */
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ /* if this REPLY is directed to a client of mine, let's deliver the
+ * packet to the interface
+ */
+ ret = !batadv_is_my_client(bat_priv, hw_dst, vid);
+out:
+ if (ret)
+ kfree_skb(skb);
+ /* if ret == false -> packet has to be delivered to the interface */
+ return ret;
+}
+
+/**
+ * batadv_dat_drop_broadcast_packet - check if an ARP request has to be dropped
+ * (because the node has already obtained the reply via DAT) or not
+ * @bat_priv: the bat priv with all the soft interface information
+ * @forw_packet: the broadcast packet
+ *
+ * Returns true if the node can drop the packet, false otherwise.
+ */
+bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet)
+{
+ uint16_t type;
+ __be32 ip_dst;
+ struct batadv_dat_entry *dat_entry = NULL;
+ bool ret = false;
+ int hdr_size = sizeof(struct batadv_bcast_packet);
+ unsigned short vid;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ goto out;
+
+ /* If this packet is an ARP_REQUEST and the node already has the
+ * information that it is going to ask, then the packet can be dropped
+ */
+ if (forw_packet->num_packets)
+ goto out;
+
+ vid = batadv_dat_get_vid(forw_packet->skb, &hdr_size);
+
+ type = batadv_arp_get_type(bat_priv, forw_packet->skb, hdr_size);
+ if (type != ARPOP_REQUEST)
+ goto out;
+
+ ip_dst = batadv_arp_ip_dst(forw_packet->skb, hdr_size);
+ dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst, vid);
+ /* check if the node already got this entry */
+ if (!dat_entry) {
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP Request for %pI4: fallback\n", &ip_dst);
+ goto out;
+ }
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "ARP Request for %pI4: fallback prevented\n", &ip_dst);
+ ret = true;
+
+out:
+ if (dat_entry)
+ batadv_dat_entry_free_ref(dat_entry);
+ return ret;
+}
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
new file mode 100644
index 000000000..2fe0764c6
--- /dev/null
+++ b/net/batman-adv/distributed-arp-table.h
@@ -0,0 +1,171 @@
+/* Copyright (C) 2011-2014 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
+#define _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+
+#include "types.h"
+#include "originator.h"
+
+#include <linux/if_arp.h>
+
+/* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */
+#define BATADV_DAT_ADDR_MAX ((batadv_dat_addr_t)~(batadv_dat_addr_t)0)
+
+void batadv_dat_status_update(struct net_device *net_dev);
+bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
+void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
+bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet);
+
+/**
+ * batadv_dat_init_orig_node_addr - assign a DAT address to the orig_node
+ * @orig_node: the node to assign the DAT address to
+ */
+static inline void
+batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
+{
+ uint32_t addr;
+
+ addr = batadv_choose_orig(orig_node->orig, BATADV_DAT_ADDR_MAX);
+ orig_node->dat_addr = (batadv_dat_addr_t)addr;
+}
+
+/**
+ * batadv_dat_init_own_addr - assign a DAT address to the node itself
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: a pointer to the primary interface
+ */
+static inline void
+batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *primary_if)
+{
+ uint32_t addr;
+
+ addr = batadv_choose_orig(primary_if->net_dev->dev_addr,
+ BATADV_DAT_ADDR_MAX);
+
+ bat_priv->dat.addr = (batadv_dat_addr_t)addr;
+}
+
+int batadv_dat_init(struct batadv_priv *bat_priv);
+void batadv_dat_free(struct batadv_priv *bat_priv);
+int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
+
+/**
+ * batadv_dat_inc_counter - increment the correct DAT packet counter
+ * @bat_priv: the bat priv with all the soft interface information
+ * @subtype: the 4addr subtype of the packet to be counted
+ *
+ * Updates the ethtool statistics for the received packet if it is a DAT subtype
+ */
+static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
+ uint8_t subtype)
+{
+ switch (subtype) {
+ case BATADV_P_DAT_DHT_GET:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_GET_RX);
+ break;
+ case BATADV_P_DAT_DHT_PUT:
+ batadv_inc_counter(bat_priv,
+ BATADV_CNT_DAT_PUT_RX);
+ break;
+ }
+}
+
+#else
+
+static inline void batadv_dat_status_update(struct net_device *net_dev)
+{
+}
+
+static inline bool
+batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ return false;
+}
+
+static inline bool
+batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet)
+{
+ return false;
+}
+
+static inline void
+batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
+{
+}
+
+static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *iface)
+{
+}
+
+static inline void batadv_arp_change_timeout(struct net_device *soft_iface,
+ const char *name)
+{
+}
+
+static inline int batadv_dat_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_dat_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
+ uint8_t subtype)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+#endif /* _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_ */
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
new file mode 100644
index 000000000..3d1dcaa3e
--- /dev/null
+++ b/net/batman-adv/fragmentation.c
@@ -0,0 +1,497 @@
+/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll <martin@hundeboll.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "fragmentation.h"
+#include "send.h"
+#include "originator.h"
+#include "routing.h"
+#include "hard-interface.h"
+#include "soft-interface.h"
+
+/**
+ * batadv_frag_clear_chain - delete entries in the fragment buffer chain
+ * @head: head of chain with entries.
+ *
+ * Free fragments in the passed hlist. Should be called with appropriate lock.
+ */
+static void batadv_frag_clear_chain(struct hlist_head *head)
+{
+ struct batadv_frag_list_entry *entry;
+ struct hlist_node *node;
+
+ hlist_for_each_entry_safe(entry, node, head, list) {
+ hlist_del(&entry->list);
+ kfree_skb(entry->skb);
+ kfree(entry);
+ }
+}
+
+/**
+ * batadv_frag_purge_orig - free fragments associated to an orig
+ * @orig_node: originator to free fragments from
+ * @check_cb: optional function to tell if an entry should be purged
+ */
+void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
+ bool (*check_cb)(struct batadv_frag_table_entry *))
+{
+ struct batadv_frag_table_entry *chain;
+ uint8_t i;
+
+ for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
+ chain = &orig_node->fragments[i];
+ spin_lock_bh(&orig_node->fragments[i].lock);
+
+ if (!check_cb || check_cb(chain)) {
+ batadv_frag_clear_chain(&orig_node->fragments[i].head);
+ orig_node->fragments[i].size = 0;
+ }
+
+ spin_unlock_bh(&orig_node->fragments[i].lock);
+ }
+}
+
+/**
+ * batadv_frag_size_limit - maximum possible size of packet to be fragmented
+ *
+ * Returns the maximum size of payload that can be fragmented.
+ */
+static int batadv_frag_size_limit(void)
+{
+ int limit = BATADV_FRAG_MAX_FRAG_SIZE;
+
+ limit -= sizeof(struct batadv_frag_packet);
+ limit *= BATADV_FRAG_MAX_FRAGMENTS;
+
+ return limit;
+}
+
+/**
+ * batadv_frag_init_chain - check and prepare fragment chain for new fragment
+ * @chain: chain in fragments table to init
+ * @seqno: sequence number of the received fragment
+ *
+ * Make chain ready for a fragment with sequence number "seqno". Delete existing
+ * entries if they have an "old" sequence number.
+ *
+ * Caller must hold chain->lock.
+ *
+ * Returns true if chain is empty and caller can just insert the new fragment
+ * without searching for the right position.
+ */
+static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
+ uint16_t seqno)
+{
+ if (chain->seqno == seqno)
+ return false;
+
+ if (!hlist_empty(&chain->head))
+ batadv_frag_clear_chain(&chain->head);
+
+ chain->size = 0;
+ chain->seqno = seqno;
+
+ return true;
+}
+
+/**
+ * batadv_frag_insert_packet - insert a fragment into a fragment chain
+ * @orig_node: originator that the fragment was received from
+ * @skb: skb to insert
+ * @chain_out: list head to attach complete chains of fragments to
+ *
+ * Insert a new fragment into the reverse ordered chain in the right table
+ * entry. The hash table entry is cleared if "old" fragments exist in it.
+ *
+ * Returns true if skb is buffered, false on error. If the chain has all the
+ * fragments needed to merge the packet, the chain is moved to the passed head
+ * to avoid locking the chain in the table.
+ */
+static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
+ struct sk_buff *skb,
+ struct hlist_head *chain_out)
+{
+ struct batadv_frag_table_entry *chain;
+ struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr;
+ struct batadv_frag_list_entry *frag_entry_last = NULL;
+ struct batadv_frag_packet *frag_packet;
+ uint8_t bucket;
+ uint16_t seqno, hdr_size = sizeof(struct batadv_frag_packet);
+ bool ret = false;
+
+ /* Linearize packet to avoid linearizing 16 packets in a row when doing
+ * the later merge. Non-linear merge should be added to remove this
+ * linearization.
+ */
+ if (skb_linearize(skb) < 0)
+ goto err;
+
+ frag_packet = (struct batadv_frag_packet *)skb->data;
+ seqno = ntohs(frag_packet->seqno);
+ bucket = seqno % BATADV_FRAG_BUFFER_COUNT;
+
+ frag_entry_new = kmalloc(sizeof(*frag_entry_new), GFP_ATOMIC);
+ if (!frag_entry_new)
+ goto err;
+
+ frag_entry_new->skb = skb;
+ frag_entry_new->no = frag_packet->no;
+
+ /* Select entry in the "chain table" and delete any prior fragments
+ * with another sequence number. batadv_frag_init_chain() returns true,
+ * if the list is empty at return.
+ */
+ chain = &orig_node->fragments[bucket];
+ spin_lock_bh(&chain->lock);
+ if (batadv_frag_init_chain(chain, seqno)) {
+ hlist_add_head(&frag_entry_new->list, &chain->head);
+ chain->size = skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ ret = true;
+ goto out;
+ }
+
+ /* Find the position for the new fragment. */
+ hlist_for_each_entry(frag_entry_curr, &chain->head, list) {
+ /* Drop packet if fragment already exists. */
+ if (frag_entry_curr->no == frag_entry_new->no)
+ goto err_unlock;
+
+ /* Order fragments from highest to lowest. */
+ if (frag_entry_curr->no < frag_entry_new->no) {
+ hlist_add_before(&frag_entry_new->list,
+ &frag_entry_curr->list);
+ chain->size += skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ ret = true;
+ goto out;
+ }
+
+ /* store current entry because it could be the last in list */
+ frag_entry_last = frag_entry_curr;
+ }
+
+ /* Reached the end of the list, so insert after 'frag_entry_last'. */
+ if (likely(frag_entry_last)) {
+ hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list);
+ chain->size += skb->len - hdr_size;
+ chain->timestamp = jiffies;
+ ret = true;
+ }
+
+out:
+ if (chain->size > batadv_frag_size_limit() ||
+ ntohs(frag_packet->total_size) > batadv_frag_size_limit()) {
+ /* Clear chain if total size of either the list or the packet
+ * exceeds the maximum size of one merged packet.
+ */
+ batadv_frag_clear_chain(&chain->head);
+ chain->size = 0;
+ } else if (ntohs(frag_packet->total_size) == chain->size) {
+ /* All fragments received. Hand over chain to caller. */
+ hlist_move_list(&chain->head, chain_out);
+ chain->size = 0;
+ }
+
+err_unlock:
+ spin_unlock_bh(&chain->lock);
+
+err:
+ if (!ret)
+ kfree(frag_entry_new);
+
+ return ret;
+}
+
+/**
+ * batadv_frag_merge_packets - merge a chain of fragments
+ * @chain: head of chain with fragments
+ * @skb: packet with total size of skb after merging
+ *
+ * Expand the first skb in the chain and copy the content of the remaining
+ * skb's into the expanded one. After doing so, clear the chain.
+ *
+ * Returns the merged skb or NULL on error.
+ */
+static struct sk_buff *
+batadv_frag_merge_packets(struct hlist_head *chain, struct sk_buff *skb)
+{
+ struct batadv_frag_packet *packet;
+ struct batadv_frag_list_entry *entry;
+ struct sk_buff *skb_out = NULL;
+ int size, hdr_size = sizeof(struct batadv_frag_packet);
+
+ /* Make sure incoming skb has non-bogus data. */
+ packet = (struct batadv_frag_packet *)skb->data;
+ size = ntohs(packet->total_size);
+ if (size > batadv_frag_size_limit())
+ goto free;
+
+ /* Remove first entry, as this is the destination for the rest of the
+ * fragments.
+ */
+ entry = hlist_entry(chain->first, struct batadv_frag_list_entry, list);
+ hlist_del(&entry->list);
+ skb_out = entry->skb;
+ kfree(entry);
+
+ /* Make room for the rest of the fragments. */
+ if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
+ kfree_skb(skb_out);
+ skb_out = NULL;
+ goto free;
+ }
+
+ /* Move the existing MAC header to just before the payload. (Override
+ * the fragment header.)
+ */
+ skb_pull_rcsum(skb_out, hdr_size);
+ memmove(skb_out->data - ETH_HLEN, skb_mac_header(skb_out), ETH_HLEN);
+ skb_set_mac_header(skb_out, -ETH_HLEN);
+ skb_reset_network_header(skb_out);
+ skb_reset_transport_header(skb_out);
+
+ /* Copy the payload of the each fragment into the last skb */
+ hlist_for_each_entry(entry, chain, list) {
+ size = entry->skb->len - hdr_size;
+ memcpy(skb_put(skb_out, size), entry->skb->data + hdr_size,
+ size);
+ }
+
+free:
+ /* Locking is not needed, because 'chain' is not part of any orig. */
+ batadv_frag_clear_chain(chain);
+ return skb_out;
+}
+
+/**
+ * batadv_frag_skb_buffer - buffer fragment for later merge
+ * @skb: skb to buffer
+ * @orig_node_src: originator that the skb is received from
+ *
+ * Add fragment to buffer and merge fragments if possible.
+ *
+ * There are three possible outcomes: 1) Packet is merged: Return true and
+ * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
+ * to NULL; 3) Error: Return false and leave skb as is.
+ */
+bool batadv_frag_skb_buffer(struct sk_buff **skb,
+ struct batadv_orig_node *orig_node_src)
+{
+ struct sk_buff *skb_out = NULL;
+ struct hlist_head head = HLIST_HEAD_INIT;
+ bool ret = false;
+
+ /* Add packet to buffer and table entry if merge is possible. */
+ if (!batadv_frag_insert_packet(orig_node_src, *skb, &head))
+ goto out_err;
+
+ /* Leave if more fragments are needed to merge. */
+ if (hlist_empty(&head))
+ goto out;
+
+ skb_out = batadv_frag_merge_packets(&head, *skb);
+ if (!skb_out)
+ goto out_err;
+
+out:
+ *skb = skb_out;
+ ret = true;
+out_err:
+ return ret;
+}
+
+/**
+ * batadv_frag_skb_fwd - forward fragments that would exceed MTU when merged
+ * @skb: skb to forward
+ * @recv_if: interface that the skb is received on
+ * @orig_node_src: originator that the skb is received from
+ *
+ * Look up the next-hop of the fragments payload and check if the merged packet
+ * will exceed the MTU towards the next-hop. If so, the fragment is forwarded
+ * without merging it.
+ *
+ * Returns true if the fragment is consumed/forwarded, false otherwise.
+ */
+bool batadv_frag_skb_fwd(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_orig_node *orig_node_src)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_orig_node *orig_node_dst = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_frag_packet *packet;
+ uint16_t total_size;
+ bool ret = false;
+
+ packet = (struct batadv_frag_packet *)skb->data;
+ orig_node_dst = batadv_orig_hash_find(bat_priv, packet->dest);
+ if (!orig_node_dst)
+ goto out;
+
+ neigh_node = batadv_find_router(bat_priv, orig_node_dst, recv_if);
+ if (!neigh_node)
+ goto out;
+
+ /* Forward the fragment, if the merged packet would be too big to
+ * be assembled.
+ */
+ total_size = ntohs(packet->total_size);
+ if (total_size > neigh_node->if_incoming->net_dev->mtu) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_FWD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_FWD_BYTES,
+ skb->len + ETH_HLEN);
+
+ packet->ttl--;
+ batadv_send_skb_packet(skb, neigh_node->if_incoming,
+ neigh_node->addr);
+ ret = true;
+ }
+
+out:
+ if (orig_node_dst)
+ batadv_orig_node_free_ref(orig_node_dst);
+ if (neigh_node)
+ batadv_neigh_node_free_ref(neigh_node);
+ return ret;
+}
+
+/**
+ * batadv_frag_create - create a fragment from skb
+ * @skb: skb to create fragment from
+ * @frag_head: header to use in new fragment
+ * @mtu: size of new fragment
+ *
+ * Split the passed skb into two fragments: A new one with size matching the
+ * passed mtu and the old one with the rest. The new skb contains data from the
+ * tail of the old skb.
+ *
+ * Returns the new fragment, NULL on error.
+ */
+static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
+ struct batadv_frag_packet *frag_head,
+ unsigned int mtu)
+{
+ struct sk_buff *skb_fragment;
+ unsigned header_size = sizeof(*frag_head);
+ unsigned fragment_size = mtu - header_size;
+
+ skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
+ if (!skb_fragment)
+ goto err;
+
+ skb->priority = TC_PRIO_CONTROL;
+
+ /* Eat the last mtu-bytes of the skb */
+ skb_reserve(skb_fragment, header_size + ETH_HLEN);
+ skb_split(skb, skb_fragment, skb->len - fragment_size);
+
+ /* Add the header */
+ skb_push(skb_fragment, header_size);
+ memcpy(skb_fragment->data, frag_head, header_size);
+
+err:
+ return skb_fragment;
+}
+
+/**
+ * batadv_frag_send_packet - create up to 16 fragments from the passed skb
+ * @skb: skb to create fragments from
+ * @orig_node: final destination of the created fragments
+ * @neigh_node: next-hop of the created fragments
+ *
+ * Returns true on success, false otherwise.
+ */
+bool batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_frag_packet frag_header;
+ struct sk_buff *skb_fragment;
+ unsigned mtu = neigh_node->if_incoming->net_dev->mtu;
+ unsigned header_size = sizeof(frag_header);
+ unsigned max_fragment_size, max_packet_size;
+ bool ret = false;
+
+ /* To avoid merge and refragmentation at next-hops we never send
+ * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE
+ */
+ mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE);
+ max_fragment_size = mtu - header_size;
+ max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS;
+
+ /* Don't even try to fragment, if we need more than 16 fragments */
+ if (skb->len > max_packet_size)
+ goto out_err;
+
+ bat_priv = orig_node->bat_priv;
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out_err;
+
+ /* Create one header to be copied to all fragments */
+ frag_header.packet_type = BATADV_UNICAST_FRAG;
+ frag_header.version = BATADV_COMPAT_VERSION;
+ frag_header.ttl = BATADV_TTL;
+ frag_header.seqno = htons(atomic_inc_return(&bat_priv->frag_seqno));
+ frag_header.reserved = 0;
+ frag_header.no = 0;
+ frag_header.total_size = htons(skb->len);
+ ether_addr_copy(frag_header.orig, primary_if->net_dev->dev_addr);
+ ether_addr_copy(frag_header.dest, orig_node->orig);
+
+ /* Eat and send fragments from the tail of skb */
+ while (skb->len > max_fragment_size) {
+ skb_fragment = batadv_frag_create(skb, &frag_header, mtu);
+ if (!skb_fragment)
+ goto out_err;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
+ skb_fragment->len + ETH_HLEN);
+ batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming,
+ neigh_node->addr);
+ frag_header.no++;
+
+ /* The initial check in this function should cover this case */
+ if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1)
+ goto out_err;
+ }
+
+ /* Make room for the fragment header. */
+ if (batadv_skb_head_push(skb, header_size) < 0 ||
+ pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0)
+ goto out_err;
+
+ memcpy(skb->data, &frag_header, header_size);
+
+ /* Send the last fragment */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
+ skb->len + ETH_HLEN);
+ batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+
+ ret = true;
+
+out_err:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+
+ return ret;
+}
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
new file mode 100644
index 000000000..d848cf667
--- /dev/null
+++ b/net/batman-adv/fragmentation.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2013-2014 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll <martin@hundeboll.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_
+#define _NET_BATMAN_ADV_FRAGMENTATION_H_
+
+void batadv_frag_purge_orig(struct batadv_orig_node *orig,
+ bool (*check_cb)(struct batadv_frag_table_entry *));
+bool batadv_frag_skb_fwd(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_orig_node *orig_node_src);
+bool batadv_frag_skb_buffer(struct sk_buff **skb,
+ struct batadv_orig_node *orig_node);
+bool batadv_frag_send_packet(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node);
+
+/**
+ * batadv_frag_check_entry - check if a list of fragments has timed out
+ * @frags_entry: table entry to check
+ *
+ * Returns true if the frags entry has timed out, false otherwise.
+ */
+static inline bool
+batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
+{
+ if (!hlist_empty(&frags_entry->head) &&
+ batadv_has_timed_out(frags_entry->timestamp, BATADV_FRAG_TIMEOUT))
+ return true;
+ return false;
+}
+
+#endif /* _NET_BATMAN_ADV_FRAGMENTATION_H_ */
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
new file mode 100644
index 000000000..090828cf1
--- /dev/null
+++ b/net/batman-adv/gateway_client.c
@@ -0,0 +1,881 @@
+/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "sysfs.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "originator.h"
+#include "translation-table.h"
+#include "routing.h"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/if_vlan.h>
+
+/* These are the offsets of the "hw type" and "hw address length" in the dhcp
+ * packet starting at the beginning of the dhcp header
+ */
+#define BATADV_DHCP_HTYPE_OFFSET 1
+#define BATADV_DHCP_HLEN_OFFSET 2
+/* Value of htype representing Ethernet */
+#define BATADV_DHCP_HTYPE_ETHERNET 0x01
+/* This is the offset of the "chaddr" field in the dhcp packet starting at the
+ * beginning of the dhcp header
+ */
+#define BATADV_DHCP_CHADDR_OFFSET 28
+
+static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node)
+{
+ if (atomic_dec_and_test(&gw_node->refcount)) {
+ batadv_orig_node_free_ref(gw_node->orig_node);
+ kfree_rcu(gw_node, rcu);
+ }
+}
+
+static struct batadv_gw_node *
+batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node;
+
+ rcu_read_lock();
+ gw_node = rcu_dereference(bat_priv->gw.curr_gw);
+ if (!gw_node)
+ goto out;
+
+ if (!atomic_inc_not_zero(&gw_node->refcount))
+ gw_node = NULL;
+
+out:
+ rcu_read_unlock();
+ return gw_node;
+}
+
+struct batadv_orig_node *
+batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node;
+ struct batadv_orig_node *orig_node = NULL;
+
+ gw_node = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!gw_node)
+ goto out;
+
+ rcu_read_lock();
+ orig_node = gw_node->orig_node;
+ if (!orig_node)
+ goto unlock;
+
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ orig_node = NULL;
+
+unlock:
+ rcu_read_unlock();
+out:
+ if (gw_node)
+ batadv_gw_node_free_ref(gw_node);
+ return orig_node;
+}
+
+static void batadv_gw_select(struct batadv_priv *bat_priv,
+ struct batadv_gw_node *new_gw_node)
+{
+ struct batadv_gw_node *curr_gw_node;
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+
+ if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount))
+ new_gw_node = NULL;
+
+ curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1);
+ rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node);
+
+ if (curr_gw_node)
+ batadv_gw_node_free_ref(curr_gw_node);
+
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+}
+
+/**
+ * batadv_gw_reselect - force a gateway reselection
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Set a flag to remind the GW component to perform a new gateway reselection.
+ * However this function does not ensure that the current gateway is going to be
+ * deselected. The reselection mechanism may elect the same gateway once again.
+ *
+ * This means that invoking batadv_gw_reselect() does not guarantee a gateway
+ * change and therefore a uevent is not necessarily expected.
+ */
+void batadv_gw_reselect(struct batadv_priv *bat_priv)
+{
+ atomic_set(&bat_priv->gw.reselect, 1);
+}
+
+static struct batadv_gw_node *
+batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
+{
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo;
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ uint32_t max_gw_factor = 0, tmp_gw_factor = 0;
+ uint32_t gw_divisor;
+ uint8_t max_tq = 0;
+ uint8_t tq_avg;
+ struct batadv_orig_node *orig_node;
+
+ gw_divisor = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE;
+ gw_divisor *= 64;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ if (gw_node->deleted)
+ continue;
+
+ orig_node = gw_node->orig_node;
+ router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ continue;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router,
+ BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto next;
+
+ if (!atomic_inc_not_zero(&gw_node->refcount))
+ goto next;
+
+ tq_avg = router_ifinfo->bat_iv.tq_avg;
+
+ switch (atomic_read(&bat_priv->gw_sel_class)) {
+ case 1: /* fast connection */
+ tmp_gw_factor = tq_avg * tq_avg;
+ tmp_gw_factor *= gw_node->bandwidth_down;
+ tmp_gw_factor *= 100 * 100;
+ tmp_gw_factor /= gw_divisor;
+
+ if ((tmp_gw_factor > max_gw_factor) ||
+ ((tmp_gw_factor == max_gw_factor) &&
+ (tq_avg > max_tq))) {
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+ curr_gw = gw_node;
+ atomic_inc(&curr_gw->refcount);
+ }
+ break;
+
+ default: /* 2: stable connection (use best statistic)
+ * 3: fast-switch (use best statistic but change as
+ * soon as a better gateway appears)
+ * XX: late-switch (use best statistic but change as
+ * soon as a better gateway appears which has
+ * $routing_class more tq points)
+ */
+ if (tq_avg > max_tq) {
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+ curr_gw = gw_node;
+ atomic_inc(&curr_gw->refcount);
+ }
+ break;
+ }
+
+ if (tq_avg > max_tq)
+ max_tq = tq_avg;
+
+ if (tmp_gw_factor > max_gw_factor)
+ max_gw_factor = tmp_gw_factor;
+
+ batadv_gw_node_free_ref(gw_node);
+
+next:
+ batadv_neigh_node_free_ref(router);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ }
+ rcu_read_unlock();
+
+ return curr_gw;
+}
+
+/**
+ * batadv_gw_check_client_stop - check if client mode has been switched off
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * This function assumes the caller has checked that the gw state *is actually
+ * changing*. This function is not supposed to be called when there is no state
+ * change.
+ */
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *curr_gw;
+
+ if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
+ return;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!curr_gw)
+ return;
+
+ /* deselect the current gateway so that next time that client mode is
+ * enabled a proper GW_ADD event can be sent
+ */
+ batadv_gw_select(bat_priv, NULL);
+
+ /* if batman-adv is switching the gw client mode off and a gateway was
+ * already selected, send a DEL uevent
+ */
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL);
+
+ batadv_gw_node_free_ref(curr_gw);
+}
+
+void batadv_gw_election(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL;
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ char gw_addr[18] = { '\0' };
+
+ if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw)
+ goto out;
+
+ next_gw = batadv_gw_get_best_gw_node(bat_priv);
+
+ if (curr_gw == next_gw)
+ goto out;
+
+ if (next_gw) {
+ sprintf(gw_addr, "%pM", next_gw->orig_node->orig);
+
+ router = batadv_orig_router_get(next_gw->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router) {
+ batadv_gw_reselect(bat_priv);
+ goto out;
+ }
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router,
+ BATADV_IF_DEFAULT);
+ if (!router_ifinfo) {
+ batadv_gw_reselect(bat_priv);
+ goto out;
+ }
+ }
+
+ if ((curr_gw) && (!next_gw)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Removing selected gateway - no gateway in range\n");
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL,
+ NULL);
+ } else if ((!curr_gw) && (next_gw)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
+ next_gw->orig_node->orig,
+ next_gw->bandwidth_down / 10,
+ next_gw->bandwidth_down % 10,
+ next_gw->bandwidth_up / 10,
+ next_gw->bandwidth_up % 10,
+ router_ifinfo->bat_iv.tq_avg);
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD,
+ gw_addr);
+ } else {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Changing route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
+ next_gw->orig_node->orig,
+ next_gw->bandwidth_down / 10,
+ next_gw->bandwidth_down % 10,
+ next_gw->bandwidth_up / 10,
+ next_gw->bandwidth_up % 10,
+ router_ifinfo->bat_iv.tq_avg);
+ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE,
+ gw_addr);
+ }
+
+ batadv_gw_select(bat_priv, next_gw);
+
+out:
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+ if (next_gw)
+ batadv_gw_node_free_ref(next_gw);
+ if (router)
+ batadv_neigh_node_free_ref(router);
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_ifinfo);
+}
+
+void batadv_gw_check_election(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_ifinfo *router_orig_tq = NULL;
+ struct batadv_neigh_ifinfo *router_gw_tq = NULL;
+ struct batadv_orig_node *curr_gw_orig;
+ struct batadv_neigh_node *router_gw = NULL, *router_orig = NULL;
+ uint8_t gw_tq_avg, orig_tq_avg;
+
+ curr_gw_orig = batadv_gw_get_selected_orig(bat_priv);
+ if (!curr_gw_orig)
+ goto reselect;
+
+ router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT);
+ if (!router_gw)
+ goto reselect;
+
+ router_gw_tq = batadv_neigh_ifinfo_get(router_gw,
+ BATADV_IF_DEFAULT);
+ if (!router_gw_tq)
+ goto reselect;
+
+ /* this node already is the gateway */
+ if (curr_gw_orig == orig_node)
+ goto out;
+
+ router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
+ if (!router_orig)
+ goto out;
+
+ router_orig_tq = batadv_neigh_ifinfo_get(router_orig,
+ BATADV_IF_DEFAULT);
+ if (!router_orig_tq)
+ goto out;
+
+ gw_tq_avg = router_gw_tq->bat_iv.tq_avg;
+ orig_tq_avg = router_orig_tq->bat_iv.tq_avg;
+
+ /* the TQ value has to be better */
+ if (orig_tq_avg < gw_tq_avg)
+ goto out;
+
+ /* if the routing class is greater than 3 the value tells us how much
+ * greater the TQ value of the new gateway must be
+ */
+ if ((atomic_read(&bat_priv->gw_sel_class) > 3) &&
+ (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw_sel_class)))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n",
+ gw_tq_avg, orig_tq_avg);
+
+reselect:
+ batadv_gw_reselect(bat_priv);
+out:
+ if (curr_gw_orig)
+ batadv_orig_node_free_ref(curr_gw_orig);
+ if (router_gw)
+ batadv_neigh_node_free_ref(router_gw);
+ if (router_orig)
+ batadv_neigh_node_free_ref(router_orig);
+ if (router_gw_tq)
+ batadv_neigh_ifinfo_free_ref(router_gw_tq);
+ if (router_orig_tq)
+ batadv_neigh_ifinfo_free_ref(router_orig_tq);
+}
+
+/**
+ * batadv_gw_node_add - add gateway node to list of available gateways
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: originator announcing gateway capabilities
+ * @gateway: announced bandwidth information
+ */
+static void batadv_gw_node_add(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway)
+{
+ struct batadv_gw_node *gw_node;
+
+ if (gateway->bandwidth_down == 0)
+ return;
+
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ return;
+
+ gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
+ if (!gw_node) {
+ batadv_orig_node_free_ref(orig_node);
+ return;
+ }
+
+ INIT_HLIST_NODE(&gw_node->list);
+ gw_node->orig_node = orig_node;
+ atomic_set(&gw_node->refcount, 1);
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n",
+ orig_node->orig,
+ ntohl(gateway->bandwidth_down) / 10,
+ ntohl(gateway->bandwidth_down) % 10,
+ ntohl(gateway->bandwidth_up) / 10,
+ ntohl(gateway->bandwidth_up) % 10);
+}
+
+/**
+ * batadv_gw_node_get - retrieve gateway node from list of available gateways
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: originator announcing gateway capabilities
+ *
+ * Returns gateway node if found or NULL otherwise.
+ */
+static struct batadv_gw_node *
+batadv_gw_node_get(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_gw_node *gw_node_tmp, *gw_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.list, list) {
+ if (gw_node_tmp->orig_node != orig_node)
+ continue;
+
+ if (gw_node_tmp->deleted)
+ continue;
+
+ if (!atomic_inc_not_zero(&gw_node_tmp->refcount))
+ continue;
+
+ gw_node = gw_node_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return gw_node;
+}
+
+/**
+ * batadv_gw_node_update - update list of available gateways with changed
+ * bandwidth information
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: originator announcing gateway capabilities
+ * @gateway: announced bandwidth information
+ */
+void batadv_gw_node_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway)
+{
+ struct batadv_gw_node *gw_node, *curr_gw = NULL;
+
+ gw_node = batadv_gw_node_get(bat_priv, orig_node);
+ if (!gw_node) {
+ batadv_gw_node_add(bat_priv, orig_node, gateway);
+ goto out;
+ }
+
+ if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) &&
+ (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Gateway bandwidth of originator %pM changed from %u.%u/%u.%u MBit to %u.%u/%u.%u MBit\n",
+ orig_node->orig,
+ gw_node->bandwidth_down / 10,
+ gw_node->bandwidth_down % 10,
+ gw_node->bandwidth_up / 10,
+ gw_node->bandwidth_up % 10,
+ ntohl(gateway->bandwidth_down) / 10,
+ ntohl(gateway->bandwidth_down) % 10,
+ ntohl(gateway->bandwidth_up) / 10,
+ ntohl(gateway->bandwidth_up) % 10);
+
+ gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
+ gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
+
+ gw_node->deleted = 0;
+ if (ntohl(gateway->bandwidth_down) == 0) {
+ gw_node->deleted = jiffies;
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Gateway %pM removed from gateway list\n",
+ orig_node->orig);
+
+ /* Note: We don't need a NULL check here, since curr_gw never
+ * gets dereferenced.
+ */
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (gw_node == curr_gw)
+ batadv_gw_reselect(bat_priv);
+ }
+
+out:
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+ if (gw_node)
+ batadv_gw_node_free_ref(gw_node);
+}
+
+void batadv_gw_node_delete(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_tvlv_gateway_data gateway;
+
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+
+ batadv_gw_node_update(bat_priv, orig_node, &gateway);
+}
+
+void batadv_gw_node_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_gw_node *gw_node, *curr_gw;
+ struct hlist_node *node_tmp;
+ unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT);
+ int do_reselect = 0;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ spin_lock_bh(&bat_priv->gw.list_lock);
+
+ hlist_for_each_entry_safe(gw_node, node_tmp,
+ &bat_priv->gw.list, list) {
+ if (((!gw_node->deleted) ||
+ (time_before(jiffies, gw_node->deleted + timeout))) &&
+ atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE)
+ continue;
+
+ if (curr_gw == gw_node)
+ do_reselect = 1;
+
+ hlist_del_rcu(&gw_node->list);
+ batadv_gw_node_free_ref(gw_node);
+ }
+
+ spin_unlock_bh(&bat_priv->gw.list_lock);
+
+ /* gw_reselect() needs to acquire the gw_list_lock */
+ if (do_reselect)
+ batadv_gw_reselect(bat_priv);
+
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+}
+
+/* fails if orig_node has no router */
+static int batadv_write_buffer_text(struct batadv_priv *bat_priv,
+ struct seq_file *seq,
+ const struct batadv_gw_node *gw_node)
+{
+ struct batadv_gw_node *curr_gw;
+ struct batadv_neigh_node *router;
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL;
+ int ret = -1;
+
+ router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
+ if (!router)
+ goto out;
+
+ router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
+ if (!router_ifinfo)
+ goto out;
+
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+
+ seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
+ (curr_gw == gw_node ? "=>" : " "),
+ gw_node->orig_node->orig,
+ router_ifinfo->bat_iv.tq_avg, router->addr,
+ router->if_incoming->net_dev->name,
+ gw_node->bandwidth_down / 10,
+ gw_node->bandwidth_down % 10,
+ gw_node->bandwidth_up / 10,
+ gw_node->bandwidth_up % 10);
+ ret = seq_has_overflowed(seq) ? -1 : 0;
+
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ if (router)
+ batadv_neigh_node_free_ref(router);
+ return ret;
+}
+
+int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+ struct batadv_gw_node *gw_node;
+ int gw_count = 0;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ seq_printf(seq,
+ " %-12s (%s/%i) %17s [%10s]: advertised uplink bandwidth ... [B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s)]\n",
+ "Gateway", "#", BATADV_TQ_MAX_VALUE, "Nexthop", "outgoingIF",
+ BATADV_SOURCE_VERSION, primary_if->net_dev->name,
+ primary_if->net_dev->dev_addr, net_dev->name);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
+ if (gw_node->deleted)
+ continue;
+
+ /* fails if orig_node has no router */
+ if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0)
+ continue;
+
+ gw_count++;
+ }
+ rcu_read_unlock();
+
+ if (gw_count == 0)
+ seq_puts(seq, "No gateways in range ...\n");
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+/**
+ * batadv_gw_dhcp_recipient_get - check if a packet is a DHCP message
+ * @skb: the packet to check
+ * @header_len: a pointer to the batman-adv header size
+ * @chaddr: buffer where the client address will be stored. Valid
+ * only if the function returns BATADV_DHCP_TO_CLIENT
+ *
+ * Returns:
+ * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error
+ * while parsing it
+ * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server
+ * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client
+ *
+ * This function may re-allocate the data buffer of the skb passed as argument.
+ */
+enum batadv_dhcp_recipient
+batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
+ uint8_t *chaddr)
+{
+ enum batadv_dhcp_recipient ret = BATADV_DHCP_NO;
+ struct ethhdr *ethhdr;
+ struct iphdr *iphdr;
+ struct ipv6hdr *ipv6hdr;
+ struct udphdr *udphdr;
+ struct vlan_ethhdr *vhdr;
+ int chaddr_offset;
+ __be16 proto;
+ uint8_t *p;
+
+ /* check for ethernet header */
+ if (!pskb_may_pull(skb, *header_len + ETH_HLEN))
+ return BATADV_DHCP_NO;
+
+ ethhdr = eth_hdr(skb);
+ proto = ethhdr->h_proto;
+ *header_len += ETH_HLEN;
+
+ /* check for initial vlan header */
+ if (proto == htons(ETH_P_8021Q)) {
+ if (!pskb_may_pull(skb, *header_len + VLAN_HLEN))
+ return BATADV_DHCP_NO;
+
+ vhdr = vlan_eth_hdr(skb);
+ proto = vhdr->h_vlan_encapsulated_proto;
+ *header_len += VLAN_HLEN;
+ }
+
+ /* check for ip header */
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr)))
+ return BATADV_DHCP_NO;
+
+ iphdr = (struct iphdr *)(skb->data + *header_len);
+ *header_len += iphdr->ihl * 4;
+
+ /* check for udp header */
+ if (iphdr->protocol != IPPROTO_UDP)
+ return BATADV_DHCP_NO;
+
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr)))
+ return BATADV_DHCP_NO;
+
+ ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len);
+ *header_len += sizeof(*ipv6hdr);
+
+ /* check for udp header */
+ if (ipv6hdr->nexthdr != IPPROTO_UDP)
+ return BATADV_DHCP_NO;
+
+ break;
+ default:
+ return BATADV_DHCP_NO;
+ }
+
+ if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr)))
+ return BATADV_DHCP_NO;
+
+ /* skb->data might have been reallocated by pskb_may_pull() */
+ ethhdr = eth_hdr(skb);
+ if (ntohs(ethhdr->h_proto) == ETH_P_8021Q)
+ ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN);
+
+ udphdr = (struct udphdr *)(skb->data + *header_len);
+ *header_len += sizeof(*udphdr);
+
+ /* check for bootp port */
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (udphdr->dest == htons(67))
+ ret = BATADV_DHCP_TO_SERVER;
+ else if (udphdr->source == htons(67))
+ ret = BATADV_DHCP_TO_CLIENT;
+ break;
+ case htons(ETH_P_IPV6):
+ if (udphdr->dest == htons(547))
+ ret = BATADV_DHCP_TO_SERVER;
+ else if (udphdr->source == htons(547))
+ ret = BATADV_DHCP_TO_CLIENT;
+ break;
+ }
+
+ chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET;
+ /* store the client address if the message is going to a client */
+ if (ret == BATADV_DHCP_TO_CLIENT &&
+ pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) {
+ /* check if the DHCP packet carries an Ethernet DHCP */
+ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET;
+ if (*p != BATADV_DHCP_HTYPE_ETHERNET)
+ return BATADV_DHCP_NO;
+
+ /* check if the DHCP packet carries a valid Ethernet address */
+ p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET;
+ if (*p != ETH_ALEN)
+ return BATADV_DHCP_NO;
+
+ ether_addr_copy(chaddr, skb->data + chaddr_offset);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_gw_out_of_range - check if the dhcp request destination is the best gw
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the outgoing packet
+ *
+ * Check if the skb is a DHCP request and if it is sent to the current best GW
+ * server. Due to topology changes it may be the case that the GW server
+ * previously selected is not the best one anymore.
+ *
+ * Returns true if the packet destination is unicast and it is not the best gw,
+ * false otherwise.
+ *
+ * This call might reallocate skb data.
+ * Must be invoked only when the DHCP packet is going TO a DHCP SERVER.
+ */
+bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL;
+ struct batadv_orig_node *orig_dst_node = NULL;
+ struct batadv_gw_node *gw_node = NULL, *curr_gw = NULL;
+ struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
+ struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+ bool out_of_range = false;
+ uint8_t curr_tq_avg;
+ unsigned short vid;
+
+ vid = batadv_get_vid(skb, 0);
+
+ orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source,
+ ethhdr->h_dest, vid);
+ if (!orig_dst_node)
+ goto out;
+
+ gw_node = batadv_gw_node_get(bat_priv, orig_dst_node);
+ if (!gw_node)
+ goto out;
+
+ switch (atomic_read(&bat_priv->gw_mode)) {
+ case BATADV_GW_MODE_SERVER:
+ /* If we are a GW then we are our best GW. We can artificially
+ * set the tq towards ourself as the maximum value
+ */
+ curr_tq_avg = BATADV_TQ_MAX_VALUE;
+ break;
+ case BATADV_GW_MODE_CLIENT:
+ curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ if (!curr_gw)
+ goto out;
+
+ /* packet is going to our gateway */
+ if (curr_gw->orig_node == orig_dst_node)
+ goto out;
+
+ /* If the dhcp packet has been sent to a different gw,
+ * we have to evaluate whether the old gw is still
+ * reliable enough
+ */
+ neigh_curr = batadv_find_router(bat_priv, curr_gw->orig_node,
+ NULL);
+ if (!neigh_curr)
+ goto out;
+
+ curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr,
+ BATADV_IF_DEFAULT);
+ if (!curr_ifinfo)
+ goto out;
+
+ curr_tq_avg = curr_ifinfo->bat_iv.tq_avg;
+ batadv_neigh_ifinfo_free_ref(curr_ifinfo);
+
+ break;
+ case BATADV_GW_MODE_OFF:
+ default:
+ goto out;
+ }
+
+ neigh_old = batadv_find_router(bat_priv, orig_dst_node, NULL);
+ if (!neigh_old)
+ goto out;
+
+ old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT);
+ if (!old_ifinfo)
+ goto out;
+
+ if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD)
+ out_of_range = true;
+ batadv_neigh_ifinfo_free_ref(old_ifinfo);
+
+out:
+ if (orig_dst_node)
+ batadv_orig_node_free_ref(orig_dst_node);
+ if (curr_gw)
+ batadv_gw_node_free_ref(curr_gw);
+ if (gw_node)
+ batadv_gw_node_free_ref(gw_node);
+ if (neigh_old)
+ batadv_neigh_node_free_ref(neigh_old);
+ if (neigh_curr)
+ batadv_neigh_node_free_ref(neigh_curr);
+ return out_of_range;
+}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
new file mode 100644
index 000000000..7ee53bb7d
--- /dev/null
+++ b/net/batman-adv/gateway_client.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+#define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+
+void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
+void batadv_gw_reselect(struct batadv_priv *bat_priv);
+void batadv_gw_election(struct batadv_priv *bat_priv);
+struct batadv_orig_node *
+batadv_gw_get_selected_orig(struct batadv_priv *bat_priv);
+void batadv_gw_check_election(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
+void batadv_gw_node_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_gateway_data *gateway);
+void batadv_gw_node_delete(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node);
+void batadv_gw_node_purge(struct batadv_priv *bat_priv);
+int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
+bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
+enum batadv_dhcp_recipient
+batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
+ uint8_t *chaddr);
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
new file mode 100644
index 000000000..88a1bc380
--- /dev/null
+++ b/net/batman-adv/gateway_common.c
@@ -0,0 +1,241 @@
+/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+
+/**
+ * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download
+ * and upload bandwidth information
+ * @net_dev: the soft interface net device
+ * @buff: string buffer to parse
+ * @down: pointer holding the returned download bandwidth information
+ * @up: pointer holding the returned upload bandwidth information
+ *
+ * Returns false on parse error and true otherwise.
+ */
+static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
+ uint32_t *down, uint32_t *up)
+{
+ enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT;
+ char *slash_ptr, *tmp_ptr;
+ long ldown, lup;
+ int ret;
+
+ slash_ptr = strchr(buff, '/');
+ if (slash_ptr)
+ *slash_ptr = 0;
+
+ if (strlen(buff) > 4) {
+ tmp_ptr = buff + strlen(buff) - 4;
+
+ if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
+ bw_unit_type = BATADV_BW_UNIT_MBIT;
+
+ if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) ||
+ (bw_unit_type == BATADV_BW_UNIT_MBIT))
+ *tmp_ptr = '\0';
+ }
+
+ ret = kstrtol(buff, 10, &ldown);
+ if (ret) {
+ batadv_err(net_dev,
+ "Download speed of gateway mode invalid: %s\n",
+ buff);
+ return false;
+ }
+
+ switch (bw_unit_type) {
+ case BATADV_BW_UNIT_MBIT:
+ *down = ldown * 10;
+ break;
+ case BATADV_BW_UNIT_KBIT:
+ default:
+ *down = ldown / 100;
+ break;
+ }
+
+ /* we also got some upload info */
+ if (slash_ptr) {
+ bw_unit_type = BATADV_BW_UNIT_KBIT;
+
+ if (strlen(slash_ptr + 1) > 4) {
+ tmp_ptr = slash_ptr + 1 - 4 + strlen(slash_ptr + 1);
+
+ if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
+ bw_unit_type = BATADV_BW_UNIT_MBIT;
+
+ if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) ||
+ (bw_unit_type == BATADV_BW_UNIT_MBIT))
+ *tmp_ptr = '\0';
+ }
+
+ ret = kstrtol(slash_ptr + 1, 10, &lup);
+ if (ret) {
+ batadv_err(net_dev,
+ "Upload speed of gateway mode invalid: %s\n",
+ slash_ptr + 1);
+ return false;
+ }
+
+ switch (bw_unit_type) {
+ case BATADV_BW_UNIT_MBIT:
+ *up = lup * 10;
+ break;
+ case BATADV_BW_UNIT_KBIT:
+ default:
+ *up = lup / 100;
+ break;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * batadv_gw_tvlv_container_update - update the gw tvlv container after gateway
+ * setting change
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_gateway_data gw;
+ uint32_t down, up;
+ char gw_mode;
+
+ gw_mode = atomic_read(&bat_priv->gw_mode);
+
+ switch (gw_mode) {
+ case BATADV_GW_MODE_OFF:
+ case BATADV_GW_MODE_CLIENT:
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1);
+ break;
+ case BATADV_GW_MODE_SERVER:
+ down = atomic_read(&bat_priv->gw.bandwidth_down);
+ up = atomic_read(&bat_priv->gw.bandwidth_up);
+ gw.bandwidth_down = htonl(down);
+ gw.bandwidth_up = htonl(up);
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_GW, 1,
+ &gw, sizeof(gw));
+ break;
+ }
+}
+
+ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
+ size_t count)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ uint32_t down_curr, up_curr, down_new = 0, up_new = 0;
+ bool ret;
+
+ down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down);
+ up_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_up);
+
+ ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new);
+ if (!ret)
+ goto end;
+
+ if (!down_new)
+ down_new = 1;
+
+ if (!up_new)
+ up_new = down_new / 5;
+
+ if (!up_new)
+ up_new = 1;
+
+ if ((down_curr == down_new) && (up_curr == up_new))
+ return count;
+
+ batadv_gw_reselect(bat_priv);
+ batadv_info(net_dev,
+ "Changing gateway bandwidth from: '%u.%u/%u.%u MBit' to: '%u.%u/%u.%u MBit'\n",
+ down_curr / 10, down_curr % 10, up_curr / 10, up_curr % 10,
+ down_new / 10, down_new % 10, up_new / 10, up_new % 10);
+
+ atomic_set(&bat_priv->gw.bandwidth_down, down_new);
+ atomic_set(&bat_priv->gw.bandwidth_up, up_new);
+ batadv_gw_tvlv_container_update(bat_priv);
+
+end:
+ return count;
+}
+
+/**
+ * batadv_gw_tvlv_ogm_handler_v1 - process incoming gateway tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_gateway_data gateway, *gateway_ptr;
+
+ /* only fetch the tvlv value if the handler wasn't called via the
+ * CIFNOTFND flag and if there is data to fetch
+ */
+ if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) ||
+ (tvlv_value_len < sizeof(gateway))) {
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+ } else {
+ gateway_ptr = tvlv_value;
+ gateway.bandwidth_down = gateway_ptr->bandwidth_down;
+ gateway.bandwidth_up = gateway_ptr->bandwidth_up;
+ if ((gateway.bandwidth_down == 0) ||
+ (gateway.bandwidth_up == 0)) {
+ gateway.bandwidth_down = 0;
+ gateway.bandwidth_up = 0;
+ }
+ }
+
+ batadv_gw_node_update(bat_priv, orig, &gateway);
+
+ /* restart gateway selection if fast or late switching was enabled */
+ if ((gateway.bandwidth_down != 0) &&
+ (atomic_read(&bat_priv->gw_mode) == BATADV_GW_MODE_CLIENT) &&
+ (atomic_read(&bat_priv->gw_sel_class) > 2))
+ batadv_gw_check_election(bat_priv, orig);
+}
+
+/**
+ * batadv_gw_init - initialise the gateway handling internals
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_gw_init(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
+ NULL, BATADV_TVLV_GW, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+}
+
+/**
+ * batadv_gw_free - free the gateway handling internals
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_gw_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_GW, 1);
+}
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
new file mode 100644
index 000000000..aa5116561
--- /dev/null
+++ b/net/batman-adv/gateway_common.h
@@ -0,0 +1,47 @@
+/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+#define _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+
+enum batadv_gw_modes {
+ BATADV_GW_MODE_OFF,
+ BATADV_GW_MODE_CLIENT,
+ BATADV_GW_MODE_SERVER,
+};
+
+/**
+ * enum batadv_bandwidth_units - bandwidth unit types
+ * @BATADV_BW_UNIT_KBIT: unit type kbit
+ * @BATADV_BW_UNIT_MBIT: unit type mbit
+ */
+enum batadv_bandwidth_units {
+ BATADV_BW_UNIT_KBIT,
+ BATADV_BW_UNIT_MBIT,
+};
+
+#define BATADV_GW_MODE_OFF_NAME "off"
+#define BATADV_GW_MODE_CLIENT_NAME "client"
+#define BATADV_GW_MODE_SERVER_NAME "server"
+
+ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
+ size_t count);
+void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv);
+void batadv_gw_init(struct batadv_priv *bat_priv);
+void batadv_gw_free(struct batadv_priv *bat_priv);
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
new file mode 100644
index 000000000..baf1f9843
--- /dev/null
+++ b/net/batman-adv/hard-interface.c
@@ -0,0 +1,708 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "distributed-arp-table.h"
+#include "hard-interface.h"
+#include "soft-interface.h"
+#include "send.h"
+#include "translation-table.h"
+#include "routing.h"
+#include "sysfs.h"
+#include "debugfs.h"
+#include "originator.h"
+#include "hash.h"
+#include "bridge_loop_avoidance.h"
+#include "gateway_client.h"
+
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+
+void batadv_hardif_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ hard_iface = container_of(rcu, struct batadv_hard_iface, rcu);
+ dev_put(hard_iface->net_dev);
+ kfree(hard_iface);
+}
+
+struct batadv_hard_iface *
+batadv_hardif_get_by_netdev(const struct net_device *net_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->net_dev == net_dev &&
+ atomic_inc_not_zero(&hard_iface->refcount))
+ goto out;
+ }
+
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+/**
+ * batadv_is_on_batman_iface - check if a device is a batman iface descendant
+ * @net_dev: the device to check
+ *
+ * If the user creates any virtual device on top of a batman-adv interface, it
+ * is important to prevent this new interface to be used to create a new mesh
+ * network (this behaviour would lead to a batman-over-batman configuration).
+ * This function recursively checks all the fathers of the device passed as
+ * argument looking for a batman-adv soft interface.
+ *
+ * Returns true if the device is descendant of a batman-adv mesh interface (or
+ * if it is a batman-adv interface itself), false otherwise
+ */
+static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
+{
+ struct net_device *parent_dev;
+ bool ret;
+
+ /* check if this is a batman-adv mesh interface */
+ if (batadv_softif_is_valid(net_dev))
+ return true;
+
+ /* no more parents..stop recursion */
+ if (dev_get_iflink(net_dev) == 0 ||
+ dev_get_iflink(net_dev) == net_dev->ifindex)
+ return false;
+
+ /* recurse over the parent device */
+ parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev));
+ /* if we got a NULL parent_dev there is something broken.. */
+ if (WARN(!parent_dev, "Cannot find parent device"))
+ return false;
+
+ ret = batadv_is_on_batman_iface(parent_dev);
+
+ return ret;
+}
+
+static int batadv_is_valid_iface(const struct net_device *net_dev)
+{
+ if (net_dev->flags & IFF_LOOPBACK)
+ return 0;
+
+ if (net_dev->type != ARPHRD_ETHER)
+ return 0;
+
+ if (net_dev->addr_len != ETH_ALEN)
+ return 0;
+
+ /* no batman over batman */
+ if (batadv_is_on_batman_iface(net_dev))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * batadv_is_wifi_netdev - check if the given net_device struct is a wifi
+ * interface
+ * @net_device: the device to check
+ *
+ * Returns true if the net device is a 802.11 wireless device, false otherwise.
+ */
+bool batadv_is_wifi_netdev(struct net_device *net_device)
+{
+ if (!net_device)
+ return false;
+
+#ifdef CONFIG_WIRELESS_EXT
+ /* pre-cfg80211 drivers have to implement WEXT, so it is possible to
+ * check for wireless_handlers != NULL
+ */
+ if (net_device->wireless_handlers)
+ return true;
+#endif
+
+ /* cfg80211 drivers have to set ieee80211_ptr */
+ if (net_device->ieee80211_ptr)
+ return true;
+
+ return false;
+}
+
+static struct batadv_hard_iface *
+batadv_hardif_get_active(const struct net_device *soft_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ if (hard_iface->if_status == BATADV_IF_ACTIVE &&
+ atomic_inc_not_zero(&hard_iface->refcount))
+ goto out;
+ }
+
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *oldif)
+{
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ batadv_dat_init_own_addr(bat_priv, primary_if);
+ batadv_bla_update_orig_address(bat_priv, primary_if, oldif);
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+static void batadv_primary_if_select(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *new_hard_iface)
+{
+ struct batadv_hard_iface *curr_hard_iface;
+
+ ASSERT_RTNL();
+
+ if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount))
+ new_hard_iface = NULL;
+
+ curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1);
+ rcu_assign_pointer(bat_priv->primary_if, new_hard_iface);
+
+ if (!new_hard_iface)
+ goto out;
+
+ bat_priv->bat_algo_ops->bat_primary_iface_set(new_hard_iface);
+ batadv_primary_if_update_addr(bat_priv, curr_hard_iface);
+
+out:
+ if (curr_hard_iface)
+ batadv_hardif_free_ref(curr_hard_iface);
+}
+
+static bool
+batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->net_dev->flags & IFF_UP)
+ return true;
+
+ return false;
+}
+
+static void batadv_check_known_mac_addr(const struct net_device *net_dev)
+{
+ const struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
+ (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+ continue;
+
+ if (hard_iface->net_dev == net_dev)
+ continue;
+
+ if (!batadv_compare_eth(hard_iface->net_dev->dev_addr,
+ net_dev->dev_addr))
+ continue;
+
+ pr_warn("The newly added mac address (%pM) already exists on: %s\n",
+ net_dev->dev_addr, hard_iface->net_dev->name);
+ pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n");
+ }
+ rcu_read_unlock();
+}
+
+int batadv_hardif_min_mtu(struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ const struct batadv_hard_iface *hard_iface;
+ int min_mtu = INT_MAX;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
+ (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+ continue;
+
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu);
+ }
+ rcu_read_unlock();
+
+ if (atomic_read(&bat_priv->fragmentation) == 0)
+ goto out;
+
+ /* with fragmentation enabled the maximum size of internally generated
+ * packets such as translation table exchanges or tvlv containers, etc
+ * has to be calculated
+ */
+ min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE);
+ min_mtu -= sizeof(struct batadv_frag_packet);
+ min_mtu *= BATADV_FRAG_MAX_FRAGMENTS;
+
+out:
+ /* report to the other components the maximum amount of bytes that
+ * batman-adv can send over the wire (without considering the payload
+ * overhead). For example, this value is used by TT to compute the
+ * maximum local table table size
+ */
+ atomic_set(&bat_priv->packet_size_max, min_mtu);
+
+ /* the real soft-interface MTU is computed by removing the payload
+ * overhead from the maximum amount of bytes that was just computed.
+ *
+ * However batman-adv does not support MTUs bigger than ETH_DATA_LEN
+ */
+ return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN);
+}
+
+/* adjusts the MTU if a new interface with a smaller MTU appeared. */
+void batadv_update_min_mtu(struct net_device *soft_iface)
+{
+ soft_iface->mtu = batadv_hardif_min_mtu(soft_iface);
+
+ /* Check if the local translate table should be cleaned up to match a
+ * new (and smaller) MTU.
+ */
+ batadv_tt_local_resize_to_mtu(soft_iface);
+}
+
+static void
+batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+
+ if (hard_iface->if_status != BATADV_IF_INACTIVE)
+ goto out;
+
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface);
+ hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED;
+
+ /* the first active interface becomes our primary interface or
+ * the next active interface after the old primary interface was removed
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ batadv_primary_if_select(bat_priv, hard_iface);
+
+ batadv_info(hard_iface->soft_iface, "Interface activated: %s\n",
+ hard_iface->net_dev->name);
+
+ batadv_update_min_mtu(hard_iface->soft_iface);
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+static void
+batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
+{
+ if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
+ (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+ return;
+
+ hard_iface->if_status = BATADV_IF_INACTIVE;
+
+ batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n",
+ hard_iface->net_dev->name);
+
+ batadv_update_min_mtu(hard_iface->soft_iface);
+}
+
+/**
+ * batadv_master_del_slave - remove hard_iface from the current master interface
+ * @slave: the interface enslaved in another master
+ * @master: the master from which slave has to be removed
+ *
+ * Invoke ndo_del_slave on master passing slave as argument. In this way slave
+ * is free'd and master can correctly change its internal state.
+ * Return 0 on success, a negative value representing the error otherwise
+ */
+static int batadv_master_del_slave(struct batadv_hard_iface *slave,
+ struct net_device *master)
+{
+ int ret;
+
+ if (!master)
+ return 0;
+
+ ret = -EBUSY;
+ if (master->netdev_ops->ndo_del_slave)
+ ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev);
+
+ return ret;
+}
+
+int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
+ const char *iface_name)
+{
+ struct batadv_priv *bat_priv;
+ struct net_device *soft_iface, *master;
+ __be16 ethertype = htons(ETH_P_BATMAN);
+ int max_header_len = batadv_max_header_len();
+ int ret;
+
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ goto out;
+
+ if (!atomic_inc_not_zero(&hard_iface->refcount))
+ goto out;
+
+ soft_iface = dev_get_by_name(&init_net, iface_name);
+
+ if (!soft_iface) {
+ soft_iface = batadv_softif_create(iface_name);
+
+ if (!soft_iface) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /* dev_get_by_name() increases the reference counter for us */
+ dev_hold(soft_iface);
+ }
+
+ if (!batadv_softif_is_valid(soft_iface)) {
+ pr_err("Can't create batman mesh interface %s: already exists as regular interface\n",
+ soft_iface->name);
+ ret = -EINVAL;
+ goto err_dev;
+ }
+
+ /* check if the interface is enslaved in another virtual one and
+ * in that case unlink it first
+ */
+ master = netdev_master_upper_dev_get(hard_iface->net_dev);
+ ret = batadv_master_del_slave(hard_iface, master);
+ if (ret)
+ goto err_dev;
+
+ hard_iface->soft_iface = soft_iface;
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface);
+ if (ret)
+ goto err_dev;
+
+ ret = bat_priv->bat_algo_ops->bat_iface_enable(hard_iface);
+ if (ret < 0)
+ goto err_upper;
+
+ hard_iface->if_num = bat_priv->num_ifaces;
+ bat_priv->num_ifaces++;
+ hard_iface->if_status = BATADV_IF_INACTIVE;
+ ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
+ if (ret < 0) {
+ bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
+ bat_priv->num_ifaces--;
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
+ goto err_upper;
+ }
+
+ hard_iface->batman_adv_ptype.type = ethertype;
+ hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv;
+ hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
+ dev_add_pack(&hard_iface->batman_adv_ptype);
+
+ batadv_info(hard_iface->soft_iface, "Adding interface: %s\n",
+ hard_iface->net_dev->name);
+
+ if (atomic_read(&bat_priv->fragmentation) &&
+ hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len)
+ batadv_info(hard_iface->soft_iface,
+ "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n",
+ hard_iface->net_dev->name, hard_iface->net_dev->mtu,
+ ETH_DATA_LEN + max_header_len);
+
+ if (!atomic_read(&bat_priv->fragmentation) &&
+ hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len)
+ batadv_info(hard_iface->soft_iface,
+ "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n",
+ hard_iface->net_dev->name, hard_iface->net_dev->mtu,
+ ETH_DATA_LEN + max_header_len);
+
+ if (batadv_hardif_is_iface_up(hard_iface))
+ batadv_hardif_activate_interface(hard_iface);
+ else
+ batadv_err(hard_iface->soft_iface,
+ "Not using interface %s (retrying later): interface not active\n",
+ hard_iface->net_dev->name);
+
+ /* begin scheduling originator messages on that interface */
+ batadv_schedule_bat_ogm(hard_iface);
+
+out:
+ return 0;
+
+err_upper:
+ netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface);
+err_dev:
+ hard_iface->soft_iface = NULL;
+ dev_put(soft_iface);
+err:
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
+}
+
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
+ enum batadv_hard_if_cleanup autodel)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hard_iface *primary_if = NULL;
+
+ if (hard_iface->if_status == BATADV_IF_ACTIVE)
+ batadv_hardif_deactivate_interface(hard_iface);
+
+ if (hard_iface->if_status != BATADV_IF_INACTIVE)
+ goto out;
+
+ batadv_info(hard_iface->soft_iface, "Removing interface: %s\n",
+ hard_iface->net_dev->name);
+ dev_remove_pack(&hard_iface->batman_adv_ptype);
+
+ bat_priv->num_ifaces--;
+ batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (hard_iface == primary_if) {
+ struct batadv_hard_iface *new_if;
+
+ new_if = batadv_hardif_get_active(hard_iface->soft_iface);
+ batadv_primary_if_select(bat_priv, new_if);
+
+ if (new_if)
+ batadv_hardif_free_ref(new_if);
+ }
+
+ bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
+
+ /* delete all references to this hard_iface */
+ batadv_purge_orig_ref(bat_priv);
+ batadv_purge_outstanding_packets(bat_priv, hard_iface);
+ dev_put(hard_iface->soft_iface);
+
+ /* nobody uses this interface anymore */
+ if (!bat_priv->num_ifaces) {
+ batadv_gw_check_client_stop(bat_priv);
+
+ if (autodel == BATADV_IF_CLEANUP_AUTO)
+ batadv_softif_destroy_sysfs(hard_iface->soft_iface);
+ }
+
+ netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
+ hard_iface->soft_iface = NULL;
+ batadv_hardif_free_ref(hard_iface);
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+/**
+ * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif
+ * @work: work queue item
+ *
+ * Free the parts of the hard interface which can not be removed under
+ * rtnl lock (to prevent deadlock situations).
+ */
+static void batadv_hardif_remove_interface_finish(struct work_struct *work)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ hard_iface = container_of(work, struct batadv_hard_iface,
+ cleanup_work);
+
+ batadv_debugfs_del_hardif(hard_iface);
+ batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
+ batadv_hardif_free_ref(hard_iface);
+}
+
+static struct batadv_hard_iface *
+batadv_hardif_add_interface(struct net_device *net_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret;
+
+ ASSERT_RTNL();
+
+ ret = batadv_is_valid_iface(net_dev);
+ if (ret != 1)
+ goto out;
+
+ dev_hold(net_dev);
+
+ hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC);
+ if (!hard_iface)
+ goto release_dev;
+
+ ret = batadv_sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
+ if (ret)
+ goto free_if;
+
+ hard_iface->if_num = -1;
+ hard_iface->net_dev = net_dev;
+ hard_iface->soft_iface = NULL;
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
+
+ ret = batadv_debugfs_add_hardif(hard_iface);
+ if (ret)
+ goto free_sysfs;
+
+ INIT_LIST_HEAD(&hard_iface->list);
+ INIT_WORK(&hard_iface->cleanup_work,
+ batadv_hardif_remove_interface_finish);
+
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
+ if (batadv_is_wifi_netdev(net_dev))
+ hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+
+ /* extra reference for return */
+ atomic_set(&hard_iface->refcount, 2);
+
+ batadv_check_known_mac_addr(hard_iface->net_dev);
+ list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
+
+ return hard_iface;
+
+free_sysfs:
+ batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
+free_if:
+ kfree(hard_iface);
+release_dev:
+ dev_put(net_dev);
+out:
+ return NULL;
+}
+
+static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
+{
+ ASSERT_RTNL();
+
+ /* first deactivate interface */
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
+
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ return;
+
+ hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
+ queue_work(batadv_event_workqueue, &hard_iface->cleanup_work);
+}
+
+void batadv_hardif_remove_interfaces(void)
+{
+ struct batadv_hard_iface *hard_iface, *hard_iface_tmp;
+
+ rtnl_lock();
+ list_for_each_entry_safe(hard_iface, hard_iface_tmp,
+ &batadv_hardif_list, list) {
+ list_del_rcu(&hard_iface->list);
+ batadv_hardif_remove_interface(hard_iface);
+ }
+ rtnl_unlock();
+}
+
+static int batadv_hard_if_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *net_dev = netdev_notifier_info_to_dev(ptr);
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_priv *bat_priv;
+
+ if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) {
+ batadv_sysfs_add_meshif(net_dev);
+ bat_priv = netdev_priv(net_dev);
+ batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
+ return NOTIFY_DONE;
+ }
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface && event == NETDEV_REGISTER)
+ hard_iface = batadv_hardif_add_interface(net_dev);
+
+ if (!hard_iface)
+ goto out;
+
+ switch (event) {
+ case NETDEV_UP:
+ batadv_hardif_activate_interface(hard_iface);
+ break;
+ case NETDEV_GOING_DOWN:
+ case NETDEV_DOWN:
+ batadv_hardif_deactivate_interface(hard_iface);
+ break;
+ case NETDEV_UNREGISTER:
+ list_del_rcu(&hard_iface->list);
+
+ batadv_hardif_remove_interface(hard_iface);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (hard_iface->soft_iface)
+ batadv_update_min_mtu(hard_iface->soft_iface);
+ break;
+ case NETDEV_CHANGEADDR:
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ goto hardif_put;
+
+ batadv_check_known_mac_addr(hard_iface->net_dev);
+
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+ bat_priv->bat_algo_ops->bat_iface_update_mac(hard_iface);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto hardif_put;
+
+ if (hard_iface == primary_if)
+ batadv_primary_if_update_addr(bat_priv, NULL);
+ break;
+ default:
+ break;
+ }
+
+hardif_put:
+ batadv_hardif_free_ref(hard_iface);
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return NOTIFY_DONE;
+}
+
+struct notifier_block batadv_hard_if_notifier = {
+ .notifier_call = batadv_hard_if_event,
+};
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
new file mode 100644
index 000000000..1918cd50b
--- /dev/null
+++ b/net/batman-adv/hard-interface.h
@@ -0,0 +1,97 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
+#define _NET_BATMAN_ADV_HARD_INTERFACE_H_
+
+enum batadv_hard_if_state {
+ BATADV_IF_NOT_IN_USE,
+ BATADV_IF_TO_BE_REMOVED,
+ BATADV_IF_INACTIVE,
+ BATADV_IF_ACTIVE,
+ BATADV_IF_TO_BE_ACTIVATED,
+ BATADV_IF_I_WANT_YOU,
+};
+
+/**
+ * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
+ * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
+ * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was removed
+ */
+enum batadv_hard_if_cleanup {
+ BATADV_IF_CLEANUP_KEEP,
+ BATADV_IF_CLEANUP_AUTO,
+};
+
+extern struct notifier_block batadv_hard_if_notifier;
+
+bool batadv_is_wifi_netdev(struct net_device *net_device);
+bool batadv_is_wifi_iface(int ifindex);
+struct batadv_hard_iface*
+batadv_hardif_get_by_netdev(const struct net_device *net_dev);
+int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
+ const char *iface_name);
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
+ enum batadv_hard_if_cleanup autodel);
+void batadv_hardif_remove_interfaces(void);
+int batadv_hardif_min_mtu(struct net_device *soft_iface);
+void batadv_update_min_mtu(struct net_device *soft_iface);
+void batadv_hardif_free_rcu(struct rcu_head *rcu);
+
+/**
+ * batadv_hardif_free_ref - decrement the hard interface refcounter and
+ * possibly free it
+ * @hard_iface: the hard interface to free
+ */
+static inline void
+batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
+{
+ if (atomic_dec_and_test(&hard_iface->refcount))
+ call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu);
+}
+
+/**
+ * batadv_hardif_free_ref_now - decrement the hard interface refcounter and
+ * possibly free it (without rcu callback)
+ * @hard_iface: the hard interface to free
+ */
+static inline void
+batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface)
+{
+ if (atomic_dec_and_test(&hard_iface->refcount))
+ batadv_hardif_free_rcu(&hard_iface->rcu);
+}
+
+static inline struct batadv_hard_iface *
+batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ hard_iface = rcu_dereference(bat_priv->primary_if);
+ if (!hard_iface)
+ goto out;
+
+ if (!atomic_inc_not_zero(&hard_iface->refcount))
+ hard_iface = NULL;
+
+out:
+ rcu_read_unlock();
+ return hard_iface;
+}
+
+#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
new file mode 100644
index 000000000..7c1c63080
--- /dev/null
+++ b/net/batman-adv/hash.c
@@ -0,0 +1,76 @@
+/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "hash.h"
+
+/* clears the hash */
+static void batadv_hash_init(struct batadv_hashtable *hash)
+{
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++) {
+ INIT_HLIST_HEAD(&hash->table[i]);
+ spin_lock_init(&hash->list_locks[i]);
+ }
+}
+
+/* free only the hashtable and the hash itself. */
+void batadv_hash_destroy(struct batadv_hashtable *hash)
+{
+ kfree(hash->list_locks);
+ kfree(hash->table);
+ kfree(hash);
+}
+
+/* allocates and clears the hash */
+struct batadv_hashtable *batadv_hash_new(uint32_t size)
+{
+ struct batadv_hashtable *hash;
+
+ hash = kmalloc(sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ return NULL;
+
+ hash->table = kmalloc_array(size, sizeof(*hash->table), GFP_ATOMIC);
+ if (!hash->table)
+ goto free_hash;
+
+ hash->list_locks = kmalloc_array(size, sizeof(*hash->list_locks),
+ GFP_ATOMIC);
+ if (!hash->list_locks)
+ goto free_table;
+
+ hash->size = size;
+ batadv_hash_init(hash);
+ return hash;
+
+free_table:
+ kfree(hash->table);
+free_hash:
+ kfree(hash);
+ return NULL;
+}
+
+void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
+ struct lock_class_key *key)
+{
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++)
+ lockdep_set_class(&hash->list_locks[i], key);
+}
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
new file mode 100644
index 000000000..539fc1266
--- /dev/null
+++ b/net/batman-adv/hash.h
@@ -0,0 +1,187 @@
+/* Copyright (C) 2006-2014 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_HASH_H_
+#define _NET_BATMAN_ADV_HASH_H_
+
+#include <linux/list.h>
+
+/* callback to a compare function. should compare 2 element datas for their
+ * keys, return 0 if same and not 0 if not same
+ */
+typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *,
+ const void *);
+
+/* the hashfunction, should return an index
+ * based on the key in the data of the first
+ * argument and the size the second
+ */
+typedef uint32_t (*batadv_hashdata_choose_cb)(const void *, uint32_t);
+typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
+
+struct batadv_hashtable {
+ struct hlist_head *table; /* the hashtable itself with the buckets */
+ spinlock_t *list_locks; /* spinlock for each hash list entry */
+ uint32_t size; /* size of hashtable */
+};
+
+/* allocates and clears the hash */
+struct batadv_hashtable *batadv_hash_new(uint32_t size);
+
+/* set class key for all locks */
+void batadv_hash_set_lock_class(struct batadv_hashtable *hash,
+ struct lock_class_key *key);
+
+/* free only the hashtable and the hash itself. */
+void batadv_hash_destroy(struct batadv_hashtable *hash);
+
+/* remove the hash structure. if hashdata_free_cb != NULL, this function will be
+ * called to remove the elements inside of the hash. if you don't remove the
+ * elements, memory might be leaked.
+ */
+static inline void batadv_hash_delete(struct batadv_hashtable *hash,
+ batadv_hashdata_free_cb free_cb,
+ void *arg)
+{
+ struct hlist_head *head;
+ struct hlist_node *node, *node_tmp;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_safe(node, node_tmp, head) {
+ hlist_del_rcu(node);
+
+ if (free_cb)
+ free_cb(node, arg);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+}
+
+/**
+ * batadv_hash_bytes - hash some bytes and add them to the previous hash
+ * @hash: previous hash value
+ * @data: data to be hashed
+ * @size: number of bytes to be hashed
+ *
+ * Returns the new hash value.
+ */
+static inline uint32_t batadv_hash_bytes(uint32_t hash, const void *data,
+ uint32_t size)
+{
+ const unsigned char *key = data;
+ int i;
+
+ for (i = 0; i < size; i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+ return hash;
+}
+
+/**
+ * batadv_hash_add - adds data to the hashtable
+ * @hash: storage hash table
+ * @compare: callback to determine if 2 hash elements are identical
+ * @choose: callback calculating the hash index
+ * @data: data passed to the aforementioned callbacks as argument
+ * @data_node: to be added element
+ *
+ * Returns 0 on success, 1 if the element already is in the hash
+ * and -1 on error.
+ */
+static inline int batadv_hash_add(struct batadv_hashtable *hash,
+ batadv_hashdata_compare_cb compare,
+ batadv_hashdata_choose_cb choose,
+ const void *data,
+ struct hlist_node *data_node)
+{
+ uint32_t index;
+ int ret = -1;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+
+ if (!hash)
+ goto out;
+
+ index = choose(data, hash->size);
+ head = &hash->table[index];
+ list_lock = &hash->list_locks[index];
+
+ spin_lock_bh(list_lock);
+
+ hlist_for_each(node, head) {
+ if (!compare(node, data))
+ continue;
+
+ ret = 1;
+ goto unlock;
+ }
+
+ /* no duplicate found in list, add new element */
+ hlist_add_head_rcu(data_node, head);
+
+ ret = 0;
+
+unlock:
+ spin_unlock_bh(list_lock);
+out:
+ return ret;
+}
+
+/* removes data from hash, if found. returns pointer do data on success, so you
+ * can remove the used structure yourself, or NULL on error . data could be the
+ * structure you use with just the key filled, we just need the key for
+ * comparing.
+ */
+static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
+ batadv_hashdata_compare_cb compare,
+ batadv_hashdata_choose_cb choose,
+ void *data)
+{
+ uint32_t index;
+ struct hlist_node *node;
+ struct hlist_head *head;
+ void *data_save = NULL;
+
+ index = choose(data, hash->size);
+ head = &hash->table[index];
+
+ spin_lock_bh(&hash->list_locks[index]);
+ hlist_for_each(node, head) {
+ if (!compare(node, data))
+ continue;
+
+ data_save = node;
+ hlist_del_rcu(node);
+ break;
+ }
+ spin_unlock_bh(&hash->list_locks[index]);
+
+ return data_save;
+}
+
+#endif /* _NET_BATMAN_ADV_HASH_H_ */
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
new file mode 100644
index 000000000..161ef8f17
--- /dev/null
+++ b/net/batman-adv/icmp_socket.c
@@ -0,0 +1,385 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include "icmp_socket.h"
+#include "send.h"
+#include "hash.h"
+#include "originator.h"
+#include "hard-interface.h"
+
+static struct batadv_socket_client *batadv_socket_client_hash[256];
+
+static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
+ struct batadv_icmp_header *icmph,
+ size_t icmp_len);
+
+void batadv_socket_init(void)
+{
+ memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash));
+}
+
+static int batadv_socket_open(struct inode *inode, struct file *file)
+{
+ unsigned int i;
+ struct batadv_socket_client *socket_client;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ nonseekable_open(inode, file);
+
+ socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
+ if (!socket_client) {
+ module_put(THIS_MODULE);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(batadv_socket_client_hash); i++) {
+ if (!batadv_socket_client_hash[i]) {
+ batadv_socket_client_hash[i] = socket_client;
+ break;
+ }
+ }
+
+ if (i == ARRAY_SIZE(batadv_socket_client_hash)) {
+ pr_err("Error - can't add another packet client: maximum number of clients reached\n");
+ kfree(socket_client);
+ module_put(THIS_MODULE);
+ return -EXFULL;
+ }
+
+ INIT_LIST_HEAD(&socket_client->queue_list);
+ socket_client->queue_len = 0;
+ socket_client->index = i;
+ socket_client->bat_priv = inode->i_private;
+ spin_lock_init(&socket_client->lock);
+ init_waitqueue_head(&socket_client->queue_wait);
+
+ file->private_data = socket_client;
+
+ return 0;
+}
+
+static int batadv_socket_release(struct inode *inode, struct file *file)
+{
+ struct batadv_socket_client *socket_client = file->private_data;
+ struct batadv_socket_packet *socket_packet;
+ struct list_head *list_pos, *list_pos_tmp;
+
+ spin_lock_bh(&socket_client->lock);
+
+ /* for all packets in the queue ... */
+ list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) {
+ socket_packet = list_entry(list_pos,
+ struct batadv_socket_packet, list);
+
+ list_del(list_pos);
+ kfree(socket_packet);
+ }
+
+ batadv_socket_client_hash[socket_client->index] = NULL;
+ spin_unlock_bh(&socket_client->lock);
+
+ kfree(socket_client);
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static ssize_t batadv_socket_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct batadv_socket_client *socket_client = file->private_data;
+ struct batadv_socket_packet *socket_packet;
+ size_t packet_len;
+ int error;
+
+ if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0))
+ return -EAGAIN;
+
+ if ((!buf) || (count < sizeof(struct batadv_icmp_packet)))
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ error = wait_event_interruptible(socket_client->queue_wait,
+ socket_client->queue_len);
+
+ if (error)
+ return error;
+
+ spin_lock_bh(&socket_client->lock);
+
+ socket_packet = list_first_entry(&socket_client->queue_list,
+ struct batadv_socket_packet, list);
+ list_del(&socket_packet->list);
+ socket_client->queue_len--;
+
+ spin_unlock_bh(&socket_client->lock);
+
+ packet_len = min(count, socket_packet->icmp_len);
+ error = copy_to_user(buf, &socket_packet->icmp_packet, packet_len);
+
+ kfree(socket_packet);
+
+ if (error)
+ return -EFAULT;
+
+ return packet_len;
+}
+
+static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
+ size_t len, loff_t *off)
+{
+ struct batadv_socket_client *socket_client = file->private_data;
+ struct batadv_priv *bat_priv = socket_client->bat_priv;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct sk_buff *skb;
+ struct batadv_icmp_packet_rr *icmp_packet_rr;
+ struct batadv_icmp_header *icmp_header;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ size_t packet_len = sizeof(struct batadv_icmp_packet);
+ uint8_t *addr;
+
+ if (len < sizeof(struct batadv_icmp_header)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Error - can't send packet from char device: invalid packet size\n");
+ return -EINVAL;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (!primary_if) {
+ len = -EFAULT;
+ goto out;
+ }
+
+ if (len >= BATADV_ICMP_MAX_PACKET_SIZE)
+ packet_len = BATADV_ICMP_MAX_PACKET_SIZE;
+ else
+ packet_len = len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, packet_len + ETH_HLEN);
+ if (!skb) {
+ len = -ENOMEM;
+ goto out;
+ }
+
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, ETH_HLEN);
+ icmp_header = (struct batadv_icmp_header *)skb_put(skb, packet_len);
+
+ if (copy_from_user(icmp_header, buff, packet_len)) {
+ len = -EFAULT;
+ goto free_skb;
+ }
+
+ if (icmp_header->packet_type != BATADV_ICMP) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Error - can't send packet from char device: got bogus packet type (expected: BAT_ICMP)\n");
+ len = -EINVAL;
+ goto free_skb;
+ }
+
+ switch (icmp_header->msg_type) {
+ case BATADV_ECHO_REQUEST:
+ if (len < sizeof(struct batadv_icmp_packet)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Error - can't send packet from char device: invalid packet size\n");
+ len = -EINVAL;
+ goto free_skb;
+ }
+
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto dst_unreach;
+
+ orig_node = batadv_orig_hash_find(bat_priv, icmp_header->dst);
+ if (!orig_node)
+ goto dst_unreach;
+
+ neigh_node = batadv_orig_router_get(orig_node,
+ BATADV_IF_DEFAULT);
+ if (!neigh_node)
+ goto dst_unreach;
+
+ if (!neigh_node->if_incoming)
+ goto dst_unreach;
+
+ if (neigh_node->if_incoming->if_status != BATADV_IF_ACTIVE)
+ goto dst_unreach;
+
+ icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmp_header;
+ if (packet_len == sizeof(*icmp_packet_rr)) {
+ addr = neigh_node->if_incoming->net_dev->dev_addr;
+ ether_addr_copy(icmp_packet_rr->rr[0], addr);
+ }
+
+ break;
+ default:
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Error - can't send packet from char device: got unknown message type\n");
+ len = -EINVAL;
+ goto free_skb;
+ }
+
+ icmp_header->uid = socket_client->index;
+
+ if (icmp_header->version != BATADV_COMPAT_VERSION) {
+ icmp_header->msg_type = BATADV_PARAMETER_PROBLEM;
+ icmp_header->version = BATADV_COMPAT_VERSION;
+ batadv_socket_add_packet(socket_client, icmp_header,
+ packet_len);
+ goto free_skb;
+ }
+
+ ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr);
+
+ batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ goto out;
+
+dst_unreach:
+ icmp_header->msg_type = BATADV_DESTINATION_UNREACHABLE;
+ batadv_socket_add_packet(socket_client, icmp_header, packet_len);
+free_skb:
+ kfree_skb(skb);
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (neigh_node)
+ batadv_neigh_node_free_ref(neigh_node);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return len;
+}
+
+static unsigned int batadv_socket_poll(struct file *file, poll_table *wait)
+{
+ struct batadv_socket_client *socket_client = file->private_data;
+
+ poll_wait(file, &socket_client->queue_wait, wait);
+
+ if (socket_client->queue_len > 0)
+ return POLLIN | POLLRDNORM;
+
+ return 0;
+}
+
+static const struct file_operations batadv_fops = {
+ .owner = THIS_MODULE,
+ .open = batadv_socket_open,
+ .release = batadv_socket_release,
+ .read = batadv_socket_read,
+ .write = batadv_socket_write,
+ .poll = batadv_socket_poll,
+ .llseek = no_llseek,
+};
+
+int batadv_socket_setup(struct batadv_priv *bat_priv)
+{
+ struct dentry *d;
+
+ if (!bat_priv->debug_dir)
+ goto err;
+
+ d = debugfs_create_file(BATADV_ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR,
+ bat_priv->debug_dir, bat_priv, &batadv_fops);
+ if (!d)
+ goto err;
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_socket_receive_packet - schedule an icmp packet to be sent to userspace
+ * on an icmp socket.
+ * @socket_client: the socket this packet belongs to
+ * @icmph: pointer to the header of the icmp packet
+ * @icmp_len: total length of the icmp packet
+ */
+static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
+ struct batadv_icmp_header *icmph,
+ size_t icmp_len)
+{
+ struct batadv_socket_packet *socket_packet;
+ size_t len;
+
+ socket_packet = kmalloc(sizeof(*socket_packet), GFP_ATOMIC);
+
+ if (!socket_packet)
+ return;
+
+ len = icmp_len;
+ /* check the maximum length before filling the buffer */
+ if (len > sizeof(socket_packet->icmp_packet))
+ len = sizeof(socket_packet->icmp_packet);
+
+ INIT_LIST_HEAD(&socket_packet->list);
+ memcpy(&socket_packet->icmp_packet, icmph, len);
+ socket_packet->icmp_len = len;
+
+ spin_lock_bh(&socket_client->lock);
+
+ /* while waiting for the lock the socket_client could have been
+ * deleted
+ */
+ if (!batadv_socket_client_hash[icmph->uid]) {
+ spin_unlock_bh(&socket_client->lock);
+ kfree(socket_packet);
+ return;
+ }
+
+ list_add_tail(&socket_packet->list, &socket_client->queue_list);
+ socket_client->queue_len++;
+
+ if (socket_client->queue_len > 100) {
+ socket_packet = list_first_entry(&socket_client->queue_list,
+ struct batadv_socket_packet,
+ list);
+
+ list_del(&socket_packet->list);
+ kfree(socket_packet);
+ socket_client->queue_len--;
+ }
+
+ spin_unlock_bh(&socket_client->lock);
+
+ wake_up(&socket_client->queue_wait);
+}
+
+/**
+ * batadv_socket_receive_packet - schedule an icmp packet to be received
+ * locally and sent to userspace.
+ * @icmph: pointer to the header of the icmp packet
+ * @icmp_len: total length of the icmp packet
+ */
+void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
+ size_t icmp_len)
+{
+ struct batadv_socket_client *hash;
+
+ hash = batadv_socket_client_hash[icmph->uid];
+ if (hash)
+ batadv_socket_add_packet(hash, icmph, icmp_len);
+}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
new file mode 100644
index 000000000..0c33950aa
--- /dev/null
+++ b/net/batman-adv/icmp_socket.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
+#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
+
+#define BATADV_ICMP_SOCKET "socket"
+
+void batadv_socket_init(void);
+int batadv_socket_setup(struct batadv_priv *bat_priv);
+void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
+ size_t icmp_len);
+
+#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
new file mode 100644
index 000000000..12fc77bef
--- /dev/null
+++ b/net/batman-adv/main.c
@@ -0,0 +1,1265 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/crc32c.h>
+#include <linux/highmem.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
+#include "main.h"
+#include "sysfs.h"
+#include "debugfs.h"
+#include "routing.h"
+#include "send.h"
+#include "originator.h"
+#include "soft-interface.h"
+#include "icmp_socket.h"
+#include "translation-table.h"
+#include "hard-interface.h"
+#include "gateway_client.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "multicast.h"
+#include "gateway_common.h"
+#include "hash.h"
+#include "bat_algo.h"
+#include "network-coding.h"
+#include "fragmentation.h"
+
+/* List manipulations on hardif_list have to be rtnl_lock()'ed,
+ * list traversals just rcu-locked
+ */
+struct list_head batadv_hardif_list;
+static int (*batadv_rx_handler[256])(struct sk_buff *,
+ struct batadv_hard_iface *);
+char batadv_routing_algo[20] = "BATMAN_IV";
+static struct hlist_head batadv_algo_list;
+
+unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+struct workqueue_struct *batadv_event_workqueue;
+
+static void batadv_recv_handler_init(void);
+
+static int __init batadv_init(void)
+{
+ INIT_LIST_HEAD(&batadv_hardif_list);
+ INIT_HLIST_HEAD(&batadv_algo_list);
+
+ batadv_recv_handler_init();
+
+ batadv_iv_init();
+ batadv_nc_init();
+
+ batadv_event_workqueue = create_singlethread_workqueue("bat_events");
+
+ if (!batadv_event_workqueue)
+ return -ENOMEM;
+
+ batadv_socket_init();
+ batadv_debugfs_init();
+
+ register_netdevice_notifier(&batadv_hard_if_notifier);
+ rtnl_link_register(&batadv_link_ops);
+
+ pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n",
+ BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION);
+
+ return 0;
+}
+
+static void __exit batadv_exit(void)
+{
+ batadv_debugfs_destroy();
+ rtnl_link_unregister(&batadv_link_ops);
+ unregister_netdevice_notifier(&batadv_hard_if_notifier);
+ batadv_hardif_remove_interfaces();
+
+ flush_workqueue(batadv_event_workqueue);
+ destroy_workqueue(batadv_event_workqueue);
+ batadv_event_workqueue = NULL;
+
+ rcu_barrier();
+}
+
+int batadv_mesh_init(struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ int ret;
+
+ spin_lock_init(&bat_priv->forw_bat_list_lock);
+ spin_lock_init(&bat_priv->forw_bcast_list_lock);
+ spin_lock_init(&bat_priv->tt.changes_list_lock);
+ spin_lock_init(&bat_priv->tt.req_list_lock);
+ spin_lock_init(&bat_priv->tt.roam_list_lock);
+ spin_lock_init(&bat_priv->tt.last_changeset_lock);
+ spin_lock_init(&bat_priv->tt.commit_lock);
+ spin_lock_init(&bat_priv->gw.list_lock);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ spin_lock_init(&bat_priv->mcast.want_lists_lock);
+#endif
+ spin_lock_init(&bat_priv->tvlv.container_list_lock);
+ spin_lock_init(&bat_priv->tvlv.handler_list_lock);
+ spin_lock_init(&bat_priv->softif_vlan_list_lock);
+
+ INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
+ INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
+ INIT_HLIST_HEAD(&bat_priv->gw.list);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list);
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list);
+ INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list);
+#endif
+ INIT_LIST_HEAD(&bat_priv->tt.changes_list);
+ INIT_LIST_HEAD(&bat_priv->tt.req_list);
+ INIT_LIST_HEAD(&bat_priv->tt.roam_list);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ INIT_HLIST_HEAD(&bat_priv->mcast.mla_list);
+#endif
+ INIT_HLIST_HEAD(&bat_priv->tvlv.container_list);
+ INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
+ INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
+
+ ret = batadv_originator_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_tt_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_bla_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_dat_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_nc_mesh_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
+ batadv_gw_init(bat_priv);
+ batadv_mcast_init(bat_priv);
+
+ atomic_set(&bat_priv->gw.reselect, 0);
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE);
+
+ return 0;
+
+err:
+ batadv_mesh_free(soft_iface);
+ return ret;
+}
+
+void batadv_mesh_free(struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+
+ batadv_purge_outstanding_packets(bat_priv, NULL);
+
+ batadv_gw_node_purge(bat_priv);
+ batadv_nc_mesh_free(bat_priv);
+ batadv_dat_free(bat_priv);
+ batadv_bla_free(bat_priv);
+
+ batadv_mcast_free(bat_priv);
+
+ /* Free the TT and the originator tables only after having terminated
+ * all the other depending components which may use these structures for
+ * their purposes.
+ */
+ batadv_tt_free(bat_priv);
+
+ /* Since the originator table clean up routine is accessing the TT
+ * tables as well, it has to be invoked after the TT tables have been
+ * freed and marked as empty. This ensures that no cleanup RCU callbacks
+ * accessing the TT data are scheduled for later execution.
+ */
+ batadv_originator_free(bat_priv);
+
+ batadv_gw_free(bat_priv);
+
+ free_percpu(bat_priv->bat_counters);
+ bat_priv->bat_counters = NULL;
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+}
+
+/**
+ * batadv_is_my_mac - check if the given mac address belongs to any of the real
+ * interfaces in the current mesh
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the address to check
+ */
+int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr)
+{
+ const struct batadv_hard_iface *hard_iface;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * batadv_seq_print_text_primary_if_get - called from debugfs table printing
+ * function that requires the primary interface
+ * @seq: debugfs table seq_file struct
+ *
+ * Returns primary interface if found or NULL otherwise.
+ */
+struct batadv_hard_iface *
+batadv_seq_print_text_primary_if_get(struct seq_file *seq)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+
+ if (!primary_if) {
+ seq_printf(seq,
+ "BATMAN mesh %s disabled - please specify interfaces to enable it\n",
+ net_dev->name);
+ goto out;
+ }
+
+ if (primary_if->if_status == BATADV_IF_ACTIVE)
+ goto out;
+
+ seq_printf(seq,
+ "BATMAN mesh %s disabled - primary interface not active\n",
+ net_dev->name);
+ batadv_hardif_free_ref(primary_if);
+ primary_if = NULL;
+
+out:
+ return primary_if;
+}
+
+/**
+ * batadv_max_header_len - calculate maximum encapsulation overhead for a
+ * payload packet
+ *
+ * Return the maximum encapsulation overhead in bytes.
+ */
+int batadv_max_header_len(void)
+{
+ int header_len = 0;
+
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_unicast_packet));
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_unicast_4addr_packet));
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_bcast_packet));
+
+#ifdef CONFIG_BATMAN_ADV_NC
+ header_len = max_t(int, header_len,
+ sizeof(struct batadv_coded_packet));
+#endif
+
+ return header_len + ETH_HLEN;
+}
+
+/**
+ * batadv_skb_set_priority - sets skb priority according to packet content
+ * @skb: the packet to be sent
+ * @offset: offset to the packet content
+ *
+ * This function sets a value between 256 and 263 (802.1d priority), which
+ * can be interpreted by the cfg80211 or other drivers.
+ */
+void batadv_skb_set_priority(struct sk_buff *skb, int offset)
+{
+ struct iphdr ip_hdr_tmp, *ip_hdr;
+ struct ipv6hdr ip6_hdr_tmp, *ip6_hdr;
+ struct ethhdr ethhdr_tmp, *ethhdr;
+ struct vlan_ethhdr *vhdr, vhdr_tmp;
+ u32 prio;
+
+ /* already set, do nothing */
+ if (skb->priority >= 256 && skb->priority <= 263)
+ return;
+
+ ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), &ethhdr_tmp);
+ if (!ethhdr)
+ return;
+
+ switch (ethhdr->h_proto) {
+ case htons(ETH_P_8021Q):
+ vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr),
+ sizeof(*vhdr), &vhdr_tmp);
+ if (!vhdr)
+ return;
+ prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK;
+ prio = prio >> VLAN_PRIO_SHIFT;
+ break;
+ case htons(ETH_P_IP):
+ ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+ sizeof(*ip_hdr), &ip_hdr_tmp);
+ if (!ip_hdr)
+ return;
+ prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5;
+ break;
+ case htons(ETH_P_IPV6):
+ ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr),
+ sizeof(*ip6_hdr), &ip6_hdr_tmp);
+ if (!ip6_hdr)
+ return;
+ prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5;
+ break;
+ default:
+ return;
+ }
+
+ skb->priority = prio + 256;
+}
+
+static int batadv_recv_unhandled_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ return NET_RX_DROP;
+}
+
+/* incoming packets with the batman ethertype received on any active hard
+ * interface
+ */
+int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev)
+{
+ struct batadv_priv *bat_priv;
+ struct batadv_ogm_packet *batadv_ogm_packet;
+ struct batadv_hard_iface *hard_iface;
+ uint8_t idx;
+ int ret;
+
+ hard_iface = container_of(ptype, struct batadv_hard_iface,
+ batman_adv_ptype);
+ skb = skb_share_check(skb, GFP_ATOMIC);
+
+ /* skb was released by skb_share_check() */
+ if (!skb)
+ goto err_out;
+
+ /* packet should hold at least type and version */
+ if (unlikely(!pskb_may_pull(skb, 2)))
+ goto err_free;
+
+ /* expect a valid ethernet header here. */
+ if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb)))
+ goto err_free;
+
+ if (!hard_iface->soft_iface)
+ goto err_free;
+
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto err_free;
+
+ /* discard frames on not active interfaces */
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto err_free;
+
+ batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data;
+
+ if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: incompatible batman version (%i)\n",
+ batadv_ogm_packet->version);
+ goto err_free;
+ }
+
+ /* reset control block to avoid left overs from previous users */
+ memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
+
+ /* all receive handlers return whether they received or reused
+ * the supplied skb. if not, we have to free the skb.
+ */
+ idx = batadv_ogm_packet->packet_type;
+ ret = (*batadv_rx_handler[idx])(skb, hard_iface);
+
+ if (ret == NET_RX_DROP)
+ kfree_skb(skb);
+
+ /* return NET_RX_SUCCESS in any case as we
+ * most probably dropped the packet for
+ * routing-logical reasons.
+ */
+ return NET_RX_SUCCESS;
+
+err_free:
+ kfree_skb(skb);
+err_out:
+ return NET_RX_DROP;
+}
+
+static void batadv_recv_handler_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(batadv_rx_handler); i++)
+ batadv_rx_handler[i] = batadv_recv_unhandled_packet;
+
+ for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++)
+ batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet;
+
+ /* compile time checks for sizes */
+ BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6);
+ BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18);
+ BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14);
+ BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46);
+ BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12);
+ BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8);
+
+ /* broadcast packet */
+ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet;
+
+ /* unicast packets ... */
+ /* unicast with 4 addresses packet */
+ batadv_rx_handler[BATADV_UNICAST_4ADDR] = batadv_recv_unicast_packet;
+ /* unicast packet */
+ batadv_rx_handler[BATADV_UNICAST] = batadv_recv_unicast_packet;
+ /* unicast tvlv packet */
+ batadv_rx_handler[BATADV_UNICAST_TVLV] = batadv_recv_unicast_tvlv;
+ /* batman icmp packet */
+ batadv_rx_handler[BATADV_ICMP] = batadv_recv_icmp_packet;
+ /* Fragmented packets */
+ batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet;
+}
+
+int
+batadv_recv_handler_register(uint8_t packet_type,
+ int (*recv_handler)(struct sk_buff *,
+ struct batadv_hard_iface *))
+{
+ int (*curr)(struct sk_buff *,
+ struct batadv_hard_iface *);
+ curr = batadv_rx_handler[packet_type];
+
+ if ((curr != batadv_recv_unhandled_packet) &&
+ (curr != batadv_recv_unhandled_unicast_packet))
+ return -EBUSY;
+
+ batadv_rx_handler[packet_type] = recv_handler;
+ return 0;
+}
+
+void batadv_recv_handler_unregister(uint8_t packet_type)
+{
+ batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet;
+}
+
+static struct batadv_algo_ops *batadv_algo_get(char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
+
+ hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) {
+ if (strcmp(bat_algo_ops_tmp->name, name) != 0)
+ continue;
+
+ bat_algo_ops = bat_algo_ops_tmp;
+ break;
+ }
+
+ return bat_algo_ops;
+}
+
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
+{
+ struct batadv_algo_ops *bat_algo_ops_tmp;
+ int ret;
+
+ bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name);
+ if (bat_algo_ops_tmp) {
+ pr_info("Trying to register already registered routing algorithm: %s\n",
+ bat_algo_ops->name);
+ ret = -EEXIST;
+ goto out;
+ }
+
+ /* all algorithms must implement all ops (for now) */
+ if (!bat_algo_ops->bat_iface_enable ||
+ !bat_algo_ops->bat_iface_disable ||
+ !bat_algo_ops->bat_iface_update_mac ||
+ !bat_algo_ops->bat_primary_iface_set ||
+ !bat_algo_ops->bat_ogm_schedule ||
+ !bat_algo_ops->bat_ogm_emit ||
+ !bat_algo_ops->bat_neigh_cmp ||
+ !bat_algo_ops->bat_neigh_is_equiv_or_better) {
+ pr_info("Routing algo '%s' does not implement required ops\n",
+ bat_algo_ops->name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&bat_algo_ops->list);
+ hlist_add_head(&bat_algo_ops->list, &batadv_algo_list);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+ int ret = -EINVAL;
+
+ bat_algo_ops = batadv_algo_get(name);
+ if (!bat_algo_ops)
+ goto out;
+
+ bat_priv->bat_algo_ops = bat_algo_ops;
+ ret = 0;
+
+out:
+ return ret;
+}
+
+int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+
+ seq_puts(seq, "Available routing algorithms:\n");
+
+ hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
+ seq_printf(seq, "%s\n", bat_algo_ops->name);
+ }
+
+ return 0;
+}
+
+/**
+ * batadv_skb_crc32 - calculate CRC32 of the whole packet and skip bytes in
+ * the header
+ * @skb: skb pointing to fragmented socket buffers
+ * @payload_ptr: Pointer to position inside the head buffer of the skb
+ * marking the start of the data to be CRC'ed
+ *
+ * payload_ptr must always point to an address in the skb head buffer and not to
+ * a fragment.
+ */
+__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
+{
+ u32 crc = 0;
+ unsigned int from;
+ unsigned int to = skb->len;
+ struct skb_seq_state st;
+ const u8 *data;
+ unsigned int len;
+ unsigned int consumed = 0;
+
+ from = (unsigned int)(payload_ptr - skb->data);
+
+ skb_prepare_seq_read(skb, from, to, &st);
+ while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
+ crc = crc32c(crc, data, len);
+ consumed += len;
+ }
+
+ return htonl(crc);
+}
+
+/**
+ * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and
+ * possibly free it
+ * @tvlv_handler: the tvlv handler to free
+ */
+static void
+batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
+{
+ if (atomic_dec_and_test(&tvlv_handler->refcount))
+ kfree_rcu(tvlv_handler, rcu);
+}
+
+/**
+ * batadv_tvlv_handler_get - retrieve tvlv handler from the tvlv handler list
+ * based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv handler type to look for
+ * @version: tvlv handler version to look for
+ *
+ * Returns tvlv handler if found or NULL otherwise.
+ */
+static struct batadv_tvlv_handler
+*batadv_tvlv_handler_get(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version)
+{
+ struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler_tmp,
+ &bat_priv->tvlv.handler_list, list) {
+ if (tvlv_handler_tmp->type != type)
+ continue;
+
+ if (tvlv_handler_tmp->version != version)
+ continue;
+
+ if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount))
+ continue;
+
+ tvlv_handler = tvlv_handler_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tvlv_handler;
+}
+
+/**
+ * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and
+ * possibly free it
+ * @tvlv: the tvlv container to free
+ */
+static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
+{
+ if (atomic_dec_and_test(&tvlv->refcount))
+ kfree(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_get - retrieve tvlv container from the tvlv container
+ * list based on the provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type to look for
+ * @version: tvlv container version to look for
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Returns tvlv container if found or NULL otherwise.
+ */
+static struct batadv_tvlv_container
+*batadv_tvlv_container_get(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version)
+{
+ struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
+
+ hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
+ if (tvlv_tmp->tvlv_hdr.type != type)
+ continue;
+
+ if (tvlv_tmp->tvlv_hdr.version != version)
+ continue;
+
+ if (!atomic_inc_not_zero(&tvlv_tmp->refcount))
+ continue;
+
+ tvlv = tvlv_tmp;
+ break;
+ }
+
+ return tvlv;
+}
+
+/**
+ * batadv_tvlv_container_list_size - calculate the size of the tvlv container
+ * list entries
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ *
+ * Returns size of all currently registered tvlv containers in bytes.
+ */
+static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_container *tvlv;
+ uint16_t tvlv_len = 0;
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_len += sizeof(struct batadv_tvlv_hdr);
+ tvlv_len += ntohs(tvlv->tvlv_hdr.len);
+ }
+
+ return tvlv_len;
+}
+
+/**
+ * batadv_tvlv_container_remove - remove tvlv container from the tvlv container
+ * list
+ * @tvlv: the to be removed tvlv container
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (tvlv.container_list_lock).
+ */
+static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv)
+{
+ if (!tvlv)
+ return;
+
+ hlist_del(&tvlv->list);
+
+ /* first call to decrement the counter, second call to free */
+ batadv_tvlv_container_free_ref(tvlv);
+ batadv_tvlv_container_free_ref(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_unregister - unregister tvlv container based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type to unregister
+ * @version: tvlv container type to unregister
+ */
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(tvlv);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+}
+
+/**
+ * batadv_tvlv_container_register - register tvlv type, version and content
+ * to be propagated with each (primary interface) OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv container type
+ * @version: tvlv container version
+ * @tvlv_value: tvlv container content
+ * @tvlv_value_len: tvlv container content length
+ *
+ * If a container of the same type and version was already registered the new
+ * content is going to replace the old one.
+ */
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version,
+ void *tvlv_value, uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_container *tvlv_old, *tvlv_new;
+
+ if (!tvlv_value)
+ tvlv_value_len = 0;
+
+ tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC);
+ if (!tvlv_new)
+ return;
+
+ tvlv_new->tvlv_hdr.version = version;
+ tvlv_new->tvlv_hdr.type = type;
+ tvlv_new->tvlv_hdr.len = htons(tvlv_value_len);
+
+ memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
+ INIT_HLIST_NODE(&tvlv_new->list);
+ atomic_set(&tvlv_new->refcount, 1);
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
+ batadv_tvlv_container_remove(tvlv_old);
+ hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list);
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+}
+
+/**
+ * batadv_tvlv_realloc_packet_buff - reallocate packet buffer to accommodate
+ * requested packet size
+ * @packet_buff: packet buffer
+ * @packet_buff_len: packet buffer size
+ * @min_packet_len: requested packet minimum size
+ * @additional_packet_len: requested additional packet size on top of minimum
+ * size
+ *
+ * Returns true of the packet buffer could be changed to the requested size,
+ * false otherwise.
+ */
+static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
+ int *packet_buff_len,
+ int min_packet_len,
+ int additional_packet_len)
+{
+ unsigned char *new_buff;
+
+ new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC);
+
+ /* keep old buffer if kmalloc should fail */
+ if (new_buff) {
+ memcpy(new_buff, *packet_buff, min_packet_len);
+ kfree(*packet_buff);
+ *packet_buff = new_buff;
+ *packet_buff_len = min_packet_len + additional_packet_len;
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * batadv_tvlv_container_ogm_append - append tvlv container content to given
+ * OGM packet buffer
+ * @bat_priv: the bat priv with all the soft interface information
+ * @packet_buff: ogm packet buffer
+ * @packet_buff_len: ogm packet buffer size including ogm header and tvlv
+ * content
+ * @packet_min_len: ogm header size to be preserved for the OGM itself
+ *
+ * The ogm packet might be enlarged or shrunk depending on the current size
+ * and the size of the to-be-appended tvlv containers.
+ *
+ * Returns size of all appended tvlv containers in bytes.
+ */
+uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len,
+ int packet_min_len)
+{
+ struct batadv_tvlv_container *tvlv;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ uint16_t tvlv_value_len;
+ void *tvlv_value;
+ bool ret;
+
+ spin_lock_bh(&bat_priv->tvlv.container_list_lock);
+ tvlv_value_len = batadv_tvlv_container_list_size(bat_priv);
+
+ ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len,
+ packet_min_len, tvlv_value_len);
+
+ if (!ret)
+ goto end;
+
+ if (!tvlv_value_len)
+ goto end;
+
+ tvlv_value = (*packet_buff) + packet_min_len;
+
+ hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
+ tvlv_hdr = tvlv_value;
+ tvlv_hdr->type = tvlv->tvlv_hdr.type;
+ tvlv_hdr->version = tvlv->tvlv_hdr.version;
+ tvlv_hdr->len = tvlv->tvlv_hdr.len;
+ tvlv_value = tvlv_hdr + 1;
+ memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len));
+ tvlv_value = (uint8_t *)tvlv_value + ntohs(tvlv->tvlv_hdr.len);
+ }
+
+end:
+ spin_unlock_bh(&bat_priv->tvlv.container_list_lock);
+ return tvlv_value_len;
+}
+
+/**
+ * batadv_tvlv_call_handler - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tvlv_handler: tvlv callback function handling the tvlv content
+ * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet
+ * @orig_node: orig node emitting the ogm packet
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Returns success if handler was not found or the return value of the handler
+ * callback.
+ */
+static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_handler *tvlv_handler,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value, uint16_t tvlv_value_len)
+{
+ if (!tvlv_handler)
+ return NET_RX_SUCCESS;
+
+ if (ogm_source) {
+ if (!tvlv_handler->ogm_handler)
+ return NET_RX_SUCCESS;
+
+ if (!orig_node)
+ return NET_RX_SUCCESS;
+
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ BATADV_NO_FLAGS,
+ tvlv_value, tvlv_value_len);
+ tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED;
+ } else {
+ if (!src)
+ return NET_RX_SUCCESS;
+
+ if (!dst)
+ return NET_RX_SUCCESS;
+
+ if (!tvlv_handler->unicast_handler)
+ return NET_RX_SUCCESS;
+
+ return tvlv_handler->unicast_handler(bat_priv, src,
+ dst, tvlv_value,
+ tvlv_value_len);
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_containers_process - parse the given tvlv buffer to call the
+ * appropriate handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet
+ * @orig_node: orig node emitting the ogm packet
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ *
+ * Returns success when processing an OGM or the return value of all called
+ * handler callbacks.
+ */
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value, uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ uint16_t tvlv_value_cont_len;
+ uint8_t cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND;
+ int ret = NET_RX_SUCCESS;
+
+ while (tvlv_value_len >= sizeof(*tvlv_hdr)) {
+ tvlv_hdr = tvlv_value;
+ tvlv_value_cont_len = ntohs(tvlv_hdr->len);
+ tvlv_value = tvlv_hdr + 1;
+ tvlv_value_len -= sizeof(*tvlv_hdr);
+
+ if (tvlv_value_cont_len > tvlv_value_len)
+ break;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv,
+ tvlv_hdr->type,
+ tvlv_hdr->version);
+
+ ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler,
+ ogm_source, orig_node,
+ src, dst, tvlv_value,
+ tvlv_value_cont_len);
+ if (tvlv_handler)
+ batadv_tvlv_handler_free_ref(tvlv_handler);
+ tvlv_value = (uint8_t *)tvlv_value + tvlv_value_cont_len;
+ tvlv_value_len -= tvlv_value_cont_len;
+ }
+
+ if (!ogm_source)
+ return ret;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tvlv_handler,
+ &bat_priv->tvlv.handler_list, list) {
+ if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) &&
+ !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED))
+ tvlv_handler->ogm_handler(bat_priv, orig_node,
+ cifnotfound, NULL, 0);
+
+ tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED;
+ }
+ rcu_read_unlock();
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tvlv_ogm_receive - process an incoming ogm and call the appropriate
+ * handlers
+ * @bat_priv: the bat priv with all the soft interface information
+ * @batadv_ogm_packet: ogm packet containing the tvlv containers
+ * @orig_node: orig node emitting the ogm packet
+ */
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node)
+{
+ void *tvlv_value;
+ uint16_t tvlv_value_len;
+
+ if (!batadv_ogm_packet)
+ return;
+
+ tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len);
+ if (!tvlv_value_len)
+ return;
+
+ tvlv_value = batadv_ogm_packet + 1;
+
+ batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL,
+ tvlv_value, tvlv_value_len);
+}
+
+/**
+ * batadv_tvlv_handler_register - register tvlv handler based on the provided
+ * type and version (both need to match) for ogm tvlv payload and/or unicast
+ * payload
+ * @bat_priv: the bat priv with all the soft interface information
+ * @optr: ogm tvlv handler callback function. This function receives the orig
+ * node, flags and the tvlv content as argument to process.
+ * @uptr: unicast tvlv handler callback function. This function receives the
+ * source & destination of the unicast packet as well as the tvlv content
+ * to process.
+ * @type: tvlv handler type to be registered
+ * @version: tvlv handler version to be registered
+ * @flags: flags to enable or disable TVLV API behavior
+ */
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value,
+ uint16_t tvlv_value_len),
+ uint8_t type, uint8_t version, uint8_t flags)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (tvlv_handler) {
+ batadv_tvlv_handler_free_ref(tvlv_handler);
+ return;
+ }
+
+ tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC);
+ if (!tvlv_handler)
+ return;
+
+ tvlv_handler->ogm_handler = optr;
+ tvlv_handler->unicast_handler = uptr;
+ tvlv_handler->type = type;
+ tvlv_handler->version = version;
+ tvlv_handler->flags = flags;
+ atomic_set(&tvlv_handler->refcount, 1);
+ INIT_HLIST_NODE(&tvlv_handler->list);
+
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+}
+
+/**
+ * batadv_tvlv_handler_unregister - unregister tvlv handler based on the
+ * provided type and version (both need to match)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @type: tvlv handler type to be unregistered
+ * @version: tvlv handler version to be unregistered
+ */
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
+ if (!tvlv_handler)
+ return;
+
+ batadv_tvlv_handler_free_ref(tvlv_handler);
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+ hlist_del_rcu(&tvlv_handler->list);
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
+ batadv_tvlv_handler_free_ref(tvlv_handler);
+}
+
+/**
+ * batadv_tvlv_unicast_send - send a unicast packet with tvlv payload to the
+ * specified host
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: source mac address of the unicast packet
+ * @dst: destination mac address of the unicast packet
+ * @type: tvlv type
+ * @version: tvlv version
+ * @tvlv_value: tvlv content
+ * @tvlv_value_len: tvlv content length
+ */
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src,
+ uint8_t *dst, uint8_t type, uint8_t version,
+ void *tvlv_value, uint16_t tvlv_value_len)
+{
+ struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ struct batadv_orig_node *orig_node;
+ struct sk_buff *skb = NULL;
+ unsigned char *tvlv_buff;
+ unsigned int tvlv_len;
+ ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
+ bool ret = false;
+
+ orig_node = batadv_orig_hash_find(bat_priv, dst);
+ if (!orig_node)
+ goto out;
+
+ tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len);
+ if (!skb)
+ goto out;
+
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, ETH_HLEN);
+ tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len);
+ unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff;
+ unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV;
+ unicast_tvlv_packet->version = BATADV_COMPAT_VERSION;
+ unicast_tvlv_packet->ttl = BATADV_TTL;
+ unicast_tvlv_packet->reserved = 0;
+ unicast_tvlv_packet->tvlv_len = htons(tvlv_len);
+ unicast_tvlv_packet->align = 0;
+ ether_addr_copy(unicast_tvlv_packet->src, src);
+ ether_addr_copy(unicast_tvlv_packet->dst, dst);
+
+ tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1);
+ tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff;
+ tvlv_hdr->version = version;
+ tvlv_hdr->type = type;
+ tvlv_hdr->len = htons(tvlv_value_len);
+ tvlv_buff += sizeof(*tvlv_hdr);
+ memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
+
+ if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
+ ret = true;
+
+out:
+ if (skb && !ret)
+ kfree_skb(skb);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+}
+
+/**
+ * batadv_get_vid - extract the VLAN identifier from skb if any
+ * @skb: the buffer containing the packet
+ * @header_len: length of the batman header preceding the ethernet header
+ *
+ * If the packet embedded in the skb is vlan tagged this function returns the
+ * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ */
+unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
+{
+ struct ethhdr *ethhdr = (struct ethhdr *)(skb->data + header_len);
+ struct vlan_ethhdr *vhdr;
+ unsigned short vid;
+
+ if (ethhdr->h_proto != htons(ETH_P_8021Q))
+ return BATADV_NO_FLAGS;
+
+ if (!pskb_may_pull(skb, header_len + VLAN_ETH_HLEN))
+ return BATADV_NO_FLAGS;
+
+ vhdr = (struct vlan_ethhdr *)(skb->data + header_len);
+ vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;
+ vid |= BATADV_VLAN_HAS_TAG;
+
+ return vid;
+}
+
+/**
+ * batadv_vlan_ap_isola_get - return the AP isolation status for the given vlan
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier for which the AP isolation attributed as to be
+ * looked up
+ *
+ * Returns true if AP isolation is on for the VLAN idenfied by vid, false
+ * otherwise
+ */
+bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
+{
+ bool ap_isolation_enabled = false;
+ struct batadv_softif_vlan *vlan;
+
+ /* if the AP isolation is requested on a VLAN, then check for its
+ * setting in the proper VLAN private data structure
+ */
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (vlan) {
+ ap_isolation_enabled = atomic_read(&vlan->ap_isolation);
+ batadv_softif_vlan_free_ref(vlan);
+ }
+
+ return ap_isolation_enabled;
+}
+
+static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
+{
+ struct batadv_algo_ops *bat_algo_ops;
+ char *algo_name = (char *)val;
+ size_t name_len = strlen(algo_name);
+
+ if (name_len > 0 && algo_name[name_len - 1] == '\n')
+ algo_name[name_len - 1] = '\0';
+
+ bat_algo_ops = batadv_algo_get(algo_name);
+ if (!bat_algo_ops) {
+ pr_err("Routing algorithm '%s' is not supported\n", algo_name);
+ return -EINVAL;
+ }
+
+ return param_set_copystring(algo_name, kp);
+}
+
+static const struct kernel_param_ops batadv_param_ops_ra = {
+ .set = batadv_param_set_ra,
+ .get = param_get_string,
+};
+
+static struct kparam_string batadv_param_string_ra = {
+ .maxlen = sizeof(batadv_routing_algo),
+ .string = batadv_routing_algo,
+};
+
+module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra,
+ 0644);
+module_init(batadv_init);
+module_exit(batadv_exit);
+
+MODULE_LICENSE("GPL");
+
+MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
+MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
+MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
+MODULE_VERSION(BATADV_SOURCE_VERSION);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
new file mode 100644
index 000000000..4d2318829
--- /dev/null
+++ b/net/batman-adv/main.h
@@ -0,0 +1,389 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_MAIN_H_
+#define _NET_BATMAN_ADV_MAIN_H_
+
+#define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \
+ "Simon Wunderlich <sw@simonwunderlich.de>"
+#define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced"
+#define BATADV_DRIVER_DEVICE "batman-adv"
+
+#ifndef BATADV_SOURCE_VERSION
+#define BATADV_SOURCE_VERSION "2015.0"
+#endif
+
+/* B.A.T.M.A.N. parameters */
+
+#define BATADV_TQ_MAX_VALUE 255
+#define BATADV_JITTER 20
+
+/* Time To Live of broadcast messages */
+#define BATADV_TTL 50
+
+/* purge originators after time in seconds if no valid packet comes in
+ * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE
+ */
+#define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */
+#define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */
+#define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */
+#define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */
+#define BATADV_DAT_ENTRY_TIMEOUT (5*60000) /* 5 mins in milliseconds */
+/* sliding packet range of received originator messages in sequence numbers
+ * (should be a multiple of our word size)
+ */
+#define BATADV_TQ_LOCAL_WINDOW_SIZE 64
+/* milliseconds we have to keep pending tt_req */
+#define BATADV_TT_REQUEST_TIMEOUT 3000
+
+#define BATADV_TQ_GLOBAL_WINDOW_SIZE 5
+#define BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM 1
+#define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1
+#define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1
+
+/* number of OGMs sent with the last tt diff */
+#define BATADV_TT_OGM_APPEND_MAX 3
+
+/* Time in which a client can roam at most ROAMING_MAX_COUNT times in
+ * milliseconds
+ */
+#define BATADV_ROAMING_MAX_TIME 20000
+#define BATADV_ROAMING_MAX_COUNT 5
+
+#define BATADV_NO_FLAGS 0
+
+#define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */
+
+#define BATADV_NO_MARK 0
+
+/* default interface for multi interface operation. The default interface is
+ * used for communication which originated locally (i.e. is not forwarded)
+ * or where special forwarding is not desired/necessary.
+ */
+#define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL)
+
+#define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE)
+
+#define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */
+
+/* number of packets to send for broadcasts on different interface types */
+#define BATADV_NUM_BCASTS_DEFAULT 1
+#define BATADV_NUM_BCASTS_WIRELESS 3
+#define BATADV_NUM_BCASTS_MAX 3
+
+/* msecs after which an ARP_REQUEST is sent in broadcast as fallback */
+#define ARP_REQ_DELAY 250
+/* numbers of originator to contact for any PUT/GET DHT operation */
+#define BATADV_DAT_CANDIDATES_NUM 3
+
+/* BATADV_TQ_SIMILARITY_THRESHOLD - TQ points that a secondary metric can differ
+ * at most from the primary one in order to be still considered acceptable
+ */
+#define BATADV_TQ_SIMILARITY_THRESHOLD 50
+
+/* how much worse secondary interfaces may be to be considered as bonding
+ * candidates
+ */
+#define BATADV_BONDING_TQ_THRESHOLD 50
+
+/* should not be bigger than 512 bytes or change the size of
+ * forw_packet->direct_link_flags
+ */
+#define BATADV_MAX_AGGREGATION_BYTES 512
+#define BATADV_MAX_AGGREGATION_MS 100
+
+#define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */
+#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 3)
+#define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10)
+#define BATADV_BLA_WAIT_PERIODS 3
+
+#define BATADV_DUPLIST_SIZE 16
+#define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */
+/* don't reset again within 30 seconds */
+#define BATADV_RESET_PROTECTION_MS 30000
+#define BATADV_EXPECTED_SEQNO_RANGE 65536
+
+#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */
+
+enum batadv_mesh_state {
+ BATADV_MESH_INACTIVE,
+ BATADV_MESH_ACTIVE,
+ BATADV_MESH_DEACTIVATING,
+};
+
+#define BATADV_BCAST_QUEUE_LEN 256
+#define BATADV_BATMAN_QUEUE_LEN 256
+
+enum batadv_uev_action {
+ BATADV_UEV_ADD = 0,
+ BATADV_UEV_DEL,
+ BATADV_UEV_CHANGE,
+};
+
+enum batadv_uev_type {
+ BATADV_UEV_GW = 0,
+};
+
+#define BATADV_GW_THRESHOLD 50
+
+/* Number of fragment chains for each orig_node */
+#define BATADV_FRAG_BUFFER_COUNT 8
+/* Maximum number of fragments for one packet */
+#define BATADV_FRAG_MAX_FRAGMENTS 16
+/* Maxumim size of each fragment */
+#define BATADV_FRAG_MAX_FRAG_SIZE 1400
+/* Time to keep fragments while waiting for rest of the fragments */
+#define BATADV_FRAG_TIMEOUT 10000
+
+#define BATADV_DAT_CANDIDATE_NOT_FOUND 0
+#define BATADV_DAT_CANDIDATE_ORIG 1
+
+/* Debug Messages */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+/* Append 'batman-adv: ' before kernel messages */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/* Kernel headers */
+
+#include <linux/mutex.h> /* mutex */
+#include <linux/module.h> /* needed by all modules */
+#include <linux/netdevice.h> /* netdevice */
+#include <linux/etherdevice.h> /* ethernet address classification */
+#include <linux/if_ether.h> /* ethernet header */
+#include <linux/poll.h> /* poll_table */
+#include <linux/kthread.h> /* kernel threads */
+#include <linux/pkt_sched.h> /* schedule types */
+#include <linux/workqueue.h> /* workqueue */
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <net/sock.h> /* struct sock */
+#include <net/addrconf.h> /* ipv6 address stuff */
+#include <linux/ip.h>
+#include <net/rtnetlink.h>
+#include <linux/jiffies.h>
+#include <linux/seq_file.h>
+#include <linux/if_vlan.h>
+
+#include "types.h"
+
+#define BATADV_PRINT_VID(vid) (vid & BATADV_VLAN_HAS_TAG ? \
+ (int)(vid & VLAN_VID_MASK) : -1)
+
+extern char batadv_routing_algo[];
+extern struct list_head batadv_hardif_list;
+
+extern unsigned char batadv_broadcast_addr[];
+extern struct workqueue_struct *batadv_event_workqueue;
+
+int batadv_mesh_init(struct net_device *soft_iface);
+void batadv_mesh_free(struct net_device *soft_iface);
+int batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr);
+struct batadv_hard_iface *
+batadv_seq_print_text_primary_if_get(struct seq_file *seq);
+int batadv_max_header_len(void);
+void batadv_skb_set_priority(struct sk_buff *skb, int offset);
+int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype,
+ struct net_device *orig_dev);
+int
+batadv_recv_handler_register(uint8_t packet_type,
+ int (*recv_handler)(struct sk_buff *,
+ struct batadv_hard_iface *));
+void batadv_recv_handler_unregister(uint8_t packet_type);
+int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
+int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
+int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
+__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
+
+/**
+ * enum batadv_dbg_level - available log levels
+ * @BATADV_DBG_BATMAN: OGM and TQ computations related messages
+ * @BATADV_DBG_ROUTES: route added / changed / deleted
+ * @BATADV_DBG_TT: translation table messages
+ * @BATADV_DBG_BLA: bridge loop avoidance messages
+ * @BATADV_DBG_DAT: ARP snooping and DAT related messages
+ * @BATADV_DBG_NC: network coding related messages
+ * @BATADV_DBG_ALL: the union of all the above log levels
+ */
+enum batadv_dbg_level {
+ BATADV_DBG_BATMAN = BIT(0),
+ BATADV_DBG_ROUTES = BIT(1),
+ BATADV_DBG_TT = BIT(2),
+ BATADV_DBG_BLA = BIT(3),
+ BATADV_DBG_DAT = BIT(4),
+ BATADV_DBG_NC = BIT(5),
+ BATADV_DBG_ALL = 63,
+};
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
+__printf(2, 3);
+
+/* possibly ratelimited debug output */
+#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
+ do { \
+ if (atomic_read(&bat_priv->log_level) & type && \
+ (!ratelimited || net_ratelimit())) \
+ batadv_debug_log(bat_priv, fmt, ## arg);\
+ } \
+ while (0)
+#else /* !CONFIG_BATMAN_ADV_DEBUG */
+__printf(4, 5)
+static inline void _batadv_dbg(int type __always_unused,
+ struct batadv_priv *bat_priv __always_unused,
+ int ratelimited __always_unused,
+ const char *fmt __always_unused, ...)
+{
+}
+#endif
+
+#define batadv_dbg(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 0, ## arg)
+#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
+ _batadv_dbg(type, bat_priv, 1, ## arg)
+
+#define batadv_info(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_info("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+#define batadv_err(net_dev, fmt, arg...) \
+ do { \
+ struct net_device *_netdev = (net_dev); \
+ struct batadv_priv *_batpriv = netdev_priv(_netdev); \
+ batadv_dbg(BATADV_DBG_ALL, _batpriv, fmt, ## arg); \
+ pr_err("%s: " fmt, _netdev->name, ## arg); \
+ } while (0)
+
+/* returns 1 if they are the same ethernet addr
+ *
+ * note: can't use ether_addr_equal() as it requires aligned memory
+ */
+static inline int batadv_compare_eth(const void *data1, const void *data2)
+{
+ return ether_addr_equal_unaligned(data1, data2);
+}
+
+/**
+ * has_timed_out - compares current time (jiffies) and timestamp + timeout
+ * @timestamp: base value to compare with (in jiffies)
+ * @timeout: added to base value before comparing (in milliseconds)
+ *
+ * Returns true if current time is after timestamp + timeout
+ */
+static inline bool batadv_has_timed_out(unsigned long timestamp,
+ unsigned int timeout)
+{
+ return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout));
+}
+
+#define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0)
+
+/* Returns the smallest signed integer in two's complement with the sizeof x */
+#define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u)))
+
+/* Checks if a sequence number x is a predecessor/successor of y.
+ * they handle overflows/underflows and can correctly check for a
+ * predecessor/successor unless the variable sequence number has grown by
+ * more then 2**(bitwidth(x)-1)-1.
+ * This means that for a uint8_t with the maximum value 255, it would think:
+ * - when adding nothing - it is neither a predecessor nor a successor
+ * - before adding more than 127 to the starting value - it is a predecessor,
+ * - when adding 128 - it is neither a predecessor nor a successor,
+ * - after adding more than 127 to the starting value - it is a successor
+ */
+#define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \
+ typeof(y)_d2 = (y); \
+ typeof(x)_dummy = (_d1 - _d2); \
+ (void)(&_d1 == &_d2); \
+ _dummy > batadv_smallest_signed_int(_dummy); })
+#define batadv_seq_after(x, y) batadv_seq_before(y, x)
+
+/* Stop preemption on local cpu while incrementing the counter */
+static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
+ size_t count)
+{
+ this_cpu_add(bat_priv->bat_counters[idx], count);
+}
+
+#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
+
+/* Sum and return the cpu-local counters for index 'idx' */
+static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv,
+ size_t idx)
+{
+ uint64_t *counters, sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ counters = per_cpu_ptr(bat_priv->bat_counters, cpu);
+ sum += counters[idx];
+ }
+
+ return sum;
+}
+
+/* Define a macro to reach the control buffer of the skb. The members of the
+ * control buffer are defined in struct batadv_skb_cb in types.h.
+ * The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h.
+ */
+#define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0]))
+
+void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version,
+ void *tvlv_value, uint16_t tvlv_value_len);
+uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
+ unsigned char **packet_buff,
+ int *packet_buff_len,
+ int packet_min_len);
+void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
+ struct batadv_ogm_packet *batadv_ogm_packet,
+ struct batadv_orig_node *orig_node);
+void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version);
+
+void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
+ void (*optr)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len),
+ int (*uptr)(struct batadv_priv *bat_priv,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value,
+ uint16_t tvlv_value_len),
+ uint8_t type, uint8_t version, uint8_t flags);
+void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
+ uint8_t type, uint8_t version);
+int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
+ bool ogm_source,
+ struct batadv_orig_node *orig_node,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_buff, uint16_t tvlv_buff_len);
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src,
+ uint8_t *dst, uint8_t type, uint8_t version,
+ void *tvlv_value, uint16_t tvlv_value_len);
+unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
+bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
+
+#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
new file mode 100644
index 000000000..b24e4bb64
--- /dev/null
+++ b/net/batman-adv/multicast.c
@@ -0,0 +1,750 @@
+/* Copyright (C) 2014 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "multicast.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "translation-table.h"
+
+/**
+ * batadv_mcast_mla_softif_get - get softif multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ *
+ * Collect multicast addresses of the local multicast listeners
+ * on the given soft interface, dev, in the given mcast_list.
+ *
+ * Returns -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+static int batadv_mcast_mla_softif_get(struct net_device *dev,
+ struct hlist_head *mcast_list)
+{
+ struct netdev_hw_addr *mc_list_entry;
+ struct batadv_hw_addr *new;
+ int ret = 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(mc_list_entry, dev) {
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mc_list_entry->addr);
+ hlist_add_head(&new->list, mcast_list);
+ ret++;
+ }
+ netif_addr_unlock_bh(dev);
+
+ return ret;
+}
+
+/**
+ * batadv_mcast_mla_is_duplicate - check whether an address is in a list
+ * @mcast_addr: the multicast address to check
+ * @mcast_list: the list with multicast addresses to search in
+ *
+ * Returns true if the given address is already in the given list.
+ * Otherwise returns false.
+ */
+static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+
+ hlist_for_each_entry(mcast_entry, mcast_list, list)
+ if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
+ return true;
+
+ return false;
+}
+
+/**
+ * batadv_mcast_mla_list_free - free a list of multicast addresses
+ * @mcast_list: the list to free
+ *
+ * Removes and frees all items in the given mcast_list.
+ */
+static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
+ hlist_del(&mcast_entry->list);
+ kfree(mcast_entry);
+ }
+}
+
+/**
+ * batadv_mcast_mla_tt_retract - clean up multicast listener announcements
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mcast_list: a list of addresses which should _not_ be removed
+ *
+ * Retracts the announcement of any multicast listener from the
+ * translation table except the ones listed in the given mcast_list.
+ *
+ * If mcast_list is NULL then all are retracted.
+ */
+static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
+ list) {
+ if (mcast_list &&
+ batadv_mcast_mla_is_duplicate(mcast_entry->addr,
+ mcast_list))
+ continue;
+
+ batadv_tt_local_remove(bat_priv, mcast_entry->addr,
+ BATADV_NO_FLAGS,
+ "mcast TT outdated", false);
+
+ hlist_del(&mcast_entry->list);
+ kfree(mcast_entry);
+ }
+}
+
+/**
+ * batadv_mcast_mla_tt_add - add multicast listener announcements
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mcast_list: a list of addresses which are going to get added
+ *
+ * Adds multicast listener announcements from the given mcast_list to the
+ * translation table if they have not been added yet.
+ */
+static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
+ struct hlist_head *mcast_list)
+{
+ struct batadv_hw_addr *mcast_entry;
+ struct hlist_node *tmp;
+
+ if (!mcast_list)
+ return;
+
+ hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) {
+ if (batadv_mcast_mla_is_duplicate(mcast_entry->addr,
+ &bat_priv->mcast.mla_list))
+ continue;
+
+ if (!batadv_tt_local_add(bat_priv->soft_iface,
+ mcast_entry->addr, BATADV_NO_FLAGS,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK))
+ continue;
+
+ hlist_del(&mcast_entry->list);
+ hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list);
+ }
+}
+
+/**
+ * batadv_mcast_has_bridge - check whether the soft-iface is bridged
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Checks whether there is a bridge on top of our soft interface. Returns
+ * true if so, false otherwise.
+ */
+static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
+{
+ struct net_device *upper = bat_priv->soft_iface;
+
+ rcu_read_lock();
+ do {
+ upper = netdev_master_upper_dev_get_rcu(upper);
+ } while (upper && !(upper->priv_flags & IFF_EBRIDGE));
+ rcu_read_unlock();
+
+ return upper;
+}
+
+/**
+ * batadv_mcast_mla_tvlv_update - update multicast tvlv
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Updates the own multicast tvlv with our current multicast related settings,
+ * capabilities and inabilities.
+ *
+ * Returns true if the tvlv container is registered afterwards. Otherwise
+ * returns false.
+ */
+static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
+{
+ struct batadv_tvlv_mcast_data mcast_data;
+
+ mcast_data.flags = BATADV_NO_FLAGS;
+ memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
+
+ /* Avoid attaching MLAs, if there is a bridge on top of our soft
+ * interface, we don't support that yet (TODO)
+ */
+ if (batadv_mcast_has_bridge(bat_priv)) {
+ if (bat_priv->mcast.enabled) {
+ batadv_tvlv_container_unregister(bat_priv,
+ BATADV_TVLV_MCAST, 1);
+ bat_priv->mcast.enabled = false;
+ }
+
+ return false;
+ }
+
+ if (!bat_priv->mcast.enabled ||
+ mcast_data.flags != bat_priv->mcast.flags) {
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 1,
+ &mcast_data, sizeof(mcast_data));
+ bat_priv->mcast.flags = mcast_data.flags;
+ bat_priv->mcast.enabled = true;
+ }
+
+ return true;
+}
+
+/**
+ * batadv_mcast_mla_update - update the own MLAs
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Updates the own multicast listener announcements in the translation
+ * table as well as the own, announced multicast tvlv container.
+ */
+void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+{
+ struct net_device *soft_iface = bat_priv->soft_iface;
+ struct hlist_head mcast_list = HLIST_HEAD_INIT;
+ int ret;
+
+ if (!batadv_mcast_mla_tvlv_update(bat_priv))
+ goto update;
+
+ ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list);
+ if (ret < 0)
+ goto out;
+
+update:
+ batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);
+ batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
+
+out:
+ batadv_mcast_mla_list_free(&mcast_list);
+}
+
+/**
+ * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the IPv4 packet to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ *
+ * Checks whether the given IPv4 packet has the potential to be forwarded with a
+ * mode more optimal than classic flooding.
+ *
+ * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of
+ * memory allocation failure.
+ */
+static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable)
+{
+ struct iphdr *iphdr;
+
+ /* We might fail due to out-of-memory -> drop it */
+ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr)))
+ return -ENOMEM;
+
+ iphdr = ip_hdr(skb);
+
+ /* TODO: Implement Multicast Router Discovery (RFC4286),
+ * then allow scope > link local, too
+ */
+ if (!ipv4_is_local_multicast(iphdr->daddr))
+ return -EINVAL;
+
+ /* link-local multicast listeners behind a bridge are
+ * not snoopable (see RFC4541, section 2.1.2.2)
+ */
+ *is_unsnoopable = true;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the IPv6 packet to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ *
+ * Checks whether the given IPv6 packet has the potential to be forwarded with a
+ * mode more optimal than classic flooding.
+ *
+ * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
+ * of memory.
+ */
+static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable)
+{
+ struct ipv6hdr *ip6hdr;
+
+ /* We might fail due to out-of-memory -> drop it */
+ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr)))
+ return -ENOMEM;
+
+ ip6hdr = ipv6_hdr(skb);
+
+ /* TODO: Implement Multicast Router Discovery (RFC4286),
+ * then allow scope > link local, too
+ */
+ if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL)
+ return -EINVAL;
+
+ /* link-local-all-nodes multicast listeners behind a bridge are
+ * not snoopable (see RFC4541, section 3, paragraph 3)
+ */
+ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr))
+ *is_unsnoopable = true;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_mode_check - check for optimized forwarding potential
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the multicast frame to check
+ * @is_unsnoopable: stores whether the destination is snoopable
+ *
+ * Checks whether the given multicast ethernet frame has the potential to be
+ * forwarded with a mode more optimal than classic flooding.
+ *
+ * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
+ * of memory.
+ */
+static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ bool *is_unsnoopable)
+{
+ struct ethhdr *ethhdr = eth_hdr(skb);
+
+ if (!atomic_read(&bat_priv->multicast_mode))
+ return -EINVAL;
+
+ if (atomic_read(&bat_priv->mcast.num_disabled))
+ return -EINVAL;
+
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
+ is_unsnoopable);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
+ is_unsnoopable);
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * batadv_mcast_want_all_ip_count - count nodes with unspecific mcast interest
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: ethernet header of a packet
+ *
+ * Returns the number of nodes which want all IPv4 multicast traffic if the
+ * given ethhdr is from an IPv4 packet or the number of nodes which want all
+ * IPv6 traffic if it matches an IPv6 packet.
+ */
+static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr)
+{
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return atomic_read(&bat_priv->mcast.num_want_all_ipv4);
+ case ETH_P_IPV6:
+ return atomic_read(&bat_priv->mcast.num_want_all_ipv6);
+ default:
+ /* we shouldn't be here... */
+ return 0;
+ }
+}
+
+/**
+ * batadv_mcast_forw_tt_node_get - get a multicast tt node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: the ether header containing the multicast destination
+ *
+ * Returns an orig_node matching the multicast address provided by ethhdr
+ * via a translation table lookup. This increases the returned nodes refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr)
+{
+ return batadv_transtable_search(bat_priv, ethhdr->h_source,
+ ethhdr->h_dest, BATADV_NO_FLAGS);
+}
+
+/**
+ * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
+ * increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
+{
+ struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_orig_node,
+ &bat_priv->mcast.want_all_ipv4_list,
+ mcast_want_all_ipv4_node) {
+ if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ continue;
+
+ orig_node = tmp_orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node;
+}
+
+/**
+ * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
+ * and increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
+{
+ struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_orig_node,
+ &bat_priv->mcast.want_all_ipv6_list,
+ mcast_want_all_ipv6_node) {
+ if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ continue;
+
+ orig_node = tmp_orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node;
+}
+
+/**
+ * batadv_mcast_want_forw_ip_node_get - get a node with an ipv4/ipv6 flag
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: an ethernet header to determine the protocol family from
+ *
+ * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
+ * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and
+ * increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr)
+{
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_ipv4_node_get(bat_priv);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_ipv6_node_get(bat_priv);
+ default:
+ /* we shouldn't be here... */
+ return NULL;
+ }
+}
+
+/**
+ * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
+ * set and increases its refcount.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
+{
+ struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_orig_node,
+ &bat_priv->mcast.want_all_unsnoopables_list,
+ mcast_want_all_unsnoopables_node) {
+ if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ continue;
+
+ orig_node = tmp_orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node;
+}
+
+/**
+ * batadv_mcast_forw_mode - check on how to forward a multicast packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: The multicast packet to check
+ * @orig: an originator to be set to forward the skb to
+ *
+ * Returns the forwarding mode as enum batadv_forw_mode and in case of
+ * BATADV_FORW_SINGLE set the orig to the single originator the skb
+ * should be forwarded to.
+ */
+enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_orig_node **orig)
+{
+ int ret, tt_count, ip_count, unsnoop_count, total_count;
+ bool is_unsnoopable = false;
+ struct ethhdr *ethhdr;
+
+ ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable);
+ if (ret == -ENOMEM)
+ return BATADV_FORW_NONE;
+ else if (ret < 0)
+ return BATADV_FORW_ALL;
+
+ ethhdr = eth_hdr(skb);
+
+ tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest,
+ BATADV_NO_FLAGS);
+ ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr);
+ unsnoop_count = !is_unsnoopable ? 0 :
+ atomic_read(&bat_priv->mcast.num_want_all_unsnoopables);
+
+ total_count = tt_count + ip_count + unsnoop_count;
+
+ switch (total_count) {
+ case 1:
+ if (tt_count)
+ *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr);
+ else if (ip_count)
+ *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr);
+ else if (unsnoop_count)
+ *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv);
+
+ if (*orig)
+ return BATADV_FORW_SINGLE;
+
+ /* fall through */
+ case 0:
+ return BATADV_FORW_NONE;
+ default:
+ return BATADV_FORW_ALL;
+ }
+}
+
+/**
+ * batadv_mcast_want_unsnoop_update - update unsnoop counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator,
+ * orig, has toggled then this method updates counter and list accordingly.
+ */
+static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t mcast_flags)
+{
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_add_head_rcu(&orig->mcast_want_all_unsnoopables_node,
+ &bat_priv->mcast.want_all_unsnoopables_list);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) {
+ atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_del_rcu(&orig->mcast_want_all_unsnoopables_node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_ipv4_update - update want-all-ipv4 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ */
+static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t mcast_flags)
+{
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_ipv4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_add_head_rcu(&orig->mcast_want_all_ipv4_node,
+ &bat_priv->mcast.want_all_ipv4_list);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) {
+ atomic_dec(&bat_priv->mcast.num_want_all_ipv4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_del_rcu(&orig->mcast_want_all_ipv4_node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_ipv6_update - update want-all-ipv6 counter and list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has
+ * toggled then this method updates counter and list accordingly.
+ */
+static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t mcast_flags)
+{
+ /* switched from flag unset to set */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) {
+ atomic_inc(&bat_priv->mcast.num_want_all_ipv6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_add_head_rcu(&orig->mcast_want_all_ipv6_node,
+ &bat_priv->mcast.want_all_ipv6_list);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag set to unset */
+ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) {
+ atomic_dec(&bat_priv->mcast.num_want_all_ipv6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ hlist_del_rcu(&orig->mcast_want_all_ipv6_node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_tvlv_ogm_handler_v1 - process incoming multicast tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the multicast data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ uint8_t mcast_flags = BATADV_NO_FLAGS;
+ bool orig_initialized;
+
+ orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST;
+
+ /* If mcast support is turned on decrease the disabled mcast node
+ * counter only if we had increased it for this node before. If this
+ * is a completely new orig_node no need to decrease the counter.
+ */
+ if (orig_mcast_enabled &&
+ !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) {
+ if (orig_initialized)
+ atomic_dec(&bat_priv->mcast.num_disabled);
+ orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST;
+ /* If mcast support is being switched off or if this is an initial
+ * OGM without mcast support then increase the disabled mcast
+ * node counter.
+ */
+ } else if (!orig_mcast_enabled &&
+ (orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST ||
+ !orig_initialized)) {
+ atomic_inc(&bat_priv->mcast.num_disabled);
+ orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST;
+ }
+
+ orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST;
+
+ if (orig_mcast_enabled && tvlv_value &&
+ (tvlv_value_len >= sizeof(mcast_flags)))
+ mcast_flags = *(uint8_t *)tvlv_value;
+
+ batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
+
+ orig->mcast_flags = mcast_flags;
+}
+
+/**
+ * batadv_mcast_init - initialize the multicast optimizations structures
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_mcast_init(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler_v1,
+ NULL, BATADV_TVLV_MCAST, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+}
+
+/**
+ * batadv_mcast_free - free the multicast optimizations structures
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_mcast_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
+
+ batadv_mcast_mla_tt_retract(bat_priv, NULL);
+}
+
+/**
+ * batadv_mcast_purge_orig - reset originator global mcast state modifications
+ * @orig: the originator which is going to get purged
+ */
+void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
+{
+ struct batadv_priv *bat_priv = orig->bat_priv;
+
+ if (!(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST) &&
+ orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST)
+ atomic_dec(&bat_priv->mcast.num_disabled);
+
+ batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+}
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
new file mode 100644
index 000000000..3a44ebdb4
--- /dev/null
+++ b/net/batman-adv/multicast.h
@@ -0,0 +1,77 @@
+/* Copyright (C) 2014 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_MULTICAST_H_
+#define _NET_BATMAN_ADV_MULTICAST_H_
+
+/**
+ * batadv_forw_mode - the way a packet should be forwarded as
+ * @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic
+ * flooding)
+ * @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the
+ * BATMAN unicast routing protocol)
+ * @BATADV_FORW_NONE: don't forward, drop it
+ */
+enum batadv_forw_mode {
+ BATADV_FORW_ALL,
+ BATADV_FORW_SINGLE,
+ BATADV_FORW_NONE,
+};
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+
+void batadv_mcast_mla_update(struct batadv_priv *bat_priv);
+
+enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_orig_node **mcast_single_orig);
+
+void batadv_mcast_init(struct batadv_priv *bat_priv);
+
+void batadv_mcast_free(struct batadv_priv *bat_priv);
+
+void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
+
+#else
+
+static inline void batadv_mcast_mla_update(struct batadv_priv *bat_priv)
+{
+}
+
+static inline enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_orig_node **mcast_single_orig)
+{
+ return BATADV_FORW_ALL;
+}
+
+static inline int batadv_mcast_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_mcast_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+#endif /* _NET_BATMAN_ADV_MULTICAST_H_ */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
new file mode 100644
index 000000000..127cc4d73
--- /dev/null
+++ b/net/batman-adv/network-coding.c
@@ -0,0 +1,1936 @@
+/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll, Jeppe Ledet-Pedersen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/debugfs.h>
+
+#include "main.h"
+#include "hash.h"
+#include "network-coding.h"
+#include "send.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "routing.h"
+
+static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
+static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
+
+static void batadv_nc_worker(struct work_struct *work);
+static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+
+/**
+ * batadv_nc_init - one-time initialization for network coding
+ */
+int __init batadv_nc_init(void)
+{
+ int ret;
+
+ /* Register our packet type */
+ ret = batadv_recv_handler_register(BATADV_CODED,
+ batadv_nc_recv_coded_packet);
+
+ return ret;
+}
+
+/**
+ * batadv_nc_start_timer - initialise the nc periodic worker
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
+{
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work,
+ msecs_to_jiffies(10));
+}
+
+/**
+ * batadv_nc_tvlv_container_update - update the network coding tvlv container
+ * after network coding setting change
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ char nc_mode;
+
+ nc_mode = atomic_read(&bat_priv->network_coding);
+
+ switch (nc_mode) {
+ case 0:
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
+ break;
+ case 1:
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1,
+ NULL, 0);
+ break;
+ }
+}
+
+/**
+ * batadv_nc_status_update - update the network coding tvlv container after
+ * network coding setting change
+ * @net_dev: the soft interface net device
+ */
+void batadv_nc_status_update(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+
+ batadv_nc_tvlv_container_update(bat_priv);
+}
+
+/**
+ * batadv_nc_tvlv_ogm_handler_v1 - process incoming nc tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
+ orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC;
+ else
+ orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC;
+}
+
+/**
+ * batadv_nc_mesh_init - initialise coding hash table and start house keeping
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
+{
+ bat_priv->nc.timestamp_fwd_flush = jiffies;
+ bat_priv->nc.timestamp_sniffed_purge = jiffies;
+
+ if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash)
+ return 0;
+
+ bat_priv->nc.coding_hash = batadv_hash_new(128);
+ if (!bat_priv->nc.coding_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
+ &batadv_nc_coding_hash_lock_class_key);
+
+ bat_priv->nc.decoding_hash = batadv_hash_new(128);
+ if (!bat_priv->nc.decoding_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->nc.decoding_hash,
+ &batadv_nc_decoding_hash_lock_class_key);
+
+ INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker);
+ batadv_nc_start_timer(bat_priv);
+
+ batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1,
+ NULL, BATADV_TVLV_NC, 1,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ batadv_nc_tvlv_container_update(bat_priv);
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_nc_init_bat_priv - initialise the nc specific bat_priv variables
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
+{
+ atomic_set(&bat_priv->network_coding, 1);
+ bat_priv->nc.min_tq = 200;
+ bat_priv->nc.max_fwd_delay = 10;
+ bat_priv->nc.max_buffer_time = 200;
+}
+
+/**
+ * batadv_nc_init_orig - initialise the nc fields of an orig_node
+ * @orig_node: the orig_node which is going to be initialised
+ */
+void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
+{
+ INIT_LIST_HEAD(&orig_node->in_coding_list);
+ INIT_LIST_HEAD(&orig_node->out_coding_list);
+ spin_lock_init(&orig_node->in_coding_list_lock);
+ spin_lock_init(&orig_node->out_coding_list_lock);
+}
+
+/**
+ * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove
+ * its refcount on the orig_node
+ * @rcu: rcu pointer of the nc node
+ */
+static void batadv_nc_node_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_nc_node *nc_node;
+
+ nc_node = container_of(rcu, struct batadv_nc_node, rcu);
+ batadv_orig_node_free_ref(nc_node->orig_node);
+ kfree(nc_node);
+}
+
+/**
+ * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly
+ * frees it
+ * @nc_node: the nc node to free
+ */
+static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
+{
+ if (atomic_dec_and_test(&nc_node->refcount))
+ call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu);
+}
+
+/**
+ * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly
+ * frees it
+ * @nc_path: the nc node to free
+ */
+static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
+{
+ if (atomic_dec_and_test(&nc_path->refcount))
+ kfree_rcu(nc_path, rcu);
+}
+
+/**
+ * batadv_nc_packet_free - frees nc packet
+ * @nc_packet: the nc packet to free
+ */
+static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
+{
+ if (nc_packet->skb)
+ kfree_skb(nc_packet->skb);
+
+ batadv_nc_path_free_ref(nc_packet->nc_path);
+ kfree(nc_packet);
+}
+
+/**
+ * batadv_nc_to_purge_nc_node - checks whether an nc node has to be purged
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_node: the nc node to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_nc_node *nc_node)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT);
+}
+
+/**
+ * batadv_nc_to_purge_nc_path_coding - checks whether an nc path has timed out
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ /* purge the path when no packets has been added for 10 times the
+ * max_fwd_delay time
+ */
+ return batadv_has_timed_out(nc_path->last_valid,
+ bat_priv->nc.max_fwd_delay * 10);
+}
+
+/**
+ * batadv_nc_to_purge_nc_path_decoding - checks whether an nc path has timed out
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path to check
+ *
+ * Returns true if the entry has to be purged now, false otherwise
+ */
+static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path)
+{
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ return true;
+
+ /* purge the path when no packets has been added for 10 times the
+ * max_buffer time
+ */
+ return batadv_has_timed_out(nc_path->last_valid,
+ bat_priv->nc.max_buffer_time*10);
+}
+
+/**
+ * batadv_nc_purge_orig_nc_nodes - go through list of nc nodes and purge stale
+ * entries
+ * @bat_priv: the bat priv with all the soft interface information
+ * @list: list of nc nodes
+ * @lock: nc node list lock
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true if the entry has to be deleted, false
+ * otherwise
+ */
+static void
+batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
+ struct list_head *list,
+ spinlock_t *lock,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+ struct batadv_nc_node *nc_node, *nc_node_tmp;
+
+ /* For each nc_node in list */
+ spin_lock_bh(lock);
+ list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) {
+ /* if an helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(bat_priv, nc_node))
+ continue;
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "Removing nc_node %pM -> %pM\n",
+ nc_node->addr, nc_node->orig_node->orig);
+ list_del_rcu(&nc_node->list);
+ batadv_nc_node_free_ref(nc_node);
+ }
+ spin_unlock_bh(lock);
+}
+
+/**
+ * batadv_nc_purge_orig - purges all nc node data attached of the given
+ * originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig_node with the nc node entries to be purged
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true is the entry has to be deleted, false
+ * otherwise
+ */
+void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+ /* Check ingoing nc_node's of this orig_node */
+ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list,
+ &orig_node->in_coding_list_lock,
+ to_purge);
+
+ /* Check outgoing nc_node's of this orig_node */
+ batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list,
+ &orig_node->out_coding_list_lock,
+ to_purge);
+}
+
+/**
+ * batadv_nc_purge_orig_hash - traverse entire originator hash to check if they
+ * have timed out nc nodes
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ /* For each orig_node */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry)
+ batadv_nc_purge_orig(bat_priv, orig_node,
+ batadv_nc_to_purge_nc_node);
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_nc_purge_paths - traverse all nc paths part of the hash and remove
+ * unused ones
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: hash table containing the nc paths to check
+ * @to_purge: function in charge to decide whether an entry has to be purged or
+ * not. This function takes the nc node as argument and has to return
+ * a boolean value: true is the entry has to be deleted, false
+ * otherwise
+ */
+static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_path *))
+{
+ struct hlist_head *head;
+ struct hlist_node *node_tmp;
+ struct batadv_nc_path *nc_path;
+ spinlock_t *lock; /* Protects lists in hash */
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ lock = &hash->list_locks[i];
+
+ /* For each nc_path in this bin */
+ spin_lock_bh(lock);
+ hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) {
+ /* if an helper function has been passed as parameter,
+ * ask it if the entry has to be purged or not
+ */
+ if (to_purge && !to_purge(bat_priv, nc_path))
+ continue;
+
+ /* purging an non-empty nc_path should never happen, but
+ * is observed under high CPU load. Delay the purging
+ * until next iteration to allow the packet_list to be
+ * emptied first.
+ */
+ if (!unlikely(list_empty(&nc_path->packet_list))) {
+ net_ratelimited_function(printk,
+ KERN_WARNING
+ "Skipping free of non-empty nc_path (%pM -> %pM)!\n",
+ nc_path->prev_hop,
+ nc_path->next_hop);
+ continue;
+ }
+
+ /* nc_path is unused, so remove it */
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "Remove nc_path %pM -> %pM\n",
+ nc_path->prev_hop, nc_path->next_hop);
+ hlist_del_rcu(&nc_path->hash_entry);
+ batadv_nc_path_free_ref(nc_path);
+ }
+ spin_unlock_bh(lock);
+ }
+}
+
+/**
+ * batadv_nc_hash_key_gen - computes the nc_path hash key
+ * @key: buffer to hold the final hash key
+ * @src: source ethernet mac address going into the hash key
+ * @dst: destination ethernet mac address going into the hash key
+ */
+static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
+ const char *dst)
+{
+ memcpy(key->prev_hop, src, sizeof(key->prev_hop));
+ memcpy(key->next_hop, dst, sizeof(key->next_hop));
+}
+
+/**
+ * batadv_nc_hash_choose - compute the hash value for an nc path
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Returns the selected index in the hash table for the given data.
+ */
+static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size)
+{
+ const struct batadv_nc_path *nc_path = data;
+ uint32_t hash = 0;
+
+ hash = batadv_hash_bytes(hash, &nc_path->prev_hop,
+ sizeof(nc_path->prev_hop));
+ hash = batadv_hash_bytes(hash, &nc_path->next_hop,
+ sizeof(nc_path->next_hop));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/**
+ * batadv_nc_hash_compare - comparing function used in the network coding hash
+ * tables
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Returns 1 if the two entry are the same, 0 otherwise
+ */
+static int batadv_nc_hash_compare(const struct hlist_node *node,
+ const void *data2)
+{
+ const struct batadv_nc_path *nc_path1, *nc_path2;
+
+ nc_path1 = container_of(node, struct batadv_nc_path, hash_entry);
+ nc_path2 = data2;
+
+ /* Return 1 if the two keys are identical */
+ if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop,
+ sizeof(nc_path1->prev_hop)) != 0)
+ return 0;
+
+ if (memcmp(nc_path1->next_hop, nc_path2->next_hop,
+ sizeof(nc_path1->next_hop)) != 0)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * batadv_nc_hash_find - search for an existing nc path and return it
+ * @hash: hash table containing the nc path
+ * @data: search key
+ *
+ * Returns the nc_path if found, NULL otherwise.
+ */
+static struct batadv_nc_path *
+batadv_nc_hash_find(struct batadv_hashtable *hash,
+ void *data)
+{
+ struct hlist_head *head;
+ struct batadv_nc_path *nc_path, *nc_path_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_nc_hash_choose(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+ if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
+ continue;
+
+ if (!atomic_inc_not_zero(&nc_path->refcount))
+ continue;
+
+ nc_path_tmp = nc_path;
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_path_tmp;
+}
+
+/**
+ * batadv_nc_send_packet - send non-coded packet and free nc_packet struct
+ * @nc_packet: the nc packet to send
+ */
+static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
+{
+ batadv_send_skb_packet(nc_packet->skb,
+ nc_packet->neigh_node->if_incoming,
+ nc_packet->nc_path->next_hop);
+ nc_packet->skb = NULL;
+ batadv_nc_packet_free(nc_packet);
+}
+
+/**
+ * batadv_nc_sniffed_purge - Checks timestamp of given sniffed nc_packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given sniffed (overheard) nc_packet has hit its buffering
+ * timeout. If so, the packet is no longer kept and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false as soon as the entry in the fifo queue has not been timed out
+ * yet and true otherwise.
+ */
+static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path,
+ struct batadv_nc_packet *nc_packet)
+{
+ unsigned long timeout = bat_priv->nc.max_buffer_time;
+ bool res = false;
+
+ /* Packets are added to tail, so the remaining packets did not time
+ * out and we can stop processing the current queue
+ */
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+ !batadv_has_timed_out(nc_packet->timestamp, timeout))
+ goto out;
+
+ /* purge nc packet */
+ list_del(&nc_packet->list);
+ batadv_nc_packet_free(nc_packet);
+
+ res = true;
+
+out:
+ return res;
+}
+
+/**
+ * batadv_nc_fwd_flush - Checks the timestamp of the given nc packet.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @nc_path: the nc path the packet belongs to
+ * @nc_packet: the nc packet to be checked
+ *
+ * Checks whether the given nc packet has hit its forward timeout. If so, the
+ * packet is no longer delayed, immediately sent and the entry deleted from the
+ * queue. Has to be called with the appropriate locks.
+ *
+ * Returns false as soon as the entry in the fifo queue has not been timed out
+ * yet and true otherwise.
+ */
+static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
+ struct batadv_nc_path *nc_path,
+ struct batadv_nc_packet *nc_packet)
+{
+ unsigned long timeout = bat_priv->nc.max_fwd_delay;
+
+ /* Packets are added to tail, so the remaining packets did not time
+ * out and we can stop processing the current queue
+ */
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
+ !batadv_has_timed_out(nc_packet->timestamp, timeout))
+ return false;
+
+ /* Send packet */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+ nc_packet->skb->len + ETH_HLEN);
+ list_del(&nc_packet->list);
+ batadv_nc_send_packet(nc_packet);
+
+ return true;
+}
+
+/**
+ * batadv_nc_process_nc_paths - traverse given nc packet pool and free timed out
+ * nc packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: to be processed hash table
+ * @process_fn: Function called to process given nc packet. Should return true
+ * to encourage this function to proceed with the next packet.
+ * Otherwise the rest of the current queue is skipped.
+ */
+static void
+batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ bool (*process_fn)(struct batadv_priv *,
+ struct batadv_nc_path *,
+ struct batadv_nc_packet *))
+{
+ struct hlist_head *head;
+ struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+ struct batadv_nc_path *nc_path;
+ bool ret;
+ int i;
+
+ if (!hash)
+ return;
+
+ /* Loop hash table bins */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ /* Loop coding paths */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
+ /* Loop packets */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+ &nc_path->packet_list, list) {
+ ret = process_fn(bat_priv, nc_path, nc_packet);
+ if (!ret)
+ break;
+ }
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * batadv_nc_worker - periodic task for house keeping related to network coding
+ * @work: kernel work struct
+ */
+static void batadv_nc_worker(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_nc *priv_nc;
+ struct batadv_priv *bat_priv;
+ unsigned long timeout;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
+ bat_priv = container_of(priv_nc, struct batadv_priv, nc);
+
+ batadv_nc_purge_orig_hash(bat_priv);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash,
+ batadv_nc_to_purge_nc_path_coding);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash,
+ batadv_nc_to_purge_nc_path_decoding);
+
+ timeout = bat_priv->nc.max_fwd_delay;
+
+ if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) {
+ batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash,
+ batadv_nc_fwd_flush);
+ bat_priv->nc.timestamp_fwd_flush = jiffies;
+ }
+
+ if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge,
+ bat_priv->nc.max_buffer_time)) {
+ batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash,
+ batadv_nc_sniffed_purge);
+ bat_priv->nc.timestamp_sniffed_purge = jiffies;
+ }
+
+ /* Schedule a new check */
+ batadv_nc_start_timer(bat_priv);
+}
+
+/**
+ * batadv_can_nc_with_orig - checks whether the given orig node is suitable for
+ * coding or not
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: neighboring orig node which may be used as nc candidate
+ * @ogm_packet: incoming ogm packet also used for the checks
+ *
+ * Returns true if:
+ * 1) The OGM must have the most recent sequence number.
+ * 2) The TTL must be decremented by one and only one.
+ * 3) The OGM must be received from the first hop from orig_node.
+ * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq.
+ */
+static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_ogm_packet *ogm_packet)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ uint32_t last_real_seqno;
+ uint8_t last_ttl;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT);
+ if (!orig_ifinfo)
+ return false;
+
+ last_ttl = orig_ifinfo->last_ttl;
+ last_real_seqno = orig_ifinfo->last_real_seqno;
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+
+ if (last_real_seqno != ntohl(ogm_packet->seqno))
+ return false;
+ if (last_ttl != ogm_packet->ttl + 1)
+ return false;
+ if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender))
+ return false;
+ if (ogm_packet->tq < bat_priv->nc.min_tq)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_nc_find_nc_node - search for an existing nc node and return it
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @in_coding: traverse incoming or outgoing network coding list
+ *
+ * Returns the nc_node if found, NULL otherwise.
+ */
+static struct batadv_nc_node
+*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
+{
+ struct batadv_nc_node *nc_node, *nc_node_out = NULL;
+ struct list_head *list;
+
+ if (in_coding)
+ list = &orig_neigh_node->in_coding_list;
+ else
+ list = &orig_neigh_node->out_coding_list;
+
+ /* Traverse list of nc_nodes to orig_node */
+ rcu_read_lock();
+ list_for_each_entry_rcu(nc_node, list, list) {
+ if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
+ continue;
+
+ if (!atomic_inc_not_zero(&nc_node->refcount))
+ continue;
+
+ /* Found a match */
+ nc_node_out = nc_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_node_out;
+}
+
+/**
+ * batadv_nc_get_nc_node - retrieves an nc node or creates the entry if it was
+ * not found
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @in_coding: traverse incoming or outgoing network coding list
+ *
+ * Returns the nc_node if found or created, NULL in case of an error.
+ */
+static struct batadv_nc_node
+*batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ bool in_coding)
+{
+ struct batadv_nc_node *nc_node;
+ spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
+ struct list_head *list;
+
+ /* Check if nc_node is already added */
+ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding);
+
+ /* Node found */
+ if (nc_node)
+ return nc_node;
+
+ nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC);
+ if (!nc_node)
+ return NULL;
+
+ if (!atomic_inc_not_zero(&orig_neigh_node->refcount))
+ goto free;
+
+ /* Initialize nc_node */
+ INIT_LIST_HEAD(&nc_node->list);
+ ether_addr_copy(nc_node->addr, orig_node->orig);
+ nc_node->orig_node = orig_neigh_node;
+ atomic_set(&nc_node->refcount, 2);
+
+ /* Select ingoing or outgoing coding node */
+ if (in_coding) {
+ lock = &orig_neigh_node->in_coding_list_lock;
+ list = &orig_neigh_node->in_coding_list;
+ } else {
+ lock = &orig_neigh_node->out_coding_list_lock;
+ list = &orig_neigh_node->out_coding_list;
+ }
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n",
+ nc_node->addr, nc_node->orig_node->orig);
+
+ /* Add nc_node to orig_node */
+ spin_lock_bh(lock);
+ list_add_tail_rcu(&nc_node->list, list);
+ spin_unlock_bh(lock);
+
+ return nc_node;
+
+free:
+ kfree(nc_node);
+ return NULL;
+}
+
+/**
+ * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs
+ * (best called on incoming OGMs)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node originating the ogm packet
+ * @orig_neigh_node: neighboring orig node from which we received the ogm packet
+ * (can be equal to orig_node)
+ * @ogm_packet: incoming ogm packet
+ * @is_single_hop_neigh: orig_node is a single hop neighbor
+ */
+void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh)
+{
+ struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* check if orig node is network coding enabled */
+ if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC))
+ goto out;
+
+ /* accept ogms from 'good' neighbors and single hop neighbors */
+ if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) &&
+ !is_single_hop_neigh)
+ goto out;
+
+ /* Add orig_node as in_nc_node on hop */
+ in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node,
+ orig_neigh_node, true);
+ if (!in_nc_node)
+ goto out;
+
+ in_nc_node->last_seen = jiffies;
+
+ /* Add hop as out_nc_node on orig_node */
+ out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node,
+ orig_node, false);
+ if (!out_nc_node)
+ goto out;
+
+ out_nc_node->last_seen = jiffies;
+
+out:
+ if (in_nc_node)
+ batadv_nc_node_free_ref(in_nc_node);
+ if (out_nc_node)
+ batadv_nc_node_free_ref(out_nc_node);
+}
+
+/**
+ * batadv_nc_get_path - get existing nc_path or allocate a new one
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: hash table containing the nc path
+ * @src: ethernet source address - first half of the nc path search key
+ * @dst: ethernet destination address - second half of the nc path search key
+ *
+ * Returns pointer to nc_path if the path was found or created, returns NULL
+ * on error.
+ */
+static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ uint8_t *src,
+ uint8_t *dst)
+{
+ int hash_added;
+ struct batadv_nc_path *nc_path, nc_path_key;
+
+ batadv_nc_hash_key_gen(&nc_path_key, src, dst);
+
+ /* Search for existing nc_path */
+ nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key);
+
+ if (nc_path) {
+ /* Set timestamp to delay removal of nc_path */
+ nc_path->last_valid = jiffies;
+ return nc_path;
+ }
+
+ /* No existing nc_path was found; create a new */
+ nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC);
+
+ if (!nc_path)
+ return NULL;
+
+ /* Initialize nc_path */
+ INIT_LIST_HEAD(&nc_path->packet_list);
+ spin_lock_init(&nc_path->packet_list_lock);
+ atomic_set(&nc_path->refcount, 2);
+ nc_path->last_valid = jiffies;
+ ether_addr_copy(nc_path->next_hop, dst);
+ ether_addr_copy(nc_path->prev_hop, src);
+
+ batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n",
+ nc_path->prev_hop,
+ nc_path->next_hop);
+
+ /* Add nc_path to hash table */
+ hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
+ batadv_nc_hash_choose, &nc_path_key,
+ &nc_path->hash_entry);
+
+ if (hash_added < 0) {
+ kfree(nc_path);
+ return NULL;
+ }
+
+ return nc_path;
+}
+
+/**
+ * batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair
+ * selection of a receiver with slightly lower TQ than the other
+ * @tq: to be weighted tq value
+ */
+static uint8_t batadv_nc_random_weight_tq(uint8_t tq)
+{
+ uint8_t rand_val, rand_tq;
+
+ get_random_bytes(&rand_val, sizeof(rand_val));
+
+ /* randomize the estimated packet loss (max TQ - estimated TQ) */
+ rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq);
+
+ /* normalize the randomized packet loss */
+ rand_tq /= BATADV_TQ_MAX_VALUE;
+
+ /* convert to (randomized) estimated tq again */
+ return BATADV_TQ_MAX_VALUE - rand_tq;
+}
+
+/**
+ * batadv_nc_memxor - XOR destination with source
+ * @dst: byte array to XOR into
+ * @src: byte array to XOR from
+ * @len: length of destination array
+ */
+static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; ++i)
+ dst[i] ^= src[i];
+}
+
+/**
+ * batadv_nc_code_packets - code a received unicast_packet with an nc packet
+ * into a coded_packet and send it
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @ethhdr: pointer to the ethernet header inside the skb
+ * @nc_packet: structure containing the packet to the skb can be coded with
+ * @neigh_node: next hop to forward packet to
+ *
+ * Returns true if both packets are consumed, false otherwise.
+ */
+static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct ethhdr *ethhdr,
+ struct batadv_nc_packet *nc_packet,
+ struct batadv_neigh_node *neigh_node)
+{
+ uint8_t tq_weighted_neigh, tq_weighted_coding, tq_tmp;
+ struct sk_buff *skb_dest, *skb_src;
+ struct batadv_unicast_packet *packet1;
+ struct batadv_unicast_packet *packet2;
+ struct batadv_coded_packet *coded_packet;
+ struct batadv_neigh_node *neigh_tmp, *router_neigh;
+ struct batadv_neigh_node *router_coding = NULL;
+ struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
+ uint8_t *first_source, *first_dest, *second_source, *second_dest;
+ __be32 packet_id1, packet_id2;
+ size_t count;
+ bool res = false;
+ int coding_len;
+ int unicast_size = sizeof(*packet1);
+ int coded_size = sizeof(*coded_packet);
+ int header_add = coded_size - unicast_size;
+
+ /* TODO: do we need to consider the outgoing interface for
+ * coded packets?
+ */
+ router_neigh = batadv_orig_router_get(neigh_node->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router_neigh)
+ goto out;
+
+ router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh,
+ BATADV_IF_DEFAULT);
+ if (!router_neigh_ifinfo)
+ goto out;
+
+ neigh_tmp = nc_packet->neigh_node;
+ router_coding = batadv_orig_router_get(neigh_tmp->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router_coding)
+ goto out;
+
+ router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding,
+ BATADV_IF_DEFAULT);
+ if (!router_coding_ifinfo)
+ goto out;
+
+ tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg;
+ tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp);
+ tq_tmp = router_coding_ifinfo->bat_iv.tq_avg;
+ tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp);
+
+ /* Select one destination for the MAC-header dst-field based on
+ * weighted TQ-values.
+ */
+ if (tq_weighted_neigh >= tq_weighted_coding) {
+ /* Destination from nc_packet is selected for MAC-header */
+ first_dest = nc_packet->nc_path->next_hop;
+ first_source = nc_packet->nc_path->prev_hop;
+ second_dest = neigh_node->addr;
+ second_source = ethhdr->h_source;
+ packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
+ packet2 = (struct batadv_unicast_packet *)skb->data;
+ packet_id1 = nc_packet->packet_id;
+ packet_id2 = batadv_skb_crc32(skb,
+ skb->data + sizeof(*packet2));
+ } else {
+ /* Destination for skb is selected for MAC-header */
+ first_dest = neigh_node->addr;
+ first_source = ethhdr->h_source;
+ second_dest = nc_packet->nc_path->next_hop;
+ second_source = nc_packet->nc_path->prev_hop;
+ packet1 = (struct batadv_unicast_packet *)skb->data;
+ packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
+ packet_id1 = batadv_skb_crc32(skb,
+ skb->data + sizeof(*packet1));
+ packet_id2 = nc_packet->packet_id;
+ }
+
+ /* Instead of zero padding the smallest data buffer, we
+ * code into the largest.
+ */
+ if (skb->len <= nc_packet->skb->len) {
+ skb_dest = nc_packet->skb;
+ skb_src = skb;
+ } else {
+ skb_dest = skb;
+ skb_src = nc_packet->skb;
+ }
+
+ /* coding_len is used when decoding the packet shorter packet */
+ coding_len = skb_src->len - unicast_size;
+
+ if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0)
+ goto out;
+
+ skb_push(skb_dest, header_add);
+
+ coded_packet = (struct batadv_coded_packet *)skb_dest->data;
+ skb_reset_mac_header(skb_dest);
+
+ coded_packet->packet_type = BATADV_CODED;
+ coded_packet->version = BATADV_COMPAT_VERSION;
+ coded_packet->ttl = packet1->ttl;
+
+ /* Info about first unicast packet */
+ ether_addr_copy(coded_packet->first_source, first_source);
+ ether_addr_copy(coded_packet->first_orig_dest, packet1->dest);
+ coded_packet->first_crc = packet_id1;
+ coded_packet->first_ttvn = packet1->ttvn;
+
+ /* Info about second unicast packet */
+ ether_addr_copy(coded_packet->second_dest, second_dest);
+ ether_addr_copy(coded_packet->second_source, second_source);
+ ether_addr_copy(coded_packet->second_orig_dest, packet2->dest);
+ coded_packet->second_crc = packet_id2;
+ coded_packet->second_ttl = packet2->ttl;
+ coded_packet->second_ttvn = packet2->ttvn;
+ coded_packet->coded_len = htons(coding_len);
+
+ /* This is where the magic happens: Code skb_src into skb_dest */
+ batadv_nc_memxor(skb_dest->data + coded_size,
+ skb_src->data + unicast_size, coding_len);
+
+ /* Update counters accordingly */
+ if (BATADV_SKB_CB(skb_src)->decoded &&
+ BATADV_SKB_CB(skb_dest)->decoded) {
+ /* Both packets are recoded */
+ count = skb_src->len + ETH_HLEN;
+ count += skb_dest->len + ETH_HLEN;
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count);
+ } else if (!BATADV_SKB_CB(skb_src)->decoded &&
+ !BATADV_SKB_CB(skb_dest)->decoded) {
+ /* Both packets are newly coded */
+ count = skb_src->len + ETH_HLEN;
+ count += skb_dest->len + ETH_HLEN;
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count);
+ } else if (BATADV_SKB_CB(skb_src)->decoded &&
+ !BATADV_SKB_CB(skb_dest)->decoded) {
+ /* skb_src recoded and skb_dest is newly coded */
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+ skb_src->len + ETH_HLEN);
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+ skb_dest->len + ETH_HLEN);
+ } else if (!BATADV_SKB_CB(skb_src)->decoded &&
+ BATADV_SKB_CB(skb_dest)->decoded) {
+ /* skb_src is newly coded and skb_dest is recoded */
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
+ skb_src->len + ETH_HLEN);
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
+ skb_dest->len + ETH_HLEN);
+ }
+
+ /* skb_src is now coded into skb_dest, so free it */
+ kfree_skb(skb_src);
+
+ /* avoid duplicate free of skb from nc_packet */
+ nc_packet->skb = NULL;
+ batadv_nc_packet_free(nc_packet);
+
+ /* Send the coded packet and return true */
+ batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest);
+ res = true;
+out:
+ if (router_neigh)
+ batadv_neigh_node_free_ref(router_neigh);
+ if (router_coding)
+ batadv_neigh_node_free_ref(router_coding);
+ if (router_neigh_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo);
+ if (router_coding_ifinfo)
+ batadv_neigh_ifinfo_free_ref(router_coding_ifinfo);
+ return res;
+}
+
+/**
+ * batadv_nc_skb_coding_possible - true if a decoded skb is available at dst.
+ * @skb: data skb to forward
+ * @dst: destination mac address of the other skb to code with
+ * @src: source mac address of skb
+ *
+ * Whenever we network code a packet we have to check whether we received it in
+ * a network coded form. If so, we may not be able to use it for coding because
+ * some neighbors may also have received (overheard) the packet in the network
+ * coded form without being able to decode it. It is hard to know which of the
+ * neighboring nodes was able to decode the packet, therefore we can only
+ * re-code the packet if the source of the previous encoded packet is involved.
+ * Since the source encoded the packet we can be certain it has all necessary
+ * decode information.
+ *
+ * Returns true if coding of a decoded packet is allowed.
+ */
+static bool batadv_nc_skb_coding_possible(struct sk_buff *skb,
+ uint8_t *dst, uint8_t *src)
+{
+ if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
+ return false;
+ return true;
+}
+
+/**
+ * batadv_nc_path_search - Find the coding path matching in_nc_node and
+ * out_nc_node to retrieve a buffered packet that can be used for coding.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ * @out_nc_node: pointer to skb source's neighbor nc node
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ *
+ * Returns true if coding of a decoded skb is allowed.
+ */
+static struct batadv_nc_packet *
+batadv_nc_path_search(struct batadv_priv *bat_priv,
+ struct batadv_nc_node *in_nc_node,
+ struct batadv_nc_node *out_nc_node,
+ struct sk_buff *skb,
+ uint8_t *eth_dst)
+{
+ struct batadv_nc_path *nc_path, nc_path_key;
+ struct batadv_nc_packet *nc_packet_out = NULL;
+ struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
+ struct batadv_hashtable *hash = bat_priv->nc.coding_hash;
+ int idx;
+
+ if (!hash)
+ return NULL;
+
+ /* Create almost path key */
+ batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr,
+ out_nc_node->addr);
+ idx = batadv_nc_hash_choose(&nc_path_key, hash->size);
+
+ /* Check for coding opportunities in this nc_path */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) {
+ if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr))
+ continue;
+
+ if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr))
+ continue;
+
+ spin_lock_bh(&nc_path->packet_list_lock);
+ if (list_empty(&nc_path->packet_list)) {
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ continue;
+ }
+
+ list_for_each_entry_safe(nc_packet, nc_packet_tmp,
+ &nc_path->packet_list, list) {
+ if (!batadv_nc_skb_coding_possible(nc_packet->skb,
+ eth_dst,
+ in_nc_node->addr))
+ continue;
+
+ /* Coding opportunity is found! */
+ list_del(&nc_packet->list);
+ nc_packet_out = nc_packet;
+ break;
+ }
+
+ spin_unlock_bh(&nc_path->packet_list_lock);
+ break;
+ }
+ rcu_read_unlock();
+
+ return nc_packet_out;
+}
+
+/**
+ * batadv_nc_skb_src_search - Loops through the list of neighoring nodes of the
+ * skb's sender (may be equal to the originator).
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to forward
+ * @eth_dst: next hop mac address of skb
+ * @eth_src: source mac address of skb
+ * @in_nc_node: pointer to skb next hop's neighbor nc node
+ *
+ * Returns an nc packet if a suitable coding packet was found, NULL otherwise.
+ */
+static struct batadv_nc_packet *
+batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ uint8_t *eth_dst,
+ uint8_t *eth_src,
+ struct batadv_nc_node *in_nc_node)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_nc_node *out_nc_node;
+ struct batadv_nc_packet *nc_packet = NULL;
+
+ orig_node = batadv_orig_hash_find(bat_priv, eth_src);
+ if (!orig_node)
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(out_nc_node,
+ &orig_node->out_coding_list, list) {
+ /* Check if the skb is decoded and if recoding is possible */
+ if (!batadv_nc_skb_coding_possible(skb,
+ out_nc_node->addr, eth_src))
+ continue;
+
+ /* Search for an opportunity in this nc_path */
+ nc_packet = batadv_nc_path_search(bat_priv, in_nc_node,
+ out_nc_node, skb, eth_dst);
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ batadv_orig_node_free_ref(orig_node);
+ return nc_packet;
+}
+
+/**
+ * batadv_nc_skb_store_before_coding - set the ethernet src and dst of the
+ * unicast skb before it is stored for use in later decoding
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to store
+ * @eth_dst_new: new destination mac address of skb
+ */
+static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ uint8_t *eth_dst_new)
+{
+ struct ethhdr *ethhdr;
+
+ /* Copy skb header to change the mac header */
+ skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Set the mac header as if we actually sent the packet uncoded */
+ ethhdr = eth_hdr(skb);
+ ether_addr_copy(ethhdr->h_source, ethhdr->h_dest);
+ ether_addr_copy(ethhdr->h_dest, eth_dst_new);
+
+ /* Set data pointer to MAC header to mimic packets from our tx path */
+ skb_push(skb, ETH_HLEN);
+
+ /* Add the packet to the decoding packet pool */
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+
+ /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
+ * our ref
+ */
+ kfree_skb(skb);
+}
+
+/**
+ * batadv_nc_skb_dst_search - Loops through list of neighboring nodes to dst.
+ * @skb: data skb to forward
+ * @neigh_node: next hop to forward packet to
+ * @ethhdr: pointer to the ethernet header inside the skb
+ *
+ * Loops through list of neighboring nodes the next hop has a good connection to
+ * (receives OGMs with a sufficient quality). We need to find a neighbor of our
+ * next hop that potentially sent a packet which our next hop also received
+ * (overheard) and has stored for later decoding.
+ *
+ * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ */
+static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node,
+ struct ethhdr *ethhdr)
+{
+ struct net_device *netdev = neigh_node->if_incoming->soft_iface;
+ struct batadv_priv *bat_priv = netdev_priv(netdev);
+ struct batadv_orig_node *orig_node = neigh_node->orig_node;
+ struct batadv_nc_node *nc_node;
+ struct batadv_nc_packet *nc_packet = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) {
+ /* Search for coding opportunity with this in_nc_node */
+ nc_packet = batadv_nc_skb_src_search(bat_priv, skb,
+ neigh_node->addr,
+ ethhdr->h_source, nc_node);
+
+ /* Opportunity was found, so stop searching */
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nc_packet)
+ return false;
+
+ /* Save packets for later decoding */
+ batadv_nc_skb_store_before_coding(bat_priv, skb,
+ neigh_node->addr);
+ batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb,
+ nc_packet->neigh_node->addr);
+
+ /* Code and send packets */
+ if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet,
+ neigh_node))
+ return true;
+
+ /* out of mem ? Coding failed - we have to free the buffered packet
+ * to avoid memleaks. The skb passed as argument will be dealt with
+ * by the calling function.
+ */
+ batadv_nc_send_packet(nc_packet);
+ return false;
+}
+
+/**
+ * batadv_nc_skb_add_to_path - buffer skb for later encoding / decoding
+ * @skb: skb to add to path
+ * @nc_path: path to add skb to
+ * @neigh_node: next hop to forward packet to
+ * @packet_id: checksum to identify packet
+ *
+ * Returns true if the packet was buffered or false in case of an error.
+ */
+static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
+ struct batadv_nc_path *nc_path,
+ struct batadv_neigh_node *neigh_node,
+ __be32 packet_id)
+{
+ struct batadv_nc_packet *nc_packet;
+
+ nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC);
+ if (!nc_packet)
+ return false;
+
+ /* Initialize nc_packet */
+ nc_packet->timestamp = jiffies;
+ nc_packet->packet_id = packet_id;
+ nc_packet->skb = skb;
+ nc_packet->neigh_node = neigh_node;
+ nc_packet->nc_path = nc_path;
+
+ /* Add coding packet to list */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_add_tail(&nc_packet->list, &nc_path->packet_list);
+ spin_unlock_bh(&nc_path->packet_list_lock);
+
+ return true;
+}
+
+/**
+ * batadv_nc_skb_forward - try to code a packet or add it to the coding packet
+ * buffer
+ * @skb: data skb to forward
+ * @neigh_node: next hop to forward packet to
+ *
+ * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ */
+bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node)
+{
+ const struct net_device *netdev = neigh_node->if_incoming->soft_iface;
+ struct batadv_priv *bat_priv = netdev_priv(netdev);
+ struct batadv_unicast_packet *packet;
+ struct batadv_nc_path *nc_path;
+ struct ethhdr *ethhdr = eth_hdr(skb);
+ __be32 packet_id;
+ u8 *payload;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* We only handle unicast packets */
+ payload = skb_network_header(skb);
+ packet = (struct batadv_unicast_packet *)payload;
+ if (packet->packet_type != BATADV_UNICAST)
+ goto out;
+
+ /* Try to find a coding opportunity and send the skb if one is found */
+ if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr))
+ return true;
+
+ /* Find or create a nc_path for this src-dst pair */
+ nc_path = batadv_nc_get_path(bat_priv,
+ bat_priv->nc.coding_hash,
+ ethhdr->h_source,
+ neigh_node->addr);
+
+ if (!nc_path)
+ goto out;
+
+ /* Add skb to nc_path */
+ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
+ if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id))
+ goto free_nc_path;
+
+ /* Packet is consumed */
+ return true;
+
+free_nc_path:
+ batadv_nc_path_free_ref(nc_path);
+out:
+ /* Packet is not consumed */
+ return false;
+}
+
+/**
+ * batadv_nc_skb_store_for_decoding - save a clone of the skb which can be used
+ * when decoding coded packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: data skb to store
+ */
+void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_unicast_packet *packet;
+ struct batadv_nc_path *nc_path;
+ struct ethhdr *ethhdr = eth_hdr(skb);
+ __be32 packet_id;
+ u8 *payload;
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ goto out;
+
+ /* Check for supported packet type */
+ payload = skb_network_header(skb);
+ packet = (struct batadv_unicast_packet *)payload;
+ if (packet->packet_type != BATADV_UNICAST)
+ goto out;
+
+ /* Find existing nc_path or create a new */
+ nc_path = batadv_nc_get_path(bat_priv,
+ bat_priv->nc.decoding_hash,
+ ethhdr->h_source,
+ ethhdr->h_dest);
+
+ if (!nc_path)
+ goto out;
+
+ /* Clone skb and adjust skb->data to point at batman header */
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto free_nc_path;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ goto free_skb;
+
+ if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN)))
+ goto free_skb;
+
+ /* Add skb to nc_path */
+ packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
+ if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id))
+ goto free_skb;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER);
+ return;
+
+free_skb:
+ kfree_skb(skb);
+free_nc_path:
+ batadv_nc_path_free_ref(nc_path);
+out:
+ return;
+}
+
+/**
+ * batadv_nc_skb_store_sniffed_unicast - check if a received unicast packet
+ * should be saved in the decoding buffer and, if so, store it there
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: unicast skb to store
+ */
+void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct ethhdr *ethhdr = eth_hdr(skb);
+
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_dest))
+ return;
+
+ /* Set data pointer to MAC header to mimic packets from our tx path */
+ skb_push(skb, ETH_HLEN);
+
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+}
+
+/**
+ * batadv_nc_skb_decode_packet - decode given skb using the decode data stored
+ * in nc_packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: unicast skb to decode
+ * @nc_packet: decode data needed to decode the skb
+ *
+ * Returns pointer to decoded unicast packet if the packet was decoded or NULL
+ * in case of an error.
+ */
+static struct batadv_unicast_packet *
+batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ struct batadv_nc_packet *nc_packet)
+{
+ const int h_size = sizeof(struct batadv_unicast_packet);
+ const int h_diff = sizeof(struct batadv_coded_packet) - h_size;
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_coded_packet coded_packet_tmp;
+ struct ethhdr *ethhdr, ethhdr_tmp;
+ uint8_t *orig_dest, ttl, ttvn;
+ unsigned int coding_len;
+ int err;
+
+ /* Save headers temporarily */
+ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
+ memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp));
+
+ if (skb_cow(skb, 0) < 0)
+ return NULL;
+
+ if (unlikely(!skb_pull_rcsum(skb, h_diff)))
+ return NULL;
+
+ /* Data points to batman header, so set mac header 14 bytes before
+ * and network to data
+ */
+ skb_set_mac_header(skb, -ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ /* Reconstruct original mac header */
+ ethhdr = eth_hdr(skb);
+ *ethhdr = ethhdr_tmp;
+
+ /* Select the correct unicast header information based on the location
+ * of our mac address in the coded_packet header
+ */
+ if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) {
+ /* If we are the second destination the packet was overheard,
+ * so the Ethernet address must be copied to h_dest and
+ * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST
+ */
+ ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest);
+ skb->pkt_type = PACKET_HOST;
+
+ orig_dest = coded_packet_tmp.second_orig_dest;
+ ttl = coded_packet_tmp.second_ttl;
+ ttvn = coded_packet_tmp.second_ttvn;
+ } else {
+ orig_dest = coded_packet_tmp.first_orig_dest;
+ ttl = coded_packet_tmp.ttl;
+ ttvn = coded_packet_tmp.first_ttvn;
+ }
+
+ coding_len = ntohs(coded_packet_tmp.coded_len);
+
+ if (coding_len > skb->len)
+ return NULL;
+
+ /* Here the magic is reversed:
+ * extract the missing packet from the received coded packet
+ */
+ batadv_nc_memxor(skb->data + h_size,
+ nc_packet->skb->data + h_size,
+ coding_len);
+
+ /* Resize decoded skb if decoded with larger packet */
+ if (nc_packet->skb->len > coding_len + h_size) {
+ err = pskb_trim_rcsum(skb, coding_len + h_size);
+ if (err)
+ return NULL;
+ }
+
+ /* Create decoded unicast packet */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->packet_type = BATADV_UNICAST;
+ unicast_packet->version = BATADV_COMPAT_VERSION;
+ unicast_packet->ttl = ttl;
+ ether_addr_copy(unicast_packet->dest, orig_dest);
+ unicast_packet->ttvn = ttvn;
+
+ batadv_nc_packet_free(nc_packet);
+ return unicast_packet;
+}
+
+/**
+ * batadv_nc_find_decoding_packet - search through buffered decoding data to
+ * find the data needed to decode the coded packet
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: pointer to the ethernet header inside the coded packet
+ * @coded: coded packet we try to find decode data for
+ *
+ * Returns pointer to nc packet if the needed data was found or NULL otherwise.
+ */
+static struct batadv_nc_packet *
+batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
+ struct ethhdr *ethhdr,
+ struct batadv_coded_packet *coded)
+{
+ struct batadv_hashtable *hash = bat_priv->nc.decoding_hash;
+ struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL;
+ struct batadv_nc_path *nc_path, nc_path_key;
+ uint8_t *dest, *source;
+ __be32 packet_id;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ /* Select the correct packet id based on the location of our mac-addr */
+ dest = ethhdr->h_source;
+ if (!batadv_is_my_mac(bat_priv, coded->second_dest)) {
+ source = coded->second_source;
+ packet_id = coded->second_crc;
+ } else {
+ source = coded->first_source;
+ packet_id = coded->first_crc;
+ }
+
+ batadv_nc_hash_key_gen(&nc_path_key, source, dest);
+ index = batadv_nc_hash_choose(&nc_path_key, hash->size);
+
+ /* Search for matching coding path */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) {
+ /* Find matching nc_packet */
+ spin_lock_bh(&nc_path->packet_list_lock);
+ list_for_each_entry(tmp_nc_packet,
+ &nc_path->packet_list, list) {
+ if (packet_id == tmp_nc_packet->packet_id) {
+ list_del(&tmp_nc_packet->list);
+
+ nc_packet = tmp_nc_packet;
+ break;
+ }
+ }
+ spin_unlock_bh(&nc_path->packet_list_lock);
+
+ if (nc_packet)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!nc_packet)
+ batadv_dbg(BATADV_DBG_NC, bat_priv,
+ "No decoding packet found for %u\n", packet_id);
+
+ return nc_packet;
+}
+
+/**
+ * batadv_nc_recv_coded_packet - try to decode coded packet and enqueue the
+ * resulting unicast packet
+ * @skb: incoming coded packet
+ * @recv_if: pointer to interface this packet was received on
+ */
+static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_coded_packet *coded_packet;
+ struct batadv_nc_packet *nc_packet;
+ struct ethhdr *ethhdr;
+ int hdr_size = sizeof(*coded_packet);
+
+ /* Check if network coding is enabled */
+ if (!atomic_read(&bat_priv->network_coding))
+ return NET_RX_DROP;
+
+ /* Make sure we can access (and remove) header */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ return NET_RX_DROP;
+
+ coded_packet = (struct batadv_coded_packet *)skb->data;
+ ethhdr = eth_hdr(skb);
+
+ /* Verify frame is destined for us */
+ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
+ !batadv_is_my_mac(bat_priv, coded_packet->second_dest))
+ return NET_RX_DROP;
+
+ /* Update stat counter */
+ if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED);
+
+ nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr,
+ coded_packet);
+ if (!nc_packet) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
+ return NET_RX_DROP;
+ }
+
+ /* Make skb's linear, because decoding accesses the entire buffer */
+ if (skb_linearize(skb) < 0)
+ goto free_nc_packet;
+
+ if (skb_linearize(nc_packet->skb) < 0)
+ goto free_nc_packet;
+
+ /* Decode the packet */
+ unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet);
+ if (!unicast_packet) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
+ goto free_nc_packet;
+ }
+
+ /* Mark packet as decoded to do correct recoding when forwarding */
+ BATADV_SKB_CB(skb)->decoded = true;
+ batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE);
+ batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES,
+ skb->len + ETH_HLEN);
+ return batadv_recv_unicast_packet(skb, recv_if);
+
+free_nc_packet:
+ batadv_nc_packet_free(nc_packet);
+ return NET_RX_DROP;
+}
+
+/**
+ * batadv_nc_mesh_free - clean up network coding memory
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1);
+ cancel_delayed_work_sync(&bat_priv->nc.work);
+
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL);
+ batadv_hash_destroy(bat_priv->nc.coding_hash);
+ batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL);
+ batadv_hash_destroy(bat_priv->nc.decoding_hash);
+}
+
+/**
+ * batadv_nc_nodes_seq_print_text - print the nc node information
+ * @seq: seq file to print on
+ * @offset: not used
+ */
+int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ struct batadv_nc_node *nc_node;
+ int i;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ /* Traverse list of originators */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ /* For each orig_node in this bin */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ /* no need to print the orig node if it does not have
+ * network coding neighbors
+ */
+ if (list_empty(&orig_node->in_coding_list) &&
+ list_empty(&orig_node->out_coding_list))
+ continue;
+
+ seq_printf(seq, "Node: %pM\n", orig_node->orig);
+
+ seq_puts(seq, " Ingoing: ");
+ /* For each in_nc_node to this orig_node */
+ list_for_each_entry_rcu(nc_node,
+ &orig_node->in_coding_list,
+ list)
+ seq_printf(seq, "%pM ",
+ nc_node->addr);
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, " Outgoing: ");
+ /* For out_nc_node to this orig_node */
+ list_for_each_entry_rcu(nc_node,
+ &orig_node->out_coding_list,
+ list)
+ seq_printf(seq, "%pM ",
+ nc_node->addr);
+ seq_puts(seq, "\n\n");
+ }
+ rcu_read_unlock();
+ }
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+/**
+ * batadv_nc_init_debugfs - create nc folder and related files in debugfs
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+{
+ struct dentry *nc_dir, *file;
+
+ nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir);
+ if (!nc_dir)
+ goto out;
+
+ file = debugfs_create_u8("min_tq", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.min_tq);
+ if (!file)
+ goto out;
+
+ file = debugfs_create_u32("max_fwd_delay", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.max_fwd_delay);
+ if (!file)
+ goto out;
+
+ file = debugfs_create_u32("max_buffer_time", S_IRUGO | S_IWUSR, nc_dir,
+ &bat_priv->nc.max_buffer_time);
+ if (!file)
+ goto out;
+
+ return 0;
+
+out:
+ return -ENOMEM;
+}
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
new file mode 100644
index 000000000..358c0d686
--- /dev/null
+++ b/net/batman-adv/network-coding.h
@@ -0,0 +1,123 @@
+/* Copyright (C) 2012-2014 B.A.T.M.A.N. contributors:
+ *
+ * Martin Hundebøll, Jeppe Ledet-Pedersen
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_
+#define _NET_BATMAN_ADV_NETWORK_CODING_H_
+
+#ifdef CONFIG_BATMAN_ADV_NC
+
+void batadv_nc_status_update(struct net_device *net_dev);
+int batadv_nc_init(void);
+int batadv_nc_mesh_init(struct batadv_priv *bat_priv);
+void batadv_nc_mesh_free(struct batadv_priv *bat_priv);
+void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh);
+void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *));
+void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv);
+void batadv_nc_init_orig(struct batadv_orig_node *orig_node);
+bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node);
+void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
+
+#else /* ifdef CONFIG_BATMAN_ADV_NC */
+
+static inline void batadv_nc_status_update(struct net_device *net_dev)
+{
+}
+
+static inline int batadv_nc_init(void)
+{
+ return 0;
+}
+
+static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
+{
+}
+
+static inline void
+batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_orig_node *orig_neigh_node,
+ struct batadv_ogm_packet *ogm_packet,
+ int is_single_hop_neigh)
+{
+}
+
+static inline void
+batadv_nc_purge_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ bool (*to_purge)(struct batadv_priv *,
+ struct batadv_nc_node *))
+{
+}
+
+static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
+{
+}
+
+static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
+{
+}
+
+static inline bool batadv_nc_skb_forward(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node)
+{
+ return false;
+}
+
+static inline void
+batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+}
+
+static inline void
+batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+}
+
+static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq,
+ void *offset)
+{
+ return 0;
+}
+
+static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+#endif /* ifdef CONFIG_BATMAN_ADV_NC */
+
+#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
new file mode 100644
index 000000000..90e805aba
--- /dev/null
+++ b/net/batman-adv/originator.c
@@ -0,0 +1,1177 @@
+/* Copyright (C) 2009-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "distributed-arp-table.h"
+#include "originator.h"
+#include "hash.h"
+#include "translation-table.h"
+#include "routing.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "soft-interface.h"
+#include "bridge_loop_avoidance.h"
+#include "network-coding.h"
+#include "fragmentation.h"
+#include "multicast.h"
+
+/* hash class keys */
+static struct lock_class_key batadv_orig_hash_lock_class_key;
+
+static void batadv_purge_orig(struct work_struct *work);
+
+/* returns 1 if they are the same originator */
+int batadv_compare_orig(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_orig_node,
+ hash_entry);
+
+ return batadv_compare_eth(data1, data2);
+}
+
+/**
+ * batadv_orig_node_vlan_get - get an orig_node_vlan object
+ * @orig_node: the originator serving the VLAN
+ * @vid: the VLAN identifier
+ *
+ * Returns the vlan object identified by vid and belonging to orig_node or NULL
+ * if it does not exist.
+ */
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_orig_node_vlan *vlan = NULL, *tmp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) {
+ if (tmp->vid != vid)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp->refcount))
+ continue;
+
+ vlan = tmp;
+
+ break;
+ }
+ rcu_read_unlock();
+
+ return vlan;
+}
+
+/**
+ * batadv_orig_node_vlan_new - search and possibly create an orig_node_vlan
+ * object
+ * @orig_node: the originator serving the VLAN
+ * @vid: the VLAN identifier
+ *
+ * Returns NULL in case of failure or the vlan object identified by vid and
+ * belonging to orig_node otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_orig_node_vlan *vlan;
+
+ spin_lock_bh(&orig_node->vlan_list_lock);
+
+ /* first look if an object for this vid already exists */
+ vlan = batadv_orig_node_vlan_get(orig_node, vid);
+ if (vlan)
+ goto out;
+
+ vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
+ if (!vlan)
+ goto out;
+
+ atomic_set(&vlan->refcount, 2);
+ vlan->vid = vid;
+
+ list_add_rcu(&vlan->list, &orig_node->vlan_list);
+
+out:
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+
+ return vlan;
+}
+
+/**
+ * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free
+ * the originator-vlan object
+ * @orig_vlan: the originator-vlan object to release
+ */
+void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan)
+{
+ if (atomic_dec_and_test(&orig_vlan->refcount))
+ kfree_rcu(orig_vlan, rcu);
+}
+
+int batadv_originator_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->orig_hash)
+ return 0;
+
+ bat_priv->orig_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->orig_hash)
+ goto err;
+
+ batadv_hash_set_lock_class(bat_priv->orig_hash,
+ &batadv_orig_hash_lock_class_key);
+
+ INIT_DELAYED_WORK(&bat_priv->orig_work, batadv_purge_orig);
+ queue_delayed_work(batadv_event_workqueue,
+ &bat_priv->orig_work,
+ msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
+
+ return 0;
+
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object
+ * @rcu: rcu pointer of the neigh_ifinfo object
+ */
+static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu);
+
+ if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing);
+
+ kfree(neigh_ifinfo);
+}
+
+/**
+ * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free
+ * the neigh_ifinfo (without rcu callback)
+ * @neigh_ifinfo: the neigh_ifinfo object to release
+ */
+static void
+batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo)
+{
+ if (atomic_dec_and_test(&neigh_ifinfo->refcount))
+ batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu);
+}
+
+/**
+ * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free
+ * the neigh_ifinfo
+ * @neigh_ifinfo: the neigh_ifinfo object to release
+ */
+void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
+{
+ if (atomic_dec_and_test(&neigh_ifinfo->refcount))
+ call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu);
+}
+
+/**
+ * batadv_neigh_node_free_rcu - free the neigh_node
+ * @rcu: rcu pointer of the neigh_node
+ */
+static void batadv_neigh_node_free_rcu(struct rcu_head *rcu)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ neigh_node = container_of(rcu, struct batadv_neigh_node, rcu);
+
+ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
+ &neigh_node->ifinfo_list, list) {
+ batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo);
+ }
+ batadv_hardif_free_ref_now(neigh_node->if_incoming);
+
+ kfree(neigh_node);
+}
+
+/**
+ * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter
+ * and possibly free it (without rcu callback)
+ * @neigh_node: neigh neighbor to free
+ */
+static void
+batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node)
+{
+ if (atomic_dec_and_test(&neigh_node->refcount))
+ batadv_neigh_node_free_rcu(&neigh_node->rcu);
+}
+
+/**
+ * batadv_neigh_node_free_ref - decrement the neighbors refcounter
+ * and possibly free it
+ * @neigh_node: neigh neighbor to free
+ */
+void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
+{
+ if (atomic_dec_and_test(&neigh_node->refcount))
+ call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu);
+}
+
+/**
+ * batadv_orig_node_get_router - router to the originator depending on iface
+ * @orig_node: the orig node for the router
+ * @if_outgoing: the interface where the payload packet has been received or
+ * the OGM should be sent to
+ *
+ * Returns the neighbor which should be router for this orig_node/iface.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_neigh_node *
+batadv_orig_router_get(struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *router = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) {
+ if (orig_ifinfo->if_outgoing != if_outgoing)
+ continue;
+
+ router = rcu_dereference(orig_ifinfo->router);
+ break;
+ }
+
+ if (router && !atomic_inc_not_zero(&router->refcount))
+ router = NULL;
+
+ rcu_read_unlock();
+ return router;
+}
+
+/**
+ * batadv_orig_ifinfo_get - find the ifinfo from an orig_node
+ * @orig_node: the orig node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Returns the requested orig_ifinfo or NULL if not found.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *tmp, *orig_ifinfo = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp, &orig_node->ifinfo_list,
+ list) {
+ if (tmp->if_outgoing != if_outgoing)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp->refcount))
+ continue;
+
+ orig_ifinfo = tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_ifinfo;
+}
+
+/**
+ * batadv_orig_ifinfo_new - search and possibly create an orig_ifinfo object
+ * @orig_node: the orig node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing
+ * interface otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ unsigned long reset_time;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+ if (orig_ifinfo)
+ goto out;
+
+ orig_ifinfo = kzalloc(sizeof(*orig_ifinfo), GFP_ATOMIC);
+ if (!orig_ifinfo)
+ goto out;
+
+ if (if_outgoing != BATADV_IF_DEFAULT &&
+ !atomic_inc_not_zero(&if_outgoing->refcount)) {
+ kfree(orig_ifinfo);
+ orig_ifinfo = NULL;
+ goto out;
+ }
+
+ reset_time = jiffies - 1;
+ reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
+ orig_ifinfo->batman_seqno_reset = reset_time;
+ orig_ifinfo->if_outgoing = if_outgoing;
+ INIT_HLIST_NODE(&orig_ifinfo->list);
+ atomic_set(&orig_ifinfo->refcount, 2);
+ hlist_add_head_rcu(&orig_ifinfo->list,
+ &orig_node->ifinfo_list);
+out:
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ return orig_ifinfo;
+}
+
+/**
+ * batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node
+ * @neigh_node: the neigh node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * The object is returned with refcounter increased by 1.
+ *
+ * Returns the requested neigh_ifinfo or NULL if not found
+ */
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL,
+ *tmp_neigh_ifinfo;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_ifinfo, &neigh->ifinfo_list,
+ list) {
+ if (tmp_neigh_ifinfo->if_outgoing != if_outgoing)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount))
+ continue;
+
+ neigh_ifinfo = tmp_neigh_ifinfo;
+ break;
+ }
+ rcu_read_unlock();
+
+ return neigh_ifinfo;
+}
+
+/**
+ * batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object
+ * @neigh_node: the neigh node to be queried
+ * @if_outgoing: the interface for which the ifinfo should be acquired
+ *
+ * Returns NULL in case of failure or the neigh_ifinfo object for the
+ * if_outgoing interface otherwise. The object is created and added to the list
+ * if it does not exist.
+ *
+ * The object is returned with refcounter increased by 1.
+ */
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+
+ spin_lock_bh(&neigh->ifinfo_lock);
+
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh, if_outgoing);
+ if (neigh_ifinfo)
+ goto out;
+
+ neigh_ifinfo = kzalloc(sizeof(*neigh_ifinfo), GFP_ATOMIC);
+ if (!neigh_ifinfo)
+ goto out;
+
+ if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) {
+ kfree(neigh_ifinfo);
+ neigh_ifinfo = NULL;
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&neigh_ifinfo->list);
+ atomic_set(&neigh_ifinfo->refcount, 2);
+ neigh_ifinfo->if_outgoing = if_outgoing;
+
+ hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
+
+out:
+ spin_unlock_bh(&neigh->ifinfo_lock);
+
+ return neigh_ifinfo;
+}
+
+/**
+ * batadv_neigh_node_new - create and init a new neigh_node object
+ * @hard_iface: the interface where the neighbour is connected to
+ * @neigh_addr: the mac address of the neighbour interface
+ * @orig_node: originator object representing the neighbour
+ *
+ * Allocates a new neigh_node object and initialises all the generic fields.
+ * Returns the new object or NULL on failure.
+ */
+struct batadv_neigh_node *
+batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
+ const uint8_t *neigh_addr,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_node *neigh_node;
+
+ neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
+ if (!neigh_node)
+ goto out;
+
+ INIT_HLIST_NODE(&neigh_node->list);
+ INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
+ spin_lock_init(&neigh_node->ifinfo_lock);
+
+ ether_addr_copy(neigh_node->addr, neigh_addr);
+ neigh_node->if_incoming = hard_iface;
+ neigh_node->orig_node = orig_node;
+
+ /* extra reference for return */
+ atomic_set(&neigh_node->refcount, 2);
+
+out:
+ return neigh_node;
+}
+
+/**
+ * batadv_neigh_node_get - retrieve a neighbour from the list
+ * @orig_node: originator which the neighbour belongs to
+ * @hard_iface: the interface where this neighbour is connected to
+ * @addr: the address of the neighbour
+ *
+ * Looks for and possibly returns a neighbour belonging to this originator list
+ * which is connected through the provided hard interface.
+ * Returns NULL if the neighbour is not found.
+ */
+struct batadv_neigh_node *
+batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *hard_iface,
+ const uint8_t *addr)
+{
+ struct batadv_neigh_node *tmp_neigh_node, *res = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_neigh_node->addr, addr))
+ continue;
+
+ if (tmp_neigh_node->if_incoming != hard_iface)
+ continue;
+
+ if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ continue;
+
+ res = tmp_neigh_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return res;
+}
+
+/**
+ * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object
+ * @rcu: rcu pointer of the orig_ifinfo object
+ */
+static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *router;
+
+ orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu);
+
+ if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
+ batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing);
+
+ /* this is the last reference to this object */
+ router = rcu_dereference_protected(orig_ifinfo->router, true);
+ if (router)
+ batadv_neigh_node_free_ref_now(router);
+ kfree(orig_ifinfo);
+}
+
+/**
+ * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
+ * the orig_ifinfo (without rcu callback)
+ * @orig_ifinfo: the orig_ifinfo object to release
+ */
+static void
+batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo)
+{
+ if (atomic_dec_and_test(&orig_ifinfo->refcount))
+ batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu);
+}
+
+/**
+ * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
+ * the orig_ifinfo
+ * @orig_ifinfo: the orig_ifinfo object to release
+ */
+void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo)
+{
+ if (atomic_dec_and_test(&orig_ifinfo->refcount))
+ call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu);
+}
+
+static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
+
+ orig_node = container_of(rcu, struct batadv_orig_node, rcu);
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all neighbors towards this originator ... */
+ hlist_for_each_entry_safe(neigh_node, node_tmp,
+ &orig_node->neigh_list, list) {
+ hlist_del_rcu(&neigh_node->list);
+ batadv_neigh_node_free_ref_now(neigh_node);
+ }
+
+ hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
+ &orig_node->ifinfo_list, list) {
+ hlist_del_rcu(&orig_ifinfo->list);
+ batadv_orig_ifinfo_free_ref_now(orig_ifinfo);
+ }
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ batadv_mcast_purge_orig(orig_node);
+
+ /* Free nc_nodes */
+ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
+
+ batadv_frag_purge_orig(orig_node, NULL);
+
+ if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
+ orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
+
+ kfree(orig_node->tt_buff);
+ kfree(orig_node);
+}
+
+/**
+ * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly
+ * schedule an rcu callback for freeing it
+ * @orig_node: the orig node to free
+ */
+void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
+{
+ if (atomic_dec_and_test(&orig_node->refcount))
+ call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
+}
+
+/**
+ * batadv_orig_node_free_ref_now - decrement the orig node refcounter and
+ * possibly free it (without rcu callback)
+ * @orig_node: the orig node to free
+ */
+void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node)
+{
+ if (atomic_dec_and_test(&orig_node->refcount))
+ batadv_orig_node_free_rcu(&orig_node->rcu);
+}
+
+void batadv_originator_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ struct batadv_orig_node *orig_node;
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ cancel_delayed_work_sync(&bat_priv->orig_work);
+
+ bat_priv->orig_hash = NULL;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&orig_node->hash_entry);
+ batadv_orig_node_free_ref(orig_node);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+}
+
+/**
+ * batadv_orig_node_new - creates a new orig_node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the originator
+ *
+ * Creates a new originator object and initialise all the generic fields.
+ * The new object is not added to the originator list.
+ * Returns the newly created object or NULL on failure.
+ */
+struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
+ const uint8_t *addr)
+{
+ struct batadv_orig_node *orig_node;
+ struct batadv_orig_node_vlan *vlan;
+ unsigned long reset_time;
+ int i;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Creating new originator: %pM\n", addr);
+
+ orig_node = kzalloc(sizeof(*orig_node), GFP_ATOMIC);
+ if (!orig_node)
+ return NULL;
+
+ INIT_HLIST_HEAD(&orig_node->neigh_list);
+ INIT_LIST_HEAD(&orig_node->vlan_list);
+ INIT_HLIST_HEAD(&orig_node->ifinfo_list);
+ spin_lock_init(&orig_node->bcast_seqno_lock);
+ spin_lock_init(&orig_node->neigh_list_lock);
+ spin_lock_init(&orig_node->tt_buff_lock);
+ spin_lock_init(&orig_node->tt_lock);
+ spin_lock_init(&orig_node->vlan_list_lock);
+
+ batadv_nc_init_orig(orig_node);
+
+ /* extra reference for return */
+ atomic_set(&orig_node->refcount, 2);
+
+ orig_node->bat_priv = bat_priv;
+ ether_addr_copy(orig_node->orig, addr);
+ batadv_dat_init_orig_node_addr(orig_node);
+ atomic_set(&orig_node->last_ttvn, 0);
+ orig_node->tt_buff = NULL;
+ orig_node->tt_buff_len = 0;
+ orig_node->last_seen = jiffies;
+ reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS);
+ orig_node->bcast_seqno_reset = reset_time;
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ orig_node->mcast_flags = BATADV_NO_FLAGS;
+#endif
+
+ /* create a vlan object for the "untagged" LAN */
+ vlan = batadv_orig_node_vlan_new(orig_node, BATADV_NO_FLAGS);
+ if (!vlan)
+ goto free_orig_node;
+ /* batadv_orig_node_vlan_new() increases the refcounter.
+ * Immediately release vlan since it is not needed anymore in this
+ * context
+ */
+ batadv_orig_node_vlan_free_ref(vlan);
+
+ for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
+ INIT_HLIST_HEAD(&orig_node->fragments[i].head);
+ spin_lock_init(&orig_node->fragments[i].lock);
+ orig_node->fragments[i].size = 0;
+ }
+
+ return orig_node;
+free_orig_node:
+ kfree(orig_node);
+ return NULL;
+}
+
+/**
+ * batadv_purge_neigh_ifinfo - purge obsolete ifinfo entries from neighbor
+ * @bat_priv: the bat priv with all the soft interface information
+ * @neigh: orig node which is to be checked
+ */
+static void
+batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
+ struct batadv_neigh_node *neigh)
+{
+ struct batadv_neigh_ifinfo *neigh_ifinfo;
+ struct batadv_hard_iface *if_outgoing;
+ struct hlist_node *node_tmp;
+
+ spin_lock_bh(&neigh->ifinfo_lock);
+
+ /* for all ifinfo objects for this neighinator */
+ hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
+ &neigh->ifinfo_list, list) {
+ if_outgoing = neigh_ifinfo->if_outgoing;
+
+ /* always keep the default interface */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ continue;
+
+ /* don't purge if the interface is not (going) down */
+ if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
+ (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
+ (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+ continue;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor/ifinfo purge: neighbor %pM, iface: %s\n",
+ neigh->addr, if_outgoing->net_dev->name);
+
+ hlist_del_rcu(&neigh_ifinfo->list);
+ batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ }
+
+ spin_unlock_bh(&neigh->ifinfo_lock);
+}
+
+/**
+ * batadv_purge_orig_ifinfo - purge obsolete ifinfo entries from originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * Returns true if any ifinfo entry was purged, false otherwise.
+ */
+static bool
+batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_hard_iface *if_outgoing;
+ struct hlist_node *node_tmp;
+ bool ifinfo_purged = false;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all ifinfo objects for this originator */
+ hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
+ &orig_node->ifinfo_list, list) {
+ if_outgoing = orig_ifinfo->if_outgoing;
+
+ /* always keep the default interface */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ continue;
+
+ /* don't purge if the interface is not (going) down */
+ if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
+ (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
+ (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+ continue;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "router/ifinfo purge: originator %pM, iface: %s\n",
+ orig_node->orig, if_outgoing->net_dev->name);
+
+ ifinfo_purged = true;
+
+ hlist_del_rcu(&orig_ifinfo->list);
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ if (orig_node->last_bonding_candidate == orig_ifinfo) {
+ orig_node->last_bonding_candidate = NULL;
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ }
+ }
+
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+
+ return ifinfo_purged;
+}
+
+/**
+ * batadv_purge_orig_neighbors - purges neighbors from originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * Returns true if any neighbor was purged, false otherwise
+ */
+static bool
+batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct hlist_node *node_tmp;
+ struct batadv_neigh_node *neigh_node;
+ bool neigh_purged = false;
+ unsigned long last_seen;
+ struct batadv_hard_iface *if_incoming;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+
+ /* for all neighbors towards this originator ... */
+ hlist_for_each_entry_safe(neigh_node, node_tmp,
+ &orig_node->neigh_list, list) {
+ last_seen = neigh_node->last_seen;
+ if_incoming = neigh_node->if_incoming;
+
+ if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) ||
+ (if_incoming->if_status == BATADV_IF_INACTIVE) ||
+ (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
+ (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) {
+ if ((if_incoming->if_status == BATADV_IF_INACTIVE) ||
+ (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
+ (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED))
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n",
+ orig_node->orig, neigh_node->addr,
+ if_incoming->net_dev->name);
+ else
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "neighbor timeout: originator %pM, neighbor: %pM, last_seen: %u\n",
+ orig_node->orig, neigh_node->addr,
+ jiffies_to_msecs(last_seen));
+
+ neigh_purged = true;
+
+ hlist_del_rcu(&neigh_node->list);
+ batadv_neigh_node_free_ref(neigh_node);
+ } else {
+ /* only necessary if not the whole neighbor is to be
+ * deleted, but some interface has been removed.
+ */
+ batadv_purge_neigh_ifinfo(bat_priv, neigh_node);
+ }
+ }
+
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ return neigh_purged;
+}
+
+/**
+ * batadv_find_best_neighbor - finds the best neighbor after purging
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be checked
+ * @if_outgoing: the interface for which the metric should be compared
+ *
+ * Returns the current best neighbor, with refcount increased.
+ */
+static struct batadv_neigh_node *
+batadv_find_best_neighbor(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *best = NULL, *neigh;
+ struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(neigh, &orig_node->neigh_list, list) {
+ if (best && (bao->bat_neigh_cmp(neigh, if_outgoing,
+ best, if_outgoing) <= 0))
+ continue;
+
+ if (!atomic_inc_not_zero(&neigh->refcount))
+ continue;
+
+ if (best)
+ batadv_neigh_node_free_ref(best);
+
+ best = neigh;
+ }
+ rcu_read_unlock();
+
+ return best;
+}
+
+/**
+ * batadv_purge_orig_node - purges obsolete information from an orig_node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be checked
+ *
+ * This function checks if the orig_node or substructures of it have become
+ * obsolete, and purges this information if that's the case.
+ *
+ * Returns true if the orig_node is to be removed, false otherwise.
+ */
+static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_neigh_node *best_neigh_node;
+ struct batadv_hard_iface *hard_iface;
+ bool changed_ifinfo, changed_neigh;
+
+ if (batadv_has_timed_out(orig_node->last_seen,
+ 2 * BATADV_PURGE_TIMEOUT)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Originator timeout: originator %pM, last_seen %u\n",
+ orig_node->orig,
+ jiffies_to_msecs(orig_node->last_seen));
+ return true;
+ }
+ changed_ifinfo = batadv_purge_orig_ifinfo(bat_priv, orig_node);
+ changed_neigh = batadv_purge_orig_neighbors(bat_priv, orig_node);
+
+ if (!changed_ifinfo && !changed_neigh)
+ return false;
+
+ /* first for NULL ... */
+ best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node,
+ BATADV_IF_DEFAULT);
+ batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT,
+ best_neigh_node);
+ if (best_neigh_node)
+ batadv_neigh_node_free_ref(best_neigh_node);
+
+ /* ... then for all other interfaces. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ best_neigh_node = batadv_find_best_neighbor(bat_priv,
+ orig_node,
+ hard_iface);
+ batadv_update_route(bat_priv, orig_node, hard_iface,
+ best_neigh_node);
+ if (best_neigh_node)
+ batadv_neigh_node_free_ref(best_neigh_node);
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+static void _batadv_purge_orig(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* spinlock to protect write access */
+ struct batadv_orig_node *orig_node;
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ /* for all origins... */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(orig_node, node_tmp,
+ head, hash_entry) {
+ if (batadv_purge_orig_node(bat_priv, orig_node)) {
+ batadv_gw_node_delete(bat_priv, orig_node);
+ hlist_del_rcu(&orig_node->hash_entry);
+ batadv_tt_global_del_orig(orig_node->bat_priv,
+ orig_node, -1,
+ "originator timed out");
+ batadv_orig_node_free_ref(orig_node);
+ continue;
+ }
+
+ batadv_frag_purge_orig(orig_node,
+ batadv_frag_check_entry);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_gw_node_purge(bat_priv);
+ batadv_gw_election(bat_priv);
+}
+
+static void batadv_purge_orig(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
+ _batadv_purge_orig(bat_priv);
+ queue_delayed_work(batadv_event_workqueue,
+ &bat_priv->orig_work,
+ msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
+}
+
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
+{
+ _batadv_purge_orig(bat_priv);
+}
+
+int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ return 0;
+
+ seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
+ BATADV_SOURCE_VERSION, primary_if->net_dev->name,
+ primary_if->net_dev->dev_addr, net_dev->name,
+ bat_priv->bat_algo_ops->name);
+
+ batadv_hardif_free_ref(primary_if);
+
+ if (!bat_priv->bat_algo_ops->bat_orig_print) {
+ seq_puts(seq,
+ "No printing function for this routing protocol\n");
+ return 0;
+ }
+
+ bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq,
+ BATADV_IF_DEFAULT);
+
+ return 0;
+}
+
+/**
+ * batadv_orig_hardif_seq_print_text - writes originator infos for a specific
+ * outgoing interface
+ * @seq: debugfs table seq_file struct
+ * @offset: not used
+ *
+ * Returns 0
+ */
+int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_priv *bat_priv;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+
+ if (!hard_iface || !hard_iface->soft_iface) {
+ seq_puts(seq, "Interface not known to B.A.T.M.A.N.\n");
+ goto out;
+ }
+
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+ if (!bat_priv->bat_algo_ops->bat_orig_print) {
+ seq_puts(seq,
+ "No printing function for this routing protocol\n");
+ goto out;
+ }
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE) {
+ seq_puts(seq, "Interface not active\n");
+ goto out;
+ }
+
+ seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n",
+ BATADV_SOURCE_VERSION, hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr,
+ hard_iface->soft_iface->name, bat_priv->bat_algo_ops->name);
+
+ bat_priv->bat_algo_ops->bat_orig_print(bat_priv, seq, hard_iface);
+
+out:
+ if (hard_iface)
+ batadv_hardif_free_ref(hard_iface);
+ return 0;
+}
+
+int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
+ int max_if_num)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node;
+ uint32_t i;
+ int ret;
+
+ /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
+ * if_num
+ */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ ret = 0;
+ if (bao->bat_orig_add_if)
+ ret = bao->bat_orig_add_if(orig_node,
+ max_if_num);
+ if (ret == -ENOMEM)
+ goto err;
+ }
+ rcu_read_unlock();
+ }
+
+ return 0;
+
+err:
+ rcu_read_unlock();
+ return -ENOMEM;
+}
+
+int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
+ int max_if_num)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_hard_iface *hard_iface_tmp;
+ struct batadv_orig_node *orig_node;
+ struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ uint32_t i;
+ int ret;
+
+ /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
+ * if_num
+ */
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ ret = 0;
+ if (bao->bat_orig_del_if)
+ ret = bao->bat_orig_del_if(orig_node,
+ max_if_num,
+ hard_iface->if_num);
+ if (ret == -ENOMEM)
+ goto err;
+ }
+ rcu_read_unlock();
+ }
+
+ /* renumber remaining batman interfaces _inside_ of orig_hash_lock */
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) {
+ if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE)
+ continue;
+
+ if (hard_iface == hard_iface_tmp)
+ continue;
+
+ if (hard_iface->soft_iface != hard_iface_tmp->soft_iface)
+ continue;
+
+ if (hard_iface_tmp->if_num > hard_iface->if_num)
+ hard_iface_tmp->if_num--;
+ }
+ rcu_read_unlock();
+
+ hard_iface->if_num = -1;
+ return 0;
+
+err:
+ rcu_read_unlock();
+ return -ENOMEM;
+}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
new file mode 100644
index 000000000..aa4a43696
--- /dev/null
+++ b/net/batman-adv/originator.h
@@ -0,0 +1,125 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
+#define _NET_BATMAN_ADV_ORIGINATOR_H_
+
+#include "hash.h"
+
+int batadv_compare_orig(const struct hlist_node *node, const void *data2);
+int batadv_originator_init(struct batadv_priv *bat_priv);
+void batadv_originator_free(struct batadv_priv *bat_priv);
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
+void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node);
+void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node);
+struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
+ const uint8_t *addr);
+struct batadv_neigh_node *
+batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *hard_iface,
+ const uint8_t *addr);
+struct batadv_neigh_node *
+batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
+ const uint8_t *neigh_addr,
+ struct batadv_orig_node *orig_node);
+void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node);
+struct batadv_neigh_node *
+batadv_orig_router_get(struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_ifinfo *
+batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
+ struct batadv_hard_iface *if_outgoing);
+void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo);
+
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing);
+struct batadv_orig_ifinfo *
+batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing);
+void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo);
+
+int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
+ int max_if_num);
+int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
+ int max_if_num);
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
+ unsigned short vid);
+struct batadv_orig_node_vlan *
+batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
+ unsigned short vid);
+void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan);
+
+/* hashfunction to choose an entry in a hash table of given size
+ * hash algorithm from http://en.wikipedia.org/wiki/Hash_table
+ */
+static inline uint32_t batadv_choose_orig(const void *data, uint32_t size)
+{
+ const unsigned char *key = data;
+ uint32_t hash = 0;
+ size_t i;
+
+ for (i = 0; i < 6; i++) {
+ hash += key[i];
+ hash += (hash << 10);
+ hash ^= (hash >> 6);
+ }
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+static inline struct batadv_orig_node *
+batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
+{
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ struct hlist_head *head;
+ struct batadv_orig_node *orig_node, *orig_node_tmp = NULL;
+ int index;
+
+ if (!hash)
+ return NULL;
+
+ index = batadv_choose_orig(data, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ if (!batadv_compare_eth(orig_node, data))
+ continue;
+
+ if (!atomic_inc_not_zero(&orig_node->refcount))
+ continue;
+
+ orig_node_tmp = orig_node;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_node_tmp;
+}
+
+#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
new file mode 100644
index 000000000..b81fbbf21
--- /dev/null
+++ b/net/batman-adv/packet.h
@@ -0,0 +1,553 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_PACKET_H_
+#define _NET_BATMAN_ADV_PACKET_H_
+
+/**
+ * enum batadv_packettype - types for batman-adv encapsulated packets
+ * @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
+ * @BATADV_BCAST: broadcast packets carrying broadcast payload
+ * @BATADV_CODED: network coded packets
+ *
+ * @BATADV_UNICAST: unicast packets carrying unicast payload traffic
+ * @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
+ * payload packet
+ * @BATADV_UNICAST_4ADDR: unicast packet including the originator address of
+ * the sender
+ * @BATADV_ICMP: unicast packet like IP ICMP used for ping or traceroute
+ * @BATADV_UNICAST_TVLV: unicast packet carrying TVLV containers
+ */
+enum batadv_packettype {
+ /* 0x00 - 0x3f: local packets or special rules for handling */
+ BATADV_IV_OGM = 0x00,
+ BATADV_BCAST = 0x01,
+ BATADV_CODED = 0x02,
+ /* 0x40 - 0x7f: unicast */
+#define BATADV_UNICAST_MIN 0x40
+ BATADV_UNICAST = 0x40,
+ BATADV_UNICAST_FRAG = 0x41,
+ BATADV_UNICAST_4ADDR = 0x42,
+ BATADV_ICMP = 0x43,
+ BATADV_UNICAST_TVLV = 0x44,
+#define BATADV_UNICAST_MAX 0x7f
+ /* 0x80 - 0xff: reserved */
+};
+
+/**
+ * enum batadv_subtype - packet subtype for unicast4addr
+ * @BATADV_P_DATA: user payload
+ * @BATADV_P_DAT_DHT_GET: DHT request message
+ * @BATADV_P_DAT_DHT_PUT: DHT store message
+ * @BATADV_P_DAT_CACHE_REPLY: ARP reply generated by DAT
+ */
+enum batadv_subtype {
+ BATADV_P_DATA = 0x01,
+ BATADV_P_DAT_DHT_GET = 0x02,
+ BATADV_P_DAT_DHT_PUT = 0x03,
+ BATADV_P_DAT_CACHE_REPLY = 0x04,
+};
+
+/* this file is included by batctl which needs these defines */
+#define BATADV_COMPAT_VERSION 15
+
+/**
+ * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
+ * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
+ * previously received from someone else than the best neighbor.
+ * @BATADV_PRIMARIES_FIRST_HOP: flag is set when the primary interface address
+ * is used, and the packet travels its first hop.
+ * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
+ * one hop neighbor on the interface where it was originally received.
+ */
+enum batadv_iv_flags {
+ BATADV_NOT_BEST_NEXT_HOP = BIT(0),
+ BATADV_PRIMARIES_FIRST_HOP = BIT(1),
+ BATADV_DIRECTLINK = BIT(2),
+};
+
+/* ICMP message types */
+enum batadv_icmp_packettype {
+ BATADV_ECHO_REPLY = 0,
+ BATADV_DESTINATION_UNREACHABLE = 3,
+ BATADV_ECHO_REQUEST = 8,
+ BATADV_TTL_EXCEEDED = 11,
+ BATADV_PARAMETER_PROBLEM = 12,
+};
+
+/**
+ * enum batadv_mcast_flags - flags for multicast capabilities and settings
+ * @BATADV_MCAST_WANT_ALL_UNSNOOPABLES: we want all packets destined for
+ * 224.0.0.0/24 or ff02::1
+ * @BATADV_MCAST_WANT_ALL_IPV4: we want all IPv4 multicast packets
+ * @BATADV_MCAST_WANT_ALL_IPV6: we want all IPv6 multicast packets
+ */
+enum batadv_mcast_flags {
+ BATADV_MCAST_WANT_ALL_UNSNOOPABLES = BIT(0),
+ BATADV_MCAST_WANT_ALL_IPV4 = BIT(1),
+ BATADV_MCAST_WANT_ALL_IPV6 = BIT(2),
+};
+
+/* tt data subtypes */
+#define BATADV_TT_DATA_TYPE_MASK 0x0F
+
+/**
+ * enum batadv_tt_data_flags - flags for tt data tvlv
+ * @BATADV_TT_OGM_DIFF: TT diff propagated through OGM
+ * @BATADV_TT_REQUEST: TT request message
+ * @BATADV_TT_RESPONSE: TT response message
+ * @BATADV_TT_FULL_TABLE: contains full table to replace existing table
+ */
+enum batadv_tt_data_flags {
+ BATADV_TT_OGM_DIFF = BIT(0),
+ BATADV_TT_REQUEST = BIT(1),
+ BATADV_TT_RESPONSE = BIT(2),
+ BATADV_TT_FULL_TABLE = BIT(4),
+};
+
+/**
+ * enum batadv_tt_client_flags - TT client specific flags
+ * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table
+ * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new
+ * update telling its new real location has not been received/sent yet
+ * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface.
+ * This information is used by the "AP Isolation" feature
+ * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This
+ * information is used by the Extended Isolation feature
+ * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table
+ * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has
+ * not been announced yet
+ * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept
+ * in the table for one more originator interval for consistency purposes
+ * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of
+ * the network but no nnode has already announced it
+ *
+ * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire.
+ * Bits from 8 to 15 are called _local flags_ because they are used for local
+ * computations only.
+ *
+ * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with
+ * the other nodes in the network. To achieve this goal these flags are included
+ * in the TT CRC computation.
+ */
+enum batadv_tt_client_flags {
+ BATADV_TT_CLIENT_DEL = BIT(0),
+ BATADV_TT_CLIENT_ROAM = BIT(1),
+ BATADV_TT_CLIENT_WIFI = BIT(4),
+ BATADV_TT_CLIENT_ISOLA = BIT(5),
+ BATADV_TT_CLIENT_NOPURGE = BIT(8),
+ BATADV_TT_CLIENT_NEW = BIT(9),
+ BATADV_TT_CLIENT_PENDING = BIT(10),
+ BATADV_TT_CLIENT_TEMP = BIT(11),
+};
+
+/**
+ * batadv_vlan_flags - flags for the four MSB of any vlan ID field
+ * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
+ */
+enum batadv_vlan_flags {
+ BATADV_VLAN_HAS_TAG = BIT(15),
+};
+
+/* claim frame types for the bridge loop avoidance */
+enum batadv_bla_claimframe {
+ BATADV_CLAIM_TYPE_CLAIM = 0x00,
+ BATADV_CLAIM_TYPE_UNCLAIM = 0x01,
+ BATADV_CLAIM_TYPE_ANNOUNCE = 0x02,
+ BATADV_CLAIM_TYPE_REQUEST = 0x03,
+};
+
+/**
+ * enum batadv_tvlv_type - tvlv type definitions
+ * @BATADV_TVLV_GW: gateway tvlv
+ * @BATADV_TVLV_DAT: distributed arp table tvlv
+ * @BATADV_TVLV_NC: network coding tvlv
+ * @BATADV_TVLV_TT: translation table tvlv
+ * @BATADV_TVLV_ROAM: roaming advertisement tvlv
+ * @BATADV_TVLV_MCAST: multicast capability tvlv
+ */
+enum batadv_tvlv_type {
+ BATADV_TVLV_GW = 0x01,
+ BATADV_TVLV_DAT = 0x02,
+ BATADV_TVLV_NC = 0x03,
+ BATADV_TVLV_TT = 0x04,
+ BATADV_TVLV_ROAM = 0x05,
+ BATADV_TVLV_MCAST = 0x06,
+};
+
+#pragma pack(2)
+/* the destination hardware field in the ARP frame is used to
+ * transport the claim type and the group id
+ */
+struct batadv_bla_claim_dst {
+ uint8_t magic[3]; /* FF:43:05 */
+ uint8_t type; /* bla_claimframe */
+ __be16 group; /* group id */
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_ogm_packet - ogm (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @flags: contains routing relevant flags - see enum batadv_iv_flags
+ * @tvlv_len: length of tvlv data following the ogm header
+ */
+struct batadv_ogm_packet {
+ uint8_t packet_type;
+ uint8_t version;
+ uint8_t ttl;
+ uint8_t flags;
+ __be32 seqno;
+ uint8_t orig[ETH_ALEN];
+ uint8_t prev_sender[ETH_ALEN];
+ uint8_t reserved;
+ uint8_t tq;
+ __be16 tvlv_len;
+ /* __packed is not needed as the struct size is divisible by 4,
+ * and the largest data type in this struct has a size of 4.
+ */
+};
+
+#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
+
+/**
+ * batadv_icmp_header - common members among all the ICMP packets
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @align: not used - useful for alignment purposes only
+ *
+ * This structure is used for ICMP packets parsing only and it is never sent
+ * over the wire. The alignment field at the end is there to ensure that
+ * members are padded the same way as they are in real packets.
+ */
+struct batadv_icmp_header {
+ uint8_t packet_type;
+ uint8_t version;
+ uint8_t ttl;
+ uint8_t msg_type; /* see ICMP message types above */
+ uint8_t dst[ETH_ALEN];
+ uint8_t orig[ETH_ALEN];
+ uint8_t uid;
+ uint8_t align[3];
+};
+
+/**
+ * batadv_icmp_packet - ICMP packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @reserved: not used - useful for alignment
+ * @seqno: ICMP sequence number
+ */
+struct batadv_icmp_packet {
+ uint8_t packet_type;
+ uint8_t version;
+ uint8_t ttl;
+ uint8_t msg_type; /* see ICMP message types above */
+ uint8_t dst[ETH_ALEN];
+ uint8_t orig[ETH_ALEN];
+ uint8_t uid;
+ uint8_t reserved;
+ __be16 seqno;
+};
+
+#define BATADV_RR_LEN 16
+
+/**
+ * batadv_icmp_packet_rr - ICMP RouteRecord packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @msg_type: ICMP packet type
+ * @dst: address of the destination node
+ * @orig: address of the source node
+ * @uid: local ICMP socket identifier
+ * @rr_cur: number of entries the rr array
+ * @seqno: ICMP sequence number
+ * @rr: route record array
+ */
+struct batadv_icmp_packet_rr {
+ uint8_t packet_type;
+ uint8_t version;
+ uint8_t ttl;
+ uint8_t msg_type; /* see ICMP message types above */
+ uint8_t dst[ETH_ALEN];
+ uint8_t orig[ETH_ALEN];
+ uint8_t uid;
+ uint8_t rr_cur;
+ __be16 seqno;
+ uint8_t rr[BATADV_RR_LEN][ETH_ALEN];
+};
+
+#define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr)
+
+/* All packet headers in front of an ethernet header have to be completely
+ * divisible by 2 but not by 4 to make the payload after the ethernet
+ * header again 4 bytes boundary aligned.
+ *
+ * A packing of 2 is necessary to avoid extra padding at the end of the struct
+ * caused by a structure member which is larger than two bytes. Otherwise
+ * the structure would not fulfill the previously mentioned rule to avoid the
+ * misalignment of the payload after the ethernet header. It may also lead to
+ * leakage of information when the padding it not initialized before sending.
+ */
+#pragma pack(2)
+
+/**
+ * struct batadv_unicast_packet - unicast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @ttvn: translation table version number
+ * @dest: originator destination of the unicast packet
+ */
+struct batadv_unicast_packet {
+ uint8_t packet_type;
+ uint8_t version;
+ uint8_t ttl;
+ uint8_t ttvn; /* destination translation table version number */
+ uint8_t dest[ETH_ALEN];
+ /* "4 bytes boundary + 2 bytes" long to make the payload after the
+ * following ethernet header again 4 bytes boundary aligned
+ */
+};
+
+/**
+ * struct batadv_unicast_4addr_packet - extended unicast packet
+ * @u: common unicast packet header
+ * @src: address of the source
+ * @subtype: packet subtype
+ */
+struct batadv_unicast_4addr_packet {
+ struct batadv_unicast_packet u;
+ uint8_t src[ETH_ALEN];
+ uint8_t subtype;
+ uint8_t reserved;
+ /* "4 bytes boundary + 2 bytes" long to make the payload after the
+ * following ethernet header again 4 bytes boundary aligned
+ */
+};
+
+/**
+ * struct batadv_frag_packet - fragmented packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @dest: final destination used when routing fragments
+ * @orig: originator of the fragment used when merging the packet
+ * @no: fragment number within this sequence
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @total_size: size of the merged packet
+ */
+struct batadv_frag_packet {
+ uint8_t packet_type;
+ uint8_t version; /* batman version field */
+ uint8_t ttl;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ uint8_t no:4;
+ uint8_t reserved:4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ uint8_t reserved:4;
+ uint8_t no:4;
+#else
+#error "unknown bitfield endianness"
+#endif
+ uint8_t dest[ETH_ALEN];
+ uint8_t orig[ETH_ALEN];
+ __be16 seqno;
+ __be16 total_size;
+};
+
+/**
+ * struct batadv_bcast_packet - broadcast packet for network payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @reserved: reserved byte for alignment
+ * @seqno: sequence identification
+ * @orig: originator of the broadcast packet
+ */
+struct batadv_bcast_packet {
+ uint8_t packet_type;
+ uint8_t version; /* batman version field */
+ uint8_t ttl;
+ uint8_t reserved;
+ __be32 seqno;
+ uint8_t orig[ETH_ALEN];
+ /* "4 bytes boundary + 2 bytes" long to make the payload after the
+ * following ethernet header again 4 bytes boundary aligned
+ */
+};
+
+/**
+ * struct batadv_coded_packet - network coded packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @reserved: Align following fields to 2-byte boundaries
+ * @first_source: original source of first included packet
+ * @first_orig_dest: original destinal of first included packet
+ * @first_crc: checksum of first included packet
+ * @first_ttvn: tt-version number of first included packet
+ * @second_ttl: ttl of second packet
+ * @second_dest: second receiver of this coded packet
+ * @second_source: original source of second included packet
+ * @second_orig_dest: original destination of second included packet
+ * @second_crc: checksum of second included packet
+ * @second_ttvn: tt version number of second included packet
+ * @coded_len: length of network coded part of the payload
+ */
+struct batadv_coded_packet {
+ uint8_t packet_type;
+ uint8_t version; /* batman version field */
+ uint8_t ttl;
+ uint8_t first_ttvn;
+ /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */
+ uint8_t first_source[ETH_ALEN];
+ uint8_t first_orig_dest[ETH_ALEN];
+ __be32 first_crc;
+ uint8_t second_ttl;
+ uint8_t second_ttvn;
+ uint8_t second_dest[ETH_ALEN];
+ uint8_t second_source[ETH_ALEN];
+ uint8_t second_orig_dest[ETH_ALEN];
+ __be32 second_crc;
+ __be16 coded_len;
+};
+
+#pragma pack()
+
+/**
+ * struct batadv_unicast_tvlv - generic unicast packet with tvlv payload
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @ttl: time to live for this packet, part of the genereal header
+ * @reserved: reserved field (for packet alignment)
+ * @src: address of the source
+ * @dst: address of the destination
+ * @tvlv_len: length of tvlv data following the unicast tvlv header
+ * @align: 2 bytes to align the header to a 4 byte boundary
+ */
+struct batadv_unicast_tvlv_packet {
+ uint8_t packet_type;
+ uint8_t version; /* batman version field */
+ uint8_t ttl;
+ uint8_t reserved;
+ uint8_t dst[ETH_ALEN];
+ uint8_t src[ETH_ALEN];
+ __be16 tvlv_len;
+ uint16_t align;
+};
+
+/**
+ * struct batadv_tvlv_hdr - base tvlv header struct
+ * @type: tvlv container type (see batadv_tvlv_type)
+ * @version: tvlv container version
+ * @len: tvlv container length
+ */
+struct batadv_tvlv_hdr {
+ uint8_t type;
+ uint8_t version;
+ __be16 len;
+};
+
+/**
+ * struct batadv_tvlv_gateway_data - gateway data propagated through gw tvlv
+ * container
+ * @bandwidth_down: advertised uplink download bandwidth
+ * @bandwidth_up: advertised uplink upload bandwidth
+ */
+struct batadv_tvlv_gateway_data {
+ __be32 bandwidth_down;
+ __be32 bandwidth_up;
+};
+
+/**
+ * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
+ * @flags: translation table flags (see batadv_tt_data_flags)
+ * @ttvn: translation table version number
+ * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by
+ * one batadv_tvlv_tt_vlan_data object per announced vlan
+ */
+struct batadv_tvlv_tt_data {
+ uint8_t flags;
+ uint8_t ttvn;
+ __be16 num_vlan;
+};
+
+/**
+ * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through
+ * the tt tvlv container
+ * @crc: crc32 checksum of the entries belonging to this vlan
+ * @vid: vlan identifier
+ * @reserved: unused, useful for alignment purposes
+ */
+struct batadv_tvlv_tt_vlan_data {
+ __be32 crc;
+ __be16 vid;
+ uint16_t reserved;
+};
+
+/**
+ * struct batadv_tvlv_tt_change - translation table diff data
+ * @flags: status indicators concerning the non-mesh client (see
+ * batadv_tt_client_flags)
+ * @reserved: reserved field - useful for alignment purposes only
+ * @addr: mac address of non-mesh client that triggered this tt change
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_tt_change {
+ uint8_t flags;
+ uint8_t reserved[3];
+ uint8_t addr[ETH_ALEN];
+ __be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_roam_adv - roaming advertisement
+ * @client: mac address of roaming client
+ * @vid: VLAN identifier
+ */
+struct batadv_tvlv_roam_adv {
+ uint8_t client[ETH_ALEN];
+ __be16 vid;
+};
+
+/**
+ * struct batadv_tvlv_mcast_data - payload of a multicast tvlv
+ * @flags: multicast flags announced by the orig node
+ * @reserved: reserved field
+ */
+struct batadv_tvlv_mcast_data {
+ uint8_t flags;
+ uint8_t reserved[3];
+};
+
+#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
new file mode 100644
index 000000000..da83982bf
--- /dev/null
+++ b/net/batman-adv/routing.c
@@ -0,0 +1,1086 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "routing.h"
+#include "send.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "icmp_socket.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
+#include "network-coding.h"
+#include "fragmentation.h"
+
+#include <linux/if_vlan.h>
+
+static int batadv_route_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+
+/**
+ * _batadv_update_route - set the router for this originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be configured
+ * @recv_if: the receive interface for which this route is set
+ * @neigh_node: neighbor which should be the next router
+ *
+ * This function does not perform any error checks
+ */
+static void _batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ struct batadv_neigh_node *curr_router;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, recv_if);
+ if (!orig_ifinfo)
+ return;
+
+ rcu_read_lock();
+ curr_router = rcu_dereference(orig_ifinfo->router);
+ if (curr_router && !atomic_inc_not_zero(&curr_router->refcount))
+ curr_router = NULL;
+ rcu_read_unlock();
+
+ /* route deleted */
+ if ((curr_router) && (!neigh_node)) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Deleting route towards: %pM\n", orig_node->orig);
+ batadv_tt_global_del_orig(bat_priv, orig_node, -1,
+ "Deleted route towards originator");
+
+ /* route added */
+ } else if ((!curr_router) && (neigh_node)) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Adding route towards: %pM (via %pM)\n",
+ orig_node->orig, neigh_node->addr);
+ /* route changed */
+ } else if (neigh_node && curr_router) {
+ batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
+ "Changing route towards: %pM (now via %pM - was via %pM)\n",
+ orig_node->orig, neigh_node->addr,
+ curr_router->addr);
+ }
+
+ if (curr_router)
+ batadv_neigh_node_free_ref(curr_router);
+
+ /* increase refcount of new best neighbor */
+ if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount))
+ neigh_node = NULL;
+
+ spin_lock_bh(&orig_node->neigh_list_lock);
+ rcu_assign_pointer(orig_ifinfo->router, neigh_node);
+ spin_unlock_bh(&orig_node->neigh_list_lock);
+ batadv_orig_ifinfo_free_ref(orig_ifinfo);
+
+ /* decrease refcount of previous best neighbor */
+ if (curr_router)
+ batadv_neigh_node_free_ref(curr_router);
+}
+
+/**
+ * batadv_update_route - set the router for this originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: orig node which is to be configured
+ * @recv_if: the receive interface for which this route is set
+ * @neigh_node: neighbor which should be the next router
+ */
+void batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node)
+{
+ struct batadv_neigh_node *router = NULL;
+
+ if (!orig_node)
+ goto out;
+
+ router = batadv_orig_router_get(orig_node, recv_if);
+
+ if (router != neigh_node)
+ _batadv_update_route(bat_priv, orig_node, recv_if, neigh_node);
+
+out:
+ if (router)
+ batadv_neigh_node_free_ref(router);
+}
+
+/* checks whether the host restarted and is in the protection time.
+ * returns:
+ * 0 if the packet is to be accepted
+ * 1 if the packet is to be ignored.
+ */
+int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff,
+ unsigned long *last_reset)
+{
+ if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE ||
+ seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) {
+ if (!batadv_has_timed_out(*last_reset,
+ BATADV_RESET_PROTECTION_MS))
+ return 1;
+
+ *last_reset = jiffies;
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "old packet received, start protection\n");
+ }
+
+ return 0;
+}
+
+bool batadv_check_management_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ int header_len)
+{
+ struct ethhdr *ethhdr;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, header_len)))
+ return false;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with broadcast indication but unicast recipient */
+ if (!is_broadcast_ether_addr(ethhdr->h_dest))
+ return false;
+
+ /* packet with broadcast sender address */
+ if (is_broadcast_ether_addr(ethhdr->h_source))
+ return false;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, 0) < 0)
+ return false;
+
+ /* keep skb linear */
+ if (skb_linearize(skb) < 0)
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_recv_my_icmp_packet - receive an icmp packet locally
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: icmp packet to process
+ *
+ * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_icmp_header *icmph;
+ int res, ret = NET_RX_DROP;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ switch (icmph->msg_type) {
+ case BATADV_ECHO_REPLY:
+ case BATADV_DESTINATION_UNREACHABLE:
+ case BATADV_TTL_EXCEEDED:
+ /* receive the packet */
+ if (skb_linearize(skb) < 0)
+ break;
+
+ batadv_socket_receive_packet(icmph, skb->len);
+ break;
+ case BATADV_ECHO_REQUEST:
+ /* answer echo request (ping) */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmph->orig);
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ ether_addr_copy(icmph->dst, icmph->orig);
+ ether_addr_copy(icmph->orig, primary_if->net_dev->dev_addr);
+ icmph->msg_type = BATADV_ECHO_REPLY;
+ icmph->ttl = BATADV_TTL;
+
+ res = batadv_send_skb_to_orig(skb, orig_node, NULL);
+ if (res != NET_XMIT_DROP)
+ ret = NET_RX_SUCCESS;
+
+ break;
+ default:
+ /* drop unknown type */
+ goto out;
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return ret;
+}
+
+static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_icmp_packet *icmp_packet;
+ int ret = NET_RX_DROP;
+
+ icmp_packet = (struct batadv_icmp_packet *)skb->data;
+
+ /* send TTL exceeded if packet is an echo request (traceroute) */
+ if (icmp_packet->msg_type != BATADV_ECHO_REQUEST) {
+ pr_debug("Warning - can't forward icmp packet from %pM to %pM: ttl exceeded\n",
+ icmp_packet->orig, icmp_packet->dst);
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmp_packet->orig);
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmp_packet = (struct batadv_icmp_packet *)skb->data;
+
+ ether_addr_copy(icmp_packet->dst, icmp_packet->orig);
+ ether_addr_copy(icmp_packet->orig, primary_if->net_dev->dev_addr);
+ icmp_packet->msg_type = BATADV_TTL_EXCEEDED;
+ icmp_packet->ttl = BATADV_TTL;
+
+ if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
+ ret = NET_RX_SUCCESS;
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return ret;
+}
+
+int batadv_recv_icmp_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_icmp_header *icmph;
+ struct batadv_icmp_packet_rr *icmp_packet_rr;
+ struct ethhdr *ethhdr;
+ struct batadv_orig_node *orig_node = NULL;
+ int hdr_size = sizeof(struct batadv_icmp_header);
+ int ret = NET_RX_DROP;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ goto out;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with unicast indication but broadcast recipient */
+ if (is_broadcast_ether_addr(ethhdr->h_dest))
+ goto out;
+
+ /* packet with broadcast sender address */
+ if (is_broadcast_ether_addr(ethhdr->h_source))
+ goto out;
+
+ /* not for me */
+ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
+ goto out;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ /* add record route information if not full */
+ if ((icmph->msg_type == BATADV_ECHO_REPLY ||
+ icmph->msg_type == BATADV_ECHO_REQUEST) &&
+ (skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
+ if (skb_linearize(skb) < 0)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+ icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph;
+ if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN)
+ goto out;
+
+ ether_addr_copy(icmp_packet_rr->rr[icmp_packet_rr->rr_cur],
+ ethhdr->h_dest);
+ icmp_packet_rr->rr_cur++;
+ }
+
+ /* packet for me */
+ if (batadv_is_my_mac(bat_priv, icmph->dst))
+ return batadv_recv_my_icmp_packet(bat_priv, skb);
+
+ /* TTL exceeded */
+ if (icmph->ttl < 2)
+ return batadv_recv_icmp_ttl_exceeded(bat_priv, skb);
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, icmph->dst);
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ icmph = (struct batadv_icmp_header *)skb->data;
+
+ /* decrement ttl */
+ icmph->ttl--;
+
+ /* route it */
+ if (batadv_send_skb_to_orig(skb, orig_node, recv_if) != NET_XMIT_DROP)
+ ret = NET_RX_SUCCESS;
+
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return ret;
+}
+
+/**
+ * batadv_check_unicast_packet - Check for malformed unicast packets
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: packet to check
+ * @hdr_size: size of header to pull
+ *
+ * Check for short header and bad addresses in given packet. Returns negative
+ * value when check fails and 0 otherwise. The negative value depends on the
+ * reason: -ENODATA for bad header, -EBADR for broadcast destination or source,
+ * and -EREMOTE for non-local (other host) destination.
+ */
+static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ struct ethhdr *ethhdr;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ return -ENODATA;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with unicast indication but broadcast recipient */
+ if (is_broadcast_ether_addr(ethhdr->h_dest))
+ return -EBADR;
+
+ /* packet with broadcast sender address */
+ if (is_broadcast_ether_addr(ethhdr->h_source))
+ return -EBADR;
+
+ /* not for me */
+ if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest))
+ return -EREMOTE;
+
+ return 0;
+}
+
+/**
+ * batadv_find_router - find a suitable router for this originator
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the destination node
+ * @recv_if: pointer to interface this packet was received on
+ *
+ * Returns the router which should be used for this orig_node on
+ * this interface, or NULL if not available.
+ */
+struct batadv_neigh_node *
+batadv_find_router(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct batadv_neigh_node *first_candidate_router = NULL;
+ struct batadv_neigh_node *next_candidate_router = NULL;
+ struct batadv_neigh_node *router, *cand_router = NULL;
+ struct batadv_neigh_node *last_cand_router = NULL;
+ struct batadv_orig_ifinfo *cand, *first_candidate = NULL;
+ struct batadv_orig_ifinfo *next_candidate = NULL;
+ struct batadv_orig_ifinfo *last_candidate;
+ bool last_candidate_found = false;
+
+ if (!orig_node)
+ return NULL;
+
+ router = batadv_orig_router_get(orig_node, recv_if);
+
+ if (!router)
+ return router;
+
+ /* only consider bonding for recv_if == BATADV_IF_DEFAULT (first hop)
+ * and if activated.
+ */
+ if (!(recv_if == BATADV_IF_DEFAULT && atomic_read(&bat_priv->bonding)))
+ return router;
+
+ /* bonding: loop through the list of possible routers found
+ * for the various outgoing interfaces and find a candidate after
+ * the last chosen bonding candidate (next_candidate). If no such
+ * router is found, use the first candidate found (the previously
+ * chosen bonding candidate might have been the last one in the list).
+ * If this can't be found either, return the previously chosen
+ * router - obviously there are no other candidates.
+ */
+ rcu_read_lock();
+ last_candidate = orig_node->last_bonding_candidate;
+ if (last_candidate)
+ last_cand_router = rcu_dereference(last_candidate->router);
+
+ hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) {
+ /* acquire some structures and references ... */
+ if (!atomic_inc_not_zero(&cand->refcount))
+ continue;
+
+ cand_router = rcu_dereference(cand->router);
+ if (!cand_router)
+ goto next;
+
+ if (!atomic_inc_not_zero(&cand_router->refcount)) {
+ cand_router = NULL;
+ goto next;
+ }
+
+ /* alternative candidate should be good enough to be
+ * considered
+ */
+ if (!bao->bat_neigh_is_equiv_or_better(cand_router,
+ cand->if_outgoing,
+ router, recv_if))
+ goto next;
+
+ /* don't use the same router twice */
+ if (last_cand_router == cand_router)
+ goto next;
+
+ /* mark the first possible candidate */
+ if (!first_candidate) {
+ atomic_inc(&cand_router->refcount);
+ atomic_inc(&cand->refcount);
+ first_candidate = cand;
+ first_candidate_router = cand_router;
+ }
+
+ /* check if the loop has already passed the previously selected
+ * candidate ... this function should select the next candidate
+ * AFTER the previously used bonding candidate.
+ */
+ if (!last_candidate || last_candidate_found) {
+ next_candidate = cand;
+ next_candidate_router = cand_router;
+ break;
+ }
+
+ if (last_candidate == cand)
+ last_candidate_found = true;
+next:
+ /* free references */
+ if (cand_router) {
+ batadv_neigh_node_free_ref(cand_router);
+ cand_router = NULL;
+ }
+ batadv_orig_ifinfo_free_ref(cand);
+ }
+ rcu_read_unlock();
+
+ /* last_bonding_candidate is reset below, remove the old reference. */
+ if (orig_node->last_bonding_candidate)
+ batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate);
+
+ /* After finding candidates, handle the three cases:
+ * 1) there is a next candidate, use that
+ * 2) there is no next candidate, use the first of the list
+ * 3) there is no candidate at all, return the default router
+ */
+ if (next_candidate) {
+ batadv_neigh_node_free_ref(router);
+
+ /* remove references to first candidate, we don't need it. */
+ if (first_candidate) {
+ batadv_neigh_node_free_ref(first_candidate_router);
+ batadv_orig_ifinfo_free_ref(first_candidate);
+ }
+ router = next_candidate_router;
+ orig_node->last_bonding_candidate = next_candidate;
+ } else if (first_candidate) {
+ batadv_neigh_node_free_ref(router);
+
+ /* refcounting has already been done in the loop above. */
+ router = first_candidate_router;
+ orig_node->last_bonding_candidate = first_candidate;
+ } else {
+ orig_node->last_bonding_candidate = NULL;
+ }
+
+ return router;
+}
+
+static int batadv_route_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_unicast_packet *unicast_packet;
+ struct ethhdr *ethhdr = eth_hdr(skb);
+ int res, hdr_len, ret = NET_RX_DROP;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+
+ /* TTL exceeded */
+ if (unicast_packet->ttl < 2) {
+ pr_debug("Warning - can't forward unicast packet from %pM to %pM: ttl exceeded\n",
+ ethhdr->h_source, unicast_packet->dest);
+ goto out;
+ }
+
+ /* get routing information */
+ orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest);
+
+ if (!orig_node)
+ goto out;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto out;
+
+ /* decrement ttl */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->ttl--;
+
+ switch (unicast_packet->packet_type) {
+ case BATADV_UNICAST_4ADDR:
+ hdr_len = sizeof(struct batadv_unicast_4addr_packet);
+ break;
+ case BATADV_UNICAST:
+ hdr_len = sizeof(struct batadv_unicast_packet);
+ break;
+ default:
+ /* other packet types not supported - yet */
+ hdr_len = -1;
+ break;
+ }
+
+ if (hdr_len > 0)
+ batadv_skb_set_priority(skb, hdr_len);
+
+ res = batadv_send_skb_to_orig(skb, orig_node, recv_if);
+
+ /* translate transmit result into receive result */
+ if (res == NET_XMIT_SUCCESS) {
+ /* skb was transmitted and consumed */
+ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
+ batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
+ skb->len + ETH_HLEN);
+
+ ret = NET_RX_SUCCESS;
+ } else if (res == NET_XMIT_POLICED) {
+ /* skb was buffered and consumed */
+ ret = NET_RX_SUCCESS;
+ }
+
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return ret;
+}
+
+/**
+ * batadv_reroute_unicast_packet - update the unicast header for re-routing
+ * @bat_priv: the bat priv with all the soft interface information
+ * @unicast_packet: the unicast header to be updated
+ * @dst_addr: the payload destination
+ * @vid: VLAN identifier
+ *
+ * Search the translation table for dst_addr and update the unicast header with
+ * the new corresponding information (originator address where the destination
+ * client currently is and its known TTVN)
+ *
+ * Returns true if the packet header has been updated, false otherwise
+ */
+static bool
+batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
+ struct batadv_unicast_packet *unicast_packet,
+ uint8_t *dst_addr, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_hard_iface *primary_if = NULL;
+ bool ret = false;
+ uint8_t *orig_addr, orig_ttvn;
+
+ if (batadv_is_my_client(bat_priv, dst_addr, vid)) {
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+ orig_addr = primary_if->net_dev->dev_addr;
+ orig_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ } else {
+ orig_node = batadv_transtable_search(bat_priv, NULL, dst_addr,
+ vid);
+ if (!orig_node)
+ goto out;
+
+ if (batadv_compare_eth(orig_node->orig, unicast_packet->dest))
+ goto out;
+
+ orig_addr = orig_node->orig;
+ orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ }
+
+ /* update the packet header */
+ ether_addr_copy(unicast_packet->dest, orig_addr);
+ unicast_packet->ttvn = orig_ttvn;
+
+ ret = true;
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+
+ return ret;
+}
+
+static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_len) {
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_hard_iface *primary_if;
+ struct batadv_orig_node *orig_node;
+ uint8_t curr_ttvn, old_ttvn;
+ struct ethhdr *ethhdr;
+ unsigned short vid;
+ int is_old_ttvn;
+
+ /* check if there is enough data before accessing it */
+ if (!pskb_may_pull(skb, hdr_len + ETH_HLEN))
+ return 0;
+
+ /* create a copy of the skb (in case of for re-routing) to modify it. */
+ if (skb_cow(skb, sizeof(*unicast_packet)) < 0)
+ return 0;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ vid = batadv_get_vid(skb, hdr_len);
+ ethhdr = (struct ethhdr *)(skb->data + hdr_len);
+
+ /* check if the destination client was served by this node and it is now
+ * roaming. In this case, it means that the node has got a ROAM_ADV
+ * message and that it knows the new destination in the mesh to re-route
+ * the packet to
+ */
+ if (batadv_tt_local_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) {
+ if (batadv_reroute_unicast_packet(bat_priv, unicast_packet,
+ ethhdr->h_dest, vid))
+ batadv_dbg_ratelimited(BATADV_DBG_TT,
+ bat_priv,
+ "Rerouting unicast packet to %pM (dst=%pM): Local Roaming\n",
+ unicast_packet->dest,
+ ethhdr->h_dest);
+ /* at this point the mesh destination should have been
+ * substituted with the originator address found in the global
+ * table. If not, let the packet go untouched anyway because
+ * there is nothing the node can do
+ */
+ return 1;
+ }
+
+ /* retrieve the TTVN known by this node for the packet destination. This
+ * value is used later to check if the node which sent (or re-routed
+ * last time) the packet had an updated information or not
+ */
+ curr_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
+ orig_node = batadv_orig_hash_find(bat_priv,
+ unicast_packet->dest);
+ /* if it is not possible to find the orig_node representing the
+ * destination, the packet can immediately be dropped as it will
+ * not be possible to deliver it
+ */
+ if (!orig_node)
+ return 0;
+
+ curr_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ batadv_orig_node_free_ref(orig_node);
+ }
+
+ /* check if the TTVN contained in the packet is fresher than what the
+ * node knows
+ */
+ is_old_ttvn = batadv_seq_before(unicast_packet->ttvn, curr_ttvn);
+ if (!is_old_ttvn)
+ return 1;
+
+ old_ttvn = unicast_packet->ttvn;
+ /* the packet was forged based on outdated network information. Its
+ * destination can possibly be updated and forwarded towards the new
+ * target host
+ */
+ if (batadv_reroute_unicast_packet(bat_priv, unicast_packet,
+ ethhdr->h_dest, vid)) {
+ batadv_dbg_ratelimited(BATADV_DBG_TT, bat_priv,
+ "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n",
+ unicast_packet->dest, ethhdr->h_dest,
+ old_ttvn, curr_ttvn);
+ return 1;
+ }
+
+ /* the packet has not been re-routed: either the destination is
+ * currently served by this node or there is no destination at all and
+ * it is possible to drop the packet
+ */
+ if (!batadv_is_my_client(bat_priv, ethhdr->h_dest, vid))
+ return 0;
+
+ /* update the header in order to let the packet be delivered to this
+ * node's soft interface
+ */
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return 0;
+
+ ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr);
+
+ batadv_hardif_free_ref(primary_if);
+
+ unicast_packet->ttvn = curr_ttvn;
+
+ return 1;
+}
+
+/**
+ * batadv_recv_unhandled_unicast_packet - receive and process packets which
+ * are in the unicast number space but not yet known to the implementation
+ * @skb: unicast tvlv packet to process
+ * @recv_if: pointer to interface this packet was received on
+ *
+ * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ int check, hdr_size = sizeof(*unicast_packet);
+
+ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
+ if (check < 0)
+ return NET_RX_DROP;
+
+ /* we don't know about this type, drop it. */
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ if (batadv_is_my_mac(bat_priv, unicast_packet->dest))
+ return NET_RX_DROP;
+
+ return batadv_route_unicast_packet(skb, recv_if);
+}
+
+int batadv_recv_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_unicast_packet *unicast_packet;
+ struct batadv_unicast_4addr_packet *unicast_4addr_packet;
+ uint8_t *orig_addr;
+ struct batadv_orig_node *orig_node = NULL;
+ int check, hdr_size = sizeof(*unicast_packet);
+ bool is4addr;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+
+ is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR;
+ /* the caller function should have already pulled 2 bytes */
+ if (is4addr)
+ hdr_size = sizeof(*unicast_4addr_packet);
+
+ /* function returns -EREMOTE for promiscuous packets */
+ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
+
+ /* Even though the packet is not for us, we might save it to use for
+ * decoding a later received coded packet
+ */
+ if (check == -EREMOTE)
+ batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
+
+ if (check < 0)
+ return NET_RX_DROP;
+ if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
+ return NET_RX_DROP;
+
+ /* packet for me */
+ if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
+ if (is4addr) {
+ batadv_dat_inc_counter(bat_priv,
+ unicast_4addr_packet->subtype);
+ orig_addr = unicast_4addr_packet->src;
+ orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ }
+
+ if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb,
+ hdr_size))
+ goto rx_success;
+ if (batadv_dat_snoop_incoming_arp_reply(bat_priv, skb,
+ hdr_size))
+ goto rx_success;
+
+ batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size,
+ orig_node);
+
+rx_success:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+
+ return NET_RX_SUCCESS;
+ }
+
+ return batadv_route_unicast_packet(skb, recv_if);
+}
+
+/**
+ * batadv_recv_unicast_tvlv - receive and process unicast tvlv packets
+ * @skb: unicast tvlv packet to process
+ * @recv_if: pointer to interface this packet was received on
+ * @dst_addr: the payload destination
+ *
+ * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
+ */
+int batadv_recv_unicast_tvlv(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
+ unsigned char *tvlv_buff;
+ uint16_t tvlv_buff_len;
+ int hdr_size = sizeof(*unicast_tvlv_packet);
+ int ret = NET_RX_DROP;
+
+ if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
+ return NET_RX_DROP;
+
+ /* the header is likely to be modified while forwarding */
+ if (skb_cow(skb, hdr_size) < 0)
+ return NET_RX_DROP;
+
+ /* packet needs to be linearized to access the tvlv content */
+ if (skb_linearize(skb) < 0)
+ return NET_RX_DROP;
+
+ unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)skb->data;
+
+ tvlv_buff = (unsigned char *)(skb->data + hdr_size);
+ tvlv_buff_len = ntohs(unicast_tvlv_packet->tvlv_len);
+
+ if (tvlv_buff_len > skb->len - hdr_size)
+ return NET_RX_DROP;
+
+ ret = batadv_tvlv_containers_process(bat_priv, false, NULL,
+ unicast_tvlv_packet->src,
+ unicast_tvlv_packet->dst,
+ tvlv_buff, tvlv_buff_len);
+
+ if (ret != NET_RX_SUCCESS)
+ ret = batadv_route_unicast_packet(skb, recv_if);
+ else
+ consume_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_recv_frag_packet - process received fragment
+ * @skb: the received fragment
+ * @recv_if: interface that the skb is received on
+ *
+ * This function does one of the three following things: 1) Forward fragment, if
+ * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till
+ * lack further fragments; 3) Merge fragments, if we have all needed parts.
+ *
+ * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ */
+int batadv_recv_frag_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_orig_node *orig_node_src = NULL;
+ struct batadv_frag_packet *frag_packet;
+ int ret = NET_RX_DROP;
+
+ if (batadv_check_unicast_packet(bat_priv, skb,
+ sizeof(*frag_packet)) < 0)
+ goto out;
+
+ frag_packet = (struct batadv_frag_packet *)skb->data;
+ orig_node_src = batadv_orig_hash_find(bat_priv, frag_packet->orig);
+ if (!orig_node_src)
+ goto out;
+
+ /* Route the fragment if it is not for us and too big to be merged. */
+ if (!batadv_is_my_mac(bat_priv, frag_packet->dest) &&
+ batadv_frag_skb_fwd(skb, recv_if, orig_node_src)) {
+ ret = NET_RX_SUCCESS;
+ goto out;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_FRAG_RX_BYTES, skb->len);
+
+ /* Add fragment to buffer and merge if possible. */
+ if (!batadv_frag_skb_buffer(&skb, orig_node_src))
+ goto out;
+
+ /* Deliver merged packet to the appropriate handler, if it was
+ * merged
+ */
+ if (skb)
+ batadv_batman_skb_recv(skb, recv_if->net_dev,
+ &recv_if->batman_adv_ptype, NULL);
+
+ ret = NET_RX_SUCCESS;
+
+out:
+ if (orig_node_src)
+ batadv_orig_node_free_ref(orig_node_src);
+
+ return ret;
+}
+
+int batadv_recv_bcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ struct ethhdr *ethhdr;
+ int hdr_size = sizeof(*bcast_packet);
+ int ret = NET_RX_DROP;
+ int32_t seq_diff;
+ uint32_t seqno;
+
+ /* drop packet if it has not necessary minimum size */
+ if (unlikely(!pskb_may_pull(skb, hdr_size)))
+ goto out;
+
+ ethhdr = eth_hdr(skb);
+
+ /* packet with broadcast indication but unicast recipient */
+ if (!is_broadcast_ether_addr(ethhdr->h_dest))
+ goto out;
+
+ /* packet with broadcast sender address */
+ if (is_broadcast_ether_addr(ethhdr->h_source))
+ goto out;
+
+ /* ignore broadcasts sent by myself */
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ goto out;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+
+ /* ignore broadcasts originated by myself */
+ if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
+ goto out;
+
+ if (bcast_packet->ttl < 2)
+ goto out;
+
+ orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
+
+ if (!orig_node)
+ goto out;
+
+ spin_lock_bh(&orig_node->bcast_seqno_lock);
+
+ seqno = ntohl(bcast_packet->seqno);
+ /* check whether the packet is a duplicate */
+ if (batadv_test_bit(orig_node->bcast_bits, orig_node->last_bcast_seqno,
+ seqno))
+ goto spin_unlock;
+
+ seq_diff = seqno - orig_node->last_bcast_seqno;
+
+ /* check whether the packet is old and the host just restarted. */
+ if (batadv_window_protected(bat_priv, seq_diff,
+ &orig_node->bcast_seqno_reset))
+ goto spin_unlock;
+
+ /* mark broadcast in flood history, update window position
+ * if required.
+ */
+ if (batadv_bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1))
+ orig_node->last_bcast_seqno = seqno;
+
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+
+ /* check whether this has been sent by another originator before */
+ if (batadv_bla_check_bcast_duplist(bat_priv, skb))
+ goto out;
+
+ batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
+
+ /* rebroadcast packet */
+ batadv_add_bcast_packet_to_list(bat_priv, skb, 1);
+
+ /* don't hand the broadcast up if it is from an originator
+ * from the same backbone.
+ */
+ if (batadv_bla_is_backbone_gw(skb, orig_node, hdr_size))
+ goto out;
+
+ if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb, hdr_size))
+ goto rx_success;
+ if (batadv_dat_snoop_incoming_arp_reply(bat_priv, skb, hdr_size))
+ goto rx_success;
+
+ /* broadcast for me */
+ batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size,
+ orig_node);
+
+rx_success:
+ ret = NET_RX_SUCCESS;
+ goto out;
+
+spin_unlock:
+ spin_unlock_bh(&orig_node->bcast_seqno_lock);
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return ret;
+}
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
new file mode 100644
index 000000000..557d3d12a
--- /dev/null
+++ b/net/batman-adv/routing.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_ROUTING_H_
+#define _NET_BATMAN_ADV_ROUTING_H_
+
+bool batadv_check_management_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ int header_len);
+void batadv_update_route(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if,
+ struct batadv_neigh_node *neigh_node);
+int batadv_recv_icmp_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_frag_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *iface);
+int batadv_recv_bcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_tt_query(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_roam_adv(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_unicast_tvlv(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+struct batadv_neigh_node *
+batadv_find_router(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if);
+int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff,
+ unsigned long *last_reset);
+
+#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
new file mode 100644
index 000000000..3d64ed20c
--- /dev/null
+++ b/net/batman-adv/send.c
@@ -0,0 +1,645 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "distributed-arp-table.h"
+#include "send.h"
+#include "routing.h"
+#include "translation-table.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "originator.h"
+#include "network-coding.h"
+#include "fragmentation.h"
+#include "multicast.h"
+
+static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
+
+/* send out an already prepared packet to the given address via the
+ * specified batman interface
+ */
+int batadv_send_skb_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ const uint8_t *dst_addr)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct ethhdr *ethhdr;
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto send_skb_err;
+
+ if (unlikely(!hard_iface->net_dev))
+ goto send_skb_err;
+
+ if (!(hard_iface->net_dev->flags & IFF_UP)) {
+ pr_warn("Interface %s is not up - can't send packet via that interface!\n",
+ hard_iface->net_dev->name);
+ goto send_skb_err;
+ }
+
+ /* push to the ethernet header. */
+ if (batadv_skb_head_push(skb, ETH_HLEN) < 0)
+ goto send_skb_err;
+
+ skb_reset_mac_header(skb);
+
+ ethhdr = eth_hdr(skb);
+ ether_addr_copy(ethhdr->h_source, hard_iface->net_dev->dev_addr);
+ ether_addr_copy(ethhdr->h_dest, dst_addr);
+ ethhdr->h_proto = htons(ETH_P_BATMAN);
+
+ skb_set_network_header(skb, ETH_HLEN);
+ skb->protocol = htons(ETH_P_BATMAN);
+
+ skb->dev = hard_iface->net_dev;
+
+ /* Save a clone of the skb to use when decoding coded packets */
+ batadv_nc_skb_store_for_decoding(bat_priv, skb);
+
+ /* dev_queue_xmit() returns a negative result on error. However on
+ * congestion and traffic shaping, it drops and returns NET_XMIT_DROP
+ * (which is > 0). This will not be treated as an error.
+ */
+ return dev_queue_xmit(skb);
+send_skb_err:
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
+/**
+ * batadv_send_skb_to_orig - Lookup next-hop and transmit skb.
+ * @skb: Packet to be transmitted.
+ * @orig_node: Final destination of the packet.
+ * @recv_if: Interface used when receiving the packet (can be NULL).
+ *
+ * Looks up the best next-hop towards the passed originator and passes the
+ * skb on for preparation of MAC header. If the packet originated from this
+ * host, NULL can be passed as recv_if and no interface alternating is
+ * attempted.
+ *
+ * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
+ * NET_XMIT_POLICED if the skb is buffered for later transmit.
+ */
+int batadv_send_skb_to_orig(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = orig_node->bat_priv;
+ struct batadv_neigh_node *neigh_node;
+ int ret = NET_XMIT_DROP;
+
+ /* batadv_find_router() increases neigh_nodes refcount if found. */
+ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if);
+ if (!neigh_node)
+ goto out;
+
+ /* Check if the skb is too large to send in one piece and fragment
+ * it if needed.
+ */
+ if (atomic_read(&bat_priv->fragmentation) &&
+ skb->len > neigh_node->if_incoming->net_dev->mtu) {
+ /* Fragment and send packet. */
+ if (batadv_frag_send_packet(skb, orig_node, neigh_node))
+ ret = NET_XMIT_SUCCESS;
+
+ goto out;
+ }
+
+ /* try to network code the packet, if it is received on an interface
+ * (i.e. being forwarded). If the packet originates from this node or if
+ * network coding fails, then send the packet as usual.
+ */
+ if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) {
+ ret = NET_XMIT_POLICED;
+ } else {
+ batadv_send_skb_packet(skb, neigh_node->if_incoming,
+ neigh_node->addr);
+ ret = NET_XMIT_SUCCESS;
+ }
+
+out:
+ if (neigh_node)
+ batadv_neigh_node_free_ref(neigh_node);
+
+ return ret;
+}
+
+/**
+ * batadv_send_skb_push_fill_unicast - extend the buffer and initialize the
+ * common fields for unicast packets
+ * @skb: the skb carrying the unicast header to initialize
+ * @hdr_size: amount of bytes to push at the beginning of the skb
+ * @orig_node: the destination node
+ *
+ * Returns false if the buffer extension was not possible or true otherwise.
+ */
+static bool
+batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_unicast_packet *unicast_packet;
+ uint8_t ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+
+ if (batadv_skb_head_push(skb, hdr_size) < 0)
+ return false;
+
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+ unicast_packet->version = BATADV_COMPAT_VERSION;
+ /* batman packet type: unicast */
+ unicast_packet->packet_type = BATADV_UNICAST;
+ /* set unicast ttl */
+ unicast_packet->ttl = BATADV_TTL;
+ /* copy the destination for faster routing */
+ ether_addr_copy(unicast_packet->dest, orig_node->orig);
+ /* set the destination tt version number */
+ unicast_packet->ttvn = ttvn;
+
+ return true;
+}
+
+/**
+ * batadv_send_skb_prepare_unicast - encapsulate an skb with a unicast header
+ * @skb: the skb containing the payload to encapsulate
+ * @orig_node: the destination node
+ *
+ * Returns false if the payload could not be encapsulated or true otherwise.
+ */
+static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node)
+{
+ size_t uni_size = sizeof(struct batadv_unicast_packet);
+
+ return batadv_send_skb_push_fill_unicast(skb, uni_size, orig_node);
+}
+
+/**
+ * batadv_send_skb_prepare_unicast_4addr - encapsulate an skb with a
+ * unicast 4addr header
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the skb containing the payload to encapsulate
+ * @orig_node: the destination node
+ * @packet_subtype: the unicast 4addr packet subtype to use
+ *
+ * Returns false if the payload could not be encapsulated or true otherwise.
+ */
+bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct batadv_orig_node *orig,
+ int packet_subtype)
+{
+ struct batadv_hard_iface *primary_if;
+ struct batadv_unicast_4addr_packet *uc_4addr_packet;
+ bool ret = false;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* Pull the header space and fill the unicast_packet substructure.
+ * We can do that because the first member of the uc_4addr_packet
+ * is of type struct unicast_packet
+ */
+ if (!batadv_send_skb_push_fill_unicast(skb, sizeof(*uc_4addr_packet),
+ orig))
+ goto out;
+
+ uc_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+ uc_4addr_packet->u.packet_type = BATADV_UNICAST_4ADDR;
+ ether_addr_copy(uc_4addr_packet->src, primary_if->net_dev->dev_addr);
+ uc_4addr_packet->subtype = packet_subtype;
+ uc_4addr_packet->reserved = 0;
+
+ ret = true;
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return ret;
+}
+
+/**
+ * batadv_send_skb_unicast - encapsulate and send an skb via unicast
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: payload to send
+ * @packet_type: the batman unicast packet type to use
+ * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
+ * 4addr packets)
+ * @orig_node: the originator to send the packet to
+ * @vid: the vid to be used to search the translation table
+ *
+ * Wrap the given skb into a batman-adv unicast or unicast-4addr header
+ * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied
+ * as packet_type. Then send this frame to the given orig_node and release a
+ * reference to this orig_node.
+ *
+ * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_unicast_packet *unicast_packet;
+ int ret = NET_XMIT_DROP;
+
+ if (!orig_node)
+ goto out;
+
+ switch (packet_type) {
+ case BATADV_UNICAST:
+ if (!batadv_send_skb_prepare_unicast(skb, orig_node))
+ goto out;
+ break;
+ case BATADV_UNICAST_4ADDR:
+ if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb,
+ orig_node,
+ packet_subtype))
+ goto out;
+ break;
+ default:
+ /* this function supports UNICAST and UNICAST_4ADDR only. It
+ * should never be invoked with any other packet type
+ */
+ goto out;
+ }
+
+ /* skb->data might have been reallocated by
+ * batadv_send_skb_prepare_unicast{,_4addr}()
+ */
+ ethhdr = eth_hdr(skb);
+ unicast_packet = (struct batadv_unicast_packet *)skb->data;
+
+ /* inform the destination node that we are still missing a correct route
+ * for this client. The destination will receive this packet and will
+ * try to reroute it because the ttvn contained in the header is less
+ * than the current one
+ */
+ if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid))
+ unicast_packet->ttvn = unicast_packet->ttvn - 1;
+
+ if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
+ ret = NET_XMIT_SUCCESS;
+
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ if (ret == NET_XMIT_DROP)
+ kfree_skb(skb);
+ return ret;
+}
+
+/**
+ * batadv_send_skb_via_tt_generic - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: payload to send
+ * @packet_type: the batman unicast packet type to use
+ * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
+ * 4addr packets)
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast or unicast-4addr header depending on whether BATADV_UNICAST or
+ * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame
+ * to the according destination node.
+ *
+ * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype, uint8_t *dst_hint,
+ unsigned short vid)
+{
+ struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+ struct batadv_orig_node *orig_node;
+ uint8_t *src, *dst;
+
+ src = ethhdr->h_source;
+ dst = ethhdr->h_dest;
+
+ /* if we got an hint! let's send the packet to this client (if any) */
+ if (dst_hint) {
+ src = NULL;
+ dst = dst_hint;
+ }
+ orig_node = batadv_transtable_search(bat_priv, src, dst, vid);
+
+ return batadv_send_skb_unicast(bat_priv, skb, packet_type,
+ packet_subtype, orig_node, vid);
+}
+
+/**
+ * batadv_send_skb_via_gw - send an skb via gateway lookup
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: payload to send
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the currently selected gateway. Wrap the given skb into a batman-adv
+ * unicast header and send this frame to this gateway node.
+ *
+ * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_gw_get_selected_orig(bat_priv);
+ return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0,
+ orig_node, vid);
+}
+
+void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
+ (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+ return;
+
+ /* the interface gets activated here to avoid race conditions between
+ * the moment of activating the interface in
+ * hardif_activate_interface() where the originator mac is set and
+ * outdated packets (especially uninitialized mac addresses) in the
+ * packet queue
+ */
+ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
+ hard_iface->if_status = BATADV_IF_ACTIVE;
+
+ bat_priv->bat_algo_ops->bat_ogm_schedule(hard_iface);
+}
+
+static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
+{
+ if (forw_packet->skb)
+ kfree_skb(forw_packet->skb);
+ if (forw_packet->if_incoming)
+ batadv_hardif_free_ref(forw_packet->if_incoming);
+ if (forw_packet->if_outgoing)
+ batadv_hardif_free_ref(forw_packet->if_outgoing);
+ kfree(forw_packet);
+}
+
+static void
+_batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
+ struct batadv_forw_packet *forw_packet,
+ unsigned long send_time)
+{
+ /* add new packet to packet list */
+ spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+ hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list);
+ spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+ /* start timer for this packet */
+ queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work,
+ send_time);
+}
+
+/* add a broadcast packet to the queue and setup timers. broadcast packets
+ * are sent multiple times to increase probability for being received.
+ *
+ * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on
+ * errors.
+ *
+ * The skb is not consumed, so the caller should make sure that the
+ * skb is freed.
+ */
+int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb,
+ unsigned long delay)
+{
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_forw_packet *forw_packet;
+ struct batadv_bcast_packet *bcast_packet;
+ struct sk_buff *newskb;
+
+ if (!batadv_atomic_dec_not_zero(&bat_priv->bcast_queue_left)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "bcast packet queue full\n");
+ goto out;
+ }
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out_and_inc;
+
+ forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC);
+
+ if (!forw_packet)
+ goto out_and_inc;
+
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb)
+ goto packet_free;
+
+ /* as we have a copy now, it is safe to decrease the TTL */
+ bcast_packet = (struct batadv_bcast_packet *)newskb->data;
+ bcast_packet->ttl--;
+
+ skb_reset_mac_header(newskb);
+
+ forw_packet->skb = newskb;
+ forw_packet->if_incoming = primary_if;
+ forw_packet->if_outgoing = NULL;
+
+ /* how often did we send the bcast packet ? */
+ forw_packet->num_packets = 0;
+
+ INIT_DELAYED_WORK(&forw_packet->delayed_work,
+ batadv_send_outstanding_bcast_packet);
+
+ _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay);
+ return NETDEV_TX_OK;
+
+packet_free:
+ kfree(forw_packet);
+out_and_inc:
+ atomic_inc(&bat_priv->bcast_queue_left);
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return NETDEV_TX_BUSY;
+}
+
+static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct delayed_work *delayed_work;
+ struct batadv_forw_packet *forw_packet;
+ struct sk_buff *skb1;
+ struct net_device *soft_iface;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ forw_packet = container_of(delayed_work, struct batadv_forw_packet,
+ delayed_work);
+ soft_iface = forw_packet->if_incoming->soft_iface;
+ bat_priv = netdev_priv(soft_iface);
+
+ spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+ hlist_del(&forw_packet->list);
+ spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet))
+ goto out;
+
+ /* rebroadcast packet */
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ if (forw_packet->num_packets >= hard_iface->num_bcasts)
+ continue;
+
+ /* send a copy of the saved skb */
+ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
+ if (skb1)
+ batadv_send_skb_packet(skb1, hard_iface,
+ batadv_broadcast_addr);
+ }
+ rcu_read_unlock();
+
+ forw_packet->num_packets++;
+
+ /* if we still have some more bcasts to send */
+ if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) {
+ _batadv_add_bcast_packet_to_list(bat_priv, forw_packet,
+ msecs_to_jiffies(5));
+ return;
+ }
+
+out:
+ batadv_forw_packet_free(forw_packet);
+ atomic_inc(&bat_priv->bcast_queue_left);
+}
+
+void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_forw_packet *forw_packet;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ forw_packet = container_of(delayed_work, struct batadv_forw_packet,
+ delayed_work);
+ bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ hlist_del(&forw_packet->list);
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ bat_priv->bat_algo_ops->bat_ogm_emit(forw_packet);
+
+ /* we have to have at least one packet in the queue to determine the
+ * queues wake up time unless we are shutting down.
+ *
+ * only re-schedule if this is the "original" copy, e.g. the OGM of the
+ * primary interface should only be rescheduled once per period, but
+ * this function will be called for the forw_packet instances of the
+ * other secondary interfaces as well.
+ */
+ if (forw_packet->own &&
+ forw_packet->if_incoming == forw_packet->if_outgoing)
+ batadv_schedule_bat_ogm(forw_packet->if_incoming);
+
+out:
+ /* don't count own packet */
+ if (!forw_packet->own)
+ atomic_inc(&bat_priv->batman_queue_left);
+
+ batadv_forw_packet_free(forw_packet);
+}
+
+void
+batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
+ const struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_forw_packet *forw_packet;
+ struct hlist_node *safe_tmp_node;
+ bool pending;
+
+ if (hard_iface)
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "purge_outstanding_packets(): %s\n",
+ hard_iface->net_dev->name);
+ else
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "purge_outstanding_packets()\n");
+
+ /* free bcast list */
+ spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
+ &bat_priv->forw_bcast_list, list) {
+ /* if purge_outstanding_packets() was called with an argument
+ * we delete only packets belonging to the given interface
+ */
+ if ((hard_iface) &&
+ (forw_packet->if_incoming != hard_iface))
+ continue;
+
+ spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+ /* batadv_send_outstanding_bcast_packet() will lock the list to
+ * delete the item from the list
+ */
+ pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
+ spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+
+ if (pending) {
+ hlist_del(&forw_packet->list);
+ batadv_forw_packet_free(forw_packet);
+ }
+ }
+ spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+ /* free batman packet list */
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+ hlist_for_each_entry_safe(forw_packet, safe_tmp_node,
+ &bat_priv->forw_bat_list, list) {
+ /* if purge_outstanding_packets() was called with an argument
+ * we delete only packets belonging to the given interface
+ */
+ if ((hard_iface) &&
+ (forw_packet->if_incoming != hard_iface) &&
+ (forw_packet->if_outgoing != hard_iface))
+ continue;
+
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+ /* send_outstanding_bat_packet() will lock the list to
+ * delete the item from the list
+ */
+ pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
+ spin_lock_bh(&bat_priv->forw_bat_list_lock);
+
+ if (pending) {
+ hlist_del(&forw_packet->list);
+ batadv_forw_packet_free(forw_packet);
+ }
+ }
+ spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+}
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
new file mode 100644
index 000000000..38d0ec183
--- /dev/null
+++ b/net/batman-adv/send.h
@@ -0,0 +1,98 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_SEND_H_
+#define _NET_BATMAN_ADV_SEND_H_
+
+int batadv_send_skb_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ const uint8_t *dst_addr);
+int batadv_send_skb_to_orig(struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *recv_if);
+void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface);
+int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
+ const struct sk_buff *skb,
+ unsigned long delay);
+void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work);
+void
+batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
+ const struct batadv_hard_iface *hard_iface);
+bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ struct batadv_orig_node *orig_node,
+ int packet_subtype);
+int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid);
+int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int packet_type,
+ int packet_subtype, uint8_t *dst_hint,
+ unsigned short vid);
+int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid);
+
+/**
+ * batadv_send_skb_via_tt - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the payload to send
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast header. Then send this frame to the according destination node.
+ *
+ * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, uint8_t *dst_hint,
+ unsigned short vid)
+{
+ return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0,
+ dst_hint, vid);
+}
+
+/**
+ * batadv_send_skb_via_tt_4addr - send an skb via TT lookup
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: the payload to send
+ * @packet_subtype: the unicast 4addr packet subtype to use
+ * @dst_hint: can be used to override the destination contained in the skb
+ * @vid: the vid to be used to search the translation table
+ *
+ * Look up the recipient node for the destination address in the ethernet
+ * header via the translation table. Wrap the given skb into a batman-adv
+ * unicast-4addr header. Then send this frame to the according destination
+ * node.
+ *
+ * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ */
+static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ int packet_subtype,
+ uint8_t *dst_hint,
+ unsigned short vid)
+{
+ return batadv_send_skb_via_tt_generic(bat_priv, skb,
+ BATADV_UNICAST_4ADDR,
+ packet_subtype, dst_hint, vid);
+}
+
+#endif /* _NET_BATMAN_ADV_SEND_H_ */
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
new file mode 100644
index 000000000..5ec31d7de
--- /dev/null
+++ b/net/batman-adv/soft-interface.c
@@ -0,0 +1,1107 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "distributed-arp-table.h"
+#include "routing.h"
+#include "send.h"
+#include "debugfs.h"
+#include "translation-table.h"
+#include "hash.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "sysfs.h"
+#include "originator.h"
+#include <linux/slab.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include "multicast.h"
+#include "bridge_loop_avoidance.h"
+#include "network-coding.h"
+
+static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
+static void batadv_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info);
+static u32 batadv_get_msglevel(struct net_device *dev);
+static void batadv_set_msglevel(struct net_device *dev, u32 value);
+static u32 batadv_get_link(struct net_device *dev);
+static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data);
+static void batadv_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, u64 *data);
+static int batadv_get_sset_count(struct net_device *dev, int stringset);
+
+static const struct ethtool_ops batadv_ethtool_ops = {
+ .get_settings = batadv_get_settings,
+ .get_drvinfo = batadv_get_drvinfo,
+ .get_msglevel = batadv_get_msglevel,
+ .set_msglevel = batadv_set_msglevel,
+ .get_link = batadv_get_link,
+ .get_strings = batadv_get_strings,
+ .get_ethtool_stats = batadv_get_ethtool_stats,
+ .get_sset_count = batadv_get_sset_count,
+};
+
+int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
+{
+ int result;
+
+ /* TODO: We must check if we can release all references to non-payload
+ * data using skb_header_release in our skbs to allow skb_cow_header to
+ * work optimally. This means that those skbs are not allowed to read
+ * or write any data which is before the current position of skb->data
+ * after that call and thus allow other skbs with the same data buffer
+ * to write freely in that area.
+ */
+ result = skb_cow_head(skb, len);
+ if (result < 0)
+ return result;
+
+ skb_push(skb, len);
+ return 0;
+}
+
+static int batadv_interface_open(struct net_device *dev)
+{
+ netif_start_queue(dev);
+ return 0;
+}
+
+static int batadv_interface_release(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct net_device_stats *stats = &bat_priv->stats;
+
+ stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX);
+ stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES);
+ stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED);
+ stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX);
+ stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES);
+ return stats;
+}
+
+static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct sockaddr *addr = p;
+ uint8_t old_addr[ETH_ALEN];
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ ether_addr_copy(old_addr, dev->dev_addr);
+ ether_addr_copy(dev->dev_addr, addr->sa_data);
+
+ /* only modify transtable if it has been initialized before */
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) {
+ batadv_tt_local_remove(bat_priv, old_addr, BATADV_NO_FLAGS,
+ "mac address changed", false);
+ batadv_tt_local_add(dev, addr->sa_data, BATADV_NO_FLAGS,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+ }
+
+ return 0;
+}
+
+static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
+{
+ /* check ranges */
+ if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev)))
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+/**
+ * batadv_interface_set_rx_mode - set the rx mode of a device
+ * @dev: registered network device to modify
+ *
+ * We do not actually need to set any rx filters for the virtual batman
+ * soft interface. However a dummy handler enables a user to set static
+ * multicast listeners for instance.
+ */
+static void batadv_interface_set_rx_mode(struct net_device *dev)
+{
+}
+
+static int batadv_interface_tx(struct sk_buff *skb,
+ struct net_device *soft_iface)
+{
+ struct ethhdr *ethhdr;
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ __be16 ethertype = htons(ETH_P_BATMAN);
+ static const uint8_t stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
+ 0x00, 0x00};
+ static const uint8_t ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00,
+ 0x00, 0x00};
+ enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO;
+ uint8_t *dst_hint = NULL, chaddr[ETH_ALEN];
+ struct vlan_ethhdr *vhdr;
+ unsigned int header_len = 0;
+ int data_len = skb->len, ret;
+ unsigned long brd_delay = 1;
+ bool do_bcast = false, client_added;
+ unsigned short vid;
+ uint32_t seqno;
+ int gw_mode;
+ enum batadv_forw_mode forw_mode;
+ struct batadv_orig_node *mcast_single_orig = NULL;
+
+ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
+ goto dropped;
+
+ soft_iface->trans_start = jiffies;
+ vid = batadv_get_vid(skb, 0);
+ ethhdr = eth_hdr(skb);
+
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_8021Q:
+ vhdr = vlan_eth_hdr(skb);
+
+ if (vhdr->h_vlan_encapsulated_proto != ethertype)
+ break;
+
+ /* fall through */
+ case ETH_P_BATMAN:
+ goto dropped;
+ }
+
+ if (batadv_bla_tx(bat_priv, skb, vid))
+ goto dropped;
+
+ /* skb->data might have been reallocated by batadv_bla_tx() */
+ ethhdr = eth_hdr(skb);
+
+ /* Register the client MAC in the transtable */
+ if (!is_multicast_ether_addr(ethhdr->h_source)) {
+ client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
+ vid, skb->skb_iif,
+ skb->mark);
+ if (!client_added)
+ goto dropped;
+ }
+
+ /* don't accept stp packets. STP does not help in meshes.
+ * better use the bridge loop avoidance ...
+ *
+ * The same goes for ECTP sent at least by some Cisco Switches,
+ * it might confuse the mesh when used with bridge loop avoidance.
+ */
+ if (batadv_compare_eth(ethhdr->h_dest, stp_addr))
+ goto dropped;
+
+ if (batadv_compare_eth(ethhdr->h_dest, ectp_addr))
+ goto dropped;
+
+ gw_mode = atomic_read(&bat_priv->gw_mode);
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* if gw mode is off, broadcast every packet */
+ if (gw_mode == BATADV_GW_MODE_OFF) {
+ do_bcast = true;
+ goto send;
+ }
+
+ dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len,
+ chaddr);
+ /* skb->data may have been modified by
+ * batadv_gw_dhcp_recipient_get()
+ */
+ ethhdr = eth_hdr(skb);
+ /* if gw_mode is on, broadcast any non-DHCP message.
+ * All the DHCP packets are going to be sent as unicast
+ */
+ if (dhcp_rcp == BATADV_DHCP_NO) {
+ do_bcast = true;
+ goto send;
+ }
+
+ if (dhcp_rcp == BATADV_DHCP_TO_CLIENT)
+ dst_hint = chaddr;
+ else if ((gw_mode == BATADV_GW_MODE_SERVER) &&
+ (dhcp_rcp == BATADV_DHCP_TO_SERVER))
+ /* gateways should not forward any DHCP message if
+ * directed to a DHCP server
+ */
+ goto dropped;
+
+send:
+ if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) {
+ forw_mode = batadv_mcast_forw_mode(bat_priv, skb,
+ &mcast_single_orig);
+ if (forw_mode == BATADV_FORW_NONE)
+ goto dropped;
+
+ if (forw_mode == BATADV_FORW_SINGLE)
+ do_bcast = false;
+ }
+ }
+
+ batadv_skb_set_priority(skb, 0);
+
+ /* ethernet packet should be broadcasted */
+ if (do_bcast) {
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto dropped;
+
+ /* in case of ARP request, we do not immediately broadcasti the
+ * packet, instead we first wait for DAT to try to retrieve the
+ * correct ARP entry
+ */
+ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb))
+ brd_delay = msecs_to_jiffies(ARP_REQ_DELAY);
+
+ if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0)
+ goto dropped;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ bcast_packet->version = BATADV_COMPAT_VERSION;
+ bcast_packet->ttl = BATADV_TTL;
+
+ /* batman packet type: broadcast */
+ bcast_packet->packet_type = BATADV_BCAST;
+ bcast_packet->reserved = 0;
+
+ /* hw address of first interface is the orig mac because only
+ * this mac is known throughout the mesh
+ */
+ ether_addr_copy(bcast_packet->orig,
+ primary_if->net_dev->dev_addr);
+
+ /* set broadcast sequence number */
+ seqno = atomic_inc_return(&bat_priv->bcast_seqno);
+ bcast_packet->seqno = htonl(seqno);
+
+ batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay);
+
+ /* a copy is stored in the bcast list, therefore removing
+ * the original skb.
+ */
+ kfree_skb(skb);
+
+ /* unicast packet */
+ } else {
+ /* DHCP packets going to a server will use the GW feature */
+ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) {
+ ret = batadv_gw_out_of_range(bat_priv, skb);
+ if (ret)
+ goto dropped;
+ ret = batadv_send_skb_via_gw(bat_priv, skb, vid);
+ } else if (mcast_single_orig) {
+ ret = batadv_send_skb_unicast(bat_priv, skb,
+ BATADV_UNICAST, 0,
+ mcast_single_orig, vid);
+ } else {
+ if (batadv_dat_snoop_outgoing_arp_request(bat_priv,
+ skb))
+ goto dropped;
+
+ batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb);
+
+ ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint,
+ vid);
+ }
+ if (ret == NET_XMIT_DROP)
+ goto dropped_freed;
+ }
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len);
+ goto end;
+
+dropped:
+ kfree_skb(skb);
+dropped_freed:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
+end:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return NETDEV_TX_OK;
+}
+
+void batadv_interface_rx(struct net_device *soft_iface,
+ struct sk_buff *skb, struct batadv_hard_iface *recv_if,
+ int hdr_size, struct batadv_orig_node *orig_node)
+{
+ struct batadv_bcast_packet *batadv_bcast_packet;
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ __be16 ethertype = htons(ETH_P_BATMAN);
+ struct vlan_ethhdr *vhdr;
+ struct ethhdr *ethhdr;
+ unsigned short vid;
+ bool is_bcast;
+
+ batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST);
+
+ /* check if enough space is available for pulling, and pull */
+ if (!pskb_may_pull(skb, hdr_size))
+ goto dropped;
+
+ skb_pull_rcsum(skb, hdr_size);
+ skb_reset_mac_header(skb);
+
+ /* clean the netfilter state now that the batman-adv header has been
+ * removed
+ */
+ nf_reset(skb);
+
+ vid = batadv_get_vid(skb, 0);
+ ethhdr = eth_hdr(skb);
+
+ switch (ntohs(ethhdr->h_proto)) {
+ case ETH_P_8021Q:
+ vhdr = (struct vlan_ethhdr *)skb->data;
+
+ if (vhdr->h_vlan_encapsulated_proto != ethertype)
+ break;
+
+ /* fall through */
+ case ETH_P_BATMAN:
+ goto dropped;
+ }
+
+ /* skb->dev & skb->pkt_type are set here */
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ goto dropped;
+ skb->protocol = eth_type_trans(skb, soft_iface);
+
+ /* should not be necessary anymore as we use skb_pull_rcsum()
+ * TODO: please verify this and remove this TODO
+ * -- Dec 21st 2009, Simon Wunderlich
+ */
+
+ /* skb->ip_summed = CHECKSUM_UNNECESSARY; */
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ soft_iface->last_rx = jiffies;
+
+ /* Let the bridge loop avoidance check the packet. If will
+ * not handle it, we can safely push it up.
+ */
+ if (batadv_bla_rx(bat_priv, skb, vid, is_bcast))
+ goto out;
+
+ if (orig_node)
+ batadv_tt_add_temporary_global_entry(bat_priv, orig_node,
+ ethhdr->h_source, vid);
+
+ if (is_multicast_ether_addr(ethhdr->h_dest)) {
+ /* set the mark on broadcast packets if AP isolation is ON and
+ * the packet is coming from an "isolated" client
+ */
+ if (batadv_vlan_ap_isola_get(bat_priv, vid) &&
+ batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source,
+ vid)) {
+ /* save bits in skb->mark not covered by the mask and
+ * apply the mark on the rest
+ */
+ skb->mark &= ~bat_priv->isolation_mark_mask;
+ skb->mark |= bat_priv->isolation_mark;
+ }
+ } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source,
+ ethhdr->h_dest, vid)) {
+ goto dropped;
+ }
+
+ netif_rx(skb);
+ goto out;
+
+dropped:
+ kfree_skb(skb);
+out:
+ return;
+}
+
+/**
+ * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and
+ * possibly free it
+ * @softif_vlan: the vlan object to release
+ */
+void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
+{
+ if (atomic_dec_and_test(&vlan->refcount)) {
+ spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+ hlist_del_rcu(&vlan->list);
+ spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+
+ kfree_rcu(vlan, rcu);
+ }
+}
+
+/**
+ * batadv_softif_vlan_get - get the vlan object for a specific vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the identifier of the vlan object to retrieve
+ *
+ * Returns the private data of the vlan matching the vid passed as argument or
+ * NULL otherwise. The refcounter of the returned object is incremented by 1.
+ */
+struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ struct batadv_softif_vlan *vlan_tmp, *vlan = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) {
+ if (vlan_tmp->vid != vid)
+ continue;
+
+ if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+ continue;
+
+ vlan = vlan_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return vlan;
+}
+
+/**
+ * batadv_create_vlan - allocate the needed resources for a new vlan
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier
+ *
+ * Returns 0 on success, a negative error otherwise.
+ */
+int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
+{
+ struct batadv_softif_vlan *vlan;
+ int err;
+
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (vlan) {
+ batadv_softif_vlan_free_ref(vlan);
+ return -EEXIST;
+ }
+
+ vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
+ if (!vlan)
+ return -ENOMEM;
+
+ vlan->bat_priv = bat_priv;
+ vlan->vid = vid;
+ atomic_set(&vlan->refcount, 1);
+
+ atomic_set(&vlan->ap_isolation, 0);
+
+ err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
+ if (err) {
+ kfree(vlan);
+ return err;
+ }
+
+ spin_lock_bh(&bat_priv->softif_vlan_list_lock);
+ hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
+ spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
+
+ /* add a new TT local entry. This one will be marked with the NOPURGE
+ * flag
+ */
+ batadv_tt_local_add(bat_priv->soft_iface,
+ bat_priv->soft_iface->dev_addr, vid,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+
+ return 0;
+}
+
+/**
+ * batadv_softif_destroy_vlan - remove and destroy a softif_vlan object
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vlan: the object to remove
+ */
+static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
+ struct batadv_softif_vlan *vlan)
+{
+ /* explicitly remove the associated TT local entry because it is marked
+ * with the NOPURGE flag
+ */
+ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr,
+ vlan->vid, "vlan interface destroyed", false);
+
+ batadv_sysfs_del_vlan(bat_priv, vlan);
+ batadv_softif_vlan_free_ref(vlan);
+}
+
+/**
+ * batadv_interface_add_vid - ndo_add_vid API implementation
+ * @dev: the netdev of the mesh interface
+ * @vid: identifier of the new vlan
+ *
+ * Set up all the internal structures for handling the new vlan on top of the
+ * mesh interface
+ *
+ * Returns 0 on success or a negative error code in case of failure.
+ */
+static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_softif_vlan *vlan;
+ int ret;
+
+ /* only 802.1Q vlans are supported.
+ * batman-adv does not know how to handle other types
+ */
+ if (proto != htons(ETH_P_8021Q))
+ return -EINVAL;
+
+ vid |= BATADV_VLAN_HAS_TAG;
+
+ /* if a new vlan is getting created and it already exists, it means that
+ * it was not deleted yet. batadv_softif_vlan_get() increases the
+ * refcount in order to revive the object.
+ *
+ * if it does not exist then create it.
+ */
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ return batadv_softif_create_vlan(bat_priv, vid);
+
+ /* recreate the sysfs object if it was already destroyed (and it should
+ * be since we received a kill_vid() for this vlan
+ */
+ if (!vlan->kobj) {
+ ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
+ if (ret) {
+ batadv_softif_vlan_free_ref(vlan);
+ return ret;
+ }
+ }
+
+ /* add a new TT local entry. This one will be marked with the NOPURGE
+ * flag. This must be added again, even if the vlan object already
+ * exists, because the entry was deleted by kill_vid()
+ */
+ batadv_tt_local_add(bat_priv->soft_iface,
+ bat_priv->soft_iface->dev_addr, vid,
+ BATADV_NULL_IFINDEX, BATADV_NO_MARK);
+
+ return 0;
+}
+
+/**
+ * batadv_interface_kill_vid - ndo_kill_vid API implementation
+ * @dev: the netdev of the mesh interface
+ * @vid: identifier of the deleted vlan
+ *
+ * Destroy all the internal structures used to handle the vlan identified by vid
+ * on top of the mesh interface
+ *
+ * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
+ * or -ENOENT if the specified vlan id wasn't registered.
+ */
+static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_softif_vlan *vlan;
+
+ /* only 802.1Q vlans are supported. batman-adv does not know how to
+ * handle other types
+ */
+ if (proto != htons(ETH_P_8021Q))
+ return -EINVAL;
+
+ vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+ if (!vlan)
+ return -ENOENT;
+
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+
+ /* finally free the vlan object */
+ batadv_softif_vlan_free_ref(vlan);
+
+ return 0;
+}
+
+/* batman-adv network devices have devices nesting below it and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key batadv_netdev_xmit_lock_key;
+static struct lock_class_key batadv_netdev_addr_lock_key;
+
+/**
+ * batadv_set_lockdep_class_one - Set lockdep class for a single tx queue
+ * @dev: device which owns the tx queue
+ * @txq: tx queue to modify
+ * @_unused: always NULL
+ */
+static void batadv_set_lockdep_class_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
+}
+
+/**
+ * batadv_set_lockdep_class - Set txq and addr_list lockdep class
+ * @dev: network device to modify
+ */
+static void batadv_set_lockdep_class(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key);
+ netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL);
+}
+
+/**
+ * batadv_softif_destroy_finish - cleans up the remains of a softif
+ * @work: work queue item
+ *
+ * Free the parts of the soft interface which can not be removed under
+ * rtnl lock (to prevent deadlock situations).
+ */
+static void batadv_softif_destroy_finish(struct work_struct *work)
+{
+ struct batadv_softif_vlan *vlan;
+ struct batadv_priv *bat_priv;
+ struct net_device *soft_iface;
+
+ bat_priv = container_of(work, struct batadv_priv,
+ cleanup_work);
+ soft_iface = bat_priv->soft_iface;
+
+ /* destroy the "untagged" VLAN */
+ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (vlan) {
+ batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_softif_vlan_free_ref(vlan);
+ }
+
+ batadv_sysfs_del_meshif(soft_iface);
+ unregister_netdev(soft_iface);
+}
+
+/**
+ * batadv_softif_init_late - late stage initialization of soft interface
+ * @dev: registered network device to modify
+ *
+ * Returns error code on failures
+ */
+static int batadv_softif_init_late(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv;
+ uint32_t random_seqno;
+ int ret;
+ size_t cnt_len = sizeof(uint64_t) * BATADV_CNT_NUM;
+
+ batadv_set_lockdep_class(dev);
+
+ bat_priv = netdev_priv(dev);
+ bat_priv->soft_iface = dev;
+ INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish);
+
+ /* batadv_interface_stats() needs to be available as soon as
+ * register_netdevice() has been called
+ */
+ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t));
+ if (!bat_priv->bat_counters)
+ return -ENOMEM;
+
+ atomic_set(&bat_priv->aggregated_ogms, 1);
+ atomic_set(&bat_priv->bonding, 0);
+#ifdef CONFIG_BATMAN_ADV_BLA
+ atomic_set(&bat_priv->bridge_loop_avoidance, 0);
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ atomic_set(&bat_priv->distributed_arp_table, 1);
+#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ bat_priv->mcast.flags = BATADV_NO_FLAGS;
+ atomic_set(&bat_priv->multicast_mode, 1);
+ atomic_set(&bat_priv->mcast.num_disabled, 0);
+ atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
+ atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
+ atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
+#endif
+ atomic_set(&bat_priv->gw_mode, BATADV_GW_MODE_OFF);
+ atomic_set(&bat_priv->gw_sel_class, 20);
+ atomic_set(&bat_priv->gw.bandwidth_down, 100);
+ atomic_set(&bat_priv->gw.bandwidth_up, 20);
+ atomic_set(&bat_priv->orig_interval, 1000);
+ atomic_set(&bat_priv->hop_penalty, 30);
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ atomic_set(&bat_priv->log_level, 0);
+#endif
+ atomic_set(&bat_priv->fragmentation, 1);
+ atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN);
+ atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN);
+ atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN);
+
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+ atomic_set(&bat_priv->bcast_seqno, 1);
+ atomic_set(&bat_priv->tt.vn, 0);
+ atomic_set(&bat_priv->tt.local_changes, 0);
+ atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
+#ifdef CONFIG_BATMAN_ADV_BLA
+ atomic_set(&bat_priv->bla.num_requests, 0);
+#endif
+ bat_priv->tt.last_changeset = NULL;
+ bat_priv->tt.last_changeset_len = 0;
+ bat_priv->isolation_mark = 0;
+ bat_priv->isolation_mark_mask = 0;
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&bat_priv->frag_seqno, random_seqno);
+
+ bat_priv->primary_if = NULL;
+ bat_priv->num_ifaces = 0;
+
+ batadv_nc_init_bat_priv(bat_priv);
+
+ ret = batadv_algo_select(bat_priv, batadv_routing_algo);
+ if (ret < 0)
+ goto free_bat_counters;
+
+ ret = batadv_debugfs_add_meshif(dev);
+ if (ret < 0)
+ goto free_bat_counters;
+
+ ret = batadv_mesh_init(dev);
+ if (ret < 0)
+ goto unreg_debugfs;
+
+ return 0;
+
+unreg_debugfs:
+ batadv_debugfs_del_meshif(dev);
+free_bat_counters:
+ free_percpu(bat_priv->bat_counters);
+ bat_priv->bat_counters = NULL;
+
+ return ret;
+}
+
+/**
+ * batadv_softif_slave_add - Add a slave interface to a batadv_soft_interface
+ * @dev: batadv_soft_interface used as master interface
+ * @slave_dev: net_device which should become the slave interface
+ *
+ * Return 0 if successful or error otherwise.
+ */
+static int batadv_softif_slave_add(struct net_device *dev,
+ struct net_device *slave_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+ if (!hard_iface || hard_iface->soft_iface != NULL)
+ goto out;
+
+ ret = batadv_hardif_enable_interface(hard_iface, dev->name);
+
+out:
+ if (hard_iface)
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
+}
+
+/**
+ * batadv_softif_slave_del - Delete a slave iface from a batadv_soft_interface
+ * @dev: batadv_soft_interface used as master interface
+ * @slave_dev: net_device which should be removed from the master interface
+ *
+ * Return 0 if successful or error otherwise.
+ */
+static int batadv_softif_slave_del(struct net_device *dev,
+ struct net_device *slave_dev)
+{
+ struct batadv_hard_iface *hard_iface;
+ int ret = -EINVAL;
+
+ hard_iface = batadv_hardif_get_by_netdev(slave_dev);
+
+ if (!hard_iface || hard_iface->soft_iface != dev)
+ goto out;
+
+ batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP);
+ ret = 0;
+
+out:
+ if (hard_iface)
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
+}
+
+static const struct net_device_ops batadv_netdev_ops = {
+ .ndo_init = batadv_softif_init_late,
+ .ndo_open = batadv_interface_open,
+ .ndo_stop = batadv_interface_release,
+ .ndo_get_stats = batadv_interface_stats,
+ .ndo_vlan_rx_add_vid = batadv_interface_add_vid,
+ .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
+ .ndo_set_mac_address = batadv_interface_set_mac_addr,
+ .ndo_change_mtu = batadv_interface_change_mtu,
+ .ndo_set_rx_mode = batadv_interface_set_rx_mode,
+ .ndo_start_xmit = batadv_interface_tx,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_add_slave = batadv_softif_slave_add,
+ .ndo_del_slave = batadv_softif_slave_del,
+};
+
+/**
+ * batadv_softif_free - Deconstructor of batadv_soft_interface
+ * @dev: Device to cleanup and remove
+ */
+static void batadv_softif_free(struct net_device *dev)
+{
+ batadv_debugfs_del_meshif(dev);
+ batadv_mesh_free(dev);
+
+ /* some scheduled RCU callbacks need the bat_priv struct to accomplish
+ * their tasks. Wait for them all to be finished before freeing the
+ * netdev and its private data (bat_priv)
+ */
+ rcu_barrier();
+
+ free_netdev(dev);
+}
+
+/**
+ * batadv_softif_init_early - early stage initialization of soft interface
+ * @dev: registered network device to modify
+ */
+static void batadv_softif_init_early(struct net_device *dev)
+{
+ struct batadv_priv *priv = netdev_priv(dev);
+
+ ether_setup(dev);
+
+ dev->netdev_ops = &batadv_netdev_ops;
+ dev->destructor = batadv_softif_free;
+ dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+ dev->tx_queue_len = 0;
+
+ /* can't call min_mtu, because the needed variables
+ * have not been initialized yet
+ */
+ dev->mtu = ETH_DATA_LEN;
+ /* reserve more space in the skbuff for our header */
+ dev->hard_header_len = batadv_max_header_len();
+
+ /* generate random address */
+ eth_hw_addr_random(dev);
+
+ dev->ethtool_ops = &batadv_ethtool_ops;
+
+ memset(priv, 0, sizeof(*priv));
+}
+
+struct net_device *batadv_softif_create(const char *name)
+{
+ struct net_device *soft_iface;
+ int ret;
+
+ soft_iface = alloc_netdev(sizeof(struct batadv_priv), name,
+ NET_NAME_UNKNOWN, batadv_softif_init_early);
+ if (!soft_iface)
+ return NULL;
+
+ soft_iface->rtnl_link_ops = &batadv_link_ops;
+
+ ret = register_netdevice(soft_iface);
+ if (ret < 0) {
+ pr_err("Unable to register the batman interface '%s': %i\n",
+ name, ret);
+ free_netdev(soft_iface);
+ return NULL;
+ }
+
+ return soft_iface;
+}
+
+/**
+ * batadv_softif_destroy_sysfs - deletion of batadv_soft_interface via sysfs
+ * @soft_iface: the to-be-removed batman-adv interface
+ */
+void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+
+ queue_work(batadv_event_workqueue, &bat_priv->cleanup_work);
+}
+
+/**
+ * batadv_softif_destroy_netlink - deletion of batadv_soft_interface via netlink
+ * @soft_iface: the to-be-removed batman-adv interface
+ * @head: list pointer
+ */
+static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
+ struct list_head *head)
+{
+ struct batadv_hard_iface *hard_iface;
+
+ list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface == soft_iface)
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_KEEP);
+ }
+
+ batadv_sysfs_del_meshif(soft_iface);
+ unregister_netdevice_queue(soft_iface, head);
+}
+
+int batadv_softif_is_valid(const struct net_device *net_dev)
+{
+ if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
+ return 1;
+
+ return 0;
+}
+
+struct rtnl_link_ops batadv_link_ops __read_mostly = {
+ .kind = "batadv",
+ .priv_size = sizeof(struct batadv_priv),
+ .setup = batadv_softif_init_early,
+ .dellink = batadv_softif_destroy_netlink,
+};
+
+/* ethtool */
+static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ cmd->supported = 0;
+ cmd->advertising = 0;
+ ethtool_cmd_speed_set(cmd, SPEED_10);
+ cmd->duplex = DUPLEX_FULL;
+ cmd->port = PORT_TP;
+ cmd->phy_address = 0;
+ cmd->transceiver = XCVR_INTERNAL;
+ cmd->autoneg = AUTONEG_DISABLE;
+ cmd->maxtxpkt = 0;
+ cmd->maxrxpkt = 0;
+
+ return 0;
+}
+
+static void batadv_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
+ strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
+ strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strlcpy(info->bus_info, "batman", sizeof(info->bus_info));
+}
+
+static u32 batadv_get_msglevel(struct net_device *dev)
+{
+ return -EOPNOTSUPP;
+}
+
+static void batadv_set_msglevel(struct net_device *dev, u32 value)
+{
+}
+
+static u32 batadv_get_link(struct net_device *dev)
+{
+ return 1;
+}
+
+/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702
+ * Declare each description string in struct.name[] to get fixed sized buffer
+ * and compile time checking for strings longer than ETH_GSTRING_LEN.
+ */
+static const struct {
+ const char name[ETH_GSTRING_LEN];
+} batadv_counters_strings[] = {
+ { "tx" },
+ { "tx_bytes" },
+ { "tx_dropped" },
+ { "rx" },
+ { "rx_bytes" },
+ { "forward" },
+ { "forward_bytes" },
+ { "mgmt_tx" },
+ { "mgmt_tx_bytes" },
+ { "mgmt_rx" },
+ { "mgmt_rx_bytes" },
+ { "frag_tx" },
+ { "frag_tx_bytes" },
+ { "frag_rx" },
+ { "frag_rx_bytes" },
+ { "frag_fwd" },
+ { "frag_fwd_bytes" },
+ { "tt_request_tx" },
+ { "tt_request_rx" },
+ { "tt_response_tx" },
+ { "tt_response_rx" },
+ { "tt_roam_adv_tx" },
+ { "tt_roam_adv_rx" },
+#ifdef CONFIG_BATMAN_ADV_DAT
+ { "dat_get_tx" },
+ { "dat_get_rx" },
+ { "dat_put_tx" },
+ { "dat_put_rx" },
+ { "dat_cached_reply_tx" },
+#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ { "nc_code" },
+ { "nc_code_bytes" },
+ { "nc_recode" },
+ { "nc_recode_bytes" },
+ { "nc_buffer" },
+ { "nc_decode" },
+ { "nc_decode_bytes" },
+ { "nc_decode_failed" },
+ { "nc_sniffed" },
+#endif
+};
+
+static void batadv_get_strings(struct net_device *dev, uint32_t stringset,
+ uint8_t *data)
+{
+ if (stringset == ETH_SS_STATS)
+ memcpy(data, batadv_counters_strings,
+ sizeof(batadv_counters_strings));
+}
+
+static void batadv_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < BATADV_CNT_NUM; i++)
+ data[i] = batadv_sum_counter(bat_priv, i);
+}
+
+static int batadv_get_sset_count(struct net_device *dev, int stringset)
+{
+ if (stringset == ETH_SS_STATS)
+ return BATADV_CNT_NUM;
+
+ return -EOPNOTSUPP;
+}
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
new file mode 100644
index 000000000..dbab22fd8
--- /dev/null
+++ b/net/batman-adv/soft-interface.h
@@ -0,0 +1,34 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_
+#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_
+
+int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
+void batadv_interface_rx(struct net_device *soft_iface,
+ struct sk_buff *skb, struct batadv_hard_iface *recv_if,
+ int hdr_size, struct batadv_orig_node *orig_node);
+struct net_device *batadv_softif_create(const char *name);
+void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
+int batadv_softif_is_valid(const struct net_device *net_dev);
+extern struct rtnl_link_ops batadv_link_ops;
+int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
+void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan);
+struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
+ unsigned short vid);
+
+#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
new file mode 100644
index 000000000..a75dc12f9
--- /dev/null
+++ b/net/batman-adv/sysfs.c
@@ -0,0 +1,935 @@
+/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "sysfs.h"
+#include "translation-table.h"
+#include "distributed-arp-table.h"
+#include "network-coding.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "soft-interface.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+
+static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
+{
+ struct device *dev = container_of(obj->parent, struct device, kobj);
+
+ return to_net_dev(dev);
+}
+
+static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(obj);
+
+ return netdev_priv(net_dev);
+}
+
+/**
+ * batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv
+ * @obj: kobject to covert
+ *
+ * Returns the associated batadv_priv struct.
+ */
+static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
+{
+ /* VLAN specific attributes are located in the root sysfs folder if they
+ * refer to the untagged VLAN..
+ */
+ if (!strcmp(BATADV_SYSFS_IF_MESH_SUBDIR, obj->name))
+ return batadv_kobj_to_batpriv(obj);
+
+ /* ..while the attributes for the tagged vlans are located in
+ * the in the corresponding "vlan%VID" subfolder
+ */
+ return batadv_kobj_to_batpriv(obj->parent);
+}
+
+/**
+ * batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct
+ * @obj: kobject to covert
+ *
+ * Returns the associated softif_vlan struct if found, NULL otherwise.
+ */
+static struct batadv_softif_vlan *
+batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
+{
+ struct batadv_softif_vlan *vlan_tmp, *vlan = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) {
+ if (vlan_tmp->kobj != obj)
+ continue;
+
+ if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+ continue;
+
+ vlan = vlan_tmp;
+ break;
+ }
+ rcu_read_unlock();
+
+ return vlan;
+}
+
+#define BATADV_UEV_TYPE_VAR "BATTYPE="
+#define BATADV_UEV_ACTION_VAR "BATACTION="
+#define BATADV_UEV_DATA_VAR "BATDATA="
+
+static char *batadv_uev_action_str[] = {
+ "add",
+ "del",
+ "change"
+};
+
+static char *batadv_uev_type_str[] = {
+ "gw"
+};
+
+/* Use this, if you have customized show and store functions for vlan attrs */
+#define BATADV_ATTR_VLAN(_name, _mode, _show, _store) \
+struct batadv_attribute batadv_attr_vlan_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+/* Use this, if you have customized show and store functions */
+#define BATADV_ATTR(_name, _mode, _show, _store) \
+struct batadv_attribute batadv_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
+ssize_t batadv_store_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff, \
+ size_t count) \
+{ \
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
+ struct batadv_priv *bat_priv = netdev_priv(net_dev); \
+ \
+ return __batadv_store_bool_attr(buff, count, _post_func, attr, \
+ &bat_priv->_name, net_dev); \
+}
+
+#define BATADV_ATTR_SIF_SHOW_BOOL(_name) \
+ssize_t batadv_show_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff) \
+{ \
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
+ \
+ return sprintf(buff, "%s\n", \
+ atomic_read(&bat_priv->_name) == 0 ? \
+ "disabled" : "enabled"); \
+} \
+
+/* Use this, if you are going to turn a [name] in the soft-interface
+ * (bat_priv) on or off
+ */
+#define BATADV_ATTR_SIF_BOOL(_name, _mode, _post_func) \
+ static BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
+ static BATADV_ATTR_SIF_SHOW_BOOL(_name) \
+ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
+ batadv_store_##_name)
+
+#define BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func) \
+ssize_t batadv_store_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff, \
+ size_t count) \
+{ \
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
+ struct batadv_priv *bat_priv = netdev_priv(net_dev); \
+ \
+ return __batadv_store_uint_attr(buff, count, _min, _max, \
+ _post_func, attr, \
+ &bat_priv->_name, net_dev); \
+}
+
+#define BATADV_ATTR_SIF_SHOW_UINT(_name) \
+ssize_t batadv_show_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff) \
+{ \
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
+ \
+ return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name)); \
+} \
+
+/* Use this, if you are going to set [name] in the soft-interface
+ * (bat_priv) to an unsigned integer value
+ */
+#define BATADV_ATTR_SIF_UINT(_name, _mode, _min, _max, _post_func) \
+ static BATADV_ATTR_SIF_STORE_UINT(_name, _min, _max, _post_func)\
+ static BATADV_ATTR_SIF_SHOW_UINT(_name) \
+ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
+ batadv_store_##_name)
+
+#define BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
+ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff, \
+ size_t count) \
+{ \
+ struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
+ struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
+ kobj); \
+ size_t res = __batadv_store_bool_attr(buff, count, _post_func, \
+ attr, &vlan->_name, \
+ bat_priv->soft_iface); \
+ \
+ batadv_softif_vlan_free_ref(vlan); \
+ return res; \
+}
+
+#define BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
+ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff) \
+{ \
+ struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
+ struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
+ kobj); \
+ size_t res = sprintf(buff, "%s\n", \
+ atomic_read(&vlan->_name) == 0 ? \
+ "disabled" : "enabled"); \
+ \
+ batadv_softif_vlan_free_ref(vlan); \
+ return res; \
+}
+
+/* Use this, if you are going to turn a [name] in the vlan struct on or off */
+#define BATADV_ATTR_VLAN_BOOL(_name, _mode, _post_func) \
+ static BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
+ static BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
+ static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \
+ batadv_store_vlan_##_name)
+
+static int batadv_store_bool_attr(char *buff, size_t count,
+ struct net_device *net_dev,
+ const char *attr_name, atomic_t *attr)
+{
+ int enabled = -1;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ if ((strncmp(buff, "1", 2) == 0) ||
+ (strncmp(buff, "enable", 7) == 0) ||
+ (strncmp(buff, "enabled", 8) == 0))
+ enabled = 1;
+
+ if ((strncmp(buff, "0", 2) == 0) ||
+ (strncmp(buff, "disable", 8) == 0) ||
+ (strncmp(buff, "disabled", 9) == 0))
+ enabled = 0;
+
+ if (enabled < 0) {
+ batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
+ attr_name, buff);
+ return -EINVAL;
+ }
+
+ if (atomic_read(attr) == enabled)
+ return count;
+
+ batadv_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name,
+ atomic_read(attr) == 1 ? "enabled" : "disabled",
+ enabled == 1 ? "enabled" : "disabled");
+
+ atomic_set(attr, (unsigned int)enabled);
+ return count;
+}
+
+static inline ssize_t
+__batadv_store_bool_attr(char *buff, size_t count,
+ void (*post_func)(struct net_device *),
+ struct attribute *attr,
+ atomic_t *attr_store, struct net_device *net_dev)
+{
+ int ret;
+
+ ret = batadv_store_bool_attr(buff, count, net_dev, attr->name,
+ attr_store);
+ if (post_func && ret)
+ post_func(net_dev);
+
+ return ret;
+}
+
+static int batadv_store_uint_attr(const char *buff, size_t count,
+ struct net_device *net_dev,
+ const char *attr_name,
+ unsigned int min, unsigned int max,
+ atomic_t *attr)
+{
+ unsigned long uint_val;
+ int ret;
+
+ ret = kstrtoul(buff, 10, &uint_val);
+ if (ret) {
+ batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
+ attr_name, buff);
+ return -EINVAL;
+ }
+
+ if (uint_val < min) {
+ batadv_info(net_dev, "%s: Value is too small: %lu min: %u\n",
+ attr_name, uint_val, min);
+ return -EINVAL;
+ }
+
+ if (uint_val > max) {
+ batadv_info(net_dev, "%s: Value is too big: %lu max: %u\n",
+ attr_name, uint_val, max);
+ return -EINVAL;
+ }
+
+ if (atomic_read(attr) == uint_val)
+ return count;
+
+ batadv_info(net_dev, "%s: Changing from: %i to: %lu\n",
+ attr_name, atomic_read(attr), uint_val);
+
+ atomic_set(attr, uint_val);
+ return count;
+}
+
+static inline ssize_t
+__batadv_store_uint_attr(const char *buff, size_t count,
+ int min, int max,
+ void (*post_func)(struct net_device *),
+ const struct attribute *attr,
+ atomic_t *attr_store, struct net_device *net_dev)
+{
+ int ret;
+
+ ret = batadv_store_uint_attr(buff, count, net_dev, attr->name, min, max,
+ attr_store);
+ if (post_func && ret)
+ post_func(net_dev);
+
+ return ret;
+}
+
+static ssize_t batadv_show_bat_algo(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+
+ return sprintf(buff, "%s\n", bat_priv->bat_algo_ops->name);
+}
+
+static void batadv_post_gw_reselect(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+
+ batadv_gw_reselect(bat_priv);
+}
+
+static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
+ char *buff)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+ int bytes_written;
+
+ switch (atomic_read(&bat_priv->gw_mode)) {
+ case BATADV_GW_MODE_CLIENT:
+ bytes_written = sprintf(buff, "%s\n",
+ BATADV_GW_MODE_CLIENT_NAME);
+ break;
+ case BATADV_GW_MODE_SERVER:
+ bytes_written = sprintf(buff, "%s\n",
+ BATADV_GW_MODE_SERVER_NAME);
+ break;
+ default:
+ bytes_written = sprintf(buff, "%s\n",
+ BATADV_GW_MODE_OFF_NAME);
+ break;
+ }
+
+ return bytes_written;
+}
+
+static ssize_t batadv_store_gw_mode(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ char *curr_gw_mode_str;
+ int gw_mode_tmp = -1;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ if (strncmp(buff, BATADV_GW_MODE_OFF_NAME,
+ strlen(BATADV_GW_MODE_OFF_NAME)) == 0)
+ gw_mode_tmp = BATADV_GW_MODE_OFF;
+
+ if (strncmp(buff, BATADV_GW_MODE_CLIENT_NAME,
+ strlen(BATADV_GW_MODE_CLIENT_NAME)) == 0)
+ gw_mode_tmp = BATADV_GW_MODE_CLIENT;
+
+ if (strncmp(buff, BATADV_GW_MODE_SERVER_NAME,
+ strlen(BATADV_GW_MODE_SERVER_NAME)) == 0)
+ gw_mode_tmp = BATADV_GW_MODE_SERVER;
+
+ if (gw_mode_tmp < 0) {
+ batadv_info(net_dev,
+ "Invalid parameter for 'gw mode' setting received: %s\n",
+ buff);
+ return -EINVAL;
+ }
+
+ if (atomic_read(&bat_priv->gw_mode) == gw_mode_tmp)
+ return count;
+
+ switch (atomic_read(&bat_priv->gw_mode)) {
+ case BATADV_GW_MODE_CLIENT:
+ curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME;
+ break;
+ case BATADV_GW_MODE_SERVER:
+ curr_gw_mode_str = BATADV_GW_MODE_SERVER_NAME;
+ break;
+ default:
+ curr_gw_mode_str = BATADV_GW_MODE_OFF_NAME;
+ break;
+ }
+
+ batadv_info(net_dev, "Changing gw mode from: %s to: %s\n",
+ curr_gw_mode_str, buff);
+
+ /* Invoking batadv_gw_reselect() is not enough to really de-select the
+ * current GW. It will only instruct the gateway client code to perform
+ * a re-election the next time that this is needed.
+ *
+ * When gw client mode is being switched off the current GW must be
+ * de-selected explicitly otherwise no GW_ADD uevent is thrown on
+ * client mode re-activation. This is operation is performed in
+ * batadv_gw_check_client_stop().
+ */
+ batadv_gw_reselect(bat_priv);
+ /* always call batadv_gw_check_client_stop() before changing the gateway
+ * state
+ */
+ batadv_gw_check_client_stop(bat_priv);
+ atomic_set(&bat_priv->gw_mode, (unsigned int)gw_mode_tmp);
+ batadv_gw_tvlv_container_update(bat_priv);
+ return count;
+}
+
+static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+ uint32_t down, up;
+
+ down = atomic_read(&bat_priv->gw.bandwidth_down);
+ up = atomic_read(&bat_priv->gw.bandwidth_up);
+
+ return sprintf(buff, "%u.%u/%u.%u MBit\n", down / 10,
+ down % 10, up / 10, up % 10);
+}
+
+static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ return batadv_gw_bandwidth_set(net_dev, buff, count);
+}
+
+/**
+ * batadv_show_isolation_mark - print the current isolation mark/mask
+ * @kobj: kobject representing the private mesh sysfs directory
+ * @attr: the batman-adv attribute the user is interacting with
+ * @buff: the buffer that will contain the data to send back to the user
+ *
+ * Returns the number of bytes written into 'buff' on success or a negative
+ * error code in case of failure
+ */
+static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
+
+ return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark,
+ bat_priv->isolation_mark_mask);
+}
+
+/**
+ * batadv_store_isolation_mark - parse and store the isolation mark/mask entered
+ * by the user
+ * @kobj: kobject representing the private mesh sysfs directory
+ * @attr: the batman-adv attribute the user is interacting with
+ * @buff: the buffer containing the user data
+ * @count: number of bytes in the buffer
+ *
+ * Returns 'count' on success or a negative error code in case of failure
+ */
+static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ uint32_t mark, mask;
+ char *mask_ptr;
+
+ /* parse the mask if it has been specified, otherwise assume the mask is
+ * the biggest possible
+ */
+ mask = 0xFFFFFFFF;
+ mask_ptr = strchr(buff, '/');
+ if (mask_ptr) {
+ *mask_ptr = '\0';
+ mask_ptr++;
+
+ /* the mask must be entered in hex base as it is going to be a
+ * bitmask and not a prefix length
+ */
+ if (kstrtou32(mask_ptr, 16, &mask) < 0)
+ return -EINVAL;
+ }
+
+ /* the mark can be entered in any base */
+ if (kstrtou32(buff, 0, &mark) < 0)
+ return -EINVAL;
+
+ bat_priv->isolation_mark_mask = mask;
+ /* erase bits not covered by the mask */
+ bat_priv->isolation_mark = mark & bat_priv->isolation_mark_mask;
+
+ batadv_info(net_dev,
+ "New skb mark for extended isolation: %#.8x/%#.8x\n",
+ bat_priv->isolation_mark, bat_priv->isolation_mark_mask);
+
+ return count;
+}
+
+BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL);
+#ifdef CONFIG_BATMAN_ADV_BLA
+BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR, NULL);
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR,
+ batadv_dat_status_update);
+#endif
+BATADV_ATTR_SIF_BOOL(fragmentation, S_IRUGO | S_IWUSR, batadv_update_min_mtu);
+static BATADV_ATTR(routing_algo, S_IRUGO, batadv_show_bat_algo, NULL);
+static BATADV_ATTR(gw_mode, S_IRUGO | S_IWUSR, batadv_show_gw_mode,
+ batadv_store_gw_mode);
+BATADV_ATTR_SIF_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER,
+ INT_MAX, NULL);
+BATADV_ATTR_SIF_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE,
+ NULL);
+BATADV_ATTR_SIF_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, BATADV_TQ_MAX_VALUE,
+ batadv_post_gw_reselect);
+static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth,
+ batadv_store_gw_bwidth);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+BATADV_ATTR_SIF_BOOL(multicast_mode, S_IRUGO | S_IWUSR, NULL);
+#endif
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+BATADV_ATTR_SIF_UINT(log_level, S_IRUGO | S_IWUSR, 0, BATADV_DBG_ALL, NULL);
+#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+BATADV_ATTR_SIF_BOOL(network_coding, S_IRUGO | S_IWUSR,
+ batadv_nc_status_update);
+#endif
+static BATADV_ATTR(isolation_mark, S_IRUGO | S_IWUSR,
+ batadv_show_isolation_mark, batadv_store_isolation_mark);
+
+static struct batadv_attribute *batadv_mesh_attrs[] = {
+ &batadv_attr_aggregated_ogms,
+ &batadv_attr_bonding,
+#ifdef CONFIG_BATMAN_ADV_BLA
+ &batadv_attr_bridge_loop_avoidance,
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ &batadv_attr_distributed_arp_table,
+#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ &batadv_attr_multicast_mode,
+#endif
+ &batadv_attr_fragmentation,
+ &batadv_attr_routing_algo,
+ &batadv_attr_gw_mode,
+ &batadv_attr_orig_interval,
+ &batadv_attr_hop_penalty,
+ &batadv_attr_gw_sel_class,
+ &batadv_attr_gw_bandwidth,
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ &batadv_attr_log_level,
+#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ &batadv_attr_network_coding,
+#endif
+ &batadv_attr_isolation_mark,
+ NULL,
+};
+
+BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL);
+
+/**
+ * batadv_vlan_attrs - array of vlan specific sysfs attributes
+ */
+static struct batadv_attribute *batadv_vlan_attrs[] = {
+ &batadv_attr_vlan_ap_isolation,
+ NULL,
+};
+
+int batadv_sysfs_add_meshif(struct net_device *dev)
+{
+ struct kobject *batif_kobject = &dev->dev.kobj;
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_attribute **bat_attr;
+ int err;
+
+ bat_priv->mesh_obj = kobject_create_and_add(BATADV_SYSFS_IF_MESH_SUBDIR,
+ batif_kobject);
+ if (!bat_priv->mesh_obj) {
+ batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
+ BATADV_SYSFS_IF_MESH_SUBDIR);
+ goto out;
+ }
+
+ for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) {
+ err = sysfs_create_file(bat_priv->mesh_obj,
+ &((*bat_attr)->attr));
+ if (err) {
+ batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
+ dev->name, BATADV_SYSFS_IF_MESH_SUBDIR,
+ ((*bat_attr)->attr).name);
+ goto rem_attr;
+ }
+ }
+
+ return 0;
+
+rem_attr:
+ for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
+ sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
+
+ kobject_put(bat_priv->mesh_obj);
+ bat_priv->mesh_obj = NULL;
+out:
+ return -ENOMEM;
+}
+
+void batadv_sysfs_del_meshif(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_attribute **bat_attr;
+
+ for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
+ sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
+
+ kobject_put(bat_priv->mesh_obj);
+ bat_priv->mesh_obj = NULL;
+}
+
+/**
+ * batadv_sysfs_add_vlan - add all the needed sysfs objects for the new vlan
+ * @dev: netdev of the mesh interface
+ * @vlan: private data of the newly added VLAN interface
+ *
+ * Returns 0 on success and -ENOMEM if any of the structure allocations fails.
+ */
+int batadv_sysfs_add_vlan(struct net_device *dev,
+ struct batadv_softif_vlan *vlan)
+{
+ char vlan_subdir[sizeof(BATADV_SYSFS_VLAN_SUBDIR_PREFIX) + 5];
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct batadv_attribute **bat_attr;
+ int err;
+
+ if (vlan->vid & BATADV_VLAN_HAS_TAG) {
+ sprintf(vlan_subdir, BATADV_SYSFS_VLAN_SUBDIR_PREFIX "%hu",
+ vlan->vid & VLAN_VID_MASK);
+
+ vlan->kobj = kobject_create_and_add(vlan_subdir,
+ bat_priv->mesh_obj);
+ if (!vlan->kobj) {
+ batadv_err(dev, "Can't add sysfs directory: %s/%s\n",
+ dev->name, vlan_subdir);
+ goto out;
+ }
+ } else {
+ /* the untagged LAN uses the root folder to store its "VLAN
+ * specific attributes"
+ */
+ vlan->kobj = bat_priv->mesh_obj;
+ kobject_get(bat_priv->mesh_obj);
+ }
+
+ for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) {
+ err = sysfs_create_file(vlan->kobj,
+ &((*bat_attr)->attr));
+ if (err) {
+ batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
+ dev->name, vlan_subdir,
+ ((*bat_attr)->attr).name);
+ goto rem_attr;
+ }
+ }
+
+ return 0;
+
+rem_attr:
+ for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
+ sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
+
+ kobject_put(vlan->kobj);
+ vlan->kobj = NULL;
+out:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_sysfs_del_vlan - remove all the sysfs objects for a given VLAN
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vlan: the private data of the VLAN to destroy
+ */
+void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
+ struct batadv_softif_vlan *vlan)
+{
+ struct batadv_attribute **bat_attr;
+
+ for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
+ sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
+
+ kobject_put(vlan->kobj);
+ vlan->kobj = NULL;
+}
+
+static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_hard_iface *hard_iface;
+ ssize_t length;
+ const char *ifname;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface)
+ return 0;
+
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+ ifname = "none";
+ else
+ ifname = hard_iface->soft_iface->name;
+
+ length = sprintf(buff, "%s\n", ifname);
+
+ batadv_hardif_free_ref(hard_iface);
+
+ return length;
+}
+
+static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
+ struct attribute *attr, char *buff,
+ size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_hard_iface *hard_iface;
+ int status_tmp = -1;
+ int ret = count;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface)
+ return count;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ if (strlen(buff) >= IFNAMSIZ) {
+ pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
+ buff);
+ batadv_hardif_free_ref(hard_iface);
+ return -EINVAL;
+ }
+
+ if (strncmp(buff, "none", 4) == 0)
+ status_tmp = BATADV_IF_NOT_IN_USE;
+ else
+ status_tmp = BATADV_IF_I_WANT_YOU;
+
+ if (hard_iface->if_status == status_tmp)
+ goto out;
+
+ if ((hard_iface->soft_iface) &&
+ (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0))
+ goto out;
+
+ rtnl_lock();
+
+ if (status_tmp == BATADV_IF_NOT_IN_USE) {
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
+ goto unlock;
+ }
+
+ /* if the interface already is in use */
+ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
+ batadv_hardif_disable_interface(hard_iface,
+ BATADV_IF_CLEANUP_AUTO);
+
+ ret = batadv_hardif_enable_interface(hard_iface, buff);
+
+unlock:
+ rtnl_unlock();
+out:
+ batadv_hardif_free_ref(hard_iface);
+ return ret;
+}
+
+static ssize_t batadv_show_iface_status(struct kobject *kobj,
+ struct attribute *attr, char *buff)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_hard_iface *hard_iface;
+ ssize_t length;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface)
+ return 0;
+
+ switch (hard_iface->if_status) {
+ case BATADV_IF_TO_BE_REMOVED:
+ length = sprintf(buff, "disabling\n");
+ break;
+ case BATADV_IF_INACTIVE:
+ length = sprintf(buff, "inactive\n");
+ break;
+ case BATADV_IF_ACTIVE:
+ length = sprintf(buff, "active\n");
+ break;
+ case BATADV_IF_TO_BE_ACTIVATED:
+ length = sprintf(buff, "enabling\n");
+ break;
+ case BATADV_IF_NOT_IN_USE:
+ default:
+ length = sprintf(buff, "not in use\n");
+ break;
+ }
+
+ batadv_hardif_free_ref(hard_iface);
+
+ return length;
+}
+
+static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
+ batadv_store_mesh_iface);
+static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
+
+static struct batadv_attribute *batadv_batman_attrs[] = {
+ &batadv_attr_mesh_iface,
+ &batadv_attr_iface_status,
+ NULL,
+};
+
+int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev)
+{
+ struct kobject *hardif_kobject = &dev->dev.kobj;
+ struct batadv_attribute **bat_attr;
+ int err;
+
+ *hardif_obj = kobject_create_and_add(BATADV_SYSFS_IF_BAT_SUBDIR,
+ hardif_kobject);
+
+ if (!*hardif_obj) {
+ batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
+ BATADV_SYSFS_IF_BAT_SUBDIR);
+ goto out;
+ }
+
+ for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr) {
+ err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr));
+ if (err) {
+ batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
+ dev->name, BATADV_SYSFS_IF_BAT_SUBDIR,
+ ((*bat_attr)->attr).name);
+ goto rem_attr;
+ }
+ }
+
+ return 0;
+
+rem_attr:
+ for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr)
+ sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr));
+out:
+ return -ENOMEM;
+}
+
+void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
+{
+ kobject_put(*hardif_obj);
+ *hardif_obj = NULL;
+}
+
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data)
+{
+ int ret = -ENOMEM;
+ struct kobject *bat_kobj;
+ char *uevent_env[4] = { NULL, NULL, NULL, NULL };
+
+ bat_kobj = &bat_priv->soft_iface->dev.kobj;
+
+ uevent_env[0] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_TYPE_VAR,
+ batadv_uev_type_str[type]);
+ if (!uevent_env[0])
+ goto out;
+
+ uevent_env[1] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_ACTION_VAR,
+ batadv_uev_action_str[action]);
+ if (!uevent_env[1])
+ goto out;
+
+ /* If the event is DEL, ignore the data field */
+ if (action != BATADV_UEV_DEL) {
+ uevent_env[2] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_DATA_VAR, data);
+ if (!uevent_env[2])
+ goto out;
+ }
+
+ ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env);
+out:
+ kfree(uevent_env[0]);
+ kfree(uevent_env[1]);
+ kfree(uevent_env[2]);
+
+ if (ret)
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
+ batadv_uev_type_str[type],
+ batadv_uev_action_str[action],
+ (action == BATADV_UEV_DEL ? "NULL" : data), ret);
+ return ret;
+}
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
new file mode 100644
index 000000000..b715b60db
--- /dev/null
+++ b/net/batman-adv/sysfs.h
@@ -0,0 +1,50 @@
+/* Copyright (C) 2010-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_SYSFS_H_
+#define _NET_BATMAN_ADV_SYSFS_H_
+
+#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh"
+#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv"
+/**
+ * BATADV_SYSFS_VLAN_SUBDIR_PREFIX - prefix of the subfolder that will be
+ * created in the sysfs hierarchy for each VLAN interface. The subfolder will
+ * be named "BATADV_SYSFS_VLAN_SUBDIR_PREFIX%vid".
+ */
+#define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan"
+
+struct batadv_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+ char *buf, size_t count);
+};
+
+int batadv_sysfs_add_meshif(struct net_device *dev);
+void batadv_sysfs_del_meshif(struct net_device *dev);
+int batadv_sysfs_add_hardif(struct kobject **hardif_obj,
+ struct net_device *dev);
+void batadv_sysfs_del_hardif(struct kobject **hardif_obj);
+int batadv_sysfs_add_vlan(struct net_device *dev,
+ struct batadv_softif_vlan *vlan);
+void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
+ struct batadv_softif_vlan *vlan);
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data);
+
+#endif /* _NET_BATMAN_ADV_SYSFS_H_ */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
new file mode 100644
index 000000000..07b263a43
--- /dev/null
+++ b/net/batman-adv/translation-table.c
@@ -0,0 +1,3710 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich, Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+#include "translation-table.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "send.h"
+#include "hash.h"
+#include "originator.h"
+#include "routing.h"
+#include "bridge_loop_avoidance.h"
+#include "multicast.h"
+
+#include <linux/crc32c.h>
+
+/* hash class keys */
+static struct lock_class_key batadv_tt_local_hash_lock_class_key;
+static struct lock_class_key batadv_tt_global_hash_lock_class_key;
+
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node);
+static void batadv_tt_purge(struct work_struct *work);
+static void
+batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry);
+static void batadv_tt_global_del(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid, const char *message,
+ bool roaming);
+
+/* returns 1 if they are the same mac addr */
+static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
+{
+ const void *data1 = container_of(node, struct batadv_tt_common_entry,
+ hash_entry);
+
+ return batadv_compare_eth(data1, data2);
+}
+
+/**
+ * batadv_choose_tt - return the index of the tt entry in the hash table
+ * @data: pointer to the tt_common_entry object to map
+ * @size: the size of the hash table
+ *
+ * Returns the hash index where the object represented by 'data' should be
+ * stored at.
+ */
+static inline uint32_t batadv_choose_tt(const void *data, uint32_t size)
+{
+ struct batadv_tt_common_entry *tt;
+ uint32_t hash = 0;
+
+ tt = (struct batadv_tt_common_entry *)data;
+ hash = batadv_hash_bytes(hash, &tt->addr, ETH_ALEN);
+ hash = batadv_hash_bytes(hash, &tt->vid, sizeof(tt->vid));
+
+ hash += (hash << 3);
+ hash ^= (hash >> 11);
+ hash += (hash << 15);
+
+ return hash % size;
+}
+
+/**
+ * batadv_tt_hash_find - look for a client in the given hash table
+ * @hash: the hash table to search
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Returns a pointer to the tt_common struct belonging to the searched client if
+ * found, NULL otherwise.
+ */
+static struct batadv_tt_common_entry *
+batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr,
+ unsigned short vid)
+{
+ struct hlist_head *head;
+ struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL;
+ uint32_t index;
+
+ if (!hash)
+ return NULL;
+
+ ether_addr_copy(to_search.addr, addr);
+ to_search.vid = vid;
+
+ index = batadv_choose_tt(&to_search, hash->size);
+ head = &hash->table[index];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt, head, hash_entry) {
+ if (!batadv_compare_eth(tt, addr))
+ continue;
+
+ if (tt->vid != vid)
+ continue;
+
+ if (!atomic_inc_not_zero(&tt->refcount))
+ continue;
+
+ tt_tmp = tt;
+ break;
+ }
+ rcu_read_unlock();
+
+ return tt_tmp;
+}
+
+/**
+ * batadv_tt_local_hash_find - search the local table for a given client
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Returns a pointer to the corresponding tt_local_entry struct if the client is
+ * found, NULL otherwise.
+ */
+static struct batadv_tt_local_entry *
+batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_local_entry *tt_local_entry = NULL;
+
+ tt_common_entry = batadv_tt_hash_find(bat_priv->tt.local_hash, addr,
+ vid);
+ if (tt_common_entry)
+ tt_local_entry = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+ return tt_local_entry;
+}
+
+/**
+ * batadv_tt_global_hash_find - search the global table for a given client
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the client to look for
+ * @vid: VLAN identifier
+ *
+ * Returns a pointer to the corresponding tt_global_entry struct if the client
+ * is found, NULL otherwise.
+ */
+static struct batadv_tt_global_entry *
+batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_global_entry *tt_global_entry = NULL;
+
+ tt_common_entry = batadv_tt_hash_find(bat_priv->tt.global_hash, addr,
+ vid);
+ if (tt_common_entry)
+ tt_global_entry = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+ return tt_global_entry;
+}
+
+static void
+batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry)
+{
+ if (atomic_dec_and_test(&tt_local_entry->common.refcount))
+ kfree_rcu(tt_local_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_global_entry_free_ref - decrement the refcounter for a
+ * tt_global_entry and possibly free it
+ * @tt_global_entry: the object to free
+ */
+static void
+batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
+{
+ if (atomic_dec_and_test(&tt_global_entry->common.refcount)) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ kfree_rcu(tt_global_entry, common.rcu);
+ }
+}
+
+/**
+ * batadv_tt_global_hash_count - count the number of orig entries
+ * @hash: hash table containing the tt entries
+ * @addr: the mac address of the client to count entries for
+ * @vid: VLAN identifier
+ *
+ * Return the number of originators advertising the given address/data
+ * (excluding ourself).
+ */
+int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ int count;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ return 0;
+
+ count = atomic_read(&tt_global_entry->orig_list_count);
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+
+ return count;
+}
+
+static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
+
+ /* We are in an rcu callback here, therefore we cannot use
+ * batadv_orig_node_free_ref() and its call_rcu():
+ * An rcu_barrier() wouldn't wait for that to finish
+ */
+ batadv_orig_node_free_ref_now(orig_entry->orig_node);
+ kfree(orig_entry);
+}
+
+/**
+ * batadv_tt_local_size_mod - change the size by v of the local table identified
+ * by vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier of the sub-table to change
+ * @v: the amount to sum to the local table size
+ */
+static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
+ unsigned short vid, int v)
+{
+ struct batadv_softif_vlan *vlan;
+
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (!vlan)
+ return;
+
+ atomic_add(v, &vlan->tt.num_entries);
+
+ batadv_softif_vlan_free_ref(vlan);
+}
+
+/**
+ * batadv_tt_local_size_inc - increase by one the local table size for the given
+ * vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier
+ */
+static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ batadv_tt_local_size_mod(bat_priv, vid, 1);
+}
+
+/**
+ * batadv_tt_local_size_dec - decrease by one the local table size for the given
+ * vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier
+ */
+static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ batadv_tt_local_size_mod(bat_priv, vid, -1);
+}
+
+/**
+ * batadv_tt_global_size_mod - change the size by v of the local table
+ * identified by vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: the VLAN identifier
+ * @v: the amount to sum to the global table size
+ */
+static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
+ unsigned short vid, int v)
+{
+ struct batadv_orig_node_vlan *vlan;
+
+ vlan = batadv_orig_node_vlan_new(orig_node, vid);
+ if (!vlan)
+ return;
+
+ if (atomic_add_return(v, &vlan->tt.num_entries) == 0) {
+ spin_lock_bh(&orig_node->vlan_list_lock);
+ list_del_rcu(&vlan->list);
+ spin_unlock_bh(&orig_node->vlan_list_lock);
+ batadv_orig_node_vlan_free_ref(vlan);
+ }
+
+ batadv_orig_node_vlan_free_ref(vlan);
+}
+
+/**
+ * batadv_tt_global_size_inc - increase by one the global table size for the
+ * given vid
+ * @orig_node: the originator which global table size has to be decreased
+ * @vid: the vlan identifier
+ */
+static void batadv_tt_global_size_inc(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ batadv_tt_global_size_mod(orig_node, vid, 1);
+}
+
+/**
+ * batadv_tt_global_size_dec - decrease by one the global table size for the
+ * given vid
+ * @orig_node: the originator which global table size has to be decreased
+ * @vid: the vlan identifier
+ */
+static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ batadv_tt_global_size_mod(orig_node, vid, -1);
+}
+
+static void
+batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
+{
+ if (!atomic_dec_and_test(&orig_entry->refcount))
+ return;
+
+ call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
+}
+
+/**
+ * batadv_tt_local_event - store a local TT event (ADD/DEL)
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_local_entry: the TT entry involved in the event
+ * @event_flags: flags to store in the event structure
+ */
+static void batadv_tt_local_event(struct batadv_priv *bat_priv,
+ struct batadv_tt_local_entry *tt_local_entry,
+ uint8_t event_flags)
+{
+ struct batadv_tt_change_node *tt_change_node, *entry, *safe;
+ struct batadv_tt_common_entry *common = &tt_local_entry->common;
+ uint8_t flags = common->flags | event_flags;
+ bool event_removed = false;
+ bool del_op_requested, del_op_entry;
+
+ tt_change_node = kmalloc(sizeof(*tt_change_node), GFP_ATOMIC);
+ if (!tt_change_node)
+ return;
+
+ tt_change_node->change.flags = flags;
+ memset(tt_change_node->change.reserved, 0,
+ sizeof(tt_change_node->change.reserved));
+ ether_addr_copy(tt_change_node->change.addr, common->addr);
+ tt_change_node->change.vid = htons(common->vid);
+
+ del_op_requested = flags & BATADV_TT_CLIENT_DEL;
+
+ /* check for ADD+DEL or DEL+ADD events */
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ if (!batadv_compare_eth(entry->change.addr, common->addr))
+ continue;
+
+ /* DEL+ADD in the same orig interval have no effect and can be
+ * removed to avoid silly behaviour on the receiver side. The
+ * other way around (ADD+DEL) can happen in case of roaming of
+ * a client still in the NEW state. Roaming of NEW clients is
+ * now possible due to automatically recognition of "temporary"
+ * clients
+ */
+ del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL;
+ if (!del_op_requested && del_op_entry)
+ goto del;
+ if (del_op_requested && !del_op_entry)
+ goto del;
+
+ /* this is a second add in the same originator interval. It
+ * means that flags have been changed: update them!
+ */
+ if (!del_op_requested && !del_op_entry)
+ entry->change.flags = flags;
+
+ continue;
+del:
+ list_del(&entry->list);
+ kfree(entry);
+ kfree(tt_change_node);
+ event_removed = true;
+ goto unlock;
+ }
+
+ /* track the change in the OGMinterval list */
+ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list);
+
+unlock:
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+
+ if (event_removed)
+ atomic_dec(&bat_priv->tt.local_changes);
+ else
+ atomic_inc(&bat_priv->tt.local_changes);
+}
+
+/**
+ * batadv_tt_len - compute length in bytes of given number of tt changes
+ * @changes_num: number of tt changes
+ *
+ * Returns computed length in bytes.
+ */
+static int batadv_tt_len(int changes_num)
+{
+ return changes_num * sizeof(struct batadv_tvlv_tt_change);
+}
+
+/**
+ * batadv_tt_entries - compute the number of entries fitting in tt_len bytes
+ * @tt_len: available space
+ *
+ * Returns the number of entries.
+ */
+static uint16_t batadv_tt_entries(uint16_t tt_len)
+{
+ return tt_len / batadv_tt_len(1);
+}
+
+/**
+ * batadv_tt_local_table_transmit_size - calculates the local translation table
+ * size when transmitted over the air
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Returns local translation table size in bytes.
+ */
+static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
+{
+ uint16_t num_vlan = 0, tt_local_entries = 0;
+ struct batadv_softif_vlan *vlan;
+ int hdr_size;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ num_vlan++;
+ tt_local_entries += atomic_read(&vlan->tt.num_entries);
+ }
+ rcu_read_unlock();
+
+ /* header size of tvlv encapsulated tt response payload */
+ hdr_size = sizeof(struct batadv_unicast_tvlv_packet);
+ hdr_size += sizeof(struct batadv_tvlv_hdr);
+ hdr_size += sizeof(struct batadv_tvlv_tt_data);
+ hdr_size += num_vlan * sizeof(struct batadv_tvlv_tt_vlan_data);
+
+ return hdr_size + batadv_tt_len(tt_local_entries);
+}
+
+static int batadv_tt_local_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->tt.local_hash)
+ return 0;
+
+ bat_priv->tt.local_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->tt.local_hash)
+ return -ENOMEM;
+
+ batadv_hash_set_lock_class(bat_priv->tt.local_hash,
+ &batadv_tt_local_hash_lock_class_key);
+
+ return 0;
+}
+
+static void batadv_tt_global_free(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global,
+ const char *message)
+{
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ BATADV_PRINT_VID(tt_global->common.vid), message);
+
+ batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt,
+ batadv_choose_tt, &tt_global->common);
+ batadv_tt_global_entry_free_ref(tt_global);
+}
+
+/**
+ * batadv_tt_local_add - add a new client to the local table or update an
+ * existing client
+ * @soft_iface: netdev struct of the mesh interface
+ * @addr: the mac address of the client to add
+ * @vid: VLAN identifier
+ * @ifindex: index of the interface where the client is connected to (useful to
+ * identify wireless clients)
+ * @mark: the value contained in the skb->mark field of the received packet (if
+ * any)
+ *
+ * Returns true if the client was successfully added, false otherwise.
+ */
+bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
+ unsigned short vid, int ifindex, uint32_t mark)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_tt_local_entry *tt_local;
+ struct batadv_tt_global_entry *tt_global = NULL;
+ struct batadv_softif_vlan *vlan;
+ struct net_device *in_dev = NULL;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry;
+ int hash_added, table_size, packet_size_max;
+ bool ret = false, roamed_back = false;
+ uint8_t remote_flags;
+ uint32_t match_mark;
+
+ if (ifindex != BATADV_NULL_IFINDEX)
+ in_dev = dev_get_by_index(&init_net, ifindex);
+
+ tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid);
+
+ if (!is_multicast_ether_addr(addr))
+ tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+
+ if (tt_local) {
+ tt_local->last_seen = jiffies;
+ if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) {
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Re-adding pending client %pM (vid: %d)\n",
+ addr, BATADV_PRINT_VID(vid));
+ /* whatever the reason why the PENDING flag was set,
+ * this is a client which was enqueued to be removed in
+ * this orig_interval. Since it popped up again, the
+ * flag can be reset like it was never enqueued
+ */
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_PENDING;
+ goto add_event;
+ }
+
+ if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) {
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Roaming client %pM (vid: %d) came back to its original location\n",
+ addr, BATADV_PRINT_VID(vid));
+ /* the ROAM flag is set because this client roamed away
+ * and the node got a roaming_advertisement message. Now
+ * that the client popped up again at its original
+ * location such flag can be unset
+ */
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_ROAM;
+ roamed_back = true;
+ }
+ goto check_roaming;
+ }
+
+ /* Ignore the client if we cannot send it in a full table response. */
+ table_size = batadv_tt_local_table_transmit_size(bat_priv);
+ table_size += batadv_tt_len(1);
+ packet_size_max = atomic_read(&bat_priv->packet_size_max);
+ if (table_size > packet_size_max) {
+ net_ratelimited_function(batadv_info, soft_iface,
+ "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n",
+ table_size, packet_size_max, addr);
+ goto out;
+ }
+
+ tt_local = kmalloc(sizeof(*tt_local), GFP_ATOMIC);
+ if (!tt_local)
+ goto out;
+
+ /* increase the refcounter of the related vlan */
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n",
+ addr, BATADV_PRINT_VID(vid),
+ (uint8_t)atomic_read(&bat_priv->tt.vn));
+
+ ether_addr_copy(tt_local->common.addr, addr);
+ /* The local entry has to be marked as NEW to avoid to send it in
+ * a full table response going out before the next ttvn increment
+ * (consistency check)
+ */
+ tt_local->common.flags = BATADV_TT_CLIENT_NEW;
+ tt_local->common.vid = vid;
+ if (batadv_is_wifi_netdev(in_dev))
+ tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
+ atomic_set(&tt_local->common.refcount, 2);
+ tt_local->last_seen = jiffies;
+ tt_local->common.added_at = tt_local->last_seen;
+
+ /* the batman interface mac and multicast addresses should never be
+ * purged
+ */
+ if (batadv_compare_eth(addr, soft_iface->dev_addr) ||
+ is_multicast_ether_addr(addr))
+ tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE;
+
+ hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt,
+ batadv_choose_tt, &tt_local->common,
+ &tt_local->common.hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_tt_local_entry_free_ref(tt_local);
+ batadv_softif_vlan_free_ref(vlan);
+ goto out;
+ }
+
+add_event:
+ batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS);
+
+check_roaming:
+ /* Check whether it is a roaming, but don't do anything if the roaming
+ * process has already been handled
+ */
+ if (tt_global && !(tt_global->common.flags & BATADV_TT_CLIENT_ROAM)) {
+ /* These node are probably going to update their tt table */
+ head = &tt_global->orig_list;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ batadv_send_roam_adv(bat_priv, tt_global->common.addr,
+ tt_global->common.vid,
+ orig_entry->orig_node);
+ }
+ rcu_read_unlock();
+ if (roamed_back) {
+ batadv_tt_global_free(bat_priv, tt_global,
+ "Roaming canceled");
+ tt_global = NULL;
+ } else {
+ /* The global entry has to be marked as ROAMING and
+ * has to be kept for consistency purpose
+ */
+ tt_global->common.flags |= BATADV_TT_CLIENT_ROAM;
+ tt_global->roam_at = jiffies;
+ }
+ }
+
+ /* store the current remote flags before altering them. This helps
+ * understanding is flags are changing or not
+ */
+ remote_flags = tt_local->common.flags & BATADV_TT_REMOTE_MASK;
+
+ if (batadv_is_wifi_netdev(in_dev))
+ tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
+ else
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_WIFI;
+
+ /* check the mark in the skb: if it's equal to the configured
+ * isolation_mark, it means the packet is coming from an isolated
+ * non-mesh client
+ */
+ match_mark = (mark & bat_priv->isolation_mark_mask);
+ if (bat_priv->isolation_mark_mask &&
+ match_mark == bat_priv->isolation_mark)
+ tt_local->common.flags |= BATADV_TT_CLIENT_ISOLA;
+ else
+ tt_local->common.flags &= ~BATADV_TT_CLIENT_ISOLA;
+
+ /* if any "dynamic" flag has been modified, resend an ADD event for this
+ * entry so that all the nodes can get the new flags
+ */
+ if (remote_flags ^ (tt_local->common.flags & BATADV_TT_REMOTE_MASK))
+ batadv_tt_local_event(bat_priv, tt_local, BATADV_NO_FLAGS);
+
+ ret = true;
+out:
+ if (in_dev)
+ dev_put(in_dev);
+ if (tt_local)
+ batadv_tt_local_entry_free_ref(tt_local);
+ if (tt_global)
+ batadv_tt_global_entry_free_ref(tt_global);
+ return ret;
+}
+
+/**
+ * batadv_tt_prepare_tvlv_global_data - prepare the TVLV TT header to send
+ * within a TT Response directed to another node
+ * @orig_node: originator for which the TT data has to be prepared
+ * @tt_data: uninitialised pointer to the address of the TVLV buffer
+ * @tt_change: uninitialised pointer to the address of the area where the TT
+ * changed can be stored
+ * @tt_len: pointer to the length to reserve to the tt_change. if -1 this
+ * function reserves the amount of space needed to send the entire global TT
+ * table. In case of success the value is updated with the real amount of
+ * reserved bytes
+
+ * Allocate the needed amount of memory for the entire TT TVLV and write its
+ * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
+ * objects, one per active VLAN served by the originator node.
+ *
+ * Return the size of the allocated buffer or 0 in case of failure.
+ */
+static uint16_t
+batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_data **tt_data,
+ struct batadv_tvlv_tt_change **tt_change,
+ int32_t *tt_len)
+{
+ uint16_t num_vlan = 0, num_entries = 0, change_offset, tvlv_len;
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ struct batadv_orig_node_vlan *vlan;
+ uint8_t *tt_change_ptr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ num_vlan++;
+ num_entries += atomic_read(&vlan->tt.num_entries);
+ }
+
+ change_offset = sizeof(**tt_data);
+ change_offset += num_vlan * sizeof(*tt_vlan);
+
+ /* if tt_len is negative, allocate the space needed by the full table */
+ if (*tt_len < 0)
+ *tt_len = batadv_tt_len(num_entries);
+
+ tvlv_len = *tt_len;
+ tvlv_len += change_offset;
+
+ *tt_data = kmalloc(tvlv_len, GFP_ATOMIC);
+ if (!*tt_data) {
+ *tt_len = 0;
+ goto out;
+ }
+
+ (*tt_data)->flags = BATADV_NO_FLAGS;
+ (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn);
+ (*tt_data)->num_vlan = htons(num_vlan);
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
+ list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ tt_vlan->vid = htons(vlan->vid);
+ tt_vlan->crc = htonl(vlan->tt.crc);
+
+ tt_vlan++;
+ }
+
+ tt_change_ptr = (uint8_t *)*tt_data + change_offset;
+ *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
+
+out:
+ rcu_read_unlock();
+ return tvlv_len;
+}
+
+/**
+ * batadv_tt_prepare_tvlv_local_data - allocate and prepare the TT TVLV for this
+ * node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_data: uninitialised pointer to the address of the TVLV buffer
+ * @tt_change: uninitialised pointer to the address of the area where the TT
+ * changes can be stored
+ * @tt_len: pointer to the length to reserve to the tt_change. if -1 this
+ * function reserves the amount of space needed to send the entire local TT
+ * table. In case of success the value is updated with the real amount of
+ * reserved bytes
+ *
+ * Allocate the needed amount of memory for the entire TT TVLV and write its
+ * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
+ * objects, one per active VLAN.
+ *
+ * Return the size of the allocated buffer or 0 in case of failure.
+ */
+static uint16_t
+batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data **tt_data,
+ struct batadv_tvlv_tt_change **tt_change,
+ int32_t *tt_len)
+{
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ struct batadv_softif_vlan *vlan;
+ uint16_t num_vlan = 0, num_entries = 0, tvlv_len;
+ uint8_t *tt_change_ptr;
+ int change_offset;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ num_vlan++;
+ num_entries += atomic_read(&vlan->tt.num_entries);
+ }
+
+ change_offset = sizeof(**tt_data);
+ change_offset += num_vlan * sizeof(*tt_vlan);
+
+ /* if tt_len is negative, allocate the space needed by the full table */
+ if (*tt_len < 0)
+ *tt_len = batadv_tt_len(num_entries);
+
+ tvlv_len = *tt_len;
+ tvlv_len += change_offset;
+
+ *tt_data = kmalloc(tvlv_len, GFP_ATOMIC);
+ if (!*tt_data) {
+ tvlv_len = 0;
+ goto out;
+ }
+
+ (*tt_data)->flags = BATADV_NO_FLAGS;
+ (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn);
+ (*tt_data)->num_vlan = htons(num_vlan);
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
+ hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ tt_vlan->vid = htons(vlan->vid);
+ tt_vlan->crc = htonl(vlan->tt.crc);
+
+ tt_vlan++;
+ }
+
+ tt_change_ptr = (uint8_t *)*tt_data + change_offset;
+ *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
+
+out:
+ rcu_read_unlock();
+ return tvlv_len;
+}
+
+/**
+ * batadv_tt_tvlv_container_update - update the translation table tvlv container
+ * after local tt changes have been committed
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_change_node *entry, *safe;
+ struct batadv_tvlv_tt_data *tt_data;
+ struct batadv_tvlv_tt_change *tt_change;
+ int tt_diff_len, tt_change_len = 0;
+ int tt_diff_entries_num = 0, tt_diff_entries_count = 0;
+ uint16_t tvlv_len;
+
+ tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes);
+ tt_diff_len = batadv_tt_len(tt_diff_entries_num);
+
+ /* if we have too many changes for one packet don't send any
+ * and wait for the tt table request which will be fragmented
+ */
+ if (tt_diff_len > bat_priv->soft_iface->mtu)
+ tt_diff_len = 0;
+
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data,
+ &tt_change, &tt_diff_len);
+ if (!tvlv_len)
+ return;
+
+ tt_data->flags = BATADV_TT_OGM_DIFF;
+
+ if (tt_diff_len == 0)
+ goto container_register;
+
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ atomic_set(&bat_priv->tt.local_changes, 0);
+
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ if (tt_diff_entries_count < tt_diff_entries_num) {
+ memcpy(tt_change + tt_diff_entries_count,
+ &entry->change,
+ sizeof(struct batadv_tvlv_tt_change));
+ tt_diff_entries_count++;
+ }
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+
+ /* Keep the buffer for possible tt_request */
+ spin_lock_bh(&bat_priv->tt.last_changeset_lock);
+ kfree(bat_priv->tt.last_changeset);
+ bat_priv->tt.last_changeset_len = 0;
+ bat_priv->tt.last_changeset = NULL;
+ tt_change_len = batadv_tt_len(tt_diff_entries_count);
+ /* check whether this new OGM has no changes due to size problems */
+ if (tt_diff_entries_count > 0) {
+ /* if kmalloc() fails we will reply with the full table
+ * instead of providing the diff
+ */
+ bat_priv->tt.last_changeset = kzalloc(tt_diff_len, GFP_ATOMIC);
+ if (bat_priv->tt.last_changeset) {
+ memcpy(bat_priv->tt.last_changeset,
+ tt_change, tt_change_len);
+ bat_priv->tt.last_changeset_len = tt_diff_len;
+ }
+ }
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+
+container_register:
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data,
+ tvlv_len);
+ kfree(tt_data);
+}
+
+int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_local_entry *tt_local;
+ struct batadv_hard_iface *primary_if;
+ struct batadv_softif_vlan *vlan;
+ struct hlist_head *head;
+ unsigned short vid;
+ uint32_t i;
+ int last_seen_secs;
+ int last_seen_msecs;
+ unsigned long last_seen_jiffies;
+ bool no_purge;
+ uint16_t np_flag = BATADV_TT_CLIENT_NOPURGE;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ seq_printf(seq,
+ "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n",
+ net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn));
+ seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID",
+ "Flags", "Last seen", "CRC");
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ tt_local = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+ vid = tt_common_entry->vid;
+ last_seen_jiffies = jiffies - tt_local->last_seen;
+ last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
+ last_seen_secs = last_seen_msecs / 1000;
+ last_seen_msecs = last_seen_msecs % 1000;
+
+ no_purge = tt_common_entry->flags & np_flag;
+
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (!vlan) {
+ seq_printf(seq, "Cannot retrieve VLAN %d\n",
+ BATADV_PRINT_VID(vid));
+ continue;
+ }
+
+ seq_printf(seq,
+ " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n",
+ tt_common_entry->addr,
+ BATADV_PRINT_VID(tt_common_entry->vid),
+ (tt_common_entry->flags &
+ BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
+ no_purge ? 'P' : '.',
+ (tt_common_entry->flags &
+ BATADV_TT_CLIENT_NEW ? 'N' : '.'),
+ (tt_common_entry->flags &
+ BATADV_TT_CLIENT_PENDING ? 'X' : '.'),
+ (tt_common_entry->flags &
+ BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
+ (tt_common_entry->flags &
+ BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
+ no_purge ? 0 : last_seen_secs,
+ no_purge ? 0 : last_seen_msecs,
+ vlan->tt.crc);
+
+ batadv_softif_vlan_free_ref(vlan);
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+static void
+batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
+ struct batadv_tt_local_entry *tt_local_entry,
+ uint16_t flags, const char *message)
+{
+ batadv_tt_local_event(bat_priv, tt_local_entry, flags);
+
+ /* The local client has to be marked as "pending to be removed" but has
+ * to be kept in the table in order to send it in a full table
+ * response issued before the net ttvn increment (consistency check)
+ */
+ tt_local_entry->common.flags |= BATADV_TT_CLIENT_PENDING;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Local tt entry (%pM, vid: %d) pending to be removed: %s\n",
+ tt_local_entry->common.addr,
+ BATADV_PRINT_VID(tt_local_entry->common.vid), message);
+}
+
+/**
+ * batadv_tt_local_remove - logically remove an entry from the local table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the MAC address of the client to remove
+ * @vid: VLAN identifier
+ * @message: message to append to the log on deletion
+ * @roaming: true if the deletion is due to a roaming event
+ *
+ * Returns the flags assigned to the local entry before being deleted
+ */
+uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid,
+ const char *message, bool roaming)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ uint16_t flags, curr_flags = BATADV_NO_FLAGS;
+ struct batadv_softif_vlan *vlan;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+
+ curr_flags = tt_local_entry->common.flags;
+
+ flags = BATADV_TT_CLIENT_DEL;
+ /* if this global entry addition is due to a roaming, the node has to
+ * mark the local entry as "roamed" in order to correctly reroute
+ * packets later
+ */
+ if (roaming) {
+ flags |= BATADV_TT_CLIENT_ROAM;
+ /* mark the local client as ROAMed */
+ tt_local_entry->common.flags |= BATADV_TT_CLIENT_ROAM;
+ }
+
+ if (!(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW)) {
+ batadv_tt_local_set_pending(bat_priv, tt_local_entry, flags,
+ message);
+ goto out;
+ }
+ /* if this client has been added right now, it is possible to
+ * immediately purge it
+ */
+ batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL);
+ hlist_del_rcu(&tt_local_entry->common.hash_entry);
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+
+ /* decrease the reference held for this vlan */
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_free_ref(vlan);
+
+out:
+ if (tt_local_entry)
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+
+ return curr_flags;
+}
+
+/**
+ * batadv_tt_local_purge_list - purge inactive tt local entries
+ * @bat_priv: the bat priv with all the soft interface information
+ * @head: pointer to the list containing the local tt entries
+ * @timeout: parameter deciding whether a given tt local entry is considered
+ * inactive or not
+ */
+static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv,
+ struct hlist_head *head,
+ int timeout)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct hlist_node *node_tmp;
+
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp, head,
+ hash_entry) {
+ tt_local_entry = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_NOPURGE)
+ continue;
+
+ /* entry already marked for deletion */
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING)
+ continue;
+
+ if (!batadv_has_timed_out(tt_local_entry->last_seen, timeout))
+ continue;
+
+ batadv_tt_local_set_pending(bat_priv, tt_local_entry,
+ BATADV_TT_CLIENT_DEL, "timed out");
+ }
+}
+
+/**
+ * batadv_tt_local_purge - purge inactive tt local entries
+ * @bat_priv: the bat priv with all the soft interface information
+ * @timeout: parameter deciding whether a given tt local entry is considered
+ * inactive or not
+ */
+static void batadv_tt_local_purge(struct batadv_priv *bat_priv,
+ int timeout)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ uint32_t i;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ batadv_tt_local_purge_list(bat_priv, head, timeout);
+ spin_unlock_bh(list_lock);
+ }
+}
+
+static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_local_entry *tt_local;
+ struct batadv_softif_vlan *vlan;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ uint32_t i;
+
+ if (!bat_priv->tt.local_hash)
+ return;
+
+ hash = bat_priv->tt.local_hash;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ tt_local = container_of(tt_common_entry,
+ struct batadv_tt_local_entry,
+ common);
+
+ /* decrease the reference held for this vlan */
+ vlan = batadv_softif_vlan_get(bat_priv,
+ tt_common_entry->vid);
+ batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_free_ref(vlan);
+
+ batadv_tt_local_entry_free_ref(tt_local);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+
+ bat_priv->tt.local_hash = NULL;
+}
+
+static int batadv_tt_global_init(struct batadv_priv *bat_priv)
+{
+ if (bat_priv->tt.global_hash)
+ return 0;
+
+ bat_priv->tt.global_hash = batadv_hash_new(1024);
+
+ if (!bat_priv->tt.global_hash)
+ return -ENOMEM;
+
+ batadv_hash_set_lock_class(bat_priv->tt.global_hash,
+ &batadv_tt_global_hash_lock_class_key);
+
+ return 0;
+}
+
+static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_change_node *entry, *safe;
+
+ spin_lock_bh(&bat_priv->tt.changes_list_lock);
+
+ list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
+ list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+
+ atomic_set(&bat_priv->tt.local_changes, 0);
+ spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+}
+
+/* retrieves the orig_tt_list_entry belonging to orig_node from the
+ * batadv_tt_global_entry list
+ *
+ * returns it with an increased refcounter, NULL if not found
+ */
+static struct batadv_tt_orig_list_entry *
+batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
+ const struct batadv_orig_node *orig_node)
+{
+ struct batadv_tt_orig_list_entry *tmp_orig_entry, *orig_entry = NULL;
+ const struct hlist_head *head;
+
+ rcu_read_lock();
+ head = &entry->orig_list;
+ hlist_for_each_entry_rcu(tmp_orig_entry, head, list) {
+ if (tmp_orig_entry->orig_node != orig_node)
+ continue;
+ if (!atomic_inc_not_zero(&tmp_orig_entry->refcount))
+ continue;
+
+ orig_entry = tmp_orig_entry;
+ break;
+ }
+ rcu_read_unlock();
+
+ return orig_entry;
+}
+
+/* find out if an orig_node is already in the list of a tt_global_entry.
+ * returns true if found, false otherwise
+ */
+static bool
+batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
+ const struct batadv_orig_node *orig_node)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+ bool found = false;
+
+ orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node);
+ if (orig_entry) {
+ found = true;
+ batadv_tt_orig_list_entry_free_ref(orig_entry);
+ }
+
+ return found;
+}
+
+static void
+batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
+ struct batadv_orig_node *orig_node, int ttvn)
+{
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node);
+ if (orig_entry) {
+ /* refresh the ttvn: the current value could be a bogus one that
+ * was added during a "temporary client detection"
+ */
+ orig_entry->ttvn = ttvn;
+ goto out;
+ }
+
+ orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC);
+ if (!orig_entry)
+ goto out;
+
+ INIT_HLIST_NODE(&orig_entry->list);
+ atomic_inc(&orig_node->refcount);
+ batadv_tt_global_size_inc(orig_node, tt_global->common.vid);
+ orig_entry->orig_node = orig_node;
+ orig_entry->ttvn = ttvn;
+ atomic_set(&orig_entry->refcount, 2);
+
+ spin_lock_bh(&tt_global->list_lock);
+ hlist_add_head_rcu(&orig_entry->list,
+ &tt_global->orig_list);
+ spin_unlock_bh(&tt_global->list_lock);
+ atomic_inc(&tt_global->orig_list_count);
+
+out:
+ if (orig_entry)
+ batadv_tt_orig_list_entry_free_ref(orig_entry);
+}
+
+/**
+ * batadv_tt_global_add - add a new TT global entry or update an existing one
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the originator announcing the client
+ * @tt_addr: the mac address of the non-mesh client
+ * @vid: VLAN identifier
+ * @flags: TT flags that have to be set for this non-mesh client
+ * @ttvn: the tt version number ever announcing this non-mesh client
+ *
+ * Add a new TT global entry for the given originator. If the entry already
+ * exists add a new reference to the given originator (a global entry can have
+ * references to multiple originators) and adjust the flags attribute to reflect
+ * the function argument.
+ * If a TT local entry exists for this non-mesh client remove it.
+ *
+ * The caller must hold orig_node refcount.
+ *
+ * Return true if the new entry has been added, false otherwise
+ */
+static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *tt_addr,
+ unsigned short vid, uint16_t flags,
+ uint8_t ttvn)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+ int hash_added;
+ struct batadv_tt_common_entry *common;
+ uint16_t local_flags;
+
+ /* ignore global entries from backbone nodes */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid))
+ return true;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, tt_addr, vid);
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, tt_addr, vid);
+
+ /* if the node already has a local client for this entry, it has to wait
+ * for a roaming advertisement instead of manually messing up the global
+ * table
+ */
+ if ((flags & BATADV_TT_CLIENT_TEMP) && tt_local_entry &&
+ !(tt_local_entry->common.flags & BATADV_TT_CLIENT_NEW))
+ goto out;
+
+ if (!tt_global_entry) {
+ tt_global_entry = kzalloc(sizeof(*tt_global_entry), GFP_ATOMIC);
+ if (!tt_global_entry)
+ goto out;
+
+ common = &tt_global_entry->common;
+ ether_addr_copy(common->addr, tt_addr);
+ common->vid = vid;
+
+ common->flags = flags;
+ tt_global_entry->roam_at = 0;
+ /* node must store current time in case of roaming. This is
+ * needed to purge this entry out on timeout (if nobody claims
+ * it)
+ */
+ if (flags & BATADV_TT_CLIENT_ROAM)
+ tt_global_entry->roam_at = jiffies;
+ atomic_set(&common->refcount, 2);
+ common->added_at = jiffies;
+
+ INIT_HLIST_HEAD(&tt_global_entry->orig_list);
+ atomic_set(&tt_global_entry->orig_list_count, 0);
+ spin_lock_init(&tt_global_entry->list_lock);
+
+ hash_added = batadv_hash_add(bat_priv->tt.global_hash,
+ batadv_compare_tt,
+ batadv_choose_tt, common,
+ &common->hash_entry);
+
+ if (unlikely(hash_added != 0)) {
+ /* remove the reference for the hash */
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+ goto out_remove;
+ }
+ } else {
+ common = &tt_global_entry->common;
+ /* If there is already a global entry, we can use this one for
+ * our processing.
+ * But if we are trying to add a temporary client then here are
+ * two options at this point:
+ * 1) the global client is not a temporary client: the global
+ * client has to be left as it is, temporary information
+ * should never override any already known client state
+ * 2) the global client is a temporary client: purge the
+ * originator list and add the new one orig_entry
+ */
+ if (flags & BATADV_TT_CLIENT_TEMP) {
+ if (!(common->flags & BATADV_TT_CLIENT_TEMP))
+ goto out;
+ if (batadv_tt_global_entry_has_orig(tt_global_entry,
+ orig_node))
+ goto out_remove;
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ goto add_orig_entry;
+ }
+
+ /* if the client was temporary added before receiving the first
+ * OGM announcing it, we have to clear the TEMP flag
+ */
+ common->flags &= ~BATADV_TT_CLIENT_TEMP;
+
+ /* the change can carry possible "attribute" flags like the
+ * TT_CLIENT_WIFI, therefore they have to be copied in the
+ * client entry
+ */
+ tt_global_entry->common.flags |= flags;
+
+ /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
+ * one originator left in the list and we previously received a
+ * delete + roaming change for this originator.
+ *
+ * We should first delete the old originator before adding the
+ * new one.
+ */
+ if (common->flags & BATADV_TT_CLIENT_ROAM) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_ROAM;
+ tt_global_entry->roam_at = 0;
+ }
+ }
+add_orig_entry:
+ /* add the new orig_entry (if needed) or update it */
+ batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn);
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Creating new global tt entry: %pM (vid: %d, via %pM)\n",
+ common->addr, BATADV_PRINT_VID(common->vid),
+ orig_node->orig);
+ ret = true;
+
+out_remove:
+ /* Do not remove multicast addresses from the local hash on
+ * global additions
+ */
+ if (is_multicast_ether_addr(tt_addr))
+ goto out;
+
+ /* remove address from local hash if present */
+ local_flags = batadv_tt_local_remove(bat_priv, tt_addr, vid,
+ "global tt received",
+ flags & BATADV_TT_CLIENT_ROAM);
+ tt_global_entry->common.flags |= local_flags & BATADV_TT_CLIENT_WIFI;
+
+ if (!(flags & BATADV_TT_CLIENT_ROAM))
+ /* this is a normal global add. Therefore the client is not in a
+ * roaming state anymore.
+ */
+ tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM;
+
+out:
+ if (tt_global_entry)
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+ if (tt_local_entry)
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+ return ret;
+}
+
+/**
+ * batadv_transtable_best_orig - Get best originator list entry from tt entry
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_global_entry: global translation table entry to be analyzed
+ *
+ * This functon assumes the caller holds rcu_read_lock().
+ * Returns best originator list entry or NULL on errors.
+ */
+static struct batadv_tt_orig_list_entry *
+batadv_transtable_best_orig(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry)
+{
+ struct batadv_neigh_node *router, *best_router = NULL;
+ struct batadv_algo_ops *bao = bat_priv->bat_algo_ops;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry, *best_entry = NULL;
+
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ router = batadv_orig_router_get(orig_entry->orig_node,
+ BATADV_IF_DEFAULT);
+ if (!router)
+ continue;
+
+ if (best_router &&
+ bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT,
+ best_router, BATADV_IF_DEFAULT) <= 0) {
+ batadv_neigh_node_free_ref(router);
+ continue;
+ }
+
+ /* release the refcount for the "old" best */
+ if (best_router)
+ batadv_neigh_node_free_ref(best_router);
+
+ best_entry = orig_entry;
+ best_router = router;
+ }
+
+ if (best_router)
+ batadv_neigh_node_free_ref(best_router);
+
+ return best_entry;
+}
+
+/**
+ * batadv_tt_global_print_entry - print all orig nodes who announce the address
+ * for this global entry
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_global_entry: global translation table entry to be printed
+ * @seq: debugfs table seq_file struct
+ *
+ * This functon assumes the caller holds rcu_read_lock().
+ */
+static void
+batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry,
+ struct seq_file *seq)
+{
+ struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_orig_node_vlan *vlan;
+ struct hlist_head *head;
+ uint8_t last_ttvn;
+ uint16_t flags;
+
+ tt_common_entry = &tt_global_entry->common;
+ flags = tt_common_entry->flags;
+
+ best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry);
+ if (best_entry) {
+ vlan = batadv_orig_node_vlan_get(best_entry->orig_node,
+ tt_common_entry->vid);
+ if (!vlan) {
+ seq_printf(seq,
+ " * Cannot retrieve VLAN %d for originator %pM\n",
+ BATADV_PRINT_VID(tt_common_entry->vid),
+ best_entry->orig_node->orig);
+ goto print_list;
+ }
+
+ last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn);
+ seq_printf(seq,
+ " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
+ '*', tt_global_entry->common.addr,
+ BATADV_PRINT_VID(tt_global_entry->common.vid),
+ best_entry->ttvn, best_entry->orig_node->orig,
+ last_ttvn, vlan->tt.crc,
+ (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
+ (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
+ (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
+ (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.'));
+
+ batadv_orig_node_vlan_free_ref(vlan);
+ }
+
+print_list:
+ head = &tt_global_entry->orig_list;
+
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ if (best_entry == orig_entry)
+ continue;
+
+ vlan = batadv_orig_node_vlan_get(orig_entry->orig_node,
+ tt_common_entry->vid);
+ if (!vlan) {
+ seq_printf(seq,
+ " + Cannot retrieve VLAN %d for originator %pM\n",
+ BATADV_PRINT_VID(tt_common_entry->vid),
+ orig_entry->orig_node->orig);
+ continue;
+ }
+
+ last_ttvn = atomic_read(&orig_entry->orig_node->last_ttvn);
+ seq_printf(seq,
+ " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
+ '+', tt_global_entry->common.addr,
+ BATADV_PRINT_VID(tt_global_entry->common.vid),
+ orig_entry->ttvn, orig_entry->orig_node->orig,
+ last_ttvn, vlan->tt.crc,
+ (flags & BATADV_TT_CLIENT_ROAM ? 'R' : '.'),
+ (flags & BATADV_TT_CLIENT_WIFI ? 'W' : '.'),
+ (flags & BATADV_TT_CLIENT_ISOLA ? 'I' : '.'),
+ (flags & BATADV_TT_CLIENT_TEMP ? 'T' : '.'));
+
+ batadv_orig_node_vlan_free_ref(vlan);
+ }
+}
+
+int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_global_entry *tt_global;
+ struct batadv_hard_iface *primary_if;
+ struct hlist_head *head;
+ uint32_t i;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ goto out;
+
+ seq_printf(seq,
+ "Globally announced TT entries received via the mesh %s\n",
+ net_dev->name);
+ seq_printf(seq, " %-13s %s %s %-15s %s (%-10s) %s\n",
+ "Client", "VID", "(TTVN)", "Originator", "(Curr TTVN)",
+ "CRC", "Flags");
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ tt_global = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+ batadv_tt_global_print_entry(bat_priv, tt_global, seq);
+ }
+ rcu_read_unlock();
+ }
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ return 0;
+}
+
+/**
+ * batadv_tt_global_del_orig_entry - remove and free an orig_entry
+ * @tt_global_entry: the global entry to remove the orig_entry from
+ * @orig_entry: the orig entry to remove and free
+ *
+ * Remove an orig_entry from its list in the given tt_global_entry and
+ * free this orig_entry afterwards.
+ */
+static void
+batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_tt_orig_list_entry *orig_entry)
+{
+ batadv_tt_global_size_dec(orig_entry->orig_node,
+ tt_global_entry->common.vid);
+ atomic_dec(&tt_global_entry->orig_list_count);
+ hlist_del_rcu(&orig_entry->list);
+ batadv_tt_orig_list_entry_free_ref(orig_entry);
+}
+
+/* deletes the orig list of a tt_global_entry */
+static void
+batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
+{
+ struct hlist_head *head;
+ struct hlist_node *safe;
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ spin_lock_bh(&tt_global_entry->list_lock);
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_safe(orig_entry, safe, head, list)
+ batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry);
+ spin_unlock_bh(&tt_global_entry->list_lock);
+}
+
+/**
+ * batadv_tt_global_del_orig_node - remove orig_node from a global tt entry
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_global_entry: the global entry to remove the orig_node from
+ * @orig_node: the originator announcing the client
+ * @message: message to append to the log on deletion
+ *
+ * Remove the given orig_node and its according orig_entry from the given
+ * global tt entry.
+ */
+static void
+batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_orig_node *orig_node,
+ const char *message)
+{
+ struct hlist_head *head;
+ struct hlist_node *safe;
+ struct batadv_tt_orig_list_entry *orig_entry;
+ unsigned short vid;
+
+ spin_lock_bh(&tt_global_entry->list_lock);
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_safe(orig_entry, safe, head, list) {
+ if (orig_entry->orig_node == orig_node) {
+ vid = tt_global_entry->common.vid;
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting %pM from global tt entry %pM (vid: %d): %s\n",
+ orig_node->orig,
+ tt_global_entry->common.addr,
+ BATADV_PRINT_VID(vid), message);
+ batadv_tt_global_del_orig_entry(tt_global_entry,
+ orig_entry);
+ }
+ }
+ spin_unlock_bh(&tt_global_entry->list_lock);
+}
+
+/* If the client is to be deleted, we check if it is the last origantor entry
+ * within tt_global entry. If yes, we set the BATADV_TT_CLIENT_ROAM flag and the
+ * timer, otherwise we simply remove the originator scheduled for deletion.
+ */
+static void
+batadv_tt_global_del_roaming(struct batadv_priv *bat_priv,
+ struct batadv_tt_global_entry *tt_global_entry,
+ struct batadv_orig_node *orig_node,
+ const char *message)
+{
+ bool last_entry = true;
+ struct hlist_head *head;
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ /* no local entry exists, case 1:
+ * Check if this is the last one or if other entries exist.
+ */
+
+ rcu_read_lock();
+ head = &tt_global_entry->orig_list;
+ hlist_for_each_entry_rcu(orig_entry, head, list) {
+ if (orig_entry->orig_node != orig_node) {
+ last_entry = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (last_entry) {
+ /* its the last one, mark for roaming. */
+ tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM;
+ tt_global_entry->roam_at = jiffies;
+ } else
+ /* there is another entry, we can simply delete this
+ * one and can still use the other one.
+ */
+ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry,
+ orig_node, message);
+}
+
+/**
+ * batadv_tt_global_del - remove a client from the global table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: an originator serving this client
+ * @addr: the mac address of the client
+ * @vid: VLAN identifier
+ * @message: a message explaining the reason for deleting the client to print
+ * for debugging purpose
+ * @roaming: true if the deletion has been triggered by a roaming event
+ */
+static void batadv_tt_global_del(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr, unsigned short vid,
+ const char *message, bool roaming)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ struct batadv_tt_local_entry *local_entry = NULL;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ if (!roaming) {
+ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry,
+ orig_node, message);
+
+ if (hlist_empty(&tt_global_entry->orig_list))
+ batadv_tt_global_free(bat_priv, tt_global_entry,
+ message);
+
+ goto out;
+ }
+
+ /* if we are deleting a global entry due to a roam
+ * event, there are two possibilities:
+ * 1) the client roamed from node A to node B => if there
+ * is only one originator left for this client, we mark
+ * it with BATADV_TT_CLIENT_ROAM, we start a timer and we
+ * wait for node B to claim it. In case of timeout
+ * the entry is purged.
+ *
+ * If there are other originators left, we directly delete
+ * the originator.
+ * 2) the client roamed to us => we can directly delete
+ * the global entry, since it is useless now.
+ */
+ local_entry = batadv_tt_local_hash_find(bat_priv,
+ tt_global_entry->common.addr,
+ vid);
+ if (local_entry) {
+ /* local entry exists, case 2: client roamed to us. */
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ batadv_tt_global_free(bat_priv, tt_global_entry, message);
+ } else
+ /* no local entry exists, case 1: check for roaming */
+ batadv_tt_global_del_roaming(bat_priv, tt_global_entry,
+ orig_node, message);
+
+out:
+ if (tt_global_entry)
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+ if (local_entry)
+ batadv_tt_local_entry_free_ref(local_entry);
+}
+
+/**
+ * batadv_tt_global_del_orig - remove all the TT global entries belonging to the
+ * given originator matching the provided vid
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the originator owning the entries to remove
+ * @match_vid: the VLAN identifier to match. If negative all the entries will be
+ * removed
+ * @message: debug message to print as "reason"
+ */
+void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ int32_t match_vid,
+ const char *message)
+{
+ struct batadv_tt_global_entry *tt_global;
+ struct batadv_tt_common_entry *tt_common_entry;
+ uint32_t i;
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct hlist_node *safe;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ unsigned short vid;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, safe,
+ head, hash_entry) {
+ /* remove only matching entries */
+ if (match_vid >= 0 && tt_common_entry->vid != match_vid)
+ continue;
+
+ tt_global = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+
+ batadv_tt_global_del_orig_node(bat_priv, tt_global,
+ orig_node, message);
+
+ if (hlist_empty(&tt_global->orig_list)) {
+ vid = tt_global->common.vid;
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ BATADV_PRINT_VID(vid), message);
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ batadv_tt_global_entry_free_ref(tt_global);
+ }
+ }
+ spin_unlock_bh(list_lock);
+ }
+ orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT;
+}
+
+static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global,
+ char **msg)
+{
+ bool purge = false;
+ unsigned long roam_timeout = BATADV_TT_CLIENT_ROAM_TIMEOUT;
+ unsigned long temp_timeout = BATADV_TT_CLIENT_TEMP_TIMEOUT;
+
+ if ((tt_global->common.flags & BATADV_TT_CLIENT_ROAM) &&
+ batadv_has_timed_out(tt_global->roam_at, roam_timeout)) {
+ purge = true;
+ *msg = "Roaming timeout\n";
+ }
+
+ if ((tt_global->common.flags & BATADV_TT_CLIENT_TEMP) &&
+ batadv_has_timed_out(tt_global->common.added_at, temp_timeout)) {
+ purge = true;
+ *msg = "Temporary client timeout\n";
+ }
+
+ return purge;
+}
+
+static void batadv_tt_global_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct hlist_head *head;
+ struct hlist_node *node_tmp;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ uint32_t i;
+ char *msg = NULL;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_global_entry *tt_global;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common, node_tmp, head,
+ hash_entry) {
+ tt_global = container_of(tt_common,
+ struct batadv_tt_global_entry,
+ common);
+
+ if (!batadv_tt_global_to_purge(tt_global, &msg))
+ continue;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting global tt entry %pM (vid: %d): %s\n",
+ tt_global->common.addr,
+ BATADV_PRINT_VID(tt_global->common.vid),
+ msg);
+
+ hlist_del_rcu(&tt_common->hash_entry);
+
+ batadv_tt_global_entry_free_ref(tt_global);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+static void batadv_tt_global_table_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tt_global_entry *tt_global;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ uint32_t i;
+
+ if (!bat_priv->tt.global_hash)
+ return;
+
+ hash = bat_priv->tt.global_hash;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common_entry, node_tmp,
+ head, hash_entry) {
+ hlist_del_rcu(&tt_common_entry->hash_entry);
+ tt_global = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+ batadv_tt_global_entry_free_ref(tt_global);
+ }
+ spin_unlock_bh(list_lock);
+ }
+
+ batadv_hash_destroy(hash);
+
+ bat_priv->tt.global_hash = NULL;
+}
+
+static bool
+_batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
+ struct batadv_tt_global_entry *tt_global_entry)
+{
+ bool ret = false;
+
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_WIFI &&
+ tt_global_entry->common.flags & BATADV_TT_CLIENT_WIFI)
+ ret = true;
+
+ /* check if the two clients are marked as isolated */
+ if (tt_local_entry->common.flags & BATADV_TT_CLIENT_ISOLA &&
+ tt_global_entry->common.flags & BATADV_TT_CLIENT_ISOLA)
+ ret = true;
+
+ return ret;
+}
+
+/**
+ * batadv_transtable_search - get the mesh destination for a given client
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: mac address of the source client
+ * @addr: mac address of the destination client
+ * @vid: VLAN identifier
+ *
+ * Returns a pointer to the originator that was selected as destination in the
+ * mesh for contacting the client 'addr', NULL otherwise.
+ * In case of multiple originators serving the same client, the function returns
+ * the best one (best in terms of metric towards the destination node).
+ *
+ * If the two clients are AP isolated the function returns NULL.
+ */
+struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
+ const uint8_t *src,
+ const uint8_t *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry = NULL;
+ struct batadv_tt_global_entry *tt_global_entry = NULL;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_tt_orig_list_entry *best_entry;
+
+ if (src && batadv_vlan_ap_isola_get(bat_priv, vid)) {
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, src, vid);
+ if (!tt_local_entry ||
+ (tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING))
+ goto out;
+ }
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ /* check whether the clients should not communicate due to AP
+ * isolation
+ */
+ if (tt_local_entry &&
+ _batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ goto out;
+
+ rcu_read_lock();
+ best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry);
+ /* found anything? */
+ if (best_entry)
+ orig_node = best_entry->orig_node;
+ if (orig_node && !atomic_inc_not_zero(&orig_node->refcount))
+ orig_node = NULL;
+ rcu_read_unlock();
+
+out:
+ if (tt_global_entry)
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+ if (tt_local_entry)
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+
+ return orig_node;
+}
+
+/**
+ * batadv_tt_global_crc - calculates the checksum of the local table belonging
+ * to the given orig_node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: originator for which the CRC should be computed
+ * @vid: VLAN identifier for which the CRC32 has to be computed
+ *
+ * This function computes the checksum for the global table corresponding to a
+ * specific originator. In particular, the checksum is computed as follows: For
+ * each client connected to the originator the CRC32C of the MAC address and the
+ * VID is computed and then all the CRC32Cs of the various clients are xor'ed
+ * together.
+ *
+ * The idea behind is that CRC32C should be used as much as possible in order to
+ * produce a unique hash of the table, but since the order which is used to feed
+ * the CRC32C function affects the result and since every node in the network
+ * probably sorts the clients differently, the hash function cannot be directly
+ * computed over the entire table. Hence the CRC32C is used only on
+ * the single client entry, while all the results are then xor'ed together
+ * because the XOR operation can combine them all while trying to reduce the
+ * noise as much as possible.
+ *
+ * Returns the checksum of the global table of a given originator.
+ */
+static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.global_hash;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_global_entry *tt_global;
+ struct hlist_head *head;
+ uint32_t i, crc_tmp, crc = 0;
+ uint8_t flags;
+ __be16 tmp_vid;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common, head, hash_entry) {
+ tt_global = container_of(tt_common,
+ struct batadv_tt_global_entry,
+ common);
+ /* compute the CRC only for entries belonging to the
+ * VLAN identified by the vid passed as parameter
+ */
+ if (tt_common->vid != vid)
+ continue;
+
+ /* Roaming clients are in the global table for
+ * consistency only. They don't have to be
+ * taken into account while computing the
+ * global crc
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_ROAM)
+ continue;
+ /* Temporary clients have not been announced yet, so
+ * they have to be skipped while computing the global
+ * crc
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_TEMP)
+ continue;
+
+ /* find out if this global entry is announced by this
+ * originator
+ */
+ if (!batadv_tt_global_entry_has_orig(tt_global,
+ orig_node))
+ continue;
+
+ /* use network order to read the VID: this ensures that
+ * every node reads the bytes in the same order.
+ */
+ tmp_vid = htons(tt_common->vid);
+ crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid));
+
+ /* compute the CRC on flags that have to be kept in sync
+ * among nodes
+ */
+ flags = tt_common->flags & BATADV_TT_SYNC_MASK;
+ crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags));
+
+ crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN);
+ }
+ rcu_read_unlock();
+ }
+
+ return crc;
+}
+
+/**
+ * batadv_tt_local_crc - calculates the checksum of the local table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @vid: VLAN identifier for which the CRC32 has to be computed
+ *
+ * For details about the computation, please refer to the documentation for
+ * batadv_tt_global_crc().
+ *
+ * Returns the checksum of the local table
+ */
+static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv,
+ unsigned short vid)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common;
+ struct hlist_head *head;
+ uint32_t i, crc_tmp, crc = 0;
+ uint8_t flags;
+ __be16 tmp_vid;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common, head, hash_entry) {
+ /* compute the CRC only for entries belonging to the
+ * VLAN identified by vid
+ */
+ if (tt_common->vid != vid)
+ continue;
+
+ /* not yet committed clients have not to be taken into
+ * account while computing the CRC
+ */
+ if (tt_common->flags & BATADV_TT_CLIENT_NEW)
+ continue;
+
+ /* use network order to read the VID: this ensures that
+ * every node reads the bytes in the same order.
+ */
+ tmp_vid = htons(tt_common->vid);
+ crc_tmp = crc32c(0, &tmp_vid, sizeof(tmp_vid));
+
+ /* compute the CRC on flags that have to be kept in sync
+ * among nodes
+ */
+ flags = tt_common->flags & BATADV_TT_SYNC_MASK;
+ crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags));
+
+ crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN);
+ }
+ rcu_read_unlock();
+ }
+
+ return crc;
+}
+
+static void batadv_tt_req_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_req_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ list_del(&node->list);
+ kfree(node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+}
+
+static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const void *tt_buff,
+ uint16_t tt_buff_len)
+{
+ /* Replace the old buffer only if I received something in the
+ * last OGM (the OGM could carry no changes)
+ */
+ spin_lock_bh(&orig_node->tt_buff_lock);
+ if (tt_buff_len > 0) {
+ kfree(orig_node->tt_buff);
+ orig_node->tt_buff_len = 0;
+ orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC);
+ if (orig_node->tt_buff) {
+ memcpy(orig_node->tt_buff, tt_buff, tt_buff_len);
+ orig_node->tt_buff_len = tt_buff_len;
+ }
+ }
+ spin_unlock_bh(&orig_node->tt_buff_lock);
+}
+
+static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_req_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ if (batadv_has_timed_out(node->issued_at,
+ BATADV_TT_REQUEST_TIMEOUT)) {
+ list_del(&node->list);
+ kfree(node);
+ }
+ }
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+}
+
+/* returns the pointer to the new tt_req_node struct if no request
+ * has already been issued for this orig_node, NULL otherwise
+ */
+static struct batadv_tt_req_node *
+batadv_new_tt_req_node(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL;
+
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ list_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) {
+ if (batadv_compare_eth(tt_req_node_tmp, orig_node) &&
+ !batadv_has_timed_out(tt_req_node_tmp->issued_at,
+ BATADV_TT_REQUEST_TIMEOUT))
+ goto unlock;
+ }
+
+ tt_req_node = kmalloc(sizeof(*tt_req_node), GFP_ATOMIC);
+ if (!tt_req_node)
+ goto unlock;
+
+ ether_addr_copy(tt_req_node->addr, orig_node->orig);
+ tt_req_node->issued_at = jiffies;
+
+ list_add(&tt_req_node->list, &bat_priv->tt.req_list);
+unlock:
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+ return tt_req_node;
+}
+
+/**
+ * batadv_tt_local_valid - verify that given tt entry is a valid one
+ * @entry_ptr: to be checked local tt entry
+ * @data_ptr: not used but definition required to satisfy the callback prototype
+ *
+ * Returns 1 if the entry is a valid, 0 otherwise.
+ */
+static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr)
+{
+ const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
+
+ if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW)
+ return 0;
+ return 1;
+}
+
+static int batadv_tt_global_valid(const void *entry_ptr,
+ const void *data_ptr)
+{
+ const struct batadv_tt_common_entry *tt_common_entry = entry_ptr;
+ const struct batadv_tt_global_entry *tt_global_entry;
+ const struct batadv_orig_node *orig_node = data_ptr;
+
+ if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM ||
+ tt_common_entry->flags & BATADV_TT_CLIENT_TEMP)
+ return 0;
+
+ tt_global_entry = container_of(tt_common_entry,
+ struct batadv_tt_global_entry,
+ common);
+
+ return batadv_tt_global_entry_has_orig(tt_global_entry, orig_node);
+}
+
+/**
+ * batadv_tt_tvlv_generate - fill the tvlv buff with the tt entries from the
+ * specified tt hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @hash: hash table containing the tt entries
+ * @tt_len: expected tvlv tt data buffer length in number of bytes
+ * @tvlv_buff: pointer to the buffer to fill with the TT data
+ * @valid_cb: function to filter tt change entries
+ * @cb_data: data passed to the filter function as argument
+ */
+static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ void *tvlv_buff, uint16_t tt_len,
+ int (*valid_cb)(const void *, const void *),
+ void *cb_data)
+{
+ struct batadv_tt_common_entry *tt_common_entry;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct hlist_head *head;
+ uint16_t tt_tot, tt_num_entries = 0;
+ uint32_t i;
+
+ tt_tot = batadv_tt_entries(tt_len);
+ tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff;
+
+ rcu_read_lock();
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ if (tt_tot == tt_num_entries)
+ break;
+
+ if ((valid_cb) && (!valid_cb(tt_common_entry, cb_data)))
+ continue;
+
+ ether_addr_copy(tt_change->addr, tt_common_entry->addr);
+ tt_change->flags = tt_common_entry->flags;
+ tt_change->vid = htons(tt_common_entry->vid);
+ memset(tt_change->reserved, 0,
+ sizeof(tt_change->reserved));
+
+ tt_num_entries++;
+ tt_change++;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_tt_global_check_crc - check if all the CRCs are correct
+ * @orig_node: originator for which the CRCs have to be checked
+ * @tt_vlan: pointer to the first tvlv VLAN entry
+ * @num_vlan: number of tvlv VLAN entries
+ * @create: if true, create VLAN objects if not found
+ *
+ * Return true if all the received CRCs match the locally stored ones, false
+ * otherwise
+ */
+static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_vlan_data *tt_vlan,
+ uint16_t num_vlan)
+{
+ struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp;
+ struct batadv_orig_node_vlan *vlan;
+ uint32_t crc;
+ int i;
+
+ /* check if each received CRC matches the locally stored one */
+ for (i = 0; i < num_vlan; i++) {
+ tt_vlan_tmp = tt_vlan + i;
+
+ /* if orig_node is a backbone node for this VLAN, don't check
+ * the CRC as we ignore all the global entries over it
+ */
+ if (batadv_bla_is_backbone_gw_orig(orig_node->bat_priv,
+ orig_node->orig,
+ ntohs(tt_vlan_tmp->vid)))
+ continue;
+
+ vlan = batadv_orig_node_vlan_get(orig_node,
+ ntohs(tt_vlan_tmp->vid));
+ if (!vlan)
+ return false;
+
+ crc = vlan->tt.crc;
+ batadv_orig_node_vlan_free_ref(vlan);
+
+ if (crc != ntohl(tt_vlan_tmp->crc))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * batadv_tt_local_update_crc - update all the local CRCs
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
+{
+ struct batadv_softif_vlan *vlan;
+
+ /* recompute the global CRC for each VLAN */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_tt_global_update_crc - update all the global CRCs for this orig_node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the orig_node for which the CRCs have to be updated
+ */
+static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_orig_node_vlan *vlan;
+ uint32_t crc;
+
+ /* recompute the global CRC for each VLAN */
+ rcu_read_lock();
+ list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ /* if orig_node is a backbone node for this VLAN, don't compute
+ * the CRC as we ignore all the global entries over it
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig,
+ vlan->vid))
+ continue;
+
+ crc = batadv_tt_global_crc(bat_priv, orig_node, vlan->vid);
+ vlan->tt.crc = crc;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_send_tt_request - send a TT Request message to a given node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @dst_orig_node: the destination of the message
+ * @ttvn: the version number that the source of the message is looking for
+ * @tt_vlan: pointer to the first tvlv VLAN object to request
+ * @num_vlan: number of tvlv VLAN entries
+ * @full_table: ask for the entire translation table if true, while only for the
+ * last TT diff otherwise
+ */
+static int batadv_send_tt_request(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *dst_orig_node,
+ uint8_t ttvn,
+ struct batadv_tvlv_tt_vlan_data *tt_vlan,
+ uint16_t num_vlan, bool full_table)
+{
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ struct batadv_tt_req_node *tt_req_node = NULL;
+ struct batadv_tvlv_tt_vlan_data *tt_vlan_req;
+ struct batadv_hard_iface *primary_if;
+ bool ret = false;
+ int i, size;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* The new tt_req will be issued only if I'm not waiting for a
+ * reply from the same orig_node yet
+ */
+ tt_req_node = batadv_new_tt_req_node(bat_priv, dst_orig_node);
+ if (!tt_req_node)
+ goto out;
+
+ size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan;
+ tvlv_tt_data = kzalloc(size, GFP_ATOMIC);
+ if (!tvlv_tt_data)
+ goto out;
+
+ tvlv_tt_data->flags = BATADV_TT_REQUEST;
+ tvlv_tt_data->ttvn = ttvn;
+ tvlv_tt_data->num_vlan = htons(num_vlan);
+
+ /* send all the CRCs within the request. This is needed by intermediate
+ * nodes to ensure they have the correct table before replying
+ */
+ tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1);
+ for (i = 0; i < num_vlan; i++) {
+ tt_vlan_req->vid = tt_vlan->vid;
+ tt_vlan_req->crc = tt_vlan->crc;
+
+ tt_vlan_req++;
+ tt_vlan++;
+ }
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending TT_REQUEST to %pM [%c]\n",
+ dst_orig_node->orig, full_table ? 'F' : '.');
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_TX);
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ dst_orig_node->orig, BATADV_TVLV_TT, 1,
+ tvlv_tt_data, size);
+ ret = true;
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ if (ret && tt_req_node) {
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ list_del(&tt_req_node->list);
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+ kfree(tt_req_node);
+ }
+ kfree(tvlv_tt_data);
+ return ret;
+}
+
+/**
+ * batadv_send_other_tt_response - send reply to tt request concerning another
+ * node's translation table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ * @req_dst: mac address of tt request recipient
+ *
+ * Returns true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ uint8_t *req_src, uint8_t *req_dst)
+{
+ struct batadv_orig_node *req_dst_orig_node;
+ struct batadv_orig_node *res_dst_orig_node = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ bool ret = false, full_table;
+ uint8_t orig_ttvn, req_ttvn;
+ uint16_t tvlv_len;
+ int32_t tt_len;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n",
+ req_src, tt_data->ttvn, req_dst,
+ (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+
+ /* Let's get the orig node of the REAL destination */
+ req_dst_orig_node = batadv_orig_hash_find(bat_priv, req_dst);
+ if (!req_dst_orig_node)
+ goto out;
+
+ res_dst_orig_node = batadv_orig_hash_find(bat_priv, req_src);
+ if (!res_dst_orig_node)
+ goto out;
+
+ orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn);
+ req_ttvn = tt_data->ttvn;
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
+ /* this node doesn't have the requested data */
+ if (orig_ttvn != req_ttvn ||
+ !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan,
+ ntohs(tt_data->num_vlan)))
+ goto out;
+
+ /* If the full table has been explicitly requested */
+ if (tt_data->flags & BATADV_TT_FULL_TABLE ||
+ !req_dst_orig_node->tt_buff)
+ full_table = true;
+ else
+ full_table = false;
+
+ /* TT fragmentation hasn't been implemented yet, so send as many
+ * TT entries fit a single packet as possible only
+ */
+ if (!full_table) {
+ spin_lock_bh(&req_dst_orig_node->tt_buff_lock);
+ tt_len = req_dst_orig_node->tt_buff_len;
+
+ tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto unlock;
+
+ /* Copy the last orig_node's OGM buffer */
+ memcpy(tt_change, req_dst_orig_node->tt_buff,
+ req_dst_orig_node->tt_buff_len);
+ spin_unlock_bh(&req_dst_orig_node->tt_buff_lock);
+ } else {
+ /* allocate the tvlv, put the tt_data and all the tt_vlan_data
+ * in the initial part
+ */
+ tt_len = -1;
+ tvlv_len = batadv_tt_prepare_tvlv_global_data(req_dst_orig_node,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto out;
+
+ /* fill the rest of the tvlv with the real TT entries */
+ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash,
+ tt_change, tt_len,
+ batadv_tt_global_valid,
+ req_dst_orig_node);
+ }
+
+ /* Don't send the response, if larger than fragmented packet. */
+ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len;
+ if (tt_len > atomic_read(&bat_priv->packet_size_max)) {
+ net_ratelimited_function(batadv_info, bat_priv->soft_iface,
+ "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n",
+ res_dst_orig_node->orig);
+ goto out;
+ }
+
+ tvlv_tt_data->flags = BATADV_TT_RESPONSE;
+ tvlv_tt_data->ttvn = req_ttvn;
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending TT_RESPONSE %pM for %pM [%c] (ttvn: %u)\n",
+ res_dst_orig_node->orig, req_dst_orig_node->orig,
+ full_table ? 'F' : '.', req_ttvn);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX);
+
+ batadv_tvlv_unicast_send(bat_priv, req_dst_orig_node->orig,
+ req_src, BATADV_TVLV_TT, 1, tvlv_tt_data,
+ tvlv_len);
+
+ ret = true;
+ goto out;
+
+unlock:
+ spin_unlock_bh(&req_dst_orig_node->tt_buff_lock);
+
+out:
+ if (res_dst_orig_node)
+ batadv_orig_node_free_ref(res_dst_orig_node);
+ if (req_dst_orig_node)
+ batadv_orig_node_free_ref(req_dst_orig_node);
+ kfree(tvlv_tt_data);
+ return ret;
+}
+
+/**
+ * batadv_send_my_tt_response - send reply to tt request concerning this node's
+ * translation table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ *
+ * Returns true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ uint8_t *req_src)
+{
+ struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
+ struct batadv_hard_iface *primary_if = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_orig_node *orig_node;
+ uint8_t my_ttvn, req_ttvn;
+ uint16_t tvlv_len;
+ bool full_table;
+ int32_t tt_len;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n",
+ req_src, tt_data->ttvn,
+ (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+
+ my_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+ req_ttvn = tt_data->ttvn;
+
+ orig_node = batadv_orig_hash_find(bat_priv, req_src);
+ if (!orig_node)
+ goto out;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* If the full table has been explicitly requested or the gap
+ * is too big send the whole local translation table
+ */
+ if (tt_data->flags & BATADV_TT_FULL_TABLE || my_ttvn != req_ttvn ||
+ !bat_priv->tt.last_changeset)
+ full_table = true;
+ else
+ full_table = false;
+
+ /* TT fragmentation hasn't been implemented yet, so send as many
+ * TT entries fit a single packet as possible only
+ */
+ if (!full_table) {
+ spin_lock_bh(&bat_priv->tt.last_changeset_lock);
+
+ tt_len = bat_priv->tt.last_changeset_len;
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto unlock;
+
+ /* Copy the last orig_node's OGM buffer */
+ memcpy(tt_change, bat_priv->tt.last_changeset,
+ bat_priv->tt.last_changeset_len);
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+ } else {
+ req_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn);
+
+ /* allocate the tvlv, put the tt_data and all the tt_vlan_data
+ * in the initial part
+ */
+ tt_len = -1;
+ tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv,
+ &tvlv_tt_data,
+ &tt_change,
+ &tt_len);
+ if (!tt_len)
+ goto out;
+
+ /* fill the rest of the tvlv with the real TT entries */
+ batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash,
+ tt_change, tt_len,
+ batadv_tt_local_valid, NULL);
+ }
+
+ tvlv_tt_data->flags = BATADV_TT_RESPONSE;
+ tvlv_tt_data->ttvn = req_ttvn;
+
+ if (full_table)
+ tvlv_tt_data->flags |= BATADV_TT_FULL_TABLE;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending TT_RESPONSE to %pM [%c] (ttvn: %u)\n",
+ orig_node->orig, full_table ? 'F' : '.', req_ttvn);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_TX);
+
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ req_src, BATADV_TVLV_TT, 1, tvlv_tt_data,
+ tvlv_len);
+
+ goto out;
+
+unlock:
+ spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+out:
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+ kfree(tvlv_tt_data);
+ /* The packet was for this host, so it doesn't need to be re-routed */
+ return true;
+}
+
+/**
+ * batadv_send_tt_response - send reply to tt request
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_data: tt data containing the tt request information
+ * @req_src: mac address of tt request sender
+ * @req_dst: mac address of tt request recipient
+ *
+ * Returns true if tt request reply was sent, false otherwise.
+ */
+static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ uint8_t *req_src, uint8_t *req_dst)
+{
+ if (batadv_is_my_mac(bat_priv, req_dst))
+ return batadv_send_my_tt_response(bat_priv, tt_data, req_src);
+ return batadv_send_other_tt_response(bat_priv, tt_data, req_src,
+ req_dst);
+}
+
+static void _batadv_tt_update_changes(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_tvlv_tt_change *tt_change,
+ uint16_t tt_num_changes, uint8_t ttvn)
+{
+ int i;
+ int roams;
+
+ for (i = 0; i < tt_num_changes; i++) {
+ if ((tt_change + i)->flags & BATADV_TT_CLIENT_DEL) {
+ roams = (tt_change + i)->flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_global_del(bat_priv, orig_node,
+ (tt_change + i)->addr,
+ ntohs((tt_change + i)->vid),
+ "tt removed by changes",
+ roams);
+ } else {
+ if (!batadv_tt_global_add(bat_priv, orig_node,
+ (tt_change + i)->addr,
+ ntohs((tt_change + i)->vid),
+ (tt_change + i)->flags, ttvn))
+ /* In case of problem while storing a
+ * global_entry, we stop the updating
+ * procedure without committing the
+ * ttvn change. This will avoid to send
+ * corrupted data on tt_request
+ */
+ return;
+ }
+ }
+ orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT;
+}
+
+static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_change *tt_change,
+ uint8_t ttvn, uint8_t *resp_src,
+ uint16_t num_entries)
+{
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_orig_hash_find(bat_priv, resp_src);
+ if (!orig_node)
+ goto out;
+
+ /* Purge the old table first.. */
+ batadv_tt_global_del_orig(bat_priv, orig_node, -1,
+ "Received full table");
+
+ _batadv_tt_update_changes(bat_priv, orig_node, tt_change, num_entries,
+ ttvn);
+
+ spin_lock_bh(&orig_node->tt_buff_lock);
+ kfree(orig_node->tt_buff);
+ orig_node->tt_buff_len = 0;
+ orig_node->tt_buff = NULL;
+ spin_unlock_bh(&orig_node->tt_buff_lock);
+
+ atomic_set(&orig_node->last_ttvn, ttvn);
+
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+}
+
+static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ uint16_t tt_num_changes, uint8_t ttvn,
+ struct batadv_tvlv_tt_change *tt_change)
+{
+ _batadv_tt_update_changes(bat_priv, orig_node, tt_change,
+ tt_num_changes, ttvn);
+
+ batadv_tt_save_orig_buffer(bat_priv, orig_node, tt_change,
+ batadv_tt_len(tt_num_changes));
+ atomic_set(&orig_node->last_ttvn, ttvn);
+}
+
+/**
+ * batadv_is_my_client - check if a client is served by the local node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the client to check
+ * @vid: VLAN identifier
+ *
+ * Returns true if the client is served by this node, false otherwise.
+ */
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr,
+ unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+ /* Check if the client has been logically deleted (but is kept for
+ * consistency purpose)
+ */
+ if ((tt_local_entry->common.flags & BATADV_TT_CLIENT_PENDING) ||
+ (tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM))
+ goto out;
+ ret = true;
+out:
+ if (tt_local_entry)
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+ return ret;
+}
+
+/**
+ * batadv_handle_tt_response - process incoming tt reply
+ * @bat_priv: the bat priv with all the soft interface information
+ * @tt_data: tt data containing the tt request information
+ * @resp_src: mac address of tt reply sender
+ * @num_entries: number of tt change entries appended to the tt data
+ */
+static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
+ struct batadv_tvlv_tt_data *tt_data,
+ uint8_t *resp_src, uint16_t num_entries)
+{
+ struct batadv_tt_req_node *node, *safe;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_tvlv_tt_change *tt_change;
+ uint8_t *tvlv_ptr = (uint8_t *)tt_data;
+ uint16_t change_offset;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n",
+ resp_src, tt_data->ttvn, num_entries,
+ (tt_data->flags & BATADV_TT_FULL_TABLE ? 'F' : '.'));
+
+ orig_node = batadv_orig_hash_find(bat_priv, resp_src);
+ if (!orig_node)
+ goto out;
+
+ spin_lock_bh(&orig_node->tt_lock);
+
+ change_offset = sizeof(struct batadv_tvlv_tt_vlan_data);
+ change_offset *= ntohs(tt_data->num_vlan);
+ change_offset += sizeof(*tt_data);
+ tvlv_ptr += change_offset;
+
+ tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr;
+ if (tt_data->flags & BATADV_TT_FULL_TABLE) {
+ batadv_tt_fill_gtable(bat_priv, tt_change, tt_data->ttvn,
+ resp_src, num_entries);
+ } else {
+ batadv_tt_update_changes(bat_priv, orig_node, num_entries,
+ tt_data->ttvn, tt_change);
+ }
+
+ /* Recalculate the CRC for this orig_node and store it */
+ batadv_tt_global_update_crc(bat_priv, orig_node);
+
+ spin_unlock_bh(&orig_node->tt_lock);
+
+ /* Delete the tt_req_node from pending tt_requests list */
+ spin_lock_bh(&bat_priv->tt.req_list_lock);
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) {
+ if (!batadv_compare_eth(node->addr, resp_src))
+ continue;
+ list_del(&node->list);
+ kfree(node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.req_list_lock);
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+}
+
+static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_roam_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
+ list_del(&node->list);
+ kfree(node);
+ }
+
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+}
+
+static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
+{
+ struct batadv_tt_roam_node *node, *safe;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+ list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) {
+ if (!batadv_has_timed_out(node->first_time,
+ BATADV_ROAMING_MAX_TIME))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ }
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+}
+
+/* This function checks whether the client already reached the
+ * maximum number of possible roaming phases. In this case the ROAMING_ADV
+ * will not be sent.
+ *
+ * returns true if the ROAMING_ADV can be sent, false otherwise
+ */
+static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv,
+ uint8_t *client)
+{
+ struct batadv_tt_roam_node *tt_roam_node;
+ bool ret = false;
+
+ spin_lock_bh(&bat_priv->tt.roam_list_lock);
+ /* The new tt_req will be issued only if I'm not waiting for a
+ * reply from the same orig_node yet
+ */
+ list_for_each_entry(tt_roam_node, &bat_priv->tt.roam_list, list) {
+ if (!batadv_compare_eth(tt_roam_node->addr, client))
+ continue;
+
+ if (batadv_has_timed_out(tt_roam_node->first_time,
+ BATADV_ROAMING_MAX_TIME))
+ continue;
+
+ if (!batadv_atomic_dec_not_zero(&tt_roam_node->counter))
+ /* Sorry, you roamed too many times! */
+ goto unlock;
+ ret = true;
+ break;
+ }
+
+ if (!ret) {
+ tt_roam_node = kmalloc(sizeof(*tt_roam_node), GFP_ATOMIC);
+ if (!tt_roam_node)
+ goto unlock;
+
+ tt_roam_node->first_time = jiffies;
+ atomic_set(&tt_roam_node->counter,
+ BATADV_ROAMING_MAX_COUNT - 1);
+ ether_addr_copy(tt_roam_node->addr, client);
+
+ list_add(&tt_roam_node->list, &bat_priv->tt.roam_list);
+ ret = true;
+ }
+
+unlock:
+ spin_unlock_bh(&bat_priv->tt.roam_list_lock);
+ return ret;
+}
+
+/**
+ * batadv_send_roam_adv - send a roaming advertisement message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @client: mac address of the roaming client
+ * @vid: VLAN identifier
+ * @orig_node: message destination
+ *
+ * Send a ROAMING_ADV message to the node which was previously serving this
+ * client. This is done to inform the node that from now on all traffic destined
+ * for this particular roamed client has to be forwarded to the sender of the
+ * roaming message.
+ */
+static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node)
+{
+ struct batadv_hard_iface *primary_if;
+ struct batadv_tvlv_roam_adv tvlv_roam;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ /* before going on we have to check whether the client has
+ * already roamed to us too many times
+ */
+ if (!batadv_tt_check_roam_count(bat_priv, client))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n",
+ orig_node->orig, client, BATADV_PRINT_VID(vid));
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX);
+
+ memcpy(tvlv_roam.client, client, sizeof(tvlv_roam.client));
+ tvlv_roam.vid = htons(vid);
+
+ batadv_tvlv_unicast_send(bat_priv, primary_if->net_dev->dev_addr,
+ orig_node->orig, BATADV_TVLV_ROAM, 1,
+ &tvlv_roam, sizeof(tvlv_roam));
+
+out:
+ if (primary_if)
+ batadv_hardif_free_ref(primary_if);
+}
+
+static void batadv_tt_purge(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct batadv_priv_tt *priv_tt;
+ struct batadv_priv *bat_priv;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ priv_tt = container_of(delayed_work, struct batadv_priv_tt, work);
+ bat_priv = container_of(priv_tt, struct batadv_priv, tt);
+
+ batadv_tt_local_purge(bat_priv, BATADV_TT_LOCAL_TIMEOUT);
+ batadv_tt_global_purge(bat_priv);
+ batadv_tt_req_purge(bat_priv);
+ batadv_tt_roam_purge(bat_priv);
+
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work,
+ msecs_to_jiffies(BATADV_TT_WORK_PERIOD));
+}
+
+void batadv_tt_free(struct batadv_priv *bat_priv)
+{
+ batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1);
+
+ cancel_delayed_work_sync(&bat_priv->tt.work);
+
+ batadv_tt_local_table_free(bat_priv);
+ batadv_tt_global_table_free(bat_priv);
+ batadv_tt_req_list_free(bat_priv);
+ batadv_tt_changes_list_free(bat_priv);
+ batadv_tt_roam_list_free(bat_priv);
+
+ kfree(bat_priv->tt.last_changeset);
+}
+
+/**
+ * batadv_tt_local_set_flags - set or unset the specified flags on the local
+ * table and possibly count them in the TT size
+ * @bat_priv: the bat priv with all the soft interface information
+ * @flags: the flag to switch
+ * @enable: whether to set or unset the flag
+ * @count: whether to increase the TT size by the number of changed entries
+ */
+static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv,
+ uint16_t flags, bool enable, bool count)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common_entry;
+ uint16_t changed_num = 0;
+ struct hlist_head *head;
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tt_common_entry,
+ head, hash_entry) {
+ if (enable) {
+ if ((tt_common_entry->flags & flags) == flags)
+ continue;
+ tt_common_entry->flags |= flags;
+ } else {
+ if (!(tt_common_entry->flags & flags))
+ continue;
+ tt_common_entry->flags &= ~flags;
+ }
+ changed_num++;
+
+ if (!count)
+ continue;
+
+ batadv_tt_local_size_inc(bat_priv,
+ tt_common_entry->vid);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/* Purge out all the tt local entries marked with BATADV_TT_CLIENT_PENDING */
+static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
+{
+ struct batadv_hashtable *hash = bat_priv->tt.local_hash;
+ struct batadv_tt_common_entry *tt_common;
+ struct batadv_tt_local_entry *tt_local;
+ struct batadv_softif_vlan *vlan;
+ struct hlist_node *node_tmp;
+ struct hlist_head *head;
+ spinlock_t *list_lock; /* protects write access to the hash lists */
+ uint32_t i;
+
+ if (!hash)
+ return;
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+ list_lock = &hash->list_locks[i];
+
+ spin_lock_bh(list_lock);
+ hlist_for_each_entry_safe(tt_common, node_tmp, head,
+ hash_entry) {
+ if (!(tt_common->flags & BATADV_TT_CLIENT_PENDING))
+ continue;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Deleting local tt entry (%pM, vid: %d): pending\n",
+ tt_common->addr,
+ BATADV_PRINT_VID(tt_common->vid));
+
+ batadv_tt_local_size_dec(bat_priv, tt_common->vid);
+ hlist_del_rcu(&tt_common->hash_entry);
+ tt_local = container_of(tt_common,
+ struct batadv_tt_local_entry,
+ common);
+
+ /* decrease the reference held for this vlan */
+ vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid);
+ batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_free_ref(vlan);
+
+ batadv_tt_local_entry_free_ref(tt_local);
+ }
+ spin_unlock_bh(list_lock);
+ }
+}
+
+/**
+ * batadv_tt_local_commit_changes_nolock - commit all pending local tt changes
+ * which have been queued in the time since the last commit
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Caller must hold tt->commit_lock.
+ */
+static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
+{
+ /* Update multicast addresses in local translation table */
+ batadv_mcast_mla_update(bat_priv);
+
+ if (atomic_read(&bat_priv->tt.local_changes) < 1) {
+ if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
+ batadv_tt_tvlv_container_update(bat_priv);
+ return;
+ }
+
+ batadv_tt_local_set_flags(bat_priv, BATADV_TT_CLIENT_NEW, false, true);
+
+ batadv_tt_local_purge_pending_clients(bat_priv);
+ batadv_tt_local_update_crc(bat_priv);
+
+ /* Increment the TTVN only once per OGM interval */
+ atomic_inc(&bat_priv->tt.vn);
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Local changes committed, updating to ttvn %u\n",
+ (uint8_t)atomic_read(&bat_priv->tt.vn));
+
+ /* reset the sending counter */
+ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX);
+ batadv_tt_tvlv_container_update(bat_priv);
+}
+
+/**
+ * batadv_tt_local_commit_changes - commit all pending local tt changes which
+ * have been queued in the time since the last commit
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
+{
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+ batadv_tt_local_commit_changes_nolock(bat_priv);
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+}
+
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src,
+ uint8_t *dst, unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry = NULL;
+ struct batadv_tt_global_entry *tt_global_entry = NULL;
+ struct batadv_softif_vlan *vlan;
+ bool ret = false;
+
+ vlan = batadv_softif_vlan_get(bat_priv, vid);
+ if (!vlan || !atomic_read(&vlan->ap_isolation))
+ goto out;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
+ if (!tt_local_entry)
+ goto out;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, src, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ if (!_batadv_is_ap_isolated(tt_local_entry, tt_global_entry))
+ goto out;
+
+ ret = true;
+
+out:
+ if (vlan)
+ batadv_softif_vlan_free_ref(vlan);
+ if (tt_global_entry)
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+ if (tt_local_entry)
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+ return ret;
+}
+
+/**
+ * batadv_tt_update_orig - update global translation table with new tt
+ * information received via ogms
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @tt_vlan: pointer to the first tvlv VLAN entry
+ * @tt_num_vlan: number of tvlv VLAN entries
+ * @tt_change: pointer to the first entry in the TT buffer
+ * @tt_num_changes: number of tt changes inside the tt buffer
+ * @ttvn: translation table version number of this changeset
+ * @tt_crc: crc32 checksum of orig node's translation table
+ */
+static void batadv_tt_update_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const void *tt_buff, uint16_t tt_num_vlan,
+ struct batadv_tvlv_tt_change *tt_change,
+ uint16_t tt_num_changes, uint8_t ttvn)
+{
+ uint8_t orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn);
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ bool full_table = true;
+ bool has_tt_init;
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff;
+ has_tt_init = orig_node->capa_initialized & BATADV_ORIG_CAPA_HAS_TT;
+
+ /* orig table not initialised AND first diff is in the OGM OR the ttvn
+ * increased by one -> we can apply the attached changes
+ */
+ if ((!has_tt_init && ttvn == 1) || ttvn - orig_ttvn == 1) {
+ /* the OGM could not contain the changes due to their size or
+ * because they have already been sent BATADV_TT_OGM_APPEND_MAX
+ * times.
+ * In this case send a tt request
+ */
+ if (!tt_num_changes) {
+ full_table = false;
+ goto request_table;
+ }
+
+ spin_lock_bh(&orig_node->tt_lock);
+
+ batadv_tt_update_changes(bat_priv, orig_node, tt_num_changes,
+ ttvn, tt_change);
+
+ /* Even if we received the precomputed crc with the OGM, we
+ * prefer to recompute it to spot any possible inconsistency
+ * in the global table
+ */
+ batadv_tt_global_update_crc(bat_priv, orig_node);
+
+ spin_unlock_bh(&orig_node->tt_lock);
+
+ /* The ttvn alone is not enough to guarantee consistency
+ * because a single value could represent different states
+ * (due to the wrap around). Thus a node has to check whether
+ * the resulting table (after applying the changes) is still
+ * consistent or not. E.g. a node could disconnect while its
+ * ttvn is X and reconnect on ttvn = X + TTVN_MAX: in this case
+ * checking the CRC value is mandatory to detect the
+ * inconsistency
+ */
+ if (!batadv_tt_global_check_crc(orig_node, tt_vlan,
+ tt_num_vlan))
+ goto request_table;
+ } else {
+ /* if we missed more than one change or our tables are not
+ * in sync anymore -> request fresh tt data
+ */
+ if (!has_tt_init || ttvn != orig_ttvn ||
+ !batadv_tt_global_check_crc(orig_node, tt_vlan,
+ tt_num_vlan)) {
+request_table:
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "TT inconsistency for %pM. Need to retrieve the correct information (ttvn: %u last_ttvn: %u num_changes: %u)\n",
+ orig_node->orig, ttvn, orig_ttvn,
+ tt_num_changes);
+ batadv_send_tt_request(bat_priv, orig_node, ttvn,
+ tt_vlan, tt_num_vlan,
+ full_table);
+ return;
+ }
+ }
+}
+
+/**
+ * batadv_tt_global_client_is_roaming - check if a client is marked as roaming
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the client to check
+ * @vid: VLAN identifier
+ *
+ * Returns true if we know that the client has moved from its old originator
+ * to another one. This entry is still kept for consistency purposes and will be
+ * deleted later by a DEL or because of timeout
+ */
+bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
+ uint8_t *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt_global_entry;
+ bool ret = false;
+
+ tt_global_entry = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global_entry)
+ goto out;
+
+ ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_global_entry_free_ref(tt_global_entry);
+out:
+ return ret;
+}
+
+/**
+ * batadv_tt_local_client_is_roaming - tells whether the client is roaming
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the local client to query
+ * @vid: VLAN identifier
+ *
+ * Returns true if the local client is known to be roaming (it is not served by
+ * this node anymore) or not. If yes, the client is still present in the table
+ * to keep the latter consistent with the node TTVN
+ */
+bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
+ uint8_t *addr, unsigned short vid)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+ bool ret = false;
+
+ tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
+ if (!tt_local_entry)
+ goto out;
+
+ ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM;
+ batadv_tt_local_entry_free_ref(tt_local_entry);
+out:
+ return ret;
+}
+
+bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid)
+{
+ bool ret = false;
+
+ if (!batadv_tt_global_add(bat_priv, orig_node, addr, vid,
+ BATADV_TT_CLIENT_TEMP,
+ atomic_read(&orig_node->last_ttvn)))
+ goto out;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n",
+ addr, BATADV_PRINT_VID(vid), orig_node->orig);
+ ret = true;
+out:
+ return ret;
+}
+
+/**
+ * batadv_tt_local_resize_to_mtu - resize the local translation table fit the
+ * maximum packet size that can be transported through the mesh
+ * @soft_iface: netdev struct of the mesh interface
+ *
+ * Remove entries older than 'timeout' and half timeout if more entries need
+ * to be removed.
+ */
+void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ int packet_size_max = atomic_read(&bat_priv->packet_size_max);
+ int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2;
+ bool reduced = false;
+
+ spin_lock_bh(&bat_priv->tt.commit_lock);
+
+ while (true) {
+ table_size = batadv_tt_local_table_transmit_size(bat_priv);
+ if (packet_size_max >= table_size)
+ break;
+
+ batadv_tt_local_purge(bat_priv, timeout);
+ batadv_tt_local_purge_pending_clients(bat_priv);
+
+ timeout /= 2;
+ reduced = true;
+ net_ratelimited_function(batadv_info, soft_iface,
+ "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n",
+ packet_size_max);
+ }
+
+ /* commit these changes immediately, to avoid synchronization problem
+ * with the TTVN
+ */
+ if (reduced)
+ batadv_tt_local_commit_changes_nolock(bat_priv);
+
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
+}
+
+/**
+ * batadv_tt_tvlv_ogm_handler_v1 - process incoming tt tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig: the orig_node of the ogm
+ * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
+ * @tvlv_value: tvlv buffer containing the gateway data
+ * @tvlv_value_len: tvlv buffer length
+ */
+static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags, void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_tt_vlan_data *tt_vlan;
+ struct batadv_tvlv_tt_change *tt_change;
+ struct batadv_tvlv_tt_data *tt_data;
+ uint16_t num_entries, num_vlan;
+
+ if (tvlv_value_len < sizeof(*tt_data))
+ return;
+
+ tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
+ tvlv_value_len -= sizeof(*tt_data);
+
+ num_vlan = ntohs(tt_data->num_vlan);
+
+ if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan)
+ return;
+
+ tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
+ tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan);
+ tvlv_value_len -= sizeof(*tt_vlan) * num_vlan;
+
+ num_entries = batadv_tt_entries(tvlv_value_len);
+
+ batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change,
+ num_entries, tt_data->ttvn);
+}
+
+/**
+ * batadv_tt_tvlv_unicast_handler_v1 - process incoming (unicast) tt tvlv
+ * container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: mac address of tt tvlv sender
+ * @dst: mac address of tt tvlv recipient
+ * @tvlv_value: tvlv buffer containing the tt data
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
+ * otherwise.
+ */
+static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_tt_data *tt_data;
+ uint16_t tt_vlan_len, tt_num_entries;
+ char tt_flag;
+ bool ret;
+
+ if (tvlv_value_len < sizeof(*tt_data))
+ return NET_RX_SUCCESS;
+
+ tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
+ tvlv_value_len -= sizeof(*tt_data);
+
+ tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data);
+ tt_vlan_len *= ntohs(tt_data->num_vlan);
+
+ if (tvlv_value_len < tt_vlan_len)
+ return NET_RX_SUCCESS;
+
+ tvlv_value_len -= tt_vlan_len;
+ tt_num_entries = batadv_tt_entries(tvlv_value_len);
+
+ switch (tt_data->flags & BATADV_TT_DATA_TYPE_MASK) {
+ case BATADV_TT_REQUEST:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_REQUEST_RX);
+
+ /* If this node cannot provide a TT response the tt_request is
+ * forwarded
+ */
+ ret = batadv_send_tt_response(bat_priv, tt_data, src, dst);
+ if (!ret) {
+ if (tt_data->flags & BATADV_TT_FULL_TABLE)
+ tt_flag = 'F';
+ else
+ tt_flag = '.';
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Routing TT_REQUEST to %pM [%c]\n",
+ dst, tt_flag);
+ /* tvlv API will re-route the packet */
+ return NET_RX_DROP;
+ }
+ break;
+ case BATADV_TT_RESPONSE:
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_RESPONSE_RX);
+
+ if (batadv_is_my_mac(bat_priv, dst)) {
+ batadv_handle_tt_response(bat_priv, tt_data,
+ src, tt_num_entries);
+ return NET_RX_SUCCESS;
+ }
+
+ if (tt_data->flags & BATADV_TT_FULL_TABLE)
+ tt_flag = 'F';
+ else
+ tt_flag = '.';
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Routing TT_RESPONSE to %pM [%c]\n", dst, tt_flag);
+
+ /* tvlv API will re-route the packet */
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_roam_tvlv_unicast_handler_v1 - process incoming tt roam tvlv container
+ * @bat_priv: the bat priv with all the soft interface information
+ * @src: mac address of tt tvlv sender
+ * @dst: mac address of tt tvlv recipient
+ * @tvlv_value: tvlv buffer containing the tt data
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
+ * otherwise.
+ */
+static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value,
+ uint16_t tvlv_value_len)
+{
+ struct batadv_tvlv_roam_adv *roaming_adv;
+ struct batadv_orig_node *orig_node = NULL;
+
+ /* If this node is not the intended recipient of the
+ * roaming advertisement the packet is forwarded
+ * (the tvlv API will re-route the packet).
+ */
+ if (!batadv_is_my_mac(bat_priv, dst))
+ return NET_RX_DROP;
+
+ if (tvlv_value_len < sizeof(*roaming_adv))
+ goto out;
+
+ orig_node = batadv_orig_hash_find(bat_priv, src);
+ if (!orig_node)
+ goto out;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX);
+ roaming_adv = (struct batadv_tvlv_roam_adv *)tvlv_value;
+
+ batadv_dbg(BATADV_DBG_TT, bat_priv,
+ "Received ROAMING_ADV from %pM (client %pM)\n",
+ src, roaming_adv->client);
+
+ batadv_tt_global_add(bat_priv, orig_node, roaming_adv->client,
+ ntohs(roaming_adv->vid), BATADV_TT_CLIENT_ROAM,
+ atomic_read(&orig_node->last_ttvn) + 1);
+
+out:
+ if (orig_node)
+ batadv_orig_node_free_ref(orig_node);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * batadv_tt_init - initialise the translation table internals
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return 0 on success or negative error number in case of failure.
+ */
+int batadv_tt_init(struct batadv_priv *bat_priv)
+{
+ int ret;
+
+ /* synchronized flags must be remote */
+ BUILD_BUG_ON(!(BATADV_TT_SYNC_MASK & BATADV_TT_REMOTE_MASK));
+
+ ret = batadv_tt_local_init(bat_priv);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_tt_global_init(bat_priv);
+ if (ret < 0)
+ return ret;
+
+ batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1,
+ batadv_tt_tvlv_unicast_handler_v1,
+ BATADV_TVLV_TT, 1, BATADV_NO_FLAGS);
+
+ batadv_tvlv_handler_register(bat_priv, NULL,
+ batadv_roam_tvlv_unicast_handler_v1,
+ BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS);
+
+ INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge);
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->tt.work,
+ msecs_to_jiffies(BATADV_TT_WORK_PERIOD));
+
+ return 1;
+}
+
+/**
+ * batadv_tt_global_is_isolated - check if a client is marked as isolated
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the mac address of the client
+ * @vid: the identifier of the VLAN where this client is connected
+ *
+ * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false
+ * otherwise
+ */
+bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid)
+{
+ struct batadv_tt_global_entry *tt;
+ bool ret;
+
+ tt = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt)
+ return false;
+
+ ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA;
+
+ batadv_tt_global_entry_free_ref(tt);
+
+ return ret;
+}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
new file mode 100644
index 000000000..ad84d7b89
--- /dev/null
+++ b/net/batman-adv/translation-table.h
@@ -0,0 +1,56 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich, Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+
+int batadv_tt_init(struct batadv_priv *bat_priv);
+bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr,
+ unsigned short vid, int ifindex, uint32_t mark);
+uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid,
+ const char *message, bool roaming);
+int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
+void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ int32_t match_vid, const char *message);
+int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid);
+struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
+ const uint8_t *src,
+ const uint8_t *addr,
+ unsigned short vid);
+void batadv_tt_free(struct batadv_priv *bat_priv);
+bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr,
+ unsigned short vid);
+bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src,
+ uint8_t *dst, unsigned short vid);
+void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv);
+bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
+ uint8_t *addr, unsigned short vid);
+bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
+ uint8_t *addr, unsigned short vid);
+void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface);
+bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ const unsigned char *addr,
+ unsigned short vid);
+bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
+ const uint8_t *addr, unsigned short vid);
+
+#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
new file mode 100644
index 000000000..9398c3fb4
--- /dev/null
+++ b/net/batman-adv/types.h
@@ -0,0 +1,1257 @@
+/* Copyright (C) 2007-2014 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NET_BATMAN_ADV_TYPES_H_
+#define _NET_BATMAN_ADV_TYPES_H_
+
+#include "packet.h"
+#include "bitarray.h"
+#include <linux/kernel.h>
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+
+/**
+ * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is
+ * changed, BATADV_DAT_ADDR_MAX is changed as well.
+ *
+ * *Please be careful: batadv_dat_addr_t must be UNSIGNED*
+ */
+#define batadv_dat_addr_t uint16_t
+
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+/**
+ * enum batadv_dhcp_recipient - dhcp destination
+ * @BATADV_DHCP_NO: packet is not a dhcp message
+ * @BATADV_DHCP_TO_SERVER: dhcp message is directed to a server
+ * @BATADV_DHCP_TO_CLIENT: dhcp message is directed to a client
+ */
+enum batadv_dhcp_recipient {
+ BATADV_DHCP_NO = 0,
+ BATADV_DHCP_TO_SERVER,
+ BATADV_DHCP_TO_CLIENT,
+};
+
+/**
+ * BATADV_TT_REMOTE_MASK - bitmask selecting the flags that are sent over the
+ * wire only
+ */
+#define BATADV_TT_REMOTE_MASK 0x00FF
+
+/**
+ * BATADV_TT_SYNC_MASK - bitmask of the flags that need to be kept in sync
+ * among the nodes. These flags are used to compute the global/local CRC
+ */
+#define BATADV_TT_SYNC_MASK 0x00F0
+
+/**
+ * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data
+ * @ogm_buff: buffer holding the OGM packet
+ * @ogm_buff_len: length of the OGM packet buffer
+ * @ogm_seqno: OGM sequence number - used to identify each OGM
+ */
+struct batadv_hard_iface_bat_iv {
+ unsigned char *ogm_buff;
+ int ogm_buff_len;
+ atomic_t ogm_seqno;
+};
+
+/**
+ * struct batadv_hard_iface - network device known to batman-adv
+ * @list: list node for batadv_hardif_list
+ * @if_num: identificator of the interface
+ * @if_status: status of the interface for batman-adv
+ * @net_dev: pointer to the net_device
+ * @num_bcasts: number of payload re-broadcasts on this interface (ARQ)
+ * @hardif_obj: kobject of the per interface sysfs "mesh" directory
+ * @refcount: number of contexts the object is used
+ * @batman_adv_ptype: packet type describing packets that should be processed by
+ * batman-adv for this interface
+ * @soft_iface: the batman-adv interface which uses this network interface
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @bat_iv: BATMAN IV specific per hard interface data
+ * @cleanup_work: work queue callback item for hard interface deinit
+ * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+ */
+struct batadv_hard_iface {
+ struct list_head list;
+ int16_t if_num;
+ char if_status;
+ struct net_device *net_dev;
+ uint8_t num_bcasts;
+ struct kobject *hardif_obj;
+ atomic_t refcount;
+ struct packet_type batman_adv_ptype;
+ struct net_device *soft_iface;
+ struct rcu_head rcu;
+ struct batadv_hard_iface_bat_iv bat_iv;
+ struct work_struct cleanup_work;
+ struct dentry *debug_dir;
+};
+
+/**
+ * struct batadv_orig_ifinfo - originator info per outgoing interface
+ * @list: list node for orig_node::ifinfo_list
+ * @if_outgoing: pointer to outgoing hard interface
+ * @router: router that should be used to reach this originator
+ * @last_real_seqno: last and best known sequence number
+ * @last_ttl: ttl of last received packet
+ * @batman_seqno_reset: time when the batman seqno window was reset
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_orig_ifinfo {
+ struct hlist_node list;
+ struct batadv_hard_iface *if_outgoing;
+ struct batadv_neigh_node __rcu *router; /* rcu protected pointer */
+ uint32_t last_real_seqno;
+ uint8_t last_ttl;
+ unsigned long batman_seqno_reset;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_frag_table_entry - head in the fragment buffer table
+ * @head: head of list with fragments
+ * @lock: lock to protect the list of fragments
+ * @timestamp: time (jiffie) of last received fragment
+ * @seqno: sequence number of the fragments in the list
+ * @size: accumulated size of packets in list
+ */
+struct batadv_frag_table_entry {
+ struct hlist_head head;
+ spinlock_t lock; /* protects head */
+ unsigned long timestamp;
+ uint16_t seqno;
+ uint16_t size;
+};
+
+/**
+ * struct batadv_frag_list_entry - entry in a list of fragments
+ * @list: list node information
+ * @skb: fragment
+ * @no: fragment number in the set
+ */
+struct batadv_frag_list_entry {
+ struct hlist_node list;
+ struct sk_buff *skb;
+ uint8_t no;
+};
+
+/**
+ * struct batadv_vlan_tt - VLAN specific TT attributes
+ * @crc: CRC32 checksum of the entries belonging to this vlan
+ * @num_entries: number of TT entries for this VLAN
+ */
+struct batadv_vlan_tt {
+ uint32_t crc;
+ atomic_t num_entries;
+};
+
+/**
+ * struct batadv_orig_node_vlan - VLAN specific data per orig_node
+ * @vid: the VLAN identifier
+ * @tt: VLAN specific TT attributes
+ * @list: list node for orig_node::vlan_list
+ * @refcount: number of context where this object is currently in use
+ * @rcu: struct used for freeing in a RCU-safe manner
+ */
+struct batadv_orig_node_vlan {
+ unsigned short vid;
+ struct batadv_vlan_tt tt;
+ struct list_head list;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
+ * @bcast_own: bitfield containing the number of our OGMs this orig_node
+ * rebroadcasted "back" to us (relative to last_real_seqno)
+ * @bcast_own_sum: counted result of bcast_own
+ * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
+ * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
+ */
+struct batadv_orig_bat_iv {
+ unsigned long *bcast_own;
+ uint8_t *bcast_own_sum;
+ /* ogm_cnt_lock protects: bcast_own, bcast_own_sum,
+ * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
+ */
+ spinlock_t ogm_cnt_lock;
+};
+
+/**
+ * struct batadv_orig_node - structure for orig_list maintaining nodes of mesh
+ * @orig: originator ethernet address
+ * @ifinfo_list: list for routers per outgoing interface
+ * @last_bonding_candidate: pointer to last ifinfo of last used router
+ * @batadv_dat_addr_t: address of the orig node in the distributed hash
+ * @last_seen: time when last packet from this node was received
+ * @bcast_seqno_reset: time when the broadcast seqno window was reset
+ * @mcast_flags: multicast flags announced by the orig node
+ * @mcast_want_all_unsnoop_node: a list node for the
+ * mcast.want_all_unsnoopables list
+ * @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list
+ * @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list
+ * @capabilities: announced capabilities of this originator
+ * @capa_initialized: bitfield to remember whether a capability was initialized
+ * @last_ttvn: last seen translation table version number
+ * @tt_buff: last tt changeset this node received from the orig node
+ * @tt_buff_len: length of the last tt changeset this node received from the
+ * orig node
+ * @tt_buff_lock: lock that protects tt_buff and tt_buff_len
+ * @tt_lock: prevents from updating the table while reading it. Table update is
+ * made up by two operations (data structure update and metdata -CRC/TTVN-
+ * recalculation) and they have to be executed atomically in order to avoid
+ * another thread to read the table/metadata between those.
+ * @bcast_bits: bitfield containing the info which payload broadcast originated
+ * from this orig node this host already has seen (relative to
+ * last_bcast_seqno)
+ * @last_bcast_seqno: last broadcast sequence number received by this host
+ * @neigh_list: list of potential next hop neighbor towards this orig node
+ * @neigh_list_lock: lock protecting neigh_list and router
+ * @hash_entry: hlist node for batadv_priv::orig_hash
+ * @bat_priv: pointer to soft_iface this orig node belongs to
+ * @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @in_coding_list: list of nodes this orig can hear
+ * @out_coding_list: list of nodes that can hear this orig
+ * @in_coding_list_lock: protects in_coding_list
+ * @out_coding_list_lock: protects out_coding_list
+ * @fragments: array with heads for fragment chains
+ * @vlan_list: a list of orig_node_vlan structs, one per VLAN served by the
+ * originator represented by this object
+ * @vlan_list_lock: lock protecting vlan_list
+ * @bat_iv: B.A.T.M.A.N. IV private structure
+ */
+struct batadv_orig_node {
+ uint8_t orig[ETH_ALEN];
+ struct hlist_head ifinfo_list;
+ struct batadv_orig_ifinfo *last_bonding_candidate;
+#ifdef CONFIG_BATMAN_ADV_DAT
+ batadv_dat_addr_t dat_addr;
+#endif
+ unsigned long last_seen;
+ unsigned long bcast_seqno_reset;
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ uint8_t mcast_flags;
+ struct hlist_node mcast_want_all_unsnoopables_node;
+ struct hlist_node mcast_want_all_ipv4_node;
+ struct hlist_node mcast_want_all_ipv6_node;
+#endif
+ uint8_t capabilities;
+ uint8_t capa_initialized;
+ atomic_t last_ttvn;
+ unsigned char *tt_buff;
+ int16_t tt_buff_len;
+ spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */
+ /* prevents from changing the table while reading it */
+ spinlock_t tt_lock;
+ DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ uint32_t last_bcast_seqno;
+ struct hlist_head neigh_list;
+ /* neigh_list_lock protects: neigh_list and router */
+ spinlock_t neigh_list_lock;
+ struct hlist_node hash_entry;
+ struct batadv_priv *bat_priv;
+ /* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */
+ spinlock_t bcast_seqno_lock;
+ atomic_t refcount;
+ struct rcu_head rcu;
+#ifdef CONFIG_BATMAN_ADV_NC
+ struct list_head in_coding_list;
+ struct list_head out_coding_list;
+ spinlock_t in_coding_list_lock; /* Protects in_coding_list */
+ spinlock_t out_coding_list_lock; /* Protects out_coding_list */
+#endif
+ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
+ struct list_head vlan_list;
+ spinlock_t vlan_list_lock; /* protects vlan_list */
+ struct batadv_orig_bat_iv bat_iv;
+};
+
+/**
+ * enum batadv_orig_capabilities - orig node capabilities
+ * @BATADV_ORIG_CAPA_HAS_DAT: orig node has distributed arp table enabled
+ * @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled
+ * @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability
+ * @BATADV_ORIG_CAPA_HAS_MCAST: orig node has some multicast capability
+ * (= orig node announces a tvlv of type BATADV_TVLV_MCAST)
+ */
+enum batadv_orig_capabilities {
+ BATADV_ORIG_CAPA_HAS_DAT = BIT(0),
+ BATADV_ORIG_CAPA_HAS_NC = BIT(1),
+ BATADV_ORIG_CAPA_HAS_TT = BIT(2),
+ BATADV_ORIG_CAPA_HAS_MCAST = BIT(3),
+};
+
+/**
+ * struct batadv_gw_node - structure for orig nodes announcing gw capabilities
+ * @list: list node for batadv_priv_gw::list
+ * @orig_node: pointer to corresponding orig node
+ * @bandwidth_down: advertised uplink download bandwidth
+ * @bandwidth_up: advertised uplink upload bandwidth
+ * @deleted: this struct is scheduled for deletion
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_gw_node {
+ struct hlist_node list;
+ struct batadv_orig_node *orig_node;
+ uint32_t bandwidth_down;
+ uint32_t bandwidth_up;
+ unsigned long deleted;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_neigh_node - structure for single hops neighbors
+ * @list: list node for batadv_orig_node::neigh_list
+ * @orig_node: pointer to corresponding orig_node
+ * @addr: the MAC address of the neighboring interface
+ * @ifinfo_list: list for routing metrics per outgoing interface
+ * @ifinfo_lock: lock protecting private ifinfo members and list
+ * @if_incoming: pointer to incoming hard interface
+ * @last_seen: when last packet via this neighbor was received
+ * @last_ttl: last received ttl from this neigh node
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @bat_iv: B.A.T.M.A.N. IV private structure
+ */
+struct batadv_neigh_node {
+ struct hlist_node list;
+ struct batadv_orig_node *orig_node;
+ uint8_t addr[ETH_ALEN];
+ struct hlist_head ifinfo_list;
+ spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */
+ struct batadv_hard_iface *if_incoming;
+ unsigned long last_seen;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing
+ * interface for BATMAN IV
+ * @tq_recv: ring buffer of received TQ values from this neigh node
+ * @tq_index: ring buffer index
+ * @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
+ * @real_bits: bitfield containing the number of OGMs received from this neigh
+ * node (relative to orig_node->last_real_seqno)
+ * @real_packet_count: counted result of real_bits
+ */
+struct batadv_neigh_ifinfo_bat_iv {
+ uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE];
+ uint8_t tq_index;
+ uint8_t tq_avg;
+ DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE);
+ uint8_t real_packet_count;
+};
+
+/**
+ * struct batadv_neigh_ifinfo - neighbor information per outgoing interface
+ * @list: list node for batadv_neigh_node::ifinfo_list
+ * @if_outgoing: pointer to outgoing hard interface
+ * @bat_iv: B.A.T.M.A.N. IV private structure
+ * @last_ttl: last received ttl from this neigh node
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in a RCU-safe manner
+ */
+struct batadv_neigh_ifinfo {
+ struct hlist_node list;
+ struct batadv_hard_iface *if_outgoing;
+ struct batadv_neigh_ifinfo_bat_iv bat_iv;
+ uint8_t last_ttl;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression
+ * @orig[ETH_ALEN]: mac address of orig node orginating the broadcast
+ * @crc: crc32 checksum of broadcast payload
+ * @entrytime: time when the broadcast packet was received
+ */
+#ifdef CONFIG_BATMAN_ADV_BLA
+struct batadv_bcast_duplist_entry {
+ uint8_t orig[ETH_ALEN];
+ __be32 crc;
+ unsigned long entrytime;
+};
+#endif
+
+/**
+ * enum batadv_counters - indices for traffic counters
+ * @BATADV_CNT_TX: transmitted payload traffic packet counter
+ * @BATADV_CNT_TX_BYTES: transmitted payload traffic bytes counter
+ * @BATADV_CNT_TX_DROPPED: dropped transmission payload traffic packet counter
+ * @BATADV_CNT_RX: received payload traffic packet counter
+ * @BATADV_CNT_RX_BYTES: received payload traffic bytes counter
+ * @BATADV_CNT_FORWARD: forwarded payload traffic packet counter
+ * @BATADV_CNT_FORWARD_BYTES: forwarded payload traffic bytes counter
+ * @BATADV_CNT_MGMT_TX: transmitted routing protocol traffic packet counter
+ * @BATADV_CNT_MGMT_TX_BYTES: transmitted routing protocol traffic bytes counter
+ * @BATADV_CNT_MGMT_RX: received routing protocol traffic packet counter
+ * @BATADV_CNT_MGMT_RX_BYTES: received routing protocol traffic bytes counter
+ * @BATADV_CNT_FRAG_TX: transmitted fragment traffic packet counter
+ * @BATADV_CNT_FRAG_TX_BYTES: transmitted fragment traffic bytes counter
+ * @BATADV_CNT_FRAG_RX: received fragment traffic packet counter
+ * @BATADV_CNT_FRAG_RX_BYTES: received fragment traffic bytes counter
+ * @BATADV_CNT_FRAG_FWD: forwarded fragment traffic packet counter
+ * @BATADV_CNT_FRAG_FWD_BYTES: forwarded fragment traffic bytes counter
+ * @BATADV_CNT_TT_REQUEST_TX: transmitted tt req traffic packet counter
+ * @BATADV_CNT_TT_REQUEST_RX: received tt req traffic packet counter
+ * @BATADV_CNT_TT_RESPONSE_TX: transmitted tt resp traffic packet counter
+ * @BATADV_CNT_TT_RESPONSE_RX: received tt resp traffic packet counter
+ * @BATADV_CNT_TT_ROAM_ADV_TX: transmitted tt roam traffic packet counter
+ * @BATADV_CNT_TT_ROAM_ADV_RX: received tt roam traffic packet counter
+ * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
+ * @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter
+ * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
+ * @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter
+ * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic packet
+ * counter
+ * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
+ * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes counter
+ * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet counter
+ * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes counter
+ * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc decoding
+ * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
+ * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes counter
+ * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic packet
+ * counter
+ * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in promisc
+ * mode.
+ * @BATADV_CNT_NUM: number of traffic counters
+ */
+enum batadv_counters {
+ BATADV_CNT_TX,
+ BATADV_CNT_TX_BYTES,
+ BATADV_CNT_TX_DROPPED,
+ BATADV_CNT_RX,
+ BATADV_CNT_RX_BYTES,
+ BATADV_CNT_FORWARD,
+ BATADV_CNT_FORWARD_BYTES,
+ BATADV_CNT_MGMT_TX,
+ BATADV_CNT_MGMT_TX_BYTES,
+ BATADV_CNT_MGMT_RX,
+ BATADV_CNT_MGMT_RX_BYTES,
+ BATADV_CNT_FRAG_TX,
+ BATADV_CNT_FRAG_TX_BYTES,
+ BATADV_CNT_FRAG_RX,
+ BATADV_CNT_FRAG_RX_BYTES,
+ BATADV_CNT_FRAG_FWD,
+ BATADV_CNT_FRAG_FWD_BYTES,
+ BATADV_CNT_TT_REQUEST_TX,
+ BATADV_CNT_TT_REQUEST_RX,
+ BATADV_CNT_TT_RESPONSE_TX,
+ BATADV_CNT_TT_RESPONSE_RX,
+ BATADV_CNT_TT_ROAM_ADV_TX,
+ BATADV_CNT_TT_ROAM_ADV_RX,
+#ifdef CONFIG_BATMAN_ADV_DAT
+ BATADV_CNT_DAT_GET_TX,
+ BATADV_CNT_DAT_GET_RX,
+ BATADV_CNT_DAT_PUT_TX,
+ BATADV_CNT_DAT_PUT_RX,
+ BATADV_CNT_DAT_CACHED_REPLY_TX,
+#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ BATADV_CNT_NC_CODE,
+ BATADV_CNT_NC_CODE_BYTES,
+ BATADV_CNT_NC_RECODE,
+ BATADV_CNT_NC_RECODE_BYTES,
+ BATADV_CNT_NC_BUFFER,
+ BATADV_CNT_NC_DECODE,
+ BATADV_CNT_NC_DECODE_BYTES,
+ BATADV_CNT_NC_DECODE_FAILED,
+ BATADV_CNT_NC_SNIFFED,
+#endif
+ BATADV_CNT_NUM,
+};
+
+/**
+ * struct batadv_priv_tt - per mesh interface translation table data
+ * @vn: translation table version number
+ * @ogm_append_cnt: counter of number of OGMs containing the local tt diff
+ * @local_changes: changes registered in an originator interval
+ * @changes_list: tracks tt local changes within an originator interval
+ * @local_hash: local translation table hash table
+ * @global_hash: global translation table hash table
+ * @req_list: list of pending & unanswered tt_requests
+ * @roam_list: list of the last roaming events of each client limiting the
+ * number of roaming events to avoid route flapping
+ * @changes_list_lock: lock protecting changes_list
+ * @req_list_lock: lock protecting req_list
+ * @roam_list_lock: lock protecting roam_list
+ * @last_changeset: last tt changeset this host has generated
+ * @last_changeset_len: length of last tt changeset this host has generated
+ * @last_changeset_lock: lock protecting last_changeset & last_changeset_len
+ * @commit_lock: prevents from executing a local TT commit while reading the
+ * local table. The local TT commit is made up by two operations (data
+ * structure update and metdata -CRC/TTVN- recalculation) and they have to be
+ * executed atomically in order to avoid another thread to read the
+ * table/metadata between those.
+ * @work: work queue callback item for translation table purging
+ */
+struct batadv_priv_tt {
+ atomic_t vn;
+ atomic_t ogm_append_cnt;
+ atomic_t local_changes;
+ struct list_head changes_list;
+ struct batadv_hashtable *local_hash;
+ struct batadv_hashtable *global_hash;
+ struct list_head req_list;
+ struct list_head roam_list;
+ spinlock_t changes_list_lock; /* protects changes */
+ spinlock_t req_list_lock; /* protects req_list */
+ spinlock_t roam_list_lock; /* protects roam_list */
+ unsigned char *last_changeset;
+ int16_t last_changeset_len;
+ /* protects last_changeset & last_changeset_len */
+ spinlock_t last_changeset_lock;
+ /* prevents from executing a commit while reading the table */
+ spinlock_t commit_lock;
+ struct delayed_work work;
+};
+
+/**
+ * struct batadv_priv_bla - per mesh interface bridge loope avoidance data
+ * @num_requests; number of bla requests in flight
+ * @claim_hash: hash table containing mesh nodes this host has claimed
+ * @backbone_hash: hash table containing all detected backbone gateways
+ * @bcast_duplist: recently received broadcast packets array (for broadcast
+ * duplicate suppression)
+ * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist
+ * @bcast_duplist_lock: lock protecting bcast_duplist & bcast_duplist_curr
+ * @claim_dest: local claim data (e.g. claim group)
+ * @work: work queue callback item for cleanups & bla announcements
+ */
+#ifdef CONFIG_BATMAN_ADV_BLA
+struct batadv_priv_bla {
+ atomic_t num_requests;
+ struct batadv_hashtable *claim_hash;
+ struct batadv_hashtable *backbone_hash;
+ struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE];
+ int bcast_duplist_curr;
+ /* protects bcast_duplist & bcast_duplist_curr */
+ spinlock_t bcast_duplist_lock;
+ struct batadv_bla_claim_dst claim_dest;
+ struct delayed_work work;
+};
+#endif
+
+/**
+ * struct batadv_priv_debug_log - debug logging data
+ * @log_buff: buffer holding the logs (ring bufer)
+ * @log_start: index of next character to read
+ * @log_end: index of next character to write
+ * @lock: lock protecting log_buff, log_start & log_end
+ * @queue_wait: log reader's wait queue
+ */
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+struct batadv_priv_debug_log {
+ char log_buff[BATADV_LOG_BUF_LEN];
+ unsigned long log_start;
+ unsigned long log_end;
+ spinlock_t lock; /* protects log_buff, log_start and log_end */
+ wait_queue_head_t queue_wait;
+};
+#endif
+
+/**
+ * struct batadv_priv_gw - per mesh interface gateway data
+ * @list: list of available gateway nodes
+ * @list_lock: lock protecting gw_list & curr_gw
+ * @curr_gw: pointer to currently selected gateway node
+ * @bandwidth_down: advertised uplink download bandwidth (if gw_mode server)
+ * @bandwidth_up: advertised uplink upload bandwidth (if gw_mode server)
+ * @reselect: bool indicating a gateway re-selection is in progress
+ */
+struct batadv_priv_gw {
+ struct hlist_head list;
+ spinlock_t list_lock; /* protects gw_list & curr_gw */
+ struct batadv_gw_node __rcu *curr_gw; /* rcu protected pointer */
+ atomic_t bandwidth_down;
+ atomic_t bandwidth_up;
+ atomic_t reselect;
+};
+
+/**
+ * struct batadv_priv_tvlv - per mesh interface tvlv data
+ * @container_list: list of registered tvlv containers to be sent with each OGM
+ * @handler_list: list of the various tvlv content handlers
+ * @container_list_lock: protects tvlv container list access
+ * @handler_list_lock: protects handler list access
+ */
+struct batadv_priv_tvlv {
+ struct hlist_head container_list;
+ struct hlist_head handler_list;
+ spinlock_t container_list_lock; /* protects container_list */
+ spinlock_t handler_list_lock; /* protects handler_list */
+};
+
+/**
+ * struct batadv_priv_dat - per mesh interface DAT private data
+ * @addr: node DAT address
+ * @hash: hashtable representing the local ARP cache
+ * @work: work queue callback item for cache purging
+ */
+#ifdef CONFIG_BATMAN_ADV_DAT
+struct batadv_priv_dat {
+ batadv_dat_addr_t addr;
+ struct batadv_hashtable *hash;
+ struct delayed_work work;
+};
+#endif
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+/**
+ * struct batadv_priv_mcast - per mesh interface mcast data
+ * @mla_list: list of multicast addresses we are currently announcing via TT
+ * @want_all_unsnoopables_list: a list of orig_nodes wanting all unsnoopable
+ * multicast traffic
+ * @want_all_ipv4_list: a list of orig_nodes wanting all IPv4 multicast traffic
+ * @want_all_ipv6_list: a list of orig_nodes wanting all IPv6 multicast traffic
+ * @flags: the flags we have last sent in our mcast tvlv
+ * @enabled: whether the multicast tvlv is currently enabled
+ * @num_disabled: number of nodes that have no mcast tvlv
+ * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP traffic
+ * @num_want_all_ipv4: counter for items in want_all_ipv4_list
+ * @num_want_all_ipv6: counter for items in want_all_ipv6_list
+ * @want_lists_lock: lock for protecting modifications to mcast want lists
+ * (traversals are rcu-locked)
+ */
+struct batadv_priv_mcast {
+ struct hlist_head mla_list;
+ struct hlist_head want_all_unsnoopables_list;
+ struct hlist_head want_all_ipv4_list;
+ struct hlist_head want_all_ipv6_list;
+ uint8_t flags;
+ bool enabled;
+ atomic_t num_disabled;
+ atomic_t num_want_all_unsnoopables;
+ atomic_t num_want_all_ipv4;
+ atomic_t num_want_all_ipv6;
+ /* protects want_all_{unsnoopables,ipv4,ipv6}_list */
+ spinlock_t want_lists_lock;
+};
+#endif
+
+/**
+ * struct batadv_priv_nc - per mesh interface network coding private data
+ * @work: work queue callback item for cleanup
+ * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+ * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
+ * @max_fwd_delay: maximum packet forward delay to allow coding of packets
+ * @max_buffer_time: buffer time for sniffed packets used to decoding
+ * @timestamp_fwd_flush: timestamp of last forward packet queue flush
+ * @timestamp_sniffed_purge: timestamp of last sniffed packet queue purge
+ * @coding_hash: Hash table used to buffer skbs while waiting for another
+ * incoming skb to code it with. Skbs are added to the buffer just before being
+ * forwarded in routing.c
+ * @decoding_hash: Hash table used to buffer skbs that might be needed to decode
+ * a received coded skb. The buffer is used for 1) skbs arriving on the
+ * soft-interface; 2) skbs overheard on the hard-interface; and 3) skbs
+ * forwarded by batman-adv.
+ */
+struct batadv_priv_nc {
+ struct delayed_work work;
+ struct dentry *debug_dir;
+ u8 min_tq;
+ u32 max_fwd_delay;
+ u32 max_buffer_time;
+ unsigned long timestamp_fwd_flush;
+ unsigned long timestamp_sniffed_purge;
+ struct batadv_hashtable *coding_hash;
+ struct batadv_hashtable *decoding_hash;
+};
+
+/**
+ * struct batadv_softif_vlan - per VLAN attributes set
+ * @bat_priv: pointer to the mesh object
+ * @vid: VLAN identifier
+ * @kobj: kobject for sysfs vlan subdirectory
+ * @ap_isolation: AP isolation state
+ * @tt: TT private attributes (VLAN specific)
+ * @list: list node for bat_priv::softif_vlan_list
+ * @refcount: number of context where this object is currently in use
+ * @rcu: struct used for freeing in a RCU-safe manner
+ */
+struct batadv_softif_vlan {
+ struct batadv_priv *bat_priv;
+ unsigned short vid;
+ struct kobject *kobj;
+ atomic_t ap_isolation; /* boolean */
+ struct batadv_vlan_tt tt;
+ struct hlist_node list;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_priv - per mesh interface data
+ * @mesh_state: current status of the mesh (inactive/active/deactivating)
+ * @soft_iface: net device which holds this struct as private data
+ * @stats: structure holding the data for the ndo_get_stats() call
+ * @bat_counters: mesh internal traffic statistic counters (see batadv_counters)
+ * @aggregated_ogms: bool indicating whether OGM aggregation is enabled
+ * @bonding: bool indicating whether traffic bonding is enabled
+ * @fragmentation: bool indicating whether traffic fragmentation is enabled
+ * @packet_size_max: max packet size that can be transmitted via
+ * multiple fragmented skbs or a single frame if fragmentation is disabled
+ * @frag_seqno: incremental counter to identify chains of egress fragments
+ * @bridge_loop_avoidance: bool indicating whether bridge loop avoidance is
+ * enabled
+ * @distributed_arp_table: bool indicating whether distributed ARP table is
+ * enabled
+ * @multicast_mode: Enable or disable multicast optimizations on this node's
+ * sender/originating side
+ * @gw_mode: gateway operation: off, client or server (see batadv_gw_modes)
+ * @gw_sel_class: gateway selection class (applies if gw_mode client)
+ * @orig_interval: OGM broadcast interval in milliseconds
+ * @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop
+ * @log_level: configured log level (see batadv_dbg_level)
+ * @bcast_seqno: last sent broadcast packet sequence number
+ * @bcast_queue_left: number of remaining buffered broadcast packet slots
+ * @batman_queue_left: number of remaining OGM packet slots
+ * @num_ifaces: number of interfaces assigned to this mesh interface
+ * @mesh_obj: kobject for sysfs mesh subdirectory
+ * @debug_dir: dentry for debugfs batman-adv subdirectory
+ * @forw_bat_list: list of aggregated OGMs that will be forwarded
+ * @forw_bcast_list: list of broadcast packets that will be rebroadcasted
+ * @orig_hash: hash table containing mesh participants (orig nodes)
+ * @forw_bat_list_lock: lock protecting forw_bat_list
+ * @forw_bcast_list_lock: lock protecting forw_bcast_list
+ * @orig_work: work queue callback item for orig node purging
+ * @cleanup_work: work queue callback item for soft interface deinit
+ * @primary_if: one of the hard interfaces assigned to this mesh interface
+ * becomes the primary interface
+ * @bat_algo_ops: routing algorithm used by this mesh interface
+ * @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top
+ * of the mesh interface represented by this object
+ * @softif_vlan_list_lock: lock protecting softif_vlan_list
+ * @bla: bridge loope avoidance data
+ * @debug_log: holding debug logging relevant data
+ * @gw: gateway data
+ * @tt: translation table data
+ * @tvlv: type-version-length-value data
+ * @dat: distributed arp table data
+ * @mcast: multicast data
+ * @network_coding: bool indicating whether network coding is enabled
+ * @batadv_priv_nc: network coding data
+ */
+struct batadv_priv {
+ atomic_t mesh_state;
+ struct net_device *soft_iface;
+ struct net_device_stats stats;
+ uint64_t __percpu *bat_counters; /* Per cpu counters */
+ atomic_t aggregated_ogms;
+ atomic_t bonding;
+ atomic_t fragmentation;
+ atomic_t packet_size_max;
+ atomic_t frag_seqno;
+#ifdef CONFIG_BATMAN_ADV_BLA
+ atomic_t bridge_loop_avoidance;
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
+ atomic_t distributed_arp_table;
+#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ atomic_t multicast_mode;
+#endif
+ atomic_t gw_mode;
+ atomic_t gw_sel_class;
+ atomic_t orig_interval;
+ atomic_t hop_penalty;
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ atomic_t log_level;
+#endif
+ uint32_t isolation_mark;
+ uint32_t isolation_mark_mask;
+ atomic_t bcast_seqno;
+ atomic_t bcast_queue_left;
+ atomic_t batman_queue_left;
+ char num_ifaces;
+ struct kobject *mesh_obj;
+ struct dentry *debug_dir;
+ struct hlist_head forw_bat_list;
+ struct hlist_head forw_bcast_list;
+ struct batadv_hashtable *orig_hash;
+ spinlock_t forw_bat_list_lock; /* protects forw_bat_list */
+ spinlock_t forw_bcast_list_lock; /* protects forw_bcast_list */
+ struct delayed_work orig_work;
+ struct work_struct cleanup_work;
+ struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */
+ struct batadv_algo_ops *bat_algo_ops;
+ struct hlist_head softif_vlan_list;
+ spinlock_t softif_vlan_list_lock; /* protects softif_vlan_list */
+#ifdef CONFIG_BATMAN_ADV_BLA
+ struct batadv_priv_bla bla;
+#endif
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ struct batadv_priv_debug_log *debug_log;
+#endif
+ struct batadv_priv_gw gw;
+ struct batadv_priv_tt tt;
+ struct batadv_priv_tvlv tvlv;
+#ifdef CONFIG_BATMAN_ADV_DAT
+ struct batadv_priv_dat dat;
+#endif
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ struct batadv_priv_mcast mcast;
+#endif
+#ifdef CONFIG_BATMAN_ADV_NC
+ atomic_t network_coding;
+ struct batadv_priv_nc nc;
+#endif /* CONFIG_BATMAN_ADV_NC */
+};
+
+/**
+ * struct batadv_socket_client - layer2 icmp socket client data
+ * @queue_list: packet queue for packets destined for this socket client
+ * @queue_len: number of packets in the packet queue (queue_list)
+ * @index: socket client's index in the batadv_socket_client_hash
+ * @lock: lock protecting queue_list, queue_len & index
+ * @queue_wait: socket client's wait queue
+ * @bat_priv: pointer to soft_iface this client belongs to
+ */
+struct batadv_socket_client {
+ struct list_head queue_list;
+ unsigned int queue_len;
+ unsigned char index;
+ spinlock_t lock; /* protects queue_list, queue_len & index */
+ wait_queue_head_t queue_wait;
+ struct batadv_priv *bat_priv;
+};
+
+/**
+ * struct batadv_socket_packet - layer2 icmp packet for socket client
+ * @list: list node for batadv_socket_client::queue_list
+ * @icmp_len: size of the layer2 icmp packet
+ * @icmp_packet: layer2 icmp packet
+ */
+struct batadv_socket_packet {
+ struct list_head list;
+ size_t icmp_len;
+ uint8_t icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
+};
+
+/**
+ * struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN
+ * @orig: originator address of backbone node (mac address of primary iface)
+ * @vid: vlan id this gateway was detected on
+ * @hash_entry: hlist node for batadv_priv_bla::backbone_hash
+ * @bat_priv: pointer to soft_iface this backbone gateway belongs to
+ * @lasttime: last time we heard of this backbone gw
+ * @wait_periods: grace time for bridge forward delays and bla group forming at
+ * bootup phase - no bcast traffic is formwared until it has elapsed
+ * @request_sent: if this bool is set to true we are out of sync with this
+ * backbone gateway - no bcast traffic is formwared until the situation was
+ * resolved
+ * @crc: crc16 checksum over all claims
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+#ifdef CONFIG_BATMAN_ADV_BLA
+struct batadv_bla_backbone_gw {
+ uint8_t orig[ETH_ALEN];
+ unsigned short vid;
+ struct hlist_node hash_entry;
+ struct batadv_priv *bat_priv;
+ unsigned long lasttime;
+ atomic_t wait_periods;
+ atomic_t request_sent;
+ uint16_t crc;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_bla_claim - claimed non-mesh client structure
+ * @addr: mac address of claimed non-mesh client
+ * @vid: vlan id this client was detected on
+ * @batadv_bla_backbone_gw: pointer to backbone gw claiming this client
+ * @lasttime: last time we heard of claim (locals only)
+ * @hash_entry: hlist node for batadv_priv_bla::claim_hash
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_bla_claim {
+ uint8_t addr[ETH_ALEN];
+ unsigned short vid;
+ struct batadv_bla_backbone_gw *backbone_gw;
+ unsigned long lasttime;
+ struct hlist_node hash_entry;
+ struct rcu_head rcu;
+ atomic_t refcount;
+};
+#endif
+
+/**
+ * struct batadv_tt_common_entry - tt local & tt global common data
+ * @addr: mac address of non-mesh client
+ * @vid: VLAN identifier
+ * @hash_entry: hlist node for batadv_priv_tt::local_hash or for
+ * batadv_priv_tt::global_hash
+ * @flags: various state handling flags (see batadv_tt_client_flags)
+ * @added_at: timestamp used for purging stale tt common entries
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_tt_common_entry {
+ uint8_t addr[ETH_ALEN];
+ unsigned short vid;
+ struct hlist_node hash_entry;
+ uint16_t flags;
+ unsigned long added_at;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_tt_local_entry - translation table local entry data
+ * @common: general translation table data
+ * @last_seen: timestamp used for purging stale tt local entries
+ */
+struct batadv_tt_local_entry {
+ struct batadv_tt_common_entry common;
+ unsigned long last_seen;
+};
+
+/**
+ * struct batadv_tt_global_entry - translation table global entry data
+ * @common: general translation table data
+ * @orig_list: list of orig nodes announcing this non-mesh client
+ * @orig_list_count: number of items in the orig_list
+ * @list_lock: lock protecting orig_list
+ * @roam_at: time at which TT_GLOBAL_ROAM was set
+ */
+struct batadv_tt_global_entry {
+ struct batadv_tt_common_entry common;
+ struct hlist_head orig_list;
+ atomic_t orig_list_count;
+ spinlock_t list_lock; /* protects orig_list */
+ unsigned long roam_at;
+};
+
+/**
+ * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client
+ * @orig_node: pointer to orig node announcing this non-mesh client
+ * @ttvn: translation table version number which added the non-mesh client
+ * @list: list node for batadv_tt_global_entry::orig_list
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_tt_orig_list_entry {
+ struct batadv_orig_node *orig_node;
+ uint8_t ttvn;
+ struct hlist_node list;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_tt_change_node - structure for tt changes occurred
+ * @list: list node for batadv_priv_tt::changes_list
+ * @change: holds the actual translation table diff data
+ */
+struct batadv_tt_change_node {
+ struct list_head list;
+ struct batadv_tvlv_tt_change change;
+};
+
+/**
+ * struct batadv_tt_req_node - data to keep track of the tt requests in flight
+ * @addr: mac address address of the originator this request was sent to
+ * @issued_at: timestamp used for purging stale tt requests
+ * @list: list node for batadv_priv_tt::req_list
+ */
+struct batadv_tt_req_node {
+ uint8_t addr[ETH_ALEN];
+ unsigned long issued_at;
+ struct list_head list;
+};
+
+/**
+ * struct batadv_tt_roam_node - roaming client data
+ * @addr: mac address of the client in the roaming phase
+ * @counter: number of allowed roaming events per client within a single
+ * OGM interval (changes are committed with each OGM)
+ * @first_time: timestamp used for purging stale roaming node entries
+ * @list: list node for batadv_priv_tt::roam_list
+ */
+struct batadv_tt_roam_node {
+ uint8_t addr[ETH_ALEN];
+ atomic_t counter;
+ unsigned long first_time;
+ struct list_head list;
+};
+
+/**
+ * struct batadv_nc_node - network coding node
+ * @list: next and prev pointer for the list handling
+ * @addr: the node's mac address
+ * @refcount: number of contexts the object is used by
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @orig_node: pointer to corresponding orig node struct
+ * @last_seen: timestamp of last ogm received from this node
+ */
+struct batadv_nc_node {
+ struct list_head list;
+ uint8_t addr[ETH_ALEN];
+ atomic_t refcount;
+ struct rcu_head rcu;
+ struct batadv_orig_node *orig_node;
+ unsigned long last_seen;
+};
+
+/**
+ * struct batadv_nc_path - network coding path
+ * @hash_entry: next and prev pointer for the list handling
+ * @rcu: struct used for freeing in an RCU-safe manner
+ * @refcount: number of contexts the object is used by
+ * @packet_list: list of buffered packets for this path
+ * @packet_list_lock: access lock for packet list
+ * @next_hop: next hop (destination) of path
+ * @prev_hop: previous hop (source) of path
+ * @last_valid: timestamp for last validation of path
+ */
+struct batadv_nc_path {
+ struct hlist_node hash_entry;
+ struct rcu_head rcu;
+ atomic_t refcount;
+ struct list_head packet_list;
+ spinlock_t packet_list_lock; /* Protects packet_list */
+ uint8_t next_hop[ETH_ALEN];
+ uint8_t prev_hop[ETH_ALEN];
+ unsigned long last_valid;
+};
+
+/**
+ * struct batadv_nc_packet - network coding packet used when coding and
+ * decoding packets
+ * @list: next and prev pointer for the list handling
+ * @packet_id: crc32 checksum of skb data
+ * @timestamp: field containing the info when the packet was added to path
+ * @neigh_node: pointer to original next hop neighbor of skb
+ * @skb: skb which can be encoded or used for decoding
+ * @nc_path: pointer to path this nc packet is attached to
+ */
+struct batadv_nc_packet {
+ struct list_head list;
+ __be32 packet_id;
+ unsigned long timestamp;
+ struct batadv_neigh_node *neigh_node;
+ struct sk_buff *skb;
+ struct batadv_nc_path *nc_path;
+};
+
+/**
+ * struct batadv_skb_cb - control buffer structure used to store private data
+ * relevant to batman-adv in the skb->cb buffer in skbs.
+ * @decoded: Marks a skb as decoded, which is checked when searching for coding
+ * opportunities in network-coding.c
+ */
+struct batadv_skb_cb {
+ bool decoded;
+};
+
+/**
+ * struct batadv_forw_packet - structure for bcast packets to be sent/forwarded
+ * @list: list node for batadv_socket_client::queue_list
+ * @send_time: execution time for delayed_work (packet sending)
+ * @own: bool for locally generated packets (local OGMs are re-scheduled after
+ * sending)
+ * @skb: bcast packet's skb buffer
+ * @packet_len: size of aggregated OGM packet inside the skb buffer
+ * @direct_link_flags: direct link flags for aggregated OGM packets
+ * @num_packets: counter for bcast packet retransmission
+ * @delayed_work: work queue callback item for packet sending
+ * @if_incoming: pointer to incoming hard-iface or primary iface if
+ * locally generated packet
+ * @if_outgoing: packet where the packet should be sent to, or NULL if
+ * unspecified
+ */
+struct batadv_forw_packet {
+ struct hlist_node list;
+ unsigned long send_time;
+ uint8_t own;
+ struct sk_buff *skb;
+ uint16_t packet_len;
+ uint32_t direct_link_flags;
+ uint8_t num_packets;
+ struct delayed_work delayed_work;
+ struct batadv_hard_iface *if_incoming;
+ struct batadv_hard_iface *if_outgoing;
+};
+
+/**
+ * struct batadv_algo_ops - mesh algorithm callbacks
+ * @list: list node for the batadv_algo_list
+ * @name: name of the algorithm
+ * @bat_iface_enable: init routing info when hard-interface is enabled
+ * @bat_iface_disable: de-init routing info when hard-interface is disabled
+ * @bat_iface_update_mac: (re-)init mac addresses of the protocol information
+ * belonging to this hard-interface
+ * @bat_primary_iface_set: called when primary interface is selected / changed
+ * @bat_ogm_schedule: prepare a new outgoing OGM for the send queue
+ * @bat_ogm_emit: send scheduled OGM
+ * @bat_neigh_cmp: compare the metrics of two neighbors for their respective
+ * outgoing interfaces
+ * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better
+ * than neigh2 for their respective outgoing interface from the metric
+ * prospective
+ * @bat_orig_print: print the originator table (optional)
+ * @bat_orig_free: free the resources allocated by the routing algorithm for an
+ * orig_node object
+ * @bat_orig_add_if: ask the routing algorithm to apply the needed changes to
+ * the orig_node due to a new hard-interface being added into the mesh
+ * @bat_orig_del_if: ask the routing algorithm to apply the needed changes to
+ * the orig_node due to an hard-interface being removed from the mesh
+ */
+struct batadv_algo_ops {
+ struct hlist_node list;
+ char *name;
+ int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface);
+ void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface);
+ void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface);
+ void (*bat_primary_iface_set)(struct batadv_hard_iface *hard_iface);
+ void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface);
+ void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet);
+ int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+ bool (*bat_neigh_is_equiv_or_better)
+ (struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2);
+ /* orig_node handling API */
+ void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq,
+ struct batadv_hard_iface *hard_iface);
+ void (*bat_orig_free)(struct batadv_orig_node *orig_node);
+ int (*bat_orig_add_if)(struct batadv_orig_node *orig_node,
+ int max_if_num);
+ int (*bat_orig_del_if)(struct batadv_orig_node *orig_node,
+ int max_if_num, int del_if_num);
+};
+
+/**
+ * struct batadv_dat_entry - it is a single entry of batman-adv ARP backend. It
+ * is used to stored ARP entries needed for the global DAT cache
+ * @ip: the IPv4 corresponding to this DAT/ARP entry
+ * @mac_addr: the MAC address associated to the stored IPv4
+ * @vid: the vlan ID associated to this entry
+ * @last_update: time in jiffies when this entry was refreshed last time
+ * @hash_entry: hlist node for batadv_priv_dat::hash
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_dat_entry {
+ __be32 ip;
+ uint8_t mac_addr[ETH_ALEN];
+ unsigned short vid;
+ unsigned long last_update;
+ struct hlist_node hash_entry;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct batadv_hw_addr - a list entry for a MAC address
+ * @list: list node for the linking of entries
+ * @addr: the MAC address of this list entry
+ */
+struct batadv_hw_addr {
+ struct hlist_node list;
+ unsigned char addr[ETH_ALEN];
+};
+
+/**
+ * struct batadv_dat_candidate - candidate destination for DAT operations
+ * @type: the type of the selected candidate. It can one of the following:
+ * - BATADV_DAT_CANDIDATE_NOT_FOUND
+ * - BATADV_DAT_CANDIDATE_ORIG
+ * @orig_node: if type is BATADV_DAT_CANDIDATE_ORIG this field points to the
+ * corresponding originator node structure
+ */
+struct batadv_dat_candidate {
+ int type;
+ struct batadv_orig_node *orig_node;
+};
+
+/**
+ * struct batadv_tvlv_container - container for tvlv appended to OGMs
+ * @list: hlist node for batadv_priv_tvlv::container_list
+ * @tvlv_hdr: tvlv header information needed to construct the tvlv
+ * @value_len: length of the buffer following this struct which contains
+ * the actual tvlv payload
+ * @refcount: number of contexts the object is used
+ */
+struct batadv_tvlv_container {
+ struct hlist_node list;
+ struct batadv_tvlv_hdr tvlv_hdr;
+ atomic_t refcount;
+};
+
+/**
+ * struct batadv_tvlv_handler - handler for specific tvlv type and version
+ * @list: hlist node for batadv_priv_tvlv::handler_list
+ * @ogm_handler: handler callback which is given the tvlv payload to process on
+ * incoming OGM packets
+ * @unicast_handler: handler callback which is given the tvlv payload to process
+ * on incoming unicast tvlv packets
+ * @type: tvlv type this handler feels responsible for
+ * @version: tvlv version this handler feels responsible for
+ * @flags: tvlv handler flags
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in an RCU-safe manner
+ */
+struct batadv_tvlv_handler {
+ struct hlist_node list;
+ void (*ogm_handler)(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ uint8_t flags,
+ void *tvlv_value, uint16_t tvlv_value_len);
+ int (*unicast_handler)(struct batadv_priv *bat_priv,
+ uint8_t *src, uint8_t *dst,
+ void *tvlv_value, uint16_t tvlv_value_len);
+ uint8_t type;
+ uint8_t version;
+ uint8_t flags;
+ atomic_t refcount;
+ struct rcu_head rcu;
+};
+
+/**
+ * enum batadv_tvlv_handler_flags - tvlv handler flags definitions
+ * @BATADV_TVLV_HANDLER_OGM_CIFNOTFND: tvlv ogm processing function will call
+ * this handler even if its type was not found (with no data)
+ * @BATADV_TVLV_HANDLER_OGM_CALLED: interval tvlv handling flag - the API marks
+ * a handler as being called, so it won't be called if the
+ * BATADV_TVLV_HANDLER_OGM_CIFNOTFND flag was set
+ */
+enum batadv_tvlv_handler_flags {
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND = BIT(1),
+ BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
+};
+
+#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
new file mode 100644
index 000000000..1742b849f
--- /dev/null
+++ b/net/bluetooth/6lowpan.c
@@ -0,0 +1,1469 @@
+/*
+ Copyright (c) 2013-2014 Intel Corp.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 and
+ only version 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*/
+
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#include <net/af_ieee802154.h> /* to get the address type */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include <net/6lowpan.h> /* for the compression support */
+
+#define VERSION "0.1"
+
+static struct dentry *lowpan_enable_debugfs;
+static struct dentry *lowpan_control_debugfs;
+
+#define IFACE_NAME_TEMPLATE "bt%d"
+#define EUI64_ADDR_LEN 8
+
+struct skb_cb {
+ struct in6_addr addr;
+ struct in6_addr gw;
+ struct l2cap_chan *chan;
+ int status;
+};
+#define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb))
+
+/* The devices list contains those devices that we are acting
+ * as a proxy. The BT 6LoWPAN device is a virtual device that
+ * connects to the Bluetooth LE device. The real connection to
+ * BT device is done via l2cap layer. There exists one
+ * virtual device / one BT 6LoWPAN network (=hciX device).
+ * The list contains struct lowpan_dev elements.
+ */
+static LIST_HEAD(bt_6lowpan_devices);
+static DEFINE_SPINLOCK(devices_lock);
+
+static bool enable_6lowpan;
+
+/* We are listening incoming connections via this channel
+ */
+static struct l2cap_chan *listen_chan;
+
+struct lowpan_peer {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct l2cap_chan *chan;
+
+ /* peer addresses in various formats */
+ unsigned char eui64_addr[EUI64_ADDR_LEN];
+ struct in6_addr peer_addr;
+};
+
+struct lowpan_dev {
+ struct list_head list;
+
+ struct hci_dev *hdev;
+ struct net_device *netdev;
+ struct list_head peers;
+ atomic_t peer_count; /* number of items in peers list */
+
+ struct work_struct delete_netdev;
+ struct delayed_work notify_peers;
+};
+
+static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev)
+{
+ return netdev_priv(netdev);
+}
+
+static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer)
+{
+ list_add_rcu(&peer->list, &dev->peers);
+ atomic_inc(&dev->peer_count);
+}
+
+static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer)
+{
+ list_del_rcu(&peer->list);
+ kfree_rcu(peer, rcu);
+
+ module_put(THIS_MODULE);
+
+ if (atomic_dec_and_test(&dev->peer_count)) {
+ BT_DBG("last peer");
+ return true;
+ }
+
+ return false;
+}
+
+static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev,
+ bdaddr_t *ba, __u8 type)
+{
+ struct lowpan_peer *peer;
+
+ BT_DBG("peers %d addr %pMR type %d", atomic_read(&dev->peer_count),
+ ba, type);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ BT_DBG("dst addr %pMR dst type %d",
+ &peer->chan->dst, peer->chan->dst_type);
+
+ if (bacmp(&peer->chan->dst, ba))
+ continue;
+
+ if (type == peer->chan->dst_type) {
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev,
+ struct l2cap_chan *chan)
+{
+ struct lowpan_peer *peer;
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (peer->chan == chan)
+ return peer;
+ }
+
+ return NULL;
+}
+
+static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev,
+ struct l2cap_conn *conn)
+{
+ struct lowpan_peer *peer;
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (peer->chan->conn == conn)
+ return peer;
+ }
+
+ return NULL;
+}
+
+static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev,
+ struct in6_addr *daddr,
+ struct sk_buff *skb)
+{
+ struct lowpan_peer *peer;
+ struct in6_addr *nexthop;
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ int count = atomic_read(&dev->peer_count);
+
+ BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);
+
+ /* If we have multiple 6lowpan peers, then check where we should
+ * send the packet. If only one peer exists, then we can send the
+ * packet right away.
+ */
+ if (count == 1) {
+ rcu_read_lock();
+ peer = list_first_or_null_rcu(&dev->peers, struct lowpan_peer,
+ list);
+ rcu_read_unlock();
+ return peer;
+ }
+
+ if (!rt) {
+ nexthop = &lowpan_cb(skb)->gw;
+
+ if (ipv6_addr_any(nexthop))
+ return NULL;
+ } else {
+ nexthop = rt6_nexthop(rt);
+
+ /* We need to remember the address because it is needed
+ * by bt_xmit() when sending the packet. In bt_xmit(), the
+ * destination routing info is not set.
+ */
+ memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr));
+ }
+
+ BT_DBG("gw %pI6c", nexthop);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ BT_DBG("dst addr %pMR dst type %d ip %pI6c",
+ &peer->chan->dst, peer->chan->dst_type,
+ &peer->peer_addr);
+
+ if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) {
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn)
+{
+ struct lowpan_dev *entry;
+ struct lowpan_peer *peer = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ peer = __peer_lookup_conn(entry, conn);
+ if (peer)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return peer;
+}
+
+static struct lowpan_dev *lookup_dev(struct l2cap_conn *conn)
+{
+ struct lowpan_dev *entry;
+ struct lowpan_dev *dev = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ if (conn->hcon->hdev == entry->hdev) {
+ dev = entry;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return dev;
+}
+
+static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sk_buff *skb_cp;
+
+ skb_cp = skb_copy(skb, GFP_ATOMIC);
+ if (!skb_cp)
+ return NET_RX_DROP;
+
+ return netif_rx(skb_cp);
+}
+
+static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
+ struct l2cap_chan *chan)
+{
+ const u8 *saddr, *daddr;
+ u8 iphc0, iphc1;
+ struct lowpan_dev *dev;
+ struct lowpan_peer *peer;
+
+ dev = lowpan_dev(netdev);
+
+ rcu_read_lock();
+ peer = __peer_lookup_chan(dev, chan);
+ rcu_read_unlock();
+ if (!peer)
+ return -EINVAL;
+
+ saddr = peer->eui64_addr;
+ daddr = dev->netdev->dev_addr;
+
+ /* at least two bytes will be used for the encoding */
+ if (skb->len < 2)
+ return -EINVAL;
+
+ if (lowpan_fetch_skb_u8(skb, &iphc0))
+ return -EINVAL;
+
+ if (lowpan_fetch_skb_u8(skb, &iphc1))
+ return -EINVAL;
+
+ return lowpan_header_decompress(skb, netdev,
+ saddr, IEEE802154_ADDR_LONG,
+ EUI64_ADDR_LEN, daddr,
+ IEEE802154_ADDR_LONG, EUI64_ADDR_LEN,
+ iphc0, iphc1);
+
+}
+
+static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
+ struct l2cap_chan *chan)
+{
+ struct sk_buff *local_skb;
+ int ret;
+
+ if (!netif_running(dev))
+ goto drop;
+
+ if (dev->type != ARPHRD_6LOWPAN)
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ /* check that it's our buffer */
+ if (skb->data[0] == LOWPAN_DISPATCH_IPV6) {
+ /* Copy the packet so that the IPv6 header is
+ * properly aligned.
+ */
+ local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1,
+ skb_tailroom(skb), GFP_ATOMIC);
+ if (!local_skb)
+ goto drop;
+
+ local_skb->protocol = htons(ETH_P_IPV6);
+ local_skb->pkt_type = PACKET_HOST;
+
+ skb_reset_network_header(local_skb);
+ skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
+
+ if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ consume_skb(local_skb);
+ consume_skb(skb);
+ } else {
+ switch (skb->data[0] & 0xe0) {
+ case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */
+ local_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!local_skb)
+ goto drop;
+
+ ret = iphc_decompress(local_skb, dev, chan);
+ if (ret < 0) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ local_skb->protocol = htons(ETH_P_IPV6);
+ local_skb->pkt_type = PACKET_HOST;
+ local_skb->dev = dev;
+
+ if (give_skb_to_upper(local_skb, dev)
+ != NET_RX_SUCCESS) {
+ kfree_skb(local_skb);
+ goto drop;
+ }
+
+ dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+
+ consume_skb(local_skb);
+ consume_skb(skb);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return NET_RX_SUCCESS;
+
+drop:
+ dev->stats.rx_dropped++;
+ return NET_RX_DROP;
+}
+
+/* Packet from BT LE device */
+static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct lowpan_dev *dev;
+ struct lowpan_peer *peer;
+ int err;
+
+ peer = lookup_peer(chan->conn);
+ if (!peer)
+ return -ENOENT;
+
+ dev = lookup_dev(chan->conn);
+ if (!dev || !dev->netdev)
+ return -ENOENT;
+
+ err = recv_pkt(skb, dev->netdev, chan);
+ if (err) {
+ BT_DBG("recv pkt %d", err);
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+static u8 get_addr_type_from_eui64(u8 byte)
+{
+ /* Is universal(0) or local(1) bit */
+ return ((byte & 0x02) ? BDADDR_LE_RANDOM : BDADDR_LE_PUBLIC);
+}
+
+static void copy_to_bdaddr(struct in6_addr *ip6_daddr, bdaddr_t *addr)
+{
+ u8 *eui64 = ip6_daddr->s6_addr + 8;
+
+ addr->b[0] = eui64[7];
+ addr->b[1] = eui64[6];
+ addr->b[2] = eui64[5];
+ addr->b[3] = eui64[2];
+ addr->b[4] = eui64[1];
+ addr->b[5] = eui64[0];
+}
+
+static void convert_dest_bdaddr(struct in6_addr *ip6_daddr,
+ bdaddr_t *addr, u8 *addr_type)
+{
+ copy_to_bdaddr(ip6_daddr, addr);
+
+ /* We need to toggle the U/L bit that we got from IPv6 address
+ * so that we get the proper address and type of the BD address.
+ */
+ addr->b[5] ^= 0x02;
+
+ *addr_type = get_addr_type_from_eui64(addr->b[5]);
+}
+
+static int setup_header(struct sk_buff *skb, struct net_device *netdev,
+ bdaddr_t *peer_addr, u8 *peer_addr_type)
+{
+ struct in6_addr ipv6_daddr;
+ struct lowpan_dev *dev;
+ struct lowpan_peer *peer;
+ bdaddr_t addr, *any = BDADDR_ANY;
+ u8 *daddr = any->b;
+ int err, status = 0;
+
+ dev = lowpan_dev(netdev);
+
+ memcpy(&ipv6_daddr, &lowpan_cb(skb)->addr, sizeof(ipv6_daddr));
+
+ if (ipv6_addr_is_multicast(&ipv6_daddr)) {
+ lowpan_cb(skb)->chan = NULL;
+ } else {
+ u8 addr_type;
+
+ /* Get destination BT device from skb.
+ * If there is no such peer then discard the packet.
+ */
+ convert_dest_bdaddr(&ipv6_daddr, &addr, &addr_type);
+
+ BT_DBG("dest addr %pMR type %d IP %pI6c", &addr,
+ addr_type, &ipv6_daddr);
+
+ peer = peer_lookup_ba(dev, &addr, addr_type);
+ if (!peer) {
+ /* The packet might be sent to 6lowpan interface
+ * because of routing (either via default route
+ * or user set route) so get peer according to
+ * the destination address.
+ */
+ peer = peer_lookup_dst(dev, &ipv6_daddr, skb);
+ if (!peer) {
+ BT_DBG("no such peer %pMR found", &addr);
+ return -ENOENT;
+ }
+ }
+
+ daddr = peer->eui64_addr;
+ *peer_addr = addr;
+ *peer_addr_type = addr_type;
+ lowpan_cb(skb)->chan = peer->chan;
+
+ status = 1;
+ }
+
+ lowpan_header_compress(skb, netdev, ETH_P_IPV6, daddr,
+ dev->netdev->dev_addr, skb->len);
+
+ err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0);
+ if (err < 0)
+ return err;
+
+ return status;
+}
+
+static int header_create(struct sk_buff *skb, struct net_device *netdev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len)
+{
+ struct ipv6hdr *hdr;
+
+ if (type != ETH_P_IPV6)
+ return -EINVAL;
+
+ hdr = ipv6_hdr(skb);
+
+ memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, sizeof(struct in6_addr));
+
+ return 0;
+}
+
+/* Packet to BT LE device */
+static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct msghdr msg;
+ struct kvec iv;
+ int err;
+
+ /* Remember the skb so that we can send EAGAIN to the caller if
+ * we run out of credits.
+ */
+ chan->data = skb;
+
+ iv.iov_base = skb->data;
+ iv.iov_len = skb->len;
+
+ memset(&msg, 0, sizeof(msg));
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len);
+
+ err = l2cap_chan_send(chan, &msg, skb->len);
+ if (err > 0) {
+ netdev->stats.tx_bytes += err;
+ netdev->stats.tx_packets++;
+ return 0;
+ }
+
+ if (!err)
+ err = lowpan_cb(skb)->status;
+
+ if (err < 0) {
+ if (err == -EAGAIN)
+ netdev->stats.tx_dropped++;
+ else
+ netdev->stats.tx_errors++;
+ }
+
+ return err;
+}
+
+static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct sk_buff *local_skb;
+ struct lowpan_dev *entry;
+ int err = 0;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ struct lowpan_peer *pentry;
+ struct lowpan_dev *dev;
+
+ if (entry->netdev != netdev)
+ continue;
+
+ dev = lowpan_dev(entry->netdev);
+
+ list_for_each_entry_rcu(pentry, &dev->peers, list) {
+ int ret;
+
+ local_skb = skb_clone(skb, GFP_ATOMIC);
+
+ BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p",
+ netdev->name,
+ &pentry->chan->dst, pentry->chan->dst_type,
+ &pentry->peer_addr, pentry->chan);
+ ret = send_pkt(pentry->chan, local_skb, netdev);
+ if (ret < 0)
+ err = ret;
+
+ kfree_skb(local_skb);
+ }
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ int err = 0;
+ bdaddr_t addr;
+ u8 addr_type;
+
+ /* We must take a copy of the skb before we modify/replace the ipv6
+ * header as the header could be used elsewhere
+ */
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_XMIT_DROP;
+
+ /* Return values from setup_header()
+ * <0 - error, packet is dropped
+ * 0 - this is a multicast packet
+ * 1 - this is unicast packet
+ */
+ err = setup_header(skb, netdev, &addr, &addr_type);
+ if (err < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ if (err) {
+ if (lowpan_cb(skb)->chan) {
+ BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p",
+ netdev->name, &addr, addr_type,
+ &lowpan_cb(skb)->addr, lowpan_cb(skb)->chan);
+ err = send_pkt(lowpan_cb(skb)->chan, skb, netdev);
+ } else {
+ err = -ENOENT;
+ }
+ } else {
+ /* We need to send the packet to every device behind this
+ * interface.
+ */
+ err = send_mcast_pkt(skb, netdev);
+ }
+
+ dev_kfree_skb(skb);
+
+ if (err)
+ BT_DBG("ERROR: xmit failed (%d)", err);
+
+ return err < 0 ? NET_XMIT_DROP : err;
+}
+
+static struct lock_class_key bt_tx_busylock;
+static struct lock_class_key bt_netdev_xmit_lock_key;
+
+static void bt_set_lockdep_class_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock, &bt_netdev_xmit_lock_key);
+}
+
+static int bt_dev_init(struct net_device *dev)
+{
+ netdev_for_each_tx_queue(dev, bt_set_lockdep_class_one, NULL);
+ dev->qdisc_tx_busylock = &bt_tx_busylock;
+
+ return 0;
+}
+
+static const struct net_device_ops netdev_ops = {
+ .ndo_init = bt_dev_init,
+ .ndo_start_xmit = bt_xmit,
+};
+
+static struct header_ops header_ops = {
+ .create = header_create,
+};
+
+static void netdev_setup(struct net_device *dev)
+{
+ dev->addr_len = EUI64_ADDR_LEN;
+ dev->type = ARPHRD_6LOWPAN;
+
+ dev->hard_header_len = 0;
+ dev->needed_tailroom = 0;
+ dev->mtu = IPV6_MIN_MTU;
+ dev->tx_queue_len = 0;
+ dev->flags = IFF_RUNNING | IFF_POINTOPOINT |
+ IFF_MULTICAST;
+ dev->watchdog_timeo = 0;
+
+ dev->netdev_ops = &netdev_ops;
+ dev->header_ops = &header_ops;
+ dev->destructor = free_netdev;
+}
+
+static struct device_type bt_type = {
+ .name = "bluetooth",
+};
+
+static void set_addr(u8 *eui, u8 *addr, u8 addr_type)
+{
+ /* addr is the BT address in little-endian format */
+ eui[0] = addr[5];
+ eui[1] = addr[4];
+ eui[2] = addr[3];
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[5] = addr[2];
+ eui[6] = addr[1];
+ eui[7] = addr[0];
+
+ /* Universal/local bit set, BT 6lowpan draft ch. 3.2.1 */
+ if (addr_type == BDADDR_LE_PUBLIC)
+ eui[0] &= ~0x02;
+ else
+ eui[0] |= 0x02;
+
+ BT_DBG("type %d addr %*phC", addr_type, 8, eui);
+}
+
+static void set_dev_addr(struct net_device *netdev, bdaddr_t *addr,
+ u8 addr_type)
+{
+ netdev->addr_assign_type = NET_ADDR_PERM;
+ set_addr(netdev->dev_addr, addr->b, addr_type);
+}
+
+static void ifup(struct net_device *netdev)
+{
+ int err;
+
+ rtnl_lock();
+ err = dev_open(netdev);
+ if (err < 0)
+ BT_INFO("iface %s cannot be opened (%d)", netdev->name, err);
+ rtnl_unlock();
+}
+
+static void ifdown(struct net_device *netdev)
+{
+ int err;
+
+ rtnl_lock();
+ err = dev_close(netdev);
+ if (err < 0)
+ BT_INFO("iface %s cannot be closed (%d)", netdev->name, err);
+ rtnl_unlock();
+}
+
+static void do_notify_peers(struct work_struct *work)
+{
+ struct lowpan_dev *dev = container_of(work, struct lowpan_dev,
+ notify_peers.work);
+
+ netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */
+}
+
+static bool is_bt_6lowpan(struct hci_conn *hcon)
+{
+ if (hcon->type != LE_LINK)
+ return false;
+
+ if (!enable_6lowpan)
+ return false;
+
+ return true;
+}
+
+static struct l2cap_chan *chan_create(void)
+{
+ struct l2cap_chan *chan;
+
+ chan = l2cap_chan_create();
+ if (!chan)
+ return NULL;
+
+ l2cap_chan_set_defaults(chan);
+
+ chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+ chan->omtu = 65535;
+ chan->imtu = chan->omtu;
+
+ return chan;
+}
+
+static struct l2cap_chan *chan_open(struct l2cap_chan *pchan)
+{
+ struct l2cap_chan *chan;
+
+ chan = chan_create();
+ if (!chan)
+ return NULL;
+
+ chan->remote_mps = chan->omtu;
+ chan->mps = chan->omtu;
+
+ chan->state = BT_CONNECTED;
+
+ return chan;
+}
+
+static void set_ip_addr_bits(u8 addr_type, u8 *addr)
+{
+ if (addr_type == BDADDR_LE_PUBLIC)
+ *addr |= 0x02;
+ else
+ *addr &= ~0x02;
+}
+
+static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
+ struct lowpan_dev *dev)
+{
+ struct lowpan_peer *peer;
+
+ peer = kzalloc(sizeof(*peer), GFP_ATOMIC);
+ if (!peer)
+ return NULL;
+
+ peer->chan = chan;
+ memset(&peer->peer_addr, 0, sizeof(struct in6_addr));
+
+ /* RFC 2464 ch. 5 */
+ peer->peer_addr.s6_addr[0] = 0xFE;
+ peer->peer_addr.s6_addr[1] = 0x80;
+ set_addr((u8 *)&peer->peer_addr.s6_addr + 8, chan->dst.b,
+ chan->dst_type);
+
+ memcpy(&peer->eui64_addr, (u8 *)&peer->peer_addr.s6_addr + 8,
+ EUI64_ADDR_LEN);
+
+ /* IPv6 address needs to have the U/L bit set properly so toggle
+ * it back here.
+ */
+ set_ip_addr_bits(chan->dst_type, (u8 *)&peer->peer_addr.s6_addr + 8);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&peer->list);
+ peer_add(dev, peer);
+ spin_unlock(&devices_lock);
+
+ /* Notifying peers about us needs to be done without locks held */
+ INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers);
+ schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100));
+
+ return peer->chan;
+}
+
+static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
+{
+ struct net_device *netdev;
+ int err = 0;
+
+ netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE,
+ NET_NAME_UNKNOWN, netdev_setup);
+ if (!netdev)
+ return -ENOMEM;
+
+ set_dev_addr(netdev, &chan->src, chan->src_type);
+
+ netdev->netdev_ops = &netdev_ops;
+ SET_NETDEV_DEV(netdev, &chan->conn->hcon->dev);
+ SET_NETDEV_DEVTYPE(netdev, &bt_type);
+
+ err = register_netdev(netdev);
+ if (err < 0) {
+ BT_INFO("register_netdev failed %d", err);
+ free_netdev(netdev);
+ goto out;
+ }
+
+ BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d",
+ netdev->ifindex, &chan->dst, chan->dst_type,
+ &chan->src, chan->src_type);
+ set_bit(__LINK_STATE_PRESENT, &netdev->state);
+
+ *dev = netdev_priv(netdev);
+ (*dev)->netdev = netdev;
+ (*dev)->hdev = chan->conn->hcon->hdev;
+ INIT_LIST_HEAD(&(*dev)->peers);
+
+ spin_lock(&devices_lock);
+ INIT_LIST_HEAD(&(*dev)->list);
+ list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
+ spin_unlock(&devices_lock);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static inline void chan_ready_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_dev *dev;
+
+ dev = lookup_dev(chan->conn);
+
+ BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev);
+
+ if (!dev) {
+ if (setup_netdev(chan, &dev) < 0) {
+ l2cap_chan_del(chan, -ENOENT);
+ return;
+ }
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return;
+
+ add_peer_chan(chan, dev);
+ ifup(dev->netdev);
+}
+
+static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan)
+{
+ struct l2cap_chan *chan;
+
+ chan = chan_open(pchan);
+ chan->ops = pchan->ops;
+
+ BT_DBG("chan %p pchan %p", chan, pchan);
+
+ return chan;
+}
+
+static void delete_netdev(struct work_struct *work)
+{
+ struct lowpan_dev *entry = container_of(work, struct lowpan_dev,
+ delete_netdev);
+
+ unregister_netdev(entry->netdev);
+
+ /* The entry pointer is deleted in device_event() */
+}
+
+static void chan_close_cb(struct l2cap_chan *chan)
+{
+ struct lowpan_dev *entry;
+ struct lowpan_dev *dev = NULL;
+ struct lowpan_peer *peer;
+ int err = -ENOENT;
+ bool last = false, removed = true;
+
+ BT_DBG("chan %p conn %p", chan, chan->conn);
+
+ if (chan->conn && chan->conn->hcon) {
+ if (!is_bt_6lowpan(chan->conn->hcon))
+ return;
+
+ /* If conn is set, then the netdev is also there and we should
+ * not remove it.
+ */
+ removed = false;
+ }
+
+ spin_lock(&devices_lock);
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ dev = lowpan_dev(entry->netdev);
+ peer = __peer_lookup_chan(dev, chan);
+ if (peer) {
+ last = peer_del(dev, peer);
+ err = 0;
+
+ BT_DBG("dev %p removing %speer %p", dev,
+ last ? "last " : "1 ", peer);
+ BT_DBG("chan %p orig refcnt %d", chan,
+ atomic_read(&chan->kref.refcount));
+
+ l2cap_chan_put(chan);
+ break;
+ }
+ }
+
+ if (!err && last && dev && !atomic_read(&dev->peer_count)) {
+ spin_unlock(&devices_lock);
+
+ cancel_delayed_work_sync(&dev->notify_peers);
+
+ ifdown(dev->netdev);
+
+ if (!removed) {
+ INIT_WORK(&entry->delete_netdev, delete_netdev);
+ schedule_work(&entry->delete_netdev);
+ }
+ } else {
+ spin_unlock(&devices_lock);
+ }
+
+ return;
+}
+
+static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err)
+{
+ BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn,
+ state_to_string(state), err);
+}
+
+static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ /* Note that we must allocate using GFP_ATOMIC here as
+ * this function is called originally from netdev hard xmit
+ * function in atomic context.
+ */
+ return bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+}
+
+static void chan_suspend_cb(struct l2cap_chan *chan)
+{
+ struct sk_buff *skb = chan->data;
+
+ BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb);
+
+ if (!skb)
+ return;
+
+ lowpan_cb(skb)->status = -EAGAIN;
+}
+
+static void chan_resume_cb(struct l2cap_chan *chan)
+{
+ struct sk_buff *skb = chan->data;
+
+ BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb);
+
+ if (!skb)
+ return;
+
+ lowpan_cb(skb)->status = 0;
+}
+
+static long chan_get_sndtimeo_cb(struct l2cap_chan *chan)
+{
+ return L2CAP_CONN_TIMEOUT;
+}
+
+static const struct l2cap_ops bt_6lowpan_chan_ops = {
+ .name = "L2CAP 6LoWPAN channel",
+ .new_connection = chan_new_conn_cb,
+ .recv = chan_recv_cb,
+ .close = chan_close_cb,
+ .state_change = chan_state_change_cb,
+ .ready = chan_ready_cb,
+ .resume = chan_resume_cb,
+ .suspend = chan_suspend_cb,
+ .get_sndtimeo = chan_get_sndtimeo_cb,
+ .alloc_skb = chan_alloc_skb_cb,
+
+ .teardown = l2cap_chan_no_teardown,
+ .defer = l2cap_chan_no_defer,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+};
+
+static inline __u8 bdaddr_type(__u8 type)
+{
+ if (type == ADDR_LE_DEV_PUBLIC)
+ return BDADDR_LE_PUBLIC;
+ else
+ return BDADDR_LE_RANDOM;
+}
+
+static struct l2cap_chan *chan_get(void)
+{
+ struct l2cap_chan *pchan;
+
+ pchan = chan_create();
+ if (!pchan)
+ return NULL;
+
+ pchan->ops = &bt_6lowpan_chan_ops;
+
+ return pchan;
+}
+
+static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
+{
+ struct l2cap_chan *pchan;
+ int err;
+
+ pchan = chan_get();
+ if (!pchan)
+ return -EINVAL;
+
+ err = l2cap_chan_connect(pchan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
+ addr, dst_type);
+
+ BT_DBG("chan %p err %d", pchan, err);
+ if (err < 0)
+ l2cap_chan_put(pchan);
+
+ return err;
+}
+
+static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
+{
+ struct lowpan_peer *peer;
+
+ BT_DBG("conn %p dst type %d", conn, dst_type);
+
+ peer = lookup_peer(conn);
+ if (!peer)
+ return -ENOENT;
+
+ BT_DBG("peer %p chan %p", peer, peer->chan);
+
+ l2cap_chan_close(peer->chan, ENOENT);
+
+ return 0;
+}
+
+static struct l2cap_chan *bt_6lowpan_listen(void)
+{
+ bdaddr_t *addr = BDADDR_ANY;
+ struct l2cap_chan *pchan;
+ int err;
+
+ if (!enable_6lowpan)
+ return NULL;
+
+ pchan = chan_get();
+ if (!pchan)
+ return NULL;
+
+ pchan->state = BT_LISTEN;
+ pchan->src_type = BDADDR_LE_PUBLIC;
+
+ atomic_set(&pchan->nesting, L2CAP_NESTING_PARENT);
+
+ BT_DBG("chan %p src type %d", pchan, pchan->src_type);
+
+ err = l2cap_add_psm(pchan, addr, cpu_to_le16(L2CAP_PSM_IPSP));
+ if (err) {
+ l2cap_chan_put(pchan);
+ BT_ERR("psm cannot be added err %d", err);
+ return NULL;
+ }
+
+ return pchan;
+}
+
+static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
+ struct l2cap_conn **conn)
+{
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ bdaddr_t *src = BDADDR_ANY;
+ int n;
+
+ n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
+ &addr->b[5], &addr->b[4], &addr->b[3],
+ &addr->b[2], &addr->b[1], &addr->b[0],
+ addr_type);
+
+ if (n < 7)
+ return -EINVAL;
+
+ hdev = hci_get_route(addr, src);
+ if (!hdev)
+ return -ENOENT;
+
+ hci_dev_lock(hdev);
+ hcon = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
+ hci_dev_unlock(hdev);
+
+ if (!hcon)
+ return -ENOENT;
+
+ *conn = (struct l2cap_conn *)hcon->l2cap_data;
+
+ BT_DBG("conn %p dst %pMR type %d", *conn, &hcon->dst, hcon->dst_type);
+
+ return 0;
+}
+
+static void disconnect_all_peers(void)
+{
+ struct lowpan_dev *entry;
+ struct lowpan_peer *peer, *tmp_peer, *new_peer;
+ struct list_head peers;
+
+ INIT_LIST_HEAD(&peers);
+
+ /* We make a separate list of peers as the close_cb() will
+ * modify the device peers list so it is better not to mess
+ * with the same list at the same time.
+ */
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry_rcu(peer, &entry->peers, list) {
+ new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC);
+ if (!new_peer)
+ break;
+
+ new_peer->chan = peer->chan;
+ INIT_LIST_HEAD(&new_peer->list);
+
+ list_add(&new_peer->list, &peers);
+ }
+ }
+
+ rcu_read_unlock();
+
+ spin_lock(&devices_lock);
+ list_for_each_entry_safe(peer, tmp_peer, &peers, list) {
+ l2cap_chan_close(peer->chan, ENOENT);
+
+ list_del_rcu(&peer->list);
+ kfree_rcu(peer, rcu);
+
+ module_put(THIS_MODULE);
+ }
+ spin_unlock(&devices_lock);
+}
+
+struct set_enable {
+ struct work_struct work;
+ bool flag;
+};
+
+static void do_enable_set(struct work_struct *work)
+{
+ struct set_enable *set_enable = container_of(work,
+ struct set_enable, work);
+
+ if (!set_enable->flag || enable_6lowpan != set_enable->flag)
+ /* Disconnect existing connections if 6lowpan is
+ * disabled
+ */
+ disconnect_all_peers();
+
+ enable_6lowpan = set_enable->flag;
+
+ if (listen_chan) {
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_put(listen_chan);
+ }
+
+ listen_chan = bt_6lowpan_listen();
+
+ kfree(set_enable);
+}
+
+static int lowpan_enable_set(void *data, u64 val)
+{
+ struct set_enable *set_enable;
+
+ set_enable = kzalloc(sizeof(*set_enable), GFP_KERNEL);
+ if (!set_enable)
+ return -ENOMEM;
+
+ set_enable->flag = !!val;
+ INIT_WORK(&set_enable->work, do_enable_set);
+
+ schedule_work(&set_enable->work);
+
+ return 0;
+}
+
+static int lowpan_enable_get(void *data, u64 *val)
+{
+ *val = enable_6lowpan;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get,
+ lowpan_enable_set, "%llu\n");
+
+static ssize_t lowpan_control_write(struct file *fp,
+ const char __user *user_buffer,
+ size_t count,
+ loff_t *position)
+{
+ char buf[32];
+ size_t buf_size = min(count, sizeof(buf) - 1);
+ int ret;
+ bdaddr_t addr;
+ u8 addr_type;
+ struct l2cap_conn *conn = NULL;
+
+ if (copy_from_user(buf, user_buffer, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+
+ if (memcmp(buf, "connect ", 8) == 0) {
+ ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn);
+ if (ret == -EINVAL)
+ return ret;
+
+ if (listen_chan) {
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_put(listen_chan);
+ listen_chan = NULL;
+ }
+
+ if (conn) {
+ struct lowpan_peer *peer;
+
+ if (!is_bt_6lowpan(conn->hcon))
+ return -EINVAL;
+
+ peer = lookup_peer(conn);
+ if (peer) {
+ BT_DBG("6LoWPAN connection already exists");
+ return -EALREADY;
+ }
+
+ BT_DBG("conn %p dst %pMR type %d user %d", conn,
+ &conn->hcon->dst, conn->hcon->dst_type,
+ addr_type);
+ }
+
+ ret = bt_6lowpan_connect(&addr, addr_type);
+ if (ret < 0)
+ return ret;
+
+ return count;
+ }
+
+ if (memcmp(buf, "disconnect ", 11) == 0) {
+ ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn);
+ if (ret < 0)
+ return ret;
+
+ ret = bt_6lowpan_disconnect(conn, addr_type);
+ if (ret < 0)
+ return ret;
+
+ return count;
+ }
+
+ return count;
+}
+
+static int lowpan_control_show(struct seq_file *f, void *ptr)
+{
+ struct lowpan_dev *entry;
+ struct lowpan_peer *peer;
+
+ spin_lock(&devices_lock);
+
+ list_for_each_entry(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry(peer, &entry->peers, list)
+ seq_printf(f, "%pMR (type %u)\n",
+ &peer->chan->dst, peer->chan->dst_type);
+ }
+
+ spin_unlock(&devices_lock);
+
+ return 0;
+}
+
+static int lowpan_control_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lowpan_control_show, inode->i_private);
+}
+
+static const struct file_operations lowpan_control_fops = {
+ .open = lowpan_control_open,
+ .read = seq_read,
+ .write = lowpan_control_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void disconnect_devices(void)
+{
+ struct lowpan_dev *entry, *tmp, *new_dev;
+ struct list_head devices;
+
+ INIT_LIST_HEAD(&devices);
+
+ /* We make a separate list of devices because the unregister_netdev()
+ * will call device_event() which will also want to modify the same
+ * devices list.
+ */
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC);
+ if (!new_dev)
+ break;
+
+ new_dev->netdev = entry->netdev;
+ INIT_LIST_HEAD(&new_dev->list);
+
+ list_add_rcu(&new_dev->list, &devices);
+ }
+
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(entry, tmp, &devices, list) {
+ ifdown(entry->netdev);
+ BT_DBG("Unregistering netdev %s %p",
+ entry->netdev->name, entry->netdev);
+ unregister_netdev(entry->netdev);
+ kfree(entry);
+ }
+}
+
+static int device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct lowpan_dev *entry;
+
+ if (netdev->type != ARPHRD_6LOWPAN)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ spin_lock(&devices_lock);
+ list_for_each_entry(entry, &bt_6lowpan_devices, list) {
+ if (entry->netdev == netdev) {
+ BT_DBG("Unregistered netdev %s %p",
+ netdev->name, netdev);
+ list_del(&entry->list);
+ kfree(entry);
+ break;
+ }
+ }
+ spin_unlock(&devices_lock);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block bt_6lowpan_dev_notifier = {
+ .notifier_call = device_event,
+};
+
+static int __init bt_6lowpan_init(void)
+{
+ lowpan_enable_debugfs = debugfs_create_file("6lowpan_enable", 0644,
+ bt_debugfs, NULL,
+ &lowpan_enable_fops);
+ lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644,
+ bt_debugfs, NULL,
+ &lowpan_control_fops);
+
+ return register_netdevice_notifier(&bt_6lowpan_dev_notifier);
+}
+
+static void __exit bt_6lowpan_exit(void)
+{
+ debugfs_remove(lowpan_enable_debugfs);
+ debugfs_remove(lowpan_control_debugfs);
+
+ if (listen_chan) {
+ l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_put(listen_chan);
+ }
+
+ disconnect_devices();
+
+ unregister_netdevice_notifier(&bt_6lowpan_dev_notifier);
+}
+
+module_init(bt_6lowpan_init);
+module_exit(bt_6lowpan_exit);
+
+MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>");
+MODULE_DESCRIPTION("Bluetooth 6LoWPAN");
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
new file mode 100644
index 000000000..b8c794b87
--- /dev/null
+++ b/net/bluetooth/Kconfig
@@ -0,0 +1,102 @@
+#
+# Bluetooth subsystem configuration
+#
+
+menuconfig BT
+ tristate "Bluetooth subsystem support"
+ depends on NET && !S390
+ depends on RFKILL || !RFKILL
+ select CRC16
+ select CRYPTO
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_AES
+ select CRYPTO_CMAC
+ select CRYPTO_ECB
+ select CRYPTO_SHA256
+ help
+ Bluetooth is low-cost, low-power, short-range wireless technology.
+ It was designed as a replacement for cables and other short-range
+ technologies like IrDA. Bluetooth operates in personal area range
+ that typically extends up to 10 meters. More information about
+ Bluetooth can be found at <http://www.bluetooth.com/>.
+
+ Linux Bluetooth subsystem consist of several layers:
+ Bluetooth Core
+ HCI device and connection manager, scheduler
+ SCO audio links
+ L2CAP (Logical Link Control and Adaptation Protocol)
+ SMP (Security Manager Protocol) on LE (Low Energy) links
+ HCI Device drivers (Interface to the hardware)
+ RFCOMM Module (RFCOMM Protocol)
+ BNEP Module (Bluetooth Network Encapsulation Protocol)
+ CMTP Module (CAPI Message Transport Protocol)
+ HIDP Module (Human Interface Device Protocol)
+
+ Say Y here to compile Bluetooth support into the kernel or say M to
+ compile it as module (bluetooth).
+
+ To use Linux Bluetooth subsystem, you will need several user-space
+ utilities like hciconfig and bluetoothd. These utilities and updates
+ to Bluetooth kernel modules are provided in the BlueZ packages. For
+ more information, see <http://www.bluez.org/>.
+
+config BT_BREDR
+ bool "Bluetooth Classic (BR/EDR) features"
+ depends on BT
+ default y
+
+source "net/bluetooth/rfcomm/Kconfig"
+
+source "net/bluetooth/bnep/Kconfig"
+
+source "net/bluetooth/cmtp/Kconfig"
+
+source "net/bluetooth/hidp/Kconfig"
+
+config BT_LE
+ bool "Bluetooth Low Energy (LE) features"
+ depends on BT
+ default y
+
+config BT_6LOWPAN
+ tristate "Bluetooth 6LoWPAN support"
+ depends on BT_LE && 6LOWPAN
+ help
+ IPv6 compression over Bluetooth Low Energy.
+
+config BT_SELFTEST
+ bool "Bluetooth self testing support"
+ depends on BT && DEBUG_KERNEL
+ help
+ Run self tests when initializing the Bluetooth subsystem. This
+ is a developer option and can cause significant delay when booting
+ the system.
+
+ When the Bluetooth subsystem is built as module, then the test
+ cases are run first thing at module load time. When the Bluetooth
+ subsystem is compiled into the kernel image, then the test cases
+ are run late in the initcall hierarchy.
+
+config BT_SELFTEST_ECDH
+ bool "ECDH test cases"
+ depends on BT_LE && BT_SELFTEST
+ help
+ Run test cases for ECDH cryptographic functionality used by the
+ Bluetooth Low Energy Secure Connections feature.
+
+config BT_SELFTEST_SMP
+ bool "SMP test cases"
+ depends on BT_LE && BT_SELFTEST
+ help
+ Run test cases for SMP cryptographic functionality, including both
+ legacy SMP as well as the Secure Connections features.
+
+config BT_DEBUGFS
+ bool "Export Bluetooth internals in debugfs"
+ depends on BT && DEBUG_FS
+ default y
+ help
+ Provide extensive information about internal Bluetooth states
+ in debugfs.
+
+source "drivers/bluetooth/Kconfig"
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
new file mode 100644
index 000000000..9a8ea232d
--- /dev/null
+++ b/net/bluetooth/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for the Linux Bluetooth subsystem.
+#
+
+obj-$(CONFIG_BT) += bluetooth.o
+obj-$(CONFIG_BT_RFCOMM) += rfcomm/
+obj-$(CONFIG_BT_BNEP) += bnep/
+obj-$(CONFIG_BT_CMTP) += cmtp/
+obj-$(CONFIG_BT_HIDP) += hidp/
+obj-$(CONFIG_BT_6LOWPAN) += bluetooth_6lowpan.o
+
+bluetooth_6lowpan-y := 6lowpan.o
+
+bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
+ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o sco.o lib.o \
+ a2mp.o amp.o ecc.o hci_request.o mgmt_util.o
+
+bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
+bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
+
+subdir-ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
new file mode 100644
index 000000000..5a04eb1a7
--- /dev/null
+++ b/net/bluetooth/a2mp.c
@@ -0,0 +1,1025 @@
+/*
+ Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved.
+ Copyright (c) 2011,2012 Intel Corp.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 and
+ only version 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*/
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "a2mp.h"
+#include "amp.h"
+
+#define A2MP_FEAT_EXT 0x8000
+
+/* Global AMP Manager list */
+static LIST_HEAD(amp_mgr_list);
+static DEFINE_MUTEX(amp_mgr_list_lock);
+
+/* A2MP build & send command helper functions */
+static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data)
+{
+ struct a2mp_cmd *cmd;
+ int plen;
+
+ plen = sizeof(*cmd) + len;
+ cmd = kzalloc(plen, GFP_KERNEL);
+ if (!cmd)
+ return NULL;
+
+ cmd->code = code;
+ cmd->ident = ident;
+ cmd->len = cpu_to_le16(len);
+
+ memcpy(cmd->data, data, len);
+
+ return cmd;
+}
+
+static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
+{
+ struct l2cap_chan *chan = mgr->a2mp_chan;
+ struct a2mp_cmd *cmd;
+ u16 total_len = len + sizeof(*cmd);
+ struct kvec iv;
+ struct msghdr msg;
+
+ cmd = __a2mp_build(code, ident, len, data);
+ if (!cmd)
+ return;
+
+ iv.iov_base = cmd;
+ iv.iov_len = total_len;
+
+ memset(&msg, 0, sizeof(msg));
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len);
+
+ l2cap_chan_send(chan, &msg, total_len);
+
+ kfree(cmd);
+}
+
+static u8 __next_ident(struct amp_mgr *mgr)
+{
+ if (++mgr->ident == 0)
+ mgr->ident = 1;
+
+ return mgr->ident;
+}
+
+static struct amp_mgr *amp_mgr_lookup_by_state(u8 state)
+{
+ struct amp_mgr *mgr;
+
+ mutex_lock(&amp_mgr_list_lock);
+ list_for_each_entry(mgr, &amp_mgr_list, list) {
+ if (test_and_clear_bit(state, &mgr->state)) {
+ amp_mgr_get(mgr);
+ mutex_unlock(&amp_mgr_list_lock);
+ return mgr;
+ }
+ }
+ mutex_unlock(&amp_mgr_list_lock);
+
+ return NULL;
+}
+
+/* hci_dev_list shall be locked */
+static void __a2mp_add_cl(struct amp_mgr *mgr, struct a2mp_cl *cl)
+{
+ struct hci_dev *hdev;
+ int i = 1;
+
+ cl[0].id = AMP_ID_BREDR;
+ cl[0].type = AMP_TYPE_BREDR;
+ cl[0].status = AMP_STATUS_BLUETOOTH_ONLY;
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ if (hdev->dev_type == HCI_AMP) {
+ cl[i].id = hdev->id;
+ cl[i].type = hdev->amp_type;
+ if (test_bit(HCI_UP, &hdev->flags))
+ cl[i].status = hdev->amp_status;
+ else
+ cl[i].status = AMP_STATUS_POWERED_DOWN;
+ i++;
+ }
+ }
+}
+
+/* Processing A2MP messages */
+static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_cmd_rej *rej = (void *) skb->data;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*rej))
+ return -EINVAL;
+
+ BT_DBG("ident %d reason %d", hdr->ident, le16_to_cpu(rej->reason));
+
+ skb_pull(skb, sizeof(*rej));
+
+ return 0;
+}
+
+static int a2mp_discover_req(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_discov_req *req = (void *) skb->data;
+ u16 len = le16_to_cpu(hdr->len);
+ struct a2mp_discov_rsp *rsp;
+ u16 ext_feat;
+ u8 num_ctrl;
+ struct hci_dev *hdev;
+
+ if (len < sizeof(*req))
+ return -EINVAL;
+
+ skb_pull(skb, sizeof(*req));
+
+ ext_feat = le16_to_cpu(req->ext_feat);
+
+ BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(req->mtu), ext_feat);
+
+ /* check that packet is not broken for now */
+ while (ext_feat & A2MP_FEAT_EXT) {
+ if (len < sizeof(ext_feat))
+ return -EINVAL;
+
+ ext_feat = get_unaligned_le16(skb->data);
+ BT_DBG("efm 0x%4.4x", ext_feat);
+ len -= sizeof(ext_feat);
+ skb_pull(skb, sizeof(ext_feat));
+ }
+
+ read_lock(&hci_dev_list_lock);
+
+ /* at minimum the BR/EDR needs to be listed */
+ num_ctrl = 1;
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ if (hdev->dev_type == HCI_AMP)
+ num_ctrl++;
+ }
+
+ len = num_ctrl * sizeof(struct a2mp_cl) + sizeof(*rsp);
+ rsp = kmalloc(len, GFP_ATOMIC);
+ if (!rsp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ rsp->mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
+ rsp->ext_feat = 0;
+
+ __a2mp_add_cl(mgr, rsp->cl);
+
+ read_unlock(&hci_dev_list_lock);
+
+ a2mp_send(mgr, A2MP_DISCOVER_RSP, hdr->ident, len, rsp);
+
+ kfree(rsp);
+ return 0;
+}
+
+static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_discov_rsp *rsp = (void *) skb->data;
+ u16 len = le16_to_cpu(hdr->len);
+ struct a2mp_cl *cl;
+ u16 ext_feat;
+ bool found = false;
+
+ if (len < sizeof(*rsp))
+ return -EINVAL;
+
+ len -= sizeof(*rsp);
+ skb_pull(skb, sizeof(*rsp));
+
+ ext_feat = le16_to_cpu(rsp->ext_feat);
+
+ BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(rsp->mtu), ext_feat);
+
+ /* check that packet is not broken for now */
+ while (ext_feat & A2MP_FEAT_EXT) {
+ if (len < sizeof(ext_feat))
+ return -EINVAL;
+
+ ext_feat = get_unaligned_le16(skb->data);
+ BT_DBG("efm 0x%4.4x", ext_feat);
+ len -= sizeof(ext_feat);
+ skb_pull(skb, sizeof(ext_feat));
+ }
+
+ cl = (void *) skb->data;
+ while (len >= sizeof(*cl)) {
+ BT_DBG("Remote AMP id %d type %d status %d", cl->id, cl->type,
+ cl->status);
+
+ if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) {
+ struct a2mp_info_req req;
+
+ found = true;
+ req.id = cl->id;
+ a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr),
+ sizeof(req), &req);
+ }
+
+ len -= sizeof(*cl);
+ cl = (void *) skb_pull(skb, sizeof(*cl));
+ }
+
+ /* Fall back to L2CAP init sequence */
+ if (!found) {
+ struct l2cap_conn *conn = mgr->l2cap_conn;
+ struct l2cap_chan *chan;
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+
+ BT_DBG("chan %p state %s", chan,
+ state_to_string(chan->state));
+
+ if (chan->scid == L2CAP_CID_A2MP)
+ continue;
+
+ l2cap_chan_lock(chan);
+
+ if (chan->state == BT_CONNECT)
+ l2cap_send_conn_req(chan);
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+ }
+
+ return 0;
+}
+
+static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_cl *cl = (void *) skb->data;
+
+ while (skb->len >= sizeof(*cl)) {
+ BT_DBG("Controller id %d type %d status %d", cl->id, cl->type,
+ cl->status);
+ cl = (struct a2mp_cl *) skb_pull(skb, sizeof(*cl));
+ }
+
+ /* TODO send A2MP_CHANGE_RSP */
+
+ return 0;
+}
+
+static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_info_req *req = (void *) skb->data;
+ struct hci_dev *hdev;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*req))
+ return -EINVAL;
+
+ BT_DBG("id %d", req->id);
+
+ hdev = hci_dev_get(req->id);
+ if (!hdev || hdev->dev_type != HCI_AMP) {
+ struct a2mp_info_rsp rsp;
+
+ rsp.id = req->id;
+ rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
+
+ a2mp_send(mgr, A2MP_GETINFO_RSP, hdr->ident, sizeof(rsp),
+ &rsp);
+
+ goto done;
+ }
+
+ set_bit(READ_LOC_AMP_INFO, &mgr->state);
+ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+
+done:
+ if (hdev)
+ hci_dev_put(hdev);
+
+ skb_pull(skb, sizeof(*req));
+ return 0;
+}
+
+static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_info_rsp *rsp = (struct a2mp_info_rsp *) skb->data;
+ struct a2mp_amp_assoc_req req;
+ struct amp_ctrl *ctrl;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*rsp))
+ return -EINVAL;
+
+ BT_DBG("id %d status 0x%2.2x", rsp->id, rsp->status);
+
+ if (rsp->status)
+ return -EINVAL;
+
+ ctrl = amp_ctrl_add(mgr, rsp->id);
+ if (!ctrl)
+ return -ENOMEM;
+
+ req.id = rsp->id;
+ a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req),
+ &req);
+
+ skb_pull(skb, sizeof(*rsp));
+ return 0;
+}
+
+static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_amp_assoc_req *req = (void *) skb->data;
+ struct hci_dev *hdev;
+ struct amp_mgr *tmp;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*req))
+ return -EINVAL;
+
+ BT_DBG("id %d", req->id);
+
+ /* Make sure that other request is not processed */
+ tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
+
+ hdev = hci_dev_get(req->id);
+ if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) {
+ struct a2mp_amp_assoc_rsp rsp;
+ rsp.id = req->id;
+
+ if (tmp) {
+ rsp.status = A2MP_STATUS_COLLISION_OCCURED;
+ amp_mgr_put(tmp);
+ } else {
+ rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
+ }
+
+ a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, hdr->ident, sizeof(rsp),
+ &rsp);
+
+ goto done;
+ }
+
+ amp_read_loc_assoc(hdev, mgr);
+
+done:
+ if (hdev)
+ hci_dev_put(hdev);
+
+ skb_pull(skb, sizeof(*req));
+ return 0;
+}
+
+static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_amp_assoc_rsp *rsp = (void *) skb->data;
+ u16 len = le16_to_cpu(hdr->len);
+ struct hci_dev *hdev;
+ struct amp_ctrl *ctrl;
+ struct hci_conn *hcon;
+ size_t assoc_len;
+
+ if (len < sizeof(*rsp))
+ return -EINVAL;
+
+ assoc_len = len - sizeof(*rsp);
+
+ BT_DBG("id %d status 0x%2.2x assoc len %zu", rsp->id, rsp->status,
+ assoc_len);
+
+ if (rsp->status)
+ return -EINVAL;
+
+ /* Save remote ASSOC data */
+ ctrl = amp_ctrl_lookup(mgr, rsp->id);
+ if (ctrl) {
+ u8 *assoc;
+
+ assoc = kmemdup(rsp->amp_assoc, assoc_len, GFP_KERNEL);
+ if (!assoc) {
+ amp_ctrl_put(ctrl);
+ return -ENOMEM;
+ }
+
+ ctrl->assoc = assoc;
+ ctrl->assoc_len = assoc_len;
+ ctrl->assoc_rem_len = assoc_len;
+ ctrl->assoc_len_so_far = 0;
+
+ amp_ctrl_put(ctrl);
+ }
+
+ /* Create Phys Link */
+ hdev = hci_dev_get(rsp->id);
+ if (!hdev)
+ return -EINVAL;
+
+ hcon = phylink_add(hdev, mgr, rsp->id, true);
+ if (!hcon)
+ goto done;
+
+ BT_DBG("Created hcon %p: loc:%d -> rem:%d", hcon, hdev->id, rsp->id);
+
+ mgr->bredr_chan->remote_amp_id = rsp->id;
+
+ amp_create_phylink(hdev, mgr, hcon);
+
+done:
+ hci_dev_put(hdev);
+ skb_pull(skb, len);
+ return 0;
+}
+
+static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_physlink_req *req = (void *) skb->data;
+
+ struct a2mp_physlink_rsp rsp;
+ struct hci_dev *hdev;
+ struct hci_conn *hcon;
+ struct amp_ctrl *ctrl;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*req))
+ return -EINVAL;
+
+ BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id);
+
+ rsp.local_id = req->remote_id;
+ rsp.remote_id = req->local_id;
+
+ hdev = hci_dev_get(req->remote_id);
+ if (!hdev || hdev->amp_type == AMP_TYPE_BREDR) {
+ rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
+ goto send_rsp;
+ }
+
+ ctrl = amp_ctrl_lookup(mgr, rsp.remote_id);
+ if (!ctrl) {
+ ctrl = amp_ctrl_add(mgr, rsp.remote_id);
+ if (ctrl) {
+ amp_ctrl_get(ctrl);
+ } else {
+ rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
+ goto send_rsp;
+ }
+ }
+
+ if (ctrl) {
+ size_t assoc_len = le16_to_cpu(hdr->len) - sizeof(*req);
+ u8 *assoc;
+
+ assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL);
+ if (!assoc) {
+ amp_ctrl_put(ctrl);
+ return -ENOMEM;
+ }
+
+ ctrl->assoc = assoc;
+ ctrl->assoc_len = assoc_len;
+ ctrl->assoc_rem_len = assoc_len;
+ ctrl->assoc_len_so_far = 0;
+
+ amp_ctrl_put(ctrl);
+ }
+
+ hcon = phylink_add(hdev, mgr, req->local_id, false);
+ if (hcon) {
+ amp_accept_phylink(hdev, mgr, hcon);
+ rsp.status = A2MP_STATUS_SUCCESS;
+ } else {
+ rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
+ }
+
+send_rsp:
+ if (hdev)
+ hci_dev_put(hdev);
+
+ /* Reply error now and success after HCI Write Remote AMP Assoc
+ command complete with success status
+ */
+ if (rsp.status != A2MP_STATUS_SUCCESS) {
+ a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident,
+ sizeof(rsp), &rsp);
+ } else {
+ set_bit(WRITE_REMOTE_AMP_ASSOC, &mgr->state);
+ mgr->ident = hdr->ident;
+ }
+
+ skb_pull(skb, le16_to_cpu(hdr->len));
+ return 0;
+}
+
+static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ struct a2mp_physlink_req *req = (void *) skb->data;
+ struct a2mp_physlink_rsp rsp;
+ struct hci_dev *hdev;
+ struct hci_conn *hcon;
+
+ if (le16_to_cpu(hdr->len) < sizeof(*req))
+ return -EINVAL;
+
+ BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id);
+
+ rsp.local_id = req->remote_id;
+ rsp.remote_id = req->local_id;
+ rsp.status = A2MP_STATUS_SUCCESS;
+
+ hdev = hci_dev_get(req->remote_id);
+ if (!hdev) {
+ rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
+ goto send_rsp;
+ }
+
+ hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
+ &mgr->l2cap_conn->hcon->dst);
+ if (!hcon) {
+ BT_ERR("No phys link exist");
+ rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS;
+ goto clean;
+ }
+
+ /* TODO Disconnect Phys Link here */
+
+clean:
+ hci_dev_put(hdev);
+
+send_rsp:
+ a2mp_send(mgr, A2MP_DISCONNPHYSLINK_RSP, hdr->ident, sizeof(rsp), &rsp);
+
+ skb_pull(skb, sizeof(*req));
+ return 0;
+}
+
+static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
+ struct a2mp_cmd *hdr)
+{
+ BT_DBG("ident %d code 0x%2.2x", hdr->ident, hdr->code);
+
+ skb_pull(skb, le16_to_cpu(hdr->len));
+ return 0;
+}
+
+/* Handle A2MP signalling */
+static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct a2mp_cmd *hdr;
+ struct amp_mgr *mgr = chan->data;
+ int err = 0;
+
+ amp_mgr_get(mgr);
+
+ while (skb->len >= sizeof(*hdr)) {
+ u16 len;
+
+ hdr = (void *) skb->data;
+ len = le16_to_cpu(hdr->len);
+
+ BT_DBG("code 0x%2.2x id %d len %u", hdr->code, hdr->ident, len);
+
+ skb_pull(skb, sizeof(*hdr));
+
+ if (len > skb->len || !hdr->ident) {
+ err = -EINVAL;
+ break;
+ }
+
+ mgr->ident = hdr->ident;
+
+ switch (hdr->code) {
+ case A2MP_COMMAND_REJ:
+ a2mp_command_rej(mgr, skb, hdr);
+ break;
+
+ case A2MP_DISCOVER_REQ:
+ err = a2mp_discover_req(mgr, skb, hdr);
+ break;
+
+ case A2MP_CHANGE_NOTIFY:
+ err = a2mp_change_notify(mgr, skb, hdr);
+ break;
+
+ case A2MP_GETINFO_REQ:
+ err = a2mp_getinfo_req(mgr, skb, hdr);
+ break;
+
+ case A2MP_GETAMPASSOC_REQ:
+ err = a2mp_getampassoc_req(mgr, skb, hdr);
+ break;
+
+ case A2MP_CREATEPHYSLINK_REQ:
+ err = a2mp_createphyslink_req(mgr, skb, hdr);
+ break;
+
+ case A2MP_DISCONNPHYSLINK_REQ:
+ err = a2mp_discphyslink_req(mgr, skb, hdr);
+ break;
+
+ case A2MP_DISCOVER_RSP:
+ err = a2mp_discover_rsp(mgr, skb, hdr);
+ break;
+
+ case A2MP_GETINFO_RSP:
+ err = a2mp_getinfo_rsp(mgr, skb, hdr);
+ break;
+
+ case A2MP_GETAMPASSOC_RSP:
+ err = a2mp_getampassoc_rsp(mgr, skb, hdr);
+ break;
+
+ case A2MP_CHANGE_RSP:
+ case A2MP_CREATEPHYSLINK_RSP:
+ case A2MP_DISCONNPHYSLINK_RSP:
+ err = a2mp_cmd_rsp(mgr, skb, hdr);
+ break;
+
+ default:
+ BT_ERR("Unknown A2MP sig cmd 0x%2.2x", hdr->code);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ if (err) {
+ struct a2mp_cmd_rej rej;
+
+ rej.reason = cpu_to_le16(0);
+ hdr = (void *) skb->data;
+
+ BT_DBG("Send A2MP Rej: cmd 0x%2.2x err %d", hdr->code, err);
+
+ a2mp_send(mgr, A2MP_COMMAND_REJ, hdr->ident, sizeof(rej),
+ &rej);
+ }
+
+ /* Always free skb and return success error code to prevent
+ from sending L2CAP Disconnect over A2MP channel */
+ kfree_skb(skb);
+
+ amp_mgr_put(mgr);
+
+ return 0;
+}
+
+static void a2mp_chan_close_cb(struct l2cap_chan *chan)
+{
+ l2cap_chan_put(chan);
+}
+
+static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state,
+ int err)
+{
+ struct amp_mgr *mgr = chan->data;
+
+ if (!mgr)
+ return;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(state));
+
+ chan->state = state;
+
+ switch (state) {
+ case BT_CLOSED:
+ if (mgr)
+ amp_mgr_put(mgr);
+ break;
+ }
+}
+
+static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ return skb;
+}
+
+static const struct l2cap_ops a2mp_chan_ops = {
+ .name = "L2CAP A2MP channel",
+ .recv = a2mp_chan_recv_cb,
+ .close = a2mp_chan_close_cb,
+ .state_change = a2mp_chan_state_change_cb,
+ .alloc_skb = a2mp_chan_alloc_skb_cb,
+
+ /* Not implemented for A2MP */
+ .new_connection = l2cap_chan_no_new_connection,
+ .teardown = l2cap_chan_no_teardown,
+ .ready = l2cap_chan_no_ready,
+ .defer = l2cap_chan_no_defer,
+ .resume = l2cap_chan_no_resume,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+ .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
+};
+
+static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
+{
+ struct l2cap_chan *chan;
+ int err;
+
+ chan = l2cap_chan_create();
+ if (!chan)
+ return NULL;
+
+ BT_DBG("chan %p", chan);
+
+ chan->chan_type = L2CAP_CHAN_FIXED;
+ chan->scid = L2CAP_CID_A2MP;
+ chan->dcid = L2CAP_CID_A2MP;
+ chan->omtu = L2CAP_A2MP_DEFAULT_MTU;
+ chan->imtu = L2CAP_A2MP_DEFAULT_MTU;
+ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+
+ chan->ops = &a2mp_chan_ops;
+
+ l2cap_chan_set_defaults(chan);
+ chan->remote_max_tx = chan->max_tx;
+ chan->remote_tx_win = chan->tx_win;
+
+ chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
+ chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
+
+ skb_queue_head_init(&chan->tx_q);
+
+ chan->mode = L2CAP_MODE_ERTM;
+
+ err = l2cap_ertm_init(chan);
+ if (err < 0) {
+ l2cap_chan_del(chan, 0);
+ return NULL;
+ }
+
+ chan->conf_state = 0;
+
+ if (locked)
+ __l2cap_chan_add(conn, chan);
+ else
+ l2cap_chan_add(conn, chan);
+
+ chan->remote_mps = chan->omtu;
+ chan->mps = chan->omtu;
+
+ chan->state = BT_CONNECTED;
+
+ return chan;
+}
+
+/* AMP Manager functions */
+struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
+{
+ BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+
+ kref_get(&mgr->kref);
+
+ return mgr;
+}
+
+static void amp_mgr_destroy(struct kref *kref)
+{
+ struct amp_mgr *mgr = container_of(kref, struct amp_mgr, kref);
+
+ BT_DBG("mgr %p", mgr);
+
+ mutex_lock(&amp_mgr_list_lock);
+ list_del(&mgr->list);
+ mutex_unlock(&amp_mgr_list_lock);
+
+ amp_ctrl_list_flush(mgr);
+ kfree(mgr);
+}
+
+int amp_mgr_put(struct amp_mgr *mgr)
+{
+ BT_DBG("mgr %p orig refcnt %d", mgr, atomic_read(&mgr->kref.refcount));
+
+ return kref_put(&mgr->kref, &amp_mgr_destroy);
+}
+
+static struct amp_mgr *amp_mgr_create(struct l2cap_conn *conn, bool locked)
+{
+ struct amp_mgr *mgr;
+ struct l2cap_chan *chan;
+
+ mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
+ if (!mgr)
+ return NULL;
+
+ BT_DBG("conn %p mgr %p", conn, mgr);
+
+ mgr->l2cap_conn = conn;
+
+ chan = a2mp_chan_open(conn, locked);
+ if (!chan) {
+ kfree(mgr);
+ return NULL;
+ }
+
+ mgr->a2mp_chan = chan;
+ chan->data = mgr;
+
+ conn->hcon->amp_mgr = mgr;
+
+ kref_init(&mgr->kref);
+
+ /* Remote AMP ctrl list initialization */
+ INIT_LIST_HEAD(&mgr->amp_ctrls);
+ mutex_init(&mgr->amp_ctrls_lock);
+
+ mutex_lock(&amp_mgr_list_lock);
+ list_add(&mgr->list, &amp_mgr_list);
+ mutex_unlock(&amp_mgr_list_lock);
+
+ return mgr;
+}
+
+struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct amp_mgr *mgr;
+
+ if (conn->hcon->type != ACL_LINK)
+ return NULL;
+
+ mgr = amp_mgr_create(conn, false);
+ if (!mgr) {
+ BT_ERR("Could not create AMP manager");
+ return NULL;
+ }
+
+ BT_DBG("mgr: %p chan %p", mgr, mgr->a2mp_chan);
+
+ return mgr->a2mp_chan;
+}
+
+void a2mp_send_getinfo_rsp(struct hci_dev *hdev)
+{
+ struct amp_mgr *mgr;
+ struct a2mp_info_rsp rsp;
+
+ mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_INFO);
+ if (!mgr)
+ return;
+
+ BT_DBG("%s mgr %p", hdev->name, mgr);
+
+ rsp.id = hdev->id;
+ rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
+
+ if (hdev->amp_type != AMP_TYPE_BREDR) {
+ rsp.status = 0;
+ rsp.total_bw = cpu_to_le32(hdev->amp_total_bw);
+ rsp.max_bw = cpu_to_le32(hdev->amp_max_bw);
+ rsp.min_latency = cpu_to_le32(hdev->amp_min_latency);
+ rsp.pal_cap = cpu_to_le16(hdev->amp_pal_cap);
+ rsp.assoc_size = cpu_to_le16(hdev->amp_assoc_size);
+ }
+
+ a2mp_send(mgr, A2MP_GETINFO_RSP, mgr->ident, sizeof(rsp), &rsp);
+ amp_mgr_put(mgr);
+}
+
+void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status)
+{
+ struct amp_mgr *mgr;
+ struct amp_assoc *loc_assoc = &hdev->loc_assoc;
+ struct a2mp_amp_assoc_rsp *rsp;
+ size_t len;
+
+ mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
+ if (!mgr)
+ return;
+
+ BT_DBG("%s mgr %p", hdev->name, mgr);
+
+ len = sizeof(struct a2mp_amp_assoc_rsp) + loc_assoc->len;
+ rsp = kzalloc(len, GFP_KERNEL);
+ if (!rsp) {
+ amp_mgr_put(mgr);
+ return;
+ }
+
+ rsp->id = hdev->id;
+
+ if (status) {
+ rsp->status = A2MP_STATUS_INVALID_CTRL_ID;
+ } else {
+ rsp->status = A2MP_STATUS_SUCCESS;
+ memcpy(rsp->amp_assoc, loc_assoc->data, loc_assoc->len);
+ }
+
+ a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, mgr->ident, len, rsp);
+ amp_mgr_put(mgr);
+ kfree(rsp);
+}
+
+void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status)
+{
+ struct amp_mgr *mgr;
+ struct amp_assoc *loc_assoc = &hdev->loc_assoc;
+ struct a2mp_physlink_req *req;
+ struct l2cap_chan *bredr_chan;
+ size_t len;
+
+ mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC_FINAL);
+ if (!mgr)
+ return;
+
+ len = sizeof(*req) + loc_assoc->len;
+
+ BT_DBG("%s mgr %p assoc_len %zu", hdev->name, mgr, len);
+
+ req = kzalloc(len, GFP_KERNEL);
+ if (!req) {
+ amp_mgr_put(mgr);
+ return;
+ }
+
+ bredr_chan = mgr->bredr_chan;
+ if (!bredr_chan)
+ goto clean;
+
+ req->local_id = hdev->id;
+ req->remote_id = bredr_chan->remote_amp_id;
+ memcpy(req->amp_assoc, loc_assoc->data, loc_assoc->len);
+
+ a2mp_send(mgr, A2MP_CREATEPHYSLINK_REQ, __next_ident(mgr), len, req);
+
+clean:
+ amp_mgr_put(mgr);
+ kfree(req);
+}
+
+void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status)
+{
+ struct amp_mgr *mgr;
+ struct a2mp_physlink_rsp rsp;
+ struct hci_conn *hs_hcon;
+
+ mgr = amp_mgr_lookup_by_state(WRITE_REMOTE_AMP_ASSOC);
+ if (!mgr)
+ return;
+
+ hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT);
+ if (!hs_hcon) {
+ rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
+ } else {
+ rsp.remote_id = hs_hcon->remote_id;
+ rsp.status = A2MP_STATUS_SUCCESS;
+ }
+
+ BT_DBG("%s mgr %p hs_hcon %p status %u", hdev->name, mgr, hs_hcon,
+ status);
+
+ rsp.local_id = hdev->id;
+ a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, mgr->ident, sizeof(rsp), &rsp);
+ amp_mgr_put(mgr);
+}
+
+void a2mp_discover_amp(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct amp_mgr *mgr = conn->hcon->amp_mgr;
+ struct a2mp_discov_req req;
+
+ BT_DBG("chan %p conn %p mgr %p", chan, conn, mgr);
+
+ if (!mgr) {
+ mgr = amp_mgr_create(conn, true);
+ if (!mgr)
+ return;
+ }
+
+ mgr->bredr_chan = chan;
+
+ req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
+ req.ext_feat = 0;
+ a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req);
+}
diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h
new file mode 100644
index 000000000..296f665ad
--- /dev/null
+++ b/net/bluetooth/a2mp.h
@@ -0,0 +1,142 @@
+/*
+ Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved.
+ Copyright (c) 2011,2012 Intel Corp.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 and
+ only version 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*/
+
+#ifndef __A2MP_H
+#define __A2MP_H
+
+#include <net/bluetooth/l2cap.h>
+
+enum amp_mgr_state {
+ READ_LOC_AMP_INFO,
+ READ_LOC_AMP_ASSOC,
+ READ_LOC_AMP_ASSOC_FINAL,
+ WRITE_REMOTE_AMP_ASSOC,
+};
+
+struct amp_mgr {
+ struct list_head list;
+ struct l2cap_conn *l2cap_conn;
+ struct l2cap_chan *a2mp_chan;
+ struct l2cap_chan *bredr_chan;
+ struct kref kref;
+ __u8 ident;
+ __u8 handle;
+ unsigned long state;
+ unsigned long flags;
+
+ struct list_head amp_ctrls;
+ struct mutex amp_ctrls_lock;
+};
+
+struct a2mp_cmd {
+ __u8 code;
+ __u8 ident;
+ __le16 len;
+ __u8 data[0];
+} __packed;
+
+/* A2MP command codes */
+#define A2MP_COMMAND_REJ 0x01
+struct a2mp_cmd_rej {
+ __le16 reason;
+ __u8 data[0];
+} __packed;
+
+#define A2MP_DISCOVER_REQ 0x02
+struct a2mp_discov_req {
+ __le16 mtu;
+ __le16 ext_feat;
+} __packed;
+
+struct a2mp_cl {
+ __u8 id;
+ __u8 type;
+ __u8 status;
+} __packed;
+
+#define A2MP_DISCOVER_RSP 0x03
+struct a2mp_discov_rsp {
+ __le16 mtu;
+ __le16 ext_feat;
+ struct a2mp_cl cl[0];
+} __packed;
+
+#define A2MP_CHANGE_NOTIFY 0x04
+#define A2MP_CHANGE_RSP 0x05
+
+#define A2MP_GETINFO_REQ 0x06
+struct a2mp_info_req {
+ __u8 id;
+} __packed;
+
+#define A2MP_GETINFO_RSP 0x07
+struct a2mp_info_rsp {
+ __u8 id;
+ __u8 status;
+ __le32 total_bw;
+ __le32 max_bw;
+ __le32 min_latency;
+ __le16 pal_cap;
+ __le16 assoc_size;
+} __packed;
+
+#define A2MP_GETAMPASSOC_REQ 0x08
+struct a2mp_amp_assoc_req {
+ __u8 id;
+} __packed;
+
+#define A2MP_GETAMPASSOC_RSP 0x09
+struct a2mp_amp_assoc_rsp {
+ __u8 id;
+ __u8 status;
+ __u8 amp_assoc[0];
+} __packed;
+
+#define A2MP_CREATEPHYSLINK_REQ 0x0A
+#define A2MP_DISCONNPHYSLINK_REQ 0x0C
+struct a2mp_physlink_req {
+ __u8 local_id;
+ __u8 remote_id;
+ __u8 amp_assoc[0];
+} __packed;
+
+#define A2MP_CREATEPHYSLINK_RSP 0x0B
+#define A2MP_DISCONNPHYSLINK_RSP 0x0D
+struct a2mp_physlink_rsp {
+ __u8 local_id;
+ __u8 remote_id;
+ __u8 status;
+} __packed;
+
+/* A2MP response status */
+#define A2MP_STATUS_SUCCESS 0x00
+#define A2MP_STATUS_INVALID_CTRL_ID 0x01
+#define A2MP_STATUS_UNABLE_START_LINK_CREATION 0x02
+#define A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS 0x02
+#define A2MP_STATUS_COLLISION_OCCURED 0x03
+#define A2MP_STATUS_DISCONN_REQ_RECVD 0x04
+#define A2MP_STATUS_PHYS_LINK_EXISTS 0x05
+#define A2MP_STATUS_SECURITY_VIOLATION 0x06
+
+struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr);
+int amp_mgr_put(struct amp_mgr *mgr);
+struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
+ struct sk_buff *skb);
+void a2mp_discover_amp(struct l2cap_chan *chan);
+void a2mp_send_getinfo_rsp(struct hci_dev *hdev);
+void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status);
+void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status);
+void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status);
+
+#endif /* __A2MP_H */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
new file mode 100644
index 000000000..70f9d945f
--- /dev/null
+++ b/net/bluetooth/af_bluetooth.c
@@ -0,0 +1,795 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth address family and sockets. */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <asm/ioctls.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <linux/proc_fs.h>
+
+#include "selftest.h"
+
+#define VERSION "2.20"
+
+/* Bluetooth sockets */
+#define BT_MAX_PROTO 8
+static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
+static DEFINE_RWLOCK(bt_proto_lock);
+
+static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
+static const char *const bt_key_strings[BT_MAX_PROTO] = {
+ "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_HCI",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_SCO",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_BNEP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_CMTP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_HIDP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP",
+};
+
+static struct lock_class_key bt_slock_key[BT_MAX_PROTO];
+static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
+ "slock-AF_BLUETOOTH-BTPROTO_L2CAP",
+ "slock-AF_BLUETOOTH-BTPROTO_HCI",
+ "slock-AF_BLUETOOTH-BTPROTO_SCO",
+ "slock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+ "slock-AF_BLUETOOTH-BTPROTO_BNEP",
+ "slock-AF_BLUETOOTH-BTPROTO_CMTP",
+ "slock-AF_BLUETOOTH-BTPROTO_HIDP",
+ "slock-AF_BLUETOOTH-BTPROTO_AVDTP",
+};
+
+void bt_sock_reclassify_lock(struct sock *sk, int proto)
+{
+ BUG_ON(!sk);
+ BUG_ON(sock_owned_by_user(sk));
+
+ sock_lock_init_class_and_name(sk,
+ bt_slock_key_strings[proto], &bt_slock_key[proto],
+ bt_key_strings[proto], &bt_lock_key[proto]);
+}
+EXPORT_SYMBOL(bt_sock_reclassify_lock);
+
+int bt_sock_register(int proto, const struct net_proto_family *ops)
+{
+ int err = 0;
+
+ if (proto < 0 || proto >= BT_MAX_PROTO)
+ return -EINVAL;
+
+ write_lock(&bt_proto_lock);
+
+ if (bt_proto[proto])
+ err = -EEXIST;
+ else
+ bt_proto[proto] = ops;
+
+ write_unlock(&bt_proto_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_register);
+
+void bt_sock_unregister(int proto)
+{
+ if (proto < 0 || proto >= BT_MAX_PROTO)
+ return;
+
+ write_lock(&bt_proto_lock);
+ bt_proto[proto] = NULL;
+ write_unlock(&bt_proto_lock);
+}
+EXPORT_SYMBOL(bt_sock_unregister);
+
+static int bt_sock_create(struct net *net, struct socket *sock, int proto,
+ int kern)
+{
+ int err;
+
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
+ if (proto < 0 || proto >= BT_MAX_PROTO)
+ return -EINVAL;
+
+ if (!bt_proto[proto])
+ request_module("bt-proto-%d", proto);
+
+ err = -EPROTONOSUPPORT;
+
+ read_lock(&bt_proto_lock);
+
+ if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) {
+ err = bt_proto[proto]->create(net, sock, proto, kern);
+ if (!err)
+ bt_sock_reclassify_lock(sock->sk, proto);
+ module_put(bt_proto[proto]->owner);
+ }
+
+ read_unlock(&bt_proto_lock);
+
+ return err;
+}
+
+void bt_sock_link(struct bt_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_add_node(sk, &l->head);
+ write_unlock(&l->lock);
+}
+EXPORT_SYMBOL(bt_sock_link);
+
+void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_del_node_init(sk);
+ write_unlock(&l->lock);
+}
+EXPORT_SYMBOL(bt_sock_unlink);
+
+void bt_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ BT_DBG("parent %p, sk %p", parent, sk);
+
+ sock_hold(sk);
+ list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
+ bt_sk(sk)->parent = parent;
+ parent->sk_ack_backlog++;
+}
+EXPORT_SYMBOL(bt_accept_enqueue);
+
+void bt_accept_unlink(struct sock *sk)
+{
+ BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+ list_del_init(&bt_sk(sk)->accept_q);
+ bt_sk(sk)->parent->sk_ack_backlog--;
+ bt_sk(sk)->parent = NULL;
+ sock_put(sk);
+}
+EXPORT_SYMBOL(bt_accept_unlink);
+
+struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
+{
+ struct list_head *p, *n;
+ struct sock *sk;
+
+ BT_DBG("parent %p", parent);
+
+ list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
+ sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+
+ lock_sock(sk);
+
+ /* FIXME: Is this check still needed */
+ if (sk->sk_state == BT_CLOSED) {
+ release_sock(sk);
+ bt_accept_unlink(sk);
+ continue;
+ }
+
+ if (sk->sk_state == BT_CONNECTED || !newsock ||
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) {
+ bt_accept_unlink(sk);
+ if (newsock)
+ sock_graft(sk, newsock);
+
+ release_sock(sk);
+ return sk;
+ }
+
+ release_sock(sk);
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(bt_accept_dequeue);
+
+int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ int noblock = flags & MSG_DONTWAIT;
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ size_t copied;
+ int err;
+
+ BT_DBG("sock %p sk %p len %zu", sock, sk, len);
+
+ if (flags & (MSG_OOB))
+ return -EOPNOTSUPP;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb) {
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+
+ return err;
+ }
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ skb_reset_transport_header(skb);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err == 0) {
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (bt_sk(sk)->skb_msg_name)
+ bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
+ &msg->msg_namelen);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ return err ? : copied;
+}
+EXPORT_SYMBOL(bt_sock_recvmsg);
+
+static long bt_sock_data_wait(struct sock *sk, long timeo)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+
+ if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
+ break;
+
+ if (signal_pending(current) || !timeo)
+ break;
+
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ size_t target, copied = 0;
+ long timeo;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ BT_DBG("sk %p size %zu", sk, size);
+
+ lock_sock(sk);
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ struct sk_buff *skb;
+ int chunk;
+
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (!skb) {
+ if (copied >= target)
+ break;
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+
+ timeo = bt_sock_data_wait(sk, timeo);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+ continue;
+ }
+
+ chunk = min_t(unsigned int, skb->len, size);
+ if (skb_copy_datagram_msg(skb, 0, msg, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ int skb_len = skb_headlen(skb);
+
+ if (chunk <= skb_len) {
+ __skb_pull(skb, chunk);
+ } else {
+ struct sk_buff *frag;
+
+ __skb_pull(skb, skb_len);
+ chunk -= skb_len;
+
+ skb_walk_frags(skb, frag) {
+ if (chunk <= frag->len) {
+ /* Pulling partial data */
+ skb->len -= chunk;
+ skb->data_len -= chunk;
+ __skb_pull(frag, chunk);
+ break;
+ } else if (frag->len) {
+ /* Pulling all frag data */
+ chunk -= frag->len;
+ skb->len -= frag->len;
+ skb->data_len -= frag->len;
+ __skb_pull(frag, frag->len);
+ }
+ }
+ }
+
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+
+ } else {
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+
+out:
+ release_sock(sk);
+ return copied ? : err;
+}
+EXPORT_SYMBOL(bt_sock_stream_recvmsg);
+
+static inline unsigned int bt_accept_poll(struct sock *parent)
+{
+ struct list_head *p, *n;
+ struct sock *sk;
+
+ list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
+ sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+ if (sk->sk_state == BT_CONNECTED ||
+ (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) &&
+ sk->sk_state == BT_CONNECT2))
+ return POLLIN | POLLRDNORM;
+ }
+
+ return 0;
+}
+
+unsigned int bt_sock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ poll_wait(file, sk_sleep(sk), wait);
+
+ if (sk->sk_state == BT_LISTEN)
+ return bt_accept_poll(sk);
+
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sk->sk_state == BT_CLOSED)
+ mask |= POLLHUP;
+
+ if (sk->sk_state == BT_CONNECT ||
+ sk->sk_state == BT_CONNECT2 ||
+ sk->sk_state == BT_CONFIG)
+ return mask;
+
+ if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ return mask;
+}
+EXPORT_SYMBOL(bt_sock_poll);
+
+int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ long amount;
+ int err;
+
+ BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+ switch (cmd) {
+ case TIOCOUTQ:
+ if (sk->sk_state == BT_LISTEN)
+ return -EINVAL;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ err = put_user(amount, (int __user *) arg);
+ break;
+
+ case TIOCINQ:
+ if (sk->sk_state == BT_LISTEN)
+ return -EINVAL;
+
+ lock_sock(sk);
+ skb = skb_peek(&sk->sk_receive_queue);
+ amount = skb ? skb->len : 0;
+ release_sock(sk);
+ err = put_user(amount, (int __user *) arg);
+ break;
+
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *) arg);
+ break;
+
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *) arg);
+ break;
+
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_ioctl);
+
+/* This function expects the sk lock to be held when called */
+int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (sk->sk_state != state) {
+ if (!timeo) {
+ err = -EINPROGRESS;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_wait_state);
+
+/* This function expects the sk lock to be held when called */
+int bt_sock_wait_ready(struct sock *sk, unsigned long flags)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ unsigned long timeo;
+ int err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags)) {
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ return err;
+}
+EXPORT_SYMBOL(bt_sock_wait_ready);
+
+#ifdef CONFIG_PROC_FS
+struct bt_seq_state {
+ struct bt_sock_list *l;
+};
+
+static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(seq->private->l->lock)
+{
+ struct bt_seq_state *s = seq->private;
+ struct bt_sock_list *l = s->l;
+
+ read_lock(&l->lock);
+ return seq_hlist_start_head(&l->head, *pos);
+}
+
+static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bt_seq_state *s = seq->private;
+ struct bt_sock_list *l = s->l;
+
+ return seq_hlist_next(v, &l->head, pos);
+}
+
+static void bt_seq_stop(struct seq_file *seq, void *v)
+ __releases(seq->private->l->lock)
+{
+ struct bt_seq_state *s = seq->private;
+ struct bt_sock_list *l = s->l;
+
+ read_unlock(&l->lock);
+}
+
+static int bt_seq_show(struct seq_file *seq, void *v)
+{
+ struct bt_seq_state *s = seq->private;
+ struct bt_sock_list *l = s->l;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq ,"sk RefCnt Rmem Wmem User Inode Parent");
+
+ if (l->custom_seq_show) {
+ seq_putc(seq, ' ');
+ l->custom_seq_show(seq, v);
+ }
+
+ seq_putc(seq, '\n');
+ } else {
+ struct sock *sk = sk_entry(v);
+ struct bt_sock *bt = bt_sk(sk);
+
+ seq_printf(seq,
+ "%pK %-6d %-6u %-6u %-6u %-6lu %-6lu",
+ sk,
+ atomic_read(&sk->sk_refcnt),
+ sk_rmem_alloc_get(sk),
+ sk_wmem_alloc_get(sk),
+ from_kuid(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk),
+ bt->parent? sock_i_ino(bt->parent): 0LU);
+
+ if (l->custom_seq_show) {
+ seq_putc(seq, ' ');
+ l->custom_seq_show(seq, v);
+ }
+
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations bt_seq_ops = {
+ .start = bt_seq_start,
+ .next = bt_seq_next,
+ .stop = bt_seq_stop,
+ .show = bt_seq_show,
+};
+
+static int bt_seq_open(struct inode *inode, struct file *file)
+{
+ struct bt_sock_list *sk_list;
+ struct bt_seq_state *s;
+
+ sk_list = PDE_DATA(inode);
+ s = __seq_open_private(file, &bt_seq_ops,
+ sizeof(struct bt_seq_state));
+ if (!s)
+ return -ENOMEM;
+
+ s->l = sk_list;
+ return 0;
+}
+
+static const struct file_operations bt_fops = {
+ .open = bt_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private
+};
+
+int bt_procfs_init(struct net *net, const char *name,
+ struct bt_sock_list* sk_list,
+ int (* seq_show)(struct seq_file *, void *))
+{
+ sk_list->custom_seq_show = seq_show;
+
+ if (!proc_create_data(name, 0, net->proc_net, &bt_fops, sk_list))
+ return -ENOMEM;
+ return 0;
+}
+
+void bt_procfs_cleanup(struct net *net, const char *name)
+{
+ remove_proc_entry(name, net->proc_net);
+}
+#else
+int bt_procfs_init(struct net *net, const char *name,
+ struct bt_sock_list* sk_list,
+ int (* seq_show)(struct seq_file *, void *))
+{
+ return 0;
+}
+
+void bt_procfs_cleanup(struct net *net, const char *name)
+{
+}
+#endif
+EXPORT_SYMBOL(bt_procfs_init);
+EXPORT_SYMBOL(bt_procfs_cleanup);
+
+static struct net_proto_family bt_sock_family_ops = {
+ .owner = THIS_MODULE,
+ .family = PF_BLUETOOTH,
+ .create = bt_sock_create,
+};
+
+struct dentry *bt_debugfs;
+EXPORT_SYMBOL_GPL(bt_debugfs);
+
+static int __init bt_init(void)
+{
+ int err;
+
+ sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
+
+ BT_INFO("Core ver %s", VERSION);
+
+ err = bt_selftest();
+ if (err < 0)
+ return err;
+
+ bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+
+ err = bt_sysfs_init();
+ if (err < 0)
+ return err;
+
+ err = sock_register(&bt_sock_family_ops);
+ if (err < 0) {
+ bt_sysfs_cleanup();
+ return err;
+ }
+
+ BT_INFO("HCI device and connection manager initialized");
+
+ err = hci_sock_init();
+ if (err < 0)
+ goto error;
+
+ err = l2cap_init();
+ if (err < 0)
+ goto sock_err;
+
+ err = sco_init();
+ if (err < 0) {
+ l2cap_exit();
+ goto sock_err;
+ }
+
+ err = mgmt_init();
+ if (err < 0) {
+ sco_exit();
+ l2cap_exit();
+ goto sock_err;
+ }
+
+ return 0;
+
+sock_err:
+ hci_sock_cleanup();
+
+error:
+ sock_unregister(PF_BLUETOOTH);
+ bt_sysfs_cleanup();
+
+ return err;
+}
+
+static void __exit bt_exit(void)
+{
+ mgmt_exit();
+
+ sco_exit();
+
+ l2cap_exit();
+
+ hci_sock_cleanup();
+
+ sock_unregister(PF_BLUETOOTH);
+
+ bt_sysfs_cleanup();
+
+ debugfs_remove_recursive(bt_debugfs);
+}
+
+subsys_initcall(bt_init);
+module_exit(bt_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
new file mode 100644
index 000000000..ee016f039
--- /dev/null
+++ b/net/bluetooth/amp.c
@@ -0,0 +1,474 @@
+/*
+ Copyright (c) 2011,2012 Intel Corp.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 and
+ only version 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*/
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_core.h>
+#include <crypto/hash.h>
+
+#include "a2mp.h"
+#include "amp.h"
+
+/* Remote AMP Controllers interface */
+void amp_ctrl_get(struct amp_ctrl *ctrl)
+{
+ BT_DBG("ctrl %p orig refcnt %d", ctrl,
+ atomic_read(&ctrl->kref.refcount));
+
+ kref_get(&ctrl->kref);
+}
+
+static void amp_ctrl_destroy(struct kref *kref)
+{
+ struct amp_ctrl *ctrl = container_of(kref, struct amp_ctrl, kref);
+
+ BT_DBG("ctrl %p", ctrl);
+
+ kfree(ctrl->assoc);
+ kfree(ctrl);
+}
+
+int amp_ctrl_put(struct amp_ctrl *ctrl)
+{
+ BT_DBG("ctrl %p orig refcnt %d", ctrl,
+ atomic_read(&ctrl->kref.refcount));
+
+ return kref_put(&ctrl->kref, &amp_ctrl_destroy);
+}
+
+struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id)
+{
+ struct amp_ctrl *ctrl;
+
+ ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+ if (!ctrl)
+ return NULL;
+
+ kref_init(&ctrl->kref);
+ ctrl->id = id;
+
+ mutex_lock(&mgr->amp_ctrls_lock);
+ list_add(&ctrl->list, &mgr->amp_ctrls);
+ mutex_unlock(&mgr->amp_ctrls_lock);
+
+ BT_DBG("mgr %p ctrl %p", mgr, ctrl);
+
+ return ctrl;
+}
+
+void amp_ctrl_list_flush(struct amp_mgr *mgr)
+{
+ struct amp_ctrl *ctrl, *n;
+
+ BT_DBG("mgr %p", mgr);
+
+ mutex_lock(&mgr->amp_ctrls_lock);
+ list_for_each_entry_safe(ctrl, n, &mgr->amp_ctrls, list) {
+ list_del(&ctrl->list);
+ amp_ctrl_put(ctrl);
+ }
+ mutex_unlock(&mgr->amp_ctrls_lock);
+}
+
+struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id)
+{
+ struct amp_ctrl *ctrl;
+
+ BT_DBG("mgr %p id %d", mgr, id);
+
+ mutex_lock(&mgr->amp_ctrls_lock);
+ list_for_each_entry(ctrl, &mgr->amp_ctrls, list) {
+ if (ctrl->id == id) {
+ amp_ctrl_get(ctrl);
+ mutex_unlock(&mgr->amp_ctrls_lock);
+ return ctrl;
+ }
+ }
+ mutex_unlock(&mgr->amp_ctrls_lock);
+
+ return NULL;
+}
+
+/* Physical Link interface */
+static u8 __next_handle(struct amp_mgr *mgr)
+{
+ if (++mgr->handle == 0)
+ mgr->handle = 1;
+
+ return mgr->handle;
+}
+
+struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
+ u8 remote_id, bool out)
+{
+ bdaddr_t *dst = &mgr->l2cap_conn->hcon->dst;
+ struct hci_conn *hcon;
+ u8 role = out ? HCI_ROLE_MASTER : HCI_ROLE_SLAVE;
+
+ hcon = hci_conn_add(hdev, AMP_LINK, dst, role);
+ if (!hcon)
+ return NULL;
+
+ BT_DBG("hcon %p dst %pMR", hcon, dst);
+
+ hcon->state = BT_CONNECT;
+ hcon->attempt++;
+ hcon->handle = __next_handle(mgr);
+ hcon->remote_id = remote_id;
+ hcon->amp_mgr = amp_mgr_get(mgr);
+
+ return hcon;
+}
+
+/* AMP crypto key generation interface */
+static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *shash;
+ int ret;
+
+ if (!ksize)
+ return -EINVAL;
+
+ tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+ if (IS_ERR(tfm)) {
+ BT_DBG("crypto_alloc_ahash failed: err %ld", PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+
+ ret = crypto_shash_setkey(tfm, key, ksize);
+ if (ret) {
+ BT_DBG("crypto_ahash_setkey failed: err %d", ret);
+ goto failed;
+ }
+
+ shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm),
+ GFP_KERNEL);
+ if (!shash) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ shash->tfm = tfm;
+ shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_digest(shash, plaintext, psize, output);
+
+ kfree(shash);
+
+failed:
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct link_key *key;
+ u8 keybuf[HCI_AMP_LINK_KEY_SIZE];
+ u8 gamp_key[HCI_AMP_LINK_KEY_SIZE];
+ int err;
+
+ if (!hci_conn_check_link_mode(conn))
+ return -EACCES;
+
+ BT_DBG("conn %p key_type %d", conn, conn->key_type);
+
+ /* Legacy key */
+ if (conn->key_type < 3) {
+ BT_ERR("Legacy key type %d", conn->key_type);
+ return -EACCES;
+ }
+
+ *type = conn->key_type;
+ *len = HCI_AMP_LINK_KEY_SIZE;
+
+ key = hci_find_link_key(hdev, &conn->dst);
+ if (!key) {
+ BT_DBG("No Link key for conn %p dst %pMR", conn, &conn->dst);
+ return -EACCES;
+ }
+
+ /* BR/EDR Link Key concatenated together with itself */
+ memcpy(&keybuf[0], key->val, HCI_LINK_KEY_SIZE);
+ memcpy(&keybuf[HCI_LINK_KEY_SIZE], key->val, HCI_LINK_KEY_SIZE);
+
+ /* Derive Generic AMP Link Key (gamp) */
+ err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key);
+ if (err) {
+ BT_ERR("Could not derive Generic AMP Key: err %d", err);
+ return err;
+ }
+
+ if (conn->key_type == HCI_LK_DEBUG_COMBINATION) {
+ BT_DBG("Use Generic AMP Key (gamp)");
+ memcpy(data, gamp_key, HCI_AMP_LINK_KEY_SIZE);
+ return err;
+ }
+
+ /* Derive Dedicated AMP Link Key: "802b" is 802.11 PAL keyID */
+ return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data);
+}
+
+void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
+{
+ struct hci_cp_read_local_amp_assoc cp;
+ struct amp_assoc *loc_assoc = &hdev->loc_assoc;
+
+ BT_DBG("%s handle %d", hdev->name, phy_handle);
+
+ cp.phy_handle = phy_handle;
+ cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
+ cp.len_so_far = cpu_to_le16(loc_assoc->offset);
+
+ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+}
+
+void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
+{
+ struct hci_cp_read_local_amp_assoc cp;
+
+ memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc));
+ memset(&cp, 0, sizeof(cp));
+
+ cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
+
+ set_bit(READ_LOC_AMP_ASSOC, &mgr->state);
+ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+}
+
+void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
+ struct hci_conn *hcon)
+{
+ struct hci_cp_read_local_amp_assoc cp;
+ struct amp_mgr *mgr = hcon->amp_mgr;
+
+ cp.phy_handle = hcon->handle;
+ cp.len_so_far = cpu_to_le16(0);
+ cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
+
+ set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state);
+
+ /* Read Local AMP Assoc final link information data */
+ hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
+}
+
+/* Write AMP Assoc data fragments, returns true with last fragment written*/
+static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
+ struct hci_conn *hcon)
+{
+ struct hci_cp_write_remote_amp_assoc *cp;
+ struct amp_mgr *mgr = hcon->amp_mgr;
+ struct amp_ctrl *ctrl;
+ u16 frag_len, len;
+
+ ctrl = amp_ctrl_lookup(mgr, hcon->remote_id);
+ if (!ctrl)
+ return false;
+
+ if (!ctrl->assoc_rem_len) {
+ BT_DBG("all fragments are written");
+ ctrl->assoc_rem_len = ctrl->assoc_len;
+ ctrl->assoc_len_so_far = 0;
+
+ amp_ctrl_put(ctrl);
+ return true;
+ }
+
+ frag_len = min_t(u16, 248, ctrl->assoc_rem_len);
+ len = frag_len + sizeof(*cp);
+
+ cp = kzalloc(len, GFP_KERNEL);
+ if (!cp) {
+ amp_ctrl_put(ctrl);
+ return false;
+ }
+
+ BT_DBG("hcon %p ctrl %p frag_len %u assoc_len %u rem_len %u",
+ hcon, ctrl, frag_len, ctrl->assoc_len, ctrl->assoc_rem_len);
+
+ cp->phy_handle = hcon->handle;
+ cp->len_so_far = cpu_to_le16(ctrl->assoc_len_so_far);
+ cp->rem_len = cpu_to_le16(ctrl->assoc_rem_len);
+ memcpy(cp->frag, ctrl->assoc, frag_len);
+
+ ctrl->assoc_len_so_far += frag_len;
+ ctrl->assoc_rem_len -= frag_len;
+
+ amp_ctrl_put(ctrl);
+
+ hci_send_cmd(hdev, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
+
+ kfree(cp);
+
+ return false;
+}
+
+void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle)
+{
+ struct hci_conn *hcon;
+
+ BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon)
+ return;
+
+ /* Send A2MP create phylink rsp when all fragments are written */
+ if (amp_write_rem_assoc_frag(hdev, hcon))
+ a2mp_send_create_phy_link_rsp(hdev, 0);
+}
+
+void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle)
+{
+ struct hci_conn *hcon;
+
+ BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon)
+ return;
+
+ BT_DBG("%s phy handle 0x%2.2x hcon %p", hdev->name, handle, hcon);
+
+ amp_write_rem_assoc_frag(hdev, hcon);
+}
+
+void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
+ struct hci_conn *hcon)
+{
+ struct hci_cp_create_phy_link cp;
+
+ cp.phy_handle = hcon->handle;
+
+ BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
+ hcon->handle);
+
+ if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
+ &cp.key_type)) {
+ BT_DBG("Cannot create link key");
+ return;
+ }
+
+ hci_send_cmd(hdev, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
+}
+
+void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
+ struct hci_conn *hcon)
+{
+ struct hci_cp_accept_phy_link cp;
+
+ cp.phy_handle = hcon->handle;
+
+ BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
+ hcon->handle);
+
+ if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
+ &cp.key_type)) {
+ BT_DBG("Cannot create link key");
+ return;
+ }
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
+}
+
+void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon)
+{
+ struct hci_dev *bredr_hdev = hci_dev_hold(bredr_hcon->hdev);
+ struct amp_mgr *mgr = hs_hcon->amp_mgr;
+ struct l2cap_chan *bredr_chan;
+
+ BT_DBG("bredr_hcon %p hs_hcon %p mgr %p", bredr_hcon, hs_hcon, mgr);
+
+ if (!bredr_hdev || !mgr || !mgr->bredr_chan)
+ return;
+
+ bredr_chan = mgr->bredr_chan;
+
+ l2cap_chan_lock(bredr_chan);
+
+ set_bit(FLAG_EFS_ENABLE, &bredr_chan->flags);
+ bredr_chan->remote_amp_id = hs_hcon->remote_id;
+ bredr_chan->local_amp_id = hs_hcon->hdev->id;
+ bredr_chan->hs_hcon = hs_hcon;
+ bredr_chan->conn->mtu = hs_hcon->hdev->block_mtu;
+
+ __l2cap_physical_cfm(bredr_chan, 0);
+
+ l2cap_chan_unlock(bredr_chan);
+
+ hci_dev_put(bredr_hdev);
+}
+
+void amp_create_logical_link(struct l2cap_chan *chan)
+{
+ struct hci_conn *hs_hcon = chan->hs_hcon;
+ struct hci_cp_create_accept_logical_link cp;
+ struct hci_dev *hdev;
+
+ BT_DBG("chan %p hs_hcon %p dst %pMR", chan, hs_hcon,
+ &chan->conn->hcon->dst);
+
+ if (!hs_hcon)
+ return;
+
+ hdev = hci_dev_hold(chan->hs_hcon->hdev);
+ if (!hdev)
+ return;
+
+ cp.phy_handle = hs_hcon->handle;
+
+ cp.tx_flow_spec.id = chan->local_id;
+ cp.tx_flow_spec.stype = chan->local_stype;
+ cp.tx_flow_spec.msdu = cpu_to_le16(chan->local_msdu);
+ cp.tx_flow_spec.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
+ cp.tx_flow_spec.acc_lat = cpu_to_le32(chan->local_acc_lat);
+ cp.tx_flow_spec.flush_to = cpu_to_le32(chan->local_flush_to);
+
+ cp.rx_flow_spec.id = chan->remote_id;
+ cp.rx_flow_spec.stype = chan->remote_stype;
+ cp.rx_flow_spec.msdu = cpu_to_le16(chan->remote_msdu);
+ cp.rx_flow_spec.sdu_itime = cpu_to_le32(chan->remote_sdu_itime);
+ cp.rx_flow_spec.acc_lat = cpu_to_le32(chan->remote_acc_lat);
+ cp.rx_flow_spec.flush_to = cpu_to_le32(chan->remote_flush_to);
+
+ if (hs_hcon->out)
+ hci_send_cmd(hdev, HCI_OP_CREATE_LOGICAL_LINK, sizeof(cp),
+ &cp);
+ else
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_LOGICAL_LINK, sizeof(cp),
+ &cp);
+
+ hci_dev_put(hdev);
+}
+
+void amp_disconnect_logical_link(struct hci_chan *hchan)
+{
+ struct hci_conn *hcon = hchan->conn;
+ struct hci_cp_disconn_logical_link cp;
+
+ if (hcon->state != BT_CONNECTED) {
+ BT_DBG("hchan %p not connected", hchan);
+ return;
+ }
+
+ cp.log_handle = cpu_to_le16(hchan->handle);
+ hci_send_cmd(hcon->hdev, HCI_OP_DISCONN_LOGICAL_LINK, sizeof(cp), &cp);
+}
+
+void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason)
+{
+ BT_DBG("hchan %p", hchan);
+
+ hci_chan_del(hchan);
+}
diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h
new file mode 100644
index 000000000..7ea3db77b
--- /dev/null
+++ b/net/bluetooth/amp.h
@@ -0,0 +1,54 @@
+/*
+ Copyright (c) 2011,2012 Intel Corp.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 and
+ only version 2 as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*/
+
+#ifndef __AMP_H
+#define __AMP_H
+
+struct amp_ctrl {
+ struct list_head list;
+ struct kref kref;
+ __u8 id;
+ __u16 assoc_len_so_far;
+ __u16 assoc_rem_len;
+ __u16 assoc_len;
+ __u8 *assoc;
+};
+
+int amp_ctrl_put(struct amp_ctrl *ctrl);
+void amp_ctrl_get(struct amp_ctrl *ctrl);
+struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id);
+struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id);
+void amp_ctrl_list_flush(struct amp_mgr *mgr);
+
+struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
+ u8 remote_id, bool out);
+
+int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type);
+
+void amp_read_loc_info(struct hci_dev *hdev, struct amp_mgr *mgr);
+void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle);
+void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr);
+void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
+ struct hci_conn *hcon);
+void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
+ struct hci_conn *hcon);
+void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
+ struct hci_conn *hcon);
+void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle);
+void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle);
+void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon);
+void amp_create_logical_link(struct l2cap_chan *chan);
+void amp_disconnect_logical_link(struct hci_chan *hchan);
+void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason);
+
+#endif /* __AMP_H */
diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig
new file mode 100644
index 000000000..9b70317c4
--- /dev/null
+++ b/net/bluetooth/bnep/Kconfig
@@ -0,0 +1,24 @@
+config BT_BNEP
+ tristate "BNEP protocol support"
+ depends on BT_BREDR
+ select CRC32
+ help
+ BNEP (Bluetooth Network Encapsulation Protocol) is Ethernet
+ emulation layer on top of Bluetooth. BNEP is required for
+ Bluetooth PAN (Personal Area Network).
+
+ Say Y here to compile BNEP support into the kernel or say M to
+ compile it as module (bnep).
+
+config BT_BNEP_MC_FILTER
+ bool "Multicast filter support"
+ depends on BT_BNEP
+ help
+ This option enables the multicast filter support for BNEP.
+
+config BT_BNEP_PROTO_FILTER
+ bool "Protocol filter support"
+ depends on BT_BNEP
+ help
+ This option enables the protocol filter support for BNEP.
+
diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile
new file mode 100644
index 000000000..c7821e76c
--- /dev/null
+++ b/net/bluetooth/bnep/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth BNEP layer.
+#
+
+obj-$(CONFIG_BT_BNEP) += bnep.o
+
+bnep-objs := core.o sock.o netdev.o
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
new file mode 100644
index 000000000..40854c99b
--- /dev/null
+++ b/net/bluetooth/bnep/bnep.h
@@ -0,0 +1,183 @@
+/*
+ BNEP protocol definition for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License, version 2, as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _BNEP_H
+#define _BNEP_H
+
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <net/bluetooth/bluetooth.h>
+
+/* Limits */
+#define BNEP_MAX_PROTO_FILTERS 5
+#define BNEP_MAX_MULTICAST_FILTERS 20
+
+/* UUIDs */
+#define BNEP_BASE_UUID 0x0000000000001000800000805F9B34FB
+#define BNEP_UUID16 0x02
+#define BNEP_UUID32 0x04
+#define BNEP_UUID128 0x16
+
+#define BNEP_SVC_PANU 0x1115
+#define BNEP_SVC_NAP 0x1116
+#define BNEP_SVC_GN 0x1117
+
+/* Packet types */
+#define BNEP_GENERAL 0x00
+#define BNEP_CONTROL 0x01
+#define BNEP_COMPRESSED 0x02
+#define BNEP_COMPRESSED_SRC_ONLY 0x03
+#define BNEP_COMPRESSED_DST_ONLY 0x04
+
+/* Control types */
+#define BNEP_CMD_NOT_UNDERSTOOD 0x00
+#define BNEP_SETUP_CONN_REQ 0x01
+#define BNEP_SETUP_CONN_RSP 0x02
+#define BNEP_FILTER_NET_TYPE_SET 0x03
+#define BNEP_FILTER_NET_TYPE_RSP 0x04
+#define BNEP_FILTER_MULTI_ADDR_SET 0x05
+#define BNEP_FILTER_MULTI_ADDR_RSP 0x06
+
+/* Extension types */
+#define BNEP_EXT_CONTROL 0x00
+
+/* Response messages */
+#define BNEP_SUCCESS 0x00
+
+#define BNEP_CONN_INVALID_DST 0x01
+#define BNEP_CONN_INVALID_SRC 0x02
+#define BNEP_CONN_INVALID_SVC 0x03
+#define BNEP_CONN_NOT_ALLOWED 0x04
+
+#define BNEP_FILTER_UNSUPPORTED_REQ 0x01
+#define BNEP_FILTER_INVALID_RANGE 0x02
+#define BNEP_FILTER_INVALID_MCADDR 0x02
+#define BNEP_FILTER_LIMIT_REACHED 0x03
+#define BNEP_FILTER_DENIED_SECURITY 0x04
+
+/* L2CAP settings */
+#define BNEP_MTU 1691
+#define BNEP_PSM 0x0f
+#define BNEP_FLUSH_TO 0xffff
+#define BNEP_CONNECT_TO 15
+#define BNEP_FILTER_TO 15
+
+/* Headers */
+#define BNEP_TYPE_MASK 0x7f
+#define BNEP_EXT_HEADER 0x80
+
+struct bnep_setup_conn_req {
+ __u8 type;
+ __u8 ctrl;
+ __u8 uuid_size;
+ __u8 service[0];
+} __packed;
+
+struct bnep_set_filter_req {
+ __u8 type;
+ __u8 ctrl;
+ __be16 len;
+ __u8 list[0];
+} __packed;
+
+struct bnep_control_rsp {
+ __u8 type;
+ __u8 ctrl;
+ __be16 resp;
+} __packed;
+
+struct bnep_ext_hdr {
+ __u8 type;
+ __u8 len;
+ __u8 data[0];
+} __packed;
+
+/* BNEP ioctl defines */
+#define BNEPCONNADD _IOW('B', 200, int)
+#define BNEPCONNDEL _IOW('B', 201, int)
+#define BNEPGETCONNLIST _IOR('B', 210, int)
+#define BNEPGETCONNINFO _IOR('B', 211, int)
+#define BNEPGETSUPPFEAT _IOR('B', 212, int)
+
+#define BNEP_SETUP_RESPONSE 0
+#define BNEP_SETUP_RSP_SENT 10
+
+struct bnep_connadd_req {
+ int sock; /* Connected socket */
+ __u32 flags;
+ __u16 role;
+ char device[16]; /* Name of the Ethernet device */
+};
+
+struct bnep_conndel_req {
+ __u32 flags;
+ __u8 dst[ETH_ALEN];
+};
+
+struct bnep_conninfo {
+ __u32 flags;
+ __u16 role;
+ __u16 state;
+ __u8 dst[ETH_ALEN];
+ char device[16];
+};
+
+struct bnep_connlist_req {
+ __u32 cnum;
+ struct bnep_conninfo __user *ci;
+};
+
+struct bnep_proto_filter {
+ __u16 start;
+ __u16 end;
+};
+
+int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock);
+int bnep_del_connection(struct bnep_conndel_req *req);
+int bnep_get_connlist(struct bnep_connlist_req *req);
+int bnep_get_conninfo(struct bnep_conninfo *ci);
+
+/* BNEP sessions */
+struct bnep_session {
+ struct list_head list;
+
+ unsigned int role;
+ unsigned long state;
+ unsigned long flags;
+ atomic_t terminate;
+ struct task_struct *task;
+
+ struct ethhdr eh;
+ struct msghdr msg;
+
+ struct bnep_proto_filter proto_filter[BNEP_MAX_PROTO_FILTERS];
+ unsigned long long mc_filter;
+
+ struct socket *sock;
+ struct net_device *dev;
+};
+
+void bnep_net_setup(struct net_device *dev);
+int bnep_sock_init(void);
+void bnep_sock_cleanup(void);
+
+static inline int bnep_mc_hash(__u8 *addr)
+{
+ return crc32_be(~0, addr, ETH_ALEN) >> 26;
+}
+
+#endif
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
new file mode 100644
index 000000000..1641367e5
--- /dev/null
+++ b/net/bluetooth/bnep/core.c
@@ -0,0 +1,769 @@
+/*
+ BNEP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2001-2002 Inventel Systemes
+ Written 2001-2002 by
+ Clément Moreau <clement.moreau@inventel.fr>
+ David Libault <david.libault@inventel.fr>
+
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/file.h>
+#include <linux/etherdevice.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "bnep.h"
+
+#define VERSION "1.3"
+
+static bool compress_src = true;
+static bool compress_dst = true;
+
+static LIST_HEAD(bnep_session_list);
+static DECLARE_RWSEM(bnep_session_sem);
+
+static struct bnep_session *__bnep_get_session(u8 *dst)
+{
+ struct bnep_session *s;
+
+ BT_DBG("");
+
+ list_for_each_entry(s, &bnep_session_list, list)
+ if (ether_addr_equal(dst, s->eh.h_source))
+ return s;
+
+ return NULL;
+}
+
+static void __bnep_link_session(struct bnep_session *s)
+{
+ list_add(&s->list, &bnep_session_list);
+}
+
+static void __bnep_unlink_session(struct bnep_session *s)
+{
+ list_del(&s->list);
+}
+
+static int bnep_send(struct bnep_session *s, void *data, size_t len)
+{
+ struct socket *sock = s->sock;
+ struct kvec iv = { data, len };
+
+ return kernel_sendmsg(sock, &s->msg, &iv, 1, len);
+}
+
+static int bnep_send_rsp(struct bnep_session *s, u8 ctrl, u16 resp)
+{
+ struct bnep_control_rsp rsp;
+ rsp.type = BNEP_CONTROL;
+ rsp.ctrl = ctrl;
+ rsp.resp = htons(resp);
+ return bnep_send(s, &rsp, sizeof(rsp));
+}
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+static inline void bnep_set_default_proto_filter(struct bnep_session *s)
+{
+ /* (IPv4, ARP) */
+ s->proto_filter[0].start = ETH_P_IP;
+ s->proto_filter[0].end = ETH_P_ARP;
+ /* (RARP, AppleTalk) */
+ s->proto_filter[1].start = ETH_P_RARP;
+ s->proto_filter[1].end = ETH_P_AARP;
+ /* (IPX, IPv6) */
+ s->proto_filter[2].start = ETH_P_IPX;
+ s->proto_filter[2].end = ETH_P_IPV6;
+}
+#endif
+
+static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len)
+{
+ int n;
+
+ if (len < 2)
+ return -EILSEQ;
+
+ n = get_unaligned_be16(data);
+ data++;
+ len -= 2;
+
+ if (len < n)
+ return -EILSEQ;
+
+ BT_DBG("filter len %d", n);
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+ n /= 4;
+ if (n <= BNEP_MAX_PROTO_FILTERS) {
+ struct bnep_proto_filter *f = s->proto_filter;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ f[i].start = get_unaligned_be16(data++);
+ f[i].end = get_unaligned_be16(data++);
+
+ BT_DBG("proto filter start %d end %d",
+ f[i].start, f[i].end);
+ }
+
+ if (i < BNEP_MAX_PROTO_FILTERS)
+ memset(f + i, 0, sizeof(*f));
+
+ if (n == 0)
+ bnep_set_default_proto_filter(s);
+
+ bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_SUCCESS);
+ } else {
+ bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_LIMIT_REACHED);
+ }
+#else
+ bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_UNSUPPORTED_REQ);
+#endif
+ return 0;
+}
+
+static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
+{
+ int n;
+
+ if (len < 2)
+ return -EILSEQ;
+
+ n = get_unaligned_be16(data);
+ data += 2;
+ len -= 2;
+
+ if (len < n)
+ return -EILSEQ;
+
+ BT_DBG("filter len %d", n);
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+ n /= (ETH_ALEN * 2);
+
+ if (n > 0) {
+ int i;
+
+ s->mc_filter = 0;
+
+ /* Always send broadcast */
+ set_bit(bnep_mc_hash(s->dev->broadcast), (ulong *) &s->mc_filter);
+
+ /* Add address ranges to the multicast hash */
+ for (; n > 0; n--) {
+ u8 a1[6], *a2;
+
+ memcpy(a1, data, ETH_ALEN);
+ data += ETH_ALEN;
+ a2 = data;
+ data += ETH_ALEN;
+
+ BT_DBG("mc filter %pMR -> %pMR", a1, a2);
+
+ /* Iterate from a1 to a2 */
+ set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
+ while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) {
+ /* Increment a1 */
+ i = 5;
+ while (i >= 0 && ++a1[i--] == 0)
+ ;
+
+ set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
+ }
+ }
+ }
+
+ BT_DBG("mc filter hash 0x%llx", s->mc_filter);
+
+ bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_SUCCESS);
+#else
+ bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_FILTER_UNSUPPORTED_REQ);
+#endif
+ return 0;
+}
+
+static int bnep_rx_control(struct bnep_session *s, void *data, int len)
+{
+ u8 cmd = *(u8 *)data;
+ int err = 0;
+
+ data++;
+ len--;
+
+ switch (cmd) {
+ case BNEP_CMD_NOT_UNDERSTOOD:
+ case BNEP_SETUP_CONN_RSP:
+ case BNEP_FILTER_NET_TYPE_RSP:
+ case BNEP_FILTER_MULTI_ADDR_RSP:
+ /* Ignore these for now */
+ break;
+
+ case BNEP_FILTER_NET_TYPE_SET:
+ err = bnep_ctrl_set_netfilter(s, data, len);
+ break;
+
+ case BNEP_FILTER_MULTI_ADDR_SET:
+ err = bnep_ctrl_set_mcfilter(s, data, len);
+ break;
+
+ case BNEP_SETUP_CONN_REQ:
+ /* Successful response should be sent only once */
+ if (test_bit(BNEP_SETUP_RESPONSE, &s->flags) &&
+ !test_and_set_bit(BNEP_SETUP_RSP_SENT, &s->flags))
+ err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP,
+ BNEP_SUCCESS);
+ else
+ err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP,
+ BNEP_CONN_NOT_ALLOWED);
+ break;
+
+ default: {
+ u8 pkt[3];
+ pkt[0] = BNEP_CONTROL;
+ pkt[1] = BNEP_CMD_NOT_UNDERSTOOD;
+ pkt[2] = cmd;
+ err = bnep_send(s, pkt, sizeof(pkt));
+ }
+ break;
+ }
+
+ return err;
+}
+
+static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
+{
+ struct bnep_ext_hdr *h;
+ int err = 0;
+
+ do {
+ h = (void *) skb->data;
+ if (!skb_pull(skb, sizeof(*h))) {
+ err = -EILSEQ;
+ break;
+ }
+
+ BT_DBG("type 0x%x len %d", h->type, h->len);
+
+ switch (h->type & BNEP_TYPE_MASK) {
+ case BNEP_EXT_CONTROL:
+ bnep_rx_control(s, skb->data, skb->len);
+ break;
+
+ default:
+ /* Unknown extension, skip it. */
+ break;
+ }
+
+ if (!skb_pull(skb, h->len)) {
+ err = -EILSEQ;
+ break;
+ }
+ } while (!err && (h->type & BNEP_EXT_HEADER));
+
+ return err;
+}
+
+static u8 __bnep_rx_hlen[] = {
+ ETH_HLEN, /* BNEP_GENERAL */
+ 0, /* BNEP_CONTROL */
+ 2, /* BNEP_COMPRESSED */
+ ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */
+ ETH_ALEN + 2 /* BNEP_COMPRESSED_DST_ONLY */
+};
+
+static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
+{
+ struct net_device *dev = s->dev;
+ struct sk_buff *nskb;
+ u8 type, ctrl_type;
+
+ dev->stats.rx_bytes += skb->len;
+
+ type = *(u8 *) skb->data;
+ skb_pull(skb, 1);
+ ctrl_type = *(u8 *)skb->data;
+
+ if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen))
+ goto badframe;
+
+ if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) {
+ if (bnep_rx_control(s, skb->data, skb->len) < 0) {
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (!(type & BNEP_EXT_HEADER)) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ /* Verify and pull ctrl message since it's already processed */
+ switch (ctrl_type) {
+ case BNEP_SETUP_CONN_REQ:
+ /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */
+ if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2))
+ goto badframe;
+ break;
+ case BNEP_FILTER_MULTI_ADDR_SET:
+ case BNEP_FILTER_NET_TYPE_SET:
+ /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */
+ if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2))
+ goto badframe;
+ break;
+ default:
+ kfree_skb(skb);
+ return 0;
+ }
+ } else {
+ skb_reset_mac_header(skb);
+
+ /* Verify and pull out header */
+ if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK]))
+ goto badframe;
+
+ s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+ }
+
+ if (type & BNEP_EXT_HEADER) {
+ if (bnep_rx_extension(s, skb) < 0)
+ goto badframe;
+ }
+
+ /* Strip 802.1p header */
+ if (ntohs(s->eh.h_proto) == ETH_P_8021Q) {
+ if (!skb_pull(skb, 4))
+ goto badframe;
+ s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+ }
+
+ /* We have to alloc new skb and copy data here :(. Because original skb
+ * may not be modified and because of the alignment requirements. */
+ nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL);
+ if (!nskb) {
+ dev->stats.rx_dropped++;
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ skb_reserve(nskb, 2);
+
+ /* Decompress header and construct ether frame */
+ switch (type & BNEP_TYPE_MASK) {
+ case BNEP_COMPRESSED:
+ memcpy(__skb_put(nskb, ETH_HLEN), &s->eh, ETH_HLEN);
+ break;
+
+ case BNEP_COMPRESSED_SRC_ONLY:
+ memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN);
+ memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), ETH_ALEN);
+ put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
+ break;
+
+ case BNEP_COMPRESSED_DST_ONLY:
+ memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb),
+ ETH_ALEN);
+ memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source,
+ ETH_ALEN + 2);
+ break;
+
+ case BNEP_GENERAL:
+ memcpy(__skb_put(nskb, ETH_ALEN * 2), skb_mac_header(skb),
+ ETH_ALEN * 2);
+ put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
+ break;
+ }
+
+ skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len);
+ kfree_skb(skb);
+
+ dev->stats.rx_packets++;
+ nskb->ip_summed = CHECKSUM_NONE;
+ nskb->protocol = eth_type_trans(nskb, dev);
+ netif_rx_ni(nskb);
+ return 0;
+
+badframe:
+ dev->stats.rx_errors++;
+ kfree_skb(skb);
+ return 0;
+}
+
+static u8 __bnep_tx_types[] = {
+ BNEP_GENERAL,
+ BNEP_COMPRESSED_SRC_ONLY,
+ BNEP_COMPRESSED_DST_ONLY,
+ BNEP_COMPRESSED
+};
+
+static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
+{
+ struct ethhdr *eh = (void *) skb->data;
+ struct socket *sock = s->sock;
+ struct kvec iv[3];
+ int len = 0, il = 0;
+ u8 type = 0;
+
+ BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type);
+
+ if (!skb->dev) {
+ /* Control frame sent by us */
+ goto send;
+ }
+
+ iv[il++] = (struct kvec) { &type, 1 };
+ len++;
+
+ if (compress_src && ether_addr_equal(eh->h_dest, s->eh.h_source))
+ type |= 0x01;
+
+ if (compress_dst && ether_addr_equal(eh->h_source, s->eh.h_dest))
+ type |= 0x02;
+
+ if (type)
+ skb_pull(skb, ETH_ALEN * 2);
+
+ type = __bnep_tx_types[type];
+ switch (type) {
+ case BNEP_COMPRESSED_SRC_ONLY:
+ iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN };
+ len += ETH_ALEN;
+ break;
+
+ case BNEP_COMPRESSED_DST_ONLY:
+ iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN };
+ len += ETH_ALEN;
+ break;
+ }
+
+send:
+ iv[il++] = (struct kvec) { skb->data, skb->len };
+ len += skb->len;
+
+ /* FIXME: linearize skb */
+ {
+ len = kernel_sendmsg(sock, &s->msg, iv, il, len);
+ }
+ kfree_skb(skb);
+
+ if (len > 0) {
+ s->dev->stats.tx_bytes += len;
+ s->dev->stats.tx_packets++;
+ return 0;
+ }
+
+ return len;
+}
+
+static int bnep_session(void *arg)
+{
+ struct bnep_session *s = arg;
+ struct net_device *dev = s->dev;
+ struct sock *sk = s->sock->sk;
+ struct sk_buff *skb;
+ wait_queue_t wait;
+
+ BT_DBG("");
+
+ set_user_nice(current, -15);
+
+ init_waitqueue_entry(&wait, current);
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (atomic_read(&s->terminate))
+ break;
+ /* RX */
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ bnep_rx_frame(s, skb);
+ else
+ kfree_skb(skb);
+ }
+
+ if (sk->sk_state != BT_CONNECTED)
+ break;
+
+ /* TX */
+ while ((skb = skb_dequeue(&sk->sk_write_queue)))
+ if (bnep_tx_frame(s, skb))
+ break;
+ netif_wake_queue(dev);
+
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ /* Cleanup session */
+ down_write(&bnep_session_sem);
+
+ /* Delete network device */
+ unregister_netdev(dev);
+
+ /* Wakeup user-space polling for socket errors */
+ s->sock->sk->sk_err = EUNATCH;
+
+ wake_up_interruptible(sk_sleep(s->sock->sk));
+
+ /* Release the socket */
+ fput(s->sock->file);
+
+ __bnep_unlink_session(s);
+
+ up_write(&bnep_session_sem);
+ free_netdev(dev);
+ module_put_and_exit(0);
+ return 0;
+}
+
+static struct device *bnep_get_device(struct bnep_session *session)
+{
+ struct l2cap_conn *conn = l2cap_pi(session->sock->sk)->chan->conn;
+
+ if (!conn || !conn->hcon)
+ return NULL;
+
+ return &conn->hcon->dev;
+}
+
+static struct device_type bnep_type = {
+ .name = "bluetooth",
+};
+
+int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
+{
+ u32 valid_flags = BIT(BNEP_SETUP_RESPONSE);
+ struct net_device *dev;
+ struct bnep_session *s, *ss;
+ u8 dst[ETH_ALEN], src[ETH_ALEN];
+ int err;
+
+ BT_DBG("");
+
+ if (!l2cap_is_socket(sock))
+ return -EBADFD;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ baswap((void *) dst, &l2cap_pi(sock->sk)->chan->dst);
+ baswap((void *) src, &l2cap_pi(sock->sk)->chan->src);
+
+ /* session struct allocated as private part of net_device */
+ dev = alloc_netdev(sizeof(struct bnep_session),
+ (*req->device) ? req->device : "bnep%d",
+ NET_NAME_UNKNOWN,
+ bnep_net_setup);
+ if (!dev)
+ return -ENOMEM;
+
+ down_write(&bnep_session_sem);
+
+ ss = __bnep_get_session(dst);
+ if (ss && ss->state == BT_CONNECTED) {
+ err = -EEXIST;
+ goto failed;
+ }
+
+ s = netdev_priv(dev);
+
+ /* This is rx header therefore addresses are swapped.
+ * ie. eh.h_dest is our local address. */
+ memcpy(s->eh.h_dest, &src, ETH_ALEN);
+ memcpy(s->eh.h_source, &dst, ETH_ALEN);
+ memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN);
+
+ s->dev = dev;
+ s->sock = sock;
+ s->role = req->role;
+ s->state = BT_CONNECTED;
+ s->flags = req->flags;
+
+ s->msg.msg_flags = MSG_NOSIGNAL;
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+ /* Set default mc filter */
+ set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter);
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+ /* Set default protocol filter */
+ bnep_set_default_proto_filter(s);
+#endif
+
+ SET_NETDEV_DEV(dev, bnep_get_device(s));
+ SET_NETDEV_DEVTYPE(dev, &bnep_type);
+
+ err = register_netdev(dev);
+ if (err)
+ goto failed;
+
+ __bnep_link_session(s);
+
+ __module_get(THIS_MODULE);
+ s->task = kthread_run(bnep_session, s, "kbnepd %s", dev->name);
+ if (IS_ERR(s->task)) {
+ /* Session thread start failed, gotta cleanup. */
+ module_put(THIS_MODULE);
+ unregister_netdev(dev);
+ __bnep_unlink_session(s);
+ err = PTR_ERR(s->task);
+ goto failed;
+ }
+
+ up_write(&bnep_session_sem);
+ strcpy(req->device, dev->name);
+ return 0;
+
+failed:
+ up_write(&bnep_session_sem);
+ free_netdev(dev);
+ return err;
+}
+
+int bnep_del_connection(struct bnep_conndel_req *req)
+{
+ u32 valid_flags = 0;
+ struct bnep_session *s;
+ int err = 0;
+
+ BT_DBG("");
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ down_read(&bnep_session_sem);
+
+ s = __bnep_get_session(req->dst);
+ if (s) {
+ atomic_inc(&s->terminate);
+ wake_up_process(s->task);
+ } else
+ err = -ENOENT;
+
+ up_read(&bnep_session_sem);
+ return err;
+}
+
+static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s)
+{
+ u32 valid_flags = BIT(BNEP_SETUP_RESPONSE);
+
+ memset(ci, 0, sizeof(*ci));
+ memcpy(ci->dst, s->eh.h_source, ETH_ALEN);
+ strcpy(ci->device, s->dev->name);
+ ci->flags = s->flags & valid_flags;
+ ci->state = s->state;
+ ci->role = s->role;
+}
+
+int bnep_get_connlist(struct bnep_connlist_req *req)
+{
+ struct bnep_session *s;
+ int err = 0, n = 0;
+
+ down_read(&bnep_session_sem);
+
+ list_for_each_entry(s, &bnep_session_list, list) {
+ struct bnep_conninfo ci;
+
+ __bnep_copy_ci(&ci, s);
+
+ if (copy_to_user(req->ci, &ci, sizeof(ci))) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (++n >= req->cnum)
+ break;
+
+ req->ci++;
+ }
+ req->cnum = n;
+
+ up_read(&bnep_session_sem);
+ return err;
+}
+
+int bnep_get_conninfo(struct bnep_conninfo *ci)
+{
+ struct bnep_session *s;
+ int err = 0;
+
+ down_read(&bnep_session_sem);
+
+ s = __bnep_get_session(ci->dst);
+ if (s)
+ __bnep_copy_ci(ci, s);
+ else
+ err = -ENOENT;
+
+ up_read(&bnep_session_sem);
+ return err;
+}
+
+static int __init bnep_init(void)
+{
+ char flt[50] = "";
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+ strcat(flt, "protocol ");
+#endif
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+ strcat(flt, "multicast");
+#endif
+
+ BT_INFO("BNEP (Ethernet Emulation) ver %s", VERSION);
+ if (flt[0])
+ BT_INFO("BNEP filters: %s", flt);
+
+ bnep_sock_init();
+ return 0;
+}
+
+static void __exit bnep_exit(void)
+{
+ bnep_sock_cleanup();
+}
+
+module_init(bnep_init);
+module_exit(bnep_exit);
+
+module_param(compress_src, bool, 0644);
+MODULE_PARM_DESC(compress_src, "Compress sources headers");
+
+module_param(compress_dst, bool, 0644);
+MODULE_PARM_DESC(compress_dst, "Compress destination headers");
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-4");
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
new file mode 100644
index 000000000..6ceb5d36a
--- /dev/null
+++ b/net/bluetooth/bnep/netdev.c
@@ -0,0 +1,229 @@
+/*
+ BNEP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2001-2002 Inventel Systemes
+ Written 2001-2002 by
+ Clément Moreau <clement.moreau@inventel.fr>
+ David Libault <david.libault@inventel.fr>
+
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/etherdevice.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "bnep.h"
+
+#define BNEP_TX_QUEUE_LEN 20
+
+static int bnep_net_open(struct net_device *dev)
+{
+ netif_start_queue(dev);
+ return 0;
+}
+
+static int bnep_net_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static void bnep_net_set_mc_list(struct net_device *dev)
+{
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+ struct bnep_session *s = netdev_priv(dev);
+ struct sock *sk = s->sock->sk;
+ struct bnep_set_filter_req *r;
+ struct sk_buff *skb;
+ int size;
+
+ BT_DBG("%s mc_count %d", dev->name, netdev_mc_count(dev));
+
+ size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2;
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb) {
+ BT_ERR("%s Multicast list allocation failed", dev->name);
+ return;
+ }
+
+ r = (void *) skb->data;
+ __skb_put(skb, sizeof(*r));
+
+ r->type = BNEP_CONTROL;
+ r->ctrl = BNEP_FILTER_MULTI_ADDR_SET;
+
+ if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
+ u8 start[ETH_ALEN] = { 0x01 };
+
+ /* Request all addresses */
+ memcpy(__skb_put(skb, ETH_ALEN), start, ETH_ALEN);
+ memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+ r->len = htons(ETH_ALEN * 2);
+ } else {
+ struct netdev_hw_addr *ha;
+ int i, len = skb->len;
+
+ if (dev->flags & IFF_BROADCAST) {
+ memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+ memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+ }
+
+ /* FIXME: We should group addresses here. */
+
+ i = 0;
+ netdev_for_each_mc_addr(ha, dev) {
+ if (i == BNEP_MAX_MULTICAST_FILTERS)
+ break;
+ memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN);
+ memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN);
+
+ i++;
+ }
+ r->len = htons(skb->len - len);
+ }
+
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ wake_up_interruptible(sk_sleep(sk));
+#endif
+}
+
+static int bnep_net_set_mac_addr(struct net_device *dev, void *arg)
+{
+ BT_DBG("%s", dev->name);
+ return 0;
+}
+
+static void bnep_net_timeout(struct net_device *dev)
+{
+ BT_DBG("net_timeout");
+ netif_wake_queue(dev);
+}
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+static int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s)
+{
+ struct ethhdr *eh = (void *) skb->data;
+
+ if ((eh->h_dest[0] & 1) && !test_bit(bnep_mc_hash(eh->h_dest), (ulong *) &s->mc_filter))
+ return 1;
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+/* Determine ether protocol. Based on eth_type_trans. */
+static u16 bnep_net_eth_proto(struct sk_buff *skb)
+{
+ struct ethhdr *eh = (void *) skb->data;
+ u16 proto = ntohs(eh->h_proto);
+
+ if (proto >= ETH_P_802_3_MIN)
+ return proto;
+
+ if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF))
+ return ETH_P_802_3;
+
+ return ETH_P_802_2;
+}
+
+static int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s)
+{
+ u16 proto = bnep_net_eth_proto(skb);
+ struct bnep_proto_filter *f = s->proto_filter;
+ int i;
+
+ for (i = 0; i < BNEP_MAX_PROTO_FILTERS && f[i].end; i++) {
+ if (proto >= f[i].start && proto <= f[i].end)
+ return 0;
+ }
+
+ BT_DBG("BNEP: filtered skb %p, proto 0x%.4x", skb, proto);
+ return 1;
+}
+#endif
+
+static netdev_tx_t bnep_net_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct bnep_session *s = netdev_priv(dev);
+ struct sock *sk = s->sock->sk;
+
+ BT_DBG("skb %p, dev %p", skb, dev);
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+ if (bnep_net_mc_filter(skb, s)) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+ if (bnep_net_proto_filter(skb, s)) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+#endif
+
+ /*
+ * We cannot send L2CAP packets from here as we are potentially in a bh.
+ * So we have to queue them and wake up session thread which is sleeping
+ * on the sk_sleep(sk).
+ */
+ dev->trans_start = jiffies;
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ wake_up_interruptible(sk_sleep(sk));
+
+ if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) {
+ BT_DBG("tx queue is full");
+
+ /* Stop queuing.
+ * Session thread will do netif_wake_queue() */
+ netif_stop_queue(dev);
+ }
+
+ return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops bnep_netdev_ops = {
+ .ndo_open = bnep_net_open,
+ .ndo_stop = bnep_net_close,
+ .ndo_start_xmit = bnep_net_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_rx_mode = bnep_net_set_mc_list,
+ .ndo_set_mac_address = bnep_net_set_mac_addr,
+ .ndo_tx_timeout = bnep_net_timeout,
+ .ndo_change_mtu = eth_change_mtu,
+
+};
+
+void bnep_net_setup(struct net_device *dev)
+{
+
+ eth_broadcast_addr(dev->broadcast);
+ dev->addr_len = ETH_ALEN;
+
+ ether_setup(dev);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->netdev_ops = &bnep_netdev_ops;
+
+ dev->watchdog_timeo = HZ * 2;
+}
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
new file mode 100644
index 000000000..bde2bdd9e
--- /dev/null
+++ b/net/bluetooth/bnep/sock.c
@@ -0,0 +1,265 @@
+/*
+ BNEP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2001-2002 Inventel Systemes
+ Written 2001-2002 by
+ David Libault <david.libault@inventel.fr>
+
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/export.h>
+#include <linux/file.h>
+
+#include "bnep.h"
+
+static struct bt_sock_list bnep_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(bnep_sk_list.lock)
+};
+
+static int bnep_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ bt_sock_unlink(&bnep_sk_list, sk);
+
+ sock_orphan(sk);
+ sock_put(sk);
+ return 0;
+}
+
+static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct bnep_connlist_req cl;
+ struct bnep_connadd_req ca;
+ struct bnep_conndel_req cd;
+ struct bnep_conninfo ci;
+ struct socket *nsock;
+ void __user *argp = (void __user *)arg;
+ __u32 supp_feat = BIT(BNEP_SETUP_RESPONSE);
+ int err;
+
+ BT_DBG("cmd %x arg %lx", cmd, arg);
+
+ switch (cmd) {
+ case BNEPCONNADD:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ca, argp, sizeof(ca)))
+ return -EFAULT;
+
+ nsock = sockfd_lookup(ca.sock, &err);
+ if (!nsock)
+ return err;
+
+ if (nsock->sk->sk_state != BT_CONNECTED) {
+ sockfd_put(nsock);
+ return -EBADFD;
+ }
+ ca.device[sizeof(ca.device)-1] = 0;
+
+ err = bnep_add_connection(&ca, nsock);
+ if (!err) {
+ if (copy_to_user(argp, &ca, sizeof(ca)))
+ err = -EFAULT;
+ } else
+ sockfd_put(nsock);
+
+ return err;
+
+ case BNEPCONNDEL:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&cd, argp, sizeof(cd)))
+ return -EFAULT;
+
+ return bnep_del_connection(&cd);
+
+ case BNEPGETCONNLIST:
+ if (copy_from_user(&cl, argp, sizeof(cl)))
+ return -EFAULT;
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = bnep_get_connlist(&cl);
+ if (!err && copy_to_user(argp, &cl, sizeof(cl)))
+ return -EFAULT;
+
+ return err;
+
+ case BNEPGETCONNINFO:
+ if (copy_from_user(&ci, argp, sizeof(ci)))
+ return -EFAULT;
+
+ err = bnep_get_conninfo(&ci);
+ if (!err && copy_to_user(argp, &ci, sizeof(ci)))
+ return -EFAULT;
+
+ return err;
+
+ case BNEPGETSUPPFEAT:
+ if (copy_to_user(argp, &supp_feat, sizeof(supp_feat)))
+ return -EFAULT;
+
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ if (cmd == BNEPGETCONNLIST) {
+ struct bnep_connlist_req cl;
+ u32 uci;
+ int err;
+
+ if (get_user(cl.cnum, (u32 __user *) arg) ||
+ get_user(uci, (u32 __user *) (arg + 4)))
+ return -EFAULT;
+
+ cl.ci = compat_ptr(uci);
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = bnep_get_connlist(&cl);
+
+ if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ err = -EFAULT;
+
+ return err;
+ }
+
+ return bnep_sock_ioctl(sock, cmd, arg);
+}
+#endif
+
+static const struct proto_ops bnep_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = bnep_sock_release,
+ .ioctl = bnep_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bnep_sock_compat_ioctl,
+#endif
+ .bind = sock_no_bind,
+ .getname = sock_no_getname,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .poll = sock_no_poll,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .mmap = sock_no_mmap
+};
+
+static struct proto bnep_proto = {
+ .name = "BNEP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct bt_sock)
+};
+
+static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+
+ sock->ops = &bnep_sock_ops;
+
+ sock->state = SS_UNCONNECTED;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = protocol;
+ sk->sk_state = BT_OPEN;
+
+ bt_sock_link(&bnep_sk_list, sk);
+ return 0;
+}
+
+static const struct net_proto_family bnep_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = bnep_sock_create
+};
+
+int __init bnep_sock_init(void)
+{
+ int err;
+
+ err = proto_register(&bnep_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("Can't register BNEP socket");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "bnep", &bnep_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create BNEP proc file");
+ bt_sock_unregister(BTPROTO_BNEP);
+ goto error;
+ }
+
+ BT_INFO("BNEP socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&bnep_proto);
+ return err;
+}
+
+void __exit bnep_sock_cleanup(void)
+{
+ bt_procfs_cleanup(&init_net, "bnep");
+ bt_sock_unregister(BTPROTO_BNEP);
+ proto_unregister(&bnep_proto);
+}
diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig
new file mode 100644
index 000000000..939da0fbd
--- /dev/null
+++ b/net/bluetooth/cmtp/Kconfig
@@ -0,0 +1,11 @@
+config BT_CMTP
+ tristate "CMTP protocol support"
+ depends on BT_BREDR && ISDN_CAPI
+ help
+ CMTP (CAPI Message Transport Protocol) is a transport layer
+ for CAPI messages. CMTP is required for the Bluetooth Common
+ ISDN Access Profile.
+
+ Say Y here to compile CMTP support into the kernel or say M to
+ compile it as module (cmtp).
+
diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile
new file mode 100644
index 000000000..890a9a5a6
--- /dev/null
+++ b/net/bluetooth/cmtp/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth CMTP layer
+#
+
+obj-$(CONFIG_BT_CMTP) += cmtp.o
+
+cmtp-objs := core.o sock.o capi.o
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
new file mode 100644
index 000000000..b0c6c6af7
--- /dev/null
+++ b/net/bluetooth/cmtp/capi.c
@@ -0,0 +1,612 @@
+/*
+ CMTP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+#include <linux/isdn/capicmd.h>
+#include <linux/isdn/capiutil.h>
+
+#include "cmtp.h"
+
+#define CAPI_INTEROPERABILITY 0x20
+
+#define CAPI_INTEROPERABILITY_REQ CAPICMD(CAPI_INTEROPERABILITY, CAPI_REQ)
+#define CAPI_INTEROPERABILITY_CONF CAPICMD(CAPI_INTEROPERABILITY, CAPI_CONF)
+#define CAPI_INTEROPERABILITY_IND CAPICMD(CAPI_INTEROPERABILITY, CAPI_IND)
+#define CAPI_INTEROPERABILITY_RESP CAPICMD(CAPI_INTEROPERABILITY, CAPI_RESP)
+
+#define CAPI_INTEROPERABILITY_REQ_LEN (CAPI_MSG_BASELEN + 2)
+#define CAPI_INTEROPERABILITY_CONF_LEN (CAPI_MSG_BASELEN + 4)
+#define CAPI_INTEROPERABILITY_IND_LEN (CAPI_MSG_BASELEN + 2)
+#define CAPI_INTEROPERABILITY_RESP_LEN (CAPI_MSG_BASELEN + 2)
+
+#define CAPI_FUNCTION_REGISTER 0
+#define CAPI_FUNCTION_RELEASE 1
+#define CAPI_FUNCTION_GET_PROFILE 2
+#define CAPI_FUNCTION_GET_MANUFACTURER 3
+#define CAPI_FUNCTION_GET_VERSION 4
+#define CAPI_FUNCTION_GET_SERIAL_NUMBER 5
+#define CAPI_FUNCTION_MANUFACTURER 6
+#define CAPI_FUNCTION_LOOPBACK 7
+
+
+#define CMTP_MSGNUM 1
+#define CMTP_APPLID 2
+#define CMTP_MAPPING 3
+
+static struct cmtp_application *cmtp_application_add(struct cmtp_session *session, __u16 appl)
+{
+ struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL);
+
+ BT_DBG("session %p application %p appl %d", session, app, appl);
+
+ if (!app)
+ return NULL;
+
+ app->state = BT_OPEN;
+ app->appl = appl;
+
+ list_add_tail(&app->list, &session->applications);
+
+ return app;
+}
+
+static void cmtp_application_del(struct cmtp_session *session, struct cmtp_application *app)
+{
+ BT_DBG("session %p application %p", session, app);
+
+ if (app) {
+ list_del(&app->list);
+ kfree(app);
+ }
+}
+
+static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
+{
+ struct cmtp_application *app;
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, &session->applications) {
+ app = list_entry(p, struct cmtp_application, list);
+ switch (pattern) {
+ case CMTP_MSGNUM:
+ if (app->msgnum == value)
+ return app;
+ break;
+ case CMTP_APPLID:
+ if (app->appl == value)
+ return app;
+ break;
+ case CMTP_MAPPING:
+ if (app->mapping == value)
+ return app;
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static int cmtp_msgnum_get(struct cmtp_session *session)
+{
+ session->msgnum++;
+
+ if ((session->msgnum & 0xff) > 200)
+ session->msgnum = CMTP_INITIAL_MSGNUM + 1;
+
+ return session->msgnum;
+}
+
+static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+ struct cmtp_scb *scb = (void *) skb->cb;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ scb->id = -1;
+ scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3);
+
+ skb_queue_tail(&session->transmit, skb);
+
+ wake_up_interruptible(sk_sleep(session->sock->sk));
+}
+
+static void cmtp_send_interopmsg(struct cmtp_session *session,
+ __u8 subcmd, __u16 appl, __u16 msgnum,
+ __u16 function, unsigned char *buf, int len)
+{
+ struct sk_buff *skb;
+ unsigned char *s;
+
+ BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum);
+
+ skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC);
+ if (!skb) {
+ BT_ERR("Can't allocate memory for interoperability packet");
+ return;
+ }
+
+ s = skb_put(skb, CAPI_MSG_BASELEN + 6 + len);
+
+ capimsg_setu16(s, 0, CAPI_MSG_BASELEN + 6 + len);
+ capimsg_setu16(s, 2, appl);
+ capimsg_setu8 (s, 4, CAPI_INTEROPERABILITY);
+ capimsg_setu8 (s, 5, subcmd);
+ capimsg_setu16(s, 6, msgnum);
+
+ /* Interoperability selector (Bluetooth Device Management) */
+ capimsg_setu16(s, 8, 0x0001);
+
+ capimsg_setu8 (s, 10, 3 + len);
+ capimsg_setu16(s, 11, function);
+ capimsg_setu8 (s, 13, len);
+
+ if (len > 0)
+ memcpy(s + 14, buf, len);
+
+ cmtp_send_capimsg(session, skb);
+}
+
+static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+ struct capi_ctr *ctrl = &session->ctrl;
+ struct cmtp_application *application;
+ __u16 appl, msgnum, func, info;
+ __u32 controller;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ switch (CAPIMSG_SUBCOMMAND(skb->data)) {
+ case CAPI_CONF:
+ if (skb->len < CAPI_MSG_BASELEN + 10)
+ break;
+
+ func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5);
+ info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8);
+
+ switch (func) {
+ case CAPI_FUNCTION_REGISTER:
+ msgnum = CAPIMSG_MSGID(skb->data);
+
+ application = cmtp_application_get(session, CMTP_MSGNUM, msgnum);
+ if (application) {
+ application->state = BT_CONNECTED;
+ application->msgnum = 0;
+ application->mapping = CAPIMSG_APPID(skb->data);
+ wake_up_interruptible(&session->wait);
+ }
+
+ break;
+
+ case CAPI_FUNCTION_RELEASE:
+ appl = CAPIMSG_APPID(skb->data);
+
+ application = cmtp_application_get(session, CMTP_MAPPING, appl);
+ if (application) {
+ application->state = BT_CLOSED;
+ application->msgnum = 0;
+ wake_up_interruptible(&session->wait);
+ }
+
+ break;
+
+ case CAPI_FUNCTION_GET_PROFILE:
+ if (skb->len < CAPI_MSG_BASELEN + 11 + sizeof(capi_profile))
+ break;
+
+ controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11);
+ msgnum = CAPIMSG_MSGID(skb->data);
+
+ if (!info && (msgnum == CMTP_INITIAL_MSGNUM)) {
+ session->ncontroller = controller;
+ wake_up_interruptible(&session->wait);
+ break;
+ }
+
+ if (!info && ctrl) {
+ memcpy(&ctrl->profile,
+ skb->data + CAPI_MSG_BASELEN + 11,
+ sizeof(capi_profile));
+ session->state = BT_CONNECTED;
+ capi_ctr_ready(ctrl);
+ }
+
+ break;
+
+ case CAPI_FUNCTION_GET_MANUFACTURER:
+ if (skb->len < CAPI_MSG_BASELEN + 15)
+ break;
+
+ if (!info && ctrl) {
+ int len = min_t(uint, CAPI_MANUFACTURER_LEN,
+ skb->data[CAPI_MSG_BASELEN + 14]);
+
+ memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN);
+ strncpy(ctrl->manu,
+ skb->data + CAPI_MSG_BASELEN + 15, len);
+ }
+
+ break;
+
+ case CAPI_FUNCTION_GET_VERSION:
+ if (skb->len < CAPI_MSG_BASELEN + 32)
+ break;
+
+ if (!info && ctrl) {
+ ctrl->version.majorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 16);
+ ctrl->version.minorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 20);
+ ctrl->version.majormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 24);
+ ctrl->version.minormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 28);
+ }
+
+ break;
+
+ case CAPI_FUNCTION_GET_SERIAL_NUMBER:
+ if (skb->len < CAPI_MSG_BASELEN + 17)
+ break;
+
+ if (!info && ctrl) {
+ int len = min_t(uint, CAPI_SERIAL_LEN,
+ skb->data[CAPI_MSG_BASELEN + 16]);
+
+ memset(ctrl->serial, 0, CAPI_SERIAL_LEN);
+ strncpy(ctrl->serial,
+ skb->data + CAPI_MSG_BASELEN + 17, len);
+ }
+
+ break;
+ }
+
+ break;
+
+ case CAPI_IND:
+ if (skb->len < CAPI_MSG_BASELEN + 6)
+ break;
+
+ func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3);
+
+ if (func == CAPI_FUNCTION_LOOPBACK) {
+ int len = min_t(uint, skb->len - CAPI_MSG_BASELEN - 6,
+ skb->data[CAPI_MSG_BASELEN + 5]);
+ appl = CAPIMSG_APPID(skb->data);
+ msgnum = CAPIMSG_MSGID(skb->data);
+ cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func,
+ skb->data + CAPI_MSG_BASELEN + 6, len);
+ }
+
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+ struct capi_ctr *ctrl = &session->ctrl;
+ struct cmtp_application *application;
+ __u16 appl;
+ __u32 contr;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ if (skb->len < CAPI_MSG_BASELEN)
+ return;
+
+ if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) {
+ cmtp_recv_interopmsg(session, skb);
+ return;
+ }
+
+ if (session->flags & BIT(CMTP_LOOPBACK)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ appl = CAPIMSG_APPID(skb->data);
+ contr = CAPIMSG_CONTROL(skb->data);
+
+ application = cmtp_application_get(session, CMTP_MAPPING, appl);
+ if (application) {
+ appl = application->appl;
+ CAPIMSG_SETAPPID(skb->data, appl);
+ } else {
+ BT_ERR("Can't find application with id %d", appl);
+ kfree_skb(skb);
+ return;
+ }
+
+ if ((contr & 0x7f) == 0x01) {
+ contr = (contr & 0xffffff80) | session->num;
+ CAPIMSG_SETCONTROL(skb->data, contr);
+ }
+
+ capi_ctr_handle_message(ctrl, appl, skb);
+}
+
+static int cmtp_load_firmware(struct capi_ctr *ctrl, capiloaddata *data)
+{
+ BT_DBG("ctrl %p data %p", ctrl, data);
+
+ return 0;
+}
+
+static void cmtp_reset_ctr(struct capi_ctr *ctrl)
+{
+ struct cmtp_session *session = ctrl->driverdata;
+
+ BT_DBG("ctrl %p", ctrl);
+
+ capi_ctr_down(ctrl);
+
+ atomic_inc(&session->terminate);
+ wake_up_process(session->task);
+}
+
+static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_params *rp)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct cmtp_session *session = ctrl->driverdata;
+ struct cmtp_application *application;
+ unsigned long timeo = CMTP_INTEROP_TIMEOUT;
+ unsigned char buf[8];
+ int err = 0, nconn, want = rp->level3cnt;
+
+ BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d",
+ ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
+
+ application = cmtp_application_add(session, appl);
+ if (!application) {
+ BT_ERR("Can't allocate memory for new application");
+ return;
+ }
+
+ if (want < 0)
+ nconn = ctrl->profile.nbchannel * -want;
+ else
+ nconn = want;
+
+ if (nconn == 0)
+ nconn = ctrl->profile.nbchannel;
+
+ capimsg_setu16(buf, 0, nconn);
+ capimsg_setu16(buf, 2, rp->datablkcnt);
+ capimsg_setu16(buf, 4, rp->datablklen);
+
+ application->state = BT_CONFIG;
+ application->msgnum = cmtp_msgnum_get(session);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0x0000, application->msgnum,
+ CAPI_FUNCTION_REGISTER, buf, 6);
+
+ add_wait_queue(&session->wait, &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (application->state == BT_CLOSED) {
+ err = -application->err;
+ break;
+ }
+
+ if (application->state == BT_CONNECTED)
+ break;
+
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+
+ timeo = schedule_timeout(timeo);
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&session->wait, &wait);
+
+ if (err) {
+ cmtp_application_del(session, application);
+ return;
+ }
+}
+
+static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl)
+{
+ struct cmtp_session *session = ctrl->driverdata;
+ struct cmtp_application *application;
+
+ BT_DBG("ctrl %p appl %d", ctrl, appl);
+
+ application = cmtp_application_get(session, CMTP_APPLID, appl);
+ if (!application) {
+ BT_ERR("Can't find application");
+ return;
+ }
+
+ application->msgnum = cmtp_msgnum_get(session);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, application->mapping, application->msgnum,
+ CAPI_FUNCTION_RELEASE, NULL, 0);
+
+ wait_event_interruptible_timeout(session->wait,
+ (application->state == BT_CLOSED), CMTP_INTEROP_TIMEOUT);
+
+ cmtp_application_del(session, application);
+}
+
+static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
+{
+ struct cmtp_session *session = ctrl->driverdata;
+ struct cmtp_application *application;
+ __u16 appl;
+ __u32 contr;
+
+ BT_DBG("ctrl %p skb %p", ctrl, skb);
+
+ appl = CAPIMSG_APPID(skb->data);
+ contr = CAPIMSG_CONTROL(skb->data);
+
+ application = cmtp_application_get(session, CMTP_APPLID, appl);
+ if ((!application) || (application->state != BT_CONNECTED)) {
+ BT_ERR("Can't find application with id %d", appl);
+ return CAPI_ILLAPPNR;
+ }
+
+ CAPIMSG_SETAPPID(skb->data, application->mapping);
+
+ if ((contr & 0x7f) == session->num) {
+ contr = (contr & 0xffffff80) | 0x01;
+ CAPIMSG_SETCONTROL(skb->data, contr);
+ }
+
+ cmtp_send_capimsg(session, skb);
+
+ return CAPI_NOERROR;
+}
+
+static char *cmtp_procinfo(struct capi_ctr *ctrl)
+{
+ return "CAPI Message Transport Protocol";
+}
+
+static int cmtp_proc_show(struct seq_file *m, void *v)
+{
+ struct capi_ctr *ctrl = m->private;
+ struct cmtp_session *session = ctrl->driverdata;
+ struct cmtp_application *app;
+ struct list_head *p, *n;
+
+ seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
+ seq_printf(m, "addr %s\n", session->name);
+ seq_printf(m, "ctrl %d\n", session->num);
+
+ list_for_each_safe(p, n, &session->applications) {
+ app = list_entry(p, struct cmtp_application, list);
+ seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping);
+ }
+
+ return 0;
+}
+
+static int cmtp_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cmtp_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations cmtp_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cmtp_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int cmtp_attach_device(struct cmtp_session *session)
+{
+ unsigned char buf[4];
+ long ret;
+
+ BT_DBG("session %p", session);
+
+ capimsg_setu32(buf, 0, 0);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, CMTP_INITIAL_MSGNUM,
+ CAPI_FUNCTION_GET_PROFILE, buf, 4);
+
+ ret = wait_event_interruptible_timeout(session->wait,
+ session->ncontroller, CMTP_INTEROP_TIMEOUT);
+
+ BT_INFO("Found %d CAPI controller(s) on device %s", session->ncontroller, session->name);
+
+ if (!ret)
+ return -ETIMEDOUT;
+
+ if (!session->ncontroller)
+ return -ENODEV;
+
+ if (session->ncontroller > 1)
+ BT_INFO("Setting up only CAPI controller 1");
+
+ session->ctrl.owner = THIS_MODULE;
+ session->ctrl.driverdata = session;
+ strcpy(session->ctrl.name, session->name);
+
+ session->ctrl.driver_name = "cmtp";
+ session->ctrl.load_firmware = cmtp_load_firmware;
+ session->ctrl.reset_ctr = cmtp_reset_ctr;
+ session->ctrl.register_appl = cmtp_register_appl;
+ session->ctrl.release_appl = cmtp_release_appl;
+ session->ctrl.send_message = cmtp_send_message;
+
+ session->ctrl.procinfo = cmtp_procinfo;
+ session->ctrl.proc_fops = &cmtp_proc_fops;
+
+ if (attach_capi_ctr(&session->ctrl) < 0) {
+ BT_ERR("Can't attach new controller");
+ return -EBUSY;
+ }
+
+ session->num = session->ctrl.cnr;
+
+ BT_DBG("session %p num %d", session, session->num);
+
+ capimsg_setu32(buf, 0, 1);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+ CAPI_FUNCTION_GET_MANUFACTURER, buf, 4);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+ CAPI_FUNCTION_GET_VERSION, buf, 4);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+ CAPI_FUNCTION_GET_SERIAL_NUMBER, buf, 4);
+
+ cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+ CAPI_FUNCTION_GET_PROFILE, buf, 4);
+
+ return 0;
+}
+
+void cmtp_detach_device(struct cmtp_session *session)
+{
+ BT_DBG("session %p", session);
+
+ detach_capi_ctr(&session->ctrl);
+}
diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h
new file mode 100644
index 000000000..c32638ddd
--- /dev/null
+++ b/net/bluetooth/cmtp/cmtp.h
@@ -0,0 +1,129 @@
+/*
+ CMTP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __CMTP_H
+#define __CMTP_H
+
+#include <linux/types.h>
+#include <net/bluetooth/bluetooth.h>
+
+#define BTNAMSIZ 18
+
+/* CMTP ioctl defines */
+#define CMTPCONNADD _IOW('C', 200, int)
+#define CMTPCONNDEL _IOW('C', 201, int)
+#define CMTPGETCONNLIST _IOR('C', 210, int)
+#define CMTPGETCONNINFO _IOR('C', 211, int)
+
+#define CMTP_LOOPBACK 0
+
+struct cmtp_connadd_req {
+ int sock; /* Connected socket */
+ __u32 flags;
+};
+
+struct cmtp_conndel_req {
+ bdaddr_t bdaddr;
+ __u32 flags;
+};
+
+struct cmtp_conninfo {
+ bdaddr_t bdaddr;
+ __u32 flags;
+ __u16 state;
+ int num;
+};
+
+struct cmtp_connlist_req {
+ __u32 cnum;
+ struct cmtp_conninfo __user *ci;
+};
+
+int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock);
+int cmtp_del_connection(struct cmtp_conndel_req *req);
+int cmtp_get_connlist(struct cmtp_connlist_req *req);
+int cmtp_get_conninfo(struct cmtp_conninfo *ci);
+
+/* CMTP session defines */
+#define CMTP_INTEROP_TIMEOUT (HZ * 5)
+#define CMTP_INITIAL_MSGNUM 0xff00
+
+struct cmtp_session {
+ struct list_head list;
+
+ struct socket *sock;
+
+ bdaddr_t bdaddr;
+
+ unsigned long state;
+ unsigned long flags;
+
+ uint mtu;
+
+ char name[BTNAMSIZ];
+
+ atomic_t terminate;
+ struct task_struct *task;
+
+ wait_queue_head_t wait;
+
+ int ncontroller;
+ int num;
+ struct capi_ctr ctrl;
+
+ struct list_head applications;
+
+ unsigned long blockids;
+ int msgnum;
+
+ struct sk_buff_head transmit;
+
+ struct sk_buff *reassembly[16];
+};
+
+struct cmtp_application {
+ struct list_head list;
+
+ unsigned long state;
+ int err;
+
+ __u16 appl;
+ __u16 mapping;
+
+ __u16 msgnum;
+};
+
+struct cmtp_scb {
+ int id;
+ int data;
+};
+
+int cmtp_attach_device(struct cmtp_session *session);
+void cmtp_detach_device(struct cmtp_session *session);
+
+void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb);
+
+/* CMTP init defines */
+int cmtp_init_sockets(void);
+void cmtp_cleanup_sockets(void);
+
+#endif /* __CMTP_H */
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
new file mode 100644
index 000000000..298ed3701
--- /dev/null
+++ b/net/bluetooth/cmtp/core.c
@@ -0,0 +1,512 @@
+/*
+ CMTP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/freezer.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "cmtp.h"
+
+#define VERSION "1.0"
+
+static DECLARE_RWSEM(cmtp_session_sem);
+static LIST_HEAD(cmtp_session_list);
+
+static struct cmtp_session *__cmtp_get_session(bdaddr_t *bdaddr)
+{
+ struct cmtp_session *session;
+
+ BT_DBG("");
+
+ list_for_each_entry(session, &cmtp_session_list, list)
+ if (!bacmp(bdaddr, &session->bdaddr))
+ return session;
+
+ return NULL;
+}
+
+static void __cmtp_link_session(struct cmtp_session *session)
+{
+ list_add(&session->list, &cmtp_session_list);
+}
+
+static void __cmtp_unlink_session(struct cmtp_session *session)
+{
+ list_del(&session->list);
+}
+
+static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci)
+{
+ u32 valid_flags = BIT(CMTP_LOOPBACK);
+ memset(ci, 0, sizeof(*ci));
+ bacpy(&ci->bdaddr, &session->bdaddr);
+
+ ci->flags = session->flags & valid_flags;
+ ci->state = session->state;
+
+ ci->num = session->num;
+}
+
+
+static inline int cmtp_alloc_block_id(struct cmtp_session *session)
+{
+ int i, id = -1;
+
+ for (i = 0; i < 16; i++)
+ if (!test_and_set_bit(i, &session->blockids)) {
+ id = i;
+ break;
+ }
+
+ return id;
+}
+
+static inline void cmtp_free_block_id(struct cmtp_session *session, int id)
+{
+ clear_bit(id, &session->blockids);
+}
+
+static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const unsigned char *buf, int count)
+{
+ struct sk_buff *skb = session->reassembly[id], *nskb;
+ int size;
+
+ BT_DBG("session %p buf %p count %d", session, buf, count);
+
+ size = (skb) ? skb->len + count : count;
+
+ nskb = alloc_skb(size, GFP_ATOMIC);
+ if (!nskb) {
+ BT_ERR("Can't allocate memory for CAPI message");
+ return;
+ }
+
+ if (skb && (skb->len > 0))
+ skb_copy_from_linear_data(skb, skb_put(nskb, skb->len), skb->len);
+
+ memcpy(skb_put(nskb, count), buf, count);
+
+ session->reassembly[id] = nskb;
+
+ kfree_skb(skb);
+}
+
+static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
+{
+ __u8 hdr, hdrlen, id;
+ __u16 len;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ while (skb->len > 0) {
+ hdr = skb->data[0];
+
+ switch (hdr & 0xc0) {
+ case 0x40:
+ hdrlen = 2;
+ len = skb->data[1];
+ break;
+ case 0x80:
+ hdrlen = 3;
+ len = skb->data[1] | (skb->data[2] << 8);
+ break;
+ default:
+ hdrlen = 1;
+ len = 0;
+ break;
+ }
+
+ id = (hdr & 0x3c) >> 2;
+
+ BT_DBG("hdr 0x%02x hdrlen %d len %d id %d", hdr, hdrlen, len, id);
+
+ if (hdrlen + len > skb->len) {
+ BT_ERR("Wrong size or header information in CMTP frame");
+ break;
+ }
+
+ if (len == 0) {
+ skb_pull(skb, hdrlen);
+ continue;
+ }
+
+ switch (hdr & 0x03) {
+ case 0x00:
+ cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
+ cmtp_recv_capimsg(session, session->reassembly[id]);
+ session->reassembly[id] = NULL;
+ break;
+ case 0x01:
+ cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
+ break;
+ default:
+ if (session->reassembly[id] != NULL)
+ kfree_skb(session->reassembly[id]);
+ session->reassembly[id] = NULL;
+ break;
+ }
+
+ skb_pull(skb, hdrlen + len);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, int len)
+{
+ struct socket *sock = session->sock;
+ struct kvec iv = { data, len };
+ struct msghdr msg;
+
+ BT_DBG("session %p data %p len %d", session, data, len);
+
+ if (!len)
+ return 0;
+
+ memset(&msg, 0, sizeof(msg));
+
+ return kernel_sendmsg(sock, &msg, &iv, 1, len);
+}
+
+static void cmtp_process_transmit(struct cmtp_session *session)
+{
+ struct sk_buff *skb, *nskb;
+ unsigned char *hdr;
+ unsigned int size, tail;
+
+ BT_DBG("session %p", session);
+
+ nskb = alloc_skb(session->mtu, GFP_ATOMIC);
+ if (!nskb) {
+ BT_ERR("Can't allocate memory for new frame");
+ return;
+ }
+
+ while ((skb = skb_dequeue(&session->transmit))) {
+ struct cmtp_scb *scb = (void *) skb->cb;
+
+ tail = session->mtu - nskb->len;
+ if (tail < 5) {
+ cmtp_send_frame(session, nskb->data, nskb->len);
+ skb_trim(nskb, 0);
+ tail = session->mtu;
+ }
+
+ size = min_t(uint, ((tail < 258) ? (tail - 2) : (tail - 3)), skb->len);
+
+ if (scb->id < 0) {
+ scb->id = cmtp_alloc_block_id(session);
+ if (scb->id < 0) {
+ skb_queue_head(&session->transmit, skb);
+ break;
+ }
+ }
+
+ if (size < 256) {
+ hdr = skb_put(nskb, 2);
+ hdr[0] = 0x40
+ | ((scb->id << 2) & 0x3c)
+ | ((skb->len == size) ? 0x00 : 0x01);
+ hdr[1] = size;
+ } else {
+ hdr = skb_put(nskb, 3);
+ hdr[0] = 0x80
+ | ((scb->id << 2) & 0x3c)
+ | ((skb->len == size) ? 0x00 : 0x01);
+ hdr[1] = size & 0xff;
+ hdr[2] = size >> 8;
+ }
+
+ skb_copy_from_linear_data(skb, skb_put(nskb, size), size);
+ skb_pull(skb, size);
+
+ if (skb->len > 0) {
+ skb_queue_head(&session->transmit, skb);
+ } else {
+ cmtp_free_block_id(session, scb->id);
+ if (scb->data) {
+ cmtp_send_frame(session, nskb->data, nskb->len);
+ skb_trim(nskb, 0);
+ }
+ kfree_skb(skb);
+ }
+ }
+
+ cmtp_send_frame(session, nskb->data, nskb->len);
+
+ kfree_skb(nskb);
+}
+
+static int cmtp_session(void *arg)
+{
+ struct cmtp_session *session = arg;
+ struct sock *sk = session->sock->sk;
+ struct sk_buff *skb;
+ wait_queue_t wait;
+
+ BT_DBG("session %p", session);
+
+ set_user_nice(current, -15);
+
+ init_waitqueue_entry(&wait, current);
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (atomic_read(&session->terminate))
+ break;
+ if (sk->sk_state != BT_CONNECTED)
+ break;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ cmtp_recv_frame(session, skb);
+ else
+ kfree_skb(skb);
+ }
+
+ cmtp_process_transmit(session);
+
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ down_write(&cmtp_session_sem);
+
+ if (!(session->flags & BIT(CMTP_LOOPBACK)))
+ cmtp_detach_device(session);
+
+ fput(session->sock->file);
+
+ __cmtp_unlink_session(session);
+
+ up_write(&cmtp_session_sem);
+
+ kfree(session);
+ module_put_and_exit(0);
+ return 0;
+}
+
+int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
+{
+ u32 valid_flags = BIT(CMTP_LOOPBACK);
+ struct cmtp_session *session, *s;
+ int i, err;
+
+ BT_DBG("");
+
+ if (!l2cap_is_socket(sock))
+ return -EBADFD;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
+
+ down_write(&cmtp_session_sem);
+
+ s = __cmtp_get_session(&l2cap_pi(sock->sk)->chan->dst);
+ if (s && s->state == BT_CONNECTED) {
+ err = -EEXIST;
+ goto failed;
+ }
+
+ bacpy(&session->bdaddr, &l2cap_pi(sock->sk)->chan->dst);
+
+ session->mtu = min_t(uint, l2cap_pi(sock->sk)->chan->omtu,
+ l2cap_pi(sock->sk)->chan->imtu);
+
+ BT_DBG("mtu %d", session->mtu);
+
+ sprintf(session->name, "%pMR", &session->bdaddr);
+
+ session->sock = sock;
+ session->state = BT_CONFIG;
+
+ init_waitqueue_head(&session->wait);
+
+ session->msgnum = CMTP_INITIAL_MSGNUM;
+
+ INIT_LIST_HEAD(&session->applications);
+
+ skb_queue_head_init(&session->transmit);
+
+ for (i = 0; i < 16; i++)
+ session->reassembly[i] = NULL;
+
+ session->flags = req->flags;
+
+ __cmtp_link_session(session);
+
+ __module_get(THIS_MODULE);
+ session->task = kthread_run(cmtp_session, session, "kcmtpd_ctr_%d",
+ session->num);
+ if (IS_ERR(session->task)) {
+ module_put(THIS_MODULE);
+ err = PTR_ERR(session->task);
+ goto unlink;
+ }
+
+ if (!(session->flags & BIT(CMTP_LOOPBACK))) {
+ err = cmtp_attach_device(session);
+ if (err < 0) {
+ atomic_inc(&session->terminate);
+ wake_up_process(session->task);
+ up_write(&cmtp_session_sem);
+ return err;
+ }
+ }
+
+ up_write(&cmtp_session_sem);
+ return 0;
+
+unlink:
+ __cmtp_unlink_session(session);
+
+failed:
+ up_write(&cmtp_session_sem);
+ kfree(session);
+ return err;
+}
+
+int cmtp_del_connection(struct cmtp_conndel_req *req)
+{
+ u32 valid_flags = 0;
+ struct cmtp_session *session;
+ int err = 0;
+
+ BT_DBG("");
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ down_read(&cmtp_session_sem);
+
+ session = __cmtp_get_session(&req->bdaddr);
+ if (session) {
+ /* Flush the transmit queue */
+ skb_queue_purge(&session->transmit);
+
+ /* Stop session thread */
+ atomic_inc(&session->terminate);
+ wake_up_process(session->task);
+ } else
+ err = -ENOENT;
+
+ up_read(&cmtp_session_sem);
+ return err;
+}
+
+int cmtp_get_connlist(struct cmtp_connlist_req *req)
+{
+ struct cmtp_session *session;
+ int err = 0, n = 0;
+
+ BT_DBG("");
+
+ down_read(&cmtp_session_sem);
+
+ list_for_each_entry(session, &cmtp_session_list, list) {
+ struct cmtp_conninfo ci;
+
+ __cmtp_copy_session(session, &ci);
+
+ if (copy_to_user(req->ci, &ci, sizeof(ci))) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (++n >= req->cnum)
+ break;
+
+ req->ci++;
+ }
+ req->cnum = n;
+
+ up_read(&cmtp_session_sem);
+ return err;
+}
+
+int cmtp_get_conninfo(struct cmtp_conninfo *ci)
+{
+ struct cmtp_session *session;
+ int err = 0;
+
+ down_read(&cmtp_session_sem);
+
+ session = __cmtp_get_session(&ci->bdaddr);
+ if (session)
+ __cmtp_copy_session(session, ci);
+ else
+ err = -ENOENT;
+
+ up_read(&cmtp_session_sem);
+ return err;
+}
+
+
+static int __init cmtp_init(void)
+{
+ BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION);
+
+ cmtp_init_sockets();
+
+ return 0;
+}
+
+static void __exit cmtp_exit(void)
+{
+ cmtp_cleanup_sockets();
+}
+
+module_init(cmtp_init);
+module_exit(cmtp_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth CMTP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-5");
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
new file mode 100644
index 000000000..d82787d41
--- /dev/null
+++ b/net/bluetooth/cmtp/sock.c
@@ -0,0 +1,269 @@
+/*
+ CMTP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/export.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/gfp.h>
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+
+
+#include "cmtp.h"
+
+static struct bt_sock_list cmtp_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(cmtp_sk_list.lock)
+};
+
+static int cmtp_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ bt_sock_unlink(&cmtp_sk_list, sk);
+
+ sock_orphan(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct cmtp_connadd_req ca;
+ struct cmtp_conndel_req cd;
+ struct cmtp_connlist_req cl;
+ struct cmtp_conninfo ci;
+ struct socket *nsock;
+ void __user *argp = (void __user *)arg;
+ int err;
+
+ BT_DBG("cmd %x arg %lx", cmd, arg);
+
+ switch (cmd) {
+ case CMTPCONNADD:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ca, argp, sizeof(ca)))
+ return -EFAULT;
+
+ nsock = sockfd_lookup(ca.sock, &err);
+ if (!nsock)
+ return err;
+
+ if (nsock->sk->sk_state != BT_CONNECTED) {
+ sockfd_put(nsock);
+ return -EBADFD;
+ }
+
+ err = cmtp_add_connection(&ca, nsock);
+ if (!err) {
+ if (copy_to_user(argp, &ca, sizeof(ca)))
+ err = -EFAULT;
+ } else
+ sockfd_put(nsock);
+
+ return err;
+
+ case CMTPCONNDEL:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&cd, argp, sizeof(cd)))
+ return -EFAULT;
+
+ return cmtp_del_connection(&cd);
+
+ case CMTPGETCONNLIST:
+ if (copy_from_user(&cl, argp, sizeof(cl)))
+ return -EFAULT;
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = cmtp_get_connlist(&cl);
+ if (!err && copy_to_user(argp, &cl, sizeof(cl)))
+ return -EFAULT;
+
+ return err;
+
+ case CMTPGETCONNINFO:
+ if (copy_from_user(&ci, argp, sizeof(ci)))
+ return -EFAULT;
+
+ err = cmtp_get_conninfo(&ci);
+ if (!err && copy_to_user(argp, &ci, sizeof(ci)))
+ return -EFAULT;
+
+ return err;
+ }
+
+ return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ if (cmd == CMTPGETCONNLIST) {
+ struct cmtp_connlist_req cl;
+ u32 uci;
+ int err;
+
+ if (get_user(cl.cnum, (u32 __user *) arg) ||
+ get_user(uci, (u32 __user *) (arg + 4)))
+ return -EFAULT;
+
+ cl.ci = compat_ptr(uci);
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = cmtp_get_connlist(&cl);
+
+ if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ err = -EFAULT;
+
+ return err;
+ }
+
+ return cmtp_sock_ioctl(sock, cmd, arg);
+}
+#endif
+
+static const struct proto_ops cmtp_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = cmtp_sock_release,
+ .ioctl = cmtp_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = cmtp_sock_compat_ioctl,
+#endif
+ .bind = sock_no_bind,
+ .getname = sock_no_getname,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .poll = sock_no_poll,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .mmap = sock_no_mmap
+};
+
+static struct proto cmtp_proto = {
+ .name = "CMTP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct bt_sock)
+};
+
+static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+
+ sock->ops = &cmtp_sock_ops;
+
+ sock->state = SS_UNCONNECTED;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = protocol;
+ sk->sk_state = BT_OPEN;
+
+ bt_sock_link(&cmtp_sk_list, sk);
+
+ return 0;
+}
+
+static const struct net_proto_family cmtp_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = cmtp_sock_create
+};
+
+int cmtp_init_sockets(void)
+{
+ int err;
+
+ err = proto_register(&cmtp_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("Can't register CMTP socket");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "cmtp", &cmtp_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create CMTP proc file");
+ bt_sock_unregister(BTPROTO_HIDP);
+ goto error;
+ }
+
+ BT_INFO("CMTP socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&cmtp_proto);
+ return err;
+}
+
+void cmtp_cleanup_sockets(void)
+{
+ bt_procfs_cleanup(&init_net, "cmtp");
+ bt_sock_unregister(BTPROTO_CMTP);
+ proto_unregister(&cmtp_proto);
+}
diff --git a/net/bluetooth/ecc.c b/net/bluetooth/ecc.c
new file mode 100644
index 000000000..e1709f846
--- /dev/null
+++ b/net/bluetooth/ecc.c
@@ -0,0 +1,816 @@
+/*
+ * Copyright (c) 2013, Kenneth MacKay
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/random.h>
+
+#include "ecc.h"
+
+/* 256-bit curve */
+#define ECC_BYTES 32
+
+#define MAX_TRIES 16
+
+/* Number of u64's needed */
+#define NUM_ECC_DIGITS (ECC_BYTES / 8)
+
+struct ecc_point {
+ u64 x[NUM_ECC_DIGITS];
+ u64 y[NUM_ECC_DIGITS];
+};
+
+typedef struct {
+ u64 m_low;
+ u64 m_high;
+} uint128_t;
+
+#define CURVE_P_32 { 0xFFFFFFFFFFFFFFFFull, 0x00000000FFFFFFFFull, \
+ 0x0000000000000000ull, 0xFFFFFFFF00000001ull }
+
+#define CURVE_G_32 { \
+ { 0xF4A13945D898C296ull, 0x77037D812DEB33A0ull, \
+ 0xF8BCE6E563A440F2ull, 0x6B17D1F2E12C4247ull }, \
+ { 0xCBB6406837BF51F5ull, 0x2BCE33576B315ECEull, \
+ 0x8EE7EB4A7C0F9E16ull, 0x4FE342E2FE1A7F9Bull } \
+}
+
+#define CURVE_N_32 { 0xF3B9CAC2FC632551ull, 0xBCE6FAADA7179E84ull, \
+ 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull }
+
+static u64 curve_p[NUM_ECC_DIGITS] = CURVE_P_32;
+static struct ecc_point curve_g = CURVE_G_32;
+static u64 curve_n[NUM_ECC_DIGITS] = CURVE_N_32;
+
+static void vli_clear(u64 *vli)
+{
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++)
+ vli[i] = 0;
+}
+
+/* Returns true if vli == 0, false otherwise. */
+static bool vli_is_zero(const u64 *vli)
+{
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ if (vli[i])
+ return false;
+ }
+
+ return true;
+}
+
+/* Returns nonzero if bit bit of vli is set. */
+static u64 vli_test_bit(const u64 *vli, unsigned int bit)
+{
+ return (vli[bit / 64] & ((u64) 1 << (bit % 64)));
+}
+
+/* Counts the number of 64-bit "digits" in vli. */
+static unsigned int vli_num_digits(const u64 *vli)
+{
+ int i;
+
+ /* Search from the end until we find a non-zero digit.
+ * We do it in reverse because we expect that most digits will
+ * be nonzero.
+ */
+ for (i = NUM_ECC_DIGITS - 1; i >= 0 && vli[i] == 0; i--);
+
+ return (i + 1);
+}
+
+/* Counts the number of bits required for vli. */
+static unsigned int vli_num_bits(const u64 *vli)
+{
+ unsigned int i, num_digits;
+ u64 digit;
+
+ num_digits = vli_num_digits(vli);
+ if (num_digits == 0)
+ return 0;
+
+ digit = vli[num_digits - 1];
+ for (i = 0; digit; i++)
+ digit >>= 1;
+
+ return ((num_digits - 1) * 64 + i);
+}
+
+/* Sets dest = src. */
+static void vli_set(u64 *dest, const u64 *src)
+{
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++)
+ dest[i] = src[i];
+}
+
+/* Returns sign of left - right. */
+static int vli_cmp(const u64 *left, const u64 *right)
+{
+ int i;
+
+ for (i = NUM_ECC_DIGITS - 1; i >= 0; i--) {
+ if (left[i] > right[i])
+ return 1;
+ else if (left[i] < right[i])
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Computes result = in << c, returning carry. Can modify in place
+ * (if result == in). 0 < shift < 64.
+ */
+static u64 vli_lshift(u64 *result, const u64 *in,
+ unsigned int shift)
+{
+ u64 carry = 0;
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ u64 temp = in[i];
+
+ result[i] = (temp << shift) | carry;
+ carry = temp >> (64 - shift);
+ }
+
+ return carry;
+}
+
+/* Computes vli = vli >> 1. */
+static void vli_rshift1(u64 *vli)
+{
+ u64 *end = vli;
+ u64 carry = 0;
+
+ vli += NUM_ECC_DIGITS;
+
+ while (vli-- > end) {
+ u64 temp = *vli;
+ *vli = (temp >> 1) | carry;
+ carry = temp << 63;
+ }
+}
+
+/* Computes result = left + right, returning carry. Can modify in place. */
+static u64 vli_add(u64 *result, const u64 *left,
+ const u64 *right)
+{
+ u64 carry = 0;
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ u64 sum;
+
+ sum = left[i] + right[i] + carry;
+ if (sum != left[i])
+ carry = (sum < left[i]);
+
+ result[i] = sum;
+ }
+
+ return carry;
+}
+
+/* Computes result = left - right, returning borrow. Can modify in place. */
+static u64 vli_sub(u64 *result, const u64 *left, const u64 *right)
+{
+ u64 borrow = 0;
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ u64 diff;
+
+ diff = left[i] - right[i] - borrow;
+ if (diff != left[i])
+ borrow = (diff > left[i]);
+
+ result[i] = diff;
+ }
+
+ return borrow;
+}
+
+static uint128_t mul_64_64(u64 left, u64 right)
+{
+ u64 a0 = left & 0xffffffffull;
+ u64 a1 = left >> 32;
+ u64 b0 = right & 0xffffffffull;
+ u64 b1 = right >> 32;
+ u64 m0 = a0 * b0;
+ u64 m1 = a0 * b1;
+ u64 m2 = a1 * b0;
+ u64 m3 = a1 * b1;
+ uint128_t result;
+
+ m2 += (m0 >> 32);
+ m2 += m1;
+
+ /* Overflow */
+ if (m2 < m1)
+ m3 += 0x100000000ull;
+
+ result.m_low = (m0 & 0xffffffffull) | (m2 << 32);
+ result.m_high = m3 + (m2 >> 32);
+
+ return result;
+}
+
+static uint128_t add_128_128(uint128_t a, uint128_t b)
+{
+ uint128_t result;
+
+ result.m_low = a.m_low + b.m_low;
+ result.m_high = a.m_high + b.m_high + (result.m_low < a.m_low);
+
+ return result;
+}
+
+static void vli_mult(u64 *result, const u64 *left, const u64 *right)
+{
+ uint128_t r01 = { 0, 0 };
+ u64 r2 = 0;
+ unsigned int i, k;
+
+ /* Compute each digit of result in sequence, maintaining the
+ * carries.
+ */
+ for (k = 0; k < NUM_ECC_DIGITS * 2 - 1; k++) {
+ unsigned int min;
+
+ if (k < NUM_ECC_DIGITS)
+ min = 0;
+ else
+ min = (k + 1) - NUM_ECC_DIGITS;
+
+ for (i = min; i <= k && i < NUM_ECC_DIGITS; i++) {
+ uint128_t product;
+
+ product = mul_64_64(left[i], right[k - i]);
+
+ r01 = add_128_128(r01, product);
+ r2 += (r01.m_high < product.m_high);
+ }
+
+ result[k] = r01.m_low;
+ r01.m_low = r01.m_high;
+ r01.m_high = r2;
+ r2 = 0;
+ }
+
+ result[NUM_ECC_DIGITS * 2 - 1] = r01.m_low;
+}
+
+static void vli_square(u64 *result, const u64 *left)
+{
+ uint128_t r01 = { 0, 0 };
+ u64 r2 = 0;
+ int i, k;
+
+ for (k = 0; k < NUM_ECC_DIGITS * 2 - 1; k++) {
+ unsigned int min;
+
+ if (k < NUM_ECC_DIGITS)
+ min = 0;
+ else
+ min = (k + 1) - NUM_ECC_DIGITS;
+
+ for (i = min; i <= k && i <= k - i; i++) {
+ uint128_t product;
+
+ product = mul_64_64(left[i], left[k - i]);
+
+ if (i < k - i) {
+ r2 += product.m_high >> 63;
+ product.m_high = (product.m_high << 1) |
+ (product.m_low >> 63);
+ product.m_low <<= 1;
+ }
+
+ r01 = add_128_128(r01, product);
+ r2 += (r01.m_high < product.m_high);
+ }
+
+ result[k] = r01.m_low;
+ r01.m_low = r01.m_high;
+ r01.m_high = r2;
+ r2 = 0;
+ }
+
+ result[NUM_ECC_DIGITS * 2 - 1] = r01.m_low;
+}
+
+/* Computes result = (left + right) % mod.
+ * Assumes that left < mod and right < mod, result != mod.
+ */
+static void vli_mod_add(u64 *result, const u64 *left, const u64 *right,
+ const u64 *mod)
+{
+ u64 carry;
+
+ carry = vli_add(result, left, right);
+
+ /* result > mod (result = mod + remainder), so subtract mod to
+ * get remainder.
+ */
+ if (carry || vli_cmp(result, mod) >= 0)
+ vli_sub(result, result, mod);
+}
+
+/* Computes result = (left - right) % mod.
+ * Assumes that left < mod and right < mod, result != mod.
+ */
+static void vli_mod_sub(u64 *result, const u64 *left, const u64 *right,
+ const u64 *mod)
+{
+ u64 borrow = vli_sub(result, left, right);
+
+ /* In this case, p_result == -diff == (max int) - diff.
+ * Since -x % d == d - x, we can get the correct result from
+ * result + mod (with overflow).
+ */
+ if (borrow)
+ vli_add(result, result, mod);
+}
+
+/* Computes result = product % curve_p
+ from http://www.nsa.gov/ia/_files/nist-routines.pdf */
+static void vli_mmod_fast(u64 *result, const u64 *product)
+{
+ u64 tmp[NUM_ECC_DIGITS];
+ int carry;
+
+ /* t */
+ vli_set(result, product);
+
+ /* s1 */
+ tmp[0] = 0;
+ tmp[1] = product[5] & 0xffffffff00000000ull;
+ tmp[2] = product[6];
+ tmp[3] = product[7];
+ carry = vli_lshift(tmp, tmp, 1);
+ carry += vli_add(result, result, tmp);
+
+ /* s2 */
+ tmp[1] = product[6] << 32;
+ tmp[2] = (product[6] >> 32) | (product[7] << 32);
+ tmp[3] = product[7] >> 32;
+ carry += vli_lshift(tmp, tmp, 1);
+ carry += vli_add(result, result, tmp);
+
+ /* s3 */
+ tmp[0] = product[4];
+ tmp[1] = product[5] & 0xffffffff;
+ tmp[2] = 0;
+ tmp[3] = product[7];
+ carry += vli_add(result, result, tmp);
+
+ /* s4 */
+ tmp[0] = (product[4] >> 32) | (product[5] << 32);
+ tmp[1] = (product[5] >> 32) | (product[6] & 0xffffffff00000000ull);
+ tmp[2] = product[7];
+ tmp[3] = (product[6] >> 32) | (product[4] << 32);
+ carry += vli_add(result, result, tmp);
+
+ /* d1 */
+ tmp[0] = (product[5] >> 32) | (product[6] << 32);
+ tmp[1] = (product[6] >> 32);
+ tmp[2] = 0;
+ tmp[3] = (product[4] & 0xffffffff) | (product[5] << 32);
+ carry -= vli_sub(result, result, tmp);
+
+ /* d2 */
+ tmp[0] = product[6];
+ tmp[1] = product[7];
+ tmp[2] = 0;
+ tmp[3] = (product[4] >> 32) | (product[5] & 0xffffffff00000000ull);
+ carry -= vli_sub(result, result, tmp);
+
+ /* d3 */
+ tmp[0] = (product[6] >> 32) | (product[7] << 32);
+ tmp[1] = (product[7] >> 32) | (product[4] << 32);
+ tmp[2] = (product[4] >> 32) | (product[5] << 32);
+ tmp[3] = (product[6] << 32);
+ carry -= vli_sub(result, result, tmp);
+
+ /* d4 */
+ tmp[0] = product[7];
+ tmp[1] = product[4] & 0xffffffff00000000ull;
+ tmp[2] = product[5];
+ tmp[3] = product[6] & 0xffffffff00000000ull;
+ carry -= vli_sub(result, result, tmp);
+
+ if (carry < 0) {
+ do {
+ carry += vli_add(result, result, curve_p);
+ } while (carry < 0);
+ } else {
+ while (carry || vli_cmp(curve_p, result) != 1)
+ carry -= vli_sub(result, result, curve_p);
+ }
+}
+
+/* Computes result = (left * right) % curve_p. */
+static void vli_mod_mult_fast(u64 *result, const u64 *left, const u64 *right)
+{
+ u64 product[2 * NUM_ECC_DIGITS];
+
+ vli_mult(product, left, right);
+ vli_mmod_fast(result, product);
+}
+
+/* Computes result = left^2 % curve_p. */
+static void vli_mod_square_fast(u64 *result, const u64 *left)
+{
+ u64 product[2 * NUM_ECC_DIGITS];
+
+ vli_square(product, left);
+ vli_mmod_fast(result, product);
+}
+
+#define EVEN(vli) (!(vli[0] & 1))
+/* Computes result = (1 / p_input) % mod. All VLIs are the same size.
+ * See "From Euclid's GCD to Montgomery Multiplication to the Great Divide"
+ * https://labs.oracle.com/techrep/2001/smli_tr-2001-95.pdf
+ */
+static void vli_mod_inv(u64 *result, const u64 *input, const u64 *mod)
+{
+ u64 a[NUM_ECC_DIGITS], b[NUM_ECC_DIGITS];
+ u64 u[NUM_ECC_DIGITS], v[NUM_ECC_DIGITS];
+ u64 carry;
+ int cmp_result;
+
+ if (vli_is_zero(input)) {
+ vli_clear(result);
+ return;
+ }
+
+ vli_set(a, input);
+ vli_set(b, mod);
+ vli_clear(u);
+ u[0] = 1;
+ vli_clear(v);
+
+ while ((cmp_result = vli_cmp(a, b)) != 0) {
+ carry = 0;
+
+ if (EVEN(a)) {
+ vli_rshift1(a);
+
+ if (!EVEN(u))
+ carry = vli_add(u, u, mod);
+
+ vli_rshift1(u);
+ if (carry)
+ u[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull;
+ } else if (EVEN(b)) {
+ vli_rshift1(b);
+
+ if (!EVEN(v))
+ carry = vli_add(v, v, mod);
+
+ vli_rshift1(v);
+ if (carry)
+ v[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull;
+ } else if (cmp_result > 0) {
+ vli_sub(a, a, b);
+ vli_rshift1(a);
+
+ if (vli_cmp(u, v) < 0)
+ vli_add(u, u, mod);
+
+ vli_sub(u, u, v);
+ if (!EVEN(u))
+ carry = vli_add(u, u, mod);
+
+ vli_rshift1(u);
+ if (carry)
+ u[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull;
+ } else {
+ vli_sub(b, b, a);
+ vli_rshift1(b);
+
+ if (vli_cmp(v, u) < 0)
+ vli_add(v, v, mod);
+
+ vli_sub(v, v, u);
+ if (!EVEN(v))
+ carry = vli_add(v, v, mod);
+
+ vli_rshift1(v);
+ if (carry)
+ v[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull;
+ }
+ }
+
+ vli_set(result, u);
+}
+
+/* ------ Point operations ------ */
+
+/* Returns true if p_point is the point at infinity, false otherwise. */
+static bool ecc_point_is_zero(const struct ecc_point *point)
+{
+ return (vli_is_zero(point->x) && vli_is_zero(point->y));
+}
+
+/* Point multiplication algorithm using Montgomery's ladder with co-Z
+ * coordinates. From http://eprint.iacr.org/2011/338.pdf
+ */
+
+/* Double in place */
+static void ecc_point_double_jacobian(u64 *x1, u64 *y1, u64 *z1)
+{
+ /* t1 = x, t2 = y, t3 = z */
+ u64 t4[NUM_ECC_DIGITS];
+ u64 t5[NUM_ECC_DIGITS];
+
+ if (vli_is_zero(z1))
+ return;
+
+ vli_mod_square_fast(t4, y1); /* t4 = y1^2 */
+ vli_mod_mult_fast(t5, x1, t4); /* t5 = x1*y1^2 = A */
+ vli_mod_square_fast(t4, t4); /* t4 = y1^4 */
+ vli_mod_mult_fast(y1, y1, z1); /* t2 = y1*z1 = z3 */
+ vli_mod_square_fast(z1, z1); /* t3 = z1^2 */
+
+ vli_mod_add(x1, x1, z1, curve_p); /* t1 = x1 + z1^2 */
+ vli_mod_add(z1, z1, z1, curve_p); /* t3 = 2*z1^2 */
+ vli_mod_sub(z1, x1, z1, curve_p); /* t3 = x1 - z1^2 */
+ vli_mod_mult_fast(x1, x1, z1); /* t1 = x1^2 - z1^4 */
+
+ vli_mod_add(z1, x1, x1, curve_p); /* t3 = 2*(x1^2 - z1^4) */
+ vli_mod_add(x1, x1, z1, curve_p); /* t1 = 3*(x1^2 - z1^4) */
+ if (vli_test_bit(x1, 0)) {
+ u64 carry = vli_add(x1, x1, curve_p);
+ vli_rshift1(x1);
+ x1[NUM_ECC_DIGITS - 1] |= carry << 63;
+ } else {
+ vli_rshift1(x1);
+ }
+ /* t1 = 3/2*(x1^2 - z1^4) = B */
+
+ vli_mod_square_fast(z1, x1); /* t3 = B^2 */
+ vli_mod_sub(z1, z1, t5, curve_p); /* t3 = B^2 - A */
+ vli_mod_sub(z1, z1, t5, curve_p); /* t3 = B^2 - 2A = x3 */
+ vli_mod_sub(t5, t5, z1, curve_p); /* t5 = A - x3 */
+ vli_mod_mult_fast(x1, x1, t5); /* t1 = B * (A - x3) */
+ vli_mod_sub(t4, x1, t4, curve_p); /* t4 = B * (A - x3) - y1^4 = y3 */
+
+ vli_set(x1, z1);
+ vli_set(z1, y1);
+ vli_set(y1, t4);
+}
+
+/* Modify (x1, y1) => (x1 * z^2, y1 * z^3) */
+static void apply_z(u64 *x1, u64 *y1, u64 *z)
+{
+ u64 t1[NUM_ECC_DIGITS];
+
+ vli_mod_square_fast(t1, z); /* z^2 */
+ vli_mod_mult_fast(x1, x1, t1); /* x1 * z^2 */
+ vli_mod_mult_fast(t1, t1, z); /* z^3 */
+ vli_mod_mult_fast(y1, y1, t1); /* y1 * z^3 */
+}
+
+/* P = (x1, y1) => 2P, (x2, y2) => P' */
+static void xycz_initial_double(u64 *x1, u64 *y1, u64 *x2, u64 *y2,
+ u64 *p_initial_z)
+{
+ u64 z[NUM_ECC_DIGITS];
+
+ vli_set(x2, x1);
+ vli_set(y2, y1);
+
+ vli_clear(z);
+ z[0] = 1;
+
+ if (p_initial_z)
+ vli_set(z, p_initial_z);
+
+ apply_z(x1, y1, z);
+
+ ecc_point_double_jacobian(x1, y1, z);
+
+ apply_z(x2, y2, z);
+}
+
+/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
+ * Output P' = (x1', y1', Z3), P + Q = (x3, y3, Z3)
+ * or P => P', Q => P + Q
+ */
+static void xycz_add(u64 *x1, u64 *y1, u64 *x2, u64 *y2)
+{
+ /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
+ u64 t5[NUM_ECC_DIGITS];
+
+ vli_mod_sub(t5, x2, x1, curve_p); /* t5 = x2 - x1 */
+ vli_mod_square_fast(t5, t5); /* t5 = (x2 - x1)^2 = A */
+ vli_mod_mult_fast(x1, x1, t5); /* t1 = x1*A = B */
+ vli_mod_mult_fast(x2, x2, t5); /* t3 = x2*A = C */
+ vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y2 - y1 */
+ vli_mod_square_fast(t5, y2); /* t5 = (y2 - y1)^2 = D */
+
+ vli_mod_sub(t5, t5, x1, curve_p); /* t5 = D - B */
+ vli_mod_sub(t5, t5, x2, curve_p); /* t5 = D - B - C = x3 */
+ vli_mod_sub(x2, x2, x1, curve_p); /* t3 = C - B */
+ vli_mod_mult_fast(y1, y1, x2); /* t2 = y1*(C - B) */
+ vli_mod_sub(x2, x1, t5, curve_p); /* t3 = B - x3 */
+ vli_mod_mult_fast(y2, y2, x2); /* t4 = (y2 - y1)*(B - x3) */
+ vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y3 */
+
+ vli_set(x2, t5);
+}
+
+/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
+ * Output P + Q = (x3, y3, Z3), P - Q = (x3', y3', Z3)
+ * or P => P - Q, Q => P + Q
+ */
+static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2)
+{
+ /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
+ u64 t5[NUM_ECC_DIGITS];
+ u64 t6[NUM_ECC_DIGITS];
+ u64 t7[NUM_ECC_DIGITS];
+
+ vli_mod_sub(t5, x2, x1, curve_p); /* t5 = x2 - x1 */
+ vli_mod_square_fast(t5, t5); /* t5 = (x2 - x1)^2 = A */
+ vli_mod_mult_fast(x1, x1, t5); /* t1 = x1*A = B */
+ vli_mod_mult_fast(x2, x2, t5); /* t3 = x2*A = C */
+ vli_mod_add(t5, y2, y1, curve_p); /* t4 = y2 + y1 */
+ vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y2 - y1 */
+
+ vli_mod_sub(t6, x2, x1, curve_p); /* t6 = C - B */
+ vli_mod_mult_fast(y1, y1, t6); /* t2 = y1 * (C - B) */
+ vli_mod_add(t6, x1, x2, curve_p); /* t6 = B + C */
+ vli_mod_square_fast(x2, y2); /* t3 = (y2 - y1)^2 */
+ vli_mod_sub(x2, x2, t6, curve_p); /* t3 = x3 */
+
+ vli_mod_sub(t7, x1, x2, curve_p); /* t7 = B - x3 */
+ vli_mod_mult_fast(y2, y2, t7); /* t4 = (y2 - y1)*(B - x3) */
+ vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y3 */
+
+ vli_mod_square_fast(t7, t5); /* t7 = (y2 + y1)^2 = F */
+ vli_mod_sub(t7, t7, t6, curve_p); /* t7 = x3' */
+ vli_mod_sub(t6, t7, x1, curve_p); /* t6 = x3' - B */
+ vli_mod_mult_fast(t6, t6, t5); /* t6 = (y2 + y1)*(x3' - B) */
+ vli_mod_sub(y1, t6, y1, curve_p); /* t2 = y3' */
+
+ vli_set(x1, t7);
+}
+
+static void ecc_point_mult(struct ecc_point *result,
+ const struct ecc_point *point, u64 *scalar,
+ u64 *initial_z, int num_bits)
+{
+ /* R0 and R1 */
+ u64 rx[2][NUM_ECC_DIGITS];
+ u64 ry[2][NUM_ECC_DIGITS];
+ u64 z[NUM_ECC_DIGITS];
+ int i, nb;
+
+ vli_set(rx[1], point->x);
+ vli_set(ry[1], point->y);
+
+ xycz_initial_double(rx[1], ry[1], rx[0], ry[0], initial_z);
+
+ for (i = num_bits - 2; i > 0; i--) {
+ nb = !vli_test_bit(scalar, i);
+ xycz_add_c(rx[1 - nb], ry[1 - nb], rx[nb], ry[nb]);
+ xycz_add(rx[nb], ry[nb], rx[1 - nb], ry[1 - nb]);
+ }
+
+ nb = !vli_test_bit(scalar, 0);
+ xycz_add_c(rx[1 - nb], ry[1 - nb], rx[nb], ry[nb]);
+
+ /* Find final 1/Z value. */
+ vli_mod_sub(z, rx[1], rx[0], curve_p); /* X1 - X0 */
+ vli_mod_mult_fast(z, z, ry[1 - nb]); /* Yb * (X1 - X0) */
+ vli_mod_mult_fast(z, z, point->x); /* xP * Yb * (X1 - X0) */
+ vli_mod_inv(z, z, curve_p); /* 1 / (xP * Yb * (X1 - X0)) */
+ vli_mod_mult_fast(z, z, point->y); /* yP / (xP * Yb * (X1 - X0)) */
+ vli_mod_mult_fast(z, z, rx[1 - nb]); /* Xb * yP / (xP * Yb * (X1 - X0)) */
+ /* End 1/Z calculation */
+
+ xycz_add(rx[nb], ry[nb], rx[1 - nb], ry[1 - nb]);
+
+ apply_z(rx[0], ry[0], z);
+
+ vli_set(result->x, rx[0]);
+ vli_set(result->y, ry[0]);
+}
+
+static void ecc_bytes2native(const u8 bytes[ECC_BYTES],
+ u64 native[NUM_ECC_DIGITS])
+{
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ const u8 *digit = bytes + 8 * (NUM_ECC_DIGITS - 1 - i);
+
+ native[NUM_ECC_DIGITS - 1 - i] =
+ ((u64) digit[0] << 0) |
+ ((u64) digit[1] << 8) |
+ ((u64) digit[2] << 16) |
+ ((u64) digit[3] << 24) |
+ ((u64) digit[4] << 32) |
+ ((u64) digit[5] << 40) |
+ ((u64) digit[6] << 48) |
+ ((u64) digit[7] << 56);
+ }
+}
+
+static void ecc_native2bytes(const u64 native[NUM_ECC_DIGITS],
+ u8 bytes[ECC_BYTES])
+{
+ int i;
+
+ for (i = 0; i < NUM_ECC_DIGITS; i++) {
+ u8 *digit = bytes + 8 * (NUM_ECC_DIGITS - 1 - i);
+
+ digit[0] = native[NUM_ECC_DIGITS - 1 - i] >> 0;
+ digit[1] = native[NUM_ECC_DIGITS - 1 - i] >> 8;
+ digit[2] = native[NUM_ECC_DIGITS - 1 - i] >> 16;
+ digit[3] = native[NUM_ECC_DIGITS - 1 - i] >> 24;
+ digit[4] = native[NUM_ECC_DIGITS - 1 - i] >> 32;
+ digit[5] = native[NUM_ECC_DIGITS - 1 - i] >> 40;
+ digit[6] = native[NUM_ECC_DIGITS - 1 - i] >> 48;
+ digit[7] = native[NUM_ECC_DIGITS - 1 - i] >> 56;
+ }
+}
+
+bool ecc_make_key(u8 public_key[64], u8 private_key[32])
+{
+ struct ecc_point pk;
+ u64 priv[NUM_ECC_DIGITS];
+ unsigned int tries = 0;
+
+ do {
+ if (tries++ >= MAX_TRIES)
+ return false;
+
+ get_random_bytes(priv, ECC_BYTES);
+
+ if (vli_is_zero(priv))
+ continue;
+
+ /* Make sure the private key is in the range [1, n-1]. */
+ if (vli_cmp(curve_n, priv) != 1)
+ continue;
+
+ ecc_point_mult(&pk, &curve_g, priv, NULL, vli_num_bits(priv));
+ } while (ecc_point_is_zero(&pk));
+
+ ecc_native2bytes(priv, private_key);
+ ecc_native2bytes(pk.x, public_key);
+ ecc_native2bytes(pk.y, &public_key[32]);
+
+ return true;
+}
+
+bool ecdh_shared_secret(const u8 public_key[64], const u8 private_key[32],
+ u8 secret[32])
+{
+ u64 priv[NUM_ECC_DIGITS];
+ u64 rand[NUM_ECC_DIGITS];
+ struct ecc_point product, pk;
+
+ get_random_bytes(rand, ECC_BYTES);
+
+ ecc_bytes2native(public_key, pk.x);
+ ecc_bytes2native(&public_key[32], pk.y);
+ ecc_bytes2native(private_key, priv);
+
+ ecc_point_mult(&product, &pk, priv, rand, vli_num_bits(priv));
+
+ ecc_native2bytes(product.x, secret);
+
+ return !ecc_point_is_zero(&product);
+}
diff --git a/net/bluetooth/ecc.h b/net/bluetooth/ecc.h
new file mode 100644
index 000000000..8d6a2f4d1
--- /dev/null
+++ b/net/bluetooth/ecc.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2013, Kenneth MacKay
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Create a public/private key pair.
+ * Outputs:
+ * public_key - Will be filled in with the public key.
+ * private_key - Will be filled in with the private key.
+ *
+ * Returns true if the key pair was generated successfully, false
+ * if an error occurred. The keys are with the LSB first.
+ */
+bool ecc_make_key(u8 public_key[64], u8 private_key[32]);
+
+/* Compute a shared secret given your secret key and someone else's
+ * public key.
+ * Note: It is recommended that you hash the result of ecdh_shared_secret
+ * before using it for symmetric encryption or HMAC.
+ *
+ * Inputs:
+ * public_key - The public key of the remote party
+ * private_key - Your private key.
+ *
+ * Outputs:
+ * secret - Will be filled in with the shared secret value.
+ *
+ * Returns true if the shared secret was generated successfully, false
+ * if an error occurred. Both input and output parameters are with the
+ * LSB first.
+ */
+bool ecdh_shared_secret(const u8 public_key[64], const u8 private_key[32],
+ u8 secret[32]);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
new file mode 100644
index 000000000..ee5e59839
--- /dev/null
+++ b/net/bluetooth/hci_conn.c
@@ -0,0 +1,1383 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI connection handling. */
+
+#include <linux/export.h>
+#include <linux/debugfs.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "hci_request.h"
+#include "smp.h"
+#include "a2mp.h"
+
+struct sco_param {
+ u16 pkt_type;
+ u16 max_latency;
+ u8 retrans_effort;
+};
+
+static const struct sco_param esco_param_cvsd[] = {
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */
+ { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */
+ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */
+ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */
+};
+
+static const struct sco_param sco_param_cvsd[] = {
+ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */
+ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */
+};
+
+static const struct sco_param esco_param_msbc[] = {
+ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */
+ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */
+};
+
+static void hci_le_create_connection_cancel(struct hci_conn *conn)
+{
+ hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL);
+}
+
+static void hci_acl_create_connection(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct inquiry_entry *ie;
+ struct hci_cp_create_conn cp;
+
+ BT_DBG("hcon %p", conn);
+
+ conn->state = BT_CONNECT;
+ conn->out = true;
+ conn->role = HCI_ROLE_MASTER;
+
+ conn->attempt++;
+
+ conn->link_policy = hdev->link_policy;
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie) {
+ if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
+ cp.pscan_rep_mode = ie->data.pscan_rep_mode;
+ cp.pscan_mode = ie->data.pscan_mode;
+ cp.clock_offset = ie->data.clock_offset |
+ cpu_to_le16(0x8000);
+ }
+
+ memcpy(conn->dev_class, ie->data.dev_class, 3);
+ if (ie->data.ssp_mode > 0)
+ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+ }
+
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+ if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
+ cp.role_switch = 0x01;
+ else
+ cp.role_switch = 0x00;
+
+ hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp);
+}
+
+static void hci_acl_create_connection_cancel(struct hci_conn *conn)
+{
+ struct hci_cp_create_conn_cancel cp;
+
+ BT_DBG("hcon %p", conn);
+
+ if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return;
+
+ bacpy(&cp.bdaddr, &conn->dst);
+ hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp);
+}
+
+static void hci_reject_sco(struct hci_conn *conn)
+{
+ struct hci_cp_reject_sync_conn_req cp;
+
+ cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
+ bacpy(&cp.bdaddr, &conn->dst);
+
+ hci_send_cmd(conn->hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp);
+}
+
+int hci_disconnect(struct hci_conn *conn, __u8 reason)
+{
+ struct hci_cp_disconnect cp;
+
+ BT_DBG("hcon %p", conn);
+
+ /* When we are master of an established connection and it enters
+ * the disconnect timeout, then go ahead and try to read the
+ * current clock offset. Processing of the result is done
+ * within the event handling and hci_clock_offset_evt function.
+ */
+ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER) {
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_read_clock_offset clkoff_cp;
+
+ clkoff_cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp),
+ &clkoff_cp);
+ }
+
+ conn->state = BT_DISCONN;
+
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.reason = reason;
+ return hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp);
+}
+
+static void hci_amp_disconn(struct hci_conn *conn)
+{
+ struct hci_cp_disconn_phy_link cp;
+
+ BT_DBG("hcon %p", conn);
+
+ conn->state = BT_DISCONN;
+
+ cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
+ cp.reason = hci_proto_disconn_ind(conn);
+ hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK,
+ sizeof(cp), &cp);
+}
+
+static void hci_add_sco(struct hci_conn *conn, __u16 handle)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_add_sco cp;
+
+ BT_DBG("hcon %p", conn);
+
+ conn->state = BT_CONNECT;
+ conn->out = true;
+
+ conn->attempt++;
+
+ cp.handle = cpu_to_le16(handle);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
+}
+
+bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_setup_sync_conn cp;
+ const struct sco_param *param;
+
+ BT_DBG("hcon %p", conn);
+
+ conn->state = BT_CONNECT;
+ conn->out = true;
+
+ conn->attempt++;
+
+ cp.handle = cpu_to_le16(handle);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.voice_setting = cpu_to_le16(conn->setting);
+
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (conn->attempt > ARRAY_SIZE(esco_param_msbc))
+ return false;
+ param = &esco_param_msbc[conn->attempt - 1];
+ break;
+ case SCO_AIRMODE_CVSD:
+ if (lmp_esco_capable(conn->link)) {
+ if (conn->attempt > ARRAY_SIZE(esco_param_cvsd))
+ return false;
+ param = &esco_param_cvsd[conn->attempt - 1];
+ } else {
+ if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
+ return false;
+ param = &sco_param_cvsd[conn->attempt - 1];
+ }
+ break;
+ default:
+ return false;
+ }
+
+ cp.retrans_effort = param->retrans_effort;
+ cp.pkt_type = __cpu_to_le16(param->pkt_type);
+ cp.max_latency = __cpu_to_le16(param->max_latency);
+
+ if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
+ return false;
+
+ return true;
+}
+
+u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
+ u16 to_multiplier)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn_params *params;
+ struct hci_cp_le_conn_update cp;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ params->conn_min_interval = min;
+ params->conn_max_interval = max;
+ params->conn_latency = latency;
+ params->supervision_timeout = to_multiplier;
+ }
+
+ hci_dev_unlock(hdev);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.conn_interval_min = cpu_to_le16(min);
+ cp.conn_interval_max = cpu_to_le16(max);
+ cp.conn_latency = cpu_to_le16(latency);
+ cp.supervision_timeout = cpu_to_le16(to_multiplier);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp);
+
+ if (params)
+ return 0x01;
+
+ return 0x00;
+}
+
+void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
+ __u8 ltk[16])
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_start_enc cp;
+
+ BT_DBG("hcon %p", conn);
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.rand = rand;
+ cp.ediv = ediv;
+ memcpy(cp.ltk, ltk, sizeof(cp.ltk));
+
+ hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp);
+}
+
+/* Device _must_ be locked */
+void hci_sco_setup(struct hci_conn *conn, __u8 status)
+{
+ struct hci_conn *sco = conn->link;
+
+ if (!sco)
+ return;
+
+ BT_DBG("hcon %p", conn);
+
+ if (!status) {
+ if (lmp_esco_capable(conn->hdev))
+ hci_setup_sync(sco, conn->handle);
+ else
+ hci_add_sco(sco, conn->handle);
+ } else {
+ hci_connect_cfm(sco, status);
+ hci_conn_del(sco);
+ }
+}
+
+static void hci_conn_timeout(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ disc_work.work);
+ int refcnt = atomic_read(&conn->refcnt);
+
+ BT_DBG("hcon %p state %s", conn, state_to_string(conn->state));
+
+ WARN_ON(refcnt < 0);
+
+ /* FIXME: It was observed that in pairing failed scenario, refcnt
+ * drops below 0. Probably this is because l2cap_conn_del calls
+ * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is
+ * dropped. After that loop hci_chan_del is called which also drops
+ * conn. For now make sure that ACL is alive if refcnt is higher then 0,
+ * otherwise drop it.
+ */
+ if (refcnt > 0)
+ return;
+
+ switch (conn->state) {
+ case BT_CONNECT:
+ case BT_CONNECT2:
+ if (conn->out) {
+ if (conn->type == ACL_LINK)
+ hci_acl_create_connection_cancel(conn);
+ else if (conn->type == LE_LINK)
+ hci_le_create_connection_cancel(conn);
+ } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
+ hci_reject_sco(conn);
+ }
+ break;
+ case BT_CONFIG:
+ case BT_CONNECTED:
+ if (conn->type == AMP_LINK) {
+ hci_amp_disconn(conn);
+ } else {
+ __u8 reason = hci_proto_disconn_ind(conn);
+ hci_disconnect(conn, reason);
+ }
+ break;
+ default:
+ conn->state = BT_CLOSED;
+ break;
+ }
+}
+
+/* Enter sniff mode */
+static void hci_conn_idle(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ idle_work.work);
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("hcon %p mode %d", conn, conn->mode);
+
+ if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn))
+ return;
+
+ if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF))
+ return;
+
+ if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) {
+ struct hci_cp_sniff_subrate cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.max_latency = cpu_to_le16(0);
+ cp.min_remote_timeout = cpu_to_le16(0);
+ cp.min_local_timeout = cpu_to_le16(0);
+ hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp);
+ }
+
+ if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) {
+ struct hci_cp_sniff_mode cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.max_interval = cpu_to_le16(hdev->sniff_max_interval);
+ cp.min_interval = cpu_to_le16(hdev->sniff_min_interval);
+ cp.attempt = cpu_to_le16(4);
+ cp.timeout = cpu_to_le16(1);
+ hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp);
+ }
+}
+
+static void hci_conn_auto_accept(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ auto_accept_work.work);
+
+ hci_send_cmd(conn->hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst),
+ &conn->dst);
+}
+
+static void le_conn_timeout(struct work_struct *work)
+{
+ struct hci_conn *conn = container_of(work, struct hci_conn,
+ le_conn_timeout.work);
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("");
+
+ /* We could end up here due to having done directed advertising,
+ * so clean up the state if necessary. This should however only
+ * happen with broken hardware or if low duty cycle was used
+ * (which doesn't have a timeout of its own).
+ */
+ if (conn->role == HCI_ROLE_SLAVE) {
+ u8 enable = 0x00;
+ hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
+ return;
+ }
+
+ hci_le_create_connection_cancel(conn);
+}
+
+struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
+ u8 role)
+{
+ struct hci_conn *conn;
+
+ BT_DBG("%s dst %pMR", hdev->name, dst);
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn)
+ return NULL;
+
+ bacpy(&conn->dst, dst);
+ bacpy(&conn->src, &hdev->bdaddr);
+ conn->hdev = hdev;
+ conn->type = type;
+ conn->role = role;
+ conn->mode = HCI_CM_ACTIVE;
+ conn->state = BT_OPEN;
+ conn->auth_type = HCI_AT_GENERAL_BONDING;
+ conn->io_capability = hdev->io_capability;
+ conn->remote_auth = 0xff;
+ conn->key_type = 0xff;
+ conn->rssi = HCI_RSSI_INVALID;
+ conn->tx_power = HCI_TX_POWER_INVALID;
+ conn->max_tx_power = HCI_TX_POWER_INVALID;
+
+ set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+ if (conn->role == HCI_ROLE_MASTER)
+ conn->out = true;
+
+ switch (type) {
+ case ACL_LINK:
+ conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+ break;
+ case LE_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ break;
+ case SCO_LINK:
+ if (lmp_esco_capable(hdev))
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ else
+ conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
+ break;
+ case ESCO_LINK:
+ conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
+ break;
+ }
+
+ skb_queue_head_init(&conn->data_q);
+
+ INIT_LIST_HEAD(&conn->chan_list);
+
+ INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout);
+ INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
+ INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle);
+ INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout);
+
+ atomic_set(&conn->refcnt, 0);
+
+ hci_dev_hold(hdev);
+
+ hci_conn_hash_add(hdev, conn);
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
+
+ hci_conn_init_sysfs(conn);
+
+ return conn;
+}
+
+int hci_conn_del(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle);
+
+ cancel_delayed_work_sync(&conn->disc_work);
+ cancel_delayed_work_sync(&conn->auto_accept_work);
+ cancel_delayed_work_sync(&conn->idle_work);
+
+ if (conn->type == ACL_LINK) {
+ struct hci_conn *sco = conn->link;
+ if (sco)
+ sco->link = NULL;
+
+ /* Unacked frames */
+ hdev->acl_cnt += conn->sent;
+ } else if (conn->type == LE_LINK) {
+ cancel_delayed_work(&conn->le_conn_timeout);
+
+ if (hdev->le_pkts)
+ hdev->le_cnt += conn->sent;
+ else
+ hdev->acl_cnt += conn->sent;
+ } else {
+ struct hci_conn *acl = conn->link;
+ if (acl) {
+ acl->link = NULL;
+ hci_conn_drop(acl);
+ }
+ }
+
+ hci_chan_list_flush(conn);
+
+ if (conn->amp_mgr)
+ amp_mgr_put(conn->amp_mgr);
+
+ hci_conn_hash_del(hdev, conn);
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
+
+ skb_queue_purge(&conn->data_q);
+
+ hci_conn_del_sysfs(conn);
+
+ debugfs_remove_recursive(conn->debugfs);
+
+ if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags))
+ hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type);
+
+ hci_dev_put(hdev);
+
+ hci_conn_put(conn);
+
+ return 0;
+}
+
+struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
+{
+ int use_src = bacmp(src, BDADDR_ANY);
+ struct hci_dev *hdev = NULL, *d;
+
+ BT_DBG("%pMR -> %pMR", src, dst);
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (!test_bit(HCI_UP, &d->flags) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL) ||
+ d->dev_type != HCI_BREDR)
+ continue;
+
+ /* Simple routing:
+ * No source address - find interface with bdaddr != dst
+ * Source address - find interface with bdaddr == src
+ */
+
+ if (use_src) {
+ if (!bacmp(&d->bdaddr, src)) {
+ hdev = d; break;
+ }
+ } else {
+ if (bacmp(&d->bdaddr, dst)) {
+ hdev = d; break;
+ }
+ }
+ }
+
+ if (hdev)
+ hdev = hci_dev_hold(hdev);
+
+ read_unlock(&hci_dev_list_lock);
+ return hdev;
+}
+EXPORT_SYMBOL(hci_get_route);
+
+/* This function requires the caller holds hdev->lock */
+void hci_le_conn_failed(struct hci_conn *conn, u8 status)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn_params *params;
+
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
+ conn->dst_type);
+ if (params && params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
+
+ conn->state = BT_CLOSED;
+
+ mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type,
+ status);
+
+ hci_connect_cfm(conn, status);
+
+ hci_conn_del(conn);
+
+ /* Since we may have temporarily stopped the background scanning in
+ * favor of connection establishment, we should restart it.
+ */
+ hci_update_background_scan(hdev);
+
+ /* Re-enable advertising in case this was a failed connection
+ * attempt as a peripheral.
+ */
+ mgmt_reenable_advertising(hdev);
+}
+
+static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct hci_conn *conn;
+
+ if (status == 0)
+ return;
+
+ BT_ERR("HCI request failed to create LE connection: status 0x%2.2x",
+ status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (!conn)
+ goto done;
+
+ hci_le_conn_failed(conn, status);
+
+done:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_req_add_le_create_conn(struct hci_request *req,
+ struct hci_conn *conn)
+{
+ struct hci_cp_le_create_conn cp;
+ struct hci_dev *hdev = conn->hdev;
+ u8 own_addr_type;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Update random address, but set require_privacy to false so
+ * that we never connect with an non-resolvable address.
+ */
+ if (hci_update_random_address(req, false, &own_addr_type))
+ return;
+
+ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
+ cp.scan_window = cpu_to_le16(hdev->le_scan_window);
+ bacpy(&cp.peer_addr, &conn->dst);
+ cp.peer_addr_type = conn->dst_type;
+ cp.own_address_type = own_addr_type;
+ cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
+ cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+
+ conn->state = BT_CONNECT;
+}
+
+static void hci_req_directed_advertising(struct hci_request *req,
+ struct hci_conn *conn)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_adv_param cp;
+ u8 own_addr_type;
+ u8 enable;
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ if (hci_update_random_address(req, false, &own_addr_type) < 0)
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.type = LE_ADV_DIRECT_IND;
+ cp.own_address_type = own_addr_type;
+ cp.direct_addr_type = conn->dst_type;
+ bacpy(&cp.direct_addr, &conn->dst);
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+
+ enable = 0x01;
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+
+ conn->state = BT_CONNECT;
+}
+
+struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 dst_type, u8 sec_level, u16 conn_timeout,
+ u8 role)
+{
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ struct smp_irk *irk;
+ struct hci_request req;
+ int err;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ /* Some devices send ATT messages as soon as the physical link is
+ * established. To be able to handle these ATT messages, the user-
+ * space first establishes the connection and then starts the pairing
+ * process.
+ *
+ * So if a hci_conn object already exists for the following connection
+ * attempt, we simply update pending_sec_level and auth_type fields
+ * and return the object found.
+ */
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst);
+ if (conn) {
+ conn->pending_sec_level = sec_level;
+ goto done;
+ }
+
+ /* Since the controller supports only one LE connection attempt at a
+ * time, we return -EBUSY if there is any connection attempt running.
+ */
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (conn)
+ return ERR_PTR(-EBUSY);
+
+ /* When given an identity address with existing identity
+ * resolving key, the connection needs to be established
+ * to a resolvable random address.
+ *
+ * This uses the cached random resolvable address from
+ * a previous scan. When no cached address is available,
+ * try connecting to the identity address instead.
+ *
+ * Storing the resolvable random address is required here
+ * to handle connection failures. The address will later
+ * be resolved back into the original identity address
+ * from the connect request.
+ */
+ irk = hci_find_irk_by_addr(hdev, dst, dst_type);
+ if (irk && bacmp(&irk->rpa, BDADDR_ANY)) {
+ dst = &irk->rpa;
+ dst_type = ADDR_LE_DEV_RANDOM;
+ }
+
+ conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ if (!conn)
+ return ERR_PTR(-ENOMEM);
+
+ conn->dst_type = dst_type;
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->pending_sec_level = sec_level;
+ conn->conn_timeout = conn_timeout;
+
+ hci_req_init(&req, hdev);
+
+ /* Disable advertising if we're active. For master role
+ * connections most controllers will refuse to connect if
+ * advertising is enabled, and for slave role connections we
+ * anyway have to disable it in order to start directed
+ * advertising.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
+ u8 enable = 0x00;
+ hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
+
+ /* If requested to connect as slave use directed advertising */
+ if (conn->role == HCI_ROLE_SLAVE) {
+ /* If we're active scanning most controllers are unable
+ * to initiate advertising. Simply reject the attempt.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->le_scan_type == LE_SCAN_ACTIVE) {
+ skb_queue_purge(&req.cmd_q);
+ hci_conn_del(conn);
+ return ERR_PTR(-EBUSY);
+ }
+
+ hci_req_directed_advertising(&req, conn);
+ goto create_conn;
+ }
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ conn->le_conn_min_interval = params->conn_min_interval;
+ conn->le_conn_max_interval = params->conn_max_interval;
+ conn->le_conn_latency = params->conn_latency;
+ conn->le_supv_timeout = params->supervision_timeout;
+ } else {
+ conn->le_conn_min_interval = hdev->le_conn_min_interval;
+ conn->le_conn_max_interval = hdev->le_conn_max_interval;
+ conn->le_conn_latency = hdev->le_conn_latency;
+ conn->le_supv_timeout = hdev->le_supv_timeout;
+ }
+
+ /* If controller is scanning, we stop it since some controllers are
+ * not able to scan and connect at the same time. Also set the
+ * HCI_LE_SCAN_INTERRUPTED flag so that the command complete
+ * handler for scan disabling knows to set the correct discovery
+ * state.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_req_add_le_scan_disable(&req);
+ hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+ }
+
+ hci_req_add_le_create_conn(&req, conn);
+
+create_conn:
+ err = hci_req_run(&req, create_le_conn_complete);
+ if (err) {
+ hci_conn_del(conn);
+ return ERR_PTR(err);
+ }
+
+done:
+ hci_conn_hold(conn);
+ return conn;
+}
+
+struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
+ u8 sec_level, u8 auth_type)
+{
+ struct hci_conn *acl;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ if (lmp_bredr_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
+ if (!acl) {
+ acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ hci_conn_hold(acl);
+
+ if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+ acl->sec_level = BT_SECURITY_LOW;
+ acl->pending_sec_level = sec_level;
+ acl->auth_type = auth_type;
+ hci_acl_create_connection(acl);
+ }
+
+ return acl;
+}
+
+struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
+ __u16 setting)
+{
+ struct hci_conn *acl;
+ struct hci_conn *sco;
+
+ acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
+ if (IS_ERR(acl))
+ return acl;
+
+ sco = hci_conn_hash_lookup_ba(hdev, type, dst);
+ if (!sco) {
+ sco = hci_conn_add(hdev, type, dst, HCI_ROLE_MASTER);
+ if (!sco) {
+ hci_conn_drop(acl);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ acl->link = sco;
+ sco->link = acl;
+
+ hci_conn_hold(sco);
+
+ sco->setting = setting;
+
+ if (acl->state == BT_CONNECTED &&
+ (sco->state == BT_OPEN || sco->state == BT_CLOSED)) {
+ set_bit(HCI_CONN_POWER_SAVE, &acl->flags);
+ hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON);
+
+ if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->flags)) {
+ /* defer SCO setup until mode change completed */
+ set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->flags);
+ return sco;
+ }
+
+ hci_sco_setup(acl, 0x00);
+ }
+
+ return sco;
+}
+
+/* Check link security requirement */
+int hci_conn_check_link_mode(struct hci_conn *conn)
+{
+ BT_DBG("hcon %p", conn);
+
+ /* In Secure Connections Only mode, it is required that Secure
+ * Connections is used and the link is encrypted with AES-CCM
+ * using a P-256 authenticated combination key.
+ */
+ if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) {
+ if (!hci_conn_sc_enabled(conn) ||
+ !test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
+ conn->key_type != HCI_LK_AUTH_COMBINATION_P256)
+ return 0;
+ }
+
+ if (hci_conn_ssp_enabled(conn) &&
+ !test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ return 0;
+
+ return 1;
+}
+
+/* Authenticate remote device */
+static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
+{
+ BT_DBG("hcon %p", conn);
+
+ if (conn->pending_sec_level > sec_level)
+ sec_level = conn->pending_sec_level;
+
+ if (sec_level > conn->sec_level)
+ conn->pending_sec_level = sec_level;
+ else if (test_bit(HCI_CONN_AUTH, &conn->flags))
+ return 1;
+
+ /* Make sure we preserve an existing MITM requirement*/
+ auth_type |= (conn->auth_type & 0x01);
+
+ conn->auth_type = auth_type;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
+ struct hci_cp_auth_requested cp;
+
+ cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
+ sizeof(cp), &cp);
+
+ /* If we're already encrypted set the REAUTH_PEND flag,
+ * otherwise set the ENCRYPT_PEND.
+ */
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ set_bit(HCI_CONN_REAUTH_PEND, &conn->flags);
+ else
+ set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+ }
+
+ return 0;
+}
+
+/* Encrypt the the link */
+static void hci_conn_encrypt(struct hci_conn *conn)
+{
+ BT_DBG("hcon %p", conn);
+
+ if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) {
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.encrypt = 0x01;
+ hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ }
+}
+
+/* Enable security */
+int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
+ bool initiator)
+{
+ BT_DBG("hcon %p", conn);
+
+ if (conn->type == LE_LINK)
+ return smp_conn_security(conn, sec_level);
+
+ /* For sdp we don't need the link key. */
+ if (sec_level == BT_SECURITY_SDP)
+ return 1;
+
+ /* For non 2.1 devices and low security level we don't need the link
+ key. */
+ if (sec_level == BT_SECURITY_LOW && !hci_conn_ssp_enabled(conn))
+ return 1;
+
+ /* For other security levels we need the link key. */
+ if (!test_bit(HCI_CONN_AUTH, &conn->flags))
+ goto auth;
+
+ /* An authenticated FIPS approved combination key has sufficient
+ * security for security level 4. */
+ if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 &&
+ sec_level == BT_SECURITY_FIPS)
+ goto encrypt;
+
+ /* An authenticated combination key has sufficient security for
+ security level 3. */
+ if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 ||
+ conn->key_type == HCI_LK_AUTH_COMBINATION_P256) &&
+ sec_level == BT_SECURITY_HIGH)
+ goto encrypt;
+
+ /* An unauthenticated combination key has sufficient security for
+ security level 1 and 2. */
+ if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 ||
+ conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) &&
+ (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW))
+ goto encrypt;
+
+ /* A combination key has always sufficient security for the security
+ levels 1 or 2. High security level requires the combination key
+ is generated using maximum PIN code length (16).
+ For pre 2.1 units. */
+ if (conn->key_type == HCI_LK_COMBINATION &&
+ (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW ||
+ conn->pin_length == 16))
+ goto encrypt;
+
+auth:
+ if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags))
+ return 0;
+
+ if (initiator)
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ if (!hci_conn_auth(conn, sec_level, auth_type))
+ return 0;
+
+encrypt:
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ return 1;
+
+ hci_conn_encrypt(conn);
+ return 0;
+}
+EXPORT_SYMBOL(hci_conn_security);
+
+/* Check secure link requirement */
+int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level)
+{
+ BT_DBG("hcon %p", conn);
+
+ /* Accept if non-secure or higher security level is required */
+ if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS)
+ return 1;
+
+ /* Accept if secure or higher security level is already present */
+ if (conn->sec_level == BT_SECURITY_HIGH ||
+ conn->sec_level == BT_SECURITY_FIPS)
+ return 1;
+
+ /* Reject not secure link */
+ return 0;
+}
+EXPORT_SYMBOL(hci_conn_check_secure);
+
+/* Switch role */
+int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
+{
+ BT_DBG("hcon %p", conn);
+
+ if (role == conn->role)
+ return 1;
+
+ if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) {
+ struct hci_cp_switch_role cp;
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.role = role;
+ hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_conn_switch_role);
+
+/* Enter active mode */
+void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("hcon %p mode %d", conn, conn->mode);
+
+ if (conn->mode != HCI_CM_SNIFF)
+ goto timer;
+
+ if (!test_bit(HCI_CONN_POWER_SAVE, &conn->flags) && !force_active)
+ goto timer;
+
+ if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) {
+ struct hci_cp_exit_sniff_mode cp;
+ cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp);
+ }
+
+timer:
+ if (hdev->idle_timeout > 0)
+ queue_delayed_work(hdev->workqueue, &conn->idle_work,
+ msecs_to_jiffies(hdev->idle_timeout));
+}
+
+/* Drop all connection on the device */
+void hci_conn_hash_flush(struct hci_dev *hdev)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *c, *n;
+
+ BT_DBG("hdev %s", hdev->name);
+
+ list_for_each_entry_safe(c, n, &h->list, list) {
+ c->state = BT_CLOSED;
+
+ hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM);
+ hci_conn_del(c);
+ }
+}
+
+/* Check pending connect attempts */
+void hci_conn_check_pending(struct hci_dev *hdev)
+{
+ struct hci_conn *conn;
+
+ BT_DBG("hdev %s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
+ if (conn)
+ hci_acl_create_connection(conn);
+
+ hci_dev_unlock(hdev);
+}
+
+static u32 get_link_mode(struct hci_conn *conn)
+{
+ u32 link_mode = 0;
+
+ if (conn->role == HCI_ROLE_MASTER)
+ link_mode |= HCI_LM_MASTER;
+
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ link_mode |= HCI_LM_ENCRYPT;
+
+ if (test_bit(HCI_CONN_AUTH, &conn->flags))
+ link_mode |= HCI_LM_AUTH;
+
+ if (test_bit(HCI_CONN_SECURE, &conn->flags))
+ link_mode |= HCI_LM_SECURE;
+
+ if (test_bit(HCI_CONN_FIPS, &conn->flags))
+ link_mode |= HCI_LM_FIPS;
+
+ return link_mode;
+}
+
+int hci_get_conn_list(void __user *arg)
+{
+ struct hci_conn *c;
+ struct hci_conn_list_req req, *cl;
+ struct hci_conn_info *ci;
+ struct hci_dev *hdev;
+ int n = 0, size, err;
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci))
+ return -EINVAL;
+
+ size = sizeof(req) + req.conn_num * sizeof(*ci);
+
+ cl = kmalloc(size, GFP_KERNEL);
+ if (!cl)
+ return -ENOMEM;
+
+ hdev = hci_dev_get(req.dev_id);
+ if (!hdev) {
+ kfree(cl);
+ return -ENODEV;
+ }
+
+ ci = cl->conn_info;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
+ bacpy(&(ci + n)->bdaddr, &c->dst);
+ (ci + n)->handle = c->handle;
+ (ci + n)->type = c->type;
+ (ci + n)->out = c->out;
+ (ci + n)->state = c->state;
+ (ci + n)->link_mode = get_link_mode(c);
+ if (++n >= req.conn_num)
+ break;
+ }
+ hci_dev_unlock(hdev);
+
+ cl->dev_id = hdev->id;
+ cl->conn_num = n;
+ size = sizeof(req) + n * sizeof(*ci);
+
+ hci_dev_put(hdev);
+
+ err = copy_to_user(arg, cl, size);
+ kfree(cl);
+
+ return err ? -EFAULT : 0;
+}
+
+int hci_get_conn_info(struct hci_dev *hdev, void __user *arg)
+{
+ struct hci_conn_info_req req;
+ struct hci_conn_info ci;
+ struct hci_conn *conn;
+ char __user *ptr = arg + sizeof(req);
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr);
+ if (conn) {
+ bacpy(&ci.bdaddr, &conn->dst);
+ ci.handle = conn->handle;
+ ci.type = conn->type;
+ ci.out = conn->out;
+ ci.state = conn->state;
+ ci.link_mode = get_link_mode(conn);
+ }
+ hci_dev_unlock(hdev);
+
+ if (!conn)
+ return -ENOENT;
+
+ return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0;
+}
+
+int hci_get_auth_info(struct hci_dev *hdev, void __user *arg)
+{
+ struct hci_auth_info_req req;
+ struct hci_conn *conn;
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr);
+ if (conn)
+ req.type = conn->auth_type;
+ hci_dev_unlock(hdev);
+
+ if (!conn)
+ return -ENOENT;
+
+ return copy_to_user(arg, &req, sizeof(req)) ? -EFAULT : 0;
+}
+
+struct hci_chan *hci_chan_create(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_chan *chan;
+
+ BT_DBG("%s hcon %p", hdev->name, conn);
+
+ if (test_bit(HCI_CONN_DROP, &conn->flags)) {
+ BT_DBG("Refusing to create new hci_chan");
+ return NULL;
+ }
+
+ chan = kzalloc(sizeof(*chan), GFP_KERNEL);
+ if (!chan)
+ return NULL;
+
+ chan->conn = hci_conn_get(conn);
+ skb_queue_head_init(&chan->data_q);
+ chan->state = BT_CONNECTED;
+
+ list_add_rcu(&chan->list, &conn->chan_list);
+
+ return chan;
+}
+
+void hci_chan_del(struct hci_chan *chan)
+{
+ struct hci_conn *conn = chan->conn;
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s hcon %p chan %p", hdev->name, conn, chan);
+
+ list_del_rcu(&chan->list);
+
+ synchronize_rcu();
+
+ /* Prevent new hci_chan's to be created for this hci_conn */
+ set_bit(HCI_CONN_DROP, &conn->flags);
+
+ hci_conn_put(conn);
+
+ skb_queue_purge(&chan->data_q);
+ kfree(chan);
+}
+
+void hci_chan_list_flush(struct hci_conn *conn)
+{
+ struct hci_chan *chan, *n;
+
+ BT_DBG("hcon %p", conn);
+
+ list_for_each_entry_safe(chan, n, &conn->chan_list, list)
+ hci_chan_del(chan);
+}
+
+static struct hci_chan *__hci_chan_lookup_handle(struct hci_conn *hcon,
+ __u16 handle)
+{
+ struct hci_chan *hchan;
+
+ list_for_each_entry(hchan, &hcon->chan_list, list) {
+ if (hchan->handle == handle)
+ return hchan;
+ }
+
+ return NULL;
+}
+
+struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *hcon;
+ struct hci_chan *hchan = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(hcon, &h->list, list) {
+ hchan = __hci_chan_lookup_handle(hcon, handle);
+ if (hchan)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return hchan;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
new file mode 100644
index 000000000..c4802f3bd
--- /dev/null
+++ b/net/bluetooth/hci_core.c
@@ -0,0 +1,4241 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2011 ProFUSION Embedded Systems
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI core. */
+
+#include <linux/export.h>
+#include <linux/idr.h>
+#include <linux/rfkill.h>
+#include <linux/debugfs.h>
+#include <linux/crypto.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "hci_request.h"
+#include "hci_debugfs.h"
+#include "smp.h"
+
+static void hci_rx_work(struct work_struct *work);
+static void hci_cmd_work(struct work_struct *work);
+static void hci_tx_work(struct work_struct *work);
+
+/* HCI device list */
+LIST_HEAD(hci_dev_list);
+DEFINE_RWLOCK(hci_dev_list_lock);
+
+/* HCI callback list */
+LIST_HEAD(hci_cb_list);
+DEFINE_MUTEX(hci_cb_list_lock);
+
+/* HCI ID Numbering */
+static DEFINE_IDA(hci_index_ida);
+
+/* ----- HCI requests ----- */
+
+#define HCI_REQ_DONE 0
+#define HCI_REQ_PEND 1
+#define HCI_REQ_CANCELED 2
+
+#define hci_req_lock(d) mutex_lock(&d->req_lock)
+#define hci_req_unlock(d) mutex_unlock(&d->req_lock)
+
+/* ---- HCI notifications ---- */
+
+static void hci_notify(struct hci_dev *hdev, int event)
+{
+ hci_sock_dev_event(hdev, event);
+}
+
+/* ---- HCI debugfs entries ---- */
+
+static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ struct sk_buff *skb;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf)-1));
+ bool enable;
+ int err;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENETDOWN;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ if (strtobool(buf, &enable))
+ return -EINVAL;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
+ return -EALREADY;
+
+ hci_req_lock(hdev);
+ if (enable)
+ skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ else
+ skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ hci_req_unlock(hdev);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ err = -bt_to_errno(skb->data[0]);
+ kfree_skb(skb);
+
+ if (err < 0)
+ return err;
+
+ hci_dev_change_flag(hdev, HCI_DUT_MODE);
+
+ return count;
+}
+
+static const struct file_operations dut_mode_fops = {
+ .open = simple_open,
+ .read = dut_mode_read,
+ .write = dut_mode_write,
+ .llseek = default_llseek,
+};
+
+/* ---- HCI requests ---- */
+
+static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
+ struct sk_buff *skb)
+{
+ BT_DBG("%s result 0x%2.2x", hdev->name, result);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ hdev->req_result = result;
+ hdev->req_status = HCI_REQ_DONE;
+ if (skb)
+ hdev->req_skb = skb_get(skb);
+ wake_up_interruptible(&hdev->req_wait_q);
+ }
+}
+
+static void hci_req_cancel(struct hci_dev *hdev, int err)
+{
+ BT_DBG("%s err 0x%2.2x", hdev->name, err);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ hdev->req_result = err;
+ hdev->req_status = HCI_REQ_CANCELED;
+ wake_up_interruptible(&hdev->req_wait_q);
+ }
+}
+
+struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u8 event, u32 timeout)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct hci_request req;
+ struct sk_buff *skb;
+ int err = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add_ev(&req, opcode, plen, param, event);
+
+ hdev->req_status = HCI_REQ_PEND;
+
+ add_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = hci_req_run_skb(&req, hci_req_sync_complete);
+ if (err < 0) {
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_RUNNING);
+ return ERR_PTR(err);
+ }
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+
+ if (signal_pending(current))
+ return ERR_PTR(-EINTR);
+
+ switch (hdev->req_status) {
+ case HCI_REQ_DONE:
+ err = -bt_to_errno(hdev->req_result);
+ break;
+
+ case HCI_REQ_CANCELED:
+ err = -hdev->req_result;
+ break;
+
+ default:
+ err = -ETIMEDOUT;
+ break;
+ }
+
+ hdev->req_status = hdev->req_result = 0;
+ skb = hdev->req_skb;
+ hdev->req_skb = NULL;
+
+ BT_DBG("%s end: err %d", hdev->name, err);
+
+ if (err < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ if (!skb)
+ return ERR_PTR(-ENODATA);
+
+ return skb;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_ev);
+
+struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout);
+}
+EXPORT_SYMBOL(__hci_cmd_sync);
+
+/* Execute request and wait for completion. */
+static int __hci_req_sync(struct hci_dev *hdev,
+ void (*func)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, __u32 timeout)
+{
+ struct hci_request req;
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+
+ BT_DBG("%s start", hdev->name);
+
+ hci_req_init(&req, hdev);
+
+ hdev->req_status = HCI_REQ_PEND;
+
+ func(&req, opt);
+
+ add_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = hci_req_run_skb(&req, hci_req_sync_complete);
+ if (err < 0) {
+ hdev->req_status = 0;
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_RUNNING);
+
+ /* ENODATA means the HCI request command queue is empty.
+ * This can happen when a request with conditionals doesn't
+ * trigger any commands to be sent. This is normal behavior
+ * and should not trigger an error return.
+ */
+ if (err == -ENODATA)
+ return 0;
+
+ return err;
+ }
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ switch (hdev->req_status) {
+ case HCI_REQ_DONE:
+ err = -bt_to_errno(hdev->req_result);
+ break;
+
+ case HCI_REQ_CANCELED:
+ err = -hdev->req_result;
+ break;
+
+ default:
+ err = -ETIMEDOUT;
+ break;
+ }
+
+ hdev->req_status = hdev->req_result = 0;
+
+ BT_DBG("%s end: err %d", hdev->name, err);
+
+ return err;
+}
+
+static int hci_req_sync(struct hci_dev *hdev,
+ void (*req)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, __u32 timeout)
+{
+ int ret;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENETDOWN;
+
+ /* Serialize all requests */
+ hci_req_lock(hdev);
+ ret = __hci_req_sync(hdev, req, opt, timeout);
+ hci_req_unlock(hdev);
+
+ return ret;
+}
+
+static void hci_reset_req(struct hci_request *req, unsigned long opt)
+{
+ BT_DBG("%s %ld", req->hdev->name, opt);
+
+ /* Reset device */
+ set_bit(HCI_RESET, &req->hdev->flags);
+ hci_req_add(req, HCI_OP_RESET, 0, NULL);
+}
+
+static void bredr_init(struct hci_request *req)
+{
+ req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED;
+
+ /* Read Local Supported Features */
+ hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
+
+ /* Read Local Version */
+ hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
+
+ /* Read BD Address */
+ hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL);
+}
+
+static void amp_init1(struct hci_request *req)
+{
+ req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED;
+
+ /* Read Local Version */
+ hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
+
+ /* Read Local Supported Commands */
+ hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL);
+
+ /* Read Local AMP Info */
+ hci_req_add(req, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
+
+ /* Read Data Blk size */
+ hci_req_add(req, HCI_OP_READ_DATA_BLOCK_SIZE, 0, NULL);
+
+ /* Read Flow Control Mode */
+ hci_req_add(req, HCI_OP_READ_FLOW_CONTROL_MODE, 0, NULL);
+
+ /* Read Location Data */
+ hci_req_add(req, HCI_OP_READ_LOCATION_DATA, 0, NULL);
+}
+
+static void amp_init2(struct hci_request *req)
+{
+ /* Read Local Supported Features. Not all AMP controllers
+ * support this so it's placed conditionally in the second
+ * stage init.
+ */
+ if (req->hdev->commands[14] & 0x20)
+ hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
+}
+
+static void hci_init1_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ BT_DBG("%s %ld", hdev->name, opt);
+
+ /* Reset */
+ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks))
+ hci_reset_req(req, 0);
+
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ bredr_init(req);
+ break;
+
+ case HCI_AMP:
+ amp_init1(req);
+ break;
+
+ default:
+ BT_ERR("Unknown device type %d", hdev->dev_type);
+ break;
+ }
+}
+
+static void bredr_setup(struct hci_request *req)
+{
+ __le16 param;
+ __u8 flt_type;
+
+ /* Read Buffer Size (ACL mtu, max pkt, etc.) */
+ hci_req_add(req, HCI_OP_READ_BUFFER_SIZE, 0, NULL);
+
+ /* Read Class of Device */
+ hci_req_add(req, HCI_OP_READ_CLASS_OF_DEV, 0, NULL);
+
+ /* Read Local Name */
+ hci_req_add(req, HCI_OP_READ_LOCAL_NAME, 0, NULL);
+
+ /* Read Voice Setting */
+ hci_req_add(req, HCI_OP_READ_VOICE_SETTING, 0, NULL);
+
+ /* Read Number of Supported IAC */
+ hci_req_add(req, HCI_OP_READ_NUM_SUPPORTED_IAC, 0, NULL);
+
+ /* Read Current IAC LAP */
+ hci_req_add(req, HCI_OP_READ_CURRENT_IAC_LAP, 0, NULL);
+
+ /* Clear Event Filters */
+ flt_type = HCI_FLT_CLEAR_ALL;
+ hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &flt_type);
+
+ /* Connection accept timeout ~20 secs */
+ param = cpu_to_le16(0x7d00);
+ hci_req_add(req, HCI_OP_WRITE_CA_TIMEOUT, 2, &param);
+}
+
+static void le_setup(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* Read LE Buffer Size */
+ hci_req_add(req, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL);
+
+ /* Read LE Local Supported Features */
+ hci_req_add(req, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL);
+
+ /* Read LE Supported States */
+ hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL);
+
+ /* Read LE White List Size */
+ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL);
+
+ /* Clear LE White List */
+ hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
+
+ /* LE-only controllers have LE implicitly enabled */
+ if (!lmp_bredr_capable(hdev))
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+}
+
+static void hci_setup_event_mask(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* The second byte is 0xff instead of 0x9f (two reserved bits
+ * disabled) since a Broadcom 1.2 dongle doesn't respond to the
+ * command otherwise.
+ */
+ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 };
+
+ /* CSR 1.1 dongles does not accept any bitfield so don't try to set
+ * any event mask for pre 1.2 devices.
+ */
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return;
+
+ if (lmp_bredr_capable(hdev)) {
+ events[4] |= 0x01; /* Flow Specification Complete */
+ events[4] |= 0x02; /* Inquiry Result with RSSI */
+ events[4] |= 0x04; /* Read Remote Extended Features Complete */
+ events[5] |= 0x08; /* Synchronous Connection Complete */
+ events[5] |= 0x10; /* Synchronous Connection Changed */
+ } else {
+ /* Use a different default for LE-only devices */
+ memset(events, 0, sizeof(events));
+ events[0] |= 0x10; /* Disconnection Complete */
+ events[1] |= 0x08; /* Read Remote Version Information Complete */
+ events[1] |= 0x20; /* Command Complete */
+ events[1] |= 0x40; /* Command Status */
+ events[1] |= 0x80; /* Hardware Error */
+ events[2] |= 0x04; /* Number of Completed Packets */
+ events[3] |= 0x02; /* Data Buffer Overflow */
+
+ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) {
+ events[0] |= 0x80; /* Encryption Change */
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+ }
+ }
+
+ if (lmp_inq_rssi_capable(hdev))
+ events[4] |= 0x02; /* Inquiry Result with RSSI */
+
+ if (lmp_sniffsubr_capable(hdev))
+ events[5] |= 0x20; /* Sniff Subrating */
+
+ if (lmp_pause_enc_capable(hdev))
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+
+ if (lmp_ext_inq_capable(hdev))
+ events[5] |= 0x40; /* Extended Inquiry Result */
+
+ if (lmp_no_flush_capable(hdev))
+ events[7] |= 0x01; /* Enhanced Flush Complete */
+
+ if (lmp_lsto_capable(hdev))
+ events[6] |= 0x80; /* Link Supervision Timeout Changed */
+
+ if (lmp_ssp_capable(hdev)) {
+ events[6] |= 0x01; /* IO Capability Request */
+ events[6] |= 0x02; /* IO Capability Response */
+ events[6] |= 0x04; /* User Confirmation Request */
+ events[6] |= 0x08; /* User Passkey Request */
+ events[6] |= 0x10; /* Remote OOB Data Request */
+ events[6] |= 0x20; /* Simple Pairing Complete */
+ events[7] |= 0x04; /* User Passkey Notification */
+ events[7] |= 0x08; /* Keypress Notification */
+ events[7] |= 0x10; /* Remote Host Supported
+ * Features Notification
+ */
+ }
+
+ if (lmp_le_capable(hdev))
+ events[7] |= 0x20; /* LE Meta-Event */
+
+ hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events);
+}
+
+static void hci_init2_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ if (hdev->dev_type == HCI_AMP)
+ return amp_init2(req);
+
+ if (lmp_bredr_capable(hdev))
+ bredr_setup(req);
+ else
+ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
+
+ if (lmp_le_capable(hdev))
+ le_setup(req);
+
+ /* All Bluetooth 1.2 and later controllers should support the
+ * HCI command for reading the local supported commands.
+ *
+ * Unfortunately some controllers indicate Bluetooth 1.2 support,
+ * but do not have support for this command. If that is the case,
+ * the driver can quirk the behavior and skip reading the local
+ * supported commands.
+ */
+ if (hdev->hci_ver > BLUETOOTH_VER_1_1 &&
+ !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks))
+ hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL);
+
+ if (lmp_ssp_capable(hdev)) {
+ /* When SSP is available, then the host features page
+ * should also be available as well. However some
+ * controllers list the max_page as 0 as long as SSP
+ * has not been enabled. To achieve proper debugging
+ * output, force the minimum max_page to 1 at least.
+ */
+ hdev->max_page = 0x01;
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ u8 mode = 0x01;
+
+ hci_req_add(req, HCI_OP_WRITE_SSP_MODE,
+ sizeof(mode), &mode);
+ } else {
+ struct hci_cp_write_eir cp;
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+ memset(&cp, 0, sizeof(cp));
+
+ hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
+ }
+ }
+
+ if (lmp_inq_rssi_capable(hdev) ||
+ test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) {
+ u8 mode;
+
+ /* If Extended Inquiry Result events are supported, then
+ * they are clearly preferred over Inquiry Result with RSSI
+ * events.
+ */
+ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01;
+
+ hci_req_add(req, HCI_OP_WRITE_INQUIRY_MODE, 1, &mode);
+ }
+
+ if (lmp_inq_tx_pwr_capable(hdev))
+ hci_req_add(req, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL);
+
+ if (lmp_ext_feat_capable(hdev)) {
+ struct hci_cp_read_local_ext_features cp;
+
+ cp.page = 0x01;
+ hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES,
+ sizeof(cp), &cp);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) {
+ u8 enable = 1;
+ hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable),
+ &enable);
+ }
+}
+
+static void hci_setup_link_policy(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_def_link_policy cp;
+ u16 link_policy = 0;
+
+ if (lmp_rswitch_capable(hdev))
+ link_policy |= HCI_LP_RSWITCH;
+ if (lmp_hold_capable(hdev))
+ link_policy |= HCI_LP_HOLD;
+ if (lmp_sniff_capable(hdev))
+ link_policy |= HCI_LP_SNIFF;
+ if (lmp_park_capable(hdev))
+ link_policy |= HCI_LP_PARK;
+
+ cp.policy = cpu_to_le16(link_policy);
+ hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp);
+}
+
+static void hci_set_le_support(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_le_host_supported cp;
+
+ /* LE-only devices do not support explicit enablement */
+ if (!lmp_bredr_capable(hdev))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ cp.le = 0x01;
+ cp.simul = 0x00;
+ }
+
+ if (cp.le != lmp_host_le_capable(hdev))
+ hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp),
+ &cp);
+}
+
+static void hci_set_event_mask_page_2(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+ /* If Connectionless Slave Broadcast master role is supported
+ * enable all necessary events for it.
+ */
+ if (lmp_csb_master_capable(hdev)) {
+ events[1] |= 0x40; /* Triggered Clock Capture */
+ events[1] |= 0x80; /* Synchronization Train Complete */
+ events[2] |= 0x10; /* Slave Page Response Timeout */
+ events[2] |= 0x20; /* CSB Channel Map Change */
+ }
+
+ /* If Connectionless Slave Broadcast slave role is supported
+ * enable all necessary events for it.
+ */
+ if (lmp_csb_slave_capable(hdev)) {
+ events[2] |= 0x01; /* Synchronization Train Received */
+ events[2] |= 0x02; /* CSB Receive */
+ events[2] |= 0x04; /* CSB Timeout */
+ events[2] |= 0x08; /* Truncated Page Complete */
+ }
+
+ /* Enable Authenticated Payload Timeout Expired event if supported */
+ if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING)
+ events[2] |= 0x80;
+
+ hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events);
+}
+
+static void hci_init3_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 p;
+
+ hci_setup_event_mask(req);
+
+ if (hdev->commands[6] & 0x20) {
+ struct hci_cp_read_stored_link_key cp;
+
+ bacpy(&cp.bdaddr, BDADDR_ANY);
+ cp.read_all = 0x01;
+ hci_req_add(req, HCI_OP_READ_STORED_LINK_KEY, sizeof(cp), &cp);
+ }
+
+ if (hdev->commands[5] & 0x10)
+ hci_setup_link_policy(req);
+
+ if (hdev->commands[8] & 0x01)
+ hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL);
+
+ /* Some older Broadcom based Bluetooth 1.2 controllers do not
+ * support the Read Page Scan Type command. Check support for
+ * this command in the bit mask of supported commands.
+ */
+ if (hdev->commands[13] & 0x01)
+ hci_req_add(req, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL);
+
+ if (lmp_le_capable(hdev)) {
+ u8 events[8];
+
+ memset(events, 0, sizeof(events));
+ events[0] = 0x0f;
+
+ if (hdev->le_features[0] & HCI_LE_ENCRYPTION)
+ events[0] |= 0x10; /* LE Long Term Key Request */
+
+ /* If controller supports the Connection Parameters Request
+ * Link Layer Procedure, enable the corresponding event.
+ */
+ if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC)
+ events[0] |= 0x20; /* LE Remote Connection
+ * Parameter Request
+ */
+
+ /* If the controller supports the Data Length Extension
+ * feature, enable the corresponding event.
+ */
+ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)
+ events[0] |= 0x40; /* LE Data Length Change */
+
+ /* If the controller supports Extended Scanner Filter
+ * Policies, enable the correspondig event.
+ */
+ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)
+ events[1] |= 0x04; /* LE Direct Advertising
+ * Report
+ */
+
+ /* If the controller supports the LE Read Local P-256
+ * Public Key command, enable the corresponding event.
+ */
+ if (hdev->commands[34] & 0x02)
+ events[0] |= 0x80; /* LE Read Local P-256
+ * Public Key Complete
+ */
+
+ /* If the controller supports the LE Generate DHKey
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[34] & 0x04)
+ events[1] |= 0x01; /* LE Generate DHKey Complete */
+
+ hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events),
+ events);
+
+ if (hdev->commands[25] & 0x40) {
+ /* Read LE Advertising Channel TX Power */
+ hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
+ }
+
+ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
+ /* Read LE Maximum Data Length */
+ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
+
+ /* Read LE Suggested Default Data Length */
+ hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL);
+ }
+
+ hci_set_le_support(req);
+ }
+
+ /* Read features beyond page 1 if available */
+ for (p = 2; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) {
+ struct hci_cp_read_local_ext_features cp;
+
+ cp.page = p;
+ hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES,
+ sizeof(cp), &cp);
+ }
+}
+
+static void hci_init4_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* Some Broadcom based Bluetooth controllers do not support the
+ * Delete Stored Link Key command. They are clearly indicating its
+ * absence in the bit mask of supported commands.
+ *
+ * Check the supported commands and only if the the command is marked
+ * as supported send it. If not supported assume that the controller
+ * does not have actual support for stored link keys which makes this
+ * command redundant anyway.
+ *
+ * Some controllers indicate that they support handling deleting
+ * stored link keys, but they don't. The quirk lets a driver
+ * just disable this command.
+ */
+ if (hdev->commands[6] & 0x80 &&
+ !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) {
+ struct hci_cp_delete_stored_link_key cp;
+
+ bacpy(&cp.bdaddr, BDADDR_ANY);
+ cp.delete_all = 0x01;
+ hci_req_add(req, HCI_OP_DELETE_STORED_LINK_KEY,
+ sizeof(cp), &cp);
+ }
+
+ /* Set event mask page 2 if the HCI command for it is supported */
+ if (hdev->commands[22] & 0x04)
+ hci_set_event_mask_page_2(req);
+
+ /* Read local codec list if the HCI command is supported */
+ if (hdev->commands[29] & 0x20)
+ hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL);
+
+ /* Get MWS transport configuration if the HCI command is supported */
+ if (hdev->commands[30] & 0x08)
+ hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL);
+
+ /* Check for Synchronization Train support */
+ if (lmp_sync_train_capable(hdev))
+ hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL);
+
+ /* Enable Secure Connections if supported and configured */
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
+ bredr_sc_enabled(hdev)) {
+ u8 support = 0x01;
+
+ hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(support), &support);
+ }
+}
+
+static int __hci_init(struct hci_dev *hdev)
+{
+ int err;
+
+ err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT);
+ if (err < 0)
+ return err;
+
+ /* The Device Under Test (DUT) mode is special and available for
+ * all controller types. So just create it early on.
+ */
+ if (hci_dev_test_flag(hdev, HCI_SETUP)) {
+ debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
+ &dut_mode_fops);
+ }
+
+ err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT);
+ if (err < 0)
+ return err;
+
+ /* HCI_BREDR covers both single-mode LE, BR/EDR and dual-mode
+ * BR/EDR/LE type controllers. AMP controllers only need the
+ * first two stages of init.
+ */
+ if (hdev->dev_type != HCI_BREDR)
+ return 0;
+
+ err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT);
+ if (err < 0)
+ return err;
+
+ err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT);
+ if (err < 0)
+ return err;
+
+ /* This function is only called when the controller is actually in
+ * configured state. When the controller is marked as unconfigured,
+ * this initialization procedure is not run.
+ *
+ * It means that it is possible that a controller runs through its
+ * setup phase and then discovers missing settings. If that is the
+ * case, then this function will not be called. It then will only
+ * be called during the config phase.
+ *
+ * So only when in setup phase or config phase, create the debugfs
+ * entries and register the SMP channels.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG))
+ return 0;
+
+ hci_debugfs_create_common(hdev);
+
+ if (lmp_bredr_capable(hdev))
+ hci_debugfs_create_bredr(hdev);
+
+ if (lmp_le_capable(hdev))
+ hci_debugfs_create_le(hdev);
+
+ return 0;
+}
+
+static void hci_init0_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ BT_DBG("%s %ld", hdev->name, opt);
+
+ /* Reset */
+ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks))
+ hci_reset_req(req, 0);
+
+ /* Read Local Version */
+ hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
+
+ /* Read BD Address */
+ if (hdev->set_bdaddr)
+ hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL);
+}
+
+static int __hci_unconf_init(struct hci_dev *hdev)
+{
+ int err;
+
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ return 0;
+
+ err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void hci_scan_req(struct hci_request *req, unsigned long opt)
+{
+ __u8 scan = opt;
+
+ BT_DBG("%s %x", req->hdev->name, scan);
+
+ /* Inquiry and Page scans */
+ hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+}
+
+static void hci_auth_req(struct hci_request *req, unsigned long opt)
+{
+ __u8 auth = opt;
+
+ BT_DBG("%s %x", req->hdev->name, auth);
+
+ /* Authentication */
+ hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth);
+}
+
+static void hci_encrypt_req(struct hci_request *req, unsigned long opt)
+{
+ __u8 encrypt = opt;
+
+ BT_DBG("%s %x", req->hdev->name, encrypt);
+
+ /* Encryption */
+ hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt);
+}
+
+static void hci_linkpol_req(struct hci_request *req, unsigned long opt)
+{
+ __le16 policy = cpu_to_le16(opt);
+
+ BT_DBG("%s %x", req->hdev->name, policy);
+
+ /* Default link policy */
+ hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy);
+}
+
+/* Get HCI device by index.
+ * Device is held on return. */
+struct hci_dev *hci_dev_get(int index)
+{
+ struct hci_dev *hdev = NULL, *d;
+
+ BT_DBG("%d", index);
+
+ if (index < 0)
+ return NULL;
+
+ read_lock(&hci_dev_list_lock);
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (d->id == index) {
+ hdev = hci_dev_hold(d);
+ break;
+ }
+ }
+ read_unlock(&hci_dev_list_lock);
+ return hdev;
+}
+
+/* ---- Inquiry support ---- */
+
+bool hci_discovery_active(struct hci_dev *hdev)
+{
+ struct discovery_state *discov = &hdev->discovery;
+
+ switch (discov->state) {
+ case DISCOVERY_FINDING:
+ case DISCOVERY_RESOLVING:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+void hci_discovery_set_state(struct hci_dev *hdev, int state)
+{
+ int old_state = hdev->discovery.state;
+
+ BT_DBG("%s state %u -> %u", hdev->name, hdev->discovery.state, state);
+
+ if (old_state == state)
+ return;
+
+ hdev->discovery.state = state;
+
+ switch (state) {
+ case DISCOVERY_STOPPED:
+ hci_update_background_scan(hdev);
+
+ if (old_state != DISCOVERY_STARTING)
+ mgmt_discovering(hdev, 0);
+ break;
+ case DISCOVERY_STARTING:
+ break;
+ case DISCOVERY_FINDING:
+ mgmt_discovering(hdev, 1);
+ break;
+ case DISCOVERY_RESOLVING:
+ break;
+ case DISCOVERY_STOPPING:
+ break;
+ }
+}
+
+void hci_inquiry_cache_flush(struct hci_dev *hdev)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *p, *n;
+
+ list_for_each_entry_safe(p, n, &cache->all, all) {
+ list_del(&p->all);
+ kfree(p);
+ }
+
+ INIT_LIST_HEAD(&cache->unknown);
+ INIT_LIST_HEAD(&cache->resolve);
+}
+
+struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev,
+ bdaddr_t *bdaddr)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ BT_DBG("cache %p, %pMR", cache, bdaddr);
+
+ list_for_each_entry(e, &cache->all, all) {
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
+ }
+
+ return NULL;
+}
+
+struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev,
+ bdaddr_t *bdaddr)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ BT_DBG("cache %p, %pMR", cache, bdaddr);
+
+ list_for_each_entry(e, &cache->unknown, list) {
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
+ }
+
+ return NULL;
+}
+
+struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev,
+ bdaddr_t *bdaddr,
+ int state)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state);
+
+ list_for_each_entry(e, &cache->resolve, list) {
+ if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state)
+ return e;
+ if (!bacmp(&e->data.bdaddr, bdaddr))
+ return e;
+ }
+
+ return NULL;
+}
+
+void hci_inquiry_cache_update_resolve(struct hci_dev *hdev,
+ struct inquiry_entry *ie)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct list_head *pos = &cache->resolve;
+ struct inquiry_entry *p;
+
+ list_del(&ie->list);
+
+ list_for_each_entry(p, &cache->resolve, list) {
+ if (p->name_state != NAME_PENDING &&
+ abs(p->data.rssi) >= abs(ie->data.rssi))
+ break;
+ pos = &p->list;
+ }
+
+ list_add(&ie->list, pos);
+}
+
+u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data,
+ bool name_known)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *ie;
+ u32 flags = 0;
+
+ BT_DBG("cache %p, %pMR", cache, &data->bdaddr);
+
+ hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR);
+
+ if (!data->ssp_mode)
+ flags |= MGMT_DEV_FOUND_LEGACY_PAIRING;
+
+ ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr);
+ if (ie) {
+ if (!ie->data.ssp_mode)
+ flags |= MGMT_DEV_FOUND_LEGACY_PAIRING;
+
+ if (ie->name_state == NAME_NEEDED &&
+ data->rssi != ie->data.rssi) {
+ ie->data.rssi = data->rssi;
+ hci_inquiry_cache_update_resolve(hdev, ie);
+ }
+
+ goto update;
+ }
+
+ /* Entry not in the cache. Add new one. */
+ ie = kzalloc(sizeof(*ie), GFP_KERNEL);
+ if (!ie) {
+ flags |= MGMT_DEV_FOUND_CONFIRM_NAME;
+ goto done;
+ }
+
+ list_add(&ie->all, &cache->all);
+
+ if (name_known) {
+ ie->name_state = NAME_KNOWN;
+ } else {
+ ie->name_state = NAME_NOT_KNOWN;
+ list_add(&ie->list, &cache->unknown);
+ }
+
+update:
+ if (name_known && ie->name_state != NAME_KNOWN &&
+ ie->name_state != NAME_PENDING) {
+ ie->name_state = NAME_KNOWN;
+ list_del(&ie->list);
+ }
+
+ memcpy(&ie->data, data, sizeof(*data));
+ ie->timestamp = jiffies;
+ cache->timestamp = jiffies;
+
+ if (ie->name_state == NAME_NOT_KNOWN)
+ flags |= MGMT_DEV_FOUND_CONFIRM_NAME;
+
+done:
+ return flags;
+}
+
+static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf)
+{
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_info *info = (struct inquiry_info *) buf;
+ struct inquiry_entry *e;
+ int copied = 0;
+
+ list_for_each_entry(e, &cache->all, all) {
+ struct inquiry_data *data = &e->data;
+
+ if (copied >= num)
+ break;
+
+ bacpy(&info->bdaddr, &data->bdaddr);
+ info->pscan_rep_mode = data->pscan_rep_mode;
+ info->pscan_period_mode = data->pscan_period_mode;
+ info->pscan_mode = data->pscan_mode;
+ memcpy(info->dev_class, data->dev_class, 3);
+ info->clock_offset = data->clock_offset;
+
+ info++;
+ copied++;
+ }
+
+ BT_DBG("cache %p, copied %d", cache, copied);
+ return copied;
+}
+
+static void hci_inq_req(struct hci_request *req, unsigned long opt)
+{
+ struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt;
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_inquiry cp;
+
+ BT_DBG("%s", hdev->name);
+
+ if (test_bit(HCI_INQUIRY, &hdev->flags))
+ return;
+
+ /* Start Inquiry */
+ memcpy(&cp.lap, &ir->lap, 3);
+ cp.length = ir->length;
+ cp.num_rsp = ir->num_rsp;
+ hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
+}
+
+int hci_inquiry(void __user *arg)
+{
+ __u8 __user *ptr = arg;
+ struct hci_inquiry_req ir;
+ struct hci_dev *hdev;
+ int err = 0, do_inquiry = 0, max_rsp;
+ long timeo;
+ __u8 *buf;
+
+ if (copy_from_user(&ir, ptr, sizeof(ir)))
+ return -EFAULT;
+
+ hdev = hci_dev_get(ir.dev_id);
+ if (!hdev)
+ return -ENODEV;
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ if (hdev->dev_type != HCI_BREDR) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX ||
+ inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) {
+ hci_inquiry_cache_flush(hdev);
+ do_inquiry = 1;
+ }
+ hci_dev_unlock(hdev);
+
+ timeo = ir.length * msecs_to_jiffies(2000);
+
+ if (do_inquiry) {
+ err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir,
+ timeo);
+ if (err < 0)
+ goto done;
+
+ /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is
+ * cleared). If it is interrupted by a signal, return -EINTR.
+ */
+ if (wait_on_bit(&hdev->flags, HCI_INQUIRY,
+ TASK_INTERRUPTIBLE))
+ return -EINTR;
+ }
+
+ /* for unlimited number of responses we will use buffer with
+ * 255 entries
+ */
+ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp;
+
+ /* cache_dump can't sleep. Therefore we allocate temp buffer and then
+ * copy it to the user space.
+ */
+ buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf);
+ hci_dev_unlock(hdev);
+
+ BT_DBG("num_rsp %d", ir.num_rsp);
+
+ if (!copy_to_user(ptr, &ir, sizeof(ir))) {
+ ptr += sizeof(ir);
+ if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) *
+ ir.num_rsp))
+ err = -EFAULT;
+ } else
+ err = -EFAULT;
+
+ kfree(buf);
+
+done:
+ hci_dev_put(hdev);
+ return err;
+}
+
+static int hci_dev_do_open(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ hci_req_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ /* Check for rfkill but allow the HCI setup stage to
+ * proceed (which in itself doesn't cause any RF activity).
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) {
+ ret = -ERFKILL;
+ goto done;
+ }
+
+ /* Check for valid public address or a configured static
+ * random adddress, but let the HCI setup proceed to
+ * be able to determine if there is a public address
+ * or not.
+ *
+ * In case of user channel usage, it is not important
+ * if a public address or static random address is
+ * available.
+ *
+ * This check is only valid for BR/EDR controllers
+ * since AMP controllers do not have an address.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hdev->dev_type == HCI_BREDR &&
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY)) {
+ ret = -EADDRNOTAVAIL;
+ goto done;
+ }
+ }
+
+ if (test_bit(HCI_UP, &hdev->flags)) {
+ ret = -EALREADY;
+ goto done;
+ }
+
+ if (hdev->open(hdev)) {
+ ret = -EIO;
+ goto done;
+ }
+
+ atomic_set(&hdev->cmd_cnt, 1);
+ set_bit(HCI_INIT, &hdev->flags);
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP)) {
+ if (hdev->setup)
+ ret = hdev->setup(hdev);
+
+ /* The transport driver can set these quirks before
+ * creating the HCI device or in its setup callback.
+ *
+ * In case any of them is set, the controller has to
+ * start up as unconfigured.
+ */
+ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
+ test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks))
+ hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
+
+ /* For an unconfigured controller it is required to
+ * read at least the version information provided by
+ * the Read Local Version Information command.
+ *
+ * If the set_bdaddr driver callback is provided, then
+ * also the original Bluetooth public device address
+ * will be read using the Read BD Address command.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ ret = __hci_unconf_init(hdev);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ /* If public address change is configured, ensure that
+ * the address gets programmed. If the driver does not
+ * support changing the public address, fail the power
+ * on procedure.
+ */
+ if (bacmp(&hdev->public_addr, BDADDR_ANY) &&
+ hdev->set_bdaddr)
+ ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
+ else
+ ret = -EADDRNOTAVAIL;
+ }
+
+ if (!ret) {
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ ret = __hci_init(hdev);
+ }
+
+ clear_bit(HCI_INIT, &hdev->flags);
+
+ if (!ret) {
+ hci_dev_hold(hdev);
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ set_bit(HCI_UP, &hdev->flags);
+ hci_notify(hdev, HCI_DEV_UP);
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG) &&
+ !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hdev->dev_type == HCI_BREDR) {
+ hci_dev_lock(hdev);
+ mgmt_powered(hdev, 1);
+ hci_dev_unlock(hdev);
+ }
+ } else {
+ /* Init failed, cleanup */
+ flush_work(&hdev->tx_work);
+ flush_work(&hdev->cmd_work);
+ flush_work(&hdev->rx_work);
+
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->rx_q);
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ if (hdev->sent_cmd) {
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ hdev->close(hdev);
+ hdev->flags &= BIT(HCI_RAW);
+ }
+
+done:
+ hci_req_unlock(hdev);
+ return ret;
+}
+
+/* ---- HCI ioctl helpers ---- */
+
+int hci_dev_open(__u16 dev)
+{
+ struct hci_dev *hdev;
+ int err;
+
+ hdev = hci_dev_get(dev);
+ if (!hdev)
+ return -ENODEV;
+
+ /* Devices that are marked as unconfigured can only be powered
+ * up as user channel. Trying to bring them up as normal devices
+ * will result into a failure. Only user channel operation is
+ * possible.
+ *
+ * When this function is called for a user channel, the flag
+ * HCI_USER_CHANNEL will be set first before attempting to
+ * open the device.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ /* We need to ensure that no other power on/off work is pending
+ * before proceeding to call hci_dev_do_open. This is
+ * particularly important if the setup procedure has not yet
+ * completed.
+ */
+ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
+ cancel_delayed_work(&hdev->power_off);
+
+ /* After this call it is guaranteed that the setup procedure
+ * has finished. This means that error conditions like RFKILL
+ * or no valid public or static random address apply.
+ */
+ flush_workqueue(hdev->req_workqueue);
+
+ /* For controllers not using the management interface and that
+ * are brought up using legacy ioctl, set the HCI_BONDABLE bit
+ * so that pairing works for them. Once the management interface
+ * is in use this bit will be cleared again and userspace has
+ * to explicitly enable it.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !hci_dev_test_flag(hdev, HCI_MGMT))
+ hci_dev_set_flag(hdev, HCI_BONDABLE);
+
+ err = hci_dev_do_open(hdev);
+
+done:
+ hci_dev_put(hdev);
+ return err;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void hci_pend_le_actions_clear(struct hci_dev *hdev)
+{
+ struct hci_conn_params *p;
+
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ if (p->conn) {
+ hci_conn_drop(p->conn);
+ hci_conn_put(p->conn);
+ p->conn = NULL;
+ }
+ list_del_init(&p->action);
+ }
+
+ BT_DBG("All LE pending actions cleared");
+}
+
+static int hci_dev_do_close(struct hci_dev *hdev)
+{
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+ test_bit(HCI_UP, &hdev->flags)) {
+ /* Execute vendor specific shutdown routine */
+ if (hdev->shutdown)
+ hdev->shutdown(hdev);
+ }
+
+ cancel_delayed_work(&hdev->power_off);
+
+ hci_req_cancel(hdev, ENODEV);
+ hci_req_lock(hdev);
+
+ if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ hci_req_unlock(hdev);
+ return 0;
+ }
+
+ /* Flush RX and TX works */
+ flush_work(&hdev->tx_work);
+ flush_work(&hdev->rx_work);
+
+ if (hdev->discov_timeout > 0) {
+ cancel_delayed_work(&hdev->discov_off);
+ hdev->discov_timeout = 0;
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
+ cancel_delayed_work(&hdev->service_cache);
+
+ cancel_delayed_work_sync(&hdev->le_scan_disable);
+ cancel_delayed_work_sync(&hdev->le_scan_restart);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ cancel_delayed_work_sync(&hdev->rpa_expired);
+
+ /* Avoid potential lockdep warnings from the *_flush() calls by
+ * ensuring the workqueue is empty up front.
+ */
+ drain_workqueue(hdev->workqueue);
+
+ hci_dev_lock(hdev);
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+ if (hdev->dev_type == HCI_BREDR)
+ mgmt_powered(hdev, 0);
+ }
+
+ hci_inquiry_cache_flush(hdev);
+ hci_pend_le_actions_clear(hdev);
+ hci_conn_hash_flush(hdev);
+ hci_dev_unlock(hdev);
+
+ smp_unregister(hdev);
+
+ hci_notify(hdev, HCI_DEV_DOWN);
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ /* Reset device */
+ skb_queue_purge(&hdev->cmd_q);
+ atomic_set(&hdev->cmd_cnt, 1);
+ if (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) &&
+ !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) {
+ set_bit(HCI_INIT, &hdev->flags);
+ __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT);
+ clear_bit(HCI_INIT, &hdev->flags);
+ }
+
+ /* flush cmd work */
+ flush_work(&hdev->cmd_work);
+
+ /* Drop queues */
+ skb_queue_purge(&hdev->rx_q);
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->raw_q);
+
+ /* Drop last sent command */
+ if (hdev->sent_cmd) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ /* After this point our queues are empty
+ * and no tasks are scheduled. */
+ hdev->close(hdev);
+
+ /* Clear flags */
+ hdev->flags &= BIT(HCI_RAW);
+ hci_dev_clear_volatile_flags(hdev);
+
+ /* Controller radio is available but is currently powered down */
+ hdev->amp_status = AMP_STATUS_POWERED_DOWN;
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+ memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
+ bacpy(&hdev->random_addr, BDADDR_ANY);
+
+ hci_req_unlock(hdev);
+
+ hci_dev_put(hdev);
+ return 0;
+}
+
+int hci_dev_close(__u16 dev)
+{
+ struct hci_dev *hdev;
+ int err;
+
+ hdev = hci_dev_get(dev);
+ if (!hdev)
+ return -ENODEV;
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
+ cancel_delayed_work(&hdev->power_off);
+
+ err = hci_dev_do_close(hdev);
+
+done:
+ hci_dev_put(hdev);
+ return err;
+}
+
+static int hci_dev_do_reset(struct hci_dev *hdev)
+{
+ int ret;
+
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ hci_req_lock(hdev);
+
+ /* Drop queues */
+ skb_queue_purge(&hdev->rx_q);
+ skb_queue_purge(&hdev->cmd_q);
+
+ /* Avoid potential lockdep warnings from the *_flush() calls by
+ * ensuring the workqueue is empty up front.
+ */
+ drain_workqueue(hdev->workqueue);
+
+ hci_dev_lock(hdev);
+ hci_inquiry_cache_flush(hdev);
+ hci_conn_hash_flush(hdev);
+ hci_dev_unlock(hdev);
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ atomic_set(&hdev->cmd_cnt, 1);
+ hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
+
+ ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT);
+
+ hci_req_unlock(hdev);
+ return ret;
+}
+
+int hci_dev_reset(__u16 dev)
+{
+ struct hci_dev *hdev;
+ int err;
+
+ hdev = hci_dev_get(dev);
+ if (!hdev)
+ return -ENODEV;
+
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = -ENETDOWN;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ err = hci_dev_do_reset(hdev);
+
+done:
+ hci_dev_put(hdev);
+ return err;
+}
+
+int hci_dev_reset_stat(__u16 dev)
+{
+ struct hci_dev *hdev;
+ int ret = 0;
+
+ hdev = hci_dev_get(dev);
+ if (!hdev)
+ return -ENODEV;
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ ret = -EOPNOTSUPP;
+ goto done;
+ }
+
+ memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
+
+done:
+ hci_dev_put(hdev);
+ return ret;
+}
+
+static void hci_update_scan_state(struct hci_dev *hdev, u8 scan)
+{
+ bool conn_changed, discov_changed;
+
+ BT_DBG("%s scan 0x%02x", hdev->name, scan);
+
+ if ((scan & SCAN_PAGE))
+ conn_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_CONNECTABLE);
+ else
+ conn_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_CONNECTABLE);
+
+ if ((scan & SCAN_INQUIRY)) {
+ discov_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_DISCOVERABLE);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ discov_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_DISCOVERABLE);
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ if (conn_changed || discov_changed) {
+ /* In case this was disabled through mgmt */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ mgmt_update_adv_data(hdev);
+
+ mgmt_new_settings(hdev);
+ }
+}
+
+int hci_dev_cmd(unsigned int cmd, void __user *arg)
+{
+ struct hci_dev *hdev;
+ struct hci_dev_req dr;
+ int err = 0;
+
+ if (copy_from_user(&dr, arg, sizeof(dr)))
+ return -EFAULT;
+
+ hdev = hci_dev_get(dr.dev_id);
+ if (!hdev)
+ return -ENODEV;
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EBUSY;
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ if (hdev->dev_type != HCI_BREDR) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ switch (cmd) {
+ case HCISETAUTH:
+ err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
+ HCI_INIT_TIMEOUT);
+ break;
+
+ case HCISETENCRYPT:
+ if (!lmp_encrypt_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!test_bit(HCI_AUTH, &hdev->flags)) {
+ /* Auth must be enabled first */
+ err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
+ HCI_INIT_TIMEOUT);
+ if (err)
+ break;
+ }
+
+ err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt,
+ HCI_INIT_TIMEOUT);
+ break;
+
+ case HCISETSCAN:
+ err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt,
+ HCI_INIT_TIMEOUT);
+
+ /* Ensure that the connectable and discoverable states
+ * get correctly modified as this was a non-mgmt change.
+ */
+ if (!err)
+ hci_update_scan_state(hdev, dr.dev_opt);
+ break;
+
+ case HCISETLINKPOL:
+ err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt,
+ HCI_INIT_TIMEOUT);
+ break;
+
+ case HCISETLINKMODE:
+ hdev->link_mode = ((__u16) dr.dev_opt) &
+ (HCI_LM_MASTER | HCI_LM_ACCEPT);
+ break;
+
+ case HCISETPTYPE:
+ hdev->pkt_type = (__u16) dr.dev_opt;
+ break;
+
+ case HCISETACLMTU:
+ hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1);
+ hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0);
+ break;
+
+ case HCISETSCOMTU:
+ hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1);
+ hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0);
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+done:
+ hci_dev_put(hdev);
+ return err;
+}
+
+int hci_get_dev_list(void __user *arg)
+{
+ struct hci_dev *hdev;
+ struct hci_dev_list_req *dl;
+ struct hci_dev_req *dr;
+ int n = 0, size, err;
+ __u16 dev_num;
+
+ if (get_user(dev_num, (__u16 __user *) arg))
+ return -EFAULT;
+
+ if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr))
+ return -EINVAL;
+
+ size = sizeof(*dl) + dev_num * sizeof(*dr);
+
+ dl = kzalloc(size, GFP_KERNEL);
+ if (!dl)
+ return -ENOMEM;
+
+ dr = dl->dev_req;
+
+ read_lock(&hci_dev_list_lock);
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ unsigned long flags = hdev->flags;
+
+ /* When the auto-off is configured it means the transport
+ * is running, but in that case still indicate that the
+ * device is actually down.
+ */
+ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF))
+ flags &= ~BIT(HCI_UP);
+
+ (dr + n)->dev_id = hdev->id;
+ (dr + n)->dev_opt = flags;
+
+ if (++n >= dev_num)
+ break;
+ }
+ read_unlock(&hci_dev_list_lock);
+
+ dl->dev_num = n;
+ size = sizeof(*dl) + n * sizeof(*dr);
+
+ err = copy_to_user(arg, dl, size);
+ kfree(dl);
+
+ return err ? -EFAULT : 0;
+}
+
+int hci_get_dev_info(void __user *arg)
+{
+ struct hci_dev *hdev;
+ struct hci_dev_info di;
+ unsigned long flags;
+ int err = 0;
+
+ if (copy_from_user(&di, arg, sizeof(di)))
+ return -EFAULT;
+
+ hdev = hci_dev_get(di.dev_id);
+ if (!hdev)
+ return -ENODEV;
+
+ /* When the auto-off is configured it means the transport
+ * is running, but in that case still indicate that the
+ * device is actually down.
+ */
+ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF))
+ flags = hdev->flags & ~BIT(HCI_UP);
+ else
+ flags = hdev->flags;
+
+ strcpy(di.name, hdev->name);
+ di.bdaddr = hdev->bdaddr;
+ di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4);
+ di.flags = flags;
+ di.pkt_type = hdev->pkt_type;
+ if (lmp_bredr_capable(hdev)) {
+ di.acl_mtu = hdev->acl_mtu;
+ di.acl_pkts = hdev->acl_pkts;
+ di.sco_mtu = hdev->sco_mtu;
+ di.sco_pkts = hdev->sco_pkts;
+ } else {
+ di.acl_mtu = hdev->le_mtu;
+ di.acl_pkts = hdev->le_pkts;
+ di.sco_mtu = 0;
+ di.sco_pkts = 0;
+ }
+ di.link_policy = hdev->link_policy;
+ di.link_mode = hdev->link_mode;
+
+ memcpy(&di.stat, &hdev->stat, sizeof(di.stat));
+ memcpy(&di.features, &hdev->features, sizeof(di.features));
+
+ if (copy_to_user(arg, &di, sizeof(di)))
+ err = -EFAULT;
+
+ hci_dev_put(hdev);
+
+ return err;
+}
+
+/* ---- Interface to HCI drivers ---- */
+
+static int hci_rfkill_set_block(void *data, bool blocked)
+{
+ struct hci_dev *hdev = data;
+
+ BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return -EBUSY;
+
+ if (blocked) {
+ hci_dev_set_flag(hdev, HCI_RFKILLED);
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG))
+ hci_dev_do_close(hdev);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_RFKILLED);
+ }
+
+ return 0;
+}
+
+static const struct rfkill_ops hci_rfkill_ops = {
+ .set_block = hci_rfkill_set_block,
+};
+
+static void hci_power_on(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, power_on);
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ err = hci_dev_do_open(hdev);
+ if (err < 0) {
+ hci_dev_lock(hdev);
+ mgmt_set_powered_failed(hdev, err);
+ hci_dev_unlock(hdev);
+ return;
+ }
+
+ /* During the HCI setup phase, a few error conditions are
+ * ignored and they need to be checked now. If they are still
+ * valid, it is important to turn the device back off.
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
+ hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
+ (hdev->dev_type == HCI_BREDR &&
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
+ hci_dev_do_close(hdev);
+ } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) {
+ queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
+ HCI_AUTO_OFF_TIMEOUT);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) {
+ /* For unconfigured devices, set the HCI_RAW flag
+ * so that userspace can easily identify them.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ set_bit(HCI_RAW, &hdev->flags);
+
+ /* For fully configured devices, this will send
+ * the Index Added event. For unconfigured devices,
+ * it will send Unconfigued Index Added event.
+ *
+ * Devices with HCI_QUIRK_RAW_DEVICE are ignored
+ * and no event will be send.
+ */
+ mgmt_index_added(hdev);
+ } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) {
+ /* When the controller is now configured, then it
+ * is important to clear the HCI_RAW flag.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ clear_bit(HCI_RAW, &hdev->flags);
+
+ /* Powering on the controller with HCI_CONFIG set only
+ * happens with the transition from unconfigured to
+ * configured. This will send the Index Added event.
+ */
+ mgmt_index_added(hdev);
+ }
+}
+
+static void hci_power_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ power_off.work);
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_do_close(hdev);
+}
+
+static void hci_error_reset(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset);
+
+ BT_DBG("%s", hdev->name);
+
+ if (hdev->hw_error)
+ hdev->hw_error(hdev, hdev->hw_error_code);
+ else
+ BT_ERR("%s hardware error 0x%2.2x", hdev->name,
+ hdev->hw_error_code);
+
+ if (hci_dev_do_close(hdev))
+ return;
+
+ hci_dev_do_open(hdev);
+}
+
+static void hci_discov_off(struct work_struct *work)
+{
+ struct hci_dev *hdev;
+
+ hdev = container_of(work, struct hci_dev, discov_off.work);
+
+ BT_DBG("%s", hdev->name);
+
+ mgmt_discoverable_timeout(hdev);
+}
+
+void hci_uuids_clear(struct hci_dev *hdev)
+{
+ struct bt_uuid *uuid, *tmp;
+
+ list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) {
+ list_del(&uuid->list);
+ kfree(uuid);
+ }
+}
+
+void hci_link_keys_clear(struct hci_dev *hdev)
+{
+ struct link_key *key;
+
+ list_for_each_entry_rcu(key, &hdev->link_keys, list) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ }
+}
+
+void hci_smp_ltks_clear(struct hci_dev *hdev)
+{
+ struct smp_ltk *k;
+
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+void hci_smp_irks_clear(struct hci_dev *hdev)
+{
+ struct smp_irk *k;
+
+ list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) {
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct link_key *k;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->link_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) == 0) {
+ rcu_read_unlock();
+ return k;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn,
+ u8 key_type, u8 old_key_type)
+{
+ /* Legacy key */
+ if (key_type < 0x03)
+ return true;
+
+ /* Debug keys are insecure so don't store them persistently */
+ if (key_type == HCI_LK_DEBUG_COMBINATION)
+ return false;
+
+ /* Changed combination key and there's no previous one */
+ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff)
+ return false;
+
+ /* Security mode 3 case */
+ if (!conn)
+ return true;
+
+ /* BR/EDR key derived using SC from an LE link */
+ if (conn->type == LE_LINK)
+ return true;
+
+ /* Neither local nor remote side had no-bonding as requirement */
+ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01)
+ return true;
+
+ /* Local side had dedicated bonding as requirement */
+ if (conn->auth_type == 0x02 || conn->auth_type == 0x03)
+ return true;
+
+ /* Remote side had dedicated bonding as requirement */
+ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03)
+ return true;
+
+ /* If none of the above criteria match, then don't store the key
+ * persistently */
+ return false;
+}
+
+static u8 ltk_role(u8 type)
+{
+ if (type == SMP_LTK)
+ return HCI_ROLE_MASTER;
+
+ return HCI_ROLE_SLAVE;
+}
+
+struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 role)
+{
+ struct smp_ltk *k;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr))
+ continue;
+
+ if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) {
+ rcu_read_unlock();
+ return k;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa)
+{
+ struct smp_irk *irk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (!bacmp(&irk->rpa, rpa)) {
+ rcu_read_unlock();
+ return irk;
+ }
+ }
+
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (smp_irk_matches(hdev, irk->val, rpa)) {
+ bacpy(&irk->rpa, rpa);
+ rcu_read_unlock();
+ return irk;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct smp_irk *irk;
+
+ /* Identity Address must be public or static random */
+ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0)
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ if (addr_type == irk->addr_type &&
+ bacmp(bdaddr, &irk->bdaddr) == 0) {
+ rcu_read_unlock();
+ return irk;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
+ bdaddr_t *bdaddr, u8 *val, u8 type,
+ u8 pin_len, bool *persistent)
+{
+ struct link_key *key, *old_key;
+ u8 old_key_type;
+
+ old_key = hci_find_link_key(hdev, bdaddr);
+ if (old_key) {
+ old_key_type = old_key->type;
+ key = old_key;
+ } else {
+ old_key_type = conn ? conn->key_type : 0xff;
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return NULL;
+ list_add_rcu(&key->list, &hdev->link_keys);
+ }
+
+ BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type);
+
+ /* Some buggy controller combinations generate a changed
+ * combination key for legacy pairing even when there's no
+ * previous key */
+ if (type == HCI_LK_CHANGED_COMBINATION &&
+ (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) {
+ type = HCI_LK_COMBINATION;
+ if (conn)
+ conn->key_type = type;
+ }
+
+ bacpy(&key->bdaddr, bdaddr);
+ memcpy(key->val, val, HCI_LINK_KEY_SIZE);
+ key->pin_len = pin_len;
+
+ if (type == HCI_LK_CHANGED_COMBINATION)
+ key->type = old_key_type;
+ else
+ key->type = type;
+
+ if (persistent)
+ *persistent = hci_persistent_key(hdev, conn, type,
+ old_key_type);
+
+ return key;
+}
+
+struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 type, u8 authenticated,
+ u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand)
+{
+ struct smp_ltk *key, *old_key;
+ u8 role = ltk_role(type);
+
+ old_key = hci_find_ltk(hdev, bdaddr, addr_type, role);
+ if (old_key)
+ key = old_key;
+ else {
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return NULL;
+ list_add_rcu(&key->list, &hdev->long_term_keys);
+ }
+
+ bacpy(&key->bdaddr, bdaddr);
+ key->bdaddr_type = addr_type;
+ memcpy(key->val, tk, sizeof(key->val));
+ key->authenticated = authenticated;
+ key->ediv = ediv;
+ key->rand = rand;
+ key->enc_size = enc_size;
+ key->type = type;
+
+ return key;
+}
+
+struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, u8 val[16], bdaddr_t *rpa)
+{
+ struct smp_irk *irk;
+
+ irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type);
+ if (!irk) {
+ irk = kzalloc(sizeof(*irk), GFP_KERNEL);
+ if (!irk)
+ return NULL;
+
+ bacpy(&irk->bdaddr, bdaddr);
+ irk->addr_type = addr_type;
+
+ list_add_rcu(&irk->list, &hdev->identity_resolving_keys);
+ }
+
+ memcpy(irk->val, val, 16);
+ bacpy(&irk->rpa, rpa);
+
+ return irk;
+}
+
+int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, bdaddr);
+ if (!key)
+ return -ENOENT;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+
+ return 0;
+}
+
+int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct smp_ltk *k;
+ int removed = 0;
+
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type)
+ continue;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ removed++;
+ }
+
+ return removed ? 0 : -ENOENT;
+}
+
+void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type)
+{
+ struct smp_irk *k;
+
+ list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) {
+ if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type)
+ continue;
+
+ BT_DBG("%s removing %pMR", hdev->name, bdaddr);
+
+ list_del_rcu(&k->list);
+ kfree_rcu(k, rcu);
+ }
+}
+
+bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
+{
+ struct smp_ltk *k;
+ struct smp_irk *irk;
+ u8 addr_type;
+
+ if (type == BDADDR_BREDR) {
+ if (hci_find_link_key(hdev, bdaddr))
+ return true;
+ return false;
+ }
+
+ /* Convert to HCI addr type which struct smp_ltk uses */
+ if (type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ irk = hci_get_irk(hdev, bdaddr, addr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ addr_type = irk->addr_type;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+/* HCI command timer function */
+static void hci_cmd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ cmd_timer.work);
+
+ if (hdev->sent_cmd) {
+ struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data;
+ u16 opcode = __le16_to_cpu(sent->opcode);
+
+ BT_ERR("%s command 0x%4.4x tx timeout", hdev->name, opcode);
+ } else {
+ BT_ERR("%s command tx timeout", hdev->name);
+ }
+
+ atomic_set(&hdev->cmd_cnt, 1);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+ struct oob_data *data;
+
+ list_for_each_entry(data, &hdev->remote_oob_data, list) {
+ if (bacmp(bdaddr, &data->bdaddr) != 0)
+ continue;
+ if (data->bdaddr_type != bdaddr_type)
+ continue;
+ return data;
+ }
+
+ return NULL;
+}
+
+int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type)
+{
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type);
+ if (!data)
+ return -ENOENT;
+
+ BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type);
+
+ list_del(&data->list);
+ kfree(data);
+
+ return 0;
+}
+
+void hci_remote_oob_data_clear(struct hci_dev *hdev)
+{
+ struct oob_data *data, *n;
+
+ list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) {
+ list_del(&data->list);
+ kfree(data);
+ }
+}
+
+int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, u8 *hash192, u8 *rand192,
+ u8 *hash256, u8 *rand256)
+{
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type);
+ if (!data) {
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ bacpy(&data->bdaddr, bdaddr);
+ data->bdaddr_type = bdaddr_type;
+ list_add(&data->list, &hdev->remote_oob_data);
+ }
+
+ if (hash192 && rand192) {
+ memcpy(data->hash192, hash192, sizeof(data->hash192));
+ memcpy(data->rand192, rand192, sizeof(data->rand192));
+ if (hash256 && rand256)
+ data->present = 0x03;
+ } else {
+ memset(data->hash192, 0, sizeof(data->hash192));
+ memset(data->rand192, 0, sizeof(data->rand192));
+ if (hash256 && rand256)
+ data->present = 0x02;
+ else
+ data->present = 0x00;
+ }
+
+ if (hash256 && rand256) {
+ memcpy(data->hash256, hash256, sizeof(data->hash256));
+ memcpy(data->rand256, rand256, sizeof(data->rand256));
+ } else {
+ memset(data->hash256, 0, sizeof(data->hash256));
+ memset(data->rand256, 0, sizeof(data->rand256));
+ if (hash192 && rand192)
+ data->present = 0x01;
+ }
+
+ BT_DBG("%s for %pMR", hdev->name, bdaddr);
+
+ return 0;
+}
+
+struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
+void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
+{
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, bdaddr_list) {
+ struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list);
+
+ list_del(p);
+ kfree(b);
+ }
+}
+
+int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY)) {
+ hci_bdaddr_list_clear(list);
+ return 0;
+ }
+
+ entry = hci_bdaddr_list_lookup(list, bdaddr, type);
+ if (!entry)
+ return -ENOENT;
+
+ list_del(&entry->list);
+ kfree(entry);
+
+ return 0;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ /* The conn params list only contains identity addresses */
+ if (!hci_is_identity_address(addr, addr_type))
+ return NULL;
+
+ list_for_each_entry(params, &hdev->le_conn_params, list) {
+ if (bacmp(&params->addr, addr) == 0 &&
+ params->addr_type == addr_type) {
+ return params;
+ }
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *param;
+
+ /* The list only contains identity addresses */
+ if (!hci_is_identity_address(addr, addr_type))
+ return NULL;
+
+ list_for_each_entry(param, list, action) {
+ if (bacmp(&param->addr, addr) == 0 &&
+ param->addr_type == addr_type)
+ return param;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ if (!hci_is_identity_address(addr, addr_type))
+ return NULL;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (params)
+ return params;
+
+ params = kzalloc(sizeof(*params), GFP_KERNEL);
+ if (!params) {
+ BT_ERR("Out of memory");
+ return NULL;
+ }
+
+ bacpy(&params->addr, addr);
+ params->addr_type = addr_type;
+
+ list_add(&params->list, &hdev->le_conn_params);
+ INIT_LIST_HEAD(&params->action);
+
+ params->conn_min_interval = hdev->le_conn_min_interval;
+ params->conn_max_interval = hdev->le_conn_max_interval;
+ params->conn_latency = hdev->le_conn_latency;
+ params->supervision_timeout = hdev->le_supv_timeout;
+ params->auto_connect = HCI_AUTO_CONN_DISABLED;
+
+ BT_DBG("addr %pMR (type %u)", addr, addr_type);
+
+ return params;
+}
+
+static void hci_conn_params_free(struct hci_conn_params *params)
+{
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ }
+
+ list_del(&params->action);
+ list_del(&params->list);
+ kfree(params);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type)
+{
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, addr, addr_type);
+ if (!params)
+ return;
+
+ hci_conn_params_free(params);
+
+ hci_update_background_scan(hdev);
+
+ BT_DBG("addr %pMR (type %u)", addr, addr_type);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_params_clear_disabled(struct hci_dev *hdev)
+{
+ struct hci_conn_params *params, *tmp;
+
+ list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
+ if (params->auto_connect != HCI_AUTO_CONN_DISABLED)
+ continue;
+ list_del(&params->list);
+ kfree(params);
+ }
+
+ BT_DBG("All LE disabled connection parameters were removed");
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_conn_params_clear_all(struct hci_dev *hdev)
+{
+ struct hci_conn_params *params, *tmp;
+
+ list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list)
+ hci_conn_params_free(params);
+
+ hci_update_background_scan(hdev);
+
+ BT_DBG("All LE connection parameters were removed");
+}
+
+static void inquiry_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ if (status) {
+ BT_ERR("Failed to start inquiry: status %d", status);
+
+ hci_dev_lock(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ hci_dev_unlock(hdev);
+ return;
+ }
+}
+
+static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ /* General inquiry access code (GIAC) */
+ u8 lap[3] = { 0x33, 0x8b, 0x9e };
+ struct hci_cp_inquiry cp;
+ int err;
+
+ if (status) {
+ BT_ERR("Failed to disable LE scanning: status %d", status);
+ return;
+ }
+
+ hdev->discovery.scan_start = 0;
+
+ switch (hdev->discovery.type) {
+ case DISCOV_TYPE_LE:
+ hci_dev_lock(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ hci_dev_unlock(hdev);
+ break;
+
+ case DISCOV_TYPE_INTERLEAVED:
+ hci_dev_lock(hdev);
+
+ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
+ &hdev->quirks)) {
+ /* If we were running LE only scan, change discovery
+ * state. If we were running both LE and BR/EDR inquiry
+ * simultaneously, and BR/EDR inquiry is already
+ * finished, stop discovery, otherwise BR/EDR inquiry
+ * will stop discovery when finished. If we will resolve
+ * remote device name, do not change discovery state.
+ */
+ if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+ hdev->discovery.state != DISCOVERY_RESOLVING)
+ hci_discovery_set_state(hdev,
+ DISCOVERY_STOPPED);
+ } else {
+ struct hci_request req;
+
+ hci_inquiry_cache_flush(hdev);
+
+ hci_req_init(&req, hdev);
+
+ memset(&cp, 0, sizeof(cp));
+ memcpy(&cp.lap, lap, sizeof(cp.lap));
+ cp.length = DISCOV_INTERLEAVED_INQUIRY_LEN;
+ hci_req_add(&req, HCI_OP_INQUIRY, sizeof(cp), &cp);
+
+ err = hci_req_run(&req, inquiry_complete);
+ if (err) {
+ BT_ERR("Inquiry request failed: err %d", err);
+ hci_discovery_set_state(hdev,
+ DISCOVERY_STOPPED);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+ break;
+ }
+}
+
+static void le_scan_disable_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ le_scan_disable.work);
+ struct hci_request req;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ cancel_delayed_work_sync(&hdev->le_scan_restart);
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add_le_scan_disable(&req);
+
+ err = hci_req_run(&req, le_scan_disable_work_complete);
+ if (err)
+ BT_ERR("Disable LE scanning request failed: err %d", err);
+}
+
+static void le_scan_restart_work_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ unsigned long timeout, duration, scan_start, now;
+
+ BT_DBG("%s", hdev->name);
+
+ if (status) {
+ BT_ERR("Failed to restart LE scan: status %d", status);
+ return;
+ }
+
+ if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
+ !hdev->discovery.scan_start)
+ return;
+
+ /* When the scan was started, hdev->le_scan_disable has been queued
+ * after duration from scan_start. During scan restart this job
+ * has been canceled, and we need to queue it again after proper
+ * timeout, to make sure that scan does not run indefinitely.
+ */
+ duration = hdev->discovery.scan_duration;
+ scan_start = hdev->discovery.scan_start;
+ now = jiffies;
+ if (now - scan_start <= duration) {
+ int elapsed;
+
+ if (now >= scan_start)
+ elapsed = now - scan_start;
+ else
+ elapsed = ULONG_MAX - scan_start + now;
+
+ timeout = duration - elapsed;
+ } else {
+ timeout = 0;
+ }
+ queue_delayed_work(hdev->workqueue,
+ &hdev->le_scan_disable, timeout);
+}
+
+static void le_scan_restart_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ le_scan_restart.work);
+ struct hci_request req;
+ struct hci_cp_le_set_scan_enable cp;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return;
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add_le_scan_disable(&req);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_ENABLE;
+ cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(&req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+
+ err = hci_req_run(&req, le_scan_restart_work_complete);
+ if (err)
+ BT_ERR("Restart LE scan request failed: err %d", err);
+}
+
+/* Copy the Identity Address of the controller.
+ *
+ * If the controller has a public BD_ADDR, then by default use that one.
+ * If this is a LE only controller without a public address, default to
+ * the static random address.
+ *
+ * For debugging purposes it is possible to force controllers with a
+ * public address to use the static random address instead.
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller and
+ * userspace has configured a static address, then that address
+ * becomes the identity address instead of the public BR/EDR address.
+ */
+void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 *bdaddr_type)
+{
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ bacpy(bdaddr, &hdev->static_addr);
+ *bdaddr_type = ADDR_LE_DEV_RANDOM;
+ } else {
+ bacpy(bdaddr, &hdev->bdaddr);
+ *bdaddr_type = ADDR_LE_DEV_PUBLIC;
+ }
+}
+
+/* Alloc HCI device */
+struct hci_dev *hci_alloc_dev(void)
+{
+ struct hci_dev *hdev;
+
+ hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+ if (!hdev)
+ return NULL;
+
+ hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1);
+ hdev->esco_type = (ESCO_HV1);
+ hdev->link_mode = (HCI_LM_ACCEPT);
+ hdev->num_iac = 0x01; /* One IAC support is mandatory */
+ hdev->io_capability = 0x03; /* No Input No Output */
+ hdev->manufacturer = 0xffff; /* Default to internal use */
+ hdev->inq_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_tx_power = HCI_TX_POWER_INVALID;
+
+ hdev->sniff_max_interval = 800;
+ hdev->sniff_min_interval = 80;
+
+ hdev->le_adv_channel_map = 0x07;
+ hdev->le_adv_min_interval = 0x0800;
+ hdev->le_adv_max_interval = 0x0800;
+ hdev->le_scan_interval = 0x0060;
+ hdev->le_scan_window = 0x0030;
+ hdev->le_conn_min_interval = 0x0028;
+ hdev->le_conn_max_interval = 0x0038;
+ hdev->le_conn_latency = 0x0000;
+ hdev->le_supv_timeout = 0x002a;
+ hdev->le_def_tx_len = 0x001b;
+ hdev->le_def_tx_time = 0x0148;
+ hdev->le_max_tx_len = 0x001b;
+ hdev->le_max_tx_time = 0x0148;
+ hdev->le_max_rx_len = 0x001b;
+ hdev->le_max_rx_time = 0x0148;
+
+ hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
+ hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
+ hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE;
+ hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE;
+
+ mutex_init(&hdev->lock);
+ mutex_init(&hdev->req_lock);
+
+ INIT_LIST_HEAD(&hdev->mgmt_pending);
+ INIT_LIST_HEAD(&hdev->blacklist);
+ INIT_LIST_HEAD(&hdev->whitelist);
+ INIT_LIST_HEAD(&hdev->uuids);
+ INIT_LIST_HEAD(&hdev->link_keys);
+ INIT_LIST_HEAD(&hdev->long_term_keys);
+ INIT_LIST_HEAD(&hdev->identity_resolving_keys);
+ INIT_LIST_HEAD(&hdev->remote_oob_data);
+ INIT_LIST_HEAD(&hdev->le_white_list);
+ INIT_LIST_HEAD(&hdev->le_conn_params);
+ INIT_LIST_HEAD(&hdev->pend_le_conns);
+ INIT_LIST_HEAD(&hdev->pend_le_reports);
+ INIT_LIST_HEAD(&hdev->conn_hash.list);
+
+ INIT_WORK(&hdev->rx_work, hci_rx_work);
+ INIT_WORK(&hdev->cmd_work, hci_cmd_work);
+ INIT_WORK(&hdev->tx_work, hci_tx_work);
+ INIT_WORK(&hdev->power_on, hci_power_on);
+ INIT_WORK(&hdev->error_reset, hci_error_reset);
+
+ INIT_DELAYED_WORK(&hdev->power_off, hci_power_off);
+ INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off);
+ INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
+ INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
+
+ skb_queue_head_init(&hdev->rx_q);
+ skb_queue_head_init(&hdev->cmd_q);
+ skb_queue_head_init(&hdev->raw_q);
+
+ init_waitqueue_head(&hdev->req_wait_q);
+
+ INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
+
+ hci_init_sysfs(hdev);
+ discovery_init(hdev);
+ adv_info_init(hdev);
+
+ return hdev;
+}
+EXPORT_SYMBOL(hci_alloc_dev);
+
+/* Free HCI device */
+void hci_free_dev(struct hci_dev *hdev)
+{
+ /* will free via device release */
+ put_device(&hdev->dev);
+}
+EXPORT_SYMBOL(hci_free_dev);
+
+/* Register HCI device */
+int hci_register_dev(struct hci_dev *hdev)
+{
+ int id, error;
+
+ if (!hdev->open || !hdev->close || !hdev->send)
+ return -EINVAL;
+
+ /* Do not allow HCI_AMP devices to register at index 0,
+ * so the index can be used as the AMP controller ID.
+ */
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL);
+ break;
+ case HCI_AMP:
+ id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (id < 0)
+ return id;
+
+ sprintf(hdev->name, "hci%d", id);
+ hdev->id = id;
+
+ BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+ hdev->workqueue = alloc_workqueue("%s", WQ_HIGHPRI | WQ_UNBOUND |
+ WQ_MEM_RECLAIM, 1, hdev->name);
+ if (!hdev->workqueue) {
+ error = -ENOMEM;
+ goto err;
+ }
+
+ hdev->req_workqueue = alloc_workqueue("%s", WQ_HIGHPRI | WQ_UNBOUND |
+ WQ_MEM_RECLAIM, 1, hdev->name);
+ if (!hdev->req_workqueue) {
+ destroy_workqueue(hdev->workqueue);
+ error = -ENOMEM;
+ goto err;
+ }
+
+ if (!IS_ERR_OR_NULL(bt_debugfs))
+ hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs);
+
+ dev_set_name(&hdev->dev, "%s", hdev->name);
+
+ error = device_add(&hdev->dev);
+ if (error < 0)
+ goto err_wqueue;
+
+ hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
+ RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops,
+ hdev);
+ if (hdev->rfkill) {
+ if (rfkill_register(hdev->rfkill) < 0) {
+ rfkill_destroy(hdev->rfkill);
+ hdev->rfkill = NULL;
+ }
+ }
+
+ if (hdev->rfkill && rfkill_blocked(hdev->rfkill))
+ hci_dev_set_flag(hdev, HCI_RFKILLED);
+
+ hci_dev_set_flag(hdev, HCI_SETUP);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
+
+ if (hdev->dev_type == HCI_BREDR) {
+ /* Assume BR/EDR support until proven otherwise (such as
+ * through reading supported features during init.
+ */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
+ }
+
+ write_lock(&hci_dev_list_lock);
+ list_add(&hdev->list, &hci_dev_list);
+ write_unlock(&hci_dev_list_lock);
+
+ /* Devices that are marked for raw-only usage are unconfigured
+ * and should not be included in normal operation.
+ */
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
+
+ hci_notify(hdev, HCI_DEV_REG);
+ hci_dev_hold(hdev);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+
+ return id;
+
+err_wqueue:
+ destroy_workqueue(hdev->workqueue);
+ destroy_workqueue(hdev->req_workqueue);
+err:
+ ida_simple_remove(&hci_index_ida, hdev->id);
+
+ return error;
+}
+EXPORT_SYMBOL(hci_register_dev);
+
+/* Unregister HCI device */
+void hci_unregister_dev(struct hci_dev *hdev)
+{
+ int id;
+
+ BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+ hci_dev_set_flag(hdev, HCI_UNREGISTER);
+
+ id = hdev->id;
+
+ write_lock(&hci_dev_list_lock);
+ list_del(&hdev->list);
+ write_unlock(&hci_dev_list_lock);
+
+ hci_dev_do_close(hdev);
+
+ cancel_work_sync(&hdev->power_on);
+
+ if (!test_bit(HCI_INIT, &hdev->flags) &&
+ !hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hci_dev_lock(hdev);
+ mgmt_index_removed(hdev);
+ hci_dev_unlock(hdev);
+ }
+
+ /* mgmt_index_removed should take care of emptying the
+ * pending list */
+ BUG_ON(!list_empty(&hdev->mgmt_pending));
+
+ hci_notify(hdev, HCI_DEV_UNREG);
+
+ if (hdev->rfkill) {
+ rfkill_unregister(hdev->rfkill);
+ rfkill_destroy(hdev->rfkill);
+ }
+
+ device_del(&hdev->dev);
+
+ debugfs_remove_recursive(hdev->debugfs);
+
+ destroy_workqueue(hdev->workqueue);
+ destroy_workqueue(hdev->req_workqueue);
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_clear(&hdev->blacklist);
+ hci_bdaddr_list_clear(&hdev->whitelist);
+ hci_uuids_clear(hdev);
+ hci_link_keys_clear(hdev);
+ hci_smp_ltks_clear(hdev);
+ hci_smp_irks_clear(hdev);
+ hci_remote_oob_data_clear(hdev);
+ hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_conn_params_clear_all(hdev);
+ hci_discovery_filter_clear(hdev);
+ hci_dev_unlock(hdev);
+
+ hci_dev_put(hdev);
+
+ ida_simple_remove(&hci_index_ida, id);
+}
+EXPORT_SYMBOL(hci_unregister_dev);
+
+/* Suspend HCI device */
+int hci_suspend_dev(struct hci_dev *hdev)
+{
+ hci_notify(hdev, HCI_DEV_SUSPEND);
+ return 0;
+}
+EXPORT_SYMBOL(hci_suspend_dev);
+
+/* Resume HCI device */
+int hci_resume_dev(struct hci_dev *hdev)
+{
+ hci_notify(hdev, HCI_DEV_RESUME);
+ return 0;
+}
+EXPORT_SYMBOL(hci_resume_dev);
+
+/* Reset HCI device */
+int hci_reset_dev(struct hci_dev *hdev)
+{
+ const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 };
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(3, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
+ memcpy(skb_put(skb, 3), hw_err, 3);
+
+ /* Send Hardware Error to upper stack */
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_reset_dev);
+
+/* Receive frame from HCI drivers */
+int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ if (!hdev || (!test_bit(HCI_UP, &hdev->flags)
+ && !test_bit(HCI_INIT, &hdev->flags))) {
+ kfree_skb(skb);
+ return -ENXIO;
+ }
+
+ /* Incoming skb */
+ bt_cb(skb)->incoming = 1;
+
+ /* Time stamp */
+ __net_timestamp(skb);
+
+ skb_queue_tail(&hdev->rx_q, skb);
+ queue_work(hdev->workqueue, &hdev->rx_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_recv_frame);
+
+/* ---- Interface to upper protocols ---- */
+
+int hci_register_cb(struct hci_cb *cb)
+{
+ BT_DBG("%p name %s", cb, cb->name);
+
+ mutex_lock(&hci_cb_list_lock);
+ list_add_tail(&cb->list, &hci_cb_list);
+ mutex_unlock(&hci_cb_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_register_cb);
+
+int hci_unregister_cb(struct hci_cb *cb)
+{
+ BT_DBG("%p name %s", cb, cb->name);
+
+ mutex_lock(&hci_cb_list_lock);
+ list_del(&cb->list);
+ mutex_unlock(&hci_cb_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_unregister_cb);
+
+static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ int err;
+
+ BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
+
+ /* Time stamp */
+ __net_timestamp(skb);
+
+ /* Send copy to monitor */
+ hci_send_to_monitor(hdev, skb);
+
+ if (atomic_read(&hdev->promisc)) {
+ /* Send copy to the sockets */
+ hci_send_to_sock(hdev, skb);
+ }
+
+ /* Get rid of skb owner, prior to sending to the driver. */
+ skb_orphan(skb);
+
+ err = hdev->send(hdev, skb);
+ if (err < 0) {
+ BT_ERR("%s sending frame failed (%d)", hdev->name, err);
+ kfree_skb(skb);
+ }
+}
+
+/* Send HCI command */
+int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
+ const void *param)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
+
+ skb = hci_prepare_cmd(hdev, opcode, plen, param);
+ if (!skb) {
+ BT_ERR("%s no memory for command", hdev->name);
+ return -ENOMEM;
+ }
+
+ /* Stand-alone HCI commands must be flagged as
+ * single-command requests.
+ */
+ bt_cb(skb)->req.start = true;
+
+ skb_queue_tail(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+
+ return 0;
+}
+
+/* Get data from the previously sent command */
+void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+{
+ struct hci_command_hdr *hdr;
+
+ if (!hdev->sent_cmd)
+ return NULL;
+
+ hdr = (void *) hdev->sent_cmd->data;
+
+ if (hdr->opcode != cpu_to_le16(opcode))
+ return NULL;
+
+ BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode);
+
+ return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
+}
+
+/* Send ACL data */
+static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
+{
+ struct hci_acl_hdr *hdr;
+ int len = skb->len;
+
+ skb_push(skb, HCI_ACL_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ hdr = (struct hci_acl_hdr *)skb_transport_header(skb);
+ hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+ hdr->dlen = cpu_to_le16(len);
+}
+
+static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
+ struct sk_buff *skb, __u16 flags)
+{
+ struct hci_conn *conn = chan->conn;
+ struct hci_dev *hdev = conn->hdev;
+ struct sk_buff *list;
+
+ skb->len = skb_headlen(skb);
+ skb->data_len = 0;
+
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ hci_add_acl_hdr(skb, conn->handle, flags);
+ break;
+ case HCI_AMP:
+ hci_add_acl_hdr(skb, chan->handle, flags);
+ break;
+ default:
+ BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type);
+ return;
+ }
+
+ list = skb_shinfo(skb)->frag_list;
+ if (!list) {
+ /* Non fragmented */
+ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
+
+ skb_queue_tail(queue, skb);
+ } else {
+ /* Fragmented */
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ /* Queue all fragments atomically. We need to use spin_lock_bh
+ * here because of 6LoWPAN links, as there this function is
+ * called from softirq and using normal spin lock could cause
+ * deadlocks.
+ */
+ spin_lock_bh(&queue->lock);
+
+ __skb_queue_tail(queue, skb);
+
+ flags &= ~ACL_START;
+ flags |= ACL_CONT;
+ do {
+ skb = list; list = list->next;
+
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+ hci_add_acl_hdr(skb, conn->handle, flags);
+
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ __skb_queue_tail(queue, skb);
+ } while (list);
+
+ spin_unlock_bh(&queue->lock);
+ }
+}
+
+void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags)
+{
+ struct hci_dev *hdev = chan->conn->hdev;
+
+ BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags);
+
+ hci_queue_acl(chan, &chan->data_q, skb, flags);
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+/* Send SCO data */
+void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_sco_hdr hdr;
+
+ BT_DBG("%s len %d", hdev->name, skb->len);
+
+ hdr.handle = cpu_to_le16(conn->handle);
+ hdr.dlen = skb->len;
+
+ skb_push(skb, HCI_SCO_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE);
+
+ bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
+
+ skb_queue_tail(&conn->data_q, skb);
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+/* ---- HCI TX task (outgoing data) ---- */
+
+/* HCI Connection scheduler */
+static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
+ int *quote)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *conn = NULL, *c;
+ unsigned int num = 0, min = ~0;
+
+ /* We don't have to lock device here. Connections are always
+ * added and removed with TX task disabled. */
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type != type || skb_queue_empty(&c->data_q))
+ continue;
+
+ if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
+ continue;
+
+ num++;
+
+ if (c->sent < min) {
+ min = c->sent;
+ conn = c;
+ }
+
+ if (hci_conn_num(hdev, type) == num)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (conn) {
+ int cnt, q;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ cnt = hdev->acl_cnt;
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ cnt = hdev->sco_cnt;
+ break;
+ case LE_LINK:
+ cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+ break;
+ default:
+ cnt = 0;
+ BT_ERR("Unknown link type");
+ }
+
+ q = cnt / num;
+ *quote = q ? q : 1;
+ } else
+ *quote = 0;
+
+ BT_DBG("conn %p quote %d", conn, *quote);
+ return conn;
+}
+
+static void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *c;
+
+ BT_ERR("%s link tx timeout", hdev->name);
+
+ rcu_read_lock();
+
+ /* Kill stalled connections */
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type == type && c->sent) {
+ BT_ERR("%s killing stalled connection %pMR",
+ hdev->name, &c->dst);
+ hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
+ int *quote)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_chan *chan = NULL;
+ unsigned int num = 0, min = ~0, cur_prio = 0;
+ struct hci_conn *conn;
+ int cnt, q, conn_num = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &h->list, list) {
+ struct hci_chan *tmp;
+
+ if (conn->type != type)
+ continue;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ continue;
+
+ conn_num++;
+
+ list_for_each_entry_rcu(tmp, &conn->chan_list, list) {
+ struct sk_buff *skb;
+
+ if (skb_queue_empty(&tmp->data_q))
+ continue;
+
+ skb = skb_peek(&tmp->data_q);
+ if (skb->priority < cur_prio)
+ continue;
+
+ if (skb->priority > cur_prio) {
+ num = 0;
+ min = ~0;
+ cur_prio = skb->priority;
+ }
+
+ num++;
+
+ if (conn->sent < min) {
+ min = conn->sent;
+ chan = tmp;
+ }
+ }
+
+ if (hci_conn_num(hdev, type) == conn_num)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ if (!chan)
+ return NULL;
+
+ switch (chan->conn->type) {
+ case ACL_LINK:
+ cnt = hdev->acl_cnt;
+ break;
+ case AMP_LINK:
+ cnt = hdev->block_cnt;
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ cnt = hdev->sco_cnt;
+ break;
+ case LE_LINK:
+ cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+ break;
+ default:
+ cnt = 0;
+ BT_ERR("Unknown link type");
+ }
+
+ q = cnt / num;
+ *quote = q ? q : 1;
+ BT_DBG("chan %p quote %d", chan, *quote);
+ return chan;
+}
+
+static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *conn;
+ int num = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &h->list, list) {
+ struct hci_chan *chan;
+
+ if (conn->type != type)
+ continue;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ continue;
+
+ num++;
+
+ list_for_each_entry_rcu(chan, &conn->chan_list, list) {
+ struct sk_buff *skb;
+
+ if (chan->sent) {
+ chan->sent = 0;
+ continue;
+ }
+
+ if (skb_queue_empty(&chan->data_q))
+ continue;
+
+ skb = skb_peek(&chan->data_q);
+ if (skb->priority >= HCI_PRIO_MAX - 1)
+ continue;
+
+ skb->priority = HCI_PRIO_MAX - 1;
+
+ BT_DBG("chan %p skb %p promoted to %d", chan, skb,
+ skb->priority);
+ }
+
+ if (hci_conn_num(hdev, type) == num)
+ break;
+ }
+
+ rcu_read_unlock();
+
+}
+
+static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ /* Calculate count of blocks used by this packet */
+ return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len);
+}
+
+static void __check_timeout(struct hci_dev *hdev, unsigned int cnt)
+{
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ /* ACL tx timeout must be longer than maximum
+ * link supervision timeout (40.9 seconds) */
+ if (!cnt && time_after(jiffies, hdev->acl_last_tx +
+ HCI_ACL_TX_TIMEOUT))
+ hci_link_tx_to(hdev, ACL_LINK);
+ }
+}
+
+static void hci_sched_acl_pkt(struct hci_dev *hdev)
+{
+ unsigned int cnt = hdev->acl_cnt;
+ struct hci_chan *chan;
+ struct sk_buff *skb;
+ int quote;
+
+ __check_timeout(hdev, cnt);
+
+ while (hdev->acl_cnt &&
+ (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote-- && (skb = skb_peek(&chan->data_q))) {
+ BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+ skb->len, skb->priority);
+
+ /* Stop if priority has changed */
+ if (skb->priority < priority)
+ break;
+
+ skb = skb_dequeue(&chan->data_q);
+
+ hci_conn_enter_active_mode(chan->conn,
+ bt_cb(skb)->force_active);
+
+ hci_send_frame(hdev, skb);
+ hdev->acl_last_tx = jiffies;
+
+ hdev->acl_cnt--;
+ chan->sent++;
+ chan->conn->sent++;
+ }
+ }
+
+ if (cnt != hdev->acl_cnt)
+ hci_prio_recalculate(hdev, ACL_LINK);
+}
+
+static void hci_sched_acl_blk(struct hci_dev *hdev)
+{
+ unsigned int cnt = hdev->block_cnt;
+ struct hci_chan *chan;
+ struct sk_buff *skb;
+ int quote;
+ u8 type;
+
+ __check_timeout(hdev, cnt);
+
+ BT_DBG("%s", hdev->name);
+
+ if (hdev->dev_type == HCI_AMP)
+ type = AMP_LINK;
+ else
+ type = ACL_LINK;
+
+ while (hdev->block_cnt > 0 &&
+ (chan = hci_chan_sent(hdev, type, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote > 0 && (skb = skb_peek(&chan->data_q))) {
+ int blocks;
+
+ BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+ skb->len, skb->priority);
+
+ /* Stop if priority has changed */
+ if (skb->priority < priority)
+ break;
+
+ skb = skb_dequeue(&chan->data_q);
+
+ blocks = __get_blocks(hdev, skb);
+ if (blocks > hdev->block_cnt)
+ return;
+
+ hci_conn_enter_active_mode(chan->conn,
+ bt_cb(skb)->force_active);
+
+ hci_send_frame(hdev, skb);
+ hdev->acl_last_tx = jiffies;
+
+ hdev->block_cnt -= blocks;
+ quote -= blocks;
+
+ chan->sent += blocks;
+ chan->conn->sent += blocks;
+ }
+ }
+
+ if (cnt != hdev->block_cnt)
+ hci_prio_recalculate(hdev, type);
+}
+
+static void hci_sched_acl(struct hci_dev *hdev)
+{
+ BT_DBG("%s", hdev->name);
+
+ /* No ACL link over BR/EDR controller */
+ if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_BREDR)
+ return;
+
+ /* No AMP link over AMP controller */
+ if (!hci_conn_num(hdev, AMP_LINK) && hdev->dev_type == HCI_AMP)
+ return;
+
+ switch (hdev->flow_ctl_mode) {
+ case HCI_FLOW_CTL_MODE_PACKET_BASED:
+ hci_sched_acl_pkt(hdev);
+ break;
+
+ case HCI_FLOW_CTL_MODE_BLOCK_BASED:
+ hci_sched_acl_blk(hdev);
+ break;
+ }
+}
+
+/* Schedule SCO */
+static void hci_sched_sco(struct hci_dev *hdev)
+{
+ struct hci_conn *conn;
+ struct sk_buff *skb;
+ int quote;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_conn_num(hdev, SCO_LINK))
+ return;
+
+ while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
+ while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+ BT_DBG("skb %p len %d", skb, skb->len);
+ hci_send_frame(hdev, skb);
+
+ conn->sent++;
+ if (conn->sent == ~0)
+ conn->sent = 0;
+ }
+ }
+}
+
+static void hci_sched_esco(struct hci_dev *hdev)
+{
+ struct hci_conn *conn;
+ struct sk_buff *skb;
+ int quote;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_conn_num(hdev, ESCO_LINK))
+ return;
+
+ while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK,
+ &quote))) {
+ while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+ BT_DBG("skb %p len %d", skb, skb->len);
+ hci_send_frame(hdev, skb);
+
+ conn->sent++;
+ if (conn->sent == ~0)
+ conn->sent = 0;
+ }
+ }
+}
+
+static void hci_sched_le(struct hci_dev *hdev)
+{
+ struct hci_chan *chan;
+ struct sk_buff *skb;
+ int quote, cnt, tmp;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_conn_num(hdev, LE_LINK))
+ return;
+
+ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ /* LE tx timeout must be longer than maximum
+ * link supervision timeout (40.9 seconds) */
+ if (!hdev->le_cnt && hdev->le_pkts &&
+ time_after(jiffies, hdev->le_last_tx + HZ * 45))
+ hci_link_tx_to(hdev, LE_LINK);
+ }
+
+ cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
+ tmp = cnt;
+ while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote-- && (skb = skb_peek(&chan->data_q))) {
+ BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+ skb->len, skb->priority);
+
+ /* Stop if priority has changed */
+ if (skb->priority < priority)
+ break;
+
+ skb = skb_dequeue(&chan->data_q);
+
+ hci_send_frame(hdev, skb);
+ hdev->le_last_tx = jiffies;
+
+ cnt--;
+ chan->sent++;
+ chan->conn->sent++;
+ }
+ }
+
+ if (hdev->le_pkts)
+ hdev->le_cnt = cnt;
+ else
+ hdev->acl_cnt = cnt;
+
+ if (cnt != tmp)
+ hci_prio_recalculate(hdev, LE_LINK);
+}
+
+static void hci_tx_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work);
+ struct sk_buff *skb;
+
+ BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt,
+ hdev->sco_cnt, hdev->le_cnt);
+
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ /* Schedule queues and send stuff to HCI driver */
+ hci_sched_acl(hdev);
+ hci_sched_sco(hdev);
+ hci_sched_esco(hdev);
+ hci_sched_le(hdev);
+ }
+
+ /* Send next queued raw (unknown type) packet */
+ while ((skb = skb_dequeue(&hdev->raw_q)))
+ hci_send_frame(hdev, skb);
+}
+
+/* ----- HCI RX task (incoming data processing) ----- */
+
+/* ACL data packet */
+static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_acl_hdr *hdr = (void *) skb->data;
+ struct hci_conn *conn;
+ __u16 handle, flags;
+
+ skb_pull(skb, HCI_ACL_HDR_SIZE);
+
+ handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
+
+ BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len,
+ handle, flags);
+
+ hdev->stat.acl_rx++;
+
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ hci_dev_unlock(hdev);
+
+ if (conn) {
+ hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF);
+
+ /* Send to upper protocol */
+ l2cap_recv_acldata(conn, skb, flags);
+ return;
+ } else {
+ BT_ERR("%s ACL packet for unknown connection handle %d",
+ hdev->name, handle);
+ }
+
+ kfree_skb(skb);
+}
+
+/* SCO data packet */
+static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_sco_hdr *hdr = (void *) skb->data;
+ struct hci_conn *conn;
+ __u16 handle;
+
+ skb_pull(skb, HCI_SCO_HDR_SIZE);
+
+ handle = __le16_to_cpu(hdr->handle);
+
+ BT_DBG("%s len %d handle 0x%4.4x", hdev->name, skb->len, handle);
+
+ hdev->stat.sco_rx++;
+
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ hci_dev_unlock(hdev);
+
+ if (conn) {
+ /* Send to upper protocol */
+ sco_recv_scodata(conn, skb);
+ return;
+ } else {
+ BT_ERR("%s SCO packet for unknown connection handle %d",
+ hdev->name, handle);
+ }
+
+ kfree_skb(skb);
+}
+
+static bool hci_req_is_complete(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+
+ skb = skb_peek(&hdev->cmd_q);
+ if (!skb)
+ return true;
+
+ return bt_cb(skb)->req.start;
+}
+
+static void hci_resend_last(struct hci_dev *hdev)
+{
+ struct hci_command_hdr *sent;
+ struct sk_buff *skb;
+ u16 opcode;
+
+ if (!hdev->sent_cmd)
+ return;
+
+ sent = (void *) hdev->sent_cmd->data;
+ opcode = __le16_to_cpu(sent->opcode);
+ if (opcode == HCI_OP_RESET)
+ return;
+
+ skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ skb_queue_head(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ BT_DBG("opcode 0x%04x status 0x%02x", opcode, status);
+
+ /* If the completed command doesn't match the last one that was
+ * sent we need to do special handling of it.
+ */
+ if (!hci_sent_cmd_data(hdev, opcode)) {
+ /* Some CSR based controllers generate a spontaneous
+ * reset complete event during init and any pending
+ * command will never be completed. In such a case we
+ * need to resend whatever was the last sent
+ * command.
+ */
+ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET)
+ hci_resend_last(hdev);
+
+ return;
+ }
+
+ /* If the command succeeded and there's still more commands in
+ * this request the request is not yet complete.
+ */
+ if (!status && !hci_req_is_complete(hdev))
+ return;
+
+ /* If this was the last command in a request the complete
+ * callback would be found in hdev->sent_cmd instead of the
+ * command queue (hdev->cmd_q).
+ */
+ if (bt_cb(hdev->sent_cmd)->req.complete) {
+ *req_complete = bt_cb(hdev->sent_cmd)->req.complete;
+ return;
+ }
+
+ if (bt_cb(hdev->sent_cmd)->req.complete_skb) {
+ *req_complete_skb = bt_cb(hdev->sent_cmd)->req.complete_skb;
+ return;
+ }
+
+ /* Remove all pending commands belonging to this request */
+ spin_lock_irqsave(&hdev->cmd_q.lock, flags);
+ while ((skb = __skb_dequeue(&hdev->cmd_q))) {
+ if (bt_cb(skb)->req.start) {
+ __skb_queue_head(&hdev->cmd_q, skb);
+ break;
+ }
+
+ *req_complete = bt_cb(skb)->req.complete;
+ *req_complete_skb = bt_cb(skb)->req.complete_skb;
+ kfree_skb(skb);
+ }
+ spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
+}
+
+static void hci_rx_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work);
+ struct sk_buff *skb;
+
+ BT_DBG("%s", hdev->name);
+
+ while ((skb = skb_dequeue(&hdev->rx_q))) {
+ /* Send copy to monitor */
+ hci_send_to_monitor(hdev, skb);
+
+ if (atomic_read(&hdev->promisc)) {
+ /* Send copy to the sockets */
+ hci_send_to_sock(hdev, skb);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ if (test_bit(HCI_INIT, &hdev->flags)) {
+ /* Don't process data packets in this states. */
+ switch (bt_cb(skb)->pkt_type) {
+ case HCI_ACLDATA_PKT:
+ case HCI_SCODATA_PKT:
+ kfree_skb(skb);
+ continue;
+ }
+ }
+
+ /* Process frame */
+ switch (bt_cb(skb)->pkt_type) {
+ case HCI_EVENT_PKT:
+ BT_DBG("%s Event packet", hdev->name);
+ hci_event_packet(hdev, skb);
+ break;
+
+ case HCI_ACLDATA_PKT:
+ BT_DBG("%s ACL data packet", hdev->name);
+ hci_acldata_packet(hdev, skb);
+ break;
+
+ case HCI_SCODATA_PKT:
+ BT_DBG("%s SCO data packet", hdev->name);
+ hci_scodata_packet(hdev, skb);
+ break;
+
+ default:
+ kfree_skb(skb);
+ break;
+ }
+ }
+}
+
+static void hci_cmd_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
+ struct sk_buff *skb;
+
+ BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name,
+ atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q));
+
+ /* Send queued commands */
+ if (atomic_read(&hdev->cmd_cnt)) {
+ skb = skb_dequeue(&hdev->cmd_q);
+ if (!skb)
+ return;
+
+ kfree_skb(hdev->sent_cmd);
+
+ hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
+ if (hdev->sent_cmd) {
+ atomic_dec(&hdev->cmd_cnt);
+ hci_send_frame(hdev, skb);
+ if (test_bit(HCI_RESET, &hdev->flags))
+ cancel_delayed_work(&hdev->cmd_timer);
+ else
+ schedule_delayed_work(&hdev->cmd_timer,
+ HCI_CMD_TIMEOUT);
+ } else {
+ skb_queue_head(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+ }
+ }
+}
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
new file mode 100644
index 000000000..7db422094
--- /dev/null
+++ b/net/bluetooth/hci_debugfs.c
@@ -0,0 +1,1142 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "hci_debugfs.h"
+
+#define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \
+static ssize_t __name ## _read(struct file *file, \
+ char __user *user_buf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct hci_dev *hdev = file->private_data; \
+ char buf[3]; \
+ \
+ buf[0] = test_bit(__quirk, &hdev->quirks) ? 'Y' : 'N'; \
+ buf[1] = '\n'; \
+ buf[2] = '\0'; \
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \
+} \
+ \
+static ssize_t __name ## _write(struct file *file, \
+ const char __user *user_buf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct hci_dev *hdev = file->private_data; \
+ char buf[32]; \
+ size_t buf_size = min(count, (sizeof(buf) - 1)); \
+ bool enable; \
+ \
+ if (test_bit(HCI_UP, &hdev->flags)) \
+ return -EBUSY; \
+ \
+ if (copy_from_user(buf, user_buf, buf_size)) \
+ return -EFAULT; \
+ \
+ buf[buf_size] = '\0'; \
+ if (strtobool(buf, &enable)) \
+ return -EINVAL; \
+ \
+ if (enable == test_bit(__quirk, &hdev->quirks)) \
+ return -EALREADY; \
+ \
+ change_bit(__quirk, &hdev->quirks); \
+ \
+ return count; \
+} \
+ \
+static const struct file_operations __name ## _fops = { \
+ .open = simple_open, \
+ .read = __name ## _read, \
+ .write = __name ## _write, \
+ .llseek = default_llseek, \
+} \
+
+static int features_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ u8 p;
+
+ hci_dev_lock(hdev);
+ for (p = 0; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) {
+ seq_printf(f, "%2u: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x "
+ "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", p,
+ hdev->features[p][0], hdev->features[p][1],
+ hdev->features[p][2], hdev->features[p][3],
+ hdev->features[p][4], hdev->features[p][5],
+ hdev->features[p][6], hdev->features[p][7]);
+ }
+ if (lmp_le_capable(hdev))
+ seq_printf(f, "LE: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x "
+ "0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n",
+ hdev->le_features[0], hdev->le_features[1],
+ hdev->le_features[2], hdev->le_features[3],
+ hdev->le_features[4], hdev->le_features[5],
+ hdev->le_features[6], hdev->le_features[7]);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int features_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, features_show, inode->i_private);
+}
+
+static const struct file_operations features_fops = {
+ .open = features_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int device_id_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%4.4x:%4.4x:%4.4x:%4.4x\n", hdev->devid_source,
+ hdev->devid_vendor, hdev->devid_product, hdev->devid_version);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int device_id_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, device_id_show, inode->i_private);
+}
+
+static const struct file_operations device_id_fops = {
+ .open = device_id_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int device_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct hci_conn_params *p;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->whitelist, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ seq_printf(f, "%pMR (type %u) %u\n", &p->addr, p->addr_type,
+ p->auto_connect);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int device_list_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, device_list_show, inode->i_private);
+}
+
+static const struct file_operations device_list_fops = {
+ .open = device_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int blacklist_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->blacklist, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int blacklist_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, blacklist_show, inode->i_private);
+}
+
+static const struct file_operations blacklist_fops = {
+ .open = blacklist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int uuids_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct bt_uuid *uuid;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ u8 i, val[16];
+
+ /* The Bluetooth UUID values are stored in big endian,
+ * but with reversed byte order. So convert them into
+ * the right order for the %pUb modifier.
+ */
+ for (i = 0; i < 16; i++)
+ val[i] = uuid->uuid[15 - i];
+
+ seq_printf(f, "%pUb\n", val);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int uuids_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, uuids_show, inode->i_private);
+}
+
+static const struct file_operations uuids_fops = {
+ .open = uuids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int remote_oob_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct oob_data *data;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(data, &hdev->remote_oob_data, list) {
+ seq_printf(f, "%pMR (type %u) %u %*phN %*phN %*phN %*phN\n",
+ &data->bdaddr, data->bdaddr_type, data->present,
+ 16, data->hash192, 16, data->rand192,
+ 16, data->hash256, 16, data->rand256);
+ }
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int remote_oob_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, remote_oob_show, inode->i_private);
+}
+
+static const struct file_operations remote_oob_fops = {
+ .open = remote_oob_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int conn_info_min_age_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val == 0 || val > hdev->conn_info_max_age)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->conn_info_min_age = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_info_min_age_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->conn_info_min_age;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get,
+ conn_info_min_age_set, "%llu\n");
+
+static int conn_info_max_age_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val == 0 || val < hdev->conn_info_min_age)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->conn_info_max_age = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_info_max_age_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->conn_info_max_age;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get,
+ conn_info_max_age_set, "%llu\n");
+
+static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations use_debug_keys_fops = {
+ .open = simple_open,
+ .read = use_debug_keys_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations sc_only_mode_fops = {
+ .open = simple_open,
+ .read = sc_only_mode_read,
+ .llseek = default_llseek,
+};
+
+void hci_debugfs_create_common(struct hci_dev *hdev)
+{
+ debugfs_create_file("features", 0444, hdev->debugfs, hdev,
+ &features_fops);
+ debugfs_create_u16("manufacturer", 0444, hdev->debugfs,
+ &hdev->manufacturer);
+ debugfs_create_u8("hci_version", 0444, hdev->debugfs, &hdev->hci_ver);
+ debugfs_create_u16("hci_revision", 0444, hdev->debugfs, &hdev->hci_rev);
+ debugfs_create_u8("hardware_error", 0444, hdev->debugfs,
+ &hdev->hw_error_code);
+ debugfs_create_file("device_id", 0444, hdev->debugfs, hdev,
+ &device_id_fops);
+
+ debugfs_create_file("device_list", 0444, hdev->debugfs, hdev,
+ &device_list_fops);
+ debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev,
+ &blacklist_fops);
+ debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops);
+ debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev,
+ &remote_oob_fops);
+
+ debugfs_create_file("conn_info_min_age", 0644, hdev->debugfs, hdev,
+ &conn_info_min_age_fops);
+ debugfs_create_file("conn_info_max_age", 0644, hdev->debugfs, hdev,
+ &conn_info_max_age_fops);
+
+ if (lmp_ssp_capable(hdev) || lmp_le_capable(hdev))
+ debugfs_create_file("use_debug_keys", 0444, hdev->debugfs,
+ hdev, &use_debug_keys_fops);
+
+ if (lmp_sc_capable(hdev) || lmp_le_capable(hdev))
+ debugfs_create_file("sc_only_mode", 0444, hdev->debugfs,
+ hdev, &sc_only_mode_fops);
+}
+
+static int inquiry_cache_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct discovery_state *cache = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ hci_dev_lock(hdev);
+
+ list_for_each_entry(e, &cache->all, all) {
+ struct inquiry_data *data = &e->data;
+ seq_printf(f, "%pMR %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
+ &data->bdaddr,
+ data->pscan_rep_mode, data->pscan_period_mode,
+ data->pscan_mode, data->dev_class[2],
+ data->dev_class[1], data->dev_class[0],
+ __le16_to_cpu(data->clock_offset),
+ data->rssi, data->ssp_mode, e->timestamp);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int inquiry_cache_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, inquiry_cache_show, inode->i_private);
+}
+
+static const struct file_operations inquiry_cache_fops = {
+ .open = inquiry_cache_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int link_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct link_key *key;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(key, &hdev->link_keys, list)
+ seq_printf(f, "%pMR %u %*phN %u\n", &key->bdaddr, key->type,
+ HCI_LINK_KEY_SIZE, key->val, key->pin_len);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int link_keys_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, link_keys_show, inode->i_private);
+}
+
+static const struct file_operations link_keys_fops = {
+ .open = link_keys_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dev_class_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "0x%.2x%.2x%.2x\n", hdev->dev_class[2],
+ hdev->dev_class[1], hdev->dev_class[0]);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int dev_class_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dev_class_show, inode->i_private);
+}
+
+static const struct file_operations dev_class_fops = {
+ .open = dev_class_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int voice_setting_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->voice_setting;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(voice_setting_fops, voice_setting_get,
+ NULL, "0x%4.4llx\n");
+
+static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hdev->ssp_debug_mode ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static const struct file_operations ssp_debug_mode_fops = {
+ .open = simple_open,
+ .read = ssp_debug_mode_read,
+ .llseek = default_llseek,
+};
+
+static int auto_accept_delay_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ hdev->auto_accept_delay = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int auto_accept_delay_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->auto_accept_delay;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
+ auto_accept_delay_set, "%llu\n");
+
+static int idle_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val != 0 && (val < 500 || val > 3600000))
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->idle_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int idle_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->idle_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(idle_timeout_fops, idle_timeout_get,
+ idle_timeout_set, "%llu\n");
+
+static int sniff_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val == 0 || val % 2 || val > hdev->sniff_max_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->sniff_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int sniff_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->sniff_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get,
+ sniff_min_interval_set, "%llu\n");
+
+static int sniff_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val == 0 || val % 2 || val < hdev->sniff_min_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->sniff_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int sniff_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->sniff_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get,
+ sniff_max_interval_set, "%llu\n");
+
+void hci_debugfs_create_bredr(struct hci_dev *hdev)
+{
+ debugfs_create_file("inquiry_cache", 0444, hdev->debugfs, hdev,
+ &inquiry_cache_fops);
+ debugfs_create_file("link_keys", 0400, hdev->debugfs, hdev,
+ &link_keys_fops);
+ debugfs_create_file("dev_class", 0444, hdev->debugfs, hdev,
+ &dev_class_fops);
+ debugfs_create_file("voice_setting", 0444, hdev->debugfs, hdev,
+ &voice_setting_fops);
+
+ if (lmp_ssp_capable(hdev)) {
+ debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs,
+ hdev, &ssp_debug_mode_fops);
+ debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs,
+ hdev, &auto_accept_delay_fops);
+ }
+
+ if (lmp_sniff_capable(hdev)) {
+ debugfs_create_file("idle_timeout", 0644, hdev->debugfs,
+ hdev, &idle_timeout_fops);
+ debugfs_create_file("sniff_min_interval", 0644, hdev->debugfs,
+ hdev, &sniff_min_interval_fops);
+ debugfs_create_file("sniff_max_interval", 0644, hdev->debugfs,
+ hdev, &sniff_max_interval_fops);
+ }
+}
+
+static int identity_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ bdaddr_t addr;
+ u8 addr_type;
+
+ hci_dev_lock(hdev);
+
+ hci_copy_identity_address(hdev, &addr, &addr_type);
+
+ seq_printf(f, "%pMR (type %u) %*phN %pMR\n", &addr, addr_type,
+ 16, hdev->irk, &hdev->rpa);
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int identity_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, identity_show, inode->i_private);
+}
+
+static const struct file_operations identity_fops = {
+ .open = identity_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int rpa_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ /* Require the RPA timeout to be at least 30 seconds and at most
+ * 24 hours.
+ */
+ if (val < 30 || val > (60 * 60 * 24))
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->rpa_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int rpa_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->rpa_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get,
+ rpa_timeout_set, "%llu\n");
+
+static int random_address_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%pMR\n", &hdev->random_addr);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int random_address_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, random_address_show, inode->i_private);
+}
+
+static const struct file_operations random_address_fops = {
+ .open = random_address_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int static_address_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+
+ hci_dev_lock(hdev);
+ seq_printf(f, "%pMR\n", &hdev->static_addr);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int static_address_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, static_address_show, inode->i_private);
+}
+
+static const struct file_operations static_address_fops = {
+ .open = static_address_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t force_static_address_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_static_address_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf)-1));
+ bool enable;
+
+ if (test_bit(HCI_UP, &hdev->flags))
+ return -EBUSY;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ if (strtobool(buf, &enable))
+ return -EINVAL;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
+ return -EALREADY;
+
+ hci_dev_change_flag(hdev, HCI_FORCE_STATIC_ADDR);
+
+ return count;
+}
+
+static const struct file_operations force_static_address_fops = {
+ .open = simple_open,
+ .read = force_static_address_read,
+ .write = force_static_address_write,
+ .llseek = default_llseek,
+};
+
+static int white_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->le_white_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int white_list_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, white_list_show, inode->i_private);
+}
+
+static const struct file_operations white_list_fops = {
+ .open = white_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct smp_irk *irk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
+ seq_printf(f, "%pMR (type %u) %*phN %pMR\n",
+ &irk->bdaddr, irk->addr_type,
+ 16, irk->val, &irk->rpa);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int identity_resolving_keys_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, identity_resolving_keys_show,
+ inode->i_private);
+}
+
+static const struct file_operations identity_resolving_keys_fops = {
+ .open = identity_resolving_keys_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int long_term_keys_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct smp_ltk *ltk;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ltk, &hdev->long_term_keys, list)
+ seq_printf(f, "%pMR (type %u) %u 0x%02x %u %.4x %.16llx %*phN\n",
+ &ltk->bdaddr, ltk->bdaddr_type, ltk->authenticated,
+ ltk->type, ltk->enc_size, __le16_to_cpu(ltk->ediv),
+ __le64_to_cpu(ltk->rand), 16, ltk->val);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int long_term_keys_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, long_term_keys_show, inode->i_private);
+}
+
+static const struct file_operations long_term_keys_fops = {
+ .open = long_term_keys_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int conn_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_conn_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get,
+ conn_min_interval_set, "%llu\n");
+
+static int conn_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_conn_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get,
+ conn_max_interval_set, "%llu\n");
+
+static int conn_latency_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val > 0x01f3)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_conn_latency = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int conn_latency_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_conn_latency;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(conn_latency_fops, conn_latency_get,
+ conn_latency_set, "%llu\n");
+
+static int supervision_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x000a || val > 0x0c80)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_supv_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int supervision_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_supv_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get,
+ supervision_timeout_set, "%llu\n");
+
+static int adv_channel_map_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x01 || val > 0x07)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_adv_channel_map = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_channel_map_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_channel_map;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get,
+ adv_channel_map_set, "%llu\n");
+
+static int adv_min_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_adv_min_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_min_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_min_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get,
+ adv_min_interval_set, "%llu\n");
+
+static int adv_max_interval_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_adv_max_interval = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int adv_max_interval_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_adv_max_interval;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
+ adv_max_interval_set, "%llu\n");
+
+DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter,
+ HCI_QUIRK_STRICT_DUPLICATE_FILTER);
+DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery,
+ HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+
+void hci_debugfs_create_le(struct hci_dev *hdev)
+{
+ debugfs_create_file("identity", 0400, hdev->debugfs, hdev,
+ &identity_fops);
+ debugfs_create_file("rpa_timeout", 0644, hdev->debugfs, hdev,
+ &rpa_timeout_fops);
+ debugfs_create_file("random_address", 0444, hdev->debugfs, hdev,
+ &random_address_fops);
+ debugfs_create_file("static_address", 0444, hdev->debugfs, hdev,
+ &static_address_fops);
+
+ /* For controllers with a public address, provide a debug
+ * option to force the usage of the configured static
+ * address. By default the public address is used.
+ */
+ if (bacmp(&hdev->bdaddr, BDADDR_ANY))
+ debugfs_create_file("force_static_address", 0644,
+ hdev->debugfs, hdev,
+ &force_static_address_fops);
+
+ debugfs_create_u8("white_list_size", 0444, hdev->debugfs,
+ &hdev->le_white_list_size);
+ debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
+ &white_list_fops);
+ debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs,
+ hdev, &identity_resolving_keys_fops);
+ debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev,
+ &long_term_keys_fops);
+ debugfs_create_file("conn_min_interval", 0644, hdev->debugfs, hdev,
+ &conn_min_interval_fops);
+ debugfs_create_file("conn_max_interval", 0644, hdev->debugfs, hdev,
+ &conn_max_interval_fops);
+ debugfs_create_file("conn_latency", 0644, hdev->debugfs, hdev,
+ &conn_latency_fops);
+ debugfs_create_file("supervision_timeout", 0644, hdev->debugfs, hdev,
+ &supervision_timeout_fops);
+ debugfs_create_file("adv_channel_map", 0644, hdev->debugfs, hdev,
+ &adv_channel_map_fops);
+ debugfs_create_file("adv_min_interval", 0644, hdev->debugfs, hdev,
+ &adv_min_interval_fops);
+ debugfs_create_file("adv_max_interval", 0644, hdev->debugfs, hdev,
+ &adv_max_interval_fops);
+ debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
+ &hdev->discov_interleaved_timeout);
+
+ debugfs_create_file("quirk_strict_duplicate_filter", 0644,
+ hdev->debugfs, hdev,
+ &quirk_strict_duplicate_filter_fops);
+ debugfs_create_file("quirk_simultaneous_discovery", 0644,
+ hdev->debugfs, hdev,
+ &quirk_simultaneous_discovery_fops);
+}
+
+void hci_debugfs_create_conn(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ char name[6];
+
+ if (IS_ERR_OR_NULL(hdev->debugfs))
+ return;
+
+ snprintf(name, sizeof(name), "%u", conn->handle);
+ conn->debugfs = debugfs_create_dir(name, hdev->debugfs);
+}
diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h
new file mode 100644
index 000000000..4444dc8ce
--- /dev/null
+++ b/net/bluetooth/hci_debugfs.h
@@ -0,0 +1,48 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#if IS_ENABLED(CONFIG_BT_DEBUGFS)
+
+void hci_debugfs_create_common(struct hci_dev *hdev);
+void hci_debugfs_create_bredr(struct hci_dev *hdev);
+void hci_debugfs_create_le(struct hci_dev *hdev);
+void hci_debugfs_create_conn(struct hci_conn *conn);
+
+#else
+
+static inline void hci_debugfs_create_common(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_bredr(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_le(struct hci_dev *hdev)
+{
+}
+
+static inline void hci_debugfs_create_conn(struct hci_conn *conn)
+{
+}
+
+#endif
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
new file mode 100644
index 000000000..7b61be736
--- /dev/null
+++ b/net/bluetooth/hci_event.c
@@ -0,0 +1,5404 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI event handling. */
+
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "hci_request.h"
+#include "hci_debugfs.h"
+#include "a2mp.h"
+#include "amp.h"
+#include "smp.h"
+
+#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+
+/* Handle HCI Event packets */
+
+static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ clear_bit(HCI_INQUIRY, &hdev->flags);
+ smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */
+ wake_up_bit(&hdev->flags, HCI_INQUIRY);
+
+ hci_dev_lock(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ hci_dev_unlock(hdev);
+
+ hci_conn_check_pending(hdev);
+}
+
+static void hci_cc_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ hci_dev_set_flag(hdev, HCI_PERIODIC_INQ);
+}
+
+static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);
+
+ hci_conn_check_pending(hdev);
+}
+
+static void hci_cc_remote_name_req_cancel(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ BT_DBG("%s", hdev->name);
+}
+
+static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_role_discovery *rp = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->role = rp->role;
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_link_policy *rp = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->link_policy = __le16_to_cpu(rp->policy);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_write_link_policy *rp = (void *) skb->data;
+ struct hci_conn *conn;
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LINK_POLICY);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->link_policy = get_unaligned_le16(sent + 2);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_def_link_policy(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_def_link_policy *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->link_policy = __le16_to_cpu(rp->policy);
+}
+
+static void hci_cc_write_def_link_policy(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY);
+ if (!sent)
+ return;
+
+ hdev->link_policy = get_unaligned_le16(sent);
+}
+
+static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ clear_bit(HCI_RESET, &hdev->flags);
+
+ if (status)
+ return;
+
+ /* Reset all non-persistent flags */
+ hci_dev_clear_volatile_flags(hdev);
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ hdev->inq_tx_power = HCI_TX_POWER_INVALID;
+ hdev->adv_tx_power = HCI_TX_POWER_INVALID;
+
+ memset(hdev->adv_data, 0, sizeof(hdev->adv_data));
+ hdev->adv_data_len = 0;
+
+ memset(hdev->scan_rsp_data, 0, sizeof(hdev->scan_rsp_data));
+ hdev->scan_rsp_data_len = 0;
+
+ hdev->le_scan_type = LE_SCAN_PASSIVE;
+
+ hdev->ssp_debug_mode = 0;
+
+ hci_bdaddr_list_clear(&hdev->le_white_list);
+}
+
+static void hci_cc_read_stored_link_key(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_stored_link_key *rp = (void *)skb->data;
+ struct hci_cp_read_stored_link_key *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_READ_STORED_LINK_KEY);
+ if (!sent)
+ return;
+
+ if (!rp->status && sent->read_all == 0x01) {
+ hdev->stored_max_keys = rp->max_keys;
+ hdev->stored_num_keys = rp->num_keys;
+ }
+}
+
+static void hci_cc_delete_stored_link_key(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_delete_stored_link_key *rp = (void *)skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (rp->num_keys <= hdev->stored_num_keys)
+ hdev->stored_num_keys -= rp->num_keys;
+ else
+ hdev->stored_num_keys = 0;
+}
+
+static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LOCAL_NAME);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_set_local_name_complete(hdev, sent, status);
+ else if (!status)
+ memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_local_name *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG))
+ memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH);
+}
+
+static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_ENABLE);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!status) {
+ __u8 param = *((__u8 *) sent);
+
+ if (param == AUTH_ENABLED)
+ set_bit(HCI_AUTH, &hdev->flags);
+ else
+ clear_bit(HCI_AUTH, &hdev->flags);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_auth_enable_complete(hdev, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ __u8 param;
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE);
+ if (!sent)
+ return;
+
+ param = *((__u8 *) sent);
+
+ if (param)
+ set_bit(HCI_ENCRYPT, &hdev->flags);
+ else
+ clear_bit(HCI_ENCRYPT, &hdev->flags);
+}
+
+static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ __u8 param;
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SCAN_ENABLE);
+ if (!sent)
+ return;
+
+ param = *((__u8 *) sent);
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ hdev->discov_timeout = 0;
+ goto done;
+ }
+
+ if (param & SCAN_INQUIRY)
+ set_bit(HCI_ISCAN, &hdev->flags);
+ else
+ clear_bit(HCI_ISCAN, &hdev->flags);
+
+ if (param & SCAN_PAGE)
+ set_bit(HCI_PSCAN, &hdev->flags);
+ else
+ clear_bit(HCI_PSCAN, &hdev->flags);
+
+done:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_class_of_dev *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ memcpy(hdev->dev_class, rp->dev_class, 3);
+
+ BT_DBG("%s class 0x%.2x%.2x%.2x", hdev->name,
+ hdev->dev_class[2], hdev->dev_class[1], hdev->dev_class[0]);
+}
+
+static void hci_cc_write_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_CLASS_OF_DEV);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (status == 0)
+ memcpy(hdev->dev_class, sent, 3);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_set_class_of_dev_complete(hdev, sent, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_voice_setting(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_voice_setting *rp = (void *) skb->data;
+ __u16 setting;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ setting = __le16_to_cpu(rp->voice_setting);
+
+ if (hdev->voice_setting == setting)
+ return;
+
+ hdev->voice_setting = setting;
+
+ BT_DBG("%s voice setting 0x%4.4x", hdev->name, setting);
+
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+}
+
+static void hci_cc_write_voice_setting(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ __u16 setting;
+ void *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_VOICE_SETTING);
+ if (!sent)
+ return;
+
+ setting = get_unaligned_le16(sent);
+
+ if (hdev->voice_setting == setting)
+ return;
+
+ hdev->voice_setting = setting;
+
+ BT_DBG("%s voice setting 0x%4.4x", hdev->name, setting);
+
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+}
+
+static void hci_cc_read_num_supported_iac(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_num_supported_iac *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->num_iac = rp->num_iac;
+
+ BT_DBG("%s num iac %d", hdev->name, hdev->num_iac);
+}
+
+static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_write_ssp_mode *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_MODE);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!status) {
+ if (sent->mode)
+ hdev->features[1][0] |= LMP_HOST_SSP;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_SSP;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_ssp_enable_complete(hdev, sent->mode, status);
+ else if (!status) {
+ if (sent->mode)
+ hci_dev_set_flag(hdev, HCI_SSP_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_sc_support(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 status = *((u8 *) skb->data);
+ struct hci_cp_write_sc_support *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SC_SUPPORT);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!status) {
+ if (sent->support)
+ hdev->features[1][0] |= LMP_HOST_SC;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_SC;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT) && !status) {
+ if (sent->support)
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_local_version *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hdev->hci_ver = rp->hci_ver;
+ hdev->hci_rev = __le16_to_cpu(rp->hci_rev);
+ hdev->lmp_ver = rp->lmp_ver;
+ hdev->manufacturer = __le16_to_cpu(rp->manufacturer);
+ hdev->lmp_subver = __le16_to_cpu(rp->lmp_subver);
+ }
+}
+
+static void hci_cc_read_local_commands(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_commands *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG))
+ memcpy(hdev->commands, rp->commands, sizeof(hdev->commands));
+}
+
+static void hci_cc_read_local_features(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_features *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ memcpy(hdev->features, rp->features, 8);
+
+ /* Adjust default settings according to features
+ * supported by device. */
+
+ if (hdev->features[0][0] & LMP_3SLOT)
+ hdev->pkt_type |= (HCI_DM3 | HCI_DH3);
+
+ if (hdev->features[0][0] & LMP_5SLOT)
+ hdev->pkt_type |= (HCI_DM5 | HCI_DH5);
+
+ if (hdev->features[0][1] & LMP_HV2) {
+ hdev->pkt_type |= (HCI_HV2);
+ hdev->esco_type |= (ESCO_HV2);
+ }
+
+ if (hdev->features[0][1] & LMP_HV3) {
+ hdev->pkt_type |= (HCI_HV3);
+ hdev->esco_type |= (ESCO_HV3);
+ }
+
+ if (lmp_esco_capable(hdev))
+ hdev->esco_type |= (ESCO_EV3);
+
+ if (hdev->features[0][4] & LMP_EV4)
+ hdev->esco_type |= (ESCO_EV4);
+
+ if (hdev->features[0][4] & LMP_EV5)
+ hdev->esco_type |= (ESCO_EV5);
+
+ if (hdev->features[0][5] & LMP_EDR_ESCO_2M)
+ hdev->esco_type |= (ESCO_2EV3);
+
+ if (hdev->features[0][5] & LMP_EDR_ESCO_3M)
+ hdev->esco_type |= (ESCO_3EV3);
+
+ if (hdev->features[0][5] & LMP_EDR_3S_ESCO)
+ hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+}
+
+static void hci_cc_read_local_ext_features(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_ext_features *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (hdev->max_page < rp->max_page)
+ hdev->max_page = rp->max_page;
+
+ if (rp->page < HCI_MAX_PAGES)
+ memcpy(hdev->features[rp->page], rp->features, 8);
+}
+
+static void hci_cc_read_flow_control_mode(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_flow_control_mode *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->flow_ctl_mode = rp->mode;
+}
+
+static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_buffer_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->acl_mtu = __le16_to_cpu(rp->acl_mtu);
+ hdev->sco_mtu = rp->sco_mtu;
+ hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt);
+ hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt);
+
+ if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) {
+ hdev->sco_mtu = 64;
+ hdev->sco_pkts = 8;
+ }
+
+ hdev->acl_cnt = hdev->acl_pkts;
+ hdev->sco_cnt = hdev->sco_pkts;
+
+ BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, hdev->acl_mtu,
+ hdev->acl_pkts, hdev->sco_mtu, hdev->sco_pkts);
+}
+
+static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_bd_addr *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (test_bit(HCI_INIT, &hdev->flags))
+ bacpy(&hdev->bdaddr, &rp->bdaddr);
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ bacpy(&hdev->setup_addr, &rp->bdaddr);
+}
+
+static void hci_cc_read_page_scan_activity(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_page_scan_activity *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (test_bit(HCI_INIT, &hdev->flags)) {
+ hdev->page_scan_interval = __le16_to_cpu(rp->interval);
+ hdev->page_scan_window = __le16_to_cpu(rp->window);
+ }
+}
+
+static void hci_cc_write_page_scan_activity(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u8 status = *((u8 *) skb->data);
+ struct hci_cp_write_page_scan_activity *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY);
+ if (!sent)
+ return;
+
+ hdev->page_scan_interval = __le16_to_cpu(sent->interval);
+ hdev->page_scan_window = __le16_to_cpu(sent->window);
+}
+
+static void hci_cc_read_page_scan_type(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_page_scan_type *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ if (test_bit(HCI_INIT, &hdev->flags))
+ hdev->page_scan_type = rp->type;
+}
+
+static void hci_cc_write_page_scan_type(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u8 status = *((u8 *) skb->data);
+ u8 *type;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ type = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE);
+ if (type)
+ hdev->page_scan_type = *type;
+}
+
+static void hci_cc_read_data_block_size(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_data_block_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->block_mtu = __le16_to_cpu(rp->max_acl_len);
+ hdev->block_len = __le16_to_cpu(rp->block_len);
+ hdev->num_blocks = __le16_to_cpu(rp->num_blocks);
+
+ hdev->block_cnt = hdev->num_blocks;
+
+ BT_DBG("%s blk mtu %d cnt %d len %d", hdev->name, hdev->block_mtu,
+ hdev->block_cnt, hdev->block_len);
+}
+
+static void hci_cc_read_clock(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_clock *rp = (void *) skb->data;
+ struct hci_cp_read_clock *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ if (skb->len < sizeof(*rp))
+ return;
+
+ if (rp->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK);
+ if (!cp)
+ goto unlock;
+
+ if (cp->which == 0x00) {
+ hdev->clock = le32_to_cpu(rp->clock);
+ goto unlock;
+ }
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn) {
+ conn->clock = le32_to_cpu(rp->clock);
+ conn->clock_accuracy = le16_to_cpu(rp->accuracy);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_amp_info *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ goto a2mp_rsp;
+
+ hdev->amp_status = rp->amp_status;
+ hdev->amp_total_bw = __le32_to_cpu(rp->total_bw);
+ hdev->amp_max_bw = __le32_to_cpu(rp->max_bw);
+ hdev->amp_min_latency = __le32_to_cpu(rp->min_latency);
+ hdev->amp_max_pdu = __le32_to_cpu(rp->max_pdu);
+ hdev->amp_type = rp->amp_type;
+ hdev->amp_pal_cap = __le16_to_cpu(rp->pal_cap);
+ hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size);
+ hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to);
+ hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to);
+
+a2mp_rsp:
+ a2mp_send_getinfo_rsp(hdev);
+}
+
+static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_amp_assoc *rp = (void *) skb->data;
+ struct amp_assoc *assoc = &hdev->loc_assoc;
+ size_t rem_len, frag_len;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ goto a2mp_rsp;
+
+ frag_len = skb->len - sizeof(*rp);
+ rem_len = __le16_to_cpu(rp->rem_len);
+
+ if (rem_len > frag_len) {
+ BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
+
+ memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
+ assoc->offset += frag_len;
+
+ /* Read other fragments */
+ amp_read_loc_assoc_frag(hdev, rp->phy_handle);
+
+ return;
+ }
+
+ memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
+ assoc->len = assoc->offset + rem_len;
+ assoc->offset = 0;
+
+a2mp_rsp:
+ /* Send A2MP Rsp when all fragments are received */
+ a2mp_send_getampassoc_rsp(hdev, rp->status);
+ a2mp_send_create_phy_link_req(hdev, rp->status);
+}
+
+static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_inq_rsp_tx_power *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->inq_tx_power = rp->tx_power;
+}
+
+static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_pin_code_reply *rp = (void *) skb->data;
+ struct hci_cp_pin_code_reply *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_pin_code_reply_complete(hdev, &rp->bdaddr, rp->status);
+
+ if (rp->status)
+ goto unlock;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_PIN_CODE_REPLY);
+ if (!cp)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+ if (conn)
+ conn->pin_length = cp->pin_len;
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_pin_code_neg_reply *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_pin_code_neg_reply_complete(hdev, &rp->bdaddr,
+ rp->status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_read_buffer_size(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_buffer_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->le_mtu = __le16_to_cpu(rp->le_mtu);
+ hdev->le_pkts = rp->le_max_pkt;
+
+ hdev->le_cnt = hdev->le_pkts;
+
+ BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts);
+}
+
+static void hci_cc_le_read_local_features(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_local_features *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ memcpy(hdev->le_features, rp->features, 8);
+}
+
+static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_adv_tx_power *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->adv_tx_power = rp->tx_power;
+}
+
+static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_confirm_reply_complete(hdev, &rp->bdaddr, ACL_LINK, 0,
+ rp->status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_confirm_neg_reply_complete(hdev, &rp->bdaddr,
+ ACL_LINK, 0, rp->status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_user_passkey_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_reply_complete(hdev, &rp->bdaddr, ACL_LINK,
+ 0, rp->status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_user_passkey_neg_reply(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_neg_reply_complete(hdev, &rp->bdaddr,
+ ACL_LINK, 0, rp->status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_local_oob_data(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+}
+
+static void hci_cc_read_local_oob_ext_data(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+}
+
+static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ bdaddr_t *sent;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_RANDOM_ADDR);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ bacpy(&hdev->random_addr, sent);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 *sent, status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_ENABLE);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ /* If we're doing connection initiation as peripheral. Set a
+ * timeout in case something goes wrong.
+ */
+ if (*sent) {
+ struct hci_conn *conn;
+
+ hci_dev_set_flag(hdev, HCI_LE_ADV);
+
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (conn)
+ queue_delayed_work(hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_cp_le_set_scan_param *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_PARAM);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_type = cp->type;
+
+ hci_dev_unlock(hdev);
+}
+
+static bool has_pending_adv_report(struct hci_dev *hdev)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ return bacmp(&d->last_adv_addr, BDADDR_ANY);
+}
+
+static void clear_pending_adv_report(struct hci_dev *hdev)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ bacpy(&d->last_adv_addr, BDADDR_ANY);
+ d->last_adv_data_len = 0;
+}
+
+static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, s8 rssi, u32 flags,
+ u8 *data, u8 len)
+{
+ struct discovery_state *d = &hdev->discovery;
+
+ bacpy(&d->last_adv_addr, bdaddr);
+ d->last_adv_addr_type = bdaddr_type;
+ d->last_adv_rssi = rssi;
+ d->last_adv_flags = flags;
+ memcpy(d->last_adv_data, data, len);
+ d->last_adv_data_len = len;
+}
+
+static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_scan_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ switch (cp->enable) {
+ case LE_SCAN_ENABLE:
+ hci_dev_set_flag(hdev, HCI_LE_SCAN);
+ if (hdev->le_scan_type == LE_SCAN_ACTIVE)
+ clear_pending_adv_report(hdev);
+ break;
+
+ case LE_SCAN_DISABLE:
+ /* We do this here instead of when setting DISCOVERY_STOPPED
+ * since the latter would potentially require waiting for
+ * inquiry to stop too.
+ */
+ if (has_pending_adv_report(hdev)) {
+ struct discovery_state *d = &hdev->discovery;
+
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL,
+ d->last_adv_rssi, d->last_adv_flags,
+ d->last_adv_data,
+ d->last_adv_data_len, NULL, 0);
+ }
+
+ /* Cancel this timer so that we don't try to disable scanning
+ * when it's already disabled.
+ */
+ cancel_delayed_work(&hdev->le_scan_disable);
+
+ hci_dev_clear_flag(hdev, HCI_LE_SCAN);
+
+ /* The HCI_LE_SCAN_INTERRUPTED flag indicates that we
+ * interrupted scanning due to a connect request. Mark
+ * therefore discovery as stopped. If this was not
+ * because of a connect request advertising might have
+ * been disabled because of active scanning, so
+ * re-enable it again if necessary.
+ */
+ if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) &&
+ hdev->discovery.state == DISCOVERY_FINDING)
+ mgmt_reenable_advertising(hdev);
+
+ break;
+
+ default:
+ BT_ERR("Used reserved LE_Scan_Enable param %d", cp->enable);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_read_white_list_size(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_white_list_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
+
+ if (rp->status)
+ return;
+
+ hdev->le_white_list_size = rp->size;
+}
+
+static void hci_cc_le_clear_white_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ hci_bdaddr_list_clear(&hdev->le_white_list);
+}
+
+static void hci_cc_le_add_to_white_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_add_to_white_list *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST);
+ if (!sent)
+ return;
+
+ hci_bdaddr_list_add(&hdev->le_white_list, &sent->bdaddr,
+ sent->bdaddr_type);
+}
+
+static void hci_cc_le_del_from_white_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_del_from_white_list *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST);
+ if (!sent)
+ return;
+
+ hci_bdaddr_list_del(&hdev->le_white_list, &sent->bdaddr,
+ sent->bdaddr_type);
+}
+
+static void hci_cc_le_read_supported_states(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_supported_states *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ memcpy(hdev->le_states, rp->le_states, 8);
+}
+
+static void hci_cc_le_read_def_data_len(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_def_data_len *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->le_def_tx_len = le16_to_cpu(rp->tx_len);
+ hdev->le_def_tx_time = le16_to_cpu(rp->tx_time);
+}
+
+static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_write_def_data_len *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN);
+ if (!sent)
+ return;
+
+ hdev->le_def_tx_len = le16_to_cpu(sent->tx_len);
+ hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
+}
+
+static void hci_cc_le_read_max_data_len(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_max_data_len *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hdev->le_max_tx_len = le16_to_cpu(rp->tx_len);
+ hdev->le_max_tx_time = le16_to_cpu(rp->tx_time);
+ hdev->le_max_rx_len = le16_to_cpu(rp->rx_len);
+ hdev->le_max_rx_time = le16_to_cpu(rp->rx_time);
+}
+
+static void hci_cc_write_le_host_supported(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_write_le_host_supported *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (sent->le) {
+ hdev->features[1][0] |= LMP_HOST_LE;
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ } else {
+ hdev->features[1][0] &= ~LMP_HOST_LE;
+ hci_dev_clear_flag(hdev, HCI_LE_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+ }
+
+ if (sent->simul)
+ hdev->features[1][0] |= LMP_HOST_LE_BREDR;
+ else
+ hdev->features[1][0] &= ~LMP_HOST_LE_BREDR;
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_cp_le_set_adv_param *cp;
+ u8 status = *((u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_PARAM);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+ hdev->adv_addr_type = cp->own_address_type;
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_write_remote_amp_assoc *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
+ hdev->name, rp->status, rp->phy_handle);
+
+ if (rp->status)
+ return;
+
+ amp_write_rem_assoc_continue(hdev, rp->phy_handle);
+}
+
+static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_read_rssi *rp = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->rssi = rp->rssi;
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_tx_power(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_cp_read_tx_power *sent;
+ struct hci_rp_read_tx_power *rp = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (!conn)
+ goto unlock;
+
+ switch (sent->type) {
+ case 0x00:
+ conn->tx_power = rp->tx_power;
+ break;
+ case 0x01:
+ conn->max_tx_power = rp->tx_power;
+ break;
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_ssp_debug_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 status = *((u8 *) skb->data);
+ u8 *mode;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ mode = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE);
+ if (mode)
+ hdev->ssp_debug_mode = *mode;
+}
+
+static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status)
+{
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status) {
+ hci_conn_check_pending(hdev);
+ return;
+ }
+
+ set_bit(HCI_INQUIRY, &hdev->flags);
+}
+
+static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_create_conn *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_CONN);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+
+ BT_DBG("%s bdaddr %pMR hcon %p", hdev->name, &cp->bdaddr, conn);
+
+ if (status) {
+ if (conn && conn->state == BT_CONNECT) {
+ if (status != 0x0c || conn->attempt > 2) {
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
+ } else
+ conn->state = BT_CONNECT2;
+ }
+ } else {
+ if (!conn) {
+ conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr,
+ HCI_ROLE_MASTER);
+ if (!conn)
+ BT_ERR("No memory for new connection");
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_add_sco *cp;
+ struct hci_conn *acl, *sco;
+ __u16 handle;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ADD_SCO);
+ if (!cp)
+ return;
+
+ handle = __le16_to_cpu(cp->handle);
+
+ BT_DBG("%s handle 0x%4.4x", hdev->name, handle);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, handle);
+ if (acl) {
+ sco = acl->link;
+ if (sco) {
+ sco->state = BT_CLOSED;
+
+ hci_connect_cfm(sco, status);
+ hci_conn_del(sco);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_auth_requested *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_AUTH_REQUESTED);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_set_conn_encrypt *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SET_CONN_ENCRYPT);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static int hci_outgoing_auth_needed(struct hci_dev *hdev,
+ struct hci_conn *conn)
+{
+ if (conn->state != BT_CONFIG || !conn->out)
+ return 0;
+
+ if (conn->pending_sec_level == BT_SECURITY_SDP)
+ return 0;
+
+ /* Only request authentication for SSP connections or non-SSP
+ * devices with sec_level MEDIUM or HIGH or if MITM protection
+ * is requested.
+ */
+ if (!hci_conn_ssp_enabled(conn) && !(conn->auth_type & 0x01) &&
+ conn->pending_sec_level != BT_SECURITY_FIPS &&
+ conn->pending_sec_level != BT_SECURITY_HIGH &&
+ conn->pending_sec_level != BT_SECURITY_MEDIUM)
+ return 0;
+
+ return 1;
+}
+
+static int hci_resolve_name(struct hci_dev *hdev,
+ struct inquiry_entry *e)
+{
+ struct hci_cp_remote_name_req cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ bacpy(&cp.bdaddr, &e->data.bdaddr);
+ cp.pscan_rep_mode = e->data.pscan_rep_mode;
+ cp.pscan_mode = e->data.pscan_mode;
+ cp.clock_offset = e->data.clock_offset;
+
+ return hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+}
+
+static bool hci_resolve_next_name(struct hci_dev *hdev)
+{
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ if (list_empty(&discov->resolve))
+ return false;
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+ if (!e)
+ return false;
+
+ if (hci_resolve_name(hdev, e) == 0) {
+ e->name_state = NAME_PENDING;
+ return true;
+ }
+
+ return false;
+}
+
+static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
+ bdaddr_t *bdaddr, u8 *name, u8 name_len)
+{
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ /* Update the mgmt connected state if necessary. Be careful with
+ * conn objects that exist but are not (yet) connected however.
+ * Only those in BT_CONFIG or BT_CONNECTED states can be
+ * considered connected.
+ */
+ if (conn &&
+ (conn->state == BT_CONFIG || conn->state == BT_CONNECTED) &&
+ !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ mgmt_device_connected(hdev, conn, 0, name, name_len);
+
+ if (discov->state == DISCOVERY_STOPPED)
+ return;
+
+ if (discov->state == DISCOVERY_STOPPING)
+ goto discov_complete;
+
+ if (discov->state != DISCOVERY_RESOLVING)
+ return;
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING);
+ /* If the device was not found in a list of found devices names of which
+ * are pending. there is no need to continue resolving a next name as it
+ * will be done upon receiving another Remote Name Request Complete
+ * Event */
+ if (!e)
+ return;
+
+ list_del(&e->list);
+ if (name) {
+ e->name_state = NAME_KNOWN;
+ mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
+ e->data.rssi, name, name_len);
+ } else {
+ e->name_state = NAME_NOT_KNOWN;
+ }
+
+ if (hci_resolve_next_name(hdev))
+ return;
+
+discov_complete:
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+}
+
+static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_remote_name_req *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ /* If successful wait for the name req complete event before
+ * checking for the need to do authentication */
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_REMOTE_NAME_REQ);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ hci_check_pending_name(hdev, conn, &cp->bdaddr, NULL, 0);
+
+ if (!conn)
+ goto unlock;
+
+ if (!hci_outgoing_auth_needed(hdev, conn))
+ goto unlock;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
+ struct hci_cp_auth_requested auth_cp;
+
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ auth_cp.handle = __cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED,
+ sizeof(auth_cp), &auth_cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_read_remote_features *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_FEATURES);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_read_remote_ext_features *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_setup_sync_conn *cp;
+ struct hci_conn *acl, *sco;
+ __u16 handle;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
+ if (!cp)
+ return;
+
+ handle = __le16_to_cpu(cp->handle);
+
+ BT_DBG("%s handle 0x%4.4x", hdev->name, handle);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, handle);
+ if (acl) {
+ sco = acl->link;
+ if (sco) {
+ sco->state = BT_CLOSED;
+
+ hci_connect_cfm(sco, status);
+ hci_conn_del(sco);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_sniff_mode *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SNIFF_MODE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags);
+
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, status);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_exit_sniff_mode *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_EXIT_SNIFF_MODE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags);
+
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, status);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_disconnect *cp;
+ struct hci_conn *conn;
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_DISCONNECT);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn)
+ mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_create_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ struct hci_conn *hcon;
+
+ hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
+ if (hcon)
+ hci_conn_del(hcon);
+ } else {
+ amp_write_remote_assoc(hdev, cp->phy_handle);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_accept_phy_link *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
+ if (!cp)
+ return;
+
+ amp_write_remote_assoc(hdev, cp->phy_handle);
+}
+
+static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_create_conn *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ /* All connection failure handling is taken care of by the
+ * hci_le_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
+ */
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr);
+ if (!conn)
+ goto unlock;
+
+ /* Store the initiator and responder address information which
+ * is needed for SMP. These values will not change during the
+ * lifetime of the connection.
+ */
+ conn->init_addr_type = cp->own_address_type;
+ if (cp->own_address_type == ADDR_LE_DEV_RANDOM)
+ bacpy(&conn->init_addr, &hdev->random_addr);
+ else
+ bacpy(&conn->init_addr, &hdev->bdaddr);
+
+ conn->resp_addr_type = cp->peer_addr_type;
+ bacpy(&conn->resp_addr, &cp->peer_addr);
+
+ /* We don't want the connection attempt to stick around
+ * indefinitely since LE doesn't have a page timeout concept
+ * like BR/EDR. Set a timer for any connection that doesn't use
+ * the white list for connecting.
+ */
+ if (cp->filter_policy == HCI_LE_USE_PEER_ADDR)
+ queue_delayed_work(conn->hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_read_remote_features *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_REMOTE_FEATURES);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ if (conn->state == BT_CONFIG) {
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_start_enc *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_START_ENC);
+ if (!cp)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (!conn)
+ goto unlock;
+
+ if (conn->state != BT_CONNECTED)
+ goto unlock;
+
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cs_switch_role(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_switch_role *cp;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (!status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_SWITCH_ROLE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+ if (conn)
+ clear_bit(HCI_CONN_RSWITCH_PEND, &conn->flags);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct discovery_state *discov = &hdev->discovery;
+ struct inquiry_entry *e;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ hci_conn_check_pending(hdev);
+
+ if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags))
+ return;
+
+ smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */
+ wake_up_bit(&hdev->flags, HCI_INQUIRY);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (discov->state != DISCOVERY_FINDING)
+ goto unlock;
+
+ if (list_empty(&discov->resolve)) {
+ /* When BR/EDR inquiry is active and no LE scanning is in
+ * progress, then change discovery state to indicate completion.
+ *
+ * When running LE scanning and BR/EDR inquiry simultaneously
+ * and the LE scan already finished, then change the discovery
+ * state to indicate completion.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ goto unlock;
+ }
+
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+ if (e && hci_resolve_name(hdev, e) == 0) {
+ e->name_state = NAME_PENDING;
+ hci_discovery_set_state(hdev, DISCOVERY_RESOLVING);
+ } else {
+ /* When BR/EDR inquiry is active and no LE scanning is in
+ * progress, then change discovery state to indicate completion.
+ *
+ * When running LE scanning and BR/EDR inquiry simultaneously
+ * and the LE scan already finished, then change the discovery
+ * state to indicate completion.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+ !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct inquiry_data data;
+ struct inquiry_info *info = (void *) (skb->data + 1);
+ int num_rsp = *((__u8 *) skb->data);
+
+ BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+ if (!num_rsp)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ for (; num_rsp; num_rsp--, info++) {
+ u32 flags;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = HCI_RSSI_INVALID;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, HCI_RSSI_INVALID,
+ flags, NULL, 0, NULL, 0);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_conn_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+ if (!conn) {
+ if (ev->link_type != SCO_LINK)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ conn->type = SCO_LINK;
+ }
+
+ if (!ev->status) {
+ conn->handle = __le16_to_cpu(ev->handle);
+
+ if (conn->type == ACL_LINK) {
+ conn->state = BT_CONFIG;
+ hci_conn_hold(conn);
+
+ if (!conn->out && !hci_conn_ssp_enabled(conn) &&
+ !hci_find_link_key(hdev, &ev->bdaddr))
+ conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+ else
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ } else
+ conn->state = BT_CONNECTED;
+
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+
+ if (test_bit(HCI_AUTH, &hdev->flags))
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+
+ if (test_bit(HCI_ENCRYPT, &hdev->flags))
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+
+ /* Get remote features */
+ if (conn->type == ACL_LINK) {
+ struct hci_cp_read_remote_features cp;
+ cp.handle = ev->handle;
+ hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
+ sizeof(cp), &cp);
+
+ hci_update_page_scan(hdev);
+ }
+
+ /* Set packet type for incoming connection */
+ if (!conn->out && hdev->hci_ver < BLUETOOTH_VER_2_0) {
+ struct hci_cp_change_conn_ptype cp;
+ cp.handle = ev->handle;
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+ hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp),
+ &cp);
+ }
+ } else {
+ conn->state = BT_CLOSED;
+ if (conn->type == ACL_LINK)
+ mgmt_connect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, ev->status);
+ }
+
+ if (conn->type == ACL_LINK)
+ hci_sco_setup(conn, ev->status);
+
+ if (ev->status) {
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+ } else if (ev->link_type != ACL_LINK)
+ hci_connect_cfm(conn, ev->status);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ hci_conn_check_pending(hdev);
+}
+
+static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ struct hci_cp_reject_conn_req cp;
+
+ bacpy(&cp.bdaddr, bdaddr);
+ cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+ hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp);
+}
+
+static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_conn_request *ev = (void *) skb->data;
+ int mask = hdev->link_mode;
+ struct inquiry_entry *ie;
+ struct hci_conn *conn;
+ __u8 flags = 0;
+
+ BT_DBG("%s bdaddr %pMR type 0x%x", hdev->name, &ev->bdaddr,
+ ev->link_type);
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type,
+ &flags);
+
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ return;
+ }
+
+ if (hci_bdaddr_list_lookup(&hdev->blacklist, &ev->bdaddr,
+ BDADDR_BREDR)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ return;
+ }
+
+ /* Require HCI_CONNECTABLE or a whitelist entry to accept the
+ * connection. These features are only touched through mgmt so
+ * only do the checks if HCI_MGMT is set.
+ */
+ if (hci_dev_test_flag(hdev, HCI_MGMT) &&
+ !hci_dev_test_flag(hdev, HCI_CONNECTABLE) &&
+ !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr,
+ BDADDR_BREDR)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ return;
+ }
+
+ /* Connection accepted */
+
+ hci_dev_lock(hdev);
+
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie)
+ memcpy(ie->data.dev_class, ev->dev_class, 3);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type,
+ &ev->bdaddr);
+ if (!conn) {
+ conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
+ HCI_ROLE_SLAVE);
+ if (!conn) {
+ BT_ERR("No memory for new connection");
+ hci_dev_unlock(hdev);
+ return;
+ }
+ }
+
+ memcpy(conn->dev_class, ev->dev_class, 3);
+
+ hci_dev_unlock(hdev);
+
+ if (ev->link_type == ACL_LINK ||
+ (!(flags & HCI_PROTO_DEFER) && !lmp_esco_capable(hdev))) {
+ struct hci_cp_accept_conn_req cp;
+ conn->state = BT_CONNECT;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+
+ if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
+ cp.role = 0x00; /* Become master */
+ else
+ cp.role = 0x01; /* Remain slave */
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+ } else if (!(flags & HCI_PROTO_DEFER)) {
+ struct hci_cp_accept_sync_conn_req cp;
+ conn->state = BT_CONNECT;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.content_format = cpu_to_le16(hdev->voice_setting);
+ cp.retrans_effort = 0xff;
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, sizeof(cp),
+ &cp);
+ } else {
+ conn->state = BT_CONNECT2;
+ hci_connect_cfm(conn, 0);
+ }
+}
+
+static u8 hci_to_mgmt_reason(u8 err)
+{
+ switch (err) {
+ case HCI_ERROR_CONNECTION_TIMEOUT:
+ return MGMT_DEV_DISCONN_TIMEOUT;
+ case HCI_ERROR_REMOTE_USER_TERM:
+ case HCI_ERROR_REMOTE_LOW_RESOURCES:
+ case HCI_ERROR_REMOTE_POWER_OFF:
+ return MGMT_DEV_DISCONN_REMOTE;
+ case HCI_ERROR_LOCAL_HOST_TERM:
+ return MGMT_DEV_DISCONN_LOCAL_HOST;
+ default:
+ return MGMT_DEV_DISCONN_UNKNOWN;
+ }
+}
+
+static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_disconn_complete *ev = (void *) skb->data;
+ u8 reason = hci_to_mgmt_reason(ev->reason);
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ bool mgmt_connected;
+ u8 type;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (ev->status) {
+ mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, ev->status);
+ goto unlock;
+ }
+
+ conn->state = BT_CLOSED;
+
+ mgmt_connected = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
+ reason, mgmt_connected);
+
+ if (conn->type == ACL_LINK) {
+ if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
+
+ hci_update_page_scan(hdev);
+ }
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ /* Fall through */
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ list_del_init(&params->action);
+ list_add(&params->action, &hdev->pend_le_conns);
+ hci_update_background_scan(hdev);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ type = conn->type;
+
+ hci_disconn_cfm(conn, ev->reason);
+ hci_conn_del(conn);
+
+ /* Re-enable advertising if necessary, since it might
+ * have been disabled by the connection. From the
+ * HCI_LE_Set_Advertise_Enable command description in
+ * the core specification (v4.0):
+ * "The Controller shall continue advertising until the Host
+ * issues an LE_Set_Advertise_Enable command with
+ * Advertising_Enable set to 0x00 (Advertising is disabled)
+ * or until a connection is created or until the Advertising
+ * is timed out due to Directed Advertising."
+ */
+ if (type == LE_LINK)
+ mgmt_reenable_advertising(hdev);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_auth_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status) {
+ if (!hci_conn_ssp_enabled(conn) &&
+ test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) {
+ BT_INFO("re-auth of legacy device is not possible.");
+ } else {
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+ conn->sec_level = conn->pending_sec_level;
+ }
+ } else {
+ mgmt_auth_failed(conn, ev->status);
+ }
+
+ clear_bit(HCI_CONN_AUTH_PEND, &conn->flags);
+ clear_bit(HCI_CONN_REAUTH_PEND, &conn->flags);
+
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status && hci_conn_ssp_enabled(conn)) {
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = ev->handle;
+ cp.encrypt = 0x01;
+ hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ } else {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+ } else {
+ hci_auth_cfm(conn, ev->status);
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+ if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) {
+ if (!ev->status) {
+ struct hci_cp_set_conn_encrypt cp;
+ cp.handle = ev->handle;
+ cp.encrypt = 0x01;
+ hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+ &cp);
+ } else {
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+ hci_encrypt_cfm(conn, ev->status, 0x00);
+ }
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_remote_name *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_conn_check_pending(hdev);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto check_auth;
+
+ if (ev->status == 0)
+ hci_check_pending_name(hdev, conn, &ev->bdaddr, ev->name,
+ strnlen(ev->name, HCI_MAX_NAME_LENGTH));
+ else
+ hci_check_pending_name(hdev, conn, &ev->bdaddr, NULL, 0);
+
+check_auth:
+ if (!conn)
+ goto unlock;
+
+ if (!hci_outgoing_auth_needed(hdev, conn))
+ goto unlock;
+
+ if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) {
+ struct hci_cp_auth_requested cp;
+
+ set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags);
+
+ cp.handle = __cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_encrypt_change *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status) {
+ if (ev->encrypt) {
+ /* Encryption implies authentication */
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ conn->sec_level = conn->pending_sec_level;
+
+ /* P-256 authentication key implies FIPS */
+ if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256)
+ set_bit(HCI_CONN_FIPS, &conn->flags);
+
+ if ((conn->type == ACL_LINK && ev->encrypt == 0x02) ||
+ conn->type == LE_LINK)
+ set_bit(HCI_CONN_AES_CCM, &conn->flags);
+ } else {
+ clear_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ clear_bit(HCI_CONN_AES_CCM, &conn->flags);
+ }
+ }
+
+ /* We should disregard the current RPA and generate a new one
+ * whenever the encryption procedure fails.
+ */
+ if (ev->status && conn->type == LE_LINK)
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+
+ if (ev->status && conn->state == BT_CONNECTED) {
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status)
+ conn->state = BT_CONNECTED;
+
+ /* In Secure Connections Only mode, do not allow any
+ * connections that are not encrypted with AES-CCM
+ * using a P-256 authenticated combination key.
+ */
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) &&
+ (!test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
+ conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) {
+ hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ } else
+ hci_encrypt_cfm(conn, ev->status, ev->encrypt);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_change_link_key_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_change_link_key_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ if (!ev->status)
+ set_bit(HCI_CONN_SECURE, &conn->flags);
+
+ clear_bit(HCI_CONN_AUTH_PEND, &conn->flags);
+
+ hci_key_change_cfm(conn, ev->status);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_features_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_features *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (!ev->status)
+ memcpy(conn->features[0], ev->features, 8);
+
+ if (conn->state != BT_CONFIG)
+ goto unlock;
+
+ if (!ev->status && lmp_ext_feat_capable(hdev) &&
+ lmp_ext_feat_capable(conn)) {
+ struct hci_cp_read_remote_ext_features cp;
+ cp.handle = ev->handle;
+ cp.page = 0x01;
+ hci_send_cmd(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES,
+ sizeof(cp), &cp);
+ goto unlock;
+ }
+
+ if (!ev->status && !test_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ struct hci_cp_remote_name_req cp;
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+ hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+ } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ mgmt_device_connected(hdev, conn, 0, NULL, 0);
+
+ if (!hci_outgoing_auth_needed(hdev, conn)) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
+ u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_complete *ev = (void *) skb->data;
+
+ *opcode = __le16_to_cpu(ev->opcode);
+ *status = skb->data[sizeof(*ev)];
+
+ skb_pull(skb, sizeof(*ev));
+
+ switch (*opcode) {
+ case HCI_OP_INQUIRY_CANCEL:
+ hci_cc_inquiry_cancel(hdev, skb);
+ break;
+
+ case HCI_OP_PERIODIC_INQ:
+ hci_cc_periodic_inq(hdev, skb);
+ break;
+
+ case HCI_OP_EXIT_PERIODIC_INQ:
+ hci_cc_exit_periodic_inq(hdev, skb);
+ break;
+
+ case HCI_OP_REMOTE_NAME_REQ_CANCEL:
+ hci_cc_remote_name_req_cancel(hdev, skb);
+ break;
+
+ case HCI_OP_ROLE_DISCOVERY:
+ hci_cc_role_discovery(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LINK_POLICY:
+ hci_cc_read_link_policy(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_LINK_POLICY:
+ hci_cc_write_link_policy(hdev, skb);
+ break;
+
+ case HCI_OP_READ_DEF_LINK_POLICY:
+ hci_cc_read_def_link_policy(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_DEF_LINK_POLICY:
+ hci_cc_write_def_link_policy(hdev, skb);
+ break;
+
+ case HCI_OP_RESET:
+ hci_cc_reset(hdev, skb);
+ break;
+
+ case HCI_OP_READ_STORED_LINK_KEY:
+ hci_cc_read_stored_link_key(hdev, skb);
+ break;
+
+ case HCI_OP_DELETE_STORED_LINK_KEY:
+ hci_cc_delete_stored_link_key(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_LOCAL_NAME:
+ hci_cc_write_local_name(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_NAME:
+ hci_cc_read_local_name(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_AUTH_ENABLE:
+ hci_cc_write_auth_enable(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_ENCRYPT_MODE:
+ hci_cc_write_encrypt_mode(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_SCAN_ENABLE:
+ hci_cc_write_scan_enable(hdev, skb);
+ break;
+
+ case HCI_OP_READ_CLASS_OF_DEV:
+ hci_cc_read_class_of_dev(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_CLASS_OF_DEV:
+ hci_cc_write_class_of_dev(hdev, skb);
+ break;
+
+ case HCI_OP_READ_VOICE_SETTING:
+ hci_cc_read_voice_setting(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_VOICE_SETTING:
+ hci_cc_write_voice_setting(hdev, skb);
+ break;
+
+ case HCI_OP_READ_NUM_SUPPORTED_IAC:
+ hci_cc_read_num_supported_iac(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_SSP_MODE:
+ hci_cc_write_ssp_mode(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_SC_SUPPORT:
+ hci_cc_write_sc_support(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_VERSION:
+ hci_cc_read_local_version(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_COMMANDS:
+ hci_cc_read_local_commands(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_FEATURES:
+ hci_cc_read_local_features(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_EXT_FEATURES:
+ hci_cc_read_local_ext_features(hdev, skb);
+ break;
+
+ case HCI_OP_READ_BUFFER_SIZE:
+ hci_cc_read_buffer_size(hdev, skb);
+ break;
+
+ case HCI_OP_READ_BD_ADDR:
+ hci_cc_read_bd_addr(hdev, skb);
+ break;
+
+ case HCI_OP_READ_PAGE_SCAN_ACTIVITY:
+ hci_cc_read_page_scan_activity(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_PAGE_SCAN_ACTIVITY:
+ hci_cc_write_page_scan_activity(hdev, skb);
+ break;
+
+ case HCI_OP_READ_PAGE_SCAN_TYPE:
+ hci_cc_read_page_scan_type(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_PAGE_SCAN_TYPE:
+ hci_cc_write_page_scan_type(hdev, skb);
+ break;
+
+ case HCI_OP_READ_DATA_BLOCK_SIZE:
+ hci_cc_read_data_block_size(hdev, skb);
+ break;
+
+ case HCI_OP_READ_FLOW_CONTROL_MODE:
+ hci_cc_read_flow_control_mode(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_AMP_INFO:
+ hci_cc_read_local_amp_info(hdev, skb);
+ break;
+
+ case HCI_OP_READ_CLOCK:
+ hci_cc_read_clock(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_AMP_ASSOC:
+ hci_cc_read_local_amp_assoc(hdev, skb);
+ break;
+
+ case HCI_OP_READ_INQ_RSP_TX_POWER:
+ hci_cc_read_inq_rsp_tx_power(hdev, skb);
+ break;
+
+ case HCI_OP_PIN_CODE_REPLY:
+ hci_cc_pin_code_reply(hdev, skb);
+ break;
+
+ case HCI_OP_PIN_CODE_NEG_REPLY:
+ hci_cc_pin_code_neg_reply(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_OOB_DATA:
+ hci_cc_read_local_oob_data(hdev, skb);
+ break;
+
+ case HCI_OP_READ_LOCAL_OOB_EXT_DATA:
+ hci_cc_read_local_oob_ext_data(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_BUFFER_SIZE:
+ hci_cc_le_read_buffer_size(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_LOCAL_FEATURES:
+ hci_cc_le_read_local_features(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_ADV_TX_POWER:
+ hci_cc_le_read_adv_tx_power(hdev, skb);
+ break;
+
+ case HCI_OP_USER_CONFIRM_REPLY:
+ hci_cc_user_confirm_reply(hdev, skb);
+ break;
+
+ case HCI_OP_USER_CONFIRM_NEG_REPLY:
+ hci_cc_user_confirm_neg_reply(hdev, skb);
+ break;
+
+ case HCI_OP_USER_PASSKEY_REPLY:
+ hci_cc_user_passkey_reply(hdev, skb);
+ break;
+
+ case HCI_OP_USER_PASSKEY_NEG_REPLY:
+ hci_cc_user_passkey_neg_reply(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_RANDOM_ADDR:
+ hci_cc_le_set_random_addr(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_ADV_ENABLE:
+ hci_cc_le_set_adv_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_SCAN_PARAM:
+ hci_cc_le_set_scan_param(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_SCAN_ENABLE:
+ hci_cc_le_set_scan_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_WHITE_LIST_SIZE:
+ hci_cc_le_read_white_list_size(hdev, skb);
+ break;
+
+ case HCI_OP_LE_CLEAR_WHITE_LIST:
+ hci_cc_le_clear_white_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_ADD_TO_WHITE_LIST:
+ hci_cc_le_add_to_white_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_DEL_FROM_WHITE_LIST:
+ hci_cc_le_del_from_white_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_SUPPORTED_STATES:
+ hci_cc_le_read_supported_states(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_DEF_DATA_LEN:
+ hci_cc_le_read_def_data_len(hdev, skb);
+ break;
+
+ case HCI_OP_LE_WRITE_DEF_DATA_LEN:
+ hci_cc_le_write_def_data_len(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_MAX_DATA_LEN:
+ hci_cc_le_read_max_data_len(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_LE_HOST_SUPPORTED:
+ hci_cc_write_le_host_supported(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_ADV_PARAM:
+ hci_cc_set_adv_param(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_REMOTE_AMP_ASSOC:
+ hci_cc_write_remote_amp_assoc(hdev, skb);
+ break;
+
+ case HCI_OP_READ_RSSI:
+ hci_cc_read_rssi(hdev, skb);
+ break;
+
+ case HCI_OP_READ_TX_POWER:
+ hci_cc_read_tx_power(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_SSP_DEBUG_MODE:
+ hci_cc_write_ssp_debug_mode(hdev, skb);
+ break;
+
+ default:
+ BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
+ break;
+ }
+
+ if (*opcode != HCI_OP_NOP)
+ cancel_delayed_work(&hdev->cmd_timer);
+
+ if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
+ atomic_set(&hdev->cmd_cnt, 1);
+
+ hci_req_cmd_complete(hdev, *opcode, *status, req_complete,
+ req_complete_skb);
+
+ if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
+ u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_status *ev = (void *) skb->data;
+
+ skb_pull(skb, sizeof(*ev));
+
+ *opcode = __le16_to_cpu(ev->opcode);
+ *status = ev->status;
+
+ switch (*opcode) {
+ case HCI_OP_INQUIRY:
+ hci_cs_inquiry(hdev, ev->status);
+ break;
+
+ case HCI_OP_CREATE_CONN:
+ hci_cs_create_conn(hdev, ev->status);
+ break;
+
+ case HCI_OP_DISCONNECT:
+ hci_cs_disconnect(hdev, ev->status);
+ break;
+
+ case HCI_OP_ADD_SCO:
+ hci_cs_add_sco(hdev, ev->status);
+ break;
+
+ case HCI_OP_AUTH_REQUESTED:
+ hci_cs_auth_requested(hdev, ev->status);
+ break;
+
+ case HCI_OP_SET_CONN_ENCRYPT:
+ hci_cs_set_conn_encrypt(hdev, ev->status);
+ break;
+
+ case HCI_OP_REMOTE_NAME_REQ:
+ hci_cs_remote_name_req(hdev, ev->status);
+ break;
+
+ case HCI_OP_READ_REMOTE_FEATURES:
+ hci_cs_read_remote_features(hdev, ev->status);
+ break;
+
+ case HCI_OP_READ_REMOTE_EXT_FEATURES:
+ hci_cs_read_remote_ext_features(hdev, ev->status);
+ break;
+
+ case HCI_OP_SETUP_SYNC_CONN:
+ hci_cs_setup_sync_conn(hdev, ev->status);
+ break;
+
+ case HCI_OP_CREATE_PHY_LINK:
+ hci_cs_create_phylink(hdev, ev->status);
+ break;
+
+ case HCI_OP_ACCEPT_PHY_LINK:
+ hci_cs_accept_phylink(hdev, ev->status);
+ break;
+
+ case HCI_OP_SNIFF_MODE:
+ hci_cs_sniff_mode(hdev, ev->status);
+ break;
+
+ case HCI_OP_EXIT_SNIFF_MODE:
+ hci_cs_exit_sniff_mode(hdev, ev->status);
+ break;
+
+ case HCI_OP_SWITCH_ROLE:
+ hci_cs_switch_role(hdev, ev->status);
+ break;
+
+ case HCI_OP_LE_CREATE_CONN:
+ hci_cs_le_create_conn(hdev, ev->status);
+ break;
+
+ case HCI_OP_LE_READ_REMOTE_FEATURES:
+ hci_cs_le_read_remote_features(hdev, ev->status);
+ break;
+
+ case HCI_OP_LE_START_ENC:
+ hci_cs_le_start_enc(hdev, ev->status);
+ break;
+
+ default:
+ BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
+ break;
+ }
+
+ if (*opcode != HCI_OP_NOP)
+ cancel_delayed_work(&hdev->cmd_timer);
+
+ if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
+ atomic_set(&hdev->cmd_cnt, 1);
+
+ /* Indicate request completion if the command failed. Also, if
+ * we're not waiting for a special event and we get a success
+ * command status we should try to flag the request as completed
+ * (since for this kind of commands there will not be a command
+ * complete event).
+ */
+ if (ev->status ||
+ (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->req.event))
+ hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete,
+ req_complete_skb);
+
+ if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+}
+
+static void hci_hardware_error_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_hardware_error *ev = (void *) skb->data;
+
+ hdev->hw_error_code = ev->code;
+
+ queue_work(hdev->req_workqueue, &hdev->error_reset);
+}
+
+static void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_role_change *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (conn) {
+ if (!ev->status)
+ conn->role = ev->role;
+
+ clear_bit(HCI_CONN_RSWITCH_PEND, &conn->flags);
+
+ hci_role_switch_cfm(conn, ev->status, ev->role);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_num_comp_pkts *ev = (void *) skb->data;
+ int i;
+
+ if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) {
+ BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode);
+ return;
+ }
+
+ if (skb->len < sizeof(*ev) || skb->len < sizeof(*ev) +
+ ev->num_hndl * sizeof(struct hci_comp_pkts_info)) {
+ BT_DBG("%s bad parameters", hdev->name);
+ return;
+ }
+
+ BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl);
+
+ for (i = 0; i < ev->num_hndl; i++) {
+ struct hci_comp_pkts_info *info = &ev->handles[i];
+ struct hci_conn *conn;
+ __u16 handle, count;
+
+ handle = __le16_to_cpu(info->handle);
+ count = __le16_to_cpu(info->count);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn)
+ continue;
+
+ conn->sent -= count;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ hdev->acl_cnt += count;
+ if (hdev->acl_cnt > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ break;
+
+ case LE_LINK:
+ if (hdev->le_pkts) {
+ hdev->le_cnt += count;
+ if (hdev->le_cnt > hdev->le_pkts)
+ hdev->le_cnt = hdev->le_pkts;
+ } else {
+ hdev->acl_cnt += count;
+ if (hdev->acl_cnt > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ }
+ break;
+
+ case SCO_LINK:
+ hdev->sco_cnt += count;
+ if (hdev->sco_cnt > hdev->sco_pkts)
+ hdev->sco_cnt = hdev->sco_pkts;
+ break;
+
+ default:
+ BT_ERR("Unknown type %d conn %p", conn->type, conn);
+ break;
+ }
+ }
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev,
+ __u16 handle)
+{
+ struct hci_chan *chan;
+
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ return hci_conn_hash_lookup_handle(hdev, handle);
+ case HCI_AMP:
+ chan = hci_chan_lookup_handle(hdev, handle);
+ if (chan)
+ return chan->conn;
+ break;
+ default:
+ BT_ERR("%s unknown dev_type %d", hdev->name, hdev->dev_type);
+ break;
+ }
+
+ return NULL;
+}
+
+static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_num_comp_blocks *ev = (void *) skb->data;
+ int i;
+
+ if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) {
+ BT_ERR("Wrong event for mode %d", hdev->flow_ctl_mode);
+ return;
+ }
+
+ if (skb->len < sizeof(*ev) || skb->len < sizeof(*ev) +
+ ev->num_hndl * sizeof(struct hci_comp_blocks_info)) {
+ BT_DBG("%s bad parameters", hdev->name);
+ return;
+ }
+
+ BT_DBG("%s num_blocks %d num_hndl %d", hdev->name, ev->num_blocks,
+ ev->num_hndl);
+
+ for (i = 0; i < ev->num_hndl; i++) {
+ struct hci_comp_blocks_info *info = &ev->handles[i];
+ struct hci_conn *conn = NULL;
+ __u16 handle, block_count;
+
+ handle = __le16_to_cpu(info->handle);
+ block_count = __le16_to_cpu(info->blocks);
+
+ conn = __hci_conn_lookup_handle(hdev, handle);
+ if (!conn)
+ continue;
+
+ conn->sent -= block_count;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ case AMP_LINK:
+ hdev->block_cnt += block_count;
+ if (hdev->block_cnt > hdev->num_blocks)
+ hdev->block_cnt = hdev->num_blocks;
+ break;
+
+ default:
+ BT_ERR("Unknown type %d conn %p", conn->type, conn);
+ break;
+ }
+ }
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+static void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_mode_change *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ conn->mode = ev->mode;
+
+ if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND,
+ &conn->flags)) {
+ if (conn->mode == HCI_CM_ACTIVE)
+ set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ else
+ clear_bit(HCI_CONN_POWER_SAVE, &conn->flags);
+ }
+
+ if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->flags))
+ hci_sco_setup(conn, ev->status);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_pin_code_req *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ if (conn->state == BT_CONNECTED) {
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ !test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags)) {
+ hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ } else if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ u8 secure;
+
+ if (conn->pending_sec_level == BT_SECURITY_HIGH)
+ secure = 1;
+ else
+ secure = 0;
+
+ mgmt_pin_code_request(hdev, &ev->bdaddr, secure);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void conn_set_key(struct hci_conn *conn, u8 key_type, u8 pin_len)
+{
+ if (key_type == HCI_LK_CHANGED_COMBINATION)
+ return;
+
+ conn->pin_length = pin_len;
+ conn->key_type = key_type;
+
+ switch (key_type) {
+ case HCI_LK_LOCAL_UNIT:
+ case HCI_LK_REMOTE_UNIT:
+ case HCI_LK_DEBUG_COMBINATION:
+ return;
+ case HCI_LK_COMBINATION:
+ if (pin_len == 16)
+ conn->pending_sec_level = BT_SECURITY_HIGH;
+ else
+ conn->pending_sec_level = BT_SECURITY_MEDIUM;
+ break;
+ case HCI_LK_UNAUTH_COMBINATION_P192:
+ case HCI_LK_UNAUTH_COMBINATION_P256:
+ conn->pending_sec_level = BT_SECURITY_MEDIUM;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P192:
+ conn->pending_sec_level = BT_SECURITY_HIGH;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P256:
+ conn->pending_sec_level = BT_SECURITY_FIPS;
+ break;
+ }
+}
+
+static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_link_key_req *ev = (void *) skb->data;
+ struct hci_cp_link_key_reply cp;
+ struct hci_conn *conn;
+ struct link_key *key;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+
+ hci_dev_lock(hdev);
+
+ key = hci_find_link_key(hdev, &ev->bdaddr);
+ if (!key) {
+ BT_DBG("%s link key not found for %pMR", hdev->name,
+ &ev->bdaddr);
+ goto not_found;
+ }
+
+ BT_DBG("%s found key type %u for %pMR", hdev->name, key->type,
+ &ev->bdaddr);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (conn) {
+ clear_bit(HCI_CONN_NEW_LINK_KEY, &conn->flags);
+
+ if ((key->type == HCI_LK_UNAUTH_COMBINATION_P192 ||
+ key->type == HCI_LK_UNAUTH_COMBINATION_P256) &&
+ conn->auth_type != 0xff && (conn->auth_type & 0x01)) {
+ BT_DBG("%s ignoring unauthenticated key", hdev->name);
+ goto not_found;
+ }
+
+ if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 &&
+ (conn->pending_sec_level == BT_SECURITY_HIGH ||
+ conn->pending_sec_level == BT_SECURITY_FIPS)) {
+ BT_DBG("%s ignoring key unauthenticated for high security",
+ hdev->name);
+ goto not_found;
+ }
+
+ conn_set_key(conn, key->type, key->pin_len);
+ }
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ memcpy(cp.link_key, key->val, HCI_LINK_KEY_SIZE);
+
+ hci_send_cmd(hdev, HCI_OP_LINK_KEY_REPLY, sizeof(cp), &cp);
+
+ hci_dev_unlock(hdev);
+
+ return;
+
+not_found:
+ hci_send_cmd(hdev, HCI_OP_LINK_KEY_NEG_REPLY, 6, &ev->bdaddr);
+ hci_dev_unlock(hdev);
+}
+
+static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_link_key_notify *ev = (void *) skb->data;
+ struct hci_conn *conn;
+ struct link_key *key;
+ bool persistent;
+ u8 pin_len = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+
+ set_bit(HCI_CONN_NEW_LINK_KEY, &conn->flags);
+ conn_set_key(conn, ev->key_type, conn->pin_length);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ key = hci_add_link_key(hdev, conn, &ev->bdaddr, ev->link_key,
+ ev->key_type, pin_len, &persistent);
+ if (!key)
+ goto unlock;
+
+ /* Update connection information since adding the key will have
+ * fixed up the type in the case of changed combination keys.
+ */
+ if (ev->key_type == HCI_LK_CHANGED_COMBINATION)
+ conn_set_key(conn, key->type, key->pin_len);
+
+ mgmt_new_link_key(hdev, key, persistent);
+
+ /* Keep debug keys around only if the HCI_KEEP_DEBUG_KEYS flag
+ * is set. If it's not set simply remove the key from the kernel
+ * list (we've still notified user space about it but with
+ * store_hint being 0).
+ */
+ if (key->type == HCI_LK_DEBUG_COMBINATION &&
+ !hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS)) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ goto unlock;
+ }
+
+ if (persistent)
+ clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags);
+ else
+ set_bit(HCI_CONN_FLUSH_KEY, &conn->flags);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_clock_offset *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn && !ev->status) {
+ struct inquiry_entry *ie;
+
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie) {
+ ie->data.clock_offset = ev->clock_offset;
+ ie->timestamp = jiffies;
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_pkt_type_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_pkt_type_change *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn && !ev->status)
+ conn->pkt_type = __le16_to_cpu(ev->pkt_type);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_pscan_rep_mode *ev = (void *) skb->data;
+ struct inquiry_entry *ie;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie) {
+ ie->data.pscan_rep_mode = ev->pscan_rep_mode;
+ ie->timestamp = jiffies;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct inquiry_data data;
+ int num_rsp = *((__u8 *) skb->data);
+
+ BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+ if (!num_rsp)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
+ struct inquiry_info_with_rssi_and_pscan_mode *info;
+ info = (void *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--, info++) {
+ u32 flags;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, NULL, 0, NULL, 0);
+ }
+ } else {
+ struct inquiry_info_with_rssi *info = (void *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--, info++) {
+ u32 flags;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x00;
+
+ flags = hci_inquiry_cache_update(hdev, &data, false);
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, NULL, 0, NULL, 0);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_ext_features_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_ext_features *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ if (ev->page < HCI_MAX_PAGES)
+ memcpy(conn->features[ev->page], ev->features, 8);
+
+ if (!ev->status && ev->page == 0x01) {
+ struct inquiry_entry *ie;
+
+ ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+ if (ie)
+ ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP);
+
+ if (ev->features[0] & LMP_HOST_SSP) {
+ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+ } else {
+ /* It is mandatory by the Bluetooth specification that
+ * Extended Inquiry Results are only used when Secure
+ * Simple Pairing is enabled, but some devices violate
+ * this.
+ *
+ * To make these devices work, the internal SSP
+ * enabled flag needs to be cleared if the remote host
+ * features do not indicate SSP support */
+ clear_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+ }
+
+ if (ev->features[0] & LMP_HOST_SC)
+ set_bit(HCI_CONN_SC_ENABLED, &conn->flags);
+ }
+
+ if (conn->state != BT_CONFIG)
+ goto unlock;
+
+ if (!ev->status && !test_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ struct hci_cp_remote_name_req cp;
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pscan_rep_mode = 0x02;
+ hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+ } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ mgmt_device_connected(hdev, conn, 0, NULL, 0);
+
+ if (!hci_outgoing_auth_needed(hdev, conn)) {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_sync_conn_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+ if (!conn) {
+ if (ev->link_type == ESCO_LINK)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ conn->type = SCO_LINK;
+ }
+
+ switch (ev->status) {
+ case 0x00:
+ conn->handle = __le16_to_cpu(ev->handle);
+ conn->state = BT_CONNECTED;
+
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+ break;
+
+ case 0x10: /* Connection Accept Timeout */
+ case 0x0d: /* Connection Rejected due to Limited Resources */
+ case 0x11: /* Unsupported Feature or Parameter Value */
+ case 0x1c: /* SCO interval rejected */
+ case 0x1a: /* Unsupported Remote Feature */
+ case 0x1f: /* Unspecified error */
+ case 0x20: /* Unsupported LMP Parameter value */
+ if (conn->out) {
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ if (hci_setup_sync(conn, conn->link->handle))
+ goto unlock;
+ }
+ /* fall through */
+
+ default:
+ conn->state = BT_CLOSED;
+ break;
+ }
+
+ hci_connect_cfm(conn, ev->status);
+ if (ev->status)
+ hci_conn_del(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static inline size_t eir_get_length(u8 *eir, size_t eir_len)
+{
+ size_t parsed = 0;
+
+ while (parsed < eir_len) {
+ u8 field_len = eir[0];
+
+ if (field_len == 0)
+ return parsed;
+
+ parsed += field_len + 1;
+ eir += field_len + 1;
+ }
+
+ return eir_len;
+}
+
+static void hci_extended_inquiry_result_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct inquiry_data data;
+ struct extended_inquiry_info *info = (void *) (skb->data + 1);
+ int num_rsp = *((__u8 *) skb->data);
+ size_t eir_len;
+
+ BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+ if (!num_rsp)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
+ return;
+
+ hci_dev_lock(hdev);
+
+ for (; num_rsp; num_rsp--, info++) {
+ u32 flags;
+ bool name_known;
+
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ data.ssp_mode = 0x01;
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ name_known = eir_has_data_type(info->data,
+ sizeof(info->data),
+ EIR_NAME_COMPLETE);
+ else
+ name_known = true;
+
+ flags = hci_inquiry_cache_update(hdev, &data, name_known);
+
+ eir_len = eir_get_length(info->data, sizeof(info->data));
+
+ mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
+ info->dev_class, info->rssi,
+ flags, info->data, eir_len, NULL, 0);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_key_refresh_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_key_refresh_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x handle 0x%4.4x", hdev->name, ev->status,
+ __le16_to_cpu(ev->handle));
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ /* For BR/EDR the necessary steps are taken through the
+ * auth_complete event.
+ */
+ if (conn->type != LE_LINK)
+ goto unlock;
+
+ if (!ev->status)
+ conn->sec_level = conn->pending_sec_level;
+
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+
+ if (ev->status && conn->state == BT_CONNECTED) {
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ if (conn->state == BT_CONFIG) {
+ if (!ev->status)
+ conn->state = BT_CONNECTED;
+
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_drop(conn);
+ } else {
+ hci_auth_cfm(conn, ev->status);
+
+ hci_conn_hold(conn);
+ conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(conn);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static u8 hci_get_auth_req(struct hci_conn *conn)
+{
+ /* If remote requests no-bonding follow that lead */
+ if (conn->remote_auth == HCI_AT_NO_BONDING ||
+ conn->remote_auth == HCI_AT_NO_BONDING_MITM)
+ return conn->remote_auth | (conn->auth_type & 0x01);
+
+ /* If both remote and local have enough IO capabilities, require
+ * MITM protection
+ */
+ if (conn->remote_cap != HCI_IO_NO_INPUT_OUTPUT &&
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT)
+ return conn->remote_auth | 0x01;
+
+ /* No MITM protection possible so ignore remote requirement */
+ return (conn->remote_auth & ~0x01) | (conn->auth_type & 0x01);
+}
+
+static u8 bredr_oob_data_present(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct oob_data *data;
+
+ data = hci_find_remote_oob_data(hdev, &conn->dst, BDADDR_BREDR);
+ if (!data)
+ return 0x00;
+
+ if (bredr_sc_enabled(hdev)) {
+ /* When Secure Connections is enabled, then just
+ * return the present value stored with the OOB
+ * data. The stored value contains the right present
+ * information. However it can only be trusted when
+ * not in Secure Connection Only mode.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SC_ONLY))
+ return data->present;
+
+ /* When Secure Connections Only mode is enabled, then
+ * the P-256 values are required. If they are not
+ * available, then do not declare that OOB data is
+ * present.
+ */
+ if (!memcmp(data->rand256, ZERO_KEY, 16) ||
+ !memcmp(data->hash256, ZERO_KEY, 16))
+ return 0x00;
+
+ return 0x02;
+ }
+
+ /* When Secure Connections is not enabled or actually
+ * not supported by the hardware, then check that if
+ * P-192 data values are present.
+ */
+ if (!memcmp(data->rand192, ZERO_KEY, 16) ||
+ !memcmp(data->hash192, ZERO_KEY, 16))
+ return 0x00;
+
+ return 0x01;
+}
+
+static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_io_capa_request *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ hci_conn_hold(conn);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ /* Allow pairing if we're pairable, the initiators of the
+ * pairing or if the remote is not requesting bonding.
+ */
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE) ||
+ test_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags) ||
+ (conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) {
+ struct hci_cp_io_capability_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ /* Change the IO capability from KeyboardDisplay
+ * to DisplayYesNo as it is not supported by BT spec. */
+ cp.capability = (conn->io_capability == 0x04) ?
+ HCI_IO_DISPLAY_YESNO : conn->io_capability;
+
+ /* If we are initiators, there is no remote information yet */
+ if (conn->remote_auth == 0xff) {
+ /* Request MITM protection if our IO caps allow it
+ * except for the no-bonding case.
+ */
+ if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
+ conn->auth_type != HCI_AT_NO_BONDING)
+ conn->auth_type |= 0x01;
+ } else {
+ conn->auth_type = hci_get_auth_req(conn);
+ }
+
+ /* If we're not bondable, force one of the non-bondable
+ * authentication requirement values.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE))
+ conn->auth_type &= HCI_AT_NO_BONDING_MITM;
+
+ cp.authentication = conn->auth_type;
+ cp.oob_data = bredr_oob_data_present(conn);
+
+ hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_REPLY,
+ sizeof(cp), &cp);
+ } else {
+ struct hci_cp_io_capability_neg_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ cp.reason = HCI_ERROR_PAIRING_NOT_ALLOWED;
+
+ hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_NEG_REPLY,
+ sizeof(cp), &cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_io_capa_reply *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ conn->remote_cap = ev->capability;
+ conn->remote_auth = ev->authentication;
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_user_confirm_request_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_confirm_req *ev = (void *) skb->data;
+ int loc_mitm, rem_mitm, confirm_hint = 0;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ loc_mitm = (conn->auth_type & 0x01);
+ rem_mitm = (conn->remote_auth & 0x01);
+
+ /* If we require MITM but the remote device can't provide that
+ * (it has NoInputNoOutput) then reject the confirmation
+ * request. We check the security level here since it doesn't
+ * necessarily match conn->auth_type.
+ */
+ if (conn->pending_sec_level > BT_SECURITY_MEDIUM &&
+ conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
+ BT_DBG("Rejecting request: remote device can't provide MITM");
+ hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ goto unlock;
+ }
+
+ /* If no side requires MITM protection; auto-accept */
+ if ((!loc_mitm || conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) &&
+ (!rem_mitm || conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)) {
+
+ /* If we're not the initiators request authorization to
+ * proceed from user space (mgmt_user_confirm with
+ * confirm_hint set to 1). The exception is if neither
+ * side had MITM or if the local IO capability is
+ * NoInputNoOutput, in which case we do auto-accept
+ */
+ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) &&
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
+ (loc_mitm || rem_mitm)) {
+ BT_DBG("Confirming auto-accept as acceptor");
+ confirm_hint = 1;
+ goto confirm;
+ }
+
+ BT_DBG("Auto-accept of user confirmation with %ums delay",
+ hdev->auto_accept_delay);
+
+ if (hdev->auto_accept_delay > 0) {
+ int delay = msecs_to_jiffies(hdev->auto_accept_delay);
+ queue_delayed_work(conn->hdev->workqueue,
+ &conn->auto_accept_work, delay);
+ goto unlock;
+ }
+
+ hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY,
+ sizeof(ev->bdaddr), &ev->bdaddr);
+ goto unlock;
+ }
+
+confirm:
+ mgmt_user_confirm_request(hdev, &ev->bdaddr, ACL_LINK, 0,
+ le32_to_cpu(ev->passkey), confirm_hint);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_user_passkey_request_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_passkey_req *ev = (void *) skb->data;
+
+ BT_DBG("%s", hdev->name);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_request(hdev, &ev->bdaddr, ACL_LINK, 0);
+}
+
+static void hci_user_passkey_notify_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_user_passkey_notify *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ return;
+
+ conn->passkey_notify = __le32_to_cpu(ev->passkey);
+ conn->passkey_entered = 0;
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_notify(hdev, &conn->dst, conn->type,
+ conn->dst_type, conn->passkey_notify,
+ conn->passkey_entered);
+}
+
+static void hci_keypress_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_keypress_notify *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ return;
+
+ switch (ev->type) {
+ case HCI_KEYPRESS_STARTED:
+ conn->passkey_entered = 0;
+ return;
+
+ case HCI_KEYPRESS_ENTERED:
+ conn->passkey_entered++;
+ break;
+
+ case HCI_KEYPRESS_ERASED:
+ conn->passkey_entered--;
+ break;
+
+ case HCI_KEYPRESS_CLEARED:
+ conn->passkey_entered = 0;
+ break;
+
+ case HCI_KEYPRESS_COMPLETED:
+ return;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
+ mgmt_user_passkey_notify(hdev, &conn->dst, conn->type,
+ conn->dst_type, conn->passkey_notify,
+ conn->passkey_entered);
+}
+
+static void hci_simple_pair_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_simple_pair_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (!conn)
+ goto unlock;
+
+ /* Reset the authentication requirement to unknown */
+ conn->remote_auth = 0xff;
+
+ /* To avoid duplicate auth_failed events to user space we check
+ * the HCI_CONN_AUTH_PEND flag which will be set if we
+ * initiated the authentication. A traditional auth_complete
+ * event gets always produced as initiator and is also mapped to
+ * the mgmt_auth_failed event */
+ if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) && ev->status)
+ mgmt_auth_failed(conn, ev->status);
+
+ hci_conn_drop(conn);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_host_features_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_host_features *ev = (void *) skb->data;
+ struct inquiry_entry *ie;
+ struct hci_conn *conn;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+ if (conn)
+ memcpy(conn->features[1], ev->features, 8);
+
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie)
+ ie->data.ssp_mode = (ev->features[0] & LMP_HOST_SSP);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_remote_oob_data_request_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_remote_oob_data_request *ev = (void *) skb->data;
+ struct oob_data *data;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ goto unlock;
+
+ data = hci_find_remote_oob_data(hdev, &ev->bdaddr, BDADDR_BREDR);
+ if (!data) {
+ struct hci_cp_remote_oob_data_neg_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY,
+ sizeof(cp), &cp);
+ goto unlock;
+ }
+
+ if (bredr_sc_enabled(hdev)) {
+ struct hci_cp_remote_oob_ext_data_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ memset(cp.hash192, 0, sizeof(cp.hash192));
+ memset(cp.rand192, 0, sizeof(cp.rand192));
+ } else {
+ memcpy(cp.hash192, data->hash192, sizeof(cp.hash192));
+ memcpy(cp.rand192, data->rand192, sizeof(cp.rand192));
+ }
+ memcpy(cp.hash256, data->hash256, sizeof(cp.hash256));
+ memcpy(cp.rand256, data->rand256, sizeof(cp.rand256));
+
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_EXT_DATA_REPLY,
+ sizeof(cp), &cp);
+ } else {
+ struct hci_cp_remote_oob_data_reply cp;
+
+ bacpy(&cp.bdaddr, &ev->bdaddr);
+ memcpy(cp.hash, data->hash192, sizeof(cp.hash));
+ memcpy(cp.rand, data->rand192, sizeof(cp.rand));
+
+ hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY,
+ sizeof(cp), &cp);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_phy_link_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_phy_link_complete *ev = (void *) skb->data;
+ struct hci_conn *hcon, *bredr_hcon;
+
+ BT_DBG("%s handle 0x%2.2x status 0x%2.2x", hdev->name, ev->phy_handle,
+ ev->status);
+
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ return;
+ }
+
+ if (ev->status) {
+ hci_conn_del(hcon);
+ hci_dev_unlock(hdev);
+ return;
+ }
+
+ bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon;
+
+ hcon->state = BT_CONNECTED;
+ bacpy(&hcon->dst, &bredr_hcon->dst);
+
+ hci_conn_hold(hcon);
+ hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
+ hci_conn_drop(hcon);
+
+ hci_debugfs_create_conn(hcon);
+ hci_conn_add_sysfs(hcon);
+
+ amp_physical_cfm(bredr_hcon, hcon);
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_loglink_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_logical_link_complete *ev = (void *) skb->data;
+ struct hci_conn *hcon;
+ struct hci_chan *hchan;
+ struct amp_mgr *mgr;
+
+ BT_DBG("%s log_handle 0x%4.4x phy_handle 0x%2.2x status 0x%2.2x",
+ hdev->name, le16_to_cpu(ev->handle), ev->phy_handle,
+ ev->status);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (!hcon)
+ return;
+
+ /* Create AMP hchan */
+ hchan = hci_chan_create(hcon);
+ if (!hchan)
+ return;
+
+ hchan->handle = le16_to_cpu(ev->handle);
+
+ BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan);
+
+ mgr = hcon->amp_mgr;
+ if (mgr && mgr->bredr_chan) {
+ struct l2cap_chan *bredr_chan = mgr->bredr_chan;
+
+ l2cap_chan_lock(bredr_chan);
+
+ bredr_chan->conn->mtu = hdev->block_mtu;
+ l2cap_logical_cfm(bredr_chan, hchan, 0);
+ hci_conn_hold(hcon);
+
+ l2cap_chan_unlock(bredr_chan);
+ }
+}
+
+static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_disconn_logical_link_complete *ev = (void *) skb->data;
+ struct hci_chan *hchan;
+
+ BT_DBG("%s log handle 0x%4.4x status 0x%2.2x", hdev->name,
+ le16_to_cpu(ev->handle), ev->status);
+
+ if (ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle));
+ if (!hchan)
+ goto unlock;
+
+ amp_destroy_logical_link(hchan, ev->reason);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_disconn_phy_link_complete *ev = (void *) skb->data;
+ struct hci_conn *hcon;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (hcon) {
+ hcon->state = BT_CLOSED;
+ hci_conn_del(hcon);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_complete *ev = (void *) skb->data;
+ struct hci_conn_params *params;
+ struct hci_conn *conn;
+ struct smp_irk *irk;
+ u8 addr_type;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ /* All controllers implicitly stop advertising in the event of a
+ * connection, so ensure that the state bit is cleared.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (!conn) {
+ conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role);
+ if (!conn) {
+ BT_ERR("No memory for new connection");
+ goto unlock;
+ }
+
+ conn->dst_type = ev->bdaddr_type;
+
+ /* If we didn't have a hci_conn object previously
+ * but we're in master role this must be something
+ * initiated using a white list. Since white list based
+ * connections are not "first class citizens" we don't
+ * have full tracking of them. Therefore, we go ahead
+ * with a "best effort" approach of determining the
+ * initiator address based on the HCI_PRIVACY flag.
+ */
+ if (conn->out) {
+ conn->resp_addr_type = ev->bdaddr_type;
+ bacpy(&conn->resp_addr, &ev->bdaddr);
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, &hdev->rpa);
+ } else {
+ hci_copy_identity_address(hdev,
+ &conn->init_addr,
+ &conn->init_addr_type);
+ }
+ }
+ } else {
+ cancel_delayed_work(&conn->le_conn_timeout);
+ }
+
+ if (!conn->out) {
+ /* Set the responder (our side) address type based on
+ * the advertising address type.
+ */
+ conn->resp_addr_type = hdev->adv_addr_type;
+ if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM)
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ else
+ bacpy(&conn->resp_addr, &hdev->bdaddr);
+
+ conn->init_addr_type = ev->bdaddr_type;
+ bacpy(&conn->init_addr, &ev->bdaddr);
+
+ /* For incoming connections, set the default minimum
+ * and maximum connection interval. They will be used
+ * to check if the parameters are in range and if not
+ * trigger the connection update procedure.
+ */
+ conn->le_conn_min_interval = hdev->le_conn_min_interval;
+ conn->le_conn_max_interval = hdev->le_conn_max_interval;
+ }
+
+ /* Lookup the identity address from the stored connection
+ * address and address type.
+ *
+ * When establishing connections to an identity address, the
+ * connection procedure will store the resolvable random
+ * address first. Now if it can be converted back into the
+ * identity address, start using the identity address from
+ * now on.
+ */
+ irk = hci_get_irk(hdev, &conn->dst, conn->dst_type);
+ if (irk) {
+ bacpy(&conn->dst, &irk->bdaddr);
+ conn->dst_type = irk->addr_type;
+ }
+
+ if (ev->status) {
+ hci_le_conn_failed(conn, ev->status);
+ goto unlock;
+ }
+
+ if (conn->dst_type == ADDR_LE_DEV_PUBLIC)
+ addr_type = BDADDR_LE_PUBLIC;
+ else
+ addr_type = BDADDR_LE_RANDOM;
+
+ /* Drop the connection if the device is blocked */
+ if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) {
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ mgmt_device_connected(hdev, conn, 0, NULL, 0);
+
+ conn->sec_level = BT_SECURITY_LOW;
+ conn->handle = __le16_to_cpu(ev->handle);
+ conn->state = BT_CONFIG;
+
+ conn->le_conn_interval = le16_to_cpu(ev->interval);
+ conn->le_conn_latency = le16_to_cpu(ev->latency);
+ conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout);
+
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+
+ if (!ev->status) {
+ /* The remote features procedure is defined for master
+ * role only. So only in case of an initiated connection
+ * request the remote features.
+ *
+ * If the local controller supports slave-initiated features
+ * exchange, then requesting the remote features in slave
+ * role is possible. Otherwise just transition into the
+ * connected state without requesting the remote features.
+ */
+ if (conn->out ||
+ (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) {
+ struct hci_cp_le_read_remote_features cp;
+
+ cp.handle = __cpu_to_le16(conn->handle);
+
+ hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
+ sizeof(cp), &cp);
+
+ hci_conn_hold(conn);
+ } else {
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, ev->status);
+ }
+ } else {
+ hci_connect_cfm(conn, ev->status);
+ }
+
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
+ conn->dst_type);
+ if (params) {
+ list_del_init(&params->action);
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
+ }
+
+unlock:
+ hci_update_background_scan(hdev);
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_update_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ conn->le_conn_interval = le16_to_cpu(ev->interval);
+ conn->le_conn_latency = le16_to_cpu(ev->latency);
+ conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+/* This function requires the caller holds hdev->lock */
+static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
+ bdaddr_t *addr,
+ u8 addr_type, u8 adv_type)
+{
+ struct hci_conn *conn;
+ struct hci_conn_params *params;
+
+ /* If the event is not connectable don't proceed further */
+ if (adv_type != LE_ADV_IND && adv_type != LE_ADV_DIRECT_IND)
+ return NULL;
+
+ /* Ignore if the device is blocked */
+ if (hci_bdaddr_list_lookup(&hdev->blacklist, addr, addr_type))
+ return NULL;
+
+ /* Most controller will fail if we try to create new connections
+ * while we have an existing one in slave role.
+ */
+ if (hdev->conn_hash.le_num_slave > 0)
+ return NULL;
+
+ /* If we're not connectable only connect devices that we have in
+ * our pend_le_conns list.
+ */
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ addr, addr_type);
+ if (!params)
+ return NULL;
+
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ /* Only devices advertising with ADV_DIRECT_IND are
+ * triggering a connection attempt. This is allowing
+ * incoming connections from slave devices.
+ */
+ if (adv_type != LE_ADV_DIRECT_IND)
+ return NULL;
+ break;
+ case HCI_AUTO_CONN_ALWAYS:
+ /* Devices advertising with ADV_IND or ADV_DIRECT_IND
+ * are triggering a connection attempt. This means
+ * that incoming connectioms from slave device are
+ * accepted and also outgoing connections to slave
+ * devices are established when found.
+ */
+ break;
+ default:
+ return NULL;
+ }
+
+ conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
+ HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER);
+ if (!IS_ERR(conn)) {
+ /* Store the pointer since we don't really have any
+ * other owner of the object besides the params that
+ * triggered it. This way we can abort the connection if
+ * the parameters get removed and keep the reference
+ * count consistent once the connection is established.
+ */
+ params->conn = hci_conn_get(conn);
+ return conn;
+ }
+
+ switch (PTR_ERR(conn)) {
+ case -EBUSY:
+ /* If hci_connect() returns -EBUSY it means there is already
+ * an LE connection attempt going on. Since controllers don't
+ * support more than one connection attempt at the time, we
+ * don't consider this an error case.
+ */
+ break;
+ default:
+ BT_DBG("Failed to connect: err %ld", PTR_ERR(conn));
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
+ u8 bdaddr_type, bdaddr_t *direct_addr,
+ u8 direct_addr_type, s8 rssi, u8 *data, u8 len)
+{
+ struct discovery_state *d = &hdev->discovery;
+ struct smp_irk *irk;
+ struct hci_conn *conn;
+ bool match;
+ u32 flags;
+
+ /* If the direct address is present, then this report is from
+ * a LE Direct Advertising Report event. In that case it is
+ * important to see if the address is matching the local
+ * controller address.
+ */
+ if (direct_addr) {
+ /* Only resolvable random addresses are valid for these
+ * kind of reports and others can be ignored.
+ */
+ if (!hci_bdaddr_is_rpa(direct_addr, direct_addr_type))
+ return;
+
+ /* If the controller is not using resolvable random
+ * addresses, then this report can be ignored.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+ return;
+
+ /* If the local IRK of the controller does not match
+ * with the resolvable random address provided, then
+ * this report can be ignored.
+ */
+ if (!smp_irk_matches(hdev, hdev->irk, direct_addr))
+ return;
+ }
+
+ /* Check if we need to convert to identity address */
+ irk = hci_get_irk(hdev, bdaddr, bdaddr_type);
+ if (irk) {
+ bdaddr = &irk->bdaddr;
+ bdaddr_type = irk->addr_type;
+ }
+
+ /* Check if we have been requested to connect to this device */
+ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type);
+ if (conn && type == LE_ADV_IND) {
+ /* Store report for later inclusion by
+ * mgmt_device_connected
+ */
+ memcpy(conn->le_adv_data, data, len);
+ conn->le_adv_data_len = len;
+ }
+
+ /* Passive scanning shouldn't trigger any device found events,
+ * except for devices marked as CONN_REPORT for which we do send
+ * device found events.
+ */
+ if (hdev->le_scan_type == LE_SCAN_PASSIVE) {
+ if (type == LE_ADV_DIRECT_IND)
+ return;
+
+ if (!hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ bdaddr, bdaddr_type))
+ return;
+
+ if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
+ flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
+ else
+ flags = 0;
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0);
+ return;
+ }
+
+ /* When receiving non-connectable or scannable undirected
+ * advertising reports, this means that the remote device is
+ * not connectable and then clearly indicate this in the
+ * device found event.
+ *
+ * When receiving a scan response, then there is no way to
+ * know if the remote device is connectable or not. However
+ * since scan responses are merged with a previously seen
+ * advertising report, the flags field from that report
+ * will be used.
+ *
+ * In the really unlikely case that a controller get confused
+ * and just sends a scan response event, then it is marked as
+ * not connectable as well.
+ */
+ if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND ||
+ type == LE_ADV_SCAN_RSP)
+ flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
+ else
+ flags = 0;
+
+ /* If there's nothing pending either store the data from this
+ * event or send an immediate device found event if the data
+ * should not be stored for later.
+ */
+ if (!has_pending_adv_report(hdev)) {
+ /* If the report will trigger a SCAN_REQ store it for
+ * later merging.
+ */
+ if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+ store_pending_adv_report(hdev, bdaddr, bdaddr_type,
+ rssi, flags, data, len);
+ return;
+ }
+
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0);
+ return;
+ }
+
+ /* Check if the pending report is for the same device as the new one */
+ match = (!bacmp(bdaddr, &d->last_adv_addr) &&
+ bdaddr_type == d->last_adv_addr_type);
+
+ /* If the pending data doesn't match this report or this isn't a
+ * scan response (e.g. we got a duplicate ADV_IND) then force
+ * sending of the pending data.
+ */
+ if (type != LE_ADV_SCAN_RSP || !match) {
+ /* Send out whatever is in the cache, but skip duplicates */
+ if (!match)
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL,
+ d->last_adv_rssi, d->last_adv_flags,
+ d->last_adv_data,
+ d->last_adv_data_len, NULL, 0);
+
+ /* If the new report will trigger a SCAN_REQ store it for
+ * later merging.
+ */
+ if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+ store_pending_adv_report(hdev, bdaddr, bdaddr_type,
+ rssi, flags, data, len);
+ return;
+ }
+
+ /* The advertising reports cannot be merged, so clear
+ * the pending report and send out a device found event.
+ */
+ clear_pending_adv_report(hdev);
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0);
+ return;
+ }
+
+ /* If we get here we've got a pending ADV_IND or ADV_SCAN_IND and
+ * the new event is a SCAN_RSP. We can therefore proceed with
+ * sending a merged device found event.
+ */
+ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+ d->last_adv_addr_type, NULL, rssi, d->last_adv_flags,
+ d->last_adv_data, d->last_adv_data_len, data, len);
+ clear_pending_adv_report(hdev);
+}
+
+static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 num_reports = skb->data[0];
+ void *ptr = &skb->data[1];
+
+ hci_dev_lock(hdev);
+
+ while (num_reports--) {
+ struct hci_ev_le_advertising_info *ev = ptr;
+ s8 rssi;
+
+ rssi = ev->data[ev->length];
+ process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
+ ev->bdaddr_type, NULL, 0, rssi,
+ ev->data, ev->length);
+
+ ptr += sizeof(*ev) + ev->length + 1;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_remote_feat_complete *ev = (void *)skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn) {
+ if (!ev->status)
+ memcpy(conn->features[0], ev->features, 8);
+
+ if (conn->state == BT_CONFIG) {
+ __u8 status;
+
+ /* If the local controller supports slave-initiated
+ * features exchange, but the remote controller does
+ * not, then it is possible that the error code 0x1a
+ * for unsupported remote feature gets returned.
+ *
+ * In this specific case, allow the connection to
+ * transition into connected state and mark it as
+ * successful.
+ */
+ if ((hdev->le_features[0] & HCI_LE_SLAVE_FEATURES) &&
+ !conn->out && ev->status == 0x1a)
+ status = 0x00;
+ else
+ status = ev->status;
+
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
+ hci_conn_drop(conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_ltk_req *ev = (void *) skb->data;
+ struct hci_cp_le_ltk_reply cp;
+ struct hci_cp_le_ltk_neg_reply neg;
+ struct hci_conn *conn;
+ struct smp_ltk *ltk;
+
+ BT_DBG("%s handle 0x%4.4x", hdev->name, __le16_to_cpu(ev->handle));
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (conn == NULL)
+ goto not_found;
+
+ ltk = hci_find_ltk(hdev, &conn->dst, conn->dst_type, conn->role);
+ if (!ltk)
+ goto not_found;
+
+ if (smp_ltk_is_sc(ltk)) {
+ /* With SC both EDiv and Rand are set to zero */
+ if (ev->ediv || ev->rand)
+ goto not_found;
+ } else {
+ /* For non-SC keys check that EDiv and Rand match */
+ if (ev->ediv != ltk->ediv || ev->rand != ltk->rand)
+ goto not_found;
+ }
+
+ memcpy(cp.ltk, ltk->val, sizeof(ltk->val));
+ cp.handle = cpu_to_le16(conn->handle);
+
+ conn->pending_sec_level = smp_ltk_sec_level(ltk);
+
+ conn->enc_key_size = ltk->enc_size;
+
+ hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp);
+
+ /* Ref. Bluetooth Core SPEC pages 1975 and 2004. STK is a
+ * temporary key used to encrypt a connection following
+ * pairing. It is used during the Encrypted Session Setup to
+ * distribute the keys. Later, security can be re-established
+ * using a distributed LTK.
+ */
+ if (ltk->type == SMP_STK) {
+ set_bit(HCI_CONN_STK_ENCRYPT, &conn->flags);
+ list_del_rcu(&ltk->list);
+ kfree_rcu(ltk, rcu);
+ } else {
+ clear_bit(HCI_CONN_STK_ENCRYPT, &conn->flags);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return;
+
+not_found:
+ neg.handle = ev->handle;
+ hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(neg), &neg);
+ hci_dev_unlock(hdev);
+}
+
+static void send_conn_param_neg_reply(struct hci_dev *hdev, u16 handle,
+ u8 reason)
+{
+ struct hci_cp_le_conn_param_req_neg_reply cp;
+
+ cp.handle = cpu_to_le16(handle);
+ cp.reason = reason;
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_NEG_REPLY, sizeof(cp),
+ &cp);
+}
+
+static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_remote_conn_param_req *ev = (void *) skb->data;
+ struct hci_cp_le_conn_param_req_reply cp;
+ struct hci_conn *hcon;
+ u16 handle, min, max, latency, timeout;
+
+ handle = le16_to_cpu(ev->handle);
+ min = le16_to_cpu(ev->interval_min);
+ max = le16_to_cpu(ev->interval_max);
+ latency = le16_to_cpu(ev->latency);
+ timeout = le16_to_cpu(ev->timeout);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon || hcon->state != BT_CONNECTED)
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_UNKNOWN_CONN_ID);
+
+ if (hci_check_conn_params(min, max, latency, timeout))
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+
+ if (hcon->role == HCI_ROLE_MASTER) {
+ struct hci_conn_params *params;
+ u8 store_hint;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &hcon->dst,
+ hcon->dst_type);
+ if (params) {
+ params->conn_min_interval = min;
+ params->conn_max_interval = max;
+ params->conn_latency = latency;
+ params->supervision_timeout = timeout;
+ store_hint = 0x01;
+ } else{
+ store_hint = 0x00;
+ }
+
+ hci_dev_unlock(hdev);
+
+ mgmt_new_conn_param(hdev, &hcon->dst, hcon->dst_type,
+ store_hint, min, max, latency, timeout);
+ }
+
+ cp.handle = ev->handle;
+ cp.interval_min = ev->interval_min;
+ cp.interval_max = ev->interval_max;
+ cp.latency = ev->latency;
+ cp.timeout = ev->timeout;
+ cp.min_ce_len = 0;
+ cp.max_ce_len = 0;
+
+ hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_REPLY, sizeof(cp), &cp);
+}
+
+static void hci_le_direct_adv_report_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u8 num_reports = skb->data[0];
+ void *ptr = &skb->data[1];
+
+ hci_dev_lock(hdev);
+
+ while (num_reports--) {
+ struct hci_ev_le_direct_adv_info *ev = ptr;
+
+ process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
+ ev->bdaddr_type, &ev->direct_addr,
+ ev->direct_addr_type, ev->rssi, NULL, 0);
+
+ ptr += sizeof(*ev);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_meta *le_ev = (void *) skb->data;
+
+ skb_pull(skb, sizeof(*le_ev));
+
+ switch (le_ev->subevent) {
+ case HCI_EV_LE_CONN_COMPLETE:
+ hci_le_conn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_CONN_UPDATE_COMPLETE:
+ hci_le_conn_update_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_ADVERTISING_REPORT:
+ hci_le_adv_report_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_REMOTE_FEAT_COMPLETE:
+ hci_le_remote_feat_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_LTK_REQ:
+ hci_le_ltk_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_REMOTE_CONN_PARAM_REQ:
+ hci_le_remote_conn_param_req_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_DIRECT_ADV_REPORT:
+ hci_le_direct_adv_report_evt(hdev, skb);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_channel_selected *ev = (void *) skb->data;
+ struct hci_conn *hcon;
+
+ BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
+
+ skb_pull(skb, sizeof(*ev));
+
+ hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
+ if (!hcon)
+ return;
+
+ amp_read_loc_assoc_final_data(hdev, hcon);
+}
+
+static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
+ u8 event, struct sk_buff *skb)
+{
+ struct hci_ev_cmd_complete *ev;
+ struct hci_event_hdr *hdr;
+
+ if (!skb)
+ return false;
+
+ if (skb->len < sizeof(*hdr)) {
+ BT_ERR("Too short HCI event");
+ return false;
+ }
+
+ hdr = (void *) skb->data;
+ skb_pull(skb, HCI_EVENT_HDR_SIZE);
+
+ if (event) {
+ if (hdr->evt != event)
+ return false;
+ return true;
+ }
+
+ if (hdr->evt != HCI_EV_CMD_COMPLETE) {
+ BT_DBG("Last event is not cmd complete (0x%2.2x)", hdr->evt);
+ return false;
+ }
+
+ if (skb->len < sizeof(*ev)) {
+ BT_ERR("Too short cmd_complete event");
+ return false;
+ }
+
+ ev = (void *) skb->data;
+ skb_pull(skb, sizeof(*ev));
+
+ if (opcode != __le16_to_cpu(ev->opcode)) {
+ BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode,
+ __le16_to_cpu(ev->opcode));
+ return false;
+ }
+
+ return true;
+}
+
+void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_event_hdr *hdr = (void *) skb->data;
+ hci_req_complete_t req_complete = NULL;
+ hci_req_complete_skb_t req_complete_skb = NULL;
+ struct sk_buff *orig_skb = NULL;
+ u8 status = 0, event = hdr->evt, req_evt = 0;
+ u16 opcode = HCI_OP_NOP;
+
+ if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->req.event == event) {
+ struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data;
+ opcode = __le16_to_cpu(cmd_hdr->opcode);
+ hci_req_cmd_complete(hdev, opcode, status, &req_complete,
+ &req_complete_skb);
+ req_evt = event;
+ }
+
+ /* If it looks like we might end up having to call
+ * req_complete_skb, store a pristine copy of the skb since the
+ * various handlers may modify the original one through
+ * skb_pull() calls, etc.
+ */
+ if (req_complete_skb || event == HCI_EV_CMD_STATUS ||
+ event == HCI_EV_CMD_COMPLETE)
+ orig_skb = skb_clone(skb, GFP_KERNEL);
+
+ skb_pull(skb, HCI_EVENT_HDR_SIZE);
+
+ switch (event) {
+ case HCI_EV_INQUIRY_COMPLETE:
+ hci_inquiry_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_INQUIRY_RESULT:
+ hci_inquiry_result_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CONN_COMPLETE:
+ hci_conn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CONN_REQUEST:
+ hci_conn_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_DISCONN_COMPLETE:
+ hci_disconn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_AUTH_COMPLETE:
+ hci_auth_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_REMOTE_NAME:
+ hci_remote_name_evt(hdev, skb);
+ break;
+
+ case HCI_EV_ENCRYPT_CHANGE:
+ hci_encrypt_change_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CHANGE_LINK_KEY_COMPLETE:
+ hci_change_link_key_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_REMOTE_FEATURES:
+ hci_remote_features_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CMD_COMPLETE:
+ hci_cmd_complete_evt(hdev, skb, &opcode, &status,
+ &req_complete, &req_complete_skb);
+ break;
+
+ case HCI_EV_CMD_STATUS:
+ hci_cmd_status_evt(hdev, skb, &opcode, &status, &req_complete,
+ &req_complete_skb);
+ break;
+
+ case HCI_EV_HARDWARE_ERROR:
+ hci_hardware_error_evt(hdev, skb);
+ break;
+
+ case HCI_EV_ROLE_CHANGE:
+ hci_role_change_evt(hdev, skb);
+ break;
+
+ case HCI_EV_NUM_COMP_PKTS:
+ hci_num_comp_pkts_evt(hdev, skb);
+ break;
+
+ case HCI_EV_MODE_CHANGE:
+ hci_mode_change_evt(hdev, skb);
+ break;
+
+ case HCI_EV_PIN_CODE_REQ:
+ hci_pin_code_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LINK_KEY_REQ:
+ hci_link_key_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LINK_KEY_NOTIFY:
+ hci_link_key_notify_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CLOCK_OFFSET:
+ hci_clock_offset_evt(hdev, skb);
+ break;
+
+ case HCI_EV_PKT_TYPE_CHANGE:
+ hci_pkt_type_change_evt(hdev, skb);
+ break;
+
+ case HCI_EV_PSCAN_REP_MODE:
+ hci_pscan_rep_mode_evt(hdev, skb);
+ break;
+
+ case HCI_EV_INQUIRY_RESULT_WITH_RSSI:
+ hci_inquiry_result_with_rssi_evt(hdev, skb);
+ break;
+
+ case HCI_EV_REMOTE_EXT_FEATURES:
+ hci_remote_ext_features_evt(hdev, skb);
+ break;
+
+ case HCI_EV_SYNC_CONN_COMPLETE:
+ hci_sync_conn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_EXTENDED_INQUIRY_RESULT:
+ hci_extended_inquiry_result_evt(hdev, skb);
+ break;
+
+ case HCI_EV_KEY_REFRESH_COMPLETE:
+ hci_key_refresh_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_IO_CAPA_REQUEST:
+ hci_io_capa_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_IO_CAPA_REPLY:
+ hci_io_capa_reply_evt(hdev, skb);
+ break;
+
+ case HCI_EV_USER_CONFIRM_REQUEST:
+ hci_user_confirm_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_USER_PASSKEY_REQUEST:
+ hci_user_passkey_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_USER_PASSKEY_NOTIFY:
+ hci_user_passkey_notify_evt(hdev, skb);
+ break;
+
+ case HCI_EV_KEYPRESS_NOTIFY:
+ hci_keypress_notify_evt(hdev, skb);
+ break;
+
+ case HCI_EV_SIMPLE_PAIR_COMPLETE:
+ hci_simple_pair_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_REMOTE_HOST_FEATURES:
+ hci_remote_host_features_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_META:
+ hci_le_meta_evt(hdev, skb);
+ break;
+
+ case HCI_EV_CHANNEL_SELECTED:
+ hci_chan_selected_evt(hdev, skb);
+ break;
+
+ case HCI_EV_REMOTE_OOB_DATA_REQUEST:
+ hci_remote_oob_data_request_evt(hdev, skb);
+ break;
+
+ case HCI_EV_PHY_LINK_COMPLETE:
+ hci_phy_link_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LOGICAL_LINK_COMPLETE:
+ hci_loglink_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE:
+ hci_disconn_loglink_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_DISCONN_PHY_LINK_COMPLETE:
+ hci_disconn_phylink_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_NUM_COMP_BLOCKS:
+ hci_num_comp_blocks_evt(hdev, skb);
+ break;
+
+ default:
+ BT_DBG("%s event 0x%2.2x", hdev->name, event);
+ break;
+ }
+
+ if (req_complete) {
+ req_complete(hdev, status, opcode);
+ } else if (req_complete_skb) {
+ if (!hci_get_cmd_complete(hdev, opcode, req_evt, orig_skb)) {
+ kfree_skb(orig_skb);
+ orig_skb = NULL;
+ }
+ req_complete_skb(hdev, status, opcode, orig_skb);
+ }
+
+ kfree_skb(orig_skb);
+ kfree_skb(skb);
+ hdev->stat.evt_rx++;
+}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
new file mode 100644
index 000000000..d6025d6e6
--- /dev/null
+++ b/net/bluetooth/hci_request.c
@@ -0,0 +1,568 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "smp.h"
+#include "hci_request.h"
+
+void hci_req_init(struct hci_request *req, struct hci_dev *hdev)
+{
+ skb_queue_head_init(&req->cmd_q);
+ req->hdev = hdev;
+ req->err = 0;
+}
+
+static int req_run(struct hci_request *req, hci_req_complete_t complete,
+ hci_req_complete_skb_t complete_skb)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ BT_DBG("length %u", skb_queue_len(&req->cmd_q));
+
+ /* If an error occurred during request building, remove all HCI
+ * commands queued on the HCI request queue.
+ */
+ if (req->err) {
+ skb_queue_purge(&req->cmd_q);
+ return req->err;
+ }
+
+ /* Do not allow empty requests */
+ if (skb_queue_empty(&req->cmd_q))
+ return -ENODATA;
+
+ skb = skb_peek_tail(&req->cmd_q);
+ bt_cb(skb)->req.complete = complete;
+ bt_cb(skb)->req.complete_skb = complete_skb;
+
+ spin_lock_irqsave(&hdev->cmd_q.lock, flags);
+ skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
+ spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
+
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+
+ return 0;
+}
+
+int hci_req_run(struct hci_request *req, hci_req_complete_t complete)
+{
+ return req_run(req, complete, NULL);
+}
+
+int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete)
+{
+ return req_run(req, NULL, complete);
+}
+
+struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param)
+{
+ int len = HCI_COMMAND_HDR_SIZE + plen;
+ struct hci_command_hdr *hdr;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(len, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(opcode);
+ hdr->plen = plen;
+
+ if (plen)
+ memcpy(skb_put(skb, plen), param, plen);
+
+ BT_DBG("skb len %d", skb->len);
+
+ bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
+ bt_cb(skb)->opcode = opcode;
+
+ return skb;
+}
+
+/* Queue a command to an asynchronous HCI request */
+void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
+ const void *param, u8 event)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct sk_buff *skb;
+
+ BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
+
+ /* If an error occurred during request building, there is no point in
+ * queueing the HCI command. We can simply return.
+ */
+ if (req->err)
+ return;
+
+ skb = hci_prepare_cmd(hdev, opcode, plen, param);
+ if (!skb) {
+ BT_ERR("%s no memory for command (opcode 0x%4.4x)",
+ hdev->name, opcode);
+ req->err = -ENOMEM;
+ return;
+ }
+
+ if (skb_queue_empty(&req->cmd_q))
+ bt_cb(skb)->req.start = true;
+
+ bt_cb(skb)->req.event = event;
+
+ skb_queue_tail(&req->cmd_q, skb);
+}
+
+void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
+ const void *param)
+{
+ hci_req_add_ev(req, opcode, plen, param, 0);
+}
+
+void hci_req_add_le_scan_disable(struct hci_request *req)
+{
+ struct hci_cp_le_set_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_DISABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+}
+
+static void add_to_white_list(struct hci_request *req,
+ struct hci_conn_params *params)
+{
+ struct hci_cp_le_add_to_white_list cp;
+
+ cp.bdaddr_type = params->addr_type;
+ bacpy(&cp.bdaddr, &params->addr);
+
+ hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp);
+}
+
+static u8 update_white_list(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_conn_params *params;
+ struct bdaddr_list *b;
+ uint8_t white_list_entries = 0;
+
+ /* Go through the current white list programmed into the
+ * controller one by one and check if that address is still
+ * in the list of pending connections or list of devices to
+ * report. If not present in either list, then queue the
+ * command to remove it from the controller.
+ */
+ list_for_each_entry(b, &hdev->le_white_list, list) {
+ struct hci_cp_le_del_from_white_list cp;
+
+ if (hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr, b->bdaddr_type) ||
+ hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr, b->bdaddr_type)) {
+ white_list_entries++;
+ continue;
+ }
+
+ cp.bdaddr_type = b->bdaddr_type;
+ bacpy(&cp.bdaddr, &b->bdaddr);
+
+ hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
+ sizeof(cp), &cp);
+ }
+
+ /* Since all no longer valid white list entries have been
+ * removed, walk through the list of pending connections
+ * and ensure that any new device gets programmed into
+ * the controller.
+ *
+ * If the list of the devices is larger than the list of
+ * available white list entries in the controller, then
+ * just abort and return filer policy value to not use the
+ * white list.
+ */
+ list_for_each_entry(params, &hdev->pend_le_conns, action) {
+ if (hci_bdaddr_list_lookup(&hdev->le_white_list,
+ &params->addr, params->addr_type))
+ continue;
+
+ if (white_list_entries >= hdev->le_white_list_size) {
+ /* Select filter policy to accept all advertising */
+ return 0x00;
+ }
+
+ if (hci_find_irk_by_addr(hdev, &params->addr,
+ params->addr_type)) {
+ /* White list can not be used with RPAs */
+ return 0x00;
+ }
+
+ white_list_entries++;
+ add_to_white_list(req, params);
+ }
+
+ /* After adding all new pending connections, walk through
+ * the list of pending reports and also add these to the
+ * white list if there is still space.
+ */
+ list_for_each_entry(params, &hdev->pend_le_reports, action) {
+ if (hci_bdaddr_list_lookup(&hdev->le_white_list,
+ &params->addr, params->addr_type))
+ continue;
+
+ if (white_list_entries >= hdev->le_white_list_size) {
+ /* Select filter policy to accept all advertising */
+ return 0x00;
+ }
+
+ if (hci_find_irk_by_addr(hdev, &params->addr,
+ params->addr_type)) {
+ /* White list can not be used with RPAs */
+ return 0x00;
+ }
+
+ white_list_entries++;
+ add_to_white_list(req, params);
+ }
+
+ /* Select filter policy to use white list */
+ return 0x01;
+}
+
+void hci_req_add_le_passive_scan(struct hci_request *req)
+{
+ struct hci_cp_le_set_scan_param param_cp;
+ struct hci_cp_le_set_scan_enable enable_cp;
+ struct hci_dev *hdev = req->hdev;
+ u8 own_addr_type;
+ u8 filter_policy;
+
+ /* Set require_privacy to false since no SCAN_REQ are send
+ * during passive scanning. Not using an non-resolvable address
+ * here is important so that peer devices using direct
+ * advertising with our address will be correctly reported
+ * by the controller.
+ */
+ if (hci_update_random_address(req, false, &own_addr_type))
+ return;
+
+ /* Adding or removing entries from the white list must
+ * happen before enabling scanning. The controller does
+ * not allow white list modification while scanning.
+ */
+ filter_policy = update_white_list(req);
+
+ /* When the controller is using random resolvable addresses and
+ * with that having LE privacy enabled, then controllers with
+ * Extended Scanner Filter Policies support can now enable support
+ * for handling directed advertising.
+ *
+ * So instead of using filter polices 0x00 (no whitelist)
+ * and 0x01 (whitelist enabled) use the new filter policies
+ * 0x02 (no whitelist) and 0x03 (whitelist enabled).
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY) &&
+ (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
+ filter_policy |= 0x02;
+
+ memset(&param_cp, 0, sizeof(param_cp));
+ param_cp.type = LE_SCAN_PASSIVE;
+ param_cp.interval = cpu_to_le16(hdev->le_scan_interval);
+ param_cp.window = cpu_to_le16(hdev->le_scan_window);
+ param_cp.own_address_type = own_addr_type;
+ param_cp.filter_policy = filter_policy;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
+ &param_cp);
+
+ memset(&enable_cp, 0, sizeof(enable_cp));
+ enable_cp.enable = LE_SCAN_ENABLE;
+ enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
+ &enable_cp);
+}
+
+static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* If we're advertising or initiating an LE connection we can't
+ * go ahead and change the random address at this time. This is
+ * because the eventual initiator address used for the
+ * subsequently created connection will be undefined (some
+ * controllers use the new address and others the one we had
+ * when the operation started).
+ *
+ * In this kind of scenario skip the update and let the random
+ * address be updated at the next cycle.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
+ hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+ BT_DBG("Deferring random address update");
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ return;
+ }
+
+ hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
+}
+
+int hci_update_random_address(struct hci_request *req, bool require_privacy,
+ u8 *own_addr_type)
+{
+ struct hci_dev *hdev = req->hdev;
+ int err;
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired or there is something else than
+ * the current RPA in use, then generate a new one.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ int to;
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
+ !bacmp(&hdev->random_addr, &hdev->rpa))
+ return 0;
+
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ BT_ERR("%s failed to generate new RPA", hdev->name);
+ return err;
+ }
+
+ set_random_addr(req, &hdev->rpa);
+
+ to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
+ queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to);
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use an non-resolvable private address. This is useful for active
+ * scanning and non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ set_random_addr(req, &nrpa);
+ return 0;
+ }
+
+ /* If forcing static address is in use or there is no public
+ * address use the static address as random address (but skip
+ * the HCI command if the current random address is already the
+ * static one.
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller
+ * and a static address has been configured, then use that
+ * address instead of the public BR/EDR address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ if (bacmp(&hdev->static_addr, &hdev->random_addr))
+ hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
+ &hdev->static_addr);
+ return 0;
+ }
+
+ /* Neither privacy nor static address is being used so use a
+ * public address.
+ */
+ *own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ return 0;
+}
+
+static bool disconnected_whitelist_entries(struct hci_dev *hdev)
+{
+ struct bdaddr_list *b;
+
+ list_for_each_entry(b, &hdev->whitelist, list) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr);
+ if (!conn)
+ return true;
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ return true;
+ }
+
+ return false;
+}
+
+void __hci_update_page_scan(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 scan;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return;
+
+ if (!hdev_is_powered(hdev))
+ return;
+
+ if (mgmt_powering_down(hdev))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
+ disconnected_whitelist_entries(hdev))
+ scan = SCAN_PAGE;
+ else
+ scan = SCAN_DISABLED;
+
+ if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ scan |= SCAN_INQUIRY;
+
+ hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+}
+
+void hci_update_page_scan(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+ __hci_update_page_scan(&req);
+ hci_req_run(&req, NULL);
+}
+
+/* This function controls the background scanning based on hdev->pend_le_conns
+ * list. If there are pending LE connection we start the background scanning,
+ * otherwise we stop it.
+ *
+ * This function requires the caller holds hdev->lock.
+ */
+void __hci_update_background_scan(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_conn *conn;
+
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return;
+
+ /* No point in doing scanning if LE support hasn't been enabled */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ /* If discovery is active don't interfere with it */
+ if (hdev->discovery.state != DISCOVERY_STOPPED)
+ return;
+
+ /* Reset RSSI and UUID filters when starting background scanning
+ * since these filters are meant for service discovery only.
+ *
+ * The Start Discovery and Start Service Discovery operations
+ * ensure to set proper values for RSSI threshold and UUID
+ * filter list. So it is safe to just reset them here.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ if (list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports)) {
+ /* If there is no pending LE connections or devices
+ * to be scanned for, we should stop the background
+ * scanning.
+ */
+
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return;
+
+ hci_req_add_le_scan_disable(req);
+
+ BT_DBG("%s stopping background scanning", hdev->name);
+ } else {
+ /* If there is at least one pending LE connection, we should
+ * keep the background scan running.
+ */
+
+ /* If controller is connecting, we should not start scanning
+ * since some controllers are not able to scan and connect at
+ * the same time.
+ */
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (conn)
+ return;
+
+ /* If controller is currently scanning, we stop it to ensure we
+ * don't miss any advertising (due to duplicates filter).
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ hci_req_add_le_scan_disable(req);
+
+ hci_req_add_le_passive_scan(req);
+
+ BT_DBG("%s starting background scanning", hdev->name);
+ }
+}
+
+static void update_background_scan_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ if (status)
+ BT_DBG("HCI request failed to update background scanning: "
+ "status 0x%2.2x", status);
+}
+
+void hci_update_background_scan(struct hci_dev *hdev)
+{
+ int err;
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+
+ __hci_update_background_scan(&req);
+
+ err = hci_req_run(&req, update_background_scan_complete);
+ if (err && err != -ENODATA)
+ BT_ERR("Failed to run HCI request: err %d", err);
+}
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
new file mode 100644
index 000000000..bf6df92f4
--- /dev/null
+++ b/net/bluetooth/hci_request.h
@@ -0,0 +1,57 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+struct hci_request {
+ struct hci_dev *hdev;
+ struct sk_buff_head cmd_q;
+
+ /* If something goes wrong when building the HCI request, the error
+ * value is stored in this field.
+ */
+ int err;
+};
+
+void hci_req_init(struct hci_request *req, struct hci_dev *hdev);
+int hci_req_run(struct hci_request *req, hci_req_complete_t complete);
+int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete);
+void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
+ const void *param);
+void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
+ const void *param, u8 event);
+void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb);
+
+struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param);
+
+void hci_req_add_le_scan_disable(struct hci_request *req);
+void hci_req_add_le_passive_scan(struct hci_request *req);
+
+void hci_update_page_scan(struct hci_dev *hdev);
+void __hci_update_page_scan(struct hci_request *req);
+
+int hci_update_random_address(struct hci_request *req, bool require_privacy,
+ u8 *own_addr_type);
+
+void hci_update_background_scan(struct hci_dev *hdev);
+void __hci_update_background_scan(struct hci_request *req);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
new file mode 100644
index 000000000..e11a5cfda
--- /dev/null
+++ b/net/bluetooth/hci_sock.c
@@ -0,0 +1,1452 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI sockets. */
+
+#include <linux/export.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_mon.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+
+static LIST_HEAD(mgmt_chan_list);
+static DEFINE_MUTEX(mgmt_chan_list_lock);
+
+static atomic_t monitor_promisc = ATOMIC_INIT(0);
+
+/* ----- HCI socket interface ----- */
+
+/* Socket info */
+#define hci_pi(sk) ((struct hci_pinfo *) sk)
+
+struct hci_pinfo {
+ struct bt_sock bt;
+ struct hci_dev *hdev;
+ struct hci_filter filter;
+ __u32 cmsg_mask;
+ unsigned short channel;
+ unsigned long flags;
+};
+
+void hci_sock_set_flag(struct sock *sk, int nr)
+{
+ set_bit(nr, &hci_pi(sk)->flags);
+}
+
+void hci_sock_clear_flag(struct sock *sk, int nr)
+{
+ clear_bit(nr, &hci_pi(sk)->flags);
+}
+
+int hci_sock_test_flag(struct sock *sk, int nr)
+{
+ return test_bit(nr, &hci_pi(sk)->flags);
+}
+
+unsigned short hci_sock_get_channel(struct sock *sk)
+{
+ return hci_pi(sk)->channel;
+}
+
+static inline int hci_test_bit(int nr, const void *addr)
+{
+ return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31));
+}
+
+/* Security filter */
+#define HCI_SFLT_MAX_OGF 5
+
+struct hci_sec_filter {
+ __u32 type_mask;
+ __u32 event_mask[2];
+ __u32 ocf_mask[HCI_SFLT_MAX_OGF + 1][4];
+};
+
+static const struct hci_sec_filter hci_sec_filter = {
+ /* Packet types */
+ 0x10,
+ /* Events */
+ { 0x1000d9fe, 0x0000b00c },
+ /* Commands */
+ {
+ { 0x0 },
+ /* OGF_LINK_CTL */
+ { 0xbe000006, 0x00000001, 0x00000000, 0x00 },
+ /* OGF_LINK_POLICY */
+ { 0x00005200, 0x00000000, 0x00000000, 0x00 },
+ /* OGF_HOST_CTL */
+ { 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 },
+ /* OGF_INFO_PARAM */
+ { 0x000002be, 0x00000000, 0x00000000, 0x00 },
+ /* OGF_STATUS_PARAM */
+ { 0x000000ea, 0x00000000, 0x00000000, 0x00 }
+ }
+};
+
+static struct bt_sock_list hci_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock)
+};
+
+static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb)
+{
+ struct hci_filter *flt;
+ int flt_type, flt_event;
+
+ /* Apply filter */
+ flt = &hci_pi(sk)->filter;
+
+ if (bt_cb(skb)->pkt_type == HCI_VENDOR_PKT)
+ flt_type = 0;
+ else
+ flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS;
+
+ if (!test_bit(flt_type, &flt->type_mask))
+ return true;
+
+ /* Extra filter for event packets only */
+ if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT)
+ return false;
+
+ flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
+
+ if (!hci_test_bit(flt_event, &flt->event_mask))
+ return true;
+
+ /* Check filter only when opcode is set */
+ if (!flt->opcode)
+ return false;
+
+ if (flt_event == HCI_EV_CMD_COMPLETE &&
+ flt->opcode != get_unaligned((__le16 *)(skb->data + 3)))
+ return true;
+
+ if (flt_event == HCI_EV_CMD_STATUS &&
+ flt->opcode != get_unaligned((__le16 *)(skb->data + 4)))
+ return true;
+
+ return false;
+}
+
+/* Send frame to RAW socket */
+void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct sk_buff *skb_copy = NULL;
+
+ BT_DBG("hdev %p len %d", hdev, skb->len);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *nskb;
+
+ if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev)
+ continue;
+
+ /* Don't send frame to the socket it came from */
+ if (skb->sk == sk)
+ continue;
+
+ if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) {
+ if (is_filtered_packet(sk, skb))
+ continue;
+ } else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ if (!bt_cb(skb)->incoming)
+ continue;
+ if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
+ bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
+ bt_cb(skb)->pkt_type != HCI_SCODATA_PKT)
+ continue;
+ } else {
+ /* Don't send frame to other channel types */
+ continue;
+ }
+
+ if (!skb_copy) {
+ /* Create a private copy with headroom */
+ skb_copy = __pskb_copy_fclone(skb, 1, GFP_ATOMIC, true);
+ if (!skb_copy)
+ continue;
+
+ /* Put type byte before the data */
+ memcpy(skb_push(skb_copy, 1), &bt_cb(skb)->pkt_type, 1);
+ }
+
+ nskb = skb_clone(skb_copy, GFP_ATOMIC);
+ if (!nskb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, nskb))
+ kfree_skb(nskb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+
+ kfree_skb(skb_copy);
+}
+
+/* Send frame to sockets with specific channel */
+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
+ int flag, struct sock *skip_sk)
+{
+ struct sock *sk;
+
+ BT_DBG("channel %u len %d", channel, skb->len);
+
+ read_lock(&hci_sk_list.lock);
+
+ sk_for_each(sk, &hci_sk_list.head) {
+ struct sk_buff *nskb;
+
+ /* Ignore socket without the flag set */
+ if (!hci_sock_test_flag(sk, flag))
+ continue;
+
+ /* Skip the original socket */
+ if (sk == skip_sk)
+ continue;
+
+ if (sk->sk_state != BT_BOUND)
+ continue;
+
+ if (hci_pi(sk)->channel != channel)
+ continue;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, nskb))
+ kfree_skb(nskb);
+ }
+
+ read_unlock(&hci_sk_list.lock);
+}
+
+/* Send frame to monitor socket */
+void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct sk_buff *skb_copy = NULL;
+ struct hci_mon_hdr *hdr;
+ __le16 opcode;
+
+ if (!atomic_read(&monitor_promisc))
+ return;
+
+ BT_DBG("hdev %p len %d", hdev, skb->len);
+
+ switch (bt_cb(skb)->pkt_type) {
+ case HCI_COMMAND_PKT:
+ opcode = cpu_to_le16(HCI_MON_COMMAND_PKT);
+ break;
+ case HCI_EVENT_PKT:
+ opcode = cpu_to_le16(HCI_MON_EVENT_PKT);
+ break;
+ case HCI_ACLDATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_ACL_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_ACL_TX_PKT);
+ break;
+ case HCI_SCODATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_SCO_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT);
+ break;
+ default:
+ return;
+ }
+
+ /* Create a private copy with headroom */
+ skb_copy = __pskb_copy_fclone(skb, HCI_MON_HDR_SIZE, GFP_ATOMIC, true);
+ if (!skb_copy)
+ return;
+
+ /* Put header before the data */
+ hdr = (void *) skb_push(skb_copy, HCI_MON_HDR_SIZE);
+ hdr->opcode = opcode;
+ hdr->index = cpu_to_le16(hdev->id);
+ hdr->len = cpu_to_le16(skb->len);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb_copy,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb_copy);
+}
+
+static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
+{
+ struct hci_mon_hdr *hdr;
+ struct hci_mon_new_index *ni;
+ struct sk_buff *skb;
+ __le16 opcode;
+
+ switch (event) {
+ case HCI_DEV_REG:
+ skb = bt_skb_alloc(HCI_MON_NEW_INDEX_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ ni = (void *) skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
+ ni->type = hdev->dev_type;
+ ni->bus = hdev->bus;
+ bacpy(&ni->bdaddr, &hdev->bdaddr);
+ memcpy(ni->name, hdev->name, 8);
+
+ opcode = cpu_to_le16(HCI_MON_NEW_INDEX);
+ break;
+
+ case HCI_DEV_UNREG:
+ skb = bt_skb_alloc(0, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ opcode = cpu_to_le16(HCI_MON_DEL_INDEX);
+ break;
+
+ default:
+ return NULL;
+ }
+
+ __net_timestamp(skb);
+
+ hdr = (void *) skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = opcode;
+ hdr->index = cpu_to_le16(hdev->id);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ return skb;
+}
+
+static void send_monitor_replay(struct sock *sk)
+{
+ struct hci_dev *hdev;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_event(hdev, HCI_DEV_REG);
+ if (!skb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+ }
+
+ read_unlock(&hci_dev_list_lock);
+}
+
+/* Generate internal stack event */
+static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
+{
+ struct hci_event_hdr *hdr;
+ struct hci_ev_stack_internal *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE);
+ hdr->evt = HCI_EV_STACK_INTERNAL;
+ hdr->plen = sizeof(*ev) + dlen;
+
+ ev = (void *) skb_put(skb, sizeof(*ev) + dlen);
+ ev->type = type;
+ memcpy(ev->data, data, dlen);
+
+ bt_cb(skb)->incoming = 1;
+ __net_timestamp(skb);
+
+ bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
+ hci_send_to_sock(hdev, skb);
+ kfree_skb(skb);
+}
+
+void hci_sock_dev_event(struct hci_dev *hdev, int event)
+{
+ struct hci_ev_si_device ev;
+
+ BT_DBG("hdev %s event %d", hdev->name, event);
+
+ /* Send event to monitor */
+ if (atomic_read(&monitor_promisc)) {
+ struct sk_buff *skb;
+
+ skb = create_monitor_event(hdev, event);
+ if (skb) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ HCI_SOCK_TRUSTED, NULL);
+ kfree_skb(skb);
+ }
+ }
+
+ /* Send event to sockets */
+ ev.event = event;
+ ev.dev_id = hdev->id;
+ hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
+
+ if (event == HCI_DEV_UNREG) {
+ struct sock *sk;
+
+ /* Detach sockets from device */
+ read_lock(&hci_sk_list.lock);
+ sk_for_each(sk, &hci_sk_list.head) {
+ bh_lock_sock_nested(sk);
+ if (hci_pi(sk)->hdev == hdev) {
+ hci_pi(sk)->hdev = NULL;
+ sk->sk_err = EPIPE;
+ sk->sk_state = BT_OPEN;
+ sk->sk_state_change(sk);
+
+ hci_dev_put(hdev);
+ }
+ bh_unlock_sock(sk);
+ }
+ read_unlock(&hci_sk_list.lock);
+ }
+}
+
+static struct hci_mgmt_chan *__hci_mgmt_chan_find(unsigned short channel)
+{
+ struct hci_mgmt_chan *c;
+
+ list_for_each_entry(c, &mgmt_chan_list, list) {
+ if (c->channel == channel)
+ return c;
+ }
+
+ return NULL;
+}
+
+static struct hci_mgmt_chan *hci_mgmt_chan_find(unsigned short channel)
+{
+ struct hci_mgmt_chan *c;
+
+ mutex_lock(&mgmt_chan_list_lock);
+ c = __hci_mgmt_chan_find(channel);
+ mutex_unlock(&mgmt_chan_list_lock);
+
+ return c;
+}
+
+int hci_mgmt_chan_register(struct hci_mgmt_chan *c)
+{
+ if (c->channel < HCI_CHANNEL_CONTROL)
+ return -EINVAL;
+
+ mutex_lock(&mgmt_chan_list_lock);
+ if (__hci_mgmt_chan_find(c->channel)) {
+ mutex_unlock(&mgmt_chan_list_lock);
+ return -EALREADY;
+ }
+
+ list_add_tail(&c->list, &mgmt_chan_list);
+
+ mutex_unlock(&mgmt_chan_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(hci_mgmt_chan_register);
+
+void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c)
+{
+ mutex_lock(&mgmt_chan_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&mgmt_chan_list_lock);
+}
+EXPORT_SYMBOL(hci_mgmt_chan_unregister);
+
+static int hci_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct hci_dev *hdev;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ hdev = hci_pi(sk)->hdev;
+
+ if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR)
+ atomic_dec(&monitor_promisc);
+
+ bt_sock_unlink(&hci_sk_list, sk);
+
+ if (hdev) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ mgmt_index_added(hdev);
+ hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+ hci_dev_close(hdev->id);
+ }
+
+ atomic_dec(&hdev->promisc);
+ hci_dev_put(hdev);
+ }
+
+ sock_orphan(sk);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+
+ sock_put(sk);
+ return 0;
+}
+
+static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg)
+{
+ bdaddr_t bdaddr;
+ int err;
+
+ if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+ return -EFAULT;
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_add(&hdev->blacklist, &bdaddr, BDADDR_BREDR);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg)
+{
+ bdaddr_t bdaddr;
+ int err;
+
+ if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+ return -EFAULT;
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_del(&hdev->blacklist, &bdaddr, BDADDR_BREDR);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+/* Ioctls that require bound socket */
+static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
+ unsigned long arg)
+{
+ struct hci_dev *hdev = hci_pi(sk)->hdev;
+
+ if (!hdev)
+ return -EBADFD;
+
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return -EBUSY;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ return -EOPNOTSUPP;
+
+ if (hdev->dev_type != HCI_BREDR)
+ return -EOPNOTSUPP;
+
+ switch (cmd) {
+ case HCISETRAW:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return -EOPNOTSUPP;
+
+ case HCIGETCONNINFO:
+ return hci_get_conn_info(hdev, (void __user *) arg);
+
+ case HCIGETAUTHINFO:
+ return hci_get_auth_info(hdev, (void __user *) arg);
+
+ case HCIBLOCKADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_sock_blacklist_add(hdev, (void __user *) arg);
+
+ case HCIUNBLOCKADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_sock_blacklist_del(hdev, (void __user *) arg);
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *) arg;
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("cmd %x arg %lx", cmd, arg);
+
+ lock_sock(sk);
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ release_sock(sk);
+
+ switch (cmd) {
+ case HCIGETDEVLIST:
+ return hci_get_dev_list(argp);
+
+ case HCIGETDEVINFO:
+ return hci_get_dev_info(argp);
+
+ case HCIGETCONNLIST:
+ return hci_get_conn_list(argp);
+
+ case HCIDEVUP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_dev_open(arg);
+
+ case HCIDEVDOWN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_dev_close(arg);
+
+ case HCIDEVRESET:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_dev_reset(arg);
+
+ case HCIDEVRESTAT:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_dev_reset_stat(arg);
+
+ case HCISETSCAN:
+ case HCISETAUTH:
+ case HCISETENCRYPT:
+ case HCISETPTYPE:
+ case HCISETLINKPOL:
+ case HCISETLINKMODE:
+ case HCISETACLMTU:
+ case HCISETSCOMTU:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return hci_dev_cmd(cmd, argp);
+
+ case HCIINQUIRY:
+ return hci_inquiry(argp);
+ }
+
+ lock_sock(sk);
+
+ err = hci_sock_bound_ioctl(sk, cmd, arg);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
+ int addr_len)
+{
+ struct sockaddr_hci haddr;
+ struct sock *sk = sock->sk;
+ struct hci_dev *hdev = NULL;
+ int len, err = 0;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (!addr)
+ return -EINVAL;
+
+ memset(&haddr, 0, sizeof(haddr));
+ len = min_t(unsigned int, sizeof(haddr), addr_len);
+ memcpy(&haddr, addr, len);
+
+ if (haddr.hci_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_BOUND) {
+ err = -EALREADY;
+ goto done;
+ }
+
+ switch (haddr.hci_channel) {
+ case HCI_CHANNEL_RAW:
+ if (hci_pi(sk)->hdev) {
+ err = -EALREADY;
+ goto done;
+ }
+
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ hdev = hci_dev_get(haddr.hci_dev);
+ if (!hdev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ atomic_inc(&hdev->promisc);
+ }
+
+ hci_pi(sk)->hdev = hdev;
+ break;
+
+ case HCI_CHANNEL_USER:
+ if (hci_pi(sk)->hdev) {
+ err = -EALREADY;
+ goto done;
+ }
+
+ if (haddr.hci_dev == HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ hdev = hci_dev_get(haddr.hci_dev);
+ if (!hdev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ if (test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ (!hci_dev_test_flag(hdev, HCI_AUTO_OFF) &&
+ test_bit(HCI_UP, &hdev->flags))) {
+ err = -EBUSY;
+ hci_dev_put(hdev);
+ goto done;
+ }
+
+ if (hci_dev_test_and_set_flag(hdev, HCI_USER_CHANNEL)) {
+ err = -EUSERS;
+ hci_dev_put(hdev);
+ goto done;
+ }
+
+ mgmt_index_removed(hdev);
+
+ err = hci_dev_open(hdev->id);
+ if (err) {
+ if (err == -EALREADY) {
+ /* In case the transport is already up and
+ * running, clear the error here.
+ *
+ * This can happen when opening an user
+ * channel and HCI_AUTO_OFF grace period
+ * is still active.
+ */
+ err = 0;
+ } else {
+ hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
+ mgmt_index_added(hdev);
+ hci_dev_put(hdev);
+ goto done;
+ }
+ }
+
+ atomic_inc(&hdev->promisc);
+
+ hci_pi(sk)->hdev = hdev;
+ break;
+
+ case HCI_CHANNEL_MONITOR:
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ /* The monitor interface is restricted to CAP_NET_RAW
+ * capabilities and with that implicitly trusted.
+ */
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ send_monitor_replay(sk);
+
+ atomic_inc(&monitor_promisc);
+ break;
+
+ default:
+ if (!hci_mgmt_chan_find(haddr.hci_channel)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ /* Users with CAP_NET_ADMIN capabilities are allowed
+ * access to all management commands and events. For
+ * untrusted users the interface is restricted and
+ * also only untrusted events are sent.
+ */
+ if (capable(CAP_NET_ADMIN))
+ hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+
+ /* At the moment the index and unconfigured index events
+ * are enabled unconditionally. Setting them on each
+ * socket when binding keeps this functionality. They
+ * however might be cleared later and then sending of these
+ * events will be disabled, but that is then intentional.
+ *
+ * This also enables generic events that are safe to be
+ * received by untrusted users. Example for such events
+ * are changes to settings, class of device, name etc.
+ */
+ if (haddr.hci_channel == HCI_CHANNEL_CONTROL) {
+ hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
+ hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS);
+ }
+ break;
+ }
+
+
+ hci_pi(sk)->channel = haddr.hci_channel;
+ sk->sk_state = BT_BOUND;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int *addr_len, int peer)
+{
+ struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr;
+ struct sock *sk = sock->sk;
+ struct hci_dev *hdev;
+ int err = 0;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ hdev = hci_pi(sk)->hdev;
+ if (!hdev) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ *addr_len = sizeof(*haddr);
+ haddr->hci_family = AF_BLUETOOTH;
+ haddr->hci_dev = hdev->id;
+ haddr->hci_channel= hci_pi(sk)->channel;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ __u32 mask = hci_pi(sk)->cmsg_mask;
+
+ if (mask & HCI_CMSG_DIR) {
+ int incoming = bt_cb(skb)->incoming;
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming),
+ &incoming);
+ }
+
+ if (mask & HCI_CMSG_TSTAMP) {
+#ifdef CONFIG_COMPAT
+ struct compat_timeval ctv;
+#endif
+ struct timeval tv;
+ void *data;
+ int len;
+
+ skb_get_timestamp(skb, &tv);
+
+ data = &tv;
+ len = sizeof(tv);
+#ifdef CONFIG_COMPAT
+ if (!COMPAT_USE_64BIT_TIME &&
+ (msg->msg_flags & MSG_CMSG_COMPAT)) {
+ ctv.tv_sec = tv.tv_sec;
+ ctv.tv_usec = tv.tv_usec;
+ data = &ctv;
+ len = sizeof(ctv);
+ }
+#endif
+
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data);
+ }
+}
+
+static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ int noblock = flags & MSG_DONTWAIT;
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int copied, err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (flags & (MSG_OOB))
+ return -EOPNOTSUPP;
+
+ if (sk->sk_state == BT_CLOSED)
+ return 0;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ return err;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ skb_reset_transport_header(skb);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ hci_sock_cmsg(sk, msg, skb);
+ break;
+ case HCI_CHANNEL_USER:
+ case HCI_CHANNEL_MONITOR:
+ sock_recv_timestamp(msg, sk, skb);
+ break;
+ default:
+ if (hci_mgmt_chan_find(hci_pi(sk)->channel))
+ sock_recv_timestamp(msg, sk, skb);
+ break;
+ }
+
+ skb_free_datagram(sk, skb);
+
+ return err ? : copied;
+}
+
+static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
+ struct msghdr *msg, size_t msglen)
+{
+ void *buf;
+ u8 *cp;
+ struct mgmt_hdr *hdr;
+ u16 opcode, index, len;
+ struct hci_dev *hdev = NULL;
+ const struct hci_mgmt_handler *handler;
+ bool var_len, no_hdev;
+ int err;
+
+ BT_DBG("got %zu bytes", msglen);
+
+ if (msglen < sizeof(*hdr))
+ return -EINVAL;
+
+ buf = kmalloc(msglen, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (memcpy_from_msg(buf, msg, msglen)) {
+ err = -EFAULT;
+ goto done;
+ }
+
+ hdr = buf;
+ opcode = __le16_to_cpu(hdr->opcode);
+ index = __le16_to_cpu(hdr->index);
+ len = __le16_to_cpu(hdr->len);
+
+ if (len != msglen - sizeof(*hdr)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (opcode >= chan->handler_count ||
+ chan->handlers[opcode].func == NULL) {
+ BT_DBG("Unknown op %u", opcode);
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_UNKNOWN_COMMAND);
+ goto done;
+ }
+
+ handler = &chan->handlers[opcode];
+
+ if (!hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) &&
+ !(handler->flags & HCI_MGMT_UNTRUSTED)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_PERMISSION_DENIED);
+ goto done;
+ }
+
+ if (index != MGMT_INDEX_NONE) {
+ hdev = hci_dev_get(index);
+ if (!hdev) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !(handler->flags & HCI_MGMT_UNCONFIGURED)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+ }
+
+ no_hdev = (handler->flags & HCI_MGMT_NO_HDEV);
+ if (no_hdev != !hdev) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
+
+ var_len = (handler->flags & HCI_MGMT_VAR_LEN);
+ if ((var_len && len < handler->data_len) ||
+ (!var_len && len != handler->data_len)) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto done;
+ }
+
+ if (hdev && chan->hdev_init)
+ chan->hdev_init(sk, hdev);
+
+ cp = buf + sizeof(*hdr);
+
+ err = handler->func(sk, hdev, cp, len);
+ if (err < 0)
+ goto done;
+
+ err = msglen;
+
+done:
+ if (hdev)
+ hci_dev_put(hdev);
+
+ kfree(buf);
+ return err;
+}
+
+static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct hci_mgmt_chan *chan;
+ struct hci_dev *hdev;
+ struct sk_buff *skb;
+ int err;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE))
+ return -EINVAL;
+
+ if (len < 4 || len > HCI_MAX_FRAME_SIZE)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (hci_pi(sk)->channel) {
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ break;
+ case HCI_CHANNEL_MONITOR:
+ err = -EOPNOTSUPP;
+ goto done;
+ default:
+ mutex_lock(&mgmt_chan_list_lock);
+ chan = __hci_mgmt_chan_find(hci_pi(sk)->channel);
+ if (chan)
+ err = hci_mgmt_cmd(chan, sk, msg, len);
+ else
+ err = -EINVAL;
+
+ mutex_unlock(&mgmt_chan_list_lock);
+ goto done;
+ }
+
+ hdev = hci_pi(sk)->hdev;
+ if (!hdev) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = -ENETDOWN;
+ goto done;
+ }
+
+ skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto done;
+
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ err = -EFAULT;
+ goto drop;
+ }
+
+ bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
+ skb_pull(skb, 1);
+
+ if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ /* No permission check is needed for user channel
+ * since that gets enforced when binding the socket.
+ *
+ * However check that the packet type is valid.
+ */
+ if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT &&
+ bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
+ bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ err = -EINVAL;
+ goto drop;
+ }
+
+ skb_queue_tail(&hdev->raw_q, skb);
+ queue_work(hdev->workqueue, &hdev->tx_work);
+ } else if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
+ u16 opcode = get_unaligned_le16(skb->data);
+ u16 ogf = hci_opcode_ogf(opcode);
+ u16 ocf = hci_opcode_ocf(opcode);
+
+ if (((ogf > HCI_SFLT_MAX_OGF) ||
+ !hci_test_bit(ocf & HCI_FLT_OCF_BITS,
+ &hci_sec_filter.ocf_mask[ogf])) &&
+ !capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto drop;
+ }
+
+ if (ogf == 0x3f) {
+ skb_queue_tail(&hdev->raw_q, skb);
+ queue_work(hdev->workqueue, &hdev->tx_work);
+ } else {
+ /* Stand-alone HCI commands must be flagged as
+ * single-command requests.
+ */
+ bt_cb(skb)->req.start = true;
+
+ skb_queue_tail(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+ }
+ } else {
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto drop;
+ }
+
+ skb_queue_tail(&hdev->raw_q, skb);
+ queue_work(hdev->workqueue, &hdev->tx_work);
+ }
+
+ err = len;
+
+done:
+ release_sock(sk);
+ return err;
+
+drop:
+ kfree_skb(skb);
+ goto done;
+}
+
+static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int len)
+{
+ struct hci_ufilter uf = { .opcode = 0 };
+ struct sock *sk = sock->sk;
+ int err = 0, opt = 0;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
+
+ lock_sock(sk);
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ switch (optname) {
+ case HCI_DATA_DIR:
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt)
+ hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR;
+ else
+ hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR;
+ break;
+
+ case HCI_TIME_STAMP:
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt)
+ hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP;
+ else
+ hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP;
+ break;
+
+ case HCI_FILTER:
+ {
+ struct hci_filter *f = &hci_pi(sk)->filter;
+
+ uf.type_mask = f->type_mask;
+ uf.opcode = f->opcode;
+ uf.event_mask[0] = *((u32 *) f->event_mask + 0);
+ uf.event_mask[1] = *((u32 *) f->event_mask + 1);
+ }
+
+ len = min_t(unsigned int, len, sizeof(uf));
+ if (copy_from_user(&uf, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (!capable(CAP_NET_RAW)) {
+ uf.type_mask &= hci_sec_filter.type_mask;
+ uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0);
+ uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1);
+ }
+
+ {
+ struct hci_filter *f = &hci_pi(sk)->filter;
+
+ f->type_mask = uf.type_mask;
+ f->opcode = uf.opcode;
+ *((u32 *) f->event_mask + 0) = uf.event_mask[0];
+ *((u32 *) f->event_mask + 1) = uf.event_mask[1];
+ }
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct hci_ufilter uf;
+ struct sock *sk = sock->sk;
+ int len, opt, err = 0;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ switch (optname) {
+ case HCI_DATA_DIR:
+ if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR)
+ opt = 1;
+ else
+ opt = 0;
+
+ if (put_user(opt, optval))
+ err = -EFAULT;
+ break;
+
+ case HCI_TIME_STAMP:
+ if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP)
+ opt = 1;
+ else
+ opt = 0;
+
+ if (put_user(opt, optval))
+ err = -EFAULT;
+ break;
+
+ case HCI_FILTER:
+ {
+ struct hci_filter *f = &hci_pi(sk)->filter;
+
+ memset(&uf, 0, sizeof(uf));
+ uf.type_mask = f->type_mask;
+ uf.opcode = f->opcode;
+ uf.event_mask[0] = *((u32 *) f->event_mask + 0);
+ uf.event_mask[1] = *((u32 *) f->event_mask + 1);
+ }
+
+ len = min_t(unsigned int, len, sizeof(uf));
+ if (copy_to_user(optval, &uf, len))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static const struct proto_ops hci_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = hci_sock_release,
+ .bind = hci_sock_bind,
+ .getname = hci_sock_getname,
+ .sendmsg = hci_sock_sendmsg,
+ .recvmsg = hci_sock_recvmsg,
+ .ioctl = hci_sock_ioctl,
+ .poll = datagram_poll,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = hci_sock_setsockopt,
+ .getsockopt = hci_sock_getsockopt,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .mmap = sock_no_mmap
+};
+
+static struct proto hci_sk_proto = {
+ .name = "HCI",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct hci_pinfo)
+};
+
+static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &hci_sock_ops;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = protocol;
+
+ sock->state = SS_UNCONNECTED;
+ sk->sk_state = BT_OPEN;
+
+ bt_sock_link(&hci_sk_list, sk);
+ return 0;
+}
+
+static const struct net_proto_family hci_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = hci_sock_create,
+};
+
+int __init hci_sock_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_hci) > sizeof(struct sockaddr));
+
+ err = proto_register(&hci_sk_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("HCI socket registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "hci", &hci_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create HCI proc file");
+ bt_sock_unregister(BTPROTO_HCI);
+ goto error;
+ }
+
+ BT_INFO("HCI socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&hci_sk_proto);
+ return err;
+}
+
+void hci_sock_cleanup(void)
+{
+ bt_procfs_cleanup(&init_net, "hci");
+ bt_sock_unregister(BTPROTO_HCI);
+ proto_unregister(&hci_sk_proto);
+}
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
new file mode 100644
index 000000000..555982a78
--- /dev/null
+++ b/net/bluetooth/hci_sysfs.c
@@ -0,0 +1,214 @@
+/* Bluetooth HCI driver model support. */
+
+#include <linux/module.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+static struct class *bt_class;
+
+static inline char *link_typetostr(int type)
+{
+ switch (type) {
+ case ACL_LINK:
+ return "ACL";
+ case SCO_LINK:
+ return "SCO";
+ case ESCO_LINK:
+ return "eSCO";
+ case LE_LINK:
+ return "LE";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static ssize_t show_link_type(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hci_conn *conn = to_hci_conn(dev);
+ return sprintf(buf, "%s\n", link_typetostr(conn->type));
+}
+
+static ssize_t show_link_address(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hci_conn *conn = to_hci_conn(dev);
+ return sprintf(buf, "%pMR\n", &conn->dst);
+}
+
+#define LINK_ATTR(_name, _mode, _show, _store) \
+struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+static LINK_ATTR(type, S_IRUGO, show_link_type, NULL);
+static LINK_ATTR(address, S_IRUGO, show_link_address, NULL);
+
+static struct attribute *bt_link_attrs[] = {
+ &link_attr_type.attr,
+ &link_attr_address.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(bt_link);
+
+static void bt_link_release(struct device *dev)
+{
+ struct hci_conn *conn = to_hci_conn(dev);
+ kfree(conn);
+}
+
+static struct device_type bt_link = {
+ .name = "link",
+ .groups = bt_link_groups,
+ .release = bt_link_release,
+};
+
+/*
+ * The rfcomm tty device will possibly retain even when conn
+ * is down, and sysfs doesn't support move zombie device,
+ * so we should move the device before conn device is destroyed.
+ */
+static int __match_tty(struct device *dev, void *data)
+{
+ return !strncmp(dev_name(dev), "rfcomm", 6);
+}
+
+void hci_conn_init_sysfs(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("conn %p", conn);
+
+ conn->dev.type = &bt_link;
+ conn->dev.class = bt_class;
+ conn->dev.parent = &hdev->dev;
+
+ device_initialize(&conn->dev);
+}
+
+void hci_conn_add_sysfs(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("conn %p", conn);
+
+ dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
+
+ if (device_add(&conn->dev) < 0) {
+ BT_ERR("Failed to register connection device");
+ return;
+ }
+
+ hci_dev_hold(hdev);
+}
+
+void hci_conn_del_sysfs(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ if (!device_is_registered(&conn->dev))
+ return;
+
+ while (1) {
+ struct device *dev;
+
+ dev = device_find_child(&conn->dev, NULL, __match_tty);
+ if (!dev)
+ break;
+ device_move(dev, NULL, DPM_ORDER_DEV_LAST);
+ put_device(dev);
+ }
+
+ device_del(&conn->dev);
+
+ hci_dev_put(hdev);
+}
+
+static inline char *host_typetostr(int type)
+{
+ switch (type) {
+ case HCI_BREDR:
+ return "BR/EDR";
+ case HCI_AMP:
+ return "AMP";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static ssize_t show_type(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+ return sprintf(buf, "%s\n", host_typetostr(hdev->dev_type));
+}
+
+static ssize_t show_name(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+ char name[HCI_MAX_NAME_LENGTH + 1];
+ int i;
+
+ for (i = 0; i < HCI_MAX_NAME_LENGTH; i++)
+ name[i] = hdev->dev_name[i];
+
+ name[HCI_MAX_NAME_LENGTH] = '\0';
+ return sprintf(buf, "%s\n", name);
+}
+
+static ssize_t show_address(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+ return sprintf(buf, "%pMR\n", &hdev->bdaddr);
+}
+
+static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
+static DEVICE_ATTR(name, S_IRUGO, show_name, NULL);
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+
+static struct attribute *bt_host_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_name.attr,
+ &dev_attr_address.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(bt_host);
+
+static void bt_host_release(struct device *dev)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+ kfree(hdev);
+ module_put(THIS_MODULE);
+}
+
+static struct device_type bt_host = {
+ .name = "host",
+ .groups = bt_host_groups,
+ .release = bt_host_release,
+};
+
+void hci_init_sysfs(struct hci_dev *hdev)
+{
+ struct device *dev = &hdev->dev;
+
+ dev->type = &bt_host;
+ dev->class = bt_class;
+
+ __module_get(THIS_MODULE);
+ device_initialize(dev);
+}
+
+int __init bt_sysfs_init(void)
+{
+ bt_class = class_create(THIS_MODULE, "bluetooth");
+
+ return PTR_ERR_OR_ZERO(bt_class);
+}
+
+void bt_sysfs_cleanup(void)
+{
+ class_destroy(bt_class);
+}
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
new file mode 100644
index 000000000..bc8610b24
--- /dev/null
+++ b/net/bluetooth/hidp/Kconfig
@@ -0,0 +1,12 @@
+config BT_HIDP
+ tristate "HIDP protocol support"
+ depends on BT_BREDR && INPUT
+ select HID
+ help
+ HIDP (Human Interface Device Protocol) is a transport layer
+ for HID reports. HIDP is required for the Bluetooth Human
+ Interface Device Profile.
+
+ Say Y here to compile HIDP support into the kernel or say M to
+ compile it as module (hidp).
+
diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile
new file mode 100644
index 000000000..a9ee11569
--- /dev/null
+++ b/net/bluetooth/hidp/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth HIDP layer
+#
+
+obj-$(CONFIG_BT_HIDP) += hidp.o
+
+hidp-objs := core.o sock.o
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
new file mode 100644
index 000000000..9070dfd6b
--- /dev/null
+++ b/net/bluetooth/hidp/core.c
@@ -0,0 +1,1446 @@
+/*
+ HIDP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+ Copyright (C) 2013 David Herrmann <dh.herrmann@gmail.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/hidraw.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "hidp.h"
+
+#define VERSION "1.2"
+
+static DECLARE_RWSEM(hidp_session_sem);
+static LIST_HEAD(hidp_session_list);
+
+static unsigned char hidp_keycode[256] = {
+ 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36,
+ 37, 38, 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45,
+ 21, 44, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 1,
+ 14, 15, 57, 12, 13, 26, 27, 43, 43, 39, 40, 41, 51, 52,
+ 53, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 87, 88,
+ 99, 70, 119, 110, 102, 104, 111, 107, 109, 106, 105, 108, 103, 69,
+ 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71, 72, 73,
+ 82, 83, 86, 127, 116, 117, 183, 184, 185, 186, 187, 188, 189, 190,
+ 191, 192, 193, 194, 134, 138, 130, 132, 128, 129, 131, 137, 133, 135,
+ 136, 113, 115, 114, 0, 0, 0, 121, 0, 89, 93, 124, 92, 94,
+ 95, 0, 0, 0, 122, 123, 90, 91, 85, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 29, 42, 56, 125, 97, 54, 100, 126, 164, 166, 165, 163, 161, 115,
+ 114, 113, 150, 158, 159, 128, 136, 177, 178, 176, 142, 152, 173, 140
+};
+
+static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
+
+static int hidp_session_probe(struct l2cap_conn *conn,
+ struct l2cap_user *user);
+static void hidp_session_remove(struct l2cap_conn *conn,
+ struct l2cap_user *user);
+static int hidp_session_thread(void *arg);
+static void hidp_session_terminate(struct hidp_session *s);
+
+static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci)
+{
+ u32 valid_flags = 0;
+ memset(ci, 0, sizeof(*ci));
+ bacpy(&ci->bdaddr, &session->bdaddr);
+
+ ci->flags = session->flags & valid_flags;
+ ci->state = BT_CONNECTED;
+
+ if (session->input) {
+ ci->vendor = session->input->id.vendor;
+ ci->product = session->input->id.product;
+ ci->version = session->input->id.version;
+ if (session->input->name)
+ strlcpy(ci->name, session->input->name, 128);
+ else
+ strlcpy(ci->name, "HID Boot Device", 128);
+ } else if (session->hid) {
+ ci->vendor = session->hid->vendor;
+ ci->product = session->hid->product;
+ ci->version = session->hid->version;
+ strlcpy(ci->name, session->hid->name, 128);
+ }
+}
+
+/* assemble skb, queue message on @transmit and wake up the session thread */
+static int hidp_send_message(struct hidp_session *session, struct socket *sock,
+ struct sk_buff_head *transmit, unsigned char hdr,
+ const unsigned char *data, int size)
+{
+ struct sk_buff *skb;
+ struct sock *sk = sock->sk;
+
+ BT_DBG("session %p data %p size %d", session, data, size);
+
+ if (atomic_read(&session->terminate))
+ return -EIO;
+
+ skb = alloc_skb(size + 1, GFP_ATOMIC);
+ if (!skb) {
+ BT_ERR("Can't allocate memory for new frame");
+ return -ENOMEM;
+ }
+
+ *skb_put(skb, 1) = hdr;
+ if (data && size > 0)
+ memcpy(skb_put(skb, size), data, size);
+
+ skb_queue_tail(transmit, skb);
+ wake_up_interruptible(sk_sleep(sk));
+
+ return 0;
+}
+
+static int hidp_send_ctrl_message(struct hidp_session *session,
+ unsigned char hdr, const unsigned char *data,
+ int size)
+{
+ return hidp_send_message(session, session->ctrl_sock,
+ &session->ctrl_transmit, hdr, data, size);
+}
+
+static int hidp_send_intr_message(struct hidp_session *session,
+ unsigned char hdr, const unsigned char *data,
+ int size)
+{
+ return hidp_send_message(session, session->intr_sock,
+ &session->intr_transmit, hdr, data, size);
+}
+
+static int hidp_input_event(struct input_dev *dev, unsigned int type,
+ unsigned int code, int value)
+{
+ struct hidp_session *session = input_get_drvdata(dev);
+ unsigned char newleds;
+ unsigned char hdr, data[2];
+
+ BT_DBG("session %p type %d code %d value %d",
+ session, type, code, value);
+
+ if (type != EV_LED)
+ return -1;
+
+ newleds = (!!test_bit(LED_KANA, dev->led) << 3) |
+ (!!test_bit(LED_COMPOSE, dev->led) << 3) |
+ (!!test_bit(LED_SCROLLL, dev->led) << 2) |
+ (!!test_bit(LED_CAPSL, dev->led) << 1) |
+ (!!test_bit(LED_NUML, dev->led) << 0);
+
+ if (session->leds == newleds)
+ return 0;
+
+ session->leds = newleds;
+
+ hdr = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT;
+ data[0] = 0x01;
+ data[1] = newleds;
+
+ return hidp_send_intr_message(session, hdr, data, 2);
+}
+
+static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
+{
+ struct input_dev *dev = session->input;
+ unsigned char *keys = session->keys;
+ unsigned char *udata = skb->data + 1;
+ signed char *sdata = skb->data + 1;
+ int i, size = skb->len - 1;
+
+ switch (skb->data[0]) {
+ case 0x01: /* Keyboard report */
+ for (i = 0; i < 8; i++)
+ input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1);
+
+ /* If all the key codes have been set to 0x01, it means
+ * too many keys were pressed at the same time. */
+ if (!memcmp(udata + 2, hidp_mkeyspat, 6))
+ break;
+
+ for (i = 2; i < 8; i++) {
+ if (keys[i] > 3 && memscan(udata + 2, keys[i], 6) == udata + 8) {
+ if (hidp_keycode[keys[i]])
+ input_report_key(dev, hidp_keycode[keys[i]], 0);
+ else
+ BT_ERR("Unknown key (scancode %#x) released.", keys[i]);
+ }
+
+ if (udata[i] > 3 && memscan(keys + 2, udata[i], 6) == keys + 8) {
+ if (hidp_keycode[udata[i]])
+ input_report_key(dev, hidp_keycode[udata[i]], 1);
+ else
+ BT_ERR("Unknown key (scancode %#x) pressed.", udata[i]);
+ }
+ }
+
+ memcpy(keys, udata, 8);
+ break;
+
+ case 0x02: /* Mouse report */
+ input_report_key(dev, BTN_LEFT, sdata[0] & 0x01);
+ input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02);
+ input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04);
+ input_report_key(dev, BTN_SIDE, sdata[0] & 0x08);
+ input_report_key(dev, BTN_EXTRA, sdata[0] & 0x10);
+
+ input_report_rel(dev, REL_X, sdata[1]);
+ input_report_rel(dev, REL_Y, sdata[2]);
+
+ if (size > 3)
+ input_report_rel(dev, REL_WHEEL, sdata[3]);
+ break;
+ }
+
+ input_sync(dev);
+}
+
+static int hidp_get_raw_report(struct hid_device *hid,
+ unsigned char report_number,
+ unsigned char *data, size_t count,
+ unsigned char report_type)
+{
+ struct hidp_session *session = hid->driver_data;
+ struct sk_buff *skb;
+ size_t len;
+ int numbered_reports = hid->report_enum[report_type].numbered;
+ int ret;
+
+ if (atomic_read(&session->terminate))
+ return -EIO;
+
+ switch (report_type) {
+ case HID_FEATURE_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+ break;
+ case HID_INPUT_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT;
+ break;
+ case HID_OUTPUT_REPORT:
+ report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (mutex_lock_interruptible(&session->report_mutex))
+ return -ERESTARTSYS;
+
+ /* Set up our wait, and send the report request to the device. */
+ session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK;
+ session->waiting_report_number = numbered_reports ? report_number : -1;
+ set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ data[0] = report_number;
+ ret = hidp_send_ctrl_message(session, report_type, data, 1);
+ if (ret)
+ goto err;
+
+ /* Wait for the return of the report. The returned report
+ gets put in session->report_return. */
+ while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+ !atomic_read(&session->terminate)) {
+ int res;
+
+ res = wait_event_interruptible_timeout(session->report_queue,
+ !test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)
+ || atomic_read(&session->terminate),
+ 5*HZ);
+ if (res == 0) {
+ /* timeout */
+ ret = -EIO;
+ goto err;
+ }
+ if (res < 0) {
+ /* signal */
+ ret = -ERESTARTSYS;
+ goto err;
+ }
+ }
+
+ skb = session->report_return;
+ if (skb) {
+ len = skb->len < count ? skb->len : count;
+ memcpy(data, skb->data, len);
+
+ kfree_skb(skb);
+ session->report_return = NULL;
+ } else {
+ /* Device returned a HANDSHAKE, indicating protocol error. */
+ len = -EIO;
+ }
+
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ mutex_unlock(&session->report_mutex);
+
+ return len;
+
+err:
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ mutex_unlock(&session->report_mutex);
+ return ret;
+}
+
+static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum,
+ unsigned char *data, size_t count,
+ unsigned char report_type)
+{
+ struct hidp_session *session = hid->driver_data;
+ int ret;
+
+ switch (report_type) {
+ case HID_FEATURE_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+ break;
+ case HID_INPUT_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_INPUT;
+ break;
+ case HID_OUTPUT_REPORT:
+ report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (mutex_lock_interruptible(&session->report_mutex))
+ return -ERESTARTSYS;
+
+ /* Set up our wait, and send the report request to the device. */
+ data[0] = reportnum;
+ set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+ ret = hidp_send_ctrl_message(session, report_type, data, count);
+ if (ret)
+ goto err;
+
+ /* Wait for the ACK from the device. */
+ while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags) &&
+ !atomic_read(&session->terminate)) {
+ int res;
+
+ res = wait_event_interruptible_timeout(session->report_queue,
+ !test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)
+ || atomic_read(&session->terminate),
+ 10*HZ);
+ if (res == 0) {
+ /* timeout */
+ ret = -EIO;
+ goto err;
+ }
+ if (res < 0) {
+ /* signal */
+ ret = -ERESTARTSYS;
+ goto err;
+ }
+ }
+
+ if (!session->output_report_success) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = count;
+
+err:
+ clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+ mutex_unlock(&session->report_mutex);
+ return ret;
+}
+
+static int hidp_output_report(struct hid_device *hid, __u8 *data, size_t count)
+{
+ struct hidp_session *session = hid->driver_data;
+
+ return hidp_send_intr_message(session,
+ HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT,
+ data, count);
+}
+
+static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum,
+ __u8 *buf, size_t len, unsigned char rtype,
+ int reqtype)
+{
+ switch (reqtype) {
+ case HID_REQ_GET_REPORT:
+ return hidp_get_raw_report(hid, reportnum, buf, len, rtype);
+ case HID_REQ_SET_REPORT:
+ return hidp_set_raw_report(hid, reportnum, buf, len, rtype);
+ default:
+ return -EIO;
+ }
+}
+
+static void hidp_idle_timeout(unsigned long arg)
+{
+ struct hidp_session *session = (struct hidp_session *) arg;
+
+ hidp_session_terminate(session);
+}
+
+static void hidp_set_timer(struct hidp_session *session)
+{
+ if (session->idle_to > 0)
+ mod_timer(&session->timer, jiffies + HZ * session->idle_to);
+}
+
+static void hidp_del_timer(struct hidp_session *session)
+{
+ if (session->idle_to > 0)
+ del_timer(&session->timer);
+}
+
+static void hidp_process_report(struct hidp_session *session,
+ int type, const u8 *data, int len, int intr)
+{
+ if (len > HID_MAX_BUFFER_SIZE)
+ len = HID_MAX_BUFFER_SIZE;
+
+ memcpy(session->input_buf, data, len);
+ hid_input_report(session->hid, type, session->input_buf, len, intr);
+}
+
+static void hidp_process_handshake(struct hidp_session *session,
+ unsigned char param)
+{
+ BT_DBG("session %p param 0x%02x", session, param);
+ session->output_report_success = 0; /* default condition */
+
+ switch (param) {
+ case HIDP_HSHK_SUCCESSFUL:
+ /* FIXME: Call into SET_ GET_ handlers here */
+ session->output_report_success = 1;
+ break;
+
+ case HIDP_HSHK_NOT_READY:
+ case HIDP_HSHK_ERR_INVALID_REPORT_ID:
+ case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST:
+ case HIDP_HSHK_ERR_INVALID_PARAMETER:
+ if (test_and_clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags))
+ wake_up_interruptible(&session->report_queue);
+
+ /* FIXME: Call into SET_ GET_ handlers here */
+ break;
+
+ case HIDP_HSHK_ERR_UNKNOWN:
+ break;
+
+ case HIDP_HSHK_ERR_FATAL:
+ /* Device requests a reboot, as this is the only way this error
+ * can be recovered. */
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0);
+ break;
+
+ default:
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
+ break;
+ }
+
+ /* Wake up the waiting thread. */
+ if (test_and_clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags))
+ wake_up_interruptible(&session->report_queue);
+}
+
+static void hidp_process_hid_control(struct hidp_session *session,
+ unsigned char param)
+{
+ BT_DBG("session %p param 0x%02x", session, param);
+
+ if (param == HIDP_CTRL_VIRTUAL_CABLE_UNPLUG) {
+ /* Flush the transmit queues */
+ skb_queue_purge(&session->ctrl_transmit);
+ skb_queue_purge(&session->intr_transmit);
+
+ hidp_session_terminate(session);
+ }
+}
+
+/* Returns true if the passed-in skb should be freed by the caller. */
+static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
+ unsigned char param)
+{
+ int done_with_skb = 1;
+ BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
+
+ switch (param) {
+ case HIDP_DATA_RTYPE_INPUT:
+ hidp_set_timer(session);
+
+ if (session->input)
+ hidp_input_report(session, skb);
+
+ if (session->hid)
+ hidp_process_report(session, HID_INPUT_REPORT,
+ skb->data, skb->len, 0);
+ break;
+
+ case HIDP_DATA_RTYPE_OTHER:
+ case HIDP_DATA_RTYPE_OUPUT:
+ case HIDP_DATA_RTYPE_FEATURE:
+ break;
+
+ default:
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
+ }
+
+ if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+ param == session->waiting_report_type) {
+ if (session->waiting_report_number < 0 ||
+ session->waiting_report_number == skb->data[0]) {
+ /* hidp_get_raw_report() is waiting on this report. */
+ session->report_return = skb;
+ done_with_skb = 0;
+ clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+ wake_up_interruptible(&session->report_queue);
+ }
+ }
+
+ return done_with_skb;
+}
+
+static void hidp_recv_ctrl_frame(struct hidp_session *session,
+ struct sk_buff *skb)
+{
+ unsigned char hdr, type, param;
+ int free_skb = 1;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ hdr = skb->data[0];
+ skb_pull(skb, 1);
+
+ type = hdr & HIDP_HEADER_TRANS_MASK;
+ param = hdr & HIDP_HEADER_PARAM_MASK;
+
+ switch (type) {
+ case HIDP_TRANS_HANDSHAKE:
+ hidp_process_handshake(session, param);
+ break;
+
+ case HIDP_TRANS_HID_CONTROL:
+ hidp_process_hid_control(session, param);
+ break;
+
+ case HIDP_TRANS_DATA:
+ free_skb = hidp_process_data(session, skb, param);
+ break;
+
+ default:
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0);
+ break;
+ }
+
+ if (free_skb)
+ kfree_skb(skb);
+}
+
+static void hidp_recv_intr_frame(struct hidp_session *session,
+ struct sk_buff *skb)
+{
+ unsigned char hdr;
+
+ BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+ hdr = skb->data[0];
+ skb_pull(skb, 1);
+
+ if (hdr == (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) {
+ hidp_set_timer(session);
+
+ if (session->input)
+ hidp_input_report(session, skb);
+
+ if (session->hid) {
+ hidp_process_report(session, HID_INPUT_REPORT,
+ skb->data, skb->len, 1);
+ BT_DBG("report len %d", skb->len);
+ }
+ } else {
+ BT_DBG("Unsupported protocol header 0x%02x", hdr);
+ }
+
+ kfree_skb(skb);
+}
+
+static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
+{
+ struct kvec iv = { data, len };
+ struct msghdr msg;
+
+ BT_DBG("sock %p data %p len %d", sock, data, len);
+
+ if (!len)
+ return 0;
+
+ memset(&msg, 0, sizeof(msg));
+
+ return kernel_sendmsg(sock, &msg, &iv, 1, len);
+}
+
+/* dequeue message from @transmit and send via @sock */
+static void hidp_process_transmit(struct hidp_session *session,
+ struct sk_buff_head *transmit,
+ struct socket *sock)
+{
+ struct sk_buff *skb;
+ int ret;
+
+ BT_DBG("session %p", session);
+
+ while ((skb = skb_dequeue(transmit))) {
+ ret = hidp_send_frame(sock, skb->data, skb->len);
+ if (ret == -EAGAIN) {
+ skb_queue_head(transmit, skb);
+ break;
+ } else if (ret < 0) {
+ hidp_session_terminate(session);
+ kfree_skb(skb);
+ break;
+ }
+
+ hidp_set_timer(session);
+ kfree_skb(skb);
+ }
+}
+
+static int hidp_setup_input(struct hidp_session *session,
+ struct hidp_connadd_req *req)
+{
+ struct input_dev *input;
+ int i;
+
+ input = input_allocate_device();
+ if (!input)
+ return -ENOMEM;
+
+ session->input = input;
+
+ input_set_drvdata(input, session);
+
+ input->name = "Bluetooth HID Boot Protocol Device";
+
+ input->id.bustype = BUS_BLUETOOTH;
+ input->id.vendor = req->vendor;
+ input->id.product = req->product;
+ input->id.version = req->version;
+
+ if (req->subclass & 0x40) {
+ set_bit(EV_KEY, input->evbit);
+ set_bit(EV_LED, input->evbit);
+ set_bit(EV_REP, input->evbit);
+
+ set_bit(LED_NUML, input->ledbit);
+ set_bit(LED_CAPSL, input->ledbit);
+ set_bit(LED_SCROLLL, input->ledbit);
+ set_bit(LED_COMPOSE, input->ledbit);
+ set_bit(LED_KANA, input->ledbit);
+
+ for (i = 0; i < sizeof(hidp_keycode); i++)
+ set_bit(hidp_keycode[i], input->keybit);
+ clear_bit(0, input->keybit);
+ }
+
+ if (req->subclass & 0x80) {
+ input->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL);
+ input->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) |
+ BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE);
+ input->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+ input->keybit[BIT_WORD(BTN_MOUSE)] |= BIT_MASK(BTN_SIDE) |
+ BIT_MASK(BTN_EXTRA);
+ input->relbit[0] |= BIT_MASK(REL_WHEEL);
+ }
+
+ input->dev.parent = &session->conn->hcon->dev;
+
+ input->event = hidp_input_event;
+
+ return 0;
+}
+
+static int hidp_open(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void hidp_close(struct hid_device *hid)
+{
+}
+
+static int hidp_parse(struct hid_device *hid)
+{
+ struct hidp_session *session = hid->driver_data;
+
+ return hid_parse_report(session->hid, session->rd_data,
+ session->rd_size);
+}
+
+static int hidp_start(struct hid_device *hid)
+{
+ return 0;
+}
+
+static void hidp_stop(struct hid_device *hid)
+{
+ struct hidp_session *session = hid->driver_data;
+
+ skb_queue_purge(&session->ctrl_transmit);
+ skb_queue_purge(&session->intr_transmit);
+
+ hid->claimed = 0;
+}
+
+static struct hid_ll_driver hidp_hid_driver = {
+ .parse = hidp_parse,
+ .start = hidp_start,
+ .stop = hidp_stop,
+ .open = hidp_open,
+ .close = hidp_close,
+ .raw_request = hidp_raw_request,
+ .output_report = hidp_output_report,
+};
+
+/* This function sets up the hid device. It does not add it
+ to the HID system. That is done in hidp_add_connection(). */
+static int hidp_setup_hid(struct hidp_session *session,
+ struct hidp_connadd_req *req)
+{
+ struct hid_device *hid;
+ int err;
+
+ session->rd_data = memdup_user(req->rd_data, req->rd_size);
+ if (IS_ERR(session->rd_data))
+ return PTR_ERR(session->rd_data);
+
+ session->rd_size = req->rd_size;
+
+ hid = hid_allocate_device();
+ if (IS_ERR(hid)) {
+ err = PTR_ERR(hid);
+ goto fault;
+ }
+
+ session->hid = hid;
+
+ hid->driver_data = session;
+
+ hid->bus = BUS_BLUETOOTH;
+ hid->vendor = req->vendor;
+ hid->product = req->product;
+ hid->version = req->version;
+ hid->country = req->country;
+
+ strncpy(hid->name, req->name, sizeof(req->name) - 1);
+
+ snprintf(hid->phys, sizeof(hid->phys), "%pMR",
+ &l2cap_pi(session->ctrl_sock->sk)->chan->src);
+
+ /* NOTE: Some device modules depend on the dst address being stored in
+ * uniq. Please be aware of this before making changes to this behavior.
+ */
+ snprintf(hid->uniq, sizeof(hid->uniq), "%pMR",
+ &l2cap_pi(session->ctrl_sock->sk)->chan->dst);
+
+ hid->dev.parent = &session->conn->hcon->dev;
+ hid->ll_driver = &hidp_hid_driver;
+
+ /* True if device is blacklisted in drivers/hid/hid-core.c */
+ if (hid_ignore(hid)) {
+ hid_destroy_device(session->hid);
+ session->hid = NULL;
+ return -ENODEV;
+ }
+
+ return 0;
+
+fault:
+ kfree(session->rd_data);
+ session->rd_data = NULL;
+
+ return err;
+}
+
+/* initialize session devices */
+static int hidp_session_dev_init(struct hidp_session *session,
+ struct hidp_connadd_req *req)
+{
+ int ret;
+
+ if (req->rd_size > 0) {
+ ret = hidp_setup_hid(session, req);
+ if (ret && ret != -ENODEV)
+ return ret;
+ }
+
+ if (!session->hid) {
+ ret = hidp_setup_input(session, req);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* destroy session devices */
+static void hidp_session_dev_destroy(struct hidp_session *session)
+{
+ if (session->hid)
+ put_device(&session->hid->dev);
+ else if (session->input)
+ input_put_device(session->input);
+
+ kfree(session->rd_data);
+ session->rd_data = NULL;
+}
+
+/* add HID/input devices to their underlying bus systems */
+static int hidp_session_dev_add(struct hidp_session *session)
+{
+ int ret;
+
+ /* Both HID and input systems drop a ref-count when unregistering the
+ * device but they don't take a ref-count when registering them. Work
+ * around this by explicitly taking a refcount during registration
+ * which is dropped automatically by unregistering the devices. */
+
+ if (session->hid) {
+ ret = hid_add_device(session->hid);
+ if (ret)
+ return ret;
+ get_device(&session->hid->dev);
+ } else if (session->input) {
+ ret = input_register_device(session->input);
+ if (ret)
+ return ret;
+ input_get_device(session->input);
+ }
+
+ return 0;
+}
+
+/* remove HID/input devices from their bus systems */
+static void hidp_session_dev_del(struct hidp_session *session)
+{
+ if (session->hid)
+ hid_destroy_device(session->hid);
+ else if (session->input)
+ input_unregister_device(session->input);
+}
+
+/*
+ * Asynchronous device registration
+ * HID device drivers might want to perform I/O during initialization to
+ * detect device types. Therefore, call device registration in a separate
+ * worker so the HIDP thread can schedule I/O operations.
+ * Note that this must be called after the worker thread was initialized
+ * successfully. This will then add the devices and increase session state
+ * on success, otherwise it will terminate the session thread.
+ */
+static void hidp_session_dev_work(struct work_struct *work)
+{
+ struct hidp_session *session = container_of(work,
+ struct hidp_session,
+ dev_init);
+ int ret;
+
+ ret = hidp_session_dev_add(session);
+ if (!ret)
+ atomic_inc(&session->state);
+ else
+ hidp_session_terminate(session);
+}
+
+/*
+ * Create new session object
+ * Allocate session object, initialize static fields, copy input data into the
+ * object and take a reference to all sub-objects.
+ * This returns 0 on success and puts a pointer to the new session object in
+ * \out. Otherwise, an error code is returned.
+ * The new session object has an initial ref-count of 1.
+ */
+static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
+ struct socket *ctrl_sock,
+ struct socket *intr_sock,
+ struct hidp_connadd_req *req,
+ struct l2cap_conn *conn)
+{
+ struct hidp_session *session;
+ int ret;
+ struct bt_sock *ctrl, *intr;
+
+ ctrl = bt_sk(ctrl_sock->sk);
+ intr = bt_sk(intr_sock->sk);
+
+ session = kzalloc(sizeof(*session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
+
+ /* object and runtime management */
+ kref_init(&session->ref);
+ atomic_set(&session->state, HIDP_SESSION_IDLING);
+ init_waitqueue_head(&session->state_queue);
+ session->flags = req->flags & BIT(HIDP_BLUETOOTH_VENDOR_ID);
+
+ /* connection management */
+ bacpy(&session->bdaddr, bdaddr);
+ session->conn = l2cap_conn_get(conn);
+ session->user.probe = hidp_session_probe;
+ session->user.remove = hidp_session_remove;
+ session->ctrl_sock = ctrl_sock;
+ session->intr_sock = intr_sock;
+ skb_queue_head_init(&session->ctrl_transmit);
+ skb_queue_head_init(&session->intr_transmit);
+ session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl)->chan->omtu,
+ l2cap_pi(ctrl)->chan->imtu);
+ session->intr_mtu = min_t(uint, l2cap_pi(intr)->chan->omtu,
+ l2cap_pi(intr)->chan->imtu);
+ session->idle_to = req->idle_to;
+
+ /* device management */
+ INIT_WORK(&session->dev_init, hidp_session_dev_work);
+ setup_timer(&session->timer, hidp_idle_timeout,
+ (unsigned long)session);
+
+ /* session data */
+ mutex_init(&session->report_mutex);
+ init_waitqueue_head(&session->report_queue);
+
+ ret = hidp_session_dev_init(session, req);
+ if (ret)
+ goto err_free;
+
+ get_file(session->intr_sock->file);
+ get_file(session->ctrl_sock->file);
+ *out = session;
+ return 0;
+
+err_free:
+ l2cap_conn_put(session->conn);
+ kfree(session);
+ return ret;
+}
+
+/* increase ref-count of the given session by one */
+static void hidp_session_get(struct hidp_session *session)
+{
+ kref_get(&session->ref);
+}
+
+/* release callback */
+static void session_free(struct kref *ref)
+{
+ struct hidp_session *session = container_of(ref, struct hidp_session,
+ ref);
+
+ hidp_session_dev_destroy(session);
+ skb_queue_purge(&session->ctrl_transmit);
+ skb_queue_purge(&session->intr_transmit);
+ fput(session->intr_sock->file);
+ fput(session->ctrl_sock->file);
+ l2cap_conn_put(session->conn);
+ kfree(session);
+}
+
+/* decrease ref-count of the given session by one */
+static void hidp_session_put(struct hidp_session *session)
+{
+ kref_put(&session->ref, session_free);
+}
+
+/*
+ * Search the list of active sessions for a session with target address
+ * \bdaddr. You must hold at least a read-lock on \hidp_session_sem. As long as
+ * you do not release this lock, the session objects cannot vanish and you can
+ * safely take a reference to the session yourself.
+ */
+static struct hidp_session *__hidp_session_find(const bdaddr_t *bdaddr)
+{
+ struct hidp_session *session;
+
+ list_for_each_entry(session, &hidp_session_list, list) {
+ if (!bacmp(bdaddr, &session->bdaddr))
+ return session;
+ }
+
+ return NULL;
+}
+
+/*
+ * Same as __hidp_session_find() but no locks must be held. This also takes a
+ * reference of the returned session (if non-NULL) so you must drop this
+ * reference if you no longer use the object.
+ */
+static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr)
+{
+ struct hidp_session *session;
+
+ down_read(&hidp_session_sem);
+
+ session = __hidp_session_find(bdaddr);
+ if (session)
+ hidp_session_get(session);
+
+ up_read(&hidp_session_sem);
+
+ return session;
+}
+
+/*
+ * Start session synchronously
+ * This starts a session thread and waits until initialization
+ * is done or returns an error if it couldn't be started.
+ * If this returns 0 the session thread is up and running. You must call
+ * hipd_session_stop_sync() before deleting any runtime resources.
+ */
+static int hidp_session_start_sync(struct hidp_session *session)
+{
+ unsigned int vendor, product;
+
+ if (session->hid) {
+ vendor = session->hid->vendor;
+ product = session->hid->product;
+ } else if (session->input) {
+ vendor = session->input->id.vendor;
+ product = session->input->id.product;
+ } else {
+ vendor = 0x0000;
+ product = 0x0000;
+ }
+
+ session->task = kthread_run(hidp_session_thread, session,
+ "khidpd_%04x%04x", vendor, product);
+ if (IS_ERR(session->task))
+ return PTR_ERR(session->task);
+
+ while (atomic_read(&session->state) <= HIDP_SESSION_IDLING)
+ wait_event(session->state_queue,
+ atomic_read(&session->state) > HIDP_SESSION_IDLING);
+
+ return 0;
+}
+
+/*
+ * Terminate session thread
+ * Wake up session thread and notify it to stop. This is asynchronous and
+ * returns immediately. Call this whenever a runtime error occurs and you want
+ * the session to stop.
+ * Note: wake_up_process() performs any necessary memory-barriers for us.
+ */
+static void hidp_session_terminate(struct hidp_session *session)
+{
+ atomic_inc(&session->terminate);
+ wake_up_process(session->task);
+}
+
+/*
+ * Probe HIDP session
+ * This is called from the l2cap_conn core when our l2cap_user object is bound
+ * to the hci-connection. We get the session via the \user object and can now
+ * start the session thread, link it into the global session list and
+ * schedule HID/input device registration.
+ * The global session-list owns its own reference to the session object so you
+ * can drop your own reference after registering the l2cap_user object.
+ */
+static int hidp_session_probe(struct l2cap_conn *conn,
+ struct l2cap_user *user)
+{
+ struct hidp_session *session = container_of(user,
+ struct hidp_session,
+ user);
+ struct hidp_session *s;
+ int ret;
+
+ down_write(&hidp_session_sem);
+
+ /* check that no other session for this device exists */
+ s = __hidp_session_find(&session->bdaddr);
+ if (s) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
+ if (session->input) {
+ ret = hidp_session_dev_add(session);
+ if (ret)
+ goto out_unlock;
+ }
+
+ ret = hidp_session_start_sync(session);
+ if (ret)
+ goto out_del;
+
+ /* HID device registration is async to allow I/O during probe */
+ if (session->input)
+ atomic_inc(&session->state);
+ else
+ schedule_work(&session->dev_init);
+
+ hidp_session_get(session);
+ list_add(&session->list, &hidp_session_list);
+ ret = 0;
+ goto out_unlock;
+
+out_del:
+ if (session->input)
+ hidp_session_dev_del(session);
+out_unlock:
+ up_write(&hidp_session_sem);
+ return ret;
+}
+
+/*
+ * Remove HIDP session
+ * Called from the l2cap_conn core when either we explicitly unregistered
+ * the l2cap_user object or if the underlying connection is shut down.
+ * We signal the hidp-session thread to shut down, unregister the HID/input
+ * devices and unlink the session from the global list.
+ * This drops the reference to the session that is owned by the global
+ * session-list.
+ * Note: We _must_ not synchronosly wait for the session-thread to shut down.
+ * This is, because the session-thread might be waiting for an HCI lock that is
+ * held while we are called. Therefore, we only unregister the devices and
+ * notify the session-thread to terminate. The thread itself owns a reference
+ * to the session object so it can safely shut down.
+ */
+static void hidp_session_remove(struct l2cap_conn *conn,
+ struct l2cap_user *user)
+{
+ struct hidp_session *session = container_of(user,
+ struct hidp_session,
+ user);
+
+ down_write(&hidp_session_sem);
+
+ hidp_session_terminate(session);
+
+ cancel_work_sync(&session->dev_init);
+ if (session->input ||
+ atomic_read(&session->state) > HIDP_SESSION_PREPARING)
+ hidp_session_dev_del(session);
+
+ list_del(&session->list);
+
+ up_write(&hidp_session_sem);
+
+ hidp_session_put(session);
+}
+
+/*
+ * Session Worker
+ * This performs the actual main-loop of the HIDP worker. We first check
+ * whether the underlying connection is still alive, then parse all pending
+ * messages and finally send all outstanding messages.
+ */
+static void hidp_session_run(struct hidp_session *session)
+{
+ struct sock *ctrl_sk = session->ctrl_sock->sk;
+ struct sock *intr_sk = session->intr_sock->sk;
+ struct sk_buff *skb;
+
+ for (;;) {
+ /*
+ * This thread can be woken up two ways:
+ * - You call hidp_session_terminate() which sets the
+ * session->terminate flag and wakes this thread up.
+ * - Via modifying the socket state of ctrl/intr_sock. This
+ * thread is woken up by ->sk_state_changed().
+ *
+ * Note: set_current_state() performs any necessary
+ * memory-barriers for us.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (atomic_read(&session->terminate))
+ break;
+
+ if (ctrl_sk->sk_state != BT_CONNECTED ||
+ intr_sk->sk_state != BT_CONNECTED)
+ break;
+
+ /* parse incoming intr-skbs */
+ while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ hidp_recv_intr_frame(session, skb);
+ else
+ kfree_skb(skb);
+ }
+
+ /* send pending intr-skbs */
+ hidp_process_transmit(session, &session->intr_transmit,
+ session->intr_sock);
+
+ /* parse incoming ctrl-skbs */
+ while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb))
+ hidp_recv_ctrl_frame(session, skb);
+ else
+ kfree_skb(skb);
+ }
+
+ /* send pending ctrl-skbs */
+ hidp_process_transmit(session, &session->ctrl_transmit,
+ session->ctrl_sock);
+
+ schedule();
+ }
+
+ atomic_inc(&session->terminate);
+ set_current_state(TASK_RUNNING);
+}
+
+/*
+ * HIDP session thread
+ * This thread runs the I/O for a single HIDP session. Startup is synchronous
+ * which allows us to take references to ourself here instead of doing that in
+ * the caller.
+ * When we are ready to run we notify the caller and call hidp_session_run().
+ */
+static int hidp_session_thread(void *arg)
+{
+ struct hidp_session *session = arg;
+ wait_queue_t ctrl_wait, intr_wait;
+
+ BT_DBG("session %p", session);
+
+ /* initialize runtime environment */
+ hidp_session_get(session);
+ __module_get(THIS_MODULE);
+ set_user_nice(current, -15);
+ hidp_set_timer(session);
+
+ init_waitqueue_entry(&ctrl_wait, current);
+ init_waitqueue_entry(&intr_wait, current);
+ add_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait);
+ add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
+ /* This memory barrier is paired with wq_has_sleeper(). See
+ * sock_poll_wait() for more information why this is needed. */
+ smp_mb();
+
+ /* notify synchronous startup that we're ready */
+ atomic_inc(&session->state);
+ wake_up(&session->state_queue);
+
+ /* run session */
+ hidp_session_run(session);
+
+ /* cleanup runtime environment */
+ remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
+ remove_wait_queue(sk_sleep(session->intr_sock->sk), &ctrl_wait);
+ wake_up_interruptible(&session->report_queue);
+ hidp_del_timer(session);
+
+ /*
+ * If we stopped ourself due to any internal signal, we should try to
+ * unregister our own session here to avoid having it linger until the
+ * parent l2cap_conn dies or user-space cleans it up.
+ * This does not deadlock as we don't do any synchronous shutdown.
+ * Instead, this call has the same semantics as if user-space tried to
+ * delete the session.
+ */
+ l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_put(session);
+
+ module_put_and_exit(0);
+ return 0;
+}
+
+static int hidp_verify_sockets(struct socket *ctrl_sock,
+ struct socket *intr_sock)
+{
+ struct l2cap_chan *ctrl_chan, *intr_chan;
+ struct bt_sock *ctrl, *intr;
+ struct hidp_session *session;
+
+ if (!l2cap_is_socket(ctrl_sock) || !l2cap_is_socket(intr_sock))
+ return -EINVAL;
+
+ ctrl_chan = l2cap_pi(ctrl_sock->sk)->chan;
+ intr_chan = l2cap_pi(intr_sock->sk)->chan;
+
+ if (bacmp(&ctrl_chan->src, &intr_chan->src) ||
+ bacmp(&ctrl_chan->dst, &intr_chan->dst))
+ return -ENOTUNIQ;
+
+ ctrl = bt_sk(ctrl_sock->sk);
+ intr = bt_sk(intr_sock->sk);
+
+ if (ctrl->sk.sk_state != BT_CONNECTED ||
+ intr->sk.sk_state != BT_CONNECTED)
+ return -EBADFD;
+
+ /* early session check, we check again during session registration */
+ session = hidp_session_find(&ctrl_chan->dst);
+ if (session) {
+ hidp_session_put(session);
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+int hidp_connection_add(struct hidp_connadd_req *req,
+ struct socket *ctrl_sock,
+ struct socket *intr_sock)
+{
+ u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) |
+ BIT(HIDP_BOOT_PROTOCOL_MODE);
+ struct hidp_session *session;
+ struct l2cap_conn *conn;
+ struct l2cap_chan *chan;
+ int ret;
+
+ ret = hidp_verify_sockets(ctrl_sock, intr_sock);
+ if (ret)
+ return ret;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ chan = l2cap_pi(ctrl_sock->sk)->chan;
+ conn = NULL;
+ l2cap_chan_lock(chan);
+ if (chan->conn)
+ conn = l2cap_conn_get(chan->conn);
+ l2cap_chan_unlock(chan);
+
+ if (!conn)
+ return -EBADFD;
+
+ ret = hidp_session_new(&session, &chan->dst, ctrl_sock,
+ intr_sock, req, conn);
+ if (ret)
+ goto out_conn;
+
+ ret = l2cap_register_user(conn, &session->user);
+ if (ret)
+ goto out_session;
+
+ ret = 0;
+
+out_session:
+ hidp_session_put(session);
+out_conn:
+ l2cap_conn_put(conn);
+ return ret;
+}
+
+int hidp_connection_del(struct hidp_conndel_req *req)
+{
+ u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG);
+ struct hidp_session *session;
+
+ if (req->flags & ~valid_flags)
+ return -EINVAL;
+
+ session = hidp_session_find(&req->bdaddr);
+ if (!session)
+ return -ENOENT;
+
+ if (req->flags & BIT(HIDP_VIRTUAL_CABLE_UNPLUG))
+ hidp_send_ctrl_message(session,
+ HIDP_TRANS_HID_CONTROL |
+ HIDP_CTRL_VIRTUAL_CABLE_UNPLUG,
+ NULL, 0);
+ else
+ l2cap_unregister_user(session->conn, &session->user);
+
+ hidp_session_put(session);
+
+ return 0;
+}
+
+int hidp_get_connlist(struct hidp_connlist_req *req)
+{
+ struct hidp_session *session;
+ int err = 0, n = 0;
+
+ BT_DBG("");
+
+ down_read(&hidp_session_sem);
+
+ list_for_each_entry(session, &hidp_session_list, list) {
+ struct hidp_conninfo ci;
+
+ hidp_copy_session(session, &ci);
+
+ if (copy_to_user(req->ci, &ci, sizeof(ci))) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (++n >= req->cnum)
+ break;
+
+ req->ci++;
+ }
+ req->cnum = n;
+
+ up_read(&hidp_session_sem);
+ return err;
+}
+
+int hidp_get_conninfo(struct hidp_conninfo *ci)
+{
+ struct hidp_session *session;
+
+ session = hidp_session_find(&ci->bdaddr);
+ if (session) {
+ hidp_copy_session(session, ci);
+ hidp_session_put(session);
+ }
+
+ return session ? 0 : -ENOENT;
+}
+
+static int __init hidp_init(void)
+{
+ BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION);
+
+ return hidp_init_sockets();
+}
+
+static void __exit hidp_exit(void)
+{
+ hidp_cleanup_sockets();
+}
+
+module_init(hidp_init);
+module_exit(hidp_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>");
+MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-6");
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
new file mode 100644
index 000000000..8798492a6
--- /dev/null
+++ b/net/bluetooth/hidp/hidp.h
@@ -0,0 +1,192 @@
+/*
+ HIDP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __HIDP_H
+#define __HIDP_H
+
+#include <linux/types.h>
+#include <linux/hid.h>
+#include <linux/kref.h>
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/l2cap.h>
+
+/* HIDP header masks */
+#define HIDP_HEADER_TRANS_MASK 0xf0
+#define HIDP_HEADER_PARAM_MASK 0x0f
+
+/* HIDP transaction types */
+#define HIDP_TRANS_HANDSHAKE 0x00
+#define HIDP_TRANS_HID_CONTROL 0x10
+#define HIDP_TRANS_GET_REPORT 0x40
+#define HIDP_TRANS_SET_REPORT 0x50
+#define HIDP_TRANS_GET_PROTOCOL 0x60
+#define HIDP_TRANS_SET_PROTOCOL 0x70
+#define HIDP_TRANS_GET_IDLE 0x80
+#define HIDP_TRANS_SET_IDLE 0x90
+#define HIDP_TRANS_DATA 0xa0
+#define HIDP_TRANS_DATC 0xb0
+
+/* HIDP handshake results */
+#define HIDP_HSHK_SUCCESSFUL 0x00
+#define HIDP_HSHK_NOT_READY 0x01
+#define HIDP_HSHK_ERR_INVALID_REPORT_ID 0x02
+#define HIDP_HSHK_ERR_UNSUPPORTED_REQUEST 0x03
+#define HIDP_HSHK_ERR_INVALID_PARAMETER 0x04
+#define HIDP_HSHK_ERR_UNKNOWN 0x0e
+#define HIDP_HSHK_ERR_FATAL 0x0f
+
+/* HIDP control operation parameters */
+#define HIDP_CTRL_NOP 0x00
+#define HIDP_CTRL_HARD_RESET 0x01
+#define HIDP_CTRL_SOFT_RESET 0x02
+#define HIDP_CTRL_SUSPEND 0x03
+#define HIDP_CTRL_EXIT_SUSPEND 0x04
+#define HIDP_CTRL_VIRTUAL_CABLE_UNPLUG 0x05
+
+/* HIDP data transaction headers */
+#define HIDP_DATA_RTYPE_MASK 0x03
+#define HIDP_DATA_RSRVD_MASK 0x0c
+#define HIDP_DATA_RTYPE_OTHER 0x00
+#define HIDP_DATA_RTYPE_INPUT 0x01
+#define HIDP_DATA_RTYPE_OUPUT 0x02
+#define HIDP_DATA_RTYPE_FEATURE 0x03
+
+/* HIDP protocol header parameters */
+#define HIDP_PROTO_BOOT 0x00
+#define HIDP_PROTO_REPORT 0x01
+
+/* HIDP ioctl defines */
+#define HIDPCONNADD _IOW('H', 200, int)
+#define HIDPCONNDEL _IOW('H', 201, int)
+#define HIDPGETCONNLIST _IOR('H', 210, int)
+#define HIDPGETCONNINFO _IOR('H', 211, int)
+
+#define HIDP_VIRTUAL_CABLE_UNPLUG 0
+#define HIDP_BOOT_PROTOCOL_MODE 1
+#define HIDP_BLUETOOTH_VENDOR_ID 9
+#define HIDP_WAITING_FOR_RETURN 10
+#define HIDP_WAITING_FOR_SEND_ACK 11
+
+struct hidp_connadd_req {
+ int ctrl_sock; /* Connected control socket */
+ int intr_sock; /* Connected interrupt socket */
+ __u16 parser;
+ __u16 rd_size;
+ __u8 __user *rd_data;
+ __u8 country;
+ __u8 subclass;
+ __u16 vendor;
+ __u16 product;
+ __u16 version;
+ __u32 flags;
+ __u32 idle_to;
+ char name[128];
+};
+
+struct hidp_conndel_req {
+ bdaddr_t bdaddr;
+ __u32 flags;
+};
+
+struct hidp_conninfo {
+ bdaddr_t bdaddr;
+ __u32 flags;
+ __u16 state;
+ __u16 vendor;
+ __u16 product;
+ __u16 version;
+ char name[128];
+};
+
+struct hidp_connlist_req {
+ __u32 cnum;
+ struct hidp_conninfo __user *ci;
+};
+
+int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
+int hidp_connection_del(struct hidp_conndel_req *req);
+int hidp_get_connlist(struct hidp_connlist_req *req);
+int hidp_get_conninfo(struct hidp_conninfo *ci);
+
+enum hidp_session_state {
+ HIDP_SESSION_IDLING,
+ HIDP_SESSION_PREPARING,
+ HIDP_SESSION_RUNNING,
+};
+
+/* HIDP session defines */
+struct hidp_session {
+ struct list_head list;
+ struct kref ref;
+
+ /* runtime management */
+ atomic_t state;
+ wait_queue_head_t state_queue;
+ atomic_t terminate;
+ struct task_struct *task;
+ unsigned long flags;
+
+ /* connection management */
+ bdaddr_t bdaddr;
+ struct l2cap_conn *conn;
+ struct l2cap_user user;
+ struct socket *ctrl_sock;
+ struct socket *intr_sock;
+ struct sk_buff_head ctrl_transmit;
+ struct sk_buff_head intr_transmit;
+ uint ctrl_mtu;
+ uint intr_mtu;
+ unsigned long idle_to;
+
+ /* device management */
+ struct work_struct dev_init;
+ struct input_dev *input;
+ struct hid_device *hid;
+ struct timer_list timer;
+
+ /* Report descriptor */
+ __u8 *rd_data;
+ uint rd_size;
+
+ /* session data */
+ unsigned char keys[8];
+ unsigned char leds;
+
+ /* Used in hidp_get_raw_report() */
+ int waiting_report_type; /* HIDP_DATA_RTYPE_* */
+ int waiting_report_number; /* -1 for not numbered */
+ struct mutex report_mutex;
+ struct sk_buff *report_return;
+ wait_queue_head_t report_queue;
+
+ /* Used in hidp_output_raw_report() */
+ int output_report_success; /* boolean */
+
+ /* temporary input buffer */
+ u8 input_buf[HID_MAX_BUFFER_SIZE];
+};
+
+/* HIDP init defines */
+int __init hidp_init_sockets(void);
+void __exit hidp_cleanup_sockets(void);
+
+#endif /* __HIDP_H */
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
new file mode 100644
index 000000000..cb3fdde19
--- /dev/null
+++ b/net/bluetooth/hidp/sock.c
@@ -0,0 +1,299 @@
+/*
+ HIDP implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/export.h>
+#include <linux/file.h>
+
+#include "hidp.h"
+
+static struct bt_sock_list hidp_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(hidp_sk_list.lock)
+};
+
+static int hidp_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ BT_DBG("sock %p sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ bt_sock_unlink(&hidp_sk_list, sk);
+
+ sock_orphan(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = (void __user *) arg;
+ struct hidp_connadd_req ca;
+ struct hidp_conndel_req cd;
+ struct hidp_connlist_req cl;
+ struct hidp_conninfo ci;
+ struct socket *csock;
+ struct socket *isock;
+ int err;
+
+ BT_DBG("cmd %x arg %lx", cmd, arg);
+
+ switch (cmd) {
+ case HIDPCONNADD:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ca, argp, sizeof(ca)))
+ return -EFAULT;
+
+ csock = sockfd_lookup(ca.ctrl_sock, &err);
+ if (!csock)
+ return err;
+
+ isock = sockfd_lookup(ca.intr_sock, &err);
+ if (!isock) {
+ sockfd_put(csock);
+ return err;
+ }
+
+ err = hidp_connection_add(&ca, csock, isock);
+ if (!err && copy_to_user(argp, &ca, sizeof(ca)))
+ err = -EFAULT;
+
+ sockfd_put(csock);
+ sockfd_put(isock);
+
+ return err;
+
+ case HIDPCONNDEL:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&cd, argp, sizeof(cd)))
+ return -EFAULT;
+
+ return hidp_connection_del(&cd);
+
+ case HIDPGETCONNLIST:
+ if (copy_from_user(&cl, argp, sizeof(cl)))
+ return -EFAULT;
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = hidp_get_connlist(&cl);
+ if (!err && copy_to_user(argp, &cl, sizeof(cl)))
+ return -EFAULT;
+
+ return err;
+
+ case HIDPGETCONNINFO:
+ if (copy_from_user(&ci, argp, sizeof(ci)))
+ return -EFAULT;
+
+ err = hidp_get_conninfo(&ci);
+ if (!err && copy_to_user(argp, &ci, sizeof(ci)))
+ return -EFAULT;
+
+ return err;
+ }
+
+ return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_hidp_connadd_req {
+ int ctrl_sock; /* Connected control socket */
+ int intr_sock; /* Connected interrupt socket */
+ __u16 parser;
+ __u16 rd_size;
+ compat_uptr_t rd_data;
+ __u8 country;
+ __u8 subclass;
+ __u16 vendor;
+ __u16 product;
+ __u16 version;
+ __u32 flags;
+ __u32 idle_to;
+ char name[128];
+};
+
+static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ if (cmd == HIDPGETCONNLIST) {
+ struct hidp_connlist_req cl;
+ u32 uci;
+ int err;
+
+ if (get_user(cl.cnum, (u32 __user *) arg) ||
+ get_user(uci, (u32 __user *) (arg + 4)))
+ return -EFAULT;
+
+ cl.ci = compat_ptr(uci);
+
+ if (cl.cnum <= 0)
+ return -EINVAL;
+
+ err = hidp_get_connlist(&cl);
+
+ if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ err = -EFAULT;
+
+ return err;
+ } else if (cmd == HIDPCONNADD) {
+ struct compat_hidp_connadd_req ca;
+ struct hidp_connadd_req __user *uca;
+
+ uca = compat_alloc_user_space(sizeof(*uca));
+
+ if (copy_from_user(&ca, (void __user *) arg, sizeof(ca)))
+ return -EFAULT;
+
+ if (put_user(ca.ctrl_sock, &uca->ctrl_sock) ||
+ put_user(ca.intr_sock, &uca->intr_sock) ||
+ put_user(ca.parser, &uca->parser) ||
+ put_user(ca.rd_size, &uca->rd_size) ||
+ put_user(compat_ptr(ca.rd_data), &uca->rd_data) ||
+ put_user(ca.country, &uca->country) ||
+ put_user(ca.subclass, &uca->subclass) ||
+ put_user(ca.vendor, &uca->vendor) ||
+ put_user(ca.product, &uca->product) ||
+ put_user(ca.version, &uca->version) ||
+ put_user(ca.flags, &uca->flags) ||
+ put_user(ca.idle_to, &uca->idle_to) ||
+ copy_to_user(&uca->name[0], &ca.name[0], 128))
+ return -EFAULT;
+
+ arg = (unsigned long) uca;
+
+ /* Fall through. We don't actually write back any _changes_
+ to the structure anyway, so there's no need to copy back
+ into the original compat version */
+ }
+
+ return hidp_sock_ioctl(sock, cmd, arg);
+}
+#endif
+
+static const struct proto_ops hidp_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = hidp_sock_release,
+ .ioctl = hidp_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = hidp_sock_compat_ioctl,
+#endif
+ .bind = sock_no_bind,
+ .getname = sock_no_getname,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = sock_no_recvmsg,
+ .poll = sock_no_poll,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .mmap = sock_no_mmap
+};
+
+static struct proto hidp_proto = {
+ .name = "HIDP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct bt_sock)
+};
+
+static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+
+ sock->ops = &hidp_sock_ops;
+
+ sock->state = SS_UNCONNECTED;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = protocol;
+ sk->sk_state = BT_OPEN;
+
+ bt_sock_link(&hidp_sk_list, sk);
+
+ return 0;
+}
+
+static const struct net_proto_family hidp_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = hidp_sock_create
+};
+
+int __init hidp_init_sockets(void)
+{
+ int err;
+
+ err = proto_register(&hidp_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("Can't register HIDP socket");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "hidp", &hidp_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create HIDP proc file");
+ bt_sock_unregister(BTPROTO_HIDP);
+ goto error;
+ }
+
+ BT_INFO("HIDP socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&hidp_proto);
+ return err;
+}
+
+void __exit hidp_cleanup_sockets(void)
+{
+ bt_procfs_cleanup(&init_net, "hidp");
+ bt_sock_unregister(BTPROTO_HIDP);
+ proto_unregister(&hidp_proto);
+}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
new file mode 100644
index 000000000..dad419782
--- /dev/null
+++ b/net/bluetooth/l2cap_core.c
@@ -0,0 +1,7622 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+ Copyright (C) 2010 Google Inc.
+ Copyright (C) 2011 ProFUSION Embedded Systems
+ Copyright (c) 2012 Code Aurora Forum. All rights reserved.
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP core. */
+
+#include <linux/module.h>
+
+#include <linux/debugfs.h>
+#include <linux/crc16.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "smp.h"
+#include "a2mp.h"
+#include "amp.h"
+
+#define LE_FLOWCTL_MAX_CREDITS 65535
+
+bool disable_ertm;
+
+static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
+
+static LIST_HEAD(chan_list);
+static DEFINE_RWLOCK(chan_list_lock);
+
+static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
+static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS;
+
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
+ u8 code, u8 ident, u16 dlen, void *data);
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
+ void *data);
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data);
+static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
+
+static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event);
+
+static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type)
+{
+ if (link_type == LE_LINK) {
+ if (bdaddr_type == ADDR_LE_DEV_PUBLIC)
+ return BDADDR_LE_PUBLIC;
+ else
+ return BDADDR_LE_RANDOM;
+ }
+
+ return BDADDR_BREDR;
+}
+
+static inline u8 bdaddr_src_type(struct hci_conn *hcon)
+{
+ return bdaddr_type(hcon->type, hcon->src_type);
+}
+
+static inline u8 bdaddr_dst_type(struct hci_conn *hcon)
+{
+ return bdaddr_type(hcon->type, hcon->dst_type);
+}
+
+/* ---- L2CAP channels ---- */
+
+static struct l2cap_chan *__l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->dcid == cid)
+ return c;
+ }
+ return NULL;
+}
+
+static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->scid == cid)
+ return c;
+ }
+ return NULL;
+}
+
+/* Find channel with given SCID.
+ * Returns locked channel. */
+static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ mutex_lock(&conn->chan_lock);
+ c = __l2cap_get_chan_by_scid(conn, cid);
+ if (c)
+ l2cap_chan_lock(c);
+ mutex_unlock(&conn->chan_lock);
+
+ return c;
+}
+
+/* Find channel with given DCID.
+ * Returns locked channel.
+ */
+static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
+ u16 cid)
+{
+ struct l2cap_chan *c;
+
+ mutex_lock(&conn->chan_lock);
+ c = __l2cap_get_chan_by_dcid(conn, cid);
+ if (c)
+ l2cap_chan_lock(c);
+ mutex_unlock(&conn->chan_lock);
+
+ return c;
+}
+
+static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn,
+ u8 ident)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &conn->chan_l, list) {
+ if (c->ident == ident)
+ return c;
+ }
+ return NULL;
+}
+
+static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
+ u8 ident)
+{
+ struct l2cap_chan *c;
+
+ mutex_lock(&conn->chan_lock);
+ c = __l2cap_get_chan_by_ident(conn, ident);
+ if (c)
+ l2cap_chan_lock(c);
+ mutex_unlock(&conn->chan_lock);
+
+ return c;
+}
+
+static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src)
+{
+ struct l2cap_chan *c;
+
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (c->sport == psm && !bacmp(&c->src, src))
+ return c;
+ }
+ return NULL;
+}
+
+int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
+{
+ int err;
+
+ write_lock(&chan_list_lock);
+
+ if (psm && __l2cap_global_chan_by_addr(psm, src)) {
+ err = -EADDRINUSE;
+ goto done;
+ }
+
+ if (psm) {
+ chan->psm = psm;
+ chan->sport = psm;
+ err = 0;
+ } else {
+ u16 p;
+
+ err = -EINVAL;
+ for (p = 0x1001; p < 0x1100; p += 2)
+ if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) {
+ chan->psm = cpu_to_le16(p);
+ chan->sport = cpu_to_le16(p);
+ err = 0;
+ break;
+ }
+ }
+
+done:
+ write_unlock(&chan_list_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_add_psm);
+
+int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid)
+{
+ write_lock(&chan_list_lock);
+
+ /* Override the defaults (which are for conn-oriented) */
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ chan->chan_type = L2CAP_CHAN_FIXED;
+
+ chan->scid = scid;
+
+ write_unlock(&chan_list_lock);
+
+ return 0;
+}
+
+static u16 l2cap_alloc_cid(struct l2cap_conn *conn)
+{
+ u16 cid, dyn_end;
+
+ if (conn->hcon->type == LE_LINK)
+ dyn_end = L2CAP_CID_LE_DYN_END;
+ else
+ dyn_end = L2CAP_CID_DYN_END;
+
+ for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) {
+ if (!__l2cap_get_chan_by_scid(conn, cid))
+ return cid;
+ }
+
+ return 0;
+}
+
+static void l2cap_state_change(struct l2cap_chan *chan, int state)
+{
+ BT_DBG("chan %p %s -> %s", chan, state_to_string(chan->state),
+ state_to_string(state));
+
+ chan->state = state;
+ chan->ops->state_change(chan, state, 0);
+}
+
+static inline void l2cap_state_change_and_error(struct l2cap_chan *chan,
+ int state, int err)
+{
+ chan->state = state;
+ chan->ops->state_change(chan, chan->state, err);
+}
+
+static inline void l2cap_chan_set_err(struct l2cap_chan *chan, int err)
+{
+ chan->ops->state_change(chan, chan->state, err);
+}
+
+static void __set_retrans_timer(struct l2cap_chan *chan)
+{
+ if (!delayed_work_pending(&chan->monitor_timer) &&
+ chan->retrans_timeout) {
+ l2cap_set_timer(chan, &chan->retrans_timer,
+ msecs_to_jiffies(chan->retrans_timeout));
+ }
+}
+
+static void __set_monitor_timer(struct l2cap_chan *chan)
+{
+ __clear_retrans_timer(chan);
+ if (chan->monitor_timeout) {
+ l2cap_set_timer(chan, &chan->monitor_timer,
+ msecs_to_jiffies(chan->monitor_timeout));
+ }
+}
+
+static struct sk_buff *l2cap_ertm_seq_in_queue(struct sk_buff_head *head,
+ u16 seq)
+{
+ struct sk_buff *skb;
+
+ skb_queue_walk(head, skb) {
+ if (bt_cb(skb)->l2cap.txseq == seq)
+ return skb;
+ }
+
+ return NULL;
+}
+
+/* ---- L2CAP sequence number lists ---- */
+
+/* For ERTM, ordered lists of sequence numbers must be tracked for
+ * SREJ requests that are received and for frames that are to be
+ * retransmitted. These seq_list functions implement a singly-linked
+ * list in an array, where membership in the list can also be checked
+ * in constant time. Items can also be added to the tail of the list
+ * and removed from the head in constant time, without further memory
+ * allocs or frees.
+ */
+
+static int l2cap_seq_list_init(struct l2cap_seq_list *seq_list, u16 size)
+{
+ size_t alloc_size, i;
+
+ /* Allocated size is a power of 2 to map sequence numbers
+ * (which may be up to 14 bits) in to a smaller array that is
+ * sized for the negotiated ERTM transmit windows.
+ */
+ alloc_size = roundup_pow_of_two(size);
+
+ seq_list->list = kmalloc(sizeof(u16) * alloc_size, GFP_KERNEL);
+ if (!seq_list->list)
+ return -ENOMEM;
+
+ seq_list->mask = alloc_size - 1;
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+ for (i = 0; i < alloc_size; i++)
+ seq_list->list[i] = L2CAP_SEQ_LIST_CLEAR;
+
+ return 0;
+}
+
+static inline void l2cap_seq_list_free(struct l2cap_seq_list *seq_list)
+{
+ kfree(seq_list->list);
+}
+
+static inline bool l2cap_seq_list_contains(struct l2cap_seq_list *seq_list,
+ u16 seq)
+{
+ /* Constant-time check for list membership */
+ return seq_list->list[seq & seq_list->mask] != L2CAP_SEQ_LIST_CLEAR;
+}
+
+static inline u16 l2cap_seq_list_pop(struct l2cap_seq_list *seq_list)
+{
+ u16 seq = seq_list->head;
+ u16 mask = seq_list->mask;
+
+ seq_list->head = seq_list->list[seq & mask];
+ seq_list->list[seq & mask] = L2CAP_SEQ_LIST_CLEAR;
+
+ if (seq_list->head == L2CAP_SEQ_LIST_TAIL) {
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+ }
+
+ return seq;
+}
+
+static void l2cap_seq_list_clear(struct l2cap_seq_list *seq_list)
+{
+ u16 i;
+
+ if (seq_list->head == L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ for (i = 0; i <= seq_list->mask; i++)
+ seq_list->list[i] = L2CAP_SEQ_LIST_CLEAR;
+
+ seq_list->head = L2CAP_SEQ_LIST_CLEAR;
+ seq_list->tail = L2CAP_SEQ_LIST_CLEAR;
+}
+
+static void l2cap_seq_list_append(struct l2cap_seq_list *seq_list, u16 seq)
+{
+ u16 mask = seq_list->mask;
+
+ /* All appends happen in constant time */
+
+ if (seq_list->list[seq & mask] != L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ if (seq_list->tail == L2CAP_SEQ_LIST_CLEAR)
+ seq_list->head = seq;
+ else
+ seq_list->list[seq_list->tail & mask] = seq;
+
+ seq_list->tail = seq;
+ seq_list->list[seq & mask] = L2CAP_SEQ_LIST_TAIL;
+}
+
+static void l2cap_chan_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ chan_timer.work);
+ struct l2cap_conn *conn = chan->conn;
+ int reason;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ mutex_lock(&conn->chan_lock);
+ l2cap_chan_lock(chan);
+
+ if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG)
+ reason = ECONNREFUSED;
+ else if (chan->state == BT_CONNECT &&
+ chan->sec_level != BT_SECURITY_SDP)
+ reason = ECONNREFUSED;
+ else
+ reason = ETIMEDOUT;
+
+ l2cap_chan_close(chan, reason);
+
+ l2cap_chan_unlock(chan);
+
+ chan->ops->close(chan);
+ mutex_unlock(&conn->chan_lock);
+
+ l2cap_chan_put(chan);
+}
+
+struct l2cap_chan *l2cap_chan_create(void)
+{
+ struct l2cap_chan *chan;
+
+ chan = kzalloc(sizeof(*chan), GFP_ATOMIC);
+ if (!chan)
+ return NULL;
+
+ mutex_init(&chan->lock);
+
+ /* Set default lock nesting level */
+ atomic_set(&chan->nesting, L2CAP_NESTING_NORMAL);
+
+ write_lock(&chan_list_lock);
+ list_add(&chan->global_l, &chan_list);
+ write_unlock(&chan_list_lock);
+
+ INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout);
+
+ chan->state = BT_OPEN;
+
+ kref_init(&chan->kref);
+
+ /* This flag is cleared in l2cap_chan_ready() */
+ set_bit(CONF_NOT_COMPLETE, &chan->conf_state);
+
+ BT_DBG("chan %p", chan);
+
+ return chan;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_create);
+
+static void l2cap_chan_destroy(struct kref *kref)
+{
+ struct l2cap_chan *chan = container_of(kref, struct l2cap_chan, kref);
+
+ BT_DBG("chan %p", chan);
+
+ write_lock(&chan_list_lock);
+ list_del(&chan->global_l);
+ write_unlock(&chan_list_lock);
+
+ kfree(chan);
+}
+
+void l2cap_chan_hold(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+
+ kref_get(&c->kref);
+}
+
+void l2cap_chan_put(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %d", c, atomic_read(&c->kref.refcount));
+
+ kref_put(&c->kref, l2cap_chan_destroy);
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_put);
+
+void l2cap_chan_set_defaults(struct l2cap_chan *chan)
+{
+ chan->fcs = L2CAP_FCS_CRC16;
+ chan->max_tx = L2CAP_DEFAULT_MAX_TX;
+ chan->tx_win = L2CAP_DEFAULT_TX_WINDOW;
+ chan->tx_win_max = L2CAP_DEFAULT_TX_WINDOW;
+ chan->remote_max_tx = chan->max_tx;
+ chan->remote_tx_win = chan->tx_win;
+ chan->ack_win = L2CAP_DEFAULT_TX_WINDOW;
+ chan->sec_level = BT_SECURITY_LOW;
+ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+ chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
+ chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
+ chan->conf_state = 0;
+
+ set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults);
+
+static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
+{
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ chan->tx_credits = 0;
+ chan->rx_credits = le_max_credits;
+ chan->mps = min_t(u16, chan->imtu, le_default_mps);
+
+ skb_queue_head_init(&chan->tx_q);
+}
+
+void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+ BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn,
+ __le16_to_cpu(chan->psm), chan->dcid);
+
+ conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
+
+ chan->conn = conn;
+
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_CONN_ORIENTED:
+ /* Alloc CID for connection-oriented socket */
+ chan->scid = l2cap_alloc_cid(conn);
+ if (conn->hcon->type == ACL_LINK)
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ break;
+
+ case L2CAP_CHAN_CONN_LESS:
+ /* Connectionless socket */
+ chan->scid = L2CAP_CID_CONN_LESS;
+ chan->dcid = L2CAP_CID_CONN_LESS;
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ break;
+
+ case L2CAP_CHAN_FIXED:
+ /* Caller will set CID and CID specific MTU values */
+ break;
+
+ default:
+ /* Raw socket can send/recv signalling messages only */
+ chan->scid = L2CAP_CID_SIGNALING;
+ chan->dcid = L2CAP_CID_SIGNALING;
+ chan->omtu = L2CAP_DEFAULT_MTU;
+ }
+
+ chan->local_id = L2CAP_BESTEFFORT_ID;
+ chan->local_stype = L2CAP_SERV_BESTEFFORT;
+ chan->local_msdu = L2CAP_DEFAULT_MAX_SDU_SIZE;
+ chan->local_sdu_itime = L2CAP_DEFAULT_SDU_ITIME;
+ chan->local_acc_lat = L2CAP_DEFAULT_ACC_LAT;
+ chan->local_flush_to = L2CAP_EFS_DEFAULT_FLUSH_TO;
+
+ l2cap_chan_hold(chan);
+
+ /* Only keep a reference for fixed channels if they requested it */
+ if (chan->chan_type != L2CAP_CHAN_FIXED ||
+ test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
+ hci_conn_hold(conn->hcon);
+
+ list_add(&chan->list, &conn->chan_l);
+}
+
+void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+ mutex_lock(&conn->chan_lock);
+ __l2cap_chan_add(conn, chan);
+ mutex_unlock(&conn->chan_lock);
+}
+
+void l2cap_chan_del(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ __clear_chan_timer(chan);
+
+ BT_DBG("chan %p, conn %p, err %d, state %s", chan, conn, err,
+ state_to_string(chan->state));
+
+ chan->ops->teardown(chan, err);
+
+ if (conn) {
+ struct amp_mgr *mgr = conn->hcon->amp_mgr;
+ /* Delete from channel list */
+ list_del(&chan->list);
+
+ l2cap_chan_put(chan);
+
+ chan->conn = NULL;
+
+ /* Reference was only held for non-fixed channels or
+ * fixed channels that explicitly requested it using the
+ * FLAG_HOLD_HCI_CONN flag.
+ */
+ if (chan->chan_type != L2CAP_CHAN_FIXED ||
+ test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
+ hci_conn_drop(conn->hcon);
+
+ if (mgr && mgr->bredr_chan == chan)
+ mgr->bredr_chan = NULL;
+ }
+
+ if (chan->hs_hchan) {
+ struct hci_chan *hs_hchan = chan->hs_hchan;
+
+ BT_DBG("chan %p disconnect hs_hchan %p", chan, hs_hchan);
+ amp_disconnect_logical_link(hs_hchan);
+ }
+
+ if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state))
+ return;
+
+ switch(chan->mode) {
+ case L2CAP_MODE_BASIC:
+ break;
+
+ case L2CAP_MODE_LE_FLOWCTL:
+ skb_queue_purge(&chan->tx_q);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ __clear_retrans_timer(chan);
+ __clear_monitor_timer(chan);
+ __clear_ack_timer(chan);
+
+ skb_queue_purge(&chan->srej_q);
+
+ l2cap_seq_list_free(&chan->srej_list);
+ l2cap_seq_list_free(&chan->retrans_list);
+
+ /* fall through */
+
+ case L2CAP_MODE_STREAMING:
+ skb_queue_purge(&chan->tx_q);
+ break;
+ }
+
+ return;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_del);
+
+static void l2cap_conn_update_id_addr(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ id_addr_update_work);
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan;
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+ bacpy(&chan->dst, &hcon->dst);
+ chan->dst_type = bdaddr_dst_type(hcon);
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+}
+
+static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_conn_rsp rsp;
+ u16 result;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ result = L2CAP_CR_AUTHORIZATION;
+ else
+ result = L2CAP_CR_BAD_PSM;
+
+ l2cap_state_change(chan, BT_DISCONN);
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ rsp.credits = cpu_to_le16(chan->rx_credits);
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
+ &rsp);
+}
+
+static void l2cap_chan_connect_reject(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_conn_rsp rsp;
+ u16 result;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ result = L2CAP_CR_SEC_BLOCK;
+ else
+ result = L2CAP_CR_BAD_PSM;
+
+ l2cap_state_change(chan, BT_DISCONN);
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+}
+
+void l2cap_chan_close(struct l2cap_chan *chan, int reason)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ switch (chan->state) {
+ case BT_LISTEN:
+ chan->ops->teardown(chan, 0);
+ break;
+
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+ l2cap_send_disconn_req(chan, reason);
+ } else
+ l2cap_chan_del(chan, reason);
+ break;
+
+ case BT_CONNECT2:
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
+ if (conn->hcon->type == ACL_LINK)
+ l2cap_chan_connect_reject(chan);
+ else if (conn->hcon->type == LE_LINK)
+ l2cap_chan_le_connect_reject(chan);
+ }
+
+ l2cap_chan_del(chan, reason);
+ break;
+
+ case BT_CONNECT:
+ case BT_DISCONN:
+ l2cap_chan_del(chan, reason);
+ break;
+
+ default:
+ chan->ops->teardown(chan, 0);
+ break;
+ }
+}
+EXPORT_SYMBOL(l2cap_chan_close);
+
+static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan)
+{
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_RAW:
+ switch (chan->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ return HCI_AT_DEDICATED_BONDING_MITM;
+ case BT_SECURITY_MEDIUM:
+ return HCI_AT_DEDICATED_BONDING;
+ default:
+ return HCI_AT_NO_BONDING;
+ }
+ break;
+ case L2CAP_CHAN_CONN_LESS:
+ if (chan->psm == cpu_to_le16(L2CAP_PSM_3DSP)) {
+ if (chan->sec_level == BT_SECURITY_LOW)
+ chan->sec_level = BT_SECURITY_SDP;
+ }
+ if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ return HCI_AT_NO_BONDING_MITM;
+ else
+ return HCI_AT_NO_BONDING;
+ break;
+ case L2CAP_CHAN_CONN_ORIENTED:
+ if (chan->psm == cpu_to_le16(L2CAP_PSM_SDP)) {
+ if (chan->sec_level == BT_SECURITY_LOW)
+ chan->sec_level = BT_SECURITY_SDP;
+
+ if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ return HCI_AT_NO_BONDING_MITM;
+ else
+ return HCI_AT_NO_BONDING;
+ }
+ /* fall through */
+ default:
+ switch (chan->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ return HCI_AT_GENERAL_BONDING_MITM;
+ case BT_SECURITY_MEDIUM:
+ return HCI_AT_GENERAL_BONDING;
+ default:
+ return HCI_AT_NO_BONDING;
+ }
+ break;
+ }
+}
+
+/* Service level security */
+int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator)
+{
+ struct l2cap_conn *conn = chan->conn;
+ __u8 auth_type;
+
+ if (conn->hcon->type == LE_LINK)
+ return smp_conn_security(conn->hcon, chan->sec_level);
+
+ auth_type = l2cap_get_auth_type(chan);
+
+ return hci_conn_security(conn->hcon, chan->sec_level, auth_type,
+ initiator);
+}
+
+static u8 l2cap_get_ident(struct l2cap_conn *conn)
+{
+ u8 id;
+
+ /* Get next available identificator.
+ * 1 - 128 are used by kernel.
+ * 129 - 199 are reserved.
+ * 200 - 254 are used by utilities like l2ping, etc.
+ */
+
+ mutex_lock(&conn->ident_lock);
+
+ if (++conn->tx_ident > 128)
+ conn->tx_ident = 1;
+
+ id = conn->tx_ident;
+
+ mutex_unlock(&conn->ident_lock);
+
+ return id;
+}
+
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
+ void *data)
+{
+ struct sk_buff *skb = l2cap_build_cmd(conn, code, ident, len, data);
+ u8 flags;
+
+ BT_DBG("code 0x%2.2x", code);
+
+ if (!skb)
+ return;
+
+ /* Use NO_FLUSH if supported or we have an LE link (which does
+ * not support auto-flushing packets) */
+ if (lmp_no_flush_capable(conn->hcon->hdev) ||
+ conn->hcon->type == LE_LINK)
+ flags = ACL_START_NO_FLUSH;
+ else
+ flags = ACL_START;
+
+ bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON;
+ skb->priority = HCI_PRIO_MAX;
+
+ hci_send_acl(conn->hchan, skb, flags);
+}
+
+static bool __chan_is_moving(struct l2cap_chan *chan)
+{
+ return chan->move_state != L2CAP_MOVE_STABLE &&
+ chan->move_state != L2CAP_MOVE_WAIT_PREPARE;
+}
+
+static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct hci_conn *hcon = chan->conn->hcon;
+ u16 flags;
+
+ BT_DBG("chan %p, skb %p len %d priority %u", chan, skb, skb->len,
+ skb->priority);
+
+ if (chan->hs_hcon && !__chan_is_moving(chan)) {
+ if (chan->hs_hchan)
+ hci_send_acl(chan->hs_hchan, skb, ACL_COMPLETE);
+ else
+ kfree_skb(skb);
+
+ return;
+ }
+
+ /* Use NO_FLUSH for LE links (where this is the only option) or
+ * if the BR/EDR link supports it and flushing has not been
+ * explicitly requested (through FLAG_FLUSHABLE).
+ */
+ if (hcon->type == LE_LINK ||
+ (!test_bit(FLAG_FLUSHABLE, &chan->flags) &&
+ lmp_no_flush_capable(hcon->hdev)))
+ flags = ACL_START_NO_FLUSH;
+ else
+ flags = ACL_START;
+
+ bt_cb(skb)->force_active = test_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ hci_send_acl(chan->conn->hchan, skb, flags);
+}
+
+static void __unpack_enhanced_control(u16 enh, struct l2cap_ctrl *control)
+{
+ control->reqseq = (enh & L2CAP_CTRL_REQSEQ) >> L2CAP_CTRL_REQSEQ_SHIFT;
+ control->final = (enh & L2CAP_CTRL_FINAL) >> L2CAP_CTRL_FINAL_SHIFT;
+
+ if (enh & L2CAP_CTRL_FRAME_TYPE) {
+ /* S-Frame */
+ control->sframe = 1;
+ control->poll = (enh & L2CAP_CTRL_POLL) >> L2CAP_CTRL_POLL_SHIFT;
+ control->super = (enh & L2CAP_CTRL_SUPERVISE) >> L2CAP_CTRL_SUPER_SHIFT;
+
+ control->sar = 0;
+ control->txseq = 0;
+ } else {
+ /* I-Frame */
+ control->sframe = 0;
+ control->sar = (enh & L2CAP_CTRL_SAR) >> L2CAP_CTRL_SAR_SHIFT;
+ control->txseq = (enh & L2CAP_CTRL_TXSEQ) >> L2CAP_CTRL_TXSEQ_SHIFT;
+
+ control->poll = 0;
+ control->super = 0;
+ }
+}
+
+static void __unpack_extended_control(u32 ext, struct l2cap_ctrl *control)
+{
+ control->reqseq = (ext & L2CAP_EXT_CTRL_REQSEQ) >> L2CAP_EXT_CTRL_REQSEQ_SHIFT;
+ control->final = (ext & L2CAP_EXT_CTRL_FINAL) >> L2CAP_EXT_CTRL_FINAL_SHIFT;
+
+ if (ext & L2CAP_EXT_CTRL_FRAME_TYPE) {
+ /* S-Frame */
+ control->sframe = 1;
+ control->poll = (ext & L2CAP_EXT_CTRL_POLL) >> L2CAP_EXT_CTRL_POLL_SHIFT;
+ control->super = (ext & L2CAP_EXT_CTRL_SUPERVISE) >> L2CAP_EXT_CTRL_SUPER_SHIFT;
+
+ control->sar = 0;
+ control->txseq = 0;
+ } else {
+ /* I-Frame */
+ control->sframe = 0;
+ control->sar = (ext & L2CAP_EXT_CTRL_SAR) >> L2CAP_EXT_CTRL_SAR_SHIFT;
+ control->txseq = (ext & L2CAP_EXT_CTRL_TXSEQ) >> L2CAP_EXT_CTRL_TXSEQ_SHIFT;
+
+ control->poll = 0;
+ control->super = 0;
+ }
+}
+
+static inline void __unpack_control(struct l2cap_chan *chan,
+ struct sk_buff *skb)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ __unpack_extended_control(get_unaligned_le32(skb->data),
+ &bt_cb(skb)->l2cap);
+ skb_pull(skb, L2CAP_EXT_CTRL_SIZE);
+ } else {
+ __unpack_enhanced_control(get_unaligned_le16(skb->data),
+ &bt_cb(skb)->l2cap);
+ skb_pull(skb, L2CAP_ENH_CTRL_SIZE);
+ }
+}
+
+static u32 __pack_extended_control(struct l2cap_ctrl *control)
+{
+ u32 packed;
+
+ packed = control->reqseq << L2CAP_EXT_CTRL_REQSEQ_SHIFT;
+ packed |= control->final << L2CAP_EXT_CTRL_FINAL_SHIFT;
+
+ if (control->sframe) {
+ packed |= control->poll << L2CAP_EXT_CTRL_POLL_SHIFT;
+ packed |= control->super << L2CAP_EXT_CTRL_SUPER_SHIFT;
+ packed |= L2CAP_EXT_CTRL_FRAME_TYPE;
+ } else {
+ packed |= control->sar << L2CAP_EXT_CTRL_SAR_SHIFT;
+ packed |= control->txseq << L2CAP_EXT_CTRL_TXSEQ_SHIFT;
+ }
+
+ return packed;
+}
+
+static u16 __pack_enhanced_control(struct l2cap_ctrl *control)
+{
+ u16 packed;
+
+ packed = control->reqseq << L2CAP_CTRL_REQSEQ_SHIFT;
+ packed |= control->final << L2CAP_CTRL_FINAL_SHIFT;
+
+ if (control->sframe) {
+ packed |= control->poll << L2CAP_CTRL_POLL_SHIFT;
+ packed |= control->super << L2CAP_CTRL_SUPER_SHIFT;
+ packed |= L2CAP_CTRL_FRAME_TYPE;
+ } else {
+ packed |= control->sar << L2CAP_CTRL_SAR_SHIFT;
+ packed |= control->txseq << L2CAP_CTRL_TXSEQ_SHIFT;
+ }
+
+ return packed;
+}
+
+static inline void __pack_control(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ put_unaligned_le32(__pack_extended_control(control),
+ skb->data + L2CAP_HDR_SIZE);
+ } else {
+ put_unaligned_le16(__pack_enhanced_control(control),
+ skb->data + L2CAP_HDR_SIZE);
+ }
+}
+
+static inline unsigned int __ertm_hdr_size(struct l2cap_chan *chan)
+{
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ return L2CAP_EXT_HDR_SIZE;
+ else
+ return L2CAP_ENH_HDR_SIZE;
+}
+
+static struct sk_buff *l2cap_create_sframe_pdu(struct l2cap_chan *chan,
+ u32 control)
+{
+ struct sk_buff *skb;
+ struct l2cap_hdr *lh;
+ int hlen = __ertm_hdr_size(chan);
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ hlen += L2CAP_FCS_SIZE;
+
+ skb = bt_skb_alloc(hlen, GFP_KERNEL);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->len = cpu_to_le16(hlen - L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ put_unaligned_le32(control, skb_put(skb, L2CAP_EXT_CTRL_SIZE));
+ else
+ put_unaligned_le16(control, skb_put(skb, L2CAP_ENH_CTRL_SIZE));
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *)skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ skb->priority = HCI_PRIO_MAX;
+ return skb;
+}
+
+static void l2cap_send_sframe(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+ u32 control_field;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (!control->sframe)
+ return;
+
+ if (__chan_is_moving(chan))
+ return;
+
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state) &&
+ !control->poll)
+ control->final = 1;
+
+ if (control->super == L2CAP_SUPER_RR)
+ clear_bit(CONN_RNR_SENT, &chan->conn_state);
+ else if (control->super == L2CAP_SUPER_RNR)
+ set_bit(CONN_RNR_SENT, &chan->conn_state);
+
+ if (control->super != L2CAP_SUPER_SREJ) {
+ chan->last_acked_seq = control->reqseq;
+ __clear_ack_timer(chan);
+ }
+
+ BT_DBG("reqseq %d, final %d, poll %d, super %d", control->reqseq,
+ control->final, control->poll, control->super);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ control_field = __pack_extended_control(control);
+ else
+ control_field = __pack_enhanced_control(control);
+
+ skb = l2cap_create_sframe_pdu(chan, control_field);
+ if (!IS_ERR(skb))
+ l2cap_do_send(chan, skb);
+}
+
+static void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, bool poll)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p, poll %d", chan, poll);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.poll = poll;
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state))
+ control.super = L2CAP_SUPER_RNR;
+ else
+ control.super = L2CAP_SUPER_RR;
+
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+}
+
+static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan)
+{
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED)
+ return true;
+
+ return !test_bit(CONF_CONNECT_PEND, &chan->conf_state);
+}
+
+static bool __amp_capable(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_dev *hdev;
+ bool amp_available = false;
+
+ if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
+ return false;
+
+ if (!(conn->remote_fixed_chan & L2CAP_FC_A2MP))
+ return false;
+
+ read_lock(&hci_dev_list_lock);
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ if (hdev->amp_type != AMP_TYPE_BREDR &&
+ test_bit(HCI_UP, &hdev->flags)) {
+ amp_available = true;
+ break;
+ }
+ }
+ read_unlock(&hci_dev_list_lock);
+
+ if (chan->chan_policy == BT_CHANNEL_POLICY_AMP_PREFERRED)
+ return amp_available;
+
+ return false;
+}
+
+static bool l2cap_check_efs(struct l2cap_chan *chan)
+{
+ /* Check EFS parameters */
+ return true;
+}
+
+void l2cap_send_conn_req(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_conn_req req;
+
+ req.scid = cpu_to_le16(chan->scid);
+ req.psm = chan->psm;
+
+ chan->ident = l2cap_get_ident(conn);
+
+ set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, sizeof(req), &req);
+}
+
+static void l2cap_send_create_chan_req(struct l2cap_chan *chan, u8 amp_id)
+{
+ struct l2cap_create_chan_req req;
+ req.scid = cpu_to_le16(chan->scid);
+ req.psm = chan->psm;
+ req.amp_id = amp_id;
+
+ chan->ident = l2cap_get_ident(chan->conn);
+
+ l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_REQ,
+ sizeof(req), &req);
+}
+
+static void l2cap_move_setup(struct l2cap_chan *chan)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return;
+
+ __clear_retrans_timer(chan);
+ __clear_monitor_timer(chan);
+ __clear_ack_timer(chan);
+
+ chan->retry_count = 0;
+ skb_queue_walk(&chan->tx_q, skb) {
+ if (bt_cb(skb)->l2cap.retries)
+ bt_cb(skb)->l2cap.retries = 1;
+ else
+ break;
+ }
+
+ chan->expected_tx_seq = chan->buffer_seq;
+
+ clear_bit(CONN_REJ_ACT, &chan->conn_state);
+ clear_bit(CONN_SREJ_ACT, &chan->conn_state);
+ l2cap_seq_list_clear(&chan->retrans_list);
+ l2cap_seq_list_clear(&chan->srej_list);
+ skb_queue_purge(&chan->srej_q);
+
+ chan->tx_state = L2CAP_TX_STATE_XMIT;
+ chan->rx_state = L2CAP_RX_STATE_MOVE;
+
+ set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+}
+
+static void l2cap_move_done(struct l2cap_chan *chan)
+{
+ u8 move_role = chan->move_role;
+ BT_DBG("chan %p", chan);
+
+ chan->move_state = L2CAP_MOVE_STABLE;
+ chan->move_role = L2CAP_MOVE_ROLE_NONE;
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return;
+
+ switch (move_role) {
+ case L2CAP_MOVE_ROLE_INITIATOR:
+ l2cap_tx(chan, NULL, NULL, L2CAP_EV_EXPLICIT_POLL);
+ chan->rx_state = L2CAP_RX_STATE_WAIT_F;
+ break;
+ case L2CAP_MOVE_ROLE_RESPONDER:
+ chan->rx_state = L2CAP_RX_STATE_WAIT_P;
+ break;
+ }
+}
+
+static void l2cap_chan_ready(struct l2cap_chan *chan)
+{
+ /* The channel may have already been flagged as connected in
+ * case of receiving data before the L2CAP info req/rsp
+ * procedure is complete.
+ */
+ if (chan->state == BT_CONNECTED)
+ return;
+
+ /* This clears all conf flags, including CONF_NOT_COMPLETE */
+ chan->conf_state = 0;
+ __clear_chan_timer(chan);
+
+ if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits)
+ chan->ops->suspend(chan);
+
+ chan->state = BT_CONNECTED;
+
+ chan->ops->ready(chan);
+}
+
+static void l2cap_le_connect(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_conn_req req;
+
+ if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ req.psm = chan->psm;
+ req.scid = cpu_to_le16(chan->scid);
+ req.mtu = cpu_to_le16(chan->imtu);
+ req.mps = cpu_to_le16(chan->mps);
+ req.credits = cpu_to_le16(chan->rx_credits);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_REQ,
+ sizeof(req), &req);
+}
+
+static void l2cap_le_start(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ if (!smp_conn_security(conn->hcon, chan->sec_level))
+ return;
+
+ if (!chan->psm) {
+ l2cap_chan_ready(chan);
+ return;
+ }
+
+ if (chan->state == BT_CONNECT)
+ l2cap_le_connect(chan);
+}
+
+static void l2cap_start_connection(struct l2cap_chan *chan)
+{
+ if (__amp_capable(chan)) {
+ BT_DBG("chan %p AMP capable: discover AMPs", chan);
+ a2mp_discover_amp(chan);
+ } else if (chan->conn->hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ } else {
+ l2cap_send_conn_req(chan);
+ }
+}
+
+static void l2cap_request_info(struct l2cap_conn *conn)
+{
+ struct l2cap_info_req req;
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
+ return;
+
+ req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+ conn->info_ident = l2cap_get_ident(conn);
+
+ schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT);
+
+ l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ,
+ sizeof(req), &req);
+}
+
+static void l2cap_do_start(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ if (conn->hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ return;
+ }
+
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)) {
+ l2cap_request_info(conn);
+ return;
+ }
+
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
+ return;
+
+ if (l2cap_chan_check_security(chan, true) &&
+ __l2cap_no_conn_pending(chan))
+ l2cap_start_connection(chan);
+}
+
+static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
+{
+ u32 local_feat_mask = l2cap_feat_mask;
+ if (!disable_ertm)
+ local_feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING;
+
+ switch (mode) {
+ case L2CAP_MODE_ERTM:
+ return L2CAP_FEAT_ERTM & feat_mask & local_feat_mask;
+ case L2CAP_MODE_STREAMING:
+ return L2CAP_FEAT_STREAMING & feat_mask & local_feat_mask;
+ default:
+ return 0x00;
+ }
+}
+
+static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_disconn_req req;
+
+ if (!conn)
+ return;
+
+ if (chan->mode == L2CAP_MODE_ERTM && chan->state == BT_CONNECTED) {
+ __clear_retrans_timer(chan);
+ __clear_monitor_timer(chan);
+ __clear_ack_timer(chan);
+ }
+
+ if (chan->scid == L2CAP_CID_A2MP) {
+ l2cap_state_change(chan, BT_DISCONN);
+ return;
+ }
+
+ req.dcid = cpu_to_le16(chan->dcid);
+ req.scid = cpu_to_le16(chan->scid);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ,
+ sizeof(req), &req);
+
+ l2cap_state_change_and_error(chan, BT_DISCONN, err);
+}
+
+/* ---- L2CAP connections ---- */
+static void l2cap_conn_start(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan, *tmp;
+
+ BT_DBG("conn %p", conn);
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ l2cap_chan_ready(chan);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (chan->state == BT_CONNECT) {
+ if (!l2cap_chan_check_security(chan, true) ||
+ !__l2cap_no_conn_pending(chan)) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (!l2cap_mode_supported(chan->mode, conn->feat_mask)
+ && test_bit(CONF_STATE2_DEVICE,
+ &chan->conf_state)) {
+ l2cap_chan_close(chan, ECONNRESET);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ l2cap_start_connection(chan);
+
+ } else if (chan->state == BT_CONNECT2) {
+ struct l2cap_conn_rsp rsp;
+ char buf[128];
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+
+ if (l2cap_chan_check_security(chan, false)) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
+ chan->ops->defer(chan);
+
+ } else {
+ l2cap_state_change(chan, BT_CONFIG);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ }
+ } else {
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
+ }
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+ sizeof(rsp), &rsp);
+
+ if (test_bit(CONF_REQ_SENT, &chan->conf_state) ||
+ rsp.result != L2CAP_CR_SUCCESS) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf), buf);
+ chan->num_conf_req++;
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+}
+
+static void l2cap_le_conn_ready(struct l2cap_conn *conn)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+
+ BT_DBG("%s conn %p", hdev->name, conn);
+
+ /* For outgoing pairing which doesn't necessarily have an
+ * associated socket (e.g. mgmt_pair_device).
+ */
+ if (hcon->out)
+ smp_conn_security(hcon, hcon->pending_sec_level);
+
+ /* For LE slave connections, make sure the connection interval
+ * is in the range of the minium and maximum interval that has
+ * been configured for this connection. If not, then trigger
+ * the connection update procedure.
+ */
+ if (hcon->role == HCI_ROLE_SLAVE &&
+ (hcon->le_conn_interval < hcon->le_conn_min_interval ||
+ hcon->le_conn_interval > hcon->le_conn_max_interval)) {
+ struct l2cap_conn_param_update_req req;
+
+ req.min = cpu_to_le16(hcon->le_conn_min_interval);
+ req.max = cpu_to_le16(hcon->le_conn_max_interval);
+ req.latency = cpu_to_le16(hcon->le_conn_latency);
+ req.to_multiplier = cpu_to_le16(hcon->le_supv_timeout);
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONN_PARAM_UPDATE_REQ, sizeof(req), &req);
+ }
+}
+
+static void l2cap_conn_ready(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan;
+ struct hci_conn *hcon = conn->hcon;
+
+ BT_DBG("conn %p", conn);
+
+ if (hcon->type == ACL_LINK)
+ l2cap_request_info(conn);
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+
+ l2cap_chan_lock(chan);
+
+ if (chan->scid == L2CAP_CID_A2MP) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (hcon->type == LE_LINK) {
+ l2cap_le_start(chan);
+ } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)
+ l2cap_chan_ready(chan);
+ } else if (chan->state == BT_CONNECT) {
+ l2cap_do_start(chan);
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+
+ if (hcon->type == LE_LINK)
+ l2cap_le_conn_ready(conn);
+
+ queue_work(hcon->hdev->workqueue, &conn->pending_rx_work);
+}
+
+/* Notify sockets that we cannot guaranty reliability anymore */
+static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
+{
+ struct l2cap_chan *chan;
+
+ BT_DBG("conn %p", conn);
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
+ l2cap_chan_set_err(chan, err);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+}
+
+static void l2cap_info_timeout(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ info_timer.work);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+}
+
+/*
+ * l2cap_user
+ * External modules can register l2cap_user objects on l2cap_conn. The ->probe
+ * callback is called during registration. The ->remove callback is called
+ * during unregistration.
+ * An l2cap_user object can either be explicitly unregistered or when the
+ * underlying l2cap_conn object is deleted. This guarantees that l2cap->hcon,
+ * l2cap->hchan, .. are valid as long as the remove callback hasn't been called.
+ * External modules must own a reference to the l2cap_conn object if they intend
+ * to call l2cap_unregister_user(). The l2cap_conn object might get destroyed at
+ * any time if they don't.
+ */
+
+int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user)
+{
+ struct hci_dev *hdev = conn->hcon->hdev;
+ int ret;
+
+ /* We need to check whether l2cap_conn is registered. If it is not, we
+ * must not register the l2cap_user. l2cap_conn_del() is unregisters
+ * l2cap_conn objects, but doesn't provide its own locking. Instead, it
+ * relies on the parent hci_conn object to be locked. This itself relies
+ * on the hci_dev object to be locked. So we must lock the hci device
+ * here, too. */
+
+ hci_dev_lock(hdev);
+
+ if (user->list.next || user->list.prev) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* conn->hchan is NULL after l2cap_conn_del() was called */
+ if (!conn->hchan) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = user->probe(conn, user);
+ if (ret)
+ goto out_unlock;
+
+ list_add(&user->list, &conn->users);
+ ret = 0;
+
+out_unlock:
+ hci_dev_unlock(hdev);
+ return ret;
+}
+EXPORT_SYMBOL(l2cap_register_user);
+
+void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user)
+{
+ struct hci_dev *hdev = conn->hcon->hdev;
+
+ hci_dev_lock(hdev);
+
+ if (!user->list.next || !user->list.prev)
+ goto out_unlock;
+
+ list_del(&user->list);
+ user->list.next = NULL;
+ user->list.prev = NULL;
+ user->remove(conn, user);
+
+out_unlock:
+ hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(l2cap_unregister_user);
+
+static void l2cap_unregister_all_users(struct l2cap_conn *conn)
+{
+ struct l2cap_user *user;
+
+ while (!list_empty(&conn->users)) {
+ user = list_first_entry(&conn->users, struct l2cap_user, list);
+ list_del(&user->list);
+ user->list.next = NULL;
+ user->list.prev = NULL;
+ user->remove(conn, user);
+ }
+}
+
+static void l2cap_conn_del(struct hci_conn *hcon, int err)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan, *l;
+
+ if (!conn)
+ return;
+
+ BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+ kfree_skb(conn->rx_skb);
+
+ skb_queue_purge(&conn->pending_rx);
+
+ /* We can not call flush_work(&conn->pending_rx_work) here since we
+ * might block if we are running on a worker from the same workqueue
+ * pending_rx_work is waiting on.
+ */
+ if (work_pending(&conn->pending_rx_work))
+ cancel_work_sync(&conn->pending_rx_work);
+
+ if (work_pending(&conn->id_addr_update_work))
+ cancel_work_sync(&conn->id_addr_update_work);
+
+ l2cap_unregister_all_users(conn);
+
+ /* Force the connection to be immediately dropped */
+ hcon->disc_timeout = 0;
+
+ mutex_lock(&conn->chan_lock);
+
+ /* Kill channels */
+ list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
+
+ l2cap_chan_del(chan, err);
+
+ l2cap_chan_unlock(chan);
+
+ chan->ops->close(chan);
+ l2cap_chan_put(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+
+ hci_chan_del(conn->hchan);
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
+ cancel_delayed_work_sync(&conn->info_timer);
+
+ hcon->l2cap_data = NULL;
+ conn->hchan = NULL;
+ l2cap_conn_put(conn);
+}
+
+static void l2cap_conn_free(struct kref *ref)
+{
+ struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref);
+
+ hci_conn_put(conn->hcon);
+ kfree(conn);
+}
+
+struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn)
+{
+ kref_get(&conn->ref);
+ return conn;
+}
+EXPORT_SYMBOL(l2cap_conn_get);
+
+void l2cap_conn_put(struct l2cap_conn *conn)
+{
+ kref_put(&conn->ref, l2cap_conn_free);
+}
+EXPORT_SYMBOL(l2cap_conn_put);
+
+/* ---- Socket interface ---- */
+
+/* Find socket with psm and source / destination bdaddr.
+ * Returns closest match.
+ */
+static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
+ bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 link_type)
+{
+ struct l2cap_chan *c, *c1 = NULL;
+
+ read_lock(&chan_list_lock);
+
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (state && c->state != state)
+ continue;
+
+ if (link_type == ACL_LINK && c->src_type != BDADDR_BREDR)
+ continue;
+
+ if (link_type == LE_LINK && c->src_type == BDADDR_BREDR)
+ continue;
+
+ if (c->psm == psm) {
+ int src_match, dst_match;
+ int src_any, dst_any;
+
+ /* Exact match. */
+ src_match = !bacmp(&c->src, src);
+ dst_match = !bacmp(&c->dst, dst);
+ if (src_match && dst_match) {
+ l2cap_chan_hold(c);
+ read_unlock(&chan_list_lock);
+ return c;
+ }
+
+ /* Closest match */
+ src_any = !bacmp(&c->src, BDADDR_ANY);
+ dst_any = !bacmp(&c->dst, BDADDR_ANY);
+ if ((src_match && dst_any) || (src_any && dst_match) ||
+ (src_any && dst_any))
+ c1 = c;
+ }
+ }
+
+ if (c1)
+ l2cap_chan_hold(c1);
+
+ read_unlock(&chan_list_lock);
+
+ return c1;
+}
+
+static void l2cap_monitor_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ monitor_timer.work);
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ if (!chan->conn) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return;
+ }
+
+ l2cap_tx(chan, NULL, NULL, L2CAP_EV_MONITOR_TO);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+static void l2cap_retrans_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ retrans_timer.work);
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ if (!chan->conn) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return;
+ }
+
+ l2cap_tx(chan, NULL, NULL, L2CAP_EV_RETRANS_TO);
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+static void l2cap_streaming_send(struct l2cap_chan *chan,
+ struct sk_buff_head *skbs)
+{
+ struct sk_buff *skb;
+ struct l2cap_ctrl *control;
+
+ BT_DBG("chan %p, skbs %p", chan, skbs);
+
+ if (__chan_is_moving(chan))
+ return;
+
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+
+ while (!skb_queue_empty(&chan->tx_q)) {
+
+ skb = skb_dequeue(&chan->tx_q);
+
+ bt_cb(skb)->l2cap.retries = 1;
+ control = &bt_cb(skb)->l2cap;
+
+ control->reqseq = 0;
+ control->txseq = chan->next_tx_seq;
+
+ __pack_control(chan, control, skb);
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ l2cap_do_send(chan, skb);
+
+ BT_DBG("Sent txseq %u", control->txseq);
+
+ chan->next_tx_seq = __next_seq(chan, chan->next_tx_seq);
+ chan->frames_sent++;
+ }
+}
+
+static int l2cap_ertm_send(struct l2cap_chan *chan)
+{
+ struct sk_buff *skb, *tx_skb;
+ struct l2cap_ctrl *control;
+ int sent = 0;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return 0;
+
+ if (__chan_is_moving(chan))
+ return 0;
+
+ while (chan->tx_send_head &&
+ chan->unacked_frames < chan->remote_tx_win &&
+ chan->tx_state == L2CAP_TX_STATE_XMIT) {
+
+ skb = chan->tx_send_head;
+
+ bt_cb(skb)->l2cap.retries = 1;
+ control = &bt_cb(skb)->l2cap;
+
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+ control->final = 1;
+
+ control->reqseq = chan->buffer_seq;
+ chan->last_acked_seq = chan->buffer_seq;
+ control->txseq = chan->next_tx_seq;
+
+ __pack_control(chan, control, skb);
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) skb->data, skb->len);
+ put_unaligned_le16(fcs, skb_put(skb, L2CAP_FCS_SIZE));
+ }
+
+ /* Clone after data has been modified. Data is assumed to be
+ read-only (for locking purposes) on cloned sk_buffs.
+ */
+ tx_skb = skb_clone(skb, GFP_KERNEL);
+
+ if (!tx_skb)
+ break;
+
+ __set_retrans_timer(chan);
+
+ chan->next_tx_seq = __next_seq(chan, chan->next_tx_seq);
+ chan->unacked_frames++;
+ chan->frames_sent++;
+ sent++;
+
+ if (skb_queue_is_last(&chan->tx_q, skb))
+ chan->tx_send_head = NULL;
+ else
+ chan->tx_send_head = skb_queue_next(&chan->tx_q, skb);
+
+ l2cap_do_send(chan, tx_skb);
+ BT_DBG("Sent txseq %u", control->txseq);
+ }
+
+ BT_DBG("Sent %d, %u unacked, %u in ERTM queue", sent,
+ chan->unacked_frames, skb_queue_len(&chan->tx_q));
+
+ return sent;
+}
+
+static void l2cap_ertm_resend(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+ struct sk_buff *skb;
+ struct sk_buff *tx_skb;
+ u16 seq;
+
+ BT_DBG("chan %p", chan);
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return;
+
+ if (__chan_is_moving(chan))
+ return;
+
+ while (chan->retrans_list.head != L2CAP_SEQ_LIST_CLEAR) {
+ seq = l2cap_seq_list_pop(&chan->retrans_list);
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, seq);
+ if (!skb) {
+ BT_DBG("Error: Can't retransmit seq %d, frame missing",
+ seq);
+ continue;
+ }
+
+ bt_cb(skb)->l2cap.retries++;
+ control = bt_cb(skb)->l2cap;
+
+ if (chan->max_tx != 0 &&
+ bt_cb(skb)->l2cap.retries > chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ }
+
+ control.reqseq = chan->buffer_seq;
+ if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+ control.final = 1;
+ else
+ control.final = 0;
+
+ if (skb_cloned(skb)) {
+ /* Cloned sk_buffs are read-only, so we need a
+ * writeable copy
+ */
+ tx_skb = skb_copy(skb, GFP_KERNEL);
+ } else {
+ tx_skb = skb_clone(skb, GFP_KERNEL);
+ }
+
+ if (!tx_skb) {
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ }
+
+ /* Update skb contents */
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags)) {
+ put_unaligned_le32(__pack_extended_control(&control),
+ tx_skb->data + L2CAP_HDR_SIZE);
+ } else {
+ put_unaligned_le16(__pack_enhanced_control(&control),
+ tx_skb->data + L2CAP_HDR_SIZE);
+ }
+
+ /* Update FCS */
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ u16 fcs = crc16(0, (u8 *) tx_skb->data,
+ tx_skb->len - L2CAP_FCS_SIZE);
+ put_unaligned_le16(fcs, skb_tail_pointer(tx_skb) -
+ L2CAP_FCS_SIZE);
+ }
+
+ l2cap_do_send(chan, tx_skb);
+
+ BT_DBG("Resent txseq %d", control.txseq);
+
+ chan->last_acked_seq = chan->buffer_seq;
+ }
+}
+
+static void l2cap_retransmit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+
+ l2cap_seq_list_append(&chan->retrans_list, control->reqseq);
+ l2cap_ertm_resend(chan);
+}
+
+static void l2cap_retransmit_all(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->poll)
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+
+ l2cap_seq_list_clear(&chan->retrans_list);
+
+ if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+ return;
+
+ if (chan->unacked_frames) {
+ skb_queue_walk(&chan->tx_q, skb) {
+ if (bt_cb(skb)->l2cap.txseq == control->reqseq ||
+ skb == chan->tx_send_head)
+ break;
+ }
+
+ skb_queue_walk_from(&chan->tx_q, skb) {
+ if (skb == chan->tx_send_head)
+ break;
+
+ l2cap_seq_list_append(&chan->retrans_list,
+ bt_cb(skb)->l2cap.txseq);
+ }
+
+ l2cap_ertm_resend(chan);
+ }
+}
+
+static void l2cap_send_ack(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+ u16 frames_to_ack = __seq_offset(chan, chan->buffer_seq,
+ chan->last_acked_seq);
+ int threshold;
+
+ BT_DBG("chan %p last_acked_seq %d buffer_seq %d",
+ chan, chan->last_acked_seq, chan->buffer_seq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state) &&
+ chan->rx_state == L2CAP_RX_STATE_RECV) {
+ __clear_ack_timer(chan);
+ control.super = L2CAP_SUPER_RNR;
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+ } else {
+ if (!test_bit(CONN_REMOTE_BUSY, &chan->conn_state)) {
+ l2cap_ertm_send(chan);
+ /* If any i-frames were sent, they included an ack */
+ if (chan->buffer_seq == chan->last_acked_seq)
+ frames_to_ack = 0;
+ }
+
+ /* Ack now if the window is 3/4ths full.
+ * Calculate without mul or div
+ */
+ threshold = chan->ack_win;
+ threshold += threshold << 1;
+ threshold >>= 2;
+
+ BT_DBG("frames_to_ack %u, threshold %d", frames_to_ack,
+ threshold);
+
+ if (frames_to_ack >= threshold) {
+ __clear_ack_timer(chan);
+ control.super = L2CAP_SUPER_RR;
+ control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &control);
+ frames_to_ack = 0;
+ }
+
+ if (frames_to_ack)
+ __set_ack_timer(chan);
+ }
+}
+
+static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan,
+ struct msghdr *msg, int len,
+ int count, struct sk_buff *skb)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff **frag;
+ int sent = 0;
+
+ if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count)
+ return -EFAULT;
+
+ sent += count;
+ len -= count;
+
+ /* Continuation fragments (no L2CAP header) */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ struct sk_buff *tmp;
+
+ count = min_t(unsigned int, conn->mtu, len);
+
+ tmp = chan->ops->alloc_skb(chan, 0, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ *frag = tmp;
+
+ if (copy_from_iter(skb_put(*frag, count), count,
+ &msg->msg_iter) != count)
+ return -EFAULT;
+
+ sent += count;
+ len -= count;
+
+ skb->len += (*frag)->len;
+ skb->data_len += (*frag)->len;
+
+ frag = &(*frag)->next;
+ }
+
+ return sent;
+}
+
+static struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen = L2CAP_HDR_SIZE + L2CAP_PSMLEN_SIZE;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p psm 0x%2.2x len %zu", chan,
+ __le16_to_cpu(chan->psm), len);
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + L2CAP_PSMLEN_SIZE);
+ put_unaligned(chan->psm, (__le16 *) skb_put(skb, L2CAP_PSMLEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ return skb;
+}
+
+static struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ count = min_t(unsigned int, (conn->mtu - L2CAP_HDR_SIZE), len);
+
+ skb = chan->ops->alloc_skb(chan, L2CAP_HDR_SIZE, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len);
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ return skb;
+}
+
+static struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg, size_t len,
+ u16 sdulen)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ if (!conn)
+ return ERR_PTR(-ENOTCONN);
+
+ hlen = __ertm_hdr_size(chan);
+
+ if (sdulen)
+ hlen += L2CAP_SDULEN_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ hlen += L2CAP_FCS_SIZE;
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE));
+
+ /* Control header is populated later */
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ put_unaligned_le32(0, skb_put(skb, L2CAP_EXT_CTRL_SIZE));
+ else
+ put_unaligned_le16(0, skb_put(skb, L2CAP_ENH_CTRL_SIZE));
+
+ if (sdulen)
+ put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ bt_cb(skb)->l2cap.fcs = chan->fcs;
+ bt_cb(skb)->l2cap.retries = 0;
+ return skb;
+}
+
+static int l2cap_segment_sdu(struct l2cap_chan *chan,
+ struct sk_buff_head *seg_queue,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ u16 sdu_len;
+ size_t pdu_len;
+ u8 sar;
+
+ BT_DBG("chan %p, msg %p, len %zu", chan, msg, len);
+
+ /* It is critical that ERTM PDUs fit in a single HCI fragment,
+ * so fragmented skbs are not used. The HCI layer's handling
+ * of fragmented skbs is not compatible with ERTM's queueing.
+ */
+
+ /* PDU size is derived from the HCI MTU */
+ pdu_len = chan->conn->mtu;
+
+ /* Constrain PDU size for BR/EDR connections */
+ if (!chan->hs_hcon)
+ pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
+
+ /* Adjust for largest possible L2CAP overhead. */
+ if (chan->fcs)
+ pdu_len -= L2CAP_FCS_SIZE;
+
+ pdu_len -= __ertm_hdr_size(chan);
+
+ /* Remote device may have requested smaller PDUs */
+ pdu_len = min_t(size_t, pdu_len, chan->remote_mps);
+
+ if (len <= pdu_len) {
+ sar = L2CAP_SAR_UNSEGMENTED;
+ sdu_len = 0;
+ pdu_len = len;
+ } else {
+ sar = L2CAP_SAR_START;
+ sdu_len = len;
+ }
+
+ while (len > 0) {
+ skb = l2cap_create_iframe_pdu(chan, msg, pdu_len, sdu_len);
+
+ if (IS_ERR(skb)) {
+ __skb_queue_purge(seg_queue);
+ return PTR_ERR(skb);
+ }
+
+ bt_cb(skb)->l2cap.sar = sar;
+ __skb_queue_tail(seg_queue, skb);
+
+ len -= pdu_len;
+ if (sdu_len)
+ sdu_len = 0;
+
+ if (len <= pdu_len) {
+ sar = L2CAP_SAR_END;
+ pdu_len = len;
+ } else {
+ sar = L2CAP_SAR_CONTINUE;
+ }
+ }
+
+ return 0;
+}
+
+static struct sk_buff *l2cap_create_le_flowctl_pdu(struct l2cap_chan *chan,
+ struct msghdr *msg,
+ size_t len, u16 sdulen)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct sk_buff *skb;
+ int err, count, hlen;
+ struct l2cap_hdr *lh;
+
+ BT_DBG("chan %p len %zu", chan, len);
+
+ if (!conn)
+ return ERR_PTR(-ENOTCONN);
+
+ hlen = L2CAP_HDR_SIZE;
+
+ if (sdulen)
+ hlen += L2CAP_SDULEN_SIZE;
+
+ count = min_t(unsigned int, (conn->mtu - hlen), len);
+
+ skb = chan->ops->alloc_skb(chan, hlen, count,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (IS_ERR(skb))
+ return skb;
+
+ /* Create L2CAP header */
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->cid = cpu_to_le16(chan->dcid);
+ lh->len = cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE));
+
+ if (sdulen)
+ put_unaligned_le16(sdulen, skb_put(skb, L2CAP_SDULEN_SIZE));
+
+ err = l2cap_skbuff_fromiovec(chan, msg, len, count, skb);
+ if (unlikely(err < 0)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static int l2cap_segment_le_sdu(struct l2cap_chan *chan,
+ struct sk_buff_head *seg_queue,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ size_t pdu_len;
+ u16 sdu_len;
+
+ BT_DBG("chan %p, msg %p, len %zu", chan, msg, len);
+
+ sdu_len = len;
+ pdu_len = chan->remote_mps - L2CAP_SDULEN_SIZE;
+
+ while (len > 0) {
+ if (len <= pdu_len)
+ pdu_len = len;
+
+ skb = l2cap_create_le_flowctl_pdu(chan, msg, pdu_len, sdu_len);
+ if (IS_ERR(skb)) {
+ __skb_queue_purge(seg_queue);
+ return PTR_ERR(skb);
+ }
+
+ __skb_queue_tail(seg_queue, skb);
+
+ len -= pdu_len;
+
+ if (sdu_len) {
+ sdu_len = 0;
+ pdu_len += L2CAP_SDULEN_SIZE;
+ }
+ }
+
+ return 0;
+}
+
+int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ int err;
+ struct sk_buff_head seg_queue;
+
+ if (!chan->conn)
+ return -ENOTCONN;
+
+ /* Connectionless channel */
+ if (chan->chan_type == L2CAP_CHAN_CONN_LESS) {
+ skb = l2cap_create_connless_pdu(chan, msg, len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Channel lock is released before requesting new skb and then
+ * reacquired thus we need to recheck channel state.
+ */
+ if (chan->state != BT_CONNECTED) {
+ kfree_skb(skb);
+ return -ENOTCONN;
+ }
+
+ l2cap_do_send(chan, skb);
+ return len;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ /* Check outgoing MTU */
+ if (len > chan->omtu)
+ return -EMSGSIZE;
+
+ if (!chan->tx_credits)
+ return -EAGAIN;
+
+ __skb_queue_head_init(&seg_queue);
+
+ err = l2cap_segment_le_sdu(chan, &seg_queue, msg, len);
+
+ if (chan->state != BT_CONNECTED) {
+ __skb_queue_purge(&seg_queue);
+ err = -ENOTCONN;
+ }
+
+ if (err)
+ return err;
+
+ skb_queue_splice_tail_init(&seg_queue, &chan->tx_q);
+
+ while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) {
+ l2cap_do_send(chan, skb_dequeue(&chan->tx_q));
+ chan->tx_credits--;
+ }
+
+ if (!chan->tx_credits)
+ chan->ops->suspend(chan);
+
+ err = len;
+
+ break;
+
+ case L2CAP_MODE_BASIC:
+ /* Check outgoing MTU */
+ if (len > chan->omtu)
+ return -EMSGSIZE;
+
+ /* Create a basic PDU */
+ skb = l2cap_create_basic_pdu(chan, msg, len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Channel lock is released before requesting new skb and then
+ * reacquired thus we need to recheck channel state.
+ */
+ if (chan->state != BT_CONNECTED) {
+ kfree_skb(skb);
+ return -ENOTCONN;
+ }
+
+ l2cap_do_send(chan, skb);
+ err = len;
+ break;
+
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ /* Check outgoing MTU */
+ if (len > chan->omtu) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ __skb_queue_head_init(&seg_queue);
+
+ /* Do segmentation before calling in to the state machine,
+ * since it's possible to block while waiting for memory
+ * allocation.
+ */
+ err = l2cap_segment_sdu(chan, &seg_queue, msg, len);
+
+ /* The channel could have been closed while segmenting,
+ * check that it is still connected.
+ */
+ if (chan->state != BT_CONNECTED) {
+ __skb_queue_purge(&seg_queue);
+ err = -ENOTCONN;
+ }
+
+ if (err)
+ break;
+
+ if (chan->mode == L2CAP_MODE_ERTM)
+ l2cap_tx(chan, NULL, &seg_queue, L2CAP_EV_DATA_REQUEST);
+ else
+ l2cap_streaming_send(chan, &seg_queue);
+
+ err = len;
+
+ /* If the skbs were not queued for sending, they'll still be in
+ * seg_queue and need to be purged.
+ */
+ __skb_queue_purge(&seg_queue);
+ break;
+
+ default:
+ BT_DBG("bad state %1.1x", chan->mode);
+ err = -EBADFD;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_send);
+
+static void l2cap_send_srej(struct l2cap_chan *chan, u16 txseq)
+{
+ struct l2cap_ctrl control;
+ u16 seq;
+
+ BT_DBG("chan %p, txseq %u", chan, txseq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+
+ for (seq = chan->expected_tx_seq; seq != txseq;
+ seq = __next_seq(chan, seq)) {
+ if (!l2cap_ertm_seq_in_queue(&chan->srej_q, seq)) {
+ control.reqseq = seq;
+ l2cap_send_sframe(chan, &control);
+ l2cap_seq_list_append(&chan->srej_list, seq);
+ }
+ }
+
+ chan->expected_tx_seq = __next_seq(chan, txseq);
+}
+
+static void l2cap_send_srej_tail(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->srej_list.tail == L2CAP_SEQ_LIST_CLEAR)
+ return;
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+ control.reqseq = chan->srej_list.tail;
+ l2cap_send_sframe(chan, &control);
+}
+
+static void l2cap_send_srej_list(struct l2cap_chan *chan, u16 txseq)
+{
+ struct l2cap_ctrl control;
+ u16 initial_head;
+ u16 seq;
+
+ BT_DBG("chan %p, txseq %u", chan, txseq);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.super = L2CAP_SUPER_SREJ;
+
+ /* Capture initial list head to allow only one pass through the list. */
+ initial_head = chan->srej_list.head;
+
+ do {
+ seq = l2cap_seq_list_pop(&chan->srej_list);
+ if (seq == txseq || seq == L2CAP_SEQ_LIST_CLEAR)
+ break;
+
+ control.reqseq = seq;
+ l2cap_send_sframe(chan, &control);
+ l2cap_seq_list_append(&chan->srej_list, seq);
+ } while (chan->srej_list.head != initial_head);
+}
+
+static void l2cap_process_reqseq(struct l2cap_chan *chan, u16 reqseq)
+{
+ struct sk_buff *acked_skb;
+ u16 ackseq;
+
+ BT_DBG("chan %p, reqseq %u", chan, reqseq);
+
+ if (chan->unacked_frames == 0 || reqseq == chan->expected_ack_seq)
+ return;
+
+ BT_DBG("expected_ack_seq %u, unacked_frames %u",
+ chan->expected_ack_seq, chan->unacked_frames);
+
+ for (ackseq = chan->expected_ack_seq; ackseq != reqseq;
+ ackseq = __next_seq(chan, ackseq)) {
+
+ acked_skb = l2cap_ertm_seq_in_queue(&chan->tx_q, ackseq);
+ if (acked_skb) {
+ skb_unlink(acked_skb, &chan->tx_q);
+ kfree_skb(acked_skb);
+ chan->unacked_frames--;
+ }
+ }
+
+ chan->expected_ack_seq = reqseq;
+
+ if (chan->unacked_frames == 0)
+ __clear_retrans_timer(chan);
+
+ BT_DBG("unacked_frames %u", chan->unacked_frames);
+}
+
+static void l2cap_abort_rx_srej_sent(struct l2cap_chan *chan)
+{
+ BT_DBG("chan %p", chan);
+
+ chan->expected_tx_seq = chan->buffer_seq;
+ l2cap_seq_list_clear(&chan->srej_list);
+ skb_queue_purge(&chan->srej_q);
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+}
+
+static void l2cap_tx_state_xmit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d", chan, control, skbs,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_DATA_REQUEST:
+ if (chan->tx_send_head == NULL)
+ chan->tx_send_head = skb_peek(skbs);
+
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+ l2cap_ertm_send(chan);
+ break;
+ case L2CAP_EV_LOCAL_BUSY_DETECTED:
+ BT_DBG("Enter LOCAL_BUSY");
+ set_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ /* The SREJ_SENT state must be aborted if we are to
+ * enter the LOCAL_BUSY state.
+ */
+ l2cap_abort_rx_srej_sent(chan);
+ }
+
+ l2cap_send_ack(chan);
+
+ break;
+ case L2CAP_EV_LOCAL_BUSY_CLEAR:
+ BT_DBG("Exit LOCAL_BUSY");
+ clear_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (test_bit(CONN_RNR_SENT, &chan->conn_state)) {
+ struct l2cap_ctrl local_control;
+
+ memset(&local_control, 0, sizeof(local_control));
+ local_control.sframe = 1;
+ local_control.super = L2CAP_SUPER_RR;
+ local_control.poll = 1;
+ local_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &local_control);
+
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ }
+ break;
+ case L2CAP_EV_RECV_REQSEQ_AND_FBIT:
+ l2cap_process_reqseq(chan, control->reqseq);
+ break;
+ case L2CAP_EV_EXPLICIT_POLL:
+ l2cap_send_rr_or_rnr(chan, 1);
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ __clear_ack_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ break;
+ case L2CAP_EV_RETRANS_TO:
+ l2cap_send_rr_or_rnr(chan, 1);
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ break;
+ case L2CAP_EV_RECV_FBIT:
+ /* Nothing to process */
+ break;
+ default:
+ break;
+ }
+}
+
+static void l2cap_tx_state_wait_f(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d", chan, control, skbs,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_DATA_REQUEST:
+ if (chan->tx_send_head == NULL)
+ chan->tx_send_head = skb_peek(skbs);
+ /* Queue data, but don't send. */
+ skb_queue_splice_tail_init(skbs, &chan->tx_q);
+ break;
+ case L2CAP_EV_LOCAL_BUSY_DETECTED:
+ BT_DBG("Enter LOCAL_BUSY");
+ set_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ /* The SREJ_SENT state must be aborted if we are to
+ * enter the LOCAL_BUSY state.
+ */
+ l2cap_abort_rx_srej_sent(chan);
+ }
+
+ l2cap_send_ack(chan);
+
+ break;
+ case L2CAP_EV_LOCAL_BUSY_CLEAR:
+ BT_DBG("Exit LOCAL_BUSY");
+ clear_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+ if (test_bit(CONN_RNR_SENT, &chan->conn_state)) {
+ struct l2cap_ctrl local_control;
+ memset(&local_control, 0, sizeof(local_control));
+ local_control.sframe = 1;
+ local_control.super = L2CAP_SUPER_RR;
+ local_control.poll = 1;
+ local_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &local_control);
+
+ chan->retry_count = 1;
+ __set_monitor_timer(chan);
+ chan->tx_state = L2CAP_TX_STATE_WAIT_F;
+ }
+ break;
+ case L2CAP_EV_RECV_REQSEQ_AND_FBIT:
+ l2cap_process_reqseq(chan, control->reqseq);
+
+ /* Fall through */
+
+ case L2CAP_EV_RECV_FBIT:
+ if (control && control->final) {
+ __clear_monitor_timer(chan);
+ if (chan->unacked_frames > 0)
+ __set_retrans_timer(chan);
+ chan->retry_count = 0;
+ chan->tx_state = L2CAP_TX_STATE_XMIT;
+ BT_DBG("recv fbit tx_state 0x2.2%x", chan->tx_state);
+ }
+ break;
+ case L2CAP_EV_EXPLICIT_POLL:
+ /* Ignore */
+ break;
+ case L2CAP_EV_MONITOR_TO:
+ if (chan->max_tx == 0 || chan->retry_count < chan->max_tx) {
+ l2cap_send_rr_or_rnr(chan, 1);
+ __set_monitor_timer(chan);
+ chan->retry_count++;
+ } else {
+ l2cap_send_disconn_req(chan, ECONNABORTED);
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff_head *skbs, u8 event)
+{
+ BT_DBG("chan %p, control %p, skbs %p, event %d, state %d",
+ chan, control, skbs, event, chan->tx_state);
+
+ switch (chan->tx_state) {
+ case L2CAP_TX_STATE_XMIT:
+ l2cap_tx_state_xmit(chan, control, skbs, event);
+ break;
+ case L2CAP_TX_STATE_WAIT_F:
+ l2cap_tx_state_wait_f(chan, control, skbs, event);
+ break;
+ default:
+ /* Ignore event */
+ break;
+ }
+}
+
+static void l2cap_pass_to_tx(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+ l2cap_tx(chan, control, NULL, L2CAP_EV_RECV_REQSEQ_AND_FBIT);
+}
+
+static void l2cap_pass_to_tx_fbit(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ BT_DBG("chan %p, control %p", chan, control);
+ l2cap_tx(chan, control, NULL, L2CAP_EV_RECV_FBIT);
+}
+
+/* Copy frame to all raw sockets on that connection */
+static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+ struct l2cap_chan *chan;
+
+ BT_DBG("conn %p", conn);
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (chan->chan_type != L2CAP_CHAN_RAW)
+ continue;
+
+ /* Don't send frame to the channel it came from */
+ if (bt_cb(skb)->l2cap.chan == chan)
+ continue;
+
+ nskb = skb_clone(skb, GFP_KERNEL);
+ if (!nskb)
+ continue;
+ if (chan->ops->recv(chan, nskb))
+ kfree_skb(nskb);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+}
+
+/* ---- L2CAP signalling commands ---- */
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code,
+ u8 ident, u16 dlen, void *data)
+{
+ struct sk_buff *skb, **frag;
+ struct l2cap_cmd_hdr *cmd;
+ struct l2cap_hdr *lh;
+ int len, count;
+
+ BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %u",
+ conn, code, ident, dlen);
+
+ if (conn->mtu < L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE)
+ return NULL;
+
+ len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen;
+ count = min_t(unsigned int, conn->mtu, len);
+
+ skb = bt_skb_alloc(count, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+ lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen);
+
+ if (conn->hcon->type == LE_LINK)
+ lh->cid = cpu_to_le16(L2CAP_CID_LE_SIGNALING);
+ else
+ lh->cid = cpu_to_le16(L2CAP_CID_SIGNALING);
+
+ cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE);
+ cmd->code = code;
+ cmd->ident = ident;
+ cmd->len = cpu_to_le16(dlen);
+
+ if (dlen) {
+ count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE;
+ memcpy(skb_put(skb, count), data, count);
+ data += count;
+ }
+
+ len -= skb->len;
+
+ /* Continuation fragments (no L2CAP header) */
+ frag = &skb_shinfo(skb)->frag_list;
+ while (len) {
+ count = min_t(unsigned int, conn->mtu, len);
+
+ *frag = bt_skb_alloc(count, GFP_KERNEL);
+ if (!*frag)
+ goto fail;
+
+ memcpy(skb_put(*frag, count), data, count);
+
+ len -= count;
+ data += count;
+
+ frag = &(*frag)->next;
+ }
+
+ return skb;
+
+fail:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen,
+ unsigned long *val)
+{
+ struct l2cap_conf_opt *opt = *ptr;
+ int len;
+
+ len = L2CAP_CONF_OPT_SIZE + opt->len;
+ *ptr += len;
+
+ *type = opt->type;
+ *olen = opt->len;
+
+ switch (opt->len) {
+ case 1:
+ *val = *((u8 *) opt->val);
+ break;
+
+ case 2:
+ *val = get_unaligned_le16(opt->val);
+ break;
+
+ case 4:
+ *val = get_unaligned_le32(opt->val);
+ break;
+
+ default:
+ *val = (unsigned long) opt->val;
+ break;
+ }
+
+ BT_DBG("type 0x%2.2x len %u val 0x%lx", *type, opt->len, *val);
+ return len;
+}
+
+static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
+{
+ struct l2cap_conf_opt *opt = *ptr;
+
+ BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val);
+
+ opt->type = type;
+ opt->len = len;
+
+ switch (len) {
+ case 1:
+ *((u8 *) opt->val) = val;
+ break;
+
+ case 2:
+ put_unaligned_le16(val, opt->val);
+ break;
+
+ case 4:
+ put_unaligned_le32(val, opt->val);
+ break;
+
+ default:
+ memcpy(opt->val, (void *) val, len);
+ break;
+ }
+
+ *ptr += L2CAP_CONF_OPT_SIZE + len;
+}
+
+static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan)
+{
+ struct l2cap_conf_efs efs;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_ERTM:
+ efs.id = chan->local_id;
+ efs.stype = chan->local_stype;
+ efs.msdu = cpu_to_le16(chan->local_msdu);
+ efs.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
+ efs.acc_lat = cpu_to_le32(L2CAP_DEFAULT_ACC_LAT);
+ efs.flush_to = cpu_to_le32(L2CAP_EFS_DEFAULT_FLUSH_TO);
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ efs.id = 1;
+ efs.stype = L2CAP_SERV_BESTEFFORT;
+ efs.msdu = cpu_to_le16(chan->local_msdu);
+ efs.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
+ efs.acc_lat = 0;
+ efs.flush_to = 0;
+ break;
+
+ default:
+ return;
+ }
+
+ l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs),
+ (unsigned long) &efs);
+}
+
+static void l2cap_ack_timeout(struct work_struct *work)
+{
+ struct l2cap_chan *chan = container_of(work, struct l2cap_chan,
+ ack_timer.work);
+ u16 frames_to_ack;
+
+ BT_DBG("chan %p", chan);
+
+ l2cap_chan_lock(chan);
+
+ frames_to_ack = __seq_offset(chan, chan->buffer_seq,
+ chan->last_acked_seq);
+
+ if (frames_to_ack)
+ l2cap_send_rr_or_rnr(chan, 0);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+}
+
+int l2cap_ertm_init(struct l2cap_chan *chan)
+{
+ int err;
+
+ chan->next_tx_seq = 0;
+ chan->expected_tx_seq = 0;
+ chan->expected_ack_seq = 0;
+ chan->unacked_frames = 0;
+ chan->buffer_seq = 0;
+ chan->frames_sent = 0;
+ chan->last_acked_seq = 0;
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+
+ skb_queue_head_init(&chan->tx_q);
+
+ chan->local_amp_id = AMP_ID_BREDR;
+ chan->move_id = AMP_ID_BREDR;
+ chan->move_state = L2CAP_MOVE_STABLE;
+ chan->move_role = L2CAP_MOVE_ROLE_NONE;
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return 0;
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ chan->tx_state = L2CAP_TX_STATE_XMIT;
+
+ INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
+ INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
+ INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
+
+ skb_queue_head_init(&chan->srej_q);
+
+ err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win);
+ if (err < 0)
+ return err;
+
+ err = l2cap_seq_list_init(&chan->retrans_list, chan->remote_tx_win);
+ if (err < 0)
+ l2cap_seq_list_free(&chan->srej_list);
+
+ return err;
+}
+
+static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
+{
+ switch (mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (l2cap_mode_supported(mode, remote_feat_mask))
+ return mode;
+ /* fall through */
+ default:
+ return L2CAP_MODE_BASIC;
+ }
+}
+
+static inline bool __l2cap_ews_supported(struct l2cap_conn *conn)
+{
+ return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
+ (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW));
+}
+
+static inline bool __l2cap_efs_supported(struct l2cap_conn *conn)
+{
+ return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
+ (conn->feat_mask & L2CAP_FEAT_EXT_FLOW));
+}
+
+static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan,
+ struct l2cap_conf_rfc *rfc)
+{
+ if (chan->local_amp_id != AMP_ID_BREDR && chan->hs_hcon) {
+ u64 ertm_to = chan->hs_hcon->hdev->amp_be_flush_to;
+
+ /* Class 1 devices have must have ERTM timeouts
+ * exceeding the Link Supervision Timeout. The
+ * default Link Supervision Timeout for AMP
+ * controllers is 10 seconds.
+ *
+ * Class 1 devices use 0xffffffff for their
+ * best-effort flush timeout, so the clamping logic
+ * will result in a timeout that meets the above
+ * requirement. ERTM timeouts are 16-bit values, so
+ * the maximum timeout is 65.535 seconds.
+ */
+
+ /* Convert timeout to milliseconds and round */
+ ertm_to = DIV_ROUND_UP_ULL(ertm_to, 1000);
+
+ /* This is the recommended formula for class 2 devices
+ * that start ERTM timers when packets are sent to the
+ * controller.
+ */
+ ertm_to = 3 * ertm_to + 500;
+
+ if (ertm_to > 0xffff)
+ ertm_to = 0xffff;
+
+ rfc->retrans_timeout = cpu_to_le16((u16) ertm_to);
+ rfc->monitor_timeout = rfc->retrans_timeout;
+ } else {
+ rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+ rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
+ }
+}
+
+static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
+{
+ if (chan->tx_win > L2CAP_DEFAULT_TX_WINDOW &&
+ __l2cap_ews_supported(chan->conn)) {
+ /* use extended control field */
+ set_bit(FLAG_EXT_CTRL, &chan->flags);
+ chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW;
+ } else {
+ chan->tx_win = min_t(u16, chan->tx_win,
+ L2CAP_DEFAULT_TX_WINDOW);
+ chan->tx_win_max = L2CAP_DEFAULT_TX_WINDOW;
+ }
+ chan->ack_win = chan->tx_win;
+}
+
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_conf_req *req = data;
+ struct l2cap_conf_rfc rfc = { .mode = chan->mode };
+ void *ptr = req->data;
+ u16 size;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->num_conf_req || chan->num_conf_rsp)
+ goto done;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state))
+ break;
+
+ if (__l2cap_efs_supported(chan->conn))
+ set_bit(FLAG_EFS_ENABLE, &chan->flags);
+
+ /* fall through */
+ default:
+ chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask);
+ break;
+ }
+
+done:
+ if (chan->imtu != L2CAP_DEFAULT_MTU)
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ if (disable_ertm)
+ break;
+
+ if (!(chan->conn->feat_mask & L2CAP_FEAT_ERTM) &&
+ !(chan->conn->feat_mask & L2CAP_FEAT_STREAMING))
+ break;
+
+ rfc.mode = L2CAP_MODE_BASIC;
+ rfc.txwin_size = 0;
+ rfc.max_transmit = 0;
+ rfc.retrans_timeout = 0;
+ rfc.monitor_timeout = 0;
+ rfc.max_pdu_size = 0;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ rfc.mode = L2CAP_MODE_ERTM;
+ rfc.max_transmit = chan->max_tx;
+
+ __l2cap_set_ertm_timeouts(chan, &rfc);
+
+ size = min_t(u16, L2CAP_DEFAULT_MAX_PDU_SIZE, chan->conn->mtu -
+ L2CAP_EXT_HDR_SIZE - L2CAP_SDULEN_SIZE -
+ L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+
+ l2cap_txwin_setup(chan);
+
+ rfc.txwin_size = min_t(u16, chan->tx_win,
+ L2CAP_DEFAULT_TX_WINDOW);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
+ l2cap_add_opt_efs(&ptr, chan);
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
+ chan->tx_win);
+
+ if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
+ if (chan->fcs == L2CAP_FCS_NONE ||
+ test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
+ chan->fcs = L2CAP_FCS_NONE;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
+ chan->fcs);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ l2cap_txwin_setup(chan);
+ rfc.mode = L2CAP_MODE_STREAMING;
+ rfc.txwin_size = 0;
+ rfc.max_transmit = 0;
+ rfc.retrans_timeout = 0;
+ rfc.monitor_timeout = 0;
+
+ size = min_t(u16, L2CAP_DEFAULT_MAX_PDU_SIZE, chan->conn->mtu -
+ L2CAP_EXT_HDR_SIZE - L2CAP_SDULEN_SIZE -
+ L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
+ l2cap_add_opt_efs(&ptr, chan);
+
+ if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
+ if (chan->fcs == L2CAP_FCS_NONE ||
+ test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
+ chan->fcs = L2CAP_FCS_NONE;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
+ chan->fcs);
+ }
+ break;
+ }
+
+ req->dcid = cpu_to_le16(chan->dcid);
+ req->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_conf_rsp *rsp = data;
+ void *ptr = rsp->data;
+ void *req = chan->conf_req;
+ int len = chan->conf_len;
+ int type, hint, olen;
+ unsigned long val;
+ struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+ struct l2cap_conf_efs efs;
+ u8 remote_efs = 0;
+ u16 mtu = L2CAP_DEFAULT_MTU;
+ u16 result = L2CAP_CONF_SUCCESS;
+ u16 size;
+
+ BT_DBG("chan %p", chan);
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&req, &type, &olen, &val);
+
+ hint = type & L2CAP_CONF_HINT;
+ type &= L2CAP_CONF_MASK;
+
+ switch (type) {
+ case L2CAP_CONF_MTU:
+ mtu = val;
+ break;
+
+ case L2CAP_CONF_FLUSH_TO:
+ chan->flush_to = val;
+ break;
+
+ case L2CAP_CONF_QOS:
+ break;
+
+ case L2CAP_CONF_RFC:
+ if (olen == sizeof(rfc))
+ memcpy(&rfc, (void *) val, olen);
+ break;
+
+ case L2CAP_CONF_FCS:
+ if (val == L2CAP_FCS_NONE)
+ set_bit(CONF_RECV_NO_FCS, &chan->conf_state);
+ break;
+
+ case L2CAP_CONF_EFS:
+ remote_efs = 1;
+ if (olen == sizeof(efs))
+ memcpy(&efs, (void *) val, olen);
+ break;
+
+ case L2CAP_CONF_EWS:
+ if (!(chan->conn->local_fixed_chan & L2CAP_FC_A2MP))
+ return -ECONNREFUSED;
+
+ set_bit(FLAG_EXT_CTRL, &chan->flags);
+ set_bit(CONF_EWS_RECV, &chan->conf_state);
+ chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW;
+ chan->remote_tx_win = val;
+ break;
+
+ default:
+ if (hint)
+ break;
+
+ result = L2CAP_CONF_UNKNOWN;
+ *((u8 *) ptr++) = type;
+ break;
+ }
+ }
+
+ if (chan->num_conf_rsp || chan->num_conf_req > 1)
+ goto done;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_STREAMING:
+ case L2CAP_MODE_ERTM:
+ if (!test_bit(CONF_STATE2_DEVICE, &chan->conf_state)) {
+ chan->mode = l2cap_select_mode(rfc.mode,
+ chan->conn->feat_mask);
+ break;
+ }
+
+ if (remote_efs) {
+ if (__l2cap_efs_supported(chan->conn))
+ set_bit(FLAG_EFS_ENABLE, &chan->flags);
+ else
+ return -ECONNREFUSED;
+ }
+
+ if (chan->mode != rfc.mode)
+ return -ECONNREFUSED;
+
+ break;
+ }
+
+done:
+ if (chan->mode != rfc.mode) {
+ result = L2CAP_CONF_UNACCEPT;
+ rfc.mode = chan->mode;
+
+ if (chan->num_conf_rsp == 1)
+ return -ECONNREFUSED;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc);
+ }
+
+ if (result == L2CAP_CONF_SUCCESS) {
+ /* Configure output options and let the other side know
+ * which ones we don't like. */
+
+ if (mtu < L2CAP_DEFAULT_MIN_MTU)
+ result = L2CAP_CONF_UNACCEPT;
+ else {
+ chan->omtu = mtu;
+ set_bit(CONF_MTU_DONE, &chan->conf_state);
+ }
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu);
+
+ if (remote_efs) {
+ if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != chan->local_stype) {
+
+ result = L2CAP_CONF_UNACCEPT;
+
+ if (chan->num_conf_req >= 1)
+ return -ECONNREFUSED;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
+ sizeof(efs),
+ (unsigned long) &efs);
+ } else {
+ /* Send PENDING Conf Rsp */
+ result = L2CAP_CONF_PENDING;
+ set_bit(CONF_LOC_CONF_PEND, &chan->conf_state);
+ }
+ }
+
+ switch (rfc.mode) {
+ case L2CAP_MODE_BASIC:
+ chan->fcs = L2CAP_FCS_NONE;
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+ break;
+
+ case L2CAP_MODE_ERTM:
+ if (!test_bit(CONF_EWS_RECV, &chan->conf_state))
+ chan->remote_tx_win = rfc.txwin_size;
+ else
+ rfc.txwin_size = L2CAP_DEFAULT_TX_WINDOW;
+
+ chan->remote_max_tx = rfc.max_transmit;
+
+ size = min_t(u16, le16_to_cpu(rfc.max_pdu_size),
+ chan->conn->mtu - L2CAP_EXT_HDR_SIZE -
+ L2CAP_SDULEN_SIZE - L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+ chan->remote_mps = size;
+
+ __l2cap_set_ertm_timeouts(chan, &rfc);
+
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+ sizeof(rfc), (unsigned long) &rfc);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ chan->remote_id = efs.id;
+ chan->remote_stype = efs.stype;
+ chan->remote_msdu = le16_to_cpu(efs.msdu);
+ chan->remote_flush_to =
+ le32_to_cpu(efs.flush_to);
+ chan->remote_acc_lat =
+ le32_to_cpu(efs.acc_lat);
+ chan->remote_sdu_itime =
+ le32_to_cpu(efs.sdu_itime);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
+ sizeof(efs),
+ (unsigned long) &efs);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ size = min_t(u16, le16_to_cpu(rfc.max_pdu_size),
+ chan->conn->mtu - L2CAP_EXT_HDR_SIZE -
+ L2CAP_SDULEN_SIZE - L2CAP_FCS_SIZE);
+ rfc.max_pdu_size = cpu_to_le16(size);
+ chan->remote_mps = size;
+
+ set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc);
+
+ break;
+
+ default:
+ result = L2CAP_CONF_UNACCEPT;
+
+ memset(&rfc, 0, sizeof(rfc));
+ rfc.mode = chan->mode;
+ }
+
+ if (result == L2CAP_CONF_SUCCESS)
+ set_bit(CONF_OUTPUT_DONE, &chan->conf_state);
+ }
+ rsp->scid = cpu_to_le16(chan->dcid);
+ rsp->result = cpu_to_le16(result);
+ rsp->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
+ void *data, u16 *result)
+{
+ struct l2cap_conf_req *req = data;
+ void *ptr = req->data;
+ int type, olen;
+ unsigned long val;
+ struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+ struct l2cap_conf_efs efs;
+
+ BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data);
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+
+ switch (type) {
+ case L2CAP_CONF_MTU:
+ if (val < L2CAP_DEFAULT_MIN_MTU) {
+ *result = L2CAP_CONF_UNACCEPT;
+ chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+ } else
+ chan->imtu = val;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+ break;
+
+ case L2CAP_CONF_FLUSH_TO:
+ chan->flush_to = val;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO,
+ 2, chan->flush_to);
+ break;
+
+ case L2CAP_CONF_RFC:
+ if (olen == sizeof(rfc))
+ memcpy(&rfc, (void *)val, olen);
+
+ if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state) &&
+ rfc.mode != chan->mode)
+ return -ECONNREFUSED;
+
+ chan->fcs = 0;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+ sizeof(rfc), (unsigned long) &rfc);
+ break;
+
+ case L2CAP_CONF_EWS:
+ chan->ack_win = min_t(u16, val, chan->ack_win);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
+ chan->tx_win);
+ break;
+
+ case L2CAP_CONF_EFS:
+ if (olen == sizeof(efs))
+ memcpy(&efs, (void *)val, olen);
+
+ if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != chan->local_stype)
+ return -ECONNREFUSED;
+
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
+ (unsigned long) &efs);
+ break;
+
+ case L2CAP_CONF_FCS:
+ if (*result == L2CAP_CONF_PENDING)
+ if (val == L2CAP_FCS_NONE)
+ set_bit(CONF_RECV_NO_FCS,
+ &chan->conf_state);
+ break;
+ }
+ }
+
+ if (chan->mode == L2CAP_MODE_BASIC && chan->mode != rfc.mode)
+ return -ECONNREFUSED;
+
+ chan->mode = rfc.mode;
+
+ if (*result == L2CAP_CONF_SUCCESS || *result == L2CAP_CONF_PENDING) {
+ switch (rfc.mode) {
+ case L2CAP_MODE_ERTM:
+ chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+ chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ if (!test_bit(FLAG_EXT_CTRL, &chan->flags))
+ chan->ack_win = min_t(u16, chan->ack_win,
+ rfc.txwin_size);
+
+ if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ chan->local_msdu = le16_to_cpu(efs.msdu);
+ chan->local_sdu_itime =
+ le32_to_cpu(efs.sdu_itime);
+ chan->local_acc_lat = le32_to_cpu(efs.acc_lat);
+ chan->local_flush_to =
+ le32_to_cpu(efs.flush_to);
+ }
+ break;
+
+ case L2CAP_MODE_STREAMING:
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ }
+ }
+
+ req->dcid = cpu_to_le16(chan->dcid);
+ req->flags = cpu_to_le16(0);
+
+ return ptr - data;
+}
+
+static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data,
+ u16 result, u16 flags)
+{
+ struct l2cap_conf_rsp *rsp = data;
+ void *ptr = rsp->data;
+
+ BT_DBG("chan %p", chan);
+
+ rsp->scid = cpu_to_le16(chan->dcid);
+ rsp->result = cpu_to_le16(result);
+ rsp->flags = cpu_to_le16(flags);
+
+ return ptr - data;
+}
+
+void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_le_conn_rsp rsp;
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("chan %p", chan);
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ rsp.credits = cpu_to_le16(chan->rx_credits);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
+ &rsp);
+}
+
+void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_conn_rsp rsp;
+ struct l2cap_conn *conn = chan->conn;
+ u8 buf[128];
+ u8 rsp_code;
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+
+ if (chan->hs_hcon)
+ rsp_code = L2CAP_CREATE_CHAN_RSP;
+ else
+ rsp_code = L2CAP_CONN_RSP;
+
+ BT_DBG("chan %p rsp_code %u", chan, rsp_code);
+
+ l2cap_send_cmd(conn, chan->ident, rsp_code, sizeof(rsp), &rsp);
+
+ if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state))
+ return;
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf), buf);
+ chan->num_conf_req++;
+}
+
+static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len)
+{
+ int type, olen;
+ unsigned long val;
+ /* Use sane default values in case a misbehaving remote device
+ * did not send an RFC or extended window size option.
+ */
+ u16 txwin_ext = chan->ack_win;
+ struct l2cap_conf_rfc rfc = {
+ .mode = chan->mode,
+ .retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO),
+ .monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO),
+ .max_pdu_size = cpu_to_le16(chan->imtu),
+ .txwin_size = min_t(u16, chan->ack_win, L2CAP_DEFAULT_TX_WINDOW),
+ };
+
+ BT_DBG("chan %p, rsp %p, len %d", chan, rsp, len);
+
+ if ((chan->mode != L2CAP_MODE_ERTM) && (chan->mode != L2CAP_MODE_STREAMING))
+ return;
+
+ while (len >= L2CAP_CONF_OPT_SIZE) {
+ len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+
+ switch (type) {
+ case L2CAP_CONF_RFC:
+ if (olen == sizeof(rfc))
+ memcpy(&rfc, (void *)val, olen);
+ break;
+ case L2CAP_CONF_EWS:
+ txwin_ext = val;
+ break;
+ }
+ }
+
+ switch (rfc.mode) {
+ case L2CAP_MODE_ERTM:
+ chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+ chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ chan->ack_win = min_t(u16, chan->ack_win, txwin_ext);
+ else
+ chan->ack_win = min_t(u16, chan->ack_win,
+ rfc.txwin_size);
+ break;
+ case L2CAP_MODE_STREAMING:
+ chan->mps = le16_to_cpu(rfc.max_pdu_size);
+ }
+}
+
+static inline int l2cap_command_rej(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data;
+
+ if (cmd_len < sizeof(*rej))
+ return -EPROTO;
+
+ if (rej->reason != L2CAP_REJ_NOT_UNDERSTOOD)
+ return 0;
+
+ if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) &&
+ cmd->ident == conn->info_ident) {
+ cancel_delayed_work(&conn->info_timer);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+
+ return 0;
+}
+
+static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u8 *data, u8 rsp_code, u8 amp_id)
+{
+ struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
+ struct l2cap_conn_rsp rsp;
+ struct l2cap_chan *chan = NULL, *pchan;
+ int result, status = L2CAP_CS_NO_INFO;
+
+ u16 dcid = 0, scid = __le16_to_cpu(req->scid);
+ __le16 psm = req->psm;
+
+ BT_DBG("psm 0x%2.2x scid 0x%4.4x", __le16_to_cpu(psm), scid);
+
+ /* Check if we have socket listening on psm */
+ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+ &conn->hcon->dst, ACL_LINK);
+ if (!pchan) {
+ result = L2CAP_CR_BAD_PSM;
+ goto sendresp;
+ }
+
+ mutex_lock(&conn->chan_lock);
+ l2cap_chan_lock(pchan);
+
+ /* Check if the ACL is secure enough (if not SDP) */
+ if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
+ !hci_conn_check_link_mode(conn->hcon)) {
+ conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
+ result = L2CAP_CR_SEC_BLOCK;
+ goto response;
+ }
+
+ result = L2CAP_CR_NO_MEM;
+
+ /* Check if we already have channel with that dcid */
+ if (__l2cap_get_chan_by_dcid(conn, scid))
+ goto response;
+
+ chan = pchan->ops->new_connection(pchan);
+ if (!chan)
+ goto response;
+
+ /* For certain devices (ex: HID mouse), support for authentication,
+ * pairing and bonding is optional. For such devices, inorder to avoid
+ * the ACL alive for too long after L2CAP disconnection, reset the ACL
+ * disc_timeout back to HCI_DISCONN_TIMEOUT during L2CAP connect.
+ */
+ conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+ bacpy(&chan->src, &conn->hcon->src);
+ bacpy(&chan->dst, &conn->hcon->dst);
+ chan->src_type = bdaddr_src_type(conn->hcon);
+ chan->dst_type = bdaddr_dst_type(conn->hcon);
+ chan->psm = psm;
+ chan->dcid = scid;
+ chan->local_amp_id = amp_id;
+
+ __l2cap_chan_add(conn, chan);
+
+ dcid = chan->scid;
+
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ chan->ident = cmd->ident;
+
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
+ if (l2cap_chan_check_security(chan, false)) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHOR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ /* Force pending result for AMP controllers.
+ * The connection will succeed after the
+ * physical link is up.
+ */
+ if (amp_id == AMP_ID_BREDR) {
+ l2cap_state_change(chan, BT_CONFIG);
+ result = L2CAP_CR_SUCCESS;
+ } else {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ }
+ status = L2CAP_CS_NO_INFO;
+ }
+ } else {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHEN_PEND;
+ }
+ } else {
+ l2cap_state_change(chan, BT_CONNECT2);
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_NO_INFO;
+ }
+
+response:
+ l2cap_chan_unlock(pchan);
+ mutex_unlock(&conn->chan_lock);
+ l2cap_chan_put(pchan);
+
+sendresp:
+ rsp.scid = cpu_to_le16(scid);
+ rsp.dcid = cpu_to_le16(dcid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(status);
+ l2cap_send_cmd(conn, cmd->ident, rsp_code, sizeof(rsp), &rsp);
+
+ if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) {
+ struct l2cap_info_req info;
+ info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+ conn->info_ident = l2cap_get_ident(conn);
+
+ schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT);
+
+ l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ,
+ sizeof(info), &info);
+ }
+
+ if (chan && !test_bit(CONF_REQ_SENT, &chan->conf_state) &&
+ result == L2CAP_CR_SUCCESS) {
+ u8 buf[128];
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf), buf);
+ chan->num_conf_req++;
+ }
+
+ return chan;
+}
+
+static int l2cap_connect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
+{
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct hci_conn *hcon = conn->hcon;
+
+ if (cmd_len < sizeof(struct l2cap_conn_req))
+ return -EPROTO;
+
+ hci_dev_lock(hdev);
+ if (hci_dev_test_flag(hdev, HCI_MGMT) &&
+ !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags))
+ mgmt_device_connected(hdev, hcon, 0, NULL, 0);
+ hci_dev_unlock(hdev);
+
+ l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0);
+ return 0;
+}
+
+static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data;
+ u16 scid, dcid, result, status;
+ struct l2cap_chan *chan;
+ u8 req[128];
+ int err;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ dcid = __le16_to_cpu(rsp->dcid);
+ result = __le16_to_cpu(rsp->result);
+ status = __le16_to_cpu(rsp->status);
+
+ BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x",
+ dcid, scid, result, status);
+
+ mutex_lock(&conn->chan_lock);
+
+ if (scid) {
+ chan = __l2cap_get_chan_by_scid(conn, scid);
+ if (!chan) {
+ err = -EBADSLT;
+ goto unlock;
+ }
+ } else {
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan) {
+ err = -EBADSLT;
+ goto unlock;
+ }
+ }
+
+ err = 0;
+
+ l2cap_chan_lock(chan);
+
+ switch (result) {
+ case L2CAP_CR_SUCCESS:
+ l2cap_state_change(chan, BT_CONFIG);
+ chan->ident = 0;
+ chan->dcid = dcid;
+ clear_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+ if (test_and_set_bit(CONF_REQ_SENT, &chan->conf_state))
+ break;
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, req), req);
+ chan->num_conf_req++;
+ break;
+
+ case L2CAP_CR_PEND:
+ set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+ break;
+
+ default:
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ l2cap_chan_unlock(chan);
+
+unlock:
+ mutex_unlock(&conn->chan_lock);
+
+ return err;
+}
+
+static inline void set_default_fcs(struct l2cap_chan *chan)
+{
+ /* FCS is enabled only in ERTM or streaming mode, if one or both
+ * sides request it.
+ */
+ if (chan->mode != L2CAP_MODE_ERTM && chan->mode != L2CAP_MODE_STREAMING)
+ chan->fcs = L2CAP_FCS_NONE;
+ else if (!test_bit(CONF_RECV_NO_FCS, &chan->conf_state))
+ chan->fcs = L2CAP_FCS_CRC16;
+}
+
+static void l2cap_send_efs_conf_rsp(struct l2cap_chan *chan, void *data,
+ u8 ident, u16 flags)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("conn %p chan %p ident %d flags 0x%4.4x", conn, chan, ident,
+ flags);
+
+ clear_bit(CONF_LOC_CONF_PEND, &chan->conf_state);
+ set_bit(CONF_OUTPUT_DONE, &chan->conf_state);
+
+ l2cap_send_cmd(conn, ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, data,
+ L2CAP_CONF_SUCCESS, flags), data);
+}
+
+static void cmd_reject_invalid_cid(struct l2cap_conn *conn, u8 ident,
+ u16 scid, u16 dcid)
+{
+ struct l2cap_cmd_rej_cid rej;
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_INVALID_CID);
+ rej.scid = __cpu_to_le16(scid);
+ rej.dcid = __cpu_to_le16(dcid);
+
+ l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+}
+
+static inline int l2cap_config_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
+ u16 dcid, flags;
+ u8 rsp[64];
+ struct l2cap_chan *chan;
+ int len, err = 0;
+
+ if (cmd_len < sizeof(*req))
+ return -EPROTO;
+
+ dcid = __le16_to_cpu(req->dcid);
+ flags = __le16_to_cpu(req->flags);
+
+ BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags);
+
+ chan = l2cap_get_chan_by_scid(conn, dcid);
+ if (!chan) {
+ cmd_reject_invalid_cid(conn, cmd->ident, dcid, 0);
+ return 0;
+ }
+
+ if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2) {
+ cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
+ chan->dcid);
+ goto unlock;
+ }
+
+ /* Reject if config buffer is too small. */
+ len = cmd_len - sizeof(*req);
+ if (chan->conf_len + len > sizeof(chan->conf_req)) {
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, rsp,
+ L2CAP_CONF_REJECT, flags), rsp);
+ goto unlock;
+ }
+
+ /* Store config. */
+ memcpy(chan->conf_req + chan->conf_len, req->data, len);
+ chan->conf_len += len;
+
+ if (flags & L2CAP_CONF_FLAG_CONTINUATION) {
+ /* Incomplete config. Send empty response. */
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+ l2cap_build_conf_rsp(chan, rsp,
+ L2CAP_CONF_SUCCESS, flags), rsp);
+ goto unlock;
+ }
+
+ /* Complete config. */
+ len = l2cap_parse_conf_req(chan, rsp);
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto unlock;
+ }
+
+ chan->ident = cmd->ident;
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp);
+ chan->num_conf_rsp++;
+
+ /* Reset config buffer. */
+ chan->conf_len = 0;
+
+ if (!test_bit(CONF_OUTPUT_DONE, &chan->conf_state))
+ goto unlock;
+
+ if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
+ set_default_fcs(chan);
+
+ if (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING)
+ err = l2cap_ertm_init(chan);
+
+ if (err < 0)
+ l2cap_send_disconn_req(chan, -err);
+ else
+ l2cap_chan_ready(chan);
+
+ goto unlock;
+ }
+
+ if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
+ u8 buf[64];
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf), buf);
+ chan->num_conf_req++;
+ }
+
+ /* Got Conf Rsp PENDING from remote side and assume we sent
+ Conf Rsp PENDING in the code above */
+ if (test_bit(CONF_REM_CONF_PEND, &chan->conf_state) &&
+ test_bit(CONF_LOC_CONF_PEND, &chan->conf_state)) {
+
+ /* check compatibility */
+
+ /* Send rsp for BR/EDR channel */
+ if (!chan->hs_hcon)
+ l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
+ else
+ chan->ident = cmd->ident;
+ }
+
+unlock:
+ l2cap_chan_unlock(chan);
+ return err;
+}
+
+static inline int l2cap_config_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data;
+ u16 scid, flags, result;
+ struct l2cap_chan *chan;
+ int len = cmd_len - sizeof(*rsp);
+ int err = 0;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ flags = __le16_to_cpu(rsp->flags);
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x len %d", scid, flags,
+ result, len);
+
+ chan = l2cap_get_chan_by_scid(conn, scid);
+ if (!chan)
+ return 0;
+
+ switch (result) {
+ case L2CAP_CONF_SUCCESS:
+ l2cap_conf_rfc_get(chan, rsp->data, len);
+ clear_bit(CONF_REM_CONF_PEND, &chan->conf_state);
+ break;
+
+ case L2CAP_CONF_PENDING:
+ set_bit(CONF_REM_CONF_PEND, &chan->conf_state);
+
+ if (test_bit(CONF_LOC_CONF_PEND, &chan->conf_state)) {
+ char buf[64];
+
+ len = l2cap_parse_conf_rsp(chan, rsp->data, len,
+ buf, &result);
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ if (!chan->hs_hcon) {
+ l2cap_send_efs_conf_rsp(chan, buf, cmd->ident,
+ 0);
+ } else {
+ if (l2cap_check_efs(chan)) {
+ amp_create_logical_link(chan);
+ chan->ident = cmd->ident;
+ }
+ }
+ }
+ goto done;
+
+ case L2CAP_CONF_UNACCEPT:
+ if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) {
+ char req[64];
+
+ if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ /* throw out any old stored conf requests */
+ result = L2CAP_CONF_SUCCESS;
+ len = l2cap_parse_conf_rsp(chan, rsp->data, len,
+ req, &result);
+ if (len < 0) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONF_REQ, len, req);
+ chan->num_conf_req++;
+ if (result != L2CAP_CONF_SUCCESS)
+ goto done;
+ break;
+ }
+
+ default:
+ l2cap_chan_set_err(chan, ECONNRESET);
+
+ __set_chan_timer(chan, L2CAP_DISC_REJ_TIMEOUT);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto done;
+ }
+
+ if (flags & L2CAP_CONF_FLAG_CONTINUATION)
+ goto done;
+
+ set_bit(CONF_INPUT_DONE, &chan->conf_state);
+
+ if (test_bit(CONF_OUTPUT_DONE, &chan->conf_state)) {
+ set_default_fcs(chan);
+
+ if (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_STREAMING)
+ err = l2cap_ertm_init(chan);
+
+ if (err < 0)
+ l2cap_send_disconn_req(chan, -err);
+ else
+ l2cap_chan_ready(chan);
+ }
+
+done:
+ l2cap_chan_unlock(chan);
+ return err;
+}
+
+static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data;
+ struct l2cap_disconn_rsp rsp;
+ u16 dcid, scid;
+ struct l2cap_chan *chan;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(req->scid);
+ dcid = __le16_to_cpu(req->dcid);
+
+ BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
+
+ mutex_lock(&conn->chan_lock);
+
+ chan = __l2cap_get_chan_by_scid(conn, dcid);
+ if (!chan) {
+ mutex_unlock(&conn->chan_lock);
+ cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid);
+ return 0;
+ }
+
+ l2cap_chan_lock(chan);
+
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.scid = cpu_to_le16(chan->dcid);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
+
+ chan->ops->set_shutdown(chan);
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_del(chan, ECONNRESET);
+
+ l2cap_chan_unlock(chan);
+
+ chan->ops->close(chan);
+ l2cap_chan_put(chan);
+
+ mutex_unlock(&conn->chan_lock);
+
+ return 0;
+}
+
+static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data;
+ u16 dcid, scid;
+ struct l2cap_chan *chan;
+
+ if (cmd_len != sizeof(*rsp))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(rsp->scid);
+ dcid = __le16_to_cpu(rsp->dcid);
+
+ BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
+
+ mutex_lock(&conn->chan_lock);
+
+ chan = __l2cap_get_chan_by_scid(conn, scid);
+ if (!chan) {
+ mutex_unlock(&conn->chan_lock);
+ return 0;
+ }
+
+ l2cap_chan_lock(chan);
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_del(chan, 0);
+
+ l2cap_chan_unlock(chan);
+
+ chan->ops->close(chan);
+ l2cap_chan_put(chan);
+
+ mutex_unlock(&conn->chan_lock);
+
+ return 0;
+}
+
+static inline int l2cap_information_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_info_req *req = (struct l2cap_info_req *) data;
+ u16 type;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ type = __le16_to_cpu(req->type);
+
+ BT_DBG("type 0x%4.4x", type);
+
+ if (type == L2CAP_IT_FEAT_MASK) {
+ u8 buf[8];
+ u32 feat_mask = l2cap_feat_mask;
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+ rsp->type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ if (!disable_ertm)
+ feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
+ | L2CAP_FEAT_FCS;
+ if (conn->local_fixed_chan & L2CAP_FC_A2MP)
+ feat_mask |= L2CAP_FEAT_EXT_FLOW
+ | L2CAP_FEAT_EXT_WINDOW;
+
+ put_unaligned_le32(feat_mask, rsp->data);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
+ buf);
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ u8 buf[12];
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+
+ rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ rsp->data[0] = conn->local_fixed_chan;
+ memset(rsp->data + 1, 0, 7);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
+ buf);
+ } else {
+ struct l2cap_info_rsp rsp;
+ rsp.type = cpu_to_le16(type);
+ rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp),
+ &rsp);
+ }
+
+ return 0;
+}
+
+static inline int l2cap_information_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data;
+ u16 type, result;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ type = __le16_to_cpu(rsp->type);
+ result = __le16_to_cpu(rsp->result);
+
+ BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
+
+ /* L2CAP Info req/rsp are unbound to channels, add extra checks */
+ if (cmd->ident != conn->info_ident ||
+ conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)
+ return 0;
+
+ cancel_delayed_work(&conn->info_timer);
+
+ if (result != L2CAP_IR_SUCCESS) {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+
+ return 0;
+ }
+
+ switch (type) {
+ case L2CAP_IT_FEAT_MASK:
+ conn->feat_mask = get_unaligned_le32(rsp->data);
+
+ if (conn->feat_mask & L2CAP_FEAT_FIXED_CHAN) {
+ struct l2cap_info_req req;
+ req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+
+ conn->info_ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, conn->info_ident,
+ L2CAP_INFO_REQ, sizeof(req), &req);
+ } else {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+ break;
+
+ case L2CAP_IT_FIXED_CHAN:
+ conn->remote_fixed_chan = rsp->data[0];
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ break;
+ }
+
+ return 0;
+}
+
+static int l2cap_create_channel_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, void *data)
+{
+ struct l2cap_create_chan_req *req = data;
+ struct l2cap_create_chan_rsp rsp;
+ struct l2cap_chan *chan;
+ struct hci_dev *hdev;
+ u16 psm, scid;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
+ return -EINVAL;
+
+ psm = le16_to_cpu(req->psm);
+ scid = le16_to_cpu(req->scid);
+
+ BT_DBG("psm 0x%2.2x, scid 0x%4.4x, amp_id %d", psm, scid, req->amp_id);
+
+ /* For controller id 0 make BR/EDR connection */
+ if (req->amp_id == AMP_ID_BREDR) {
+ l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
+ req->amp_id);
+ return 0;
+ }
+
+ /* Validate AMP controller id */
+ hdev = hci_dev_get(req->amp_id);
+ if (!hdev)
+ goto error;
+
+ if (hdev->dev_type != HCI_AMP || !test_bit(HCI_UP, &hdev->flags)) {
+ hci_dev_put(hdev);
+ goto error;
+ }
+
+ chan = l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
+ req->amp_id);
+ if (chan) {
+ struct amp_mgr *mgr = conn->hcon->amp_mgr;
+ struct hci_conn *hs_hcon;
+
+ hs_hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
+ &conn->hcon->dst);
+ if (!hs_hcon) {
+ hci_dev_put(hdev);
+ cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
+ chan->dcid);
+ return 0;
+ }
+
+ BT_DBG("mgr %p bredr_chan %p hs_hcon %p", mgr, chan, hs_hcon);
+
+ mgr->bredr_chan = chan;
+ chan->hs_hcon = hs_hcon;
+ chan->fcs = L2CAP_FCS_NONE;
+ conn->mtu = hdev->block_mtu;
+ }
+
+ hci_dev_put(hdev);
+
+ return 0;
+
+error:
+ rsp.dcid = 0;
+ rsp.scid = cpu_to_le16(scid);
+ rsp.result = cpu_to_le16(L2CAP_CR_BAD_AMP);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CREATE_CHAN_RSP,
+ sizeof(rsp), &rsp);
+
+ return 0;
+}
+
+static void l2cap_send_move_chan_req(struct l2cap_chan *chan, u8 dest_amp_id)
+{
+ struct l2cap_move_chan_req req;
+ u8 ident;
+
+ BT_DBG("chan %p, dest_amp_id %d", chan, dest_amp_id);
+
+ ident = l2cap_get_ident(chan->conn);
+ chan->ident = ident;
+
+ req.icid = cpu_to_le16(chan->scid);
+ req.dest_amp_id = dest_amp_id;
+
+ l2cap_send_cmd(chan->conn, ident, L2CAP_MOVE_CHAN_REQ, sizeof(req),
+ &req);
+
+ __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
+}
+
+static void l2cap_send_move_chan_rsp(struct l2cap_chan *chan, u16 result)
+{
+ struct l2cap_move_chan_rsp rsp;
+
+ BT_DBG("chan %p, result 0x%4.4x", chan, result);
+
+ rsp.icid = cpu_to_le16(chan->dcid);
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_RSP,
+ sizeof(rsp), &rsp);
+}
+
+static void l2cap_send_move_chan_cfm(struct l2cap_chan *chan, u16 result)
+{
+ struct l2cap_move_chan_cfm cfm;
+
+ BT_DBG("chan %p, result 0x%4.4x", chan, result);
+
+ chan->ident = l2cap_get_ident(chan->conn);
+
+ cfm.icid = cpu_to_le16(chan->scid);
+ cfm.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_CFM,
+ sizeof(cfm), &cfm);
+
+ __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
+}
+
+static void l2cap_send_move_chan_cfm_icid(struct l2cap_conn *conn, u16 icid)
+{
+ struct l2cap_move_chan_cfm cfm;
+
+ BT_DBG("conn %p, icid 0x%4.4x", conn, icid);
+
+ cfm.icid = cpu_to_le16(icid);
+ cfm.result = cpu_to_le16(L2CAP_MC_UNCONFIRMED);
+
+ l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_MOVE_CHAN_CFM,
+ sizeof(cfm), &cfm);
+}
+
+static void l2cap_send_move_chan_cfm_rsp(struct l2cap_conn *conn, u8 ident,
+ u16 icid)
+{
+ struct l2cap_move_chan_cfm_rsp rsp;
+
+ BT_DBG("icid 0x%4.4x", icid);
+
+ rsp.icid = cpu_to_le16(icid);
+ l2cap_send_cmd(conn, ident, L2CAP_MOVE_CHAN_CFM_RSP, sizeof(rsp), &rsp);
+}
+
+static void __release_logical_link(struct l2cap_chan *chan)
+{
+ chan->hs_hchan = NULL;
+ chan->hs_hcon = NULL;
+
+ /* Placeholder - release the logical link */
+}
+
+static void l2cap_logical_fail(struct l2cap_chan *chan)
+{
+ /* Logical link setup failed */
+ if (chan->state != BT_CONNECTED) {
+ /* Create channel failure, disconnect */
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ switch (chan->move_role) {
+ case L2CAP_MOVE_ROLE_RESPONDER:
+ l2cap_move_done(chan);
+ l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_SUPP);
+ break;
+ case L2CAP_MOVE_ROLE_INITIATOR:
+ if (chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_COMP ||
+ chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_CFM) {
+ /* Remote has only sent pending or
+ * success responses, clean up
+ */
+ l2cap_move_done(chan);
+ }
+
+ /* Other amp move states imply that the move
+ * has already aborted
+ */
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
+ break;
+ }
+}
+
+static void l2cap_logical_finish_create(struct l2cap_chan *chan,
+ struct hci_chan *hchan)
+{
+ struct l2cap_conf_rsp rsp;
+
+ chan->hs_hchan = hchan;
+ chan->hs_hcon->l2cap_data = chan->conn;
+
+ l2cap_send_efs_conf_rsp(chan, &rsp, chan->ident, 0);
+
+ if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
+ int err;
+
+ set_default_fcs(chan);
+
+ err = l2cap_ertm_init(chan);
+ if (err < 0)
+ l2cap_send_disconn_req(chan, -err);
+ else
+ l2cap_chan_ready(chan);
+ }
+}
+
+static void l2cap_logical_finish_move(struct l2cap_chan *chan,
+ struct hci_chan *hchan)
+{
+ chan->hs_hcon = hchan->conn;
+ chan->hs_hcon->l2cap_data = chan->conn;
+
+ BT_DBG("move_state %d", chan->move_state);
+
+ switch (chan->move_state) {
+ case L2CAP_MOVE_WAIT_LOGICAL_COMP:
+ /* Move confirm will be sent after a success
+ * response is received
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
+ break;
+ case L2CAP_MOVE_WAIT_LOGICAL_CFM:
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
+ } else if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
+ chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
+ } else if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
+ chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
+ l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
+ }
+ break;
+ default:
+ /* Move was not in expected state, free the channel */
+ __release_logical_link(chan);
+
+ chan->move_state = L2CAP_MOVE_STABLE;
+ }
+}
+
+/* Call with chan locked */
+void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan,
+ u8 status)
+{
+ BT_DBG("chan %p, hchan %p, status %d", chan, hchan, status);
+
+ if (status) {
+ l2cap_logical_fail(chan);
+ __release_logical_link(chan);
+ return;
+ }
+
+ if (chan->state != BT_CONNECTED) {
+ /* Ignore logical link if channel is on BR/EDR */
+ if (chan->local_amp_id != AMP_ID_BREDR)
+ l2cap_logical_finish_create(chan, hchan);
+ } else {
+ l2cap_logical_finish_move(chan, hchan);
+ }
+}
+
+void l2cap_move_start(struct l2cap_chan *chan)
+{
+ BT_DBG("chan %p", chan);
+
+ if (chan->local_amp_id == AMP_ID_BREDR) {
+ if (chan->chan_policy != BT_CHANNEL_POLICY_AMP_PREFERRED)
+ return;
+ chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
+ chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
+ /* Placeholder - start physical link setup */
+ } else {
+ chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
+ chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
+ chan->move_id = 0;
+ l2cap_move_setup(chan);
+ l2cap_send_move_chan_req(chan, 0);
+ }
+}
+
+static void l2cap_do_create(struct l2cap_chan *chan, int result,
+ u8 local_amp_id, u8 remote_amp_id)
+{
+ BT_DBG("chan %p state %s %u -> %u", chan, state_to_string(chan->state),
+ local_amp_id, remote_amp_id);
+
+ chan->fcs = L2CAP_FCS_NONE;
+
+ /* Outgoing channel on AMP */
+ if (chan->state == BT_CONNECT) {
+ if (result == L2CAP_CR_SUCCESS) {
+ chan->local_amp_id = local_amp_id;
+ l2cap_send_create_chan_req(chan, remote_amp_id);
+ } else {
+ /* Revert to BR/EDR connect */
+ l2cap_send_conn_req(chan);
+ }
+
+ return;
+ }
+
+ /* Incoming channel on AMP */
+ if (__l2cap_no_conn_pending(chan)) {
+ struct l2cap_conn_rsp rsp;
+ char buf[128];
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+
+ if (result == L2CAP_CR_SUCCESS) {
+ /* Send successful response */
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ } else {
+ /* Send negative response */
+ rsp.result = cpu_to_le16(L2CAP_CR_NO_MEM);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ }
+
+ l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_RSP,
+ sizeof(rsp), &rsp);
+
+ if (result == L2CAP_CR_SUCCESS) {
+ l2cap_state_change(chan, BT_CONFIG);
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn),
+ L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf), buf);
+ chan->num_conf_req++;
+ }
+ }
+}
+
+static void l2cap_do_move_initiate(struct l2cap_chan *chan, u8 local_amp_id,
+ u8 remote_amp_id)
+{
+ l2cap_move_setup(chan);
+ chan->move_id = local_amp_id;
+ chan->move_state = L2CAP_MOVE_WAIT_RSP;
+
+ l2cap_send_move_chan_req(chan, remote_amp_id);
+}
+
+static void l2cap_do_move_respond(struct l2cap_chan *chan, int result)
+{
+ struct hci_chan *hchan = NULL;
+
+ /* Placeholder - get hci_chan for logical link */
+
+ if (hchan) {
+ if (hchan->state == BT_CONNECTED) {
+ /* Logical link is ready to go */
+ chan->hs_hcon = hchan->conn;
+ chan->hs_hcon->l2cap_data = chan->conn;
+ chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
+ l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
+
+ l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
+ } else {
+ /* Wait for logical link to be ready */
+ chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
+ }
+ } else {
+ /* Logical link not available */
+ l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_ALLOWED);
+ }
+}
+
+static void l2cap_do_move_cancel(struct l2cap_chan *chan, int result)
+{
+ if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
+ u8 rsp_result;
+ if (result == -EINVAL)
+ rsp_result = L2CAP_MR_BAD_ID;
+ else
+ rsp_result = L2CAP_MR_NOT_ALLOWED;
+
+ l2cap_send_move_chan_rsp(chan, rsp_result);
+ }
+
+ chan->move_role = L2CAP_MOVE_ROLE_NONE;
+ chan->move_state = L2CAP_MOVE_STABLE;
+
+ /* Restart data transmission */
+ l2cap_ertm_send(chan);
+}
+
+/* Invoke with locked chan */
+void __l2cap_physical_cfm(struct l2cap_chan *chan, int result)
+{
+ u8 local_amp_id = chan->local_amp_id;
+ u8 remote_amp_id = chan->remote_amp_id;
+
+ BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d",
+ chan, result, local_amp_id, remote_amp_id);
+
+ if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) {
+ l2cap_chan_unlock(chan);
+ return;
+ }
+
+ if (chan->state != BT_CONNECTED) {
+ l2cap_do_create(chan, result, local_amp_id, remote_amp_id);
+ } else if (result != L2CAP_MR_SUCCESS) {
+ l2cap_do_move_cancel(chan, result);
+ } else {
+ switch (chan->move_role) {
+ case L2CAP_MOVE_ROLE_INITIATOR:
+ l2cap_do_move_initiate(chan, local_amp_id,
+ remote_amp_id);
+ break;
+ case L2CAP_MOVE_ROLE_RESPONDER:
+ l2cap_do_move_respond(chan, result);
+ break;
+ default:
+ l2cap_do_move_cancel(chan, result);
+ break;
+ }
+ }
+}
+
+static inline int l2cap_move_channel_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, void *data)
+{
+ struct l2cap_move_chan_req *req = data;
+ struct l2cap_move_chan_rsp rsp;
+ struct l2cap_chan *chan;
+ u16 icid = 0;
+ u16 result = L2CAP_MR_NOT_ALLOWED;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ icid = le16_to_cpu(req->icid);
+
+ BT_DBG("icid 0x%4.4x, dest_amp_id %d", icid, req->dest_amp_id);
+
+ if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
+ return -EINVAL;
+
+ chan = l2cap_get_chan_by_dcid(conn, icid);
+ if (!chan) {
+ rsp.icid = cpu_to_le16(icid);
+ rsp.result = cpu_to_le16(L2CAP_MR_NOT_ALLOWED);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_MOVE_CHAN_RSP,
+ sizeof(rsp), &rsp);
+ return 0;
+ }
+
+ chan->ident = cmd->ident;
+
+ if (chan->scid < L2CAP_CID_DYN_START ||
+ chan->chan_policy == BT_CHANNEL_POLICY_BREDR_ONLY ||
+ (chan->mode != L2CAP_MODE_ERTM &&
+ chan->mode != L2CAP_MODE_STREAMING)) {
+ result = L2CAP_MR_NOT_ALLOWED;
+ goto send_move_response;
+ }
+
+ if (chan->local_amp_id == req->dest_amp_id) {
+ result = L2CAP_MR_SAME_ID;
+ goto send_move_response;
+ }
+
+ if (req->dest_amp_id != AMP_ID_BREDR) {
+ struct hci_dev *hdev;
+ hdev = hci_dev_get(req->dest_amp_id);
+ if (!hdev || hdev->dev_type != HCI_AMP ||
+ !test_bit(HCI_UP, &hdev->flags)) {
+ if (hdev)
+ hci_dev_put(hdev);
+
+ result = L2CAP_MR_BAD_ID;
+ goto send_move_response;
+ }
+ hci_dev_put(hdev);
+ }
+
+ /* Detect a move collision. Only send a collision response
+ * if this side has "lost", otherwise proceed with the move.
+ * The winner has the larger bd_addr.
+ */
+ if ((__chan_is_moving(chan) ||
+ chan->move_role != L2CAP_MOVE_ROLE_NONE) &&
+ bacmp(&conn->hcon->src, &conn->hcon->dst) > 0) {
+ result = L2CAP_MR_COLLISION;
+ goto send_move_response;
+ }
+
+ chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
+ l2cap_move_setup(chan);
+ chan->move_id = req->dest_amp_id;
+ icid = chan->dcid;
+
+ if (req->dest_amp_id == AMP_ID_BREDR) {
+ /* Moving to BR/EDR */
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
+ result = L2CAP_MR_PEND;
+ } else {
+ chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
+ result = L2CAP_MR_SUCCESS;
+ }
+ } else {
+ chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
+ /* Placeholder - uncomment when amp functions are available */
+ /*amp_accept_physical(chan, req->dest_amp_id);*/
+ result = L2CAP_MR_PEND;
+ }
+
+send_move_response:
+ l2cap_send_move_chan_rsp(chan, result);
+
+ l2cap_chan_unlock(chan);
+
+ return 0;
+}
+
+static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result)
+{
+ struct l2cap_chan *chan;
+ struct hci_chan *hchan = NULL;
+
+ chan = l2cap_get_chan_by_scid(conn, icid);
+ if (!chan) {
+ l2cap_send_move_chan_cfm_icid(conn, icid);
+ return;
+ }
+
+ __clear_chan_timer(chan);
+ if (result == L2CAP_MR_PEND)
+ __set_chan_timer(chan, L2CAP_MOVE_ERTX_TIMEOUT);
+
+ switch (chan->move_state) {
+ case L2CAP_MOVE_WAIT_LOGICAL_COMP:
+ /* Move confirm will be sent when logical link
+ * is complete.
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
+ break;
+ case L2CAP_MOVE_WAIT_RSP_SUCCESS:
+ if (result == L2CAP_MR_PEND) {
+ break;
+ } else if (test_bit(CONN_LOCAL_BUSY,
+ &chan->conn_state)) {
+ chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
+ } else {
+ /* Logical link is up or moving to BR/EDR,
+ * proceed with move
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
+ }
+ break;
+ case L2CAP_MOVE_WAIT_RSP:
+ /* Moving to AMP */
+ if (result == L2CAP_MR_SUCCESS) {
+ /* Remote is ready, send confirm immediately
+ * after logical link is ready
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
+ } else {
+ /* Both logical link and move success
+ * are required to confirm
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_COMP;
+ }
+
+ /* Placeholder - get hci_chan for logical link */
+ if (!hchan) {
+ /* Logical link not available */
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
+ break;
+ }
+
+ /* If the logical link is not yet connected, do not
+ * send confirmation.
+ */
+ if (hchan->state != BT_CONNECTED)
+ break;
+
+ /* Logical link is already ready to go */
+
+ chan->hs_hcon = hchan->conn;
+ chan->hs_hcon->l2cap_data = chan->conn;
+
+ if (result == L2CAP_MR_SUCCESS) {
+ /* Can confirm now */
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
+ } else {
+ /* Now only need move success
+ * to confirm
+ */
+ chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
+ }
+
+ l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
+ break;
+ default:
+ /* Any other amp move state means the move failed. */
+ chan->move_id = chan->local_amp_id;
+ l2cap_move_done(chan);
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
+ }
+
+ l2cap_chan_unlock(chan);
+}
+
+static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
+ u16 result)
+{
+ struct l2cap_chan *chan;
+
+ chan = l2cap_get_chan_by_ident(conn, ident);
+ if (!chan) {
+ /* Could not locate channel, icid is best guess */
+ l2cap_send_move_chan_cfm_icid(conn, icid);
+ return;
+ }
+
+ __clear_chan_timer(chan);
+
+ if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
+ if (result == L2CAP_MR_COLLISION) {
+ chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
+ } else {
+ /* Cleanup - cancel move */
+ chan->move_id = chan->local_amp_id;
+ l2cap_move_done(chan);
+ }
+ }
+
+ l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
+
+ l2cap_chan_unlock(chan);
+}
+
+static int l2cap_move_channel_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, void *data)
+{
+ struct l2cap_move_chan_rsp *rsp = data;
+ u16 icid, result;
+
+ if (cmd_len != sizeof(*rsp))
+ return -EPROTO;
+
+ icid = le16_to_cpu(rsp->icid);
+ result = le16_to_cpu(rsp->result);
+
+ BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
+
+ if (result == L2CAP_MR_SUCCESS || result == L2CAP_MR_PEND)
+ l2cap_move_continue(conn, icid, result);
+ else
+ l2cap_move_fail(conn, cmd->ident, icid, result);
+
+ return 0;
+}
+
+static int l2cap_move_channel_confirm(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, void *data)
+{
+ struct l2cap_move_chan_cfm *cfm = data;
+ struct l2cap_chan *chan;
+ u16 icid, result;
+
+ if (cmd_len != sizeof(*cfm))
+ return -EPROTO;
+
+ icid = le16_to_cpu(cfm->icid);
+ result = le16_to_cpu(cfm->result);
+
+ BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
+
+ chan = l2cap_get_chan_by_dcid(conn, icid);
+ if (!chan) {
+ /* Spec requires a response even if the icid was not found */
+ l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
+ return 0;
+ }
+
+ if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM) {
+ if (result == L2CAP_MC_CONFIRMED) {
+ chan->local_amp_id = chan->move_id;
+ if (chan->local_amp_id == AMP_ID_BREDR)
+ __release_logical_link(chan);
+ } else {
+ chan->move_id = chan->local_amp_id;
+ }
+
+ l2cap_move_done(chan);
+ }
+
+ l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
+
+ l2cap_chan_unlock(chan);
+
+ return 0;
+}
+
+static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, void *data)
+{
+ struct l2cap_move_chan_cfm_rsp *rsp = data;
+ struct l2cap_chan *chan;
+ u16 icid;
+
+ if (cmd_len != sizeof(*rsp))
+ return -EPROTO;
+
+ icid = le16_to_cpu(rsp->icid);
+
+ BT_DBG("icid 0x%4.4x", icid);
+
+ chan = l2cap_get_chan_by_scid(conn, icid);
+ if (!chan)
+ return 0;
+
+ __clear_chan_timer(chan);
+
+ if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM_RSP) {
+ chan->local_amp_id = chan->move_id;
+
+ if (chan->local_amp_id == AMP_ID_BREDR && chan->hs_hchan)
+ __release_logical_link(chan);
+
+ l2cap_move_done(chan);
+ }
+
+ l2cap_chan_unlock(chan);
+
+ return 0;
+}
+
+static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd,
+ u16 cmd_len, u8 *data)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_conn_param_update_req *req;
+ struct l2cap_conn_param_update_rsp rsp;
+ u16 min, max, latency, to_multiplier;
+ int err;
+
+ if (hcon->role != HCI_ROLE_MASTER)
+ return -EINVAL;
+
+ if (cmd_len != sizeof(struct l2cap_conn_param_update_req))
+ return -EPROTO;
+
+ req = (struct l2cap_conn_param_update_req *) data;
+ min = __le16_to_cpu(req->min);
+ max = __le16_to_cpu(req->max);
+ latency = __le16_to_cpu(req->latency);
+ to_multiplier = __le16_to_cpu(req->to_multiplier);
+
+ BT_DBG("min 0x%4.4x max 0x%4.4x latency: 0x%4.4x Timeout: 0x%4.4x",
+ min, max, latency, to_multiplier);
+
+ memset(&rsp, 0, sizeof(rsp));
+
+ err = hci_check_conn_params(min, max, latency, to_multiplier);
+ if (err)
+ rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
+ else
+ rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP,
+ sizeof(rsp), &rsp);
+
+ if (!err) {
+ u8 store_hint;
+
+ store_hint = hci_le_conn_update(hcon, min, max, latency,
+ to_multiplier);
+ mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type,
+ store_hint, min, max, latency,
+ to_multiplier);
+
+ }
+
+ return 0;
+}
+
+static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_conn_rsp *rsp = (struct l2cap_le_conn_rsp *) data;
+ struct hci_conn *hcon = conn->hcon;
+ u16 dcid, mtu, mps, credits, result;
+ struct l2cap_chan *chan;
+ int err, sec_level;
+
+ if (cmd_len < sizeof(*rsp))
+ return -EPROTO;
+
+ dcid = __le16_to_cpu(rsp->dcid);
+ mtu = __le16_to_cpu(rsp->mtu);
+ mps = __le16_to_cpu(rsp->mps);
+ credits = __le16_to_cpu(rsp->credits);
+ result = __le16_to_cpu(rsp->result);
+
+ if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23))
+ return -EPROTO;
+
+ BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
+ dcid, mtu, mps, credits, result);
+
+ mutex_lock(&conn->chan_lock);
+
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan) {
+ err = -EBADSLT;
+ goto unlock;
+ }
+
+ err = 0;
+
+ l2cap_chan_lock(chan);
+
+ switch (result) {
+ case L2CAP_CR_SUCCESS:
+ chan->ident = 0;
+ chan->dcid = dcid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+ chan->tx_credits = credits;
+ l2cap_chan_ready(chan);
+ break;
+
+ case L2CAP_CR_AUTHENTICATION:
+ case L2CAP_CR_ENCRYPTION:
+ /* If we already have MITM protection we can't do
+ * anything.
+ */
+ if (hcon->sec_level > BT_SECURITY_MEDIUM) {
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ sec_level = hcon->sec_level + 1;
+ if (chan->sec_level < sec_level)
+ chan->sec_level = sec_level;
+
+ /* We'll need to send a new Connect Request */
+ clear_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags);
+
+ smp_conn_security(hcon, chan->sec_level);
+ break;
+
+ default:
+ l2cap_chan_del(chan, ECONNREFUSED);
+ break;
+ }
+
+ l2cap_chan_unlock(chan);
+
+unlock:
+ mutex_unlock(&conn->chan_lock);
+
+ return err;
+}
+
+static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ int err = 0;
+
+ switch (cmd->code) {
+ case L2CAP_COMMAND_REJ:
+ l2cap_command_rej(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_REQ:
+ err = l2cap_connect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_RSP:
+ case L2CAP_CREATE_CHAN_RSP:
+ l2cap_connect_create_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONF_REQ:
+ err = l2cap_config_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONF_RSP:
+ l2cap_config_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_REQ:
+ err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_RSP:
+ l2cap_disconnect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECHO_REQ:
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_ECHO_RSP, cmd_len, data);
+ break;
+
+ case L2CAP_ECHO_RSP:
+ break;
+
+ case L2CAP_INFO_REQ:
+ err = l2cap_information_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_INFO_RSP:
+ l2cap_information_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CREATE_CHAN_REQ:
+ err = l2cap_create_channel_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_MOVE_CHAN_REQ:
+ err = l2cap_move_channel_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_MOVE_CHAN_RSP:
+ l2cap_move_channel_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_MOVE_CHAN_CFM:
+ err = l2cap_move_channel_confirm(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_MOVE_CHAN_CFM_RSP:
+ l2cap_move_channel_confirm_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ default:
+ BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code);
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+static int l2cap_le_connect_req(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_conn_req *req = (struct l2cap_le_conn_req *) data;
+ struct l2cap_le_conn_rsp rsp;
+ struct l2cap_chan *chan, *pchan;
+ u16 dcid, scid, credits, mtu, mps;
+ __le16 psm;
+ u8 result;
+
+ if (cmd_len != sizeof(*req))
+ return -EPROTO;
+
+ scid = __le16_to_cpu(req->scid);
+ mtu = __le16_to_cpu(req->mtu);
+ mps = __le16_to_cpu(req->mps);
+ psm = req->psm;
+ dcid = 0;
+ credits = 0;
+
+ if (mtu < 23 || mps < 23)
+ return -EPROTO;
+
+ BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm),
+ scid, mtu, mps);
+
+ /* Check if we have socket listening on psm */
+ pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+ &conn->hcon->dst, LE_LINK);
+ if (!pchan) {
+ result = L2CAP_CR_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
+ mutex_lock(&conn->chan_lock);
+ l2cap_chan_lock(pchan);
+
+ if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
+ SMP_ALLOW_STK)) {
+ result = L2CAP_CR_AUTHENTICATION;
+ chan = NULL;
+ goto response_unlock;
+ }
+
+ /* Check if we already have channel with that dcid */
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_NO_MEM;
+ chan = NULL;
+ goto response_unlock;
+ }
+
+ chan = pchan->ops->new_connection(pchan);
+ if (!chan) {
+ result = L2CAP_CR_NO_MEM;
+ goto response_unlock;
+ }
+
+ l2cap_le_flowctl_init(chan);
+
+ bacpy(&chan->src, &conn->hcon->src);
+ bacpy(&chan->dst, &conn->hcon->dst);
+ chan->src_type = bdaddr_src_type(conn->hcon);
+ chan->dst_type = bdaddr_dst_type(conn->hcon);
+ chan->psm = psm;
+ chan->dcid = scid;
+ chan->omtu = mtu;
+ chan->remote_mps = mps;
+ chan->tx_credits = __le16_to_cpu(req->credits);
+
+ __l2cap_chan_add(conn, chan);
+ dcid = chan->scid;
+ credits = chan->rx_credits;
+
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ chan->ident = cmd->ident;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ l2cap_state_change(chan, BT_CONNECT2);
+ /* The following result value is actually not defined
+ * for LE CoC but we use it to let the function know
+ * that it should bail out after doing its cleanup
+ * instead of sending a response.
+ */
+ result = L2CAP_CR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_chan_ready(chan);
+ result = L2CAP_CR_SUCCESS;
+ }
+
+response_unlock:
+ l2cap_chan_unlock(pchan);
+ mutex_unlock(&conn->chan_lock);
+ l2cap_chan_put(pchan);
+
+ if (result == L2CAP_CR_PEND)
+ return 0;
+
+response:
+ if (chan) {
+ rsp.mtu = cpu_to_le16(chan->imtu);
+ rsp.mps = cpu_to_le16(chan->mps);
+ } else {
+ rsp.mtu = 0;
+ rsp.mps = 0;
+ }
+
+ rsp.dcid = cpu_to_le16(dcid);
+ rsp.credits = cpu_to_le16(credits);
+ rsp.result = cpu_to_le16(result);
+
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), &rsp);
+
+ return 0;
+}
+
+static inline int l2cap_le_credits(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_le_credits *pkt;
+ struct l2cap_chan *chan;
+ u16 cid, credits, max_credits;
+
+ if (cmd_len != sizeof(*pkt))
+ return -EPROTO;
+
+ pkt = (struct l2cap_le_credits *) data;
+ cid = __le16_to_cpu(pkt->cid);
+ credits = __le16_to_cpu(pkt->credits);
+
+ BT_DBG("cid 0x%4.4x credits 0x%4.4x", cid, credits);
+
+ chan = l2cap_get_chan_by_dcid(conn, cid);
+ if (!chan)
+ return -EBADSLT;
+
+ max_credits = LE_FLOWCTL_MAX_CREDITS - chan->tx_credits;
+ if (credits > max_credits) {
+ BT_ERR("LE credits overflow");
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ l2cap_chan_unlock(chan);
+
+ /* Return 0 so that we don't trigger an unnecessary
+ * command reject packet.
+ */
+ return 0;
+ }
+
+ chan->tx_credits += credits;
+
+ while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) {
+ l2cap_do_send(chan, skb_dequeue(&chan->tx_q));
+ chan->tx_credits--;
+ }
+
+ if (chan->tx_credits)
+ chan->ops->resume(chan);
+
+ l2cap_chan_unlock(chan);
+
+ return 0;
+}
+
+static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ struct l2cap_cmd_rej_unk *rej = (struct l2cap_cmd_rej_unk *) data;
+ struct l2cap_chan *chan;
+
+ if (cmd_len < sizeof(*rej))
+ return -EPROTO;
+
+ mutex_lock(&conn->chan_lock);
+
+ chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
+ if (!chan)
+ goto done;
+
+ l2cap_chan_lock(chan);
+ l2cap_chan_del(chan, ECONNREFUSED);
+ l2cap_chan_unlock(chan);
+
+done:
+ mutex_unlock(&conn->chan_lock);
+ return 0;
+}
+
+static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
+ struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+ u8 *data)
+{
+ int err = 0;
+
+ switch (cmd->code) {
+ case L2CAP_COMMAND_REJ:
+ l2cap_le_command_rej(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_PARAM_UPDATE_REQ:
+ err = l2cap_conn_param_update_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_CONN_PARAM_UPDATE_RSP:
+ break;
+
+ case L2CAP_LE_CONN_RSP:
+ l2cap_le_connect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_LE_CONN_REQ:
+ err = l2cap_le_connect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_LE_CREDITS:
+ err = l2cap_le_credits(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_REQ:
+ err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_DISCONN_RSP:
+ l2cap_disconnect_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ default:
+ BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code);
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+static inline void l2cap_le_sig_channel(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_cmd_hdr *cmd;
+ u16 len;
+ int err;
+
+ if (hcon->type != LE_LINK)
+ goto drop;
+
+ if (skb->len < L2CAP_CMD_HDR_SIZE)
+ goto drop;
+
+ cmd = (void *) skb->data;
+ skb_pull(skb, L2CAP_CMD_HDR_SIZE);
+
+ len = le16_to_cpu(cmd->len);
+
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len, cmd->ident);
+
+ if (len != skb->len || !cmd->ident) {
+ BT_DBG("corrupted command");
+ goto drop;
+ }
+
+ err = l2cap_le_sig_cmd(conn, cmd, len, skb->data);
+ if (err) {
+ struct l2cap_cmd_rej_unk rej;
+
+ BT_ERR("Wrong link type (%d)", err);
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
+ sizeof(rej), &rej);
+ }
+
+drop:
+ kfree_skb(skb);
+}
+
+static inline void l2cap_sig_channel(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ u8 *data = skb->data;
+ int len = skb->len;
+ struct l2cap_cmd_hdr cmd;
+ int err;
+
+ l2cap_raw_recv(conn, skb);
+
+ if (hcon->type != ACL_LINK)
+ goto drop;
+
+ while (len >= L2CAP_CMD_HDR_SIZE) {
+ u16 cmd_len;
+ memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE);
+ data += L2CAP_CMD_HDR_SIZE;
+ len -= L2CAP_CMD_HDR_SIZE;
+
+ cmd_len = le16_to_cpu(cmd.len);
+
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len,
+ cmd.ident);
+
+ if (cmd_len > len || !cmd.ident) {
+ BT_DBG("corrupted command");
+ break;
+ }
+
+ err = l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, data);
+ if (err) {
+ struct l2cap_cmd_rej_unk rej;
+
+ BT_ERR("Wrong link type (%d)", err);
+
+ rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
+ l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ,
+ sizeof(rej), &rej);
+ }
+
+ data += cmd_len;
+ len -= cmd_len;
+ }
+
+drop:
+ kfree_skb(skb);
+}
+
+static int l2cap_check_fcs(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ u16 our_fcs, rcv_fcs;
+ int hdr_size;
+
+ if (test_bit(FLAG_EXT_CTRL, &chan->flags))
+ hdr_size = L2CAP_EXT_HDR_SIZE;
+ else
+ hdr_size = L2CAP_ENH_HDR_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16) {
+ skb_trim(skb, skb->len - L2CAP_FCS_SIZE);
+ rcv_fcs = get_unaligned_le16(skb->data + skb->len);
+ our_fcs = crc16(0, skb->data - hdr_size, skb->len + hdr_size);
+
+ if (our_fcs != rcv_fcs)
+ return -EBADMSG;
+ }
+ return 0;
+}
+
+static void l2cap_send_i_or_rr_or_rnr(struct l2cap_chan *chan)
+{
+ struct l2cap_ctrl control;
+
+ BT_DBG("chan %p", chan);
+
+ memset(&control, 0, sizeof(control));
+ control.sframe = 1;
+ control.final = 1;
+ control.reqseq = chan->buffer_seq;
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ control.super = L2CAP_SUPER_RNR;
+ l2cap_send_sframe(chan, &control);
+ }
+
+ if (test_and_clear_bit(CONN_REMOTE_BUSY, &chan->conn_state) &&
+ chan->unacked_frames > 0)
+ __set_retrans_timer(chan);
+
+ /* Send pending iframes */
+ l2cap_ertm_send(chan);
+
+ if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state) &&
+ test_bit(CONN_SEND_FBIT, &chan->conn_state)) {
+ /* F-bit wasn't sent in an s-frame or i-frame yet, so
+ * send it now.
+ */
+ control.super = L2CAP_SUPER_RR;
+ l2cap_send_sframe(chan, &control);
+ }
+}
+
+static void append_skb_frag(struct sk_buff *skb, struct sk_buff *new_frag,
+ struct sk_buff **last_frag)
+{
+ /* skb->len reflects data in skb as well as all fragments
+ * skb->data_len reflects only data in fragments
+ */
+ if (!skb_has_frag_list(skb))
+ skb_shinfo(skb)->frag_list = new_frag;
+
+ new_frag->next = NULL;
+
+ (*last_frag)->next = new_frag;
+ *last_frag = new_frag;
+
+ skb->len += new_frag->len;
+ skb->data_len += new_frag->len;
+ skb->truesize += new_frag->truesize;
+}
+
+static int l2cap_reassemble_sdu(struct l2cap_chan *chan, struct sk_buff *skb,
+ struct l2cap_ctrl *control)
+{
+ int err = -EINVAL;
+
+ switch (control->sar) {
+ case L2CAP_SAR_UNSEGMENTED:
+ if (chan->sdu)
+ break;
+
+ err = chan->ops->recv(chan, skb);
+ break;
+
+ case L2CAP_SAR_START:
+ if (chan->sdu)
+ break;
+
+ chan->sdu_len = get_unaligned_le16(skb->data);
+ skb_pull(skb, L2CAP_SDULEN_SIZE);
+
+ if (chan->sdu_len > chan->imtu) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ if (skb->len >= chan->sdu_len)
+ break;
+
+ chan->sdu = skb;
+ chan->sdu_last_frag = skb;
+
+ skb = NULL;
+ err = 0;
+ break;
+
+ case L2CAP_SAR_CONTINUE:
+ if (!chan->sdu)
+ break;
+
+ append_skb_frag(chan->sdu, skb,
+ &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len >= chan->sdu_len)
+ break;
+
+ err = 0;
+ break;
+
+ case L2CAP_SAR_END:
+ if (!chan->sdu)
+ break;
+
+ append_skb_frag(chan->sdu, skb,
+ &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len != chan->sdu_len)
+ break;
+
+ err = chan->ops->recv(chan, chan->sdu);
+
+ if (!err) {
+ /* Reassembly complete */
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+ break;
+ }
+
+ if (err) {
+ kfree_skb(skb);
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+
+ return err;
+}
+
+static int l2cap_resegment(struct l2cap_chan *chan)
+{
+ /* Placeholder */
+ return 0;
+}
+
+void l2cap_chan_busy(struct l2cap_chan *chan, int busy)
+{
+ u8 event;
+
+ if (chan->mode != L2CAP_MODE_ERTM)
+ return;
+
+ event = busy ? L2CAP_EV_LOCAL_BUSY_DETECTED : L2CAP_EV_LOCAL_BUSY_CLEAR;
+ l2cap_tx(chan, NULL, NULL, event);
+}
+
+static int l2cap_rx_queued_iframes(struct l2cap_chan *chan)
+{
+ int err = 0;
+ /* Pass sequential frames to l2cap_reassemble_sdu()
+ * until a gap is encountered.
+ */
+
+ BT_DBG("chan %p", chan);
+
+ while (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ struct sk_buff *skb;
+ BT_DBG("Searching for skb with txseq %d (queue len %d)",
+ chan->buffer_seq, skb_queue_len(&chan->srej_q));
+
+ skb = l2cap_ertm_seq_in_queue(&chan->srej_q, chan->buffer_seq);
+
+ if (!skb)
+ break;
+
+ skb_unlink(skb, &chan->srej_q);
+ chan->buffer_seq = __next_seq(chan, chan->buffer_seq);
+ err = l2cap_reassemble_sdu(chan, skb, &bt_cb(skb)->l2cap);
+ if (err)
+ break;
+ }
+
+ if (skb_queue_empty(&chan->srej_q)) {
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ l2cap_send_ack(chan);
+ }
+
+ return err;
+}
+
+static void l2cap_handle_srej(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->reqseq == chan->next_tx_seq) {
+ BT_DBG("Invalid reqseq %d, disconnecting", control->reqseq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, control->reqseq);
+
+ if (skb == NULL) {
+ BT_DBG("Seq %d not available for retransmission",
+ control->reqseq);
+ return;
+ }
+
+ if (chan->max_tx != 0 && bt_cb(skb)->l2cap.retries >= chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (control->poll) {
+ l2cap_pass_to_tx(chan, control);
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_retransmit(chan, control);
+ l2cap_ertm_send(chan);
+
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F) {
+ set_bit(CONN_SREJ_ACT, &chan->conn_state);
+ chan->srej_save_reqseq = control->reqseq;
+ }
+ } else {
+ l2cap_pass_to_tx_fbit(chan, control);
+
+ if (control->final) {
+ if (chan->srej_save_reqseq != control->reqseq ||
+ !test_and_clear_bit(CONN_SREJ_ACT,
+ &chan->conn_state))
+ l2cap_retransmit(chan, control);
+ } else {
+ l2cap_retransmit(chan, control);
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F) {
+ set_bit(CONN_SREJ_ACT, &chan->conn_state);
+ chan->srej_save_reqseq = control->reqseq;
+ }
+ }
+ }
+}
+
+static void l2cap_handle_rej(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control)
+{
+ struct sk_buff *skb;
+
+ BT_DBG("chan %p, control %p", chan, control);
+
+ if (control->reqseq == chan->next_tx_seq) {
+ BT_DBG("Invalid reqseq %d, disconnecting", control->reqseq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ skb = l2cap_ertm_seq_in_queue(&chan->tx_q, control->reqseq);
+
+ if (chan->max_tx && skb &&
+ bt_cb(skb)->l2cap.retries >= chan->max_tx) {
+ BT_DBG("Retry limit exceeded (%d)", chan->max_tx);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return;
+ }
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ l2cap_pass_to_tx(chan, control);
+
+ if (control->final) {
+ if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state))
+ l2cap_retransmit_all(chan, control);
+ } else {
+ l2cap_retransmit_all(chan, control);
+ l2cap_ertm_send(chan);
+ if (chan->tx_state == L2CAP_TX_STATE_WAIT_F)
+ set_bit(CONN_REJ_ACT, &chan->conn_state);
+ }
+}
+
+static u8 l2cap_classify_txseq(struct l2cap_chan *chan, u16 txseq)
+{
+ BT_DBG("chan %p, txseq %d", chan, txseq);
+
+ BT_DBG("last_acked_seq %d, expected_tx_seq %d", chan->last_acked_seq,
+ chan->expected_tx_seq);
+
+ if (chan->rx_state == L2CAP_RX_STATE_SREJ_SENT) {
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >=
+ chan->tx_win) {
+ /* See notes below regarding "double poll" and
+ * invalid packets.
+ */
+ if (chan->tx_win <= ((chan->tx_win_max + 1) >> 1)) {
+ BT_DBG("Invalid/Ignore - after SREJ");
+ return L2CAP_TXSEQ_INVALID_IGNORE;
+ } else {
+ BT_DBG("Invalid - in window after SREJ sent");
+ return L2CAP_TXSEQ_INVALID;
+ }
+ }
+
+ if (chan->srej_list.head == txseq) {
+ BT_DBG("Expected SREJ");
+ return L2CAP_TXSEQ_EXPECTED_SREJ;
+ }
+
+ if (l2cap_ertm_seq_in_queue(&chan->srej_q, txseq)) {
+ BT_DBG("Duplicate SREJ - txseq already stored");
+ return L2CAP_TXSEQ_DUPLICATE_SREJ;
+ }
+
+ if (l2cap_seq_list_contains(&chan->srej_list, txseq)) {
+ BT_DBG("Unexpected SREJ - not requested");
+ return L2CAP_TXSEQ_UNEXPECTED_SREJ;
+ }
+ }
+
+ if (chan->expected_tx_seq == txseq) {
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >=
+ chan->tx_win) {
+ BT_DBG("Invalid - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID;
+ } else {
+ BT_DBG("Expected");
+ return L2CAP_TXSEQ_EXPECTED;
+ }
+ }
+
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) <
+ __seq_offset(chan, chan->expected_tx_seq, chan->last_acked_seq)) {
+ BT_DBG("Duplicate - expected_tx_seq later than txseq");
+ return L2CAP_TXSEQ_DUPLICATE;
+ }
+
+ if (__seq_offset(chan, txseq, chan->last_acked_seq) >= chan->tx_win) {
+ /* A source of invalid packets is a "double poll" condition,
+ * where delays cause us to send multiple poll packets. If
+ * the remote stack receives and processes both polls,
+ * sequence numbers can wrap around in such a way that a
+ * resent frame has a sequence number that looks like new data
+ * with a sequence gap. This would trigger an erroneous SREJ
+ * request.
+ *
+ * Fortunately, this is impossible with a tx window that's
+ * less than half of the maximum sequence number, which allows
+ * invalid frames to be safely ignored.
+ *
+ * With tx window sizes greater than half of the tx window
+ * maximum, the frame is invalid and cannot be ignored. This
+ * causes a disconnect.
+ */
+
+ if (chan->tx_win <= ((chan->tx_win_max + 1) >> 1)) {
+ BT_DBG("Invalid/Ignore - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID_IGNORE;
+ } else {
+ BT_DBG("Invalid - txseq outside tx window");
+ return L2CAP_TXSEQ_INVALID;
+ }
+ } else {
+ BT_DBG("Unexpected - txseq indicates missing frames");
+ return L2CAP_TXSEQ_UNEXPECTED;
+ }
+}
+
+static int l2cap_rx_state_recv(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err = 0;
+ bool skb_in_use = false;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_RECV_IFRAME:
+ switch (l2cap_classify_txseq(chan, control->txseq)) {
+ case L2CAP_TXSEQ_EXPECTED:
+ l2cap_pass_to_tx(chan, control);
+
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ BT_DBG("Busy, discarding expected seq %d",
+ control->txseq);
+ break;
+ }
+
+ chan->expected_tx_seq = __next_seq(chan,
+ control->txseq);
+
+ chan->buffer_seq = chan->expected_tx_seq;
+ skb_in_use = true;
+
+ err = l2cap_reassemble_sdu(chan, skb, control);
+ if (err)
+ break;
+
+ if (control->final) {
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
+ control->final = 0;
+ l2cap_retransmit_all(chan, control);
+ l2cap_ertm_send(chan);
+ }
+ }
+
+ if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state))
+ l2cap_send_ack(chan);
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED:
+ l2cap_pass_to_tx(chan, control);
+
+ /* Can't issue SREJ frames in the local busy state.
+ * Drop this frame, it will be seen as missing
+ * when local busy is exited.
+ */
+ if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+ BT_DBG("Busy, discarding unexpected seq %d",
+ control->txseq);
+ break;
+ }
+
+ /* There was a gap in the sequence, so an SREJ
+ * must be sent for each missing frame. The
+ * current frame is stored for later use.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ clear_bit(CONN_SREJ_ACT, &chan->conn_state);
+ l2cap_seq_list_clear(&chan->srej_list);
+ l2cap_send_srej(chan, control->txseq);
+
+ chan->rx_state = L2CAP_RX_STATE_SREJ_SENT;
+ break;
+ case L2CAP_TXSEQ_DUPLICATE:
+ l2cap_pass_to_tx(chan, control);
+ break;
+ case L2CAP_TXSEQ_INVALID_IGNORE:
+ break;
+ case L2CAP_TXSEQ_INVALID:
+ default:
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ break;
+ }
+ break;
+ case L2CAP_EV_RECV_RR:
+ l2cap_pass_to_tx(chan, control);
+ if (control->final) {
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state) &&
+ !__chan_is_moving(chan)) {
+ control->final = 0;
+ l2cap_retransmit_all(chan, control);
+ }
+
+ l2cap_ertm_send(chan);
+ } else if (control->poll) {
+ l2cap_send_i_or_rr_or_rnr(chan);
+ } else {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames)
+ __set_retrans_timer(chan);
+
+ l2cap_ertm_send(chan);
+ }
+ break;
+ case L2CAP_EV_RECV_RNR:
+ set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+ l2cap_pass_to_tx(chan, control);
+ if (control && control->poll) {
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_rr_or_rnr(chan, 0);
+ }
+ __clear_retrans_timer(chan);
+ l2cap_seq_list_clear(&chan->retrans_list);
+ break;
+ case L2CAP_EV_RECV_REJ:
+ l2cap_handle_rej(chan, control);
+ break;
+ case L2CAP_EV_RECV_SREJ:
+ l2cap_handle_srej(chan, control);
+ break;
+ default:
+ break;
+ }
+
+ if (skb && !skb_in_use) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+
+ return err;
+}
+
+static int l2cap_rx_state_srej_sent(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err = 0;
+ u16 txseq = control->txseq;
+ bool skb_in_use = false;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ switch (event) {
+ case L2CAP_EV_RECV_IFRAME:
+ switch (l2cap_classify_txseq(chan, txseq)) {
+ case L2CAP_TXSEQ_EXPECTED:
+ /* Keep frame for reassembly later */
+ l2cap_pass_to_tx(chan, control);
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ chan->expected_tx_seq = __next_seq(chan, txseq);
+ break;
+ case L2CAP_TXSEQ_EXPECTED_SREJ:
+ l2cap_seq_list_pop(&chan->srej_list);
+
+ l2cap_pass_to_tx(chan, control);
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ err = l2cap_rx_queued_iframes(chan);
+ if (err)
+ break;
+
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED:
+ /* Got a frame that can't be reassembled yet.
+ * Save it for later, and send SREJs to cover
+ * the missing frames.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ l2cap_pass_to_tx(chan, control);
+ l2cap_send_srej(chan, control->txseq);
+ break;
+ case L2CAP_TXSEQ_UNEXPECTED_SREJ:
+ /* This frame was requested with an SREJ, but
+ * some expected retransmitted frames are
+ * missing. Request retransmission of missing
+ * SREJ'd frames.
+ */
+ skb_queue_tail(&chan->srej_q, skb);
+ skb_in_use = true;
+ BT_DBG("Queued %p (queue len %d)", skb,
+ skb_queue_len(&chan->srej_q));
+
+ l2cap_pass_to_tx(chan, control);
+ l2cap_send_srej_list(chan, control->txseq);
+ break;
+ case L2CAP_TXSEQ_DUPLICATE_SREJ:
+ /* We've already queued this frame. Drop this copy. */
+ l2cap_pass_to_tx(chan, control);
+ break;
+ case L2CAP_TXSEQ_DUPLICATE:
+ /* Expecting a later sequence number, so this frame
+ * was already received. Ignore it completely.
+ */
+ break;
+ case L2CAP_TXSEQ_INVALID_IGNORE:
+ break;
+ case L2CAP_TXSEQ_INVALID:
+ default:
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ break;
+ }
+ break;
+ case L2CAP_EV_RECV_RR:
+ l2cap_pass_to_tx(chan, control);
+ if (control->final) {
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
+ control->final = 0;
+ l2cap_retransmit_all(chan, control);
+ }
+
+ l2cap_ertm_send(chan);
+ } else if (control->poll) {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames) {
+ __set_retrans_timer(chan);
+ }
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_srej_tail(chan);
+ } else {
+ if (test_and_clear_bit(CONN_REMOTE_BUSY,
+ &chan->conn_state) &&
+ chan->unacked_frames)
+ __set_retrans_timer(chan);
+
+ l2cap_send_ack(chan);
+ }
+ break;
+ case L2CAP_EV_RECV_RNR:
+ set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+ l2cap_pass_to_tx(chan, control);
+ if (control->poll) {
+ l2cap_send_srej_tail(chan);
+ } else {
+ struct l2cap_ctrl rr_control;
+ memset(&rr_control, 0, sizeof(rr_control));
+ rr_control.sframe = 1;
+ rr_control.super = L2CAP_SUPER_RR;
+ rr_control.reqseq = chan->buffer_seq;
+ l2cap_send_sframe(chan, &rr_control);
+ }
+
+ break;
+ case L2CAP_EV_RECV_REJ:
+ l2cap_handle_rej(chan, control);
+ break;
+ case L2CAP_EV_RECV_SREJ:
+ l2cap_handle_srej(chan, control);
+ break;
+ }
+
+ if (skb && !skb_in_use) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+
+ return err;
+}
+
+static int l2cap_finish_move(struct l2cap_chan *chan)
+{
+ BT_DBG("chan %p", chan);
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+
+ if (chan->hs_hcon)
+ chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
+ else
+ chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+
+ return l2cap_resegment(chan);
+}
+
+static int l2cap_rx_state_wait_p(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d", chan, control, skb,
+ event);
+
+ if (!control->poll)
+ return -EPROTO;
+
+ l2cap_process_reqseq(chan, control->reqseq);
+
+ if (!skb_queue_empty(&chan->tx_q))
+ chan->tx_send_head = skb_peek(&chan->tx_q);
+ else
+ chan->tx_send_head = NULL;
+
+ /* Rewind next_tx_seq to the point expected
+ * by the receiver.
+ */
+ chan->next_tx_seq = control->reqseq;
+ chan->unacked_frames = 0;
+
+ err = l2cap_finish_move(chan);
+ if (err)
+ return err;
+
+ set_bit(CONN_SEND_FBIT, &chan->conn_state);
+ l2cap_send_i_or_rr_or_rnr(chan);
+
+ if (event == L2CAP_EV_RECV_IFRAME)
+ return -EPROTO;
+
+ return l2cap_rx_state_recv(chan, control, NULL, event);
+}
+
+static int l2cap_rx_state_wait_f(struct l2cap_chan *chan,
+ struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err;
+
+ if (!control->final)
+ return -EPROTO;
+
+ clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+ chan->rx_state = L2CAP_RX_STATE_RECV;
+ l2cap_process_reqseq(chan, control->reqseq);
+
+ if (!skb_queue_empty(&chan->tx_q))
+ chan->tx_send_head = skb_peek(&chan->tx_q);
+ else
+ chan->tx_send_head = NULL;
+
+ /* Rewind next_tx_seq to the point expected
+ * by the receiver.
+ */
+ chan->next_tx_seq = control->reqseq;
+ chan->unacked_frames = 0;
+
+ if (chan->hs_hcon)
+ chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
+ else
+ chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+
+ err = l2cap_resegment(chan);
+
+ if (!err)
+ err = l2cap_rx_state_recv(chan, control, skb, event);
+
+ return err;
+}
+
+static bool __valid_reqseq(struct l2cap_chan *chan, u16 reqseq)
+{
+ /* Make sure reqseq is for a packet that has been sent but not acked */
+ u16 unacked;
+
+ unacked = __seq_offset(chan, chan->next_tx_seq, chan->expected_ack_seq);
+ return __seq_offset(chan, chan->next_tx_seq, reqseq) <= unacked;
+}
+
+static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff *skb, u8 event)
+{
+ int err = 0;
+
+ BT_DBG("chan %p, control %p, skb %p, event %d, state %d", chan,
+ control, skb, event, chan->rx_state);
+
+ if (__valid_reqseq(chan, control->reqseq)) {
+ switch (chan->rx_state) {
+ case L2CAP_RX_STATE_RECV:
+ err = l2cap_rx_state_recv(chan, control, skb, event);
+ break;
+ case L2CAP_RX_STATE_SREJ_SENT:
+ err = l2cap_rx_state_srej_sent(chan, control, skb,
+ event);
+ break;
+ case L2CAP_RX_STATE_WAIT_P:
+ err = l2cap_rx_state_wait_p(chan, control, skb, event);
+ break;
+ case L2CAP_RX_STATE_WAIT_F:
+ err = l2cap_rx_state_wait_f(chan, control, skb, event);
+ break;
+ default:
+ /* shut it down */
+ break;
+ }
+ } else {
+ BT_DBG("Invalid reqseq %d (next_tx_seq %d, expected_ack_seq %d",
+ control->reqseq, chan->next_tx_seq,
+ chan->expected_ack_seq);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ }
+
+ return err;
+}
+
+static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
+ struct sk_buff *skb)
+{
+ int err = 0;
+
+ BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
+ chan->rx_state);
+
+ if (l2cap_classify_txseq(chan, control->txseq) ==
+ L2CAP_TXSEQ_EXPECTED) {
+ l2cap_pass_to_tx(chan, control);
+
+ BT_DBG("buffer_seq %d->%d", chan->buffer_seq,
+ __next_seq(chan, chan->buffer_seq));
+
+ chan->buffer_seq = __next_seq(chan, chan->buffer_seq);
+
+ l2cap_reassemble_sdu(chan, skb, control);
+ } else {
+ if (chan->sdu) {
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ }
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+
+ if (skb) {
+ BT_DBG("Freeing %p", skb);
+ kfree_skb(skb);
+ }
+ }
+
+ chan->last_acked_seq = control->txseq;
+ chan->expected_tx_seq = __next_seq(chan, control->txseq);
+
+ return err;
+}
+
+static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct l2cap_ctrl *control = &bt_cb(skb)->l2cap;
+ u16 len;
+ u8 event;
+
+ __unpack_control(chan, skb);
+
+ len = skb->len;
+
+ /*
+ * We can just drop the corrupted I-frame here.
+ * Receiver will miss it and start proper recovery
+ * procedures and ask for retransmission.
+ */
+ if (l2cap_check_fcs(chan, skb))
+ goto drop;
+
+ if (!control->sframe && control->sar == L2CAP_SAR_START)
+ len -= L2CAP_SDULEN_SIZE;
+
+ if (chan->fcs == L2CAP_FCS_CRC16)
+ len -= L2CAP_FCS_SIZE;
+
+ if (len > chan->mps) {
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto drop;
+ }
+
+ if (!control->sframe) {
+ int err;
+
+ BT_DBG("iframe sar %d, reqseq %d, final %d, txseq %d",
+ control->sar, control->reqseq, control->final,
+ control->txseq);
+
+ /* Validate F-bit - F=0 always valid, F=1 only
+ * valid in TX WAIT_F
+ */
+ if (control->final && chan->tx_state != L2CAP_TX_STATE_WAIT_F)
+ goto drop;
+
+ if (chan->mode != L2CAP_MODE_STREAMING) {
+ event = L2CAP_EV_RECV_IFRAME;
+ err = l2cap_rx(chan, control, skb, event);
+ } else {
+ err = l2cap_stream_rx(chan, control, skb);
+ }
+
+ if (err)
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ } else {
+ const u8 rx_func_to_event[4] = {
+ L2CAP_EV_RECV_RR, L2CAP_EV_RECV_REJ,
+ L2CAP_EV_RECV_RNR, L2CAP_EV_RECV_SREJ
+ };
+
+ /* Only I-frames are expected in streaming mode */
+ if (chan->mode == L2CAP_MODE_STREAMING)
+ goto drop;
+
+ BT_DBG("sframe reqseq %d, final %d, poll %d, super %d",
+ control->reqseq, control->final, control->poll,
+ control->super);
+
+ if (len != 0) {
+ BT_ERR("Trailing bytes: %d in sframe", len);
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ goto drop;
+ }
+
+ /* Validate F and P bits */
+ if (control->final && (control->poll ||
+ chan->tx_state != L2CAP_TX_STATE_WAIT_F))
+ goto drop;
+
+ event = rx_func_to_event[control->super];
+ if (l2cap_rx(chan, control, skb, event))
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ }
+
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_le_credits pkt;
+ u16 return_credits;
+
+ /* We return more credits to the sender only after the amount of
+ * credits falls below half of the initial amount.
+ */
+ if (chan->rx_credits >= (le_max_credits + 1) / 2)
+ return;
+
+ return_credits = le_max_credits - chan->rx_credits;
+
+ BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
+
+ chan->rx_credits += return_credits;
+
+ pkt.cid = cpu_to_le16(chan->scid);
+ pkt.credits = cpu_to_le16(return_credits);
+
+ chan->ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
+}
+
+static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ if (!chan->rx_credits) {
+ BT_ERR("No credits to receive LE L2CAP data");
+ l2cap_send_disconn_req(chan, ECONNRESET);
+ return -ENOBUFS;
+ }
+
+ if (chan->imtu < skb->len) {
+ BT_ERR("Too big LE L2CAP PDU");
+ return -ENOBUFS;
+ }
+
+ chan->rx_credits--;
+ BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
+
+ l2cap_chan_le_send_credits(chan);
+
+ err = 0;
+
+ if (!chan->sdu) {
+ u16 sdu_len;
+
+ sdu_len = get_unaligned_le16(skb->data);
+ skb_pull(skb, L2CAP_SDULEN_SIZE);
+
+ BT_DBG("Start of new SDU. sdu_len %u skb->len %u imtu %u",
+ sdu_len, skb->len, chan->imtu);
+
+ if (sdu_len > chan->imtu) {
+ BT_ERR("Too big LE L2CAP SDU length received");
+ err = -EMSGSIZE;
+ goto failed;
+ }
+
+ if (skb->len > sdu_len) {
+ BT_ERR("Too much LE L2CAP data received");
+ err = -EINVAL;
+ goto failed;
+ }
+
+ if (skb->len == sdu_len)
+ return chan->ops->recv(chan, skb);
+
+ chan->sdu = skb;
+ chan->sdu_len = sdu_len;
+ chan->sdu_last_frag = skb;
+
+ return 0;
+ }
+
+ BT_DBG("SDU fragment. chan->sdu->len %u skb->len %u chan->sdu_len %u",
+ chan->sdu->len, skb->len, chan->sdu_len);
+
+ if (chan->sdu->len + skb->len > chan->sdu_len) {
+ BT_ERR("Too much LE L2CAP data received");
+ err = -EINVAL;
+ goto failed;
+ }
+
+ append_skb_frag(chan->sdu, skb, &chan->sdu_last_frag);
+ skb = NULL;
+
+ if (chan->sdu->len == chan->sdu_len) {
+ err = chan->ops->recv(chan, chan->sdu);
+ if (!err) {
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+ }
+
+failed:
+ if (err) {
+ kfree_skb(skb);
+ kfree_skb(chan->sdu);
+ chan->sdu = NULL;
+ chan->sdu_last_frag = NULL;
+ chan->sdu_len = 0;
+ }
+
+ /* We can't return an error here since we took care of the skb
+ * freeing internally. An error return would cause the caller to
+ * do a double-free of the skb.
+ */
+ return 0;
+}
+
+static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
+ struct sk_buff *skb)
+{
+ struct l2cap_chan *chan;
+
+ chan = l2cap_get_chan_by_scid(conn, cid);
+ if (!chan) {
+ if (cid == L2CAP_CID_A2MP) {
+ chan = a2mp_channel_create(conn, skb);
+ if (!chan) {
+ kfree_skb(skb);
+ return;
+ }
+
+ l2cap_chan_lock(chan);
+ } else {
+ BT_DBG("unknown cid 0x%4.4x", cid);
+ /* Drop packet and return */
+ kfree_skb(skb);
+ return;
+ }
+ }
+
+ BT_DBG("chan %p, len %d", chan, skb->len);
+
+ /* If we receive data on a fixed channel before the info req/rsp
+ * procdure is done simply assume that the channel is supported
+ * and mark it as ready.
+ */
+ if (chan->chan_type == L2CAP_CHAN_FIXED)
+ l2cap_chan_ready(chan);
+
+ if (chan->state != BT_CONNECTED)
+ goto drop;
+
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ if (l2cap_le_data_rcv(chan, skb) < 0)
+ goto drop;
+
+ goto done;
+
+ case L2CAP_MODE_BASIC:
+ /* If socket recv buffers overflows we drop data here
+ * which is *bad* because L2CAP has to be reliable.
+ * But we don't have any other choice. L2CAP doesn't
+ * provide flow control mechanism. */
+
+ if (chan->imtu < skb->len) {
+ BT_ERR("Dropping L2CAP data: receive buffer overflow");
+ goto drop;
+ }
+
+ if (!chan->ops->recv(chan, skb))
+ goto done;
+ break;
+
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ l2cap_data_rcv(chan, skb);
+ goto done;
+
+ default:
+ BT_DBG("chan %p: bad mode 0x%2.2x", chan, chan->mode);
+ break;
+ }
+
+drop:
+ kfree_skb(skb);
+
+done:
+ l2cap_chan_unlock(chan);
+}
+
+static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
+ struct sk_buff *skb)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan;
+
+ if (hcon->type != ACL_LINK)
+ goto free_skb;
+
+ chan = l2cap_global_chan_by_psm(0, psm, &hcon->src, &hcon->dst,
+ ACL_LINK);
+ if (!chan)
+ goto free_skb;
+
+ BT_DBG("chan %p, len %d", chan, skb->len);
+
+ if (chan->state != BT_BOUND && chan->state != BT_CONNECTED)
+ goto drop;
+
+ if (chan->imtu < skb->len)
+ goto drop;
+
+ /* Store remote BD_ADDR and PSM for msg_name */
+ bacpy(&bt_cb(skb)->l2cap.bdaddr, &hcon->dst);
+ bt_cb(skb)->l2cap.psm = psm;
+
+ if (!chan->ops->recv(chan, skb)) {
+ l2cap_chan_put(chan);
+ return;
+ }
+
+drop:
+ l2cap_chan_put(chan);
+free_skb:
+ kfree_skb(skb);
+}
+
+static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_hdr *lh = (void *) skb->data;
+ struct hci_conn *hcon = conn->hcon;
+ u16 cid, len;
+ __le16 psm;
+
+ if (hcon->state != BT_CONNECTED) {
+ BT_DBG("queueing pending rx skb");
+ skb_queue_tail(&conn->pending_rx, skb);
+ return;
+ }
+
+ skb_pull(skb, L2CAP_HDR_SIZE);
+ cid = __le16_to_cpu(lh->cid);
+ len = __le16_to_cpu(lh->len);
+
+ if (len != skb->len) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Since we can't actively block incoming LE connections we must
+ * at least ensure that we ignore incoming data from them.
+ */
+ if (hcon->type == LE_LINK &&
+ hci_bdaddr_list_lookup(&hcon->hdev->blacklist, &hcon->dst,
+ bdaddr_dst_type(hcon))) {
+ kfree_skb(skb);
+ return;
+ }
+
+ BT_DBG("len %d, cid 0x%4.4x", len, cid);
+
+ switch (cid) {
+ case L2CAP_CID_SIGNALING:
+ l2cap_sig_channel(conn, skb);
+ break;
+
+ case L2CAP_CID_CONN_LESS:
+ psm = get_unaligned((__le16 *) skb->data);
+ skb_pull(skb, L2CAP_PSMLEN_SIZE);
+ l2cap_conless_channel(conn, psm, skb);
+ break;
+
+ case L2CAP_CID_LE_SIGNALING:
+ l2cap_le_sig_channel(conn, skb);
+ break;
+
+ default:
+ l2cap_data_channel(conn, cid, skb);
+ break;
+ }
+}
+
+static void process_pending_rx(struct work_struct *work)
+{
+ struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
+ pending_rx_work);
+ struct sk_buff *skb;
+
+ BT_DBG("");
+
+ while ((skb = skb_dequeue(&conn->pending_rx)))
+ l2cap_recv_frame(conn, skb);
+}
+
+static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct hci_chan *hchan;
+
+ if (conn)
+ return conn;
+
+ hchan = hci_chan_create(hcon);
+ if (!hchan)
+ return NULL;
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn) {
+ hci_chan_del(hchan);
+ return NULL;
+ }
+
+ kref_init(&conn->ref);
+ hcon->l2cap_data = conn;
+ conn->hcon = hci_conn_get(hcon);
+ conn->hchan = hchan;
+
+ BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan);
+
+ switch (hcon->type) {
+ case LE_LINK:
+ if (hcon->hdev->le_mtu) {
+ conn->mtu = hcon->hdev->le_mtu;
+ break;
+ }
+ /* fall through */
+ default:
+ conn->mtu = hcon->hdev->acl_mtu;
+ break;
+ }
+
+ conn->feat_mask = 0;
+
+ conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS;
+
+ if (hcon->type == ACL_LINK &&
+ hci_dev_test_flag(hcon->hdev, HCI_HS_ENABLED))
+ conn->local_fixed_chan |= L2CAP_FC_A2MP;
+
+ if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) &&
+ (bredr_sc_enabled(hcon->hdev) ||
+ hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP)))
+ conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR;
+
+ mutex_init(&conn->ident_lock);
+ mutex_init(&conn->chan_lock);
+
+ INIT_LIST_HEAD(&conn->chan_l);
+ INIT_LIST_HEAD(&conn->users);
+
+ INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout);
+
+ skb_queue_head_init(&conn->pending_rx);
+ INIT_WORK(&conn->pending_rx_work, process_pending_rx);
+ INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr);
+
+ conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
+
+ return conn;
+}
+
+static bool is_valid_psm(u16 psm, u8 dst_type) {
+ if (!psm)
+ return false;
+
+ if (bdaddr_type_is_le(dst_type))
+ return (psm <= 0x00ff);
+
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ return ((psm & 0x0101) == 0x0001);
+}
+
+int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
+ bdaddr_t *dst, u8 dst_type)
+{
+ struct l2cap_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst,
+ dst_type, __le16_to_cpu(psm));
+
+ hdev = hci_get_route(dst, &chan->src);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!is_valid_psm(__le16_to_cpu(psm), dst_type) && !cid &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED && !psm) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (chan->chan_type == L2CAP_CHAN_FIXED && !cid) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ break;
+ case L2CAP_MODE_LE_FLOWCTL:
+ l2cap_le_flowctl_init(chan);
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ /* fall through */
+ default:
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ switch (chan->state) {
+ case BT_CONNECT:
+ case BT_CONNECT2:
+ case BT_CONFIG:
+ /* Already connecting */
+ err = 0;
+ goto done;
+
+ case BT_CONNECTED:
+ /* Already connected */
+ err = -EISCONN;
+ goto done;
+
+ case BT_OPEN:
+ case BT_BOUND:
+ /* Can connect */
+ break;
+
+ default:
+ err = -EBADFD;
+ goto done;
+ }
+
+ /* Set destination address and psm */
+ bacpy(&chan->dst, dst);
+ chan->dst_type = dst_type;
+
+ chan->psm = psm;
+ chan->dcid = cid;
+
+ if (bdaddr_type_is_le(dst_type)) {
+ u8 role;
+
+ /* Convert from L2CAP channel address type to HCI address type
+ */
+ if (dst_type == BDADDR_LE_PUBLIC)
+ dst_type = ADDR_LE_DEV_PUBLIC;
+ else
+ dst_type = ADDR_LE_DEV_RANDOM;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ role = HCI_ROLE_SLAVE;
+ else
+ role = HCI_ROLE_MASTER;
+
+ hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level,
+ HCI_LE_CONN_TIMEOUT, role);
+ } else {
+ u8 auth_type = l2cap_get_auth_type(chan);
+ hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type);
+ }
+
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto done;
+ }
+
+ conn = l2cap_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto done;
+ }
+
+ mutex_lock(&conn->chan_lock);
+ l2cap_chan_lock(chan);
+
+ if (cid && __l2cap_get_chan_by_dcid(conn, cid)) {
+ hci_conn_drop(hcon);
+ err = -EBUSY;
+ goto chan_unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&chan->src, &hcon->src);
+ chan->src_type = bdaddr_src_type(hcon);
+
+ __l2cap_chan_add(conn, chan);
+
+ /* l2cap_chan_add takes its own ref so we can drop this one */
+ hci_conn_drop(hcon);
+
+ l2cap_state_change(chan, BT_CONNECT);
+ __set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+ /* Release chan->sport so that it can be reused by other
+ * sockets (as it's only used for listening sockets).
+ */
+ write_lock(&chan_list_lock);
+ chan->sport = 0;
+ write_unlock(&chan_list_lock);
+
+ if (hcon->state == BT_CONNECTED) {
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ __clear_chan_timer(chan);
+ if (l2cap_chan_check_security(chan, true))
+ l2cap_state_change(chan, BT_CONNECTED);
+ } else
+ l2cap_do_start(chan);
+ }
+
+ err = 0;
+
+chan_unlock:
+ l2cap_chan_unlock(chan);
+ mutex_unlock(&conn->chan_lock);
+done:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_connect);
+
+/* ---- L2CAP interface with lower layer (HCI) ---- */
+
+int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+ int exact = 0, lm1 = 0, lm2 = 0;
+ struct l2cap_chan *c;
+
+ BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
+
+ /* Find listening sockets and check their link_mode */
+ read_lock(&chan_list_lock);
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (c->state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&c->src, &hdev->bdaddr)) {
+ lm1 |= HCI_LM_ACCEPT;
+ if (test_bit(FLAG_ROLE_SWITCH, &c->flags))
+ lm1 |= HCI_LM_MASTER;
+ exact++;
+ } else if (!bacmp(&c->src, BDADDR_ANY)) {
+ lm2 |= HCI_LM_ACCEPT;
+ if (test_bit(FLAG_ROLE_SWITCH, &c->flags))
+ lm2 |= HCI_LM_MASTER;
+ }
+ }
+ read_unlock(&chan_list_lock);
+
+ return exact ? lm1 : lm2;
+}
+
+/* Find the next fixed channel in BT_LISTEN state, continue iteration
+ * from an existing channel in the list or from the beginning of the
+ * global list (by passing NULL as first parameter).
+ */
+static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
+ struct hci_conn *hcon)
+{
+ u8 src_type = bdaddr_src_type(hcon);
+
+ read_lock(&chan_list_lock);
+
+ if (c)
+ c = list_next_entry(c, global_l);
+ else
+ c = list_entry(chan_list.next, typeof(*c), global_l);
+
+ list_for_each_entry_from(c, &chan_list, global_l) {
+ if (c->chan_type != L2CAP_CHAN_FIXED)
+ continue;
+ if (c->state != BT_LISTEN)
+ continue;
+ if (bacmp(&c->src, &hcon->src) && bacmp(&c->src, BDADDR_ANY))
+ continue;
+ if (src_type != c->src_type)
+ continue;
+
+ l2cap_chan_hold(c);
+ read_unlock(&chan_list_lock);
+ return c;
+ }
+
+ read_unlock(&chan_list_lock);
+
+ return NULL;
+}
+
+static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
+{
+ struct hci_dev *hdev = hcon->hdev;
+ struct l2cap_conn *conn;
+ struct l2cap_chan *pchan;
+ u8 dst_type;
+
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
+ BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+ if (status) {
+ l2cap_conn_del(hcon, bt_to_errno(status));
+ return;
+ }
+
+ conn = l2cap_conn_add(hcon);
+ if (!conn)
+ return;
+
+ dst_type = bdaddr_dst_type(hcon);
+
+ /* If device is blocked, do not create channels for it */
+ if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type))
+ return;
+
+ /* Find fixed channels and notify them of the new connection. We
+ * use multiple individual lookups, continuing each time where
+ * we left off, because the list lock would prevent calling the
+ * potentially sleeping l2cap_chan_lock() function.
+ */
+ pchan = l2cap_global_fixed_chan(NULL, hcon);
+ while (pchan) {
+ struct l2cap_chan *chan, *next;
+
+ /* Client fixed channels should override server ones */
+ if (__l2cap_get_chan_by_dcid(conn, pchan->scid))
+ goto next;
+
+ l2cap_chan_lock(pchan);
+ chan = pchan->ops->new_connection(pchan);
+ if (chan) {
+ bacpy(&chan->src, &hcon->src);
+ bacpy(&chan->dst, &hcon->dst);
+ chan->src_type = bdaddr_src_type(hcon);
+ chan->dst_type = dst_type;
+
+ __l2cap_chan_add(conn, chan);
+ }
+
+ l2cap_chan_unlock(pchan);
+next:
+ next = l2cap_global_fixed_chan(pchan, hcon);
+ l2cap_chan_put(pchan);
+ pchan = next;
+ }
+
+ l2cap_conn_ready(conn);
+}
+
+int l2cap_disconn_ind(struct hci_conn *hcon)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+
+ BT_DBG("hcon %p", hcon);
+
+ if (!conn)
+ return HCI_ERROR_REMOTE_USER_TERM;
+ return conn->disc_reason;
+}
+
+static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
+{
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
+ BT_DBG("hcon %p reason %d", hcon, reason);
+
+ l2cap_conn_del(hcon, bt_to_errno(reason));
+}
+
+static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt)
+{
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED)
+ return;
+
+ if (encrypt == 0x00) {
+ if (chan->sec_level == BT_SECURITY_MEDIUM) {
+ __set_chan_timer(chan, L2CAP_ENC_TIMEOUT);
+ } else if (chan->sec_level == BT_SECURITY_HIGH ||
+ chan->sec_level == BT_SECURITY_FIPS)
+ l2cap_chan_close(chan, ECONNREFUSED);
+ } else {
+ if (chan->sec_level == BT_SECURITY_MEDIUM)
+ __clear_chan_timer(chan);
+ }
+}
+
+static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+
+ if (!conn)
+ return;
+
+ BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt);
+
+ mutex_lock(&conn->chan_lock);
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ l2cap_chan_lock(chan);
+
+ BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid,
+ state_to_string(chan->state));
+
+ if (chan->scid == L2CAP_CID_A2MP) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (!status && encrypt)
+ chan->sec_level = hcon->sec_level;
+
+ if (!__l2cap_no_conn_pending(chan)) {
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (!status && (chan->state == BT_CONNECTED ||
+ chan->state == BT_CONFIG)) {
+ chan->ops->resume(chan);
+ l2cap_check_encryption(chan, encrypt);
+ l2cap_chan_unlock(chan);
+ continue;
+ }
+
+ if (chan->state == BT_CONNECT) {
+ if (!status)
+ l2cap_start_connection(chan);
+ else
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
+ } else if (chan->state == BT_CONNECT2 &&
+ chan->mode != L2CAP_MODE_LE_FLOWCTL) {
+ struct l2cap_conn_rsp rsp;
+ __u16 res, stat;
+
+ if (!status) {
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+ res = L2CAP_CR_PEND;
+ stat = L2CAP_CS_AUTHOR_PEND;
+ chan->ops->defer(chan);
+ } else {
+ l2cap_state_change(chan, BT_CONFIG);
+ res = L2CAP_CR_SUCCESS;
+ stat = L2CAP_CS_NO_INFO;
+ }
+ } else {
+ l2cap_state_change(chan, BT_DISCONN);
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
+ res = L2CAP_CR_SEC_BLOCK;
+ stat = L2CAP_CS_NO_INFO;
+ }
+
+ rsp.scid = cpu_to_le16(chan->dcid);
+ rsp.dcid = cpu_to_le16(chan->scid);
+ rsp.result = cpu_to_le16(res);
+ rsp.status = cpu_to_le16(stat);
+ l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+ sizeof(rsp), &rsp);
+
+ if (!test_bit(CONF_REQ_SENT, &chan->conf_state) &&
+ res == L2CAP_CR_SUCCESS) {
+ char buf[128];
+ set_bit(CONF_REQ_SENT, &chan->conf_state);
+ l2cap_send_cmd(conn, l2cap_get_ident(conn),
+ L2CAP_CONF_REQ,
+ l2cap_build_conf_req(chan, buf),
+ buf);
+ chan->num_conf_req++;
+ }
+ }
+
+ l2cap_chan_unlock(chan);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+}
+
+int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_hdr *hdr;
+ int len;
+
+ /* For AMP controller do not create l2cap conn */
+ if (!conn && hcon->hdev->dev_type != HCI_BREDR)
+ goto drop;
+
+ if (!conn)
+ conn = l2cap_conn_add(hcon);
+
+ if (!conn)
+ goto drop;
+
+ BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags);
+
+ switch (flags) {
+ case ACL_START:
+ case ACL_START_NO_FLUSH:
+ case ACL_COMPLETE:
+ if (conn->rx_len) {
+ BT_ERR("Unexpected start frame (len %d)", skb->len);
+ kfree_skb(conn->rx_skb);
+ conn->rx_skb = NULL;
+ conn->rx_len = 0;
+ l2cap_conn_unreliable(conn, ECOMM);
+ }
+
+ /* Start fragment always begin with Basic L2CAP header */
+ if (skb->len < L2CAP_HDR_SIZE) {
+ BT_ERR("Frame is too short (len %d)", skb->len);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ hdr = (struct l2cap_hdr *) skb->data;
+ len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
+
+ if (len == skb->len) {
+ /* Complete frame received */
+ l2cap_recv_frame(conn, skb);
+ return 0;
+ }
+
+ BT_DBG("Start: total len %d, frag len %d", len, skb->len);
+
+ if (skb->len > len) {
+ BT_ERR("Frame is too long (len %d, expected len %d)",
+ skb->len, len);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ /* Allocate skb for the complete frame (with header) */
+ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+ if (!conn->rx_skb)
+ goto drop;
+
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+ skb->len);
+ conn->rx_len = len - skb->len;
+ break;
+
+ case ACL_CONT:
+ BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
+
+ if (!conn->rx_len) {
+ BT_ERR("Unexpected continuation frame (len %d)", skb->len);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ if (skb->len > conn->rx_len) {
+ BT_ERR("Fragment is too long (len %d, expected %d)",
+ skb->len, conn->rx_len);
+ kfree_skb(conn->rx_skb);
+ conn->rx_skb = NULL;
+ conn->rx_len = 0;
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+ skb->len);
+ conn->rx_len -= skb->len;
+
+ if (!conn->rx_len) {
+ /* Complete frame received. l2cap_recv_frame
+ * takes ownership of the skb so set the global
+ * rx_skb pointer to NULL first.
+ */
+ struct sk_buff *rx_skb = conn->rx_skb;
+ conn->rx_skb = NULL;
+ l2cap_recv_frame(conn, rx_skb);
+ }
+ break;
+ }
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct hci_cb l2cap_cb = {
+ .name = "L2CAP",
+ .connect_cfm = l2cap_connect_cfm,
+ .disconn_cfm = l2cap_disconn_cfm,
+ .security_cfm = l2cap_security_cfm,
+};
+
+static int l2cap_debugfs_show(struct seq_file *f, void *p)
+{
+ struct l2cap_chan *c;
+
+ read_lock(&chan_list_lock);
+
+ list_for_each_entry(c, &chan_list, global_l) {
+ seq_printf(f, "%pMR (%u) %pMR (%u) %d %d 0x%4.4x 0x%4.4x %d %d %d %d\n",
+ &c->src, c->src_type, &c->dst, c->dst_type,
+ c->state, __le16_to_cpu(c->psm),
+ c->scid, c->dcid, c->imtu, c->omtu,
+ c->sec_level, c->mode);
+ }
+
+ read_unlock(&chan_list_lock);
+
+ return 0;
+}
+
+static int l2cap_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, l2cap_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations l2cap_debugfs_fops = {
+ .open = l2cap_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *l2cap_debugfs;
+
+int __init l2cap_init(void)
+{
+ int err;
+
+ err = l2cap_init_sockets();
+ if (err < 0)
+ return err;
+
+ hci_register_cb(&l2cap_cb);
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
+ NULL, &l2cap_debugfs_fops);
+
+ debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
+ &le_max_credits);
+ debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs,
+ &le_default_mps);
+
+ return 0;
+}
+
+void l2cap_exit(void)
+{
+ debugfs_remove(l2cap_debugfs);
+ hci_unregister_cb(&l2cap_cb);
+ l2cap_cleanup_sockets();
+}
+
+module_param(disable_ertm, bool, 0644);
+MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
new file mode 100644
index 000000000..a7278f05e
--- /dev/null
+++ b/net/bluetooth/l2cap_sock.c
@@ -0,0 +1,1650 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+ Copyright (C) 2010 Google Inc.
+ Copyright (C) 2011 ProFUSION Embedded Systems
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP sockets. */
+
+#include <linux/module.h>
+#include <linux/export.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "smp.h"
+
+static struct bt_sock_list l2cap_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(l2cap_sk_list.lock)
+};
+
+static const struct proto_ops l2cap_sock_ops;
+static void l2cap_sock_init(struct sock *sk, struct sock *parent);
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio);
+
+bool l2cap_is_socket(struct socket *sock)
+{
+ return sock && sock->ops == &l2cap_sock_ops;
+}
+EXPORT_SYMBOL(l2cap_is_socket);
+
+static int l2cap_validate_bredr_psm(u16 psm)
+{
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ if ((psm & 0x0101) != 0x0001)
+ return -EINVAL;
+
+ /* Restrict usage of well-known PSMs */
+ if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ return 0;
+}
+
+static int l2cap_validate_le_psm(u16 psm)
+{
+ /* Valid LE_PSM ranges are defined only until 0x00ff */
+ if (psm > 0x00ff)
+ return -EINVAL;
+
+ /* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */
+ if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ return 0;
+}
+
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockaddr_l2 la;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (!addr || addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid && la.l2_psm)
+ return -EINVAL;
+
+ if (!bdaddr_type_is_valid(la.l2_bdaddr_type))
+ return -EINVAL;
+
+ if (bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* We only allow ATT user space socket */
+ if (la.l2_cid &&
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (la.l2_psm) {
+ __u16 psm = __le16_to_cpu(la.l2_psm);
+
+ if (la.l2_bdaddr_type == BDADDR_BREDR)
+ err = l2cap_validate_bredr_psm(psm);
+ else
+ err = l2cap_validate_le_psm(psm);
+
+ if (err)
+ goto done;
+ }
+
+ if (la.l2_cid)
+ err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid));
+ else
+ err = l2cap_add_psm(chan, &la.l2_bdaddr, la.l2_psm);
+
+ if (err < 0)
+ goto done;
+
+ switch (chan->chan_type) {
+ case L2CAP_CHAN_CONN_LESS:
+ if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_3DSP)
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_CONN_ORIENTED:
+ if (__le16_to_cpu(la.l2_psm) == L2CAP_PSM_SDP ||
+ __le16_to_cpu(la.l2_psm) == L2CAP_PSM_RFCOMM)
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_RAW:
+ chan->sec_level = BT_SECURITY_SDP;
+ break;
+ case L2CAP_CHAN_FIXED:
+ /* Fixed channels default to the L2CAP core not holding a
+ * hci_conn reference for them. For fixed channels mapping to
+ * L2CAP sockets we do want to hold a reference so set the
+ * appropriate flag to request it.
+ */
+ set_bit(FLAG_HOLD_HCI_CONN, &chan->flags);
+ break;
+ }
+
+ bacpy(&chan->src, &la.l2_bdaddr);
+ chan->src_type = la.l2_bdaddr_type;
+
+ if (chan->psm && bdaddr_type_is_le(chan->src_type))
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+
+ chan->state = BT_BOUND;
+ sk->sk_state = BT_BOUND;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockaddr_l2 la;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (!addr || alen < sizeof(addr->sa_family) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid && la.l2_psm)
+ return -EINVAL;
+
+ if (!bdaddr_type_is_valid(la.l2_bdaddr_type))
+ return -EINVAL;
+
+ /* Check that the socket wasn't bound to something that
+ * conflicts with the address given to connect(). If chan->src
+ * is BDADDR_ANY it means bind() was never used, in which case
+ * chan->src_type and la.l2_bdaddr_type do not need to match.
+ */
+ if (chan->src_type == BDADDR_BREDR && bacmp(&chan->src, BDADDR_ANY) &&
+ bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* Old user space versions will try to incorrectly bind
+ * the ATT socket using BDADDR_BREDR. We need to accept
+ * this and fix up the source address type only when
+ * both the source CID and destination CID indicate
+ * ATT. Anything else is an invalid combination.
+ */
+ if (chan->scid != L2CAP_CID_ATT ||
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+
+ /* We don't have the hdev available here to make a
+ * better decision on random vs public, but since all
+ * user space versions that exhibit this issue anyway do
+ * not support random local addresses assuming public
+ * here is good enough.
+ */
+ chan->src_type = BDADDR_LE_PUBLIC;
+ }
+
+ if (chan->src_type != BDADDR_BREDR && la.l2_bdaddr_type == BDADDR_BREDR)
+ return -EINVAL;
+
+ if (bdaddr_type_is_le(la.l2_bdaddr_type)) {
+ /* We only allow ATT user space socket */
+ if (la.l2_cid &&
+ la.l2_cid != cpu_to_le16(L2CAP_CID_ATT))
+ return -EINVAL;
+ }
+
+ if (chan->psm && bdaddr_type_is_le(chan->src_type))
+ chan->mode = L2CAP_MODE_LE_FLOWCTL;
+
+ err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
+ &la.l2_bdaddr, la.l2_bdaddr_type);
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+ release_sock(sk);
+
+ return err;
+}
+
+static int l2cap_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ int err = 0;
+
+ BT_DBG("sk %p backlog %d", sk, backlog);
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ case L2CAP_MODE_LE_FLOWCTL:
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ /* fall through */
+ default:
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+
+ /* Listening channels need to use nested locking in order not to
+ * cause lockdep warnings when the created child channels end up
+ * being locked in the same thread as the parent channel.
+ */
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
+
+ chan->state = BT_LISTEN;
+ sk->sk_state = BT_LISTEN;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
+ int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = sock->sk, *nsk;
+ long timeo;
+ int err = 0;
+
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ BT_DBG("sk %p timeo %ld", sk, timeo);
+
+ /* Wait for an incoming connection. (wake-one). */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ nsk = bt_accept_dequeue(sk, newsock);
+ if (nsk)
+ break;
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+ BT_DBG("new socket %p", nsk);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
+{
+ struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (peer && sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2 &&
+ sk->sk_state != BT_CONFIG)
+ return -ENOTCONN;
+
+ memset(la, 0, sizeof(struct sockaddr_l2));
+ addr->sa_family = AF_BLUETOOTH;
+ *len = sizeof(struct sockaddr_l2);
+
+ la->l2_psm = chan->psm;
+
+ if (peer) {
+ bacpy(&la->l2_bdaddr, &chan->dst);
+ la->l2_cid = cpu_to_le16(chan->dcid);
+ la->l2_bdaddr_type = chan->dst_type;
+ } else {
+ bacpy(&la->l2_bdaddr, &chan->src);
+ la->l2_cid = cpu_to_le16(chan->scid);
+ la->l2_bdaddr_type = chan->src_type;
+ }
+
+ return 0;
+}
+
+static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct l2cap_options opts;
+ struct l2cap_conninfo cinfo;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case L2CAP_OPTIONS:
+ /* LE sockets should use BT_SNDMTU/BT_RCVMTU, but since
+ * legacy ATT code depends on getsockopt for
+ * L2CAP_OPTIONS we need to let this pass.
+ */
+ if (bdaddr_type_is_le(chan->src_type) &&
+ chan->scid != L2CAP_CID_ATT) {
+ err = -EINVAL;
+ break;
+ }
+
+ memset(&opts, 0, sizeof(opts));
+ opts.imtu = chan->imtu;
+ opts.omtu = chan->omtu;
+ opts.flush_to = chan->flush_to;
+ opts.mode = chan->mode;
+ opts.fcs = chan->fcs;
+ opts.max_tx = chan->max_tx;
+ opts.txwin_size = chan->tx_win;
+
+ len = min_t(unsigned int, len, sizeof(opts));
+ if (copy_to_user(optval, (char *) &opts, len))
+ err = -EFAULT;
+
+ break;
+
+ case L2CAP_LM:
+ switch (chan->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = L2CAP_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE;
+ break;
+ case BT_SECURITY_FIPS:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE | L2CAP_LM_FIPS;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (test_bit(FLAG_ROLE_SWITCH, &chan->flags))
+ opt |= L2CAP_LM_MASTER;
+
+ if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
+ opt |= L2CAP_LM_RELIABLE;
+
+ if (put_user(opt, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case L2CAP_CONNINFO:
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.hci_handle = chan->conn->hcon->handle;
+ memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3);
+
+ len = min_t(unsigned int, len, sizeof(cinfo));
+ if (copy_to_user(optval, (char *) &cinfo, len))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct bt_security sec;
+ struct bt_power pwr;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_FIXED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ memset(&sec, 0, sizeof(sec));
+ if (chan->conn) {
+ sec.level = chan->conn->hcon->sec_level;
+
+ if (sk->sk_state == BT_CONNECTED)
+ sec.key_size = chan->conn->hcon->enc_key_size;
+ } else {
+ sec.level = chan->sec_level;
+ }
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_FLUSHABLE:
+ if (put_user(test_bit(FLAG_FLUSHABLE, &chan->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_POWER:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_STREAM
+ && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ pwr.force_active = test_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+
+ len = min_t(unsigned int, len, sizeof(pwr));
+ if (copy_to_user(optval, (char *) &pwr, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_CHANNEL_POLICY:
+ if (put_user(chan->chan_policy, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_SNDMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (put_user(chan->omtu, (u16 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_RCVMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(chan->imtu, (u16 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu)
+{
+ switch (chan->scid) {
+ case L2CAP_CID_ATT:
+ if (mtu < L2CAP_LE_MIN_MTU)
+ return false;
+ break;
+
+ default:
+ if (mtu < L2CAP_DEFAULT_MIN_MTU)
+ return false;
+ }
+
+ return true;
+}
+
+static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct l2cap_options opts;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case L2CAP_OPTIONS:
+ if (bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (sk->sk_state == BT_CONNECTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ opts.imtu = chan->imtu;
+ opts.omtu = chan->omtu;
+ opts.flush_to = chan->flush_to;
+ opts.mode = chan->mode;
+ opts.fcs = chan->fcs;
+ opts.max_tx = chan->max_tx;
+ opts.txwin_size = chan->tx_win;
+
+ len = min_t(unsigned int, sizeof(opts), optlen);
+ if (copy_from_user((char *) &opts, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (!l2cap_valid_mtu(chan, opts.imtu)) {
+ err = -EINVAL;
+ break;
+ }
+
+ chan->mode = opts.mode;
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ break;
+ case L2CAP_MODE_BASIC:
+ clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+ break;
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ if (!disable_ertm)
+ break;
+ /* fall through */
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ chan->imtu = opts.imtu;
+ chan->omtu = opts.omtu;
+ chan->fcs = opts.fcs;
+ chan->max_tx = opts.max_tx;
+ chan->tx_win = opts.txwin_size;
+ chan->flush_to = opts.flush_to;
+ break;
+
+ case L2CAP_LM:
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt & L2CAP_LM_FIPS) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt & L2CAP_LM_AUTH)
+ chan->sec_level = BT_SECURITY_LOW;
+ if (opt & L2CAP_LM_ENCRYPT)
+ chan->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & L2CAP_LM_SECURE)
+ chan->sec_level = BT_SECURITY_HIGH;
+
+ if (opt & L2CAP_LM_MASTER)
+ set_bit(FLAG_ROLE_SWITCH, &chan->flags);
+ else
+ clear_bit(FLAG_ROLE_SWITCH, &chan->flags);
+
+ if (opt & L2CAP_LM_RELIABLE)
+ set_bit(FLAG_FORCE_RELIABLE, &chan->flags);
+ else
+ clear_bit(FLAG_FORCE_RELIABLE, &chan->flags);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct bt_security sec;
+ struct bt_power pwr;
+ struct l2cap_conn *conn;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_FIXED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level < BT_SECURITY_LOW ||
+ sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ chan->sec_level = sec.level;
+
+ if (!chan->conn)
+ break;
+
+ conn = chan->conn;
+
+ /*change security for LE channels */
+ if (chan->scid == L2CAP_CID_ATT) {
+ if (smp_conn_security(conn->hcon, sec.level))
+ break;
+ set_bit(FLAG_PENDING_SECURITY, &chan->flags);
+ sk->sk_state = BT_CONFIG;
+ chan->state = BT_CONFIG;
+
+ /* or for ACL link */
+ } else if ((sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) ||
+ sk->sk_state == BT_CONNECTED) {
+ if (!l2cap_chan_check_security(chan, true))
+ set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ else
+ sk->sk_state_change(sk);
+ } else {
+ err = -EINVAL;
+ }
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt) {
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ set_bit(FLAG_DEFER_SETUP, &chan->flags);
+ } else {
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ clear_bit(FLAG_DEFER_SETUP, &chan->flags);
+ }
+ break;
+
+ case BT_FLUSHABLE:
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt > BT_FLUSHABLE_ON) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt == BT_FLUSHABLE_OFF) {
+ conn = chan->conn;
+ /* proceed further only when we have l2cap_conn and
+ No Flush support in the LM */
+ if (!conn || !lmp_no_flush_capable(conn->hcon->hdev)) {
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ if (opt)
+ set_bit(FLAG_FLUSHABLE, &chan->flags);
+ else
+ clear_bit(FLAG_FLUSHABLE, &chan->flags);
+ break;
+
+ case BT_POWER:
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+ chan->chan_type != L2CAP_CHAN_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ pwr.force_active = BT_POWER_FORCE_ACTIVE_ON;
+
+ len = min_t(unsigned int, sizeof(pwr), optlen);
+ if (copy_from_user((char *) &pwr, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (pwr.force_active)
+ set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ else
+ clear_bit(FLAG_FORCE_ACTIVE, &chan->flags);
+ break;
+
+ case BT_CHANNEL_POLICY:
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt > BT_CHANNEL_POLICY_AMP_PREFERRED) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (chan->mode != L2CAP_MODE_ERTM &&
+ chan->mode != L2CAP_MODE_STREAMING) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ chan->chan_policy = (u8) opt;
+
+ if (sk->sk_state == BT_CONNECTED &&
+ chan->move_role == L2CAP_MOVE_ROLE_NONE)
+ l2cap_move_start(chan);
+
+ break;
+
+ case BT_SNDMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Setting is not supported as it's the remote side that
+ * decides this.
+ */
+ err = -EPERM;
+ break;
+
+ case BT_RCVMTU:
+ if (!bdaddr_type_is_le(chan->src_type)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (sk->sk_state == BT_CONNECTED) {
+ err = -EISCONN;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ chan->imtu = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (sk->sk_state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ lock_sock(sk);
+ err = bt_sock_wait_ready(sk, msg->msg_flags);
+ release_sock(sk);
+ if (err)
+ return err;
+
+ l2cap_chan_lock(chan);
+ err = l2cap_chan_send(chan, msg, len);
+ l2cap_chan_unlock(chan);
+
+ return err;
+}
+
+static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_pinfo *pi = l2cap_pi(sk);
+ int err;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP,
+ &bt_sk(sk)->flags)) {
+ if (bdaddr_type_is_le(pi->chan->src_type)) {
+ sk->sk_state = BT_CONNECTED;
+ pi->chan->state = BT_CONNECTED;
+ __l2cap_le_connect_rsp_defer(pi->chan);
+ } else {
+ sk->sk_state = BT_CONFIG;
+ pi->chan->state = BT_CONFIG;
+ __l2cap_connect_rsp_defer(pi->chan);
+ }
+
+ err = 0;
+ goto done;
+ }
+
+ release_sock(sk);
+
+ if (sock->type == SOCK_STREAM)
+ err = bt_sock_stream_recvmsg(sock, msg, len, flags);
+ else
+ err = bt_sock_recvmsg(sock, msg, len, flags);
+
+ if (pi->chan->mode != L2CAP_MODE_ERTM)
+ return err;
+
+ /* Attempt to put pending rx data in the socket buffer */
+
+ lock_sock(sk);
+
+ if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state))
+ goto done;
+
+ if (pi->rx_busy_skb) {
+ if (!sock_queue_rcv_skb(sk, pi->rx_busy_skb))
+ pi->rx_busy_skb = NULL;
+ else
+ goto done;
+ }
+
+ /* Restore data flow when half of the receive buffer is
+ * available. This avoids resending large numbers of
+ * frames.
+ */
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
+ l2cap_chan_busy(pi->chan, 0);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void l2cap_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ return;
+
+ BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state));
+
+ /* Kill poor orphan */
+
+ l2cap_chan_put(l2cap_pi(sk)->chan);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static int __l2cap_wait_ack(struct sock *sk)
+{
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+ int timeo = HZ/5;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (chan->unacked_frames > 0 && chan->conn) {
+ if (!timeo)
+ timeo = HZ/5;
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return err;
+}
+
+static int l2cap_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct l2cap_chan *chan;
+ struct l2cap_conn *conn;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ chan = l2cap_pi(sk)->chan;
+ conn = chan->conn;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ if (conn)
+ mutex_lock(&conn->chan_lock);
+
+ l2cap_chan_lock(chan);
+ lock_sock(sk);
+
+ if (!sk->sk_shutdown) {
+ if (chan->mode == L2CAP_MODE_ERTM)
+ err = __l2cap_wait_ack(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ release_sock(sk);
+ l2cap_chan_close(chan, 0);
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED,
+ sk->sk_lingertime);
+ }
+
+ if (!err && sk->sk_err)
+ err = -sk->sk_err;
+
+ release_sock(sk);
+ l2cap_chan_unlock(chan);
+
+ if (conn)
+ mutex_unlock(&conn->chan_lock);
+
+ return err;
+}
+
+static int l2cap_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ bt_sock_unlink(&l2cap_sk_list, sk);
+
+ err = l2cap_sock_shutdown(sock, 2);
+
+ sock_orphan(sk);
+ l2cap_sock_kill(sk);
+ return err;
+}
+
+static void l2cap_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p state %s", parent,
+ state_to_string(parent->sk_state));
+
+ /* Close not yet accepted channels */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("child chan %p state %s", chan,
+ state_to_string(chan->state));
+
+ l2cap_chan_lock(chan);
+ __clear_chan_timer(chan);
+ l2cap_chan_close(chan, ECONNRESET);
+ l2cap_chan_unlock(chan);
+
+ l2cap_sock_kill(sk);
+ }
+}
+
+static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk, *parent = chan->data;
+
+ lock_sock(parent);
+
+ /* Check for backlog size */
+ if (sk_acceptq_is_full(parent)) {
+ BT_DBG("backlog full %d", parent->sk_ack_backlog);
+ release_sock(parent);
+ return NULL;
+ }
+
+ sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
+ GFP_ATOMIC);
+ if (!sk) {
+ release_sock(parent);
+ return NULL;
+ }
+
+ bt_sock_reclassify_lock(sk, BTPROTO_L2CAP);
+
+ l2cap_sock_init(sk, parent);
+
+ bt_accept_enqueue(parent, sk);
+
+ release_sock(parent);
+
+ return l2cap_pi(sk)->chan;
+}
+
+static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct sock *sk = chan->data;
+ int err;
+
+ lock_sock(sk);
+
+ if (l2cap_pi(sk)->rx_busy_skb) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ err = sock_queue_rcv_skb(sk, skb);
+
+ /* For ERTM, handle one skb that doesn't fit into the recv
+ * buffer. This is important to do because the data frames
+ * have already been acked, so the skb cannot be discarded.
+ *
+ * Notify the l2cap core that the buffer is full, so the
+ * LOCAL_BUSY state is entered and no more frames are
+ * acked and reassembled until there is buffer space
+ * available.
+ */
+ if (err < 0 && chan->mode == L2CAP_MODE_ERTM) {
+ l2cap_pi(sk)->rx_busy_skb = skb;
+ l2cap_chan_busy(chan, 1);
+ err = 0;
+ }
+
+done:
+ release_sock(sk);
+
+ return err;
+}
+
+static void l2cap_sock_close_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ l2cap_sock_kill(sk);
+}
+
+static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
+{
+ struct sock *sk = chan->data;
+ struct sock *parent;
+
+ BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
+
+ /* This callback can be called both for server (BT_LISTEN)
+ * sockets as well as "normal" ones. To avoid lockdep warnings
+ * with child socket locking (through l2cap_sock_cleanup_listen)
+ * we need separation into separate nesting levels. The simplest
+ * way to accomplish this is to inherit the nesting level used
+ * for the channel.
+ */
+ lock_sock_nested(sk, atomic_read(&chan->nesting));
+
+ parent = bt_sk(sk)->parent;
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ switch (chan->state) {
+ case BT_OPEN:
+ case BT_BOUND:
+ case BT_CLOSED:
+ break;
+ case BT_LISTEN:
+ l2cap_sock_cleanup_listen(sk);
+ sk->sk_state = BT_CLOSED;
+ chan->state = BT_CLOSED;
+
+ break;
+ default:
+ sk->sk_state = BT_CLOSED;
+ chan->state = BT_CLOSED;
+
+ sk->sk_err = err;
+
+ if (parent) {
+ bt_accept_unlink(sk);
+ parent->sk_data_ready(parent);
+ } else {
+ sk->sk_state_change(sk);
+ }
+
+ break;
+ }
+
+ release_sock(sk);
+}
+
+static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
+ int err)
+{
+ struct sock *sk = chan->data;
+
+ sk->sk_state = state;
+
+ if (err)
+ sk->sk_err = err;
+}
+
+static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sock *sk = chan->data;
+ struct sk_buff *skb;
+ int err;
+
+ l2cap_chan_unlock(chan);
+ skb = bt_skb_send_alloc(sk, hdr_len + len, nb, &err);
+ l2cap_chan_lock(chan);
+
+ if (!skb)
+ return ERR_PTR(err);
+
+ skb->priority = sk->sk_priority;
+
+ bt_cb(skb)->l2cap.chan = chan;
+
+ return skb;
+}
+
+static void l2cap_sock_ready_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+ struct sock *parent;
+
+ lock_sock(sk);
+
+ parent = bt_sk(sk)->parent;
+
+ BT_DBG("sk %p, parent %p", sk, parent);
+
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+
+ if (parent)
+ parent->sk_data_ready(parent);
+
+ release_sock(sk);
+}
+
+static void l2cap_sock_defer_cb(struct l2cap_chan *chan)
+{
+ struct sock *parent, *sk = chan->data;
+
+ lock_sock(sk);
+
+ parent = bt_sk(sk)->parent;
+ if (parent)
+ parent->sk_data_ready(parent);
+
+ release_sock(sk);
+}
+
+static void l2cap_sock_resume_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) {
+ sk->sk_state = BT_CONNECTED;
+ chan->state = BT_CONNECTED;
+ }
+
+ clear_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ sk->sk_state_change(sk);
+}
+
+static void l2cap_sock_set_shutdown_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ release_sock(sk);
+}
+
+static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ return sk->sk_sndtimeo;
+}
+
+static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ set_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags);
+ sk->sk_state_change(sk);
+}
+
+static const struct l2cap_ops l2cap_chan_ops = {
+ .name = "L2CAP Socket Interface",
+ .new_connection = l2cap_sock_new_connection_cb,
+ .recv = l2cap_sock_recv_cb,
+ .close = l2cap_sock_close_cb,
+ .teardown = l2cap_sock_teardown_cb,
+ .state_change = l2cap_sock_state_change_cb,
+ .ready = l2cap_sock_ready_cb,
+ .defer = l2cap_sock_defer_cb,
+ .resume = l2cap_sock_resume_cb,
+ .suspend = l2cap_sock_suspend_cb,
+ .set_shutdown = l2cap_sock_set_shutdown_cb,
+ .get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
+ .alloc_skb = l2cap_sock_alloc_skb_cb,
+};
+
+static void l2cap_sock_destruct(struct sock *sk)
+{
+ BT_DBG("sk %p", sk);
+
+ if (l2cap_pi(sk)->chan)
+ l2cap_chan_put(l2cap_pi(sk)->chan);
+
+ if (l2cap_pi(sk)->rx_busy_skb) {
+ kfree_skb(l2cap_pi(sk)->rx_busy_skb);
+ l2cap_pi(sk)->rx_busy_skb = NULL;
+ }
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+}
+
+static void l2cap_skb_msg_name(struct sk_buff *skb, void *msg_name,
+ int *msg_namelen)
+{
+ DECLARE_SOCKADDR(struct sockaddr_l2 *, la, msg_name);
+
+ memset(la, 0, sizeof(struct sockaddr_l2));
+ la->l2_family = AF_BLUETOOTH;
+ la->l2_psm = bt_cb(skb)->l2cap.psm;
+ bacpy(&la->l2_bdaddr, &bt_cb(skb)->l2cap.bdaddr);
+
+ *msg_namelen = sizeof(struct sockaddr_l2);
+}
+
+static void l2cap_sock_init(struct sock *sk, struct sock *parent)
+{
+ struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+ BT_DBG("sk %p", sk);
+
+ if (parent) {
+ struct l2cap_chan *pchan = l2cap_pi(parent)->chan;
+
+ sk->sk_type = parent->sk_type;
+ bt_sk(sk)->flags = bt_sk(parent)->flags;
+
+ chan->chan_type = pchan->chan_type;
+ chan->imtu = pchan->imtu;
+ chan->omtu = pchan->omtu;
+ chan->conf_state = pchan->conf_state;
+ chan->mode = pchan->mode;
+ chan->fcs = pchan->fcs;
+ chan->max_tx = pchan->max_tx;
+ chan->tx_win = pchan->tx_win;
+ chan->tx_win_max = pchan->tx_win_max;
+ chan->sec_level = pchan->sec_level;
+ chan->flags = pchan->flags;
+ chan->tx_credits = pchan->tx_credits;
+ chan->rx_credits = pchan->rx_credits;
+
+ if (chan->chan_type == L2CAP_CHAN_FIXED) {
+ chan->scid = pchan->scid;
+ chan->dcid = pchan->scid;
+ }
+
+ security_sk_clone(parent, sk);
+ } else {
+ switch (sk->sk_type) {
+ case SOCK_RAW:
+ chan->chan_type = L2CAP_CHAN_RAW;
+ break;
+ case SOCK_DGRAM:
+ chan->chan_type = L2CAP_CHAN_CONN_LESS;
+ bt_sk(sk)->skb_msg_name = l2cap_skb_msg_name;
+ break;
+ case SOCK_SEQPACKET:
+ case SOCK_STREAM:
+ chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
+ break;
+ }
+
+ chan->imtu = L2CAP_DEFAULT_MTU;
+ chan->omtu = 0;
+ if (!disable_ertm && sk->sk_type == SOCK_STREAM) {
+ chan->mode = L2CAP_MODE_ERTM;
+ set_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+ } else {
+ chan->mode = L2CAP_MODE_BASIC;
+ }
+
+ l2cap_chan_set_defaults(chan);
+ }
+
+ /* Default config options */
+ chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+
+ chan->data = sk;
+ chan->ops = &l2cap_chan_ops;
+}
+
+static struct proto l2cap_proto = {
+ .name = "L2CAP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct l2cap_pinfo)
+};
+
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio)
+{
+ struct sock *sk;
+ struct l2cap_chan *chan;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+ sk->sk_destruct = l2cap_sock_destruct;
+ sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = BT_OPEN;
+
+ chan = l2cap_chan_create();
+ if (!chan) {
+ sk_free(sk);
+ return NULL;
+ }
+
+ l2cap_chan_hold(chan);
+
+ l2cap_pi(sk)->chan = chan;
+
+ return sk;
+}
+
+static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM &&
+ sock->type != SOCK_DGRAM && sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ sock->ops = &l2cap_sock_ops;
+
+ sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ if (!sk)
+ return -ENOMEM;
+
+ l2cap_sock_init(sk, NULL);
+ bt_sock_link(&l2cap_sk_list, sk);
+ return 0;
+}
+
+static const struct proto_ops l2cap_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = l2cap_sock_release,
+ .bind = l2cap_sock_bind,
+ .connect = l2cap_sock_connect,
+ .listen = l2cap_sock_listen,
+ .accept = l2cap_sock_accept,
+ .getname = l2cap_sock_getname,
+ .sendmsg = l2cap_sock_sendmsg,
+ .recvmsg = l2cap_sock_recvmsg,
+ .poll = bt_sock_poll,
+ .ioctl = bt_sock_ioctl,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = l2cap_sock_shutdown,
+ .setsockopt = l2cap_sock_setsockopt,
+ .getsockopt = l2cap_sock_getsockopt
+};
+
+static const struct net_proto_family l2cap_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = l2cap_sock_create,
+};
+
+int __init l2cap_init_sockets(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_l2) > sizeof(struct sockaddr));
+
+ err = proto_register(&l2cap_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("L2CAP socket registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "l2cap", &l2cap_sk_list,
+ NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create L2CAP proc file");
+ bt_sock_unregister(BTPROTO_L2CAP);
+ goto error;
+ }
+
+ BT_INFO("L2CAP socket layer initialized");
+
+ return 0;
+
+error:
+ proto_unregister(&l2cap_proto);
+ return err;
+}
+
+void l2cap_cleanup_sockets(void)
+{
+ bt_procfs_cleanup(&init_net, "l2cap");
+ bt_sock_unregister(BTPROTO_L2CAP);
+ proto_unregister(&l2cap_proto);
+}
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
new file mode 100644
index 000000000..b36bc0415
--- /dev/null
+++ b/net/bluetooth/lib.c
@@ -0,0 +1,168 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth kernel library. */
+
+#define pr_fmt(fmt) "Bluetooth: " fmt
+
+#include <linux/export.h>
+
+#include <net/bluetooth/bluetooth.h>
+
+void baswap(bdaddr_t *dst, bdaddr_t *src)
+{
+ unsigned char *d = (unsigned char *) dst;
+ unsigned char *s = (unsigned char *) src;
+ unsigned int i;
+
+ for (i = 0; i < 6; i++)
+ d[i] = s[5 - i];
+}
+EXPORT_SYMBOL(baswap);
+
+/* Bluetooth error codes to Unix errno mapping */
+int bt_to_errno(__u16 code)
+{
+ switch (code) {
+ case 0:
+ return 0;
+
+ case 0x01:
+ return EBADRQC;
+
+ case 0x02:
+ return ENOTCONN;
+
+ case 0x03:
+ return EIO;
+
+ case 0x04:
+ case 0x3c:
+ return EHOSTDOWN;
+
+ case 0x05:
+ return EACCES;
+
+ case 0x06:
+ return EBADE;
+
+ case 0x07:
+ return ENOMEM;
+
+ case 0x08:
+ return ETIMEDOUT;
+
+ case 0x09:
+ return EMLINK;
+
+ case 0x0a:
+ return EMLINK;
+
+ case 0x0b:
+ return EALREADY;
+
+ case 0x0c:
+ return EBUSY;
+
+ case 0x0d:
+ case 0x0e:
+ case 0x0f:
+ return ECONNREFUSED;
+
+ case 0x10:
+ return ETIMEDOUT;
+
+ case 0x11:
+ case 0x27:
+ case 0x29:
+ case 0x20:
+ return EOPNOTSUPP;
+
+ case 0x12:
+ return EINVAL;
+
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ return ECONNRESET;
+
+ case 0x16:
+ return ECONNABORTED;
+
+ case 0x17:
+ return ELOOP;
+
+ case 0x18:
+ return EACCES;
+
+ case 0x1a:
+ return EPROTONOSUPPORT;
+
+ case 0x1b:
+ return ECONNREFUSED;
+
+ case 0x19:
+ case 0x1e:
+ case 0x23:
+ case 0x24:
+ case 0x25:
+ return EPROTO;
+
+ default:
+ return ENOSYS;
+ }
+}
+EXPORT_SYMBOL(bt_to_errno);
+
+void bt_info(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_info("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_info);
+
+void bt_err(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_err("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_err);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
new file mode 100644
index 000000000..7fd87e713
--- /dev/null
+++ b/net/bluetooth/mgmt.c
@@ -0,0 +1,8415 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2010 Nokia Corporation
+ Copyright (C) 2011-2012 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI Management interface */
+
+#include <linux/module.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_sock.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "hci_request.h"
+#include "smp.h"
+#include "mgmt_util.h"
+
+#define MGMT_VERSION 1
+#define MGMT_REVISION 9
+
+static const u16 mgmt_commands[] = {
+ MGMT_OP_READ_INDEX_LIST,
+ MGMT_OP_READ_INFO,
+ MGMT_OP_SET_POWERED,
+ MGMT_OP_SET_DISCOVERABLE,
+ MGMT_OP_SET_CONNECTABLE,
+ MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_OP_SET_BONDABLE,
+ MGMT_OP_SET_LINK_SECURITY,
+ MGMT_OP_SET_SSP,
+ MGMT_OP_SET_HS,
+ MGMT_OP_SET_LE,
+ MGMT_OP_SET_DEV_CLASS,
+ MGMT_OP_SET_LOCAL_NAME,
+ MGMT_OP_ADD_UUID,
+ MGMT_OP_REMOVE_UUID,
+ MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_OP_DISCONNECT,
+ MGMT_OP_GET_CONNECTIONS,
+ MGMT_OP_PIN_CODE_REPLY,
+ MGMT_OP_PIN_CODE_NEG_REPLY,
+ MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_OP_PAIR_DEVICE,
+ MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_OP_UNPAIR_DEVICE,
+ MGMT_OP_USER_CONFIRM_REPLY,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY,
+ MGMT_OP_USER_PASSKEY_REPLY,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ MGMT_OP_START_DISCOVERY,
+ MGMT_OP_STOP_DISCOVERY,
+ MGMT_OP_CONFIRM_NAME,
+ MGMT_OP_BLOCK_DEVICE,
+ MGMT_OP_UNBLOCK_DEVICE,
+ MGMT_OP_SET_DEVICE_ID,
+ MGMT_OP_SET_ADVERTISING,
+ MGMT_OP_SET_BREDR,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_OP_SET_SECURE_CONN,
+ MGMT_OP_SET_DEBUG_KEYS,
+ MGMT_OP_SET_PRIVACY,
+ MGMT_OP_LOAD_IRKS,
+ MGMT_OP_GET_CONN_INFO,
+ MGMT_OP_GET_CLOCK_INFO,
+ MGMT_OP_ADD_DEVICE,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_OP_READ_UNCONF_INDEX_LIST,
+ MGMT_OP_READ_CONFIG_INFO,
+ MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
+ MGMT_OP_READ_EXT_INDEX_LIST,
+ MGMT_OP_READ_ADV_FEATURES,
+ MGMT_OP_ADD_ADVERTISING,
+ MGMT_OP_REMOVE_ADVERTISING,
+};
+
+static const u16 mgmt_events[] = {
+ MGMT_EV_CONTROLLER_ERROR,
+ MGMT_EV_INDEX_ADDED,
+ MGMT_EV_INDEX_REMOVED,
+ MGMT_EV_NEW_SETTINGS,
+ MGMT_EV_CLASS_OF_DEV_CHANGED,
+ MGMT_EV_LOCAL_NAME_CHANGED,
+ MGMT_EV_NEW_LINK_KEY,
+ MGMT_EV_NEW_LONG_TERM_KEY,
+ MGMT_EV_DEVICE_CONNECTED,
+ MGMT_EV_DEVICE_DISCONNECTED,
+ MGMT_EV_CONNECT_FAILED,
+ MGMT_EV_PIN_CODE_REQUEST,
+ MGMT_EV_USER_CONFIRM_REQUEST,
+ MGMT_EV_USER_PASSKEY_REQUEST,
+ MGMT_EV_AUTH_FAILED,
+ MGMT_EV_DEVICE_FOUND,
+ MGMT_EV_DISCOVERING,
+ MGMT_EV_DEVICE_BLOCKED,
+ MGMT_EV_DEVICE_UNBLOCKED,
+ MGMT_EV_DEVICE_UNPAIRED,
+ MGMT_EV_PASSKEY_NOTIFY,
+ MGMT_EV_NEW_IRK,
+ MGMT_EV_NEW_CSRK,
+ MGMT_EV_DEVICE_ADDED,
+ MGMT_EV_DEVICE_REMOVED,
+ MGMT_EV_NEW_CONN_PARAM,
+ MGMT_EV_UNCONF_INDEX_ADDED,
+ MGMT_EV_UNCONF_INDEX_REMOVED,
+ MGMT_EV_NEW_CONFIG_OPTIONS,
+ MGMT_EV_EXT_INDEX_ADDED,
+ MGMT_EV_EXT_INDEX_REMOVED,
+ MGMT_EV_LOCAL_OOB_DATA_UPDATED,
+ MGMT_EV_ADVERTISING_ADDED,
+ MGMT_EV_ADVERTISING_REMOVED,
+};
+
+static const u16 mgmt_untrusted_commands[] = {
+ MGMT_OP_READ_INDEX_LIST,
+ MGMT_OP_READ_INFO,
+ MGMT_OP_READ_UNCONF_INDEX_LIST,
+ MGMT_OP_READ_CONFIG_INFO,
+ MGMT_OP_READ_EXT_INDEX_LIST,
+};
+
+static const u16 mgmt_untrusted_events[] = {
+ MGMT_EV_INDEX_ADDED,
+ MGMT_EV_INDEX_REMOVED,
+ MGMT_EV_NEW_SETTINGS,
+ MGMT_EV_CLASS_OF_DEV_CHANGED,
+ MGMT_EV_LOCAL_NAME_CHANGED,
+ MGMT_EV_UNCONF_INDEX_ADDED,
+ MGMT_EV_UNCONF_INDEX_REMOVED,
+ MGMT_EV_NEW_CONFIG_OPTIONS,
+ MGMT_EV_EXT_INDEX_ADDED,
+ MGMT_EV_EXT_INDEX_REMOVED,
+};
+
+#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
+
+#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+
+/* HCI to MGMT error code conversion table */
+static u8 mgmt_status_table[] = {
+ MGMT_STATUS_SUCCESS,
+ MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */
+ MGMT_STATUS_NOT_CONNECTED, /* No Connection */
+ MGMT_STATUS_FAILED, /* Hardware Failure */
+ MGMT_STATUS_CONNECT_FAILED, /* Page Timeout */
+ MGMT_STATUS_AUTH_FAILED, /* Authentication Failed */
+ MGMT_STATUS_AUTH_FAILED, /* PIN or Key Missing */
+ MGMT_STATUS_NO_RESOURCES, /* Memory Full */
+ MGMT_STATUS_TIMEOUT, /* Connection Timeout */
+ MGMT_STATUS_NO_RESOURCES, /* Max Number of Connections */
+ MGMT_STATUS_NO_RESOURCES, /* Max Number of SCO Connections */
+ MGMT_STATUS_ALREADY_CONNECTED, /* ACL Connection Exists */
+ MGMT_STATUS_BUSY, /* Command Disallowed */
+ MGMT_STATUS_NO_RESOURCES, /* Rejected Limited Resources */
+ MGMT_STATUS_REJECTED, /* Rejected Security */
+ MGMT_STATUS_REJECTED, /* Rejected Personal */
+ MGMT_STATUS_TIMEOUT, /* Host Timeout */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Feature */
+ MGMT_STATUS_INVALID_PARAMS, /* Invalid Parameters */
+ MGMT_STATUS_DISCONNECTED, /* OE User Ended Connection */
+ MGMT_STATUS_NO_RESOURCES, /* OE Low Resources */
+ MGMT_STATUS_DISCONNECTED, /* OE Power Off */
+ MGMT_STATUS_DISCONNECTED, /* Connection Terminated */
+ MGMT_STATUS_BUSY, /* Repeated Attempts */
+ MGMT_STATUS_REJECTED, /* Pairing Not Allowed */
+ MGMT_STATUS_FAILED, /* Unknown LMP PDU */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Remote Feature */
+ MGMT_STATUS_REJECTED, /* SCO Offset Rejected */
+ MGMT_STATUS_REJECTED, /* SCO Interval Rejected */
+ MGMT_STATUS_REJECTED, /* Air Mode Rejected */
+ MGMT_STATUS_INVALID_PARAMS, /* Invalid LMP Parameters */
+ MGMT_STATUS_FAILED, /* Unspecified Error */
+ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported LMP Parameter Value */
+ MGMT_STATUS_FAILED, /* Role Change Not Allowed */
+ MGMT_STATUS_TIMEOUT, /* LMP Response Timeout */
+ MGMT_STATUS_FAILED, /* LMP Error Transaction Collision */
+ MGMT_STATUS_FAILED, /* LMP PDU Not Allowed */
+ MGMT_STATUS_REJECTED, /* Encryption Mode Not Accepted */
+ MGMT_STATUS_FAILED, /* Unit Link Key Used */
+ MGMT_STATUS_NOT_SUPPORTED, /* QoS Not Supported */
+ MGMT_STATUS_TIMEOUT, /* Instant Passed */
+ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */
+ MGMT_STATUS_FAILED, /* Transaction Collision */
+ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */
+ MGMT_STATUS_REJECTED, /* QoS Rejected */
+ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */
+ MGMT_STATUS_REJECTED, /* Insufficient Security */
+ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */
+ MGMT_STATUS_BUSY, /* Role Switch Pending */
+ MGMT_STATUS_FAILED, /* Slot Violation */
+ MGMT_STATUS_FAILED, /* Role Switch Failed */
+ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */
+ MGMT_STATUS_NOT_SUPPORTED, /* Simple Pairing Not Supported */
+ MGMT_STATUS_BUSY, /* Host Busy Pairing */
+ MGMT_STATUS_REJECTED, /* Rejected, No Suitable Channel */
+ MGMT_STATUS_BUSY, /* Controller Busy */
+ MGMT_STATUS_INVALID_PARAMS, /* Unsuitable Connection Interval */
+ MGMT_STATUS_TIMEOUT, /* Directed Advertising Timeout */
+ MGMT_STATUS_AUTH_FAILED, /* Terminated Due to MIC Failure */
+ MGMT_STATUS_CONNECT_FAILED, /* Connection Establishment Failed */
+ MGMT_STATUS_CONNECT_FAILED, /* MAC Connection Failed */
+};
+
+static u8 mgmt_status(u8 hci_status)
+{
+ if (hci_status < ARRAY_SIZE(mgmt_status_table))
+ return mgmt_status_table[hci_status];
+
+ return MGMT_STATUS_FAILED;
+}
+
+static int mgmt_index_event(u16 event, struct hci_dev *hdev, void *data,
+ u16 len, int flag)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ flag, NULL);
+}
+
+static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data,
+ u16 len, int flag, struct sock *skip_sk)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ flag, skip_sk);
+}
+
+static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data,
+ u16 len, struct sock *skip_sk)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ HCI_MGMT_GENERIC_EVENTS, skip_sk);
+}
+
+static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
+ struct sock *skip_sk)
+{
+ return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len,
+ HCI_SOCK_TRUSTED, skip_sk);
+}
+
+static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_version rp;
+
+ BT_DBG("sock %p", sk);
+
+ rp.version = MGMT_VERSION;
+ rp.revision = cpu_to_le16(MGMT_REVISION);
+
+ return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0,
+ &rp, sizeof(rp));
+}
+
+static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_commands *rp;
+ u16 num_commands, num_events;
+ size_t rp_size;
+ int i, err;
+
+ BT_DBG("sock %p", sk);
+
+ if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
+ num_commands = ARRAY_SIZE(mgmt_commands);
+ num_events = ARRAY_SIZE(mgmt_events);
+ } else {
+ num_commands = ARRAY_SIZE(mgmt_untrusted_commands);
+ num_events = ARRAY_SIZE(mgmt_untrusted_events);
+ }
+
+ rp_size = sizeof(*rp) + ((num_commands + num_events) * sizeof(u16));
+
+ rp = kmalloc(rp_size, GFP_KERNEL);
+ if (!rp)
+ return -ENOMEM;
+
+ rp->num_commands = cpu_to_le16(num_commands);
+ rp->num_events = cpu_to_le16(num_events);
+
+ if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
+ __le16 *opcode = rp->opcodes;
+
+ for (i = 0; i < num_commands; i++, opcode++)
+ put_unaligned_le16(mgmt_commands[i], opcode);
+
+ for (i = 0; i < num_events; i++, opcode++)
+ put_unaligned_le16(mgmt_events[i], opcode);
+ } else {
+ __le16 *opcode = rp->opcodes;
+
+ for (i = 0; i < num_commands; i++, opcode++)
+ put_unaligned_le16(mgmt_untrusted_commands[i], opcode);
+
+ for (i = 0; i < num_events; i++, opcode++)
+ put_unaligned_le16(mgmt_untrusted_events[i], opcode);
+ }
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_COMMANDS, 0,
+ rp, rp_size);
+ kfree(rp);
+
+ return err;
+}
+
+static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_read_index_list *rp;
+ struct hci_dev *d;
+ size_t rp_len;
+ u16 count;
+ int err;
+
+ BT_DBG("sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (d->dev_type == HCI_BREDR &&
+ !hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ count++;
+ }
+
+ rp_len = sizeof(*rp) + (2 * count);
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ continue;
+
+ if (d->dev_type == HCI_BREDR &&
+ !hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ rp->index[count++] = cpu_to_le16(d->id);
+ BT_DBG("Added hci%u", d->id);
+ }
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+ rp_len = sizeof(*rp) + (2 * count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST,
+ 0, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_unconf_index_list *rp;
+ struct hci_dev *d;
+ size_t rp_len;
+ u16 count;
+ int err;
+
+ BT_DBG("sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (d->dev_type == HCI_BREDR &&
+ hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ count++;
+ }
+
+ rp_len = sizeof(*rp) + (2 * count);
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ continue;
+
+ if (d->dev_type == HCI_BREDR &&
+ hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ rp->index[count++] = cpu_to_le16(d->id);
+ BT_DBG("Added hci%u", d->id);
+ }
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+ rp_len = sizeof(*rp) + (2 * count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_READ_UNCONF_INDEX_LIST, 0, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_ext_index_list *rp;
+ struct hci_dev *d;
+ size_t rp_len;
+ u16 count;
+ int err;
+
+ BT_DBG("sock %p", sk);
+
+ read_lock(&hci_dev_list_lock);
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (d->dev_type == HCI_BREDR || d->dev_type == HCI_AMP)
+ count++;
+ }
+
+ rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * count);
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ read_unlock(&hci_dev_list_lock);
+ return -ENOMEM;
+ }
+
+ count = 0;
+ list_for_each_entry(d, &hci_dev_list, list) {
+ if (hci_dev_test_flag(d, HCI_SETUP) ||
+ hci_dev_test_flag(d, HCI_CONFIG) ||
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
+ continue;
+
+ /* Devices marked as raw-only are neither configured
+ * nor unconfigured controllers.
+ */
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ continue;
+
+ if (d->dev_type == HCI_BREDR) {
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ rp->entry[count].type = 0x01;
+ else
+ rp->entry[count].type = 0x00;
+ } else if (d->dev_type == HCI_AMP) {
+ rp->entry[count].type = 0x02;
+ } else {
+ continue;
+ }
+
+ rp->entry[count].bus = d->bus;
+ rp->entry[count++].index = cpu_to_le16(d->id);
+ BT_DBG("Added hci%u", d->id);
+ }
+
+ rp->num_controllers = cpu_to_le16(count);
+ rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * count);
+
+ read_unlock(&hci_dev_list_lock);
+
+ /* If this command is called at least once, then all the
+ * default index and unconfigured index events are disabled
+ * and from now on only extended index events are used.
+ */
+ hci_sock_set_flag(sk, HCI_MGMT_EXT_INDEX_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_INDEX_EVENTS);
+ hci_sock_clear_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+ MGMT_OP_READ_EXT_INDEX_LIST, 0, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static bool is_configured(struct hci_dev *hdev)
+{
+ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+ !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
+ return false;
+
+ if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ return false;
+
+ return true;
+}
+
+static __le32 get_missing_options(struct hci_dev *hdev)
+{
+ u32 options = 0;
+
+ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+ !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
+ options |= MGMT_OPTION_EXTERNAL_CONFIG;
+
+ if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) &&
+ !bacmp(&hdev->public_addr, BDADDR_ANY))
+ options |= MGMT_OPTION_PUBLIC_ADDRESS;
+
+ return cpu_to_le32(options);
+}
+
+static int new_options(struct hci_dev *hdev, struct sock *skip)
+{
+ __le32 options = get_missing_options(hdev);
+
+ return mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options,
+ sizeof(options), skip);
+}
+
+static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
+{
+ __le32 options = get_missing_options(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &options,
+ sizeof(options));
+}
+
+static int read_config_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_config_info rp;
+ u32 options = 0;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+ options |= MGMT_OPTION_EXTERNAL_CONFIG;
+
+ if (hdev->set_bdaddr)
+ options |= MGMT_OPTION_PUBLIC_ADDRESS;
+
+ rp.supported_options = cpu_to_le32(options);
+ rp.missing_options = get_missing_options(hdev);
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0,
+ &rp, sizeof(rp));
+}
+
+static u32 get_supported_settings(struct hci_dev *hdev)
+{
+ u32 settings = 0;
+
+ settings |= MGMT_SETTING_POWERED;
+ settings |= MGMT_SETTING_BONDABLE;
+ settings |= MGMT_SETTING_DEBUG_KEYS;
+ settings |= MGMT_SETTING_CONNECTABLE;
+ settings |= MGMT_SETTING_DISCOVERABLE;
+
+ if (lmp_bredr_capable(hdev)) {
+ if (hdev->hci_ver >= BLUETOOTH_VER_1_2)
+ settings |= MGMT_SETTING_FAST_CONNECTABLE;
+ settings |= MGMT_SETTING_BREDR;
+ settings |= MGMT_SETTING_LINK_SECURITY;
+
+ if (lmp_ssp_capable(hdev)) {
+ settings |= MGMT_SETTING_SSP;
+ settings |= MGMT_SETTING_HS;
+ }
+
+ if (lmp_sc_capable(hdev))
+ settings |= MGMT_SETTING_SECURE_CONN;
+ }
+
+ if (lmp_le_capable(hdev)) {
+ settings |= MGMT_SETTING_LE;
+ settings |= MGMT_SETTING_ADVERTISING;
+ settings |= MGMT_SETTING_SECURE_CONN;
+ settings |= MGMT_SETTING_PRIVACY;
+ settings |= MGMT_SETTING_STATIC_ADDRESS;
+ }
+
+ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
+ hdev->set_bdaddr)
+ settings |= MGMT_SETTING_CONFIGURATION;
+
+ return settings;
+}
+
+static u32 get_current_settings(struct hci_dev *hdev)
+{
+ u32 settings = 0;
+
+ if (hdev_is_powered(hdev))
+ settings |= MGMT_SETTING_POWERED;
+
+ if (hci_dev_test_flag(hdev, HCI_CONNECTABLE))
+ settings |= MGMT_SETTING_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ settings |= MGMT_SETTING_FAST_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ settings |= MGMT_SETTING_DISCOVERABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE))
+ settings |= MGMT_SETTING_BONDABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ settings |= MGMT_SETTING_BREDR;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ settings |= MGMT_SETTING_LE;
+
+ if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY))
+ settings |= MGMT_SETTING_LINK_SECURITY;
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ settings |= MGMT_SETTING_SSP;
+
+ if (hci_dev_test_flag(hdev, HCI_HS_ENABLED))
+ settings |= MGMT_SETTING_HS;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ settings |= MGMT_SETTING_ADVERTISING;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED))
+ settings |= MGMT_SETTING_SECURE_CONN;
+
+ if (hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS))
+ settings |= MGMT_SETTING_DEBUG_KEYS;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ settings |= MGMT_SETTING_PRIVACY;
+
+ /* The current setting for static address has two purposes. The
+ * first is to indicate if the static address will be used and
+ * the second is to indicate if it is actually set.
+ *
+ * This means if the static address is not configured, this flag
+ * will never be set. If the address is configured, then if the
+ * address is actually used decides if the flag is set or not.
+ *
+ * For single mode LE only controllers and dual-mode controllers
+ * with BR/EDR disabled, the existence of the static address will
+ * be evaluated.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY)) {
+ if (bacmp(&hdev->static_addr, BDADDR_ANY))
+ settings |= MGMT_SETTING_STATIC_ADDRESS;
+ }
+
+ return settings;
+}
+
+#define PNP_INFO_SVCLASS_ID 0x1200
+
+static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 4)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ u16 uuid16;
+
+ if (uuid->size != 16)
+ continue;
+
+ uuid16 = get_unaligned_le16(&uuid->uuid[12]);
+ if (uuid16 < 0x1100)
+ continue;
+
+ if (uuid16 == PNP_INFO_SVCLASS_ID)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID16_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u16) > len) {
+ uuids_start[1] = EIR_UUID16_SOME;
+ break;
+ }
+
+ *ptr++ = (uuid16 & 0x00ff);
+ *ptr++ = (uuid16 & 0xff00) >> 8;
+ uuids_start[0] += sizeof(uuid16);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 6)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 32)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID32_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u32) > len) {
+ uuids_start[1] = EIR_UUID32_SOME;
+ break;
+ }
+
+ memcpy(ptr, &uuid->uuid[12], sizeof(u32));
+ ptr += sizeof(u32);
+ uuids_start[0] += sizeof(u32);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 18)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 128)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID128_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + 16 > len) {
+ uuids_start[1] = EIR_UUID128_SOME;
+ break;
+ }
+
+ memcpy(ptr, uuid->uuid, 16);
+ ptr += 16;
+ uuids_start[0] += 16;
+ }
+
+ return ptr;
+}
+
+static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev)
+{
+ return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev);
+}
+
+static struct mgmt_pending_cmd *pending_find_data(u16 opcode,
+ struct hci_dev *hdev,
+ const void *data)
+{
+ return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data);
+}
+
+static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ u8 ad_len = 0;
+ size_t name_len;
+
+ name_len = strlen(hdev->dev_name);
+ if (name_len > 0) {
+ size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
+
+ if (name_len > max_len) {
+ name_len = max_len;
+ ptr[1] = EIR_NAME_SHORT;
+ } else
+ ptr[1] = EIR_NAME_COMPLETE;
+
+ ptr[0] = name_len + 1;
+
+ memcpy(ptr + 2, hdev->dev_name, name_len);
+
+ ad_len += (name_len + 2);
+ ptr += (name_len + 2);
+ }
+
+ return ad_len;
+}
+
+static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ /* TODO: Set the appropriate entries based on advertising instance flags
+ * here once flags other than 0 are supported.
+ */
+ memcpy(ptr, hdev->adv_instance.scan_rsp_data,
+ hdev->adv_instance.scan_rsp_len);
+
+ return hdev->adv_instance.scan_rsp_len;
+}
+
+static void update_scan_rsp_data_for_instance(struct hci_request *req,
+ u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_scan_rsp_data cp;
+ u8 len;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
+
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+}
+
+static void update_scan_rsp_data(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 instance;
+
+ /* The "Set Advertising" setting supersedes the "Add Advertising"
+ * setting. Here we set the scan response data based on which
+ * setting was set. When neither apply, default to the global settings,
+ * represented by instance "0".
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ instance = 0x01;
+ else
+ instance = 0x00;
+
+ update_scan_rsp_data_for_instance(req, instance);
+}
+
+static u8 get_adv_discov_flags(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ /* If there's a pending mgmt command the flags will not yet have
+ * their final values, so check for this first.
+ */
+ cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev);
+ if (cmd) {
+ struct mgmt_mode *cp = cmd->param;
+ if (cp->val == 0x01)
+ return LE_AD_GENERAL;
+ else if (cp->val == 0x02)
+ return LE_AD_LIMITED;
+ } else {
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ return LE_AD_LIMITED;
+ else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ return LE_AD_GENERAL;
+ }
+
+ return 0;
+}
+
+static u8 get_current_adv_instance(struct hci_dev *hdev)
+{
+ /* The "Set Advertising" setting supersedes the "Add Advertising"
+ * setting. Here we set the advertising data based on which
+ * setting was set. When neither apply, default to the global settings,
+ * represented by instance "0".
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return 0x01;
+
+ return 0x00;
+}
+
+static bool get_connectable(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ /* If there's a pending mgmt command the flag will not yet have
+ * it's final value, so check for this first.
+ */
+ cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev);
+ if (cmd) {
+ struct mgmt_mode *cp = cmd->param;
+
+ return cp->val;
+ }
+
+ return hci_dev_test_flag(hdev, HCI_CONNECTABLE);
+}
+
+static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
+{
+ u32 flags;
+
+ if (instance > 0x01)
+ return 0;
+
+ if (instance == 0x01)
+ return hdev->adv_instance.flags;
+
+ /* Instance 0 always manages the "Tx Power" and "Flags" fields */
+ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting corresponds
+ * to the "connectable" instance flag.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+
+ return flags;
+}
+
+static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
+{
+ /* Ignore instance 0 and other unsupported instances */
+ if (instance != 0x01)
+ return 0;
+
+ /* TODO: Take into account the "appearance" and "local-name" flags here.
+ * These are currently being ignored as they are not supported.
+ */
+ return hdev->adv_instance.scan_rsp_len;
+}
+
+static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+ u8 ad_len = 0, flags = 0;
+ u32 instance_flags = get_adv_instance_flags(hdev, instance);
+
+ /* The Add Advertising command allows userspace to set both the general
+ * and limited discoverable flags.
+ */
+ if (instance_flags & MGMT_ADV_FLAG_DISCOV)
+ flags |= LE_AD_GENERAL;
+
+ if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
+ flags |= LE_AD_LIMITED;
+
+ if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
+ /* If a discovery flag wasn't provided, simply use the global
+ * settings.
+ */
+ if (!flags)
+ flags |= get_adv_discov_flags(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ /* If flags would still be empty, then there is no need to
+ * include the "Flags" AD field".
+ */
+ if (flags) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_FLAGS;
+ ptr[2] = flags;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+ if (instance) {
+ memcpy(ptr, hdev->adv_instance.adv_data,
+ hdev->adv_instance.adv_data_len);
+
+ ad_len += hdev->adv_instance.adv_data_len;
+ ptr += hdev->adv_instance.adv_data_len;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID &&
+ (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)hdev->adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+
+ return ad_len;
+}
+
+static void update_adv_data_for_instance(struct hci_request *req, u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_adv_data cp;
+ u8 len;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
+
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+}
+
+static void update_adv_data(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 instance = get_current_adv_instance(hdev);
+
+ update_adv_data_for_instance(req, instance);
+}
+
+int mgmt_update_adv_data(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+ update_adv_data(&req);
+
+ return hci_req_run(&req, NULL);
+}
+
+static void create_eir(struct hci_dev *hdev, u8 *data)
+{
+ u8 *ptr = data;
+ size_t name_len;
+
+ name_len = strlen(hdev->dev_name);
+
+ if (name_len > 0) {
+ /* EIR Data type */
+ if (name_len > 48) {
+ name_len = 48;
+ ptr[1] = EIR_NAME_SHORT;
+ } else
+ ptr[1] = EIR_NAME_COMPLETE;
+
+ /* EIR Data length */
+ ptr[0] = name_len + 1;
+
+ memcpy(ptr + 2, hdev->dev_name, name_len);
+
+ ptr += (name_len + 2);
+ }
+
+ if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
+ ptr[0] = 2;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8) hdev->inq_tx_power;
+
+ ptr += 3;
+ }
+
+ if (hdev->devid_source > 0) {
+ ptr[0] = 9;
+ ptr[1] = EIR_DEVICE_ID;
+
+ put_unaligned_le16(hdev->devid_source, ptr + 2);
+ put_unaligned_le16(hdev->devid_vendor, ptr + 4);
+ put_unaligned_le16(hdev->devid_product, ptr + 6);
+ put_unaligned_le16(hdev->devid_version, ptr + 8);
+
+ ptr += 10;
+ }
+
+ ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+}
+
+static void update_eir(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_eir cp;
+
+ if (!hdev_is_powered(hdev))
+ return;
+
+ if (!lmp_ext_inq_capable(hdev))
+ return;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ create_eir(hdev, cp.data);
+
+ if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
+ return;
+
+ memcpy(hdev->eir, cp.data, sizeof(cp.data));
+
+ hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
+}
+
+static u8 get_service_classes(struct hci_dev *hdev)
+{
+ struct bt_uuid *uuid;
+ u8 val = 0;
+
+ list_for_each_entry(uuid, &hdev->uuids, list)
+ val |= uuid->svc_hint;
+
+ return val;
+}
+
+static void update_class(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 cod[3];
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hdev_is_powered(hdev))
+ return;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+ return;
+
+ cod[0] = hdev->minor_class;
+ cod[1] = hdev->major_class;
+ cod[2] = get_service_classes(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ cod[1] |= 0x20;
+
+ if (memcmp(cod, hdev->dev_class, 3) == 0)
+ return;
+
+ hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
+}
+
+static void disable_advertising(struct hci_request *req)
+{
+ u8 enable = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+}
+
+static void enable_advertising(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_adv_param cp;
+ u8 own_addr_type, enable = 0x01;
+ bool connectable;
+ u8 instance;
+ u32 flags;
+
+ if (hci_conn_num(hdev, LE_LINK) > 0)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ disable_advertising(req);
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ instance = get_current_adv_instance(hdev);
+ flags = get_adv_instance_flags(hdev, instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ get_connectable(hdev);
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used. In that case it is fine to use a
+ * non-resolvable private address.
+ */
+ if (hci_update_random_address(req, !connectable, &own_addr_type) < 0)
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval);
+ cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval);
+
+ if (connectable)
+ cp.type = LE_ADV_IND;
+ else if (get_adv_instance_scan_rsp_len(hdev, instance))
+ cp.type = LE_ADV_SCAN_IND;
+ else
+ cp.type = LE_ADV_NONCONN_IND;
+
+ cp.own_address_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+}
+
+static void service_cache_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ service_cache.work);
+ struct hci_request req;
+
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
+ return;
+
+ hci_req_init(&req, hdev);
+
+ hci_dev_lock(hdev);
+
+ update_eir(&req);
+ update_class(&req);
+
+ hci_dev_unlock(hdev);
+
+ hci_req_run(&req, NULL);
+}
+
+static void rpa_expired(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ rpa_expired.work);
+ struct hci_request req;
+
+ BT_DBG("");
+
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return;
+
+ /* The generation of a new RPA and programming it into the
+ * controller happens in the enable_advertising() function.
+ */
+ hci_req_init(&req, hdev);
+ enable_advertising(&req);
+ hci_req_run(&req, NULL);
+}
+
+static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
+{
+ if (hci_dev_test_and_set_flag(hdev, HCI_MGMT))
+ return;
+
+ INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off);
+ INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired);
+
+ /* Non-mgmt controlled devices get this bit set
+ * implicitly so that pairing works for them, however
+ * for mgmt we require user-space to explicitly enable
+ * it
+ */
+ hci_dev_clear_flag(hdev, HCI_BONDABLE);
+}
+
+static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_info rp;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+
+ bacpy(&rp.bdaddr, &hdev->bdaddr);
+
+ rp.version = hdev->hci_ver;
+ rp.manufacturer = cpu_to_le16(hdev->manufacturer);
+
+ rp.supported_settings = cpu_to_le32(get_supported_settings(hdev));
+ rp.current_settings = cpu_to_le32(get_current_settings(hdev));
+
+ memcpy(rp.dev_class, hdev->dev_class, 3);
+
+ memcpy(rp.name, hdev->dev_name, sizeof(hdev->dev_name));
+ memcpy(rp.short_name, hdev->short_name, sizeof(hdev->short_name));
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_INFO, 0, &rp,
+ sizeof(rp));
+}
+
+static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
+{
+ __le32 settings = cpu_to_le32(get_current_settings(hdev));
+
+ return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &settings,
+ sizeof(settings));
+}
+
+static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("%s status 0x%02x", hdev->name, status);
+
+ if (hci_conn_count(hdev) == 0) {
+ cancel_delayed_work(&hdev->power_off);
+ queue_work(hdev->req_workqueue, &hdev->power_off.work);
+ }
+}
+
+static bool hci_stop_discovery(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_remote_name_req_cancel cp;
+ struct inquiry_entry *e;
+
+ switch (hdev->discovery.state) {
+ case DISCOVERY_FINDING:
+ if (test_bit(HCI_INQUIRY, &hdev->flags))
+ hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ cancel_delayed_work(&hdev->le_scan_disable);
+ hci_req_add_le_scan_disable(req);
+ }
+
+ return true;
+
+ case DISCOVERY_RESOLVING:
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
+ NAME_PENDING);
+ if (!e)
+ break;
+
+ bacpy(&cp.bdaddr, &e->data.bdaddr);
+ hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp),
+ &cp);
+
+ return true;
+
+ default:
+ /* Passive scanning */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_req_add_le_scan_disable(req);
+ return true;
+ }
+
+ break;
+ }
+
+ return false;
+}
+
+static void advertising_added(struct sock *sk, struct hci_dev *hdev,
+ u8 instance)
+{
+ struct mgmt_ev_advertising_added ev;
+
+ ev.instance = instance;
+
+ mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void advertising_removed(struct sock *sk, struct hci_dev *hdev,
+ u8 instance)
+{
+ struct mgmt_ev_advertising_removed ev;
+
+ ev.instance = instance;
+
+ mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void clear_adv_instance(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ return;
+
+ if (hdev->adv_instance.timeout)
+ cancel_delayed_work(&hdev->adv_instance.timeout_exp);
+
+ memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
+ advertising_removed(NULL, hdev, 1);
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return;
+
+ hci_req_init(&req, hdev);
+ disable_advertising(&req);
+ hci_req_run(&req, NULL);
+}
+
+static int clean_up_hci_state(struct hci_dev *hdev)
+{
+ struct hci_request req;
+ struct hci_conn *conn;
+ bool discov_stopped;
+ int err;
+
+ hci_req_init(&req, hdev);
+
+ if (test_bit(HCI_ISCAN, &hdev->flags) ||
+ test_bit(HCI_PSCAN, &hdev->flags)) {
+ u8 scan = 0x00;
+ hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+ }
+
+ if (hdev->adv_instance.timeout)
+ clear_adv_instance(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ disable_advertising(&req);
+
+ discov_stopped = hci_stop_discovery(&req);
+
+ list_for_each_entry(conn, &hdev->conn_hash.list, list) {
+ struct hci_cp_disconnect dc;
+ struct hci_cp_reject_conn_req rej;
+
+ switch (conn->state) {
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ dc.handle = cpu_to_le16(conn->handle);
+ dc.reason = 0x15; /* Terminated due to Power Off */
+ hci_req_add(&req, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+ break;
+ case BT_CONNECT:
+ if (conn->type == LE_LINK)
+ hci_req_add(&req, HCI_OP_LE_CREATE_CONN_CANCEL,
+ 0, NULL);
+ else if (conn->type == ACL_LINK)
+ hci_req_add(&req, HCI_OP_CREATE_CONN_CANCEL,
+ 6, &conn->dst);
+ break;
+ case BT_CONNECT2:
+ bacpy(&rej.bdaddr, &conn->dst);
+ rej.reason = 0x15; /* Terminated due to Power Off */
+ if (conn->type == ACL_LINK)
+ hci_req_add(&req, HCI_OP_REJECT_CONN_REQ,
+ sizeof(rej), &rej);
+ else if (conn->type == SCO_LINK)
+ hci_req_add(&req, HCI_OP_REJECT_SYNC_CONN_REQ,
+ sizeof(rej), &rej);
+ break;
+ }
+ }
+
+ err = hci_req_run(&req, clean_up_hci_complete);
+ if (!err && discov_stopped)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+
+ return err;
+}
+
+static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+ cancel_delayed_work(&hdev->power_off);
+
+ if (cp->val) {
+ mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev,
+ data, len);
+ err = mgmt_powered(hdev, 1);
+ goto failed;
+ }
+ }
+
+ if (!!cp->val == hdev_is_powered(hdev)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ if (cp->val) {
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+ err = 0;
+ } else {
+ /* Disconnect connections, stop scans, etc */
+ err = clean_up_hci_state(hdev);
+ if (!err)
+ queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
+ HCI_POWER_OFF_TIMEOUT);
+
+ /* ENODATA means there were no HCI commands queued */
+ if (err == -ENODATA) {
+ cancel_delayed_work(&hdev->power_off);
+ queue_work(hdev->req_workqueue, &hdev->power_off.work);
+ err = 0;
+ }
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int new_settings(struct hci_dev *hdev, struct sock *skip)
+{
+ __le32 ev = cpu_to_le32(get_current_settings(hdev));
+
+ return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), skip);
+}
+
+int mgmt_new_settings(struct hci_dev *hdev)
+{
+ return new_settings(hdev, NULL);
+}
+
+struct cmd_lookup {
+ struct sock *sk;
+ struct hci_dev *hdev;
+ u8 mgmt_status;
+};
+
+static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct cmd_lookup *match = data;
+
+ send_settings_rsp(cmd->sk, cmd->opcode, match->hdev);
+
+ list_del(&cmd->list);
+
+ if (match->sk == NULL) {
+ match->sk = cmd->sk;
+ sock_hold(match->sk);
+ }
+
+ mgmt_pending_free(cmd);
+}
+
+static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ u8 *status = data;
+
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status);
+ mgmt_pending_remove(cmd);
+}
+
+static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ if (cmd->cmd_complete) {
+ u8 *status = data;
+
+ cmd->cmd_complete(cmd, *status);
+ mgmt_pending_remove(cmd);
+
+ return;
+ }
+
+ cmd_status_rsp(cmd, data);
+}
+
+static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+ cmd->param, cmd->param_len);
+}
+
+static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+ cmd->param, sizeof(struct mgmt_addr_info));
+}
+
+static u8 mgmt_bredr_support(struct hci_dev *hdev)
+{
+ if (!lmp_bredr_capable(hdev))
+ return MGMT_STATUS_NOT_SUPPORTED;
+ else if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return MGMT_STATUS_REJECTED;
+ else
+ return MGMT_STATUS_SUCCESS;
+}
+
+static u8 mgmt_le_support(struct hci_dev *hdev)
+{
+ if (!lmp_le_capable(hdev))
+ return MGMT_STATUS_NOT_SUPPORTED;
+ else if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return MGMT_STATUS_REJECTED;
+ else
+ return MGMT_STATUS_SUCCESS;
+}
+
+static void set_discoverable_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_mode *cp;
+ struct hci_request req;
+ bool changed;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ goto remove_cmd;
+ }
+
+ cp = cmd->param;
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE);
+
+ if (hdev->discov_timeout > 0) {
+ int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
+ queue_delayed_work(hdev->workqueue, &hdev->discov_off,
+ to);
+ }
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE);
+ }
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+
+ if (changed)
+ new_settings(hdev, cmd->sk);
+
+ /* When the discoverable mode gets changed, make sure
+ * that class of device has the limited discoverable
+ * bit correctly set. Also update page scan based on whitelist
+ * entries.
+ */
+ hci_req_init(&req, hdev);
+ __hci_update_page_scan(&req);
+ update_class(&req);
+ hci_req_run(&req, NULL);
+
+remove_cmd:
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_discoverable *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u16 timeout;
+ u8 scan;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ timeout = __le16_to_cpu(cp->timeout);
+
+ /* Disabling discoverable requires that no timeout is set,
+ * and enabling limited discoverable requires a timeout.
+ */
+ if ((cp->val == 0x00 && timeout > 0) ||
+ (cp->val == 0x02 && timeout == 0))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev) && timeout > 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_NOT_POWERED);
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
+ pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_CONNECTABLE)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_REJECTED);
+ goto failed;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed = false;
+
+ /* Setting limited discoverable when powered off is
+ * not a valid operation since it requires a timeout
+ * and so no need to check HCI_LIMITED_DISCOVERABLE.
+ */
+ if (!!cp->val != hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) {
+ hci_dev_change_flag(hdev, HCI_DISCOVERABLE);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ /* If the current mode is the same, then just update the timeout
+ * value with the new value. And if only the timeout gets updated,
+ * then no need for any HCI transactions.
+ */
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev,
+ HCI_LIMITED_DISCOVERABLE)) {
+ cancel_delayed_work(&hdev->discov_off);
+ hdev->discov_timeout = timeout;
+
+ if (cp->val && hdev->discov_timeout > 0) {
+ int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
+ queue_delayed_work(hdev->workqueue, &hdev->discov_off,
+ to);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ /* Cancel any potential discoverable timeout that might be
+ * still active and store new timeout value. The arming of
+ * the timeout happens in the complete handler.
+ */
+ cancel_delayed_work(&hdev->discov_off);
+ hdev->discov_timeout = timeout;
+
+ /* Limited discoverable mode */
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+
+ hci_req_init(&req, hdev);
+
+ /* The procedure for LE-only controllers is much simpler - just
+ * update the advertising data.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ goto update_ad;
+
+ scan = SCAN_PAGE;
+
+ if (cp->val) {
+ struct hci_cp_write_current_iac_lap hci_cp;
+
+ if (cp->val == 0x02) {
+ /* Limited discoverable mode */
+ hci_cp.num_iac = min_t(u8, hdev->num_iac, 2);
+ hci_cp.iac_lap[0] = 0x00; /* LIAC */
+ hci_cp.iac_lap[1] = 0x8b;
+ hci_cp.iac_lap[2] = 0x9e;
+ hci_cp.iac_lap[3] = 0x33; /* GIAC */
+ hci_cp.iac_lap[4] = 0x8b;
+ hci_cp.iac_lap[5] = 0x9e;
+ } else {
+ /* General discoverable mode */
+ hci_cp.num_iac = 1;
+ hci_cp.iac_lap[0] = 0x33; /* GIAC */
+ hci_cp.iac_lap[1] = 0x8b;
+ hci_cp.iac_lap[2] = 0x9e;
+ }
+
+ hci_req_add(&req, HCI_OP_WRITE_CURRENT_IAC_LAP,
+ (hci_cp.num_iac * 3) + 1, &hci_cp);
+
+ scan |= SCAN_INQUIRY;
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ }
+
+ hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan), &scan);
+
+update_ad:
+ update_adv_data(&req);
+
+ err = hci_req_run(&req, set_discoverable_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void write_fast_connectable(struct hci_request *req, bool enable)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_page_scan_activity acp;
+ u8 type;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return;
+
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return;
+
+ if (enable) {
+ type = PAGE_SCAN_TYPE_INTERLACED;
+
+ /* 160 msec page scan interval */
+ acp.interval = cpu_to_le16(0x0100);
+ } else {
+ type = PAGE_SCAN_TYPE_STANDARD; /* default */
+
+ /* default 1.28 sec page scan */
+ acp.interval = cpu_to_le16(0x0800);
+ }
+
+ acp.window = cpu_to_le16(0x0012);
+
+ if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval ||
+ __cpu_to_le16(hdev->page_scan_window) != acp.window)
+ hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ sizeof(acp), &acp);
+
+ if (hdev->page_scan_type != type)
+ hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
+}
+
+static void set_connectable_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_mode *cp;
+ bool conn_changed, discov_changed;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+ goto remove_cmd;
+ }
+
+ cp = cmd->param;
+ if (cp->val) {
+ conn_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_CONNECTABLE);
+ discov_changed = false;
+ } else {
+ conn_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_CONNECTABLE);
+ discov_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_DISCOVERABLE);
+ }
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev);
+
+ if (conn_changed || discov_changed) {
+ new_settings(hdev, cmd->sk);
+ hci_update_page_scan(hdev);
+ if (discov_changed)
+ mgmt_update_adv_data(hdev);
+ hci_update_background_scan(hdev);
+ }
+
+remove_cmd:
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_connectable_update_settings(struct hci_dev *hdev,
+ struct sock *sk, u8 val)
+{
+ bool changed = false;
+ int err;
+
+ if (!!val != hci_dev_test_flag(hdev, HCI_CONNECTABLE))
+ changed = true;
+
+ if (val) {
+ hci_dev_set_flag(hdev, HCI_CONNECTABLE);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_CONNECTABLE, hdev);
+ if (err < 0)
+ return err;
+
+ if (changed) {
+ hci_update_page_scan(hdev);
+ hci_update_background_scan(hdev);
+ return new_settings(hdev, sk);
+ }
+
+ return 0;
+}
+
+static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u8 scan;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = set_connectable_update_settings(hdev, sk, cp->val);
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
+ pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ hci_req_init(&req, hdev);
+
+ /* If BR/EDR is not enabled and we disable advertising as a
+ * by-product of disabling connectable, we need to update the
+ * advertising flags.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ if (!cp->val) {
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ }
+ update_adv_data(&req);
+ } else if (cp->val != test_bit(HCI_PSCAN, &hdev->flags)) {
+ if (cp->val) {
+ scan = SCAN_PAGE;
+ } else {
+ /* If we don't have any whitelist entries just
+ * disable all scanning. If there are entries
+ * and we had both page and inquiry scanning
+ * enabled then fall back to only page scanning.
+ * Otherwise no changes are needed.
+ */
+ if (list_empty(&hdev->whitelist))
+ scan = SCAN_DISABLED;
+ else if (test_bit(HCI_ISCAN, &hdev->flags))
+ scan = SCAN_PAGE;
+ else
+ goto no_scan_update;
+
+ if (test_bit(HCI_ISCAN, &hdev->flags) &&
+ hdev->discov_timeout > 0)
+ cancel_delayed_work(&hdev->discov_off);
+ }
+
+ hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+ }
+
+no_scan_update:
+ /* Update the advertising parameters if necessary */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ enable_advertising(&req);
+
+ err = hci_req_run(&req, set_connectable_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ if (err == -ENODATA)
+ err = set_connectable_update_settings(hdev, sk,
+ cp->val);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ bool changed;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_BONDABLE);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_BONDABLE);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_BONDABLE, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 val, status;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ status);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed = false;
+
+ if (!!cp->val != hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) {
+ hci_dev_change_flag(hdev, HCI_LINK_SECURITY);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_LINK_SECURITY, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ val = !!cp->val;
+
+ if (test_bit(HCI_AUTH, &hdev->flags) == val) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LINK_SECURITY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(val), &val);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ u8 status;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, status);
+
+ if (!lmp_ssp_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ bool changed;
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_SSP_ENABLED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_SSP_ENABLED);
+ if (!changed)
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_HS_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_SSP, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_SSP, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ if (!cp->val && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS))
+ hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(cp->val), &cp->val);
+
+ err = hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, 1, &cp->val);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ bool changed;
+ u8 status;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status);
+
+ if (!lmp_ssp_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_SET_SSP, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_HS_ENABLED);
+ } else {
+ if (hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_HS_ENABLED);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_HS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct cmd_lookup match = { NULL, hdev };
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+
+ mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp,
+ &mgmt_err);
+ goto unlock;
+ }
+
+ mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ /* Make sure the controller has a good default for
+ * advertising data. Restrict the update to when LE
+ * has actually been enabled. During power on, the
+ * update in powered_update_hci will take care of it.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+ update_adv_data(&req);
+ update_scan_rsp_data(&req);
+ __hci_update_background_scan(&req);
+ hci_req_run(&req, NULL);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct hci_cp_write_le_host_supported hci_cp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+ u8 val, enabled;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Bluetooth single mode LE only controllers or dual-mode
+ * controllers configured as LE only devices, do not allow
+ * switching LE off. These have either LE enabled explicitly
+ * or BR/EDR has been previously switched off.
+ *
+ * When trying to enable an already enabled LE, then gracefully
+ * send a positive response. Trying to disable it however will
+ * result into rejection.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ if (cp->val == 0x01)
+ return send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);
+
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_REJECTED);
+ }
+
+ hci_dev_lock(hdev);
+
+ val = !!cp->val;
+ enabled = lmp_host_le_capable(hdev);
+
+ if (!hdev_is_powered(hdev) || val == enabled) {
+ bool changed = false;
+
+ if (val != hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ hci_dev_change_flag(hdev, HCI_LE_ENABLED);
+ changed = true;
+ }
+
+ if (!val && hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+ changed = true;
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_SET_ADVERTISING, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ memset(&hci_cp, 0, sizeof(hci_cp));
+
+ if (val) {
+ hci_cp.le = val;
+ hci_cp.simul = 0x00;
+ } else {
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ disable_advertising(&req);
+ }
+
+ hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
+ &hci_cp);
+
+ err = hci_req_run(&req, le_enable_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+/* This is a helper function to test for pending mgmt commands that can
+ * cause CoD or EIR HCI commands. We can only allow one such pending
+ * mgmt command at a time since otherwise we cannot easily track what
+ * the current values are, will be, and based on that calculate if a new
+ * HCI command needs to be sent and if yes with what value.
+ */
+static bool pending_eir_or_class(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ switch (cmd->opcode) {
+ case MGMT_OP_ADD_UUID:
+ case MGMT_OP_REMOVE_UUID:
+ case MGMT_OP_SET_DEV_CLASS:
+ case MGMT_OP_SET_POWERED:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static const u8 bluetooth_base_uuid[] = {
+ 0xfb, 0x34, 0x9b, 0x5f, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+static u8 get_uuid_size(const u8 *uuid)
+{
+ u32 val;
+
+ if (memcmp(uuid, bluetooth_base_uuid, 12))
+ return 128;
+
+ val = get_unaligned_le32(&uuid[12]);
+ if (val > 0xffff)
+ return 32;
+
+ return 16;
+}
+
+static void mgmt_class_complete(struct hci_dev *hdev, u16 mgmt_op, u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(mgmt_op, hdev);
+ if (!cmd)
+ goto unlock;
+
+ mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status), hdev->dev_class, 3);
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("status 0x%02x", status);
+
+ mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status);
+}
+
+static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_cp_add_uuid *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ struct bt_uuid *uuid;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ uuid = kmalloc(sizeof(*uuid), GFP_KERNEL);
+ if (!uuid) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ memcpy(uuid->uuid, cp->uuid, 16);
+ uuid->svc_hint = cp->svc_hint;
+ uuid->size = get_uuid_size(cp->uuid);
+
+ list_add_tail(&uuid->list, &hdev->uuids);
+
+ hci_req_init(&req, hdev);
+
+ update_class(&req);
+ update_eir(&req);
+
+ err = hci_req_run(&req, add_uuid_complete);
+ if (err < 0) {
+ if (err != -ENODATA)
+ goto failed;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_UUID, 0,
+ hdev->dev_class, 3);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_ADD_UUID, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = 0;
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool enable_service_cache(struct hci_dev *hdev)
+{
+ if (!hdev_is_powered(hdev))
+ return false;
+
+ if (!hci_dev_test_and_set_flag(hdev, HCI_SERVICE_CACHE)) {
+ queue_delayed_work(hdev->workqueue, &hdev->service_cache,
+ CACHE_TIMEOUT);
+ return true;
+ }
+
+ return false;
+}
+
+static void remove_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("status 0x%02x", status);
+
+ mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status);
+}
+
+static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_remove_uuid *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct bt_uuid *match, *tmp;
+ u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ struct hci_request req;
+ int err, found;
+
+ BT_DBG("request for %s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) {
+ hci_uuids_clear(hdev);
+
+ if (enable_service_cache(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_UUID,
+ 0, hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ goto update_class;
+ }
+
+ found = 0;
+
+ list_for_each_entry_safe(match, tmp, &hdev->uuids, list) {
+ if (memcmp(match->uuid, cp->uuid, 16) != 0)
+ continue;
+
+ list_del(&match->list);
+ kfree(match);
+ found++;
+ }
+
+ if (found == 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+update_class:
+ hci_req_init(&req, hdev);
+
+ update_class(&req);
+ update_eir(&req);
+
+ err = hci_req_run(&req, remove_uuid_complete);
+ if (err < 0) {
+ if (err != -ENODATA)
+ goto unlock;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, 0,
+ hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_UUID, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = 0;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void set_class_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("status 0x%02x", status);
+
+ mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status);
+}
+
+static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_dev_class *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_bredr_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ if (pending_eir_or_class(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if ((cp->minor & 0x03) != 0 || (cp->major & 0xe0) != 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ hdev->major_class = cp->major;
+ hdev->minor_class = cp->minor;
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0,
+ hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) {
+ hci_dev_unlock(hdev);
+ cancel_delayed_work_sync(&hdev->service_cache);
+ hci_dev_lock(hdev);
+ update_eir(&req);
+ }
+
+ update_class(&req);
+
+ err = hci_req_run(&req, set_class_complete);
+ if (err < 0) {
+ if (err != -ENODATA)
+ goto unlock;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0,
+ hdev->dev_class, 3);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = 0;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_load_link_keys *cp = data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_link_key_info));
+ u16 key_count, expected_len;
+ bool changed;
+ int i;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_bredr_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ key_count = __le16_to_cpu(cp->key_count);
+ if (key_count > max_key_count) {
+ BT_ERR("load_link_keys: too big key_count value %u",
+ key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = sizeof(*cp) + key_count *
+ sizeof(struct mgmt_link_key_info);
+ if (expected_len != len) {
+ BT_ERR("load_link_keys: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ if (cp->debug_keys != 0x00 && cp->debug_keys != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys,
+ key_count);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_link_key_info *key = &cp->keys[i];
+
+ if (key->addr.type != BDADDR_BREDR || key->type > 0x08)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_LOAD_LINK_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_link_keys_clear(hdev);
+
+ if (cp->debug_keys)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_KEEP_DEBUG_KEYS);
+
+ if (changed)
+ new_settings(hdev, NULL);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_link_key_info *key = &cp->keys[i];
+
+ /* Always ignore debug keys and require a new pairing if
+ * the user wants to use them.
+ */
+ if (key->type == HCI_LK_DEBUG_COMBINATION)
+ continue;
+
+ hci_add_link_key(hdev, NULL, &key->addr.bdaddr, key->val,
+ key->type, key->pin_len, NULL);
+ }
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int device_unpaired(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, struct sock *skip_sk)
+{
+ struct mgmt_ev_device_unpaired ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+
+ return mgmt_event(MGMT_EV_DEVICE_UNPAIRED, hdev, &ev, sizeof(ev),
+ skip_sk);
+}
+
+static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_unpair_device *cp = data;
+ struct mgmt_rp_unpair_device rp;
+ struct hci_cp_disconnect dc;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ if (cp->disconnect != 0x00 && cp->disconnect != 0x01)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ /* If disconnection is requested, then look up the
+ * connection. If the remote device is connected, it
+ * will be later used to terminate the link.
+ *
+ * Setting it to NULL explicitly will cause no
+ * termination of the link.
+ */
+ if (cp->disconnect)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = NULL;
+
+ err = hci_remove_link_key(hdev, &cp->addr.bdaddr);
+ } else {
+ u8 addr_type;
+
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK,
+ &cp->addr.bdaddr);
+ if (conn) {
+ /* Defer clearing up the connection parameters
+ * until closing to give a chance of keeping
+ * them if a repairing happens.
+ */
+ set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
+
+ /* If disconnection is not requested, then
+ * clear the connection variable so that the
+ * link is not terminated.
+ */
+ if (!cp->disconnect)
+ conn = NULL;
+ }
+
+ if (cp->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type);
+
+ err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type);
+ }
+
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
+ MGMT_STATUS_NOT_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ /* If the connection variable is set, then termination of the
+ * link is requested.
+ */
+ if (!conn) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, 0,
+ &rp, sizeof(rp));
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, sk);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
+ sizeof(*cp));
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ dc.handle = cpu_to_le16(conn->handle);
+ dc.reason = 0x13; /* Remote User Terminated Connection */
+ err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_disconnect *cp = data;
+ struct mgmt_rp_disconnect rp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ BT_DBG("");
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!test_bit(HCI_UP, &hdev->flags)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_DISCONNECT, hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_BUSY, &rp, sizeof(rp));
+ goto failed;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
+
+ if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
+ MGMT_STATUS_NOT_CONNECTED, &rp,
+ sizeof(rp));
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = generic_cmd_complete;
+
+ err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
+{
+ switch (link_type) {
+ case LE_LINK:
+ switch (addr_type) {
+ case ADDR_LE_DEV_PUBLIC:
+ return BDADDR_LE_PUBLIC;
+
+ default:
+ /* Fallback to LE Random address type */
+ return BDADDR_LE_RANDOM;
+ }
+
+ default:
+ /* Fallback to BR/EDR type */
+ return BDADDR_BREDR;
+ }
+}
+
+static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_rp_get_connections *rp;
+ struct hci_conn *c;
+ size_t rp_len;
+ int err;
+ u16 i;
+
+ BT_DBG("");
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_CONNECTIONS,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ i = 0;
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
+ if (test_bit(HCI_CONN_MGMT_CONNECTED, &c->flags))
+ i++;
+ }
+
+ rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
+ rp = kmalloc(rp_len, GFP_KERNEL);
+ if (!rp) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ i = 0;
+ list_for_each_entry(c, &hdev->conn_hash.list, list) {
+ if (!test_bit(HCI_CONN_MGMT_CONNECTED, &c->flags))
+ continue;
+ bacpy(&rp->addr[i].bdaddr, &c->dst);
+ rp->addr[i].type = link_to_bdaddr(c->type, c->dst_type);
+ if (c->type == SCO_LINK || c->type == ESCO_LINK)
+ continue;
+ i++;
+ }
+
+ rp->conn_count = cpu_to_le16(i);
+
+ /* Recalculate length in case of filtered SCO connections, etc */
+ rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp,
+ rp_len);
+
+ kfree(rp);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_pin_code_neg_reply *cp)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, hdev, cp,
+ sizeof(*cp));
+ if (!cmd)
+ return -ENOMEM;
+
+ err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
+ sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+ return err;
+}
+
+static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct hci_conn *conn;
+ struct mgmt_cp_pin_code_reply *cp = data;
+ struct hci_cp_pin_code_reply reply;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ BT_DBG("");
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_NOT_POWERED);
+ goto failed;
+ }
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr);
+ if (!conn) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_NOT_CONNECTED);
+ goto failed;
+ }
+
+ if (conn->pending_sec_level == BT_SECURITY_HIGH && cp->pin_len != 16) {
+ struct mgmt_cp_pin_code_neg_reply ncp;
+
+ memcpy(&ncp.addr, &cp->addr, sizeof(ncp.addr));
+
+ BT_ERR("PIN code is not 16 bytes long");
+
+ err = send_pin_code_neg_reply(sk, hdev, &ncp);
+ if (err >= 0)
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_REPLY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ bacpy(&reply.bdaddr, &cp->addr.bdaddr);
+ reply.pin_len = cp->pin_len;
+ memcpy(reply.pin_code, cp->pin_code, sizeof(reply.pin_code));
+
+ err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_REPLY, sizeof(reply), &reply);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_io_capability *cp = data;
+
+ BT_DBG("");
+
+ if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
+ MGMT_STATUS_INVALID_PARAMS, NULL, 0);
+
+ hci_dev_lock(hdev);
+
+ hdev->io_capability = cp->io_capability;
+
+ BT_DBG("%s IO capability set to 0x%02x", hdev->name,
+ hdev->io_capability);
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 0,
+ NULL, 0);
+}
+
+static struct mgmt_pending_cmd *find_pairing(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ if (cmd->opcode != MGMT_OP_PAIR_DEVICE)
+ continue;
+
+ if (cmd->user_data != conn)
+ continue;
+
+ return cmd;
+ }
+
+ return NULL;
+}
+
+static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ struct mgmt_rp_pair_device rp;
+ struct hci_conn *conn = cmd->user_data;
+ int err;
+
+ bacpy(&rp.addr.bdaddr, &conn->dst);
+ rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+
+ err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE,
+ status, &rp, sizeof(rp));
+
+ /* So we don't get further callbacks for this connection */
+ conn->connect_cfm_cb = NULL;
+ conn->security_cfm_cb = NULL;
+ conn->disconn_cfm_cb = NULL;
+
+ hci_conn_drop(conn);
+
+ /* The device is paired so there is no need to remove
+ * its connection parameters anymore.
+ */
+ clear_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);
+
+ hci_conn_put(conn);
+
+ return err;
+}
+
+void mgmt_smp_complete(struct hci_conn *conn, bool complete)
+{
+ u8 status = complete ? MGMT_STATUS_SUCCESS : MGMT_STATUS_FAILED;
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = find_pairing(conn);
+ if (cmd) {
+ cmd->cmd_complete(cmd, status);
+ mgmt_pending_remove(cmd);
+ }
+}
+
+static void pairing_complete_cb(struct hci_conn *conn, u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status %u", status);
+
+ cmd = find_pairing(conn);
+ if (!cmd) {
+ BT_DBG("Unable to find a pending command");
+ return;
+ }
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+static void le_pairing_complete_cb(struct hci_conn *conn, u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status %u", status);
+
+ if (!status)
+ return;
+
+ cmd = find_pairing(conn);
+ if (!cmd) {
+ BT_DBG("Unable to find a pending command");
+ return;
+ }
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_pair_device *cp = data;
+ struct mgmt_rp_pair_device rp;
+ struct mgmt_pending_cmd *cmd;
+ u8 sec_level, auth_type;
+ struct hci_conn *conn;
+ int err;
+
+ BT_DBG("");
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ if (cp->io_cap > SMP_IO_KEYBOARD_DISPLAY)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (hci_bdaddr_is_paired(hdev, &cp->addr.bdaddr, cp->addr.type)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_ALREADY_PAIRED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ sec_level = BT_SECURITY_MEDIUM;
+ auth_type = HCI_AT_DEDICATED_BONDING;
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
+ auth_type);
+ } else {
+ u8 addr_type;
+
+ /* Convert from L2CAP channel address type to HCI address type
+ */
+ if (cp->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ /* When pairing a new device, it is expected to remember
+ * this device for future connections. Adding the connection
+ * parameter information ahead of time allows tracking
+ * of the slave preferred values and will speed up any
+ * further connection establishment.
+ *
+ * If connection parameters already exist, then they
+ * will be kept and this function does nothing.
+ */
+ hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
+
+ conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type,
+ sec_level, HCI_LE_CONN_TIMEOUT,
+ HCI_ROLE_MASTER);
+ }
+
+ if (IS_ERR(conn)) {
+ int status;
+
+ if (PTR_ERR(conn) == -EBUSY)
+ status = MGMT_STATUS_BUSY;
+ else if (PTR_ERR(conn) == -EOPNOTSUPP)
+ status = MGMT_STATUS_NOT_SUPPORTED;
+ else if (PTR_ERR(conn) == -ECONNREFUSED)
+ status = MGMT_STATUS_REJECTED;
+ else
+ status = MGMT_STATUS_CONNECT_FAILED;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ status, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ if (conn->connect_cfm_cb) {
+ hci_conn_drop(conn);
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
+ MGMT_STATUS_BUSY, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
+ cmd->cmd_complete = pairing_complete;
+
+ /* For LE, just connecting isn't a proof that the pairing finished */
+ if (cp->addr.type == BDADDR_BREDR) {
+ conn->connect_cfm_cb = pairing_complete_cb;
+ conn->security_cfm_cb = pairing_complete_cb;
+ conn->disconn_cfm_cb = pairing_complete_cb;
+ } else {
+ conn->connect_cfm_cb = le_pairing_complete_cb;
+ conn->security_cfm_cb = le_pairing_complete_cb;
+ conn->disconn_cfm_cb = le_pairing_complete_cb;
+ }
+
+ conn->io_capability = cp->io_cap;
+ cmd->user_data = hci_conn_get(conn);
+
+ if ((conn->state == BT_CONNECTED || conn->state == BT_CONFIG) &&
+ hci_conn_security(conn, sec_level, auth_type, true)) {
+ cmd->cmd_complete(cmd, 0);
+ mgmt_pending_remove(cmd);
+ }
+
+ err = 0;
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_addr_info *addr = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ BT_DBG("");
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ cmd = pending_find(MGMT_OP_PAIR_DEVICE, hdev);
+ if (!cmd) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ conn = cmd->user_data;
+
+ if (bacmp(&addr->bdaddr, &conn->dst) != 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ cmd->cmd_complete(cmd, MGMT_STATUS_CANCELLED);
+ mgmt_pending_remove(cmd);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0,
+ addr, sizeof(*addr));
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_addr_info *addr, u16 mgmt_op,
+ u16 hci_op, __le32 passkey)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ int err;
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_NOT_POWERED, addr,
+ sizeof(*addr));
+ goto done;
+ }
+
+ if (addr->type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &addr->bdaddr);
+
+ if (!conn) {
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_NOT_CONNECTED, addr,
+ sizeof(*addr));
+ goto done;
+ }
+
+ if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) {
+ err = smp_user_confirm_reply(conn, mgmt_op, passkey);
+ if (!err)
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_SUCCESS, addr,
+ sizeof(*addr));
+ else
+ err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
+ MGMT_STATUS_FAILED, addr,
+ sizeof(*addr));
+
+ goto done;
+ }
+
+ cmd = mgmt_pending_add(sk, mgmt_op, hdev, addr, sizeof(*addr));
+ if (!cmd) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ /* Continue with pairing via HCI */
+ if (hci_op == HCI_OP_USER_PASSKEY_REPLY) {
+ struct hci_cp_user_passkey_reply cp;
+
+ bacpy(&cp.bdaddr, &addr->bdaddr);
+ cp.passkey = passkey;
+ err = hci_send_cmd(hdev, hci_op, sizeof(cp), &cp);
+ } else
+ err = hci_send_cmd(hdev, hci_op, sizeof(addr->bdaddr),
+ &addr->bdaddr);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+done:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_pin_code_neg_reply *cp = data;
+
+ BT_DBG("");
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_PIN_CODE_NEG_REPLY,
+ HCI_OP_PIN_CODE_NEG_REPLY, 0);
+}
+
+static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_user_confirm_reply *cp = data;
+
+ BT_DBG("");
+
+ if (len != sizeof(*cp))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_CONFIRM_REPLY,
+ HCI_OP_USER_CONFIRM_REPLY, 0);
+}
+
+static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_user_confirm_neg_reply *cp = data;
+
+ BT_DBG("");
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY,
+ HCI_OP_USER_CONFIRM_NEG_REPLY, 0);
+}
+
+static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_user_passkey_reply *cp = data;
+
+ BT_DBG("");
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_PASSKEY_REPLY,
+ HCI_OP_USER_PASSKEY_REPLY, cp->passkey);
+}
+
+static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_user_passkey_neg_reply *cp = data;
+
+ BT_DBG("");
+
+ return user_pairing_resp(sk, hdev, &cp->addr,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY,
+ HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
+}
+
+static void update_name(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_local_name cp;
+
+ memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
+
+ hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp);
+}
+
+static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_cp_set_local_name *cp;
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev);
+ if (!cmd)
+ goto unlock;
+
+ cp = cmd->param;
+
+ if (status)
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
+ mgmt_status(status));
+ else
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ cp, sizeof(*cp));
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_local_name *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("");
+
+ hci_dev_lock(hdev);
+
+ /* If the old values are the same as the new ones just return a
+ * direct command complete event.
+ */
+ if (!memcmp(hdev->dev_name, cp->name, sizeof(hdev->dev_name)) &&
+ !memcmp(hdev->short_name, cp->short_name,
+ sizeof(hdev->short_name))) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ data, len);
+ goto failed;
+ }
+
+ memcpy(hdev->short_name, cp->short_name, sizeof(hdev->short_name));
+
+ if (!hdev_is_powered(hdev)) {
+ memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
+ data, len);
+ if (err < 0)
+ goto failed;
+
+ err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev,
+ data, len, sk);
+
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
+
+ hci_req_init(&req, hdev);
+
+ if (lmp_bredr_capable(hdev)) {
+ update_name(&req);
+ update_eir(&req);
+ }
+
+ /* The name is stored in the scan response data and so
+ * no need to udpate the advertising data here.
+ */
+ if (lmp_le_capable(hdev))
+ update_scan_rsp_data(&req);
+
+ err = hci_req_run(&req, set_name_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct mgmt_rp_read_local_oob_data mgmt_rp;
+ size_t rp_size = sizeof(mgmt_rp);
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("%s status %u", hdev->name, status);
+
+ cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev);
+ if (!cmd)
+ return;
+
+ if (status || !skb) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ status ? mgmt_status(status) : MGMT_STATUS_FAILED);
+ goto remove;
+ }
+
+ memset(&mgmt_rp, 0, sizeof(mgmt_rp));
+
+ if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) {
+ struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
+
+ if (skb->len < sizeof(*rp)) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+ goto remove;
+ }
+
+ memcpy(mgmt_rp.hash192, rp->hash, sizeof(rp->hash));
+ memcpy(mgmt_rp.rand192, rp->rand, sizeof(rp->rand));
+
+ rp_size -= sizeof(mgmt_rp.hash256) + sizeof(mgmt_rp.rand256);
+ } else {
+ struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data;
+
+ if (skb->len < sizeof(*rp)) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+ goto remove;
+ }
+
+ memcpy(mgmt_rp.hash192, rp->hash192, sizeof(rp->hash192));
+ memcpy(mgmt_rp.rand192, rp->rand192, sizeof(rp->rand192));
+
+ memcpy(mgmt_rp.hash256, rp->hash256, sizeof(rp->hash256));
+ memcpy(mgmt_rp.rand256, rp->rand256, sizeof(rp->rand256));
+ }
+
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_SUCCESS, &mgmt_rp, rp_size);
+
+remove:
+ mgmt_pending_remove(cmd);
+}
+
+static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_NOT_POWERED);
+ goto unlock;
+ }
+
+ if (!lmp_ssp_capable(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_NOT_SUPPORTED);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ if (bredr_sc_enabled(hdev))
+ hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL);
+ else
+ hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL);
+
+ err = hci_req_run_skb(&req, read_local_oob_data_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_addr_info *addr = data;
+ int err;
+
+ BT_DBG("%s ", hdev->name);
+
+ if (!bdaddr_type_is_valid(addr->type))
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ addr, sizeof(*addr));
+
+ hci_dev_lock(hdev);
+
+ if (len == MGMT_ADD_REMOTE_OOB_DATA_SIZE) {
+ struct mgmt_cp_add_remote_oob_data *cp = data;
+ u8 status;
+
+ if (cp->addr.type != BDADDR_BREDR) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+ goto unlock;
+ }
+
+ err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
+ cp->addr.type, cp->hash,
+ cp->rand, NULL, NULL);
+ if (err < 0)
+ status = MGMT_STATUS_FAILED;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA, status,
+ &cp->addr, sizeof(cp->addr));
+ } else if (len == MGMT_ADD_REMOTE_OOB_EXT_DATA_SIZE) {
+ struct mgmt_cp_add_remote_oob_ext_data *cp = data;
+ u8 *rand192, *hash192, *rand256, *hash256;
+ u8 status;
+
+ if (bdaddr_type_is_le(cp->addr.type)) {
+ /* Enforce zero-valued 192-bit parameters as
+ * long as legacy SMP OOB isn't implemented.
+ */
+ if (memcmp(cp->rand192, ZERO_KEY, 16) ||
+ memcmp(cp->hash192, ZERO_KEY, 16)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ addr, sizeof(*addr));
+ goto unlock;
+ }
+
+ rand192 = NULL;
+ hash192 = NULL;
+ } else {
+ /* In case one of the P-192 values is set to zero,
+ * then just disable OOB data for P-192.
+ */
+ if (!memcmp(cp->rand192, ZERO_KEY, 16) ||
+ !memcmp(cp->hash192, ZERO_KEY, 16)) {
+ rand192 = NULL;
+ hash192 = NULL;
+ } else {
+ rand192 = cp->rand192;
+ hash192 = cp->hash192;
+ }
+ }
+
+ /* In case one of the P-256 values is set to zero, then just
+ * disable OOB data for P-256.
+ */
+ if (!memcmp(cp->rand256, ZERO_KEY, 16) ||
+ !memcmp(cp->hash256, ZERO_KEY, 16)) {
+ rand256 = NULL;
+ hash256 = NULL;
+ } else {
+ rand256 = cp->rand256;
+ hash256 = cp->hash256;
+ }
+
+ err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
+ cp->addr.type, hash192, rand192,
+ hash256, rand256);
+ if (err < 0)
+ status = MGMT_STATUS_FAILED;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_REMOTE_OOB_DATA,
+ status, &cp->addr, sizeof(cp->addr));
+ } else {
+ BT_ERR("add_remote_oob_data: invalid length of %u bytes", len);
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_remove_remote_oob_data *cp = data;
+ u8 status;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (cp->addr.type != BDADDR_BREDR)
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ if (!bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ hci_remote_oob_data_clear(hdev);
+ status = MGMT_STATUS_SUCCESS;
+ goto done;
+ }
+
+ err = hci_remove_remote_oob_data(hdev, &cp->addr.bdaddr, cp->addr.type);
+ if (err < 0)
+ status = MGMT_STATUS_INVALID_PARAMS;
+ else
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+ status, &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool trigger_bredr_inquiry(struct hci_request *req, u8 *status)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_inquiry cp;
+ /* General inquiry access code (GIAC) */
+ u8 lap[3] = { 0x33, 0x8b, 0x9e };
+
+ *status = mgmt_bredr_support(hdev);
+ if (*status)
+ return false;
+
+ if (hci_dev_test_flag(hdev, HCI_INQUIRY)) {
+ *status = MGMT_STATUS_BUSY;
+ return false;
+ }
+
+ hci_inquiry_cache_flush(hdev);
+
+ memset(&cp, 0, sizeof(cp));
+ memcpy(&cp.lap, lap, sizeof(cp.lap));
+ cp.length = DISCOV_BREDR_INQUIRY_LEN;
+
+ hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
+
+ return true;
+}
+
+static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_scan_param param_cp;
+ struct hci_cp_le_set_scan_enable enable_cp;
+ u8 own_addr_type;
+ int err;
+
+ *status = mgmt_le_support(hdev);
+ if (*status)
+ return false;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
+ /* Don't let discovery abort an outgoing connection attempt
+ * that's using directed advertising.
+ */
+ if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) {
+ *status = MGMT_STATUS_REJECTED;
+ return false;
+ }
+
+ disable_advertising(req);
+ }
+
+ /* If controller is scanning, it means the background scanning is
+ * running. Thus, we should temporarily stop it in order to set the
+ * discovery scanning parameters.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ hci_req_add_le_scan_disable(req);
+
+ /* All active scans will be done with either a resolvable private
+ * address (when privacy feature has been enabled) or non-resolvable
+ * private address.
+ */
+ err = hci_update_random_address(req, true, &own_addr_type);
+ if (err < 0) {
+ *status = MGMT_STATUS_FAILED;
+ return false;
+ }
+
+ memset(&param_cp, 0, sizeof(param_cp));
+ param_cp.type = LE_SCAN_ACTIVE;
+ param_cp.interval = cpu_to_le16(interval);
+ param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
+ param_cp.own_address_type = own_addr_type;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
+ &param_cp);
+
+ memset(&enable_cp, 0, sizeof(enable_cp));
+ enable_cp.enable = LE_SCAN_ENABLE;
+ enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
+ &enable_cp);
+
+ return true;
+}
+
+static bool trigger_discovery(struct hci_request *req, u8 *status)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ switch (hdev->discovery.type) {
+ case DISCOV_TYPE_BREDR:
+ if (!trigger_bredr_inquiry(req, status))
+ return false;
+ break;
+
+ case DISCOV_TYPE_INTERLEAVED:
+ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
+ &hdev->quirks)) {
+ /* During simultaneous discovery, we double LE scan
+ * interval. We must leave some time for the controller
+ * to do BR/EDR inquiry.
+ */
+ if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT * 2,
+ status))
+ return false;
+
+ if (!trigger_bredr_inquiry(req, status))
+ return false;
+
+ return true;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ *status = MGMT_STATUS_NOT_SUPPORTED;
+ return false;
+ }
+ /* fall through */
+
+ case DISCOV_TYPE_LE:
+ if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT, status))
+ return false;
+ break;
+
+ default:
+ *status = MGMT_STATUS_INVALID_PARAMS;
+ return false;
+ }
+
+ return true;
+}
+
+static void start_discovery_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ unsigned long timeout;
+
+ BT_DBG("status %d", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_START_DISCOVERY, hdev);
+ if (!cmd)
+ cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev);
+
+ if (cmd) {
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+ }
+
+ if (status) {
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ goto unlock;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_FINDING);
+
+ /* If the scan involves LE scan, pick proper timeout to schedule
+ * hdev->le_scan_disable that will stop it.
+ */
+ switch (hdev->discovery.type) {
+ case DISCOV_TYPE_LE:
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ break;
+ case DISCOV_TYPE_INTERLEAVED:
+ /* When running simultaneous discovery, the LE scanning time
+ * should occupy the whole discovery time sine BR/EDR inquiry
+ * and LE scanning are scheduled by the controller.
+ *
+ * For interleaving discovery in comparison, BR/EDR inquiry
+ * and LE scanning are done sequentially with separate
+ * timeouts.
+ */
+ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ else
+ timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
+ break;
+ case DISCOV_TYPE_BREDR:
+ timeout = 0;
+ break;
+ default:
+ BT_ERR("Invalid discovery type %d", hdev->discovery.type);
+ timeout = 0;
+ break;
+ }
+
+ if (timeout) {
+ /* When service discovery is used and the controller has
+ * a strict duplicate filter, it is important to remember
+ * the start and duration of the scan. This is required
+ * for restarting scanning during the discovery phase.
+ */
+ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER,
+ &hdev->quirks) &&
+ hdev->discovery.result_filtering) {
+ hdev->discovery.scan_start = jiffies;
+ hdev->discovery.scan_duration = timeout;
+ }
+
+ queue_delayed_work(hdev->workqueue,
+ &hdev->le_scan_disable, timeout);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int start_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_start_discovery *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u8 status;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
+ MGMT_STATUS_NOT_POWERED,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ if (hdev->discovery.state != DISCOVERY_STOPPED ||
+ hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
+ MGMT_STATUS_BUSY, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_START_DISCOVERY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = generic_cmd_complete;
+
+ /* Clear the discovery filter first to free any previously
+ * allocated memory for the UUID list.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ hdev->discovery.type = cp->type;
+ hdev->discovery.report_invalid_rssi = false;
+
+ hci_req_init(&req, hdev);
+
+ if (!trigger_discovery(&req, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
+ status, &cp->type, sizeof(cp->type));
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ err = hci_req_run(&req, start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int service_discovery_cmd_complete(struct mgmt_pending_cmd *cmd,
+ u8 status)
+{
+ return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+ cmd->param, 1);
+}
+
+static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_start_service_discovery *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16);
+ u16 uuid_count, expected_len;
+ u8 status;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_NOT_POWERED,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
+ if (hdev->discovery.state != DISCOVERY_STOPPED ||
+ hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_BUSY, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ uuid_count = __le16_to_cpu(cp->uuid_count);
+ if (uuid_count > max_uuid_count) {
+ BT_ERR("service_discovery: too big uuid_count value %u",
+ uuid_count);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ expected_len = sizeof(*cp) + uuid_count * 16;
+ if (expected_len != len) {
+ BT_ERR("service_discovery: expected %u bytes, got %u bytes",
+ expected_len, len);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY,
+ hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ cmd->cmd_complete = service_discovery_cmd_complete;
+
+ /* Clear the discovery filter first to free any previously
+ * allocated memory for the UUID list.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ hdev->discovery.result_filtering = true;
+ hdev->discovery.type = cp->type;
+ hdev->discovery.rssi = cp->rssi;
+ hdev->discovery.uuid_count = uuid_count;
+
+ if (uuid_count > 0) {
+ hdev->discovery.uuids = kmemdup(cp->uuids, uuid_count * 16,
+ GFP_KERNEL);
+ if (!hdev->discovery.uuids) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_FAILED,
+ &cp->type, sizeof(cp->type));
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+ }
+
+ hci_req_init(&req, hdev);
+
+ if (!trigger_discovery(&req, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ status, &cp->type, sizeof(cp->type));
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ err = hci_req_run(&req, start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status %d", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_STOP_DISCOVERY, hdev);
+ if (cmd) {
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+ }
+
+ if (!status)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ hci_dev_unlock(hdev);
+}
+
+static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_stop_discovery *mgmt_cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_discovery_active(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
+ MGMT_STATUS_REJECTED, &mgmt_cp->type,
+ sizeof(mgmt_cp->type));
+ goto unlock;
+ }
+
+ if (hdev->discovery.type != mgmt_cp->type) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
+ MGMT_STATUS_INVALID_PARAMS,
+ &mgmt_cp->type, sizeof(mgmt_cp->type));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = generic_cmd_complete;
+
+ hci_req_init(&req, hdev);
+
+ hci_stop_discovery(&req);
+
+ err = hci_req_run(&req, stop_discovery_complete);
+ if (!err) {
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+ goto unlock;
+ }
+
+ mgmt_pending_remove(cmd);
+
+ /* If no HCI commands were sent we're done */
+ if (err == -ENODATA) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0,
+ &mgmt_cp->type, sizeof(mgmt_cp->type));
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_confirm_name *cp = data;
+ struct inquiry_entry *e;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (!hci_discovery_active(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
+ goto failed;
+ }
+
+ e = hci_inquiry_cache_lookup_unknown(hdev, &cp->addr.bdaddr);
+ if (!e) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
+ MGMT_STATUS_INVALID_PARAMS, &cp->addr,
+ sizeof(cp->addr));
+ goto failed;
+ }
+
+ if (cp->name_known) {
+ e->name_state = NAME_KNOWN;
+ list_del(&e->list);
+ } else {
+ e->name_state = NAME_NEEDED;
+ hci_inquiry_cache_update_resolve(hdev, e);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0,
+ &cp->addr, sizeof(cp->addr));
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int block_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_block_device *cp = data;
+ u8 status;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_add(&hdev->blacklist, &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err < 0) {
+ status = MGMT_STATUS_FAILED;
+ goto done;
+ }
+
+ mgmt_event(MGMT_EV_DEVICE_BLOCKED, hdev, &cp->addr, sizeof(cp->addr),
+ sk);
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_unblock_device *cp = data;
+ u8 status;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_lock(hdev);
+
+ err = hci_bdaddr_list_del(&hdev->blacklist, &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err < 0) {
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ mgmt_event(MGMT_EV_DEVICE_UNBLOCKED, hdev, &cp->addr, sizeof(cp->addr),
+ sk);
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_device_id *cp = data;
+ struct hci_request req;
+ int err;
+ __u16 source;
+
+ BT_DBG("%s", hdev->name);
+
+ source = __le16_to_cpu(cp->source);
+
+ if (source > 0x0002)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEVICE_ID,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ hdev->devid_source = source;
+ hdev->devid_vendor = __le16_to_cpu(cp->vendor);
+ hdev->devid_product = __le16_to_cpu(cp->product);
+ hdev->devid_version = __le16_to_cpu(cp->version);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0,
+ NULL, 0);
+
+ hci_req_init(&req, hdev);
+ update_eir(&req);
+ hci_req_run(&req, NULL);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void enable_advertising_instance(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ BT_DBG("status %d", status);
+}
+
+static void set_advertising_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ struct hci_request req;
+
+ hci_dev_lock(hdev);
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+
+ mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev,
+ cmd_status_rsp, &mgmt_err);
+ goto unlock;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_dev_set_flag(hdev, HCI_ADVERTISING);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING);
+
+ mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp,
+ &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ /* If "Set Advertising" was just disabled and instance advertising was
+ * set up earlier, then enable the advertising instance.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ goto unlock;
+
+ hci_req_init(&req, hdev);
+
+ update_adv_data(&req);
+ enable_advertising(&req);
+
+ if (hci_req_run(&req, enable_advertising_instance) < 0)
+ BT_ERR("Failed to re-configure advertising");
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u8 val, status;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ status);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ val = !!cp->val;
+
+ /* The following conditions are ones which mean that we should
+ * not do any HCI communication but directly send a mgmt
+ * response to user space (after toggling the flag if
+ * necessary).
+ */
+ if (!hdev_is_powered(hdev) ||
+ (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) ||
+ hci_conn_num(hdev, LE_LINK) > 0 ||
+ (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->le_scan_type == LE_SCAN_ACTIVE)) {
+ bool changed;
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING);
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_ADVERTISING);
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_ADVERTISING, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_SET_LE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_ADVERTISING, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+
+ if (val) {
+ /* Switch to instance "0" for the Set Advertising setting. */
+ update_adv_data_for_instance(&req, 0);
+ update_scan_rsp_data_for_instance(&req, 0);
+ enable_advertising(&req);
+ } else {
+ disable_advertising(&req);
+ }
+
+ err = hci_req_run(&req, set_advertising_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_static_address(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_static_address *cp = data;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_REJECTED);
+
+ if (bacmp(&cp->bdaddr, BDADDR_ANY)) {
+ if (!bacmp(&cp->bdaddr, BDADDR_NONE))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Two most significant bits shall be set */
+ if ((cp->bdaddr.b[5] & 0xc0) != 0xc0)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_STATIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ bacpy(&hdev->static_addr, &cp->bdaddr);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_STATIC_ADDRESS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_scan_params(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_scan_params *cp = data;
+ __u16 interval, window;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ interval = __le16_to_cpu(cp->interval);
+
+ if (interval < 0x0004 || interval > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ window = __le16_to_cpu(cp->window);
+
+ if (window < 0x0004 || window > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (window > interval)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_interval = interval;
+ hdev->le_scan_window = window;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0,
+ NULL, 0);
+
+ /* If background scan is running, restart it so new parameters are
+ * loaded.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+ hdev->discovery.state == DISCOVERY_STOPPED) {
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add_le_scan_disable(&req);
+ hci_req_add_le_passive_scan(&req);
+
+ hci_req_run(&req, NULL);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void fast_connectable_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ mgmt_status(status));
+ } else {
+ struct mgmt_mode *cp = cmd->param;
+
+ if (cp->val)
+ hci_dev_set_flag(hdev, HCI_FAST_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
+ new_settings(hdev, cmd->sk);
+ }
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
+ hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (!!cp->val == hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE,
+ hdev);
+ goto unlock;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ hci_dev_change_flag(hdev, HCI_FAST_CONNECTABLE);
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE,
+ hdev);
+ new_settings(hdev, sk);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev,
+ data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ write_fast_connectable(&req, cp->val);
+
+ err = hci_req_run(&req, fast_connectable_complete);
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_FAILED);
+ mgmt_pending_remove(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_BREDR, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+
+ /* We need to restore the flag if related HCI commands
+ * failed.
+ */
+ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
+
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+ } else {
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev);
+ new_settings(hdev, cmd->sk);
+ }
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val == hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev);
+ goto unlock;
+ }
+
+ if (!hdev_is_powered(hdev)) {
+ if (!cp->val) {
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_LINK_SECURITY);
+ hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
+ hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
+ }
+
+ hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev);
+ if (err < 0)
+ goto unlock;
+
+ err = new_settings(hdev, sk);
+ goto unlock;
+ }
+
+ /* Reject disabling when powered on */
+ if (!cp->val) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ } else {
+ /* When configuring a dual-mode controller to operate
+ * with LE only and using a static address, then switching
+ * BR/EDR back on is not allowed.
+ *
+ * Dual-mode controllers shall operate with the public
+ * address as its identity address for BR/EDR and LE. So
+ * reject the attempt to create an invalid configuration.
+ *
+ * The same restrictions applies when secure connections
+ * has been enabled. For BR/EDR this is a controller feature
+ * while for LE it is a host stack feature. This means that
+ * switching BR/EDR back on when secure connections has been
+ * enabled is not a supported transaction.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ (bacmp(&hdev->static_addr, BDADDR_ANY) ||
+ hci_dev_test_flag(hdev, HCI_SC_ENABLED))) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+ }
+
+ if (pending_find(MGMT_OP_SET_BREDR, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_BREDR, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ /* We need to flip the bit already here so that update_adv_data
+ * generates the correct flags.
+ */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
+
+ hci_req_init(&req, hdev);
+
+ write_fast_connectable(&req, false);
+ __hci_update_page_scan(&req);
+
+ /* Since only the advertising data flags will change, there
+ * is no need to update the scan response data.
+ */
+ update_adv_data(&req);
+
+ err = hci_req_run(&req, set_bredr_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_mode *cp;
+
+ BT_DBG("%s status %u", hdev->name, status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_SECURE_CONN, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status));
+ goto remove;
+ }
+
+ cp = cmd->param;
+
+ switch (cp->val) {
+ case 0x00:
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ break;
+ case 0x01:
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ break;
+ case 0x02:
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ hci_dev_set_flag(hdev, HCI_SC_ONLY);
+ break;
+ }
+
+ send_settings_rsp(cmd->sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ new_settings(hdev, cmd->sk);
+
+remove:
+ mgmt_pending_remove(cmd);
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u8 val;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_sc_capable(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ lmp_sc_capable(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev) || !lmp_sc_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ bool changed;
+
+ if (cp->val) {
+ changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_SC_ENABLED);
+ if (cp->val == 0x02)
+ hci_dev_set_flag(hdev, HCI_SC_ONLY);
+ else
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_SC_ENABLED);
+ hci_dev_clear_flag(hdev, HCI_SC_ONLY);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ if (err < 0)
+ goto failed;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+ goto failed;
+ }
+
+ if (pending_find(MGMT_OP_SET_SECURE_CONN, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
+ val = !!cp->val;
+
+ if (val == hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ goto failed;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ hci_req_init(&req, hdev);
+ hci_req_add(&req, HCI_OP_WRITE_SC_SUPPORT, 1, &val);
+ err = hci_req_run(&req, sc_enable_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
+failed:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_debug_keys(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_mode *cp = data;
+ bool changed, use_changed;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (cp->val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_KEEP_DEBUG_KEYS);
+
+ if (cp->val == 0x02)
+ use_changed = !hci_dev_test_and_set_flag(hdev,
+ HCI_USE_DEBUG_KEYS);
+ else
+ use_changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_USE_DEBUG_KEYS);
+
+ if (hdev_is_powered(hdev) && use_changed &&
+ hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ u8 mode = (cp->val == 0x02) ? 0x01 : 0x00;
+ hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(mode), &mode);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_DEBUG_KEYS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
+ u16 len)
+{
+ struct mgmt_cp_set_privacy *cp = cp_data;
+ bool changed;
+ int err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (cp->privacy != 0x00 && cp->privacy != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+
+ /* If user space supports this command it is also expected to
+ * handle IRKs. Therefore, set the HCI_RPA_RESOLVING flag.
+ */
+ hci_dev_set_flag(hdev, HCI_RPA_RESOLVING);
+
+ if (cp->privacy) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
+ memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
+ memset(hdev->irk, 0, sizeof(hdev->irk));
+ hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ }
+
+ err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (changed)
+ err = new_settings(hdev, sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool irk_is_valid(struct mgmt_irk_info *irk)
+{
+ switch (irk->addr.type) {
+ case BDADDR_LE_PUBLIC:
+ return true;
+
+ case BDADDR_LE_RANDOM:
+ /* Two most significant bits shall be set */
+ if ((irk->addr.bdaddr.b[5] & 0xc0) != 0xc0)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
+ u16 len)
+{
+ struct mgmt_cp_load_irks *cp = cp_data;
+ const u16 max_irk_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_irk_info));
+ u16 irk_count, expected_len;
+ int i, err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ irk_count = __le16_to_cpu(cp->irk_count);
+ if (irk_count > max_irk_count) {
+ BT_ERR("load_irks: too big irk_count value %u", irk_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info);
+ if (expected_len != len) {
+ BT_ERR("load_irks: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ BT_DBG("%s irk_count %u", hdev->name, irk_count);
+
+ for (i = 0; i < irk_count; i++) {
+ struct mgmt_irk_info *key = &cp->irks[i];
+
+ if (!irk_is_valid(key))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_LOAD_IRKS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_smp_irks_clear(hdev);
+
+ for (i = 0; i < irk_count; i++) {
+ struct mgmt_irk_info *irk = &cp->irks[i];
+ u8 addr_type;
+
+ if (irk->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ hci_add_irk(hdev, &irk->addr.bdaddr, addr_type, irk->val,
+ BDADDR_ANY);
+ }
+
+ hci_dev_set_flag(hdev, HCI_RPA_RESOLVING);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static bool ltk_is_valid(struct mgmt_ltk_info *key)
+{
+ if (key->master != 0x00 && key->master != 0x01)
+ return false;
+
+ switch (key->addr.type) {
+ case BDADDR_LE_PUBLIC:
+ return true;
+
+ case BDADDR_LE_RANDOM:
+ /* Two most significant bits shall be set */
+ if ((key->addr.bdaddr.b[5] & 0xc0) != 0xc0)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
+ void *cp_data, u16 len)
+{
+ struct mgmt_cp_load_long_term_keys *cp = cp_data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_ltk_info));
+ u16 key_count, expected_len;
+ int i, err;
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ key_count = __le16_to_cpu(cp->key_count);
+ if (key_count > max_key_count) {
+ BT_ERR("load_ltks: too big key_count value %u", key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = sizeof(*cp) + key_count *
+ sizeof(struct mgmt_ltk_info);
+ if (expected_len != len) {
+ BT_ERR("load_keys: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ BT_DBG("%s key_count %u", hdev->name, key_count);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_ltk_info *key = &cp->keys[i];
+
+ if (!ltk_is_valid(key))
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_LOAD_LONG_TERM_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_smp_ltks_clear(hdev);
+
+ for (i = 0; i < key_count; i++) {
+ struct mgmt_ltk_info *key = &cp->keys[i];
+ u8 type, addr_type, authenticated;
+
+ if (key->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ switch (key->type) {
+ case MGMT_LTK_UNAUTHENTICATED:
+ authenticated = 0x00;
+ type = key->master ? SMP_LTK : SMP_LTK_SLAVE;
+ break;
+ case MGMT_LTK_AUTHENTICATED:
+ authenticated = 0x01;
+ type = key->master ? SMP_LTK : SMP_LTK_SLAVE;
+ break;
+ case MGMT_LTK_P256_UNAUTH:
+ authenticated = 0x00;
+ type = SMP_LTK_P256;
+ break;
+ case MGMT_LTK_P256_AUTH:
+ authenticated = 0x01;
+ type = SMP_LTK_P256;
+ break;
+ case MGMT_LTK_P256_DEBUG:
+ authenticated = 0x00;
+ type = SMP_LTK_P256_DEBUG;
+ default:
+ continue;
+ }
+
+ hci_add_ltk(hdev, &key->addr.bdaddr, addr_type, type,
+ authenticated, key->val, key->enc_size, key->ediv,
+ key->rand);
+ }
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0,
+ NULL, 0);
+
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static int conn_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ struct hci_conn *conn = cmd->user_data;
+ struct mgmt_rp_get_conn_info rp;
+ int err;
+
+ memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
+
+ if (status == MGMT_STATUS_SUCCESS) {
+ rp.rssi = conn->rssi;
+ rp.tx_power = conn->tx_power;
+ rp.max_tx_power = conn->max_tx_power;
+ } else {
+ rp.rssi = HCI_RSSI_INVALID;
+ rp.tx_power = HCI_TX_POWER_INVALID;
+ rp.max_tx_power = HCI_TX_POWER_INVALID;
+ }
+
+ err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO,
+ status, &rp, sizeof(rp));
+
+ hci_conn_drop(conn);
+ hci_conn_put(conn);
+
+ return err;
+}
+
+static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status,
+ u16 opcode)
+{
+ struct hci_cp_read_rssi *cp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+ u16 handle;
+ u8 status;
+
+ BT_DBG("status 0x%02x", hci_status);
+
+ hci_dev_lock(hdev);
+
+ /* Commands sent in request are either Read RSSI or Read Transmit Power
+ * Level so we check which one was last sent to retrieve connection
+ * handle. Both commands have handle as first parameter so it's safe to
+ * cast data on the same command struct.
+ *
+ * First command sent is always Read RSSI and we fail only if it fails.
+ * In other case we simply override error to indicate success as we
+ * already remembered if TX power value is actually valid.
+ */
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_RSSI);
+ if (!cp) {
+ cp = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER);
+ status = MGMT_STATUS_SUCCESS;
+ } else {
+ status = mgmt_status(hci_status);
+ }
+
+ if (!cp) {
+ BT_ERR("invalid sent_cmd in conn_info response");
+ goto unlock;
+ }
+
+ handle = __le16_to_cpu(cp->handle);
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn) {
+ BT_ERR("unknown handle (%d) in conn_info response", handle);
+ goto unlock;
+ }
+
+ cmd = pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn);
+ if (!cmd)
+ goto unlock;
+
+ cmd->cmd_complete(cmd, status);
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_get_conn_info *cp = data;
+ struct mgmt_rp_get_conn_info rp;
+ struct hci_conn *conn;
+ unsigned long conn_info_age;
+ int err = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
+
+ if (!conn || conn->state != BT_CONNECTED) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_NOT_CONNECTED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_BUSY, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ /* To avoid client trying to guess when to poll again for information we
+ * calculate conn info age as random value between min/max set in hdev.
+ */
+ conn_info_age = hdev->conn_info_min_age +
+ prandom_u32_max(hdev->conn_info_max_age -
+ hdev->conn_info_min_age);
+
+ /* Query controller to refresh cached values if they are too old or were
+ * never read.
+ */
+ if (time_after(jiffies, conn->conn_info_timestamp +
+ msecs_to_jiffies(conn_info_age)) ||
+ !conn->conn_info_timestamp) {
+ struct hci_request req;
+ struct hci_cp_read_tx_power req_txp_cp;
+ struct hci_cp_read_rssi req_rssi_cp;
+ struct mgmt_pending_cmd *cmd;
+
+ hci_req_init(&req, hdev);
+ req_rssi_cp.handle = cpu_to_le16(conn->handle);
+ hci_req_add(&req, HCI_OP_READ_RSSI, sizeof(req_rssi_cp),
+ &req_rssi_cp);
+
+ /* For LE links TX power does not change thus we don't need to
+ * query for it once value is known.
+ */
+ if (!bdaddr_type_is_le(cp->addr.type) ||
+ conn->tx_power == HCI_TX_POWER_INVALID) {
+ req_txp_cp.handle = cpu_to_le16(conn->handle);
+ req_txp_cp.type = 0x00;
+ hci_req_add(&req, HCI_OP_READ_TX_POWER,
+ sizeof(req_txp_cp), &req_txp_cp);
+ }
+
+ /* Max TX power needs to be read only once per connection */
+ if (conn->max_tx_power == HCI_TX_POWER_INVALID) {
+ req_txp_cp.handle = cpu_to_le16(conn->handle);
+ req_txp_cp.type = 0x01;
+ hci_req_add(&req, HCI_OP_READ_TX_POWER,
+ sizeof(req_txp_cp), &req_txp_cp);
+ }
+
+ err = hci_req_run(&req, conn_info_refresh_complete);
+ if (err < 0)
+ goto unlock;
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_GET_CONN_INFO, hdev,
+ data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_conn_hold(conn);
+ cmd->user_data = hci_conn_get(conn);
+ cmd->cmd_complete = conn_info_cmd_complete;
+
+ conn->conn_info_timestamp = jiffies;
+ } else {
+ /* Cache is valid, just reply with values cached in hci_conn */
+ rp.rssi = conn->rssi;
+ rp.tx_power = conn->tx_power;
+ rp.max_tx_power = conn->max_tx_power;
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+{
+ struct hci_conn *conn = cmd->user_data;
+ struct mgmt_rp_get_clock_info rp;
+ struct hci_dev *hdev;
+ int err;
+
+ memset(&rp, 0, sizeof(rp));
+ memcpy(&rp.addr, &cmd->param, sizeof(rp.addr));
+
+ if (status)
+ goto complete;
+
+ hdev = hci_dev_get(cmd->index);
+ if (hdev) {
+ rp.local_clock = cpu_to_le32(hdev->clock);
+ hci_dev_put(hdev);
+ }
+
+ if (conn) {
+ rp.piconet_clock = cpu_to_le32(conn->clock);
+ rp.accuracy = cpu_to_le16(conn->clock_accuracy);
+ }
+
+complete:
+ err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp,
+ sizeof(rp));
+
+ if (conn) {
+ hci_conn_drop(conn);
+ hci_conn_put(conn);
+ }
+
+ return err;
+}
+
+static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct hci_cp_read_clock *hci_cp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status %u", hdev->name, status);
+
+ hci_dev_lock(hdev);
+
+ hci_cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK);
+ if (!hci_cp)
+ goto unlock;
+
+ if (hci_cp->which) {
+ u16 handle = __le16_to_cpu(hci_cp->handle);
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ } else {
+ conn = NULL;
+ }
+
+ cmd = pending_find_data(MGMT_OP_GET_CLOCK_INFO, hdev, conn);
+ if (!cmd)
+ goto unlock;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_get_clock_info *cp = data;
+ struct mgmt_rp_get_clock_info rp;
+ struct hci_cp_read_clock hci_cp;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ struct hci_conn *conn;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ memset(&rp, 0, sizeof(rp));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+
+ if (cp->addr.type != BDADDR_BREDR)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_INVALID_PARAMS,
+ &rp, sizeof(rp));
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_NOT_POWERED, &rp,
+ sizeof(rp));
+ goto unlock;
+ }
+
+ if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ if (!conn || conn->state != BT_CONNECTED) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_NOT_CONNECTED,
+ &rp, sizeof(rp));
+ goto unlock;
+ }
+ } else {
+ conn = NULL;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = clock_info_cmd_complete;
+
+ hci_req_init(&req, hdev);
+
+ memset(&hci_cp, 0, sizeof(hci_cp));
+ hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp);
+
+ if (conn) {
+ hci_conn_hold(conn);
+ cmd->user_data = hci_conn_get(conn);
+
+ hci_cp.handle = cpu_to_le16(conn->handle);
+ hci_cp.which = 0x01; /* Piconet clock */
+ hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp);
+ }
+
+ err = hci_req_run(&req, get_clock_info_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
+ if (!conn)
+ return false;
+
+ if (conn->dst_type != type)
+ return false;
+
+ if (conn->state != BT_CONNECTED)
+ return false;
+
+ return true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr,
+ u8 addr_type, u8 auto_connect)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_add(hdev, addr, addr_type);
+ if (!params)
+ return -EIO;
+
+ if (params->auto_connect == auto_connect)
+ return 0;
+
+ list_del_init(&params->action);
+
+ switch (auto_connect) {
+ case HCI_AUTO_CONN_DISABLED:
+ case HCI_AUTO_CONN_LINK_LOSS:
+ __hci_update_background_scan(req);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ list_add(&params->action, &hdev->pend_le_reports);
+ __hci_update_background_scan(req);
+ break;
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ if (!is_connected(hdev, addr, addr_type)) {
+ list_add(&params->action, &hdev->pend_le_conns);
+ __hci_update_background_scan(req);
+ }
+ break;
+ }
+
+ params->auto_connect = auto_connect;
+
+ BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
+ auto_connect);
+
+ return 0;
+}
+
+static void device_added(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 type, u8 action)
+{
+ struct mgmt_ev_device_added ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = type;
+ ev.action = action;
+
+ mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void add_device_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_ADD_DEVICE, hdev);
+ if (!cmd)
+ goto unlock;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int add_device(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_add_device *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u8 auto_conn, addr_type;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!bdaddr_type_is_valid(cp->addr.type) ||
+ !bacmp(&cp->addr.bdaddr, BDADDR_ANY))
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ if (cp->action != 0x00 && cp->action != 0x01 && cp->action != 0x02)
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
+
+ hci_req_init(&req, hdev);
+
+ hci_dev_lock(hdev);
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_ADD_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ /* Only incoming connections action is supported for now */
+ if (cp->action != 0x01) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err)
+ goto unlock;
+
+ __hci_update_page_scan(&req);
+
+ goto added;
+ }
+
+ if (cp->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ if (cp->action == 0x02)
+ auto_conn = HCI_AUTO_CONN_ALWAYS;
+ else if (cp->action == 0x01)
+ auto_conn = HCI_AUTO_CONN_DIRECT;
+ else
+ auto_conn = HCI_AUTO_CONN_REPORT;
+
+ /* If the connection parameters don't exist for this device,
+ * they will be created and configured with defaults.
+ */
+ if (hci_conn_params_set(&req, &cp->addr.bdaddr, addr_type,
+ auto_conn) < 0) {
+ err = cmd->cmd_complete(cmd, MGMT_STATUS_FAILED);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+added:
+ device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action);
+
+ err = hci_req_run(&req, add_device_complete);
+ if (err < 0) {
+ /* ENODATA means no HCI commands were needed (e.g. if
+ * the adapter is powered off).
+ */
+ if (err == -ENODATA)
+ err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS);
+ mgmt_pending_remove(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void device_removed(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct mgmt_ev_device_removed ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = type;
+
+ mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+
+static void remove_device_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_REMOVE_DEVICE, hdev);
+ if (!cmd)
+ goto unlock;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int remove_device(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_remove_device *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_req_init(&req, hdev);
+
+ hci_dev_lock(hdev);
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ cmd->cmd_complete = addr_cmd_complete;
+
+ if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
+ struct hci_conn_params *params;
+ u8 addr_type;
+
+ if (!bdaddr_type_is_valid(cp->addr.type)) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ err = hci_bdaddr_list_del(&hdev->whitelist,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+ if (err) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ __hci_update_page_scan(&req);
+
+ device_removed(sk, hdev, &cp->addr.bdaddr,
+ cp->addr.type);
+ goto complete;
+ }
+
+ if (cp->addr.type == BDADDR_LE_PUBLIC)
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ else
+ addr_type = ADDR_LE_DEV_RANDOM;
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ addr_type);
+ if (!params) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ if (params->auto_connect == HCI_AUTO_CONN_DISABLED) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ list_del(&params->action);
+ list_del(&params->list);
+ kfree(params);
+ __hci_update_background_scan(&req);
+
+ device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type);
+ } else {
+ struct hci_conn_params *p, *tmp;
+ struct bdaddr_list *b, *btmp;
+
+ if (cp->addr.type) {
+ err = cmd->cmd_complete(cmd,
+ MGMT_STATUS_INVALID_PARAMS);
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
+
+ list_for_each_entry_safe(b, btmp, &hdev->whitelist, list) {
+ device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type);
+ list_del(&b->list);
+ kfree(b);
+ }
+
+ __hci_update_page_scan(&req);
+
+ list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) {
+ if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
+ continue;
+ device_removed(sk, hdev, &p->addr, p->addr_type);
+ list_del(&p->action);
+ list_del(&p->list);
+ kfree(p);
+ }
+
+ BT_DBG("All LE connection parameters were removed");
+
+ __hci_update_background_scan(&req);
+ }
+
+complete:
+ err = hci_req_run(&req, remove_device_complete);
+ if (err < 0) {
+ /* ENODATA means no HCI commands were needed (e.g. if
+ * the adapter is powered off).
+ */
+ if (err == -ENODATA)
+ err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS);
+ mgmt_pending_remove(cmd);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_load_conn_param *cp = data;
+ const u16 max_param_count = ((U16_MAX - sizeof(*cp)) /
+ sizeof(struct mgmt_conn_param));
+ u16 param_count, expected_len;
+ int i;
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ param_count = __le16_to_cpu(cp->param_count);
+ if (param_count > max_param_count) {
+ BT_ERR("load_conn_param: too big param_count value %u",
+ param_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = sizeof(*cp) + param_count *
+ sizeof(struct mgmt_conn_param);
+ if (expected_len != len) {
+ BT_ERR("load_conn_param: expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ BT_DBG("%s param_count %u", hdev->name, param_count);
+
+ hci_dev_lock(hdev);
+
+ hci_conn_params_clear_disabled(hdev);
+
+ for (i = 0; i < param_count; i++) {
+ struct mgmt_conn_param *param = &cp->params[i];
+ struct hci_conn_params *hci_param;
+ u16 min, max, latency, timeout;
+ u8 addr_type;
+
+ BT_DBG("Adding %pMR (type %u)", &param->addr.bdaddr,
+ param->addr.type);
+
+ if (param->addr.type == BDADDR_LE_PUBLIC) {
+ addr_type = ADDR_LE_DEV_PUBLIC;
+ } else if (param->addr.type == BDADDR_LE_RANDOM) {
+ addr_type = ADDR_LE_DEV_RANDOM;
+ } else {
+ BT_ERR("Ignoring invalid connection parameters");
+ continue;
+ }
+
+ min = le16_to_cpu(param->min_interval);
+ max = le16_to_cpu(param->max_interval);
+ latency = le16_to_cpu(param->latency);
+ timeout = le16_to_cpu(param->timeout);
+
+ BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x",
+ min, max, latency, timeout);
+
+ if (hci_check_conn_params(min, max, latency, timeout) < 0) {
+ BT_ERR("Ignoring invalid connection parameters");
+ continue;
+ }
+
+ hci_param = hci_conn_params_add(hdev, &param->addr.bdaddr,
+ addr_type);
+ if (!hci_param) {
+ BT_ERR("Failed to add connection parameters");
+ continue;
+ }
+
+ hci_param->conn_min_interval = min;
+ hci_param->conn_max_interval = max;
+ hci_param->conn_latency = latency;
+ hci_param->supervision_timeout = timeout;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0,
+ NULL, 0);
+}
+
+static int set_external_config(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_external_config *cp = data;
+ bool changed;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->config != 0x00 && cp->config != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ if (cp->config)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_EXT_CONFIGURED);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_EXT_CONFIGURED);
+
+ err = send_options_rsp(sk, MGMT_OP_SET_EXTERNAL_CONFIG, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (!changed)
+ goto unlock;
+
+ err = new_options(hdev, sk);
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) == is_configured(hdev)) {
+ mgmt_index_removed(hdev);
+
+ if (hci_dev_test_and_change_flag(hdev, HCI_UNCONFIGURED)) {
+ hci_dev_set_flag(hdev, HCI_CONFIG);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+ } else {
+ set_bit(HCI_RAW, &hdev->flags);
+ mgmt_index_added(hdev);
+ }
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static int set_public_address(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_public_address *cp = data;
+ bool changed;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (hdev_is_powered(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_REJECTED);
+
+ if (!bacmp(&cp->bdaddr, BDADDR_ANY))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (!hdev->set_bdaddr)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ hci_dev_lock(hdev);
+
+ changed = !!bacmp(&hdev->public_addr, &cp->bdaddr);
+ bacpy(&hdev->public_addr, &cp->bdaddr);
+
+ err = send_options_rsp(sk, MGMT_OP_SET_PUBLIC_ADDRESS, hdev);
+ if (err < 0)
+ goto unlock;
+
+ if (!changed)
+ goto unlock;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ err = new_options(hdev, sk);
+
+ if (is_configured(hdev)) {
+ mgmt_index_removed(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_UNCONFIGURED);
+
+ hci_dev_set_flag(hdev, HCI_CONFIG);
+ hci_dev_set_flag(hdev, HCI_AUTO_OFF);
+
+ queue_work(hdev->req_workqueue, &hdev->power_on);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data,
+ u8 data_len)
+{
+ eir[eir_len++] = sizeof(type) + data_len;
+ eir[eir_len++] = type;
+ memcpy(&eir[eir_len], data, data_len);
+ eir_len += data_len;
+
+ return eir_len;
+}
+
+static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ const struct mgmt_cp_read_local_oob_ext_data *mgmt_cp;
+ struct mgmt_rp_read_local_oob_ext_data *mgmt_rp;
+ u8 *h192, *r192, *h256, *r256;
+ struct mgmt_pending_cmd *cmd;
+ u16 eir_len;
+ int err;
+
+ BT_DBG("%s status %u", hdev->name, status);
+
+ cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev);
+ if (!cmd)
+ return;
+
+ mgmt_cp = cmd->param;
+
+ if (status) {
+ status = mgmt_status(status);
+ eir_len = 0;
+
+ h192 = NULL;
+ r192 = NULL;
+ h256 = NULL;
+ r256 = NULL;
+ } else if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) {
+ struct hci_rp_read_local_oob_data *rp;
+
+ if (skb->len != sizeof(*rp)) {
+ status = MGMT_STATUS_FAILED;
+ eir_len = 0;
+ } else {
+ status = MGMT_STATUS_SUCCESS;
+ rp = (void *)skb->data;
+
+ eir_len = 5 + 18 + 18;
+ h192 = rp->hash;
+ r192 = rp->rand;
+ h256 = NULL;
+ r256 = NULL;
+ }
+ } else {
+ struct hci_rp_read_local_oob_ext_data *rp;
+
+ if (skb->len != sizeof(*rp)) {
+ status = MGMT_STATUS_FAILED;
+ eir_len = 0;
+ } else {
+ status = MGMT_STATUS_SUCCESS;
+ rp = (void *)skb->data;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ eir_len = 5 + 18 + 18;
+ h192 = NULL;
+ r192 = NULL;
+ } else {
+ eir_len = 5 + 18 + 18 + 18 + 18;
+ h192 = rp->hash192;
+ r192 = rp->rand192;
+ }
+
+ h256 = rp->hash256;
+ r256 = rp->rand256;
+ }
+ }
+
+ mgmt_rp = kmalloc(sizeof(*mgmt_rp) + eir_len, GFP_KERNEL);
+ if (!mgmt_rp)
+ goto done;
+
+ if (status)
+ goto send_rsp;
+
+ eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+
+ if (h192 && r192) {
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_HASH_C192, h192, 16);
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_RAND_R192, r192, 16);
+ }
+
+ if (h256 && r256) {
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_HASH_C256, h256, 16);
+ eir_len = eir_append_data(mgmt_rp->eir, eir_len,
+ EIR_SSP_RAND_R256, r256, 16);
+ }
+
+send_rsp:
+ mgmt_rp->type = mgmt_cp->type;
+ mgmt_rp->eir_len = cpu_to_le16(eir_len);
+
+ err = mgmt_cmd_complete(cmd->sk, hdev->id,
+ MGMT_OP_READ_LOCAL_OOB_EXT_DATA, status,
+ mgmt_rp, sizeof(*mgmt_rp) + eir_len);
+ if (err < 0 || status)
+ goto done;
+
+ hci_sock_set_flag(cmd->sk, HCI_MGMT_OOB_DATA_EVENTS);
+
+ err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev,
+ mgmt_rp, sizeof(*mgmt_rp) + eir_len,
+ HCI_MGMT_OOB_DATA_EVENTS, cmd->sk);
+done:
+ kfree(mgmt_rp);
+ mgmt_pending_remove(cmd);
+}
+
+static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk,
+ struct mgmt_cp_read_local_oob_ext_data *cp)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ int err;
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
+ cp, sizeof(*cp));
+ if (!cmd)
+ return -ENOMEM;
+
+ hci_req_init(&req, hdev);
+
+ if (bredr_sc_enabled(hdev))
+ hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL);
+ else
+ hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL);
+
+ err = hci_req_run_skb(&req, read_local_oob_ext_data_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ return err;
+ }
+
+ return 0;
+}
+
+static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_read_local_oob_ext_data *cp = data;
+ struct mgmt_rp_read_local_oob_ext_data *rp;
+ size_t rp_len;
+ u16 eir_len;
+ u8 status, flags, role, addr[7], hash[16], rand[16];
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (hdev_is_powered(hdev)) {
+ switch (cp->type) {
+ case BIT(BDADDR_BREDR):
+ status = mgmt_bredr_support(hdev);
+ if (status)
+ eir_len = 0;
+ else
+ eir_len = 5;
+ break;
+ case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
+ status = mgmt_le_support(hdev);
+ if (status)
+ eir_len = 0;
+ else
+ eir_len = 9 + 3 + 18 + 18 + 3;
+ break;
+ default:
+ status = MGMT_STATUS_INVALID_PARAMS;
+ eir_len = 0;
+ break;
+ }
+ } else {
+ status = MGMT_STATUS_NOT_POWERED;
+ eir_len = 0;
+ }
+
+ rp_len = sizeof(*rp) + eir_len;
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp)
+ return -ENOMEM;
+
+ if (status)
+ goto complete;
+
+ hci_dev_lock(hdev);
+
+ eir_len = 0;
+ switch (cp->type) {
+ case BIT(BDADDR_BREDR):
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ err = read_local_ssp_oob_req(hdev, sk, cp);
+ hci_dev_unlock(hdev);
+ if (!err)
+ goto done;
+
+ status = MGMT_STATUS_FAILED;
+ goto complete;
+ } else {
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_CLASS_OF_DEV,
+ hdev->dev_class, 3);
+ }
+ break;
+ case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ smp_generate_oob(hdev, hash, rand) < 0) {
+ hci_dev_unlock(hdev);
+ status = MGMT_STATUS_FAILED;
+ goto complete;
+ }
+
+ /* This should return the active RPA, but since the RPA
+ * is only programmed on demand, it is really hard to fill
+ * this in at the moment. For now disallow retrieving
+ * local out-of-band data when privacy is in use.
+ *
+ * Returning the identity address will not help here since
+ * pairing happens before the identity resolving key is
+ * known and thus the connection establishment happens
+ * based on the RPA and not the identity address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ hci_dev_unlock(hdev);
+ status = MGMT_STATUS_REJECTED;
+ goto complete;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ memcpy(addr, &hdev->static_addr, 6);
+ addr[6] = 0x01;
+ } else {
+ memcpy(addr, &hdev->bdaddr, 6);
+ addr[6] = 0x00;
+ }
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_BDADDR,
+ addr, sizeof(addr));
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ role = 0x02;
+ else
+ role = 0x01;
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_ROLE,
+ &role, sizeof(role));
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) {
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_LE_SC_CONFIRM,
+ hash, sizeof(hash));
+
+ eir_len = eir_append_data(rp->eir, eir_len,
+ EIR_LE_SC_RANDOM,
+ rand, sizeof(rand));
+ }
+
+ flags = get_adv_discov_flags(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ eir_len = eir_append_data(rp->eir, eir_len, EIR_FLAGS,
+ &flags, sizeof(flags));
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+
+ hci_sock_set_flag(sk, HCI_MGMT_OOB_DATA_EVENTS);
+
+ status = MGMT_STATUS_SUCCESS;
+
+complete:
+ rp->type = cp->type;
+ rp->eir_len = cpu_to_le16(eir_len);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
+ status, rp, sizeof(*rp) + eir_len);
+ if (err < 0 || status)
+ goto done;
+
+ err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev,
+ rp, sizeof(*rp) + eir_len,
+ HCI_MGMT_OOB_DATA_EVENTS, sk);
+
+done:
+ kfree(rp);
+
+ return err;
+}
+
+static u32 get_supported_adv_flags(struct hci_dev *hdev)
+{
+ u32 flags = 0;
+
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+ flags |= MGMT_ADV_FLAG_DISCOV;
+ flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+ flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
+ flags |= MGMT_ADV_FLAG_TX_POWER;
+
+ return flags;
+}
+
+static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_rp_read_adv_features *rp;
+ size_t rp_len;
+ int err;
+ bool instance;
+ u32 supported_flags;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+
+ rp_len = sizeof(*rp);
+
+ /* Currently only one instance is supported, so just add 1 to the
+ * response length.
+ */
+ instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ if (instance)
+ rp_len++;
+
+ rp = kmalloc(rp_len, GFP_ATOMIC);
+ if (!rp) {
+ hci_dev_unlock(hdev);
+ return -ENOMEM;
+ }
+
+ supported_flags = get_supported_adv_flags(hdev);
+
+ rp->supported_flags = cpu_to_le32(supported_flags);
+ rp->max_adv_data_len = HCI_MAX_AD_LENGTH;
+ rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH;
+ rp->max_instances = 1;
+
+ /* Currently only one instance is supported, so simply return the
+ * current instance number.
+ */
+ if (instance) {
+ rp->num_instances = 1;
+ rp->instance[0] = 1;
+ } else {
+ rp->num_instances = 0;
+ }
+
+ hci_dev_unlock(hdev);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES,
+ MGMT_STATUS_SUCCESS, rp, rp_len);
+
+ kfree(rp);
+
+ return err;
+}
+
+static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
+ u8 len, bool is_adv_data)
+{
+ u8 max_len = HCI_MAX_AD_LENGTH;
+ int i, cur_len;
+ bool flags_managed = false;
+ bool tx_power_managed = false;
+ u32 flags_params = MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ if (is_adv_data && (adv_flags & flags_params)) {
+ flags_managed = true;
+ max_len -= 3;
+ }
+
+ if (is_adv_data && (adv_flags & MGMT_ADV_FLAG_TX_POWER)) {
+ tx_power_managed = true;
+ max_len -= 3;
+ }
+
+ if (len > max_len)
+ return false;
+
+ /* Make sure that the data is correctly formatted. */
+ for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
+ cur_len = data[i];
+
+ if (flags_managed && data[i + 1] == EIR_FLAGS)
+ return false;
+
+ if (tx_power_managed && data[i + 1] == EIR_TX_POWER)
+ return false;
+
+ /* If the current field length would exceed the total data
+ * length, then it's invalid.
+ */
+ if (i + cur_len >= len)
+ return false;
+ }
+
+ return true;
+}
+
+static void add_advertising_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_rp_add_advertising rp;
+
+ BT_DBG("status %d", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev);
+
+ if (status) {
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
+ advertising_removed(cmd ? cmd->sk : NULL, hdev, 1);
+ }
+
+ if (!cmd)
+ goto unlock;
+
+ rp.instance = 0x01;
+
+ if (status)
+ mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void adv_timeout_expired(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ adv_instance.timeout_exp.work);
+
+ hdev->adv_instance.timeout = 0;
+
+ hci_dev_lock(hdev);
+ clear_adv_instance(hdev);
+ hci_dev_unlock(hdev);
+}
+
+static int add_advertising(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_add_advertising *cp = data;
+ struct mgmt_rp_add_advertising rp;
+ u32 flags;
+ u32 supported_flags;
+ u8 status;
+ u16 timeout;
+ int err;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+
+ BT_DBG("%s", hdev->name);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ status);
+
+ flags = __le32_to_cpu(cp->flags);
+ timeout = __le16_to_cpu(cp->timeout);
+
+ /* The current implementation only supports adding one instance and only
+ * a subset of the specified flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ if (cp->instance != 0x01 || (flags & ~supported_flags))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (timeout && !hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_SET_LE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) ||
+ !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len,
+ cp->scan_rsp_len, false)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ INIT_DELAYED_WORK(&hdev->adv_instance.timeout_exp, adv_timeout_expired);
+
+ hdev->adv_instance.flags = flags;
+ hdev->adv_instance.adv_data_len = cp->adv_data_len;
+ hdev->adv_instance.scan_rsp_len = cp->scan_rsp_len;
+
+ if (cp->adv_data_len)
+ memcpy(hdev->adv_instance.adv_data, cp->data, cp->adv_data_len);
+
+ if (cp->scan_rsp_len)
+ memcpy(hdev->adv_instance.scan_rsp_data,
+ cp->data + cp->adv_data_len, cp->scan_rsp_len);
+
+ if (hdev->adv_instance.timeout)
+ cancel_delayed_work(&hdev->adv_instance.timeout_exp);
+
+ hdev->adv_instance.timeout = timeout;
+
+ if (timeout)
+ queue_delayed_work(hdev->workqueue,
+ &hdev->adv_instance.timeout_exp,
+ msecs_to_jiffies(timeout * 1000));
+
+ if (!hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ advertising_added(sk, hdev, 1);
+
+ /* If the HCI_ADVERTISING flag is set or the device isn't powered then
+ * we have no HCI communication to make. Simply return.
+ */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ rp.instance = 0x01;
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ /* We're good to go, update advertising data, parameters, and start
+ * advertising.
+ */
+ cmd = mgmt_pending_add(sk, MGMT_OP_ADD_ADVERTISING, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ update_adv_data(&req);
+ update_scan_rsp_data(&req);
+ enable_advertising(&req);
+
+ err = hci_req_run(&req, add_advertising_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void remove_advertising_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_rp_remove_advertising rp;
+
+ BT_DBG("status %d", status);
+
+ hci_dev_lock(hdev);
+
+ /* A failure status here only means that we failed to disable
+ * advertising. Otherwise, the advertising instance has been removed,
+ * so report success.
+ */
+ cmd = pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev);
+ if (!cmd)
+ goto unlock;
+
+ rp.instance = 1;
+
+ mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS,
+ &rp, sizeof(rp));
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_remove_advertising *cp = data;
+ struct mgmt_rp_remove_advertising rp;
+ int err;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+
+ BT_DBG("%s", hdev->name);
+
+ /* The current implementation only allows modifying instance no 1. A
+ * value of 0 indicates that all instances should be cleared.
+ */
+ if (cp->instance > 1)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
+ pending_find(MGMT_OP_SET_LE, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ if (hdev->adv_instance.timeout)
+ cancel_delayed_work(&hdev->adv_instance.timeout_exp);
+
+ memset(&hdev->adv_instance, 0, sizeof(hdev->adv_instance));
+
+ advertising_removed(sk, hdev, 1);
+
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
+
+ /* If the HCI_ADVERTISING flag is set or the device isn't powered then
+ * we have no HCI communication to make. Simply return.
+ */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ rp.instance = 1;
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+ disable_advertising(&req);
+
+ err = hci_req_run(&req, remove_advertising_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static const struct hci_mgmt_handler mgmt_handlers[] = {
+ { NULL }, /* 0x0000 (no command) */
+ { read_version, MGMT_READ_VERSION_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_commands, MGMT_READ_COMMANDS_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_index_list, MGMT_READ_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_controller_info, MGMT_READ_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_powered, MGMT_SETTING_SIZE },
+ { set_discoverable, MGMT_SET_DISCOVERABLE_SIZE },
+ { set_connectable, MGMT_SETTING_SIZE },
+ { set_fast_connectable, MGMT_SETTING_SIZE },
+ { set_bondable, MGMT_SETTING_SIZE },
+ { set_link_security, MGMT_SETTING_SIZE },
+ { set_ssp, MGMT_SETTING_SIZE },
+ { set_hs, MGMT_SETTING_SIZE },
+ { set_le, MGMT_SETTING_SIZE },
+ { set_dev_class, MGMT_SET_DEV_CLASS_SIZE },
+ { set_local_name, MGMT_SET_LOCAL_NAME_SIZE },
+ { add_uuid, MGMT_ADD_UUID_SIZE },
+ { remove_uuid, MGMT_REMOVE_UUID_SIZE },
+ { load_link_keys, MGMT_LOAD_LINK_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { load_long_term_keys, MGMT_LOAD_LONG_TERM_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { disconnect, MGMT_DISCONNECT_SIZE },
+ { get_connections, MGMT_GET_CONNECTIONS_SIZE },
+ { pin_code_reply, MGMT_PIN_CODE_REPLY_SIZE },
+ { pin_code_neg_reply, MGMT_PIN_CODE_NEG_REPLY_SIZE },
+ { set_io_capability, MGMT_SET_IO_CAPABILITY_SIZE },
+ { pair_device, MGMT_PAIR_DEVICE_SIZE },
+ { cancel_pair_device, MGMT_CANCEL_PAIR_DEVICE_SIZE },
+ { unpair_device, MGMT_UNPAIR_DEVICE_SIZE },
+ { user_confirm_reply, MGMT_USER_CONFIRM_REPLY_SIZE },
+ { user_confirm_neg_reply, MGMT_USER_CONFIRM_NEG_REPLY_SIZE },
+ { user_passkey_reply, MGMT_USER_PASSKEY_REPLY_SIZE },
+ { user_passkey_neg_reply, MGMT_USER_PASSKEY_NEG_REPLY_SIZE },
+ { read_local_oob_data, MGMT_READ_LOCAL_OOB_DATA_SIZE },
+ { add_remote_oob_data, MGMT_ADD_REMOTE_OOB_DATA_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_remote_oob_data, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE },
+ { start_discovery, MGMT_START_DISCOVERY_SIZE },
+ { stop_discovery, MGMT_STOP_DISCOVERY_SIZE },
+ { confirm_name, MGMT_CONFIRM_NAME_SIZE },
+ { block_device, MGMT_BLOCK_DEVICE_SIZE },
+ { unblock_device, MGMT_UNBLOCK_DEVICE_SIZE },
+ { set_device_id, MGMT_SET_DEVICE_ID_SIZE },
+ { set_advertising, MGMT_SETTING_SIZE },
+ { set_bredr, MGMT_SETTING_SIZE },
+ { set_static_address, MGMT_SET_STATIC_ADDRESS_SIZE },
+ { set_scan_params, MGMT_SET_SCAN_PARAMS_SIZE },
+ { set_secure_conn, MGMT_SETTING_SIZE },
+ { set_debug_keys, MGMT_SETTING_SIZE },
+ { set_privacy, MGMT_SET_PRIVACY_SIZE },
+ { load_irks, MGMT_LOAD_IRKS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { get_conn_info, MGMT_GET_CONN_INFO_SIZE },
+ { get_clock_info, MGMT_GET_CLOCK_INFO_SIZE },
+ { add_device, MGMT_ADD_DEVICE_SIZE },
+ { remove_device, MGMT_REMOVE_DEVICE_SIZE },
+ { load_conn_param, MGMT_LOAD_CONN_PARAM_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_unconf_index_list, MGMT_READ_UNCONF_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_config_info, MGMT_READ_CONFIG_INFO_SIZE,
+ HCI_MGMT_UNCONFIGURED |
+ HCI_MGMT_UNTRUSTED },
+ { set_external_config, MGMT_SET_EXTERNAL_CONFIG_SIZE,
+ HCI_MGMT_UNCONFIGURED },
+ { set_public_address, MGMT_SET_PUBLIC_ADDRESS_SIZE,
+ HCI_MGMT_UNCONFIGURED },
+ { start_service_discovery, MGMT_START_SERVICE_DISCOVERY_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_local_oob_ext_data, MGMT_READ_LOCAL_OOB_EXT_DATA_SIZE },
+ { read_ext_index_list, MGMT_READ_EXT_INDEX_LIST_SIZE,
+ HCI_MGMT_NO_HDEV |
+ HCI_MGMT_UNTRUSTED },
+ { read_adv_features, MGMT_READ_ADV_FEATURES_SIZE },
+ { add_advertising, MGMT_ADD_ADVERTISING_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
+};
+
+void mgmt_index_added(struct hci_dev *hdev)
+{
+ struct mgmt_ev_ext_index ev;
+
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ return;
+
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev,
+ NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
+ }
+ break;
+ case HCI_AMP:
+ ev.type = 0x02;
+ break;
+ default:
+ return;
+ }
+
+ ev.bus = hdev->bus;
+
+ mgmt_index_event(MGMT_EV_EXT_INDEX_ADDED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_EXT_INDEX_EVENTS);
+}
+
+void mgmt_index_removed(struct hci_dev *hdev)
+{
+ struct mgmt_ev_ext_index ev;
+ u8 status = MGMT_STATUS_INVALID_INDEX;
+
+ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ return;
+
+ switch (hdev->dev_type) {
+ case HCI_BREDR:
+ mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev,
+ NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
+ }
+ break;
+ case HCI_AMP:
+ ev.type = 0x02;
+ break;
+ default:
+ return;
+ }
+
+ ev.bus = hdev->bus;
+
+ mgmt_index_event(MGMT_EV_EXT_INDEX_REMOVED, hdev, &ev, sizeof(ev),
+ HCI_MGMT_EXT_INDEX_EVENTS);
+}
+
+/* This function requires the caller holds hdev->lock */
+static void restart_le_actions(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_conn_params *p;
+
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ /* Needed for AUTO_OFF case where might not "really"
+ * have been powered off.
+ */
+ list_del_init(&p->action);
+
+ switch (p->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ list_add(&p->action, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ list_add(&p->action, &hdev->pend_le_reports);
+ break;
+ default:
+ break;
+ }
+ }
+
+ __hci_update_background_scan(req);
+}
+
+static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ struct cmd_lookup match = { NULL, hdev };
+
+ BT_DBG("status 0x%02x", status);
+
+ if (!status) {
+ /* Register the available SMP channels (BR/EDR and LE) only
+ * when successfully powering on the controller. This late
+ * registration is required so that LE SMP can clearly
+ * decide if the public address or static address is used.
+ */
+ smp_register(hdev);
+ }
+
+ hci_dev_lock(hdev);
+
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+
+ new_settings(hdev, match.sk);
+
+ hci_dev_unlock(hdev);
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+static int powered_update_hci(struct hci_dev *hdev)
+{
+ struct hci_request req;
+ u8 link_sec;
+
+ hci_req_init(&req, hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
+ !lmp_host_ssp_capable(hdev)) {
+ u8 mode = 0x01;
+
+ hci_req_add(&req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
+
+ if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) {
+ u8 support = 0x01;
+
+ hci_req_add(&req, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(support), &support);
+ }
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ lmp_bredr_capable(hdev)) {
+ struct hci_cp_write_le_host_supported cp;
+
+ cp.le = 0x01;
+ cp.simul = 0x00;
+
+ /* Check first if we already have the right
+ * host state (host features set)
+ */
+ if (cp.le != lmp_host_le_capable(hdev) ||
+ cp.simul != lmp_host_le_br_capable(hdev))
+ hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ sizeof(cp), &cp);
+ }
+
+ if (lmp_le_capable(hdev)) {
+ /* Make sure the controller has a good default for
+ * advertising data. This also applies to the case
+ * where BR/EDR was toggled during the AUTO_OFF phase.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ update_adv_data(&req);
+ update_scan_rsp_data(&req);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ enable_advertising(&req);
+
+ restart_le_actions(&req);
+ }
+
+ link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
+ if (link_sec != test_bit(HCI_AUTH, &hdev->flags))
+ hci_req_add(&req, HCI_OP_WRITE_AUTH_ENABLE,
+ sizeof(link_sec), &link_sec);
+
+ if (lmp_bredr_capable(hdev)) {
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ write_fast_connectable(&req, true);
+ else
+ write_fast_connectable(&req, false);
+ __hci_update_page_scan(&req);
+ update_class(&req);
+ update_name(&req);
+ update_eir(&req);
+ }
+
+ return hci_req_run(&req, powered_complete);
+}
+
+int mgmt_powered(struct hci_dev *hdev, u8 powered)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ u8 status, zero_cod[] = { 0, 0, 0 };
+ int err;
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return 0;
+
+ if (powered) {
+ if (powered_update_hci(hdev) == 0)
+ return 0;
+
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp,
+ &match);
+ goto new_settings;
+ }
+
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+
+ /* If the power off is because of hdev unregistration let
+ * use the appropriate INVALID_INDEX status. Otherwise use
+ * NOT_POWERED. We cover both scenarios here since later in
+ * mgmt_index_removed() any hci_conn callbacks will have already
+ * been triggered, potentially causing misleading DISCONNECTED
+ * status responses.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ status = MGMT_STATUS_INVALID_INDEX;
+ else
+ status = MGMT_STATUS_NOT_POWERED;
+
+ mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
+
+ if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0)
+ mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ zero_cod, sizeof(zero_cod), NULL);
+
+new_settings:
+ err = new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ return err;
+}
+
+void mgmt_set_powered_failed(struct hci_dev *hdev, int err)
+{
+ struct mgmt_pending_cmd *cmd;
+ u8 status;
+
+ cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
+ if (!cmd)
+ return;
+
+ if (err == -ERFKILL)
+ status = MGMT_STATUS_RFKILLED;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED, status);
+
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_discoverable_timeout(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ hci_dev_lock(hdev);
+
+ /* When discoverable timeout triggers, then just make sure
+ * the limited discoverable flag is cleared. Even in the case
+ * of a timeout triggered from general discoverable, it is
+ * safe to unconditionally clear the flag.
+ */
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+
+ hci_req_init(&req, hdev);
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ u8 scan = SCAN_PAGE;
+ hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE,
+ sizeof(scan), &scan);
+ }
+ update_class(&req);
+
+ /* Advertising instances don't use the global discoverable setting, so
+ * only update AD if advertising was enabled using Set Advertising.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ update_adv_data(&req);
+
+ hci_req_run(&req, NULL);
+
+ hdev->discov_timeout = 0;
+
+ new_settings(hdev, NULL);
+
+ hci_dev_unlock(hdev);
+}
+
+void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key,
+ bool persistent)
+{
+ struct mgmt_ev_new_link_key ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.store_hint = persistent;
+ bacpy(&ev.key.addr.bdaddr, &key->bdaddr);
+ ev.key.addr.type = BDADDR_BREDR;
+ ev.key.type = key->type;
+ memcpy(ev.key.val, key->val, HCI_LINK_KEY_SIZE);
+ ev.key.pin_len = key->pin_len;
+
+ mgmt_event(MGMT_EV_NEW_LINK_KEY, hdev, &ev, sizeof(ev), NULL);
+}
+
+static u8 mgmt_ltk_type(struct smp_ltk *ltk)
+{
+ switch (ltk->type) {
+ case SMP_LTK:
+ case SMP_LTK_SLAVE:
+ if (ltk->authenticated)
+ return MGMT_LTK_AUTHENTICATED;
+ return MGMT_LTK_UNAUTHENTICATED;
+ case SMP_LTK_P256:
+ if (ltk->authenticated)
+ return MGMT_LTK_P256_AUTH;
+ return MGMT_LTK_P256_UNAUTH;
+ case SMP_LTK_P256_DEBUG:
+ return MGMT_LTK_P256_DEBUG;
+ }
+
+ return MGMT_LTK_UNAUTHENTICATED;
+}
+
+void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent)
+{
+ struct mgmt_ev_new_long_term_key ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ /* Devices using resolvable or non-resolvable random addresses
+ * without providing an indentity resolving key don't require
+ * to store long term keys. Their addresses will change the
+ * next time around.
+ *
+ * Only when a remote device provides an identity address
+ * make sure the long term key is stored. If the remote
+ * identity is known, the long term keys are internally
+ * mapped to the identity address. So allow static random
+ * and public addresses here.
+ */
+ if (key->bdaddr_type == ADDR_LE_DEV_RANDOM &&
+ (key->bdaddr.b[5] & 0xc0) != 0xc0)
+ ev.store_hint = 0x00;
+ else
+ ev.store_hint = persistent;
+
+ bacpy(&ev.key.addr.bdaddr, &key->bdaddr);
+ ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type);
+ ev.key.type = mgmt_ltk_type(key);
+ ev.key.enc_size = key->enc_size;
+ ev.key.ediv = key->ediv;
+ ev.key.rand = key->rand;
+
+ if (key->type == SMP_LTK)
+ ev.key.master = 1;
+
+ memcpy(ev.key.val, key->val, sizeof(key->val));
+
+ mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk)
+{
+ struct mgmt_ev_new_irk ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ /* For identity resolving keys from devices that are already
+ * using a public address or static random address, do not
+ * ask for storing this key. The identity resolving key really
+ * is only mandatory for devices using resovlable random
+ * addresses.
+ *
+ * Storing all identity resolving keys has the downside that
+ * they will be also loaded on next boot of they system. More
+ * identity resolving keys, means more time during scanning is
+ * needed to actually resolve these addresses.
+ */
+ if (bacmp(&irk->rpa, BDADDR_ANY))
+ ev.store_hint = 0x01;
+ else
+ ev.store_hint = 0x00;
+
+ bacpy(&ev.rpa, &irk->rpa);
+ bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr);
+ ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type);
+ memcpy(ev.irk.val, irk->val, sizeof(irk->val));
+
+ mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk,
+ bool persistent)
+{
+ struct mgmt_ev_new_csrk ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ /* Devices using resolvable or non-resolvable random addresses
+ * without providing an indentity resolving key don't require
+ * to store signature resolving keys. Their addresses will change
+ * the next time around.
+ *
+ * Only when a remote device provides an identity address
+ * make sure the signature resolving key is stored. So allow
+ * static random and public addresses here.
+ */
+ if (csrk->bdaddr_type == ADDR_LE_DEV_RANDOM &&
+ (csrk->bdaddr.b[5] & 0xc0) != 0xc0)
+ ev.store_hint = 0x00;
+ else
+ ev.store_hint = persistent;
+
+ bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr);
+ ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type);
+ ev.key.type = csrk->type;
+ memcpy(ev.key.val, csrk->val, sizeof(csrk->val));
+
+ mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 bdaddr_type, u8 store_hint, u16 min_interval,
+ u16 max_interval, u16 latency, u16 timeout)
+{
+ struct mgmt_ev_new_conn_param ev;
+
+ if (!hci_is_identity_address(bdaddr, bdaddr_type))
+ return;
+
+ memset(&ev, 0, sizeof(ev));
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(LE_LINK, bdaddr_type);
+ ev.store_hint = store_hint;
+ ev.min_interval = cpu_to_le16(min_interval);
+ ev.max_interval = cpu_to_le16(max_interval);
+ ev.latency = cpu_to_le16(latency);
+ ev.timeout = cpu_to_le16(timeout);
+
+ mgmt_event(MGMT_EV_NEW_CONN_PARAM, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn,
+ u32 flags, u8 *name, u8 name_len)
+{
+ char buf[512];
+ struct mgmt_ev_device_connected *ev = (void *) buf;
+ u16 eir_len = 0;
+
+ bacpy(&ev->addr.bdaddr, &conn->dst);
+ ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+
+ ev->flags = __cpu_to_le32(flags);
+
+ /* We must ensure that the EIR Data fields are ordered and
+ * unique. Keep it simple for now and avoid the problem by not
+ * adding any BR/EDR data to the LE adv.
+ */
+ if (conn->le_adv_data_len > 0) {
+ memcpy(&ev->eir[eir_len],
+ conn->le_adv_data, conn->le_adv_data_len);
+ eir_len = conn->le_adv_data_len;
+ } else {
+ if (name_len > 0)
+ eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE,
+ name, name_len);
+
+ if (memcmp(conn->dev_class, "\0\0\0", 3) != 0)
+ eir_len = eir_append_data(ev->eir, eir_len,
+ EIR_CLASS_OF_DEV,
+ conn->dev_class, 3);
+ }
+
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ mgmt_event(MGMT_EV_DEVICE_CONNECTED, hdev, buf,
+ sizeof(*ev) + eir_len, NULL);
+}
+
+static void disconnect_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct sock **sk = data;
+
+ cmd->cmd_complete(cmd, 0);
+
+ *sk = cmd->sk;
+ sock_hold(*sk);
+
+ mgmt_pending_remove(cmd);
+}
+
+static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct hci_dev *hdev = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
+
+ cmd->cmd_complete(cmd, 0);
+ mgmt_pending_remove(cmd);
+}
+
+bool mgmt_powering_down(struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+ struct mgmt_mode *cp;
+
+ cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
+ if (!cmd)
+ return false;
+
+ cp = cmd->param;
+ if (!cp->val)
+ return true;
+
+ return false;
+}
+
+void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 reason,
+ bool mgmt_connected)
+{
+ struct mgmt_ev_device_disconnected ev;
+ struct sock *sk = NULL;
+
+ /* The connection is still in hci_conn_hash so test for 1
+ * instead of 0 to know if this is the last one.
+ */
+ if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
+ cancel_delayed_work(&hdev->power_off);
+ queue_work(hdev->req_workqueue, &hdev->power_off.work);
+ }
+
+ if (!mgmt_connected)
+ return;
+
+ if (link_type != ACL_LINK && link_type != LE_LINK)
+ return;
+
+ mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.reason = reason;
+
+ mgmt_event(MGMT_EV_DEVICE_DISCONNECTED, hdev, &ev, sizeof(ev), sk);
+
+ if (sk)
+ sock_put(sk);
+
+ mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp,
+ hdev);
+}
+
+void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ u8 bdaddr_type = link_to_bdaddr(link_type, addr_type);
+ struct mgmt_cp_disconnect *cp;
+ struct mgmt_pending_cmd *cmd;
+
+ mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp,
+ hdev);
+
+ cmd = pending_find(MGMT_OP_DISCONNECT, hdev);
+ if (!cmd)
+ return;
+
+ cp = cmd->param;
+
+ if (bacmp(bdaddr, &cp->addr.bdaddr))
+ return;
+
+ if (cp->addr.type != bdaddr_type)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
+ u8 addr_type, u8 status)
+{
+ struct mgmt_ev_connect_failed ev;
+
+ /* The connection is still in hci_conn_hash so test for 1
+ * instead of 0 to know if this is the last one.
+ */
+ if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
+ cancel_delayed_work(&hdev->power_off);
+ queue_work(hdev->req_workqueue, &hdev->power_off.work);
+ }
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.status = mgmt_status(status);
+
+ mgmt_event(MGMT_EV_CONNECT_FAILED, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure)
+{
+ struct mgmt_ev_pin_code_request ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = BDADDR_BREDR;
+ ev.secure = secure;
+
+ mgmt_event(MGMT_EV_PIN_CODE_REQUEST, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(MGMT_OP_PIN_CODE_REPLY, hdev);
+ if (!cmd)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 status)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, hdev);
+ if (!cmd)
+ return;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+}
+
+int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u32 value,
+ u8 confirm_hint)
+{
+ struct mgmt_ev_user_confirm_request ev;
+
+ BT_DBG("%s", hdev->name);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.confirm_hint = confirm_hint;
+ ev.value = cpu_to_le32(value);
+
+ return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type)
+{
+ struct mgmt_ev_user_passkey_request ev;
+
+ BT_DBG("%s", hdev->name);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+
+ return mgmt_event(MGMT_EV_USER_PASSKEY_REQUEST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+static int user_pairing_resp_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status,
+ u8 opcode)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = pending_find(opcode, hdev);
+ if (!cmd)
+ return -ENOENT;
+
+ cmd->cmd_complete(cmd, mgmt_status(status));
+ mgmt_pending_remove(cmd);
+
+ return 0;
+}
+
+int mgmt_user_confirm_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status, MGMT_OP_USER_CONFIRM_REPLY);
+}
+
+int mgmt_user_confirm_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status,
+ MGMT_OP_USER_CONFIRM_NEG_REPLY);
+}
+
+int mgmt_user_passkey_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status, MGMT_OP_USER_PASSKEY_REPLY);
+}
+
+int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u8 status)
+{
+ return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type,
+ status,
+ MGMT_OP_USER_PASSKEY_NEG_REPLY);
+}
+
+int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 link_type, u8 addr_type, u32 passkey,
+ u8 entered)
+{
+ struct mgmt_ev_passkey_notify ev;
+
+ BT_DBG("%s", hdev->name);
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ ev.passkey = __cpu_to_le32(passkey);
+ ev.entered = entered;
+
+ return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status)
+{
+ struct mgmt_ev_auth_failed ev;
+ struct mgmt_pending_cmd *cmd;
+ u8 status = mgmt_status(hci_status);
+
+ bacpy(&ev.addr.bdaddr, &conn->dst);
+ ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+ ev.status = status;
+
+ cmd = find_pairing(conn);
+
+ mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev),
+ cmd ? cmd->sk : NULL);
+
+ if (cmd) {
+ cmd->cmd_complete(cmd, status);
+ mgmt_pending_remove(cmd);
+ }
+}
+
+void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ bool changed;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev,
+ cmd_status_rsp, &mgmt_err);
+ return;
+ }
+
+ if (test_bit(HCI_AUTH, &hdev->flags))
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_LINK_SECURITY);
+ else
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY);
+
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp,
+ &match);
+
+ if (changed)
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+static void clear_eir(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_eir cp;
+
+ if (!lmp_ext_inq_capable(hdev))
+ return;
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+
+ memset(&cp, 0, sizeof(cp));
+
+ hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
+}
+
+void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ struct hci_request req;
+ bool changed = false;
+
+ if (status) {
+ u8 mgmt_err = mgmt_status(status);
+
+ if (enable && hci_dev_test_and_clear_flag(hdev,
+ HCI_SSP_ENABLED)) {
+ hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
+ new_settings(hdev, NULL);
+ }
+
+ mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp,
+ &mgmt_err);
+ return;
+ }
+
+ if (enable) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
+ if (!changed)
+ changed = hci_dev_test_and_clear_flag(hdev,
+ HCI_HS_ENABLED);
+ else
+ hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
+ }
+
+ mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match);
+
+ if (changed)
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ hci_req_init(&req, hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS))
+ hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(enable), &enable);
+ update_eir(&req);
+ } else {
+ clear_eir(&req);
+ }
+
+ hci_req_run(&req, NULL);
+}
+
+static void sk_lookup(struct mgmt_pending_cmd *cmd, void *data)
+{
+ struct cmd_lookup *match = data;
+
+ if (match->sk == NULL) {
+ match->sk = cmd->sk;
+ sock_hold(match->sk);
+ }
+}
+
+void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
+ u8 status)
+{
+ struct cmd_lookup match = { NULL, hdev, mgmt_status(status) };
+
+ mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match);
+ mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
+ mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
+
+ if (!status)
+ mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
+ dev_class, 3, NULL);
+
+ if (match.sk)
+ sock_put(match.sk);
+}
+
+void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
+{
+ struct mgmt_cp_set_local_name ev;
+ struct mgmt_pending_cmd *cmd;
+
+ if (status)
+ return;
+
+ memset(&ev, 0, sizeof(ev));
+ memcpy(ev.name, name, HCI_MAX_NAME_LENGTH);
+ memcpy(ev.short_name, hdev->short_name, HCI_MAX_SHORT_NAME_LENGTH);
+
+ cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev);
+ if (!cmd) {
+ memcpy(hdev->dev_name, name, sizeof(hdev->dev_name));
+
+ /* If this is a HCI command related to powering on the
+ * HCI dev don't send any mgmt signals.
+ */
+ if (pending_find(MGMT_OP_SET_POWERED, hdev))
+ return;
+ }
+
+ mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev),
+ cmd ? cmd->sk : NULL);
+}
+
+static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16])
+{
+ int i;
+
+ for (i = 0; i < uuid_count; i++) {
+ if (!memcmp(uuid, uuids[i], 16))
+ return true;
+ }
+
+ return false;
+}
+
+static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16])
+{
+ u16 parsed = 0;
+
+ while (parsed < eir_len) {
+ u8 field_len = eir[0];
+ u8 uuid[16];
+ int i;
+
+ if (field_len == 0)
+ break;
+
+ if (eir_len - parsed < field_len + 1)
+ break;
+
+ switch (eir[1]) {
+ case EIR_UUID16_ALL:
+ case EIR_UUID16_SOME:
+ for (i = 0; i + 3 <= field_len; i += 2) {
+ memcpy(uuid, bluetooth_base_uuid, 16);
+ uuid[13] = eir[i + 3];
+ uuid[12] = eir[i + 2];
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ case EIR_UUID32_ALL:
+ case EIR_UUID32_SOME:
+ for (i = 0; i + 5 <= field_len; i += 4) {
+ memcpy(uuid, bluetooth_base_uuid, 16);
+ uuid[15] = eir[i + 5];
+ uuid[14] = eir[i + 4];
+ uuid[13] = eir[i + 3];
+ uuid[12] = eir[i + 2];
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ case EIR_UUID128_ALL:
+ case EIR_UUID128_SOME:
+ for (i = 0; i + 17 <= field_len; i += 16) {
+ memcpy(uuid, eir + i + 2, 16);
+ if (has_uuid(uuid, uuid_count, uuids))
+ return true;
+ }
+ break;
+ }
+
+ parsed += field_len + 1;
+ eir += field_len + 1;
+ }
+
+ return false;
+}
+
+static void restart_le_scan(struct hci_dev *hdev)
+{
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return;
+
+ if (time_after(jiffies + DISCOV_LE_RESTART_DELAY,
+ hdev->discovery.scan_start +
+ hdev->discovery.scan_duration))
+ return;
+
+ queue_delayed_work(hdev->workqueue, &hdev->le_scan_restart,
+ DISCOV_LE_RESTART_DELAY);
+}
+
+static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
+ u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
+{
+ /* If a RSSI threshold has been specified, and
+ * HCI_QUIRK_STRICT_DUPLICATE_FILTER is not set, then all results with
+ * a RSSI smaller than the RSSI threshold will be dropped. If the quirk
+ * is set, let it through for further processing, as we might need to
+ * restart the scan.
+ *
+ * For BR/EDR devices (pre 1.2) providing no RSSI during inquiry,
+ * the results are also dropped.
+ */
+ if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
+ (rssi == HCI_RSSI_INVALID ||
+ (rssi < hdev->discovery.rssi &&
+ !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks))))
+ return false;
+
+ if (hdev->discovery.uuid_count != 0) {
+ /* If a list of UUIDs is provided in filter, results with no
+ * matching UUID should be dropped.
+ */
+ if (!eir_has_uuids(eir, eir_len, hdev->discovery.uuid_count,
+ hdev->discovery.uuids) &&
+ !eir_has_uuids(scan_rsp, scan_rsp_len,
+ hdev->discovery.uuid_count,
+ hdev->discovery.uuids))
+ return false;
+ }
+
+ /* If duplicate filtering does not report RSSI changes, then restart
+ * scanning to ensure updated result with updated RSSI values.
+ */
+ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) {
+ restart_le_scan(hdev);
+
+ /* Validate RSSI value against the RSSI threshold once more. */
+ if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
+ rssi < hdev->discovery.rssi)
+ return false;
+ }
+
+ return true;
+}
+
+void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
+ u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
+ u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
+{
+ char buf[512];
+ struct mgmt_ev_device_found *ev = (void *)buf;
+ size_t ev_size;
+
+ /* Don't send events for a non-kernel initiated discovery. With
+ * LE one exception is if we have pend_le_reports > 0 in which
+ * case we're doing passive scanning and want these events.
+ */
+ if (!hci_discovery_active(hdev)) {
+ if (link_type == ACL_LINK)
+ return;
+ if (link_type == LE_LINK && list_empty(&hdev->pend_le_reports))
+ return;
+ }
+
+ if (hdev->discovery.result_filtering) {
+ /* We are using service discovery */
+ if (!is_filter_match(hdev, rssi, eir, eir_len, scan_rsp,
+ scan_rsp_len))
+ return;
+ }
+
+ /* Make sure that the buffer is big enough. The 5 extra bytes
+ * are for the potential CoD field.
+ */
+ if (sizeof(*ev) + eir_len + scan_rsp_len + 5 > sizeof(buf))
+ return;
+
+ memset(buf, 0, sizeof(buf));
+
+ /* In case of device discovery with BR/EDR devices (pre 1.2), the
+ * RSSI value was reported as 0 when not available. This behavior
+ * is kept when using device discovery. This is required for full
+ * backwards compatibility with the API.
+ *
+ * However when using service discovery, the value 127 will be
+ * returned when the RSSI is not available.
+ */
+ if (rssi == HCI_RSSI_INVALID && !hdev->discovery.report_invalid_rssi &&
+ link_type == ACL_LINK)
+ rssi = 0;
+
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(link_type, addr_type);
+ ev->rssi = rssi;
+ ev->flags = cpu_to_le32(flags);
+
+ if (eir_len > 0)
+ /* Copy EIR or advertising data into event */
+ memcpy(ev->eir, eir, eir_len);
+
+ if (dev_class && !eir_has_data_type(ev->eir, eir_len, EIR_CLASS_OF_DEV))
+ eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV,
+ dev_class, 3);
+
+ if (scan_rsp_len > 0)
+ /* Append scan response data to event */
+ memcpy(ev->eir + eir_len, scan_rsp, scan_rsp_len);
+
+ ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
+ ev_size = sizeof(*ev) + eir_len + scan_rsp_len;
+
+ mgmt_event(MGMT_EV_DEVICE_FOUND, hdev, ev, ev_size, NULL);
+}
+
+void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
+ u8 addr_type, s8 rssi, u8 *name, u8 name_len)
+{
+ struct mgmt_ev_device_found *ev;
+ char buf[sizeof(*ev) + HCI_MAX_NAME_LENGTH + 2];
+ u16 eir_len;
+
+ ev = (struct mgmt_ev_device_found *) buf;
+
+ memset(buf, 0, sizeof(buf));
+
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(link_type, addr_type);
+ ev->rssi = rssi;
+
+ eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, name,
+ name_len);
+
+ ev->eir_len = cpu_to_le16(eir_len);
+
+ mgmt_event(MGMT_EV_DEVICE_FOUND, hdev, ev, sizeof(*ev) + eir_len, NULL);
+}
+
+void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
+{
+ struct mgmt_ev_discovering ev;
+
+ BT_DBG("%s discovering %u", hdev->name, discovering);
+
+ memset(&ev, 0, sizeof(ev));
+ ev.type = hdev->discovery.type;
+ ev.discovering = discovering;
+
+ mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL);
+}
+
+static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("%s status %u", hdev->name, status);
+}
+
+void mgmt_reenable_advertising(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
+ return;
+
+ hci_req_init(&req, hdev);
+ enable_advertising(&req);
+ hci_req_run(&req, adv_enable_complete);
+}
+
+static struct hci_mgmt_chan chan = {
+ .channel = HCI_CHANNEL_CONTROL,
+ .handler_count = ARRAY_SIZE(mgmt_handlers),
+ .handlers = mgmt_handlers,
+ .hdev_init = mgmt_init_hdev,
+};
+
+int mgmt_init(void)
+{
+ return hci_mgmt_chan_register(&chan);
+}
+
+void mgmt_exit(void)
+{
+ hci_mgmt_chan_unregister(&chan);
+}
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
new file mode 100644
index 000000000..8c30c7eb8
--- /dev/null
+++ b/net/bluetooth/mgmt_util.c
@@ -0,0 +1,210 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2015 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+
+int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
+ void *data, u16 data_len, int flag, struct sock *skip_sk)
+{
+ struct sk_buff *skb;
+ struct mgmt_hdr *hdr;
+
+ skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = (void *) skb_put(skb, sizeof(*hdr));
+ hdr->opcode = cpu_to_le16(event);
+ if (hdev)
+ hdr->index = cpu_to_le16(hdev->id);
+ else
+ hdr->index = cpu_to_le16(MGMT_INDEX_NONE);
+ hdr->len = cpu_to_le16(data_len);
+
+ if (data)
+ memcpy(skb_put(skb, data_len), data, data_len);
+
+ /* Time stamp */
+ __net_timestamp(skb);
+
+ hci_send_to_channel(channel, skb, flag, skip_sk);
+ kfree_skb(skb);
+
+ return 0;
+}
+
+int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
+{
+ struct sk_buff *skb;
+ struct mgmt_hdr *hdr;
+ struct mgmt_ev_cmd_status *ev;
+ int err;
+
+ BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status);
+
+ skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = (void *) skb_put(skb, sizeof(*hdr));
+
+ hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(sizeof(*ev));
+
+ ev = (void *) skb_put(skb, sizeof(*ev));
+ ev->status = status;
+ ev->opcode = cpu_to_le16(cmd);
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err < 0)
+ kfree_skb(skb);
+
+ return err;
+}
+
+int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
+ void *rp, size_t rp_len)
+{
+ struct sk_buff *skb;
+ struct mgmt_hdr *hdr;
+ struct mgmt_ev_cmd_complete *ev;
+ int err;
+
+ BT_DBG("sock %p", sk);
+
+ skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = (void *) skb_put(skb, sizeof(*hdr));
+
+ hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE);
+ hdr->index = cpu_to_le16(index);
+ hdr->len = cpu_to_le16(sizeof(*ev) + rp_len);
+
+ ev = (void *) skb_put(skb, sizeof(*ev) + rp_len);
+ ev->opcode = cpu_to_le16(cmd);
+ ev->status = status;
+
+ if (rp)
+ memcpy(ev->data, rp, rp_len);
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err < 0)
+ kfree_skb(skb);
+
+ return err;
+}
+
+struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
+ struct hci_dev *hdev)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ if (hci_sock_get_channel(cmd->sk) != channel)
+ continue;
+ if (cmd->opcode == opcode)
+ return cmd;
+ }
+
+ return NULL;
+}
+
+struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
+ u16 opcode,
+ struct hci_dev *hdev,
+ const void *data)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ if (cmd->user_data != data)
+ continue;
+ if (cmd->opcode == opcode)
+ return cmd;
+ }
+
+ return NULL;
+}
+
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+ void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
+ void *data)
+{
+ struct mgmt_pending_cmd *cmd, *tmp;
+
+ list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
+ if (opcode > 0 && cmd->opcode != opcode)
+ continue;
+
+ cb(cmd, data);
+ }
+}
+
+struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return NULL;
+
+ cmd->opcode = opcode;
+ cmd->index = hdev->id;
+
+ cmd->param = kmemdup(data, len, GFP_KERNEL);
+ if (!cmd->param) {
+ kfree(cmd);
+ return NULL;
+ }
+
+ cmd->param_len = len;
+
+ cmd->sk = sk;
+ sock_hold(sk);
+
+ list_add(&cmd->list, &hdev->mgmt_pending);
+
+ return cmd;
+}
+
+void mgmt_pending_free(struct mgmt_pending_cmd *cmd)
+{
+ sock_put(cmd->sk);
+ kfree(cmd->param);
+ kfree(cmd);
+}
+
+void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
+{
+ list_del(&cmd->list);
+ mgmt_pending_free(cmd);
+}
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
new file mode 100644
index 000000000..6559f1892
--- /dev/null
+++ b/net/bluetooth/mgmt_util.h
@@ -0,0 +1,53 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2015 Intel Coropration
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+struct mgmt_pending_cmd {
+ struct list_head list;
+ u16 opcode;
+ int index;
+ void *param;
+ size_t param_len;
+ struct sock *sk;
+ void *user_data;
+ int (*cmd_complete)(struct mgmt_pending_cmd *cmd, u8 status);
+};
+
+int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
+ void *data, u16 data_len, int flag, struct sock *skip_sk);
+int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status);
+int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
+ void *rp, size_t rp_len);
+
+struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
+ struct hci_dev *hdev);
+struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
+ u16 opcode,
+ struct hci_dev *hdev,
+ const void *data);
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+ void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
+ void *data);
+struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len);
+void mgmt_pending_free(struct mgmt_pending_cmd *cmd);
+void mgmt_pending_remove(struct mgmt_pending_cmd *cmd);
diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig
new file mode 100644
index 000000000..335df7515
--- /dev/null
+++ b/net/bluetooth/rfcomm/Kconfig
@@ -0,0 +1,18 @@
+config BT_RFCOMM
+ tristate "RFCOMM protocol support"
+ depends on BT_BREDR
+ help
+ RFCOMM provides connection oriented stream transport. RFCOMM
+ support is required for Dialup Networking, OBEX and other Bluetooth
+ applications.
+
+ Say Y here to compile RFCOMM support into the kernel or say M to
+ compile it as module (rfcomm).
+
+config BT_RFCOMM_TTY
+ bool "RFCOMM TTY support"
+ depends on BT_RFCOMM
+ depends on TTY
+ help
+ This option enables TTY emulation support for RFCOMM channels.
+
diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile
new file mode 100644
index 000000000..fe07988a3
--- /dev/null
+++ b/net/bluetooth/rfcomm/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux Bluetooth RFCOMM layer.
+#
+
+obj-$(CONFIG_BT_RFCOMM) += rfcomm.o
+
+rfcomm-y := core.o sock.o
+rfcomm-$(CONFIG_BT_RFCOMM_TTY) += tty.o
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
new file mode 100644
index 000000000..4fea24275
--- /dev/null
+++ b/net/bluetooth/rfcomm/core.c
@@ -0,0 +1,2277 @@
+/*
+ RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+ Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * Bluetooth RFCOMM core.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/rfcomm.h>
+
+#define VERSION "1.11"
+
+static bool disable_cfc;
+static bool l2cap_ertm;
+static int channel_mtu = -1;
+static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU;
+
+static struct task_struct *rfcomm_thread;
+
+static DEFINE_MUTEX(rfcomm_mutex);
+#define rfcomm_lock() mutex_lock(&rfcomm_mutex)
+#define rfcomm_unlock() mutex_unlock(&rfcomm_mutex)
+
+
+static LIST_HEAD(session_list);
+
+static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len);
+static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci);
+static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci);
+static int rfcomm_queue_disc(struct rfcomm_dlc *d);
+static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type);
+static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d);
+static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig);
+static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len);
+static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits);
+static void rfcomm_make_uih(struct sk_buff *skb, u8 addr);
+
+static void rfcomm_process_connect(struct rfcomm_session *s);
+
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 sec_level,
+ int *err);
+static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst);
+static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
+
+/* ---- RFCOMM frame parsing macros ---- */
+#define __get_dlci(b) ((b & 0xfc) >> 2)
+#define __get_channel(b) ((b & 0xf8) >> 3)
+#define __get_dir(b) ((b & 0x04) >> 2)
+#define __get_type(b) ((b & 0xef))
+
+#define __test_ea(b) ((b & 0x01))
+#define __test_cr(b) (!!(b & 0x02))
+#define __test_pf(b) (!!(b & 0x10))
+
+#define __session_dir(s) ((s)->initiator ? 0x00 : 0x01)
+
+#define __addr(cr, dlci) (((dlci & 0x3f) << 2) | (cr << 1) | 0x01)
+#define __ctrl(type, pf) (((type & 0xef) | (pf << 4)))
+#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir)
+#define __srv_channel(dlci) (dlci >> 1)
+#define __dir(dlci) (dlci & 0x01)
+
+#define __len8(len) (((len) << 1) | 1)
+#define __len16(len) ((len) << 1)
+
+/* MCC macros */
+#define __mcc_type(cr, type) (((type << 2) | (cr << 1) | 0x01))
+#define __get_mcc_type(b) ((b & 0xfc) >> 2)
+#define __get_mcc_len(b) ((b & 0xfe) >> 1)
+
+/* RPN macros */
+#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3))
+#define __get_rpn_data_bits(line) ((line) & 0x3)
+#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
+#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
+
+static DECLARE_WAIT_QUEUE_HEAD(rfcomm_wq);
+
+static void rfcomm_schedule(void)
+{
+ wake_up_all(&rfcomm_wq);
+}
+
+/* ---- RFCOMM FCS computation ---- */
+
+/* reversed, 8-bit, poly=0x07 */
+static unsigned char rfcomm_crc_table[256] = {
+ 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
+ 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
+ 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
+ 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67,
+
+ 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d,
+ 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43,
+ 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51,
+ 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f,
+
+ 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05,
+ 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b,
+ 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19,
+ 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17,
+
+ 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d,
+ 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33,
+ 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21,
+ 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f,
+
+ 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95,
+ 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b,
+ 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89,
+ 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87,
+
+ 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad,
+ 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3,
+ 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1,
+ 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf,
+
+ 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5,
+ 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb,
+ 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9,
+ 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7,
+
+ 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd,
+ 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3,
+ 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1,
+ 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf
+};
+
+/* CRC on 2 bytes */
+#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]])
+
+/* FCS on 2 bytes */
+static inline u8 __fcs(u8 *data)
+{
+ return 0xff - __crc(data);
+}
+
+/* FCS on 3 bytes */
+static inline u8 __fcs2(u8 *data)
+{
+ return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
+}
+
+/* Check FCS */
+static inline int __check_fcs(u8 *data, int type, u8 fcs)
+{
+ u8 f = __crc(data);
+
+ if (type != RFCOMM_UIH)
+ f = rfcomm_crc_table[f ^ data[2]];
+
+ return rfcomm_crc_table[f ^ fcs] != 0xcf;
+}
+
+/* ---- L2CAP callbacks ---- */
+static void rfcomm_l2state_change(struct sock *sk)
+{
+ BT_DBG("%p state %d", sk, sk->sk_state);
+ rfcomm_schedule();
+}
+
+static void rfcomm_l2data_ready(struct sock *sk)
+{
+ BT_DBG("%p", sk);
+ rfcomm_schedule();
+}
+
+static int rfcomm_l2sock_create(struct socket **sock)
+{
+ int err;
+
+ BT_DBG("");
+
+ err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
+ if (!err) {
+ struct sock *sk = (*sock)->sk;
+ sk->sk_data_ready = rfcomm_l2data_ready;
+ sk->sk_state_change = rfcomm_l2state_change;
+ }
+ return err;
+}
+
+static int rfcomm_check_security(struct rfcomm_dlc *d)
+{
+ struct sock *sk = d->session->sock->sk;
+ struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+
+ __u8 auth_type;
+
+ switch (d->sec_level) {
+ case BT_SECURITY_HIGH:
+ case BT_SECURITY_FIPS:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+
+ return hci_conn_security(conn->hcon, d->sec_level, auth_type,
+ d->out);
+}
+
+static void rfcomm_session_timeout(unsigned long arg)
+{
+ struct rfcomm_session *s = (void *) arg;
+
+ BT_DBG("session %p state %ld", s, s->state);
+
+ set_bit(RFCOMM_TIMED_OUT, &s->flags);
+ rfcomm_schedule();
+}
+
+static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
+{
+ BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout);
+
+ mod_timer(&s->timer, jiffies + timeout);
+}
+
+static void rfcomm_session_clear_timer(struct rfcomm_session *s)
+{
+ BT_DBG("session %p state %ld", s, s->state);
+
+ del_timer_sync(&s->timer);
+}
+
+/* ---- RFCOMM DLCs ---- */
+static void rfcomm_dlc_timeout(unsigned long arg)
+{
+ struct rfcomm_dlc *d = (void *) arg;
+
+ BT_DBG("dlc %p state %ld", d, d->state);
+
+ set_bit(RFCOMM_TIMED_OUT, &d->flags);
+ rfcomm_dlc_put(d);
+ rfcomm_schedule();
+}
+
+static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
+{
+ BT_DBG("dlc %p state %ld timeout %ld", d, d->state, timeout);
+
+ if (!mod_timer(&d->timer, jiffies + timeout))
+ rfcomm_dlc_hold(d);
+}
+
+static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d)
+{
+ BT_DBG("dlc %p state %ld", d, d->state);
+
+ if (del_timer(&d->timer))
+ rfcomm_dlc_put(d);
+}
+
+static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
+{
+ BT_DBG("%p", d);
+
+ d->state = BT_OPEN;
+ d->flags = 0;
+ d->mscex = 0;
+ d->sec_level = BT_SECURITY_LOW;
+ d->mtu = RFCOMM_DEFAULT_MTU;
+ d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV;
+
+ d->cfc = RFCOMM_CFC_DISABLED;
+ d->rx_credits = RFCOMM_DEFAULT_CREDITS;
+}
+
+struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
+{
+ struct rfcomm_dlc *d = kzalloc(sizeof(*d), prio);
+
+ if (!d)
+ return NULL;
+
+ setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d);
+
+ skb_queue_head_init(&d->tx_queue);
+ mutex_init(&d->lock);
+ atomic_set(&d->refcnt, 1);
+
+ rfcomm_dlc_clear_state(d);
+
+ BT_DBG("%p", d);
+
+ return d;
+}
+
+void rfcomm_dlc_free(struct rfcomm_dlc *d)
+{
+ BT_DBG("%p", d);
+
+ skb_queue_purge(&d->tx_queue);
+ kfree(d);
+}
+
+static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d)
+{
+ BT_DBG("dlc %p session %p", d, s);
+
+ rfcomm_session_clear_timer(s);
+ rfcomm_dlc_hold(d);
+ list_add(&d->list, &s->dlcs);
+ d->session = s;
+}
+
+static void rfcomm_dlc_unlink(struct rfcomm_dlc *d)
+{
+ struct rfcomm_session *s = d->session;
+
+ BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s);
+
+ list_del(&d->list);
+ d->session = NULL;
+ rfcomm_dlc_put(d);
+
+ if (list_empty(&s->dlcs))
+ rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT);
+}
+
+static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_dlc *d;
+
+ list_for_each_entry(d, &s->dlcs, list)
+ if (d->dlci == dlci)
+ return d;
+
+ return NULL;
+}
+
+static int rfcomm_check_channel(u8 channel)
+{
+ return channel < 1 || channel > 30;
+}
+
+static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel)
+{
+ struct rfcomm_session *s;
+ int err = 0;
+ u8 dlci;
+
+ BT_DBG("dlc %p state %ld %pMR -> %pMR channel %d",
+ d, d->state, src, dst, channel);
+
+ if (rfcomm_check_channel(channel))
+ return -EINVAL;
+
+ if (d->state != BT_OPEN && d->state != BT_CLOSED)
+ return 0;
+
+ s = rfcomm_session_get(src, dst);
+ if (!s) {
+ s = rfcomm_session_create(src, dst, d->sec_level, &err);
+ if (!s)
+ return err;
+ }
+
+ dlci = __dlci(__session_dir(s), channel);
+
+ /* Check if DLCI already exists */
+ if (rfcomm_dlc_get(s, dlci))
+ return -EBUSY;
+
+ rfcomm_dlc_clear_state(d);
+
+ d->dlci = dlci;
+ d->addr = __addr(s->initiator, dlci);
+ d->priority = 7;
+
+ d->state = BT_CONFIG;
+ rfcomm_dlc_link(s, d);
+
+ d->out = 1;
+
+ d->mtu = s->mtu;
+ d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
+
+ if (s->state == BT_CONNECTED) {
+ if (rfcomm_check_security(d))
+ rfcomm_send_pn(s, 1, d);
+ else
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ }
+
+ rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+
+ return 0;
+}
+
+int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel)
+{
+ int r;
+
+ rfcomm_lock();
+
+ r = __rfcomm_dlc_open(d, src, dst, channel);
+
+ rfcomm_unlock();
+ return r;
+}
+
+static void __rfcomm_dlc_disconn(struct rfcomm_dlc *d)
+{
+ struct rfcomm_session *s = d->session;
+
+ d->state = BT_DISCONN;
+ if (skb_queue_empty(&d->tx_queue)) {
+ rfcomm_send_disc(s, d->dlci);
+ rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT);
+ } else {
+ rfcomm_queue_disc(d);
+ rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2);
+ }
+}
+
+static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
+{
+ struct rfcomm_session *s = d->session;
+ if (!s)
+ return 0;
+
+ BT_DBG("dlc %p state %ld dlci %d err %d session %p",
+ d, d->state, d->dlci, err, s);
+
+ switch (d->state) {
+ case BT_CONNECT:
+ case BT_CONFIG:
+ case BT_OPEN:
+ case BT_CONNECT2:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule();
+ return 0;
+ }
+ }
+
+ switch (d->state) {
+ case BT_CONNECT:
+ case BT_CONNECTED:
+ __rfcomm_dlc_disconn(d);
+ break;
+
+ case BT_CONFIG:
+ if (s->state != BT_BOUND) {
+ __rfcomm_dlc_disconn(d);
+ break;
+ }
+ /* if closing a dlc in a session that hasn't been started,
+ * just close and unlink the dlc
+ */
+
+ default:
+ rfcomm_dlc_clear_timer(d);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CLOSED;
+ d->state_change(d, err);
+ rfcomm_dlc_unlock(d);
+
+ skb_queue_purge(&d->tx_queue);
+ rfcomm_dlc_unlink(d);
+ }
+
+ return 0;
+}
+
+int rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
+{
+ int r = 0;
+ struct rfcomm_dlc *d_list;
+ struct rfcomm_session *s, *s_list;
+
+ BT_DBG("dlc %p state %ld dlci %d err %d", d, d->state, d->dlci, err);
+
+ rfcomm_lock();
+
+ s = d->session;
+ if (!s)
+ goto no_session;
+
+ /* after waiting on the mutex check the session still exists
+ * then check the dlc still exists
+ */
+ list_for_each_entry(s_list, &session_list, list) {
+ if (s_list == s) {
+ list_for_each_entry(d_list, &s->dlcs, list) {
+ if (d_list == d) {
+ r = __rfcomm_dlc_close(d, err);
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+no_session:
+ rfcomm_unlock();
+ return r;
+}
+
+struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel)
+{
+ struct rfcomm_session *s;
+ struct rfcomm_dlc *dlc = NULL;
+ u8 dlci;
+
+ if (rfcomm_check_channel(channel))
+ return ERR_PTR(-EINVAL);
+
+ rfcomm_lock();
+ s = rfcomm_session_get(src, dst);
+ if (s) {
+ dlci = __dlci(__session_dir(s), channel);
+ dlc = rfcomm_dlc_get(s, dlci);
+ }
+ rfcomm_unlock();
+ return dlc;
+}
+
+int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+ int len = skb->len;
+
+ if (d->state != BT_CONNECTED)
+ return -ENOTCONN;
+
+ BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+
+ if (len > d->mtu)
+ return -EINVAL;
+
+ rfcomm_make_uih(skb, d->addr);
+ skb_queue_tail(&d->tx_queue, skb);
+
+ if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ rfcomm_schedule();
+ return len;
+}
+
+void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+ int len = skb->len;
+
+ BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+
+ rfcomm_make_uih(skb, d->addr);
+ skb_queue_tail(&d->tx_queue, skb);
+
+ if (d->state == BT_CONNECTED &&
+ !test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ rfcomm_schedule();
+}
+
+void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
+{
+ BT_DBG("dlc %p state %ld", d, d->state);
+
+ if (!d->cfc) {
+ d->v24_sig |= RFCOMM_V24_FC;
+ set_bit(RFCOMM_MSC_PENDING, &d->flags);
+ }
+ rfcomm_schedule();
+}
+
+void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
+{
+ BT_DBG("dlc %p state %ld", d, d->state);
+
+ if (!d->cfc) {
+ d->v24_sig &= ~RFCOMM_V24_FC;
+ set_bit(RFCOMM_MSC_PENDING, &d->flags);
+ }
+ rfcomm_schedule();
+}
+
+/*
+ Set/get modem status functions use _local_ status i.e. what we report
+ to the other side.
+ Remote status is provided by dlc->modem_status() callback.
+ */
+int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
+{
+ BT_DBG("dlc %p state %ld v24_sig 0x%x",
+ d, d->state, v24_sig);
+
+ if (test_bit(RFCOMM_RX_THROTTLED, &d->flags))
+ v24_sig |= RFCOMM_V24_FC;
+ else
+ v24_sig &= ~RFCOMM_V24_FC;
+
+ d->v24_sig = v24_sig;
+
+ if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
+ rfcomm_schedule();
+
+ return 0;
+}
+
+int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig)
+{
+ BT_DBG("dlc %p state %ld v24_sig 0x%x",
+ d, d->state, d->v24_sig);
+
+ *v24_sig = d->v24_sig;
+ return 0;
+}
+
+/* ---- RFCOMM sessions ---- */
+static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
+{
+ struct rfcomm_session *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ return NULL;
+
+ BT_DBG("session %p sock %p", s, sock);
+
+ setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s);
+
+ INIT_LIST_HEAD(&s->dlcs);
+ s->state = state;
+ s->sock = sock;
+
+ s->mtu = RFCOMM_DEFAULT_MTU;
+ s->cfc = disable_cfc ? RFCOMM_CFC_DISABLED : RFCOMM_CFC_UNKNOWN;
+
+ /* Do not increment module usage count for listening sessions.
+ * Otherwise we won't be able to unload the module. */
+ if (state != BT_LISTEN)
+ if (!try_module_get(THIS_MODULE)) {
+ kfree(s);
+ return NULL;
+ }
+
+ list_add(&s->list, &session_list);
+
+ return s;
+}
+
+static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s)
+{
+ int state = s->state;
+
+ BT_DBG("session %p state %ld", s, s->state);
+
+ list_del(&s->list);
+
+ rfcomm_session_clear_timer(s);
+ sock_release(s->sock);
+ kfree(s);
+
+ if (state != BT_LISTEN)
+ module_put(THIS_MODULE);
+
+ return NULL;
+}
+
+static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst)
+{
+ struct rfcomm_session *s;
+ struct list_head *p, *n;
+ struct l2cap_chan *chan;
+ list_for_each_safe(p, n, &session_list) {
+ s = list_entry(p, struct rfcomm_session, list);
+ chan = l2cap_pi(s->sock->sk)->chan;
+
+ if ((!bacmp(src, BDADDR_ANY) || !bacmp(&chan->src, src)) &&
+ !bacmp(&chan->dst, dst))
+ return s;
+ }
+ return NULL;
+}
+
+static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s,
+ int err)
+{
+ struct rfcomm_dlc *d;
+ struct list_head *p, *n;
+
+ s->state = BT_CLOSED;
+
+ BT_DBG("session %p state %ld err %d", s, s->state, err);
+
+ /* Close all dlcs */
+ list_for_each_safe(p, n, &s->dlcs) {
+ d = list_entry(p, struct rfcomm_dlc, list);
+ d->state = BT_CLOSED;
+ __rfcomm_dlc_close(d, err);
+ }
+
+ rfcomm_session_clear_timer(s);
+ return rfcomm_session_del(s);
+}
+
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+ bdaddr_t *dst,
+ u8 sec_level,
+ int *err)
+{
+ struct rfcomm_session *s = NULL;
+ struct sockaddr_l2 addr;
+ struct socket *sock;
+ struct sock *sk;
+
+ BT_DBG("%pMR -> %pMR", src, dst);
+
+ *err = rfcomm_l2sock_create(&sock);
+ if (*err < 0)
+ return NULL;
+
+ bacpy(&addr.l2_bdaddr, src);
+ addr.l2_family = AF_BLUETOOTH;
+ addr.l2_psm = 0;
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ if (*err < 0)
+ goto failed;
+
+ /* Set L2CAP options */
+ sk = sock->sk;
+ lock_sock(sk);
+ l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ l2cap_pi(sk)->chan->sec_level = sec_level;
+ if (l2cap_ertm)
+ l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM;
+ release_sock(sk);
+
+ s = rfcomm_session_add(sock, BT_BOUND);
+ if (!s) {
+ *err = -ENOMEM;
+ goto failed;
+ }
+
+ s->initiator = 1;
+
+ bacpy(&addr.l2_bdaddr, dst);
+ addr.l2_family = AF_BLUETOOTH;
+ addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
+ if (*err == 0 || *err == -EINPROGRESS)
+ return s;
+
+ return rfcomm_session_del(s);
+
+failed:
+ sock_release(sock);
+ return NULL;
+}
+
+void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst)
+{
+ struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan;
+ if (src)
+ bacpy(src, &chan->src);
+ if (dst)
+ bacpy(dst, &chan->dst);
+}
+
+/* ---- RFCOMM frame sending ---- */
+static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len)
+{
+ struct kvec iv = { data, len };
+ struct msghdr msg;
+
+ BT_DBG("session %p len %d", s, len);
+
+ memset(&msg, 0, sizeof(msg));
+
+ return kernel_sendmsg(s->sock, &msg, &iv, 1, len);
+}
+
+static int rfcomm_send_cmd(struct rfcomm_session *s, struct rfcomm_cmd *cmd)
+{
+ BT_DBG("%p cmd %u", s, cmd->ctrl);
+
+ return rfcomm_send_frame(s, (void *) cmd, sizeof(*cmd));
+}
+
+static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_cmd cmd;
+
+ BT_DBG("%p dlci %d", s, dlci);
+
+ cmd.addr = __addr(s->initiator, dlci);
+ cmd.ctrl = __ctrl(RFCOMM_SABM, 1);
+ cmd.len = __len8(0);
+ cmd.fcs = __fcs2((u8 *) &cmd);
+
+ return rfcomm_send_cmd(s, &cmd);
+}
+
+static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_cmd cmd;
+
+ BT_DBG("%p dlci %d", s, dlci);
+
+ cmd.addr = __addr(!s->initiator, dlci);
+ cmd.ctrl = __ctrl(RFCOMM_UA, 1);
+ cmd.len = __len8(0);
+ cmd.fcs = __fcs2((u8 *) &cmd);
+
+ return rfcomm_send_cmd(s, &cmd);
+}
+
+static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_cmd cmd;
+
+ BT_DBG("%p dlci %d", s, dlci);
+
+ cmd.addr = __addr(s->initiator, dlci);
+ cmd.ctrl = __ctrl(RFCOMM_DISC, 1);
+ cmd.len = __len8(0);
+ cmd.fcs = __fcs2((u8 *) &cmd);
+
+ return rfcomm_send_cmd(s, &cmd);
+}
+
+static int rfcomm_queue_disc(struct rfcomm_dlc *d)
+{
+ struct rfcomm_cmd *cmd;
+ struct sk_buff *skb;
+
+ BT_DBG("dlc %p dlci %d", d, d->dlci);
+
+ skb = alloc_skb(sizeof(*cmd), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ cmd = (void *) __skb_put(skb, sizeof(*cmd));
+ cmd->addr = d->addr;
+ cmd->ctrl = __ctrl(RFCOMM_DISC, 1);
+ cmd->len = __len8(0);
+ cmd->fcs = __fcs2((u8 *) cmd);
+
+ skb_queue_tail(&d->tx_queue, skb);
+ rfcomm_schedule();
+ return 0;
+}
+
+static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_cmd cmd;
+
+ BT_DBG("%p dlci %d", s, dlci);
+
+ cmd.addr = __addr(!s->initiator, dlci);
+ cmd.ctrl = __ctrl(RFCOMM_DM, 1);
+ cmd.len = __len8(0);
+ cmd.fcs = __fcs2((u8 *) &cmd);
+
+ return rfcomm_send_cmd(s, &cmd);
+}
+
+static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d type %d", s, cr, type);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc) + 1);
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(0, RFCOMM_NSC);
+ mcc->len = __len8(1);
+
+ /* Type that we didn't like */
+ *ptr = __mcc_type(cr, type); ptr++;
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ struct rfcomm_pn *pn;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d dlci %d mtu %d", s, cr, d->dlci, d->mtu);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc) + sizeof(*pn));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_PN);
+ mcc->len = __len8(sizeof(*pn));
+
+ pn = (void *) ptr; ptr += sizeof(*pn);
+ pn->dlci = d->dlci;
+ pn->priority = d->priority;
+ pn->ack_timer = 0;
+ pn->max_retrans = 0;
+
+ if (s->cfc) {
+ pn->flow_ctrl = cr ? 0xf0 : 0xe0;
+ pn->credits = RFCOMM_DEFAULT_CREDITS;
+ } else {
+ pn->flow_ctrl = 0;
+ pn->credits = 0;
+ }
+
+ if (cr && channel_mtu >= 0)
+ pn->mtu = cpu_to_le16(channel_mtu);
+ else
+ pn->mtu = cpu_to_le16(d->mtu);
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+ u8 bit_rate, u8 data_bits, u8 stop_bits,
+ u8 parity, u8 flow_ctrl_settings,
+ u8 xon_char, u8 xoff_char, u16 param_mask)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ struct rfcomm_rpn *rpn;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
+ " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
+ s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
+ flow_ctrl_settings, xon_char, xoff_char, param_mask);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc) + sizeof(*rpn));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_RPN);
+ mcc->len = __len8(sizeof(*rpn));
+
+ rpn = (void *) ptr; ptr += sizeof(*rpn);
+ rpn->dlci = __addr(1, dlci);
+ rpn->bit_rate = bit_rate;
+ rpn->line_settings = __rpn_line_settings(data_bits, stop_bits, parity);
+ rpn->flow_ctrl = flow_ctrl_settings;
+ rpn->xon_char = xon_char;
+ rpn->xoff_char = xoff_char;
+ rpn->param_mask = cpu_to_le16(param_mask);
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_rls(struct rfcomm_session *s, int cr, u8 dlci, u8 status)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ struct rfcomm_rls *rls;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d status 0x%x", s, cr, status);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc) + sizeof(*rls));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_RLS);
+ mcc->len = __len8(sizeof(*rls));
+
+ rls = (void *) ptr; ptr += sizeof(*rls);
+ rls->dlci = __addr(1, dlci);
+ rls->status = status;
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ struct rfcomm_msc *msc;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d v24 0x%x", s, cr, v24_sig);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc) + sizeof(*msc));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_MSC);
+ mcc->len = __len8(sizeof(*msc));
+
+ msc = (void *) ptr; ptr += sizeof(*msc);
+ msc->dlci = __addr(1, dlci);
+ msc->v24_sig = v24_sig | 0x01;
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_fcoff(struct rfcomm_session *s, int cr)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d", s, cr);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_FCOFF);
+ mcc->len = __len8(0);
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_fcon(struct rfcomm_session *s, int cr)
+{
+ struct rfcomm_hdr *hdr;
+ struct rfcomm_mcc *mcc;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p cr %d", s, cr);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = __addr(s->initiator, 0);
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+ hdr->len = __len8(sizeof(*mcc));
+
+ mcc = (void *) ptr; ptr += sizeof(*mcc);
+ mcc->type = __mcc_type(cr, RFCOMM_FCON);
+ mcc->len = __len8(0);
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len)
+{
+ struct socket *sock = s->sock;
+ struct kvec iv[3];
+ struct msghdr msg;
+ unsigned char hdr[5], crc[1];
+
+ if (len > 125)
+ return -EINVAL;
+
+ BT_DBG("%p cr %d", s, cr);
+
+ hdr[0] = __addr(s->initiator, 0);
+ hdr[1] = __ctrl(RFCOMM_UIH, 0);
+ hdr[2] = 0x01 | ((len + 2) << 1);
+ hdr[3] = 0x01 | ((cr & 0x01) << 1) | (RFCOMM_TEST << 2);
+ hdr[4] = 0x01 | (len << 1);
+
+ crc[0] = __fcs(hdr);
+
+ iv[0].iov_base = hdr;
+ iv[0].iov_len = 5;
+ iv[1].iov_base = pattern;
+ iv[1].iov_len = len;
+ iv[2].iov_base = crc;
+ iv[2].iov_len = 1;
+
+ memset(&msg, 0, sizeof(msg));
+
+ return kernel_sendmsg(sock, &msg, iv, 3, 6 + len);
+}
+
+static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits)
+{
+ struct rfcomm_hdr *hdr;
+ u8 buf[16], *ptr = buf;
+
+ BT_DBG("%p addr %d credits %d", s, addr, credits);
+
+ hdr = (void *) ptr; ptr += sizeof(*hdr);
+ hdr->addr = addr;
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 1);
+ hdr->len = __len8(0);
+
+ *ptr = credits; ptr++;
+
+ *ptr = __fcs(buf); ptr++;
+
+ return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+static void rfcomm_make_uih(struct sk_buff *skb, u8 addr)
+{
+ struct rfcomm_hdr *hdr;
+ int len = skb->len;
+ u8 *crc;
+
+ if (len > 127) {
+ hdr = (void *) skb_push(skb, 4);
+ put_unaligned(cpu_to_le16(__len16(len)), (__le16 *) &hdr->len);
+ } else {
+ hdr = (void *) skb_push(skb, 3);
+ hdr->len = __len8(len);
+ }
+ hdr->addr = addr;
+ hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+
+ crc = skb_put(skb, 1);
+ *crc = __fcs((void *) hdr);
+}
+
+/* ---- RFCOMM frame reception ---- */
+static struct rfcomm_session *rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
+{
+ BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+ if (dlci) {
+ /* Data channel */
+ struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
+ if (!d) {
+ rfcomm_send_dm(s, dlci);
+ return s;
+ }
+
+ switch (d->state) {
+ case BT_CONNECT:
+ rfcomm_dlc_clear_timer(d);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECTED;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+
+ rfcomm_send_msc(s, 1, dlci, d->v24_sig);
+ break;
+
+ case BT_DISCONN:
+ d->state = BT_CLOSED;
+ __rfcomm_dlc_close(d, 0);
+
+ if (list_empty(&s->dlcs)) {
+ s->state = BT_DISCONN;
+ rfcomm_send_disc(s, 0);
+ rfcomm_session_clear_timer(s);
+ }
+
+ break;
+ }
+ } else {
+ /* Control channel */
+ switch (s->state) {
+ case BT_CONNECT:
+ s->state = BT_CONNECTED;
+ rfcomm_process_connect(s);
+ break;
+
+ case BT_DISCONN:
+ s = rfcomm_session_close(s, ECONNRESET);
+ break;
+ }
+ }
+ return s;
+}
+
+static struct rfcomm_session *rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci)
+{
+ int err = 0;
+
+ BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+ if (dlci) {
+ /* Data DLC */
+ struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
+ if (d) {
+ if (d->state == BT_CONNECT || d->state == BT_CONFIG)
+ err = ECONNREFUSED;
+ else
+ err = ECONNRESET;
+
+ d->state = BT_CLOSED;
+ __rfcomm_dlc_close(d, err);
+ }
+ } else {
+ if (s->state == BT_CONNECT)
+ err = ECONNREFUSED;
+ else
+ err = ECONNRESET;
+
+ s = rfcomm_session_close(s, err);
+ }
+ return s;
+}
+
+static struct rfcomm_session *rfcomm_recv_disc(struct rfcomm_session *s,
+ u8 dlci)
+{
+ int err = 0;
+
+ BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+ if (dlci) {
+ struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
+ if (d) {
+ rfcomm_send_ua(s, dlci);
+
+ if (d->state == BT_CONNECT || d->state == BT_CONFIG)
+ err = ECONNREFUSED;
+ else
+ err = ECONNRESET;
+
+ d->state = BT_CLOSED;
+ __rfcomm_dlc_close(d, err);
+ } else
+ rfcomm_send_dm(s, dlci);
+
+ } else {
+ rfcomm_send_ua(s, 0);
+
+ if (s->state == BT_CONNECT)
+ err = ECONNREFUSED;
+ else
+ err = ECONNRESET;
+
+ s = rfcomm_session_close(s, err);
+ }
+ return s;
+}
+
+void rfcomm_dlc_accept(struct rfcomm_dlc *d)
+{
+ struct sock *sk = d->session->sock->sk;
+ struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+
+ BT_DBG("dlc %p", d);
+
+ rfcomm_send_ua(d->session, d->dlci);
+
+ rfcomm_dlc_clear_timer(d);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECTED;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+
+ if (d->role_switch)
+ hci_conn_switch_role(conn->hcon, 0x00);
+
+ rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
+}
+
+static void rfcomm_check_accept(struct rfcomm_dlc *d)
+{
+ if (rfcomm_check_security(d)) {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
+}
+
+static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
+{
+ struct rfcomm_dlc *d;
+ u8 channel;
+
+ BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+ if (!dlci) {
+ rfcomm_send_ua(s, 0);
+
+ if (s->state == BT_OPEN) {
+ s->state = BT_CONNECTED;
+ rfcomm_process_connect(s);
+ }
+ return 0;
+ }
+
+ /* Check if DLC exists */
+ d = rfcomm_dlc_get(s, dlci);
+ if (d) {
+ if (d->state == BT_OPEN) {
+ /* DLC was previously opened by PN request */
+ rfcomm_check_accept(d);
+ }
+ return 0;
+ }
+
+ /* Notify socket layer about incoming connection */
+ channel = __srv_channel(dlci);
+ if (rfcomm_connect_ind(s, channel, &d)) {
+ d->dlci = dlci;
+ d->addr = __addr(s->initiator, dlci);
+ rfcomm_dlc_link(s, d);
+
+ rfcomm_check_accept(d);
+ } else {
+ rfcomm_send_dm(s, dlci);
+ }
+
+ return 0;
+}
+
+static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn)
+{
+ struct rfcomm_session *s = d->session;
+
+ BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d",
+ d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits);
+
+ if ((pn->flow_ctrl == 0xf0 && s->cfc != RFCOMM_CFC_DISABLED) ||
+ pn->flow_ctrl == 0xe0) {
+ d->cfc = RFCOMM_CFC_ENABLED;
+ d->tx_credits = pn->credits;
+ } else {
+ d->cfc = RFCOMM_CFC_DISABLED;
+ set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+ }
+
+ if (s->cfc == RFCOMM_CFC_UNKNOWN)
+ s->cfc = d->cfc;
+
+ d->priority = pn->priority;
+
+ d->mtu = __le16_to_cpu(pn->mtu);
+
+ if (cr && d->mtu > s->mtu)
+ d->mtu = s->mtu;
+
+ return 0;
+}
+
+static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+ struct rfcomm_pn *pn = (void *) skb->data;
+ struct rfcomm_dlc *d;
+ u8 dlci = pn->dlci;
+
+ BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+ if (!dlci)
+ return 0;
+
+ d = rfcomm_dlc_get(s, dlci);
+ if (d) {
+ if (cr) {
+ /* PN request */
+ rfcomm_apply_pn(d, cr, pn);
+ rfcomm_send_pn(s, 0, d);
+ } else {
+ /* PN response */
+ switch (d->state) {
+ case BT_CONFIG:
+ rfcomm_apply_pn(d, cr, pn);
+
+ d->state = BT_CONNECT;
+ rfcomm_send_sabm(s, d->dlci);
+ break;
+ }
+ }
+ } else {
+ u8 channel = __srv_channel(dlci);
+
+ if (!cr)
+ return 0;
+
+ /* PN request for non existing DLC.
+ * Assume incoming connection. */
+ if (rfcomm_connect_ind(s, channel, &d)) {
+ d->dlci = dlci;
+ d->addr = __addr(s->initiator, dlci);
+ rfcomm_dlc_link(s, d);
+
+ rfcomm_apply_pn(d, cr, pn);
+
+ d->state = BT_OPEN;
+ rfcomm_send_pn(s, 0, d);
+ } else {
+ rfcomm_send_dm(s, dlci);
+ }
+ }
+ return 0;
+}
+
+static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb)
+{
+ struct rfcomm_rpn *rpn = (void *) skb->data;
+ u8 dlci = __get_dlci(rpn->dlci);
+
+ u8 bit_rate = 0;
+ u8 data_bits = 0;
+ u8 stop_bits = 0;
+ u8 parity = 0;
+ u8 flow_ctrl = 0;
+ u8 xon_char = 0;
+ u8 xoff_char = 0;
+ u16 rpn_mask = RFCOMM_RPN_PM_ALL;
+
+ BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+ dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+ rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+ if (!cr)
+ return 0;
+
+ if (len == 1) {
+ /* This is a request, return default (according to ETSI TS 07.10) settings */
+ bit_rate = RFCOMM_RPN_BR_9600;
+ data_bits = RFCOMM_RPN_DATA_8;
+ stop_bits = RFCOMM_RPN_STOP_1;
+ parity = RFCOMM_RPN_PARITY_NONE;
+ flow_ctrl = RFCOMM_RPN_FLOW_NONE;
+ xon_char = RFCOMM_RPN_XON_CHAR;
+ xoff_char = RFCOMM_RPN_XOFF_CHAR;
+ goto rpn_out;
+ }
+
+ /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
+ * no parity, no flow control lines, normal XON/XOFF chars */
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
+ bit_rate = rpn->bit_rate;
+ if (bit_rate > RFCOMM_RPN_BR_230400) {
+ BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
+ bit_rate = RFCOMM_RPN_BR_9600;
+ rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_DATA)) {
+ data_bits = __get_rpn_data_bits(rpn->line_settings);
+ if (data_bits != RFCOMM_RPN_DATA_8) {
+ BT_DBG("RPN data bits mismatch 0x%x", data_bits);
+ data_bits = RFCOMM_RPN_DATA_8;
+ rpn_mask ^= RFCOMM_RPN_PM_DATA;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_STOP)) {
+ stop_bits = __get_rpn_stop_bits(rpn->line_settings);
+ if (stop_bits != RFCOMM_RPN_STOP_1) {
+ BT_DBG("RPN stop bits mismatch 0x%x", stop_bits);
+ stop_bits = RFCOMM_RPN_STOP_1;
+ rpn_mask ^= RFCOMM_RPN_PM_STOP;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_PARITY)) {
+ parity = __get_rpn_parity(rpn->line_settings);
+ if (parity != RFCOMM_RPN_PARITY_NONE) {
+ BT_DBG("RPN parity mismatch 0x%x", parity);
+ parity = RFCOMM_RPN_PARITY_NONE;
+ rpn_mask ^= RFCOMM_RPN_PM_PARITY;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_FLOW)) {
+ flow_ctrl = rpn->flow_ctrl;
+ if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
+ BT_DBG("RPN flow ctrl mismatch 0x%x", flow_ctrl);
+ flow_ctrl = RFCOMM_RPN_FLOW_NONE;
+ rpn_mask ^= RFCOMM_RPN_PM_FLOW;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XON)) {
+ xon_char = rpn->xon_char;
+ if (xon_char != RFCOMM_RPN_XON_CHAR) {
+ BT_DBG("RPN XON char mismatch 0x%x", xon_char);
+ xon_char = RFCOMM_RPN_XON_CHAR;
+ rpn_mask ^= RFCOMM_RPN_PM_XON;
+ }
+ }
+
+ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XOFF)) {
+ xoff_char = rpn->xoff_char;
+ if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
+ BT_DBG("RPN XOFF char mismatch 0x%x", xoff_char);
+ xoff_char = RFCOMM_RPN_XOFF_CHAR;
+ rpn_mask ^= RFCOMM_RPN_PM_XOFF;
+ }
+ }
+
+rpn_out:
+ rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
+ parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
+
+ return 0;
+}
+
+static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+ struct rfcomm_rls *rls = (void *) skb->data;
+ u8 dlci = __get_dlci(rls->dlci);
+
+ BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
+
+ if (!cr)
+ return 0;
+
+ /* We should probably do something with this information here. But
+ * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
+ * mandatory to recognise and respond to RLS */
+
+ rfcomm_send_rls(s, 0, dlci, rls->status);
+
+ return 0;
+}
+
+static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+ struct rfcomm_msc *msc = (void *) skb->data;
+ struct rfcomm_dlc *d;
+ u8 dlci = __get_dlci(msc->dlci);
+
+ BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
+
+ d = rfcomm_dlc_get(s, dlci);
+ if (!d)
+ return 0;
+
+ if (cr) {
+ if (msc->v24_sig & RFCOMM_V24_FC && !d->cfc)
+ set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+ else
+ clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
+
+ rfcomm_dlc_lock(d);
+
+ d->remote_v24_sig = msc->v24_sig;
+
+ if (d->modem_status)
+ d->modem_status(d, msc->v24_sig);
+
+ rfcomm_dlc_unlock(d);
+
+ rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
+
+ d->mscex |= RFCOMM_MSCEX_RX;
+ } else
+ d->mscex |= RFCOMM_MSCEX_TX;
+
+ return 0;
+}
+
+static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb)
+{
+ struct rfcomm_mcc *mcc = (void *) skb->data;
+ u8 type, cr, len;
+
+ cr = __test_cr(mcc->type);
+ type = __get_mcc_type(mcc->type);
+ len = __get_mcc_len(mcc->len);
+
+ BT_DBG("%p type 0x%x cr %d", s, type, cr);
+
+ skb_pull(skb, 2);
+
+ switch (type) {
+ case RFCOMM_PN:
+ rfcomm_recv_pn(s, cr, skb);
+ break;
+
+ case RFCOMM_RPN:
+ rfcomm_recv_rpn(s, cr, len, skb);
+ break;
+
+ case RFCOMM_RLS:
+ rfcomm_recv_rls(s, cr, skb);
+ break;
+
+ case RFCOMM_MSC:
+ rfcomm_recv_msc(s, cr, skb);
+ break;
+
+ case RFCOMM_FCOFF:
+ if (cr) {
+ set_bit(RFCOMM_TX_THROTTLED, &s->flags);
+ rfcomm_send_fcoff(s, 0);
+ }
+ break;
+
+ case RFCOMM_FCON:
+ if (cr) {
+ clear_bit(RFCOMM_TX_THROTTLED, &s->flags);
+ rfcomm_send_fcon(s, 0);
+ }
+ break;
+
+ case RFCOMM_TEST:
+ if (cr)
+ rfcomm_send_test(s, 0, skb->data, skb->len);
+ break;
+
+ case RFCOMM_NSC:
+ break;
+
+ default:
+ BT_ERR("Unknown control type 0x%02x", type);
+ rfcomm_send_nsc(s, cr, type);
+ break;
+ }
+ return 0;
+}
+
+static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk_buff *skb)
+{
+ struct rfcomm_dlc *d;
+
+ BT_DBG("session %p state %ld dlci %d pf %d", s, s->state, dlci, pf);
+
+ d = rfcomm_dlc_get(s, dlci);
+ if (!d) {
+ rfcomm_send_dm(s, dlci);
+ goto drop;
+ }
+
+ if (pf && d->cfc) {
+ u8 credits = *(u8 *) skb->data; skb_pull(skb, 1);
+
+ d->tx_credits += credits;
+ if (d->tx_credits)
+ clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
+ }
+
+ if (skb->len && d->state == BT_CONNECTED) {
+ rfcomm_dlc_lock(d);
+ d->rx_credits--;
+ d->data_ready(d, skb);
+ rfcomm_dlc_unlock(d);
+ return 0;
+ }
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s,
+ struct sk_buff *skb)
+{
+ struct rfcomm_hdr *hdr = (void *) skb->data;
+ u8 type, dlci, fcs;
+
+ if (!s) {
+ /* no session, so free socket data */
+ kfree_skb(skb);
+ return s;
+ }
+
+ dlci = __get_dlci(hdr->addr);
+ type = __get_type(hdr->ctrl);
+
+ /* Trim FCS */
+ skb->len--; skb->tail--;
+ fcs = *(u8 *)skb_tail_pointer(skb);
+
+ if (__check_fcs(skb->data, type, fcs)) {
+ BT_ERR("bad checksum in packet");
+ kfree_skb(skb);
+ return s;
+ }
+
+ if (__test_ea(hdr->len))
+ skb_pull(skb, 3);
+ else
+ skb_pull(skb, 4);
+
+ switch (type) {
+ case RFCOMM_SABM:
+ if (__test_pf(hdr->ctrl))
+ rfcomm_recv_sabm(s, dlci);
+ break;
+
+ case RFCOMM_DISC:
+ if (__test_pf(hdr->ctrl))
+ s = rfcomm_recv_disc(s, dlci);
+ break;
+
+ case RFCOMM_UA:
+ if (__test_pf(hdr->ctrl))
+ s = rfcomm_recv_ua(s, dlci);
+ break;
+
+ case RFCOMM_DM:
+ s = rfcomm_recv_dm(s, dlci);
+ break;
+
+ case RFCOMM_UIH:
+ if (dlci) {
+ rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb);
+ return s;
+ }
+ rfcomm_recv_mcc(s, skb);
+ break;
+
+ default:
+ BT_ERR("Unknown packet type 0x%02x", type);
+ break;
+ }
+ kfree_skb(skb);
+ return s;
+}
+
+/* ---- Connection and data processing ---- */
+
+static void rfcomm_process_connect(struct rfcomm_session *s)
+{
+ struct rfcomm_dlc *d;
+ struct list_head *p, *n;
+
+ BT_DBG("session %p state %ld", s, s->state);
+
+ list_for_each_safe(p, n, &s->dlcs) {
+ d = list_entry(p, struct rfcomm_dlc, list);
+ if (d->state == BT_CONFIG) {
+ d->mtu = s->mtu;
+ if (rfcomm_check_security(d)) {
+ rfcomm_send_pn(s, 1, d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
+ }
+ }
+}
+
+/* Send data queued for the DLC.
+ * Return number of frames left in the queue.
+ */
+static int rfcomm_process_tx(struct rfcomm_dlc *d)
+{
+ struct sk_buff *skb;
+ int err;
+
+ BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d",
+ d, d->state, d->cfc, d->rx_credits, d->tx_credits);
+
+ /* Send pending MSC */
+ if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags))
+ rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
+
+ if (d->cfc) {
+ /* CFC enabled.
+ * Give them some credits */
+ if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) &&
+ d->rx_credits <= (d->cfc >> 2)) {
+ rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits);
+ d->rx_credits = d->cfc;
+ }
+ } else {
+ /* CFC disabled.
+ * Give ourselves some credits */
+ d->tx_credits = 5;
+ }
+
+ if (test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ return skb_queue_len(&d->tx_queue);
+
+ while (d->tx_credits && (skb = skb_dequeue(&d->tx_queue))) {
+ err = rfcomm_send_frame(d->session, skb->data, skb->len);
+ if (err < 0) {
+ skb_queue_head(&d->tx_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+ d->tx_credits--;
+ }
+
+ if (d->cfc && !d->tx_credits) {
+ /* We're out of TX credits.
+ * Set TX_THROTTLED flag to avoid unnesary wakeups by dlc_send. */
+ set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+ }
+
+ return skb_queue_len(&d->tx_queue);
+}
+
+static void rfcomm_process_dlcs(struct rfcomm_session *s)
+{
+ struct rfcomm_dlc *d;
+ struct list_head *p, *n;
+
+ BT_DBG("session %p state %ld", s, s->state);
+
+ list_for_each_safe(p, n, &s->dlcs) {
+ d = list_entry(p, struct rfcomm_dlc, list);
+
+ if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) {
+ __rfcomm_dlc_close(d, ETIMEDOUT);
+ continue;
+ }
+
+ if (test_bit(RFCOMM_ENC_DROP, &d->flags)) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
+
+ if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (d->out) {
+ rfcomm_send_pn(s, 1, d);
+ rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+ } else {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
+ }
+ continue;
+ } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (!d->out)
+ rfcomm_send_dm(s, d->dlci);
+ else
+ d->state = BT_CLOSED;
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
+
+ if (test_bit(RFCOMM_SEC_PENDING, &d->flags))
+ continue;
+
+ if (test_bit(RFCOMM_TX_THROTTLED, &s->flags))
+ continue;
+
+ if ((d->state == BT_CONNECTED || d->state == BT_DISCONN) &&
+ d->mscex == RFCOMM_MSCEX_OK)
+ rfcomm_process_tx(d);
+ }
+}
+
+static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s)
+{
+ struct socket *sock = s->sock;
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+
+ BT_DBG("session %p state %ld qlen %d", s, s->state, skb_queue_len(&sk->sk_receive_queue));
+
+ /* Get data directly from socket receive queue without copying it. */
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ if (!skb_linearize(skb)) {
+ s = rfcomm_recv_frame(s, skb);
+ if (!s)
+ break;
+ } else {
+ kfree_skb(skb);
+ }
+ }
+
+ if (s && (sk->sk_state == BT_CLOSED))
+ s = rfcomm_session_close(s, sk->sk_err);
+
+ return s;
+}
+
+static void rfcomm_accept_connection(struct rfcomm_session *s)
+{
+ struct socket *sock = s->sock, *nsock;
+ int err;
+
+ /* Fast check for a new connection.
+ * Avoids unnesesary socket allocations. */
+ if (list_empty(&bt_sk(sock->sk)->accept_q))
+ return;
+
+ BT_DBG("session %p", s);
+
+ err = kernel_accept(sock, &nsock, O_NONBLOCK);
+ if (err < 0)
+ return;
+
+ /* Set our callbacks */
+ nsock->sk->sk_data_ready = rfcomm_l2data_ready;
+ nsock->sk->sk_state_change = rfcomm_l2state_change;
+
+ s = rfcomm_session_add(nsock, BT_OPEN);
+ if (s) {
+ /* We should adjust MTU on incoming sessions.
+ * L2CAP MTU minus UIH header and FCS. */
+ s->mtu = min(l2cap_pi(nsock->sk)->chan->omtu,
+ l2cap_pi(nsock->sk)->chan->imtu) - 5;
+
+ rfcomm_schedule();
+ } else
+ sock_release(nsock);
+}
+
+static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s)
+{
+ struct sock *sk = s->sock->sk;
+
+ BT_DBG("%p state %ld", s, s->state);
+
+ switch (sk->sk_state) {
+ case BT_CONNECTED:
+ s->state = BT_CONNECT;
+
+ /* We can adjust MTU on outgoing sessions.
+ * L2CAP MTU minus UIH header and FCS. */
+ s->mtu = min(l2cap_pi(sk)->chan->omtu, l2cap_pi(sk)->chan->imtu) - 5;
+
+ rfcomm_send_sabm(s, 0);
+ break;
+
+ case BT_CLOSED:
+ s = rfcomm_session_close(s, sk->sk_err);
+ break;
+ }
+ return s;
+}
+
+static void rfcomm_process_sessions(void)
+{
+ struct list_head *p, *n;
+
+ rfcomm_lock();
+
+ list_for_each_safe(p, n, &session_list) {
+ struct rfcomm_session *s;
+ s = list_entry(p, struct rfcomm_session, list);
+
+ if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) {
+ s->state = BT_DISCONN;
+ rfcomm_send_disc(s, 0);
+ continue;
+ }
+
+ switch (s->state) {
+ case BT_LISTEN:
+ rfcomm_accept_connection(s);
+ continue;
+
+ case BT_BOUND:
+ s = rfcomm_check_connection(s);
+ break;
+
+ default:
+ s = rfcomm_process_rx(s);
+ break;
+ }
+
+ if (s)
+ rfcomm_process_dlcs(s);
+ }
+
+ rfcomm_unlock();
+}
+
+static int rfcomm_add_listener(bdaddr_t *ba)
+{
+ struct sockaddr_l2 addr;
+ struct socket *sock;
+ struct sock *sk;
+ struct rfcomm_session *s;
+ int err = 0;
+
+ /* Create socket */
+ err = rfcomm_l2sock_create(&sock);
+ if (err < 0) {
+ BT_ERR("Create socket failed %d", err);
+ return err;
+ }
+
+ /* Bind socket */
+ bacpy(&addr.l2_bdaddr, ba);
+ addr.l2_family = AF_BLUETOOTH;
+ addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
+ addr.l2_cid = 0;
+ addr.l2_bdaddr_type = BDADDR_BREDR;
+ err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ if (err < 0) {
+ BT_ERR("Bind failed %d", err);
+ goto failed;
+ }
+
+ /* Set L2CAP options */
+ sk = sock->sk;
+ lock_sock(sk);
+ l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ release_sock(sk);
+
+ /* Start listening on the socket */
+ err = kernel_listen(sock, 10);
+ if (err) {
+ BT_ERR("Listen failed %d", err);
+ goto failed;
+ }
+
+ /* Add listening session */
+ s = rfcomm_session_add(sock, BT_LISTEN);
+ if (!s) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ return 0;
+failed:
+ sock_release(sock);
+ return err;
+}
+
+static void rfcomm_kill_listener(void)
+{
+ struct rfcomm_session *s;
+ struct list_head *p, *n;
+
+ BT_DBG("");
+
+ list_for_each_safe(p, n, &session_list) {
+ s = list_entry(p, struct rfcomm_session, list);
+ rfcomm_session_del(s);
+ }
+}
+
+static int rfcomm_run(void *unused)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ BT_DBG("");
+
+ set_user_nice(current, -10);
+
+ rfcomm_add_listener(BDADDR_ANY);
+
+ add_wait_queue(&rfcomm_wq, &wait);
+ while (!kthread_should_stop()) {
+
+ /* Process stuff */
+ rfcomm_process_sessions();
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&rfcomm_wq, &wait);
+
+ rfcomm_kill_listener();
+
+ return 0;
+}
+
+static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
+{
+ struct rfcomm_session *s;
+ struct rfcomm_dlc *d;
+ struct list_head *p, *n;
+
+ BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt);
+
+ s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
+ if (!s)
+ return;
+
+ list_for_each_safe(p, n, &s->dlcs) {
+ d = list_entry(p, struct rfcomm_dlc, list);
+
+ if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (status || encrypt == 0x00) {
+ set_bit(RFCOMM_ENC_DROP, &d->flags);
+ continue;
+ }
+ }
+
+ if (d->state == BT_CONNECTED && !status && encrypt == 0x00) {
+ if (d->sec_level == BT_SECURITY_MEDIUM) {
+ set_bit(RFCOMM_SEC_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ continue;
+ } else if (d->sec_level == BT_SECURITY_HIGH ||
+ d->sec_level == BT_SECURITY_FIPS) {
+ set_bit(RFCOMM_ENC_DROP, &d->flags);
+ continue;
+ }
+ }
+
+ if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
+ continue;
+
+ if (!status && hci_conn_check_secure(conn, d->sec_level))
+ set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
+ else
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ }
+
+ rfcomm_schedule();
+}
+
+static struct hci_cb rfcomm_cb = {
+ .name = "RFCOMM",
+ .security_cfm = rfcomm_security_cfm
+};
+
+static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
+{
+ struct rfcomm_session *s;
+
+ rfcomm_lock();
+
+ list_for_each_entry(s, &session_list, list) {
+ struct l2cap_chan *chan = l2cap_pi(s->sock->sk)->chan;
+ struct rfcomm_dlc *d;
+ list_for_each_entry(d, &s->dlcs, list) {
+ seq_printf(f, "%pMR %pMR %ld %d %d %d %d\n",
+ &chan->src, &chan->dst,
+ d->state, d->dlci, d->mtu,
+ d->rx_credits, d->tx_credits);
+ }
+ }
+
+ rfcomm_unlock();
+
+ return 0;
+}
+
+static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_dlc_debugfs_fops = {
+ .open = rfcomm_dlc_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rfcomm_dlc_debugfs;
+
+/* ---- Initialization ---- */
+static int __init rfcomm_init(void)
+{
+ int err;
+
+ hci_register_cb(&rfcomm_cb);
+
+ rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd");
+ if (IS_ERR(rfcomm_thread)) {
+ err = PTR_ERR(rfcomm_thread);
+ goto unregister;
+ }
+
+ err = rfcomm_init_ttys();
+ if (err < 0)
+ goto stop;
+
+ err = rfcomm_init_sockets();
+ if (err < 0)
+ goto cleanup;
+
+ BT_INFO("RFCOMM ver %s", VERSION);
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444,
+ bt_debugfs, NULL,
+ &rfcomm_dlc_debugfs_fops);
+
+ return 0;
+
+cleanup:
+ rfcomm_cleanup_ttys();
+
+stop:
+ kthread_stop(rfcomm_thread);
+
+unregister:
+ hci_unregister_cb(&rfcomm_cb);
+
+ return err;
+}
+
+static void __exit rfcomm_exit(void)
+{
+ debugfs_remove(rfcomm_dlc_debugfs);
+
+ hci_unregister_cb(&rfcomm_cb);
+
+ kthread_stop(rfcomm_thread);
+
+ rfcomm_cleanup_ttys();
+
+ rfcomm_cleanup_sockets();
+}
+
+module_init(rfcomm_init);
+module_exit(rfcomm_exit);
+
+module_param(disable_cfc, bool, 0644);
+MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control");
+
+module_param(channel_mtu, int, 0644);
+MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel");
+
+module_param(l2cap_mtu, uint, 0644);
+MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection");
+
+module_param(l2cap_ertm, bool, 0644);
+MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection");
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-3");
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
new file mode 100644
index 000000000..825e8fb51
--- /dev/null
+++ b/net/bluetooth/rfcomm/sock.c
@@ -0,0 +1,1104 @@
+/*
+ RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+ Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * RFCOMM sockets.
+ */
+
+#include <linux/export.h>
+#include <linux/debugfs.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/rfcomm.h>
+
+static const struct proto_ops rfcomm_sock_ops;
+
+static struct bt_sock_list rfcomm_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(rfcomm_sk_list.lock)
+};
+
+static void rfcomm_sock_close(struct sock *sk);
+static void rfcomm_sock_kill(struct sock *sk);
+
+/* ---- DLC callbacks ----
+ *
+ * called under rfcomm_dlc_lock()
+ */
+static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+ struct sock *sk = d->owner;
+ if (!sk)
+ return;
+
+ atomic_add(skb->len, &sk->sk_rmem_alloc);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ rfcomm_dlc_throttle(d);
+}
+
+static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
+{
+ struct sock *sk = d->owner, *parent;
+ unsigned long flags;
+
+ if (!sk)
+ return;
+
+ BT_DBG("dlc %p state %ld err %d", d, d->state, err);
+
+ local_irq_save(flags);
+ bh_lock_sock(sk);
+
+ if (err)
+ sk->sk_err = err;
+
+ sk->sk_state = d->state;
+
+ parent = bt_sk(sk)->parent;
+ if (parent) {
+ if (d->state == BT_CLOSED) {
+ sock_set_flag(sk, SOCK_ZAPPED);
+ bt_accept_unlink(sk);
+ }
+ parent->sk_data_ready(parent);
+ } else {
+ if (d->state == BT_CONNECTED)
+ rfcomm_session_getaddr(d->session,
+ &rfcomm_pi(sk)->src, NULL);
+ sk->sk_state_change(sk);
+ }
+
+ bh_unlock_sock(sk);
+ local_irq_restore(flags);
+
+ if (parent && sock_flag(sk, SOCK_ZAPPED)) {
+ /* We have to drop DLC lock here, otherwise
+ * rfcomm_sock_destruct() will dead lock. */
+ rfcomm_dlc_unlock(d);
+ rfcomm_sock_kill(sk);
+ rfcomm_dlc_lock(d);
+ }
+}
+
+/* ---- Socket functions ---- */
+static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src)
+{
+ struct sock *sk = NULL;
+
+ sk_for_each(sk, &rfcomm_sk_list.head) {
+ if (rfcomm_pi(sk)->channel != channel)
+ continue;
+
+ if (bacmp(&rfcomm_pi(sk)->src, src))
+ continue;
+
+ if (sk->sk_state == BT_BOUND || sk->sk_state == BT_LISTEN)
+ break;
+ }
+
+ return sk ? sk : NULL;
+}
+
+/* Find socket with channel and source bdaddr.
+ * Returns closest match.
+ */
+static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
+{
+ struct sock *sk = NULL, *sk1 = NULL;
+
+ read_lock(&rfcomm_sk_list.lock);
+
+ sk_for_each(sk, &rfcomm_sk_list.head) {
+ if (state && sk->sk_state != state)
+ continue;
+
+ if (rfcomm_pi(sk)->channel == channel) {
+ /* Exact match. */
+ if (!bacmp(&rfcomm_pi(sk)->src, src))
+ break;
+
+ /* Closest match */
+ if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY))
+ sk1 = sk;
+ }
+ }
+
+ read_unlock(&rfcomm_sk_list.lock);
+
+ return sk ? sk : sk1;
+}
+
+static void rfcomm_sock_destruct(struct sock *sk)
+{
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+
+ BT_DBG("sk %p dlc %p", sk, d);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+
+ rfcomm_dlc_lock(d);
+ rfcomm_pi(sk)->dlc = NULL;
+
+ /* Detach DLC if it's owned by this socket */
+ if (d->owner == sk)
+ d->owner = NULL;
+ rfcomm_dlc_unlock(d);
+
+ rfcomm_dlc_put(d);
+}
+
+static void rfcomm_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p", parent);
+
+ /* Close not yet accepted dlcs */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ rfcomm_sock_close(sk);
+ rfcomm_sock_kill(sk);
+ }
+
+ parent->sk_state = BT_CLOSED;
+ sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void rfcomm_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ return;
+
+ BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, atomic_read(&sk->sk_refcnt));
+
+ /* Kill poor orphan */
+ bt_sock_unlink(&rfcomm_sk_list, sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static void __rfcomm_sock_close(struct sock *sk)
+{
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+
+ BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+ switch (sk->sk_state) {
+ case BT_LISTEN:
+ rfcomm_sock_cleanup_listen(sk);
+ break;
+
+ case BT_CONNECT:
+ case BT_CONNECT2:
+ case BT_CONFIG:
+ case BT_CONNECTED:
+ rfcomm_dlc_close(d, 0);
+
+ default:
+ sock_set_flag(sk, SOCK_ZAPPED);
+ break;
+ }
+}
+
+/* Close socket.
+ * Must be called on unlocked socket.
+ */
+static void rfcomm_sock_close(struct sock *sk)
+{
+ lock_sock(sk);
+ __rfcomm_sock_close(sk);
+ release_sock(sk);
+}
+
+static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
+{
+ struct rfcomm_pinfo *pi = rfcomm_pi(sk);
+
+ BT_DBG("sk %p", sk);
+
+ if (parent) {
+ sk->sk_type = parent->sk_type;
+ pi->dlc->defer_setup = test_bit(BT_SK_DEFER_SETUP,
+ &bt_sk(parent)->flags);
+
+ pi->sec_level = rfcomm_pi(parent)->sec_level;
+ pi->role_switch = rfcomm_pi(parent)->role_switch;
+
+ security_sk_clone(parent, sk);
+ } else {
+ pi->dlc->defer_setup = 0;
+
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
+ }
+
+ pi->dlc->sec_level = pi->sec_level;
+ pi->dlc->role_switch = pi->role_switch;
+}
+
+static struct proto rfcomm_proto = {
+ .name = "RFCOMM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rfcomm_pinfo)
+};
+
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+{
+ struct rfcomm_dlc *d;
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+ d = rfcomm_dlc_alloc(prio);
+ if (!d) {
+ sk_free(sk);
+ return NULL;
+ }
+
+ d->data_ready = rfcomm_sk_data_ready;
+ d->state_change = rfcomm_sk_state_change;
+
+ rfcomm_pi(sk)->dlc = d;
+ d->owner = sk;
+
+ sk->sk_destruct = rfcomm_sock_destruct;
+ sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT;
+
+ sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
+ sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = BT_OPEN;
+
+ bt_sock_link(&rfcomm_sk_list, sk);
+
+ BT_DBG("sk %p", sk);
+ return sk;
+}
+
+static int rfcomm_sock_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_STREAM && sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &rfcomm_sock_ops;
+
+ sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ if (!sk)
+ return -ENOMEM;
+
+ rfcomm_sock_init(sk, NULL);
+ return 0;
+}
+
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+ struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+ struct sock *sk = sock->sk;
+ int chan = sa->rc_channel;
+ int err = 0;
+
+ BT_DBG("sk %p %pMR", sk, &sa->rc_bdaddr);
+
+ if (!addr || addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ write_lock(&rfcomm_sk_list.lock);
+
+ if (chan && __rfcomm_get_listen_sock_by_addr(chan, &sa->rc_bdaddr)) {
+ err = -EADDRINUSE;
+ } else {
+ /* Save source address */
+ bacpy(&rfcomm_pi(sk)->src, &sa->rc_bdaddr);
+ rfcomm_pi(sk)->channel = chan;
+ sk->sk_state = BT_BOUND;
+ }
+
+ write_unlock(&rfcomm_sk_list.lock);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+{
+ struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+ struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+ int err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (alen < sizeof(struct sockaddr_rc) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ sk->sk_state = BT_CONNECT;
+ bacpy(&rfcomm_pi(sk)->dst, &sa->rc_bdaddr);
+ rfcomm_pi(sk)->channel = sa->rc_channel;
+
+ d->sec_level = rfcomm_pi(sk)->sec_level;
+ d->role_switch = rfcomm_pi(sk)->role_switch;
+
+ err = rfcomm_dlc_open(d, &rfcomm_pi(sk)->src, &sa->rc_bdaddr,
+ sa->rc_channel);
+ if (!err)
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sk %p backlog %d", sk, backlog);
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!rfcomm_pi(sk)->channel) {
+ bdaddr_t *src = &rfcomm_pi(sk)->src;
+ u8 channel;
+
+ err = -EINVAL;
+
+ write_lock(&rfcomm_sk_list.lock);
+
+ for (channel = 1; channel < 31; channel++)
+ if (!__rfcomm_get_listen_sock_by_addr(channel, src)) {
+ rfcomm_pi(sk)->channel = channel;
+ err = 0;
+ break;
+ }
+
+ write_unlock(&rfcomm_sk_list.lock);
+
+ if (err < 0)
+ goto done;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = BT_LISTEN;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = sock->sk, *nsk;
+ long timeo;
+ int err = 0;
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ BT_DBG("sk %p timeo %ld", sk, timeo);
+
+ /* Wait for an incoming connection. (wake-one). */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ nsk = bt_accept_dequeue(sk, newsock);
+ if (nsk)
+ break;
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+ BT_DBG("new socket %p", nsk);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+{
+ struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+ struct sock *sk = sock->sk;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (peer && sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2)
+ return -ENOTCONN;
+
+ memset(sa, 0, sizeof(*sa));
+ sa->rc_family = AF_BLUETOOTH;
+ sa->rc_channel = rfcomm_pi(sk)->channel;
+ if (peer)
+ bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->dst);
+ else
+ bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src);
+
+ *len = sizeof(struct sockaddr_rc);
+ return 0;
+}
+
+static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+ struct sk_buff *skb;
+ int sent;
+
+ if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
+ return -ENOTCONN;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ return -EPIPE;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ lock_sock(sk);
+
+ sent = bt_sock_wait_ready(sk, msg->msg_flags);
+ if (sent)
+ goto done;
+
+ while (len) {
+ size_t size = min_t(size_t, len, d->mtu);
+ int err;
+
+ skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb) {
+ if (sent == 0)
+ sent = err;
+ break;
+ }
+ skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err) {
+ kfree_skb(skb);
+ if (sent == 0)
+ sent = err;
+ break;
+ }
+
+ skb->priority = sk->sk_priority;
+
+ err = rfcomm_dlc_send(d, skb);
+ if (err < 0) {
+ kfree_skb(skb);
+ if (sent == 0)
+ sent = err;
+ break;
+ }
+
+ sent += size;
+ len -= size;
+ }
+
+done:
+ release_sock(sk);
+
+ return sent;
+}
+
+static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+ int len;
+
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ rfcomm_dlc_accept(d);
+ return 0;
+ }
+
+ len = bt_sock_stream_recvmsg(sock, msg, size, flags);
+
+ lock_sock(sk);
+ if (!(flags & MSG_PEEK) && len > 0)
+ atomic_sub(len, &sk->sk_rmem_alloc);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
+ rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
+ release_sock(sk);
+
+ return len;
+}
+
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case RFCOMM_LM:
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt & RFCOMM_LM_FIPS) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (opt & RFCOMM_LM_AUTH)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & RFCOMM_LM_ENCRYPT)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & RFCOMM_LM_SECURE)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int err = 0;
+ size_t len;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ rfcomm_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt)
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct sock *l2cap_sk;
+ struct l2cap_conn *conn;
+ struct rfcomm_conninfo cinfo;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case RFCOMM_LM:
+ switch (rfcomm_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = RFCOMM_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE;
+ break;
+ case BT_SECURITY_FIPS:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE | RFCOMM_LM_FIPS;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (rfcomm_pi(sk)->role_switch)
+ opt |= RFCOMM_LM_MASTER;
+
+ if (put_user(opt, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case RFCOMM_CONNINFO:
+ if (sk->sk_state != BT_CONNECTED &&
+ !rfcomm_pi(sk)->dlc->defer_setup) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk;
+ conn = l2cap_pi(l2cap_sk)->chan->conn;
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.hci_handle = conn->hcon->handle;
+ memcpy(cinfo.dev_class, conn->hcon->dev_class, 3);
+
+ len = min_t(unsigned int, len, sizeof(cinfo));
+ if (copy_to_user(optval, (char *) &cinfo, len))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = rfcomm_pi(sk)->sec_level;
+ sec.key_size = 0;
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk __maybe_unused = sock->sk;
+ int err;
+
+ BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+ err = bt_sock_ioctl(sock, cmd, arg);
+
+ if (err == -ENOIOCTLCMD) {
+#ifdef CONFIG_BT_RFCOMM_TTY
+ lock_sock(sk);
+ err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg);
+ release_sock(sk);
+#else
+ err = -EOPNOTSUPP;
+#endif
+ }
+
+ return err;
+}
+
+static int rfcomm_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (!sk->sk_shutdown) {
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ __rfcomm_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+ }
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ err = rfcomm_sock_shutdown(sock, 2);
+
+ sock_orphan(sk);
+ rfcomm_sock_kill(sk);
+ return err;
+}
+
+/* ---- RFCOMM core layer callbacks ----
+ *
+ * called under rfcomm_lock()
+ */
+int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d)
+{
+ struct sock *sk, *parent;
+ bdaddr_t src, dst;
+ int result = 0;
+
+ BT_DBG("session %p channel %d", s, channel);
+
+ rfcomm_session_getaddr(s, &src, &dst);
+
+ /* Check if we have socket listening on channel */
+ parent = rfcomm_get_sock_by_channel(BT_LISTEN, channel, &src);
+ if (!parent)
+ return 0;
+
+ bh_lock_sock(parent);
+
+ /* Check for backlog size */
+ if (sk_acceptq_is_full(parent)) {
+ BT_DBG("backlog full %d", parent->sk_ack_backlog);
+ goto done;
+ }
+
+ sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+ if (!sk)
+ goto done;
+
+ bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM);
+
+ rfcomm_sock_init(sk, parent);
+ bacpy(&rfcomm_pi(sk)->src, &src);
+ bacpy(&rfcomm_pi(sk)->dst, &dst);
+ rfcomm_pi(sk)->channel = channel;
+
+ sk->sk_state = BT_CONFIG;
+ bt_accept_enqueue(parent, sk);
+
+ /* Accept connection and return socket DLC */
+ *d = rfcomm_pi(sk)->dlc;
+ result = 1;
+
+done:
+ bh_unlock_sock(parent);
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ parent->sk_state_change(parent);
+
+ return result;
+}
+
+static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
+{
+ struct sock *sk;
+
+ read_lock(&rfcomm_sk_list.lock);
+
+ sk_for_each(sk, &rfcomm_sk_list.head) {
+ seq_printf(f, "%pMR %pMR %d %d\n",
+ &rfcomm_pi(sk)->src, &rfcomm_pi(sk)->dst,
+ sk->sk_state, rfcomm_pi(sk)->channel);
+ }
+
+ read_unlock(&rfcomm_sk_list.lock);
+
+ return 0;
+}
+
+static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_sock_debugfs_fops = {
+ .open = rfcomm_sock_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rfcomm_sock_debugfs;
+
+static const struct proto_ops rfcomm_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = rfcomm_sock_release,
+ .bind = rfcomm_sock_bind,
+ .connect = rfcomm_sock_connect,
+ .listen = rfcomm_sock_listen,
+ .accept = rfcomm_sock_accept,
+ .getname = rfcomm_sock_getname,
+ .sendmsg = rfcomm_sock_sendmsg,
+ .recvmsg = rfcomm_sock_recvmsg,
+ .shutdown = rfcomm_sock_shutdown,
+ .setsockopt = rfcomm_sock_setsockopt,
+ .getsockopt = rfcomm_sock_getsockopt,
+ .ioctl = rfcomm_sock_ioctl,
+ .poll = bt_sock_poll,
+ .socketpair = sock_no_socketpair,
+ .mmap = sock_no_mmap
+};
+
+static const struct net_proto_family rfcomm_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = rfcomm_sock_create
+};
+
+int __init rfcomm_init_sockets(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_rc) > sizeof(struct sockaddr));
+
+ err = proto_register(&rfcomm_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("RFCOMM socket layer registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "rfcomm", &rfcomm_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create RFCOMM proc file");
+ bt_sock_unregister(BTPROTO_RFCOMM);
+ goto error;
+ }
+
+ BT_INFO("RFCOMM socket layer initialized");
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444,
+ bt_debugfs, NULL,
+ &rfcomm_sock_debugfs_fops);
+
+ return 0;
+
+error:
+ proto_unregister(&rfcomm_proto);
+ return err;
+}
+
+void __exit rfcomm_cleanup_sockets(void)
+{
+ bt_procfs_cleanup(&init_net, "rfcomm");
+
+ debugfs_remove(rfcomm_sock_debugfs);
+
+ bt_sock_unregister(BTPROTO_RFCOMM);
+
+ proto_unregister(&rfcomm_proto);
+}
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
new file mode 100644
index 000000000..8e385a0ae
--- /dev/null
+++ b/net/bluetooth/rfcomm/tty.c
@@ -0,0 +1,1174 @@
+/*
+ RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+ Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+ Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * RFCOMM TTY.
+ */
+
+#include <linux/module.h>
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/rfcomm.h>
+
+#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */
+#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */
+#define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */
+#define RFCOMM_TTY_MINOR 0
+
+static DEFINE_MUTEX(rfcomm_ioctl_mutex);
+static struct tty_driver *rfcomm_tty_driver;
+
+struct rfcomm_dev {
+ struct tty_port port;
+ struct list_head list;
+
+ char name[12];
+ int id;
+ unsigned long flags;
+ int err;
+
+ unsigned long status; /* don't export to userspace */
+
+ bdaddr_t src;
+ bdaddr_t dst;
+ u8 channel;
+
+ uint modem_status;
+
+ struct rfcomm_dlc *dlc;
+
+ struct device *tty_dev;
+
+ atomic_t wmem_alloc;
+
+ struct sk_buff_head pending;
+};
+
+static LIST_HEAD(rfcomm_dev_list);
+static DEFINE_MUTEX(rfcomm_dev_lock);
+
+static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb);
+static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err);
+static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig);
+
+/* ---- Device functions ---- */
+
+static void rfcomm_dev_destruct(struct tty_port *port)
+{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+ struct rfcomm_dlc *dlc = dev->dlc;
+
+ BT_DBG("dev %p dlc %p", dev, dlc);
+
+ rfcomm_dlc_lock(dlc);
+ /* Detach DLC if it's owned by this dev */
+ if (dlc->owner == dev)
+ dlc->owner = NULL;
+ rfcomm_dlc_unlock(dlc);
+
+ rfcomm_dlc_put(dlc);
+
+ if (dev->tty_dev)
+ tty_unregister_device(rfcomm_tty_driver, dev->id);
+
+ mutex_lock(&rfcomm_dev_lock);
+ list_del(&dev->list);
+ mutex_unlock(&rfcomm_dev_lock);
+
+ kfree(dev);
+
+ /* It's safe to call module_put() here because socket still
+ holds reference to this module. */
+ module_put(THIS_MODULE);
+}
+
+/* device-specific initialization: open the dlc */
+static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+ int err;
+
+ err = rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel);
+ if (err)
+ set_bit(TTY_IO_ERROR, &tty->flags);
+ return err;
+}
+
+/* we block the open until the dlc->state becomes BT_CONNECTED */
+static int rfcomm_dev_carrier_raised(struct tty_port *port)
+{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+ return (dev->dlc->state == BT_CONNECTED);
+}
+
+/* device-specific cleanup: close the dlc */
+static void rfcomm_dev_shutdown(struct tty_port *port)
+{
+ struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
+
+ if (dev->tty_dev->parent)
+ device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
+
+ /* close the dlc */
+ rfcomm_dlc_close(dev->dlc, 0);
+}
+
+static const struct tty_port_operations rfcomm_port_ops = {
+ .destruct = rfcomm_dev_destruct,
+ .activate = rfcomm_dev_activate,
+ .shutdown = rfcomm_dev_shutdown,
+ .carrier_raised = rfcomm_dev_carrier_raised,
+};
+
+static struct rfcomm_dev *__rfcomm_dev_lookup(int id)
+{
+ struct rfcomm_dev *dev;
+
+ list_for_each_entry(dev, &rfcomm_dev_list, list)
+ if (dev->id == id)
+ return dev;
+
+ return NULL;
+}
+
+static struct rfcomm_dev *rfcomm_dev_get(int id)
+{
+ struct rfcomm_dev *dev;
+
+ mutex_lock(&rfcomm_dev_lock);
+
+ dev = __rfcomm_dev_lookup(id);
+
+ if (dev && !tty_port_get(&dev->port))
+ dev = NULL;
+
+ mutex_unlock(&rfcomm_dev_lock);
+
+ return dev;
+}
+
+static void rfcomm_reparent_device(struct rfcomm_dev *dev)
+{
+ struct hci_dev *hdev;
+ struct hci_conn *conn;
+
+ hdev = hci_get_route(&dev->dst, &dev->src);
+ if (!hdev)
+ return;
+
+ /* The lookup results are unsafe to access without the
+ * hci device lock (FIXME: why is this not documented?)
+ */
+ hci_dev_lock(hdev);
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst);
+
+ /* Just because the acl link is in the hash table is no
+ * guarantee the sysfs device has been added ...
+ */
+ if (conn && device_is_registered(&conn->dev))
+ device_move(dev->tty_dev, &conn->dev, DPM_ORDER_DEV_AFTER_PARENT);
+
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+}
+
+static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
+{
+ struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+ return sprintf(buf, "%pMR\n", &dev->dst);
+}
+
+static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
+{
+ struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+ return sprintf(buf, "%d\n", dev->channel);
+}
+
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL);
+
+static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req,
+ struct rfcomm_dlc *dlc)
+{
+ struct rfcomm_dev *dev, *entry;
+ struct list_head *head = &rfcomm_dev_list;
+ int err = 0;
+
+ dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock(&rfcomm_dev_lock);
+
+ if (req->dev_id < 0) {
+ dev->id = 0;
+
+ list_for_each_entry(entry, &rfcomm_dev_list, list) {
+ if (entry->id != dev->id)
+ break;
+
+ dev->id++;
+ head = &entry->list;
+ }
+ } else {
+ dev->id = req->dev_id;
+
+ list_for_each_entry(entry, &rfcomm_dev_list, list) {
+ if (entry->id == dev->id) {
+ err = -EADDRINUSE;
+ goto out;
+ }
+
+ if (entry->id > dev->id - 1)
+ break;
+
+ head = &entry->list;
+ }
+ }
+
+ if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) {
+ err = -ENFILE;
+ goto out;
+ }
+
+ sprintf(dev->name, "rfcomm%d", dev->id);
+
+ list_add(&dev->list, head);
+
+ bacpy(&dev->src, &req->src);
+ bacpy(&dev->dst, &req->dst);
+ dev->channel = req->channel;
+
+ dev->flags = req->flags &
+ ((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC));
+
+ tty_port_init(&dev->port);
+ dev->port.ops = &rfcomm_port_ops;
+
+ skb_queue_head_init(&dev->pending);
+
+ rfcomm_dlc_lock(dlc);
+
+ if (req->flags & (1 << RFCOMM_REUSE_DLC)) {
+ struct sock *sk = dlc->owner;
+ struct sk_buff *skb;
+
+ BUG_ON(!sk);
+
+ rfcomm_dlc_throttle(dlc);
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+ skb_orphan(skb);
+ skb_queue_tail(&dev->pending, skb);
+ atomic_sub(skb->len, &sk->sk_rmem_alloc);
+ }
+ }
+
+ dlc->data_ready = rfcomm_dev_data_ready;
+ dlc->state_change = rfcomm_dev_state_change;
+ dlc->modem_status = rfcomm_dev_modem_status;
+
+ dlc->owner = dev;
+ dev->dlc = dlc;
+
+ rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig);
+
+ rfcomm_dlc_unlock(dlc);
+
+ /* It's safe to call __module_get() here because socket already
+ holds reference to this module. */
+ __module_get(THIS_MODULE);
+
+ mutex_unlock(&rfcomm_dev_lock);
+ return dev;
+
+out:
+ mutex_unlock(&rfcomm_dev_lock);
+ kfree(dev);
+ return ERR_PTR(err);
+}
+
+static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
+{
+ struct rfcomm_dev *dev;
+ struct device *tty;
+
+ BT_DBG("id %d channel %d", req->dev_id, req->channel);
+
+ dev = __rfcomm_dev_add(req, dlc);
+ if (IS_ERR(dev)) {
+ rfcomm_dlc_put(dlc);
+ return PTR_ERR(dev);
+ }
+
+ tty = tty_port_register_device(&dev->port, rfcomm_tty_driver,
+ dev->id, NULL);
+ if (IS_ERR(tty)) {
+ tty_port_put(&dev->port);
+ return PTR_ERR(tty);
+ }
+
+ dev->tty_dev = tty;
+ rfcomm_reparent_device(dev);
+ dev_set_drvdata(dev->tty_dev, dev);
+
+ if (device_create_file(dev->tty_dev, &dev_attr_address) < 0)
+ BT_ERR("Failed to create address attribute");
+
+ if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0)
+ BT_ERR("Failed to create channel attribute");
+
+ return dev->id;
+}
+
+/* ---- Send buffer ---- */
+static inline unsigned int rfcomm_room(struct rfcomm_dev *dev)
+{
+ struct rfcomm_dlc *dlc = dev->dlc;
+
+ /* Limit the outstanding number of packets not yet sent to 40 */
+ int pending = 40 - atomic_read(&dev->wmem_alloc);
+
+ return max(0, pending) * dlc->mtu;
+}
+
+static void rfcomm_wfree(struct sk_buff *skb)
+{
+ struct rfcomm_dev *dev = (void *) skb->sk;
+ atomic_dec(&dev->wmem_alloc);
+ if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags))
+ tty_port_tty_wakeup(&dev->port);
+ tty_port_put(&dev->port);
+}
+
+static void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev)
+{
+ tty_port_get(&dev->port);
+ atomic_inc(&dev->wmem_alloc);
+ skb->sk = (void *) dev;
+ skb->destructor = rfcomm_wfree;
+}
+
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority)
+{
+ struct sk_buff *skb = alloc_skb(size, priority);
+ if (skb)
+ rfcomm_set_owner_w(skb, dev);
+ return skb;
+}
+
+/* ---- Device IOCTLs ---- */
+
+#define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP))
+
+static int __rfcomm_create_dev(struct sock *sk, void __user *arg)
+{
+ struct rfcomm_dev_req req;
+ struct rfcomm_dlc *dlc;
+ int id;
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags);
+
+ if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (req.flags & (1 << RFCOMM_REUSE_DLC)) {
+ /* Socket must be connected */
+ if (sk->sk_state != BT_CONNECTED)
+ return -EBADFD;
+
+ dlc = rfcomm_pi(sk)->dlc;
+ rfcomm_dlc_hold(dlc);
+ } else {
+ /* Validate the channel is unused */
+ dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel);
+ if (IS_ERR(dlc))
+ return PTR_ERR(dlc);
+ else if (dlc) {
+ rfcomm_dlc_put(dlc);
+ return -EBUSY;
+ }
+ dlc = rfcomm_dlc_alloc(GFP_KERNEL);
+ if (!dlc)
+ return -ENOMEM;
+ }
+
+ id = rfcomm_dev_add(&req, dlc);
+ if (id < 0)
+ return id;
+
+ if (req.flags & (1 << RFCOMM_REUSE_DLC)) {
+ /* DLC is now used by device.
+ * Socket must be disconnected */
+ sk->sk_state = BT_CLOSED;
+ }
+
+ return id;
+}
+
+static int __rfcomm_release_dev(void __user *arg)
+{
+ struct rfcomm_dev_req req;
+ struct rfcomm_dev *dev;
+ struct tty_struct *tty;
+
+ if (copy_from_user(&req, arg, sizeof(req)))
+ return -EFAULT;
+
+ BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags);
+
+ dev = rfcomm_dev_get(req.dev_id);
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) {
+ tty_port_put(&dev->port);
+ return -EPERM;
+ }
+
+ /* only release once */
+ if (test_and_set_bit(RFCOMM_DEV_RELEASED, &dev->status)) {
+ tty_port_put(&dev->port);
+ return -EALREADY;
+ }
+
+ if (req.flags & (1 << RFCOMM_HANGUP_NOW))
+ rfcomm_dlc_close(dev->dlc, 0);
+
+ /* Shut down TTY synchronously before freeing rfcomm_dev */
+ tty = tty_port_tty_get(&dev->port);
+ if (tty) {
+ tty_vhangup(tty);
+ tty_kref_put(tty);
+ }
+
+ if (!test_bit(RFCOMM_TTY_OWNED, &dev->status))
+ tty_port_put(&dev->port);
+
+ tty_port_put(&dev->port);
+ return 0;
+}
+
+static int rfcomm_create_dev(struct sock *sk, void __user *arg)
+{
+ int ret;
+
+ mutex_lock(&rfcomm_ioctl_mutex);
+ ret = __rfcomm_create_dev(sk, arg);
+ mutex_unlock(&rfcomm_ioctl_mutex);
+
+ return ret;
+}
+
+static int rfcomm_release_dev(void __user *arg)
+{
+ int ret;
+
+ mutex_lock(&rfcomm_ioctl_mutex);
+ ret = __rfcomm_release_dev(arg);
+ mutex_unlock(&rfcomm_ioctl_mutex);
+
+ return ret;
+}
+
+static int rfcomm_get_dev_list(void __user *arg)
+{
+ struct rfcomm_dev *dev;
+ struct rfcomm_dev_list_req *dl;
+ struct rfcomm_dev_info *di;
+ int n = 0, size, err;
+ u16 dev_num;
+
+ BT_DBG("");
+
+ if (get_user(dev_num, (u16 __user *) arg))
+ return -EFAULT;
+
+ if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di))
+ return -EINVAL;
+
+ size = sizeof(*dl) + dev_num * sizeof(*di);
+
+ dl = kzalloc(size, GFP_KERNEL);
+ if (!dl)
+ return -ENOMEM;
+
+ di = dl->dev_info;
+
+ mutex_lock(&rfcomm_dev_lock);
+
+ list_for_each_entry(dev, &rfcomm_dev_list, list) {
+ if (!tty_port_get(&dev->port))
+ continue;
+ (di + n)->id = dev->id;
+ (di + n)->flags = dev->flags;
+ (di + n)->state = dev->dlc->state;
+ (di + n)->channel = dev->channel;
+ bacpy(&(di + n)->src, &dev->src);
+ bacpy(&(di + n)->dst, &dev->dst);
+ tty_port_put(&dev->port);
+ if (++n >= dev_num)
+ break;
+ }
+
+ mutex_unlock(&rfcomm_dev_lock);
+
+ dl->dev_num = n;
+ size = sizeof(*dl) + n * sizeof(*di);
+
+ err = copy_to_user(arg, dl, size);
+ kfree(dl);
+
+ return err ? -EFAULT : 0;
+}
+
+static int rfcomm_get_dev_info(void __user *arg)
+{
+ struct rfcomm_dev *dev;
+ struct rfcomm_dev_info di;
+ int err = 0;
+
+ BT_DBG("");
+
+ if (copy_from_user(&di, arg, sizeof(di)))
+ return -EFAULT;
+
+ dev = rfcomm_dev_get(di.id);
+ if (!dev)
+ return -ENODEV;
+
+ di.flags = dev->flags;
+ di.channel = dev->channel;
+ di.state = dev->dlc->state;
+ bacpy(&di.src, &dev->src);
+ bacpy(&di.dst, &dev->dst);
+
+ if (copy_to_user(arg, &di, sizeof(di)))
+ err = -EFAULT;
+
+ tty_port_put(&dev->port);
+ return err;
+}
+
+int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ BT_DBG("cmd %d arg %p", cmd, arg);
+
+ switch (cmd) {
+ case RFCOMMCREATEDEV:
+ return rfcomm_create_dev(sk, arg);
+
+ case RFCOMMRELEASEDEV:
+ return rfcomm_release_dev(arg);
+
+ case RFCOMMGETDEVLIST:
+ return rfcomm_get_dev_list(arg);
+
+ case RFCOMMGETDEVINFO:
+ return rfcomm_get_dev_info(arg);
+ }
+
+ return -EINVAL;
+}
+
+/* ---- DLC callbacks ---- */
+static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
+{
+ struct rfcomm_dev *dev = dlc->owner;
+
+ if (!dev) {
+ kfree_skb(skb);
+ return;
+ }
+
+ if (!skb_queue_empty(&dev->pending)) {
+ skb_queue_tail(&dev->pending, skb);
+ return;
+ }
+
+ BT_DBG("dlc %p len %d", dlc, skb->len);
+
+ tty_insert_flip_string(&dev->port, skb->data, skb->len);
+ tty_flip_buffer_push(&dev->port);
+
+ kfree_skb(skb);
+}
+
+static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
+{
+ struct rfcomm_dev *dev = dlc->owner;
+ if (!dev)
+ return;
+
+ BT_DBG("dlc %p dev %p err %d", dlc, dev, err);
+
+ dev->err = err;
+ if (dlc->state == BT_CONNECTED) {
+ rfcomm_reparent_device(dev);
+
+ wake_up_interruptible(&dev->port.open_wait);
+ } else if (dlc->state == BT_CLOSED)
+ tty_port_tty_hangup(&dev->port, false);
+}
+
+static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
+{
+ struct rfcomm_dev *dev = dlc->owner;
+ if (!dev)
+ return;
+
+ BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
+
+ if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV))
+ tty_port_tty_hangup(&dev->port, true);
+
+ dev->modem_status =
+ ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
+ ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
+ ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) |
+ ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0);
+}
+
+/* ---- TTY functions ---- */
+static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev)
+{
+ struct sk_buff *skb;
+ int inserted = 0;
+
+ BT_DBG("dev %p", dev);
+
+ rfcomm_dlc_lock(dev->dlc);
+
+ while ((skb = skb_dequeue(&dev->pending))) {
+ inserted += tty_insert_flip_string(&dev->port, skb->data,
+ skb->len);
+ kfree_skb(skb);
+ }
+
+ rfcomm_dlc_unlock(dev->dlc);
+
+ if (inserted > 0)
+ tty_flip_buffer_push(&dev->port);
+}
+
+/* do the reverse of install, clearing the tty fields and releasing the
+ * reference to tty_port
+ */
+static void rfcomm_tty_cleanup(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = tty->driver_data;
+
+ clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+
+ rfcomm_dlc_lock(dev->dlc);
+ tty->driver_data = NULL;
+ rfcomm_dlc_unlock(dev->dlc);
+
+ /*
+ * purge the dlc->tx_queue to avoid circular dependencies
+ * between dev and dlc
+ */
+ skb_queue_purge(&dev->dlc->tx_queue);
+
+ tty_port_put(&dev->port);
+}
+
+/* we acquire the tty_port reference since it's here the tty is first used
+ * by setting the termios. We also populate the driver_data field and install
+ * the tty port
+ */
+static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev;
+ struct rfcomm_dlc *dlc;
+ int err;
+
+ dev = rfcomm_dev_get(tty->index);
+ if (!dev)
+ return -ENODEV;
+
+ dlc = dev->dlc;
+
+ /* Attach TTY and open DLC */
+ rfcomm_dlc_lock(dlc);
+ tty->driver_data = dev;
+ rfcomm_dlc_unlock(dlc);
+ set_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+
+ /* install the tty_port */
+ err = tty_port_install(&dev->port, driver, tty);
+ if (err) {
+ rfcomm_tty_cleanup(tty);
+ return err;
+ }
+
+ /* take over the tty_port reference if the port was created with the
+ * flag RFCOMM_RELEASE_ONHUP. This will force the release of the port
+ * when the last process closes the tty. The behaviour is expected by
+ * userspace.
+ */
+ if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
+ set_bit(RFCOMM_TTY_OWNED, &dev->status);
+ tty_port_put(&dev->port);
+ }
+
+ return 0;
+}
+
+static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+ struct rfcomm_dev *dev = tty->driver_data;
+ int err;
+
+ BT_DBG("tty %p id %d", tty, tty->index);
+
+ BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst,
+ dev->channel, dev->port.count);
+
+ err = tty_port_open(&dev->port, tty, filp);
+ if (err)
+ return err;
+
+ /*
+ * FIXME: rfcomm should use proper flow control for
+ * received data. This hack will be unnecessary and can
+ * be removed when that's implemented
+ */
+ rfcomm_tty_copy_pending(dev);
+
+ rfcomm_dlc_unthrottle(dev->dlc);
+
+ return 0;
+}
+
+static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
+ dev->port.count);
+
+ tty_port_close(&dev->port, tty, filp);
+}
+
+static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ struct sk_buff *skb;
+ int sent = 0, size;
+
+ BT_DBG("tty %p count %d", tty, count);
+
+ while (count) {
+ size = min_t(uint, count, dlc->mtu);
+
+ skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC);
+ if (!skb)
+ break;
+
+ skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
+
+ memcpy(skb_put(skb, size), buf + sent, size);
+
+ rfcomm_dlc_send_noerror(dlc, skb);
+
+ sent += size;
+ count -= size;
+ }
+
+ return sent;
+}
+
+static int rfcomm_tty_write_room(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ int room = 0;
+
+ if (dev && dev->dlc)
+ room = rfcomm_room(dev);
+
+ BT_DBG("tty %p room %d", tty, room);
+
+ return room;
+}
+
+static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
+{
+ BT_DBG("tty %p cmd 0x%02x", tty, cmd);
+
+ switch (cmd) {
+ case TCGETS:
+ BT_DBG("TCGETS is not supported");
+ return -ENOIOCTLCMD;
+
+ case TCSETS:
+ BT_DBG("TCSETS is not supported");
+ return -ENOIOCTLCMD;
+
+ case TIOCMIWAIT:
+ BT_DBG("TIOCMIWAIT");
+ break;
+
+ case TIOCGSERIAL:
+ BT_ERR("TIOCGSERIAL is not supported");
+ return -ENOIOCTLCMD;
+
+ case TIOCSSERIAL:
+ BT_ERR("TIOCSSERIAL is not supported");
+ return -ENOIOCTLCMD;
+
+ case TIOCSERGSTRUCT:
+ BT_ERR("TIOCSERGSTRUCT is not supported");
+ return -ENOIOCTLCMD;
+
+ case TIOCSERGETLSR:
+ BT_ERR("TIOCSERGETLSR is not supported");
+ return -ENOIOCTLCMD;
+
+ case TIOCSERCONFIG:
+ BT_ERR("TIOCSERCONFIG is not supported");
+ return -ENOIOCTLCMD;
+
+ default:
+ return -ENOIOCTLCMD; /* ioctls which we must ignore */
+
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
+{
+ struct ktermios *new = &tty->termios;
+ int old_baud_rate = tty_termios_baud_rate(old);
+ int new_baud_rate = tty_termios_baud_rate(new);
+
+ u8 baud, data_bits, stop_bits, parity, x_on, x_off;
+ u16 changes = 0;
+
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p termios %p", tty, old);
+
+ if (!dev || !dev->dlc || !dev->dlc->session)
+ return;
+
+ /* Handle turning off CRTSCTS */
+ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
+ BT_DBG("Turning off CRTSCTS unsupported");
+
+ /* Parity on/off and when on, odd/even */
+ if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
+ ((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) {
+ changes |= RFCOMM_RPN_PM_PARITY;
+ BT_DBG("Parity change detected.");
+ }
+
+ /* Mark and space parity are not supported! */
+ if (new->c_cflag & PARENB) {
+ if (new->c_cflag & PARODD) {
+ BT_DBG("Parity is ODD");
+ parity = RFCOMM_RPN_PARITY_ODD;
+ } else {
+ BT_DBG("Parity is EVEN");
+ parity = RFCOMM_RPN_PARITY_EVEN;
+ }
+ } else {
+ BT_DBG("Parity is OFF");
+ parity = RFCOMM_RPN_PARITY_NONE;
+ }
+
+ /* Setting the x_on / x_off characters */
+ if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
+ BT_DBG("XOFF custom");
+ x_on = new->c_cc[VSTOP];
+ changes |= RFCOMM_RPN_PM_XON;
+ } else {
+ BT_DBG("XOFF default");
+ x_on = RFCOMM_RPN_XON_CHAR;
+ }
+
+ if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
+ BT_DBG("XON custom");
+ x_off = new->c_cc[VSTART];
+ changes |= RFCOMM_RPN_PM_XOFF;
+ } else {
+ BT_DBG("XON default");
+ x_off = RFCOMM_RPN_XOFF_CHAR;
+ }
+
+ /* Handle setting of stop bits */
+ if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
+ changes |= RFCOMM_RPN_PM_STOP;
+
+ /* POSIX does not support 1.5 stop bits and RFCOMM does not
+ * support 2 stop bits. So a request for 2 stop bits gets
+ * translated to 1.5 stop bits */
+ if (new->c_cflag & CSTOPB)
+ stop_bits = RFCOMM_RPN_STOP_15;
+ else
+ stop_bits = RFCOMM_RPN_STOP_1;
+
+ /* Handle number of data bits [5-8] */
+ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
+ changes |= RFCOMM_RPN_PM_DATA;
+
+ switch (new->c_cflag & CSIZE) {
+ case CS5:
+ data_bits = RFCOMM_RPN_DATA_5;
+ break;
+ case CS6:
+ data_bits = RFCOMM_RPN_DATA_6;
+ break;
+ case CS7:
+ data_bits = RFCOMM_RPN_DATA_7;
+ break;
+ case CS8:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ default:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ }
+
+ /* Handle baudrate settings */
+ if (old_baud_rate != new_baud_rate)
+ changes |= RFCOMM_RPN_PM_BITRATE;
+
+ switch (new_baud_rate) {
+ case 2400:
+ baud = RFCOMM_RPN_BR_2400;
+ break;
+ case 4800:
+ baud = RFCOMM_RPN_BR_4800;
+ break;
+ case 7200:
+ baud = RFCOMM_RPN_BR_7200;
+ break;
+ case 9600:
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+ case 19200:
+ baud = RFCOMM_RPN_BR_19200;
+ break;
+ case 38400:
+ baud = RFCOMM_RPN_BR_38400;
+ break;
+ case 57600:
+ baud = RFCOMM_RPN_BR_57600;
+ break;
+ case 115200:
+ baud = RFCOMM_RPN_BR_115200;
+ break;
+ case 230400:
+ baud = RFCOMM_RPN_BR_230400;
+ break;
+ default:
+ /* 9600 is standard accordinag to the RFCOMM specification */
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+
+ }
+
+ if (changes)
+ rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
+ data_bits, stop_bits, parity,
+ RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
+}
+
+static void rfcomm_tty_throttle(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ rfcomm_dlc_throttle(dev->dlc);
+}
+
+static void rfcomm_tty_unthrottle(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ rfcomm_dlc_unthrottle(dev->dlc);
+}
+
+static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ if (!dev || !dev->dlc)
+ return 0;
+
+ if (!skb_queue_empty(&dev->dlc->tx_queue))
+ return dev->dlc->mtu;
+
+ return 0;
+}
+
+static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ if (!dev || !dev->dlc)
+ return;
+
+ skb_queue_purge(&dev->dlc->tx_queue);
+ tty_wakeup(tty);
+}
+
+static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch)
+{
+ BT_DBG("tty %p ch %c", tty, ch);
+}
+
+static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+ BT_DBG("tty %p timeout %d", tty, timeout);
+}
+
+static void rfcomm_tty_hangup(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ tty_port_hangup(&dev->port);
+}
+
+static int rfcomm_tty_tiocmget(struct tty_struct *tty)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p dev %p", tty, dev);
+
+ return dev->modem_status;
+}
+
+static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
+{
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ u8 v24_sig;
+
+ BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
+
+ rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+ if (set & TIOCM_DSR || set & TIOCM_DTR)
+ v24_sig |= RFCOMM_V24_RTC;
+ if (set & TIOCM_RTS || set & TIOCM_CTS)
+ v24_sig |= RFCOMM_V24_RTR;
+ if (set & TIOCM_RI)
+ v24_sig |= RFCOMM_V24_IC;
+ if (set & TIOCM_CD)
+ v24_sig |= RFCOMM_V24_DV;
+
+ if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+ v24_sig &= ~RFCOMM_V24_RTC;
+ if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+ v24_sig &= ~RFCOMM_V24_RTR;
+ if (clear & TIOCM_RI)
+ v24_sig &= ~RFCOMM_V24_IC;
+ if (clear & TIOCM_CD)
+ v24_sig &= ~RFCOMM_V24_DV;
+
+ rfcomm_dlc_set_modem_status(dlc, v24_sig);
+
+ return 0;
+}
+
+/* ---- TTY structure ---- */
+
+static const struct tty_operations rfcomm_ops = {
+ .open = rfcomm_tty_open,
+ .close = rfcomm_tty_close,
+ .write = rfcomm_tty_write,
+ .write_room = rfcomm_tty_write_room,
+ .chars_in_buffer = rfcomm_tty_chars_in_buffer,
+ .flush_buffer = rfcomm_tty_flush_buffer,
+ .ioctl = rfcomm_tty_ioctl,
+ .throttle = rfcomm_tty_throttle,
+ .unthrottle = rfcomm_tty_unthrottle,
+ .set_termios = rfcomm_tty_set_termios,
+ .send_xchar = rfcomm_tty_send_xchar,
+ .hangup = rfcomm_tty_hangup,
+ .wait_until_sent = rfcomm_tty_wait_until_sent,
+ .tiocmget = rfcomm_tty_tiocmget,
+ .tiocmset = rfcomm_tty_tiocmset,
+ .install = rfcomm_tty_install,
+ .cleanup = rfcomm_tty_cleanup,
+};
+
+int __init rfcomm_init_ttys(void)
+{
+ int error;
+
+ rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS);
+ if (!rfcomm_tty_driver)
+ return -ENOMEM;
+
+ rfcomm_tty_driver->driver_name = "rfcomm";
+ rfcomm_tty_driver->name = "rfcomm";
+ rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR;
+ rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR;
+ rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
+ rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL;
+ rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
+ rfcomm_tty_driver->init_termios = tty_std_termios;
+ rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL;
+ rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON;
+ tty_set_operations(rfcomm_tty_driver, &rfcomm_ops);
+
+ error = tty_register_driver(rfcomm_tty_driver);
+ if (error) {
+ BT_ERR("Can't register RFCOMM TTY driver");
+ put_tty_driver(rfcomm_tty_driver);
+ return error;
+ }
+
+ BT_INFO("RFCOMM TTY layer initialized");
+
+ return 0;
+}
+
+void rfcomm_cleanup_ttys(void)
+{
+ tty_unregister_driver(rfcomm_tty_driver);
+ put_tty_driver(rfcomm_tty_driver);
+}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
new file mode 100644
index 000000000..4322c833e
--- /dev/null
+++ b/net/bluetooth/sco.c
@@ -0,0 +1,1248 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+
+ Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth SCO sockets. */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/sco.h>
+
+static bool disable_esco;
+
+static const struct proto_ops sco_sock_ops;
+
+static struct bt_sock_list sco_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock)
+};
+
+/* ---- SCO connections ---- */
+struct sco_conn {
+ struct hci_conn *hcon;
+
+ spinlock_t lock;
+ struct sock *sk;
+
+ unsigned int mtu;
+};
+
+#define sco_conn_lock(c) spin_lock(&c->lock);
+#define sco_conn_unlock(c) spin_unlock(&c->lock);
+
+static void sco_sock_close(struct sock *sk);
+static void sco_sock_kill(struct sock *sk);
+
+/* ----- SCO socket info ----- */
+#define sco_pi(sk) ((struct sco_pinfo *) sk)
+
+struct sco_pinfo {
+ struct bt_sock bt;
+ bdaddr_t src;
+ bdaddr_t dst;
+ __u32 flags;
+ __u16 setting;
+ struct sco_conn *conn;
+};
+
+/* ---- SCO timers ---- */
+#define SCO_CONN_TIMEOUT (HZ * 40)
+#define SCO_DISCONN_TIMEOUT (HZ * 2)
+
+static void sco_sock_timeout(unsigned long arg)
+{
+ struct sock *sk = (struct sock *) arg;
+
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+
+ bh_lock_sock(sk);
+ sk->sk_err = ETIMEDOUT;
+ sk->sk_state_change(sk);
+ bh_unlock_sock(sk);
+
+ sco_sock_kill(sk);
+ sock_put(sk);
+}
+
+static void sco_sock_set_timer(struct sock *sk, long timeout)
+{
+ BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
+}
+
+static void sco_sock_clear_timer(struct sock *sk)
+{
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+/* ---- SCO connections ---- */
+static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
+{
+ struct hci_dev *hdev = hcon->hdev;
+ struct sco_conn *conn = hcon->sco_data;
+
+ if (conn)
+ return conn;
+
+ conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL);
+ if (!conn)
+ return NULL;
+
+ spin_lock_init(&conn->lock);
+
+ hcon->sco_data = conn;
+ conn->hcon = hcon;
+
+ if (hdev->sco_mtu > 0)
+ conn->mtu = hdev->sco_mtu;
+ else
+ conn->mtu = 60;
+
+ BT_DBG("hcon %p conn %p", hcon, conn);
+
+ return conn;
+}
+
+/* Delete channel.
+ * Must be called on the locked socket. */
+static void sco_chan_del(struct sock *sk, int err)
+{
+ struct sco_conn *conn;
+
+ conn = sco_pi(sk)->conn;
+
+ BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+ if (conn) {
+ sco_conn_lock(conn);
+ conn->sk = NULL;
+ sco_pi(sk)->conn = NULL;
+ sco_conn_unlock(conn);
+
+ if (conn->hcon)
+ hci_conn_drop(conn->hcon);
+ }
+
+ sk->sk_state = BT_CLOSED;
+ sk->sk_err = err;
+ sk->sk_state_change(sk);
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+}
+
+static int sco_conn_del(struct hci_conn *hcon, int err)
+{
+ struct sco_conn *conn = hcon->sco_data;
+ struct sock *sk;
+
+ if (!conn)
+ return 0;
+
+ BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+ /* Kill socket */
+ sco_conn_lock(conn);
+ sk = conn->sk;
+ sco_conn_unlock(conn);
+
+ if (sk) {
+ bh_lock_sock(sk);
+ sco_sock_clear_timer(sk);
+ sco_chan_del(sk, err);
+ bh_unlock_sock(sk);
+ sco_sock_kill(sk);
+ }
+
+ hcon->sco_data = NULL;
+ kfree(conn);
+ return 0;
+}
+
+static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+{
+ BT_DBG("conn %p", conn);
+
+ sco_pi(sk)->conn = conn;
+ conn->sk = sk;
+
+ if (parent)
+ bt_accept_enqueue(parent, sk);
+}
+
+static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ int err = 0;
+
+ sco_conn_lock(conn);
+ if (conn->sk)
+ err = -EBUSY;
+ else
+ __sco_chan_add(conn, sk, parent);
+
+ sco_conn_unlock(conn);
+ return err;
+}
+
+static int sco_connect(struct sock *sk)
+{
+ struct sco_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err, type;
+
+ BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst);
+
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (lmp_esco_capable(hdev) && !disable_esco)
+ type = ESCO_LINK;
+ else
+ type = SCO_LINK;
+
+ if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT &&
+ (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+
+ hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
+ sco_pi(sk)->setting);
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto done;
+ }
+
+ conn = sco_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto done;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&sco_pi(sk)->src, &hcon->src);
+
+ err = sco_chan_add(conn, sk, NULL);
+ if (err)
+ goto done;
+
+ if (hcon->state == BT_CONNECTED) {
+ sco_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ } else {
+ sk->sk_state = BT_CONNECT;
+ sco_sock_set_timer(sk, sk->sk_sndtimeo);
+ }
+
+done:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+
+static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sco_conn *conn = sco_pi(sk)->conn;
+ struct sk_buff *skb;
+ int err;
+
+ /* Check outgoing MTU */
+ if (len > conn->mtu)
+ return -EINVAL;
+
+ BT_DBG("sk %p len %d", sk, len);
+
+ skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ hci_send_sco(conn->hcon, skb);
+
+ return len;
+}
+
+static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
+{
+ struct sock *sk;
+
+ sco_conn_lock(conn);
+ sk = conn->sk;
+ sco_conn_unlock(conn);
+
+ if (!sk)
+ goto drop;
+
+ BT_DBG("sk %p len %d", sk, skb->len);
+
+ if (sk->sk_state != BT_CONNECTED)
+ goto drop;
+
+ if (!sock_queue_rcv_skb(sk, skb))
+ return;
+
+drop:
+ kfree_skb(skb);
+}
+
+/* -------- Socket interface ---------- */
+static struct sock *__sco_get_sock_listen_by_addr(bdaddr_t *ba)
+{
+ struct sock *sk;
+
+ sk_for_each(sk, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&sco_pi(sk)->src, ba))
+ return sk;
+ }
+
+ return NULL;
+}
+
+/* Find socket listening on source bdaddr.
+ * Returns closest match.
+ */
+static struct sock *sco_get_sock_listen(bdaddr_t *src)
+{
+ struct sock *sk = NULL, *sk1 = NULL;
+
+ read_lock(&sco_sk_list.lock);
+
+ sk_for_each(sk, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ /* Exact match. */
+ if (!bacmp(&sco_pi(sk)->src, src))
+ break;
+
+ /* Closest match */
+ if (!bacmp(&sco_pi(sk)->src, BDADDR_ANY))
+ sk1 = sk;
+ }
+
+ read_unlock(&sco_sk_list.lock);
+
+ return sk ? sk : sk1;
+}
+
+static void sco_sock_destruct(struct sock *sk)
+{
+ BT_DBG("sk %p", sk);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+}
+
+static void sco_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p", parent);
+
+ /* Close not yet accepted channels */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ sco_sock_close(sk);
+ sco_sock_kill(sk);
+ }
+
+ parent->sk_state = BT_CLOSED;
+ sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void sco_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ return;
+
+ BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+ /* Kill poor orphan */
+ bt_sock_unlink(&sco_sk_list, sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+static void __sco_sock_close(struct sock *sk)
+{
+ BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+ switch (sk->sk_state) {
+ case BT_LISTEN:
+ sco_sock_cleanup_listen(sk);
+ break;
+
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (sco_pi(sk)->conn->hcon) {
+ sk->sk_state = BT_DISCONN;
+ sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
+ hci_conn_drop(sco_pi(sk)->conn->hcon);
+ sco_pi(sk)->conn->hcon = NULL;
+ } else
+ sco_chan_del(sk, ECONNRESET);
+ break;
+
+ case BT_CONNECT2:
+ case BT_CONNECT:
+ case BT_DISCONN:
+ sco_chan_del(sk, ECONNRESET);
+ break;
+
+ default:
+ sock_set_flag(sk, SOCK_ZAPPED);
+ break;
+ }
+}
+
+/* Must be called on unlocked socket. */
+static void sco_sock_close(struct sock *sk)
+{
+ sco_sock_clear_timer(sk);
+ lock_sock(sk);
+ __sco_sock_close(sk);
+ release_sock(sk);
+ sco_sock_kill(sk);
+}
+
+static void sco_sock_init(struct sock *sk, struct sock *parent)
+{
+ BT_DBG("sk %p", sk);
+
+ if (parent) {
+ sk->sk_type = parent->sk_type;
+ bt_sk(sk)->flags = bt_sk(parent)->flags;
+ security_sk_clone(parent, sk);
+ }
+}
+
+static struct proto sco_proto = {
+ .name = "SCO",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sco_pinfo)
+};
+
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+ sk->sk_destruct = sco_sock_destruct;
+ sk->sk_sndtimeo = SCO_CONN_TIMEOUT;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = BT_OPEN;
+
+ sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
+
+ setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk);
+
+ bt_sock_link(&sco_sk_list, sk);
+ return sk;
+}
+
+static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_SEQPACKET)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &sco_sock_ops;
+
+ sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+ if (!sk)
+ return -ENOMEM;
+
+ sco_sock_init(sk, NULL);
+ return 0;
+}
+
+static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+ struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
+
+ if (!addr || addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ bacpy(&sco_pi(sk)->src, &sa->sco_bdaddr);
+
+ sk->sk_state = BT_BOUND;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+{
+ struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("sk %p", sk);
+
+ if (alen < sizeof(struct sockaddr_sco) ||
+ addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND)
+ return -EBADFD;
+
+ if (sk->sk_type != SOCK_SEQPACKET)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ /* Set destination address and psm */
+ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr);
+
+ err = sco_connect(sk);
+ if (err)
+ goto done;
+
+ err = bt_sock_wait_state(sk, BT_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ bdaddr_t *src = &sco_pi(sk)->src;
+ int err = 0;
+
+ BT_DBG("sk %p backlog %d", sk, backlog);
+
+ lock_sock(sk);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ write_lock(&sco_sk_list.lock);
+
+ if (__sco_get_sock_listen_by_addr(src)) {
+ err = -EADDRINUSE;
+ goto unlock;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+
+ sk->sk_state = BT_LISTEN;
+
+unlock:
+ write_unlock(&sco_sk_list.lock);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = sock->sk, *ch;
+ long timeo;
+ int err = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ BT_DBG("sk %p timeo %ld", sk, timeo);
+
+ /* Wait for an incoming connection. (wake-one). */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (1) {
+ if (sk->sk_state != BT_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ ch = bt_accept_dequeue(sk, newsock);
+ if (ch)
+ break;
+
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+
+ timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ lock_sock(sk);
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+ BT_DBG("new socket %p", ch);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+{
+ struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
+ struct sock *sk = sock->sk;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ addr->sa_family = AF_BLUETOOTH;
+ *len = sizeof(struct sockaddr_sco);
+
+ if (peer)
+ bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst);
+ else
+ bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src);
+
+ return 0;
+}
+
+static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECTED)
+ err = sco_send_frame(sk, msg, len);
+ else
+ err = -ENOTCONN;
+
+ release_sock(sk);
+ return err;
+}
+
+static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("conn %p", conn);
+
+ conn->state = BT_CONFIG;
+
+ if (!lmp_esco_capable(hdev)) {
+ struct hci_cp_accept_conn_req cp;
+
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.role = 0x00; /* Ignored */
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+ } else {
+ struct hci_cp_accept_sync_conn_req cp;
+
+ bacpy(&cp.bdaddr, &conn->dst);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.content_format = cpu_to_le16(setting);
+
+ switch (setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (conn->pkt_type & ESCO_2EV3)
+ cp.max_latency = cpu_to_le16(0x0008);
+ else
+ cp.max_latency = cpu_to_le16(0x000D);
+ cp.retrans_effort = 0x02;
+ break;
+ case SCO_AIRMODE_CVSD:
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.retrans_effort = 0xff;
+ break;
+ }
+
+ hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ,
+ sizeof(cp), &cp);
+ }
+}
+
+static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sco_pinfo *pi = sco_pi(sk);
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ sco_conn_defer_accept(pi->conn->hcon, pi->setting);
+ sk->sk_state = BT_CONFIG;
+
+ release_sock(sk);
+ return 0;
+ }
+
+ release_sock(sk);
+
+ return bt_sock_recvmsg(sock, msg, len, flags);
+}
+
+static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int len, err = 0;
+ struct bt_voice voice;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt)
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ break;
+
+ case BT_VOICE:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2) {
+ err = -EINVAL;
+ break;
+ }
+
+ voice.setting = sco_pi(sk)->setting;
+
+ len = min_t(unsigned int, sizeof(voice), optlen);
+ if (copy_from_user((char *) &voice, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ /* Explicitly check for these values */
+ if (voice.setting != BT_VOICE_TRANSPARENT &&
+ voice.setting != BT_VOICE_CVSD_16BIT) {
+ err = -EINVAL;
+ break;
+ }
+
+ sco_pi(sk)->setting = voice.setting;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct sco_options opts;
+ struct sco_conninfo cinfo;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SCO_OPTIONS:
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ opts.mtu = sco_pi(sk)->conn->mtu;
+
+ BT_DBG("mtu %d", opts.mtu);
+
+ len = min_t(unsigned int, len, sizeof(opts));
+ if (copy_to_user(optval, (char *)&opts, len))
+ err = -EFAULT;
+
+ break;
+
+ case SCO_CONNINFO:
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle;
+ memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3);
+
+ len = min_t(unsigned int, len, sizeof(cinfo));
+ if (copy_to_user(optval, (char *)&cinfo, len))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int len, err = 0;
+ struct bt_voice voice;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_SCO)
+ return sco_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+ (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case BT_VOICE:
+ voice.setting = sco_pi(sk)->setting;
+
+ len = min_t(unsigned int, len, sizeof(voice));
+ if (copy_to_user(optval, (char *)&voice, len))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ if (!sk->sk_shutdown) {
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sco_sock_clear_timer(sk);
+ __sco_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING))
+ err = bt_sock_wait_state(sk, BT_CLOSED,
+ sk->sk_lingertime);
+ }
+ release_sock(sk);
+ return err;
+}
+
+static int sco_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ BT_DBG("sock %p, sk %p", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ sco_sock_close(sk);
+
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ !(current->flags & PF_EXITING)) {
+ lock_sock(sk);
+ err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+ release_sock(sk);
+ }
+
+ sock_orphan(sk);
+ sco_sock_kill(sk);
+ return err;
+}
+
+static void sco_conn_ready(struct sco_conn *conn)
+{
+ struct sock *parent;
+ struct sock *sk = conn->sk;
+
+ BT_DBG("conn %p", conn);
+
+ if (sk) {
+ sco_sock_clear_timer(sk);
+ bh_lock_sock(sk);
+ sk->sk_state = BT_CONNECTED;
+ sk->sk_state_change(sk);
+ bh_unlock_sock(sk);
+ } else {
+ sco_conn_lock(conn);
+
+ parent = sco_get_sock_listen(&conn->hcon->src);
+ if (!parent) {
+ sco_conn_unlock(conn);
+ return;
+ }
+
+ bh_lock_sock(parent);
+
+ sk = sco_sock_alloc(sock_net(parent), NULL,
+ BTPROTO_SCO, GFP_ATOMIC);
+ if (!sk) {
+ bh_unlock_sock(parent);
+ sco_conn_unlock(conn);
+ return;
+ }
+
+ sco_sock_init(sk, parent);
+
+ bacpy(&sco_pi(sk)->src, &conn->hcon->src);
+ bacpy(&sco_pi(sk)->dst, &conn->hcon->dst);
+
+ hci_conn_hold(conn->hcon);
+ __sco_chan_add(conn, sk, parent);
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+ sk->sk_state = BT_CONNECT2;
+ else
+ sk->sk_state = BT_CONNECTED;
+
+ /* Wake up parent */
+ parent->sk_data_ready(parent);
+
+ bh_unlock_sock(parent);
+
+ sco_conn_unlock(conn);
+ }
+}
+
+/* ----- SCO interface with lower layer (HCI) ----- */
+int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
+{
+ struct sock *sk;
+ int lm = 0;
+
+ BT_DBG("hdev %s, bdaddr %pMR", hdev->name, bdaddr);
+
+ /* Find listening sockets */
+ read_lock(&sco_sk_list.lock);
+ sk_for_each(sk, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&sco_pi(sk)->src, &hdev->bdaddr) ||
+ !bacmp(&sco_pi(sk)->src, BDADDR_ANY)) {
+ lm |= HCI_LM_ACCEPT;
+
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+ *flags |= HCI_PROTO_DEFER;
+ break;
+ }
+ }
+ read_unlock(&sco_sk_list.lock);
+
+ return lm;
+}
+
+static void sco_connect_cfm(struct hci_conn *hcon, __u8 status)
+{
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
+
+ BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+ if (!status) {
+ struct sco_conn *conn;
+
+ conn = sco_conn_add(hcon);
+ if (conn)
+ sco_conn_ready(conn);
+ } else
+ sco_conn_del(hcon, bt_to_errno(status));
+}
+
+static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
+{
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
+
+ BT_DBG("hcon %p reason %d", hcon, reason);
+
+ sco_conn_del(hcon, bt_to_errno(reason));
+}
+
+int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+{
+ struct sco_conn *conn = hcon->sco_data;
+
+ if (!conn)
+ goto drop;
+
+ BT_DBG("conn %p len %d", conn, skb->len);
+
+ if (skb->len) {
+ sco_recv_frame(conn, skb);
+ return 0;
+ }
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct hci_cb sco_cb = {
+ .name = "SCO",
+ .connect_cfm = sco_connect_cfm,
+ .disconn_cfm = sco_disconn_cfm,
+};
+
+static int sco_debugfs_show(struct seq_file *f, void *p)
+{
+ struct sock *sk;
+
+ read_lock(&sco_sk_list.lock);
+
+ sk_for_each(sk, &sco_sk_list.head) {
+ seq_printf(f, "%pMR %pMR %d\n", &sco_pi(sk)->src,
+ &sco_pi(sk)->dst, sk->sk_state);
+ }
+
+ read_unlock(&sco_sk_list.lock);
+
+ return 0;
+}
+
+static int sco_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sco_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations sco_debugfs_fops = {
+ .open = sco_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *sco_debugfs;
+
+static const struct proto_ops sco_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = sco_sock_release,
+ .bind = sco_sock_bind,
+ .connect = sco_sock_connect,
+ .listen = sco_sock_listen,
+ .accept = sco_sock_accept,
+ .getname = sco_sock_getname,
+ .sendmsg = sco_sock_sendmsg,
+ .recvmsg = sco_sock_recvmsg,
+ .poll = bt_sock_poll,
+ .ioctl = bt_sock_ioctl,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = sco_sock_shutdown,
+ .setsockopt = sco_sock_setsockopt,
+ .getsockopt = sco_sock_getsockopt
+};
+
+static const struct net_proto_family sco_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = sco_sock_create,
+};
+
+int __init sco_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct sockaddr_sco) > sizeof(struct sockaddr));
+
+ err = proto_register(&sco_proto, 0);
+ if (err < 0)
+ return err;
+
+ err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops);
+ if (err < 0) {
+ BT_ERR("SCO socket registration failed");
+ goto error;
+ }
+
+ err = bt_procfs_init(&init_net, "sco", &sco_sk_list, NULL);
+ if (err < 0) {
+ BT_ERR("Failed to create SCO proc file");
+ bt_sock_unregister(BTPROTO_SCO);
+ goto error;
+ }
+
+ BT_INFO("SCO socket layer initialized");
+
+ hci_register_cb(&sco_cb);
+
+ if (IS_ERR_OR_NULL(bt_debugfs))
+ return 0;
+
+ sco_debugfs = debugfs_create_file("sco", 0444, bt_debugfs,
+ NULL, &sco_debugfs_fops);
+
+ return 0;
+
+error:
+ proto_unregister(&sco_proto);
+ return err;
+}
+
+void sco_exit(void)
+{
+ bt_procfs_cleanup(&init_net, "sco");
+
+ debugfs_remove(sco_debugfs);
+
+ hci_unregister_cb(&sco_cb);
+
+ bt_sock_unregister(BTPROTO_SCO);
+
+ proto_unregister(&sco_proto);
+}
+
+module_param(disable_esco, bool, 0644);
+MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation");
diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c
new file mode 100644
index 000000000..dc688f13e
--- /dev/null
+++ b/net/bluetooth/selftest.c
@@ -0,0 +1,271 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "ecc.h"
+#include "smp.h"
+#include "selftest.h"
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_ECDH)
+
+static const u8 priv_a_1[32] __initconst = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+static const u8 priv_b_1[32] __initconst = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55,
+};
+static const u8 pub_a_1[64] __initconst = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+static const u8 pub_b_1[64] __initconst = {
+ 0x90, 0xa1, 0xaa, 0x2f, 0xb2, 0x77, 0x90, 0x55,
+ 0x9f, 0xa6, 0x15, 0x86, 0xfd, 0x8a, 0xb5, 0x47,
+ 0x00, 0x4c, 0x9e, 0xf1, 0x84, 0x22, 0x59, 0x09,
+ 0x96, 0x1d, 0xaf, 0x1f, 0xf0, 0xf0, 0xa1, 0x1e,
+
+ 0x4a, 0x21, 0xb1, 0x15, 0xf9, 0xaf, 0x89, 0x5f,
+ 0x76, 0x36, 0x8e, 0xe2, 0x30, 0x11, 0x2d, 0x47,
+ 0x60, 0x51, 0xb8, 0x9a, 0x3a, 0x70, 0x56, 0x73,
+ 0x37, 0xad, 0x9d, 0x42, 0x3e, 0xf3, 0x55, 0x4c,
+};
+static const u8 dhkey_1[32] __initconst = {
+ 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
+ 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99,
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec,
+};
+
+static const u8 priv_a_2[32] __initconst = {
+ 0x63, 0x76, 0x45, 0xd0, 0xf7, 0x73, 0xac, 0xb7,
+ 0xff, 0xdd, 0x03, 0x72, 0xb9, 0x72, 0x85, 0xb4,
+ 0x41, 0xb6, 0x5d, 0x0c, 0x5d, 0x54, 0x84, 0x60,
+ 0x1a, 0xa3, 0x9a, 0x3c, 0x69, 0x16, 0xa5, 0x06,
+};
+static const u8 priv_b_2[32] __initconst = {
+ 0xba, 0x30, 0x55, 0x50, 0x19, 0xa2, 0xca, 0xa3,
+ 0xa5, 0x29, 0x08, 0xc6, 0xb5, 0x03, 0x88, 0x7e,
+ 0x03, 0x2b, 0x50, 0x73, 0xd4, 0x2e, 0x50, 0x97,
+ 0x64, 0xcd, 0x72, 0x0d, 0x67, 0xa0, 0x9a, 0x52,
+};
+static const u8 pub_a_2[64] __initconst = {
+ 0xdd, 0x78, 0x5c, 0x74, 0x03, 0x9b, 0x7e, 0x98,
+ 0xcb, 0x94, 0x87, 0x4a, 0xad, 0xfa, 0xf8, 0xd5,
+ 0x43, 0x3e, 0x5c, 0xaf, 0xea, 0xb5, 0x4c, 0xf4,
+ 0x9e, 0x80, 0x79, 0x57, 0x7b, 0xa4, 0x31, 0x2c,
+
+ 0x4f, 0x5d, 0x71, 0x43, 0x77, 0x43, 0xf8, 0xea,
+ 0xd4, 0x3e, 0xbd, 0x17, 0x91, 0x10, 0x21, 0xd0,
+ 0x1f, 0x87, 0x43, 0x8e, 0x40, 0xe2, 0x52, 0xcd,
+ 0xbe, 0xdf, 0x98, 0x38, 0x18, 0x12, 0x95, 0x91,
+};
+static const u8 pub_b_2[64] __initconst = {
+ 0xcc, 0x00, 0x65, 0xe1, 0xf5, 0x6c, 0x0d, 0xcf,
+ 0xec, 0x96, 0x47, 0x20, 0x66, 0xc9, 0xdb, 0x84,
+ 0x81, 0x75, 0xa8, 0x4d, 0xc0, 0xdf, 0xc7, 0x9d,
+ 0x1b, 0x3f, 0x3d, 0xf2, 0x3f, 0xe4, 0x65, 0xf4,
+
+ 0x79, 0xb2, 0xec, 0xd8, 0xca, 0x55, 0xa1, 0xa8,
+ 0x43, 0x4d, 0x6b, 0xca, 0x10, 0xb0, 0xc2, 0x01,
+ 0xc2, 0x33, 0x4e, 0x16, 0x24, 0xc4, 0xef, 0xee,
+ 0x99, 0xd8, 0xbb, 0xbc, 0x48, 0xd0, 0x01, 0x02,
+};
+static const u8 dhkey_2[32] __initconst = {
+ 0x69, 0xeb, 0x21, 0x32, 0xf2, 0xc6, 0x05, 0x41,
+ 0x60, 0x19, 0xcd, 0x5e, 0x94, 0xe1, 0xe6, 0x5f,
+ 0x33, 0x07, 0xe3, 0x38, 0x4b, 0x68, 0xe5, 0x62,
+ 0x3f, 0x88, 0x6d, 0x2f, 0x3a, 0x84, 0x85, 0xab,
+};
+
+static const u8 priv_a_3[32] __initconst = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+static const u8 pub_a_3[64] __initconst = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+static const u8 dhkey_3[32] __initconst = {
+ 0x2d, 0xab, 0x00, 0x48, 0xcb, 0xb3, 0x7b, 0xda,
+ 0x55, 0x7b, 0x8b, 0x72, 0xa8, 0x57, 0x87, 0xc3,
+ 0x87, 0x27, 0x99, 0x32, 0xfc, 0x79, 0x5f, 0xae,
+ 0x7c, 0x1c, 0xf9, 0x49, 0xe6, 0xd7, 0xaa, 0x70,
+};
+
+static int __init test_ecdh_sample(const u8 priv_a[32], const u8 priv_b[32],
+ const u8 pub_a[64], const u8 pub_b[64],
+ const u8 dhkey[32])
+{
+ u8 dhkey_a[32], dhkey_b[32];
+
+ ecdh_shared_secret(pub_b, priv_a, dhkey_a);
+ ecdh_shared_secret(pub_a, priv_b, dhkey_b);
+
+ if (memcmp(dhkey_a, dhkey, 32))
+ return -EINVAL;
+
+ if (memcmp(dhkey_b, dhkey, 32))
+ return -EINVAL;
+
+ return 0;
+}
+
+static char test_ecdh_buffer[32];
+
+static ssize_t test_ecdh_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(user_buf, count, ppos, test_ecdh_buffer,
+ strlen(test_ecdh_buffer));
+}
+
+static const struct file_operations test_ecdh_fops = {
+ .open = simple_open,
+ .read = test_ecdh_read,
+ .llseek = default_llseek,
+};
+
+static int __init test_ecdh(void)
+{
+ ktime_t calltime, delta, rettime;
+ unsigned long long duration;
+ int err;
+
+ calltime = ktime_get();
+
+ err = test_ecdh_sample(priv_a_1, priv_b_1, pub_a_1, pub_b_1, dhkey_1);
+ if (err) {
+ BT_ERR("ECDH sample 1 failed");
+ goto done;
+ }
+
+ err = test_ecdh_sample(priv_a_2, priv_b_2, pub_a_2, pub_b_2, dhkey_2);
+ if (err) {
+ BT_ERR("ECDH sample 2 failed");
+ goto done;
+ }
+
+ err = test_ecdh_sample(priv_a_3, priv_a_3, pub_a_3, pub_a_3, dhkey_3);
+ if (err) {
+ BT_ERR("ECDH sample 3 failed");
+ goto done;
+ }
+
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+
+ BT_INFO("ECDH test passed in %llu usecs", duration);
+
+done:
+ if (!err)
+ snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer),
+ "PASS (%llu usecs)\n", duration);
+ else
+ snprintf(test_ecdh_buffer, sizeof(test_ecdh_buffer), "FAIL\n");
+
+ debugfs_create_file("selftest_ecdh", 0444, bt_debugfs, NULL,
+ &test_ecdh_fops);
+
+ return err;
+}
+
+#else
+
+static inline int test_ecdh(void)
+{
+ return 0;
+}
+
+#endif
+
+static int __init run_selftest(void)
+{
+ int err;
+
+ BT_INFO("Starting self testing");
+
+ err = test_ecdh();
+ if (err)
+ goto done;
+
+ err = bt_selftest_smp();
+
+done:
+ BT_INFO("Finished self testing");
+
+ return err;
+}
+
+#if IS_MODULE(CONFIG_BT)
+
+/* This is run when CONFIG_BT_SELFTEST=y and CONFIG_BT=m and is just a
+ * wrapper to allow running this at module init.
+ *
+ * If CONFIG_BT_SELFTEST=n, then this code is not compiled at all.
+ */
+int __init bt_selftest(void)
+{
+ return run_selftest();
+}
+
+#else
+
+/* This is run when CONFIG_BT_SELFTEST=y and CONFIG_BT=y and is run
+ * via late_initcall() as last item in the initialization sequence.
+ *
+ * If CONFIG_BT_SELFTEST=n, then this code is not compiled at all.
+ */
+static int __init bt_selftest_init(void)
+{
+ return run_selftest();
+}
+late_initcall(bt_selftest_init);
+
+#endif
diff --git a/net/bluetooth/selftest.h b/net/bluetooth/selftest.h
new file mode 100644
index 000000000..2aa0a346a
--- /dev/null
+++ b/net/bluetooth/selftest.h
@@ -0,0 +1,45 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2014 Intel Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST) && IS_MODULE(CONFIG_BT)
+
+/* When CONFIG_BT_SELFTEST=y and the CONFIG_BT=m, then the self testing
+ * is run at module loading time.
+ */
+int bt_selftest(void);
+
+#else
+
+/* When CONFIG_BT_SELFTEST=y and CONFIG_BT=y, then the self testing
+ * is run via late_initcall() to make sure that subsys_initcall() of
+ * the Bluetooth subsystem and device_initcall() of the Crypto subsystem
+ * do not clash.
+ *
+ * When CONFIG_BT_SELFTEST=n, then this turns into an empty call that
+ * has no impact.
+ */
+static inline int bt_selftest(void)
+{
+ return 0;
+}
+
+#endif
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
new file mode 100644
index 000000000..1ab3dc9c8
--- /dev/null
+++ b/net/bluetooth/smp.c
@@ -0,0 +1,3670 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/debugfs.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <crypto/b128ops.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "ecc.h"
+#include "smp.h"
+
+/* Low-level debug macros to be used for stuff that we don't want
+ * accidentially in dmesg, i.e. the values of the various crypto keys
+ * and the inputs & outputs of crypto functions.
+ */
+#ifdef DEBUG
+#define SMP_DBG(fmt, ...) printk(KERN_DEBUG "%s: " fmt, __func__, \
+ ##__VA_ARGS__)
+#else
+#define SMP_DBG(fmt, ...) no_printk(KERN_DEBUG "%s: " fmt, __func__, \
+ ##__VA_ARGS__)
+#endif
+
+#define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd)
+
+/* Keys which are not distributed with Secure Connections */
+#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY);
+
+#define SMP_TIMEOUT msecs_to_jiffies(30000)
+
+#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
+ 0x1f : 0x07)
+#define KEY_DIST_MASK 0x07
+
+/* Maximum message length that can be passed to aes_cmac */
+#define CMAC_MSG_MAX 80
+
+enum {
+ SMP_FLAG_TK_VALID,
+ SMP_FLAG_CFM_PENDING,
+ SMP_FLAG_MITM_AUTH,
+ SMP_FLAG_COMPLETE,
+ SMP_FLAG_INITIATOR,
+ SMP_FLAG_SC,
+ SMP_FLAG_REMOTE_PK,
+ SMP_FLAG_DEBUG_KEY,
+ SMP_FLAG_WAIT_USER,
+ SMP_FLAG_DHKEY_PENDING,
+ SMP_FLAG_REMOTE_OOB,
+ SMP_FLAG_LOCAL_OOB,
+};
+
+struct smp_dev {
+ /* Secure Connections OOB data */
+ u8 local_pk[64];
+ u8 local_sk[32];
+ u8 local_rand[16];
+ bool debug_key;
+
+ struct crypto_blkcipher *tfm_aes;
+ struct crypto_hash *tfm_cmac;
+};
+
+struct smp_chan {
+ struct l2cap_conn *conn;
+ struct delayed_work security_timer;
+ unsigned long allow_cmd; /* Bitmask of allowed commands */
+
+ u8 preq[7]; /* SMP Pairing Request */
+ u8 prsp[7]; /* SMP Pairing Response */
+ u8 prnd[16]; /* SMP Pairing Random (local) */
+ u8 rrnd[16]; /* SMP Pairing Random (remote) */
+ u8 pcnf[16]; /* SMP Pairing Confirm */
+ u8 tk[16]; /* SMP Temporary Key */
+ u8 rr[16]; /* Remote OOB ra/rb value */
+ u8 lr[16]; /* Local OOB ra/rb value */
+ u8 enc_key_size;
+ u8 remote_key_dist;
+ bdaddr_t id_addr;
+ u8 id_addr_type;
+ u8 irk[16];
+ struct smp_csrk *csrk;
+ struct smp_csrk *slave_csrk;
+ struct smp_ltk *ltk;
+ struct smp_ltk *slave_ltk;
+ struct smp_irk *remote_irk;
+ u8 *link_key;
+ unsigned long flags;
+ u8 method;
+ u8 passkey_round;
+
+ /* Secure Connections variables */
+ u8 local_pk[64];
+ u8 local_sk[32];
+ u8 remote_pk[64];
+ u8 dhkey[32];
+ u8 mackey[16];
+
+ struct crypto_blkcipher *tfm_aes;
+ struct crypto_hash *tfm_cmac;
+};
+
+/* These debug key values are defined in the SMP section of the core
+ * specification. debug_pk is the public debug key and debug_sk the
+ * private debug key.
+ */
+static const u8 debug_pk[64] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20,
+
+ 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74,
+ 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76,
+ 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63,
+ 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc,
+};
+
+static const u8 debug_sk[32] = {
+ 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58,
+ 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a,
+ 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74,
+ 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f,
+};
+
+static inline void swap_buf(const u8 *src, u8 *dst, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ dst[len - 1 - i] = src[i];
+}
+
+/* The following functions map to the LE SC SMP crypto functions
+ * AES-CMAC, f4, f5, f6, g2 and h6.
+ */
+
+static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
+ size_t len, u8 mac[16])
+{
+ uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
+ struct hash_desc desc;
+ struct scatterlist sg;
+ int err;
+
+ if (len > CMAC_MSG_MAX)
+ return -EFBIG;
+
+ if (!tfm) {
+ BT_ERR("tfm %p", tfm);
+ return -EINVAL;
+ }
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ crypto_hash_init(&desc);
+
+ /* Swap key and message from LSB to MSB */
+ swap_buf(k, tmp, 16);
+ swap_buf(m, msg_msb, len);
+
+ SMP_DBG("msg (len %zu) %*phN", len, (int) len, m);
+ SMP_DBG("key %16phN", k);
+
+ err = crypto_hash_setkey(tfm, tmp, 16);
+ if (err) {
+ BT_ERR("cipher setkey failed: %d", err);
+ return err;
+ }
+
+ sg_init_one(&sg, msg_msb, len);
+
+ err = crypto_hash_update(&desc, &sg, len);
+ if (err) {
+ BT_ERR("Hash update error %d", err);
+ return err;
+ }
+
+ err = crypto_hash_final(&desc, mac_msb);
+ if (err) {
+ BT_ERR("Hash final error %d", err);
+ return err;
+ }
+
+ swap_buf(mac_msb, mac, 16);
+
+ SMP_DBG("mac %16phN", mac);
+
+ return 0;
+}
+
+static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
+ const u8 x[16], u8 z, u8 res[16])
+{
+ u8 m[65];
+ int err;
+
+ SMP_DBG("u %32phN", u);
+ SMP_DBG("v %32phN", v);
+ SMP_DBG("x %16phN z %02x", x, z);
+
+ m[0] = z;
+ memcpy(m + 1, v, 32);
+ memcpy(m + 33, u, 32);
+
+ err = aes_cmac(tfm_cmac, x, m, sizeof(m), res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32],
+ const u8 n1[16], const u8 n2[16], const u8 a1[7],
+ const u8 a2[7], u8 mackey[16], u8 ltk[16])
+{
+ /* The btle, salt and length "magic" values are as defined in
+ * the SMP section of the Bluetooth core specification. In ASCII
+ * the btle value ends up being 'btle'. The salt is just a
+ * random number whereas length is the value 256 in little
+ * endian format.
+ */
+ const u8 btle[4] = { 0x65, 0x6c, 0x74, 0x62 };
+ const u8 salt[16] = { 0xbe, 0x83, 0x60, 0x5a, 0xdb, 0x0b, 0x37, 0x60,
+ 0x38, 0xa5, 0xf5, 0xaa, 0x91, 0x83, 0x88, 0x6c };
+ const u8 length[2] = { 0x00, 0x01 };
+ u8 m[53], t[16];
+ int err;
+
+ SMP_DBG("w %32phN", w);
+ SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
+ SMP_DBG("a1 %7phN a2 %7phN", a1, a2);
+
+ err = aes_cmac(tfm_cmac, salt, w, 32, t);
+ if (err)
+ return err;
+
+ SMP_DBG("t %16phN", t);
+
+ memcpy(m, length, 2);
+ memcpy(m + 2, a2, 7);
+ memcpy(m + 9, a1, 7);
+ memcpy(m + 16, n2, 16);
+ memcpy(m + 32, n1, 16);
+ memcpy(m + 48, btle, 4);
+
+ m[52] = 0; /* Counter */
+
+ err = aes_cmac(tfm_cmac, t, m, sizeof(m), mackey);
+ if (err)
+ return err;
+
+ SMP_DBG("mackey %16phN", mackey);
+
+ m[52] = 1; /* Counter */
+
+ err = aes_cmac(tfm_cmac, t, m, sizeof(m), ltk);
+ if (err)
+ return err;
+
+ SMP_DBG("ltk %16phN", ltk);
+
+ return 0;
+}
+
+static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16],
+ const u8 n1[16], const u8 n2[16], const u8 r[16],
+ const u8 io_cap[3], const u8 a1[7], const u8 a2[7],
+ u8 res[16])
+{
+ u8 m[65];
+ int err;
+
+ SMP_DBG("w %16phN", w);
+ SMP_DBG("n1 %16phN n2 %16phN", n1, n2);
+ SMP_DBG("r %16phN io_cap %3phN a1 %7phN a2 %7phN", r, io_cap, a1, a2);
+
+ memcpy(m, a2, 7);
+ memcpy(m + 7, a1, 7);
+ memcpy(m + 14, io_cap, 3);
+ memcpy(m + 17, r, 16);
+ memcpy(m + 33, n2, 16);
+ memcpy(m + 49, n1, 16);
+
+ err = aes_cmac(tfm_cmac, w, m, sizeof(m), res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
+ const u8 x[16], const u8 y[16], u32 *val)
+{
+ u8 m[80], tmp[16];
+ int err;
+
+ SMP_DBG("u %32phN", u);
+ SMP_DBG("v %32phN", v);
+ SMP_DBG("x %16phN y %16phN", x, y);
+
+ memcpy(m, y, 16);
+ memcpy(m + 16, v, 32);
+ memcpy(m + 48, u, 32);
+
+ err = aes_cmac(tfm_cmac, x, m, sizeof(m), tmp);
+ if (err)
+ return err;
+
+ *val = get_unaligned_le32(tmp);
+ *val %= 1000000;
+
+ SMP_DBG("val %06u", *val);
+
+ return 0;
+}
+
+static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16],
+ const u8 key_id[4], u8 res[16])
+{
+ int err;
+
+ SMP_DBG("w %16phN key_id %4phN", w, key_id);
+
+ err = aes_cmac(tfm_cmac, w, key_id, 4, res);
+ if (err)
+ return err;
+
+ SMP_DBG("res %16phN", res);
+
+ return err;
+}
+
+/* The following functions map to the legacy SMP crypto functions e, c1,
+ * s1 and ah.
+ */
+
+static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
+{
+ struct blkcipher_desc desc;
+ struct scatterlist sg;
+ uint8_t tmp[16], data[16];
+ int err;
+
+ if (!tfm) {
+ BT_ERR("tfm %p", tfm);
+ return -EINVAL;
+ }
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ /* The most significant octet of key corresponds to k[0] */
+ swap_buf(k, tmp, 16);
+
+ err = crypto_blkcipher_setkey(tfm, tmp, 16);
+ if (err) {
+ BT_ERR("cipher setkey failed: %d", err);
+ return err;
+ }
+
+ /* Most significant octet of plaintextData corresponds to data[0] */
+ swap_buf(r, data, 16);
+
+ sg_init_one(&sg, data, 16);
+
+ err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16);
+ if (err)
+ BT_ERR("Encrypt data error %d", err);
+
+ /* Most significant octet of encryptedData corresponds to data[0] */
+ swap_buf(data, r, 16);
+
+ return err;
+}
+
+static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+ const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
+ const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
+{
+ u8 p1[16], p2[16];
+ int err;
+
+ memset(p1, 0, 16);
+
+ /* p1 = pres || preq || _rat || _iat */
+ p1[0] = _iat;
+ p1[1] = _rat;
+ memcpy(p1 + 2, preq, 7);
+ memcpy(p1 + 9, pres, 7);
+
+ /* p2 = padding || ia || ra */
+ memcpy(p2, ra, 6);
+ memcpy(p2 + 6, ia, 6);
+ memset(p2 + 12, 0, 4);
+
+ /* res = r XOR p1 */
+ u128_xor((u128 *) res, (u128 *) r, (u128 *) p1);
+
+ /* res = e(k, res) */
+ err = smp_e(tfm_aes, k, res);
+ if (err) {
+ BT_ERR("Encrypt data error");
+ return err;
+ }
+
+ /* res = res XOR p2 */
+ u128_xor((u128 *) res, (u128 *) res, (u128 *) p2);
+
+ /* res = e(k, res) */
+ err = smp_e(tfm_aes, k, res);
+ if (err)
+ BT_ERR("Encrypt data error");
+
+ return err;
+}
+
+static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+ const u8 r1[16], const u8 r2[16], u8 _r[16])
+{
+ int err;
+
+ /* Just least significant octets from r1 and r2 are considered */
+ memcpy(_r, r2, 8);
+ memcpy(_r + 8, r1, 8);
+
+ err = smp_e(tfm_aes, k, _r);
+ if (err)
+ BT_ERR("Encrypt data error");
+
+ return err;
+}
+
+static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16],
+ const u8 r[3], u8 res[3])
+{
+ u8 _res[16];
+ int err;
+
+ /* r' = padding || r */
+ memcpy(_res, r, 3);
+ memset(_res + 3, 0, 13);
+
+ err = smp_e(tfm, irk, _res);
+ if (err) {
+ BT_ERR("Encrypt error");
+ return err;
+ }
+
+ /* The output of the random address function ah is:
+ * ah(h, r) = e(k, r') mod 2^24
+ * The output of the security function e is then truncated to 24 bits
+ * by taking the least significant 24 bits of the output of e as the
+ * result of ah.
+ */
+ memcpy(res, _res, 3);
+
+ return 0;
+}
+
+bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
+ const bdaddr_t *bdaddr)
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp;
+ u8 hash[3];
+ int err;
+
+ if (!chan || !chan->data)
+ return false;
+
+ smp = chan->data;
+
+ BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk);
+
+ err = smp_ah(smp->tfm_aes, irk, &bdaddr->b[3], hash);
+ if (err)
+ return false;
+
+ return !memcmp(bdaddr->b, hash, 3);
+}
+
+int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa)
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp;
+ int err;
+
+ if (!chan || !chan->data)
+ return -EOPNOTSUPP;
+
+ smp = chan->data;
+
+ get_random_bytes(&rpa->b[3], 3);
+
+ rpa->b[5] &= 0x3f; /* Clear two most significant bits */
+ rpa->b[5] |= 0x40; /* Set second most significant bit */
+
+ err = smp_ah(smp->tfm_aes, irk, &rpa->b[3], rpa->b);
+ if (err < 0)
+ return err;
+
+ BT_DBG("RPA %pMR", rpa);
+
+ return 0;
+}
+
+int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
+{
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp;
+ int err;
+
+ if (!chan || !chan->data)
+ return -EOPNOTSUPP;
+
+ smp = chan->data;
+
+ if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ BT_DBG("Using debug keys");
+ memcpy(smp->local_pk, debug_pk, 64);
+ memcpy(smp->local_sk, debug_sk, 32);
+ smp->debug_key = true;
+ } else {
+ while (true) {
+ /* Generate local key pair for Secure Connections */
+ if (!ecc_make_key(smp->local_pk, smp->local_sk))
+ return -EIO;
+
+ /* This is unlikely, but we need to check that
+ * we didn't accidentially generate a debug key.
+ */
+ if (memcmp(smp->local_sk, debug_sk, 32))
+ break;
+ }
+ smp->debug_key = false;
+ }
+
+ SMP_DBG("OOB Public Key X: %32phN", smp->local_pk);
+ SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32);
+ SMP_DBG("OOB Private Key: %32phN", smp->local_sk);
+
+ get_random_bytes(smp->local_rand, 16);
+
+ err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk,
+ smp->local_rand, 0, hash);
+ if (err < 0)
+ return err;
+
+ memcpy(rand, smp->local_rand, 16);
+
+ return 0;
+}
+
+static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp;
+ struct kvec iv[2];
+ struct msghdr msg;
+
+ if (!chan)
+ return;
+
+ BT_DBG("code 0x%2.2x", code);
+
+ iv[0].iov_base = &code;
+ iv[0].iov_len = 1;
+
+ iv[1].iov_base = data;
+ iv[1].iov_len = len;
+
+ memset(&msg, 0, sizeof(msg));
+
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len);
+
+ l2cap_chan_send(chan, &msg, 1 + len);
+
+ if (!chan->data)
+ return;
+
+ smp = chan->data;
+
+ cancel_delayed_work_sync(&smp->security_timer);
+ schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT);
+}
+
+static u8 authreq_to_seclevel(u8 authreq)
+{
+ if (authreq & SMP_AUTH_MITM) {
+ if (authreq & SMP_AUTH_SC)
+ return BT_SECURITY_FIPS;
+ else
+ return BT_SECURITY_HIGH;
+ } else {
+ return BT_SECURITY_MEDIUM;
+ }
+}
+
+static __u8 seclevel_to_authreq(__u8 sec_level)
+{
+ switch (sec_level) {
+ case BT_SECURITY_FIPS:
+ case BT_SECURITY_HIGH:
+ return SMP_AUTH_MITM | SMP_AUTH_BONDING;
+ case BT_SECURITY_MEDIUM:
+ return SMP_AUTH_BONDING;
+ default:
+ return SMP_AUTH_NONE;
+ }
+}
+
+static void build_pairing_cmd(struct l2cap_conn *conn,
+ struct smp_cmd_pairing *req,
+ struct smp_cmd_pairing *rsp, __u8 authreq)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE)) {
+ local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ authreq |= SMP_AUTH_BONDING;
+ } else {
+ authreq &= ~SMP_AUTH_BONDING;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING))
+ remote_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ local_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
+ (authreq & SMP_AUTH_SC)) {
+ struct oob_data *oob_data;
+ u8 bdaddr_type;
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
+ local_dist |= SMP_DIST_LINK_KEY;
+ remote_dist |= SMP_DIST_LINK_KEY;
+ }
+
+ if (hcon->dst_type == ADDR_LE_DEV_PUBLIC)
+ bdaddr_type = BDADDR_LE_PUBLIC;
+ else
+ bdaddr_type = BDADDR_LE_RANDOM;
+
+ oob_data = hci_find_remote_oob_data(hdev, &hcon->dst,
+ bdaddr_type);
+ if (oob_data && oob_data->present) {
+ set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags);
+ oob_flag = SMP_OOB_PRESENT;
+ memcpy(smp->rr, oob_data->rand256, 16);
+ memcpy(smp->pcnf, oob_data->hash256, 16);
+ SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf);
+ SMP_DBG("OOB Remote Random: %16phN", smp->rr);
+ }
+
+ } else {
+ authreq &= ~SMP_AUTH_SC;
+ }
+
+ if (rsp == NULL) {
+ req->io_capability = conn->hcon->io_capability;
+ req->oob_flag = oob_flag;
+ req->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ req->init_key_dist = local_dist;
+ req->resp_key_dist = remote_dist;
+ req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
+
+ smp->remote_key_dist = remote_dist;
+ return;
+ }
+
+ rsp->io_capability = conn->hcon->io_capability;
+ rsp->oob_flag = oob_flag;
+ rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ rsp->init_key_dist = req->init_key_dist & remote_dist;
+ rsp->resp_key_dist = req->resp_key_dist & local_dist;
+ rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
+
+ smp->remote_key_dist = rsp->init_key_dist;
+}
+
+static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ if ((max_key_size > SMP_MAX_ENC_KEY_SIZE) ||
+ (max_key_size < SMP_MIN_ENC_KEY_SIZE))
+ return SMP_ENC_KEY_SIZE;
+
+ smp->enc_key_size = max_key_size;
+
+ return 0;
+}
+
+static void smp_chan_destroy(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ bool complete;
+
+ BUG_ON(!smp);
+
+ cancel_delayed_work_sync(&smp->security_timer);
+
+ complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
+ mgmt_smp_complete(hcon, complete);
+
+ kzfree(smp->csrk);
+ kzfree(smp->slave_csrk);
+ kzfree(smp->link_key);
+
+ crypto_free_blkcipher(smp->tfm_aes);
+ crypto_free_hash(smp->tfm_cmac);
+
+ /* Ensure that we don't leave any debug key around if debug key
+ * support hasn't been explicitly enabled.
+ */
+ if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG &&
+ !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) {
+ list_del_rcu(&smp->ltk->list);
+ kfree_rcu(smp->ltk, rcu);
+ smp->ltk = NULL;
+ }
+
+ /* If pairing failed clean up any keys we might have */
+ if (!complete) {
+ if (smp->ltk) {
+ list_del_rcu(&smp->ltk->list);
+ kfree_rcu(smp->ltk, rcu);
+ }
+
+ if (smp->slave_ltk) {
+ list_del_rcu(&smp->slave_ltk->list);
+ kfree_rcu(smp->slave_ltk, rcu);
+ }
+
+ if (smp->remote_irk) {
+ list_del_rcu(&smp->remote_irk->list);
+ kfree_rcu(smp->remote_irk, rcu);
+ }
+ }
+
+ chan->data = NULL;
+ kzfree(smp);
+ hci_conn_drop(hcon);
+}
+
+static void smp_failure(struct l2cap_conn *conn, u8 reason)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+
+ if (reason)
+ smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason),
+ &reason);
+
+ clear_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags);
+ mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE);
+
+ if (chan->data)
+ smp_chan_destroy(conn);
+}
+
+#define JUST_WORKS 0x00
+#define JUST_CFM 0x01
+#define REQ_PASSKEY 0x02
+#define CFM_PASSKEY 0x03
+#define REQ_OOB 0x04
+#define DSP_PASSKEY 0x05
+#define OVERLAP 0xFF
+
+static const u8 gen_method[5][5] = {
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+ { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM },
+ { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP },
+};
+
+static const u8 sc_method[5][5] = {
+ { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY },
+ { JUST_WORKS, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+ { DSP_PASSKEY, DSP_PASSKEY, REQ_PASSKEY, JUST_WORKS, DSP_PASSKEY },
+ { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM },
+ { DSP_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY },
+};
+
+static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io)
+{
+ /* If either side has unknown io_caps, use JUST_CFM (which gets
+ * converted later to JUST_WORKS if we're initiators.
+ */
+ if (local_io > SMP_IO_KEYBOARD_DISPLAY ||
+ remote_io > SMP_IO_KEYBOARD_DISPLAY)
+ return JUST_CFM;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags))
+ return sc_method[remote_io][local_io];
+
+ return gen_method[remote_io][local_io];
+}
+
+static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
+ u8 local_io, u8 remote_io)
+{
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ u32 passkey = 0;
+ int ret = 0;
+
+ /* Initialize key for JUST WORKS */
+ memset(smp->tk, 0, sizeof(smp->tk));
+ clear_bit(SMP_FLAG_TK_VALID, &smp->flags);
+
+ BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io);
+
+ /* If neither side wants MITM, either "just" confirm an incoming
+ * request or use just-works for outgoing ones. The JUST_CFM
+ * will be converted to JUST_WORKS if necessary later in this
+ * function. If either side has MITM look up the method from the
+ * table.
+ */
+ if (!(auth & SMP_AUTH_MITM))
+ smp->method = JUST_CFM;
+ else
+ smp->method = get_auth_method(smp, local_io, remote_io);
+
+ /* Don't confirm locally initiated pairing attempts */
+ if (smp->method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR,
+ &smp->flags))
+ smp->method = JUST_WORKS;
+
+ /* Don't bother user space with no IO capabilities */
+ if (smp->method == JUST_CFM &&
+ hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ smp->method = JUST_WORKS;
+
+ /* If Just Works, Continue with Zero TK */
+ if (smp->method == JUST_WORKS) {
+ set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ return 0;
+ }
+
+ /* If this function is used for SC -> legacy fallback we
+ * can only recover the just-works case.
+ */
+ if (test_bit(SMP_FLAG_SC, &smp->flags))
+ return -EINVAL;
+
+ /* Not Just Works/Confirm results in MITM Authentication */
+ if (smp->method != JUST_CFM) {
+ set_bit(SMP_FLAG_MITM_AUTH, &smp->flags);
+ if (hcon->pending_sec_level < BT_SECURITY_HIGH)
+ hcon->pending_sec_level = BT_SECURITY_HIGH;
+ }
+
+ /* If both devices have Keyoard-Display I/O, the master
+ * Confirms and the slave Enters the passkey.
+ */
+ if (smp->method == OVERLAP) {
+ if (hcon->role == HCI_ROLE_MASTER)
+ smp->method = CFM_PASSKEY;
+ else
+ smp->method = REQ_PASSKEY;
+ }
+
+ /* Generate random passkey. */
+ if (smp->method == CFM_PASSKEY) {
+ memset(smp->tk, 0, sizeof(smp->tk));
+ get_random_bytes(&passkey, sizeof(passkey));
+ passkey %= 1000000;
+ put_unaligned_le32(passkey, smp->tk);
+ BT_DBG("PassKey: %d", passkey);
+ set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ }
+
+ if (smp->method == REQ_PASSKEY)
+ ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type);
+ else if (smp->method == JUST_CFM)
+ ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type,
+ passkey, 1);
+ else
+ ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst,
+ hcon->type, hcon->dst_type,
+ passkey, 0);
+
+ return ret;
+}
+
+static u8 smp_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct smp_cmd_pairing_confirm cp;
+ int ret;
+
+ BT_DBG("conn %p", conn);
+
+ ret = smp_c1(smp->tfm_aes, smp->tk, smp->prnd, smp->preq, smp->prsp,
+ conn->hcon->init_addr_type, &conn->hcon->init_addr,
+ conn->hcon->resp_addr_type, &conn->hcon->resp_addr,
+ cp.confirm_val);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
+
+ if (conn->hcon->out)
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ else
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+}
+
+static u8 smp_random(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ u8 confirm[16];
+ int ret;
+
+ if (IS_ERR_OR_NULL(smp->tfm_aes))
+ return SMP_UNSPECIFIED;
+
+ BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
+
+ ret = smp_c1(smp->tfm_aes, smp->tk, smp->rrnd, smp->preq, smp->prsp,
+ hcon->init_addr_type, &hcon->init_addr,
+ hcon->resp_addr_type, &hcon->resp_addr, confirm);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ if (memcmp(smp->pcnf, confirm, sizeof(smp->pcnf)) != 0) {
+ BT_ERR("Pairing failed (confirmation values mismatch)");
+ return SMP_CONFIRM_FAILED;
+ }
+
+ if (hcon->out) {
+ u8 stk[16];
+ __le64 rand = 0;
+ __le16 ediv = 0;
+
+ smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk);
+
+ memset(stk + smp->enc_key_size, 0,
+ SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
+
+ if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
+ return SMP_UNSPECIFIED;
+
+ hci_le_start_enc(hcon, ediv, rand, stk);
+ hcon->enc_key_size = smp->enc_key_size;
+ set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
+ } else {
+ u8 stk[16], auth;
+ __le64 rand = 0;
+ __le16 ediv = 0;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+
+ smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk);
+
+ memset(stk + smp->enc_key_size, 0,
+ SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
+
+ if (hcon->pending_sec_level == BT_SECURITY_HIGH)
+ auth = 1;
+ else
+ auth = 0;
+
+ /* Even though there's no _SLAVE suffix this is the
+ * slave STK we're adding for later lookup (the master
+ * STK never needs to be stored).
+ */
+ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
+ SMP_STK, auth, stk, smp->enc_key_size, ediv, rand);
+ }
+
+ return 0;
+}
+
+static void smp_notify_keys(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing *req = (void *) &smp->preq[1];
+ struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1];
+ bool persistent;
+
+ if (smp->remote_irk) {
+ mgmt_new_irk(hdev, smp->remote_irk);
+ /* Now that user space can be considered to know the
+ * identity address track the connection based on it
+ * from now on (assuming this is an LE link).
+ */
+ if (hcon->type == LE_LINK) {
+ bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
+ hcon->dst_type = smp->remote_irk->addr_type;
+ queue_work(hdev->workqueue, &conn->id_addr_update_work);
+ }
+
+ /* When receiving an indentity resolving key for
+ * a remote device that does not use a resolvable
+ * private address, just remove the key so that
+ * it is possible to use the controller white
+ * list for scanning.
+ *
+ * Userspace will have been told to not store
+ * this key at this point. So it is safe to
+ * just remove it.
+ */
+ if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) {
+ list_del_rcu(&smp->remote_irk->list);
+ kfree_rcu(smp->remote_irk, rcu);
+ smp->remote_irk = NULL;
+ }
+ }
+
+ if (hcon->type == ACL_LINK) {
+ if (hcon->key_type == HCI_LK_DEBUG_COMBINATION)
+ persistent = false;
+ else
+ persistent = !test_bit(HCI_CONN_FLUSH_KEY,
+ &hcon->flags);
+ } else {
+ /* The LTKs and CSRKs should be persistent only if both sides
+ * had the bonding bit set in their authentication requests.
+ */
+ persistent = !!((req->auth_req & rsp->auth_req) &
+ SMP_AUTH_BONDING);
+ }
+
+
+ if (smp->csrk) {
+ smp->csrk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->csrk->bdaddr, &hcon->dst);
+ mgmt_new_csrk(hdev, smp->csrk, persistent);
+ }
+
+ if (smp->slave_csrk) {
+ smp->slave_csrk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->slave_csrk->bdaddr, &hcon->dst);
+ mgmt_new_csrk(hdev, smp->slave_csrk, persistent);
+ }
+
+ if (smp->ltk) {
+ smp->ltk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->ltk->bdaddr, &hcon->dst);
+ mgmt_new_ltk(hdev, smp->ltk, persistent);
+ }
+
+ if (smp->slave_ltk) {
+ smp->slave_ltk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->slave_ltk->bdaddr, &hcon->dst);
+ mgmt_new_ltk(hdev, smp->slave_ltk, persistent);
+ }
+
+ if (smp->link_key) {
+ struct link_key *key;
+ u8 type;
+
+ if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags))
+ type = HCI_LK_DEBUG_COMBINATION;
+ else if (hcon->sec_level == BT_SECURITY_FIPS)
+ type = HCI_LK_AUTH_COMBINATION_P256;
+ else
+ type = HCI_LK_UNAUTH_COMBINATION_P256;
+
+ key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst,
+ smp->link_key, type, 0, &persistent);
+ if (key) {
+ mgmt_new_link_key(hdev, key, persistent);
+
+ /* Don't keep debug keys around if the relevant
+ * flag is not set.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) &&
+ key->type == HCI_LK_DEBUG_COMBINATION) {
+ list_del_rcu(&key->list);
+ kfree_rcu(key, rcu);
+ }
+ }
+ }
+}
+
+static void sc_add_ltk(struct smp_chan *smp)
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ u8 key_type, auth;
+
+ if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags))
+ key_type = SMP_LTK_P256_DEBUG;
+ else
+ key_type = SMP_LTK_P256;
+
+ if (hcon->pending_sec_level == BT_SECURITY_FIPS)
+ auth = 1;
+ else
+ auth = 0;
+
+ memset(smp->tk + smp->enc_key_size, 0,
+ SMP_MAX_ENC_KEY_SIZE - smp->enc_key_size);
+
+ smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
+ key_type, auth, smp->tk, smp->enc_key_size,
+ 0, 0);
+}
+
+static void sc_generate_link_key(struct smp_chan *smp)
+{
+ /* These constants are as specified in the core specification.
+ * In ASCII they spell out to 'tmp1' and 'lebr'.
+ */
+ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
+ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c };
+
+ smp->link_key = kzalloc(16, GFP_KERNEL);
+ if (!smp->link_key)
+ return;
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+
+ if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
+ kzfree(smp->link_key);
+ smp->link_key = NULL;
+ return;
+ }
+}
+
+static void smp_allow_key_dist(struct smp_chan *smp)
+{
+ /* Allow the first expected phase 3 PDU. The rest of the PDUs
+ * will be allowed in each PDU handler to ensure we receive
+ * them in the correct order.
+ */
+ if (smp->remote_key_dist & SMP_DIST_ENC_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_ID_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+}
+
+static void sc_generate_ltk(struct smp_chan *smp)
+{
+ /* These constants are as specified in the core specification.
+ * In ASCII they spell out to 'tmp2' and 'brle'.
+ */
+ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 };
+ const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 };
+ struct hci_conn *hcon = smp->conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, &hcon->dst);
+ if (!key) {
+ BT_ERR("%s No Link Key found to generate LTK", hdev->name);
+ return;
+ }
+
+ if (key->type == HCI_LK_DEBUG_COMBINATION)
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk))
+ return;
+
+ if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk))
+ return;
+
+ sc_add_ltk(smp);
+}
+
+static void smp_distribute_keys(struct smp_chan *smp)
+{
+ struct smp_cmd_pairing *req, *rsp;
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ __u8 *keydist;
+
+ BT_DBG("conn %p", conn);
+
+ rsp = (void *) &smp->prsp[1];
+
+ /* The responder sends its keys first */
+ if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) {
+ smp_allow_key_dist(smp);
+ return;
+ }
+
+ req = (void *) &smp->preq[1];
+
+ if (hcon->out) {
+ keydist = &rsp->init_key_dist;
+ *keydist &= req->init_key_dist;
+ } else {
+ keydist = &rsp->resp_key_dist;
+ *keydist &= req->resp_key_dist;
+ }
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ if (hcon->type == LE_LINK && (*keydist & SMP_DIST_LINK_KEY))
+ sc_generate_link_key(smp);
+ if (hcon->type == ACL_LINK && (*keydist & SMP_DIST_ENC_KEY))
+ sc_generate_ltk(smp);
+
+ /* Clear the keys which are generated but not distributed */
+ *keydist &= ~SMP_SC_NO_DIST;
+ }
+
+ BT_DBG("keydist 0x%x", *keydist);
+
+ if (*keydist & SMP_DIST_ENC_KEY) {
+ struct smp_cmd_encrypt_info enc;
+ struct smp_cmd_master_ident ident;
+ struct smp_ltk *ltk;
+ u8 authenticated;
+ __le16 ediv;
+ __le64 rand;
+
+ get_random_bytes(enc.ltk, sizeof(enc.ltk));
+ get_random_bytes(&ediv, sizeof(ediv));
+ get_random_bytes(&rand, sizeof(rand));
+
+ smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc);
+
+ authenticated = hcon->sec_level == BT_SECURITY_HIGH;
+ ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type,
+ SMP_LTK_SLAVE, authenticated, enc.ltk,
+ smp->enc_key_size, ediv, rand);
+ smp->slave_ltk = ltk;
+
+ ident.ediv = ediv;
+ ident.rand = rand;
+
+ smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident);
+
+ *keydist &= ~SMP_DIST_ENC_KEY;
+ }
+
+ if (*keydist & SMP_DIST_ID_KEY) {
+ struct smp_cmd_ident_addr_info addrinfo;
+ struct smp_cmd_ident_info idinfo;
+
+ memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk));
+
+ smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo);
+
+ /* The hci_conn contains the local identity address
+ * after the connection has been established.
+ *
+ * This is true even when the connection has been
+ * established using a resolvable random address.
+ */
+ bacpy(&addrinfo.bdaddr, &hcon->src);
+ addrinfo.addr_type = hcon->src_type;
+
+ smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo),
+ &addrinfo);
+
+ *keydist &= ~SMP_DIST_ID_KEY;
+ }
+
+ if (*keydist & SMP_DIST_SIGN) {
+ struct smp_cmd_sign_info sign;
+ struct smp_csrk *csrk;
+
+ /* Generate a new random key */
+ get_random_bytes(sign.csrk, sizeof(sign.csrk));
+
+ csrk = kzalloc(sizeof(*csrk), GFP_KERNEL);
+ if (csrk) {
+ if (hcon->sec_level > BT_SECURITY_MEDIUM)
+ csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED;
+ else
+ csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED;
+ memcpy(csrk->val, sign.csrk, sizeof(csrk->val));
+ }
+ smp->slave_csrk = csrk;
+
+ smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign);
+
+ *keydist &= ~SMP_DIST_SIGN;
+ }
+
+ /* If there are still keys to be received wait for them */
+ if (smp->remote_key_dist & KEY_DIST_MASK) {
+ smp_allow_key_dist(smp);
+ return;
+ }
+
+ set_bit(SMP_FLAG_COMPLETE, &smp->flags);
+ smp_notify_keys(conn);
+
+ smp_chan_destroy(conn);
+}
+
+static void smp_timeout(struct work_struct *work)
+{
+ struct smp_chan *smp = container_of(work, struct smp_chan,
+ security_timer.work);
+ struct l2cap_conn *conn = smp->conn;
+
+ BT_DBG("conn %p", conn);
+
+ hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM);
+}
+
+static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp;
+
+ smp = kzalloc(sizeof(*smp), GFP_ATOMIC);
+ if (!smp)
+ return NULL;
+
+ smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(smp->tfm_aes)) {
+ BT_ERR("Unable to create ECB crypto context");
+ kzfree(smp);
+ return NULL;
+ }
+
+ smp->tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(smp->tfm_cmac)) {
+ BT_ERR("Unable to create CMAC crypto context");
+ crypto_free_blkcipher(smp->tfm_aes);
+ kzfree(smp);
+ return NULL;
+ }
+
+ smp->conn = conn;
+ chan->data = smp;
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL);
+
+ INIT_DELAYED_WORK(&smp->security_timer, smp_timeout);
+
+ hci_conn_hold(conn->hcon);
+
+ return smp;
+}
+
+static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16])
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ u8 *na, *nb, a[7], b[7];
+
+ if (hcon->out) {
+ na = smp->prnd;
+ nb = smp->rrnd;
+ } else {
+ na = smp->rrnd;
+ nb = smp->prnd;
+ }
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ return smp_f5(smp->tfm_cmac, smp->dhkey, na, nb, a, b, mackey, ltk);
+}
+
+static void sc_dhkey_check(struct smp_chan *smp)
+{
+ struct hci_conn *hcon = smp->conn->hcon;
+ struct smp_cmd_dhkey_check check;
+ u8 a[7], b[7], *local_addr, *remote_addr;
+ u8 io_cap[3], r[16];
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ if (hcon->out) {
+ local_addr = a;
+ remote_addr = b;
+ memcpy(io_cap, &smp->preq[1], 3);
+ } else {
+ local_addr = b;
+ remote_addr = a;
+ memcpy(io_cap, &smp->prsp[1], 3);
+ }
+
+ memset(r, 0, sizeof(r));
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ put_unaligned_le32(hcon->passkey_notify, r);
+
+ if (smp->method == REQ_OOB)
+ memcpy(r, smp->rr, 16);
+
+ smp_f6(smp->tfm_cmac, smp->mackey, smp->prnd, smp->rrnd, r, io_cap,
+ local_addr, remote_addr, check.e);
+
+ smp_send_cmd(smp->conn, SMP_CMD_DHKEY_CHECK, sizeof(check), &check);
+}
+
+static u8 sc_passkey_send_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_cmd_pairing_confirm cfm;
+ u8 r;
+
+ r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01);
+ r |= 0x80;
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ if (smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, r,
+ cfm.confirm_val))
+ return SMP_UNSPECIFIED;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm);
+
+ return 0;
+}
+
+static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ u8 cfm[16], r;
+
+ /* Ignore the PDU if we've already done 20 rounds (0 - 19) */
+ if (smp->passkey_round >= 20)
+ return 0;
+
+ switch (smp_op) {
+ case SMP_CMD_PAIRING_RANDOM:
+ r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01);
+ r |= 0x80;
+
+ if (smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
+ smp->rrnd, r, cfm))
+ return SMP_UNSPECIFIED;
+
+ if (memcmp(smp->pcnf, cfm, 16))
+ return SMP_CONFIRM_FAILED;
+
+ smp->passkey_round++;
+
+ if (smp->passkey_round == 20) {
+ /* Generate MacKey and LTK */
+ if (sc_mackey_and_ltk(smp, smp->mackey, smp->tk))
+ return SMP_UNSPECIFIED;
+ }
+
+ /* The round is only complete when the initiator
+ * receives pairing random.
+ */
+ if (!hcon->out) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ if (smp->passkey_round == 20)
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ else
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ return 0;
+ }
+
+ /* Start the next round */
+ if (smp->passkey_round != 20)
+ return sc_passkey_round(smp, 0);
+
+ /* Passkey rounds are complete - start DHKey Check */
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+
+ break;
+
+ case SMP_CMD_PAIRING_CONFIRM:
+ if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+ return 0;
+ }
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ if (hcon->out) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ return 0;
+ }
+
+ return sc_passkey_send_confirm(smp);
+
+ case SMP_CMD_PUBLIC_KEY:
+ default:
+ /* Initiating device starts the round */
+ if (!hcon->out)
+ return 0;
+
+ BT_DBG("%s Starting passkey round %u", hdev->name,
+ smp->passkey_round + 1);
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ return sc_passkey_send_confirm(smp);
+ }
+
+ return 0;
+}
+
+static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ u8 smp_op;
+
+ clear_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+
+ switch (mgmt_op) {
+ case MGMT_OP_USER_PASSKEY_NEG_REPLY:
+ smp_failure(smp->conn, SMP_PASSKEY_ENTRY_FAILED);
+ return 0;
+ case MGMT_OP_USER_CONFIRM_NEG_REPLY:
+ smp_failure(smp->conn, SMP_NUMERIC_COMP_FAILED);
+ return 0;
+ case MGMT_OP_USER_PASSKEY_REPLY:
+ hcon->passkey_notify = le32_to_cpu(passkey);
+ smp->passkey_round = 0;
+
+ if (test_and_clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags))
+ smp_op = SMP_CMD_PAIRING_CONFIRM;
+ else
+ smp_op = 0;
+
+ if (sc_passkey_round(smp, smp_op))
+ return -EIO;
+
+ return 0;
+ }
+
+ /* Initiator sends DHKey check first */
+ if (hcon->out) {
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) {
+ sc_dhkey_check(smp);
+ sc_add_ltk(smp);
+ }
+
+ return 0;
+}
+
+int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+ u32 value;
+ int err;
+
+ BT_DBG("");
+
+ if (!conn)
+ return -ENOTCONN;
+
+ chan = conn->smp;
+ if (!chan)
+ return -ENOTCONN;
+
+ l2cap_chan_lock(chan);
+ if (!chan->data) {
+ err = -ENOTCONN;
+ goto unlock;
+ }
+
+ smp = chan->data;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ err = sc_user_reply(smp, mgmt_op, passkey);
+ goto unlock;
+ }
+
+ switch (mgmt_op) {
+ case MGMT_OP_USER_PASSKEY_REPLY:
+ value = le32_to_cpu(passkey);
+ memset(smp->tk, 0, sizeof(smp->tk));
+ BT_DBG("PassKey: %d", value);
+ put_unaligned_le32(value, smp->tk);
+ /* Fall Through */
+ case MGMT_OP_USER_CONFIRM_REPLY:
+ set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ break;
+ case MGMT_OP_USER_PASSKEY_NEG_REPLY:
+ case MGMT_OP_USER_CONFIRM_NEG_REPLY:
+ smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED);
+ err = 0;
+ goto unlock;
+ default:
+ smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED);
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ err = 0;
+
+ /* If it is our turn to send Pairing Confirm, do so now */
+ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) {
+ u8 rsp = smp_confirm(smp);
+ if (rsp)
+ smp_failure(conn, rsp);
+ }
+
+unlock:
+ l2cap_chan_unlock(chan);
+ return err;
+}
+
+static void build_bredr_pairing_cmd(struct smp_chan *smp,
+ struct smp_cmd_pairing *req,
+ struct smp_cmd_pairing *rsp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ u8 local_dist = 0, remote_dist = 0;
+
+ if (hci_dev_test_flag(hdev, HCI_BONDABLE)) {
+ local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING))
+ remote_dist |= SMP_DIST_ID_KEY;
+
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+ local_dist |= SMP_DIST_ID_KEY;
+
+ if (!rsp) {
+ memset(req, 0, sizeof(*req));
+
+ req->init_key_dist = local_dist;
+ req->resp_key_dist = remote_dist;
+ req->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+
+ smp->remote_key_dist = remote_dist;
+
+ return;
+ }
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ rsp->init_key_dist = req->init_key_dist & remote_dist;
+ rsp->resp_key_dist = req->resp_key_dist & local_dist;
+
+ smp->remote_key_dist = rsp->init_key_dist;
+}
+
+static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_pairing rsp, *req = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct smp_chan *smp;
+ u8 key_size, auth, sec_level;
+ int ret;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*req))
+ return SMP_INVALID_PARAMS;
+
+ if (conn->hcon->role != HCI_ROLE_SLAVE)
+ return SMP_CMD_NOTSUPP;
+
+ if (!chan->data)
+ smp = smp_chan_create(conn);
+ else
+ smp = chan->data;
+
+ if (!smp)
+ return SMP_UNSPECIFIED;
+
+ /* We didn't start the pairing, so match remote */
+ auth = req->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ (auth & SMP_AUTH_BONDING))
+ return SMP_PAIRING_NOTSUPP;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], req, sizeof(*req));
+ skb_pull(skb, sizeof(*req));
+
+ /* If the remote side's OOB flag is set it means it has
+ * successfully received our local OOB data - therefore set the
+ * flag to indicate that local OOB is in use.
+ */
+ if (req->oob_flag == SMP_OOB_PRESENT)
+ set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
+
+ /* SMP over BR/EDR requires special treatment */
+ if (conn->hcon->type == ACL_LINK) {
+ /* We must have a BR/EDR SC link */
+ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) &&
+ !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return SMP_CROSS_TRANSP_NOT_ALLOWED;
+
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ build_bredr_pairing_cmd(smp, req, &rsp);
+
+ key_size = min(req->max_key_size, rsp.max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], &rsp, sizeof(rsp));
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp);
+
+ smp_distribute_keys(smp);
+ return 0;
+ }
+
+ build_pairing_cmd(conn, req, &rsp, auth);
+
+ if (rsp.auth_req & SMP_AUTH_SC)
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ sec_level = BT_SECURITY_MEDIUM;
+ else
+ sec_level = authreq_to_seclevel(auth);
+
+ if (sec_level > conn->hcon->pending_sec_level)
+ conn->hcon->pending_sec_level = sec_level;
+
+ /* If we need MITM check that it can be achieved */
+ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+ u8 method;
+
+ method = get_auth_method(smp, conn->hcon->io_capability,
+ req->io_capability);
+ if (method == JUST_WORKS || method == JUST_CFM)
+ return SMP_AUTH_REQUIREMENTS;
+ }
+
+ key_size = min(req->max_key_size, rsp.max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], &rsp, sizeof(rsp));
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp);
+
+ clear_bit(SMP_FLAG_INITIATOR, &smp->flags);
+
+ /* Strictly speaking we shouldn't allow Pairing Confirm for the
+ * SC case, however some implementations incorrectly copy RFU auth
+ * req bits from our security request, which may create a false
+ * positive SC enablement.
+ */
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY);
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ /* Wait for Public Key from Initiating Device */
+ return 0;
+ }
+
+ /* Request setup of TK */
+ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ return 0;
+}
+
+static u8 sc_send_public_key(struct smp_chan *smp)
+{
+ struct hci_dev *hdev = smp->conn->hcon->hdev;
+
+ BT_DBG("");
+
+ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
+ struct l2cap_chan *chan = hdev->smp_data;
+ struct smp_dev *smp_dev;
+
+ if (!chan || !chan->data)
+ return SMP_UNSPECIFIED;
+
+ smp_dev = chan->data;
+
+ memcpy(smp->local_pk, smp_dev->local_pk, 64);
+ memcpy(smp->local_sk, smp_dev->local_sk, 32);
+ memcpy(smp->lr, smp_dev->local_rand, 16);
+
+ if (smp_dev->debug_key)
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ goto done;
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ BT_DBG("Using debug keys");
+ memcpy(smp->local_pk, debug_pk, 64);
+ memcpy(smp->local_sk, debug_sk, 32);
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+ } else {
+ while (true) {
+ /* Generate local key pair for Secure Connections */
+ if (!ecc_make_key(smp->local_pk, smp->local_sk))
+ return SMP_UNSPECIFIED;
+
+ /* This is unlikely, but we need to check that
+ * we didn't accidentially generate a debug key.
+ */
+ if (memcmp(smp->local_sk, debug_sk, 32))
+ break;
+ }
+ }
+
+done:
+ SMP_DBG("Local Public Key X: %32phN", smp->local_pk);
+ SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32);
+ SMP_DBG("Local Private Key: %32phN", smp->local_sk);
+
+ smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_pairing *req, *rsp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ u8 key_size, auth;
+ int ret;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*rsp))
+ return SMP_INVALID_PARAMS;
+
+ if (conn->hcon->role != HCI_ROLE_MASTER)
+ return SMP_CMD_NOTSUPP;
+
+ skb_pull(skb, sizeof(*rsp));
+
+ req = (void *) &smp->preq[1];
+
+ key_size = min(req->max_key_size, rsp->max_key_size);
+ if (check_enc_key_size(conn, key_size))
+ return SMP_ENC_KEY_SIZE;
+
+ auth = rsp->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ /* If the remote side's OOB flag is set it means it has
+ * successfully received our local OOB data - therefore set the
+ * flag to indicate that local OOB is in use.
+ */
+ if (rsp->oob_flag == SMP_OOB_PRESENT)
+ set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
+
+ smp->prsp[0] = SMP_CMD_PAIRING_RSP;
+ memcpy(&smp->prsp[1], rsp, sizeof(*rsp));
+
+ /* Update remote key distribution in case the remote cleared
+ * some bits that we had enabled in our request.
+ */
+ smp->remote_key_dist &= rsp->resp_key_dist;
+
+ /* For BR/EDR this means we're done and can start phase 3 */
+ if (conn->hcon->type == ACL_LINK) {
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ smp_distribute_keys(smp);
+ return 0;
+ }
+
+ if ((req->auth_req & SMP_AUTH_SC) && (auth & SMP_AUTH_SC))
+ set_bit(SMP_FLAG_SC, &smp->flags);
+ else if (conn->hcon->pending_sec_level > BT_SECURITY_HIGH)
+ conn->hcon->pending_sec_level = BT_SECURITY_HIGH;
+
+ /* If we need MITM check that it can be achieved */
+ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) {
+ u8 method;
+
+ method = get_auth_method(smp, req->io_capability,
+ rsp->io_capability);
+ if (method == JUST_WORKS || method == JUST_CFM)
+ return SMP_AUTH_REQUIREMENTS;
+ }
+
+ get_random_bytes(smp->prnd, sizeof(smp->prnd));
+
+ /* Update remote key distribution in case the remote cleared
+ * some bits that we had enabled in our request.
+ */
+ smp->remote_key_dist &= rsp->resp_key_dist;
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ /* Clear bits which are generated but not distributed */
+ smp->remote_key_dist &= ~SMP_SC_NO_DIST;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY);
+ return sc_send_public_key(smp);
+ }
+
+ auth |= req->auth_req;
+
+ ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability);
+ if (ret)
+ return SMP_UNSPECIFIED;
+
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ /* Can't compose response until we have been confirmed */
+ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags))
+ return smp_confirm(smp);
+
+ return 0;
+}
+
+static u8 sc_check_confirm(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+
+ BT_DBG("");
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (conn->hcon->out) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+ }
+
+ return 0;
+}
+
+/* Work-around for some implementations that incorrectly copy RFU bits
+ * from our security request and thereby create the impression that
+ * we're doing SC when in fact the remote doesn't support it.
+ */
+static int fixup_sc_false_positive(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing *req, *rsp;
+ u8 auth;
+
+ /* The issue is only observed when we're in slave role */
+ if (hcon->out)
+ return SMP_UNSPECIFIED;
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+ BT_ERR("Refusing SMP SC -> legacy fallback in SC-only mode");
+ return SMP_UNSPECIFIED;
+ }
+
+ BT_ERR("Trying to fall back to legacy SMP");
+
+ req = (void *) &smp->preq[1];
+ rsp = (void *) &smp->prsp[1];
+
+ /* Rebuild key dist flags which may have been cleared for SC */
+ smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist);
+
+ auth = req->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) {
+ BT_ERR("Failed to fall back to legacy SMP");
+ return SMP_UNSPECIFIED;
+ }
+
+ clear_bit(SMP_FLAG_SC, &smp->flags);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
+
+ if (skb->len < sizeof(smp->pcnf))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf));
+ skb_pull(skb, sizeof(smp->pcnf));
+
+ if (test_bit(SMP_FLAG_SC, &smp->flags)) {
+ int ret;
+
+ /* Public Key exchange must happen before any other steps */
+ if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags))
+ return sc_check_confirm(smp);
+
+ BT_ERR("Unexpected SMP Pairing Confirm");
+
+ ret = fixup_sc_false_positive(smp);
+ if (ret)
+ return ret;
+ }
+
+ if (conn->hcon->out) {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+ return 0;
+ }
+
+ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags))
+ return smp_confirm(smp);
+
+ set_bit(SMP_FLAG_CFM_PENDING, &smp->flags);
+
+ return 0;
+}
+
+static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ u8 *pkax, *pkbx, *na, *nb;
+ u32 passkey;
+ int err;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(smp->rrnd))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd));
+ skb_pull(skb, sizeof(smp->rrnd));
+
+ if (!test_bit(SMP_FLAG_SC, &smp->flags))
+ return smp_random(smp);
+
+ if (hcon->out) {
+ pkax = smp->local_pk;
+ pkbx = smp->remote_pk;
+ na = smp->prnd;
+ nb = smp->rrnd;
+ } else {
+ pkax = smp->remote_pk;
+ pkbx = smp->local_pk;
+ na = smp->rrnd;
+ nb = smp->prnd;
+ }
+
+ if (smp->method == REQ_OOB) {
+ if (!hcon->out)
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ goto mackey_and_ltk;
+ }
+
+ /* Passkey entry has special treatment */
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM);
+
+ if (hcon->out) {
+ u8 cfm[16];
+
+ err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
+ smp->rrnd, 0, cfm);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (memcmp(smp->pcnf, cfm, 16))
+ return SMP_CONFIRM_FAILED;
+ } else {
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+ smp->prnd);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ }
+
+mackey_and_ltk:
+ /* Generate MacKey and LTK */
+ err = sc_mackey_and_ltk(smp, smp->mackey, smp->tk);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (smp->method == JUST_WORKS || smp->method == REQ_OOB) {
+ if (hcon->out) {
+ sc_dhkey_check(smp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+ }
+ return 0;
+ }
+
+ err = smp_g2(smp->tfm_cmac, pkax, pkbx, na, nb, &passkey);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type,
+ hcon->dst_type, passkey, 0);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+
+ return 0;
+}
+
+static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level)
+{
+ struct smp_ltk *key;
+ struct hci_conn *hcon = conn->hcon;
+
+ key = hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role);
+ if (!key)
+ return false;
+
+ if (smp_ltk_sec_level(key) < sec_level)
+ return false;
+
+ if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
+ return true;
+
+ hci_le_start_enc(hcon, key->ediv, key->rand, key->val);
+ hcon->enc_key_size = key->enc_size;
+
+ /* We never store STKs for master role, so clear this flag */
+ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
+
+ return true;
+}
+
+bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
+ enum smp_key_pref key_pref)
+{
+ if (sec_level == BT_SECURITY_LOW)
+ return true;
+
+ /* If we're encrypted with an STK but the caller prefers using
+ * LTK claim insufficient security. This way we allow the
+ * connection to be re-encrypted with an LTK, even if the LTK
+ * provides the same level of security. Only exception is if we
+ * don't have an LTK (e.g. because of key distribution bits).
+ */
+ if (key_pref == SMP_USE_LTK &&
+ test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) &&
+ hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role))
+ return false;
+
+ if (hcon->sec_level >= sec_level)
+ return true;
+
+ return false;
+}
+
+static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_security_req *rp = (void *) skb->data;
+ struct smp_cmd_pairing cp;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_chan *smp;
+ u8 sec_level, auth;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ if (hcon->role != HCI_ROLE_MASTER)
+ return SMP_CMD_NOTSUPP;
+
+ auth = rp->auth_req & AUTH_REQ_MASK(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC))
+ return SMP_AUTH_REQUIREMENTS;
+
+ if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
+ sec_level = BT_SECURITY_MEDIUM;
+ else
+ sec_level = authreq_to_seclevel(auth);
+
+ if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK))
+ return 0;
+
+ if (sec_level > hcon->pending_sec_level)
+ hcon->pending_sec_level = sec_level;
+
+ if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
+ return 0;
+
+ smp = smp_chan_create(conn);
+ if (!smp)
+ return SMP_UNSPECIFIED;
+
+ if (!hci_dev_test_flag(hdev, HCI_BONDABLE) &&
+ (auth & SMP_AUTH_BONDING))
+ return SMP_PAIRING_NOTSUPP;
+
+ skb_pull(skb, sizeof(*rp));
+
+ memset(&cp, 0, sizeof(cp));
+ build_pairing_cmd(conn, &cp, NULL, auth);
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], &cp, sizeof(cp));
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+
+ return 0;
+}
+
+int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
+{
+ struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_chan *chan;
+ struct smp_chan *smp;
+ __u8 authreq;
+ int ret;
+
+ BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level);
+
+ /* This may be NULL if there's an unexpected disconnection */
+ if (!conn)
+ return 1;
+
+ chan = conn->smp;
+
+ if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED))
+ return 1;
+
+ if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK))
+ return 1;
+
+ if (sec_level > hcon->pending_sec_level)
+ hcon->pending_sec_level = sec_level;
+
+ if (hcon->role == HCI_ROLE_MASTER)
+ if (smp_ltk_encrypt(conn, hcon->pending_sec_level))
+ return 0;
+
+ l2cap_chan_lock(chan);
+
+ /* If SMP is already in progress ignore this request */
+ if (chan->data) {
+ ret = 0;
+ goto unlock;
+ }
+
+ smp = smp_chan_create(conn);
+ if (!smp) {
+ ret = 1;
+ goto unlock;
+ }
+
+ authreq = seclevel_to_authreq(sec_level);
+
+ if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED))
+ authreq |= SMP_AUTH_SC;
+
+ /* Require MITM if IO Capability allows or the security level
+ * requires it.
+ */
+ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT ||
+ hcon->pending_sec_level > BT_SECURITY_MEDIUM)
+ authreq |= SMP_AUTH_MITM;
+
+ if (hcon->role == HCI_ROLE_MASTER) {
+ struct smp_cmd_pairing cp;
+
+ build_pairing_cmd(conn, &cp, NULL, authreq);
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], &cp, sizeof(cp));
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+ } else {
+ struct smp_cmd_security_req cp;
+ cp.auth_req = authreq;
+ smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
+ }
+
+ set_bit(SMP_FLAG_INITIATOR, &smp->flags);
+ ret = 0;
+
+unlock:
+ l2cap_chan_unlock(chan);
+ return ret;
+}
+
+static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_encrypt_info *rp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT);
+
+ skb_pull(skb, sizeof(*rp));
+
+ memcpy(smp->tk, rp->ltk, sizeof(smp->tk));
+
+ return 0;
+}
+
+static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_master_ident *rp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = conn->hcon->hdev;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_ltk *ltk;
+ u8 authenticated;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY;
+
+ if (smp->remote_key_dist & SMP_DIST_ID_KEY)
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO);
+ else if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+
+ skb_pull(skb, sizeof(*rp));
+
+ authenticated = (hcon->sec_level == BT_SECURITY_HIGH);
+ ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK,
+ authenticated, smp->tk, smp->enc_key_size,
+ rp->ediv, rp->rand);
+ smp->ltk = ltk;
+ if (!(smp->remote_key_dist & KEY_DIST_MASK))
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_ident_info *info = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+
+ BT_DBG("");
+
+ if (skb->len < sizeof(*info))
+ return SMP_INVALID_PARAMS;
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO);
+
+ skb_pull(skb, sizeof(*info));
+
+ memcpy(smp->irk, info->irk, 16);
+
+ return 0;
+}
+
+static int smp_cmd_ident_addr_info(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct smp_cmd_ident_addr_info *info = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ bdaddr_t rpa;
+
+ BT_DBG("");
+
+ if (skb->len < sizeof(*info))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_ID_KEY;
+
+ if (smp->remote_key_dist & SMP_DIST_SIGN)
+ SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO);
+
+ skb_pull(skb, sizeof(*info));
+
+ /* Strictly speaking the Core Specification (4.1) allows sending
+ * an empty address which would force us to rely on just the IRK
+ * as "identity information". However, since such
+ * implementations are not known of and in order to not over
+ * complicate our implementation, simply pretend that we never
+ * received an IRK for such a device.
+ *
+ * The Identity Address must also be a Static Random or Public
+ * Address, which hci_is_identity_address() checks for.
+ */
+ if (!bacmp(&info->bdaddr, BDADDR_ANY) ||
+ !hci_is_identity_address(&info->bdaddr, info->addr_type)) {
+ BT_ERR("Ignoring IRK with no identity address");
+ goto distribute;
+ }
+
+ bacpy(&smp->id_addr, &info->bdaddr);
+ smp->id_addr_type = info->addr_type;
+
+ if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type))
+ bacpy(&rpa, &hcon->dst);
+ else
+ bacpy(&rpa, BDADDR_ANY);
+
+ smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr,
+ smp->id_addr_type, smp->irk, &rpa);
+
+distribute:
+ if (!(smp->remote_key_dist & KEY_DIST_MASK))
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_sign_info *rp = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct smp_csrk *csrk;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*rp))
+ return SMP_INVALID_PARAMS;
+
+ /* Mark the information as received */
+ smp->remote_key_dist &= ~SMP_DIST_SIGN;
+
+ skb_pull(skb, sizeof(*rp));
+
+ csrk = kzalloc(sizeof(*csrk), GFP_KERNEL);
+ if (csrk) {
+ if (conn->hcon->sec_level > BT_SECURITY_MEDIUM)
+ csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED;
+ else
+ csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED;
+ memcpy(csrk->val, rp->csrk, sizeof(csrk->val));
+ }
+ smp->csrk = csrk;
+ smp_distribute_keys(smp);
+
+ return 0;
+}
+
+static u8 sc_select_method(struct smp_chan *smp)
+{
+ struct l2cap_conn *conn = smp->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_cmd_pairing *local, *remote;
+ u8 local_mitm, remote_mitm, local_io, remote_io, method;
+
+ if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) ||
+ test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags))
+ return REQ_OOB;
+
+ /* The preq/prsp contain the raw Pairing Request/Response PDUs
+ * which are needed as inputs to some crypto functions. To get
+ * the "struct smp_cmd_pairing" from them we need to skip the
+ * first byte which contains the opcode.
+ */
+ if (hcon->out) {
+ local = (void *) &smp->preq[1];
+ remote = (void *) &smp->prsp[1];
+ } else {
+ local = (void *) &smp->prsp[1];
+ remote = (void *) &smp->preq[1];
+ }
+
+ local_io = local->io_capability;
+ remote_io = remote->io_capability;
+
+ local_mitm = (local->auth_req & SMP_AUTH_MITM);
+ remote_mitm = (remote->auth_req & SMP_AUTH_MITM);
+
+ /* If either side wants MITM, look up the method from the table,
+ * otherwise use JUST WORKS.
+ */
+ if (local_mitm || remote_mitm)
+ method = get_auth_method(smp, local_io, remote_io);
+ else
+ method = JUST_WORKS;
+
+ /* Don't confirm locally initiated pairing attempts */
+ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+ method = JUST_WORKS;
+
+ return method;
+}
+
+static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_public_key *key = (void *) skb->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct l2cap_chan *chan = conn->smp;
+ struct smp_chan *smp = chan->data;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing_confirm cfm;
+ int err;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*key))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(smp->remote_pk, key, 64);
+
+ if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) {
+ err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk,
+ smp->rr, 0, cfm.confirm_val);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (memcmp(cfm.confirm_val, smp->pcnf, 16))
+ return SMP_CONFIRM_FAILED;
+ }
+
+ /* Non-initiating device sends its public key after receiving
+ * the key from the initiating device.
+ */
+ if (!hcon->out) {
+ err = sc_send_public_key(smp);
+ if (err)
+ return err;
+ }
+
+ SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk);
+ SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32);
+
+ if (!ecdh_shared_secret(smp->remote_pk, smp->local_sk, smp->dhkey))
+ return SMP_UNSPECIFIED;
+
+ SMP_DBG("DHKey %32phN", smp->dhkey);
+
+ set_bit(SMP_FLAG_REMOTE_PK, &smp->flags);
+
+ smp->method = sc_select_method(smp);
+
+ BT_DBG("%s selected method 0x%02x", hdev->name, smp->method);
+
+ /* JUST_WORKS and JUST_CFM result in an unauthenticated key */
+ if (smp->method == JUST_WORKS || smp->method == JUST_CFM)
+ hcon->pending_sec_level = BT_SECURITY_MEDIUM;
+ else
+ hcon->pending_sec_level = BT_SECURITY_FIPS;
+
+ if (!memcmp(debug_pk, smp->remote_pk, 64))
+ set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
+
+ if (smp->method == DSP_PASSKEY) {
+ get_random_bytes(&hcon->passkey_notify,
+ sizeof(hcon->passkey_notify));
+ hcon->passkey_notify %= 1000000;
+ hcon->passkey_entered = 0;
+ smp->passkey_round = 0;
+ if (mgmt_user_passkey_notify(hdev, &hcon->dst, hcon->type,
+ hcon->dst_type,
+ hcon->passkey_notify,
+ hcon->passkey_entered))
+ return SMP_UNSPECIFIED;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ return sc_passkey_round(smp, SMP_CMD_PUBLIC_KEY);
+ }
+
+ if (smp->method == REQ_OOB) {
+ if (hcon->out)
+ smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+ sizeof(smp->prnd), smp->prnd);
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+ }
+
+ if (hcon->out)
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+
+ if (smp->method == REQ_PASSKEY) {
+ if (mgmt_user_passkey_request(hdev, &hcon->dst, hcon->type,
+ hcon->dst_type))
+ return SMP_UNSPECIFIED;
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
+ return 0;
+ }
+
+ /* The Initiating device waits for the non-initiating device to
+ * send the confirm value.
+ */
+ if (conn->hcon->out)
+ return 0;
+
+ err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd,
+ 0, cfm.confirm_val);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+
+ return 0;
+}
+
+static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct smp_cmd_dhkey_check *check = (void *) skb->data;
+ struct l2cap_chan *chan = conn->smp;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_chan *smp = chan->data;
+ u8 a[7], b[7], *local_addr, *remote_addr;
+ u8 io_cap[3], r[16], e[16];
+ int err;
+
+ BT_DBG("conn %p", conn);
+
+ if (skb->len < sizeof(*check))
+ return SMP_INVALID_PARAMS;
+
+ memcpy(a, &hcon->init_addr, 6);
+ memcpy(b, &hcon->resp_addr, 6);
+ a[6] = hcon->init_addr_type;
+ b[6] = hcon->resp_addr_type;
+
+ if (hcon->out) {
+ local_addr = a;
+ remote_addr = b;
+ memcpy(io_cap, &smp->prsp[1], 3);
+ } else {
+ local_addr = b;
+ remote_addr = a;
+ memcpy(io_cap, &smp->preq[1], 3);
+ }
+
+ memset(r, 0, sizeof(r));
+
+ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+ put_unaligned_le32(hcon->passkey_notify, r);
+ else if (smp->method == REQ_OOB)
+ memcpy(r, smp->lr, 16);
+
+ err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r,
+ io_cap, remote_addr, local_addr, e);
+ if (err)
+ return SMP_UNSPECIFIED;
+
+ if (memcmp(check->e, e, 16))
+ return SMP_DHKEY_CHECK_FAILED;
+
+ if (!hcon->out) {
+ if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
+ set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags);
+ return 0;
+ }
+
+ /* Slave sends DHKey check as response to master */
+ sc_dhkey_check(smp);
+ }
+
+ sc_add_ltk(smp);
+
+ if (hcon->out) {
+ hci_le_start_enc(hcon, 0, 0, smp->tk);
+ hcon->enc_key_size = smp->enc_key_size;
+ }
+
+ return 0;
+}
+
+static int smp_cmd_keypress_notify(struct l2cap_conn *conn,
+ struct sk_buff *skb)
+{
+ struct smp_cmd_keypress_notify *kp = (void *) skb->data;
+
+ BT_DBG("value 0x%02x", kp->value);
+
+ return 0;
+}
+
+static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct smp_chan *smp;
+ __u8 code, reason;
+ int err = 0;
+
+ if (skb->len < 1)
+ return -EILSEQ;
+
+ if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) {
+ reason = SMP_PAIRING_NOTSUPP;
+ goto done;
+ }
+
+ code = skb->data[0];
+ skb_pull(skb, sizeof(code));
+
+ smp = chan->data;
+
+ if (code > SMP_CMD_MAX)
+ goto drop;
+
+ if (smp && !test_and_clear_bit(code, &smp->allow_cmd))
+ goto drop;
+
+ /* If we don't have a context the only allowed commands are
+ * pairing request and security request.
+ */
+ if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ)
+ goto drop;
+
+ switch (code) {
+ case SMP_CMD_PAIRING_REQ:
+ reason = smp_cmd_pairing_req(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_FAIL:
+ smp_failure(conn, 0);
+ err = -EPERM;
+ break;
+
+ case SMP_CMD_PAIRING_RSP:
+ reason = smp_cmd_pairing_rsp(conn, skb);
+ break;
+
+ case SMP_CMD_SECURITY_REQ:
+ reason = smp_cmd_security_req(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_CONFIRM:
+ reason = smp_cmd_pairing_confirm(conn, skb);
+ break;
+
+ case SMP_CMD_PAIRING_RANDOM:
+ reason = smp_cmd_pairing_random(conn, skb);
+ break;
+
+ case SMP_CMD_ENCRYPT_INFO:
+ reason = smp_cmd_encrypt_info(conn, skb);
+ break;
+
+ case SMP_CMD_MASTER_IDENT:
+ reason = smp_cmd_master_ident(conn, skb);
+ break;
+
+ case SMP_CMD_IDENT_INFO:
+ reason = smp_cmd_ident_info(conn, skb);
+ break;
+
+ case SMP_CMD_IDENT_ADDR_INFO:
+ reason = smp_cmd_ident_addr_info(conn, skb);
+ break;
+
+ case SMP_CMD_SIGN_INFO:
+ reason = smp_cmd_sign_info(conn, skb);
+ break;
+
+ case SMP_CMD_PUBLIC_KEY:
+ reason = smp_cmd_public_key(conn, skb);
+ break;
+
+ case SMP_CMD_DHKEY_CHECK:
+ reason = smp_cmd_dhkey_check(conn, skb);
+ break;
+
+ case SMP_CMD_KEYPRESS_NOTIFY:
+ reason = smp_cmd_keypress_notify(conn, skb);
+ break;
+
+ default:
+ BT_DBG("Unknown command code 0x%2.2x", code);
+ reason = SMP_CMD_NOTSUPP;
+ goto done;
+ }
+
+done:
+ if (!err) {
+ if (reason)
+ smp_failure(conn, reason);
+ kfree_skb(skb);
+ }
+
+ return err;
+
+drop:
+ BT_ERR("%s unexpected SMP command 0x%02x from %pMR", hcon->hdev->name,
+ code, &hcon->dst);
+ kfree_skb(skb);
+ return 0;
+}
+
+static void smp_teardown_cb(struct l2cap_chan *chan, int err)
+{
+ struct l2cap_conn *conn = chan->conn;
+
+ BT_DBG("chan %p", chan);
+
+ if (chan->data)
+ smp_chan_destroy(conn);
+
+ conn->smp = NULL;
+ l2cap_chan_put(chan);
+}
+
+static void bredr_pairing(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
+ struct smp_cmd_pairing req;
+ struct smp_chan *smp;
+
+ BT_DBG("chan %p", chan);
+
+ /* Only new pairings are interesting */
+ if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags))
+ return;
+
+ /* Don't bother if we're not encrypted */
+ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ return;
+
+ /* Only master may initiate SMP over BR/EDR */
+ if (hcon->role != HCI_ROLE_MASTER)
+ return;
+
+ /* Secure Connections support must be enabled */
+ if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED))
+ return;
+
+ /* BR/EDR must use Secure Connections for SMP */
+ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) &&
+ !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return;
+
+ /* If our LE support is not enabled don't do anything */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ /* Don't bother if remote LE support is not enabled */
+ if (!lmp_host_le_capable(hcon))
+ return;
+
+ /* Remote must support SMP fixed chan for BR/EDR */
+ if (!(conn->remote_fixed_chan & L2CAP_FC_SMP_BREDR))
+ return;
+
+ /* Don't bother if SMP is already ongoing */
+ if (chan->data)
+ return;
+
+ smp = smp_chan_create(conn);
+ if (!smp) {
+ BT_ERR("%s unable to create SMP context for BR/EDR",
+ hdev->name);
+ return;
+ }
+
+ set_bit(SMP_FLAG_SC, &smp->flags);
+
+ BT_DBG("%s starting SMP over BR/EDR", hdev->name);
+
+ /* Prepare and send the BR/EDR SMP Pairing Request */
+ build_bredr_pairing_cmd(smp, &req, NULL);
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], &req, sizeof(req));
+
+ smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(req), &req);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+}
+
+static void smp_resume_cb(struct l2cap_chan *chan)
+{
+ struct smp_chan *smp = chan->data;
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+
+ BT_DBG("chan %p", chan);
+
+ if (hcon->type == ACL_LINK) {
+ bredr_pairing(chan);
+ return;
+ }
+
+ if (!smp)
+ return;
+
+ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ return;
+
+ cancel_delayed_work(&smp->security_timer);
+
+ smp_distribute_keys(smp);
+}
+
+static void smp_ready_cb(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct hci_conn *hcon = conn->hcon;
+
+ BT_DBG("chan %p", chan);
+
+ conn->smp = chan;
+ l2cap_chan_hold(chan);
+
+ if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+ bredr_pairing(chan);
+}
+
+static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ BT_DBG("chan %p", chan);
+
+ err = smp_sig_channel(chan, skb);
+ if (err) {
+ struct smp_chan *smp = chan->data;
+
+ if (smp)
+ cancel_delayed_work_sync(&smp->security_timer);
+
+ hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE);
+ }
+
+ return err;
+}
+
+static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan,
+ unsigned long hdr_len,
+ unsigned long len, int nb)
+{
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb->priority = HCI_PRIO_MAX;
+ bt_cb(skb)->l2cap.chan = chan;
+
+ return skb;
+}
+
+static const struct l2cap_ops smp_chan_ops = {
+ .name = "Security Manager",
+ .ready = smp_ready_cb,
+ .recv = smp_recv_cb,
+ .alloc_skb = smp_alloc_skb_cb,
+ .teardown = smp_teardown_cb,
+ .resume = smp_resume_cb,
+
+ .new_connection = l2cap_chan_no_new_connection,
+ .state_change = l2cap_chan_no_state_change,
+ .close = l2cap_chan_no_close,
+ .defer = l2cap_chan_no_defer,
+ .suspend = l2cap_chan_no_suspend,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+ .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
+};
+
+static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan)
+{
+ struct l2cap_chan *chan;
+
+ BT_DBG("pchan %p", pchan);
+
+ chan = l2cap_chan_create();
+ if (!chan)
+ return NULL;
+
+ chan->chan_type = pchan->chan_type;
+ chan->ops = &smp_chan_ops;
+ chan->scid = pchan->scid;
+ chan->dcid = chan->scid;
+ chan->imtu = pchan->imtu;
+ chan->omtu = pchan->omtu;
+ chan->mode = pchan->mode;
+
+ /* Other L2CAP channels may request SMP routines in order to
+ * change the security level. This means that the SMP channel
+ * lock must be considered in its own category to avoid lockdep
+ * warnings.
+ */
+ atomic_set(&chan->nesting, L2CAP_NESTING_SMP);
+
+ BT_DBG("created chan %p", chan);
+
+ return chan;
+}
+
+static const struct l2cap_ops smp_root_chan_ops = {
+ .name = "Security Manager Root",
+ .new_connection = smp_new_conn_cb,
+
+ /* None of these are implemented for the root channel */
+ .close = l2cap_chan_no_close,
+ .alloc_skb = l2cap_chan_no_alloc_skb,
+ .recv = l2cap_chan_no_recv,
+ .state_change = l2cap_chan_no_state_change,
+ .teardown = l2cap_chan_no_teardown,
+ .ready = l2cap_chan_no_ready,
+ .defer = l2cap_chan_no_defer,
+ .suspend = l2cap_chan_no_suspend,
+ .resume = l2cap_chan_no_resume,
+ .set_shutdown = l2cap_chan_no_set_shutdown,
+ .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
+};
+
+static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
+{
+ struct l2cap_chan *chan;
+ struct smp_dev *smp;
+ struct crypto_blkcipher *tfm_aes;
+ struct crypto_hash *tfm_cmac;
+
+ if (cid == L2CAP_CID_SMP_BREDR) {
+ smp = NULL;
+ goto create_chan;
+ }
+
+ smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+ if (!smp)
+ return ERR_PTR(-ENOMEM);
+
+ tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm_aes)) {
+ BT_ERR("Unable to create ECB crypto context");
+ kzfree(smp);
+ return ERR_CAST(tfm_aes);
+ }
+
+ tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm_cmac)) {
+ BT_ERR("Unable to create CMAC crypto context");
+ crypto_free_blkcipher(tfm_aes);
+ kzfree(smp);
+ return ERR_CAST(tfm_cmac);
+ }
+
+ smp->tfm_aes = tfm_aes;
+ smp->tfm_cmac = tfm_cmac;
+
+create_chan:
+ chan = l2cap_chan_create();
+ if (!chan) {
+ if (smp) {
+ crypto_free_blkcipher(smp->tfm_aes);
+ crypto_free_hash(smp->tfm_cmac);
+ kzfree(smp);
+ }
+ return ERR_PTR(-ENOMEM);
+ }
+
+ chan->data = smp;
+
+ l2cap_add_scid(chan, cid);
+
+ l2cap_chan_set_defaults(chan);
+
+ if (cid == L2CAP_CID_SMP) {
+ u8 bdaddr_type;
+
+ hci_copy_identity_address(hdev, &chan->src, &bdaddr_type);
+
+ if (bdaddr_type == ADDR_LE_DEV_PUBLIC)
+ chan->src_type = BDADDR_LE_PUBLIC;
+ else
+ chan->src_type = BDADDR_LE_RANDOM;
+ } else {
+ bacpy(&chan->src, &hdev->bdaddr);
+ chan->src_type = BDADDR_BREDR;
+ }
+
+ chan->state = BT_LISTEN;
+ chan->mode = L2CAP_MODE_BASIC;
+ chan->imtu = L2CAP_DEFAULT_MTU;
+ chan->ops = &smp_root_chan_ops;
+
+ /* Set correct nesting level for a parent/listening channel */
+ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
+
+ return chan;
+}
+
+static void smp_del_chan(struct l2cap_chan *chan)
+{
+ struct smp_dev *smp;
+
+ BT_DBG("chan %p", chan);
+
+ smp = chan->data;
+ if (smp) {
+ chan->data = NULL;
+ if (smp->tfm_aes)
+ crypto_free_blkcipher(smp->tfm_aes);
+ if (smp->tfm_cmac)
+ crypto_free_hash(smp->tfm_cmac);
+ kzfree(smp);
+ }
+
+ l2cap_chan_put(chan);
+}
+
+static ssize_t force_bredr_smp_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y': 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t force_bredr_smp_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[32];
+ size_t buf_size = min(count, (sizeof(buf)-1));
+ bool enable;
+
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ if (strtobool(buf, &enable))
+ return -EINVAL;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
+ return -EALREADY;
+
+ if (enable) {
+ struct l2cap_chan *chan;
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR);
+ if (IS_ERR(chan))
+ return PTR_ERR(chan);
+
+ hdev->smp_bredr_data = chan;
+ } else {
+ struct l2cap_chan *chan;
+
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP);
+
+ return count;
+}
+
+static const struct file_operations force_bredr_smp_fops = {
+ .open = simple_open,
+ .read = force_bredr_smp_read,
+ .write = force_bredr_smp_write,
+ .llseek = default_llseek,
+};
+
+int smp_register(struct hci_dev *hdev)
+{
+ struct l2cap_chan *chan;
+
+ BT_DBG("%s", hdev->name);
+
+ /* If the controller does not support Low Energy operation, then
+ * there is also no need to register any SMP channel.
+ */
+ if (!lmp_le_capable(hdev))
+ return 0;
+
+ if (WARN_ON(hdev->smp_data)) {
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP);
+ if (IS_ERR(chan))
+ return PTR_ERR(chan);
+
+ hdev->smp_data = chan;
+
+ /* If the controller does not support BR/EDR Secure Connections
+ * feature, then the BR/EDR SMP channel shall not be present.
+ *
+ * To test this with Bluetooth 4.0 controllers, create a debugfs
+ * switch that allows forcing BR/EDR SMP support and accepting
+ * cross-transport pairing on non-AES encrypted connections.
+ */
+ if (!lmp_sc_capable(hdev)) {
+ debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
+ hdev, &force_bredr_smp_fops);
+ return 0;
+ }
+
+ if (WARN_ON(hdev->smp_bredr_data)) {
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR);
+ if (IS_ERR(chan)) {
+ int err = PTR_ERR(chan);
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ return err;
+ }
+
+ hdev->smp_bredr_data = chan;
+
+ return 0;
+}
+
+void smp_unregister(struct hci_dev *hdev)
+{
+ struct l2cap_chan *chan;
+
+ if (hdev->smp_bredr_data) {
+ chan = hdev->smp_bredr_data;
+ hdev->smp_bredr_data = NULL;
+ smp_del_chan(chan);
+ }
+
+ if (hdev->smp_data) {
+ chan = hdev->smp_data;
+ hdev->smp_data = NULL;
+ smp_del_chan(chan);
+ }
+}
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
+
+static int __init test_ah(struct crypto_blkcipher *tfm_aes)
+{
+ const u8 irk[16] = {
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 r[3] = { 0x94, 0x81, 0x70 };
+ const u8 exp[3] = { 0xaa, 0xfb, 0x0d };
+ u8 res[3];
+ int err;
+
+ err = smp_ah(tfm_aes, irk, r, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 3))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_c1(struct crypto_blkcipher *tfm_aes)
+{
+ const u8 k[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ const u8 r[16] = {
+ 0xe0, 0x2e, 0x70, 0xc6, 0x4e, 0x27, 0x88, 0x63,
+ 0x0e, 0x6f, 0xad, 0x56, 0x21, 0xd5, 0x83, 0x57 };
+ const u8 preq[7] = { 0x01, 0x01, 0x00, 0x00, 0x10, 0x07, 0x07 };
+ const u8 pres[7] = { 0x02, 0x03, 0x00, 0x00, 0x08, 0x00, 0x05 };
+ const u8 _iat = 0x01;
+ const u8 _rat = 0x00;
+ const bdaddr_t ra = { { 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1 } };
+ const bdaddr_t ia = { { 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1 } };
+ const u8 exp[16] = {
+ 0x86, 0x3b, 0xf1, 0xbe, 0xc5, 0x4d, 0xa7, 0xd2,
+ 0xea, 0x88, 0x89, 0x87, 0xef, 0x3f, 0x1e, 0x1e };
+ u8 res[16];
+ int err;
+
+ err = smp_c1(tfm_aes, k, r, preq, pres, _iat, &ia, _rat, &ra, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_s1(struct crypto_blkcipher *tfm_aes)
+{
+ const u8 k[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ const u8 r1[16] = {
+ 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 };
+ const u8 r2[16] = {
+ 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99 };
+ const u8 exp[16] = {
+ 0x62, 0xa0, 0x6d, 0x79, 0xae, 0x16, 0x42, 0x5b,
+ 0x9b, 0xf4, 0xb0, 0xe8, 0xf0, 0xe1, 0x1f, 0x9a };
+ u8 res[16];
+ int err;
+
+ err = smp_s1(tfm_aes, k, r1, r2, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f4(struct crypto_hash *tfm_cmac)
+{
+ const u8 u[32] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 };
+ const u8 v[32] = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 };
+ const u8 x[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 z = 0x00;
+ const u8 exp[16] = {
+ 0x2d, 0x87, 0x74, 0xa9, 0xbe, 0xa1, 0xed, 0xf1,
+ 0x1c, 0xbd, 0xa9, 0x07, 0xf1, 0x16, 0xc9, 0xf2 };
+ u8 res[16];
+ int err;
+
+ err = smp_f4(tfm_cmac, u, v, x, z, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f5(struct crypto_hash *tfm_cmac)
+{
+ const u8 w[32] = {
+ 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
+ 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99,
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 n1[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 n2[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 };
+ const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 };
+ const u8 exp_ltk[16] = {
+ 0x38, 0x0a, 0x75, 0x94, 0xb5, 0x22, 0x05, 0x98,
+ 0x23, 0xcd, 0xd7, 0x69, 0x11, 0x79, 0x86, 0x69 };
+ const u8 exp_mackey[16] = {
+ 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
+ 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 };
+ u8 mackey[16], ltk[16];
+ int err;
+
+ err = smp_f5(tfm_cmac, w, n1, n2, a1, a2, mackey, ltk);
+ if (err)
+ return err;
+
+ if (memcmp(mackey, exp_mackey, 16))
+ return -EINVAL;
+
+ if (memcmp(ltk, exp_ltk, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_f6(struct crypto_hash *tfm_cmac)
+{
+ const u8 w[16] = {
+ 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
+ 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 };
+ const u8 n1[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 n2[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u8 r[16] = {
+ 0xc8, 0x0f, 0x2d, 0x0c, 0xd2, 0x42, 0xda, 0x08,
+ 0x54, 0xbb, 0x53, 0xb4, 0x3b, 0x34, 0xa3, 0x12 };
+ const u8 io_cap[3] = { 0x02, 0x01, 0x01 };
+ const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 };
+ const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 };
+ const u8 exp[16] = {
+ 0x61, 0x8f, 0x95, 0xda, 0x09, 0x0b, 0x6c, 0xd2,
+ 0xc5, 0xe8, 0xd0, 0x9c, 0x98, 0x73, 0xc4, 0xe3 };
+ u8 res[16];
+ int err;
+
+ err = smp_f6(tfm_cmac, w, n1, n2, r, io_cap, a1, a2, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_g2(struct crypto_hash *tfm_cmac)
+{
+ const u8 u[32] = {
+ 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
+ 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef,
+ 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e,
+ 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 };
+ const u8 v[32] = {
+ 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b,
+ 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59,
+ 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90,
+ 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 };
+ const u8 x[16] = {
+ 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff,
+ 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 };
+ const u8 y[16] = {
+ 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21,
+ 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 };
+ const u32 exp_val = 0x2f9ed5ba % 1000000;
+ u32 val;
+ int err;
+
+ err = smp_g2(tfm_cmac, u, v, x, y, &val);
+ if (err)
+ return err;
+
+ if (val != exp_val)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init test_h6(struct crypto_hash *tfm_cmac)
+{
+ const u8 w[16] = {
+ 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
+ 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec };
+ const u8 key_id[4] = { 0x72, 0x62, 0x65, 0x6c };
+ const u8 exp[16] = {
+ 0x99, 0x63, 0xb1, 0x80, 0xe2, 0xa9, 0xd3, 0xe8,
+ 0x1c, 0xc9, 0x6d, 0xe7, 0x02, 0xe1, 0x9a, 0x2d };
+ u8 res[16];
+ int err;
+
+ err = smp_h6(tfm_cmac, w, key_id, res);
+ if (err)
+ return err;
+
+ if (memcmp(res, exp, 16))
+ return -EINVAL;
+
+ return 0;
+}
+
+static char test_smp_buffer[32];
+
+static ssize_t test_smp_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer,
+ strlen(test_smp_buffer));
+}
+
+static const struct file_operations test_smp_fops = {
+ .open = simple_open,
+ .read = test_smp_read,
+ .llseek = default_llseek,
+};
+
+static int __init run_selftests(struct crypto_blkcipher *tfm_aes,
+ struct crypto_hash *tfm_cmac)
+{
+ ktime_t calltime, delta, rettime;
+ unsigned long long duration;
+ int err;
+
+ calltime = ktime_get();
+
+ err = test_ah(tfm_aes);
+ if (err) {
+ BT_ERR("smp_ah test failed");
+ goto done;
+ }
+
+ err = test_c1(tfm_aes);
+ if (err) {
+ BT_ERR("smp_c1 test failed");
+ goto done;
+ }
+
+ err = test_s1(tfm_aes);
+ if (err) {
+ BT_ERR("smp_s1 test failed");
+ goto done;
+ }
+
+ err = test_f4(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f4 test failed");
+ goto done;
+ }
+
+ err = test_f5(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f5 test failed");
+ goto done;
+ }
+
+ err = test_f6(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_f6 test failed");
+ goto done;
+ }
+
+ err = test_g2(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_g2 test failed");
+ goto done;
+ }
+
+ err = test_h6(tfm_cmac);
+ if (err) {
+ BT_ERR("smp_h6 test failed");
+ goto done;
+ }
+
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+
+ BT_INFO("SMP test passed in %llu usecs", duration);
+
+done:
+ if (!err)
+ snprintf(test_smp_buffer, sizeof(test_smp_buffer),
+ "PASS (%llu usecs)\n", duration);
+ else
+ snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n");
+
+ debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL,
+ &test_smp_fops);
+
+ return err;
+}
+
+int __init bt_selftest_smp(void)
+{
+ struct crypto_blkcipher *tfm_aes;
+ struct crypto_hash *tfm_cmac;
+ int err;
+
+ tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm_aes)) {
+ BT_ERR("Unable to create ECB crypto context");
+ return PTR_ERR(tfm_aes);
+ }
+
+ tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm_cmac)) {
+ BT_ERR("Unable to create CMAC crypto context");
+ crypto_free_blkcipher(tfm_aes);
+ return PTR_ERR(tfm_cmac);
+ }
+
+ err = run_selftests(tfm_aes, tfm_cmac);
+
+ crypto_free_hash(tfm_cmac);
+ crypto_free_blkcipher(tfm_aes);
+
+ return err;
+}
+
+#endif
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
new file mode 100644
index 000000000..6cf872563
--- /dev/null
+++ b/net/bluetooth/smp.h
@@ -0,0 +1,209 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __SMP_H
+#define __SMP_H
+
+struct smp_command_hdr {
+ __u8 code;
+} __packed;
+
+#define SMP_CMD_PAIRING_REQ 0x01
+#define SMP_CMD_PAIRING_RSP 0x02
+struct smp_cmd_pairing {
+ __u8 io_capability;
+ __u8 oob_flag;
+ __u8 auth_req;
+ __u8 max_key_size;
+ __u8 init_key_dist;
+ __u8 resp_key_dist;
+} __packed;
+
+#define SMP_IO_DISPLAY_ONLY 0x00
+#define SMP_IO_DISPLAY_YESNO 0x01
+#define SMP_IO_KEYBOARD_ONLY 0x02
+#define SMP_IO_NO_INPUT_OUTPUT 0x03
+#define SMP_IO_KEYBOARD_DISPLAY 0x04
+
+#define SMP_OOB_NOT_PRESENT 0x00
+#define SMP_OOB_PRESENT 0x01
+
+#define SMP_DIST_ENC_KEY 0x01
+#define SMP_DIST_ID_KEY 0x02
+#define SMP_DIST_SIGN 0x04
+#define SMP_DIST_LINK_KEY 0x08
+
+#define SMP_AUTH_NONE 0x00
+#define SMP_AUTH_BONDING 0x01
+#define SMP_AUTH_MITM 0x04
+#define SMP_AUTH_SC 0x08
+#define SMP_AUTH_KEYPRESS 0x10
+
+#define SMP_CMD_PAIRING_CONFIRM 0x03
+struct smp_cmd_pairing_confirm {
+ __u8 confirm_val[16];
+} __packed;
+
+#define SMP_CMD_PAIRING_RANDOM 0x04
+struct smp_cmd_pairing_random {
+ __u8 rand_val[16];
+} __packed;
+
+#define SMP_CMD_PAIRING_FAIL 0x05
+struct smp_cmd_pairing_fail {
+ __u8 reason;
+} __packed;
+
+#define SMP_CMD_ENCRYPT_INFO 0x06
+struct smp_cmd_encrypt_info {
+ __u8 ltk[16];
+} __packed;
+
+#define SMP_CMD_MASTER_IDENT 0x07
+struct smp_cmd_master_ident {
+ __le16 ediv;
+ __le64 rand;
+} __packed;
+
+#define SMP_CMD_IDENT_INFO 0x08
+struct smp_cmd_ident_info {
+ __u8 irk[16];
+} __packed;
+
+#define SMP_CMD_IDENT_ADDR_INFO 0x09
+struct smp_cmd_ident_addr_info {
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+} __packed;
+
+#define SMP_CMD_SIGN_INFO 0x0a
+struct smp_cmd_sign_info {
+ __u8 csrk[16];
+} __packed;
+
+#define SMP_CMD_SECURITY_REQ 0x0b
+struct smp_cmd_security_req {
+ __u8 auth_req;
+} __packed;
+
+#define SMP_CMD_PUBLIC_KEY 0x0c
+struct smp_cmd_public_key {
+ __u8 x[32];
+ __u8 y[32];
+} __packed;
+
+#define SMP_CMD_DHKEY_CHECK 0x0d
+struct smp_cmd_dhkey_check {
+ __u8 e[16];
+} __packed;
+
+#define SMP_CMD_KEYPRESS_NOTIFY 0x0e
+struct smp_cmd_keypress_notify {
+ __u8 value;
+} __packed;
+
+#define SMP_CMD_MAX 0x0e
+
+#define SMP_PASSKEY_ENTRY_FAILED 0x01
+#define SMP_OOB_NOT_AVAIL 0x02
+#define SMP_AUTH_REQUIREMENTS 0x03
+#define SMP_CONFIRM_FAILED 0x04
+#define SMP_PAIRING_NOTSUPP 0x05
+#define SMP_ENC_KEY_SIZE 0x06
+#define SMP_CMD_NOTSUPP 0x07
+#define SMP_UNSPECIFIED 0x08
+#define SMP_REPEATED_ATTEMPTS 0x09
+#define SMP_INVALID_PARAMS 0x0a
+#define SMP_DHKEY_CHECK_FAILED 0x0b
+#define SMP_NUMERIC_COMP_FAILED 0x0c
+#define SMP_BREDR_PAIRING_IN_PROGRESS 0x0d
+#define SMP_CROSS_TRANSP_NOT_ALLOWED 0x0e
+
+#define SMP_MIN_ENC_KEY_SIZE 7
+#define SMP_MAX_ENC_KEY_SIZE 16
+
+/* LTK types used in internal storage (struct smp_ltk) */
+enum {
+ SMP_STK,
+ SMP_LTK,
+ SMP_LTK_SLAVE,
+ SMP_LTK_P256,
+ SMP_LTK_P256_DEBUG,
+};
+
+static inline bool smp_ltk_is_sc(struct smp_ltk *key)
+{
+ switch (key->type) {
+ case SMP_LTK_P256:
+ case SMP_LTK_P256_DEBUG:
+ return true;
+ }
+
+ return false;
+}
+
+static inline u8 smp_ltk_sec_level(struct smp_ltk *key)
+{
+ if (key->authenticated) {
+ if (smp_ltk_is_sc(key))
+ return BT_SECURITY_FIPS;
+ else
+ return BT_SECURITY_HIGH;
+ }
+
+ return BT_SECURITY_MEDIUM;
+}
+
+/* Key preferences for smp_sufficient security */
+enum smp_key_pref {
+ SMP_ALLOW_STK,
+ SMP_USE_LTK,
+};
+
+/* SMP Commands */
+bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
+ enum smp_key_pref key_pref);
+int smp_conn_security(struct hci_conn *hcon, __u8 sec_level);
+int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey);
+
+bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
+ const bdaddr_t *bdaddr);
+int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa);
+int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]);
+
+int smp_register(struct hci_dev *hdev);
+void smp_unregister(struct hci_dev *hdev);
+
+#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
+
+int bt_selftest_smp(void);
+
+#else
+
+static inline int bt_selftest_smp(void)
+{
+ return 0;
+}
+
+#endif
+
+#endif /* __SMP_H */
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 000000000..aa0d3b2f1
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,62 @@
+#
+# 802.1d Ethernet Bridging
+#
+
+config BRIDGE
+ tristate "802.1d Ethernet Bridging"
+ select LLC
+ select STP
+ depends on IPV6 || IPV6=n
+ ---help---
+ If you say Y here, then your Linux box will be able to act as an
+ Ethernet bridge, which means that the different Ethernet segments it
+ is connected to will appear as one Ethernet to the participants.
+ Several such bridges can work together to create even larger
+ networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
+ As this is a standard, Linux bridges will cooperate properly with
+ other third party bridge products.
+
+ In order to use the Ethernet bridge, you'll need the bridge
+ configuration tools; see <file:Documentation/networking/bridge.txt>
+ for location. Please read the Bridge mini-HOWTO for more
+ information.
+
+ If you enable iptables support along with the bridge support then you
+ turn your bridge into a bridging IP firewall.
+ iptables will then see the IP packets being bridged, so you need to
+ take this into account when setting up your firewall rules.
+ Enabling arptables support when bridging will let arptables see
+ bridged ARP traffic in the arptables FORWARD chain.
+
+ To compile this code as a module, choose M here: the module
+ will be called bridge.
+
+ If unsure, say N.
+
+config BRIDGE_IGMP_SNOOPING
+ bool "IGMP/MLD snooping"
+ depends on BRIDGE
+ depends on INET
+ default y
+ ---help---
+ If you say Y here, then the Ethernet bridge will be able selectively
+ forward multicast traffic based on IGMP/MLD traffic received from
+ each port.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config BRIDGE_VLAN_FILTERING
+ bool "VLAN filtering"
+ depends on BRIDGE
+ depends on VLAN_8021Q
+ default n
+ ---help---
+ If you say Y here, then the Ethernet bridge will be able selectively
+ receive and forward traffic based on VLAN information in the packet
+ any VLAN information configured on the bridge port or bridge device.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
new file mode 100644
index 000000000..fd7ee03c5
--- /dev/null
+++ b/net/bridge/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for the IEEE 802.1d ethernet bridging layer.
+#
+
+obj-$(CONFIG_BRIDGE) += bridge.o
+
+bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
+ br_ioctl.o br_stp.o br_stp_bpdu.o \
+ br_stp_if.o br_stp_timer.o br_netlink.o
+
+bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
+
+bridge-$(subst m,y,$(CONFIG_BRIDGE_NETFILTER)) += br_nf_core.o
+
+obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
+
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
+
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o
+
+obj-$(CONFIG_NETFILTER) += netfilter/
diff --git a/net/bridge/br.c b/net/bridge/br.c
new file mode 100644
index 000000000..02c24cf63
--- /dev/null
+++ b/net/bridge/br.c
@@ -0,0 +1,274 @@
+/*
+ * Generic parts
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/llc.h>
+#include <net/llc.h>
+#include <net/stp.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+
+/*
+ * Handle changes in state of network devices enslaved to a bridge.
+ *
+ * Note: don't care about up/down if bridge itself is down, because
+ * port state is checked when bridge is brought up.
+ */
+static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ bool changed_addr;
+ int err;
+
+ /* register of bridge completed, add sysfs entries */
+ if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) {
+ br_sysfs_addbr(dev);
+ return NOTIFY_DONE;
+ }
+
+ /* not a port of a bridge */
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return NOTIFY_DONE;
+
+ br = p->br;
+
+ switch (event) {
+ case NETDEV_CHANGEMTU:
+ dev_set_mtu(br->dev, br_min_mtu(br));
+ break;
+
+ case NETDEV_CHANGEADDR:
+ spin_lock_bh(&br->lock);
+ br_fdb_changeaddr(p, dev->dev_addr);
+ changed_addr = br_stp_recalculate_bridge_id(br);
+ spin_unlock_bh(&br->lock);
+
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ break;
+
+ case NETDEV_CHANGE:
+ br_port_carrier_check(p);
+ break;
+
+ case NETDEV_FEAT_CHANGE:
+ netdev_update_features(br->dev);
+ break;
+
+ case NETDEV_DOWN:
+ spin_lock_bh(&br->lock);
+ if (br->dev->flags & IFF_UP)
+ br_stp_disable_port(p);
+ spin_unlock_bh(&br->lock);
+ break;
+
+ case NETDEV_UP:
+ if (netif_running(br->dev) && netif_oper_up(dev)) {
+ spin_lock_bh(&br->lock);
+ br_stp_enable_port(p);
+ spin_unlock_bh(&br->lock);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ br_del_if(br, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ err = br_sysfs_renameif(p);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* Forbid underlaying device to change its type. */
+ return NOTIFY_BAD;
+
+ case NETDEV_RESEND_IGMP:
+ /* Propagate to master device */
+ call_netdevice_notifiers(event, br->dev);
+ break;
+ }
+
+ /* Events that may cause spanning tree to refresh */
+ if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+ event == NETDEV_CHANGE || event == NETDEV_DOWN)
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block br_device_notifier = {
+ .notifier_call = br_device_event
+};
+
+static int br_netdev_switch_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr);
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ struct netdev_switch_notifier_fdb_info *fdb_info;
+ int err = NOTIFY_DONE;
+
+ rtnl_lock();
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ goto out;
+
+ br = p->br;
+
+ switch (event) {
+ case NETDEV_SWITCH_FDB_ADD:
+ fdb_info = ptr;
+ err = br_fdb_external_learn_add(br, p, fdb_info->addr,
+ fdb_info->vid);
+ if (err)
+ err = notifier_from_errno(err);
+ break;
+ case NETDEV_SWITCH_FDB_DEL:
+ fdb_info = ptr;
+ err = br_fdb_external_learn_del(br, p, fdb_info->addr,
+ fdb_info->vid);
+ if (err)
+ err = notifier_from_errno(err);
+ break;
+ }
+
+out:
+ rtnl_unlock();
+ return err;
+}
+
+static struct notifier_block br_netdev_switch_notifier = {
+ .notifier_call = br_netdev_switch_event,
+};
+
+static void __net_exit br_net_exit(struct net *net)
+{
+ struct net_device *dev;
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ for_each_netdev(net, dev)
+ if (dev->priv_flags & IFF_EBRIDGE)
+ br_dev_delete(dev, &list);
+
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+
+}
+
+static struct pernet_operations br_net_ops = {
+ .exit = br_net_exit,
+};
+
+static const struct stp_proto br_stp_proto = {
+ .rcv = br_stp_rcv,
+};
+
+static int __init br_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
+
+ err = stp_proto_register(&br_stp_proto);
+ if (err < 0) {
+ pr_err("bridge: can't register sap for STP\n");
+ return err;
+ }
+
+ err = br_fdb_init();
+ if (err)
+ goto err_out;
+
+ err = register_pernet_subsys(&br_net_ops);
+ if (err)
+ goto err_out1;
+
+ err = br_nf_core_init();
+ if (err)
+ goto err_out2;
+
+ err = register_netdevice_notifier(&br_device_notifier);
+ if (err)
+ goto err_out3;
+
+ err = register_netdev_switch_notifier(&br_netdev_switch_notifier);
+ if (err)
+ goto err_out4;
+
+ err = br_netlink_init();
+ if (err)
+ goto err_out5;
+
+ brioctl_set(br_ioctl_deviceless_stub);
+
+#if IS_ENABLED(CONFIG_ATM_LANE)
+ br_fdb_test_addr_hook = br_fdb_test_addr;
+#endif
+
+ pr_info("bridge: automatic filtering via arp/ip/ip6tables has been "
+ "deprecated. Update your scripts to load br_netfilter if you "
+ "need this.\n");
+
+ return 0;
+
+err_out5:
+ unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+err_out4:
+ unregister_netdevice_notifier(&br_device_notifier);
+err_out3:
+ br_nf_core_fini();
+err_out2:
+ unregister_pernet_subsys(&br_net_ops);
+err_out1:
+ br_fdb_fini();
+err_out:
+ stp_proto_unregister(&br_stp_proto);
+ return err;
+}
+
+static void __exit br_deinit(void)
+{
+ stp_proto_unregister(&br_stp_proto);
+ br_netlink_fini();
+ unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
+ unregister_netdevice_notifier(&br_device_notifier);
+ brioctl_set(NULL);
+ unregister_pernet_subsys(&br_net_ops);
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ br_nf_core_fini();
+#if IS_ENABLED(CONFIG_ATM_LANE)
+ br_fdb_test_addr_hook = NULL;
+#endif
+ br_fdb_fini();
+}
+
+module_init(br_init)
+module_exit(br_deinit)
+MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
+MODULE_ALIAS_RTNL_LINK("bridge");
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
new file mode 100644
index 000000000..4ff77a169
--- /dev/null
+++ b/net/bridge/br_device.c
@@ -0,0 +1,399 @@
+/*
+ * Device handling code
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/list.h>
+#include <linux/netfilter_bridge.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+
+#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM)
+
+const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_br_ops);
+
+/* net device transmit always called with BH disabled */
+netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ const unsigned char *dest = skb->data;
+ struct net_bridge_fdb_entry *dst;
+ struct net_bridge_mdb_entry *mdst;
+ struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
+ const struct nf_br_ops *nf_ops;
+ u16 vid = 0;
+
+ rcu_read_lock();
+ nf_ops = rcu_dereference(nf_br_ops);
+ if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
+ rcu_read_unlock();
+ return NETDEV_TX_OK;
+ }
+
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->tx_packets++;
+ brstats->tx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
+ BR_INPUT_SKB_CB(skb)->brdev = dev;
+
+ skb_reset_mac_header(skb);
+ skb_pull(skb, ETH_HLEN);
+
+ if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
+ goto out;
+
+ if (is_broadcast_ether_addr(dest))
+ br_flood_deliver(br, skb, false);
+ else if (is_multicast_ether_addr(dest)) {
+ if (unlikely(netpoll_tx_running(dev))) {
+ br_flood_deliver(br, skb, false);
+ goto out;
+ }
+ if (br_multicast_rcv(br, NULL, skb, vid)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ mdst = br_mdb_get(br, skb, vid);
+ if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
+ br_multicast_querier_exists(br, eth_hdr(skb)))
+ br_multicast_deliver(mdst, skb);
+ else
+ br_flood_deliver(br, skb, false);
+ } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
+ br_deliver(dst->dst, skb);
+ else
+ br_flood_deliver(br, skb, true);
+
+out:
+ rcu_read_unlock();
+ return NETDEV_TX_OK;
+}
+
+static int br_dev_init(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ int err;
+
+ br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!br->stats)
+ return -ENOMEM;
+
+ err = br_vlan_init(br);
+ if (err)
+ free_percpu(br->stats);
+
+ return err;
+}
+
+static int br_dev_open(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ netdev_update_features(dev);
+ netif_start_queue(dev);
+ br_stp_enable_bridge(br);
+ br_multicast_open(br);
+
+ return 0;
+}
+
+static void br_dev_set_multicast_list(struct net_device *dev)
+{
+}
+
+static void br_dev_change_rx_flags(struct net_device *dev, int change)
+{
+ if (change & IFF_PROMISC)
+ br_manage_promisc(netdev_priv(dev));
+}
+
+static int br_dev_stop(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ br_stp_disable_bridge(br);
+ br_multicast_stop(br);
+
+ netif_stop_queue(dev);
+
+ return 0;
+}
+
+static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct pcpu_sw_netstats tmp, sum = { 0 };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ unsigned int start;
+ const struct pcpu_sw_netstats *bstats
+ = per_cpu_ptr(br->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&bstats->syncp);
+ memcpy(&tmp, bstats, sizeof(tmp));
+ } while (u64_stats_fetch_retry_irq(&bstats->syncp, start));
+ sum.tx_bytes += tmp.tx_bytes;
+ sum.tx_packets += tmp.tx_packets;
+ sum.rx_bytes += tmp.rx_bytes;
+ sum.rx_packets += tmp.rx_packets;
+ }
+
+ stats->tx_bytes = sum.tx_bytes;
+ stats->tx_packets = sum.tx_packets;
+ stats->rx_bytes = sum.rx_bytes;
+ stats->rx_packets = sum.rx_packets;
+
+ return stats;
+}
+
+static int br_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ if (new_mtu < 68 || new_mtu > br_min_mtu(br))
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* remember the MTU in the rtable for PMTU */
+ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
+#endif
+
+ return 0;
+}
+
+/* Allow setting mac address to any valid ethernet address. */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct sockaddr *addr = p;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ spin_lock_bh(&br->lock);
+ if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
+ /* Mac address will be changed in br_stp_change_bridge_id(). */
+ br_stp_change_bridge_id(br, addr->sa_data);
+ }
+ spin_unlock_bh(&br->lock);
+
+ return 0;
+}
+
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, "bridge", sizeof(info->driver));
+ strlcpy(info->version, BR_VERSION, sizeof(info->version));
+ strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strlcpy(info->bus_info, "N/A", sizeof(info->bus_info));
+}
+
+static netdev_features_t br_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_features_recompute(br, features);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void br_poll_controller(struct net_device *br_dev)
+{
+}
+
+static void br_netpoll_cleanup(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list)
+ br_netpoll_disable(p);
+}
+
+static int __br_netpoll_enable(struct net_bridge_port *p)
+{
+ struct netpoll *np;
+ int err;
+
+ np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+ if (!np)
+ return -ENOMEM;
+
+ err = __netpoll_setup(np, p->dev);
+ if (err) {
+ kfree(np);
+ return err;
+ }
+
+ p->np = np;
+ return err;
+}
+
+int br_netpoll_enable(struct net_bridge_port *p)
+{
+ if (!p->br->dev->npinfo)
+ return 0;
+
+ return __br_netpoll_enable(p);
+}
+
+static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+ int err = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!p->dev)
+ continue;
+ err = __br_netpoll_enable(p);
+ if (err)
+ goto fail;
+ }
+
+out:
+ return err;
+
+fail:
+ br_netpoll_cleanup(dev);
+ goto out;
+}
+
+void br_netpoll_disable(struct net_bridge_port *p)
+{
+ struct netpoll *np = p->np;
+
+ if (!np)
+ return;
+
+ p->np = NULL;
+
+ __netpoll_free_async(np);
+}
+
+#endif
+
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev)
+
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_add_if(br, slave_dev);
+}
+
+static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_del_if(br, slave_dev);
+}
+
+static const struct ethtool_ops br_ethtool_ops = {
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+static const struct net_device_ops br_netdev_ops = {
+ .ndo_open = br_dev_open,
+ .ndo_stop = br_dev_stop,
+ .ndo_init = br_dev_init,
+ .ndo_start_xmit = br_dev_xmit,
+ .ndo_get_stats64 = br_get_stats64,
+ .ndo_set_mac_address = br_set_mac_address,
+ .ndo_set_rx_mode = br_dev_set_multicast_list,
+ .ndo_change_rx_flags = br_dev_change_rx_flags,
+ .ndo_change_mtu = br_change_mtu,
+ .ndo_do_ioctl = br_dev_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = br_netpoll_setup,
+ .ndo_netpoll_cleanup = br_netpoll_cleanup,
+ .ndo_poll_controller = br_poll_controller,
+#endif
+ .ndo_add_slave = br_add_slave,
+ .ndo_del_slave = br_del_slave,
+ .ndo_fix_features = br_fix_features,
+ .ndo_fdb_add = br_fdb_add,
+ .ndo_fdb_del = br_fdb_delete,
+ .ndo_fdb_dump = br_fdb_dump,
+ .ndo_bridge_getlink = br_getlink,
+ .ndo_bridge_setlink = br_setlink,
+ .ndo_bridge_dellink = br_dellink,
+};
+
+static void br_dev_free(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ free_percpu(br->stats);
+ free_netdev(dev);
+}
+
+static struct device_type br_type = {
+ .name = "bridge",
+};
+
+void br_dev_setup(struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ eth_hw_addr_random(dev);
+ ether_setup(dev);
+
+ dev->netdev_ops = &br_netdev_ops;
+ dev->destructor = br_dev_free;
+ dev->ethtool_ops = &br_ethtool_ops;
+ SET_NETDEV_DEVTYPE(dev, &br_type);
+ dev->tx_queue_len = 0;
+ dev->priv_flags = IFF_EBRIDGE;
+
+ dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+ dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ dev->vlan_features = COMMON_FEATURES;
+
+ br->dev = dev;
+ spin_lock_init(&br->lock);
+ INIT_LIST_HEAD(&br->port_list);
+ spin_lock_init(&br->hash_lock);
+
+ br->bridge_id.prio[0] = 0x80;
+ br->bridge_id.prio[1] = 0x00;
+
+ ether_addr_copy(br->group_addr, eth_reserved_addr_base);
+
+ br->stp_enabled = BR_NO_STP;
+ br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
+ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
+
+ br->designated_root = br->bridge_id;
+ br->bridge_max_age = br->max_age = 20 * HZ;
+ br->bridge_hello_time = br->hello_time = 2 * HZ;
+ br->bridge_forward_delay = br->forward_delay = 15 * HZ;
+ br->ageing_time = 300 * HZ;
+
+ br_netfilter_rtable_init(br);
+ br_stp_timer_init(br);
+ br_multicast_init(br);
+}
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
new file mode 100644
index 000000000..659fb9667
--- /dev/null
+++ b/net/bridge/br_fdb.c
@@ -0,0 +1,1050 @@
+/*
+ * Forwarding database
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/times.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+#include <asm/unaligned.h>
+#include <linux/if_vlan.h>
+#include "br_private.h"
+
+static struct kmem_cache *br_fdb_cache __read_mostly;
+static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
+ const unsigned char *addr,
+ __u16 vid);
+static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid);
+static void fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *, int);
+
+static u32 fdb_salt __read_mostly;
+
+int __init br_fdb_init(void)
+{
+ br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
+ sizeof(struct net_bridge_fdb_entry),
+ 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!br_fdb_cache)
+ return -ENOMEM;
+
+ get_random_bytes(&fdb_salt, sizeof(fdb_salt));
+ return 0;
+}
+
+void br_fdb_fini(void)
+{
+ kmem_cache_destroy(br_fdb_cache);
+}
+
+
+/* if topology_changing then use forward_delay (default 15 sec)
+ * otherwise keep longer (default 5 minutes)
+ */
+static inline unsigned long hold_time(const struct net_bridge *br)
+{
+ return br->topology_change ? br->forward_delay : br->ageing_time;
+}
+
+static inline int has_expired(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb)
+{
+ return !fdb->is_static &&
+ time_before_eq(fdb->updated + hold_time(br), jiffies);
+}
+
+static inline int br_mac_hash(const unsigned char *mac, __u16 vid)
+{
+ /* use 1 byte of OUI and 3 bytes of NIC */
+ u32 key = get_unaligned((u32 *)(mac + 2));
+ return jhash_2words(key, vid, fdb_salt) & (BR_HASH_SIZE - 1);
+}
+
+static void fdb_rcu_free(struct rcu_head *head)
+{
+ struct net_bridge_fdb_entry *ent
+ = container_of(head, struct net_bridge_fdb_entry, rcu);
+ kmem_cache_free(br_fdb_cache, ent);
+}
+
+/* When a static FDB entry is added, the mac address from the entry is
+ * added to the bridge private HW address list and all required ports
+ * are then updated with the new information.
+ * Called under RTNL.
+ */
+static void fdb_add_hw_addr(struct net_bridge *br, const unsigned char *addr)
+{
+ int err;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!br_promisc_port(p)) {
+ err = dev_uc_add(p->dev, addr);
+ if (err)
+ goto undo;
+ }
+ }
+
+ return;
+undo:
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ if (!br_promisc_port(p))
+ dev_uc_del(p->dev, addr);
+ }
+}
+
+/* When a static FDB entry is deleted, the HW address from that entry is
+ * also removed from the bridge private HW address list and updates all
+ * the ports with needed information.
+ * Called under RTNL.
+ */
+static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!br_promisc_port(p))
+ dev_uc_del(p->dev, addr);
+ }
+}
+
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
+{
+ if (f->is_static)
+ fdb_del_hw_addr(br, f->addr.addr);
+
+ hlist_del_rcu(&f->hlist);
+ fdb_notify(br, f, RTM_DELNEIGH);
+ call_rcu(&f->rcu, fdb_rcu_free);
+}
+
+/* Delete a local entry if no other port had the same address. */
+static void fdb_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_fdb_entry *f)
+{
+ const unsigned char *addr = f->addr.addr;
+ u16 vid = f->vlan_id;
+ struct net_bridge_port *op;
+
+ /* Maybe another port has same hw addr? */
+ list_for_each_entry(op, &br->port_list, list) {
+ if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
+ (!vid || nbp_vlan_find(op, vid))) {
+ f->dst = op;
+ f->added_by_user = 0;
+ return;
+ }
+ }
+
+ /* Maybe bridge device has same hw addr? */
+ if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
+ (!vid || br_vlan_find(br, vid))) {
+ f->dst = NULL;
+ f->added_by_user = 0;
+ return;
+ }
+
+ fdb_delete(br, f);
+}
+
+void br_fdb_find_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *f;
+
+ spin_lock_bh(&br->hash_lock);
+ f = fdb_find(head, addr, vid);
+ if (f && f->is_local && !f->added_by_user && f->dst == p)
+ fdb_delete_local(br, p, f);
+ spin_unlock_bh(&br->hash_lock);
+}
+
+void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+{
+ struct net_bridge *br = p->br;
+ struct net_port_vlans *pv = nbp_get_vlan_info(p);
+ bool no_vlan = !pv;
+ int i;
+ u16 vid;
+
+ spin_lock_bh(&br->hash_lock);
+
+ /* Search all chains since old address/hash is unknown */
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ struct hlist_node *h;
+ hlist_for_each(h, &br->hash[i]) {
+ struct net_bridge_fdb_entry *f;
+
+ f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
+ if (f->dst == p && f->is_local && !f->added_by_user) {
+ /* delete old one */
+ fdb_delete_local(br, p, f);
+
+ /* if this port has no vlan information
+ * configured, we can safely be done at
+ * this point.
+ */
+ if (no_vlan)
+ goto insert;
+ }
+ }
+ }
+
+insert:
+ /* insert new address, may fail if invalid address or dup. */
+ fdb_insert(br, p, newaddr, 0);
+
+ if (no_vlan)
+ goto done;
+
+ /* Now add entries for every VLAN configured on the port.
+ * This function runs under RTNL so the bitmap will not change
+ * from under us.
+ */
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
+ fdb_insert(br, p, newaddr, vid);
+
+done:
+ spin_unlock_bh(&br->hash_lock);
+}
+
+void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_port_vlans *pv;
+ u16 vid = 0;
+
+ spin_lock_bh(&br->hash_lock);
+
+ /* If old entry was unassociated with any port, then delete it. */
+ f = __br_fdb_get(br, br->dev->dev_addr, 0);
+ if (f && f->is_local && !f->dst)
+ fdb_delete_local(br, NULL, f);
+
+ fdb_insert(br, NULL, newaddr, 0);
+
+ /* Now remove and add entries for every VLAN configured on the
+ * bridge. This function runs under RTNL so the bitmap will not
+ * change from under us.
+ */
+ pv = br_get_vlan_info(br);
+ if (!pv)
+ goto out;
+
+ for_each_set_bit_from(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ f = __br_fdb_get(br, br->dev->dev_addr, vid);
+ if (f && f->is_local && !f->dst)
+ fdb_delete_local(br, NULL, f);
+ fdb_insert(br, NULL, newaddr, vid);
+ }
+out:
+ spin_unlock_bh(&br->hash_lock);
+}
+
+void br_fdb_cleanup(unsigned long _data)
+{
+ struct net_bridge *br = (struct net_bridge *)_data;
+ unsigned long delay = hold_time(br);
+ unsigned long next_timer = jiffies + br->ageing_time;
+ int i;
+
+ spin_lock(&br->hash_lock);
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ struct net_bridge_fdb_entry *f;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
+ unsigned long this_timer;
+ if (f->is_static)
+ continue;
+ this_timer = f->updated + delay;
+ if (time_before_eq(this_timer, jiffies))
+ fdb_delete(br, f);
+ else if (time_before(this_timer, next_timer))
+ next_timer = this_timer;
+ }
+ }
+ spin_unlock(&br->hash_lock);
+
+ mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+}
+
+/* Completely flush all dynamic entries in forwarding database.*/
+void br_fdb_flush(struct net_bridge *br)
+{
+ int i;
+
+ spin_lock_bh(&br->hash_lock);
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ struct net_bridge_fdb_entry *f;
+ struct hlist_node *n;
+ hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
+ if (!f->is_static)
+ fdb_delete(br, f);
+ }
+ }
+ spin_unlock_bh(&br->hash_lock);
+}
+
+/* Flush all entries referring to a specific port.
+ * if do_all is set also flush static entries
+ */
+void br_fdb_delete_by_port(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ int do_all)
+{
+ int i;
+
+ spin_lock_bh(&br->hash_lock);
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ struct hlist_node *h, *g;
+
+ hlist_for_each_safe(h, g, &br->hash[i]) {
+ struct net_bridge_fdb_entry *f
+ = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
+ if (f->dst != p)
+ continue;
+
+ if (f->is_static && !do_all)
+ continue;
+
+ if (f->is_local)
+ fdb_delete_local(br, p, f);
+ else
+ fdb_delete(br, f);
+ }
+ }
+ spin_unlock_bh(&br->hash_lock);
+}
+
+/* No locking or refcounting, assumes caller has rcu_read_lock */
+struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ hlist_for_each_entry_rcu(fdb,
+ &br->hash[br_mac_hash(addr, vid)], hlist) {
+ if (ether_addr_equal(fdb->addr.addr, addr) &&
+ fdb->vlan_id == vid) {
+ if (unlikely(has_expired(br, fdb)))
+ break;
+ return fdb;
+ }
+ }
+
+ return NULL;
+}
+
+#if IS_ENABLED(CONFIG_ATM_LANE)
+/* Interface used by ATM LANE hook to test
+ * if an addr is on some other bridge port */
+int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
+{
+ struct net_bridge_fdb_entry *fdb;
+ struct net_bridge_port *port;
+ int ret;
+
+ rcu_read_lock();
+ port = br_port_get_rcu(dev);
+ if (!port)
+ ret = 0;
+ else {
+ fdb = __br_fdb_get(port->br, addr, 0);
+ ret = fdb && fdb->dst && fdb->dst->dev != dev &&
+ fdb->dst->state == BR_STATE_FORWARDING;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+#endif /* CONFIG_ATM_LANE */
+
+/*
+ * Fill buffer with forwarding table records in
+ * the API format.
+ */
+int br_fdb_fillbuf(struct net_bridge *br, void *buf,
+ unsigned long maxnum, unsigned long skip)
+{
+ struct __fdb_entry *fe = buf;
+ int i, num = 0;
+ struct net_bridge_fdb_entry *f;
+
+ memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
+
+ rcu_read_lock();
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+ if (num >= maxnum)
+ goto out;
+
+ if (has_expired(br, f))
+ continue;
+
+ /* ignore pseudo entry for local MAC address */
+ if (!f->dst)
+ continue;
+
+ if (skip) {
+ --skip;
+ continue;
+ }
+
+ /* convert from internal format to API */
+ memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
+
+ /* due to ABI compat need to split into hi/lo */
+ fe->port_no = f->dst->port_no;
+ fe->port_hi = f->dst->port_no >> 8;
+
+ fe->is_local = f->is_local;
+ if (!f->is_static)
+ fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
+ ++fe;
+ ++num;
+ }
+ }
+
+ out:
+ rcu_read_unlock();
+
+ return num;
+}
+
+static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ hlist_for_each_entry(fdb, head, hlist) {
+ if (ether_addr_equal(fdb->addr.addr, addr) &&
+ fdb->vlan_id == vid)
+ return fdb;
+ }
+ return NULL;
+}
+
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ hlist_for_each_entry_rcu(fdb, head, hlist) {
+ if (ether_addr_equal(fdb->addr.addr, addr) &&
+ fdb->vlan_id == vid)
+ return fdb;
+ }
+ return NULL;
+}
+
+static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
+ struct net_bridge_port *source,
+ const unsigned char *addr,
+ __u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
+ if (fdb) {
+ memcpy(fdb->addr.addr, addr, ETH_ALEN);
+ fdb->dst = source;
+ fdb->vlan_id = vid;
+ fdb->is_local = 0;
+ fdb->is_static = 0;
+ fdb->added_by_user = 0;
+ fdb->added_by_external_learn = 0;
+ fdb->updated = fdb->used = jiffies;
+ hlist_add_head_rcu(&fdb->hlist, head);
+ }
+ return fdb;
+}
+
+static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *fdb;
+
+ if (!is_valid_ether_addr(addr))
+ return -EINVAL;
+
+ fdb = fdb_find(head, addr, vid);
+ if (fdb) {
+ /* it is okay to have multiple ports with same
+ * address, just use the first one.
+ */
+ if (fdb->is_local)
+ return 0;
+ br_warn(br, "adding interface %s with same address "
+ "as a received packet\n",
+ source ? source->dev->name : br->dev->name);
+ fdb_delete(br, fdb);
+ }
+
+ fdb = fdb_create(head, source, addr, vid);
+ if (!fdb)
+ return -ENOMEM;
+
+ fdb->is_local = fdb->is_static = 1;
+ fdb_add_hw_addr(br, addr);
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ return 0;
+}
+
+/* Add entry for local address of interface */
+int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
+{
+ int ret;
+
+ spin_lock_bh(&br->hash_lock);
+ ret = fdb_insert(br, source, addr, vid);
+ spin_unlock_bh(&br->hash_lock);
+ return ret;
+}
+
+void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid, bool added_by_user)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *fdb;
+ bool fdb_modified = false;
+
+ /* some users want to always flood. */
+ if (hold_time(br) == 0)
+ return;
+
+ /* ignore packets unless we are using this port */
+ if (!(source->state == BR_STATE_LEARNING ||
+ source->state == BR_STATE_FORWARDING))
+ return;
+
+ fdb = fdb_find_rcu(head, addr, vid);
+ if (likely(fdb)) {
+ /* attempt to update an entry for a local interface */
+ if (unlikely(fdb->is_local)) {
+ if (net_ratelimit())
+ br_warn(br, "received packet on %s with "
+ "own address as source address\n",
+ source->dev->name);
+ } else {
+ /* fastpath: update of existing entry */
+ if (unlikely(source != fdb->dst)) {
+ fdb->dst = source;
+ fdb_modified = true;
+ }
+ fdb->updated = jiffies;
+ if (unlikely(added_by_user))
+ fdb->added_by_user = 1;
+ if (unlikely(fdb_modified))
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ }
+ } else {
+ spin_lock(&br->hash_lock);
+ if (likely(!fdb_find(head, addr, vid))) {
+ fdb = fdb_create(head, source, addr, vid);
+ if (fdb) {
+ if (unlikely(added_by_user))
+ fdb->added_by_user = 1;
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ }
+ }
+ /* else we lose race and someone else inserts
+ * it first, don't bother updating
+ */
+ spin_unlock(&br->hash_lock);
+ }
+}
+
+static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb)
+{
+ if (fdb->is_local)
+ return NUD_PERMANENT;
+ else if (fdb->is_static)
+ return NUD_NOARP;
+ else if (has_expired(fdb->dst->br, fdb))
+ return NUD_STALE;
+ else
+ return NUD_REACHABLE;
+}
+
+static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb,
+ u32 portid, u32 seq, int type, unsigned int flags)
+{
+ unsigned long now = jiffies;
+ struct nda_cacheinfo ci;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
+ ndm->ndm_state = fdb_to_nud(fdb);
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
+ goto nla_put_failure;
+ ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
+ ci.ndm_confirmed = 0;
+ ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
+ ci.ndm_refcnt = 0;
+ if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ if (fdb->vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->vlan_id))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(u32)) /* NDA_MASTER */
+ + nla_total_size(sizeof(u16)) /* NDA_VLAN */
+ + nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type)
+{
+ struct net *net = dev_net(br->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Dump information about entries, in response to GETNEIGH */
+int br_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev,
+ int idx)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ int i;
+
+ if (!(dev->priv_flags & IFF_EBRIDGE))
+ goto out;
+
+ if (!filter_dev)
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ struct net_bridge_fdb_entry *f;
+
+ hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+ if (idx < cb->args[0])
+ goto skip;
+
+ if (filter_dev &&
+ (!f->dst || f->dst->dev != filter_dev)) {
+ if (filter_dev != dev)
+ goto skip;
+ /* !f->dst is a special case for bridge
+ * It means the MAC belongs to the bridge
+ * Therefore need a little more filtering
+ * we only want to dump the !f->dst case
+ */
+ if (f->dst)
+ goto skip;
+ }
+ if (!filter_dev && f->dst)
+ goto skip;
+
+ if (fdb_fill_info(skb, br, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI) < 0)
+ break;
+skip:
+ ++idx;
+ }
+ }
+
+out:
+ return idx;
+}
+
+/* Update (create or replace) forwarding database entry */
+static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
+ __u16 state, __u16 flags, __u16 vid)
+{
+ struct net_bridge *br = source->br;
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
+ struct net_bridge_fdb_entry *fdb;
+ bool modified = false;
+
+ fdb = fdb_find(head, addr, vid);
+ if (fdb == NULL) {
+ if (!(flags & NLM_F_CREATE))
+ return -ENOENT;
+
+ fdb = fdb_create(head, source, addr, vid);
+ if (!fdb)
+ return -ENOMEM;
+
+ modified = true;
+ } else {
+ if (flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (fdb->dst != source) {
+ fdb->dst = source;
+ modified = true;
+ }
+ }
+
+ if (fdb_to_nud(fdb) != state) {
+ if (state & NUD_PERMANENT) {
+ fdb->is_local = 1;
+ if (!fdb->is_static) {
+ fdb->is_static = 1;
+ fdb_add_hw_addr(br, addr);
+ }
+ } else if (state & NUD_NOARP) {
+ fdb->is_local = 0;
+ if (!fdb->is_static) {
+ fdb->is_static = 1;
+ fdb_add_hw_addr(br, addr);
+ }
+ } else {
+ fdb->is_local = 0;
+ if (fdb->is_static) {
+ fdb->is_static = 0;
+ fdb_del_hw_addr(br, addr);
+ }
+ }
+
+ modified = true;
+ }
+ fdb->added_by_user = 1;
+
+ fdb->used = jiffies;
+ if (modified) {
+ fdb->updated = jiffies;
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ }
+
+ return 0;
+}
+
+static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
+ const unsigned char *addr, u16 nlh_flags, u16 vid)
+{
+ int err = 0;
+
+ if (ndm->ndm_flags & NTF_USE) {
+ local_bh_disable();
+ rcu_read_lock();
+ br_fdb_update(p->br, p, addr, vid, true);
+ rcu_read_unlock();
+ local_bh_enable();
+ } else {
+ spin_lock_bh(&p->br->hash_lock);
+ err = fdb_add_entry(p, addr, ndm->ndm_state,
+ nlh_flags, vid);
+ spin_unlock_bh(&p->br->hash_lock);
+ }
+
+ return err;
+}
+
+/* Add new permanent fdb entry with RTM_NEWNEIGH */
+int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid, u16 nlh_flags)
+{
+ struct net_bridge_port *p;
+ int err = 0;
+ struct net_port_vlans *pv;
+
+ if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
+ pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+ return -EINVAL;
+ }
+
+ if (is_zero_ether_addr(addr)) {
+ pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
+ return -EINVAL;
+ }
+
+ p = br_port_get_rtnl(dev);
+ if (p == NULL) {
+ pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+
+ pv = nbp_get_vlan_info(p);
+ if (vid) {
+ if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
+ pr_info("bridge: RTM_NEWNEIGH with unconfigured "
+ "vlan %d on port %s\n", vid, dev->name);
+ return -EINVAL;
+ }
+
+ /* VID was specified, so use it. */
+ err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ } else {
+ err = __br_fdb_add(ndm, p, addr, nlh_flags, 0);
+ if (err || !pv)
+ goto out;
+
+ /* We have vlans configured on this port and user didn't
+ * specify a VLAN. To be nice, add/update entry for every
+ * vlan on this port.
+ */
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ err = __br_fdb_add(ndm, p, addr, nlh_flags, vid);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ return err;
+}
+
+static int fdb_delete_by_addr(struct net_bridge *br, const u8 *addr, u16 vlan)
+{
+ struct hlist_head *head = &br->hash[br_mac_hash(addr, vlan)];
+ struct net_bridge_fdb_entry *fdb;
+
+ fdb = fdb_find(head, addr, vlan);
+ if (!fdb)
+ return -ENOENT;
+
+ fdb_delete(br, fdb);
+ return 0;
+}
+
+static int __br_fdb_delete(struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid)
+{
+ int err;
+
+ spin_lock_bh(&p->br->hash_lock);
+ err = fdb_delete_by_addr(p->br, addr, vid);
+ spin_unlock_bh(&p->br->hash_lock);
+
+ return err;
+}
+
+/* Remove neighbor entry with RTM_DELNEIGH */
+int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_bridge_port *p;
+ int err;
+ struct net_port_vlans *pv;
+
+ p = br_port_get_rtnl(dev);
+ if (p == NULL) {
+ pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
+ dev->name);
+ return -EINVAL;
+ }
+
+ pv = nbp_get_vlan_info(p);
+ if (vid) {
+ if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
+ pr_info("bridge: RTM_DELNEIGH with unconfigured "
+ "vlan %d on port %s\n", vid, dev->name);
+ return -EINVAL;
+ }
+
+ err = __br_fdb_delete(p, addr, vid);
+ } else {
+ err = -ENOENT;
+ err &= __br_fdb_delete(p, addr, 0);
+ if (!pv)
+ goto out;
+
+ /* We have vlans configured on this port and user didn't
+ * specify a VLAN. To be nice, add/update entry for every
+ * vlan on this port.
+ */
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ err &= __br_fdb_delete(p, addr, vid);
+ }
+ }
+out:
+ return err;
+}
+
+int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct net_bridge_fdb_entry *fdb, *tmp;
+ int i;
+ int err;
+
+ ASSERT_RTNL();
+
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ hlist_for_each_entry(fdb, &br->hash[i], hlist) {
+ /* We only care for static entries */
+ if (!fdb->is_static)
+ continue;
+
+ err = dev_uc_add(p->dev, fdb->addr.addr);
+ if (err)
+ goto rollback;
+ }
+ }
+ return 0;
+
+rollback:
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ hlist_for_each_entry(tmp, &br->hash[i], hlist) {
+ /* If we reached the fdb that failed, we can stop */
+ if (tmp == fdb)
+ break;
+
+ /* We only care for static entries */
+ if (!tmp->is_static)
+ continue;
+
+ dev_uc_del(p->dev, tmp->addr.addr);
+ }
+ }
+ return err;
+}
+
+void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct net_bridge_fdb_entry *fdb;
+ int i;
+
+ ASSERT_RTNL();
+
+ for (i = 0; i < BR_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
+ /* We only care for static entries */
+ if (!fdb->is_static)
+ continue;
+
+ dev_uc_del(p->dev, fdb->addr.addr);
+ }
+ }
+}
+
+int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid)
+{
+ struct hlist_head *head;
+ struct net_bridge_fdb_entry *fdb;
+ int err = 0;
+
+ ASSERT_RTNL();
+ spin_lock_bh(&br->hash_lock);
+
+ head = &br->hash[br_mac_hash(addr, vid)];
+ fdb = fdb_find(head, addr, vid);
+ if (!fdb) {
+ fdb = fdb_create(head, p, addr, vid);
+ if (!fdb) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+ fdb->added_by_external_learn = 1;
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ } else if (fdb->added_by_external_learn) {
+ /* Refresh entry */
+ fdb->updated = fdb->used = jiffies;
+ } else if (!fdb->added_by_user) {
+ /* Take over SW learned entry */
+ fdb->added_by_external_learn = 1;
+ fdb->updated = jiffies;
+ fdb_notify(br, fdb, RTM_NEWNEIGH);
+ }
+
+err_unlock:
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
+
+int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid)
+{
+ struct hlist_head *head;
+ struct net_bridge_fdb_entry *fdb;
+ int err = 0;
+
+ ASSERT_RTNL();
+ spin_lock_bh(&br->hash_lock);
+
+ head = &br->hash[br_mac_hash(addr, vid)];
+ fdb = fdb_find(head, addr, vid);
+ if (fdb && fdb->added_by_external_learn)
+ fdb_delete(br, fdb);
+ else
+ err = -ENOENT;
+
+ spin_unlock_bh(&br->hash_lock);
+
+ return err;
+}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
new file mode 100644
index 000000000..e97572b5d
--- /dev/null
+++ b/net/bridge/br_forward.c
@@ -0,0 +1,292 @@
+/*
+ * Forwarding decision
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/netfilter_bridge.h>
+#include "br_private.h"
+
+static int deliver_clone(const struct net_bridge_port *prev,
+ struct sk_buff *skb,
+ void (*__packet_hook)(const struct net_bridge_port *p,
+ struct sk_buff *skb));
+
+/* Don't forward packets to originating port or forwarding disabled */
+static inline int should_deliver(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
+{
+ return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
+ br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) &&
+ p->state == BR_STATE_FORWARDING;
+}
+
+int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ if (!is_skb_forwardable(skb->dev, skb)) {
+ kfree_skb(skb);
+ } else {
+ skb_push(skb, ETH_HLEN);
+ br_drop_fake_rtable(skb);
+ dev_queue_xmit(skb);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
+
+int br_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, sk, skb,
+ NULL, skb->dev,
+ br_dev_queue_push_xmit);
+
+}
+EXPORT_SYMBOL_GPL(br_forward_finish);
+
+static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+ skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
+ if (!skb)
+ return;
+
+ skb->dev = to->dev;
+
+ if (unlikely(netpoll_tx_running(to->br->dev))) {
+ if (!is_skb_forwardable(skb->dev, skb))
+ kfree_skb(skb);
+ else {
+ skb_push(skb, ETH_HLEN);
+ br_netpoll_send_skb(to, skb);
+ }
+ return;
+ }
+
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
+ NULL, skb->dev,
+ br_forward_finish);
+}
+
+static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+ struct net_device *indev;
+
+ if (skb_warn_if_lro(skb)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
+ if (!skb)
+ return;
+
+ indev = skb->dev;
+ skb->dev = to->dev;
+ skb_forward_csum(skb);
+
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, NULL, skb,
+ indev, skb->dev,
+ br_forward_finish);
+}
+
+/* called with rcu_read_lock */
+void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+ if (to && should_deliver(to, skb)) {
+ __br_deliver(to, skb);
+ return;
+ }
+
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(br_deliver);
+
+/* called with rcu_read_lock */
+void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
+{
+ if (should_deliver(to, skb)) {
+ if (skb0)
+ deliver_clone(to, skb, __br_forward);
+ else
+ __br_forward(to, skb);
+ return;
+ }
+
+ if (!skb0)
+ kfree_skb(skb);
+}
+
+static int deliver_clone(const struct net_bridge_port *prev,
+ struct sk_buff *skb,
+ void (*__packet_hook)(const struct net_bridge_port *p,
+ struct sk_buff *skb))
+{
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb) {
+ dev->stats.tx_dropped++;
+ return -ENOMEM;
+ }
+
+ __packet_hook(prev, skb);
+ return 0;
+}
+
+static struct net_bridge_port *maybe_deliver(
+ struct net_bridge_port *prev, struct net_bridge_port *p,
+ struct sk_buff *skb,
+ void (*__packet_hook)(const struct net_bridge_port *p,
+ struct sk_buff *skb))
+{
+ int err;
+
+ if (!should_deliver(p, skb))
+ return prev;
+
+ if (!prev)
+ goto out;
+
+ err = deliver_clone(prev, skb, __packet_hook);
+ if (err)
+ return ERR_PTR(err);
+
+out:
+ return p;
+}
+
+/* called under bridge lock */
+static void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ struct sk_buff *skb0,
+ void (*__packet_hook)(const struct net_bridge_port *p,
+ struct sk_buff *skb),
+ bool unicast)
+{
+ struct net_bridge_port *p;
+ struct net_bridge_port *prev;
+
+ prev = NULL;
+
+ list_for_each_entry_rcu(p, &br->port_list, list) {
+ /* Do not flood unicast traffic to ports that turn it off */
+ if (unicast && !(p->flags & BR_FLOOD))
+ continue;
+
+ /* Do not flood to ports that enable proxy ARP */
+ if (p->flags & BR_PROXYARP)
+ continue;
+ if ((p->flags & BR_PROXYARP_WIFI) &&
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied)
+ continue;
+
+ prev = maybe_deliver(prev, p, skb, __packet_hook);
+ if (IS_ERR(prev))
+ goto out;
+ }
+
+ if (!prev)
+ goto out;
+
+ if (skb0)
+ deliver_clone(prev, skb, __packet_hook);
+ else
+ __packet_hook(prev, skb);
+ return;
+
+out:
+ if (!skb0)
+ kfree_skb(skb);
+}
+
+
+/* called with rcu_read_lock */
+void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast)
+{
+ br_flood(br, skb, NULL, __br_deliver, unicast);
+}
+
+/* called under bridge lock */
+void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
+ struct sk_buff *skb2, bool unicast)
+{
+ br_flood(br, skb, skb2, __br_forward, unicast);
+}
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+/* called with rcu_read_lock */
+static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb, struct sk_buff *skb0,
+ void (*__packet_hook)(
+ const struct net_bridge_port *p,
+ struct sk_buff *skb))
+{
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *prev = NULL;
+ struct net_bridge_port_group *p;
+ struct hlist_node *rp;
+
+ rp = rcu_dereference(hlist_first_rcu(&br->router_list));
+ p = mdst ? rcu_dereference(mdst->ports) : NULL;
+ while (p || rp) {
+ struct net_bridge_port *port, *lport, *rport;
+
+ lport = p ? p->port : NULL;
+ rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
+ NULL;
+
+ port = (unsigned long)lport > (unsigned long)rport ?
+ lport : rport;
+
+ prev = maybe_deliver(prev, port, skb, __packet_hook);
+ if (IS_ERR(prev))
+ goto out;
+
+ if ((unsigned long)lport >= (unsigned long)port)
+ p = rcu_dereference(p->next);
+ if ((unsigned long)rport >= (unsigned long)port)
+ rp = rcu_dereference(hlist_next_rcu(rp));
+ }
+
+ if (!prev)
+ goto out;
+
+ if (skb0)
+ deliver_clone(prev, skb, __packet_hook);
+ else
+ __packet_hook(prev, skb);
+ return;
+
+out:
+ if (!skb0)
+ kfree_skb(skb);
+}
+
+/* called with rcu_read_lock */
+void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb)
+{
+ br_multicast_flood(mdst, skb, NULL, __br_deliver);
+}
+
+/* called with rcu_read_lock */
+void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb, struct sk_buff *skb2)
+{
+ br_multicast_flood(mdst, skb, skb2, __br_forward);
+}
+#endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
new file mode 100644
index 000000000..1849d96b3
--- /dev/null
+++ b/net/bridge/br_if.c
@@ -0,0 +1,586 @@
+/*
+ * Userspace interface
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/netpoll.h>
+#include <linux/ethtool.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/if_vlan.h>
+
+#include "br_private.h"
+
+/*
+ * Determine initial path cost based on speed.
+ * using recommendations from 802.1d standard
+ *
+ * Since driver might sleep need to not be holding any locks.
+ */
+static int port_cost(struct net_device *dev)
+{
+ struct ethtool_cmd ecmd;
+
+ if (!__ethtool_get_settings(dev, &ecmd)) {
+ switch (ethtool_cmd_speed(&ecmd)) {
+ case SPEED_10000:
+ return 2;
+ case SPEED_1000:
+ return 4;
+ case SPEED_100:
+ return 19;
+ case SPEED_10:
+ return 100;
+ }
+ }
+
+ /* Old silly heuristics based on name */
+ if (!strncmp(dev->name, "lec", 3))
+ return 7;
+
+ if (!strncmp(dev->name, "plip", 4))
+ return 2500;
+
+ return 100; /* assume old 10Mbps */
+}
+
+
+/* Check for port carrier transitions. */
+void br_port_carrier_check(struct net_bridge_port *p)
+{
+ struct net_device *dev = p->dev;
+ struct net_bridge *br = p->br;
+
+ if (!(p->flags & BR_ADMIN_COST) &&
+ netif_running(dev) && netif_oper_up(dev))
+ p->path_cost = port_cost(dev);
+
+ if (!netif_running(br->dev))
+ return;
+
+ spin_lock_bh(&br->lock);
+ if (netif_running(dev) && netif_oper_up(dev)) {
+ if (p->state == BR_STATE_DISABLED)
+ br_stp_enable_port(p);
+ } else {
+ if (p->state != BR_STATE_DISABLED)
+ br_stp_disable_port(p);
+ }
+ spin_unlock_bh(&br->lock);
+}
+
+static void br_port_set_promisc(struct net_bridge_port *p)
+{
+ int err = 0;
+
+ if (br_promisc_port(p))
+ return;
+
+ err = dev_set_promiscuity(p->dev, 1);
+ if (err)
+ return;
+
+ br_fdb_unsync_static(p->br, p);
+ p->flags |= BR_PROMISC;
+}
+
+static void br_port_clear_promisc(struct net_bridge_port *p)
+{
+ int err;
+
+ /* Check if the port is already non-promisc or if it doesn't
+ * support UNICAST filtering. Without unicast filtering support
+ * we'll end up re-enabling promisc mode anyway, so just check for
+ * it here.
+ */
+ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
+ return;
+
+ /* Since we'll be clearing the promisc mode, program the port
+ * first so that we don't have interruption in traffic.
+ */
+ err = br_fdb_sync_static(p->br, p);
+ if (err)
+ return;
+
+ dev_set_promiscuity(p->dev, -1);
+ p->flags &= ~BR_PROMISC;
+}
+
+/* When a port is added or removed or when certain port flags
+ * change, this function is called to automatically manage
+ * promiscuity setting of all the bridge ports. We are always called
+ * under RTNL so can skip using rcu primitives.
+ */
+void br_manage_promisc(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ bool set_all = false;
+
+ /* If vlan filtering is disabled or bridge interface is placed
+ * into promiscuous mode, place all ports in promiscuous mode.
+ */
+ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br))
+ set_all = true;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (set_all) {
+ br_port_set_promisc(p);
+ } else {
+ /* If the number of auto-ports is <= 1, then all other
+ * ports will have their output configuration
+ * statically specified through fdbs. Since ingress
+ * on the auto-port becomes forwarding/egress to other
+ * ports and egress configuration is statically known,
+ * we can say that ingress configuration of the
+ * auto-port is also statically known.
+ * This lets us disable promiscuous mode and write
+ * this config to hw.
+ */
+ if (br->auto_cnt == 0 ||
+ (br->auto_cnt == 1 && br_auto_port(p)))
+ br_port_clear_promisc(p);
+ else
+ br_port_set_promisc(p);
+ }
+ }
+}
+
+static void nbp_update_port_count(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ u32 cnt = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (br_auto_port(p))
+ cnt++;
+ }
+ if (br->auto_cnt != cnt) {
+ br->auto_cnt = cnt;
+ br_manage_promisc(br);
+ }
+}
+
+static void nbp_delete_promisc(struct net_bridge_port *p)
+{
+ /* If port is currently promiscuous, unset promiscuity.
+ * Otherwise, it is a static port so remove all addresses
+ * from it.
+ */
+ dev_set_allmulti(p->dev, -1);
+ if (br_promisc_port(p))
+ dev_set_promiscuity(p->dev, -1);
+ else
+ br_fdb_unsync_static(p->br, p);
+}
+
+static void release_nbp(struct kobject *kobj)
+{
+ struct net_bridge_port *p
+ = container_of(kobj, struct net_bridge_port, kobj);
+ kfree(p);
+}
+
+static struct kobj_type brport_ktype = {
+#ifdef CONFIG_SYSFS
+ .sysfs_ops = &brport_sysfs_ops,
+#endif
+ .release = release_nbp,
+};
+
+static void destroy_nbp(struct net_bridge_port *p)
+{
+ struct net_device *dev = p->dev;
+
+ p->br = NULL;
+ p->dev = NULL;
+ dev_put(dev);
+
+ kobject_put(&p->kobj);
+}
+
+static void destroy_nbp_rcu(struct rcu_head *head)
+{
+ struct net_bridge_port *p =
+ container_of(head, struct net_bridge_port, rcu);
+ destroy_nbp(p);
+}
+
+/* Delete port(interface) from bridge is done in two steps.
+ * via RCU. First step, marks device as down. That deletes
+ * all the timers and stops new packets from flowing through.
+ *
+ * Final cleanup doesn't occur until after all CPU's finished
+ * processing packets.
+ *
+ * Protected from multiple admin operations by RTNL mutex
+ */
+static void del_nbp(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+ struct net_device *dev = p->dev;
+
+ sysfs_remove_link(br->ifobj, p->dev->name);
+
+ nbp_delete_promisc(p);
+
+ spin_lock_bh(&br->lock);
+ br_stp_disable_port(p);
+ spin_unlock_bh(&br->lock);
+
+ br_ifinfo_notify(RTM_DELLINK, p);
+
+ list_del_rcu(&p->list);
+
+ nbp_vlan_flush(p);
+ br_fdb_delete_by_port(br, p, 1);
+ nbp_update_port_count(br);
+
+ netdev_upper_dev_unlink(dev, br->dev);
+
+ dev->priv_flags &= ~IFF_BRIDGE_PORT;
+
+ netdev_rx_handler_unregister(dev);
+
+ br_multicast_del_port(p);
+
+ kobject_uevent(&p->kobj, KOBJ_REMOVE);
+ kobject_del(&p->kobj);
+
+ br_netpoll_disable(p);
+
+ call_rcu(&p->rcu, destroy_nbp_rcu);
+}
+
+/* Delete bridge device */
+void br_dev_delete(struct net_device *dev, struct list_head *head)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p, *n;
+
+ list_for_each_entry_safe(p, n, &br->port_list, list) {
+ del_nbp(p);
+ }
+
+ br_fdb_delete_by_port(br, NULL, 1);
+
+ br_vlan_flush(br);
+ del_timer_sync(&br->gc_timer);
+
+ br_sysfs_delbr(br->dev);
+ unregister_netdevice_queue(br->dev, head);
+}
+
+/* find an available port number */
+static int find_portno(struct net_bridge *br)
+{
+ int index;
+ struct net_bridge_port *p;
+ unsigned long *inuse;
+
+ inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ set_bit(0, inuse); /* zero is reserved */
+ list_for_each_entry(p, &br->port_list, list) {
+ set_bit(p->port_no, inuse);
+ }
+ index = find_first_zero_bit(inuse, BR_MAX_PORTS);
+ kfree(inuse);
+
+ return (index >= BR_MAX_PORTS) ? -EXFULL : index;
+}
+
+/* called with RTNL but without bridge lock */
+static struct net_bridge_port *new_nbp(struct net_bridge *br,
+ struct net_device *dev)
+{
+ int index;
+ struct net_bridge_port *p;
+
+ index = find_portno(br);
+ if (index < 0)
+ return ERR_PTR(index);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (p == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ p->br = br;
+ dev_hold(dev);
+ p->dev = dev;
+ p->path_cost = port_cost(dev);
+ p->priority = 0x8000 >> BR_PORT_BITS;
+ p->port_no = index;
+ p->flags = BR_LEARNING | BR_FLOOD;
+ br_init_port(p);
+ br_set_state(p, BR_STATE_DISABLED);
+ br_stp_port_timer_init(p);
+ br_multicast_add_port(p);
+
+ return p;
+}
+
+int br_add_bridge(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int res;
+
+ dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
+ br_dev_setup);
+
+ if (!dev)
+ return -ENOMEM;
+
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = &br_link_ops;
+
+ res = register_netdev(dev);
+ if (res)
+ free_netdev(dev);
+ return res;
+}
+
+int br_del_bridge(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = __dev_get_by_name(net, name);
+ if (dev == NULL)
+ ret = -ENXIO; /* Could not find device */
+
+ else if (!(dev->priv_flags & IFF_EBRIDGE)) {
+ /* Attempt to delete non bridge device! */
+ ret = -EPERM;
+ }
+
+ else if (dev->flags & IFF_UP) {
+ /* Not shutdown yet. */
+ ret = -EBUSY;
+ }
+
+ else
+ br_dev_delete(dev, NULL);
+
+ rtnl_unlock();
+ return ret;
+}
+
+/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
+int br_min_mtu(const struct net_bridge *br)
+{
+ const struct net_bridge_port *p;
+ int mtu = 0;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&br->port_list))
+ mtu = ETH_DATA_LEN;
+ else {
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!mtu || p->dev->mtu < mtu)
+ mtu = p->dev->mtu;
+ }
+ }
+ return mtu;
+}
+
+/*
+ * Recomputes features using slave's features
+ */
+netdev_features_t br_features_recompute(struct net_bridge *br,
+ netdev_features_t features)
+{
+ struct net_bridge_port *p;
+ netdev_features_t mask;
+
+ if (list_empty(&br->port_list))
+ return features;
+
+ mask = features;
+ features &= ~NETIF_F_ONE_FOR_ALL;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ features = netdev_increment_features(features,
+ p->dev->features, mask);
+ }
+ features = netdev_add_tso_features(features, mask);
+
+ return features;
+}
+
+/* called with RTNL */
+int br_add_if(struct net_bridge *br, struct net_device *dev)
+{
+ struct net_bridge_port *p;
+ int err = 0;
+ bool changed_addr;
+
+ /* Don't allow bridging non-ethernet like devices, or DSA-enabled
+ * master network devices since the bridge layer rx_handler prevents
+ * the DSA fake ethertype handler to be invoked, so we do not strip off
+ * the DSA switch tag protocol header and the bridge layer just return
+ * RX_HANDLER_CONSUMED, stopping RX processing for these frames.
+ */
+ if ((dev->flags & IFF_LOOPBACK) ||
+ dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
+ !is_valid_ether_addr(dev->dev_addr) ||
+ netdev_uses_dsa(dev))
+ return -EINVAL;
+
+ /* No bridging of bridges */
+ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
+ return -ELOOP;
+
+ /* Device is already being bridged */
+ if (br_port_exists(dev))
+ return -EBUSY;
+
+ /* No bridging devices that dislike that (e.g. wireless) */
+ if (dev->priv_flags & IFF_DONT_BRIDGE)
+ return -EOPNOTSUPP;
+
+ p = new_nbp(br, dev);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ call_netdevice_notifiers(NETDEV_JOIN, dev);
+
+ err = dev_set_allmulti(dev, 1);
+ if (err)
+ goto put_back;
+
+ err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
+ SYSFS_BRIDGE_PORT_ATTR);
+ if (err)
+ goto err1;
+
+ err = br_sysfs_addif(p);
+ if (err)
+ goto err2;
+
+ err = br_netpoll_enable(p);
+ if (err)
+ goto err3;
+
+ err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ if (err)
+ goto err4;
+
+ dev->priv_flags |= IFF_BRIDGE_PORT;
+
+ err = netdev_master_upper_dev_link(dev, br->dev);
+ if (err)
+ goto err5;
+
+ dev_disable_lro(dev);
+
+ list_add_rcu(&p->list, &br->port_list);
+
+ nbp_update_port_count(br);
+
+ netdev_update_features(br->dev);
+
+ if (br->dev->needed_headroom < dev->needed_headroom)
+ br->dev->needed_headroom = dev->needed_headroom;
+
+ if (br_fdb_insert(br, p, dev->dev_addr, 0))
+ netdev_err(dev, "failed insert local address bridge forwarding table\n");
+
+ if (nbp_vlan_init(p))
+ netdev_err(dev, "failed to initialize vlan filtering on this port\n");
+
+ spin_lock_bh(&br->lock);
+ changed_addr = br_stp_recalculate_bridge_id(br);
+
+ if (netif_running(dev) && netif_oper_up(dev) &&
+ (br->dev->flags & IFF_UP))
+ br_stp_enable_port(p);
+ spin_unlock_bh(&br->lock);
+
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ dev_set_mtu(br->dev, br_min_mtu(br));
+
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+
+ return 0;
+
+err5:
+ dev->priv_flags &= ~IFF_BRIDGE_PORT;
+ netdev_rx_handler_unregister(dev);
+err4:
+ br_netpoll_disable(p);
+err3:
+ sysfs_remove_link(br->ifobj, p->dev->name);
+err2:
+ kobject_put(&p->kobj);
+ p = NULL; /* kobject_put frees */
+err1:
+ dev_set_allmulti(dev, -1);
+put_back:
+ dev_put(dev);
+ kfree(p);
+ return err;
+}
+
+/* called with RTNL */
+int br_del_if(struct net_bridge *br, struct net_device *dev)
+{
+ struct net_bridge_port *p;
+ bool changed_addr;
+
+ p = br_port_get_rtnl(dev);
+ if (!p || p->br != br)
+ return -EINVAL;
+
+ /* Since more than one interface can be attached to a bridge,
+ * there still maybe an alternate path for netconsole to use;
+ * therefore there is no reason for a NETDEV_RELEASE event.
+ */
+ del_nbp(p);
+
+ dev_set_mtu(br->dev, br_min_mtu(br));
+
+ spin_lock_bh(&br->lock);
+ changed_addr = br_stp_recalculate_bridge_id(br);
+ spin_unlock_bh(&br->lock);
+
+ if (changed_addr)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
+
+ netdev_update_features(br->dev);
+
+ return 0;
+}
+
+void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
+{
+ struct net_bridge *br = p->br;
+
+ if (mask & BR_AUTO_MASK)
+ nbp_update_port_count(br);
+}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
new file mode 100644
index 000000000..f921a5dce
--- /dev/null
+++ b/net/bridge/br_input.c
@@ -0,0 +1,315 @@
+/*
+ * Handle incoming frames
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <linux/export.h>
+#include <linux/rculist.h>
+#include "br_private.h"
+
+/* Hook for brouter */
+br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
+EXPORT_SYMBOL(br_should_route_hook);
+
+static int br_pass_frame_up(struct sk_buff *skb)
+{
+ struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
+ struct net_bridge *br = netdev_priv(brdev);
+ struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
+ struct net_port_vlans *pv;
+
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->rx_packets++;
+ brstats->rx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
+ /* Bridge is just like any other port. Make sure the
+ * packet is allowed except in promisc modue when someone
+ * may be running packet capture.
+ */
+ pv = br_get_vlan_info(br);
+ if (!(brdev->flags & IFF_PROMISC) &&
+ !br_allowed_egress(br, pv, skb)) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ indev = skb->dev;
+ skb->dev = brdev;
+ skb = br_handle_vlan(br, pv, skb);
+ if (!skb)
+ return NET_RX_DROP;
+
+ return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb,
+ indev, NULL,
+ netif_receive_skb_sk);
+}
+
+static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p)
+{
+ struct net_device *dev = br->dev;
+ struct neighbour *n;
+ struct arphdr *parp;
+ u8 *arpptr, *sha;
+ __be32 sip, tip;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+ if (dev->flags & IFF_NOARP)
+ return;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
+ dev->stats.tx_dropped++;
+ return;
+ }
+ parp = arp_hdr(skb);
+
+ if (parp->ar_pro != htons(ETH_P_IP) ||
+ parp->ar_op != htons(ARPOP_REQUEST) ||
+ parp->ar_hln != dev->addr_len ||
+ parp->ar_pln != 4)
+ return;
+
+ arpptr = (u8 *)parp + sizeof(struct arphdr);
+ sha = arpptr;
+ arpptr += dev->addr_len; /* sha */
+ memcpy(&sip, arpptr, sizeof(sip));
+ arpptr += sizeof(sip);
+ arpptr += dev->addr_len; /* tha */
+ memcpy(&tip, arpptr, sizeof(tip));
+
+ if (ipv4_is_loopback(tip) ||
+ ipv4_is_multicast(tip))
+ return;
+
+ n = neigh_lookup(&arp_tbl, &tip, dev);
+ if (n) {
+ struct net_bridge_fdb_entry *f;
+
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_release(n);
+ return;
+ }
+
+ f = __br_fdb_get(br, n->ha, vid);
+ if (f && ((p->flags & BR_PROXYARP) ||
+ (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
+ sha, n->ha, sha);
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ }
+
+ neigh_release(n);
+ }
+}
+
+/* note: already called with rcu_read_lock */
+int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb)
+{
+ const unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+ struct net_bridge *br;
+ struct net_bridge_fdb_entry *dst;
+ struct net_bridge_mdb_entry *mdst;
+ struct sk_buff *skb2;
+ bool unicast = true;
+ u16 vid = 0;
+
+ if (!p || p->state == BR_STATE_DISABLED)
+ goto drop;
+
+ if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
+ goto out;
+
+ /* insert into forwarding database after filtering to avoid spoofing */
+ br = p->br;
+ if (p->flags & BR_LEARNING)
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
+
+ if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
+ br_multicast_rcv(br, p, skb, vid))
+ goto drop;
+
+ if (p->state == BR_STATE_LEARNING)
+ goto drop;
+
+ BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+
+ /* The packet skb2 goes to the local host (NULL to skip). */
+ skb2 = NULL;
+
+ if (br->dev->flags & IFF_PROMISC)
+ skb2 = skb;
+
+ dst = NULL;
+
+ if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
+ br_do_proxy_arp(skb, br, vid, p);
+
+ if (is_broadcast_ether_addr(dest)) {
+ skb2 = skb;
+ unicast = false;
+ } else if (is_multicast_ether_addr(dest)) {
+ mdst = br_mdb_get(br, skb, vid);
+ if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
+ br_multicast_querier_exists(br, eth_hdr(skb))) {
+ if ((mdst && mdst->mglist) ||
+ br_multicast_is_router(br))
+ skb2 = skb;
+ br_multicast_forward(mdst, skb, skb2);
+ skb = NULL;
+ if (!skb2)
+ goto out;
+ } else
+ skb2 = skb;
+
+ unicast = false;
+ br->dev->stats.multicast++;
+ } else if ((dst = __br_fdb_get(br, dest, vid)) &&
+ dst->is_local) {
+ skb2 = skb;
+ /* Do not forward the packet since it's local. */
+ skb = NULL;
+ }
+
+ if (skb) {
+ if (dst) {
+ dst->used = jiffies;
+ br_forward(dst->dst, skb, skb2);
+ } else
+ br_flood_forward(br, skb, skb2, unicast);
+ }
+
+ if (skb2)
+ return br_pass_frame_up(skb2);
+
+out:
+ return 0;
+drop:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(br_handle_frame_finish);
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+ u16 vid = 0;
+
+ /* check if vlan is allowed, to avoid spoofing */
+ if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
+ br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+ return 0; /* process further */
+}
+
+/*
+ * Return NULL if skb is handled
+ * note: already called with rcu_read_lock
+ */
+rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
+{
+ struct net_bridge_port *p;
+ struct sk_buff *skb = *pskb;
+ const unsigned char *dest = eth_hdr(skb)->h_dest;
+ br_should_route_hook_t *rhook;
+
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
+
+ if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return RX_HANDLER_CONSUMED;
+
+ p = br_port_get_rcu(skb->dev);
+
+ if (unlikely(is_link_local_ether_addr(dest))) {
+ u16 fwd_mask = p->br->group_fwd_mask_required;
+
+ /*
+ * See IEEE 802.1D Table 7-10 Reserved addresses
+ *
+ * Assignment Value
+ * Bridge Group Address 01-80-C2-00-00-00
+ * (MAC Control) 802.3 01-80-C2-00-00-01
+ * (Link Aggregation) 802.3 01-80-C2-00-00-02
+ * 802.1X PAE address 01-80-C2-00-00-03
+ *
+ * 802.1AB LLDP 01-80-C2-00-00-0E
+ *
+ * Others reserved for future standardization
+ */
+ switch (dest[5]) {
+ case 0x00: /* Bridge Group Address */
+ /* If STP is turned off,
+ then must forward to keep loop detection */
+ if (p->br->stp_enabled == BR_NO_STP ||
+ fwd_mask & (1u << dest[5]))
+ goto forward;
+ break;
+
+ case 0x01: /* IEEE MAC (Pause) */
+ goto drop;
+
+ default:
+ /* Allow selective forwarding for most other protocols */
+ fwd_mask |= p->br->group_fwd_mask;
+ if (fwd_mask & (1u << dest[5]))
+ goto forward;
+ }
+
+ /* Deliver packet to local host only */
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb,
+ skb->dev, NULL, br_handle_local_finish)) {
+ return RX_HANDLER_CONSUMED; /* consumed by filter */
+ } else {
+ *pskb = skb;
+ return RX_HANDLER_PASS; /* continue processing */
+ }
+ }
+
+forward:
+ switch (p->state) {
+ case BR_STATE_FORWARDING:
+ rhook = rcu_dereference(br_should_route_hook);
+ if (rhook) {
+ if ((*rhook)(skb)) {
+ *pskb = skb;
+ return RX_HANDLER_PASS;
+ }
+ dest = eth_hdr(skb)->h_dest;
+ }
+ /* fall through */
+ case BR_STATE_LEARNING:
+ if (ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_HOST;
+
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ br_handle_frame_finish);
+ break;
+ default:
+drop:
+ kfree_skb(skb);
+ }
+ return RX_HANDLER_CONSUMED;
+}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
new file mode 100644
index 000000000..8d423bc64
--- /dev/null
+++ b/net/bridge/br_ioctl.c
@@ -0,0 +1,394 @@
+/*
+ * Ioctl handler
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <asm/uaccess.h>
+#include "br_private.h"
+
+/* called with RTNL */
+static int get_bridge_ifindices(struct net *net, int *indices, int num)
+{
+ struct net_device *dev;
+ int i = 0;
+
+ for_each_netdev(net, dev) {
+ if (i >= num)
+ break;
+ if (dev->priv_flags & IFF_EBRIDGE)
+ indices[i++] = dev->ifindex;
+ }
+
+ return i;
+}
+
+/* called with RTNL */
+static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->port_no < num)
+ ifindices[p->port_no] = p->dev->ifindex;
+ }
+}
+
+/*
+ * Format up to a page worth of forwarding table entries
+ * userbuf -- where to copy result
+ * maxnum -- maximum number of entries desired
+ * (limited to a page for sanity)
+ * offset -- number of records to skip
+ */
+static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
+ unsigned long maxnum, unsigned long offset)
+{
+ int num;
+ void *buf;
+ size_t size;
+
+ /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */
+ if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry))
+ maxnum = PAGE_SIZE/sizeof(struct __fdb_entry);
+
+ size = maxnum * sizeof(struct __fdb_entry);
+
+ buf = kmalloc(size, GFP_USER);
+ if (!buf)
+ return -ENOMEM;
+
+ num = br_fdb_fillbuf(br, buf, maxnum, offset);
+ if (num > 0) {
+ if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry)))
+ num = -EFAULT;
+ }
+ kfree(buf);
+
+ return num;
+}
+
+/* called with RTNL */
+static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
+{
+ struct net *net = dev_net(br->dev);
+ struct net_device *dev;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ dev = __dev_get_by_index(net, ifindex);
+ if (dev == NULL)
+ return -EINVAL;
+
+ if (isadd)
+ ret = br_add_if(br, dev);
+ else
+ ret = br_del_if(br, dev);
+
+ return ret;
+}
+
+/*
+ * Legacy ioctl's through SIOCDEVPRIVATE
+ * This interface is deprecated because it was too difficult to
+ * to do the translation for 32/64bit ioctl compatibility.
+ */
+static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ unsigned long args[4];
+
+ if (copy_from_user(args, rq->ifr_data, sizeof(args)))
+ return -EFAULT;
+
+ switch (args[0]) {
+ case BRCTL_ADD_IF:
+ case BRCTL_DEL_IF:
+ return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF);
+
+ case BRCTL_GET_BRIDGE_INFO:
+ {
+ struct __bridge_info b;
+
+ memset(&b, 0, sizeof(struct __bridge_info));
+ rcu_read_lock();
+ memcpy(&b.designated_root, &br->designated_root, 8);
+ memcpy(&b.bridge_id, &br->bridge_id, 8);
+ b.root_path_cost = br->root_path_cost;
+ b.max_age = jiffies_to_clock_t(br->max_age);
+ b.hello_time = jiffies_to_clock_t(br->hello_time);
+ b.forward_delay = br->forward_delay;
+ b.bridge_max_age = br->bridge_max_age;
+ b.bridge_hello_time = br->bridge_hello_time;
+ b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay);
+ b.topology_change = br->topology_change;
+ b.topology_change_detected = br->topology_change_detected;
+ b.root_port = br->root_port;
+
+ b.stp_enabled = (br->stp_enabled != BR_NO_STP);
+ b.ageing_time = jiffies_to_clock_t(br->ageing_time);
+ b.hello_timer_value = br_timer_value(&br->hello_timer);
+ b.tcn_timer_value = br_timer_value(&br->tcn_timer);
+ b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
+ b.gc_timer_value = br_timer_value(&br->gc_timer);
+ rcu_read_unlock();
+
+ if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case BRCTL_GET_PORT_LIST:
+ {
+ int num, *indices;
+
+ num = args[2];
+ if (num < 0)
+ return -EINVAL;
+ if (num == 0)
+ num = 256;
+ if (num > BR_MAX_PORTS)
+ num = BR_MAX_PORTS;
+
+ indices = kcalloc(num, sizeof(int), GFP_KERNEL);
+ if (indices == NULL)
+ return -ENOMEM;
+
+ get_port_ifindices(br, indices, num);
+ if (copy_to_user((void __user *)args[1], indices, num*sizeof(int)))
+ num = -EFAULT;
+ kfree(indices);
+ return num;
+ }
+
+ case BRCTL_SET_BRIDGE_FORWARD_DELAY:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ return br_set_forward_delay(br, args[1]);
+
+ case BRCTL_SET_BRIDGE_HELLO_TIME:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ return br_set_hello_time(br, args[1]);
+
+ case BRCTL_SET_BRIDGE_MAX_AGE:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ return br_set_max_age(br, args[1]);
+
+ case BRCTL_SET_AGEING_TIME:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ br->ageing_time = clock_t_to_jiffies(args[1]);
+ return 0;
+
+ case BRCTL_GET_PORT_INFO:
+ {
+ struct __port_info p;
+ struct net_bridge_port *pt;
+
+ rcu_read_lock();
+ if ((pt = br_get_port(br, args[2])) == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ memset(&p, 0, sizeof(struct __port_info));
+ memcpy(&p.designated_root, &pt->designated_root, 8);
+ memcpy(&p.designated_bridge, &pt->designated_bridge, 8);
+ p.port_id = pt->port_id;
+ p.designated_port = pt->designated_port;
+ p.path_cost = pt->path_cost;
+ p.designated_cost = pt->designated_cost;
+ p.state = pt->state;
+ p.top_change_ack = pt->topology_change_ack;
+ p.config_pending = pt->config_pending;
+ p.message_age_timer_value = br_timer_value(&pt->message_age_timer);
+ p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer);
+ p.hold_timer_value = br_timer_value(&pt->hold_timer);
+
+ rcu_read_unlock();
+
+ if (copy_to_user((void __user *)args[1], &p, sizeof(p)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case BRCTL_SET_BRIDGE_STP_STATE:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ br_stp_set_enabled(br, args[1]);
+ return 0;
+
+ case BRCTL_SET_BRIDGE_PRIORITY:
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ br_stp_set_bridge_priority(br, args[1]);
+ return 0;
+
+ case BRCTL_SET_PORT_PRIORITY:
+ {
+ struct net_bridge_port *p;
+ int ret;
+
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&br->lock);
+ if ((p = br_get_port(br, args[1])) == NULL)
+ ret = -EINVAL;
+ else
+ ret = br_stp_set_port_priority(p, args[2]);
+ spin_unlock_bh(&br->lock);
+ return ret;
+ }
+
+ case BRCTL_SET_PATH_COST:
+ {
+ struct net_bridge_port *p;
+ int ret;
+
+ if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&br->lock);
+ if ((p = br_get_port(br, args[1])) == NULL)
+ ret = -EINVAL;
+ else
+ ret = br_stp_set_path_cost(p, args[2]);
+ spin_unlock_bh(&br->lock);
+
+ return ret;
+ }
+
+ case BRCTL_GET_FDB_ENTRIES:
+ return get_fdb_entries(br, (void __user *)args[1],
+ args[2], args[3]);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int old_deviceless(struct net *net, void __user *uarg)
+{
+ unsigned long args[3];
+
+ if (copy_from_user(args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ switch (args[0]) {
+ case BRCTL_GET_VERSION:
+ return BRCTL_VERSION;
+
+ case BRCTL_GET_BRIDGES:
+ {
+ int *indices;
+ int ret = 0;
+
+ if (args[2] >= 2048)
+ return -ENOMEM;
+ indices = kcalloc(args[2], sizeof(int), GFP_KERNEL);
+ if (indices == NULL)
+ return -ENOMEM;
+
+ args[2] = get_bridge_ifindices(net, indices, args[2]);
+
+ ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int))
+ ? -EFAULT : args[2];
+
+ kfree(indices);
+ return ret;
+ }
+
+ case BRCTL_ADD_BRIDGE:
+ case BRCTL_DEL_BRIDGE:
+ {
+ char buf[IFNAMSIZ];
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ))
+ return -EFAULT;
+
+ buf[IFNAMSIZ-1] = 0;
+
+ if (args[0] == BRCTL_ADD_BRIDGE)
+ return br_add_bridge(net, buf);
+
+ return br_del_bridge(net, buf);
+ }
+ }
+
+ return -EOPNOTSUPP;
+}
+
+int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
+{
+ switch (cmd) {
+ case SIOCGIFBR:
+ case SIOCSIFBR:
+ return old_deviceless(net, uarg);
+
+ case SIOCBRADDBR:
+ case SIOCBRDELBR:
+ {
+ char buf[IFNAMSIZ];
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(buf, uarg, IFNAMSIZ))
+ return -EFAULT;
+
+ buf[IFNAMSIZ-1] = 0;
+ if (cmd == SIOCBRADDBR)
+ return br_add_bridge(net, buf);
+
+ return br_del_bridge(net, buf);
+ }
+ }
+ return -EOPNOTSUPP;
+}
+
+int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ switch (cmd) {
+ case SIOCDEVPRIVATE:
+ return old_dev_ioctl(dev, rq, cmd);
+
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
+
+ }
+
+ br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
+ return -EOPNOTSUPP;
+}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
new file mode 100644
index 000000000..e29ad70b3
--- /dev/null
+++ b/net/bridge/br_mdb.c
@@ -0,0 +1,499 @@
+#include <linux/err.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#endif
+
+#include "br_private.h"
+
+static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+ struct nlattr *nest;
+
+ if (!br->multicast_router || hlist_empty(&br->router_list))
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_ROUTER);
+ if (nest == NULL)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
+ if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+ goto fail;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+fail:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_mdb_htable *mdb;
+ struct nlattr *nest, *nest2;
+ int i, err = 0;
+ int idx = 0, s_idx = cb->args[1];
+
+ if (br->multicast_disabled)
+ return 0;
+
+ mdb = rcu_dereference(br->mdb);
+ if (!mdb)
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_MDB);
+ if (nest == NULL)
+ return -EMSGSIZE;
+
+ for (i = 0; i < mdb->max; i++) {
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port *port;
+
+ hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
+ if (idx < s_idx)
+ goto skip;
+
+ nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+ if (nest2 == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ for (pp = &mp->ports;
+ (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+ port = p->port;
+ if (port) {
+ struct br_mdb_entry e;
+ memset(&e, 0, sizeof(e));
+ e.ifindex = port->dev->ifindex;
+ e.state = p->state;
+ if (p->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = p->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (p->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = p->addr.u.ip6;
+#endif
+ e.addr.proto = p->addr.proto;
+ if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+ nla_nest_cancel(skb, nest2);
+ err = -EMSGSIZE;
+ goto out;
+ }
+ }
+ }
+ nla_nest_end(skb, nest2);
+ skip:
+ idx++;
+ }
+ }
+
+out:
+ cb->args[1] = idx;
+ nla_nest_end(skb, nest);
+ return err;
+}
+
+static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net_device *dev;
+ struct net *net = sock_net(skb->sk);
+ struct nlmsghdr *nlh = NULL;
+ int idx = 0, s_idx;
+
+ s_idx = cb->args[0];
+
+ rcu_read_lock();
+
+ /* In theory this could be wrapped to 0... */
+ cb->seq = net->dev_base_seq + br_mdb_rehash_seq;
+
+ for_each_netdev_rcu(net, dev) {
+ if (dev->priv_flags & IFF_EBRIDGE) {
+ struct br_port_msg *bpm;
+
+ if (idx < s_idx)
+ goto skip;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_GETMDB,
+ sizeof(*bpm), NLM_F_MULTI);
+ if (nlh == NULL)
+ break;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->ifindex = dev->ifindex;
+ if (br_mdb_fill_info(skb, cb, dev) < 0)
+ goto out;
+ if (br_rports_fill_info(skb, cb, dev) < 0)
+ goto out;
+
+ cb->args[1] = 0;
+ nlmsg_end(skb, nlh);
+ skip:
+ idx++;
+ }
+ }
+
+out:
+ if (nlh)
+ nlmsg_end(skb, nlh);
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ struct br_mdb_entry *entry, u32 pid,
+ u32 seq, int type, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct br_port_msg *bpm;
+ struct nlattr *nest, *nest2;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = dev->ifindex;
+ nest = nla_nest_start(skb, MDBA_MDB);
+ if (nest == NULL)
+ goto cancel;
+ nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+ if (nest2 == NULL)
+ goto end;
+
+ if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry))
+ goto end;
+
+ nla_nest_end(skb, nest2);
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+end:
+ nla_nest_end(skb, nest);
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_mdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_port_msg))
+ + nla_total_size(sizeof(struct br_mdb_entry));
+}
+
+static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
+ int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+ struct br_ip *group, int type)
+{
+ struct br_mdb_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.ifindex = port->dev->ifindex;
+ entry.addr.proto = group->proto;
+ entry.addr.u.ip4 = group->u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ entry.addr.u.ip6 = group->u.ip6;
+#endif
+ __br_mdb_notify(dev, &entry, type);
+}
+
+static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
+{
+ if (entry->ifindex == 0)
+ return false;
+
+ if (entry->addr.proto == htons(ETH_P_IP)) {
+ if (!ipv4_is_multicast(entry->addr.u.ip4))
+ return false;
+ if (ipv4_is_local_multicast(entry->addr.u.ip4))
+ return false;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
+ return false;
+#endif
+ } else
+ return false;
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
+ return false;
+
+ return true;
+}
+
+static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct net_device **pdev, struct br_mdb_entry **pentry)
+{
+ struct net *net = sock_net(skb->sk);
+ struct br_mdb_entry *entry;
+ struct br_port_msg *bpm;
+ struct nlattr *tb[MDBA_SET_ENTRY_MAX+1];
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (bpm->ifindex == 0) {
+ pr_info("PF_BRIDGE: br_mdb_parse() with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: br_mdb_parse() with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!(dev->priv_flags & IFF_EBRIDGE)) {
+ pr_info("PF_BRIDGE: br_mdb_parse() with non-bridge\n");
+ return -EOPNOTSUPP;
+ }
+
+ *pdev = dev;
+
+ if (!tb[MDBA_SET_ENTRY] ||
+ nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
+ pr_info("PF_BRIDGE: br_mdb_parse() with invalid attr\n");
+ return -EINVAL;
+ }
+
+ entry = nla_data(tb[MDBA_SET_ENTRY]);
+ if (!is_valid_mdb_entry(entry)) {
+ pr_info("PF_BRIDGE: br_mdb_parse() with invalid entry\n");
+ return -EINVAL;
+ }
+
+ *pentry = entry;
+ return 0;
+}
+
+static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
+ struct br_ip *group, unsigned char state)
+{
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_mdb_htable *mdb;
+ int err;
+
+ mdb = mlock_dereference(br->mdb, br);
+ mp = br_mdb_ip_get(mdb, group);
+ if (!mp) {
+ mp = br_multicast_new_group(br, port, group);
+ err = PTR_ERR(mp);
+ if (IS_ERR(mp))
+ return err;
+ }
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->port == port)
+ return -EEXIST;
+ if ((unsigned long)p->port < (unsigned long)port)
+ break;
+ }
+
+ p = br_multicast_new_port_group(port, group, *pp, state);
+ if (unlikely(!p))
+ return -ENOMEM;
+ rcu_assign_pointer(*pp, p);
+
+ br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+ return 0;
+}
+
+static int __br_mdb_add(struct net *net, struct net_bridge *br,
+ struct br_mdb_entry *entry)
+{
+ struct br_ip ip;
+ struct net_device *dev;
+ struct net_bridge_port *p;
+ int ret;
+
+ if (!netif_running(br->dev) || br->multicast_disabled)
+ return -EINVAL;
+
+ dev = __dev_get_by_index(net, entry->ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(dev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+
+ ip.proto = entry->addr.proto;
+ if (ip.proto == htons(ETH_P_IP))
+ ip.u.ip4 = entry->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ ip.u.ip6 = entry->addr.u.ip6;
+#endif
+
+ spin_lock_bh(&br->multicast_lock);
+ ret = br_mdb_add_group(br, p, &ip, entry->state);
+ spin_unlock_bh(&br->multicast_lock);
+ return ret;
+}
+
+static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct br_mdb_entry *entry;
+ struct net_device *dev;
+ struct net_bridge *br;
+ int err;
+
+ err = br_mdb_parse(skb, nlh, &dev, &entry);
+ if (err < 0)
+ return err;
+
+ br = netdev_priv(dev);
+
+ err = __br_mdb_add(net, br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ return err;
+}
+
+static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip ip;
+ int err = -EINVAL;
+
+ if (!netif_running(br->dev) || br->multicast_disabled)
+ return -EINVAL;
+
+ ip.proto = entry->addr.proto;
+ if (ip.proto == htons(ETH_P_IP)) {
+ if (timer_pending(&br->ip4_other_query.timer))
+ return -EBUSY;
+
+ ip.u.ip4 = entry->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (timer_pending(&br->ip6_other_query.timer))
+ return -EBUSY;
+
+ ip.u.ip6 = entry->addr.u.ip6;
+#endif
+ }
+
+ spin_lock_bh(&br->multicast_lock);
+ mdb = mlock_dereference(br->mdb, br);
+
+ mp = br_mdb_ip_get(mdb, &ip);
+ if (!mp)
+ goto unlock;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (!p->port || p->port->dev->ifindex != entry->ifindex)
+ continue;
+
+ if (p->port->state == BR_STATE_DISABLED)
+ goto unlock;
+
+ rcu_assign_pointer(*pp, p->next);
+ hlist_del_init(&p->mglist);
+ del_timer(&p->timer);
+ call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ err = 0;
+
+ if (!mp->ports && !mp->mglist &&
+ netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ break;
+ }
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+ return err;
+}
+
+static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net_device *dev;
+ struct br_mdb_entry *entry;
+ struct net_bridge *br;
+ int err;
+
+ err = br_mdb_parse(skb, nlh, &dev, &entry);
+ if (err < 0)
+ return err;
+
+ br = netdev_priv(dev);
+
+ err = __br_mdb_del(br, entry);
+ if (!err)
+ __br_mdb_notify(dev, entry, RTM_DELMDB);
+ return err;
+}
+
+void br_mdb_init(void)
+{
+ rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
+ rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL);
+}
+
+void br_mdb_uninit(void)
+{
+ rtnl_unregister(PF_BRIDGE, RTM_GETMDB);
+ rtnl_unregister(PF_BRIDGE, RTM_NEWMDB);
+ rtnl_unregister(PF_BRIDGE, RTM_DELMDB);
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
new file mode 100644
index 000000000..ff667e18b
--- /dev/null
+++ b/net/bridge/br_multicast.c
@@ -0,0 +1,2301 @@
+/*
+ * Bridge multicast support.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/inetdevice.h>
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#endif
+
+#include "br_private.h"
+
+static void br_multicast_start_querier(struct net_bridge *br,
+ struct bridge_mcast_own_query *query);
+unsigned int br_mdb_rehash_seq;
+
+static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
+{
+ if (a->proto != b->proto)
+ return 0;
+ if (a->vid != b->vid)
+ return 0;
+ switch (a->proto) {
+ case htons(ETH_P_IP):
+ return a->u.ip4 == b->u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return ipv6_addr_equal(&a->u.ip6, &b->u.ip6);
+#endif
+ }
+ return 0;
+}
+
+static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip,
+ __u16 vid)
+{
+ return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb,
+ const struct in6_addr *ip,
+ __u16 vid)
+{
+ return jhash_2words(ipv6_addr_hash(ip), vid,
+ mdb->secret) & (mdb->max - 1);
+}
+#endif
+
+static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb,
+ struct br_ip *ip)
+{
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ return __br_ip4_hash(mdb, ip->u.ip4, ip->vid);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid);
+#endif
+ }
+ return 0;
+}
+
+static struct net_bridge_mdb_entry *__br_mdb_ip_get(
+ struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
+ if (br_ip_equal(&mp->addr, dst))
+ return mp;
+ }
+
+ return NULL;
+}
+
+struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb,
+ struct br_ip *dst)
+{
+ if (!mdb)
+ return NULL;
+
+ return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst));
+}
+
+static struct net_bridge_mdb_entry *br_mdb_ip4_get(
+ struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid)
+{
+ struct br_ip br_dst;
+
+ br_dst.u.ip4 = dst;
+ br_dst.proto = htons(ETH_P_IP);
+ br_dst.vid = vid;
+
+ return br_mdb_ip_get(mdb, &br_dst);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct net_bridge_mdb_entry *br_mdb_ip6_get(
+ struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst,
+ __u16 vid)
+{
+ struct br_ip br_dst;
+
+ br_dst.u.ip6 = *dst;
+ br_dst.proto = htons(ETH_P_IPV6);
+ br_dst.vid = vid;
+
+ return br_mdb_ip_get(mdb, &br_dst);
+}
+#endif
+
+struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+ struct sk_buff *skb, u16 vid)
+{
+ struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
+ struct br_ip ip;
+
+ if (br->multicast_disabled)
+ return NULL;
+
+ if (BR_INPUT_SKB_CB(skb)->igmp)
+ return NULL;
+
+ ip.proto = skb->protocol;
+ ip.vid = vid;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ip.u.ip4 = ip_hdr(skb)->daddr;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ip.u.ip6 = ipv6_hdr(skb)->daddr;
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ return br_mdb_ip_get(mdb, &ip);
+}
+
+static void br_mdb_free(struct rcu_head *head)
+{
+ struct net_bridge_mdb_htable *mdb =
+ container_of(head, struct net_bridge_mdb_htable, rcu);
+ struct net_bridge_mdb_htable *old = mdb->old;
+
+ mdb->old = NULL;
+ kfree(old->mhash);
+ kfree(old);
+}
+
+static int br_mdb_copy(struct net_bridge_mdb_htable *new,
+ struct net_bridge_mdb_htable *old,
+ int elasticity)
+{
+ struct net_bridge_mdb_entry *mp;
+ int maxlen;
+ int len;
+ int i;
+
+ for (i = 0; i < old->max; i++)
+ hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver])
+ hlist_add_head(&mp->hlist[new->ver],
+ &new->mhash[br_ip_hash(new, &mp->addr)]);
+
+ if (!elasticity)
+ return 0;
+
+ maxlen = 0;
+ for (i = 0; i < new->max; i++) {
+ len = 0;
+ hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver])
+ len++;
+ if (len > maxlen)
+ maxlen = len;
+ }
+
+ return maxlen > elasticity ? -EINVAL : 0;
+}
+
+void br_multicast_free_pg(struct rcu_head *head)
+{
+ struct net_bridge_port_group *p =
+ container_of(head, struct net_bridge_port_group, rcu);
+
+ kfree(p);
+}
+
+static void br_multicast_free_group(struct rcu_head *head)
+{
+ struct net_bridge_mdb_entry *mp =
+ container_of(head, struct net_bridge_mdb_entry, rcu);
+
+ kfree(mp);
+}
+
+static void br_multicast_group_expired(unsigned long data)
+{
+ struct net_bridge_mdb_entry *mp = (void *)data;
+ struct net_bridge *br = mp->br;
+ struct net_bridge_mdb_htable *mdb;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || timer_pending(&mp->timer))
+ goto out;
+
+ mp->mglist = false;
+
+ if (mp->ports)
+ goto out;
+
+ mdb = mlock_dereference(br->mdb, br);
+
+ hlist_del_rcu(&mp->hlist[mdb->ver]);
+ mdb->size--;
+
+ call_rcu_bh(&mp->rcu, br_multicast_free_group);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+
+ mdb = mlock_dereference(br->mdb, br);
+
+ mp = br_mdb_ip_get(mdb, &pg->addr);
+ if (WARN_ON(!mp))
+ return;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p != pg)
+ continue;
+
+ rcu_assign_pointer(*pp, p->next);
+ hlist_del_init(&p->mglist);
+ del_timer(&p->timer);
+ call_rcu_bh(&p->rcu, br_multicast_free_pg);
+
+ if (!mp->ports && !mp->mglist &&
+ netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+
+ return;
+ }
+
+ WARN_ON(1);
+}
+
+static void br_multicast_port_group_expired(unsigned long data)
+{
+ struct net_bridge_port_group *pg = (void *)data;
+ struct net_bridge *br = pg->port->br;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
+ hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT)
+ goto out;
+
+ br_multicast_del_pg(br, pg);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
+ int elasticity)
+{
+ struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1);
+ struct net_bridge_mdb_htable *mdb;
+ int err;
+
+ mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC);
+ if (!mdb)
+ return -ENOMEM;
+
+ mdb->max = max;
+ mdb->old = old;
+
+ mdb->mhash = kzalloc(max * sizeof(*mdb->mhash), GFP_ATOMIC);
+ if (!mdb->mhash) {
+ kfree(mdb);
+ return -ENOMEM;
+ }
+
+ mdb->size = old ? old->size : 0;
+ mdb->ver = old ? old->ver ^ 1 : 0;
+
+ if (!old || elasticity)
+ get_random_bytes(&mdb->secret, sizeof(mdb->secret));
+ else
+ mdb->secret = old->secret;
+
+ if (!old)
+ goto out;
+
+ err = br_mdb_copy(mdb, old, elasticity);
+ if (err) {
+ kfree(mdb->mhash);
+ kfree(mdb);
+ return err;
+ }
+
+ br_mdb_rehash_seq++;
+ call_rcu_bh(&mdb->rcu, br_mdb_free);
+
+out:
+ rcu_assign_pointer(*mdbp, mdb);
+
+ return 0;
+}
+
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
+ __be32 group)
+{
+ struct sk_buff *skb;
+ struct igmphdr *ih;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
+ sizeof(*ih) + 4);
+ if (!skb)
+ goto out;
+
+ skb->protocol = htons(ETH_P_IP);
+
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+
+ ether_addr_copy(eth->h_source, br->dev->dev_addr);
+ eth->h_dest[0] = 1;
+ eth->h_dest[1] = 0;
+ eth->h_dest[2] = 0x5e;
+ eth->h_dest[3] = 0;
+ eth->h_dest[4] = 0;
+ eth->h_dest[5] = 1;
+ eth->h_proto = htons(ETH_P_IP);
+ skb_put(skb, sizeof(*eth));
+
+ skb_set_network_header(skb, skb->len);
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->ihl = 6;
+ iph->tos = 0xc0;
+ iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4);
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->protocol = IPPROTO_IGMP;
+ iph->saddr = br->multicast_query_use_ifaddr ?
+ inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
+ iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
+ ip_send_check(iph);
+ skb_put(skb, 24);
+
+ skb_set_transport_header(skb, skb->len);
+ ih = igmp_hdr(skb);
+ ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+ ih->code = (group ? br->multicast_last_member_interval :
+ br->multicast_query_response_interval) /
+ (HZ / IGMP_TIMER_SCALE);
+ ih->group = group;
+ ih->csum = 0;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+ skb_put(skb, sizeof(*ih));
+
+ __skb_pull(skb, sizeof(*eth));
+
+out:
+ return skb;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
+ const struct in6_addr *group)
+{
+ struct sk_buff *skb;
+ struct ipv6hdr *ip6h;
+ struct mld_msg *mldq;
+ struct ethhdr *eth;
+ u8 *hopopt;
+ unsigned long interval;
+
+ skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
+ 8 + sizeof(*mldq));
+ if (!skb)
+ goto out;
+
+ skb->protocol = htons(ETH_P_IPV6);
+
+ /* Ethernet header */
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+
+ ether_addr_copy(eth->h_source, br->dev->dev_addr);
+ eth->h_proto = htons(ETH_P_IPV6);
+ skb_put(skb, sizeof(*eth));
+
+ /* IPv6 header + HbH option */
+ skb_set_network_header(skb, skb->len);
+ ip6h = ipv6_hdr(skb);
+
+ *(__force __be32 *)ip6h = htonl(0x60000000);
+ ip6h->payload_len = htons(8 + sizeof(*mldq));
+ ip6h->nexthdr = IPPROTO_HOPOPTS;
+ ip6h->hop_limit = 1;
+ ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+ if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
+ &ip6h->saddr)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
+
+ hopopt = (u8 *)(ip6h + 1);
+ hopopt[0] = IPPROTO_ICMPV6; /* next hdr */
+ hopopt[1] = 0; /* length of HbH */
+ hopopt[2] = IPV6_TLV_ROUTERALERT; /* Router Alert */
+ hopopt[3] = 2; /* Length of RA Option */
+ hopopt[4] = 0; /* Type = 0x0000 (MLD) */
+ hopopt[5] = 0;
+ hopopt[6] = IPV6_TLV_PAD1; /* Pad1 */
+ hopopt[7] = IPV6_TLV_PAD1; /* Pad1 */
+
+ skb_put(skb, sizeof(*ip6h) + 8);
+
+ /* ICMPv6 */
+ skb_set_transport_header(skb, skb->len);
+ mldq = (struct mld_msg *) icmp6_hdr(skb);
+
+ interval = ipv6_addr_any(group) ?
+ br->multicast_query_response_interval :
+ br->multicast_last_member_interval;
+
+ mldq->mld_type = ICMPV6_MGM_QUERY;
+ mldq->mld_code = 0;
+ mldq->mld_cksum = 0;
+ mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+ mldq->mld_reserved = 0;
+ mldq->mld_mca = *group;
+
+ /* checksum */
+ mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ sizeof(*mldq), IPPROTO_ICMPV6,
+ csum_partial(mldq,
+ sizeof(*mldq), 0));
+ skb_put(skb, sizeof(*mldq));
+
+ __skb_pull(skb, sizeof(*eth));
+
+out:
+ return skb;
+}
+#endif
+
+static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
+ struct br_ip *addr)
+{
+ switch (addr->proto) {
+ case htons(ETH_P_IP):
+ return br_ip4_multicast_alloc_query(br, addr->u.ip4);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return br_ip6_multicast_alloc_query(br, &addr->u.ip6);
+#endif
+ }
+ return NULL;
+}
+
+static struct net_bridge_mdb_entry *br_multicast_get_group(
+ struct net_bridge *br, struct net_bridge_port *port,
+ struct br_ip *group, int hash)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ unsigned int count = 0;
+ unsigned int max;
+ int elasticity;
+ int err;
+
+ mdb = rcu_dereference_protected(br->mdb, 1);
+ hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
+ count++;
+ if (unlikely(br_ip_equal(group, &mp->addr)))
+ return mp;
+ }
+
+ elasticity = 0;
+ max = mdb->max;
+
+ if (unlikely(count > br->hash_elasticity && count)) {
+ if (net_ratelimit())
+ br_info(br, "Multicast hash table "
+ "chain limit reached: %s\n",
+ port ? port->dev->name : br->dev->name);
+
+ elasticity = br->hash_elasticity;
+ }
+
+ if (mdb->size >= max) {
+ max *= 2;
+ if (unlikely(max > br->hash_max)) {
+ br_warn(br, "Multicast hash table maximum of %d "
+ "reached, disabling snooping: %s\n",
+ br->hash_max,
+ port ? port->dev->name : br->dev->name);
+ err = -E2BIG;
+disable:
+ br->multicast_disabled = 1;
+ goto err;
+ }
+ }
+
+ if (max > mdb->max || elasticity) {
+ if (mdb->old) {
+ if (net_ratelimit())
+ br_info(br, "Multicast hash table "
+ "on fire: %s\n",
+ port ? port->dev->name : br->dev->name);
+ err = -EEXIST;
+ goto err;
+ }
+
+ err = br_mdb_rehash(&br->mdb, max, elasticity);
+ if (err) {
+ br_warn(br, "Cannot rehash multicast "
+ "hash table, disabling snooping: %s, %d, %d\n",
+ port ? port->dev->name : br->dev->name,
+ mdb->size, err);
+ goto disable;
+ }
+
+ err = -EAGAIN;
+ goto err;
+ }
+
+ return NULL;
+
+err:
+ mp = ERR_PTR(err);
+ return mp;
+}
+
+struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
+ struct net_bridge_port *port, struct br_ip *group)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ int hash;
+ int err;
+
+ mdb = rcu_dereference_protected(br->mdb, 1);
+ if (!mdb) {
+ err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0);
+ if (err)
+ return ERR_PTR(err);
+ goto rehash;
+ }
+
+ hash = br_ip_hash(mdb, group);
+ mp = br_multicast_get_group(br, port, group, hash);
+ switch (PTR_ERR(mp)) {
+ case 0:
+ break;
+
+ case -EAGAIN:
+rehash:
+ mdb = rcu_dereference_protected(br->mdb, 1);
+ hash = br_ip_hash(mdb, group);
+ break;
+
+ default:
+ goto out;
+ }
+
+ mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
+ if (unlikely(!mp))
+ return ERR_PTR(-ENOMEM);
+
+ mp->br = br;
+ mp->addr = *group;
+ setup_timer(&mp->timer, br_multicast_group_expired,
+ (unsigned long)mp);
+
+ hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]);
+ mdb->size++;
+
+out:
+ return mp;
+}
+
+struct net_bridge_port_group *br_multicast_new_port_group(
+ struct net_bridge_port *port,
+ struct br_ip *group,
+ struct net_bridge_port_group __rcu *next,
+ unsigned char state)
+{
+ struct net_bridge_port_group *p;
+
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ if (unlikely(!p))
+ return NULL;
+
+ p->addr = *group;
+ p->port = port;
+ p->state = state;
+ rcu_assign_pointer(p->next, next);
+ hlist_add_head(&p->mglist, &port->mglist);
+ setup_timer(&p->timer, br_multicast_port_group_expired,
+ (unsigned long)p);
+ return p;
+}
+
+static int br_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *group)
+{
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ unsigned long now = jiffies;
+ int err;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) ||
+ (port && port->state == BR_STATE_DISABLED))
+ goto out;
+
+ mp = br_multicast_new_group(br, port, group);
+ err = PTR_ERR(mp);
+ if (IS_ERR(mp))
+ goto err;
+
+ if (!port) {
+ mp->mglist = true;
+ mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ goto out;
+ }
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->port == port)
+ goto found;
+ if ((unsigned long)p->port < (unsigned long)port)
+ break;
+ }
+
+ p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY);
+ if (unlikely(!p))
+ goto err;
+ rcu_assign_pointer(*pp, p);
+ br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+
+found:
+ mod_timer(&p->timer, now + br->multicast_membership_interval);
+out:
+ err = 0;
+
+err:
+ spin_unlock(&br->multicast_lock);
+ return err;
+}
+
+static int br_ip4_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ __be32 group,
+ __u16 vid)
+{
+ struct br_ip br_group;
+
+ if (ipv4_is_local_multicast(group))
+ return 0;
+
+ br_group.u.ip4 = group;
+ br_group.proto = htons(ETH_P_IP);
+ br_group.vid = vid;
+
+ return br_multicast_add_group(br, port, &br_group);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_add_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ const struct in6_addr *group,
+ __u16 vid)
+{
+ struct br_ip br_group;
+
+ if (ipv6_addr_is_ll_all_nodes(group))
+ return 0;
+
+ br_group.u.ip6 = *group;
+ br_group.proto = htons(ETH_P_IPV6);
+ br_group.vid = vid;
+
+ return br_multicast_add_group(br, port, &br_group);
+}
+#endif
+
+static void br_multicast_router_expired(unsigned long data)
+{
+ struct net_bridge_port *port = (void *)data;
+ struct net_bridge *br = port->br;
+
+ spin_lock(&br->multicast_lock);
+ if (port->multicast_router != 1 ||
+ timer_pending(&port->multicast_router_timer) ||
+ hlist_unhashed(&port->rlist))
+ goto out;
+
+ hlist_del_init_rcu(&port->rlist);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_local_router_expired(unsigned long data)
+{
+}
+
+static void br_multicast_querier_expired(struct net_bridge *br,
+ struct bridge_mcast_own_query *query)
+{
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || br->multicast_disabled)
+ goto out;
+
+ br_multicast_start_querier(br, query);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_querier_expired(unsigned long data)
+{
+ struct net_bridge *br = (void *)data;
+
+ br_multicast_querier_expired(br, &br->ip4_own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_querier_expired(unsigned long data)
+{
+ struct net_bridge *br = (void *)data;
+
+ br_multicast_querier_expired(br, &br->ip6_own_query);
+}
+#endif
+
+static void br_multicast_select_own_querier(struct net_bridge *br,
+ struct br_ip *ip,
+ struct sk_buff *skb)
+{
+ if (ip->proto == htons(ETH_P_IP))
+ br->ip4_querier.addr.u.ip4 = ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ br->ip6_querier.addr.u.ip6 = ipv6_hdr(skb)->saddr;
+#endif
+}
+
+static void __br_multicast_send_query(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *ip)
+{
+ struct sk_buff *skb;
+
+ skb = br_multicast_alloc_query(br, ip);
+ if (!skb)
+ return;
+
+ if (port) {
+ skb->dev = port->dev;
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
+ NULL, skb->dev,
+ br_dev_queue_push_xmit);
+ } else {
+ br_multicast_select_own_querier(br, ip, skb);
+ netif_rx(skb);
+ }
+}
+
+static void br_multicast_send_query(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct bridge_mcast_own_query *own_query)
+{
+ unsigned long time;
+ struct br_ip br_group;
+ struct bridge_mcast_other_query *other_query = NULL;
+
+ if (!netif_running(br->dev) || br->multicast_disabled ||
+ !br->multicast_querier)
+ return;
+
+ memset(&br_group.u, 0, sizeof(br_group.u));
+
+ if (port ? (own_query == &port->ip4_own_query) :
+ (own_query == &br->ip4_own_query)) {
+ other_query = &br->ip4_other_query;
+ br_group.proto = htons(ETH_P_IP);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ other_query = &br->ip6_other_query;
+ br_group.proto = htons(ETH_P_IPV6);
+#endif
+ }
+
+ if (!other_query || timer_pending(&other_query->timer))
+ return;
+
+ __br_multicast_send_query(br, port, &br_group);
+
+ time = jiffies;
+ time += own_query->startup_sent < br->multicast_startup_query_count ?
+ br->multicast_startup_query_interval :
+ br->multicast_query_interval;
+ mod_timer(&own_query->timer, time);
+}
+
+static void
+br_multicast_port_query_expired(struct net_bridge_port *port,
+ struct bridge_mcast_own_query *query)
+{
+ struct net_bridge *br = port->br;
+
+ spin_lock(&br->multicast_lock);
+ if (port->state == BR_STATE_DISABLED ||
+ port->state == BR_STATE_BLOCKING)
+ goto out;
+
+ if (query->startup_sent < br->multicast_startup_query_count)
+ query->startup_sent++;
+
+ br_multicast_send_query(port->br, port, query);
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_port_query_expired(unsigned long data)
+{
+ struct net_bridge_port *port = (void *)data;
+
+ br_multicast_port_query_expired(port, &port->ip4_own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_port_query_expired(unsigned long data)
+{
+ struct net_bridge_port *port = (void *)data;
+
+ br_multicast_port_query_expired(port, &port->ip6_own_query);
+}
+#endif
+
+void br_multicast_add_port(struct net_bridge_port *port)
+{
+ port->multicast_router = 1;
+
+ setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
+ (unsigned long)port);
+ setup_timer(&port->ip4_own_query.timer,
+ br_ip4_multicast_port_query_expired, (unsigned long)port);
+#if IS_ENABLED(CONFIG_IPV6)
+ setup_timer(&port->ip6_own_query.timer,
+ br_ip6_multicast_port_query_expired, (unsigned long)port);
+#endif
+}
+
+void br_multicast_del_port(struct net_bridge_port *port)
+{
+ del_timer_sync(&port->multicast_router_timer);
+}
+
+static void br_multicast_enable(struct bridge_mcast_own_query *query)
+{
+ query->startup_sent = 0;
+
+ if (try_to_del_timer_sync(&query->timer) >= 0 ||
+ del_timer(&query->timer))
+ mod_timer(&query->timer, jiffies);
+}
+
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+ struct net_bridge *br = port->br;
+
+ spin_lock(&br->multicast_lock);
+ if (br->multicast_disabled || !netif_running(br->dev))
+ goto out;
+
+ br_multicast_enable(&port->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ br_multicast_enable(&port->ip6_own_query);
+#endif
+
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+void br_multicast_disable_port(struct net_bridge_port *port)
+{
+ struct net_bridge *br = port->br;
+ struct net_bridge_port_group *pg;
+ struct hlist_node *n;
+
+ spin_lock(&br->multicast_lock);
+ hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
+ br_multicast_del_pg(br, pg);
+
+ if (!hlist_unhashed(&port->rlist))
+ hlist_del_init_rcu(&port->rlist);
+ del_timer(&port->multicast_router_timer);
+ del_timer(&port->ip4_own_query.timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ del_timer(&port->ip6_own_query.timer);
+#endif
+ spin_unlock(&br->multicast_lock);
+}
+
+static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct igmpv3_report *ih;
+ struct igmpv3_grec *grec;
+ int i;
+ int len;
+ int num;
+ int type;
+ int err = 0;
+ __be32 group;
+
+ if (!pskb_may_pull(skb, sizeof(*ih)))
+ return -EINVAL;
+
+ ih = igmpv3_report_hdr(skb);
+ num = ntohs(ih->ngrec);
+ len = sizeof(*ih);
+
+ for (i = 0; i < num; i++) {
+ len += sizeof(*grec);
+ if (!pskb_may_pull(skb, len))
+ return -EINVAL;
+
+ grec = (void *)(skb->data + len - sizeof(*grec));
+ group = grec->grec_mca;
+ type = grec->grec_type;
+
+ len += ntohs(grec->grec_nsrcs) * 4;
+ if (!pskb_may_pull(skb, len))
+ return -EINVAL;
+
+ /* We treat this as an IGMPv2 report for now. */
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ break;
+
+ default:
+ continue;
+ }
+
+ err = br_ip4_multicast_add_group(br, port, group, vid);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_mld2_report(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct icmp6hdr *icmp6h;
+ struct mld2_grec *grec;
+ int i;
+ int len;
+ int num;
+ int err = 0;
+
+ if (!pskb_may_pull(skb, sizeof(*icmp6h)))
+ return -EINVAL;
+
+ icmp6h = icmp6_hdr(skb);
+ num = ntohs(icmp6h->icmp6_dataun.un_data16[1]);
+ len = sizeof(*icmp6h);
+
+ for (i = 0; i < num; i++) {
+ __be16 *nsrcs, _nsrcs;
+
+ nsrcs = skb_header_pointer(skb,
+ len + offsetof(struct mld2_grec,
+ grec_nsrcs),
+ sizeof(_nsrcs), &_nsrcs);
+ if (!nsrcs)
+ return -EINVAL;
+
+ if (!pskb_may_pull(skb,
+ len + sizeof(*grec) +
+ sizeof(struct in6_addr) * ntohs(*nsrcs)))
+ return -EINVAL;
+
+ grec = (struct mld2_grec *)(skb->data + len);
+ len += sizeof(*grec) +
+ sizeof(struct in6_addr) * ntohs(*nsrcs);
+
+ /* We treat these as MLDv1 reports for now. */
+ switch (grec->grec_type) {
+ case MLD2_MODE_IS_INCLUDE:
+ case MLD2_MODE_IS_EXCLUDE:
+ case MLD2_CHANGE_TO_INCLUDE:
+ case MLD2_CHANGE_TO_EXCLUDE:
+ case MLD2_ALLOW_NEW_SOURCES:
+ case MLD2_BLOCK_OLD_SOURCES:
+ break;
+
+ default:
+ continue;
+ }
+
+ err = br_ip6_multicast_add_group(br, port, &grec->grec_mca,
+ vid);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+#endif
+
+static bool br_ip4_multicast_select_querier(struct net_bridge *br,
+ struct net_bridge_port *port,
+ __be32 saddr)
+{
+ if (!timer_pending(&br->ip4_own_query.timer) &&
+ !timer_pending(&br->ip4_other_query.timer))
+ goto update;
+
+ if (!br->ip4_querier.addr.u.ip4)
+ goto update;
+
+ if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.u.ip4))
+ goto update;
+
+ return false;
+
+update:
+ br->ip4_querier.addr.u.ip4 = saddr;
+
+ /* update protected by general multicast_lock by caller */
+ rcu_assign_pointer(br->ip4_querier.port, port);
+
+ return true;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static bool br_ip6_multicast_select_querier(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct in6_addr *saddr)
+{
+ if (!timer_pending(&br->ip6_own_query.timer) &&
+ !timer_pending(&br->ip6_other_query.timer))
+ goto update;
+
+ if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.u.ip6) <= 0)
+ goto update;
+
+ return false;
+
+update:
+ br->ip6_querier.addr.u.ip6 = *saddr;
+
+ /* update protected by general multicast_lock by caller */
+ rcu_assign_pointer(br->ip6_querier.port, port);
+
+ return true;
+}
+#endif
+
+static bool br_multicast_select_querier(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *saddr)
+{
+ switch (saddr->proto) {
+ case htons(ETH_P_IP):
+ return br_ip4_multicast_select_querier(br, port, saddr->u.ip4);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6);
+#endif
+ }
+
+ return false;
+}
+
+static void
+br_multicast_update_query_timer(struct net_bridge *br,
+ struct bridge_mcast_other_query *query,
+ unsigned long max_delay)
+{
+ if (!timer_pending(&query->timer))
+ query->delay_time = jiffies + max_delay;
+
+ mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
+}
+
+/*
+ * Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ */
+static void br_multicast_add_router(struct net_bridge *br,
+ struct net_bridge_port *port)
+{
+ struct net_bridge_port *p;
+ struct hlist_node *slot = NULL;
+
+ if (!hlist_unhashed(&port->rlist))
+ return;
+
+ hlist_for_each_entry(p, &br->router_list, rlist) {
+ if ((unsigned long) port >= (unsigned long) p)
+ break;
+ slot = &p->rlist;
+ }
+
+ if (slot)
+ hlist_add_behind_rcu(&port->rlist, slot);
+ else
+ hlist_add_head_rcu(&port->rlist, &br->router_list);
+}
+
+static void br_multicast_mark_router(struct net_bridge *br,
+ struct net_bridge_port *port)
+{
+ unsigned long now = jiffies;
+
+ if (!port) {
+ if (br->multicast_router == 1)
+ mod_timer(&br->multicast_router_timer,
+ now + br->multicast_querier_interval);
+ return;
+ }
+
+ if (port->multicast_router != 1)
+ return;
+
+ br_multicast_add_router(br, port);
+
+ mod_timer(&port->multicast_router_timer,
+ now + br->multicast_querier_interval);
+}
+
+static void br_multicast_query_received(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct bridge_mcast_other_query *query,
+ struct br_ip *saddr,
+ unsigned long max_delay)
+{
+ if (!br_multicast_select_querier(br, port, saddr))
+ return;
+
+ br_multicast_update_query_timer(br, query, max_delay);
+ br_multicast_mark_router(br, port);
+}
+
+static int br_ip4_multicast_query(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct igmphdr *ih = igmp_hdr(skb);
+ struct net_bridge_mdb_entry *mp;
+ struct igmpv3_query *ih3;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip saddr;
+ unsigned long max_delay;
+ unsigned long now = jiffies;
+ __be32 group;
+ int err = 0;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) ||
+ (port && port->state == BR_STATE_DISABLED))
+ goto out;
+
+ group = ih->group;
+
+ if (skb->len == sizeof(*ih)) {
+ max_delay = ih->code * (HZ / IGMP_TIMER_SCALE);
+
+ if (!max_delay) {
+ max_delay = 10 * HZ;
+ group = 0;
+ }
+ } else {
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ih3 = igmpv3_query_hdr(skb);
+ if (ih3->nsrcs)
+ goto out;
+
+ max_delay = ih3->code ?
+ IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
+ }
+
+ /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+ * all-systems destination addresses (224.0.0.1) for general queries
+ */
+ if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!group) {
+ saddr.proto = htons(ETH_P_IP);
+ saddr.u.ip4 = iph->saddr;
+
+ br_multicast_query_received(br, port, &br->ip4_other_query,
+ &saddr, max_delay);
+ goto out;
+ }
+
+ mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid);
+ if (!mp)
+ goto out;
+
+ max_delay *= br->multicast_last_member_count;
+
+ if (mp->mglist &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, now + max_delay) :
+ try_to_del_timer_sync(&mp->timer) >= 0))
+ mod_timer(&mp->timer, now + max_delay);
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, now + max_delay) :
+ try_to_del_timer_sync(&p->timer) >= 0)
+ mod_timer(&p->timer, now + max_delay);
+ }
+
+out:
+ spin_unlock(&br->multicast_lock);
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_ip6_multicast_query(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct mld_msg *mld;
+ struct net_bridge_mdb_entry *mp;
+ struct mld2_query *mld2q;
+ struct net_bridge_port_group *p;
+ struct net_bridge_port_group __rcu **pp;
+ struct br_ip saddr;
+ unsigned long max_delay;
+ unsigned long now = jiffies;
+ const struct in6_addr *group = NULL;
+ bool is_general_query;
+ int err = 0;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) ||
+ (port && port->state == BR_STATE_DISABLED))
+ goto out;
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
+ if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (skb->len == sizeof(*mld)) {
+ if (!pskb_may_pull(skb, sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *) icmp6_hdr(skb);
+ max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
+ if (max_delay)
+ group = &mld->mld_mca;
+ } else {
+ if (!pskb_may_pull(skb, sizeof(*mld2q))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld2q = (struct mld2_query *)icmp6_hdr(skb);
+ if (!mld2q->mld2q_nsrcs)
+ group = &mld2q->mld2q_mca;
+
+ max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
+ }
+
+ is_general_query = group && ipv6_addr_any(group);
+
+ /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
+ * all-nodes destination address (ff02::1) for general queries
+ */
+ if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (is_general_query) {
+ saddr.proto = htons(ETH_P_IPV6);
+ saddr.u.ip6 = ip6h->saddr;
+
+ br_multicast_query_received(br, port, &br->ip6_other_query,
+ &saddr, max_delay);
+ goto out;
+ } else if (!group) {
+ goto out;
+ }
+
+ mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid);
+ if (!mp)
+ goto out;
+
+ max_delay *= br->multicast_last_member_count;
+ if (mp->mglist &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, now + max_delay) :
+ try_to_del_timer_sync(&mp->timer) >= 0))
+ mod_timer(&mp->timer, now + max_delay);
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, now + max_delay) :
+ try_to_del_timer_sync(&p->timer) >= 0)
+ mod_timer(&p->timer, now + max_delay);
+ }
+
+out:
+ spin_unlock(&br->multicast_lock);
+ return err;
+}
+#endif
+
+static void
+br_multicast_leave_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct br_ip *group,
+ struct bridge_mcast_other_query *other_query,
+ struct bridge_mcast_own_query *own_query)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port_group *p;
+ unsigned long now;
+ unsigned long time;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) ||
+ (port && port->state == BR_STATE_DISABLED) ||
+ timer_pending(&other_query->timer))
+ goto out;
+
+ mdb = mlock_dereference(br->mdb, br);
+ mp = br_mdb_ip_get(mdb, group);
+ if (!mp)
+ goto out;
+
+ if (br->multicast_querier) {
+ __br_multicast_send_query(br, port, &mp->addr);
+
+ time = jiffies + br->multicast_last_member_count *
+ br->multicast_last_member_interval;
+
+ mod_timer(&own_query->timer, time);
+
+ for (p = mlock_dereference(mp->ports, br);
+ p != NULL;
+ p = mlock_dereference(p->next, br)) {
+ if (p->port != port)
+ continue;
+
+ if (!hlist_unhashed(&p->mglist) &&
+ (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, time) :
+ try_to_del_timer_sync(&p->timer) >= 0)) {
+ mod_timer(&p->timer, time);
+ }
+
+ break;
+ }
+ }
+
+ if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
+ struct net_bridge_port_group __rcu **pp;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->port != port)
+ continue;
+
+ rcu_assign_pointer(*pp, p->next);
+ hlist_del_init(&p->mglist);
+ del_timer(&p->timer);
+ call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ br_mdb_notify(br->dev, port, group, RTM_DELMDB);
+
+ if (!mp->ports && !mp->mglist &&
+ netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ }
+ goto out;
+ }
+
+ now = jiffies;
+ time = now + br->multicast_last_member_count *
+ br->multicast_last_member_interval;
+
+ if (!port) {
+ if (mp->mglist &&
+ (timer_pending(&mp->timer) ?
+ time_after(mp->timer.expires, time) :
+ try_to_del_timer_sync(&mp->timer) >= 0)) {
+ mod_timer(&mp->timer, time);
+ }
+
+ goto out;
+ }
+
+ for (p = mlock_dereference(mp->ports, br);
+ p != NULL;
+ p = mlock_dereference(p->next, br)) {
+ if (p->port != port)
+ continue;
+
+ if (!hlist_unhashed(&p->mglist) &&
+ (timer_pending(&p->timer) ?
+ time_after(p->timer.expires, time) :
+ try_to_del_timer_sync(&p->timer) >= 0)) {
+ mod_timer(&p->timer, time);
+ }
+
+ break;
+ }
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_leave_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ __be32 group,
+ __u16 vid)
+{
+ struct br_ip br_group;
+ struct bridge_mcast_own_query *own_query;
+
+ if (ipv4_is_local_multicast(group))
+ return;
+
+ own_query = port ? &port->ip4_own_query : &br->ip4_own_query;
+
+ br_group.u.ip4 = group;
+ br_group.proto = htons(ETH_P_IP);
+ br_group.vid = vid;
+
+ br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
+ own_query);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_group(struct net_bridge *br,
+ struct net_bridge_port *port,
+ const struct in6_addr *group,
+ __u16 vid)
+{
+ struct br_ip br_group;
+ struct bridge_mcast_own_query *own_query;
+
+ if (ipv6_addr_is_ll_all_nodes(group))
+ return;
+
+ own_query = port ? &port->ip6_own_query : &br->ip6_own_query;
+
+ br_group.u.ip6 = *group;
+ br_group.proto = htons(ETH_P_IPV6);
+ br_group.vid = vid;
+
+ br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
+ own_query);
+}
+#endif
+
+static int br_multicast_ipv4_rcv(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct sk_buff *skb2 = skb;
+ const struct iphdr *iph;
+ struct igmphdr *ih;
+ unsigned int len;
+ unsigned int offset;
+ int err;
+
+ /* We treat OOM as packet loss for now. */
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (iph->ihl < 5 || iph->version != 4)
+ return -EINVAL;
+
+ if (!pskb_may_pull(skb, ip_hdrlen(skb)))
+ return -EINVAL;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ return -EINVAL;
+
+ if (iph->protocol != IPPROTO_IGMP) {
+ if (!ipv4_is_local_multicast(iph->daddr))
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ return 0;
+ }
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len || len < ip_hdrlen(skb))
+ return -EINVAL;
+
+ if (skb->len > len) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = pskb_trim_rcsum(skb2, len);
+ if (err)
+ goto err_out;
+ }
+
+ len -= ip_hdrlen(skb2);
+ offset = skb_network_offset(skb2) + ip_hdrlen(skb2);
+ __skb_pull(skb2, offset);
+ skb_reset_transport_header(skb2);
+
+ err = -EINVAL;
+ if (!pskb_may_pull(skb2, sizeof(*ih)))
+ goto out;
+
+ switch (skb2->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (!csum_fold(skb2->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb2->csum = 0;
+ if (skb_checksum_complete(skb2))
+ goto out;
+ }
+
+ err = 0;
+
+ BR_INPUT_SKB_CB(skb)->igmp = 1;
+ ih = igmp_hdr(skb2);
+
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+ break;
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
+ break;
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ err = br_ip4_multicast_query(br, port, skb2, vid);
+ break;
+ case IGMP_HOST_LEAVE_MESSAGE:
+ br_ip4_multicast_leave_group(br, port, ih->group, vid);
+ break;
+ }
+
+out:
+ __skb_push(skb2, offset);
+err_out:
+ if (skb2 != skb)
+ kfree_skb(skb2);
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int br_multicast_ipv6_rcv(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ struct sk_buff *skb2;
+ const struct ipv6hdr *ip6h;
+ u8 icmp6_type;
+ u8 nexthdr;
+ __be16 frag_off;
+ unsigned int len;
+ int offset;
+ int err;
+
+ if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ return -EINVAL;
+
+ ip6h = ipv6_hdr(skb);
+
+ /*
+ * We're interested in MLD messages only.
+ * - Version is 6
+ * - MLD has always Router Alert hop-by-hop option
+ * - But we do not support jumbrograms.
+ */
+ if (ip6h->version != 6)
+ return 0;
+
+ /* Prevent flooding this packet if there is no listener present */
+ if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr))
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+
+ if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
+ ip6h->payload_len == 0)
+ return 0;
+
+ len = ntohs(ip6h->payload_len) + sizeof(*ip6h);
+ if (skb->len < len)
+ return -EINVAL;
+
+ nexthdr = ip6h->nexthdr;
+ offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
+
+ if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
+ return 0;
+
+ /* Okay, we found ICMPv6 header */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = -EINVAL;
+ if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
+ goto out;
+
+ len -= offset - skb_network_offset(skb2);
+
+ __skb_pull(skb2, offset);
+ skb_reset_transport_header(skb2);
+ skb_postpull_rcsum(skb2, skb_network_header(skb2),
+ skb_network_header_len(skb2));
+
+ icmp6_type = icmp6_hdr(skb2)->icmp6_type;
+
+ switch (icmp6_type) {
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ case ICMPV6_MLD2_REPORT:
+ break;
+ default:
+ err = 0;
+ goto out;
+ }
+
+ /* Okay, we found MLD message. Check further. */
+ if (skb2->len > len) {
+ err = pskb_trim_rcsum(skb2, len);
+ if (err)
+ goto out;
+ err = -EINVAL;
+ }
+
+ ip6h = ipv6_hdr(skb2);
+
+ switch (skb2->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len,
+ IPPROTO_ICMPV6, skb2->csum))
+ break;
+ /*FALLTHROUGH*/
+ case CHECKSUM_NONE:
+ skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+ &ip6h->daddr,
+ skb2->len,
+ IPPROTO_ICMPV6, 0));
+ if (__skb_checksum_complete(skb2))
+ goto out;
+ }
+
+ err = 0;
+
+ BR_INPUT_SKB_CB(skb)->igmp = 1;
+
+ switch (icmp6_type) {
+ case ICMPV6_MGM_REPORT:
+ {
+ struct mld_msg *mld;
+ if (!pskb_may_pull(skb2, sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *)skb_transport_header(skb2);
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+ break;
+ }
+ case ICMPV6_MLD2_REPORT:
+ err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
+ break;
+ case ICMPV6_MGM_QUERY:
+ err = br_ip6_multicast_query(br, port, skb2, vid);
+ break;
+ case ICMPV6_MGM_REDUCTION:
+ {
+ struct mld_msg *mld;
+ if (!pskb_may_pull(skb2, sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *)skb_transport_header(skb2);
+ br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+ }
+ }
+
+out:
+ kfree_skb(skb2);
+ return err;
+}
+#endif
+
+int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
+ struct sk_buff *skb, u16 vid)
+{
+ BR_INPUT_SKB_CB(skb)->igmp = 0;
+ BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
+
+ if (br->multicast_disabled)
+ return 0;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return br_multicast_ipv4_rcv(br, port, skb, vid);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return br_multicast_ipv6_rcv(br, port, skb, vid);
+#endif
+ }
+
+ return 0;
+}
+
+static void br_multicast_query_expired(struct net_bridge *br,
+ struct bridge_mcast_own_query *query,
+ struct bridge_mcast_querier *querier)
+{
+ spin_lock(&br->multicast_lock);
+ if (query->startup_sent < br->multicast_startup_query_count)
+ query->startup_sent++;
+
+ RCU_INIT_POINTER(querier->port, NULL);
+ br_multicast_send_query(br, NULL, query);
+ spin_unlock(&br->multicast_lock);
+}
+
+static void br_ip4_multicast_query_expired(unsigned long data)
+{
+ struct net_bridge *br = (void *)data;
+
+ br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_query_expired(unsigned long data)
+{
+ struct net_bridge *br = (void *)data;
+
+ br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier);
+}
+#endif
+
+void br_multicast_init(struct net_bridge *br)
+{
+ br->hash_elasticity = 4;
+ br->hash_max = 512;
+
+ br->multicast_router = 1;
+ br->multicast_querier = 0;
+ br->multicast_query_use_ifaddr = 0;
+ br->multicast_last_member_count = 2;
+ br->multicast_startup_query_count = 2;
+
+ br->multicast_last_member_interval = HZ;
+ br->multicast_query_response_interval = 10 * HZ;
+ br->multicast_startup_query_interval = 125 * HZ / 4;
+ br->multicast_query_interval = 125 * HZ;
+ br->multicast_querier_interval = 255 * HZ;
+ br->multicast_membership_interval = 260 * HZ;
+
+ br->ip4_other_query.delay_time = 0;
+ br->ip4_querier.port = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ br->ip6_other_query.delay_time = 0;
+ br->ip6_querier.port = NULL;
+#endif
+
+ spin_lock_init(&br->multicast_lock);
+ setup_timer(&br->multicast_router_timer,
+ br_multicast_local_router_expired, 0);
+ setup_timer(&br->ip4_other_query.timer,
+ br_ip4_multicast_querier_expired, (unsigned long)br);
+ setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired,
+ (unsigned long)br);
+#if IS_ENABLED(CONFIG_IPV6)
+ setup_timer(&br->ip6_other_query.timer,
+ br_ip6_multicast_querier_expired, (unsigned long)br);
+ setup_timer(&br->ip6_own_query.timer, br_ip6_multicast_query_expired,
+ (unsigned long)br);
+#endif
+}
+
+static void __br_multicast_open(struct net_bridge *br,
+ struct bridge_mcast_own_query *query)
+{
+ query->startup_sent = 0;
+
+ if (br->multicast_disabled)
+ return;
+
+ mod_timer(&query->timer, jiffies);
+}
+
+void br_multicast_open(struct net_bridge *br)
+{
+ __br_multicast_open(br, &br->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ __br_multicast_open(br, &br->ip6_own_query);
+#endif
+}
+
+void br_multicast_stop(struct net_bridge *br)
+{
+ struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
+ struct hlist_node *n;
+ u32 ver;
+ int i;
+
+ del_timer_sync(&br->multicast_router_timer);
+ del_timer_sync(&br->ip4_other_query.timer);
+ del_timer_sync(&br->ip4_own_query.timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ del_timer_sync(&br->ip6_other_query.timer);
+ del_timer_sync(&br->ip6_own_query.timer);
+#endif
+
+ spin_lock_bh(&br->multicast_lock);
+ mdb = mlock_dereference(br->mdb, br);
+ if (!mdb)
+ goto out;
+
+ br->mdb = NULL;
+
+ ver = mdb->ver;
+ for (i = 0; i < mdb->max; i++) {
+ hlist_for_each_entry_safe(mp, n, &mdb->mhash[i],
+ hlist[ver]) {
+ del_timer(&mp->timer);
+ call_rcu_bh(&mp->rcu, br_multicast_free_group);
+ }
+ }
+
+ if (mdb->old) {
+ spin_unlock_bh(&br->multicast_lock);
+ rcu_barrier_bh();
+ spin_lock_bh(&br->multicast_lock);
+ WARN_ON(mdb->old);
+ }
+
+ mdb->old = mdb;
+ call_rcu_bh(&mdb->rcu, br_mdb_free);
+
+out:
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+int br_multicast_set_router(struct net_bridge *br, unsigned long val)
+{
+ int err = -ENOENT;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (!netif_running(br->dev))
+ goto unlock;
+
+ switch (val) {
+ case 0:
+ case 2:
+ del_timer(&br->multicast_router_timer);
+ /* fall through */
+ case 1:
+ br->multicast_router = val;
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+
+ return err;
+}
+
+int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
+{
+ struct net_bridge *br = p->br;
+ int err = -ENOENT;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED)
+ goto unlock;
+
+ switch (val) {
+ case 0:
+ case 1:
+ case 2:
+ p->multicast_router = val;
+ err = 0;
+
+ if (val < 2 && !hlist_unhashed(&p->rlist))
+ hlist_del_init_rcu(&p->rlist);
+
+ if (val == 1)
+ break;
+
+ del_timer(&p->multicast_router_timer);
+
+ if (val == 0)
+ break;
+
+ br_multicast_add_router(br, p);
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+unlock:
+ spin_unlock(&br->multicast_lock);
+
+ return err;
+}
+
+static void br_multicast_start_querier(struct net_bridge *br,
+ struct bridge_mcast_own_query *query)
+{
+ struct net_bridge_port *port;
+
+ __br_multicast_open(br, query);
+
+ list_for_each_entry(port, &br->port_list, list) {
+ if (port->state == BR_STATE_DISABLED ||
+ port->state == BR_STATE_BLOCKING)
+ continue;
+
+ if (query == &br->ip4_own_query)
+ br_multicast_enable(&port->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ br_multicast_enable(&port->ip6_own_query);
+#endif
+ }
+}
+
+int br_multicast_toggle(struct net_bridge *br, unsigned long val)
+{
+ int err = 0;
+ struct net_bridge_mdb_htable *mdb;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br->multicast_disabled == !val)
+ goto unlock;
+
+ br->multicast_disabled = !val;
+ if (br->multicast_disabled)
+ goto unlock;
+
+ if (!netif_running(br->dev))
+ goto unlock;
+
+ mdb = mlock_dereference(br->mdb, br);
+ if (mdb) {
+ if (mdb->old) {
+ err = -EEXIST;
+rollback:
+ br->multicast_disabled = !!val;
+ goto unlock;
+ }
+
+ err = br_mdb_rehash(&br->mdb, mdb->max,
+ br->hash_elasticity);
+ if (err)
+ goto rollback;
+ }
+
+ br_multicast_start_querier(br, &br->ip4_own_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ br_multicast_start_querier(br, &br->ip6_own_query);
+#endif
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+
+ return err;
+}
+
+int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
+{
+ unsigned long max_delay;
+
+ val = !!val;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br->multicast_querier == val)
+ goto unlock;
+
+ br->multicast_querier = val;
+ if (!val)
+ goto unlock;
+
+ max_delay = br->multicast_query_response_interval;
+
+ if (!timer_pending(&br->ip4_other_query.timer))
+ br->ip4_other_query.delay_time = jiffies + max_delay;
+
+ br_multicast_start_querier(br, &br->ip4_own_query);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!timer_pending(&br->ip6_other_query.timer))
+ br->ip6_other_query.delay_time = jiffies + max_delay;
+
+ br_multicast_start_querier(br, &br->ip6_own_query);
+#endif
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
+
+int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val)
+{
+ int err = -ENOENT;
+ u32 old;
+ struct net_bridge_mdb_htable *mdb;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (!netif_running(br->dev))
+ goto unlock;
+
+ err = -EINVAL;
+ if (!is_power_of_2(val))
+ goto unlock;
+
+ mdb = mlock_dereference(br->mdb, br);
+ if (mdb && val < mdb->size)
+ goto unlock;
+
+ err = 0;
+
+ old = br->hash_max;
+ br->hash_max = val;
+
+ if (mdb) {
+ if (mdb->old) {
+ err = -EEXIST;
+rollback:
+ br->hash_max = old;
+ goto unlock;
+ }
+
+ err = br_mdb_rehash(&br->mdb, br->hash_max,
+ br->hash_elasticity);
+ if (err)
+ goto rollback;
+ }
+
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+
+ return err;
+}
+
+/**
+ * br_multicast_list_adjacent - Returns snooped multicast addresses
+ * @dev: The bridge port adjacent to which to retrieve addresses
+ * @br_ip_list: The list to store found, snooped multicast IP addresses in
+ *
+ * Creates a list of IP addresses (struct br_ip_list) sensed by the multicast
+ * snooping feature on all bridge ports of dev's bridge device, excluding
+ * the addresses from dev itself.
+ *
+ * Returns the number of items added to br_ip_list.
+ *
+ * Notes:
+ * - br_ip_list needs to be initialized by caller
+ * - br_ip_list might contain duplicates in the end
+ * (needs to be taken care of by caller)
+ * - br_ip_list needs to be freed by caller
+ */
+int br_multicast_list_adjacent(struct net_device *dev,
+ struct list_head *br_ip_list)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ struct net_bridge_port_group *group;
+ struct br_ip_list *entry;
+ int count = 0;
+
+ rcu_read_lock();
+ if (!br_ip_list || !br_port_exists(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ list_for_each_entry_rcu(port, &br->port_list, list) {
+ if (!port->dev || port->dev == dev)
+ continue;
+
+ hlist_for_each_entry_rcu(group, &port->mglist, mglist) {
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ goto unlock;
+
+ entry->addr = group->addr;
+ list_add(&entry->list, br_ip_list);
+ count++;
+ }
+ }
+
+unlock:
+ rcu_read_unlock();
+ return count;
+}
+EXPORT_SYMBOL_GPL(br_multicast_list_adjacent);
+
+/**
+ * br_multicast_has_querier_anywhere - Checks for a querier on a bridge
+ * @dev: The bridge port providing the bridge on which to check for a querier
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a valid querier exists anywhere on the bridged link layer.
+ * Otherwise returns false.
+ */
+bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ struct ethhdr eth;
+ bool ret = false;
+
+ rcu_read_lock();
+ if (!br_port_exists(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ memset(&eth, 0, sizeof(eth));
+ eth.h_proto = htons(proto);
+
+ ret = br_multicast_querier_exists(br, &eth);
+
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere);
+
+/**
+ * br_multicast_has_querier_adjacent - Checks for a querier behind a bridge port
+ * @dev: The bridge port adjacent to which to check for a querier
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Checks whether the given interface has a bridge on top and if so returns
+ * true if a selected querier is behind one of the other ports of this
+ * bridge. Otherwise returns false.
+ */
+bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *port;
+ bool ret = false;
+
+ rcu_read_lock();
+ if (!br_port_exists(dev))
+ goto unlock;
+
+ port = br_port_get_rcu(dev);
+ if (!port || !port->br)
+ goto unlock;
+
+ br = port->br;
+
+ switch (proto) {
+ case ETH_P_IP:
+ if (!timer_pending(&br->ip4_other_query.timer) ||
+ rcu_dereference(br->ip4_querier.port) == port)
+ goto unlock;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case ETH_P_IPV6:
+ if (!timer_pending(&br->ip6_other_query.timer) ||
+ rcu_dereference(br->ip6_querier.port) == port)
+ goto unlock;
+ break;
+#endif
+ default:
+ goto unlock;
+ }
+
+ ret = true;
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
new file mode 100644
index 000000000..60ddfbeb4
--- /dev/null
+++ b/net/bridge/br_netfilter.c
@@ -0,0 +1,1140 @@
+/*
+ * Handle firewalling
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/route.h>
+#include <net/netfilter/br_netfilter.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *brnf_sysctl_header;
+static int brnf_call_iptables __read_mostly = 1;
+static int brnf_call_ip6tables __read_mostly = 1;
+static int brnf_call_arptables __read_mostly = 1;
+static int brnf_filter_vlan_tagged __read_mostly = 0;
+static int brnf_filter_pppoe_tagged __read_mostly = 0;
+static int brnf_pass_vlan_indev __read_mostly = 0;
+#else
+#define brnf_call_iptables 1
+#define brnf_call_ip6tables 1
+#define brnf_call_arptables 1
+#define brnf_filter_vlan_tagged 0
+#define brnf_filter_pppoe_tagged 0
+#define brnf_pass_vlan_indev 0
+#endif
+
+#define IS_IP(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+
+#define IS_IPV6(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+
+#define IS_ARP(skb) \
+ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+
+static inline __be16 vlan_proto(const struct sk_buff *skb)
+{
+ if (skb_vlan_tag_present(skb))
+ return skb->protocol;
+ else if (skb->protocol == htons(ETH_P_8021Q))
+ return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+ else
+ return 0;
+}
+
+#define IS_VLAN_IP(skb) \
+ (vlan_proto(skb) == htons(ETH_P_IP) && \
+ brnf_filter_vlan_tagged)
+
+#define IS_VLAN_IPV6(skb) \
+ (vlan_proto(skb) == htons(ETH_P_IPV6) && \
+ brnf_filter_vlan_tagged)
+
+#define IS_VLAN_ARP(skb) \
+ (vlan_proto(skb) == htons(ETH_P_ARP) && \
+ brnf_filter_vlan_tagged)
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+ sizeof(struct pppoe_hdr)));
+}
+
+#define IS_PPPOE_IP(skb) \
+ (skb->protocol == htons(ETH_P_PPP_SES) && \
+ pppoe_proto(skb) == htons(PPP_IP) && \
+ brnf_filter_pppoe_tagged)
+
+#define IS_PPPOE_IPV6(skb) \
+ (skb->protocol == htons(ETH_P_PPP_SES) && \
+ pppoe_proto(skb) == htons(PPP_IPV6) && \
+ brnf_filter_pppoe_tagged)
+
+/* largest possible L2 header, see br_nf_dev_queue_xmit() */
+#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+struct brnf_frag_data {
+ char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
+ u8 encap_size;
+ u8 size;
+};
+
+static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
+#endif
+
+static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb)
+{
+ return skb->nf_bridge;
+}
+
+static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
+{
+ struct net_bridge_port *port;
+
+ port = br_port_get_rcu(dev);
+ return port ? &port->br->fake_rtable : NULL;
+}
+
+static inline struct net_device *bridge_parent(const struct net_device *dev)
+{
+ struct net_bridge_port *port;
+
+ port = br_port_get_rcu(dev);
+ return port ? port->br->dev : NULL;
+}
+
+static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
+{
+ skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
+ if (likely(skb->nf_bridge))
+ atomic_set(&(skb->nf_bridge->use), 1);
+
+ return skb->nf_bridge;
+}
+
+static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+ if (atomic_read(&nf_bridge->use) > 1) {
+ struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
+
+ if (tmp) {
+ memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
+ atomic_set(&tmp->use, 1);
+ }
+ nf_bridge_put(nf_bridge);
+ nf_bridge = tmp;
+ }
+ return nf_bridge;
+}
+
+static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case __cpu_to_be16(ETH_P_8021Q):
+ return VLAN_HLEN;
+ case __cpu_to_be16(ETH_P_PPP_SES):
+ return PPPOE_SES_HLEN;
+ default:
+ return 0;
+ }
+}
+
+static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
+{
+ unsigned int len = nf_bridge_encap_header_len(skb);
+
+ skb_push(skb, len);
+ skb->network_header -= len;
+}
+
+static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
+{
+ unsigned int len = nf_bridge_encap_header_len(skb);
+
+ skb_pull(skb, len);
+ skb->network_header += len;
+}
+
+static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
+{
+ unsigned int len = nf_bridge_encap_header_len(skb);
+
+ skb_pull_rcsum(skb, len);
+ skb->network_header += len;
+}
+
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ struct net_device *dev = skb->dev;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ /* Basic sanity checks */
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto inhdr_error;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
+ goto inhdr_error;
+
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ /* We should really parse IP options here but until
+ * somebody who actually uses IP options complains to
+ * us we'll just silently ignore the options because
+ * we're lazy!
+ */
+ return 0;
+
+inhdr_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+ return -1;
+}
+
+static void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+ switch (skb->nf_bridge->orig_proto) {
+ case BRNF_PROTO_8021Q:
+ skb->protocol = htons(ETH_P_8021Q);
+ break;
+ case BRNF_PROTO_PPPOE:
+ skb->protocol = htons(ETH_P_PPP_SES);
+ break;
+ case BRNF_PROTO_UNCHANGED:
+ break;
+ }
+}
+
+/* PF_BRIDGE/PRE_ROUTING *********************************************/
+/* Undo the changes made for ip6tables PREROUTING and continue the
+ * bridge PRE_ROUTING hook. */
+static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct rtable *rt;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+
+ rt = bridge_parent_rtable(nf_bridge->physindev);
+ if (!rt) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb_dst_set_noref(skb, &rt->dst);
+
+ skb->dev = nf_bridge->physindev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+ skb->dev, NULL,
+ br_handle_frame_finish, 1);
+
+ return 0;
+}
+
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
+{
+ struct neighbour *neigh;
+ struct dst_entry *dst;
+
+ skb->dev = bridge_parent(skb->dev);
+ if (!skb->dev)
+ goto free_skb;
+ dst = skb_dst(skb);
+ neigh = dst_neigh_lookup_skb(dst, skb);
+ if (neigh) {
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ int ret;
+
+ if (neigh->hh.hh_len) {
+ neigh_hh_bridge(&neigh->hh, skb);
+ skb->dev = nf_bridge->physindev;
+ ret = br_handle_frame_finish(sk, skb);
+ } else {
+ /* the neighbour function below overwrites the complete
+ * MAC header, so we save the Ethernet source address and
+ * protocol number.
+ */
+ skb_copy_from_linear_data_offset(skb,
+ -(ETH_HLEN-ETH_ALEN),
+ nf_bridge->neigh_header,
+ ETH_HLEN-ETH_ALEN);
+ /* tell br_dev_xmit to continue with forwarding */
+ nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+ /* FIXME Need to refragment */
+ ret = neigh->output(neigh, skb);
+ }
+ neigh_release(neigh);
+ return ret;
+ }
+free_skb:
+ kfree_skb(skb);
+ return 0;
+}
+
+static bool daddr_was_changed(const struct sk_buff *skb,
+ const struct nf_bridge_info *nf_bridge)
+{
+ return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
+}
+
+/* This requires some explaining. If DNAT has taken place,
+ * we will need to fix up the destination Ethernet address.
+ * This is also true when SNAT takes place (for the reply direction).
+ *
+ * There are two cases to consider:
+ * 1. The packet was DNAT'ed to a device in the same bridge
+ * port group as it was received on. We can still bridge
+ * the packet.
+ * 2. The packet was DNAT'ed to a different device, either
+ * a non-bridged device or another bridge port group.
+ * The packet will need to be routed.
+ *
+ * The correct way of distinguishing between these two cases is to
+ * call ip_route_input() and to look at skb->dst->dev, which is
+ * changed to the destination device if ip_route_input() succeeds.
+ *
+ * Let's first consider the case that ip_route_input() succeeds:
+ *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
+ * Otherwise, the packet is considered to be routed and we just
+ * change the destination MAC address so that the packet will
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
+ *
+ * Let's now consider the case that ip_route_input() fails:
+ *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
+ * thinks we're handling a locally generated packet and won't care
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct iphdr *iph = ip_hdr(skb);
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct rtable *rt;
+ int err;
+ int frag_max_size;
+
+ frag_max_size = IPCB(skb)->frag_max_size;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+ if (daddr_was_changed(skb, nf_bridge)) {
+ if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ /* If err equals -EHOSTUNREACH the error is due to a
+ * martian destination or due to the fact that
+ * forwarding is disabled. For most martian packets,
+ * ip_route_output_key() will fail. It won't fail for 2 types of
+ * martian destinations: loopback destinations and destination
+ * 0.0.0.0. In both cases the packet will be dropped because the
+ * destination is the loopback device and not the bridge. */
+ if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
+ goto free_skb;
+
+ rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+ RT_TOS(iph->tos), 0);
+ if (!IS_ERR(rt)) {
+ /* - Bridged-and-DNAT'ed traffic doesn't
+ * require ip_forwarding. */
+ if (rt->dst.dev == dev) {
+ skb_dst_set(skb, &rt->dst);
+ goto bridged_dnat;
+ }
+ ip_rt_put(rt);
+ }
+free_skb:
+ kfree_skb(skb);
+ return 0;
+ } else {
+ if (skb_dst(skb)->dev == dev) {
+bridged_dnat:
+ skb->dev = nf_bridge->physindev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ NF_HOOK_THRESH(NFPROTO_BRIDGE,
+ NF_BR_PRE_ROUTING,
+ sk, skb, skb->dev, NULL,
+ br_nf_pre_routing_finish_bridge,
+ 1);
+ return 0;
+ }
+ ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
+ skb->pkt_type = PACKET_HOST;
+ }
+ } else {
+ rt = bridge_parent_rtable(nf_bridge->physindev);
+ if (!rt) {
+ kfree_skb(skb);
+ return 0;
+ }
+ skb_dst_set_noref(skb, &rt->dst);
+ }
+
+ skb->dev = nf_bridge->physindev;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+ skb->dev, NULL,
+ br_handle_frame_finish, 1);
+
+ return 0;
+}
+
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct net_device *vlan, *br;
+
+ br = bridge_parent(dev);
+ if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+ return br;
+
+ vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
+ skb_vlan_tag_get(skb) & VLAN_VID_MASK);
+
+ return vlan ? vlan : br;
+}
+
+/* Some common code for IPv4/IPv6 */
+static struct net_device *setup_pre_routing(struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+ nf_bridge->physindev = skb->dev;
+ skb->dev = brnf_get_logical_dev(skb, skb->dev);
+
+ if (skb->protocol == htons(ETH_P_8021Q))
+ nf_bridge->orig_proto = BRNF_PROTO_8021Q;
+ else if (skb->protocol == htons(ETH_P_PPP_SES))
+ nf_bridge->orig_proto = BRNF_PROTO_PPPOE;
+
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
+ return skb->dev;
+}
+
+/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+ unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+ u32 pkt_len;
+ const unsigned char *nh = skb_network_header(skb);
+ int off = raw - nh;
+ int len = (raw[1] + 1) << 3;
+
+ if ((raw + len) - skb->data > skb_headlen(skb))
+ goto bad;
+
+ off += 2;
+ len -= 2;
+
+ while (len > 0) {
+ int optlen = nh[off + 1] + 2;
+
+ switch (nh[off]) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+
+ case IPV6_TLV_PADN:
+ break;
+
+ case IPV6_TLV_JUMBO:
+ if (nh[off + 1] != 4 || (off & 3) != 2)
+ goto bad;
+ pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+ if (pkt_len <= IPV6_MAXPLEN ||
+ ipv6_hdr(skb)->payload_len)
+ goto bad;
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+ goto bad;
+ if (pskb_trim_rcsum(skb,
+ pkt_len + sizeof(struct ipv6hdr)))
+ goto bad;
+ nh = skb_network_header(skb);
+ break;
+ default:
+ if (optlen > len)
+ goto bad;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+ if (len == 0)
+ return 0;
+bad:
+ return -1;
+
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (skb->len < sizeof(struct ipv6hdr))
+ return NF_DROP;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return NF_DROP;
+
+ hdr = ipv6_hdr(skb);
+
+ if (hdr->version != 6)
+ return NF_DROP;
+
+ pkt_len = ntohs(hdr->payload_len);
+
+ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return NF_DROP;
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ return NF_DROP;
+ }
+ if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+ return NF_DROP;
+
+ nf_bridge_put(skb->nf_bridge);
+ if (!nf_bridge_alloc(skb))
+ return NF_DROP;
+ if (!setup_pre_routing(skb))
+ return NF_DROP;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
+ skb->dev, NULL,
+ br_nf_pre_routing_finish_ipv6);
+
+ return NF_STOLEN;
+}
+
+/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
+ * Replicate the checks that IPv4 does on packet reception.
+ * Set skb->dev to the bridge device (i.e. parent of the
+ * receiving device) to make netfilter happy, the REDIRECT
+ * target in particular. Save the original destination IP
+ * address to be able to detect DNAT afterwards. */
+static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge;
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ __u32 len = nf_bridge_encap_header_len(skb);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return NF_DROP;
+
+ p = br_port_get_rcu(state->in);
+ if (p == NULL)
+ return NF_DROP;
+ br = p->br;
+
+ if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
+ if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+ return NF_ACCEPT;
+
+ nf_bridge_pull_encap_header_rcsum(skb);
+ return br_nf_pre_routing_ipv6(ops, skb, state);
+ }
+
+ if (!brnf_call_iptables && !br->nf_call_iptables)
+ return NF_ACCEPT;
+
+ if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+ return NF_ACCEPT;
+
+ nf_bridge_pull_encap_header_rcsum(skb);
+
+ if (br_parse_ip_options(skb))
+ return NF_DROP;
+
+ nf_bridge_put(skb->nf_bridge);
+ if (!nf_bridge_alloc(skb))
+ return NF_DROP;
+ if (!setup_pre_routing(skb))
+ return NF_DROP;
+
+ nf_bridge = nf_bridge_info_get(skb);
+ nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
+
+ skb->protocol = htons(ETH_P_IP);
+
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
+ skb->dev, NULL,
+ br_nf_pre_routing_finish);
+
+ return NF_STOLEN;
+}
+
+
+/* PF_BRIDGE/LOCAL_IN ************************************************/
+/* The packet is locally destined, which requires a real
+ * dst_entry, so detach the fake one. On the way up, the
+ * packet would pass through PRE_ROUTING again (which already
+ * took place when the packet entered the bridge), but we
+ * register an IPv4 PRE_ROUTING 'sabotage' hook that will
+ * prevent this from happening. */
+static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ br_drop_fake_rtable(skb);
+ return NF_ACCEPT;
+}
+
+/* PF_BRIDGE/FORWARD *************************************************/
+static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *in;
+
+ if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+ int frag_max_size;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ frag_max_size = IPCB(skb)->frag_max_size;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+ }
+
+ in = nf_bridge->physindev;
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+ nf_bridge_update_protocol(skb);
+ } else {
+ in = *((struct net_device **)(skb->cb));
+ }
+ nf_bridge_push_encap_header(skb);
+
+ NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb,
+ in, skb->dev, br_forward_finish, 1);
+ return 0;
+}
+
+
+/* This is the 'purely bridged' case. For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports. */
+static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge;
+ struct net_device *parent;
+ u_int8_t pf;
+
+ if (!skb->nf_bridge)
+ return NF_ACCEPT;
+
+ /* Need exclusive nf_bridge_info since we might have multiple
+ * different physoutdevs. */
+ if (!nf_bridge_unshare(skb))
+ return NF_DROP;
+
+ nf_bridge = nf_bridge_info_get(skb);
+ if (!nf_bridge)
+ return NF_DROP;
+
+ parent = bridge_parent(state->out);
+ if (!parent)
+ return NF_DROP;
+
+ if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ pf = NFPROTO_IPV4;
+ else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ pf = NFPROTO_IPV6;
+ else
+ return NF_ACCEPT;
+
+ nf_bridge_pull_encap_header(skb);
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ if (pf == NFPROTO_IPV4) {
+ int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size;
+
+ if (br_parse_ip_options(skb))
+ return NF_DROP;
+
+ IPCB(skb)->frag_max_size = frag_max;
+ }
+
+ nf_bridge->physoutdev = skb->dev;
+ if (pf == NFPROTO_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ skb->protocol = htons(ETH_P_IPV6);
+
+ NF_HOOK(pf, NF_INET_FORWARD, NULL, skb,
+ brnf_get_logical_dev(skb, state->in),
+ parent, br_nf_forward_finish);
+
+ return NF_STOLEN;
+}
+
+static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ struct net_device **d = (struct net_device **)(skb->cb);
+
+ p = br_port_get_rcu(state->out);
+ if (p == NULL)
+ return NF_ACCEPT;
+ br = p->br;
+
+ if (!brnf_call_arptables && !br->nf_call_arptables)
+ return NF_ACCEPT;
+
+ if (!IS_ARP(skb)) {
+ if (!IS_VLAN_ARP(skb))
+ return NF_ACCEPT;
+ nf_bridge_pull_encap_header(skb);
+ }
+
+ if (arp_hdr(skb)->ar_pln != 4) {
+ if (IS_VLAN_ARP(skb))
+ nf_bridge_push_encap_header(skb);
+ return NF_ACCEPT;
+ }
+ *d = state->in;
+ NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb,
+ state->in, state->out, br_nf_forward_finish);
+
+ return NF_STOLEN;
+}
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ struct brnf_frag_data *data;
+ int err;
+
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+ err = skb_cow_head(skb, data->size);
+
+ if (err) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
+ __skb_push(skb, data->encap_size);
+
+ return br_dev_queue_push_xmit(sk, skb);
+}
+
+static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ int frag_max_size;
+ unsigned int mtu_reserved;
+
+ if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP))
+ return br_dev_queue_push_xmit(sk, skb);
+
+ mtu_reserved = nf_bridge_mtu_reduction(skb);
+ /* This is wrong! We should preserve the original fragment
+ * boundaries by preserving frag_list rather than refragmenting.
+ */
+ if (skb->len + mtu_reserved > skb->dev->mtu) {
+ struct brnf_frag_data *data;
+
+ frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ if (br_parse_ip_options(skb))
+ /* Drop invalid packet */
+ return NF_DROP;
+ IPCB(skb)->frag_max_size = frag_max_size;
+
+ nf_bridge_update_protocol(skb);
+
+ data = this_cpu_ptr(&brnf_frag_data_storage);
+ data->encap_size = nf_bridge_encap_header_len(skb);
+ data->size = ETH_HLEN + data->encap_size;
+
+ skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
+ data->size);
+
+ ret = ip_fragment(sk, skb, br_nf_push_frag_xmit);
+ } else {
+ ret = br_dev_queue_push_xmit(sk, skb);
+ }
+
+ return ret;
+}
+#else
+static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ return br_dev_queue_push_xmit(sk, skb);
+}
+#endif
+
+/* PF_BRIDGE/POST_ROUTING ********************************************/
+static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *realoutdev = bridge_parent(skb->dev);
+ u_int8_t pf;
+
+ /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
+ * on a bridge, but was delivered locally and is now being routed:
+ *
+ * POST_ROUTING was already invoked from the ip stack.
+ */
+ if (!nf_bridge || !nf_bridge->physoutdev)
+ return NF_ACCEPT;
+
+ if (!realoutdev)
+ return NF_DROP;
+
+ if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ pf = NFPROTO_IPV4;
+ else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ pf = NFPROTO_IPV6;
+ else
+ return NF_ACCEPT;
+
+ /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
+ * about the value of skb->pkt_type. */
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ skb->pkt_type = PACKET_HOST;
+ nf_bridge->pkt_otherhost = true;
+ }
+
+ nf_bridge_pull_encap_header(skb);
+ if (pf == NFPROTO_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ skb->protocol = htons(ETH_P_IPV6);
+
+ NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb,
+ NULL, realoutdev,
+ br_nf_dev_queue_xmit);
+
+ return NF_STOLEN;
+}
+
+/* IP/SABOTAGE *****************************************************/
+/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
+ * for the second time. */
+static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (skb->nf_bridge &&
+ !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+ return NF_STOP;
+ }
+
+ return NF_ACCEPT;
+}
+
+/* This is called when br_netfilter has called into iptables/netfilter,
+ * and DNAT has taken place on a bridge-forwarded packet.
+ *
+ * neigh->output has created a new MAC header, with local br0 MAC
+ * as saddr.
+ *
+ * This restores the original MAC saddr of the bridged packet
+ * before invoking bridge forward logic to transmit the packet.
+ */
+static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
+{
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ skb_pull(skb, ETH_HLEN);
+ nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
+
+ BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
+
+ skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
+ nf_bridge->neigh_header,
+ ETH_HLEN - ETH_ALEN);
+ skb->dev = nf_bridge->physindev;
+ br_handle_frame_finish(NULL, skb);
+}
+
+static int br_nf_dev_xmit(struct sk_buff *skb)
+{
+ if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+ br_nf_pre_routing_finish_bridge_slow(skb);
+ return 1;
+ }
+ return 0;
+}
+
+static const struct nf_br_ops br_ops = {
+ .br_dev_xmit_hook = br_nf_dev_xmit,
+};
+
+void br_netfilter_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(br_netfilter_enable);
+
+/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
+static struct nf_hook_ops br_nf_ops[] __read_mostly = {
+ {
+ .hook = br_nf_pre_routing,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_BRNF,
+ },
+ {
+ .hook = br_nf_local_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_BR_PRI_BRNF,
+ },
+ {
+ .hook = br_nf_forward_ip,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_FORWARD,
+ .priority = NF_BR_PRI_BRNF - 1,
+ },
+ {
+ .hook = br_nf_forward_arp,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_FORWARD,
+ .priority = NF_BR_PRI_BRNF,
+ },
+ {
+ .hook = br_nf_post_routing,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_BR_PRI_LAST,
+ },
+ {
+ .hook = ip_sabotage_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_FIRST,
+ },
+ {
+ .hook = ip_sabotage_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_FIRST,
+ },
+};
+
+#ifdef CONFIG_SYSCTL
+static
+int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *(int *)(ctl->data))
+ *(int *)(ctl->data) = 1;
+ return ret;
+}
+
+static struct ctl_table brnf_table[] = {
+ {
+ .procname = "bridge-nf-call-arptables",
+ .data = &brnf_call_arptables,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-call-iptables",
+ .data = &brnf_call_iptables,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-call-ip6tables",
+ .data = &brnf_call_ip6tables,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-filter-vlan-tagged",
+ .data = &brnf_filter_vlan_tagged,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-filter-pppoe-tagged",
+ .data = &brnf_filter_pppoe_tagged,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ {
+ .procname = "bridge-nf-pass-vlan-input-dev",
+ .data = &brnf_pass_vlan_indev,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
+ { }
+};
+#endif
+
+static int __init br_netfilter_init(void)
+{
+ int ret;
+
+ ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ if (ret < 0)
+ return ret;
+
+#ifdef CONFIG_SYSCTL
+ brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
+ if (brnf_sysctl_header == NULL) {
+ printk(KERN_WARNING
+ "br_netfilter: can't register to sysctl.\n");
+ nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ return -ENOMEM;
+ }
+#endif
+ RCU_INIT_POINTER(nf_br_ops, &br_ops);
+ printk(KERN_NOTICE "Bridge firewalling registered\n");
+ return 0;
+}
+
+static void __exit br_netfilter_fini(void)
+{
+ RCU_INIT_POINTER(nf_br_ops, NULL);
+ nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(brnf_sysctl_header);
+#endif
+}
+
+module_init(br_netfilter_init);
+module_exit(br_netfilter_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
new file mode 100644
index 000000000..4b5c23699
--- /dev/null
+++ b/net/bridge/br_netlink.c
@@ -0,0 +1,888 @@
+/*
+ * Bridge netlink control interface
+ *
+ * Authors:
+ * Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/switchdev.h>
+#include <uapi/linux/if_bridge.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+static int br_get_num_vlan_infos(const struct net_port_vlans *pv,
+ u32 filter_mask)
+{
+ u16 vid_range_start = 0, vid_range_end = 0;
+ u16 vid_range_flags = 0;
+ u16 pvid, vid, flags;
+ int num_vlans = 0;
+
+ if (filter_mask & RTEXT_FILTER_BRVLAN)
+ return pv->num_vlans;
+
+ if (!(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
+ return 0;
+
+ /* Count number of vlan info's
+ */
+ pvid = br_get_pvid(pv);
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ flags = 0;
+ if (vid == pvid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (test_bit(vid, pv->untagged_bitmap))
+ flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+ if (vid_range_start == 0) {
+ goto initvars;
+ } else if ((vid - vid_range_end) == 1 &&
+ flags == vid_range_flags) {
+ vid_range_end = vid;
+ continue;
+ } else {
+ if ((vid_range_end - vid_range_start) > 0)
+ num_vlans += 2;
+ else
+ num_vlans += 1;
+ }
+initvars:
+ vid_range_start = vid;
+ vid_range_end = vid;
+ vid_range_flags = flags;
+ }
+
+ if (vid_range_start != 0) {
+ if ((vid_range_end - vid_range_start) > 0)
+ num_vlans += 2;
+ else
+ num_vlans += 1;
+ }
+
+ return num_vlans;
+}
+
+static size_t br_get_link_af_size_filtered(const struct net_device *dev,
+ u32 filter_mask)
+{
+ struct net_port_vlans *pv;
+ int num_vlan_infos;
+
+ rcu_read_lock();
+ if (br_port_exists(dev))
+ pv = nbp_get_vlan_info(br_port_get_rcu(dev));
+ else if (dev->priv_flags & IFF_EBRIDGE)
+ pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev));
+ else
+ pv = NULL;
+ if (pv)
+ num_vlan_infos = br_get_num_vlan_infos(pv, filter_mask);
+ else
+ num_vlan_infos = 0;
+ rcu_read_unlock();
+
+ if (!num_vlan_infos)
+ return 0;
+
+ /* Each VLAN is returned in bridge_vlan_info along with flags */
+ return num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+}
+
+static inline size_t br_port_info_size(void)
+{
+ return nla_total_size(1) /* IFLA_BRPORT_STATE */
+ + nla_total_size(2) /* IFLA_BRPORT_PRIORITY */
+ + nla_total_size(4) /* IFLA_BRPORT_COST */
+ + nla_total_size(1) /* IFLA_BRPORT_MODE */
+ + nla_total_size(1) /* IFLA_BRPORT_GUARD */
+ + nla_total_size(1) /* IFLA_BRPORT_PROTECT */
+ + nla_total_size(1) /* IFLA_BRPORT_FAST_LEAVE */
+ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */
+ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */
+ + 0;
+}
+
+static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
+ + nla_total_size(br_get_link_af_size_filtered(dev,
+ filter_mask)); /* IFLA_AF_SPEC */
+}
+
+static int br_port_fill_attrs(struct sk_buff *skb,
+ const struct net_bridge_port *p)
+{
+ u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+
+ if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
+ nla_put_u16(skb, IFLA_BRPORT_PRIORITY, p->priority) ||
+ nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) ||
+ nla_put_u8(skb, IFLA_BRPORT_MODE, mode) ||
+ nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) ||
+ nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) ||
+ nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) ||
+ nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
+ nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
+ !!(p->flags & BR_PROXYARP_WIFI)))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start,
+ u16 vid_end, u16 flags)
+{
+ struct bridge_vlan_info vinfo;
+
+ if ((vid_end - vid_start) > 0) {
+ /* add range to skb */
+ vinfo.vid = vid_start;
+ vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_BEGIN;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+
+ vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
+
+ vinfo.vid = vid_end;
+ vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ } else {
+ vinfo.vid = vid_start;
+ vinfo.flags = flags;
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int br_fill_ifvlaninfo_compressed(struct sk_buff *skb,
+ const struct net_port_vlans *pv)
+{
+ u16 vid_range_start = 0, vid_range_end = 0;
+ u16 vid_range_flags = 0;
+ u16 pvid, vid, flags;
+ int err = 0;
+
+ /* Pack IFLA_BRIDGE_VLAN_INFO's for every vlan
+ * and mark vlan info with begin and end flags
+ * if vlaninfo represents a range
+ */
+ pvid = br_get_pvid(pv);
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ flags = 0;
+ if (vid == pvid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (test_bit(vid, pv->untagged_bitmap))
+ flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+ if (vid_range_start == 0) {
+ goto initvars;
+ } else if ((vid - vid_range_end) == 1 &&
+ flags == vid_range_flags) {
+ vid_range_end = vid;
+ continue;
+ } else {
+ err = br_fill_ifvlaninfo_range(skb, vid_range_start,
+ vid_range_end,
+ vid_range_flags);
+ if (err)
+ return err;
+ }
+
+initvars:
+ vid_range_start = vid;
+ vid_range_end = vid;
+ vid_range_flags = flags;
+ }
+
+ if (vid_range_start != 0) {
+ /* Call it once more to send any left over vlans */
+ err = br_fill_ifvlaninfo_range(skb, vid_range_start,
+ vid_range_end,
+ vid_range_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_fill_ifvlaninfo(struct sk_buff *skb,
+ const struct net_port_vlans *pv)
+{
+ struct bridge_vlan_info vinfo;
+ u16 pvid, vid;
+
+ pvid = br_get_pvid(pv);
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ vinfo.vid = vid;
+ vinfo.flags = 0;
+ if (vid == pvid)
+ vinfo.flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (test_bit(vid, pv->untagged_bitmap))
+ vinfo.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+
+ if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO,
+ sizeof(vinfo), &vinfo))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/*
+ * Create one netlink message for one interface
+ * Contains port and master info as well as carrier and bridge state.
+ */
+static int br_fill_ifinfo(struct sk_buff *skb,
+ const struct net_bridge_port *port,
+ u32 pid, u32 seq, int event, unsigned int flags,
+ u32 filter_mask, const struct net_device *dev)
+{
+ const struct net_bridge *br;
+ struct ifinfomsg *hdr;
+ struct nlmsghdr *nlh;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+
+ if (port)
+ br = port->br;
+ else
+ br = netdev_priv(dev);
+
+ br_debug(br, "br_fill_info event %d port %s master %s\n",
+ event, dev->name, br->dev->name);
+
+ nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ hdr = nlmsg_data(nlh);
+ hdr->ifi_family = AF_BRIDGE;
+ hdr->__ifi_pad = 0;
+ hdr->ifi_type = dev->type;
+ hdr->ifi_index = dev->ifindex;
+ hdr->ifi_flags = dev_get_flags(dev);
+ hdr->ifi_change = 0;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+
+ if (event == RTM_NEWLINK && port) {
+ struct nlattr *nest
+ = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+
+ if (nest == NULL || br_port_fill_attrs(skb, port) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ }
+
+ /* Check if the VID information is requested */
+ if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
+ (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
+ const struct net_port_vlans *pv;
+ struct nlattr *af;
+ int err;
+
+ if (port)
+ pv = nbp_get_vlan_info(port);
+ else
+ pv = br_get_vlan_info(br);
+
+ if (!pv || bitmap_empty(pv->vlan_bitmap, VLAN_N_VID))
+ goto done;
+
+ af = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!af)
+ goto nla_put_failure;
+
+ if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
+ err = br_fill_ifvlaninfo_compressed(skb, pv);
+ else
+ err = br_fill_ifvlaninfo(skb, pv);
+ if (err)
+ goto nla_put_failure;
+ nla_nest_end(skb, af);
+ }
+
+done:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * Notify listeners of a change in port information
+ */
+void br_ifinfo_notify(int event, struct net_bridge_port *port)
+{
+ struct net *net;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
+
+ if (!port)
+ return;
+
+ net = dev_net(port->dev);
+ br_debug(port->br, "port %u(%s) event %d\n",
+ (unsigned int)port->port_no, port->dev->name, event);
+
+ skb = nlmsg_new(br_nlmsg_size(port->dev, filter), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, port->dev);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in br_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+}
+
+
+/*
+ * Dump information about all ports, in response to GETLINK
+ */
+int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u32 filter_mask, int nlflags)
+{
+ struct net_bridge_port *port = br_port_get_rtnl(dev);
+
+ if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) &&
+ !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
+ return 0;
+
+ return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
+ filter_mask, dev);
+}
+
+static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
+ int cmd, struct bridge_vlan_info *vinfo)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case RTM_SETLINK:
+ if (p) {
+ err = nbp_vlan_add(p, vinfo->vid, vinfo->flags);
+ if (err)
+ break;
+
+ if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER)
+ err = br_vlan_add(p->br, vinfo->vid,
+ vinfo->flags);
+ } else {
+ err = br_vlan_add(br, vinfo->vid, vinfo->flags);
+ }
+ break;
+
+ case RTM_DELLINK:
+ if (p) {
+ nbp_vlan_delete(p, vinfo->vid);
+ if (vinfo->flags & BRIDGE_VLAN_INFO_MASTER)
+ br_vlan_delete(p->br, vinfo->vid);
+ } else {
+ br_vlan_delete(br, vinfo->vid);
+ }
+ break;
+ }
+
+ return err;
+}
+
+static int br_afspec(struct net_bridge *br,
+ struct net_bridge_port *p,
+ struct nlattr *af_spec,
+ int cmd)
+{
+ struct bridge_vlan_info *vinfo_start = NULL;
+ struct bridge_vlan_info *vinfo = NULL;
+ struct nlattr *attr;
+ int err = 0;
+ int rem;
+
+ nla_for_each_nested(attr, af_spec, rem) {
+ if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO)
+ continue;
+ if (nla_len(attr) != sizeof(struct bridge_vlan_info))
+ return -EINVAL;
+ vinfo = nla_data(attr);
+ if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ if (vinfo_start)
+ return -EINVAL;
+ vinfo_start = vinfo;
+ continue;
+ }
+
+ if (vinfo_start) {
+ struct bridge_vlan_info tmp_vinfo;
+ int v;
+
+ if (!(vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END))
+ return -EINVAL;
+
+ if (vinfo->vid <= vinfo_start->vid)
+ return -EINVAL;
+
+ memcpy(&tmp_vinfo, vinfo_start,
+ sizeof(struct bridge_vlan_info));
+
+ for (v = vinfo_start->vid; v <= vinfo->vid; v++) {
+ tmp_vinfo.vid = v;
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo);
+ if (err)
+ break;
+ }
+ vinfo_start = NULL;
+ } else {
+ err = br_vlan_info(br, p, cmd, vinfo);
+ }
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
+ [IFLA_BRPORT_STATE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_COST] = { .type = NLA_U32 },
+ [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
+ [IFLA_BRPORT_MODE] = { .type = NLA_U8 },
+ [IFLA_BRPORT_GUARD] = { .type = NLA_U8 },
+ [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 },
+ [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 },
+ [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 },
+ [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
+};
+
+/* Change the state of the port and notify spanning tree */
+static int br_set_port_state(struct net_bridge_port *p, u8 state)
+{
+ if (state > BR_STATE_BLOCKING)
+ return -EINVAL;
+
+ /* if kernel STP is running, don't allow changes */
+ if (p->br->stp_enabled == BR_KERNEL_STP)
+ return -EBUSY;
+
+ /* if device is not up, change is not allowed
+ * if link is not present, only allowable state is disabled
+ */
+ if (!netif_running(p->dev) ||
+ (!netif_oper_up(p->dev) && state != BR_STATE_DISABLED))
+ return -ENETDOWN;
+
+ br_set_state(p, state);
+ br_log_state(p);
+ br_port_state_selection(p->br);
+ return 0;
+}
+
+/* Set/clear or port flags based on attribute */
+static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
+ int attrtype, unsigned long mask)
+{
+ if (tb[attrtype]) {
+ u8 flag = nla_get_u8(tb[attrtype]);
+ if (flag)
+ p->flags |= mask;
+ else
+ p->flags &= ~mask;
+ }
+}
+
+/* Process bridge protocol info on port */
+static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
+{
+ int err;
+ unsigned long old_flags = p->flags;
+
+ br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
+ br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
+ br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
+
+ if (tb[IFLA_BRPORT_COST]) {
+ err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_PRIORITY]) {
+ err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_STATE]) {
+ err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE]));
+ if (err)
+ return err;
+ }
+
+ br_port_flags_change(p, old_flags ^ p->flags);
+ return 0;
+}
+
+/* Change state and parameters on port. */
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
+{
+ struct nlattr *protinfo;
+ struct nlattr *afspec;
+ struct net_bridge_port *p;
+ struct nlattr *tb[IFLA_BRPORT_MAX + 1];
+ int err = 0, ret_offload = 0;
+
+ protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (!protinfo && !afspec)
+ return 0;
+
+ p = br_port_get_rtnl(dev);
+ /* We want to accept dev as bridge itself if the AF_SPEC
+ * is set to see if someone is setting vlan info on the bridge
+ */
+ if (!p && !afspec)
+ return -EINVAL;
+
+ if (p && protinfo) {
+ if (protinfo->nla_type & NLA_F_NESTED) {
+ err = nla_parse_nested(tb, IFLA_BRPORT_MAX,
+ protinfo, br_port_policy);
+ if (err)
+ return err;
+
+ spin_lock_bh(&p->br->lock);
+ err = br_setport(p, tb);
+ spin_unlock_bh(&p->br->lock);
+ } else {
+ /* Binary compatibility with old RSTP */
+ if (nla_len(protinfo) < sizeof(u8))
+ return -EINVAL;
+
+ spin_lock_bh(&p->br->lock);
+ err = br_set_port_state(p, nla_get_u8(protinfo));
+ spin_unlock_bh(&p->br->lock);
+ }
+ if (err)
+ goto out;
+ }
+
+ if (afspec) {
+ err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
+ afspec, RTM_SETLINK);
+ }
+
+ if (p && !(flags & BRIDGE_FLAGS_SELF)) {
+ /* set bridge attributes in hardware if supported
+ */
+ ret_offload = netdev_switch_port_bridge_setlink(dev, nlh,
+ flags);
+ if (ret_offload && ret_offload != -EOPNOTSUPP)
+ br_warn(p->br, "error setting attrs on port %u(%s)\n",
+ (unsigned int)p->port_no, p->dev->name);
+ }
+
+ if (err == 0)
+ br_ifinfo_notify(RTM_NEWLINK, p);
+out:
+ return err;
+}
+
+/* Delete port information */
+int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
+{
+ struct nlattr *afspec;
+ struct net_bridge_port *p;
+ int err = 0, ret_offload = 0;
+
+ afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (!afspec)
+ return 0;
+
+ p = br_port_get_rtnl(dev);
+ /* We want to accept dev as bridge itself as well */
+ if (!p && !(dev->priv_flags & IFF_EBRIDGE))
+ return -EINVAL;
+
+ err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
+ afspec, RTM_DELLINK);
+ if (err == 0)
+ /* Send RTM_NEWLINK because userspace
+ * expects RTM_NEWLINK for vlan dels
+ */
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ if (p && !(flags & BRIDGE_FLAGS_SELF)) {
+ /* del bridge attributes in hardware
+ */
+ ret_offload = netdev_switch_port_bridge_dellink(dev, nlh,
+ flags);
+ if (ret_offload && ret_offload != -EOPNOTSUPP)
+ br_warn(p->br, "error deleting attrs on port %u (%s)\n",
+ (unsigned int)p->port_no, p->dev->name);
+ }
+
+ return err;
+}
+static int br_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ return 0;
+}
+
+static int br_dev_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (tb[IFLA_ADDRESS]) {
+ spin_lock_bh(&br->lock);
+ br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
+ spin_unlock_bh(&br->lock);
+ }
+
+ return register_netdevice(dev);
+}
+
+static int br_port_slave_changelink(struct net_device *brdev,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ if (!data)
+ return 0;
+ return br_setport(br_port_get_rtnl(dev), data);
+}
+
+static int br_port_fill_slave_info(struct sk_buff *skb,
+ const struct net_device *brdev,
+ const struct net_device *dev)
+{
+ return br_port_fill_attrs(skb, br_port_get_rtnl(dev));
+}
+
+static size_t br_port_get_slave_size(const struct net_device *brdev,
+ const struct net_device *dev)
+{
+ return br_port_info_size();
+}
+
+static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
+ [IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 },
+ [IFLA_BR_HELLO_TIME] = { .type = NLA_U32 },
+ [IFLA_BR_MAX_AGE] = { .type = NLA_U32 },
+ [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 },
+ [IFLA_BR_STP_STATE] = { .type = NLA_U32 },
+ [IFLA_BR_PRIORITY] = { .type = NLA_U16 },
+};
+
+static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct net_bridge *br = netdev_priv(brdev);
+ int err;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_BR_FORWARD_DELAY]) {
+ err = br_set_forward_delay(br, nla_get_u32(data[IFLA_BR_FORWARD_DELAY]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_HELLO_TIME]) {
+ err = br_set_hello_time(br, nla_get_u32(data[IFLA_BR_HELLO_TIME]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_MAX_AGE]) {
+ err = br_set_max_age(br, nla_get_u32(data[IFLA_BR_MAX_AGE]));
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_AGEING_TIME]) {
+ u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]);
+
+ br->ageing_time = clock_t_to_jiffies(ageing_time);
+ }
+
+ if (data[IFLA_BR_STP_STATE]) {
+ u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]);
+
+ br_stp_set_enabled(br, stp_enabled);
+ }
+
+ if (data[IFLA_BR_PRIORITY]) {
+ u32 priority = nla_get_u16(data[IFLA_BR_PRIORITY]);
+
+ br_stp_set_bridge_priority(br, priority);
+ }
+
+ return 0;
+}
+
+static size_t br_get_size(const struct net_device *brdev)
+{
+ return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */
+ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */
+ 0;
+}
+
+static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
+{
+ struct net_bridge *br = netdev_priv(brdev);
+ u32 forward_delay = jiffies_to_clock_t(br->forward_delay);
+ u32 hello_time = jiffies_to_clock_t(br->hello_time);
+ u32 age_time = jiffies_to_clock_t(br->max_age);
+ u32 ageing_time = jiffies_to_clock_t(br->ageing_time);
+ u32 stp_enabled = br->stp_enabled;
+ u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
+
+ if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
+ nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
+ nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
+ nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) ||
+ nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) ||
+ nla_put_u16(skb, IFLA_BR_PRIORITY, priority))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static size_t br_get_link_af_size(const struct net_device *dev)
+{
+ struct net_port_vlans *pv;
+
+ if (br_port_exists(dev))
+ pv = nbp_get_vlan_info(br_port_get_rtnl(dev));
+ else if (dev->priv_flags & IFF_EBRIDGE)
+ pv = br_get_vlan_info((struct net_bridge *)netdev_priv(dev));
+ else
+ return 0;
+
+ if (!pv)
+ return 0;
+
+ /* Each VLAN is returned in bridge_vlan_info along with flags */
+ return pv->num_vlans * nla_total_size(sizeof(struct bridge_vlan_info));
+}
+
+static struct rtnl_af_ops br_af_ops __read_mostly = {
+ .family = AF_BRIDGE,
+ .get_link_af_size = br_get_link_af_size,
+};
+
+struct rtnl_link_ops br_link_ops __read_mostly = {
+ .kind = "bridge",
+ .priv_size = sizeof(struct net_bridge),
+ .setup = br_dev_setup,
+ .maxtype = IFLA_BRPORT_MAX,
+ .policy = br_policy,
+ .validate = br_validate,
+ .newlink = br_dev_newlink,
+ .changelink = br_changelink,
+ .dellink = br_dev_delete,
+ .get_size = br_get_size,
+ .fill_info = br_fill_info,
+
+ .slave_maxtype = IFLA_BRPORT_MAX,
+ .slave_policy = br_port_policy,
+ .slave_changelink = br_port_slave_changelink,
+ .get_slave_size = br_port_get_slave_size,
+ .fill_slave_info = br_port_fill_slave_info,
+};
+
+int __init br_netlink_init(void)
+{
+ int err;
+
+ br_mdb_init();
+ rtnl_af_register(&br_af_ops);
+
+ err = rtnl_link_register(&br_link_ops);
+ if (err)
+ goto out_af;
+
+ return 0;
+
+out_af:
+ rtnl_af_unregister(&br_af_ops);
+ br_mdb_uninit();
+ return err;
+}
+
+void br_netlink_fini(void)
+{
+ br_mdb_uninit();
+ rtnl_af_unregister(&br_af_ops);
+ rtnl_link_unregister(&br_link_ops);
+}
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
new file mode 100644
index 000000000..20cbb727d
--- /dev/null
+++ b/net/bridge/br_nf_core.c
@@ -0,0 +1,95 @@
+/*
+ * Handle firewalling core
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+#include <net/route.h>
+
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ return NULL;
+}
+
+static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return NULL;
+}
+
+static unsigned int fake_mtu(const struct dst_entry *dst)
+{
+ return dst->dev->mtu;
+}
+
+static struct dst_ops fake_dst_ops = {
+ .family = AF_INET,
+ .update_pmtu = fake_update_pmtu,
+ .redirect = fake_redirect,
+ .cow_metrics = fake_cow_metrics,
+ .neigh_lookup = fake_neigh_lookup,
+ .mtu = fake_mtu,
+};
+
+/*
+ * Initialize bogus route table used to keep netfilter happy.
+ * Currently, we fill in the PMTU entry because netfilter
+ * refragmentation needs it, and the rt_flags entry because
+ * ipt_REJECT needs it. Future netfilter modules might
+ * require us to fill additional fields.
+ */
+static const u32 br_dst_default_metrics[RTAX_MAX] = {
+ [RTAX_MTU - 1] = 1500,
+};
+
+void br_netfilter_rtable_init(struct net_bridge *br)
+{
+ struct rtable *rt = &br->fake_rtable;
+
+ atomic_set(&rt->dst.__refcnt, 1);
+ rt->dst.dev = br->dev;
+ rt->dst.path = &rt->dst;
+ dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+ rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
+ rt->dst.ops = &fake_dst_ops;
+}
+
+int __init br_nf_core_init(void)
+{
+ return dst_entries_init(&fake_dst_ops);
+}
+
+void br_nf_core_fini(void)
+{
+ dst_entries_destroy(&fake_dst_ops);
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
new file mode 100644
index 000000000..3362c2940
--- /dev/null
+++ b/net/bridge/br_private.h
@@ -0,0 +1,851 @@
+/*
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_H
+#define _BR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
+#include <linux/u64_stats_sync.h>
+#include <net/route.h>
+#include <linux/if_vlan.h>
+
+#define BR_HASH_BITS 8
+#define BR_HASH_SIZE (1 << BR_HASH_BITS)
+
+#define BR_HOLD_TIME (1*HZ)
+
+#define BR_PORT_BITS 10
+#define BR_MAX_PORTS (1<<BR_PORT_BITS)
+#define BR_VLAN_BITMAP_LEN BITS_TO_LONGS(VLAN_N_VID)
+
+#define BR_VERSION "2.3"
+
+/* Control of forwarding link local multicast */
+#define BR_GROUPFWD_DEFAULT 0
+/* Don't allow forwarding control protocols like STP and LLDP */
+#define BR_GROUPFWD_RESTRICTED 0x4007u
+/* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */
+#define BR_GROUPFWD_8021AD 0xB801u
+
+/* Path to usermode spanning tree program */
+#define BR_STP_PROG "/sbin/bridge-stp"
+
+typedef struct bridge_id bridge_id;
+typedef struct mac_addr mac_addr;
+typedef __u16 port_id;
+
+struct bridge_id
+{
+ unsigned char prio[2];
+ unsigned char addr[ETH_ALEN];
+};
+
+struct mac_addr
+{
+ unsigned char addr[ETH_ALEN];
+};
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+/* our own querier */
+struct bridge_mcast_own_query {
+ struct timer_list timer;
+ u32 startup_sent;
+};
+
+/* other querier */
+struct bridge_mcast_other_query {
+ struct timer_list timer;
+ unsigned long delay_time;
+};
+
+/* selected querier */
+struct bridge_mcast_querier {
+ struct br_ip addr;
+ struct net_bridge_port __rcu *port;
+};
+#endif
+
+struct net_port_vlans {
+ u16 port_idx;
+ u16 pvid;
+ union {
+ struct net_bridge_port *port;
+ struct net_bridge *br;
+ } parent;
+ struct rcu_head rcu;
+ unsigned long vlan_bitmap[BR_VLAN_BITMAP_LEN];
+ unsigned long untagged_bitmap[BR_VLAN_BITMAP_LEN];
+ u16 num_vlans;
+};
+
+struct net_bridge_fdb_entry
+{
+ struct hlist_node hlist;
+ struct net_bridge_port *dst;
+
+ struct rcu_head rcu;
+ unsigned long updated;
+ unsigned long used;
+ mac_addr addr;
+ unsigned char is_local:1,
+ is_static:1,
+ added_by_user:1,
+ added_by_external_learn:1;
+ __u16 vlan_id;
+};
+
+struct net_bridge_port_group {
+ struct net_bridge_port *port;
+ struct net_bridge_port_group __rcu *next;
+ struct hlist_node mglist;
+ struct rcu_head rcu;
+ struct timer_list timer;
+ struct br_ip addr;
+ unsigned char state;
+};
+
+struct net_bridge_mdb_entry
+{
+ struct hlist_node hlist[2];
+ struct net_bridge *br;
+ struct net_bridge_port_group __rcu *ports;
+ struct rcu_head rcu;
+ struct timer_list timer;
+ struct br_ip addr;
+ bool mglist;
+};
+
+struct net_bridge_mdb_htable
+{
+ struct hlist_head *mhash;
+ struct rcu_head rcu;
+ struct net_bridge_mdb_htable *old;
+ u32 size;
+ u32 max;
+ u32 secret;
+ u32 ver;
+};
+
+struct net_bridge_port
+{
+ struct net_bridge *br;
+ struct net_device *dev;
+ struct list_head list;
+
+ /* STP */
+ u8 priority;
+ u8 state;
+ u16 port_no;
+ unsigned char topology_change_ack;
+ unsigned char config_pending;
+ port_id port_id;
+ port_id designated_port;
+ bridge_id designated_root;
+ bridge_id designated_bridge;
+ u32 path_cost;
+ u32 designated_cost;
+ unsigned long designated_age;
+
+ struct timer_list forward_delay_timer;
+ struct timer_list hold_timer;
+ struct timer_list message_age_timer;
+ struct kobject kobj;
+ struct rcu_head rcu;
+
+ unsigned long flags;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct bridge_mcast_own_query ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_own_query ip6_own_query;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+ unsigned char multicast_router;
+ struct timer_list multicast_router_timer;
+ struct hlist_head mglist;
+ struct hlist_node rlist;
+#endif
+
+#ifdef CONFIG_SYSFS
+ char sysfs_name[IFNAMSIZ];
+#endif
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *np;
+#endif
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ struct net_port_vlans __rcu *vlan_info;
+#endif
+};
+
+#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
+#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
+
+#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
+
+static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
+{
+ return rcu_dereference(dev->rx_handler_data);
+}
+
+static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev)
+{
+ return br_port_exists(dev) ?
+ rtnl_dereference(dev->rx_handler_data) : NULL;
+}
+
+struct net_bridge
+{
+ spinlock_t lock;
+ struct list_head port_list;
+ struct net_device *dev;
+
+ struct pcpu_sw_netstats __percpu *stats;
+ spinlock_t hash_lock;
+ struct hlist_head hash[BR_HASH_SIZE];
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ struct rtable fake_rtable;
+ bool nf_call_iptables;
+ bool nf_call_ip6tables;
+ bool nf_call_arptables;
+#endif
+ u16 group_fwd_mask;
+ u16 group_fwd_mask_required;
+
+ /* STP */
+ bridge_id designated_root;
+ bridge_id bridge_id;
+ u32 root_path_cost;
+ unsigned long max_age;
+ unsigned long hello_time;
+ unsigned long forward_delay;
+ unsigned long bridge_max_age;
+ unsigned long ageing_time;
+ unsigned long bridge_hello_time;
+ unsigned long bridge_forward_delay;
+
+ u8 group_addr[ETH_ALEN];
+ bool group_addr_set;
+ u16 root_port;
+
+ enum {
+ BR_NO_STP, /* no spanning tree */
+ BR_KERNEL_STP, /* old STP in kernel */
+ BR_USER_STP, /* new RSTP in userspace */
+ } stp_enabled;
+
+ unsigned char topology_change;
+ unsigned char topology_change_detected;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ unsigned char multicast_router;
+
+ u8 multicast_disabled:1;
+ u8 multicast_querier:1;
+ u8 multicast_query_use_ifaddr:1;
+
+ u32 hash_elasticity;
+ u32 hash_max;
+
+ u32 multicast_last_member_count;
+ u32 multicast_startup_query_count;
+
+ unsigned long multicast_last_member_interval;
+ unsigned long multicast_membership_interval;
+ unsigned long multicast_querier_interval;
+ unsigned long multicast_query_interval;
+ unsigned long multicast_query_response_interval;
+ unsigned long multicast_startup_query_interval;
+
+ spinlock_t multicast_lock;
+ struct net_bridge_mdb_htable __rcu *mdb;
+ struct hlist_head router_list;
+
+ struct timer_list multicast_router_timer;
+ struct bridge_mcast_other_query ip4_other_query;
+ struct bridge_mcast_own_query ip4_own_query;
+ struct bridge_mcast_querier ip4_querier;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_other_query ip6_other_query;
+ struct bridge_mcast_own_query ip6_own_query;
+ struct bridge_mcast_querier ip6_querier;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif
+
+ struct timer_list hello_timer;
+ struct timer_list tcn_timer;
+ struct timer_list topology_change_timer;
+ struct timer_list gc_timer;
+ struct kobject *ifobj;
+ u32 auto_cnt;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ u8 vlan_enabled;
+ __be16 vlan_proto;
+ u16 default_pvid;
+ struct net_port_vlans __rcu *vlan_info;
+#endif
+};
+
+struct br_input_skb_cb {
+ struct net_device *brdev;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ int igmp;
+ int mrouters_only;
+#endif
+
+ u16 frag_max_size;
+ bool proxyarp_replied;
+
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ bool vlan_filtered;
+#endif
+};
+
+#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only)
+#else
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0)
+#endif
+
+#define br_printk(level, br, format, args...) \
+ printk(level "%s: " format, (br)->dev->name, ##args)
+
+#define br_err(__br, format, args...) \
+ br_printk(KERN_ERR, __br, format, ##args)
+#define br_warn(__br, format, args...) \
+ br_printk(KERN_WARNING, __br, format, ##args)
+#define br_notice(__br, format, args...) \
+ br_printk(KERN_NOTICE, __br, format, ##args)
+#define br_info(__br, format, args...) \
+ br_printk(KERN_INFO, __br, format, ##args)
+
+#define br_debug(br, format, args...) \
+ pr_debug("%s: " format, (br)->dev->name, ##args)
+
+/* called under bridge lock */
+static inline int br_is_root_bridge(const struct net_bridge *br)
+{
+ return !memcmp(&br->bridge_id, &br->designated_root, 8);
+}
+
+/* br_device.c */
+void br_dev_setup(struct net_device *dev);
+void br_dev_delete(struct net_device *dev, struct list_head *list);
+netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ struct netpoll *np = p->np;
+
+ if (np)
+ netpoll_send_skb(np, skb);
+}
+
+int br_netpoll_enable(struct net_bridge_port *p);
+void br_netpoll_disable(struct net_bridge_port *p);
+#else
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline int br_netpoll_enable(struct net_bridge_port *p)
+{
+ return 0;
+}
+
+static inline void br_netpoll_disable(struct net_bridge_port *p)
+{
+}
+#endif
+
+/* br_fdb.c */
+int br_fdb_init(void);
+void br_fdb_fini(void);
+void br_fdb_flush(struct net_bridge *br);
+void br_fdb_find_delete_local(struct net_bridge *br,
+ const struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid);
+void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
+void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
+void br_fdb_cleanup(unsigned long arg);
+void br_fdb_delete_by_port(struct net_bridge *br,
+ const struct net_bridge_port *p, int do_all);
+struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
+ const unsigned char *addr, __u16 vid);
+int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
+int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
+ unsigned long off);
+int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid);
+void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid, bool added_by_user);
+
+int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev, const unsigned char *addr, u16 vid);
+int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
+ const unsigned char *addr, u16 vid, u16 nlh_flags);
+int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, struct net_device *fdev, int idx);
+int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
+void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
+int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid);
+int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
+ const unsigned char *addr, u16 vid);
+
+/* br_forward.c */
+void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb);
+int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb);
+void br_forward(const struct net_bridge_port *to,
+ struct sk_buff *skb, struct sk_buff *skb0);
+int br_forward_finish(struct sock *sk, struct sk_buff *skb);
+void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast);
+void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
+ struct sk_buff *skb2, bool unicast);
+
+/* br_if.c */
+void br_port_carrier_check(struct net_bridge_port *p);
+int br_add_bridge(struct net *net, const char *name);
+int br_del_bridge(struct net *net, const char *name);
+int br_add_if(struct net_bridge *br, struct net_device *dev);
+int br_del_if(struct net_bridge *br, struct net_device *dev);
+int br_min_mtu(const struct net_bridge *br);
+netdev_features_t br_features_recompute(struct net_bridge *br,
+ netdev_features_t features);
+void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
+void br_manage_promisc(struct net_bridge *br);
+
+/* br_input.c */
+int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb);
+rx_handler_result_t br_handle_frame(struct sk_buff **pskb);
+
+static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
+{
+ return rcu_dereference(dev->rx_handler) == br_handle_frame;
+}
+
+static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
+{
+ return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
+}
+
+/* br_ioctl.c */
+int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
+ void __user *arg);
+
+/* br_multicast.c */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+extern unsigned int br_mdb_rehash_seq;
+int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
+ struct sk_buff *skb, u16 vid);
+struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+ struct sk_buff *skb, u16 vid);
+void br_multicast_add_port(struct net_bridge_port *port);
+void br_multicast_del_port(struct net_bridge_port *port);
+void br_multicast_enable_port(struct net_bridge_port *port);
+void br_multicast_disable_port(struct net_bridge_port *port);
+void br_multicast_init(struct net_bridge *br);
+void br_multicast_open(struct net_bridge *br);
+void br_multicast_stop(struct net_bridge *br);
+void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb);
+void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb, struct sk_buff *skb2);
+int br_multicast_set_router(struct net_bridge *br, unsigned long val);
+int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
+int br_multicast_toggle(struct net_bridge *br, unsigned long val);
+int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
+int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+struct net_bridge_mdb_entry *
+br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
+struct net_bridge_mdb_entry *
+br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port,
+ struct br_ip *group);
+void br_multicast_free_pg(struct rcu_head *head);
+struct net_bridge_port_group *
+br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
+ struct net_bridge_port_group __rcu *next,
+ unsigned char state);
+void br_mdb_init(void);
+void br_mdb_uninit(void);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+ struct br_ip *group, int type);
+
+#define mlock_dereference(X, br) \
+ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
+
+static inline bool br_multicast_is_router(struct net_bridge *br)
+{
+ return br->multicast_router == 2 ||
+ (br->multicast_router == 1 &&
+ timer_pending(&br->multicast_router_timer));
+}
+
+static inline bool
+__br_multicast_querier_exists(struct net_bridge *br,
+ struct bridge_mcast_other_query *querier)
+{
+ return time_is_before_jiffies(querier->delay_time) &&
+ (br->multicast_querier || timer_pending(&querier->timer));
+}
+
+static inline bool br_multicast_querier_exists(struct net_bridge *br,
+ struct ethhdr *eth)
+{
+ switch (eth->h_proto) {
+ case (htons(ETH_P_IP)):
+ return __br_multicast_querier_exists(br, &br->ip4_other_query);
+#if IS_ENABLED(CONFIG_IPV6)
+ case (htons(ETH_P_IPV6)):
+ return __br_multicast_querier_exists(br, &br->ip6_other_query);
+#endif
+ default:
+ return false;
+ }
+}
+#else
+static inline int br_multicast_rcv(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
+{
+ return 0;
+}
+
+static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+ struct sk_buff *skb, u16 vid)
+{
+ return NULL;
+}
+
+static inline void br_multicast_add_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_del_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_enable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_disable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_init(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_open(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_stop(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb)
+{
+}
+
+static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+ struct sk_buff *skb,
+ struct sk_buff *skb2)
+{
+}
+static inline bool br_multicast_is_router(struct net_bridge *br)
+{
+ return 0;
+}
+static inline bool br_multicast_querier_exists(struct net_bridge *br,
+ struct ethhdr *eth)
+{
+ return false;
+}
+static inline void br_mdb_init(void)
+{
+}
+static inline void br_mdb_uninit(void)
+{
+}
+#endif
+
+/* br_vlan.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
+ struct sk_buff *skb, u16 *vid);
+bool br_allowed_egress(struct net_bridge *br, const struct net_port_vlans *v,
+ const struct sk_buff *skb);
+bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
+struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_port_vlans *v,
+ struct sk_buff *skb);
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags);
+int br_vlan_delete(struct net_bridge *br, u16 vid);
+void br_vlan_flush(struct net_bridge *br);
+bool br_vlan_find(struct net_bridge *br, u16 vid);
+void br_recalculate_fwd_mask(struct net_bridge *br);
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
+int br_vlan_init(struct net_bridge *br);
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
+int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags);
+int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
+void nbp_vlan_flush(struct net_bridge_port *port);
+bool nbp_vlan_find(struct net_bridge_port *port, u16 vid);
+int nbp_vlan_init(struct net_bridge_port *port);
+
+static inline struct net_port_vlans *br_get_vlan_info(
+ const struct net_bridge *br)
+{
+ return rcu_dereference_rtnl(br->vlan_info);
+}
+
+static inline struct net_port_vlans *nbp_get_vlan_info(
+ const struct net_bridge_port *p)
+{
+ return rcu_dereference_rtnl(p->vlan_info);
+}
+
+/* Since bridge now depends on 8021Q module, but the time bridge sees the
+ * skb, the vlan tag will always be present if the frame was tagged.
+ */
+static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
+{
+ int err = 0;
+
+ if (skb_vlan_tag_present(skb))
+ *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
+ else {
+ *vid = 0;
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static inline u16 br_get_pvid(const struct net_port_vlans *v)
+{
+ if (!v)
+ return 0;
+
+ smp_rmb();
+ return v->pvid;
+}
+
+static inline int br_vlan_enabled(struct net_bridge *br)
+{
+ return br->vlan_enabled;
+}
+#else
+static inline bool br_allowed_ingress(struct net_bridge *br,
+ struct net_port_vlans *v,
+ struct sk_buff *skb,
+ u16 *vid)
+{
+ return true;
+}
+
+static inline bool br_allowed_egress(struct net_bridge *br,
+ const struct net_port_vlans *v,
+ const struct sk_buff *skb)
+{
+ return true;
+}
+
+static inline bool br_should_learn(struct net_bridge_port *p,
+ struct sk_buff *skb, u16 *vid)
+{
+ return true;
+}
+
+static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_port_vlans *v,
+ struct sk_buff *skb)
+{
+ return skb;
+}
+
+static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_vlan_delete(struct net_bridge *br, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void br_vlan_flush(struct net_bridge *br)
+{
+}
+
+static inline bool br_vlan_find(struct net_bridge *br, u16 vid)
+{
+ return false;
+}
+
+static inline void br_recalculate_fwd_mask(struct net_bridge *br)
+{
+}
+
+static inline int br_vlan_init(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void nbp_vlan_flush(struct net_bridge_port *port)
+{
+}
+
+static inline struct net_port_vlans *br_get_vlan_info(
+ const struct net_bridge *br)
+{
+ return NULL;
+}
+static inline struct net_port_vlans *nbp_get_vlan_info(
+ const struct net_bridge_port *p)
+{
+ return NULL;
+}
+
+static inline bool nbp_vlan_find(struct net_bridge_port *port, u16 vid)
+{
+ return false;
+}
+
+static inline int nbp_vlan_init(struct net_bridge_port *port)
+{
+ return 0;
+}
+
+static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag)
+{
+ return 0;
+}
+static inline u16 br_get_pvid(const struct net_port_vlans *v)
+{
+ return 0;
+}
+
+static inline int br_vlan_enabled(struct net_bridge *br)
+{
+ return 0;
+}
+#endif
+
+struct nf_br_ops {
+ int (*br_dev_xmit_hook)(struct sk_buff *skb);
+};
+extern const struct nf_br_ops __rcu *nf_br_ops;
+
+/* br_netfilter.c */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+int br_nf_core_init(void);
+void br_nf_core_fini(void);
+void br_netfilter_rtable_init(struct net_bridge *);
+#else
+static inline int br_nf_core_init(void) { return 0; }
+static inline void br_nf_core_fini(void) {}
+#define br_netfilter_rtable_init(x)
+#endif
+
+/* br_stp.c */
+void br_log_state(const struct net_bridge_port *p);
+void br_set_state(struct net_bridge_port *p, unsigned int state);
+struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no);
+void br_init_port(struct net_bridge_port *p);
+void br_become_designated_port(struct net_bridge_port *p);
+
+void __br_set_forward_delay(struct net_bridge *br, unsigned long t);
+int br_set_forward_delay(struct net_bridge *br, unsigned long x);
+int br_set_hello_time(struct net_bridge *br, unsigned long x);
+int br_set_max_age(struct net_bridge *br, unsigned long x);
+
+
+/* br_stp_if.c */
+void br_stp_enable_bridge(struct net_bridge *br);
+void br_stp_disable_bridge(struct net_bridge *br);
+void br_stp_set_enabled(struct net_bridge *br, unsigned long val);
+void br_stp_enable_port(struct net_bridge_port *p);
+void br_stp_disable_port(struct net_bridge_port *p);
+bool br_stp_recalculate_bridge_id(struct net_bridge *br);
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
+void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio);
+int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio);
+int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost);
+ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id);
+
+/* br_stp_bpdu.c */
+struct stp_proto;
+void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev);
+
+/* br_stp_timer.c */
+void br_stp_timer_init(struct net_bridge *br);
+void br_stp_port_timer_init(struct net_bridge_port *p);
+unsigned long br_timer_value(const struct timer_list *timer);
+
+/* br.c */
+#if IS_ENABLED(CONFIG_ATM_LANE)
+extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr);
+#endif
+
+/* br_netlink.c */
+extern struct rtnl_link_ops br_link_ops;
+int br_netlink_init(void);
+void br_netlink_fini(void);
+void br_ifinfo_notify(int event, struct net_bridge_port *port);
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
+int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
+int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
+ u32 filter_mask, int nlflags);
+
+#ifdef CONFIG_SYSFS
+/* br_sysfs_if.c */
+extern const struct sysfs_ops brport_sysfs_ops;
+int br_sysfs_addif(struct net_bridge_port *p);
+int br_sysfs_renameif(struct net_bridge_port *p);
+
+/* br_sysfs_br.c */
+int br_sysfs_addbr(struct net_device *dev);
+void br_sysfs_delbr(struct net_device *dev);
+
+#else
+
+static inline int br_sysfs_addif(struct net_bridge_port *p) { return 0; }
+static inline int br_sysfs_renameif(struct net_bridge_port *p) { return 0; }
+static inline int br_sysfs_addbr(struct net_device *dev) { return 0; }
+static inline void br_sysfs_delbr(struct net_device *dev) { return; }
+#endif /* CONFIG_SYSFS */
+
+#endif
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
new file mode 100644
index 000000000..2fe910c4e
--- /dev/null
+++ b/net/bridge/br_private_stp.h
@@ -0,0 +1,69 @@
+/*
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_STP_H
+#define _BR_PRIVATE_STP_H
+
+#define BPDU_TYPE_CONFIG 0
+#define BPDU_TYPE_TCN 0x80
+
+/* IEEE 802.1D-1998 timer values */
+#define BR_MIN_HELLO_TIME (1*HZ)
+#define BR_MAX_HELLO_TIME (10*HZ)
+
+#define BR_MIN_FORWARD_DELAY (2*HZ)
+#define BR_MAX_FORWARD_DELAY (30*HZ)
+
+#define BR_MIN_MAX_AGE (6*HZ)
+#define BR_MAX_MAX_AGE (40*HZ)
+
+#define BR_MIN_PATH_COST 1
+#define BR_MAX_PATH_COST 65535
+
+struct br_config_bpdu {
+ unsigned int topology_change:1;
+ unsigned int topology_change_ack:1;
+ bridge_id root;
+ int root_path_cost;
+ bridge_id bridge_id;
+ port_id port_id;
+ int message_age;
+ int max_age;
+ int hello_time;
+ int forward_delay;
+};
+
+/* called under bridge lock */
+static inline int br_is_designated_port(const struct net_bridge_port *p)
+{
+ return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) &&
+ (p->designated_port == p->port_id);
+}
+
+
+/* br_stp.c */
+void br_become_root_bridge(struct net_bridge *br);
+void br_config_bpdu_generation(struct net_bridge *);
+void br_configuration_update(struct net_bridge *);
+void br_port_state_selection(struct net_bridge *);
+void br_received_config_bpdu(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu);
+void br_received_tcn_bpdu(struct net_bridge_port *p);
+void br_transmit_config(struct net_bridge_port *p);
+void br_transmit_tcn(struct net_bridge *br);
+void br_topology_change_detection(struct net_bridge *br);
+
+/* br_stp_bpdu.c */
+void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
+void br_send_tcn_bpdu(struct net_bridge_port *);
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
new file mode 100644
index 000000000..fb3ebe615
--- /dev/null
+++ b/net/bridge/br_stp.c
@@ -0,0 +1,582 @@
+/*
+ * Spanning tree protocol; generic parts
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+/* since time values in bpdu are in jiffies and then scaled (1/256)
+ * before sending, make sure that is at least one STP tick.
+ */
+#define MESSAGE_AGE_INCR ((HZ / 256) + 1)
+
+static const char *const br_port_state_names[] = {
+ [BR_STATE_DISABLED] = "disabled",
+ [BR_STATE_LISTENING] = "listening",
+ [BR_STATE_LEARNING] = "learning",
+ [BR_STATE_FORWARDING] = "forwarding",
+ [BR_STATE_BLOCKING] = "blocking",
+};
+
+void br_log_state(const struct net_bridge_port *p)
+{
+ br_info(p->br, "port %u(%s) entered %s state\n",
+ (unsigned int) p->port_no, p->dev->name,
+ br_port_state_names[p->state]);
+}
+
+void br_set_state(struct net_bridge_port *p, unsigned int state)
+{
+ int err;
+
+ p->state = state;
+ err = netdev_switch_port_stp_update(p->dev, state);
+ if (err && err != -EOPNOTSUPP)
+ br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
+ (unsigned int) p->port_no, p->dev->name);
+}
+
+/* called under bridge lock */
+struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry_rcu(p, &br->port_list, list) {
+ if (p->port_no == port_no)
+ return p;
+ }
+
+ return NULL;
+}
+
+/* called under bridge lock */
+static int br_should_become_root_port(const struct net_bridge_port *p,
+ u16 root_port)
+{
+ struct net_bridge *br;
+ struct net_bridge_port *rp;
+ int t;
+
+ br = p->br;
+ if (p->state == BR_STATE_DISABLED ||
+ br_is_designated_port(p))
+ return 0;
+
+ if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0)
+ return 0;
+
+ if (!root_port)
+ return 1;
+
+ rp = br_get_port(br, root_port);
+
+ t = memcmp(&p->designated_root, &rp->designated_root, 8);
+ if (t < 0)
+ return 1;
+ else if (t > 0)
+ return 0;
+
+ if (p->designated_cost + p->path_cost <
+ rp->designated_cost + rp->path_cost)
+ return 1;
+ else if (p->designated_cost + p->path_cost >
+ rp->designated_cost + rp->path_cost)
+ return 0;
+
+ t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8);
+ if (t < 0)
+ return 1;
+ else if (t > 0)
+ return 0;
+
+ if (p->designated_port < rp->designated_port)
+ return 1;
+ else if (p->designated_port > rp->designated_port)
+ return 0;
+
+ if (p->port_id < rp->port_id)
+ return 1;
+
+ return 0;
+}
+
+static void br_root_port_block(const struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+
+ br_notice(br, "port %u(%s) tried to become root port (blocked)",
+ (unsigned int) p->port_no, p->dev->name);
+
+ br_set_state(p, BR_STATE_LISTENING);
+ br_log_state(p);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ if (br->forward_delay > 0)
+ mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
+}
+
+/* called under bridge lock */
+static void br_root_selection(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ u16 root_port = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (!br_should_become_root_port(p, root_port))
+ continue;
+
+ if (p->flags & BR_ROOT_BLOCK)
+ br_root_port_block(br, p);
+ else
+ root_port = p->port_no;
+ }
+
+ br->root_port = root_port;
+
+ if (!root_port) {
+ br->designated_root = br->bridge_id;
+ br->root_path_cost = 0;
+ } else {
+ p = br_get_port(br, root_port);
+ br->designated_root = p->designated_root;
+ br->root_path_cost = p->designated_cost + p->path_cost;
+ }
+}
+
+/* called under bridge lock */
+void br_become_root_bridge(struct net_bridge *br)
+{
+ br->max_age = br->bridge_max_age;
+ br->hello_time = br->bridge_hello_time;
+ br->forward_delay = br->bridge_forward_delay;
+ br_topology_change_detection(br);
+ del_timer(&br->tcn_timer);
+
+ if (br->dev->flags & IFF_UP) {
+ br_config_bpdu_generation(br);
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ }
+}
+
+/* called under bridge lock */
+void br_transmit_config(struct net_bridge_port *p)
+{
+ struct br_config_bpdu bpdu;
+ struct net_bridge *br;
+
+ if (timer_pending(&p->hold_timer)) {
+ p->config_pending = 1;
+ return;
+ }
+
+ br = p->br;
+
+ bpdu.topology_change = br->topology_change;
+ bpdu.topology_change_ack = p->topology_change_ack;
+ bpdu.root = br->designated_root;
+ bpdu.root_path_cost = br->root_path_cost;
+ bpdu.bridge_id = br->bridge_id;
+ bpdu.port_id = p->port_id;
+ if (br_is_root_bridge(br))
+ bpdu.message_age = 0;
+ else {
+ struct net_bridge_port *root
+ = br_get_port(br, br->root_port);
+ bpdu.message_age = (jiffies - root->designated_age)
+ + MESSAGE_AGE_INCR;
+ }
+ bpdu.max_age = br->max_age;
+ bpdu.hello_time = br->hello_time;
+ bpdu.forward_delay = br->forward_delay;
+
+ if (bpdu.message_age < br->max_age) {
+ br_send_config_bpdu(p, &bpdu);
+ p->topology_change_ack = 0;
+ p->config_pending = 0;
+ mod_timer(&p->hold_timer,
+ round_jiffies(jiffies + BR_HOLD_TIME));
+ }
+}
+
+/* called under bridge lock */
+static void br_record_config_information(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
+{
+ p->designated_root = bpdu->root;
+ p->designated_cost = bpdu->root_path_cost;
+ p->designated_bridge = bpdu->bridge_id;
+ p->designated_port = bpdu->port_id;
+ p->designated_age = jiffies - bpdu->message_age;
+
+ mod_timer(&p->message_age_timer, jiffies
+ + (bpdu->max_age - bpdu->message_age));
+}
+
+/* called under bridge lock */
+static void br_record_config_timeout_values(struct net_bridge *br,
+ const struct br_config_bpdu *bpdu)
+{
+ br->max_age = bpdu->max_age;
+ br->hello_time = bpdu->hello_time;
+ br->forward_delay = bpdu->forward_delay;
+ br->topology_change = bpdu->topology_change;
+}
+
+/* called under bridge lock */
+void br_transmit_tcn(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ p = br_get_port(br, br->root_port);
+ if (p)
+ br_send_tcn_bpdu(p);
+ else
+ br_notice(br, "root port %u not found for topology notice\n",
+ br->root_port);
+}
+
+/* called under bridge lock */
+static int br_should_become_designated_port(const struct net_bridge_port *p)
+{
+ struct net_bridge *br;
+ int t;
+
+ br = p->br;
+ if (br_is_designated_port(p))
+ return 1;
+
+ if (memcmp(&p->designated_root, &br->designated_root, 8))
+ return 1;
+
+ if (br->root_path_cost < p->designated_cost)
+ return 1;
+ else if (br->root_path_cost > p->designated_cost)
+ return 0;
+
+ t = memcmp(&br->bridge_id, &p->designated_bridge, 8);
+ if (t < 0)
+ return 1;
+ else if (t > 0)
+ return 0;
+
+ if (p->port_id < p->designated_port)
+ return 1;
+
+ return 0;
+}
+
+/* called under bridge lock */
+static void br_designated_port_selection(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state != BR_STATE_DISABLED &&
+ br_should_become_designated_port(p))
+ br_become_designated_port(p);
+
+ }
+}
+
+/* called under bridge lock */
+static int br_supersedes_port_info(const struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
+{
+ int t;
+
+ t = memcmp(&bpdu->root, &p->designated_root, 8);
+ if (t < 0)
+ return 1;
+ else if (t > 0)
+ return 0;
+
+ if (bpdu->root_path_cost < p->designated_cost)
+ return 1;
+ else if (bpdu->root_path_cost > p->designated_cost)
+ return 0;
+
+ t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8);
+ if (t < 0)
+ return 1;
+ else if (t > 0)
+ return 0;
+
+ if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8))
+ return 1;
+
+ if (bpdu->port_id <= p->designated_port)
+ return 1;
+
+ return 0;
+}
+
+/* called under bridge lock */
+static void br_topology_change_acknowledged(struct net_bridge *br)
+{
+ br->topology_change_detected = 0;
+ del_timer(&br->tcn_timer);
+}
+
+/* called under bridge lock */
+void br_topology_change_detection(struct net_bridge *br)
+{
+ int isroot = br_is_root_bridge(br);
+
+ if (br->stp_enabled != BR_KERNEL_STP)
+ return;
+
+ br_info(br, "topology change detected, %s\n",
+ isroot ? "propagating" : "sending tcn bpdu");
+
+ if (isroot) {
+ br->topology_change = 1;
+ mod_timer(&br->topology_change_timer, jiffies
+ + br->bridge_forward_delay + br->bridge_max_age);
+ } else if (!br->topology_change_detected) {
+ br_transmit_tcn(br);
+ mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time);
+ }
+
+ br->topology_change_detected = 1;
+}
+
+/* called under bridge lock */
+void br_config_bpdu_generation(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state != BR_STATE_DISABLED &&
+ br_is_designated_port(p))
+ br_transmit_config(p);
+ }
+}
+
+/* called under bridge lock */
+static void br_reply(struct net_bridge_port *p)
+{
+ br_transmit_config(p);
+}
+
+/* called under bridge lock */
+void br_configuration_update(struct net_bridge *br)
+{
+ br_root_selection(br);
+ br_designated_port_selection(br);
+}
+
+/* called under bridge lock */
+void br_become_designated_port(struct net_bridge_port *p)
+{
+ struct net_bridge *br;
+
+ br = p->br;
+ p->designated_root = br->designated_root;
+ p->designated_cost = br->root_path_cost;
+ p->designated_bridge = br->bridge_id;
+ p->designated_port = p->port_id;
+}
+
+
+/* called under bridge lock */
+static void br_make_blocking(struct net_bridge_port *p)
+{
+ if (p->state != BR_STATE_DISABLED &&
+ p->state != BR_STATE_BLOCKING) {
+ if (p->state == BR_STATE_FORWARDING ||
+ p->state == BR_STATE_LEARNING)
+ br_topology_change_detection(p->br);
+
+ br_set_state(p, BR_STATE_BLOCKING);
+ br_log_state(p);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ del_timer(&p->forward_delay_timer);
+ }
+}
+
+/* called under bridge lock */
+static void br_make_forwarding(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+
+ if (p->state != BR_STATE_BLOCKING)
+ return;
+
+ if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) {
+ br_set_state(p, BR_STATE_FORWARDING);
+ br_topology_change_detection(br);
+ del_timer(&p->forward_delay_timer);
+ } else if (br->stp_enabled == BR_KERNEL_STP)
+ br_set_state(p, BR_STATE_LISTENING);
+ else
+ br_set_state(p, BR_STATE_LEARNING);
+
+ br_multicast_enable_port(p);
+ br_log_state(p);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ if (br->forward_delay != 0)
+ mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
+}
+
+/* called under bridge lock */
+void br_port_state_selection(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ unsigned int liveports = 0;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state == BR_STATE_DISABLED)
+ continue;
+
+ /* Don't change port states if userspace is handling STP */
+ if (br->stp_enabled != BR_USER_STP) {
+ if (p->port_no == br->root_port) {
+ p->config_pending = 0;
+ p->topology_change_ack = 0;
+ br_make_forwarding(p);
+ } else if (br_is_designated_port(p)) {
+ del_timer(&p->message_age_timer);
+ br_make_forwarding(p);
+ } else {
+ p->config_pending = 0;
+ p->topology_change_ack = 0;
+ br_make_blocking(p);
+ }
+ }
+
+ if (p->state == BR_STATE_FORWARDING)
+ ++liveports;
+ }
+
+ if (liveports == 0)
+ netif_carrier_off(br->dev);
+ else
+ netif_carrier_on(br->dev);
+}
+
+/* called under bridge lock */
+static void br_topology_change_acknowledge(struct net_bridge_port *p)
+{
+ p->topology_change_ack = 1;
+ br_transmit_config(p);
+}
+
+/* called under bridge lock */
+void br_received_config_bpdu(struct net_bridge_port *p,
+ const struct br_config_bpdu *bpdu)
+{
+ struct net_bridge *br;
+ int was_root;
+
+ br = p->br;
+ was_root = br_is_root_bridge(br);
+
+ if (br_supersedes_port_info(p, bpdu)) {
+ br_record_config_information(p, bpdu);
+ br_configuration_update(br);
+ br_port_state_selection(br);
+
+ if (!br_is_root_bridge(br) && was_root) {
+ del_timer(&br->hello_timer);
+ if (br->topology_change_detected) {
+ del_timer(&br->topology_change_timer);
+ br_transmit_tcn(br);
+
+ mod_timer(&br->tcn_timer,
+ jiffies + br->bridge_hello_time);
+ }
+ }
+
+ if (p->port_no == br->root_port) {
+ br_record_config_timeout_values(br, bpdu);
+ br_config_bpdu_generation(br);
+ if (bpdu->topology_change_ack)
+ br_topology_change_acknowledged(br);
+ }
+ } else if (br_is_designated_port(p)) {
+ br_reply(p);
+ }
+}
+
+/* called under bridge lock */
+void br_received_tcn_bpdu(struct net_bridge_port *p)
+{
+ if (br_is_designated_port(p)) {
+ br_info(p->br, "port %u(%s) received tcn bpdu\n",
+ (unsigned int) p->port_no, p->dev->name);
+
+ br_topology_change_detection(p->br);
+ br_topology_change_acknowledge(p);
+ }
+}
+
+/* Change bridge STP parameter */
+int br_set_hello_time(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+
+ if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME)
+ return -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ br->bridge_hello_time = t;
+ if (br_is_root_bridge(br))
+ br->hello_time = br->bridge_hello_time;
+ spin_unlock_bh(&br->lock);
+ return 0;
+}
+
+int br_set_max_age(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+
+ if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE)
+ return -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ br->bridge_max_age = t;
+ if (br_is_root_bridge(br))
+ br->max_age = br->bridge_max_age;
+ spin_unlock_bh(&br->lock);
+ return 0;
+
+}
+
+void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
+{
+ br->bridge_forward_delay = t;
+ if (br_is_root_bridge(br))
+ br->forward_delay = br->bridge_forward_delay;
+}
+
+int br_set_forward_delay(struct net_bridge *br, unsigned long val)
+{
+ unsigned long t = clock_t_to_jiffies(val);
+ int err = -ERANGE;
+
+ spin_lock_bh(&br->lock);
+ if (br->stp_enabled != BR_NO_STP &&
+ (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY))
+ goto unlock;
+
+ __br_set_forward_delay(br, t);
+ err = 0;
+
+unlock:
+ spin_unlock_bh(&br->lock);
+ return err;
+}
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
new file mode 100644
index 000000000..534fc4cd2
--- /dev/null
+++ b/net/bridge/br_stp_bpdu.c
@@ -0,0 +1,242 @@
+/*
+ * Spanning tree protocol; BPDU handling
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/etherdevice.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/stp.h>
+#include <asm/unaligned.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+#define STP_HZ 256
+
+#define LLC_RESERVE sizeof(struct llc_pdu_un)
+
+static void br_send_bpdu(struct net_bridge_port *p,
+ const unsigned char *data, int length)
+{
+ struct sk_buff *skb;
+
+ skb = dev_alloc_skb(length+LLC_RESERVE);
+ if (!skb)
+ return;
+
+ skb->dev = p->dev;
+ skb->protocol = htons(ETH_P_802_2);
+ skb->priority = TC_PRIO_CONTROL;
+
+ skb_reserve(skb, LLC_RESERVE);
+ memcpy(__skb_put(skb, length), data, length);
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
+ LLC_SAP_BSPAN, LLC_PDU_CMD);
+ llc_pdu_init_as_ui_cmd(skb);
+
+ llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr);
+
+ skb_reset_mac_header(skb);
+
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, NULL, skb,
+ NULL, skb->dev,
+ dev_queue_xmit_sk);
+}
+
+static inline void br_set_ticks(unsigned char *dest, int j)
+{
+ unsigned long ticks = (STP_HZ * j)/ HZ;
+
+ put_unaligned_be16(ticks, dest);
+}
+
+static inline int br_get_ticks(const unsigned char *src)
+{
+ unsigned long ticks = get_unaligned_be16(src);
+
+ return DIV_ROUND_UP(ticks * HZ, STP_HZ);
+}
+
+/* called under bridge lock */
+void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+{
+ unsigned char buf[35];
+
+ if (p->br->stp_enabled != BR_KERNEL_STP)
+ return;
+
+ buf[0] = 0;
+ buf[1] = 0;
+ buf[2] = 0;
+ buf[3] = BPDU_TYPE_CONFIG;
+ buf[4] = (bpdu->topology_change ? 0x01 : 0) |
+ (bpdu->topology_change_ack ? 0x80 : 0);
+ buf[5] = bpdu->root.prio[0];
+ buf[6] = bpdu->root.prio[1];
+ buf[7] = bpdu->root.addr[0];
+ buf[8] = bpdu->root.addr[1];
+ buf[9] = bpdu->root.addr[2];
+ buf[10] = bpdu->root.addr[3];
+ buf[11] = bpdu->root.addr[4];
+ buf[12] = bpdu->root.addr[5];
+ buf[13] = (bpdu->root_path_cost >> 24) & 0xFF;
+ buf[14] = (bpdu->root_path_cost >> 16) & 0xFF;
+ buf[15] = (bpdu->root_path_cost >> 8) & 0xFF;
+ buf[16] = bpdu->root_path_cost & 0xFF;
+ buf[17] = bpdu->bridge_id.prio[0];
+ buf[18] = bpdu->bridge_id.prio[1];
+ buf[19] = bpdu->bridge_id.addr[0];
+ buf[20] = bpdu->bridge_id.addr[1];
+ buf[21] = bpdu->bridge_id.addr[2];
+ buf[22] = bpdu->bridge_id.addr[3];
+ buf[23] = bpdu->bridge_id.addr[4];
+ buf[24] = bpdu->bridge_id.addr[5];
+ buf[25] = (bpdu->port_id >> 8) & 0xFF;
+ buf[26] = bpdu->port_id & 0xFF;
+
+ br_set_ticks(buf+27, bpdu->message_age);
+ br_set_ticks(buf+29, bpdu->max_age);
+ br_set_ticks(buf+31, bpdu->hello_time);
+ br_set_ticks(buf+33, bpdu->forward_delay);
+
+ br_send_bpdu(p, buf, 35);
+}
+
+/* called under bridge lock */
+void br_send_tcn_bpdu(struct net_bridge_port *p)
+{
+ unsigned char buf[4];
+
+ if (p->br->stp_enabled != BR_KERNEL_STP)
+ return;
+
+ buf[0] = 0;
+ buf[1] = 0;
+ buf[2] = 0;
+ buf[3] = BPDU_TYPE_TCN;
+ br_send_bpdu(p, buf, 4);
+}
+
+/*
+ * Called from llc.
+ *
+ * NO locks, but rcu_read_lock
+ */
+void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ const unsigned char *dest = eth_hdr(skb)->h_dest;
+ struct net_bridge_port *p;
+ struct net_bridge *br;
+ const unsigned char *buf;
+
+ if (!pskb_may_pull(skb, 4))
+ goto err;
+
+ /* compare of protocol id and version */
+ buf = skb->data;
+ if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
+ goto err;
+
+ p = br_port_get_check_rcu(dev);
+ if (!p)
+ goto err;
+
+ br = p->br;
+ spin_lock(&br->lock);
+
+ if (br->stp_enabled != BR_KERNEL_STP)
+ goto out;
+
+ if (!(br->dev->flags & IFF_UP))
+ goto out;
+
+ if (p->state == BR_STATE_DISABLED)
+ goto out;
+
+ if (!ether_addr_equal(dest, br->group_addr))
+ goto out;
+
+ if (p->flags & BR_BPDU_GUARD) {
+ br_notice(br, "BPDU received on blocked port %u(%s)\n",
+ (unsigned int) p->port_no, p->dev->name);
+ br_stp_disable_port(p);
+ goto out;
+ }
+
+ buf = skb_pull(skb, 3);
+
+ if (buf[0] == BPDU_TYPE_CONFIG) {
+ struct br_config_bpdu bpdu;
+
+ if (!pskb_may_pull(skb, 32))
+ goto out;
+
+ buf = skb->data;
+ bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0;
+ bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0;
+
+ bpdu.root.prio[0] = buf[2];
+ bpdu.root.prio[1] = buf[3];
+ bpdu.root.addr[0] = buf[4];
+ bpdu.root.addr[1] = buf[5];
+ bpdu.root.addr[2] = buf[6];
+ bpdu.root.addr[3] = buf[7];
+ bpdu.root.addr[4] = buf[8];
+ bpdu.root.addr[5] = buf[9];
+ bpdu.root_path_cost =
+ (buf[10] << 24) |
+ (buf[11] << 16) |
+ (buf[12] << 8) |
+ buf[13];
+ bpdu.bridge_id.prio[0] = buf[14];
+ bpdu.bridge_id.prio[1] = buf[15];
+ bpdu.bridge_id.addr[0] = buf[16];
+ bpdu.bridge_id.addr[1] = buf[17];
+ bpdu.bridge_id.addr[2] = buf[18];
+ bpdu.bridge_id.addr[3] = buf[19];
+ bpdu.bridge_id.addr[4] = buf[20];
+ bpdu.bridge_id.addr[5] = buf[21];
+ bpdu.port_id = (buf[22] << 8) | buf[23];
+
+ bpdu.message_age = br_get_ticks(buf+24);
+ bpdu.max_age = br_get_ticks(buf+26);
+ bpdu.hello_time = br_get_ticks(buf+28);
+ bpdu.forward_delay = br_get_ticks(buf+30);
+
+ if (bpdu.message_age > bpdu.max_age) {
+ if (net_ratelimit())
+ br_notice(p->br,
+ "port %u config from %pM"
+ " (message_age %ul > max_age %ul)\n",
+ p->port_no,
+ eth_hdr(skb)->h_source,
+ bpdu.message_age, bpdu.max_age);
+ goto out;
+ }
+
+ br_received_config_bpdu(p, &bpdu);
+ } else if (buf[0] == BPDU_TYPE_TCN) {
+ br_received_tcn_bpdu(p);
+ }
+ out:
+ spin_unlock(&br->lock);
+ err:
+ kfree_skb(skb);
+}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
new file mode 100644
index 000000000..7832d07f4
--- /dev/null
+++ b/net/bridge/br_stp_if.c
@@ -0,0 +1,316 @@
+/*
+ * Spanning tree protocol; interface code
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+
+/* Port id is composed of priority and port number.
+ * NB: some bits of priority are dropped to
+ * make room for more ports.
+ */
+static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
+{
+ return ((u16)priority << BR_PORT_BITS)
+ | (port_no & ((1<<BR_PORT_BITS)-1));
+}
+
+#define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS)
+
+/* called under bridge lock */
+void br_init_port(struct net_bridge_port *p)
+{
+ p->port_id = br_make_port_id(p->priority, p->port_no);
+ br_become_designated_port(p);
+ br_set_state(p, BR_STATE_BLOCKING);
+ p->topology_change_ack = 0;
+ p->config_pending = 0;
+}
+
+/* called under bridge lock */
+void br_stp_enable_bridge(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ spin_lock_bh(&br->lock);
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
+ mod_timer(&br->gc_timer, jiffies + HZ/10);
+
+ br_config_bpdu_generation(br);
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (netif_running(p->dev) && netif_oper_up(p->dev))
+ br_stp_enable_port(p);
+
+ }
+ spin_unlock_bh(&br->lock);
+}
+
+/* NO locks held */
+void br_stp_disable_bridge(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ spin_lock_bh(&br->lock);
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state != BR_STATE_DISABLED)
+ br_stp_disable_port(p);
+
+ }
+
+ br->topology_change = 0;
+ br->topology_change_detected = 0;
+ spin_unlock_bh(&br->lock);
+
+ del_timer_sync(&br->hello_timer);
+ del_timer_sync(&br->topology_change_timer);
+ del_timer_sync(&br->tcn_timer);
+ del_timer_sync(&br->gc_timer);
+}
+
+/* called under bridge lock */
+void br_stp_enable_port(struct net_bridge_port *p)
+{
+ br_init_port(p);
+ br_port_state_selection(p->br);
+ br_log_state(p);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+}
+
+/* called under bridge lock */
+void br_stp_disable_port(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+ int wasroot;
+
+ wasroot = br_is_root_bridge(br);
+ br_become_designated_port(p);
+ br_set_state(p, BR_STATE_DISABLED);
+ p->topology_change_ack = 0;
+ p->config_pending = 0;
+
+ br_log_state(p);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+
+ del_timer(&p->message_age_timer);
+ del_timer(&p->forward_delay_timer);
+ del_timer(&p->hold_timer);
+
+ br_fdb_delete_by_port(br, p, 0);
+ br_multicast_disable_port(p);
+
+ br_configuration_update(br);
+
+ br_port_state_selection(br);
+
+ if (br_is_root_bridge(br) && !wasroot)
+ br_become_root_bridge(br);
+}
+
+static void br_stp_start(struct net_bridge *br)
+{
+ int r;
+ char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
+ char *envp[] = { NULL };
+
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+
+ spin_lock_bh(&br->lock);
+
+ if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY)
+ __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY);
+ else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
+ __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);
+
+ if (r == 0) {
+ br->stp_enabled = BR_USER_STP;
+ br_debug(br, "userspace STP started\n");
+ } else {
+ br->stp_enabled = BR_KERNEL_STP;
+ br_debug(br, "using kernel STP\n");
+
+ /* To start timers on any ports left in blocking */
+ br_port_state_selection(br);
+ }
+
+ spin_unlock_bh(&br->lock);
+}
+
+static void br_stp_stop(struct net_bridge *br)
+{
+ int r;
+ char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
+ char *envp[] = { NULL };
+
+ if (br->stp_enabled == BR_USER_STP) {
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ br_info(br, "userspace STP stopped, return code %d\n", r);
+
+ /* To start timers on any ports left in blocking */
+ spin_lock_bh(&br->lock);
+ br_port_state_selection(br);
+ spin_unlock_bh(&br->lock);
+ }
+
+ br->stp_enabled = BR_NO_STP;
+}
+
+void br_stp_set_enabled(struct net_bridge *br, unsigned long val)
+{
+ ASSERT_RTNL();
+
+ if (val) {
+ if (br->stp_enabled == BR_NO_STP)
+ br_stp_start(br);
+ } else {
+ if (br->stp_enabled != BR_NO_STP)
+ br_stp_stop(br);
+ }
+}
+
+/* called under bridge lock */
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
+{
+ /* should be aligned on 2 bytes for ether_addr_equal() */
+ unsigned short oldaddr_aligned[ETH_ALEN >> 1];
+ unsigned char *oldaddr = (unsigned char *)oldaddr_aligned;
+ struct net_bridge_port *p;
+ int wasroot;
+
+ wasroot = br_is_root_bridge(br);
+
+ br_fdb_change_mac_address(br, addr);
+
+ memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN);
+ memcpy(br->bridge_id.addr, addr, ETH_ALEN);
+ memcpy(br->dev->dev_addr, addr, ETH_ALEN);
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (ether_addr_equal(p->designated_bridge.addr, oldaddr))
+ memcpy(p->designated_bridge.addr, addr, ETH_ALEN);
+
+ if (ether_addr_equal(p->designated_root.addr, oldaddr))
+ memcpy(p->designated_root.addr, addr, ETH_ALEN);
+ }
+
+ br_configuration_update(br);
+ br_port_state_selection(br);
+ if (br_is_root_bridge(br) && !wasroot)
+ br_become_root_bridge(br);
+}
+
+/* should be aligned on 2 bytes for ether_addr_equal() */
+static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1];
+
+/* called under bridge lock */
+bool br_stp_recalculate_bridge_id(struct net_bridge *br)
+{
+ const unsigned char *br_mac_zero =
+ (const unsigned char *)br_mac_zero_aligned;
+ const unsigned char *addr = br_mac_zero;
+ struct net_bridge_port *p;
+
+ /* user has chosen a value so keep it */
+ if (br->dev->addr_assign_type == NET_ADDR_SET)
+ return false;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (addr == br_mac_zero ||
+ memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
+ addr = p->dev->dev_addr;
+
+ }
+
+ if (ether_addr_equal(br->bridge_id.addr, addr))
+ return false; /* no change */
+
+ br_stp_change_bridge_id(br, addr);
+ return true;
+}
+
+/* Acquires and releases bridge lock */
+void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio)
+{
+ struct net_bridge_port *p;
+ int wasroot;
+
+ spin_lock_bh(&br->lock);
+ wasroot = br_is_root_bridge(br);
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state != BR_STATE_DISABLED &&
+ br_is_designated_port(p)) {
+ p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF;
+ p->designated_bridge.prio[1] = newprio & 0xFF;
+ }
+
+ }
+
+ br->bridge_id.prio[0] = (newprio >> 8) & 0xFF;
+ br->bridge_id.prio[1] = newprio & 0xFF;
+ br_configuration_update(br);
+ br_port_state_selection(br);
+ if (br_is_root_bridge(br) && !wasroot)
+ br_become_root_bridge(br);
+ spin_unlock_bh(&br->lock);
+}
+
+/* called under bridge lock */
+int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio)
+{
+ port_id new_port_id;
+
+ if (newprio > BR_MAX_PORT_PRIORITY)
+ return -ERANGE;
+
+ new_port_id = br_make_port_id(newprio, p->port_no);
+ if (br_is_designated_port(p))
+ p->designated_port = new_port_id;
+
+ p->port_id = new_port_id;
+ p->priority = newprio;
+ if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) &&
+ p->port_id < p->designated_port) {
+ br_become_designated_port(p);
+ br_port_state_selection(p->br);
+ }
+
+ return 0;
+}
+
+/* called under bridge lock */
+int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost)
+{
+ if (path_cost < BR_MIN_PATH_COST ||
+ path_cost > BR_MAX_PATH_COST)
+ return -ERANGE;
+
+ p->flags |= BR_ADMIN_COST;
+ p->path_cost = path_cost;
+ br_configuration_update(p->br);
+ br_port_state_selection(p->br);
+ return 0;
+}
+
+ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id)
+{
+ return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
+ id->prio[0], id->prio[1],
+ id->addr[0], id->addr[1], id->addr[2],
+ id->addr[3], id->addr[4], id->addr[5]);
+}
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
new file mode 100644
index 000000000..7caf7fae2
--- /dev/null
+++ b/net/bridge/br_stp_timer.c
@@ -0,0 +1,176 @@
+/*
+ * Spanning tree protocol; timer-related code
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Lennert Buytenhek <buytenh@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/times.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+/* called under bridge lock */
+static int br_is_designated_for_some_port(const struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->state != BR_STATE_DISABLED &&
+ !memcmp(&p->designated_bridge, &br->bridge_id, 8))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void br_hello_timer_expired(unsigned long arg)
+{
+ struct net_bridge *br = (struct net_bridge *)arg;
+
+ br_debug(br, "hello timer expired\n");
+ spin_lock(&br->lock);
+ if (br->dev->flags & IFF_UP) {
+ br_config_bpdu_generation(br);
+
+ mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time));
+ }
+ spin_unlock(&br->lock);
+}
+
+static void br_message_age_timer_expired(unsigned long arg)
+{
+ struct net_bridge_port *p = (struct net_bridge_port *) arg;
+ struct net_bridge *br = p->br;
+ const bridge_id *id = &p->designated_bridge;
+ int was_root;
+
+ if (p->state == BR_STATE_DISABLED)
+ return;
+
+ br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n",
+ (unsigned int) p->port_no, p->dev->name,
+ id->prio[0], id->prio[1], &id->addr);
+
+ /*
+ * According to the spec, the message age timer cannot be
+ * running when we are the root bridge. So.. this was_root
+ * check is redundant. I'm leaving it in for now, though.
+ */
+ spin_lock(&br->lock);
+ if (p->state == BR_STATE_DISABLED)
+ goto unlock;
+ was_root = br_is_root_bridge(br);
+
+ br_become_designated_port(p);
+ br_configuration_update(br);
+ br_port_state_selection(br);
+ if (br_is_root_bridge(br) && !was_root)
+ br_become_root_bridge(br);
+ unlock:
+ spin_unlock(&br->lock);
+}
+
+static void br_forward_delay_timer_expired(unsigned long arg)
+{
+ struct net_bridge_port *p = (struct net_bridge_port *) arg;
+ struct net_bridge *br = p->br;
+
+ br_debug(br, "port %u(%s) forward delay timer\n",
+ (unsigned int) p->port_no, p->dev->name);
+ spin_lock(&br->lock);
+ if (p->state == BR_STATE_LISTENING) {
+ br_set_state(p, BR_STATE_LEARNING);
+ mod_timer(&p->forward_delay_timer,
+ jiffies + br->forward_delay);
+ } else if (p->state == BR_STATE_LEARNING) {
+ br_set_state(p, BR_STATE_FORWARDING);
+ if (br_is_designated_for_some_port(br))
+ br_topology_change_detection(br);
+ netif_carrier_on(br->dev);
+ }
+ br_log_state(p);
+ rcu_read_lock();
+ br_ifinfo_notify(RTM_NEWLINK, p);
+ rcu_read_unlock();
+ spin_unlock(&br->lock);
+}
+
+static void br_tcn_timer_expired(unsigned long arg)
+{
+ struct net_bridge *br = (struct net_bridge *) arg;
+
+ br_debug(br, "tcn timer expired\n");
+ spin_lock(&br->lock);
+ if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) {
+ br_transmit_tcn(br);
+
+ mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time);
+ }
+ spin_unlock(&br->lock);
+}
+
+static void br_topology_change_timer_expired(unsigned long arg)
+{
+ struct net_bridge *br = (struct net_bridge *) arg;
+
+ br_debug(br, "topo change timer expired\n");
+ spin_lock(&br->lock);
+ br->topology_change_detected = 0;
+ br->topology_change = 0;
+ spin_unlock(&br->lock);
+}
+
+static void br_hold_timer_expired(unsigned long arg)
+{
+ struct net_bridge_port *p = (struct net_bridge_port *) arg;
+
+ br_debug(p->br, "port %u(%s) hold timer expired\n",
+ (unsigned int) p->port_no, p->dev->name);
+
+ spin_lock(&p->br->lock);
+ if (p->config_pending)
+ br_transmit_config(p);
+ spin_unlock(&p->br->lock);
+}
+
+void br_stp_timer_init(struct net_bridge *br)
+{
+ setup_timer(&br->hello_timer, br_hello_timer_expired,
+ (unsigned long) br);
+
+ setup_timer(&br->tcn_timer, br_tcn_timer_expired,
+ (unsigned long) br);
+
+ setup_timer(&br->topology_change_timer,
+ br_topology_change_timer_expired,
+ (unsigned long) br);
+
+ setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
+}
+
+void br_stp_port_timer_init(struct net_bridge_port *p)
+{
+ setup_timer(&p->message_age_timer, br_message_age_timer_expired,
+ (unsigned long) p);
+
+ setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired,
+ (unsigned long) p);
+
+ setup_timer(&p->hold_timer, br_hold_timer_expired,
+ (unsigned long) p);
+}
+
+/* Report ticks left (in USER_HZ) used for API */
+unsigned long br_timer_value(const struct timer_list *timer)
+{
+ return timer_pending(timer)
+ ? jiffies_delta_to_clock_t(timer->expires - jiffies) : 0;
+}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
new file mode 100644
index 000000000..4c97fc50f
--- /dev/null
+++ b/net/bridge/br_sysfs_br.c
@@ -0,0 +1,889 @@
+/*
+ * Sysfs attributes of bridge
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+#include <linux/times.h>
+
+#include "br_private.h"
+
+#define to_dev(obj) container_of(obj, struct device, kobj)
+#define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd)))
+
+/*
+ * Common code for storing bridge parameters.
+ */
+static ssize_t store_bridge_parm(struct device *d,
+ const char *buf, size_t len,
+ int (*set)(struct net_bridge *, unsigned long))
+{
+ struct net_bridge *br = to_bridge(d);
+ char *endp;
+ unsigned long val;
+ int err;
+
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ err = (*set)(br, val);
+ return err ? err : len;
+}
+
+
+static ssize_t forward_delay_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
+}
+
+static ssize_t forward_delay_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_set_forward_delay);
+}
+static DEVICE_ATTR_RW(forward_delay);
+
+static ssize_t hello_time_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->hello_time));
+}
+
+static ssize_t hello_time_store(struct device *d,
+ struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_set_hello_time);
+}
+static DEVICE_ATTR_RW(hello_time);
+
+static ssize_t max_age_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->max_age));
+}
+
+static ssize_t max_age_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_set_max_age);
+}
+static DEVICE_ATTR_RW(max_age);
+
+static ssize_t ageing_time_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
+}
+
+static int set_ageing_time(struct net_bridge *br, unsigned long val)
+{
+ br->ageing_time = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t ageing_time_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_ageing_time);
+}
+static DEVICE_ATTR_RW(ageing_time);
+
+static ssize_t stp_state_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->stp_enabled);
+}
+
+
+static ssize_t stp_state_store(struct device *d,
+ struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ struct net_bridge *br = to_bridge(d);
+ char *endp;
+ unsigned long val;
+
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+ br_stp_set_enabled(br, val);
+ rtnl_unlock();
+
+ return len;
+}
+static DEVICE_ATTR_RW(stp_state);
+
+static ssize_t group_fwd_mask_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%#x\n", br->group_fwd_mask);
+}
+
+
+static ssize_t group_fwd_mask_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct net_bridge *br = to_bridge(d);
+ char *endp;
+ unsigned long val;
+
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ if (val & BR_GROUPFWD_RESTRICTED)
+ return -EINVAL;
+
+ br->group_fwd_mask = val;
+
+ return len;
+}
+static DEVICE_ATTR_RW(group_fwd_mask);
+
+static ssize_t priority_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n",
+ (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
+}
+
+static int set_priority(struct net_bridge *br, unsigned long val)
+{
+ br_stp_set_bridge_priority(br, (u16) val);
+ return 0;
+}
+
+static ssize_t priority_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_priority);
+}
+static DEVICE_ATTR_RW(priority);
+
+static ssize_t root_id_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ return br_show_bridge_id(buf, &to_bridge(d)->designated_root);
+}
+static DEVICE_ATTR_RO(root_id);
+
+static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ return br_show_bridge_id(buf, &to_bridge(d)->bridge_id);
+}
+static DEVICE_ATTR_RO(bridge_id);
+
+static ssize_t root_port_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", to_bridge(d)->root_port);
+}
+static DEVICE_ATTR_RO(root_port);
+
+static ssize_t root_path_cost_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost);
+}
+static DEVICE_ATTR_RO(root_path_cost);
+
+static ssize_t topology_change_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", to_bridge(d)->topology_change);
+}
+static DEVICE_ATTR_RO(topology_change);
+
+static ssize_t topology_change_detected_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->topology_change_detected);
+}
+static DEVICE_ATTR_RO(topology_change_detected);
+
+static ssize_t hello_timer_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer));
+}
+static DEVICE_ATTR_RO(hello_timer);
+
+static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer));
+}
+static DEVICE_ATTR_RO(tcn_timer);
+
+static ssize_t topology_change_timer_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer));
+}
+static DEVICE_ATTR_RO(topology_change_timer);
+
+static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+}
+static DEVICE_ATTR_RO(gc_timer);
+
+static ssize_t group_addr_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%x:%x:%x:%x:%x:%x\n",
+ br->group_addr[0], br->group_addr[1],
+ br->group_addr[2], br->group_addr[3],
+ br->group_addr[4], br->group_addr[5]);
+}
+
+static ssize_t group_addr_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_bridge *br = to_bridge(d);
+ u8 new_addr[6];
+ int i;
+
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+ &new_addr[0], &new_addr[1], &new_addr[2],
+ &new_addr[3], &new_addr[4], &new_addr[5]) != 6)
+ return -EINVAL;
+
+ if (!is_link_local_ether_addr(new_addr))
+ return -EINVAL;
+
+ if (new_addr[5] == 1 || /* 802.3x Pause address */
+ new_addr[5] == 2 || /* 802.3ad Slow protocols */
+ new_addr[5] == 3) /* 802.1X PAE address */
+ return -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ spin_lock_bh(&br->lock);
+ for (i = 0; i < 6; i++)
+ br->group_addr[i] = new_addr[i];
+ spin_unlock_bh(&br->lock);
+
+ br->group_addr_set = true;
+ br_recalculate_fwd_mask(br);
+
+ rtnl_unlock();
+
+ return len;
+}
+
+static DEVICE_ATTR_RW(group_addr);
+
+static ssize_t flush_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_bridge *br = to_bridge(d);
+
+ if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ br_fdb_flush(br);
+ return len;
+}
+static DEVICE_ATTR_WO(flush);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static ssize_t multicast_router_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->multicast_router);
+}
+
+static ssize_t multicast_router_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_router);
+}
+static DEVICE_ATTR_RW(multicast_router);
+
+static ssize_t multicast_snooping_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", !br->multicast_disabled);
+}
+
+static ssize_t multicast_snooping_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_toggle);
+}
+static DEVICE_ATTR_RW(multicast_snooping);
+
+static ssize_t multicast_query_use_ifaddr_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr);
+}
+
+static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_query_use_ifaddr = !!val;
+ return 0;
+}
+
+static ssize_t
+multicast_query_use_ifaddr_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_use_ifaddr);
+}
+static DEVICE_ATTR_RW(multicast_query_use_ifaddr);
+
+static ssize_t multicast_querier_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->multicast_querier);
+}
+
+static ssize_t multicast_querier_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_querier);
+}
+static DEVICE_ATTR_RW(multicast_querier);
+
+static ssize_t hash_elasticity_show(struct device *d,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->hash_elasticity);
+}
+
+static int set_elasticity(struct net_bridge *br, unsigned long val)
+{
+ br->hash_elasticity = val;
+ return 0;
+}
+
+static ssize_t hash_elasticity_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_elasticity);
+}
+static DEVICE_ATTR_RW(hash_elasticity);
+
+static ssize_t hash_max_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->hash_max);
+}
+
+static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_multicast_set_hash_max);
+}
+static DEVICE_ATTR_RW(hash_max);
+
+static ssize_t multicast_last_member_count_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->multicast_last_member_count);
+}
+
+static int set_last_member_count(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_last_member_count = val;
+ return 0;
+}
+
+static ssize_t multicast_last_member_count_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_last_member_count);
+}
+static DEVICE_ATTR_RW(multicast_last_member_count);
+
+static ssize_t multicast_startup_query_count_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->multicast_startup_query_count);
+}
+
+static int set_startup_query_count(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_startup_query_count = val;
+ return 0;
+}
+
+static ssize_t multicast_startup_query_count_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_startup_query_count);
+}
+static DEVICE_ATTR_RW(multicast_startup_query_count);
+
+static ssize_t multicast_last_member_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_last_member_interval));
+}
+
+static int set_last_member_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_last_member_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_last_member_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_last_member_interval);
+}
+static DEVICE_ATTR_RW(multicast_last_member_interval);
+
+static ssize_t multicast_membership_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_membership_interval));
+}
+
+static int set_membership_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_membership_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_membership_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_membership_interval);
+}
+static DEVICE_ATTR_RW(multicast_membership_interval);
+
+static ssize_t multicast_querier_interval_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_querier_interval));
+}
+
+static int set_querier_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_querier_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_querier_interval_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_querier_interval);
+}
+static DEVICE_ATTR_RW(multicast_querier_interval);
+
+static ssize_t multicast_query_interval_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_query_interval));
+}
+
+static int set_query_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_query_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_query_interval_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_interval);
+}
+static DEVICE_ATTR_RW(multicast_query_interval);
+
+static ssize_t multicast_query_response_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(
+ buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_query_response_interval));
+}
+
+static int set_query_response_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_query_response_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_query_response_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_query_response_interval);
+}
+static DEVICE_ATTR_RW(multicast_query_response_interval);
+
+static ssize_t multicast_startup_query_interval_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(
+ buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_startup_query_interval));
+}
+
+static int set_startup_query_interval(struct net_bridge *br, unsigned long val)
+{
+ br->multicast_startup_query_interval = clock_t_to_jiffies(val);
+ return 0;
+}
+
+static ssize_t multicast_startup_query_interval_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_startup_query_interval);
+}
+static DEVICE_ATTR_RW(multicast_startup_query_interval);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+static ssize_t nf_call_iptables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->nf_call_iptables);
+}
+
+static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
+{
+ br->nf_call_iptables = val ? true : false;
+ return 0;
+}
+
+static ssize_t nf_call_iptables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_iptables);
+}
+static DEVICE_ATTR_RW(nf_call_iptables);
+
+static ssize_t nf_call_ip6tables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->nf_call_ip6tables);
+}
+
+static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
+{
+ br->nf_call_ip6tables = val ? true : false;
+ return 0;
+}
+
+static ssize_t nf_call_ip6tables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_ip6tables);
+}
+static DEVICE_ATTR_RW(nf_call_ip6tables);
+
+static ssize_t nf_call_arptables_show(
+ struct device *d, struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br->nf_call_arptables);
+}
+
+static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
+{
+ br->nf_call_arptables = val ? true : false;
+ return 0;
+}
+
+static ssize_t nf_call_arptables_store(
+ struct device *d, struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_nf_call_arptables);
+}
+static DEVICE_ATTR_RW(nf_call_arptables);
+#endif
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+static ssize_t vlan_filtering_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->vlan_enabled);
+}
+
+static ssize_t vlan_filtering_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_filter_toggle);
+}
+static DEVICE_ATTR_RW(vlan_filtering);
+
+static ssize_t vlan_protocol_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto));
+}
+
+static ssize_t vlan_protocol_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_proto);
+}
+static DEVICE_ATTR_RW(vlan_protocol);
+
+static ssize_t default_pvid_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br->default_pvid);
+}
+
+static ssize_t default_pvid_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid);
+}
+static DEVICE_ATTR_RW(default_pvid);
+#endif
+
+static struct attribute *bridge_attrs[] = {
+ &dev_attr_forward_delay.attr,
+ &dev_attr_hello_time.attr,
+ &dev_attr_max_age.attr,
+ &dev_attr_ageing_time.attr,
+ &dev_attr_stp_state.attr,
+ &dev_attr_group_fwd_mask.attr,
+ &dev_attr_priority.attr,
+ &dev_attr_bridge_id.attr,
+ &dev_attr_root_id.attr,
+ &dev_attr_root_path_cost.attr,
+ &dev_attr_root_port.attr,
+ &dev_attr_topology_change.attr,
+ &dev_attr_topology_change_detected.attr,
+ &dev_attr_hello_timer.attr,
+ &dev_attr_tcn_timer.attr,
+ &dev_attr_topology_change_timer.attr,
+ &dev_attr_gc_timer.attr,
+ &dev_attr_group_addr.attr,
+ &dev_attr_flush.attr,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ &dev_attr_multicast_router.attr,
+ &dev_attr_multicast_snooping.attr,
+ &dev_attr_multicast_querier.attr,
+ &dev_attr_multicast_query_use_ifaddr.attr,
+ &dev_attr_hash_elasticity.attr,
+ &dev_attr_hash_max.attr,
+ &dev_attr_multicast_last_member_count.attr,
+ &dev_attr_multicast_startup_query_count.attr,
+ &dev_attr_multicast_last_member_interval.attr,
+ &dev_attr_multicast_membership_interval.attr,
+ &dev_attr_multicast_querier_interval.attr,
+ &dev_attr_multicast_query_interval.attr,
+ &dev_attr_multicast_query_response_interval.attr,
+ &dev_attr_multicast_startup_query_interval.attr,
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ &dev_attr_nf_call_iptables.attr,
+ &dev_attr_nf_call_ip6tables.attr,
+ &dev_attr_nf_call_arptables.attr,
+#endif
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+ &dev_attr_vlan_filtering.attr,
+ &dev_attr_vlan_protocol.attr,
+ &dev_attr_default_pvid.attr,
+#endif
+ NULL
+};
+
+static struct attribute_group bridge_group = {
+ .name = SYSFS_BRIDGE_ATTR,
+ .attrs = bridge_attrs,
+};
+
+/*
+ * Export the forwarding information table as a binary file
+ * The records are struct __fdb_entry.
+ *
+ * Returns the number of bytes read.
+ */
+static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ struct device *dev = to_dev(kobj);
+ struct net_bridge *br = to_bridge(dev);
+ int n;
+
+ /* must read whole records */
+ if (off % sizeof(struct __fdb_entry) != 0)
+ return -EINVAL;
+
+ n = br_fdb_fillbuf(br, buf,
+ count / sizeof(struct __fdb_entry),
+ off / sizeof(struct __fdb_entry));
+
+ if (n > 0)
+ n *= sizeof(struct __fdb_entry);
+
+ return n;
+}
+
+static struct bin_attribute bridge_forward = {
+ .attr = { .name = SYSFS_BRIDGE_FDB,
+ .mode = S_IRUGO, },
+ .read = brforward_read,
+};
+
+/*
+ * Add entries in sysfs onto the existing network class device
+ * for the bridge.
+ * Adds a attribute group "bridge" containing tuning parameters.
+ * Binary attribute containing the forward table
+ * Sub directory to hold links to interfaces.
+ *
+ * Note: the ifobj exists only to be a subdirectory
+ * to hold links. The ifobj exists in same data structure
+ * as it's parent the bridge so reference counting works.
+ */
+int br_sysfs_addbr(struct net_device *dev)
+{
+ struct kobject *brobj = &dev->dev.kobj;
+ struct net_bridge *br = netdev_priv(dev);
+ int err;
+
+ err = sysfs_create_group(brobj, &bridge_group);
+ if (err) {
+ pr_info("%s: can't create group %s/%s\n",
+ __func__, dev->name, bridge_group.name);
+ goto out1;
+ }
+
+ err = sysfs_create_bin_file(brobj, &bridge_forward);
+ if (err) {
+ pr_info("%s: can't create attribute file %s/%s\n",
+ __func__, dev->name, bridge_forward.attr.name);
+ goto out2;
+ }
+
+ br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj);
+ if (!br->ifobj) {
+ pr_info("%s: can't add kobject (directory) %s/%s\n",
+ __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR);
+ goto out3;
+ }
+ return 0;
+ out3:
+ sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward);
+ out2:
+ sysfs_remove_group(&dev->dev.kobj, &bridge_group);
+ out1:
+ return err;
+
+}
+
+void br_sysfs_delbr(struct net_device *dev)
+{
+ struct kobject *kobj = &dev->dev.kobj;
+ struct net_bridge *br = netdev_priv(dev);
+
+ kobject_put(br->ifobj);
+ sysfs_remove_bin_file(kobj, &bridge_forward);
+ sysfs_remove_group(kobj, &bridge_group);
+}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
new file mode 100644
index 000000000..4905845a9
--- /dev/null
+++ b/net/bridge/br_sysfs_if.c
@@ -0,0 +1,316 @@
+/*
+ * Sysfs attributes of bridge ports
+ * Linux ethernet bridge
+ *
+ * Authors:
+ * Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+
+#include "br_private.h"
+
+struct brport_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct net_bridge_port *, char *);
+ int (*store)(struct net_bridge_port *, unsigned long);
+};
+
+#define BRPORT_ATTR(_name, _mode, _show, _store) \
+const struct brport_attribute brport_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+};
+
+#define BRPORT_ATTR_FLAG(_name, _mask) \
+static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", !!(p->flags & _mask)); \
+} \
+static int store_##_name(struct net_bridge_port *p, unsigned long v) \
+{ \
+ return store_flag(p, v, _mask); \
+} \
+static BRPORT_ATTR(_name, S_IRUGO | S_IWUSR, \
+ show_##_name, store_##_name)
+
+static int store_flag(struct net_bridge_port *p, unsigned long v,
+ unsigned long mask)
+{
+ unsigned long flags;
+
+ flags = p->flags;
+
+ if (v)
+ flags |= mask;
+ else
+ flags &= ~mask;
+
+ if (flags != p->flags) {
+ p->flags = flags;
+ br_port_flags_change(p, mask);
+ br_ifinfo_notify(RTM_NEWLINK, p);
+ }
+ return 0;
+}
+
+static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->path_cost);
+}
+
+static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR,
+ show_path_cost, br_stp_set_path_cost);
+
+static ssize_t show_priority(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->priority);
+}
+
+static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR,
+ show_priority, br_stp_set_port_priority);
+
+static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
+{
+ return br_show_bridge_id(buf, &p->designated_root);
+}
+static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL);
+
+static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf)
+{
+ return br_show_bridge_id(buf, &p->designated_bridge);
+}
+static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL);
+
+static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->designated_port);
+}
+static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL);
+
+static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->designated_cost);
+}
+static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL);
+
+static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "0x%x\n", p->port_id);
+}
+static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL);
+
+static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "0x%x\n", p->port_no);
+}
+
+static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL);
+
+static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->topology_change_ack);
+}
+static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL);
+
+static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->config_pending);
+}
+static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL);
+
+static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->state);
+}
+static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL);
+
+static ssize_t show_message_age_timer(struct net_bridge_port *p,
+ char *buf)
+{
+ return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
+}
+static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL);
+
+static ssize_t show_forward_delay_timer(struct net_bridge_port *p,
+ char *buf)
+{
+ return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
+}
+static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL);
+
+static ssize_t show_hold_timer(struct net_bridge_port *p,
+ char *buf)
+{
+ return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
+}
+static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
+
+static int store_flush(struct net_bridge_port *p, unsigned long v)
+{
+ br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry
+ return 0;
+}
+static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
+
+BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
+BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
+BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
+BRPORT_ATTR_FLAG(learning, BR_LEARNING);
+BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
+BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
+BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "%d\n", p->multicast_router);
+}
+
+static int store_multicast_router(struct net_bridge_port *p,
+ unsigned long v)
+{
+ return br_multicast_set_port_router(p, v);
+}
+static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
+ store_multicast_router);
+
+BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+#endif
+
+static const struct brport_attribute *brport_attrs[] = {
+ &brport_attr_path_cost,
+ &brport_attr_priority,
+ &brport_attr_port_id,
+ &brport_attr_port_no,
+ &brport_attr_designated_root,
+ &brport_attr_designated_bridge,
+ &brport_attr_designated_port,
+ &brport_attr_designated_cost,
+ &brport_attr_state,
+ &brport_attr_change_ack,
+ &brport_attr_config_pending,
+ &brport_attr_message_age_timer,
+ &brport_attr_forward_delay_timer,
+ &brport_attr_hold_timer,
+ &brport_attr_flush,
+ &brport_attr_hairpin_mode,
+ &brport_attr_bpdu_guard,
+ &brport_attr_root_block,
+ &brport_attr_learning,
+ &brport_attr_unicast_flood,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ &brport_attr_multicast_router,
+ &brport_attr_multicast_fast_leave,
+#endif
+ &brport_attr_proxyarp,
+ &brport_attr_proxyarp_wifi,
+ NULL
+};
+
+#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
+#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
+
+static ssize_t brport_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct brport_attribute *brport_attr = to_brport_attr(attr);
+ struct net_bridge_port *p = to_brport(kobj);
+
+ return brport_attr->show(p, buf);
+}
+
+static ssize_t brport_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct brport_attribute *brport_attr = to_brport_attr(attr);
+ struct net_bridge_port *p = to_brport(kobj);
+ ssize_t ret = -EINVAL;
+ char *endp;
+ unsigned long val;
+
+ if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp != buf) {
+ if (!rtnl_trylock())
+ return restart_syscall();
+ if (p->dev && p->br && brport_attr->store) {
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store(p, val);
+ spin_unlock_bh(&p->br->lock);
+ if (ret == 0)
+ ret = count;
+ }
+ rtnl_unlock();
+ }
+ return ret;
+}
+
+const struct sysfs_ops brport_sysfs_ops = {
+ .show = brport_show,
+ .store = brport_store,
+};
+
+/*
+ * Add sysfs entries to ethernet device added to a bridge.
+ * Creates a brport subdirectory with bridge attributes.
+ * Puts symlink in bridge's brif subdirectory
+ */
+int br_sysfs_addif(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+ const struct brport_attribute **a;
+ int err;
+
+ err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj,
+ SYSFS_BRIDGE_PORT_LINK);
+ if (err)
+ return err;
+
+ for (a = brport_attrs; *a; ++a) {
+ err = sysfs_create_file(&p->kobj, &((*a)->attr));
+ if (err)
+ return err;
+ }
+
+ strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+ return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name);
+}
+
+/* Rename bridge's brif symlink */
+int br_sysfs_renameif(struct net_bridge_port *p)
+{
+ struct net_bridge *br = p->br;
+ int err;
+
+ /* If a rename fails, the rollback will cause another
+ * rename call with the existing name.
+ */
+ if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ))
+ return 0;
+
+ err = sysfs_rename_link(br->ifobj, &p->kobj,
+ p->sysfs_name, p->dev->name);
+ if (err)
+ netdev_notice(br->dev, "unable to rename link %s to %s",
+ p->sysfs_name, p->dev->name);
+ else
+ strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+
+ return err;
+}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
new file mode 100644
index 000000000..13013fe8d
--- /dev/null
+++ b/net/bridge/br_vlan.c
@@ -0,0 +1,736 @@
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include "br_private.h"
+
+static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid)
+{
+ if (v->pvid == vid)
+ return;
+
+ smp_wmb();
+ v->pvid = vid;
+}
+
+static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid)
+{
+ if (v->pvid != vid)
+ return;
+
+ smp_wmb();
+ v->pvid = 0;
+}
+
+static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags)
+{
+ if (flags & BRIDGE_VLAN_INFO_PVID)
+ __vlan_add_pvid(v, vid);
+ else
+ __vlan_delete_pvid(v, vid);
+
+ if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ set_bit(vid, v->untagged_bitmap);
+ else
+ clear_bit(vid, v->untagged_bitmap);
+}
+
+static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags)
+{
+ struct net_bridge_port *p = NULL;
+ struct net_bridge *br;
+ struct net_device *dev;
+ int err;
+
+ if (test_bit(vid, v->vlan_bitmap)) {
+ __vlan_add_flags(v, vid, flags);
+ return 0;
+ }
+
+ if (v->port_idx) {
+ p = v->parent.port;
+ br = p->br;
+ dev = p->dev;
+ } else {
+ br = v->parent.br;
+ dev = br->dev;
+ }
+
+ if (p) {
+ /* Add VLAN to the device filter if it is supported.
+ * This ensures tagged traffic enters the bridge when
+ * promiscuous mode is disabled by br_manage_promisc().
+ */
+ err = vlan_vid_add(dev, br->vlan_proto, vid);
+ if (err)
+ return err;
+ }
+
+ err = br_fdb_insert(br, p, dev->dev_addr, vid);
+ if (err) {
+ br_err(br, "failed insert local address into bridge "
+ "forwarding table\n");
+ goto out_filt;
+ }
+
+ set_bit(vid, v->vlan_bitmap);
+ v->num_vlans++;
+ __vlan_add_flags(v, vid, flags);
+
+ return 0;
+
+out_filt:
+ if (p)
+ vlan_vid_del(dev, br->vlan_proto, vid);
+ return err;
+}
+
+static int __vlan_del(struct net_port_vlans *v, u16 vid)
+{
+ if (!test_bit(vid, v->vlan_bitmap))
+ return -EINVAL;
+
+ __vlan_delete_pvid(v, vid);
+ clear_bit(vid, v->untagged_bitmap);
+
+ if (v->port_idx) {
+ struct net_bridge_port *p = v->parent.port;
+ vlan_vid_del(p->dev, p->br->vlan_proto, vid);
+ }
+
+ clear_bit(vid, v->vlan_bitmap);
+ v->num_vlans--;
+ if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) {
+ if (v->port_idx)
+ RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
+ else
+ RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
+ kfree_rcu(v, rcu);
+ }
+ return 0;
+}
+
+static void __vlan_flush(struct net_port_vlans *v)
+{
+ smp_wmb();
+ v->pvid = 0;
+ bitmap_zero(v->vlan_bitmap, VLAN_N_VID);
+ if (v->port_idx)
+ RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
+ else
+ RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
+ kfree_rcu(v, rcu);
+}
+
+struct sk_buff *br_handle_vlan(struct net_bridge *br,
+ const struct net_port_vlans *pv,
+ struct sk_buff *skb)
+{
+ u16 vid;
+
+ /* If this packet was not filtered at input, let it pass */
+ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
+ goto out;
+
+ /* Vlan filter table must be configured at this point. The
+ * only exception is the bridge is set in promisc mode and the
+ * packet is destined for the bridge device. In this case
+ * pass the packet as is.
+ */
+ if (!pv) {
+ if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) {
+ goto out;
+ } else {
+ kfree_skb(skb);
+ return NULL;
+ }
+ }
+
+ /* At this point, we know that the frame was filtered and contains
+ * a valid vlan id. If the vlan id is set in the untagged bitmap,
+ * send untagged; otherwise, send tagged.
+ */
+ br_vlan_get_tag(skb, &vid);
+ if (test_bit(vid, pv->untagged_bitmap))
+ skb->vlan_tci = 0;
+
+out:
+ return skb;
+}
+
+/* Called under RCU */
+bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
+ struct sk_buff *skb, u16 *vid)
+{
+ bool tagged;
+ __be16 proto;
+
+ /* If VLAN filtering is disabled on the bridge, all packets are
+ * permitted.
+ */
+ if (!br->vlan_enabled) {
+ BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
+ return true;
+ }
+
+ /* If there are no vlan in the permitted list, all packets are
+ * rejected.
+ */
+ if (!v)
+ goto drop;
+
+ BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
+ proto = br->vlan_proto;
+
+ /* If vlan tx offload is disabled on bridge device and frame was
+ * sent from vlan device on the bridge device, it does not have
+ * HW accelerated vlan tag.
+ */
+ if (unlikely(!skb_vlan_tag_present(skb) &&
+ skb->protocol == proto)) {
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ return false;
+ }
+
+ if (!br_vlan_get_tag(skb, vid)) {
+ /* Tagged frame */
+ if (skb->vlan_proto != proto) {
+ /* Protocol-mismatch, empty out vlan_tci for new tag */
+ skb_push(skb, ETH_HLEN);
+ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (unlikely(!skb))
+ return false;
+
+ skb_pull(skb, ETH_HLEN);
+ skb_reset_mac_len(skb);
+ *vid = 0;
+ tagged = false;
+ } else {
+ tagged = true;
+ }
+ } else {
+ /* Untagged frame */
+ tagged = false;
+ }
+
+ if (!*vid) {
+ u16 pvid = br_get_pvid(v);
+
+ /* Frame had a tag with VID 0 or did not have a tag.
+ * See if pvid is set on this port. That tells us which
+ * vlan untagged or priority-tagged traffic belongs to.
+ */
+ if (!pvid)
+ goto drop;
+
+ /* PVID is set on this port. Any untagged or priority-tagged
+ * ingress frame is considered to belong to this vlan.
+ */
+ *vid = pvid;
+ if (likely(!tagged))
+ /* Untagged Frame. */
+ __vlan_hwaccel_put_tag(skb, proto, pvid);
+ else
+ /* Priority-tagged Frame.
+ * At this point, We know that skb->vlan_tci had
+ * VLAN_TAG_PRESENT bit and its VID field was 0x000.
+ * We update only VID field and preserve PCP field.
+ */
+ skb->vlan_tci |= pvid;
+
+ return true;
+ }
+
+ /* Frame had a valid vlan tag. See if vlan is allowed */
+ if (test_bit(*vid, v->vlan_bitmap))
+ return true;
+drop:
+ kfree_skb(skb);
+ return false;
+}
+
+/* Called under RCU. */
+bool br_allowed_egress(struct net_bridge *br,
+ const struct net_port_vlans *v,
+ const struct sk_buff *skb)
+{
+ u16 vid;
+
+ /* If this packet was not filtered at input, let it pass */
+ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
+ return true;
+
+ if (!v)
+ return false;
+
+ br_vlan_get_tag(skb, &vid);
+ if (test_bit(vid, v->vlan_bitmap))
+ return true;
+
+ return false;
+}
+
+/* Called under RCU */
+bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
+{
+ struct net_bridge *br = p->br;
+ struct net_port_vlans *v;
+
+ /* If filtering was disabled at input, let it pass. */
+ if (!br->vlan_enabled)
+ return true;
+
+ v = rcu_dereference(p->vlan_info);
+ if (!v)
+ return false;
+
+ if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto)
+ *vid = 0;
+
+ if (!*vid) {
+ *vid = br_get_pvid(v);
+ if (!*vid)
+ return false;
+
+ return true;
+ }
+
+ if (test_bit(*vid, v->vlan_bitmap))
+ return true;
+
+ return false;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
+{
+ struct net_port_vlans *pv = NULL;
+ int err;
+
+ ASSERT_RTNL();
+
+ pv = rtnl_dereference(br->vlan_info);
+ if (pv)
+ return __vlan_add(pv, vid, flags);
+
+ /* Create port vlan infomration
+ */
+ pv = kzalloc(sizeof(*pv), GFP_KERNEL);
+ if (!pv)
+ return -ENOMEM;
+
+ pv->parent.br = br;
+ err = __vlan_add(pv, vid, flags);
+ if (err)
+ goto out;
+
+ rcu_assign_pointer(br->vlan_info, pv);
+ return 0;
+out:
+ kfree(pv);
+ return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int br_vlan_delete(struct net_bridge *br, u16 vid)
+{
+ struct net_port_vlans *pv;
+
+ ASSERT_RTNL();
+
+ pv = rtnl_dereference(br->vlan_info);
+ if (!pv)
+ return -EINVAL;
+
+ br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
+
+ __vlan_del(pv, vid);
+ return 0;
+}
+
+void br_vlan_flush(struct net_bridge *br)
+{
+ struct net_port_vlans *pv;
+
+ ASSERT_RTNL();
+ pv = rtnl_dereference(br->vlan_info);
+ if (!pv)
+ return;
+
+ __vlan_flush(pv);
+}
+
+bool br_vlan_find(struct net_bridge *br, u16 vid)
+{
+ struct net_port_vlans *pv;
+ bool found = false;
+
+ rcu_read_lock();
+ pv = rcu_dereference(br->vlan_info);
+
+ if (!pv)
+ goto out;
+
+ if (test_bit(vid, pv->vlan_bitmap))
+ found = true;
+
+out:
+ rcu_read_unlock();
+ return found;
+}
+
+/* Must be protected by RTNL. */
+static void recalculate_group_addr(struct net_bridge *br)
+{
+ if (br->group_addr_set)
+ return;
+
+ spin_lock_bh(&br->lock);
+ if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) {
+ /* Bridge Group Address */
+ br->group_addr[5] = 0x00;
+ } else { /* vlan_enabled && ETH_P_8021AD */
+ /* Provider Bridge Group Address */
+ br->group_addr[5] = 0x08;
+ }
+ spin_unlock_bh(&br->lock);
+}
+
+/* Must be protected by RTNL. */
+void br_recalculate_fwd_mask(struct net_bridge *br)
+{
+ if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q))
+ br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
+ else /* vlan_enabled && ETH_P_8021AD */
+ br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
+ ~(1u << br->group_addr[5]);
+}
+
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+{
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (br->vlan_enabled == val)
+ goto unlock;
+
+ br->vlan_enabled = val;
+ br_manage_promisc(br);
+ recalculate_group_addr(br);
+ br_recalculate_fwd_mask(br);
+
+unlock:
+ rtnl_unlock();
+ return 0;
+}
+
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
+{
+ int err = 0;
+ struct net_bridge_port *p;
+ struct net_port_vlans *pv;
+ __be16 proto, oldproto;
+ u16 vid, errvid;
+
+ if (val != ETH_P_8021Q && val != ETH_P_8021AD)
+ return -EPROTONOSUPPORT;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ proto = htons(val);
+ if (br->vlan_proto == proto)
+ goto unlock;
+
+ /* Add VLANs for the new proto to the device filter. */
+ list_for_each_entry(p, &br->port_list, list) {
+ pv = rtnl_dereference(p->vlan_info);
+ if (!pv)
+ continue;
+
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
+ err = vlan_vid_add(p->dev, proto, vid);
+ if (err)
+ goto err_filt;
+ }
+ }
+
+ oldproto = br->vlan_proto;
+ br->vlan_proto = proto;
+
+ recalculate_group_addr(br);
+ br_recalculate_fwd_mask(br);
+
+ /* Delete VLANs for the old proto from the device filter. */
+ list_for_each_entry(p, &br->port_list, list) {
+ pv = rtnl_dereference(p->vlan_info);
+ if (!pv)
+ continue;
+
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
+ vlan_vid_del(p->dev, oldproto, vid);
+ }
+
+unlock:
+ rtnl_unlock();
+ return err;
+
+err_filt:
+ errvid = vid;
+ for_each_set_bit(vid, pv->vlan_bitmap, errvid)
+ vlan_vid_del(p->dev, proto, vid);
+
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ pv = rtnl_dereference(p->vlan_info);
+ if (!pv)
+ continue;
+
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
+ vlan_vid_del(p->dev, proto, vid);
+ }
+
+ goto unlock;
+}
+
+static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid)
+{
+ return pv && vid == pv->pvid && test_bit(vid, pv->untagged_bitmap);
+}
+
+static void br_vlan_disable_default_pvid(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+ u16 pvid = br->default_pvid;
+
+ /* Disable default_pvid on all ports where it is still
+ * configured.
+ */
+ if (vlan_default_pvid(br_get_vlan_info(br), pvid))
+ br_vlan_delete(br, pvid);
+
+ list_for_each_entry(p, &br->port_list, list) {
+ if (vlan_default_pvid(nbp_get_vlan_info(p), pvid))
+ nbp_vlan_delete(p, pvid);
+ }
+
+ br->default_pvid = 0;
+}
+
+static int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
+{
+ struct net_bridge_port *p;
+ u16 old_pvid;
+ int err = 0;
+ unsigned long *changed;
+
+ changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!changed)
+ return -ENOMEM;
+
+ old_pvid = br->default_pvid;
+
+ /* Update default_pvid config only if we do not conflict with
+ * user configuration.
+ */
+ if ((!old_pvid || vlan_default_pvid(br_get_vlan_info(br), old_pvid)) &&
+ !br_vlan_find(br, pvid)) {
+ err = br_vlan_add(br, pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED);
+ if (err)
+ goto out;
+ br_vlan_delete(br, old_pvid);
+ set_bit(0, changed);
+ }
+
+ list_for_each_entry(p, &br->port_list, list) {
+ /* Update default_pvid config only if we do not conflict with
+ * user configuration.
+ */
+ if ((old_pvid &&
+ !vlan_default_pvid(nbp_get_vlan_info(p), old_pvid)) ||
+ nbp_vlan_find(p, pvid))
+ continue;
+
+ err = nbp_vlan_add(p, pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED);
+ if (err)
+ goto err_port;
+ nbp_vlan_delete(p, old_pvid);
+ set_bit(p->port_no, changed);
+ }
+
+ br->default_pvid = pvid;
+
+out:
+ kfree(changed);
+ return err;
+
+err_port:
+ list_for_each_entry_continue_reverse(p, &br->port_list, list) {
+ if (!test_bit(p->port_no, changed))
+ continue;
+
+ if (old_pvid)
+ nbp_vlan_add(p, old_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED);
+ nbp_vlan_delete(p, pvid);
+ }
+
+ if (test_bit(0, changed)) {
+ if (old_pvid)
+ br_vlan_add(br, old_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED);
+ br_vlan_delete(br, pvid);
+ }
+ goto out;
+}
+
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
+{
+ u16 pvid = val;
+ int err = 0;
+
+ if (val >= VLAN_VID_MASK)
+ return -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (pvid == br->default_pvid)
+ goto unlock;
+
+ /* Only allow default pvid change when filtering is disabled */
+ if (br->vlan_enabled) {
+ pr_info_once("Please disable vlan filtering to change default_pvid\n");
+ err = -EPERM;
+ goto unlock;
+ }
+
+ if (!pvid)
+ br_vlan_disable_default_pvid(br);
+ else
+ err = __br_vlan_set_default_pvid(br, pvid);
+
+unlock:
+ rtnl_unlock();
+ return err;
+}
+
+int br_vlan_init(struct net_bridge *br)
+{
+ br->vlan_proto = htons(ETH_P_8021Q);
+ br->default_pvid = 1;
+ return br_vlan_add(br, 1,
+ BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED);
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
+{
+ struct net_port_vlans *pv = NULL;
+ int err;
+
+ ASSERT_RTNL();
+
+ pv = rtnl_dereference(port->vlan_info);
+ if (pv)
+ return __vlan_add(pv, vid, flags);
+
+ /* Create port vlan infomration
+ */
+ pv = kzalloc(sizeof(*pv), GFP_KERNEL);
+ if (!pv) {
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ pv->port_idx = port->port_no;
+ pv->parent.port = port;
+ err = __vlan_add(pv, vid, flags);
+ if (err)
+ goto clean_up;
+
+ rcu_assign_pointer(port->vlan_info, pv);
+ return 0;
+
+clean_up:
+ kfree(pv);
+ return err;
+}
+
+/* Must be protected by RTNL.
+ * Must be called with vid in range from 1 to 4094 inclusive.
+ */
+int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
+{
+ struct net_port_vlans *pv;
+
+ ASSERT_RTNL();
+
+ pv = rtnl_dereference(port->vlan_info);
+ if (!pv)
+ return -EINVAL;
+
+ br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid);
+
+ return __vlan_del(pv, vid);
+}
+
+void nbp_vlan_flush(struct net_bridge_port *port)
+{
+ struct net_port_vlans *pv;
+ u16 vid;
+
+ ASSERT_RTNL();
+
+ pv = rtnl_dereference(port->vlan_info);
+ if (!pv)
+ return;
+
+ for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
+ vlan_vid_del(port->dev, port->br->vlan_proto, vid);
+
+ __vlan_flush(pv);
+}
+
+bool nbp_vlan_find(struct net_bridge_port *port, u16 vid)
+{
+ struct net_port_vlans *pv;
+ bool found = false;
+
+ rcu_read_lock();
+ pv = rcu_dereference(port->vlan_info);
+
+ if (!pv)
+ goto out;
+
+ if (test_bit(vid, pv->vlan_bitmap))
+ found = true;
+
+out:
+ rcu_read_unlock();
+ return found;
+}
+
+int nbp_vlan_init(struct net_bridge_port *p)
+{
+ return p->br->default_pvid ?
+ nbp_vlan_add(p, p->br->default_pvid,
+ BRIDGE_VLAN_INFO_PVID |
+ BRIDGE_VLAN_INFO_UNTAGGED) :
+ 0;
+}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
new file mode 100644
index 000000000..9cebf47ac
--- /dev/null
+++ b/net/bridge/netfilter/Kconfig
@@ -0,0 +1,227 @@
+#
+# Bridge netfilter configuration
+#
+#
+menuconfig NF_TABLES_BRIDGE
+ depends on BRIDGE && NETFILTER && NF_TABLES
+ tristate "Ethernet Bridge nf_tables support"
+
+if NF_TABLES_BRIDGE
+
+config NFT_BRIDGE_META
+ tristate "Netfilter nf_table bridge meta support"
+ depends on NFT_META
+ help
+ Add support for bridge dedicated meta key.
+
+config NFT_BRIDGE_REJECT
+ tristate "Netfilter nf_tables bridge reject support"
+ depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
+ help
+ Add support to reject packets.
+
+config NF_LOG_BRIDGE
+ tristate "Bridge packet logging"
+
+endif # NF_TABLES_BRIDGE
+
+menuconfig BRIDGE_NF_EBTABLES
+ tristate "Ethernet Bridge tables (ebtables) support"
+ depends on BRIDGE && NETFILTER && NETFILTER_XTABLES
+ help
+ ebtables is a general, extensible frame/packet identification
+ framework. Say 'Y' or 'M' here if you want to do Ethernet
+ filtering/NAT/brouting on the Ethernet bridge.
+
+if BRIDGE_NF_EBTABLES
+
+#
+# tables
+#
+config BRIDGE_EBT_BROUTE
+ tristate "ebt: broute table support"
+ help
+ The ebtables broute table is used to define rules that decide between
+ bridging and routing frames, giving Linux the functionality of a
+ brouter. See the man page for ebtables(8) and examples on the ebtables
+ website.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_T_FILTER
+ tristate "ebt: filter table support"
+ help
+ The ebtables filter table is used to define frame filtering rules at
+ local input, forwarding and local output. See the man page for
+ ebtables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_T_NAT
+ tristate "ebt: nat table support"
+ help
+ The ebtables nat table is used to define rules that alter the MAC
+ source address (MAC SNAT) or the MAC destination address (MAC DNAT).
+ See the man page for ebtables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+#
+# matches
+#
+config BRIDGE_EBT_802_3
+ tristate "ebt: 802.3 filter support"
+ help
+ This option adds matching support for 802.3 Ethernet frames.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_AMONG
+ tristate "ebt: among filter support"
+ help
+ This option adds the among match, which allows matching the MAC source
+ and/or destination address on a list of addresses. Optionally,
+ MAC/IP address pairs can be matched, f.e. for anti-spoofing rules.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_ARP
+ tristate "ebt: ARP filter support"
+ help
+ This option adds the ARP match, which allows ARP and RARP header field
+ filtering.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_IP
+ tristate "ebt: IP filter support"
+ help
+ This option adds the IP match, which allows basic IP header field
+ filtering.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_IP6
+ tristate "ebt: IP6 filter support"
+ depends on BRIDGE_NF_EBTABLES && IPV6
+ help
+ This option adds the IP6 match, which allows basic IPV6 header field
+ filtering.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_LIMIT
+ tristate "ebt: limit match support"
+ help
+ This option adds the limit match, which allows you to control
+ the rate at which a rule can be matched. This match is the
+ equivalent of the iptables limit match.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config BRIDGE_EBT_MARK
+ tristate "ebt: mark filter support"
+ help
+ This option adds the mark match, which allows matching frames based on
+ the 'nfmark' value in the frame. This can be set by the mark target.
+ This value is the same as the one used in the iptables mark match and
+ target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_PKTTYPE
+ tristate "ebt: packet type filter support"
+ help
+ This option adds the packet type match, which allows matching on the
+ type of packet based on its Ethernet "class" (as determined by
+ the generic networking code): broadcast, multicast,
+ for this host alone or for another host.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_STP
+ tristate "ebt: STP filter support"
+ help
+ This option adds the Spanning Tree Protocol match, which
+ allows STP header field filtering.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_VLAN
+ tristate "ebt: 802.1Q VLAN filter support"
+ help
+ This option adds the 802.1Q vlan match, which allows the filtering of
+ 802.1Q vlan fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+#
+# targets
+#
+config BRIDGE_EBT_ARPREPLY
+ tristate "ebt: arp reply target support"
+ depends on BRIDGE_NF_EBTABLES && INET
+ help
+ This option adds the arp reply target, which allows
+ automatically sending arp replies to arp requests.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_DNAT
+ tristate "ebt: dnat target support"
+ help
+ This option adds the MAC DNAT target, which allows altering the MAC
+ destination address of frames.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_MARK_T
+ tristate "ebt: mark target support"
+ help
+ This option adds the mark target, which allows marking frames by
+ setting the 'nfmark' value in the frame.
+ This value is the same as the one used in the iptables mark match and
+ target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_REDIRECT
+ tristate "ebt: redirect target support"
+ help
+ This option adds the MAC redirect target, which allows altering the MAC
+ destination address of a frame to that of the device it arrived on.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_SNAT
+ tristate "ebt: snat target support"
+ help
+ This option adds the MAC SNAT target, which allows altering the MAC
+ source address of frames.
+
+ To compile it as a module, choose M here. If unsure, say N.
+#
+# watchers
+#
+config BRIDGE_EBT_LOG
+ tristate "ebt: log support"
+ help
+ This option adds the log watcher, that you can use in any rule
+ in any ebtables table. It records info about the frame header
+ to the syslog.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config BRIDGE_EBT_NFLOG
+ tristate "ebt: nflog support"
+ help
+ This option enables the nflog watcher, which allows to LOG
+ messages through the netfilter logging API, which can use
+ either the old LOG target, the old ULOG target or nfnetlink_log
+ as backend.
+
+ This option adds the nflog watcher, that you can use in any rule
+ in any ebtables table.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
new file mode 100644
index 000000000..be4d0cea7
--- /dev/null
+++ b/net/bridge/netfilter/Makefile
@@ -0,0 +1,40 @@
+#
+# Makefile for the netfilter modules for Link Layer filtering on a bridge.
+#
+
+obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o
+obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
+obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
+
+# packet logging
+obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
+
+obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
+
+# tables
+obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o
+obj-$(CONFIG_BRIDGE_EBT_T_FILTER) += ebtable_filter.o
+obj-$(CONFIG_BRIDGE_EBT_T_NAT) += ebtable_nat.o
+
+#matches
+obj-$(CONFIG_BRIDGE_EBT_802_3) += ebt_802_3.o
+obj-$(CONFIG_BRIDGE_EBT_AMONG) += ebt_among.o
+obj-$(CONFIG_BRIDGE_EBT_ARP) += ebt_arp.o
+obj-$(CONFIG_BRIDGE_EBT_IP) += ebt_ip.o
+obj-$(CONFIG_BRIDGE_EBT_IP6) += ebt_ip6.o
+obj-$(CONFIG_BRIDGE_EBT_LIMIT) += ebt_limit.o
+obj-$(CONFIG_BRIDGE_EBT_MARK) += ebt_mark_m.o
+obj-$(CONFIG_BRIDGE_EBT_PKTTYPE) += ebt_pkttype.o
+obj-$(CONFIG_BRIDGE_EBT_STP) += ebt_stp.o
+obj-$(CONFIG_BRIDGE_EBT_VLAN) += ebt_vlan.o
+
+# targets
+obj-$(CONFIG_BRIDGE_EBT_ARPREPLY) += ebt_arpreply.o
+obj-$(CONFIG_BRIDGE_EBT_MARK_T) += ebt_mark.o
+obj-$(CONFIG_BRIDGE_EBT_DNAT) += ebt_dnat.o
+obj-$(CONFIG_BRIDGE_EBT_REDIRECT) += ebt_redirect.o
+obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o
+
+# watchers
+obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o
+obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
new file mode 100644
index 000000000..2a449b7ab
--- /dev/null
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -0,0 +1,72 @@
+/*
+ * 802_3
+ *
+ * Author:
+ * Chris Vitale csv@bluetail.com
+ *
+ * May 2003
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_802_3.h>
+
+static bool
+ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_802_3_info *info = par->matchinfo;
+ const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
+ __be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : hdr->llc.ni.type;
+
+ if (info->bitmask & EBT_802_3_SAP) {
+ if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP))
+ return false;
+ if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))
+ return false;
+ }
+
+ if (info->bitmask & EBT_802_3_TYPE) {
+ if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE))
+ return false;
+ if (FWINV(info->type != type, EBT_802_3_TYPE))
+ return false;
+ }
+
+ return true;
+}
+
+static int ebt_802_3_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_802_3_info *info = par->matchinfo;
+
+ if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match ebt_802_3_mt_reg __read_mostly = {
+ .name = "802_3",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_802_3_mt,
+ .checkentry = ebt_802_3_mt_check,
+ .matchsize = sizeof(struct ebt_802_3_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_802_3_init(void)
+{
+ return xt_register_match(&ebt_802_3_mt_reg);
+}
+
+static void __exit ebt_802_3_fini(void)
+{
+ xt_unregister_match(&ebt_802_3_mt_reg);
+}
+
+module_init(ebt_802_3_init);
+module_exit(ebt_802_3_fini);
+MODULE_DESCRIPTION("Ebtables: DSAP/SSAP field and SNAP type matching");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
new file mode 100644
index 000000000..9024283d2
--- /dev/null
+++ b/net/bridge/netfilter/ebt_among.c
@@ -0,0 +1,229 @@
+/*
+ * ebt_among
+ *
+ * Authors:
+ * Grzegorz Borowiak <grzes@gnu.univ.gda.pl>
+ *
+ * August, 2003
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/ip.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_among.h>
+
+static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
+ const char *mac, __be32 ip)
+{
+ /* You may be puzzled as to how this code works.
+ * Some tricks were used, refer to
+ * include/linux/netfilter_bridge/ebt_among.h
+ * as there you can find a solution of this mystery.
+ */
+ const struct ebt_mac_wormhash_tuple *p;
+ int start, limit, i;
+ uint32_t cmp[2] = { 0, 0 };
+ int key = ((const unsigned char *)mac)[5];
+
+ ether_addr_copy(((char *) cmp) + 2, mac);
+ start = wh->table[key];
+ limit = wh->table[key + 1];
+ if (ip) {
+ for (i = start; i < limit; i++) {
+ p = &wh->pool[i];
+ if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+ if (p->ip == 0 || p->ip == ip)
+ return true;
+ }
+ } else {
+ for (i = start; i < limit; i++) {
+ p = &wh->pool[i];
+ if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0])
+ if (p->ip == 0)
+ return true;
+ }
+ }
+ return false;
+}
+
+static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash
+ *wh)
+{
+ int i;
+
+ for (i = 0; i < 256; i++) {
+ if (wh->table[i] > wh->table[i + 1])
+ return -0x100 - i;
+ if (wh->table[i] < 0)
+ return -0x200 - i;
+ if (wh->table[i] > wh->poolsize)
+ return -0x300 - i;
+ }
+ if (wh->table[256] > wh->poolsize)
+ return -0xc00;
+ return 0;
+}
+
+static int get_ip_dst(const struct sk_buff *skb, __be32 *addr)
+{
+ if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
+ const struct iphdr *ih;
+ struct iphdr _iph;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ return -1;
+ *addr = ih->daddr;
+ } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const __be32 *bp;
+ __be32 buf;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL ||
+ ah->ar_pln != sizeof(__be32) ||
+ ah->ar_hln != ETH_ALEN)
+ return -1;
+ bp = skb_header_pointer(skb, sizeof(struct arphdr) +
+ 2 * ETH_ALEN + sizeof(__be32),
+ sizeof(__be32), &buf);
+ if (bp == NULL)
+ return -1;
+ *addr = *bp;
+ }
+ return 0;
+}
+
+static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
+{
+ if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
+ const struct iphdr *ih;
+ struct iphdr _iph;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ return -1;
+ *addr = ih->saddr;
+ } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const __be32 *bp;
+ __be32 buf;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL ||
+ ah->ar_pln != sizeof(__be32) ||
+ ah->ar_hln != ETH_ALEN)
+ return -1;
+ bp = skb_header_pointer(skb, sizeof(struct arphdr) +
+ ETH_ALEN, sizeof(__be32), &buf);
+ if (bp == NULL)
+ return -1;
+ *addr = *bp;
+ }
+ return 0;
+}
+
+static bool
+ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_among_info *info = par->matchinfo;
+ const char *dmac, *smac;
+ const struct ebt_mac_wormhash *wh_dst, *wh_src;
+ __be32 dip = 0, sip = 0;
+
+ wh_dst = ebt_among_wh_dst(info);
+ wh_src = ebt_among_wh_src(info);
+
+ if (wh_src) {
+ smac = eth_hdr(skb)->h_source;
+ if (get_ip_src(skb, &sip))
+ return false;
+ if (!(info->bitmask & EBT_AMONG_SRC_NEG)) {
+ /* we match only if it contains */
+ if (!ebt_mac_wormhash_contains(wh_src, smac, sip))
+ return false;
+ } else {
+ /* we match only if it DOES NOT contain */
+ if (ebt_mac_wormhash_contains(wh_src, smac, sip))
+ return false;
+ }
+ }
+
+ if (wh_dst) {
+ dmac = eth_hdr(skb)->h_dest;
+ if (get_ip_dst(skb, &dip))
+ return false;
+ if (!(info->bitmask & EBT_AMONG_DST_NEG)) {
+ /* we match only if it contains */
+ if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip))
+ return false;
+ } else {
+ /* we match only if it DOES NOT contain */
+ if (ebt_mac_wormhash_contains(wh_dst, dmac, dip))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int ebt_among_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_among_info *info = par->matchinfo;
+ const struct ebt_entry_match *em =
+ container_of(par->matchinfo, const struct ebt_entry_match, data);
+ int expected_length = sizeof(struct ebt_among_info);
+ const struct ebt_mac_wormhash *wh_dst, *wh_src;
+ int err;
+
+ wh_dst = ebt_among_wh_dst(info);
+ wh_src = ebt_among_wh_src(info);
+ expected_length += ebt_mac_wormhash_size(wh_dst);
+ expected_length += ebt_mac_wormhash_size(wh_src);
+
+ if (em->match_size != EBT_ALIGN(expected_length)) {
+ pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+ em->match_size, expected_length,
+ EBT_ALIGN(expected_length));
+ return -EINVAL;
+ }
+ if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) {
+ pr_info("dst integrity fail: %x\n", -err);
+ return -EINVAL;
+ }
+ if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) {
+ pr_info("src integrity fail: %x\n", -err);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ebt_among_mt_reg __read_mostly = {
+ .name = "among",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_among_mt,
+ .checkentry = ebt_among_mt_check,
+ .matchsize = -1, /* special case */
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_among_init(void)
+{
+ return xt_register_match(&ebt_among_mt_reg);
+}
+
+static void __exit ebt_among_fini(void)
+{
+ xt_unregister_match(&ebt_among_mt_reg);
+}
+
+module_init(ebt_among_init);
+module_exit(ebt_among_fini);
+MODULE_DESCRIPTION("Ebtables: Combined MAC/IP address list matching");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
new file mode 100644
index 000000000..cd457b891
--- /dev/null
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -0,0 +1,140 @@
+/*
+ * ebt_arp
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Tim Gardner <timg@tpi.com>
+ *
+ * April, 2002
+ *
+ */
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arp.h>
+
+static bool
+ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_arp_info *info = par->matchinfo;
+ const struct arphdr *ah;
+ struct arphdr _arph;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL)
+ return false;
+ if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
+ ah->ar_op, EBT_ARP_OPCODE))
+ return false;
+ if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
+ ah->ar_hrd, EBT_ARP_HTYPE))
+ return false;
+ if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype !=
+ ah->ar_pro, EBT_ARP_PTYPE))
+ return false;
+
+ if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) {
+ const __be32 *sap, *dap;
+ __be32 saddr, daddr;
+
+ if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP))
+ return false;
+ sap = skb_header_pointer(skb, sizeof(struct arphdr) +
+ ah->ar_hln, sizeof(saddr),
+ &saddr);
+ if (sap == NULL)
+ return false;
+ dap = skb_header_pointer(skb, sizeof(struct arphdr) +
+ 2*ah->ar_hln+sizeof(saddr),
+ sizeof(daddr), &daddr);
+ if (dap == NULL)
+ return false;
+ if (info->bitmask & EBT_ARP_SRC_IP &&
+ FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP))
+ return false;
+ if (info->bitmask & EBT_ARP_DST_IP &&
+ FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP))
+ return false;
+ if (info->bitmask & EBT_ARP_GRAT &&
+ FWINV(*dap != *sap, EBT_ARP_GRAT))
+ return false;
+ }
+
+ if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
+ const unsigned char *mp;
+ unsigned char _mac[ETH_ALEN];
+ uint8_t verdict, i;
+
+ if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
+ return false;
+ if (info->bitmask & EBT_ARP_SRC_MAC) {
+ mp = skb_header_pointer(skb, sizeof(struct arphdr),
+ sizeof(_mac), &_mac);
+ if (mp == NULL)
+ return false;
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (mp[i] ^ info->smaddr[i]) &
+ info->smmsk[i];
+ if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
+ return false;
+ }
+
+ if (info->bitmask & EBT_ARP_DST_MAC) {
+ mp = skb_header_pointer(skb, sizeof(struct arphdr) +
+ ah->ar_hln + ah->ar_pln,
+ sizeof(_mac), &_mac);
+ if (mp == NULL)
+ return false;
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (mp[i] ^ info->dmaddr[i]) &
+ info->dmmsk[i];
+ if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int ebt_arp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_arp_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
+
+ if ((e->ethproto != htons(ETH_P_ARP) &&
+ e->ethproto != htons(ETH_P_RARP)) ||
+ e->invflags & EBT_IPROTO)
+ return -EINVAL;
+ if (info->bitmask & ~EBT_ARP_MASK || info->invflags & ~EBT_ARP_MASK)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_match ebt_arp_mt_reg __read_mostly = {
+ .name = "arp",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_arp_mt,
+ .checkentry = ebt_arp_mt_check,
+ .matchsize = sizeof(struct ebt_arp_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_arp_init(void)
+{
+ return xt_register_match(&ebt_arp_mt_reg);
+}
+
+static void __exit ebt_arp_fini(void)
+{
+ xt_unregister_match(&ebt_arp_mt_reg);
+}
+
+module_init(ebt_arp_init);
+module_exit(ebt_arp_fini);
+MODULE_DESCRIPTION("Ebtables: ARP protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
new file mode 100644
index 000000000..070cf134a
--- /dev/null
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -0,0 +1,98 @@
+/*
+ * ebt_arpreply
+ *
+ * Authors:
+ * Grzegorz Borowiak <grzes@gnu.univ.gda.pl>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * August, 2003
+ *
+ */
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arpreply.h>
+
+static unsigned int
+ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_arpreply_info *info = par->targinfo;
+ const __be32 *siptr, *diptr;
+ __be32 _sip, _dip;
+ const struct arphdr *ap;
+ struct arphdr _ah;
+ const unsigned char *shp;
+ unsigned char _sha[ETH_ALEN];
+
+ ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah);
+ if (ap == NULL)
+ return EBT_DROP;
+
+ if (ap->ar_op != htons(ARPOP_REQUEST) ||
+ ap->ar_hln != ETH_ALEN ||
+ ap->ar_pro != htons(ETH_P_IP) ||
+ ap->ar_pln != 4)
+ return EBT_CONTINUE;
+
+ shp = skb_header_pointer(skb, sizeof(_ah), ETH_ALEN, &_sha);
+ if (shp == NULL)
+ return EBT_DROP;
+
+ siptr = skb_header_pointer(skb, sizeof(_ah) + ETH_ALEN,
+ sizeof(_sip), &_sip);
+ if (siptr == NULL)
+ return EBT_DROP;
+
+ diptr = skb_header_pointer(skb,
+ sizeof(_ah) + 2 * ETH_ALEN + sizeof(_sip),
+ sizeof(_dip), &_dip);
+ if (diptr == NULL)
+ return EBT_DROP;
+
+ arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
+ *diptr, shp, info->mac, shp);
+
+ return info->target;
+}
+
+static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ebt_arpreply_info *info = par->targinfo;
+ const struct ebt_entry *e = par->entryinfo;
+
+ if (BASE_CHAIN && info->target == EBT_RETURN)
+ return -EINVAL;
+ if (e->ethproto != htons(ETH_P_ARP) ||
+ e->invflags & EBT_IPROTO)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target ebt_arpreply_tg_reg __read_mostly = {
+ .name = "arpreply",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .table = "nat",
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING),
+ .target = ebt_arpreply_tg,
+ .checkentry = ebt_arpreply_tg_check,
+ .targetsize = sizeof(struct ebt_arpreply_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_arpreply_init(void)
+{
+ return xt_register_target(&ebt_arpreply_tg_reg);
+}
+
+static void __exit ebt_arpreply_fini(void)
+{
+ xt_unregister_target(&ebt_arpreply_tg_reg);
+}
+
+module_init(ebt_arpreply_init);
+module_exit(ebt_arpreply_fini);
+MODULE_DESCRIPTION("Ebtables: ARP reply target");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
new file mode 100644
index 000000000..4e0b0c359
--- /dev/null
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -0,0 +1,74 @@
+/*
+ * ebt_dnat
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * June, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
+
+static unsigned int
+ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_nat_info *info = par->targinfo;
+
+ if (!skb_make_writable(skb, 0))
+ return EBT_DROP;
+
+ ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
+ return info->target;
+}
+
+static int ebt_dnat_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ebt_nat_info *info = par->targinfo;
+ unsigned int hook_mask;
+
+ if (BASE_CHAIN && info->target == EBT_RETURN)
+ return -EINVAL;
+
+ hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+ if ((strcmp(par->table, "nat") != 0 ||
+ (hook_mask & ~((1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_OUT)))) &&
+ (strcmp(par->table, "broute") != 0 ||
+ hook_mask & ~(1 << NF_BR_BROUTING)))
+ return -EINVAL;
+ if (INVALID_TARGET)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target ebt_dnat_tg_reg __read_mostly = {
+ .name = "dnat",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING),
+ .target = ebt_dnat_tg,
+ .checkentry = ebt_dnat_tg_check,
+ .targetsize = sizeof(struct ebt_nat_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_dnat_init(void)
+{
+ return xt_register_target(&ebt_dnat_tg_reg);
+}
+
+static void __exit ebt_dnat_fini(void)
+{
+ xt_unregister_target(&ebt_dnat_tg_reg);
+}
+
+module_init(ebt_dnat_init);
+module_exit(ebt_dnat_fini);
+MODULE_DESCRIPTION("Ebtables: Destination MAC address translation");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
new file mode 100644
index 000000000..23bca62d5
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -0,0 +1,130 @@
+/*
+ * ebt_ip
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2002
+ *
+ * Changes:
+ * added ip-sport and ip-dport
+ * Innominate Security Technologies AG <mhopf@innominate.com>
+ * September, 2002
+ */
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip.h>
+
+struct tcpudphdr {
+ __be16 src;
+ __be16 dst;
+};
+
+static bool
+ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_ip_info *info = par->matchinfo;
+ const struct iphdr *ih;
+ struct iphdr _iph;
+ const struct tcpudphdr *pptr;
+ struct tcpudphdr _ports;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ return false;
+ if (info->bitmask & EBT_IP_TOS &&
+ FWINV(info->tos != ih->tos, EBT_IP_TOS))
+ return false;
+ if (info->bitmask & EBT_IP_SOURCE &&
+ FWINV((ih->saddr & info->smsk) !=
+ info->saddr, EBT_IP_SOURCE))
+ return false;
+ if ((info->bitmask & EBT_IP_DEST) &&
+ FWINV((ih->daddr & info->dmsk) !=
+ info->daddr, EBT_IP_DEST))
+ return false;
+ if (info->bitmask & EBT_IP_PROTO) {
+ if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO))
+ return false;
+ if (!(info->bitmask & EBT_IP_DPORT) &&
+ !(info->bitmask & EBT_IP_SPORT))
+ return true;
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ return false;
+ pptr = skb_header_pointer(skb, ih->ihl*4,
+ sizeof(_ports), &_ports);
+ if (pptr == NULL)
+ return false;
+ if (info->bitmask & EBT_IP_DPORT) {
+ u32 dst = ntohs(pptr->dst);
+ if (FWINV(dst < info->dport[0] ||
+ dst > info->dport[1],
+ EBT_IP_DPORT))
+ return false;
+ }
+ if (info->bitmask & EBT_IP_SPORT) {
+ u32 src = ntohs(pptr->src);
+ if (FWINV(src < info->sport[0] ||
+ src > info->sport[1],
+ EBT_IP_SPORT))
+ return false;
+ }
+ }
+ return true;
+}
+
+static int ebt_ip_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_ip_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
+
+ if (e->ethproto != htons(ETH_P_IP) ||
+ e->invflags & EBT_IPROTO)
+ return -EINVAL;
+ if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK)
+ return -EINVAL;
+ if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) {
+ if (info->invflags & EBT_IP_PROTO)
+ return -EINVAL;
+ if (info->protocol != IPPROTO_TCP &&
+ info->protocol != IPPROTO_UDP &&
+ info->protocol != IPPROTO_UDPLITE &&
+ info->protocol != IPPROTO_SCTP &&
+ info->protocol != IPPROTO_DCCP)
+ return -EINVAL;
+ }
+ if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1])
+ return -EINVAL;
+ if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1])
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_match ebt_ip_mt_reg __read_mostly = {
+ .name = "ip",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_ip_mt,
+ .checkentry = ebt_ip_mt_check,
+ .matchsize = sizeof(struct ebt_ip_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_ip_init(void)
+{
+ return xt_register_match(&ebt_ip_mt_reg);
+}
+
+static void __exit ebt_ip_fini(void)
+{
+ xt_unregister_match(&ebt_ip_mt_reg);
+}
+
+module_init(ebt_ip_init);
+module_exit(ebt_ip_fini);
+MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
new file mode 100644
index 000000000..17fd5f2cb
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -0,0 +1,158 @@
+/*
+ * ebt_ip6
+ *
+ * Authors:
+ * Manohar Castelino <manohar.r.castelino@intel.com>
+ * Kuo-Lang Tseng <kuo-lang.tseng@intel.com>
+ * Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Summary:
+ * This is just a modification of the IPv4 code written by
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * with the changes required to support IPv6
+ *
+ * Jan, 2008
+ */
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/dsfield.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip6.h>
+
+union pkthdr {
+ struct {
+ __be16 src;
+ __be16 dst;
+ } tcpudphdr;
+ struct {
+ u8 type;
+ u8 code;
+ } icmphdr;
+};
+
+static bool
+ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_ip6_info *info = par->matchinfo;
+ const struct ipv6hdr *ih6;
+ struct ipv6hdr _ip6h;
+ const union pkthdr *pptr;
+ union pkthdr _pkthdr;
+
+ ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
+ if (ih6 == NULL)
+ return false;
+ if (info->bitmask & EBT_IP6_TCLASS &&
+ FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS))
+ return false;
+ if ((info->bitmask & EBT_IP6_SOURCE &&
+ FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk,
+ &info->saddr), EBT_IP6_SOURCE)) ||
+ (info->bitmask & EBT_IP6_DEST &&
+ FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk,
+ &info->daddr), EBT_IP6_DEST)))
+ return false;
+ if (info->bitmask & EBT_IP6_PROTO) {
+ uint8_t nexthdr = ih6->nexthdr;
+ __be16 frag_off;
+ int offset_ph;
+
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off);
+ if (offset_ph == -1)
+ return false;
+ if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
+ return false;
+ if (!(info->bitmask & ( EBT_IP6_DPORT |
+ EBT_IP6_SPORT | EBT_IP6_ICMP6)))
+ return true;
+
+ /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+ &_pkthdr);
+ if (pptr == NULL)
+ return false;
+ if (info->bitmask & EBT_IP6_DPORT) {
+ u16 dst = ntohs(pptr->tcpudphdr.dst);
+ if (FWINV(dst < info->dport[0] ||
+ dst > info->dport[1], EBT_IP6_DPORT))
+ return false;
+ }
+ if (info->bitmask & EBT_IP6_SPORT) {
+ u16 src = ntohs(pptr->tcpudphdr.src);
+ if (FWINV(src < info->sport[0] ||
+ src > info->sport[1], EBT_IP6_SPORT))
+ return false;
+ }
+ if ((info->bitmask & EBT_IP6_ICMP6) &&
+ FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
+ pptr->icmphdr.type > info->icmpv6_type[1] ||
+ pptr->icmphdr.code < info->icmpv6_code[0] ||
+ pptr->icmphdr.code > info->icmpv6_code[1],
+ EBT_IP6_ICMP6))
+ return false;
+ }
+ return true;
+}
+
+static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_entry *e = par->entryinfo;
+ struct ebt_ip6_info *info = par->matchinfo;
+
+ if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO)
+ return -EINVAL;
+ if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK)
+ return -EINVAL;
+ if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) {
+ if (info->invflags & EBT_IP6_PROTO)
+ return -EINVAL;
+ if (info->protocol != IPPROTO_TCP &&
+ info->protocol != IPPROTO_UDP &&
+ info->protocol != IPPROTO_UDPLITE &&
+ info->protocol != IPPROTO_SCTP &&
+ info->protocol != IPPROTO_DCCP)
+ return -EINVAL;
+ }
+ if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1])
+ return -EINVAL;
+ if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
+ return -EINVAL;
+ if (info->bitmask & EBT_IP6_ICMP6) {
+ if ((info->invflags & EBT_IP6_PROTO) ||
+ info->protocol != IPPROTO_ICMPV6)
+ return -EINVAL;
+ if (info->icmpv6_type[0] > info->icmpv6_type[1] ||
+ info->icmpv6_code[0] > info->icmpv6_code[1])
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ebt_ip6_mt_reg __read_mostly = {
+ .name = "ip6",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_ip6_mt,
+ .checkentry = ebt_ip6_mt_check,
+ .matchsize = sizeof(struct ebt_ip6_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_ip6_init(void)
+{
+ return xt_register_match(&ebt_ip6_mt_reg);
+}
+
+static void __exit ebt_ip6_fini(void)
+{
+ xt_unregister_match(&ebt_ip6_mt_reg);
+}
+
+module_init(ebt_ip6_init);
+module_exit(ebt_ip6_fini);
+MODULE_DESCRIPTION("Ebtables: IPv6 protocol packet match");
+MODULE_AUTHOR("Kuo-Lang Tseng <kuo-lang.tseng@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
new file mode 100644
index 000000000..517e78bef
--- /dev/null
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -0,0 +1,127 @@
+/*
+ * ebt_limit
+ *
+ * Authors:
+ * Tom Marshall <tommy@home.tig-grr.com>
+ *
+ * Mostly copied from netfilter's ipt_limit.c, see that file for
+ * more explanation
+ *
+ * September, 2003
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_limit.h>
+
+static DEFINE_SPINLOCK(limit_lock);
+
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+static bool
+ebt_limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ebt_limit_info *info = (void *)par->matchinfo;
+ unsigned long now = jiffies;
+
+ spin_lock_bh(&limit_lock);
+ info->credit += (now - xchg(&info->prev, now)) * CREDITS_PER_JIFFY;
+ if (info->credit > info->credit_cap)
+ info->credit = info->credit_cap;
+
+ if (info->credit >= info->cost) {
+ /* We're not limited. */
+ info->credit -= info->cost;
+ spin_unlock_bh(&limit_lock);
+ return true;
+ }
+
+ spin_unlock_bh(&limit_lock);
+ return false;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / EBT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
+}
+
+static int ebt_limit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct ebt_limit_info *info = par->matchinfo;
+
+ /* Check for overflow. */
+ if (info->burst == 0 ||
+ user2credits(info->avg * info->burst) < user2credits(info->avg)) {
+ pr_info("overflow, try lower: %u/%u\n",
+ info->avg, info->burst);
+ return -EINVAL;
+ }
+
+ /* User avg in seconds * EBT_LIMIT_SCALE: convert to jiffies * 128. */
+ info->prev = jiffies;
+ info->credit = user2credits(info->avg * info->burst);
+ info->credit_cap = user2credits(info->avg * info->burst);
+ info->cost = user2credits(info->avg);
+ return 0;
+}
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * no conversion function needed --
+ * only avg/burst have meaningful values in userspace.
+ */
+struct ebt_compat_limit_info {
+ compat_uint_t avg, burst;
+ compat_ulong_t prev;
+ compat_uint_t credit, credit_cap, cost;
+};
+#endif
+
+static struct xt_match ebt_limit_mt_reg __read_mostly = {
+ .name = "limit",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_limit_mt,
+ .checkentry = ebt_limit_mt_check,
+ .matchsize = sizeof(struct ebt_limit_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct ebt_compat_limit_info),
+#endif
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_limit_init(void)
+{
+ return xt_register_match(&ebt_limit_mt_reg);
+}
+
+static void __exit ebt_limit_fini(void)
+{
+ xt_unregister_match(&ebt_limit_mt_reg);
+}
+
+module_init(ebt_limit_init);
+module_exit(ebt_limit_fini);
+MODULE_DESCRIPTION("Ebtables: Rate-limit match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
new file mode 100644
index 000000000..17f2e4bc2
--- /dev/null
+++ b/net/bridge/netfilter/ebt_log.c
@@ -0,0 +1,225 @@
+/*
+ * ebt_log
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * April, 2002
+ *
+ */
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/spinlock.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
+
+static DEFINE_SPINLOCK(ebt_log_lock);
+
+static int ebt_log_tg_check(const struct xt_tgchk_param *par)
+{
+ struct ebt_log_info *info = par->targinfo;
+
+ if (info->bitmask & ~EBT_LOG_MASK)
+ return -EINVAL;
+ if (info->loglevel >= 8)
+ return -EINVAL;
+ info->prefix[EBT_LOG_PREFIX_SIZE - 1] = '\0';
+ return 0;
+}
+
+struct tcpudphdr
+{
+ __be16 src;
+ __be16 dst;
+};
+
+struct arppayload
+{
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+static void
+print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
+{
+ if (protocol == IPPROTO_TCP ||
+ protocol == IPPROTO_UDP ||
+ protocol == IPPROTO_UDPLITE ||
+ protocol == IPPROTO_SCTP ||
+ protocol == IPPROTO_DCCP) {
+ const struct tcpudphdr *pptr;
+ struct tcpudphdr _ports;
+
+ pptr = skb_header_pointer(skb, offset,
+ sizeof(_ports), &_ports);
+ if (pptr == NULL) {
+ printk(" INCOMPLETE TCP/UDP header");
+ return;
+ }
+ printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+ }
+}
+
+static void
+ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+ const struct sk_buff *skb, const struct net_device *in,
+ const struct net_device *out, const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ unsigned int bitmask;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ spin_lock_bh(&ebt_log_lock);
+ printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "", out ? out->name : "",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+
+ if (loginfo->type == NF_LOG_TYPE_LOG)
+ bitmask = loginfo->u.log.logflags;
+ else
+ bitmask = NF_LOG_MASK;
+
+ if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+ htons(ETH_P_IP)) {
+ const struct iphdr *ih;
+ struct iphdr _iph;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ printk(" INCOMPLETE IP header");
+ goto out;
+ }
+ printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+ &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
+ print_ports(skb, ih->protocol, ih->ihl*4);
+ goto out;
+ }
+
+#if IS_ENABLED(CONFIG_BRIDGE_EBT_IP6)
+ if ((bitmask & EBT_LOG_IP6) && eth_hdr(skb)->h_proto ==
+ htons(ETH_P_IPV6)) {
+ const struct ipv6hdr *ih;
+ struct ipv6hdr _iph;
+ uint8_t nexthdr;
+ __be16 frag_off;
+ int offset_ph;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ printk(" INCOMPLETE IPv6 header");
+ goto out;
+ }
+ printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
+ &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
+ nexthdr = ih->nexthdr;
+ offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off);
+ if (offset_ph == -1)
+ goto out;
+ print_ports(skb, nexthdr, offset_ph);
+ goto out;
+ }
+#endif
+
+ if ((bitmask & EBT_LOG_ARP) &&
+ ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
+ (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
+ const struct arphdr *ah;
+ struct arphdr _arph;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL) {
+ printk(" INCOMPLETE ARP header");
+ goto out;
+ }
+ printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
+ ntohs(ah->ar_op));
+
+ /* If it's for Ethernet and the lengths are OK,
+ * then log the ARP payload */
+ if (ah->ar_hrd == htons(1) &&
+ ah->ar_hln == ETH_ALEN &&
+ ah->ar_pln == sizeof(__be32)) {
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+
+ ap = skb_header_pointer(skb, sizeof(_arph),
+ sizeof(_arpp), &_arpp);
+ if (ap == NULL) {
+ printk(" INCOMPLETE ARP payload");
+ goto out;
+ }
+ printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+ }
+ }
+out:
+ printk("\n");
+ spin_unlock_bh(&ebt_log_lock);
+
+}
+
+static unsigned int
+ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_log_info *info = par->targinfo;
+ struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = info->loglevel;
+ li.u.log.logflags = info->bitmask;
+
+ /* Remember that we have to use ebt_log_packet() not to break backward
+ * compatibility. We cannot use the default bridge packet logger via
+ * nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
+ */
+ if (info->bitmask & EBT_LOG_NFLOG)
+ nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
+ par->in, par->out, &li, "%s", info->prefix);
+ else
+ ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
+ par->out, &li, info->prefix);
+ return EBT_CONTINUE;
+}
+
+static struct xt_target ebt_log_tg_reg __read_mostly = {
+ .name = "log",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_log_tg,
+ .checkentry = ebt_log_tg_check,
+ .targetsize = sizeof(struct ebt_log_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_log_init(void)
+{
+ return xt_register_target(&ebt_log_tg_reg);
+}
+
+static void __exit ebt_log_fini(void)
+{
+ xt_unregister_target(&ebt_log_tg_reg);
+}
+
+module_init(ebt_log_init);
+module_exit(ebt_log_fini);
+MODULE_DESCRIPTION("Ebtables: Packet logging to syslog");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
new file mode 100644
index 000000000..66697cbd0
--- /dev/null
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -0,0 +1,110 @@
+/*
+ * ebt_mark
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * July, 2002
+ *
+ */
+
+/* The mark target can be used in any chain,
+ * I believe adding a mangle table just for marking is total overkill.
+ * Marking a frame doesn't really change anything in the frame anyway.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_mark_t.h>
+
+static unsigned int
+ebt_mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_mark_t_info *info = par->targinfo;
+ int action = info->target & -16;
+
+ if (action == MARK_SET_VALUE)
+ skb->mark = info->mark;
+ else if (action == MARK_OR_VALUE)
+ skb->mark |= info->mark;
+ else if (action == MARK_AND_VALUE)
+ skb->mark &= info->mark;
+ else
+ skb->mark ^= info->mark;
+
+ return info->target | ~EBT_VERDICT_BITS;
+}
+
+static int ebt_mark_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ebt_mark_t_info *info = par->targinfo;
+ int tmp;
+
+ tmp = info->target | ~EBT_VERDICT_BITS;
+ if (BASE_CHAIN && tmp == EBT_RETURN)
+ return -EINVAL;
+ if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
+ return -EINVAL;
+ tmp = info->target & ~EBT_VERDICT_BITS;
+ if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE &&
+ tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE)
+ return -EINVAL;
+ return 0;
+}
+#ifdef CONFIG_COMPAT
+struct compat_ebt_mark_t_info {
+ compat_ulong_t mark;
+ compat_uint_t target;
+};
+
+static void mark_tg_compat_from_user(void *dst, const void *src)
+{
+ const struct compat_ebt_mark_t_info *user = src;
+ struct ebt_mark_t_info *kern = dst;
+
+ kern->mark = user->mark;
+ kern->target = user->target;
+}
+
+static int mark_tg_compat_to_user(void __user *dst, const void *src)
+{
+ struct compat_ebt_mark_t_info __user *user = dst;
+ const struct ebt_mark_t_info *kern = src;
+
+ if (put_user(kern->mark, &user->mark) ||
+ put_user(kern->target, &user->target))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static struct xt_target ebt_mark_tg_reg __read_mostly = {
+ .name = "mark",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_mark_tg,
+ .checkentry = ebt_mark_tg_check,
+ .targetsize = sizeof(struct ebt_mark_t_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_ebt_mark_t_info),
+ .compat_from_user = mark_tg_compat_from_user,
+ .compat_to_user = mark_tg_compat_to_user,
+#endif
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_mark_init(void)
+{
+ return xt_register_target(&ebt_mark_tg_reg);
+}
+
+static void __exit ebt_mark_fini(void)
+{
+ xt_unregister_target(&ebt_mark_tg_reg);
+}
+
+module_init(ebt_mark_init);
+module_exit(ebt_mark_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark modification");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
new file mode 100644
index 000000000..d98baefc4
--- /dev/null
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -0,0 +1,98 @@
+/*
+ * ebt_mark_m
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * July, 2002
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_mark_m.h>
+
+static bool
+ebt_mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_mark_m_info *info = par->matchinfo;
+
+ if (info->bitmask & EBT_MARK_OR)
+ return !!(skb->mark & info->mask) ^ info->invert;
+ return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int ebt_mark_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_mark_m_info *info = par->matchinfo;
+
+ if (info->bitmask & ~EBT_MARK_MASK)
+ return -EINVAL;
+ if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND))
+ return -EINVAL;
+ if (!info->bitmask)
+ return -EINVAL;
+ return 0;
+}
+
+
+#ifdef CONFIG_COMPAT
+struct compat_ebt_mark_m_info {
+ compat_ulong_t mark, mask;
+ uint8_t invert, bitmask;
+};
+
+static void mark_mt_compat_from_user(void *dst, const void *src)
+{
+ const struct compat_ebt_mark_m_info *user = src;
+ struct ebt_mark_m_info *kern = dst;
+
+ kern->mark = user->mark;
+ kern->mask = user->mask;
+ kern->invert = user->invert;
+ kern->bitmask = user->bitmask;
+}
+
+static int mark_mt_compat_to_user(void __user *dst, const void *src)
+{
+ struct compat_ebt_mark_m_info __user *user = dst;
+ const struct ebt_mark_m_info *kern = src;
+
+ if (put_user(kern->mark, &user->mark) ||
+ put_user(kern->mask, &user->mask) ||
+ put_user(kern->invert, &user->invert) ||
+ put_user(kern->bitmask, &user->bitmask))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static struct xt_match ebt_mark_mt_reg __read_mostly = {
+ .name = "mark_m",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_mark_mt,
+ .checkentry = ebt_mark_mt_check,
+ .matchsize = sizeof(struct ebt_mark_m_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_ebt_mark_m_info),
+ .compat_from_user = mark_mt_compat_from_user,
+ .compat_to_user = mark_mt_compat_to_user,
+#endif
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_mark_m_init(void)
+{
+ return xt_register_match(&ebt_mark_mt_reg);
+}
+
+static void __exit ebt_mark_m_fini(void)
+{
+ xt_unregister_match(&ebt_mark_mt_reg);
+}
+
+module_init(ebt_mark_m_init);
+module_exit(ebt_mark_m_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
new file mode 100644
index 000000000..59ac79520
--- /dev/null
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -0,0 +1,73 @@
+/*
+ * ebt_nflog
+ *
+ * Author:
+ * Peter Warasin <peter@endian.com>
+ *
+ * February, 2008
+ *
+ * Based on:
+ * xt_NFLOG.c, (C) 2006 by Patrick McHardy <kaber@trash.net>
+ * ebt_ulog.c, (C) 2004 by Bart De Schuymer <bdschuym@pandora.be>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nflog.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int
+ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_nflog_info *info = par->targinfo;
+ struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ li.type = NF_LOG_TYPE_ULOG;
+ li.u.ulog.copy_len = info->len;
+ li.u.ulog.group = info->group;
+ li.u.ulog.qthreshold = info->threshold;
+
+ nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
+ par->out, &li, "%s", info->prefix);
+ return EBT_CONTINUE;
+}
+
+static int ebt_nflog_tg_check(const struct xt_tgchk_param *par)
+{
+ struct ebt_nflog_info *info = par->targinfo;
+
+ if (info->flags & ~EBT_NFLOG_MASK)
+ return -EINVAL;
+ info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
+ return 0;
+}
+
+static struct xt_target ebt_nflog_tg_reg __read_mostly = {
+ .name = "nflog",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .target = ebt_nflog_tg,
+ .checkentry = ebt_nflog_tg_check,
+ .targetsize = sizeof(struct ebt_nflog_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_nflog_init(void)
+{
+ return xt_register_target(&ebt_nflog_tg_reg);
+}
+
+static void __exit ebt_nflog_fini(void)
+{
+ xt_unregister_target(&ebt_nflog_tg_reg);
+}
+
+module_init(ebt_nflog_init);
+module_exit(ebt_nflog_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Peter Warasin <peter@endian.com>");
+MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module");
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
new file mode 100644
index 000000000..496a56515
--- /dev/null
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -0,0 +1,56 @@
+/*
+ * ebt_pkttype
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2003
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_pkttype.h>
+
+static bool
+ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_pkttype_info *info = par->matchinfo;
+
+ return (skb->pkt_type == info->pkt_type) ^ info->invert;
+}
+
+static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_pkttype_info *info = par->matchinfo;
+
+ if (info->invert != 0 && info->invert != 1)
+ return -EINVAL;
+ /* Allow any pkt_type value */
+ return 0;
+}
+
+static struct xt_match ebt_pkttype_mt_reg __read_mostly = {
+ .name = "pkttype",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_pkttype_mt,
+ .checkentry = ebt_pkttype_mt_check,
+ .matchsize = sizeof(struct ebt_pkttype_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_pkttype_init(void)
+{
+ return xt_register_match(&ebt_pkttype_mt_reg);
+}
+
+static void __exit ebt_pkttype_fini(void)
+{
+ xt_unregister_match(&ebt_pkttype_mt_reg);
+}
+
+module_init(ebt_pkttype_init);
+module_exit(ebt_pkttype_fini);
+MODULE_DESCRIPTION("Ebtables: Link layer packet type match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
new file mode 100644
index 000000000..203964997
--- /dev/null
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -0,0 +1,80 @@
+/*
+ * ebt_redirect
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include "../br_private.h"
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_redirect.h>
+
+static unsigned int
+ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_redirect_info *info = par->targinfo;
+
+ if (!skb_make_writable(skb, 0))
+ return EBT_DROP;
+
+ if (par->hooknum != NF_BR_BROUTING)
+ /* rcu_read_lock()ed by nf_hook_slow */
+ ether_addr_copy(eth_hdr(skb)->h_dest,
+ br_port_get_rcu(par->in)->br->dev->dev_addr);
+ else
+ ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr);
+ skb->pkt_type = PACKET_HOST;
+ return info->target;
+}
+
+static int ebt_redirect_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ebt_redirect_info *info = par->targinfo;
+ unsigned int hook_mask;
+
+ if (BASE_CHAIN && info->target == EBT_RETURN)
+ return -EINVAL;
+
+ hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+ if ((strcmp(par->table, "nat") != 0 ||
+ hook_mask & ~(1 << NF_BR_PRE_ROUTING)) &&
+ (strcmp(par->table, "broute") != 0 ||
+ hook_mask & ~(1 << NF_BR_BROUTING)))
+ return -EINVAL;
+ if (INVALID_TARGET)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target ebt_redirect_tg_reg __read_mostly = {
+ .name = "redirect",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_BROUTING),
+ .target = ebt_redirect_tg,
+ .checkentry = ebt_redirect_tg_check,
+ .targetsize = sizeof(struct ebt_redirect_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_redirect_init(void)
+{
+ return xt_register_target(&ebt_redirect_tg_reg);
+}
+
+static void __exit ebt_redirect_fini(void)
+{
+ xt_unregister_target(&ebt_redirect_tg_reg);
+}
+
+module_init(ebt_redirect_init);
+module_exit(ebt_redirect_fini);
+MODULE_DESCRIPTION("Ebtables: Packet redirection to localhost");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
new file mode 100644
index 000000000..e56ccd060
--- /dev/null
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -0,0 +1,87 @@
+/*
+ * ebt_snat
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * June, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
+
+static unsigned int
+ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ebt_nat_info *info = par->targinfo;
+
+ if (!skb_make_writable(skb, 0))
+ return EBT_DROP;
+
+ ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
+ if (!(info->target & NAT_ARP_BIT) &&
+ eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+ const struct arphdr *ap;
+ struct arphdr _ah;
+
+ ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah);
+ if (ap == NULL)
+ return EBT_DROP;
+ if (ap->ar_hln != ETH_ALEN)
+ goto out;
+ if (skb_store_bits(skb, sizeof(_ah), info->mac, ETH_ALEN))
+ return EBT_DROP;
+ }
+out:
+ return info->target | ~EBT_VERDICT_BITS;
+}
+
+static int ebt_snat_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ebt_nat_info *info = par->targinfo;
+ int tmp;
+
+ tmp = info->target | ~EBT_VERDICT_BITS;
+ if (BASE_CHAIN && tmp == EBT_RETURN)
+ return -EINVAL;
+
+ if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0)
+ return -EINVAL;
+ tmp = info->target | EBT_VERDICT_BITS;
+ if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target ebt_snat_tg_reg __read_mostly = {
+ .name = "snat",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .table = "nat",
+ .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING),
+ .target = ebt_snat_tg,
+ .checkentry = ebt_snat_tg_check,
+ .targetsize = sizeof(struct ebt_nat_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_snat_init(void)
+{
+ return xt_register_target(&ebt_snat_tg_reg);
+}
+
+static void __exit ebt_snat_fini(void)
+{
+ xt_unregister_target(&ebt_snat_tg_reg);
+}
+
+module_init(ebt_snat_init);
+module_exit(ebt_snat_fini);
+MODULE_DESCRIPTION("Ebtables: Source MAC address translation");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
new file mode 100644
index 000000000..071d87214
--- /dev/null
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -0,0 +1,197 @@
+/*
+ * ebt_stp
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Stephen Hemminger <shemminger@osdl.org>
+ *
+ * July, 2003
+ */
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_stp.h>
+
+#define BPDU_TYPE_CONFIG 0
+#define BPDU_TYPE_TCN 0x80
+
+struct stp_header {
+ uint8_t dsap;
+ uint8_t ssap;
+ uint8_t ctrl;
+ uint8_t pid;
+ uint8_t vers;
+ uint8_t type;
+};
+
+struct stp_config_pdu {
+ uint8_t flags;
+ uint8_t root[8];
+ uint8_t root_cost[4];
+ uint8_t sender[8];
+ uint8_t port[2];
+ uint8_t msg_age[2];
+ uint8_t max_age[2];
+ uint8_t hello_time[2];
+ uint8_t forward_delay[2];
+};
+
+#define NR16(p) (p[0] << 8 | p[1])
+#define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3])
+
+static bool ebt_filter_config(const struct ebt_stp_info *info,
+ const struct stp_config_pdu *stpc)
+{
+ const struct ebt_stp_config_info *c;
+ uint16_t v16;
+ uint32_t v32;
+ int verdict, i;
+
+ c = &info->config;
+ if ((info->bitmask & EBT_STP_FLAGS) &&
+ FWINV(c->flags != stpc->flags, EBT_STP_FLAGS))
+ return false;
+ if (info->bitmask & EBT_STP_ROOTPRIO) {
+ v16 = NR16(stpc->root);
+ if (FWINV(v16 < c->root_priol ||
+ v16 > c->root_priou, EBT_STP_ROOTPRIO))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_ROOTADDR) {
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (stpc->root[2+i] ^ c->root_addr[i]) &
+ c->root_addrmsk[i];
+ if (FWINV(verdict != 0, EBT_STP_ROOTADDR))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_ROOTCOST) {
+ v32 = NR32(stpc->root_cost);
+ if (FWINV(v32 < c->root_costl ||
+ v32 > c->root_costu, EBT_STP_ROOTCOST))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_SENDERPRIO) {
+ v16 = NR16(stpc->sender);
+ if (FWINV(v16 < c->sender_priol ||
+ v16 > c->sender_priou, EBT_STP_SENDERPRIO))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_SENDERADDR) {
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) &
+ c->sender_addrmsk[i];
+ if (FWINV(verdict != 0, EBT_STP_SENDERADDR))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_PORT) {
+ v16 = NR16(stpc->port);
+ if (FWINV(v16 < c->portl ||
+ v16 > c->portu, EBT_STP_PORT))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_MSGAGE) {
+ v16 = NR16(stpc->msg_age);
+ if (FWINV(v16 < c->msg_agel ||
+ v16 > c->msg_ageu, EBT_STP_MSGAGE))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_MAXAGE) {
+ v16 = NR16(stpc->max_age);
+ if (FWINV(v16 < c->max_agel ||
+ v16 > c->max_ageu, EBT_STP_MAXAGE))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_HELLOTIME) {
+ v16 = NR16(stpc->hello_time);
+ if (FWINV(v16 < c->hello_timel ||
+ v16 > c->hello_timeu, EBT_STP_HELLOTIME))
+ return false;
+ }
+ if (info->bitmask & EBT_STP_FWDD) {
+ v16 = NR16(stpc->forward_delay);
+ if (FWINV(v16 < c->forward_delayl ||
+ v16 > c->forward_delayu, EBT_STP_FWDD))
+ return false;
+ }
+ return true;
+}
+
+static bool
+ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_stp_info *info = par->matchinfo;
+ const struct stp_header *sp;
+ struct stp_header _stph;
+ const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
+
+ sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph);
+ if (sp == NULL)
+ return false;
+
+ /* The stp code only considers these */
+ if (memcmp(sp, header, sizeof(header)))
+ return false;
+
+ if (info->bitmask & EBT_STP_TYPE &&
+ FWINV(info->type != sp->type, EBT_STP_TYPE))
+ return false;
+
+ if (sp->type == BPDU_TYPE_CONFIG &&
+ info->bitmask & EBT_STP_CONFIG_MASK) {
+ const struct stp_config_pdu *st;
+ struct stp_config_pdu _stpc;
+
+ st = skb_header_pointer(skb, sizeof(_stph),
+ sizeof(_stpc), &_stpc);
+ if (st == NULL)
+ return false;
+ return ebt_filter_config(info, st);
+ }
+ return true;
+}
+
+static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ebt_stp_info *info = par->matchinfo;
+ const uint8_t bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
+ const uint8_t msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ const struct ebt_entry *e = par->entryinfo;
+
+ if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
+ !(info->bitmask & EBT_STP_MASK))
+ return -EINVAL;
+ /* Make sure the match only receives stp frames */
+ if (!ether_addr_equal(e->destmac, bridge_ula) ||
+ !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match ebt_stp_mt_reg __read_mostly = {
+ .name = "stp",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_stp_mt,
+ .checkentry = ebt_stp_mt_check,
+ .matchsize = sizeof(struct ebt_stp_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_stp_init(void)
+{
+ return xt_register_match(&ebt_stp_mt_reg);
+}
+
+static void __exit ebt_stp_fini(void)
+{
+ xt_unregister_match(&ebt_stp_mt_reg);
+}
+
+module_init(ebt_stp_init);
+module_exit(ebt_stp_fini);
+MODULE_DESCRIPTION("Ebtables: Spanning Tree Protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
new file mode 100644
index 000000000..618568888
--- /dev/null
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -0,0 +1,180 @@
+/*
+ * Description: EBTables 802.1Q match extension kernelspace module.
+ * Authors: Nick Fedchik <nick@fedchik.org.ua>
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_vlan.h>
+
+#define MODULE_VERS "0.6"
+
+MODULE_AUTHOR("Nick Fedchik <nick@fedchik.org.ua>");
+MODULE_DESCRIPTION("Ebtables: 802.1Q VLAN tag match");
+MODULE_LICENSE("GPL");
+
+#define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_
+#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return false; }
+
+static bool
+ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ebt_vlan_info *info = par->matchinfo;
+
+ unsigned short TCI; /* Whole TCI, given from parsed frame */
+ unsigned short id; /* VLAN ID, given from frame TCI */
+ unsigned char prio; /* user_priority, given from frame TCI */
+ /* VLAN encapsulated Type/Length field, given from orig frame */
+ __be16 encap;
+
+ if (skb_vlan_tag_present(skb)) {
+ TCI = skb_vlan_tag_get(skb);
+ encap = skb->protocol;
+ } else {
+ const struct vlan_hdr *fp;
+ struct vlan_hdr _frame;
+
+ fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
+ if (fp == NULL)
+ return false;
+
+ TCI = ntohs(fp->h_vlan_TCI);
+ encap = fp->h_vlan_encapsulated_proto;
+ }
+
+ /* Tag Control Information (TCI) consists of the following elements:
+ * - User_priority. The user_priority field is three bits in length,
+ * interpreted as a binary number.
+ * - Canonical Format Indicator (CFI). The Canonical Format Indicator
+ * (CFI) is a single bit flag value. Currently ignored.
+ * - VLAN Identifier (VID). The VID is encoded as
+ * an unsigned binary number. */
+ id = TCI & VLAN_VID_MASK;
+ prio = (TCI >> 13) & 0x7;
+
+ /* Checking VLAN Identifier (VID) */
+ if (GET_BITMASK(EBT_VLAN_ID))
+ EXIT_ON_MISMATCH(id, EBT_VLAN_ID);
+
+ /* Checking user_priority */
+ if (GET_BITMASK(EBT_VLAN_PRIO))
+ EXIT_ON_MISMATCH(prio, EBT_VLAN_PRIO);
+
+ /* Checking Encapsulated Proto (Length/Type) field */
+ if (GET_BITMASK(EBT_VLAN_ENCAP))
+ EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP);
+
+ return true;
+}
+
+static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
+{
+ struct ebt_vlan_info *info = par->matchinfo;
+ const struct ebt_entry *e = par->entryinfo;
+
+ /* Is it 802.1Q frame checked? */
+ if (e->ethproto != htons(ETH_P_8021Q)) {
+ pr_debug("passed entry proto %2.4X is not 802.1Q (8100)\n",
+ ntohs(e->ethproto));
+ return -EINVAL;
+ }
+
+ /* Check for bitmask range
+ * True if even one bit is out of mask */
+ if (info->bitmask & ~EBT_VLAN_MASK) {
+ pr_debug("bitmask %2X is out of mask (%2X)\n",
+ info->bitmask, EBT_VLAN_MASK);
+ return -EINVAL;
+ }
+
+ /* Check for inversion flags range */
+ if (info->invflags & ~EBT_VLAN_MASK) {
+ pr_debug("inversion flags %2X is out of mask (%2X)\n",
+ info->invflags, EBT_VLAN_MASK);
+ return -EINVAL;
+ }
+
+ /* Reserved VLAN ID (VID) values
+ * -----------------------------
+ * 0 - The null VLAN ID.
+ * 1 - The default Port VID (PVID)
+ * 0x0FFF - Reserved for implementation use.
+ * if_vlan.h: VLAN_N_VID 4096. */
+ if (GET_BITMASK(EBT_VLAN_ID)) {
+ if (!!info->id) { /* if id!=0 => check vid range */
+ if (info->id > VLAN_N_VID) {
+ pr_debug("id %d is out of range (1-4096)\n",
+ info->id);
+ return -EINVAL;
+ }
+ /* Note: This is valid VLAN-tagged frame point.
+ * Any value of user_priority are acceptable,
+ * but should be ignored according to 802.1Q Std.
+ * So we just drop the prio flag. */
+ info->bitmask &= ~EBT_VLAN_PRIO;
+ }
+ /* Else, id=0 (null VLAN ID) => user_priority range (any?) */
+ }
+
+ if (GET_BITMASK(EBT_VLAN_PRIO)) {
+ if ((unsigned char) info->prio > 7) {
+ pr_debug("prio %d is out of range (0-7)\n",
+ info->prio);
+ return -EINVAL;
+ }
+ }
+ /* Check for encapsulated proto range - it is possible to be
+ * any value for u_short range.
+ * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */
+ if (GET_BITMASK(EBT_VLAN_ENCAP)) {
+ if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) {
+ pr_debug("encap frame length %d is less than "
+ "minimal\n", ntohs(info->encap));
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct xt_match ebt_vlan_mt_reg __read_mostly = {
+ .name = "vlan",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .match = ebt_vlan_mt,
+ .checkentry = ebt_vlan_mt_check,
+ .matchsize = sizeof(struct ebt_vlan_info),
+ .me = THIS_MODULE,
+};
+
+static int __init ebt_vlan_init(void)
+{
+ pr_debug("ebtables 802.1Q extension module v" MODULE_VERS "\n");
+ return xt_register_match(&ebt_vlan_mt_reg);
+}
+
+static void __exit ebt_vlan_fini(void)
+{
+ xt_unregister_match(&ebt_vlan_mt_reg);
+}
+
+module_init(ebt_vlan_init);
+module_exit(ebt_vlan_fini);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
new file mode 100644
index 000000000..d2cdf5d6e
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -0,0 +1,100 @@
+/*
+ * ebtable_broute
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2002
+ *
+ * This table lets you choose between routing and bridging for frames
+ * entering on a bridge enslaved nic. This table is traversed before any
+ * other ebtables table. See net/bridge/br_input.c.
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+#include <linux/if_bridge.h>
+
+/* EBT_ACCEPT means the frame will be bridged
+ * EBT_DROP means the frame will be routed
+ */
+static struct ebt_entries initial_chain = {
+ .name = "BROUTING",
+ .policy = EBT_ACCEPT,
+};
+
+static struct ebt_replace_kernel initial_table = {
+ .name = "broute",
+ .valid_hooks = 1 << NF_BR_BROUTING,
+ .entries_size = sizeof(struct ebt_entries),
+ .hook_entry = {
+ [NF_BR_BROUTING] = &initial_chain,
+ },
+ .entries = (char *)&initial_chain,
+};
+
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+ if (valid_hooks & ~(1 << NF_BR_BROUTING))
+ return -EINVAL;
+ return 0;
+}
+
+static const struct ebt_table broute_table = {
+ .name = "broute",
+ .table = &initial_table,
+ .valid_hooks = 1 << NF_BR_BROUTING,
+ .check = check,
+ .me = THIS_MODULE,
+};
+
+static int ebt_broute(struct sk_buff *skb)
+{
+ int ret;
+
+ ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL,
+ dev_net(skb->dev)->xt.broute_table);
+ if (ret == NF_DROP)
+ return 1; /* route it */
+ return 0; /* bridge it */
+}
+
+static int __net_init broute_net_init(struct net *net)
+{
+ net->xt.broute_table = ebt_register_table(net, &broute_table);
+ return PTR_ERR_OR_ZERO(net->xt.broute_table);
+}
+
+static void __net_exit broute_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, net->xt.broute_table);
+}
+
+static struct pernet_operations broute_net_ops = {
+ .init = broute_net_init,
+ .exit = broute_net_exit,
+};
+
+static int __init ebtable_broute_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&broute_net_ops);
+ if (ret < 0)
+ return ret;
+ /* see br_input.c */
+ RCU_INIT_POINTER(br_should_route_hook,
+ (br_should_route_hook_t *)ebt_broute);
+ return 0;
+}
+
+static void __exit ebtable_broute_fini(void)
+{
+ RCU_INIT_POINTER(br_should_route_hook, NULL);
+ synchronize_net();
+ unregister_pernet_subsys(&broute_net_ops);
+}
+
+module_init(ebtable_broute_init);
+module_exit(ebtable_broute_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
new file mode 100644
index 000000000..8a3f63b2e
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -0,0 +1,136 @@
+/*
+ * ebtable_filter
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2002
+ *
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+
+#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
+ (1 << NF_BR_LOCAL_OUT))
+
+static struct ebt_entries initial_chains[] = {
+ {
+ .name = "INPUT",
+ .policy = EBT_ACCEPT,
+ },
+ {
+ .name = "FORWARD",
+ .policy = EBT_ACCEPT,
+ },
+ {
+ .name = "OUTPUT",
+ .policy = EBT_ACCEPT,
+ },
+};
+
+static struct ebt_replace_kernel initial_table = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .entries_size = 3 * sizeof(struct ebt_entries),
+ .hook_entry = {
+ [NF_BR_LOCAL_IN] = &initial_chains[0],
+ [NF_BR_FORWARD] = &initial_chains[1],
+ [NF_BR_LOCAL_OUT] = &initial_chains[2],
+ },
+ .entries = (char *)initial_chains,
+};
+
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+ if (valid_hooks & ~FILTER_VALID_HOOKS)
+ return -EINVAL;
+ return 0;
+}
+
+static const struct ebt_table frame_filter = {
+ .name = "filter",
+ .table = &initial_table,
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .check = check,
+ .me = THIS_MODULE,
+};
+
+static unsigned int
+ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ebt_do_table(ops->hooknum, skb, state->in, state->out,
+ dev_net(state->in)->xt.frame_filter);
+}
+
+static unsigned int
+ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ebt_do_table(ops->hooknum, skb, state->in, state->out,
+ dev_net(state->out)->xt.frame_filter);
+}
+
+static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
+ {
+ .hook = ebt_in_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_BR_PRI_FILTER_BRIDGED,
+ },
+ {
+ .hook = ebt_in_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_FORWARD,
+ .priority = NF_BR_PRI_FILTER_BRIDGED,
+ },
+ {
+ .hook = ebt_out_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_OUT,
+ .priority = NF_BR_PRI_FILTER_OTHER,
+ },
+};
+
+static int __net_init frame_filter_net_init(struct net *net)
+{
+ net->xt.frame_filter = ebt_register_table(net, &frame_filter);
+ return PTR_ERR_OR_ZERO(net->xt.frame_filter);
+}
+
+static void __net_exit frame_filter_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, net->xt.frame_filter);
+}
+
+static struct pernet_operations frame_filter_net_ops = {
+ .init = frame_filter_net_init,
+ .exit = frame_filter_net_exit,
+};
+
+static int __init ebtable_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&frame_filter_net_ops);
+ if (ret < 0)
+ return ret;
+ ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
+ if (ret < 0)
+ unregister_pernet_subsys(&frame_filter_net_ops);
+ return ret;
+}
+
+static void __exit ebtable_filter_fini(void)
+{
+ nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
+ unregister_pernet_subsys(&frame_filter_net_ops);
+}
+
+module_init(ebtable_filter_init);
+module_exit(ebtable_filter_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
new file mode 100644
index 000000000..c5ef5b1ab
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -0,0 +1,136 @@
+/*
+ * ebtable_nat
+ *
+ * Authors:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * April, 2002
+ *
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+
+#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
+ (1 << NF_BR_POST_ROUTING))
+
+static struct ebt_entries initial_chains[] = {
+ {
+ .name = "PREROUTING",
+ .policy = EBT_ACCEPT,
+ },
+ {
+ .name = "OUTPUT",
+ .policy = EBT_ACCEPT,
+ },
+ {
+ .name = "POSTROUTING",
+ .policy = EBT_ACCEPT,
+ }
+};
+
+static struct ebt_replace_kernel initial_table = {
+ .name = "nat",
+ .valid_hooks = NAT_VALID_HOOKS,
+ .entries_size = 3 * sizeof(struct ebt_entries),
+ .hook_entry = {
+ [NF_BR_PRE_ROUTING] = &initial_chains[0],
+ [NF_BR_LOCAL_OUT] = &initial_chains[1],
+ [NF_BR_POST_ROUTING] = &initial_chains[2],
+ },
+ .entries = (char *)initial_chains,
+};
+
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+ if (valid_hooks & ~NAT_VALID_HOOKS)
+ return -EINVAL;
+ return 0;
+}
+
+static struct ebt_table frame_nat = {
+ .name = "nat",
+ .table = &initial_table,
+ .valid_hooks = NAT_VALID_HOOKS,
+ .check = check,
+ .me = THIS_MODULE,
+};
+
+static unsigned int
+ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ebt_do_table(ops->hooknum, skb, state->in, state->out,
+ dev_net(state->in)->xt.frame_nat);
+}
+
+static unsigned int
+ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ebt_do_table(ops->hooknum, skb, state->in, state->out,
+ dev_net(state->out)->xt.frame_nat);
+}
+
+static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
+ {
+ .hook = ebt_nat_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_OUT,
+ .priority = NF_BR_PRI_NAT_DST_OTHER,
+ },
+ {
+ .hook = ebt_nat_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_BR_PRI_NAT_SRC,
+ },
+ {
+ .hook = ebt_nat_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_NAT_DST_BRIDGED,
+ },
+};
+
+static int __net_init frame_nat_net_init(struct net *net)
+{
+ net->xt.frame_nat = ebt_register_table(net, &frame_nat);
+ return PTR_ERR_OR_ZERO(net->xt.frame_nat);
+}
+
+static void __net_exit frame_nat_net_exit(struct net *net)
+{
+ ebt_unregister_table(net, net->xt.frame_nat);
+}
+
+static struct pernet_operations frame_nat_net_ops = {
+ .init = frame_nat_net_init,
+ .exit = frame_nat_net_exit,
+};
+
+static int __init ebtable_nat_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&frame_nat_net_ops);
+ if (ret < 0)
+ return ret;
+ ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
+ if (ret < 0)
+ unregister_pernet_subsys(&frame_nat_net_ops);
+ return ret;
+}
+
+static void __exit ebtable_nat_fini(void)
+{
+ nf_unregister_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
+ unregister_pernet_subsys(&frame_nat_net_ops);
+}
+
+module_init(ebtable_nat_init);
+module_exit(ebtable_nat_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
new file mode 100644
index 000000000..91180a7fc
--- /dev/null
+++ b/net/bridge/netfilter/ebtables.c
@@ -0,0 +1,2432 @@
+/*
+ * ebtables
+ *
+ * Author:
+ * Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * ebtables.c,v 2.0, July, 2002
+ *
+ * This code is stongly inspired on the iptables code which is
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/audit.h>
+#include <net/sock.h>
+/* needed for logical [in,out]-dev filtering */
+#include "../br_private.h"
+
+#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
+ "report to author: "format, ## args)
+/* #define BUGPRINT(format, args...) */
+
+/*
+ * Each cpu has its own set of counters, so there is no need for write_lock in
+ * the softirq
+ * For reading or updating the counters, the user context needs to
+ * get a write_lock
+ */
+
+/* The size of each set of counters is altered to get cache alignment */
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter)))
+#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \
+ COUNTER_OFFSET(n) * cpu))
+
+
+
+static DEFINE_MUTEX(ebt_mutex);
+
+#ifdef CONFIG_COMPAT
+static void ebt_standard_compat_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v >= 0)
+ v += xt_compat_calc_jump(NFPROTO_BRIDGE, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int ebt_standard_compat_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv >= 0)
+ cv -= xt_compat_calc_jump(NFPROTO_BRIDGE, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+#endif
+
+
+static struct xt_target ebt_standard_target = {
+ .name = "standard",
+ .revision = 0,
+ .family = NFPROTO_BRIDGE,
+ .targetsize = sizeof(int),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = ebt_standard_compat_from_user,
+ .compat_to_user = ebt_standard_compat_to_user,
+#endif
+};
+
+static inline int
+ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ par->target = w->u.watcher;
+ par->targinfo = w->data;
+ w->u.watcher->target(skb, par);
+ /* watchers don't give a verdict */
+ return 0;
+}
+
+static inline int
+ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
+ struct xt_action_param *par)
+{
+ par->match = m->u.match;
+ par->matchinfo = m->data;
+ return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+}
+
+static inline int
+ebt_dev_check(const char *entry, const struct net_device *device)
+{
+ int i = 0;
+ const char *devname;
+
+ if (*entry == '\0')
+ return 0;
+ if (!device)
+ return 1;
+ devname = device->name;
+ /* 1 is the wildcard token */
+ while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i])
+ i++;
+ return devname[i] != entry[i] && entry[i] != 1;
+}
+
+#define FWINV2(bool, invflg) ((bool) ^ !!(e->invflags & invflg))
+/* process standard matches */
+static inline int
+ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out)
+{
+ const struct ethhdr *h = eth_hdr(skb);
+ const struct net_bridge_port *p;
+ __be16 ethproto;
+ int verdict, i;
+
+ if (skb_vlan_tag_present(skb))
+ ethproto = htons(ETH_P_8021Q);
+ else
+ ethproto = h->h_proto;
+
+ if (e->bitmask & EBT_802_3) {
+ if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO))
+ return 1;
+ } else if (!(e->bitmask & EBT_NOPROTO) &&
+ FWINV2(e->ethproto != ethproto, EBT_IPROTO))
+ return 1;
+
+ if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
+ return 1;
+ if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
+ return 1;
+ /* rcu_read_lock()ed by nf_hook_slow */
+ if (in && (p = br_port_get_rcu(in)) != NULL &&
+ FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN))
+ return 1;
+ if (out && (p = br_port_get_rcu(out)) != NULL &&
+ FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT))
+ return 1;
+
+ if (e->bitmask & EBT_SOURCEMAC) {
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (h->h_source[i] ^ e->sourcemac[i]) &
+ e->sourcemsk[i];
+ if (FWINV2(verdict != 0, EBT_ISOURCE) )
+ return 1;
+ }
+ if (e->bitmask & EBT_DESTMAC) {
+ verdict = 0;
+ for (i = 0; i < 6; i++)
+ verdict |= (h->h_dest[i] ^ e->destmac[i]) &
+ e->destmsk[i];
+ if (FWINV2(verdict != 0, EBT_IDEST) )
+ return 1;
+ }
+ return 0;
+}
+
+static inline __pure
+struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+/* Do some firewalling */
+unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ struct ebt_table *table)
+{
+ int i, nentries;
+ struct ebt_entry *point;
+ struct ebt_counter *counter_base, *cb_base;
+ const struct ebt_entry_target *t;
+ int verdict, sp = 0;
+ struct ebt_chainstack *cs;
+ struct ebt_entries *chaininfo;
+ const char *base;
+ const struct ebt_table_info *private;
+ struct xt_action_param acpar;
+
+ acpar.family = NFPROTO_BRIDGE;
+ acpar.in = in;
+ acpar.out = out;
+ acpar.hotdrop = false;
+ acpar.hooknum = hook;
+
+ read_lock_bh(&table->lock);
+ private = table->private;
+ cb_base = COUNTER_BASE(private->counters, private->nentries,
+ smp_processor_id());
+ if (private->chainstack)
+ cs = private->chainstack[smp_processor_id()];
+ else
+ cs = NULL;
+ chaininfo = private->hook_entry[hook];
+ nentries = private->hook_entry[hook]->nentries;
+ point = (struct ebt_entry *)(private->hook_entry[hook]->data);
+ counter_base = cb_base + private->hook_entry[hook]->counter_offset;
+ /* base for chain jumps */
+ base = private->entries;
+ i = 0;
+ while (i < nentries) {
+ if (ebt_basic_match(point, skb, in, out))
+ goto letscontinue;
+
+ if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
+ goto letscontinue;
+ if (acpar.hotdrop) {
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+ }
+
+ /* increase counter */
+ (*(counter_base + i)).pcnt++;
+ (*(counter_base + i)).bcnt += skb->len;
+
+ /* these should only watch: not modify, nor tell us
+ what to do with the packet */
+ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
+
+ t = (struct ebt_entry_target *)
+ (((char *)point) + point->target_offset);
+ /* standard target */
+ if (!t->u.target->target)
+ verdict = ((struct ebt_standard_target *)t)->verdict;
+ else {
+ acpar.target = t->u.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.target->target(skb, &acpar);
+ }
+ if (verdict == EBT_ACCEPT) {
+ read_unlock_bh(&table->lock);
+ return NF_ACCEPT;
+ }
+ if (verdict == EBT_DROP) {
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+ }
+ if (verdict == EBT_RETURN) {
+letsreturn:
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (sp == 0) {
+ BUGPRINT("RETURN on base chain");
+ /* act like this is EBT_CONTINUE */
+ goto letscontinue;
+ }
+#endif
+ sp--;
+ /* put all the local variables right */
+ i = cs[sp].n;
+ chaininfo = cs[sp].chaininfo;
+ nentries = chaininfo->nentries;
+ point = cs[sp].e;
+ counter_base = cb_base +
+ chaininfo->counter_offset;
+ continue;
+ }
+ if (verdict == EBT_CONTINUE)
+ goto letscontinue;
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (verdict < 0) {
+ BUGPRINT("bogus standard verdict\n");
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+ }
+#endif
+ /* jump to a udc */
+ cs[sp].n = i + 1;
+ cs[sp].chaininfo = chaininfo;
+ cs[sp].e = ebt_next_entry(point);
+ i = 0;
+ chaininfo = (struct ebt_entries *) (base + verdict);
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (chaininfo->distinguisher) {
+ BUGPRINT("jump to non-chain\n");
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+ }
+#endif
+ nentries = chaininfo->nentries;
+ point = (struct ebt_entry *)chaininfo->data;
+ counter_base = cb_base + chaininfo->counter_offset;
+ sp++;
+ continue;
+letscontinue:
+ point = ebt_next_entry(point);
+ i++;
+ }
+
+ /* I actually like this :) */
+ if (chaininfo->policy == EBT_RETURN)
+ goto letsreturn;
+ if (chaininfo->policy == EBT_ACCEPT) {
+ read_unlock_bh(&table->lock);
+ return NF_ACCEPT;
+ }
+ read_unlock_bh(&table->lock);
+ return NF_DROP;
+}
+
+/* If it succeeds, returns element and locks mutex */
+static inline void *
+find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
+ struct mutex *mutex)
+{
+ struct {
+ struct list_head list;
+ char name[EBT_FUNCTION_MAXNAMELEN];
+ } *e;
+
+ mutex_lock(mutex);
+ list_for_each_entry(e, head, list) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+ *error = -ENOENT;
+ mutex_unlock(mutex);
+ return NULL;
+}
+
+static void *
+find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
+ int *error, struct mutex *mutex)
+{
+ return try_then_request_module(
+ find_inlist_lock_noload(head, name, error, mutex),
+ "%s%s", prefix, name);
+}
+
+static inline struct ebt_table *
+find_table_lock(struct net *net, const char *name, int *error,
+ struct mutex *mutex)
+{
+ return find_inlist_lock(&net->xt.tables[NFPROTO_BRIDGE], name,
+ "ebtable_", error, mutex);
+}
+
+static inline int
+ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
+ unsigned int *cnt)
+{
+ const struct ebt_entry *e = par->entryinfo;
+ struct xt_match *match;
+ size_t left = ((char *)e + e->watchers_offset) - (char *)m;
+ int ret;
+
+ if (left < sizeof(struct ebt_entry_match) ||
+ left - sizeof(struct ebt_entry_match) < m->match_size)
+ return -EINVAL;
+
+ match = xt_request_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+ m->u.match = match;
+
+ par->match = match;
+ par->matchinfo = m->data;
+ ret = xt_check_match(par, m->match_size,
+ e->ethproto, e->invflags & EBT_IPROTO);
+ if (ret < 0) {
+ module_put(match->me);
+ return ret;
+ }
+
+ (*cnt)++;
+ return 0;
+}
+
+static inline int
+ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
+ unsigned int *cnt)
+{
+ const struct ebt_entry *e = par->entryinfo;
+ struct xt_target *watcher;
+ size_t left = ((char *)e + e->target_offset) - (char *)w;
+ int ret;
+
+ if (left < sizeof(struct ebt_entry_watcher) ||
+ left - sizeof(struct ebt_entry_watcher) < w->watcher_size)
+ return -EINVAL;
+
+ watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0);
+ if (IS_ERR(watcher))
+ return PTR_ERR(watcher);
+ w->u.watcher = watcher;
+
+ par->target = watcher;
+ par->targinfo = w->data;
+ ret = xt_check_target(par, w->watcher_size,
+ e->ethproto, e->invflags & EBT_IPROTO);
+ if (ret < 0) {
+ module_put(watcher->me);
+ return ret;
+ }
+
+ (*cnt)++;
+ return 0;
+}
+
+static int ebt_verify_pointers(const struct ebt_replace *repl,
+ struct ebt_table_info *newinfo)
+{
+ unsigned int limit = repl->entries_size;
+ unsigned int valid_hooks = repl->valid_hooks;
+ unsigned int offset = 0;
+ int i;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++)
+ newinfo->hook_entry[i] = NULL;
+
+ newinfo->entries_size = repl->entries_size;
+ newinfo->nentries = repl->nentries;
+
+ while (offset < limit) {
+ size_t left = limit - offset;
+ struct ebt_entry *e = (void *)newinfo->entries + offset;
+
+ if (left < sizeof(unsigned int))
+ break;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if ((valid_hooks & (1 << i)) == 0)
+ continue;
+ if ((char __user *)repl->hook_entry[i] ==
+ repl->entries + offset)
+ break;
+ }
+
+ if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
+ if (e->bitmask != 0) {
+ /* we make userspace set this right,
+ so there is no misunderstanding */
+ BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set "
+ "in distinguisher\n");
+ return -EINVAL;
+ }
+ if (i != NF_BR_NUMHOOKS)
+ newinfo->hook_entry[i] = (struct ebt_entries *)e;
+ if (left < sizeof(struct ebt_entries))
+ break;
+ offset += sizeof(struct ebt_entries);
+ } else {
+ if (left < sizeof(struct ebt_entry))
+ break;
+ if (left < e->next_offset)
+ break;
+ if (e->next_offset < sizeof(struct ebt_entry))
+ return -EINVAL;
+ offset += e->next_offset;
+ }
+ }
+ if (offset != limit) {
+ BUGPRINT("entries_size too small\n");
+ return -EINVAL;
+ }
+
+ /* check if all valid hooks have a chain */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if (!newinfo->hook_entry[i] &&
+ (valid_hooks & (1 << i))) {
+ BUGPRINT("Valid hook without chain\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * this one is very careful, as it is the first function
+ * to parse the userspace data
+ */
+static inline int
+ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
+ const struct ebt_table_info *newinfo,
+ unsigned int *n, unsigned int *cnt,
+ unsigned int *totalcnt, unsigned int *udc_cnt)
+{
+ int i;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if ((void *)e == (void *)newinfo->hook_entry[i])
+ break;
+ }
+ /* beginning of a new chain
+ if i == NF_BR_NUMHOOKS it must be a user defined chain */
+ if (i != NF_BR_NUMHOOKS || !e->bitmask) {
+ /* this checks if the previous chain has as many entries
+ as it said it has */
+ if (*n != *cnt) {
+ BUGPRINT("nentries does not equal the nr of entries "
+ "in the chain\n");
+ return -EINVAL;
+ }
+ if (((struct ebt_entries *)e)->policy != EBT_DROP &&
+ ((struct ebt_entries *)e)->policy != EBT_ACCEPT) {
+ /* only RETURN from udc */
+ if (i != NF_BR_NUMHOOKS ||
+ ((struct ebt_entries *)e)->policy != EBT_RETURN) {
+ BUGPRINT("bad policy\n");
+ return -EINVAL;
+ }
+ }
+ if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */
+ (*udc_cnt)++;
+ if (((struct ebt_entries *)e)->counter_offset != *totalcnt) {
+ BUGPRINT("counter_offset != totalcnt");
+ return -EINVAL;
+ }
+ *n = ((struct ebt_entries *)e)->nentries;
+ *cnt = 0;
+ return 0;
+ }
+ /* a plain old entry, heh */
+ if (sizeof(struct ebt_entry) > e->watchers_offset ||
+ e->watchers_offset > e->target_offset ||
+ e->target_offset >= e->next_offset) {
+ BUGPRINT("entry offsets not in right order\n");
+ return -EINVAL;
+ }
+ /* this is not checked anywhere else */
+ if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) {
+ BUGPRINT("target size too small\n");
+ return -EINVAL;
+ }
+ (*cnt)++;
+ (*totalcnt)++;
+ return 0;
+}
+
+struct ebt_cl_stack
+{
+ struct ebt_chainstack cs;
+ int from;
+ unsigned int hookmask;
+};
+
+/*
+ * we need these positions to check that the jumps to a different part of the
+ * entries is a jump to the beginning of a new chain.
+ */
+static inline int
+ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
+ unsigned int *n, struct ebt_cl_stack *udc)
+{
+ int i;
+
+ /* we're only interested in chain starts */
+ if (e->bitmask)
+ return 0;
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if (newinfo->hook_entry[i] == (struct ebt_entries *)e)
+ break;
+ }
+ /* only care about udc */
+ if (i != NF_BR_NUMHOOKS)
+ return 0;
+
+ udc[*n].cs.chaininfo = (struct ebt_entries *)e;
+ /* these initialisations are depended on later in check_chainloops() */
+ udc[*n].cs.n = 0;
+ udc[*n].hookmask = 0;
+
+ (*n)++;
+ return 0;
+}
+
+static inline int
+ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i)
+{
+ struct xt_mtdtor_param par;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ par.net = net;
+ par.match = m->u.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
+ return 0;
+}
+
+static inline int
+ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i)
+{
+ struct xt_tgdtor_param par;
+
+ if (i && (*i)-- == 0)
+ return 1;
+
+ par.net = net;
+ par.target = w->u.watcher;
+ par.targinfo = w->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+ return 0;
+}
+
+static inline int
+ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
+{
+ struct xt_tgdtor_param par;
+ struct ebt_entry_target *t;
+
+ if (e->bitmask == 0)
+ return 0;
+ /* we're done */
+ if (cnt && (*cnt)-- == 0)
+ return 1;
+ EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
+ EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
+ t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+
+ par.net = net;
+ par.target = t->u.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_BRIDGE;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+ return 0;
+}
+
+static inline int
+ebt_check_entry(struct ebt_entry *e, struct net *net,
+ const struct ebt_table_info *newinfo,
+ const char *name, unsigned int *cnt,
+ struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
+{
+ struct ebt_entry_target *t;
+ struct xt_target *target;
+ unsigned int i, j, hook = 0, hookmask = 0;
+ size_t gap;
+ int ret;
+ struct xt_mtchk_param mtpar;
+ struct xt_tgchk_param tgpar;
+
+ /* don't mess with the struct ebt_entries */
+ if (e->bitmask == 0)
+ return 0;
+
+ if (e->bitmask & ~EBT_F_MASK) {
+ BUGPRINT("Unknown flag for bitmask\n");
+ return -EINVAL;
+ }
+ if (e->invflags & ~EBT_INV_MASK) {
+ BUGPRINT("Unknown flag for inv bitmask\n");
+ return -EINVAL;
+ }
+ if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) {
+ BUGPRINT("NOPROTO & 802_3 not allowed\n");
+ return -EINVAL;
+ }
+ /* what hook do we belong to? */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if (!newinfo->hook_entry[i])
+ continue;
+ if ((char *)newinfo->hook_entry[i] < (char *)e)
+ hook = i;
+ else
+ break;
+ }
+ /* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on
+ a base chain */
+ if (i < NF_BR_NUMHOOKS)
+ hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
+ else {
+ for (i = 0; i < udc_cnt; i++)
+ if ((char *)(cl_s[i].cs.chaininfo) > (char *)e)
+ break;
+ if (i == 0)
+ hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
+ else
+ hookmask = cl_s[i - 1].hookmask;
+ }
+ i = 0;
+
+ mtpar.net = tgpar.net = net;
+ mtpar.table = tgpar.table = name;
+ mtpar.entryinfo = tgpar.entryinfo = e;
+ mtpar.hook_mask = tgpar.hook_mask = hookmask;
+ mtpar.family = tgpar.family = NFPROTO_BRIDGE;
+ ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
+ if (ret != 0)
+ goto cleanup_matches;
+ j = 0;
+ ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
+ if (ret != 0)
+ goto cleanup_watchers;
+ t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+ gap = e->next_offset - e->target_offset;
+
+ target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
+ if (IS_ERR(target)) {
+ ret = PTR_ERR(target);
+ goto cleanup_watchers;
+ }
+
+ t->u.target = target;
+ if (t->u.target == &ebt_standard_target) {
+ if (gap < sizeof(struct ebt_standard_target)) {
+ BUGPRINT("Standard target size too big\n");
+ ret = -EFAULT;
+ goto cleanup_watchers;
+ }
+ if (((struct ebt_standard_target *)t)->verdict <
+ -NUM_STANDARD_TARGETS) {
+ BUGPRINT("Invalid standard target\n");
+ ret = -EFAULT;
+ goto cleanup_watchers;
+ }
+ } else if (t->target_size > gap - sizeof(struct ebt_entry_target)) {
+ module_put(t->u.target->me);
+ ret = -EFAULT;
+ goto cleanup_watchers;
+ }
+
+ tgpar.target = target;
+ tgpar.targinfo = t->data;
+ ret = xt_check_target(&tgpar, t->target_size,
+ e->ethproto, e->invflags & EBT_IPROTO);
+ if (ret < 0) {
+ module_put(target->me);
+ goto cleanup_watchers;
+ }
+ (*cnt)++;
+ return 0;
+cleanup_watchers:
+ EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j);
+cleanup_matches:
+ EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i);
+ return ret;
+}
+
+/*
+ * checks for loops and sets the hook mask for udc
+ * the hook mask for udc tells us from which base chains the udc can be
+ * accessed. This mask is a parameter to the check() functions of the extensions
+ */
+static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s,
+ unsigned int udc_cnt, unsigned int hooknr, char *base)
+{
+ int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict;
+ const struct ebt_entry *e = (struct ebt_entry *)chain->data;
+ const struct ebt_entry_target *t;
+
+ while (pos < nentries || chain_nr != -1) {
+ /* end of udc, go back one 'recursion' step */
+ if (pos == nentries) {
+ /* put back values of the time when this chain was called */
+ e = cl_s[chain_nr].cs.e;
+ if (cl_s[chain_nr].from != -1)
+ nentries =
+ cl_s[cl_s[chain_nr].from].cs.chaininfo->nentries;
+ else
+ nentries = chain->nentries;
+ pos = cl_s[chain_nr].cs.n;
+ /* make sure we won't see a loop that isn't one */
+ cl_s[chain_nr].cs.n = 0;
+ chain_nr = cl_s[chain_nr].from;
+ if (pos == nentries)
+ continue;
+ }
+ t = (struct ebt_entry_target *)
+ (((char *)e) + e->target_offset);
+ if (strcmp(t->u.name, EBT_STANDARD_TARGET))
+ goto letscontinue;
+ if (e->target_offset + sizeof(struct ebt_standard_target) >
+ e->next_offset) {
+ BUGPRINT("Standard target size too big\n");
+ return -1;
+ }
+ verdict = ((struct ebt_standard_target *)t)->verdict;
+ if (verdict >= 0) { /* jump to another chain */
+ struct ebt_entries *hlp2 =
+ (struct ebt_entries *)(base + verdict);
+ for (i = 0; i < udc_cnt; i++)
+ if (hlp2 == cl_s[i].cs.chaininfo)
+ break;
+ /* bad destination or loop */
+ if (i == udc_cnt) {
+ BUGPRINT("bad destination\n");
+ return -1;
+ }
+ if (cl_s[i].cs.n) {
+ BUGPRINT("loop\n");
+ return -1;
+ }
+ if (cl_s[i].hookmask & (1 << hooknr))
+ goto letscontinue;
+ /* this can't be 0, so the loop test is correct */
+ cl_s[i].cs.n = pos + 1;
+ pos = 0;
+ cl_s[i].cs.e = ebt_next_entry(e);
+ e = (struct ebt_entry *)(hlp2->data);
+ nentries = hlp2->nentries;
+ cl_s[i].from = chain_nr;
+ chain_nr = i;
+ /* this udc is accessible from the base chain for hooknr */
+ cl_s[i].hookmask |= (1 << hooknr);
+ continue;
+ }
+letscontinue:
+ e = ebt_next_entry(e);
+ pos++;
+ }
+ return 0;
+}
+
+/* do the parsing of the table/chains/entries/matches/watchers/targets, heh */
+static int translate_table(struct net *net, const char *name,
+ struct ebt_table_info *newinfo)
+{
+ unsigned int i, j, k, udc_cnt;
+ int ret;
+ struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */
+
+ i = 0;
+ while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i])
+ i++;
+ if (i == NF_BR_NUMHOOKS) {
+ BUGPRINT("No valid hooks specified\n");
+ return -EINVAL;
+ }
+ if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) {
+ BUGPRINT("Chains don't start at beginning\n");
+ return -EINVAL;
+ }
+ /* make sure chains are ordered after each other in same order
+ as their corresponding hooks */
+ for (j = i + 1; j < NF_BR_NUMHOOKS; j++) {
+ if (!newinfo->hook_entry[j])
+ continue;
+ if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) {
+ BUGPRINT("Hook order must be followed\n");
+ return -EINVAL;
+ }
+ i = j;
+ }
+
+ /* do some early checkings and initialize some things */
+ i = 0; /* holds the expected nr. of entries for the chain */
+ j = 0; /* holds the up to now counted entries for the chain */
+ k = 0; /* holds the total nr. of entries, should equal
+ newinfo->nentries afterwards */
+ udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */
+ ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+ ebt_check_entry_size_and_hooks, newinfo,
+ &i, &j, &k, &udc_cnt);
+
+ if (ret != 0)
+ return ret;
+
+ if (i != j) {
+ BUGPRINT("nentries does not equal the nr of entries in the "
+ "(last) chain\n");
+ return -EINVAL;
+ }
+ if (k != newinfo->nentries) {
+ BUGPRINT("Total nentries is wrong\n");
+ return -EINVAL;
+ }
+
+ /* get the location of the udc, put them in an array
+ while we're at it, allocate the chainstack */
+ if (udc_cnt) {
+ /* this will get free'd in do_replace()/ebt_register_table()
+ if an error occurs */
+ newinfo->chainstack =
+ vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack)));
+ if (!newinfo->chainstack)
+ return -ENOMEM;
+ for_each_possible_cpu(i) {
+ newinfo->chainstack[i] =
+ vmalloc(udc_cnt * sizeof(*(newinfo->chainstack[0])));
+ if (!newinfo->chainstack[i]) {
+ while (i)
+ vfree(newinfo->chainstack[--i]);
+ vfree(newinfo->chainstack);
+ newinfo->chainstack = NULL;
+ return -ENOMEM;
+ }
+ }
+
+ cl_s = vmalloc(udc_cnt * sizeof(*cl_s));
+ if (!cl_s)
+ return -ENOMEM;
+ i = 0; /* the i'th udc */
+ EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+ ebt_get_udc_positions, newinfo, &i, cl_s);
+ /* sanity check */
+ if (i != udc_cnt) {
+ BUGPRINT("i != udc_cnt\n");
+ vfree(cl_s);
+ return -EFAULT;
+ }
+ }
+
+ /* Check for loops */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++)
+ if (newinfo->hook_entry[i])
+ if (check_chainloops(newinfo->hook_entry[i],
+ cl_s, udc_cnt, i, newinfo->entries)) {
+ vfree(cl_s);
+ return -EINVAL;
+ }
+
+ /* we now know the following (along with E=mc²):
+ - the nr of entries in each chain is right
+ - the size of the allocated space is right
+ - all valid hooks have a corresponding chain
+ - there are no loops
+ - wrong data can still be on the level of a single entry
+ - could be there are jumps to places that are not the
+ beginning of a chain. This can only occur in chains that
+ are not accessible from any base chains, so we don't care. */
+
+ /* used to know what we need to clean up if something goes wrong */
+ i = 0;
+ ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+ ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt);
+ if (ret != 0) {
+ EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+ ebt_cleanup_entry, net, &i);
+ }
+ vfree(cl_s);
+ return ret;
+}
+
+/* called under write_lock */
+static void get_counters(const struct ebt_counter *oldcounters,
+ struct ebt_counter *counters, unsigned int nentries)
+{
+ int i, cpu;
+ struct ebt_counter *counter_base;
+
+ /* counters of cpu 0 */
+ memcpy(counters, oldcounters,
+ sizeof(struct ebt_counter) * nentries);
+
+ /* add other counters to those of cpu 0 */
+ for_each_possible_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+ counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
+ for (i = 0; i < nentries; i++) {
+ counters[i].pcnt += counter_base[i].pcnt;
+ counters[i].bcnt += counter_base[i].bcnt;
+ }
+ }
+}
+
+static int do_replace_finish(struct net *net, struct ebt_replace *repl,
+ struct ebt_table_info *newinfo)
+{
+ int ret, i;
+ struct ebt_counter *counterstmp = NULL;
+ /* used to be able to unlock earlier */
+ struct ebt_table_info *table;
+ struct ebt_table *t;
+
+ /* the user wants counters back
+ the check on the size is done later, when we have the lock */
+ if (repl->num_counters) {
+ unsigned long size = repl->num_counters * sizeof(*counterstmp);
+ counterstmp = vmalloc(size);
+ if (!counterstmp)
+ return -ENOMEM;
+ }
+
+ newinfo->chainstack = NULL;
+ ret = ebt_verify_pointers(repl, newinfo);
+ if (ret != 0)
+ goto free_counterstmp;
+
+ ret = translate_table(net, repl->name, newinfo);
+
+ if (ret != 0)
+ goto free_counterstmp;
+
+ t = find_table_lock(net, repl->name, &ret, &ebt_mutex);
+ if (!t) {
+ ret = -ENOENT;
+ goto free_iterate;
+ }
+
+ /* the table doesn't like it */
+ if (t->check && (ret = t->check(newinfo, repl->valid_hooks)))
+ goto free_unlock;
+
+ if (repl->num_counters && repl->num_counters != t->private->nentries) {
+ BUGPRINT("Wrong nr. of counters requested\n");
+ ret = -EINVAL;
+ goto free_unlock;
+ }
+
+ /* we have the mutex lock, so no danger in reading this pointer */
+ table = t->private;
+ /* make sure the table can only be rmmod'ed if it contains no rules */
+ if (!table->nentries && newinfo->nentries && !try_module_get(t->me)) {
+ ret = -ENOENT;
+ goto free_unlock;
+ } else if (table->nentries && !newinfo->nentries)
+ module_put(t->me);
+ /* we need an atomic snapshot of the counters */
+ write_lock_bh(&t->lock);
+ if (repl->num_counters)
+ get_counters(t->private->counters, counterstmp,
+ t->private->nentries);
+
+ t->private = newinfo;
+ write_unlock_bh(&t->lock);
+ mutex_unlock(&ebt_mutex);
+ /* so, a user can change the chains while having messed up her counter
+ allocation. Only reason why this is done is because this way the lock
+ is held only once, while this doesn't bring the kernel into a
+ dangerous state. */
+ if (repl->num_counters &&
+ copy_to_user(repl->counters, counterstmp,
+ repl->num_counters * sizeof(struct ebt_counter))) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
+ }
+
+ /* decrease module count and free resources */
+ EBT_ENTRY_ITERATE(table->entries, table->entries_size,
+ ebt_cleanup_entry, net, NULL);
+
+ vfree(table->entries);
+ if (table->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(table->chainstack[i]);
+ vfree(table->chainstack);
+ }
+ vfree(table);
+
+ vfree(counterstmp);
+
+#ifdef CONFIG_AUDIT
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_NETFILTER_CFG);
+ if (ab) {
+ audit_log_format(ab, "table=%s family=%u entries=%u",
+ repl->name, AF_BRIDGE, repl->nentries);
+ audit_log_end(ab);
+ }
+ }
+#endif
+ return ret;
+
+free_unlock:
+ mutex_unlock(&ebt_mutex);
+free_iterate:
+ EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+ ebt_cleanup_entry, net, NULL);
+free_counterstmp:
+ vfree(counterstmp);
+ /* can be initialized in translate_table() */
+ if (newinfo->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(newinfo->chainstack[i]);
+ vfree(newinfo->chainstack);
+ }
+ return ret;
+}
+
+/* replace the table */
+static int do_replace(struct net *net, const void __user *user,
+ unsigned int len)
+{
+ int ret, countersize;
+ struct ebt_table_info *newinfo;
+ struct ebt_replace tmp;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.entries_size) {
+ BUGPRINT("Wrong len argument\n");
+ return -EINVAL;
+ }
+
+ if (tmp.entries_size == 0) {
+ BUGPRINT("Entries_size never zero\n");
+ return -EINVAL;
+ }
+ /* overflow check */
+ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+ NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+ return -ENOMEM;
+
+ tmp.name[sizeof(tmp.name) - 1] = 0;
+
+ countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+ newinfo = vmalloc(sizeof(*newinfo) + countersize);
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (countersize)
+ memset(newinfo->counters, 0, countersize);
+
+ newinfo->entries = vmalloc(tmp.entries_size);
+ if (!newinfo->entries) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ if (copy_from_user(
+ newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
+ BUGPRINT("Couldn't copy entries from userspace\n");
+ ret = -EFAULT;
+ goto free_entries;
+ }
+
+ ret = do_replace_finish(net, &tmp, newinfo);
+ if (ret == 0)
+ return ret;
+free_entries:
+ vfree(newinfo->entries);
+free_newinfo:
+ vfree(newinfo);
+ return ret;
+}
+
+struct ebt_table *
+ebt_register_table(struct net *net, const struct ebt_table *input_table)
+{
+ struct ebt_table_info *newinfo;
+ struct ebt_table *t, *table;
+ struct ebt_replace_kernel *repl;
+ int ret, i, countersize;
+ void *p;
+
+ if (input_table == NULL || (repl = input_table->table) == NULL ||
+ repl->entries == NULL || repl->entries_size == 0 ||
+ repl->counters != NULL || input_table->private != NULL) {
+ BUGPRINT("Bad table data for ebt_register_table!!!\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Don't add one table to multiple lists. */
+ table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL);
+ if (!table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
+ newinfo = vmalloc(sizeof(*newinfo) + countersize);
+ ret = -ENOMEM;
+ if (!newinfo)
+ goto free_table;
+
+ p = vmalloc(repl->entries_size);
+ if (!p)
+ goto free_newinfo;
+
+ memcpy(p, repl->entries, repl->entries_size);
+ newinfo->entries = p;
+
+ newinfo->entries_size = repl->entries_size;
+ newinfo->nentries = repl->nentries;
+
+ if (countersize)
+ memset(newinfo->counters, 0, countersize);
+
+ /* fill in newinfo and parse the entries */
+ newinfo->chainstack = NULL;
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ if ((repl->valid_hooks & (1 << i)) == 0)
+ newinfo->hook_entry[i] = NULL;
+ else
+ newinfo->hook_entry[i] = p +
+ ((char *)repl->hook_entry[i] - repl->entries);
+ }
+ ret = translate_table(net, repl->name, newinfo);
+ if (ret != 0) {
+ BUGPRINT("Translate_table failed\n");
+ goto free_chainstack;
+ }
+
+ if (table->check && table->check(newinfo, table->valid_hooks)) {
+ BUGPRINT("The table doesn't like its own initial data, lol\n");
+ ret = -EINVAL;
+ goto free_chainstack;
+ }
+
+ table->private = newinfo;
+ rwlock_init(&table->lock);
+ mutex_lock(&ebt_mutex);
+ list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
+ if (strcmp(t->name, table->name) == 0) {
+ ret = -EEXIST;
+ BUGPRINT("Table name already exists\n");
+ goto free_unlock;
+ }
+ }
+
+ /* Hold a reference count if the chains aren't empty */
+ if (newinfo->nentries && !try_module_get(table->me)) {
+ ret = -ENOENT;
+ goto free_unlock;
+ }
+ list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
+ mutex_unlock(&ebt_mutex);
+ return table;
+free_unlock:
+ mutex_unlock(&ebt_mutex);
+free_chainstack:
+ if (newinfo->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(newinfo->chainstack[i]);
+ vfree(newinfo->chainstack);
+ }
+ vfree(newinfo->entries);
+free_newinfo:
+ vfree(newinfo);
+free_table:
+ kfree(table);
+out:
+ return ERR_PTR(ret);
+}
+
+void ebt_unregister_table(struct net *net, struct ebt_table *table)
+{
+ int i;
+
+ if (!table) {
+ BUGPRINT("Request to unregister NULL table!!!\n");
+ return;
+ }
+ mutex_lock(&ebt_mutex);
+ list_del(&table->list);
+ mutex_unlock(&ebt_mutex);
+ EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
+ ebt_cleanup_entry, net, NULL);
+ if (table->private->nentries)
+ module_put(table->me);
+ vfree(table->private->entries);
+ if (table->private->chainstack) {
+ for_each_possible_cpu(i)
+ vfree(table->private->chainstack[i]);
+ vfree(table->private->chainstack);
+ }
+ vfree(table->private);
+ kfree(table);
+}
+
+/* userspace just supplied us with counters */
+static int do_update_counters(struct net *net, const char *name,
+ struct ebt_counter __user *counters,
+ unsigned int num_counters,
+ const void __user *user, unsigned int len)
+{
+ int i, ret;
+ struct ebt_counter *tmp;
+ struct ebt_table *t;
+
+ if (num_counters == 0)
+ return -EINVAL;
+
+ tmp = vmalloc(num_counters * sizeof(*tmp));
+ if (!tmp)
+ return -ENOMEM;
+
+ t = find_table_lock(net, name, &ret, &ebt_mutex);
+ if (!t)
+ goto free_tmp;
+
+ if (num_counters != t->private->nentries) {
+ BUGPRINT("Wrong nr of counters\n");
+ ret = -EINVAL;
+ goto unlock_mutex;
+ }
+
+ if (copy_from_user(tmp, counters, num_counters * sizeof(*counters))) {
+ ret = -EFAULT;
+ goto unlock_mutex;
+ }
+
+ /* we want an atomic add of the counters */
+ write_lock_bh(&t->lock);
+
+ /* we add to the counters of the first cpu */
+ for (i = 0; i < num_counters; i++) {
+ t->private->counters[i].pcnt += tmp[i].pcnt;
+ t->private->counters[i].bcnt += tmp[i].bcnt;
+ }
+
+ write_unlock_bh(&t->lock);
+ ret = 0;
+unlock_mutex:
+ mutex_unlock(&ebt_mutex);
+free_tmp:
+ vfree(tmp);
+ return ret;
+}
+
+static int update_counters(struct net *net, const void __user *user,
+ unsigned int len)
+{
+ struct ebt_replace hlp;
+
+ if (copy_from_user(&hlp, user, sizeof(hlp)))
+ return -EFAULT;
+
+ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
+ return -EINVAL;
+
+ return do_update_counters(net, hlp.name, hlp.counters,
+ hlp.num_counters, user, len);
+}
+
+static inline int ebt_make_matchname(const struct ebt_entry_match *m,
+ const char *base, char __user *ubase)
+{
+ char __user *hlp = ubase + ((char *)m - base);
+ char name[EBT_FUNCTION_MAXNAMELEN] = {};
+
+ /* ebtables expects 32 bytes long names but xt_match names are 29 bytes
+ long. Copy 29 bytes and fill remaining bytes with zeroes. */
+ strlcpy(name, m->u.match->name, sizeof(name));
+ if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
+ return -EFAULT;
+ return 0;
+}
+
+static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
+ const char *base, char __user *ubase)
+{
+ char __user *hlp = ubase + ((char *)w - base);
+ char name[EBT_FUNCTION_MAXNAMELEN] = {};
+
+ strlcpy(name, w->u.watcher->name, sizeof(name));
+ if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN))
+ return -EFAULT;
+ return 0;
+}
+
+static inline int
+ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
+{
+ int ret;
+ char __user *hlp;
+ const struct ebt_entry_target *t;
+ char name[EBT_FUNCTION_MAXNAMELEN] = {};
+
+ if (e->bitmask == 0)
+ return 0;
+
+ hlp = ubase + (((char *)e + e->target_offset) - base);
+ t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+
+ ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+ if (ret != 0)
+ return ret;
+ ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+ if (ret != 0)
+ return ret;
+ strlcpy(name, t->u.target->name, sizeof(name));
+ if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
+ return -EFAULT;
+ return 0;
+}
+
+static int copy_counters_to_user(struct ebt_table *t,
+ const struct ebt_counter *oldcounters,
+ void __user *user, unsigned int num_counters,
+ unsigned int nentries)
+{
+ struct ebt_counter *counterstmp;
+ int ret = 0;
+
+ /* userspace might not need the counters */
+ if (num_counters == 0)
+ return 0;
+
+ if (num_counters != nentries) {
+ BUGPRINT("Num_counters wrong\n");
+ return -EINVAL;
+ }
+
+ counterstmp = vmalloc(nentries * sizeof(*counterstmp));
+ if (!counterstmp)
+ return -ENOMEM;
+
+ write_lock_bh(&t->lock);
+ get_counters(oldcounters, counterstmp, nentries);
+ write_unlock_bh(&t->lock);
+
+ if (copy_to_user(user, counterstmp,
+ nentries * sizeof(struct ebt_counter)))
+ ret = -EFAULT;
+ vfree(counterstmp);
+ return ret;
+}
+
+/* called with ebt_mutex locked */
+static int copy_everything_to_user(struct ebt_table *t, void __user *user,
+ const int *len, int cmd)
+{
+ struct ebt_replace tmp;
+ const struct ebt_counter *oldcounters;
+ unsigned int entries_size, nentries;
+ int ret;
+ char *entries;
+
+ if (cmd == EBT_SO_GET_ENTRIES) {
+ entries_size = t->private->entries_size;
+ nentries = t->private->nentries;
+ entries = t->private->entries;
+ oldcounters = t->private->counters;
+ } else {
+ entries_size = t->table->entries_size;
+ nentries = t->table->nentries;
+ entries = t->table->entries;
+ oldcounters = t->table->counters;
+ }
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ if (*len != sizeof(struct ebt_replace) + entries_size +
+ (tmp.num_counters ? nentries * sizeof(struct ebt_counter) : 0))
+ return -EINVAL;
+
+ if (tmp.nentries != nentries) {
+ BUGPRINT("Nentries wrong\n");
+ return -EINVAL;
+ }
+
+ if (tmp.entries_size != entries_size) {
+ BUGPRINT("Wrong size\n");
+ return -EINVAL;
+ }
+
+ ret = copy_counters_to_user(t, oldcounters, tmp.counters,
+ tmp.num_counters, nentries);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(tmp.entries, entries, entries_size)) {
+ BUGPRINT("Couldn't copy entries to userspace\n");
+ return -EFAULT;
+ }
+ /* set the match/watcher/target names right */
+ return EBT_ENTRY_ITERATE(entries, entries_size,
+ ebt_make_names, entries, tmp.entries);
+}
+
+static int do_ebt_set_ctl(struct sock *sk,
+ int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+ struct net *net = sock_net(sk);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case EBT_SO_SET_ENTRIES:
+ ret = do_replace(net, user, len);
+ break;
+ case EBT_SO_SET_COUNTERS:
+ ret = update_counters(net, user, len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+ struct ebt_replace tmp;
+ struct ebt_table *t;
+ struct net *net = sock_net(sk);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
+ if (!t)
+ return ret;
+
+ switch (cmd) {
+ case EBT_SO_GET_INFO:
+ case EBT_SO_GET_INIT_INFO:
+ if (*len != sizeof(struct ebt_replace)) {
+ ret = -EINVAL;
+ mutex_unlock(&ebt_mutex);
+ break;
+ }
+ if (cmd == EBT_SO_GET_INFO) {
+ tmp.nentries = t->private->nentries;
+ tmp.entries_size = t->private->entries_size;
+ tmp.valid_hooks = t->valid_hooks;
+ } else {
+ tmp.nentries = t->table->nentries;
+ tmp.entries_size = t->table->entries_size;
+ tmp.valid_hooks = t->table->valid_hooks;
+ }
+ mutex_unlock(&ebt_mutex);
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ BUGPRINT("c2u Didn't work\n");
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
+ break;
+
+ case EBT_SO_GET_ENTRIES:
+ case EBT_SO_GET_INIT_ENTRIES:
+ ret = copy_everything_to_user(t, user, len, cmd);
+ mutex_unlock(&ebt_mutex);
+ break;
+
+ default:
+ mutex_unlock(&ebt_mutex);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/* 32 bit-userspace compatibility definitions. */
+struct compat_ebt_replace {
+ char name[EBT_TABLE_MAXNAMELEN];
+ compat_uint_t valid_hooks;
+ compat_uint_t nentries;
+ compat_uint_t entries_size;
+ /* start of the chains */
+ compat_uptr_t hook_entry[NF_BR_NUMHOOKS];
+ /* nr of counters userspace expects back */
+ compat_uint_t num_counters;
+ /* where the kernel will put the old counters. */
+ compat_uptr_t counters;
+ compat_uptr_t entries;
+};
+
+/* struct ebt_entry_match, _target and _watcher have same layout */
+struct compat_ebt_entry_mwt {
+ union {
+ char name[EBT_FUNCTION_MAXNAMELEN];
+ compat_uptr_t ptr;
+ } u;
+ compat_uint_t match_size;
+ compat_uint_t data[0];
+};
+
+/* account for possible padding between match_size and ->data */
+static int ebt_compat_entry_padsize(void)
+{
+ BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) <
+ COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)));
+ return (int) XT_ALIGN(sizeof(struct ebt_entry_match)) -
+ COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt));
+}
+
+static int ebt_compat_match_offset(const struct xt_match *match,
+ unsigned int userlen)
+{
+ /*
+ * ebt_among needs special handling. The kernel .matchsize is
+ * set to -1 at registration time; at runtime an EBT_ALIGN()ed
+ * value is expected.
+ * Example: userspace sends 4500, ebt_among.c wants 4504.
+ */
+ if (unlikely(match->matchsize == -1))
+ return XT_ALIGN(userlen) - COMPAT_XT_ALIGN(userlen);
+ return xt_compat_match_offset(match);
+}
+
+static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
+ unsigned int *size)
+{
+ const struct xt_match *match = m->u.match;
+ struct compat_ebt_entry_mwt __user *cm = *dstptr;
+ int off = ebt_compat_match_offset(match, m->match_size);
+ compat_uint_t msize = m->match_size - off;
+
+ BUG_ON(off >= m->match_size);
+
+ if (copy_to_user(cm->u.name, match->name,
+ strlen(match->name) + 1) || put_user(msize, &cm->match_size))
+ return -EFAULT;
+
+ if (match->compat_to_user) {
+ if (match->compat_to_user(cm->data, m->data))
+ return -EFAULT;
+ } else if (copy_to_user(cm->data, m->data, msize))
+ return -EFAULT;
+
+ *size -= ebt_compat_entry_padsize() + off;
+ *dstptr = cm->data;
+ *dstptr += msize;
+ return 0;
+}
+
+static int compat_target_to_user(struct ebt_entry_target *t,
+ void __user **dstptr,
+ unsigned int *size)
+{
+ const struct xt_target *target = t->u.target;
+ struct compat_ebt_entry_mwt __user *cm = *dstptr;
+ int off = xt_compat_target_offset(target);
+ compat_uint_t tsize = t->target_size - off;
+
+ BUG_ON(off >= t->target_size);
+
+ if (copy_to_user(cm->u.name, target->name,
+ strlen(target->name) + 1) || put_user(tsize, &cm->match_size))
+ return -EFAULT;
+
+ if (target->compat_to_user) {
+ if (target->compat_to_user(cm->data, t->data))
+ return -EFAULT;
+ } else if (copy_to_user(cm->data, t->data, tsize))
+ return -EFAULT;
+
+ *size -= ebt_compat_entry_padsize() + off;
+ *dstptr = cm->data;
+ *dstptr += tsize;
+ return 0;
+}
+
+static int compat_watcher_to_user(struct ebt_entry_watcher *w,
+ void __user **dstptr,
+ unsigned int *size)
+{
+ return compat_target_to_user((struct ebt_entry_target *)w,
+ dstptr, size);
+}
+
+static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
+ unsigned int *size)
+{
+ struct ebt_entry_target *t;
+ struct ebt_entry __user *ce;
+ u32 watchers_offset, target_offset, next_offset;
+ compat_uint_t origsize;
+ int ret;
+
+ if (e->bitmask == 0) {
+ if (*size < sizeof(struct ebt_entries))
+ return -EINVAL;
+ if (copy_to_user(*dstptr, e, sizeof(struct ebt_entries)))
+ return -EFAULT;
+
+ *dstptr += sizeof(struct ebt_entries);
+ *size -= sizeof(struct ebt_entries);
+ return 0;
+ }
+
+ if (*size < sizeof(*ce))
+ return -EINVAL;
+
+ ce = (struct ebt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(*ce)))
+ return -EFAULT;
+
+ origsize = *size;
+ *dstptr += sizeof(*ce);
+
+ ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size);
+ if (ret)
+ return ret;
+ watchers_offset = e->watchers_offset - (origsize - *size);
+
+ ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size);
+ if (ret)
+ return ret;
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+
+ ret = compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+
+ if (put_user(watchers_offset, &ce->watchers_offset) ||
+ put_user(target_offset, &ce->target_offset) ||
+ put_user(next_offset, &ce->next_offset))
+ return -EFAULT;
+
+ *size -= sizeof(*ce);
+ return 0;
+}
+
+static int compat_calc_match(struct ebt_entry_match *m, int *off)
+{
+ *off += ebt_compat_match_offset(m->u.match, m->match_size);
+ *off += ebt_compat_entry_padsize();
+ return 0;
+}
+
+static int compat_calc_watcher(struct ebt_entry_watcher *w, int *off)
+{
+ *off += xt_compat_target_offset(w->u.watcher);
+ *off += ebt_compat_entry_padsize();
+ return 0;
+}
+
+static int compat_calc_entry(const struct ebt_entry *e,
+ const struct ebt_table_info *info,
+ const void *base,
+ struct compat_ebt_replace *newinfo)
+{
+ const struct ebt_entry_target *t;
+ unsigned int entry_offset;
+ int off, ret, i;
+
+ if (e->bitmask == 0)
+ return 0;
+
+ off = 0;
+ entry_offset = (void *)e - base;
+
+ EBT_MATCH_ITERATE(e, compat_calc_match, &off);
+ EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
+
+ t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+
+ off += xt_compat_target_offset(t->u.target);
+ off += ebt_compat_entry_padsize();
+
+ newinfo->entries_size -= off;
+
+ ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ const void *hookptr = info->hook_entry[i];
+ if (info->hook_entry[i] &&
+ (e < (struct ebt_entry *)(base - hookptr))) {
+ newinfo->hook_entry[i] -= off;
+ pr_debug("0x%08X -> 0x%08X\n",
+ newinfo->hook_entry[i] + off,
+ newinfo->hook_entry[i]);
+ }
+ }
+
+ return 0;
+}
+
+
+static int compat_table_info(const struct ebt_table_info *info,
+ struct compat_ebt_replace *newinfo)
+{
+ unsigned int size = info->entries_size;
+ const void *entries = info->entries;
+
+ newinfo->entries_size = size;
+
+ xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
+ return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
+ entries, newinfo);
+}
+
+static int compat_copy_everything_to_user(struct ebt_table *t,
+ void __user *user, int *len, int cmd)
+{
+ struct compat_ebt_replace repl, tmp;
+ struct ebt_counter *oldcounters;
+ struct ebt_table_info tinfo;
+ int ret;
+ void __user *pos;
+
+ memset(&tinfo, 0, sizeof(tinfo));
+
+ if (cmd == EBT_SO_GET_ENTRIES) {
+ tinfo.entries_size = t->private->entries_size;
+ tinfo.nentries = t->private->nentries;
+ tinfo.entries = t->private->entries;
+ oldcounters = t->private->counters;
+ } else {
+ tinfo.entries_size = t->table->entries_size;
+ tinfo.nentries = t->table->nentries;
+ tinfo.entries = t->table->entries;
+ oldcounters = t->table->counters;
+ }
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.nentries != tinfo.nentries ||
+ (tmp.num_counters && tmp.num_counters != tinfo.nentries))
+ return -EINVAL;
+
+ memcpy(&repl, &tmp, sizeof(repl));
+ if (cmd == EBT_SO_GET_ENTRIES)
+ ret = compat_table_info(t->private, &repl);
+ else
+ ret = compat_table_info(&tinfo, &repl);
+ if (ret)
+ return ret;
+
+ if (*len != sizeof(tmp) + repl.entries_size +
+ (tmp.num_counters? tinfo.nentries * sizeof(struct ebt_counter): 0)) {
+ pr_err("wrong size: *len %d, entries_size %u, replsz %d\n",
+ *len, tinfo.entries_size, repl.entries_size);
+ return -EINVAL;
+ }
+
+ /* userspace might not need the counters */
+ ret = copy_counters_to_user(t, oldcounters, compat_ptr(tmp.counters),
+ tmp.num_counters, tinfo.nentries);
+ if (ret)
+ return ret;
+
+ pos = compat_ptr(tmp.entries);
+ return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size,
+ compat_copy_entry_to_user, &pos, &tmp.entries_size);
+}
+
+struct ebt_entries_buf_state {
+ char *buf_kern_start; /* kernel buffer to copy (translated) data to */
+ u32 buf_kern_len; /* total size of kernel buffer */
+ u32 buf_kern_offset; /* amount of data copied so far */
+ u32 buf_user_offset; /* read position in userspace buffer */
+};
+
+static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+ state->buf_kern_offset += sz;
+ return state->buf_kern_offset >= sz ? 0 : -EINVAL;
+}
+
+static int ebt_buf_add(struct ebt_entries_buf_state *state,
+ void *data, unsigned int sz)
+{
+ if (state->buf_kern_start == NULL)
+ goto count_only;
+
+ BUG_ON(state->buf_kern_offset + sz > state->buf_kern_len);
+
+ memcpy(state->buf_kern_start + state->buf_kern_offset, data, sz);
+
+ count_only:
+ state->buf_user_offset += sz;
+ return ebt_buf_count(state, sz);
+}
+
+static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+ char *b = state->buf_kern_start;
+
+ BUG_ON(b && state->buf_kern_offset > state->buf_kern_len);
+
+ if (b != NULL && sz > 0)
+ memset(b + state->buf_kern_offset, 0, sz);
+ /* do not adjust ->buf_user_offset here, we added kernel-side padding */
+ return ebt_buf_count(state, sz);
+}
+
+enum compat_mwt {
+ EBT_COMPAT_MATCH,
+ EBT_COMPAT_WATCHER,
+ EBT_COMPAT_TARGET,
+};
+
+static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
+ enum compat_mwt compat_mwt,
+ struct ebt_entries_buf_state *state,
+ const unsigned char *base)
+{
+ char name[EBT_FUNCTION_MAXNAMELEN];
+ struct xt_match *match;
+ struct xt_target *wt;
+ void *dst = NULL;
+ int off, pad = 0;
+ unsigned int size_kern, match_size = mwt->match_size;
+
+ strlcpy(name, mwt->u.name, sizeof(name));
+
+ if (state->buf_kern_start)
+ dst = state->buf_kern_start + state->buf_kern_offset;
+
+ switch (compat_mwt) {
+ case EBT_COMPAT_MATCH:
+ match = xt_request_find_match(NFPROTO_BRIDGE, name, 0);
+ if (IS_ERR(match))
+ return PTR_ERR(match);
+
+ off = ebt_compat_match_offset(match, match_size);
+ if (dst) {
+ if (match->compat_from_user)
+ match->compat_from_user(dst, mwt->data);
+ else
+ memcpy(dst, mwt->data, match_size);
+ }
+
+ size_kern = match->matchsize;
+ if (unlikely(size_kern == -1))
+ size_kern = match_size;
+ module_put(match->me);
+ break;
+ case EBT_COMPAT_WATCHER: /* fallthrough */
+ case EBT_COMPAT_TARGET:
+ wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0);
+ if (IS_ERR(wt))
+ return PTR_ERR(wt);
+ off = xt_compat_target_offset(wt);
+
+ if (dst) {
+ if (wt->compat_from_user)
+ wt->compat_from_user(dst, mwt->data);
+ else
+ memcpy(dst, mwt->data, match_size);
+ }
+
+ size_kern = wt->targetsize;
+ module_put(wt->me);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ state->buf_kern_offset += match_size + off;
+ state->buf_user_offset += match_size;
+ pad = XT_ALIGN(size_kern) - size_kern;
+
+ if (pad > 0 && dst) {
+ BUG_ON(state->buf_kern_len <= pad);
+ BUG_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad);
+ memset(dst + size_kern, 0, pad);
+ }
+ return off + match_size;
+}
+
+/*
+ * return size of all matches, watchers or target, including necessary
+ * alignment and padding.
+ */
+static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
+ unsigned int size_left, enum compat_mwt type,
+ struct ebt_entries_buf_state *state, const void *base)
+{
+ int growth = 0;
+ char *buf;
+
+ if (size_left == 0)
+ return 0;
+
+ buf = (char *) match32;
+
+ while (size_left >= sizeof(*match32)) {
+ struct ebt_entry_match *match_kern;
+ int ret;
+
+ match_kern = (struct ebt_entry_match *) state->buf_kern_start;
+ if (match_kern) {
+ char *tmp;
+ tmp = state->buf_kern_start + state->buf_kern_offset;
+ match_kern = (struct ebt_entry_match *) tmp;
+ }
+ ret = ebt_buf_add(state, buf, sizeof(*match32));
+ if (ret < 0)
+ return ret;
+ size_left -= sizeof(*match32);
+
+ /* add padding before match->data (if any) */
+ ret = ebt_buf_add_pad(state, ebt_compat_entry_padsize());
+ if (ret < 0)
+ return ret;
+
+ if (match32->match_size > size_left)
+ return -EINVAL;
+
+ size_left -= match32->match_size;
+
+ ret = compat_mtw_from_user(match32, type, state, base);
+ if (ret < 0)
+ return ret;
+
+ BUG_ON(ret < match32->match_size);
+ growth += ret - match32->match_size;
+ growth += ebt_compat_entry_padsize();
+
+ buf += sizeof(*match32);
+ buf += match32->match_size;
+
+ if (match_kern)
+ match_kern->match_size = ret;
+
+ WARN_ON(type == EBT_COMPAT_TARGET && size_left);
+ match32 = (struct compat_ebt_entry_mwt *) buf;
+ }
+
+ return growth;
+}
+
+/* called for all ebt_entry structures. */
+static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
+ unsigned int *total,
+ struct ebt_entries_buf_state *state)
+{
+ unsigned int i, j, startoff, new_offset = 0;
+ /* stores match/watchers/targets & offset of next struct ebt_entry: */
+ unsigned int offsets[4];
+ unsigned int *offsets_update = NULL;
+ int ret;
+ char *buf_start;
+
+ if (*total < sizeof(struct ebt_entries))
+ return -EINVAL;
+
+ if (!entry->bitmask) {
+ *total -= sizeof(struct ebt_entries);
+ return ebt_buf_add(state, entry, sizeof(struct ebt_entries));
+ }
+ if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry))
+ return -EINVAL;
+
+ startoff = state->buf_user_offset;
+ /* pull in most part of ebt_entry, it does not need to be changed. */
+ ret = ebt_buf_add(state, entry,
+ offsetof(struct ebt_entry, watchers_offset));
+ if (ret < 0)
+ return ret;
+
+ offsets[0] = sizeof(struct ebt_entry); /* matches come first */
+ memcpy(&offsets[1], &entry->watchers_offset,
+ sizeof(offsets) - sizeof(offsets[0]));
+
+ if (state->buf_kern_start) {
+ buf_start = state->buf_kern_start + state->buf_kern_offset;
+ offsets_update = (unsigned int *) buf_start;
+ }
+ ret = ebt_buf_add(state, &offsets[1],
+ sizeof(offsets) - sizeof(offsets[0]));
+ if (ret < 0)
+ return ret;
+ buf_start = (char *) entry;
+ /*
+ * 0: matches offset, always follows ebt_entry.
+ * 1: watchers offset, from ebt_entry structure
+ * 2: target offset, from ebt_entry structure
+ * 3: next ebt_entry offset, from ebt_entry structure
+ *
+ * offsets are relative to beginning of struct ebt_entry (i.e., 0).
+ */
+ for (i = 0, j = 1 ; j < 4 ; j++, i++) {
+ struct compat_ebt_entry_mwt *match32;
+ unsigned int size;
+ char *buf = buf_start;
+
+ buf = buf_start + offsets[i];
+ if (offsets[i] > offsets[j])
+ return -EINVAL;
+
+ match32 = (struct compat_ebt_entry_mwt *) buf;
+ size = offsets[j] - offsets[i];
+ ret = ebt_size_mwt(match32, size, i, state, base);
+ if (ret < 0)
+ return ret;
+ new_offset += ret;
+ if (offsets_update && new_offset) {
+ pr_debug("change offset %d to %d\n",
+ offsets_update[i], offsets[j] + new_offset);
+ offsets_update[i] = offsets[j] + new_offset;
+ }
+ }
+
+ if (state->buf_kern_start == NULL) {
+ unsigned int offset = buf_start - (char *) base;
+
+ ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset);
+ if (ret < 0)
+ return ret;
+ }
+
+ startoff = state->buf_user_offset - startoff;
+
+ BUG_ON(*total < startoff);
+ *total -= startoff;
+ return 0;
+}
+
+/*
+ * repl->entries_size is the size of the ebt_entry blob in userspace.
+ * It might need more memory when copied to a 64 bit kernel in case
+ * userspace is 32-bit. So, first task: find out how much memory is needed.
+ *
+ * Called before validation is performed.
+ */
+static int compat_copy_entries(unsigned char *data, unsigned int size_user,
+ struct ebt_entries_buf_state *state)
+{
+ unsigned int size_remaining = size_user;
+ int ret;
+
+ ret = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data,
+ &size_remaining, state);
+ if (ret < 0)
+ return ret;
+
+ WARN_ON(size_remaining);
+ return state->buf_kern_offset;
+}
+
+
+static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
+ void __user *user, unsigned int len)
+{
+ struct compat_ebt_replace tmp;
+ int i;
+
+ if (len < sizeof(tmp))
+ return -EINVAL;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ if (len != sizeof(tmp) + tmp.entries_size)
+ return -EINVAL;
+
+ if (tmp.entries_size == 0)
+ return -EINVAL;
+
+ if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+ NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+ return -ENOMEM;
+
+ memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry));
+
+ /* starting with hook_entry, 32 vs. 64 bit structures are different */
+ for (i = 0; i < NF_BR_NUMHOOKS; i++)
+ repl->hook_entry[i] = compat_ptr(tmp.hook_entry[i]);
+
+ repl->num_counters = tmp.num_counters;
+ repl->counters = compat_ptr(tmp.counters);
+ repl->entries = compat_ptr(tmp.entries);
+ return 0;
+}
+
+static int compat_do_replace(struct net *net, void __user *user,
+ unsigned int len)
+{
+ int ret, i, countersize, size64;
+ struct ebt_table_info *newinfo;
+ struct ebt_replace tmp;
+ struct ebt_entries_buf_state state;
+ void *entries_tmp;
+
+ ret = compat_copy_ebt_replace_from_user(&tmp, user, len);
+ if (ret) {
+ /* try real handler in case userland supplied needed padding */
+ if (ret == -EINVAL && do_replace(net, user, len) == 0)
+ ret = 0;
+ return ret;
+ }
+
+ countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+ newinfo = vmalloc(sizeof(*newinfo) + countersize);
+ if (!newinfo)
+ return -ENOMEM;
+
+ if (countersize)
+ memset(newinfo->counters, 0, countersize);
+
+ memset(&state, 0, sizeof(state));
+
+ newinfo->entries = vmalloc(tmp.entries_size);
+ if (!newinfo->entries) {
+ ret = -ENOMEM;
+ goto free_newinfo;
+ }
+ if (copy_from_user(
+ newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
+ ret = -EFAULT;
+ goto free_entries;
+ }
+
+ entries_tmp = newinfo->entries;
+
+ xt_compat_lock(NFPROTO_BRIDGE);
+
+ xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+ ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+ if (ret < 0)
+ goto out_unlock;
+
+ pr_debug("tmp.entries_size %d, kern off %d, user off %d delta %d\n",
+ tmp.entries_size, state.buf_kern_offset, state.buf_user_offset,
+ xt_compat_calc_jump(NFPROTO_BRIDGE, tmp.entries_size));
+
+ size64 = ret;
+ newinfo->entries = vmalloc(size64);
+ if (!newinfo->entries) {
+ vfree(entries_tmp);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memset(&state, 0, sizeof(state));
+ state.buf_kern_start = newinfo->entries;
+ state.buf_kern_len = size64;
+
+ ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+ BUG_ON(ret < 0); /* parses same data again */
+
+ vfree(entries_tmp);
+ tmp.entries_size = size64;
+
+ for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+ char __user *usrptr;
+ if (tmp.hook_entry[i]) {
+ unsigned int delta;
+ usrptr = (char __user *) tmp.hook_entry[i];
+ delta = usrptr - tmp.entries;
+ usrptr += xt_compat_calc_jump(NFPROTO_BRIDGE, delta);
+ tmp.hook_entry[i] = (struct ebt_entries __user *)usrptr;
+ }
+ }
+
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+
+ ret = do_replace_finish(net, &tmp, newinfo);
+ if (ret == 0)
+ return ret;
+free_entries:
+ vfree(newinfo->entries);
+free_newinfo:
+ vfree(newinfo);
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+ goto free_entries;
+}
+
+static int compat_update_counters(struct net *net, void __user *user,
+ unsigned int len)
+{
+ struct compat_ebt_replace hlp;
+
+ if (copy_from_user(&hlp, user, sizeof(hlp)))
+ return -EFAULT;
+
+ /* try real handler in case userland supplied needed padding */
+ if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
+ return update_counters(net, user, len);
+
+ return do_update_counters(net, hlp.name, compat_ptr(hlp.counters),
+ hlp.num_counters, user, len);
+}
+
+static int compat_do_ebt_set_ctl(struct sock *sk,
+ int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+ struct net *net = sock_net(sk);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case EBT_SO_SET_ENTRIES:
+ ret = compat_do_replace(net, user, len);
+ break;
+ case EBT_SO_SET_COUNTERS:
+ ret = compat_update_counters(net, user, len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
+ void __user *user, int *len)
+{
+ int ret;
+ struct compat_ebt_replace tmp;
+ struct ebt_table *t;
+ struct net *net = sock_net(sk);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* try real handler in case userland supplied needed padding */
+ if ((cmd == EBT_SO_GET_INFO ||
+ cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp))
+ return do_ebt_get_ctl(sk, cmd, user, len);
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
+ if (!t)
+ return ret;
+
+ xt_compat_lock(NFPROTO_BRIDGE);
+ switch (cmd) {
+ case EBT_SO_GET_INFO:
+ tmp.nentries = t->private->nentries;
+ ret = compat_table_info(t->private, &tmp);
+ if (ret)
+ goto out;
+ tmp.valid_hooks = t->valid_hooks;
+
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
+ break;
+ case EBT_SO_GET_INIT_INFO:
+ tmp.nentries = t->table->nentries;
+ tmp.entries_size = t->table->entries_size;
+ tmp.valid_hooks = t->table->valid_hooks;
+
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
+ break;
+ case EBT_SO_GET_ENTRIES:
+ case EBT_SO_GET_INIT_ENTRIES:
+ /*
+ * try real handler first in case of userland-side padding.
+ * in case we are dealing with an 'ordinary' 32 bit binary
+ * without 64bit compatibility padding, this will fail right
+ * after copy_from_user when the *len argument is validated.
+ *
+ * the compat_ variant needs to do one pass over the kernel
+ * data set to adjust for size differences before it the check.
+ */
+ if (copy_everything_to_user(t, user, len, cmd) == 0)
+ ret = 0;
+ else
+ ret = compat_copy_everything_to_user(t, user, len, cmd);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ out:
+ xt_compat_flush_offsets(NFPROTO_BRIDGE);
+ xt_compat_unlock(NFPROTO_BRIDGE);
+ mutex_unlock(&ebt_mutex);
+ return ret;
+}
+#endif
+
+static struct nf_sockopt_ops ebt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = EBT_BASE_CTL,
+ .set_optmax = EBT_SO_SET_MAX + 1,
+ .set = do_ebt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_ebt_set_ctl,
+#endif
+ .get_optmin = EBT_BASE_CTL,
+ .get_optmax = EBT_SO_GET_MAX + 1,
+ .get = do_ebt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_ebt_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static int __init ebtables_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&ebt_standard_target);
+ if (ret < 0)
+ return ret;
+ ret = nf_register_sockopt(&ebt_sockopts);
+ if (ret < 0) {
+ xt_unregister_target(&ebt_standard_target);
+ return ret;
+ }
+
+ printk(KERN_INFO "Ebtables v2.0 registered\n");
+ return 0;
+}
+
+static void __exit ebtables_fini(void)
+{
+ nf_unregister_sockopt(&ebt_sockopts);
+ xt_unregister_target(&ebt_standard_target);
+ printk(KERN_INFO "Ebtables v2.0 unregistered\n");
+}
+
+EXPORT_SYMBOL(ebt_register_table);
+EXPORT_SYMBOL(ebt_unregister_table);
+EXPORT_SYMBOL(ebt_do_table);
+module_init(ebtables_init);
+module_exit(ebtables_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
new file mode 100644
index 000000000..5d9953a90
--- /dev/null
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -0,0 +1,96 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_bridge.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_log.h>
+
+static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
+ loginfo, "%s", prefix);
+ break;
+ }
+}
+
+static struct nf_logger nf_bridge_logger __read_mostly = {
+ .name = "nf_log_bridge",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_bridge_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_bridge_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_bridge_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_bridge_logger);
+}
+
+static struct pernet_operations nf_log_bridge_net_ops = {
+ .init = nf_log_bridge_net_init,
+ .exit = nf_log_bridge_net_exit,
+};
+
+static int __init nf_log_bridge_init(void)
+{
+ int ret;
+
+ /* Request to load the real packet loggers. */
+ nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
+ nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
+
+ ret = register_pernet_subsys(&nf_log_bridge_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger);
+ return 0;
+}
+
+static void __exit nf_log_bridge_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_bridge_net_ops);
+ nf_log_unregister(&nf_bridge_logger);
+}
+
+module_init(nf_log_bridge_init);
+module_exit(nf_log_bridge_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter bridge packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0);
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
new file mode 100644
index 000000000..a343e6244
--- /dev/null
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_bridge.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+int nft_bridge_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate);
+
+int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate);
+
+static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+ const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (nft_bridge_iphdr_validate(skb))
+ nft_set_pktinfo_ipv4(pkt, ops, skb, state);
+ else
+ nft_set_pktinfo(pkt, ops, skb, state);
+}
+
+static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+ const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nft_bridge_ip6hdr_validate(skb) &&
+ nft_set_pktinfo_ipv6(pkt, ops, skb, state) == 0)
+ return;
+#endif
+ nft_set_pktinfo(pkt, ops, skb, state);
+}
+
+static unsigned int
+nft_do_chain_bridge(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ nft_bridge_set_pktinfo_ipv4(&pkt, ops, skb, state);
+ break;
+ case htons(ETH_P_IPV6):
+ nft_bridge_set_pktinfo_ipv6(&pkt, ops, skb, state);
+ break;
+ default:
+ nft_set_pktinfo(&pkt, ops, skb, state);
+ break;
+ }
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static struct nft_af_info nft_af_bridge __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .nhooks = NF_BR_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_BR_PRE_ROUTING] = nft_do_chain_bridge,
+ [NF_BR_LOCAL_IN] = nft_do_chain_bridge,
+ [NF_BR_FORWARD] = nft_do_chain_bridge,
+ [NF_BR_LOCAL_OUT] = nft_do_chain_bridge,
+ [NF_BR_POST_ROUTING] = nft_do_chain_bridge,
+ },
+};
+
+static int nf_tables_bridge_init_net(struct net *net)
+{
+ net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.bridge == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge));
+
+ if (nft_register_afinfo(net, net->nft.bridge) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.bridge);
+ return -ENOMEM;
+}
+
+static void nf_tables_bridge_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.bridge);
+ kfree(net->nft.bridge);
+}
+
+static struct pernet_operations nf_tables_bridge_net_ops = {
+ .init = nf_tables_bridge_init_net,
+ .exit = nf_tables_bridge_exit_net,
+};
+
+static const struct nf_chain_type filter_bridge = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_BRIDGE,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_IN) |
+ (1 << NF_BR_FORWARD) |
+ (1 << NF_BR_LOCAL_OUT) |
+ (1 << NF_BR_POST_ROUTING),
+};
+
+static int __init nf_tables_bridge_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_bridge);
+ ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_bridge);
+
+ return ret;
+}
+
+static void __exit nf_tables_bridge_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_bridge_net_ops);
+ nft_unregister_chain_type(&filter_bridge);
+}
+
+module_init(nf_tables_bridge_init);
+module_exit(nf_tables_bridge_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_BRIDGE);
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
new file mode 100644
index 000000000..a21269b83
--- /dev/null
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2014 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+
+#include "../br_private.h"
+
+static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct net_device *in = pkt->in, *out = pkt->out;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct net_bridge_port *p;
+
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
+ goto err;
+ break;
+ case NFT_META_BRI_OIFNAME:
+ if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+ goto err;
+ break;
+ default:
+ goto out;
+ }
+
+ strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+ return;
+out:
+ return nft_meta_get_eval(expr, regs, pkt);
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ len = IFNAMSIZ;
+ break;
+ default:
+ return nft_meta_get_init(ctx, expr, tb);
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+static struct nft_expr_type nft_meta_bridge_type;
+static const struct nft_expr_ops nft_meta_bridge_get_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_get_eval,
+ .init = nft_meta_bridge_get_init,
+ .dump = nft_meta_get_dump,
+};
+
+static const struct nft_expr_ops nft_meta_bridge_set_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_set_eval,
+ .init = nft_meta_set_init,
+ .dump = nft_meta_set_dump,
+};
+
+static const struct nft_expr_ops *
+nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_bridge_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_bridge_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "meta",
+ .select_ops = &nft_meta_bridge_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_meta_bridge_type);
+}
+
+static void __exit nft_meta_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_bridge_type);
+}
+
+module_init(nft_meta_bridge_module_init);
+module_exit(nft_meta_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
new file mode 100644
index 000000000..858d84856
--- /dev/null
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2014 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/nf_tables_bridge.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include "../br_private.h"
+
+static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
+ struct sk_buff *nskb)
+{
+ struct ethhdr *eth;
+
+ eth = (struct ethhdr *)skb_push(nskb, ETH_HLEN);
+ skb_reset_mac_header(nskb);
+ ether_addr_copy(eth->h_source, eth_hdr(oldskb)->h_dest);
+ ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source);
+ eth->h_proto = eth_hdr(oldskb)->h_proto;
+ skb_pull(nskb, ETH_HLEN);
+}
+
+/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
+ * or the bridge port (NF_BRIDGE PREROUTING).
+ */
+static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+
+ if (!nft_bridge_iphdr_validate(oldskb))
+ return;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ sysctl_ip_default_ttl);
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ niph->ttl = sysctl_ip_default_ttl;
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_deliver(br_port_get_rcu(dev), nskb);
+}
+
+static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct icmphdr *icmph;
+ unsigned int len;
+ void *payload;
+ __wsum csum;
+ u8 proto;
+
+ if (oldskb->csum_bad || !nft_bridge_iphdr_validate(oldskb))
+ return;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+ len = min_t(unsigned int, 536, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return;
+
+ if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
+ return;
+
+ if (ip_hdr(oldskb)->protocol == IPPROTO_TCP ||
+ ip_hdr(oldskb)->protocol == IPPROTO_UDP)
+ proto = ip_hdr(oldskb)->protocol;
+ else
+ proto = 0;
+
+ if (!skb_csum_unnecessary(oldskb) &&
+ nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
+ return;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+ sysctl_ip_default_ttl);
+
+ skb_reset_transport_header(nskb);
+ icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
+ memset(icmph, 0, sizeof(*icmph));
+ icmph->type = ICMP_DEST_UNREACH;
+ icmph->code = code;
+
+ payload = skb_put(nskb, len);
+ memcpy(payload, skb_network_header(oldskb), len);
+
+ csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+ icmph->checksum = csum_fold(csum);
+
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_deliver(br_port_get_rcu(dev), nskb);
+}
+
+static void nft_reject_br_send_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+ unsigned int otcplen;
+ struct ipv6hdr *nip6h;
+
+ if (!nft_bridge_ip6hdr_validate(oldskb))
+ return;
+
+ oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+ if (!oth)
+ return;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ net->ipv6.devconf_all->hop_limit);
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_deliver(br_port_get_rcu(dev), nskb);
+}
+
+static bool reject6_br_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto = ip6h->nexthdr;
+
+ if (skb->csum_bad)
+ return false;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ if (ip6h->payload_len &&
+ pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
+ return false;
+
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+static void nft_reject_br_send_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *nip6h;
+ struct icmp6hdr *icmp6h;
+ unsigned int len;
+ void *payload;
+
+ if (!nft_bridge_ip6hdr_validate(oldskb))
+ return;
+
+ /* Include "As much of invoking packet as possible without the ICMPv6
+ * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+ */
+ len = min_t(unsigned int, 1220, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return;
+
+ if (!reject6_br_csum_ok(oldskb, hook))
+ return;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+ net->ipv6.devconf_all->hop_limit);
+
+ skb_reset_transport_header(nskb);
+ icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
+ memset(icmp6h, 0, sizeof(*icmp6h));
+ icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+ icmp6h->icmp6_code = code;
+
+ payload = skb_put(nskb, len);
+ memcpy(payload, skb_network_header(oldskb), len);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ icmp6h->icmp6_cksum =
+ csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+ nskb->len - sizeof(struct ipv6hdr),
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ nskb->len - sizeof(struct ipv6hdr),
+ 0));
+
+ nft_reject_br_push_etherhdr(oldskb, nskb);
+
+ br_deliver(br_port_get_rcu(dev), nskb);
+}
+
+static void nft_reject_bridge_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
+ const unsigned char *dest = eth_hdr(pkt->skb)->h_dest;
+
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth_hdr(pkt->skb)->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
+ pkt->ops->hooknum,
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
+ pkt->ops->hooknum,
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in,
+ pkt->ops->hooknum,
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_br_send_v6_tcp_reset(net, pkt->skb, pkt->in,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_br_send_v6_unreach(net, pkt->skb, pkt->in,
+ pkt->ops->hooknum,
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) |
+ (1 << NF_BR_LOCAL_IN));
+}
+
+static int nft_reject_bridge_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code, err;
+
+ err = nft_reject_bridge_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_REJECT_TYPE] == NULL)
+ return -EINVAL;
+
+ priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
+ if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+ return -EINVAL;
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
+ case NFT_REJECT_TCP_RST:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_reject_bridge_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
+ goto nla_put_failure;
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
+ if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_reject_bridge_type;
+static const struct nft_expr_ops nft_reject_bridge_ops = {
+ .type = &nft_reject_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_bridge_eval,
+ .init = nft_reject_bridge_init,
+ .dump = nft_reject_bridge_dump,
+ .validate = nft_reject_bridge_validate,
+};
+
+static struct nft_expr_type nft_reject_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "reject",
+ .ops = &nft_reject_bridge_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_reject_bridge_type);
+}
+
+static void __exit nft_reject_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_bridge_type);
+}
+
+module_init(nft_reject_bridge_module_init);
+module_exit(nft_reject_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "reject");
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
new file mode 100644
index 000000000..d3694953b
--- /dev/null
+++ b/net/caif/Kconfig
@@ -0,0 +1,53 @@
+#
+# CAIF net configurations
+#
+
+menuconfig CAIF
+ tristate "CAIF support"
+ select CRC_CCITT
+ default n
+ ---help---
+ The "Communication CPU to Application CPU Interface" (CAIF) is a packet
+ based connection-oriented MUX protocol developed by ST-Ericsson for use
+ with its modems. It is accessed from user space as sockets (PF_CAIF).
+
+ Say Y (or M) here if you build for a phone product (e.g. Android or
+ MeeGo ) that uses CAIF as transport, if unsure say N.
+
+ If you select to build it as module then CAIF_NETDEV also needs to be
+ built as modules. You will also need to say yes to any CAIF physical
+ devices that your platform requires.
+
+ See Documentation/networking/caif for a further explanation on how to
+ use and configure CAIF.
+
+config CAIF_DEBUG
+ bool "Enable Debug"
+ depends on CAIF
+ default n
+ ---help---
+ Enable the inclusion of debug code in the CAIF stack.
+ Be aware that doing this will impact performance.
+ If unsure say N.
+
+config CAIF_NETDEV
+ tristate "CAIF GPRS Network device"
+ depends on CAIF
+ default CAIF
+ ---help---
+ Say Y if you will be using a CAIF based GPRS network device.
+ This can be either built-in or a loadable module,
+ If you select to build it as a built-in then the main CAIF device must
+ also be a built-in.
+ If unsure say Y.
+
+config CAIF_USB
+ tristate "CAIF USB support"
+ depends on CAIF
+ default n
+ ---help---
+ Say Y if you are using CAIF over USB CDC NCM.
+ This can be either built-in or a loadable module,
+ If you select to build it as a built-in then the main CAIF device must
+ also be a built-in.
+ If unsure say N.
diff --git a/net/caif/Makefile b/net/caif/Makefile
new file mode 100644
index 000000000..cc2b51154
--- /dev/null
+++ b/net/caif/Makefile
@@ -0,0 +1,15 @@
+ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG
+
+caif-y := caif_dev.o \
+ cfcnfg.o cfmuxl.o cfctrl.o \
+ cffrml.o cfveil.o cfdbgl.o\
+ cfserl.o cfdgml.o \
+ cfrfml.o cfvidl.o cfutill.o \
+ cfsrvl.o cfpkt_skbuff.o
+
+obj-$(CONFIG_CAIF) += caif.o
+obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o
+obj-$(CONFIG_CAIF) += caif_socket.o
+obj-$(CONFIG_CAIF_USB) += caif_usb.o
+
+export-y := caif.o
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
new file mode 100644
index 000000000..edbca468f
--- /dev/null
+++ b/net/caif/caif_dev.c
@@ -0,0 +1,574 @@
+/*
+ * CAIF Interface registration.
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ *
+ * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont
+ * and Sakari Ailus <sakari.ailus@nokia.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/netns/generic.h>
+#include <net/net_namespace.h>
+#include <net/pkt_sched.h>
+#include <net/caif/caif_device.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+#include <net/caif/cfserl.h>
+
+MODULE_LICENSE("GPL");
+
+/* Used for local tracking of the CAIF net devices */
+struct caif_device_entry {
+ struct cflayer layer;
+ struct list_head list;
+ struct net_device *netdev;
+ int __percpu *pcpu_refcnt;
+ spinlock_t flow_lock;
+ struct sk_buff *xoff_skb;
+ void (*xoff_skb_dtor)(struct sk_buff *skb);
+ bool xoff;
+};
+
+struct caif_device_entry_list {
+ struct list_head list;
+ /* Protects simulanous deletes in list */
+ struct mutex lock;
+};
+
+struct caif_net {
+ struct cfcnfg *cfg;
+ struct caif_device_entry_list caifdevs;
+};
+
+static int caif_net_id;
+static int q_high = 50; /* Percent */
+
+struct cfcnfg *get_cfcnfg(struct net *net)
+{
+ struct caif_net *caifn;
+ caifn = net_generic(net, caif_net_id);
+ return caifn->cfg;
+}
+EXPORT_SYMBOL(get_cfcnfg);
+
+static struct caif_device_entry_list *caif_device_list(struct net *net)
+{
+ struct caif_net *caifn;
+ caifn = net_generic(net, caif_net_id);
+ return &caifn->caifdevs;
+}
+
+static void caifd_put(struct caif_device_entry *e)
+{
+ this_cpu_dec(*e->pcpu_refcnt);
+}
+
+static void caifd_hold(struct caif_device_entry *e)
+{
+ this_cpu_inc(*e->pcpu_refcnt);
+}
+
+static int caifd_refcnt_read(struct caif_device_entry *e)
+{
+ int i, refcnt = 0;
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(e->pcpu_refcnt, i);
+ return refcnt;
+}
+
+/* Allocate new CAIF device. */
+static struct caif_device_entry *caif_device_alloc(struct net_device *dev)
+{
+ struct caif_device_entry *caifd;
+
+ caifd = kzalloc(sizeof(*caifd), GFP_KERNEL);
+ if (!caifd)
+ return NULL;
+ caifd->pcpu_refcnt = alloc_percpu(int);
+ if (!caifd->pcpu_refcnt) {
+ kfree(caifd);
+ return NULL;
+ }
+ caifd->netdev = dev;
+ dev_hold(dev);
+ return caifd;
+}
+
+static struct caif_device_entry *caif_get(struct net_device *dev)
+{
+ struct caif_device_entry_list *caifdevs =
+ caif_device_list(dev_net(dev));
+ struct caif_device_entry *caifd;
+
+ list_for_each_entry_rcu(caifd, &caifdevs->list, list) {
+ if (caifd->netdev == dev)
+ return caifd;
+ }
+ return NULL;
+}
+
+static void caif_flow_cb(struct sk_buff *skb)
+{
+ struct caif_device_entry *caifd;
+ void (*dtor)(struct sk_buff *skb) = NULL;
+ bool send_xoff;
+
+ WARN_ON(skb->dev == NULL);
+
+ rcu_read_lock();
+ caifd = caif_get(skb->dev);
+
+ WARN_ON(caifd == NULL);
+ if (caifd == NULL)
+ return;
+
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ spin_lock_bh(&caifd->flow_lock);
+ send_xoff = caifd->xoff;
+ caifd->xoff = 0;
+ dtor = caifd->xoff_skb_dtor;
+
+ if (WARN_ON(caifd->xoff_skb != skb))
+ skb = NULL;
+
+ caifd->xoff_skb = NULL;
+ caifd->xoff_skb_dtor = NULL;
+
+ spin_unlock_bh(&caifd->flow_lock);
+
+ if (dtor && skb)
+ dtor(skb);
+
+ if (send_xoff)
+ caifd->layer.up->
+ ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND,
+ caifd->layer.id);
+ caifd_put(caifd);
+}
+
+static int transmit(struct cflayer *layer, struct cfpkt *pkt)
+{
+ int err, high = 0, qlen = 0;
+ struct caif_device_entry *caifd =
+ container_of(layer, struct caif_device_entry, layer);
+ struct sk_buff *skb;
+ struct netdev_queue *txq;
+
+ rcu_read_lock_bh();
+
+ skb = cfpkt_tonative(pkt);
+ skb->dev = caifd->netdev;
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_CAIF);
+
+ /* Check if we need to handle xoff */
+ if (likely(caifd->netdev->tx_queue_len == 0))
+ goto noxoff;
+
+ if (unlikely(caifd->xoff))
+ goto noxoff;
+
+ if (likely(!netif_queue_stopped(caifd->netdev))) {
+ /* If we run with a TX queue, check if the queue is too long*/
+ txq = netdev_get_tx_queue(skb->dev, 0);
+ qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc));
+
+ if (likely(qlen == 0))
+ goto noxoff;
+
+ high = (caifd->netdev->tx_queue_len * q_high) / 100;
+ if (likely(qlen < high))
+ goto noxoff;
+ }
+
+ /* Hold lock while accessing xoff */
+ spin_lock_bh(&caifd->flow_lock);
+ if (caifd->xoff) {
+ spin_unlock_bh(&caifd->flow_lock);
+ goto noxoff;
+ }
+
+ /*
+ * Handle flow off, we do this by temporary hi-jacking this
+ * skb's destructor function, and replace it with our own
+ * flow-on callback. The callback will set flow-on and call
+ * the original destructor.
+ */
+
+ pr_debug("queue has stopped(%d) or is full (%d > %d)\n",
+ netif_queue_stopped(caifd->netdev),
+ qlen, high);
+ caifd->xoff = 1;
+ caifd->xoff_skb = skb;
+ caifd->xoff_skb_dtor = skb->destructor;
+ skb->destructor = caif_flow_cb;
+ spin_unlock_bh(&caifd->flow_lock);
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
+ caifd->layer.id);
+noxoff:
+ rcu_read_unlock_bh();
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = -EIO;
+
+ return err;
+}
+
+/*
+ * Stuff received packets into the CAIF stack.
+ * On error, returns non-zero and releases the skb.
+ */
+static int receive(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pkttype, struct net_device *orig_dev)
+{
+ struct cfpkt *pkt;
+ struct caif_device_entry *caifd;
+ int err;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
+
+ rcu_read_lock();
+ caifd = caif_get(dev);
+
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->receive ||
+ !netif_oper_up(caifd->netdev)) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ /* Hold reference to netdevice while using CAIF stack */
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ err = caifd->layer.up->receive(caifd->layer.up, pkt);
+
+ /* For -EILSEQ the packet is not freed so so it now */
+ if (err == -EILSEQ)
+ cfpkt_destroy(pkt);
+
+ /* Release reference to stack upwards */
+ caifd_put(caifd);
+
+ if (err != 0)
+ err = NET_RX_DROP;
+ return err;
+}
+
+static struct packet_type caif_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CAIF),
+ .func = receive,
+};
+
+static void dev_flowctrl(struct net_device *dev, int on)
+{
+ struct caif_device_entry *caifd;
+
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) {
+ rcu_read_unlock();
+ return;
+ }
+
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ on ?
+ _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND :
+ _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
+ caifd->layer.id);
+ caifd_put(caifd);
+}
+
+void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
+ struct cflayer *link_support, int head_room,
+ struct cflayer **layer,
+ int (**rcv_func)(struct sk_buff *, struct net_device *,
+ struct packet_type *,
+ struct net_device *))
+{
+ struct caif_device_entry *caifd;
+ enum cfcnfg_phy_preference pref;
+ struct cfcnfg *cfg = get_cfcnfg(dev_net(dev));
+ struct caif_device_entry_list *caifdevs;
+
+ caifdevs = caif_device_list(dev_net(dev));
+ caifd = caif_device_alloc(dev);
+ if (!caifd)
+ return;
+ *layer = &caifd->layer;
+ spin_lock_init(&caifd->flow_lock);
+
+ switch (caifdev->link_select) {
+ case CAIF_LINK_HIGH_BANDW:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ case CAIF_LINK_LOW_LATENCY:
+ pref = CFPHYPREF_LOW_LAT;
+ break;
+ default:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ }
+ mutex_lock(&caifdevs->lock);
+ list_add_rcu(&caifd->list, &caifdevs->list);
+
+ strncpy(caifd->layer.name, dev->name,
+ sizeof(caifd->layer.name) - 1);
+ caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0;
+ caifd->layer.transmit = transmit;
+ cfcnfg_add_phy_layer(cfg,
+ dev,
+ &caifd->layer,
+ pref,
+ link_support,
+ caifdev->use_fcs,
+ head_room);
+ mutex_unlock(&caifdevs->lock);
+ if (rcv_func)
+ *rcv_func = receive;
+}
+EXPORT_SYMBOL(caif_enroll_dev);
+
+/* notify Caif of device events */
+static int caif_device_notify(struct notifier_block *me, unsigned long what,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct caif_device_entry *caifd = NULL;
+ struct caif_dev_common *caifdev;
+ struct cfcnfg *cfg;
+ struct cflayer *layer, *link_support;
+ int head_room = 0;
+ struct caif_device_entry_list *caifdevs;
+
+ cfg = get_cfcnfg(dev_net(dev));
+ caifdevs = caif_device_list(dev_net(dev));
+
+ caifd = caif_get(dev);
+ if (caifd == NULL && dev->type != ARPHRD_CAIF)
+ return 0;
+
+ switch (what) {
+ case NETDEV_REGISTER:
+ if (caifd != NULL)
+ break;
+
+ caifdev = netdev_priv(dev);
+
+ link_support = NULL;
+ if (caifdev->use_frag) {
+ head_room = 1;
+ link_support = cfserl_create(dev->ifindex,
+ caifdev->use_stx);
+ if (!link_support) {
+ pr_warn("Out of memory\n");
+ break;
+ }
+ }
+ caif_enroll_dev(dev, caifdev, link_support, head_room,
+ &layer, NULL);
+ caifdev->flowctrl = dev_flowctrl;
+ break;
+
+ case NETDEV_UP:
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (caifd == NULL) {
+ rcu_read_unlock();
+ break;
+ }
+
+ caifd->xoff = 0;
+ cfcnfg_set_phy_state(cfg, &caifd->layer, true);
+ rcu_read_unlock();
+
+ break;
+
+ case NETDEV_DOWN:
+ rcu_read_lock();
+
+ caifd = caif_get(dev);
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+ caifd_hold(caifd);
+ rcu_read_unlock();
+
+ caifd->layer.up->ctrlcmd(caifd->layer.up,
+ _CAIF_CTRLCMD_PHYIF_DOWN_IND,
+ caifd->layer.id);
+
+ spin_lock_bh(&caifd->flow_lock);
+
+ /*
+ * Replace our xoff-destructor with original destructor.
+ * We trust that skb->destructor *always* is called before
+ * the skb reference is invalid. The hijacked SKB destructor
+ * takes the flow_lock so manipulating the skb->destructor here
+ * should be safe.
+ */
+ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL)
+ caifd->xoff_skb->destructor = caifd->xoff_skb_dtor;
+
+ caifd->xoff = 0;
+ caifd->xoff_skb_dtor = NULL;
+ caifd->xoff_skb = NULL;
+
+ spin_unlock_bh(&caifd->flow_lock);
+ caifd_put(caifd);
+ break;
+
+ case NETDEV_UNREGISTER:
+ mutex_lock(&caifdevs->lock);
+
+ caifd = caif_get(dev);
+ if (caifd == NULL) {
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+ list_del_rcu(&caifd->list);
+
+ /*
+ * NETDEV_UNREGISTER is called repeatedly until all reference
+ * counts for the net-device are released. If references to
+ * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for
+ * the next call to NETDEV_UNREGISTER.
+ *
+ * If any packets are in flight down the CAIF Stack,
+ * cfcnfg_del_phy_layer will return nonzero.
+ * If no packets are in flight, the CAIF Stack associated
+ * with the net-device un-registering is freed.
+ */
+
+ if (caifd_refcnt_read(caifd) != 0 ||
+ cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) {
+
+ pr_info("Wait for device inuse\n");
+ /* Enrole device if CAIF Stack is still in use */
+ list_add_rcu(&caifd->list, &caifdevs->list);
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+
+ synchronize_rcu();
+ dev_put(caifd->netdev);
+ free_percpu(caifd->pcpu_refcnt);
+ kfree(caifd);
+
+ mutex_unlock(&caifdevs->lock);
+ break;
+ }
+ return 0;
+}
+
+static struct notifier_block caif_device_notifier = {
+ .notifier_call = caif_device_notify,
+ .priority = 0,
+};
+
+/* Per-namespace Caif devices handling */
+static int caif_init_net(struct net *net)
+{
+ struct caif_net *caifn = net_generic(net, caif_net_id);
+ INIT_LIST_HEAD(&caifn->caifdevs.list);
+ mutex_init(&caifn->caifdevs.lock);
+
+ caifn->cfg = cfcnfg_create();
+ if (!caifn->cfg)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void caif_exit_net(struct net *net)
+{
+ struct caif_device_entry *caifd, *tmp;
+ struct caif_device_entry_list *caifdevs =
+ caif_device_list(net);
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ rtnl_lock();
+ mutex_lock(&caifdevs->lock);
+
+ list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) {
+ int i = 0;
+ list_del_rcu(&caifd->list);
+ cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+
+ while (i < 10 &&
+ (caifd_refcnt_read(caifd) != 0 ||
+ cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) {
+
+ pr_info("Wait for device inuse\n");
+ msleep(250);
+ i++;
+ }
+ synchronize_rcu();
+ dev_put(caifd->netdev);
+ free_percpu(caifd->pcpu_refcnt);
+ kfree(caifd);
+ }
+ cfcnfg_remove(cfg);
+
+ mutex_unlock(&caifdevs->lock);
+ rtnl_unlock();
+}
+
+static struct pernet_operations caif_net_ops = {
+ .init = caif_init_net,
+ .exit = caif_exit_net,
+ .id = &caif_net_id,
+ .size = sizeof(struct caif_net),
+};
+
+/* Initialize Caif devices list */
+static int __init caif_device_init(void)
+{
+ int result;
+
+ result = register_pernet_subsys(&caif_net_ops);
+
+ if (result)
+ return result;
+
+ register_netdevice_notifier(&caif_device_notifier);
+ dev_add_pack(&caif_packet_type);
+
+ return result;
+}
+
+static void __exit caif_device_exit(void)
+{
+ unregister_netdevice_notifier(&caif_device_notifier);
+ dev_remove_pack(&caif_packet_type);
+ unregister_pernet_subsys(&caif_net_ops);
+}
+
+module_init(caif_device_init);
+module_exit(caif_device_exit);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
new file mode 100644
index 000000000..112ad7848
--- /dev/null
+++ b/net/caif/caif_socket.c
@@ -0,0 +1,1124 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/caif/caif_socket.h>
+#include <linux/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/cfpkt.h>
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(AF_CAIF);
+
+/*
+ * CAIF state is re-using the TCP socket states.
+ * caif_states stored in sk_state reflect the state as reported by
+ * the CAIF stack, while sk_socket->state is the state of the socket.
+ */
+enum caif_states {
+ CAIF_CONNECTED = TCP_ESTABLISHED,
+ CAIF_CONNECTING = TCP_SYN_SENT,
+ CAIF_DISCONNECTED = TCP_CLOSE
+};
+
+#define TX_FLOW_ON_BIT 1
+#define RX_FLOW_ON_BIT 2
+
+struct caifsock {
+ struct sock sk; /* must be first member */
+ struct cflayer layer;
+ u32 flow_state;
+ struct caif_connect_request conn_req;
+ struct mutex readlock;
+ struct dentry *debugfs_socket_dir;
+ int headroom, tailroom, maxframe;
+};
+
+static int rx_flow_is_on(struct caifsock *cf_sk)
+{
+ return test_bit(RX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static int tx_flow_is_on(struct caifsock *cf_sk)
+{
+ return test_bit(TX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static void set_rx_flow_off(struct caifsock *cf_sk)
+{
+ clear_bit(RX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static void set_rx_flow_on(struct caifsock *cf_sk)
+{
+ set_bit(RX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static void set_tx_flow_off(struct caifsock *cf_sk)
+{
+ clear_bit(TX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static void set_tx_flow_on(struct caifsock *cf_sk)
+{
+ set_bit(TX_FLOW_ON_BIT,
+ (void *) &cf_sk->flow_state);
+}
+
+static void caif_read_lock(struct sock *sk)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ mutex_lock(&cf_sk->readlock);
+}
+
+static void caif_read_unlock(struct sock *sk)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ mutex_unlock(&cf_sk->readlock);
+}
+
+static int sk_rcvbuf_lowwater(struct caifsock *cf_sk)
+{
+ /* A quarter of full buffer is used a low water mark */
+ return cf_sk->sk.sk_rcvbuf / 4;
+}
+
+static void caif_flow_ctrl(struct sock *sk, int mode)
+{
+ struct caifsock *cf_sk;
+ cf_sk = container_of(sk, struct caifsock, sk);
+ if (cf_sk->layer.dn && cf_sk->layer.dn->modemcmd)
+ cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode);
+}
+
+/*
+ * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
+ * not dropped, but CAIF is sending flow off instead.
+ */
+static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
+ net_dbg_ratelimited("sending flow OFF (queue len = %d %d)\n",
+ atomic_read(&cf_sk->sk.sk_rmem_alloc),
+ sk_rcvbuf_lowwater(cf_sk));
+ set_rx_flow_off(cf_sk);
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+ }
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+ if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
+ set_rx_flow_off(cf_sk);
+ net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+ }
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+ /* Cache the SKB length before we tack it onto the receive
+ * queue. Once it is added it no longer belongs to us and
+ * may be freed by other threads of control pulling packets
+ * from the queue.
+ */
+ spin_lock_irqsave(&list->lock, flags);
+ if (!sock_flag(sk, SOCK_DEAD))
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ else
+ kfree_skb(skb);
+ return 0;
+}
+
+/* Packet Receive Callback function called from CAIF Stack */
+static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct caifsock *cf_sk;
+ struct sk_buff *skb;
+
+ cf_sk = container_of(layr, struct caifsock, layer);
+ skb = cfpkt_tonative(pkt);
+
+ if (unlikely(cf_sk->sk.sk_state != CAIF_CONNECTED)) {
+ kfree_skb(skb);
+ return 0;
+ }
+ caif_queue_rcv_skb(&cf_sk->sk, skb);
+ return 0;
+}
+
+static void cfsk_hold(struct cflayer *layr)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ sock_hold(&cf_sk->sk);
+}
+
+static void cfsk_put(struct cflayer *layr)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ sock_put(&cf_sk->sk);
+}
+
+/* Packet Control Callback function called from CAIF */
+static void caif_ctrl_cb(struct cflayer *layr,
+ enum caif_ctrlcmd flow,
+ int phyid)
+{
+ struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+ switch (flow) {
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ /* OK from modem to start sending again */
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ /* Modem asks us to shut up */
+ set_tx_flow_off(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_INIT_RSP:
+ /* We're now connected */
+ caif_client_register_refcnt(&cf_sk->layer,
+ cfsk_hold, cfsk_put);
+ cf_sk->sk.sk_state = CAIF_CONNECTED;
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_shutdown = 0;
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ /* We're now disconnected */
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ /* Connect request failed */
+ cf_sk->sk.sk_err = ECONNREFUSED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+ /*
+ * Socket "standards" seems to require POLLOUT to
+ * be set at connect failure.
+ */
+ set_tx_flow_on(cf_sk);
+ cf_sk->sk.sk_state_change(&cf_sk->sk);
+ break;
+
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ /* Modem has closed this connection, or device is down. */
+ cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+ cf_sk->sk.sk_err = ECONNRESET;
+ set_rx_flow_on(cf_sk);
+ cf_sk->sk.sk_error_report(&cf_sk->sk);
+ break;
+
+ default:
+ pr_debug("Unexpected flow command %d\n", flow);
+ }
+}
+
+static void caif_check_flow_release(struct sock *sk)
+{
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ if (rx_flow_is_on(cf_sk))
+ return;
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk_rcvbuf_lowwater(cf_sk)) {
+ set_rx_flow_on(cf_sk);
+ caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ);
+ }
+}
+
+/*
+ * Copied from unix_dgram_recvmsg, but removed credit checks,
+ * changed locking, address handling and added MSG_TRUNC.
+ */
+static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m,
+ size_t len, int flags)
+
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int ret;
+ int copylen;
+
+ ret = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto read_error;
+
+ skb = skb_recv_datagram(sk, flags, 0 , &ret);
+ if (!skb)
+ goto read_error;
+ copylen = skb->len;
+ if (len < copylen) {
+ m->msg_flags |= MSG_TRUNC;
+ copylen = len;
+ }
+
+ ret = skb_copy_datagram_msg(skb, 0, m, copylen);
+ if (ret)
+ goto out_free;
+
+ ret = (flags & MSG_TRUNC) ? skb->len : copylen;
+out_free:
+ skb_free_datagram(sk, skb);
+ caif_check_flow_release(sk);
+ return ret;
+
+read_error:
+ return ret;
+}
+
+
+/* Copied from unix_stream_wait_data, identical except for lock call. */
+static long caif_stream_data_wait(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+ lock_sock(sk);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ sk->sk_err ||
+ sk->sk_state != CAIF_CONNECTED ||
+ sock_flag(sk, SOCK_DEAD) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
+ break;
+
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_DEAD))
+ break;
+
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ }
+
+ finish_wait(sk_sleep(sk), &wait);
+ release_sock(sk);
+ return timeo;
+}
+
+
+/*
+ * Copied from unix_stream_recvmsg, but removed credit checks,
+ * changed locking calls, changed address handling.
+ */
+static int caif_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int copied = 0;
+ int target;
+ int err = 0;
+ long timeo;
+
+ err = -EOPNOTSUPP;
+ if (flags&MSG_OOB)
+ goto out;
+
+ /*
+ * Lock the socket to prevent queue disordering
+ * while sleeps in memcpy_tomsg
+ */
+ err = -EAGAIN;
+ if (sk->sk_state == CAIF_CONNECTING)
+ goto out;
+
+ caif_read_lock(sk);
+ target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
+
+ do {
+ int chunk;
+ struct sk_buff *skb;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ err = -ECONNRESET;
+ goto unlock;
+ }
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ caif_check_flow_release(sk);
+
+ if (skb == NULL) {
+ if (copied >= target)
+ goto unlock;
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+ err = sock_error(sk);
+ if (err)
+ goto unlock;
+ err = -ECONNRESET;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto unlock;
+
+ err = -EPIPE;
+ if (sk->sk_state != CAIF_CONNECTED)
+ goto unlock;
+ if (sock_flag(sk, SOCK_DEAD))
+ goto unlock;
+
+ release_sock(sk);
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+
+ caif_read_unlock(sk);
+
+ timeo = caif_stream_data_wait(sk, timeo);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+ caif_read_lock(sk);
+ continue;
+unlock:
+ release_sock(sk);
+ break;
+ }
+ release_sock(sk);
+ chunk = min_t(unsigned int, skb->len, size);
+ if (memcpy_to_msg(msg, skb->data, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (copied == 0)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+ skb_pull(skb, chunk);
+
+ /* put the skb back if we didn't use it up. */
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+
+ } else {
+ /*
+ * It is questionable, see note in unix_dgram_recvmsg.
+ */
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+ caif_read_unlock(sk);
+
+out:
+ return copied ? : err;
+}
+
+/*
+ * Copied from sock.c:sock_wait_for_wmem, but change to wait for
+ * CAIF flow-on and sock_writable.
+ */
+static long caif_wait_for_flow_on(struct caifsock *cf_sk,
+ int wait_writeable, long timeo, int *err)
+{
+ struct sock *sk = &cf_sk->sk;
+ DEFINE_WAIT(wait);
+ for (;;) {
+ *err = 0;
+ if (tx_flow_is_on(cf_sk) &&
+ (!wait_writeable || sock_writeable(&cf_sk->sk)))
+ break;
+ *err = -ETIMEDOUT;
+ if (!timeo)
+ break;
+ *err = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ *err = -ECONNRESET;
+ if (sk->sk_shutdown & SHUTDOWN_MASK)
+ break;
+ *err = -sk->sk_err;
+ if (sk->sk_err)
+ break;
+ *err = -EPIPE;
+ if (cf_sk->sk.sk_state != CAIF_CONNECTED)
+ break;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+/*
+ * Transmit a SKB. The device may temporarily request re-transmission
+ * by returning EAGAIN.
+ */
+static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
+ int noblock, long timeo)
+{
+ struct cfpkt *pkt;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb);
+ memset(skb->cb, 0, sizeof(struct caif_payload_info));
+ cfpkt_set_prio(pkt, cf_sk->sk.sk_priority);
+
+ if (cf_sk->layer.dn == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt);
+}
+
+/* Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF */
+static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int buffer_size;
+ int ret = 0;
+ struct sk_buff *skb = NULL;
+ int noblock;
+ long timeo;
+ caif_assert(cf_sk);
+ ret = sock_error(sk);
+ if (ret)
+ goto err;
+
+ ret = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto err;
+
+ ret = -EOPNOTSUPP;
+ if (msg->msg_namelen)
+ goto err;
+
+ ret = -EINVAL;
+ if (unlikely(msg->msg_iter.iov->iov_base == NULL))
+ goto err;
+ noblock = msg->msg_flags & MSG_DONTWAIT;
+
+ timeo = sock_sndtimeo(sk, noblock);
+ timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk),
+ 1, timeo, &ret);
+
+ if (ret)
+ goto err;
+ ret = -EPIPE;
+ if (cf_sk->sk.sk_state != CAIF_CONNECTED ||
+ sock_flag(sk, SOCK_DEAD) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ goto err;
+
+ /* Error if trying to write more than maximum frame size. */
+ ret = -EMSGSIZE;
+ if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM)
+ goto err;
+
+ buffer_size = len + cf_sk->headroom + cf_sk->tailroom;
+
+ ret = -ENOMEM;
+ skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret);
+
+ if (!skb || skb_tailroom(skb) < buffer_size)
+ goto err;
+
+ skb_reserve(skb, cf_sk->headroom);
+
+ ret = memcpy_from_msg(skb_put(skb, len), msg, len);
+
+ if (ret)
+ goto err;
+ ret = transmit_skb(skb, cf_sk, noblock, timeo);
+ if (ret < 0)
+ /* skb is already freed */
+ return ret;
+
+ return len;
+err:
+ kfree_skb(skb);
+ return ret;
+}
+
+/*
+ * Copied from unix_stream_sendmsg and adapted to CAIF:
+ * Changed removed permission handling and added waiting for flow on
+ * and other minor adaptations.
+ */
+static int caif_stream_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int err, size;
+ struct sk_buff *skb;
+ int sent = 0;
+ long timeo;
+
+ err = -EOPNOTSUPP;
+ if (unlikely(msg->msg_flags&MSG_OOB))
+ goto out_err;
+
+ if (unlikely(msg->msg_namelen))
+ goto out_err;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err);
+
+ if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+ goto pipe_err;
+
+ while (sent < len) {
+
+ size = len-sent;
+
+ if (size > cf_sk->maxframe)
+ size = cf_sk->maxframe;
+
+ /* If size is more than half of sndbuf, chop up message */
+ if (size > ((sk->sk_sndbuf >> 1) - 64))
+ size = (sk->sk_sndbuf >> 1) - 64;
+
+ if (size > SKB_MAX_ALLOC)
+ size = SKB_MAX_ALLOC;
+
+ skb = sock_alloc_send_skb(sk,
+ size + cf_sk->headroom +
+ cf_sk->tailroom,
+ msg->msg_flags&MSG_DONTWAIT,
+ &err);
+ if (skb == NULL)
+ goto out_err;
+
+ skb_reserve(skb, cf_sk->headroom);
+ /*
+ * If you pass two values to the sock_alloc_send_skb
+ * it tries to grab the large buffer with GFP_NOFS
+ * (which can fail easily), and if it fails grab the
+ * fallback size buffer which is under a page and will
+ * succeed. [Alan]
+ */
+ size = min_t(int, size, skb_tailroom(skb));
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+ err = transmit_skb(skb, cf_sk,
+ msg->msg_flags&MSG_DONTWAIT, timeo);
+ if (err < 0)
+ /* skb is already freed */
+ goto pipe_err;
+
+ sent += size;
+ }
+
+ return sent;
+
+pipe_err:
+ if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+out_err:
+ return sent ? : err;
+}
+
+static int setsockopt(struct socket *sock,
+ int lvl, int opt, char __user *ov, unsigned int ol)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ int linksel;
+
+ if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED)
+ return -ENOPROTOOPT;
+
+ switch (opt) {
+ case CAIFSO_LINK_SELECT:
+ if (ol < sizeof(int))
+ return -EINVAL;
+ if (lvl != SOL_CAIF)
+ goto bad_sol;
+ if (copy_from_user(&linksel, ov, sizeof(int)))
+ return -EINVAL;
+ lock_sock(&(cf_sk->sk));
+ cf_sk->conn_req.link_selector = linksel;
+ release_sock(&cf_sk->sk);
+ return 0;
+
+ case CAIFSO_REQ_PARAM:
+ if (lvl != SOL_CAIF)
+ goto bad_sol;
+ if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL)
+ return -ENOPROTOOPT;
+ lock_sock(&(cf_sk->sk));
+ if (ol > sizeof(cf_sk->conn_req.param.data) ||
+ copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) {
+ release_sock(&cf_sk->sk);
+ return -EINVAL;
+ }
+ cf_sk->conn_req.param.size = ol;
+ release_sock(&cf_sk->sk);
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ return 0;
+bad_sol:
+ return -ENOPROTOOPT;
+
+}
+
+/*
+ * caif_connect() - Connect a CAIF Socket
+ * Copied and modified af_irda.c:irda_connect().
+ *
+ * Note : by consulting "errno", the user space caller may learn the cause
+ * of the failure. Most of them are visible in the function, others may come
+ * from subroutines called and are listed here :
+ * o -EAFNOSUPPORT: bad socket family or type.
+ * o -ESOCKTNOSUPPORT: bad socket type or protocol
+ * o -EINVAL: bad socket address, or CAIF link type
+ * o -ECONNREFUSED: remote end refused the connection.
+ * o -EINPROGRESS: connect request sent but timed out (or non-blocking)
+ * o -EISCONN: already connected.
+ * o -ETIMEDOUT: Connection timed out (send timeout)
+ * o -ENODEV: No link layer to send request
+ * o -ECONNRESET: Received Shutdown indication or lost link layer
+ * o -ENOMEM: Out of memory
+ *
+ * State Strategy:
+ * o sk_state: holds the CAIF_* protocol state, it's updated by
+ * caif_ctrl_cb.
+ * o sock->state: holds the SS_* socket state and is updated by connect and
+ * disconnect.
+ */
+static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ long timeo;
+ int err;
+ int ifindex, headroom, tailroom;
+ unsigned int mtu;
+ struct net_device *dev;
+
+ lock_sock(sk);
+
+ err = -EAFNOSUPPORT;
+ if (uaddr->sa_family != AF_CAIF)
+ goto out;
+
+ switch (sock->state) {
+ case SS_UNCONNECTED:
+ /* Normal case, a fresh connect */
+ caif_assert(sk->sk_state == CAIF_DISCONNECTED);
+ break;
+ case SS_CONNECTING:
+ switch (sk->sk_state) {
+ case CAIF_CONNECTED:
+ sock->state = SS_CONNECTED;
+ err = -EISCONN;
+ goto out;
+ case CAIF_DISCONNECTED:
+ /* Reconnect allowed */
+ break;
+ case CAIF_CONNECTING:
+ err = -EALREADY;
+ if (flags & O_NONBLOCK)
+ goto out;
+ goto wait_connect;
+ }
+ break;
+ case SS_CONNECTED:
+ caif_assert(sk->sk_state == CAIF_CONNECTED ||
+ sk->sk_state == CAIF_DISCONNECTED);
+ if (sk->sk_shutdown & SHUTDOWN_MASK) {
+ /* Allow re-connect after SHUTDOWN_IND */
+ caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+ caif_free_client(&cf_sk->layer);
+ break;
+ }
+ /* No reconnect on a seqpacket socket */
+ err = -EISCONN;
+ goto out;
+ case SS_DISCONNECTING:
+ case SS_FREE:
+ caif_assert(1); /*Should never happen */
+ break;
+ }
+ sk->sk_state = CAIF_DISCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ sk_stream_kill_queues(&cf_sk->sk);
+
+ err = -EINVAL;
+ if (addr_len != sizeof(struct sockaddr_caif))
+ goto out;
+
+ memcpy(&cf_sk->conn_req.sockaddr, uaddr,
+ sizeof(struct sockaddr_caif));
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = CAIF_CONNECTING;
+
+ /* Check priority value comming from socket */
+ /* if priority value is out of range it will be ajusted */
+ if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX)
+ cf_sk->conn_req.priority = CAIF_PRIO_MAX;
+ else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN)
+ cf_sk->conn_req.priority = CAIF_PRIO_MIN;
+ else
+ cf_sk->conn_req.priority = cf_sk->sk.sk_priority;
+
+ /*ifindex = id of the interface.*/
+ cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if;
+
+ cf_sk->layer.receive = caif_sktrecv_cb;
+
+ err = caif_connect_client(sock_net(sk), &cf_sk->conn_req,
+ &cf_sk->layer, &ifindex, &headroom, &tailroom);
+
+ if (err < 0) {
+ cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+ goto out;
+ }
+
+ err = -ENODEV;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ goto out;
+ }
+ cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
+ mtu = dev->mtu;
+ rcu_read_unlock();
+
+ cf_sk->tailroom = tailroom;
+ cf_sk->maxframe = mtu - (headroom + tailroom);
+ if (cf_sk->maxframe < 1) {
+ pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = -EINPROGRESS;
+wait_connect:
+
+ if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK))
+ goto out;
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ release_sock(sk);
+ err = -ERESTARTSYS;
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ sk->sk_state != CAIF_CONNECTING,
+ timeo);
+ lock_sock(sk);
+ if (timeo < 0)
+ goto out; /* -ERESTARTSYS */
+
+ err = -ETIMEDOUT;
+ if (timeo == 0 && sk->sk_state != CAIF_CONNECTED)
+ goto out;
+ if (sk->sk_state != CAIF_CONNECTED) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk);
+ if (!err)
+ err = -ECONNREFUSED;
+ goto out;
+ }
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * caif_release() - Disconnect a CAIF Socket
+ * Copied and modified af_irda.c:irda_release().
+ */
+static int caif_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ if (!sk)
+ return 0;
+
+ set_tx_flow_off(cf_sk);
+
+ /*
+ * Ensure that packets are not queued after this point in time.
+ * caif_queue_rcv_skb checks SOCK_DEAD holding the queue lock,
+ * this ensures no packets when sock is dead.
+ */
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ sock_set_flag(sk, SOCK_DEAD);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ sock->sk = NULL;
+
+ WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir));
+ debugfs_remove_recursive(cf_sk->debugfs_socket_dir);
+
+ lock_sock(&(cf_sk->sk));
+ sk->sk_state = CAIF_DISCONNECTED;
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+ cf_sk->sk.sk_socket->state = SS_DISCONNECTING;
+ wake_up_interruptible_poll(sk_sleep(sk), POLLERR|POLLHUP);
+
+ sock_orphan(sk);
+ sk_stream_kill_queues(&cf_sk->sk);
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
+static unsigned int caif_poll(struct file *file,
+ struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask;
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ mask |= POLLIN | POLLRDNORM;
+
+ /*
+ * we set writable also when the other side has shut down the
+ * connection. This prevents stuck sockets.
+ */
+ if (sock_writeable(sk) && tx_flow_is_on(cf_sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ return mask;
+}
+
+static const struct proto_ops caif_seqpacket_ops = {
+ .family = PF_CAIF,
+ .owner = THIS_MODULE,
+ .release = caif_release,
+ .bind = sock_no_bind,
+ .connect = caif_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = caif_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = caif_seqpkt_sendmsg,
+ .recvmsg = caif_seqpkt_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops caif_stream_ops = {
+ .family = PF_CAIF,
+ .owner = THIS_MODULE,
+ .release = caif_release,
+ .bind = sock_no_bind,
+ .connect = caif_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = caif_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = caif_stream_sendmsg,
+ .recvmsg = caif_stream_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+/* This function is called when a socket is finally destroyed. */
+static void caif_sock_destructor(struct sock *sk)
+{
+ struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+ caif_assert(!atomic_read(&sk->sk_wmem_alloc));
+ caif_assert(sk_unhashed(sk));
+ caif_assert(!sk->sk_socket);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_debug("Attempt to release alive CAIF socket: %p\n", sk);
+ return;
+ }
+ sk_stream_kill_queues(&cf_sk->sk);
+ caif_free_client(&cf_sk->layer);
+}
+
+static int caif_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk = NULL;
+ struct caifsock *cf_sk = NULL;
+ static struct proto prot = {.name = "PF_CAIF",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct caifsock),
+ };
+
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /*
+ * The sock->type specifies the socket type to use.
+ * The CAIF socket is a packet stream in the sense
+ * that it is packet based. CAIF trusts the reliability
+ * of the link, no resending is implemented.
+ */
+ if (sock->type == SOCK_SEQPACKET)
+ sock->ops = &caif_seqpacket_ops;
+ else if (sock->type == SOCK_STREAM)
+ sock->ops = &caif_stream_ops;
+ else
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol < 0 || protocol >= CAIFPROTO_MAX)
+ return -EPROTONOSUPPORT;
+ /*
+ * Set the socket state to unconnected. The socket state
+ * is really not used at all in the net/core or socket.c but the
+ * initialization makes sure that sock->state is not uninitialized.
+ */
+ sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot);
+ if (!sk)
+ return -ENOMEM;
+
+ cf_sk = container_of(sk, struct caifsock, sk);
+
+ /* Store the protocol */
+ sk->sk_protocol = (unsigned char) protocol;
+
+ /* Initialize default priority for well-known cases */
+ switch (protocol) {
+ case CAIFPROTO_AT:
+ sk->sk_priority = TC_PRIO_CONTROL;
+ break;
+ case CAIFPROTO_RFM:
+ sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ break;
+ default:
+ sk->sk_priority = TC_PRIO_BESTEFFORT;
+ }
+
+ /*
+ * Lock in order to try to stop someone from opening the socket
+ * too early.
+ */
+ lock_sock(&(cf_sk->sk));
+
+ /* Initialize the nozero default sock structure data. */
+ sock_init_data(sock, sk);
+ sk->sk_destruct = caif_sock_destructor;
+
+ mutex_init(&cf_sk->readlock); /* single task reading lock */
+ cf_sk->layer.ctrlcmd = caif_ctrl_cb;
+ cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+ cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+
+ set_tx_flow_off(cf_sk);
+ set_rx_flow_on(cf_sk);
+
+ /* Set default options on configuration */
+ cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY;
+ cf_sk->conn_req.protocol = protocol;
+ release_sock(&cf_sk->sk);
+ return 0;
+}
+
+
+static struct net_proto_family caif_family_ops = {
+ .family = PF_CAIF,
+ .create = caif_create,
+ .owner = THIS_MODULE,
+};
+
+static int __init caif_sktinit_module(void)
+{
+ int err = sock_register(&caif_family_ops);
+ if (!err)
+ return err;
+ return 0;
+}
+
+static void __exit caif_sktexit_module(void)
+{
+ sock_unregister(PF_CAIF);
+}
+module_init(caif_sktinit_module);
+module_exit(caif_sktexit_module);
diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
new file mode 100644
index 000000000..5cd44f001
--- /dev/null
+++ b/net/caif/caif_usb.c
@@ -0,0 +1,203 @@
+/*
+ * CAIF USB handler
+ * Copyright (C) ST-Ericsson AB 2011
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/mii.h>
+#include <linux/usb.h>
+#include <linux/usb/usbnet.h>
+#include <linux/etherdevice.h>
+#include <net/netns/generic.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+
+MODULE_LICENSE("GPL");
+
+#define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */
+#define CFUSB_ALIGNMENT 4 /* Number of bytes to align. */
+#define CFUSB_MAX_HEADLEN (CFUSB_PAD_DESCR_SZ + CFUSB_ALIGNMENT-1)
+#define STE_USB_VID 0x04cc /* USB Product ID for ST-Ericsson */
+#define STE_USB_PID_CAIF 0x230f /* Product id for CAIF Modems */
+
+struct cfusbl {
+ struct cflayer layer;
+ u8 tx_eth_hdr[ETH_HLEN];
+};
+
+static bool pack_added;
+
+static int cfusbl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 hpad;
+
+ /* Remove padding. */
+ cfpkt_extr_head(pkt, &hpad, 1);
+ cfpkt_extr_head(pkt, NULL, hpad);
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct caif_payload_info *info;
+ u8 hpad;
+ u8 zeros[CFUSB_ALIGNMENT];
+ struct sk_buff *skb;
+ struct cfusbl *usbl = container_of(layr, struct cfusbl, layer);
+
+ skb = cfpkt_tonative(pkt);
+
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+
+ info = cfpkt_info(pkt);
+ hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1);
+
+ if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) {
+ pr_warn("Headroom to small\n");
+ kfree_skb(skb);
+ return -EIO;
+ }
+ memset(zeros, 0, hpad);
+
+ cfpkt_add_head(pkt, zeros, hpad);
+ cfpkt_add_head(pkt, &hpad, 1);
+ cfpkt_add_head(pkt, usbl->tx_eth_hdr, sizeof(usbl->tx_eth_hdr));
+ return layr->dn->transmit(layr->dn, pkt);
+}
+
+static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ if (layr->up && layr->up->ctrlcmd)
+ layr->up->ctrlcmd(layr->up, ctrl, layr->id);
+}
+
+static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN],
+ u8 braddr[ETH_ALEN])
+{
+ struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+
+ caif_assert(offsetof(struct cfusbl, layer) == 0);
+
+ memset(&this->layer, 0, sizeof(this->layer));
+ this->layer.receive = cfusbl_receive;
+ this->layer.transmit = cfusbl_transmit;
+ this->layer.ctrlcmd = cfusbl_ctrlcmd;
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "usb%d", phyid);
+ this->layer.id = phyid;
+
+ /*
+ * Construct TX ethernet header:
+ * 0-5 destination address
+ * 5-11 source address
+ * 12-13 protocol type
+ */
+ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], braddr);
+ ether_addr_copy(&this->tx_eth_hdr[ETH_ALEN], ethaddr);
+ this->tx_eth_hdr[12] = cpu_to_be16(ETH_P_802_EX1) & 0xff;
+ this->tx_eth_hdr[13] = (cpu_to_be16(ETH_P_802_EX1) >> 8) & 0xff;
+ pr_debug("caif ethernet TX-header dst:%pM src:%pM type:%02x%02x\n",
+ this->tx_eth_hdr, this->tx_eth_hdr + ETH_ALEN,
+ this->tx_eth_hdr[12], this->tx_eth_hdr[13]);
+
+ return (struct cflayer *) this;
+}
+
+static struct packet_type caif_usb_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_802_EX1),
+};
+
+static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct caif_dev_common common;
+ struct cflayer *layer, *link_support;
+ struct usbnet *usbnet;
+ struct usb_device *usbdev;
+
+ /* Check whether we have a NCM device, and find its VID/PID. */
+ if (!(dev->dev.parent && dev->dev.parent->driver &&
+ strcmp(dev->dev.parent->driver->name, "cdc_ncm") == 0))
+ return 0;
+
+ usbnet = netdev_priv(dev);
+ usbdev = usbnet->udev;
+
+ pr_debug("USB CDC NCM device VID:0x%4x PID:0x%4x\n",
+ le16_to_cpu(usbdev->descriptor.idVendor),
+ le16_to_cpu(usbdev->descriptor.idProduct));
+
+ /* Check for VID/PID that supports CAIF */
+ if (!(le16_to_cpu(usbdev->descriptor.idVendor) == STE_USB_VID &&
+ le16_to_cpu(usbdev->descriptor.idProduct) == STE_USB_PID_CAIF))
+ return 0;
+
+ if (what == NETDEV_UNREGISTER)
+ module_put(THIS_MODULE);
+
+ if (what != NETDEV_REGISTER)
+ return 0;
+
+ __module_get(THIS_MODULE);
+
+ memset(&common, 0, sizeof(common));
+ common.use_frag = false;
+ common.use_fcs = false;
+ common.use_stx = false;
+ common.link_select = CAIF_LINK_HIGH_BANDW;
+ common.flowctrl = NULL;
+
+ link_support = cfusbl_create(dev->ifindex, dev->dev_addr,
+ dev->broadcast);
+
+ if (!link_support)
+ return -ENOMEM;
+
+ if (dev->num_tx_queues > 1)
+ pr_warn("USB device uses more than one tx queue\n");
+
+ caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN,
+ &layer, &caif_usb_type.func);
+ if (!pack_added)
+ dev_add_pack(&caif_usb_type);
+ pack_added = true;
+
+ strncpy(layer->name, dev->name,
+ sizeof(layer->name) - 1);
+ layer->name[sizeof(layer->name) - 1] = 0;
+
+ return 0;
+}
+
+static struct notifier_block caif_device_notifier = {
+ .notifier_call = cfusbl_device_notify,
+ .priority = 0,
+};
+
+static int __init cfusbl_init(void)
+{
+ return register_netdevice_notifier(&caif_device_notifier);
+}
+
+static void __exit cfusbl_exit(void)
+{
+ unregister_netdevice_notifier(&caif_device_notifier);
+ dev_remove_pack(&caif_usb_type);
+}
+
+module_init(cfusbl_init);
+module_exit(cfusbl_exit);
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
new file mode 100644
index 000000000..fa39fc298
--- /dev/null
+++ b/net/caif/cfcnfg.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+#include <net/caif/cfctrl.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cffrml.h>
+#include <net/caif/cfserl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/caif_dev.h>
+
+#define container_obj(layr) container_of(layr, struct cfcnfg, layer)
+
+/* Information about CAIF physical interfaces held by Config Module in order
+ * to manage physical interfaces
+ */
+struct cfcnfg_phyinfo {
+ struct list_head node;
+ bool up;
+
+ /* Pointer to the layer below the MUX (framing layer) */
+ struct cflayer *frm_layer;
+ /* Pointer to the lowest actual physical layer */
+ struct cflayer *phy_layer;
+ /* Unique identifier of the physical interface */
+ unsigned int id;
+ /* Preference of the physical in interface */
+ enum cfcnfg_phy_preference pref;
+
+ /* Information about the physical device */
+ struct dev_info dev_info;
+
+ /* Interface index */
+ int ifindex;
+
+ /* Protocol head room added for CAIF link layer */
+ int head_room;
+
+ /* Use Start of frame checksum */
+ bool use_fcs;
+};
+
+struct cfcnfg {
+ struct cflayer layer;
+ struct cflayer *ctrl;
+ struct cflayer *mux;
+ struct list_head phys;
+ struct mutex lock;
+};
+
+static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id,
+ enum cfctrl_srv serv, u8 phyid,
+ struct cflayer *adapt_layer);
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id);
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+ struct cflayer *adapt_layer);
+static void cfctrl_resp_func(void);
+static void cfctrl_enum_resp(void);
+
+struct cfcnfg *cfcnfg_create(void)
+{
+ struct cfcnfg *this;
+ struct cfctrl_rsp *resp;
+
+ might_sleep();
+
+ /* Initiate this layer */
+ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ this->mux = cfmuxl_create();
+ if (!this->mux)
+ goto out_of_mem;
+ this->ctrl = cfctrl_create();
+ if (!this->ctrl)
+ goto out_of_mem;
+ /* Initiate response functions */
+ resp = cfctrl_get_respfuncs(this->ctrl);
+ resp->enum_rsp = cfctrl_enum_resp;
+ resp->linkerror_ind = cfctrl_resp_func;
+ resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp;
+ resp->sleep_rsp = cfctrl_resp_func;
+ resp->wake_rsp = cfctrl_resp_func;
+ resp->restart_rsp = cfctrl_resp_func;
+ resp->radioset_rsp = cfctrl_resp_func;
+ resp->linksetup_rsp = cfcnfg_linkup_rsp;
+ resp->reject_rsp = cfcnfg_reject_rsp;
+ INIT_LIST_HEAD(&this->phys);
+
+ cfmuxl_set_uplayer(this->mux, this->ctrl, 0);
+ layer_set_dn(this->ctrl, this->mux);
+ layer_set_up(this->ctrl, this);
+ mutex_init(&this->lock);
+
+ return this;
+out_of_mem:
+ synchronize_rcu();
+
+ kfree(this->mux);
+ kfree(this->ctrl);
+ kfree(this);
+ return NULL;
+}
+
+void cfcnfg_remove(struct cfcnfg *cfg)
+{
+ might_sleep();
+ if (cfg) {
+ synchronize_rcu();
+
+ kfree(cfg->mux);
+ cfctrl_remove(cfg->ctrl);
+ kfree(cfg);
+ }
+}
+
+static void cfctrl_resp_func(void)
+{
+}
+
+static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg,
+ u8 phyid)
+{
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->id == phyid)
+ return phy;
+ return NULL;
+}
+
+static void cfctrl_enum_resp(void)
+{
+}
+
+static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg,
+ enum cfcnfg_phy_preference phy_pref)
+{
+ /* Try to match with specified preference */
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node) {
+ if (phy->up && phy->pref == phy_pref &&
+ phy->frm_layer != NULL)
+
+ return &phy->dev_info;
+ }
+
+ /* Otherwise just return something */
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->up)
+ return &phy->dev_info;
+
+ return NULL;
+}
+
+static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi)
+{
+ struct cfcnfg_phyinfo *phy;
+
+ list_for_each_entry_rcu(phy, &cnfg->phys, node)
+ if (phy->ifindex == ifi && phy->up)
+ return phy->id;
+ return -ENODEV;
+}
+
+int caif_disconnect_client(struct net *net, struct cflayer *adap_layer)
+{
+ u8 channel_id;
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ caif_assert(adap_layer != NULL);
+ cfctrl_cancel_req(cfg->ctrl, adap_layer);
+ channel_id = adap_layer->id;
+ if (channel_id != 0) {
+ struct cflayer *servl;
+ servl = cfmuxl_remove_uplayer(cfg->mux, channel_id);
+ cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer);
+ if (servl != NULL)
+ layer_set_up(servl, NULL);
+ } else
+ pr_debug("nothing to disconnect\n");
+
+ /* Do RCU sync before initiating cleanup */
+ synchronize_rcu();
+ if (adap_layer->ctrlcmd != NULL)
+ adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0);
+ return 0;
+
+}
+EXPORT_SYMBOL(caif_disconnect_client);
+
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id)
+{
+}
+
+static const int protohead[CFCTRL_SRV_MASK] = {
+ [CFCTRL_SRV_VEI] = 4,
+ [CFCTRL_SRV_DATAGRAM] = 7,
+ [CFCTRL_SRV_UTIL] = 4,
+ [CFCTRL_SRV_RFM] = 3,
+ [CFCTRL_SRV_DBG] = 3,
+};
+
+
+static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
+ struct caif_connect_request *s,
+ struct cfctrl_link_param *l)
+{
+ struct dev_info *dev_info;
+ enum cfcnfg_phy_preference pref;
+ int res;
+
+ memset(l, 0, sizeof(*l));
+ /* In caif protocol low value is high priority */
+ l->priority = CAIF_PRIO_MAX - s->priority + 1;
+
+ if (s->ifindex != 0) {
+ res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex);
+ if (res < 0)
+ return res;
+ l->phyid = res;
+ } else {
+ switch (s->link_selector) {
+ case CAIF_LINK_HIGH_BANDW:
+ pref = CFPHYPREF_HIGH_BW;
+ break;
+ case CAIF_LINK_LOW_LATENCY:
+ pref = CFPHYPREF_LOW_LAT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ dev_info = cfcnfg_get_phyid(cnfg, pref);
+ if (dev_info == NULL)
+ return -ENODEV;
+ l->phyid = dev_info->id;
+ }
+ switch (s->protocol) {
+ case CAIFPROTO_AT:
+ l->linktype = CFCTRL_SRV_VEI;
+ l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3;
+ l->chtype = s->sockaddr.u.at.type & 0x3;
+ break;
+ case CAIFPROTO_DATAGRAM:
+ l->linktype = CFCTRL_SRV_DATAGRAM;
+ l->chtype = 0x00;
+ l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+ break;
+ case CAIFPROTO_DATAGRAM_LOOP:
+ l->linktype = CFCTRL_SRV_DATAGRAM;
+ l->chtype = 0x03;
+ l->endpoint = 0x00;
+ l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+ break;
+ case CAIFPROTO_RFM:
+ l->linktype = CFCTRL_SRV_RFM;
+ l->u.datagram.connid = s->sockaddr.u.rfm.connection_id;
+ strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
+ sizeof(l->u.rfm.volume)-1);
+ l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0;
+ break;
+ case CAIFPROTO_UTIL:
+ l->linktype = CFCTRL_SRV_UTIL;
+ l->endpoint = 0x00;
+ l->chtype = 0x00;
+ strncpy(l->u.utility.name, s->sockaddr.u.util.service,
+ sizeof(l->u.utility.name)-1);
+ l->u.utility.name[sizeof(l->u.utility.name)-1] = 0;
+ caif_assert(sizeof(l->u.utility.name) > 10);
+ l->u.utility.paramlen = s->param.size;
+ if (l->u.utility.paramlen > sizeof(l->u.utility.params))
+ l->u.utility.paramlen = sizeof(l->u.utility.params);
+
+ memcpy(l->u.utility.params, s->param.data,
+ l->u.utility.paramlen);
+
+ break;
+ case CAIFPROTO_DEBUG:
+ l->linktype = CFCTRL_SRV_DBG;
+ l->endpoint = s->sockaddr.u.dbg.service;
+ l->chtype = s->sockaddr.u.dbg.type;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int caif_connect_client(struct net *net, struct caif_connect_request *conn_req,
+ struct cflayer *adap_layer, int *ifindex,
+ int *proto_head, int *proto_tail)
+{
+ struct cflayer *frml;
+ struct cfcnfg_phyinfo *phy;
+ int err;
+ struct cfctrl_link_param param;
+ struct cfcnfg *cfg = get_cfcnfg(net);
+
+ rcu_read_lock();
+ err = caif_connect_req_to_link_param(cfg, conn_req, &param);
+ if (err)
+ goto unlock;
+
+ phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid);
+ if (!phy) {
+ err = -ENODEV;
+ goto unlock;
+ }
+ err = -EINVAL;
+
+ if (adap_layer == NULL) {
+ pr_err("adap_layer is zero\n");
+ goto unlock;
+ }
+ if (adap_layer->receive == NULL) {
+ pr_err("adap_layer->receive is NULL\n");
+ goto unlock;
+ }
+ if (adap_layer->ctrlcmd == NULL) {
+ pr_err("adap_layer->ctrlcmd == NULL\n");
+ goto unlock;
+ }
+
+ err = -ENODEV;
+ frml = phy->frm_layer;
+ if (frml == NULL) {
+ pr_err("Specified PHY type does not exist!\n");
+ goto unlock;
+ }
+ caif_assert(param.phyid == phy->id);
+ caif_assert(phy->frm_layer->id ==
+ param.phyid);
+ caif_assert(phy->phy_layer->id ==
+ param.phyid);
+
+ *ifindex = phy->ifindex;
+ *proto_tail = 2;
+ *proto_head = protohead[param.linktype] + phy->head_room;
+
+ rcu_read_unlock();
+
+ /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */
+ cfctrl_enum_req(cfg->ctrl, param.phyid);
+ return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer);
+
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(caif_connect_client);
+
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+ struct cflayer *adapt_layer)
+{
+ if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
+ adapt_layer->ctrlcmd(adapt_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+}
+
+static void
+cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
+ u8 phyid, struct cflayer *adapt_layer)
+{
+ struct cfcnfg *cnfg = container_obj(layer);
+ struct cflayer *servicel = NULL;
+ struct cfcnfg_phyinfo *phyinfo;
+ struct net_device *netdev;
+
+ if (channel_id == 0) {
+ pr_warn("received channel_id zero\n");
+ if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
+ adapt_layer->ctrlcmd(adapt_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+ return;
+ }
+
+ rcu_read_lock();
+
+ if (adapt_layer == NULL) {
+ pr_debug("link setup response but no client exist,"
+ "send linkdown back\n");
+ cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
+ goto unlock;
+ }
+
+ caif_assert(cnfg != NULL);
+ caif_assert(phyid != 0);
+
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+ if (phyinfo == NULL) {
+ pr_err("ERROR: Link Layer Device disappeared"
+ "while connecting\n");
+ goto unlock;
+ }
+
+ caif_assert(phyinfo != NULL);
+ caif_assert(phyinfo->id == phyid);
+ caif_assert(phyinfo->phy_layer != NULL);
+ caif_assert(phyinfo->phy_layer->id == phyid);
+
+ adapt_layer->id = channel_id;
+
+ switch (serv) {
+ case CFCTRL_SRV_VEI:
+ servicel = cfvei_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_DATAGRAM:
+ servicel = cfdgml_create(channel_id,
+ &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_RFM:
+ netdev = phyinfo->dev_info.dev;
+ servicel = cfrfml_create(channel_id, &phyinfo->dev_info,
+ netdev->mtu);
+ break;
+ case CFCTRL_SRV_UTIL:
+ servicel = cfutill_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_VIDEO:
+ servicel = cfvidl_create(channel_id, &phyinfo->dev_info);
+ break;
+ case CFCTRL_SRV_DBG:
+ servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
+ break;
+ default:
+ pr_err("Protocol error. Link setup response "
+ "- unknown channel type\n");
+ goto unlock;
+ }
+ if (!servicel)
+ goto unlock;
+ layer_set_dn(servicel, cnfg->mux);
+ cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id);
+ layer_set_up(servicel, adapt_layer);
+ layer_set_dn(adapt_layer, servicel);
+
+ rcu_read_unlock();
+
+ servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0);
+ return;
+unlock:
+ rcu_read_unlock();
+}
+
+void
+cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
+ struct net_device *dev, struct cflayer *phy_layer,
+ enum cfcnfg_phy_preference pref,
+ struct cflayer *link_support,
+ bool fcs, int head_room)
+{
+ struct cflayer *frml;
+ struct cfcnfg_phyinfo *phyinfo = NULL;
+ int i;
+ u8 phyid;
+
+ mutex_lock(&cnfg->lock);
+
+ /* CAIF protocol allow maximum 6 link-layers */
+ for (i = 0; i < 7; i++) {
+ phyid = (dev->ifindex + i) & 0x7;
+ if (phyid == 0)
+ continue;
+ if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL)
+ goto got_phyid;
+ }
+ pr_warn("Too many CAIF Link Layers (max 6)\n");
+ goto out;
+
+got_phyid:
+ phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC);
+ if (!phyinfo)
+ goto out_err;
+
+ phy_layer->id = phyid;
+ phyinfo->pref = pref;
+ phyinfo->id = phyid;
+ phyinfo->dev_info.id = phyid;
+ phyinfo->dev_info.dev = dev;
+ phyinfo->phy_layer = phy_layer;
+ phyinfo->ifindex = dev->ifindex;
+ phyinfo->head_room = head_room;
+ phyinfo->use_fcs = fcs;
+
+ frml = cffrml_create(phyid, fcs);
+
+ if (!frml)
+ goto out_err;
+ phyinfo->frm_layer = frml;
+ layer_set_up(frml, cnfg->mux);
+
+ if (link_support != NULL) {
+ link_support->id = phyid;
+ layer_set_dn(frml, link_support);
+ layer_set_up(link_support, frml);
+ layer_set_dn(link_support, phy_layer);
+ layer_set_up(phy_layer, link_support);
+ } else {
+ layer_set_dn(frml, phy_layer);
+ layer_set_up(phy_layer, frml);
+ }
+
+ list_add_rcu(&phyinfo->node, &cnfg->phys);
+out:
+ mutex_unlock(&cnfg->lock);
+ return;
+
+out_err:
+ kfree(phyinfo);
+ mutex_unlock(&cnfg->lock);
+}
+EXPORT_SYMBOL(cfcnfg_add_phy_layer);
+
+int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer,
+ bool up)
+{
+ struct cfcnfg_phyinfo *phyinfo;
+
+ rcu_read_lock();
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id);
+ if (phyinfo == NULL) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ if (phyinfo->up == up) {
+ rcu_read_unlock();
+ return 0;
+ }
+ phyinfo->up = up;
+
+ if (up) {
+ cffrml_hold(phyinfo->frm_layer);
+ cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer,
+ phy_layer->id);
+ } else {
+ cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id);
+ cffrml_put(phyinfo->frm_layer);
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(cfcnfg_set_phy_state);
+
+int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer)
+{
+ struct cflayer *frml, *frml_dn;
+ u16 phyid;
+ struct cfcnfg_phyinfo *phyinfo;
+
+ might_sleep();
+
+ mutex_lock(&cnfg->lock);
+
+ phyid = phy_layer->id;
+ phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+
+ if (phyinfo == NULL) {
+ mutex_unlock(&cnfg->lock);
+ return 0;
+ }
+ caif_assert(phyid == phyinfo->id);
+ caif_assert(phy_layer == phyinfo->phy_layer);
+ caif_assert(phy_layer->id == phyid);
+ caif_assert(phyinfo->frm_layer->id == phyid);
+
+ list_del_rcu(&phyinfo->node);
+ synchronize_rcu();
+
+ /* Fail if reference count is not zero */
+ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) {
+ pr_info("Wait for device inuse\n");
+ list_add_rcu(&phyinfo->node, &cnfg->phys);
+ mutex_unlock(&cnfg->lock);
+ return -EAGAIN;
+ }
+
+ frml = phyinfo->frm_layer;
+ frml_dn = frml->dn;
+ cffrml_set_uplayer(frml, NULL);
+ cffrml_set_dnlayer(frml, NULL);
+ if (phy_layer != frml_dn) {
+ layer_set_up(frml_dn, NULL);
+ layer_set_dn(frml_dn, NULL);
+ }
+ layer_set_up(phy_layer, NULL);
+
+ if (phyinfo->phy_layer != frml_dn)
+ kfree(frml_dn);
+
+ cffrml_free(frml);
+ kfree(phyinfo);
+ mutex_unlock(&cnfg->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(cfcnfg_del_phy_layer);
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
new file mode 100644
index 000000000..f5afda1ab
--- /dev/null
+++ b/net/caif/cfctrl.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfctrl.h>
+
+#define container_obj(layr) container_of(layr, struct cfctrl, serv.layer)
+#define UTILITY_NAME_LENGTH 16
+#define CFPKT_CTRL_PKT_LEN 20
+
+#ifdef CAIF_NO_LOOP
+static int handle_loop(struct cfctrl *ctrl,
+ int cmd, struct cfpkt *pkt){
+ return -1;
+}
+#else
+static int handle_loop(struct cfctrl *ctrl,
+ int cmd, struct cfpkt *pkt);
+#endif
+static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt);
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+
+struct cflayer *cfctrl_create(void)
+{
+ struct dev_info dev_info;
+ struct cfctrl *this =
+ kzalloc(sizeof(struct cfctrl), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+ memset(&dev_info, 0, sizeof(dev_info));
+ dev_info.id = 0xff;
+ cfsrvl_init(&this->serv, 0, &dev_info, false);
+ atomic_set(&this->req_seq_no, 1);
+ atomic_set(&this->rsp_seq_no, 1);
+ this->serv.layer.receive = cfctrl_recv;
+ sprintf(this->serv.layer.name, "ctrl");
+ this->serv.layer.ctrlcmd = cfctrl_ctrlcmd;
+#ifndef CAIF_NO_LOOP
+ spin_lock_init(&this->loop_linkid_lock);
+ this->loop_linkid = 1;
+#endif
+ spin_lock_init(&this->info_list_lock);
+ INIT_LIST_HEAD(&this->list);
+ return &this->serv.layer;
+}
+
+void cfctrl_remove(struct cflayer *layer)
+{
+ struct cfctrl_request_info *p, *tmp;
+ struct cfctrl *ctrl = container_obj(layer);
+
+ spin_lock_bh(&ctrl->info_list_lock);
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ list_del(&p->list);
+ kfree(p);
+ }
+ spin_unlock_bh(&ctrl->info_list_lock);
+ kfree(layer);
+}
+
+static bool param_eq(const struct cfctrl_link_param *p1,
+ const struct cfctrl_link_param *p2)
+{
+ bool eq =
+ p1->linktype == p2->linktype &&
+ p1->priority == p2->priority &&
+ p1->phyid == p2->phyid &&
+ p1->endpoint == p2->endpoint && p1->chtype == p2->chtype;
+
+ if (!eq)
+ return false;
+
+ switch (p1->linktype) {
+ case CFCTRL_SRV_VEI:
+ return true;
+ case CFCTRL_SRV_DATAGRAM:
+ return p1->u.datagram.connid == p2->u.datagram.connid;
+ case CFCTRL_SRV_RFM:
+ return
+ p1->u.rfm.connid == p2->u.rfm.connid &&
+ strcmp(p1->u.rfm.volume, p2->u.rfm.volume) == 0;
+ case CFCTRL_SRV_UTIL:
+ return
+ p1->u.utility.fifosize_kb == p2->u.utility.fifosize_kb
+ && p1->u.utility.fifosize_bufs ==
+ p2->u.utility.fifosize_bufs
+ && strcmp(p1->u.utility.name, p2->u.utility.name) == 0
+ && p1->u.utility.paramlen == p2->u.utility.paramlen
+ && memcmp(p1->u.utility.params, p2->u.utility.params,
+ p1->u.utility.paramlen) == 0;
+
+ case CFCTRL_SRV_VIDEO:
+ return p1->u.video.connid == p2->u.video.connid;
+ case CFCTRL_SRV_DBG:
+ return true;
+ case CFCTRL_SRV_DECM:
+ return false;
+ default:
+ return false;
+ }
+ return false;
+}
+
+static bool cfctrl_req_eq(const struct cfctrl_request_info *r1,
+ const struct cfctrl_request_info *r2)
+{
+ if (r1->cmd != r2->cmd)
+ return false;
+ if (r1->cmd == CFCTRL_CMD_LINK_SETUP)
+ return param_eq(&r1->param, &r2->param);
+ else
+ return r1->channel_id == r2->channel_id;
+}
+
+/* Insert request at the end */
+static void cfctrl_insert_req(struct cfctrl *ctrl,
+ struct cfctrl_request_info *req)
+{
+ spin_lock_bh(&ctrl->info_list_lock);
+ atomic_inc(&ctrl->req_seq_no);
+ req->sequence_no = atomic_read(&ctrl->req_seq_no);
+ list_add_tail(&req->list, &ctrl->list);
+ spin_unlock_bh(&ctrl->info_list_lock);
+}
+
+/* Compare and remove request */
+static struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
+ struct cfctrl_request_info *req)
+{
+ struct cfctrl_request_info *p, *tmp, *first;
+
+ first = list_first_entry(&ctrl->list, struct cfctrl_request_info, list);
+
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ if (cfctrl_req_eq(req, p)) {
+ if (p != first)
+ pr_warn("Requests are not received in order\n");
+
+ atomic_set(&ctrl->rsp_seq_no,
+ p->sequence_no);
+ list_del(&p->list);
+ goto out;
+ }
+ }
+ p = NULL;
+out:
+ return p;
+}
+
+struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer)
+{
+ struct cfctrl *this = container_obj(layer);
+ return &this->res;
+}
+
+static void init_info(struct caif_payload_info *info, struct cfctrl *cfctrl)
+{
+ info->hdr_len = 0;
+ info->channel_id = cfctrl->serv.layer.id;
+ info->dev_info = &cfctrl->serv.dev_info;
+}
+
+void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
+{
+ struct cfpkt *pkt;
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+
+ if (!dn) {
+ pr_debug("not able to send enum request\n");
+ return;
+ }
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return;
+ caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+ init_info(cfpkt_info(pkt), cfctrl);
+ cfpkt_info(pkt)->dev_info->id = physlinkid;
+ cfctrl->serv.dev_info.id = physlinkid;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_ENUM);
+ cfpkt_addbdy(pkt, physlinkid);
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ dn->transmit(dn, pkt);
+}
+
+int cfctrl_linkup_request(struct cflayer *layer,
+ struct cfctrl_link_param *param,
+ struct cflayer *user_layer)
+{
+ struct cfctrl *cfctrl = container_obj(layer);
+ u32 tmp32;
+ u16 tmp16;
+ u8 tmp8;
+ struct cfctrl_request_info *req;
+ int ret;
+ char utility_name[16];
+ struct cfpkt *pkt;
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+
+ if (!dn) {
+ pr_debug("not able to send linkup request\n");
+ return -ENODEV;
+ }
+
+ if (cfctrl_cancel_req(layer, user_layer) > 0) {
+ /* Slight Paranoia, check if already connecting */
+ pr_err("Duplicate connect request for same client\n");
+ WARN_ON(1);
+ return -EALREADY;
+ }
+
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return -ENOMEM;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
+ cfpkt_addbdy(pkt, (param->chtype << 4) | param->linktype);
+ cfpkt_addbdy(pkt, (param->priority << 3) | param->phyid);
+ cfpkt_addbdy(pkt, param->endpoint & 0x03);
+
+ switch (param->linktype) {
+ case CFCTRL_SRV_VEI:
+ break;
+ case CFCTRL_SRV_VIDEO:
+ cfpkt_addbdy(pkt, (u8) param->u.video.connid);
+ break;
+ case CFCTRL_SRV_DBG:
+ break;
+ case CFCTRL_SRV_DATAGRAM:
+ tmp32 = cpu_to_le32(param->u.datagram.connid);
+ cfpkt_add_body(pkt, &tmp32, 4);
+ break;
+ case CFCTRL_SRV_RFM:
+ /* Construct a frame, convert DatagramConnectionID to network
+ * format long and copy it out...
+ */
+ tmp32 = cpu_to_le32(param->u.rfm.connid);
+ cfpkt_add_body(pkt, &tmp32, 4);
+ /* Add volume name, including zero termination... */
+ cfpkt_add_body(pkt, param->u.rfm.volume,
+ strlen(param->u.rfm.volume) + 1);
+ break;
+ case CFCTRL_SRV_UTIL:
+ tmp16 = cpu_to_le16(param->u.utility.fifosize_kb);
+ cfpkt_add_body(pkt, &tmp16, 2);
+ tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
+ cfpkt_add_body(pkt, &tmp16, 2);
+ memset(utility_name, 0, sizeof(utility_name));
+ strncpy(utility_name, param->u.utility.name,
+ UTILITY_NAME_LENGTH - 1);
+ cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
+ tmp8 = param->u.utility.paramlen;
+ cfpkt_add_body(pkt, &tmp8, 1);
+ cfpkt_add_body(pkt, param->u.utility.params,
+ param->u.utility.paramlen);
+ break;
+ default:
+ pr_warn("Request setup of bad link type = %d\n",
+ param->linktype);
+ return -EINVAL;
+ }
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+ req->client_layer = user_layer;
+ req->cmd = CFCTRL_CMD_LINK_SETUP;
+ req->param = *param;
+ cfctrl_insert_req(cfctrl, req);
+ init_info(cfpkt_info(pkt), cfctrl);
+ /*
+ * NOTE:Always send linkup and linkdown request on the same
+ * device as the payload. Otherwise old queued up payload
+ * might arrive with the newly allocated channel ID.
+ */
+ cfpkt_info(pkt)->dev_info->id = param->phyid;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ ret =
+ dn->transmit(dn, pkt);
+ if (ret < 0) {
+ int count;
+
+ count = cfctrl_cancel_req(&cfctrl->serv.layer,
+ user_layer);
+ if (count != 1) {
+ pr_err("Could not remove request (%d)", count);
+ return -ENODEV;
+ }
+ }
+ return 0;
+}
+
+int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
+ struct cflayer *client)
+{
+ int ret;
+ struct cfpkt *pkt;
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+
+ if (!dn) {
+ pr_debug("not able to send link-down request\n");
+ return -ENODEV;
+ }
+ pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+ if (!pkt)
+ return -ENOMEM;
+ cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
+ cfpkt_addbdy(pkt, channelid);
+ init_info(cfpkt_info(pkt), cfctrl);
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ ret =
+ dn->transmit(dn, pkt);
+#ifndef CAIF_NO_LOOP
+ cfctrl->loop_linkused[channelid] = 0;
+#endif
+ return ret;
+}
+
+int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
+{
+ struct cfctrl_request_info *p, *tmp;
+ struct cfctrl *ctrl = container_obj(layr);
+ int found = 0;
+ spin_lock_bh(&ctrl->info_list_lock);
+
+ list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
+ if (p->client_layer == adap_layer) {
+ list_del(&p->list);
+ kfree(p);
+ found++;
+ }
+ }
+
+ spin_unlock_bh(&ctrl->info_list_lock);
+ return found;
+}
+
+static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
+{
+ u8 cmdrsp;
+ u8 cmd;
+ int ret = -1;
+ u16 tmp16;
+ u8 len;
+ u8 param[255];
+ u8 linkid;
+ struct cfctrl *cfctrl = container_obj(layer);
+ struct cfctrl_request_info rsp, *req;
+
+
+ cfpkt_extr_head(pkt, &cmdrsp, 1);
+ cmd = cmdrsp & CFCTRL_CMD_MASK;
+ if (cmd != CFCTRL_CMD_LINK_ERR
+ && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp)
+ && CFCTRL_ERR_BIT != (CFCTRL_ERR_BIT & cmdrsp)) {
+ if (handle_loop(cfctrl, cmd, pkt) != 0)
+ cmdrsp |= CFCTRL_ERR_BIT;
+ }
+
+ switch (cmd) {
+ case CFCTRL_CMD_LINK_SETUP:
+ {
+ enum cfctrl_srv serv;
+ enum cfctrl_srv servtype;
+ u8 endpoint;
+ u8 physlinkid;
+ u8 prio;
+ u8 tmp;
+ u32 tmp32;
+ u8 *cp;
+ int i;
+ struct cfctrl_link_param linkparam;
+ memset(&linkparam, 0, sizeof(linkparam));
+
+ cfpkt_extr_head(pkt, &tmp, 1);
+
+ serv = tmp & CFCTRL_SRV_MASK;
+ linkparam.linktype = serv;
+
+ servtype = tmp >> 4;
+ linkparam.chtype = servtype;
+
+ cfpkt_extr_head(pkt, &tmp, 1);
+ physlinkid = tmp & 0x07;
+ prio = tmp >> 3;
+
+ linkparam.priority = prio;
+ linkparam.phyid = physlinkid;
+ cfpkt_extr_head(pkt, &endpoint, 1);
+ linkparam.endpoint = endpoint & 0x03;
+
+ switch (serv) {
+ case CFCTRL_SRV_VEI:
+ case CFCTRL_SRV_DBG:
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ cfpkt_extr_head(pkt, &linkid, 1);
+ break;
+ case CFCTRL_SRV_VIDEO:
+ cfpkt_extr_head(pkt, &tmp, 1);
+ linkparam.u.video.connid = tmp;
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ cfpkt_extr_head(pkt, &linkid, 1);
+ break;
+
+ case CFCTRL_SRV_DATAGRAM:
+ cfpkt_extr_head(pkt, &tmp32, 4);
+ linkparam.u.datagram.connid =
+ le32_to_cpu(tmp32);
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ cfpkt_extr_head(pkt, &linkid, 1);
+ break;
+ case CFCTRL_SRV_RFM:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ cfpkt_extr_head(pkt, &tmp32, 4);
+ linkparam.u.rfm.connid =
+ le32_to_cpu(tmp32);
+ cp = (u8 *) linkparam.u.rfm.volume;
+ for (cfpkt_extr_head(pkt, &tmp, 1);
+ cfpkt_more(pkt) && tmp != '\0';
+ cfpkt_extr_head(pkt, &tmp, 1))
+ *cp++ = tmp;
+ *cp = '\0';
+
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ cfpkt_extr_head(pkt, &linkid, 1);
+
+ break;
+ case CFCTRL_SRV_UTIL:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ /* Fifosize KB */
+ cfpkt_extr_head(pkt, &tmp16, 2);
+ linkparam.u.utility.fifosize_kb =
+ le16_to_cpu(tmp16);
+ /* Fifosize bufs */
+ cfpkt_extr_head(pkt, &tmp16, 2);
+ linkparam.u.utility.fifosize_bufs =
+ le16_to_cpu(tmp16);
+ /* name */
+ cp = (u8 *) linkparam.u.utility.name;
+ caif_assert(sizeof(linkparam.u.utility.name)
+ >= UTILITY_NAME_LENGTH);
+ for (i = 0;
+ i < UTILITY_NAME_LENGTH
+ && cfpkt_more(pkt); i++) {
+ cfpkt_extr_head(pkt, &tmp, 1);
+ *cp++ = tmp;
+ }
+ /* Length */
+ cfpkt_extr_head(pkt, &len, 1);
+ linkparam.u.utility.paramlen = len;
+ /* Param Data */
+ cp = linkparam.u.utility.params;
+ while (cfpkt_more(pkt) && len--) {
+ cfpkt_extr_head(pkt, &tmp, 1);
+ *cp++ = tmp;
+ }
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ cfpkt_extr_head(pkt, &linkid, 1);
+ /* Length */
+ cfpkt_extr_head(pkt, &len, 1);
+ /* Param Data */
+ cfpkt_extr_head(pkt, &param, len);
+ break;
+ default:
+ pr_warn("Request setup, invalid type (%d)\n",
+ serv);
+ goto error;
+ }
+
+ rsp.cmd = cmd;
+ rsp.param = linkparam;
+ spin_lock_bh(&cfctrl->info_list_lock);
+ req = cfctrl_remove_req(cfctrl, &rsp);
+
+ if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
+ cfpkt_erroneous(pkt)) {
+ pr_err("Invalid O/E bit or parse error "
+ "on CAIF control channel\n");
+ cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
+ 0,
+ req ? req->client_layer
+ : NULL);
+ } else {
+ cfctrl->res.linksetup_rsp(cfctrl->serv.
+ layer.up, linkid,
+ serv, physlinkid,
+ req ? req->
+ client_layer : NULL);
+ }
+
+ kfree(req);
+
+ spin_unlock_bh(&cfctrl->info_list_lock);
+ }
+ break;
+ case CFCTRL_CMD_LINK_DESTROY:
+ cfpkt_extr_head(pkt, &linkid, 1);
+ cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
+ break;
+ case CFCTRL_CMD_LINK_ERR:
+ pr_err("Frame Error Indication received\n");
+ cfctrl->res.linkerror_ind();
+ break;
+ case CFCTRL_CMD_ENUM:
+ cfctrl->res.enum_rsp();
+ break;
+ case CFCTRL_CMD_SLEEP:
+ cfctrl->res.sleep_rsp();
+ break;
+ case CFCTRL_CMD_WAKE:
+ cfctrl->res.wake_rsp();
+ break;
+ case CFCTRL_CMD_LINK_RECONF:
+ cfctrl->res.restart_rsp();
+ break;
+ case CFCTRL_CMD_RADIO_SET:
+ cfctrl->res.radioset_rsp();
+ break;
+ default:
+ pr_err("Unrecognized Control Frame\n");
+ goto error;
+ }
+ ret = 0;
+error:
+ cfpkt_destroy(pkt);
+ return ret;
+}
+
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfctrl *this = container_obj(layr);
+ switch (ctrl) {
+ case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ spin_lock_bh(&this->info_list_lock);
+ if (!list_empty(&this->list))
+ pr_debug("Received flow off in control layer\n");
+ spin_unlock_bh(&this->info_list_lock);
+ break;
+ case _CAIF_CTRLCMD_PHYIF_DOWN_IND: {
+ struct cfctrl_request_info *p, *tmp;
+
+ /* Find all connect request and report failure */
+ spin_lock_bh(&this->info_list_lock);
+ list_for_each_entry_safe(p, tmp, &this->list, list) {
+ if (p->param.phyid == phyid) {
+ list_del(&p->list);
+ p->client_layer->ctrlcmd(p->client_layer,
+ CAIF_CTRLCMD_INIT_FAIL_RSP,
+ phyid);
+ kfree(p);
+ }
+ }
+ spin_unlock_bh(&this->info_list_lock);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+#ifndef CAIF_NO_LOOP
+static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
+{
+ static int last_linkid;
+ static int dec;
+ u8 linkid, linktype, tmp;
+ switch (cmd) {
+ case CFCTRL_CMD_LINK_SETUP:
+ spin_lock_bh(&ctrl->loop_linkid_lock);
+ if (!dec) {
+ for (linkid = last_linkid + 1; linkid < 254; linkid++)
+ if (!ctrl->loop_linkused[linkid])
+ goto found;
+ }
+ dec = 1;
+ for (linkid = last_linkid - 1; linkid > 1; linkid--)
+ if (!ctrl->loop_linkused[linkid])
+ goto found;
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ return -1;
+found:
+ if (linkid < 10)
+ dec = 0;
+
+ if (!ctrl->loop_linkused[linkid])
+ ctrl->loop_linkused[linkid] = 1;
+
+ last_linkid = linkid;
+
+ cfpkt_add_trail(pkt, &linkid, 1);
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ cfpkt_peek_head(pkt, &linktype, 1);
+ if (linktype == CFCTRL_SRV_UTIL) {
+ tmp = 0x01;
+ cfpkt_add_trail(pkt, &tmp, 1);
+ cfpkt_add_trail(pkt, &tmp, 1);
+ }
+ break;
+
+ case CFCTRL_CMD_LINK_DESTROY:
+ spin_lock_bh(&ctrl->loop_linkid_lock);
+ cfpkt_peek_head(pkt, &linkid, 1);
+ ctrl->loop_linkused[linkid] = 0;
+ spin_unlock_bh(&ctrl->loop_linkid_lock);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+#endif
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
new file mode 100644
index 000000000..7aae0b568
--- /dev/null
+++ b/net/caif/cfdbgl.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *dbg = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!dbg)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(dbg, channel_id, dev_info, false);
+ dbg->layer.receive = cfdbgl_receive;
+ dbg->layer.transmit = cfdbgl_transmit;
+ snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id);
+ return &dbg->layer;
+}
+
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfsrvl *service = container_obj(layr);
+ struct caif_payload_info *info;
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ /* Add info for MUX-layer to route the packet out */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->dev_info = &service->dev_info;
+
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
new file mode 100644
index 000000000..3bdddb32d
--- /dev/null
+++ b/net/caif/cfdgml.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+#define DGM_CMD_BIT 0x80
+#define DGM_FLOW_OFF 0x81
+#define DGM_FLOW_ON 0x80
+#define DGM_MTU 1500
+
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *dgm = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!dgm)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(dgm, channel_id, dev_info, true);
+ dgm->layer.receive = cfdgml_receive;
+ dgm->layer.transmit = cfdgml_transmit;
+ snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id);
+ dgm->layer.name[CAIF_LAYER_NAME_SZ - 1] = '\0';
+ return &dgm->layer;
+}
+
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd = -1;
+ u8 dgmhdr[3];
+ int ret;
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ caif_assert(layr->ctrlcmd != NULL);
+
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if ((cmd & DGM_CMD_BIT) == 0) {
+ if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ ret = layr->up->receive(layr->up, pkt);
+ return ret;
+ }
+
+ switch (cmd) {
+ case DGM_FLOW_OFF: /* FLOW OFF */
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case DGM_FLOW_ON: /* FLOW ON */
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ default:
+ cfpkt_destroy(pkt);
+ pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
+ return -EPROTO;
+ }
+}
+
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 packet_type;
+ u32 zero = 0;
+ struct caif_payload_info *info;
+ struct cfsrvl *service = container_obj(layr);
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ /* STE Modem cannot handle more than 1500 bytes datagrams */
+ if (cfpkt_getlen(pkt) > DGM_MTU) {
+ cfpkt_destroy(pkt);
+ return -EMSGSIZE;
+ }
+
+ cfpkt_add_head(pkt, &zero, 3);
+ packet_type = 0x08; /* B9 set - UNCLASSIFIED */
+ cfpkt_add_head(pkt, &packet_type, 1);
+
+ /* Add info for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ /* To optimize alignment, we add up the size of CAIF header
+ * before payload.
+ */
+ info->hdr_len = 4;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
new file mode 100644
index 000000000..434ba8557
--- /dev/null
+++ b/net/caif/cffrml.c
@@ -0,0 +1,197 @@
+/*
+ * CAIF Framing Layer.
+ *
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/crc-ccitt.h>
+#include <linux/netdevice.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cffrml, layer)
+
+struct cffrml {
+ struct cflayer layer;
+ bool dofcs; /* !< FCS active */
+ int __percpu *pcpu_refcnt;
+};
+
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+static u32 cffrml_rcv_error;
+static u32 cffrml_rcv_checsum_error;
+struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
+{
+ struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ this->pcpu_refcnt = alloc_percpu(int);
+ if (this->pcpu_refcnt == NULL) {
+ kfree(this);
+ return NULL;
+ }
+
+ caif_assert(offsetof(struct cffrml, layer) == 0);
+
+ this->layer.receive = cffrml_receive;
+ this->layer.transmit = cffrml_transmit;
+ this->layer.ctrlcmd = cffrml_ctrlcmd;
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid);
+ this->dofcs = use_fcs;
+ this->layer.id = phyid;
+ return (struct cflayer *) this;
+}
+
+void cffrml_free(struct cflayer *layer)
+{
+ struct cffrml *this = container_obj(layer);
+ free_percpu(this->pcpu_refcnt);
+ kfree(layer);
+}
+
+void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up)
+{
+ this->up = up;
+}
+
+void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn)
+{
+ this->dn = dn;
+}
+
+static u16 cffrml_checksum(u16 chks, void *buf, u16 len)
+{
+ /* FIXME: FCS should be moved to glue in order to use OS-Specific
+ * solutions
+ */
+ return crc_ccitt(chks, buf, len);
+}
+
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u16 tmp;
+ u16 len;
+ u16 hdrchks;
+ int pktchks;
+ struct cffrml *this;
+ this = container_obj(layr);
+
+ cfpkt_extr_head(pkt, &tmp, 2);
+ len = le16_to_cpu(tmp);
+
+ /* Subtract for FCS on length if FCS is not used. */
+ if (!this->dofcs)
+ len -= 2;
+
+ if (cfpkt_setlen(pkt, len) < 0) {
+ ++cffrml_rcv_error;
+ pr_err("Framing length error (%d)\n", len);
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ /*
+ * Don't do extract if FCS is false, rather do setlen - then we don't
+ * get a cache-miss.
+ */
+ if (this->dofcs) {
+ cfpkt_extr_trail(pkt, &tmp, 2);
+ hdrchks = le16_to_cpu(tmp);
+ pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+ if (pktchks != hdrchks) {
+ cfpkt_add_trail(pkt, &tmp, 2);
+ ++cffrml_rcv_error;
+ ++cffrml_rcv_checsum_error;
+ pr_info("Frame checksum error (0x%x != 0x%x)\n",
+ hdrchks, pktchks);
+ return -EILSEQ;
+ }
+ }
+ if (cfpkt_erroneous(pkt)) {
+ ++cffrml_rcv_error;
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if (layr->up == NULL) {
+ pr_err("Layr up is missing!\n");
+ cfpkt_destroy(pkt);
+ return -EINVAL;
+ }
+
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u16 chks;
+ u16 len;
+ __le16 data;
+
+ struct cffrml *this = container_obj(layr);
+ if (this->dofcs) {
+ chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+ data = cpu_to_le16(chks);
+ cfpkt_add_trail(pkt, &data, 2);
+ } else {
+ cfpkt_pad_trail(pkt, 2);
+ }
+ len = cfpkt_getlen(pkt);
+ data = cpu_to_le16(len);
+ cfpkt_add_head(pkt, &data, 2);
+ cfpkt_info(pkt)->hdr_len += 2;
+ if (cfpkt_erroneous(pkt)) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ if (layr->dn == NULL) {
+ cfpkt_destroy(pkt);
+ return -ENODEV;
+
+ }
+ return layr->dn->transmit(layr->dn, pkt);
+}
+
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ if (layr->up && layr->up->ctrlcmd)
+ layr->up->ctrlcmd(layr->up, ctrl, layr->id);
+}
+
+void cffrml_put(struct cflayer *layr)
+{
+ struct cffrml *this = container_obj(layr);
+ if (layr != NULL && this->pcpu_refcnt != NULL)
+ this_cpu_dec(*this->pcpu_refcnt);
+}
+
+void cffrml_hold(struct cflayer *layr)
+{
+ struct cffrml *this = container_obj(layr);
+ if (layr != NULL && this->pcpu_refcnt != NULL)
+ this_cpu_inc(*this->pcpu_refcnt);
+}
+
+int cffrml_refcnt_read(struct cflayer *layr)
+{
+ int i, refcnt = 0;
+ struct cffrml *this = container_obj(layr);
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(this->pcpu_refcnt, i);
+ return refcnt;
+}
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
new file mode 100644
index 000000000..510aa5a75
--- /dev/null
+++ b/net/caif/cfmuxl.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cfmuxl, layer)
+
+#define CAIF_CTRL_CHANNEL 0
+#define UP_CACHE_SIZE 8
+#define DN_CACHE_SIZE 8
+
+struct cfmuxl {
+ struct cflayer layer;
+ struct list_head srvl_list;
+ struct list_head frml_list;
+ struct cflayer *up_cache[UP_CACHE_SIZE];
+ struct cflayer *dn_cache[DN_CACHE_SIZE];
+ /*
+ * Set when inserting or removing downwards layers.
+ */
+ spinlock_t transmit_lock;
+
+ /*
+ * Set when inserting or removing upwards layers.
+ */
+ spinlock_t receive_lock;
+
+};
+
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id);
+
+struct cflayer *cfmuxl_create(void)
+{
+ struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+ this->layer.receive = cfmuxl_receive;
+ this->layer.transmit = cfmuxl_transmit;
+ this->layer.ctrlcmd = cfmuxl_ctrlcmd;
+ INIT_LIST_HEAD(&this->srvl_list);
+ INIT_LIST_HEAD(&this->frml_list);
+ spin_lock_init(&this->transmit_lock);
+ spin_lock_init(&this->receive_lock);
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux");
+ return &this->layer;
+}
+
+int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid)
+{
+ struct cfmuxl *muxl = (struct cfmuxl *) layr;
+
+ spin_lock_bh(&muxl->transmit_lock);
+ list_add_rcu(&dn->node, &muxl->frml_list);
+ spin_unlock_bh(&muxl->transmit_lock);
+ return 0;
+}
+
+static struct cflayer *get_from_id(struct list_head *list, u16 id)
+{
+ struct cflayer *lyr;
+ list_for_each_entry_rcu(lyr, list, node) {
+ if (lyr->id == id)
+ return lyr;
+ }
+
+ return NULL;
+}
+
+int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *old;
+
+ spin_lock_bh(&muxl->receive_lock);
+
+ /* Two entries with same id is wrong, so remove old layer from mux */
+ old = get_from_id(&muxl->srvl_list, linkid);
+ if (old != NULL)
+ list_del_rcu(&old->node);
+
+ list_add_rcu(&up->node, &muxl->srvl_list);
+ spin_unlock_bh(&muxl->receive_lock);
+
+ return 0;
+}
+
+struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *dn;
+ int idx = phyid % DN_CACHE_SIZE;
+
+ spin_lock_bh(&muxl->transmit_lock);
+ RCU_INIT_POINTER(muxl->dn_cache[idx], NULL);
+ dn = get_from_id(&muxl->frml_list, phyid);
+ if (dn == NULL)
+ goto out;
+
+ list_del_rcu(&dn->node);
+ caif_assert(dn != NULL);
+out:
+ spin_unlock_bh(&muxl->transmit_lock);
+ return dn;
+}
+
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id)
+{
+ struct cflayer *up;
+ int idx = id % UP_CACHE_SIZE;
+ up = rcu_dereference(muxl->up_cache[idx]);
+ if (up == NULL || up->id != id) {
+ spin_lock_bh(&muxl->receive_lock);
+ up = get_from_id(&muxl->srvl_list, id);
+ rcu_assign_pointer(muxl->up_cache[idx], up);
+ spin_unlock_bh(&muxl->receive_lock);
+ }
+ return up;
+}
+
+static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info)
+{
+ struct cflayer *dn;
+ int idx = dev_info->id % DN_CACHE_SIZE;
+ dn = rcu_dereference(muxl->dn_cache[idx]);
+ if (dn == NULL || dn->id != dev_info->id) {
+ spin_lock_bh(&muxl->transmit_lock);
+ dn = get_from_id(&muxl->frml_list, dev_info->id);
+ rcu_assign_pointer(muxl->dn_cache[idx], dn);
+ spin_unlock_bh(&muxl->transmit_lock);
+ }
+ return dn;
+}
+
+struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id)
+{
+ struct cflayer *up;
+ struct cfmuxl *muxl = container_obj(layr);
+ int idx = id % UP_CACHE_SIZE;
+
+ if (id == 0) {
+ pr_warn("Trying to remove control layer\n");
+ return NULL;
+ }
+
+ spin_lock_bh(&muxl->receive_lock);
+ up = get_from_id(&muxl->srvl_list, id);
+ if (up == NULL)
+ goto out;
+
+ RCU_INIT_POINTER(muxl->up_cache[idx], NULL);
+ list_del_rcu(&up->node);
+out:
+ spin_unlock_bh(&muxl->receive_lock);
+ return up;
+}
+
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ int ret;
+ struct cfmuxl *muxl = container_obj(layr);
+ u8 id;
+ struct cflayer *up;
+ if (cfpkt_extr_head(pkt, &id, 1) < 0) {
+ pr_err("erroneous Caif Packet\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ rcu_read_lock();
+ up = get_up(muxl, id);
+
+ if (up == NULL) {
+ pr_debug("Received data on unknown link ID = %d (0x%x)"
+ " up == NULL", id, id);
+ cfpkt_destroy(pkt);
+ /*
+ * Don't return ERROR, since modem misbehaves and sends out
+ * flow on before linksetup response.
+ */
+
+ rcu_read_unlock();
+ return /* CFGLU_EPROT; */ 0;
+ }
+
+ /* We can't hold rcu_lock during receive, so take a ref count instead */
+ cfsrvl_get(up);
+ rcu_read_unlock();
+
+ ret = up->receive(up, pkt);
+
+ cfsrvl_put(up);
+ return ret;
+}
+
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ int err;
+ u8 linkid;
+ struct cflayer *dn;
+ struct caif_payload_info *info = cfpkt_info(pkt);
+ BUG_ON(!info);
+
+ rcu_read_lock();
+
+ dn = get_dn(muxl, info->dev_info);
+ if (dn == NULL) {
+ pr_debug("Send data on unknown phy ID = %d (0x%x)\n",
+ info->dev_info->id, info->dev_info->id);
+ rcu_read_unlock();
+ cfpkt_destroy(pkt);
+ return -ENOTCONN;
+ }
+
+ info->hdr_len += 1;
+ linkid = info->channel_id;
+ cfpkt_add_head(pkt, &linkid, 1);
+
+ /* We can't hold rcu_lock during receive, so take a ref count instead */
+ cffrml_hold(dn);
+
+ rcu_read_unlock();
+
+ err = dn->transmit(dn, pkt);
+
+ cffrml_put(dn);
+ return err;
+}
+
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfmuxl *muxl = container_obj(layr);
+ struct cflayer *layer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(layer, &muxl->srvl_list, node) {
+
+ if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) {
+
+ if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND ||
+ ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) &&
+ layer->id != 0)
+ cfmuxl_remove_uplayer(layr, layer->id);
+
+ /* NOTE: ctrlcmd is not allowed to block */
+ layer->ctrlcmd(layer, ctrl, phyid);
+ }
+ }
+ rcu_read_unlock();
+}
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
new file mode 100644
index 000000000..f6c3b2137
--- /dev/null
+++ b/net/caif/cfpkt_skbuff.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <net/caif/cfpkt.h>
+
+#define PKT_PREFIX 48
+#define PKT_POSTFIX 2
+#define PKT_LEN_WHEN_EXTENDING 128
+#define PKT_ERROR(pkt, errmsg) \
+do { \
+ cfpkt_priv(pkt)->erronous = true; \
+ skb_reset_tail_pointer(&pkt->skb); \
+ pr_warn(errmsg); \
+} while (0)
+
+struct cfpktq {
+ struct sk_buff_head head;
+ atomic_t count;
+ /* Lock protects count updates */
+ spinlock_t lock;
+};
+
+/*
+ * net/caif/ is generic and does not
+ * understand SKB, so we do this typecast
+ */
+struct cfpkt {
+ struct sk_buff skb;
+};
+
+/* Private data inside SKB */
+struct cfpkt_priv_data {
+ struct dev_info dev_info;
+ bool erronous;
+};
+
+static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt)
+{
+ return (struct cfpkt_priv_data *) pkt->skb.cb;
+}
+
+static inline bool is_erronous(struct cfpkt *pkt)
+{
+ return cfpkt_priv(pkt)->erronous;
+}
+
+static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt)
+{
+ return &pkt->skb;
+}
+
+static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb)
+{
+ return (struct cfpkt *) skb;
+}
+
+struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt)
+{
+ struct cfpkt *pkt = skb_to_pkt(nativepkt);
+ cfpkt_priv(pkt)->erronous = false;
+ return pkt;
+}
+EXPORT_SYMBOL(cfpkt_fromnative);
+
+void *cfpkt_tonative(struct cfpkt *pkt)
+{
+ return (void *) pkt;
+}
+EXPORT_SYMBOL(cfpkt_tonative);
+
+static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx)
+{
+ struct sk_buff *skb;
+
+ if (likely(in_interrupt()))
+ skb = alloc_skb(len + pfx, GFP_ATOMIC);
+ else
+ skb = alloc_skb(len + pfx, GFP_KERNEL);
+
+ if (unlikely(skb == NULL))
+ return NULL;
+
+ skb_reserve(skb, pfx);
+ return skb_to_pkt(skb);
+}
+
+inline struct cfpkt *cfpkt_create(u16 len)
+{
+ return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX);
+}
+
+void cfpkt_destroy(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ kfree_skb(skb);
+}
+
+inline bool cfpkt_more(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ return skb->len > 0;
+}
+
+int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ if (skb_headlen(skb) >= len) {
+ memcpy(data, skb->data, len);
+ return 0;
+ }
+ return !cfpkt_extr_head(pkt, data, len) &&
+ !cfpkt_add_head(pkt, data, len);
+}
+
+int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ u8 *from;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (unlikely(len > skb->len)) {
+ PKT_ERROR(pkt, "read beyond end of packet\n");
+ return -EPROTO;
+ }
+
+ if (unlikely(len > skb_headlen(skb))) {
+ if (unlikely(skb_linearize(skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ }
+ from = skb_pull(skb, len);
+ from -= len;
+ if (data)
+ memcpy(data, from, len);
+ return 0;
+}
+EXPORT_SYMBOL(cfpkt_extr_head);
+
+int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ u8 *data = dta;
+ u8 *from;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (unlikely(skb_linearize(skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
+ PKT_ERROR(pkt, "read beyond end of packet\n");
+ return -EPROTO;
+ }
+ from = skb_tail_pointer(skb) - len;
+ skb_trim(skb, skb->len - len);
+ memcpy(data, from, len);
+ return 0;
+}
+
+int cfpkt_pad_trail(struct cfpkt *pkt, u16 len)
+{
+ return cfpkt_add_body(pkt, NULL, len);
+}
+
+int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct sk_buff *lastskb;
+ u8 *to;
+ u16 addlen = 0;
+
+
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ lastskb = skb;
+
+ /* Check whether we need to add space at the tail */
+ if (unlikely(skb_tailroom(skb) < len)) {
+ if (likely(len < PKT_LEN_WHEN_EXTENDING))
+ addlen = PKT_LEN_WHEN_EXTENDING;
+ else
+ addlen = len;
+ }
+
+ /* Check whether we need to change the SKB before writing to the tail */
+ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) {
+
+ /* Make sure data is writable */
+ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
+ PKT_ERROR(pkt, "cow failed\n");
+ return -EPROTO;
+ }
+ }
+
+ /* All set to put the last SKB and optionally write data there. */
+ to = pskb_put(skb, lastskb, len);
+ if (likely(data))
+ memcpy(to, data, len);
+ return 0;
+}
+
+inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data)
+{
+ return cfpkt_add_body(pkt, &data, 1);
+}
+
+int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct sk_buff *lastskb;
+ u8 *to;
+ const u8 *data = data2;
+ int ret;
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+ if (unlikely(skb_headroom(skb) < len)) {
+ PKT_ERROR(pkt, "no headroom\n");
+ return -EPROTO;
+ }
+
+ /* Make sure data is writable */
+ ret = skb_cow_data(skb, 0, &lastskb);
+ if (unlikely(ret < 0)) {
+ PKT_ERROR(pkt, "cow failed\n");
+ return ret;
+ }
+
+ to = skb_push(skb, len);
+ memcpy(to, data, len);
+ return 0;
+}
+EXPORT_SYMBOL(cfpkt_add_head);
+
+inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len)
+{
+ return cfpkt_add_body(pkt, data, len);
+}
+
+inline u16 cfpkt_getlen(struct cfpkt *pkt)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ return skb->len;
+}
+
+int cfpkt_iterate(struct cfpkt *pkt,
+ u16 (*iter_func)(u16, void *, u16),
+ u16 data)
+{
+ /*
+ * Don't care about the performance hit of linearizing,
+ * Checksum should not be used on high-speed interfaces anyway.
+ */
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+ if (unlikely(skb_linearize(&pkt->skb) != 0)) {
+ PKT_ERROR(pkt, "linearize failed\n");
+ return -EPROTO;
+ }
+ return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
+}
+
+int cfpkt_setlen(struct cfpkt *pkt, u16 len)
+{
+ struct sk_buff *skb = pkt_to_skb(pkt);
+
+
+ if (unlikely(is_erronous(pkt)))
+ return -EPROTO;
+
+ if (likely(len <= skb->len)) {
+ if (unlikely(skb->data_len))
+ ___pskb_trim(skb, len);
+ else
+ skb_trim(skb, len);
+
+ return cfpkt_getlen(pkt);
+ }
+
+ /* Need to expand SKB */
+ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
+ PKT_ERROR(pkt, "skb_pad_trail failed\n");
+
+ return cfpkt_getlen(pkt);
+}
+
+struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
+ struct cfpkt *addpkt,
+ u16 expectlen)
+{
+ struct sk_buff *dst = pkt_to_skb(dstpkt);
+ struct sk_buff *add = pkt_to_skb(addpkt);
+ u16 addlen = skb_headlen(add);
+ u16 neededtailspace;
+ struct sk_buff *tmp;
+ u16 dstlen;
+ u16 createlen;
+ if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) {
+ return dstpkt;
+ }
+ if (expectlen > addlen)
+ neededtailspace = expectlen;
+ else
+ neededtailspace = addlen;
+
+ if (dst->tail + neededtailspace > dst->end) {
+ /* Create a dumplicate of 'dst' with more tail space */
+ struct cfpkt *tmppkt;
+ dstlen = skb_headlen(dst);
+ createlen = dstlen + neededtailspace;
+ tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX);
+ if (tmppkt == NULL)
+ return NULL;
+ tmp = pkt_to_skb(tmppkt);
+ skb_set_tail_pointer(tmp, dstlen);
+ tmp->len = dstlen;
+ memcpy(tmp->data, dst->data, dstlen);
+ cfpkt_destroy(dstpkt);
+ dst = tmp;
+ }
+ memcpy(skb_tail_pointer(dst), add->data, skb_headlen(add));
+ cfpkt_destroy(addpkt);
+ dst->tail += addlen;
+ dst->len += addlen;
+ return skb_to_pkt(dst);
+}
+
+struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
+{
+ struct sk_buff *skb2;
+ struct sk_buff *skb = pkt_to_skb(pkt);
+ struct cfpkt *tmppkt;
+ u8 *split = skb->data + pos;
+ u16 len2nd = skb_tail_pointer(skb) - split;
+
+ if (unlikely(is_erronous(pkt)))
+ return NULL;
+
+ if (skb->data + pos > skb_tail_pointer(skb)) {
+ PKT_ERROR(pkt, "trying to split beyond end of packet\n");
+ return NULL;
+ }
+
+ /* Create a new packet for the second part of the data */
+ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX,
+ PKT_PREFIX);
+ if (tmppkt == NULL)
+ return NULL;
+ skb2 = pkt_to_skb(tmppkt);
+
+
+ if (skb2 == NULL)
+ return NULL;
+
+ /* Reduce the length of the original packet */
+ skb_set_tail_pointer(skb, pos);
+ skb->len = pos;
+
+ memcpy(skb2->data, split, len2nd);
+ skb2->tail += len2nd;
+ skb2->len += len2nd;
+ skb2->priority = skb->priority;
+ return skb_to_pkt(skb2);
+}
+
+bool cfpkt_erroneous(struct cfpkt *pkt)
+{
+ return cfpkt_priv(pkt)->erronous;
+}
+
+struct caif_payload_info *cfpkt_info(struct cfpkt *pkt)
+{
+ return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb;
+}
+EXPORT_SYMBOL(cfpkt_info);
+
+void cfpkt_set_prio(struct cfpkt *pkt, int prio)
+{
+ pkt_to_skb(pkt)->priority = prio;
+}
+EXPORT_SYMBOL(cfpkt_set_prio);
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
new file mode 100644
index 000000000..61d7617d9
--- /dev/null
+++ b/net/caif/cfrfml.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) container_of(layr, struct cfrfml, serv.layer)
+#define RFM_SEGMENTATION_BIT 0x01
+#define RFM_HEAD_SIZE 7
+
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cfrfml {
+ struct cfsrvl serv;
+ struct cfpkt *incomplete_frm;
+ int fragment_size;
+ u8 seghead[6];
+ u16 pdu_size;
+ /* Protects serialized processing of packets */
+ spinlock_t sync;
+};
+
+static void cfrfml_release(struct cflayer *layer)
+{
+ struct cfsrvl *srvl = container_of(layer, struct cfsrvl, layer);
+ struct cfrfml *rfml = container_obj(&srvl->layer);
+
+ if (rfml->incomplete_frm)
+ cfpkt_destroy(rfml->incomplete_frm);
+
+ kfree(srvl);
+}
+
+struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
+ int mtu_size)
+{
+ int tmp;
+ struct cfrfml *this = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
+
+ if (!this)
+ return NULL;
+
+ cfsrvl_init(&this->serv, channel_id, dev_info, false);
+ this->serv.release = cfrfml_release;
+ this->serv.layer.receive = cfrfml_receive;
+ this->serv.layer.transmit = cfrfml_transmit;
+
+ /* Round down to closest multiple of 16 */
+ tmp = (mtu_size - RFM_HEAD_SIZE - 6) / 16;
+ tmp *= 16;
+
+ this->fragment_size = tmp;
+ spin_lock_init(&this->sync);
+ snprintf(this->serv.layer.name, CAIF_LAYER_NAME_SZ,
+ "rfm%d", channel_id);
+
+ return &this->serv.layer;
+}
+
+static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead,
+ struct cfpkt *pkt, int *err)
+{
+ struct cfpkt *tmppkt;
+ *err = -EPROTO;
+ /* n-th but not last segment */
+
+ if (cfpkt_extr_head(pkt, seghead, 6) < 0)
+ return NULL;
+
+ /* Verify correct header */
+ if (memcmp(seghead, rfml->seghead, 6) != 0)
+ return NULL;
+
+ tmppkt = cfpkt_append(rfml->incomplete_frm, pkt,
+ rfml->pdu_size + RFM_HEAD_SIZE);
+
+ /* If cfpkt_append failes input pkts are not freed */
+ *err = -ENOMEM;
+ if (tmppkt == NULL)
+ return NULL;
+
+ *err = 0;
+ return tmppkt;
+}
+
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 tmp;
+ bool segmented;
+ int err;
+ u8 seghead[6];
+ struct cfrfml *rfml;
+ struct cfpkt *tmppkt = NULL;
+
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ rfml = container_obj(layr);
+ spin_lock(&rfml->sync);
+
+ err = -EPROTO;
+ if (cfpkt_extr_head(pkt, &tmp, 1) < 0)
+ goto out;
+ segmented = tmp & RFM_SEGMENTATION_BIT;
+
+ if (segmented) {
+ if (rfml->incomplete_frm == NULL) {
+ /* Initial Segment */
+ if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0)
+ goto out;
+
+ rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
+
+ if (cfpkt_erroneous(pkt))
+ goto out;
+ rfml->incomplete_frm = pkt;
+ pkt = NULL;
+ } else {
+
+ tmppkt = rfm_append(rfml, seghead, pkt, &err);
+ if (tmppkt == NULL)
+ goto out;
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+
+ rfml->incomplete_frm = tmppkt;
+
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+ }
+ err = 0;
+ goto out;
+ }
+
+ if (rfml->incomplete_frm) {
+
+ /* Last Segment */
+ tmppkt = rfm_append(rfml, seghead, pkt, &err);
+ if (tmppkt == NULL)
+ goto out;
+
+ if (cfpkt_erroneous(tmppkt))
+ goto out;
+
+ rfml->incomplete_frm = NULL;
+ pkt = tmppkt;
+ tmppkt = NULL;
+
+ /* Verify that length is correct */
+ err = EPROTO;
+ if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
+ goto out;
+ }
+
+ err = rfml->serv.layer.up->receive(rfml->serv.layer.up, pkt);
+
+out:
+
+ if (err != 0) {
+ if (tmppkt)
+ cfpkt_destroy(tmppkt);
+ if (pkt)
+ cfpkt_destroy(pkt);
+ if (rfml->incomplete_frm)
+ cfpkt_destroy(rfml->incomplete_frm);
+ rfml->incomplete_frm = NULL;
+
+ pr_info("Connection error %d triggered on RFM link\n", err);
+
+ /* Trigger connection error upon failure.*/
+ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+ rfml->serv.dev_info.id);
+ }
+ spin_unlock(&rfml->sync);
+
+ if (unlikely(err == -EAGAIN))
+ /* It is not possible to recover after drop of a fragment */
+ err = -EIO;
+
+ return err;
+}
+
+
+static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt)
+{
+ caif_assert(cfpkt_getlen(pkt) < rfml->fragment_size + RFM_HEAD_SIZE);
+
+ /* Add info for MUX-layer to route the packet out. */
+ cfpkt_info(pkt)->channel_id = rfml->serv.layer.id;
+
+ /*
+ * To optimize alignment, we add up the size of CAIF header before
+ * payload.
+ */
+ cfpkt_info(pkt)->hdr_len = RFM_HEAD_SIZE;
+ cfpkt_info(pkt)->dev_info = &rfml->serv.dev_info;
+
+ return rfml->serv.layer.dn->transmit(rfml->serv.layer.dn, pkt);
+}
+
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ int err;
+ u8 seg;
+ u8 head[6];
+ struct cfpkt *rearpkt = NULL;
+ struct cfpkt *frontpkt = pkt;
+ struct cfrfml *rfml = container_obj(layr);
+
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!cfsrvl_ready(&rfml->serv, &err))
+ goto out;
+
+ err = -EPROTO;
+ if (cfpkt_getlen(pkt) <= RFM_HEAD_SIZE-1)
+ goto out;
+
+ err = 0;
+ if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
+ err = cfpkt_peek_head(pkt, head, 6);
+
+ if (err < 0)
+ goto out;
+
+ while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
+
+ seg = 1;
+ err = -EPROTO;
+
+ if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+ goto out;
+ /*
+ * On OOM error cfpkt_split returns NULL.
+ *
+ * NOTE: Segmented pdu is not correctly aligned.
+ * This has negative performance impact.
+ */
+
+ rearpkt = cfpkt_split(frontpkt, rfml->fragment_size);
+ if (rearpkt == NULL)
+ goto out;
+
+ err = cfrfml_transmit_segment(rfml, frontpkt);
+
+ if (err != 0) {
+ frontpkt = NULL;
+ goto out;
+ }
+
+ frontpkt = rearpkt;
+ rearpkt = NULL;
+
+ err = -ENOMEM;
+ if (frontpkt == NULL)
+ goto out;
+ err = -EPROTO;
+ if (cfpkt_add_head(frontpkt, head, 6) < 0)
+ goto out;
+
+ }
+
+ seg = 0;
+ err = -EPROTO;
+
+ if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+ goto out;
+
+ err = cfrfml_transmit_segment(rfml, frontpkt);
+
+ frontpkt = NULL;
+out:
+
+ if (err != 0) {
+ pr_info("Connection error %d triggered on RFM link\n", err);
+ /* Trigger connection error upon failure.*/
+
+ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+ rfml->serv.dev_info.id);
+
+ if (rearpkt)
+ cfpkt_destroy(rearpkt);
+
+ if (frontpkt)
+ cfpkt_destroy(frontpkt);
+ }
+
+ return err;
+}
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
new file mode 100644
index 000000000..ce60f06d7
--- /dev/null
+++ b/net/caif/cfserl.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfserl.h>
+
+#define container_obj(layr) ((struct cfserl *) layr)
+
+#define CFSERL_STX 0x02
+#define SERIAL_MINIUM_PACKET_SIZE 4
+#define SERIAL_MAX_FRAMESIZE 4096
+struct cfserl {
+ struct cflayer layer;
+ struct cfpkt *incomplete_frm;
+ /* Protects parallel processing of incoming packets */
+ spinlock_t sync;
+ bool usestx;
+};
+
+static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid);
+
+struct cflayer *cfserl_create(int instance, bool use_stx)
+{
+ struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC);
+ if (!this)
+ return NULL;
+ caif_assert(offsetof(struct cfserl, layer) == 0);
+ this->layer.receive = cfserl_receive;
+ this->layer.transmit = cfserl_transmit;
+ this->layer.ctrlcmd = cfserl_ctrlcmd;
+ this->usestx = use_stx;
+ spin_lock_init(&this->sync);
+ snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1");
+ return &this->layer;
+}
+
+static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
+{
+ struct cfserl *layr = container_obj(l);
+ u16 pkt_len;
+ struct cfpkt *pkt = NULL;
+ struct cfpkt *tail_pkt = NULL;
+ u8 tmp8;
+ u16 tmp;
+ u8 stx = CFSERL_STX;
+ int ret;
+ u16 expectlen = 0;
+
+ caif_assert(newpkt != NULL);
+ spin_lock(&layr->sync);
+
+ if (layr->incomplete_frm != NULL) {
+ layr->incomplete_frm =
+ cfpkt_append(layr->incomplete_frm, newpkt, expectlen);
+ pkt = layr->incomplete_frm;
+ if (pkt == NULL) {
+ spin_unlock(&layr->sync);
+ return -ENOMEM;
+ }
+ } else {
+ pkt = newpkt;
+ }
+ layr->incomplete_frm = NULL;
+
+ do {
+ /* Search for STX at start of pkt if STX is used */
+ if (layr->usestx) {
+ cfpkt_extr_head(pkt, &tmp8, 1);
+ if (tmp8 != CFSERL_STX) {
+ while (cfpkt_more(pkt)
+ && tmp8 != CFSERL_STX) {
+ cfpkt_extr_head(pkt, &tmp8, 1);
+ }
+ if (!cfpkt_more(pkt)) {
+ cfpkt_destroy(pkt);
+ layr->incomplete_frm = NULL;
+ spin_unlock(&layr->sync);
+ return -EPROTO;
+ }
+ }
+ }
+
+ pkt_len = cfpkt_getlen(pkt);
+
+ /*
+ * pkt_len is the accumulated length of the packet data
+ * we have received so far.
+ * Exit if frame doesn't hold length.
+ */
+
+ if (pkt_len < 2) {
+ if (layr->usestx)
+ cfpkt_add_head(pkt, &stx, 1);
+ layr->incomplete_frm = pkt;
+ spin_unlock(&layr->sync);
+ return 0;
+ }
+
+ /*
+ * Find length of frame.
+ * expectlen is the length we need for a full frame.
+ */
+ cfpkt_peek_head(pkt, &tmp, 2);
+ expectlen = le16_to_cpu(tmp) + 2;
+ /*
+ * Frame error handling
+ */
+ if (expectlen < SERIAL_MINIUM_PACKET_SIZE
+ || expectlen > SERIAL_MAX_FRAMESIZE) {
+ if (!layr->usestx) {
+ if (pkt != NULL)
+ cfpkt_destroy(pkt);
+ layr->incomplete_frm = NULL;
+ expectlen = 0;
+ spin_unlock(&layr->sync);
+ return -EPROTO;
+ }
+ continue;
+ }
+
+ if (pkt_len < expectlen) {
+ /* Too little received data */
+ if (layr->usestx)
+ cfpkt_add_head(pkt, &stx, 1);
+ layr->incomplete_frm = pkt;
+ spin_unlock(&layr->sync);
+ return 0;
+ }
+
+ /*
+ * Enough data for at least one frame.
+ * Split the frame, if too long
+ */
+ if (pkt_len > expectlen)
+ tail_pkt = cfpkt_split(pkt, expectlen);
+ else
+ tail_pkt = NULL;
+
+ /* Send the first part of packet upwards.*/
+ spin_unlock(&layr->sync);
+ ret = layr->layer.up->receive(layr->layer.up, pkt);
+ spin_lock(&layr->sync);
+ if (ret == -EILSEQ) {
+ if (layr->usestx) {
+ if (tail_pkt != NULL)
+ pkt = cfpkt_append(pkt, tail_pkt, 0);
+ /* Start search for next STX if frame failed */
+ continue;
+ } else {
+ cfpkt_destroy(pkt);
+ pkt = NULL;
+ }
+ }
+
+ pkt = tail_pkt;
+
+ } while (pkt != NULL);
+
+ spin_unlock(&layr->sync);
+ return 0;
+}
+
+static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt)
+{
+ struct cfserl *layr = container_obj(layer);
+ u8 tmp8 = CFSERL_STX;
+ if (layr->usestx)
+ cfpkt_add_head(newpkt, &tmp8, 1);
+ return layer->dn->transmit(layer->dn, newpkt);
+}
+
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+}
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
new file mode 100644
index 000000000..a6e115463
--- /dev/null
+++ b/net/caif/cfsrvl.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/pkt_sched.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/caif_dev.h>
+
+#define SRVL_CTRL_PKT_SIZE 1
+#define SRVL_FLOW_OFF 0x81
+#define SRVL_FLOW_ON 0x80
+#define SRVL_SET_PIN 0x82
+#define SRVL_CTRL_PKT_SIZE 1
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+ int phyid)
+{
+ struct cfsrvl *service = container_obj(layr);
+
+ if (layr->up == NULL || layr->up->ctrlcmd == NULL)
+ return;
+
+ switch (ctrl) {
+ case CAIF_CTRLCMD_INIT_RSP:
+ service->open = true;
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ service->open = false;
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+ if (phyid != service->dev_info.id)
+ break;
+ if (service->modem_flow_on)
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_OFF_IND, phyid);
+ service->phy_flow_on = false;
+ break;
+ case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND:
+ if (phyid != service->dev_info.id)
+ return;
+ if (service->modem_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_ON_IND,
+ phyid);
+ }
+ service->phy_flow_on = true;
+ break;
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ if (service->phy_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_OFF_IND, phyid);
+ }
+ service->modem_flow_on = false;
+ break;
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ if (service->phy_flow_on) {
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_FLOW_ON_IND, phyid);
+ }
+ service->modem_flow_on = true;
+ break;
+ case _CAIF_CTRLCMD_PHYIF_DOWN_IND:
+ /* In case interface is down, let's fake a remove shutdown */
+ layr->up->ctrlcmd(layr->up,
+ CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid);
+ break;
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ break;
+ default:
+ pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
+ /* We have both modem and phy flow on, send flow on */
+ layr->up->ctrlcmd(layr->up, ctrl, phyid);
+ service->phy_flow_on = true;
+ break;
+ }
+}
+
+static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
+{
+ struct cfsrvl *service = container_obj(layr);
+
+ caif_assert(layr != NULL);
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!service->supports_flowctrl)
+ return 0;
+
+ switch (ctrl) {
+ case CAIF_MODEMCMD_FLOW_ON_REQ:
+ {
+ struct cfpkt *pkt;
+ struct caif_payload_info *info;
+ u8 flow_on = SRVL_FLOW_ON;
+ pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+ if (!pkt)
+ return -ENOMEM;
+
+ if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ return layr->dn->transmit(layr->dn, pkt);
+ }
+ case CAIF_MODEMCMD_FLOW_OFF_REQ:
+ {
+ struct cfpkt *pkt;
+ struct caif_payload_info *info;
+ u8 flow_off = SRVL_FLOW_OFF;
+ pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+ if (!pkt)
+ return -ENOMEM;
+
+ if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ cfpkt_set_prio(pkt, TC_PRIO_CONTROL);
+ return layr->dn->transmit(layr->dn, pkt);
+ }
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+static void cfsrvl_release(struct cflayer *layer)
+{
+ struct cfsrvl *service = container_of(layer, struct cfsrvl, layer);
+ kfree(service);
+}
+
+void cfsrvl_init(struct cfsrvl *service,
+ u8 channel_id,
+ struct dev_info *dev_info,
+ bool supports_flowctrl)
+{
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ service->open = false;
+ service->modem_flow_on = true;
+ service->phy_flow_on = true;
+ service->layer.id = channel_id;
+ service->layer.ctrlcmd = cfservl_ctrlcmd;
+ service->layer.modemcmd = cfservl_modemcmd;
+ service->dev_info = *dev_info;
+ service->supports_flowctrl = supports_flowctrl;
+ service->release = cfsrvl_release;
+}
+
+bool cfsrvl_ready(struct cfsrvl *service, int *err)
+{
+ if (!service->open) {
+ *err = -ENOTCONN;
+ return false;
+ }
+ return true;
+}
+
+u8 cfsrvl_getphyid(struct cflayer *layer)
+{
+ struct cfsrvl *servl = container_obj(layer);
+ return servl->dev_info.id;
+}
+
+bool cfsrvl_phyid_match(struct cflayer *layer, int phyid)
+{
+ struct cfsrvl *servl = container_obj(layer);
+ return servl->dev_info.id == phyid;
+}
+
+void caif_free_client(struct cflayer *adap_layer)
+{
+ struct cfsrvl *servl;
+ if (adap_layer == NULL || adap_layer->dn == NULL)
+ return;
+ servl = container_obj(adap_layer->dn);
+ servl->release(&servl->layer);
+}
+EXPORT_SYMBOL(caif_free_client);
+
+void caif_client_register_refcnt(struct cflayer *adapt_layer,
+ void (*hold)(struct cflayer *lyr),
+ void (*put)(struct cflayer *lyr))
+{
+ struct cfsrvl *service;
+
+ if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL))
+ return;
+ service = container_of(adapt_layer->dn, struct cfsrvl, layer);
+ service->hold = hold;
+ service->put = put;
+}
+EXPORT_SYMBOL(caif_client_register_refcnt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
new file mode 100644
index 000000000..1728fa447
--- /dev/null
+++ b/net/caif/cfutill.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+#define UTIL_PAYLOAD 0x00
+#define UTIL_CMD_BIT 0x80
+#define UTIL_REMOTE_SHUTDOWN 0x82
+#define UTIL_FLOW_OFF 0x81
+#define UTIL_FLOW_ON 0x80
+
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *util = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!util)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(util, channel_id, dev_info, true);
+ util->layer.receive = cfutill_receive;
+ util->layer.transmit = cfutill_transmit;
+ snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1");
+ return &util->layer;
+}
+
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd = -1;
+ struct cfsrvl *service = container_obj(layr);
+ caif_assert(layr != NULL);
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->up->receive != NULL);
+ caif_assert(layr->up->ctrlcmd != NULL);
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+
+ switch (cmd) {
+ case UTIL_PAYLOAD:
+ return layr->up->receive(layr->up, pkt);
+ case UTIL_FLOW_OFF:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case UTIL_FLOW_ON:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */
+ pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
+ service->open = false;
+ cfpkt_destroy(pkt);
+ return 0;
+ default:
+ cfpkt_destroy(pkt);
+ pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
+ return -EPROTO;
+ }
+}
+
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 zero = 0;
+ struct caif_payload_info *info;
+ int ret;
+ struct cfsrvl *service = container_obj(layr);
+ caif_assert(layr != NULL);
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ cfpkt_add_head(pkt, &zero, 1);
+ /* Add info for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ /*
+ * To optimize alignment, we add up the size of CAIF header before
+ * payload.
+ */
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
new file mode 100644
index 000000000..262224581
--- /dev/null
+++ b/net/caif/cfveil.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define VEI_PAYLOAD 0x00
+#define VEI_CMD_BIT 0x80
+#define VEI_FLOW_OFF 0x81
+#define VEI_FLOW_ON 0x80
+#define VEI_SET_PIN 0x82
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *vei = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!vei)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+ cfsrvl_init(vei, channel_id, dev_info, true);
+ vei->layer.receive = cfvei_receive;
+ vei->layer.transmit = cfvei_transmit;
+ snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id);
+ return &vei->layer;
+}
+
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 cmd;
+ int ret;
+ caif_assert(layr->up != NULL);
+ caif_assert(layr->receive != NULL);
+ caif_assert(layr->ctrlcmd != NULL);
+
+
+ if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ switch (cmd) {
+ case VEI_PAYLOAD:
+ ret = layr->up->receive(layr->up, pkt);
+ return ret;
+ case VEI_FLOW_OFF:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case VEI_FLOW_ON:
+ layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+ cfpkt_destroy(pkt);
+ return 0;
+ case VEI_SET_PIN: /* SET RS232 PIN */
+ cfpkt_destroy(pkt);
+ return 0;
+ default: /* SET RS232 PIN */
+ pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+}
+
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u8 tmp = 0;
+ struct caif_payload_info *info;
+ int ret;
+ struct cfsrvl *service = container_obj(layr);
+ if (!cfsrvl_ready(service, &ret))
+ goto err;
+ caif_assert(layr->dn != NULL);
+ caif_assert(layr->dn->transmit != NULL);
+
+ if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
+ pr_err("Packet is erroneous!\n");
+ ret = -EPROTO;
+ goto err;
+ }
+
+ /* Add info-> for MUX-layer to route the packet out. */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->hdr_len = 1;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+err:
+ cfpkt_destroy(pkt);
+ return ret;
+}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
new file mode 100644
index 000000000..b3b110e8a
--- /dev/null
+++ b/net/caif/cfvidl.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author: Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
+{
+ struct cfsrvl *vid = kzalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
+ if (!vid)
+ return NULL;
+ caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+ cfsrvl_init(vid, channel_id, dev_info, false);
+ vid->layer.receive = cfvidl_receive;
+ vid->layer.transmit = cfvidl_transmit;
+ snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1");
+ return &vid->layer;
+}
+
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+ u32 videoheader;
+ if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
+ pr_err("Packet is erroneous!\n");
+ cfpkt_destroy(pkt);
+ return -EPROTO;
+ }
+ return layr->up->receive(layr->up, pkt);
+}
+
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct cfsrvl *service = container_obj(layr);
+ struct caif_payload_info *info;
+ u32 videoheader = 0;
+ int ret;
+
+ if (!cfsrvl_ready(service, &ret)) {
+ cfpkt_destroy(pkt);
+ return ret;
+ }
+
+ cfpkt_add_head(pkt, &videoheader, 4);
+ /* Add info for MUX-layer to route the packet out */
+ info = cfpkt_info(pkt);
+ info->channel_id = service->layer.id;
+ info->dev_info = &service->dev_info;
+ return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
new file mode 100644
index 000000000..67a4a36fe
--- /dev/null
+++ b/net/caif/chnl_net.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Authors: Sjur Brendeland
+ * Daniel Martensson
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/fs.h>
+#include <linux/hardirq.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/sched.h>
+#include <linux/sockios.h>
+#include <linux/caif/if_caif.h>
+#include <net/rtnetlink.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/caif_dev.h>
+
+/* GPRS PDP connection has MTU to 1500 */
+#define GPRS_PDP_MTU 1500
+/* 5 sec. connect timeout */
+#define CONNECT_TIMEOUT (5 * HZ)
+#define CAIF_NET_DEFAULT_QUEUE_LEN 500
+#define UNDEF_CONNID 0xffffffff
+
+/*This list is protected by the rtnl lock. */
+static LIST_HEAD(chnl_net_list);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("caif");
+
+enum caif_states {
+ CAIF_CONNECTED = 1,
+ CAIF_CONNECTING,
+ CAIF_DISCONNECTED,
+ CAIF_SHUTDOWN
+};
+
+struct chnl_net {
+ struct cflayer chnl;
+ struct net_device_stats stats;
+ struct caif_connect_request conn_req;
+ struct list_head list_field;
+ struct net_device *netdev;
+ char name[256];
+ wait_queue_head_t netmgmt_wq;
+ /* Flow status to remember and control the transmission. */
+ bool flowenabled;
+ enum caif_states state;
+};
+
+static void robust_list_del(struct list_head *delete_node)
+{
+ struct list_head *list_node;
+ struct list_head *n;
+ ASSERT_RTNL();
+ list_for_each_safe(list_node, n, &chnl_net_list) {
+ if (list_node == delete_node) {
+ list_del(list_node);
+ return;
+ }
+ }
+ WARN_ON(1);
+}
+
+static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+ struct sk_buff *skb;
+ struct chnl_net *priv;
+ int pktlen;
+ const u8 *ip_version;
+ u8 buf;
+
+ priv = container_of(layr, struct chnl_net, chnl);
+ if (!priv)
+ return -EINVAL;
+
+ skb = (struct sk_buff *) cfpkt_tonative(pkt);
+
+ /* Get length of CAIF packet. */
+ pktlen = skb->len;
+
+ /* Pass some minimum information and
+ * send the packet to the net stack.
+ */
+ skb->dev = priv->netdev;
+
+ /* check the version of IP */
+ ip_version = skb_header_pointer(skb, 0, 1, &buf);
+ if (!ip_version) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ switch (*ip_version >> 4) {
+ case 4:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case 6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ default:
+ kfree_skb(skb);
+ priv->netdev->stats.rx_errors++;
+ return -EINVAL;
+ }
+
+ /* If we change the header in loop mode, the checksum is corrupted. */
+ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else
+ skb->ip_summed = CHECKSUM_NONE;
+
+ if (in_interrupt())
+ netif_rx(skb);
+ else
+ netif_rx_ni(skb);
+
+ /* Update statistics. */
+ priv->netdev->stats.rx_packets++;
+ priv->netdev->stats.rx_bytes += pktlen;
+
+ return 0;
+}
+
+static int delete_device(struct chnl_net *dev)
+{
+ ASSERT_RTNL();
+ if (dev->netdev)
+ unregister_netdevice(dev->netdev);
+ return 0;
+}
+
+static void close_work(struct work_struct *work)
+{
+ struct chnl_net *dev = NULL;
+ struct list_head *list_node;
+ struct list_head *_tmp;
+
+ rtnl_lock();
+ list_for_each_safe(list_node, _tmp, &chnl_net_list) {
+ dev = list_entry(list_node, struct chnl_net, list_field);
+ if (dev->state == CAIF_SHUTDOWN)
+ dev_close(dev->netdev);
+ }
+ rtnl_unlock();
+}
+static DECLARE_WORK(close_worker, close_work);
+
+static void chnl_hold(struct cflayer *lyr)
+{
+ struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl);
+ dev_hold(priv->netdev);
+}
+
+static void chnl_put(struct cflayer *lyr)
+{
+ struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl);
+ dev_put(priv->netdev);
+}
+
+static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
+ int phyid)
+{
+ struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
+ pr_debug("NET flowctrl func called flow: %s\n",
+ flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
+ flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
+ flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
+ flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
+ flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
+ flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
+ "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND");
+
+
+
+ switch (flow) {
+ case CAIF_CTRLCMD_FLOW_OFF_IND:
+ priv->flowenabled = false;
+ netif_stop_queue(priv->netdev);
+ break;
+ case CAIF_CTRLCMD_DEINIT_RSP:
+ priv->state = CAIF_DISCONNECTED;
+ break;
+ case CAIF_CTRLCMD_INIT_FAIL_RSP:
+ priv->state = CAIF_DISCONNECTED;
+ wake_up_interruptible(&priv->netmgmt_wq);
+ break;
+ case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+ priv->state = CAIF_SHUTDOWN;
+ netif_tx_disable(priv->netdev);
+ schedule_work(&close_worker);
+ break;
+ case CAIF_CTRLCMD_FLOW_ON_IND:
+ priv->flowenabled = true;
+ netif_wake_queue(priv->netdev);
+ break;
+ case CAIF_CTRLCMD_INIT_RSP:
+ caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put);
+ priv->state = CAIF_CONNECTED;
+ priv->flowenabled = true;
+ netif_wake_queue(priv->netdev);
+ wake_up_interruptible(&priv->netmgmt_wq);
+ break;
+ default:
+ break;
+ }
+}
+
+static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct chnl_net *priv;
+ struct cfpkt *pkt = NULL;
+ int len;
+ int result = -1;
+ /* Get our private data. */
+ priv = netdev_priv(dev);
+
+ if (skb->len > priv->netdev->mtu) {
+ pr_warn("Size of skb exceeded MTU\n");
+ kfree_skb(skb);
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+ }
+
+ if (!priv->flowenabled) {
+ pr_debug("dropping packets flow off\n");
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+ swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+
+ /* Store original SKB length. */
+ len = skb->len;
+
+ pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb);
+
+ /* Send the packet down the stack. */
+ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt);
+ if (result) {
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
+
+ /* Update statistics. */
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
+
+ return NETDEV_TX_OK;
+}
+
+static int chnl_net_open(struct net_device *dev)
+{
+ struct chnl_net *priv = NULL;
+ int result = -1;
+ int llifindex, headroom, tailroom, mtu;
+ struct net_device *lldev;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ if (!priv) {
+ pr_debug("chnl_net_open: no priv\n");
+ return -ENODEV;
+ }
+
+ if (priv->state != CAIF_CONNECTING) {
+ priv->state = CAIF_CONNECTING;
+ result = caif_connect_client(dev_net(dev), &priv->conn_req,
+ &priv->chnl, &llifindex,
+ &headroom, &tailroom);
+ if (result != 0) {
+ pr_debug("err: "
+ "Unable to register and open device,"
+ " Err:%d\n",
+ result);
+ goto error;
+ }
+
+ lldev = __dev_get_by_index(dev_net(dev), llifindex);
+
+ if (lldev == NULL) {
+ pr_debug("no interface?\n");
+ result = -ENODEV;
+ goto error;
+ }
+
+ dev->needed_tailroom = tailroom + lldev->needed_tailroom;
+ dev->hard_header_len = headroom + lldev->hard_header_len +
+ lldev->needed_tailroom;
+
+ /*
+ * MTU, head-room etc is not know before we have a
+ * CAIF link layer device available. MTU calculation may
+ * override initial RTNL configuration.
+ * MTU is minimum of current mtu, link layer mtu pluss
+ * CAIF head and tail, and PDP GPRS contexts max MTU.
+ */
+ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom));
+ mtu = min_t(int, GPRS_PDP_MTU, mtu);
+ dev_set_mtu(dev, mtu);
+
+ if (mtu < 100) {
+ pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
+ result = -ENODEV;
+ goto error;
+ }
+ }
+
+ rtnl_unlock(); /* Release RTNL lock during connect wait */
+
+ result = wait_event_interruptible_timeout(priv->netmgmt_wq,
+ priv->state != CAIF_CONNECTING,
+ CONNECT_TIMEOUT);
+
+ rtnl_lock();
+
+ if (result == -ERESTARTSYS) {
+ pr_debug("wait_event_interruptible woken by a signal\n");
+ result = -ERESTARTSYS;
+ goto error;
+ }
+
+ if (result == 0) {
+ pr_debug("connect timeout\n");
+ caif_disconnect_client(dev_net(dev), &priv->chnl);
+ priv->state = CAIF_DISCONNECTED;
+ pr_debug("state disconnected\n");
+ result = -ETIMEDOUT;
+ goto error;
+ }
+
+ if (priv->state != CAIF_CONNECTED) {
+ pr_debug("connect failed\n");
+ result = -ECONNREFUSED;
+ goto error;
+ }
+ pr_debug("CAIF Netdevice connected\n");
+ return 0;
+
+error:
+ caif_disconnect_client(dev_net(dev), &priv->chnl);
+ priv->state = CAIF_DISCONNECTED;
+ pr_debug("state disconnected\n");
+ return result;
+
+}
+
+static int chnl_net_stop(struct net_device *dev)
+{
+ struct chnl_net *priv;
+
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ priv->state = CAIF_DISCONNECTED;
+ caif_disconnect_client(dev_net(dev), &priv->chnl);
+ return 0;
+}
+
+static int chnl_net_init(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ strncpy(priv->name, dev->name, sizeof(priv->name));
+ return 0;
+}
+
+static void chnl_net_uninit(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ ASSERT_RTNL();
+ priv = netdev_priv(dev);
+ robust_list_del(&priv->list_field);
+}
+
+static const struct net_device_ops netdev_ops = {
+ .ndo_open = chnl_net_open,
+ .ndo_stop = chnl_net_stop,
+ .ndo_init = chnl_net_init,
+ .ndo_uninit = chnl_net_uninit,
+ .ndo_start_xmit = chnl_net_start_xmit,
+};
+
+static void chnl_net_destructor(struct net_device *dev)
+{
+ struct chnl_net *priv = netdev_priv(dev);
+ caif_free_client(&priv->chnl);
+ free_netdev(dev);
+}
+
+static void ipcaif_net_setup(struct net_device *dev)
+{
+ struct chnl_net *priv;
+ dev->netdev_ops = &netdev_ops;
+ dev->destructor = chnl_net_destructor;
+ dev->flags |= IFF_NOARP;
+ dev->flags |= IFF_POINTOPOINT;
+ dev->mtu = GPRS_PDP_MTU;
+ dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN;
+
+ priv = netdev_priv(dev);
+ priv->chnl.receive = chnl_recv_cb;
+ priv->chnl.ctrlcmd = chnl_flowctrl_cb;
+ priv->netdev = dev;
+ priv->conn_req.protocol = CAIFPROTO_DATAGRAM;
+ priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW;
+ priv->conn_req.priority = CAIF_PRIO_LOW;
+ /* Insert illegal value */
+ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID;
+ priv->flowenabled = false;
+
+ init_waitqueue_head(&priv->netmgmt_wq);
+}
+
+
+static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct chnl_net *priv;
+ u8 loop;
+ priv = netdev_priv(dev);
+ if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID,
+ priv->conn_req.sockaddr.u.dgm.connection_id) ||
+ nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID,
+ priv->conn_req.sockaddr.u.dgm.connection_id))
+ goto nla_put_failure;
+ loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP;
+ if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop))
+ goto nla_put_failure;
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+
+}
+
+static void caif_netlink_parms(struct nlattr *data[],
+ struct caif_connect_request *conn_req)
+{
+ if (!data) {
+ pr_warn("no params data found\n");
+ return;
+ }
+ if (data[IFLA_CAIF_IPV4_CONNID])
+ conn_req->sockaddr.u.dgm.connection_id =
+ nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]);
+ if (data[IFLA_CAIF_IPV6_CONNID])
+ conn_req->sockaddr.u.dgm.connection_id =
+ nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]);
+ if (data[IFLA_CAIF_LOOPBACK]) {
+ if (nla_get_u8(data[IFLA_CAIF_LOOPBACK]))
+ conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP;
+ else
+ conn_req->protocol = CAIFPROTO_DATAGRAM;
+ }
+}
+
+static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ int ret;
+ struct chnl_net *caifdev;
+ ASSERT_RTNL();
+ caifdev = netdev_priv(dev);
+ caif_netlink_parms(data, &caifdev->conn_req);
+
+ ret = register_netdevice(dev);
+ if (ret)
+ pr_warn("device rtml registration failed\n");
+ else
+ list_add(&caifdev->list_field, &chnl_net_list);
+
+ /* Use ifindex as connection id, and use loopback channel default. */
+ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) {
+ caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex;
+ caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP;
+ }
+ return ret;
+}
+
+static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct chnl_net *caifdev;
+ ASSERT_RTNL();
+ caifdev = netdev_priv(dev);
+ caif_netlink_parms(data, &caifdev->conn_req);
+ netdev_state_change(dev);
+ return 0;
+}
+
+static size_t ipcaif_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_CAIF_IPV4_CONNID */
+ nla_total_size(4) +
+ /* IFLA_CAIF_IPV6_CONNID */
+ nla_total_size(4) +
+ /* IFLA_CAIF_LOOPBACK */
+ nla_total_size(2) +
+ 0;
+}
+
+static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = {
+ [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 },
+ [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 },
+ [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 }
+};
+
+
+static struct rtnl_link_ops ipcaif_link_ops __read_mostly = {
+ .kind = "caif",
+ .priv_size = sizeof(struct chnl_net),
+ .setup = ipcaif_net_setup,
+ .maxtype = IFLA_CAIF_MAX,
+ .policy = ipcaif_policy,
+ .newlink = ipcaif_newlink,
+ .changelink = ipcaif_changelink,
+ .get_size = ipcaif_get_size,
+ .fill_info = ipcaif_fill_info,
+
+};
+
+static int __init chnl_init_module(void)
+{
+ return rtnl_link_register(&ipcaif_link_ops);
+}
+
+static void __exit chnl_exit_module(void)
+{
+ struct chnl_net *dev = NULL;
+ struct list_head *list_node;
+ struct list_head *_tmp;
+ rtnl_link_unregister(&ipcaif_link_ops);
+ rtnl_lock();
+ list_for_each_safe(list_node, _tmp, &chnl_net_list) {
+ dev = list_entry(list_node, struct chnl_net, list_field);
+ list_del(list_node);
+ delete_device(dev);
+ }
+ rtnl_unlock();
+}
+
+module_init(chnl_init_module);
+module_exit(chnl_exit_module);
diff --git a/net/can/Kconfig b/net/can/Kconfig
new file mode 100644
index 000000000..a15c0e0d1
--- /dev/null
+++ b/net/can/Kconfig
@@ -0,0 +1,56 @@
+#
+# Controller Area Network (CAN) network layer core configuration
+#
+
+menuconfig CAN
+ depends on NET
+ tristate "CAN bus subsystem support"
+ ---help---
+ Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
+ communications protocol which was developed by Bosch in
+ 1991, mainly for automotive, but now widely used in marine
+ (NMEA2000), industrial, and medical applications.
+ More information on the CAN network protocol family PF_CAN
+ is contained in <Documentation/networking/can.txt>.
+
+ If you want CAN support you should say Y here and also to the
+ specific driver for your controller(s) below.
+
+if CAN
+
+config CAN_RAW
+ tristate "Raw CAN Protocol (raw access with CAN-ID filtering)"
+ default y
+ ---help---
+ The raw CAN protocol option offers access to the CAN bus via
+ the BSD socket API. You probably want to use the raw socket in
+ most cases where no higher level protocol is being used. The raw
+ socket has several filter options e.g. ID masking / error frames.
+ To receive/send raw CAN messages, use AF_CAN with protocol CAN_RAW.
+
+config CAN_BCM
+ tristate "Broadcast Manager CAN Protocol (with content filtering)"
+ default y
+ ---help---
+ The Broadcast Manager offers content filtering, timeout monitoring,
+ sending of RTR frames, and cyclic CAN messages without permanent user
+ interaction. The BCM can be 'programmed' via the BSD socket API and
+ informs you on demand e.g. only on content updates / timeouts.
+ You probably want to use the bcm socket in most cases where cyclic
+ CAN messages are used on the bus (e.g. in automotive environments).
+ To use the Broadcast Manager, use AF_CAN with protocol CAN_BCM.
+
+config CAN_GW
+ tristate "CAN Gateway/Router (with netlink configuration)"
+ default y
+ ---help---
+ The CAN Gateway/Router is used to route (and modify) CAN frames.
+ It is based on the PF_CAN core infrastructure for msg filtering and
+ msg sending and can optionally modify routed CAN frames on the fly.
+ CAN frames can be routed between CAN network interfaces (one hop).
+ They can be modified with AND/OR/XOR/SET operations as configured
+ by the netlink configuration interface known e.g. from iptables.
+
+source "drivers/net/can/Kconfig"
+
+endif
diff --git a/net/can/Makefile b/net/can/Makefile
new file mode 100644
index 000000000..cef49eb1f
--- /dev/null
+++ b/net/can/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux Controller Area Network core.
+#
+
+obj-$(CONFIG_CAN) += can.o
+can-y := af_can.o proc.o
+
+obj-$(CONFIG_CAN_RAW) += can-raw.o
+can-raw-y := raw.o
+
+obj-$(CONFIG_CAN_BCM) += can-bcm.o
+can-bcm-y := bcm.o
+
+obj-$(CONFIG_CAN_GW) += can-gw.o
+can-gw-y := gw.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
new file mode 100644
index 000000000..689c818ed
--- /dev/null
+++ b/net/can/af_can.c
@@ -0,0 +1,965 @@
+/*
+ * af_can.c - Protocol family CAN core module
+ * (used by different CAN protocol modules)
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/ratelimit.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "af_can.h"
+
+MODULE_DESCRIPTION("Controller Area Network PF_CAN core");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, "
+ "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+
+MODULE_ALIAS_NETPROTO(PF_CAN);
+
+static int stats_timer __read_mostly = 1;
+module_param(stats_timer, int, S_IRUGO);
+MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");
+
+/* receive filters subscribed for 'all' CAN devices */
+struct dev_rcv_lists can_rx_alldev_list;
+static DEFINE_SPINLOCK(can_rcvlists_lock);
+
+static struct kmem_cache *rcv_cache __read_mostly;
+
+/* table of registered CAN protocols */
+static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly;
+static DEFINE_MUTEX(proto_tab_lock);
+
+struct timer_list can_stattimer; /* timer for statistics update */
+struct s_stats can_stats; /* packet statistics */
+struct s_pstats can_pstats; /* receive list statistics */
+
+/*
+ * af_can socket functions
+ */
+
+int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+
+ case SIOCGSTAMP:
+ return sock_get_timestamp(sk, (struct timeval __user *)arg);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+EXPORT_SYMBOL(can_ioctl);
+
+static void can_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+}
+
+static const struct can_proto *can_get_proto(int protocol)
+{
+ const struct can_proto *cp;
+
+ rcu_read_lock();
+ cp = rcu_dereference(proto_tab[protocol]);
+ if (cp && !try_module_get(cp->prot->owner))
+ cp = NULL;
+ rcu_read_unlock();
+
+ return cp;
+}
+
+static inline void can_put_proto(const struct can_proto *cp)
+{
+ module_put(cp->prot->owner);
+}
+
+static int can_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ const struct can_proto *cp;
+ int err = 0;
+
+ sock->state = SS_UNCONNECTED;
+
+ if (protocol < 0 || protocol >= CAN_NPROTO)
+ return -EINVAL;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ cp = can_get_proto(protocol);
+
+#ifdef CONFIG_MODULES
+ if (!cp) {
+ /* try to load protocol module if kernel is modular */
+
+ err = request_module("can-proto-%d", protocol);
+
+ /*
+ * In case of error we only print a message but don't
+ * return the error code immediately. Below we will
+ * return -EPROTONOSUPPORT
+ */
+ if (err)
+ printk_ratelimited(KERN_ERR "can: request_module "
+ "(can-proto-%d) failed.\n", protocol);
+
+ cp = can_get_proto(protocol);
+ }
+#endif
+
+ /* check for available protocol and correct usage */
+
+ if (!cp)
+ return -EPROTONOSUPPORT;
+
+ if (cp->type != sock->type) {
+ err = -EPROTOTYPE;
+ goto errout;
+ }
+
+ sock->ops = cp->ops;
+
+ sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
+ if (!sk) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ sock_init_data(sock, sk);
+ sk->sk_destruct = can_sock_destruct;
+
+ if (sk->sk_prot->init)
+ err = sk->sk_prot->init(sk);
+
+ if (err) {
+ /* release sk on errors */
+ sock_orphan(sk);
+ sock_put(sk);
+ }
+
+ errout:
+ can_put_proto(cp);
+ return err;
+}
+
+/*
+ * af_can tx path
+ */
+
+/**
+ * can_send - transmit a CAN frame (optional with local loopback)
+ * @skb: pointer to socket buffer with CAN frame in data section
+ * @loop: loopback for listeners on local CAN sockets (recommended default!)
+ *
+ * Due to the loopback this routine must not be called from hardirq context.
+ *
+ * Return:
+ * 0 on success
+ * -ENETDOWN when the selected interface is down
+ * -ENOBUFS on full driver queue (see net_xmit_errno())
+ * -ENOMEM when local loopback failed at calling skb_clone()
+ * -EPERM when trying to send on a non-CAN interface
+ * -EMSGSIZE CAN frame size is bigger than CAN interface MTU
+ * -EINVAL when the skb->data does not contain a valid CAN frame
+ */
+int can_send(struct sk_buff *skb, int loop)
+{
+ struct sk_buff *newskb = NULL;
+ struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+ int err = -EINVAL;
+
+ if (skb->len == CAN_MTU) {
+ skb->protocol = htons(ETH_P_CAN);
+ if (unlikely(cfd->len > CAN_MAX_DLEN))
+ goto inval_skb;
+ } else if (skb->len == CANFD_MTU) {
+ skb->protocol = htons(ETH_P_CANFD);
+ if (unlikely(cfd->len > CANFD_MAX_DLEN))
+ goto inval_skb;
+ } else
+ goto inval_skb;
+
+ /*
+ * Make sure the CAN frame can pass the selected CAN netdevice.
+ * As structs can_frame and canfd_frame are similar, we can provide
+ * CAN FD frames to legacy CAN drivers as long as the length is <= 8
+ */
+ if (unlikely(skb->len > skb->dev->mtu && cfd->len > CAN_MAX_DLEN)) {
+ err = -EMSGSIZE;
+ goto inval_skb;
+ }
+
+ if (unlikely(skb->dev->type != ARPHRD_CAN)) {
+ err = -EPERM;
+ goto inval_skb;
+ }
+
+ if (unlikely(!(skb->dev->flags & IFF_UP))) {
+ err = -ENETDOWN;
+ goto inval_skb;
+ }
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ if (loop) {
+ /* local loopback of sent CAN frames */
+
+ /* indication for the CAN driver: do loopback */
+ skb->pkt_type = PACKET_LOOPBACK;
+
+ /*
+ * The reference to the originating sock may be required
+ * by the receiving socket to check whether the frame is
+ * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
+ * Therefore we have to ensure that skb->sk remains the
+ * reference to the originating sock by restoring skb->sk
+ * after each skb_clone() or skb_orphan() usage.
+ */
+
+ if (!(skb->dev->flags & IFF_ECHO)) {
+ /*
+ * If the interface is not capable to do loopback
+ * itself, we do it here.
+ */
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ can_skb_set_owner(newskb, skb->sk);
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ newskb->pkt_type = PACKET_BROADCAST;
+ }
+ } else {
+ /* indication for the CAN driver: no loopback required */
+ skb->pkt_type = PACKET_HOST;
+ }
+
+ /* send to netdevice */
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ if (err) {
+ kfree_skb(newskb);
+ return err;
+ }
+
+ if (newskb) {
+ if (!(newskb->tstamp.tv64))
+ __net_timestamp(newskb);
+
+ netif_rx_ni(newskb);
+ }
+
+ /* update statistics */
+ can_stats.tx_frames++;
+ can_stats.tx_frames_delta++;
+
+ return 0;
+
+inval_skb:
+ kfree_skb(skb);
+ return err;
+}
+EXPORT_SYMBOL(can_send);
+
+/*
+ * af_can rx path
+ */
+
+static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev)
+{
+ if (!dev)
+ return &can_rx_alldev_list;
+ else
+ return (struct dev_rcv_lists *)dev->ml_priv;
+}
+
+/**
+ * effhash - hash function for 29 bit CAN identifier reduction
+ * @can_id: 29 bit CAN identifier
+ *
+ * Description:
+ * To reduce the linear traversal in one linked list of _single_ EFF CAN
+ * frame subscriptions the 29 bit identifier is mapped to 10 bits.
+ * (see CAN_EFF_RCV_HASH_BITS definition)
+ *
+ * Return:
+ * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask )
+ */
+static unsigned int effhash(canid_t can_id)
+{
+ unsigned int hash;
+
+ hash = can_id;
+ hash ^= can_id >> CAN_EFF_RCV_HASH_BITS;
+ hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS);
+
+ return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1);
+}
+
+/**
+ * find_rcv_list - determine optimal filterlist inside device filter struct
+ * @can_id: pointer to CAN identifier of a given can_filter
+ * @mask: pointer to CAN mask of a given can_filter
+ * @d: pointer to the device filter struct
+ *
+ * Description:
+ * Returns the optimal filterlist to reduce the filter handling in the
+ * receive path. This function is called by service functions that need
+ * to register or unregister a can_filter in the filter lists.
+ *
+ * A filter matches in general, when
+ *
+ * <received_can_id> & mask == can_id & mask
+ *
+ * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe
+ * relevant bits for the filter.
+ *
+ * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg
+ * frames there is a special filterlist and a special rx path filter handling.
+ *
+ * Return:
+ * Pointer to optimal filterlist for the given can_id/mask pair.
+ * Constistency checked mask.
+ * Reduced can_id to have a preprocessed filter compare value.
+ */
+static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
+ struct dev_rcv_lists *d)
+{
+ canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
+
+ /* filter for error message frames in extra filterlist */
+ if (*mask & CAN_ERR_FLAG) {
+ /* clear CAN_ERR_FLAG in filter entry */
+ *mask &= CAN_ERR_MASK;
+ return &d->rx[RX_ERR];
+ }
+
+ /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */
+
+#define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+ /* ensure valid values in can_mask for 'SFF only' frame filtering */
+ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG))
+ *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS);
+
+ /* reduce condition testing at receive time */
+ *can_id &= *mask;
+
+ /* inverse can_id/can_mask filter */
+ if (inv)
+ return &d->rx[RX_INV];
+
+ /* mask == 0 => no condition testing at receive time */
+ if (!(*mask))
+ return &d->rx[RX_ALL];
+
+ /* extra filterlists for the subscription of a single non-RTR can_id */
+ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
+ !(*can_id & CAN_RTR_FLAG)) {
+
+ if (*can_id & CAN_EFF_FLAG) {
+ if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS))
+ return &d->rx_eff[effhash(*can_id)];
+ } else {
+ if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
+ return &d->rx_sff[*can_id];
+ }
+ }
+
+ /* default: filter via can_id/can_mask */
+ return &d->rx[RX_FIL];
+}
+
+/**
+ * can_rx_register - subscribe CAN frames from a specific interface
+ * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list)
+ * @can_id: CAN identifier (see description)
+ * @mask: CAN mask (see description)
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ * @ident: string for calling module identification
+ *
+ * Description:
+ * Invokes the callback function with the received sk_buff and the given
+ * parameter 'data' on a matching receive filter. A filter matches, when
+ *
+ * <received_can_id> & mask == can_id & mask
+ *
+ * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ * filter for error message frames (CAN_ERR_FLAG bit set in mask).
+ *
+ * The provided pointer to the sk_buff is guaranteed to be valid as long as
+ * the callback function is running. The callback function must *not* free
+ * the given sk_buff while processing it's task. When the given sk_buff is
+ * needed after the end of the callback function it must be cloned inside
+ * the callback function with skb_clone().
+ *
+ * Return:
+ * 0 on success
+ * -ENOMEM on missing cache mem to create subscription entry
+ * -ENODEV unknown device
+ */
+int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
+ void (*func)(struct sk_buff *, void *), void *data,
+ char *ident)
+{
+ struct receiver *r;
+ struct hlist_head *rl;
+ struct dev_rcv_lists *d;
+ int err = 0;
+
+ /* insert new receiver (dev,canid,mask) -> (func,data) */
+
+ if (dev && dev->type != ARPHRD_CAN)
+ return -ENODEV;
+
+ r = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ spin_lock(&can_rcvlists_lock);
+
+ d = find_dev_rcv_lists(dev);
+ if (d) {
+ rl = find_rcv_list(&can_id, &mask, d);
+
+ r->can_id = can_id;
+ r->mask = mask;
+ r->matches = 0;
+ r->func = func;
+ r->data = data;
+ r->ident = ident;
+
+ hlist_add_head_rcu(&r->list, rl);
+ d->entries++;
+
+ can_pstats.rcv_entries++;
+ if (can_pstats.rcv_entries_max < can_pstats.rcv_entries)
+ can_pstats.rcv_entries_max = can_pstats.rcv_entries;
+ } else {
+ kmem_cache_free(rcv_cache, r);
+ err = -ENODEV;
+ }
+
+ spin_unlock(&can_rcvlists_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(can_rx_register);
+
+/*
+ * can_rx_delete_receiver - rcu callback for single receiver entry removal
+ */
+static void can_rx_delete_receiver(struct rcu_head *rp)
+{
+ struct receiver *r = container_of(rp, struct receiver, rcu);
+
+ kmem_cache_free(rcv_cache, r);
+}
+
+/**
+ * can_rx_unregister - unsubscribe CAN frames from a specific interface
+ * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
+ * @can_id: CAN identifier
+ * @mask: CAN mask
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ *
+ * Description:
+ * Removes subscription entry depending on given (subscription) values.
+ */
+void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
+ void (*func)(struct sk_buff *, void *), void *data)
+{
+ struct receiver *r = NULL;
+ struct hlist_head *rl;
+ struct dev_rcv_lists *d;
+
+ if (dev && dev->type != ARPHRD_CAN)
+ return;
+
+ spin_lock(&can_rcvlists_lock);
+
+ d = find_dev_rcv_lists(dev);
+ if (!d) {
+ pr_err("BUG: receive list not found for "
+ "dev %s, id %03X, mask %03X\n",
+ DNAME(dev), can_id, mask);
+ goto out;
+ }
+
+ rl = find_rcv_list(&can_id, &mask, d);
+
+ /*
+ * Search the receiver list for the item to delete. This should
+ * exist, since no receiver may be unregistered that hasn't
+ * been registered before.
+ */
+
+ hlist_for_each_entry_rcu(r, rl, list) {
+ if (r->can_id == can_id && r->mask == mask &&
+ r->func == func && r->data == data)
+ break;
+ }
+
+ /*
+ * Check for bugs in CAN protocol implementations using af_can.c:
+ * 'r' will be NULL if no matching list item was found for removal.
+ */
+
+ if (!r) {
+ WARN(1, "BUG: receive list entry not found for dev %s, "
+ "id %03X, mask %03X\n", DNAME(dev), can_id, mask);
+ goto out;
+ }
+
+ hlist_del_rcu(&r->list);
+ d->entries--;
+
+ if (can_pstats.rcv_entries > 0)
+ can_pstats.rcv_entries--;
+
+ /* remove device structure requested by NETDEV_UNREGISTER */
+ if (d->remove_on_zero_entries && !d->entries) {
+ kfree(d);
+ dev->ml_priv = NULL;
+ }
+
+ out:
+ spin_unlock(&can_rcvlists_lock);
+
+ /* schedule the receiver item for deletion */
+ if (r)
+ call_rcu(&r->rcu, can_rx_delete_receiver);
+}
+EXPORT_SYMBOL(can_rx_unregister);
+
+static inline void deliver(struct sk_buff *skb, struct receiver *r)
+{
+ r->func(skb, r->data);
+ r->matches++;
+}
+
+static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb)
+{
+ struct receiver *r;
+ int matches = 0;
+ struct can_frame *cf = (struct can_frame *)skb->data;
+ canid_t can_id = cf->can_id;
+
+ if (d->entries == 0)
+ return 0;
+
+ if (can_id & CAN_ERR_FLAG) {
+ /* check for error message frame entries only */
+ hlist_for_each_entry_rcu(r, &d->rx[RX_ERR], list) {
+ if (can_id & r->mask) {
+ deliver(skb, r);
+ matches++;
+ }
+ }
+ return matches;
+ }
+
+ /* check for unfiltered entries */
+ hlist_for_each_entry_rcu(r, &d->rx[RX_ALL], list) {
+ deliver(skb, r);
+ matches++;
+ }
+
+ /* check for can_id/mask entries */
+ hlist_for_each_entry_rcu(r, &d->rx[RX_FIL], list) {
+ if ((can_id & r->mask) == r->can_id) {
+ deliver(skb, r);
+ matches++;
+ }
+ }
+
+ /* check for inverted can_id/mask entries */
+ hlist_for_each_entry_rcu(r, &d->rx[RX_INV], list) {
+ if ((can_id & r->mask) != r->can_id) {
+ deliver(skb, r);
+ matches++;
+ }
+ }
+
+ /* check filterlists for single non-RTR can_ids */
+ if (can_id & CAN_RTR_FLAG)
+ return matches;
+
+ if (can_id & CAN_EFF_FLAG) {
+ hlist_for_each_entry_rcu(r, &d->rx_eff[effhash(can_id)], list) {
+ if (r->can_id == can_id) {
+ deliver(skb, r);
+ matches++;
+ }
+ }
+ } else {
+ can_id &= CAN_SFF_MASK;
+ hlist_for_each_entry_rcu(r, &d->rx_sff[can_id], list) {
+ deliver(skb, r);
+ matches++;
+ }
+ }
+
+ return matches;
+}
+
+static void can_receive(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dev_rcv_lists *d;
+ int matches;
+
+ /* update statistics */
+ can_stats.rx_frames++;
+ can_stats.rx_frames_delta++;
+
+ rcu_read_lock();
+
+ /* deliver the packet to sockets listening on all devices */
+ matches = can_rcv_filter(&can_rx_alldev_list, skb);
+
+ /* find receive list for this device */
+ d = find_dev_rcv_lists(dev);
+ if (d)
+ matches += can_rcv_filter(d, skb);
+
+ rcu_read_unlock();
+
+ /* consume the skbuff allocated by the netdevice driver */
+ consume_skb(skb);
+
+ if (matches > 0) {
+ can_stats.matches++;
+ can_stats.matches_delta++;
+ }
+}
+
+static int can_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
+ if (unlikely(!net_eq(dev_net(dev), &init_net)))
+ goto drop;
+
+ if (WARN_ONCE(dev->type != ARPHRD_CAN ||
+ skb->len != CAN_MTU ||
+ cfd->len > CAN_MAX_DLEN,
+ "PF_CAN: dropped non conform CAN skbuf: "
+ "dev type %d, len %d, datalen %d\n",
+ dev->type, skb->len, cfd->len))
+ goto drop;
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
+ if (unlikely(!net_eq(dev_net(dev), &init_net)))
+ goto drop;
+
+ if (WARN_ONCE(dev->type != ARPHRD_CAN ||
+ skb->len != CANFD_MTU ||
+ cfd->len > CANFD_MAX_DLEN,
+ "PF_CAN: dropped non conform CAN FD skbuf: "
+ "dev type %d, len %d, datalen %d\n",
+ dev->type, skb->len, cfd->len))
+ goto drop;
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * af_can protocol functions
+ */
+
+/**
+ * can_proto_register - register CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ *
+ * Return:
+ * 0 on success
+ * -EINVAL invalid (out of range) protocol number
+ * -EBUSY protocol already in use
+ * -ENOBUF if proto_register() fails
+ */
+int can_proto_register(const struct can_proto *cp)
+{
+ int proto = cp->protocol;
+ int err = 0;
+
+ if (proto < 0 || proto >= CAN_NPROTO) {
+ pr_err("can: protocol number %d out of range\n", proto);
+ return -EINVAL;
+ }
+
+ err = proto_register(cp->prot, 0);
+ if (err < 0)
+ return err;
+
+ mutex_lock(&proto_tab_lock);
+
+ if (proto_tab[proto]) {
+ pr_err("can: protocol %d already registered\n", proto);
+ err = -EBUSY;
+ } else
+ RCU_INIT_POINTER(proto_tab[proto], cp);
+
+ mutex_unlock(&proto_tab_lock);
+
+ if (err < 0)
+ proto_unregister(cp->prot);
+
+ return err;
+}
+EXPORT_SYMBOL(can_proto_register);
+
+/**
+ * can_proto_unregister - unregister CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ */
+void can_proto_unregister(const struct can_proto *cp)
+{
+ int proto = cp->protocol;
+
+ mutex_lock(&proto_tab_lock);
+ BUG_ON(proto_tab[proto] != cp);
+ RCU_INIT_POINTER(proto_tab[proto], NULL);
+ mutex_unlock(&proto_tab_lock);
+
+ synchronize_rcu();
+
+ proto_unregister(cp->prot);
+}
+EXPORT_SYMBOL(can_proto_unregister);
+
+/*
+ * af_can notifier to create/remove CAN netdevice specific structs
+ */
+static int can_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dev_rcv_lists *d;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ switch (msg) {
+
+ case NETDEV_REGISTER:
+
+ /* create new dev_rcv_lists for this device */
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NOTIFY_DONE;
+ BUG_ON(dev->ml_priv);
+ dev->ml_priv = d;
+
+ break;
+
+ case NETDEV_UNREGISTER:
+ spin_lock(&can_rcvlists_lock);
+
+ d = dev->ml_priv;
+ if (d) {
+ if (d->entries)
+ d->remove_on_zero_entries = 1;
+ else {
+ kfree(d);
+ dev->ml_priv = NULL;
+ }
+ } else
+ pr_err("can: notifier: receive list not found for dev "
+ "%s\n", dev->name);
+
+ spin_unlock(&can_rcvlists_lock);
+
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * af_can module init/exit functions
+ */
+
+static struct packet_type can_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CAN),
+ .func = can_rcv,
+};
+
+static struct packet_type canfd_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CANFD),
+ .func = canfd_rcv,
+};
+
+static const struct net_proto_family can_family_ops = {
+ .family = PF_CAN,
+ .create = can_create,
+ .owner = THIS_MODULE,
+};
+
+/* notifier block for netdevice event */
+static struct notifier_block can_netdev_notifier __read_mostly = {
+ .notifier_call = can_notifier,
+};
+
+static __init int can_init(void)
+{
+ /* check for correct padding to be able to use the structs similarly */
+ BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) !=
+ offsetof(struct canfd_frame, len) ||
+ offsetof(struct can_frame, data) !=
+ offsetof(struct canfd_frame, data));
+
+ pr_info("can: controller area network core (" CAN_VERSION_STRING ")\n");
+
+ memset(&can_rx_alldev_list, 0, sizeof(can_rx_alldev_list));
+
+ rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
+ 0, 0, NULL);
+ if (!rcv_cache)
+ return -ENOMEM;
+
+ if (stats_timer) {
+ /* the statistics are updated every second (timer triggered) */
+ setup_timer(&can_stattimer, can_stat_update, 0);
+ mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+ } else
+ can_stattimer.function = NULL;
+
+ can_init_proc();
+
+ /* protocol register */
+ sock_register(&can_family_ops);
+ register_netdevice_notifier(&can_netdev_notifier);
+ dev_add_pack(&can_packet);
+ dev_add_pack(&canfd_packet);
+
+ return 0;
+}
+
+static __exit void can_exit(void)
+{
+ struct net_device *dev;
+
+ if (stats_timer)
+ del_timer_sync(&can_stattimer);
+
+ can_remove_proc();
+
+ /* protocol unregister */
+ dev_remove_pack(&canfd_packet);
+ dev_remove_pack(&can_packet);
+ unregister_netdevice_notifier(&can_netdev_notifier);
+ sock_unregister(PF_CAN);
+
+ /* remove created dev_rcv_lists from still registered CAN devices */
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (dev->type == ARPHRD_CAN && dev->ml_priv) {
+
+ struct dev_rcv_lists *d = dev->ml_priv;
+
+ BUG_ON(d->entries);
+ kfree(d);
+ dev->ml_priv = NULL;
+ }
+ }
+ rcu_read_unlock();
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ kmem_cache_destroy(rcv_cache);
+}
+
+module_init(can_init);
+module_exit(can_exit);
diff --git a/net/can/af_can.h b/net/can/af_can.h
new file mode 100644
index 000000000..fca0fe9fc
--- /dev/null
+++ b/net/can/af_can.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef AF_CAN_H
+#define AF_CAN_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/can.h>
+
+/* af_can rx dispatcher structures */
+
+struct receiver {
+ struct hlist_node list;
+ struct rcu_head rcu;
+ canid_t can_id;
+ canid_t mask;
+ unsigned long matches;
+ void (*func)(struct sk_buff *, void *);
+ void *data;
+ char *ident;
+};
+
+#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
+#define CAN_EFF_RCV_HASH_BITS 10
+#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)
+
+enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };
+
+/* per device receive filters linked at dev->ml_priv */
+struct dev_rcv_lists {
+ struct hlist_head rx[RX_MAX];
+ struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
+ struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
+ int remove_on_zero_entries;
+ int entries;
+};
+
+/* statistic structures */
+
+/* can be reset e.g. by can_init_stats() */
+struct s_stats {
+ unsigned long jiffies_init;
+
+ unsigned long rx_frames;
+ unsigned long tx_frames;
+ unsigned long matches;
+
+ unsigned long total_rx_rate;
+ unsigned long total_tx_rate;
+ unsigned long total_rx_match_ratio;
+
+ unsigned long current_rx_rate;
+ unsigned long current_tx_rate;
+ unsigned long current_rx_match_ratio;
+
+ unsigned long max_rx_rate;
+ unsigned long max_tx_rate;
+ unsigned long max_rx_match_ratio;
+
+ unsigned long rx_frames_delta;
+ unsigned long tx_frames_delta;
+ unsigned long matches_delta;
+};
+
+/* persistent statistics */
+struct s_pstats {
+ unsigned long stats_reset;
+ unsigned long user_reset;
+ unsigned long rcv_entries;
+ unsigned long rcv_entries_max;
+};
+
+/* receive filters subscribed for 'all' CAN devices */
+extern struct dev_rcv_lists can_rx_alldev_list;
+
+/* function prototypes for the CAN networklayer procfs (proc.c) */
+void can_init_proc(void);
+void can_remove_proc(void);
+void can_stat_update(unsigned long data);
+
+/* structures and variables from af_can.c needed in proc.c for reading */
+extern struct timer_list can_stattimer; /* timer for statistics update */
+extern struct s_stats can_stats; /* packet statistics */
+extern struct s_pstats can_pstats; /* receive list statistics */
+extern struct hlist_head can_rx_dev_list; /* rx dispatcher structures */
+
+#endif /* AF_CAN_H */
diff --git a/net/can/bcm.c b/net/can/bcm.c
new file mode 100644
index 000000000..b52345358
--- /dev/null
+++ b/net/can/bcm.c
@@ -0,0 +1,1634 @@
+/*
+ * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/bcm.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+/*
+ * To send multiple CAN frame content within TX_SETUP or to filter
+ * CAN messages with multiplex index within RX_SETUP, the number of
+ * different filters is limited to 256 due to the one byte index value.
+ */
+#define MAX_NFRAMES 256
+
+/* use of last_frames[index].can_dlc */
+#define RX_RECV 0x40 /* received data for this element */
+#define RX_THR 0x80 /* element not been sent due to throttle feature */
+#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */
+
+/* get best masking value for can_rx_register() for a given single can_id */
+#define REGMASK(id) ((id & CAN_EFF_FLAG) ? \
+ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+#define CAN_BCM_VERSION CAN_VERSION
+
+MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+MODULE_ALIAS("can-proto-2");
+
+/* easy access to can_frame payload */
+static inline u64 GET_U64(const struct can_frame *cp)
+{
+ return *(u64 *)cp->data;
+}
+
+struct bcm_op {
+ struct list_head list;
+ int ifindex;
+ canid_t can_id;
+ u32 flags;
+ unsigned long frames_abs, frames_filtered;
+ struct timeval ival1, ival2;
+ struct hrtimer timer, thrtimer;
+ struct tasklet_struct tsklet, thrtsklet;
+ ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
+ int rx_ifindex;
+ u32 count;
+ u32 nframes;
+ u32 currframe;
+ struct can_frame *frames;
+ struct can_frame *last_frames;
+ struct can_frame sframe;
+ struct can_frame last_sframe;
+ struct sock *sk;
+ struct net_device *rx_reg_dev;
+};
+
+static struct proc_dir_entry *proc_dir;
+
+struct bcm_sock {
+ struct sock sk;
+ int bound;
+ int ifindex;
+ struct notifier_block notifier;
+ struct list_head rx_ops;
+ struct list_head tx_ops;
+ unsigned long dropped_usr_msgs;
+ struct proc_dir_entry *bcm_proc_read;
+ char procname [32]; /* inode number in decimal with \0 */
+};
+
+static inline struct bcm_sock *bcm_sk(const struct sock *sk)
+{
+ return (struct bcm_sock *)sk;
+}
+
+#define CFSIZ sizeof(struct can_frame)
+#define OPSIZ sizeof(struct bcm_op)
+#define MHSIZ sizeof(struct bcm_msg_head)
+
+/*
+ * procfs functions
+ */
+static char *bcm_proc_getifname(char *result, int ifindex)
+{
+ struct net_device *dev;
+
+ if (!ifindex)
+ return "any";
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(&init_net, ifindex);
+ if (dev)
+ strcpy(result, dev->name);
+ else
+ strcpy(result, "???");
+ rcu_read_unlock();
+
+ return result;
+}
+
+static int bcm_proc_show(struct seq_file *m, void *v)
+{
+ char ifname[IFNAMSIZ];
+ struct sock *sk = (struct sock *)m->private;
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+
+ seq_printf(m, ">>> socket %pK", sk->sk_socket);
+ seq_printf(m, " / sk %pK", sk);
+ seq_printf(m, " / bo %pK", bo);
+ seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
+ seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex));
+ seq_printf(m, " <<<\n");
+
+ list_for_each_entry(op, &bo->rx_ops, list) {
+
+ unsigned long reduction;
+
+ /* print only active entries & prevent division by zero */
+ if (!op->frames_abs)
+ continue;
+
+ seq_printf(m, "rx_op: %03X %-5s ",
+ op->can_id, bcm_proc_getifname(ifname, op->ifindex));
+ seq_printf(m, "[%u]%c ", op->nframes,
+ (op->flags & RX_CHECK_DLC)?'d':' ');
+ if (op->kt_ival1.tv64)
+ seq_printf(m, "timeo=%lld ",
+ (long long)
+ ktime_to_us(op->kt_ival1));
+
+ if (op->kt_ival2.tv64)
+ seq_printf(m, "thr=%lld ",
+ (long long)
+ ktime_to_us(op->kt_ival2));
+
+ seq_printf(m, "# recv %ld (%ld) => reduction: ",
+ op->frames_filtered, op->frames_abs);
+
+ reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;
+
+ seq_printf(m, "%s%ld%%\n",
+ (reduction == 100)?"near ":"", reduction);
+ }
+
+ list_for_each_entry(op, &bo->tx_ops, list) {
+
+ seq_printf(m, "tx_op: %03X %s [%u] ",
+ op->can_id,
+ bcm_proc_getifname(ifname, op->ifindex),
+ op->nframes);
+
+ if (op->kt_ival1.tv64)
+ seq_printf(m, "t1=%lld ",
+ (long long) ktime_to_us(op->kt_ival1));
+
+ if (op->kt_ival2.tv64)
+ seq_printf(m, "t2=%lld ",
+ (long long) ktime_to_us(op->kt_ival2));
+
+ seq_printf(m, "# sent %ld\n", op->frames_abs);
+ }
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int bcm_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bcm_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations bcm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = bcm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface
+ * of the given bcm tx op
+ */
+static void bcm_can_tx(struct bcm_op *op)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct can_frame *cf = &op->frames[op->currframe];
+
+ /* no target device? => exit */
+ if (!op->ifindex)
+ return;
+
+ dev = dev_get_by_index(&init_net, op->ifindex);
+ if (!dev) {
+ /* RFC: should this bcm_op remove itself here? */
+ return;
+ }
+
+ skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), gfp_any());
+ if (!skb)
+ goto out;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+
+ memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
+
+ /* send with loopback */
+ skb->dev = dev;
+ can_skb_set_owner(skb, op->sk);
+ can_send(skb, 1);
+
+ /* update statistics */
+ op->currframe++;
+ op->frames_abs++;
+
+ /* reached last frame? */
+ if (op->currframe >= op->nframes)
+ op->currframe = 0;
+ out:
+ dev_put(dev);
+}
+
+/*
+ * bcm_send_to_user - send a BCM message to the userspace
+ * (consisting of bcm_msg_head + x CAN frames)
+ */
+static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
+ struct can_frame *frames, int has_timestamp)
+{
+ struct sk_buff *skb;
+ struct can_frame *firstframe;
+ struct sockaddr_can *addr;
+ struct sock *sk = op->sk;
+ unsigned int datalen = head->nframes * CFSIZ;
+ int err;
+
+ skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
+ if (!skb)
+ return;
+
+ memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head));
+
+ if (head->nframes) {
+ /* can_frames starting here */
+ firstframe = (struct can_frame *)skb_tail_pointer(skb);
+
+ memcpy(skb_put(skb, datalen), frames, datalen);
+
+ /*
+ * the BCM uses the can_dlc-element of the can_frame
+ * structure for internal purposes. This is only
+ * relevant for updates that are generated by the
+ * BCM, where nframes is 1
+ */
+ if (head->nframes == 1)
+ firstframe->can_dlc &= BCM_CAN_DLC_MASK;
+ }
+
+ if (has_timestamp) {
+ /* restore rx timestamp */
+ skb->tstamp = op->rx_stamp;
+ }
+
+ /*
+ * Put the datagram to the queue so that bcm_recvmsg() can
+ * get it from there. We need to pass the interface index to
+ * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
+ * containing the interface index.
+ */
+
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can));
+ addr = (struct sockaddr_can *)skb->cb;
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = op->rx_ifindex;
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err < 0) {
+ struct bcm_sock *bo = bcm_sk(sk);
+
+ kfree_skb(skb);
+ /* don't care about overflows in this statistic */
+ bo->dropped_usr_msgs++;
+ }
+}
+
+static void bcm_tx_start_timer(struct bcm_op *op)
+{
+ if (op->kt_ival1.tv64 && op->count)
+ hrtimer_start(&op->timer,
+ ktime_add(ktime_get(), op->kt_ival1),
+ HRTIMER_MODE_ABS);
+ else if (op->kt_ival2.tv64)
+ hrtimer_start(&op->timer,
+ ktime_add(ktime_get(), op->kt_ival2),
+ HRTIMER_MODE_ABS);
+}
+
+static void bcm_tx_timeout_tsklet(unsigned long data)
+{
+ struct bcm_op *op = (struct bcm_op *)data;
+ struct bcm_msg_head msg_head;
+
+ if (op->kt_ival1.tv64 && (op->count > 0)) {
+
+ op->count--;
+ if (!op->count && (op->flags & TX_COUNTEVT)) {
+
+ /* create notification to user */
+ msg_head.opcode = TX_EXPIRED;
+ msg_head.flags = op->flags;
+ msg_head.count = op->count;
+ msg_head.ival1 = op->ival1;
+ msg_head.ival2 = op->ival2;
+ msg_head.can_id = op->can_id;
+ msg_head.nframes = 0;
+
+ bcm_send_to_user(op, &msg_head, NULL, 0);
+ }
+ bcm_can_tx(op);
+
+ } else if (op->kt_ival2.tv64)
+ bcm_can_tx(op);
+
+ bcm_tx_start_timer(op);
+}
+
+/*
+ * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
+ */
+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
+
+ tasklet_schedule(&op->tsklet);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_changed - create a RX_CHANGED notification due to changed content
+ */
+static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
+{
+ struct bcm_msg_head head;
+
+ /* update statistics */
+ op->frames_filtered++;
+
+ /* prevent statistics overflow */
+ if (op->frames_filtered > ULONG_MAX/100)
+ op->frames_filtered = op->frames_abs = 0;
+
+ /* this element is not throttled anymore */
+ data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV);
+
+ head.opcode = RX_CHANGED;
+ head.flags = op->flags;
+ head.count = op->count;
+ head.ival1 = op->ival1;
+ head.ival2 = op->ival2;
+ head.can_id = op->can_id;
+ head.nframes = 1;
+
+ bcm_send_to_user(op, &head, data, 1);
+}
+
+/*
+ * bcm_rx_update_and_send - process a detected relevant receive content change
+ * 1. update the last received data
+ * 2. send a notification to the user (if possible)
+ */
+static void bcm_rx_update_and_send(struct bcm_op *op,
+ struct can_frame *lastdata,
+ const struct can_frame *rxdata)
+{
+ memcpy(lastdata, rxdata, CFSIZ);
+
+ /* mark as used and throttled by default */
+ lastdata->can_dlc |= (RX_RECV|RX_THR);
+
+ /* throttling mode inactive ? */
+ if (!op->kt_ival2.tv64) {
+ /* send RX_CHANGED to the user immediately */
+ bcm_rx_changed(op, lastdata);
+ return;
+ }
+
+ /* with active throttling timer we are just done here */
+ if (hrtimer_active(&op->thrtimer))
+ return;
+
+ /* first reception with enabled throttling mode */
+ if (!op->kt_lastmsg.tv64)
+ goto rx_changed_settime;
+
+ /* got a second frame inside a potential throttle period? */
+ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) <
+ ktime_to_us(op->kt_ival2)) {
+ /* do not send the saved data - only start throttle timer */
+ hrtimer_start(&op->thrtimer,
+ ktime_add(op->kt_lastmsg, op->kt_ival2),
+ HRTIMER_MODE_ABS);
+ return;
+ }
+
+ /* the gap was that big, that throttling was not needed here */
+rx_changed_settime:
+ bcm_rx_changed(op, lastdata);
+ op->kt_lastmsg = ktime_get();
+}
+
+/*
+ * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
+ * received data stored in op->last_frames[]
+ */
+static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
+ const struct can_frame *rxdata)
+{
+ /*
+ * no one uses the MSBs of can_dlc for comparison,
+ * so we use it here to detect the first time of reception
+ */
+
+ if (!(op->last_frames[index].can_dlc & RX_RECV)) {
+ /* received data for the first time => send update to user */
+ bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
+ return;
+ }
+
+ /* do a real check in can_frame data section */
+
+ if ((GET_U64(&op->frames[index]) & GET_U64(rxdata)) !=
+ (GET_U64(&op->frames[index]) & GET_U64(&op->last_frames[index]))) {
+ bcm_rx_update_and_send(op, &op->last_frames[index], rxdata);
+ return;
+ }
+
+ if (op->flags & RX_CHECK_DLC) {
+ /* do a real check in can_frame dlc */
+ if (rxdata->can_dlc != (op->last_frames[index].can_dlc &
+ BCM_CAN_DLC_MASK)) {
+ bcm_rx_update_and_send(op, &op->last_frames[index],
+ rxdata);
+ return;
+ }
+ }
+}
+
+/*
+ * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception
+ */
+static void bcm_rx_starttimer(struct bcm_op *op)
+{
+ if (op->flags & RX_NO_AUTOTIMER)
+ return;
+
+ if (op->kt_ival1.tv64)
+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
+}
+
+static void bcm_rx_timeout_tsklet(unsigned long data)
+{
+ struct bcm_op *op = (struct bcm_op *)data;
+ struct bcm_msg_head msg_head;
+
+ /* create notification to user */
+ msg_head.opcode = RX_TIMEOUT;
+ msg_head.flags = op->flags;
+ msg_head.count = op->count;
+ msg_head.ival1 = op->ival1;
+ msg_head.ival2 = op->ival2;
+ msg_head.can_id = op->can_id;
+ msg_head.nframes = 0;
+
+ bcm_send_to_user(op, &msg_head, NULL, 0);
+}
+
+/*
+ * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
+ */
+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
+
+ /* schedule before NET_RX_SOFTIRQ */
+ tasklet_hi_schedule(&op->tsklet);
+
+ /* no restart of the timer is done here! */
+
+ /* if user wants to be informed, when cyclic CAN-Messages come back */
+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
+ /* clear received can_frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * CFSIZ);
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_do_flush - helper for bcm_rx_thr_flush
+ */
+static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
+ unsigned int index)
+{
+ if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) {
+ if (update)
+ bcm_rx_changed(op, &op->last_frames[index]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
+ *
+ * update == 0 : just check if throttled data is available (any irq context)
+ * update == 1 : check and send throttled data to userspace (soft_irq context)
+ */
+static int bcm_rx_thr_flush(struct bcm_op *op, int update)
+{
+ int updated = 0;
+
+ if (op->nframes > 1) {
+ unsigned int i;
+
+ /* for MUX filter we start at index 1 */
+ for (i = 1; i < op->nframes; i++)
+ updated += bcm_rx_do_flush(op, update, i);
+
+ } else {
+ /* for RX_FILTER_ID and simple filter */
+ updated += bcm_rx_do_flush(op, update, 0);
+ }
+
+ return updated;
+}
+
+static void bcm_rx_thr_tsklet(unsigned long data)
+{
+ struct bcm_op *op = (struct bcm_op *)data;
+
+ /* push the changed data to the userspace */
+ bcm_rx_thr_flush(op, 1);
+}
+
+/*
+ * bcm_rx_thr_handler - the time for blocked content updates is over now:
+ * Check for throttled data and send it to the userspace
+ */
+static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
+
+ tasklet_schedule(&op->thrtsklet);
+
+ if (bcm_rx_thr_flush(op, 0)) {
+ hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
+ return HRTIMER_RESTART;
+ } else {
+ /* rearm throttle handling */
+ op->kt_lastmsg = ktime_set(0, 0);
+ return HRTIMER_NORESTART;
+ }
+}
+
+/*
+ * bcm_rx_handler - handle a CAN frame reception
+ */
+static void bcm_rx_handler(struct sk_buff *skb, void *data)
+{
+ struct bcm_op *op = (struct bcm_op *)data;
+ const struct can_frame *rxframe = (struct can_frame *)skb->data;
+ unsigned int i;
+
+ /* disable timeout */
+ hrtimer_cancel(&op->timer);
+
+ if (op->can_id != rxframe->can_id)
+ return;
+
+ /* save rx timestamp */
+ op->rx_stamp = skb->tstamp;
+ /* save originator for recvfrom() */
+ op->rx_ifindex = skb->dev->ifindex;
+ /* update statistics */
+ op->frames_abs++;
+
+ if (op->flags & RX_RTR_FRAME) {
+ /* send reply for RTR-request (placed in op->frames[0]) */
+ bcm_can_tx(op);
+ return;
+ }
+
+ if (op->flags & RX_FILTER_ID) {
+ /* the easiest case */
+ bcm_rx_update_and_send(op, &op->last_frames[0], rxframe);
+ goto rx_starttimer;
+ }
+
+ if (op->nframes == 1) {
+ /* simple compare with index 0 */
+ bcm_rx_cmp_to_index(op, 0, rxframe);
+ goto rx_starttimer;
+ }
+
+ if (op->nframes > 1) {
+ /*
+ * multiplex compare
+ *
+ * find the first multiplex mask that fits.
+ * Remark: The MUX-mask is stored in index 0
+ */
+
+ for (i = 1; i < op->nframes; i++) {
+ if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) ==
+ (GET_U64(&op->frames[0]) &
+ GET_U64(&op->frames[i]))) {
+ bcm_rx_cmp_to_index(op, i, rxframe);
+ break;
+ }
+ }
+ }
+
+rx_starttimer:
+ bcm_rx_starttimer(op);
+}
+
+/*
+ * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
+ */
+static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id,
+ int ifindex)
+{
+ struct bcm_op *op;
+
+ list_for_each_entry(op, ops, list) {
+ if ((op->can_id == can_id) && (op->ifindex == ifindex))
+ return op;
+ }
+
+ return NULL;
+}
+
+static void bcm_remove_op(struct bcm_op *op)
+{
+ hrtimer_cancel(&op->timer);
+ hrtimer_cancel(&op->thrtimer);
+
+ if (op->tsklet.func)
+ tasklet_kill(&op->tsklet);
+
+ if (op->thrtsklet.func)
+ tasklet_kill(&op->thrtsklet);
+
+ if ((op->frames) && (op->frames != &op->sframe))
+ kfree(op->frames);
+
+ if ((op->last_frames) && (op->last_frames != &op->last_sframe))
+ kfree(op->last_frames);
+
+ kfree(op);
+}
+
+static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
+{
+ if (op->rx_reg_dev == dev) {
+ can_rx_unregister(dev, op->can_id, REGMASK(op->can_id),
+ bcm_rx_handler, op);
+
+ /* mark as removed subscription */
+ op->rx_reg_dev = NULL;
+ } else
+ printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device "
+ "mismatch %p %p\n", op->rx_reg_dev, dev);
+}
+
+/*
+ * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
+ */
+static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
+{
+ struct bcm_op *op, *n;
+
+ list_for_each_entry_safe(op, n, ops, list) {
+ if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+
+ /*
+ * Don't care if we're bound or not (due to netdev
+ * problems) can_rx_unregister() is always a save
+ * thing to do here.
+ */
+ if (op->ifindex) {
+ /*
+ * Only remove subscriptions that had not
+ * been removed due to NETDEV_UNREGISTER
+ * in bcm_notifier()
+ */
+ if (op->rx_reg_dev) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net,
+ op->ifindex);
+ if (dev) {
+ bcm_rx_unreg(dev, op);
+ dev_put(dev);
+ }
+ }
+ } else
+ can_rx_unregister(NULL, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op);
+
+ list_del(&op->list);
+ bcm_remove_op(op);
+ return 1; /* done */
+ }
+ }
+
+ return 0; /* not found */
+}
+
+/*
+ * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
+ */
+static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
+{
+ struct bcm_op *op, *n;
+
+ list_for_each_entry_safe(op, n, ops, list) {
+ if ((op->can_id == can_id) && (op->ifindex == ifindex)) {
+ list_del(&op->list);
+ bcm_remove_op(op);
+ return 1; /* done */
+ }
+ }
+
+ return 0; /* not found */
+}
+
+/*
+ * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg)
+ */
+static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
+ int ifindex)
+{
+ struct bcm_op *op = bcm_find_op(ops, msg_head->can_id, ifindex);
+
+ if (!op)
+ return -EINVAL;
+
+ /* put current values into msg_head */
+ msg_head->flags = op->flags;
+ msg_head->count = op->count;
+ msg_head->ival1 = op->ival1;
+ msg_head->ival2 = op->ival2;
+ msg_head->nframes = op->nframes;
+
+ bcm_send_to_user(op, msg_head, op->frames, 0);
+
+ return MHSIZ;
+}
+
+/*
+ * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg)
+ */
+static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+ int ifindex, struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+ unsigned int i;
+ int err;
+
+ /* we need a real device to send frames */
+ if (!ifindex)
+ return -ENODEV;
+
+ /* check nframes boundaries - we need at least one can_frame */
+ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
+ return -EINVAL;
+
+ /* check the given can_id */
+ op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex);
+
+ if (op) {
+ /* update existing BCM operation */
+
+ /*
+ * Do we need more space for the can_frames than currently
+ * allocated? -> This is a _really_ unusual use-case and
+ * therefore (complexity / locking) it is not supported.
+ */
+ if (msg_head->nframes > op->nframes)
+ return -E2BIG;
+
+ /* update can_frames content */
+ for (i = 0; i < msg_head->nframes; i++) {
+ err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
+
+ if (op->frames[i].can_dlc > 8)
+ err = -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ if (msg_head->flags & TX_CP_CAN_ID) {
+ /* copy can_id into frame */
+ op->frames[i].can_id = msg_head->can_id;
+ }
+ }
+
+ } else {
+ /* insert new BCM operation for the given can_id */
+
+ op = kzalloc(OPSIZ, GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ op->can_id = msg_head->can_id;
+
+ /* create array for can_frames and copy the data */
+ if (msg_head->nframes > 1) {
+ op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ GFP_KERNEL);
+ if (!op->frames) {
+ kfree(op);
+ return -ENOMEM;
+ }
+ } else
+ op->frames = &op->sframe;
+
+ for (i = 0; i < msg_head->nframes; i++) {
+ err = memcpy_from_msg((u8 *)&op->frames[i], msg, CFSIZ);
+
+ if (op->frames[i].can_dlc > 8)
+ err = -EINVAL;
+
+ if (err < 0) {
+ if (op->frames != &op->sframe)
+ kfree(op->frames);
+ kfree(op);
+ return err;
+ }
+
+ if (msg_head->flags & TX_CP_CAN_ID) {
+ /* copy can_id into frame */
+ op->frames[i].can_id = msg_head->can_id;
+ }
+ }
+
+ /* tx_ops never compare with previous received messages */
+ op->last_frames = NULL;
+
+ /* bcm_can_tx / bcm_tx_timeout_handler needs this */
+ op->sk = sk;
+ op->ifindex = ifindex;
+
+ /* initialize uninitialized (kzalloc) structure */
+ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ op->timer.function = bcm_tx_timeout_handler;
+
+ /* initialize tasklet for tx countevent notification */
+ tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
+ (unsigned long) op);
+
+ /* currently unused in tx_ops */
+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+ /* add this bcm_op to the list of the tx_ops */
+ list_add(&op->list, &bo->tx_ops);
+
+ } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
+
+ if (op->nframes != msg_head->nframes) {
+ op->nframes = msg_head->nframes;
+ /* start multiple frame transmission with index 0 */
+ op->currframe = 0;
+ }
+
+ /* check flags */
+
+ op->flags = msg_head->flags;
+
+ if (op->flags & TX_RESET_MULTI_IDX) {
+ /* start multiple frame transmission with index 0 */
+ op->currframe = 0;
+ }
+
+ if (op->flags & SETTIMER) {
+ /* set timer values */
+ op->count = msg_head->count;
+ op->ival1 = msg_head->ival1;
+ op->ival2 = msg_head->ival2;
+ op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+
+ /* disable an active timer due to zero values? */
+ if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64)
+ hrtimer_cancel(&op->timer);
+ }
+
+ if (op->flags & STARTTIMER) {
+ hrtimer_cancel(&op->timer);
+ /* spec: send can_frame when starting timer */
+ op->flags |= TX_ANNOUNCE;
+ }
+
+ if (op->flags & TX_ANNOUNCE) {
+ bcm_can_tx(op);
+ if (op->count)
+ op->count--;
+ }
+
+ if (op->flags & STARTTIMER)
+ bcm_tx_start_timer(op);
+
+ return msg_head->nframes * CFSIZ + MHSIZ;
+}
+
+/*
+ * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg)
+ */
+static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+ int ifindex, struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+ struct bcm_op *op;
+ int do_rx_register;
+ int err = 0;
+
+ if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) {
+ /* be robust against wrong usage ... */
+ msg_head->flags |= RX_FILTER_ID;
+ /* ignore trailing garbage */
+ msg_head->nframes = 0;
+ }
+
+ /* the first element contains the mux-mask => MAX_NFRAMES + 1 */
+ if (msg_head->nframes > MAX_NFRAMES + 1)
+ return -EINVAL;
+
+ if ((msg_head->flags & RX_RTR_FRAME) &&
+ ((msg_head->nframes != 1) ||
+ (!(msg_head->can_id & CAN_RTR_FLAG))))
+ return -EINVAL;
+
+ /* check the given can_id */
+ op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex);
+ if (op) {
+ /* update existing BCM operation */
+
+ /*
+ * Do we need more space for the can_frames than currently
+ * allocated? -> This is a _really_ unusual use-case and
+ * therefore (complexity / locking) it is not supported.
+ */
+ if (msg_head->nframes > op->nframes)
+ return -E2BIG;
+
+ if (msg_head->nframes) {
+ /* update can_frames content */
+ err = memcpy_from_msg((u8 *)op->frames, msg,
+ msg_head->nframes * CFSIZ);
+ if (err < 0)
+ return err;
+
+ /* clear last_frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, msg_head->nframes * CFSIZ);
+ }
+
+ op->nframes = msg_head->nframes;
+
+ /* Only an update -> do not call can_rx_register() */
+ do_rx_register = 0;
+
+ } else {
+ /* insert new BCM operation for the given can_id */
+ op = kzalloc(OPSIZ, GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ op->can_id = msg_head->can_id;
+ op->nframes = msg_head->nframes;
+
+ if (msg_head->nframes > 1) {
+ /* create array for can_frames and copy the data */
+ op->frames = kmalloc(msg_head->nframes * CFSIZ,
+ GFP_KERNEL);
+ if (!op->frames) {
+ kfree(op);
+ return -ENOMEM;
+ }
+
+ /* create and init array for received can_frames */
+ op->last_frames = kzalloc(msg_head->nframes * CFSIZ,
+ GFP_KERNEL);
+ if (!op->last_frames) {
+ kfree(op->frames);
+ kfree(op);
+ return -ENOMEM;
+ }
+
+ } else {
+ op->frames = &op->sframe;
+ op->last_frames = &op->last_sframe;
+ }
+
+ if (msg_head->nframes) {
+ err = memcpy_from_msg((u8 *)op->frames, msg,
+ msg_head->nframes * CFSIZ);
+ if (err < 0) {
+ if (op->frames != &op->sframe)
+ kfree(op->frames);
+ if (op->last_frames != &op->last_sframe)
+ kfree(op->last_frames);
+ kfree(op);
+ return err;
+ }
+ }
+
+ /* bcm_can_tx / bcm_tx_timeout_handler needs this */
+ op->sk = sk;
+ op->ifindex = ifindex;
+
+ /* ifindex for timeout events w/o previous frame reception */
+ op->rx_ifindex = ifindex;
+
+ /* initialize uninitialized (kzalloc) structure */
+ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ op->timer.function = bcm_rx_timeout_handler;
+
+ /* initialize tasklet for rx timeout notification */
+ tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
+ (unsigned long) op);
+
+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ op->thrtimer.function = bcm_rx_thr_handler;
+
+ /* initialize tasklet for rx throttle handling */
+ tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
+ (unsigned long) op);
+
+ /* add this bcm_op to the list of the rx_ops */
+ list_add(&op->list, &bo->rx_ops);
+
+ /* call can_rx_register() */
+ do_rx_register = 1;
+
+ } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */
+
+ /* check flags */
+ op->flags = msg_head->flags;
+
+ if (op->flags & RX_RTR_FRAME) {
+
+ /* no timers in RTR-mode */
+ hrtimer_cancel(&op->thrtimer);
+ hrtimer_cancel(&op->timer);
+
+ /*
+ * funny feature in RX(!)_SETUP only for RTR-mode:
+ * copy can_id into frame BUT without RTR-flag to
+ * prevent a full-load-loopback-test ... ;-]
+ */
+ if ((op->flags & TX_CP_CAN_ID) ||
+ (op->frames[0].can_id == op->can_id))
+ op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG;
+
+ } else {
+ if (op->flags & SETTIMER) {
+
+ /* set timer value */
+ op->ival1 = msg_head->ival1;
+ op->ival2 = msg_head->ival2;
+ op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
+ op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+
+ /* disable an active timer due to zero value? */
+ if (!op->kt_ival1.tv64)
+ hrtimer_cancel(&op->timer);
+
+ /*
+ * In any case cancel the throttle timer, flush
+ * potentially blocked msgs and reset throttle handling
+ */
+ op->kt_lastmsg = ktime_set(0, 0);
+ hrtimer_cancel(&op->thrtimer);
+ bcm_rx_thr_flush(op, 1);
+ }
+
+ if ((op->flags & STARTTIMER) && op->kt_ival1.tv64)
+ hrtimer_start(&op->timer, op->kt_ival1,
+ HRTIMER_MODE_REL);
+ }
+
+ /* now we can register for can_ids, if we added a new bcm_op */
+ if (do_rx_register) {
+ if (ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, ifindex);
+ if (dev) {
+ err = can_rx_register(dev, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op,
+ "bcm");
+
+ op->rx_reg_dev = dev;
+ dev_put(dev);
+ }
+
+ } else
+ err = can_rx_register(NULL, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op, "bcm");
+ if (err) {
+ /* this bcm rx op is broken -> remove it */
+ list_del(&op->list);
+ bcm_remove_op(op);
+ return err;
+ }
+ }
+
+ return msg_head->nframes * CFSIZ + MHSIZ;
+}
+
+/*
+ * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
+ */
+static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ int err;
+
+ /* we need a real device to send frames */
+ if (!ifindex)
+ return -ENODEV;
+
+ skb = alloc_skb(CFSIZ + sizeof(struct can_skb_priv), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ can_skb_reserve(skb);
+
+ err = memcpy_from_msg(skb_put(skb, CFSIZ), msg, CFSIZ);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ dev = dev_get_by_index(&init_net, ifindex);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ skb->dev = dev;
+ can_skb_set_owner(skb, sk);
+ err = can_send(skb, 1); /* send with loopback */
+ dev_put(dev);
+
+ if (err)
+ return err;
+
+ return CFSIZ + MHSIZ;
+}
+
+/*
+ * bcm_sendmsg - process BCM commands (opcodes) from the userspace
+ */
+static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct bcm_sock *bo = bcm_sk(sk);
+ int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
+ struct bcm_msg_head msg_head;
+ int ret; /* read bytes or error codes as return value */
+
+ if (!bo->bound)
+ return -ENOTCONN;
+
+ /* check for valid message length from userspace */
+ if (size < MHSIZ || (size - MHSIZ) % CFSIZ)
+ return -EINVAL;
+
+ /* check for alternative ifindex for this bcm_op */
+
+ if (!ifindex && msg->msg_name) {
+ /* no bound device as default => check msg_name */
+ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
+
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ /* ifindex from sendto() */
+ ifindex = addr->can_ifindex;
+
+ if (ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ dev_put(dev);
+ }
+ }
+
+ /* read message head information */
+
+ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
+ if (ret < 0)
+ return ret;
+
+ lock_sock(sk);
+
+ switch (msg_head.opcode) {
+
+ case TX_SETUP:
+ ret = bcm_tx_setup(&msg_head, msg, ifindex, sk);
+ break;
+
+ case RX_SETUP:
+ ret = bcm_rx_setup(&msg_head, msg, ifindex, sk);
+ break;
+
+ case TX_DELETE:
+ if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex))
+ ret = MHSIZ;
+ else
+ ret = -EINVAL;
+ break;
+
+ case RX_DELETE:
+ if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex))
+ ret = MHSIZ;
+ else
+ ret = -EINVAL;
+ break;
+
+ case TX_READ:
+ /* reuse msg_head for the reply to TX_READ */
+ msg_head.opcode = TX_STATUS;
+ ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex);
+ break;
+
+ case RX_READ:
+ /* reuse msg_head for the reply to RX_READ */
+ msg_head.opcode = RX_STATUS;
+ ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex);
+ break;
+
+ case TX_SEND:
+ /* we need exactly one can_frame behind the msg head */
+ if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ))
+ ret = -EINVAL;
+ else
+ ret = bcm_tx_send(msg, ifindex, sk);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ release_sock(sk);
+
+ return ret;
+}
+
+/*
+ * notification handler for netdevice status changes
+ */
+static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier);
+ struct sock *sk = &bo->sk;
+ struct bcm_op *op;
+ int notify_enodev = 0;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ switch (msg) {
+
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+
+ /* remove device specific receive entries */
+ list_for_each_entry(op, &bo->rx_ops, list)
+ if (op->rx_reg_dev == dev)
+ bcm_rx_unreg(dev, op);
+
+ /* remove device reference, if this is our bound device */
+ if (bo->bound && bo->ifindex == dev->ifindex) {
+ bo->bound = 0;
+ bo->ifindex = 0;
+ notify_enodev = 1;
+ }
+
+ release_sock(sk);
+
+ if (notify_enodev) {
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+ break;
+
+ case NETDEV_DOWN:
+ if (bo->bound && bo->ifindex == dev->ifindex) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * initial settings for all BCM sockets to be set at socket creation time
+ */
+static int bcm_init(struct sock *sk)
+{
+ struct bcm_sock *bo = bcm_sk(sk);
+
+ bo->bound = 0;
+ bo->ifindex = 0;
+ bo->dropped_usr_msgs = 0;
+ bo->bcm_proc_read = NULL;
+
+ INIT_LIST_HEAD(&bo->tx_ops);
+ INIT_LIST_HEAD(&bo->rx_ops);
+
+ /* set notifier */
+ bo->notifier.notifier_call = bcm_notifier;
+
+ register_netdevice_notifier(&bo->notifier);
+
+ return 0;
+}
+
+/*
+ * standard socket functions
+ */
+static int bcm_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct bcm_sock *bo;
+ struct bcm_op *op, *next;
+
+ if (sk == NULL)
+ return 0;
+
+ bo = bcm_sk(sk);
+
+ /* remove bcm_ops, timer, rx_unregister(), etc. */
+
+ unregister_netdevice_notifier(&bo->notifier);
+
+ lock_sock(sk);
+
+ list_for_each_entry_safe(op, next, &bo->tx_ops, list)
+ bcm_remove_op(op);
+
+ list_for_each_entry_safe(op, next, &bo->rx_ops, list) {
+ /*
+ * Don't care if we're bound or not (due to netdev problems)
+ * can_rx_unregister() is always a save thing to do here.
+ */
+ if (op->ifindex) {
+ /*
+ * Only remove subscriptions that had not
+ * been removed due to NETDEV_UNREGISTER
+ * in bcm_notifier()
+ */
+ if (op->rx_reg_dev) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, op->ifindex);
+ if (dev) {
+ bcm_rx_unreg(dev, op);
+ dev_put(dev);
+ }
+ }
+ } else
+ can_rx_unregister(NULL, op->can_id,
+ REGMASK(op->can_id),
+ bcm_rx_handler, op);
+
+ bcm_remove_op(op);
+ }
+
+ /* remove procfs entry */
+ if (proc_dir && bo->bcm_proc_read)
+ remove_proc_entry(bo->procname, proc_dir);
+
+ /* remove device reference */
+ if (bo->bound) {
+ bo->bound = 0;
+ bo->ifindex = 0;
+ }
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
+ int flags)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct bcm_sock *bo = bcm_sk(sk);
+
+ if (len < sizeof(*addr))
+ return -EINVAL;
+
+ if (bo->bound)
+ return -EISCONN;
+
+ /* bind a device to this socket */
+ if (addr->can_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, addr->can_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ bo->ifindex = dev->ifindex;
+ dev_put(dev);
+
+ } else {
+ /* no interface reference for ifindex = 0 ('any' CAN device) */
+ bo->ifindex = 0;
+ }
+
+ bo->bound = 1;
+
+ if (proc_dir) {
+ /* unique socket address as filename */
+ sprintf(bo->procname, "%lu", sock_i_ino(sk));
+ bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
+ proc_dir,
+ &bcm_proc_fops, sk);
+ }
+
+ return 0;
+}
+
+static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int error = 0;
+ int noblock;
+ int err;
+
+ noblock = flags & MSG_DONTWAIT;
+ flags &= ~MSG_DONTWAIT;
+ skb = skb_recv_datagram(sk, flags, noblock, &error);
+ if (!skb)
+ return error;
+
+ if (skb->len < size)
+ size = skb->len;
+
+ err = memcpy_to_msg(msg, skb->data, size);
+ if (err < 0) {
+ skb_free_datagram(sk, skb);
+ return err;
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (msg->msg_name) {
+ __sockaddr_check_size(sizeof(struct sockaddr_can));
+ msg->msg_namelen = sizeof(struct sockaddr_can);
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static const struct proto_ops bcm_ops = {
+ .family = PF_CAN,
+ .release = bcm_release,
+ .bind = sock_no_bind,
+ .connect = bcm_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = bcm_sendmsg,
+ .recvmsg = bcm_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto bcm_proto __read_mostly = {
+ .name = "CAN_BCM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct bcm_sock),
+ .init = bcm_init,
+};
+
+static const struct can_proto bcm_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_BCM,
+ .ops = &bcm_ops,
+ .prot = &bcm_proto,
+};
+
+static int __init bcm_module_init(void)
+{
+ int err;
+
+ pr_info("can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n");
+
+ err = can_proto_register(&bcm_can_proto);
+ if (err < 0) {
+ printk(KERN_ERR "can: registration of bcm protocol failed\n");
+ return err;
+ }
+
+ /* create /proc/net/can-bcm directory */
+ proc_dir = proc_mkdir("can-bcm", init_net.proc_net);
+ return 0;
+}
+
+static void __exit bcm_module_exit(void)
+{
+ can_proto_unregister(&bcm_can_proto);
+
+ if (proc_dir)
+ remove_proc_entry("can-bcm", init_net.proc_net);
+}
+
+module_init(bcm_module_init);
+module_exit(bcm_module_exit);
diff --git a/net/can/gw.c b/net/can/gw.c
new file mode 100644
index 000000000..a6f448e18
--- /dev/null
+++ b/net/can/gw.c
@@ -0,0 +1,998 @@
+/*
+ * gw.c - CAN frame Gateway/Router/Bridge with netlink interface
+ *
+ * Copyright (c) 2011 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/gw.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#define CAN_GW_VERSION "20130117"
+#define CAN_GW_NAME "can-gw"
+
+MODULE_DESCRIPTION("PF_CAN netlink gateway");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+MODULE_ALIAS(CAN_GW_NAME);
+
+#define CGW_MIN_HOPS 1
+#define CGW_MAX_HOPS 6
+#define CGW_DEFAULT_HOPS 1
+
+static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS;
+module_param(max_hops, uint, S_IRUGO);
+MODULE_PARM_DESC(max_hops,
+ "maximum " CAN_GW_NAME " routing hops for CAN frames "
+ "(valid values: " __stringify(CGW_MIN_HOPS) "-"
+ __stringify(CGW_MAX_HOPS) " hops, "
+ "default: " __stringify(CGW_DEFAULT_HOPS) ")");
+
+static HLIST_HEAD(cgw_list);
+static struct notifier_block notifier;
+
+static struct kmem_cache *cgw_cache __read_mostly;
+
+/* structure that contains the (on-the-fly) CAN frame modifications */
+struct cf_mod {
+ struct {
+ struct can_frame and;
+ struct can_frame or;
+ struct can_frame xor;
+ struct can_frame set;
+ } modframe;
+ struct {
+ u8 and;
+ u8 or;
+ u8 xor;
+ u8 set;
+ } modtype;
+ void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf,
+ struct cf_mod *mod);
+
+ /* CAN frame checksum calculation after CAN frame modifications */
+ struct {
+ struct cgw_csum_xor xor;
+ struct cgw_csum_crc8 crc8;
+ } csum;
+ struct {
+ void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor);
+ void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8);
+ } csumfunc;
+};
+
+
+/*
+ * So far we just support CAN -> CAN routing and frame modifications.
+ *
+ * The internal can_can_gw structure contains data and attributes for
+ * a CAN -> CAN gateway job.
+ */
+struct can_can_gw {
+ struct can_filter filter;
+ int src_idx;
+ int dst_idx;
+};
+
+/* list entry for CAN gateways jobs */
+struct cgw_job {
+ struct hlist_node list;
+ struct rcu_head rcu;
+ u32 handled_frames;
+ u32 dropped_frames;
+ u32 deleted_frames;
+ struct cf_mod mod;
+ union {
+ /* CAN frame data source */
+ struct net_device *dev;
+ } src;
+ union {
+ /* CAN frame data destination */
+ struct net_device *dev;
+ } dst;
+ union {
+ struct can_can_gw ccgw;
+ /* tbc */
+ };
+ u8 gwtype;
+ u8 limit_hops;
+ u16 flags;
+};
+
+/* modification functions that are invoked in the hot path in can_can_gw_rcv */
+
+#define MODFUNC(func, op) static void func(struct can_frame *cf, \
+ struct cf_mod *mod) { op ; }
+
+MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id)
+MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc)
+MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data)
+MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id)
+MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc)
+MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data)
+MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id)
+MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc)
+MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data)
+MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id)
+MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc)
+MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data)
+
+static inline void canframecpy(struct can_frame *dst, struct can_frame *src)
+{
+ /*
+ * Copy the struct members separately to ensure that no uninitialized
+ * data are copied in the 3 bytes hole of the struct. This is needed
+ * to make easy compares of the data in the struct cf_mod.
+ */
+
+ dst->can_id = src->can_id;
+ dst->can_dlc = src->can_dlc;
+ *(u64 *)dst->data = *(u64 *)src->data;
+}
+
+static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re)
+{
+ /*
+ * absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
+ * relative to received dlc -1 .. -8 :
+ * e.g. for received dlc = 8
+ * -1 => index = 7 (data[7])
+ * -3 => index = 5 (data[5])
+ * -8 => index = 0 (data[0])
+ */
+
+ if (fr > -9 && fr < 8 &&
+ to > -9 && to < 8 &&
+ re > -9 && re < 8)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+static inline int calc_idx(int idx, int rx_dlc)
+{
+ if (idx < 0)
+ return rx_dlc + idx;
+ else
+ return idx;
+}
+
+static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor)
+{
+ int from = calc_idx(xor->from_idx, cf->can_dlc);
+ int to = calc_idx(xor->to_idx, cf->can_dlc);
+ int res = calc_idx(xor->result_idx, cf->can_dlc);
+ u8 val = xor->init_xor_val;
+ int i;
+
+ if (from < 0 || to < 0 || res < 0)
+ return;
+
+ if (from <= to) {
+ for (i = from; i <= to; i++)
+ val ^= cf->data[i];
+ } else {
+ for (i = from; i >= to; i--)
+ val ^= cf->data[i];
+ }
+
+ cf->data[res] = val;
+}
+
+static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor)
+{
+ u8 val = xor->init_xor_val;
+ int i;
+
+ for (i = xor->from_idx; i <= xor->to_idx; i++)
+ val ^= cf->data[i];
+
+ cf->data[xor->result_idx] = val;
+}
+
+static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor)
+{
+ u8 val = xor->init_xor_val;
+ int i;
+
+ for (i = xor->from_idx; i >= xor->to_idx; i--)
+ val ^= cf->data[i];
+
+ cf->data[xor->result_idx] = val;
+}
+
+static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+{
+ int from = calc_idx(crc8->from_idx, cf->can_dlc);
+ int to = calc_idx(crc8->to_idx, cf->can_dlc);
+ int res = calc_idx(crc8->result_idx, cf->can_dlc);
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ if (from < 0 || to < 0 || res < 0)
+ return;
+
+ if (from <= to) {
+ for (i = crc8->from_idx; i <= crc8->to_idx; i++)
+ crc = crc8->crctab[crc^cf->data[i]];
+ } else {
+ for (i = crc8->from_idx; i >= crc8->to_idx; i--)
+ crc = crc8->crctab[crc^cf->data[i]];
+ }
+
+ switch (crc8->profile) {
+
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc^crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+
+ }
+
+ cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+}
+
+static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+{
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ for (i = crc8->from_idx; i <= crc8->to_idx; i++)
+ crc = crc8->crctab[crc^cf->data[i]];
+
+ switch (crc8->profile) {
+
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc^crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+ }
+
+ cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+}
+
+static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+{
+ u8 crc = crc8->init_crc_val;
+ int i;
+
+ for (i = crc8->from_idx; i >= crc8->to_idx; i--)
+ crc = crc8->crctab[crc^cf->data[i]];
+
+ switch (crc8->profile) {
+
+ case CGW_CRC8PRF_1U8:
+ crc = crc8->crctab[crc^crc8->profile_data[0]];
+ break;
+
+ case CGW_CRC8PRF_16U8:
+ crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ break;
+
+ case CGW_CRC8PRF_SFFID_XOR:
+ crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ (cf->can_id >> 8 & 0xFF)];
+ break;
+ }
+
+ cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+}
+
+/* the receive & process & send function */
+static void can_can_gw_rcv(struct sk_buff *skb, void *data)
+{
+ struct cgw_job *gwj = (struct cgw_job *)data;
+ struct can_frame *cf;
+ struct sk_buff *nskb;
+ int modidx = 0;
+
+ /*
+ * Do not handle CAN frames routed more than 'max_hops' times.
+ * In general we should never catch this delimiter which is intended
+ * to cover a misconfiguration protection (e.g. circular CAN routes).
+ *
+ * The Controller Area Network controllers only accept CAN frames with
+ * correct CRCs - which are not visible in the controller registers.
+ * According to skbuff.h documentation the csum_start element for IP
+ * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY.
+ * Only CAN skbs can be processed here which already have this property.
+ */
+
+#define cgw_hops(skb) ((skb)->csum_start)
+
+ BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY);
+
+ if (cgw_hops(skb) >= max_hops) {
+ /* indicate deleted frames due to misconfiguration */
+ gwj->deleted_frames++;
+ return;
+ }
+
+ if (!(gwj->dst.dev->flags & IFF_UP)) {
+ gwj->dropped_frames++;
+ return;
+ }
+
+ /* is sending the skb back to the incoming interface not allowed? */
+ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) &&
+ can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
+ return;
+
+ /*
+ * clone the given skb, which has not been done in can_rcv()
+ *
+ * When there is at least one modification function activated,
+ * we need to copy the skb as we want to modify skb->data.
+ */
+ if (gwj->mod.modfunc[0])
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ else
+ nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (!nskb) {
+ gwj->dropped_frames++;
+ return;
+ }
+
+ /* put the incremented hop counter in the cloned skb */
+ cgw_hops(nskb) = cgw_hops(skb) + 1;
+
+ /* first processing of this CAN frame -> adjust to private hop limit */
+ if (gwj->limit_hops && cgw_hops(nskb) == 1)
+ cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;
+
+ nskb->dev = gwj->dst.dev;
+
+ /* pointer to modifiable CAN frame */
+ cf = (struct can_frame *)nskb->data;
+
+ /* perform preprocessed modification functions if there are any */
+ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx])
+ (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod);
+
+ /* check for checksum updates when the CAN frame has been modified */
+ if (modidx) {
+ if (gwj->mod.csumfunc.crc8)
+ (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);
+
+ if (gwj->mod.csumfunc.xor)
+ (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor);
+ }
+
+ /* clear the skb timestamp if not configured the other way */
+ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
+ nskb->tstamp.tv64 = 0;
+
+ /* send to netdevice */
+ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
+ gwj->dropped_frames++;
+ else
+ gwj->handled_frames++;
+}
+
+static inline int cgw_register_filter(struct cgw_job *gwj)
+{
+ return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id,
+ gwj->ccgw.filter.can_mask, can_can_gw_rcv,
+ gwj, "gw");
+}
+
+static inline void cgw_unregister_filter(struct cgw_job *gwj)
+{
+ can_rx_unregister(gwj->src.dev, gwj->ccgw.filter.can_id,
+ gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj);
+}
+
+static int cgw_notifier(struct notifier_block *nb,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ if (msg == NETDEV_UNREGISTER) {
+
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) {
+
+ if (gwj->src.dev == dev || gwj->dst.dev == dev) {
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(gwj);
+ kmem_cache_free(cgw_cache, gwj);
+ }
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
+ u32 pid, u32 seq, int flags)
+{
+ struct cgw_frame_mod mb;
+ struct rtcanmsg *rtcan;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtcan = nlmsg_data(nlh);
+ rtcan->can_family = AF_CAN;
+ rtcan->gwtype = gwj->gwtype;
+ rtcan->flags = gwj->flags;
+
+ /* add statistics if available */
+
+ if (gwj->handled_frames) {
+ if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0)
+ goto cancel;
+ }
+
+ if (gwj->dropped_frames) {
+ if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0)
+ goto cancel;
+ }
+
+ if (gwj->deleted_frames) {
+ if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0)
+ goto cancel;
+ }
+
+ /* check non default settings of attributes */
+
+ if (gwj->limit_hops) {
+ if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.and) {
+ memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.and;
+ if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.or) {
+ memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.or;
+ if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.xor) {
+ memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.xor;
+ if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.set) {
+ memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.set;
+ if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.csumfunc.crc8) {
+ if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
+ &gwj->mod.csum.crc8) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.csumfunc.xor) {
+ if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN,
+ &gwj->mod.csum.xor) < 0)
+ goto cancel;
+ }
+
+ if (gwj->gwtype == CGW_TYPE_CAN_CAN) {
+
+ if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) {
+ if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter),
+ &gwj->ccgw.filter) < 0)
+ goto cancel;
+ }
+
+ if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0)
+ goto cancel;
+
+ if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0)
+ goto cancel;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */
+static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cgw_job *gwj = NULL;
+ int idx = 0;
+ int s_idx = cb->args[0];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(gwj, &cgw_list, list) {
+ if (idx < s_idx)
+ goto cont;
+
+ if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
+ break;
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static const struct nla_policy cgw_policy[CGW_MAX+1] = {
+ [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) },
+ [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) },
+ [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) },
+ [CGW_SRC_IF] = { .type = NLA_U32 },
+ [CGW_DST_IF] = { .type = NLA_U32 },
+ [CGW_FILTER] = { .len = sizeof(struct can_filter) },
+ [CGW_LIM_HOPS] = { .type = NLA_U8 },
+};
+
+/* check for common and gwtype specific attributes */
+static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
+ u8 gwtype, void *gwtypeattr, u8 *limhops)
+{
+ struct nlattr *tb[CGW_MAX+1];
+ struct cgw_frame_mod mb;
+ int modidx = 0;
+ int err = 0;
+
+ /* initialize modification & checksum data space */
+ memset(mod, 0, sizeof(*mod));
+
+ err = nlmsg_parse(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX,
+ cgw_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[CGW_LIM_HOPS]) {
+ *limhops = nla_get_u8(tb[CGW_LIM_HOPS]);
+
+ if (*limhops < 1 || *limhops > max_hops)
+ return -EINVAL;
+ }
+
+ /* check for AND/OR/XOR/SET modifications */
+
+ if (tb[CGW_MOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_and_dlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_data;
+ }
+
+ if (tb[CGW_MOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_or_dlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_data;
+ }
+
+ if (tb[CGW_MOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_xor_dlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_data;
+ }
+
+ if (tb[CGW_MOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_set_dlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_data;
+ }
+
+ /* check for checksum operations after CAN frame modifications */
+ if (modidx) {
+
+ if (tb[CGW_CS_CRC8]) {
+ struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]);
+
+ err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
+ c->result_idx);
+ if (err)
+ return err;
+
+ nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8],
+ CGW_CS_CRC8_LEN);
+
+ /*
+ * select dedicated processing function to reduce
+ * runtime operations in receive hot path.
+ */
+ if (c->from_idx < 0 || c->to_idx < 0 ||
+ c->result_idx < 0)
+ mod->csumfunc.crc8 = cgw_csum_crc8_rel;
+ else if (c->from_idx <= c->to_idx)
+ mod->csumfunc.crc8 = cgw_csum_crc8_pos;
+ else
+ mod->csumfunc.crc8 = cgw_csum_crc8_neg;
+ }
+
+ if (tb[CGW_CS_XOR]) {
+ struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]);
+
+ err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
+ c->result_idx);
+ if (err)
+ return err;
+
+ nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR],
+ CGW_CS_XOR_LEN);
+
+ /*
+ * select dedicated processing function to reduce
+ * runtime operations in receive hot path.
+ */
+ if (c->from_idx < 0 || c->to_idx < 0 ||
+ c->result_idx < 0)
+ mod->csumfunc.xor = cgw_csum_xor_rel;
+ else if (c->from_idx <= c->to_idx)
+ mod->csumfunc.xor = cgw_csum_xor_pos;
+ else
+ mod->csumfunc.xor = cgw_csum_xor_neg;
+ }
+ }
+
+ if (gwtype == CGW_TYPE_CAN_CAN) {
+
+ /* check CGW_TYPE_CAN_CAN specific attributes */
+
+ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr;
+ memset(ccgw, 0, sizeof(*ccgw));
+
+ /* check for can_filter in attributes */
+ if (tb[CGW_FILTER])
+ nla_memcpy(&ccgw->filter, tb[CGW_FILTER],
+ sizeof(struct can_filter));
+
+ err = -ENODEV;
+
+ /* specifying two interfaces is mandatory */
+ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF])
+ return err;
+
+ ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]);
+ ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]);
+
+ /* both indices set to 0 for flushing all routing entries */
+ if (!ccgw->src_idx && !ccgw->dst_idx)
+ return 0;
+
+ /* only one index set to 0 is an error */
+ if (!ccgw->src_idx || !ccgw->dst_idx)
+ return err;
+ }
+
+ /* add the checks for other gwtypes here */
+
+ return 0;
+}
+
+static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct rtcanmsg *r;
+ struct cgw_job *gwj;
+ u8 limhops = 0;
+ int err = 0;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (nlmsg_len(nlh) < sizeof(*r))
+ return -EINVAL;
+
+ r = nlmsg_data(nlh);
+ if (r->can_family != AF_CAN)
+ return -EPFNOSUPPORT;
+
+ /* so far we only support CAN -> CAN routings */
+ if (r->gwtype != CGW_TYPE_CAN_CAN)
+ return -EINVAL;
+
+ gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
+ if (!gwj)
+ return -ENOMEM;
+
+ gwj->handled_frames = 0;
+ gwj->dropped_frames = 0;
+ gwj->deleted_frames = 0;
+ gwj->flags = r->flags;
+ gwj->gwtype = r->gwtype;
+
+ err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw,
+ &limhops);
+ if (err < 0)
+ goto out;
+
+ err = -ENODEV;
+
+ /* ifindex == 0 is not allowed for job creation */
+ if (!gwj->ccgw.src_idx || !gwj->ccgw.dst_idx)
+ goto out;
+
+ gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx);
+
+ if (!gwj->src.dev)
+ goto out;
+
+ if (gwj->src.dev->type != ARPHRD_CAN)
+ goto out;
+
+ gwj->dst.dev = __dev_get_by_index(&init_net, gwj->ccgw.dst_idx);
+
+ if (!gwj->dst.dev)
+ goto out;
+
+ if (gwj->dst.dev->type != ARPHRD_CAN)
+ goto out;
+
+ gwj->limit_hops = limhops;
+
+ ASSERT_RTNL();
+
+ err = cgw_register_filter(gwj);
+ if (!err)
+ hlist_add_head_rcu(&gwj->list, &cgw_list);
+out:
+ if (err)
+ kmem_cache_free(cgw_cache, gwj);
+
+ return err;
+}
+
+static void cgw_remove_all_jobs(void)
+{
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+
+ ASSERT_RTNL();
+
+ hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) {
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(gwj);
+ kmem_cache_free(cgw_cache, gwj);
+ }
+}
+
+static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct cgw_job *gwj = NULL;
+ struct hlist_node *nx;
+ struct rtcanmsg *r;
+ struct cf_mod mod;
+ struct can_can_gw ccgw;
+ u8 limhops = 0;
+ int err = 0;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (nlmsg_len(nlh) < sizeof(*r))
+ return -EINVAL;
+
+ r = nlmsg_data(nlh);
+ if (r->can_family != AF_CAN)
+ return -EPFNOSUPPORT;
+
+ /* so far we only support CAN -> CAN routings */
+ if (r->gwtype != CGW_TYPE_CAN_CAN)
+ return -EINVAL;
+
+ err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+ if (err < 0)
+ return err;
+
+ /* two interface indices both set to 0 => remove all entries */
+ if (!ccgw.src_idx && !ccgw.dst_idx) {
+ cgw_remove_all_jobs();
+ return 0;
+ }
+
+ err = -EINVAL;
+
+ ASSERT_RTNL();
+
+ /* remove only the first matching entry */
+ hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) {
+
+ if (gwj->flags != r->flags)
+ continue;
+
+ if (gwj->limit_hops != limhops)
+ continue;
+
+ if (memcmp(&gwj->mod, &mod, sizeof(mod)))
+ continue;
+
+ /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */
+ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
+ continue;
+
+ hlist_del(&gwj->list);
+ cgw_unregister_filter(gwj);
+ kmem_cache_free(cgw_cache, gwj);
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static __init int cgw_module_init(void)
+{
+ /* sanitize given module parameter */
+ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
+
+ pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n",
+ max_hops);
+
+ cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job),
+ 0, 0, NULL);
+
+ if (!cgw_cache)
+ return -ENOMEM;
+
+ /* set notifier */
+ notifier.notifier_call = cgw_notifier;
+ register_netdevice_notifier(&notifier);
+
+ if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, NULL)) {
+ unregister_netdevice_notifier(&notifier);
+ kmem_cache_destroy(cgw_cache);
+ return -ENOBUFS;
+ }
+
+ /* Only the first call to __rtnl_register can fail */
+ __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, NULL);
+ __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, NULL);
+
+ return 0;
+}
+
+static __exit void cgw_module_exit(void)
+{
+ rtnl_unregister_all(PF_CAN);
+
+ unregister_netdevice_notifier(&notifier);
+
+ rtnl_lock();
+ cgw_remove_all_jobs();
+ rtnl_unlock();
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ kmem_cache_destroy(cgw_cache);
+}
+
+module_init(cgw_module_init);
+module_exit(cgw_module_exit);
diff --git a/net/can/proc.c b/net/can/proc.c
new file mode 100644
index 000000000..1a19b985a
--- /dev/null
+++ b/net/can/proc.c
@@ -0,0 +1,580 @@
+/*
+ * proc.c - procfs support for Protocol family CAN core module
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/can/core.h>
+
+#include "af_can.h"
+
+/*
+ * proc filenames for the PF_CAN core
+ */
+
+#define CAN_PROC_VERSION "version"
+#define CAN_PROC_STATS "stats"
+#define CAN_PROC_RESET_STATS "reset_stats"
+#define CAN_PROC_RCVLIST_ALL "rcvlist_all"
+#define CAN_PROC_RCVLIST_FIL "rcvlist_fil"
+#define CAN_PROC_RCVLIST_INV "rcvlist_inv"
+#define CAN_PROC_RCVLIST_SFF "rcvlist_sff"
+#define CAN_PROC_RCVLIST_EFF "rcvlist_eff"
+#define CAN_PROC_RCVLIST_ERR "rcvlist_err"
+
+static struct proc_dir_entry *can_dir;
+static struct proc_dir_entry *pde_version;
+static struct proc_dir_entry *pde_stats;
+static struct proc_dir_entry *pde_reset_stats;
+static struct proc_dir_entry *pde_rcvlist_all;
+static struct proc_dir_entry *pde_rcvlist_fil;
+static struct proc_dir_entry *pde_rcvlist_inv;
+static struct proc_dir_entry *pde_rcvlist_sff;
+static struct proc_dir_entry *pde_rcvlist_eff;
+static struct proc_dir_entry *pde_rcvlist_err;
+
+static int user_reset;
+
+static const char rx_list_name[][8] = {
+ [RX_ERR] = "rx_err",
+ [RX_ALL] = "rx_all",
+ [RX_FIL] = "rx_fil",
+ [RX_INV] = "rx_inv",
+};
+
+/*
+ * af_can statistics stuff
+ */
+
+static void can_init_stats(void)
+{
+ /*
+ * This memset function is called from a timer context (when
+ * can_stattimer is active which is the default) OR in a process
+ * context (reading the proc_fs when can_stattimer is disabled).
+ */
+ memset(&can_stats, 0, sizeof(can_stats));
+ can_stats.jiffies_init = jiffies;
+
+ can_pstats.stats_reset++;
+
+ if (user_reset) {
+ user_reset = 0;
+ can_pstats.user_reset++;
+ }
+}
+
+static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
+ unsigned long count)
+{
+ unsigned long rate;
+
+ if (oldjif == newjif)
+ return 0;
+
+ /* see can_stat_update() - this should NEVER happen! */
+ if (count > (ULONG_MAX / HZ)) {
+ printk(KERN_ERR "can: calc_rate: count exceeded! %ld\n",
+ count);
+ return 99999999;
+ }
+
+ rate = (count * HZ) / (newjif - oldjif);
+
+ return rate;
+}
+
+void can_stat_update(unsigned long data)
+{
+ unsigned long j = jiffies; /* snapshot */
+
+ /* restart counting in timer context on user request */
+ if (user_reset)
+ can_init_stats();
+
+ /* restart counting on jiffies overflow */
+ if (j < can_stats.jiffies_init)
+ can_init_stats();
+
+ /* prevent overflow in calc_rate() */
+ if (can_stats.rx_frames > (ULONG_MAX / HZ))
+ can_init_stats();
+
+ /* prevent overflow in calc_rate() */
+ if (can_stats.tx_frames > (ULONG_MAX / HZ))
+ can_init_stats();
+
+ /* matches overflow - very improbable */
+ if (can_stats.matches > (ULONG_MAX / 100))
+ can_init_stats();
+
+ /* calc total values */
+ if (can_stats.rx_frames)
+ can_stats.total_rx_match_ratio = (can_stats.matches * 100) /
+ can_stats.rx_frames;
+
+ can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j,
+ can_stats.tx_frames);
+ can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j,
+ can_stats.rx_frames);
+
+ /* calc current values */
+ if (can_stats.rx_frames_delta)
+ can_stats.current_rx_match_ratio =
+ (can_stats.matches_delta * 100) /
+ can_stats.rx_frames_delta;
+
+ can_stats.current_tx_rate = calc_rate(0, HZ, can_stats.tx_frames_delta);
+ can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta);
+
+ /* check / update maximum values */
+ if (can_stats.max_tx_rate < can_stats.current_tx_rate)
+ can_stats.max_tx_rate = can_stats.current_tx_rate;
+
+ if (can_stats.max_rx_rate < can_stats.current_rx_rate)
+ can_stats.max_rx_rate = can_stats.current_rx_rate;
+
+ if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio)
+ can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio;
+
+ /* clear values for 'current rate' calculation */
+ can_stats.tx_frames_delta = 0;
+ can_stats.rx_frames_delta = 0;
+ can_stats.matches_delta = 0;
+
+ /* restart timer (one second) */
+ mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+}
+
+/*
+ * proc read functions
+ */
+
+static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list,
+ struct net_device *dev)
+{
+ struct receiver *r;
+
+ hlist_for_each_entry_rcu(r, rx_list, list) {
+ char *fmt = (r->can_id & CAN_EFF_FLAG)?
+ " %-5s %08x %08x %pK %pK %8ld %s\n" :
+ " %-5s %03x %08x %pK %pK %8ld %s\n";
+
+ seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask,
+ r->func, r->data, r->matches, r->ident);
+ }
+}
+
+static void can_print_recv_banner(struct seq_file *m)
+{
+ /*
+ * can1. 00000000 00000000 00000000
+ * ....... 0 tp20
+ */
+ seq_puts(m, " device can_id can_mask function"
+ " userdata matches ident\n");
+}
+
+static int can_stats_proc_show(struct seq_file *m, void *v)
+{
+ seq_putc(m, '\n');
+ seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats.tx_frames);
+ seq_printf(m, " %8ld received frames (RXF)\n", can_stats.rx_frames);
+ seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats.matches);
+
+ seq_putc(m, '\n');
+
+ if (can_stattimer.function == can_stat_update) {
+ seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
+ can_stats.total_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s total tx rate (TXR)\n",
+ can_stats.total_tx_rate);
+ seq_printf(m, " %8ld frames/s total rx rate (RXR)\n",
+ can_stats.total_rx_rate);
+
+ seq_putc(m, '\n');
+
+ seq_printf(m, " %8ld %% current match ratio (CRXMR)\n",
+ can_stats.current_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n",
+ can_stats.current_tx_rate);
+ seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n",
+ can_stats.current_rx_rate);
+
+ seq_putc(m, '\n');
+
+ seq_printf(m, " %8ld %% max match ratio (MRXMR)\n",
+ can_stats.max_rx_match_ratio);
+
+ seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n",
+ can_stats.max_tx_rate);
+ seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n",
+ can_stats.max_rx_rate);
+
+ seq_putc(m, '\n');
+ }
+
+ seq_printf(m, " %8ld current receive list entries (CRCV)\n",
+ can_pstats.rcv_entries);
+ seq_printf(m, " %8ld maximum receive list entries (MRCV)\n",
+ can_pstats.rcv_entries_max);
+
+ if (can_pstats.stats_reset)
+ seq_printf(m, "\n %8ld statistic resets (STR)\n",
+ can_pstats.stats_reset);
+
+ if (can_pstats.user_reset)
+ seq_printf(m, " %8ld user statistic resets (USTR)\n",
+ can_pstats.user_reset);
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_stats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_stats_proc_show, NULL);
+}
+
+static const struct file_operations can_stats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_stats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int can_reset_stats_proc_show(struct seq_file *m, void *v)
+{
+ user_reset = 1;
+
+ if (can_stattimer.function == can_stat_update) {
+ seq_printf(m, "Scheduled statistic reset #%ld.\n",
+ can_pstats.stats_reset + 1);
+
+ } else {
+ if (can_stats.jiffies_init != jiffies)
+ can_init_stats();
+
+ seq_printf(m, "Performed statistic reset #%ld.\n",
+ can_pstats.stats_reset);
+ }
+ return 0;
+}
+
+static int can_reset_stats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_reset_stats_proc_show, NULL);
+}
+
+static const struct file_operations can_reset_stats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_reset_stats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int can_version_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%s\n", CAN_VERSION_STRING);
+ return 0;
+}
+
+static int can_version_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_version_proc_show, NULL);
+}
+
+static const struct file_operations can_version_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_version_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
+ struct net_device *dev,
+ struct dev_rcv_lists *d)
+{
+ if (!hlist_empty(&d->rx[idx])) {
+ can_print_recv_banner(m);
+ can_print_rcvlist(m, &d->rx[idx], dev);
+ } else
+ seq_printf(m, " (%s: no entry)\n", DNAME(dev));
+
+}
+
+static int can_rcvlist_proc_show(struct seq_file *m, void *v)
+{
+ /* double cast to prevent GCC warning */
+ int idx = (int)(long)m->private;
+ struct net_device *dev;
+ struct dev_rcv_lists *d;
+
+ seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
+
+ rcu_read_lock();
+
+ /* receive list for 'all' CAN devices (dev == NULL) */
+ d = &can_rx_alldev_list;
+ can_rcvlist_proc_show_one(m, idx, NULL, d);
+
+ /* receive list for registered CAN devices */
+ for_each_netdev_rcu(&init_net, dev) {
+ if (dev->type == ARPHRD_CAN && dev->ml_priv)
+ can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv);
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_rcvlist_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_rcvlist_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations can_rcvlist_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_rcvlist_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static inline void can_rcvlist_proc_show_array(struct seq_file *m,
+ struct net_device *dev,
+ struct hlist_head *rcv_array,
+ unsigned int rcv_array_sz)
+{
+ unsigned int i;
+ int all_empty = 1;
+
+ /* check whether at least one list is non-empty */
+ for (i = 0; i < rcv_array_sz; i++)
+ if (!hlist_empty(&rcv_array[i])) {
+ all_empty = 0;
+ break;
+ }
+
+ if (!all_empty) {
+ can_print_recv_banner(m);
+ for (i = 0; i < rcv_array_sz; i++) {
+ if (!hlist_empty(&rcv_array[i]))
+ can_print_rcvlist(m, &rcv_array[i], dev);
+ }
+ } else
+ seq_printf(m, " (%s: no entry)\n", DNAME(dev));
+}
+
+static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
+{
+ struct net_device *dev;
+ struct dev_rcv_lists *d;
+
+ /* RX_SFF */
+ seq_puts(m, "\nreceive list 'rx_sff':\n");
+
+ rcu_read_lock();
+
+ /* sff receive list for 'all' CAN devices (dev == NULL) */
+ d = &can_rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff));
+
+ /* sff receive list for registered CAN devices */
+ for_each_netdev_rcu(&init_net, dev) {
+ if (dev->type == ARPHRD_CAN && dev->ml_priv) {
+ d = dev->ml_priv;
+ can_rcvlist_proc_show_array(m, dev, d->rx_sff,
+ ARRAY_SIZE(d->rx_sff));
+ }
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_rcvlist_sff_proc_show, NULL);
+}
+
+static const struct file_operations can_rcvlist_sff_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_rcvlist_sff_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+
+static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
+{
+ struct net_device *dev;
+ struct dev_rcv_lists *d;
+
+ /* RX_EFF */
+ seq_puts(m, "\nreceive list 'rx_eff':\n");
+
+ rcu_read_lock();
+
+ /* eff receive list for 'all' CAN devices (dev == NULL) */
+ d = &can_rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff));
+
+ /* eff receive list for registered CAN devices */
+ for_each_netdev_rcu(&init_net, dev) {
+ if (dev->type == ARPHRD_CAN && dev->ml_priv) {
+ d = dev->ml_priv;
+ can_rcvlist_proc_show_array(m, dev, d->rx_eff,
+ ARRAY_SIZE(d->rx_eff));
+ }
+ }
+
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, can_rcvlist_eff_proc_show, NULL);
+}
+
+static const struct file_operations can_rcvlist_eff_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = can_rcvlist_eff_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * proc utility functions
+ */
+
+static void can_remove_proc_readentry(const char *name)
+{
+ if (can_dir)
+ remove_proc_entry(name, can_dir);
+}
+
+/*
+ * can_init_proc - create main CAN proc directory and procfs entries
+ */
+void can_init_proc(void)
+{
+ /* create /proc/net/can directory */
+ can_dir = proc_mkdir("can", init_net.proc_net);
+
+ if (!can_dir) {
+ printk(KERN_INFO "can: failed to create /proc/net/can . "
+ "CONFIG_PROC_FS missing?\n");
+ return;
+ }
+
+ /* own procfs entries from the AF_CAN core */
+ pde_version = proc_create(CAN_PROC_VERSION, 0644, can_dir,
+ &can_version_proc_fops);
+ pde_stats = proc_create(CAN_PROC_STATS, 0644, can_dir,
+ &can_stats_proc_fops);
+ pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, can_dir,
+ &can_reset_stats_proc_fops);
+ pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, can_dir,
+ &can_rcvlist_proc_fops, (void *)RX_ERR);
+ pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, can_dir,
+ &can_rcvlist_proc_fops, (void *)RX_ALL);
+ pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, can_dir,
+ &can_rcvlist_proc_fops, (void *)RX_FIL);
+ pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir,
+ &can_rcvlist_proc_fops, (void *)RX_INV);
+ pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, can_dir,
+ &can_rcvlist_eff_proc_fops);
+ pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir,
+ &can_rcvlist_sff_proc_fops);
+}
+
+/*
+ * can_remove_proc - remove procfs entries and main CAN proc directory
+ */
+void can_remove_proc(void)
+{
+ if (pde_version)
+ can_remove_proc_readentry(CAN_PROC_VERSION);
+
+ if (pde_stats)
+ can_remove_proc_readentry(CAN_PROC_STATS);
+
+ if (pde_reset_stats)
+ can_remove_proc_readentry(CAN_PROC_RESET_STATS);
+
+ if (pde_rcvlist_err)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR);
+
+ if (pde_rcvlist_all)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL);
+
+ if (pde_rcvlist_fil)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL);
+
+ if (pde_rcvlist_inv)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_INV);
+
+ if (pde_rcvlist_eff)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF);
+
+ if (pde_rcvlist_sff)
+ can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF);
+
+ if (can_dir)
+ remove_proc_entry("can", init_net.proc_net);
+}
diff --git a/net/can/raw.c b/net/can/raw.c
new file mode 100644
index 000000000..31b9748cb
--- /dev/null
+++ b/net/can/raw.c
@@ -0,0 +1,875 @@
+/*
+ * raw.c - Raw sockets for protocol family CAN
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/raw.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+#define CAN_RAW_VERSION CAN_VERSION
+
+MODULE_DESCRIPTION("PF_CAN raw protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
+MODULE_ALIAS("can-proto-1");
+
+#define MASK_ALL 0
+
+/*
+ * A raw socket has a list of can_filters attached to it, each receiving
+ * the CAN frames matching that filter. If the filter list is empty,
+ * no CAN frames will be received by the socket. The default after
+ * opening the socket, is to have one filter which receives all frames.
+ * The filter list is allocated dynamically with the exception of the
+ * list containing only one item. This common case is optimized by
+ * storing the single filter in dfilter, to avoid using dynamic memory.
+ */
+
+struct uniqframe {
+ ktime_t tstamp;
+ const struct sk_buff *skb;
+ unsigned int join_rx_count;
+};
+
+struct raw_sock {
+ struct sock sk;
+ int bound;
+ int ifindex;
+ struct notifier_block notifier;
+ int loopback;
+ int recv_own_msgs;
+ int fd_frames;
+ int join_filters;
+ int count; /* number of active filters */
+ struct can_filter dfilter; /* default/single filter */
+ struct can_filter *filter; /* pointer to filter(s) */
+ can_err_mask_t err_mask;
+ struct uniqframe __percpu *uniq;
+};
+
+/*
+ * Return pointer to store the extra msg flags for raw_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *raw_flags(struct sk_buff *skb)
+{
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
+ sizeof(unsigned int));
+
+ /* return pointer after struct sockaddr_can */
+ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
+static inline struct raw_sock *raw_sk(const struct sock *sk)
+{
+ return (struct raw_sock *)sk;
+}
+
+static void raw_rcv(struct sk_buff *oskb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct raw_sock *ro = raw_sk(sk);
+ struct sockaddr_can *addr;
+ struct sk_buff *skb;
+ unsigned int *pflags;
+
+ /* check the received tx sock reference */
+ if (!ro->recv_own_msgs && oskb->sk == sk)
+ return;
+
+ /* do not pass non-CAN2.0 frames to a legacy socket */
+ if (!ro->fd_frames && oskb->len != CAN_MTU)
+ return;
+
+ /* eliminate multiple filter matches for the same skb */
+ if (this_cpu_ptr(ro->uniq)->skb == oskb &&
+ ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) {
+ if (ro->join_filters) {
+ this_cpu_inc(ro->uniq->join_rx_count);
+ /* drop frame until all enabled filters matched */
+ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count)
+ return;
+ } else {
+ return;
+ }
+ } else {
+ this_cpu_ptr(ro->uniq)->skb = oskb;
+ this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp;
+ this_cpu_ptr(ro->uniq)->join_rx_count = 1;
+ /* drop first frame to check all enabled filters? */
+ if (ro->join_filters && ro->count > 1)
+ return;
+ }
+
+ /* clone the given skb to be able to enqueue it into the rcv queue */
+ skb = skb_clone(oskb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /*
+ * Put the datagram to the queue so that raw_recvmsg() can
+ * get it from there. We need to pass the interface index to
+ * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
+ * containing the interface index.
+ */
+
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can));
+ addr = (struct sockaddr_can *)skb->cb;
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = skb->dev->ifindex;
+
+ /* add CAN specific message flags for raw_recvmsg() */
+ pflags = raw_flags(skb);
+ *pflags = 0;
+ if (oskb->sk)
+ *pflags |= MSG_DONTROUTE;
+ if (oskb->sk == sk)
+ *pflags |= MSG_CONFIRM;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0)
+ kfree_skb(skb);
+}
+
+static int raw_enable_filters(struct net_device *dev, struct sock *sk,
+ struct can_filter *filter, int count)
+{
+ int err = 0;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ err = can_rx_register(dev, filter[i].can_id,
+ filter[i].can_mask,
+ raw_rcv, sk, "raw");
+ if (err) {
+ /* clean up successfully registered filters */
+ while (--i >= 0)
+ can_rx_unregister(dev, filter[i].can_id,
+ filter[i].can_mask,
+ raw_rcv, sk);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
+ can_err_mask_t err_mask)
+{
+ int err = 0;
+
+ if (err_mask)
+ err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
+ raw_rcv, sk, "raw");
+
+ return err;
+}
+
+static void raw_disable_filters(struct net_device *dev, struct sock *sk,
+ struct can_filter *filter, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask,
+ raw_rcv, sk);
+}
+
+static inline void raw_disable_errfilter(struct net_device *dev,
+ struct sock *sk,
+ can_err_mask_t err_mask)
+
+{
+ if (err_mask)
+ can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG,
+ raw_rcv, sk);
+}
+
+static inline void raw_disable_allfilters(struct net_device *dev,
+ struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+
+ raw_disable_filters(dev, sk, ro->filter, ro->count);
+ raw_disable_errfilter(dev, sk, ro->err_mask);
+}
+
+static int raw_enable_allfilters(struct net_device *dev, struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+ int err;
+
+ err = raw_enable_filters(dev, sk, ro->filter, ro->count);
+ if (!err) {
+ err = raw_enable_errfilter(dev, sk, ro->err_mask);
+ if (err)
+ raw_disable_filters(dev, sk, ro->filter, ro->count);
+ }
+
+ return err;
+}
+
+static int raw_notifier(struct notifier_block *nb,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct raw_sock *ro = container_of(nb, struct raw_sock, notifier);
+ struct sock *sk = &ro->sk;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+
+ if (ro->ifindex != dev->ifindex)
+ return NOTIFY_DONE;
+
+ switch (msg) {
+
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+ /* remove current filters & unregister */
+ if (ro->bound)
+ raw_disable_allfilters(dev, sk);
+
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ ro->ifindex = 0;
+ ro->bound = 0;
+ ro->count = 0;
+ release_sock(sk);
+
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ break;
+
+ case NETDEV_DOWN:
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_sock *ro = raw_sk(sk);
+
+ ro->bound = 0;
+ ro->ifindex = 0;
+
+ /* set default filter to single entry dfilter */
+ ro->dfilter.can_id = 0;
+ ro->dfilter.can_mask = MASK_ALL;
+ ro->filter = &ro->dfilter;
+ ro->count = 1;
+
+ /* set default loopback behaviour */
+ ro->loopback = 1;
+ ro->recv_own_msgs = 0;
+ ro->fd_frames = 0;
+ ro->join_filters = 0;
+
+ /* alloc_percpu provides zero'ed memory */
+ ro->uniq = alloc_percpu(struct uniqframe);
+ if (unlikely(!ro->uniq))
+ return -ENOMEM;
+
+ /* set notifier */
+ ro->notifier.notifier_call = raw_notifier;
+
+ register_netdevice_notifier(&ro->notifier);
+
+ return 0;
+}
+
+static int raw_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro;
+
+ if (!sk)
+ return 0;
+
+ ro = raw_sk(sk);
+
+ unregister_netdevice_notifier(&ro->notifier);
+
+ lock_sock(sk);
+
+ /* remove current filters & unregister */
+ if (ro->bound) {
+ if (ro->ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, ro->ifindex);
+ if (dev) {
+ raw_disable_allfilters(dev, sk);
+ dev_put(dev);
+ }
+ } else
+ raw_disable_allfilters(NULL, sk);
+ }
+
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ ro->ifindex = 0;
+ ro->bound = 0;
+ ro->count = 0;
+ free_percpu(ro->uniq);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ int ifindex;
+ int err = 0;
+ int notify_enetdown = 0;
+
+ if (len < sizeof(*addr))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (ro->bound && addr->can_ifindex == ro->ifindex)
+ goto out;
+
+ if (addr->can_ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, addr->can_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ err = -ENODEV;
+ goto out;
+ }
+ if (!(dev->flags & IFF_UP))
+ notify_enetdown = 1;
+
+ ifindex = dev->ifindex;
+
+ /* filters set by default/setsockopt */
+ err = raw_enable_allfilters(dev, sk);
+ dev_put(dev);
+ } else {
+ ifindex = 0;
+
+ /* filters set by default/setsockopt */
+ err = raw_enable_allfilters(NULL, sk);
+ }
+
+ if (!err) {
+ if (ro->bound) {
+ /* unregister old filters */
+ if (ro->ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(&init_net, ro->ifindex);
+ if (dev) {
+ raw_disable_allfilters(dev, sk);
+ dev_put(dev);
+ }
+ } else
+ raw_disable_allfilters(NULL, sk);
+ }
+ ro->ifindex = ifindex;
+ ro->bound = 1;
+ }
+
+ out:
+ release_sock(sk);
+
+ if (notify_enetdown) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+
+ return err;
+}
+
+static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *len, int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ memset(addr, 0, sizeof(*addr));
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = ro->ifindex;
+
+ *len = sizeof(*addr);
+
+ return 0;
+}
+
+static int raw_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ struct can_filter *filter = NULL; /* dyn. alloc'ed filters */
+ struct can_filter sfilter; /* single filter */
+ struct net_device *dev = NULL;
+ can_err_mask_t err_mask = 0;
+ int count = 0;
+ int err = 0;
+
+ if (level != SOL_CAN_RAW)
+ return -EINVAL;
+
+ switch (optname) {
+
+ case CAN_RAW_FILTER:
+ if (optlen % sizeof(struct can_filter) != 0)
+ return -EINVAL;
+
+ count = optlen / sizeof(struct can_filter);
+
+ if (count > 1) {
+ /* filter does not fit into dfilter => alloc space */
+ filter = memdup_user(optval, optlen);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+ } else if (count == 1) {
+ if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
+ return -EFAULT;
+ }
+
+ lock_sock(sk);
+
+ if (ro->bound && ro->ifindex)
+ dev = dev_get_by_index(&init_net, ro->ifindex);
+
+ if (ro->bound) {
+ /* (try to) register the new filters */
+ if (count == 1)
+ err = raw_enable_filters(dev, sk, &sfilter, 1);
+ else
+ err = raw_enable_filters(dev, sk, filter,
+ count);
+ if (err) {
+ if (count > 1)
+ kfree(filter);
+ goto out_fil;
+ }
+
+ /* remove old filter registrations */
+ raw_disable_filters(dev, sk, ro->filter, ro->count);
+ }
+
+ /* remove old filter space */
+ if (ro->count > 1)
+ kfree(ro->filter);
+
+ /* link new filters to the socket */
+ if (count == 1) {
+ /* copy filter data for single filter */
+ ro->dfilter = sfilter;
+ filter = &ro->dfilter;
+ }
+ ro->filter = filter;
+ ro->count = count;
+
+ out_fil:
+ if (dev)
+ dev_put(dev);
+
+ release_sock(sk);
+
+ break;
+
+ case CAN_RAW_ERR_FILTER:
+ if (optlen != sizeof(err_mask))
+ return -EINVAL;
+
+ if (copy_from_user(&err_mask, optval, optlen))
+ return -EFAULT;
+
+ err_mask &= CAN_ERR_MASK;
+
+ lock_sock(sk);
+
+ if (ro->bound && ro->ifindex)
+ dev = dev_get_by_index(&init_net, ro->ifindex);
+
+ /* remove current error mask */
+ if (ro->bound) {
+ /* (try to) register the new err_mask */
+ err = raw_enable_errfilter(dev, sk, err_mask);
+
+ if (err)
+ goto out_err;
+
+ /* remove old err_mask registration */
+ raw_disable_errfilter(dev, sk, ro->err_mask);
+ }
+
+ /* link new err_mask to the socket */
+ ro->err_mask = err_mask;
+
+ out_err:
+ if (dev)
+ dev_put(dev);
+
+ release_sock(sk);
+
+ break;
+
+ case CAN_RAW_LOOPBACK:
+ if (optlen != sizeof(ro->loopback))
+ return -EINVAL;
+
+ if (copy_from_user(&ro->loopback, optval, optlen))
+ return -EFAULT;
+
+ break;
+
+ case CAN_RAW_RECV_OWN_MSGS:
+ if (optlen != sizeof(ro->recv_own_msgs))
+ return -EINVAL;
+
+ if (copy_from_user(&ro->recv_own_msgs, optval, optlen))
+ return -EFAULT;
+
+ break;
+
+ case CAN_RAW_FD_FRAMES:
+ if (optlen != sizeof(ro->fd_frames))
+ return -EINVAL;
+
+ if (copy_from_user(&ro->fd_frames, optval, optlen))
+ return -EFAULT;
+
+ break;
+
+ case CAN_RAW_JOIN_FILTERS:
+ if (optlen != sizeof(ro->join_filters))
+ return -EINVAL;
+
+ if (copy_from_user(&ro->join_filters, optval, optlen))
+ return -EFAULT;
+
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static int raw_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ int len;
+ void *val;
+ int err = 0;
+
+ if (level != SOL_CAN_RAW)
+ return -EINVAL;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+
+ case CAN_RAW_FILTER:
+ lock_sock(sk);
+ if (ro->count > 0) {
+ int fsize = ro->count * sizeof(struct can_filter);
+ if (len > fsize)
+ len = fsize;
+ if (copy_to_user(optval, ro->filter, len))
+ err = -EFAULT;
+ } else
+ len = 0;
+ release_sock(sk);
+
+ if (!err)
+ err = put_user(len, optlen);
+ return err;
+
+ case CAN_RAW_ERR_FILTER:
+ if (len > sizeof(can_err_mask_t))
+ len = sizeof(can_err_mask_t);
+ val = &ro->err_mask;
+ break;
+
+ case CAN_RAW_LOOPBACK:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ val = &ro->loopback;
+ break;
+
+ case CAN_RAW_RECV_OWN_MSGS:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ val = &ro->recv_own_msgs;
+ break;
+
+ case CAN_RAW_FD_FRAMES:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ val = &ro->fd_frames;
+ break;
+
+ case CAN_RAW_JOIN_FILTERS:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ val = &ro->join_filters;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct raw_sock *ro = raw_sk(sk);
+ struct sk_buff *skb;
+ struct net_device *dev;
+ int ifindex;
+ int err;
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
+
+ if (msg->msg_namelen < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ ifindex = addr->can_ifindex;
+ } else
+ ifindex = ro->ifindex;
+
+ if (ro->fd_frames) {
+ if (unlikely(size != CANFD_MTU && size != CAN_MTU))
+ return -EINVAL;
+ } else {
+ if (unlikely(size != CAN_MTU))
+ return -EINVAL;
+ }
+
+ dev = dev_get_by_index(&init_net, ifindex);
+ if (!dev)
+ return -ENXIO;
+
+ skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto put_dev;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto free_skb;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
+ skb->dev = dev;
+ skb->sk = sk;
+ skb->priority = sk->sk_priority;
+
+ err = can_send(skb, ro->loopback);
+
+ dev_put(dev);
+
+ if (err)
+ goto send_failed;
+
+ return size;
+
+free_skb:
+ kfree_skb(skb);
+put_dev:
+ dev_put(dev);
+send_failed:
+ return err;
+}
+
+static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int err = 0;
+ int noblock;
+
+ noblock = flags & MSG_DONTWAIT;
+ flags &= ~MSG_DONTWAIT;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ return err;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ err = memcpy_to_msg(msg, skb->data, size);
+ if (err < 0) {
+ skb_free_datagram(sk, skb);
+ return err;
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (msg->msg_name) {
+ __sockaddr_check_size(sizeof(struct sockaddr_can));
+ msg->msg_namelen = sizeof(struct sockaddr_can);
+ memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+ }
+
+ /* assign the flags that have been recorded in raw_rcv() */
+ msg->msg_flags |= *(raw_flags(skb));
+
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static const struct proto_ops raw_ops = {
+ .family = PF_CAN,
+ .release = raw_release,
+ .bind = raw_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = raw_getname,
+ .poll = datagram_poll,
+ .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = raw_setsockopt,
+ .getsockopt = raw_getsockopt,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto raw_proto __read_mostly = {
+ .name = "CAN_RAW",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct raw_sock),
+ .init = raw_init,
+};
+
+static const struct can_proto raw_can_proto = {
+ .type = SOCK_RAW,
+ .protocol = CAN_RAW,
+ .ops = &raw_ops,
+ .prot = &raw_proto,
+};
+
+static __init int raw_module_init(void)
+{
+ int err;
+
+ pr_info("can: raw protocol (rev " CAN_RAW_VERSION ")\n");
+
+ err = can_proto_register(&raw_can_proto);
+ if (err < 0)
+ printk(KERN_ERR "can: registration of raw protocol failed\n");
+
+ return err;
+}
+
+static __exit void raw_module_exit(void)
+{
+ can_proto_unregister(&raw_can_proto);
+}
+
+module_init(raw_module_init);
+module_exit(raw_module_exit);
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000..f8cceb99e
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,44 @@
+config CEPH_LIB
+ tristate "Ceph core library"
+ depends on INET
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO
+ select KEYS
+ default n
+ help
+ Choose Y or M here to include cephlib, which provides the
+ common functionality to both the Ceph filesystem and
+ to the rados block device (rbd).
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_LIB
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This increases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
+config CEPH_LIB_USE_DNS_RESOLVER
+ bool "Use in-kernel support for DNS lookup"
+ depends on CEPH_LIB
+ select DNS_RESOLVER
+ default n
+ help
+ If you say Y here, hostnames (e.g. monitor addresses) will
+ be resolved using the CONFIG_DNS_RESOLVER facility.
+
+ For information on how to use CONFIG_DNS_RESOLVER consult
+ Documentation/networking/dns_resolver.txt
+
+ If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000..958d98569
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for CEPH filesystem.
+#
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o \
+ pagevec.o snapshot.o
+
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 000000000..1fc1ee11d
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,105 @@
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+ return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A';
+ if (c >= 'a' && c <= 'z')
+ return c - 'a' + 26;
+ if (c >= '0' && c <= '9')
+ return c - '0' + 52;
+ if (c == '+')
+ return 62;
+ if (c == '/')
+ return 63;
+ if (c == '=')
+ return 0; /* just non-negative, please */
+ return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+ int line = 0;
+
+ while (src < end) {
+ unsigned char a, b, c;
+
+ a = *src++;
+ *dst++ = encode_bits(a >> 2);
+ if (src < end) {
+ b = *src++;
+ *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+ if (src < end) {
+ c = *src++;
+ *dst++ = encode_bits(((b & 15) << 2) |
+ (c >> 6));
+ *dst++ = encode_bits(c & 63);
+ } else {
+ *dst++ = encode_bits((b & 15) << 2);
+ *dst++ = '=';
+ }
+ } else {
+ *dst++ = encode_bits(((a & 3) << 4));
+ *dst++ = '=';
+ *dst++ = '=';
+ }
+ olen += 4;
+ line += 4;
+ if (line == 64) {
+ line = 0;
+ *(dst++) = '\n';
+ olen++;
+ }
+ }
+ return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+
+ while (src < end) {
+ int a, b, c, d;
+
+ if (src[0] == '\n') {
+ src++;
+ continue;
+ }
+ if (src + 4 > end)
+ return -EINVAL;
+ a = decode_bits(src[0]);
+ b = decode_bits(src[1]);
+ c = decode_bits(src[2]);
+ d = decode_bits(src[3]);
+ if (a < 0 || b < 0 || c < 0 || d < 0)
+ return -EINVAL;
+
+ *dst++ = (a << 2) | (b >> 4);
+ if (src[2] == '=')
+ return olen + 1;
+ *dst++ = ((b & 15) << 4) | (c >> 2);
+ if (src[3] == '=')
+ return olen + 2;
+ *dst++ = ((c & 3) << 6) | d;
+ olen += 3;
+ src += 4;
+ }
+ return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 000000000..6b923bcaa
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,340 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+ CEPH_AUTH_NONE,
+ CEPH_AUTH_CEPHX
+};
+
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+ switch (protocol) {
+ case CEPH_AUTH_NONE:
+ return ceph_auth_none_init(ac);
+ case CEPH_AUTH_CEPHX:
+ return ceph_x_init(ac);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+{
+ struct ceph_auth_client *ac;
+ int ret;
+
+ dout("auth_init name '%s'\n", name);
+
+ ret = -ENOMEM;
+ ac = kzalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ goto out;
+
+ mutex_init(&ac->mutex);
+ ac->negotiating = true;
+ if (name)
+ ac->name = name;
+ else
+ ac->name = CEPH_AUTH_NAME_DEFAULT;
+ dout("auth_init name %s\n", ac->name);
+ ac->key = key;
+ return ac;
+
+out:
+ return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+ dout("auth_destroy %p\n", ac);
+ if (ac->ops)
+ ac->ops->destroy(ac);
+ kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+ mutex_lock(&ac->mutex);
+ dout("auth_reset %p\n", ac);
+ if (ac->ops && !ac->negotiating)
+ ac->ops->reset(ac);
+ ac->negotiating = true;
+ mutex_unlock(&ac->mutex);
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+ int len = strlen(name);
+
+ if (*p + 2*sizeof(u32) + len > end)
+ return -ERANGE;
+ ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_32(p, len);
+ ceph_encode_copy(p, name, len);
+ return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor. Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+ struct ceph_mon_request_header *monhdr = buf;
+ void *p = monhdr + 1, *end = buf + len, *lenp;
+ int i, num;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ dout("auth_build_hello\n");
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, 0); /* no protocol, yet */
+
+ lenp = p;
+ p += sizeof(u32);
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ ceph_encode_8(&p, 1);
+ num = ARRAY_SIZE(supported_protocols);
+ ceph_encode_32(&p, num);
+ ceph_decode_need(&p, end, num * sizeof(u32), bad);
+ for (i = 0; i < num; i++)
+ ceph_encode_32(&p, supported_protocols[i]);
+
+ ret = ceph_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ goto out;
+ ceph_decode_need(&p, end, sizeof(u64), bad);
+ ceph_encode_64(&p, ac->global_id);
+
+ ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+ ret = p - buf;
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+bad:
+ ret = -ERANGE;
+ goto out;
+}
+
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ struct ceph_mon_request_header *monhdr = msg_buf;
+ void *p = monhdr + 1;
+ void *end = msg_buf + msg_len;
+ int ret;
+
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, ac->protocol);
+
+ ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+ if (ret < 0) {
+ pr_err("error %d building auth method %s request\n", ret,
+ ac->ops->name);
+ goto out;
+ }
+ dout(" built request %d bytes\n", ret);
+ ceph_encode_32(&p, ret);
+ ret = p + ret - msg_buf;
+out:
+ return ret;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len)
+{
+ void *p = buf;
+ void *end = buf + len;
+ int protocol;
+ s32 result;
+ u64 global_id;
+ void *payload, *payload_end;
+ int payload_len;
+ char *result_msg;
+ int result_msg_len;
+ int ret = -EINVAL;
+
+ mutex_lock(&ac->mutex);
+ dout("handle_auth_reply %p %p\n", p, end);
+ ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+ protocol = ceph_decode_32(&p);
+ result = ceph_decode_32(&p);
+ global_id = ceph_decode_64(&p);
+ payload_len = ceph_decode_32(&p);
+ payload = p;
+ p += payload_len;
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ result_msg_len = ceph_decode_32(&p);
+ result_msg = p;
+ p += result_msg_len;
+ if (p != end)
+ goto bad;
+
+ dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+ result_msg, global_id, payload_len);
+
+ payload_end = payload + payload_len;
+
+ if (global_id && ac->global_id != global_id) {
+ dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+ ac->global_id = global_id;
+ }
+
+ if (ac->negotiating) {
+ /* server does not support our protocols? */
+ if (!protocol && result < 0) {
+ ret = result;
+ goto out;
+ }
+ /* set up (new) protocol handler? */
+ if (ac->protocol && ac->protocol != protocol) {
+ ac->ops->destroy(ac);
+ ac->protocol = 0;
+ ac->ops = NULL;
+ }
+ if (ac->protocol != protocol) {
+ ret = ceph_auth_init_protocol(ac, protocol);
+ if (ret) {
+ pr_err("error %d on auth protocol %d init\n",
+ ret, protocol);
+ goto out;
+ }
+ }
+
+ ac->negotiating = false;
+ }
+
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+ if (ret == -EAGAIN) {
+ ret = ceph_build_auth_request(ac, reply_buf, reply_len);
+ } else if (ret) {
+ pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+ }
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+ ret = -EINVAL;
+ goto out;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (!ac->protocol)
+ ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
+ else if (ac->ops->should_authenticate(ac))
+ ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops)
+ ret = ac->ops->is_authenticated(ac);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_is_authenticated);
+
+int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->create_authorizer)
+ ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_create_authorizer);
+
+void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->destroy_authorizer)
+ ac->ops->destroy_authorizer(ac, a);
+ mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
+
+int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
+ int peer_type,
+ struct ceph_auth_handshake *a)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, a);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_update_authorizer);
+
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->verify_authorizer_reply)
+ ret = ac->ops->verify_authorizer_reply(ac, a, len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
+
+void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
+{
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+ mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 000000000..8c93fa8d8
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,137 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return xi->starting;
+}
+
+static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
+{
+ return 0;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = false;
+ return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_auth_none_info *ai = ac->private;
+ struct ceph_none_authorizer *au = &ai->au;
+ void *p, *end;
+ int ret;
+
+ if (!ai->built_authorizer) {
+ p = au->buf;
+ end = p + sizeof(au->buf);
+ ceph_encode_8(&p, 1);
+ ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+ if (ret < 0)
+ goto bad;
+ ceph_decode_need(&p, end, sizeof(u64), bad2);
+ ceph_encode_64(&p, ac->global_id);
+ au->buf_len = p - (void *)au->buf;
+ ai->built_authorizer = true;
+ dout("built authorizer len %d\n", au->buf_len);
+ }
+
+ auth->authorizer = (struct ceph_authorizer *) au;
+ auth->authorizer_buf = au->buf;
+ auth->authorizer_buf_len = au->buf_len;
+ auth->authorizer_reply_buf = au->reply_buf;
+ auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+
+ return 0;
+
+bad2:
+ ret = -ERANGE;
+bad:
+ return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ /* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .name = "none",
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .should_authenticate = should_authenticate,
+ .build_request = build_request,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+ .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+
+ ac->protocol = CEPH_AUTH_NONE;
+ ac->private = xi;
+ ac->ops = &ceph_auth_none_ops;
+ return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 000000000..059a3ce4b
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ char buf[128];
+ int buf_len;
+ char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+ bool built_authorizer;
+ struct ceph_none_authorizer au; /* we only need one; it's static */
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 000000000..ba6eb1722
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,782 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+ return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+ sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+ void *ibuf, int ilen, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head = {
+ .struct_v = 1,
+ .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+ };
+ size_t len = olen - sizeof(u32);
+ int ret;
+
+ ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+ &head, sizeof(head), ibuf, ilen);
+ if (ret)
+ return ret;
+ ceph_encode_32(&obuf, len);
+ return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+ void **p, void *end, void **obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head;
+ size_t head_len = sizeof(head);
+ int len, ret;
+
+ len = ceph_decode_32(p);
+ if (*p + len > end)
+ return -EINVAL;
+
+ dout("ceph_x_decrypt len %d\n", len);
+ if (*obuf == NULL) {
+ *obuf = kmalloc(len, GFP_NOFS);
+ if (!*obuf)
+ return -ENOMEM;
+ olen = len;
+ }
+
+ ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len);
+ if (ret)
+ return ret;
+ if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+ return -EPERM;
+ *p += len;
+ return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+ struct ceph_x_ticket_handler *th;
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+ while (*p) {
+ parent = *p;
+ th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+ if (service < th->service)
+ p = &(*p)->rb_left;
+ else if (service > th->service)
+ p = &(*p)->rb_right;
+ else
+ return th;
+ }
+
+ /* add it */
+ th = kzalloc(sizeof(*th), GFP_NOFS);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ th->service = service;
+ rb_link_node(&th->node, parent, p);
+ rb_insert_color(&th->node, &xi->ticket_handlers);
+ return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("remove_ticket_handler %p %d\n", th, th->service);
+ rb_erase(&th->node, &xi->ticket_handlers);
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ kfree(th);
+}
+
+static int process_one_ticket(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void **p, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int type;
+ u8 tkt_struct_v, blob_struct_v;
+ struct ceph_x_ticket_handler *th;
+ void *dbuf = NULL;
+ void *dp, *dend;
+ int dlen;
+ char is_enc;
+ struct timespec validity;
+ struct ceph_crypto_key old_key;
+ void *ticket_buf = NULL;
+ void *tp, *tpend;
+ void **ptp;
+ struct ceph_timespec new_validity;
+ struct ceph_crypto_key new_session_key;
+ struct ceph_buffer *new_ticket_blob;
+ unsigned long new_expires, new_renew_after;
+ u64 new_secret_id;
+ int ret;
+
+ ceph_decode_need(p, end, sizeof(u32) + 1, bad);
+
+ type = ceph_decode_32(p);
+ dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+ tkt_struct_v = ceph_decode_8(p);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ th = get_ticket_handler(ac, type);
+ if (IS_ERR(th)) {
+ ret = PTR_ERR(th);
+ goto out;
+ }
+
+ /* blob for me */
+ dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0);
+ if (dlen <= 0) {
+ ret = dlen;
+ goto out;
+ }
+ dout(" decrypted %d bytes\n", dlen);
+ dp = dbuf;
+ dend = dp + dlen;
+
+ tkt_struct_v = ceph_decode_8(&dp);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ memcpy(&old_key, &th->session_key, sizeof(old_key));
+ ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+ if (ret)
+ goto out;
+
+ ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+ ceph_decode_timespec(&validity, &new_validity);
+ new_expires = get_seconds() + validity.tv_sec;
+ new_renew_after = new_expires - (validity.tv_sec / 4);
+ dout(" expires=%lu renew_after=%lu\n", new_expires,
+ new_renew_after);
+
+ /* ticket blob for service */
+ ceph_decode_8_safe(p, end, is_enc, bad);
+ if (is_enc) {
+ /* encrypted */
+ dout(" encrypted ticket\n");
+ dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0);
+ if (dlen < 0) {
+ ret = dlen;
+ goto out;
+ }
+ tp = ticket_buf;
+ ptp = &tp;
+ tpend = *ptp + dlen;
+ } else {
+ /* unencrypted */
+ ptp = p;
+ tpend = end;
+ }
+ ceph_decode_32_safe(ptp, tpend, dlen, bad);
+ dout(" ticket blob is %d bytes\n", dlen);
+ ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad);
+ blob_struct_v = ceph_decode_8(ptp);
+ new_secret_id = ceph_decode_64(ptp);
+ ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend);
+ if (ret)
+ goto out;
+
+ /* all is well, update our ticket */
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ th->session_key = new_session_key;
+ th->ticket_blob = new_ticket_blob;
+ th->validity = new_validity;
+ th->secret_id = new_secret_id;
+ th->expires = new_expires;
+ th->renew_after = new_renew_after;
+ dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+ type, ceph_entity_type_name(type), th->secret_id,
+ (int)th->ticket_blob->vec.iov_len);
+ xi->have_keys |= th->service;
+
+out:
+ kfree(ticket_buf);
+ kfree(dbuf);
+ return ret;
+
+bad:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void *buf, void *end)
+{
+ void *p = buf;
+ u8 reply_struct_v;
+ u32 num;
+ int ret;
+
+ ceph_decode_8_safe(&p, end, reply_struct_v, bad);
+ if (reply_struct_v != 1)
+ return -EINVAL;
+
+ ceph_decode_32_safe(&p, end, num, bad);
+ dout("%d tickets\n", num);
+
+ while (num--) {
+ ret = process_one_ticket(ac, secret, &p, end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th,
+ struct ceph_x_authorizer *au)
+{
+ int maxlen;
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b msg_b;
+ void *p, *end;
+ int ret;
+ int ticket_blob_len =
+ (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+ dout("build_authorizer for %s %p\n",
+ ceph_entity_type_name(th->service), au);
+
+ ceph_crypto_key_destroy(&au->session_key);
+ ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
+ if (ret)
+ return ret;
+
+ maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+ ceph_x_encrypt_buflen(ticket_blob_len);
+ dout(" need len %d\n", maxlen);
+ if (au->buf && au->buf->alloc_len < maxlen) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+ if (!au->buf) {
+ au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+ if (!au->buf) {
+ ceph_crypto_key_destroy(&au->session_key);
+ return -ENOMEM;
+ }
+ }
+ au->service = th->service;
+ au->secret_id = th->secret_id;
+
+ msg_a = au->buf->vec.iov_base;
+ msg_a->struct_v = 1;
+ msg_a->global_id = cpu_to_le64(ac->global_id);
+ msg_a->service_id = cpu_to_le32(th->service);
+ msg_a->ticket_blob.struct_v = 1;
+ msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+ msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+ if (ticket_blob_len) {
+ memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+ th->ticket_blob->vec.iov_len);
+ }
+ dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+ le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+ p = msg_a + 1;
+ p += ticket_blob_len;
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ get_random_bytes(&au->nonce, sizeof(au->nonce));
+ msg_b.struct_v = 1;
+ msg_b.nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
+ p, end - p);
+ if (ret < 0)
+ goto out_buf;
+ p += ret;
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ dout(" built authorizer nonce %llx len %d\n", au->nonce,
+ (int)au->buf->vec.iov_len);
+ BUG_ON(au->buf->vec.iov_len > maxlen);
+ return 0;
+
+out_buf:
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+ void **p, void *end)
+{
+ ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, th->secret_id);
+ if (th->ticket_blob) {
+ const char *buf = th->ticket_blob->vec.iov_base;
+ u32 len = th->ticket_blob->vec.iov_len;
+
+ ceph_encode_32_safe(p, end, len, bad);
+ ceph_encode_copy_safe(p, end, buf, len, bad);
+ } else {
+ ceph_encode_32_safe(p, end, 0, bad);
+ }
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+ int want = ac->want_keys;
+ struct ceph_x_info *xi = ac->private;
+ int service;
+
+ *pneed = ac->want_keys & ~(xi->have_keys);
+
+ for (service = 1; service <= want; service <<= 1) {
+ struct ceph_x_ticket_handler *th;
+
+ if (!(ac->want_keys & service))
+ continue;
+
+ if (*pneed & service)
+ continue;
+
+ th = get_ticket_handler(ac, service);
+
+ if (IS_ERR(th)) {
+ *pneed |= service;
+ continue;
+ }
+
+ if (get_seconds() >= th->renew_after)
+ *pneed |= service;
+ if (get_seconds() >= th->expires)
+ xi->have_keys &= ~service;
+ }
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+ struct ceph_x_request_header *head = buf;
+ int ret;
+ struct ceph_x_ticket_handler *th =
+ get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ ceph_x_validate_tickets(ac, &need);
+
+ dout("build_request want %x have %x need %x\n",
+ ac->want_keys, xi->have_keys, need);
+
+ if (need & CEPH_ENTITY_TYPE_AUTH) {
+ struct ceph_x_authenticate *auth = (void *)(head + 1);
+ void *p = auth + 1;
+ struct ceph_x_challenge_blob tmp;
+ char tmp_enc[40];
+ u64 *u;
+
+ if (p > end)
+ return -ERANGE;
+
+ dout(" get_auth_session_key\n");
+ head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+ /* encrypt and hash */
+ get_random_bytes(&auth->client_challenge, sizeof(u64));
+ tmp.client_challenge = auth->client_challenge;
+ tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+ tmp_enc, sizeof(tmp_enc));
+ if (ret < 0)
+ return ret;
+
+ auth->struct_v = 1;
+ auth->key = 0;
+ for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ auth->key ^= *(__le64 *)u;
+ dout(" server_challenge %llx client_challenge %llx key %llx\n",
+ xi->server_challenge, le64_to_cpu(auth->client_challenge),
+ le64_to_cpu(auth->key));
+
+ /* now encode the old ticket if exists */
+ ret = ceph_x_encode_ticket(th, &p, end);
+ if (ret < 0)
+ return ret;
+
+ return p - buf;
+ }
+
+ if (need) {
+ void *p = head + 1;
+ struct ceph_x_service_ticket_request *req;
+
+ if (p > end)
+ return -ERANGE;
+ head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+ ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+ if (ret)
+ return ret;
+ ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len);
+
+ req = p;
+ req->keys = cpu_to_le32(need);
+ p += sizeof(*req);
+ return p - buf;
+ }
+
+ return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_reply_header *head = buf;
+ struct ceph_x_ticket_handler *th;
+ int len = end - buf;
+ int op;
+ int ret;
+
+ if (result)
+ return result; /* XXX hmm? */
+
+ if (xi->starting) {
+ /* it's a hello */
+ struct ceph_x_server_challenge *sc = buf;
+
+ if (len != sizeof(*sc))
+ return -EINVAL;
+ xi->server_challenge = le64_to_cpu(sc->server_challenge);
+ dout("handle_reply got server challenge %llx\n",
+ xi->server_challenge);
+ xi->starting = false;
+ xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+ return -EAGAIN;
+ }
+
+ op = le16_to_cpu(head->op);
+ result = le32_to_cpu(head->result);
+ dout("handle_reply op %d result %d\n", op, result);
+ switch (op) {
+ case CEPHX_GET_AUTH_SESSION_KEY:
+ /* verify auth key */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+ buf + sizeof(*head), end);
+ break;
+
+ case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ buf + sizeof(*head), end);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (ret)
+ return ret;
+ if (ac->want_keys == xi->have_keys)
+ return 0;
+ return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+ int ret;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = kzalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ ret = ceph_x_build_authorizer(ac, th, au);
+ if (ret) {
+ kfree(au);
+ return ret;
+ }
+
+ auth->authorizer = (struct ceph_authorizer *) au;
+ auth->authorizer_buf = au->buf->vec.iov_base;
+ auth->authorizer_buf_len = au->buf->vec.iov_len;
+ auth->authorizer_reply_buf = au->reply_buf;
+ auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+ auth->sign_message = ac->ops->sign_message;
+ auth->check_message_signature = ac->ops->check_message_signature;
+
+ return 0;
+}
+
+static int ceph_x_update_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = (struct ceph_x_authorizer *)auth->authorizer;
+ if (au->secret_id < th->secret_id) {
+ dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
+ au->service, au->secret_id, th->secret_id);
+ return ceph_x_build_authorizer(ac, th, au);
+ }
+ return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ int ret = 0;
+ struct ceph_x_authorize_reply reply;
+ void *preply = &reply;
+ void *p = au->reply_buf;
+ void *end = p + sizeof(au->reply_buf);
+
+ ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply));
+ if (ret < 0)
+ return ret;
+ if (ret != sizeof(reply))
+ return -EPERM;
+
+ if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ ret = -EPERM;
+ else
+ ret = 0;
+ dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+ au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+ return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+
+ ceph_crypto_key_destroy(&au->session_key);
+ ceph_buffer_put(au->buf);
+ kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("reset\n");
+ xi->starting = true;
+ xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *p;
+
+ dout("ceph_x_destroy %p\n", ac);
+ ceph_crypto_key_destroy(&xi->secret);
+
+ while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+ struct ceph_x_ticket_handler *th =
+ rb_entry(p, struct ceph_x_ticket_handler, node);
+ remove_ticket_handler(ac, th);
+ }
+
+ if (xi->auth_authorizer.buf)
+ ceph_buffer_put(xi->auth_authorizer.buf);
+
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (!IS_ERR(th))
+ memset(&th->validity, 0, sizeof(th->validity));
+}
+
+static int calcu_signature(struct ceph_x_authorizer *au,
+ struct ceph_msg *msg, __le64 *sig)
+{
+ int ret;
+ char tmp_enc[40];
+ __le32 tmp[5] = {
+ cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc,
+ msg->footer.middle_crc, msg->footer.data_crc,
+ };
+ ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp),
+ tmp_enc, sizeof(tmp_enc));
+ if (ret < 0)
+ return ret;
+ *sig = *(__le64*)(tmp_enc + 4);
+ return 0;
+}
+
+static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ int ret;
+ if (!auth->authorizer)
+ return 0;
+ ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &msg->footer.sig);
+ if (ret < 0)
+ return ret;
+ msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
+ return 0;
+}
+
+static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ __le64 sig_check;
+ int ret;
+
+ if (!auth->authorizer)
+ return 0;
+ ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
+ msg, &sig_check);
+ if (ret < 0)
+ return ret;
+ if (sig_check == msg->footer.sig)
+ return 0;
+ if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED)
+ dout("ceph_x_check_message_signature %p has signature %llx "
+ "expect %llx\n", msg, msg->footer.sig, sig_check);
+ else
+ dout("ceph_x_check_message_signature %p sender did not set "
+ "CEPH_MSG_FOOTER_SIGNED\n", msg);
+ return -EBADMSG;
+}
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+ .name = "x",
+ .is_authenticated = ceph_x_is_authenticated,
+ .should_authenticate = ceph_x_should_authenticate,
+ .build_request = ceph_x_build_request,
+ .handle_reply = ceph_x_handle_reply,
+ .create_authorizer = ceph_x_create_authorizer,
+ .update_authorizer = ceph_x_update_authorizer,
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+ .destroy_authorizer = ceph_x_destroy_authorizer,
+ .invalidate_authorizer = ceph_x_invalidate_authorizer,
+ .reset = ceph_x_reset,
+ .destroy = ceph_x_destroy,
+ .sign_message = ceph_x_sign_message,
+ .check_message_signature = ceph_x_check_message_signature,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi;
+ int ret;
+
+ dout("ceph_x_init %p\n", ac);
+ ret = -ENOMEM;
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ goto out;
+
+ ret = -EINVAL;
+ if (!ac->key) {
+ pr_err("no secret set (for auth_x protocol)\n");
+ goto out_nomem;
+ }
+
+ ret = ceph_crypto_key_clone(&xi->secret, ac->key);
+ if (ret < 0) {
+ pr_err("cannot clone key: %d\n", ret);
+ goto out_nomem;
+ }
+
+ xi->starting = true;
+ xi->ticket_handlers = RB_ROOT;
+
+ ac->protocol = CEPH_AUTH_CEPHX;
+ ac->private = xi;
+ ac->ops = &ceph_x_ops;
+ return 0;
+
+out_nomem:
+ kfree(xi);
+out:
+ return ret;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 000000000..e8b7c6917
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,52 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+ struct rb_node node;
+ unsigned int service;
+
+ struct ceph_crypto_key session_key;
+ struct ceph_timespec validity;
+
+ u64 secret_id;
+ struct ceph_buffer *ticket_blob;
+
+ unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+ struct ceph_crypto_key session_key;
+ struct ceph_buffer *buf;
+ unsigned int service;
+ u64 nonce;
+ u64 secret_id;
+ char reply_buf[128]; /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+ struct ceph_crypto_key secret;
+
+ bool starting;
+ u64 server_challenge;
+
+ unsigned int have_keys;
+ struct rb_root ticket_handlers;
+
+ struct ceph_x_authorizer auth_authorizer;
+};
+
+int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 000000000..671d30576
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY 0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+ __u8 struct_v;
+ __le64 secret_id;
+ __le32 blob_len;
+ char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+ __le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+ __le16 op;
+ __le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+ __u8 struct_v;
+ __le64 client_challenge;
+ __le64 key;
+ /* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+ __u8 struct_v;
+ __le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+ __le64 server_challenge;
+ __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ * a - service id, ticket blob
+ * b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+ __u8 struct_v;
+ __le64 global_id;
+ __le32 service_id;
+ struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+ __u8 struct_v;
+ __le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+ __u8 struct_v;
+ __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+ __u8 struct_v;
+ __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 000000000..add5f921a
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,58 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h> /* for ceph_kvmalloc */
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+
+ b->vec.iov_base = ceph_kvmalloc(len, gfp);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
+ }
+
+ kref_init(&b->kref);
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ dout("buffer_new %p\n", b);
+ return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+void ceph_buffer_release(struct kref *kref)
+{
+ struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+ dout("buffer_release %p\n", b);
+ kvfree(b->vec.iov_base);
+ kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+ size_t len;
+
+ ceph_decode_need(p, end, sizeof(u32), bad);
+ len = ceph_decode_32(p);
+ dout("decode_buffer len %d\n", (int)len);
+ ceph_decode_need(p, end, len, bad);
+ *b = ceph_buffer_new(len, GFP_NOFS);
+ if (!*b)
+ return -ENOMEM;
+ ceph_decode_copy(p, (*b)->vec.iov_base, len);
+ return 0;
+bad:
+ return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000..79e8f71ae
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,725 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/key.h>
+#include <keys/ceph-type.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include "crypto.h"
+
+
+/*
+ * Module compatibility interface. For now it doesn't do anything,
+ * but its existence signals a certain level of functionality.
+ *
+ * The data buffer is used to pass information both to and from
+ * libceph. The return value indicates whether libceph determines
+ * it is compatible with the caller (from another kernel module),
+ * given the provided data.
+ *
+ * The data pointer can be null.
+ */
+bool libceph_compatible(void *data)
+{
+ return true;
+}
+EXPORT_SYMBOL(libceph_compatible);
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+ const char *e = s + len;
+
+ while (e != s && *(e-1) != '/')
+ e--;
+ return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MON_GET_VERSION: return "mon_get_version";
+ case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+ case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+ default: return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ pr_err("bad fsid, had %pU got %pU",
+ &client->fsid, fsid);
+ return -1;
+ }
+ } else {
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+ struct ceph_client *client)
+{
+ struct ceph_options *opt1 = new_opt;
+ struct ceph_options *opt2 = client->options;
+ int ofs = offsetof(struct ceph_options, mon_addr);
+ int i;
+ int ret;
+
+ ret = memcmp(opt1, opt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(opt1->name, opt2->name);
+ if (ret)
+ return ret;
+
+ if (opt1->key && !opt2->key)
+ return -1;
+ if (!opt1->key && opt2->key)
+ return 1;
+ if (opt1->key && opt2->key) {
+ if (opt1->key->type != opt2->key->type)
+ return -1;
+ if (opt1->key->created.tv_sec != opt2->key->created.tv_sec)
+ return -1;
+ if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec)
+ return -1;
+ if (opt1->key->len != opt2->key->len)
+ return -1;
+ if (opt1->key->key && !opt2->key->key)
+ return -1;
+ if (!opt1->key->key && opt2->key->key)
+ return 1;
+ if (opt1->key->key && opt2->key->key) {
+ ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* any matching mon ip implies a match */
+ for (i = 0; i < opt1->num_mon; i++) {
+ if (ceph_monmap_contains(client->monc.monmap,
+ &opt1->mon_addr[i]))
+ return 0;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+void *ceph_kvmalloc(size_t size, gfp_t flags)
+{
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ void *ptr = kmalloc(size, flags | __GFP_NOWARN);
+ if (ptr)
+ return ptr;
+ }
+
+ return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+ int i = 0;
+ char tmp[3];
+ int err = -EINVAL;
+ int d;
+
+ dout("parse_fsid '%s'\n", str);
+ tmp[2] = 0;
+ while (*str && i < 16) {
+ if (ispunct(*str)) {
+ str++;
+ continue;
+ }
+ if (!isxdigit(str[0]) || !isxdigit(str[1]))
+ break;
+ tmp[0] = str[0];
+ tmp[1] = str[1];
+ if (sscanf(tmp, "%x", &d) < 1)
+ break;
+ fsid->fsid[i] = d & 0xff;
+ i++;
+ str += 2;
+ }
+
+ if (i == 16)
+ err = 0;
+ dout("parse_fsid ret %d got fsid %pU", err, fsid);
+ return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+ Opt_osdtimeout,
+ Opt_osdkeepalivetimeout,
+ Opt_mount_timeout,
+ Opt_osd_idle_ttl,
+ Opt_last_int,
+ /* int args above */
+ Opt_fsid,
+ Opt_name,
+ Opt_secret,
+ Opt_key,
+ Opt_ip,
+ Opt_last_string,
+ /* string args above */
+ Opt_share,
+ Opt_noshare,
+ Opt_crc,
+ Opt_nocrc,
+ Opt_cephx_require_signatures,
+ Opt_nocephx_require_signatures,
+ Opt_tcp_nodelay,
+ Opt_notcp_nodelay,
+};
+
+static match_table_t opt_tokens = {
+ {Opt_osdtimeout, "osdtimeout=%d"},
+ {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+ {Opt_mount_timeout, "mount_timeout=%d"},
+ {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ /* int args above */
+ {Opt_fsid, "fsid=%s"},
+ {Opt_name, "name=%s"},
+ {Opt_secret, "secret=%s"},
+ {Opt_key, "key=%s"},
+ {Opt_ip, "ip=%s"},
+ /* string args above */
+ {Opt_share, "share"},
+ {Opt_noshare, "noshare"},
+ {Opt_crc, "crc"},
+ {Opt_nocrc, "nocrc"},
+ {Opt_cephx_require_signatures, "cephx_require_signatures"},
+ {Opt_nocephx_require_signatures, "nocephx_require_signatures"},
+ {Opt_tcp_nodelay, "tcp_nodelay"},
+ {Opt_notcp_nodelay, "notcp_nodelay"},
+ {-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+ dout("destroy_options %p\n", opt);
+ kfree(opt->name);
+ if (opt->key) {
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
+ }
+ kfree(opt->mon_addr);
+ kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+/* get secret from key store */
+static int get_secret(struct ceph_crypto_key *dst, const char *name) {
+ struct key *ukey;
+ int key_err;
+ int err = 0;
+ struct ceph_crypto_key *ckey;
+
+ ukey = request_key(&key_type_ceph, name, NULL);
+ if (!ukey || IS_ERR(ukey)) {
+ /* request_key errors don't map nicely to mount(2)
+ errors; don't even try, but still printk */
+ key_err = PTR_ERR(ukey);
+ switch (key_err) {
+ case -ENOKEY:
+ pr_warn("ceph: Mount failed due to key not found: %s\n",
+ name);
+ break;
+ case -EKEYEXPIRED:
+ pr_warn("ceph: Mount failed due to expired key: %s\n",
+ name);
+ break;
+ case -EKEYREVOKED:
+ pr_warn("ceph: Mount failed due to revoked key: %s\n",
+ name);
+ break;
+ default:
+ pr_warn("ceph: Mount failed due to unknown key error %d: %s\n",
+ key_err, name);
+ }
+ err = -EPERM;
+ goto out;
+ }
+
+ ckey = ukey->payload.data;
+ err = ceph_crypto_key_clone(dst, ckey);
+ if (err)
+ goto out_key;
+ /* pass through, err is 0 */
+
+out_key:
+ key_put(ukey);
+out:
+ return err;
+}
+
+struct ceph_options *
+ceph_parse_options(char *options, const char *dev_name,
+ const char *dev_name_end,
+ int (*parse_extra_token)(char *c, void *private),
+ void *private)
+{
+ struct ceph_options *opt;
+ const char *c;
+ int err = -ENOMEM;
+ substring_t argstr[MAX_OPT_ARGS];
+
+ if (current->nsproxy->net_ns != &init_net)
+ return ERR_PTR(-EINVAL);
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return ERR_PTR(-ENOMEM);
+ opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+ GFP_KERNEL);
+ if (!opt->mon_addr)
+ goto out;
+
+ dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+ dev_name);
+
+ /* start with defaults */
+ opt->flags = CEPH_OPT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+
+ /* get mon ip(s) */
+ /* ip1[:port1][,ip2[:port2]...] */
+ err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+ CEPH_MAX_MON, &opt->num_mon);
+ if (err < 0)
+ goto out;
+
+ /* parse mount options */
+ while ((c = strsep(&options, ",")) != NULL) {
+ int token, intval, ret;
+ if (!*c)
+ continue;
+ err = -EINVAL;
+ token = match_token((char *)c, opt_tokens, argstr);
+ if (token < 0 && parse_extra_token) {
+ /* extra? */
+ err = parse_extra_token((char *)c, private);
+ if (err < 0) {
+ pr_err("bad option at '%s'\n", c);
+ goto out;
+ }
+ continue;
+ }
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ continue;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+ switch (token) {
+ case Opt_ip:
+ err = ceph_parse_ips(argstr[0].from,
+ argstr[0].to,
+ &opt->my_addr,
+ 1, NULL);
+ if (err < 0)
+ goto out;
+ opt->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_fsid:
+ err = parse_fsid(argstr[0].from, &opt->fsid);
+ if (err == 0)
+ opt->flags |= CEPH_OPT_FSID;
+ break;
+ case Opt_name:
+ opt->name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_secret:
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
+ if (err < 0)
+ goto out;
+ break;
+ case Opt_key:
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = get_secret(opt->key, argstr[0].from);
+ if (err < 0)
+ goto out;
+ break;
+
+ /* misc */
+ case Opt_osdtimeout:
+ pr_warn("ignoring deprecated osdtimeout option\n");
+ break;
+ case Opt_osdkeepalivetimeout:
+ opt->osd_keepalive_timeout = intval;
+ break;
+ case Opt_osd_idle_ttl:
+ opt->osd_idle_ttl = intval;
+ break;
+ case Opt_mount_timeout:
+ opt->mount_timeout = intval;
+ break;
+
+ case Opt_share:
+ opt->flags &= ~CEPH_OPT_NOSHARE;
+ break;
+ case Opt_noshare:
+ opt->flags |= CEPH_OPT_NOSHARE;
+ break;
+
+ case Opt_crc:
+ opt->flags &= ~CEPH_OPT_NOCRC;
+ break;
+ case Opt_nocrc:
+ opt->flags |= CEPH_OPT_NOCRC;
+ break;
+
+ case Opt_cephx_require_signatures:
+ opt->flags &= ~CEPH_OPT_NOMSGAUTH;
+ break;
+ case Opt_nocephx_require_signatures:
+ opt->flags |= CEPH_OPT_NOMSGAUTH;
+ break;
+
+ case Opt_tcp_nodelay:
+ opt->flags |= CEPH_OPT_TCP_NODELAY;
+ break;
+ case Opt_notcp_nodelay:
+ opt->flags &= ~CEPH_OPT_TCP_NODELAY;
+ break;
+
+ default:
+ BUG_ON(token);
+ }
+ }
+
+ /* success */
+ return opt;
+
+out:
+ ceph_destroy_options(opt);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+{
+ struct ceph_options *opt = client->options;
+ size_t pos = m->count;
+
+ if (opt->name)
+ seq_printf(m, "name=%s,", opt->name);
+ if (opt->key)
+ seq_puts(m, "secret=<hidden>,");
+
+ if (opt->flags & CEPH_OPT_FSID)
+ seq_printf(m, "fsid=%pU,", &opt->fsid);
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, "noshare,");
+ if (opt->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, "nocrc,");
+ if (opt->flags & CEPH_OPT_NOMSGAUTH)
+ seq_puts(m, "nocephx_require_signatures,");
+ if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+ seq_puts(m, "notcp_nodelay,");
+
+ if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+ seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+ if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+ if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, "osdkeepalivetimeout=%d,",
+ opt->osd_keepalive_timeout);
+
+ /* drop redundant comma */
+ if (m->count != pos)
+ m->count--;
+
+ return 0;
+}
+EXPORT_SYMBOL(ceph_print_client_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+ return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
+ u64 supported_features,
+ u64 required_features)
+{
+ struct ceph_client *client;
+ struct ceph_entity_addr *myaddr = NULL;
+ int err = -ENOMEM;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ client->private = private;
+ client->options = opt;
+
+ mutex_init(&client->mount_mutex);
+ init_waitqueue_head(&client->auth_wq);
+ client->auth_err = 0;
+
+ if (!ceph_test_opt(client, NOMSGAUTH))
+ required_features |= CEPH_FEATURE_MSG_AUTH;
+
+ client->extra_mon_dispatch = NULL;
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
+ supported_features;
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
+ required_features;
+
+ /* msgr */
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->options->my_addr;
+
+ ceph_messenger_init(&client->msgr, myaddr,
+ client->supported_features,
+ client->required_features,
+ ceph_test_opt(client, NOCRC),
+ ceph_test_opt(client, TCP_NODELAY));
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+
+ return client;
+
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail:
+ kfree(client);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ atomic_set(&client->msgr.stopping, 1);
+
+ /* unmount */
+ ceph_osdc_stop(&client->osdc);
+
+ ceph_monc_stop(&client->monc);
+
+ ceph_debugfs_client_cleanup(client);
+
+ ceph_destroy_options(client->options);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+ return client->monc.monmap && client->monc.monmap->epoch &&
+ client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+ int err;
+ unsigned long timeout = client->options->mount_timeout * HZ;
+
+ /* open session, and wait for mon and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ return err;
+
+ while (!have_mon_and_osd_map(client)) {
+ err = -EIO;
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ return err;
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ err = wait_event_interruptible_timeout(client->auth_wq,
+ have_mon_and_osd_map(client) || (client->auth_err < 0),
+ timeout);
+ if (err == -EINTR || err == -ERESTARTSYS)
+ return err;
+ if (client->auth_err < 0)
+ return client->auth_err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+ int ret;
+ unsigned long started = jiffies; /* note the start time */
+
+ dout("open_session start\n");
+ mutex_lock(&client->mount_mutex);
+
+ ret = __ceph_open_session(client, started);
+
+ mutex_unlock(&client->mount_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+ int ret = 0;
+
+ ret = ceph_debugfs_init();
+ if (ret < 0)
+ goto out;
+
+ ret = ceph_crypto_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_crypto;
+
+ ret = ceph_osdc_setup();
+ if (ret < 0)
+ goto out_msgr;
+
+ pr_info("loaded (mon/osd proto %d/%d)\n",
+ CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+
+ return 0;
+
+out_msgr:
+ ceph_msgr_exit();
+out_crypto:
+ ceph_crypto_shutdown();
+out_debugfs:
+ ceph_debugfs_cleanup();
+out:
+ return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+ dout("exit_ceph_lib\n");
+ ceph_osdc_cleanup();
+ ceph_msgr_exit();
+ ceph_crypto_shutdown();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 000000000..41466ccb9
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,78 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ __u32 os = le32_to_cpu(layout->fl_object_size);
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+ int mode;
+
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+
+ switch (flags & O_ACCMODE) {
+ case O_WRONLY:
+ mode = CEPH_FILE_MODE_WR;
+ break;
+ case O_RDONLY:
+ mode = CEPH_FILE_MODE_RD;
+ break;
+ case O_RDWR:
+ case O_ACCMODE: /* this is what the VFS does */
+ mode = CEPH_FILE_MODE_RDWR;
+ break;
+ }
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+ return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+int ceph_caps_for_mode(int mode)
+{
+ int caps = CEPH_CAP_PIN;
+
+ if (mode & CEPH_FILE_MODE_RD)
+ caps |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ if (mode & CEPH_FILE_MODE_WR)
+ caps |= CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ if (mode & CEPH_FILE_MODE_LAZY)
+ caps |= CEPH_CAP_FILE_LAZYIO;
+
+ return caps;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 000000000..67bb1f11e
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,121 @@
+
+#include <linux/ceph/types.h>
+#include <linux/module.h>
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) { /* all the case statements fall through */
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ case 5:
+ b = b + k[4];
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned int ceph_str_hash_linux(const char *str, unsigned int length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned int ceph_str_hash(int type, const char *s, unsigned int len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
+EXPORT_SYMBOL(ceph_str_hash);
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_str_hash_name);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000..139a9cb19
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,44 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return (str);
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
+#undef GENERATE_CASE
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_osd_state_name(int s)
+{
+ switch (s) {
+ case CEPH_OSD_EXISTS:
+ return "exists";
+ case CEPH_OSD_UP:
+ return "up";
+ case CEPH_OSD_AUTOOUT:
+ return "autoout";
+ case CEPH_OSD_NEW:
+ return "new";
+ default:
+ return "???";
+ }
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 000000000..9d84ce4ea
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,143 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";
+ case CRUSH_BUCKET_STRAW2: return "straw2";
+ default: return "unknown";
+ }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
+{
+ if ((__u32)p >= b->size)
+ return 0;
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];
+ case CRUSH_BUCKET_STRAW2:
+ return ((struct crush_bucket_straw2 *)b)->item_weights[p];
+ }
+ return 0;
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+ break;
+ case CRUSH_BUCKET_LIST:
+ crush_destroy_bucket_list((struct crush_bucket_list *)b);
+ break;
+ case CRUSH_BUCKET_TREE:
+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+ break;
+ }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ /* buckets */
+ if (map->buckets) {
+ __s32 b;
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);
+ }
+
+ /* rules */
+ if (map->rules) {
+ __u32 b;
+ for (b = 0; b < map->max_rules; b++)
+ crush_destroy_rule(map->rules[b]);
+ kfree(map->rules);
+ }
+
+ kfree(map);
+}
+
+void crush_destroy_rule(struct crush_rule *rule)
+{
+ kfree(rule);
+}
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
new file mode 100644
index 000000000..6192c7fc9
--- /dev/null
+++ b/net/ceph/crush/crush_ln_table.h
@@ -0,0 +1,166 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+
+// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+
+static int64_t __RH_LH_tbl[128*2+2] = {
+ 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+ 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+ 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+ 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+ 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+ 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+ 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+ 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+ 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+ 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+ 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+ 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+ 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+ 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+ 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+ 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+ 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+ 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+ 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+ 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+ 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+ 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+ 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+ 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+ 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+ 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+ 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+ 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+ 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+ 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+ 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+ 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+ 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+ 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+ 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+ 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+ 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+ 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+ 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+ 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+ 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+ 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+ 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+ 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+ 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+ 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+ 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+ 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+ 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+ 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+ 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+ 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+ 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+ 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+ 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+ 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+ 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+ 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+ 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+ 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+ 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+ 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+ 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+ 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+ 0x0000800000000000ll, 0x0000ffff00000000ll,
+ };
+
+
+ // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
+static int64_t __LL_tbl[256] = {
+ 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+ 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+ 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+ 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+ 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+ 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+ 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+ 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+ 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+ 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+ 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+ 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+ 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+ 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+ 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+ 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+ 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+ 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+ 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+ 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+ 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+ 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+ 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+ 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+ 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+ 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+ 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+ 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+ 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+ 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+ 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+ 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+ 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+ 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+ 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+ 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+ 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+ 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+ 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+ 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+ 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+ 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+ 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+ 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+ 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+ 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+ 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+ 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+ 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+ 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+ 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+ 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+ 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+ 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+ 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+ 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+ 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+ 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+ 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+ 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+ 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+ 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+ 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+ 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+
+
+
+#endif
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 000000000..5bb63e37a
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:
+ return 0;
+ }
+}
+
+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";
+ default:
+ return "unknown";
+ }
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 000000000..5b47736d2
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,927 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+# define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+#include "crush_ln_table.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
+{
+ __u32 i;
+
+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+ int x, int r)
+{
+ unsigned int pr = r % bucket->size;
+ unsigned int i, s;
+
+ /* start a new permutation if @x has changed */
+ if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ bucket->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ bucket->perm[0] = s;
+ bucket->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ for (i = 0; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm_n = 0;
+ } else if (bucket->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm[bucket->perm[0]] = 0;
+ bucket->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < bucket->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+ while (bucket->perm_n <= pr) {
+ unsigned int p = bucket->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned int t = bucket->perm[p + i];
+ bucket->perm[p + i] = bucket->perm[p];
+ bucket->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ bucket->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+
+ s = bucket->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)
+{
+ int i;
+
+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i])
+ return bucket->h.items[i];
+ }
+
+ dprintk("bad list sums for bucket %d\n", bucket->h.id);
+ return bucket->h.items[0];
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ int l;
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ __u32 i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+// compute 2^44*log2(input+1)
+uint64_t crush_ln(unsigned xin)
+{
+ unsigned x=xin, x1;
+ int iexpon, index1, index2;
+ uint64_t RH, LH, LL, xl64, result;
+
+ x++;
+
+ // normalize input
+ iexpon = 15;
+ while(!(x&0x18000)) { x<<=1; iexpon--; }
+
+ index1 = (x>>8)<<1;
+ // RH ~ 2^56/index1
+ RH = __RH_LH_tbl[index1 - 256];
+ // LH ~ 2^48 * log2(index1/256)
+ LH = __RH_LH_tbl[index1 + 1 - 256];
+
+ // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
+ xl64 = (int64_t)x * RH;
+ xl64 >>= 48;
+ x1 = xl64;
+
+ result = iexpon;
+ result <<= (12 + 32);
+
+ index2 = x1 & 0xff;
+ // LL ~ 2^48*log2(1.0+index2/2^15)
+ LL = __LL_tbl[index2];
+
+ LH = LH + LL;
+
+ LH >>= (48-12 - 32);
+ result += LH;
+
+ return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+ int x, int r)
+{
+ unsigned i, high = 0;
+ unsigned u;
+ unsigned w;
+ __s64 ln, draw, high_draw = 0;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ w = bucket->item_weights[i];
+ if (w) {
+ u = crush_hash32_3(bucket->h.hash, x,
+ bucket->h.items[i], r);
+ u &= 0xffff;
+
+ /*
+ * for some reason slightly less than 0x10000 produces
+ * a slightly more accurate distribution... probably a
+ * rounding effect.
+ *
+ * the natural log lookup table maps [0,0xffff]
+ * (corresponding to real numbers [1/0x10000, 1] to
+ * [0, 0xffffffffffff] (corresponding to real numbers
+ * [-11.090355,0]).
+ */
+ ln = crush_ln(u) - 0x1000000000000ll;
+
+ /*
+ * divide by 16.16 fixed-point weight. note
+ * that the ln value is negative, so a larger
+ * weight means a larger (less negative) value
+ * for draw.
+ */
+ draw = div64_s64(ln, w);
+ } else {
+ draw = S64_MIN;
+ }
+
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+ BUG_ON(in->size == 0);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW2:
+ return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+ x, r);
+ default:
+ dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
+ return in->items[0];
+ }
+}
+
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(const struct crush_map *map,
+ const __u32 *weight, int weight_max,
+ int item, int x)
+{
+ if (item >= weight_max)
+ return 1;
+ if (weight[item] >= 0x10000)
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;
+ return 1;
+}
+
+/**
+ * crush_choose_firstn - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @out_size: size of the out vector
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
+ */
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int out_size,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
+ int recurse_to_leaf,
+ unsigned int vary_r,
+ int *out2,
+ int parent_r)
+{
+ int rep;
+ unsigned int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ int count = out_size;
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
+
+ for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep + parent_r;
+ /* r' = r + f_total */
+ r += ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (local_fallback_retries > 0 &&
+ flocal >= (in->size>>1) &&
+ flocal > local_fallback_retries)
+ item = bucket_perm_choose(in, x, r);
+ else
+ item = crush_bucket_choose(in, x, r);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ skip_rep = 1;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ skip_rep = 1;
+ break;
+ }
+ in = map->buckets[-1-item];
+ retry_bucket = 1;
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ reject = 0;
+ if (!collide && recurse_to_leaf) {
+ if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
+ if (crush_choose_firstn(map,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, outpos+1, 0,
+ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ NULL,
+ sub_r) <= outpos)
+ /* didn't get leaf */
+ reject = 1;
+ } else {
+ /* we already have a leaf! */
+ out2[outpos] = item;
+ }
+ }
+
+ if (!reject) {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ weight_max,
+ item, x);
+ else
+ reject = 0;
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal <= local_retries)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < tries)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %u flocal %u\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("CHOOSE got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ count--;
+ }
+
+ dprintk("CHOOSE returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ *
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r)
+{
+ struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(in, x, r);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(map,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out? */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @scratch: scratch vector for private use; must be >= 3 * result_max
+ */
+int crush_do_rule(const struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ const __u32 *weight, int weight_max,
+ int *scratch)
+{
+ int result_len;
+ int *a = scratch;
+ int *b = scratch + result_max;
+ int *c = scratch + result_max*2;
+ int recurse_to_leaf;
+ int *w;
+ int wsize = 0;
+ int *o;
+ int osize;
+ int *tmp;
+ struct crush_rule *rule;
+ __u32 step;
+ int i, j;
+ int numrep;
+ int out_size;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
+ int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+ int vary_r = map->chooseleaf_vary_r;
+
+ if ((__u32)ruleno >= map->max_rules) {
+ dprintk(" bad ruleno %d\n", ruleno);
+ return 0;
+ }
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+ w = a;
+ o = b;
+
+ for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
+ struct crush_rule_step *curstep = &rule->steps[step];
+
+ switch (curstep->op) {
+ case CRUSH_RULE_TAKE:
+ w[0] = curstep->arg1;
+ wsize = 1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ /* fall through */
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ if (wsize == 0)
+ break;
+
+ recurse_to_leaf =
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ /*
+ * see CRUSH_N, CRUSH_N_MINUS macros.
+ * basically, numrep <= 0 means relative to
+ * the provided result_max
+ */
+ numrep = curstep->arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ result_max-osize,
+ choose_tries,
+ recurse_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
+ recurse_to_leaf,
+ vary_r,
+ c+osize,
+ 0);
+ } else {
+ out_size = ((numrep < (result_max-osize)) ?
+ numrep : (result_max-osize));
+ crush_choose_indep(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, out_size, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0);
+ osize += out_size;
+ }
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap o and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ dprintk(" unknown op %d at step %d\n",
+ curstep->op, step);
+ break;
+ }
+ }
+ return result_len;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 000000000..790fe89d9
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,583 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+#include <linux/key-type.h>
+
+#include <keys/ceph-type.h>
+#include <keys/user-type.h>
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+ const struct ceph_crypto_key *src)
+{
+ memcpy(dst, src, sizeof(struct ceph_crypto_key));
+ dst->key = kmemdup(src->key, src->len, GFP_NOFS);
+ if (!dst->key)
+ return -ENOMEM;
+ return 0;
+}
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ if (*p + sizeof(u16) + sizeof(key->created) +
+ sizeof(u16) + key->len > end)
+ return -ERANGE;
+ ceph_encode_16(p, key->type);
+ ceph_encode_copy(p, &key->created, sizeof(key->created));
+ ceph_encode_16(p, key->len);
+ ceph_encode_copy(p, key->key, key->len);
+ return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+ key->type = ceph_decode_16(p);
+ ceph_decode_copy(p, &key->created, sizeof(key->created));
+ key->len = ceph_decode_16(p);
+ ceph_decode_need(p, end, key->len, bad);
+ key->key = kmalloc(key->len, GFP_NOFS);
+ if (!key->key)
+ return -ENOMEM;
+ ceph_decode_copy(p, key->key, key->len);
+ return 0;
+
+bad:
+ dout("failed to decode crypto key\n");
+ return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+ int inlen = strlen(inkey);
+ int blen = inlen * 3 / 4;
+ void *buf, *p;
+ int ret;
+
+ dout("crypto_key_unarmor %s\n", inkey);
+ buf = kmalloc(blen, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+ blen = ceph_unarmor(buf, inkey, inkey+inlen);
+ if (blen < 0) {
+ kfree(buf);
+ return blen;
+ }
+
+ p = buf;
+ ret = ceph_crypto_key_decode(key, &p, p + blen);
+ kfree(buf);
+ if (ret)
+ return ret;
+ dout("crypto_key_unarmor key %p type %d len %d\n", key,
+ key->type, key->len);
+ return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+ return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+/*
+ * Should be used for buffers allocated with ceph_kvmalloc().
+ * Currently these are encrypt out-buffer (ceph_buffer) and decrypt
+ * in-buffer (msg front).
+ *
+ * Dispose of @sgt with teardown_sgtable().
+ *
+ * @prealloc_sg is to avoid memory allocation inside sg_alloc_table()
+ * in cases where a single sg is sufficient. No attempt to reduce the
+ * number of sgs by squeezing physically contiguous pages together is
+ * made though, for simplicity.
+ */
+static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg,
+ const void *buf, unsigned int buf_len)
+{
+ struct scatterlist *sg;
+ const bool is_vmalloc = is_vmalloc_addr(buf);
+ unsigned int off = offset_in_page(buf);
+ unsigned int chunk_cnt = 1;
+ unsigned int chunk_len = PAGE_ALIGN(off + buf_len);
+ int i;
+ int ret;
+
+ if (buf_len == 0) {
+ memset(sgt, 0, sizeof(*sgt));
+ return -EINVAL;
+ }
+
+ if (is_vmalloc) {
+ chunk_cnt = chunk_len >> PAGE_SHIFT;
+ chunk_len = PAGE_SIZE;
+ }
+
+ if (chunk_cnt > 1) {
+ ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS);
+ if (ret)
+ return ret;
+ } else {
+ WARN_ON(chunk_cnt != 1);
+ sg_init_table(prealloc_sg, 1);
+ sgt->sgl = prealloc_sg;
+ sgt->nents = sgt->orig_nents = 1;
+ }
+
+ for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) {
+ struct page *page;
+ unsigned int len = min(chunk_len - off, buf_len);
+
+ if (is_vmalloc)
+ page = vmalloc_to_page(buf);
+ else
+ page = virt_to_page(buf);
+
+ sg_set_page(sg, page, len, off);
+
+ off = 0;
+ buf += len;
+ buf_len -= len;
+ }
+ WARN_ON(buf_len != 0);
+
+ return 0;
+}
+
+static void teardown_sgtable(struct sg_table *sgt)
+{
+ if (sgt->orig_nents > 1)
+ sg_free_table(sgt);
+}
+
+static int ceph_aes_encrypt(const void *key, int key_len,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[2], prealloc_sg;
+ struct sg_table sg_out;
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - (src_len & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src_len + zero_padding;
+
+ sg_init_table(sg_in, 2);
+ sg_set_buf(&sg_in[0], src, src_len);
+ sg_set_buf(&sg_in[1], pad, zero_padding);
+ ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
+ if (ret)
+ goto out_tfm;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
+ src_len + zero_padding);
+ if (ret < 0) {
+ pr_err("ceph_aes_crypt failed %d\n", ret);
+ goto out_sg;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+
+out_sg:
+ teardown_sgtable(&sg_out);
+out_tfm:
+ crypto_free_blkcipher(tfm);
+ return ret;
+}
+
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+ size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ struct scatterlist sg_in[3], prealloc_sg;
+ struct sg_table sg_out;
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src1_len + src2_len + zero_padding;
+
+ sg_init_table(sg_in, 3);
+ sg_set_buf(&sg_in[0], src1, src1_len);
+ sg_set_buf(&sg_in[1], src2, src2_len);
+ sg_set_buf(&sg_in[2], pad, zero_padding);
+ ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
+ if (ret)
+ goto out_tfm;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+ src1, src1_len, 1);
+ print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+ src2, src2_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
+ src1_len + src2_len + zero_padding);
+ if (ret < 0) {
+ pr_err("ceph_aes_crypt2 failed %d\n", ret);
+ goto out_sg;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+
+out_sg:
+ teardown_sgtable(&sg_out);
+out_tfm:
+ crypto_free_blkcipher(tfm);
+ return ret;
+}
+
+static int ceph_aes_decrypt(const void *key, int key_len,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct sg_table sg_in;
+ struct scatterlist sg_out[2], prealloc_sg;
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ sg_init_table(sg_out, 2);
+ sg_set_buf(&sg_out[0], dst, *dst_len);
+ sg_set_buf(&sg_out[1], pad, sizeof(pad));
+ ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+ if (ret)
+ goto out_tfm;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ goto out_sg;
+ }
+
+ if (src_len <= *dst_len)
+ last_byte = ((char *)dst)[src_len - 1];
+ else
+ last_byte = pad[src_len - *dst_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ *dst_len = src_len - last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+
+out_sg:
+ teardown_sgtable(&sg_in);
+out_tfm:
+ crypto_free_blkcipher(tfm);
+ return ret;
+}
+
+static int ceph_aes_decrypt2(const void *key, int key_len,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ struct sg_table sg_in;
+ struct scatterlist sg_out[3], prealloc_sg;
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ sg_init_table(sg_out, 3);
+ sg_set_buf(&sg_out[0], dst1, *dst1_len);
+ sg_set_buf(&sg_out[1], dst2, *dst2_len);
+ sg_set_buf(&sg_out[2], pad, sizeof(pad));
+ ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+ if (ret)
+ goto out_tfm;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ goto out_sg;
+ }
+
+ if (src_len <= *dst1_len)
+ last_byte = ((char *)dst1)[src_len - 1];
+ else if (src_len <= *dst1_len + *dst2_len)
+ last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+ else
+ last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ src_len -= last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+
+ if (src_len < *dst1_len) {
+ *dst1_len = src_len;
+ *dst2_len = 0;
+ } else {
+ *dst2_len = src_len - *dst1_len;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
+ dst1, *dst1_len, 1);
+ print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
+ dst2, *dst2_len, 1);
+ */
+
+out_sg:
+ teardown_sgtable(&sg_in);
+out_tfm:
+ crypto_free_blkcipher(tfm);
+ return ret;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ size_t t;
+
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst1_len + *dst2_len < src_len)
+ return -ERANGE;
+ t = min(*dst1_len, src_len);
+ memcpy(dst1, src, t);
+ *dst1_len = t;
+ src += t;
+ src_len -= t;
+ if (src_len) {
+ t = min(*dst2_len, src_len);
+ memcpy(dst2, src, t);
+ *dst2_len = t;
+ }
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt2(secret->key, secret->len,
+ dst1, dst1_len, dst2, dst2_len,
+ src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src1_len + src2_len)
+ return -ERANGE;
+ memcpy(dst, src1, src1_len);
+ memcpy(dst + src1_len, src2, src2_len);
+ *dst_len = src1_len + src2_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+ src1, src1_len, src2, src2_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static int ceph_key_preparse(struct key_preparsed_payload *prep)
+{
+ struct ceph_crypto_key *ckey;
+ size_t datalen = prep->datalen;
+ int ret;
+ void *p;
+
+ ret = -EINVAL;
+ if (datalen <= 0 || datalen > 32767 || !prep->data)
+ goto err;
+
+ ret = -ENOMEM;
+ ckey = kmalloc(sizeof(*ckey), GFP_KERNEL);
+ if (!ckey)
+ goto err;
+
+ /* TODO ceph_crypto_key_decode should really take const input */
+ p = (void *)prep->data;
+ ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen);
+ if (ret < 0)
+ goto err_ckey;
+
+ prep->payload[0] = ckey;
+ prep->quotalen = datalen;
+ return 0;
+
+err_ckey:
+ kfree(ckey);
+err:
+ return ret;
+}
+
+static void ceph_key_free_preparse(struct key_preparsed_payload *prep)
+{
+ struct ceph_crypto_key *ckey = prep->payload[0];
+ ceph_crypto_key_destroy(ckey);
+ kfree(ckey);
+}
+
+static void ceph_key_destroy(struct key *key)
+{
+ struct ceph_crypto_key *ckey = key->payload.data;
+
+ ceph_crypto_key_destroy(ckey);
+ kfree(ckey);
+}
+
+struct key_type key_type_ceph = {
+ .name = "ceph",
+ .preparse = ceph_key_preparse,
+ .free_preparse = ceph_key_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .destroy = ceph_key_destroy,
+};
+
+int ceph_crypto_init(void) {
+ return register_key_type(&key_type_ceph);
+}
+
+void ceph_crypto_shutdown(void) {
+ unregister_key_type(&key_type_ceph);
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 000000000..d1498224c
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,51 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+ int type;
+ struct ceph_timespec created;
+ int len;
+ void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+ if (key)
+ kfree(key->key);
+}
+
+int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+ const struct ceph_crypto_key *src);
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end);
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+int ceph_decrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+int ceph_encrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len);
+int ceph_encrypt2(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len);
+int ceph_crypto_init(void);
+void ceph_crypto_shutdown(void);
+
+/* armor.c */
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000..593dc2eab
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,309 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../monc - mon client state
+ * .../client_options - libceph-only (i.e. not rbd or cephfs) options
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ * .../bdi - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->monc.monmap == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ ceph_pr_addr(&inst->addr.in_addr));
+ }
+ return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+ struct ceph_osdmap *map = client->osdc.osdmap;
+ struct rb_node *n;
+
+ if (map == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", map->epoch);
+ seq_printf(s, "flags%s%s\n",
+ (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
+ (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+
+ for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pool =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+
+ seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
+ pool->id, pool->pg_num, pool->pg_num_mask,
+ pool->read_tier, pool->write_tier);
+ }
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+ int state = map->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
+ i, ceph_pr_addr(&addr->in_addr),
+ ((map->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state),
+ ((ceph_get_primary_affinity(map, i)*100) >> 16));
+ }
+ for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_temp.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_temp.osds[i]);
+ seq_printf(s, "]\n");
+ }
+ for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
+ pg->pgid.seed, pg->primary_temp.osd);
+ }
+
+ return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_client *monc = &client->monc;
+ struct rb_node *rp;
+
+ mutex_lock(&monc->mutex);
+
+ if (monc->have_mdsmap)
+ seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap);
+ if (monc->have_osdmap)
+ seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap);
+ if (monc->want_next_osdmap)
+ seq_printf(s, "want next osdmap\n");
+
+ for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+ __u16 op;
+ req = rb_entry(rp, struct ceph_mon_generic_request, node);
+ op = le16_to_cpu(req->request->hdr.type);
+ if (op == CEPH_MSG_STATFS)
+ seq_printf(s, "%llu statfs\n", req->tid);
+ else if (op == CEPH_MSG_MON_GET_VERSION)
+ seq_printf(s, "%llu mon_get_version", req->tid);
+ else
+ seq_printf(s, "%llu unknown\n", req->tid);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *p;
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req;
+ unsigned int i;
+ int opcode;
+
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1,
+ req->r_pgid.pool, req->r_pgid.seed);
+
+ seq_printf(s, "%.*s", req->r_base_oid.name_len,
+ req->r_base_oid.name);
+
+ if (req->r_reassert_version.epoch)
+ seq_printf(s, "\t%u'%llu",
+ (unsigned int)le32_to_cpu(req->r_reassert_version.epoch),
+ le64_to_cpu(req->r_reassert_version.version));
+ else
+ seq_printf(s, "\t");
+
+ for (i = 0; i < req->r_num_ops; i++) {
+ opcode = req->r_ops[i].op;
+ seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
+ ceph_osd_op_name(opcode));
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&osdc->request_mutex);
+ return 0;
+}
+
+static int client_options_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ int ret;
+
+ ret = ceph_print_client_options(s, client);
+ if (ret)
+ return ret;
+
+ seq_putc(s, '\n');
+ return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+CEPH_DEFINE_SHOW_FUNC(client_options_show)
+
+int ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+ if (!ceph_debugfs_dir)
+ return -ENOMEM;
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ int ret = -ENOMEM;
+ char name[80];
+
+ snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+ client->monc.auth->global_id);
+
+ dout("ceph_debugfs_client_init %p %s\n", client, name);
+
+ BUG_ON(client->debugfs_dir);
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+ if (!client->debugfs_dir)
+ goto out;
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monc_show_fops);
+ if (!client->monc.debugfs_file)
+ goto out;
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdc_show_fops);
+ if (!client->osdc.debugfs_file)
+ goto out;
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monmap_show_fops);
+ if (!client->debugfs_monmap)
+ goto out;
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdmap_show_fops);
+ if (!client->debugfs_osdmap)
+ goto out;
+
+ client->debugfs_options = debugfs_create_file("client_options",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &client_options_show_fops);
+ if (!client->debugfs_options)
+ goto out;
+
+ return 0;
+
+out:
+ ceph_debugfs_client_cleanup(client);
+ return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ dout("ceph_debugfs_client_cleanup %p\n", client);
+ debugfs_remove(client->debugfs_options);
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 000000000..967080a9f
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,3395 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif /* CONFIG_BLOCK */
+#include <linux/dns_resolver.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/export.h>
+
+#define list_entry_next(pos, member) \
+ list_entry(pos->member.next, typeof(*pos), member)
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/*
+ * We track the state of the socket on a given connection using
+ * values defined below. The transition to a new socket state is
+ * handled by a function which verifies we aren't coming from an
+ * unexpected state.
+ *
+ * --------
+ * | NEW* | transient initial state
+ * --------
+ * | con_sock_state_init()
+ * v
+ * ----------
+ * | CLOSED | initialized, but no socket (and no
+ * ---------- TCP connection)
+ * ^ \
+ * | \ con_sock_state_connecting()
+ * | ----------------------
+ * | \
+ * + con_sock_state_closed() \
+ * |+--------------------------- \
+ * | \ \ \
+ * | ----------- \ \
+ * | | CLOSING | socket event; \ \
+ * | ----------- await close \ \
+ * | ^ \ |
+ * | | \ |
+ * | + con_sock_state_closing() \ |
+ * | / \ | |
+ * | / --------------- | |
+ * | / \ v v
+ * | / --------------
+ * | / -----------------| CONNECTING | socket created, TCP
+ * | | / -------------- connect initiated
+ * | | | con_sock_state_connected()
+ * | | v
+ * -------------
+ * | CONNECTED | TCP connection established
+ * -------------
+ *
+ * State values for ceph_connection->sock_state; NEW is assumed to be 0.
+ */
+
+#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
+#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */
+#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */
+#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
+#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
+
+/*
+ * connection states
+ */
+#define CON_STATE_CLOSED 1 /* -> PREOPEN */
+#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
+#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
+#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
+#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
+#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
+
+/*
+ * ceph_connection flag bits
+ */
+#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
+ * messages on errors */
+#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
+#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
+#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
+#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
+
+static bool con_flag_valid(unsigned long con_flag)
+{
+ switch (con_flag) {
+ case CON_FLAG_LOSSYTX:
+ case CON_FLAG_KEEPALIVE_PENDING:
+ case CON_FLAG_WRITE_PENDING:
+ case CON_FLAG_SOCK_CLOSED:
+ case CON_FLAG_BACKOFF:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ clear_bit(con_flag, &con->flags);
+}
+
+static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ set_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_and_clear_bit(con_flag, &con->flags);
+}
+
+static bool con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag)
+{
+ BUG_ON(!con_flag_valid(con_flag));
+
+ return test_and_set_bit(con_flag, &con->flags);
+}
+
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache *ceph_msg_cache;
+static struct kmem_cache *ceph_msg_data_cache;
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+/*
+ * When skipping (ignoring) a block of input we read it into a "skip
+ * buffer," which is this many bytes in size.
+ */
+#define SKIP_BUF_SIZE 1024
+
+static void queue_con(struct ceph_connection *con);
+static void cancel_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void con_fault(struct ceph_connection *con);
+
+/*
+ * Nicely render a sockaddr as a string. An array of formatted
+ * strings is used, to approximate reentrancy.
+ */
+#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */
+#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG)
+#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1)
+#define MAX_ADDR_STR_LEN 64 /* 54 is enough */
+
+static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
+static atomic_t addr_str_seq = ATOMIC_INIT(0);
+
+static struct page *zero_page; /* used in certain error cases */
+
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+ int i;
+ char *s;
+ struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+
+ i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
+ s = addr_str[i];
+
+ switch (ss->ss_family) {
+ case AF_INET:
+ snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+ ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+ ntohs(in6->sin6_port));
+ break;
+
+ default:
+ snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
+ ss->ss_family);
+ }
+
+ return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+ ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+static struct workqueue_struct *ceph_msgr_wq;
+
+static int ceph_msgr_slab_init(void)
+{
+ BUG_ON(ceph_msg_cache);
+ ceph_msg_cache = kmem_cache_create("ceph_msg",
+ sizeof (struct ceph_msg),
+ __alignof__(struct ceph_msg), 0, NULL);
+
+ if (!ceph_msg_cache)
+ return -ENOMEM;
+
+ BUG_ON(ceph_msg_data_cache);
+ ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
+ sizeof (struct ceph_msg_data),
+ __alignof__(struct ceph_msg_data),
+ 0, NULL);
+ if (ceph_msg_data_cache)
+ return 0;
+
+ kmem_cache_destroy(ceph_msg_cache);
+ ceph_msg_cache = NULL;
+
+ return -ENOMEM;
+}
+
+static void ceph_msgr_slab_exit(void)
+{
+ BUG_ON(!ceph_msg_data_cache);
+ kmem_cache_destroy(ceph_msg_data_cache);
+ ceph_msg_data_cache = NULL;
+
+ BUG_ON(!ceph_msg_cache);
+ kmem_cache_destroy(ceph_msg_cache);
+ ceph_msg_cache = NULL;
+}
+
+static void _ceph_msgr_exit(void)
+{
+ if (ceph_msgr_wq) {
+ destroy_workqueue(ceph_msgr_wq);
+ ceph_msgr_wq = NULL;
+ }
+
+ ceph_msgr_slab_exit();
+
+ BUG_ON(zero_page == NULL);
+ kunmap(zero_page);
+ page_cache_release(zero_page);
+ zero_page = NULL;
+}
+
+int ceph_msgr_init(void)
+{
+ BUG_ON(zero_page != NULL);
+ zero_page = ZERO_PAGE(0);
+ page_cache_get(zero_page);
+
+ if (ceph_msgr_slab_init())
+ return -ENOMEM;
+
+ /*
+ * The number of active work items is limited by the number of
+ * connections, so leave @max_active at default.
+ */
+ ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0);
+ if (ceph_msgr_wq)
+ return 0;
+
+ pr_err("msgr_init failed to create workqueue\n");
+ _ceph_msgr_exit();
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+void ceph_msgr_exit(void)
+{
+ BUG_ON(ceph_msgr_wq == NULL);
+
+ _ceph_msgr_exit();
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+void ceph_msgr_flush(void)
+{
+ flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+/* Connection socket state transition functions */
+
+static void con_sock_state_init(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
+
+static void con_sock_state_connecting(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTING);
+}
+
+static void con_sock_state_connected(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CONNECTED);
+}
+
+static void con_sock_state_closing(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSING);
+}
+
+static void con_sock_state_closed(struct ceph_connection *con)
+{
+ int old_state;
+
+ old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+ if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
+ old_state != CON_SOCK_STATE_CLOSING &&
+ old_state != CON_SOCK_STATE_CONNECTING &&
+ old_state != CON_SOCK_STATE_CLOSED))
+ printk("%s: unexpected old state %d\n", __func__, old_state);
+ dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+ CON_SOCK_STATE_CLOSED);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_sock_data_ready(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+ if (atomic_read(&con->msgr->stopping)) {
+ return;
+ }
+
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("%s on %p state = %lu, queueing work\n", __func__,
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_sock_write_space(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write,
+ * and there is sufficient space in the socket buffer to accept
+ * more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
+ * doesn't get called again until try_write() fills the socket
+ * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
+ * and net/core/stream.c:sk_stream_write_space().
+ */
+ if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
+ if (sk_stream_is_writeable(sk)) {
+ dout("%s %p queueing write work\n", __func__, con);
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ queue_con(con);
+ }
+ } else {
+ dout("%s %p nothing to write\n", __func__, con);
+ }
+}
+
+/* socket's state has changed */
+static void ceph_sock_state_change(struct sock *sk)
+{
+ struct ceph_connection *con = sk->sk_user_data;
+
+ dout("%s %p state = %lu sk_state = %u\n", __func__,
+ con, con->state, sk->sk_state);
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("%s TCP_CLOSE\n", __func__);
+ case TCP_CLOSE_WAIT:
+ dout("%s TCP_CLOSE_WAIT\n", __func__);
+ con_sock_state_closing(con);
+ con_flag_set(con, CON_FLAG_SOCK_CLOSED);
+ queue_con(con);
+ break;
+ case TCP_ESTABLISHED:
+ dout("%s TCP_ESTABLISHED\n", __func__);
+ con_sock_state_connected(con);
+ queue_con(con);
+ break;
+ default: /* Everything else is uninteresting */
+ break;
+ }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+ struct ceph_connection *con)
+{
+ struct sock *sk = sock->sk;
+ sk->sk_user_data = con;
+ sk->sk_data_ready = ceph_sock_data_ready;
+ sk->sk_write_space = ceph_sock_write_space;
+ sk->sk_state_change = ceph_sock_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static int ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+ struct socket *sock;
+ int ret;
+
+ BUG_ON(con->sock);
+ ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+ if (ret)
+ return ret;
+ sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+ lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+ set_sock_callbacks(sock, con);
+
+ dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+
+ con_sock_state_connecting(con);
+ ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+ O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ sock->sk->sk_state);
+ } else if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ ceph_pr_addr(&con->peer_addr.in_addr), ret);
+ sock_release(sock);
+ return ret;
+ }
+
+ if (con->msgr->tcp_nodelay) {
+ int optval = 1;
+
+ ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char *)&optval, sizeof(optval));
+ if (ret)
+ pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d",
+ ret);
+ }
+
+ con->sock = sock;
+ return 0;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ void *kaddr;
+ int ret;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+
+ kaddr = kmap(page);
+ BUG_ON(!kaddr);
+ ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
+ kunmap(page);
+
+ return ret;
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, int more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, bool more)
+{
+ int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
+ int ret;
+
+ ret = kernel_sendpage(sock, page, offset, size, flags);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ return ret;
+}
+
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, bool more)
+{
+ int ret;
+ struct kvec iov;
+
+ /* sendpage cannot properly handle pages with page_count == 0,
+ * we need to fallback to sendmsg if that's the case */
+ if (page_count(page) >= 1)
+ return __ceph_tcp_sendpage(sock, page, offset, size, more);
+
+ iov.iov_base = kmap(page) + offset;
+ iov.iov_len = size;
+ ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more);
+ kunmap(page);
+
+ return ret;
+}
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+ int rc = 0;
+
+ dout("con_close_socket on %p sock %p\n", con, con->sock);
+ if (con->sock) {
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ }
+
+ /*
+ * Forcibly clear the SOCK_CLOSED flag. It gets set
+ * independent of the connection mutex, and we could have
+ * received a socket close event before we had the chance to
+ * shut the socket down.
+ */
+ con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
+
+ con_sock_state_closed(con);
+ return rc;
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+ BUG_ON(msg->con == NULL);
+ msg->con->ops->put(msg->con);
+ msg->con = NULL;
+
+ ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+ /* reset connection, out_queue, msg_ and connect_seq */
+ /* discard existing out_queue and msg_seq */
+ dout("reset_connection %p\n", con);
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+ if (con->in_msg) {
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->ops->put(con);
+ }
+
+ con->connect_seq = 0;
+ con->out_seq = 0;
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ con->in_seq = 0;
+ con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ mutex_lock(&con->mutex);
+ dout("con_close %p peer %s\n", con,
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ con->state = CON_STATE_CLOSED;
+
+ con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
+ con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
+ con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+ con_flag_clear(con, CON_FLAG_BACKOFF);
+
+ reset_connection(con);
+ con->peer_global_seq = 0;
+ cancel_con(con);
+ con_close_socket(con);
+ mutex_unlock(&con->mutex);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con,
+ __u8 entity_type, __u64 entity_num,
+ struct ceph_entity_addr *addr)
+{
+ mutex_lock(&con->mutex);
+ dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+
+ WARN_ON(con->state != CON_STATE_CLOSED);
+ con->state = CON_STATE_PREOPEN;
+
+ con->peer_name.type = (__u8) entity_type;
+ con->peer_name.num = cpu_to_le64(entity_num);
+
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ mutex_unlock(&con->mutex);
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+ return con->connect_seq > 0;
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_connection *con, void *private,
+ const struct ceph_connection_operations *ops,
+ struct ceph_messenger *msgr)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ con->private = private;
+ con->ops = ops;
+ con->msgr = msgr;
+
+ con_sock_state_init(con);
+
+ mutex_init(&con->mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, con_work);
+
+ con->state = CON_STATE_CLOSED;
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);
+ return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_cur = &con->out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+{
+ int index;
+
+ index = con->out_kvec_left;
+ BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+
+ con->out_kvec[index].iov_len = size;
+ con->out_kvec[index].iov_base = data;
+ con->out_kvec_left++;
+ con->out_kvec_bytes += size;
+}
+
+#ifdef CONFIG_BLOCK
+
+/*
+ * For a bio data item, a piece is whatever remains of the next
+ * entry in the current bio iovec, or the first entry in the next
+ * bio in the list.
+ */
+static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct bio *bio;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+ bio = data->bio;
+ BUG_ON(!bio);
+
+ cursor->resid = min(length, data->bio_length);
+ cursor->bio = bio;
+ cursor->bvec_iter = bio->bi_iter;
+ cursor->last_piece =
+ cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
+}
+
+static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset,
+ size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct bio *bio;
+ struct bio_vec bio_vec;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+ bio = cursor->bio;
+ BUG_ON(!bio);
+
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
+
+ *page_offset = (size_t) bio_vec.bv_offset;
+ BUG_ON(*page_offset >= PAGE_SIZE);
+ if (cursor->last_piece) /* pagelist offset is always 0 */
+ *length = cursor->resid;
+ else
+ *length = (size_t) bio_vec.bv_len;
+ BUG_ON(*length > cursor->resid);
+ BUG_ON(*page_offset + *length > PAGE_SIZE);
+
+ return bio_vec.bv_page;
+}
+
+static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct bio *bio;
+ struct bio_vec bio_vec;
+
+ BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
+
+ bio = cursor->bio;
+ BUG_ON(!bio);
+
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
+
+ /* Advance the cursor offset */
+
+ BUG_ON(cursor->resid < bytes);
+ cursor->resid -= bytes;
+
+ bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+
+ if (bytes < bio_vec.bv_len)
+ return false; /* more bytes to process in this segment */
+
+ /* Move on to the next segment, and possibly the next bio */
+
+ if (!cursor->bvec_iter.bi_size) {
+ bio = bio->bi_next;
+ cursor->bio = bio;
+ if (bio)
+ cursor->bvec_iter = bio->bi_iter;
+ else
+ memset(&cursor->bvec_iter, 0,
+ sizeof(cursor->bvec_iter));
+ }
+
+ if (!cursor->last_piece) {
+ BUG_ON(!cursor->resid);
+ BUG_ON(!bio);
+ /* A short read is OK, so use <= rather than == */
+ if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
+ cursor->last_piece = true;
+ }
+
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
+/*
+ * For a page array, a piece comes from the first page in the array
+ * that has not already been fully consumed.
+ */
+static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ int page_count;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(!data->pages);
+ BUG_ON(!data->length);
+
+ cursor->resid = min(length, data->length);
+ page_count = calc_pages_for(data->alignment, (u64)data->length);
+ cursor->page_offset = data->alignment & ~PAGE_MASK;
+ cursor->page_index = 0;
+ BUG_ON(page_count > (int)USHRT_MAX);
+ cursor->page_count = (unsigned short)page_count;
+ BUG_ON(length > SIZE_MAX - cursor->page_offset);
+ cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
+}
+
+static struct page *
+ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ BUG_ON(cursor->page_offset >= PAGE_SIZE);
+
+ *page_offset = cursor->page_offset;
+ if (cursor->last_piece)
+ *length = cursor->resid;
+ else
+ *length = PAGE_SIZE - *page_offset;
+
+ return data->pages[cursor->page_index];
+}
+
+static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
+
+ BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
+
+ /* Advance the cursor page offset */
+
+ cursor->resid -= bytes;
+ cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
+ if (!bytes || cursor->page_offset)
+ return false; /* more bytes to process in the current page */
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ /* Move on to the next page; offset is already at 0 */
+
+ BUG_ON(cursor->page_index >= cursor->page_count);
+ cursor->page_index++;
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+ return true;
+}
+
+/*
+ * For a pagelist, a piece is whatever remains to be consumed in the
+ * first page in the list, or the front of the next page.
+ */
+static void
+ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+ struct page *page;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ if (!length)
+ return; /* pagelist can be assigned but empty */
+
+ BUG_ON(list_empty(&pagelist->head));
+ page = list_first_entry(&pagelist->head, struct page, lru);
+
+ cursor->resid = min(length, pagelist->length);
+ cursor->page = page;
+ cursor->offset = 0;
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+}
+
+static struct page *
+ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(!cursor->page);
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+
+ /* offset of first page in pagelist is always 0 */
+ *page_offset = cursor->offset & ~PAGE_MASK;
+ if (cursor->last_piece)
+ *length = cursor->resid;
+ else
+ *length = PAGE_SIZE - *page_offset;
+
+ return cursor->page;
+}
+
+static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ struct ceph_msg_data *data = cursor->data;
+ struct ceph_pagelist *pagelist;
+
+ BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+ pagelist = data->pagelist;
+ BUG_ON(!pagelist);
+
+ BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+ BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
+
+ /* Advance the cursor offset */
+
+ cursor->resid -= bytes;
+ cursor->offset += bytes;
+ /* offset of first page in pagelist is always 0 */
+ if (!bytes || cursor->offset & ~PAGE_MASK)
+ return false; /* more bytes to process in the current page */
+
+ if (!cursor->resid)
+ return false; /* no more data */
+
+ /* Move on to the next page */
+
+ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
+ cursor->page = list_entry_next(cursor->page, lru);
+ cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+ return true;
+}
+
+/*
+ * Message data is handled (sent or received) in pieces, where each
+ * piece resides on a single page. The network layer might not
+ * consume an entire piece at once. A data item's cursor keeps
+ * track of which piece is next to process and how much remains to
+ * be processed in that piece. It also tracks whether the current
+ * piece is the last one in the data item.
+ */
+static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
+{
+ size_t length = cursor->total_resid;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ ceph_msg_data_pagelist_cursor_init(cursor, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ ceph_msg_data_pages_cursor_init(cursor, length);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ ceph_msg_data_bio_cursor_init(cursor, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ /* BUG(); */
+ break;
+ }
+ cursor->need_crc = true;
+}
+
+static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+{
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ struct ceph_msg_data *data;
+
+ BUG_ON(!length);
+ BUG_ON(length > msg->data_length);
+ BUG_ON(list_empty(&msg->data));
+
+ cursor->data_head = &msg->data;
+ cursor->total_resid = length;
+ data = list_first_entry(&msg->data, struct ceph_msg_data, links);
+ cursor->data = data;
+
+ __ceph_msg_data_cursor_init(cursor);
+}
+
+/*
+ * Return the page containing the next piece to process for a given
+ * data item, and supply the page offset and length of that piece.
+ * Indicate whether this is the last piece in this data item.
+ */
+static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length,
+ bool *last_piece)
+{
+ struct page *page;
+
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ page = ceph_msg_data_pages_next(cursor, page_offset, length);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ page = ceph_msg_data_bio_next(cursor, page_offset, length);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ page = NULL;
+ break;
+ }
+ BUG_ON(!page);
+ BUG_ON(*page_offset + *length > PAGE_SIZE);
+ BUG_ON(!*length);
+ if (last_piece)
+ *last_piece = cursor->last_piece;
+
+ return page;
+}
+
+/*
+ * Returns true if the result moves the cursor on to the next piece
+ * of the data item.
+ */
+static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ bool new_piece;
+
+ BUG_ON(bytes > cursor->resid);
+ switch (cursor->data->type) {
+ case CEPH_MSG_DATA_PAGELIST:
+ new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
+ break;
+ case CEPH_MSG_DATA_PAGES:
+ new_piece = ceph_msg_data_pages_advance(cursor, bytes);
+ break;
+#ifdef CONFIG_BLOCK
+ case CEPH_MSG_DATA_BIO:
+ new_piece = ceph_msg_data_bio_advance(cursor, bytes);
+ break;
+#endif /* CONFIG_BLOCK */
+ case CEPH_MSG_DATA_NONE:
+ default:
+ BUG();
+ break;
+ }
+ cursor->total_resid -= bytes;
+
+ if (!cursor->resid && cursor->total_resid) {
+ WARN_ON(!cursor->last_piece);
+ BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
+ cursor->data = list_entry_next(cursor->data, links);
+ __ceph_msg_data_cursor_init(cursor);
+ new_piece = true;
+ }
+ cursor->need_crc = new_piece;
+
+ return new_piece;
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ BUG_ON(!msg);
+ BUG_ON(!data_len);
+
+ /* Initialize data cursor */
+
+ ceph_msg_data_cursor_init(msg, (size_t)data_len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->out_msg;
+ int v = con->out_kvec_left;
+
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+ if (con->ops->sign_message)
+ con->ops->sign_message(con, m);
+ else
+ m->footer.sig = 0;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+ } else {
+ m->old_footer.flags = m->footer.flags;
+ con->out_kvec[v].iov_len = sizeof(m->old_footer);
+ con->out_kvec_bytes += sizeof(m->old_footer);
+ }
+ con->out_kvec_left++;
+ con->out_more = m->more_to_follow;
+ con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u32 crc;
+
+ con_out_kvec_reset(con);
+ con->out_kvec_is_msg = true;
+ con->out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ &con->out_temp_ack);
+ }
+
+ BUG_ON(list_empty(&con->out_queue));
+ m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+ con->out_msg = m;
+ BUG_ON(m->con != con);
+
+ /* put message on sent list */
+ ceph_msg_get(m);
+ list_move_tail(&m->list_head, &con->out_sent);
+
+ /*
+ * only assign outgoing seq # if we haven't sent this message
+ * yet. if it is requeued, resend with it's original seq.
+ */
+ if (m->needs_out_seq) {
+ m->hdr.seq = cpu_to_le64(++con->out_seq);
+ m->needs_out_seq = false;
+ }
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ m->data_length);
+ BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+ /* fill in crc (except data pages), footer */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ con->out_msg->hdr.crc = cpu_to_le32(crc);
+ con->out_msg->footer.flags = 0;
+
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ con->out_msg->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+ crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+ } else
+ con->out_msg->footer.middle_crc = 0;
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+
+ /* is there a data payload? */
+ con->out_msg->footer.data_crc = 0;
+ if (m->data_length) {
+ prepare_message_data(con->out_msg, m->data_length);
+ con->out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con);
+ }
+
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ &con->out_temp_ack);
+
+ con->out_more = 1; /* more will follow.. eventually.. */
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof (con->out_temp_ack),
+ &con->out_temp_ack);
+
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con_out_kvec_reset(con);
+ con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con,
+ int *auth_proto)
+{
+ struct ceph_auth_handshake *auth;
+
+ if (!con->ops->get_authorizer) {
+ con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+ con->out_connect.authorizer_len = 0;
+ return NULL;
+ }
+
+ /* Can't hold the mutex while getting authorizer */
+ mutex_unlock(&con->mutex);
+ auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
+ mutex_lock(&con->mutex);
+
+ if (IS_ERR(auth))
+ return auth;
+ if (con->state != CON_STATE_NEGOTIATING)
+ return ERR_PTR(-EAGAIN);
+
+ con->auth_reply_buf = auth->authorizer_reply_buf;
+ con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
+ return auth;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ &con->msgr->my_enc_addr);
+
+ con->out_more = 0;
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+ unsigned int global_seq = get_global_seq(con->msgr, 0);
+ int proto;
+ int auth_proto;
+ struct ceph_auth_handshake *auth;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->connect_seq, global_seq, proto);
+
+ con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
+ con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+ con->out_connect.global_seq = cpu_to_le32(global_seq);
+ con->out_connect.protocol_version = cpu_to_le32(proto);
+ con->out_connect.flags = 0;
+
+ auth_proto = CEPH_AUTH_UNKNOWN;
+ auth = get_connect_authorizer(con, &auth_proto);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->out_connect.authorizer_len = auth ?
+ cpu_to_le32(auth->authorizer_buf_len) : 0;
+
+ con_out_kvec_add(con, sizeof (con->out_connect),
+ &con->out_connect);
+ if (auth && auth->authorizer_buf_len)
+ con_out_kvec_add(con, auth->authorizer_buf_len,
+ auth->authorizer_buf);
+
+ con->out_more = 0;
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+
+ return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+ while (con->out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+ con->out_kvec_left, con->out_kvec_bytes,
+ con->out_more);
+ if (ret <= 0)
+ goto out;
+ con->out_kvec_bytes -= ret;
+ if (con->out_kvec_bytes == 0)
+ break; /* done */
+
+ /* account for full iov entries consumed */
+ while (ret >= con->out_kvec_cur->iov_len) {
+ BUG_ON(!con->out_kvec_left);
+ ret -= con->out_kvec_cur->iov_len;
+ con->out_kvec_cur++;
+ con->out_kvec_left--;
+ }
+ /* and for a partially-consumed entry */
+ if (ret) {
+ con->out_kvec_cur->iov_len -= ret;
+ con->out_kvec_cur->iov_base += ret;
+ }
+ }
+ con->out_kvec_left = 0;
+ con->out_kvec_is_msg = false;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->out_kvec_bytes, con->out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+static u32 ceph_crc32c_page(u32 crc, struct page *page,
+ unsigned int page_offset,
+ unsigned int length)
+{
+ char *kaddr;
+
+ kaddr = kmap(page);
+ BUG_ON(kaddr == NULL);
+ crc = crc32c(crc, kaddr + page_offset, length);
+ kunmap(page);
+
+ return crc;
+}
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_message_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ bool do_datacrc = !con->msgr->nocrc;
+ u32 crc;
+
+ dout("%s %p msg %p\n", __func__, con, msg);
+
+ if (list_empty(&msg->data))
+ return -EINVAL;
+
+ /*
+ * Iterate through each page that contains data to be
+ * written, and send as much as possible for each.
+ *
+ * If we are calculating the data crc (the default), we will
+ * need to map the page. If we have no pages, they have
+ * been revoked, so use the zero page.
+ */
+ crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+ while (cursor->resid) {
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ bool last_piece;
+ bool need_crc;
+ int ret;
+
+ page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
+ &last_piece);
+ ret = ceph_tcp_sendpage(con->sock, page, page_offset,
+ length, last_piece);
+ if (ret <= 0) {
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+
+ return ret;
+ }
+ if (do_datacrc && cursor->need_crc)
+ crc = ceph_crc32c_page(crc, page, page_offset, length);
+ need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ }
+
+ dout("%s %p msg %p done\n", __func__, con, msg);
+
+ /* prepare and queue up footer, too */
+ if (do_datacrc)
+ msg->footer.data_crc = cpu_to_le32(crc);
+ else
+ msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con_out_kvec_reset(con);
+ prepare_write_message_footer(con);
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ while (con->out_skip > 0) {
+ size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
+
+ ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
+ if (ret <= 0)
+ goto out;
+ con->out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+ dout("prepare_read_seq %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+ int end, int size, void *object)
+{
+ while (con->in_base_pos < end) {
+ int left = end - con->in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+ /* peer's banner */
+ size = strlen(CEPH_BANNER);
+ end = size;
+ ret = read_partial(con, end, size, con->in_banner);
+ if (ret <= 0)
+ goto out;
+
+ size = sizeof (con->actual_peer_addr);
+ end += size;
+ ret = read_partial(con, end, size, &con->actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+
+ size = sizeof (con->peer_addr_for_me);
+ end += size;
+ ret = read_partial(con, end, size, &con->peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int size;
+ int end;
+ int ret;
+
+ dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+ size = sizeof (con->in_reply);
+ end = size;
+ ret = read_partial(con, end, size, &con->in_reply);
+ if (ret <= 0)
+ goto out;
+
+ size = le32_to_cpu(con->in_reply.authorizer_len);
+ end += size;
+ ret = read_partial(con, end, size, con->auth_reply_buf);
+ if (ret <= 0)
+ goto out;
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, (int)con->in_reply.tag,
+ le32_to_cpu(con->in_reply.connect_seq),
+ le32_to_cpu(con->in_reply.global_seq));
+out:
+ return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ case AF_INET6:
+ return
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ }
+ return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ }
+ return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ break;
+ }
+}
+
+/*
+ * Unlike other *_pton function semantics, zero indicates success.
+ */
+static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
+ char delim, const char **ipend)
+{
+ struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+
+ memset(ss, 0, sizeof(*ss));
+
+ if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
+ ss->ss_family = AF_INET;
+ return 0;
+ }
+
+ if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
+ ss->ss_family = AF_INET6;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Extract hostname string and resolve using kernel DNS facility.
+ */
+#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
+static int ceph_dns_resolve_name(const char *name, size_t namelen,
+ struct sockaddr_storage *ss, char delim, const char **ipend)
+{
+ const char *end, *delim_p;
+ char *colon_p, *ip_addr = NULL;
+ int ip_len, ret;
+
+ /*
+ * The end of the hostname occurs immediately preceding the delimiter or
+ * the port marker (':') where the delimiter takes precedence.
+ */
+ delim_p = memchr(name, delim, namelen);
+ colon_p = memchr(name, ':', namelen);
+
+ if (delim_p && colon_p)
+ end = delim_p < colon_p ? delim_p : colon_p;
+ else if (!delim_p && colon_p)
+ end = colon_p;
+ else {
+ end = delim_p;
+ if (!end) /* case: hostname:/ */
+ end = name + namelen;
+ }
+
+ if (end <= name)
+ return -EINVAL;
+
+ /* do dns_resolve upcall */
+ ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
+ if (ip_len > 0)
+ ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
+ else
+ ret = -ESRCH;
+
+ kfree(ip_addr);
+
+ *ipend = end;
+
+ pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
+ ret, ret ? "failed" : ceph_pr_addr(ss));
+
+ return ret;
+}
+#else
+static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
+ struct sockaddr_storage *ss, char delim, const char **ipend)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * Parse a server name (IP or hostname). If a valid IP address is not found
+ * then try to extract a hostname to resolve using userspace DNS upcall.
+ */
+static int ceph_parse_server_name(const char *name, size_t namelen,
+ struct sockaddr_storage *ss, char delim, const char **ipend)
+{
+ int ret;
+
+ ret = ceph_pton(name, namelen, ss, delim, ipend);
+ if (ret)
+ ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
+
+ return ret;
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count)
+{
+ int i, ret = -EINVAL;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ const char *ipend;
+ struct sockaddr_storage *ss = &addr[i].in_addr;
+ int port;
+ char delim = ',';
+
+ if (*p == '[') {
+ delim = ']';
+ p++;
+ }
+
+ ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend);
+ if (ret)
+ goto bad;
+ ret = -EINVAL;
+
+ p = ipend;
+
+ if (delim == ']') {
+ if (*p != ']') {
+ dout("missing matching ']'\n");
+ goto bad;
+ }
+ p++;
+ }
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port == 0)
+ port = CEPH_MON_PORT;
+ else if (port > 65535)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ addr_set_port(ss, port);
+
+ dout("parse_ips got %s\n", ceph_pr_addr(ss));
+
+ if (p == end)
+ break;
+ if (*p != ',')
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+static int process_banner(struct ceph_connection *con)
+{
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ ceph_decode_addr(&con->actual_peer_addr);
+ ceph_decode_addr(&con->peer_addr_for_me);
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%d, got %s/%d\n",
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ (int)le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->actual_peer_addr.in_addr),
+ (int)le32_to_cpu(con->actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+ int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+ memcpy(&con->msgr->inst.addr.in_addr,
+ &con->peer_addr_for_me.in_addr,
+ sizeof(con->peer_addr_for_me.in_addr));
+ addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = con->msgr->supported_features;
+ u64 req_feat = con->msgr->required_features;
+ u64 server_feat = ceph_sanitize_features(
+ le64_to_cpu(con->in_reply.features));
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+ switch (con->in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ reset_connection(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ reset_connection(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->auth_retry);
+ if (con->auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->in_reply.connect_seq));
+ pr_err("%s%lld %s connection reset\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ reset_connection(con);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CON_STATE_NEGOTIATING)
+ return -EAGAIN;
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+ le32_to_cpu(con->out_connect.connect_seq),
+ le32_to_cpu(con->in_reply.connect_seq));
+ con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.global_seq));
+ get_global_seq(con->msgr,
+ le32_to_cpu(con->in_reply.global_seq));
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_SEQ:
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ reset_connection(con);
+ return -1;
+ }
+
+ WARN_ON(con->state != CON_STATE_NEGOTIATING);
+ con->state = CON_STATE_OPEN;
+ con->auth_retry = 0; /* we authenticated; clear flag */
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+ con->connect_seq++;
+ con->peer_features = server_feat;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.connect_seq),
+ con->connect_seq);
+ WARN_ON(con->connect_seq !=
+ le32_to_cpu(con->in_reply.connect_seq));
+
+ if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ con_flag_set(con, CON_FLAG_LOSSYTX);
+
+ con->delay = 0; /* reset backoff memory */
+
+ if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+ prepare_write_seq(con);
+ prepare_read_seq(con);
+ } else {
+ prepare_read_tag(con);
+ }
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ con->error_msg = "protocol error, got WAIT as client";
+ return -1;
+
+ default:
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int size = sizeof (con->in_temp_ack);
+ int end = size;
+
+ return read_partial(con, end, size, &con->in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u64 ack = le64_to_cpu(con->in_temp_ack);
+ u64 seq;
+
+ while (!list_empty(&con->out_sent)) {
+ m = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ seq = le64_to_cpu(m->hdr.seq);
+ if (seq > ack)
+ break;
+ dout("got ack for seq %llu type %d at %p\n", seq,
+ le16_to_cpu(m->hdr.type), m);
+ m->ack_stamp = jiffies;
+ ceph_msg_remove(m);
+ }
+ prepare_read_tag(con);
+}
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ int ret, left;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ }
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base, section->iov_len);
+
+ return 1;
+}
+
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->in_msg;
+ struct ceph_msg_data_cursor *cursor = &msg->cursor;
+ const bool do_datacrc = !con->msgr->nocrc;
+ struct page *page;
+ size_t page_offset;
+ size_t length;
+ u32 crc = 0;
+ int ret;
+
+ BUG_ON(!msg);
+ if (list_empty(&msg->data))
+ return -EIO;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+ while (cursor->resid) {
+ page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
+ NULL);
+ ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+ if (ret <= 0) {
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+ }
+
+ if (do_datacrc)
+ crc = ceph_crc32c_page(crc, page, page_offset, ret);
+ (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ }
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return 1; /* must return > 0 to indicate success */
+}
+
+/*
+ * read (part of) a message.
+ */
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
+
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ int size;
+ int end;
+ int ret;
+ unsigned int front_len, middle_len, data_len;
+ bool do_datacrc = !con->msgr->nocrc;
+ bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+ u64 seq;
+ u32 crc;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ size = sizeof (con->in_hdr);
+ end = size;
+ ret = read_partial(con, end, size, &con->in_hdr);
+ if (ret <= 0)
+ return ret;
+
+ crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
+ if (cpu_to_le32(crc) != con->in_hdr.crc) {
+ pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+ crc, con->in_hdr.crc);
+ return -EBADMSG;
+ }
+
+ front_len = le32_to_cpu(con->in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+
+ /* verify seq# */
+ seq = le64_to_cpu(con->in_hdr.seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ seq, con->in_seq + 1);
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ } else if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("read_partial_message bad seq %lld expected %lld\n",
+ seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADE;
+ }
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ int skip = 0;
+
+ dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+ front_len, data_len);
+ ret = ceph_con_in_msg_alloc(con, &skip);
+ if (ret < 0)
+ return ret;
+
+ BUG_ON(!con->in_msg ^ skip);
+ if (con->in_msg && data_len > con->in_msg->data_length) {
+ pr_warn("%s skipping long message (%u > %zd)\n",
+ __func__, data_len, con->in_msg->data_length);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ skip = 1;
+ }
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg said skip message\n");
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 0;
+ }
+
+ BUG_ON(!con->in_msg);
+ BUG_ON(con->in_msg->con != con);
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ /* prepare for data payload, if any */
+
+ if (data_len)
+ prepare_message_data(con->in_msg, data_len);
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec,
+ middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* (page) data */
+ if (data_len) {
+ ret = read_partial_msg_data(con);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* footer */
+ if (need_sign)
+ size = sizeof(m->footer);
+ else
+ size = sizeof(m->old_footer);
+
+ end += size;
+ ret = read_partial(con, end, size, &m->footer);
+ if (ret <= 0)
+ return ret;
+
+ if (!need_sign) {
+ m->footer.flags = m->old_footer.flags;
+ m->footer.sig = 0;
+ }
+
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (do_datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ if (need_sign && con->ops->check_message_signature &&
+ con->ops->check_message_signature(con, m)) {
+ pr_err("read_partial_message %p signature check failed\n", m);
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
+ msg = con->in_msg;
+ con->in_msg = NULL;
+ con->ops->put(con);
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src;
+
+ con->in_seq++;
+ mutex_unlock(&con->mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+
+ mutex_lock(&con->mutex);
+}
+
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+ int ret = 1;
+
+ dout("try_write start %p state %lu\n", con, con->state);
+
+more:
+ dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+ /* open the socket first? */
+ if (con->state == CON_STATE_PREOPEN) {
+ BUG_ON(con->sock);
+ con->state = CON_STATE_CONNECTING;
+
+ con_out_kvec_reset(con);
+ prepare_write_banner(con);
+ prepare_read_banner(con);
+
+ BUG_ON(con->in_msg);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %lu\n",
+ con, con->state);
+ ret = ceph_tcp_connect(con);
+ if (ret < 0) {
+ con->error_msg = "connect error";
+ goto out;
+ }
+ }
+
+more_kvec:
+ /* kvec data queued? */
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto out;
+ }
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_message_data(con);
+ if (ret == 1)
+ goto more_kvec; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto out;
+ if (ret < 0) {
+ dout("try_write write_partial_message_data err %d\n",
+ ret);
+ goto out;
+ }
+ }
+
+do_next:
+ if (con->state == CON_STATE_OPEN) {
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+ dout("try_write nothing else to write.\n");
+ ret = 0;
+out:
+ dout("try_write done on %p ret %d\n", con, ret);
+ return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+ int ret = -1;
+
+more:
+ dout("try_read start on %p state %lu\n", con, con->state);
+ if (con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN)
+ return 0;
+
+ BUG_ON(!con->sock);
+
+ dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+ con->in_base_pos);
+
+ if (con->state == CON_STATE_CONNECTING) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_banner(con);
+ if (ret < 0)
+ goto out;
+
+ con->state = CON_STATE_NEGOTIATING;
+
+ /*
+ * Received banner is good, exchange connection info.
+ * Do not reset out_kvec, as sending our banner raced
+ * with receiving peer banner after connect completed.
+ */
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ goto out;
+ prepare_read_connect(con);
+
+ /* Send connection info before awaiting response */
+ goto out;
+ }
+
+ if (con->state == CON_STATE_NEGOTIATING) {
+ dout("try_read negotiating\n");
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto out;
+ ret = process_connect(con);
+ if (ret < 0)
+ goto out;
+ goto more;
+ }
+
+ WARN_ON(con->state != CON_STATE_OPEN);
+
+ if (con->in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ *
+ * FIXME: there must be a better way to do this!
+ */
+ static char buf[SKIP_BUF_SIZE];
+ int skip = min((int) sizeof (buf), -con->in_base_pos);
+
+ dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+ ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+ if (ret <= 0)
+ goto out;
+ con->in_base_pos += ret;
+ if (con->in_base_pos)
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+ if (ret <= 0)
+ goto out;
+ dout("try_read got tag %d\n", (int)con->in_tag);
+ switch (con->in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ con_close_socket(con);
+ con->state = CON_STATE_CLOSED;
+ goto out;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc";
+ /* fall through */
+ case -EBADE:
+ ret = -EIO;
+ break;
+ case -EIO:
+ con->error_msg = "io error";
+ break;
+ }
+ goto out;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ process_message(con);
+ if (con->state == CON_STATE_OPEN)
+ prepare_read_tag(con);
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_ACK ||
+ con->in_tag == CEPH_MSGR_TAG_SEQ) {
+ /*
+ * the final handshake seq exchange is semantically
+ * equivalent to an ACK
+ */
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto out;
+ process_ack(con);
+ goto more;
+ }
+
+out:
+ dout("try_read done on %p ret %d\n", con, ret);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection after the specified delay.
+ * Bump @con reference to avoid races with connection teardown.
+ * Returns 0 if work was queued, or an error code otherwise.
+ */
+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
+{
+ if (!con->ops->get(con)) {
+ dout("%s %p ref count 0\n", __func__, con);
+ return -ENOENT;
+ }
+
+ if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
+ dout("%s %p - already queued\n", __func__, con);
+ con->ops->put(con);
+ return -EBUSY;
+ }
+
+ dout("%s %p %lu\n", __func__, con, delay);
+ return 0;
+}
+
+static void queue_con(struct ceph_connection *con)
+{
+ (void) queue_con_delay(con, 0);
+}
+
+static void cancel_con(struct ceph_connection *con)
+{
+ if (cancel_delayed_work(&con->work)) {
+ dout("%s %p\n", __func__, con);
+ con->ops->put(con);
+ }
+}
+
+static bool con_sock_closed(struct ceph_connection *con)
+{
+ if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+ return false;
+
+#define CASE(x) \
+ case CON_STATE_ ## x: \
+ con->error_msg = "socket closed (con state " #x ")"; \
+ break;
+
+ switch (con->state) {
+ CASE(CLOSED);
+ CASE(PREOPEN);
+ CASE(CONNECTING);
+ CASE(NEGOTIATING);
+ CASE(OPEN);
+ CASE(STANDBY);
+ default:
+ pr_warn("%s con %p unrecognized state %lu\n",
+ __func__, con, con->state);
+ con->error_msg = "unrecognized con state";
+ BUG();
+ break;
+ }
+#undef CASE
+
+ return true;
+}
+
+static bool con_backoff(struct ceph_connection *con)
+{
+ int ret;
+
+ if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+ return false;
+
+ ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+ if (ret) {
+ dout("%s: con %p FAILED to back off %lu\n", __func__,
+ con, con->delay);
+ BUG_ON(ret == -ENOENT);
+ con_flag_set(con, CON_FLAG_BACKOFF);
+ }
+
+ return true;
+}
+
+/* Finish fault handling; con->mutex must *not* be held here */
+
+static void con_fault_finish(struct ceph_connection *con)
+{
+ /*
+ * in case we faulted due to authentication, invalidate our
+ * current tickets so that we can get new ones.
+ */
+ if (con->auth_retry && con->ops->invalidate_authorizer) {
+ dout("calling invalidate_authorizer()\n");
+ con->ops->invalidate_authorizer(con);
+ }
+
+ if (con->ops->fault)
+ con->ops->fault(con);
+}
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ bool fault;
+
+ mutex_lock(&con->mutex);
+ while (true) {
+ int ret;
+
+ if ((fault = con_sock_closed(con))) {
+ dout("%s: con %p SOCK_CLOSED\n", __func__, con);
+ break;
+ }
+ if (con_backoff(con)) {
+ dout("%s: con %p BACKOFF\n", __func__, con);
+ break;
+ }
+ if (con->state == CON_STATE_STANDBY) {
+ dout("%s: con %p STANDBY\n", __func__, con);
+ break;
+ }
+ if (con->state == CON_STATE_CLOSED) {
+ dout("%s: con %p CLOSED\n", __func__, con);
+ BUG_ON(con->sock);
+ break;
+ }
+ if (con->state == CON_STATE_PREOPEN) {
+ dout("%s: con %p PREOPEN\n", __func__, con);
+ BUG_ON(con->sock);
+ }
+
+ ret = try_read(con);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ continue;
+ if (!con->error_msg)
+ con->error_msg = "socket error on read";
+ fault = true;
+ break;
+ }
+
+ ret = try_write(con);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ continue;
+ if (!con->error_msg)
+ con->error_msg = "socket error on write";
+ fault = true;
+ }
+
+ break; /* If we make it to here, we're done */
+ }
+ if (fault)
+ con_fault(con);
+ mutex_unlock(&con->mutex);
+
+ if (fault)
+ con_fault_finish(con);
+
+ con->ops->put(con);
+}
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void con_fault(struct ceph_connection *con)
+{
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+ pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ con->error_msg = NULL;
+
+ WARN_ON(con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN);
+
+ con_close_socket(con);
+
+ if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+ dout("fault on LOSSYTX channel, marking CLOSED\n");
+ con->state = CON_STATE_CLOSED;
+ return;
+ }
+
+ if (con->in_msg) {
+ BUG_ON(con->in_msg->con != con);
+ con->in_msg->con = NULL;
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->ops->put(con);
+ }
+
+ /* Requeue anything that hasn't been acked */
+ list_splice_init(&con->out_sent, &con->out_queue);
+
+ /* If there are no messages queued or keepalive pending, place
+ * the connection in a STANDBY state */
+ if (list_empty(&con->out_queue) &&
+ !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+ con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+ con->state = CON_STATE_STANDBY;
+ } else {
+ /* retry after a delay. */
+ con->state = CON_STATE_PREOPEN;
+ if (con->delay == 0)
+ con->delay = BASE_DELAY_INTERVAL;
+ else if (con->delay < MAX_DELAY_INTERVAL)
+ con->delay *= 2;
+ con_flag_set(con, CON_FLAG_BACKOFF);
+ queue_con(con);
+ }
+}
+
+
+
+/*
+ * initialize a new messenger instance
+ */
+void ceph_messenger_init(struct ceph_messenger *msgr,
+ struct ceph_entity_addr *myaddr,
+ u64 supported_features,
+ u64 required_features,
+ bool nocrc,
+ bool tcp_nodelay)
+{
+ msgr->supported_features = supported_features;
+ msgr->required_features = required_features;
+
+ spin_lock_init(&msgr->global_seq_lock);
+
+ if (myaddr)
+ msgr->inst.addr = *myaddr;
+
+ /* select a random nonce */
+ msgr->inst.addr.type = 0;
+ get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+ encode_my_addr(msgr);
+ msgr->nocrc = nocrc;
+ msgr->tcp_nodelay = tcp_nodelay;
+
+ atomic_set(&msgr->stopping, 0);
+
+ dout("%s %p\n", __func__, msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_init);
+
+static void clear_standby(struct ceph_connection *con)
+{
+ /* come back from STANDBY? */
+ if (con->state == CON_STATE_STANDBY) {
+ dout("clear_standby %p and ++connect_seq\n", con);
+ con->state = CON_STATE_PREOPEN;
+ con->connect_seq++;
+ WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
+ WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+ }
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ /* set src+dst */
+ msg->hdr.src = con->msgr->inst.name;
+ BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+ msg->needs_out_seq = true;
+
+ mutex_lock(&con->mutex);
+
+ if (con->state == CON_STATE_CLOSED) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ mutex_unlock(&con->mutex);
+ return;
+ }
+
+ BUG_ON(msg->con != NULL);
+ msg->con = con->ops->get(con);
+ BUG_ON(msg->con == NULL);
+
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+
+ clear_standby(con);
+ mutex_unlock(&con->mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_msg_revoke(struct ceph_msg *msg)
+{
+ struct ceph_connection *con = msg->con;
+
+ if (!con)
+ return; /* Message not in our possession */
+
+ mutex_lock(&con->mutex);
+ if (!list_empty(&msg->list_head)) {
+ dout("%s %p msg %p - was on queue\n", __func__, con, msg);
+ list_del_init(&msg->list_head);
+ BUG_ON(msg->con == NULL);
+ msg->con->ops->put(msg->con);
+ msg->con = NULL;
+ msg->hdr.seq = 0;
+
+ ceph_msg_put(msg);
+ }
+ if (con->out_msg == msg) {
+ dout("%s %p msg %p - was sending\n", __func__, con, msg);
+ con->out_msg = NULL;
+ if (con->out_kvec_is_msg) {
+ con->out_skip = con->out_kvec_bytes;
+ con->out_kvec_is_msg = false;
+ }
+ msg->hdr.seq = 0;
+
+ ceph_msg_put(msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_msg_revoke_incoming(struct ceph_msg *msg)
+{
+ struct ceph_connection *con;
+
+ BUG_ON(msg == NULL);
+ if (!msg->con) {
+ dout("%s msg %p null con\n", __func__, msg);
+
+ return; /* Message not in our possession */
+ }
+
+ con = msg->con;
+ mutex_lock(&con->mutex);
+ if (con->in_msg == msg) {
+ unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
+ unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+
+ /* skip rest of message */
+ dout("%s %p msg %p revoked\n", __func__, con, msg);
+ con->in_base_pos = con->in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ } else {
+ dout("%s %p in_msg %p msg %p no-op\n",
+ __func__, con, con->in_msg, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ dout("con_keepalive %p\n", con);
+ mutex_lock(&con->mutex);
+ clear_standby(con);
+ mutex_unlock(&con->mutex);
+ if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
+ con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
+{
+ struct ceph_msg_data *data;
+
+ if (WARN_ON(!ceph_msg_data_type_valid(type)))
+ return NULL;
+
+ data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
+ if (data)
+ data->type = type;
+ INIT_LIST_HEAD(&data->links);
+
+ return data;
+}
+
+static void ceph_msg_data_destroy(struct ceph_msg_data *data)
+{
+ if (!data)
+ return;
+
+ WARN_ON(!list_empty(&data->links));
+ if (data->type == CEPH_MSG_DATA_PAGELIST)
+ ceph_pagelist_release(data->pagelist);
+ kmem_cache_free(ceph_msg_data_cache, data);
+}
+
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment)
+{
+ struct ceph_msg_data *data;
+
+ BUG_ON(!pages);
+ BUG_ON(!length);
+
+ data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
+ BUG_ON(!data);
+ data->pages = pages;
+ data->length = length;
+ data->alignment = alignment & ~PAGE_MASK;
+
+ list_add_tail(&data->links, &msg->data);
+ msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pages);
+
+void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
+ struct ceph_pagelist *pagelist)
+{
+ struct ceph_msg_data *data;
+
+ BUG_ON(!pagelist);
+ BUG_ON(!pagelist->length);
+
+ data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
+ BUG_ON(!data);
+ data->pagelist = pagelist;
+
+ list_add_tail(&data->links, &msg->data);
+ msg->data_length += pagelist->length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
+
+#ifdef CONFIG_BLOCK
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
+ size_t length)
+{
+ struct ceph_msg_data *data;
+
+ BUG_ON(!bio);
+
+ data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
+ BUG_ON(!data);
+ data->bio = bio;
+ data->bio_length = length;
+
+ list_add_tail(&data->links, &msg->data);
+ msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bio);
+#endif /* CONFIG_BLOCK */
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail)
+{
+ struct ceph_msg *m;
+
+ m = kmem_cache_zalloc(ceph_msg_cache, flags);
+ if (m == NULL)
+ goto out;
+
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->hdr.front_len = cpu_to_le32(front_len);
+
+ INIT_LIST_HEAD(&m->list_head);
+ kref_init(&m->kref);
+ INIT_LIST_HEAD(&m->data);
+
+ /* front */
+ if (front_len) {
+ m->front.iov_base = ceph_kvmalloc(front_len, flags);
+ if (m->front.iov_base == NULL) {
+ dout("ceph_msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front_alloc_len = m->front.iov_len = front_len;
+
+ dout("ceph_msg_new %p front %d\n", m, front_len);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ if (!can_fail) {
+ pr_err("msg_new can't create type %d front %d\n", type,
+ front_len);
+ WARN_ON(1);
+ } else {
+ dout("msg_new can't create type %d front %d\n", type,
+ front_len);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Allocate a message for receiving an incoming message on a
+ * connection, and save the result in con->in_msg. Uses the
+ * connection's private alloc_msg op if available.
+ *
+ * Returns 0 on success, or a negative error code.
+ *
+ * On success, if we set *skip = 1:
+ * - the next message should be skipped and ignored.
+ * - con->in_msg == NULL
+ * or if we set *skip = 0:
+ * - con->in_msg is non-null.
+ * On error (ENOMEM, EAGAIN, ...),
+ * - con->in_msg == NULL
+ */
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+{
+ struct ceph_msg_header *hdr = &con->in_hdr;
+ int middle_len = le32_to_cpu(hdr->middle_len);
+ struct ceph_msg *msg;
+ int ret = 0;
+
+ BUG_ON(con->in_msg != NULL);
+ BUG_ON(!con->ops->alloc_msg);
+
+ mutex_unlock(&con->mutex);
+ msg = con->ops->alloc_msg(con, hdr, skip);
+ mutex_lock(&con->mutex);
+ if (con->state != CON_STATE_OPEN) {
+ if (msg)
+ ceph_msg_put(msg);
+ return -EAGAIN;
+ }
+ if (msg) {
+ BUG_ON(*skip);
+ con->in_msg = msg;
+ con->in_msg->con = con->ops->get(con);
+ BUG_ON(con->in_msg->con == NULL);
+ } else {
+ /*
+ * Null message pointer means either we should skip
+ * this message or we couldn't allocate memory. The
+ * former is not an error.
+ */
+ if (*skip)
+ return 0;
+
+ con->error_msg = "error allocating memory for incoming message";
+ return -ENOMEM;
+ }
+ memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+ if (middle_len && !con->in_msg->middle) {
+ ret = ceph_alloc_middle(con, con->in_msg);
+ if (ret < 0) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ }
+
+ return ret;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+static void ceph_msg_free(struct ceph_msg *m)
+{
+ dout("%s %p\n", __func__, m);
+ kvfree(m->front.iov_base);
+ kmem_cache_free(ceph_msg_cache, m);
+}
+
+static void ceph_msg_release(struct kref *kref)
+{
+ struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+ LIST_HEAD(data);
+ struct list_head *links;
+ struct list_head *next;
+
+ dout("%s %p\n", __func__, m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+
+ list_splice_init(&m->data, &data);
+ list_for_each_safe(links, next, &data) {
+ struct ceph_msg_data *data;
+
+ data = list_entry(links, struct ceph_msg_data, links);
+ list_del_init(links);
+ ceph_msg_data_destroy(data);
+ }
+ m->data_length = 0;
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_free(m);
+}
+
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ atomic_read(&msg->kref.refcount));
+ kref_get(&msg->kref);
+ return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+void ceph_msg_put(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ atomic_read(&msg->kref.refcount));
+ kref_put(&msg->kref, ceph_msg_release);
+}
+EXPORT_SYMBOL(ceph_msg_put);
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+ pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+ msg->front_alloc_len, msg->data_length);
+ print_hex_dump(KERN_DEBUG, "header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->hdr, sizeof(msg->hdr), true);
+ print_hex_dump(KERN_DEBUG, " front: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->front.iov_base, msg->front.iov_len, true);
+ if (msg->middle)
+ print_hex_dump(KERN_DEBUG, "middle: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->middle->vec.iov_base,
+ msg->middle->vec.iov_len, true);
+ print_hex_dump(KERN_DEBUG, "footer: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000..2b3cf05e8
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1121 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+ struct ceph_monmap *m = NULL;
+ int i, err = -EINVAL;
+ struct ceph_fsid fsid;
+ u32 epoch, num_mon;
+ u16 version;
+ u32 len;
+
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+
+ dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+ ceph_decode_16_safe(&p, end, version, bad);
+
+ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(&p);
+
+ num_mon = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+ if (num_mon >= CEPH_MAX_MON)
+ goto bad;
+ m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+ m->fsid = fsid;
+ m->epoch = epoch;
+ m->num_mon = num_mon;
+ ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+ for (i = 0; i < num_mon; i++)
+ ceph_decode_addr(&m->mon_inst[i].addr);
+
+ dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+ m->num_mon);
+ for (i = 0; i < m->num_mon; i++)
+ dout("monmap_decode mon%d is %s\n", i,
+ ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+ return m;
+
+bad:
+ dout("monmap_decode failed with %d\n", err);
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++)
+ if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+ monc->pending_auth = 1;
+ monc->m_auth->front.iov_len = len;
+ monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_msg_revoke(monc->m_auth);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(&monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_msg_revoke(monc->m_auth);
+ ceph_msg_revoke_incoming(monc->m_auth_reply);
+ ceph_msg_revoke(monc->m_subscribe);
+ ceph_msg_revoke_incoming(monc->m_subscribe_ack);
+ ceph_con_close(&monc->con);
+ monc->cur_mon = -1;
+ monc->pending_auth = 0;
+ ceph_auth_reset(monc->auth);
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+ char r;
+ int ret;
+
+ if (monc->cur_mon < 0) {
+ get_random_bytes(&r, 1);
+ monc->cur_mon = r % monc->monmap->num_mon;
+ dout("open_session num=%d r=%d -> mon%d\n",
+ monc->monmap->num_mon, r, monc->cur_mon);
+ monc->sub_sent = 0;
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+ dout("open_session mon%d opening\n", monc->cur_mon);
+ ceph_con_open(&monc->con,
+ CEPH_ENTITY_TYPE_MON, monc->cur_mon,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /* initiatiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ dout("open_session mon%d already open\n", monc->cur_mon);
+ }
+ return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+ return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned int delay;
+
+ if (monc->cur_mon < 0 || __sub_expired(monc))
+ delay = 10 * HZ;
+ else
+ delay = 20 * HZ;
+ dout("__schedule_delayed after %u\n", delay);
+ schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+ (unsigned int)monc->sub_sent, __sub_expired(monc),
+ monc->want_next_osdmap);
+ if ((__sub_expired(monc) && !monc->sub_sent) ||
+ monc->want_next_osdmap == 1) {
+ struct ceph_msg *msg = monc->m_subscribe;
+ struct ceph_mon_subscribe_item *i;
+ void *p, *end;
+ int num;
+
+ p = msg->front.iov_base;
+ end = p + msg->front_alloc_len;
+
+ num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+ ceph_encode_32(&p, num);
+
+ if (monc->want_next_osdmap) {
+ dout("__send_subscribe to 'osdmap' %u\n",
+ (unsigned int)monc->have_osdmap);
+ ceph_encode_string(&p, end, "osdmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_osdmap);
+ i->onetime = 1;
+ p += sizeof(*i);
+ monc->want_next_osdmap = 2; /* requested */
+ }
+ if (monc->want_mdsmap) {
+ dout("__send_subscribe to 'mdsmap' %u+\n",
+ (unsigned int)monc->have_mdsmap);
+ ceph_encode_string(&p, end, "mdsmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_mdsmap);
+ i->onetime = 0;
+ p += sizeof(*i);
+ }
+ ceph_encode_string(&p, end, "monmap", 6);
+ i = p;
+ i->have = 0;
+ i->onetime = 0;
+ p += sizeof(*i);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_msg_revoke(msg);
+ ceph_con_send(&monc->con, ceph_msg_get(msg));
+
+ monc->sub_sent = jiffies | 1; /* never 0 */
+ }
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ unsigned int seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ pr_info("mon%d %s session established\n",
+ monc->cur_mon,
+ ceph_pr_addr(&monc->con.peer_addr.in_addr));
+ monc->hunting = false;
+ }
+ dout("handle_subscribe_ack after %d seconds\n", seconds);
+ monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+ monc->sub_sent = 0;
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_mdsmap = got;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_osdmap = got;
+ monc->want_next_osdmap = 0;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+ dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ mutex_lock(&monc->mutex);
+ if (!monc->want_next_osdmap)
+ monc->want_next_osdmap = 1;
+ if (monc->want_next_osdmap < 2)
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);
+}
+EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
+
+int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+ unsigned long timeout)
+{
+ unsigned long started = jiffies;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ while (monc->have_osdmap < epoch) {
+ mutex_unlock(&monc->mutex);
+
+ if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+ return -ETIMEDOUT;
+
+ ret = wait_event_interruptible_timeout(monc->client->auth_wq,
+ monc->have_osdmap >= epoch, timeout);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&monc->mutex);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_wait_osdmap);
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * We require the fsid and global_id in order to initialize our
+ * debugfs dir.
+ */
+static bool have_debugfs_info(struct ceph_mon_client *monc)
+{
+ dout("have_debugfs_info fsid %d globalid %lld\n",
+ (int)monc->client->have_fsid, monc->auth->global_id);
+ return monc->client->have_fsid && monc->auth->global_id > 0;
+}
+
+/*
+ * The monitor responds with mount ack indicate mount success. The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ void *p, *end;
+ int had_debugfs_info, init_debugfs = 0;
+
+ mutex_lock(&monc->mutex);
+
+ had_debugfs_info = have_debugfs_info(monc);
+
+ dout("handle_monmap\n");
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(p, end);
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));
+ goto out;
+ }
+
+ if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ kfree(monmap);
+ goto out;
+ }
+
+ client->monc.monmap = monmap;
+ kfree(old);
+
+ if (!client->have_fsid) {
+ client->have_fsid = true;
+ if (!had_debugfs_info && have_debugfs_info(monc)) {
+ pr_info("client%lld fsid %pU\n",
+ ceph_client_id(monc->client),
+ &monc->client->fsid);
+ init_debugfs = 1;
+ }
+ mutex_unlock(&monc->mutex);
+
+ if (init_debugfs) {
+ /*
+ * do debugfs initialization without mutex to avoid
+ * creating a locking dependency
+ */
+ ceph_debugfs_client_init(monc->client);
+ }
+
+ goto out_unlocked;
+ }
+out:
+ mutex_unlock(&monc->mutex);
+out_unlocked:
+ wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (currently statfs, mon_get_version)
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+ struct ceph_mon_client *monc, u64 tid)
+{
+ struct ceph_mon_generic_request *req;
+ struct rb_node *n = monc->generic_request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mon_generic_request, node);
+ if (tid < req->tid)
+ n = n->rb_left;
+ else if (tid > req->tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static void __insert_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *new)
+{
+ struct rb_node **p = &monc->generic_request_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_mon_generic_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_mon_generic_request, node);
+ if (new->tid < req->tid)
+ p = &(*p)->rb_left;
+ else if (new->tid > req->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+ struct ceph_mon_generic_request *req =
+ container_of(kref, struct ceph_mon_generic_request, kref);
+
+ if (req->reply)
+ ceph_msg_put(req->reply);
+ if (req->request)
+ ceph_msg_put(req->request);
+
+ kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(hdr->tid);
+ struct ceph_msg *m;
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (!req) {
+ dout("get_generic_reply %lld dne\n", tid);
+ *skip = 1;
+ m = NULL;
+ } else {
+ dout("get_generic_reply %lld got %p\n", tid, req->reply);
+ *skip = 0;
+ m = ceph_msg_get(req->reply);
+ /*
+ * we don't need to track the connection reading into
+ * this reply because we only have one open connection
+ * at a time, ever.
+ */
+ }
+ mutex_unlock(&monc->mutex);
+ return m;
+}
+
+static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
+ struct ceph_mon_generic_request *req)
+{
+ int err;
+
+ /* register request */
+ req->tid = tid != 0 ? tid : ++monc->last_tid;
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ __insert_generic_request(monc, req);
+ monc->num_generic_requests++;
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+ mutex_unlock(&monc->mutex);
+
+ err = wait_for_completion_interruptible(&req->completion);
+
+ mutex_lock(&monc->mutex);
+ rb_erase(&req->node, &monc->generic_request_tree);
+ monc->num_generic_requests--;
+
+ if (!err)
+ err = req->result;
+ return err;
+}
+
+static int do_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ int err;
+
+ mutex_lock(&monc->mutex);
+ err = __do_generic_request(monc, 0, req);
+ mutex_unlock(&monc->mutex);
+
+ return err;
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;
+ dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (req) {
+ *(struct ceph_statfs *)req->buf = reply->st;
+ req->result = 0;
+ get_generic_request(req);
+ }
+ mutex_unlock(&monc->mutex);
+ if (req) {
+ complete_all(&req->completion);
+ put_generic_request(req);
+ }
+ return;
+
+bad:
+ pr_err("corrupt statfs reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs *h;
+ int err;
+
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ kref_init(&req->kref);
+ req->buf = buf;
+ init_completion(&req->completion);
+
+ err = -ENOMEM;
+ req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
+ true);
+ if (!req->request)
+ goto out;
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
+ true);
+ if (!req->reply)
+ goto out;
+
+ /* fill out request */
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+
+ err = do_generic_request(monc, req);
+
+out:
+ put_generic_request(req);
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+static void handle_get_version_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front_alloc_len;
+ u64 handle;
+
+ dout("%s %p tid %llu\n", __func__, msg, tid);
+
+ ceph_decode_need(&p, end, 2*sizeof(u64), bad);
+ handle = ceph_decode_64(&p);
+ if (tid != 0 && tid != handle)
+ goto bad;
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, handle);
+ if (req) {
+ *(u64 *)req->buf = ceph_decode_64(&p);
+ req->result = 0;
+ get_generic_request(req);
+ }
+ mutex_unlock(&monc->mutex);
+ if (req) {
+ complete_all(&req->completion);
+ put_generic_request(req);
+ }
+
+ return;
+bad:
+ pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Send MMonGetVersion and wait for the reply.
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest)
+{
+ struct ceph_mon_generic_request *req;
+ void *p, *end;
+ u64 tid;
+ int err;
+
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ kref_init(&req->kref);
+ req->buf = newest;
+ init_completion(&req->completion);
+
+ req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
+ sizeof(u64) + sizeof(u32) + strlen(what),
+ GFP_NOFS, true);
+ if (!req->request) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
+ GFP_NOFS, true);
+ if (!req->reply) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ p = req->request->front.iov_base;
+ end = p + req->request->front_alloc_len;
+
+ /* fill out request */
+ mutex_lock(&monc->mutex);
+ tid = ++monc->last_tid;
+ ceph_encode_64(&p, tid); /* handle */
+ ceph_encode_string(&p, end, what, strlen(what));
+
+ err = __do_generic_request(monc, tid, req);
+
+ mutex_unlock(&monc->mutex);
+out:
+ put_generic_request(req);
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_get_version);
+
+/*
+ * Resend pending generic requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+ struct ceph_mon_generic_request *req;
+ struct rb_node *p;
+
+ for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_generic_request, node);
+ ceph_msg_revoke(req->request);
+ ceph_msg_revoke_incoming(req->reply);
+ ceph_con_send(&monc->con, ceph_msg_get(req->request));
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ dout("monc delayed_work\n");
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ __close_session(monc);
+ __open_session(monc); /* continue hunting */
+ } else {
+ ceph_con_keepalive(&monc->con);
+
+ __validate_auth(monc);
+
+ if (ceph_auth_is_authenticated(monc->auth))
+ __send_subscribe(monc);
+ }
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ struct ceph_options *opt = monc->client->options;
+ struct ceph_entity_addr *mon_addr = opt->mon_addr;
+ int num_mon = opt->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(sizeof(*monc->monmap) +
+ num_mon*sizeof(monc->monmap->mon_inst[0]),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ for (i = 0; i < num_mon; i++) {
+ monc->monmap->mon_inst[i].addr = mon_addr[i];
+ monc->monmap->mon_inst[i].addr.nonce = 0;
+ monc->monmap->mon_inst[i].name.type =
+ CEPH_ENTITY_TYPE_MON;
+ monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ }
+ monc->monmap->num_mon = num_mon;
+ return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+ int err = 0;
+
+ dout("init\n");
+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ monc->monmap = NULL;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ /* connection */
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->options->name,
+ cl->options->key);
+ if (IS_ERR(monc->auth)) {
+ err = PTR_ERR(monc->auth);
+ goto out_monmap;
+ }
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msgs */
+ err = -ENOMEM;
+ monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+ sizeof(struct ceph_mon_subscribe_ack),
+ GFP_NOFS, true);
+ if (!monc->m_subscribe_ack)
+ goto out_auth;
+
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+ true);
+ if (!monc->m_subscribe)
+ goto out_subscribe_ack;
+
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
+ true);
+ if (!monc->m_auth_reply)
+ goto out_subscribe;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
+ monc->pending_auth = 0;
+ if (!monc->m_auth)
+ goto out_auth_reply;
+
+ ceph_con_init(&monc->con, monc, &mon_con_ops,
+ &monc->client->msgr);
+
+ monc->cur_mon = -1;
+ monc->hunting = true;
+ monc->sub_renew_after = jiffies;
+ monc->sub_sent = 0;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ monc->generic_request_tree = RB_ROOT;
+ monc->num_generic_requests = 0;
+ monc->last_tid = 0;
+
+ monc->have_mdsmap = 0;
+ monc->have_osdmap = 0;
+ monc->want_next_osdmap = 1;
+ return 0;
+
+out_auth_reply:
+ ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+ ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+ ceph_msg_put(monc->m_subscribe_ack);
+out_auth:
+ ceph_auth_destroy(monc->auth);
+out_monmap:
+ kfree(monc->monmap);
+out:
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+
+ mutex_unlock(&monc->mutex);
+
+ /*
+ * flush msgr queue before we destroy ourselves to ensure that:
+ * - any work that references our embedded con is finished.
+ * - any osd_client or other work that may reference an authorizer
+ * finishes before we shut down the auth subsystem.
+ */
+ ceph_msgr_flush();
+
+ ceph_auth_destroy(monc->auth);
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msg_put(monc->m_auth_reply);
+ ceph_msg_put(monc->m_subscribe);
+ ceph_msg_put(monc->m_subscribe_ack);
+
+ kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ int ret;
+ int was_auth = 0;
+ int had_debugfs_info, init_debugfs = 0;
+
+ mutex_lock(&monc->mutex);
+ had_debugfs_info = have_debugfs_info(monc);
+ was_auth = ceph_auth_is_authenticated(monc->auth);
+ monc->pending_auth = 0;
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret < 0) {
+ monc->client->auth_err = ret;
+ wake_up_all(&monc->client->auth_wq);
+ } else if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
+ dout("authenticated, starting session\n");
+
+ monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr.inst.name.num =
+ cpu_to_le64(monc->auth->global_id);
+
+ __send_subscribe(monc);
+ __resend_generic_request(monc);
+ }
+
+ if (!had_debugfs_info && have_debugfs_info(monc)) {
+ pr_info("client%lld fsid %pU\n",
+ ceph_client_id(monc->client),
+ &monc->client->fsid);
+ init_debugfs = 1;
+ }
+ mutex_unlock(&monc->mutex);
+
+ if (init_debugfs) {
+ /*
+ * do debugfs initialization without mutex to avoid
+ * creating a locking dependency
+ */
+ ceph_debugfs_client_init(monc->client);
+ }
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ if (monc->pending_auth)
+ return 0;
+
+ ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret <= 0)
+ return ret; /* either an error, or no need to authenticate */
+ __send_prepared_auth_request(monc, ret);
+ return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = __validate_auth(monc);
+ mutex_unlock(&monc->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!monc)
+ return;
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ handle_get_version_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);
+ break;
+
+ default:
+ /* can the chained handler handle it? */
+ if (monc->client->extra_mon_dispatch &&
+ monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+ break;
+
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ struct ceph_msg *m = NULL;
+
+ *skip = 0;
+
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ m = ceph_msg_get(monc->m_subscribe_ack);
+ break;
+ case CEPH_MSG_STATFS_REPLY:
+ return get_generic_reply(con, hdr, skip);
+ case CEPH_MSG_AUTH_REPLY:
+ m = ceph_msg_get(monc->m_auth_reply);
+ break;
+ case CEPH_MSG_MON_GET_VERSION_REPLY:
+ if (le64_to_cpu(hdr->tid) != 0)
+ return get_generic_reply(con, hdr, skip);
+
+ /*
+ * Older OSDs don't set reply tid even if the orignal
+ * request had a non-zero tid. Workaround this weirdness
+ * by falling through to the allocate case.
+ */
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_MDS_MAP:
+ case CEPH_MSG_OSD_MAP:
+ m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ if (!m)
+ return NULL; /* ENOMEM--return skip == 0 */
+ break;
+ }
+
+ if (!m) {
+ pr_info("alloc_msg unknown type %d\n", type);
+ *skip = 1;
+ } else if (front_len > m->front_alloc_len) {
+ pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
+ front_len, m->front_alloc_len,
+ (unsigned int)con->peer_name.type,
+ le64_to_cpu(con->peer_name.num));
+ ceph_msg_put(m);
+ m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+ }
+
+ return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ if (!monc)
+ return;
+
+ dout("mon_fault\n");
+ mutex_lock(&monc->mutex);
+ if (!con->private)
+ goto out;
+
+ if (!monc->hunting)
+ pr_info("mon%d %s session lost, "
+ "hunting for new mon\n", monc->cur_mon,
+ ceph_pr_addr(&monc->con.peer_addr.in_addr));
+
+ __close_session(monc);
+ if (!monc->hunting) {
+ /* start hunting */
+ monc->hunting = true;
+ __open_session(monc);
+ } else {
+ /* already hunting, let's wait a bit */
+ __schedule_delayed(monc);
+ }
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * We can ignore refcounting on the connection struct, as all references
+ * will come from the messenger workqueue, which is drained prior to
+ * mon_client destruction.
+ */
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+ return con;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+ .get = con_get,
+ .put = con_put,
+ .dispatch = dispatch,
+ .fault = mon_fault,
+ .alloc_msg = mon_alloc_msg,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 000000000..ddec1c10a
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,83 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ struct ceph_msg *msg;
+
+ msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
+ if (!msg) {
+ dout("msgpool_alloc %s failed\n", pool->name);
+ } else {
+ dout("msgpool_alloc %s %p\n", pool->name, msg);
+ msg->pool = pool;
+ }
+ return msg;
+}
+
+static void msgpool_free(void *element, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ struct ceph_msg *msg = element;
+
+ dout("msgpool_release %s %p\n", pool->name, msg);
+ msg->pool = NULL;
+ ceph_msg_put(msg);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+ int front_len, int size, bool blocking, const char *name)
+{
+ dout("msgpool %s init\n", name);
+ pool->type = type;
+ pool->front_len = front_len;
+ pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
+ if (!pool->pool)
+ return -ENOMEM;
+ pool->name = name;
+ return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool %s destroy\n", pool->name);
+ mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+ int front_len)
+{
+ struct ceph_msg *msg;
+
+ if (front_len > pool->front_len) {
+ dout("msgpool_get %s need front %d, pool size is %d\n",
+ pool->name, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
+ }
+
+ msg = mempool_alloc(pool->pool, GFP_NOFS);
+ dout("msgpool_get %s %p\n", pool->name, msg);
+ return msg;
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ dout("msgpool_put %s %p\n", pool->name, msg);
+
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ kref_init(&msg->kref); /* retake single ref */
+ mempool_free(msg, pool->pool);
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 000000000..c4ec92392
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,3008 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN 4096
+#define OSD_OPREPLY_FRONT_LEN 512
+
+static struct kmem_cache *ceph_osd_request_cache;
+
+static const struct ceph_connection_operations osd_con_ops;
+
+static void __send_queued(struct ceph_osd_client *osdc);
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+static void __register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+static void __unregister_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+static void __unregister_linger_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+static void __enqueue_request(struct ceph_osd_request *req);
+static void __send_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
+ u64 *objnum, u64 *objoff, u64 *objlen)
+{
+ u64 orig_len = *plen;
+ int r;
+
+ /* object extent? */
+ r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
+ objoff, objlen);
+ if (r < 0)
+ return r;
+ if (*objlen < orig_len) {
+ *plen = *objlen;
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+ }
+
+ dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
+
+ return 0;
+}
+
+static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
+{
+ memset(osd_data, 0, sizeof (*osd_data));
+ osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
+}
+
+static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
+ struct page **pages, u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
+ osd_data->pages = pages;
+ osd_data->length = length;
+ osd_data->alignment = alignment;
+ osd_data->pages_from_pool = pages_from_pool;
+ osd_data->own_pages = own_pages;
+}
+
+static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
+ struct ceph_pagelist *pagelist)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
+ osd_data->pagelist = pagelist;
+}
+
+#ifdef CONFIG_BLOCK
+static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
+ struct bio *bio, size_t bio_length)
+{
+ osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
+ osd_data->bio = bio;
+ osd_data->bio_length = bio_length;
+}
+#endif /* CONFIG_BLOCK */
+
+#define osd_req_op_data(oreq, whch, typ, fld) \
+ ({ \
+ BUG_ON(whch >= (oreq)->r_num_ops); \
+ &(oreq)->r_ops[whch].typ.fld; \
+ })
+
+static struct ceph_osd_data *
+osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
+{
+ BUG_ON(which >= osd_req->r_num_ops);
+
+ return &osd_req->r_ops[which].raw_data_in;
+}
+
+struct ceph_osd_data *
+osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
+ unsigned int which)
+{
+ return osd_req_op_data(osd_req, which, extent, osd_data);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data);
+
+struct ceph_osd_data *
+osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
+ unsigned int which)
+{
+ return osd_req_op_data(osd_req, which, cls, response_data);
+}
+EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
+
+void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages,
+ u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_raw_data_in(osd_req, which);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
+
+void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages,
+ u64 length, u32 alignment,
+ bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
+
+void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
+ unsigned int which, struct ceph_pagelist *pagelist)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
+
+#ifdef CONFIG_BLOCK
+void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
+ unsigned int which, struct bio *bio, size_t bio_length)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_data_bio_init(osd_data, bio, bio_length);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
+#endif /* CONFIG_BLOCK */
+
+static void osd_req_op_cls_request_info_pagelist(
+ struct ceph_osd_request *osd_req,
+ unsigned int which, struct ceph_pagelist *pagelist)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_info);
+ ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+
+void osd_req_op_cls_request_data_pagelist(
+ struct ceph_osd_request *osd_req,
+ unsigned int which, struct ceph_pagelist *pagelist)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
+
+void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
+
+void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
+ unsigned int which, struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool, bool own_pages)
+{
+ struct ceph_osd_data *osd_data;
+
+ osd_data = osd_req_op_data(osd_req, which, cls, response_data);
+ ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+ pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
+
+static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
+{
+ switch (osd_data->type) {
+ case CEPH_OSD_DATA_TYPE_NONE:
+ return 0;
+ case CEPH_OSD_DATA_TYPE_PAGES:
+ return osd_data->length;
+ case CEPH_OSD_DATA_TYPE_PAGELIST:
+ return (u64)osd_data->pagelist->length;
+#ifdef CONFIG_BLOCK
+ case CEPH_OSD_DATA_TYPE_BIO:
+ return (u64)osd_data->bio_length;
+#endif /* CONFIG_BLOCK */
+ default:
+ WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
+ return 0;
+ }
+}
+
+static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
+{
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
+ int num_pages;
+
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ ceph_release_page_vector(osd_data->pages, num_pages);
+ }
+ ceph_osd_data_init(osd_data);
+}
+
+static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
+ unsigned int which)
+{
+ struct ceph_osd_req_op *op;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ op = &osd_req->r_ops[which];
+
+ switch (op->op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_WRITE:
+ ceph_osd_data_release(&op->extent.osd_data);
+ break;
+ case CEPH_OSD_OP_CALL:
+ ceph_osd_data_release(&op->cls.request_info);
+ ceph_osd_data_release(&op->cls.request_data);
+ ceph_osd_data_release(&op->cls.response_data);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ ceph_osd_data_release(&op->xattr.osd_data);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * requests
+ */
+static void ceph_osdc_release_request(struct kref *kref)
+{
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request, r_kref);
+ unsigned int which;
+
+ dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
+ req->r_request, req->r_reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!list_empty(&req->r_req_lru_item));
+ WARN_ON(!list_empty(&req->r_osd_item));
+ WARN_ON(!list_empty(&req->r_linger_item));
+ WARN_ON(!list_empty(&req->r_linger_osd_item));
+ WARN_ON(req->r_osd);
+
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply) {
+ ceph_msg_revoke_incoming(req->r_reply);
+ ceph_msg_put(req->r_reply);
+ }
+
+ for (which = 0; which < req->r_num_ops; which++)
+ osd_req_op_data_release(req, which);
+
+ ceph_put_snap_context(req->r_snapc);
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else
+ kmem_cache_free(ceph_osd_request_cache, req);
+
+}
+
+void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_get(&req->r_kref);
+}
+EXPORT_SYMBOL(ceph_osdc_get_request);
+
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+EXPORT_SYMBOL(ceph_osdc_put_request);
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+ struct ceph_snap_context *snapc,
+ unsigned int num_ops,
+ bool use_mempool,
+ gfp_t gfp_flags)
+{
+ struct ceph_osd_request *req;
+ struct ceph_msg *msg;
+ size_t msg_size;
+
+ BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
+ BUG_ON(num_ops > CEPH_OSD_MAX_OP);
+
+ msg_size = 4 + 4 + 8 + 8 + 4+8;
+ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += 1 + 8 + 4 + 4; /* pg_t */
+ msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
+ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
+ msg_size += 8; /* snapid */
+ msg_size += 8; /* snap_seq */
+ msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+ msg_size += 4;
+
+ if (use_mempool) {
+ req = mempool_alloc(osdc->req_mempool, gfp_flags);
+ memset(req, 0, sizeof(*req));
+ } else {
+ req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
+ }
+ if (req == NULL)
+ return NULL;
+
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+ req->r_num_ops = num_ops;
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ RB_CLEAR_NODE(&req->r_node);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ INIT_LIST_HEAD(&req->r_linger_item);
+ INIT_LIST_HEAD(&req->r_linger_osd_item);
+ INIT_LIST_HEAD(&req->r_req_lru_item);
+ INIT_LIST_HEAD(&req->r_osd_item);
+
+ req->r_base_oloc.pool = -1;
+ req->r_target_oloc.pool = -1;
+
+ /* create reply message */
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+ OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
+ if (!msg) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+ req->r_reply = msg;
+
+ /* create request message; allow space for oid */
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
+ if (!msg) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+ req->r_request = msg;
+
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static bool osd_req_opcode_valid(u16 opcode)
+{
+ switch (opcode) {
+#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true;
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
+#undef GENERATE_CASE
+ default:
+ return false;
+ }
+}
+
+/*
+ * This is an osd op init function for opcodes that have no data or
+ * other information associated with them. It also serves as a
+ * common init routine for all the other init functions, below.
+ */
+static struct ceph_osd_req_op *
+_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode)
+{
+ struct ceph_osd_req_op *op;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ BUG_ON(!osd_req_opcode_valid(opcode));
+
+ op = &osd_req->r_ops[which];
+ memset(op, 0, sizeof (*op));
+ op->op = opcode;
+
+ return op;
+}
+
+void osd_req_op_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode)
+{
+ (void)_osd_req_op_init(osd_req, which, opcode);
+}
+EXPORT_SYMBOL(osd_req_op_init);
+
+void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 offset, u64 length,
+ u64 truncate_size, u32 truncate_seq)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ size_t payload_len = 0;
+
+ BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
+ opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE);
+
+ op->extent.offset = offset;
+ op->extent.length = length;
+ op->extent.truncate_size = truncate_size;
+ op->extent.truncate_seq = truncate_seq;
+ if (opcode == CEPH_OSD_OP_WRITE)
+ payload_len += length;
+
+ op->payload_len = payload_len;
+}
+EXPORT_SYMBOL(osd_req_op_extent_init);
+
+void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 length)
+{
+ struct ceph_osd_req_op *op;
+ u64 previous;
+
+ BUG_ON(which >= osd_req->r_num_ops);
+ op = &osd_req->r_ops[which];
+ previous = op->extent.length;
+
+ if (length == previous)
+ return; /* Nothing to do */
+ BUG_ON(length > previous);
+
+ op->extent.length = length;
+ op->payload_len -= previous - length;
+}
+EXPORT_SYMBOL(osd_req_op_extent_update);
+
+void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *class, const char *method)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_pagelist *pagelist;
+ size_t payload_len = 0;
+ size_t size;
+
+ BUG_ON(opcode != CEPH_OSD_OP_CALL);
+
+ pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+ BUG_ON(!pagelist);
+ ceph_pagelist_init(pagelist);
+
+ op->cls.class_name = class;
+ size = strlen(class);
+ BUG_ON(size > (size_t) U8_MAX);
+ op->cls.class_len = size;
+ ceph_pagelist_append(pagelist, class, size);
+ payload_len += size;
+
+ op->cls.method_name = method;
+ size = strlen(method);
+ BUG_ON(size > (size_t) U8_MAX);
+ op->cls.method_len = size;
+ ceph_pagelist_append(pagelist, method, size);
+ payload_len += size;
+
+ osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
+
+ op->cls.argc = 0; /* currently unused */
+
+ op->payload_len = payload_len;
+}
+EXPORT_SYMBOL(osd_req_op_cls_init);
+
+int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *name, const void *value,
+ size_t size, u8 cmp_op, u8 cmp_mode)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+ struct ceph_pagelist *pagelist;
+ size_t payload_len;
+
+ BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
+
+ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ if (!pagelist)
+ return -ENOMEM;
+
+ ceph_pagelist_init(pagelist);
+
+ payload_len = strlen(name);
+ op->xattr.name_len = payload_len;
+ ceph_pagelist_append(pagelist, name, payload_len);
+
+ op->xattr.value_len = size;
+ ceph_pagelist_append(pagelist, value, size);
+ payload_len += size;
+
+ op->xattr.cmp_op = cmp_op;
+ op->xattr.cmp_mode = cmp_mode;
+
+ ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
+ op->payload_len = payload_len;
+ return 0;
+}
+EXPORT_SYMBOL(osd_req_op_xattr_init);
+
+void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 cookie, u64 version, int flag)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+
+ BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+
+ op->watch.cookie = cookie;
+ op->watch.ver = version;
+ if (opcode == CEPH_OSD_OP_WATCH && flag)
+ op->watch.flag = (u8)1;
+}
+EXPORT_SYMBOL(osd_req_op_watch_init);
+
+void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ CEPH_OSD_OP_SETALLOCHINT);
+
+ op->alloc_hint.expected_object_size = expected_object_size;
+ op->alloc_hint.expected_write_size = expected_write_size;
+
+ /*
+ * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+ * not worth a feature bit. Set FAILOK per-op flag to make
+ * sure older osds don't trip over an unsupported opcode.
+ */
+ op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
+
+static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
+ struct ceph_osd_data *osd_data)
+{
+ u64 length = ceph_osd_data_length(osd_data);
+
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ BUG_ON(length > (u64) SIZE_MAX);
+ if (length)
+ ceph_msg_data_add_pages(msg, osd_data->pages,
+ length, osd_data->alignment);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+ BUG_ON(!length);
+ ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
+#ifdef CONFIG_BLOCK
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
+ ceph_msg_data_add_bio(msg, osd_data->bio, length);
+#endif
+ } else {
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
+ }
+}
+
+static u64 osd_req_encode_op(struct ceph_osd_request *req,
+ struct ceph_osd_op *dst, unsigned int which)
+{
+ struct ceph_osd_req_op *src;
+ struct ceph_osd_data *osd_data;
+ u64 request_data_len = 0;
+ u64 data_length;
+
+ BUG_ON(which >= req->r_num_ops);
+ src = &req->r_ops[which];
+ if (WARN_ON(!osd_req_opcode_valid(src->op))) {
+ pr_err("unrecognized osd opcode %d\n", src->op);
+
+ return 0;
+ }
+
+ switch (src->op) {
+ case CEPH_OSD_OP_STAT:
+ osd_data = &src->raw_data_in;
+ ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ break;
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_TRUNCATE:
+ if (src->op == CEPH_OSD_OP_WRITE)
+ request_data_len = src->extent.length;
+ dst->extent.offset = cpu_to_le64(src->extent.offset);
+ dst->extent.length = cpu_to_le64(src->extent.length);
+ dst->extent.truncate_size =
+ cpu_to_le64(src->extent.truncate_size);
+ dst->extent.truncate_seq =
+ cpu_to_le32(src->extent.truncate_seq);
+ osd_data = &src->extent.osd_data;
+ if (src->op == CEPH_OSD_OP_WRITE)
+ ceph_osdc_msg_data_add(req->r_request, osd_data);
+ else
+ ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ break;
+ case CEPH_OSD_OP_CALL:
+ dst->cls.class_len = src->cls.class_len;
+ dst->cls.method_len = src->cls.method_len;
+ osd_data = &src->cls.request_info;
+ ceph_osdc_msg_data_add(req->r_request, osd_data);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
+ request_data_len = osd_data->pagelist->length;
+
+ osd_data = &src->cls.request_data;
+ data_length = ceph_osd_data_length(osd_data);
+ if (data_length) {
+ BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
+ dst->cls.indata_len = cpu_to_le32(data_length);
+ ceph_osdc_msg_data_add(req->r_request, osd_data);
+ src->payload_len += data_length;
+ request_data_len += data_length;
+ }
+ osd_data = &src->cls.response_data;
+ ceph_osdc_msg_data_add(req->r_reply, osd_data);
+ break;
+ case CEPH_OSD_OP_STARTSYNC:
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ dst->watch.cookie = cpu_to_le64(src->watch.cookie);
+ dst->watch.ver = cpu_to_le64(src->watch.ver);
+ dst->watch.flag = src->watch.flag;
+ break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ dst->alloc_hint.expected_object_size =
+ cpu_to_le64(src->alloc_hint.expected_object_size);
+ dst->alloc_hint.expected_write_size =
+ cpu_to_le64(src->alloc_hint.expected_write_size);
+ break;
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+ dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+ dst->xattr.cmp_op = src->xattr.cmp_op;
+ dst->xattr.cmp_mode = src->xattr.cmp_mode;
+ osd_data = &src->xattr.osd_data;
+ ceph_osdc_msg_data_add(req->r_request, osd_data);
+ request_data_len = osd_data->pagelist->length;
+ break;
+ case CEPH_OSD_OP_CREATE:
+ case CEPH_OSD_OP_DELETE:
+ break;
+ default:
+ pr_err("unsupported osd opcode %s\n",
+ ceph_osd_op_name(src->op));
+ WARN_ON(1);
+
+ return 0;
+ }
+
+ dst->op = cpu_to_le16(src->op);
+ dst->flags = cpu_to_le32(src->flags);
+ dst->payload_len = cpu_to_le32(src->payload_len);
+
+ return request_data_len;
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ unsigned int which, int num_ops,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ u32 truncate_seq,
+ u64 truncate_size,
+ bool use_mempool)
+{
+ struct ceph_osd_request *req;
+ u64 objnum = 0;
+ u64 objoff = 0;
+ u64 objlen = 0;
+ int r;
+
+ BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
+ opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
+ opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
+
+ req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
+ GFP_NOFS);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_flags = flags;
+
+ /* calculate max write size */
+ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
+ if (r < 0) {
+ ceph_osdc_put_request(req);
+ return ERR_PTR(r);
+ }
+
+ if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
+ osd_req_op_init(req, which, opcode);
+ } else {
+ u32 object_size = le32_to_cpu(layout->fl_object_size);
+ u32 object_base = off - objoff;
+ if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
+ if (truncate_size <= object_base) {
+ truncate_size = 0;
+ } else {
+ truncate_size -= object_base;
+ if (truncate_size > object_size)
+ truncate_size = object_size;
+ }
+ }
+ osd_req_op_extent_init(req, which, opcode, objoff, objlen,
+ truncate_size, truncate_seq);
+ }
+
+ req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+
+ snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
+ "%llx.%08llx", vino.ino, objnum);
+ req->r_base_oid.name_len = strlen(req->r_base_oid.name);
+
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *new)
+{
+ struct rb_node **p = &osdc->requests.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_osd_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid) {
+ if (!n->rb_left)
+ return req;
+ n = n->rb_left;
+ } else if (tid > req->r_tid) {
+ n = n->rb_right;
+ } else {
+ return req;
+ }
+ }
+ return NULL;
+}
+
+static void __kick_linger_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+ struct ceph_osd *osd = req->r_osd;
+
+ /*
+ * Linger requests need to be resent with a new tid to avoid
+ * the dup op detection logic on the OSDs. Achieve this with
+ * a re-register dance instead of open-coding.
+ */
+ ceph_osdc_get_request(req);
+ if (!list_empty(&req->r_linger_item))
+ __unregister_linger_request(osdc, req);
+ else
+ __unregister_request(osdc, req);
+ __register_request(osdc, req);
+ ceph_osdc_put_request(req);
+
+ /*
+ * Unless request has been registered as both normal and
+ * lingering, __unregister{,_linger}_request clears r_osd.
+ * However, here we need to preserve r_osd to make sure we
+ * requeue on the same OSD.
+ */
+ WARN_ON(req->r_osd || !osd);
+ req->r_osd = osd;
+
+ dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
+ __enqueue_request(req);
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ */
+static void __kick_osd_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ struct ceph_osd_request *req, *nreq;
+ LIST_HEAD(resend);
+ LIST_HEAD(resend_linger);
+ int err;
+
+ dout("%s osd%d\n", __func__, osd->o_osd);
+ err = __reset_osd(osdc, osd);
+ if (err)
+ return;
+
+ /*
+ * Build up a list of requests to resend by traversing the
+ * osd's list of requests. Requests for a given object are
+ * sent in tid order, and that is also the order they're
+ * kept on this list. Therefore all requests that are in
+ * flight will be found first, followed by all requests that
+ * have not yet been sent. And to resend requests while
+ * preserving this order we will want to put any sent
+ * requests back on the front of the osd client's unsent
+ * list.
+ *
+ * So we build a separate ordered list of already-sent
+ * requests for the affected osd and splice it onto the
+ * front of the osd client's unsent list. Once we've seen a
+ * request that has not yet been sent we're done. Those
+ * requests are already sitting right where they belong.
+ */
+ list_for_each_entry(req, &osd->o_requests, r_osd_item) {
+ if (!req->r_sent)
+ break;
+
+ if (!req->r_linger) {
+ dout("%s requeueing %p tid %llu\n", __func__, req,
+ req->r_tid);
+ list_move_tail(&req->r_req_lru_item, &resend);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ } else {
+ list_move_tail(&req->r_req_lru_item, &resend_linger);
+ }
+ }
+ list_splice(&resend, &osdc->req_unsent);
+
+ /*
+ * Both registered and not yet registered linger requests are
+ * enqueued with a new tid on the same OSD. We add/move them
+ * to req_unsent/o_requests at the end to keep things in tid
+ * order.
+ */
+ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
+ r_linger_osd_item) {
+ WARN_ON(!list_empty(&req->r_req_lru_item));
+ __kick_linger_request(req);
+ }
+
+ list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
+ __kick_linger_request(req);
+}
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+
+ if (!osd)
+ return;
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ __kick_osd_requests(osdc, osd);
+ __send_queued(osdc);
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
+{
+ struct ceph_osd *osd;
+
+ osd = kzalloc(sizeof(*osd), GFP_NOFS);
+ if (!osd)
+ return NULL;
+
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ osd->o_osd = onum;
+ RB_CLEAR_NODE(&osd->o_node);
+ INIT_LIST_HEAD(&osd->o_requests);
+ INIT_LIST_HEAD(&osd->o_linger_requests);
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ osd->o_incarnation = 1;
+
+ ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
+
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (atomic_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+ atomic_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);
+ return NULL;
+ }
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+ atomic_read(&osd->o_ref) - 1);
+ if (atomic_dec_and_test(&osd->o_ref)) {
+ struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+ if (osd->o_auth.authorizer)
+ ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
+ kfree(osd);
+ }
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
+ WARN_ON(!list_empty(&osd->o_requests));
+ WARN_ON(!list_empty(&osd->o_linger_requests));
+
+ list_del_init(&osd->o_osd_lru);
+ rb_erase(&osd->o_node, &osdc->osds);
+ RB_CLEAR_NODE(&osd->o_node);
+}
+
+static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
+
+ if (!RB_EMPTY_NODE(&osd->o_node)) {
+ ceph_con_close(&osd->o_con);
+ __remove_osd(osdc, osd);
+ put_osd(osd);
+ }
+}
+
+static void remove_all_osds(struct ceph_osd_client *osdc)
+{
+ dout("%s %p\n", __func__, osdc);
+ mutex_lock(&osdc->request_mutex);
+ while (!RB_EMPTY_ROOT(&osdc->osds)) {
+ struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
+ struct ceph_osd, o_node);
+ remove_osd(osdc, osd);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("%s %p\n", __func__, osd);
+ BUG_ON(!list_empty(&osd->o_osd_lru));
+
+ list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+}
+
+static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("%s %p\n", __func__, osd);
+
+ if (list_empty(&osd->o_requests) &&
+ list_empty(&osd->o_linger_requests))
+ __move_osd_to_lru(osdc, osd);
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+ dout("__remove_osd_from_lru %p\n", osd);
+ if (!list_empty(&osd->o_osd_lru))
+ list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd *osd, *nosd;
+
+ dout("__remove_old_osds %p\n", osdc);
+ mutex_lock(&osdc->request_mutex);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (time_before(jiffies, osd->lru_ttl))
+ break;
+ remove_osd(osdc, osd);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ struct ceph_entity_addr *peer_addr;
+
+ dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+ if (list_empty(&osd->o_requests) &&
+ list_empty(&osd->o_linger_requests)) {
+ remove_osd(osdc, osd);
+ return -ENODEV;
+ }
+
+ peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+ if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
+ !ceph_con_opened(&osd->o_con)) {
+ struct ceph_osd_request *req;
+
+ dout("osd addr hasn't changed and connection never opened, "
+ "letting msgr retry\n");
+ /* touch each r_stamp for handle_timeout()'s benfit */
+ list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ req->r_stamp = jiffies;
+
+ return -EAGAIN;
+ }
+
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
+ osd->o_incarnation++;
+
+ return 0;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+ struct rb_node **p = &osdc->osds.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd *osd = NULL;
+
+ dout("__insert_osd %p osd%d\n", new, new->o_osd);
+ while (*p) {
+ parent = *p;
+ osd = rb_entry(parent, struct ceph_osd, o_node);
+ if (new->o_osd < osd->o_osd)
+ p = &(*p)->rb_left;
+ else if (new->o_osd > osd->o_osd)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->o_node, parent, p);
+ rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+ struct ceph_osd *osd;
+ struct rb_node *n = osdc->osds.rb_node;
+
+ while (n) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ if (o < osd->o_osd)
+ n = n->rb_left;
+ else if (o > osd->o_osd)
+ n = n->rb_right;
+ else
+ return osd;
+ }
+ return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid. If this is the first request, set up
+ * the timeout event.
+ */
+static void __register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ req->r_tid = ++osdc->last_tid;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ dout("__register_request %p tid %lld\n", req, req->r_tid);
+ __insert_request(osdc, req);
+ ceph_osdc_get_request(req);
+ osdc->num_requests++;
+ if (osdc->num_requests == 1) {
+ dout(" first request, scheduling timeout\n");
+ __schedule_osd_timeout(osdc);
+ }
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ if (RB_EMPTY_NODE(&req->r_node)) {
+ dout("__unregister_request %p tid %lld not registered\n",
+ req, req->r_tid);
+ return;
+ }
+
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &osdc->requests);
+ RB_CLEAR_NODE(&req->r_node);
+ osdc->num_requests--;
+
+ if (req->r_osd) {
+ /* make sure the original request isn't in flight. */
+ ceph_msg_revoke(req->r_request);
+
+ list_del_init(&req->r_osd_item);
+ maybe_move_osd_to_lru(osdc, req->r_osd);
+ if (list_empty(&req->r_linger_osd_item))
+ req->r_osd = NULL;
+ }
+
+ list_del_init(&req->r_req_lru_item);
+ ceph_osdc_put_request(req);
+
+ if (osdc->num_requests == 0) {
+ dout(" no requests, canceling timeout\n");
+ __cancel_osd_timeout(osdc);
+ }
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+ if (req->r_sent && req->r_osd) {
+ ceph_msg_revoke(req->r_request);
+ req->r_sent = 0;
+ }
+}
+
+static void __register_linger_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ WARN_ON(!req->r_linger);
+
+ ceph_osdc_get_request(req);
+ list_add_tail(&req->r_linger_item, &osdc->req_linger);
+ if (req->r_osd)
+ list_add_tail(&req->r_linger_osd_item,
+ &req->r_osd->o_linger_requests);
+}
+
+static void __unregister_linger_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ WARN_ON(!req->r_linger);
+
+ if (list_empty(&req->r_linger_item)) {
+ dout("%s %p tid %llu not registered\n", __func__, req,
+ req->r_tid);
+ return;
+ }
+
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ list_del_init(&req->r_linger_item);
+
+ if (req->r_osd) {
+ list_del_init(&req->r_linger_osd_item);
+ maybe_move_osd_to_lru(osdc, req->r_osd);
+ if (list_empty(&req->r_osd_item))
+ req->r_osd = NULL;
+ }
+ ceph_osdc_put_request(req);
+}
+
+void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ if (!req->r_linger) {
+ dout("set_request_linger %p\n", req);
+ req->r_linger = 1;
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_set_request_linger);
+
+/*
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
+ *
+ * Caller should hold map_sem for read.
+ */
+static bool __req_should_be_paused(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+/*
+ * Calculate mapping of a request to a PG. Takes tiering into account.
+ */
+static int __calc_request_pg(struct ceph_osdmap *osdmap,
+ struct ceph_osd_request *req,
+ struct ceph_pg *pg_out)
+{
+ bool need_check_tiering;
+
+ need_check_tiering = false;
+ if (req->r_target_oloc.pool == -1) {
+ req->r_target_oloc = req->r_base_oloc; /* struct */
+ need_check_tiering = true;
+ }
+ if (req->r_target_oid.name_len == 0) {
+ ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
+ need_check_tiering = true;
+ }
+
+ if (need_check_tiering &&
+ (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
+ if (pi) {
+ if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ pi->read_tier >= 0)
+ req->r_target_oloc.pool = pi->read_tier;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ pi->write_tier >= 0)
+ req->r_target_oloc.pool = pi->write_tier;
+ }
+ /* !pi is caught in ceph_oloc_oid_to_pg() */
+ }
+
+ return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
+ &req->r_target_oid, pg_out);
+}
+
+static void __enqueue_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
+ dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ if (req->r_osd) {
+ __remove_osd_from_lru(req->r_osd);
+ list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
+ list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
+ } else {
+ list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
+ }
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately. If there is
+ * no up osd, set r_osd to NULL. Move the request to the appropriate list
+ * (unsent, homeless) or leave on in-flight lru.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req, int force_resend)
+{
+ struct ceph_pg pgid;
+ int acting[CEPH_PG_MAX_SIZE];
+ int num, o;
+ int err;
+ bool was_paused;
+
+ dout("map_request %p tid %lld\n", req, req->r_tid);
+
+ err = __calc_request_pg(osdc->osdmap, req, &pgid);
+ if (err) {
+ list_move(&req->r_req_lru_item, &osdc->req_notarget);
+ return err;
+ }
+ req->r_pgid = pgid;
+
+ num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
+ if (num < 0)
+ num = 0;
+
+ was_paused = req->r_paused;
+ req->r_paused = __req_should_be_paused(osdc, req);
+ if (was_paused && !req->r_paused)
+ force_resend = 1;
+
+ if ((!force_resend &&
+ req->r_osd && req->r_osd->o_osd == o &&
+ req->r_sent >= req->r_osd->o_incarnation &&
+ req->r_num_pg_osds == num &&
+ memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+ (req->r_osd == NULL && o == -1) ||
+ req->r_paused)
+ return 0; /* no change */
+
+ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
+ req->r_tid, pgid.pool, pgid.seed, o,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ /* record full pg acting set */
+ memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+ req->r_num_pg_osds = num;
+
+ if (req->r_osd) {
+ __cancel_request(req);
+ list_del_init(&req->r_osd_item);
+ list_del_init(&req->r_linger_osd_item);
+ req->r_osd = NULL;
+ }
+
+ req->r_osd = __lookup_osd(osdc, o);
+ if (!req->r_osd && o >= 0) {
+ err = -ENOMEM;
+ req->r_osd = create_osd(osdc, o);
+ if (!req->r_osd) {
+ list_move(&req->r_req_lru_item, &osdc->req_notarget);
+ goto out;
+ }
+
+ dout("map_request osd %p is osd%d\n", req->r_osd, o);
+ __insert_osd(osdc, req->r_osd);
+
+ ceph_con_open(&req->r_osd->o_con,
+ CEPH_ENTITY_TYPE_OSD, o,
+ &osdc->osdmap->osd_addr[o]);
+ }
+
+ __enqueue_request(req);
+ err = 1; /* osd or pg changed */
+
+out:
+ return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static void __send_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ void *p;
+
+ dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n",
+ req, req->r_tid, req->r_osd->o_osd, req->r_flags,
+ (unsigned long long)req->r_pgid.pool, req->r_pgid.seed);
+
+ /* fill in message content that changes each time we send it */
+ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
+ put_unaligned_le32(req->r_flags, req->r_request_flags);
+ put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
+ p = req->r_request_pgid;
+ ceph_encode_64(&p, req->r_pgid.pool);
+ ceph_encode_32(&p, req->r_pgid.seed);
+ put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
+ memcpy(req->r_request_reassert_version, &req->r_reassert_version,
+ sizeof(req->r_reassert_version));
+
+ req->r_stamp = jiffies;
+ list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+ ceph_msg_get(req->r_request); /* send consumes a ref */
+
+ req->r_sent = req->r_osd->o_incarnation;
+
+ ceph_con_send(&req->r_osd->o_con, req->r_request);
+}
+
+/*
+ * Send any requests in the queue (req_unsent).
+ */
+static void __send_queued(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req, *tmp;
+
+ dout("__send_queued\n");
+ list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item)
+ __send_request(osdc, req);
+}
+
+/*
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc;
+
+ __register_request(osdc, req);
+ req->r_sent = 0;
+ req->r_got_reply = 0;
+ rc = __map_request(osdc, req, 0);
+ if (rc < 0) {
+ if (nofail) {
+ dout("osdc_start_request failed map, "
+ " will retry %lld\n", req->r_tid);
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ return rc;
+ }
+
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ } else {
+ __send_queued(osdc);
+ }
+
+ return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds. When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected. Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_osd_request *req;
+ struct ceph_osd *osd;
+ unsigned long keepalive =
+ osdc->client->options->osd_keepalive_timeout * HZ;
+ struct list_head slow_osds;
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+ mutex_lock(&osdc->request_mutex);
+
+ /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
+ */
+ INIT_LIST_HEAD(&slow_osds);
+ list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+ if (time_before(jiffies, req->r_stamp + keepalive))
+ break;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ dout(" tid %llu is slow, will send keepalive on osd%d\n",
+ req->r_tid, osd->o_osd);
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ }
+ while (!list_empty(&slow_osds)) {
+ osd = list_entry(slow_osds.next, struct ceph_osd,
+ o_keepalive_item);
+ list_del_init(&osd->o_keepalive_item);
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ __schedule_osd_timeout(osdc);
+ __send_queued(osdc);
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client,
+ osds_timeout_work.work);
+ unsigned long delay =
+ osdc->client->options->osd_idle_ttl * HZ >> 2;
+
+ dout("osds timeout\n");
+ down_read(&osdc->map_sem);
+ remove_old_osds(osdc);
+ up_read(&osdc->map_sem);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(delay));
+}
+
+static int ceph_oloc_decode(void **p, void *end,
+ struct ceph_object_locator *oloc)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret = 0;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_v < 3) {
+ pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ if (struct_cv > 6) {
+ pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ oloc->pool = ceph_decode_64(p);
+ *p += 4; /* skip preferred */
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::key is set\n");
+ goto e_inval;
+ }
+
+ if (struct_v >= 5) {
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::nspace is set\n");
+ goto e_inval;
+ }
+ }
+
+ if (struct_v >= 6) {
+ s64 hash = ceph_decode_64(p);
+ if (hash != -1) {
+ pr_warn("ceph_object_locator::hash is set\n");
+ goto e_inval;
+ }
+ }
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_redirect_decode(void **p, void *end,
+ struct ceph_request_redirect *redir)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_cv > 1) {
+ pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ ret = ceph_oloc_decode(p, end, &redir->oloc);
+ if (ret)
+ goto out;
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_request_redirect::object_name is set\n");
+ goto e_inval;
+ }
+
+ len = ceph_decode_32(p);
+ *p += len; /* skip osd_instructions */
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+static void complete_request(struct ceph_osd_request *req)
+{
+ complete_all(&req->r_safe_completion); /* fsync waiter */
+}
+
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+ struct ceph_connection *con)
+{
+ void *p, *end;
+ struct ceph_osd_request *req;
+ struct ceph_request_redirect redir;
+ u64 tid;
+ int object_len;
+ unsigned int numops;
+ int payload_len, flags;
+ s32 result;
+ s32 retry_attempt;
+ struct ceph_pg pg;
+ int err;
+ u32 reassert_epoch;
+ u64 reassert_version;
+ u32 osdmap_epoch;
+ int already_completed;
+ u32 bytes;
+ unsigned int i;
+
+ tid = le64_to_cpu(msg->hdr.tid);
+ dout("handle_reply %p tid %llu\n", msg, tid);
+
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ ceph_decode_need(&p, end, 4, bad);
+ object_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, object_len, bad);
+ p += object_len;
+
+ err = ceph_decode_pgid(&p, end, &pg);
+ if (err)
+ goto bad;
+
+ ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
+ flags = ceph_decode_64(&p);
+ result = ceph_decode_32(&p);
+ reassert_epoch = ceph_decode_32(&p);
+ reassert_version = ceph_decode_64(&p);
+ osdmap_epoch = ceph_decode_32(&p);
+
+ /* lookup */
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (req == NULL) {
+ dout("handle_reply tid %llu dne\n", tid);
+ goto bad_mutex;
+ }
+ ceph_osdc_get_request(req);
+
+ dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
+ req, result);
+
+ ceph_decode_need(&p, end, 4, bad_put);
+ numops = ceph_decode_32(&p);
+ if (numops > CEPH_OSD_MAX_OP)
+ goto bad_put;
+ if (numops != req->r_num_ops)
+ goto bad_put;
+ payload_len = 0;
+ ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
+ for (i = 0; i < numops; i++) {
+ struct ceph_osd_op *op = p;
+ int len;
+
+ len = le32_to_cpu(op->payload_len);
+ req->r_reply_op_len[i] = len;
+ dout(" op %d has %d bytes\n", i, len);
+ payload_len += len;
+ p += sizeof(*op);
+ }
+ bytes = le32_to_cpu(msg->hdr.data_len);
+ if (payload_len != bytes) {
+ pr_warn("sum of op payload lens %d != data_len %d\n",
+ payload_len, bytes);
+ goto bad_put;
+ }
+
+ ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
+ retry_attempt = ceph_decode_32(&p);
+ for (i = 0; i < numops; i++)
+ req->r_reply_op_result[i] = ceph_decode_32(&p);
+
+ if (le16_to_cpu(msg->hdr.version) >= 6) {
+ p += 8 + 4; /* skip replay_version */
+ p += 8; /* skip user_version */
+
+ err = ceph_redirect_decode(&p, end, &redir);
+ if (err)
+ goto bad_put;
+ } else {
+ redir.oloc.pool = -1;
+ }
+
+ if (redir.oloc.pool != -1) {
+ dout("redirect pool %lld\n", redir.oloc.pool);
+
+ __unregister_request(osdc, req);
+
+ req->r_target_oloc = redir.oloc; /* struct */
+
+ /*
+ * Start redirect requests with nofail=true. If
+ * mapping fails, request will end up on the notarget
+ * list, waiting for the new osdmap (which can take
+ * a while), even though the original request mapped
+ * successfully. In the future we might want to follow
+ * original request's nofail setting here.
+ */
+ err = __ceph_osdc_start_request(osdc, req, true);
+ BUG_ON(err);
+
+ goto out_unlock;
+ }
+
+ already_completed = req->r_got_reply;
+ if (!req->r_got_reply) {
+ req->r_result = result;
+ dout("handle_reply result %d bytes %d\n", req->r_result,
+ bytes);
+ if (req->r_result == 0)
+ req->r_result = bytes;
+
+ /* in case this is a write and we need to replay, */
+ req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
+ req->r_reassert_version.version = cpu_to_le64(reassert_version);
+
+ req->r_got_reply = 1;
+ } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+ dout("handle_reply tid %llu dup ack\n", tid);
+ goto out_unlock;
+ }
+
+ dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+ if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
+ __register_linger_request(osdc, req);
+
+ /* either this is a read, or we got the safe response */
+ if (result < 0 ||
+ (flags & CEPH_OSD_FLAG_ONDISK) ||
+ ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+ __unregister_request(osdc, req);
+
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+
+ if (!already_completed) {
+ if (req->r_unsafe_callback &&
+ result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
+ req->r_unsafe_callback(req, true);
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete_all(&req->r_completion);
+ }
+
+ if (flags & CEPH_OSD_FLAG_ONDISK) {
+ if (req->r_unsafe_callback && already_completed)
+ req->r_unsafe_callback(req, false);
+ complete_request(req);
+ }
+
+out:
+ dout("req=%p req->r_linger=%d\n", req, req->r_linger);
+ ceph_osdc_put_request(req);
+ return;
+out_unlock:
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ goto out;
+
+bad_put:
+ req->r_result = -EIO;
+ __unregister_request(osdc, req);
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete_all(&req->r_completion);
+ complete_request(req);
+ ceph_osdc_put_request(req);
+bad_mutex:
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+bad:
+ pr_err("corrupt osd_op_reply got %d %d\n",
+ (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
+ ceph_msg_dump(msg);
+}
+
+static void reset_changed_osds(struct ceph_osd_client *osdc)
+{
+ struct rb_node *p, *n;
+
+ dout("%s %p\n", __func__, osdc);
+ for (p = rb_first(&osdc->osds); p; p = n) {
+ struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
+
+ n = rb_next(p);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap,
+ osd->o_osd),
+ sizeof(struct ceph_entity_addr)) != 0)
+ __reset_osd(osdc, osd);
+ }
+}
+
+/*
+ * Requeue requests whose mapping to an OSD has changed. If requests map to
+ * no osd, request a new map.
+ *
+ * Caller should hold map_sem for read.
+ */
+static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
+ bool force_resend_writes)
+{
+ struct ceph_osd_request *req, *nreq;
+ struct rb_node *p;
+ int needmap = 0;
+ int err;
+ bool force_resend_req;
+
+ dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
+ force_resend_writes ? " (force resend writes)" : "");
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; ) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+ p = rb_next(p);
+
+ /*
+ * For linger requests that have not yet been
+ * registered, move them to the linger list; they'll
+ * be sent to the osd in the loop below. Unregister
+ * the request before re-registering it as a linger
+ * request to ensure the __map_request() below
+ * will decide it needs to be sent.
+ */
+ if (req->r_linger && list_empty(&req->r_linger_item)) {
+ dout("%p tid %llu restart on osd%d\n",
+ req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+ ceph_osdc_get_request(req);
+ __unregister_request(osdc, req);
+ __register_linger_request(osdc, req);
+ ceph_osdc_put_request(req);
+ continue;
+ }
+
+ force_resend_req = force_resend ||
+ (force_resend_writes &&
+ req->r_flags & CEPH_OSD_FLAG_WRITE);
+ err = __map_request(osdc, req, force_resend_req);
+ if (err < 0)
+ continue; /* error */
+ if (req->r_osd == NULL) {
+ dout("%p tid %llu maps to no osd\n", req, req->r_tid);
+ needmap++; /* request a newer map */
+ } else if (err > 0) {
+ if (!req->r_linger) {
+ dout("%p tid %llu requeued on osd%d\n", req,
+ req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ }
+ }
+ }
+
+ list_for_each_entry_safe(req, nreq, &osdc->req_linger,
+ r_linger_item) {
+ dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
+
+ err = __map_request(osdc, req,
+ force_resend || force_resend_writes);
+ dout("__map_request returned %d\n", err);
+ if (err < 0)
+ continue; /* hrm! */
+ if (req->r_osd == NULL || err > 0) {
+ if (req->r_osd == NULL) {
+ dout("lingering %p tid %llu maps to no osd\n",
+ req, req->r_tid);
+ /*
+ * A homeless lingering request makes
+ * no sense, as it's job is to keep
+ * a particular OSD connection open.
+ * Request a newer map and kick the
+ * request, knowing that it won't be
+ * resent until we actually get a map
+ * that can tell us where to send it.
+ */
+ needmap++;
+ }
+
+ dout("kicking lingering %p tid %llu osd%d\n", req,
+ req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
+ __register_request(osdc, req);
+ __unregister_linger_request(osdc, req);
+ }
+ }
+ reset_changed_osds(osdc);
+ mutex_unlock(&osdc->request_mutex);
+
+ if (needmap) {
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
+}
+
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p, *end, *next;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_osdmap *newmap = NULL, *oldmap;
+ int err;
+ struct ceph_fsid fsid;
+ bool was_full;
+
+ dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ return;
+
+ down_write(&osdc->map_sem);
+
+ was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ next = p + maplen;
+ if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ newmap = osdmap_apply_incremental(&p, next,
+ osdc->osdmap,
+ &osdc->client->msgr);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ if (newmap != osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, 0, was_full);
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p = next;
+ nr_maps--;
+ }
+ if (newmap)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ int skipped_map = 0;
+
+ dout("taking full map %u len %d\n", epoch, maplen);
+ newmap = ceph_osdmap_decode(&p, p+maplen);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ oldmap = osdc->osdmap;
+ osdc->osdmap = newmap;
+ if (oldmap) {
+ if (oldmap->epoch + 1 < newmap->epoch)
+ skipped_map = 1;
+ ceph_osdmap_destroy(oldmap);
+ }
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, skipped_map, was_full);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+ if (!osdc->osdmap)
+ goto bad;
+done:
+ downgrade_write(&osdc->map_sem);
+ ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+
+ /*
+ * subscribe to subsequent osdmap updates if full to ensure
+ * we find out when we are no longer full and stop returning
+ * ENOSPC.
+ */
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+ mutex_lock(&osdc->request_mutex);
+ __send_queued(osdc);
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ wake_up_all(&osdc->client->auth_wq);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ ceph_msg_dump(msg);
+ up_write(&osdc->map_sem);
+}
+
+/*
+ * watch/notify callback event infrastructure
+ *
+ * These callbacks are used both for watch and notify operations.
+ */
+static void __release_event(struct kref *kref)
+{
+ struct ceph_osd_event *event =
+ container_of(kref, struct ceph_osd_event, kref);
+
+ dout("__release_event %p\n", event);
+ kfree(event);
+}
+
+static void get_event(struct ceph_osd_event *event)
+{
+ kref_get(&event->kref);
+}
+
+void ceph_osdc_put_event(struct ceph_osd_event *event)
+{
+ kref_put(&event->kref, __release_event);
+}
+EXPORT_SYMBOL(ceph_osdc_put_event);
+
+static void __insert_event(struct ceph_osd_client *osdc,
+ struct ceph_osd_event *new)
+{
+ struct rb_node **p = &osdc->event_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_event *event = NULL;
+
+ while (*p) {
+ parent = *p;
+ event = rb_entry(parent, struct ceph_osd_event, node);
+ if (new->cookie < event->cookie)
+ p = &(*p)->rb_left;
+ else if (new->cookie > event->cookie)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &osdc->event_tree);
+}
+
+static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
+ u64 cookie)
+{
+ struct rb_node **p = &osdc->event_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_event *event = NULL;
+
+ while (*p) {
+ parent = *p;
+ event = rb_entry(parent, struct ceph_osd_event, node);
+ if (cookie < event->cookie)
+ p = &(*p)->rb_left;
+ else if (cookie > event->cookie)
+ p = &(*p)->rb_right;
+ else
+ return event;
+ }
+ return NULL;
+}
+
+static void __remove_event(struct ceph_osd_event *event)
+{
+ struct ceph_osd_client *osdc = event->osdc;
+
+ if (!RB_EMPTY_NODE(&event->node)) {
+ dout("__remove_event removed %p\n", event);
+ rb_erase(&event->node, &osdc->event_tree);
+ ceph_osdc_put_event(event);
+ } else {
+ dout("__remove_event didn't remove %p\n", event);
+ }
+}
+
+int ceph_osdc_create_event(struct ceph_osd_client *osdc,
+ void (*event_cb)(u64, u64, u8, void *),
+ void *data, struct ceph_osd_event **pevent)
+{
+ struct ceph_osd_event *event;
+
+ event = kmalloc(sizeof(*event), GFP_NOIO);
+ if (!event)
+ return -ENOMEM;
+
+ dout("create_event %p\n", event);
+ event->cb = event_cb;
+ event->one_shot = 0;
+ event->data = data;
+ event->osdc = osdc;
+ INIT_LIST_HEAD(&event->osd_node);
+ RB_CLEAR_NODE(&event->node);
+ kref_init(&event->kref); /* one ref for us */
+ kref_get(&event->kref); /* one ref for the caller */
+
+ spin_lock(&osdc->event_lock);
+ event->cookie = ++osdc->event_count;
+ __insert_event(osdc, event);
+ spin_unlock(&osdc->event_lock);
+
+ *pevent = event;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_osdc_create_event);
+
+void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+{
+ struct ceph_osd_client *osdc = event->osdc;
+
+ dout("cancel_event %p\n", event);
+ spin_lock(&osdc->event_lock);
+ __remove_event(event);
+ spin_unlock(&osdc->event_lock);
+ ceph_osdc_put_event(event); /* caller's */
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_event);
+
+
+static void do_event_work(struct work_struct *work)
+{
+ struct ceph_osd_event_work *event_work =
+ container_of(work, struct ceph_osd_event_work, work);
+ struct ceph_osd_event *event = event_work->event;
+ u64 ver = event_work->ver;
+ u64 notify_id = event_work->notify_id;
+ u8 opcode = event_work->opcode;
+
+ dout("do_event_work completing %p\n", event);
+ event->cb(ver, notify_id, opcode, event->data);
+ dout("do_event_work completed %p\n", event);
+ ceph_osdc_put_event(event);
+ kfree(event_work);
+}
+
+
+/*
+ * Process osd watch notifications
+ */
+static void handle_watch_notify(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg)
+{
+ void *p, *end;
+ u8 proto_ver;
+ u64 cookie, ver, notify_id;
+ u8 opcode;
+ struct ceph_osd_event *event;
+ struct ceph_osd_event_work *event_work;
+
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ ceph_decode_8_safe(&p, end, proto_ver, bad);
+ ceph_decode_8_safe(&p, end, opcode, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+ ceph_decode_64_safe(&p, end, ver, bad);
+ ceph_decode_64_safe(&p, end, notify_id, bad);
+
+ spin_lock(&osdc->event_lock);
+ event = __find_event(osdc, cookie);
+ if (event) {
+ BUG_ON(event->one_shot);
+ get_event(event);
+ }
+ spin_unlock(&osdc->event_lock);
+ dout("handle_watch_notify cookie %lld ver %lld event %p\n",
+ cookie, ver, event);
+ if (event) {
+ event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
+ if (!event_work) {
+ pr_err("couldn't allocate event_work\n");
+ ceph_osdc_put_event(event);
+ return;
+ }
+ INIT_WORK(&event_work->work, do_event_work);
+ event_work->event = event;
+ event_work->ver = ver;
+ event_work->notify_id = notify_id;
+ event_work->opcode = opcode;
+
+ queue_work(osdc->notify_wq, &event_work->work);
+ }
+
+ return;
+
+bad:
+ pr_err("osdc handle_watch_notify corrupt msg\n");
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
+ struct ceph_snap_context *snapc, u64 snap_id,
+ struct timespec *mtime)
+{
+ struct ceph_msg *msg = req->r_request;
+ void *p;
+ size_t msg_size;
+ int flags = req->r_flags;
+ u64 data_len;
+ unsigned int i;
+
+ req->r_snapid = snap_id;
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ /* encode request */
+ msg->hdr.version = cpu_to_le16(4);
+
+ p = msg->front.iov_base;
+ ceph_encode_32(&p, 1); /* client_inc is always 1 */
+ req->r_request_osdmap_epoch = p;
+ p += 4;
+ req->r_request_flags = p;
+ p += 4;
+ if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+ ceph_encode_timespec(p, mtime);
+ p += sizeof(struct ceph_timespec);
+ req->r_request_reassert_version = p;
+ p += sizeof(struct ceph_eversion); /* will get filled in */
+
+ /* oloc */
+ ceph_encode_8(&p, 4);
+ ceph_encode_8(&p, 4);
+ ceph_encode_32(&p, 8 + 4 + 4);
+ req->r_request_pool = p;
+ p += 8;
+ ceph_encode_32(&p, -1); /* preferred */
+ ceph_encode_32(&p, 0); /* key len */
+
+ ceph_encode_8(&p, 1);
+ req->r_request_pgid = p;
+ p += 8 + 4;
+ ceph_encode_32(&p, -1); /* preferred */
+
+ /* oid */
+ ceph_encode_32(&p, req->r_base_oid.name_len);
+ memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
+ dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
+ req->r_base_oid.name, req->r_base_oid.name_len);
+ p += req->r_base_oid.name_len;
+
+ /* ops--can imply data */
+ ceph_encode_16(&p, (u16)req->r_num_ops);
+ data_len = 0;
+ for (i = 0; i < req->r_num_ops; i++) {
+ data_len += osd_req_encode_op(req, p, i);
+ p += sizeof(struct ceph_osd_op);
+ }
+
+ /* snaps */
+ ceph_encode_64(&p, req->r_snapid);
+ ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
+ ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
+ if (req->r_snapc) {
+ for (i = 0; i < snapc->num_snaps; i++) {
+ ceph_encode_64(&p, req->r_snapc->snaps[i]);
+ }
+ }
+
+ req->r_request_attempts = p;
+ p += 4;
+
+ /* data */
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ u16 data_off;
+
+ /*
+ * The header "data_off" is a hint to the receiver
+ * allowing it to align received data into its
+ * buffers such that there's no need to re-copy
+ * it before writing it to disk (direct I/O).
+ */
+ data_off = (u16) (off & 0xffff);
+ req->r_request->hdr.data_off = cpu_to_le16(data_off);
+ }
+ req->r_request->hdr.data_len = cpu_to_le32(data_len);
+
+ BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+ msg_size = p - msg->front.iov_base;
+ msg->front.iov_len = msg_size;
+ msg->hdr.front_len = cpu_to_le32(msg_size);
+
+ dout("build_request msg_size was %d\n", (int)msg_size);
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc;
+
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+
+ rc = __ceph_osdc_start_request(osdc, req, nofail);
+
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * Unregister a registered request. The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
+ mutex_lock(&osdc->request_mutex);
+ if (req->r_linger)
+ __unregister_linger_request(osdc, req);
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+
+ dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ int rc;
+
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+
+ rc = wait_for_completion_interruptible(&req->r_completion);
+ if (rc < 0) {
+ dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
+ ceph_osdc_cancel_request(req);
+ complete_request(req);
+ return rc;
+ }
+
+ dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
+ req->r_result);
+ return req->r_result;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req;
+ u64 last_tid, next_tid = 0;
+
+ mutex_lock(&osdc->request_mutex);
+ last_tid = osdc->last_tid;
+ while (1) {
+ req = __lookup_request_ge(osdc, next_tid);
+ if (!req)
+ break;
+ if (req->r_tid > last_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync waiting on tid %llu (last is %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&osdc->request_mutex);
+ ceph_osdc_put_request(req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * Call all pending notify callbacks - for use after a watch is
+ * unregistered, to make sure no more callbacks for it will be invoked
+ */
+void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
+{
+ flush_workqueue(osdc->notify_wq);
+}
+EXPORT_SYMBOL(ceph_osdc_flush_notifies);
+
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ osdc->osdmap = NULL;
+ init_rwsem(&osdc->map_sem);
+ init_completion(&osdc->map_waiters);
+ osdc->last_requested_map = 0;
+ mutex_init(&osdc->request_mutex);
+ osdc->last_tid = 0;
+ osdc->osds = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->osd_lru);
+ osdc->requests = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->req_lru);
+ INIT_LIST_HEAD(&osdc->req_unsent);
+ INIT_LIST_HEAD(&osdc->req_notarget);
+ INIT_LIST_HEAD(&osdc->req_linger);
+ osdc->num_requests = 0;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+ INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+ spin_lock_init(&osdc->event_lock);
+ osdc->event_tree = RB_ROOT;
+ osdc->event_count = 0;
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+
+ err = -ENOMEM;
+ osdc->req_mempool = mempool_create_kmalloc_pool(10,
+ sizeof(struct ceph_osd_request));
+ if (!osdc->req_mempool)
+ goto out;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+ OSD_OP_FRONT_LEN, 10, true,
+ "osd_op");
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
+ OSD_OPREPLY_FRONT_LEN, 10, true,
+ "osd_op_reply");
+ if (err < 0)
+ goto out_msgpool;
+
+ err = -ENOMEM;
+ osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
+ if (!osdc->notify_wq)
+ goto out_msgpool_reply;
+
+ return 0;
+
+out_msgpool_reply:
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);
+out:
+ return err;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ flush_workqueue(osdc->notify_wq);
+ destroy_workqueue(osdc->notify_wq);
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ cancel_delayed_work_sync(&osdc->osds_timeout_work);
+ if (osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = NULL;
+ }
+ remove_all_osds(osdc);
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages, int page_align)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, truncate_seq, truncate_size,
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+
+ osd_req_op_extent_osd_data_pages(req, 0,
+ pages, *plen, page_align, false, false);
+
+ dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
+ off, *plen, *plen, page_align);
+
+ ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+ int page_align = off & ~PAGE_MASK;
+
+ BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
+ snapc, truncate_seq, truncate_size,
+ true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ false, false);
+ dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+ ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
+
+ rc = ceph_osdc_start_request(osdc, req, true);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+int ceph_osdc_setup(void)
+{
+ BUG_ON(ceph_osd_request_cache);
+ ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
+ sizeof (struct ceph_osd_request),
+ __alignof__(struct ceph_osd_request),
+ 0, NULL);
+
+ return ceph_osd_request_cache ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(ceph_osdc_setup);
+
+void ceph_osdc_cleanup(void)
+{
+ BUG_ON(!ceph_osd_request_cache);
+ kmem_cache_destroy(ceph_osd_request_cache);
+ ceph_osd_request_cache = NULL;
+}
+EXPORT_SYMBOL(ceph_osdc_cleanup);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!osd)
+ goto out;
+ osdc = osd->o_osdc;
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osdc, msg, con);
+ break;
+ case CEPH_MSG_WATCH_NOTIFY:
+ handle_watch_notify(osdc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+out:
+ ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply. set up reply message
+ * pages.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_msg *m;
+ struct ceph_osd_request *req;
+ int front_len = le32_to_cpu(hdr->front_len);
+ int data_len = le32_to_cpu(hdr->data_len);
+ u64 tid;
+
+ tid = le64_to_cpu(hdr->tid);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (!req) {
+ *skip = 1;
+ m = NULL;
+ dout("get_reply unknown tid %llu from osd%d\n", tid,
+ osd->o_osd);
+ goto out;
+ }
+
+ if (req->r_reply->con)
+ dout("%s revoking msg %p from old con %p\n", __func__,
+ req->r_reply, req->r_reply->con);
+ ceph_msg_revoke_incoming(req->r_reply);
+
+ if (front_len > req->r_reply->front_alloc_len) {
+ pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
+ front_len, req->r_reply->front_alloc_len,
+ (unsigned int)con->peer_name.type,
+ le64_to_cpu(con->peer_name.num));
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
+ false);
+ if (!m)
+ goto out;
+ ceph_msg_put(req->r_reply);
+ req->r_reply = m;
+ }
+ m = ceph_msg_get(req->r_reply);
+
+ if (data_len > 0) {
+ struct ceph_osd_data *osd_data;
+
+ /*
+ * XXX This is assuming there is only one op containing
+ * XXX page data. Probably OK for reads, but this
+ * XXX ought to be done more generally.
+ */
+ osd_data = osd_req_op_extent_osd_data(req, 0);
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ if (osd_data->pages &&
+ unlikely(osd_data->length < data_len)) {
+
+ pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
+ tid, data_len, osd_data->length);
+ *skip = 1;
+ ceph_msg_put(m);
+ m = NULL;
+ goto out;
+ }
+ }
+ }
+ *skip = 0;
+ dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+ mutex_unlock(&osdc->request_mutex);
+ return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ *skip = 0;
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_WATCH_NOTIFY:
+ return ceph_msg_new(type, front, GFP_NOFS, false);
+ case CEPH_MSG_OSD_OPREPLY:
+ return get_reply(con, hdr, skip);
+ default:
+ pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+ osd->o_osd);
+ *skip = 1;
+ return NULL;
+ }
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;
+ return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+/*
+ * Note: returned pointer is the address of a structure that's
+ * managed separately. Caller must *not* attempt to free it.
+ */
+static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
+ int *proto, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(ac, auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer) {
+ int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ } else {
+ int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+ auth);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ *proto = ac->protocol;
+
+ return auth;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+ return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ return ceph_auth_sign_message(auth, msg);
+}
+
+static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ return ceph_auth_check_message_signature(auth, msg);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+ .get = get_osd_con,
+ .put = put_osd_con,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .alloc_msg = alloc_msg,
+ .sign_message = sign_message,
+ .check_message_signature = check_message_signature,
+ .fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 000000000..4a3125836
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1754 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+ if (!len)
+ return str;
+
+ if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+ snprintf(str, len, "exists, up");
+ else if (state & CEPH_OSD_EXISTS)
+ snprintf(str, len, "exists");
+ else if (state & CEPH_OSD_UP)
+ snprintf(str, len, "up");
+ else
+ snprintf(str, len, "doesn't exist");
+
+ return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned int t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
+ pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_8_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw2_bucket(void **p, void *end,
+ struct crush_bucket_straw2 *b)
+{
+ int j;
+ dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++)
+ b->item_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int skip_name_map(void **p, void *end)
+{
+ int len;
+ ceph_decode_32_safe(p, end, len ,bad);
+ while (len--) {
+ int strlen;
+ *p += sizeof(u32);
+ ceph_decode_32_safe(p, end, strlen, bad);
+ *p += strlen;
+}
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err = -EINVAL;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+ u32 num_name_maps;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ /* set tunables to default values */
+ c->choose_local_tries = 2;
+ c->choose_local_fallback_tries = 5;
+ c->choose_total_tries = 19;
+ c->chooseleaf_descend_once = 0;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ size = sizeof(struct crush_bucket_straw2);
+ break;
+ default:
+ err = -EINVAL;
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+ b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+ if (b->perm == NULL)
+ goto badmem;
+ b->perm_n = 0;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW2:
+ err = crush_decode_straw2_bucket(p, end,
+ (struct crush_bucket_straw2 *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ }
+ }
+
+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ err = -EINVAL;
+ if (yes > (ULONG_MAX - sizeof(*r))
+ / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = c->rules[i] = kmalloc(sizeof(*r) +
+ yes*sizeof(struct crush_rule_step),
+ GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ /* ignore trailing name maps. */
+ for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+ err = skip_name_map(p, end);
+ if (err < 0)
+ goto done;
+ }
+
+ /* tunables */
+ ceph_decode_need(p, end, 3*sizeof(u32), done);
+ c->choose_local_tries = ceph_decode_32(p);
+ c->choose_local_fallback_tries = ceph_decode_32(p);
+ c->choose_total_tries = ceph_decode_32(p);
+ dout("crush decode tunable choose_local_tries = %d",
+ c->choose_local_tries);
+ dout("crush decode tunable choose_local_fallback_tries = %d",
+ c->choose_local_fallback_tries);
+ dout("crush decode tunable choose_total_tries = %d",
+ c->choose_total_tries);
+
+ ceph_decode_need(p, end, sizeof(u32), done);
+ c->chooseleaf_descend_once = ceph_decode_32(p);
+ dout("crush decode tunable chooseleaf_descend_once = %d",
+ c->chooseleaf_descend_once);
+
+ ceph_decode_need(p, end, sizeof(u8), done);
+ c->chooseleaf_vary_r = ceph_decode_8(p);
+ dout("crush decode tunable chooseleaf_vary_r = %d",
+ c->chooseleaf_vary_r);
+
+done:
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ if (l.pool < r.pool)
+ return -1;
+ if (l.pool > r.pool)
+ return 1;
+ if (l.seed < r.seed)
+ return -1;
+ if (l.seed > r.seed)
+ return 1;
+ return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+ struct ceph_pg pgid)
+{
+ struct rb_node *n = root->rb_node;
+ struct ceph_pg_mapping *pg;
+ int c;
+
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0) {
+ n = n->rb_left;
+ } else if (c > 0) {
+ n = n->rb_right;
+ } else {
+ dout("__lookup_pg_mapping %lld.%x got %p\n",
+ pgid.pool, pgid.seed, pg);
+ return pg;
+ }
+ }
+ return NULL;
+}
+
+static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+{
+ struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
+
+ if (pg) {
+ dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
+ pg);
+ rb_erase(&pg->node, root);
+ kfree(pg);
+ return 0;
+ }
+ dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
+ return -ENOENT;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;
+ }
+ return NULL;
+}
+
+struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
+{
+ return __lookup_pg_pool(&map->pg_pools, id);
+}
+
+const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ if (id == CEPH_NOPOOL)
+ return NULL;
+
+ if (WARN_ON_ONCE(id > (u64) INT_MAX))
+ return NULL;
+
+ pi = __lookup_pg_pool(&map->pg_pools, (int) id);
+
+ return pi ? pi->name : NULL;
+}
+EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
+
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+ struct rb_node *rbp;
+
+ for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rbp, struct ceph_pg_pool_info, node);
+ if (pi->name && strcmp(pi->name, name) == 0)
+ return pi->id;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+ rb_erase(&pi->node, root);
+ kfree(pi->name);
+ kfree(pi);
+}
+
+static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+ u8 ev, cv;
+ unsigned len, num;
+ void *pool_end;
+
+ ceph_decode_need(p, end, 2 + 4, bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ cv = ceph_decode_8(p); /* compat version */
+ if (ev < 5) {
+ pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
+ return -EINVAL;
+ }
+ if (cv > 9) {
+ pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
+ return -EINVAL;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, bad);
+ pool_end = *p + len;
+
+ pi->type = ceph_decode_8(p);
+ pi->size = ceph_decode_8(p);
+ pi->crush_ruleset = ceph_decode_8(p);
+ pi->object_hash = ceph_decode_8(p);
+
+ pi->pg_num = ceph_decode_32(p);
+ pi->pgp_num = ceph_decode_32(p);
+
+ *p += 4 + 4; /* skip lpg* */
+ *p += 4; /* skip last_change */
+ *p += 8 + 4; /* skip snap_seq, snap_epoch */
+
+ /* skip snaps */
+ num = ceph_decode_32(p);
+ while (num--) {
+ *p += 8; /* snapid key */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ /* skip removed_snaps */
+ num = ceph_decode_32(p);
+ *p += num * (8 + 8);
+
+ *p += 8; /* skip auid */
+ pi->flags = ceph_decode_64(p);
+ *p += 4; /* skip crash_replay_interval */
+
+ if (ev >= 7)
+ *p += 1; /* skip min_size */
+
+ if (ev >= 8)
+ *p += 8 + 8; /* skip quota_max_* */
+
+ if (ev >= 9) {
+ /* skip tiers */
+ num = ceph_decode_32(p);
+ *p += num * 8;
+
+ *p += 8; /* skip tier_of */
+ *p += 1; /* skip cache_mode */
+
+ pi->read_tier = ceph_decode_64(p);
+ pi->write_tier = ceph_decode_64(p);
+ } else {
+ pi->read_tier = -1;
+ pi->write_tier = -1;
+ }
+
+ /* ignore the rest */
+
+ *p = pool_end;
+ calc_pg_masks(pi);
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+ struct ceph_pg_pool_info *pi;
+ u32 num, len;
+ u64 pool;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout(" %d pool names\n", num);
+ while (num--) {
+ ceph_decode_64_safe(p, end, pool, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ dout(" pool %llu len %d\n", pool, len);
+ ceph_decode_need(p, end, len, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ char *name = kstrndup(*p, len, GFP_NOFS);
+
+ if (!name)
+ return -ENOMEM;
+ kfree(pi->name);
+ pi->name = name;
+ dout(" name is %s\n", pi->name);
+ }
+ *p += len;
+ }
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->primary_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->primary_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->osd_addr);
+ kfree(map->osd_primary_affinity);
+ kfree(map);
+}
+
+/*
+ * Adjust max_osd value, (re)allocate arrays.
+ *
+ * The new elements are properly initialized.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ u32 *weight;
+ struct ceph_entity_addr *addr;
+ int i;
+
+ state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
+ if (!state)
+ return -ENOMEM;
+ map->osd_state = state;
+
+ weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
+ if (!weight)
+ return -ENOMEM;
+ map->osd_weight = weight;
+
+ addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
+ if (!addr)
+ return -ENOMEM;
+ map->osd_addr = addr;
+
+ for (i = map->max_osd; i < max; i++) {
+ map->osd_state[i] = 0;
+ map->osd_weight[i] = CEPH_OSD_OUT;
+ memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
+ }
+
+ if (map->osd_primary_affinity) {
+ u32 *affinity;
+
+ affinity = krealloc(map->osd_primary_affinity,
+ max*sizeof(*affinity), GFP_NOFS);
+ if (!affinity)
+ return -ENOMEM;
+ map->osd_primary_affinity = affinity;
+
+ for (i = map->max_osd; i < max; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->max_osd = max;
+
+ return 0;
+}
+
+#define OSDMAP_WRAPPER_COMPAT_VER 7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
+
+/*
+ * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
+ * to struct_v of the client_data section for new (v7 and above)
+ * osdmaps.
+ */
+static int get_osdmap_client_data_v(void **p, void *end,
+ const char *prefix, u8 *v)
+{
+ u8 struct_v;
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ if (struct_v >= 7) {
+ u8 struct_compat;
+
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+ pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
+ struct_v, struct_compat,
+ OSDMAP_WRAPPER_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore wrapper struct_len */
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+ pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+ struct_v, struct_compat,
+ OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore client data struct_len */
+ } else {
+ u16 version;
+
+ *p -= 1;
+ ceph_decode_16_safe(p, end, version, e_inval);
+ if (version < 6) {
+ pr_warn("got v %d < 6 of %s ceph_osdmap\n",
+ version, prefix);
+ return -EINVAL;
+ }
+
+ /* old osdmap enconding */
+ struct_v = 0;
+ }
+
+ *v = struct_v;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg_pool_info *pi;
+ u64 pool;
+ int ret;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!incremental || !pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ return -ENOMEM;
+
+ pi->id = pool;
+
+ ret = __insert_pg_pool(&map->pg_pools, pi);
+ if (ret) {
+ kfree(pi);
+ return ret;
+ }
+ }
+
+ ret = decode_pool(p, end, pi);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, false);
+}
+
+static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, true);
+}
+
+static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 len, i;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+
+ ret = __remove_pg_mapping(&map->pg_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || len > 0) {
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, len*sizeof(u32), e_inval);
+
+ if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+ return -EINVAL;
+
+ pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->pg_temp.len = len;
+ for (i = 0; i < len; i++)
+ pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+ ret = __insert_pg_mapping(pg, &map->pg_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, false);
+}
+
+static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, true);
+}
+
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 osd;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+
+ ret = __remove_pg_mapping(&map->primary_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || osd != (u32)-1) {
+ struct ceph_pg_mapping *pg;
+
+ pg = kzalloc(sizeof(*pg), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->primary_temp.osd = osd;
+
+ ret = __insert_pg_mapping(pg, &map->primary_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, false);
+}
+
+static int decode_new_primary_temp(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, true);
+}
+
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ return map->osd_primary_affinity[osd];
+}
+
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity) {
+ int i;
+
+ map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
+ GFP_NOFS);
+ if (!map->osd_primary_affinity)
+ return -ENOMEM;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->osd_primary_affinity[osd] = aff;
+
+ return 0;
+}
+
+static int decode_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0) {
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
+ return 0;
+ }
+ if (len != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+ for (i = 0; i < map->max_osd; i++) {
+ int ret;
+
+ ret = set_primary_affinity(map, i, ceph_decode_32(p));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_new_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ u32 osd, aff;
+ int ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_32_safe(p, end, aff, e_inval);
+
+ ret = set_primary_affinity(map, osd, aff);
+ if (ret)
+ return ret;
+
+ pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * decode a full map.
+ */
+static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+{
+ u8 struct_v;
+ u32 epoch = 0;
+ void *start = *p;
+ u32 max;
+ u32 len, i;
+ int err;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, created, modified */
+ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
+ sizeof(map->created) + sizeof(map->modified), e_inval);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ epoch = map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ /* pools */
+ err = decode_pools(p, end, map);
+ if (err)
+ goto bad;
+
+ /* pool_name */
+ err = decode_pool_names(p, end, map);
+ if (err)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, map->pool_max, e_inval);
+
+ ceph_decode_32_safe(p, end, map->flags, e_inval);
+
+ /* max_osd */
+ ceph_decode_32_safe(p, end, max, e_inval);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err)
+ goto bad;
+
+ /* osd_state, osd_weight, osd_addrs->client_addr */
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(1 + sizeof(*map->osd_weight) +
+ sizeof(*map->osd_addr)), e_inval);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+ for (i = 0; i < map->max_osd; i++)
+ ceph_decode_addr(&map->osd_addr[i]);
+
+ /* pg_temp */
+ err = decode_pg_temp(p, end, map);
+ if (err)
+ goto bad;
+
+ /* primary_temp */
+ if (struct_v >= 1) {
+ err = decode_primary_temp(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_primary_affinity(p, end, map);
+ if (err)
+ goto bad;
+ } else {
+ /* XXX can this happen? */
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ map->crush = crush_decode(*p, min(*p + len, end));
+ if (IS_ERR(map->crush)) {
+ err = PTR_ERR(map->crush);
+ map->crush = NULL;
+ goto bad;
+ }
+ *p += len;
+
+ /* ignore the rest */
+ *p = end;
+
+ dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
+ return 0;
+
+e_inval:
+ err = -EINVAL;
+bad:
+ pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ return err;
+}
+
+/*
+ * Allocate and decode a full map.
+ */
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ int ret;
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->pg_temp = RB_ROOT;
+ map->primary_temp = RB_ROOT;
+ mutex_init(&map->crush_scratch_mutex);
+
+ ret = osdmap_decode(p, end, map);
+ if (ret) {
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(ret);
+ }
+
+ return map;
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr)
+{
+ struct crush_map *newcrush = NULL;
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ s32 len;
+ u64 pool;
+ __s64 new_pool_max;
+ __s32 new_flags, max;
+ void *start = *p;
+ int err;
+ u8 struct_v;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, modified, new_pool_max, new_flags */
+ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
+ sizeof(u64) + sizeof(u32), e_inval);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_64(p);
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ return ceph_osdmap_decode(p, min(*p+len, end));
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > 0) {
+ newcrush = crush_decode(*p, min(*p+len, end));
+ if (IS_ERR(newcrush)) {
+ err = PTR_ERR(newcrush);
+ newcrush = NULL;
+ goto bad;
+ }
+ *p += len;
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
+
+ /* new max? */
+ ceph_decode_32_safe(p, end, max, e_inval);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err)
+ goto bad;
+ }
+
+ map->epoch++;
+ map->modified = modified;
+ if (newcrush) {
+ if (map->crush)
+ crush_destroy(map->crush);
+ map->crush = newcrush;
+ newcrush = NULL;
+ }
+
+ /* new_pools */
+ err = decode_new_pools(p, end, map);
+ if (err)
+ goto bad;
+
+ /* new_pool_names */
+ err = decode_pool_names(p, end, map);
+ if (err)
+ goto bad;
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi)
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+
+ /* new_up */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ u32 osd;
+ struct ceph_entity_addr addr;
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
+ ceph_decode_addr(&addr);
+ pr_info("osd%d up\n", osd);
+ BUG_ON(osd >= map->max_osd);
+ map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ /* new_state */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ u32 osd;
+ u8 xorstate;
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ xorstate = **(u8 **)p;
+ (*p)++; /* clean flag */
+ if (xorstate == 0)
+ xorstate = CEPH_OSD_UP;
+ if (xorstate & CEPH_OSD_UP)
+ pr_info("osd%d down\n", osd);
+ if (osd < map->max_osd)
+ map->osd_state[osd] ^= xorstate;
+ }
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ while (len--) {
+ u32 osd, off;
+ ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
+ osd = ceph_decode_32(p);
+ off = ceph_decode_32(p);
+ pr_info("osd%d weight 0x%x %s\n", osd, off,
+ off == CEPH_OSD_IN ? "(in)" :
+ (off == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd < map->max_osd)
+ map->osd_weight[osd] = off;
+ }
+
+ /* new_pg_temp */
+ err = decode_new_pg_temp(p, end, map);
+ if (err)
+ goto bad;
+
+ /* new_primary_temp */
+ if (struct_v >= 1) {
+ err = decode_new_primary_temp(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* new_primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_new_primary_affinity(p, end, map);
+ if (err)
+ goto bad;
+ }
+
+ /* ignore the rest */
+ *p = end;
+
+ dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
+ return map;
+
+e_inval:
+ err = -EINVAL;
+bad:
+ pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
+ err, epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ if (newcrush)
+ crush_destroy(newcrush);
+ return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 len,
+ u64 *ono,
+ u64 *oxoff, u64 *oxlen)
+{
+ u32 osize = le32_to_cpu(layout->fl_object_size);
+ u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 bl, stripeno, stripepos, objsetno;
+ u32 su_per_object;
+ u64 t, su_offset;
+
+ dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
+ osize, su);
+ if (su == 0 || sc == 0)
+ goto invalid;
+ su_per_object = osize / su;
+ if (su_per_object == 0)
+ goto invalid;
+ dout("osize %u / su %u = su_per_object %u\n", osize, su,
+ su_per_object);
+
+ if ((su & ~PAGE_MASK) != 0)
+ goto invalid;
+
+ /* bl = *off / su; */
+ t = off;
+ do_div(t, su);
+ bl = t;
+ dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+ stripeno = bl / sc;
+ stripepos = bl % sc;
+ objsetno = stripeno / su_per_object;
+
+ *ono = objsetno * sc + stripepos;
+ dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);
+
+ /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
+ t = off;
+ su_offset = do_div(t, su);
+ *oxoff = su_offset + (stripeno % su_per_object) * su;
+
+ /*
+ * Calculate the length of the extent being written to the selected
+ * object. This is the minimum of the full length requested (len) or
+ * the remainder of the current stripe being written to.
+ */
+ *oxlen = min_t(u64, len, su - su_offset);
+
+ dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+ return 0;
+
+invalid:
+ dout(" invalid layout\n");
+ *ono = 0;
+ *oxoff = 0;
+ *oxlen = 0;
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
+ * called with target's (oloc, oid), since tiering isn't taken into
+ * account.
+ */
+int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_locator *oloc,
+ struct ceph_object_id *oid,
+ struct ceph_pg *pg_out)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ if (!pi)
+ return -EIO;
+
+ pg_out->pool = oloc->pool;
+ pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+
+ dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
+ pg_out->pool, pg_out->seed);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+
+static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
+ int *result, int result_max,
+ const __u32 *weight, int weight_max)
+{
+ int r;
+
+ BUG_ON(result_max > CEPH_PG_MAX_SIZE);
+
+ mutex_lock(&map->crush_scratch_mutex);
+ r = crush_do_rule(map->crush, ruleno, x, result, result_max,
+ weight, weight_max, map->crush_scratch_ary);
+ mutex_unlock(&map->crush_scratch_mutex);
+
+ return r;
+}
+
+/*
+ * Calculate raw (crush) set for given pgid.
+ *
+ * Return raw set length, or error.
+ */
+static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool,
+ struct ceph_pg pgid, u32 pps, int *osds)
+{
+ int ruleno;
+ int len;
+
+ /* crush */
+ ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+ pool->type, pool->size);
+ if (ruleno < 0) {
+ pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+ pgid.pool, pool->crush_ruleset, pool->type,
+ pool->size);
+ return -ENOENT;
+ }
+
+ len = do_crush(osdmap, ruleno, pps, osds,
+ min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ osdmap->osd_weight, osdmap->max_osd);
+ if (len < 0) {
+ pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+ len, ruleno, pgid.pool, pool->crush_ruleset,
+ pool->type, pool->size);
+ return len;
+ }
+
+ return len;
+}
+
+/*
+ * Given raw set, calculate up set and up primary.
+ *
+ * Return up set length. *primary is set to up primary osd id, or -1
+ * if up set is empty.
+ */
+static int raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int up_primary = -1;
+ int i;
+
+ if (ceph_can_shift_osds(pool)) {
+ int removed = 0;
+
+ for (i = 0; i < len; i++) {
+ if (ceph_osd_is_down(osdmap, osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed)
+ osds[i - removed] = osds[i];
+ }
+
+ len -= removed;
+ if (len > 0)
+ up_primary = osds[0];
+ } else {
+ for (i = len - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, osds[i]))
+ osds[i] = CRUSH_ITEM_NONE;
+ else
+ up_primary = osds[i];
+ }
+ }
+
+ *primary = up_primary;
+ return len;
+}
+
+static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int i;
+ int pos = -1;
+
+ /*
+ * Do we have any non-default primary_affinity values for these
+ * osds?
+ */
+ if (!osdmap->osd_primary_affinity)
+ return;
+
+ for (i = 0; i < len; i++) {
+ int osd = osds[i];
+
+ if (osd != CRUSH_ITEM_NONE &&
+ osdmap->osd_primary_affinity[osd] !=
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ break;
+ }
+ }
+ if (i == len)
+ return;
+
+ /*
+ * Pick the primary. Feed both the seed (for the pg) and the
+ * osd into the hash/rng so that a proportional fraction of an
+ * osd's pgs get rejected as primary.
+ */
+ for (i = 0; i < len; i++) {
+ int osd = osds[i];
+ u32 aff;
+
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
+
+ aff = osdmap->osd_primary_affinity[osd];
+ if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ pps, osd) >> 16) >= aff) {
+ /*
+ * We chose not to use this primary. Note it
+ * anyway as a fallback in case we don't pick
+ * anyone else, but keep looking.
+ */
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = osds[pos];
+
+ if (ceph_can_shift_osds(pool) && pos > 0) {
+ /* move the new primary to the front */
+ for (i = pos; i > 0; i--)
+ osds[i] = osds[i - 1];
+ osds[0] = *primary;
+ }
+}
+
+/*
+ * Given up set, apply pg_temp and primary_temp mappings.
+ *
+ * Return acting set length. *primary is set to acting primary osd id,
+ * or -1 if acting set is empty.
+ */
+static int apply_temps(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
+ int *osds, int len, int *primary)
+{
+ struct ceph_pg_mapping *pg;
+ int temp_len;
+ int temp_primary;
+ int i;
+
+ /* raw_pg -> pg */
+ pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
+ pool->pg_num_mask);
+
+ /* pg_temp? */
+ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ temp_len = 0;
+ temp_primary = -1;
+
+ for (i = 0; i < pg->pg_temp.len; i++) {
+ if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
+ if (ceph_can_shift_osds(pool))
+ continue;
+ else
+ osds[temp_len++] = CRUSH_ITEM_NONE;
+ } else {
+ osds[temp_len++] = pg->pg_temp.osds[i];
+ }
+ }
+
+ /* apply pg_temp's primary */
+ for (i = 0; i < temp_len; i++) {
+ if (osds[i] != CRUSH_ITEM_NONE) {
+ temp_primary = osds[i];
+ break;
+ }
+ }
+ } else {
+ temp_len = len;
+ temp_primary = *primary;
+ }
+
+ /* primary_temp? */
+ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+ if (pg)
+ temp_primary = pg->primary_temp.osd;
+
+ *primary = temp_primary;
+ return temp_len;
+}
+
+/*
+ * Calculate acting set for given pgid.
+ *
+ * Return acting set length, or error. *primary is set to acting
+ * primary osd id, or -1 if acting set is empty or on error.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *primary)
+{
+ struct ceph_pg_pool_info *pool;
+ u32 pps;
+ int len;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+ if (!pool) {
+ *primary = -1;
+ return -ENOENT;
+ }
+
+ if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(pgid.seed, pool->pgp_num,
+ pool->pgp_num_mask),
+ pgid.pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
+ pool->pgp_num_mask) +
+ (unsigned)pgid.pool;
+ }
+
+ len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
+ if (len < 0) {
+ *primary = -1;
+ return len;
+ }
+
+ len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+
+ apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
+
+ len = apply_temps(osdmap, pool, pgid, osds, len, primary);
+
+ return len;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+ int osds[CEPH_PG_MAX_SIZE];
+ int primary;
+
+ ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+
+ return primary;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000..c7c220a73
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,150 @@
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+ if (pl->mapped_tail) {
+ struct page *page = list_entry(pl->head.prev, struct page, lru);
+ kunmap(page);
+ pl->mapped_tail = NULL;
+ }
+}
+
+void ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+ if (!atomic_dec_and_test(&pl->refcnt))
+ return;
+ ceph_pagelist_unmap_tail(pl);
+ while (!list_empty(&pl->head)) {
+ struct page *page = list_first_entry(&pl->head, struct page,
+ lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ ceph_pagelist_free_reserve(pl);
+ kfree(pl);
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+ struct page *page;
+
+ if (!pl->num_pages_free) {
+ page = __page_cache_alloc(GFP_NOFS);
+ } else {
+ page = list_first_entry(&pl->free_list, struct page, lru);
+ list_del(&page->lru);
+ --pl->num_pages_free;
+ }
+ if (!page)
+ return -ENOMEM;
+ pl->room += PAGE_SIZE;
+ ceph_pagelist_unmap_tail(pl);
+ list_add_tail(&page->lru, &pl->head);
+ pl->mapped_tail = kmap(page);
+ return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+ while (pl->room < len) {
+ size_t bit = pl->room;
+ int ret;
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+ buf, bit);
+ pl->length += bit;
+ pl->room -= bit;
+ buf += bit;
+ len -= bit;
+ ret = ceph_pagelist_addpage(pl);
+ if (ret)
+ return ret;
+ }
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+ pl->length += len;
+ pl->room -= len;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/* Allocate enough pages for a pagelist to append the given amount
+ * of data without without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+ if (space <= pl->room)
+ return 0;
+ space -= pl->room;
+ space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
+
+ while (space > pl->num_pages_free) {
+ struct page *page = __page_cache_alloc(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ list_add_tail(&page->lru, &pl->free_list);
+ ++pl->num_pages_free;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/* Free any pages that have been preallocated. */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+ while (!list_empty(&pl->free_list)) {
+ struct page *page = list_first_entry(&pl->free_list,
+ struct page, lru);
+ list_del(&page->lru);
+ __free_page(page);
+ --pl->num_pages_free;
+ }
+ BUG_ON(pl->num_pages_free);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/* Create a truncation point. */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c)
+{
+ c->pl = pl;
+ c->page_lru = pl->head.prev;
+ c->room = pl->room;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/* Truncate a pagelist to the given point. Move extra pages to reserve.
+ * This won't sleep.
+ * Returns: 0 on success,
+ * -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c)
+{
+ struct page *page;
+
+ if (pl != c->pl)
+ return -EINVAL;
+ ceph_pagelist_unmap_tail(pl);
+ while (pl->head.prev != c->page_lru) {
+ page = list_entry(pl->head.prev, struct page, lru);
+ /* move from pagelist to reserve */
+ list_move_tail(&page->lru, &pl->free_list);
+ ++pl->num_pages_free;
+ }
+ pl->room = c->room;
+ if (!list_empty(&pl->head)) {
+ page = list_entry(pl->head.prev, struct page, lru);
+ pl->mapped_tail = kmap(page);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000..096d91447
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,202 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const void __user *data,
+ int num_pages, bool write_page)
+{
+ struct page **pages;
+ int got = 0;
+ int rc = 0;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ while (got < num_pages) {
+ rc = get_user_pages_unlocked(current, current->mm,
+ (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
+ num_pages - got, write_page, 0, pages + got);
+ if (rc < 0)
+ break;
+ BUG_ON(rc == 0);
+ got += rc;
+ }
+ if (rc < 0)
+ goto fail;
+ return pages;
+
+fail:
+ ceph_put_page_vector(pages, got, false);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++) {
+ if (dirty)
+ set_page_dirty_lock(pages[i]);
+ put_page(pages[i]);
+ }
+ if (is_vmalloc_addr(pages))
+ vfree(pages);
+ else
+ kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ __free_pages(pages[i], 0);
+ kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, flags);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = __page_cache_alloc(flags);
+ if (pages[i] == NULL) {
+ ceph_release_page_vector(pages, i);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+ const void __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, PAGE_CACHE_SIZE-po, left);
+ bad = copy_from_user(page_address(pages[i]) + po, data, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+ return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+void ceph_copy_to_page_vector(struct page **pages,
+ const void *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ size_t po = off & ~PAGE_CACHE_MASK;
+ size_t left = len;
+
+ while (left > 0) {
+ size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+
+ memcpy(page_address(pages[i]) + po, data, l);
+ data += l;
+ left -= l;
+ po += l;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+void ceph_copy_from_page_vector(struct page **pages,
+ void *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ size_t po = off & ~PAGE_CACHE_MASK;
+ size_t left = len;
+
+ while (left > 0) {
+ size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+
+ memcpy(data, page_address(pages[i]) + po, l);
+ data += l;
+ left -= l;
+ po += l;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * Zero an extent within a page vector. Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+ int i = off >> PAGE_CACHE_SHIFT;
+
+ off &= ~PAGE_CACHE_MASK;
+
+ dout("zero_page_vector_page %u~%u\n", off, len);
+
+ /* leading partial page? */
+ if (off) {
+ int end = min((int)PAGE_CACHE_SIZE, off + len);
+ dout("zeroing %d %p head from %d\n", i, pages[i],
+ (int)off);
+ zero_user_segment(pages[i], off, end);
+ len -= (end - off);
+ i++;
+ }
+ while (len >= PAGE_CACHE_SIZE) {
+ dout("zeroing %d %p len=%d\n", i, pages[i], len);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ len -= PAGE_CACHE_SIZE;
+ i++;
+ }
+ /* trailing partial page? */
+ if (len) {
+ dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+ zero_user_segment(pages[i], 0, len);
+ }
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
new file mode 100644
index 000000000..154683f5f
--- /dev/null
+++ b/net/ceph/snapshot.c
@@ -0,0 +1,78 @@
+/*
+ * snapshot.c Ceph snapshot context utility routines (part of libceph)
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <stddef.h>
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/ceph/libceph.h>
+
+/*
+ * Ceph snapshot contexts are reference counted objects, and the
+ * returned structure holds a single reference. Acquire additional
+ * references with ceph_get_snap_context(), and release them with
+ * ceph_put_snap_context(). When the reference count reaches zero
+ * the entire structure is freed.
+ */
+
+/*
+ * Create a new ceph snapshot context large enough to hold the
+ * indicated number of snapshot ids (which can be 0). Caller has
+ * to fill in snapc->seq and snapc->snaps[0..snap_count-1].
+ *
+ * Returns a null pointer if an error occurs.
+ */
+struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
+ gfp_t gfp_flags)
+{
+ struct ceph_snap_context *snapc;
+ size_t size;
+
+ size = sizeof (struct ceph_snap_context);
+ size += snap_count * sizeof (snapc->snaps[0]);
+ snapc = kzalloc(size, gfp_flags);
+ if (!snapc)
+ return NULL;
+
+ atomic_set(&snapc->nref, 1);
+ snapc->num_snaps = snap_count;
+
+ return snapc;
+}
+EXPORT_SYMBOL(ceph_create_snap_context);
+
+struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+ if (sc)
+ atomic_inc(&sc->nref);
+ return sc;
+}
+EXPORT_SYMBOL(ceph_get_snap_context);
+
+void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+ if (!sc)
+ return;
+ if (atomic_dec_and_test(&sc->nref)) {
+ /*printk(" deleting snap_context %p\n", sc);*/
+ kfree(sc);
+ }
+}
+EXPORT_SYMBOL(ceph_put_snap_context);
diff --git a/net/compat.c b/net/compat.c
new file mode 100644
index 000000000..5cfd26a00
--- /dev/null
+++ b/net/compat.c
@@ -0,0 +1,851 @@
+/*
+ * 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c.
+ *
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001 Andi Kleen, SuSE Labs
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/icmpv6.h>
+#include <linux/socket.h>
+#include <linux/syscalls.h>
+#include <linux/filter.h>
+#include <linux/compat.h>
+#include <linux/security.h>
+#include <linux/export.h>
+
+#include <net/scm.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <asm/uaccess.h>
+#include <net/compat.h>
+
+int get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ compat_uptr_t uaddr, uiov, tmp3;
+ compat_size_t nr_segs;
+ ssize_t err;
+
+ if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
+ __get_user(uaddr, &umsg->msg_name) ||
+ __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
+ __get_user(uiov, &umsg->msg_iov) ||
+ __get_user(nr_segs, &umsg->msg_iovlen) ||
+ __get_user(tmp3, &umsg->msg_control) ||
+ __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
+ __get_user(kmsg->msg_flags, &umsg->msg_flags))
+ return -EFAULT;
+
+ if (!uaddr)
+ kmsg->msg_namelen = 0;
+
+ if (kmsg->msg_namelen < 0)
+ return -EINVAL;
+
+ if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
+ kmsg->msg_namelen = sizeof(struct sockaddr_storage);
+ kmsg->msg_control = compat_ptr(tmp3);
+
+ if (save_addr)
+ *save_addr = compat_ptr(uaddr);
+
+ if (uaddr && kmsg->msg_namelen) {
+ if (!save_addr) {
+ err = move_addr_to_kernel(compat_ptr(uaddr),
+ kmsg->msg_namelen,
+ kmsg->msg_name);
+ if (err < 0)
+ return err;
+ }
+ } else {
+ kmsg->msg_name = NULL;
+ kmsg->msg_namelen = 0;
+ }
+
+ if (nr_segs > UIO_MAXIOV)
+ return -EMSGSIZE;
+
+ kmsg->msg_iocb = NULL;
+
+ return compat_import_iovec(save_addr ? READ : WRITE,
+ compat_ptr(uiov), nr_segs,
+ UIO_FASTIOV, iov, &kmsg->msg_iter);
+}
+
+/* Bleech... */
+#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32))
+
+#define CMSG_COMPAT_DATA(cmsg) \
+ ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+#define CMSG_COMPAT_SPACE(len) \
+ (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+#define CMSG_COMPAT_LEN(len) \
+ (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+
+#define CMSG_COMPAT_FIRSTHDR(msg) \
+ (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
+ (struct compat_cmsghdr __user *)((msg)->msg_control) : \
+ (struct compat_cmsghdr __user *)NULL)
+
+#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
+ ((ucmlen) >= sizeof(struct compat_cmsghdr) && \
+ (ucmlen) <= (unsigned long) \
+ ((mhdr)->msg_controllen - \
+ ((char *)(ucmsg) - (char *)(mhdr)->msg_control)))
+
+static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg,
+ struct compat_cmsghdr __user *cmsg, int cmsg_len)
+{
+ char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);
+ if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
+ msg->msg_controllen)
+ return NULL;
+ return (struct compat_cmsghdr __user *)ptr;
+}
+
+/* There is a lot of hair here because the alignment rules (and
+ * thus placement) of cmsg headers and length are different for
+ * 32-bit apps. -DaveM
+ */
+int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
+ unsigned char *stackbuf, int stackbuf_size)
+{
+ struct compat_cmsghdr __user *ucmsg;
+ struct cmsghdr *kcmsg, *kcmsg_base;
+ compat_size_t ucmlen;
+ __kernel_size_t kcmlen, tmp;
+ int err = -EFAULT;
+
+ kcmlen = 0;
+ kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
+ ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
+ while (ucmsg != NULL) {
+ if (get_user(ucmlen, &ucmsg->cmsg_len))
+ return -EFAULT;
+
+ /* Catch bogons. */
+ if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ return -EINVAL;
+
+ tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
+ CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = CMSG_ALIGN(tmp);
+ kcmlen += tmp;
+ ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+ }
+ if (kcmlen == 0)
+ return -EINVAL;
+
+ /* The kcmlen holds the 64-bit version of the control length.
+ * It may not be modified as we do not stick it into the kmsg
+ * until we have successfully copied over all of the data
+ * from the user.
+ */
+ if (kcmlen > stackbuf_size)
+ kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
+ if (kcmsg == NULL)
+ return -ENOBUFS;
+
+ /* Now copy them over neatly. */
+ memset(kcmsg, 0, kcmlen);
+ ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
+ while (ucmsg != NULL) {
+ if (__get_user(ucmlen, &ucmsg->cmsg_len))
+ goto Efault;
+ if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ goto Einval;
+ tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
+ CMSG_ALIGN(sizeof(struct cmsghdr)));
+ if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
+ goto Einval;
+ kcmsg->cmsg_len = tmp;
+ tmp = CMSG_ALIGN(tmp);
+ if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
+ __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
+ copy_from_user(CMSG_DATA(kcmsg),
+ CMSG_COMPAT_DATA(ucmsg),
+ (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+ goto Efault;
+
+ /* Advance. */
+ kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
+ ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+ }
+
+ /* Ok, looks like we made it. Hook it up and return success. */
+ kmsg->msg_control = kcmsg_base;
+ kmsg->msg_controllen = kcmlen;
+ return 0;
+
+Einval:
+ err = -EINVAL;
+Efault:
+ if (kcmsg_base != (struct cmsghdr *)stackbuf)
+ sock_kfree_s(sk, kcmsg_base, kcmlen);
+ return err;
+}
+
+int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
+{
+ struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+ struct compat_cmsghdr cmhdr;
+ struct compat_timeval ctv;
+ struct compat_timespec cts[3];
+ int cmlen;
+
+ if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
+ kmsg->msg_flags |= MSG_CTRUNC;
+ return 0; /* XXX: return error? check spec. */
+ }
+
+ if (!COMPAT_USE_64BIT_TIME) {
+ if (level == SOL_SOCKET && type == SCM_TIMESTAMP) {
+ struct timeval *tv = (struct timeval *)data;
+ ctv.tv_sec = tv->tv_sec;
+ ctv.tv_usec = tv->tv_usec;
+ data = &ctv;
+ len = sizeof(ctv);
+ }
+ if (level == SOL_SOCKET &&
+ (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) {
+ int count = type == SCM_TIMESTAMPNS ? 1 : 3;
+ int i;
+ struct timespec *ts = (struct timespec *)data;
+ for (i = 0; i < count; i++) {
+ cts[i].tv_sec = ts[i].tv_sec;
+ cts[i].tv_nsec = ts[i].tv_nsec;
+ }
+ data = &cts;
+ len = sizeof(cts[0]) * count;
+ }
+ }
+
+ cmlen = CMSG_COMPAT_LEN(len);
+ if (kmsg->msg_controllen < cmlen) {
+ kmsg->msg_flags |= MSG_CTRUNC;
+ cmlen = kmsg->msg_controllen;
+ }
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
+ return -EFAULT;
+ if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr)))
+ return -EFAULT;
+ cmlen = CMSG_COMPAT_SPACE(len);
+ if (kmsg->msg_controllen < cmlen)
+ cmlen = kmsg->msg_controllen;
+ kmsg->msg_control += cmlen;
+ kmsg->msg_controllen -= cmlen;
+ return 0;
+}
+
+void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
+{
+ struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+ int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
+ int fdnum = scm->fp->count;
+ struct file **fp = scm->fp->fp;
+ int __user *cmfptr;
+ int err = 0, i;
+
+ if (fdnum < fdmax)
+ fdmax = fdnum;
+
+ for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) {
+ int new_fd;
+ err = security_file_receive(fp[i]);
+ if (err)
+ break;
+ err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags
+ ? O_CLOEXEC : 0);
+ if (err < 0)
+ break;
+ new_fd = err;
+ err = put_user(new_fd, cmfptr);
+ if (err) {
+ put_unused_fd(new_fd);
+ break;
+ }
+ /* Bump the usage count and install the file. */
+ fd_install(new_fd, get_file(fp[i]));
+ }
+
+ if (i > 0) {
+ int cmlen = CMSG_COMPAT_LEN(i * sizeof(int));
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ if (!err)
+ err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+ if (!err)
+ err = put_user(cmlen, &cm->cmsg_len);
+ if (!err) {
+ cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
+ kmsg->msg_control += cmlen;
+ kmsg->msg_controllen -= cmlen;
+ }
+ }
+ if (i < fdnum)
+ kmsg->msg_flags |= MSG_CTRUNC;
+
+ /*
+ * All of the files that fit in the message have had their
+ * usage counts incremented, so we just free the list.
+ */
+ __scm_destroy(scm);
+}
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
+ struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
+ compat_uptr_t ptr;
+ u16 len;
+
+ if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) ||
+ !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) ||
+ __get_user(len, &fprog32->len) ||
+ __get_user(ptr, &fprog32->filter) ||
+ __put_user(len, &kfprog->len) ||
+ __put_user(compat_ptr(ptr), &kfprog->filter))
+ return -EFAULT;
+
+ return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
+ sizeof(struct sock_fprog));
+}
+
+static int do_set_sock_timeout(struct socket *sock, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ struct compat_timeval __user *up = (struct compat_timeval __user *)optval;
+ struct timeval ktime;
+ mm_segment_t old_fs;
+ int err;
+
+ if (optlen < sizeof(*up))
+ return -EINVAL;
+ if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
+ __get_user(ktime.tv_sec, &up->tv_sec) ||
+ __get_user(ktime.tv_usec, &up->tv_usec))
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sock_setsockopt(sock, level, optname, (char *)&ktime, sizeof(ktime));
+ set_fs(old_fs);
+
+ return err;
+}
+
+static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (optname == SO_ATTACH_FILTER)
+ return do_set_attach_filter(sock, level, optname,
+ optval, optlen);
+ if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ return do_set_sock_timeout(sock, level, optname, optval, optlen);
+
+ return sock_setsockopt(sock, level, optname, optval, optlen);
+}
+
+COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
+ char __user *, optval, unsigned int, optlen)
+{
+ int err;
+ struct socket *sock = sockfd_lookup(fd, &err);
+
+ if (sock) {
+ err = security_socket_setsockopt(sock, level, optname);
+ if (err) {
+ sockfd_put(sock);
+ return err;
+ }
+
+ if (level == SOL_SOCKET)
+ err = compat_sock_setsockopt(sock, level,
+ optname, optval, optlen);
+ else if (sock->ops->compat_setsockopt)
+ err = sock->ops->compat_setsockopt(sock, level,
+ optname, optval, optlen);
+ else
+ err = sock->ops->setsockopt(sock, level,
+ optname, optval, optlen);
+ sockfd_put(sock);
+ }
+ return err;
+}
+
+static int do_get_sock_timeout(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct compat_timeval __user *up;
+ struct timeval ktime;
+ mm_segment_t old_fs;
+ int len, err;
+
+ up = (struct compat_timeval __user *) optval;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(*up))
+ return -EINVAL;
+ len = sizeof(ktime);
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
+ set_fs(old_fs);
+
+ if (!err) {
+ if (put_user(sizeof(*up), optlen) ||
+ !access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
+ __put_user(ktime.tv_sec, &up->tv_sec) ||
+ __put_user(ktime.tv_usec, &up->tv_usec))
+ err = -EFAULT;
+ }
+ return err;
+}
+
+static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+ return do_get_sock_timeout(sock, level, optname, optval, optlen);
+ return sock_getsockopt(sock, level, optname, optval, optlen);
+}
+
+int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+ struct compat_timeval __user *ctv;
+ int err;
+ struct timeval tv;
+
+ if (COMPAT_USE_64BIT_TIME)
+ return sock_get_timestamp(sk, userstamp);
+
+ ctv = (struct compat_timeval __user *) userstamp;
+ err = -ENOENT;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ tv = ktime_to_timeval(sk->sk_stamp);
+ if (tv.tv_sec == -1)
+ return err;
+ if (tv.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ tv = ktime_to_timeval(sk->sk_stamp);
+ }
+ err = 0;
+ if (put_user(tv.tv_sec, &ctv->tv_sec) ||
+ put_user(tv.tv_usec, &ctv->tv_usec))
+ err = -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL(compat_sock_get_timestamp);
+
+int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+ struct compat_timespec __user *ctv;
+ int err;
+ struct timespec ts;
+
+ if (COMPAT_USE_64BIT_TIME)
+ return sock_get_timestampns (sk, userstamp);
+
+ ctv = (struct compat_timespec __user *) userstamp;
+ err = -ENOENT;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ ts = ktime_to_timespec(sk->sk_stamp);
+ if (ts.tv_sec == -1)
+ return err;
+ if (ts.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ ts = ktime_to_timespec(sk->sk_stamp);
+ }
+ err = 0;
+ if (put_user(ts.tv_sec, &ctv->tv_sec) ||
+ put_user(ts.tv_nsec, &ctv->tv_nsec))
+ err = -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL(compat_sock_get_timestampns);
+
+COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
+ char __user *, optval, int __user *, optlen)
+{
+ int err;
+ struct socket *sock = sockfd_lookup(fd, &err);
+
+ if (sock) {
+ err = security_socket_getsockopt(sock, level, optname);
+ if (err) {
+ sockfd_put(sock);
+ return err;
+ }
+
+ if (level == SOL_SOCKET)
+ err = compat_sock_getsockopt(sock, level,
+ optname, optval, optlen);
+ else if (sock->ops->compat_getsockopt)
+ err = sock->ops->compat_getsockopt(sock, level,
+ optname, optval, optlen);
+ else
+ err = sock->ops->getsockopt(sock, level,
+ optname, optval, optlen);
+ sockfd_put(sock);
+ }
+ return err;
+}
+
+struct compat_group_req {
+ __u32 gr_interface;
+ struct __kernel_sockaddr_storage gr_group
+ __aligned(4);
+} __packed;
+
+struct compat_group_source_req {
+ __u32 gsr_interface;
+ struct __kernel_sockaddr_storage gsr_group
+ __aligned(4);
+ struct __kernel_sockaddr_storage gsr_source
+ __aligned(4);
+} __packed;
+
+struct compat_group_filter {
+ __u32 gf_interface;
+ struct __kernel_sockaddr_storage gf_group
+ __aligned(4);
+ __u32 gf_fmode;
+ __u32 gf_numsrc;
+ struct __kernel_sockaddr_storage gf_slist[1]
+ __aligned(4);
+} __packed;
+
+#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
+ sizeof(struct __kernel_sockaddr_storage))
+
+
+int compat_mc_setsockopt(struct sock *sock, int level, int optname,
+ char __user *optval, unsigned int optlen,
+ int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int))
+{
+ char __user *koptval = optval;
+ int koptlen = optlen;
+
+ switch (optname) {
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct compat_group_req __user *gr32 = (void *)optval;
+ struct group_req __user *kgr =
+ compat_alloc_user_space(sizeof(struct group_req));
+ u32 interface;
+
+ if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) ||
+ !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) ||
+ __get_user(interface, &gr32->gr_interface) ||
+ __put_user(interface, &kgr->gr_interface) ||
+ copy_in_user(&kgr->gr_group, &gr32->gr_group,
+ sizeof(kgr->gr_group)))
+ return -EFAULT;
+ koptval = (char __user *)kgr;
+ koptlen = sizeof(struct group_req);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct compat_group_source_req __user *gsr32 = (void *)optval;
+ struct group_source_req __user *kgsr = compat_alloc_user_space(
+ sizeof(struct group_source_req));
+ u32 interface;
+
+ if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) ||
+ !access_ok(VERIFY_WRITE, kgsr,
+ sizeof(struct group_source_req)) ||
+ __get_user(interface, &gsr32->gsr_interface) ||
+ __put_user(interface, &kgsr->gsr_interface) ||
+ copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group,
+ sizeof(kgsr->gsr_group)) ||
+ copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source,
+ sizeof(kgsr->gsr_source)))
+ return -EFAULT;
+ koptval = (char __user *)kgsr;
+ koptlen = sizeof(struct group_source_req);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct compat_group_filter __user *gf32 = (void *)optval;
+ struct group_filter __user *kgf;
+ u32 interface, fmode, numsrc;
+
+ if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+ __get_user(interface, &gf32->gf_interface) ||
+ __get_user(fmode, &gf32->gf_fmode) ||
+ __get_user(numsrc, &gf32->gf_numsrc))
+ return -EFAULT;
+ koptlen = optlen + sizeof(struct group_filter) -
+ sizeof(struct compat_group_filter);
+ if (koptlen < GROUP_FILTER_SIZE(numsrc))
+ return -EINVAL;
+ kgf = compat_alloc_user_space(koptlen);
+ if (!access_ok(VERIFY_WRITE, kgf, koptlen) ||
+ __put_user(interface, &kgf->gf_interface) ||
+ __put_user(fmode, &kgf->gf_fmode) ||
+ __put_user(numsrc, &kgf->gf_numsrc) ||
+ copy_in_user(&kgf->gf_group, &gf32->gf_group,
+ sizeof(kgf->gf_group)) ||
+ (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist,
+ numsrc * sizeof(kgf->gf_slist[0]))))
+ return -EFAULT;
+ koptval = (char __user *)kgf;
+ break;
+ }
+
+ default:
+ break;
+ }
+ return setsockopt(sock, level, optname, koptval, koptlen);
+}
+EXPORT_SYMBOL(compat_mc_setsockopt);
+
+int compat_mc_getsockopt(struct sock *sock, int level, int optname,
+ char __user *optval, int __user *optlen,
+ int (*getsockopt)(struct sock *, int, int, char __user *, int __user *))
+{
+ struct compat_group_filter __user *gf32 = (void *)optval;
+ struct group_filter __user *kgf;
+ int __user *koptlen;
+ u32 interface, fmode, numsrc;
+ int klen, ulen, err;
+
+ if (optname != MCAST_MSFILTER)
+ return getsockopt(sock, level, optname, optval, optlen);
+
+ koptlen = compat_alloc_user_space(sizeof(*koptlen));
+ if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) ||
+ __get_user(ulen, optlen))
+ return -EFAULT;
+
+ /* adjust len for pad */
+ klen = ulen + sizeof(*kgf) - sizeof(*gf32);
+
+ if (klen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) ||
+ __put_user(klen, koptlen))
+ return -EFAULT;
+
+ /* have to allow space for previous compat_alloc_user_space, too */
+ kgf = compat_alloc_user_space(klen+sizeof(*optlen));
+
+ if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+ __get_user(interface, &gf32->gf_interface) ||
+ __get_user(fmode, &gf32->gf_fmode) ||
+ __get_user(numsrc, &gf32->gf_numsrc) ||
+ __put_user(interface, &kgf->gf_interface) ||
+ __put_user(fmode, &kgf->gf_fmode) ||
+ __put_user(numsrc, &kgf->gf_numsrc) ||
+ copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group)))
+ return -EFAULT;
+
+ err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen);
+ if (err)
+ return err;
+
+ if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) ||
+ __get_user(klen, koptlen))
+ return -EFAULT;
+
+ ulen = klen - (sizeof(*kgf)-sizeof(*gf32));
+
+ if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) ||
+ __put_user(ulen, optlen))
+ return -EFAULT;
+
+ if (!access_ok(VERIFY_READ, kgf, klen) ||
+ !access_ok(VERIFY_WRITE, gf32, ulen) ||
+ __get_user(interface, &kgf->gf_interface) ||
+ __get_user(fmode, &kgf->gf_fmode) ||
+ __get_user(numsrc, &kgf->gf_numsrc) ||
+ __put_user(interface, &gf32->gf_interface) ||
+ __put_user(fmode, &gf32->gf_fmode) ||
+ __put_user(numsrc, &gf32->gf_numsrc))
+ return -EFAULT;
+ if (numsrc) {
+ int copylen;
+
+ klen -= GROUP_FILTER_SIZE(0);
+ copylen = numsrc * sizeof(gf32->gf_slist[0]);
+ if (copylen > klen)
+ copylen = klen;
+ if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
+ return -EFAULT;
+ }
+ return err;
+}
+EXPORT_SYMBOL(compat_mc_getsockopt);
+
+
+/* Argument list sizes for compat_sys_socketcall */
+#define AL(x) ((x) * sizeof(u32))
+static unsigned char nas[21] = {
+ AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+ AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+ AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+ AL(4), AL(5), AL(4)
+};
+#undef AL
+
+COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags)
+{
+ return __sys_sendmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+}
+
+COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags)
+{
+ return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT);
+}
+
+COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags)
+{
+ return __sys_recvmsg(fd, (struct user_msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
+}
+
+COMPAT_SYSCALL_DEFINE4(recv, int, fd, void __user *, buf, compat_size_t, len, unsigned int, flags)
+{
+ return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT);
+}
+
+COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len,
+ unsigned int, flags, struct sockaddr __user *, addr,
+ int __user *, addrlen)
+{
+ return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
+}
+
+COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct compat_timespec __user *, timeout)
+{
+ int datagrams;
+ struct timespec ktspec;
+
+ if (timeout == NULL)
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, NULL);
+
+ if (compat_get_timespec(&ktspec, timeout))
+ return -EFAULT;
+
+ datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+ flags | MSG_CMSG_COMPAT, &ktspec);
+ if (datagrams > 0 && compat_put_timespec(&ktspec, timeout))
+ datagrams = -EFAULT;
+
+ return datagrams;
+}
+
+COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
+{
+ int ret;
+ u32 a[6];
+ u32 a0, a1;
+
+ if (call < SYS_SOCKET || call > SYS_SENDMMSG)
+ return -EINVAL;
+ if (copy_from_user(a, args, nas[call]))
+ return -EFAULT;
+ a0 = a[0];
+ a1 = a[1];
+
+ switch (call) {
+ case SYS_SOCKET:
+ ret = sys_socket(a0, a1, a[2]);
+ break;
+ case SYS_BIND:
+ ret = sys_bind(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_CONNECT:
+ ret = sys_connect(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_LISTEN:
+ ret = sys_listen(a0, a1);
+ break;
+ case SYS_ACCEPT:
+ ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
+ break;
+ case SYS_GETSOCKNAME:
+ ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
+ break;
+ case SYS_GETPEERNAME:
+ ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
+ break;
+ case SYS_SOCKETPAIR:
+ ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
+ break;
+ case SYS_SEND:
+ ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
+ break;
+ case SYS_SENDTO:
+ ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
+ break;
+ case SYS_RECV:
+ ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]);
+ break;
+ case SYS_RECVFROM:
+ ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
+ compat_ptr(a[4]), compat_ptr(a[5]));
+ break;
+ case SYS_SHUTDOWN:
+ ret = sys_shutdown(a0, a1);
+ break;
+ case SYS_SETSOCKOPT:
+ ret = compat_sys_setsockopt(a0, a1, a[2],
+ compat_ptr(a[3]), a[4]);
+ break;
+ case SYS_GETSOCKOPT:
+ ret = compat_sys_getsockopt(a0, a1, a[2],
+ compat_ptr(a[3]), compat_ptr(a[4]));
+ break;
+ case SYS_SENDMSG:
+ ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_SENDMMSG:
+ ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
+ break;
+ case SYS_RECVMSG:
+ ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
+ break;
+ case SYS_RECVMMSG:
+ ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+ compat_ptr(a[4]));
+ break;
+ case SYS_ACCEPT4:
+ ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
diff --git a/net/core/Makefile b/net/core/Makefile
new file mode 100644
index 000000000..fec0856dd
--- /dev/null
+++ b/net/core/Makefile
@@ -0,0 +1,25 @@
+#
+# Makefile for the Linux networking core.
+#
+
+obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
+
+obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o dev_ioctl.o tso.o
+
+obj-$(CONFIG_XFRM) += flow.o
+obj-y += net-sysfs.o
+obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
+obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
+obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
new file mode 100644
index 000000000..b80fb91bb
--- /dev/null
+++ b/net/core/datagram.c
@@ -0,0 +1,753 @@
+/*
+ * SUCS NET3:
+ *
+ * Generic datagram handling routines. These are generic for all
+ * protocols. Possibly a generic IP version on top of these would
+ * make sense. Not tonight however 8-).
+ * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
+ * NetROM layer all have identical poll code and mostly
+ * identical recvmsg() code. So we share it here. The poll was
+ * shared before but buried in udp.c so I moved it.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
+ * udp.c code)
+ *
+ * Fixes:
+ * Alan Cox : NULL return from skb_peek_copy()
+ * understood
+ * Alan Cox : Rewrote skb_read_datagram to avoid the
+ * skb_peek_copy stuff.
+ * Alan Cox : Added support for SOCK_SEQPACKET.
+ * IPX can no longer use the SO_TYPE hack
+ * but AX.25 now works right, and SPX is
+ * feasible.
+ * Alan Cox : Fixed write poll of non IP protocol
+ * crash.
+ * Florian La Roche: Changed for my new skbuff handling.
+ * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
+ * Linus Torvalds : BSD semantic fixes.
+ * Alan Cox : Datagram iovec handling
+ * Darryl Miles : Fixed non-blocking SOCK_STREAM.
+ * Alan Cox : POSIXisms
+ * Pete Wyckoff : Unconnected accept() fix.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+
+/*
+ * Is a socket 'connection oriented' ?
+ */
+static inline int connection_based(struct sock *sk)
+{
+ return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
+}
+
+static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+ void *key)
+{
+ unsigned long bits = (unsigned long)key;
+
+ /*
+ * Avoid a wakeup if event not interesting for us
+ */
+ if (bits && !(bits & (POLLIN | POLLERR)))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+/*
+ * Wait for the last received packet to be different from skb
+ */
+static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
+{
+ int error;
+ DEFINE_WAIT_FUNC(wait, receiver_wake_function);
+
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ /* Socket errors? */
+ error = sock_error(sk);
+ if (error)
+ goto out_err;
+
+ if (sk->sk_receive_queue.prev != skb)
+ goto out;
+
+ /* Socket shut down? */
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out_noerr;
+
+ /* Sequenced packets can come disconnected.
+ * If so we report the problem
+ */
+ error = -ENOTCONN;
+ if (connection_based(sk) &&
+ !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
+ goto out_err;
+
+ /* handle signals */
+ if (signal_pending(current))
+ goto interrupted;
+
+ error = 0;
+ *timeo_p = schedule_timeout(*timeo_p);
+out:
+ finish_wait(sk_sleep(sk), &wait);
+ return error;
+interrupted:
+ error = sock_intr_errno(*timeo_p);
+out_err:
+ *err = error;
+ goto out;
+out_noerr:
+ *err = 0;
+ error = 1;
+ goto out;
+}
+
+/**
+ * __skb_recv_datagram - Receive a datagram skbuff
+ * @sk: socket
+ * @flags: MSG_ flags
+ * @peeked: returns non-zero if this packet has been seen before
+ * @off: an offset in bytes to peek skb from. Returns an offset
+ * within an skb where data actually starts
+ * @err: error code returned
+ *
+ * Get a datagram skbuff, understands the peeking, nonblocking wakeups
+ * and possible races. This replaces identical code in packet, raw and
+ * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
+ * the long standing peek and read race for datagram sockets. If you
+ * alter this routine remember it must be re-entrant.
+ *
+ * This function will lock the socket if a skb is returned, so the caller
+ * needs to unlock the socket in that case (usually by calling
+ * skb_free_datagram)
+ *
+ * * It does not lock socket since today. This function is
+ * * free of race conditions. This measure should/can improve
+ * * significantly datagram socket latencies at high loads,
+ * * when data copying to user space takes lots of time.
+ * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ * * 8) Great win.)
+ * * --ANK (980729)
+ *
+ * The order of the tests when we find no data waiting are specified
+ * quite explicitly by POSIX 1003.1g, don't change them without having
+ * the standard around please.
+ */
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err)
+{
+ struct sk_buff *skb, *last;
+ long timeo;
+ /*
+ * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
+ */
+ int error = sock_error(sk);
+
+ if (error)
+ goto no_packet;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ /* Again only user level code calls this function, so nothing
+ * interrupt level will suddenly eat the receive_queue.
+ *
+ * Look at current nfs client by the way...
+ * However, this function was correct in any case. 8)
+ */
+ unsigned long cpu_flags;
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
+ int _off = *off;
+
+ last = (struct sk_buff *)queue;
+ spin_lock_irqsave(&queue->lock, cpu_flags);
+ skb_queue_walk(queue, skb) {
+ last = skb;
+ *peeked = skb->peeked;
+ if (flags & MSG_PEEK) {
+ if (_off >= skb->len && (skb->len || _off ||
+ skb->peeked)) {
+ _off -= skb->len;
+ continue;
+ }
+ skb->peeked = 1;
+ atomic_inc(&skb->users);
+ } else
+ __skb_unlink(skb, queue);
+
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ *off = _off;
+ return skb;
+ }
+ spin_unlock_irqrestore(&queue->lock, cpu_flags);
+
+ if (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, flags & MSG_DONTWAIT))
+ continue;
+
+ /* User doesn't want to wait */
+ error = -EAGAIN;
+ if (!timeo)
+ goto no_packet;
+
+ } while (!wait_for_more_packets(sk, err, &timeo, last));
+
+ return NULL;
+
+no_packet:
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int noblock, int *err)
+{
+ int peeked, off = 0;
+
+ return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &off, err);
+}
+EXPORT_SYMBOL(skb_recv_datagram);
+
+void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
+{
+ consume_skb(skb);
+ sk_mem_reclaim_partial(sk);
+}
+EXPORT_SYMBOL(skb_free_datagram);
+
+void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+{
+ bool slow;
+
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+
+ slow = lock_sock_fast(sk);
+ skb_orphan(skb);
+ sk_mem_reclaim_partial(sk);
+ unlock_sock_fast(sk, slow);
+
+ /* skb is now orphaned, can be freed outside of locked section */
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(skb_free_datagram_locked);
+
+/**
+ * skb_kill_datagram - Free a datagram skbuff forcibly
+ * @sk: socket
+ * @skb: datagram skbuff
+ * @flags: MSG_ flags
+ *
+ * This function frees a datagram skbuff that was received by
+ * skb_recv_datagram. The flags argument must match the one
+ * used for skb_recv_datagram.
+ *
+ * If the MSG_PEEK flag is set, and the packet is still on the
+ * receive queue of the socket, it will be taken off the queue
+ * before it is freed.
+ *
+ * This function currently only disables BH when acquiring the
+ * sk_receive_queue lock. Therefore it must not be used in a
+ * context where that lock is acquired in an IRQ context.
+ *
+ * It returns 0 if the packet was removed by us.
+ */
+
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ err = 0;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ kfree_skb(skb);
+ atomic_inc(&sk->sk_drops);
+ sk_mem_reclaim_partial(sk);
+
+ return err;
+}
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ trace_skb_copy_datagram_iovec(skb, len);
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (copy_to_iter(skb->data + offset, copy, to) != copy)
+ goto short_copy;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (copy_page_to_iter(skb_frag_page(frag),
+ frag->page_offset + offset -
+ start, copy, to) != copy)
+ goto short_copy;
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_iter(frag_iter, offset - start,
+ to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+ /* This is not really a user copy fault, but rather someone
+ * gave us a bogus length on the skb. We should probably
+ * print a warning here as it may indicate a kernel bug.
+ */
+
+fault:
+ return -EFAULT;
+
+short_copy:
+ if (iov_iter_count(to))
+ goto fault;
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
+ * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying to
+ * @from: the copy source
+ * @len: amount of data to copy to buffer from iovec
+ *
+ * Returns 0 or -EFAULT.
+ */
+int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
+ struct iov_iter *from,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (copy_from_iter(skb->data + offset, copy, from) != copy)
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ size_t copied;
+
+ if (copy > len)
+ copy = len;
+ copied = copy_page_from_iter(skb_frag_page(frag),
+ frag->page_offset + offset - start,
+ copy, from);
+ if (copied != copy)
+ goto fault;
+
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_from_iter(frag_iter,
+ offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter);
+
+/**
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int len = iov_iter_count(from);
+ int copy = min_t(int, skb_headlen(skb), len);
+ int frag = 0;
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ while (iov_iter_count(from)) {
+ struct page *pages[MAX_SKB_FRAGS];
+ size_t start;
+ ssize_t copied;
+ unsigned long truesize;
+ int n = 0;
+
+ if (frag == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ copied = iov_iter_get_pages(from, pages, ~0U,
+ MAX_SKB_FRAGS - frag, &start);
+ if (copied < 0)
+ return -EFAULT;
+
+ iov_iter_advance(from, copied);
+
+ truesize = PAGE_ALIGN(copied + start);
+ skb->data_len += copied;
+ skb->len += copied;
+ skb->truesize += truesize;
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
+ while (copied) {
+ int size = min_t(int, copied, PAGE_SIZE - start);
+ skb_fill_page_desc(skb, frag++, pages[n], start, size);
+ start = 0;
+ copied -= size;
+ n++;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(zerocopy_sg_from_iter);
+
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len,
+ __wsum *csump)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+ int n;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
+ if (n != copy)
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0;
+ struct page *page = skb_frag_page(frag);
+ u8 *vaddr = kmap(page);
+
+ if (copy > len)
+ copy = len;
+ n = csum_and_copy_to_iter(vaddr + frag->page_offset +
+ offset - start, copy,
+ &csum2, to);
+ kunmap(page);
+ if (n != copy)
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2 = 0;
+ if (copy > len)
+ copy = len;
+ if (skb_copy_and_csum_datagram(frag_iter,
+ offset - start,
+ to, copy,
+ &csum2))
+ goto fault;
+ *csump = csum_block_add(*csump, csum2, pos);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+ skb->csum_valid = !sum;
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
+
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ /* skb->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(skb->csum, csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
+/**
+ * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
+ * @skb: skbuff
+ * @hlen: hardware length
+ * @msg: destination
+ *
+ * Caller _must_ check that skb will fit to this iovec.
+ *
+ * Returns: 0 - success.
+ * -EINVAL - checksum failure.
+ * -EFAULT - fault during copy.
+ */
+int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
+ int hlen, struct msghdr *msg)
+{
+ __wsum csum;
+ int chunk = skb->len - hlen;
+
+ if (!chunk)
+ return 0;
+
+ if (msg_data_left(msg) < chunk) {
+ if (__skb_checksum_complete(skb))
+ goto csum_error;
+ if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
+ goto fault;
+ } else {
+ csum = csum_partial(skb->data, hlen, skb->csum);
+ if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
+ chunk, &csum))
+ goto fault;
+ if (csum_fold(csum))
+ goto csum_error;
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+ netdev_rx_csum_fault(skb->dev);
+ }
+ return 0;
+csum_error:
+ return -EINVAL;
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
+
+/**
+ * datagram_poll - generic datagram poll
+ * @file: file struct
+ * @sock: socket
+ * @wait: poll table
+ *
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you _don't_ use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ */
+unsigned int datagram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (connection_based(sk)) {
+ if (sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* writable? */
+ if (sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ return mask;
+}
+EXPORT_SYMBOL(datagram_poll);
diff --git a/net/core/dev.c b/net/core/dev.c
new file mode 100644
index 000000000..aa82f9ab6
--- /dev/null
+++ b/net/core/dev.c
@@ -0,0 +1,7523 @@
+/*
+ * NET3 Protocol independent device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the non IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Florian la Roche <rzsfl@rz.uni-sb.de>
+ * Alan Cox <gw4pts@gw4pts.ampr.org>
+ * David Hinds <dahinds@users.sourceforge.net>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Adam Sulmicki <adam@cfar.umd.edu>
+ * Pekka Riikonen <priikone@poesidon.pspt.fi>
+ *
+ * Changes:
+ * D.J. Barrow : Fixed bug where dev->refcnt gets set
+ * to 2 if register_netdev gets called
+ * before net_dev_init & also removed a
+ * few lines of code in the process.
+ * Alan Cox : device private ioctl copies fields back.
+ * Alan Cox : Transmit queue code does relevant
+ * stunts to keep the queue safe.
+ * Alan Cox : Fixed double lock.
+ * Alan Cox : Fixed promisc NULL pointer trap
+ * ???????? : Support the full private ioctl range
+ * Alan Cox : Moved ioctl permission check into
+ * drivers
+ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
+ * Alan Cox : 100 backlog just doesn't cut it when
+ * you start doing multicast video 8)
+ * Alan Cox : Rewrote net_bh and list manager.
+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
+ * Alan Cox : Took out transmit every packet pass
+ * Saved a few bytes in the ioctl handler
+ * Alan Cox : Network driver sets packet type before
+ * calling netif_rx. Saves a function
+ * call a packet.
+ * Alan Cox : Hashed net_bh()
+ * Richard Kooijman: Timestamp fixes.
+ * Alan Cox : Wrong field in SIOCGIFDSTADDR
+ * Alan Cox : Device lock protection.
+ * Alan Cox : Fixed nasty side effect of device close
+ * changes.
+ * Rudi Cilibrasi : Pass the right thing to
+ * set_mac_address()
+ * Dave Miller : 32bit quantity for the device lock to
+ * make it work out on a Sparc.
+ * Bjorn Ekwall : Added KERNELD hack.
+ * Alan Cox : Cleaned up the backlog initialise.
+ * Craig Metz : SIOCGIFCONF fix if space for under
+ * 1 device.
+ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
+ * is no device open function.
+ * Andi Kleen : Fix error reporting for SIOCGIFCONF
+ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
+ * Cyrus Durgin : Cleaned for KMOD
+ * Adam Sulmicki : Bug Fix : Network Device Unload
+ * A network device unload needs to purge
+ * the backlog queue.
+ * Paul Rusty Russell : SIOCSIFNAME
+ * Pekka Riikonen : Netdev boot-time settings code
+ * Andrew Morton : Make unregister_netdevice wait
+ * indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
+ * - netif_rx() feedback
+ */
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/notifier.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+#include <linux/stat.h>
+#include <net/dst.h>
+#include <net/pkt_sched.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+#include <net/iw_handler.h>
+#include <asm/current.h>
+#include <linux/audit.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/mpls.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
+#include <linux/pci.h>
+#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+#include <linux/static_key.h>
+#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
+#include <linux/if_macvlan.h>
+#include <linux/errqueue.h>
+#include <linux/hrtimer.h>
+
+#include "net-sysfs.h"
+
+/* Instead of increasing this, you should create a hash table. */
+#define MAX_GRO_SKBS 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
+static DEFINE_SPINLOCK(ptype_lock);
+static DEFINE_SPINLOCK(offload_lock);
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly; /* Taps */
+static struct list_head offload_base __read_mostly;
+
+static int netif_rx_internal(struct sk_buff *skb);
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info);
+
+/*
+ * The @dev_base_head list is protected by @dev_base_lock and the rtnl
+ * semaphore.
+ *
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
+ *
+ * Writers must hold the rtnl semaphore while they loop through the
+ * dev_base_head list, and hold dev_base_lock for writing when they do the
+ * actual updates. This allows pure readers to access the list even
+ * while a writer is preparing to update it.
+ *
+ * To put it another way, dev_base_lock is held for writing only to
+ * protect against pure readers; the rtnl semaphore provides the
+ * protection against other writers.
+ *
+ * See, for example usages, register_netdevice() and
+ * unregister_netdevice(), which must be called with the rtnl
+ * semaphore held.
+ */
+DEFINE_RWLOCK(dev_base_lock);
+EXPORT_SYMBOL(dev_base_lock);
+
+/* protects napi_hash addition/deletion and napi_gen_id */
+static DEFINE_SPINLOCK(napi_hash_lock);
+
+static unsigned int napi_gen_id;
+static DEFINE_HASHTABLE(napi_hash, 8);
+
+static seqcount_t devnet_rename_seq;
+
+static inline void dev_base_seq_inc(struct net *net)
+{
+ while (++net->dev_base_seq == 0);
+}
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+ unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+
+ return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+ return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+}
+
+static inline void rps_lock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_lock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+static inline void rps_unlock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ spin_unlock(&sd->input_pkt_queue.lock);
+#endif
+}
+
+/* Device list insertion */
+static void list_netdevice(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&dev_base_lock);
+ list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(net);
+}
+
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ /* Unlink dev from the device chain */
+ write_lock_bh(&dev_base_lock);
+ list_del_rcu(&dev->dev_list);
+ hlist_del_rcu(&dev->name_hlist);
+ hlist_del_rcu(&dev->index_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ dev_base_seq_inc(dev_net(dev));
+}
+
+/*
+ * Our notifier list
+ */
+
+static RAW_NOTIFIER_HEAD(netdev_chain);
+
+/*
+ * Device drivers call our routines to queue packets here. We empty the
+ * queue in the local softnet handler.
+ */
+
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+EXPORT_PER_CPU_SYMBOL(softnet_data);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] =
+ {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] =
+ {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
+
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+ int i;
+
+ i = netdev_lock_pos(dev->type);
+ lockdep_set_class_and_name(&dev->addr_list_lock,
+ &netdev_addr_lock_key[i],
+ netdev_lock_name[i]);
+}
+#else
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif
+
+/*******************************************************************************
+
+ Protocol management and registration routines
+
+*******************************************************************************/
+
+/*
+ * Add a protocol ID to the list. Now that the input handler is
+ * smarter we can dispense with all the messy stuff that used to be
+ * here.
+ *
+ * BEWARE!!! Protocol handlers, mangling input packets,
+ * MUST BE last in hash buckets and checking protocol handlers
+ * MUST start from promiscuous ptype_all chain in net_bh.
+ * It is true now, do not change it.
+ * Explanation follows: if protocol handler, mangling packet, will
+ * be the first on list, it is not able to sense, that packet
+ * is cloned and should be copied-on-write, so that it will
+ * change it and subsequent readers will get broken packet.
+ * --ANK (980803)
+ */
+
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+ if (pt->type == htons(ETH_P_ALL))
+ return pt->dev ? &pt->dev->ptype_all : &ptype_all;
+ else
+ return pt->dev ? &pt->dev->ptype_specific :
+ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
+/**
+ * dev_add_pack - add packet handler
+ * @pt: packet type declaration
+ *
+ * Add a protocol handler to the networking stack. The passed &packet_type
+ * is linked into kernel lists and may not be freed until it has been
+ * removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new packet type (until the next received packet).
+ */
+
+void dev_add_pack(struct packet_type *pt)
+{
+ struct list_head *head = ptype_head(pt);
+
+ spin_lock(&ptype_lock);
+ list_add_rcu(&pt->list, head);
+ spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(dev_add_pack);
+
+/**
+ * __dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ * from the kernel lists and can be freed or reused once this function
+ * returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+void __dev_remove_pack(struct packet_type *pt)
+{
+ struct list_head *head = ptype_head(pt);
+ struct packet_type *pt1;
+
+ spin_lock(&ptype_lock);
+
+ list_for_each_entry(pt1, head, list) {
+ if (pt == pt1) {
+ list_del_rcu(&pt->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_pack: %p not found\n", pt);
+out:
+ spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(__dev_remove_pack);
+
+/**
+ * dev_remove_pack - remove packet handler
+ * @pt: packet type declaration
+ *
+ * Remove a protocol handler that was previously added to the kernel
+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ * from the kernel lists and can be freed or reused once this function
+ * returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_pack(struct packet_type *pt)
+{
+ __dev_remove_pack(pt);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_pack);
+
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+
+ spin_lock(&offload_lock);
+ list_add_rcu(&po->list, head);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+/******************************************************************************
+
+ Device Boot-time Settings Routines
+
+*******************************************************************************/
+
+/* Boot time configuration table */
+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+
+/**
+ * netdev_boot_setup_add - add new setup entry
+ * @name: name of the device
+ * @map: configured settings for the device
+ *
+ * Adds new setup entry to the dev_boot_setup list. The function
+ * returns 0 on error and 1 on success. This is a generic routine to
+ * all netdevices.
+ */
+static int netdev_boot_setup_add(char *name, struct ifmap *map)
+{
+ struct netdev_boot_setup *s;
+ int i;
+
+ s = dev_boot_setup;
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
+ memset(s[i].name, 0, sizeof(s[i].name));
+ strlcpy(s[i].name, name, IFNAMSIZ);
+ memcpy(&s[i].map, map, sizeof(s[i].map));
+ break;
+ }
+ }
+
+ return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
+}
+
+/**
+ * netdev_boot_setup_check - check boot time settings
+ * @dev: the netdevice
+ *
+ * Check boot time settings for the device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found, 1 if they are.
+ */
+int netdev_boot_setup_check(struct net_device *dev)
+{
+ struct netdev_boot_setup *s = dev_boot_setup;
+ int i;
+
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
+ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
+ !strcmp(dev->name, s[i].name)) {
+ dev->irq = s[i].map.irq;
+ dev->base_addr = s[i].map.base_addr;
+ dev->mem_start = s[i].map.mem_start;
+ dev->mem_end = s[i].map.mem_end;
+ return 1;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netdev_boot_setup_check);
+
+
+/**
+ * netdev_boot_base - get address from boot time settings
+ * @prefix: prefix for network device
+ * @unit: id for network device
+ *
+ * Check boot time settings for the base address of device.
+ * The found settings are set for the device to be used
+ * later in the device probing.
+ * Returns 0 if no settings found.
+ */
+unsigned long netdev_boot_base(const char *prefix, int unit)
+{
+ const struct netdev_boot_setup *s = dev_boot_setup;
+ char name[IFNAMSIZ];
+ int i;
+
+ sprintf(name, "%s%d", prefix, unit);
+
+ /*
+ * If device already registered then return base of 1
+ * to indicate not to probe for this interface
+ */
+ if (__dev_get_by_name(&init_net, name))
+ return 1;
+
+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
+ if (!strcmp(name, s[i].name))
+ return s[i].map.base_addr;
+ return 0;
+}
+
+/*
+ * Saves at boot time configured settings for any netdevice.
+ */
+int __init netdev_boot_setup(char *str)
+{
+ int ints[5];
+ struct ifmap map;
+
+ str = get_options(str, ARRAY_SIZE(ints), ints);
+ if (!str || !*str)
+ return 0;
+
+ /* Save settings */
+ memset(&map, 0, sizeof(map));
+ if (ints[0] > 0)
+ map.irq = ints[1];
+ if (ints[0] > 1)
+ map.base_addr = ints[2];
+ if (ints[0] > 2)
+ map.mem_start = ints[3];
+ if (ints[0] > 3)
+ map.mem_end = ints[4];
+
+ /* Add new entry to the list */
+ return netdev_boot_setup_add(str, &map);
+}
+
+__setup("netdev=", netdev_boot_setup);
+
+/*******************************************************************************
+
+ Device Interface Subroutines
+
+*******************************************************************************/
+
+/**
+ * dev_get_iflink - get 'iflink' value of a interface
+ * @dev: targeted interface
+ *
+ * Indicates the ifindex the interface is linked to.
+ * Physical interfaces have the same 'ifindex' and 'iflink' values.
+ */
+
+int dev_get_iflink(const struct net_device *dev)
+{
+ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
+ return dev->netdev_ops->ndo_get_iflink(dev);
+
+ /* If dev->rtnl_link_ops is set, it's a virtual interface. */
+ if (dev->rtnl_link_ops)
+ return 0;
+
+ return dev->ifindex;
+}
+EXPORT_SYMBOL(dev_get_iflink);
+
+/**
+ * __dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. Must be called under RTNL semaphore
+ * or @dev_base_lock. If the name is found a pointer to the device
+ * is returned. If the name is not found then %NULL is returned. The
+ * reference counters are not incremented so the caller must be
+ * careful with locks.
+ */
+
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
+
+ hlist_for_each_entry(dev, head, name_hlist)
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_name);
+
+/**
+ * dev_get_by_name_rcu - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned.
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_name_hash(net, name);
+
+ hlist_for_each_entry_rcu(dev, head, name_hlist)
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
+/**
+ * dev_get_by_name - find a device by its name
+ * @net: the applicable net namespace
+ * @name: name to find
+ *
+ * Find an interface by name. This can be called from any
+ * context and does its own locking. The returned handle has
+ * the usage count incremented and the caller must use dev_put() to
+ * release it when it is no longer needed. %NULL is returned if no
+ * matching device is found.
+ */
+
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
+/**
+ * __dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold either the RTNL semaphore
+ * or @dev_base_lock.
+ */
+
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_index);
+
+/**
+ * dev_get_by_index_rcu - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns %NULL if the device
+ * is not found or a pointer to the device. The device has not
+ * had its reference counter increased so the caller must be careful
+ * about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct hlist_head *head = dev_index_hash(net, ifindex);
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
+ if (dev->ifindex == ifindex)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
+
+/**
+ * dev_get_by_index - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. Returns NULL if the device
+ * is not found or a pointer to the device. The device returned has
+ * had a reference added and the pointer is safe until the user calls
+ * dev_put to indicate they have finished with it.
+ */
+
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
+
+/**
+ * netdev_get_name - get a netdevice name, knowing its ifindex.
+ * @net: network namespace
+ * @name: a pointer to the buffer where the name will be stored.
+ * @ifindex: the ifindex of the interface to get the name from.
+ *
+ * The use of raw_seqcount_begin() and cond_resched() before
+ * retrying is required as we want to give the writers a chance
+ * to complete when CONFIG_PREEMPT is not set.
+ */
+int netdev_get_name(struct net *net, char *name, int ifindex)
+{
+ struct net_device *dev;
+ unsigned int seq;
+
+retry:
+ seq = raw_seqcount_begin(&devnet_rename_seq);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ strcpy(name, dev->name);
+ rcu_read_unlock();
+ if (read_seqcount_retry(&devnet_rename_seq, seq)) {
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_getbyhwaddr_rcu - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Search for an interface by MAC address. Returns NULL if the device
+ * is not found or a pointer to the device.
+ * The caller must hold RCU or RTNL.
+ * The returned device has not had its ref count increased
+ * and the caller must therefore be careful about locking
+ *
+ */
+
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ for_each_netdev_rcu(net, dev)
+ if (dev->type == type &&
+ !memcmp(dev->dev_addr, ha, dev->addr_len))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+
+struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev->type == type)
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+ struct net_device *dev, *ret = NULL;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev)
+ if (dev->type == type) {
+ dev_hold(dev);
+ ret = dev;
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(dev_getfirstbyhwtype);
+
+/**
+ * __dev_get_by_flags - find any device with given flags
+ * @net: the applicable net namespace
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
+ *
+ * Search for any interface with the given flags. Returns NULL if a device
+ * is not found or a pointer to the device. Must be called inside
+ * rtnl_lock(), and result refcount is unchanged.
+ */
+
+struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
+ unsigned short mask)
+{
+ struct net_device *dev, *ret;
+
+ ASSERT_RTNL();
+
+ ret = NULL;
+ for_each_netdev(net, dev) {
+ if (((dev->flags ^ if_flags) & mask) == 0) {
+ ret = dev;
+ break;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__dev_get_by_flags);
+
+/**
+ * dev_valid_name - check if name is okay for network device
+ * @name: name string
+ *
+ * Network device names need to be valid file names to
+ * to allow sysfs to work. We also disallow any kind of
+ * whitespace.
+ */
+bool dev_valid_name(const char *name)
+{
+ if (*name == '\0')
+ return false;
+ if (strlen(name) >= IFNAMSIZ)
+ return false;
+ if (!strcmp(name, ".") || !strcmp(name, ".."))
+ return false;
+
+ while (*name) {
+ if (*name == '/' || *name == ':' || isspace(*name))
+ return false;
+ name++;
+ }
+ return true;
+}
+EXPORT_SYMBOL(dev_valid_name);
+
+/**
+ * __dev_alloc_name - allocate a name for a device
+ * @net: network namespace to allocate the device name in
+ * @name: name format string
+ * @buf: scratch buffer and result name string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+{
+ int i = 0;
+ const char *p;
+ const int max_netdevices = 8*PAGE_SIZE;
+ unsigned long *inuse;
+ struct net_device *d;
+
+ p = strnchr(name, IFNAMSIZ-1, '%');
+ if (p) {
+ /*
+ * Verify the string as this thing may have come from
+ * the user. There must be either one "%d" and no other "%"
+ * characters.
+ */
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ /* Use one page as a bit array of possible slots */
+ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
+
+ for_each_netdev(net, d) {
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, max_netdevices);
+ free_page((unsigned long) inuse);
+ }
+
+ if (buf != name)
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!__dev_get_by_name(net, buf))
+ return i;
+
+ /* It is possible to run out of possible slots
+ * when the name is long and there isn't enough space left
+ * for the digits, or if all bits are used.
+ */
+ return -ENFILE;
+}
+
+/**
+ * dev_alloc_name - allocate a name for a device
+ * @dev: device
+ * @name: name format string
+ *
+ * Passed a format string - eg "lt%d" it will try and find a suitable
+ * id. It scans list of devices to build up a free map, then chooses
+ * the first empty slot. The caller must hold the dev_base or rtnl lock
+ * while allocating the name and adding the device in order to avoid
+ * duplicates.
+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ * Returns the number of the unit assigned or a negative errno code.
+ */
+
+int dev_alloc_name(struct net_device *dev, const char *name)
+{
+ char buf[IFNAMSIZ];
+ struct net *net;
+ int ret;
+
+ BUG_ON(!dev_net(dev));
+ net = dev_net(dev);
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+EXPORT_SYMBOL(dev_alloc_name);
+
+static int dev_alloc_name_ns(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ char buf[IFNAMSIZ];
+ int ret;
+
+ ret = __dev_alloc_name(net, name, buf);
+ if (ret >= 0)
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return ret;
+}
+
+static int dev_get_valid_name(struct net *net,
+ struct net_device *dev,
+ const char *name)
+{
+ BUG_ON(!net);
+
+ if (!dev_valid_name(name))
+ return -EINVAL;
+
+ if (strchr(name, '%'))
+ return dev_alloc_name_ns(net, dev, name);
+ else if (__dev_get_by_name(net, name))
+ return -EEXIST;
+ else if (dev->name != name)
+ strlcpy(dev->name, name, IFNAMSIZ);
+
+ return 0;
+}
+
+/**
+ * dev_change_name - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change name of a device, can pass format strings "eth%d".
+ * for wildcarding.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ unsigned char old_assign_type;
+ char oldname[IFNAMSIZ];
+ int err = 0;
+ int ret;
+ struct net *net;
+
+ ASSERT_RTNL();
+ BUG_ON(!dev_net(dev));
+
+ net = dev_net(dev);
+ if (dev->flags & IFF_UP)
+ return -EBUSY;
+
+ write_seqcount_begin(&devnet_rename_seq);
+
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
+ write_seqcount_end(&devnet_rename_seq);
+ return 0;
+ }
+
+ memcpy(oldname, dev->name, IFNAMSIZ);
+
+ err = dev_get_valid_name(net, dev, newname);
+ if (err < 0) {
+ write_seqcount_end(&devnet_rename_seq);
+ return err;
+ }
+
+ if (oldname[0] && !strchr(oldname, '%'))
+ netdev_info(dev, "renamed from %s\n", oldname);
+
+ old_assign_type = dev->name_assign_type;
+ dev->name_assign_type = NET_NAME_RENAMED;
+
+rollback:
+ ret = device_rename(&dev->dev, dev->name);
+ if (ret) {
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ write_seqcount_end(&devnet_rename_seq);
+ return ret;
+ }
+
+ write_seqcount_end(&devnet_rename_seq);
+
+ netdev_adjacent_rename_links(dev, oldname);
+
+ write_lock_bh(&dev_base_lock);
+ hlist_del_rcu(&dev->name_hlist);
+ write_unlock_bh(&dev_base_lock);
+
+ synchronize_rcu();
+
+ write_lock_bh(&dev_base_lock);
+ hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ write_unlock_bh(&dev_base_lock);
+
+ ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+ ret = notifier_to_errno(ret);
+
+ if (ret) {
+ /* err >= 0 after dev_alloc_name() or stores the first errno */
+ if (err >= 0) {
+ err = ret;
+ write_seqcount_begin(&devnet_rename_seq);
+ memcpy(dev->name, oldname, IFNAMSIZ);
+ memcpy(oldname, newname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ old_assign_type = NET_NAME_RENAMED;
+ goto rollback;
+ } else {
+ pr_err("%s: name change rollback failed: %d\n",
+ dev->name, ret);
+ }
+ }
+
+ return err;
+}
+
+/**
+ * dev_set_alias - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from info
+ *
+ * Set ifalias for a device,
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ char *new_ifalias;
+
+ ASSERT_RTNL();
+
+ if (len >= IFALIASZ)
+ return -EINVAL;
+
+ if (!len) {
+ kfree(dev->ifalias);
+ dev->ifalias = NULL;
+ return 0;
+ }
+
+ new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+ if (!new_ifalias)
+ return -ENOMEM;
+ dev->ifalias = new_ifalias;
+
+ strlcpy(dev->ifalias, alias, len+1);
+ return len;
+}
+
+
+/**
+ * netdev_features_change - device changes features
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+ call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL(netdev_features_change);
+
+/**
+ * netdev_state_change - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = 0;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL(netdev_state_change);
+
+/**
+ * netdev_notify_peers - notify network peers about existence of @dev
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netdev_notify_peers(struct net_device *dev)
+{
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(netdev_notify_peers);
+
+static int __dev_open(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
+ */
+ netpoll_poll_disable(dev);
+
+ ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ return ret;
+
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ if (ops->ndo_validate_addr)
+ ret = ops->ndo_validate_addr(dev);
+
+ if (!ret && ops->ndo_open)
+ ret = ops->ndo_open(dev);
+
+ netpoll_poll_enable(dev);
+
+ if (ret)
+ clear_bit(__LINK_STATE_START, &dev->state);
+ else {
+ dev->flags |= IFF_UP;
+ dev_set_rx_mode(dev);
+ dev_activate(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ }
+
+ return ret;
+}
+
+/**
+ * dev_open - prepare an interface for use.
+ * @dev: device to open
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ */
+int dev_open(struct net_device *dev)
+{
+ int ret;
+
+ if (dev->flags & IFF_UP)
+ return 0;
+
+ ret = __dev_open(dev);
+ if (ret < 0)
+ return ret;
+
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_UP, dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+static int __dev_close_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ might_sleep();
+
+ list_for_each_entry(dev, head, close_list) {
+ /* Temporarily disable netpoll until the interface is down */
+ netpoll_poll_disable(dev);
+
+ call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+ clear_bit(__LINK_STATE_START, &dev->state);
+
+ /* Synchronize to scheduled poll. We cannot touch poll list, it
+ * can be even on different cpu. So just clear netif_running().
+ *
+ * dev->stop() will invoke napi_disable() on all of it's
+ * napi_struct instances on this device.
+ */
+ smp_mb__after_atomic(); /* Commit netif_running(). */
+ }
+
+ dev_deactivate_many(head);
+
+ list_for_each_entry(dev, head, close_list) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /*
+ * Call the device specific close. This cannot fail.
+ * Only if device is UP
+ *
+ * We allow it to be called even after a DETACH hot-plug
+ * event.
+ */
+ if (ops->ndo_stop)
+ ops->ndo_stop(dev);
+
+ dev->flags &= ~IFF_UP;
+ netpoll_poll_enable(dev);
+ }
+
+ return 0;
+}
+
+static int __dev_close(struct net_device *dev)
+{
+ int retval;
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ retval = __dev_close_many(&single);
+ list_del(&single);
+
+ return retval;
+}
+
+int dev_close_many(struct list_head *head, bool unlink)
+{
+ struct net_device *dev, *tmp;
+
+ /* Remove the devices that don't need to be closed */
+ list_for_each_entry_safe(dev, tmp, head, close_list)
+ if (!(dev->flags & IFF_UP))
+ list_del_init(&dev->close_list);
+
+ __dev_close_many(head);
+
+ list_for_each_entry_safe(dev, tmp, head, close_list) {
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ if (unlink)
+ list_del_init(&dev->close_list);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(dev_close_many);
+
+/**
+ * dev_close - shutdown an interface.
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+int dev_close(struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ dev_close_many(&single, true);
+ list_del(&single);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(dev_close);
+
+
+/**
+ * dev_disable_lro - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
+
+ if (unlikely(dev->features & NETIF_F_LRO))
+ netdev_WARN(dev, "failed to disable LRO!\n");
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ dev_disable_lro(lower_dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
+ struct net_device *dev)
+{
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return nb->notifier_call(nb, val, &info);
+}
+
+static int dev_boot_phase = 1;
+
+/**
+ * register_netdevice_notifier - register a network notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier(struct notifier_block *nb)
+{
+ struct net_device *dev;
+ struct net_device *last;
+ struct net *net;
+ int err;
+
+ rtnl_lock();
+ err = raw_notifier_chain_register(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+ if (dev_boot_phase)
+ goto unlock;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ goto rollback;
+
+ if (!(dev->flags & IFF_UP))
+ continue;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ }
+ }
+
+unlock:
+ rtnl_unlock();
+ return err;
+
+rollback:
+ last = dev;
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev == last)
+ goto outroll;
+
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+
+outroll:
+ raw_notifier_chain_unregister(&netdev_chain, nb);
+ goto unlock;
+}
+EXPORT_SYMBOL(register_netdevice_notifier);
+
+/**
+ * unregister_netdevice_notifier - unregister a network notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier(struct notifier_block *nb)
+{
+ struct net_device *dev;
+ struct net *net;
+ int err;
+
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(&netdev_chain, nb);
+ if (err)
+ goto unlock;
+
+ for_each_net(net) {
+ for_each_netdev(net, dev) {
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+ }
+ }
+unlock:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+/**
+ * call_netdevice_notifiers_info - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+static int call_netdevice_notifiers_info(unsigned long val,
+ struct net_device *dev,
+ struct netdev_notifier_info *info)
+{
+ ASSERT_RTNL();
+ netdev_notifier_info_init(info, dev);
+ return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+/**
+ * call_netdevice_notifiers - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
+{
+ struct netdev_notifier_info info;
+
+ return call_netdevice_notifiers_info(val, dev, &info);
+}
+EXPORT_SYMBOL(call_netdevice_notifiers);
+
+#ifdef CONFIG_NET_CLS_ACT
+static struct static_key ingress_needed __read_mostly;
+
+void net_inc_ingress_queue(void)
+{
+ static_key_slow_inc(&ingress_needed);
+}
+EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
+
+void net_dec_ingress_queue(void)
+{
+ static_key_slow_dec(&ingress_needed);
+}
+EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
+#endif
+
+static struct static_key netstamp_needed __read_mostly;
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call static_key_slow_dec() from irq context
+ * If net_disable_timestamp() is called from irq context, defer the
+ * static_key_slow_dec() calls.
+ */
+static atomic_t netstamp_needed_deferred;
+#endif
+
+void net_enable_timestamp(void)
+{
+#ifdef HAVE_JUMP_LABEL
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+
+ if (deferred) {
+ while (--deferred)
+ static_key_slow_dec(&netstamp_needed);
+ return;
+ }
+#endif
+ static_key_slow_inc(&netstamp_needed);
+}
+EXPORT_SYMBOL(net_enable_timestamp);
+
+void net_disable_timestamp(void)
+{
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&netstamp_needed_deferred);
+ return;
+ }
+#endif
+ static_key_slow_dec(&netstamp_needed);
+}
+EXPORT_SYMBOL(net_disable_timestamp);
+
+static inline void net_timestamp_set(struct sk_buff *skb)
+{
+ skb->tstamp.tv64 = 0;
+ if (static_key_false(&netstamp_needed))
+ __net_timestamp(skb);
+}
+
+#define net_timestamp_check(COND, SKB) \
+ if (static_key_false(&netstamp_needed)) { \
+ if ((COND) && !(SKB)->tstamp.tv64) \
+ __net_timestamp(SKB); \
+ } \
+
+bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
+{
+ unsigned int len;
+
+ if (!(dev->flags & IFF_UP))
+ return false;
+
+ len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+ if (skb->len <= len)
+ return true;
+
+ /* if TSO is enabled, we don't care about the length as the packet
+ * could be forwarded without being segmented before
+ */
+ if (skb_is_gso(skb))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(is_skb_forwardable);
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ if (skb_orphan_frags(skb, GFP_ATOMIC) ||
+ unlikely(!is_skb_forwardable(dev, skb))) {
+ atomic_long_inc(&dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ skb_scrub_packet(skb, true);
+ skb->priority = 0;
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__dev_forward_skb);
+
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped, but freed)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
+static inline int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ return -ENOMEM;
+ atomic_inc(&skb->users);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+ struct packet_type **pt,
+ struct net_device *orig_dev,
+ __be16 type,
+ struct list_head *ptype_list)
+{
+ struct packet_type *ptype, *pt_prev = *pt;
+
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->type != type)
+ continue;
+ if (pt_prev)
+ deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ *pt = pt_prev;
+}
+
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+ if (!ptype->af_packet_priv || !skb->sk)
+ return false;
+
+ if (ptype->id_match)
+ return ptype->id_match(ptype, skb->sk);
+ else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+ return true;
+
+ return false;
+}
+
+/*
+ * Support routine. Sends outgoing frames to any network
+ * taps currently in use.
+ */
+
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct packet_type *ptype;
+ struct sk_buff *skb2 = NULL;
+ struct packet_type *pt_prev = NULL;
+ struct list_head *ptype_list = &ptype_all;
+
+ rcu_read_lock();
+again:
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ /* Never send packets back to the socket
+ * they originated from - MvS (miquels@drinkel.ow.org)
+ */
+ if (skb_loop_sk(ptype, skb))
+ continue;
+
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
+
+ /* need to clone skb, done only once */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out_unlock;
+
+ net_timestamp_set(skb2);
+
+ /* skb->nh should be correctly
+ * set by sender, so that the second statement is
+ * just protection against buggy protocols.
+ */
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
+ skb_reset_network_header(skb2);
+ }
+
+ skb2->transport_header = skb2->network_header;
+ skb2->pkt_type = PACKET_OUTGOING;
+ pt_prev = ptype;
+ }
+
+ if (ptype_list == &ptype_all) {
+ ptype_list = &dev->ptype_all;
+ goto again;
+ }
+out_unlock:
+ if (pt_prev)
+ pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
+ rcu_read_unlock();
+}
+
+/**
+ * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+ int i;
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+ /* If TC0 is invalidated disable TC mapping */
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ dev->num_tc = 0;
+ return;
+ }
+
+ /* Invalidated prio to tc mappings set to TC0 */
+ for (i = 1; i < TC_BITMASK + 1; i++) {
+ int q = netdev_get_prio_tc_map(dev, i);
+
+ tc = &dev->tc_to_txq[q];
+ if (tc->offset + tc->count > txq) {
+ pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
+ netdev_set_prio_tc_map(dev, i, 0);
+ }
+ }
+}
+
+#ifdef CONFIG_XPS
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int cpu, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ if (dev_maps)
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] == index) {
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ } else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ break;
+ }
+ }
+
+ return map;
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ struct xps_dev_maps *dev_maps;
+ int cpu, i;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = index; i < dev->num_tx_queues; i++) {
+ if (!remove_xps_queue(dev_maps, cpu, i))
+ break;
+ }
+ if (i == dev->num_tx_queues)
+ active = true;
+ }
+
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ for (i = index; i < dev->num_tx_queues; i++)
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map,
+ int cpu, u16 index)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add queue to this CPU's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store queue on this CPU's map */
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_map *map, *new_map;
+ int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
+ int cpu, numa_node_id = -2;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ /* allocate memory for queue storage */
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, mask))
+ continue;
+
+ if (!new_dev_maps)
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+
+ map = expand_xps_map(map, cpu, index);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
+ /* add queue to CPU maps */
+ int pos = 0;
+
+ map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
+#endif
+ } else if (dev_maps) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ }
+
+ rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+
+ /* Cleanup old maps */
+ if (dev_maps) {
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (map && map != new_map)
+ kfree_rcu(map, rcu);
+ }
+
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ? numa_node_id :
+ NUMA_NO_NODE);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes queue from unused CPUs */
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
+ continue;
+
+ if (remove_xps_queue(dev_maps, cpu, index))
+ active = true;
+ }
+
+ /* free map if not active */
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
+/*
+ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
+ * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ */
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+ int rc;
+
+ if (txq < 1 || txq > dev->num_tx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+
+ rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
+ txq);
+ if (rc)
+ return rc;
+
+ if (dev->num_tc)
+ netif_setup_tc(dev, txq);
+
+ if (txq < dev->real_num_tx_queues) {
+ qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
+ }
+
+ dev->real_num_tx_queues = txq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
+
+#ifdef CONFIG_SYSFS
+/**
+ * netif_set_real_num_rx_queues - set actual number of RX queues used
+ * @dev: Network device
+ * @rxq: Actual number of RX queues
+ *
+ * This must be called either with the rtnl_lock held or before
+ * registration of the net device. Returns 0 on success, or a
+ * negative error code. If called before registration, it always
+ * succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+ int rc;
+
+ if (rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED) {
+ ASSERT_RTNL();
+
+ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+ rxq);
+ if (rc)
+ return rc;
+ }
+
+ dev->real_num_rx_queues = rxq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
+/**
+ * netif_get_num_default_rss_queues - default number of RSS queues
+ *
+ * This routine should set an upper limit on the number of RSS queues
+ * used by default by multiqueue devices.
+ */
+int netif_get_num_default_rss_queues(void)
+{
+ return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+}
+EXPORT_SYMBOL(netif_get_num_default_rss_queues);
+
+static inline void __netif_reschedule(struct Qdisc *q)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sd = this_cpu_ptr(&softnet_data);
+ q->next_sched = NULL;
+ *sd->output_queue_tailp = q;
+ sd->output_queue_tailp = &q->next_sched;
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+ if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+ __netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+struct dev_kfree_skb_cb {
+ enum skb_free_reason reason;
+};
+
+static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+ return (struct dev_kfree_skb_cb *)skb->cb;
+}
+
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+ rcu_read_lock();
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+/**
+ * netif_wake_subqueue - allow sending packets on subqueue
+ * @dev: network device
+ * @queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(txq->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_wake_subqueue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q;
+
+ rcu_read_lock();
+ q = rcu_dereference(dev_queue->qdisc);
+ __netif_schedule(q);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
+void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ unsigned long flags;
+
+ if (likely(atomic_read(&skb->users) == 1)) {
+ smp_rmb();
+ atomic_set(&skb->users, 0);
+ } else if (likely(!atomic_dec_and_test(&skb->users))) {
+ return;
+ }
+ get_kfree_skb_cb(skb)->reason = reason;
+ local_irq_save(flags);
+ skb->next = __this_cpu_read(softnet_data.completion_queue);
+ __this_cpu_write(softnet_data.completion_queue, skb);
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_irq);
+
+void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+{
+ if (in_irq() || irqs_disabled())
+ __dev_kfree_skb_irq(skb, reason);
+ else
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(__dev_kfree_skb_any);
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Mark device as removed from system and therefore no longer available.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_stop_all_queues(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Mark device as attached from system and restart if needed.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
+ netif_running(dev)) {
+ netif_tx_wake_all_queues(dev);
+ __netdev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+static void skb_warn_bad_offload(const struct sk_buff *skb)
+{
+ static const netdev_features_t null_features = 0;
+ struct net_device *dev = skb->dev;
+ const char *driver = "";
+
+ if (!net_ratelimit())
+ return;
+
+ if (dev && dev->dev.parent)
+ driver = dev_driver_string(dev->dev.parent);
+
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
+ "gso_type=%d ip_summed=%d\n",
+ driver, dev ? &dev->features : &null_features,
+ skb->sk ? &skb->sk->sk_route_caps : &null_features,
+ skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
+ skb_shinfo(skb)->gso_type, skb->ip_summed);
+}
+
+/*
+ * Invalidate hardware checksum when packet is to be mangled, and
+ * complete checksum manually on outgoing path.
+ */
+int skb_checksum_help(struct sk_buff *skb)
+{
+ __wsum csum;
+ int ret = 0, offset;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ goto out_set_summed;
+
+ if (unlikely(skb_shinfo(skb)->gso_size)) {
+ skb_warn_bad_offload(skb);
+ return -EINVAL;
+ }
+
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+
+ offset = skb_checksum_start_offset(skb);
+ BUG_ON(offset >= skb_headlen(skb));
+ csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ offset += skb->csum_offset;
+ BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (ret)
+ goto out;
+ }
+
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+out_set_summed:
+ skb->ip_summed = CHECKSUM_NONE;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(skb_checksum_help);
+
+__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
+{
+ __be16 type = skb->protocol;
+
+ /* Tunnel gso handlers can set protocol to ethernet. */
+ if (type == htons(ETH_P_TEB)) {
+ struct ethhdr *eth;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+ return 0;
+
+ eth = (struct ethhdr *)skb_mac_header(skb);
+ type = eth->h_proto;
+ }
+
+ return __vlan_get_protocol(skb, type, depth);
+}
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL;
+ else
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ skb_warn_bad_offload(skb);
+
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+ if (net_ratelimit()) {
+ pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
+/* Actually, we should eliminate this check as soon as we know, that:
+ * 1. IOMMU is present and allows to map all the memory.
+ * 2. No high memory really exists on this machine.
+ */
+
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_HIGHMEM
+ int i;
+ if (!(dev->features & NETIF_F_HIGHDMA)) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (PageHighMem(skb_frag_page(frag)))
+ return 1;
+ }
+ }
+
+ if (PCI_DMA_BUS_IS_PHYS) {
+ struct device *pdev = dev->dev.parent;
+
+ if (!pdev)
+ return 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ dma_addr_t addr = page_to_phys(skb_frag_page(frag));
+ if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+/* If MPLS offload request, verify we are testing hardware MPLS features
+ * instead of standard features for the netdev.
+ */
+#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ if (eth_p_mpls(type))
+ features &= skb->dev->mpls_features;
+
+ return features;
+}
+#else
+static netdev_features_t net_mpls_features(struct sk_buff *skb,
+ netdev_features_t features,
+ __be16 type)
+{
+ return features;
+}
+#endif
+
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ int tmp;
+ __be16 type;
+
+ type = skb_network_protocol(skb, &tmp);
+ features = net_mpls_features(skb, features, type);
+
+ if (skb->ip_summed != CHECKSUM_NONE &&
+ !can_checksum_protocol(features, type)) {
+ features &= ~NETIF_F_ALL_CSUM;
+ } else if (illegal_highdma(skb->dev, skb)) {
+ features &= ~NETIF_F_SG;
+ }
+
+ return features;
+}
+
+netdev_features_t passthru_features_check(struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return features;
+}
+EXPORT_SYMBOL(passthru_features_check);
+
+static netdev_features_t dflt_features_check(const struct sk_buff *skb,
+ struct net_device *dev,
+ netdev_features_t features)
+{
+ return vlan_features_check(skb, features);
+}
+
+netdev_features_t netif_skb_features(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ netdev_features_t features = dev->features;
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
+
+ if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
+ features &= ~NETIF_F_GSO_MASK;
+
+ /* If encapsulation offload request, verify we are testing
+ * hardware encapsulation features instead of standard
+ * features for the netdev
+ */
+ if (skb->encapsulation)
+ features &= dev->hw_enc_features;
+
+ if (skb_vlan_tagged(skb))
+ features = netdev_intersect_features(features,
+ dev->vlan_features |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
+
+ if (dev->netdev_ops->ndo_features_check)
+ features &= dev->netdev_ops->ndo_features_check(skb, dev,
+ features);
+ else
+ features &= dflt_features_check(skb, dev, features);
+
+ return harmonize_features(skb, features);
+}
+EXPORT_SYMBOL(netif_skb_features);
+
+static int xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq, bool more)
+{
+ unsigned int len;
+ int rc;
+
+ if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more);
+ trace_net_dev_xmit(skb, rc, dev, len);
+
+ return rc;
+}
+
+struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
+ struct netdev_queue *txq, int *ret)
+{
+ struct sk_buff *skb = first;
+ int rc = NETDEV_TX_OK;
+
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ skb->next = NULL;
+ rc = xmit_one(skb, dev, txq, next != NULL);
+ if (unlikely(!dev_xmit_complete(rc))) {
+ skb->next = next;
+ goto out;
+ }
+
+ skb = next;
+ if (netif_xmit_stopped(txq) && skb) {
+ rc = NETDEV_TX_BUSY;
+ break;
+ }
+ }
+
+out:
+ *ret = rc;
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto))
+ skb = __vlan_hwaccel_push_inside(skb);
+ return skb;
+}
+
+static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
+{
+ netdev_features_t features;
+
+ if (skb->next)
+ return skb;
+
+ features = netif_skb_features(skb);
+ skb = validate_xmit_vlan(skb, features);
+ if (unlikely(!skb))
+ goto out_null;
+
+ if (netif_needs_gso(skb, features)) {
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, features);
+ if (IS_ERR(segs)) {
+ goto out_kfree_skb;
+ } else if (segs) {
+ consume_skb(skb);
+ skb = segs;
+ }
+ } else {
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto out_kfree_skb;
+
+ /* If packet is not checksummed and device does not
+ * support checksumming for this protocol, complete
+ * checksumming here.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb->encapsulation)
+ skb_set_inner_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ else
+ skb_set_transport_header(skb,
+ skb_checksum_start_offset(skb));
+ if (!(features & NETIF_F_ALL_CSUM) &&
+ skb_checksum_help(skb))
+ goto out_kfree_skb;
+ }
+ }
+
+ return skb;
+
+out_kfree_skb:
+ kfree_skb(skb);
+out_null:
+ return NULL;
+}
+
+struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sk_buff *next, *head = NULL, *tail;
+
+ for (; skb != NULL; skb = next) {
+ next = skb->next;
+ skb->next = NULL;
+
+ /* in case skb wont be segmented, point to itself */
+ skb->prev = skb;
+
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ continue;
+
+ if (!head)
+ head = skb;
+ else
+ tail->next = skb;
+ /* If skb was segmented, skb->prev points to
+ * the last segment. If not, it still contains skb.
+ */
+ tail = skb->prev;
+ }
+ return head;
+}
+
+static void qdisc_pkt_len_init(struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
+
+ /* To get more precise estimation of bytes sent on wire,
+ * we add to pkt_len the headers size of all segments
+ */
+ if (shinfo->gso_size) {
+ unsigned int hdr_len;
+ u16 gso_segs = shinfo->gso_segs;
+
+ /* mac layer + network layer */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ hdr_len += tcp_hdrlen(skb);
+ else
+ hdr_len += sizeof(struct udphdr);
+
+ if (shinfo->gso_type & SKB_GSO_DODGY)
+ gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+
+ qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
+ }
+}
+
+static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ spinlock_t *root_lock = qdisc_lock(q);
+ bool contended;
+ int rc;
+
+ qdisc_pkt_len_init(skb);
+ qdisc_calculate_pkt_len(skb, q);
+ /*
+ * Heuristic to force contended enqueues to serialize on a
+ * separate lock before trying to get qdisc main lock.
+ * This permits __QDISC___STATE_RUNNING owner to get the lock more
+ * often and dequeue packets faster.
+ */
+ contended = qdisc_is_running(q);
+ if (unlikely(contended))
+ spin_lock(&q->busylock);
+
+ spin_lock(root_lock);
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+ kfree_skb(skb);
+ rc = NET_XMIT_DROP;
+ } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ qdisc_run_begin(q)) {
+ /*
+ * This is a work-conserving queue; there are no old skbs
+ * waiting to be sent out; and the qdisc is not running -
+ * xmit the skb directly.
+ */
+
+ qdisc_bstats_update(q, skb);
+
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ } else
+ qdisc_run_end(q);
+
+ rc = NET_XMIT_SUCCESS;
+ } else {
+ rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+ if (qdisc_run_begin(q)) {
+ if (unlikely(contended)) {
+ spin_unlock(&q->busylock);
+ contended = false;
+ }
+ __qdisc_run(q);
+ }
+ }
+ spin_unlock(root_lock);
+ if (unlikely(contended))
+ spin_unlock(&q->busylock);
+ return rc;
+}
+
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+static void skb_update_prio(struct sk_buff *skb)
+{
+ struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+
+ if (!skb->priority && skb->sk && map) {
+ unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
+ }
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
+#define RECURSION_LIMIT 10
+
+/**
+ * dev_loopback_xmit - loop back @skb
+ * @skb: buffer to transmit
+ */
+int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ __skb_pull(skb, skb_network_offset(skb));
+ skb->pkt_type = PACKET_LOOPBACK;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ WARN_ON(!skb_dst(skb));
+ skb_dst_force(skb);
+ netif_rx_ni(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dev_loopback_xmit);
+
+/**
+ * __dev_queue_xmit - transmit a buffer
+ * @skb: buffer to transmit
+ * @accel_priv: private data used for L2 forwarding offload
+ *
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
+ *
+ * A negative errno code is returned on a failure. A success does not
+ * guarantee the frame will be transmitted as it may be dropped due
+ * to congestion or traffic shaping.
+ *
+ * -----------------------------------------------------------------------------------
+ * I notice this method can also return errors from the queue disciplines,
+ * including NET_XMIT_DROP, which is a positive value. So, errors can also
+ * be positive.
+ *
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send to this method. (You can bump the ref count
+ * before sending to hold a reference for retry if you are careful.)
+ *
+ * When calling this method, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
+ * --BLG
+ */
+static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+{
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+ struct Qdisc *q;
+ int rc = -ENOMEM;
+
+ skb_reset_mac_header(skb);
+
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+
+ /* Disable soft irqs for various locks below. Also
+ * stops preemption for RCU.
+ */
+ rcu_read_lock_bh();
+
+ skb_update_prio(skb);
+
+ /* If device/qdisc don't need skb->dst, release it right now while
+ * its hot in this cpu cache.
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
+ txq = netdev_pick_tx(dev, skb, accel_priv);
+ q = rcu_dereference_bh(txq->qdisc);
+
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+#endif
+ trace_net_dev_queue(skb);
+ if (q->enqueue) {
+ rc = __dev_xmit_skb(skb, q, dev, txq);
+ goto out;
+ }
+
+ /* The device has no queue. Common case for software devices:
+ loopback, all the sorts of tunnels...
+
+ Really, it is unlikely that netif_tx_lock protection is necessary
+ here. (f.e. loopback and IP tunnels are clean ignoring statistics
+ counters.)
+ However, it is possible, that they rely on protection
+ made by us here.
+
+ Check this and shot the lock. It is not prone from deadlocks.
+ Either shot noqueue qdisc, it is even simpler 8)
+ */
+ if (dev->flags & IFF_UP) {
+ int cpu = smp_processor_id(); /* ok because BHs are off */
+
+ if (txq->xmit_lock_owner != cpu) {
+
+ if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+ goto recursion_alert;
+
+ skb = validate_xmit_skb(skb, dev);
+ if (!skb)
+ goto drop;
+
+ HARD_TX_LOCK(dev, txq, cpu);
+
+ if (!netif_xmit_stopped(txq)) {
+ __this_cpu_inc(xmit_recursion);
+ skb = dev_hard_start_xmit(skb, dev, txq, &rc);
+ __this_cpu_dec(xmit_recursion);
+ if (dev_xmit_complete(rc)) {
+ HARD_TX_UNLOCK(dev, txq);
+ goto out;
+ }
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
+ dev->name);
+ } else {
+ /* Recursion is detected! It is possible,
+ * unfortunately
+ */
+recursion_alert:
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ }
+ }
+
+ rc = -ENETDOWN;
+drop:
+ rcu_read_unlock_bh();
+
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb_list(skb);
+ return rc;
+out:
+ rcu_read_unlock_bh();
+ return rc;
+}
+
+int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
+{
+ return __dev_queue_xmit(skb, NULL);
+}
+EXPORT_SYMBOL(dev_queue_xmit_sk);
+
+int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+{
+ return __dev_queue_xmit(skb, accel_priv);
+}
+EXPORT_SYMBOL(dev_queue_xmit_accel);
+
+
+/*=======================================================================
+ Receiver routines
+ =======================================================================*/
+
+int netdev_max_backlog __read_mostly = 1000;
+EXPORT_SYMBOL(netdev_max_backlog);
+
+int netdev_tstamp_prequeue __read_mostly = 1;
+int netdev_budget __read_mostly = 300;
+int weight_p __read_mostly = 64; /* old backlog weight */
+
+/* Called with irq disabled */
+static inline void ____napi_schedule(struct softnet_data *sd,
+ struct napi_struct *napi)
+{
+ list_add_tail(&napi->poll_list, &sd->poll_list);
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
+
+struct static_key rps_needed __read_mostly;
+
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ if (next_cpu < nr_cpu_ids) {
+#ifdef CONFIG_RFS_ACCEL
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ int rc;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+ !(dev->features & NETIF_F_NTUPLE))
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb_get_hash(skb) & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+ rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+ out:
+#endif
+ rflow->last_qtail =
+ per_cpu(softnet_data, next_cpu).input_queue_head;
+ }
+
+ rflow->cpu = next_cpu;
+ return rflow;
+}
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ const struct rps_sock_flow_table *sock_flow_table;
+ struct netdev_rx_queue *rxqueue = dev->_rx;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_map *map;
+ int cpu = -1;
+ u32 tcpu;
+ u32 hash;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+ goto done;
+ }
+ rxqueue += index;
+ }
+
+ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ map = rcu_dereference(rxqueue->rps_map);
+ if (!flow_table && !map)
+ goto done;
+
+ skb_reset_network_header(skb);
+ hash = skb_get_hash(skb);
+ if (!hash)
+ goto done;
+
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ struct rps_dev_flow *rflow;
+ u32 next_cpu;
+ u32 ident;
+
+ /* First check into global flow table if there is a match */
+ ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+ if ((ident ^ hash) & ~rps_cpu_mask)
+ goto try_rps;
+
+ next_cpu = ident & rps_cpu_mask;
+
+ /* OK, now we know there is a match,
+ * we can look at the local (per receive queue) flow table
+ */
+ rflow = &flow_table->flows[hash & flow_table->mask];
+ tcpu = rflow->cpu;
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (>= nr_cpu_ids).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
+ ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = next_cpu;
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ }
+
+ if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+try_rps:
+
+ if (map) {
+ tcpu = map->cpus[reciprocal_scale(hash, map->len)];
+ if (cpu_online(tcpu)) {
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
+done:
+ return cpu;
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ unsigned int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
+/* Called from hardirq (IPI) context */
+static void rps_trigger_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ ____napi_schedule(sd, &sd->backlog);
+ sd->received_rps++;
+}
+
+#endif /* CONFIG_RPS */
+
+/*
+ * Check if this softnet_data structure is another cpu one
+ * If yes, queue it to our IPI list and return 1
+ * If no, return 0
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+
+ if (sd != mysd) {
+ sd->rps_ipi_next = mysd->rps_ipi_list;
+ mysd->rps_ipi_list = sd;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return 1;
+ }
+#endif /* CONFIG_RPS */
+ return 0;
+}
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+#endif
+
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+{
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+ struct softnet_data *sd;
+ unsigned int old_flow, new_flow;
+
+ if (qlen < (netdev_max_backlog >> 1))
+ return false;
+
+ sd = this_cpu_ptr(&softnet_data);
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl) {
+ new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ old_flow = fl->history[fl->history_head];
+ fl->history[fl->history_head] = new_flow;
+
+ fl->history_head++;
+ fl->history_head &= FLOW_LIMIT_HISTORY - 1;
+
+ if (likely(fl->buckets[old_flow]))
+ fl->buckets[old_flow]--;
+
+ if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
+ fl->count++;
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+#endif
+ return false;
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
+{
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int qlen;
+
+ sd = &per_cpu(softnet_data, cpu);
+
+ local_irq_save(flags);
+
+ rps_lock(sd);
+ qlen = skb_queue_len(&sd->input_pkt_queue);
+ if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (qlen) {
+enqueue:
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ input_queue_tail_incr_save(sd, qtail);
+ rps_unlock(sd);
+ local_irq_restore(flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device
+ * We can use non atomic operation since we own the queue lock
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+ if (!rps_ipi_queued(sd))
+ ____napi_schedule(sd, &sd->backlog);
+ }
+ goto enqueue;
+ }
+
+ sd->dropped++;
+ rps_unlock(sd);
+
+ local_irq_restore(flags);
+
+ atomic_long_inc(&skb->dev->rx_dropped);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int netif_rx_internal(struct sk_buff *skb)
+{
+ int ret;
+
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ trace_netif_rx(skb);
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+ if (cpu < 0)
+ cpu = smp_processor_id();
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+ rcu_read_unlock();
+ preempt_enable();
+ } else
+#endif
+ {
+ unsigned int qtail;
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+ put_cpu();
+ }
+ return ret;
+}
+
+/**
+ * netif_rx - post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+ trace_netif_rx_entry(skb);
+
+ return netif_rx_internal(skb);
+}
+EXPORT_SYMBOL(netif_rx);
+
+int netif_rx_ni(struct sk_buff *skb)
+{
+ int err;
+
+ trace_netif_rx_ni_entry(skb);
+
+ preempt_disable();
+ err = netif_rx_internal(skb);
+ if (local_softirq_pending())
+ do_softirq();
+ preempt_enable();
+
+ return err;
+}
+EXPORT_SYMBOL(netif_rx_ni);
+
+static void net_tx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
+ if (sd->completion_queue) {
+ struct sk_buff *clist;
+
+ local_irq_disable();
+ clist = sd->completion_queue;
+ sd->completion_queue = NULL;
+ local_irq_enable();
+
+ while (clist) {
+ struct sk_buff *skb = clist;
+ clist = clist->next;
+
+ WARN_ON(atomic_read(&skb->users));
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+ trace_consume_skb(skb);
+ else
+ trace_kfree_skb(skb, net_tx_action);
+ __kfree_skb(skb);
+ }
+ }
+
+ if (sd->output_queue) {
+ struct Qdisc *head;
+
+ local_irq_disable();
+ head = sd->output_queue;
+ sd->output_queue = NULL;
+ sd->output_queue_tailp = &sd->output_queue;
+ local_irq_enable();
+
+ while (head) {
+ struct Qdisc *q = head;
+ spinlock_t *root_lock;
+
+ head = head->next_sched;
+
+ root_lock = qdisc_lock(q);
+ if (spin_trylock(root_lock)) {
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED,
+ &q->state);
+ qdisc_run(q);
+ spin_unlock(root_lock);
+ } else {
+ if (!test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state)) {
+ __netif_reschedule(q);
+ } else {
+ smp_mb__before_atomic();
+ clear_bit(__QDISC_STATE_SCHED,
+ &q->state);
+ }
+ }
+ }
+ }
+}
+
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+ (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+/* This hook is defined here for ATM LANE */
+int (*br_fdb_test_addr_hook)(struct net_device *dev,
+ unsigned char *addr) __read_mostly;
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
+#endif
+
+#ifdef CONFIG_NET_CLS_ACT
+/* TODO: Maybe we should just force sch_ingress to be compiled in
+ * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
+ * a compare and 2 stores extra right now if we dont have it on
+ * but have CONFIG_NET_CLS_ACT
+ * NOTE: This doesn't stop any functionality; if you dont have
+ * the ingress scheduler, you just can't add policies on ingress.
+ *
+ */
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
+{
+ struct net_device *dev = skb->dev;
+ u32 ttl = G_TC_RTTL(skb->tc_verd);
+ int result = TC_ACT_OK;
+ struct Qdisc *q;
+
+ if (unlikely(MAX_RED_LOOP < ttl++)) {
+ net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
+ skb->skb_iif, dev->ifindex);
+ return TC_ACT_SHOT;
+ }
+
+ skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+
+ q = rcu_dereference(rxq->qdisc);
+ if (q != &noop_qdisc) {
+ spin_lock(qdisc_lock(q));
+ if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
+ result = qdisc_enqueue_root(skb, q);
+ spin_unlock(qdisc_lock(q));
+ }
+
+ return result;
+}
+
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+ struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+ if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
+ return skb;
+
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ switch (ing_filter(skb, rxq)) {
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ return skb;
+}
+#endif
+
+/**
+ * netdev_rx_handler_register - register receive handler
+ * @dev: device to register a handler for
+ * @rx_handler: receive handler to register
+ * @rx_handler_data: data pointer that is used by rx handler
+ *
+ * Register a receive handler for a device. This handler will then be
+ * called from __netif_receive_skb. A negative errno code is returned
+ * on a failure.
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+ rx_handler_func_t *rx_handler,
+ void *rx_handler_data)
+{
+ ASSERT_RTNL();
+
+ if (dev->rx_handler)
+ return -EBUSY;
+
+ /* Note: rx_handler_data must be set before rx_handler */
+ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+ rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ * netdev_rx_handler_unregister - unregister receive handler
+ * @dev: device to unregister a handler from
+ *
+ * Unregister a receive handler from a device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+ ASSERT_RTNL();
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
+ * section has a guarantee to see a non NULL rx_handler_data
+ * as well.
+ */
+ synchronize_net();
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct packet_type *ptype, *pt_prev;
+ rx_handler_func_t *rx_handler;
+ struct net_device *orig_dev;
+ bool deliver_exact = false;
+ int ret = NET_RX_DROP;
+ __be16 type;
+
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
+
+ trace_netif_receive_skb(skb);
+
+ orig_dev = skb->dev;
+
+ skb_reset_network_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ pt_prev = NULL;
+
+ rcu_read_lock();
+
+another_round:
+ skb->skb_iif = skb->dev->ifindex;
+
+ __this_cpu_inc(softnet_data.processed);
+
+ if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto unlock;
+ }
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (skb->tc_verd & TC_NCLS) {
+ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
+ goto ncls;
+ }
+#endif
+
+ if (pfmemalloc)
+ goto skip_taps;
+
+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+skip_taps:
+#ifdef CONFIG_NET_CLS_ACT
+ if (static_key_false(&ingress_needed)) {
+ skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ if (!skb)
+ goto unlock;
+ }
+
+ skb->tc_verd = 0;
+ncls:
+#endif
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ goto drop;
+
+ if (skb_vlan_tag_present(skb)) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_do_receive(&skb))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto unlock;
+ }
+
+ rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (rx_handler) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ switch (rx_handler(&skb)) {
+ case RX_HANDLER_CONSUMED:
+ ret = NET_RX_SUCCESS;
+ goto unlock;
+ case RX_HANDLER_ANOTHER:
+ goto another_round;
+ case RX_HANDLER_EXACT:
+ deliver_exact = true;
+ case RX_HANDLER_PASS:
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ if (skb_vlan_tag_get_id(skb))
+ skb->pkt_type = PACKET_OTHERHOST;
+ /* Note: we might in the future use prio bits
+ * and set skb->priority like in vlan_do_receive()
+ * For the time being, just ignore Priority Code Point
+ */
+ skb->vlan_tci = 0;
+ }
+
+ type = skb->protocol;
+
+ /* deliver only exact match when indicated */
+ if (likely(!deliver_exact)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &ptype_base[ntohs(type) &
+ PTYPE_HASH_MASK]);
+ }
+
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &orig_dev->ptype_specific);
+
+ if (unlikely(skb->dev != orig_dev)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &skb->dev->ptype_specific);
+ }
+
+ if (pt_prev) {
+ if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ goto drop;
+ else
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ } else {
+drop:
+ atomic_long_inc(&skb->dev->rx_dropped);
+ kfree_skb(skb);
+ /* Jamal, now you will not able to escape explaining
+ * me how you were going to use this. :-)
+ */
+ ret = NET_RX_DROP;
+ }
+
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
+
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned long pflags = current->flags;
+
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ current->flags |= PF_MEMALLOC;
+ ret = __netif_receive_skb_core(skb, true);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ } else
+ ret = __netif_receive_skb_core(skb, false);
+
+ return ret;
+}
+
+static int netif_receive_skb_internal(struct sk_buff *skb)
+{
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+
+ if (skb_defer_rx_timestamp(skb))
+ return NET_RX_SUCCESS;
+
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu, ret;
+
+ rcu_read_lock();
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+ }
+#endif
+ return __netif_receive_skb(skb);
+}
+
+/**
+ * netif_receive_skb - process receive buffer from network
+ * @skb: buffer to process
+ *
+ * netif_receive_skb() is the main receive data processing function.
+ * It always succeeds. The buffer may be dropped during processing
+ * for congestion control or by the protocol layers.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ *
+ * Return values (usually ignored):
+ * NET_RX_SUCCESS: no congestion
+ * NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
+{
+ trace_netif_receive_skb_entry(skb);
+
+ return netif_receive_skb_internal(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_sk);
+
+/* Network device is going away, flush any packets still pending
+ * Called with irqs disabled.
+ */
+static void flush_backlog(void *arg)
+{
+ struct net_device *dev = arg;
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ struct sk_buff *skb, *tmp;
+
+ rps_lock(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ if (skb->dev == dev) {
+ __skb_unlink(skb, &sd->input_pkt_queue);
+ kfree_skb(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+ rps_unlock(sd);
+
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ if (skb->dev == dev) {
+ __skb_unlink(skb, &sd->process_queue);
+ kfree_skb(skb);
+ input_queue_head_incr(sd);
+ }
+ }
+}
+
+static int napi_gro_complete(struct sk_buff *skb)
+{
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = ptype->callbacks.gro_complete(skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+out:
+ return netif_receive_skb_internal(skb);
+}
+
+/* napi->gro_list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ struct sk_buff *skb, *prev = NULL;
+
+ /* scan list and build reverse chain */
+ for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
+ skb->prev = prev;
+ prev = skb;
+ }
+
+ for (skb = prev; skb; skb = prev) {
+ skb->next = NULL;
+
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+
+ prev = skb->prev;
+ napi_gro_complete(skb);
+ napi->gro_count--;
+ }
+
+ napi->gro_list = NULL;
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct sk_buff *p;
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+
+ for (p = napi->gro_list; p; p = p->next) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static void skb_gro_reset_offset(struct sk_buff *skb)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
+ pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ pinfo->frags[0].page_offset += grow;
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int same_flow;
+ enum gro_result ret;
+ int grow;
+
+ if (!(skb->dev->features & NETIF_F_GRO))
+ goto normal;
+
+ if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
+ goto normal;
+
+ gro_list_prepare(napi, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->free = 0;
+ NAPI_GRO_CB(skb)->udp_mark = 0;
+ NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ break;
+ default:
+ NAPI_GRO_CB(skb)->csum_cnt = 0;
+ NAPI_GRO_CB(skb)->csum_valid = 0;
+ }
+
+ pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (&ptype->list == head)
+ goto normal;
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ struct sk_buff *nskb = *pp;
+
+ *pp = nskb->next;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ napi->gro_count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
+ struct sk_buff *nskb = napi->gro_list;
+
+ /* locate the end of the list to select the 'oldest' flow */
+ while (nskb->next) {
+ pp = &nskb->next;
+ nskb = *pp;
+ }
+ *pp = NULL;
+ nskb->next = NULL;
+ napi_gro_complete(nskb);
+ } else {
+ napi->gro_count++;
+ }
+ NAPI_GRO_CB(skb)->count = 1;
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ skb->next = napi->gro_list;
+ napi->gro_list = skb;
+ ret = GRO_HELD;
+
+pull:
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+ok:
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ goto pull;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ if (netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ kfree_skb(skb);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ __kfree_skb(skb);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb);
+
+ return napi_skb_finish(dev_gro_receive(napi, skb), skb);
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ skb->vlan_tci = 0;
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ napi->skb = skb;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
+ ret = GRO_DROP;
+ break;
+
+ case GRO_DROP:
+ case GRO_MERGED_FREE:
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header_fast(skb, 0);
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ if (!skb)
+ return GRO_DROP;
+
+ trace_napi_gro_frags_entry(skb);
+
+ return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+/*
+ * net_rps_action_and_irq_enable sends any pending IPI's for rps.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ struct softnet_data *remsd = sd->rps_ipi_list;
+
+ if (remsd) {
+ sd->rps_ipi_list = NULL;
+
+ local_irq_enable();
+
+ /* Send pending IPI's to kick RPS processing on remote cpus. */
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu))
+ smp_call_function_single_async(remsd->cpu,
+ &remsd->csd);
+ remsd = next;
+ }
+ } else
+#endif
+ local_irq_enable();
+}
+
+static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+ return sd->rps_ipi_list != NULL;
+#else
+ return false;
+#endif
+}
+
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+ int work = 0;
+ struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+
+ /* Check if we have pending ipi, its better to send them now,
+ * not waiting net_rx_action() end.
+ */
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+
+ napi->weight = weight_p;
+ local_irq_disable();
+ while (1) {
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_irq_enable();
+ __netif_receive_skb(skb);
+ local_irq_disable();
+ input_queue_head_incr(sd);
+ if (++work >= quota) {
+ local_irq_enable();
+ return work;
+ }
+ }
+
+ rps_lock(sd);
+ if (skb_queue_empty(&sd->input_pkt_queue)) {
+ /*
+ * Inline a custom version of __napi_complete().
+ * only current cpu owns and manipulates this napi,
+ * and NAPI_STATE_SCHED is the only possible flag set
+ * on backlog.
+ * We can use a plain write instead of clear_bit(),
+ * and we dont need an smp_mb() memory barrier.
+ */
+ napi->state = 0;
+ rps_unlock(sd);
+
+ break;
+ }
+
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
+ rps_unlock(sd);
+ }
+ local_irq_enable();
+
+ return work;
+}
+
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * The entry's receive function will be scheduled to run.
+ * Consider using __napi_schedule_irqoff() if hard irqs are masked.
+ */
+void __napi_schedule(struct napi_struct *n)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
+
+/**
+ * __napi_schedule_irqoff - schedule for receive
+ * @n: entry to schedule
+ *
+ * Variant of __napi_schedule() assuming hard irqs are masked
+ */
+void __napi_schedule_irqoff(struct napi_struct *n)
+{
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+}
+EXPORT_SYMBOL(__napi_schedule_irqoff);
+
+void __napi_complete(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+
+ list_del_init(&n->poll_list);
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+}
+EXPORT_SYMBOL(__napi_complete);
+
+void napi_complete_done(struct napi_struct *n, int work_done)
+{
+ unsigned long flags;
+
+ /*
+ * don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu
+ */
+ if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+ return;
+
+ if (n->gro_list) {
+ unsigned long timeout = 0;
+
+ if (work_done)
+ timeout = n->dev->gro_flush_timeout;
+
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ else
+ napi_gro_flush(n, false);
+ }
+ if (likely(list_empty(&n->poll_list))) {
+ WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+ } else {
+ /* If n->poll_list is not empty, we need to mask irqs */
+ local_irq_save(flags);
+ __napi_complete(n);
+ local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(napi_complete_done);
+
+/* must be called under rcu_read_lock(), as we dont take a reference */
+struct napi_struct *napi_by_id(unsigned int napi_id)
+{
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
+
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(napi_by_id);
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0 is not a valid id, we also skip an id that is taken
+ * we expect both events to be extremely rare
+ */
+ napi->napi_id = 0;
+ while (!napi->napi_id) {
+ napi->napi_id = ++napi_gen_id;
+ if (napi_by_id(napi->napi_id))
+ napi->napi_id = 0;
+ }
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(napi_hash_add);
+
+/* Warning : caller is responsible to make sure rcu grace period
+ * is respected before freeing memory containing @napi
+ */
+void napi_hash_del(struct napi_struct *napi)
+{
+ spin_lock(&napi_hash_lock);
+
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ hlist_del_rcu(&napi->napi_hash_node);
+
+ spin_unlock(&napi_hash_lock);
+}
+EXPORT_SYMBOL_GPL(napi_hash_del);
+
+static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+
+ napi = container_of(timer, struct napi_struct, timer);
+ if (napi->gro_list)
+ napi_schedule(napi);
+
+ return HRTIMER_NORESTART;
+}
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
+{
+ INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
+ napi->gro_count = 0;
+ napi->gro_list = NULL;
+ napi->skb = NULL;
+ napi->poll = poll;
+ if (weight > NAPI_POLL_WEIGHT)
+ pr_err_once("netif_napi_add() called with weight %d on device %s\n",
+ weight, dev->name);
+ napi->weight = weight;
+ list_add(&napi->dev_list, &dev->napi_list);
+ napi->dev = dev;
+#ifdef CONFIG_NETPOLL
+ spin_lock_init(&napi->poll_lock);
+ napi->poll_owner = -1;
+#endif
+ set_bit(NAPI_STATE_SCHED, &napi->state);
+}
+EXPORT_SYMBOL(netif_napi_add);
+
+void napi_disable(struct napi_struct *n)
+{
+ might_sleep();
+ set_bit(NAPI_STATE_DISABLE, &n->state);
+
+ while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
+ msleep(1);
+
+ hrtimer_cancel(&n->timer);
+
+ clear_bit(NAPI_STATE_DISABLE, &n->state);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void netif_napi_del(struct napi_struct *napi)
+{
+ list_del_init(&napi->dev_list);
+ napi_free_frags(napi);
+
+ kfree_skb_list(napi->gro_list);
+ napi->gro_list = NULL;
+ napi->gro_count = 0;
+}
+EXPORT_SYMBOL(netif_napi_del);
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ void *have;
+ int work, weight;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ weight = n->weight;
+
+ /* This NAPI_STATE_SCHED test is for avoiding a race
+ * with netpoll's poll_napi(). Only the entity which
+ * obtains the lock and sees NAPI_STATE_SCHED set will
+ * actually make the ->poll() call. Therefore we avoid
+ * accidentally calling ->poll() when NAPI is not scheduled.
+ */
+ work = 0;
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
+
+ WARN_ON_ONCE(work > weight);
+
+ if (likely(work < weight))
+ goto out_unlock;
+
+ /* Drivers must not modify the NAPI state if they
+ * consume the entire weight. In such cases this code
+ * still "owns" the NAPI instance and therefore can
+ * move the instance around on the list at-will.
+ */
+ if (unlikely(napi_disable_pending(n))) {
+ napi_complete(n);
+ goto out_unlock;
+ }
+
+ if (n->gro_list) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(n, HZ >= 1000);
+ }
+
+ /* Some drivers may have called napi_schedule
+ * prior to exhausting their budget.
+ */
+ if (unlikely(!list_empty(&n->poll_list))) {
+ pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
+ n->dev ? n->dev->name : "backlog");
+ goto out_unlock;
+ }
+
+ list_add_tail(&n->poll_list, repoll);
+
+out_unlock:
+ netpoll_poll_unlock(have);
+
+ return work;
+}
+
+static void net_rx_action(struct softirq_action *h)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ unsigned long time_limit = jiffies + 2;
+ int budget = netdev_budget;
+ LIST_HEAD(list);
+ LIST_HEAD(repoll);
+
+ local_irq_disable();
+ list_splice_init(&sd->poll_list, &list);
+ local_irq_enable();
+
+ for (;;) {
+ struct napi_struct *n;
+
+ if (list_empty(&list)) {
+ if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
+ return;
+ break;
+ }
+
+ n = list_first_entry(&list, struct napi_struct, poll_list);
+ budget -= napi_poll(n, &repoll);
+
+ /* If softirq window is exhausted then punt.
+ * Allow this to run for 2 jiffies since which will allow
+ * an average latency of 1.5/HZ.
+ */
+ if (unlikely(budget <= 0 ||
+ time_after_eq(jiffies, time_limit))) {
+ sd->time_squeeze++;
+ break;
+ }
+ }
+
+ local_irq_disable();
+
+ list_splice_tail_init(&sd->poll_list, &list);
+ list_splice_tail(&repoll, &list);
+ list_splice(&list, &sd->poll_list);
+ if (!list_empty(&sd->poll_list))
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+
+ net_rps_action_and_irq_enable(sd);
+}
+
+struct netdev_adjacent {
+ struct net_device *dev;
+
+ /* upper master flag, there can only be one master device per list */
+ bool master;
+
+ /* counter for the number of times this device was added to us */
+ u16 ref_nr;
+
+ /* private field for the users */
+ void *private;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ list_for_each_entry(adj, adj_list, list) {
+ if (adj->dev == adj_dev)
+ return adj;
+ }
+ return NULL;
+}
+
+/**
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks only immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
+ */
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ ASSERT_RTNL();
+
+ return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev);
+
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_upper_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->all_adj_list.upper);
+}
+
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
+ */
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
+
+void *netdev_adjacent_get_private(struct list_head *adj_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = list_entry(adj_list, struct netdev_adjacent, list);
+
+ return adj->private;
+}
+EXPORT_SYMBOL(netdev_adjacent_get_private);
+
+/**
+ * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+
+/**
+ * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next device from the dev's upper list, starting from iter
+ * position. The caller must hold RCU read lock.
+ */
+struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *upper;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+ upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->all_adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+
+ return upper->dev;
+}
+EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+/**
+ * netdev_lower_get_next_private - Get the next ->private from the
+ * lower neighbour list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold either hold the
+ * RTNL lock or its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next_private(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry(*iter, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = lower->list.next;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private);
+
+/**
+ * netdev_lower_get_next_private_rcu - Get the next ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent->private from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_next_private_rcu(struct net_device *dev,
+ struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
+
+/**
+ * netdev_lower_get_next - Get the next device from the lower neighbour
+ * list
+ * @dev: device
+ * @iter: list_head ** of the current position
+ *
+ * Gets the next netdev_adjacent from the dev's lower neighbour
+ * list, starting from iter position. The caller must hold RTNL lock or
+ * its own locking that guarantees that the neighbour lower
+ * list will remain unchainged.
+ */
+void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+EXPORT_SYMBOL(netdev_lower_get_next);
+
+/**
+ * netdev_lower_get_first_private_rcu - Get the first ->private from the
+ * lower neighbour list, RCU
+ * variant
+ * @dev: device
+ *
+ * Gets the first netdev_adjacent->private from the dev's lower neighbour
+ * list. The caller must hold RCU read lock.
+ */
+void *netdev_lower_get_first_private_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_first_or_null_rcu(&dev->adj_list.lower,
+ struct netdev_adjacent, list);
+ if (lower)
+ return lower->private;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
+
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_first_or_null_rcu(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
+
+static int netdev_adjacent_sysfs_add(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", adj_dev->name);
+ return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
+ linkname);
+}
+static void netdev_adjacent_sysfs_del(struct net_device *dev,
+ char *name,
+ struct list_head *dev_list)
+{
+ char linkname[IFNAMSIZ+7];
+ sprintf(linkname, dev_list == &dev->adj_list.upper ?
+ "upper_%s" : "lower_%s", name);
+ sysfs_remove_link(&(dev->dev.kobj), linkname);
+}
+
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ return (dev_list == &dev->adj_list.upper ||
+ dev_list == &dev->adj_list.lower) &&
+ net_eq(dev_net(dev), dev_net(adj_dev));
+}
+
+static int __netdev_adjacent_dev_insert(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list,
+ void *private, bool master)
+{
+ struct netdev_adjacent *adj;
+ int ret;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (adj) {
+ adj->ref_nr++;
+ return 0;
+ }
+
+ adj = kmalloc(sizeof(*adj), GFP_KERNEL);
+ if (!adj)
+ return -ENOMEM;
+
+ adj->dev = adj_dev;
+ adj->master = master;
+ adj->ref_nr = 1;
+ adj->private = private;
+ dev_hold(adj_dev);
+
+ pr_debug("dev_hold for %s, because of link added from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
+ ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
+ if (ret)
+ goto free_adj;
+ }
+
+ /* Ensure that master link is always the first item in list. */
+ if (master) {
+ ret = sysfs_create_link(&(dev->dev.kobj),
+ &(adj_dev->dev.kobj), "master");
+ if (ret)
+ goto remove_symlinks;
+
+ list_add_rcu(&adj->list, dev_list);
+ } else {
+ list_add_tail_rcu(&adj->list, dev_list);
+ }
+
+ return 0;
+
+remove_symlinks:
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+free_adj:
+ kfree(adj);
+ dev_put(adj_dev);
+
+ return ret;
+}
+
+static void __netdev_adjacent_dev_remove(struct net_device *dev,
+ struct net_device *adj_dev,
+ struct list_head *dev_list)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(dev, adj_dev, dev_list);
+
+ if (!adj) {
+ pr_err("tried to remove device %s from %s\n",
+ dev->name, adj_dev->name);
+ BUG();
+ }
+
+ if (adj->ref_nr > 1) {
+ pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
+ adj->ref_nr-1);
+ adj->ref_nr--;
+ return;
+ }
+
+ if (adj->master)
+ sysfs_remove_link(&(dev->dev.kobj), "master");
+
+ if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
+ netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
+
+ list_del_rcu(&adj->list);
+ pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ adj_dev->name, dev->name, adj_dev->name);
+ dev_put(adj_dev);
+ kfree_rcu(adj, rcu);
+}
+
+static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list,
+ void *private, bool master)
+{
+ int ret;
+
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
+ master);
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
+ false);
+ if (ret) {
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __netdev_adjacent_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower,
+ NULL, false);
+}
+
+static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct list_head *up_list,
+ struct list_head *down_list)
+{
+ __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
+}
+
+static void __netdev_adjacent_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->all_adj_list.upper,
+ &upper_dev->all_adj_list.lower);
+}
+
+static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private, bool master)
+{
+ int ret = __netdev_adjacent_dev_link(dev, upper_dev);
+
+ if (ret)
+ return ret;
+
+ ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
+ if (ret) {
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ __netdev_adjacent_dev_unlink(dev, upper_dev);
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower);
+}
+
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master,
+ void *private)
+{
+ struct netdev_adjacent *i, *j, *to_i, *to_j;
+ int ret = 0;
+
+ ASSERT_RTNL();
+
+ if (dev == upper_dev)
+ return -EBUSY;
+
+ /* To prevent loops, check if dev is not upper device to upper_dev. */
+ if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
+ return -EBUSY;
+
+ if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
+ return -EEXIST;
+
+ if (master && netdev_master_upper_dev_get(dev))
+ return -EBUSY;
+
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ master);
+ if (ret)
+ return ret;
+
+ /* Now that we linked these devs, make all the upper_dev's
+ * all_adj_list.upper visible to every dev's all_adj_list.lower an
+ * versa, and don't forget the devices itself. All of these
+ * links are non-neighbours.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("Interlinking %s with %s, non-neighbour\n",
+ i->dev->name, j->dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, j->dev);
+ if (ret)
+ goto rollback_mesh;
+ }
+ }
+
+ /* add dev to every upper_dev's upper device */
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ pr_debug("linking %s's upper device %s with %s\n",
+ upper_dev->name, i->dev->name, dev->name);
+ ret = __netdev_adjacent_dev_link(dev, i->dev);
+ if (ret)
+ goto rollback_upper_mesh;
+ }
+
+ /* add upper_dev to every dev's lower device */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ pr_debug("linking %s's lower device %s with %s\n", dev->name,
+ i->dev->name, upper_dev->name);
+ ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
+ if (ret)
+ goto rollback_lower_mesh;
+ }
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+ return 0;
+
+rollback_lower_mesh:
+ to_i = i;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+ }
+
+ i = NULL;
+
+rollback_upper_mesh:
+ to_i = i;
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i)
+ break;
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+ }
+
+ i = j = NULL;
+
+rollback_mesh:
+ to_i = i;
+ to_j = j;
+ list_for_each_entry(i, &dev->all_adj_list.lower, list) {
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
+ if (i == to_i && j == to_j)
+ break;
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+ }
+ if (i == to_i)
+ break;
+ }
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ return ret;
+}
+
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
+}
+EXPORT_SYMBOL(netdev_upper_dev_link);
+
+/**
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
+ */
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
+
+int netdev_master_upper_dev_link_private(struct net_device *dev,
+ struct net_device *upper_dev,
+ void *private)
+{
+ return __netdev_upper_dev_link(dev, upper_dev, true, private);
+}
+EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_adjacent *i, *j;
+ ASSERT_RTNL();
+
+ __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
+
+ /* Here is the tricky part. We must remove all dev's lower
+ * devices from all upper_dev's upper devices and vice
+ * versa, to maintain the graph relationship.
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(i->dev, j->dev);
+
+ /* remove also the devices itself from lower/upper device
+ * list
+ */
+ list_for_each_entry(i, &dev->all_adj_list.lower, list)
+ __netdev_adjacent_dev_unlink(i->dev, upper_dev);
+
+ list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
+ __netdev_adjacent_dev_unlink(dev, i->dev);
+
+ call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+}
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
+
+/**
+ * netdev_bonding_info_change - Dispatch event about slave change
+ * @dev: device
+ * @bonding_info: info to dispatch
+ *
+ * Send NETDEV_BONDING_INFO to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_bonding_info_change(struct net_device *dev,
+ struct netdev_bonding_info *bonding_info)
+{
+ struct netdev_notifier_bonding_info info;
+
+ memcpy(&info.bonding_info, bonding_info,
+ sizeof(struct netdev_bonding_info));
+ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
+ &info.info);
+}
+EXPORT_SYMBOL(netdev_bonding_info_change);
+
+static void netdev_adjacent_add_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.lower);
+ }
+}
+
+static void netdev_adjacent_del_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.lower);
+ }
+}
+
+void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, oldname,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ }
+}
+
+void *netdev_lower_dev_get_private(struct net_device *dev,
+ struct net_device *lower_dev)
+{
+ struct netdev_adjacent *lower;
+
+ if (!lower_dev)
+ return NULL;
+ lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
+ if (!lower)
+ return NULL;
+
+ return lower->private;
+}
+EXPORT_SYMBOL(netdev_lower_dev_get_private);
+
+
+int dev_get_nest_level(struct net_device *dev,
+ bool (*type_check)(struct net_device *dev))
+{
+ struct net_device *lower = NULL;
+ struct list_head *iter;
+ int max_nest = -1;
+ int nest;
+
+ ASSERT_RTNL();
+
+ netdev_for_each_lower_dev(dev, lower, iter) {
+ nest = dev_get_nest_level(lower, type_check);
+ if (max_nest < nest)
+ max_nest = nest;
+ }
+
+ if (type_check(dev))
+ max_nest++;
+
+ return max_nest;
+}
+EXPORT_SYMBOL(dev_get_nest_level);
+
+static void dev_change_rx_flags(struct net_device *dev, int flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_rx_flags)
+ ops->ndo_change_rx_flags(dev, flags);
+}
+
+static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags;
+ kuid_t uid;
+ kgid_t gid;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_PROMISC;
+ dev->promiscuity += inc;
+ if (dev->promiscuity == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch promisc and return error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_PROMISC;
+ else {
+ dev->promiscuity -= inc;
+ pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags != old_flags) {
+ pr_info("device %s %s promiscuous mode\n",
+ dev->name,
+ dev->flags & IFF_PROMISC ? "entered" : "left");
+ if (audit_enabled) {
+ current_uid_gid(&uid, &gid);
+ audit_log(current->audit_context, GFP_ATOMIC,
+ AUDIT_ANOM_PROMISCUOUS,
+ "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+ dev->name, (dev->flags & IFF_PROMISC),
+ (old_flags & IFF_PROMISC),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ audit_get_sessionid(current));
+ }
+
+ dev_change_rx_flags(dev, IFF_PROMISC);
+ }
+ if (notify)
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC);
+ return 0;
+}
+
+/**
+ * dev_set_promiscuity - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ * Return 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ unsigned int old_flags = dev->flags;
+ int err;
+
+ err = __dev_set_promiscuity(dev, inc, true);
+ if (err < 0)
+ return err;
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+{
+ unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ASSERT_RTNL();
+
+ dev->flags |= IFF_ALLMULTI;
+ dev->allmulti += inc;
+ if (dev->allmulti == 0) {
+ /*
+ * Avoid overflow.
+ * If inc causes overflow, untouch allmulti and return error.
+ */
+ if (inc < 0)
+ dev->flags &= ~IFF_ALLMULTI;
+ else {
+ dev->allmulti -= inc;
+ pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
+ dev->name);
+ return -EOVERFLOW;
+ }
+ }
+ if (dev->flags ^ old_flags) {
+ dev_change_rx_flags(dev, IFF_ALLMULTI);
+ dev_set_rx_mode(dev);
+ if (notify)
+ __dev_notify_flags(dev, old_flags,
+ dev->gflags ^ old_gflags);
+ }
+ return 0;
+}
+
+/**
+ * dev_set_allmulti - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all interfaces. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ * Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ return __dev_set_allmulti(dev, inc, true);
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/*
+ * Upload unicast and multicast address lists to device and
+ * configure RX filtering. When the device doesn't support unicast
+ * filtering it is put in promiscuous mode while unicast addresses
+ * are present.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ /* dev_open will call this function so the list will stay sane. */
+ if (!(dev->flags&IFF_UP))
+ return;
+
+ if (!netif_device_present(dev))
+ return;
+
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* Unicast addresses changes may only happen under the rtnl,
+ * therefore calling __dev_set_promiscuity here is safe.
+ */
+ if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
+ __dev_set_promiscuity(dev, 1, false);
+ dev->uc_promisc = true;
+ } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
+ __dev_set_promiscuity(dev, -1, false);
+ dev->uc_promisc = false;
+ }
+ }
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
+}
+
+void dev_set_rx_mode(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+}
+
+/**
+ * dev_get_flags - get flags reported to userspace
+ * @dev: device
+ *
+ * Get the combination of flag bits exported through APIs to userspace.
+ */
+unsigned int dev_get_flags(const struct net_device *dev)
+{
+ unsigned int flags;
+
+ flags = (dev->flags & ~(IFF_PROMISC |
+ IFF_ALLMULTI |
+ IFF_RUNNING |
+ IFF_LOWER_UP |
+ IFF_DORMANT)) |
+ (dev->gflags & (IFF_PROMISC |
+ IFF_ALLMULTI));
+
+ if (netif_running(dev)) {
+ if (netif_oper_up(dev))
+ flags |= IFF_RUNNING;
+ if (netif_carrier_ok(dev))
+ flags |= IFF_LOWER_UP;
+ if (netif_dormant(dev))
+ flags |= IFF_DORMANT;
+ }
+
+ return flags;
+}
+EXPORT_SYMBOL(dev_get_flags);
+
+int __dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+ unsigned int old_flags = dev->flags;
+ int ret;
+
+ ASSERT_RTNL();
+
+ /*
+ * Set the flags on our device.
+ */
+
+ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
+ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
+ IFF_AUTOMEDIA)) |
+ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
+ IFF_ALLMULTI));
+
+ /*
+ * Load in the correct multicast list now the flags have changed.
+ */
+
+ if ((old_flags ^ flags) & IFF_MULTICAST)
+ dev_change_rx_flags(dev, IFF_MULTICAST);
+
+ dev_set_rx_mode(dev);
+
+ /*
+ * Have we downed the interface. We handle IFF_UP ourselves
+ * according to user attempts to set it, rather than blindly
+ * setting it.
+ */
+
+ ret = 0;
+ if ((old_flags ^ flags) & IFF_UP)
+ ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
+
+ if ((flags ^ dev->gflags) & IFF_PROMISC) {
+ int inc = (flags & IFF_PROMISC) ? 1 : -1;
+ unsigned int old_flags = dev->flags;
+
+ dev->gflags ^= IFF_PROMISC;
+
+ if (__dev_set_promiscuity(dev, inc, false) >= 0)
+ if (dev->flags != old_flags)
+ dev_set_rx_mode(dev);
+ }
+
+ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
+ is important. Some (broken) drivers set IFF_PROMISC, when
+ IFF_ALLMULTI is requested not asking us and not reporting.
+ */
+ if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
+ int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
+
+ dev->gflags ^= IFF_ALLMULTI;
+ __dev_set_allmulti(dev, inc, false);
+ }
+
+ return ret;
+}
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges)
+{
+ unsigned int changes = dev->flags ^ old_flags;
+
+ if (gchanges)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+
+ if (changes & IFF_UP) {
+ if (dev->flags & IFF_UP)
+ call_netdevice_notifiers(NETDEV_UP, dev);
+ else
+ call_netdevice_notifiers(NETDEV_DOWN, dev);
+ }
+
+ if (dev->flags & IFF_UP &&
+ (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
+ struct netdev_notifier_change_info change_info;
+
+ change_info.flags_changed = changes;
+ call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+ &change_info.info);
+ }
+}
+
+/**
+ * dev_change_flags - change device settings
+ * @dev: device
+ * @flags: device state flags
+ *
+ * Change settings on device based state flags. The flags are
+ * in the userspace exported format.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+ int ret;
+ unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
+
+ ret = __dev_change_flags(dev, flags);
+ if (ret < 0)
+ return ret;
+
+ changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
+ __dev_notify_flags(dev, old_flags, changes);
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+static int __dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_change_mtu)
+ return ops->ndo_change_mtu(dev, new_mtu);
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+/**
+ * dev_set_mtu - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ *
+ * Change the maximum transfer size of the network device.
+ */
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int err, orig_mtu;
+
+ if (new_mtu == dev->mtu)
+ return 0;
+
+ /* MTU must be positive. */
+ if (new_mtu < 0)
+ return -EINVAL;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ orig_mtu = dev->mtu;
+ err = __dev_set_mtu(dev, new_mtu);
+
+ if (!err) {
+ err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = notifier_to_errno(err);
+ if (err) {
+ /* setting mtu back and notifying everyone again,
+ * so that they have a chance to revert changes.
+ */
+ __dev_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ * dev_set_group - Change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
+ * dev_set_mac_address - Change Media Access Control Address
+ * @dev: device
+ * @sa: new address
+ *
+ * Change the hardware (MAC) address of the device
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ if (!ops->ndo_set_mac_address)
+ return -EOPNOTSUPP;
+ if (sa->sa_family != dev->type)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ err = ops->ndo_set_mac_address(dev, sa);
+ if (err)
+ return err;
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+ return 0;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+/**
+ * dev_change_carrier - Change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
+}
+EXPORT_SYMBOL(dev_change_carrier);
+
+/**
+ * dev_get_phys_port_id - Get device physical port ID
+ * @dev: device
+ * @ppid: port ID
+ *
+ * Get device physical port ID
+ */
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_id(dev, ppid);
+}
+EXPORT_SYMBOL(dev_get_phys_port_id);
+
+/**
+ * dev_get_phys_port_name - Get device physical port name
+ * @dev: device
+ * @name: port name
+ *
+ * Get device physical port name
+ */
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_phys_port_name)
+ return -EOPNOTSUPP;
+ return ops->ndo_get_phys_port_name(dev, name, len);
+}
+EXPORT_SYMBOL(dev_get_phys_port_name);
+
+/**
+ * dev_new_index - allocate an ifindex
+ * @net: the applicable net namespace
+ *
+ * Returns a suitable unique value for a new device interface
+ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+ int ifindex = net->ifindex;
+ for (;;) {
+ if (++ifindex <= 0)
+ ifindex = 1;
+ if (!__dev_get_by_index(net, ifindex))
+ return net->ifindex = ifindex;
+ }
+}
+
+/* Delayed registration/unregisteration */
+static LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
+ dev_net(dev)->dev_unreg_count++;
+}
+
+static void rollback_registered_many(struct list_head *head)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
+
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+
+ dev->reg_state = NETREG_UNREGISTERING;
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+
+ /* Notify protocols, that we are about to destroy
+ this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
+ GFP_KERNEL);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list)
+ dev_put(dev);
+}
+
+static void rollback_registered(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ rollback_registered_many(&single);
+ list_del(&single);
+}
+
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ /* Fix illegal checksum combinations */
+ if ((features & NETIF_F_HW_CSUM) &&
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+ features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+ }
+
+ /* TSO requires that SG is present as well. */
+ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+ features &= ~NETIF_F_ALL_TSO;
+ }
+
+ if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IP_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO;
+ features &= ~NETIF_F_TSO_ECN;
+ }
+
+ if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
+ !(features & NETIF_F_IPV6_CSUM)) {
+ netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
+ features &= ~NETIF_F_TSO6;
+ }
+
+ /* TSO ECN requires that TSO is present as well. */
+ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+ features &= ~NETIF_F_TSO_ECN;
+
+ /* Software GSO depends on SG. */
+ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
+ netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+ features &= ~NETIF_F_GSO;
+ }
+
+ /* UFO needs SG and checksumming */
+ if (features & NETIF_F_UFO) {
+ /* maybe split UFO into V4 and V6? */
+ if (!((features & NETIF_F_GEN_CSUM) ||
+ (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
+ == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no checksum offload features.\n");
+ features &= ~NETIF_F_UFO;
+ }
+
+ if (!(features & NETIF_F_SG)) {
+ netdev_dbg(dev,
+ "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
+ features &= ~NETIF_F_UFO;
+ }
+ }
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (dev->netdev_ops->ndo_busy_poll)
+ features |= NETIF_F_BUSY_POLL;
+ else
+#endif
+ features &= ~NETIF_F_BUSY_POLL;
+
+ return features;
+}
+
+int __netdev_update_features(struct net_device *dev)
+{
+ netdev_features_t features;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ features = netdev_get_wanted_features(dev);
+
+ if (dev->netdev_ops->ndo_fix_features)
+ features = dev->netdev_ops->ndo_fix_features(dev, features);
+
+ /* driver might be less strict about feature dependencies */
+ features = netdev_fix_features(dev, features);
+
+ if (dev->features == features)
+ return 0;
+
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
+
+ if (dev->netdev_ops->ndo_set_features)
+ err = dev->netdev_ops->ndo_set_features(dev, features);
+
+ if (unlikely(err < 0)) {
+ netdev_err(dev,
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
+ return -1;
+ }
+
+ if (!err)
+ dev->features = features;
+
+ return 1;
+}
+
+/**
+ * netdev_update_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications if it
+ * has changed. Should be called after driver or hardware dependent
+ * conditions might have changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+ if (__netdev_update_features(dev))
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
+/**
+ * netdev_change_features - recalculate device features
+ * @dev: the device to check
+ *
+ * Recalculate dev->features set and send notifications even
+ * if they have not changed. Should be called instead of
+ * netdev_update_features() if also dev->vlan_features might
+ * have changed to allow the changes to be propagated to stacked
+ * VLAN devices.
+ */
+void netdev_change_features(struct net_device *dev)
+{
+ __netdev_update_features(dev);
+ netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
+
+/**
+ * netif_stacked_transfer_operstate - transfer operstate
+ * @rootdev: the root or lower level device to transfer state from
+ * @dev: the device to transfer operstate to
+ *
+ * Transfer operational state from root to device. This is normally
+ * called when a stacking relationship exists between the root
+ * device and the device(a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev)
+{
+ if (rootdev->operstate == IF_OPER_DORMANT)
+ netif_dormant_on(dev);
+ else
+ netif_dormant_off(dev);
+
+ if (netif_carrier_ok(rootdev)) {
+ if (!netif_carrier_ok(dev))
+ netif_carrier_on(dev);
+ } else {
+ if (netif_carrier_ok(dev))
+ netif_carrier_off(dev);
+ }
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
+#ifdef CONFIG_SYSFS
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+ unsigned int i, count = dev->num_rx_queues;
+ struct netdev_rx_queue *rx;
+ size_t sz = count * sizeof(*rx);
+
+ BUG_ON(count < 1);
+
+ rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!rx) {
+ rx = vzalloc(sz);
+ if (!rx)
+ return -ENOMEM;
+ }
+ dev->_rx = rx;
+
+ for (i = 0; i < count; i++)
+ rx[i].dev = dev;
+ return 0;
+}
+#endif
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue, void *_unused)
+{
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+ netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+ queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
+}
+
+static void netif_free_tx_queues(struct net_device *dev)
+{
+ kvfree(dev->_tx);
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+ unsigned int count = dev->num_tx_queues;
+ struct netdev_queue *tx;
+ size_t sz = count * sizeof(*tx);
+
+ BUG_ON(count < 1 || count > 0xffff);
+
+ tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!tx) {
+ tx = vzalloc(sz);
+ if (!tx)
+ return -ENOMEM;
+ }
+ dev->_tx = tx;
+
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+
+ return 0;
+}
+
+/**
+ * register_netdevice - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * register_netdev() instead of this.
+ *
+ * BUGS:
+ * The locking appears insufficient to guarantee two parallel registers
+ * will not get the same name.
+ */
+
+int register_netdevice(struct net_device *dev)
+{
+ int ret;
+ struct net *net = dev_net(dev);
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ might_sleep();
+
+ /* When net_device's are persistent, this will be fatal. */
+ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+ BUG_ON(!net);
+
+ spin_lock_init(&dev->addr_list_lock);
+ netdev_set_addr_lockdep_class(dev);
+
+ ret = dev_get_valid_name(net, dev, dev->name);
+ if (ret < 0)
+ goto out;
+
+ /* Init, if this function is available */
+ if (dev->netdev_ops->ndo_init) {
+ ret = dev->netdev_ops->ndo_init(dev);
+ if (ret) {
+ if (ret > 0)
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+ if (((dev->hw_features | dev->features) &
+ NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
+ ret = -EINVAL;
+ goto err_uninit;
+ }
+
+ ret = -EBUSY;
+ if (!dev->ifindex)
+ dev->ifindex = dev_new_index(net);
+ else if (__dev_get_by_index(net, dev->ifindex))
+ goto err_uninit;
+
+ /* Transfer changeable features to wanted_features and enable
+ * software offloads (GSO and GRO).
+ */
+ dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->features |= NETIF_F_SOFT_FEATURES;
+ dev->wanted_features = dev->features & dev->hw_features;
+
+ if (!(dev->flags & IFF_LOOPBACK)) {
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
+ }
+
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
+ */
+ dev->vlan_features |= NETIF_F_HIGHDMA;
+
+ /* Make NETIF_F_SG inheritable to tunnel devices.
+ */
+ dev->hw_enc_features |= NETIF_F_SG;
+
+ /* Make NETIF_F_SG inheritable to MPLS.
+ */
+ dev->mpls_features |= NETIF_F_SG;
+
+ ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto err_uninit;
+
+ ret = netdev_register_kobject(dev);
+ if (ret)
+ goto err_uninit;
+ dev->reg_state = NETREG_REGISTERED;
+
+ __netdev_update_features(dev);
+
+ /*
+ * Default initial state at registry is that the
+ * device is present.
+ */
+
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+
+ linkwatch_init_dev(dev);
+
+ dev_init_scheduler(dev);
+ dev_hold(dev);
+ list_netdevice(dev);
+ add_device_randomness(dev->dev_addr, dev->addr_len);
+
+ /* If the device has permanent device address, driver should
+ * set dev_addr and also addr_assign_type should be set to
+ * NET_ADDR_PERM (default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
+
+ /* Notify protocols, that a new device appeared. */
+ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ rollback_registered(dev);
+ dev->reg_state = NETREG_UNREGISTERED;
+ }
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+
+out:
+ return ret;
+
+err_uninit:
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+ goto out;
+}
+EXPORT_SYMBOL(register_netdevice);
+
+/**
+ * init_dummy_netdev - init a dummy network device for NAPI
+ * @dev: device to init
+ *
+ * This takes a network device structure and initialize the minimum
+ * amount of fields so it can be used to schedule NAPI polls without
+ * registering a full blown interface. This is to be used by drivers
+ * that need to tie several hardware interfaces to a single NAPI
+ * poll scheduler due to HW limitations.
+ */
+int init_dummy_netdev(struct net_device *dev)
+{
+ /* Clear everything. Note we don't initialize spinlocks
+ * are they aren't supposed to be taken by any of the
+ * NAPI code and this dummy netdev is supposed to be
+ * only ever used for NAPI polls
+ */
+ memset(dev, 0, sizeof(struct net_device));
+
+ /* make sure we BUG if trying to hit standard
+ * register/unregister code path
+ */
+ dev->reg_state = NETREG_DUMMY;
+
+ /* NAPI wants this */
+ INIT_LIST_HEAD(&dev->napi_list);
+
+ /* a dummy interface is started by default */
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+ set_bit(__LINK_STATE_START, &dev->state);
+
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(init_dummy_netdev);
+
+
+/**
+ * register_netdev - register a network device
+ * @dev: device to register
+ *
+ * Take a completed network device structure and add it to the kernel
+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ * chain. 0 is returned on success. A negative errno code is returned
+ * on a failure to set up the device, or if the name is a duplicate.
+ *
+ * This is a wrapper around register_netdevice that takes the rtnl semaphore
+ * and expands the device name if you passed a format string to
+ * alloc_netdev.
+ */
+int register_netdev(struct net_device *dev)
+{
+ int err;
+
+ rtnl_lock();
+ err = register_netdevice(dev);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdev);
+
+int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
+/**
+ * netdev_wait_allrefs - wait until all references are gone.
+ * @dev: target net_device
+ *
+ * This is called when unregistering network devices.
+ *
+ * Any protocol or device that holds a reference should register
+ * for netdevice notification, and cleanup and put back the
+ * reference if they receive an UNREGISTER event.
+ * We can get stuck here if buggy protocols don't correctly
+ * call dev_put.
+ */
+static void netdev_wait_allrefs(struct net_device *dev)
+{
+ unsigned long rebroadcast_time, warning_time;
+ int refcnt;
+
+ linkwatch_forget_dev(dev);
+
+ rebroadcast_time = warning_time = jiffies;
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
+ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
+ rtnl_lock();
+
+ /* Rebroadcast unregister notification */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ __rtnl_unlock();
+ rcu_barrier();
+ rtnl_lock();
+
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ }
+
+ __rtnl_unlock();
+
+ rebroadcast_time = jiffies;
+ }
+
+ msleep(250);
+
+ refcnt = netdev_refcnt_read(dev);
+
+ if (time_after(jiffies, warning_time + 10 * HZ)) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, refcnt);
+ warning_time = jiffies;
+ }
+ }
+}
+
+/* The sequence is:
+ *
+ * rtnl_lock();
+ * ...
+ * register_netdevice(x1);
+ * register_netdevice(x2);
+ * ...
+ * unregister_netdevice(y1);
+ * unregister_netdevice(y2);
+ * ...
+ * rtnl_unlock();
+ * free_netdev(y1);
+ * free_netdev(y2);
+ *
+ * We are invoked by rtnl_unlock().
+ * This allows us to deal with problems:
+ * 1) We can delete sysfs objects which invoke hotplug
+ * without deadlocking with linkwatch via keventd.
+ * 2) Since we run with the RTNL semaphore not held, we can sleep
+ * safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
+ */
+void netdev_run_todo(void)
+{
+ struct list_head list;
+
+ /* Snapshot list, allow later requests */
+ list_replace_init(&net_todo_list, &list);
+
+ __rtnl_unlock();
+
+
+ /* Wait for rcu callbacks to finish before next phase */
+ if (!list_empty(&list))
+ rcu_barrier();
+
+ while (!list_empty(&list)) {
+ struct net_device *dev
+ = list_first_entry(&list, struct net_device, todo_list);
+ list_del(&dev->todo_list);
+
+ rtnl_lock();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ __rtnl_unlock();
+
+ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
+ pr_err("network todo '%s' but state %d\n",
+ dev->name, dev->reg_state);
+ dump_stack();
+ continue;
+ }
+
+ dev->reg_state = NETREG_UNREGISTERED;
+
+ on_each_cpu(flush_backlog, dev, 1);
+
+ netdev_wait_allrefs(dev);
+
+ /* paranoia */
+ BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(!list_empty(&dev->ptype_all));
+ BUG_ON(!list_empty(&dev->ptype_specific));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+ WARN_ON(dev->dn_ptr);
+
+ if (dev->destructor)
+ dev->destructor(dev);
+
+ /* Report a network device has been unregistered */
+ rtnl_lock();
+ dev_net(dev)->dev_unreg_count--;
+ __rtnl_unlock();
+ wake_up(&netdev_unregistering_wq);
+
+ /* Free network device */
+ kobject_put(&dev->dev.kobj);
+ }
+}
+
+/* Convert net_device_stats to rtnl_link_stats64. They have the same
+ * fields in the same order, with only the type differing.
+ */
+void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+ BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+ size_t i, n = sizeof(*stats64) / sizeof(u64);
+ const unsigned long *src = (const unsigned long *)netdev_stats;
+ u64 *dst = (u64 *)stats64;
+
+ BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+ sizeof(*stats64) / sizeof(u64));
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
+#endif
+}
+EXPORT_SYMBOL(netdev_stats_to_stats64);
+
+/**
+ * dev_get_stats - get network device statistics
+ * @dev: device to get statistics from
+ * @storage: place to store stats
+ *
+ * Get network statistics from device. Return @storage.
+ * The device driver may provide its own method by setting
+ * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ * otherwise the internal statistics structure is used.
+ */
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_get_stats64) {
+ memset(storage, 0, sizeof(*storage));
+ ops->ndo_get_stats64(dev, storage);
+ } else if (ops->ndo_get_stats) {
+ netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else {
+ netdev_stats_to_stats64(storage, &dev->stats);
+ }
+ storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+ storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+ return storage;
+}
+EXPORT_SYMBOL(dev_get_stats);
+
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+ struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (queue)
+ return queue;
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return NULL;
+ netdev_init_one_queue(dev, queue, NULL);
+ RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
+ queue->qdisc_sleeping = &noop_qdisc;
+ rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+ return queue;
+}
+
+static const struct ethtool_ops default_ethtool_ops;
+
+void netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops)
+{
+ if (dev->ethtool_ops == &default_ethtool_ops)
+ dev->ethtool_ops = ops;
+}
+EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+
+void netdev_freemem(struct net_device *dev)
+{
+ char *addr = (char *)dev - dev->padded;
+
+ kvfree(addr);
+}
+
+/**
+ * alloc_netdev_mqs - allocate network device
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
+ *
+ * Allocates a struct net_device with private data area for driver use
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
+ */
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *),
+ unsigned int txqs, unsigned int rxqs)
+{
+ struct net_device *dev;
+ size_t alloc_size;
+ struct net_device *p;
+
+ BUG_ON(strlen(name) >= sizeof(dev->name));
+
+ if (txqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
+ return NULL;
+ }
+
+#ifdef CONFIG_SYSFS
+ if (rxqs < 1) {
+ pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
+ return NULL;
+ }
+#endif
+
+ alloc_size = sizeof(struct net_device);
+ if (sizeof_priv) {
+ /* ensure 32-byte alignment of private area */
+ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
+ alloc_size += sizeof_priv;
+ }
+ /* ensure 32-byte alignment of whole construct */
+ alloc_size += NETDEV_ALIGN - 1;
+
+ p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!p)
+ p = vzalloc(alloc_size);
+ if (!p)
+ return NULL;
+
+ dev = PTR_ALIGN(p, NETDEV_ALIGN);
+ dev->padded = (char *)dev - (char *)p;
+
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ goto free_dev;
+
+ if (dev_addr_init(dev))
+ goto free_pcpu;
+
+ dev_mc_init(dev);
+ dev_uc_init(dev);
+
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gso_min_segs = 0;
+
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+ INIT_LIST_HEAD(&dev->close_list);
+ INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->adj_list.upper);
+ INIT_LIST_HEAD(&dev->adj_list.lower);
+ INIT_LIST_HEAD(&dev->all_adj_list.upper);
+ INIT_LIST_HEAD(&dev->all_adj_list.lower);
+ INIT_LIST_HEAD(&dev->ptype_all);
+ INIT_LIST_HEAD(&dev->ptype_specific);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ setup(dev);
+
+ dev->num_tx_queues = txqs;
+ dev->real_num_tx_queues = txqs;
+ if (netif_alloc_netdev_queues(dev))
+ goto free_all;
+
+#ifdef CONFIG_SYSFS
+ dev->num_rx_queues = rxqs;
+ dev->real_num_rx_queues = rxqs;
+ if (netif_alloc_rx_queues(dev))
+ goto free_all;
+#endif
+
+ strcpy(dev->name, name);
+ dev->name_assign_type = name_assign_type;
+ dev->group = INIT_NETDEV_GROUP;
+ if (!dev->ethtool_ops)
+ dev->ethtool_ops = &default_ethtool_ops;
+ return dev;
+
+free_all:
+ free_netdev(dev);
+ return NULL;
+
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
+free_dev:
+ netdev_freemem(dev);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_netdev_mqs);
+
+/**
+ * free_netdev - free network device
+ * @dev: device
+ *
+ * This function does the last stage of destroying an allocated device
+ * interface. The reference to the device object is released.
+ * If this is the last reference then it will be freed.
+ */
+void free_netdev(struct net_device *dev)
+{
+ struct napi_struct *p, *n;
+
+ netif_free_tx_queues(dev);
+#ifdef CONFIG_SYSFS
+ kvfree(dev->_rx);
+#endif
+
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
+
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ netif_napi_del(p);
+
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
+ /* Compatibility with error handling in drivers */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ netdev_freemem(dev);
+ return;
+ }
+
+ BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
+ dev->reg_state = NETREG_RELEASED;
+
+ /* will free via device release */
+ put_device(&dev->dev);
+}
+EXPORT_SYMBOL(free_netdev);
+
+/**
+ * synchronize_net - Synchronize with packet receive processing
+ *
+ * Wait for packets currently being received to be done.
+ * Does not block later packets from starting.
+ */
+void synchronize_net(void)
+{
+ might_sleep();
+ if (rtnl_is_locked())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(synchronize_net);
+
+/**
+ * unregister_netdevice_queue - remove device from the kernel
+ * @dev: device
+ * @head: list
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ * If head not NULL, device is queued to be unregistered later.
+ *
+ * Callers must hold the rtnl semaphore. You may want
+ * unregister_netdev() instead of this.
+ */
+
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
+{
+ ASSERT_RTNL();
+
+ if (head) {
+ list_move_tail(&dev->unreg_list, head);
+ } else {
+ rollback_registered(dev);
+ /* Finish processing unregister after unlock */
+ net_set_todo(dev);
+ }
+}
+EXPORT_SYMBOL(unregister_netdevice_queue);
+
+/**
+ * unregister_netdevice_many - unregister many devices
+ * @head: list of devices
+ *
+ * Note: As most callers use a stack allocated list_head,
+ * we force a list_del() to make sure stack wont be corrupted later.
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+ struct net_device *dev;
+
+ if (!list_empty(head)) {
+ rollback_registered_many(head);
+ list_for_each_entry(dev, head, unreg_list)
+ net_set_todo(dev);
+ list_del(head);
+ }
+}
+EXPORT_SYMBOL(unregister_netdevice_many);
+
+/**
+ * unregister_netdev - remove device from the kernel
+ * @dev: device
+ *
+ * This function shuts down a device interface and removes it
+ * from the kernel tables.
+ *
+ * This is just a wrapper for unregister_netdevice that takes
+ * the rtnl semaphore. In general you want to use this and not
+ * unregister_netdevice.
+ */
+void unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ unregister_netdevice(dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev);
+
+/**
+ * dev_change_net_namespace - move device to different nethost namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a netagive errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ */
+
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ /* Don't allow namespace local devices to be moved. */
+ err = -EINVAL;
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ goto out;
+
+ /* Ensure the device has been registrered */
+ if (dev->reg_state != NETREG_REGISTERED)
+ goto out;
+
+ /* Get out if there is nothing todo */
+ err = 0;
+ if (net_eq(dev_net(dev), net))
+ goto out;
+
+ /* Pick the destination device name, and ensure
+ * we can use it in the destination network namespace.
+ */
+ err = -EEXIST;
+ if (__dev_get_by_name(net, dev->name)) {
+ /* We get here if we can't use the current device name */
+ if (!pat)
+ goto out;
+ if (dev_get_valid_name(net, dev, pat) < 0)
+ goto out;
+ }
+
+ /*
+ * And now a mini version of register_netdevice unregister_netdevice.
+ */
+
+ /* If device is running close it first. */
+ dev_close(dev);
+
+ /* And unlink it from device chain */
+ err = -ENODEV;
+ unlist_netdevice(dev);
+
+ synchronize_net();
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ /* Notify protocols, that we are about to destroy
+ this device. They should clean all the things.
+
+ Note that dev->reg_state stays at NETREG_REGISTERED.
+ This is wanted because this way 8021q and macvlan know
+ the device is just moving and can keep their slaves up.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ rcu_barrier();
+ call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ /* Send a netdev-removed uevent to the old namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+ netdev_adjacent_del_links(dev);
+
+ /* Actually switch the network namespace */
+ dev_net_set(dev, net);
+
+ /* If there is an ifindex conflict assign a new one */
+ if (__dev_get_by_index(net, dev->ifindex))
+ dev->ifindex = dev_new_index(net);
+
+ /* Send a netdev-add uevent to the new namespace */
+ kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
+
+ /* Fixup kobjects */
+ err = device_rename(&dev->dev, dev->name);
+ WARN_ON(err);
+
+ /* Add the device back in the hashes */
+ list_netdevice(dev);
+
+ /* Notify protocols, that a new device appeared. */
+ call_netdevice_notifiers(NETDEV_REGISTER, dev);
+
+ /*
+ * Prevent userspace races by waiting until the network
+ * device is fully setup before sending notifications.
+ */
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+
+ synchronize_net();
+ err = 0;
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+static int dev_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *ocpu)
+{
+ struct sk_buff **list_skb;
+ struct sk_buff *skb;
+ unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ struct softnet_data *sd, *oldsd;
+
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ local_irq_disable();
+ cpu = smp_processor_id();
+ sd = &per_cpu(softnet_data, cpu);
+ oldsd = &per_cpu(softnet_data, oldcpu);
+
+ /* Find end of our completion_queue. */
+ list_skb = &sd->completion_queue;
+ while (*list_skb)
+ list_skb = &(*list_skb)->next;
+ /* Append completion queue from offline CPU. */
+ *list_skb = oldsd->completion_queue;
+ oldsd->completion_queue = NULL;
+
+ /* Append output queue from offline CPU. */
+ if (oldsd->output_queue) {
+ *sd->output_queue_tailp = oldsd->output_queue;
+ sd->output_queue_tailp = oldsd->output_queue_tailp;
+ oldsd->output_queue = NULL;
+ oldsd->output_queue_tailp = &oldsd->output_queue;
+ }
+ /* Append NAPI poll list from offline CPU, with one exception :
+ * process_backlog() must be called by cpu owning percpu backlog.
+ * We properly handle process_queue & input_pkt_queue later.
+ */
+ while (!list_empty(&oldsd->poll_list)) {
+ struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
+ struct napi_struct,
+ poll_list);
+
+ list_del_init(&napi->poll_list);
+ if (napi->poll == process_backlog)
+ napi->state = 0;
+ else
+ ____napi_schedule(sd, napi);
+ }
+
+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ local_irq_enable();
+
+ /* Process offline CPU's input_pkt_queue */
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+ netif_rx_ni(skb);
+ input_queue_head_incr(oldsd);
+ }
+ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
+ netif_rx_ni(skb);
+ input_queue_head_incr(oldsd);
+ }
+
+ return NOTIFY_OK;
+}
+
+
+/**
+ * netdev_increment_features - increment feature set by one
+ * @all: current feature set
+ * @one: new feature set
+ * @mask: mask feature set
+ *
+ * Computes a new feature set after adding a device with feature set
+ * @one to the master device with current feature set @all. Will not
+ * enable anything that is off in @mask. Returns the new feature set.
+ */
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
+{
+ if (mask & NETIF_F_GEN_CSUM)
+ mask |= NETIF_F_ALL_CSUM;
+ mask |= NETIF_F_VLAN_CHALLENGED;
+
+ all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+ all &= one | ~NETIF_F_ALL_FOR_ALL;
+
+ /* If one device supports hw checksumming, set for all. */
+ if (all & NETIF_F_GEN_CSUM)
+ all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+
+ return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
+
+static struct hlist_head * __net_init netdev_create_hash(void)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < NETDEV_HASHENTRIES; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+/* Initialize per network namespace state */
+static int __net_init netdev_init(struct net *net)
+{
+ if (net != &init_net)
+ INIT_LIST_HEAD(&net->dev_base_head);
+
+ net->dev_name_head = netdev_create_hash();
+ if (net->dev_name_head == NULL)
+ goto err_name;
+
+ net->dev_index_head = netdev_create_hash();
+ if (net->dev_index_head == NULL)
+ goto err_idx;
+
+ return 0;
+
+err_idx:
+ kfree(net->dev_name_head);
+err_name:
+ return -ENOMEM;
+}
+
+/**
+ * netdev_drivername - network driver for the device
+ * @dev: network device
+ *
+ * Determine network driver for device.
+ */
+const char *netdev_drivername(const struct net_device *dev)
+{
+ const struct device_driver *driver;
+ const struct device *parent;
+ const char *empty = "";
+
+ parent = dev->dev.parent;
+ if (!parent)
+ return empty;
+
+ driver = parent->driver;
+ if (driver && driver->name)
+ return driver->name;
+ return empty;
+}
+
+static void __netdev_printk(const char *level, const struct net_device *dev,
+ struct va_format *vaf)
+{
+ if (dev && dev->dev.parent) {
+ dev_printk_emit(level[1] - '0',
+ dev->dev.parent,
+ "%s %s %s%s: %pV",
+ dev_driver_string(dev->dev.parent),
+ dev_name(dev->dev.parent),
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
+ } else if (dev) {
+ printk("%s%s%s: %pV",
+ level, netdev_name(dev), netdev_reg_state(dev), vaf);
+ } else {
+ printk("%s(NULL net_device): %pV", level, vaf);
+ }
+}
+
+void netdev_printk(const char *level, const struct net_device *dev,
+ const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ __netdev_printk(level, dev, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level) \
+void func(const struct net_device *dev, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __netdev_printk(level, dev, &vaf); \
+ \
+ va_end(args); \
+} \
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
+static void __net_exit netdev_exit(struct net *net)
+{
+ kfree(net->dev_name_head);
+ kfree(net->dev_index_head);
+}
+
+static struct pernet_operations __net_initdata netdev_net_ops = {
+ .init = netdev_init,
+ .exit = netdev_exit,
+};
+
+static void __net_exit default_device_exit(struct net *net)
+{
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+ * initial network namespace
+ */
+ rtnl_lock();
+ for_each_netdev_safe(net, dev, aux) {
+ int err;
+ char fb_name[IFNAMSIZ];
+
+ /* Ignore unmoveable devices (i.e. loopback) */
+ if (dev->features & NETIF_F_NETNS_LOCAL)
+ continue;
+
+ /* Leave virtual devices for the generic cleanup */
+ if (dev->rtnl_link_ops)
+ continue;
+
+ /* Push remaining network devices to init_net */
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+ __func__, dev->name, err);
+ BUG();
+ }
+ }
+ rtnl_unlock();
+}
+
+static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
+{
+ /* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace in net_list.
+ */
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&netdev_unregistering_wq, &wait);
+ for (;;) {
+ unregistering = false;
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+ /* At exit all network devices most be removed from a network
+ * namespace. Do this in the reverse order of registration.
+ * Do this across as many network namespaces as possible to
+ * improve batching efficiency.
+ */
+ struct net_device *dev;
+ struct net *net;
+ LIST_HEAD(dev_kill_list);
+
+ /* To prevent network device cleanup code from dereferencing
+ * loopback devices or network devices that have been freed
+ * wait here for all pending unregistrations to complete,
+ * before unregistring the loopback device and allowing the
+ * network namespace be freed.
+ *
+ * The netdev todo list containing all network devices
+ * unregistrations that happen in default_device_exit_batch
+ * will run in the rtnl_unlock() at the end of
+ * default_device_exit_batch.
+ */
+ rtnl_lock_unregistering(net_list);
+ list_for_each_entry(net, net_list, exit_list) {
+ for_each_netdev_reverse(net, dev) {
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
+ dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+ else
+ unregister_netdevice_queue(dev, &dev_kill_list);
+ }
+ }
+ unregister_netdevice_many(&dev_kill_list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations __net_initdata default_device_ops = {
+ .exit = default_device_exit,
+ .exit_batch = default_device_exit_batch,
+};
+
+/*
+ * Initialize the DEV module. At boot time this walks the device list and
+ * unhooks any devices that fail to initialise (normally hardware not
+ * present) and leaves us with a valid list of present and active devices.
+ *
+ */
+
+/*
+ * This is called single threaded during boot, so no need
+ * to take the rtnl semaphore.
+ */
+static int __init net_dev_init(void)
+{
+ int i, rc = -ENOMEM;
+
+ BUG_ON(!dev_boot_phase);
+
+ if (dev_proc_init())
+ goto out;
+
+ if (netdev_kobject_init())
+ goto out;
+
+ INIT_LIST_HEAD(&ptype_all);
+ for (i = 0; i < PTYPE_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&ptype_base[i]);
+
+ INIT_LIST_HEAD(&offload_base);
+
+ if (register_pernet_subsys(&netdev_net_ops))
+ goto out;
+
+ /*
+ * Initialise the packet receive queues.
+ */
+
+ for_each_possible_cpu(i) {
+ struct softnet_data *sd = &per_cpu(softnet_data, i);
+
+ skb_queue_head_init(&sd->input_pkt_queue);
+ skb_queue_head_init(&sd->process_queue);
+ INIT_LIST_HEAD(&sd->poll_list);
+ sd->output_queue_tailp = &sd->output_queue;
+#ifdef CONFIG_RPS
+ sd->csd.func = rps_trigger_softirq;
+ sd->csd.info = sd;
+ sd->cpu = i;
+#endif
+
+ sd->backlog.poll = process_backlog;
+ sd->backlog.weight = weight_p;
+ }
+
+ dev_boot_phase = 0;
+
+ /* The loopback device is special if any other network devices
+ * is present in a network namespace the loopback device must
+ * be present. Since we now dynamically allocate and free the
+ * loopback device ensure this invariant is maintained by
+ * keeping the loopback device as the first device on the
+ * list of network devices. Ensuring the loopback devices
+ * is the first device that appears and the last network device
+ * that disappears.
+ */
+ if (register_pernet_device(&loopback_net_ops))
+ goto out;
+
+ if (register_pernet_device(&default_device_ops))
+ goto out;
+
+ open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+ open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+
+ hotcpu_notifier(dev_cpu_callback, 0);
+ dst_init();
+ rc = 0;
+out:
+ return rc;
+}
+
+subsys_initcall(net_dev_init);
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 000000000..c0548d268
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,851 @@
+/*
+ * net/core/dev_addr_lists.c - Functions for handling net device lists
+ * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This file contains functions for working with unicast, multicast and device
+ * addresses lists.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/export.h>
+#include <linux/list.h>
+
+/*
+ * General list handling functions
+ */
+
+static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global,
+ bool sync)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ ha->refcount = 1;
+ ha->global_use = global;
+ ha->synced = sync ? 1 : 0;
+ ha->sync_cnt = 0;
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
+}
+
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync,
+ int sync_count)
+{
+ struct netdev_hw_addr *ha;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (!memcmp(ha->addr, addr, addr_len) &&
+ ha->type == addr_type) {
+ if (global) {
+ /* check if addr is already used as global */
+ if (ha->global_use)
+ return 0;
+ else
+ ha->global_use = true;
+ }
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
+ ha->refcount++;
+ return 0;
+ }
+ }
+
+ return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
+ sync);
+}
+
+static int __hw_addr_add(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
+ 0);
+}
+
+static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *ha, bool global,
+ bool sync)
+{
+ if (global && !ha->global_use)
+ return -ENOENT;
+
+ if (sync && !ha->synced)
+ return -ENOENT;
+
+ if (global)
+ ha->global_use = false;
+
+ if (sync)
+ ha->synced--;
+
+ if (--ha->refcount)
+ return 0;
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ list->count--;
+ return 0;
+}
+
+static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
+{
+ struct netdev_hw_addr *ha;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (!memcmp(ha->addr, addr, addr_len) &&
+ (ha->type == addr_type || !addr_type))
+ return __hw_addr_del_entry(list, ha, global, sync);
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_del(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
+}
+
+static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true, ha->sync_cnt);
+ if (err && err != -EEXIST)
+ return err;
+
+ if (!err) {
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+
+static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ struct netdev_hw_addr *ha,
+ int addr_len)
+{
+ int err;
+
+ err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
+ false, true);
+ if (err)
+ return;
+ ha->sync_cnt--;
+ /* address on from list is not marked synced */
+ __hw_addr_del_entry(from_list, ha, false, false);
+}
+
+static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt == ha->refcount) {
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ } else {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ }
+ }
+ return err;
+}
+
+/* This function only works where there is a strict 1-1 relationship
+ * between source and destionation of they synch. If you ever need to
+ * sync addresses to more then 1 destination, you need to use
+ * __hw_addr_sync_multiple().
+ */
+int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ int err = 0;
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (!ha->sync_cnt) {
+ err = __hw_addr_sync_one(to_list, ha, addr_len);
+ if (err)
+ break;
+ } else if (ha->refcount == 1)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+ return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync);
+
+void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
+ if (ha->sync_cnt)
+ __hw_addr_unsync_one(to_list, from_list, ha, addr_len);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync);
+
+/**
+ * __hw_addr_sync_dev - Synchonize device's multicast list
+ * @list: address list to syncronize
+ * @dev: device to sync
+ * @sync: function to call if address should be added
+ * @unsync: function to call if address should be removed
+ *
+ * This funciton is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address add/remove
+ * notifications. The unsync function may be NULL in which case
+ * the addresses requiring removal will simply be removed without
+ * any notification to the device.
+ **/
+int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *, const unsigned char *),
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err;
+
+ /* first go through and flush out any stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt || ha->refcount != 1)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (ha->sync_cnt)
+ continue;
+
+ err = sync(dev, ha->addr);
+ if (err)
+ return err;
+
+ ha->sync_cnt++;
+ ha->refcount++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_sync_dev);
+
+/**
+ * __hw_addr_unsync_dev - Remove synchronized addresses from device
+ * @list: address list to remove synchronized addresses from
+ * @dev: device to sync
+ * @unsync: function to call if address should be removed
+ *
+ * Remove all addresses that were added to the device by __hw_addr_sync_dev().
+ * This function is intended to be called from the ndo_stop or ndo_open
+ * functions on devices that require explicit address add/remove
+ * notifications. If the unsync function pointer is NULL then this function
+ * can be used to just reset the sync_cnt for the addresses in the list.
+ **/
+void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if unsync is defined and fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr))
+ continue;
+
+ ha->sync_cnt--;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_unsync_dev);
+
+static void __hw_addr_flush(struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ list_del_rcu(&ha->list);
+ kfree_rcu(ha, rcu_head);
+ }
+ list->count = 0;
+}
+
+void __hw_addr_init(struct netdev_hw_addr_list *list)
+{
+ INIT_LIST_HEAD(&list->list);
+ list->count = 0;
+}
+EXPORT_SYMBOL(__hw_addr_init);
+
+/*
+ * Device addresses handling functions
+ */
+
+/**
+ * dev_addr_flush - Flush device address list
+ * @dev: device
+ *
+ * Flush device address list and reset ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addrs);
+ dev->dev_addr = NULL;
+}
+EXPORT_SYMBOL(dev_addr_flush);
+
+/**
+ * dev_addr_init - Init device address list
+ * @dev: device
+ *
+ * Init device address list and create the first element,
+ * used by ->dev_addr.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_init(&dev->dev_addrs);
+ memset(addr, 0, sizeof(addr));
+ err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_init);
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, const unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+ struct netdev_hw_addr *ha;
+
+ ASSERT_RTNL();
+
+ /*
+ * We can not remove the first address from the list because
+ * dev->dev_addr points to that.
+ */
+ ha = list_first_entry(&dev->dev_addrs.list,
+ struct netdev_hw_addr, list);
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == addr_type && ha->refcount == 1)
+ return -ENOENT;
+
+ err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/*
+ * Unicast list handling functions
+ */
+
+/**
+ * dev_uc_add_excl - Add a global secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->uc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_UNICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add_excl);
+
+/**
+ * dev_uc_add - Add a secondary unicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a secondary unicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_uc_add(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_add);
+
+/**
+ * dev_uc_del - Release secondary unicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a secondary unicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_uc_del(struct net_device *dev, const unsigned char *addr)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_del);
+
+/**
+ * dev_uc_sync - Synchronize device's unicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. This function assumes that
+ * addresses will only ever be synced to the @to devices and no other.
+ */
+int dev_uc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync);
+
+/**
+ * dev_uc_sync_multiple - Synchronize device's unicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have been deleted from the source. The source device
+ * must be locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the dev->set_rx_mode
+ * function of layered software devices. It allows for a single source
+ * device to be synced to multiple destination devices.
+ */
+int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_uc_sync_multiple);
+
+/**
+ * dev_uc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_uc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_uc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_uc_unsync);
+
+/**
+ * dev_uc_flush - Flush unicast addresses
+ * @dev: device
+ *
+ * Flush unicast addresses.
+ */
+void dev_uc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->uc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_uc_flush);
+
+/**
+ * dev_uc_flush - Init unicast address list
+ * @dev: device
+ *
+ * Init unicast address list.
+ */
+void dev_uc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->uc);
+}
+EXPORT_SYMBOL(dev_uc_init);
+
+/*
+ * Multicast list handling functions
+ */
+
+/**
+ * dev_mc_add_excl - Add a global secondary multicast address
+ * @dev: device
+ * @addr: address to add
+ */
+int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+
+ netif_addr_lock_bh(dev);
+ list_for_each_entry(ha, &dev->mc.list, list) {
+ if (!memcmp(ha->addr, addr, dev->addr_len) &&
+ ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+ err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+out:
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_add_excl);
+
+static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+/**
+ * dev_mc_add - Add a multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a multicast address to the device or increase
+ * the reference count if it already exists.
+ */
+int dev_mc_add(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_add);
+
+/**
+ * dev_mc_add_global - Add a global multicast address
+ * @dev: device
+ * @addr: address to add
+ *
+ * Add a global multicast address to the device.
+ */
+int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_add(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_add_global);
+
+static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
+ bool global)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, global, false);
+ if (!err)
+ __dev_set_rx_mode(dev);
+ netif_addr_unlock_bh(dev);
+ return err;
+}
+
+/**
+ * dev_mc_del - Delete a multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_del);
+
+/**
+ * dev_mc_del_global - Delete a global multicast address.
+ * @dev: device
+ * @addr: address to delete
+ *
+ * Release reference to a multicast address and remove it
+ * from the device if the reference count drops to zero.
+ */
+int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
+{
+ return __dev_mc_del(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_del_global);
+
+/**
+ * dev_mc_sync - Synchronize device's multicast list to another device
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+/**
+ * dev_mc_sync_multiple - Synchronize device's multicast list to another
+ * device, but allow for multiple calls to sync to multiple devices.
+ * @to: destination device
+ * @from: source device
+ *
+ * Add newly added addresses to the destination device and release
+ * addresses that have no users left. The source device must be
+ * locked by netif_addr_lock_bh.
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices. It allows for a single
+ * source device to be synced to multiple destination devices.
+ */
+int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
+{
+ int err = 0;
+
+ if (to->addr_len != from->addr_len)
+ return -EINVAL;
+
+ netif_addr_lock_nested(to);
+ err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
+ if (!err)
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ return err;
+}
+EXPORT_SYMBOL(dev_mc_sync_multiple);
+
+/**
+ * dev_mc_unsync - Remove synchronized addresses from the destination device
+ * @to: destination device
+ * @from: source device
+ *
+ * Remove all addresses that were added to the destination device by
+ * dev_mc_sync(). This function is intended to be called from the
+ * dev->stop function of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+ if (to->addr_len != from->addr_len)
+ return;
+
+ netif_addr_lock_bh(from);
+ netif_addr_lock_nested(to);
+ __hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
+ __dev_set_rx_mode(to);
+ netif_addr_unlock(to);
+ netif_addr_unlock_bh(from);
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+/**
+ * dev_mc_flush - Flush multicast addresses
+ * @dev: device
+ *
+ * Flush multicast addresses.
+ */
+void dev_mc_flush(struct net_device *dev)
+{
+ netif_addr_lock_bh(dev);
+ __hw_addr_flush(&dev->mc);
+ netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_mc_flush);
+
+/**
+ * dev_mc_flush - Init multicast address list
+ * @dev: device
+ *
+ * Init multicast address list.
+ */
+void dev_mc_init(struct net_device *dev)
+{
+ __hw_addr_init(&dev->mc);
+}
+EXPORT_SYMBOL(dev_mc_init);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
new file mode 100644
index 000000000..b94b1d293
--- /dev/null
+++ b/net/core/dev_ioctl.c
@@ -0,0 +1,567 @@
+#include <linux/kmod.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/wireless.h>
+#include <net/wext.h>
+
+/*
+ * Map an interface index to its name (SIOCGIFNAME)
+ */
+
+/*
+ * We need this ioctl for efficient implementation of the
+ * if_indextoname() function required by the IPv6 API. Without
+ * it, we would have to search all the interfaces to find a
+ * match. --pb
+ */
+
+static int dev_ifname(struct net *net, struct ifreq __user *arg)
+{
+ struct ifreq ifr;
+ int error;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ return 0;
+}
+
+static gifconf_func_t *gifconf_list[NPROTO];
+
+/**
+ * register_gifconf - register a SIOCGIF handler
+ * @family: Address family
+ * @gifconf: Function handler
+ *
+ * Register protocol dependent address dumping routines. The handler
+ * that is passed must not be freed or reused until it has been replaced
+ * by another handler.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+{
+ if (family >= NPROTO)
+ return -EINVAL;
+ gifconf_list[family] = gifconf;
+ return 0;
+}
+EXPORT_SYMBOL(register_gifconf);
+
+/*
+ * Perform a SIOCGIFCONF call. This structure will change
+ * size eventually, and there is nothing I can do about it.
+ * Thus we will need a 'compatibility mode'.
+ */
+
+static int dev_ifconf(struct net *net, char __user *arg)
+{
+ struct ifconf ifc;
+ struct net_device *dev;
+ char __user *pos;
+ int len;
+ int total;
+ int i;
+
+ /*
+ * Fetch the caller's info block.
+ */
+
+ if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+
+ /*
+ * Loop over the interfaces, and write an info block for each.
+ */
+
+ total = 0;
+ for_each_netdev(net, dev) {
+ for (i = 0; i < NPROTO; i++) {
+ if (gifconf_list[i]) {
+ int done;
+ if (!pos)
+ done = gifconf_list[i](dev, NULL, 0);
+ else
+ done = gifconf_list[i](dev, pos + total,
+ len - total);
+ if (done < 0)
+ return -EFAULT;
+ total += done;
+ }
+ }
+ }
+
+ /*
+ * All done. Write the updated control block back to the caller.
+ */
+ ifc.ifc_len = total;
+
+ /*
+ * Both BSD and Solaris return 0 here, so we do too.
+ */
+ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ switch (cmd) {
+ case SIOCGIFFLAGS: /* Get interface flags */
+ ifr->ifr_flags = (short) dev_get_flags(dev);
+ return 0;
+
+ case SIOCGIFMETRIC: /* Get the metric on the interface
+ (currently unused) */
+ ifr->ifr_metric = 0;
+ return 0;
+
+ case SIOCGIFMTU: /* Get the MTU of a device */
+ ifr->ifr_mtu = dev->mtu;
+ return 0;
+
+ case SIOCGIFHWADDR:
+ if (!dev->addr_len)
+ memset(ifr->ifr_hwaddr.sa_data, 0,
+ sizeof(ifr->ifr_hwaddr.sa_data));
+ else
+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
+ ifr->ifr_hwaddr.sa_family = dev->type;
+ return 0;
+
+ case SIOCGIFSLAVE:
+ err = -EINVAL;
+ break;
+
+ case SIOCGIFMAP:
+ ifr->ifr_map.mem_start = dev->mem_start;
+ ifr->ifr_map.mem_end = dev->mem_end;
+ ifr->ifr_map.base_addr = dev->base_addr;
+ ifr->ifr_map.irq = dev->irq;
+ ifr->ifr_map.dma = dev->dma;
+ ifr->ifr_map.port = dev->if_port;
+ return 0;
+
+ case SIOCGIFINDEX:
+ ifr->ifr_ifindex = dev->ifindex;
+ return 0;
+
+ case SIOCGIFTXQLEN:
+ ifr->ifr_qlen = dev->tx_queue_len;
+ return 0;
+
+ default:
+ /* dev_ioctl() should ensure this case
+ * is never reached
+ */
+ WARN_ON(1);
+ err = -ENOTTY;
+ break;
+
+ }
+ return err;
+}
+
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+ struct hwtstamp_config cfg;
+ enum hwtstamp_tx_types tx_type;
+ enum hwtstamp_rx_filters rx_filter;
+ int tx_type_valid = 0;
+ int rx_filter_valid = 0;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ if (cfg.flags) /* reserved for future extensions */
+ return -EINVAL;
+
+ tx_type = cfg.tx_type;
+ rx_filter = cfg.rx_filter;
+
+ switch (tx_type) {
+ case HWTSTAMP_TX_OFF:
+ case HWTSTAMP_TX_ON:
+ case HWTSTAMP_TX_ONESTEP_SYNC:
+ tx_type_valid = 1;
+ break;
+ }
+
+ switch (rx_filter) {
+ case HWTSTAMP_FILTER_NONE:
+ case HWTSTAMP_FILTER_ALL:
+ case HWTSTAMP_FILTER_SOME:
+ case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+ case HWTSTAMP_FILTER_PTP_V2_EVENT:
+ case HWTSTAMP_FILTER_PTP_V2_SYNC:
+ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+ rx_filter_valid = 1;
+ break;
+ }
+
+ if (!tx_type_valid || !rx_filter_valid)
+ return -ERANGE;
+
+ return 0;
+}
+
+/*
+ * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+ int err;
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ const struct net_device_ops *ops;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->netdev_ops;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS: /* Set interface flags */
+ return dev_change_flags(dev, ifr->ifr_flags);
+
+ case SIOCSIFMETRIC: /* Set the metric on the interface
+ (currently unused) */
+ return -EOPNOTSUPP;
+
+ case SIOCSIFMTU: /* Set the MTU of a device */
+ return dev_set_mtu(dev, ifr->ifr_mtu);
+
+ case SIOCSIFHWADDR:
+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+ case SIOCSIFHWBROADCAST:
+ if (ifr->ifr_hwaddr.sa_family != dev->type)
+ return -EINVAL;
+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+ min(sizeof(ifr->ifr_hwaddr.sa_data),
+ (size_t)dev->addr_len));
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return 0;
+
+ case SIOCSIFMAP:
+ if (ops->ndo_set_config) {
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_set_config(dev, &ifr->ifr_map);
+ }
+ return -EOPNOTSUPP;
+
+ case SIOCADDMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCDELMULTI:
+ if (!ops->ndo_set_rx_mode ||
+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+ return -EINVAL;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+
+ case SIOCSIFTXQLEN:
+ if (ifr->ifr_qlen < 0)
+ return -EINVAL;
+ dev->tx_queue_len = ifr->ifr_qlen;
+ return 0;
+
+ case SIOCSIFNAME:
+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+ return dev_change_name(dev, ifr->ifr_newname);
+
+ case SIOCSHWTSTAMP:
+ err = net_hwtstamp_validate(ifr);
+ if (err)
+ return err;
+ /* fall through */
+
+ /*
+ * Unknown or private ioctl
+ */
+ default:
+ if ((cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15) ||
+ cmd == SIOCBONDENSLAVE ||
+ cmd == SIOCBONDRELEASE ||
+ cmd == SIOCBONDSETHWADDR ||
+ cmd == SIOCBONDSLAVEINFOQUERY ||
+ cmd == SIOCBONDINFOQUERY ||
+ cmd == SIOCBONDCHANGEACTIVE ||
+ cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCBRADDIF ||
+ cmd == SIOCBRDELIF ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP ||
+ cmd == SIOCWANDEV) {
+ err = -EOPNOTSUPP;
+ if (ops->ndo_do_ioctl) {
+ if (netif_device_present(dev))
+ err = ops->ndo_do_ioctl(dev, ifr, cmd);
+ else
+ err = -ENODEV;
+ }
+ } else
+ err = -EINVAL;
+
+ }
+ return err;
+}
+
+/**
+ * dev_load - load a network module
+ * @net: the applicable net namespace
+ * @name: name of interface
+ *
+ * If a network interface is not present and the process has suitable
+ * privileges this function loads the module. If module loading is not
+ * available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+ struct net_device *dev;
+ int no_module;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ rcu_read_unlock();
+
+ no_module = !dev;
+ if (no_module && capable(CAP_NET_ADMIN))
+ no_module = request_module("netdev-%s", name);
+ if (no_module && capable(CAP_SYS_MODULE))
+ request_module("%s", name);
+}
+EXPORT_SYMBOL(dev_load);
+
+/*
+ * This function handles all "interface"-type I/O control requests. The actual
+ * 'doing' part of this is dev_ifsioc above.
+ */
+
+/**
+ * dev_ioctl - network device ioctl
+ * @net: the applicable net namespace
+ * @cmd: command to issue
+ * @arg: pointer to a struct ifreq in user space
+ *
+ * Issue ioctl functions to devices. This is normally called by the
+ * user space syscall interfaces but can sometimes be useful for
+ * other purposes. The return value is the return from the syscall if
+ * positive or a negative errno code on error.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ int ret;
+ char *colon;
+
+ /* One special case: SIOCGIFCONF takes ifconf argument
+ and requires shared lock, because it sleeps writing
+ to user space.
+ */
+
+ if (cmd == SIOCGIFCONF) {
+ rtnl_lock();
+ ret = dev_ifconf(net, (char __user *) arg);
+ rtnl_unlock();
+ return ret;
+ }
+ if (cmd == SIOCGIFNAME)
+ return dev_ifname(net, (struct ifreq __user *)arg);
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ /*
+ * See which interface the caller is talking about.
+ */
+
+ switch (cmd) {
+ /*
+ * These ioctl calls:
+ * - can be done by all.
+ * - atomic and do not require locking.
+ * - return a value
+ */
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFHWADDR:
+ case SIOCGIFSLAVE:
+ case SIOCGIFMAP:
+ case SIOCGIFINDEX:
+ case SIOCGIFTXQLEN:
+ dev_load(net, ifr.ifr_name);
+ rcu_read_lock();
+ ret = dev_ifsioc_locked(net, &ifr, cmd);
+ rcu_read_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ case SIOCETHTOOL:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ethtool(net, &ifr);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - return a value
+ */
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSIFNAME:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret) {
+ if (colon)
+ *colon = ':';
+ if (copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ }
+ return ret;
+
+ /*
+ * These ioctl calls:
+ * - require superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFMAP:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ /*
+ * These ioctl calls:
+ * - require local superuser power.
+ * - require strict serialization.
+ * - do not return a value
+ */
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCSIFHWBROADCAST:
+ case SIOCSMIIREG:
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ case SIOCSHWTSTAMP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ /* fall through */
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
+ case SIOCGIFMEM:
+ /* Get the per device memory space. We can add this but
+ * currently do not support it */
+ case SIOCSIFMEM:
+ /* Set the per device memory buffer space.
+ * Not applicable in our case */
+ case SIOCSIFLINK:
+ return -ENOTTY;
+
+ /*
+ * Unknown or private ioctl.
+ */
+ default:
+ if (cmd == SIOCWANDEV ||
+ cmd == SIOCGHWTSTAMP ||
+ (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ if (!ret && copy_to_user(arg, &ifr,
+ sizeof(struct ifreq)))
+ ret = -EFAULT;
+ return ret;
+ }
+ /* Take care of Wireless Extensions */
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+ return wext_handle_ioctl(net, &ifr, cmd, arg);
+ return -ENOTTY;
+ }
+}
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 000000000..252e155c8
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,437 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/genetlink.h>
+#include <net/netevent.h>
+
+#include <trace/events/skb.h>
+#include <trace/events/napi.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+static int trace_state = TRACE_OFF;
+static DEFINE_MUTEX(trace_state_mutex);
+
+struct per_cpu_dm_data {
+ spinlock_t lock;
+ struct sk_buff *skb;
+ struct work_struct dm_alert_work;
+ struct timer_list send_timer;
+};
+
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ unsigned long last_rx;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
+static struct genl_family net_drop_monitor_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+};
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
+
+static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
+ skb = genlmsg_new(al, GFP_KERNEL);
+
+ if (skb) {
+ genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ nla = nla_reserve(skb, NLA_UNSPEC,
+ sizeof(struct net_dm_alert_msg));
+ msg = nla_data(nla);
+ memset(msg, 0, al);
+ } else {
+ mod_timer(&data->send_timer, jiffies + HZ / 10);
+ }
+
+ spin_lock_irqsave(&data->lock, flags);
+ swap(data->skb, skb);
+ spin_unlock_irqrestore(&data->lock, flags);
+
+ return skb;
+}
+
+static struct genl_multicast_group dropmon_mcgrps[] = {
+ { .name = "events", },
+};
+
+static void send_dm_alert(struct work_struct *work)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ skb = reset_per_cpu_data(data);
+
+ if (skb)
+ genlmsg_multicast(&net_drop_monitor_family, skb, 0,
+ 0, GFP_KERNEL);
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period.
+ */
+static void sched_send_work(unsigned long _data)
+{
+ struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_drop_common(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nla;
+ int i;
+ struct sk_buff *dskb;
+ struct per_cpu_dm_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ data = this_cpu_ptr(&dm_cpu_data);
+ spin_lock(&data->lock);
+ dskb = data->skb;
+
+ if (!dskb)
+ goto out;
+
+ nlh = (struct nlmsghdr *)dskb->data;
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
+ msg->points[i].count++;
+ goto out;
+ }
+ }
+ if (msg->entries == dm_hit_limit)
+ goto out;
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
+ memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
+ msg->points[msg->entries].count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&data->send_timer);
+ }
+
+out:
+ spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Don't check napi structures with no associated device
+ */
+ if (!napi->dev)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ /*
+ * only add a note to our monitor buffer if:
+ * 1) this is the dev we received on
+ * 2) its after the last_rx delta
+ * 3) our rx_dropped count has gone up
+ */
+ if ((new_stat->dev == napi->dev) &&
+ (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ new_stat->last_rx = jiffies;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int set_all_monitor_traces(int state)
+{
+ int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ mutex_lock(&trace_state_mutex);
+
+ if (state == trace_state) {
+ rc = -EAGAIN;
+ goto out_unlock;
+ }
+
+ switch (state) {
+ case TRACE_ON:
+ if (!try_module_get(THIS_MODULE)) {
+ rc = -ENODEV;
+ break;
+ }
+
+ rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
+ break;
+
+ case TRACE_OFF:
+ rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ }
+ }
+
+ module_put(THIS_MODULE);
+
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (!rc)
+ trace_state = state;
+ else
+ rc = -EINPROGRESS;
+
+out_unlock:
+ mutex_unlock(&trace_state_mutex);
+
+ return rc;
+}
+
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return set_all_monitor_traces(TRACE_ON);
+ case NET_DM_CMD_STOP:
+ return set_all_monitor_traces(TRACE_OFF);
+ }
+
+ return -ENOTSUPP;
+}
+
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ new_stat->last_rx = jiffies;
+ mutex_lock(&trace_state_mutex);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ mutex_unlock(&trace_state_mutex);
+ break;
+ case NETDEV_UNREGISTER:
+ mutex_lock(&trace_state_mutex);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
+ break;
+ }
+ }
+ }
+ mutex_unlock(&trace_state_mutex);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static const struct genl_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .doit = net_dm_cmd_config,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .doit = net_dm_cmd_trace,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .doit = net_dm_cmd_trace,
+ },
+};
+
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
+static int __init init_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu, rc;
+
+ pr_info("Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
+ dropmon_ops, dropmon_mcgrps);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
+ return rc;
+ }
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
+
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ pr_crit("Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
+ rc = 0;
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ INIT_WORK(&data->dm_alert_work, send_dm_alert);
+ init_timer(&data->send_timer);
+ data->send_timer.data = (unsigned long)data;
+ data->send_timer.function = sched_send_work;
+ spin_lock_init(&data->lock);
+ reset_per_cpu_data(data);
+ }
+
+
+ goto out;
+
+out_unreg:
+ genl_unregister_family(&net_drop_monitor_family);
+out:
+ return rc;
+}
+
+static void exit_net_drop_monitor(void)
+{
+ struct per_cpu_dm_data *data;
+ int cpu;
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
+
+ /*
+ * Because of the module_get/put we do in the trace state change path
+ * we are guarnateed not to have any current users when we get here
+ * all we need to do is make sure that we don't have any running timers
+ * or pending schedule calls
+ */
+
+ for_each_possible_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ /*
+ * At this point, we should have exclusive access
+ * to this struct and can free the skb inside it
+ */
+ kfree_skb(data->skb);
+ }
+
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+}
+
+module_init(init_net_drop_monitor);
+module_exit(exit_net_drop_monitor);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
+MODULE_ALIAS_GENL_FAMILY("NET_DM");
diff --git a/net/core/dst.c b/net/core/dst.c
new file mode 100644
index 000000000..e956ce6d1
--- /dev/null
+++ b/net/core/dst.c
@@ -0,0 +1,397 @@
+/*
+ * net/core/dst.c Protocol independent destination cache.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/sched.h>
+#include <linux/prefetch.h>
+
+#include <net/dst.h>
+
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ * new entries from both BH and non-BH context.
+ * 2) In order to keep spinlock held for a small delay,
+ * we use a second list where are stored long lived
+ * entries, that are handled by the garbage collect thread
+ * fired by a workqueue.
+ * 3) This list is guarded by a mutex,
+ * so that the gc_task and dst_dev_event() can be synchronized.
+ */
+
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we dont force an alignment on SMP.
+ */
+static struct {
+ spinlock_t lock;
+ struct dst_entry *list;
+ unsigned long timer_inc;
+ unsigned long timer_expires;
+} dst_garbage = {
+ .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
+ .timer_inc = DST_GC_MAX,
+};
+static void dst_gc_task(struct work_struct *work);
+static void ___dst_free(struct dst_entry *dst);
+
+static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
+
+static DEFINE_MUTEX(dst_gc_mutex);
+/*
+ * long lived entries are maintained in this list, guarded by dst_gc_mutex
+ */
+static struct dst_entry *dst_busy_list;
+
+static void dst_gc_task(struct work_struct *work)
+{
+ int delayed = 0;
+ int work_performed = 0;
+ unsigned long expires = ~0L;
+ struct dst_entry *dst, *next, head;
+ struct dst_entry *last = &head;
+
+ mutex_lock(&dst_gc_mutex);
+ next = dst_busy_list;
+
+loop:
+ while ((dst = next) != NULL) {
+ next = dst->next;
+ prefetch(&next->next);
+ cond_resched();
+ if (likely(atomic_read(&dst->__refcnt))) {
+ last->next = dst;
+ last = dst;
+ delayed++;
+ continue;
+ }
+ work_performed++;
+
+ dst = dst_destroy(dst);
+ if (dst) {
+ /* NOHASH and still referenced. Unless it is already
+ * on gc list, invalidate it and add to gc list.
+ *
+ * Note: this is temporary. Actually, NOHASH dst's
+ * must be obsoleted when parent is obsoleted.
+ * But we do not have state "obsoleted, but
+ * referenced by parent", so it is right.
+ */
+ if (dst->obsolete > 0)
+ continue;
+
+ ___dst_free(dst);
+ dst->next = next;
+ next = dst;
+ }
+ }
+
+ spin_lock_bh(&dst_garbage.lock);
+ next = dst_garbage.list;
+ if (next) {
+ dst_garbage.list = NULL;
+ spin_unlock_bh(&dst_garbage.lock);
+ goto loop;
+ }
+ last->next = NULL;
+ dst_busy_list = head.next;
+ if (!dst_busy_list)
+ dst_garbage.timer_inc = DST_GC_MAX;
+ else {
+ /*
+ * if we freed less than 1/10 of delayed entries,
+ * we can sleep longer.
+ */
+ if (work_performed <= delayed/10) {
+ dst_garbage.timer_expires += dst_garbage.timer_inc;
+ if (dst_garbage.timer_expires > DST_GC_MAX)
+ dst_garbage.timer_expires = DST_GC_MAX;
+ dst_garbage.timer_inc += DST_GC_INC;
+ } else {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ }
+ expires = dst_garbage.timer_expires;
+ /*
+ * if the next desired timer is more than 4 seconds in the
+ * future then round the timer to whole seconds
+ */
+ if (expires > 4*HZ)
+ expires = round_jiffies_relative(expires);
+ schedule_delayed_work(&dst_gc_work, expires);
+ }
+
+ spin_unlock_bh(&dst_garbage.lock);
+ mutex_unlock(&dst_gc_mutex);
+}
+
+int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(dst_discard_sk);
+
+const u32 dst_default_metrics[RTAX_MAX + 1] = {
+ /* This initializer is needed to force linker to place this variable
+ * into const section. Otherwise it might end into bss section.
+ * We really want to avoid false sharing on this variable, and catch
+ * any writes on it.
+ */
+ [RTAX_MAX] = 0xdeadbeef,
+};
+
+
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+ int initial_ref, int initial_obsolete, unsigned short flags)
+{
+ struct dst_entry *dst;
+
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc(ops))
+ return NULL;
+ }
+ dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+ if (!dst)
+ return NULL;
+ dst->child = NULL;
+ dst->dev = dev;
+ if (dev)
+ dev_hold(dev);
+ dst->ops = ops;
+ dst_init_metrics(dst, dst_default_metrics, true);
+ dst->expires = 0UL;
+ dst->path = dst;
+ dst->from = NULL;
+#ifdef CONFIG_XFRM
+ dst->xfrm = NULL;
+#endif
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ dst->error = 0;
+ dst->obsolete = initial_obsolete;
+ dst->header_len = 0;
+ dst->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->tclassid = 0;
+#endif
+ atomic_set(&dst->__refcnt, initial_ref);
+ dst->__use = 0;
+ dst->lastuse = jiffies;
+ dst->flags = flags;
+ dst->pending_confirm = 0;
+ dst->next = NULL;
+ if (!(flags & DST_NOCOUNT))
+ dst_entries_add(ops, 1);
+ return dst;
+}
+EXPORT_SYMBOL(dst_alloc);
+
+static void ___dst_free(struct dst_entry *dst)
+{
+ /* The first case (dev==NULL) is required, when
+ protocol module is unloaded.
+ */
+ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ }
+ dst->obsolete = DST_OBSOLETE_DEAD;
+}
+
+void __dst_free(struct dst_entry *dst)
+{
+ spin_lock_bh(&dst_garbage.lock);
+ ___dst_free(dst);
+ dst->next = dst_garbage.list;
+ dst_garbage.list = dst;
+ if (dst_garbage.timer_inc > DST_GC_INC) {
+ dst_garbage.timer_inc = DST_GC_INC;
+ dst_garbage.timer_expires = DST_GC_MIN;
+ mod_delayed_work(system_wq, &dst_gc_work,
+ dst_garbage.timer_expires);
+ }
+ spin_unlock_bh(&dst_garbage.lock);
+}
+EXPORT_SYMBOL(__dst_free);
+
+struct dst_entry *dst_destroy(struct dst_entry * dst)
+{
+ struct dst_entry *child;
+
+ smp_rmb();
+
+again:
+ child = dst->child;
+
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+
+ if (dst->ops->destroy)
+ dst->ops->destroy(dst);
+ if (dst->dev)
+ dev_put(dst->dev);
+ kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+ dst = child;
+ if (dst) {
+ int nohash = dst->flags & DST_NOHASH;
+
+ if (atomic_dec_and_test(&dst->__refcnt)) {
+ /* We were real parent of this dst, so kill child. */
+ if (nohash)
+ goto again;
+ } else {
+ /* Child is still referenced, return it for freeing. */
+ if (nohash)
+ return dst;
+ /* Child is still in his hash table */
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(dst_destroy);
+
+static void dst_destroy_rcu(struct rcu_head *head)
+{
+ struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
+
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+}
+
+void dst_release(struct dst_entry *dst)
+{
+ if (dst) {
+ int newrefcnt;
+
+ newrefcnt = atomic_dec_return(&dst->__refcnt);
+ WARN_ON(newrefcnt < 0);
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ call_rcu(&dst->rcu_head, dst_destroy_rcu);
+ }
+}
+EXPORT_SYMBOL(dst_release);
+
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+
+ if (p) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ kfree(p);
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/* Caller asserts that dst_metrics_read_only(dst) is false. */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+ unsigned long prev, new;
+
+ new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+ prev = cmpxchg(&dst->_metrics, old, new);
+ if (prev == old)
+ kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
+/* Dirty hack. We did it in 2.2 (in __dst_free),
+ * we have _very_ good reasons not to repeat
+ * this mistake in 2.3, but we have no choice
+ * now. _It_ _is_ _explicit_ _deliberate_
+ * _race_ _condition_.
+ *
+ * Commented and originally written by Alexey.
+ */
+static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int unregister)
+{
+ if (dst->ops->ifdown)
+ dst->ops->ifdown(dst, dev, unregister);
+
+ if (dev != dst->dev)
+ return;
+
+ if (!unregister) {
+ dst->input = dst_discard;
+ dst->output = dst_discard_sk;
+ } else {
+ dst->dev = dev_net(dst->dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+ }
+}
+
+static int dst_dev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct dst_entry *dst, *last = NULL;
+
+ switch (event) {
+ case NETDEV_UNREGISTER_FINAL:
+ case NETDEV_DOWN:
+ mutex_lock(&dst_gc_mutex);
+ for (dst = dst_busy_list; dst; dst = dst->next) {
+ last = dst;
+ dst_ifdown(dst, dev, event != NETDEV_DOWN);
+ }
+
+ spin_lock_bh(&dst_garbage.lock);
+ dst = dst_garbage.list;
+ dst_garbage.list = NULL;
+ spin_unlock_bh(&dst_garbage.lock);
+
+ if (last)
+ last->next = dst;
+ else
+ dst_busy_list = dst;
+ for (; dst; dst = dst->next)
+ dst_ifdown(dst, dev, event != NETDEV_DOWN);
+ mutex_unlock(&dst_gc_mutex);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dst_dev_notifier = {
+ .notifier_call = dst_dev_event,
+ .priority = -10, /* must be called after other network notifiers */
+};
+
+void __init dst_init(void)
+{
+ register_netdevice_notifier(&dst_dev_notifier);
+}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
new file mode 100644
index 000000000..1d00b8922
--- /dev/null
+++ b/net/core/ethtool.c
@@ -0,0 +1,1994 @@
+/*
+ * net/core/ethtool.c - Ethtool ioctl handler
+ * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
+ *
+ * This file is where we call all the ethtool_ops commands to get
+ * the information ethtool needs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <linux/net.h>
+
+/*
+ * Some useful ethtool_ops methods that're device independent.
+ * If we find that all drivers want to do the same thing here,
+ * we can turn these into dev_() function calls.
+ */
+
+u32 ethtool_op_get_link(struct net_device *dev)
+{
+ return netif_carrier_ok(dev) ? 1 : 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_link);
+
+int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
+{
+ info->so_timestamping =
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ return 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_ts_info);
+
+/* Handlers for each ethtool command */
+
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
+ [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+ [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload",
+};
+
+static const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+};
+
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gfeatures cmd = {
+ .cmd = ETHTOOL_GFEATURES,
+ .size = ETHTOOL_DEV_FEATURE_WORDS,
+ };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 __user *sizeaddr;
+ u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
+
+ sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
+ if (get_user(copy_size, sizeaddr))
+ return -EFAULT;
+
+ if (copy_size > ETHTOOL_DEV_FEATURE_WORDS)
+ copy_size = ETHTOOL_DEV_FEATURE_WORDS;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+ if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_sfeatures cmd;
+ struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+ useraddr += sizeof(cmd);
+
+ if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+ return -EINVAL;
+
+ if (copy_from_user(features, useraddr, sizeof(features)))
+ return -EFAULT;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
+
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
+
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
+ ret |= ETHTOOL_F_UNSUPPORTED;
+ }
+
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
+ __netdev_update_features(dev);
+
+ if ((dev->wanted_features ^ dev->features) & valid)
+ ret |= ETHTOOL_F_WISH;
+
+ return ret;
+}
+
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (sset == ETH_SS_FEATURES)
+ return ARRAY_SIZE(netdev_features_strings);
+
+ if (sset == ETH_SS_RSS_HASH_FUNCS)
+ return ARRAY_SIZE(rss_hash_func_strings);
+
+ if (ops->get_sset_count && ops->get_strings)
+ return ops->get_sset_count(dev, sset);
+ else
+ return -EOPNOTSUPP;
+}
+
+static void __ethtool_get_strings(struct net_device *dev,
+ u32 stringset, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (stringset == ETH_SS_FEATURES)
+ memcpy(data, netdev_features_strings,
+ sizeof(netdev_features_strings));
+ else if (stringset == ETH_SS_RSS_HASH_FUNCS)
+ memcpy(data, rss_hash_func_strings,
+ sizeof(rss_hash_func_strings));
+ else
+ /* ops->get_strings is valid because checked earlier */
+ ops->get_strings(dev, stringset, data);
+}
+
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
+{
+ /* feature masks of legacy discrete ethtool ops */
+
+ switch (eth_cmd) {
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_STXCSUM:
+ return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_SRXCSUM:
+ return NETIF_F_RXCSUM;
+ case ETHTOOL_GSG:
+ case ETHTOOL_SSG:
+ return NETIF_F_SG;
+ case ETHTOOL_GTSO:
+ case ETHTOOL_STSO:
+ return NETIF_F_ALL_TSO;
+ case ETHTOOL_GUFO:
+ case ETHTOOL_SUFO:
+ return NETIF_F_UFO;
+ case ETHTOOL_GGSO:
+ case ETHTOOL_SGSO:
+ return NETIF_F_GSO;
+ case ETHTOOL_GGRO:
+ case ETHTOOL_SGRO:
+ return NETIF_F_GRO;
+ default:
+ BUG();
+ }
+}
+
+static int ethtool_get_one_feature(struct net_device *dev,
+ char __user *useraddr, u32 ethcmd)
+{
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
+ struct ethtool_value edata = {
+ .cmd = ethcmd,
+ .data = !!(dev->features & mask),
+ };
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_one_feature(struct net_device *dev,
+ void __user *useraddr, u32 ethcmd)
+{
+ struct ethtool_value edata;
+ netdev_features_t mask;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ mask = ethtool_get_feature_mask(ethcmd);
+ mask &= dev->hw_features;
+ if (!mask)
+ return -EOPNOTSUPP;
+
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \
+ NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO)
+ flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+ flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+ flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE)
+ flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH)
+ flags |= ETH_FLAG_RXHASH;
+
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
+ return -EINVAL;
+
+ if (data & ETH_FLAG_LRO)
+ features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_RX;
+ if (data & ETH_FLAG_TXVLAN)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ if (data & ETH_FLAG_NTUPLE)
+ features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH)
+ features |= NETIF_F_RXHASH;
+
+ /* allow changing only bits set in hw_features */
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
+ if (changed & ~dev->hw_features)
+ return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+ dev->wanted_features =
+ (dev->wanted_features & ~changed) | (features & changed);
+
+ __netdev_update_features(dev);
+
+ return 0;
+}
+
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ memset(cmd, 0, sizeof(struct ethtool_cmd));
+ cmd->cmd = ETHTOOL_GSET;
+ return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
+
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ int err;
+ struct ethtool_cmd cmd;
+
+ err = __ethtool_get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_cmd cmd;
+
+ if (!dev->ethtool_ops->set_settings)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_settings(dev, &cmd);
+}
+
+static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_drvinfo info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GDRVINFO;
+ if (ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &info);
+ } else if (dev->dev.parent && dev->dev.parent->driver) {
+ strlcpy(info.bus_info, dev_name(dev->dev.parent),
+ sizeof(info.bus_info));
+ strlcpy(info.driver, dev->dev.parent->driver->name,
+ sizeof(info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * this method of obtaining string set info is deprecated;
+ * Use ETHTOOL_GSSET_INFO instead.
+ */
+ if (ops->get_sset_count) {
+ int rc;
+
+ rc = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (rc >= 0)
+ info.testinfo_len = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (rc >= 0)
+ info.n_stats = rc;
+ rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+ if (rc >= 0)
+ info.n_priv_flags = rc;
+ }
+ if (ops->get_regs_len)
+ info.regdump_len = ops->get_regs_len(dev);
+ if (ops->get_eeprom_len)
+ info.eedump_len = ops->get_eeprom_len(dev);
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_sset_info info;
+ u64 sset_mask;
+ int i, idx = 0, n_bits = 0, ret, rc;
+ u32 *info_buf = NULL;
+
+ if (copy_from_user(&info, useraddr, sizeof(info)))
+ return -EFAULT;
+
+ /* store copy of mask, because we zero struct later on */
+ sset_mask = info.sset_mask;
+ if (!sset_mask)
+ return 0;
+
+ /* calculate size of return buffer */
+ n_bits = hweight64(sset_mask);
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GSSET_INFO;
+
+ info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /*
+ * fill return buffer based on input bitmask and successful
+ * get_sset_count return
+ */
+ for (i = 0; i < 64; i++) {
+ if (!(sset_mask & (1ULL << i)))
+ continue;
+
+ rc = __ethtool_get_sset_count(dev, i);
+ if (rc >= 0) {
+ info.sset_mask |= (1ULL << i);
+ info_buf[idx++] = rc;
+ }
+ }
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ goto out;
+
+ useraddr += offsetof(struct ethtool_sset_info, data);
+ if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+ goto out;
+
+ ret = 0;
+
+out:
+ kfree(info_buf);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!dev->ethtool_ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_SRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ /* struct ethtool_rxnfc was originally defined for
+ * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+ * members. User-space might still be using that
+ * definition. */
+ if (cmd == ETHTOOL_GRXFH)
+ info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info.data));
+
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+ rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &info, info_size))
+ goto err_out;
+
+ if (rule_buf) {
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ if (copy_to_user(useraddr, rule_buf,
+ info.rule_cnt * sizeof(u32)))
+ goto err_out;
+ }
+ ret = 0;
+
+err_out:
+ kfree(rule_buf);
+
+ return ret;
+}
+
+static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
+ struct ethtool_rxnfc *rx_rings,
+ u32 size)
+{
+ int i;
+
+ if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ return -EFAULT;
+
+ /* Validate ring indices */
+ for (i = 0; i < size; i++)
+ if (indir[i] >= rx_rings->data)
+ return -EINVAL;
+
+ return 0;
+}
+
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+
+void netdev_rss_key_fill(void *buffer, size_t len)
+{
+ BUG_ON(len > sizeof(netdev_rss_key));
+ net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key));
+ memcpy(buffer, netdev_rss_key, len);
+}
+EXPORT_SYMBOL(netdev_rss_key_fill);
+
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ u32 user_size, dev_size;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
+ ret = -EFAULT;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
+
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
+ !ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ dev_size = ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&user_size,
+ useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ sizeof(user_size)))
+ return -EFAULT;
+
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + ringidx_offset,
+ &rx_rings,
+ dev_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u32 user_indir_size, user_key_size;
+ u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh rxfh;
+ u32 total_size;
+ u32 indir_bytes;
+ u32 *indir = NULL;
+ u8 dev_hfunc = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+
+ if (!ops->get_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+ user_indir_size = rxfh.indir_size;
+ user_key_size = rxfh.key_size;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
+ return -EINVAL;
+
+ rxfh.indir_size = dev_indir_size;
+ rxfh.key_size = dev_key_size;
+ if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
+ return -EFAULT;
+
+ if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
+ (user_key_size && (user_key_size != dev_key_size)))
+ return -EINVAL;
+
+ indir_bytes = user_indir_size * sizeof(indir[0]);
+ total_size = indir_bytes + user_key_size;
+ rss_config = kzalloc(total_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ if (user_indir_size)
+ indir = (u32 *)rss_config;
+
+ if (user_key_size)
+ hkey = rss_config + indir_bytes;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
+ &dev_hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, rss_config[0]),
+ rss_config, total_size)) {
+ ret = -EFAULT;
+ }
+out:
+ kfree(rss_config);
+
+ return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings;
+ struct ethtool_rxfh rxfh;
+ u32 dev_indir_size = 0, dev_key_size = 0, i;
+ u32 *indir = NULL, indir_bytes = 0;
+ u8 *hkey = NULL;
+ u8 *rss_config;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+
+ if (!ops->get_rxnfc || !ops->set_rxfh)
+ return -EOPNOTSUPP;
+
+ if (ops->get_rxfh_indir_size)
+ dev_indir_size = ops->get_rxfh_indir_size(dev);
+ if (ops->get_rxfh_key_size)
+ dev_key_size = ops->get_rxfh_key_size(dev);
+
+ if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
+ return -EFAULT;
+
+ /* Check that reserved fields are 0 for now */
+ if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
+ rxfh.rsvd8[2] || rxfh.rsvd32)
+ return -EINVAL;
+
+ /* If either indir, hash key or function is valid, proceed further.
+ * Must request at least one change: indir size, hash key or function.
+ */
+ if ((rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.indir_size != dev_indir_size) ||
+ (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
+ return -EINVAL;
+
+ if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ indir_bytes = dev_indir_size * sizeof(indir[0]);
+
+ rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ if (!rss_config)
+ return -ENOMEM;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
+ goto out;
+
+ /* rxfh.indir_size == 0 means reset the indir table to default.
+ * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
+ */
+ if (rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
+ indir = (u32 *)rss_config;
+ ret = ethtool_copy_validate_indir(indir,
+ useraddr + rss_cfg_offset,
+ &rx_rings,
+ rxfh.indir_size);
+ if (ret)
+ goto out;
+ } else if (rxfh.indir_size == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ }
+
+ if (rxfh.key_size) {
+ hkey = rss_config + indir_bytes;
+ if (copy_from_user(hkey,
+ useraddr + rss_cfg_offset + indir_bytes,
+ rxfh.key_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+
+out:
+ kfree(rss_config);
+ return ret;
+}
+
+static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_regs regs;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *regbuf;
+ int reglen, ret;
+
+ if (!ops->get_regs || !ops->get_regs_len)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&regs, useraddr, sizeof(regs)))
+ return -EFAULT;
+
+ reglen = ops->get_regs_len(dev);
+ if (regs.len > reglen)
+ regs.len = reglen;
+
+ regbuf = vzalloc(reglen);
+ if (reglen && !regbuf)
+ return -ENOMEM;
+
+ ops->get_regs(dev, &regs, regbuf);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &regs, sizeof(regs)))
+ goto out;
+ useraddr += offsetof(struct ethtool_regs, data);
+ if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
+ goto out;
+ ret = 0;
+
+ out:
+ vfree(regbuf);
+ return ret;
+}
+
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value reset;
+ int ret;
+
+ if (!dev->ethtool_ops->reset)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&reset, useraddr, sizeof(reset)))
+ return -EFAULT;
+
+ ret = dev->ethtool_ops->reset(dev, &reset.data);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &reset, sizeof(reset)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_wol(dev, &wol);
+
+ if (copy_to_user(useraddr, &wol, sizeof(wol)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_wolinfo wol;
+
+ if (!dev->ethtool_ops->set_wol)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&wol, useraddr, sizeof(wol)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_wol(dev, &wol);
+}
+
+static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+ int rc;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+
+ memset(&edata, 0, sizeof(struct ethtool_eee));
+ edata.cmd = ETHTOOL_GEEE;
+ rc = dev->ethtool_ops->get_eee(dev, &edata);
+
+ if (rc)
+ return rc;
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_eee edata;
+
+ if (!dev->ethtool_ops->set_eee)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_eee(dev, &edata);
+}
+
+static int ethtool_nway_reset(struct net_device *dev)
+{
+ if (!dev->ethtool_ops->nway_reset)
+ return -EOPNOTSUPP;
+
+ return dev->ethtool_ops->nway_reset(dev);
+}
+
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+
+ if (!dev->ethtool_ops->get_link)
+ return -EOPNOTSUPP;
+
+ edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
+ int (*getter)(struct net_device *,
+ struct ethtool_eeprom *, u8 *),
+ u32 total_len)
+{
+ struct ethtool_eeprom eeprom;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > total_len)
+ return -EINVAL;
+
+ data = kmalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ ret = getter(dev, &eeprom, data);
+ if (ret)
+ break;
+ if (copy_to_user(userbuf, data, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+ eeprom.offset -= eeprom.len;
+ if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
+ ret = -EFAULT;
+
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom,
+ ops->get_eeprom_len(dev));
+}
+
+static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_eeprom eeprom;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void __user *userbuf = useraddr + sizeof(eeprom);
+ u32 bytes_remaining;
+ u8 *data;
+ int ret = 0;
+
+ if (!ops->set_eeprom || !ops->get_eeprom_len ||
+ !ops->get_eeprom_len(dev))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+ return -EFAULT;
+
+ /* Check for wrap and zero */
+ if (eeprom.offset + eeprom.len <= eeprom.offset)
+ return -EINVAL;
+
+ /* Check for exceeding total eeprom len */
+ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+ return -EINVAL;
+
+ data = kmalloc(PAGE_SIZE, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ bytes_remaining = eeprom.len;
+ while (bytes_remaining > 0) {
+ eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+ if (copy_from_user(data, userbuf, eeprom.len)) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = ops->set_eeprom(dev, &eeprom, data);
+ if (ret)
+ break;
+ userbuf += eeprom.len;
+ eeprom.offset += eeprom.len;
+ bytes_remaining -= eeprom.len;
+ }
+
+ kfree(data);
+ return ret;
+}
+
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ if (!dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_coalesce(dev, &coalesce);
+
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_coalesce coalesce;
+
+ if (!dev->ethtool_ops->set_coalesce)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_coalesce(dev, &coalesce);
+}
+
+static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
+
+ if (!dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_ringparam(dev, &ringparam);
+
+ if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_ringparam ringparam;
+
+ if (!dev->ethtool_ops->set_ringparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_ringparam(dev, &ringparam);
+}
+
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_channels(dev, &channels);
+
+ if (copy_to_user(useraddr, &channels, sizeof(channels)))
+ return -EFAULT;
+ return 0;
+}
+
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_channels channels;
+
+ if (!dev->ethtool_ops->set_channels)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&channels, useraddr, sizeof(channels)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_channels(dev, &channels);
+}
+
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
+
+ if (!dev->ethtool_ops->get_pauseparam)
+ return -EOPNOTSUPP;
+
+ dev->ethtool_ops->get_pauseparam(dev, &pauseparam);
+
+ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_pauseparam pauseparam;
+
+ if (!dev->ethtool_ops->set_pauseparam)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+}
+
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+{
+ struct ethtool_test test;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, test_len;
+
+ if (!ops->self_test || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+ if (test_len < 0)
+ return test_len;
+ WARN_ON(test_len == 0);
+
+ if (copy_from_user(&test, useraddr, sizeof(test)))
+ return -EFAULT;
+
+ test.len = test_len;
+ data = kmalloc(test_len * sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ops->self_test(dev, &test, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &test, sizeof(test)))
+ goto out;
+ useraddr += sizeof(test);
+ if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_gstrings gstrings;
+ u8 *data;
+ int ret;
+
+ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+ return -EFAULT;
+
+ ret = __ethtool_get_sset_count(dev, gstrings.string_set);
+ if (ret < 0)
+ return ret;
+
+ gstrings.len = ret;
+
+ data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ __ethtool_get_strings(dev, gstrings.string_set, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+ goto out;
+ useraddr += sizeof(gstrings);
+ if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_value id;
+ static bool busy;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int rc;
+
+ if (!ops->set_phys_id)
+ return -EOPNOTSUPP;
+
+ if (busy)
+ return -EBUSY;
+
+ if (copy_from_user(&id, useraddr, sizeof(id)))
+ return -EFAULT;
+
+ rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+ if (rc < 0)
+ return rc;
+
+ /* Drop the RTNL lock while waiting, but prevent reentry or
+ * removal of the device.
+ */
+ busy = true;
+ dev_hold(dev);
+ rtnl_unlock();
+
+ if (rc == 0) {
+ /* Driver will handle this itself */
+ schedule_timeout_interruptible(
+ id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+ } else {
+ /* Driver expects to be called at twice the frequency in rc */
+ int n = rc * 2, i, interval = HZ / n;
+
+ /* Count down seconds */
+ do {
+ /* Count down iterations per second */
+ i = n;
+ do {
+ rtnl_lock();
+ rc = ops->set_phys_id(dev,
+ (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && --i != 0);
+ } while (!signal_pending(current) &&
+ (id.data == 0 || --id.data != 0));
+ }
+
+ rtnl_lock();
+ dev_put(dev);
+ busy = false;
+
+ (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+ return rc;
+}
+
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!ops->get_ethtool_stats || !ops->get_sset_count)
+ return -EOPNOTSUPP;
+
+ n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+ if (n_stats < 0)
+ return n_stats;
+ WARN_ON(n_stats == 0);
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+ data = kmalloc(n_stats * sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ops->get_ethtool_stats(dev, &stats, data);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_perm_addr epaddr;
+
+ if (copy_from_user(&epaddr, useraddr, sizeof(epaddr)))
+ return -EFAULT;
+
+ if (epaddr.size < dev->addr_len)
+ return -ETOOSMALL;
+ epaddr.size = dev->addr_len;
+
+ if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+ return -EFAULT;
+ useraddr += sizeof(epaddr);
+ if (copy_to_user(useraddr, dev->perm_addr, epaddr.size))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+ u32 cmd, u32 (*actor)(struct net_device *))
+{
+ struct ethtool_value edata = { .cmd = cmd };
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ edata.data = actor(dev);
+
+ if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+ void (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ actor(dev, edata.data);
+ return 0;
+}
+
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+ int (*actor)(struct net_device *, u32))
+{
+ struct ethtool_value edata;
+
+ if (!actor)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ return -EFAULT;
+
+ return actor(dev, edata.data);
+}
+
+static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
+ char __user *useraddr)
+{
+ struct ethtool_flash efl;
+
+ if (copy_from_user(&efl, useraddr, sizeof(efl)))
+ return -EFAULT;
+
+ if (!dev->ethtool_ops->flash_device)
+ return -EOPNOTSUPP;
+
+ efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
+ return dev->ethtool_ops->flash_device(dev, &efl);
+}
+
+static int ethtool_set_dump(struct net_device *dev,
+ void __user *useraddr)
+{
+ struct ethtool_dump dump;
+
+ if (!dev->ethtool_ops->set_dump)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ return dev->ethtool_ops->set_dump(dev, &dump);
+}
+
+static int ethtool_get_dump_flag(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_dump dump;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ ret = ops->get_dump_flag(dev, &dump);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump)))
+ return -EFAULT;
+ return 0;
+}
+
+static int ethtool_get_dump_data(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ __u32 len;
+ struct ethtool_dump dump, tmp;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data = NULL;
+
+ if (!ops->get_dump_data || !ops->get_dump_flag)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&dump, useraddr, sizeof(dump)))
+ return -EFAULT;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+ ret = ops->get_dump_flag(dev, &tmp);
+ if (ret)
+ return ret;
+
+ len = min(tmp.len, dump.len);
+ if (!len)
+ return -EFAULT;
+
+ /* Don't ever let the driver think there's more space available
+ * than it requested with .get_dump_flag().
+ */
+ dump.len = len;
+
+ /* Always allocate enough space to hold the whole thing so that the
+ * driver does not need to check the length and bother with partial
+ * dumping.
+ */
+ data = vzalloc(tmp.len);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_dump_data(dev, &dump, data);
+ if (ret)
+ goto out;
+
+ /* There are two sane possibilities:
+ * 1. The driver's .get_dump_data() does not touch dump.len.
+ * 2. Or it may set dump.len to how much it really writes, which
+ * should be tmp.len (or len if it can do a partial dump).
+ * In any case respond to userspace with the actual length of data
+ * it's receiving.
+ */
+ WARN_ON(dump.len != len && dump.len != tmp.len);
+ dump.len = len;
+
+ if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ useraddr += offsetof(struct ethtool_dump, data);
+ if (copy_to_user(useraddr, data, len))
+ ret = -EFAULT;
+out:
+ vfree(data);
+ return ret;
+}
+
+static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_ts_info info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ memset(&info, 0, sizeof(info));
+ info.cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phydev && phydev->drv && phydev->drv->ts_info) {
+ err = phydev->drv->ts_info(phydev, &info);
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, &info);
+ } else {
+ info.so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info.phc_index = -1;
+ }
+
+ if (err)
+ return err;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int __ethtool_get_module_info(struct net_device *dev,
+ struct ethtool_modinfo *modinfo)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (phydev && phydev->drv && phydev->drv->module_info)
+ return phydev->drv->module_info(phydev, modinfo);
+
+ if (ops->get_module_info)
+ return ops->get_module_info(dev, modinfo);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_info(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
+ return -EFAULT;
+
+ ret = __ethtool_get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(useraddr, &modinfo, sizeof(modinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __ethtool_get_module_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ if (phydev && phydev->drv && phydev->drv->module_eeprom)
+ return phydev->drv->module_eeprom(phydev, ee, data);
+
+ if (ops->get_module_eeprom)
+ return ops->get_module_eeprom(dev, ee, data);
+
+ return -EOPNOTSUPP;
+}
+
+static int ethtool_get_module_eeprom(struct net_device *dev,
+ void __user *useraddr)
+{
+ int ret;
+ struct ethtool_modinfo modinfo;
+
+ ret = __ethtool_get_module_info(dev, &modinfo);
+ if (ret)
+ return ret;
+
+ return ethtool_get_any_eeprom(dev, useraddr,
+ __ethtool_get_module_eeprom,
+ modinfo.eeprom_len);
+}
+
+static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
+ if (tuna->len != sizeof(u32) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U32)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->get_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ ret = ops->get_tunable(dev, &tuna, data);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *data;
+
+ if (!ops->set_tunable)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ ret = ops->set_tunable(dev, &tuna, data);
+
+out:
+ kfree(data);
+ return ret;
+}
+
+/* The main entry point in this file. Called from net/core/dev_ioctl.c */
+
+int dev_ethtool(struct net *net, struct ifreq *ifr)
+{
+ struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+ void __user *useraddr = ifr->ifr_data;
+ u32 ethcmd;
+ int rc;
+ netdev_features_t old_features;
+
+ if (!dev || !netif_device_present(dev))
+ return -ENODEV;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ /* Allow some commands to be done by anyone */
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ case ETHTOOL_GDRVINFO:
+ case ETHTOOL_GMSGLVL:
+ case ETHTOOL_GLINK:
+ case ETHTOOL_GCOALESCE:
+ case ETHTOOL_GRINGPARAM:
+ case ETHTOOL_GPAUSEPARAM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GSSET_INFO:
+ case ETHTOOL_GSTRINGS:
+ case ETHTOOL_GSTATS:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ case ETHTOOL_GFLAGS:
+ case ETHTOOL_GPFLAGS:
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ case ETHTOOL_GRXFHINDIR:
+ case ETHTOOL_GRSSH:
+ case ETHTOOL_GFEATURES:
+ case ETHTOOL_GCHANNELS:
+ case ETHTOOL_GET_TS_INFO:
+ case ETHTOOL_GEEE:
+ case ETHTOOL_GTUNABLE:
+ break;
+ default:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ }
+
+ if (dev->ethtool_ops->begin) {
+ rc = dev->ethtool_ops->begin(dev);
+ if (rc < 0)
+ return rc;
+ }
+ old_features = dev->features;
+
+ switch (ethcmd) {
+ case ETHTOOL_GSET:
+ rc = ethtool_get_settings(dev, useraddr);
+ break;
+ case ETHTOOL_SSET:
+ rc = ethtool_set_settings(dev, useraddr);
+ break;
+ case ETHTOOL_GDRVINFO:
+ rc = ethtool_get_drvinfo(dev, useraddr);
+ break;
+ case ETHTOOL_GREGS:
+ rc = ethtool_get_regs(dev, useraddr);
+ break;
+ case ETHTOOL_GWOL:
+ rc = ethtool_get_wol(dev, useraddr);
+ break;
+ case ETHTOOL_SWOL:
+ rc = ethtool_set_wol(dev, useraddr);
+ break;
+ case ETHTOOL_GMSGLVL:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_msglevel);
+ break;
+ case ETHTOOL_SMSGLVL:
+ rc = ethtool_set_value_void(dev, useraddr,
+ dev->ethtool_ops->set_msglevel);
+ break;
+ case ETHTOOL_GEEE:
+ rc = ethtool_get_eee(dev, useraddr);
+ break;
+ case ETHTOOL_SEEE:
+ rc = ethtool_set_eee(dev, useraddr);
+ break;
+ case ETHTOOL_NWAY_RST:
+ rc = ethtool_nway_reset(dev);
+ break;
+ case ETHTOOL_GLINK:
+ rc = ethtool_get_link(dev, useraddr);
+ break;
+ case ETHTOOL_GEEPROM:
+ rc = ethtool_get_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_SEEPROM:
+ rc = ethtool_set_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GCOALESCE:
+ rc = ethtool_get_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_SCOALESCE:
+ rc = ethtool_set_coalesce(dev, useraddr);
+ break;
+ case ETHTOOL_GRINGPARAM:
+ rc = ethtool_get_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_SRINGPARAM:
+ rc = ethtool_set_ringparam(dev, useraddr);
+ break;
+ case ETHTOOL_GPAUSEPARAM:
+ rc = ethtool_get_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_SPAUSEPARAM:
+ rc = ethtool_set_pauseparam(dev, useraddr);
+ break;
+ case ETHTOOL_TEST:
+ rc = ethtool_self_test(dev, useraddr);
+ break;
+ case ETHTOOL_GSTRINGS:
+ rc = ethtool_get_strings(dev, useraddr);
+ break;
+ case ETHTOOL_PHYS_ID:
+ rc = ethtool_phys_id(dev, useraddr);
+ break;
+ case ETHTOOL_GSTATS:
+ rc = ethtool_get_stats(dev, useraddr);
+ break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
+ case ETHTOOL_GFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ __ethtool_get_flags);
+ break;
+ case ETHTOOL_SFLAGS:
+ rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+ break;
+ case ETHTOOL_GPFLAGS:
+ rc = ethtool_get_value(dev, useraddr, ethcmd,
+ dev->ethtool_ops->get_priv_flags);
+ break;
+ case ETHTOOL_SPFLAGS:
+ rc = ethtool_set_value(dev, useraddr,
+ dev->ethtool_ops->set_priv_flags);
+ break;
+ case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_FLASHDEV:
+ rc = ethtool_flash_device(dev, useraddr);
+ break;
+ case ETHTOOL_RESET:
+ rc = ethtool_reset(dev, useraddr);
+ break;
+ case ETHTOOL_GSSET_INFO:
+ rc = ethtool_get_sset_info(dev, useraddr);
+ break;
+ case ETHTOOL_GRXFHINDIR:
+ rc = ethtool_get_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_SRXFHINDIR:
+ rc = ethtool_set_rxfh_indir(dev, useraddr);
+ break;
+ case ETHTOOL_GRSSH:
+ rc = ethtool_get_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_SRSSH:
+ rc = ethtool_set_rxfh(dev, useraddr);
+ break;
+ case ETHTOOL_GFEATURES:
+ rc = ethtool_get_features(dev, useraddr);
+ break;
+ case ETHTOOL_SFEATURES:
+ rc = ethtool_set_features(dev, useraddr);
+ break;
+ case ETHTOOL_GTXCSUM:
+ case ETHTOOL_GRXCSUM:
+ case ETHTOOL_GSG:
+ case ETHTOOL_GTSO:
+ case ETHTOOL_GUFO:
+ case ETHTOOL_GGSO:
+ case ETHTOOL_GGRO:
+ rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_STXCSUM:
+ case ETHTOOL_SRXCSUM:
+ case ETHTOOL_SSG:
+ case ETHTOOL_STSO:
+ case ETHTOOL_SUFO:
+ case ETHTOOL_SGSO:
+ case ETHTOOL_SGRO:
+ rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+ break;
+ case ETHTOOL_GCHANNELS:
+ rc = ethtool_get_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SCHANNELS:
+ rc = ethtool_set_channels(dev, useraddr);
+ break;
+ case ETHTOOL_SET_DUMP:
+ rc = ethtool_set_dump(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_FLAG:
+ rc = ethtool_get_dump_flag(dev, useraddr);
+ break;
+ case ETHTOOL_GET_DUMP_DATA:
+ rc = ethtool_get_dump_data(dev, useraddr);
+ break;
+ case ETHTOOL_GET_TS_INFO:
+ rc = ethtool_get_ts_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEINFO:
+ rc = ethtool_get_module_info(dev, useraddr);
+ break;
+ case ETHTOOL_GMODULEEEPROM:
+ rc = ethtool_get_module_eeprom(dev, useraddr);
+ break;
+ case ETHTOOL_GTUNABLE:
+ rc = ethtool_get_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_STUNABLE:
+ rc = ethtool_set_tunable(dev, useraddr);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ if (dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+
+ if (old_features != dev->features)
+ netdev_features_change(dev);
+
+ return rc;
+}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 000000000..9a12668f7
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,799 @@
+/*
+ * net/core/fib_rules.c Generic Routing Rules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/fib_rules.h>
+
+int fib_default_rule_add(struct fib_rules_ops *ops,
+ u32 pref, u32 table, u32 flags)
+{
+ struct fib_rule *r;
+
+ r = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+
+ atomic_set(&r->refcnt, 1);
+ r->action = FR_ACT_TO_TBL;
+ r->pref = pref;
+ r->table = table;
+ r->flags = flags;
+ r->fr_net = ops->fro_net;
+
+ r->suppress_prefixlen = -1;
+ r->suppress_ifgroup = -1;
+
+ /* The lock is not required here, the list in unreacheable
+ * at the moment this function is called */
+ list_add_tail(&r->list, &ops->rules_list);
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
+u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+ struct list_head *pos;
+ struct fib_rule *rule;
+
+ if (!list_empty(&ops->rules_list)) {
+ pos = ops->rules_list.next;
+ if (pos->next != &ops->rules_list) {
+ rule = list_entry(pos->next, struct fib_rule, list);
+ if (rule->pref)
+ return rule->pref - 1;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_pref);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid);
+
+static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+{
+ struct fib_rules_ops *ops;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (ops->family == family) {
+ if (!try_module_get(ops->owner))
+ ops = NULL;
+ rcu_read_unlock();
+ return ops;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+ if (ops)
+ module_put(ops->owner);
+}
+
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+ if (ops->flush_cache)
+ ops->flush_cache(ops);
+}
+
+static int __fib_rules_register(struct fib_rules_ops *ops)
+{
+ int err = -EEXIST;
+ struct fib_rules_ops *o;
+ struct net *net;
+
+ net = ops->fro_net;
+
+ if (ops->rule_size < sizeof(struct fib_rule))
+ return -EINVAL;
+
+ if (ops->match == NULL || ops->configure == NULL ||
+ ops->compare == NULL || ops->fill == NULL ||
+ ops->action == NULL)
+ return -EINVAL;
+
+ spin_lock(&net->rules_mod_lock);
+ list_for_each_entry(o, &net->rules_ops, list)
+ if (ops->family == o->family)
+ goto errout;
+
+ list_add_tail_rcu(&ops->list, &net->rules_ops);
+ err = 0;
+errout:
+ spin_unlock(&net->rules_mod_lock);
+
+ return err;
+}
+
+struct fib_rules_ops *
+fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err;
+
+ ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ops->rules_list);
+ ops->fro_net = net;
+
+ err = __fib_rules_register(ops);
+ if (err) {
+ kfree(ops);
+ ops = ERR_PTR(err);
+ }
+
+ return ops;
+}
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+{
+ struct fib_rule *rule, *tmp;
+
+ list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
+ list_del_rcu(&rule->list);
+ if (ops->delete)
+ ops->delete(rule);
+ fib_rule_put(rule);
+ }
+}
+
+void fib_rules_unregister(struct fib_rules_ops *ops)
+{
+ struct net *net = ops->fro_net;
+
+ spin_lock(&net->rules_mod_lock);
+ list_del_rcu(&ops->list);
+ spin_unlock(&net->rules_mod_lock);
+
+ fib_rules_cleanup_ops(ops);
+ kfree_rcu(ops, rcu);
+}
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
+ struct flowi *fl, int flags)
+{
+ int ret = 0;
+
+ if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ goto out;
+
+ if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ goto out;
+
+ if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
+ goto out;
+
+ ret = ops->match(rule, fl, flags);
+out:
+ return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
+}
+
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct fib_rule *rule;
+ int err;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+jumped:
+ if (!fib_rule_match(rule, ops, fl, flags))
+ continue;
+
+ if (rule->action == FR_ACT_GOTO) {
+ struct fib_rule *target;
+
+ target = rcu_dereference(rule->ctarget);
+ if (target == NULL) {
+ continue;
+ } else {
+ rule = target;
+ goto jumped;
+ }
+ } else if (rule->action == FR_ACT_NOP)
+ continue;
+ else
+ err = ops->action(rule, fl, flags, arg);
+
+ if (!err && ops->suppress && ops->suppress(rule, arg))
+ continue;
+
+ if (err != -EAGAIN) {
+ if ((arg->flags & FIB_LOOKUP_NOREF) ||
+ likely(atomic_inc_not_zero(&rule->refcnt))) {
+ arg->rule = rule;
+ goto out;
+ }
+ break;
+ }
+ }
+
+ err = -ESRCH;
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
+ struct fib_rules_ops *ops)
+{
+ int err = -EINVAL;
+
+ if (frh->src_len)
+ if (tb[FRA_SRC] == NULL ||
+ frh->src_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_SRC]) != ops->addr_size)
+ goto errout;
+
+ if (frh->dst_len)
+ if (tb[FRA_DST] == NULL ||
+ frh->dst_len > (ops->addr_size * 8) ||
+ nla_len(tb[FRA_DST]) != ops->addr_size)
+ goto errout;
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *r, *last = NULL;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL, unresolved = 0;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ rule = kzalloc(ops->rule_size, GFP_KERNEL);
+ if (rule == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ rule->fr_net = net;
+
+ if (tb[FRA_PRIORITY])
+ rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ rule->iifindex = -1;
+ nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, rule->iifname);
+ if (dev)
+ rule->iifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ rule->oifindex = -1;
+ nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ dev = __dev_get_by_name(net, rule->oifname);
+ if (dev)
+ rule->oifindex = dev->ifindex;
+ }
+
+ if (tb[FRA_FWMARK]) {
+ rule->mark = nla_get_u32(tb[FRA_FWMARK]);
+ if (rule->mark)
+ /* compatibility: if the mark value is non-zero all bits
+ * are compared unless a mask is explicitly specified.
+ */
+ rule->mark_mask = 0xFFFFFFFF;
+ }
+
+ if (tb[FRA_FWMASK])
+ rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+ rule->action = frh->action;
+ rule->flags = frh->flags;
+ rule->table = frh_get_table(frh, tb);
+ if (tb[FRA_SUPPRESS_PREFIXLEN])
+ rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+ else
+ rule->suppress_prefixlen = -1;
+
+ if (tb[FRA_SUPPRESS_IFGROUP])
+ rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+ else
+ rule->suppress_ifgroup = -1;
+
+ if (!tb[FRA_PRIORITY] && ops->default_pref)
+ rule->pref = ops->default_pref(ops);
+
+ err = -EINVAL;
+ if (tb[FRA_GOTO]) {
+ if (rule->action != FR_ACT_GOTO)
+ goto errout_free;
+
+ rule->target = nla_get_u32(tb[FRA_GOTO]);
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (rule->target <= rule->pref)
+ goto errout_free;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref == rule->target) {
+ RCU_INIT_POINTER(rule->ctarget, r);
+ break;
+ }
+ }
+
+ if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+ unresolved = 1;
+ } else if (rule->action == FR_ACT_GOTO)
+ goto errout_free;
+
+ err = ops->configure(rule, skb, frh, tb);
+ if (err < 0)
+ goto errout_free;
+
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->pref > rule->pref)
+ break;
+ last = r;
+ }
+
+ fib_rule_get(rule);
+
+ if (last)
+ list_add_rcu(&rule->list, &last->list);
+ else
+ list_add_rcu(&rule->list, &ops->rules_list);
+
+ if (ops->unresolved_rules) {
+ /*
+ * There are unresolved goto rules in the list, check if
+ * any of them are pointing to this new rule.
+ */
+ list_for_each_entry(r, &ops->rules_list, list) {
+ if (r->action == FR_ACT_GOTO &&
+ r->target == rule->pref &&
+ rtnl_dereference(r->ctarget) == NULL) {
+ rcu_assign_pointer(r->ctarget, rule);
+ if (--ops->unresolved_rules == 0)
+ break;
+ }
+ }
+ }
+
+ if (rule->action == FR_ACT_GOTO)
+ ops->nr_goto_rules++;
+
+ if (unresolved)
+ ops->unresolved_rules++;
+
+ notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+
+errout_free:
+ kfree(rule);
+errout:
+ rules_ops_put(ops);
+ return err;
+}
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rules_ops *ops = NULL;
+ struct fib_rule *rule, *tmp;
+ struct nlattr *tb[FRA_MAX+1];
+ int err = -EINVAL;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+ goto errout;
+
+ ops = lookup_rules_ops(net, frh->family);
+ if (ops == NULL) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+ if (err < 0)
+ goto errout;
+
+ err = validate_rulemsg(frh, tb, ops);
+ if (err < 0)
+ goto errout;
+
+ list_for_each_entry(rule, &ops->rules_list, list) {
+ if (frh->action && (frh->action != rule->action))
+ continue;
+
+ if (frh_get_table(frh, tb) &&
+ (frh_get_table(frh, tb) != rule->table))
+ continue;
+
+ if (tb[FRA_PRIORITY] &&
+ (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
+ continue;
+
+ if (tb[FRA_IIFNAME] &&
+ nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
+ continue;
+
+ if (tb[FRA_OIFNAME] &&
+ nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
+ continue;
+
+ if (tb[FRA_FWMARK] &&
+ (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
+ continue;
+
+ if (tb[FRA_FWMASK] &&
+ (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
+ continue;
+
+ if (!ops->compare(rule, frh, tb))
+ continue;
+
+ if (rule->flags & FIB_RULE_PERMANENT) {
+ err = -EPERM;
+ goto errout;
+ }
+
+ if (ops->delete) {
+ err = ops->delete(rule);
+ if (err)
+ goto errout;
+ }
+
+ list_del_rcu(&rule->list);
+
+ if (rule->action == FR_ACT_GOTO) {
+ ops->nr_goto_rules--;
+ if (rtnl_dereference(rule->ctarget) == NULL)
+ ops->unresolved_rules--;
+ }
+
+ /*
+ * Check if this rule is a target to any of them. If so,
+ * disable them. As this operation is eventually very
+ * expensive, it is only performed if goto rules have
+ * actually been added.
+ */
+ if (ops->nr_goto_rules > 0) {
+ list_for_each_entry(tmp, &ops->rules_list, list) {
+ if (rtnl_dereference(tmp->ctarget) == rule) {
+ RCU_INIT_POINTER(tmp->ctarget, NULL);
+ ops->unresolved_rules++;
+ }
+ }
+ }
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+ NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
+ flush_route_cache(ops);
+ rules_ops_put(ops);
+ return 0;
+ }
+
+ err = -ENOENT;
+errout:
+ rules_ops_put(ops);
+ return err;
+}
+
+static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ struct fib_rule *rule)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
+ + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ + nla_total_size(4) /* FRA_PRIORITY */
+ + nla_total_size(4) /* FRA_TABLE */
+ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ + nla_total_size(4) /* FRA_FWMARK */
+ + nla_total_size(4); /* FRA_FWMASK */
+
+ if (ops->nlmsg_payload)
+ payload += ops->nlmsg_payload(rule);
+
+ return payload;
+}
+
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+ u32 pid, u32 seq, int type, int flags,
+ struct fib_rules_ops *ops)
+{
+ struct nlmsghdr *nlh;
+ struct fib_rule_hdr *frh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ frh = nlmsg_data(nlh);
+ frh->family = ops->family;
+ frh->table = rule->table;
+ if (nla_put_u32(skb, FRA_TABLE, rule->table))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
+ goto nla_put_failure;
+ frh->res1 = 0;
+ frh->res2 = 0;
+ frh->action = rule->action;
+ frh->flags = rule->flags;
+
+ if (rule->action == FR_ACT_GOTO &&
+ rcu_access_pointer(rule->ctarget) == NULL)
+ frh->flags |= FIB_RULE_UNRESOLVED;
+
+ if (rule->iifname[0]) {
+ if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
+ goto nla_put_failure;
+ if (rule->iifindex == -1)
+ frh->flags |= FIB_RULE_IIF_DETACHED;
+ }
+
+ if (rule->oifname[0]) {
+ if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
+ goto nla_put_failure;
+ if (rule->oifindex == -1)
+ frh->flags |= FIB_RULE_OIF_DETACHED;
+ }
+
+ if ((rule->pref &&
+ nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
+ (rule->mark &&
+ nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
+ ((rule->mark_mask || rule->mark) &&
+ nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
+ (rule->target &&
+ nla_put_u32(skb, FRA_GOTO, rule->target)))
+ goto nla_put_failure;
+
+ if (rule->suppress_ifgroup != -1) {
+ if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+ goto nla_put_failure;
+ }
+
+ if (ops->fill(rule, skb, frh) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_rules_ops *ops)
+{
+ int idx = 0;
+ struct fib_rule *rule;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ if (idx < cb->args[1])
+ goto skip;
+
+ if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWRULE,
+ NLM_F_MULTI, ops) < 0)
+ break;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ rules_ops_put(ops);
+
+ return skb->len;
+}
+
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_rules_ops *ops;
+ int idx = 0, family;
+
+ family = rtnl_msg_family(cb->nlh);
+ if (family != AF_UNSPEC) {
+ /* Protocol specific dump request */
+ ops = lookup_rules_ops(net, family);
+ if (ops == NULL)
+ return -EAFNOSUPPORT;
+
+ return dump_rules(skb, cb, ops);
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+ if (idx < cb->args[0] || !try_module_get(ops->owner))
+ goto skip;
+
+ if (dump_rules(skb, cb, ops) < 0)
+ break;
+
+ cb->args[1] = 0;
+skip:
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+ struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+ u32 pid)
+{
+ struct net *net;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ net = ops->fro_net;
+ skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, ops->nlgroup, err);
+}
+
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == -1 &&
+ strcmp(dev->name, rule->iifname) == 0)
+ rule->iifindex = dev->ifindex;
+ if (rule->oifindex == -1 &&
+ strcmp(dev->name, rule->oifname) == 0)
+ rule->oifindex = dev->ifindex;
+ }
+}
+
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+ struct fib_rule *rule;
+
+ list_for_each_entry(rule, rules, list) {
+ if (rule->iifindex == dev->ifindex)
+ rule->iifindex = -1;
+ if (rule->oifindex == dev->ifindex)
+ rule->oifindex = -1;
+ }
+}
+
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct fib_rules_ops *ops;
+
+ ASSERT_RTNL();
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ attach_rules(&ops->rules_list, dev);
+ break;
+
+ case NETDEV_CHANGENAME:
+ list_for_each_entry(ops, &net->rules_ops, list) {
+ detach_rules(&ops->rules_list, dev);
+ attach_rules(&ops->rules_list, dev);
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ list_for_each_entry(ops, &net->rules_ops, list)
+ detach_rules(&ops->rules_list, dev);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_rules_notifier = {
+ .notifier_call = fib_rules_event,
+};
+
+static int __net_init fib_rules_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->rules_ops);
+ spin_lock_init(&net->rules_mod_lock);
+ return 0;
+}
+
+static struct pernet_operations fib_rules_net_ops = {
+ .init = fib_rules_net_init,
+};
+
+static int __init fib_rules_init(void)
+{
+ int err;
+ rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
+
+ err = register_pernet_subsys(&fib_rules_net_ops);
+ if (err < 0)
+ goto fail;
+
+ err = register_netdevice_notifier(&fib_rules_notifier);
+ if (err < 0)
+ goto fail_unregister;
+
+ return 0;
+
+fail_unregister:
+ unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+ rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
+ rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ return err;
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
new file mode 100644
index 000000000..bf831a85c
--- /dev/null
+++ b/net/core/filter.c
@@ -0,0 +1,1553 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <linux/gfp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/filter.h>
+#include <linux/ratelimit.h>
+#include <linux/seccomp.h>
+#include <linux/if_vlan.h>
+#include <linux/bpf.h>
+
+/**
+ * sk_filter - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ *
+ * Run the filter code and then cut skb->data to correct size returned by
+ * SK_RUN_FILTER. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ struct sk_filter *filter;
+
+ /*
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
+ */
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ return -ENOMEM;
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err)
+ return err;
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+
+ err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+}
+
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+ struct nlattr *nla;
+
+ if (skb_is_nonlinear(skb))
+ return 0;
+
+ if (skb->len < sizeof(struct nlattr))
+ return 0;
+
+ if (a > skb->len - sizeof(struct nlattr))
+ return 0;
+
+ nla = (struct nlattr *) &skb->data[a];
+ if (nla->nla_len > skb->len - a)
+ return 0;
+
+ nla = nla_find_nested(nla, x);
+ if (nla)
+ return (void *) nla - (void *) skb->data;
+
+ return 0;
+}
+
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+/* note that this only generates 32-bit random numbers */
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (skb_field) {
+ case SKF_AD_MARK:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, mark));
+ break;
+
+ case SKF_AD_PKTTYPE:
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
+#endif
+ break;
+
+ case SKF_AD_QUEUE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, queue_mapping));
+ break;
+
+ case SKF_AD_VLAN_TAG:
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_tci));
+ if (skb_field == SKF_AD_VLAN_TAG) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ /* dst_reg >>= 12 */
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
+ /* dst_reg &= 1 */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
+ }
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static bool convert_bpf_extensions(struct sock_filter *fp,
+ struct bpf_insn **insnp)
+{
+ struct bpf_insn *insn = *insnp;
+ u32 cnt;
+
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PROTOCOL:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(protocol)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, protocol));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PKTTYPE:
+ cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_IFINDEX:
+ case SKF_AD_OFF + SKF_AD_HATYPE:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
+
+ *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ BPF_REG_TMP, BPF_REG_CTX,
+ offsetof(struct sk_buff, dev));
+ /* if (tmp != 0) goto pc + 1 */
+ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
+ *insn++ = BPF_EXIT_INSN();
+ if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, ifindex));
+ else
+ *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
+ offsetof(struct net_device, type));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_MARK:
+ cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_RXHASH:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, hash));
+ break;
+
+ case SKF_AD_OFF + SKF_AD_QUEUE:
+ cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
+ cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ BPF_REG_A, BPF_REG_CTX, insn);
+ insn += cnt - 1;
+ break;
+
+ case SKF_AD_OFF + SKF_AD_VLAN_TPID:
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+
+ /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
+ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
+ offsetof(struct sk_buff, vlan_proto));
+ /* A = ntohs(A) [emitting a nop or swap16] */
+ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
+ break;
+
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ case SKF_AD_OFF + SKF_AD_CPU:
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ /* arg1 = CTX */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+ /* arg2 = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
+ /* arg3 = X */
+ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
+ /* Emit call(arg1=CTX, arg2=A, arg3=X) */
+ switch (fp->k) {
+ case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
+ *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr);
+ break;
+ case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
+ *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+ break;
+ case SKF_AD_OFF + SKF_AD_CPU:
+ *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+ break;
+ case SKF_AD_OFF + SKF_AD_RANDOM:
+ *insn = BPF_EMIT_CALL(__get_random_u32);
+ break;
+ }
+ break;
+
+ case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
+ /* A ^= X */
+ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
+ break;
+
+ default:
+ /* This is just a dummy call to avoid letting the compiler
+ * evict __bpf_call_base() as an optimization. Placed here
+ * where no-one bothers.
+ */
+ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
+ return false;
+ }
+
+ *insnp = insn;
+ return true;
+}
+
+/**
+ * bpf_convert_filter - convert filter program
+ * @prog: the user passed filter program
+ * @len: the length of the user passed filter program
+ * @new_prog: buffer where converted program will be stored
+ * @new_len: pointer to store length of converted program
+ *
+ * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
+ * Conversion workflow:
+ *
+ * 1) First pass for calculating the new program length:
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ *
+ * 2) 2nd pass to remap in two passes: 1st pass finds new
+ * jump offsets, 2nd pass remapping:
+ * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *
+ * User BPF's register A is mapped to our BPF register 6, user BPF
+ * register X is mapped to BPF register 7; frame pointer is always
+ * register 10; Context 'void *ctx' is stored in register 1, that is,
+ * for socket filters: ctx == 'struct sk_buff *', for seccomp:
+ * ctx == 'struct seccomp_data *'.
+ */
+int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_insn *new_prog, int *new_len)
+{
+ int new_flen = 0, pass = 0, target, i;
+ struct bpf_insn *new_insn;
+ struct sock_filter *fp;
+ int *addrs = NULL;
+ u8 bpf_src;
+
+ BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
+ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
+
+ if (len <= 0 || len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (new_prog) {
+ addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ }
+
+do_pass:
+ new_insn = new_prog;
+ fp = prog;
+
+ if (new_insn)
+ *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ new_insn++;
+
+ for (i = 0; i < len; fp++, i++) {
+ struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn *insn = tmp_insns;
+
+ if (addrs)
+ addrs[i] = new_insn - new_prog;
+
+ switch (fp->code) {
+ /* All arithmetic insns and skb loads map as-is. */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ /* Check for overloaded BPF extension and
+ * directly convert it if found, otherwise
+ * just move on with mapping.
+ */
+ if (BPF_CLASS(fp->code) == BPF_LD &&
+ BPF_MODE(fp->code) == BPF_ABS &&
+ convert_bpf_extensions(fp, &insn))
+ break;
+
+ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
+ break;
+
+ /* Jump transformation cannot use BPF block macros
+ * everywhere as offset calculation and target updates
+ * require a bit more work than the rest, i.e. jump
+ * opcodes map as-is, but offsets need adjustment.
+ */
+
+#define BPF_EMIT_JMP \
+ do { \
+ if (target >= len || target < 0) \
+ goto err; \
+ insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
+ /* Adjust pc relative offset for 2nd or 3rd insn. */ \
+ insn->off -= insn - tmp_insns; \
+ } while (0)
+
+ case BPF_JMP | BPF_JA:
+ target = i + fp->k + 1;
+ insn->code = fp->code;
+ BPF_EMIT_JMP;
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
+ /* BPF immediates are signed, zero extend
+ * immediate into tmp register and use it
+ * in compare insn.
+ */
+ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
+
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_TMP;
+ bpf_src = BPF_X;
+ } else {
+ insn->dst_reg = BPF_REG_A;
+ insn->src_reg = BPF_REG_X;
+ insn->imm = fp->k;
+ bpf_src = BPF_SRC(fp->code);
+ }
+
+ /* Common case where 'jump_false' is next insn. */
+ if (fp->jf == 0) {
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ target = i + fp->jt + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Convert JEQ into JNE when 'jump_true' is next insn. */
+ if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
+ insn->code = BPF_JMP | BPF_JNE | bpf_src;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+ }
+
+ /* Other jumps are mapped into two insns: Jxx and JA. */
+ target = i + fp->jt + 1;
+ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
+ BPF_EMIT_JMP;
+ insn++;
+
+ insn->code = BPF_JMP | BPF_JA;
+ target = i + fp->jf + 1;
+ BPF_EMIT_JMP;
+ break;
+
+ /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+ case BPF_LDX | BPF_MSH | BPF_B:
+ /* tmp = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+ /* A = BPF_R0 = *(u8 *) (skb->data + K) */
+ *insn++ = BPF_LD_ABS(BPF_B, fp->k);
+ /* A &= 0xf */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
+ /* A <<= 2 */
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+ /* X = A */
+ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ /* A = tmp */
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
+ break;
+
+ /* RET_K, RET_A are remaped into 2 insns. */
+ case BPF_RET | BPF_A:
+ case BPF_RET | BPF_K:
+ *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
+ BPF_K : BPF_X, BPF_REG_0,
+ BPF_REG_A, fp->k);
+ *insn = BPF_EXIT_INSN();
+ break;
+
+ /* Store to stack. */
+ case BPF_ST:
+ case BPF_STX:
+ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
+ BPF_ST ? BPF_REG_A : BPF_REG_X,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* Load from stack. */
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_FP,
+ -(BPF_MEMWORDS - fp->k) * 4);
+ break;
+
+ /* A = K or X = K */
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, fp->k);
+ break;
+
+ /* X = A */
+ case BPF_MISC | BPF_TAX:
+ *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
+ break;
+
+ /* A = X */
+ case BPF_MISC | BPF_TXA:
+ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
+ break;
+
+ /* A = skb->len or X = skb->len */
+ case BPF_LD | BPF_W | BPF_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
+ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
+ BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
+ offsetof(struct sk_buff, len));
+ break;
+
+ /* Access seccomp_data fields. */
+ case BPF_LDX | BPF_ABS | BPF_W:
+ /* A = *(u32 *) (ctx + K) */
+ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
+ break;
+
+ /* Unknown instruction. */
+ default:
+ goto err;
+ }
+
+ insn++;
+ if (new_prog)
+ memcpy(new_insn, tmp_insns,
+ sizeof(*insn) * (insn - tmp_insns));
+ new_insn += insn - tmp_insns;
+ }
+
+ if (!new_prog) {
+ /* Only calculating new length. */
+ *new_len = new_insn - new_prog;
+ return 0;
+ }
+
+ pass++;
+ if (new_flen != new_insn - new_prog) {
+ new_flen = new_insn - new_prog;
+ if (pass > 2)
+ goto err;
+ goto do_pass;
+ }
+
+ kfree(addrs);
+ BUG_ON(*new_len != new_flen);
+ return 0;
+err:
+ kfree(addrs);
+ return -EINVAL;
+}
+
+/* Security:
+ *
+ * As we dont want to clear mem[] array for each packet going through
+ * __bpf_prog_run(), we check that filter loaded by user never try to read
+ * a cell if not previously written, and we check all branches to be sure
+ * a malicious user doesn't try to abuse us.
+ */
+static int check_load_and_stores(const struct sock_filter *filter, int flen)
+{
+ u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
+ int pc, ret = 0;
+
+ BUILD_BUG_ON(BPF_MEMWORDS > 16);
+
+ masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return -ENOMEM;
+
+ memset(masks, 0xff, flen * sizeof(*masks));
+
+ for (pc = 0; pc < flen; pc++) {
+ memvalid &= masks[pc];
+
+ switch (filter[pc].code) {
+ case BPF_ST:
+ case BPF_STX:
+ memvalid |= (1 << filter[pc].k);
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ if (!(memvalid & (1 << filter[pc].k))) {
+ ret = -EINVAL;
+ goto error;
+ }
+ break;
+ case BPF_JMP | BPF_JA:
+ /* A jump must set masks on target */
+ masks[pc + 1 + filter[pc].k] &= memvalid;
+ memvalid = ~0;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* A jump must set masks on targets */
+ masks[pc + 1 + filter[pc].jt] &= memvalid;
+ masks[pc + 1 + filter[pc].jf] &= memvalid;
+ memvalid = ~0;
+ break;
+ }
+ }
+error:
+ kfree(masks);
+ return ret;
+}
+
+static bool chk_code_allowed(u16 code_to_probe)
+{
+ static const bool codes[] = {
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_K] = true,
+ [BPF_ALU | BPF_ADD | BPF_X] = true,
+ [BPF_ALU | BPF_SUB | BPF_K] = true,
+ [BPF_ALU | BPF_SUB | BPF_X] = true,
+ [BPF_ALU | BPF_MUL | BPF_K] = true,
+ [BPF_ALU | BPF_MUL | BPF_X] = true,
+ [BPF_ALU | BPF_DIV | BPF_K] = true,
+ [BPF_ALU | BPF_DIV | BPF_X] = true,
+ [BPF_ALU | BPF_MOD | BPF_K] = true,
+ [BPF_ALU | BPF_MOD | BPF_X] = true,
+ [BPF_ALU | BPF_AND | BPF_K] = true,
+ [BPF_ALU | BPF_AND | BPF_X] = true,
+ [BPF_ALU | BPF_OR | BPF_K] = true,
+ [BPF_ALU | BPF_OR | BPF_X] = true,
+ [BPF_ALU | BPF_XOR | BPF_K] = true,
+ [BPF_ALU | BPF_XOR | BPF_X] = true,
+ [BPF_ALU | BPF_LSH | BPF_K] = true,
+ [BPF_ALU | BPF_LSH | BPF_X] = true,
+ [BPF_ALU | BPF_RSH | BPF_K] = true,
+ [BPF_ALU | BPF_RSH | BPF_X] = true,
+ [BPF_ALU | BPF_NEG] = true,
+ /* Load instructions */
+ [BPF_LD | BPF_W | BPF_ABS] = true,
+ [BPF_LD | BPF_H | BPF_ABS] = true,
+ [BPF_LD | BPF_B | BPF_ABS] = true,
+ [BPF_LD | BPF_W | BPF_LEN] = true,
+ [BPF_LD | BPF_W | BPF_IND] = true,
+ [BPF_LD | BPF_H | BPF_IND] = true,
+ [BPF_LD | BPF_B | BPF_IND] = true,
+ [BPF_LD | BPF_IMM] = true,
+ [BPF_LD | BPF_MEM] = true,
+ [BPF_LDX | BPF_W | BPF_LEN] = true,
+ [BPF_LDX | BPF_B | BPF_MSH] = true,
+ [BPF_LDX | BPF_IMM] = true,
+ [BPF_LDX | BPF_MEM] = true,
+ /* Store instructions */
+ [BPF_ST] = true,
+ [BPF_STX] = true,
+ /* Misc instructions */
+ [BPF_MISC | BPF_TAX] = true,
+ [BPF_MISC | BPF_TXA] = true,
+ /* Return instructions */
+ [BPF_RET | BPF_K] = true,
+ [BPF_RET | BPF_A] = true,
+ /* Jump instructions */
+ [BPF_JMP | BPF_JA] = true,
+ [BPF_JMP | BPF_JEQ | BPF_K] = true,
+ [BPF_JMP | BPF_JEQ | BPF_X] = true,
+ [BPF_JMP | BPF_JGE | BPF_K] = true,
+ [BPF_JMP | BPF_JGE | BPF_X] = true,
+ [BPF_JMP | BPF_JGT | BPF_K] = true,
+ [BPF_JMP | BPF_JGT | BPF_X] = true,
+ [BPF_JMP | BPF_JSET | BPF_K] = true,
+ [BPF_JMP | BPF_JSET | BPF_X] = true,
+ };
+
+ if (code_to_probe >= ARRAY_SIZE(codes))
+ return false;
+
+ return codes[code_to_probe];
+}
+
+/**
+ * bpf_check_classic - verify socket filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Check the user's filter code. If we let some ugly
+ * filter code slip through kaboom! The filter must contain
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
+ *
+ * All jumps are forward as they are not signed.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
+{
+ bool anc_found;
+ int pc;
+
+ if (flen == 0 || flen > BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* Check the filter code now */
+ for (pc = 0; pc < flen; pc++) {
+ const struct sock_filter *ftest = &filter[pc];
+
+ /* May we actually operate on this code? */
+ if (!chk_code_allowed(ftest->code))
+ return -EINVAL;
+
+ /* Some instructions need special checks */
+ switch (ftest->code) {
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ /* Check for division by zero */
+ if (ftest->k == 0)
+ return -EINVAL;
+ break;
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ /* Check for invalid memory addresses */
+ if (ftest->k >= BPF_MEMWORDS)
+ return -EINVAL;
+ break;
+ case BPF_JMP | BPF_JA:
+ /* Note, the large ftest->k might cause loops.
+ * Compare this with conditional jumps below,
+ * where offsets are limited. --ANK (981016)
+ */
+ if (ftest->k >= (unsigned int)(flen - pc - 1))
+ return -EINVAL;
+ break;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* Both conditionals must be safe */
+ if (pc + ftest->jt + 1 >= flen ||
+ pc + ftest->jf + 1 >= flen)
+ return -EINVAL;
+ break;
+ case BPF_LD | BPF_W | BPF_ABS:
+ case BPF_LD | BPF_H | BPF_ABS:
+ case BPF_LD | BPF_B | BPF_ABS:
+ anc_found = false;
+ if (bpf_anc_helper(ftest) & BPF_ANC)
+ anc_found = true;
+ /* Ancillary operation unknown or unsupported */
+ if (anc_found == false && ftest->k >= SKF_AD_OFF)
+ return -EINVAL;
+ }
+ }
+
+ /* Last instruction must be a RET code */
+ switch (filter[flen - 1].code) {
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ return check_load_and_stores(filter, flen);
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(bpf_check_classic);
+
+static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
+ const struct sock_fprog *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct sock_fprog_kern *fkprog;
+
+ fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
+ if (!fp->orig_prog)
+ return -ENOMEM;
+
+ fkprog = fp->orig_prog;
+ fkprog->len = fprog->len;
+ fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
+ if (!fkprog->filter) {
+ kfree(fp->orig_prog);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void bpf_release_orig_filter(struct bpf_prog *fp)
+{
+ struct sock_fprog_kern *fprog = fp->orig_prog;
+
+ if (fprog) {
+ kfree(fprog->filter);
+ kfree(fprog);
+ }
+}
+
+static void __bpf_prog_release(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ } else {
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+ }
+}
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+ __bpf_prog_release(fp->prog);
+ kfree(fp);
+}
+
+/**
+ * sk_filter_release_rcu - Release a socket filter by rcu_head
+ * @rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+ struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+ __sk_filter_release(fp);
+}
+
+/**
+ * sk_filter_release - release a socket filter
+ * @fp: filter to remove
+ *
+ * Remove a filter from a socket and release its resources.
+ */
+static void sk_filter_release(struct sk_filter *fp)
+{
+ if (atomic_dec_and_test(&fp->refcnt))
+ call_rcu(&fp->rcu, sk_filter_release_rcu);
+}
+
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ atomic_sub(filter_size, &sk->sk_omem_alloc);
+ sk_filter_release(fp);
+}
+
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
+{
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ /* same check as in sock_kmalloc() */
+ if (filter_size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ atomic_inc(&fp->refcnt);
+ atomic_add(filter_size, &sk->sk_omem_alloc);
+ return true;
+ }
+ return false;
+}
+
+static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
+{
+ struct sock_filter *old_prog;
+ struct bpf_prog *old_fp;
+ int err, new_len, old_len = fp->len;
+
+ /* We are free to overwrite insns et al right here as it
+ * won't be used at this point in time anymore internally
+ * after the migration to the internal BPF instruction
+ * representation.
+ */
+ BUILD_BUG_ON(sizeof(struct sock_filter) !=
+ sizeof(struct bpf_insn));
+
+ /* Conversion cannot happen on overlapping memory areas,
+ * so we need to keep the user BPF around until the 2nd
+ * pass. At this time, the user BPF is stored in fp->insns.
+ */
+ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
+ GFP_KERNEL);
+ if (!old_prog) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* 1st pass: calculate the new program length. */
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+ if (err)
+ goto out_err_free;
+
+ /* Expand fp for appending the new filter representation. */
+ old_fp = fp;
+ fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
+ if (!fp) {
+ /* The old_fp is still around in case we couldn't
+ * allocate new memory, so uncharge on that one.
+ */
+ fp = old_fp;
+ err = -ENOMEM;
+ goto out_err_free;
+ }
+
+ fp->len = new_len;
+
+ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
+ err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ if (err)
+ /* 2nd bpf_convert_filter() can fail only if it fails
+ * to allocate memory, remapping must succeed. Note,
+ * that at this time old_fp has already been released
+ * by krealloc().
+ */
+ goto out_err_free;
+
+ bpf_prog_select_runtime(fp);
+
+ kfree(old_prog);
+ return fp;
+
+out_err_free:
+ kfree(old_prog);
+out_err:
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+}
+
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
+{
+ int err;
+
+ fp->bpf_func = NULL;
+ fp->jited = false;
+
+ err = bpf_check_classic(fp->insns, fp->len);
+ if (err) {
+ __bpf_prog_release(fp);
+ return ERR_PTR(err);
+ }
+
+ /* Probe if we can JIT compile the filter and if so, do
+ * the compilation of the filter.
+ */
+ bpf_jit_compile(fp);
+
+ /* JIT compiler couldn't process this filter, so do the
+ * internal BPF translation for the optimized interpreter.
+ */
+ if (!fp->jited)
+ fp = bpf_migrate_filter(fp);
+
+ return fp;
+}
+
+/**
+ * bpf_prog_create - create an unattached filter
+ * @pfp: the unattached filter that is created
+ * @fprog: the filter program
+ *
+ * Create a filter independent of any socket. We first run some
+ * sanity checks on it to make sure it does not explode on us later.
+ * If an error occurs or there is insufficient memory for the filter
+ * a negative errno code is returned. On success the return is zero.
+ */
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+ if (!fp)
+ return -ENOMEM;
+
+ memcpy(fp->insns, fprog->filter, fsize);
+
+ fp->len = fprog->len;
+ /* Since unattached filters are not copied back to user
+ * space through sk_get_filter(), we do not need to hold
+ * a copy here, and can spare us the work.
+ */
+ fp->orig_prog = NULL;
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ fp = bpf_prepare_filter(fp);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ *pfp = fp;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_create);
+
+void bpf_prog_destroy(struct bpf_prog *fp)
+{
+ __bpf_prog_release(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
+
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct sk_filter *fp, *old_fp;
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->prog = prog;
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ kfree(fp);
+ return -ENOMEM;
+ }
+
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ rcu_assign_pointer(sk->sk_filter, fp);
+
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+
+ return 0;
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ unsigned int bpf_fsize = bpf_prog_size(fprog->len);
+ struct bpf_prog *prog;
+ int err;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ /* Make sure new filter is there and in the right amounts. */
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+ prog = bpf_prog_alloc(bpf_fsize, 0);
+ if (!prog)
+ return -ENOMEM;
+
+ if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+ __bpf_prog_free(prog);
+ return -EFAULT;
+ }
+
+ prog->len = fprog->len;
+
+ err = bpf_prog_store_orig_filter(prog, fprog);
+ if (err) {
+ __bpf_prog_free(prog);
+ return -ENOMEM;
+ }
+
+ /* bpf_prepare_filter() already takes care of freeing
+ * memory in case something goes wrong.
+ */
+ prog = bpf_prepare_filter(prog);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+ int err;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get(ufd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ err = __sk_attach_prog(prog, sk);
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * bpf_skb_clone_not_writable - is the header of a clone not writable
+ * @skb: buffer to check
+ * @len: length up to which to write, can be negative
+ *
+ * Returns true if modifying the header part of the cloned buffer
+ * does require the data to be copied. I.e. this version works with
+ * negative lengths needed for eBPF case!
+ */
+static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
+{
+ return skb_header_cloned(skb) ||
+ (int) skb_headroom(skb) + len > skb->hdr_len;
+}
+
+#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
+
+static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int offset = (int) r2;
+ void *from = (void *) (long) r3;
+ unsigned int len = (unsigned int) r4;
+ char buf[16];
+ void *ptr;
+
+ /* bpf verifier guarantees that:
+ * 'from' pointer points to bpf program stack
+ * 'len' bytes of it were initialized
+ * 'len' > 0
+ * 'skb' is a valid pointer to 'struct sk_buff'
+ *
+ * so check for invalid 'offset' and too large 'len'
+ */
+ if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+ return -EFAULT;
+
+ offset -= skb->data - skb_mac_header(skb);
+ if (unlikely(skb_cloned(skb) &&
+ bpf_skb_clone_unwritable(skb, offset + len)))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, len, buf);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ if (BPF_RECOMPUTE_CSUM(flags))
+ skb_postpull_rcsum(skb, ptr, len);
+
+ memcpy(ptr, from, len);
+
+ if (ptr == buf)
+ /* skb_store_bits cannot return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, len);
+
+ if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
+ return 0;
+}
+
+const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+ .func = bpf_skb_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f)
+#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10)
+
+static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ int offset = (int) r2;
+ __sum16 sum, *ptr;
+
+ if (unlikely((u32) offset > 0xffff))
+ return -EFAULT;
+
+ offset -= skb->data - skb_mac_header(skb);
+ if (unlikely(skb_cloned(skb) &&
+ bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ case 2:
+ csum_replace2(ptr, from, to);
+ break;
+ case 4:
+ csum_replace4(ptr, from, to);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (ptr == &sum)
+ /* skb_store_bits guaranteed to not return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, sizeof(sum));
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+ .func = bpf_l3_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
+ int offset = (int) r2;
+ __sum16 sum, *ptr;
+
+ if (unlikely((u32) offset > 0xffff))
+ return -EFAULT;
+
+ offset -= skb->data - skb_mac_header(skb);
+ if (unlikely(skb_cloned(skb) &&
+ bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
+ if (unlikely(!ptr))
+ return -EFAULT;
+
+ switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ case 2:
+ inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
+ break;
+ case 4:
+ inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (ptr == &sum)
+ /* skb_store_bits guaranteed to not return -EFAULT here */
+ skb_store_bits(skb, offset, ptr, sizeof(sum));
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+ .func = bpf_l4_csum_replace,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ default:
+ return NULL;
+ }
+}
+
+static const struct bpf_func_proto *
+tc_cls_act_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static bool sk_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type)
+{
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct __sk_buff))
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ /* all __sk_buff fields are __u32 */
+ if (size != 4)
+ return false;
+
+ return true;
+}
+
+static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, len):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, len));
+ break;
+
+ case offsetof(struct __sk_buff, protocol):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, protocol));
+ break;
+
+ case offsetof(struct __sk_buff, vlan_proto):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_proto));
+ break;
+
+ case offsetof(struct __sk_buff, priority):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, priority));
+ break;
+
+ case offsetof(struct __sk_buff, mark):
+ return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, pkt_type):
+ return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, queue_mapping):
+ return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_present):
+ return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
+ dst_reg, src_reg, insn);
+
+ case offsetof(struct __sk_buff, vlan_tci):
+ return convert_skb_access(SKF_AD_VLAN_TAG,
+ dst_reg, src_reg, insn);
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_verifier_ops sk_filter_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops tc_cls_act_ops = {
+ .get_func_proto = tc_cls_act_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static struct bpf_prog_type_list sk_filter_type __read_mostly = {
+ .ops = &sk_filter_ops,
+ .type = BPF_PROG_TYPE_SOCKET_FILTER,
+};
+
+static struct bpf_prog_type_list sched_cls_type __read_mostly = {
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_CLS,
+};
+
+static struct bpf_prog_type_list sched_act_type __read_mostly = {
+ .ops = &tc_cls_act_ops,
+ .type = BPF_PROG_TYPE_SCHED_ACT,
+};
+
+static int __init register_sk_filter_ops(void)
+{
+ bpf_register_prog_type(&sk_filter_type);
+ bpf_register_prog_type(&sched_cls_type);
+ bpf_register_prog_type(&sched_act_type);
+
+ return 0;
+}
+late_initcall(register_sk_filter_ops);
+
+int sk_detach_filter(struct sock *sk)
+{
+ int ret = -ENOENT;
+ struct sk_filter *filter;
+
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ if (filter) {
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ sk_filter_uncharge(sk, filter);
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
+
+int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
+ unsigned int len)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ int ret = 0;
+
+ lock_sock(sk);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
+ if (!filter)
+ goto out;
+
+ /* We're copying the filter that has been originally attached,
+ * so no conversion/decode needed anymore.
+ */
+ fprog = filter->prog->orig_prog;
+
+ ret = fprog->len;
+ if (!len)
+ /* User space only enquires number of filter blocks. */
+ goto out;
+
+ ret = -EINVAL;
+ if (len < fprog->len)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ goto out;
+
+ /* Instead of bytes, the API requests to return the number
+ * of filter blocks.
+ */
+ ret = fprog->len;
+out:
+ release_sock(sk);
+ return ret;
+}
diff --git a/net/core/flow.c b/net/core/flow.c
new file mode 100644
index 000000000..1033725be
--- /dev/null
+++ b/net/core/flow.c
@@ -0,0 +1,511 @@
+/* flow.c: Generic flow cache.
+ *
+ * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <linux/percpu.h>
+#include <linux/bitops.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/mutex.h>
+#include <net/flow.h>
+#include <linux/atomic.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+
+struct flow_cache_entry {
+ union {
+ struct hlist_node hlist;
+ struct list_head gc_list;
+ } u;
+ struct net *net;
+ u16 family;
+ u8 dir;
+ u32 genid;
+ struct flowi key;
+ struct flow_cache_object *object;
+};
+
+struct flow_flush_info {
+ struct flow_cache *cache;
+ atomic_t cpuleft;
+ struct completion completion;
+};
+
+static struct kmem_cache *flow_cachep __read_mostly;
+
+#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
+
+static void flow_cache_new_hashrnd(unsigned long arg)
+{
+ struct flow_cache *fc = (void *) arg;
+ int i;
+
+ for_each_possible_cpu(i)
+ per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
+
+ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&fc->rnd_timer);
+}
+
+static int flow_entry_valid(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
+{
+ if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
+ return 0;
+ if (fle->object && !fle->object->ops->check(fle->object))
+ return 0;
+ return 1;
+}
+
+static void flow_entry_kill(struct flow_cache_entry *fle,
+ struct netns_xfrm *xfrm)
+{
+ if (fle->object)
+ fle->object->ops->delete(fle->object);
+ kmem_cache_free(flow_cachep, fle);
+}
+
+static void flow_cache_gc_task(struct work_struct *work)
+{
+ struct list_head gc_list;
+ struct flow_cache_entry *fce, *n;
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_gc_work);
+
+ INIT_LIST_HEAD(&gc_list);
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+
+ list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
+ flow_entry_kill(fce, xfrm);
+}
+
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+ int deleted, struct list_head *gc_list,
+ struct netns_xfrm *xfrm)
+{
+ if (deleted) {
+ fcp->hash_count -= deleted;
+ spin_lock_bh(&xfrm->flow_cache_gc_lock);
+ list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
+ spin_unlock_bh(&xfrm->flow_cache_gc_lock);
+ schedule_work(&xfrm->flow_cache_gc_work);
+ }
+}
+
+static void __flow_cache_shrink(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp,
+ int shrink_to)
+{
+ struct flow_cache_entry *fle;
+ struct hlist_node *tmp;
+ LIST_HEAD(gc_list);
+ int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
+
+ for (i = 0; i < flow_cache_hash_size(fc); i++) {
+ int saved = 0;
+
+ hlist_for_each_entry_safe(fle, tmp,
+ &fcp->hash_table[i], u.hlist) {
+ if (saved < shrink_to &&
+ flow_entry_valid(fle, xfrm)) {
+ saved++;
+ } else {
+ deleted++;
+ hlist_del(&fle->u.hlist);
+ list_add_tail(&fle->u.gc_list, &gc_list);
+ }
+ }
+ }
+
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
+}
+
+static void flow_cache_shrink(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp)
+{
+ int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
+
+ __flow_cache_shrink(fc, fcp, shrink_to);
+}
+
+static void flow_new_hash_rnd(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp)
+{
+ get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+ fcp->hash_rnd_recalc = 0;
+ __flow_cache_shrink(fc, fcp, 0);
+}
+
+static u32 flow_hash_code(struct flow_cache *fc,
+ struct flow_cache_percpu *fcp,
+ const struct flowi *key,
+ size_t keysize)
+{
+ const u32 *k = (const u32 *) key;
+ const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
+
+ return jhash2(k, length, fcp->hash_rnd)
+ & (flow_cache_hash_size(fc) - 1);
+}
+
+/* I hear what you're saying, use memcmp. But memcmp cannot make
+ * important assumptions that we can here, such as alignment.
+ */
+static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
+ size_t keysize)
+{
+ const flow_compare_t *k1, *k1_lim, *k2;
+
+ k1 = (const flow_compare_t *) key1;
+ k1_lim = k1 + keysize;
+
+ k2 = (const flow_compare_t *) key2;
+
+ do {
+ if (*k1++ != *k2++)
+ return 1;
+ } while (k1 < k1_lim);
+
+ return 0;
+}
+
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
+ flow_resolve_t resolver, void *ctx)
+{
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+ struct flow_cache_percpu *fcp;
+ struct flow_cache_entry *fle, *tfle;
+ struct flow_cache_object *flo;
+ size_t keysize;
+ unsigned int hash;
+
+ local_bh_disable();
+ fcp = this_cpu_ptr(fc->percpu);
+
+ fle = NULL;
+ flo = NULL;
+
+ keysize = flow_key_size(family);
+ if (!keysize)
+ goto nocache;
+
+ /* Packet really early in init? Making flow_cache_init a
+ * pre-smp initcall would solve this. --RR */
+ if (!fcp->hash_table)
+ goto nocache;
+
+ if (fcp->hash_rnd_recalc)
+ flow_new_hash_rnd(fc, fcp);
+
+ hash = flow_hash_code(fc, fcp, key, keysize);
+ hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
+ if (tfle->net == net &&
+ tfle->family == family &&
+ tfle->dir == dir &&
+ flow_key_compare(key, &tfle->key, keysize) == 0) {
+ fle = tfle;
+ break;
+ }
+ }
+
+ if (unlikely(!fle)) {
+ if (fcp->hash_count > fc->high_watermark)
+ flow_cache_shrink(fc, fcp);
+
+ fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
+ if (fle) {
+ fle->net = net;
+ fle->family = family;
+ fle->dir = dir;
+ memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
+ fle->object = NULL;
+ hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
+ fcp->hash_count++;
+ }
+ } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
+ flo = fle->object;
+ if (!flo)
+ goto ret_object;
+ flo = flo->ops->get(flo);
+ if (flo)
+ goto ret_object;
+ } else if (fle->object) {
+ flo = fle->object;
+ flo->ops->delete(flo);
+ fle->object = NULL;
+ }
+
+nocache:
+ flo = NULL;
+ if (fle) {
+ flo = fle->object;
+ fle->object = NULL;
+ }
+ flo = resolver(net, key, family, dir, flo, ctx);
+ if (fle) {
+ fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
+ if (!IS_ERR(flo))
+ fle->object = flo;
+ else
+ fle->genid--;
+ } else {
+ if (!IS_ERR_OR_NULL(flo))
+ flo->ops->delete(flo);
+ }
+ret_object:
+ local_bh_enable();
+ return flo;
+}
+EXPORT_SYMBOL(flow_cache_lookup);
+
+static void flow_cache_flush_tasklet(unsigned long data)
+{
+ struct flow_flush_info *info = (void *)data;
+ struct flow_cache *fc = info->cache;
+ struct flow_cache_percpu *fcp;
+ struct flow_cache_entry *fle;
+ struct hlist_node *tmp;
+ LIST_HEAD(gc_list);
+ int i, deleted = 0;
+ struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
+ flow_cache_global);
+
+ fcp = this_cpu_ptr(fc->percpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++) {
+ hlist_for_each_entry_safe(fle, tmp,
+ &fcp->hash_table[i], u.hlist) {
+ if (flow_entry_valid(fle, xfrm))
+ continue;
+
+ deleted++;
+ hlist_del(&fle->u.hlist);
+ list_add_tail(&fle->u.gc_list, &gc_list);
+ }
+ }
+
+ flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
+
+ if (atomic_dec_and_test(&info->cpuleft))
+ complete(&info->completion);
+}
+
+/*
+ * Return whether a cpu needs flushing. Conservatively, we assume
+ * the presence of any entries means the core may require flushing,
+ * since the flow_cache_ops.check() function may assume it's running
+ * on the same core as the per-cpu cache component.
+ */
+static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp;
+ int i;
+
+ fcp = per_cpu_ptr(fc->percpu, cpu);
+ for (i = 0; i < flow_cache_hash_size(fc); i++)
+ if (!hlist_empty(&fcp->hash_table[i]))
+ return 0;
+ return 1;
+}
+
+static void flow_cache_flush_per_cpu(void *data)
+{
+ struct flow_flush_info *info = data;
+ struct tasklet_struct *tasklet;
+
+ tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
+ tasklet->data = (unsigned long)info;
+ tasklet_schedule(tasklet);
+}
+
+void flow_cache_flush(struct net *net)
+{
+ struct flow_flush_info info;
+ cpumask_var_t mask;
+ int i, self;
+
+ /* Track which cpus need flushing to avoid disturbing all cores. */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return;
+ cpumask_clear(mask);
+
+ /* Don't want cpus going down or up during this. */
+ get_online_cpus();
+ mutex_lock(&net->xfrm.flow_flush_sem);
+ info.cache = &net->xfrm.flow_cache_global;
+ for_each_online_cpu(i)
+ if (!flow_cache_percpu_empty(info.cache, i))
+ cpumask_set_cpu(i, mask);
+ atomic_set(&info.cpuleft, cpumask_weight(mask));
+ if (atomic_read(&info.cpuleft) == 0)
+ goto done;
+
+ init_completion(&info.completion);
+
+ local_bh_disable();
+ self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
+ on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
+ if (self)
+ flow_cache_flush_tasklet((unsigned long)&info);
+ local_bh_enable();
+
+ wait_for_completion(&info.completion);
+
+done:
+ mutex_unlock(&net->xfrm.flow_flush_sem);
+ put_online_cpus();
+ free_cpumask_var(mask);
+}
+
+static void flow_cache_flush_task(struct work_struct *work)
+{
+ struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
+ flow_cache_flush_work);
+ struct net *net = container_of(xfrm, struct net, xfrm);
+
+ flow_cache_flush(net);
+}
+
+void flow_cache_flush_deferred(struct net *net)
+{
+ schedule_work(&net->xfrm.flow_cache_flush_work);
+}
+
+static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
+{
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+ size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
+
+ if (!fcp->hash_table) {
+ fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+ if (!fcp->hash_table) {
+ pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+ return -ENOMEM;
+ }
+ fcp->hash_rnd_recalc = 1;
+ fcp->hash_count = 0;
+ tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+ }
+ return 0;
+}
+
+static int flow_cache_cpu(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ struct flow_cache *fc = container_of(nfb, struct flow_cache,
+ hotcpu_notifier);
+ int res, cpu = (unsigned long) hcpu;
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ res = flow_cache_cpu_prepare(fc, cpu);
+ if (res)
+ return notifier_from_errno(res);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ __flow_cache_shrink(fc, fcp, 0);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+int flow_cache_init(struct net *net)
+{
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+ if (!flow_cachep)
+ flow_cachep = kmem_cache_create("flow_cache",
+ sizeof(struct flow_cache_entry),
+ 0, SLAB_PANIC, NULL);
+ spin_lock_init(&net->xfrm.flow_cache_gc_lock);
+ INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
+ INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
+ INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
+ mutex_init(&net->xfrm.flow_flush_sem);
+
+ fc->hash_shift = 10;
+ fc->low_watermark = 2 * flow_cache_hash_size(fc);
+ fc->high_watermark = 4 * flow_cache_hash_size(fc);
+
+ fc->percpu = alloc_percpu(struct flow_cache_percpu);
+ if (!fc->percpu)
+ return -ENOMEM;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(i) {
+ if (flow_cache_cpu_prepare(fc, i))
+ goto err;
+ }
+ fc->hotcpu_notifier = (struct notifier_block){
+ .notifier_call = flow_cache_cpu,
+ };
+ __register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpu_notifier_register_done();
+
+ setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+ (unsigned long) fc);
+ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&fc->rnd_timer);
+
+ return 0;
+
+err:
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ cpu_notifier_register_done();
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(flow_cache_init);
+
+void flow_cache_fini(struct net *net)
+{
+ int i;
+ struct flow_cache *fc = &net->xfrm.flow_cache_global;
+
+ del_timer_sync(&fc->rnd_timer);
+ unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+}
+EXPORT_SYMBOL(flow_cache_fini);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 000000000..2c35c02a9
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,490 @@
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/igmp.h>
+#include <linux/icmp.h>
+#include <linux/sctp.h>
+#include <linux/dccp.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <net/flow_keys.h>
+#include <scsi/fc/fc_fcoe.h>
+
+/* copy saddr & daddr, possibly using 64bit load/store
+ * Equivalent to : flow->src = iph->saddr;
+ * flow->dst = iph->daddr;
+ */
+static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+{
+ BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
+ offsetof(typeof(*flow), src) + sizeof(flow->src));
+ memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+}
+
+/**
+ * __skb_flow_get_ports - extract the upper layer ports and return them
+ * @skb: sk_buff to extract the ports from
+ * @thoff: transport header offset
+ * @ip_proto: protocol for which to get port offset
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the ports at offset thoff + poff where poff
+ * is the protocol port offset returned from proto_ports_offset
+ */
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ void *data, int hlen)
+{
+ int poff = proto_ports_offset(ip_proto);
+
+ if (!data) {
+ data = skb->data;
+ hlen = skb_headlen(skb);
+ }
+
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ ports = __skb_header_pointer(skb, thoff + poff,
+ sizeof(_ports), data, hlen, &_ports);
+ if (ports)
+ return *ports;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__skb_flow_get_ports);
+
+/**
+ * __skb_flow_dissect - extract the flow_keys struct and return it
+ * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @data: raw buffer pointer to the packet, if NULL use skb->data
+ * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
+ * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
+ * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ *
+ * The function will try to retrieve the struct flow_keys from either the skbuff
+ * or a raw buffer specified by the rest parameters
+ */
+bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+ void *data, __be16 proto, int nhoff, int hlen)
+{
+ u8 ip_proto;
+
+ if (!data) {
+ data = skb->data;
+ proto = skb->protocol;
+ nhoff = skb_network_offset(skb);
+ hlen = skb_headlen(skb);
+ }
+
+ memset(flow, 0, sizeof(*flow));
+
+again:
+ switch (proto) {
+ case htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+ip:
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph || iph->ihl < 5)
+ return false;
+ nhoff += iph->ihl * 4;
+
+ ip_proto = iph->protocol;
+ if (ip_is_fragment(iph))
+ ip_proto = 0;
+
+ /* skip the address processing if skb is NULL. The assumption
+ * here is that if there is no skb we are not looking for flow
+ * info but lengths and protocols.
+ */
+ if (!skb)
+ break;
+
+ iph_to_flow_copy_addrs(flow, iph);
+ break;
+ }
+ case htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+ __be32 flow_label;
+
+ipv6:
+ iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
+ if (!iph)
+ return false;
+
+ ip_proto = iph->nexthdr;
+ nhoff += sizeof(struct ipv6hdr);
+
+ /* see comment above in IPv4 section */
+ if (!skb)
+ break;
+
+ flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+ flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+
+ flow_label = ip6_flowlabel(iph);
+ if (flow_label) {
+ /* Awesome, IPv6 packet has a flow label so we can
+ * use that to represent the ports without any
+ * further dissection.
+ */
+ flow->n_proto = proto;
+ flow->ip_proto = ip_proto;
+ flow->ports = flow_label;
+ flow->thoff = (u16)nhoff;
+
+ return true;
+ }
+
+ break;
+ }
+ case htons(ETH_P_8021AD):
+ case htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan;
+ struct vlan_hdr _vlan;
+
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
+ if (!vlan)
+ return false;
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ goto again;
+ }
+ case htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return false;
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case htons(PPP_IP):
+ goto ip;
+ case htons(PPP_IPV6):
+ goto ipv6;
+ default:
+ return false;
+ }
+ }
+ case htons(ETH_P_TIPC): {
+ struct {
+ __be32 pre[3];
+ __be32 srcnode;
+ } *hdr, _hdr;
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return false;
+ flow->src = hdr->srcnode;
+ flow->dst = 0;
+ flow->n_proto = proto;
+ flow->thoff = (u16)nhoff;
+ return true;
+ }
+ case htons(ETH_P_FCOE):
+ flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+ /* fall through */
+ default:
+ return false;
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ } *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return false;
+ /*
+ * Only look inside GRE if version zero and no
+ * routing
+ */
+ if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
+ nhoff += 4;
+ if (hdr->flags & GRE_KEY)
+ nhoff += 4;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, nhoff,
+ sizeof(_eth),
+ data, hlen, &_eth);
+ if (!eth)
+ return false;
+ proto = eth->h_proto;
+ nhoff += sizeof(*eth);
+ }
+ goto again;
+ }
+ break;
+ }
+ case IPPROTO_IPIP:
+ proto = htons(ETH_P_IP);
+ goto ip;
+ case IPPROTO_IPV6:
+ proto = htons(ETH_P_IPV6);
+ goto ipv6;
+ default:
+ break;
+ }
+
+ flow->n_proto = proto;
+ flow->ip_proto = ip_proto;
+ flow->thoff = (u16) nhoff;
+
+ /* unless skb is set we don't need to record port info */
+ if (skb)
+ flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+
+ return true;
+}
+EXPORT_SYMBOL(__skb_flow_dissect);
+
+static u32 hashrnd __read_mostly;
+static __always_inline void __flow_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+{
+ __flow_hash_secret_init();
+ return jhash_3words(a, b, c, hashrnd);
+}
+
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
+{
+ u32 hash;
+
+ /* get a consistent hash (same value on both flow directions) */
+ if (((__force u32)keys->dst < (__force u32)keys->src) ||
+ (((__force u32)keys->dst == (__force u32)keys->src) &&
+ ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
+ swap(keys->dst, keys->src);
+ swap(keys->port16[0], keys->port16[1]);
+ }
+
+ hash = __flow_hash_3words((__force u32)keys->dst,
+ (__force u32)keys->src,
+ (__force u32)keys->ports);
+ if (!hash)
+ hash = 1;
+
+ return hash;
+}
+
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+ return __flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
+/*
+ * __skb_get_hash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Sets hash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_hash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
+ */
+void __skb_get_hash(struct sk_buff *skb)
+{
+ struct flow_keys keys;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return;
+
+ if (keys.ports)
+ skb->l4_hash = 1;
+
+ skb->sw_hash = 1;
+
+ skb->hash = __flow_hash_from_keys(&keys);
+}
+EXPORT_SYMBOL(__skb_get_hash);
+
+/*
+ * Returns a Tx hash based on the given packet descriptor a Tx queues' number
+ * to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+ unsigned int num_tx_queues)
+{
+ u32 hash;
+ u16 qoffset = 0;
+ u16 qcount = num_tx_queues;
+
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely(hash >= num_tx_queues))
+ hash -= num_tx_queues;
+ return hash;
+ }
+
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+ qoffset = dev->tc_to_txq[tc].offset;
+ qcount = dev->tc_to_txq[tc].count;
+ }
+
+ return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+ const struct flow_keys *keys, int hlen)
+{
+ u32 poff = keys->thoff;
+
+ switch (keys->ip_proto) {
+ case IPPROTO_TCP: {
+ /* access doff as u8 to avoid unaligned access */
+ const u8 *doff;
+ u8 _doff;
+
+ doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
+ data, hlen, &_doff);
+ if (!doff)
+ return poff;
+
+ poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ poff += sizeof(struct udphdr);
+ break;
+ /* For the rest, we do not really care about header
+ * extensions at this point for now.
+ */
+ case IPPROTO_ICMP:
+ poff += sizeof(struct icmphdr);
+ break;
+ case IPPROTO_ICMPV6:
+ poff += sizeof(struct icmp6hdr);
+ break;
+ case IPPROTO_IGMP:
+ poff += sizeof(struct igmphdr);
+ break;
+ case IPPROTO_DCCP:
+ poff += sizeof(struct dccp_hdr);
+ break;
+ case IPPROTO_SCTP:
+ poff += sizeof(struct sctphdr);
+ break;
+ }
+
+ return poff;
+}
+
+/* skb_get_poff() returns the offset to the payload as far as it could
+ * be dissected. The main user is currently BPF, so that we can dynamically
+ * truncate packets without needing to push actual payload to the user
+ * space and can analyze headers only, instead.
+ */
+u32 skb_get_poff(const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+
+ if (!skb_flow_dissect(skb, &keys))
+ return 0;
+
+ return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
+}
+
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ int queue_index = -1;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ map = rcu_dereference(
+ dev_maps->cpu_map[skb->sender_cpu - 1]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
+ map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ }
+ rcu_read_unlock();
+
+ return queue_index;
+#else
+ return -1;
+#endif
+}
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ int queue_index = sk_tx_queue_get(sk);
+
+ if (queue_index < 0 || skb->ooo_okay ||
+ queue_index >= dev->real_num_tx_queues) {
+ int new_index = get_xps_queue(dev, skb);
+ if (new_index < 0)
+ new_index = skb_tx_hash(dev, skb);
+
+ if (queue_index != new_index && sk &&
+ rcu_access_pointer(sk->sk_dst_cache))
+ sk_tx_queue_set(sk, new_index);
+
+ queue_index = new_index;
+ }
+
+ return queue_index;
+}
+
+struct netdev_queue *netdev_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv)
+{
+ int queue_index = 0;
+
+#ifdef CONFIG_XPS
+ if (skb->sender_cpu == 0)
+ skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
+ if (dev->real_num_tx_queues != 1) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue)
+ queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ __netdev_pick_tx);
+ else
+ queue_index = __netdev_pick_tx(dev, skb);
+
+ if (!accel_priv)
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+ return netdev_get_tx_queue(dev, queue_index);
+}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
new file mode 100644
index 000000000..9dfb88a93
--- /dev/null
+++ b/net/core/gen_estimator.c
@@ -0,0 +1,328 @@
+/*
+ * net/sched/gen_estimator.c Simple rate estimator.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Jamal Hadi Salim - moved it to net/core and reshulfed
+ * names to make it usable in general net subsystem.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/gen_stats.h>
+
+/*
+ This code is NOT intended to be used for statistics collection,
+ its purpose is to provide a base for statistical multiplexing
+ for controlled load service.
+ If you need only statistics, run a user level daemon which
+ periodically reads byte counters.
+
+ Unfortunately, rate estimation is not a very easy task.
+ F.e. I did not find a simple way to estimate the current peak rate
+ and even failed to formulate the problem 8)8)
+
+ So I preferred not to built an estimator into the scheduler,
+ but run this task separately.
+ Ideally, it should be kernel thread(s), but for now it runs
+ from timers, which puts apparent top bounds on the number of rated
+ flows, has minimal overhead on small, but is enough
+ to handle controlled load service, sets of aggregates.
+
+ We measure rate over A=(1<<interval) seconds and evaluate EWMA:
+
+ avrate = avrate*(1-W) + rate*W
+
+ where W is chosen as negative power of 2: W = 2^(-ewma_log)
+
+ The resulting time constant is:
+
+ T = A/(-ln(1-W))
+
+
+ NOTES.
+
+ * avbps is scaled by 2^5, avpps is scaled by 2^10.
+ * both values are reported as 32 bit unsigned values. bps can
+ overflow for fast links : max speed being 34360Mbit/sec
+ * Minimal interval is HZ/4=250msec (it is the greatest common divisor
+ for HZ=100 and HZ=1024 8)), maximal interval
+ is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
+ are too expensive, longer ones can be implemented
+ at user level painlessly.
+ */
+
+#define EST_MAX_INTERVAL 5
+
+struct gen_estimator
+{
+ struct list_head list;
+ struct gnet_stats_basic_packed *bstats;
+ struct gnet_stats_rate_est64 *rate_est;
+ spinlock_t *stats_lock;
+ int ewma_log;
+ u64 last_bytes;
+ u64 avbps;
+ u32 last_packets;
+ u32 avpps;
+ struct rcu_head e_rcu;
+ struct rb_node node;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ struct rcu_head head;
+};
+
+struct gen_estimator_head
+{
+ struct timer_list timer;
+ struct list_head list;
+};
+
+static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
+
+/* Protects against NULL dereference */
+static DEFINE_RWLOCK(est_lock);
+
+/* Protects against soft lockup during large deletion */
+static struct rb_root est_root = RB_ROOT;
+static DEFINE_SPINLOCK(est_tree_lock);
+
+static void est_timer(unsigned long arg)
+{
+ int idx = (int)arg;
+ struct gen_estimator *e;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &elist[idx].list, list) {
+ struct gnet_stats_basic_packed b = {0};
+ u64 brate;
+ u32 rate;
+
+ spin_lock(e->stats_lock);
+ read_lock(&est_lock);
+ if (e->bstats == NULL)
+ goto skip;
+
+ __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+
+ brate = (b.bytes - e->last_bytes)<<(7 - idx);
+ e->last_bytes = b.bytes;
+ e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
+ e->rate_est->bps = (e->avbps+0xF)>>5;
+
+ rate = (b.packets - e->last_packets)<<(12 - idx);
+ e->last_packets = b.packets;
+ e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
+ e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+ read_unlock(&est_lock);
+ spin_unlock(e->stats_lock);
+ }
+
+ if (!list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+ rcu_read_unlock();
+}
+
+static void gen_add_node(struct gen_estimator *est)
+{
+ struct rb_node **p = &est_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct gen_estimator *e;
+
+ parent = *p;
+ e = rb_entry(parent, struct gen_estimator, node);
+
+ if (est->bstats > e->bstats)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&est->node, parent, p);
+ rb_insert_color(&est->node, &est_root);
+}
+
+static
+struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
+ const struct gnet_stats_rate_est64 *rate_est)
+{
+ struct rb_node *p = est_root.rb_node;
+
+ while (p) {
+ struct gen_estimator *e;
+
+ e = rb_entry(p, struct gen_estimator, node);
+
+ if (bstats > e->bstats)
+ p = p->rb_right;
+ else if (bstats < e->bstats || rate_est != e->rate_est)
+ p = p->rb_left;
+ else
+ return e;
+ }
+ return NULL;
+}
+
+/**
+ * gen_new_estimator - create a new rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Creates a new rate estimator with &bstats as source and &rate_est
+ * as destination. A new timer with the interval specified in the
+ * configuration TLV is created. Upon each interval, the latest statistics
+ * will be read from &bstats and the estimated rate will be stored in
+ * &rate_est with the statistics lock grabbed during this period.
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ */
+int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+ struct gnet_stats_rate_est64 *rate_est,
+ spinlock_t *stats_lock,
+ struct nlattr *opt)
+{
+ struct gen_estimator *est;
+ struct gnet_estimator *parm = nla_data(opt);
+ struct gnet_stats_basic_packed b = {0};
+ int idx;
+
+ if (nla_len(opt) < sizeof(*parm))
+ return -EINVAL;
+
+ if (parm->interval < -2 || parm->interval > 3)
+ return -EINVAL;
+
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
+ if (est == NULL)
+ return -ENOBUFS;
+
+ __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+
+ idx = parm->interval + 2;
+ est->bstats = bstats;
+ est->rate_est = rate_est;
+ est->stats_lock = stats_lock;
+ est->ewma_log = parm->ewma_log;
+ est->last_bytes = b.bytes;
+ est->avbps = rate_est->bps<<5;
+ est->last_packets = b.packets;
+ est->avpps = rate_est->pps<<10;
+ est->cpu_bstats = cpu_bstats;
+
+ spin_lock_bh(&est_tree_lock);
+ if (!elist[idx].timer.function) {
+ INIT_LIST_HEAD(&elist[idx].list);
+ setup_timer(&elist[idx].timer, est_timer, idx);
+ }
+
+ if (list_empty(&elist[idx].list))
+ mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+
+ list_add_rcu(&est->list, &elist[idx].list);
+ gen_add_node(est);
+ spin_unlock_bh(&est_tree_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(gen_new_estimator);
+
+/**
+ * gen_kill_estimator - remove a rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Removes the rate estimator specified by &bstats and &rate_est.
+ *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
+ */
+void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_rate_est64 *rate_est)
+{
+ struct gen_estimator *e;
+
+ spin_lock_bh(&est_tree_lock);
+ while ((e = gen_find_node(bstats, rate_est))) {
+ rb_erase(&e->node, &est_root);
+
+ write_lock(&est_lock);
+ e->bstats = NULL;
+ write_unlock(&est_lock);
+
+ list_del_rcu(&e->list);
+ kfree_rcu(e, e_rcu);
+ }
+ spin_unlock_bh(&est_tree_lock);
+}
+EXPORT_SYMBOL(gen_kill_estimator);
+
+/**
+ * gen_replace_estimator - replace rate estimator configuration
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Replaces the configuration of a rate estimator by calling
+ * gen_kill_estimator() and gen_new_estimator().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+ struct gnet_stats_rate_est64 *rate_est,
+ spinlock_t *stats_lock, struct nlattr *opt)
+{
+ gen_kill_estimator(bstats, rate_est);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+}
+EXPORT_SYMBOL(gen_replace_estimator);
+
+/**
+ * gen_estimator_active - test if estimator is currently in use
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Returns true if estimator is active, and false if not.
+ */
+bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
+ const struct gnet_stats_rate_est64 *rate_est)
+{
+ bool res;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&est_tree_lock);
+ res = gen_find_node(bstats, rate_est) != NULL;
+ spin_unlock_bh(&est_tree_lock);
+
+ return res;
+}
+EXPORT_SYMBOL(gen_estimator_active);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
new file mode 100644
index 000000000..1e2f46a69
--- /dev/null
+++ b/net/core/gen_stats.c
@@ -0,0 +1,364 @@
+/*
+ * net/core/gen_stats.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Jamal Hadi Salim
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * See Documentation/networking/gen_stats.txt
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/gen_stats.h>
+#include <net/netlink.h>
+#include <net/gen_stats.h>
+
+
+static inline int
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+{
+ if (nla_put(d->skb, type, size, buf))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
+ spin_unlock_bh(d->lock);
+ return -1;
+}
+
+/**
+ * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
+ * @xstats_type: TLV type for backward compatibility xstats TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use a container for all
+ * other statistic TLVS.
+ *
+ * The dumping handle is marked to be in backward compatibility mode telling
+ * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
+ int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+ __acquires(lock)
+{
+ memset(d, 0, sizeof(*d));
+
+ spin_lock_bh(lock);
+ d->lock = lock;
+ if (type)
+ d->tail = (struct nlattr *)skb_tail_pointer(skb);
+ d->skb = skb;
+ d->compat_tc_stats = tc_stats_type;
+ d->compat_xstats = xstats_type;
+
+ if (d->tail)
+ return gnet_stats_copy(d, type, NULL, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
+
+/**
+ * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use a container for all
+ * other statistic TLVS.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
+ struct gnet_dump *d)
+{
+ return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+}
+EXPORT_SYMBOL(gnet_stats_start_copy);
+
+static void
+__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes;
+ u32 packets;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ bytes = bcpu->bstats.bytes;
+ packets = bcpu->bstats.packets;
+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+ bstats->bytes += bytes;
+ bstats->packets += packets;
+ }
+}
+
+void
+__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ if (cpu) {
+ __gnet_stats_copy_basic_cpu(bstats, cpu);
+ } else {
+ bstats->bytes = b->bytes;
+ bstats->packets = b->packets;
+ }
+}
+EXPORT_SYMBOL(__gnet_stats_copy_basic);
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @d: dumping handle
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ struct gnet_stats_basic_packed bstats = {0};
+
+ __gnet_stats_copy_basic(&bstats, cpu, b);
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.bytes = bstats.bytes;
+ d->tc_stats.packets = bstats.packets;
+ }
+
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = bstats.bytes;
+ sb.packets = bstats.packets;
+ return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic);
+
+/**
+ * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
+ * @d: dumping handle
+ * @b: basic statistics
+ * @r: rate estimator statistics
+ *
+ * Appends the rate estimator statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+ const struct gnet_stats_basic_packed *b,
+ struct gnet_stats_rate_est64 *r)
+{
+ struct gnet_stats_rate_est est;
+ int res;
+
+ if (b && !gen_estimator_active(b, r))
+ return 0;
+
+ est.bps = min_t(u64, UINT_MAX, r->bps);
+ /* we have some time before reaching 2^32 packets per second */
+ est.pps = r->pps;
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.bps = est.bps;
+ d->tc_stats.pps = est.pps;
+ }
+
+ if (d->tail) {
+ res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
+ if (res < 0 || est.bps == r->bps)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+
+static void
+__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
+
+ qstats->qlen = 0;
+ qstats->backlog += qcpu->backlog;
+ qstats->drops += qcpu->drops;
+ qstats->requeues += qcpu->requeues;
+ qstats->overlimits += qcpu->overlimits;
+ }
+}
+
+static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q,
+ __u32 qlen)
+{
+ if (cpu) {
+ __gnet_stats_copy_queue_cpu(qstats, cpu);
+ } else {
+ qstats->qlen = q->qlen;
+ qstats->backlog = q->backlog;
+ qstats->drops = q->drops;
+ qstats->requeues = q->requeues;
+ qstats->overlimits = q->overlimits;
+ }
+
+ qstats->qlen = qlen;
+}
+
+/**
+ * gnet_stats_copy_queue - copy queue statistics into statistics TLV
+ * @d: dumping handle
+ * @cpu_q: per cpu queue statistics
+ * @q: queue statistics
+ * @qlen: queue length statistics
+ *
+ * Appends the queue statistics to the top level TLV created by
+ * gnet_stats_start_copy(). Using per cpu queue statistics if
+ * they are available.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_queue(struct gnet_dump *d,
+ struct gnet_stats_queue __percpu *cpu_q,
+ struct gnet_stats_queue *q, __u32 qlen)
+{
+ struct gnet_stats_queue qstats = {0};
+
+ __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+
+ if (d->compat_tc_stats) {
+ d->tc_stats.drops = qstats.drops;
+ d->tc_stats.qlen = qstats.qlen;
+ d->tc_stats.backlog = qstats.backlog;
+ d->tc_stats.overlimits = qstats.overlimits;
+ }
+
+ if (d->tail)
+ return gnet_stats_copy(d, TCA_STATS_QUEUE,
+ &qstats, sizeof(qstats));
+
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_queue);
+
+/**
+ * gnet_stats_copy_app - copy application specific statistics into statistics TLV
+ * @d: dumping handle
+ * @st: application specific statistics data
+ * @len: length of data
+ *
+ * Appends the application specific statistics to the top level TLV created by
+ * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
+ * handle is in backward compatibility mode.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
+{
+ if (d->compat_xstats) {
+ d->xstats = kmemdup(st, len, GFP_ATOMIC);
+ if (!d->xstats)
+ goto err_out;
+ d->xstats_len = len;
+ }
+
+ if (d->tail)
+ return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+
+ return 0;
+
+err_out:
+ d->xstats_len = 0;
+ spin_unlock_bh(d->lock);
+ return -1;
+}
+EXPORT_SYMBOL(gnet_stats_copy_app);
+
+/**
+ * gnet_stats_finish_copy - finish dumping procedure
+ * @d: dumping handle
+ *
+ * Corrects the length of the top level TLV to include all TLVs added
+ * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
+ * if gnet_stats_start_copy_compat() was used and releases the statistics
+ * lock.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_finish_copy(struct gnet_dump *d)
+{
+ if (d->tail)
+ d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
+
+ if (d->compat_tc_stats)
+ if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
+ sizeof(d->tc_stats)) < 0)
+ return -1;
+
+ if (d->compat_xstats && d->xstats) {
+ if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
+ d->xstats_len) < 0)
+ return -1;
+ }
+
+ kfree(d->xstats);
+ d->xstats = NULL;
+ d->xstats_len = 0;
+ spin_unlock_bh(d->lock);
+ return 0;
+}
+EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
new file mode 100644
index 000000000..982861607
--- /dev/null
+++ b/net/core/link_watch.c
@@ -0,0 +1,253 @@
+/*
+ * Linux network device link state notification
+ *
+ * Author:
+ * Stefan Rompf <sux@loplof.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+
+
+enum lw_bits {
+ LW_URGENT = 0,
+};
+
+static unsigned long linkwatch_flags;
+static unsigned long linkwatch_nextevent;
+
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
+
+static LIST_HEAD(lweventlist);
+static DEFINE_SPINLOCK(lweventlist_lock);
+
+static unsigned char default_operstate(const struct net_device *dev)
+{
+ if (!netif_carrier_ok(dev))
+ return (dev->ifindex != dev_get_iflink(dev) ?
+ IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+
+ if (netif_dormant(dev))
+ return IF_OPER_DORMANT;
+
+ return IF_OPER_UP;
+}
+
+
+static void rfc2863_policy(struct net_device *dev)
+{
+ unsigned char operstate = default_operstate(dev);
+
+ if (operstate == dev->operstate)
+ return;
+
+ write_lock_bh(&dev_base_lock);
+
+ switch(dev->link_mode) {
+ case IF_LINK_MODE_DORMANT:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_DORMANT;
+ break;
+
+ case IF_LINK_MODE_DEFAULT:
+ default:
+ break;
+ }
+
+ dev->operstate = operstate;
+
+ write_unlock_bh(&dev_base_lock);
+}
+
+
+void linkwatch_init_dev(struct net_device *dev)
+{
+ /* Handle pre-registration link state changes */
+ if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ rfc2863_policy(dev);
+}
+
+
+static bool linkwatch_urgent_event(struct net_device *dev)
+{
+ if (!netif_running(dev))
+ return false;
+
+ if (dev->ifindex != dev_get_iflink(dev))
+ return true;
+
+ if (dev->priv_flags & IFF_TEAM_PORT)
+ return true;
+
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
+}
+
+
+static void linkwatch_add_event(struct net_device *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (list_empty(&dev->link_watch_list)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ dev_hold(dev);
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+}
+
+
+static void linkwatch_schedule_work(int urgent)
+{
+ unsigned long delay = linkwatch_nextevent - jiffies;
+
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ return;
+
+ /* Minimise down-time: drop delay for up event. */
+ if (urgent) {
+ if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+ return;
+ delay = 0;
+ }
+
+ /* If we wrap around we'll delay it by at most HZ. */
+ if (delay > HZ)
+ delay = 0;
+
+ /*
+ * If urgent, schedule immediate execution; otherwise, don't
+ * override the existing timer.
+ */
+ if (test_bit(LW_URGENT, &linkwatch_flags))
+ mod_delayed_work(system_wq, &linkwatch_work, 0);
+ else
+ schedule_delayed_work(&linkwatch_work, delay);
+}
+
+
+static void linkwatch_do_dev(struct net_device *dev)
+{
+ /*
+ * Make sure the above read is complete since it can be
+ * rewritten as soon as we clear the bit below.
+ */
+ smp_mb__before_atomic();
+
+ /* We are about to handle this device,
+ * so new events can be accepted
+ */
+ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+ rfc2863_policy(dev);
+ if (dev->flags & IFF_UP) {
+ if (netif_carrier_ok(dev))
+ dev_activate(dev);
+ else
+ dev_deactivate(dev);
+
+ netdev_state_change(dev);
+ }
+ dev_put(dev);
+}
+
+static void __linkwatch_run_queue(int urgent_only)
+{
+ struct net_device *dev;
+ LIST_HEAD(wrk);
+
+ /*
+ * Limit the number of linkwatch events to one
+ * per second so that a runaway driver does not
+ * cause a storm of messages on the netlink
+ * socket. This limit does not apply to up events
+ * while the device qdisc is down.
+ */
+ if (!urgent_only)
+ linkwatch_nextevent = jiffies + HZ;
+ /* Limit wrap-around effect on delay. */
+ else if (time_after(linkwatch_nextevent, jiffies + HZ))
+ linkwatch_nextevent = jiffies;
+
+ clear_bit(LW_URGENT, &linkwatch_flags);
+
+ spin_lock_irq(&lweventlist_lock);
+ list_splice_init(&lweventlist, &wrk);
+
+ while (!list_empty(&wrk)) {
+
+ dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+ list_del_init(&dev->link_watch_list);
+
+ if (urgent_only && !linkwatch_urgent_event(dev)) {
+ list_add_tail(&dev->link_watch_list, &lweventlist);
+ continue;
+ }
+ spin_unlock_irq(&lweventlist_lock);
+ linkwatch_do_dev(dev);
+ spin_lock_irq(&lweventlist_lock);
+ }
+
+ if (!list_empty(&lweventlist))
+ linkwatch_schedule_work(0);
+ spin_unlock_irq(&lweventlist_lock);
+}
+
+void linkwatch_forget_dev(struct net_device *dev)
+{
+ unsigned long flags;
+ int clean = 0;
+
+ spin_lock_irqsave(&lweventlist_lock, flags);
+ if (!list_empty(&dev->link_watch_list)) {
+ list_del_init(&dev->link_watch_list);
+ clean = 1;
+ }
+ spin_unlock_irqrestore(&lweventlist_lock, flags);
+ if (clean)
+ linkwatch_do_dev(dev);
+}
+
+
+/* Must be called with the rtnl semaphore held */
+void linkwatch_run_queue(void)
+{
+ __linkwatch_run_queue(0);
+}
+
+
+static void linkwatch_event(struct work_struct *dummy)
+{
+ rtnl_lock();
+ __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
+ rtnl_unlock();
+}
+
+
+void linkwatch_fire_event(struct net_device *dev)
+{
+ bool urgent = linkwatch_urgent_event(dev);
+
+ if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+ linkwatch_add_event(dev);
+ } else if (!urgent)
+ return;
+
+ linkwatch_schedule_work(urgent);
+}
+EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
new file mode 100644
index 000000000..2237c1b3c
--- /dev/null
+++ b/net/core/neighbour.c
@@ -0,0 +1,3172 @@
+/*
+ * Generic address resolution entity
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
+ * Harald Welte Add neighbour cache statistics like rtstat
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/log2.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+
+#define DEBUG
+#define NEIGH_DEBUG 1
+#define neigh_dbg(level, fmt, ...) \
+do { \
+ if (level <= NEIGH_DEBUG) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define PNEIGH_HASHMASK 0xF
+
+static void neigh_timer_handler(unsigned long arg);
+static void __neigh_notify(struct neighbour *n, int type, int flags);
+static void neigh_update_notify(struct neighbour *neigh);
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations neigh_stat_seq_fops;
+#endif
+
+/*
+ Neighbour hash table buckets are protected with rwlock tbl->lock.
+
+ - All the scans/updates to hash buckets MUST be made under this lock.
+ - NOTHING clever should be made under this lock: no callbacks
+ to protocol backends, no attempts to send something to network.
+ It will result in deadlocks, if backend/driver wants to use neighbour
+ cache.
+ - If the entry requires some non-trivial actions, increase
+ its reference count and release table lock.
+
+ Neighbour entries are protected:
+ - with reference count.
+ - with rwlock neigh->lock
+
+ Reference count prevents destruction.
+
+ neigh->lock mainly serializes ll address data and its validity state.
+ However, the same lock is used to protect another entry fields:
+ - timer
+ - resolution queue
+
+ Again, nothing clever shall be made under neigh->lock,
+ the most complicated procedure, which we allow is dev->hard_header.
+ It is supposed, that dev->hard_header is simplistic and does
+ not make callbacks to neighbour tables.
+ */
+
+static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+ if (neigh->parms->neigh_cleanup)
+ neigh->parms->neigh_cleanup(neigh);
+
+ __neigh_notify(neigh, RTM_DELNEIGH, 0);
+ neigh_release(neigh);
+}
+
+/*
+ * It is random distribution in the interval (1/2)*base...(3/2)*base.
+ * It corresponds to default IPv6 settings and is not overridable,
+ * because it is really reasonable choice.
+ */
+
+unsigned long neigh_rand_reach_time(unsigned long base)
+{
+ return base ? (prandom_u32() % base) + (base >> 1) : 0;
+}
+EXPORT_SYMBOL(neigh_rand_reach_time);
+
+
+static int neigh_forced_gc(struct neigh_table *tbl)
+{
+ int shrunk = 0;
+ int i;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ np = &nht->hash_buckets[i];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ /* Neighbour record may be discarded if:
+ * - nobody refers to it.
+ * - it is not permanent
+ */
+ write_lock(&n->lock);
+ if (atomic_read(&n->refcnt) == 1 &&
+ !(n->nud_state & NUD_PERMANENT)) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
+ n->dead = 1;
+ shrunk = 1;
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ continue;
+ }
+ write_unlock(&n->lock);
+ np = &n->next;
+ }
+ }
+
+ tbl->last_flush = jiffies;
+
+ write_unlock_bh(&tbl->lock);
+
+ return shrunk;
+}
+
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+ neigh_hold(n);
+ if (unlikely(mod_timer(&n->timer, when))) {
+ printk("NEIGH: BUG, double timer add, state is %x\n",
+ n->nud_state);
+ dump_stack();
+ }
+}
+
+static int neigh_del_timer(struct neighbour *n)
+{
+ if ((n->nud_state & NUD_IN_TIMER) &&
+ del_timer(&n->timer)) {
+ neigh_release(n);
+ return 1;
+ }
+ return 0;
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(list)) != NULL) {
+ dev_put(skb->dev);
+ kfree_skb(skb);
+ }
+}
+
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+{
+ int i;
+ struct neigh_hash_table *nht;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ if (dev && n->dev != dev) {
+ np = &n->next;
+ continue;
+ }
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
+ write_lock(&n->lock);
+ neigh_del_timer(n);
+ n->dead = 1;
+
+ if (atomic_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ We must destroy neighbour entry,
+ but someone still uses it.
+
+ The destroy will be delayed until
+ the last user releases us, but
+ we must kill timers etc. and move
+ it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ n->output = neigh_blackhole;
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ neigh_dbg(2, "neigh %p is stray\n", n);
+ }
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ }
+ }
+}
+
+void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
+{
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
+ write_unlock_bh(&tbl->lock);
+}
+EXPORT_SYMBOL(neigh_changeaddr);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ write_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev);
+ pneigh_ifdown(tbl, dev);
+ write_unlock_bh(&tbl->lock);
+
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_ifdown);
+
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
+{
+ struct neighbour *n = NULL;
+ unsigned long now = jiffies;
+ int entries;
+
+ entries = atomic_inc_return(&tbl->entries) - 1;
+ if (entries >= tbl->gc_thresh3 ||
+ (entries >= tbl->gc_thresh2 &&
+ time_after(now, tbl->last_flush + 5 * HZ))) {
+ if (!neigh_forced_gc(tbl) &&
+ entries >= tbl->gc_thresh3)
+ goto out_entries;
+ }
+
+ n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
+ if (!n)
+ goto out_entries;
+
+ __skb_queue_head_init(&n->arp_queue);
+ rwlock_init(&n->lock);
+ seqlock_init(&n->ha_lock);
+ n->updated = n->used = now;
+ n->nud_state = NUD_NONE;
+ n->output = neigh_blackhole;
+ seqlock_init(&n->hh.hh_lock);
+ n->parms = neigh_parms_clone(&tbl->parms);
+ setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
+
+ NEIGH_CACHE_STAT_INC(tbl, allocs);
+ n->tbl = tbl;
+ atomic_set(&n->refcnt, 1);
+ n->dead = 1;
+out:
+ return n;
+
+out_entries:
+ atomic_dec(&tbl->entries);
+ goto out;
+}
+
+static void neigh_get_hash_rnd(u32 *x)
+{
+ get_random_bytes(x, sizeof(*x));
+ *x |= 1;
+}
+
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
+{
+ size_t size = (1 << shift) * sizeof(struct neighbour *);
+ struct neigh_hash_table *ret;
+ struct neighbour __rcu **buckets;
+ int i;
+
+ ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret)
+ return NULL;
+ if (size <= PAGE_SIZE)
+ buckets = kzalloc(size, GFP_ATOMIC);
+ else
+ buckets = (struct neighbour __rcu **)
+ __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+ get_order(size));
+ if (!buckets) {
+ kfree(ret);
+ return NULL;
+ }
+ ret->hash_buckets = buckets;
+ ret->hash_shift = shift;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
+ return ret;
+}
+
+static void neigh_hash_free_rcu(struct rcu_head *head)
+{
+ struct neigh_hash_table *nht = container_of(head,
+ struct neigh_hash_table,
+ rcu);
+ size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
+ struct neighbour __rcu **buckets = nht->hash_buckets;
+
+ if (size <= PAGE_SIZE)
+ kfree(buckets);
+ else
+ free_pages((unsigned long)buckets, get_order(size));
+ kfree(nht);
+}
+
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+ unsigned long new_shift)
+{
+ unsigned int i, hash;
+ struct neigh_hash_table *new_nht, *old_nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, hash_grows);
+
+ old_nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ new_nht = neigh_hash_alloc(new_shift);
+ if (!new_nht)
+ return old_nht;
+
+ for (i = 0; i < (1 << old_nht->hash_shift); i++) {
+ struct neighbour *n, *next;
+
+ for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+ lockdep_is_held(&tbl->lock));
+ n != NULL;
+ n = next) {
+ hash = tbl->hash(n->primary_key, n->dev,
+ new_nht->hash_rnd);
+
+ hash >>= (32 - new_nht->hash_shift);
+ next = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(
+ new_nht->hash_buckets[hash],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ }
+ }
+
+ rcu_assign_pointer(tbl->nht, new_nht);
+ call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+ return new_nht;
+}
+
+struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev)
+{
+ struct neighbour *n;
+
+ NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+ rcu_read_lock_bh();
+ n = __neigh_lookup_noref(tbl, pkey, dev);
+ if (n) {
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
+ }
+
+ rcu_read_unlock_bh();
+ return n;
+}
+EXPORT_SYMBOL(neigh_lookup);
+
+struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
+ const void *pkey)
+{
+ struct neighbour *n;
+ int key_len = tbl->key_len;
+ u32 hash_val;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (!memcmp(n->primary_key, pkey, key_len) &&
+ net_eq(dev_net(n->dev), net)) {
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
+ NEIGH_CACHE_STAT_INC(tbl, hits);
+ break;
+ }
+ }
+
+ rcu_read_unlock_bh();
+ return n;
+}
+EXPORT_SYMBOL(neigh_lookup_nodev);
+
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
+{
+ u32 hash_val;
+ int key_len = tbl->key_len;
+ int error;
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
+ struct neigh_hash_table *nht;
+
+ if (!n) {
+ rc = ERR_PTR(-ENOBUFS);
+ goto out;
+ }
+
+ memcpy(n->primary_key, pkey, key_len);
+ n->dev = dev;
+ dev_hold(dev);
+
+ /* Protocol specific setup. */
+ if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
+ /* Device specific setup. */
+ if (n->parms->neigh_setup &&
+ (error = n->parms->neigh_setup(n)) < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+
+ n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
+ nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
+
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+
+ if (n->parms->dead) {
+ rc = ERR_PTR(-EINVAL);
+ goto out_tbl_unlock;
+ }
+
+ for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock));
+ n1 != NULL;
+ n1 = rcu_dereference_protected(n1->next,
+ lockdep_is_held(&tbl->lock))) {
+ if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
+ if (want_ref)
+ neigh_hold(n1);
+ rc = n1;
+ goto out_tbl_unlock;
+ }
+ }
+
+ n->dead = 0;
+ if (want_ref)
+ neigh_hold(n);
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+ write_unlock_bh(&tbl->lock);
+ neigh_dbg(2, "neigh %p is created\n", n);
+ rc = n;
+out:
+ return rc;
+out_tbl_unlock:
+ write_unlock_bh(&tbl->lock);
+out_neigh_release:
+ neigh_release(n);
+ goto out;
+}
+EXPORT_SYMBOL(__neigh_create);
+
+static u32 pneigh_hash(const void *pkey, int key_len)
+{
+ u32 hash_val = *(u32 *)(pkey + key_len - 4);
+ hash_val ^= (hash_val >> 16);
+ hash_val ^= hash_val >> 8;
+ hash_val ^= hash_val >> 4;
+ hash_val &= PNEIGH_HASHMASK;
+ return hash_val;
+}
+
+static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
+ struct net *net,
+ const void *pkey,
+ int key_len,
+ struct net_device *dev)
+{
+ while (n) {
+ if (!memcmp(n->key, pkey, key_len) &&
+ net_eq(pneigh_net(n), net) &&
+ (n->dev == dev || !n->dev))
+ return n;
+ n = n->next;
+ }
+ return NULL;
+}
+
+struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey, struct net_device *dev)
+{
+ int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
+}
+EXPORT_SYMBOL_GPL(__pneigh_lookup);
+
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey,
+ struct net_device *dev, int creat)
+{
+ struct pneigh_entry *n;
+ int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ read_lock_bh(&tbl->lock);
+ n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+ net, pkey, key_len, dev);
+ read_unlock_bh(&tbl->lock);
+
+ if (n || !creat)
+ goto out;
+
+ ASSERT_RTNL();
+
+ n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ if (!n)
+ goto out;
+
+ write_pnet(&n->net, net);
+ memcpy(n->key, pkey, key_len);
+ n->dev = dev;
+ if (dev)
+ dev_hold(dev);
+
+ if (tbl->pconstructor && tbl->pconstructor(n)) {
+ if (dev)
+ dev_put(dev);
+ kfree(n);
+ n = NULL;
+ goto out;
+ }
+
+ write_lock_bh(&tbl->lock);
+ n->next = tbl->phash_buckets[hash_val];
+ tbl->phash_buckets[hash_val] = n;
+ write_unlock_bh(&tbl->lock);
+out:
+ return n;
+}
+EXPORT_SYMBOL(pneigh_lookup);
+
+
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
+ struct net_device *dev)
+{
+ struct pneigh_entry *n, **np;
+ int key_len = tbl->key_len;
+ u32 hash_val = pneigh_hash(pkey, key_len);
+
+ write_lock_bh(&tbl->lock);
+ for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
+ np = &n->next) {
+ if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
+ net_eq(pneigh_net(n), net)) {
+ *np = n->next;
+ write_unlock_bh(&tbl->lock);
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ return 0;
+ }
+ }
+ write_unlock_bh(&tbl->lock);
+ return -ENOENT;
+}
+
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ struct pneigh_entry *n, **np;
+ u32 h;
+
+ for (h = 0; h <= PNEIGH_HASHMASK; h++) {
+ np = &tbl->phash_buckets[h];
+ while ((n = *np) != NULL) {
+ if (!dev || n->dev == dev) {
+ *np = n->next;
+ if (tbl->pdestructor)
+ tbl->pdestructor(n);
+ if (n->dev)
+ dev_put(n->dev);
+ kfree(n);
+ continue;
+ }
+ np = &n->next;
+ }
+ }
+ return -ENOENT;
+}
+
+static void neigh_parms_destroy(struct neigh_parms *parms);
+
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+ if (atomic_dec_and_test(&parms->refcnt))
+ neigh_parms_destroy(parms);
+}
+
+/*
+ * neighbour must already be out of the table;
+ *
+ */
+void neigh_destroy(struct neighbour *neigh)
+{
+ struct net_device *dev = neigh->dev;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
+
+ if (!neigh->dead) {
+ pr_warn("Destroying alive neighbour %p\n", neigh);
+ dump_stack();
+ return;
+ }
+
+ if (neigh_del_timer(neigh))
+ pr_warn("Impossible event\n");
+
+ write_lock_bh(&neigh->lock);
+ __skb_queue_purge(&neigh->arp_queue);
+ write_unlock_bh(&neigh->lock);
+ neigh->arp_queue_len_bytes = 0;
+
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
+
+ dev_put(dev);
+ neigh_parms_put(neigh->parms);
+
+ neigh_dbg(2, "neigh %p is destroyed\n", neigh);
+
+ atomic_dec(&neigh->tbl->entries);
+ kfree_rcu(neigh, rcu);
+}
+EXPORT_SYMBOL(neigh_destroy);
+
+/* Neighbour state is suspicious;
+ disable fast path.
+
+ Called with write_locked neigh.
+ */
+static void neigh_suspect(struct neighbour *neigh)
+{
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
+
+ neigh->output = neigh->ops->output;
+}
+
+/* Neighbour state is OK;
+ enable fast path.
+
+ Called with write_locked neigh.
+ */
+static void neigh_connect(struct neighbour *neigh)
+{
+ neigh_dbg(2, "neigh %p is connected\n", neigh);
+
+ neigh->output = neigh->ops->connected_output;
+}
+
+static void neigh_periodic_work(struct work_struct *work)
+{
+ struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+ unsigned int i;
+ struct neigh_hash_table *nht;
+
+ NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
+
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+
+ /*
+ * periodically recompute ReachableTime from random function
+ */
+
+ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
+ struct neigh_parms *p;
+ tbl->last_rand = jiffies;
+ list_for_each_entry(p, &tbl->parms_list, list)
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ }
+
+ if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ goto out;
+
+ for (i = 0 ; i < (1 << nht->hash_shift); i++) {
+ np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ unsigned int state;
+
+ write_lock(&n->lock);
+
+ state = n->nud_state;
+ if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ write_unlock(&n->lock);
+ goto next_elt;
+ }
+
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
+
+ if (atomic_read(&n->refcnt) == 1 &&
+ (state == NUD_FAILED ||
+ time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ *np = n->next;
+ n->dead = 1;
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
+ continue;
+ }
+ write_unlock(&n->lock);
+
+next_elt:
+ np = &n->next;
+ }
+ /*
+ * It's fine to release lock here, even if hash table
+ * grows while we are preempted.
+ */
+ write_unlock_bh(&tbl->lock);
+ cond_resched();
+ write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ }
+out:
+ /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
+ * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
+ * BASE_REACHABLE_TIME.
+ */
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
+ write_unlock_bh(&tbl->lock);
+}
+
+static __inline__ int neigh_max_probes(struct neighbour *n)
+{
+ struct neigh_parms *p = n->parms;
+ return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
+ (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) :
+ NEIGH_VAR(p, MCAST_PROBES));
+}
+
+static void neigh_invalidate(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ struct sk_buff *skb;
+
+ NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+ neigh_dbg(2, "neigh %p is failed\n", neigh);
+ neigh->updated = jiffies;
+
+ /* It is very thin place. report_unreachable is very complicated
+ routine. Particularly, it can hit the same neighbour entry!
+
+ So that, we try to be accurate and avoid dead loop. --ANK
+ */
+ while (neigh->nud_state == NUD_FAILED &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ write_unlock(&neigh->lock);
+ neigh->ops->error_report(neigh, skb);
+ write_lock(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_copy(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ kfree_skb(skb);
+}
+
+/* Called when a timer expires for a neighbour entry. */
+
+static void neigh_timer_handler(unsigned long arg)
+{
+ unsigned long now, next;
+ struct neighbour *neigh = (struct neighbour *)arg;
+ unsigned int state;
+ int notify = 0;
+
+ write_lock(&neigh->lock);
+
+ state = neigh->nud_state;
+ now = jiffies;
+ next = now + HZ;
+
+ if (!(state & NUD_IN_TIMER))
+ goto out;
+
+ if (state & NUD_REACHABLE) {
+ if (time_before_eq(now,
+ neigh->confirmed + neigh->parms->reachable_time)) {
+ neigh_dbg(2, "neigh %p is still alive\n", neigh);
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else if (time_before_eq(now,
+ neigh->used +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
+ } else {
+ neigh_dbg(2, "neigh %p is suspected\n", neigh);
+ neigh->nud_state = NUD_STALE;
+ neigh->updated = jiffies;
+ neigh_suspect(neigh);
+ notify = 1;
+ }
+ } else if (state & NUD_DELAY) {
+ if (time_before_eq(now,
+ neigh->confirmed +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
+ neigh_dbg(2, "neigh %p is now reachable\n", neigh);
+ neigh->nud_state = NUD_REACHABLE;
+ neigh->updated = jiffies;
+ neigh_connect(neigh);
+ notify = 1;
+ next = neigh->confirmed + neigh->parms->reachable_time;
+ } else {
+ neigh_dbg(2, "neigh %p is probed\n", neigh);
+ neigh->nud_state = NUD_PROBE;
+ neigh->updated = jiffies;
+ atomic_set(&neigh->probes, 0);
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ }
+ } else {
+ /* NUD_PROBE|NUD_INCOMPLETE */
+ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ }
+
+ if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
+ neigh->nud_state = NUD_FAILED;
+ notify = 1;
+ neigh_invalidate(neigh);
+ goto out;
+ }
+
+ if (neigh->nud_state & NUD_IN_TIMER) {
+ if (time_before(next, jiffies + HZ/2))
+ next = jiffies + HZ/2;
+ if (!mod_timer(&neigh->timer, next))
+ neigh_hold(neigh);
+ }
+ if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
+ neigh_probe(neigh);
+ } else {
+out:
+ write_unlock(&neigh->lock);
+ }
+
+ if (notify)
+ neigh_update_notify(neigh);
+
+ neigh_release(neigh);
+}
+
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+ int rc;
+ bool immediate_probe = false;
+
+ write_lock_bh(&neigh->lock);
+
+ rc = 0;
+ if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
+ goto out_unlock_bh;
+ if (neigh->dead)
+ goto out_dead;
+
+ if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
+ if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
+ NEIGH_VAR(neigh->parms, APP_PROBES)) {
+ unsigned long next, now = jiffies;
+
+ atomic_set(&neigh->probes,
+ NEIGH_VAR(neigh->parms, UCAST_PROBES));
+ neigh->nud_state = NUD_INCOMPLETE;
+ neigh->updated = now;
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/2);
+ neigh_add_timer(neigh, next);
+ immediate_probe = true;
+ } else {
+ neigh->nud_state = NUD_FAILED;
+ neigh->updated = jiffies;
+ write_unlock_bh(&neigh->lock);
+
+ kfree_skb(skb);
+ return 1;
+ }
+ } else if (neigh->nud_state & NUD_STALE) {
+ neigh_dbg(2, "neigh %p is delayed\n", neigh);
+ neigh->nud_state = NUD_DELAY;
+ neigh->updated = jiffies;
+ neigh_add_timer(neigh, jiffies +
+ NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
+ }
+
+ if (neigh->nud_state == NUD_INCOMPLETE) {
+ if (skb) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
+ struct sk_buff *buff;
+
+ buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
+ kfree_skb(buff);
+ NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
+ }
+ skb_dst_force(skb);
+ __skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
+ }
+ rc = 1;
+ }
+out_unlock_bh:
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
+ return rc;
+
+out_dead:
+ if (neigh->nud_state & NUD_STALE)
+ goto out_unlock_bh;
+ write_unlock_bh(&neigh->lock);
+ kfree_skb(skb);
+ return 1;
+}
+EXPORT_SYMBOL(__neigh_event_send);
+
+static void neigh_update_hhs(struct neighbour *neigh)
+{
+ struct hh_cache *hh;
+ void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+ = NULL;
+
+ if (neigh->dev->header_ops)
+ update = neigh->dev->header_ops->cache_update;
+
+ if (update) {
+ hh = &neigh->hh;
+ if (hh->hh_len) {
+ write_seqlock_bh(&hh->hh_lock);
+ update(hh, neigh->dev, neigh->ha);
+ write_sequnlock_bh(&hh->hh_lock);
+ }
+ }
+}
+
+
+
+/* Generic update routine.
+ -- lladdr is new lladdr or NULL, if it is not supplied.
+ -- new is new state.
+ -- flags
+ NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
+ if it is different.
+ NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
+ lladdr instead of overriding it
+ if it is different.
+ It also allows to retain current state
+ if lladdr is unchanged.
+ NEIGH_UPDATE_F_ADMIN means that the change is administrative.
+
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
+ NTF_ROUTER flag.
+ NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
+ a router.
+
+ Caller MUST hold reference count on the entry.
+ */
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags)
+{
+ u8 old;
+ int err;
+ int notify = 0;
+ struct net_device *dev;
+ int update_isrouter = 0;
+
+ write_lock_bh(&neigh->lock);
+
+ dev = neigh->dev;
+ old = neigh->nud_state;
+ err = -EPERM;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
+ if (neigh->dead)
+ goto out;
+
+ if (!(new & NUD_VALID)) {
+ neigh_del_timer(neigh);
+ if (old & NUD_CONNECTED)
+ neigh_suspect(neigh);
+ neigh->nud_state = new;
+ err = 0;
+ notify = old & NUD_VALID;
+ if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ (new & NUD_FAILED)) {
+ neigh_invalidate(neigh);
+ notify = 1;
+ }
+ goto out;
+ }
+
+ /* Compare new lladdr with cached one */
+ if (!dev->addr_len) {
+ /* First case: device needs no address. */
+ lladdr = neigh->ha;
+ } else if (lladdr) {
+ /* The second case: if something is already cached
+ and a new address is proposed:
+ - compare new & old
+ - if they are different, check override flag
+ */
+ if ((old & NUD_VALID) &&
+ !memcmp(lladdr, neigh->ha, dev->addr_len))
+ lladdr = neigh->ha;
+ } else {
+ /* No address is supplied; if we know something,
+ use it, otherwise discard the request.
+ */
+ err = -EINVAL;
+ if (!(old & NUD_VALID))
+ goto out;
+ lladdr = neigh->ha;
+ }
+
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+
+ /* If entry was valid and address is not changed,
+ do not change entry state, if new one is STALE.
+ */
+ err = 0;
+ update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+ if (old & NUD_VALID) {
+ if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
+ update_isrouter = 0;
+ if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
+ (old & NUD_CONNECTED)) {
+ lladdr = neigh->ha;
+ new = NUD_STALE;
+ } else
+ goto out;
+ } else {
+ if (lladdr == neigh->ha && new == NUD_STALE &&
+ ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
+ (old & NUD_CONNECTED))
+ )
+ new = old;
+ }
+ }
+
+ if (new != old) {
+ neigh_del_timer(neigh);
+ if (new & NUD_IN_TIMER)
+ neigh_add_timer(neigh, (jiffies +
+ ((new & NUD_REACHABLE) ?
+ neigh->parms->reachable_time :
+ 0)));
+ neigh->nud_state = new;
+ notify = 1;
+ }
+
+ if (lladdr != neigh->ha) {
+ write_seqlock(&neigh->ha_lock);
+ memcpy(&neigh->ha, lladdr, dev->addr_len);
+ write_sequnlock(&neigh->ha_lock);
+ neigh_update_hhs(neigh);
+ if (!(new & NUD_CONNECTED))
+ neigh->confirmed = jiffies -
+ (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
+ notify = 1;
+ }
+ if (new == old)
+ goto out;
+ if (new & NUD_CONNECTED)
+ neigh_connect(neigh);
+ else
+ neigh_suspect(neigh);
+ if (!(old & NUD_VALID)) {
+ struct sk_buff *skb;
+
+ /* Again: avoid dead loop if something went wrong */
+
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
+ write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ n1->output(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
+ write_lock_bh(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+ }
+out:
+ if (update_isrouter) {
+ neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
+ (neigh->flags | NTF_ROUTER) :
+ (neigh->flags & ~NTF_ROUTER);
+ }
+ write_unlock_bh(&neigh->lock);
+
+ if (notify)
+ neigh_update_notify(neigh);
+
+ return err;
+}
+EXPORT_SYMBOL(neigh_update);
+
+/* Update the neigh to listen temporarily for probe responses, even if it is
+ * in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
+ */
+void __neigh_set_probe_once(struct neighbour *neigh)
+{
+ if (neigh->dead)
+ return;
+ neigh->updated = jiffies;
+ if (!(neigh->nud_state & NUD_FAILED))
+ return;
+ neigh->nud_state = NUD_INCOMPLETE;
+ atomic_set(&neigh->probes, neigh_max_probes(neigh));
+ neigh_add_timer(neigh,
+ jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+}
+EXPORT_SYMBOL(__neigh_set_probe_once);
+
+struct neighbour *neigh_event_ns(struct neigh_table *tbl,
+ u8 *lladdr, void *saddr,
+ struct net_device *dev)
+{
+ struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
+ lladdr || !dev->addr_len);
+ if (neigh)
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_OVERRIDE);
+ return neigh;
+}
+EXPORT_SYMBOL(neigh_event_ns);
+
+/* called with read_lock_bh(&n->lock); */
+static void neigh_hh_init(struct neighbour *n)
+{
+ struct net_device *dev = n->dev;
+ __be16 prot = n->tbl->protocol;
+ struct hh_cache *hh = &n->hh;
+
+ write_lock_bh(&n->lock);
+
+ /* Only one thread can come in here and initialize the
+ * hh_cache entry.
+ */
+ if (!hh->hh_len)
+ dev->header_ops->cache(n, hh, prot);
+
+ write_unlock_bh(&n->lock);
+}
+
+/* Slow and careful. */
+
+int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ int rc = 0;
+
+ if (!neigh_event_send(neigh, skb)) {
+ int err;
+ struct net_device *dev = neigh->dev;
+ unsigned int seq;
+
+ if (dev->header_ops->cache && !neigh->hh.hh_len)
+ neigh_hh_init(neigh);
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
+ if (err >= 0)
+ rc = dev_queue_xmit(skb);
+ else
+ goto out_kfree_skb;
+ }
+out:
+ return rc;
+out_kfree_skb:
+ rc = -EINVAL;
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_resolve_output);
+
+/* As fast as possible without hh cache */
+
+int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ struct net_device *dev = neigh->dev;
+ unsigned int seq;
+ int err;
+
+ do {
+ __skb_pull(skb, skb_network_offset(skb));
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
+ if (err >= 0)
+ err = dev_queue_xmit(skb);
+ else {
+ err = -EINVAL;
+ kfree_skb(skb);
+ }
+ return err;
+}
+EXPORT_SYMBOL(neigh_connected_output);
+
+int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_direct_output);
+
+static void neigh_proxy_process(unsigned long arg)
+{
+ struct neigh_table *tbl = (struct neigh_table *)arg;
+ long sched_next = 0;
+ unsigned long now = jiffies;
+ struct sk_buff *skb, *n;
+
+ spin_lock(&tbl->proxy_queue.lock);
+
+ skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+ long tdif = NEIGH_CB(skb)->sched_next - now;
+
+ if (tdif <= 0) {
+ struct net_device *dev = skb->dev;
+
+ __skb_unlink(skb, &tbl->proxy_queue);
+ if (tbl->proxy_redo && netif_running(dev)) {
+ rcu_read_lock();
+ tbl->proxy_redo(skb);
+ rcu_read_unlock();
+ } else {
+ kfree_skb(skb);
+ }
+
+ dev_put(dev);
+ } else if (!sched_next || tdif < sched_next)
+ sched_next = tdif;
+ }
+ del_timer(&tbl->proxy_timer);
+ if (sched_next)
+ mod_timer(&tbl->proxy_timer, jiffies + sched_next);
+ spin_unlock(&tbl->proxy_queue.lock);
+}
+
+void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+ struct sk_buff *skb)
+{
+ unsigned long now = jiffies;
+
+ unsigned long sched_next = now + (prandom_u32() %
+ NEIGH_VAR(p, PROXY_DELAY));
+
+ if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ NEIGH_CB(skb)->sched_next = sched_next;
+ NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
+
+ spin_lock(&tbl->proxy_queue.lock);
+ if (del_timer(&tbl->proxy_timer)) {
+ if (time_before(tbl->proxy_timer.expires, sched_next))
+ sched_next = tbl->proxy_timer.expires;
+ }
+ skb_dst_drop(skb);
+ dev_hold(skb->dev);
+ __skb_queue_tail(&tbl->proxy_queue, skb);
+ mod_timer(&tbl->proxy_timer, sched_next);
+ spin_unlock(&tbl->proxy_queue.lock);
+}
+EXPORT_SYMBOL(pneigh_enqueue);
+
+static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
+ struct net *net, int ifindex)
+{
+ struct neigh_parms *p;
+
+ list_for_each_entry(p, &tbl->parms_list, list) {
+ if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
+ (!p->dev && !ifindex && net_eq(net, &init_net)))
+ return p;
+ }
+
+ return NULL;
+}
+
+struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
+ struct neigh_table *tbl)
+{
+ struct neigh_parms *p;
+ struct net *net = dev_net(dev);
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
+ if (p) {
+ p->tbl = tbl;
+ atomic_set(&p->refcnt, 1);
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ dev_hold(dev);
+ p->dev = dev;
+ write_pnet(&p->net, net);
+ p->sysctl_table = NULL;
+
+ if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+ dev_put(dev);
+ kfree(p);
+ return NULL;
+ }
+
+ write_lock_bh(&tbl->lock);
+ list_add(&p->list, &tbl->parms.list);
+ write_unlock_bh(&tbl->lock);
+
+ neigh_parms_data_state_cleanall(p);
+ }
+ return p;
+}
+EXPORT_SYMBOL(neigh_parms_alloc);
+
+static void neigh_rcu_free_parms(struct rcu_head *head)
+{
+ struct neigh_parms *parms =
+ container_of(head, struct neigh_parms, rcu_head);
+
+ neigh_parms_put(parms);
+}
+
+void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
+{
+ if (!parms || parms == &tbl->parms)
+ return;
+ write_lock_bh(&tbl->lock);
+ list_del(&parms->list);
+ parms->dead = 1;
+ write_unlock_bh(&tbl->lock);
+ if (parms->dev)
+ dev_put(parms->dev);
+ call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
+}
+EXPORT_SYMBOL(neigh_parms_release);
+
+static void neigh_parms_destroy(struct neigh_parms *parms)
+{
+ kfree(parms);
+}
+
+static struct lock_class_key neigh_table_proxy_queue_class;
+
+static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+
+void neigh_table_init(int index, struct neigh_table *tbl)
+{
+ unsigned long now = jiffies;
+ unsigned long phsize;
+
+ INIT_LIST_HEAD(&tbl->parms_list);
+ list_add(&tbl->parms.list, &tbl->parms_list);
+ write_pnet(&tbl->parms.net, &init_net);
+ atomic_set(&tbl->parms.refcnt, 1);
+ tbl->parms.reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+
+ tbl->stats = alloc_percpu(struct neigh_statistics);
+ if (!tbl->stats)
+ panic("cannot create neighbour cache statistics");
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
+ &neigh_stat_seq_fops, tbl))
+ panic("cannot create neighbour proc dir entry");
+#endif
+
+ RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
+
+ phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
+ tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+
+ if (!tbl->nht || !tbl->phash_buckets)
+ panic("cannot allocate neighbour cache hashes");
+
+ if (!tbl->entry_size)
+ tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
+ tbl->key_len, NEIGH_PRIV_ALIGN);
+ else
+ WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
+
+ rwlock_init(&tbl->lock);
+ INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+ tbl->parms.reachable_time);
+ setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+ skb_queue_head_init_class(&tbl->proxy_queue,
+ &neigh_table_proxy_queue_class);
+
+ tbl->last_flush = now;
+ tbl->last_rand = now + tbl->parms.reachable_time * 20;
+
+ neigh_tables[index] = tbl;
+}
+EXPORT_SYMBOL(neigh_table_init);
+
+int neigh_table_clear(int index, struct neigh_table *tbl)
+{
+ neigh_tables[index] = NULL;
+ /* It is not clean... Fix it to unload IPv6 module safely */
+ cancel_delayed_work_sync(&tbl->gc_work);
+ del_timer_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue);
+ neigh_ifdown(tbl, NULL);
+ if (atomic_read(&tbl->entries))
+ pr_crit("neighbour leakage\n");
+
+ call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+ neigh_hash_free_rcu);
+ tbl->nht = NULL;
+
+ kfree(tbl->phash_buckets);
+ tbl->phash_buckets = NULL;
+
+ remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
+ free_percpu(tbl->stats);
+ tbl->stats = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(neigh_table_clear);
+
+static struct neigh_table *neigh_find_table(int family)
+{
+ struct neigh_table *tbl = NULL;
+
+ switch (family) {
+ case AF_INET:
+ tbl = neigh_tables[NEIGH_ARP_TABLE];
+ break;
+ case AF_INET6:
+ tbl = neigh_tables[NEIGH_ND_TABLE];
+ break;
+ case AF_DECnet:
+ tbl = neigh_tables[NEIGH_DN_TABLE];
+ break;
+ }
+
+ return tbl;
+}
+
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *dst_attr;
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+ struct net_device *dev = NULL;
+ int err = -EINVAL;
+
+ ASSERT_RTNL();
+ if (nlmsg_len(nlh) < sizeof(*ndm))
+ goto out;
+
+ dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
+ if (dst_attr == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
+
+ if (nla_len(dst_attr) < tbl->key_len)
+ goto out;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+ goto out;
+ }
+
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ if (neigh == NULL) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+
+out:
+ return err;
+}
+
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct neigh_table *tbl;
+ struct net_device *dev = NULL;
+ struct neighbour *neigh;
+ void *dst, *lladdr;
+ int err;
+
+ ASSERT_RTNL();
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ goto out;
+
+ err = -EINVAL;
+ if (tb[NDA_DST] == NULL)
+ goto out;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex) {
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ goto out;
+ }
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (tbl == NULL)
+ return -EAFNOSUPPORT;
+
+ if (nla_len(tb[NDA_DST]) < tbl->key_len)
+ goto out;
+ dst = nla_data(tb[NDA_DST]);
+ lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ err = -ENOBUFS;
+ pn = pneigh_lookup(tbl, net, dst, dev, 1);
+ if (pn) {
+ pn->flags = ndm->ndm_flags;
+ err = 0;
+ }
+ goto out;
+ }
+
+ if (dev == NULL)
+ goto out;
+
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (neigh == NULL) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ neigh = __neigh_lookup_errno(tbl, dst, dev);
+ if (IS_ERR(neigh)) {
+ err = PTR_ERR(neigh);
+ goto out;
+ }
+ } else {
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ err = -EEXIST;
+ neigh_release(neigh);
+ goto out;
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+ flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ }
+
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ neigh_release(neigh);
+
+out:
+ return err;
+}
+
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NDTA_PARMS);
+ if (nest == NULL)
+ return -ENOBUFS;
+
+ if ((parms->dev &&
+ nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ nla_put_u32(skb, NDTPA_QUEUE_LEN,
+ NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
+ nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
+ nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
+ nla_put_u32(skb, NDTPA_UCAST_PROBES,
+ NEIGH_VAR(parms, UCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_PROBES,
+ NEIGH_VAR(parms, MCAST_PROBES)) ||
+ nla_put_u32(skb, NDTPA_MCAST_REPROBES,
+ NEIGH_VAR(parms, MCAST_REPROBES)) ||
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) ||
+ nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
+ NEIGH_VAR(parms, BASE_REACHABLE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_GC_STALETIME,
+ NEIGH_VAR(parms, GC_STALETIME)) ||
+ nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
+ NEIGH_VAR(parms, DELAY_PROBE_TIME)) ||
+ nla_put_msecs(skb, NDTPA_RETRANS_TIME,
+ NEIGH_VAR(parms, RETRANS_TIME)) ||
+ nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
+ NEIGH_VAR(parms, ANYCAST_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_PROXY_DELAY,
+ NEIGH_VAR(parms, PROXY_DELAY)) ||
+ nla_put_msecs(skb, NDTPA_LOCKTIME,
+ NEIGH_VAR(parms, LOCKTIME)))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
+ u32 pid, u32 seq, int type, int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ndtmsg *ndtmsg;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndtmsg = nlmsg_data(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) ||
+ nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
+ nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
+ nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ goto nla_put_failure;
+ {
+ unsigned long now = jiffies;
+ unsigned int flush_delta = now - tbl->last_flush;
+ unsigned int rand_delta = now - tbl->last_rand;
+ struct neigh_hash_table *nht;
+ struct ndt_config ndc = {
+ .ndtc_key_len = tbl->key_len,
+ .ndtc_entry_size = tbl->entry_size,
+ .ndtc_entries = atomic_read(&tbl->entries),
+ .ndtc_last_flush = jiffies_to_msecs(flush_delta),
+ .ndtc_last_rand = jiffies_to_msecs(rand_delta),
+ .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ };
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
+ ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
+ rcu_read_unlock_bh();
+
+ if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
+ goto nla_put_failure;
+ }
+
+ {
+ int cpu;
+ struct ndt_stats ndst;
+
+ memset(&ndst, 0, sizeof(ndst));
+
+ for_each_possible_cpu(cpu) {
+ struct neigh_statistics *st;
+
+ st = per_cpu_ptr(tbl->stats, cpu);
+ ndst.ndts_allocs += st->allocs;
+ ndst.ndts_destroys += st->destroys;
+ ndst.ndts_hash_grows += st->hash_grows;
+ ndst.ndts_res_failed += st->res_failed;
+ ndst.ndts_lookups += st->lookups;
+ ndst.ndts_hits += st->hits;
+ ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
+ ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
+ ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
+ ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ }
+
+ if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst))
+ goto nla_put_failure;
+ }
+
+ BUG_ON(tbl->parms.dev);
+ if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+ goto nla_put_failure;
+
+ read_unlock_bh(&tbl->lock);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ read_unlock_bh(&tbl->lock);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int neightbl_fill_param_info(struct sk_buff *skb,
+ struct neigh_table *tbl,
+ struct neigh_parms *parms,
+ u32 pid, u32 seq, int type,
+ unsigned int flags)
+{
+ struct ndtmsg *ndtmsg;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndtmsg = nlmsg_data(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
+ neightbl_fill_parms(skb, parms) < 0)
+ goto errout;
+
+ read_unlock_bh(&tbl->lock);
+ nlmsg_end(skb, nlh);
+ return 0;
+errout:
+ read_unlock_bh(&tbl->lock);
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
+ [NDTA_NAME] = { .type = NLA_STRING },
+ [NDTA_THRESH1] = { .type = NLA_U32 },
+ [NDTA_THRESH2] = { .type = NLA_U32 },
+ [NDTA_THRESH3] = { .type = NLA_U32 },
+ [NDTA_GC_INTERVAL] = { .type = NLA_U64 },
+ [NDTA_PARMS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
+ [NDTPA_IFINDEX] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
+ [NDTPA_APP_PROBES] = { .type = NLA_U32 },
+ [NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
+ [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 },
+ [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
+ [NDTPA_GC_STALETIME] = { .type = NLA_U64 },
+ [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
+ [NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
+ [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
+ [NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
+ [NDTPA_LOCKTIME] = { .type = NLA_U64 },
+};
+
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct neigh_table *tbl;
+ struct ndtmsg *ndtmsg;
+ struct nlattr *tb[NDTA_MAX+1];
+ bool found = false;
+ int err, tidx;
+
+ err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[NDTA_NAME] == NULL) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ ndtmsg = nlmsg_data(nlh);
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+ if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+ continue;
+ if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -ENOENT;
+
+ /*
+ * We acquire tbl->lock to be nice to the periodic timers and
+ * make sure they always see a consistent set of values.
+ */
+ write_lock_bh(&tbl->lock);
+
+ if (tb[NDTA_PARMS]) {
+ struct nlattr *tbp[NDTPA_MAX+1];
+ struct neigh_parms *p;
+ int i, ifindex = 0;
+
+ err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
+ nl_ntbl_parm_policy);
+ if (err < 0)
+ goto errout_tbl_lock;
+
+ if (tbp[NDTPA_IFINDEX])
+ ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
+
+ p = lookup_neigh_parms(tbl, net, ifindex);
+ if (p == NULL) {
+ err = -ENOENT;
+ goto errout_tbl_lock;
+ }
+
+ for (i = 1; i <= NDTPA_MAX; i++) {
+ if (tbp[i] == NULL)
+ continue;
+
+ switch (i) {
+ case NDTPA_QUEUE_LEN:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN));
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_PROXY_QLEN:
+ NEIGH_VAR_SET(p, PROXY_QLEN,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_APP_PROBES:
+ NEIGH_VAR_SET(p, APP_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_UCAST_PROBES:
+ NEIGH_VAR_SET(p, UCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_PROBES:
+ NEIGH_VAR_SET(p, MCAST_PROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_MCAST_REPROBES:
+ NEIGH_VAR_SET(p, MCAST_REPROBES,
+ nla_get_u32(tbp[i]));
+ break;
+ case NDTPA_BASE_REACHABLE_TIME:
+ NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
+ nla_get_msecs(tbp[i]));
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it (can be multiple minutes)
+ */
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ break;
+ case NDTPA_GC_STALETIME:
+ NEIGH_VAR_SET(p, GC_STALETIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_DELAY_PROBE_TIME:
+ NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_RETRANS_TIME:
+ NEIGH_VAR_SET(p, RETRANS_TIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_ANYCAST_DELAY:
+ NEIGH_VAR_SET(p, ANYCAST_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_PROXY_DELAY:
+ NEIGH_VAR_SET(p, PROXY_DELAY,
+ nla_get_msecs(tbp[i]));
+ break;
+ case NDTPA_LOCKTIME:
+ NEIGH_VAR_SET(p, LOCKTIME,
+ nla_get_msecs(tbp[i]));
+ break;
+ }
+ }
+ }
+
+ err = -ENOENT;
+ if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+ tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+ !net_eq(net, &init_net))
+ goto errout_tbl_lock;
+
+ if (tb[NDTA_THRESH1])
+ tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+
+ if (tb[NDTA_THRESH2])
+ tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+
+ if (tb[NDTA_THRESH3])
+ tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+
+ if (tb[NDTA_GC_INTERVAL])
+ tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+
+ err = 0;
+
+errout_tbl_lock:
+ write_unlock_bh(&tbl->lock);
+errout:
+ return err;
+}
+
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int family, tidx, nidx = 0;
+ int tbl_skip = cb->args[0];
+ int neigh_skip = cb->args[1];
+ struct neigh_table *tbl;
+
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+ for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
+ struct neigh_parms *p;
+
+ tbl = neigh_tables[tidx];
+ if (!tbl)
+ continue;
+
+ if (tidx < tbl_skip || (family && tbl->family != family))
+ continue;
+
+ if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) < 0)
+ break;
+
+ nidx = 0;
+ p = list_next_entry(&tbl->parms, list);
+ list_for_each_entry_from(p, &tbl->parms_list, list) {
+ if (!net_eq(neigh_parms_net(p), net))
+ continue;
+
+ if (nidx < neigh_skip)
+ goto next;
+
+ if (neightbl_fill_param_info(skb, tbl, p,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGHTBL,
+ NLM_F_MULTI) < 0)
+ goto out;
+ next:
+ nidx++;
+ }
+
+ neigh_skip = 0;
+ }
+out:
+ cb->args[0] = tidx;
+ cb->args[1] = nidx;
+
+ return skb->len;
+}
+
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+{
+ unsigned long now = jiffies;
+ struct nda_cacheinfo ci;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = neigh->ops->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = neigh->flags;
+ ndm->ndm_type = neigh->type;
+ ndm->ndm_ifindex = neigh->dev->ifindex;
+
+ if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
+ goto nla_put_failure;
+
+ read_lock_bh(&neigh->lock);
+ ndm->ndm_state = neigh->nud_state;
+ if (neigh->nud_state & NUD_VALID) {
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, neigh, neigh->dev);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
+ }
+
+ ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
+ ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+ ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
+ ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ read_unlock_bh(&neigh->lock);
+
+ if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
+ nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
+ u32 pid, u32 seq, int type, unsigned int flags,
+ struct neigh_table *tbl)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = tbl->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_type = RTN_UNICAST;
+ ndm->ndm_ifindex = pn->dev->ifindex;
+ ndm->ndm_state = NUD_NONE;
+
+ if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void neigh_update_notify(struct neighbour *neigh)
+{
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0);
+}
+
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct neighbour *n;
+ int rc, h, s_h = cb->args[1];
+ int idx, s_idx = idx = cb->args[2];
+ struct neigh_hash_table *nht;
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ for (h = s_h; h < (1 << nht->hash_shift); h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
+ if (!net_eq(dev_net(n->dev), net))
+ continue;
+ if (idx < s_idx)
+ goto next;
+ if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI) < 0) {
+ rc = -1;
+ goto out;
+ }
+next:
+ idx++;
+ }
+ }
+ rc = skb->len;
+out:
+ rcu_read_unlock_bh();
+ cb->args[1] = h;
+ cb->args[2] = idx;
+ return rc;
+}
+
+static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct pneigh_entry *n;
+ struct net *net = sock_net(skb->sk);
+ int rc, h, s_h = cb->args[3];
+ int idx, s_idx = idx = cb->args[4];
+
+ read_lock_bh(&tbl->lock);
+
+ for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
+ if (h > s_h)
+ s_idx = 0;
+ for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+ if (dev_net(n->dev) != net)
+ continue;
+ if (idx < s_idx)
+ goto next;
+ if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI, tbl) < 0) {
+ read_unlock_bh(&tbl->lock);
+ rc = -1;
+ goto out;
+ }
+ next:
+ idx++;
+ }
+ }
+
+ read_unlock_bh(&tbl->lock);
+ rc = skb->len;
+out:
+ cb->args[3] = h;
+ cb->args[4] = idx;
+ return rc;
+
+}
+
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct neigh_table *tbl;
+ int t, family, s_t;
+ int proxy = 0;
+ int err;
+
+ family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+ /* check for full ndmsg structure presence, family member is
+ * the same for both structures
+ */
+ if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ proxy = 1;
+
+ s_t = cb->args[0];
+
+ for (t = 0; t < NEIGH_NR_TABLES; t++) {
+ tbl = neigh_tables[t];
+
+ if (!tbl)
+ continue;
+ if (t < s_t || (family && tbl->family != family))
+ continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args) -
+ sizeof(cb->args[0]));
+ if (proxy)
+ err = pneigh_dump_table(tbl, skb, cb);
+ else
+ err = neigh_dump_table(tbl, skb, cb);
+ if (err < 0)
+ break;
+ }
+
+ cb->args[0] = t;
+ return skb->len;
+}
+
+void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
+{
+ int chain;
+ struct neigh_hash_table *nht;
+
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ read_lock(&tbl->lock); /* avoid resizes */
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct neighbour *n;
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next))
+ cb(n, cookie);
+ }
+ read_unlock(&tbl->lock);
+ rcu_read_unlock_bh();
+}
+EXPORT_SYMBOL(neigh_for_each);
+
+/* The tbl->lock must be held as a writer and BH disabled. */
+void __neigh_for_each_release(struct neigh_table *tbl,
+ int (*cb)(struct neighbour *))
+{
+ int chain;
+ struct neigh_hash_table *nht;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ np = &nht->hash_buckets[chain];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
+ int release;
+
+ write_lock(&n->lock);
+ release = cb(n);
+ if (release) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
+ n->dead = 1;
+ } else
+ np = &n->next;
+ write_unlock(&n->lock);
+ if (release)
+ neigh_cleanup_and_release(n);
+ }
+ }
+}
+EXPORT_SYMBOL(__neigh_for_each_release);
+
+int neigh_xmit(int index, struct net_device *dev,
+ const void *addr, struct sk_buff *skb)
+{
+ int err = -EAFNOSUPPORT;
+ if (likely(index < NEIGH_NR_TABLES)) {
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+
+ tbl = neigh_tables[index];
+ if (!tbl)
+ goto out;
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ if (!neigh)
+ neigh = __neigh_create(tbl, addr, dev, false);
+ err = PTR_ERR(neigh);
+ if (IS_ERR(neigh))
+ goto out_kfree_skb;
+ err = neigh->output(neigh, skb);
+ }
+ else if (index == NEIGH_LINK_TABLE) {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ addr, NULL, skb->len);
+ if (err < 0)
+ goto out_kfree_skb;
+ err = dev_queue_xmit(skb);
+ }
+out:
+ return err;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(neigh_xmit);
+
+#ifdef CONFIG_PROC_FS
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
+ struct neighbour *n = NULL;
+ int bucket = state->bucket;
+
+ state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
+ n = rcu_dereference_bh(nht->hash_buckets[bucket]);
+
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, &fakep);
+ if (!v)
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+next:
+ n = rcu_dereference_bh(n->next);
+ }
+
+ if (n)
+ break;
+ }
+ state->bucket = bucket;
+
+ return n;
+}
+
+static struct neighbour *neigh_get_next(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_hash_table *nht = state->nht;
+
+ if (state->neigh_sub_iter) {
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ }
+ n = rcu_dereference_bh(n->next);
+
+ while (1) {
+ while (n) {
+ if (!net_eq(dev_net(n->dev), net))
+ goto next;
+ if (state->neigh_sub_iter) {
+ void *v = state->neigh_sub_iter(state, n, pos);
+ if (v)
+ return n;
+ goto next;
+ }
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ break;
+
+ if (n->nud_state & ~NUD_NOARP)
+ break;
+next:
+ n = rcu_dereference_bh(n->next);
+ }
+
+ if (n)
+ break;
+
+ if (++state->bucket >= (1 << nht->hash_shift))
+ break;
+
+ n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
+ }
+
+ if (n && pos)
+ --(*pos);
+ return n;
+}
+
+static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct neighbour *n = neigh_get_first(seq);
+
+ if (n) {
+ --(*pos);
+ while (*pos) {
+ n = neigh_get_next(seq, n, pos);
+ if (!n)
+ break;
+ }
+ }
+ return *pos ? NULL : n;
+}
+
+static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+ struct pneigh_entry *pn = NULL;
+ int bucket = state->bucket;
+
+ state->flags |= NEIGH_SEQ_IS_PNEIGH;
+ for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
+ pn = tbl->phash_buckets[bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
+ if (pn)
+ break;
+ }
+ state->bucket = bucket;
+
+ return pn;
+}
+
+static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
+ struct pneigh_entry *pn,
+ loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct neigh_table *tbl = state->tbl;
+
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
+ while (!pn) {
+ if (++state->bucket > PNEIGH_HASHMASK)
+ break;
+ pn = tbl->phash_buckets[state->bucket];
+ while (pn && !net_eq(pneigh_net(pn), net))
+ pn = pn->next;
+ if (pn)
+ break;
+ }
+
+ if (pn && pos)
+ --(*pos);
+
+ return pn;
+}
+
+static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct pneigh_entry *pn = pneigh_get_first(seq);
+
+ if (pn) {
+ --(*pos);
+ while (*pos) {
+ pn = pneigh_get_next(seq, pn, pos);
+ if (!pn)
+ break;
+ }
+ }
+ return *pos ? NULL : pn;
+}
+
+static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
+{
+ struct neigh_seq_state *state = seq->private;
+ void *rc;
+ loff_t idxpos = *pos;
+
+ rc = neigh_get_idx(seq, &idxpos);
+ if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+ rc = pneigh_get_idx(seq, &idxpos);
+
+ return rc;
+}
+
+void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+ __acquires(rcu_bh)
+{
+ struct neigh_seq_state *state = seq->private;
+
+ state->tbl = tbl;
+ state->bucket = 0;
+ state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
+
+ rcu_read_lock_bh();
+ state->nht = rcu_dereference_bh(tbl->nht);
+
+ return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL(neigh_seq_start);
+
+void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct neigh_seq_state *state;
+ void *rc;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = neigh_get_first(seq);
+ goto out;
+ }
+
+ state = seq->private;
+ if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
+ rc = neigh_get_next(seq, v, NULL);
+ if (rc)
+ goto out;
+ if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+ rc = pneigh_get_first(seq);
+ } else {
+ BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
+ rc = pneigh_get_next(seq, v, NULL);
+ }
+out:
+ ++(*pos);
+ return rc;
+}
+EXPORT_SYMBOL(neigh_seq_next);
+
+void neigh_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu_bh)
+{
+ rcu_read_unlock_bh();
+}
+EXPORT_SYMBOL(neigh_seq_stop);
+
+/* statistics via seq_file */
+
+static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct neigh_table *tbl = seq->private;
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(tbl->stats, cpu);
+ }
+ return NULL;
+}
+
+static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct neigh_table *tbl = seq->private;
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(tbl->stats, cpu);
+ }
+ return NULL;
+}
+
+static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int neigh_stat_seq_show(struct seq_file *seq, void *v)
+{
+ struct neigh_table *tbl = seq->private;
+ struct neigh_statistics *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx %08lx %08lx\n",
+ atomic_read(&tbl->entries),
+
+ st->allocs,
+ st->destroys,
+ st->hash_grows,
+
+ st->lookups,
+ st->hits,
+
+ st->res_failed,
+
+ st->rcv_probes_mcast,
+ st->rcv_probes_ucast,
+
+ st->periodic_gc_runs,
+ st->forced_gc_runs,
+ st->unres_discards
+ );
+
+ return 0;
+}
+
+static const struct seq_operations neigh_stat_seq_ops = {
+ .start = neigh_stat_seq_start,
+ .next = neigh_stat_seq_next,
+ .stop = neigh_stat_seq_stop,
+ .show = neigh_stat_seq_show,
+};
+
+static int neigh_stat_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &neigh_stat_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ sf->private = PDE_DATA(inode);
+ }
+ return ret;
+};
+
+static const struct file_operations neigh_stat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = neigh_stat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo))
+ + nla_total_size(4); /* NDA_PROBES */
+}
+
+static void __neigh_notify(struct neighbour *n, int type, int flags)
+{
+ struct net *net = dev_net(n->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = neigh_fill_info(skb, n, 0, 0, type, flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+void neigh_app_ns(struct neighbour *n)
+{
+ __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);
+}
+EXPORT_SYMBOL(neigh_app_ns);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int int_max = INT_MAX;
+static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
+
+static int proc_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ struct ctl_table tmp = *ctl;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &unres_qlen_max;
+ tmp.data = &size;
+
+ size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
+
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
+ int index)
+{
+ struct net_device *dev;
+ int family = neigh_parms_family(p);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct neigh_parms *dst_p =
+ neigh_get_dev_parms_rcu(dev, family);
+
+ if (dst_p && !test_bit(index, dst_p->data_state))
+ dst_p->data[index] = p->data[index];
+ }
+ rcu_read_unlock();
+}
+
+static void neigh_proc_update(struct ctl_table *ctl, int write)
+{
+ struct net_device *dev = ctl->extra1;
+ struct neigh_parms *p = ctl->extra2;
+ struct net *net = neigh_parms_net(p);
+ int index = (int *) ctl->data - p->data;
+
+ if (!write)
+ return;
+
+ set_bit(index, p->data_state);
+ if (!dev) /* NULL dev means this is default value */
+ neigh_copy_dflt_parms(net, p, index);
+}
+
+static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ tmp.extra1 = &zero;
+ tmp.extra2 = &int_max;
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec);
+
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
+
+static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
+
+static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
+
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct neigh_parms *p = ctl->extra2;
+ int ret;
+
+ if (strcmp(ctl->procname, "base_reachable_time") == 0)
+ ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
+ else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0)
+ ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
+ else
+ ret = -1;
+
+ if (write && ret == 0) {
+ /* update reachable_time as well, otherwise, the change will
+ * only be effective after the next time neigh_periodic_work
+ * decides to recompute it
+ */
+ p->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ }
+ return ret;
+}
+
+#define NEIGH_PARMS_DATA_OFFSET(index) \
+ (&((struct neigh_parms *) 0)->data[index])
+
+#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
+ [NEIGH_VAR_ ## attr] = { \
+ .procname = name, \
+ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ }
+
+#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
+
+#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
+
+#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+
+#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
+
+static struct neigh_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+} neigh_sysctl_template __read_mostly = {
+ .neigh_vars = {
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
+ NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
+ NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
+ NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
+ NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
+ [NEIGH_VAR_GC_INTERVAL] = {
+ .procname = "gc_interval",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NEIGH_VAR_GC_THRESH1] = {
+ .procname = "gc_thresh1",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ [NEIGH_VAR_GC_THRESH2] = {
+ .procname = "gc_thresh2",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ [NEIGH_VAR_GC_THRESH3] = {
+ .procname = "gc_thresh3",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ {},
+ },
+};
+
+int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
+ proc_handler *handler)
+{
+ int i;
+ struct neigh_sysctl_table *t;
+ const char *dev_name_source;
+ char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
+ char *p_name;
+
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto err;
+
+ for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
+ t->neigh_vars[i].data += (long) p;
+ t->neigh_vars[i].extra1 = dev;
+ t->neigh_vars[i].extra2 = p;
+ }
+
+ if (dev) {
+ dev_name_source = dev->name;
+ /* Terminate the table early */
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
+ } else {
+ struct neigh_table *tbl = p->tbl;
+ dev_name_source = "default";
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
+ }
+
+ if (handler) {
+ /* RetransTime */
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+ /* ReachableTime */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+ /* RetransTime (in milliseconds)*/
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+ } else {
+ /* Those handlers will update p->reachable_time after
+ * base_reachable_time(_ms) is set to ensure the new timer starts being
+ * applied after the next neighbour update instead of waiting for
+ * neigh_periodic_work to update its value (can be multiple minutes)
+ * So any handler that replaces them should do this as well
+ */
+ /* ReachableTime */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler =
+ neigh_proc_base_reachable_time;
+ /* ReachableTime (in milliseconds) */
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler =
+ neigh_proc_base_reachable_time;
+ }
+
+ /* Don't export sysctls to unprivileged users */
+ if (neigh_parms_net(p)->user_ns != &init_user_ns)
+ t->neigh_vars[0].procname = NULL;
+
+ switch (neigh_parms_family(p)) {
+ case AF_INET:
+ p_name = "ipv4";
+ break;
+ case AF_INET6:
+ p_name = "ipv6";
+ break;
+ default:
+ BUG();
+ }
+
+ snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
+ p_name, dev_name_source);
+ t->sysctl_header =
+ register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ if (!t->sysctl_header)
+ goto free;
+
+ p->sysctl_table = t;
+ return 0;
+
+free:
+ kfree(t);
+err:
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(neigh_sysctl_register);
+
+void neigh_sysctl_unregister(struct neigh_parms *p)
+{
+ if (p->sysctl_table) {
+ struct neigh_sysctl_table *t = p->sysctl_table;
+ p->sysctl_table = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
+}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
+
+#endif /* CONFIG_SYSCTL */
+
+static int __init neigh_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
+ NULL);
+ rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL);
+
+ return 0;
+}
+
+subsys_initcall(neigh_init);
+
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
new file mode 100644
index 000000000..2bf832996
--- /dev/null
+++ b/net/core/net-procfs.c
@@ -0,0 +1,423 @@
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/wext.h>
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
+ struct hlist_head *h;
+ unsigned int count = 0, offset = get_offset(*pos);
+
+ h = &net->dev_name_head[get_bucket(*pos)];
+ hlist_for_each_entry_rcu(dev, h, name_hlist) {
+ if (++count == offset)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
+{
+ struct net_device *dev;
+ unsigned int bucket;
+
+ do {
+ dev = dev_from_same_bucket(seq, pos);
+ if (dev)
+ return dev;
+
+ bucket = get_bucket(*pos) + 1;
+ *pos = set_bucket_offset(bucket, 1);
+ } while (bucket < NETDEV_HASHENTRIES);
+
+ return NULL;
+}
+
+/*
+ * This is invoked by the /proc filesystem handler to display a device
+ * in detail.
+ */
+static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
+ return NULL;
+
+ return dev_from_bucket(seq, pos);
+}
+
+static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return dev_from_bucket(seq, pos);
+}
+
+static void dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+ "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ dev->name, stats->rx_bytes, stats->rx_packets,
+ stats->rx_errors,
+ stats->rx_dropped + stats->rx_missed_errors,
+ stats->rx_fifo_errors,
+ stats->rx_length_errors + stats->rx_over_errors +
+ stats->rx_crc_errors + stats->rx_frame_errors,
+ stats->rx_compressed, stats->multicast,
+ stats->tx_bytes, stats->tx_packets,
+ stats->tx_errors, stats->tx_dropped,
+ stats->tx_fifo_errors, stats->collisions,
+ stats->tx_carrier_errors +
+ stats->tx_aborted_errors +
+ stats->tx_window_errors +
+ stats->tx_heartbeat_errors,
+ stats->tx_compressed);
+}
+
+/*
+ * Called from the PROCfs module. This now uses the new arbitrary sized
+ * /proc/net interface to create /proc/net/dev
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Inter-| Receive "
+ " | Transmit\n"
+ " face |bytes packets errs drop fifo frame "
+ "compressed multicast|bytes packets errs "
+ "drop fifo colls carrier compressed\n");
+ else
+ dev_seq_printf_stats(seq, v);
+ return 0;
+}
+
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+ struct softnet_data *sd = NULL;
+
+ while (*pos < nr_cpu_ids)
+ if (cpu_online(*pos)) {
+ sd = &per_cpu(softnet_data, *pos);
+ break;
+ } else
+ ++*pos;
+ return sd;
+}
+
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return softnet_get_online(pos);
+}
+
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+ struct softnet_data *sd = v;
+ unsigned int flow_limit_count = 0;
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+ struct sd_flow_limit *fl;
+
+ rcu_read_lock();
+ fl = rcu_dereference(sd->flow_limit);
+ if (fl)
+ flow_limit_count = fl->count;
+ rcu_read_unlock();
+#endif
+
+ seq_printf(seq,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ sd->processed, sd->dropped, sd->time_squeeze, 0,
+ 0, 0, 0, 0, /* was fastroute */
+ sd->cpu_collision, sd->received_rps, flow_limit_count);
+ return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_seq_show,
+};
+
+static int dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const struct seq_operations softnet_seq_ops = {
+ .start = softnet_seq_start,
+ .next = softnet_seq_next,
+ .stop = softnet_seq_stop,
+ .show = softnet_seq_show,
+};
+
+static int softnet_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &softnet_seq_ops);
+}
+
+static const struct file_operations softnet_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = softnet_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *ptype_get_idx(loff_t pos)
+{
+ struct packet_type *pt = NULL;
+ loff_t i = 0;
+ int t;
+
+ list_for_each_entry_rcu(pt, &ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ for (t = 0; t < PTYPE_HASH_SIZE; t++) {
+ list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+ return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct packet_type *pt;
+ struct list_head *nxt;
+ int hash;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ptype_get_idx(0);
+
+ pt = v;
+ nxt = pt->list.next;
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (nxt != &ptype_all)
+ goto found;
+ hash = 0;
+ nxt = ptype_base[0].next;
+ } else
+ hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+
+ while (nxt == &ptype_base[hash]) {
+ if (++hash >= PTYPE_HASH_SIZE)
+ return NULL;
+ nxt = ptype_base[hash].next;
+ }
+found:
+ return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+ struct packet_type *pt = v;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Type Device Function\n");
+ else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ if (pt->type == htons(ETH_P_ALL))
+ seq_puts(seq, "ALL ");
+ else
+ seq_printf(seq, "%04x", ntohs(pt->type));
+
+ seq_printf(seq, " %-8s %pf\n",
+ pt->dev ? pt->dev->name : "", pt->func);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+ .start = ptype_seq_start,
+ .next = ptype_seq_next,
+ .stop = ptype_seq_stop,
+ .show = ptype_seq_show,
+};
+
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ptype_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ptype_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ptype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+static int __net_init dev_proc_net_init(struct net *net)
+{
+ int rc = -ENOMEM;
+
+ if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+ goto out;
+ if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+ &softnet_seq_fops))
+ goto out_dev;
+ if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+ goto out_softnet;
+
+ if (wext_proc_init(net))
+ goto out_ptype;
+ rc = 0;
+out:
+ return rc;
+out_ptype:
+ remove_proc_entry("ptype", net->proc_net);
+out_softnet:
+ remove_proc_entry("softnet_stat", net->proc_net);
+out_dev:
+ remove_proc_entry("dev", net->proc_net);
+ goto out;
+}
+
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+ wext_proc_exit(net);
+
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ remove_proc_entry("dev", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_proc_ops = {
+ .init = dev_proc_net_init,
+ .exit = dev_proc_net_exit,
+};
+
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct netdev_hw_addr *ha;
+ struct net_device *dev = v;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ netif_addr_lock_bh(dev);
+ netdev_for_each_mc_addr(ha, dev) {
+ int i;
+
+ seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
+ dev->name, ha->refcount, ha->global_use);
+
+ for (i = 0; i < dev->addr_len; i++)
+ seq_printf(seq, "%02x", ha->addr[i]);
+
+ seq_putc(seq, '\n');
+ }
+ netif_addr_unlock_bh(dev);
+ return 0;
+}
+
+static const struct seq_operations dev_mc_seq_ops = {
+ .start = dev_seq_start,
+ .next = dev_seq_next,
+ .stop = dev_seq_stop,
+ .show = dev_mc_seq_show,
+};
+
+static int dev_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dev_mc_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations dev_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dev_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init dev_mc_net_init(struct net *net)
+{
+ if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+ remove_proc_entry("dev_mcast", net->proc_net);
+}
+
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+ .init = dev_mc_net_init,
+ .exit = dev_mc_net_exit,
+};
+
+int __init dev_proc_init(void)
+{
+ int ret = register_pernet_subsys(&dev_proc_ops);
+ if (!ret)
+ return register_pernet_subsys(&dev_mc_net_ops);
+ return ret;
+}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
new file mode 100644
index 000000000..4238d6da5
--- /dev/null
+++ b/net/core/net-sysfs.c
@@ -0,0 +1,1556 @@
+/*
+ * net-sysfs.c - network device class and attributes
+ *
+ * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/switchdev.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
+#include <linux/pm_runtime.h>
+#include <linux/of.h>
+
+#include "net-sysfs.h"
+
+#ifdef CONFIG_SYSFS
+static const char fmt_hex[] = "%#x\n";
+static const char fmt_long_hex[] = "%#lx\n";
+static const char fmt_dec[] = "%d\n";
+static const char fmt_udec[] = "%u\n";
+static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
+
+static inline int dev_isalive(const struct net_device *dev)
+{
+ return dev->reg_state <= NETREG_REGISTERED;
+}
+
+/* use same locking rules as GIF* ioctl's */
+static ssize_t netdev_show(const struct device *dev,
+ struct device_attribute *attr, char *buf,
+ ssize_t (*format)(const struct net_device *, char *))
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(ndev))
+ ret = (*format)(ndev, buf);
+ read_unlock(&dev_base_lock);
+
+ return ret;
+}
+
+/* generate a show function for simple field */
+#define NETDEVICE_SHOW(field, format_string) \
+static ssize_t format_##field(const struct net_device *dev, char *buf) \
+{ \
+ return sprintf(buf, format_string, dev->field); \
+} \
+static ssize_t field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netdev_show(dev, attr, buf, format_##field); \
+} \
+
+#define NETDEVICE_SHOW_RO(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RO(field)
+
+#define NETDEVICE_SHOW_RW(field, format_string) \
+NETDEVICE_SHOW(field, format_string); \
+static DEVICE_ATTR_RW(field)
+
+/* use same locking and permission rules as SIF* ioctl's */
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret = -EINVAL;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ goto err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ if ((ret = (*set)(netdev, new)) == 0)
+ ret = len;
+ }
+ rtnl_unlock();
+ err:
+ return ret;
+}
+
+NETDEVICE_SHOW_RO(dev_id, fmt_hex);
+NETDEVICE_SHOW_RO(dev_port, fmt_dec);
+NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW_RO(addr_len, fmt_dec);
+NETDEVICE_SHOW_RO(ifindex, fmt_dec);
+NETDEVICE_SHOW_RO(type, fmt_dec);
+NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+
+static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+
+ return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+}
+static DEVICE_ATTR_RO(iflink);
+
+static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
+{
+ return sprintf(buf, fmt_dec, dev->name_assign_type);
+}
+
+static ssize_t name_assign_type_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ ret = netdev_show(dev, attr, buf, format_name_assign_type);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(name_assign_type);
+
+/* use same locking rules as GIFHWADDR ioctl's */
+static ssize_t address_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
+ read_unlock(&dev_base_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(address);
+
+static ssize_t broadcast_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ if (dev_isalive(ndev))
+ return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(broadcast);
+
+static int change_carrier(struct net_device *dev, unsigned long new_carrier)
+{
+ if (!netif_running(dev))
+ return -EINVAL;
+ return dev_change_carrier(dev, (bool) new_carrier);
+}
+
+static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
+static ssize_t carrier_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ if (netif_running(netdev)) {
+ return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ }
+ return -EINVAL;
+}
+static DEVICE_ATTR_RW(carrier);
+
+static ssize_t speed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd))
+ ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(speed);
+
+static ssize_t duplex_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ int ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (netif_running(netdev)) {
+ struct ethtool_cmd cmd;
+ if (!__ethtool_get_settings(netdev, &cmd)) {
+ const char *duplex;
+ switch (cmd.duplex) {
+ case DUPLEX_HALF:
+ duplex = "half";
+ break;
+ case DUPLEX_FULL:
+ duplex = "full";
+ break;
+ default:
+ duplex = "unknown";
+ break;
+ }
+ ret = sprintf(buf, "%s\n", duplex);
+ }
+ }
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RO(duplex);
+
+static ssize_t dormant_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(dormant);
+
+static const char *const operstates[] = {
+ "unknown",
+ "notpresent", /* currently unused */
+ "down",
+ "lowerlayerdown",
+ "testing", /* currently unused */
+ "dormant",
+ "up"
+};
+
+static ssize_t operstate_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ unsigned char operstate;
+
+ read_lock(&dev_base_lock);
+ operstate = netdev->operstate;
+ if (!netif_running(netdev))
+ operstate = IF_OPER_DOWN;
+ read_unlock(&dev_base_lock);
+
+ if (operstate >= ARRAY_SIZE(operstates))
+ return -EINVAL; /* should not happen */
+
+ return sprintf(buf, "%s\n", operstates[operstate]);
+}
+static DEVICE_ATTR_RO(operstate);
+
+static ssize_t carrier_changes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ return sprintf(buf, fmt_dec,
+ atomic_read(&netdev->carrier_changes));
+}
+static DEVICE_ATTR_RO(carrier_changes);
+
+/* read-write attributes */
+
+static int change_mtu(struct net_device *dev, unsigned long new_mtu)
+{
+ return dev_set_mtu(dev, (int) new_mtu);
+}
+
+static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_mtu);
+}
+NETDEVICE_SHOW_RW(mtu, fmt_dec);
+
+static int change_flags(struct net_device *dev, unsigned long new_flags)
+{
+ return dev_change_flags(dev, (unsigned int) new_flags);
+}
+
+static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_flags);
+}
+NETDEVICE_SHOW_RW(flags, fmt_hex);
+
+static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ dev->tx_queue_len = new_len;
+ return 0;
+}
+
+static ssize_t tx_queue_len_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_tx_queue_len);
+}
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+
+static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
+{
+ dev->gro_flush_timeout = val;
+ return 0;
+}
+
+static ssize_t gro_flush_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+}
+NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+
+static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ size_t count = len;
+ ssize_t ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* ignore trailing newline */
+ if (len > 0 && buf[len - 1] == '\n')
+ --count;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+ ret = dev_set_alias(netdev, buf, count);
+ rtnl_unlock();
+
+ return ret < 0 ? ret : len;
+}
+
+static ssize_t ifalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = 0;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+ if (netdev->ifalias)
+ ret = sprintf(buf, "%s\n", netdev->ifalias);
+ rtnl_unlock();
+ return ret;
+}
+static DEVICE_ATTR_RW(ifalias);
+
+static int change_group(struct net_device *dev, unsigned long new_group)
+{
+ dev_set_group(dev, (int) new_group);
+ return 0;
+}
+
+static ssize_t group_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, change_group);
+}
+NETDEVICE_SHOW(group, fmt_dec);
+static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+
+static ssize_t phys_port_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_item_id ppid;
+
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_id);
+
+static ssize_t phys_port_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ char name[IFNAMSIZ];
+
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sprintf(buf, "%s\n", name);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_port_name);
+
+static ssize_t phys_switch_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev)) {
+ struct netdev_phys_item_id ppid;
+
+ ret = netdev_switch_parent_id_get(netdev, &ppid);
+ if (!ret)
+ ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+static DEVICE_ATTR_RO(phys_switch_id);
+
+static struct attribute *net_class_attrs[] = {
+ &dev_attr_netdev_group.attr,
+ &dev_attr_type.attr,
+ &dev_attr_dev_id.attr,
+ &dev_attr_dev_port.attr,
+ &dev_attr_iflink.attr,
+ &dev_attr_ifindex.attr,
+ &dev_attr_name_assign_type.attr,
+ &dev_attr_addr_assign_type.attr,
+ &dev_attr_addr_len.attr,
+ &dev_attr_link_mode.attr,
+ &dev_attr_address.attr,
+ &dev_attr_broadcast.attr,
+ &dev_attr_speed.attr,
+ &dev_attr_duplex.attr,
+ &dev_attr_dormant.attr,
+ &dev_attr_operstate.attr,
+ &dev_attr_carrier_changes.attr,
+ &dev_attr_ifalias.attr,
+ &dev_attr_carrier.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_flags.attr,
+ &dev_attr_tx_queue_len.attr,
+ &dev_attr_gro_flush_timeout.attr,
+ &dev_attr_phys_port_id.attr,
+ &dev_attr_phys_port_name.attr,
+ &dev_attr_phys_switch_id.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(net_class);
+
+/* Show a given an attribute in the statistics group */
+static ssize_t netstat_show(const struct device *d,
+ struct device_attribute *attr, char *buf,
+ unsigned long offset)
+{
+ struct net_device *dev = to_net_dev(d);
+ ssize_t ret = -EINVAL;
+
+ WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+ offset % sizeof(u64) != 0);
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(dev)) {
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+ ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
+ }
+ read_unlock(&dev_base_lock);
+ return ret;
+}
+
+/* generate a read-only statistics attribute */
+#define NETSTAT_ENTRY(name) \
+static ssize_t name##_show(struct device *d, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return netstat_show(d, attr, buf, \
+ offsetof(struct rtnl_link_stats64, name)); \
+} \
+static DEVICE_ATTR_RO(name)
+
+NETSTAT_ENTRY(rx_packets);
+NETSTAT_ENTRY(tx_packets);
+NETSTAT_ENTRY(rx_bytes);
+NETSTAT_ENTRY(tx_bytes);
+NETSTAT_ENTRY(rx_errors);
+NETSTAT_ENTRY(tx_errors);
+NETSTAT_ENTRY(rx_dropped);
+NETSTAT_ENTRY(tx_dropped);
+NETSTAT_ENTRY(multicast);
+NETSTAT_ENTRY(collisions);
+NETSTAT_ENTRY(rx_length_errors);
+NETSTAT_ENTRY(rx_over_errors);
+NETSTAT_ENTRY(rx_crc_errors);
+NETSTAT_ENTRY(rx_frame_errors);
+NETSTAT_ENTRY(rx_fifo_errors);
+NETSTAT_ENTRY(rx_missed_errors);
+NETSTAT_ENTRY(tx_aborted_errors);
+NETSTAT_ENTRY(tx_carrier_errors);
+NETSTAT_ENTRY(tx_fifo_errors);
+NETSTAT_ENTRY(tx_heartbeat_errors);
+NETSTAT_ENTRY(tx_window_errors);
+NETSTAT_ENTRY(rx_compressed);
+NETSTAT_ENTRY(tx_compressed);
+
+static struct attribute *netstat_attrs[] = {
+ &dev_attr_rx_packets.attr,
+ &dev_attr_tx_packets.attr,
+ &dev_attr_rx_bytes.attr,
+ &dev_attr_tx_bytes.attr,
+ &dev_attr_rx_errors.attr,
+ &dev_attr_tx_errors.attr,
+ &dev_attr_rx_dropped.attr,
+ &dev_attr_tx_dropped.attr,
+ &dev_attr_multicast.attr,
+ &dev_attr_collisions.attr,
+ &dev_attr_rx_length_errors.attr,
+ &dev_attr_rx_over_errors.attr,
+ &dev_attr_rx_crc_errors.attr,
+ &dev_attr_rx_frame_errors.attr,
+ &dev_attr_rx_fifo_errors.attr,
+ &dev_attr_rx_missed_errors.attr,
+ &dev_attr_tx_aborted_errors.attr,
+ &dev_attr_tx_carrier_errors.attr,
+ &dev_attr_tx_fifo_errors.attr,
+ &dev_attr_tx_heartbeat_errors.attr,
+ &dev_attr_tx_window_errors.attr,
+ &dev_attr_rx_compressed.attr,
+ &dev_attr_tx_compressed.attr,
+ NULL
+};
+
+
+static struct attribute_group netstat_group = {
+ .name = "statistics",
+ .attrs = netstat_attrs,
+};
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+static struct attribute *wireless_attrs[] = {
+ NULL
+};
+
+static struct attribute_group wireless_group = {
+ .name = "wireless",
+ .attrs = wireless_attrs,
+};
+#endif
+
+#else /* CONFIG_SYSFS */
+#define net_class_groups NULL
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_SYSFS
+#define to_rx_queue_attr(_attr) container_of(_attr, \
+ struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
+
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops rx_queue_sysfs_ops = {
+ .show = rx_queue_attr_show,
+ .store = rx_queue_attr_store,
+};
+
+#ifdef CONFIG_RPS
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute, char *buf)
+{
+ struct rps_map *map;
+ cpumask_var_t mask;
+ int i, len;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map)
+ for (i = 0; i < map->len; i++)
+ cpumask_set_cpu(map->cpus[i], mask);
+
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ rcu_read_unlock();
+ free_cpumask_var(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct rps_map *old_map, *map;
+ cpumask_var_t mask;
+ int err, cpu, i;
+ static DEFINE_SPINLOCK(rps_map_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ map = kzalloc(max_t(unsigned int,
+ RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (!map) {
+ free_cpumask_var(mask);
+ return -ENOMEM;
+ }
+
+ i = 0;
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ map->cpus[i++] = cpu;
+
+ if (i)
+ map->len = i;
+ else {
+ kfree(map);
+ map = NULL;
+ }
+
+ spin_lock(&rps_map_lock);
+ old_map = rcu_dereference_protected(queue->rps_map,
+ lockdep_is_held(&rps_map_lock));
+ rcu_assign_pointer(queue->rps_map, map);
+ spin_unlock(&rps_map_lock);
+
+ if (map)
+ static_key_slow_inc(&rps_needed);
+ if (old_map) {
+ kfree_rcu(old_map, rcu);
+ static_key_slow_dec(&rps_needed);
+ }
+ free_cpumask_var(mask);
+ return len;
+}
+
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned long val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = (unsigned long)flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%lu\n", val);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+ vfree(table);
+}
+
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned long mask, count;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
+
+ if (count) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check
+ * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
+ } else
+ table = NULL;
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = rcu_dereference_protected(queue->rps_flow_table,
+ lockdep_is_held(&rps_dev_flow_lock));
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
+static struct rx_queue_attribute rps_cpus_attribute =
+ __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+ __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+#endif /* CONFIG_RPS */
+
+static struct attribute *rx_queue_default_attrs[] = {
+#ifdef CONFIG_RPS
+ &rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
+#endif
+ NULL
+};
+
+static void rx_queue_release(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+#ifdef CONFIG_RPS
+ struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+
+
+ map = rcu_dereference_protected(queue->rps_map, 1);
+ if (map) {
+ RCU_INIT_POINTER(queue->rps_map, NULL);
+ kfree_rcu(map, rcu);
+ }
+
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
+ if (flow_table) {
+ RCU_INIT_POINTER(queue->rps_flow_table, NULL);
+ call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
+ }
+#endif
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *rx_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_rx_queue *queue = to_rx_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static struct kobj_type rx_queue_ktype = {
+ .sysfs_ops = &rx_queue_sysfs_ops,
+ .release = rx_queue_release,
+ .default_attrs = rx_queue_default_attrs,
+ .namespace = rx_queue_namespace
+};
+
+static int rx_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+ "rx-%u", index);
+ if (error)
+ goto exit;
+
+ if (dev->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
+ if (error)
+ goto exit;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ dev_hold(queue->dev);
+
+ return error;
+exit:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = old_num; i < new_num; i++) {
+ error = rx_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ if (dev->sysfs_rx_queue_group)
+ sysfs_remove_group(&dev->_rx[i].kobj,
+ dev->sysfs_rx_queue_group);
+ kobject_put(&dev->_rx[i].kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+struct netdev_queue_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, char *buf);
+ ssize_t (*store)(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_netdev_queue_attr(_attr) container_of(_attr, \
+ struct netdev_queue_attribute, attr)
+
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ return attribute->show(queue, attribute, buf);
+}
+
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr);
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(queue, attribute, buf, count);
+}
+
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+ .show = netdev_queue_attr_show,
+ .store = netdev_queue_attr_store,
+};
+
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+#ifdef CONFIG_XPS
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++)
+ if (queue == &dev->_tx[i])
+ break;
+
+ BUG_ON(i >= dev->num_tx_queues);
+
+ return i;
+}
+
+static ssize_t show_tx_maxrate(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", queue->tx_maxrate);
+}
+
+static ssize_t set_tx_maxrate(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ int err, index = get_netdev_queue_index(queue);
+ u32 rate = 0;
+
+ err = kstrtou32(buf, 10, &rate);
+ if (err < 0)
+ return err;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ err = -EOPNOTSUPP;
+ if (dev->netdev_ops->ndo_set_tx_maxrate)
+ err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+
+ rtnl_unlock();
+ if (!err) {
+ queue->tx_maxrate = rate;
+ return len;
+ }
+ return err;
+}
+
+static struct netdev_queue_attribute queue_tx_maxrate =
+ __ATTR(tx_maxrate, S_IRUGO | S_IWUSR,
+ show_tx_maxrate, set_tx_maxrate);
+#endif
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ value = DQL_MAX_LIMIT;
+ else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute =
+ __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
+ bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute =
+ __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ char *buf) \
+{ \
+ return bql_show(buf, queue->dql.FIELD); \
+} \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return bql_set(buf, len, &queue->dql.FIELD); \
+} \
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
+ __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
+ bql_set_ ## NAME);
+
+BQL_ATTR(limit, limit)
+BQL_ATTR(limit_max, max_limit)
+BQL_ATTR(limit_min, min_limit)
+
+static struct attribute *dql_attrs[] = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
+
+static struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
+static ssize_t show_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ cpumask_var_t mask;
+ unsigned long index;
+ int i, len;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps);
+ if (dev_maps) {
+ for_each_possible_cpu(i) {
+ struct xps_map *map =
+ rcu_dereference(dev_maps->cpu_map[i]);
+ if (map) {
+ int j;
+ for (j = 0; j < map->len; j++) {
+ if (map->queues[j] == index) {
+ cpumask_set_cpu(i, mask);
+ break;
+ }
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ free_cpumask_var(mask);
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t store_xps_map(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct net_device *dev = queue->dev;
+ unsigned long index;
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
+ err = netif_set_xps_queue(dev, mask, index);
+
+ free_cpumask_var(mask);
+
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_cpus_attribute =
+ __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
+
+static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
+ &xps_cpus_attribute.attr,
+ &queue_tx_maxrate.attr,
+#endif
+ NULL
+};
+
+static void netdev_queue_release(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+
+ memset(kobj, 0, sizeof(*kobj));
+ dev_put(queue->dev);
+}
+
+static const void *netdev_queue_namespace(struct kobject *kobj)
+{
+ struct netdev_queue *queue = to_netdev_queue(kobj);
+ struct device *dev = &queue->dev->dev;
+ const void *ns = NULL;
+
+ if (dev->class && dev->class->ns_type)
+ ns = dev->class->namespace(dev);
+
+ return ns;
+}
+
+static struct kobj_type netdev_queue_ktype = {
+ .sysfs_ops = &netdev_queue_sysfs_ops,
+ .release = netdev_queue_release,
+ .default_attrs = netdev_queue_default_attrs,
+ .namespace = netdev_queue_namespace,
+};
+
+static int netdev_queue_add_kobject(struct net_device *dev, int index)
+{
+ struct netdev_queue *queue = dev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error = 0;
+
+ kobj->kset = dev->queues_kset;
+ error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+ "tx-%u", index);
+ if (error)
+ goto exit;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto exit;
+#endif
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ dev_hold(queue->dev);
+
+ return 0;
+exit:
+ kobject_put(kobj);
+ return error;
+}
+#endif /* CONFIG_SYSFS */
+
+int
+netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ int error = 0;
+
+ for (i = old_num; i < new_num; i++) {
+ error = netdev_queue_add_kobject(dev, i);
+ if (error) {
+ new_num = old_num;
+ break;
+ }
+ }
+
+ while (--i >= new_num) {
+ struct netdev_queue *queue = dev->_tx + i;
+
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
+static int register_queue_kobjects(struct net_device *dev)
+{
+ int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ dev->queues_kset = kset_create_and_add("queues",
+ NULL, &dev->dev.kobj);
+ if (!dev->queues_kset)
+ return -ENOMEM;
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ error = net_rx_queue_update_kobjects(dev, 0, real_rx);
+ if (error)
+ goto error;
+ rxq = real_rx;
+
+ error = netdev_queue_update_kobjects(dev, 0, real_tx);
+ if (error)
+ goto error;
+ txq = real_tx;
+
+ return 0;
+
+error:
+ netdev_queue_update_kobjects(dev, txq, 0);
+ net_rx_queue_update_kobjects(dev, rxq, 0);
+ return error;
+}
+
+static void remove_queue_kobjects(struct net_device *dev)
+{
+ int real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ real_rx = dev->real_num_rx_queues;
+#endif
+ real_tx = dev->real_num_tx_queues;
+
+ net_rx_queue_update_kobjects(dev, real_rx, 0);
+ netdev_queue_update_kobjects(dev, real_tx, 0);
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
+}
+
+static bool net_current_may_mount(void)
+{
+ struct net *net = current->nsproxy->net_ns;
+
+ return ns_capable(net->user_ns, CAP_SYS_ADMIN);
+}
+
+static void *net_grab_current_ns(void)
+{
+ struct net *ns = current->nsproxy->net_ns;
+#ifdef CONFIG_NET_NS
+ if (ns)
+ atomic_inc(&ns->passive);
+#endif
+ return ns;
+}
+
+static const void *net_initial_ns(void)
+{
+ return &init_net;
+}
+
+static const void *net_netlink_ns(struct sock *sk)
+{
+ return sock_net(sk);
+}
+
+struct kobj_ns_type_operations net_ns_type_operations = {
+ .type = KOBJ_NS_TYPE_NET,
+ .current_may_mount = net_current_may_mount,
+ .grab_current_ns = net_grab_current_ns,
+ .netlink_ns = net_netlink_ns,
+ .initial_ns = net_initial_ns,
+ .drop_ns = net_drop_ns,
+};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
+
+static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+{
+ struct net_device *dev = to_net_dev(d);
+ int retval;
+
+ /* pass interface to uevent. */
+ retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
+ if (retval)
+ goto exit;
+
+ /* pass ifindex to uevent.
+ * ifindex is useful as it won't change (interface name may change)
+ * and is what RtNetlink uses natively. */
+ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+
+exit:
+ return retval;
+}
+
+/*
+ * netdev_release -- destroy and free a dead device.
+ * Called when last reference to device kobject is gone.
+ */
+static void netdev_release(struct device *d)
+{
+ struct net_device *dev = to_net_dev(d);
+
+ BUG_ON(dev->reg_state != NETREG_RELEASED);
+
+ kfree(dev->ifalias);
+ netdev_freemem(dev);
+}
+
+static const void *net_namespace(struct device *d)
+{
+ struct net_device *dev;
+ dev = container_of(d, struct net_device, dev);
+ return dev_net(dev);
+}
+
+static struct class net_class = {
+ .name = "net",
+ .dev_release = netdev_release,
+ .dev_groups = net_class_groups,
+ .dev_uevent = netdev_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
+};
+
+#ifdef CONFIG_OF_NET
+static int of_dev_node_match(struct device *dev, const void *data)
+{
+ int ret = 0;
+
+ if (dev->parent)
+ ret = dev->parent->of_node == data;
+
+ return ret == 0 ? dev->of_node == data : ret;
+}
+
+struct net_device *of_find_net_device_by_node(struct device_node *np)
+{
+ struct device *dev;
+
+ dev = class_find_device(&net_class, NULL, np, of_dev_node_match);
+ if (!dev)
+ return NULL;
+
+ return to_net_dev(dev);
+}
+EXPORT_SYMBOL(of_find_net_device_by_node);
+#endif
+
+/* Delete sysfs entries but hold kobject reference until after all
+ * netdev references are gone.
+ */
+void netdev_unregister_kobject(struct net_device *ndev)
+{
+ struct device *dev = &(ndev->dev);
+
+ kobject_get(&dev->kobj);
+
+ remove_queue_kobjects(ndev);
+
+ pm_runtime_set_memalloc_noio(dev, false);
+
+ device_del(dev);
+}
+
+/* Create sysfs entries for network device. */
+int netdev_register_kobject(struct net_device *ndev)
+{
+ struct device *dev = &(ndev->dev);
+ const struct attribute_group **groups = ndev->sysfs_groups;
+ int error = 0;
+
+ device_initialize(dev);
+ dev->class = &net_class;
+ dev->platform_data = ndev;
+ dev->groups = groups;
+
+ dev_set_name(dev, "%s", ndev->name);
+
+#ifdef CONFIG_SYSFS
+ /* Allow for a device specific group */
+ if (*groups)
+ groups++;
+
+ *groups++ = &netstat_group;
+
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ *groups++ = &wireless_group;
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ else if (ndev->wireless_handlers)
+ *groups++ = &wireless_group;
+#endif
+#endif
+#endif /* CONFIG_SYSFS */
+
+ error = device_add(dev);
+ if (error)
+ return error;
+
+ error = register_queue_kobjects(ndev);
+ if (error) {
+ device_del(dev);
+ return error;
+ }
+
+ pm_runtime_set_memalloc_noio(dev, true);
+
+ return error;
+}
+
+int netdev_class_create_file_ns(struct class_attribute *class_attr,
+ const void *ns)
+{
+ return class_create_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_create_file_ns);
+
+void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+ const void *ns)
+{
+ class_remove_file_ns(&net_class, class_attr, ns);
+}
+EXPORT_SYMBOL(netdev_class_remove_file_ns);
+
+int __init netdev_kobject_init(void)
+{
+ kobj_ns_type_register(&net_ns_type_operations);
+ return class_register(&net_class);
+}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 000000000..2745a1b51
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,11 @@
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+int __init netdev_kobject_init(void);
+int netdev_register_kobject(struct net_device *);
+void netdev_unregister_kobject(struct net_device *);
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+ int old_num, int new_num);
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 000000000..ba3c01207
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,37 @@
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/slab.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/events/net.h>
+#include <trace/events/napi.h>
+#include <trace/events/sock.h>
+#include <trace/events/udp.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 000000000..572af0011
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,965 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/rculist.h>
+#include <linux/nsproxy.h>
+#include <linux/fs.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+#include <linux/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ * Our network namespace constructor/destructor lists
+ */
+
+static LIST_HEAD(pernet_list);
+static struct list_head *first_device = &pernet_list;
+DEFINE_MUTEX(net_mutex);
+
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+struct net init_net = {
+ .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+};
+EXPORT_SYMBOL(init_net);
+
+#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
+
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+static struct net_generic *net_alloc_generic(void)
+{
+ struct net_generic *ng;
+ size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->len = max_gen_ptrs;
+
+ return ng;
+}
+
+static int net_assign_generic(struct net *net, int id, void *data)
+{
+ struct net_generic *ng, *old_ng;
+
+ BUG_ON(!mutex_is_locked(&net_mutex));
+ BUG_ON(id == 0);
+
+ old_ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&net_mutex));
+ ng = old_ng;
+ if (old_ng->len >= id)
+ goto assign;
+
+ ng = net_alloc_generic();
+ if (ng == NULL)
+ return -ENOMEM;
+
+ /*
+ * Some synchronisation notes:
+ *
+ * The net_generic explores the net->gen array inside rcu
+ * read section. Besides once set the net->gen->ptr[x]
+ * pointer never changes (see rules in netns/generic.h).
+ *
+ * That said, we simply duplicate this array and schedule
+ * the old copy for kfree after a grace period.
+ */
+
+ memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+
+ rcu_assign_pointer(net->gen, ng);
+ kfree_rcu(old_ng, rcu);
+assign:
+ ng->ptr[id - 1] = data;
+ return 0;
+}
+
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+ int err = -ENOMEM;
+ void *data = NULL;
+
+ if (ops->id && ops->size) {
+ data = kzalloc(ops->size, GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ err = net_assign_generic(net, *ops->id, data);
+ if (err)
+ goto cleanup;
+ }
+ err = 0;
+ if (ops->init)
+ err = ops->init(net);
+ if (!err)
+ return 0;
+
+cleanup:
+ kfree(data);
+
+out:
+ return err;
+}
+
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+ if (ops->id && ops->size) {
+ int id = *ops->id;
+ kfree(net_generic(net, id));
+ }
+}
+
+static void ops_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->exit(net);
+ }
+ if (ops->exit_batch)
+ ops->exit_batch(net_exit_list);
+}
+
+static void ops_free_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+ if (ops->size && ops->id) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops_free(ops, net);
+ }
+}
+
+static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
+ int id);
+static int alloc_netid(struct net *net, struct net *peer, int reqid)
+{
+ int min = 0, max = 0, id;
+
+ ASSERT_RTNL();
+
+ if (reqid >= 0) {
+ min = reqid;
+ max = reqid + 1;
+ }
+
+ id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL);
+ if (id >= 0)
+ rtnl_net_notifyid(net, peer, RTM_NEWNSID, id);
+
+ return id;
+}
+
+/* This function is used by idr_for_each(). If net is equal to peer, the
+ * function returns the id so that idr_for_each() stops. Because we cannot
+ * returns the id 0 (idr_for_each() will not stop), we return the magic value
+ * NET_ID_ZERO (-1) for it.
+ */
+#define NET_ID_ZERO -1
+static int net_eq_idr(int id, void *net, void *peer)
+{
+ if (net_eq(net, peer))
+ return id ? : NET_ID_ZERO;
+ return 0;
+}
+
+static int __peernet2id(struct net *net, struct net *peer, bool alloc)
+{
+ int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
+
+ ASSERT_RTNL();
+
+ /* Magic value for id 0. */
+ if (id == NET_ID_ZERO)
+ return 0;
+ if (id > 0)
+ return id;
+
+ if (alloc)
+ return alloc_netid(net, peer, -1);
+
+ return -ENOENT;
+}
+
+/* This function returns the id of a peer netns. If no id is assigned, one will
+ * be allocated and returned.
+ */
+int peernet2id(struct net *net, struct net *peer)
+{
+ bool alloc = atomic_read(&peer->count) == 0 ? false : true;
+ int id;
+
+ id = __peernet2id(net, peer, alloc);
+ return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
+}
+EXPORT_SYMBOL(peernet2id);
+
+struct net *get_net_ns_by_id(struct net *net, int id)
+{
+ struct net *peer;
+
+ if (id < 0)
+ return NULL;
+
+ rcu_read_lock();
+ peer = idr_find(&net->netns_ids, id);
+ if (peer)
+ get_net(peer);
+ rcu_read_unlock();
+
+ return peer;
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+{
+ /* Must be called with net_mutex held */
+ const struct pernet_operations *ops, *saved_ops;
+ int error = 0;
+ LIST_HEAD(net_exit_list);
+
+ atomic_set(&net->count, 1);
+ atomic_set(&net->passive, 1);
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+ idr_init(&net->netns_ids);
+
+ list_for_each_entry(ops, &pernet_list, list) {
+ error = ops_init(ops, net);
+ if (error < 0)
+ goto out_undo;
+ }
+out:
+ return error;
+
+out_undo:
+ /* Walk through the list backwards calling the exit functions
+ * for the pernet modules whose init functions did not fail.
+ */
+ list_add(&net->exit_list, &net_exit_list);
+ saved_ops = ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ rcu_barrier();
+ goto out;
+}
+
+
+#ifdef CONFIG_NET_NS
+static struct kmem_cache *net_cachep;
+static struct workqueue_struct *netns_wq;
+
+static struct net *net_alloc(void)
+{
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+out_free:
+ kfree(ng);
+ goto out;
+}
+
+static void net_free(struct net *net)
+{
+ kfree(rcu_access_pointer(net->gen));
+ kmem_cache_free(net_cachep, net);
+}
+
+void net_drop_ns(void *p)
+{
+ struct net *ns = p;
+ if (ns && atomic_dec_and_test(&ns->passive))
+ net_free(ns);
+}
+
+struct net *copy_net_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct net *old_net)
+{
+ struct net *net;
+ int rv;
+
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+
+ net = net_alloc();
+ if (!net)
+ return ERR_PTR(-ENOMEM);
+
+ get_user_ns(user_ns);
+
+ mutex_lock(&net_mutex);
+ rv = setup_net(net, user_ns);
+ if (rv == 0) {
+ rtnl_lock();
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ rtnl_unlock();
+ }
+ mutex_unlock(&net_mutex);
+ if (rv < 0) {
+ put_user_ns(user_ns);
+ net_drop_ns(net);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
+
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
+
+static void cleanup_net(struct work_struct *work)
+{
+ const struct pernet_operations *ops;
+ struct net *net, *tmp;
+ struct list_head net_kill_list;
+ LIST_HEAD(net_exit_list);
+
+ /* Atomically snapshot the list of namespaces to cleanup */
+ spin_lock_irq(&cleanup_list_lock);
+ list_replace_init(&cleanup_list, &net_kill_list);
+ spin_unlock_irq(&cleanup_list_lock);
+
+ mutex_lock(&net_mutex);
+
+ /* Don't let anyone else find us. */
+ rtnl_lock();
+ list_for_each_entry(net, &net_kill_list, cleanup_list) {
+ list_del_rcu(&net->list);
+ list_add_tail(&net->exit_list, &net_exit_list);
+ for_each_net(tmp) {
+ int id = __peernet2id(tmp, net, false);
+
+ if (id >= 0) {
+ rtnl_net_notifyid(tmp, net, RTM_DELNSID, id);
+ idr_remove(&tmp->netns_ids, id);
+ }
+ }
+ idr_destroy(&net->netns_ids);
+
+ }
+ rtnl_unlock();
+
+ /*
+ * Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so
+ * the rcu_barrier() below isn't sufficient alone.
+ */
+ synchronize_rcu();
+
+ /* Run all of the network namespace exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_exit_list(ops, &net_exit_list);
+
+ /* Free the net generic variables */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_free_list(ops, &net_exit_list);
+
+ mutex_unlock(&net_mutex);
+
+ /* Ensure there are no outstanding rcu callbacks using this
+ * network namespace.
+ */
+ rcu_barrier();
+
+ /* Finally it is safe to free my network namespace structure */
+ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+ list_del_init(&net->exit_list);
+ put_user_ns(net->user_ns);
+ net_drop_ns(net);
+ }
+}
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
+
+void __put_net(struct net *net)
+{
+ /* Cleanup the network namespace in process context */
+ unsigned long flags;
+
+ spin_lock_irqsave(&cleanup_list_lock, flags);
+ list_add(&net->cleanup_list, &cleanup_list);
+ spin_unlock_irqrestore(&cleanup_list_lock, flags);
+
+ queue_work(netns_wq, &net_cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+struct net *get_net_ns_by_fd(int fd)
+{
+ struct file *file;
+ struct ns_common *ns;
+ struct net *net;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ ns = get_proc_ns(file_inode(file));
+ if (ns->ops == &netns_operations)
+ net = get_net(container_of(ns, struct net, ns));
+ else
+ net = ERR_PTR(-EINVAL);
+
+ fput(file);
+ return net;
+}
+
+#else
+struct net *get_net_ns_by_fd(int fd)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif
+EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
+
+struct net *get_net_ns_by_pid(pid_t pid)
+{
+ struct task_struct *tsk;
+ struct net *net;
+
+ /* Lookup the network namespace */
+ net = ERR_PTR(-ESRCH);
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk) {
+ struct nsproxy *nsproxy;
+ task_lock(tsk);
+ nsproxy = tsk->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(tsk);
+ }
+ rcu_read_unlock();
+ return net;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+
+static __net_init int net_ns_net_init(struct net *net)
+{
+#ifdef CONFIG_NET_NS
+ net->ns.ops = &netns_operations;
+#endif
+ return ns_alloc_inum(&net->ns);
+}
+
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+ ns_free_inum(&net->ns);
+}
+
+static struct pernet_operations __net_initdata net_ns_ops = {
+ .init = net_ns_net_init,
+ .exit = net_ns_net_exit,
+};
+
+static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+ [NETNSA_NONE] = { .type = NLA_UNSPEC },
+ [NETNSA_NSID] = { .type = NLA_S32 },
+ [NETNSA_PID] = { .type = NLA_U32 },
+ [NETNSA_FD] = { .type = NLA_U32 },
+};
+
+static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct net *peer;
+ int nsid, err;
+
+ err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+ rtnl_net_policy);
+ if (err < 0)
+ return err;
+ if (!tb[NETNSA_NSID])
+ return -EINVAL;
+ nsid = nla_get_s32(tb[NETNSA_NSID]);
+
+ if (tb[NETNSA_PID])
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ else if (tb[NETNSA_FD])
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ else
+ return -EINVAL;
+ if (IS_ERR(peer))
+ return PTR_ERR(peer);
+
+ if (__peernet2id(net, peer, false) >= 0) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = alloc_netid(net, peer, nsid);
+ if (err > 0)
+ err = 0;
+out:
+ put_net(peer);
+ return err;
+}
+
+static int rtnl_net_get_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+ ;
+}
+
+static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
+ int cmd, struct net *net, struct net *peer,
+ int nsid)
+{
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rth;
+ int id;
+
+ ASSERT_RTNL();
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rth = nlmsg_data(nlh);
+ rth->rtgen_family = AF_UNSPEC;
+
+ if (nsid >= 0) {
+ id = nsid;
+ } else {
+ id = __peernet2id(net, peer, false);
+ if (id < 0)
+ id = NETNSA_NSID_NOT_ASSIGNED;
+ }
+ if (nla_put_s32(skb, NETNSA_NSID, id))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NETNSA_MAX + 1];
+ struct sk_buff *msg;
+ struct net *peer;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
+ rtnl_net_policy);
+ if (err < 0)
+ return err;
+ if (tb[NETNSA_PID])
+ peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
+ else if (tb[NETNSA_FD])
+ peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
+ else
+ return -EINVAL;
+
+ if (IS_ERR(peer))
+ return PTR_ERR(peer);
+
+ msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ RTM_NEWNSID, net, peer, -1);
+ if (err < 0)
+ goto err_out;
+
+ err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
+ goto out;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ put_net(peer);
+ return err;
+}
+
+struct rtnl_net_dump_cb {
+ struct net *net;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx;
+ int s_idx;
+};
+
+static int rtnl_net_dumpid_one(int id, void *peer, void *data)
+{
+ struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
+ int ret;
+
+ if (net_cb->idx < net_cb->s_idx)
+ goto cont;
+
+ ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
+ net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWNSID, net_cb->net, peer, id);
+ if (ret < 0)
+ return ret;
+
+cont:
+ net_cb->idx++;
+ return 0;
+}
+
+static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rtnl_net_dump_cb net_cb = {
+ .net = net,
+ .skb = skb,
+ .cb = cb,
+ .idx = 0,
+ .s_idx = cb->args[0],
+ };
+
+ ASSERT_RTNL();
+
+ idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+
+ cb->args[0] = net_cb.idx;
+ return skb->len;
+}
+
+static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd,
+ int id)
+{
+ struct sk_buff *msg;
+ int err = -ENOMEM;
+
+ msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id);
+ if (err < 0)
+ goto err_out;
+
+ rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+ return;
+
+err_out:
+ nlmsg_free(msg);
+out:
+ rtnl_set_sk_err(net, RTNLGRP_NSID, err);
+}
+
+static int __init net_ns_init(void)
+{
+ struct net_generic *ng;
+
+#ifdef CONFIG_NET_NS
+ net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+ SMP_CACHE_BYTES,
+ SLAB_PANIC, NULL);
+
+ /* Create workqueue for cleanup */
+ netns_wq = create_singlethread_workqueue("netns");
+ if (!netns_wq)
+ panic("Could not create netns workq");
+#endif
+
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
+ mutex_lock(&net_mutex);
+ if (setup_net(&init_net, &init_user_ns))
+ panic("Could not setup the initial network namespace");
+
+ rtnl_lock();
+ list_add_tail_rcu(&init_net.list, &net_namespace_list);
+ rtnl_unlock();
+
+ mutex_unlock(&net_mutex);
+
+ register_pernet_subsys(&net_ns_ops);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
+ NULL);
+
+ return 0;
+}
+
+pure_initcall(net_ns_init);
+
+#ifdef CONFIG_NET_NS
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ struct net *net;
+ int error;
+ LIST_HEAD(net_exit_list);
+
+ list_add_tail(&ops->list, list);
+ if (ops->init || (ops->id && ops->size)) {
+ for_each_net(net) {
+ error = ops_init(ops, net);
+ if (error)
+ goto out_undo;
+ list_add_tail(&net->exit_list, &net_exit_list);
+ }
+ }
+ return 0;
+
+out_undo:
+ /* If I have an error cleanup all namespaces I initialized */
+ list_del(&ops->list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ return error;
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ struct net *net;
+ LIST_HEAD(net_exit_list);
+
+ list_del(&ops->list);
+ for_each_net(net)
+ list_add_tail(&net->exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+}
+
+#else
+
+static int __register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ return ops_init(ops, &init_net);
+}
+
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+}
+
+#endif /* CONFIG_NET_NS */
+
+static DEFINE_IDA(net_generic_ids);
+
+static int register_pernet_operations(struct list_head *list,
+ struct pernet_operations *ops)
+{
+ int error;
+
+ if (ops->id) {
+again:
+ error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ if (error < 0) {
+ if (error == -EAGAIN) {
+ ida_pre_get(&net_generic_ids, GFP_KERNEL);
+ goto again;
+ }
+ return error;
+ }
+ max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ }
+ error = __register_pernet_operations(list, ops);
+ if (error) {
+ rcu_barrier();
+ if (ops->id)
+ ida_remove(&net_generic_ids, *ops->id);
+ }
+
+ return error;
+}
+
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+
+ __unregister_pernet_operations(ops);
+ rcu_barrier();
+ if (ops->id)
+ ida_remove(&net_generic_ids, *ops->id);
+}
+
+/**
+ * register_pernet_subsys - register a network namespace subsystem
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a subsystem which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ error = register_pernet_operations(first_device, ops);
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ * unregister_pernet_subsys - unregister a network namespace subsystem
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ unregister_pernet_operations(ops);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
+
+/**
+ * register_pernet_device - register a network namespace device
+ * @ops: pernet operations structure for the subsystem
+ *
+ * Register a device which has init and exit functions
+ * that are called when network namespaces are created and
+ * destroyed respectively.
+ *
+ * When registered all network namespace init functions are
+ * called for every existing network namespace. Allowing kernel
+ * modules to have a race free view of the set of network namespaces.
+ *
+ * When a new network namespace is created all of the init
+ * methods are called in the order in which they were registered.
+ *
+ * When a network namespace is destroyed all of the exit methods
+ * are called in the reverse of the order with which they were
+ * registered.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+ int error;
+ mutex_lock(&net_mutex);
+ error = register_pernet_operations(&pernet_list, ops);
+ if (!error && (first_device == &pernet_list))
+ first_device = &ops->list;
+ mutex_unlock(&net_mutex);
+ return error;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/**
+ * unregister_pernet_device - unregister a network namespace netdevice
+ * @ops: pernet operations structure to manipulate
+ *
+ * Remove the pernet operations structure from the list to be
+ * used when network namespaces are created or destroyed. In
+ * addition run the exit method for all existing network
+ * namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+ mutex_lock(&net_mutex);
+ if (&ops->list == first_device)
+ first_device = first_device->next;
+ unregister_pernet_operations(ops);
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static struct ns_common *netns_get(struct task_struct *task)
+{
+ struct net *net = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy)
+ net = get_net(nsproxy->net_ns);
+ task_unlock(task);
+
+ return net ? &net->ns : NULL;
+}
+
+static inline struct net *to_net_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct net, ns);
+}
+
+static void netns_put(struct ns_common *ns)
+{
+ put_net(to_net_ns(ns));
+}
+
+static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct net *net = to_net_ns(ns);
+
+ if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ put_net(nsproxy->net_ns);
+ nsproxy->net_ns = get_net(net);
+ return 0;
+}
+
+const struct proc_ns_operations netns_operations = {
+ .name = "net",
+ .type = CLONE_NEWNET,
+ .get = netns_get,
+ .put = netns_put,
+ .install = netns_install,
+};
+#endif
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
new file mode 100644
index 000000000..1f2a126f4
--- /dev/null
+++ b/net/core/netclassid_cgroup.c
@@ -0,0 +1,111 @@
+/*
+ * net/core/netclassid_cgroup.c Classid Cgroupfs Handling
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fdtable.h>
+#include <net/cls_cgroup.h>
+#include <net/sock.h>
+
+static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
+}
+
+struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+ return css_cls_state(task_css(p, net_cls_cgrp_id));
+}
+EXPORT_SYMBOL_GPL(task_cls_state);
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_cls_state *cs;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ return &cs->css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ struct cgroup_cls_state *parent = css_cls_state(css->parent);
+
+ if (parent)
+ cs->classid = parent->classid;
+
+ return 0;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_cls_state(css));
+}
+
+static int update_classid(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+
+ if (sock)
+ sock->sk->sk_classid = (u32)(unsigned long)v;
+
+ return 0;
+}
+
+static void cgrp_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct cgroup_cls_state *cs = css_cls_state(css);
+ void *v = (void *)(unsigned long)cs->classid;
+ struct task_struct *p;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_classid, v);
+ task_unlock(p);
+ }
+}
+
+static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css_cls_state(css)->classid;
+}
+
+static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 value)
+{
+ css_cls_state(css)->classid = (u32) value;
+
+ return 0;
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = read_classid,
+ .write_u64 = write_classid,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_cls_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = cgrp_attach,
+ .legacy_cftypes = ss_files,
+};
diff --git a/net/core/netevent.c b/net/core/netevent.c
new file mode 100644
index 000000000..f17ccd291
--- /dev/null
+++ b/net/core/netevent.c
@@ -0,0 +1,70 @@
+/*
+ * Network event notifiers
+ *
+ * Authors:
+ * Tom Tucker <tom@opengridcomputing.com>
+ * Steve Wise <swise@opengridcomputing.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/netevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
+
+/**
+ * register_netevent_notifier - register a netevent notifier block
+ * @nb: notifier
+ *
+ * Register a notifier to be called when a netevent occurs.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ */
+int register_netevent_notifier(struct notifier_block *nb)
+{
+ int err;
+
+ err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
+
+/**
+ * netevent_unregister_notifier - unregister a netevent notifier block
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_neigh_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ */
+
+int unregister_netevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
+
+/**
+ * call_netevent_notifiers - call all netevent notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @v: pointer passed unmodified to notifier function
+ *
+ * Call all neighbour notifier blocks. Parameters and return value
+ * are as for notifier_call_chain().
+ */
+
+int call_netevent_notifiers(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
+}
+EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
new file mode 100644
index 000000000..c126a878c
--- /dev/null
+++ b/net/core/netpoll.c
@@ -0,0 +1,852 @@
+/*
+ * Common framework for low-level network console, dump, and debugger code
+ *
+ * Sep 8 2003 Matt Mackall <mpm@selenic.com>
+ *
+ * based on the netconsole code from:
+ *
+ * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Red Hat, Inc.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/if_vlan.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ip6_checksum.h>
+#include <asm/unaligned.h>
+#include <trace/events/napi.h>
+
+/*
+ * We maintain a small pool of fully-sized skbs, to make sure the
+ * message gets out even in extreme OOM situations.
+ */
+
+#define MAX_UDP_CHUNK 1460
+#define MAX_SKBS 32
+
+static struct sk_buff_head skb_pool;
+
+DEFINE_STATIC_SRCU(netpoll_srcu);
+
+#define USEC_PER_POLL 50
+
+#define MAX_SKB_SIZE \
+ (sizeof(struct ethhdr) + \
+ sizeof(struct iphdr) + \
+ sizeof(struct udphdr) + \
+ MAX_UDP_CHUNK)
+
+static void zap_completion_queue(void);
+static void netpoll_async_cleanup(struct work_struct *work);
+
+static unsigned int carrier_timeout = 4;
+module_param(carrier_timeout, uint, 0644);
+
+#define np_info(np, fmt, ...) \
+ pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_err(np, fmt, ...) \
+ pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_notice(np, fmt, ...) \
+ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+
+static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ int status = NETDEV_TX_OK;
+ netdev_features_t features;
+
+ features = netif_skb_features(skb);
+
+ if (skb_vlan_tag_present(skb) &&
+ !vlan_hw_offload_capable(features, skb->vlan_proto)) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb)) {
+ /* This is actually a packet drop, but we
+ * don't want the code that calls this
+ * function to try and operate on a NULL skb.
+ */
+ goto out;
+ }
+ }
+
+ status = netdev_start_xmit(skb, dev, txq, false);
+
+out:
+ return status;
+}
+
+static void queue_process(struct work_struct *work)
+{
+ struct netpoll_info *npinfo =
+ container_of(work, struct netpoll_info, tx_work.work);
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ while ((skb = skb_dequeue(&npinfo->txq))) {
+ struct net_device *dev = skb->dev;
+ struct netdev_queue *txq;
+
+ if (!netif_device_present(dev) || !netif_running(dev)) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_irq_save(flags);
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (netif_xmit_frozen_or_stopped(txq) ||
+ netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+ skb_queue_head(&npinfo->txq, skb);
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+
+ schedule_delayed_work(&npinfo->tx_work, HZ/10);
+ return;
+ }
+ HARD_TX_UNLOCK(dev, txq);
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * Check whether delayed processing was scheduled for our NIC. If so,
+ * we attempt to grab the poll lock and use ->poll() to pump the card.
+ * If this fails, either we've recursed in ->poll() or it's already
+ * running on another CPU.
+ *
+ * Note: we don't mask interrupts with this lock because we're using
+ * trylock here and interrupts are already disabled in the softirq
+ * case. Further, we test the poll_owner to avoid recursion on UP
+ * systems where the lock doesn't exist.
+ */
+static int poll_one_napi(struct napi_struct *napi, int budget)
+{
+ int work;
+
+ /* net_rx_action's ->poll() invocations and our's are
+ * synchronized by this test which is only made while
+ * holding the napi->poll_lock.
+ */
+ if (!test_bit(NAPI_STATE_SCHED, &napi->state))
+ return budget;
+
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+
+ work = napi->poll(napi, budget);
+ WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
+ trace_napi_poll(napi);
+
+ clear_bit(NAPI_STATE_NPSVC, &napi->state);
+
+ return budget - work;
+}
+
+static void poll_napi(struct net_device *dev, int budget)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner != smp_processor_id() &&
+ spin_trylock(&napi->poll_lock)) {
+ budget = poll_one_napi(napi, budget);
+ spin_unlock(&napi->poll_lock);
+ }
+ }
+}
+
+static void netpoll_poll_dev(struct net_device *dev)
+{
+ const struct net_device_ops *ops;
+ struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ int budget = 0;
+
+ /* Don't do any rx activity if the dev_lock mutex is held
+ * the dev_open/close paths use this to block netpoll activity
+ * while changing device state
+ */
+ if (down_trylock(&ni->dev_lock))
+ return;
+
+ if (!netif_running(dev)) {
+ up(&ni->dev_lock);
+ return;
+ }
+
+ ops = dev->netdev_ops;
+ if (!ops->ndo_poll_controller) {
+ up(&ni->dev_lock);
+ return;
+ }
+
+ /* Process pending work on NIC */
+ ops->ndo_poll_controller(dev);
+
+ poll_napi(dev, budget);
+
+ up(&ni->dev_lock);
+
+ zap_completion_queue();
+}
+
+void netpoll_poll_disable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ int idx;
+ might_sleep();
+ idx = srcu_read_lock(&netpoll_srcu);
+ ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ if (ni)
+ down(&ni->dev_lock);
+ srcu_read_unlock(&netpoll_srcu, idx);
+}
+EXPORT_SYMBOL(netpoll_poll_disable);
+
+void netpoll_poll_enable(struct net_device *dev)
+{
+ struct netpoll_info *ni;
+ rcu_read_lock();
+ ni = rcu_dereference(dev->npinfo);
+ if (ni)
+ up(&ni->dev_lock);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netpoll_poll_enable);
+
+static void refill_skbs(void)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skb_pool.lock, flags);
+ while (skb_pool.qlen < MAX_SKBS) {
+ skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
+ if (!skb)
+ break;
+
+ __skb_queue_tail(&skb_pool, skb);
+ }
+ spin_unlock_irqrestore(&skb_pool.lock, flags);
+}
+
+static void zap_completion_queue(void)
+{
+ unsigned long flags;
+ struct softnet_data *sd = &get_cpu_var(softnet_data);
+
+ if (sd->completion_queue) {
+ struct sk_buff *clist;
+
+ local_irq_save(flags);
+ clist = sd->completion_queue;
+ sd->completion_queue = NULL;
+ local_irq_restore(flags);
+
+ while (clist != NULL) {
+ struct sk_buff *skb = clist;
+ clist = clist->next;
+ if (!skb_irq_freeable(skb)) {
+ atomic_inc(&skb->users);
+ dev_kfree_skb_any(skb); /* put this one back */
+ } else {
+ __kfree_skb(skb);
+ }
+ }
+ }
+
+ put_cpu_var(softnet_data);
+}
+
+static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
+{
+ int count = 0;
+ struct sk_buff *skb;
+
+ zap_completion_queue();
+ refill_skbs();
+repeat:
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (!skb)
+ skb = skb_dequeue(&skb_pool);
+
+ if (!skb) {
+ if (++count < 10) {
+ netpoll_poll_dev(np->dev);
+ goto repeat;
+ }
+ return NULL;
+ }
+
+ atomic_set(&skb->users, 1);
+ skb_reserve(skb, reserve);
+ return skb;
+}
+
+static int netpoll_owner_active(struct net_device *dev)
+{
+ struct napi_struct *napi;
+
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (napi->poll_owner == smp_processor_id())
+ return 1;
+ }
+ return 0;
+}
+
+/* call with IRQ disabled */
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int status = NETDEV_TX_BUSY;
+ unsigned long tries;
+ /* It is up to the caller to keep npinfo alive. */
+ struct netpoll_info *npinfo;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ npinfo = rcu_dereference_bh(np->dev->npinfo);
+ if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+ dev_kfree_skb_irq(skb);
+ return;
+ }
+
+ /* don't get messages out of order, and no recursion */
+ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+ struct netdev_queue *txq;
+
+ txq = netdev_pick_tx(dev, skb, NULL);
+
+ /* try until next clock tick */
+ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+ tries > 0; --tries) {
+ if (HARD_TX_TRYLOCK(dev, txq)) {
+ if (!netif_xmit_stopped(txq))
+ status = netpoll_start_xmit(skb, dev, txq);
+
+ HARD_TX_UNLOCK(dev, txq);
+
+ if (status == NETDEV_TX_OK)
+ break;
+
+ }
+
+ /* tickle device maybe there is some cleanup */
+ netpoll_poll_dev(np->dev);
+
+ udelay(USEC_PER_POLL);
+ }
+
+ WARN_ONCE(!irqs_disabled(),
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
+ dev->name, dev->netdev_ops->ndo_start_xmit);
+
+ }
+
+ if (status != NETDEV_TX_OK) {
+ skb_queue_tail(&npinfo->txq, skb);
+ schedule_delayed_work(&npinfo->tx_work,0);
+ }
+}
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
+
+void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+{
+ int total_len, ip_len, udp_len;
+ struct sk_buff *skb;
+ struct udphdr *udph;
+ struct iphdr *iph;
+ struct ethhdr *eth;
+ static atomic_t ip_ident;
+ struct ipv6hdr *ip6h;
+
+ udp_len = len + sizeof(*udph);
+ if (np->ipv6)
+ ip_len = udp_len + sizeof(*ip6h);
+ else
+ ip_len = udp_len + sizeof(*iph);
+
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
+
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
+ if (!skb)
+ return;
+
+ skb_copy_to_linear_data(skb, msg, len);
+ skb_put(skb, len);
+
+ skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
+ udph->source = htons(np->local_port);
+ udph->dest = htons(np->remote_port);
+ udph->len = htons(udp_len);
+
+ if (np->ipv6) {
+ udph->check = 0;
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ put_unaligned(0x60, (unsigned char *)ip6h);
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
+ } else {
+ udph->check = 0;
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ put_unaligned(0x45, (unsigned char *)iph);
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &(iph->tot_len));
+ iph->id = htons(atomic_inc_return(&ip_ident));
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &(iph->saddr));
+ put_unaligned(np->remote_ip.ip, &(iph->daddr));
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth->h_proto = htons(ETH_P_IP);
+ }
+
+ ether_addr_copy(eth->h_source, np->dev->dev_addr);
+ ether_addr_copy(eth->h_dest, np->remote_mac);
+
+ skb->dev = np->dev;
+
+ netpoll_send_skb(np, skb);
+}
+EXPORT_SYMBOL(netpoll_send_udp);
+
+void netpoll_print_options(struct netpoll *np)
+{
+ np_info(np, "local port %d\n", np->local_port);
+ if (np->ipv6)
+ np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+ else
+ np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
+ np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "remote port %d\n", np->remote_port);
+ if (np->ipv6)
+ np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+ else
+ np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
+ np_info(np, "remote ethernet address %pM\n", np->remote_mac);
+}
+EXPORT_SYMBOL(netpoll_print_options);
+
+static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
+{
+ const char *end;
+
+ if (!strchr(str, ':') &&
+ in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
+ if (!*end)
+ return 0;
+ }
+ if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!*end)
+ return 1;
+#else
+ return -1;
+#endif
+ }
+ return -1;
+}
+
+int netpoll_parse_options(struct netpoll *np, char *opt)
+{
+ char *cur=opt, *delim;
+ int ipv6;
+ bool ipversion_set = false;
+
+ if (*cur != '@') {
+ if ((delim = strchr(cur, '@')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ if (kstrtou16(cur, 10, &np->local_port))
+ goto parse_failed;
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != '/') {
+ ipversion_set = true;
+ if ((delim = strchr(cur, '/')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != ',') {
+ /* parse out dev name */
+ if ((delim = strchr(cur, ',')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+ cur = delim;
+ }
+ cur++;
+
+ if (*cur != '@') {
+ /* dst port */
+ if ((delim = strchr(cur, '@')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ if (*cur == ' ' || *cur == '\t')
+ np_info(np, "warning: whitespace is not allowed\n");
+ if (kstrtou16(cur, 10, &np->remote_port))
+ goto parse_failed;
+ cur = delim;
+ }
+ cur++;
+
+ /* dst ip */
+ if ((delim = strchr(cur, '/')) == NULL)
+ goto parse_failed;
+ *delim = 0;
+ ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
+ if (ipv6 < 0)
+ goto parse_failed;
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
+ goto parse_failed;
+ else
+ np->ipv6 = (bool)ipv6;
+ cur = delim + 1;
+
+ if (*cur != 0) {
+ /* MAC address */
+ if (!mac_pton(cur, np->remote_mac))
+ goto parse_failed;
+ }
+
+ netpoll_print_options(np);
+
+ return 0;
+
+ parse_failed:
+ np_info(np, "couldn't parse config at '%s'!\n", cur);
+ return -1;
+}
+EXPORT_SYMBOL(netpoll_parse_options);
+
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
+{
+ struct netpoll_info *npinfo;
+ const struct net_device_ops *ops;
+ int err;
+
+ np->dev = ndev;
+ strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
+
+ if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+ !ndev->netdev_ops->ndo_poll_controller) {
+ np_err(np, "%s doesn't support polling, aborting\n",
+ np->dev_name);
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sema_init(&npinfo->dev_lock, 1);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+ atomic_set(&npinfo->refcnt, 1);
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
+ if (err)
+ goto free_npinfo;
+ }
+ } else {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ atomic_inc(&npinfo->refcnt);
+ }
+
+ npinfo->netpoll = np;
+
+ /* last thing to do is link it to the net device structure */
+ rcu_assign_pointer(ndev->npinfo, npinfo);
+
+ return 0;
+
+free_npinfo:
+ kfree(npinfo);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+ struct net_device *ndev = NULL;
+ struct in_device *in_dev;
+ int err;
+
+ rtnl_lock();
+ if (np->dev_name) {
+ struct net *net = current->nsproxy->net_ns;
+ ndev = __dev_get_by_name(net, np->dev_name);
+ }
+ if (!ndev) {
+ np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ err = -ENODEV;
+ goto unlock;
+ }
+ dev_hold(ndev);
+
+ if (netdev_master_upper_dev_get(ndev)) {
+ np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ err = -EBUSY;
+ goto put;
+ }
+
+ if (!netif_running(ndev)) {
+ unsigned long atmost, atleast;
+
+ np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
+
+ err = dev_open(ndev);
+
+ if (err) {
+ np_err(np, "failed to open %s\n", ndev->name);
+ goto put;
+ }
+
+ rtnl_unlock();
+ atleast = jiffies + HZ/10;
+ atmost = jiffies + carrier_timeout * HZ;
+ while (!netif_carrier_ok(ndev)) {
+ if (time_after(jiffies, atmost)) {
+ np_notice(np, "timeout waiting for carrier\n");
+ break;
+ }
+ msleep(1);
+ }
+
+ /* If carrier appears to come up instantly, we don't
+ * trust it and pause so that we don't pump all our
+ * queued console messages into the bitbucket.
+ */
+
+ if (time_before(jiffies, atleast)) {
+ np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
+ msleep(4000);
+ }
+ rtnl_lock();
+ }
+
+ if (!np->local_ip.ip) {
+ if (!np->ipv6) {
+ in_dev = __in_dev_get_rtnl(ndev);
+
+ if (!in_dev || !in_dev->ifa_list) {
+ np_err(np, "no IP address for %s, aborting\n",
+ np->dev_name);
+ err = -EDESTADDRREQ;
+ goto put;
+ }
+
+ np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *idev;
+
+ err = -EDESTADDRREQ;
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
+ continue;
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ np->dev_name);
+ goto put;
+ } else
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+#else
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ np->dev_name);
+ err = -EINVAL;
+ goto put;
+#endif
+ }
+ }
+
+ /* fill up the skb queue */
+ refill_skbs();
+
+ err = __netpoll_setup(np, ndev);
+ if (err)
+ goto put;
+
+ rtnl_unlock();
+ return 0;
+
+put:
+ dev_put(ndev);
+unlock:
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(netpoll_setup);
+
+static int __init netpoll_init(void)
+{
+ skb_queue_head_init(&skb_pool);
+ return 0;
+}
+core_initcall(netpoll_init);
+
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
+{
+ struct netpoll_info *npinfo =
+ container_of(rcu_head, struct netpoll_info, rcu);
+
+ skb_queue_purge(&npinfo->txq);
+
+ /* we can't call cancel_delayed_work_sync here, as we are in softirq */
+ cancel_delayed_work(&npinfo->tx_work);
+
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ /* now cancel it again */
+ cancel_delayed_work(&npinfo->tx_work);
+ kfree(npinfo);
+}
+
+void __netpoll_cleanup(struct netpoll *np)
+{
+ struct netpoll_info *npinfo;
+
+ /* rtnl_dereference would be preferable here but
+ * rcu_cleanup_netpoll path can put us in here safely without
+ * holding the rtnl, so plain rcu_dereference it is
+ */
+ npinfo = rtnl_dereference(np->dev->npinfo);
+ if (!npinfo)
+ return;
+
+ synchronize_srcu(&netpoll_srcu);
+
+ if (atomic_dec_and_test(&npinfo->refcnt)) {
+ const struct net_device_ops *ops;
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
+
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
+ } else
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
+
+static void netpoll_async_cleanup(struct work_struct *work)
+{
+ struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+
+ rtnl_lock();
+ __netpoll_cleanup(np);
+ rtnl_unlock();
+ kfree(np);
+}
+
+void __netpoll_free_async(struct netpoll *np)
+{
+ schedule_work(&np->cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__netpoll_free_async);
+
+void netpoll_cleanup(struct netpoll *np)
+{
+ rtnl_lock();
+ if (!np->dev)
+ goto out;
+ __netpoll_cleanup(np);
+ dev_put(np->dev);
+ np->dev = NULL;
+out:
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(netpoll_cleanup);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 000000000..cbd0a199b
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,288 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/fdtable.h>
+
+#define PRIOMAP_MIN_SZ 128
+
+/*
+ * Extend @dev->priomap so that it's large enough to accommodate
+ * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
+ * return. Must be called under rtnl lock.
+ */
+static int extend_netdev_table(struct net_device *dev, u32 target_idx)
+{
+ struct netprio_map *old, *new;
+ size_t new_sz, new_len;
+
+ /* is the existing priomap large enough? */
+ old = rtnl_dereference(dev->priomap);
+ if (old && old->priomap_len > target_idx)
+ return 0;
+
+ /*
+ * Determine the new size. Let's keep it power-of-two. We start
+ * from PRIOMAP_MIN_SZ and double it until it's large enough to
+ * accommodate @target_idx.
+ */
+ new_sz = PRIOMAP_MIN_SZ;
+ while (true) {
+ new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
+ sizeof(new->priomap[0]);
+ if (new_len > target_idx)
+ break;
+ new_sz *= 2;
+ /* overflowed? */
+ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
+ return -ENOSPC;
+ }
+
+ /* allocate & copy */
+ new = kzalloc(new_sz, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (old)
+ memcpy(new->priomap, old->priomap,
+ old->priomap_len * sizeof(old->priomap[0]));
+
+ new->priomap_len = new_len;
+
+ /* install the new priomap */
+ rcu_assign_pointer(dev->priomap, new);
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * netprio_prio - return the effective netprio of a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ *
+ * Should be called under RCU read or rtnl lock.
+ */
+static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
+{
+ struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
+ int id = css->cgroup->id;
+
+ if (map && id < map->priomap_len)
+ return map->priomap[id];
+ return 0;
+}
+
+/**
+ * netprio_set_prio - set netprio on a cgroup-net_device pair
+ * @css: css part of the target pair
+ * @dev: net_device part of the target pair
+ * @prio: prio to set
+ *
+ * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
+ * lock and may fail under memory pressure for non-zero @prio.
+ */
+static int netprio_set_prio(struct cgroup_subsys_state *css,
+ struct net_device *dev, u32 prio)
+{
+ struct netprio_map *map;
+ int id = css->cgroup->id;
+ int ret;
+
+ /* avoid extending priomap for zero writes */
+ map = rtnl_dereference(dev->priomap);
+ if (!prio && (!map || map->priomap_len <= id))
+ return 0;
+
+ ret = extend_netdev_table(dev, id);
+ if (ret)
+ return ret;
+
+ map = rtnl_dereference(dev->priomap);
+ map->priomap[id] = prio;
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kzalloc(sizeof(*css), GFP_KERNEL);
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static int cgrp_css_online(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *parent_css = css->parent;
+ struct net_device *dev;
+ int ret = 0;
+
+ if (!parent_css)
+ return 0;
+
+ rtnl_lock();
+ /*
+ * Inherit prios from the parent. As all prios are set during
+ * onlining, there is no need to clear them on offline.
+ */
+ for_each_netdev(&init_net, dev) {
+ u32 prio = netprio_prio(parent_css, dev);
+
+ ret = netprio_set_prio(css, dev, prio);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+ return ret;
+}
+
+static void cgrp_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return css->cgroup->id;
+}
+
+static int read_priomap(struct seq_file *sf, void *v)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev)
+ seq_printf(sf, "%s %u\n", dev->name,
+ netprio_prio(seq_css(sf), dev));
+ rcu_read_unlock();
+ return 0;
+}
+
+static ssize_t write_priomap(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ char devname[IFNAMSIZ + 1];
+ struct net_device *dev;
+ u32 prio;
+ int ret;
+
+ if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
+ return -EINVAL;
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (!dev)
+ return -ENODEV;
+
+ rtnl_lock();
+
+ ret = netprio_set_prio(of_css(of), dev, prio);
+
+ rtnl_unlock();
+ dev_put(dev);
+ return ret ?: nbytes;
+}
+
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+ int err;
+ struct socket *sock = sock_from_file(file, &err);
+ if (sock)
+ sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ return 0;
+}
+
+static void net_prio_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *p;
+ void *v = (void *)(unsigned long)css->cgroup->id;
+
+ cgroup_taskset_for_each(p, tset) {
+ task_lock(p);
+ iterate_fd(p->files, 0, update_netprio, v);
+ task_unlock(p);
+ }
+}
+
+static struct cftype ss_files[] = {
+ {
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ .name = "ifpriomap",
+ .seq_show = read_priomap,
+ .write = write_priomap,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys net_prio_cgrp_subsys = {
+ .css_alloc = cgrp_css_alloc,
+ .css_online = cgrp_css_online,
+ .css_free = cgrp_css_free,
+ .attach = net_prio_attach,
+ .legacy_cftypes = ss_files,
+};
+
+static int netprio_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netprio_map *old;
+
+ /*
+ * Note this is called with rtnl_lock held so we have update side
+ * protection on our rcu assignments
+ */
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ old = rtnl_dereference(dev->priomap);
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+static int __init init_cgroup_netprio(void)
+{
+ register_netdevice_notifier(&netprio_device_notifier);
+ return 0;
+}
+
+subsys_initcall(init_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
new file mode 100644
index 000000000..508155b28
--- /dev/null
+++ b/net/core/pktgen.c
@@ -0,0 +1,3862 @@
+/*
+ * Authors:
+ * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
+ * Uppsala University and
+ * Swedish University of Agricultural Sciences
+ *
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Ben Greear <greearb@candelatech.com>
+ * Jens Låås <jens.laas@data.slu.se>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * A tool for loading the network with preconfigurated packets.
+ * The tool is implemented as a linux module. Parameters are output
+ * device, delay (to hard_xmit), number of packets, and whether
+ * to use multiple SKBs or just the same one.
+ * pktgen uses the installed interface's output routine.
+ *
+ * Additional hacking by:
+ *
+ * Jens.Laas@data.slu.se
+ * Improved by ANK. 010120.
+ * Improved by ANK even more. 010212.
+ * MAC address typo fixed. 010417 --ro
+ * Integrated. 020301 --DaveM
+ * Added multiskb option 020301 --DaveM
+ * Scaling of results. 020417--sigurdur@linpro.no
+ * Significant re-work of the module:
+ * * Convert to threaded model to more efficiently be able to transmit
+ * and receive on multiple interfaces at once.
+ * * Converted many counters to __u64 to allow longer runs.
+ * * Allow configuration of ranges, like min/max IP address, MACs,
+ * and UDP-ports, for both source and destination, and can
+ * set to use a random distribution or sequentially walk the range.
+ * * Can now change most values after starting.
+ * * Place 12-byte packet in UDP payload with magic number,
+ * sequence number, and timestamp.
+ * * Add receiver code that detects dropped pkts, re-ordered pkts, and
+ * latencies (with micro-second) precision.
+ * * Add IOCTL interface to easily get counters & configuration.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * as a "fastpath" with a configurable number of clones after alloc's.
+ * clone_skb=0 means all packets are allocated this also means ranges time
+ * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clones.
+ *
+ * Also moved to /proc/net/pktgen/
+ * --ro
+ *
+ * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever
+ * mistakes. Also merged in DaveM's patch in the -pre6 patch.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
+ *
+ *
+ * 021124 Finished major redesign and rewrite for new functionality.
+ * See Documentation/networking/pktgen.txt for how to use this.
+ *
+ * The new operation:
+ * For each CPU one thread/process is created at start. This process checks
+ * for running devices in the if_list and sends packets until count is 0 it
+ * also the thread checks the thread->control which is used for inter-process
+ * communication. controlling process "posts" operations to the threads this
+ * way.
+ * The if_list is RCU protected, and the if_lock remains to protect updating
+ * of if_list, from "add_device" as it invoked from userspace (via proc write).
+ *
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple write accesses gives unpredictable result. Understood by "write"
+ * to /proc gives result code thats should be read be the "writer".
+ * For practical use this should be no problem.
+ *
+ * Note when adding devices to a specific CPU there good idea to also assign
+ * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
+ * --ro
+ *
+ * Fix refcount off by one if first packet fails, potential null deref,
+ * memleak 030710- KJP
+ *
+ * First "ranges" functionality for ipv6 030726 --ro
+ *
+ * Included flow support. 030802 ANK.
+ *
+ * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
+ *
+ * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
+ * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604
+ *
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * <shemminger@osdl.org> 040923
+ *
+ * Randy Dunlap fixed u64 printk compiler warning
+ *
+ * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org>
+ * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
+ *
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
+ *
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * 050103
+ *
+ * MPLS support by Steven Whitehouse <steve@chygwyn.com>
+ *
+ * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
+ *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sys.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/unistd.h>
+#include <linux/string.h>
+#include <linux/ptrace.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/hrtimer.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wait.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <net/net_namespace.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+#include <net/netns/generic.h>
+#include <asm/byteorder.h>
+#include <linux/rcupdate.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/uaccess.h>
+#include <asm/dma.h>
+#include <asm/div64.h> /* do_div */
+
+#define VERSION "2.74"
+#define IP_NAME_SZ 32
+#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+
+#define func_enter() pr_debug("entering %s\n", __func__);
+
+/* Device flag bits */
+#define F_IPSRC_RND (1<<0) /* IP-Src Random */
+#define F_IPDST_RND (1<<1) /* IP-Dst Random */
+#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */
+#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */
+#define F_MACSRC_RND (1<<4) /* MAC-Src Random */
+#define F_MACDST_RND (1<<5) /* MAC-Dst Random */
+#define F_TXSIZE_RND (1<<6) /* Transmit size is random */
+#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */
+#define F_MPLS_RND (1<<8) /* Random MPLS labels */
+#define F_VID_RND (1<<9) /* Random VLAN ID */
+#define F_SVID_RND (1<<10) /* Random SVLAN ID */
+#define F_FLOW_SEQ (1<<11) /* Sequential flows */
+#define F_IPSEC_ON (1<<12) /* ipsec on for flows */
+#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */
+#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
+#define F_NODE (1<<15) /* Node memory alloc*/
+#define F_UDPCSUM (1<<16) /* Include UDP checksum */
+#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */
+
+/* Thread control flag bits */
+#define T_STOP (1<<0) /* Stop run */
+#define T_RUN (1<<1) /* Start run */
+#define T_REMDEVALL (1<<2) /* Remove all devs */
+#define T_REMDEV (1<<3) /* Remove one dev */
+
+/* If lock -- protects updating of if_list */
+#define if_lock(t) spin_lock(&(t->if_lock));
+#define if_unlock(t) spin_unlock(&(t->if_lock));
+
+/* Used to help with determining the pkts on receive */
+#define PKTGEN_MAGIC 0xbe9be955
+#define PG_PROC_DIR "pktgen"
+#define PGCTRL "pgctrl"
+
+#define MAX_CFLOWS 65536
+
+#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
+#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+
+struct flow_state {
+ __be32 cur_daddr;
+ int count;
+#ifdef CONFIG_XFRM
+ struct xfrm_state *x;
+#endif
+ __u32 flags;
+};
+
+/* flow flag bits */
+#define F_INIT (1<<0) /* flow has been initialized */
+
+struct pktgen_dev {
+ /*
+ * Try to keep frequent/infrequent used vars. separated.
+ */
+ struct proc_dir_entry *entry; /* proc file */
+ struct pktgen_thread *pg_thread;/* the owner */
+ struct list_head list; /* chaining in the thread's run-queue */
+ struct rcu_head rcu; /* freed by RCU */
+
+ int running; /* if false, the test will stop */
+
+ /* If min != max, then we will either do a linear iteration, or
+ * we will do a random selection from within the range.
+ */
+ __u32 flags;
+ int removal_mark; /* non-zero => the device is marked for
+ * removal by worker thread */
+
+ int min_pkt_size;
+ int max_pkt_size;
+ int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
+ int nfrags;
+ struct page *page;
+ u64 delay; /* nano-seconds */
+
+ __u64 count; /* Default No packets to send */
+ __u64 sofar; /* How many pkts we've sent so far */
+ __u64 tx_bytes; /* How many bytes we've transmitted */
+ __u64 errors; /* Errors when trying to transmit, */
+
+ /* runtime counters relating to clone_skb */
+
+ __u64 allocated_skbs;
+ __u32 clone_count;
+ int last_ok; /* Was last skb sent?
+ * Or a failed transmit of some sort?
+ * This will keep sequence numbers in order
+ */
+ ktime_t next_tx;
+ ktime_t started_at;
+ ktime_t stopped_at;
+ u64 idle_acc; /* nano-seconds */
+
+ __u32 seq_num;
+
+ int clone_skb; /*
+ * Use multiple SKBs during packet gen.
+ * If this number is greater than 1, then
+ * that many copies of the same packet will be
+ * sent before a new packet is allocated.
+ * If you want to send 1024 identical packets
+ * before creating a new packet,
+ * set clone_skb to 1024.
+ */
+
+ char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+ char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */
+
+ struct in6_addr in6_saddr;
+ struct in6_addr in6_daddr;
+ struct in6_addr cur_in6_daddr;
+ struct in6_addr cur_in6_saddr;
+ /* For ranges */
+ struct in6_addr min_in6_daddr;
+ struct in6_addr max_in6_daddr;
+ struct in6_addr min_in6_saddr;
+ struct in6_addr max_in6_saddr;
+
+ /* If we're doing ranges, random or incremental, then this
+ * defines the min/max for those ranges.
+ */
+ __be32 saddr_min; /* inclusive, source IP address */
+ __be32 saddr_max; /* exclusive, source IP address */
+ __be32 daddr_min; /* inclusive, dest IP address */
+ __be32 daddr_max; /* exclusive, dest IP address */
+
+ __u16 udp_src_min; /* inclusive, source UDP port */
+ __u16 udp_src_max; /* exclusive, source UDP port */
+ __u16 udp_dst_min; /* inclusive, dest UDP port */
+ __u16 udp_dst_max; /* exclusive, dest UDP port */
+
+ /* DSCP + ECN */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ are for dscp codepoint */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ (see RFC 3260, sec. 4) */
+
+ /* MPLS */
+ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
+ __be32 labels[MAX_MPLS_LABELS];
+
+ /* VLAN/SVLAN (802.1Q/Q-in-Q) */
+ __u8 vlan_p;
+ __u8 vlan_cfi;
+ __u16 vlan_id; /* 0xffff means no vlan tag */
+
+ __u8 svlan_p;
+ __u8 svlan_cfi;
+ __u16 svlan_id; /* 0xffff means no svlan tag */
+
+ __u32 src_mac_count; /* How many MACs to iterate through */
+ __u32 dst_mac_count; /* How many MACs to iterate through */
+
+ unsigned char dst_mac[ETH_ALEN];
+ unsigned char src_mac[ETH_ALEN];
+
+ __u32 cur_dst_mac_offset;
+ __u32 cur_src_mac_offset;
+ __be32 cur_saddr;
+ __be32 cur_daddr;
+ __u16 ip_id;
+ __u16 cur_udp_dst;
+ __u16 cur_udp_src;
+ __u16 cur_queue_map;
+ __u32 cur_pkt_size;
+ __u32 last_pkt_size;
+
+ __u8 hh[14];
+ /* = {
+ 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+ We fill in SRC address later
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x08, 0x00
+ };
+ */
+ __u16 pad; /* pad out the hh struct to an even 16 bytes */
+
+ struct sk_buff *skb; /* skb we are to transmit next, used for when we
+ * are transmitting the same one multiple times
+ */
+ struct net_device *odev; /* The out-going device.
+ * Note that the device should have it's
+ * pg_info pointer pointing back to this
+ * device.
+ * Set when the user specifies the out-going
+ * device name (not when the inject is
+ * started as it used to do.)
+ */
+ char odevname[32];
+ struct flow_state *flows;
+ unsigned int cflows; /* Concurrent flows (config) */
+ unsigned int lflow; /* Flow length (config) */
+ unsigned int nflows; /* accumulated flows (stats) */
+ unsigned int curfl; /* current sequenced flow (state)*/
+
+ u16 queue_map_min;
+ u16 queue_map_max;
+ __u32 skb_priority; /* skb priority field */
+ unsigned int burst; /* number of duplicated packets to burst */
+ int node; /* Memory node */
+
+#ifdef CONFIG_XFRM
+ __u8 ipsmode; /* IPSEC mode (config) */
+ __u8 ipsproto; /* IPSEC type (config) */
+ __u32 spi;
+ struct dst_entry dst;
+ struct dst_ops dstops;
+#endif
+ char result[512];
+};
+
+struct pktgen_hdr {
+ __be32 pgh_magic;
+ __be32 seq_num;
+ __be32 tv_sec;
+ __be32 tv_usec;
+};
+
+
+static int pg_net_id __read_mostly;
+
+struct pktgen_net {
+ struct net *net;
+ struct proc_dir_entry *proc_dir;
+ struct list_head pktgen_threads;
+ bool pktgen_exiting;
+};
+
+struct pktgen_thread {
+ spinlock_t if_lock; /* for list of devices */
+ struct list_head if_list; /* All device here */
+ struct list_head th_list;
+ struct task_struct *tsk;
+ char result[512];
+
+ /* Field for thread to receive "posted" events terminate,
+ stop ifs etc. */
+
+ u32 control;
+ int cpu;
+
+ wait_queue_head_t queue;
+ struct completion start_done;
+ struct pktgen_net *net;
+};
+
+#define REMOVE 1
+#define FIND 0
+
+static const char version[] =
+ "Packet Generator for packet performance testing. "
+ "Version: " VERSION "\n";
+
+static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact);
+static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
+static void pktgen_run_all_threads(struct pktgen_net *pn);
+static void pktgen_reset_all_threads(struct pktgen_net *pn);
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+
+static void pktgen_stop(struct pktgen_thread *t);
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+
+/* Module parameters, defaults. */
+static int pg_count_d __read_mostly = 1000;
+static int pg_delay_d __read_mostly;
+static int pg_clone_skb_d __read_mostly;
+static int debug __read_mostly;
+
+static DEFINE_MUTEX(pktgen_thread_lock);
+
+static struct notifier_block pktgen_notifier_block = {
+ .notifier_call = pktgen_device_event,
+};
+
+/*
+ * /proc handling functions
+ *
+ */
+
+static int pgctrl_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, version);
+ return 0;
+}
+
+static ssize_t pgctrl_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char data[128];
+ struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (count == 0)
+ return -EINVAL;
+
+ if (count > sizeof(data))
+ count = sizeof(data);
+
+ if (copy_from_user(data, buf, count))
+ return -EFAULT;
+
+ data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
+
+ if (!strcmp(data, "stop"))
+ pktgen_stop_all_threads_ifs(pn);
+
+ else if (!strcmp(data, "start"))
+ pktgen_run_all_threads(pn);
+
+ else if (!strcmp(data, "reset"))
+ pktgen_reset_all_threads(pn);
+
+ else
+ pr_warn("Unknown command: %s\n", data);
+
+ return count;
+}
+
+static int pgctrl_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pgctrl_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_fops = {
+ .owner = THIS_MODULE,
+ .open = pgctrl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pgctrl_write,
+ .release = single_release,
+};
+
+static int pktgen_if_show(struct seq_file *seq, void *v)
+{
+ const struct pktgen_dev *pkt_dev = seq->private;
+ ktime_t stopped;
+ u64 idle;
+
+ seq_printf(seq,
+ "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n",
+ (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size);
+
+ seq_printf(seq,
+ " frags: %d delay: %llu clone_skb: %d ifname: %s\n",
+ pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
+ pkt_dev->clone_skb, pkt_dev->odevname);
+
+ seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows,
+ pkt_dev->lflow);
+
+ seq_printf(seq,
+ " queue_map_min: %u queue_map_max: %u\n",
+ pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
+
+ if (pkt_dev->skb_priority)
+ seq_printf(seq, " skb_priority: %u\n",
+ pkt_dev->skb_priority);
+
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq,
+ " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n"
+ " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n",
+ &pkt_dev->in6_saddr,
+ &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+ &pkt_dev->in6_daddr,
+ &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
+ } else {
+ seq_printf(seq,
+ " dst_min: %s dst_max: %s\n",
+ pkt_dev->dst_min, pkt_dev->dst_max);
+ seq_printf(seq,
+ " src_min: %s src_max: %s\n",
+ pkt_dev->src_min, pkt_dev->src_max);
+ }
+
+ seq_puts(seq, " src_mac: ");
+
+ seq_printf(seq, "%pM ",
+ is_zero_ether_addr(pkt_dev->src_mac) ?
+ pkt_dev->odev->dev_addr : pkt_dev->src_mac);
+
+ seq_puts(seq, "dst_mac: ");
+ seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
+
+ seq_printf(seq,
+ " udp_src_min: %d udp_src_max: %d"
+ " udp_dst_min: %d udp_dst_max: %d\n",
+ pkt_dev->udp_src_min, pkt_dev->udp_src_max,
+ pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
+
+ seq_printf(seq,
+ " src_mac_count: %d dst_mac_count: %d\n",
+ pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+
+ if (pkt_dev->nr_labels) {
+ unsigned int i;
+ seq_puts(seq, " mpls: ");
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
+ i == pkt_dev->nr_labels-1 ? "\n" : ", ");
+ }
+
+ if (pkt_dev->vlan_id != 0xffff)
+ seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->vlan_id, pkt_dev->vlan_p,
+ pkt_dev->vlan_cfi);
+
+ if (pkt_dev->svlan_id != 0xffff)
+ seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n",
+ pkt_dev->svlan_id, pkt_dev->svlan_p,
+ pkt_dev->svlan_cfi);
+
+ if (pkt_dev->tos)
+ seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos);
+
+ if (pkt_dev->traffic_class)
+ seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+
+ if (pkt_dev->burst > 1)
+ seq_printf(seq, " burst: %d\n", pkt_dev->burst);
+
+ if (pkt_dev->node >= 0)
+ seq_printf(seq, " node: %d\n", pkt_dev->node);
+
+ seq_puts(seq, " Flags: ");
+
+ if (pkt_dev->flags & F_IPV6)
+ seq_puts(seq, "IPV6 ");
+
+ if (pkt_dev->flags & F_IPSRC_RND)
+ seq_puts(seq, "IPSRC_RND ");
+
+ if (pkt_dev->flags & F_IPDST_RND)
+ seq_puts(seq, "IPDST_RND ");
+
+ if (pkt_dev->flags & F_TXSIZE_RND)
+ seq_puts(seq, "TXSIZE_RND ");
+
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ seq_puts(seq, "UDPSRC_RND ");
+
+ if (pkt_dev->flags & F_UDPDST_RND)
+ seq_puts(seq, "UDPDST_RND ");
+
+ if (pkt_dev->flags & F_UDPCSUM)
+ seq_puts(seq, "UDPCSUM ");
+
+ if (pkt_dev->flags & F_NO_TIMESTAMP)
+ seq_puts(seq, "NO_TIMESTAMP ");
+
+ if (pkt_dev->flags & F_MPLS_RND)
+ seq_puts(seq, "MPLS_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_RND)
+ seq_puts(seq, "QUEUE_MAP_RND ");
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ seq_puts(seq, "QUEUE_MAP_CPU ");
+
+ if (pkt_dev->cflows) {
+ if (pkt_dev->flags & F_FLOW_SEQ)
+ seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/
+ else
+ seq_puts(seq, "FLOW_RND ");
+ }
+
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON) {
+ seq_puts(seq, "IPSEC ");
+ if (pkt_dev->spi)
+ seq_printf(seq, "spi:%u", pkt_dev->spi);
+ }
+#endif
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ seq_puts(seq, "MACSRC_RND ");
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ seq_puts(seq, "MACDST_RND ");
+
+ if (pkt_dev->flags & F_VID_RND)
+ seq_puts(seq, "VID_RND ");
+
+ if (pkt_dev->flags & F_SVID_RND)
+ seq_puts(seq, "SVID_RND ");
+
+ if (pkt_dev->flags & F_NODE)
+ seq_puts(seq, "NODE_ALLOC ");
+
+ seq_puts(seq, "\n");
+
+ /* not really stopped, more like last-running-at */
+ stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at;
+ idle = pkt_dev->idle_acc;
+ do_div(idle, NSEC_PER_USEC);
+
+ seq_printf(seq,
+ "Current:\n pkts-sofar: %llu errors: %llu\n",
+ (unsigned long long)pkt_dev->sofar,
+ (unsigned long long)pkt_dev->errors);
+
+ seq_printf(seq,
+ " started: %lluus stopped: %lluus idle: %lluus\n",
+ (unsigned long long) ktime_to_us(pkt_dev->started_at),
+ (unsigned long long) ktime_to_us(stopped),
+ (unsigned long long) idle);
+
+ seq_printf(seq,
+ " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n",
+ pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
+ pkt_dev->cur_src_mac_offset);
+
+ if (pkt_dev->flags & F_IPV6) {
+ seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n",
+ &pkt_dev->cur_in6_saddr,
+ &pkt_dev->cur_in6_daddr);
+ } else
+ seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n",
+ &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
+
+ seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n",
+ pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+
+ seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
+ seq_printf(seq, " flows: %u\n", pkt_dev->nflows);
+
+ if (pkt_dev->result[0])
+ seq_printf(seq, "Result: %s\n", pkt_dev->result);
+ else
+ seq_puts(seq, "Result: Idle\n");
+
+ return 0;
+}
+
+
+static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
+ __u32 *num)
+{
+ int i = 0;
+ *num = 0;
+
+ for (; i < maxlen; i++) {
+ int value;
+ char c;
+ *num <<= 4;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ value = hex_to_bin(c);
+ if (value >= 0)
+ *num |= value;
+ else
+ break;
+ }
+ return i;
+}
+
+static int count_trail_chars(const char __user * user_buffer,
+ unsigned int maxlen)
+{
+ int i;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
+ case '\"':
+ case '\n':
+ case '\r':
+ case '\t':
+ case ' ':
+ case '=':
+ break;
+ default:
+ goto done;
+ }
+ }
+done:
+ return i;
+}
+
+static long num_arg(const char __user *user_buffer, unsigned long maxlen,
+ unsigned long *num)
+{
+ int i;
+ *num = 0;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ if ((c >= '0') && (c <= '9')) {
+ *num *= 10;
+ *num += c - '0';
+ } else
+ break;
+ }
+ return i;
+}
+
+static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+{
+ int i;
+
+ for (i = 0; i < maxlen; i++) {
+ char c;
+ if (get_user(c, &user_buffer[i]))
+ return -EFAULT;
+ switch (c) {
+ case '\"':
+ case '\n':
+ case '\r':
+ case '\t':
+ case ' ':
+ goto done_str;
+ default:
+ break;
+ }
+ }
+done_str:
+ return i;
+}
+
+static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+{
+ unsigned int n = 0;
+ char c;
+ ssize_t i = 0;
+ int len;
+
+ pkt_dev->nr_labels = 0;
+ do {
+ __u32 tmp;
+ len = hex32_arg(&buffer[i], 8, &tmp);
+ if (len <= 0)
+ return len;
+ pkt_dev->labels[n] = htonl(tmp);
+ if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
+ pkt_dev->flags |= F_MPLS_RND;
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ i++;
+ n++;
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+ } while (c == ',');
+
+ pkt_dev->nr_labels = n;
+ return i;
+}
+
+static ssize_t pktgen_if_write(struct file *file,
+ const char __user * user_buffer, size_t count,
+ loff_t * offset)
+{
+ struct seq_file *seq = file->private_data;
+ struct pktgen_dev *pkt_dev = seq->private;
+ int i, max, len;
+ char name[16], valstr[32];
+ unsigned long value = 0;
+ char *pg_result = NULL;
+ int tmp = 0;
+ char buf[128];
+
+ pg_result = &(pkt_dev->result[0]);
+
+ if (count < 1) {
+ pr_warn("wrong command format\n");
+ return -EINVAL;
+ }
+
+ max = count;
+ tmp = count_trail_chars(user_buffer, max);
+ if (tmp < 0) {
+ pr_warn("illegal format\n");
+ return tmp;
+ }
+ i = tmp;
+
+ /* Read variable name */
+
+ len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ if (len < 0)
+ return len;
+
+ memset(name, 0, sizeof(name));
+ if (copy_from_user(name, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ max = count - i;
+ len = count_trail_chars(&user_buffer[i], max);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (debug) {
+ size_t copy = min_t(size_t, count, 1023);
+ char tb[copy + 1];
+ if (copy_from_user(tb, user_buffer, copy))
+ return -EFAULT;
+ tb[copy] = 0;
+ pr_debug("%s,%lu buffer -:%s:-\n",
+ name, (unsigned long)count, tb);
+ }
+
+ if (!strcmp(name, "min_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: min_pkt_size=%u",
+ pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "max_pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->max_pkt_size) {
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: max_pkt_size=%u",
+ pkt_dev->max_pkt_size);
+ return count;
+ }
+
+ /* Shortcut for min = max */
+
+ if (!strcmp(name, "pkt_size")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value < 14 + 20 + 8)
+ value = 14 + 20 + 8;
+ if (value != pkt_dev->min_pkt_size) {
+ pkt_dev->min_pkt_size = value;
+ pkt_dev->max_pkt_size = value;
+ pkt_dev->cur_pkt_size = value;
+ }
+ sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "debug")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ debug = value;
+ sprintf(pg_result, "OK: debug=%u", debug);
+ return count;
+ }
+
+ if (!strcmp(name, "frags")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->nfrags = value;
+ sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ return count;
+ }
+ if (!strcmp(name, "delay")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value == 0x7FFFFFFF)
+ pkt_dev->delay = ULLONG_MAX;
+ else
+ pkt_dev->delay = (u64)value;
+
+ sprintf(pg_result, "OK: delay=%llu",
+ (unsigned long long) pkt_dev->delay);
+ return count;
+ }
+ if (!strcmp(name, "rate")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (!value)
+ return len;
+ pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "ratep")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (!value)
+ return len;
+ pkt_dev->delay = NSEC_PER_SEC/value;
+ if (debug)
+ pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+ sprintf(pg_result, "OK: rate=%lu", value);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_src_min) {
+ pkt_dev->udp_src_min = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_min")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_dst_min) {
+ pkt_dev->udp_dst_min = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
+ return count;
+ }
+ if (!strcmp(name, "udp_src_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_src_max) {
+ pkt_dev->udp_src_max = value;
+ pkt_dev->cur_udp_src = value;
+ }
+ sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
+ return count;
+ }
+ if (!strcmp(name, "udp_dst_max")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value != pkt_dev->udp_dst_max) {
+ pkt_dev->udp_dst_max = value;
+ pkt_dev->cur_udp_dst = value;
+ }
+ sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
+ return count;
+ }
+ if (!strcmp(name, "clone_skb")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+ if ((value > 0) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -ENOTSUPP;
+ i += len;
+ pkt_dev->clone_skb = value;
+
+ sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
+ return count;
+ }
+ if (!strcmp(name, "count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->count = value;
+ sprintf(pg_result, "OK: count=%llu",
+ (unsigned long long)pkt_dev->count);
+ return count;
+ }
+ if (!strcmp(name, "src_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (pkt_dev->src_mac_count != value) {
+ pkt_dev->src_mac_count = value;
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: src_mac_count=%d",
+ pkt_dev->src_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac_count")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (pkt_dev->dst_mac_count != value) {
+ pkt_dev->dst_mac_count = value;
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ sprintf(pg_result, "OK: dst_mac_count=%d",
+ pkt_dev->dst_mac_count);
+ return count;
+ }
+ if (!strcmp(name, "burst")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value > 1) &&
+ (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+ return -ENOTSUPP;
+ pkt_dev->burst = value < 1 ? 1 : value;
+ sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ return count;
+ }
+ if (!strcmp(name, "node")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (node_possible(value)) {
+ pkt_dev->node = value;
+ sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+ if (pkt_dev->page) {
+ put_page(pkt_dev->page);
+ pkt_dev->page = NULL;
+ }
+ }
+ else
+ sprintf(pg_result, "ERROR: node not possible");
+ return count;
+ }
+ if (!strcmp(name, "flag")) {
+ char f[32];
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+ if (strcmp(f, "IPSRC_RND") == 0)
+ pkt_dev->flags |= F_IPSRC_RND;
+
+ else if (strcmp(f, "!IPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_IPSRC_RND;
+
+ else if (strcmp(f, "TXSIZE_RND") == 0)
+ pkt_dev->flags |= F_TXSIZE_RND;
+
+ else if (strcmp(f, "!TXSIZE_RND") == 0)
+ pkt_dev->flags &= ~F_TXSIZE_RND;
+
+ else if (strcmp(f, "IPDST_RND") == 0)
+ pkt_dev->flags |= F_IPDST_RND;
+
+ else if (strcmp(f, "!IPDST_RND") == 0)
+ pkt_dev->flags &= ~F_IPDST_RND;
+
+ else if (strcmp(f, "UDPSRC_RND") == 0)
+ pkt_dev->flags |= F_UDPSRC_RND;
+
+ else if (strcmp(f, "!UDPSRC_RND") == 0)
+ pkt_dev->flags &= ~F_UDPSRC_RND;
+
+ else if (strcmp(f, "UDPDST_RND") == 0)
+ pkt_dev->flags |= F_UDPDST_RND;
+
+ else if (strcmp(f, "!UDPDST_RND") == 0)
+ pkt_dev->flags &= ~F_UDPDST_RND;
+
+ else if (strcmp(f, "MACSRC_RND") == 0)
+ pkt_dev->flags |= F_MACSRC_RND;
+
+ else if (strcmp(f, "!MACSRC_RND") == 0)
+ pkt_dev->flags &= ~F_MACSRC_RND;
+
+ else if (strcmp(f, "MACDST_RND") == 0)
+ pkt_dev->flags |= F_MACDST_RND;
+
+ else if (strcmp(f, "!MACDST_RND") == 0)
+ pkt_dev->flags &= ~F_MACDST_RND;
+
+ else if (strcmp(f, "MPLS_RND") == 0)
+ pkt_dev->flags |= F_MPLS_RND;
+
+ else if (strcmp(f, "!MPLS_RND") == 0)
+ pkt_dev->flags &= ~F_MPLS_RND;
+
+ else if (strcmp(f, "VID_RND") == 0)
+ pkt_dev->flags |= F_VID_RND;
+
+ else if (strcmp(f, "!VID_RND") == 0)
+ pkt_dev->flags &= ~F_VID_RND;
+
+ else if (strcmp(f, "SVID_RND") == 0)
+ pkt_dev->flags |= F_SVID_RND;
+
+ else if (strcmp(f, "!SVID_RND") == 0)
+ pkt_dev->flags &= ~F_SVID_RND;
+
+ else if (strcmp(f, "FLOW_SEQ") == 0)
+ pkt_dev->flags |= F_FLOW_SEQ;
+
+ else if (strcmp(f, "QUEUE_MAP_RND") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_RND;
+
+ else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags |= F_QUEUE_MAP_CPU;
+
+ else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
+ pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
+#ifdef CONFIG_XFRM
+ else if (strcmp(f, "IPSEC") == 0)
+ pkt_dev->flags |= F_IPSEC_ON;
+#endif
+
+ else if (strcmp(f, "!IPV6") == 0)
+ pkt_dev->flags &= ~F_IPV6;
+
+ else if (strcmp(f, "NODE_ALLOC") == 0)
+ pkt_dev->flags |= F_NODE;
+
+ else if (strcmp(f, "!NODE_ALLOC") == 0)
+ pkt_dev->flags &= ~F_NODE;
+
+ else if (strcmp(f, "UDPCSUM") == 0)
+ pkt_dev->flags |= F_UDPCSUM;
+
+ else if (strcmp(f, "!UDPCSUM") == 0)
+ pkt_dev->flags &= ~F_UDPCSUM;
+
+ else if (strcmp(f, "NO_TIMESTAMP") == 0)
+ pkt_dev->flags |= F_NO_TIMESTAMP;
+
+ else {
+ sprintf(pg_result,
+ "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
+ f,
+ "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
+ "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
+ "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
+ "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
+ "NO_TIMESTAMP, "
+#ifdef CONFIG_XFRM
+ "IPSEC, "
+#endif
+ "NODE_ALLOC\n");
+ return count;
+ }
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+ return count;
+ }
+ if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_min) != 0) {
+ memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
+ strncpy(pkt_dev->dst_min, buf, len);
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ }
+ if (debug)
+ pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
+ i += len;
+ sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
+ return count;
+ }
+ if (!strcmp(name, "dst_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ if (len < 0)
+ return len;
+
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->dst_max) != 0) {
+ memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
+ strncpy(pkt_dev->dst_max, buf, len);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ pkt_dev->cur_daddr = pkt_dev->daddr_max;
+ }
+ if (debug)
+ pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
+ i += len;
+ sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
+ return count;
+ }
+ if (!strcmp(name, "dst6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
+
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
+
+ if (debug)
+ pr_debug("dst6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_min")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
+
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
+ if (debug)
+ pr_debug("dst6_min set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_min=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "dst6_max")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
+
+ if (debug)
+ pr_debug("dst6_max set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: dst6_max=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src6")) {
+ len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ if (len < 0)
+ return len;
+
+ pkt_dev->flags |= F_IPV6;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+
+ in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
+ snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
+
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
+
+ if (debug)
+ pr_debug("src6 set to: %s\n", buf);
+
+ i += len;
+ sprintf(pg_result, "OK: src6=%s", buf);
+ return count;
+ }
+ if (!strcmp(name, "src_min")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_min) != 0) {
+ memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
+ strncpy(pkt_dev->src_min, buf, len);
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ }
+ if (debug)
+ pr_debug("src_min set to: %s\n", pkt_dev->src_min);
+ i += len;
+ sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
+ return count;
+ }
+ if (!strcmp(name, "src_max")) {
+ len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ if (len < 0)
+ return len;
+
+ if (copy_from_user(buf, &user_buffer[i], len))
+ return -EFAULT;
+ buf[len] = 0;
+ if (strcmp(buf, pkt_dev->src_max) != 0) {
+ memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
+ strncpy(pkt_dev->src_max, buf, len);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ pkt_dev->cur_saddr = pkt_dev->saddr_max;
+ }
+ if (debug)
+ pr_debug("src_max set to: %s\n", pkt_dev->src_max);
+ i += len;
+ sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
+ return count;
+ }
+ if (!strcmp(name, "dst_mac")) {
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0)
+ return len;
+
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+
+ if (!mac_pton(valstr, pkt_dev->dst_mac))
+ return -EINVAL;
+ /* Set up Dest MAC */
+ ether_addr_copy(&pkt_dev->hh[0], pkt_dev->dst_mac);
+
+ sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
+ return count;
+ }
+ if (!strcmp(name, "src_mac")) {
+ len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ if (len < 0)
+ return len;
+
+ memset(valstr, 0, sizeof(valstr));
+ if (copy_from_user(valstr, &user_buffer[i], len))
+ return -EFAULT;
+
+ if (!mac_pton(valstr, pkt_dev->src_mac))
+ return -EINVAL;
+ /* Set up Src MAC */
+ ether_addr_copy(&pkt_dev->hh[6], pkt_dev->src_mac);
+
+ sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
+ return count;
+ }
+
+ if (!strcmp(name, "clear_counters")) {
+ pktgen_clear_counters(pkt_dev);
+ sprintf(pg_result, "OK: Clearing counters.\n");
+ return count;
+ }
+
+ if (!strcmp(name, "flows")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value > MAX_CFLOWS)
+ value = MAX_CFLOWS;
+
+ pkt_dev->cflows = value;
+ sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
+ return count;
+ }
+#ifdef CONFIG_XFRM
+ if (!strcmp(name, "spi")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->spi = value;
+ sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
+ return count;
+ }
+#endif
+ if (!strcmp(name, "flowlen")) {
+ len = num_arg(&user_buffer[i], 10, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->lflow = value;
+ sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_min")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_min = value;
+ sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+ return count;
+ }
+
+ if (!strcmp(name, "queue_map_max")) {
+ len = num_arg(&user_buffer[i], 5, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->queue_map_max = value;
+ sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+ return count;
+ }
+
+ if (!strcmp(name, "mpls")) {
+ unsigned int n, cnt;
+
+ len = get_labels(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+ i += len;
+ cnt = sprintf(pg_result, "OK: mpls=");
+ for (n = 0; n < pkt_dev->nr_labels; n++)
+ cnt += sprintf(pg_result + cnt,
+ "%08x%s", ntohl(pkt_dev->labels[n]),
+ n == pkt_dev->nr_labels-1 ? "" : ",");
+
+ if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN auto turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (value <= 4095) {
+ pkt_dev->vlan_id = value; /* turn on VLAN */
+
+ if (debug)
+ pr_debug("VLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_p = value;
+ sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "vlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_cfi = value;
+ sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: vlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_id")) {
+ len = num_arg(&user_buffer[i], 4, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
+ pkt_dev->svlan_id = value; /* turn on SVLAN */
+
+ if (debug)
+ pr_debug("SVLAN turned on\n");
+
+ if (debug && pkt_dev->nr_labels)
+ pr_debug("MPLS auto turned off\n");
+
+ pkt_dev->nr_labels = 0; /* turn off MPLS */
+ sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
+ } else {
+ pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+ pkt_dev->svlan_id = 0xffff;
+
+ if (debug)
+ pr_debug("VLAN/SVLAN turned off\n");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_p")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_p = value;
+ sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_p must be 0-7");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "svlan_cfi")) {
+ len = num_arg(&user_buffer[i], 1, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_cfi = value;
+ sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
+ } else {
+ sprintf(pg_result, "ERROR: svlan_cfi must be 0-1");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "tos")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->tos = tmp_value;
+ sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
+ } else {
+ sprintf(pg_result, "ERROR: tos must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "traffic_class")) {
+ __u32 tmp_value = 0;
+ len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ if (len == 2) {
+ pkt_dev->traffic_class = tmp_value;
+ sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
+ } else {
+ sprintf(pg_result, "ERROR: traffic_class must be 00-ff");
+ }
+ return count;
+ }
+
+ if (!strcmp(name, "skb_priority")) {
+ len = num_arg(&user_buffer[i], 9, &value);
+ if (len < 0)
+ return len;
+
+ i += len;
+ pkt_dev->skb_priority = value;
+ sprintf(pg_result, "OK: skb_priority=%i",
+ pkt_dev->skb_priority);
+ return count;
+ }
+
+ sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
+ return -EINVAL;
+}
+
+static int pktgen_if_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pktgen_if_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_if_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_if_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_if_write,
+ .release = single_release,
+};
+
+static int pktgen_thread_show(struct seq_file *seq, void *v)
+{
+ struct pktgen_thread *t = seq->private;
+ const struct pktgen_dev *pkt_dev;
+
+ BUG_ON(!t);
+
+ seq_puts(seq, "Running: ");
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
+
+ seq_puts(seq, "\nStopped: ");
+
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (!pkt_dev->running)
+ seq_printf(seq, "%s ", pkt_dev->odevname);
+
+ if (t->result[0])
+ seq_printf(seq, "\nResult: %s\n", t->result);
+ else
+ seq_puts(seq, "\nResult: NA\n");
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static ssize_t pktgen_thread_write(struct file *file,
+ const char __user * user_buffer,
+ size_t count, loff_t * offset)
+{
+ struct seq_file *seq = file->private_data;
+ struct pktgen_thread *t = seq->private;
+ int i, max, len, ret;
+ char name[40];
+ char *pg_result;
+
+ if (count < 1) {
+ // sprintf(pg_result, "Wrong command format");
+ return -EINVAL;
+ }
+
+ max = count;
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0)
+ return len;
+
+ i = len;
+
+ /* Read variable name */
+
+ len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ if (len < 0)
+ return len;
+
+ memset(name, 0, sizeof(name));
+ if (copy_from_user(name, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+
+ max = count - i;
+ len = count_trail_chars(&user_buffer[i], max);
+ if (len < 0)
+ return len;
+
+ i += len;
+
+ if (debug)
+ pr_debug("t=%s, count=%lu\n", name, (unsigned long)count);
+
+ if (!t) {
+ pr_err("ERROR: No thread\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pg_result = &(t->result[0]);
+
+ if (!strcmp(name, "add_device")) {
+ char f[32];
+ memset(f, 0, 32);
+ len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ if (len < 0) {
+ ret = len;
+ goto out;
+ }
+ if (copy_from_user(f, &user_buffer[i], len))
+ return -EFAULT;
+ i += len;
+ mutex_lock(&pktgen_thread_lock);
+ ret = pktgen_add_device(t, f);
+ mutex_unlock(&pktgen_thread_lock);
+ if (!ret) {
+ ret = count;
+ sprintf(pg_result, "OK: add_device=%s", f);
+ } else
+ sprintf(pg_result, "ERROR: can not add device %s", f);
+ goto out;
+ }
+
+ if (!strcmp(name, "rem_device_all")) {
+ mutex_lock(&pktgen_thread_lock);
+ t->control |= T_REMDEVALL;
+ mutex_unlock(&pktgen_thread_lock);
+ schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */
+ ret = count;
+ sprintf(pg_result, "OK: rem_device_all");
+ goto out;
+ }
+
+ if (!strcmp(name, "max_before_softirq")) {
+ sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
+ ret = count;
+ goto out;
+ }
+
+ ret = -EINVAL;
+out:
+ return ret;
+}
+
+static int pktgen_thread_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pktgen_thread_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pktgen_thread_fops = {
+ .owner = THIS_MODULE,
+ .open = pktgen_thread_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = pktgen_thread_write,
+ .release = single_release,
+};
+
+/* Think find or remove for NN */
+static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
+ const char *ifname, int remove)
+{
+ struct pktgen_thread *t;
+ struct pktgen_dev *pkt_dev = NULL;
+ bool exact = (remove == FIND);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ pkt_dev = pktgen_find_dev(t, ifname, exact);
+ if (pkt_dev) {
+ if (remove) {
+ pkt_dev->removal_mark = 1;
+ t->control |= T_REMDEV;
+ }
+ break;
+ }
+ }
+ return pkt_dev;
+}
+
+/*
+ * mark a device for removal
+ */
+static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname)
+{
+ struct pktgen_dev *pkt_dev = NULL;
+ const int max_tries = 10, msec_per_try = 125;
+ int i = 0;
+
+ mutex_lock(&pktgen_thread_lock);
+ pr_debug("%s: marking %s for removal\n", __func__, ifname);
+
+ while (1) {
+
+ pkt_dev = __pktgen_NN_threads(pn, ifname, REMOVE);
+ if (pkt_dev == NULL)
+ break; /* success */
+
+ mutex_unlock(&pktgen_thread_lock);
+ pr_debug("%s: waiting for %s to disappear....\n",
+ __func__, ifname);
+ schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+ mutex_lock(&pktgen_thread_lock);
+
+ if (++i >= max_tries) {
+ pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+ __func__, msec_per_try * i, ifname);
+ break;
+ }
+
+ }
+
+ mutex_unlock(&pktgen_thread_lock);
+}
+
+static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *dev)
+{
+ struct pktgen_thread *t;
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ struct pktgen_dev *pkt_dev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ if (pkt_dev->odev != dev)
+ continue;
+
+ proc_remove(pkt_dev->entry);
+
+ pkt_dev->entry = proc_create_data(dev->name, 0600,
+ pn->proc_dir,
+ &pktgen_if_fops,
+ pkt_dev);
+ if (!pkt_dev->entry)
+ pr_err("can't move proc entry for '%s'\n",
+ dev->name);
+ break;
+ }
+ rcu_read_unlock();
+ }
+}
+
+static int pktgen_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id);
+
+ if (pn->pktgen_exiting)
+ return NOTIFY_DONE;
+
+ /* It is OK that we do not hold the group lock right now,
+ * as we run under the RTNL lock.
+ */
+
+ switch (event) {
+ case NETDEV_CHANGENAME:
+ pktgen_change_name(pn, dev);
+ break;
+
+ case NETDEV_UNREGISTER:
+ pktgen_mark_device(pn, dev->name);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct net_device *pktgen_dev_get_by_name(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev,
+ const char *ifname)
+{
+ char b[IFNAMSIZ+5];
+ int i;
+
+ for (i = 0; ifname[i] != '@'; i++) {
+ if (i == IFNAMSIZ)
+ break;
+
+ b[i] = ifname[i];
+ }
+ b[i] = 0;
+
+ return dev_get_by_name(pn->net, b);
+}
+
+
+/* Associate pktgen_dev with a device. */
+
+static int pktgen_setup_dev(const struct pktgen_net *pn,
+ struct pktgen_dev *pkt_dev, const char *ifname)
+{
+ struct net_device *odev;
+ int err;
+
+ /* Clean old setups */
+ if (pkt_dev->odev) {
+ dev_put(pkt_dev->odev);
+ pkt_dev->odev = NULL;
+ }
+
+ odev = pktgen_dev_get_by_name(pn, pkt_dev, ifname);
+ if (!odev) {
+ pr_err("no such netdevice: \"%s\"\n", ifname);
+ return -ENODEV;
+ }
+
+ if (odev->type != ARPHRD_ETHER) {
+ pr_err("not an ethernet device: \"%s\"\n", ifname);
+ err = -EINVAL;
+ } else if (!netif_running(odev)) {
+ pr_err("device is down: \"%s\"\n", ifname);
+ err = -ENETDOWN;
+ } else {
+ pkt_dev->odev = odev;
+ return 0;
+ }
+
+ dev_put(odev);
+ return err;
+}
+
+/* Read pkt_dev from the interface and set up internal pktgen_dev
+ * structure to have the right information to create/send packets
+ */
+static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
+{
+ int ntxq;
+
+ if (!pkt_dev->odev) {
+ pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
+ sprintf(pkt_dev->result,
+ "ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+ return;
+ }
+
+ /* make sure that we don't pick a non-existing transmit queue */
+ ntxq = pkt_dev->odev->real_num_tx_queues;
+
+ if (ntxq <= pkt_dev->queue_map_min) {
+ pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
+ }
+ if (pkt_dev->queue_map_max >= ntxq) {
+ pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+ pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+ pkt_dev->odevname);
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
+ }
+
+ /* Default to the interface's mac if not explicitly set. */
+
+ if (is_zero_ether_addr(pkt_dev->src_mac))
+ ether_addr_copy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr);
+
+ /* Set up Dest MAC */
+ ether_addr_copy(&(pkt_dev->hh[0]), pkt_dev->dst_mac);
+
+ if (pkt_dev->flags & F_IPV6) {
+ int i, set = 0, err = 1;
+ struct inet6_dev *idev;
+
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
+ set = 1;
+ break;
+ }
+
+ if (!set) {
+
+ /*
+ * Use linklevel address if unconfigured.
+ *
+ * use ipv6_get_lladdr if/when it's get exported
+ */
+
+ rcu_read_lock();
+ idev = __in6_dev_get(pkt_dev->odev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if ((ifp->scope & IFA_LINK) &&
+ !(ifp->flags & IFA_F_TENTATIVE)) {
+ pkt_dev->cur_in6_saddr = ifp->addr;
+ err = 0;
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ if (err)
+ pr_err("ERROR: IPv6 link address not available\n");
+ }
+ } else {
+ if (pkt_dev->min_pkt_size == 0) {
+ pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+ + sizeof(struct udphdr)
+ + sizeof(struct pktgen_hdr)
+ + pkt_dev->pkt_overhead;
+ }
+
+ pkt_dev->saddr_min = 0;
+ pkt_dev->saddr_max = 0;
+ if (strlen(pkt_dev->src_min) == 0) {
+
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(pkt_dev->odev);
+ if (in_dev) {
+ if (in_dev->ifa_list) {
+ pkt_dev->saddr_min =
+ in_dev->ifa_list->ifa_address;
+ pkt_dev->saddr_max = pkt_dev->saddr_min;
+ }
+ }
+ rcu_read_unlock();
+ } else {
+ pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+ pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+ }
+
+ pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+ pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+ }
+ /* Initialize current values. */
+ pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+ if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+ pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
+ pkt_dev->cur_dst_mac_offset = 0;
+ pkt_dev->cur_src_mac_offset = 0;
+ pkt_dev->cur_saddr = pkt_dev->saddr_min;
+ pkt_dev->cur_daddr = pkt_dev->daddr_min;
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ pkt_dev->nflows = 0;
+}
+
+
+static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
+{
+ ktime_t start_time, end_time;
+ s64 remaining;
+ struct hrtimer_sleeper t;
+
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_set_expires(&t.timer, spin_until);
+
+ remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
+ if (remaining <= 0) {
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+ return;
+ }
+
+ start_time = ktime_get();
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_get();
+ } while (ktime_compare(end_time, spin_until) < 0);
+ } else {
+ /* see do_nanosleep */
+ hrtimer_init_sleeper(&t, current);
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ } while (t.task && pkt_dev->running && !signal_pending(current));
+ __set_current_state(TASK_RUNNING);
+ end_time = ktime_get();
+ }
+
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+ pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+}
+
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->pkt_overhead = 0;
+ pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
+ pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
+ pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
+}
+
+static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow)
+{
+ return !!(pkt_dev->flows[flow].flags & F_INIT);
+}
+
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+ int flow = pkt_dev->curfl;
+
+ if (pkt_dev->flags & F_FLOW_SEQ) {
+ if (pkt_dev->flows[flow].count >= pkt_dev->lflow) {
+ /* reset time */
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ pkt_dev->curfl += 1;
+ if (pkt_dev->curfl >= pkt_dev->cflows)
+ pkt_dev->curfl = 0; /*reset */
+ }
+ } else {
+ flow = prandom_u32() % pkt_dev->cflows;
+ pkt_dev->curfl = flow;
+
+ if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
+ pkt_dev->flows[flow].count = 0;
+ pkt_dev->flows[flow].flags = 0;
+ }
+ }
+
+ return pkt_dev->curfl;
+}
+
+
+#ifdef CONFIG_XFRM
+/* If there was already an IPSEC SA, we keep it as is, else
+ * we go look for it ...
+*/
+#define DUMMY_MARK 0
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+ struct xfrm_state *x = pkt_dev->flows[flow].x;
+ struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+ if (!x) {
+
+ if (pkt_dev->spi) {
+ /* We need as quick as possible to find the right SA
+ * Searching with minimum criteria to archieve this.
+ */
+ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
+ } else {
+ /* slow path: we dont already have xfrm_state */
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ (xfrm_address_t *)&pkt_dev->cur_daddr,
+ (xfrm_address_t *)&pkt_dev->cur_saddr,
+ AF_INET,
+ pkt_dev->ipsmode,
+ pkt_dev->ipsproto, 0);
+ }
+ if (x) {
+ pkt_dev->flows[flow].x = x;
+ set_pkt_overhead(pkt_dev);
+ pkt_dev->pkt_overhead += x->props.header_len;
+ }
+
+ }
+}
+#endif
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+
+ if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+ pkt_dev->cur_queue_map = smp_processor_id();
+
+ else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
+ __u16 t;
+ if (pkt_dev->flags & F_QUEUE_MAP_RND) {
+ t = prandom_u32() %
+ (pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1)
+ + pkt_dev->queue_map_min;
+ } else {
+ t = pkt_dev->cur_queue_map + 1;
+ if (t > pkt_dev->queue_map_max)
+ t = pkt_dev->queue_map_min;
+ }
+ pkt_dev->cur_queue_map = t;
+ }
+ pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues;
+}
+
+/* Increment/randomize headers according to flags and current values
+ * for IP src/dest, UDP src/dst port, MAC-Addr src/dst
+ */
+static void mod_cur_headers(struct pktgen_dev *pkt_dev)
+{
+ __u32 imn;
+ __u32 imx;
+ int flow = 0;
+
+ if (pkt_dev->cflows)
+ flow = f_pick(pkt_dev);
+
+ /* Deal with source MAC */
+ if (pkt_dev->src_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACSRC_RND)
+ mc = prandom_u32() % pkt_dev->src_mac_count;
+ else {
+ mc = pkt_dev->cur_src_mac_offset++;
+ if (pkt_dev->cur_src_mac_offset >=
+ pkt_dev->src_mac_count)
+ pkt_dev->cur_src_mac_offset = 0;
+ }
+
+ tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[11] = tmp;
+ tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[10] = tmp;
+ tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[9] = tmp;
+ tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[8] = tmp;
+ tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
+ pkt_dev->hh[7] = tmp;
+ }
+
+ /* Deal with Destination MAC */
+ if (pkt_dev->dst_mac_count > 1) {
+ __u32 mc;
+ __u32 tmp;
+
+ if (pkt_dev->flags & F_MACDST_RND)
+ mc = prandom_u32() % pkt_dev->dst_mac_count;
+
+ else {
+ mc = pkt_dev->cur_dst_mac_offset++;
+ if (pkt_dev->cur_dst_mac_offset >=
+ pkt_dev->dst_mac_count) {
+ pkt_dev->cur_dst_mac_offset = 0;
+ }
+ }
+
+ tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
+ pkt_dev->hh[5] = tmp;
+ tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[4] = tmp;
+ tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[3] = tmp;
+ tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+ pkt_dev->hh[2] = tmp;
+ tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
+ pkt_dev->hh[1] = tmp;
+ }
+
+ if (pkt_dev->flags & F_MPLS_RND) {
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
+ pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
+ ((__force __be32)prandom_u32() &
+ htonl(0x000fffff));
+ }
+
+ if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
+ pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
+ pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ }
+
+ if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
+ if (pkt_dev->flags & F_UDPSRC_RND)
+ pkt_dev->cur_udp_src = prandom_u32() %
+ (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+ + pkt_dev->udp_src_min;
+
+ else {
+ pkt_dev->cur_udp_src++;
+ if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
+ pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+ }
+ }
+
+ if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
+ if (pkt_dev->flags & F_UDPDST_RND) {
+ pkt_dev->cur_udp_dst = prandom_u32() %
+ (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+ + pkt_dev->udp_dst_min;
+ } else {
+ pkt_dev->cur_udp_dst++;
+ if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
+ pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+ }
+ }
+
+ if (!(pkt_dev->flags & F_IPV6)) {
+
+ imn = ntohl(pkt_dev->saddr_min);
+ imx = ntohl(pkt_dev->saddr_max);
+ if (imn < imx) {
+ __u32 t;
+ if (pkt_dev->flags & F_IPSRC_RND)
+ t = prandom_u32() % (imx - imn) + imn;
+ else {
+ t = ntohl(pkt_dev->cur_saddr);
+ t++;
+ if (t > imx)
+ t = imn;
+
+ }
+ pkt_dev->cur_saddr = htonl(t);
+ }
+
+ if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
+ pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
+ } else {
+ imn = ntohl(pkt_dev->daddr_min);
+ imx = ntohl(pkt_dev->daddr_max);
+ if (imn < imx) {
+ __u32 t;
+ __be32 s;
+ if (pkt_dev->flags & F_IPDST_RND) {
+
+ do {
+ t = prandom_u32() %
+ (imx - imn) + imn;
+ s = htonl(t);
+ } while (ipv4_is_loopback(s) ||
+ ipv4_is_multicast(s) ||
+ ipv4_is_lbcast(s) ||
+ ipv4_is_zeronet(s) ||
+ ipv4_is_local_multicast(s));
+ pkt_dev->cur_daddr = s;
+ } else {
+ t = ntohl(pkt_dev->cur_daddr);
+ t++;
+ if (t > imx) {
+ t = imn;
+ }
+ pkt_dev->cur_daddr = htonl(t);
+ }
+ }
+ if (pkt_dev->cflows) {
+ pkt_dev->flows[flow].flags |= F_INIT;
+ pkt_dev->flows[flow].cur_daddr =
+ pkt_dev->cur_daddr;
+#ifdef CONFIG_XFRM
+ if (pkt_dev->flags & F_IPSEC_ON)
+ get_ipsec_sa(pkt_dev, flow);
+#endif
+ pkt_dev->nflows++;
+ }
+ }
+ } else { /* IPV6 * */
+
+ if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) {
+ int i;
+
+ /* Only random destinations yet */
+
+ for (i = 0; i < 4; i++) {
+ pkt_dev->cur_in6_daddr.s6_addr32[i] =
+ (((__force __be32)prandom_u32() |
+ pkt_dev->min_in6_daddr.s6_addr32[i]) &
+ pkt_dev->max_in6_daddr.s6_addr32[i]);
+ }
+ }
+ }
+
+ if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
+ __u32 t;
+ if (pkt_dev->flags & F_TXSIZE_RND) {
+ t = prandom_u32() %
+ (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+ + pkt_dev->min_pkt_size;
+ } else {
+ t = pkt_dev->cur_pkt_size + 1;
+ if (t > pkt_dev->max_pkt_size)
+ t = pkt_dev->min_pkt_size;
+ }
+ pkt_dev->cur_pkt_size = t;
+ }
+
+ set_cur_queue_map(pkt_dev);
+
+ pkt_dev->flows[flow].count++;
+}
+
+
+#ifdef CONFIG_XFRM
+static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
+
+ [RTAX_HOPLIMIT] = 0x5, /* Set a static hoplimit */
+};
+
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
+{
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int err = 0;
+ struct net *net = dev_net(pkt_dev->odev);
+
+ if (!x)
+ return 0;
+ /* XXX: we dont support tunnel mode for now until
+ * we resolve the dst issue */
+ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
+ return 0;
+
+ /* But when user specify an valid SPI, transformation
+ * supports both transport/tunnel mode + ESP/AH type.
+ */
+ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0))
+ skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF;
+
+ rcu_read_lock_bh();
+ err = x->outer_mode->output(x, skb);
+ rcu_read_unlock_bh();
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+ goto error;
+ }
+ err = x->type->output(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ goto error;
+ }
+ spin_lock_bh(&x->lock);
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+ spin_unlock_bh(&x->lock);
+error:
+ return err;
+}
+
+static void free_SAs(struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->cflows) {
+ /* let go of the SAs if we have them */
+ int i;
+ for (i = 0; i < pkt_dev->cflows; i++) {
+ struct xfrm_state *x = pkt_dev->flows[i].x;
+ if (x) {
+ xfrm_state_put(x);
+ pkt_dev->flows[i].x = NULL;
+ }
+ }
+ }
+}
+
+static int process_ipsec(struct pktgen_dev *pkt_dev,
+ struct sk_buff *skb, __be16 protocol)
+{
+ if (pkt_dev->flags & F_IPSEC_ON) {
+ struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+ int nhead = 0;
+ if (x) {
+ int ret;
+ __u8 *eth;
+ struct iphdr *iph;
+
+ nhead = x->props.header_len - skb_headroom(skb);
+ if (nhead > 0) {
+ ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ pr_err("Error expanding ipsec packet %d\n",
+ ret);
+ goto err;
+ }
+ }
+
+ /* ipsec is not expecting ll header */
+ skb_pull(skb, ETH_HLEN);
+ ret = pktgen_output_ipsec(skb, pkt_dev);
+ if (ret) {
+ pr_err("Error creating ipsec packet %d\n", ret);
+ goto err;
+ }
+ /* restore ll */
+ eth = (__u8 *) skb_push(skb, ETH_HLEN);
+ memcpy(eth, pkt_dev->hh, 12);
+ *(u16 *) &eth[12] = protocol;
+
+ /* Update IPv4 header len as well as checksum value */
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len - ETH_HLEN);
+ ip_send_check(iph);
+ }
+ }
+ return 1;
+err:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
+{
+ unsigned int i;
+ for (i = 0; i < pkt_dev->nr_labels; i++)
+ *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
+
+ mpls--;
+ *mpls |= MPLS_STACK_BOTTOM;
+}
+
+static inline __be16 build_tci(unsigned int id, unsigned int cfi,
+ unsigned int prio)
+{
+ return htons(id | (cfi << 12) | (prio << 13));
+}
+
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+ int datalen)
+{
+ struct timeval timestamp;
+ struct pktgen_hdr *pgh;
+
+ pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+ datalen -= sizeof(*pgh);
+
+ if (pkt_dev->nfrags <= 0) {
+ memset(skb_put(skb, datalen), 0, datalen);
+ } else {
+ int frags = pkt_dev->nfrags;
+ int i, len;
+ int frag_len;
+
+
+ if (frags > MAX_SKB_FRAGS)
+ frags = MAX_SKB_FRAGS;
+ len = datalen - frags * PAGE_SIZE;
+ if (len > 0) {
+ memset(skb_put(skb, len), 0, len);
+ datalen = frags * PAGE_SIZE;
+ }
+
+ i = 0;
+ frag_len = (datalen/frags) < PAGE_SIZE ?
+ (datalen/frags) : PAGE_SIZE;
+ while (datalen > 0) {
+ if (unlikely(!pkt_dev->page)) {
+ int node = numa_node_id();
+
+ if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+ node = pkt_dev->node;
+ pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pkt_dev->page)
+ break;
+ }
+ get_page(pkt_dev->page);
+ skb_frag_set_page(skb, i, pkt_dev->page);
+ skb_shinfo(skb)->frags[i].page_offset = 0;
+ /*last fragment, fill rest of data*/
+ if (i == (frags - 1))
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i],
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ else
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ i++;
+ skb_shinfo(skb)->nr_frags = i;
+ }
+ }
+
+ /* Stamp the time, and sequence number,
+ * convert them to network byte order
+ */
+ pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+ pgh->seq_num = htonl(pkt_dev->seq_num);
+
+ if (pkt_dev->flags & F_NO_TIMESTAMP) {
+ pgh->tv_sec = 0;
+ pgh->tv_usec = 0;
+ } else {
+ do_gettimeofday(&timestamp);
+ pgh->tv_sec = htonl(timestamp.tv_sec);
+ pgh->tv_usec = htonl(timestamp.tv_usec);
+ }
+}
+
+static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
+ struct pktgen_dev *pkt_dev,
+ unsigned int extralen)
+{
+ struct sk_buff *skb = NULL;
+ unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
+ pkt_dev->pkt_overhead;
+
+ if (pkt_dev->flags & F_NODE) {
+ int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
+
+ skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node);
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+ } else {
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ struct sk_buff *skb = NULL;
+ __u8 *eth;
+ struct udphdr *udph;
+ int datalen, iplen;
+ struct iphdr *iph;
+ __be16 protocol = htons(ETH_P_IP);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
+
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
+
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
+
+ /* Update any of the values, used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+
+ datalen = (odev->hard_header_len + 16) & ~0xf;
+
+ skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
+ if (!skb) {
+ sprintf(pkt_dev->result, "No memory");
+ return NULL;
+ }
+
+ prefetchw(skb->data);
+ skb_reserve(skb, datalen);
+
+ /* Reserve for ethernet and IP header */
+ eth = (__u8 *) skb_push(skb, 14);
+ mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
+ }
+ vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IP);
+ }
+
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
+
+ memcpy(eth, pkt_dev->hh, 12);
+ *(__be16 *) & eth[12] = protocol;
+
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
+ pkt_dev->pkt_overhead;
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
+ datalen = sizeof(struct pktgen_hdr);
+
+ udph->source = htons(pkt_dev->cur_udp_src);
+ udph->dest = htons(pkt_dev->cur_udp_dst);
+ udph->len = htons(datalen + 8); /* DATA + udphdr */
+ udph->check = 0;
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->ttl = 32;
+ iph->tos = pkt_dev->tos;
+ iph->protocol = IPPROTO_UDP; /* UDP */
+ iph->saddr = pkt_dev->cur_saddr;
+ iph->daddr = pkt_dev->cur_daddr;
+ iph->id = htons(pkt_dev->ip_id);
+ pkt_dev->ip_id++;
+ iph->frag_off = 0;
+ iplen = 20 + 8 + datalen;
+ iph->tot_len = htons(iplen);
+ ip_send_check(iph);
+ skb->protocol = protocol;
+ skb->dev = odev;
+ skb->pkt_type = PACKET_HOST;
+
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+ udp4_hwcsum(skb, iph->saddr, iph->daddr);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), datalen + 8, 0);
+
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen + 8, IPPROTO_UDP, csum);
+
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+#ifdef CONFIG_XFRM
+ if (!process_ipsec(pkt_dev, skb, protocol))
+ return NULL;
+#endif
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ struct sk_buff *skb = NULL;
+ __u8 *eth;
+ struct udphdr *udph;
+ int datalen, udplen;
+ struct ipv6hdr *iph;
+ __be16 protocol = htons(ETH_P_IPV6);
+ __be32 *mpls;
+ __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */
+ __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */
+ __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */
+ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+ u16 queue_map;
+
+ if (pkt_dev->nr_labels)
+ protocol = htons(ETH_P_MPLS_UC);
+
+ if (pkt_dev->vlan_id != 0xffff)
+ protocol = htons(ETH_P_8021Q);
+
+ /* Update any of the values, used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+ queue_map = pkt_dev->cur_queue_map;
+
+ skb = pktgen_alloc_skb(odev, pkt_dev, 16);
+ if (!skb) {
+ sprintf(pkt_dev->result, "No memory");
+ return NULL;
+ }
+
+ prefetchw(skb->data);
+ skb_reserve(skb, 16);
+
+ /* Reserve for ethernet and IP header */
+ eth = (__u8 *) skb_push(skb, 14);
+ mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+ if (pkt_dev->nr_labels)
+ mpls_push(mpls, pkt_dev);
+
+ if (pkt_dev->vlan_id != 0xffff) {
+ if (pkt_dev->svlan_id != 0xffff) {
+ svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_tci = build_tci(pkt_dev->svlan_id,
+ pkt_dev->svlan_cfi,
+ pkt_dev->svlan_p);
+ svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *svlan_encapsulated_proto = htons(ETH_P_8021Q);
+ }
+ vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_tci = build_tci(pkt_dev->vlan_id,
+ pkt_dev->vlan_cfi,
+ pkt_dev->vlan_p);
+ vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+ *vlan_encapsulated_proto = htons(ETH_P_IPV6);
+ }
+
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, skb->len);
+ iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
+
+ skb_set_transport_header(skb, skb->len);
+ udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+ skb_set_queue_mapping(skb, queue_map);
+ skb->priority = pkt_dev->skb_priority;
+
+ memcpy(eth, pkt_dev->hh, 12);
+ *(__be16 *) &eth[12] = protocol;
+
+ /* Eth + IPh + UDPh + mpls */
+ datalen = pkt_dev->cur_pkt_size - 14 -
+ sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
+ pkt_dev->pkt_overhead;
+
+ if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
+ datalen = sizeof(struct pktgen_hdr);
+ net_info_ratelimited("increased datalen to %d\n", datalen);
+ }
+
+ udplen = datalen + sizeof(struct udphdr);
+ udph->source = htons(pkt_dev->cur_udp_src);
+ udph->dest = htons(pkt_dev->cur_udp_dst);
+ udph->len = htons(udplen);
+ udph->check = 0;
+
+ *(__be32 *) iph = htonl(0x60000000); /* Version + flow */
+
+ if (pkt_dev->traffic_class) {
+ /* Version + traffic class + flow (0) */
+ *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20));
+ }
+
+ iph->hop_limit = 32;
+
+ iph->payload_len = htons(udplen);
+ iph->nexthdr = IPPROTO_UDP;
+
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
+
+ skb->protocol = protocol;
+ skb->dev = odev;
+ skb->pkt_type = PACKET_HOST;
+
+ pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+ if (!(pkt_dev->flags & F_UDPCSUM)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ } else if (odev->features & NETIF_F_V6_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ udph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, 0);
+ } else {
+ __wsum csum = skb_checksum(skb, skb_transport_offset(skb), udplen, 0);
+
+ /* add protocol-dependent pseudo-header */
+ udph->check = csum_ipv6_magic(&iph->saddr, &iph->daddr, udplen, IPPROTO_UDP, csum);
+
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *fill_packet(struct net_device *odev,
+ struct pktgen_dev *pkt_dev)
+{
+ if (pkt_dev->flags & F_IPV6)
+ return fill_packet_ipv6(odev, pkt_dev);
+ else
+ return fill_packet_ipv4(odev, pkt_dev);
+}
+
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
+{
+ pkt_dev->seq_num = 1;
+ pkt_dev->idle_acc = 0;
+ pkt_dev->sofar = 0;
+ pkt_dev->tx_bytes = 0;
+ pkt_dev->errors = 0;
+}
+
+/* Set up structure for sending pkts, clear counters */
+
+static void pktgen_run(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev;
+ int started = 0;
+
+ func_enter();
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+
+ /*
+ * setup odev and create initial packet.
+ */
+ pktgen_setup_inject(pkt_dev);
+
+ if (pkt_dev->odev) {
+ pktgen_clear_counters(pkt_dev);
+ pkt_dev->skb = NULL;
+ pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
+
+ set_pkt_overhead(pkt_dev);
+
+ strcpy(pkt_dev->result, "Starting");
+ pkt_dev->running = 1; /* Cranke yeself! */
+ started++;
+ } else
+ strcpy(pkt_dev->result, "Error starting");
+ }
+ rcu_read_unlock();
+ if (started)
+ t->control &= ~(T_STOP);
+}
+
+static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= T_STOP;
+
+ mutex_unlock(&pktgen_thread_lock);
+}
+
+static int thread_is_running(const struct pktgen_thread *t)
+{
+ const struct pktgen_dev *pkt_dev;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running) {
+ rcu_read_unlock();
+ return 1;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int pktgen_wait_thread_run(struct pktgen_thread *t)
+{
+ while (thread_is_running(t)) {
+
+ msleep_interruptible(100);
+
+ if (signal_pending(current))
+ goto signal;
+ }
+ return 1;
+signal:
+ return 0;
+}
+
+static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+ int sig = 1;
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list) {
+ sig = pktgen_wait_thread_run(t);
+ if (sig == 0)
+ break;
+ }
+
+ if (sig == 0)
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_STOP);
+
+ mutex_unlock(&pktgen_thread_lock);
+ return sig;
+}
+
+static void pktgen_run_all_threads(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_RUN);
+
+ mutex_unlock(&pktgen_thread_lock);
+
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+ pktgen_wait_all_threads_run(pn);
+}
+
+static void pktgen_reset_all_threads(struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+
+ func_enter();
+
+ mutex_lock(&pktgen_thread_lock);
+
+ list_for_each_entry(t, &pn->pktgen_threads, th_list)
+ t->control |= (T_REMDEVALL);
+
+ mutex_unlock(&pktgen_thread_lock);
+
+ /* Propagate thread->control */
+ schedule_timeout_interruptible(msecs_to_jiffies(125));
+
+ pktgen_wait_all_threads_run(pn);
+}
+
+static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+{
+ __u64 bps, mbps, pps;
+ char *p = pkt_dev->result;
+ ktime_t elapsed = ktime_sub(pkt_dev->stopped_at,
+ pkt_dev->started_at);
+ ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
+
+ p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
+ (unsigned long long)ktime_to_us(elapsed),
+ (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
+ (unsigned long long)ktime_to_us(idle),
+ (unsigned long long)pkt_dev->sofar,
+ pkt_dev->cur_pkt_size, nr_frags);
+
+ pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
+ ktime_to_ns(elapsed));
+
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+
+ mbps = bps;
+ do_div(mbps, 1000000);
+ p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu",
+ (unsigned long long)pps,
+ (unsigned long long)mbps,
+ (unsigned long long)bps,
+ (unsigned long long)pkt_dev->errors);
+}
+
+/* Set stopped-at timer, remove from running list, do counters & statistics */
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
+{
+ int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
+
+ if (!pkt_dev->running) {
+ pr_warn("interface: %s is already stopped\n",
+ pkt_dev->odevname);
+ return -EINVAL;
+ }
+
+ pkt_dev->running = 0;
+ kfree_skb(pkt_dev->skb);
+ pkt_dev->skb = NULL;
+ pkt_dev->stopped_at = ktime_get();
+
+ show_results(pkt_dev, nr_frags);
+
+ return 0;
+}
+
+static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev, *best = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ if (!pkt_dev->running)
+ continue;
+ if (best == NULL)
+ best = pkt_dev;
+ else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
+ best = pkt_dev;
+ }
+ rcu_read_unlock();
+
+ return best;
+}
+
+static void pktgen_stop(struct pktgen_thread *t)
+{
+ struct pktgen_dev *pkt_dev;
+
+ func_enter();
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ pktgen_stop_device(pkt_dev);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * one of our devices needs to be removed - find it
+ * and remove it
+ */
+static void pktgen_rem_one_if(struct pktgen_thread *t)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
+
+ func_enter();
+
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ if (!cur->removal_mark)
+ continue;
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
+
+ pktgen_remove_device(t, cur);
+
+ break;
+ }
+}
+
+static void pktgen_rem_all_ifs(struct pktgen_thread *t)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *cur;
+
+ func_enter();
+
+ /* Remove all devices, free mem */
+
+ list_for_each_safe(q, n, &t->if_list) {
+ cur = list_entry(q, struct pktgen_dev, list);
+
+ kfree_skb(cur->skb);
+ cur->skb = NULL;
+
+ pktgen_remove_device(t, cur);
+ }
+}
+
+static void pktgen_rem_thread(struct pktgen_thread *t)
+{
+ /* Remove from the thread list */
+ remove_proc_entry(t->tsk->comm, t->net->proc_dir);
+}
+
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
+ schedule();
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
+
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+ ktime_t idle_start = ktime_get();
+
+ while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+ if (signal_pending(current))
+ break;
+
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
+ }
+ pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
+}
+
+static void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+ unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
+ struct net_device *odev = pkt_dev->odev;
+ struct netdev_queue *txq;
+ int ret;
+
+ /* If device is offline, then don't send */
+ if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+ pktgen_stop_device(pkt_dev);
+ return;
+ }
+
+ /* This is max DELAY, this has special meaning of
+ * "never transmit"
+ */
+ if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+ pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX);
+ return;
+ }
+
+ /* If no skb or clone count exhausted then get new one */
+ if (!pkt_dev->skb || (pkt_dev->last_ok &&
+ ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ /* build a new pkt */
+ kfree_skb(pkt_dev->skb);
+
+ pkt_dev->skb = fill_packet(odev, pkt_dev);
+ if (pkt_dev->skb == NULL) {
+ pr_err("ERROR: couldn't allocate skb in fill_packet\n");
+ schedule();
+ pkt_dev->clone_count--; /* back out increment, OOM */
+ return;
+ }
+ pkt_dev->last_pkt_size = pkt_dev->skb->len;
+ pkt_dev->allocated_skbs++;
+ pkt_dev->clone_count = 0; /* reset counter */
+ }
+
+ if (pkt_dev->delay && pkt_dev->last_ok)
+ spin(pkt_dev, pkt_dev->next_tx);
+
+ txq = skb_get_tx_queue(odev, pkt_dev->skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(odev, txq, smp_processor_id());
+
+ if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
+ ret = NETDEV_TX_BUSY;
+ pkt_dev->last_ok = 0;
+ goto unlock;
+ }
+ atomic_add(burst, &pkt_dev->skb->users);
+
+xmit_more:
+ ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+
+ switch (ret) {
+ case NETDEV_TX_OK:
+ pkt_dev->last_ok = 1;
+ pkt_dev->sofar++;
+ pkt_dev->seq_num++;
+ pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+ if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
+ goto xmit_more;
+ break;
+ case NET_XMIT_DROP:
+ case NET_XMIT_CN:
+ case NET_XMIT_POLICED:
+ /* skb has been consumed */
+ pkt_dev->errors++;
+ break;
+ default: /* Drivers are not supposed to return other values! */
+ net_info_ratelimited("%s xmit error: %d\n",
+ pkt_dev->odevname, ret);
+ pkt_dev->errors++;
+ /* fallthru */
+ case NETDEV_TX_LOCKED:
+ case NETDEV_TX_BUSY:
+ /* Retry it next time */
+ atomic_dec(&(pkt_dev->skb->users));
+ pkt_dev->last_ok = 0;
+ }
+ if (unlikely(burst))
+ atomic_sub(burst, &pkt_dev->skb->users);
+unlock:
+ HARD_TX_UNLOCK(odev, txq);
+
+ local_bh_enable();
+
+ /* If pkt_dev->count is zero, then run forever */
+ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
+ pktgen_wait_for_skb(pkt_dev);
+
+ /* Done with this */
+ pktgen_stop_device(pkt_dev);
+ }
+}
+
+/*
+ * Main loop of the thread goes here
+ */
+
+static int pktgen_thread_worker(void *arg)
+{
+ DEFINE_WAIT(wait);
+ struct pktgen_thread *t = arg;
+ struct pktgen_dev *pkt_dev = NULL;
+ int cpu = t->cpu;
+
+ BUG_ON(smp_processor_id() != cpu);
+
+ init_waitqueue_head(&t->queue);
+ complete(&t->start_done);
+
+ pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
+
+ set_freezable();
+
+ __set_current_state(TASK_RUNNING);
+
+ while (!kthread_should_stop()) {
+ pkt_dev = next_to_run(t);
+
+ if (unlikely(!pkt_dev && t->control == 0)) {
+ if (t->net->pktgen_exiting)
+ break;
+ wait_event_interruptible_timeout(t->queue,
+ t->control != 0,
+ HZ/10);
+ try_to_freeze();
+ continue;
+ }
+
+ if (likely(pkt_dev)) {
+ pktgen_xmit(pkt_dev);
+
+ if (need_resched())
+ pktgen_resched(pkt_dev);
+ else
+ cpu_relax();
+ }
+
+ if (t->control & T_STOP) {
+ pktgen_stop(t);
+ t->control &= ~(T_STOP);
+ }
+
+ if (t->control & T_RUN) {
+ pktgen_run(t);
+ t->control &= ~(T_RUN);
+ }
+
+ if (t->control & T_REMDEVALL) {
+ pktgen_rem_all_ifs(t);
+ t->control &= ~(T_REMDEVALL);
+ }
+
+ if (t->control & T_REMDEV) {
+ pktgen_rem_one_if(t);
+ t->control &= ~(T_REMDEV);
+ }
+
+ try_to_freeze();
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ pr_debug("%s stopping all device\n", t->tsk->comm);
+ pktgen_stop(t);
+
+ pr_debug("%s removing all device\n", t->tsk->comm);
+ pktgen_rem_all_ifs(t);
+
+ pr_debug("%s removing thread\n", t->tsk->comm);
+ pktgen_rem_thread(t);
+
+ /* Wait for kthread_stop */
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+ const char *ifname, bool exact)
+{
+ struct pktgen_dev *p, *pkt_dev = NULL;
+ size_t len = strlen(ifname);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &t->if_list, list)
+ if (strncmp(p->odevname, ifname, len) == 0) {
+ if (p->odevname[len]) {
+ if (exact || p->odevname[len] != '@')
+ continue;
+ }
+ pkt_dev = p;
+ break;
+ }
+
+ rcu_read_unlock();
+ pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
+ return pkt_dev;
+}
+
+/*
+ * Adds a dev at front of if_list.
+ */
+
+static int add_dev_to_thread(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ int rv = 0;
+
+ /* This function cannot be called concurrently, as its called
+ * under pktgen_thread_lock mutex, but it can run from
+ * userspace on another CPU than the kthread. The if_lock()
+ * is used here to sync with concurrent instances of
+ * _rem_dev_from_if_list() invoked via kthread, which is also
+ * updating the if_list */
+ if_lock(t);
+
+ if (pkt_dev->pg_thread) {
+ pr_err("ERROR: already assigned to a thread\n");
+ rv = -EBUSY;
+ goto out;
+ }
+
+ pkt_dev->running = 0;
+ pkt_dev->pg_thread = t;
+ list_add_rcu(&pkt_dev->list, &t->if_list);
+
+out:
+ if_unlock(t);
+ return rv;
+}
+
+/* Called under thread lock */
+
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
+{
+ struct pktgen_dev *pkt_dev;
+ int err;
+ int node = cpu_to_node(t->cpu);
+
+ /* We don't allow a device to be on several threads */
+
+ pkt_dev = __pktgen_NN_threads(t->net, ifname, FIND);
+ if (pkt_dev) {
+ pr_err("ERROR: interface already used\n");
+ return -EBUSY;
+ }
+
+ pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
+ if (!pkt_dev)
+ return -ENOMEM;
+
+ strcpy(pkt_dev->odevname, ifname);
+ pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
+ node);
+ if (pkt_dev->flows == NULL) {
+ kfree(pkt_dev);
+ return -ENOMEM;
+ }
+
+ pkt_dev->removal_mark = 0;
+ pkt_dev->nfrags = 0;
+ pkt_dev->delay = pg_delay_d;
+ pkt_dev->count = pg_count_d;
+ pkt_dev->sofar = 0;
+ pkt_dev->udp_src_min = 9; /* sink port */
+ pkt_dev->udp_src_max = 9;
+ pkt_dev->udp_dst_min = 9;
+ pkt_dev->udp_dst_max = 9;
+ pkt_dev->vlan_p = 0;
+ pkt_dev->vlan_cfi = 0;
+ pkt_dev->vlan_id = 0xffff;
+ pkt_dev->svlan_p = 0;
+ pkt_dev->svlan_cfi = 0;
+ pkt_dev->svlan_id = 0xffff;
+ pkt_dev->burst = 1;
+ pkt_dev->node = -1;
+
+ err = pktgen_setup_dev(t->net, pkt_dev, ifname);
+ if (err)
+ goto out1;
+ if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+ pkt_dev->clone_skb = pg_clone_skb_d;
+
+ pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
+ &pktgen_if_fops, pkt_dev);
+ if (!pkt_dev->entry) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, ifname);
+ err = -EINVAL;
+ goto out2;
+ }
+#ifdef CONFIG_XFRM
+ pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+ pkt_dev->ipsproto = IPPROTO_ESP;
+
+ /* xfrm tunnel mode needs additional dst to extract outter
+ * ip header protocol/ttl/id field, here creat a phony one.
+ * instead of looking for a valid rt, which definitely hurting
+ * performance under such circumstance.
+ */
+ pkt_dev->dstops.family = AF_INET;
+ pkt_dev->dst.dev = pkt_dev->odev;
+ dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false);
+ pkt_dev->dst.child = &pkt_dev->dst;
+ pkt_dev->dst.ops = &pkt_dev->dstops;
+#endif
+
+ return add_dev_to_thread(t, pkt_dev);
+out2:
+ dev_put(pkt_dev->odev);
+out1:
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ kfree(pkt_dev);
+ return err;
+}
+
+static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
+{
+ struct pktgen_thread *t;
+ struct proc_dir_entry *pe;
+ struct task_struct *p;
+
+ t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!t) {
+ pr_err("ERROR: out of memory, can't create new thread\n");
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&t->if_lock);
+ t->cpu = cpu;
+
+ INIT_LIST_HEAD(&t->if_list);
+
+ list_add_tail(&t->th_list, &pn->pktgen_threads);
+ init_completion(&t->start_done);
+
+ p = kthread_create_on_node(pktgen_thread_worker,
+ t,
+ cpu_to_node(cpu),
+ "kpktgend_%d", cpu);
+ if (IS_ERR(p)) {
+ pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+ list_del(&t->th_list);
+ kfree(t);
+ return PTR_ERR(p);
+ }
+ kthread_bind(p, cpu);
+ t->tsk = p;
+
+ pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
+ &pktgen_thread_fops, t);
+ if (!pe) {
+ pr_err("cannot create %s/%s procfs entry\n",
+ PG_PROC_DIR, t->tsk->comm);
+ kthread_stop(p);
+ list_del(&t->th_list);
+ kfree(t);
+ return -EINVAL;
+ }
+
+ t->net = pn;
+ wake_up_process(p);
+ wait_for_completion(&t->start_done);
+
+ return 0;
+}
+
+/*
+ * Removes a device from the thread if_list.
+ */
+static void _rem_dev_from_if_list(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ struct list_head *q, *n;
+ struct pktgen_dev *p;
+
+ if_lock(t);
+ list_for_each_safe(q, n, &t->if_list) {
+ p = list_entry(q, struct pktgen_dev, list);
+ if (p == pkt_dev)
+ list_del_rcu(&p->list);
+ }
+ if_unlock(t);
+}
+
+static int pktgen_remove_device(struct pktgen_thread *t,
+ struct pktgen_dev *pkt_dev)
+{
+ pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
+
+ if (pkt_dev->running) {
+ pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
+ pktgen_stop_device(pkt_dev);
+ }
+
+ /* Dis-associate from the interface */
+
+ if (pkt_dev->odev) {
+ dev_put(pkt_dev->odev);
+ pkt_dev->odev = NULL;
+ }
+
+ /* Remove proc before if_list entry, because add_device uses
+ * list to determine if interface already exist, avoid race
+ * with proc_create_data() */
+ proc_remove(pkt_dev->entry);
+
+ /* And update the thread if_list */
+ _rem_dev_from_if_list(t, pkt_dev);
+
+#ifdef CONFIG_XFRM
+ free_SAs(pkt_dev);
+#endif
+ vfree(pkt_dev->flows);
+ if (pkt_dev->page)
+ put_page(pkt_dev->page);
+ kfree_rcu(pkt_dev, rcu);
+ return 0;
+}
+
+static int __net_init pg_net_init(struct net *net)
+{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
+ struct proc_dir_entry *pe;
+ int cpu, ret = 0;
+
+ pn->net = net;
+ INIT_LIST_HEAD(&pn->pktgen_threads);
+ pn->pktgen_exiting = false;
+ pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net);
+ if (!pn->proc_dir) {
+ pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
+ return -ENODEV;
+ }
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
+ if (pe == NULL) {
+ pr_err("cannot create %s procfs entry\n", PGCTRL);
+ ret = -EINVAL;
+ goto remove;
+ }
+
+ for_each_online_cpu(cpu) {
+ int err;
+
+ err = pktgen_create_thread(cpu, pn);
+ if (err)
+ pr_warn("Cannot create thread for cpu %d (%d)\n",
+ cpu, err);
+ }
+
+ if (list_empty(&pn->pktgen_threads)) {
+ pr_err("Initialization failed for all threads\n");
+ ret = -ENODEV;
+ goto remove_entry;
+ }
+
+ return 0;
+
+remove_entry:
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+remove:
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+ return ret;
+}
+
+static void __net_exit pg_net_exit(struct net *net)
+{
+ struct pktgen_net *pn = net_generic(net, pg_net_id);
+ struct pktgen_thread *t;
+ struct list_head *q, *n;
+ LIST_HEAD(list);
+
+ /* Stop all interfaces & threads */
+ pn->pktgen_exiting = true;
+
+ mutex_lock(&pktgen_thread_lock);
+ list_splice_init(&pn->pktgen_threads, &list);
+ mutex_unlock(&pktgen_thread_lock);
+
+ list_for_each_safe(q, n, &list) {
+ t = list_entry(q, struct pktgen_thread, th_list);
+ list_del(&t->th_list);
+ kthread_stop(t->tsk);
+ kfree(t);
+ }
+
+ remove_proc_entry(PGCTRL, pn->proc_dir);
+ remove_proc_entry(PG_PROC_DIR, pn->net->proc_net);
+}
+
+static struct pernet_operations pg_net_ops = {
+ .init = pg_net_init,
+ .exit = pg_net_exit,
+ .id = &pg_net_id,
+ .size = sizeof(struct pktgen_net),
+};
+
+static int __init pg_init(void)
+{
+ int ret = 0;
+
+ pr_info("%s", version);
+ ret = register_pernet_subsys(&pg_net_ops);
+ if (ret)
+ return ret;
+ ret = register_netdevice_notifier(&pktgen_notifier_block);
+ if (ret)
+ unregister_pernet_subsys(&pg_net_ops);
+
+ return ret;
+}
+
+static void __exit pg_cleanup(void)
+{
+ unregister_netdevice_notifier(&pktgen_notifier_block);
+ unregister_pernet_subsys(&pg_net_ops);
+ /* Don't need rcu_barrier() due to use of kfree_rcu() */
+}
+
+module_init(pg_init);
+module_exit(pg_cleanup);
+
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
+MODULE_DESCRIPTION("Packet Generator tool");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);
+module_param(pg_count_d, int, 0);
+MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject");
+module_param(pg_delay_d, int, 0);
+MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)");
+module_param(pg_clone_skb_d, int, 0);
+MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet");
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Enable debugging of pktgen module");
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
new file mode 100644
index 000000000..4eab4a94a
--- /dev/null
+++ b/net/core/ptp_classifier.c
@@ -0,0 +1,193 @@
+/* PTP classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+/* The below program is the bpf_asm (tools/net/) representation of
+ * the opcode array in the ptp_filter structure.
+ *
+ * For convenience, this can easily be altered and reviewed with
+ * bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
+ * simple file containing the below program:
+ *
+ * ldh [12] ; load ethertype
+ *
+ * ; PTP over UDP over IPv4 over Ethernet
+ * test_ipv4:
+ * jneq #0x800, test_ipv6 ; ETH_P_IP ?
+ * ldb [23] ; load proto
+ * jneq #17, drop_ipv4 ; IPPROTO_UDP ?
+ * ldh [20] ; load frag offset field
+ * jset #0x1fff, drop_ipv4 ; don't allow fragments
+ * ldxb 4*([14]&0xf) ; load IP header len
+ * ldh [x + 16] ; load UDP dst port
+ * jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 22] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x10 ; PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over Ethernet
+ * test_ipv6:
+ * jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
+ * ldb [20] ; load proto
+ * jneq #17, drop_ipv6 ; IPPROTO_UDP ?
+ * ldh [56] ; load UDP dst port
+ * jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [62] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x20 ; PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over 802.1Q over Ethernet
+ * test_8021q:
+ * jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
+ * ldh [16] ; load inner type
+ * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
+ * ldb [18] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [18] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * ret a ; return PTP class
+ *
+ * ; PTP over UDP over IPv4 over 802.1Q over Ethernet
+ * test_8021q_ipv4:
+ * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
+ * ldb [27] ; load proto
+ * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
+ * ldh [24] ; load frag offset field
+ * jset #0x1fff, drop_8021q_ipv4; don't allow fragments
+ * ldxb 4*([18]&0xf) ; load IP header len
+ * ldh [x + 20] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 26] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over 802.1Q over Ethernet
+ * test_8021q_ipv6:
+ * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
+ * ldb [24] ; load proto
+ * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
+ * ldh [60] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [66] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
+ * ret a ; return PTP class
+ * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over Ethernet
+ * test_ieee1588:
+ * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * ldb [14] ; load payload
+ * and #0x8 ; as we don't have ports here, test
+ * jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
+ * ldh [14] ; reload payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x30 ; PTP_CLASS_L2
+ * ret a ; return PTP class
+ * drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
+ */
+
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/ptp_classify.h>
+
+static struct bpf_prog *ptp_insns __read_mostly;
+
+unsigned int ptp_classify_raw(const struct sk_buff *skb)
+{
+ return BPF_PROG_RUN(ptp_insns, skb);
+}
+EXPORT_SYMBOL_GPL(ptp_classify_raw);
+
+void __init ptp_classifier_init(void)
+{
+ static struct sock_filter ptp_filter[] __initdata = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000014 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x0000000e },
+ { 0x48, 0, 0, 0x00000010 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x00000016 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000010 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 9, 0x000086dd },
+ { 0x30, 0, 0, 0x00000014 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x00000038 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x0000003e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000020 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 32, 0x00008100 },
+ { 0x28, 0, 0, 0x00000010 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 35, 0x00000000 },
+ { 0x28, 0, 0, 0x00000012 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000070 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x0000001b },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000018 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x00000012 },
+ { 0x48, 0, 0, 0x00000014 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x0000001a },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000050 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 8, 0x000086dd },
+ { 0x30, 0, 0, 0x00000018 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x0000003c },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x00000042 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000060 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 7, 0x000088f7 },
+ { 0x30, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x00000008 },
+ { 0x15, 0, 4, 0x00000000 },
+ { 0x28, 0, 0, 0x0000000e },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000030 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ };
+ struct sock_fprog_kern ptp_prog = {
+ .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
+ };
+
+ BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
+}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 000000000..87b22c0bc
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,206 @@
+/*
+ * NET Generic infrastructure for Network protocols.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * From code originally in include/net/tcp.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/tcp.h>
+#include <linux/vmalloc.h>
+
+#include <net/request_sock.h>
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * The minimum value of it is 128. Experiments with real servers show that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
+ * Note : Dont forget somaxconn that may limit backlog too.
+ */
+int sysctl_max_syn_backlog = 256;
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
+
+int reqsk_queue_alloc(struct request_sock_queue *queue,
+ unsigned int nr_table_entries)
+{
+ size_t lopt_size = sizeof(struct listen_sock);
+ struct listen_sock *lopt = NULL;
+
+ nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+ nr_table_entries = max_t(u32, nr_table_entries, 8);
+ nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+ lopt_size += nr_table_entries * sizeof(struct request_sock *);
+
+ if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ lopt = kzalloc(lopt_size, GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
+ if (!lopt)
+ lopt = vzalloc(lopt_size);
+ if (!lopt)
+ return -ENOMEM;
+
+ get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+ spin_lock_init(&queue->syn_wait_lock);
+ queue->rskq_accept_head = NULL;
+ lopt->nr_table_entries = nr_table_entries;
+ lopt->max_qlen_log = ilog2(nr_table_entries);
+
+ spin_lock_bh(&queue->syn_wait_lock);
+ queue->listen_opt = lopt;
+ spin_unlock_bh(&queue->syn_wait_lock);
+
+ return 0;
+}
+
+void __reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+ /* This is an error recovery path only, no locking needed */
+ kvfree(queue->listen_opt);
+}
+
+static inline struct listen_sock *reqsk_queue_yank_listen_sk(
+ struct request_sock_queue *queue)
+{
+ struct listen_sock *lopt;
+
+ spin_lock_bh(&queue->syn_wait_lock);
+ lopt = queue->listen_opt;
+ queue->listen_opt = NULL;
+ spin_unlock_bh(&queue->syn_wait_lock);
+
+ return lopt;
+}
+
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+ /* make all the listen_opt local to us */
+ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+
+ if (listen_sock_qlen(lopt) != 0) {
+ unsigned int i;
+
+ for (i = 0; i < lopt->nr_table_entries; i++) {
+ struct request_sock *req;
+
+ spin_lock_bh(&queue->syn_wait_lock);
+ while ((req = lopt->syn_table[i]) != NULL) {
+ lopt->syn_table[i] = req->dl_next;
+ atomic_inc(&lopt->qlen_dec);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ reqsk_put(req);
+ }
+ spin_unlock_bh(&queue->syn_wait_lock);
+ }
+ }
+
+ if (WARN_ON(listen_sock_qlen(lopt) != 0))
+ pr_err("qlen %u\n", listen_sock_qlen(lopt));
+ kvfree(lopt);
+}
+
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ tcp_sk(sk)->fastopen_rsk = NULL;
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
+ }
+ /* Wait for 60secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
new file mode 100644
index 000000000..8de368240
--- /dev/null
+++ b/net/core/rtnetlink.c
@@ -0,0 +1,3343 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Routing netlink socket interface: protocol independent part.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/if_addr.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/switchdev.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/fib_rules.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+
+struct rtnl_link {
+ rtnl_doit_func doit;
+ rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
+};
+
+static DEFINE_MUTEX(rtnl_mutex);
+
+void rtnl_lock(void)
+{
+ mutex_lock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock);
+
+void __rtnl_unlock(void)
+{
+ mutex_unlock(&rtnl_mutex);
+}
+
+void rtnl_unlock(void)
+{
+ /* This fellow will unlock it for us. */
+ netdev_run_todo();
+}
+EXPORT_SYMBOL(rtnl_unlock);
+
+int rtnl_trylock(void)
+{
+ return mutex_trylock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rtnl_is_held(void)
+{
+ return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
+static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+
+static inline int rtm_msgindex(int msgtype)
+{
+ int msgindex = msgtype - RTM_BASE;
+
+ /*
+ * msgindex < 0 implies someone tried to register a netlink
+ * control code. msgindex >= RTM_NR_MSGTYPES may indicate that
+ * the message type has not been added to linux/rtnetlink.h
+ */
+ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES);
+
+ return msgindex;
+}
+
+static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].doit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].doit;
+}
+
+static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].dumpit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].dumpit;
+}
+
+static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex)
+{
+ struct rtnl_link *tab;
+
+ if (protocol <= RTNL_FAMILY_MAX)
+ tab = rtnl_msg_handlers[protocol];
+ else
+ tab = NULL;
+
+ if (tab == NULL || tab[msgindex].calcit == NULL)
+ tab = rtnl_msg_handlers[PF_UNSPEC];
+
+ return tab[msgindex].calcit;
+}
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ * @calcit: Function pointer to calc size of dump message
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
+{
+ struct rtnl_link *tab;
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ tab = rtnl_msg_handlers[protocol];
+ if (tab == NULL) {
+ tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
+ if (tab == NULL)
+ return -ENOBUFS;
+
+ rtnl_msg_handlers[protocol] = tab;
+ }
+
+ if (doit)
+ tab[msgindex].doit = doit;
+
+ if (dumpit)
+ tab[msgindex].dumpit = dumpit;
+
+ if (calcit)
+ tab[msgindex].calcit = calcit;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_register);
+
+/**
+ * rtnl_register - Register a rtnetlink message type
+ *
+ * Identical to __rtnl_register() but panics on failure. This is useful
+ * as failure of this function is very unlikely, it can only happen due
+ * to lack of memory when allocating the chain to store all message
+ * handlers for a protocol. Meant for use in init functions where lack
+ * of memory implies no sense in continuing.
+ */
+void rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func doit, rtnl_dumpit_func dumpit,
+ rtnl_calcit_func calcit)
+{
+ if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
+ panic("Unable to register rtnetlink message handler, "
+ "protocol = %d, message type = %d\n",
+ protocol, msgtype);
+}
+EXPORT_SYMBOL_GPL(rtnl_register);
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_unregister(int protocol, int msgtype)
+{
+ int msgindex;
+
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+ msgindex = rtm_msgindex(msgtype);
+
+ if (rtnl_msg_handlers[protocol] == NULL)
+ return -ENOENT;
+
+ rtnl_msg_handlers[protocol][msgindex].doit = NULL;
+ rtnl_msg_handlers[protocol][msgindex].dumpit = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister);
+
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
+ * @protocol : Protocol family or PF_UNSPEC
+ *
+ * Identical to calling rtnl_unregster() for all registered message types
+ * of a certain protocol family.
+ */
+void rtnl_unregister_all(int protocol)
+{
+ BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+
+ kfree(rtnl_msg_handlers[protocol]);
+ rtnl_msg_handlers[protocol] = NULL;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+static LIST_HEAD(link_ops);
+
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+ const struct rtnl_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind))
+ return ops;
+ }
+ return NULL;
+}
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
+ /* The check for setup is here because if ops
+ * does not have that filled up, it is not possible
+ * to use the ops for creating device. So do not
+ * fill up dellink as well. That disables rtnl_dellink.
+ */
+ if (ops->setup && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ list_add_tail(&ops->list, &link_ops);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+ int err;
+
+ rtnl_lock();
+ err = __rtnl_link_register(ops);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+ struct net_device *dev;
+ LIST_HEAD(list_kill);
+
+ for_each_netdev(net, dev) {
+ if (dev->rtnl_link_ops == ops)
+ ops->dellink(dev, &list_kill);
+ }
+ unregister_netdevice_many(&list_kill);
+}
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ struct net *net;
+
+ for_each_net(net) {
+ __rtnl_kill_links(net, ops);
+ }
+ list_del(&ops->list);
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/* Return with the rtnl_lock held when there are no network
+ * devices unregistering in any network namespace.
+ */
+static void rtnl_lock_unregistering_all(void)
+{
+ struct net *net;
+ bool unregistering;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(&netdev_unregistering_wq, &wait);
+ for (;;) {
+ unregistering = false;
+ rtnl_lock();
+ for_each_net(net) {
+ if (net->dev_unreg_count > 0) {
+ unregistering = true;
+ break;
+ }
+ }
+ if (!unregistering)
+ break;
+ __rtnl_unlock();
+
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&netdev_unregistering_wq, &wait);
+}
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+ /* Close the race with cleanup_net() */
+ mutex_lock(&net_mutex);
+ rtnl_lock_unregistering_all();
+ __rtnl_link_unregister(ops);
+ rtnl_unlock();
+ mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops || !ops->get_slave_size)
+ return 0;
+ /* IFLA_INFO_SLAVE_DATA + nested data */
+ return nla_total_size(sizeof(struct nlattr)) +
+ ops->get_slave_size(master_dev, dev);
+}
+
+static size_t rtnl_link_get_size(const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ size_t size;
+
+ if (!ops)
+ return 0;
+
+ size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
+ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */
+
+ if (ops->get_size)
+ /* IFLA_INFO_DATA + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ ops->get_size(dev);
+
+ if (ops->get_xstats_size)
+ /* IFLA_INFO_XSTATS */
+ size += nla_total_size(ops->get_xstats_size(dev));
+
+ size += rtnl_link_get_slave_info_data_size(dev);
+
+ return size;
+}
+
+static LIST_HEAD(rtnl_af_ops);
+
+static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+{
+ const struct rtnl_af_ops *ops;
+
+ list_for_each_entry(ops, &rtnl_af_ops, list) {
+ if (ops->family == family)
+ return ops;
+ }
+
+ return NULL;
+}
+
+/**
+ * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ *
+ * Returns 0 on success or a negative error code.
+ */
+void rtnl_af_register(struct rtnl_af_ops *ops)
+{
+ rtnl_lock();
+ list_add_tail(&ops->list, &rtnl_af_ops);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_register);
+
+/**
+ * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+ list_del(&ops->list);
+}
+EXPORT_SYMBOL_GPL(__rtnl_af_unregister);
+
+/**
+ * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ */
+void rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+ rtnl_lock();
+ __rtnl_af_unregister(ops);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_unregister);
+
+static size_t rtnl_link_get_af_size(const struct net_device *dev)
+{
+ struct rtnl_af_ops *af_ops;
+ size_t size;
+
+ /* IFLA_AF_SPEC */
+ size = nla_total_size(sizeof(struct nlattr));
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->get_link_af_size) {
+ /* AF_* + nested data */
+ size += nla_total_size(sizeof(struct nlattr)) +
+ af_ops->get_link_af_size(dev);
+ }
+ }
+
+ return size;
+}
+
+static bool rtnl_have_link_slave_info(const struct net_device *dev)
+{
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (master_dev && master_dev->rtnl_link_ops)
+ return true;
+ return false;
+}
+
+static int rtnl_link_slave_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct net_device *master_dev;
+ const struct rtnl_link_ops *ops;
+ struct nlattr *slave_data;
+ int err;
+
+ master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+ if (!master_dev)
+ return 0;
+ ops = master_dev->rtnl_link_ops;
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_slave_info) {
+ slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA);
+ if (!slave_data)
+ return -EMSGSIZE;
+ err = ops->fill_slave_info(skb, master_dev, dev);
+ if (err < 0)
+ goto err_cancel_slave_data;
+ nla_nest_end(skb, slave_data);
+ }
+ return 0;
+
+err_cancel_slave_data:
+ nla_nest_cancel(skb, slave_data);
+ return err;
+}
+
+static int rtnl_link_info_fill(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+ struct nlattr *data;
+ int err;
+
+ if (!ops)
+ return 0;
+ if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+ return -EMSGSIZE;
+ if (ops->fill_xstats) {
+ err = ops->fill_xstats(skb, dev);
+ if (err < 0)
+ return err;
+ }
+ if (ops->fill_info) {
+ data = nla_nest_start(skb, IFLA_INFO_DATA);
+ if (data == NULL)
+ return -EMSGSIZE;
+ err = ops->fill_info(skb, dev);
+ if (err < 0)
+ goto err_cancel_data;
+ nla_nest_end(skb, data);
+ }
+ return 0;
+
+err_cancel_data:
+ nla_nest_cancel(skb, data);
+ return err;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct nlattr *linkinfo;
+ int err = -EMSGSIZE;
+
+ linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+ if (linkinfo == NULL)
+ goto out;
+
+ err = rtnl_link_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ err = rtnl_link_slave_info_fill(skb, dev);
+ if (err < 0)
+ goto err_cancel_link;
+
+ nla_nest_end(skb, linkinfo);
+ return 0;
+
+err_cancel_link:
+ nla_nest_cancel(skb, linkinfo);
+out:
+ return err;
+}
+
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
+{
+ struct sock *rtnl = net->rtnl;
+ int err = 0;
+
+ NETLINK_CB(skb).dst_group = group;
+ if (echo)
+ atomic_inc(&skb->users);
+ netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+ if (echo)
+ err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+ return err;
+}
+
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+ struct sock *rtnl = net->rtnl;
+
+ return nlmsg_unicast(rtnl, skb, pid);
+}
+EXPORT_SYMBOL(rtnl_unicast);
+
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
+{
+ struct sock *rtnl = net->rtnl;
+ int report = 0;
+
+ if (nlh)
+ report = nlmsg_report(nlh);
+
+ nlmsg_notify(rtnl, skb, pid, group, report, flags);
+}
+EXPORT_SYMBOL(rtnl_notify);
+
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+ struct sock *rtnl = net->rtnl;
+
+ netlink_set_err(rtnl, 0, group, error);
+}
+EXPORT_SYMBOL(rtnl_set_sk_err);
+
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+{
+ struct nlattr *mx;
+ int i, valid = 0;
+
+ mx = nla_nest_start(skb, RTA_METRICS);
+ if (mx == NULL)
+ return -ENOBUFS;
+
+ for (i = 0; i < RTAX_MAX; i++) {
+ if (metrics[i]) {
+ if (i == RTAX_CC_ALGO - 1) {
+ char tmp[TCP_CA_NAME_MAX], *name;
+
+ name = tcp_ca_get_name_by_key(metrics[i], tmp);
+ if (!name)
+ continue;
+ if (nla_put_string(skb, i + 1, name))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, i + 1, metrics[i]))
+ goto nla_put_failure;
+ }
+ valid++;
+ }
+ }
+
+ if (!valid) {
+ nla_nest_cancel(skb, mx);
+ return 0;
+ }
+
+ return nla_nest_end(skb, mx);
+
+nla_put_failure:
+ nla_nest_cancel(skb, mx);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rtnetlink_put_metrics);
+
+int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
+ long expires, u32 error)
+{
+ struct rta_cacheinfo ci = {
+ .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
+ .rta_used = dst->__use,
+ .rta_clntref = atomic_read(&(dst->__refcnt)),
+ .rta_error = error,
+ .rta_id = id,
+ };
+
+ if (expires) {
+ unsigned long clock;
+
+ clock = jiffies_to_clock_t(abs(expires));
+ clock = min_t(unsigned long, clock, INT_MAX);
+ ci.rta_expires = (expires > 0) ? clock : -clock;
+ }
+ return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+}
+EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+ unsigned char operstate = dev->operstate;
+
+ switch (transition) {
+ case IF_OPER_UP:
+ if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_UNKNOWN) &&
+ !netif_dormant(dev))
+ operstate = IF_OPER_UP;
+ break;
+
+ case IF_OPER_DORMANT:
+ if (operstate == IF_OPER_UP ||
+ operstate == IF_OPER_UNKNOWN)
+ operstate = IF_OPER_DORMANT;
+ break;
+ }
+
+ if (dev->operstate != operstate) {
+ write_lock_bh(&dev_base_lock);
+ dev->operstate = operstate;
+ write_unlock_bh(&dev_base_lock);
+ netdev_state_change(dev);
+ }
+}
+
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+ return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+ (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+ const struct ifinfomsg *ifm)
+{
+ unsigned int flags = ifm->ifi_flags;
+
+ /* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+ if (ifm->ifi_change)
+ flags = (flags & ifm->ifi_change) |
+ (rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
+
+ return flags;
+}
+
+static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+ const struct rtnl_link_stats64 *b)
+{
+ a->rx_packets = b->rx_packets;
+ a->tx_packets = b->tx_packets;
+ a->rx_bytes = b->rx_bytes;
+ a->tx_bytes = b->tx_bytes;
+ a->rx_errors = b->rx_errors;
+ a->tx_errors = b->tx_errors;
+ a->rx_dropped = b->rx_dropped;
+ a->tx_dropped = b->tx_dropped;
+
+ a->multicast = b->multicast;
+ a->collisions = b->collisions;
+
+ a->rx_length_errors = b->rx_length_errors;
+ a->rx_over_errors = b->rx_over_errors;
+ a->rx_crc_errors = b->rx_crc_errors;
+ a->rx_frame_errors = b->rx_frame_errors;
+ a->rx_fifo_errors = b->rx_fifo_errors;
+ a->rx_missed_errors = b->rx_missed_errors;
+
+ a->tx_aborted_errors = b->tx_aborted_errors;
+ a->tx_carrier_errors = b->tx_carrier_errors;
+ a->tx_fifo_errors = b->tx_fifo_errors;
+ a->tx_heartbeat_errors = b->tx_heartbeat_errors;
+ a->tx_window_errors = b->tx_window_errors;
+
+ a->rx_compressed = b->rx_compressed;
+ a->tx_compressed = b->tx_compressed;
+}
+
+static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
+{
+ memcpy(v, b, sizeof(*b));
+}
+
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
+ (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int num_vfs = dev_num_vf(dev->dev.parent);
+ size_t size = nla_total_size(sizeof(struct nlattr));
+ size += nla_total_size(num_vfs * sizeof(struct nlattr));
+ size += num_vfs *
+ (nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_link_state)) +
+ nla_total_size(sizeof(struct ifla_vf_rss_query_en)));
+ return size;
+ } else
+ return 0;
+}
+
+static size_t rtnl_port_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ size_t port_size = nla_total_size(4) /* PORT_VF */
+ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */
+ + nla_total_size(sizeof(struct ifla_port_vsi))
+ /* PORT_VSI_TYPE */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */
+ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */
+ + nla_total_size(1) /* PROT_VDP_REQUEST */
+ + nla_total_size(2); /* PORT_VDP_RESPONSE */
+ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));
+ size_t vf_port_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+ size_t port_self_size = nla_total_size(sizeof(struct nlattr))
+ + port_size;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+ if (dev_num_vf(dev->dev.parent))
+ return port_self_size + vf_ports_size +
+ vf_port_size * dev_num_vf(dev->dev.parent);
+ else
+ return port_self_size;
+}
+
+static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ + nla_total_size(sizeof(struct rtnl_link_ifmap))
+ + nla_total_size(sizeof(struct rtnl_link_stats))
+ + nla_total_size(sizeof(struct rtnl_link_stats64))
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ + nla_total_size(4) /* IFLA_TXQLEN */
+ + nla_total_size(4) /* IFLA_WEIGHT */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_CARRIER */
+ + nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ + nla_total_size(4) /* IFLA_LINK_NETNSID */
+ + nla_total_size(ext_filter_mask
+ & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
+}
+
+static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *vf_ports;
+ struct nlattr *vf_port;
+ int vf;
+ int err;
+
+ vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);
+ if (!vf_ports)
+ return -EMSGSIZE;
+
+ for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+ vf_port = nla_nest_start(skb, IFLA_VF_PORT);
+ if (!vf_port)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_PORT_VF, vf))
+ goto nla_put_failure;
+ err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ if (err) {
+ nla_nest_cancel(skb, vf_port);
+ continue;
+ }
+ nla_nest_end(skb, vf_port);
+ }
+
+ nla_nest_end(skb, vf_ports);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, vf_ports);
+ return -EMSGSIZE;
+}
+
+static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ struct nlattr *port_self;
+ int err;
+
+ port_self = nla_nest_start(skb, IFLA_PORT_SELF);
+ if (!port_self)
+ return -EMSGSIZE;
+
+ err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
+ if (err) {
+ nla_nest_cancel(skb, port_self);
+ return (err == -EMSGSIZE) ? err : 0;
+ }
+
+ nla_nest_end(skb, port_self);
+
+ return 0;
+}
+
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
+ u32 ext_filter_mask)
+{
+ int err;
+
+ if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
+ !(ext_filter_mask & RTEXT_FILTER_VF))
+ return 0;
+
+ err = rtnl_port_self_fill(skb, dev);
+ if (err)
+ return err;
+
+ if (dev_num_vf(dev->dev.parent)) {
+ err = rtnl_vf_ports_fill(skb, dev);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id ppid;
+
+ err = dev_get_phys_port_id(dev, &ppid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ char name[IFNAMSIZ];
+ int err;
+
+ err = dev_get_phys_port_name(dev, name, sizeof(name));
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+ int err;
+ struct netdev_phys_item_id psid;
+
+ err = netdev_switch_parent_id_get(dev, &psid);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags, u32 ext_filter_mask)
+{
+ struct ifinfomsg *ifm;
+ struct nlmsghdr *nlh;
+ struct rtnl_link_stats64 temp;
+ const struct rtnl_link_stats64 *stats;
+ struct nlattr *attr, *af_spec;
+ struct rtnl_af_ops *af_ops;
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+
+ ASSERT_RTNL();
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_UNSPEC;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = change;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ nla_put_u8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_GROUP, dev->group) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+#ifdef CONFIG_RPS
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+#endif
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
+ (upper_dev &&
+ nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
+ nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
+ (dev->qdisc &&
+ nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+ (dev->ifalias &&
+ nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_u32(skb, IFLA_CARRIER_CHANGES,
+ atomic_read(&dev->carrier_changes)))
+ goto nla_put_failure;
+
+ if (1) {
+ struct rtnl_link_ifmap map = {
+ .mem_start = dev->mem_start,
+ .mem_end = dev->mem_end,
+ .base_addr = dev->base_addr,
+ .irq = dev->irq,
+ .dma = dev->dma,
+ .port = dev->if_port,
+ };
+ if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ goto nla_put_failure;
+ }
+
+ if (dev->addr_len) {
+ if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
+ nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_phys_port_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_port_name_fill(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_phys_switch_id_fill(skb, dev))
+ goto nla_put_failure;
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (attr == NULL)
+ goto nla_put_failure;
+
+ stats = dev_get_stats(dev, &temp);
+ copy_rtnl_link_stats(nla_data(attr), stats);
+
+ attr = nla_reserve(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64));
+ if (attr == NULL)
+ goto nla_put_failure;
+ copy_rtnl_link_stats64(nla_data(attr), stats);
+
+ if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
+ nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
+ && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ int i;
+
+ struct nlattr *vfinfo, *vf;
+ int num_vfs = dev_num_vf(dev->dev.parent);
+
+ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+ if (!vfinfo)
+ goto nla_put_failure;
+ for (i = 0; i < num_vfs; i++) {
+ struct ifla_vf_info ivi;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+
+ /*
+ * Not all SR-IOV capable drivers support the
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
+ */
+ ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
+ if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
+ break;
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf =
+ vf_rss_query_en.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf) {
+ nla_nest_cancel(skb, vfinfo);
+ goto nla_put_failure;
+ }
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en))
+ goto nla_put_failure;
+ nla_nest_end(skb, vf);
+ }
+ nla_nest_end(skb, vfinfo);
+ }
+
+ if (rtnl_port_fill(skb, dev, ext_filter_mask))
+ goto nla_put_failure;
+
+ if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
+ if (rtnl_link_fill(skb, dev) < 0)
+ goto nla_put_failure;
+ }
+
+ if (dev->rtnl_link_ops &&
+ dev->rtnl_link_ops->get_link_net) {
+ struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+ if (!net_eq(dev_net(dev), link_net)) {
+ int id = peernet2id(dev_net(dev), link_net);
+
+ if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+ goto nla_put_failure;
+ }
+ }
+
+ if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
+ goto nla_put_failure;
+
+ list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+ if (af_ops->fill_link_af) {
+ struct nlattr *af;
+ int err;
+
+ if (!(af = nla_nest_start(skb, af_ops->family)))
+ goto nla_put_failure;
+
+ err = af_ops->fill_link_af(skb, dev);
+
+ /*
+ * Caller may return ENODATA to indicate that there
+ * was no data to be dumped. This is not an error, it
+ * means we should trim the attribute header and
+ * continue.
+ */
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, af);
+ else if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, af);
+ }
+ }
+
+ nla_nest_end(skb, af_spec);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+ [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) },
+ [IFLA_MTU] = { .type = NLA_U32 },
+ [IFLA_LINK] = { .type = NLA_U32 },
+ [IFLA_MASTER] = { .type = NLA_U32 },
+ [IFLA_CARRIER] = { .type = NLA_U8 },
+ [IFLA_TXQLEN] = { .type = NLA_U32 },
+ [IFLA_WEIGHT] = { .type = NLA_U32 },
+ [IFLA_OPERSTATE] = { .type = NLA_U8 },
+ [IFLA_LINKMODE] = { .type = NLA_U8 },
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_NET_NS_FD] = { .type = NLA_U32 },
+ [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
+ [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
+ [IFLA_VF_PORTS] = { .type = NLA_NESTED },
+ [IFLA_PORT_SELF] = { .type = NLA_NESTED },
+ [IFLA_AF_SPEC] = { .type = NLA_NESTED },
+ [IFLA_EXT_MASK] = { .type = NLA_U32 },
+ [IFLA_PROMISCUITY] = { .type = NLA_U32 },
+ [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
+ [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
+ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+ [IFLA_LINK_NETNSID] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+ [IFLA_INFO_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
+ [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
+ [IFLA_VF_INFO] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+ [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
+ [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
+ [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
+ [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
+ [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
+};
+
+static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
+ [IFLA_PORT_VF] = { .type = NLA_U32 },
+ [IFLA_PORT_PROFILE] = { .type = NLA_STRING,
+ .len = PORT_PROFILE_MAX },
+ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY,
+ .len = sizeof(struct ifla_port_vsi)},
+ [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING,
+ .len = PORT_UUID_MAX },
+ [IFLA_PORT_REQUEST] = { .type = NLA_U8, },
+ [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
+};
+
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct hlist_head *head;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ int err;
+ int hdrlen;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ cb->seq = net->dev_base_seq;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI,
+ ext_filter_mask);
+ /* If we ran out of room on the first message,
+ * we're in trouble
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ goto out;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+out:
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len)
+{
+ return nla_parse(tb, IFLA_MAX, head, len, ifla_policy);
+}
+EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net;
+ /* Examine the link attributes and figure out which
+ * network namespace we are talking about.
+ */
+ if (tb[IFLA_NET_NS_PID])
+ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+ else if (tb[IFLA_NET_NS_FD])
+ net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
+ else
+ net = get_net(src_net);
+ return net;
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+{
+ if (dev) {
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem, err;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
+
+ if (!(af_ops = rtnl_af_lookup(nla_type(af))))
+ return -EAFNOSUPPORT;
+
+ if (!af_ops->set_link_af)
+ return -EOPNOTSUPP;
+
+ if (af_ops->validate_link_af) {
+ err = af_ops->validate_link_af(dev, af);
+ if (err < 0)
+ return err;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+{
+ int rem, err = -EINVAL;
+ struct nlattr *vf;
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ nla_for_each_nested(vf, attr, rem) {
+ switch (nla_type(vf)) {
+ case IFLA_VF_MAC: {
+ struct ifla_vf_mac *ivm;
+ ivm = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_mac)
+ err = ops->ndo_set_vf_mac(dev, ivm->vf,
+ ivm->mac);
+ break;
+ }
+ case IFLA_VF_VLAN: {
+ struct ifla_vf_vlan *ivv;
+ ivv = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_vlan)
+ err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+ ivv->vlan,
+ ivv->qos);
+ break;
+ }
+ case IFLA_VF_TX_RATE: {
+ struct ifla_vf_tx_rate *ivt;
+ struct ifla_vf_info ivf;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_get_vf_config)
+ err = ops->ndo_get_vf_config(dev, ivt->vf,
+ &ivf);
+ if (err)
+ break;
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate,
+ ivt->rate);
+ break;
+ }
+ case IFLA_VF_RATE: {
+ struct ifla_vf_rate *ivt;
+ ivt = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rate)
+ err = ops->ndo_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate,
+ ivt->max_tx_rate);
+ break;
+ }
+ case IFLA_VF_SPOOFCHK: {
+ struct ifla_vf_spoofchk *ivs;
+ ivs = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ break;
+ }
+ case IFLA_VF_LINK_STATE: {
+ struct ifla_vf_link_state *ivl;
+ ivl = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_link_state)
+ err = ops->ndo_set_vf_link_state(dev, ivl->vf,
+ ivl->link_state);
+ break;
+ }
+ case IFLA_VF_RSS_QUERY_EN: {
+ struct ifla_vf_rss_query_en *ivrssq_en;
+
+ ivrssq_en = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_rss_query_en)
+ err = ops->ndo_set_vf_rss_query_en(dev,
+ ivrssq_en->vf,
+ ivrssq_en->setting);
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+ struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops;
+ int err;
+
+ if (upper_dev) {
+ if (upper_dev->ifindex == ifindex)
+ return 0;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_del_slave) {
+ err = ops->ndo_del_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (ifindex) {
+ upper_dev = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!upper_dev)
+ return -EINVAL;
+ ops = upper_dev->netdev_ops;
+ if (ops->ndo_add_slave) {
+ err = ops->ndo_add_slave(upper_dev, dev);
+ if (err)
+ return err;
+ } else {
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+#define DO_SETLINK_MODIFIED 0x01
+/* notify flag means notify + modified. */
+#define DO_SETLINK_NOTIFY 0x03
+static int do_setlink(const struct sk_buff *skb,
+ struct net_device *dev, struct ifinfomsg *ifm,
+ struct nlattr **tb, char *ifname, int status)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
+ struct net *net = rtnl_link_get_net(dev_net(dev), tb);
+ if (IS_ERR(net)) {
+ err = PTR_ERR(net);
+ goto errout;
+ }
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ err = -EPERM;
+ goto errout;
+ }
+ err = dev_change_net_namespace(dev, net, ifname);
+ put_net(net);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_MAP]) {
+ struct rtnl_link_ifmap *u_map;
+ struct ifmap k_map;
+
+ if (!ops->ndo_set_config) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (!netif_device_present(dev)) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ u_map = nla_data(tb[IFLA_MAP]);
+ k_map.mem_start = (unsigned long) u_map->mem_start;
+ k_map.mem_end = (unsigned long) u_map->mem_end;
+ k_map.base_addr = (unsigned short) u_map->base_addr;
+ k_map.irq = (unsigned char) u_map->irq;
+ k_map.dma = (unsigned char) u_map->dma;
+ k_map.port = (unsigned char) u_map->port;
+
+ err = ops->ndo_set_config(dev, &k_map);
+ if (err < 0)
+ goto errout;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_ADDRESS]) {
+ struct sockaddr *sa;
+ int len;
+
+ len = sizeof(sa_family_t) + dev->addr_len;
+ sa = kmalloc(len, GFP_KERNEL);
+ if (!sa) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ sa->sa_family = dev->type;
+ memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
+ dev->addr_len);
+ err = dev_set_mac_address(dev, sa);
+ kfree(sa);
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_MTU]) {
+ err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_GROUP]) {
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ /*
+ * Interface selected by interface index but interface
+ * name provided implies that a name change has been
+ * requested.
+ */
+ if (ifm->ifi_index > 0 && ifname[0]) {
+ err = dev_change_name(dev, ifname);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_IFALIAS]) {
+ err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_BROADCAST]) {
+ nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ }
+
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ goto errout;
+ }
+
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_CARRIER]) {
+ err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_MODIFIED;
+ }
+
+ if (tb[IFLA_TXQLEN]) {
+ unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
+
+ if (dev->tx_queue_len ^ value)
+ status |= DO_SETLINK_NOTIFY;
+
+ dev->tx_queue_len = value;
+ }
+
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+
+ if (tb[IFLA_LINKMODE]) {
+ unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
+
+ write_lock_bh(&dev_base_lock);
+ if (dev->link_mode ^ value)
+ status |= DO_SETLINK_NOTIFY;
+ dev->link_mode = value;
+ write_unlock_bh(&dev_base_lock);
+ }
+
+ if (tb[IFLA_VFINFO_LIST]) {
+ struct nlattr *attr;
+ int rem;
+ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_INFO) {
+ err = -EINVAL;
+ goto errout;
+ }
+ err = do_setvfinfo(dev, attr);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_VF_PORTS]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+ struct nlattr *attr;
+ int vf;
+ int rem;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_port)
+ goto errout;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
+ if (nla_type(attr) != IFLA_VF_PORT)
+ continue;
+ err = nla_parse_nested(port, IFLA_PORT_MAX,
+ attr, ifla_port_policy);
+ if (err < 0)
+ goto errout;
+ if (!port[IFLA_PORT_VF]) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ vf = nla_get_u32(port[IFLA_PORT_VF]);
+ err = ops->ndo_set_vf_port(dev, vf, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+ if (tb[IFLA_PORT_SELF]) {
+ struct nlattr *port[IFLA_PORT_MAX+1];
+
+ err = nla_parse_nested(port, IFLA_PORT_MAX,
+ tb[IFLA_PORT_SELF], ifla_port_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_port)
+ err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
+ if (err < 0)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (tb[IFLA_AF_SPEC]) {
+ struct nlattr *af;
+ int rem;
+
+ nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+ const struct rtnl_af_ops *af_ops;
+
+ if (!(af_ops = rtnl_af_lookup(nla_type(af))))
+ BUG();
+
+ err = af_ops->set_link_af(dev, af);
+ if (err < 0)
+ goto errout;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+ err = 0;
+
+errout:
+ if (status & DO_SETLINK_MODIFIED) {
+ if (status & DO_SETLINK_NOTIFY)
+ netdev_state_change(dev);
+
+ if (err < 0)
+ net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
+ dev->name);
+ }
+
+ return err;
+}
+
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ int err;
+ struct nlattr *tb[IFLA_MAX+1];
+ char ifname[IFNAMSIZ];
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ err = -EINVAL;
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ goto errout;
+
+ if (dev == NULL) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ goto errout;
+
+ err = do_setlink(skb, dev, ifm, tb, ifname, 0);
+errout:
+ return err;
+}
+
+static int rtnl_group_dellink(const struct net *net, int group)
+{
+ struct net_device *dev, *aux;
+ LIST_HEAD(list_kill);
+ bool found = false;
+
+ if (!group)
+ return -EPERM;
+
+ for_each_netdev(net, dev) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ found = true;
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+ }
+ }
+
+ if (!found)
+ return -ENODEV;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ const struct rtnl_link_ops *ops;
+
+ ops = dev->rtnl_link_ops;
+ ops->dellink(dev, &list_kill);
+ }
+ }
+ unregister_netdevice_many(&list_kill);
+
+ return 0;
+}
+
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct rtnl_link_ops *ops;
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ int err;
+ LIST_HEAD(list_kill);
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else if (tb[IFLA_GROUP])
+ return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP]));
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ return 0;
+}
+
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+{
+ unsigned int old_flags;
+ int err;
+
+ old_flags = dev->flags;
+ if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ if (err < 0)
+ return err;
+ }
+
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+
+ __dev_notify_flags(dev, old_flags, ~0U);
+ return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
+struct net_device *rtnl_create_link(struct net *net,
+ const char *ifname, unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops, struct nlattr *tb[])
+{
+ int err;
+ struct net_device *dev;
+ unsigned int num_tx_queues = 1;
+ unsigned int num_rx_queues = 1;
+
+ if (tb[IFLA_NUM_TX_QUEUES])
+ num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
+ else if (ops->get_num_tx_queues)
+ num_tx_queues = ops->get_num_tx_queues();
+
+ if (tb[IFLA_NUM_RX_QUEUES])
+ num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
+ else if (ops->get_num_rx_queues)
+ num_rx_queues = ops->get_num_rx_queues();
+
+ err = -ENOMEM;
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
+ ops->setup, num_tx_queues, num_rx_queues);
+ if (!dev)
+ goto err;
+
+ dev_net_set(dev, net);
+ dev->rtnl_link_ops = ops;
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+
+ if (tb[IFLA_MTU])
+ dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+ if (tb[IFLA_ADDRESS]) {
+ memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
+ dev->addr_assign_type = NET_ADDR_SET;
+ }
+ if (tb[IFLA_BROADCAST])
+ memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+ nla_len(tb[IFLA_BROADCAST]));
+ if (tb[IFLA_TXQLEN])
+ dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+ if (tb[IFLA_OPERSTATE])
+ set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+ if (tb[IFLA_LINKMODE])
+ dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+ if (tb[IFLA_GROUP])
+ dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+
+ return dev;
+
+err:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rtnl_create_link);
+
+static int rtnl_group_changelink(const struct sk_buff *skb,
+ struct net *net, int group,
+ struct ifinfomsg *ifm,
+ struct nlattr **tb)
+{
+ struct net_device *dev, *aux;
+ int err;
+
+ for_each_netdev_safe(net, dev, aux) {
+ if (dev->group == group) {
+ err = do_setlink(skb, dev, ifm, tb, NULL, 0);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct rtnl_link_ops *ops;
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct net_device *dev;
+ struct net_device *master_dev = NULL;
+ struct ifinfomsg *ifm;
+ char kind[MODULE_NAME_LEN];
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+ unsigned char name_assign_type = NET_NAME_USER;
+ int err;
+
+#ifdef CONFIG_MODULES
+replay:
+#endif
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else {
+ if (ifname[0])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ dev = NULL;
+ }
+
+ if (dev) {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+ }
+
+ err = validate_linkmsg(dev, tb);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_LINKINFO]) {
+ err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+ tb[IFLA_LINKINFO], ifla_info_policy);
+ if (err < 0)
+ return err;
+ } else
+ memset(linkinfo, 0, sizeof(linkinfo));
+
+ if (linkinfo[IFLA_INFO_KIND]) {
+ nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ } else {
+ kind[0] = '\0';
+ ops = NULL;
+ }
+
+ if (1) {
+ struct nlattr *attr[ops ? ops->maxtype + 1 : 1];
+ struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1];
+ struct nlattr **data = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net *dest_net, *link_net = NULL;
+
+ if (ops) {
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ err = nla_parse_nested(attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy);
+ if (err < 0)
+ return err;
+ data = attr;
+ }
+ if (ops->validate) {
+ err = ops->validate(tb, data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (m_ops) {
+ if (m_ops->slave_maxtype &&
+ linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ err = nla_parse_nested(slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy);
+ if (err < 0)
+ return err;
+ slave_data = slave_attr;
+ }
+ if (m_ops->slave_validate) {
+ err = m_ops->slave_validate(tb, slave_data);
+ if (err < 0)
+ return err;
+ }
+ }
+
+ if (dev) {
+ int status = 0;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops ||
+ !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ err = m_ops->slave_changelink(master_dev, dev,
+ tb, slave_data);
+ if (err < 0)
+ return err;
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, ifm, tb, ifname, status);
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ return rtnl_group_changelink(skb, net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, tb);
+ return -ENODEV;
+ }
+
+ if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
+
+ if (!ops) {
+#ifdef CONFIG_MODULES
+ if (kind[0]) {
+ __rtnl_unlock();
+ request_module("rtnl-link-%s", kind);
+ rtnl_lock();
+ ops = rtnl_link_ops_get(kind);
+ if (ops)
+ goto replay;
+ }
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ if (!ops->setup)
+ return -EOPNOTSUPP;
+
+ if (!ifname[0]) {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ dest_net = rtnl_link_get_net(net, tb);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
+ err = -EPERM;
+ if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN))
+ goto out;
+
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+ link_net = get_net_ns_by_id(dest_net, id);
+ if (!link_net) {
+ err = -EINVAL;
+ goto out;
+ }
+ err = -EPERM;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ }
+
+ dev = rtnl_create_link(link_net ? : dest_net, ifname,
+ name_assign_type, ops, tb);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink) {
+ err = ops->newlink(link_net ? : net, dev, tb, data);
+ /* Drivers should call free_netdev() in ->destructor
+ * and unregister it on failure after registration
+ * so that device could be finally freed in rtnl_unlock.
+ */
+ if (err < 0) {
+ /* If device is not registered at all, free it now */
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ free_netdev(dev);
+ goto out;
+ }
+ } else {
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+ }
+ err = rtnl_configure_link(dev, ifm);
+ if (err < 0)
+ goto out_unregister;
+ if (link_net) {
+ err = dev_change_net_namespace(dev, dest_net, ifname);
+ if (err < 0)
+ goto out_unregister;
+ }
+out:
+ if (link_net)
+ put_net(link_net);
+ put_net(dest_net);
+ return err;
+out_unregister:
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
+ }
+ goto out;
+ }
+}
+
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ char ifname[IFNAMSIZ];
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev = NULL;
+ struct sk_buff *nskb;
+ int err;
+ u32 ext_filter_mask = 0;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_IFNAME])
+ nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME])
+ dev = __dev_get_by_name(net, ifname);
+ else
+ return -EINVAL;
+
+ if (dev == NULL)
+ return -ENODEV;
+
+ nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+ if (nskb == NULL)
+ return -ENOBUFS;
+
+ err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(nskb);
+ } else
+ err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+
+ return err;
+}
+
+static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ u32 ext_filter_mask = 0;
+ u16 min_ifinfo_dump_size = 0;
+ int hdrlen;
+
+ /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) {
+ if (tb[IFLA_EXT_MASK])
+ ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ }
+
+ if (!ext_filter_mask)
+ return NLMSG_GOODSIZE;
+ /*
+ * traverse the list of net devices and compute the minimum
+ * buffer size based upon the filter mask.
+ */
+ list_for_each_entry(dev, &net->dev_base_head, dev_list) {
+ min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
+ if_nlmsg_size(dev,
+ ext_filter_mask));
+ }
+
+ return min_ifinfo_dump_size;
+}
+
+static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx;
+ int s_idx = cb->family;
+
+ if (s_idx == 0)
+ s_idx = 1;
+ for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
+ int type = cb->nlh->nlmsg_type-RTM_BASE;
+ if (idx < s_idx || idx == PF_PACKET)
+ continue;
+ if (rtnl_msg_handlers[idx] == NULL ||
+ rtnl_msg_handlers[idx][type].dumpit == NULL)
+ continue;
+ if (idx > s_idx) {
+ memset(&cb->args[0], 0, sizeof(cb->args));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+ }
+ if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
+ break;
+ }
+ cb->family = idx;
+
+ return skb->len;
+}
+
+struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
+ unsigned int change, gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ size_t if_info_size;
+
+ skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
+ if (skb == NULL)
+ goto errout;
+
+ err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ return skb;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return NULL;
+}
+
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+ gfp_t flags)
+{
+ struct sk_buff *skb;
+
+ if (dev->reg_state != NETREG_REGISTERED)
+ return;
+
+ skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, flags);
+}
+EXPORT_SYMBOL(rtmsg_ifinfo);
+
+static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
+ struct net_device *dev,
+ u8 *addr, u16 vid, u32 pid, u32 seq,
+ int type, unsigned int flags,
+ int nlflags)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = flags;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = NUD_PERMANENT;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+ if (vid)
+ if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t rtnl_fdb_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+}
+
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
+ 0, 0, type, NTF_SELF, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/**
+ * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
+ */
+int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid,
+ u16 flags)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (vid) {
+ pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_add_excl(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_add_excl(dev, addr);
+
+ /* Only return duplicate errors if NLM_F_EXCL is set */
+ if (err == -EEXIST && !(flags & NLM_F_EXCL))
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_add);
+
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+{
+ u16 vid = 0;
+
+ if (vlan_attr) {
+ if (nla_len(vlan_attr) != sizeof(u16)) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+ return -EINVAL;
+ }
+
+ vid = nla_get_u16(vlan_attr);
+
+ if (!vid || vid >= VLAN_VID_MASK) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
+ vid);
+ return -EINVAL;
+ }
+ }
+ *p_vid = vid;
+ return 0;
+}
+
+static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ u8 *addr;
+ u16 vid;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if ((ndm->ndm_flags & NTF_SELF)) {
+ if (dev->netdev_ops->ndo_fdb_add)
+ err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+ vid,
+ nlh->nlmsg_flags);
+ else
+ err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
+ nlh->nlmsg_flags);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+/**
+ * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
+ */
+int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ int err = -EINVAL;
+
+ /* If aging addresses are supported device will need to
+ * implement its own handler for this.
+ */
+ if (!(ndm->ndm_state & NUD_PERMANENT)) {
+ pr_info("%s: FDB only supports static addresses\n", dev->name);
+ return err;
+ }
+
+ if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+ err = dev_uc_del(dev, addr);
+ else if (is_multicast_ether_addr(addr))
+ err = dev_mc_del(dev, addr);
+
+ return err;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_del);
+
+static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ndmsg *ndm;
+ struct nlattr *tb[NDA_MAX+1];
+ struct net_device *dev;
+ int err = -EINVAL;
+ __u8 *addr;
+ u16 vid;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_ifindex == 0) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+ if (dev == NULL) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+ return -EINVAL;
+ }
+
+ addr = nla_data(tb[NDA_LLADDR]);
+
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ if (err)
+ return err;
+
+ err = -EOPNOTSUPP;
+
+ /* Support fdb on master device the net/bridge default case */
+ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
+ (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+ const struct net_device_ops *ops = br_dev->netdev_ops;
+
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
+
+ if (err)
+ goto out;
+ else
+ ndm->ndm_flags &= ~NTF_MASTER;
+ }
+
+ /* Embedded bridge, macvlan, and any other device support */
+ if (ndm->ndm_flags & NTF_SELF) {
+ if (dev->netdev_ops->ndo_fdb_del)
+ err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+ vid);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+
+ if (!err) {
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH);
+ ndm->ndm_flags &= ~NTF_SELF;
+ }
+ }
+out:
+ return err;
+}
+
+static int nlmsg_populate_fdb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int *idx,
+ struct netdev_hw_addr_list *list)
+{
+ struct netdev_hw_addr *ha;
+ int err;
+ u32 portid, seq;
+
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ list_for_each_entry(ha, &list->list, list) {
+ if (*idx < cb->args[0])
+ goto skip;
+
+ err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
+ portid, seq,
+ RTM_NEWNEIGH, NTF_SELF,
+ NLM_F_MULTI);
+ if (err < 0)
+ return err;
+skip:
+ *idx += 1;
+ }
+ return 0;
+}
+
+/**
+ * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
+ * @nlh: netlink message header
+ * @dev: netdevice
+ *
+ * Default netdevice operation to dump the existing unicast address list.
+ * Returns number of addresses from list put in skb.
+ */
+int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev,
+ int idx)
+{
+ int err;
+
+ netif_addr_lock_bh(dev);
+ err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
+ if (err)
+ goto out;
+ nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
+out:
+ netif_addr_unlock_bh(dev);
+ return idx;
+}
+EXPORT_SYMBOL(ndo_dflt_fdb_dump);
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ const struct net_device_ops *cops = NULL;
+ struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ int brport_idx = 0;
+ int br_idx = 0;
+ int idx = 0;
+
+ if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
+ ifla_policy) == 0) {
+ if (tb[IFLA_MASTER])
+ br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
+
+ brport_idx = ifm->ifi_index;
+
+ if (br_idx) {
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev)
+ return -ENODEV;
+
+ ops = br_dev->netdev_ops;
+ }
+
+ for_each_netdev(net, dev) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
+
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
+
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+
+ cops = ops;
+ }
+
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump)
+ idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ idx);
+ }
+
+ if (dev->netdev_ops->ndo_fdb_dump)
+ idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ idx);
+ else
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+
+ cops = NULL;
+ }
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
+ unsigned int attrnum, unsigned int flag)
+{
+ if (mask & flag)
+ return nla_put_u8(skb, attrnum, !!(flags & flag));
+ return 0;
+}
+
+int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode,
+ u32 flags, u32 mask, int nlflags)
+{
+ struct nlmsghdr *nlh;
+ struct ifinfomsg *ifm;
+ struct nlattr *br_afspec;
+ struct nlattr *protinfo;
+ u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifi_family = AF_BRIDGE;
+ ifm->__ifi_pad = 0;
+ ifm->ifi_type = dev->type;
+ ifm->ifi_index = dev->ifindex;
+ ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_change = 0;
+
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
+ (br_dev &&
+ nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+
+ br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!br_afspec)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+
+ if (mode != BRIDGE_MODE_UNDEF) {
+ if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
+ nla_nest_cancel(skb, br_afspec);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, br_afspec);
+
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ if (!protinfo)
+ goto nla_put_failure;
+
+ if (brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING, BR_LEARNING) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ nla_nest_cancel(skb, protinfo);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, protinfo);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+
+static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx = 0;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ u32 seq = cb->nlh->nlmsg_seq;
+ u32 filter_mask = 0;
+
+ if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
+ struct nlattr *extfilt;
+
+ extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+ IFLA_EXT_MASK);
+ if (extfilt) {
+ if (nla_len(extfilt) < sizeof(filter_mask))
+ return -EINVAL;
+
+ filter_mask = nla_get_u32(extfilt);
+ }
+ }
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ br_dev->netdev_ops->ndo_bridge_getlink(
+ skb, portid, seq, dev, filter_mask,
+ NLM_F_MULTI) < 0)
+ break;
+ idx++;
+ }
+
+ if (ops->ndo_bridge_getlink) {
+ if (idx >= cb->args[0] &&
+ ops->ndo_bridge_getlink(skb, portid, seq, dev,
+ filter_mask,
+ NLM_F_MULTI) < 0)
+ break;
+ idx++;
+ }
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static inline size_t bridge_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_MTU */
+ + nla_total_size(sizeof(u32)) /* IFLA_LINK */
+ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */
+ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */
+ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */
+ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */
+ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */
+}
+
+static int rtnl_bridge_notify(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -EOPNOTSUPP;
+
+ if (!dev->netdev_ops->ndo_bridge_getlink)
+ return 0;
+
+ skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
+ if (err < 0)
+ goto errout;
+
+ if (!skb->len)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return 0;
+errout:
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ if (err)
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ return err;
+}
+
+static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_setlink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
+ flags);
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+ struct nlattr *br_spec, *attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+ bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_family != AF_BRIDGE)
+ return -EPFNOSUPPORT;
+
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ if (!dev) {
+ pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ return -ENODEV;
+ }
+
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
+ }
+ }
+ }
+
+ if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
+ struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+
+ if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
+ if (err)
+ goto out;
+
+ flags &= ~BRIDGE_FLAGS_MASTER;
+ }
+
+ if ((flags & BRIDGE_FLAGS_SELF)) {
+ if (!dev->netdev_ops->ndo_bridge_dellink)
+ err = -EOPNOTSUPP;
+ else
+ err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh,
+ flags);
+
+ if (!err) {
+ flags &= ~BRIDGE_FLAGS_SELF;
+
+ /* Generate event to notify upper layer of bridge
+ * change
+ */
+ err = rtnl_bridge_notify(dev);
+ }
+ }
+
+ if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+out:
+ return err;
+}
+
+/* Process one rtnetlink message. */
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ rtnl_doit_func doit;
+ int sz_idx, kind;
+ int family;
+ int type;
+ int err;
+
+ type = nlh->nlmsg_type;
+ if (type > RTM_MAX)
+ return -EOPNOTSUPP;
+
+ type -= RTM_BASE;
+
+ /* All the messages must have at least 1 byte length */
+ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
+ return 0;
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+ sz_idx = type>>2;
+ kind = type&3;
+
+ if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ struct sock *rtnl;
+ rtnl_dumpit_func dumpit;
+ rtnl_calcit_func calcit;
+ u16 min_dump_alloc = 0;
+
+ dumpit = rtnl_get_dumpit(family, type);
+ if (dumpit == NULL)
+ return -EOPNOTSUPP;
+ calcit = rtnl_get_calcit(family, type);
+ if (calcit)
+ min_dump_alloc = calcit(skb, nlh);
+
+ __rtnl_unlock();
+ rtnl = net->rtnl;
+ {
+ struct netlink_dump_control c = {
+ .dump = dumpit,
+ .min_dump_alloc = min_dump_alloc,
+ };
+ err = netlink_dump_start(rtnl, skb, nlh, &c);
+ }
+ rtnl_lock();
+ return err;
+ }
+
+ doit = rtnl_get_doit(family, type);
+ if (doit == NULL)
+ return -EOPNOTSUPP;
+
+ return doit(skb, nlh);
+}
+
+static void rtnetlink_rcv(struct sk_buff *skb)
+{
+ rtnl_lock();
+ netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
+ rtnl_unlock();
+}
+
+static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_PRE_UP:
+ case NETDEV_POST_INIT:
+ case NETDEV_REGISTER:
+ case NETDEV_CHANGE:
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_UNREGISTER:
+ case NETDEV_UNREGISTER_FINAL:
+ case NETDEV_RELEASE:
+ case NETDEV_JOIN:
+ case NETDEV_BONDING_INFO:
+ break;
+ default:
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rtnetlink_dev_notifier = {
+ .notifier_call = rtnetlink_event,
+};
+
+
+static int __net_init rtnetlink_net_init(struct net *net)
+{
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .groups = RTNLGRP_MAX,
+ .input = rtnetlink_rcv,
+ .cb_mutex = &rtnl_mutex,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
+ if (!sk)
+ return -ENOMEM;
+ net->rtnl = sk;
+ return 0;
+}
+
+static void __net_exit rtnetlink_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->rtnl);
+ net->rtnl = NULL;
+}
+
+static struct pernet_operations rtnetlink_net_ops = {
+ .init = rtnetlink_net_init,
+ .exit = rtnetlink_net_exit,
+};
+
+void __init rtnetlink_init(void)
+{
+ if (register_pernet_subsys(&rtnetlink_net_ops))
+ panic("rtnetlink_init: cannot initialize rtnetlink\n");
+
+ register_netdevice_notifier(&rtnetlink_dev_notifier);
+
+ rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
+ rtnl_dump_ifinfo, rtnl_calcit);
+ rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
+
+ rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
+
+ rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
+
+ rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
+ rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
+ rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
+}
+
diff --git a/net/core/scm.c b/net/core/scm.c
new file mode 100644
index 000000000..3b6899b7d
--- /dev/null
+++ b/net/core/scm.c
@@ -0,0 +1,340 @@
+/* scm.c - Socket level control messages processing.
+ *
+ * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Alignment and value checking mods by Craig Metz
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/pid_namespace.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/compat.h>
+#include <net/scm.h>
+#include <net/cls_cgroup.h>
+
+
+/*
+ * Only allow a user to send credentials, that they could set with
+ * setu(g)id.
+ */
+
+static __inline__ int scm_check_creds(struct ucred *creds)
+{
+ const struct cred *cred = current_cred();
+ kuid_t uid = make_kuid(cred->user_ns, creds->uid);
+ kgid_t gid = make_kgid(cred->user_ns, creds->gid);
+
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ if ((creds->pid == task_tgid_vnr(current) ||
+ ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
+ ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
+ uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
+ ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+ gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
+ return 0;
+ }
+ return -EPERM;
+}
+
+static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+{
+ int *fdp = (int*)CMSG_DATA(cmsg);
+ struct scm_fp_list *fpl = *fplp;
+ struct file **fpp;
+ int i, num;
+
+ num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
+
+ if (num <= 0)
+ return 0;
+
+ if (num > SCM_MAX_FD)
+ return -EINVAL;
+
+ if (!fpl)
+ {
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+ *fplp = fpl;
+ fpl->count = 0;
+ fpl->max = SCM_MAX_FD;
+ }
+ fpp = &fpl->fp[fpl->count];
+
+ if (fpl->count + num > fpl->max)
+ return -EINVAL;
+
+ /*
+ * Verify the descriptors and increment the usage count.
+ */
+
+ for (i=0; i< num; i++)
+ {
+ int fd = fdp[i];
+ struct file *file;
+
+ if (fd < 0 || !(file = fget_raw(fd)))
+ return -EBADF;
+ *fpp++ = file;
+ fpl->count++;
+ }
+ return num;
+}
+
+void __scm_destroy(struct scm_cookie *scm)
+{
+ struct scm_fp_list *fpl = scm->fp;
+ int i;
+
+ if (fpl) {
+ scm->fp = NULL;
+ for (i=fpl->count-1; i>=0; i--)
+ fput(fpl->fp[i]);
+ kfree(fpl);
+ }
+}
+EXPORT_SYMBOL(__scm_destroy);
+
+int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
+{
+ struct cmsghdr *cmsg;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ err = -EINVAL;
+
+ /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
+ /* The first check was omitted in <= 2.2.5. The reasoning was
+ that parser checks cmsg_len in any case, so that
+ additional check would be work duplication.
+ But if cmsg_level is not SOL_SOCKET, we do not check
+ for too short ancillary data object at all! Oops.
+ OK, let's add it...
+ */
+ if (!CMSG_OK(msg, cmsg))
+ goto error;
+
+ if (cmsg->cmsg_level != SOL_SOCKET)
+ continue;
+
+ switch (cmsg->cmsg_type)
+ {
+ case SCM_RIGHTS:
+ if (!sock->ops || sock->ops->family != PF_UNIX)
+ goto error;
+ err=scm_fp_copy(cmsg, &p->fp);
+ if (err<0)
+ goto error;
+ break;
+ case SCM_CREDENTIALS:
+ {
+ struct ucred creds;
+ kuid_t uid;
+ kgid_t gid;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+ goto error;
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+ err = scm_check_creds(&creds);
+ if (err)
+ goto error;
+
+ p->creds.pid = creds.pid;
+ if (!p->pid || pid_vnr(p->pid) != creds.pid) {
+ struct pid *pid;
+ err = -ESRCH;
+ pid = find_get_pid(creds.pid);
+ if (!pid)
+ goto error;
+ put_pid(p->pid);
+ p->pid = pid;
+ }
+
+ err = -EINVAL;
+ uid = make_kuid(current_user_ns(), creds.uid);
+ gid = make_kgid(current_user_ns(), creds.gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ goto error;
+
+ p->creds.uid = uid;
+ p->creds.gid = gid;
+ break;
+ }
+ default:
+ goto error;
+ }
+ }
+
+ if (p->fp && !p->fp->count)
+ {
+ kfree(p->fp);
+ p->fp = NULL;
+ }
+ return 0;
+
+error:
+ scm_destroy(p);
+ return err;
+}
+EXPORT_SYMBOL(__scm_send);
+
+int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
+{
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user *)msg->msg_control;
+ struct cmsghdr cmhdr;
+ int cmlen = CMSG_LEN(len);
+ int err;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags)
+ return put_cmsg_compat(msg, level, type, len, data);
+
+ if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return 0; /* XXX: return error? check spec. */
+ }
+ if (msg->msg_controllen < cmlen) {
+ msg->msg_flags |= MSG_CTRUNC;
+ cmlen = msg->msg_controllen;
+ }
+ cmhdr.cmsg_level = level;
+ cmhdr.cmsg_type = type;
+ cmhdr.cmsg_len = cmlen;
+
+ err = -EFAULT;
+ if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
+ goto out;
+ if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
+ goto out;
+ cmlen = CMSG_SPACE(len);
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ err = 0;
+out:
+ return err;
+}
+EXPORT_SYMBOL(put_cmsg);
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr __user *cm
+ = (__force struct cmsghdr __user*)msg->msg_control;
+
+ int fdmax = 0;
+ int fdnum = scm->fp->count;
+ struct file **fp = scm->fp->fp;
+ int __user *cmfptr;
+ int err = 0, i;
+
+ if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ scm_detach_fds_compat(msg, scm);
+ return;
+ }
+
+ if (msg->msg_controllen > sizeof(struct cmsghdr))
+ fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+ / sizeof(int));
+
+ if (fdnum < fdmax)
+ fdmax = fdnum;
+
+ for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
+ i++, cmfptr++)
+ {
+ struct socket *sock;
+ int new_fd;
+ err = security_file_receive(fp[i]);
+ if (err)
+ break;
+ err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+ ? O_CLOEXEC : 0);
+ if (err < 0)
+ break;
+ new_fd = err;
+ err = put_user(new_fd, cmfptr);
+ if (err) {
+ put_unused_fd(new_fd);
+ break;
+ }
+ /* Bump the usage count and install the file. */
+ sock = sock_from_file(fp[i], &err);
+ if (sock) {
+ sock_update_netprioidx(sock->sk);
+ sock_update_classid(sock->sk);
+ }
+ fd_install(new_fd, get_file(fp[i]));
+ }
+
+ if (i > 0)
+ {
+ int cmlen = CMSG_LEN(i*sizeof(int));
+ err = put_user(SOL_SOCKET, &cm->cmsg_level);
+ if (!err)
+ err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+ if (!err)
+ err = put_user(cmlen, &cm->cmsg_len);
+ if (!err) {
+ cmlen = CMSG_SPACE(i*sizeof(int));
+ msg->msg_control += cmlen;
+ msg->msg_controllen -= cmlen;
+ }
+ }
+ if (i < fdnum || (fdnum && fdmax <= 0))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ /*
+ * All of the files that fit in the message have had their
+ * usage counts incremented, so we just free the list.
+ */
+ __scm_destroy(scm);
+}
+EXPORT_SYMBOL(scm_detach_fds);
+
+struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+{
+ struct scm_fp_list *new_fpl;
+ int i;
+
+ if (!fpl)
+ return NULL;
+
+ new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+ GFP_KERNEL);
+ if (new_fpl) {
+ for (i = 0; i < fpl->count; i++)
+ get_file(fpl->fp[i]);
+ new_fpl->max = new_fpl->count;
+ }
+ return new_fpl;
+}
+EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 000000000..d0c430921
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,273 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cryptohash.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <net/tcp.h>
+#include <net/secure_seq.h>
+
+#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
+
+static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+
+static __always_inline void net_secret_init(void)
+{
+ net_get_random_once(net_secret, sizeof(net_secret));
+}
+#endif
+
+#ifdef CONFIG_INET
+static u32 seq_scale(u32 seq)
+{
+ /*
+ * As close as possible to RFC 793, which
+ * suggests using a 250 kHz clock.
+ * Further reading shows this assumes 2 Mb/s networks.
+ * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ * we also need to limit the resolution so that the u32 seq
+ * overlaps less than one time per MSL (2 minutes).
+ * Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+ return seq + (ktime_get_real_ns() >> 6);
+}
+#endif
+
+#ifdef CONFIG_TCP_STEALTH
+u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr,
+ u32 daddr_size, __be16 dport)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *md5;
+
+ __u32 sec[MD5_MESSAGE_BYTES / sizeof(__u32)];
+ __u32 i;
+ __u32 tsval = 0;
+
+ __be32 iv[MD5_DIGEST_WORDS] = { 0 };
+ __be32 isn;
+
+ memcpy(iv, daddr, (daddr_size > sizeof(iv)) ? sizeof(iv) : daddr_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+ md5 = tp->af_specific->md5_lookup(sk, sk);
+#else
+ md5 = NULL;
+#endif
+ if (likely(sysctl_tcp_timestamps && !md5) || tp->stealth.saw_tsval)
+ tsval = tp->stealth.mstamp.stamp_jiffies;
+
+ ((__be16 *)iv)[2] ^= cpu_to_be16(tp->stealth.integrity_hash);
+ iv[2] ^= cpu_to_be32(tsval);
+ ((__be16 *)iv)[6] ^= dport;
+
+ for (i = 0; i < MD5_DIGEST_WORDS; i++)
+ iv[i] = le32_to_cpu(iv[i]);
+ for (i = 0; i < MD5_MESSAGE_BYTES / sizeof(__le32); i++)
+ sec[i] = le32_to_cpu(((__le32 *)tp->stealth.secret)[i]);
+
+ md5_transform(iv, sec);
+
+ isn = cpu_to_be32(iv[0]) ^ cpu_to_be32(iv[1]) ^
+ cpu_to_be32(iv[2]) ^ cpu_to_be32(iv[3]);
+
+ if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY)
+ be32_isn_to_be16_ih(isn) =
+ cpu_to_be16(tp->stealth.integrity_hash);
+
+ return be32_to_cpu(isn);
+}
+EXPORT_SYMBOL(tcp_stealth_sequence_number);
+
+u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th = tcp_hdr(skb);
+ __be32 isn = th->seq;
+ __be32 hash;
+ __be32 *daddr;
+ u32 daddr_size;
+
+ tp->stealth.saw_tsval =
+ tcp_parse_tsval_option(&tp->stealth.mstamp.stamp_jiffies, th);
+
+ if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN)
+ tp->stealth.integrity_hash =
+ be16_to_cpu(be32_isn_to_be16_ih(isn));
+
+ switch (tp->inet_conn.icsk_inet.sk.sk_family) {
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6:
+ daddr_size = sizeof(ipv6_hdr(skb)->daddr.s6_addr32);
+ daddr = ipv6_hdr(skb)->daddr.s6_addr32;
+ break;
+#endif
+ case PF_INET:
+ daddr_size = sizeof(ip_hdr(skb)->daddr);
+ daddr = &ip_hdr(skb)->daddr;
+ break;
+ default:
+ pr_err("TCP Stealth: Unknown network layer protocol, stop!\n");
+ return 1;
+ }
+
+ hash = tcp_stealth_sequence_number(sk, daddr, daddr_size, th->dest);
+ cpu_to_be32s(&hash);
+
+ if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+ tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN &&
+ be32_isn_to_be16_av(isn) == be32_isn_to_be16_av(hash))
+ return 0;
+
+ if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+ !(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+ isn == hash)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL(tcp_stealth_do_auth);
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32)daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return seq_scale(hash[0]);
+}
+EXPORT_SYMBOL(secure_tcpv6_sequence_number);
+
+u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+ __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + (__force u32) daddr[i];
+ secret[4] = net_secret[4] + (__force u32)dport;
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
+#endif
+
+#ifdef CONFIG_INET
+
+__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return seq_scale(hash[0]);
+}
+
+u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = (__force u32)dport ^ net_secret[14];
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ return hash[0];
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
+
+#if IS_ENABLED(CONFIG_IP_DCCP)
+u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+
+ net_secret_init();
+ hash[0] = (__force u32)saddr;
+ hash[1] = (__force u32)daddr;
+ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
+ hash[3] = net_secret[15];
+
+ md5_transform(hash, net_secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_get_real_ns();
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+
+#if IS_ENABLED(CONFIG_IPV6)
+u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ u32 secret[MD5_MESSAGE_BYTES / 4];
+ u32 hash[MD5_DIGEST_WORDS];
+ u64 seq;
+ u32 i;
+
+ net_secret_init();
+ memcpy(hash, saddr, 16);
+ for (i = 0; i < 4; i++)
+ secret[i] = net_secret[i] + daddr[i];
+ secret[4] = net_secret[4] +
+ (((__force u16)sport << 16) + (__force u16)dport);
+ for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+ secret[i] = net_secret[i];
+
+ md5_transform(hash, secret);
+
+ seq = hash[0] | (((u64)hash[1]) << 32);
+ seq += ktime_get_real_ns();
+ seq &= (1ull << 48) - 1;
+
+ return seq;
+}
+EXPORT_SYMBOL(secure_dccpv6_sequence_number);
+#endif
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
new file mode 100644
index 000000000..41ec02242
--- /dev/null
+++ b/net/core/skbuff.c
@@ -0,0 +1,4430 @@
+/*
+ * Routines having to do with the 'struct sk_buff' memory handlers.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * Florian La Roche <rzsfl@rz.uni-sb.de>
+ *
+ * Fixes:
+ * Alan Cox : Fixed the worst of the load
+ * balancer bugs.
+ * Dave Platt : Interrupt stacking fix.
+ * Richard Kooijman : Timestamp fixes.
+ * Alan Cox : Changed buffer format.
+ * Alan Cox : destructor hook for AF_UNIX etc.
+ * Linus Torvalds : Better skb_clone.
+ * Alan Cox : Added skb_copy.
+ * Alan Cox : Added all the changed routines Linus
+ * only put in the headers
+ * Ray VanTassle : Fixed --skb->lock in free
+ * Alan Cox : skb_copy copy arp field
+ * Andi Kleen : slabified it.
+ * Robert Olsson : Removed skb_head_pool
+ *
+ * NOTE:
+ * The __skb_ routines should be called with interrupts
+ * disabled, or you better be *real* sure that the operation is atomic
+ * with respect to whatever list is being frobbed (e.g. via lock_sock()
+ * or via disabling bottom half handlers, etc).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * The functions in this file will not compile correctly with gcc 2.4.x
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_NET_CLS_ACT
+#include <net/pkt_sched.h>
+#endif
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/cache.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <linux/errqueue.h>
+#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
+
+#include <net/protocol.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+
+#include <asm/uaccess.h>
+#include <trace/events/skb.h>
+#include <linux/highmem.h>
+#include <linux/capability.h>
+#include <linux/user_namespace.h>
+
+struct kmem_cache *skbuff_head_cache __read_mostly;
+static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+
+/**
+ * skb_panic - private function for out-of-line support
+ * @skb: buffer
+ * @sz: size
+ * @addr: address
+ * @msg: skb_over_panic or skb_under_panic
+ *
+ * Out-of-line support for skb_put() and skb_push().
+ * Called via the wrapper skb_over_panic() or skb_under_panic().
+ * Keep out of line to prevent kernel bloat.
+ * __builtin_return_address is not used because it is not always reliable.
+ */
+static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
+ const char msg[])
+{
+ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ msg, addr, skb->len, sz, skb->head, skb->data,
+ (unsigned long)skb->tail, (unsigned long)skb->end,
+ skb->dev ? skb->dev->name : "<NULL>");
+ BUG();
+}
+
+static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
+{
+ skb_panic(skb, sz, addr, __func__);
+}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+
+static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
+ unsigned long ip, bool *pfmemalloc)
+{
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
+{
+ struct sk_buff *skb;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA, node);
+ if (!skb)
+ goto out;
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->head = NULL;
+ skb->truesize = sizeof(struct sk_buff);
+ atomic_set(&skb->users, 1);
+
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+out:
+ return skb;
+}
+
+/**
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
+{
+ struct kmem_cache *cache;
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
+ if (!skb)
+ goto out;
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ if (!data)
+ goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
+ prefetchw(data + size);
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ /* Account for allocated memory : skb + skb->head */
+ skb->truesize = SKB_TRUESIZE(size);
+ skb->pfmemalloc = pfmemalloc;
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
+ skb->fclone = SKB_FCLONE_ORIG;
+ atomic_set(&fclones->fclone_ref, 1);
+
+ fclones->skb2.fclone = SKB_FCLONE_CLONE;
+ fclones->skb2.pfmemalloc = pfmemalloc;
+ }
+out:
+ return skb;
+nodata:
+ kmem_cache_free(cache, skb);
+ skb = NULL;
+ goto out;
+}
+EXPORT_SYMBOL(__alloc_skb);
+
+/**
+ * __build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc() only if
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc()
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+ * Before IO, driver allocates only data buffer where NIC put incoming frame
+ * Driver should add room at head (NET_SKB_PAD) and
+ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
+ * After IO, driver calls build_skb(), to allocate sk_buff and populate it
+ * before giving packet to stack.
+ * RX rings only contains data buffers, not full skbs.
+ */
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
+{
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ unsigned int size = frag_size ? : ksize(data);
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ return skb;
+}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (virt_to_head_page(data)->pfmemalloc)
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(build_skb);
+
+struct netdev_alloc_cache {
+ struct page_frag frag;
+ /* we maintain a pagecount bias, so that we dont dirty cache line
+ * containing page->_count every time we allocate a fragment.
+ */
+ unsigned int pagecnt_bias;
+};
+static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
+
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+ gfp_t gfp_mask)
+{
+ const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+ if (order) {
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+ nc->frag.size = PAGE_SIZE << (page ? order : 0);
+ }
+
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->frag.page = page;
+
+ return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+ struct page *page = nc->frag.page;
+ unsigned int size;
+ int offset;
+
+ if (unlikely(!page)) {
+refill:
+ page = __page_frag_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ */
+ atomic_add(size - 1, &page->_count);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ nc->frag.offset = size;
+ }
+
+ offset = nc->frag.offset - fragsz;
+ if (unlikely(offset < 0)) {
+ if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ goto refill;
+
+ /* if size can vary use frag.size else just use PAGE_SIZE */
+ size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+ /* OK, page count is 0, we can safely set it */
+ atomic_set(&page->_count, size);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ offset = size - fragsz;
+ }
+
+ nc->pagecnt_bias--;
+ nc->frag.offset = offset;
+
+ return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ void *data;
+
+ local_irq_save(flags);
+ data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
+ local_irq_restore(flags);
+ return data;
+}
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(netdev_alloc_frag);
+
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+ return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
+/**
+ * __alloc_rx_skb - allocate an skbuff for rx
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case we have to fallback to __alloc_skb()
+ * If SKB_ALLOC_NAPI is set, page fragment will be allocated
+ * from napi_cache instead of netdev_cache.
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has unspecified headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
+ int flags)
+{
+ struct sk_buff *skb = NULL;
+ unsigned int fragsz = SKB_DATA_ALIGN(length) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ void *data;
+
+ if (sk_memalloc_socks())
+ gfp_mask |= __GFP_MEMALLOC;
+
+ data = (flags & SKB_ALLOC_NAPI) ?
+ __napi_alloc_frag(fragsz, gfp_mask) :
+ __netdev_alloc_frag(fragsz, gfp_mask);
+
+ if (likely(data)) {
+ skb = build_skb(data, fragsz);
+ if (unlikely(!skb))
+ put_page(virt_to_head_page(data));
+ }
+ } else {
+ skb = __alloc_skb(length, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
+ }
+ return skb;
+}
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD;
+ skb = __alloc_rx_skb(length, gfp_mask, 0);
+
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD);
+ skb->dev = dev;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb);
+
+/**
+ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ * Allocate a new sk_buff for use in NAPI receive. This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation. By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+ unsigned int length, gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+
+ length += NET_SKB_PAD + NET_IP_ALIGN;
+ skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+
+ if (likely(skb)) {
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ skb->dev = napi->dev;
+ }
+
+ return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+ int size, unsigned int truesize)
+{
+ skb_fill_page_desc(skb, i, page, off, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
+void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
+ unsigned int truesize)
+{
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ skb_frag_size_add(frag, size);
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += truesize;
+}
+EXPORT_SYMBOL(skb_coalesce_rx_frag);
+
+static void skb_drop_list(struct sk_buff **listp)
+{
+ kfree_skb_list(*listp);
+ *listp = NULL;
+}
+
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+ skb_drop_list(&skb_shinfo(skb)->frag_list);
+}
+
+static void skb_clone_fraglist(struct sk_buff *skb)
+{
+ struct sk_buff *list;
+
+ skb_walk_frags(skb, list)
+ skb_get(list);
+}
+
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
+static void skb_release_data(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ int i;
+
+ if (skb->cloned &&
+ atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+ &shinfo->dataref))
+ return;
+
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i]);
+
+ /*
+ * If skb buf is from userspace, we need to notify the caller
+ * the lower device DMA has done;
+ */
+ if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = shinfo->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, true);
+ }
+
+ if (shinfo->frag_list)
+ kfree_skb_list(shinfo->frag_list);
+
+ skb_free_head(skb);
+}
+
+/*
+ * Free an skbuff by memory without cleaning the state.
+ */
+static void kfree_skbmem(struct sk_buff *skb)
+{
+ struct sk_buff_fclones *fclones;
+
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+ return;
+
+ case SKB_FCLONE_ORIG:
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ /* We usually free the clone (TX completion) before original skb
+ * This test would have no chance to be true for the clone,
+ * while here, branch prediction will be good.
+ */
+ if (atomic_read(&fclones->fclone_ref) == 1)
+ goto fastpath;
+ break;
+
+ default: /* SKB_FCLONE_CLONE */
+ fclones = container_of(skb, struct sk_buff_fclones, skb2);
+ break;
+ }
+ if (!atomic_dec_and_test(&fclones->fclone_ref))
+ return;
+fastpath:
+ kmem_cache_free(skbuff_fclone_cache, fclones);
+}
+
+static void skb_release_head_state(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+#ifdef CONFIG_XFRM
+ secpath_put(skb->sp);
+#endif
+ if (skb->destructor) {
+ WARN_ON(in_irq());
+ skb->destructor(skb);
+ }
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ nf_conntrack_put(skb->nfct);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ nf_bridge_put(skb->nf_bridge);
+#endif
+}
+
+/* Free everything but the sk_buff shell. */
+static void skb_release_all(struct sk_buff *skb)
+{
+ skb_release_head_state(skb);
+ if (likely(skb->head))
+ skb_release_data(skb);
+}
+
+/**
+ * __kfree_skb - private function
+ * @skb: buffer
+ *
+ * Free an sk_buff. Release anything attached to the buffer.
+ * Clean the state. This is an internal helper function. Users should
+ * always call kfree_skb
+ */
+
+void __kfree_skb(struct sk_buff *skb)
+{
+ skb_release_all(skb);
+ kfree_skbmem(skb);
+}
+EXPORT_SYMBOL(__kfree_skb);
+
+/**
+ * kfree_skb - free an sk_buff
+ * @skb: buffer to free
+ *
+ * Drop a reference to the buffer and free it if the usage count has
+ * hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ trace_kfree_skb(skb, __builtin_return_address(0));
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb);
+
+void kfree_skb_list(struct sk_buff *segs)
+{
+ while (segs) {
+ struct sk_buff *next = segs->next;
+
+ kfree_skb(segs);
+ segs = next;
+ }
+}
+EXPORT_SYMBOL(kfree_skb_list);
+
+/**
+ * skb_tx_error - report an sk_buff xmit error
+ * @skb: buffer that triggered an error
+ *
+ * Report xmit error if a device callback is tracking this skb.
+ * skb must be freed afterwards.
+ */
+void skb_tx_error(struct sk_buff *skb)
+{
+ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+ struct ubuf_info *uarg;
+
+ uarg = skb_shinfo(skb)->destructor_arg;
+ if (uarg->callback)
+ uarg->callback(uarg, false);
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ }
+}
+EXPORT_SYMBOL(skb_tx_error);
+
+/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ trace_consume_skb(skb);
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
+
+/* Make sure a field is enclosed inside headers_start/headers_end section */
+#define CHECK_SKB_FIELD(field) \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
+ offsetof(struct sk_buff, headers_start)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
+ offsetof(struct sk_buff, headers_end)); \
+
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ new->tstamp = old->tstamp;
+ /* We do not copy old->sk */
+ new->dev = old->dev;
+ memcpy(new->cb, old->cb, sizeof(old->cb));
+ skb_dst_copy(new, old);
+#ifdef CONFIG_XFRM
+ new->sp = secpath_get(old->sp);
+#endif
+ __nf_copy(new, old, false);
+
+ /* Note : this field could be in headers_start/headers_end section
+ * It is not yet because we do not want to have a 16 bit hole
+ */
+ new->queue_mapping = old->queue_mapping;
+
+ memcpy(&new->headers_start, &old->headers_start,
+ offsetof(struct sk_buff, headers_end) -
+ offsetof(struct sk_buff, headers_start));
+ CHECK_SKB_FIELD(protocol);
+ CHECK_SKB_FIELD(csum);
+ CHECK_SKB_FIELD(hash);
+ CHECK_SKB_FIELD(priority);
+ CHECK_SKB_FIELD(skb_iif);
+ CHECK_SKB_FIELD(vlan_proto);
+ CHECK_SKB_FIELD(vlan_tci);
+ CHECK_SKB_FIELD(transport_header);
+ CHECK_SKB_FIELD(network_header);
+ CHECK_SKB_FIELD(mac_header);
+ CHECK_SKB_FIELD(inner_protocol);
+ CHECK_SKB_FIELD(inner_transport_header);
+ CHECK_SKB_FIELD(inner_network_header);
+ CHECK_SKB_FIELD(inner_mac_header);
+ CHECK_SKB_FIELD(mark);
+#ifdef CONFIG_NETWORK_SECMARK
+ CHECK_SKB_FIELD(secmark);
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ CHECK_SKB_FIELD(napi_id);
+#endif
+#ifdef CONFIG_XPS
+ CHECK_SKB_FIELD(sender_cpu);
+#endif
+#ifdef CONFIG_NET_SCHED
+ CHECK_SKB_FIELD(tc_index);
+#ifdef CONFIG_NET_CLS_ACT
+ CHECK_SKB_FIELD(tc_verd);
+#endif
+#endif
+
+}
+
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+#define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+ n->sk = NULL;
+ __copy_skb_header(n, skb);
+
+ C(len);
+ C(data_len);
+ C(mac_len);
+ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
+ n->cloned = 1;
+ n->nohdr = 0;
+ n->destructor = NULL;
+ C(tail);
+ C(end);
+ C(head);
+ C(head_frag);
+ C(data);
+ C(truesize);
+ atomic_set(&n->users, 1);
+
+ atomic_inc(&(skb_shinfo(skb)->dataref));
+ skb->cloned = 1;
+
+ return n;
+#undef C
+}
+
+/**
+ * skb_morph - morph one skb into another
+ * @dst: the skb to receive the contents
+ * @src: the skb to supply the contents
+ *
+ * This is identical to skb_clone except that the target skb is
+ * supplied by the user.
+ *
+ * The target skb is returned upon exit.
+ */
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
+{
+ skb_release_all(dst);
+ return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+/**
+ * skb_copy_ubufs - copy userspace skb frags buffers to kernel
+ * @skb: the skb to modify
+ * @gfp_mask: allocation priority
+ *
+ * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * It will copy all frags into kernel and drop the reference
+ * to userspace pages.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ *
+ * Returns 0 on success or a negative error code on failure
+ * to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int i;
+ int num_frags = skb_shinfo(skb)->nr_frags;
+ struct page *page, *head = NULL;
+ struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+
+ for (i = 0; i < num_frags; i++) {
+ u8 *vaddr;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+ page = alloc_page(gfp_mask);
+ if (!page) {
+ while (head) {
+ struct page *next = (struct page *)page_private(head);
+ put_page(head);
+ head = next;
+ }
+ return -ENOMEM;
+ }
+ vaddr = kmap_atomic(skb_frag_page(f));
+ memcpy(page_address(page),
+ vaddr + f->page_offset, skb_frag_size(f));
+ kunmap_atomic(vaddr);
+ set_page_private(page, (unsigned long)head);
+ head = page;
+ }
+
+ /* skb frags release userspace buffers */
+ for (i = 0; i < num_frags; i++)
+ skb_frag_unref(skb, i);
+
+ uarg->callback(uarg, false);
+
+ /* skb frags point to kernel buffers */
+ for (i = num_frags - 1; i >= 0; i--) {
+ __skb_fill_page_desc(skb, i, head, 0,
+ skb_shinfo(skb)->frags[i].size);
+ head = (struct page *)page_private(head);
+ }
+
+ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_copy_ubufs);
+
+/**
+ * skb_clone - duplicate an sk_buff
+ * @skb: buffer to clone
+ * @gfp_mask: allocation priority
+ *
+ * Duplicate an &sk_buff. The new one is not owned by a socket. Both
+ * copies share the same packet data but not structure. The new
+ * buffer has a reference count of 1. If the allocation fails the
+ * function returns %NULL otherwise the new buffer is returned.
+ *
+ * If this function is called from an interrupt gfp_mask() must be
+ * %GFP_ATOMIC.
+ */
+
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ struct sk_buff_fclones *fclones = container_of(skb,
+ struct sk_buff_fclones,
+ skb1);
+ struct sk_buff *n;
+
+ if (skb_orphan_frags(skb, gfp_mask))
+ return NULL;
+
+ if (skb->fclone == SKB_FCLONE_ORIG &&
+ atomic_read(&fclones->fclone_ref) == 1) {
+ n = &fclones->skb2;
+ atomic_set(&fclones->fclone_ref, 2);
+ } else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (!n)
+ return NULL;
+
+ kmemcheck_annotate_bitfield(n, flags1);
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
+
+ return __skb_clone(n, skb);
+}
+EXPORT_SYMBOL(skb_clone);
+
+static void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+ /* Only adjust this if it actually is csum_start rather than csum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += off;
+ /* {transport,network,mac}_header and tail are relative to skb->head */
+ skb->transport_header += off;
+ skb->network_header += off;
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += off;
+ skb->inner_transport_header += off;
+ skb->inner_network_header += off;
+ skb->inner_mac_header += off;
+}
+
+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+ __copy_skb_header(new, old);
+
+ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
+}
+
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
+/**
+ * skb_copy - create private copy of an sk_buff
+ * @skb: buffer to copy
+ * @gfp_mask: allocation priority
+ *
+ * Make a copy of both an &sk_buff and its data. This is used when the
+ * caller wishes to modify the data and needs a private copy of the
+ * data to alter. Returns %NULL on failure or the pointer to the buffer
+ * on success. The returned buffer has a reference count of 1.
+ *
+ * As by-product this function converts non-linear &sk_buff to linear
+ * one, so that &sk_buff becomes completely private and caller is allowed
+ * to modify all the data of returned buffer. This means that this
+ * function is not recommended for use in circumstances when only
+ * header is going to be modified. Use pskb_copy() instead.
+ */
+
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
+{
+ int headerlen = skb_headroom(skb);
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+
+ if (!n)
+ return NULL;
+
+ /* Set the data pointer */
+ skb_reserve(n, headerlen);
+ /* Set the tail pointer and length */
+ skb_put(n, skb->len);
+
+ if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
+ BUG();
+
+ copy_skb_header(n, skb);
+ return n;
+}
+EXPORT_SYMBOL(skb_copy);
+
+/**
+ * __pskb_copy_fclone - create copy of an sk_buff with private head.
+ * @skb: buffer to copy
+ * @headroom: headroom of new skb
+ * @gfp_mask: allocation priority
+ * @fclone: if true allocate the copy of the skb from the fclone
+ * cache instead of the head cache; it is recommended to set this
+ * to true for the cases where the copy will likely be cloned
+ *
+ * Make a copy of both an &sk_buff and part of its data, located
+ * in header. Fragmented data remain shared. This is used when
+ * the caller wishes to modify only header of &sk_buff and needs
+ * private copy of the header to alter. Returns %NULL on failure
+ * or the pointer to the buffer on success.
+ * The returned buffer has a reference count of 1.
+ */
+
+struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
+ gfp_t gfp_mask, bool fclone)
+{
+ unsigned int size = skb_headlen(skb) + headroom;
+ int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
+
+ if (!n)
+ goto out;
+
+ /* Set the data pointer */
+ skb_reserve(n, headroom);
+ /* Set the tail pointer and length */
+ skb_put(n, skb_headlen(skb));
+ /* Copy the bytes */
+ skb_copy_from_linear_data(skb, n->data, n->len);
+
+ n->truesize += skb->data_len;
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+ if (skb_orphan_frags(skb, gfp_mask)) {
+ kfree_skb(n);
+ n = NULL;
+ goto out;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
+ skb_frag_ref(skb, i);
+ }
+ skb_shinfo(n)->nr_frags = i;
+ }
+
+ if (skb_has_frag_list(skb)) {
+ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
+ skb_clone_fraglist(n);
+ }
+
+ copy_skb_header(n, skb);
+out:
+ return n;
+}
+EXPORT_SYMBOL(__pskb_copy_fclone);
+
+/**
+ * pskb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @nhead: room to add at head
+ * @ntail: room to add at tail
+ * @gfp_mask: allocation priority
+ *
+ * Expands (or creates identical copy, if @nhead and @ntail are zero)
+ * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
+ * reference count of 1. Returns zero in the case of success or error,
+ * if expansion failed. In the last case, &sk_buff is not changed.
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ gfp_t gfp_mask)
+{
+ int i;
+ u8 *data;
+ int size = nhead + skb_end_offset(skb) + ntail;
+ long off;
+
+ BUG_ON(nhead < 0);
+
+ if (skb_shared(skb))
+ BUG();
+
+ size = SKB_DATA_ALIGN(size);
+
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+ gfp_mask, NUMA_NO_NODE, NULL);
+ if (!data)
+ goto nodata;
+ size = SKB_WITH_OVERHEAD(ksize(data));
+
+ /* Copy only real data... and, alas, header. This should be
+ * optimized for the cases when header is void.
+ */
+ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
+ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
+ /* copy this zero copy skb frags */
+ if (skb_orphan_frags(skb, gfp_mask))
+ goto nofrags;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+
+ skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
+ }
+ off = (data + nhead) - skb->head;
+
+ skb->head = data;
+ skb->head_frag = 0;
+ skb->data += off;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->end = size;
+ off = nhead;
+#else
+ skb->end = skb->head + size;
+#endif
+ skb->tail += off;
+ skb_headers_offset_update(skb, nhead);
+ skb->cloned = 0;
+ skb->hdr_len = 0;
+ skb->nohdr = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
+ return 0;
+
+nofrags:
+ kfree(data);
+nodata:
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(pskb_expand_head);
+
+/* Make private copy of skb with writable head and some headroom */
+
+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
+{
+ struct sk_buff *skb2;
+ int delta = headroom - skb_headroom(skb);
+
+ if (delta <= 0)
+ skb2 = pskb_copy(skb, GFP_ATOMIC);
+ else {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
+ GFP_ATOMIC)) {
+ kfree_skb(skb2);
+ skb2 = NULL;
+ }
+ }
+ return skb2;
+}
+EXPORT_SYMBOL(skb_realloc_headroom);
+
+/**
+ * skb_copy_expand - copy and expand sk_buff
+ * @skb: buffer to copy
+ * @newheadroom: new free bytes at head
+ * @newtailroom: new free bytes at tail
+ * @gfp_mask: allocation priority
+ *
+ * Make a copy of both an &sk_buff and its data and while doing so
+ * allocate additional space.
+ *
+ * This is used when the caller wishes to modify the data and needs a
+ * private copy of the data to alter as well as more space for new fields.
+ * Returns %NULL on failure or the pointer to the buffer
+ * on success. The returned buffer has a reference count of 1.
+ *
+ * You must pass %GFP_ATOMIC as the allocation priority if this function
+ * is called from an interrupt.
+ */
+struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
+ int newheadroom, int newtailroom,
+ gfp_t gfp_mask)
+{
+ /*
+ * Allocate the copy buffer
+ */
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
+ int oldheadroom = skb_headroom(skb);
+ int head_copy_len, head_copy_off;
+
+ if (!n)
+ return NULL;
+
+ skb_reserve(n, newheadroom);
+
+ /* Set the tail pointer and length */
+ skb_put(n, skb->len);
+
+ head_copy_len = oldheadroom;
+ head_copy_off = 0;
+ if (newheadroom <= head_copy_len)
+ head_copy_len = newheadroom;
+ else
+ head_copy_off = newheadroom - head_copy_len;
+
+ /* Copy the linear header and data. */
+ if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
+ skb->len + head_copy_len))
+ BUG();
+
+ copy_skb_header(n, skb);
+
+ skb_headers_offset_update(n, newheadroom - oldheadroom);
+
+ return n;
+}
+EXPORT_SYMBOL(skb_copy_expand);
+
+/**
+ * skb_pad - zero pad the tail of an skb
+ * @skb: buffer to pad
+ * @pad: space to pad
+ *
+ * Ensure that a buffer is followed by a padding area that is zero
+ * filled. Used by network drivers which may DMA or transfer data
+ * beyond the buffer end onto the wire.
+ *
+ * May return error in out of memory cases. The skb is freed on error.
+ */
+
+int skb_pad(struct sk_buff *skb, int pad)
+{
+ int err;
+ int ntail;
+
+ /* If the skbuff is non linear tailroom is always zero.. */
+ if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
+ memset(skb->data+skb->len, 0, pad);
+ return 0;
+ }
+
+ ntail = skb->data_len + pad - (skb->end - skb->tail);
+ if (likely(skb_cloned(skb) || ntail > 0)) {
+ err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+ if (unlikely(err))
+ goto free_skb;
+ }
+
+ /* FIXME: The use of this function with non-linear skb's really needs
+ * to be audited.
+ */
+ err = skb_linearize(skb);
+ if (unlikely(err))
+ goto free_skb;
+
+ memset(skb->data + skb->len, 0, pad);
+ return 0;
+
+free_skb:
+ kfree_skb(skb);
+ return err;
+}
+EXPORT_SYMBOL(skb_pad);
+
+/**
+ * pskb_put - add data to the tail of a potentially fragmented buffer
+ * @skb: start of the buffer to use
+ * @tail: tail fragment of the buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the potentially
+ * fragmented buffer. @tail must be the last fragment of @skb -- or
+ * @skb itself. If this would exceed the total buffer size the kernel
+ * will panic. A pointer to the first byte of the extra data is
+ * returned.
+ */
+
+unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+ if (tail != skb) {
+ skb->data_len += len;
+ skb->len += len;
+ }
+ return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
+
+/**
+ * skb_put - add data to a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer. If this would
+ * exceed the total buffer size the kernel will panic. A pointer to the
+ * first byte of the extra data is returned.
+ */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+{
+ unsigned char *tmp = skb_tail_pointer(skb);
+ SKB_LINEAR_ASSERT(skb);
+ skb->tail += len;
+ skb->len += len;
+ if (unlikely(skb->tail > skb->end))
+ skb_over_panic(skb, len, __builtin_return_address(0));
+ return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ * skb_push - add data to the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to add
+ *
+ * This function extends the used data area of the buffer at the buffer
+ * start. If this would exceed the total buffer headroom the kernel will
+ * panic. A pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+{
+ skb->data -= len;
+ skb->len += len;
+ if (unlikely(skb->data<skb->head))
+ skb_under_panic(skb, len, __builtin_return_address(0));
+ return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ * skb_pull - remove data from the start of a buffer
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the next data in the buffer
+ * is returned. Once the data has been pulled future pushes will overwrite
+ * the old data.
+ */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+ return skb_pull_inline(skb, len);
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ * skb_trim - remove end from a buffer
+ * @skb: buffer to alter
+ * @len: new length
+ *
+ * Cut the length of a buffer down by removing data from the tail. If
+ * the buffer is already under the length specified it is not modified.
+ * The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+ if (skb->len > len)
+ __skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
+/* Trims skb to length len. It can change skb pointers.
+ */
+
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
+{
+ struct sk_buff **fragp;
+ struct sk_buff *frag;
+ int offset = skb_headlen(skb);
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ int i;
+ int err;
+
+ if (skb_cloned(skb) &&
+ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
+ return err;
+
+ i = 0;
+ if (offset >= len)
+ goto drop_pages;
+
+ for (; i < nfrags; i++) {
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
+
+drop_pages:
+ skb_shinfo(skb)->nr_frags = i;
+
+ for (; i < nfrags; i++)
+ skb_frag_unref(skb, i);
+
+ if (skb_has_frag_list(skb))
+ skb_drop_fraglist(skb);
+ goto done;
+ }
+
+ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
+ fragp = &frag->next) {
+ int end = offset + frag->len;
+
+ if (skb_shared(frag)) {
+ struct sk_buff *nfrag;
+
+ nfrag = skb_clone(frag, GFP_ATOMIC);
+ if (unlikely(!nfrag))
+ return -ENOMEM;
+
+ nfrag->next = frag->next;
+ consume_skb(frag);
+ frag = nfrag;
+ *fragp = frag;
+ }
+
+ if (end < len) {
+ offset = end;
+ continue;
+ }
+
+ if (end > len &&
+ unlikely((err = pskb_trim(frag, len - offset))))
+ return err;
+
+ if (frag->next)
+ skb_drop_list(&frag->next);
+ break;
+ }
+
+done:
+ if (len > skb_headlen(skb)) {
+ skb->data_len -= skb->len - len;
+ skb->len = len;
+ } else {
+ skb->len = len;
+ skb->data_len = 0;
+ skb_set_tail_pointer(skb, len);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(___pskb_trim);
+
+/**
+ * __pskb_pull_tail - advance tail of skb header
+ * @skb: buffer to reallocate
+ * @delta: number of bytes to advance tail
+ *
+ * The function makes a sense only on a fragmented &sk_buff,
+ * it expands header moving its tail forward and copying necessary
+ * data from fragmented part.
+ *
+ * &sk_buff MUST have reference count of 1.
+ *
+ * Returns %NULL (and &sk_buff does not change) if pull failed
+ * or value of new tail of skb in the case of success.
+ *
+ * All the pointers pointing into skb header may change and must be
+ * reloaded after call to this function.
+ */
+
+/* Moves tail of skb head forward, copying data from fragmented part,
+ * when it is necessary.
+ * 1. It may fail due to malloc failure.
+ * 2. It may change skb pointers.
+ *
+ * It is pretty complicated. Luckily, it is called only in exceptional cases.
+ */
+unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+{
+ /* If skb has not enough free space at tail, get new one
+ * plus 128 bytes for future expansions. If we have enough
+ * room at tail, reallocate without expansion only if skb is cloned.
+ */
+ int i, k, eat = (skb->tail + delta) - skb->end;
+
+ if (eat > 0 || skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
+ GFP_ATOMIC))
+ return NULL;
+ }
+
+ if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
+ BUG();
+
+ /* Optimization: no fragments, no reasons to preestimate
+ * size of pulled pages. Superb.
+ */
+ if (!skb_has_frag_list(skb))
+ goto pull_pages;
+
+ /* Estimate size of pulled pages. */
+ eat = delta;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
+ goto pull_pages;
+ eat -= size;
+ }
+
+ /* If we need update frag list, we are in troubles.
+ * Certainly, it possible to add an offset to skb data,
+ * but taking into account that pulling is expected to
+ * be very rare operation, it is worth to fight against
+ * further bloating skb head and crucify ourselves here instead.
+ * Pure masohism, indeed. 8)8)
+ */
+ if (eat) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+ struct sk_buff *clone = NULL;
+ struct sk_buff *insp = NULL;
+
+ do {
+ BUG_ON(!list);
+
+ if (list->len <= eat) {
+ /* Eaten as whole. */
+ eat -= list->len;
+ list = list->next;
+ insp = list;
+ } else {
+ /* Eaten partially. */
+
+ if (skb_shared(list)) {
+ /* Sucks! We need to fork list. :-( */
+ clone = skb_clone(list, GFP_ATOMIC);
+ if (!clone)
+ return NULL;
+ insp = list->next;
+ list = clone;
+ } else {
+ /* This may be pulled without
+ * problems. */
+ insp = list;
+ }
+ if (!pskb_pull(list, eat)) {
+ kfree_skb(clone);
+ return NULL;
+ }
+ break;
+ }
+ } while (eat);
+
+ /* Free pulled out fragments. */
+ while ((list = skb_shinfo(skb)->frag_list) != insp) {
+ skb_shinfo(skb)->frag_list = list->next;
+ kfree_skb(list);
+ }
+ /* And insert new clone at head. */
+ if (clone) {
+ clone->next = list;
+ skb_shinfo(skb)->frag_list = clone;
+ }
+ }
+ /* Success! Now we may commit changes to skb data. */
+
+pull_pages:
+ eat = delta;
+ k = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
+ } else {
+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ if (eat) {
+ skb_shinfo(skb)->frags[k].page_offset += eat;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ skb_shinfo(skb)->nr_frags = k;
+
+ skb->tail += delta;
+ skb->data_len -= delta;
+
+ return skb_tail_pointer(skb);
+}
+EXPORT_SYMBOL(__pskb_pull_tail);
+
+/**
+ * skb_copy_bits - copy bits from skb to kernel buffer
+ * @skb: source skb
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source skb to the
+ * destination buffer.
+ *
+ * CAUTION ! :
+ * If its prototype is ever changed,
+ * check arch/{*}/net/{*}.S files,
+ * since it is called from BPF assembly code.
+ */
+int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
+{
+ int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
+
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ /* Copy header. */
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_from_linear_data_offset(skb, offset, to, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(f);
+ if ((copy = end - offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ vaddr = kmap_atomic(skb_frag_page(f));
+ memcpy(to,
+ vaddr + f->page_offset + offset - start,
+ copy);
+ kunmap_atomic(vaddr);
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_bits(frag_iter, offset - start, to, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_bits);
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ put_page(spd->pages[i]);
+}
+
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sock *sk)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return NULL;
+
+ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
+
+ memcpy(page_address(pfrag->page) + pfrag->offset,
+ page_address(page) + *offset, *len);
+ *offset = pfrag->offset;
+ pfrag->offset += *len;
+
+ return pfrag->page;
+}
+
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ bool linear,
+ struct sock *sk)
+{
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
+
+ if (linear) {
+ page = linear_to_page(page, len, &offset, sk);
+ if (!page)
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
+ spd->pages[spd->nr_pages] = page;
+ spd->partial[spd->nr_pages].len = *len;
+ spd->partial[spd->nr_pages].offset = offset;
+ spd->nr_pages++;
+
+ return false;
+}
+
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
+{
+ if (!*len)
+ return true;
+
+ /* skip this segment if already processed */
+ if (*off >= plen) {
+ *off -= plen;
+ return false;
+ }
+
+ /* ignore any bits we already processed */
+ poff += *off;
+ plen -= *off;
+ *off = 0;
+
+ do {
+ unsigned int flen = min(*len, plen);
+
+ if (spd_fill_page(spd, pipe, page, &flen, poff,
+ linear, sk))
+ return true;
+ poff += flen;
+ plen -= flen;
+ *len -= flen;
+ } while (*len && plen);
+
+ return false;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. It reports true if the
+ * pipe is full or if we already spliced the requested length.
+ */
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
+{
+ int seg;
+
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
+ */
+ if (__splice_segment(virt_to_page(skb->data),
+ (unsigned long) skb->data & (PAGE_SIZE - 1),
+ skb_headlen(skb),
+ offset, len, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
+
+ /*
+ * then map the fragments
+ */
+ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+ if (__splice_segment(skb_frag_page(f),
+ f->page_offset, skb_frag_size(f),
+ offset, len, spd, false, sk, pipe))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ */
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+{
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = MAX_SKB_FRAGS,
+ .flags = flags,
+ .ops = &nosteal_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+ struct sk_buff *frag_iter;
+ struct sock *sk = skb->sk;
+ int ret = 0;
+
+ /*
+ * __skb_splice_bits() only fails if the output has no room left,
+ * so no point in going over the frag_list for the error case.
+ */
+ if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
+ goto done;
+ else if (!tlen)
+ goto done;
+
+ /*
+ * now see if we have a frag_list to map
+ */
+ skb_walk_frags(skb, frag_iter) {
+ if (!tlen)
+ break;
+ if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
+ break;
+ }
+
+done:
+ if (spd.nr_pages) {
+ /*
+ * Drop the socket lock, otherwise we have reverse
+ * locking dependencies between sk_lock and i_mutex
+ * here as compared to sendfile(). We enter here
+ * with the socket lock held, and splice_to_pipe() will
+ * grab the pipe inode lock. For sendfile() emulation,
+ * we call into ->sendpage() with the i_mutex lock held
+ * and networking will grab the socket lock.
+ */
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, &spd);
+ lock_sock(sk);
+ }
+
+ return ret;
+}
+
+/**
+ * skb_store_bits - store bits from kernel buffer to skb
+ * @skb: destination buffer
+ * @offset: offset in destination
+ * @from: source buffer
+ * @len: number of bytes to copy
+ *
+ * Copy the specified number of bytes from the source buffer to the
+ * destination skb. This function handles all the messy bits of
+ * traversing fragment lists and such.
+ */
+
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
+{
+ int start = skb_headlen(skb);
+ struct sk_buff *frag_iter;
+ int i, copy;
+
+ if (offset > (int)skb->len - len)
+ goto fault;
+
+ if ((copy = start - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ skb_copy_to_linear_data_offset(skb, offset, from, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ memcpy(vaddr + frag->page_offset + offset - start,
+ from, copy);
+ kunmap_atomic(vaddr);
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_store_bits(frag_iter, offset - start,
+ from, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ from += copy;
+ }
+ start = end;
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_store_bits);
+
+/* Checksum skb data. */
+__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
+ __wsum csum, const struct skb_checksum_ops *ops)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+
+ /* Checksum header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = ops->update(skb->data + offset, copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ csum2 = ops->update(vaddr + frag->page_offset +
+ offset - start, copy, 0);
+ kunmap_atomic(vaddr);
+ csum = ops->combine(csum, csum2, pos, copy);
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ if (copy > len)
+ copy = len;
+ csum2 = __skb_checksum(frag_iter, offset - start,
+ copy, 0, ops);
+ csum = ops->combine(csum, csum2, pos, copy);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return csum;
+}
+EXPORT_SYMBOL(__skb_checksum);
+
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+ int len, __wsum csum)
+{
+ const struct skb_checksum_ops ops = {
+ .update = csum_partial_ext,
+ .combine = csum_block_add_ext,
+ };
+
+ return __skb_checksum(skb, offset, len, csum, &ops);
+}
+EXPORT_SYMBOL(skb_checksum);
+
+/* Both of above in one bottle. */
+
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+ u8 *to, int len, __wsum csum)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int pos = 0;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ csum = csum_partial_copy_nocheck(skb->data + offset, to,
+ copy, csum);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos = copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ __wsum csum2;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap_atomic(skb_frag_page(frag));
+ csum2 = csum_partial_copy_nocheck(vaddr +
+ frag->page_offset +
+ offset - start, to,
+ copy, 0);
+ kunmap_atomic(vaddr);
+ csum = csum_block_add(csum, csum2, pos);
+ if (!(len -= copy))
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ __wsum csum2;
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ csum2 = skb_copy_and_csum_bits(frag_iter,
+ offset - start,
+ to, copy, 0);
+ csum = csum_block_add(csum, csum2, pos);
+ if ((len -= copy) == 0)
+ return csum;
+ offset += copy;
+ to += copy;
+ pos += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return csum;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
+
+ /**
+ * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
+ * @from: source buffer
+ *
+ * Calculates the amount of linear headroom needed in the 'to' skb passed
+ * into skb_zerocopy().
+ */
+unsigned int
+skb_zerocopy_headlen(const struct sk_buff *from)
+{
+ unsigned int hlen = 0;
+
+ if (!from->head_frag ||
+ skb_headlen(from) < L1_CACHE_BYTES ||
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(from);
+
+ if (skb_has_frag_list(from))
+ hlen = from->len;
+
+ return hlen;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
+
+/**
+ * skb_zerocopy - Zero copy skb to skb
+ * @to: destination buffer
+ * @from: source buffer
+ * @len: number of bytes to copy from source buffer
+ * @hlen: size of linear headroom in destination buffer
+ *
+ * Copies up to `len` bytes from `from` to `to` by creating references
+ * to the frags in the source buffer.
+ *
+ * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
+ * headroom in the `to` buffer.
+ *
+ * Return value:
+ * 0: everything is OK
+ * -ENOMEM: couldn't orphan frags of @from due to lack of memory
+ * -EFAULT: skb_copy_bits() found some problem with skb geometry
+ */
+int
+skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ int ret;
+ struct page *page;
+ unsigned int offset;
+
+ BUG_ON(!from->head_frag && !hlen);
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to))
+ return skb_copy_bits(from, 0, skb_put(to, len), len);
+
+ if (hlen) {
+ ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ if (unlikely(ret))
+ return ret;
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
+ skb_tx_error(from);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy);
+
+void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
+{
+ __wsum csum;
+ long csstart;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csstart = skb_checksum_start_offset(skb);
+ else
+ csstart = skb_headlen(skb);
+
+ BUG_ON(csstart > skb_headlen(skb));
+
+ skb_copy_from_linear_data(skb, to, csstart);
+
+ csum = 0;
+ if (csstart != skb->len)
+ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
+ skb->len - csstart, 0);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ long csstuff = csstart + skb->csum_offset;
+
+ *((__sum16 *)(to + csstuff)) = csum_fold(csum);
+ }
+}
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
+
+/**
+ * skb_dequeue - remove from the head of the queue
+ * @list: list to dequeue from
+ *
+ * Remove the head of the list. The list lock is taken so the function
+ * may be used safely with other locking list functions. The head item is
+ * returned or %NULL if the list is empty.
+ */
+
+struct sk_buff *skb_dequeue(struct sk_buff_head *list)
+{
+ unsigned long flags;
+ struct sk_buff *result;
+
+ spin_lock_irqsave(&list->lock, flags);
+ result = __skb_dequeue(list);
+ spin_unlock_irqrestore(&list->lock, flags);
+ return result;
+}
+EXPORT_SYMBOL(skb_dequeue);
+
+/**
+ * skb_dequeue_tail - remove from the tail of the queue
+ * @list: list to dequeue from
+ *
+ * Remove the tail of the list. The list lock is taken so the function
+ * may be used safely with other locking list functions. The tail item is
+ * returned or %NULL if the list is empty.
+ */
+struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
+{
+ unsigned long flags;
+ struct sk_buff *result;
+
+ spin_lock_irqsave(&list->lock, flags);
+ result = __skb_dequeue_tail(list);
+ spin_unlock_irqrestore(&list->lock, flags);
+ return result;
+}
+EXPORT_SYMBOL(skb_dequeue_tail);
+
+/**
+ * skb_queue_purge - empty a list
+ * @list: list to empty
+ *
+ * Delete all buffers on an &sk_buff list. Each buffer is removed from
+ * the list and one reference dropped. This function takes the list
+ * lock and is atomic with respect to other list locking functions.
+ */
+void skb_queue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue(list)) != NULL)
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(skb_queue_purge);
+
+/**
+ * skb_queue_head - queue a buffer at the list head
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer at the start of the list. This function takes the
+ * list lock and can be used safely with other locking &sk_buff functions
+ * safely.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_head(list, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_queue_head);
+
+/**
+ * skb_queue_tail - queue a buffer at the list tail
+ * @list: list to use
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer at the tail of the list. This function takes the
+ * list lock and can be used safely with other locking &sk_buff functions
+ * safely.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_tail(list, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_queue_tail);
+
+/**
+ * skb_unlink - remove a buffer from a list
+ * @skb: buffer to remove
+ * @list: list to use
+ *
+ * Remove a packet from a list. The list locks are taken and this
+ * function is atomic with respect to other list locked calls
+ *
+ * You must know what list the SKB is on.
+ */
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_unlink(skb, list);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_unlink);
+
+/**
+ * skb_append - append a buffer
+ * @old: buffer to insert after
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet after a given packet in a list. The list locks are taken
+ * and this function is atomic with respect to other list locked calls.
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_queue_after(list, old, newsk);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_append);
+
+/**
+ * skb_insert - insert a buffer
+ * @old: buffer to insert before
+ * @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet before a given packet in a list. The list locks are
+ * taken and this function is atomic with respect to other list locked
+ * calls.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_insert(newsk, old->prev, old, list);
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+EXPORT_SYMBOL(skb_insert);
+
+static inline void skb_split_inside_header(struct sk_buff *skb,
+ struct sk_buff* skb1,
+ const u32 len, const int pos)
+{
+ int i;
+
+ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
+ pos - len);
+ /* And move data appendix as is. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
+
+ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb_shinfo(skb)->nr_frags = 0;
+ skb1->data_len = skb->data_len;
+ skb1->len += skb1->data_len;
+ skb->data_len = 0;
+ skb->len = len;
+ skb_set_tail_pointer(skb, len);
+}
+
+static inline void skb_split_no_header(struct sk_buff *skb,
+ struct sk_buff* skb1,
+ const u32 len, int pos)
+{
+ int i, k = 0;
+ const int nfrags = skb_shinfo(skb)->nr_frags;
+
+ skb_shinfo(skb)->nr_frags = 0;
+ skb1->len = skb1->data_len = skb->len - len;
+ skb->len = len;
+ skb->data_len = len - pos;
+
+ for (i = 0; i < nfrags; i++) {
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (pos + size > len) {
+ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
+
+ if (pos < len) {
+ /* Split frag.
+ * We have two variants in this case:
+ * 1. Move all the frag to the second
+ * part, if it is possible. F.e.
+ * this approach is mandatory for TUX,
+ * where splitting is expensive.
+ * 2. Split is accurately. We make this.
+ */
+ skb_frag_ref(skb, i);
+ skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
+ skb_shinfo(skb)->nr_frags++;
+ }
+ k++;
+ } else
+ skb_shinfo(skb)->nr_frags++;
+ pos += size;
+ }
+ skb_shinfo(skb1)->nr_frags = k;
+}
+
+/**
+ * skb_split - Split fragmented skb to two parts at length len.
+ * @skb: the buffer to split
+ * @skb1: the buffer to receive the second part
+ * @len: new length for skb
+ */
+void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
+{
+ int pos = skb_headlen(skb);
+
+ skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+ if (len < pos) /* Split line is inside header. */
+ skb_split_inside_header(skb, skb1, len, pos);
+ else /* Second chunk has no header, nothing to copy. */
+ skb_split_no_header(skb, skb1, len, pos);
+}
+EXPORT_SYMBOL(skb_split);
+
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * Caller cannot keep skb_shinfo related pointers past calling here!
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+ return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns number bytes shifted.
+ * It's up to caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+ int from, to, merge, todo;
+ struct skb_frag_struct *fragfrom, *fragto;
+
+ BUG_ON(shiftlen > skb->len);
+ BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ todo = shiftlen;
+ from = 0;
+ to = skb_shinfo(tgt)->nr_frags;
+ fragfrom = &skb_shinfo(skb)->frags[from];
+
+ /* Actual merge is delayed until the point when we know we can
+ * commit all, so that we don't have to undo partial changes
+ */
+ if (!to ||
+ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ fragfrom->page_offset)) {
+ merge = -1;
+ } else {
+ merge = to - 1;
+
+ todo -= skb_frag_size(fragfrom);
+ if (todo < 0) {
+ if (skb_prepare_for_shift(skb) ||
+ skb_prepare_for_shift(tgt))
+ return 0;
+
+ /* All previous frag pointers might be stale! */
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
+ fragfrom->page_offset += shiftlen;
+
+ goto onlymerged;
+ }
+
+ from++;
+ }
+
+ /* Skip full, not-fitting skb to avoid expensive operations */
+ if ((shiftlen == skb->len) &&
+ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+ return 0;
+
+ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+ return 0;
+
+ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+ if (to == MAX_SKB_FRAGS)
+ return 0;
+
+ fragfrom = &skb_shinfo(skb)->frags[from];
+ fragto = &skb_shinfo(tgt)->frags[to];
+
+ if (todo >= skb_frag_size(fragfrom)) {
+ *fragto = *fragfrom;
+ todo -= skb_frag_size(fragfrom);
+ from++;
+ to++;
+
+ } else {
+ __skb_frag_ref(fragfrom);
+ fragto->page = fragfrom->page;
+ fragto->page_offset = fragfrom->page_offset;
+ skb_frag_size_set(fragto, todo);
+
+ fragfrom->page_offset += todo;
+ skb_frag_size_sub(fragfrom, todo);
+ todo = 0;
+
+ to++;
+ break;
+ }
+ }
+
+ /* Ready to "commit" this state change to tgt */
+ skb_shinfo(tgt)->nr_frags = to;
+
+ if (merge >= 0) {
+ fragfrom = &skb_shinfo(skb)->frags[0];
+ fragto = &skb_shinfo(tgt)->frags[merge];
+
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom);
+ }
+
+ /* Reposition in the original skb */
+ to = 0;
+ while (from < skb_shinfo(skb)->nr_frags)
+ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+ skb_shinfo(skb)->nr_frags = to;
+
+ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+ /* Most likely the tgt won't ever need its checksum anymore, skb on
+ * the other hand might need it if it needs to be resent
+ */
+ tgt->ip_summed = CHECKSUM_PARTIAL;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ /* Yak, is it really working this way? Some helper please? */
+ skb->len -= shiftlen;
+ skb->data_len -= shiftlen;
+ skb->truesize -= shiftlen;
+ tgt->len += shiftlen;
+ tgt->data_len += shiftlen;
+ tgt->truesize += shiftlen;
+
+ return shiftlen;
+}
+
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct skb_seq_state *st)
+{
+ st->lower_offset = from;
+ st->upper_offset = to;
+ st->root_skb = st->cur_skb = skb;
+ st->frag_idx = st->stepped_offset = 0;
+ st->frag_data = NULL;
+}
+EXPORT_SYMBOL(skb_prepare_seq_read);
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at @consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to @data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. @consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note 1: The size of each block of data returned can be arbitrary,
+ * this limitation is the cost for zerocopy sequential
+ * reads of potentially non linear data.
+ *
+ * Note 2: Fragment lists within fragments are not implemented
+ * at the moment, state->root_skb could be replaced with
+ * a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+ struct skb_seq_state *st)
+{
+ unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+ skb_frag_t *frag;
+
+ if (unlikely(abs_offset >= st->upper_offset)) {
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+ return 0;
+ }
+
+next_skb:
+ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
+
+ if (abs_offset < block_limit && !st->frag_data) {
+ *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_idx == 0 && !st->frag_data)
+ st->stepped_offset += skb_headlen(st->cur_skb);
+
+ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+ block_limit = skb_frag_size(frag) + st->stepped_offset;
+
+ if (abs_offset < block_limit) {
+ if (!st->frag_data)
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
+
+ *data = (u8 *) st->frag_data + frag->page_offset +
+ (abs_offset - st->stepped_offset);
+
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ st->frag_idx++;
+ st->stepped_offset += skb_frag_size(frag);
+ }
+
+ if (st->frag_data) {
+ kunmap_atomic(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+ st->frag_idx = 0;
+ goto next_skb;
+ } else if (st->cur_skb->next) {
+ st->cur_skb = st->cur_skb->next;
+ st->frag_idx = 0;
+ goto next_skb;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_seq_read);
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Must be called if skb_seq_read() was not called until it
+ * returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+ if (st->frag_data)
+ kunmap_atomic(st->frag_data);
+}
+EXPORT_SYMBOL(skb_abort_seq_read);
+
+#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+ struct ts_config *conf,
+ struct ts_state *state)
+{
+ return skb_seq_read(offset, text, TS_SKB_CB(state));
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+ skb_abort_seq_read(TS_SKB_CB(state));
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ *
+ * Finds a pattern in the skb data according to the specified
+ * textsearch configuration. Use textsearch_next() to retrieve
+ * subsequent occurrences of the pattern. Returns the offset
+ * to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct ts_config *config)
+{
+ struct ts_state state;
+ unsigned int ret;
+
+ config->get_next_block = skb_ts_get_next_block;
+ config->finish = skb_ts_finish;
+
+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
+
+ ret = textsearch_find(config, &state);
+ return (ret <= to - from ? ret : UINT_MAX);
+}
+EXPORT_SYMBOL(skb_find_text);
+
+/**
+ * skb_append_datato_frags - append the user data to a skb
+ * @sk: sock structure
+ * @skb: skb structure to be appended with user data.
+ * @getfrag: call back function to be used for getting the user data
+ * @from: pointer to user message iov
+ * @length: length of the iov message
+ *
+ * Description: This procedure append the user data in the fragment part
+ * of the skb if any page alloc fails user this procedure returns -ENOMEM
+ */
+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
+ int (*getfrag)(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length)
+{
+ int frg_cnt = skb_shinfo(skb)->nr_frags;
+ int copy;
+ int offset = 0;
+ int ret;
+ struct page_frag *pfrag = &current->task_frag;
+
+ do {
+ /* Return error if we don't have space for new frag */
+ if (frg_cnt >= MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ /* copy the user data to page */
+ copy = min_t(int, length, pfrag->size - pfrag->offset);
+
+ ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
+ offset, copy, 0, skb);
+ if (ret < 0)
+ return -EFAULT;
+
+ /* copy was successful so update the size parameters */
+ skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
+ copy);
+ frg_cnt++;
+ pfrag->offset += copy;
+ get_page(pfrag->page);
+
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
+ skb->len += copy;
+ skb->data_len += copy;
+ offset += copy;
+ length -= copy;
+
+ } while (length > 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_append_datato_frags);
+
+/**
+ * skb_pull_rcsum - pull skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_pull on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_pull unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+ BUG_ON(len > skb->len);
+ skb->len -= len;
+ BUG_ON(skb->len < skb->data_len);
+ skb_postpull_rcsum(skb, skb->data, len);
+ return skb->data += len;
+}
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+
+/**
+ * skb_segment - Perform protocol segmentation on skb.
+ * @head_skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function performs segmentation on the given skb. It returns
+ * a pointer to the first in a list of new skbs for the segments.
+ * In case of error it returns ERR_PTR(err).
+ */
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = NULL;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+ skb_frag_t *frag = skb_shinfo(head_skb)->frags;
+ unsigned int mss = skb_shinfo(head_skb)->gso_size;
+ unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+ struct sk_buff *frag_skb = head_skb;
+ unsigned int offset = doffset;
+ unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+ unsigned int headroom;
+ unsigned int len;
+ __be16 proto;
+ bool csum;
+ int sg = !!(features & NETIF_F_SG);
+ int nfrags = skb_shinfo(head_skb)->nr_frags;
+ int err = -ENOMEM;
+ int i = 0;
+ int pos;
+ int dummy;
+
+ __skb_push(head_skb, doffset);
+ proto = skb_network_protocol(head_skb, &dummy);
+ if (unlikely(!proto))
+ return ERR_PTR(-EINVAL);
+
+ csum = !head_skb->encap_hdr_csum &&
+ !!can_checksum_protocol(features, proto);
+
+ headroom = skb_headroom(head_skb);
+ pos = skb_headlen(head_skb);
+
+ do {
+ struct sk_buff *nskb;
+ skb_frag_t *nskb_frag;
+ int hsize;
+ int size;
+
+ len = head_skb->len - offset;
+ if (len > mss)
+ len = mss;
+
+ hsize = skb_headlen(head_skb) - offset;
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
+ if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ (skb_headlen(list_skb) == len || sg)) {
+ BUG_ON(skb_headlen(list_skb) > len);
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+ pos += skb_headlen(list_skb);
+
+ while (pos < offset + len) {
+ BUG_ON(i >= nfrags);
+
+ size = skb_frag_size(frag);
+ if (pos + size > offset + len)
+ break;
+
+ i++;
+ pos += size;
+ frag++;
+ }
+
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ list_skb = list_skb->next;
+
+ if (unlikely(!nskb))
+ goto err;
+
+ if (unlikely(pskb_trim(nskb, len))) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ hsize = skb_end_offset(nskb);
+ if (skb_cow_head(nskb, doffset + headroom)) {
+ kfree_skb(nskb);
+ goto err;
+ }
+
+ nskb->truesize += skb_end_offset(nskb) - hsize;
+ skb_release_head_state(nskb);
+ __skb_push(nskb, doffset);
+ } else {
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
+ NUMA_NO_NODE);
+
+ if (unlikely(!nskb))
+ goto err;
+
+ skb_reserve(nskb, headroom);
+ __skb_put(nskb, doffset);
+ }
+
+ if (segs)
+ tail->next = nskb;
+ else
+ segs = nskb;
+ tail = nskb;
+
+ __copy_skb_header(nskb, head_skb);
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+ skb_reset_mac_len(nskb);
+
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
+
+ if (nskb->len == len + doffset)
+ goto perform_csum_check;
+
+ if (!sg && !nskb->remcsum_offload) {
+ nskb->ip_summed = CHECKSUM_NONE;
+ nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ continue;
+ }
+
+ nskb_frag = skb_shinfo(nskb)->frags;
+
+ skb_copy_from_linear_data_offset(head_skb, offset,
+ skb_put(nskb, hsize), hsize);
+
+ skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
+ SKBTX_SHARED_FRAG;
+
+ while (pos < offset + len) {
+ if (i >= nfrags) {
+ BUG_ON(skb_headlen(list_skb));
+
+ i = 0;
+ nfrags = skb_shinfo(list_skb)->nr_frags;
+ frag = skb_shinfo(list_skb)->frags;
+ frag_skb = list_skb;
+
+ BUG_ON(!nfrags);
+
+ list_skb = list_skb->next;
+ }
+
+ if (unlikely(skb_shinfo(nskb)->nr_frags >=
+ MAX_SKB_FRAGS)) {
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
+ goto err;
+ }
+
+ if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
+ goto err;
+
+ *nskb_frag = *frag;
+ __skb_frag_ref(nskb_frag);
+ size = skb_frag_size(nskb_frag);
+
+ if (pos < offset) {
+ nskb_frag->page_offset += offset - pos;
+ skb_frag_size_sub(nskb_frag, offset - pos);
+ }
+
+ skb_shinfo(nskb)->nr_frags++;
+
+ if (pos + size <= offset + len) {
+ i++;
+ frag++;
+ pos += size;
+ } else {
+ skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
+ goto skip_fraglist;
+ }
+
+ nskb_frag++;
+ }
+
+skip_fraglist:
+ nskb->data_len = len - hsize;
+ nskb->len += nskb->data_len;
+ nskb->truesize += nskb->data_len;
+
+perform_csum_check:
+ if (!csum && !nskb->remcsum_offload) {
+ nskb->csum = skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ }
+ } while ((offset += len) < head_skb->len);
+
+ /* Some callers want to get the end of the list.
+ * Put it in segs->prev to avoid walking the list.
+ * (see validate_xmit_skb_list() for example)
+ */
+ segs->prev = tail;
+
+ /* Following permits correct backpressure, for protocols
+ * using skb_set_owner_w().
+ * Idea is to tranfert ownership from head_skb to last segment.
+ */
+ if (head_skb->destructor == sock_wfree) {
+ swap(tail->truesize, head_skb->truesize);
+ swap(tail->destructor, head_skb->destructor);
+ swap(tail->sk, head_skb->sk);
+ }
+ return segs;
+
+err:
+ kfree_skb_list(segs);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(skb_segment);
+
+int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ struct sk_buff *lp, *p = *head;
+ unsigned int delta_truesize;
+
+ if (unlikely(p->len + len >= 65536))
+ return -E2BIG;
+
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ frag->page_offset += offset;
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
+ skb->truesize -= skb->data_len;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skbinfo->frags[0].page_offset += eat;
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+
+void __init skb_init(void)
+{
+ skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+ sizeof(struct sk_buff),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ sizeof(struct sk_buff_fclones),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
+
+/**
+ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ * @skb: Socket buffer containing the buffers to be mapped
+ * @sg: The scatter-gather list to map into
+ * @offset: The offset into the buffer's contents to start mapping
+ * @len: Length of buffer space to be mapped
+ *
+ * Fill the specified scatter-gather list with mappings/pointers into a
+ * region of the buffer space attached to a socket buffer.
+ */
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+ int elt = 0;
+
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ sg_set_buf(sg, skb->data + offset, copy);
+ elt++;
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ if ((copy = end - offset) > 0) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (copy > len)
+ copy = len;
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
+ frag->page_offset+offset-start);
+ elt++;
+ if (!(len -= copy))
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+ copy);
+ if ((len -= copy) == 0)
+ return elt;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+ return elt;
+}
+
+/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
+ * sglist without mark the sg which contain last skb data as the end.
+ * So the caller can mannipulate sg list as will when padding new data after
+ * the first call without calling sg_unmark_end to expend sg list.
+ *
+ * Scenario to use skb_to_sgvec_nomark:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec_nomark(payload1)
+ * 3. skb_to_sgvec_nomark(payload2)
+ *
+ * This is equivalent to:
+ * 1. sg_init_table
+ * 2. skb_to_sgvec(payload1)
+ * 3. sg_unmark_end
+ * 4. skb_to_sgvec(payload2)
+ *
+ * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * is more preferable.
+ */
+int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+ int offset, int len)
+{
+ return __skb_to_sgvec(skb, sg, offset, len);
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
+
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+ int nsg = __skb_to_sgvec(skb, sg, offset, len);
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
+/**
+ * skb_cow_data - Check that a socket buffer's data buffers are writable
+ * @skb: The socket buffer to check.
+ * @tailbits: Amount of trailing space to be added
+ * @trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ * Make sure that the data buffers attached to a socket buffer are
+ * writable. If they are not, private copies are made of the data buffers
+ * and the socket buffer is set to use these instead.
+ *
+ * If @tailbits is given, make sure that there is space to write @tailbits
+ * bytes of data beyond current end of socket buffer. @trailer will be
+ * set to point to the skb in which this space begins.
+ *
+ * The number of scatterlist elements required to completely map the
+ * COW'd and extended socket buffer will be returned.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+ int copyflag;
+ int elt;
+ struct sk_buff *skb1, **skb_p;
+
+ /* If skb is cloned or its head is paged, reallocate
+ * head pulling out all the pages (pages are considered not writable
+ * at the moment even if they are anonymous).
+ */
+ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ /* Easy case. Most of packets will go this way. */
+ if (!skb_has_frag_list(skb)) {
+ /* A little of trouble, not enough of space for trailer.
+ * This should not happen, when stack is tuned to generate
+ * good frames. OK, on miss we reallocate and reserve even more
+ * space, 128 bytes is fair. */
+
+ if (skb_tailroom(skb) < tailbits &&
+ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Voila! */
+ *trailer = skb;
+ return 1;
+ }
+
+ /* Misery. We are in troubles, going to mincer fragments... */
+
+ elt = 1;
+ skb_p = &skb_shinfo(skb)->frag_list;
+ copyflag = 0;
+
+ while ((skb1 = *skb_p) != NULL) {
+ int ntail = 0;
+
+ /* The fragment is partially pulled by someone,
+ * this can happen on input. Copy it and everything
+ * after it. */
+
+ if (skb_shared(skb1))
+ copyflag = 1;
+
+ /* If the skb is the last, worry about trailer. */
+
+ if (skb1->next == NULL && tailbits) {
+ if (skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1) ||
+ skb_tailroom(skb1) < tailbits)
+ ntail = tailbits + 128;
+ }
+
+ if (copyflag ||
+ skb_cloned(skb1) ||
+ ntail ||
+ skb_shinfo(skb1)->nr_frags ||
+ skb_has_frag_list(skb1)) {
+ struct sk_buff *skb2;
+
+ /* Fuck, we are miserable poor guys... */
+ if (ntail == 0)
+ skb2 = skb_copy(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb_copy_expand(skb1,
+ skb_headroom(skb1),
+ ntail,
+ GFP_ATOMIC);
+ if (unlikely(skb2 == NULL))
+ return -ENOMEM;
+
+ if (skb1->sk)
+ skb_set_owner_w(skb2, skb1->sk);
+
+ /* Looking around. Are we still alive?
+ * OK, link new skb, drop old one */
+
+ skb2->next = skb1->next;
+ *skb_p = skb2;
+ kfree_skb(skb1);
+ skb1 = skb2;
+ }
+ elt++;
+ *trailer = skb1;
+ skb_p = &skb1->next;
+ }
+
+ return elt;
+}
+EXPORT_SYMBOL_GPL(skb_cow_data);
+
+static void sock_rmem_free(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+/*
+ * Note: We dont mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf)
+ return -ENOMEM;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_rmem_free;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+ /* before exiting rcu section, make sure dst is refcounted */
+ skb_dst_force(skb);
+
+ skb_queue_tail(&sk->sk_error_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
+struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
+{
+ struct sk_buff_head *q = &sk->sk_error_queue;
+ struct sk_buff *skb, *skb_next;
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ skb = __skb_dequeue(q);
+ if (skb && (skb_next = skb_peek(q)))
+ err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ sk->sk_err = err;
+ if (err)
+ sk->sk_error_report(sk);
+
+ return skb;
+}
+EXPORT_SYMBOL(sock_dequeue_err_skb);
+
+/**
+ * skb_clone_sk - create clone of skb, and take reference to socket
+ * @skb: the skb to clone
+ *
+ * This function creates a clone of a buffer that holds a reference on
+ * sk_refcnt. Buffers created via this function are meant to be
+ * returned using sock_queue_err_skb, or free via kfree_skb.
+ *
+ * When passing buffers allocated with this function to sock_queue_err_skb
+ * it is necessary to wrap the call with sock_hold/sock_put in order to
+ * prevent the socket from being released prior to being enqueued on
+ * the sk_error_queue.
+ */
+struct sk_buff *skb_clone_sk(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sk_buff *clone;
+
+ if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return NULL;
+ }
+
+ clone->sk = sk;
+ clone->destructor = sock_efree;
+
+ return clone;
+}
+EXPORT_SYMBOL(skb_clone_sk);
+
+static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct sock *sk,
+ int tstype)
+{
+ struct sock_exterr_skb *serr;
+ int err;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = tstype;
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk->sk_protocol == IPPROTO_TCP)
+ serr->ee.ee_data -= sk->sk_tskey;
+ }
+
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+}
+
+static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
+{
+ bool ret;
+
+ if (likely(sysctl_tstamp_allow_data || tsonly))
+ return true;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ ret = sk->sk_socket && sk->sk_socket->file &&
+ file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
+ read_unlock_bh(&sk->sk_callback_lock);
+ return ret;
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ struct sock *sk = skb->sk;
+
+ if (!skb_may_tx_timestamp(sk, false))
+ return;
+
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
+ *skb_hwtstamps(skb) = *hwtstamps;
+ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
+{
+ struct sk_buff *skb;
+ bool tsonly;
+
+ if (!sk)
+ return;
+
+ tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ if (!skb_may_tx_timestamp(sk, tsonly))
+ return;
+
+ if (tsonly)
+ skb = alloc_skb(0, GFP_ATOMIC);
+ else
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ if (tsonly) {
+ skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags;
+ skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
+ }
+
+ if (hwtstamps)
+ *skb_hwtstamps(skb) = *hwtstamps;
+ else
+ skb->tstamp = ktime_get_real();
+
+ __skb_complete_tx_timestamp(skb, sk, tstype);
+}
+EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
+}
+EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ skb->wifi_acked_valid = 1;
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ /* take a reference to prevent skb_orphan() from freeing the socket */
+ sock_hold(sk);
+
+ err = sock_queue_err_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops.
+ *
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{
+ if (unlikely(start > skb_headlen(skb)) ||
+ unlikely((int)start + off > skb_headlen(skb) - 2)) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
+ start, off, skb_headlen(skb));
+ return false;
+ }
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + start;
+ skb->csum_offset = off;
+ skb_set_transport_header(skb, start);
+ return true;
+}
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+
+static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
+ unsigned int max)
+{
+ if (skb_headlen(skb) >= len)
+ return 0;
+
+ /* If we need to pullup then pullup to the max, so we
+ * won't need to do it again.
+ */
+ if (max > skb->len)
+ max = skb->len;
+
+ if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+
+ if (skb_headlen(skb) < len)
+ return -EPROTO;
+
+ return 0;
+}
+
+#define MAX_TCP_HDR_LEN (15 * 4)
+
+static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
+ typeof(IPPROTO_IP) proto,
+ unsigned int off)
+{
+ switch (proto) {
+ int err;
+
+ case IPPROTO_TCP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
+ off + MAX_TCP_HDR_LEN);
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct tcphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
+
+ case IPPROTO_UDP:
+ err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
+ off + sizeof(struct udphdr));
+ if (!err && !skb_partial_csum_set(skb, off,
+ offsetof(struct udphdr,
+ check)))
+ err = -EPROTO;
+ return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
+ }
+
+ return ERR_PTR(-EPROTO);
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * maximally sized IP and TCP or UDP headers.
+ */
+#define MAX_IP_HDR_LEN 128
+
+static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
+{
+ unsigned int off;
+ bool fragment;
+ __sum16 *csum;
+ int err;
+
+ fragment = false;
+
+ err = skb_maybe_pull_tail(skb,
+ sizeof(struct iphdr),
+ MAX_IP_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ fragment = true;
+
+ off = ip_hdrlen(skb);
+
+ err = -EPROTO;
+
+ if (fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - off,
+ ip_hdr(skb)->protocol, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/* This value should be large enough to cover a tagged ethernet header plus
+ * an IPv6 header, all options, and a maximal TCP or UDP header.
+ */
+#define MAX_IPV6_HDR_LEN 256
+
+#define OPT_HDR(type, skb, off) \
+ (type *)(skb_network_header(skb) + (off))
+
+static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+ u8 nexthdr;
+ unsigned int off;
+ unsigned int len;
+ bool fragment;
+ bool done;
+ __sum16 *csum;
+
+ fragment = false;
+ done = false;
+
+ off = sizeof(struct ipv6hdr);
+
+ err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ while (off <= len && !done) {
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING: {
+ struct ipv6_opt_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ipv6_opt_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_optlen(hp);
+ break;
+ }
+ case IPPROTO_AH: {
+ struct ip_auth_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct ip_auth_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct ip_auth_hdr, skb, off);
+ nexthdr = hp->nexthdr;
+ off += ipv6_authlen(hp);
+ break;
+ }
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr *hp;
+
+ err = skb_maybe_pull_tail(skb,
+ off +
+ sizeof(struct frag_hdr),
+ MAX_IPV6_HDR_LEN);
+ if (err < 0)
+ goto out;
+
+ hp = OPT_HDR(struct frag_hdr, skb, off);
+
+ if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
+ fragment = true;
+
+ nexthdr = hp->nexthdr;
+ off += sizeof(struct frag_hdr);
+ break;
+ }
+ default:
+ done = true;
+ break;
+ }
+ }
+
+ err = -EPROTO;
+
+ if (!done || fragment)
+ goto out;
+
+ csum = skb_checksum_setup_ip(skb, nexthdr, off);
+ if (IS_ERR(csum))
+ return PTR_ERR(csum);
+
+ if (recalculate)
+ *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - off, nexthdr, 0);
+ err = 0;
+
+out:
+ return err;
+}
+
+/**
+ * skb_checksum_setup - set up partial checksum offset
+ * @skb: the skb to set up
+ * @recalculate: if true the pseudo-header checksum will be recalculated
+ */
+int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
+{
+ int err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ err = skb_checksum_setup_ipv4(skb, recalculate);
+ break;
+
+ case htons(ETH_P_IPV6):
+ err = skb_checksum_setup_ipv6(skb, recalculate);
+ break;
+
+ default:
+ err = -EPROTO;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(skb_checksum_setup);
+
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
+}
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
+{
+ if (head_stolen) {
+ skb_release_head_state(skb);
+ kmem_cache_free(skbuff_head_cache, skb);
+ } else {
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(kfree_skb_partial);
+
+/**
+ * skb_try_coalesce - try to merge skb to prior one
+ * @to: prior buffer
+ * @from: buffer to add
+ * @fragstolen: pointer to boolean
+ * @delta_truesize: how much more was allocated than was requested
+ */
+bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
+ bool *fragstolen, int *delta_truesize)
+{
+ int i, delta, len = from->len;
+
+ *fragstolen = false;
+
+ if (skb_cloned(to))
+ return false;
+
+ if (len <= skb_tailroom(to)) {
+ if (len)
+ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+ *delta_truesize = 0;
+ return true;
+ }
+
+ if (skb_has_frag_list(to) || skb_has_frag_list(from))
+ return false;
+
+ if (skb_headlen(from) != 0) {
+ struct page *page;
+ unsigned int offset;
+
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ return false;
+
+ if (skb_head_is_locked(from))
+ return false;
+
+ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+
+ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+ page, offset, skb_headlen(from));
+ *fragstolen = true;
+ } else {
+ if (skb_shinfo(to)->nr_frags +
+ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+ return false;
+
+ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
+ }
+
+ WARN_ON_ONCE(delta < len);
+
+ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+ skb_shinfo(from)->frags,
+ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+ if (!skb_cloned(from))
+ skb_shinfo(from)->nr_frags = 0;
+
+ /* if the skb is not cloned this does nothing
+ * since we set nr_frags to 0.
+ */
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+ skb_frag_ref(from, i);
+
+ to->truesize += delta;
+ to->len += len;
+ to->data_len += len;
+
+ *delta_truesize = delta;
+ return true;
+}
+EXPORT_SYMBOL(skb_try_coalesce);
+
+/**
+ * skb_scrub_packet - scrub an skb
+ *
+ * @skb: buffer to clean
+ * @xnet: packet is crossing netns
+ *
+ * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * into/from a tunnel. Some information have to be cleared during these
+ * operations.
+ * skb_scrub_packet can also be used to clean a skb before injecting it in
+ * another namespace (@xnet == true). We have to clear all information in the
+ * skb that could impact namespace isolation.
+ */
+void skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->tstamp.tv64 = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ skb_sender_cpu_clear(skb);
+ secpath_reset(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ skb_orphan(skb);
+ skb->mark = 0;
+}
+EXPORT_SYMBOL_GPL(skb_scrub_packet);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
+
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ }
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return thlen + shinfo->gso_size;
+}
+EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+ skb->mac_header += VLAN_HLEN;
+ return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+ struct vlan_hdr *vhdr;
+ u16 vlan_tci;
+
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ /* vlan_tci is already set-up so leave this for another time */
+ return skb;
+ }
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto err_free;
+
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+ goto err_free;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+ skb_pull_rcsum(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vhdr);
+
+ skb = skb_reorder_vlan_header(skb);
+ if (unlikely(!skb))
+ goto err_free;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
+
+int skb_ensure_writable(struct sk_buff *skb, int write_len)
+{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
+ if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+ return 0;
+
+ return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable);
+
+/* remove VLAN header from packet and update csum accordingly. */
+static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+{
+ struct vlan_hdr *vhdr;
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ __skb_push(skb, offset);
+ err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
+ if (unlikely(err))
+ goto pull;
+
+ skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+
+ vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+ *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+ memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+ __skb_pull(skb, VLAN_HLEN);
+
+ vlan_set_encap_proto(skb, vhdr);
+ skb->mac_header += VLAN_HLEN;
+
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
+
+ skb_reset_mac_len(skb);
+pull:
+ __skb_pull(skb, offset);
+
+ return err;
+}
+
+int skb_vlan_pop(struct sk_buff *skb)
+{
+ u16 vlan_tci;
+ __be16 vlan_proto;
+ int err;
+
+ if (likely(skb_vlan_tag_present(skb))) {
+ skb->vlan_tci = 0;
+ } else {
+ if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (err)
+ return err;
+ }
+ /* move next vlan tag to hw accel tag */
+ if (likely((skb->protocol != htons(ETH_P_8021Q) &&
+ skb->protocol != htons(ETH_P_8021AD)) ||
+ skb->len < VLAN_ETH_HLEN))
+ return 0;
+
+ vlan_proto = skb->protocol;
+ err = __skb_vlan_pop(skb, &vlan_tci);
+ if (unlikely(err))
+ return err;
+
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_pop);
+
+int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+{
+ if (skb_vlan_tag_present(skb)) {
+ unsigned int offset = skb->data - skb_mac_header(skb);
+ int err;
+
+ /* __vlan_insert_tag expect skb->data pointing to mac header.
+ * So change skb->data before calling it and change back to
+ * original position later
+ */
+ __skb_push(skb, offset);
+ err = __vlan_insert_tag(skb, skb->vlan_proto,
+ skb_vlan_tag_get(skb));
+ if (err)
+ return err;
+ skb->protocol = skb->vlan_proto;
+ skb->mac_len += VLAN_HLEN;
+ __skb_pull(skb, offset);
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(skb->data
+ + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ }
+ __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+ return 0;
+}
+EXPORT_SYMBOL(skb_vlan_push);
+
+/**
+ * alloc_skb_with_frags - allocate skb with page frags
+ *
+ * @header_len: size of linear part
+ * @data_len: needed length in frags
+ * @max_page_order: max page order desired.
+ * @errcode: pointer to error code if any
+ * @gfp_mask: allocation mask
+ *
+ * This can be used to allocate a paged skb, given a maximal order for frags.
+ */
+struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
+ unsigned long data_len,
+ int max_page_order,
+ int *errcode,
+ gfp_t gfp_mask)
+{
+ int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+ unsigned long chunk;
+ struct sk_buff *skb;
+ struct page *page;
+ gfp_t gfp_head;
+ int i;
+
+ *errcode = -EMSGSIZE;
+ /* Note this test could be relaxed, if we succeed to allocate
+ * high order pages...
+ */
+ if (npages > MAX_SKB_FRAGS)
+ return NULL;
+
+ gfp_head = gfp_mask;
+ if (gfp_head & __GFP_WAIT)
+ gfp_head |= __GFP_REPEAT;
+
+ *errcode = -ENOBUFS;
+ skb = alloc_skb(header_len, gfp_head);
+ if (!skb)
+ return NULL;
+
+ skb->truesize += npages << PAGE_SHIFT;
+
+ for (i = 0; npages > 0; i++) {
+ int order = max_page_order;
+
+ while (order) {
+ if (npages >= 1 << order) {
+ page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
+ order);
+ if (page)
+ goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
+ }
+ order--;
+ }
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
+fill_page:
+ chunk = min_t(unsigned long, data_len,
+ PAGE_SIZE << order);
+ skb_fill_page_desc(skb, i, page, 0, chunk);
+ data_len -= chunk;
+ npages -= 1 << order;
+ }
+ return skb;
+
+failure:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(alloc_skb_with_frags);
diff --git a/net/core/sock.c b/net/core/sock.c
new file mode 100644
index 000000000..dc30dc5bb
--- /dev/null
+++ b/net/core/sock.c
@@ -0,0 +1,2979 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic socket support routines. Memory allocators, socket lock/release
+ * handler for protocols to use and generic option handler.
+ *
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() problems
+ * Alan Cox : Connecting on a connecting socket
+ * now returns an error for tcp.
+ * Alan Cox : sock->protocol is set correctly.
+ * and is not sometimes left as 0.
+ * Alan Cox : connect handles icmp errors on a
+ * connect properly. Unfortunately there
+ * is a restart syscall nasty there. I
+ * can't match BSD without hacking the C
+ * library. Ideas urgently sought!
+ * Alan Cox : Disallow bind() to addresses that are
+ * not ours - especially broadcast ones!!
+ * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
+ * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
+ * instead they leave that for the DESTROY timer.
+ * Alan Cox : Clean up error flag in accept
+ * Alan Cox : TCP ack handling is buggy, the DESTROY timer
+ * was buggy. Put a remove_sock() in the handler
+ * for memory when we hit 0. Also altered the timer
+ * code. The ACK stuff can wait and needs major
+ * TCP layer surgery.
+ * Alan Cox : Fixed TCP ack bug, removed remove sock
+ * and fixed timer/inet_bh race.
+ * Alan Cox : Added zapped flag for TCP
+ * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
+ * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
+ * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
+ * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
+ * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
+ * Rick Sladkey : Relaxed UDP rules for matching packets.
+ * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
+ * Pauline Middelink : identd support
+ * Alan Cox : Fixed connect() taking signals I think.
+ * Alan Cox : SO_LINGER supported
+ * Alan Cox : Error reporting fixes
+ * Anonymous : inet_create tidied up (sk->reuse setting)
+ * Alan Cox : inet sockets don't set sk->type!
+ * Alan Cox : Split socket option code
+ * Alan Cox : Callbacks
+ * Alan Cox : Nagle flag for Charles & Johannes stuff
+ * Alex : Removed restriction on inet fioctl
+ * Alan Cox : Splitting INET from NET core
+ * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
+ * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
+ * Alan Cox : Split IP from generic code
+ * Alan Cox : New kfree_skbmem()
+ * Alan Cox : Make SO_DEBUG superuser only.
+ * Alan Cox : Allow anyone to clear SO_DEBUG
+ * (compatibility fix)
+ * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
+ * Alan Cox : Allocator for a socket is settable.
+ * Alan Cox : SO_ERROR includes soft errors.
+ * Alan Cox : Allow NULL arguments on some SO_ opts
+ * Alan Cox : Generic socket allocation to make hooks
+ * easier (suggested by Craig Metz).
+ * Michael Pall : SO_ERROR returns positive errno again
+ * Steve Whitehouse: Added default destructor to free
+ * protocol private data.
+ * Steve Whitehouse: Added various other default routines
+ * common to several socket families.
+ * Chris Evans : Call suser() check last on F_SETOWN
+ * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
+ * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
+ * Andi Kleen : Fix write_space callback
+ * Chris Evans : Security fixes - signedness again
+ * Arnaldo C. Melo : cleanups, use skb_queue_purge
+ *
+ * To Fix:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/user_namespace.h>
+#include <linux/static_key.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/sock.h>
+#include <linux/net_tstamp.h>
+#include <net/xfrm.h>
+#include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
+
+#include <linux/filter.h>
+
+#include <trace/events/sock.h>
+
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+#endif
+
+#include <net/busy_poll.h>
+
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+/**
+ * sk_ns_capable - General socket capability test
+ * @sk: Socket to use a capability on or through
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in the user
+ * namespace @user_ns.
+ */
+bool sk_ns_capable(const struct sock *sk,
+ struct user_namespace *user_ns, int cap)
+{
+ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(sk_ns_capable);
+
+/**
+ * sk_capable - Socket global capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The global capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was
+ * created and the current process has the capability @cap in all user
+ * namespaces.
+ */
+bool sk_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(sk_capable);
+
+/**
+ * sk_net_capable - Network namespace socket capability test
+ * @sk: Socket to use a capability on or through
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket had when the socket was created
+ * and the current process has the capability @cap over the network namespace
+ * the socket is a member of.
+ */
+bool sk_net_capable(const struct sock *sk, int cap)
+{
+ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(sk_net_capable);
+
+
+#ifdef CONFIG_MEMCG_KMEM
+int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(memcg, ss);
+ if (ret)
+ goto out;
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
+{
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node)
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(memcg);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+#if defined(CONFIG_MEMCG_KMEM)
+struct static_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+#endif
+
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings build-time, so that runtime initialization of socket
+ * locks is fast):
+ */
+static const char *const af_family_key_strings[AF_MAX+1] = {
+ "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
+ "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
+ "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
+ "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
+ "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
+ "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
+ "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
+ "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
+ "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
+ "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
+ "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
+ "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
+ "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
+};
+static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+ "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
+ "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
+ "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
+ "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
+ "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
+ "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
+ "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
+ "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
+ "slock-27" , "slock-28" , "slock-AF_CAN" ,
+ "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
+ "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
+ "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+};
+static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+ "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
+ "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
+ "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
+ "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
+ "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
+ "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
+ "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
+ "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
+ "clock-27" , "clock-28" , "clock-AF_CAN" ,
+ "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
+ "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
+ "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
+};
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
+/* Take into consideration the size of the struct sk_buff overhead in the
+ * determination of these values, since that is non-constant across
+ * platforms. This makes socket queueing behavior and performance
+ * not depend upon such differences.
+ */
+#define _SK_MEM_PACKETS 256
+#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
+#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+
+/* Run time adjustable parameters. */
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+EXPORT_SYMBOL(sysctl_wmem_max);
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+EXPORT_SYMBOL(sysctl_rmem_max);
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+EXPORT_SYMBOL(sysctl_optmem_max);
+
+int sysctl_tstamp_allow_data __read_mostly = 1;
+
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+ static_key_slow_inc(&memalloc_socks);
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+ static_key_slow_dec(&memalloc_socks);
+
+ /*
+ * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+ * progress of swapping. SOCK_MEMALLOC may be cleared while
+ * it has rmem allocations due to the last swapfile being deactivated
+ * but there is a risk that the socket is unusable due to exceeding
+ * the rmem limits. Reclaim the reserves and obey rmem limits again.
+ */
+ sk_mem_reclaim(sk);
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
+static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+{
+ struct timeval tv;
+
+ if (optlen < sizeof(tv))
+ return -EINVAL;
+ if (copy_from_user(&tv, optval, sizeof(tv)))
+ return -EFAULT;
+ if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+ return -EDOM;
+
+ if (tv.tv_sec < 0) {
+ static int warned __read_mostly;
+
+ *timeo_p = 0;
+ if (warned < 10 && net_ratelimit()) {
+ warned++;
+ pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
+ __func__, current->comm, task_pid_nr(current));
+ }
+ return 0;
+ }
+ *timeo_p = MAX_SCHEDULE_TIMEOUT;
+ if (tv.tv_sec == 0 && tv.tv_usec == 0)
+ return 0;
+ if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
+ *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+ return 0;
+}
+
+static void sock_warn_obsolete_bsdism(const char *name)
+{
+ static int warned;
+ static char warncomm[TASK_COMM_LEN];
+ if (strcmp(warncomm, current->comm) && warned < 5) {
+ strcpy(warncomm, current->comm);
+ pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
+ warncomm, name);
+ warned++;
+ }
+}
+
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
+{
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+ net_disable_timestamp();
+ }
+}
+
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ unsigned long flags;
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+ atomic_inc(&sk->sk_drops);
+ trace_sock_rcvqueue_full(sk, skb);
+ return -ENOMEM;
+ }
+
+ err = sk_filter(sk, skb);
+ if (err)
+ return err;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ atomic_inc(&sk->sk_drops);
+ return -ENOBUFS;
+ }
+
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+
+ /* we escape from rcu protected region, make sure we dont leak
+ * a norefcounted dst
+ */
+ skb_dst_force(skb);
+
+ spin_lock_irqsave(&list->lock, flags);
+ sock_skb_set_dropcount(sk, skb);
+ __skb_queue_tail(list, skb);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return 0;
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
+
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+{
+ int rc = NET_RX_SUCCESS;
+
+ if (sk_filter(sk, skb))
+ goto discard_and_relse;
+
+ skb->dev = NULL;
+
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+ if (nested)
+ bh_lock_sock_nested(sk);
+ else
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ /*
+ * trylock + unlock semantics:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+ rc = sk_backlog_rcv(sk, skb);
+
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ atomic_inc(&sk->sk_drops);
+ goto discard_and_relse;
+ }
+
+ bh_unlock_sock(sk);
+out:
+ sock_put(sk);
+ return rc;
+discard_and_relse:
+ kfree_skb(skb);
+ goto out;
+}
+EXPORT_SYMBOL(sk_receive_skb);
+
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_tx_queue_clear(sk);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ sk_dst_reset(sk);
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL(sk_dst_check);
+
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ /* Sorry... */
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (optlen < 0)
+ goto out;
+
+ /* Bind this socket to a particular device like "eth0",
+ * as specified in the passed interface name. If the
+ * name is "" or the option length is zero the socket
+ * is not bound.
+ */
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+ memset(devname, 0, sizeof(devname));
+
+ ret = -EFAULT;
+ if (copy_from_user(devname, optval, optlen))
+ goto out;
+
+ index = 0;
+ if (devname[0] != '\0') {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, devname);
+ if (dev)
+ index = dev->ifindex;
+ rcu_read_unlock();
+ ret = -ENODEV;
+ if (!dev)
+ goto out;
+ }
+
+ lock_sock(sk);
+ sk->sk_bound_dev_if = index;
+ sk_dst_reset(sk);
+ release_sock(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+ int __user *optlen, int len)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+
+ if (sk->sk_bound_dev_if == 0) {
+ len = 0;
+ goto zero;
+ }
+
+ ret = -EINVAL;
+ if (len < IFNAMSIZ)
+ goto out;
+
+ ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ if (ret)
+ goto out;
+
+ len = strlen(devname) + 1;
+
+ ret = -EFAULT;
+ if (copy_to_user(optval, devname, len))
+ goto out;
+
+zero:
+ ret = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
+}
+
+bool sk_mc_loop(struct sock *sk)
+{
+ if (dev_recursion_level())
+ return false;
+ if (!sk)
+ return true;
+ switch (sk->sk_family) {
+ case AF_INET:
+ return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return inet6_sk(sk)->mc_loop;
+#endif
+ }
+ WARN_ON(1);
+ return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
+/*
+ * This is meant for all protocols to use and covers goings on
+ * at the socket level. Everything here is generic.
+ */
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int val;
+ int valbool;
+ struct linger ling;
+ int ret = 0;
+
+ /*
+ * Options without arguments
+ */
+
+ if (optname == SO_BINDTODEVICE)
+ return sock_setbindtodevice(sk, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SO_DEBUG:
+ if (val && !capable(CAP_NET_ADMIN))
+ ret = -EACCES;
+ else
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
+ break;
+ case SO_REUSEADDR:
+ sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+ break;
+ case SO_REUSEPORT:
+ sk->sk_reuseport = valbool;
+ break;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ ret = -ENOPROTOOPT;
+ break;
+ case SO_DONTROUTE:
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ break;
+ case SO_BROADCAST:
+ sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+ break;
+ case SO_SNDBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_wmem_max);
+set_sndbuf:
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ /* Wake up sending tasks if we upped the value. */
+ sk->sk_write_space(sk);
+ break;
+
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_sndbuf;
+
+ case SO_RCVBUF:
+ /* Don't error on this BSD doesn't and if you think
+ * about it this is right. Otherwise apps have to
+ * play 'guess the biggest size' games. RCVBUF/SNDBUF
+ * are treated in BSD as hints
+ */
+ val = min_t(u32, val, sysctl_rmem_max);
+set_rcvbuf:
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ /*
+ * We double it on the way in to account for
+ * "struct sk_buff" etc. overhead. Applications
+ * assume that the SO_RCVBUF setting they make will
+ * allow that much actual data to be received on that
+ * socket.
+ *
+ * Applications are unaware that "struct sk_buff" and
+ * other overheads allocate from the receive buffer
+ * during socket buffer allocation.
+ *
+ * And after considering the possible alternatives,
+ * returning the value we actually used in getsockopt
+ * is the most desirable behavior.
+ */
+ sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
+ break;
+
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
+
+ case SO_KEEPALIVE:
+#ifdef CONFIG_INET
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ tcp_set_keepalive(sk, valbool);
+#endif
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+ case SO_OOBINLINE:
+ sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+ break;
+
+ case SO_NO_CHECK:
+ sk->sk_no_check_tx = valbool;
+ break;
+
+ case SO_PRIORITY:
+ if ((val >= 0 && val <= 6) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sk->sk_priority = val;
+ else
+ ret = -EPERM;
+ break;
+
+ case SO_LINGER:
+ if (optlen < sizeof(ling)) {
+ ret = -EINVAL; /* 1003.1g */
+ break;
+ }
+ if (copy_from_user(&ling, optval, sizeof(ling))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (!ling.l_onoff)
+ sock_reset_flag(sk, SOCK_LINGER);
+ else {
+#if (BITS_PER_LONG == 32)
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ else
+#endif
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ sock_set_flag(sk, SOCK_LINGER);
+ }
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("setsockopt");
+ break;
+
+ case SO_PASSCRED:
+ if (valbool)
+ set_bit(SOCK_PASSCRED, &sock->flags);
+ else
+ clear_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_TIMESTAMP:
+ case SO_TIMESTAMPNS:
+ if (valbool) {
+ if (optname == SO_TIMESTAMP)
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ else
+ sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+ break;
+
+ case SO_TIMESTAMPING:
+ if (val & ~SOF_TIMESTAMPING_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_tskey = tcp_sk(sk)->snd_una;
+ } else {
+ sk->sk_tskey = 0;
+ }
+ }
+ sk->sk_tsflags = val;
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ break;
+
+ case SO_RCVLOWAT:
+ if (val < 0)
+ val = INT_MAX;
+ sk->sk_rcvlowat = val ? : 1;
+ break;
+
+ case SO_RCVTIMEO:
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ break;
+
+ case SO_SNDTIMEO:
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ break;
+
+ case SO_ATTACH_FILTER:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_BPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_attach_bpf(ufd, sk);
+ }
+ break;
+
+ case SO_DETACH_FILTER:
+ ret = sk_detach_filter(sk);
+ break;
+
+ case SO_LOCK_FILTER:
+ if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
+ ret = -EPERM;
+ else
+ sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
+ break;
+
+ case SO_PASSSEC:
+ if (valbool)
+ set_bit(SOCK_PASSSEC, &sock->flags);
+ else
+ clear_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+ case SO_MARK:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ sk->sk_mark = val;
+ break;
+
+ case SO_RXQ_OVFL:
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
+ break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
+ case SO_PEEK_OFF:
+ if (sock->ops->set_peek_off)
+ ret = sock->ops->set_peek_off(sk, val);
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_NOFCS:
+ sock_valbool_flag(sk, SOCK_NOFCS, valbool);
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ /* allow unprivileged users to decrease the value */
+ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else {
+ if (val < 0)
+ ret = -EINVAL;
+ else
+ sk->sk_ll_usec = val;
+ }
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+
+ default:
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+ return ret;
+}
+EXPORT_SYMBOL(sock_setsockopt);
+
+
+static void cred_to_ucred(struct pid *pid, const struct cred *cred,
+ struct ucred *ucred)
+{
+ ucred->pid = pid_vnr(pid);
+ ucred->uid = ucred->gid = -1;
+ if (cred) {
+ struct user_namespace *current_ns = current_user_ns();
+
+ ucred->uid = from_kuid_munged(current_ns, cred->euid);
+ ucred->gid = from_kgid_munged(current_ns, cred->egid);
+ }
+}
+
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ union {
+ int val;
+ struct linger ling;
+ struct timeval tm;
+ } v;
+
+ int lv = sizeof(int);
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ memset(&v, 0, sizeof(v));
+
+ switch (optname) {
+ case SO_DEBUG:
+ v.val = sock_flag(sk, SOCK_DBG);
+ break;
+
+ case SO_DONTROUTE:
+ v.val = sock_flag(sk, SOCK_LOCALROUTE);
+ break;
+
+ case SO_BROADCAST:
+ v.val = sock_flag(sk, SOCK_BROADCAST);
+ break;
+
+ case SO_SNDBUF:
+ v.val = sk->sk_sndbuf;
+ break;
+
+ case SO_RCVBUF:
+ v.val = sk->sk_rcvbuf;
+ break;
+
+ case SO_REUSEADDR:
+ v.val = sk->sk_reuse;
+ break;
+
+ case SO_REUSEPORT:
+ v.val = sk->sk_reuseport;
+ break;
+
+ case SO_KEEPALIVE:
+ v.val = sock_flag(sk, SOCK_KEEPOPEN);
+ break;
+
+ case SO_TYPE:
+ v.val = sk->sk_type;
+ break;
+
+ case SO_PROTOCOL:
+ v.val = sk->sk_protocol;
+ break;
+
+ case SO_DOMAIN:
+ v.val = sk->sk_family;
+ break;
+
+ case SO_ERROR:
+ v.val = -sock_error(sk);
+ if (v.val == 0)
+ v.val = xchg(&sk->sk_err_soft, 0);
+ break;
+
+ case SO_OOBINLINE:
+ v.val = sock_flag(sk, SOCK_URGINLINE);
+ break;
+
+ case SO_NO_CHECK:
+ v.val = sk->sk_no_check_tx;
+ break;
+
+ case SO_PRIORITY:
+ v.val = sk->sk_priority;
+ break;
+
+ case SO_LINGER:
+ lv = sizeof(v.ling);
+ v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
+ v.ling.l_linger = sk->sk_lingertime / HZ;
+ break;
+
+ case SO_BSDCOMPAT:
+ sock_warn_obsolete_bsdism("getsockopt");
+ break;
+
+ case SO_TIMESTAMP:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPNS:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ break;
+
+ case SO_TIMESTAMPING:
+ v.val = sk->sk_tsflags;
+ break;
+
+ case SO_RCVTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ lv = sizeof(struct timeval);
+ if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+ v.tm.tv_sec = 0;
+ v.tm.tv_usec = 0;
+ } else {
+ v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ }
+ break;
+
+ case SO_RCVLOWAT:
+ v.val = sk->sk_rcvlowat;
+ break;
+
+ case SO_SNDLOWAT:
+ v.val = 1;
+ break;
+
+ case SO_PASSCRED:
+ v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ break;
+
+ case SO_PEERCRED:
+ {
+ struct ucred peercred;
+ if (len > sizeof(peercred))
+ len = sizeof(peercred);
+ cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+ if (copy_to_user(optval, &peercred, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ case SO_PEERNAME:
+ {
+ char address[128];
+
+ if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+ return -ENOTCONN;
+ if (lv < len)
+ return -EINVAL;
+ if (copy_to_user(optval, address, len))
+ return -EFAULT;
+ goto lenout;
+ }
+
+ /* Dubious BSD thing... Probably nobody even uses it, but
+ * the UNIX standard wants it for whatever reason... -DaveM
+ */
+ case SO_ACCEPTCONN:
+ v.val = sk->sk_state == TCP_LISTEN;
+ break;
+
+ case SO_PASSSEC:
+ v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ break;
+
+ case SO_PEERSEC:
+ return security_socket_getpeersec_stream(sock, optval, optlen, len);
+
+ case SO_MARK:
+ v.val = sk->sk_mark;
+ break;
+
+ case SO_RXQ_OVFL:
+ v.val = sock_flag(sk, SOCK_RXQ_OVFL);
+ break;
+
+ case SO_WIFI_STATUS:
+ v.val = sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
+ case SO_PEEK_OFF:
+ if (!sock->ops->set_peek_off)
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_peek_off;
+ break;
+ case SO_NOFCS:
+ v.val = sock_flag(sk, SOCK_NOFCS);
+ break;
+
+ case SO_BINDTODEVICE:
+ return sock_getbindtodevice(sk, optval, optlen, len);
+
+ case SO_GET_FILTER:
+ len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ if (len < 0)
+ return len;
+
+ goto lenout;
+
+ case SO_LOCK_FILTER:
+ v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
+ break;
+
+ case SO_BPF_EXTENSIONS:
+ v.val = bpf_tell_extensions();
+ break;
+
+ case SO_SELECT_ERR_QUEUE:
+ v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
+ break;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ v.val = sk->sk_ll_usec;
+ break;
+#endif
+
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
+ break;
+
+ case SO_INCOMING_CPU:
+ v.val = sk->sk_incoming_cpu;
+ break;
+
+ default:
+ /* We implement the SO_SNDLOWAT etc to not be settable
+ * (1003.1g 7).
+ */
+ return -ENOPROTOOPT;
+ }
+
+ if (len > lv)
+ len = lv;
+ if (copy_to_user(optval, &v, len))
+ return -EFAULT;
+lenout:
+ if (put_user(len, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
+ */
+static inline void sock_lock_init(struct sock *sk)
+{
+ sock_lock_init_class_and_name(sk,
+ af_family_slock_key_strings[sk->sk_family],
+ af_family_slock_keys + sk->sk_family,
+ af_family_key_strings[sk->sk_family],
+ af_family_keys + sk->sk_family);
+}
+
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
+ */
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+ void *sptr = nsk->sk_security;
+#endif
+ memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+ memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+#ifdef CONFIG_SECURITY_NETWORK
+ nsk->sk_security = sptr;
+ security_sk_clone(osk, nsk);
+#endif
+}
+
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+ unsigned long nulls1, nulls2;
+
+ nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+ nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+ if (nulls1 > nulls2)
+ swap(nulls1, nulls2);
+
+ if (nulls1 != 0)
+ memset((char *)sk, 0, nulls1);
+ memset((char *)sk + nulls1 + sizeof(void *), 0,
+ nulls2 - nulls1 - sizeof(void *));
+ memset((char *)sk + nulls2 + sizeof(void *), 0,
+ size - nulls2 - sizeof(void *));
+}
+EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+ int family)
+{
+ struct sock *sk;
+ struct kmem_cache *slab;
+
+ slab = prot->slab;
+ if (slab != NULL) {
+ sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+ if (!sk)
+ return sk;
+ if (priority & __GFP_ZERO) {
+ if (prot->clear_sk)
+ prot->clear_sk(sk, prot->obj_size);
+ else
+ sk_prot_clear_nulls(sk, prot->obj_size);
+ }
+ } else
+ sk = kmalloc(prot->obj_size, priority);
+
+ if (sk != NULL) {
+ kmemcheck_annotate_bitfield(sk, flags);
+
+ if (security_sk_alloc(sk, family, priority))
+ goto out_free;
+
+ if (!try_module_get(prot->owner))
+ goto out_free_sec;
+ sk_tx_queue_clear(sk);
+ }
+
+ return sk;
+
+out_free_sec:
+ security_sk_free(sk);
+out_free:
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ return NULL;
+}
+
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+ struct kmem_cache *slab;
+ struct module *owner;
+
+ owner = prot->owner;
+ slab = prot->slab;
+
+ security_sk_free(sk);
+ if (slab != NULL)
+ kmem_cache_free(slab, sk);
+ else
+ kfree(sk);
+ module_put(owner);
+}
+
+#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
+void sock_update_netprioidx(struct sock *sk)
+{
+ if (in_interrupt())
+ return;
+
+ sk->sk_cgrp_prioidx = task_netprioidx(current);
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
+#endif
+
+/**
+ * sk_alloc - All socket objects are allocated here
+ * @net: the applicable net namespace
+ * @family: protocol family
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+ struct proto *prot)
+{
+ struct sock *sk;
+
+ sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+ if (sk) {
+ sk->sk_family = family;
+ /*
+ * See comment in struct sock definition to understand
+ * why we need sk_prot_creator -acme
+ */
+ sk->sk_prot = sk->sk_prot_creator = prot;
+ sock_lock_init(sk);
+ sock_net_set(sk, get_net(net));
+ atomic_set(&sk->sk_wmem_alloc, 1);
+
+ sock_update_classid(sk);
+ sock_update_netprioidx(sk);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL(sk_alloc);
+
+static void __sk_free(struct sock *sk)
+{
+ struct sk_filter *filter;
+
+ if (sk->sk_destruct)
+ sk->sk_destruct(sk);
+
+ filter = rcu_dereference_check(sk->sk_filter,
+ atomic_read(&sk->sk_wmem_alloc) == 0);
+ if (filter) {
+ sk_filter_uncharge(sk, filter);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
+ }
+
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+
+ if (atomic_read(&sk->sk_omem_alloc))
+ pr_debug("%s: optmem leakage (%d bytes) detected\n",
+ __func__, atomic_read(&sk->sk_omem_alloc));
+
+ if (sk->sk_peer_cred)
+ put_cred(sk->sk_peer_cred);
+ put_pid(sk->sk_peer_pid);
+ put_net(sock_net(sk));
+ sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+void sk_free(struct sock *sk)
+{
+ /*
+ * We subtract one from sk_wmem_alloc and can know if
+ * some packets are still in some tx queue.
+ * If not null, sock_wfree() will call __sk_free(sk) later
+ */
+ if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
+
+/*
+ * Last sock_put should drop reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking reference to stopping namespace
+ * is not an option.
+ * Take reference to a socket to remove it from hash _alive_ and after that
+ * destroy it in the context of init_net.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+ if (sk == NULL || sk->sk_socket == NULL)
+ return;
+
+ sock_hold(sk);
+ sock_release(sk->sk_socket);
+ sock_net_set(sk, get_net(&init_net));
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
+static void sk_update_clone(const struct sock *sk, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+ sock_update_memcg(newsk);
+}
+
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+{
+ struct sock *newsk;
+ bool is_charged = true;
+
+ newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+ if (newsk != NULL) {
+ struct sk_filter *filter;
+
+ sock_copy(newsk, sk);
+
+ /* SANITY */
+ get_net(sock_net(newsk));
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ /*
+ * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+ */
+ atomic_set(&newsk->sk_wmem_alloc, 1);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_receive_queue);
+ skb_queue_head_init(&newsk->sk_write_queue);
+
+ spin_lock_init(&newsk->sk_dst_lock);
+ rwlock_init(&newsk->sk_callback_lock);
+ lockdep_set_class_and_name(&newsk->sk_callback_lock,
+ af_callback_keys + newsk->sk_family,
+ af_family_clock_key_strings[newsk->sk_family]);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+ sock_reset_flag(newsk, SOCK_DONE);
+ skb_queue_head_init(&newsk->sk_error_queue);
+
+ filter = rcu_dereference_protected(newsk->sk_filter, 1);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
+
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+ atomic64_set(&newsk->sk_cookie, 0);
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
+ atomic_set(&newsk->sk_refcnt, 2);
+
+ /*
+ * Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ newsk->sk_wq = NULL;
+
+ sk_update_clone(sk, newsk);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
+ }
+out:
+ return newsk;
+}
+EXPORT_SYMBOL_GPL(sk_clone_lock);
+
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+ __sk_dst_set(sk, dst);
+ sk->sk_route_caps = dst->dev->features;
+ if (sk->sk_route_caps & NETIF_F_GSO)
+ sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+ sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (sk_can_gso(sk)) {
+ if (dst->header_len) {
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
+ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
+/*
+ * Simple resource managers for sockets.
+ */
+
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ */
+void sock_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
+
+ if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ /*
+ * Keep a reference on sk_wmem_alloc, this will be released
+ * after sk_write_space() call
+ */
+ atomic_sub(len - 1, &sk->sk_wmem_alloc);
+ sk->sk_write_space(sk);
+ len = 1;
+ }
+ /*
+ * if sk_wmem_alloc reaches 0, we must finish what sk_free()
+ * could not do because of in-flight packets
+ */
+ if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sock_wfree);
+
+void skb_orphan_partial(struct sk_buff *skb)
+{
+ /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
+ * so we do not completely orphan skb, but transfert all
+ * accounted bytes but one, to avoid unexpected reorders.
+ */
+ if (skb->destructor == sock_wfree
+#ifdef CONFIG_INET
+ || skb->destructor == tcp_wfree
+#endif
+ ) {
+ atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
+ skb->truesize = 1;
+ } else {
+ skb_orphan(skb);
+ }
+}
+EXPORT_SYMBOL(skb_orphan_partial);
+
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ */
+void sock_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ unsigned int len = skb->truesize;
+
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, len);
+}
+EXPORT_SYMBOL(sock_rfree);
+
+/*
+ * Buffer destructor for skbs that are not used directly in read or write
+ * path, e.g. for error handler skbs. Automatically called from kfree_skb.
+ */
+void sock_efree(struct sk_buff *skb)
+{
+ sock_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_efree);
+
+kuid_t sock_i_uid(struct sock *sk)
+{
+ kuid_t uid;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
+ read_unlock_bh(&sk->sk_callback_lock);
+ return uid;
+}
+EXPORT_SYMBOL(sock_i_uid);
+
+unsigned long sock_i_ino(struct sock *sk)
+{
+ unsigned long ino;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
+ read_unlock_bh(&sk->sk_callback_lock);
+ return ino;
+}
+EXPORT_SYMBOL(sock_i_ino);
+
+/*
+ * Allocate a skb from the socket's send buffer.
+ */
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+ gfp_t priority)
+{
+ if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ struct sk_buff *skb = alloc_skb(size, priority);
+ if (skb) {
+ skb_set_owner_w(skb, sk);
+ return skb;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(sock_wmalloc);
+
+/*
+ * Allocate a memory block from the socket's option memory buffer.
+ */
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
+{
+ if ((unsigned int)size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ void *mem;
+ /* First do the add, to avoid the race if kmalloc
+ * might sleep.
+ */
+ atomic_add(size, &sk->sk_omem_alloc);
+ mem = kmalloc(size, priority);
+ if (mem)
+ return mem;
+ atomic_sub(size, &sk->sk_omem_alloc);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(sock_kmalloc);
+
+/* Free an option memory block. Note, we actually want the inline
+ * here as this allows gcc to detect the nullify and fold away the
+ * condition entirely.
+ */
+static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
+ const bool nullify)
+{
+ if (WARN_ON_ONCE(!mem))
+ return;
+ if (nullify)
+ kzfree(mem);
+ else
+ kfree(mem);
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, false);
+}
+EXPORT_SYMBOL(sock_kfree_s);
+
+void sock_kzfree_s(struct sock *sk, void *mem, int size)
+{
+ __sock_kfree_s(sk, mem, size, true);
+}
+EXPORT_SYMBOL(sock_kzfree_s);
+
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+ I think, these locks should be removed for datagram sockets.
+ */
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
+{
+ DEFINE_WAIT(wait);
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ for (;;) {
+ if (!timeo)
+ break;
+ if (signal_pending(current))
+ break;
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ break;
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->sk_err)
+ break;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+
+/*
+ * Generic send/receive buffer handlers
+ */
+
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+ unsigned long data_len, int noblock,
+ int *errcode, int max_page_order)
+{
+ struct sk_buff *skb;
+ long timeo;
+ int err;
+
+ timeo = sock_sndtimeo(sk, noblock);
+ for (;;) {
+ err = sock_error(sk);
+ if (err != 0)
+ goto failure;
+
+ err = -EPIPE;
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto failure;
+
+ if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ break;
+
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = sock_wait_for_wmem(sk, timeo);
+ }
+ skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
+ errcode, sk->sk_allocation);
+ if (skb)
+ skb_set_owner_w(skb, sk);
+ return skb;
+
+interrupted:
+ err = sock_intr_errno(timeo);
+failure:
+ *errcode = err;
+ return NULL;
+}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
+
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ int noblock, int *errcode)
+{
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+EXPORT_SYMBOL(sock_alloc_send_skb);
+
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+/**
+ * skb_page_frag_refill - check that a page_frag contains enough room
+ * @sz: minimum size of the fragment we want to get
+ * @pfrag: pointer to page_frag
+ * @gfp: priority for memory allocation
+ *
+ * Note: While this allocator tries to use high order pages, there is
+ * no guarantee that allocations succeed. Therefore, @sz MUST be
+ * less or equal than PAGE_SIZE.
+ */
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+{
+ if (pfrag->page) {
+ if (atomic_read(&pfrag->page->_count) == 1) {
+ pfrag->offset = 0;
+ return true;
+ }
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ put_page(pfrag->page);
+ }
+
+ pfrag->offset = 0;
+ if (SKB_FRAG_PAGE_ORDER) {
+ pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+ return true;
+ }
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(skb_page_frag_refill);
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+ return true;
+
+ sk_enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
+static void __lock_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ schedule();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (!sock_owned_by_user(sk))
+ break;
+ }
+ finish_wait(&sk->sk_lock.wq, &wait);
+}
+
+static void __release_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
+{
+ struct sk_buff *skb = sk->sk_backlog.head;
+
+ do {
+ sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+ bh_unlock_sock(sk);
+
+ do {
+ struct sk_buff *next = skb->next;
+
+ prefetch(next);
+ WARN_ON_ONCE(skb_dst_is_noref(skb));
+ skb->next = NULL;
+ sk_backlog_rcv(sk, skb);
+
+ /*
+ * We are in process context here with softirqs
+ * disabled, use cond_resched_softirq() to preempt.
+ * This is safe to do because we've taken the backlog
+ * queue private:
+ */
+ cond_resched_softirq();
+
+ skb = next;
+ } while (skb != NULL);
+
+ bh_lock_sock(sk);
+ } while ((skb = sk->sk_backlog.head) != NULL);
+
+ /*
+ * Doing the zeroing here guarantee we can not loop forever
+ * while a wild producer attempts to flood us.
+ */
+ sk->sk_backlog.len = 0;
+}
+
+/**
+ * sk_wait_data - wait for data to arrive at sk_receive_queue
+ * @sk: sock to wait on
+ * @timeo: for how long
+ *
+ * Now socket state including sk->sk_err is changed only under lock,
+ * hence we may omit checks after joining wait queue.
+ * We check receive queue before schedule() only as optimization;
+ * it is very likely that release_sock() added new data.
+ */
+int sk_wait_data(struct sock *sk, long *timeo)
+{
+ int rc;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ finish_wait(sk_sleep(sk), &wait);
+ return rc;
+}
+EXPORT_SYMBOL(sk_wait_data);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ int amt = sk_mem_pages(size);
+ long allocated;
+ int parent_status = UNDER_LIMIT;
+
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+
+ /* Under limit. */
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
+ return 1;
+ }
+
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
+
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+ return 1;
+
+ } else { /* SK_MEM_SEND */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+ return 1;
+ } else if (atomic_read(&sk->sk_wmem_alloc) <
+ prot->sysctl_wmem[0])
+ return 1;
+ }
+
+ if (sk_has_memory_pressure(sk)) {
+ int alloc;
+
+ if (!sk_under_memory_pressure(sk))
+ return 1;
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ trace_sock_exceed_buf_limit(sk, prot, allocated);
+
+ /* Alas. Undo changes. */
+ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+
+ sk_memory_allocated_sub(sk, amt);
+
+ return 0;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_reclaim - reclaim memory_allocated
+ * @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
+ sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
+/*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+ * cases where it makes no sense for a protocol to have a "do nothing"
+ * function, some default processing is provided.
+ */
+
+int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_bind);
+
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+ int len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_connect);
+
+int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_socketpair);
+
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_accept);
+
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+ int *len, int peer)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getname);
+
+unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
+{
+ return 0;
+}
+EXPORT_SYMBOL(sock_no_poll);
+
+int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_ioctl);
+
+int sock_no_listen(struct socket *sock, int backlog)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_listen);
+
+int sock_no_shutdown(struct socket *sock, int how)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_shutdown);
+
+int sock_no_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_setsockopt);
+
+int sock_no_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getsockopt);
+
+int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_sendmsg);
+
+int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
+ int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_recvmsg);
+
+int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+ /* Mirror missing mmap method error code */
+ return -ENODEV;
+}
+EXPORT_SYMBOL(sock_no_mmap);
+
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+ ssize_t res;
+ struct msghdr msg = {.msg_flags = flags};
+ struct kvec iov;
+ char *kaddr = kmap(page);
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ res = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ kunmap(page);
+ return res;
+}
+EXPORT_SYMBOL(sock_no_sendpage);
+
+/*
+ * Default Socket Callbacks
+ */
+
+static void sock_def_wakeup(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static void sock_def_error_report(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, POLLERR);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ rcu_read_unlock();
+}
+
+static void sock_def_readable(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+ POLLRDNORM | POLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+static void sock_def_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+
+ rcu_read_unlock();
+}
+
+static void sock_def_destruct(struct sock *sk)
+{
+ kfree(sk->sk_protinfo);
+}
+
+void sk_send_sigurg(struct sock *sk)
+{
+ if (sk->sk_socket && sk->sk_socket->file)
+ if (send_sigurg(&sk->sk_socket->file->f_owner))
+ sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
+}
+EXPORT_SYMBOL(sk_send_sigurg);
+
+void sk_reset_timer(struct sock *sk, struct timer_list* timer,
+ unsigned long expires)
+{
+ if (!mod_timer(timer, expires))
+ sock_hold(sk);
+}
+EXPORT_SYMBOL(sk_reset_timer);
+
+void sk_stop_timer(struct sock *sk, struct timer_list* timer)
+{
+ if (del_timer(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ skb_queue_head_init(&sk->sk_receive_queue);
+ skb_queue_head_init(&sk->sk_write_queue);
+ skb_queue_head_init(&sk->sk_error_queue);
+
+ sk->sk_send_head = NULL;
+
+ init_timer(&sk->sk_timer);
+
+ sk->sk_allocation = GFP_KERNEL;
+ sk->sk_rcvbuf = sysctl_rmem_default;
+ sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_state = TCP_CLOSE;
+ sk_set_socket(sk, sock);
+
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ if (sock) {
+ sk->sk_type = sock->type;
+ sk->sk_wq = sock->wq;
+ sock->sk = sk;
+ } else
+ sk->sk_wq = NULL;
+
+ spin_lock_init(&sk->sk_dst_lock);
+ rwlock_init(&sk->sk_callback_lock);
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
+
+ sk->sk_state_change = sock_def_wakeup;
+ sk->sk_data_ready = sock_def_readable;
+ sk->sk_write_space = sock_def_write_space;
+ sk->sk_error_report = sock_def_error_report;
+ sk->sk_destruct = sock_def_destruct;
+
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ sk->sk_peek_off = -1;
+
+ sk->sk_peer_pid = NULL;
+ sk->sk_peer_cred = NULL;
+ sk->sk_write_pending = 0;
+ sk->sk_rcvlowat = 1;
+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ sk->sk_stamp = ktime_set(-1L, 0);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ sk->sk_napi_id = 0;
+ sk->sk_ll_usec = sysctl_net_busy_read;
+#endif
+
+ sk->sk_max_pacing_rate = ~0U;
+ sk->sk_pacing_rate = ~0U;
+ /*
+ * Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.txt for details)
+ */
+ smp_wmb();
+ atomic_set(&sk->sk_refcnt, 1);
+ atomic_set(&sk->sk_drops, 0);
+}
+EXPORT_SYMBOL(sock_init_data);
+
+void lock_sock_nested(struct sock *sk, int subclass)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_lock.owned)
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(lock_sock_nested);
+
+void release_sock(struct sock *sk)
+{
+ /*
+ * The sk_lock has mutex_unlock() semantics:
+ */
+ mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+ spin_lock_bh(&sk->sk_lock.slock);
+ if (sk->sk_backlog.tail)
+ __release_sock(sk);
+
+ /* Warning : release_cb() might need to release sk ownership,
+ * ie call sock_release_ownership(sk) before us.
+ */
+ if (sk->sk_prot->release_cb)
+ sk->sk_prot->release_cb(sk);
+
+ sock_release_ownership(sk);
+ if (waitqueue_active(&sk->sk_lock.wq))
+ wake_up(&sk->sk_lock.wq);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+EXPORT_SYMBOL(release_sock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small section, where process wont block
+ * return false if fast path is taken
+ * sk_lock.slock locked, owned = 0, BH disabled
+ * return true if slow path is taken
+ * sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+bool lock_sock_fast(struct sock *sk)
+{
+ might_sleep();
+ spin_lock_bh(&sk->sk_lock.slock);
+
+ if (!sk->sk_lock.owned)
+ /*
+ * Note : We must disable BH
+ */
+ return false;
+
+ __lock_sock(sk);
+ sk->sk_lock.owned = 1;
+ spin_unlock(&sk->sk_lock.slock);
+ /*
+ * The sk_lock has mutex_lock() semantics here:
+ */
+ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+ local_bh_enable();
+ return true;
+}
+EXPORT_SYMBOL(lock_sock_fast);
+
+int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+ struct timeval tv;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ tv = ktime_to_timeval(sk->sk_stamp);
+ if (tv.tv_sec == -1)
+ return -ENOENT;
+ if (tv.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ tv = ktime_to_timeval(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestamp);
+
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+ struct timespec ts;
+ if (!sock_flag(sk, SOCK_TIMESTAMP))
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ ts = ktime_to_timespec(sk->sk_stamp);
+ if (ts.tv_sec == -1)
+ return -ENOENT;
+ if (ts.tv_sec == 0) {
+ sk->sk_stamp = ktime_get_real();
+ ts = ktime_to_timespec(sk->sk_stamp);
+ }
+ return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
+void sock_enable_timestamp(struct sock *sk, int flag)
+{
+ if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
+ sock_set_flag(sk, flag);
+ /*
+ * we just set one of the two flags which require net
+ * time stamping, but time stamping might have been on
+ * already because of the other one
+ */
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+ net_enable_timestamp();
+ }
+}
+
+int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
+ int level, int type)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ int copied, err;
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+ put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL(sock_recv_errqueue);
+
+/*
+ * Get a socket option on an socket.
+ *
+ * FIX: POSIX 1003.1g is very ambiguous here. It states that
+ * asynchronous errors should be reported by getsockopt. We assume
+ * this means if you specify SO_ERROR (otherwise whats the point of it).
+ */
+int sock_common_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_getsockopt != NULL)
+ return sk->sk_prot->compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
+int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(sock_common_recvmsg);
+
+/*
+ * Set socket options on an inet socket.
+ */
+int sock_common_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->compat_setsockopt != NULL)
+ return sk->sk_prot->compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
+void sk_common_release(struct sock *sk)
+{
+ if (sk->sk_prot->destroy)
+ sk->sk_prot->destroy(sk);
+
+ /*
+ * Observation: when sock_common_release is called, processes have
+ * no access to socket. But net still has.
+ * Step one, detach it from networking:
+ *
+ * A. Remove from hash tables.
+ */
+
+ sk->sk_prot->unhash(sk);
+
+ /*
+ * In this point socket cannot receive new packets, but it is possible
+ * that some packets are in flight because some CPU runs receiver and
+ * did hash table lookup before we unhashed socket. They will achieve
+ * receive queue and will be purged by socket destructor.
+ *
+ * Also we still have packets pending on receive queue and probably,
+ * our own packets waiting in device queues. sock_destroy will drain
+ * receive queue, but transmitted packets will delay socket destruction
+ * until the last reference will be released.
+ */
+
+ sock_orphan(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_common_release);
+
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+struct prot_inuse {
+ int val[PROTO_INUSE_NR];
+};
+
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+static int __net_init sock_inuse_init_net(struct net *net)
+{
+ net->core.inuse = alloc_percpu(struct prot_inuse);
+ return net->core.inuse ? 0 : -ENOMEM;
+}
+
+static void __net_exit sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
+{
+ if (register_pernet_subsys(&net_inuse_ops))
+ panic("Cannot initialize net inuse counters");
+
+ return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu(prot_inuse, cpu).val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+static void assign_proto_idx(struct proto *prot)
+{
+ prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ pr_err("PROTO_INUSE_NR exhausted\n");
+ return;
+ }
+
+ set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+ if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+#endif
+
+static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
+{
+ if (!rsk_prot)
+ return;
+ kfree(rsk_prot->slab_name);
+ rsk_prot->slab_name = NULL;
+ if (rsk_prot->slab) {
+ kmem_cache_destroy(rsk_prot->slab);
+ rsk_prot->slab = NULL;
+ }
+}
+
+static int req_prot_init(const struct proto *prot)
+{
+ struct request_sock_ops *rsk_prot = prot->rsk_prot;
+
+ if (!rsk_prot)
+ return 0;
+
+ rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
+ prot->name);
+ if (!rsk_prot->slab_name)
+ return -ENOMEM;
+
+ rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
+ rsk_prot->obj_size, 0,
+ 0, NULL);
+
+ if (!rsk_prot->slab) {
+ pr_crit("%s: Can't create request sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int proto_register(struct proto *prot, int alloc_slab)
+{
+ if (alloc_slab) {
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
+ SLAB_HWCACHE_ALIGN | prot->slab_flags,
+ NULL);
+
+ if (prot->slab == NULL) {
+ pr_crit("%s: Can't create sock SLAB cache!\n",
+ prot->name);
+ goto out;
+ }
+
+ if (req_prot_init(prot))
+ goto out_free_request_sock_slab;
+
+ if (prot->twsk_prot != NULL) {
+ prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
+
+ if (prot->twsk_prot->twsk_slab_name == NULL)
+ goto out_free_request_sock_slab;
+
+ prot->twsk_prot->twsk_slab =
+ kmem_cache_create(prot->twsk_prot->twsk_slab_name,
+ prot->twsk_prot->twsk_obj_size,
+ 0,
+ prot->slab_flags,
+ NULL);
+ if (prot->twsk_prot->twsk_slab == NULL)
+ goto out_free_timewait_sock_slab_name;
+ }
+ }
+
+ mutex_lock(&proto_list_mutex);
+ list_add(&prot->node, &proto_list);
+ assign_proto_idx(prot);
+ mutex_unlock(&proto_list_mutex);
+ return 0;
+
+out_free_timewait_sock_slab_name:
+ kfree(prot->twsk_prot->twsk_slab_name);
+out_free_request_sock_slab:
+ req_prot_cleanup(prot->rsk_prot);
+
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+out:
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(proto_register);
+
+void proto_unregister(struct proto *prot)
+{
+ mutex_lock(&proto_list_mutex);
+ release_proto_idx(prot);
+ list_del(&prot->node);
+ mutex_unlock(&proto_list_mutex);
+
+ if (prot->slab != NULL) {
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
+
+ req_prot_cleanup(prot->rsk_prot);
+
+ if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+ kmem_cache_destroy(prot->twsk_prot->twsk_slab);
+ kfree(prot->twsk_prot->twsk_slab_name);
+ prot->twsk_prot->twsk_slab = NULL;
+ }
+}
+EXPORT_SYMBOL(proto_unregister);
+
+#ifdef CONFIG_PROC_FS
+static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_mutex)
+{
+ mutex_lock(&proto_list_mutex);
+ return seq_list_start_head(&proto_list, *pos);
+}
+
+static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &proto_list, pos);
+}
+
+static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_mutex)
+{
+ mutex_unlock(&proto_list_mutex);
+}
+
+static char proto_method_implemented(const void *method)
+{
+ return method == NULL ? 'n' : 'y';
+}
+static long sock_prot_memory_allocated(struct proto *proto)
+{
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
+
+static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
+{
+
+ seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+ proto->name,
+ proto->obj_size,
+ sock_prot_inuse_get(seq_file_net(seq), proto),
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
+ proto->max_header,
+ proto->slab == NULL ? "no" : "yes",
+ module_name(proto->owner),
+ proto_method_implemented(proto->close),
+ proto_method_implemented(proto->connect),
+ proto_method_implemented(proto->disconnect),
+ proto_method_implemented(proto->accept),
+ proto_method_implemented(proto->ioctl),
+ proto_method_implemented(proto->init),
+ proto_method_implemented(proto->destroy),
+ proto_method_implemented(proto->shutdown),
+ proto_method_implemented(proto->setsockopt),
+ proto_method_implemented(proto->getsockopt),
+ proto_method_implemented(proto->sendmsg),
+ proto_method_implemented(proto->recvmsg),
+ proto_method_implemented(proto->sendpage),
+ proto_method_implemented(proto->bind),
+ proto_method_implemented(proto->backlog_rcv),
+ proto_method_implemented(proto->hash),
+ proto_method_implemented(proto->unhash),
+ proto_method_implemented(proto->get_port),
+ proto_method_implemented(proto->enter_memory_pressure));
+}
+
+static int proto_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == &proto_list)
+ seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
+ "protocol",
+ "size",
+ "sockets",
+ "memory",
+ "press",
+ "maxhdr",
+ "slab",
+ "module",
+ "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+ else
+ proto_seq_printf(seq, list_entry(v, struct proto, node));
+ return 0;
+}
+
+static const struct seq_operations proto_seq_ops = {
+ .start = proto_seq_start,
+ .next = proto_seq_next,
+ .stop = proto_seq_stop,
+ .show = proto_seq_show,
+};
+
+static int proto_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &proto_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations proto_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = proto_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+ if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+ remove_proc_entry("protocols", net->proc_net);
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+ .init = proto_init_net,
+ .exit = proto_exit_net,
+};
+
+static int __init proto_init(void)
+{
+ return register_pernet_subsys(&proto_net_ops);
+}
+
+subsys_initcall(proto_init);
+
+#endif /* PROC_FS */
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 000000000..74dddf84a
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,248 @@
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <net/sock.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+
+static u64 sock_gen_cookie(struct sock *sk)
+{
+ while (1) {
+ u64 res = atomic64_read(&sk->sk_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+ atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+ }
+}
+
+int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
+{
+ u64 res;
+
+ if (cookie[0] == INET_DIAG_NOCOOKIE && cookie[1] == INET_DIAG_NOCOOKIE)
+ return 0;
+
+ res = sock_gen_cookie(sk);
+ if ((u32)res != cookie[0] || (u32)(res >> 32) != cookie[1])
+ return -ESTALE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(struct sock *sk, __u32 *cookie)
+{
+ u64 res = sock_gen_cookie(sk);
+
+ cookie[0] = (u32)res;
+ cookie[1] = (u32)(res >> 32);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ u32 mem[SK_MEMINFO_VARS];
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+
+ return nla_put(skb, attrtype, sizeof(mem), &mem);
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
+ struct sk_buff *skb, int attrtype)
+{
+ struct sock_fprog_kern *fprog;
+ struct sk_filter *filter;
+ struct nlattr *attr;
+ unsigned int flen;
+ int err = 0;
+
+ if (!may_report_filterinfo) {
+ nla_reserve(skb, attrtype, 0);
+ return 0;
+ }
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (!filter)
+ goto out;
+
+ fprog = filter->prog->orig_prog;
+ flen = bpf_classic_proglen(fprog);
+
+ attr = nla_reserve(skb, attrtype, flen);
+ if (attr == NULL) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(nla_data(attr), fprog->filter, flen);
+out:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(sock_diag_put_filterinfo);
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(const struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(const struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = nlmsg_data(nlh);
+ const struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ if (req->sdiag_family >= AF_MAX)
+ return -EINVAL;
+
+ if (sock_diag_handlers[req->sdiag_family] == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, req->sdiag_family);
+
+ mutex_lock(&sock_diag_table_mutex);
+ hndl = sock_diag_handlers[req->sdiag_family];
+ if (hndl == NULL)
+ err = -ENOENT;
+ else
+ err = hndl->dump(skb, nlh);
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ return __sock_diag_rcv_msg(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+static int __net_init diag_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = sock_diag_rcv,
+ };
+
+ net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
+ return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+ netlink_kernel_release(net->diag_nlsk);
+ net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+ .init = diag_net_init,
+ .exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+ return register_pernet_subsys(&diag_net_ops);
+}
+
+static void __exit sock_diag_exit(void)
+{
+ unregister_pernet_subsys(&diag_net_ops);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
diff --git a/net/core/stream.c b/net/core/stream.c
new file mode 100644
index 000000000..301c05f26
--- /dev/null
+++ b/net/core/stream.c
@@ -0,0 +1,208 @@
+/*
+ * SUCS NET3:
+ *
+ * Generic stream handling routines. These are generic for most
+ * protocols. Even IP. Tonight 8-).
+ * This is used because TCP, LLC (others too) layer all have mostly
+ * identical sendmsg() and recvmsg() code.
+ * So we (will) share it here.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * (from old tcp.c code)
+ * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/signal.h>
+#include <linux/tcp.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+/**
+ * sk_stream_write_space - stream socket write_space callback.
+ * @sk: socket
+ *
+ * FIXME: write proper description
+ */
+void sk_stream_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct socket_wq *wq;
+
+ if (sk_stream_is_writeable(sk) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(sk_stream_write_space);
+
+/**
+ * sk_stream_wait_connect - Wait for a socket to get into the connected state
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ */
+int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
+{
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ int done;
+
+ do {
+ int err = sock_error(sk);
+ if (err)
+ return err;
+ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
+ return -EPIPE;
+ if (!*timeo_p)
+ return -EAGAIN;
+ if (signal_pending(tsk))
+ return sock_intr_errno(*timeo_p);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending++;
+ done = sk_wait_event(sk, timeo_p,
+ !sk->sk_err &&
+ !((1 << sk->sk_state) &
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
+ finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending--;
+ } while (!done);
+ return 0;
+}
+EXPORT_SYMBOL(sk_stream_wait_connect);
+
+/**
+ * sk_stream_closing - Return 1 if we still have things to send in our buffers.
+ * @sk: socket to verify
+ */
+static inline int sk_stream_closing(struct sock *sk)
+{
+ return (1 << sk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
+}
+
+void sk_stream_wait_close(struct sock *sk, long timeout)
+{
+ if (timeout) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+EXPORT_SYMBOL(sk_stream_wait_close);
+
+/**
+ * sk_stream_wait_memory - Wait for more memory for a socket
+ * @sk: socket to wait for memory
+ * @timeo_p: for how long
+ */
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+ int err = 0;
+ long vm_wait = 0;
+ long current_timeo = *timeo_p;
+ DEFINE_WAIT(wait);
+
+ if (sk_stream_memory_free(sk))
+ current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+
+ while (1) {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+ if (!*timeo_p)
+ goto do_nonblock;
+ if (signal_pending(current))
+ goto do_interrupted;
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ if (sk_stream_memory_free(sk) && !vm_wait)
+ break;
+
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_pending++;
+ sk_wait_event(sk, &current_timeo, sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) &&
+ !vm_wait));
+ sk->sk_write_pending--;
+
+ if (vm_wait) {
+ vm_wait -= current_timeo;
+ current_timeo = *timeo_p;
+ if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+ (current_timeo -= vm_wait) < 0)
+ current_timeo = 0;
+ vm_wait = 0;
+ }
+ *timeo_p = current_timeo;
+ }
+out:
+ finish_wait(sk_sleep(sk), &wait);
+ return err;
+
+do_error:
+ err = -EPIPE;
+ goto out;
+do_nonblock:
+ err = -EAGAIN;
+ goto out;
+do_interrupted:
+ err = sock_intr_errno(*timeo_p);
+ goto out;
+}
+EXPORT_SYMBOL(sk_stream_wait_memory);
+
+int sk_stream_error(struct sock *sk, int flags, int err)
+{
+ if (err == -EPIPE)
+ err = sock_error(sk) ? : -EPIPE;
+ if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ return err;
+}
+EXPORT_SYMBOL(sk_stream_error);
+
+void sk_stream_kill_queues(struct sock *sk)
+{
+ /* First the read buffer. */
+ __skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Next, the error queue. */
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ /* Next, the write queue. */
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+
+ /* Account for returned memory. */
+ sk_mem_reclaim(sk);
+
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+ /* It is _impossible_ for the backlog to contain anything
+ * when we get here. All user references to this socket
+ * have gone away, only the net layer knows can touch it.
+ */
+}
+EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
new file mode 100644
index 000000000..95b6139d7
--- /dev/null
+++ b/net/core/sysctl_net_core.c
@@ -0,0 +1,464 @@
+/* -*- linux-c -*-
+ * sysctl_net_core.c: sysctl interface to net core subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/core directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kmemleak.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+#include <net/busy_poll.h>
+#include <net/pkt_sched.h>
+
+static int zero = 0;
+static int one = 1;
+static int min_sndbuf = SOCK_MIN_SNDBUF;
+static int min_rcvbuf = SOCK_MIN_RCVBUF;
+
+static int net_msg_warn; /* Unused, but still a sysctl */
+
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ struct ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+ lockdep_is_held(&sock_flow_mutex));
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<29) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+ rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ if (sock_table)
+ static_key_slow_inc(&rps_needed);
+ if (orig_sock_table) {
+ static_key_slow_dec(&rps_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
+#ifdef CONFIG_NET_FLOW_LIMIT
+static DEFINE_MUTEX(flow_limit_update_mutex);
+
+static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sd_flow_limit *cur;
+ struct softnet_data *sd;
+ cpumask_var_t mask;
+ int i, len, ret = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (write) {
+ ret = cpumask_parse_user(buffer, *lenp, mask);
+ if (ret)
+ goto done;
+
+ mutex_lock(&flow_limit_update_mutex);
+ len = sizeof(*cur) + netdev_flow_limit_table_len;
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ cur = rcu_dereference_protected(sd->flow_limit,
+ lockdep_is_held(&flow_limit_update_mutex));
+ if (cur && !cpumask_test_cpu(i, mask)) {
+ RCU_INIT_POINTER(sd->flow_limit, NULL);
+ synchronize_rcu();
+ kfree(cur);
+ } else if (!cur && cpumask_test_cpu(i, mask)) {
+ cur = kzalloc_node(len, GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cur) {
+ /* not unwinding previous changes */
+ ret = -ENOMEM;
+ goto write_unlock;
+ }
+ cur->num_buckets = netdev_flow_limit_table_len;
+ rcu_assign_pointer(sd->flow_limit, cur);
+ }
+ }
+write_unlock:
+ mutex_unlock(&flow_limit_update_mutex);
+ } else {
+ char kbuf[128];
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ goto done;
+ }
+
+ cpumask_clear(mask);
+ rcu_read_lock();
+ for_each_possible_cpu(i) {
+ sd = &per_cpu(softnet_data, i);
+ if (rcu_dereference(sd->flow_limit))
+ cpumask_set_cpu(i, mask);
+ }
+ rcu_read_unlock();
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ goto done;
+ }
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ if (copy_to_user(buffer, kbuf, len)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+
+done:
+ free_cpumask_var(mask);
+ return ret;
+}
+
+static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old, *ptr;
+ int ret;
+
+ mutex_lock(&flow_limit_update_mutex);
+
+ ptr = table->data;
+ old = *ptr;
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write && !is_power_of_2(*ptr)) {
+ *ptr = old;
+ ret = -EINVAL;
+ }
+
+ mutex_unlock(&flow_limit_update_mutex);
+ return ret;
+}
+#endif /* CONFIG_NET_FLOW_LIMIT */
+
+#ifdef CONFIG_NET_SCHED
+static int set_default_qdisc(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char id[IFNAMSIZ];
+ struct ctl_table tbl = {
+ .data = id,
+ .maxlen = IFNAMSIZ,
+ };
+ int ret;
+
+ qdisc_get_default(id, IFNAMSIZ);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = qdisc_set_default(id);
+ return ret;
+}
+#endif
+
+static int proc_do_rss_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table fake_table;
+ char buf[NETDEV_RSS_KEY_LEN * 3];
+
+ snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
+ fake_table.data = buf;
+ fake_table.maxlen = sizeof(buf);
+ return proc_dostring(&fake_table, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table net_core_table[] = {
+#ifdef CONFIG_NET
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "dev_weight",
+ .data = &weight_p,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "netdev_max_backlog",
+ .data = &netdev_max_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "netdev_rss_key",
+ .data = &netdev_rss_key,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_do_rss_key,
+ },
+#ifdef CONFIG_BPF_JIT
+ {
+ .procname = "bpf_jit_enable",
+ .data = &bpf_jit_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "netdev_tstamp_prequeue",
+ .data = &netdev_tstamp_prequeue,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "message_cost",
+ .data = &net_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "message_burst",
+ .data = &net_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "optmem_max",
+ .data = &sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tstamp_allow_data",
+ .data = &sysctl_tstamp_allow_data,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one
+ },
+#ifdef CONFIG_RPS
+ {
+ .procname = "rps_sock_flow_entries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rps_sock_flow_sysctl
+ },
+#endif
+#ifdef CONFIG_NET_FLOW_LIMIT
+ {
+ .procname = "flow_limit_cpu_bitmap",
+ .mode = 0644,
+ .proc_handler = flow_limit_cpu_sysctl
+ },
+ {
+ .procname = "flow_limit_table_len",
+ .data = &netdev_flow_limit_table_len,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = flow_limit_table_len_sysctl
+ },
+#endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ {
+ .procname = "busy_poll",
+ .data = &sysctl_net_busy_poll,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "busy_read",
+ .data = &sysctl_net_busy_read,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+#ifdef CONFIG_NET_SCHED
+ {
+ .procname = "default_qdisc",
+ .mode = 0644,
+ .maxlen = IFNAMSIZ,
+ .proc_handler = set_default_qdisc
+ },
+#endif
+#endif /* CONFIG_NET */
+ {
+ .procname = "netdev_budget",
+ .data = &netdev_budget,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "warnings",
+ .data = &net_msg_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ { }
+};
+
+static struct ctl_table netns_core_table[] = {
+ {
+ .procname = "somaxconn",
+ .data = &init_net.core.sysctl_somaxconn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = &zero,
+ .proc_handler = proc_dointvec_minmax
+ },
+ { }
+};
+
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ net->core.sysctl_somaxconn = SOMAXCONN;
+
+ tbl = netns_core_table;
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_dup;
+
+ tbl[0].data = &net->core.sysctl_somaxconn;
+
+ /* Don't export any sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ tbl[0].procname = NULL;
+ }
+ }
+
+ net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
+ if (net->core.sysctl_hdr == NULL)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (tbl != netns_core_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->core.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->core.sysctl_hdr);
+ BUG_ON(tbl == netns_core_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_core_ops = {
+ .init = sysctl_core_net_init,
+ .exit = sysctl_core_net_exit,
+};
+
+static __init int sysctl_core_init(void)
+{
+ register_net_sysctl(&init_net, "net/core", net_core_table);
+ return register_pernet_subsys(&sysctl_core_ops);
+}
+
+fs_initcall(sysctl_core_init);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 000000000..43d3dd62f
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,80 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+
+static unsigned int classify(const struct sk_buff *skb)
+{
+ if (likely(skb->dev && skb->dev->phydev &&
+ skb->dev->phydev->drv))
+ return ptp_classify_raw(skb);
+ else
+ return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ struct sk_buff *clone;
+ unsigned int type;
+
+ if (!skb->sk)
+ return;
+
+ type = classify(skb);
+ if (type == PTP_CLASS_NONE)
+ return;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->txtstamp)) {
+ clone = skb_clone_sk(skb);
+ if (!clone)
+ return;
+ phydev->drv->txtstamp(phydev, clone, type);
+ }
+}
+EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+ struct phy_device *phydev;
+ unsigned int type;
+
+ if (skb_headroom(skb) < ETH_HLEN)
+ return false;
+ __skb_push(skb, ETH_HLEN);
+
+ type = classify(skb);
+
+ __skb_pull(skb, ETH_HLEN);
+
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->rxtstamp))
+ return phydev->drv->rxtstamp(phydev, skb, type);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
diff --git a/net/core/tso.c b/net/core/tso.c
new file mode 100644
index 000000000..630b30b4f
--- /dev/null
+++ b/net/core/tso.c
@@ -0,0 +1,78 @@
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/tso.h>
+#include <asm/unaligned.h>
+
+/* Calculate expected number of TX descriptors */
+int tso_count_descs(struct sk_buff *skb)
+{
+ /* The Marvell Way */
+ return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
+}
+EXPORT_SYMBOL(tso_count_descs);
+
+void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+ int size, bool is_last)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int mac_hdr_len = skb_network_offset(skb);
+
+ memcpy(hdr, skb->data, hdr_len);
+ iph = (struct iphdr *)(hdr + mac_hdr_len);
+ iph->id = htons(tso->ip_id);
+ iph->tot_len = htons(size + hdr_len - mac_hdr_len);
+ tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+ tso->ip_id++;
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+}
+EXPORT_SYMBOL(tso_build_hdr);
+
+void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+{
+ tso->tcp_seq += size;
+ tso->size -= size;
+ tso->data += size;
+
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_build_data);
+
+void tso_start(struct sk_buff *skb, struct tso_t *tso)
+{
+ int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+
+ tso->ip_id = ntohs(ip_hdr(skb)->id);
+ tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->next_frag_idx = 0;
+
+ /* Build first data */
+ tso->size = skb_headlen(skb) - hdr_len;
+ tso->data = skb->data + hdr_len;
+ if ((tso->size == 0) &&
+ (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
+
+ /* Move to next segment */
+ tso->size = frag->size;
+ tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->next_frag_idx++;
+ }
+}
+EXPORT_SYMBOL(tso_start);
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 000000000..7b803884c
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,384 @@
+/*
+ * Generic address resultion entity
+ *
+ * Authors:
+ * net_random Alan Cox
+ * net_ratelimit Andi Kleen
+ * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
+ *
+ * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/ratelimit.h>
+
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+
+#include <asm/byteorder.h>
+#include <asm/uaccess.h>
+
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
+/*
+ * All net warning printk()s should be guarded by this function.
+ */
+int net_ratelimit(void)
+{
+ return __ratelimit(&net_ratelimit_state);
+}
+EXPORT_SYMBOL(net_ratelimit);
+
+/*
+ * Convert an ASCII string to binary IP.
+ * This is outside of net/ipv4/ because various code that uses IP addresses
+ * is otherwise not dependent on the TCP/IP stack.
+ */
+
+__be32 in_aton(const char *str)
+{
+ unsigned long l;
+ unsigned int val;
+ int i;
+
+ l = 0;
+ for (i = 0; i < 4; i++) {
+ l <<= 8;
+ if (*str != '\0') {
+ val = 0;
+ while (*str != '\0' && *str != '.' && *str != '\n') {
+ val *= 10;
+ val += *str - '0';
+ str++;
+ }
+ l |= val;
+ if (*str != '\0')
+ str++;
+ }
+ }
+ return htonl(l);
+}
+EXPORT_SYMBOL(in_aton);
+
+#define IN6PTON_XDIGIT 0x00010000
+#define IN6PTON_DIGIT 0x00020000
+#define IN6PTON_COLON_MASK 0x00700000
+#define IN6PTON_COLON_1 0x00100000 /* single : requested */
+#define IN6PTON_COLON_2 0x00200000 /* second : requested */
+#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */
+#define IN6PTON_DOT 0x00800000 /* . */
+#define IN6PTON_DELIM 0x10000000
+#define IN6PTON_NULL 0x20000000 /* first/tail */
+#define IN6PTON_UNKNOWN 0x40000000
+
+static inline int xdigit2bin(char c, int delim)
+{
+ int val;
+
+ if (c == delim || c == '\0')
+ return IN6PTON_DELIM;
+ if (c == ':')
+ return IN6PTON_COLON_MASK;
+ if (c == '.')
+ return IN6PTON_DOT;
+
+ val = hex_to_bin(c);
+ if (val >= 0)
+ return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
+ if (delim == -1)
+ return IN6PTON_DELIM;
+ return IN6PTON_UNKNOWN;
+}
+
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
+int in4_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s;
+ u8 *d;
+ u8 dbuf[4];
+ int ret = 0;
+ int i;
+ int w = 0;
+
+ if (srclen < 0)
+ srclen = strlen(src);
+ s = src;
+ d = dbuf;
+ i = 0;
+ while(1) {
+ int c;
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
+ goto out;
+ }
+ if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (w == 0)
+ goto out;
+ *d++ = w & 0xff;
+ w = 0;
+ i++;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ if (i != 4)
+ goto out;
+ break;
+ }
+ goto cont;
+ }
+ w = (w * 10) + c;
+ if ((w & 0xffff) > 255) {
+ goto out;
+ }
+cont:
+ if (i >= 4)
+ goto out;
+ s++;
+ srclen--;
+ }
+ ret = 1;
+ memcpy(dst, dbuf, sizeof(dbuf));
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in4_pton);
+
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
+int in6_pton(const char *src, int srclen,
+ u8 *dst,
+ int delim, const char **end)
+{
+ const char *s, *tok = NULL;
+ u8 *d, *dc = NULL;
+ u8 dbuf[16];
+ int ret = 0;
+ int i;
+ int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
+ int w = 0;
+
+ memset(dbuf, 0, sizeof(dbuf));
+
+ s = src;
+ d = dbuf;
+ if (srclen < 0)
+ srclen = strlen(src);
+
+ while (1) {
+ int c;
+
+ c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+ if (!(c & state))
+ goto out;
+ if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+ /* process one 16-bit word */
+ if (!(state & IN6PTON_NULL)) {
+ *d++ = (w >> 8) & 0xff;
+ *d++ = w & 0xff;
+ }
+ w = 0;
+ if (c & IN6PTON_DELIM) {
+ /* We've processed last word */
+ break;
+ }
+ /*
+ * COLON_1 => XDIGIT
+ * COLON_2 => XDIGIT|DELIM
+ * COLON_1_2 => COLON_2
+ */
+ switch (state & IN6PTON_COLON_MASK) {
+ case IN6PTON_COLON_2:
+ dc = d;
+ state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+ if (dc - dbuf >= sizeof(dbuf))
+ state |= IN6PTON_NULL;
+ break;
+ case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+ state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+ break;
+ case IN6PTON_COLON_1:
+ state = IN6PTON_XDIGIT;
+ break;
+ case IN6PTON_COLON_1_2:
+ state = IN6PTON_COLON_2;
+ break;
+ default:
+ state = 0;
+ }
+ tok = s + 1;
+ goto cont;
+ }
+
+ if (c & IN6PTON_DOT) {
+ ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+ if (ret > 0) {
+ d += 4;
+ break;
+ }
+ goto out;
+ }
+
+ w = (w << 4) | (0xff & c);
+ state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+ if (!(w & 0xf000)) {
+ state |= IN6PTON_XDIGIT;
+ }
+ if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_COLON_1_2;
+ state &= ~IN6PTON_DELIM;
+ }
+ if (d + 2 >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+ }
+cont:
+ if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+ d + 4 == dbuf + sizeof(dbuf)) {
+ state |= IN6PTON_DOT;
+ }
+ if (d >= dbuf + sizeof(dbuf)) {
+ state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+ }
+ s++;
+ srclen--;
+ }
+
+ i = 15; d--;
+
+ if (dc) {
+ while(d >= dc)
+ dst[i--] = *d--;
+ while(i >= dc - dbuf)
+ dst[i--] = 0;
+ while(i >= 0)
+ dst[i--] = *d--;
+ } else
+ memcpy(dst, dbuf, sizeof(dbuf));
+
+ ret = 1;
+out:
+ if (end)
+ *end = s;
+ return ret;
+}
+EXPORT_SYMBOL(in6_pton);
+
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, int pseudohdr)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
+ to));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
+ to));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
+
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
+struct __net_random_once_work {
+ struct work_struct work;
+ struct static_key *key;
+};
+
+static void __net_random_once_deferred(struct work_struct *w)
+{
+ struct __net_random_once_work *work =
+ container_of(w, struct __net_random_once_work, work);
+ BUG_ON(!static_key_enabled(work->key));
+ static_key_slow_dec(work->key);
+ kfree(work);
+}
+
+static void __net_random_once_disable_jump(struct static_key *key)
+{
+ struct __net_random_once_work *w;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return;
+
+ INIT_WORK(&w->work, __net_random_once_deferred);
+ w->key = key;
+ schedule_work(&w->work);
+}
+
+bool __net_get_random_once(void *buf, int nbytes, bool *done,
+ struct static_key *once_key)
+{
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&lock, flags);
+ if (*done) {
+ spin_unlock_irqrestore(&lock, flags);
+ return false;
+ }
+
+ get_random_bytes(buf, nbytes);
+ *done = true;
+ spin_unlock_irqrestore(&lock, flags);
+
+ __net_random_once_disable_jump(once_key);
+
+ return true;
+}
+EXPORT_SYMBOL(__net_get_random_once);
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
new file mode 100644
index 000000000..4066d59c8
--- /dev/null
+++ b/net/dcb/Kconfig
@@ -0,0 +1,22 @@
+config DCB
+ bool "Data Center Bridging support"
+ default n
+ ---help---
+ This enables support for configuring Data Center Bridging (DCB)
+ features on DCB capable Ethernet adapters via rtnetlink. Say 'Y'
+ if you have a DCB capable Ethernet adapter which supports this
+ interface and you are connected to a DCB capable switch.
+
+ DCB is a collection of Ethernet enhancements which allow DCB capable
+ NICs and switches to support network traffic with differing
+ requirements (highly reliable, no drops vs. best effort vs. low
+ latency) to co-exist on Ethernet.
+
+ DCB features include:
+ Enhanced Transmission Selection (aka Priority Grouping) - provides a
+ framework for assigning bandwidth guarantees to traffic classes.
+ Priority-based Flow Control (PFC) - a MAC control pause frame which
+ works at the granularity of the 802.1p priority instead of the
+ link (802.3x).
+
+ If unsure, say N.
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
new file mode 100644
index 000000000..c1282c9e6
--- /dev/null
+++ b/net/dcb/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
new file mode 100644
index 000000000..a520d8004
--- /dev/null
+++ b/net/dcb/dcbevent.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <net/dcbevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain);
+
+int register_dcbevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(register_dcbevent_notifier);
+
+int unregister_dcbevent_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(unregister_dcbevent_notifier);
+
+int call_dcbevent_notifiers(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&dcbevent_notif_chain, val, v);
+}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
new file mode 100644
index 000000000..5b21f6f88
--- /dev/null
+++ b/net/dcb/dcbnl.c
@@ -0,0 +1,1968 @@
+/*
+ * Copyright (c) 2008-2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <linux/dcbnl.h>
+#include <net/dcbevent.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <net/sock.h>
+
+/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
+ * intended to allow network traffic with differing requirements
+ * (highly reliable, no drops vs. best effort vs. low latency) to operate
+ * and co-exist on Ethernet. Current DCB features are:
+ *
+ * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
+ * framework for assigning bandwidth guarantees to traffic classes.
+ *
+ * Priority-based Flow Control (PFC) - provides a flow control mechanism which
+ * can work independently for each 802.1p priority.
+ *
+ * Congestion Notification - provides a mechanism for end-to-end congestion
+ * control for protocols which do not have built-in congestion management.
+ *
+ * More information about the emerging standards for these Ethernet features
+ * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
+ *
+ * This file implements an rtnetlink interface to allow configuration of DCB
+ * features for capable devices.
+ */
+
+MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>");
+MODULE_DESCRIPTION("Data Center Bridging netlink interface");
+MODULE_LICENSE("GPL");
+
+/**************** DCB attribute policies *************************************/
+
+/* DCB netlink attributes policy */
+static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
+ [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
+ [DCB_ATTR_STATE] = {.type = NLA_U8},
+ [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED},
+ [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED},
+ [DCB_ATTR_SET_ALL] = {.type = NLA_U8},
+ [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
+ [DCB_ATTR_CAP] = {.type = NLA_NESTED},
+ [DCB_ATTR_PFC_STATE] = {.type = NLA_U8},
+ [DCB_ATTR_BCN] = {.type = NLA_NESTED},
+ [DCB_ATTR_APP] = {.type = NLA_NESTED},
+ [DCB_ATTR_IEEE] = {.type = NLA_NESTED},
+ [DCB_ATTR_DCBX] = {.type = NLA_U8},
+ [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED},
+};
+
+/* DCB priority flow control to User Priority nested attributes */
+static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
+ [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8},
+ [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB priority grouping nested attributes */
+static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
+ [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED},
+ [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8},
+ [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB traffic class nested attributes. */
+static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
+ [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8},
+ [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB capabilities nested attributes. */
+static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
+ [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_CAP_ATTR_PG] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PFC] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_GSP] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_BCN] = {.type = NLA_U8},
+ [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8},
+};
+
+/* DCB capabilities nested attributes. */
+static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
+ [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8},
+ [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8},
+};
+
+/* DCB BCN nested attributes. */
+static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
+ [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8},
+ [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG},
+ [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_BETA] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_GD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_GI] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_TD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_W] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RD] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RU] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_RI] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_C] = {.type = NLA_U32},
+ [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB APP nested attributes. */
+static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = {
+ [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8},
+ [DCB_APP_ATTR_ID] = {.type = NLA_U16},
+ [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8},
+};
+
+/* IEEE 802.1Qaz nested attributes. */
+static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
+ [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)},
+ [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)},
+ [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED},
+ [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)},
+ [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
+ [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
+};
+
+static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = {
+ [DCB_ATTR_IEEE_APP] = {.len = sizeof(struct dcb_app)},
+};
+
+/* DCB number of traffic classes nested attributes. */
+static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
+ [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG},
+ [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8},
+ [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8},
+ [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8},
+};
+
+static LIST_HEAD(dcb_app_list);
+static DEFINE_SPINLOCK(dcb_lock);
+
+static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq,
+ u32 flags, struct nlmsghdr **nlhp)
+{
+ struct sk_buff *skb;
+ struct dcbmsg *dcb;
+ struct nlmsghdr *nlh;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags);
+ BUG_ON(!nlh);
+
+ dcb = nlmsg_data(nlh);
+ dcb->dcb_family = AF_UNSPEC;
+ dcb->cmd = cmd;
+ dcb->dcb_pad = 0;
+
+ if (nlhp)
+ *nlhp = nlh;
+
+ return skb;
+}
+
+static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */
+ if (!netdev->dcbnl_ops->getstate)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_STATE,
+ netdev->dcbnl_ops->getstate(netdev));
+}
+
+static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_PFC_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getpfccfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start(skb, DCB_ATTR_PFC_CFG);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_PFC_UP_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0,
+ &value);
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 perm_addr[MAX_ADDR_LEN];
+
+ if (!netdev->dcbnl_ops->getpermhwaddr)
+ return -EOPNOTSUPP;
+
+ memset(perm_addr, 0, sizeof(perm_addr));
+ netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);
+
+ return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr);
+}
+
+static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_CAP])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getcap)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP],
+ dcbnl_cap_nest);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start(skb, DCB_ATTR_CAP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_CAP_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) {
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret;
+ int i;
+ int getall = 0;
+
+ if (!tb[DCB_ATTR_NUMTCS])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getnumtcs)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start(skb, DCB_ATTR_NUMTCS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_NUMTCS_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value);
+ if (!ret) {
+ ret = nla_put_u8(skb, i, value);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+ } else
+ return -EINVAL;
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1];
+ int ret;
+ u8 value;
+ int i;
+
+ if (!tb[DCB_ATTR_NUMTCS])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setnumtcs)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest);
+ if (ret)
+ return ret;
+
+ for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+ if (data[i] == NULL)
+ continue;
+
+ value = nla_get_u8(data[i]);
+
+ ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value);
+ if (ret)
+ break;
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret);
+}
+
+static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ if (!netdev->dcbnl_ops->getpfcstate)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_STATE,
+ netdev->dcbnl_ops->getpfcstate(netdev));
+}
+
+static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!tb[DCB_ATTR_PFC_STATE])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpfcstate)
+ return -EOPNOTSUPP;
+
+ value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]);
+
+ netdev->dcbnl_ops->setpfcstate(netdev, value);
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0);
+}
+
+static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *app_nest;
+ struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
+ u16 id;
+ u8 up, idtype;
+ int ret;
+
+ if (!tb[DCB_ATTR_APP])
+ return -EINVAL;
+
+ ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
+ dcbnl_app_nest);
+ if (ret)
+ return ret;
+
+ /* all must be non-null */
+ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) ||
+ (!app_tb[DCB_APP_ATTR_ID]))
+ return -EINVAL;
+
+ /* either by eth type or by socket number */
+ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
+ if ((idtype != DCB_APP_IDTYPE_ETHTYPE) &&
+ (idtype != DCB_APP_IDTYPE_PORTNUM))
+ return -EINVAL;
+
+ id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
+
+ if (netdev->dcbnl_ops->getapp) {
+ ret = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+ if (ret < 0)
+ return ret;
+ else
+ up = ret;
+ } else {
+ struct dcb_app app = {
+ .selector = idtype,
+ .protocol = id,
+ };
+ up = dcb_getapp(netdev, &app);
+ }
+
+ app_nest = nla_nest_start(skb, DCB_ATTR_APP);
+ if (!app_nest)
+ return -EMSGSIZE;
+
+ ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype);
+ if (ret)
+ goto out_cancel;
+
+ ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id);
+ if (ret)
+ goto out_cancel;
+
+ ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up);
+ if (ret)
+ goto out_cancel;
+
+ nla_nest_end(skb, app_nest);
+
+ return 0;
+
+out_cancel:
+ nla_nest_cancel(skb, app_nest);
+ return ret;
+}
+
+static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ int ret;
+ u16 id;
+ u8 up, idtype;
+ struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
+
+ if (!tb[DCB_ATTR_APP])
+ return -EINVAL;
+
+ ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
+ dcbnl_app_nest);
+ if (ret)
+ return ret;
+
+ /* all must be non-null */
+ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) ||
+ (!app_tb[DCB_APP_ATTR_ID]) ||
+ (!app_tb[DCB_APP_ATTR_PRIORITY]))
+ return -EINVAL;
+
+ /* either by eth type or by socket number */
+ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
+ if ((idtype != DCB_APP_IDTYPE_ETHTYPE) &&
+ (idtype != DCB_APP_IDTYPE_PORTNUM))
+ return -EINVAL;
+
+ id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
+ up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]);
+
+ if (netdev->dcbnl_ops->setapp) {
+ ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up);
+ if (ret < 0)
+ return ret;
+ } else {
+ struct dcb_app app;
+ app.selector = idtype;
+ app.protocol = id;
+ app.priority = up;
+ ret = dcb_setapp(netdev, &app);
+ }
+
+ ret = nla_put_u8(skb, DCB_ATTR_APP, ret);
+ dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0);
+
+ return ret;
+}
+
+static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ struct nlattr **tb, struct sk_buff *skb, int dir)
+{
+ struct nlattr *pg_nest, *param_nest, *data;
+ struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+ struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+ u8 prio, pgid, tc_pct, up_map;
+ int ret;
+ int getall = 0;
+ int i;
+
+ if (!tb[DCB_ATTR_PG_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getpgtccfgtx ||
+ !netdev->dcbnl_ops->getpgtccfgrx ||
+ !netdev->dcbnl_ops->getpgbwgcfgtx ||
+ !netdev->dcbnl_ops->getpgbwgcfgrx)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+ if (ret)
+ return ret;
+
+ pg_nest = nla_nest_start(skb, DCB_ATTR_PG_CFG);
+ if (!pg_nest)
+ return -EMSGSIZE;
+
+ if (pg_tb[DCB_PG_ATTR_TC_ALL])
+ getall = 1;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ if (!getall && !pg_tb[i])
+ continue;
+
+ if (pg_tb[DCB_PG_ATTR_TC_ALL])
+ data = pg_tb[DCB_PG_ATTR_TC_ALL];
+ else
+ data = pg_tb[i];
+ ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
+ data, dcbnl_tc_param_nest);
+ if (ret)
+ goto err_pg;
+
+ param_nest = nla_nest_start(skb, i);
+ if (!param_nest)
+ goto err_pg;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->getpgtccfgrx(netdev,
+ i - DCB_PG_ATTR_TC_0, &prio,
+ &pgid, &tc_pct, &up_map);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->getpgtccfgtx(netdev,
+ i - DCB_PG_ATTR_TC_0, &prio,
+ &pgid, &tc_pct, &up_map);
+ }
+
+ if (param_tb[DCB_TC_ATTR_PARAM_PGID] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_PGID, pgid);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb,
+ DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
+ if (ret)
+ goto err_param;
+ }
+ if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] ||
+ param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+ ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT,
+ tc_pct);
+ if (ret)
+ goto err_param;
+ }
+ nla_nest_end(skb, param_nest);
+ }
+
+ if (pg_tb[DCB_PG_ATTR_BW_ID_ALL])
+ getall = 1;
+ else
+ getall = 0;
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ if (!getall && !pg_tb[i])
+ continue;
+
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->getpgbwgcfgrx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->getpgbwgcfgtx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+ }
+ ret = nla_put_u8(skb, i, tc_pct);
+ if (ret)
+ goto err_pg;
+ }
+
+ nla_nest_end(skb, pg_nest);
+
+ return 0;
+
+err_param:
+ nla_nest_cancel(skb, param_nest);
+err_pg:
+ nla_nest_cancel(skb, pg_nest);
+
+ return -EMSGSIZE;
+}
+
+static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0);
+}
+
+static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1);
+}
+
+static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!tb[DCB_ATTR_STATE])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setstate)
+ return -EOPNOTSUPP;
+
+ value = nla_get_u8(tb[DCB_ATTR_STATE]);
+
+ return nla_put_u8(skb, DCB_ATTR_STATE,
+ netdev->dcbnl_ops->setstate(netdev, value));
+}
+
+static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1];
+ int i;
+ int ret;
+ u8 value;
+
+ if (!tb[DCB_ATTR_PFC_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpfccfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest);
+ if (ret)
+ return ret;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ if (data[i] == NULL)
+ continue;
+ value = nla_get_u8(data[i]);
+ netdev->dcbnl_ops->setpfccfg(netdev,
+ data[i]->nla_type - DCB_PFC_UP_ATTR_0, value);
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0);
+}
+
+static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ int ret;
+
+ if (!tb[DCB_ATTR_SET_ALL])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setall)
+ return -EOPNOTSUPP;
+
+ ret = nla_put_u8(skb, DCB_ATTR_SET_ALL,
+ netdev->dcbnl_ops->setall(netdev));
+ dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0);
+
+ return ret;
+}
+
+static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb,
+ int dir)
+{
+ struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+ struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+ int ret;
+ int i;
+ u8 pgid;
+ u8 up_map;
+ u8 prio;
+ u8 tc_pct;
+
+ if (!tb[DCB_ATTR_PG_CFG])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setpgtccfgtx ||
+ !netdev->dcbnl_ops->setpgtccfgrx ||
+ !netdev->dcbnl_ops->setpgbwgcfgtx ||
+ !netdev->dcbnl_ops->setpgbwgcfgrx)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+ if (ret)
+ return ret;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ if (!pg_tb[i])
+ continue;
+
+ ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
+ pg_tb[i], dcbnl_tc_param_nest);
+ if (ret)
+ return ret;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO])
+ prio =
+ nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_PGID])
+ pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT])
+ tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]);
+
+ if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING])
+ up_map =
+ nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]);
+
+ /* dir: Tx = 0, Rx = 1 */
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->setpgtccfgrx(netdev,
+ i - DCB_PG_ATTR_TC_0,
+ prio, pgid, tc_pct, up_map);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->setpgtccfgtx(netdev,
+ i - DCB_PG_ATTR_TC_0,
+ prio, pgid, tc_pct, up_map);
+ }
+ }
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ if (!pg_tb[i])
+ continue;
+
+ tc_pct = nla_get_u8(pg_tb[i]);
+
+ /* dir: Tx = 0, Rx = 1 */
+ if (dir) {
+ /* Rx */
+ netdev->dcbnl_ops->setpgbwgcfgrx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+ } else {
+ /* Tx */
+ netdev->dcbnl_ops->setpgbwgcfgtx(netdev,
+ i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+ }
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0);
+}
+
+static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0);
+}
+
+static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1);
+}
+
+static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *bcn_nest;
+ struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1];
+ u8 value_byte;
+ u32 value_integer;
+ int ret;
+ bool getall = false;
+ int i;
+
+ if (!tb[DCB_ATTR_BCN])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->getbcnrp ||
+ !netdev->dcbnl_ops->getbcncfg)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN], dcbnl_bcn_nest);
+ if (ret)
+ return ret;
+
+ bcn_nest = nla_nest_start(skb, DCB_ATTR_BCN);
+ if (!bcn_nest)
+ return -EMSGSIZE;
+
+ if (bcn_tb[DCB_BCN_ATTR_ALL])
+ getall = true;
+
+ for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+ if (!getall && !bcn_tb[i])
+ continue;
+
+ netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0,
+ &value_byte);
+ ret = nla_put_u8(skb, i, value_byte);
+ if (ret)
+ goto err_bcn;
+ }
+
+ for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+ if (!getall && !bcn_tb[i])
+ continue;
+
+ netdev->dcbnl_ops->getbcncfg(netdev, i,
+ &value_integer);
+ ret = nla_put_u32(skb, i, value_integer);
+ if (ret)
+ goto err_bcn;
+ }
+
+ nla_nest_end(skb, bcn_nest);
+
+ return 0;
+
+err_bcn:
+ nla_nest_cancel(skb, bcn_nest);
+ return ret;
+}
+
+static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_BCN_ATTR_MAX + 1];
+ int i;
+ int ret;
+ u8 value_byte;
+ u32 value_int;
+
+ if (!tb[DCB_ATTR_BCN])
+ return -EINVAL;
+
+ if (!netdev->dcbnl_ops->setbcncfg ||
+ !netdev->dcbnl_ops->setbcnrp)
+ return -EOPNOTSUPP;
+
+ ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN],
+ dcbnl_pfc_up_nest);
+ if (ret)
+ return ret;
+
+ for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+ if (data[i] == NULL)
+ continue;
+ value_byte = nla_get_u8(data[i]);
+ netdev->dcbnl_ops->setbcnrp(netdev,
+ data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte);
+ }
+
+ for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+ if (data[i] == NULL)
+ continue;
+ value_int = nla_get_u32(data[i]);
+ netdev->dcbnl_ops->setbcncfg(netdev,
+ i, value_int);
+ }
+
+ return nla_put_u8(skb, DCB_ATTR_BCN, 0);
+}
+
+static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
+ int app_nested_type, int app_info_type,
+ int app_entry_type)
+{
+ struct dcb_peer_app_info info;
+ struct dcb_app *table = NULL;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ u16 app_count;
+ int err;
+
+
+ /**
+ * retrieve the peer app configuration form the driver. If the driver
+ * handlers fail exit without doing anything
+ */
+ err = ops->peer_getappinfo(netdev, &info, &app_count);
+ if (!err && app_count) {
+ table = kmalloc(sizeof(struct dcb_app) * app_count, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ err = ops->peer_getapptable(netdev, table);
+ }
+
+ if (!err) {
+ u16 i;
+ struct nlattr *app;
+
+ /**
+ * build the message, from here on the only possible failure
+ * is due to the skb size
+ */
+ err = -EMSGSIZE;
+
+ app = nla_nest_start(skb, app_nested_type);
+ if (!app)
+ goto nla_put_failure;
+
+ if (app_info_type &&
+ nla_put(skb, app_info_type, sizeof(info), &info))
+ goto nla_put_failure;
+
+ for (i = 0; i < app_count; i++) {
+ if (nla_put(skb, app_entry_type, sizeof(struct dcb_app),
+ &table[i]))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, app);
+ }
+ err = 0;
+
+nla_put_failure:
+ kfree(table);
+ return err;
+}
+
+/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */
+static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct nlattr *ieee, *app;
+ struct dcb_app_type *itr;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ int dcbx;
+ int err;
+
+ if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
+ return -EMSGSIZE;
+
+ ieee = nla_nest_start(skb, DCB_ATTR_IEEE);
+ if (!ieee)
+ return -EMSGSIZE;
+
+ if (ops->ieee_getets) {
+ struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
+ err = ops->ieee_getets(netdev, &ets);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets))
+ return -EMSGSIZE;
+ }
+
+ if (ops->ieee_getmaxrate) {
+ struct ieee_maxrate maxrate;
+ memset(&maxrate, 0, sizeof(maxrate));
+ err = ops->ieee_getmaxrate(netdev, &maxrate);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE,
+ sizeof(maxrate), &maxrate);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getqcn) {
+ struct ieee_qcn qcn;
+
+ memset(&qcn, 0, sizeof(qcn));
+ err = ops->ieee_getqcn(netdev, &qcn);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_QCN,
+ sizeof(qcn), &qcn);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getqcnstats) {
+ struct ieee_qcn_stats qcn_stats;
+
+ memset(&qcn_stats, 0, sizeof(qcn_stats));
+ err = ops->ieee_getqcnstats(netdev, &qcn_stats);
+ if (!err) {
+ err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS,
+ sizeof(qcn_stats), &qcn_stats);
+ if (err)
+ return -EMSGSIZE;
+ }
+ }
+
+ if (ops->ieee_getpfc) {
+ struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->ieee_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc))
+ return -EMSGSIZE;
+ }
+
+ app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
+ if (!app)
+ return -EMSGSIZE;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
+ &itr->app);
+ if (err) {
+ spin_unlock_bh(&dcb_lock);
+ return -EMSGSIZE;
+ }
+ }
+ }
+
+ if (netdev->dcbnl_ops->getdcbx)
+ dcbx = netdev->dcbnl_ops->getdcbx(netdev);
+ else
+ dcbx = -EOPNOTSUPP;
+
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_end(skb, app);
+
+ /* get peer info if available */
+ if (ops->ieee_peer_getets) {
+ struct ieee_ets ets;
+ memset(&ets, 0, sizeof(ets));
+ err = ops->ieee_peer_getets(netdev, &ets);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets))
+ return -EMSGSIZE;
+ }
+
+ if (ops->ieee_peer_getpfc) {
+ struct ieee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->ieee_peer_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc))
+ return -EMSGSIZE;
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_IEEE_PEER_APP,
+ DCB_ATTR_IEEE_APP_UNSPEC,
+ DCB_ATTR_IEEE_APP);
+ if (err)
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, ieee);
+ if (dcbx >= 0) {
+ err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
+ if (err)
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
+ int dir)
+{
+ u8 pgid, up_map, prio, tc_pct;
+ const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+ int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG;
+ struct nlattr *pg = nla_nest_start(skb, i);
+
+ if (!pg)
+ return -EMSGSIZE;
+
+ for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+ struct nlattr *tc_nest = nla_nest_start(skb, i);
+
+ if (!tc_nest)
+ return -EMSGSIZE;
+
+ pgid = DCB_ATTR_VALUE_UNDEFINED;
+ prio = DCB_ATTR_VALUE_UNDEFINED;
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+ up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (!dir)
+ ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0,
+ &prio, &pgid, &tc_pct, &up_map);
+ else
+ ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0,
+ &prio, &pgid, &tc_pct, &up_map);
+
+ if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) ||
+ nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct))
+ return -EMSGSIZE;
+ nla_nest_end(skb, tc_nest);
+ }
+
+ for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+ tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+ if (!dir)
+ ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0,
+ &tc_pct);
+ else
+ ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0,
+ &tc_pct);
+ if (nla_put_u8(skb, i, tc_pct))
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, pg);
+ return 0;
+}
+
+static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct nlattr *cee, *app;
+ struct dcb_app_type *itr;
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ int dcbx, i, err = -EMSGSIZE;
+ u8 value;
+
+ if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
+ goto nla_put_failure;
+ cee = nla_nest_start(skb, DCB_ATTR_CEE);
+ if (!cee)
+ goto nla_put_failure;
+
+ /* local pg */
+ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) {
+ err = dcbnl_cee_pg_fill(skb, netdev, 1);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) {
+ err = dcbnl_cee_pg_fill(skb, netdev, 0);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ /* local pfc */
+ if (ops->getpfccfg) {
+ struct nlattr *pfc_nest = nla_nest_start(skb, DCB_ATTR_CEE_PFC);
+
+ if (!pfc_nest)
+ goto nla_put_failure;
+
+ for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+ ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value);
+ if (nla_put_u8(skb, i, value))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, pfc_nest);
+ }
+
+ /* local app */
+ spin_lock_bh(&dcb_lock);
+ app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE);
+ if (!app)
+ goto dcb_unlock;
+
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ struct nlattr *app_nest = nla_nest_start(skb,
+ DCB_ATTR_APP);
+ if (!app_nest)
+ goto dcb_unlock;
+
+ err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE,
+ itr->app.selector);
+ if (err)
+ goto dcb_unlock;
+
+ err = nla_put_u16(skb, DCB_APP_ATTR_ID,
+ itr->app.protocol);
+ if (err)
+ goto dcb_unlock;
+
+ err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY,
+ itr->app.priority);
+ if (err)
+ goto dcb_unlock;
+
+ nla_nest_end(skb, app_nest);
+ }
+ }
+ nla_nest_end(skb, app);
+
+ if (netdev->dcbnl_ops->getdcbx)
+ dcbx = netdev->dcbnl_ops->getdcbx(netdev);
+ else
+ dcbx = -EOPNOTSUPP;
+
+ spin_unlock_bh(&dcb_lock);
+
+ /* features flags */
+ if (ops->getfeatcfg) {
+ struct nlattr *feat = nla_nest_start(skb, DCB_ATTR_CEE_FEAT);
+ if (!feat)
+ goto nla_put_failure;
+
+ for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX;
+ i++)
+ if (!ops->getfeatcfg(netdev, i, &value) &&
+ nla_put_u8(skb, i, value))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, feat);
+ }
+
+ /* peer info if available */
+ if (ops->cee_peer_getpg) {
+ struct cee_pg pg;
+ memset(&pg, 0, sizeof(pg));
+ err = ops->cee_peer_getpg(netdev, &pg);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg))
+ goto nla_put_failure;
+ }
+
+ if (ops->cee_peer_getpfc) {
+ struct cee_pfc pfc;
+ memset(&pfc, 0, sizeof(pfc));
+ err = ops->cee_peer_getpfc(netdev, &pfc);
+ if (!err &&
+ nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc))
+ goto nla_put_failure;
+ }
+
+ if (ops->peer_getappinfo && ops->peer_getapptable) {
+ err = dcbnl_build_peer_app(netdev, skb,
+ DCB_ATTR_CEE_PEER_APP_TABLE,
+ DCB_ATTR_CEE_PEER_APP_INFO,
+ DCB_ATTR_CEE_PEER_APP);
+ if (err)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, cee);
+
+ /* DCBX state */
+ if (dcbx >= 0) {
+ err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
+ if (err)
+ goto nla_put_failure;
+ }
+ return 0;
+
+dcb_unlock:
+ spin_unlock_bh(&dcb_lock);
+nla_put_failure:
+ return err;
+}
+
+static int dcbnl_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid, int dcbx_ver)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE)
+ err = dcbnl_ieee_fill(skb, dev);
+ else
+ err = dcbnl_cee_fill(skb, dev);
+
+ if (err < 0) {
+ /* Report error to broadcast listeners */
+ nlmsg_free(skb);
+ rtnl_set_sk_err(net, RTNLGRP_DCB, err);
+ } else {
+ /* End nlmsg and notify broadcast listeners */
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL);
+ }
+
+ return err;
+}
+
+int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid)
+{
+ return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE);
+}
+EXPORT_SYMBOL(dcbnl_ieee_notify);
+
+int dcbnl_cee_notify(struct net_device *dev, int event, int cmd,
+ u32 seq, u32 portid)
+{
+ return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE);
+}
+EXPORT_SYMBOL(dcbnl_cee_notify);
+
+/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands.
+ * If any requested operation can not be completed
+ * the entire msg is aborted and error value is returned.
+ * No attempt is made to reconcile the case where only part of the
+ * cmd can be completed.
+ */
+static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_IEEE])
+ return -EINVAL;
+
+ err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+ if (err)
+ return err;
+
+ if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
+ struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
+ err = ops->ieee_setets(netdev, ets);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) {
+ struct ieee_maxrate *maxrate =
+ nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]);
+ err = ops->ieee_setmaxrate(netdev, maxrate);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) {
+ struct ieee_qcn *qcn =
+ nla_data(ieee[DCB_ATTR_IEEE_QCN]);
+
+ err = ops->ieee_setqcn(netdev, qcn);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
+ struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
+ err = ops->ieee_setpfc(netdev, pfc);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+ struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+ struct dcb_app *app_data;
+ if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+ continue;
+ app_data = nla_data(attr);
+ if (ops->ieee_setapp)
+ err = ops->ieee_setapp(netdev, app_data);
+ else
+ err = dcb_ieee_setapp(netdev, app_data);
+ if (err)
+ goto err;
+ }
+ }
+
+err:
+ err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
+ dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0);
+ return err;
+}
+
+static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return dcbnl_ieee_fill(skb, netdev);
+}
+
+static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+ int err;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_IEEE])
+ return -EINVAL;
+
+ err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+ if (err)
+ return err;
+
+ if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+ struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+ struct dcb_app *app_data;
+
+ if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+ continue;
+ app_data = nla_data(attr);
+ if (ops->ieee_delapp)
+ err = ops->ieee_delapp(netdev, app_data);
+ else
+ err = dcb_ieee_delapp(netdev, app_data);
+ if (err)
+ goto err;
+ }
+ }
+
+err:
+ err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
+ dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0);
+ return err;
+}
+
+
+/* DCBX configuration */
+static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ if (!netdev->dcbnl_ops->getdcbx)
+ return -EOPNOTSUPP;
+
+ return nla_put_u8(skb, DCB_ATTR_DCBX,
+ netdev->dcbnl_ops->getdcbx(netdev));
+}
+
+static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ u8 value;
+
+ if (!netdev->dcbnl_ops->setdcbx)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_DCBX])
+ return -EINVAL;
+
+ value = nla_get_u8(tb[DCB_ATTR_DCBX]);
+
+ return nla_put_u8(skb, DCB_ATTR_DCBX,
+ netdev->dcbnl_ops->setdcbx(netdev, value));
+}
+
+static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest;
+ u8 value;
+ int ret, i;
+ int getall = 0;
+
+ if (!netdev->dcbnl_ops->getfeatcfg)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_FEATCFG])
+ return -EINVAL;
+
+ ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest);
+ if (ret)
+ return ret;
+
+ nest = nla_nest_start(skb, DCB_ATTR_FEATCFG);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (data[DCB_FEATCFG_ATTR_ALL])
+ getall = 1;
+
+ for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+ if (!getall && !data[i])
+ continue;
+
+ ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value);
+ if (!ret)
+ ret = nla_put_u8(skb, i, value);
+
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+nla_put_failure:
+ return ret;
+}
+
+static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1];
+ int ret, i;
+ u8 value;
+
+ if (!netdev->dcbnl_ops->setfeatcfg)
+ return -ENOTSUPP;
+
+ if (!tb[DCB_ATTR_FEATCFG])
+ return -EINVAL;
+
+ ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest);
+
+ if (ret)
+ goto err;
+
+ for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+ if (data[i] == NULL)
+ continue;
+
+ value = nla_get_u8(data[i]);
+
+ ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value);
+
+ if (ret)
+ goto err;
+ }
+err:
+ ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret);
+
+ return ret;
+}
+
+/* Handle CEE DCBX GET commands. */
+static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh,
+ u32 seq, struct nlattr **tb, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return dcbnl_cee_fill(skb, netdev);
+}
+
+struct reply_func {
+ /* reply netlink message type */
+ int type;
+
+ /* function to fill message contents */
+ int (*cb)(struct net_device *, struct nlmsghdr *, u32,
+ struct nlattr **, struct sk_buff *);
+};
+
+static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = {
+ [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate },
+ [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate },
+ [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg },
+ [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg },
+ [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr },
+ [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap },
+ [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs },
+ [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs },
+ [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate },
+ [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate },
+ [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp },
+ [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp },
+ [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg },
+ [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg },
+ [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg },
+ [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg },
+ [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall },
+ [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg },
+ [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg },
+ [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get },
+ [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set },
+ [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del },
+ [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx },
+ [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx },
+ [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg },
+ [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg },
+ [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get },
+};
+
+static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ struct dcbmsg *dcb = nlmsg_data(nlh);
+ struct nlattr *tb[DCB_ATTR_MAX + 1];
+ u32 portid = skb ? NETLINK_CB(skb).portid : 0;
+ int ret = -EINVAL;
+ struct sk_buff *reply_skb;
+ struct nlmsghdr *reply_nlh = NULL;
+ const struct reply_func *fn;
+
+ if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
+ dcbnl_rtnl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (dcb->cmd > DCB_CMD_MAX)
+ return -EINVAL;
+
+ /* check if a reply function has been defined for the command */
+ fn = &reply_funcs[dcb->cmd];
+ if (!fn->cb)
+ return -EOPNOTSUPP;
+
+ if (!tb[DCB_ATTR_IFNAME])
+ return -EINVAL;
+
+ netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME]));
+ if (!netdev)
+ return -ENODEV;
+
+ if (!netdev->dcbnl_ops)
+ return -EOPNOTSUPP;
+
+ reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq,
+ nlh->nlmsg_flags, &reply_nlh);
+ if (!reply_skb)
+ return -ENOBUFS;
+
+ ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb);
+ if (ret < 0) {
+ nlmsg_free(reply_skb);
+ goto out;
+ }
+
+ nlmsg_end(reply_skb, reply_nlh);
+
+ ret = rtnl_unicast(reply_skb, net, portid);
+out:
+ return ret;
+}
+
+static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
+ int ifindex, int prio)
+{
+ struct dcb_app_type *itr;
+
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->app.selector == app->selector &&
+ itr->app.protocol == app->protocol &&
+ itr->ifindex == ifindex &&
+ (!prio || itr->app.priority == prio))
+ return itr;
+ }
+
+ return NULL;
+}
+
+static int dcb_app_add(const struct dcb_app *app, int ifindex)
+{
+ struct dcb_app_type *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+
+ memcpy(&entry->app, app, sizeof(*app));
+ entry->ifindex = ifindex;
+ list_add(&entry->list, &dcb_app_list);
+
+ return 0;
+}
+
+/**
+ * dcb_getapp - retrieve the DCBX application user priority
+ *
+ * On success returns a non-zero 802.1p user priority bitmap
+ * otherwise returns 0 as the invalid user priority bitmap to
+ * indicate an error.
+ */
+u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u8 prio = 0;
+
+ spin_lock_bh(&dcb_lock);
+ if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ prio = itr->app.priority;
+ spin_unlock_bh(&dcb_lock);
+
+ return prio;
+}
+EXPORT_SYMBOL(dcb_getapp);
+
+/**
+ * dcb_setapp - add CEE dcb application data to app list
+ *
+ * Priority 0 is an invalid priority in CEE spec. This routine
+ * removes applications from the app list if the priority is
+ * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap
+ */
+int dcb_setapp(struct net_device *dev, struct dcb_app *new)
+{
+ struct dcb_app_type *itr;
+ struct dcb_app_type event;
+ int err = 0;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, new, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and replace */
+ if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
+ if (new->priority)
+ itr->app.priority = new->priority;
+ else {
+ list_del(&itr->list);
+ kfree(itr);
+ }
+ goto out;
+ }
+ /* App type does not exist add new application type */
+ if (new->priority)
+ err = dcb_app_add(new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_setapp);
+
+/**
+ * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority
+ *
+ * Helper routine which on success returns a non-zero 802.1Qaz user
+ * priority bitmap otherwise returns 0 to indicate the dcb_app was
+ * not found in APP list.
+ */
+u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u8 prio = 0;
+
+ spin_lock_bh(&dcb_lock);
+ if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ prio |= 1 << itr->app.priority;
+ spin_unlock_bh(&dcb_lock);
+
+ return prio;
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_mask);
+
+/**
+ * dcb_ieee_setapp - add IEEE dcb application data to app list
+ *
+ * This adds Application data to the list. Multiple application
+ * entries may exists for the same selector and protocol as long
+ * as the priorities are different. Priority is expected to be a
+ * 3-bit unsigned integer
+ */
+int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
+{
+ struct dcb_app_type event;
+ int err = 0;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, new, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and abort if found */
+ if (dcb_app_lookup(new, dev->ifindex, new->priority)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = dcb_app_add(new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_ieee_setapp);
+
+/**
+ * dcb_ieee_delapp - delete IEEE dcb application data from list
+ *
+ * This removes a matching APP data from the APP list
+ */
+int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
+{
+ struct dcb_app_type *itr;
+ struct dcb_app_type event;
+ int err = -ENOENT;
+
+ event.ifindex = dev->ifindex;
+ memcpy(&event.app, del, sizeof(event.app));
+ if (dev->dcbnl_ops->getdcbx)
+ event.dcbx = dev->dcbnl_ops->getdcbx(dev);
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and remove it. */
+ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) {
+ list_del(&itr->list);
+ kfree(itr);
+ err = 0;
+ }
+
+ spin_unlock_bh(&dcb_lock);
+ if (!err)
+ call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+ return err;
+}
+EXPORT_SYMBOL(dcb_ieee_delapp);
+
+static void dcb_flushapp(void)
+{
+ struct dcb_app_type *app;
+ struct dcb_app_type *tmp;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry_safe(app, tmp, &dcb_app_list, list) {
+ list_del(&app->list);
+ kfree(app);
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+
+static int __init dcbnl_init(void)
+{
+ INIT_LIST_HEAD(&dcb_app_list);
+
+ rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, NULL);
+
+ return 0;
+}
+module_init(dcbnl_init);
+
+static void __exit dcbnl_exit(void)
+{
+ rtnl_unregister(PF_UNSPEC, RTM_GETDCB);
+ rtnl_unregister(PF_UNSPEC, RTM_SETDCB);
+ dcb_flushapp();
+}
+module_exit(dcbnl_exit);
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 000000000..8c0ef71be
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,62 @@
+menuconfig IP_DCCP
+ tristate "The DCCP Protocol"
+ depends on INET
+ ---help---
+ Datagram Congestion Control Protocol (RFC 4340)
+
+ From http://www.ietf.org/rfc/rfc4340.txt:
+
+ The Datagram Congestion Control Protocol (DCCP) is a transport
+ protocol that implements bidirectional, unicast connections of
+ congestion-controlled, unreliable datagrams. It should be suitable
+ for use by applications such as streaming media, Internet telephony,
+ and on-line games.
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called dccp.
+
+ If in doubt, say N.
+
+if IP_DCCP
+
+config INET_DCCP_DIAG
+ depends on INET_DIAG
+ def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+ def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+ depends on DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+ bool "DCCP debug messages"
+ ---help---
+ Only use this if you're hacking DCCP.
+
+ When compiling DCCP as a module, this debugging output can be toggled
+ by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
+
+ Just say N.
+
+config NET_DCCPPROBE
+ tristate "DCCP connection probing"
+ depends on PROC_FS && KPROBES
+ ---help---
+ This module allows for capturing the changes to DCCP connection
+ state in response to incoming packets. It is used for debugging
+ DCCP congestion avoidance modules. If you don't understand
+ what was just said, you don't need it: say N.
+
+ Documentation on how to use DCCP connection probing can be found
+ at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
+
+ To compile this code as a module, choose M here: the
+ module will be called dccp_probe.
+
+
+endmenu
+
+endif # IP_DDCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 000000000..5c8362b03
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,28 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
+
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+ qpolicy.o
+#
+# CCID algorithms to be used by dccp.ko
+#
+# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
+dccp-y += ccids/ccid2.o ackvec.o
+dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
+dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
+ ccids/lib/tfrc_equation.o \
+ ccids/lib/packet_history.o \
+ ccids/lib/loss_interval.o
+
+dccp_ipv4-y := ipv4.o
+
+# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
+obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
+dccp_ipv6-y := ipv6.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
+
+dccp-$(CONFIG_SYSCTL) += sysctl.o
+
+dccp_diag-y := diag.o
+dccp_probe-y := probe.o
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
new file mode 100644
index 000000000..bd9e718c2
--- /dev/null
+++ b/net/dccp/ackvec.c
@@ -0,0 +1,409 @@
+/*
+ * net/dccp/ackvec.c
+ *
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License;
+ */
+#include "dccp.h"
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+static struct kmem_cache *dccp_ackvec_slab;
+static struct kmem_cache *dccp_ackvec_record_slab;
+
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+{
+ struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
+
+ if (av != NULL) {
+ av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+ INIT_LIST_HEAD(&av->av_records);
+ }
+ return av;
+}
+
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+{
+ struct dccp_ackvec_record *cur, *next;
+
+ list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+ kmem_cache_free(dccp_ackvec_record_slab, cur);
+ INIT_LIST_HEAD(&av->av_records);
+}
+
+void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+ if (likely(av != NULL)) {
+ dccp_ackvec_purge_records(av);
+ kmem_cache_free(dccp_ackvec_slab, av);
+ }
+}
+
+/**
+ * dccp_ackvec_update_records - Record information about sent Ack Vectors
+ * @av: Ack Vector records to update
+ * @seqno: Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
+{
+ struct dccp_ackvec_record *avr;
+
+ avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+ if (avr == NULL)
+ return -ENOBUFS;
+
+ avr->avr_ack_seqno = seqno;
+ avr->avr_ack_ptr = av->av_buf_head;
+ avr->avr_ack_ackno = av->av_buf_ackno;
+ avr->avr_ack_nonce = nonce_sum;
+ avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
+ /*
+ * When the buffer overflows, we keep no more than one record. This is
+ * the simplest way of disambiguating sender-Acks dating from before the
+ * overflow from sender-Acks which refer to after the overflow; a simple
+ * solution is preferable here since we are handling an exception.
+ */
+ if (av->av_overflow)
+ dccp_ackvec_purge_records(av);
+ /*
+ * Since GSS is incremented for each packet, the list is automatically
+ * arranged in descending order of @ack_seqno.
+ */
+ list_add(&avr->avr_node, &av->av_records);
+
+ dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
+ (unsigned long long)avr->avr_ack_seqno,
+ (unsigned long long)avr->avr_ack_ackno,
+ avr->avr_ack_runlen);
+ return 0;
+}
+
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+ const u64 ackno)
+{
+ struct dccp_ackvec_record *avr;
+ /*
+ * Exploit that records are inserted in descending order of sequence
+ * number, start with the oldest record first. If @ackno is `before'
+ * the earliest ack_ackno, the packet is too old to be considered.
+ */
+ list_for_each_entry_reverse(avr, av_list, avr_node) {
+ if (avr->avr_ack_seqno == ackno)
+ return avr;
+ if (before48(ackno, avr->avr_ack_seqno))
+ break;
+ }
+ return NULL;
+}
+
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
+{
+ return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
+}
+
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
+{
+ return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
+}
+
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
+{
+ if (unlikely(av->av_overflow))
+ return DCCPAV_MAX_ACKVEC_LEN;
+ return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
+}
+
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+ u64 seqno, enum dccp_ackvec_states state)
+{
+ u16 ptr = av->av_buf_head;
+
+ BUG_ON(distance > 0);
+ if (unlikely(dccp_ackvec_is_empty(av)))
+ return;
+
+ do {
+ u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
+
+ if (distance + runlen >= 0) {
+ /*
+ * Only update the state if packet has not been received
+ * yet. This is OK as per the second table in RFC 4340,
+ * 11.4.1; i.e. here we are using the following table:
+ * RECEIVED
+ * 0 1 3
+ * S +---+---+---+
+ * T 0 | 0 | 0 | 0 |
+ * O +---+---+---+
+ * R 1 | 1 | 1 | 1 |
+ * E +---+---+---+
+ * D 3 | 0 | 1 | 3 |
+ * +---+---+---+
+ * The "Not Received" state was set by reserve_seats().
+ */
+ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+ av->av_buf[ptr] = state;
+ else
+ dccp_pr_debug("Not changing %llu state to %u\n",
+ (unsigned long long)seqno, state);
+ break;
+ }
+
+ distance += runlen + 1;
+ ptr = __ackvec_idx_add(ptr, 1);
+
+ } while (ptr != av->av_buf_tail);
+}
+
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+ u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+ len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+ /* check for buffer wrap-around */
+ if (num > len) {
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+ start = 0;
+ num -= len;
+ }
+ if (num)
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
+}
+
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+ u64 seqno, enum dccp_ackvec_states state)
+{
+ u32 num_cells = num_packets;
+
+ if (num_packets > DCCPAV_BURST_THRESH) {
+ u32 lost_packets = num_packets - 1;
+
+ DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
+ /*
+ * We received 1 packet and have a loss of size "num_packets-1"
+ * which we squeeze into num_cells-1 rather than reserving an
+ * entire byte for each lost packet.
+ * The reason is that the vector grows in O(burst_length); when
+ * it grows too large there will no room left for the payload.
+ * This is a trade-off: if a few packets out of the burst show
+ * up later, their state will not be changed; it is simply too
+ * costly to reshuffle/reallocate/copy the buffer each time.
+ * Should such problems persist, we will need to switch to a
+ * different underlying data structure.
+ */
+ for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+ u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN);
+
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+ av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+ lost_packets -= len;
+ }
+ }
+
+ if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+ DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+ av->av_overflow = true;
+ }
+
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+ if (av->av_overflow)
+ av->av_buf_tail = av->av_buf_head;
+
+ av->av_buf[av->av_buf_head] = state;
+ av->av_buf_ackno = seqno;
+
+ if (num_packets > 1)
+ dccp_ackvec_reserve_seats(av, num_packets - 1);
+}
+
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
+{
+ u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ enum dccp_ackvec_states state = DCCPAV_RECEIVED;
+
+ if (dccp_ackvec_is_empty(av)) {
+ dccp_ackvec_add_new(av, 1, seqno, state);
+ av->av_tail_ackno = seqno;
+
+ } else {
+ s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+ u8 *current_head = av->av_buf + av->av_buf_head;
+
+ if (num_packets == 1 &&
+ dccp_ackvec_state(current_head) == state &&
+ dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
+
+ *current_head += 1;
+ av->av_buf_ackno = seqno;
+
+ } else if (num_packets > 0) {
+ dccp_ackvec_add_new(av, num_packets, seqno, state);
+ } else {
+ dccp_ackvec_update_old(av, num_packets, seqno, state);
+ }
+ }
+}
+
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
+{
+ struct dccp_ackvec_record *avr, *next;
+ u8 runlen_now, eff_runlen;
+ s64 delta;
+
+ avr = dccp_ackvec_lookup(&av->av_records, ackno);
+ if (avr == NULL)
+ return;
+ /*
+ * Deal with outdated acknowledgments: this arises when e.g. there are
+ * several old records and the acks from the peer come in slowly. In
+ * that case we may still have records that pre-date tail_ackno.
+ */
+ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+ if (delta < 0)
+ goto free_records;
+ /*
+ * Deal with overlapping Ack Vectors: don't subtract more than the
+ * number of packets between tail_ackno and ack_ackno.
+ */
+ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
+
+ runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
+ /*
+ * The run length of Ack Vector cells does not decrease over time. If
+ * the run length is the same as at the time the Ack Vector was sent, we
+ * free the ack_ptr cell. That cell can however not be freed if the run
+ * length has increased: in this case we need to move the tail pointer
+ * backwards (towards higher indices), to its next-oldest neighbour.
+ */
+ if (runlen_now > eff_runlen) {
+
+ av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+ av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+
+ /* This move may not have cleared the overflow flag. */
+ if (av->av_overflow)
+ av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+ } else {
+ av->av_buf_tail = avr->avr_ack_ptr;
+ /*
+ * We have made sure that avr points to a valid cell within the
+ * buffer. This cell is either older than head, or equals head
+ * (empty buffer): in both cases we no longer have any overflow.
+ */
+ av->av_overflow = 0;
+ }
+
+ /*
+ * The peer has acknowledged up to and including ack_ackno. Hence the
+ * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+ */
+ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+ list_del(&avr->avr_node);
+ kmem_cache_free(dccp_ackvec_record_slab, avr);
+ }
+}
+
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
+{
+ struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+ if (new == NULL)
+ return -ENOBUFS;
+ new->vec = vec;
+ new->len = len;
+ new->nonce = nonce;
+
+ list_add_tail(&new->node, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+ struct dccp_ackvec_parsed *cur, *next;
+
+ list_for_each_entry_safe(cur, next, parsed_chunks, node)
+ kfree(cur);
+ INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
+
+int __init dccp_ackvec_init(void)
+{
+ dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
+ sizeof(struct dccp_ackvec), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (dccp_ackvec_slab == NULL)
+ goto out_err;
+
+ dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+ sizeof(struct dccp_ackvec_record),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (dccp_ackvec_record_slab == NULL)
+ goto out_destroy_slab;
+
+ return 0;
+
+out_destroy_slab:
+ kmem_cache_destroy(dccp_ackvec_slab);
+ dccp_ackvec_slab = NULL;
+out_err:
+ DCCP_CRIT("Unable to create Ack Vector slab cache");
+ return -ENOBUFS;
+}
+
+void dccp_ackvec_exit(void)
+{
+ if (dccp_ackvec_slab != NULL) {
+ kmem_cache_destroy(dccp_ackvec_slab);
+ dccp_ackvec_slab = NULL;
+ }
+ if (dccp_ackvec_record_slab != NULL) {
+ kmem_cache_destroy(dccp_ackvec_record_slab);
+ dccp_ackvec_record_slab = NULL;
+ }
+}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
new file mode 100644
index 000000000..3284bfa98
--- /dev/null
+++ b/net/dccp/ackvec.h
@@ -0,0 +1,138 @@
+#ifndef _ACKVEC_H
+#define _ACKVEC_H
+/*
+ * net/dccp/ackvec.h
+ *
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS 2
+#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
+
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN 16
+
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
+
+enum dccp_ackvec_states {
+ DCCPAV_RECEIVED = 0x00,
+ DCCPAV_ECN_MARKED = 0x40,
+ DCCPAV_RESERVED = 0x80,
+ DCCPAV_NOT_RECEIVED = 0xC0
+};
+#define DCCPAV_MAX_RUNLEN 0x3F
+
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+ return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+ return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/**
+ * struct dccp_ackvec - Ack Vector main data structure
+ *
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
+ *
+ * @av_buf: circular buffer storage area
+ * @av_buf_head: head index; begin of live portion in @av_buf
+ * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
+ * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
+ */
+struct dccp_ackvec {
+ u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
+ u16 av_buf_head;
+ u16 av_buf_tail;
+ u64 av_buf_ackno:48;
+ u64 av_tail_ackno:48;
+ bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
+ u8 av_overflow:1;
+ struct list_head av_records;
+};
+
+/**
+ * struct dccp_ackvec_record - Records information about sent Ack Vectors
+ *
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
+ *
+ * @avr_node: the list node in @av_records
+ * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr: pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
+ *
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
+ */
+struct dccp_ackvec_record {
+ struct list_head avr_node;
+ u64 avr_ack_seqno:48;
+ u64 avr_ack_ackno:48;
+ u16 avr_ack_ptr;
+ u8 avr_ack_runlen;
+ u8 avr_ack_nonce:1;
+};
+
+int dccp_ackvec_init(void);
+void dccp_ackvec_exit(void);
+
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
+void dccp_ackvec_free(struct dccp_ackvec *av);
+
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
+
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
+{
+ return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
+}
+
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ *
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+ u8 *vec,
+ len,
+ nonce:1;
+ struct list_head node;
+};
+
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
+#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 000000000..834989751
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,223 @@
+/*
+ * net/dccp/ccid.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+
+#include "ccid.h"
+#include "ccids/lib/tfrc.h"
+
+static struct ccid_operations *ccids[] = {
+ &ccid2_ops,
+#ifdef CONFIG_IP_DCCP_CCID3
+ &ccid3_ops,
+#endif
+};
+
+static struct ccid_operations *ccid_by_number(const u8 id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ if (ccids[i]->ccid_id == id)
+ return ccids[i];
+ return NULL;
+}
+
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
+{
+ while (array_len > 0)
+ if (ccid_by_number(ccid_array[--array_len]) == NULL)
+ return false;
+ return true;
+}
+
+/**
+ * ccid_get_builtin_ccids - Populate a list of built-in CCIDs
+ * @ccid_array: pointer to copy into
+ * @array_len: value to return length into
+ *
+ * This function allocates memory - caller must see that it is freed after use.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
+{
+ *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
+ if (*ccid_array == NULL)
+ return -ENOBUFS;
+
+ for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
+ (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
+ return 0;
+}
+
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ u8 *ccid_array, array_len;
+ int err = 0;
+
+ if (ccid_get_builtin_ccids(&ccid_array, &array_len))
+ return -ENOBUFS;
+
+ if (put_user(array_len, optlen))
+ err = -EFAULT;
+ else if (len > 0 && copy_to_user(optval, ccid_array,
+ len > array_len ? array_len : len))
+ err = -EFAULT;
+
+ kfree(ccid_array);
+ return err;
+}
+
+static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
+{
+ struct kmem_cache *slab;
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
+ va_end(args);
+
+ slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ return slab;
+}
+
+static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
+{
+ if (slab != NULL)
+ kmem_cache_destroy(slab);
+}
+
+static int __init ccid_activate(struct ccid_operations *ccid_ops)
+{
+ int err = -ENOBUFS;
+
+ ccid_ops->ccid_hc_rx_slab =
+ ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
+ ccid_ops->ccid_hc_rx_slab_name,
+ "ccid%u_hc_rx_sock",
+ ccid_ops->ccid_id);
+ if (ccid_ops->ccid_hc_rx_slab == NULL)
+ goto out;
+
+ ccid_ops->ccid_hc_tx_slab =
+ ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
+ ccid_ops->ccid_hc_tx_slab_name,
+ "ccid%u_hc_tx_sock",
+ ccid_ops->ccid_id);
+ if (ccid_ops->ccid_hc_tx_slab == NULL)
+ goto out_free_rx_slab;
+
+ pr_info("DCCP: Activated CCID %d (%s)\n",
+ ccid_ops->ccid_id, ccid_ops->ccid_name);
+ err = 0;
+out:
+ return err;
+out_free_rx_slab:
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+ ccid_ops->ccid_hc_rx_slab = NULL;
+ goto out;
+}
+
+static void ccid_deactivate(struct ccid_operations *ccid_ops)
+{
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
+ ccid_ops->ccid_hc_tx_slab = NULL;
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+ ccid_ops->ccid_hc_rx_slab = NULL;
+
+ pr_info("DCCP: Deactivated CCID %d (%s)\n",
+ ccid_ops->ccid_id, ccid_ops->ccid_name);
+}
+
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
+{
+ struct ccid_operations *ccid_ops = ccid_by_number(id);
+ struct ccid *ccid = NULL;
+
+ if (ccid_ops == NULL)
+ goto out;
+
+ ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
+ ccid_ops->ccid_hc_tx_slab, gfp_any());
+ if (ccid == NULL)
+ goto out;
+ ccid->ccid_ops = ccid_ops;
+ if (rx) {
+ memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
+ if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
+ ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
+ goto out_free_ccid;
+ } else {
+ memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
+ if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
+ ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
+ goto out_free_ccid;
+ }
+out:
+ return ccid;
+out_free_ccid:
+ kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
+ ccid_ops->ccid_hc_tx_slab, ccid);
+ ccid = NULL;
+ goto out;
+}
+
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_rx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
+ }
+}
+
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_tx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
+ }
+}
+
+int __init ccid_initialize_builtins(void)
+{
+ int i, err = tfrc_lib_init();
+
+ if (err)
+ return err;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++) {
+ err = ccid_activate(ccids[i]);
+ if (err)
+ goto unwind_registrations;
+ }
+ return 0;
+
+unwind_registrations:
+ while(--i >= 0)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+ return err;
+}
+
+void ccid_cleanup_builtins(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 000000000..6eb837a47
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,265 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ * net/dccp/ccid.h
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/compiler.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+/* maximum value for a CCID (RFC 4340, 19.5) */
+#define CCID_MAX 255
+#define CCID_SLAB_NAME_LENGTH 32
+
+struct tcp_info;
+
+/**
+ * struct ccid_operations - Interface to Congestion-Control Infrastructure
+ *
+ * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
+ * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
+ * @ccid_name: alphabetical identifier string for @ccid_id
+ * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
+ * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
+ *
+ * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
+ * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
+ * @ccid_hc_rx_packet_recv: implements the HC-receiver side
+ * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
+ * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
+ * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
+ * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
+ * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
+ * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
+ * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
+ */
+struct ccid_operations {
+ unsigned char ccid_id;
+ __u32 ccid_ccmps;
+ const char *ccid_name;
+ struct kmem_cache *ccid_hc_rx_slab,
+ *ccid_hc_tx_slab;
+ char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
+ char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
+ __u32 ccid_hc_rx_obj_size,
+ ccid_hc_tx_obj_size;
+ /* Interface Routines */
+ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
+ int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
+ void (*ccid_hc_rx_exit)(struct sock *sk);
+ void (*ccid_hc_tx_exit)(struct sock *sk);
+ void (*ccid_hc_rx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
+ int (*ccid_hc_rx_insert_options)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
+ int (*ccid_hc_tx_send_packet)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_packet_sent)(struct sock *sk,
+ unsigned int len);
+ void (*ccid_hc_rx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+ void (*ccid_hc_tx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+ int (*ccid_hc_rx_getsockopt)(struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval,
+ int __user *optlen);
+ int (*ccid_hc_tx_getsockopt)(struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval,
+ int __user *optlen);
+};
+
+extern struct ccid_operations ccid2_ops;
+#ifdef CONFIG_IP_DCCP_CCID3
+extern struct ccid_operations ccid3_ops;
+#endif
+
+int ccid_initialize_builtins(void);
+void ccid_cleanup_builtins(void);
+
+struct ccid {
+ struct ccid_operations *ccid_ops;
+ char ccid_priv[0];
+};
+
+static inline void *ccid_priv(const struct ccid *ccid)
+{
+ return (void *)ccid->ccid_priv;
+}
+
+bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *, int __user *);
+
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
+
+static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
+{
+ struct ccid *ccid = dp->dccps_hc_rx_ccid;
+
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
+}
+
+static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
+{
+ struct ccid *ccid = dp->dccps_hc_tx_ccid;
+
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
+}
+
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+ CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
+ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
+ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
+ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
+ CCID_PACKET_ERR = 0xF0000, /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+ if (return_code < 0)
+ return CCID_PACKET_ERR;
+ if (return_code == 0)
+ return CCID_PACKET_SEND_AT_ONCE;
+ if (return_code <= CCID_PACKET_DELAY_MAX)
+ return CCID_PACKET_DELAY;
+ return return_code;
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
+ return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+ return CCID_PACKET_SEND_AT_ONCE;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+ unsigned int len)
+{
+ if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
+ ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
+ ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
+ ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+/**
+ * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+ u8 pkt, u8 opt, u8 *val, u8 len)
+{
+ if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
+}
+
+/**
+ * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+ u8 pkt, u8 opt, u8 *val, u8 len)
+{
+ if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
+}
+
+static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
+ return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
+ return 0;
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
+ ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
+ ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
+}
+
+static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ int rc = -ENOPROTOOPT;
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+ rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
+ optval, optlen);
+ return rc;
+}
+
+static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
+ const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ int rc = -ENOPROTOOPT;
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+ rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
+ optval, optlen);
+ return rc;
+}
+#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 000000000..8ba3fc9d6
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,54 @@
+menu "DCCP CCIDs Configuration"
+
+config IP_DCCP_CCID2_DEBUG
+ bool "CCID-2 debugging messages"
+ ---help---
+ Enable CCID-2 specific debugging messages.
+
+ The debugging output can additionally be toggled by setting the
+ ccid2_debug parameter to 0 or 1.
+
+ If in doubt, say N.
+
+config IP_DCCP_CCID3
+ bool "CCID-3 (TCP-Friendly)"
+ def_bool y if (IP_DCCP = y || IP_DCCP = m)
+ ---help---
+ CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+ rate-controlled congestion control mechanism. TFRC is designed to
+ be reasonably fair when competing for bandwidth with TCP-like flows,
+ where a flow is "reasonably fair" if its sending rate is generally
+ within a factor of two of the sending rate of a TCP flow under the
+ same conditions. However, TFRC has a much lower variation of
+ throughput over time compared with TCP, which makes CCID-3 more
+ suitable than CCID-2 for applications such streaming media where a
+ relatively smooth sending rate is of importance.
+
+ CCID-3 is further described in RFC 4342,
+ http://www.ietf.org/rfc/rfc4342.txt
+
+ The TFRC congestion control algorithms were initially described in
+ RFC 5348.
+
+ This text was extracted from RFC 4340 (sec. 10.2),
+ http://www.ietf.org/rfc/rfc4340.txt
+
+ If in doubt, say N.
+
+config IP_DCCP_CCID3_DEBUG
+ bool "CCID-3 debugging messages"
+ depends on IP_DCCP_CCID3
+ ---help---
+ Enable CCID-3 specific debugging messages.
+
+ The debugging output can additionally be toggled by setting the
+ ccid3_debug parameter to 0 or 1.
+
+ If in doubt, say N.
+
+config IP_DCCP_TFRC_LIB
+ def_bool y if IP_DCCP_CCID3
+
+config IP_DCCP_TFRC_DEBUG
+ def_bool y if IP_DCCP_CCID3_DEBUG
+endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
new file mode 100644
index 000000000..f053198e7
--- /dev/null
+++ b/net/dccp/ccids/ccid2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * Changes to meet Linux coding standards, and DCCP infrastructure fixes.
+ *
+ * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This implementation should follow RFC 4341
+ */
+#include <linux/slab.h>
+#include "../feat.h"
+#include "ccid2.h"
+
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+static bool ccid2_debug;
+#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
+#else
+#define ccid2_pr_debug(format, a...)
+#endif
+
+static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
+{
+ struct ccid2_seq *seqp;
+ int i;
+
+ /* check if we have space to preserve the pointer to the buffer */
+ if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
+ sizeof(struct ccid2_seq *)))
+ return -ENOMEM;
+
+ /* allocate buffer and initialize linked list */
+ seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any());
+ if (seqp == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
+ seqp[i].ccid2s_next = &seqp[i + 1];
+ seqp[i + 1].ccid2s_prev = &seqp[i];
+ }
+ seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
+ seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+
+ /* This is the first allocation. Initiate the head and tail. */
+ if (hc->tx_seqbufc == 0)
+ hc->tx_seqh = hc->tx_seqt = seqp;
+ else {
+ /* link the existing list with the one we just created */
+ hc->tx_seqh->ccid2s_next = seqp;
+ seqp->ccid2s_prev = hc->tx_seqh;
+
+ hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+ seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
+ }
+
+ /* store the original pointer to the buffer so we can free it */
+ hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
+ hc->tx_seqbufc++;
+
+ return 0;
+}
+
+static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
+{
+ if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+ return CCID_PACKET_WILL_DEQUEUE_LATER;
+ return CCID_PACKET_SEND_AT_ONCE;
+}
+
+static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
+{
+ u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
+
+ /*
+ * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
+ * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
+ * acceptable since this causes starvation/deadlock whenever cwnd < 2.
+ * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
+ */
+ if (val == 0 || val > max_ratio) {
+ DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
+ val = max_ratio;
+ }
+ dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
+ min_t(u32, val, DCCPF_ACK_RATIO_MAX));
+}
+
+static void ccid2_check_l_ack_ratio(struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ /*
+ * After a loss, idle period, application limited period, or RTO we
+ * need to check that the ack ratio is still less than the congestion
+ * window. Otherwise, we will send an entire congestion window of
+ * packets and got no response because we haven't sent ack ratio
+ * packets yet.
+ * If the ack ratio does need to be reduced, we reduce it to half of
+ * the congestion window (or 1 if that's zero) instead of to the
+ * congestion window. This prevents problems if one ack is lost.
+ */
+ if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
+ ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
+}
+
+static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
+{
+ dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
+ clamp_val(val, DCCPF_SEQ_WMIN,
+ DCCPF_SEQ_WMAX));
+}
+
+static void ccid2_hc_tx_rto_expire(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
+ goto out;
+ }
+
+ ccid2_pr_debug("RTO_EXPIRE\n");
+
+ /* back-off timer */
+ hc->tx_rto <<= 1;
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
+
+ /* adjust pipe, cwnd etc */
+ hc->tx_ssthresh = hc->tx_cwnd / 2;
+ if (hc->tx_ssthresh < 2)
+ hc->tx_ssthresh = 2;
+ hc->tx_cwnd = 1;
+ hc->tx_pipe = 0;
+
+ /* clear state about stuff we sent */
+ hc->tx_seqt = hc->tx_seqh;
+ hc->tx_packets_acked = 0;
+
+ /* clear ack ratio state. */
+ hc->tx_rpseq = 0;
+ hc->tx_rpdupack = -1;
+ ccid2_change_l_ack_ratio(sk, 1);
+
+ /* if we were blocked before, we may now send cwnd=1 packet */
+ if (sender_was_blocked)
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ /* restart backed-off timer */
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * Congestion window validation (RFC 2861).
+ */
+static bool ccid2_do_cwv = true;
+module_param(ccid2_do_cwv, bool, 0644);
+MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
+
+/**
+ * ccid2_update_used_window - Track how much of cwnd is actually used
+ * This is done in addition to CWV. The sender needs to have an idea of how many
+ * packets may be in flight, to set the local Sequence Window value accordingly
+ * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
+ * maximum-used window. We use an EWMA low-pass filter to filter out noise.
+ */
+static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
+{
+ hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
+}
+
+/* This borrows the code of tcp_cwnd_application_limited() */
+static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /* don't reduce cwnd below the initial window (IW) */
+ u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
+ win_used = max(hc->tx_cwnd_used, init_win);
+
+ if (win_used < hc->tx_cwnd) {
+ hc->tx_ssthresh = max(hc->tx_ssthresh,
+ (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
+ hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
+ }
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
+
+ ccid2_check_l_ack_ratio(sk);
+}
+
+/* This borrows the code of tcp_cwnd_restart() */
+static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ u32 cwnd = hc->tx_cwnd, restart_cwnd,
+ iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+
+ hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
+
+ /* don't reduce cwnd below the initial window (IW) */
+ restart_cwnd = min(cwnd, iwnd);
+ cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
+ hc->tx_cwnd = max(cwnd, restart_cwnd);
+
+ hc->tx_cwnd_stamp = now;
+ hc->tx_cwnd_used = 0;
+
+ ccid2_check_l_ack_ratio(sk);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const u32 now = ccid2_time_stamp;
+ struct ccid2_seq *next;
+
+ /* slow-start after idle periods (RFC 2581, RFC 2861) */
+ if (ccid2_do_cwv && !hc->tx_pipe &&
+ (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
+ ccid2_cwnd_restart(sk, now);
+
+ hc->tx_lsndtime = now;
+ hc->tx_pipe += 1;
+
+ /* see whether cwnd was fully used (RFC 2861), update expected window */
+ if (ccid2_cwnd_network_limited(hc)) {
+ ccid2_update_used_window(hc, hc->tx_cwnd);
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
+ } else {
+ if (hc->tx_pipe > hc->tx_cwnd_used)
+ hc->tx_cwnd_used = hc->tx_pipe;
+
+ ccid2_update_used_window(hc, hc->tx_cwnd_used);
+
+ if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
+ ccid2_cwnd_application_limited(sk, now);
+ }
+
+ hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
+ hc->tx_seqh->ccid2s_acked = 0;
+ hc->tx_seqh->ccid2s_sent = now;
+
+ next = hc->tx_seqh->ccid2s_next;
+ /* check if we need to alloc more space */
+ if (next == hc->tx_seqt) {
+ if (ccid2_hc_tx_alloc_seq(hc)) {
+ DCCP_CRIT("packet history - out of memory!");
+ /* FIXME: find a more graceful way to bail out */
+ return;
+ }
+ next = hc->tx_seqh->ccid2s_next;
+ BUG_ON(next == hc->tx_seqt);
+ }
+ hc->tx_seqh = next;
+
+ ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
+
+ /*
+ * FIXME: The code below is broken and the variables have been removed
+ * from the socket struct. The `ackloss' variable was always set to 0,
+ * and with arsent there are several problems:
+ * (i) it doesn't just count the number of Acks, but all sent packets;
+ * (ii) it is expressed in # of packets, not # of windows, so the
+ * comparison below uses the wrong formula: Appendix A of RFC 4341
+ * comes up with the number K = cwnd / (R^2 - R) of consecutive windows
+ * of data with no lost or marked Ack packets. If arsent were the # of
+ * consecutive Acks received without loss, then Ack Ratio needs to be
+ * decreased by 1 when
+ * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
+ * where cwnd / R is the number of Acks received per window of data
+ * (cf. RFC 4341, App. A). The problems are that
+ * - arsent counts other packets as well;
+ * - the comparison uses a formula different from RFC 4341;
+ * - computing a cubic/quadratic equation each time is too complicated.
+ * Hence a different algorithm is needed.
+ */
+#if 0
+ /* Ack Ratio. Need to maintain a concept of how many windows we sent */
+ hc->tx_arsent++;
+ /* We had an ack loss in this window... */
+ if (hc->tx_ackloss) {
+ if (hc->tx_arsent >= hc->tx_cwnd) {
+ hc->tx_arsent = 0;
+ hc->tx_ackloss = 0;
+ }
+ } else {
+ /* No acks lost up to now... */
+ /* decrease ack ratio if enough packets were sent */
+ if (dp->dccps_l_ack_ratio > 1) {
+ /* XXX don't calculate denominator each time */
+ int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
+ dp->dccps_l_ack_ratio;
+
+ denom = hc->tx_cwnd * hc->tx_cwnd / denom;
+
+ if (hc->tx_arsent >= denom) {
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
+ hc->tx_arsent = 0;
+ }
+ } else {
+ /* we can't increase ack ratio further [1] */
+ hc->tx_arsent = 0; /* or maybe set it to cwnd*/
+ }
+ }
+#endif
+
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+ do {
+ struct ccid2_seq *seqp = hc->tx_seqt;
+
+ while (seqp != hc->tx_seqh) {
+ ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
+ (unsigned long long)seqp->ccid2s_seq,
+ seqp->ccid2s_acked, seqp->ccid2s_sent);
+ seqp = seqp->ccid2s_next;
+ }
+ } while (0);
+ ccid2_pr_debug("=========\n");
+#endif
+}
+
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ * which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ long m = mrtt ? : 1;
+
+ if (hc->tx_srtt == 0) {
+ /* First measurement m */
+ hc->tx_srtt = m << 3;
+ hc->tx_mdev = m << 1;
+
+ hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+ hc->tx_rttvar = hc->tx_mdev_max;
+
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ } else {
+ /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+ m -= (hc->tx_srtt >> 3);
+ hc->tx_srtt += m;
+
+ /* Similarly, update scaled mdev with regard to |m| */
+ if (m < 0) {
+ m = -m;
+ m -= (hc->tx_mdev >> 2);
+ /*
+ * This neutralises RTO increase when RTT < SRTT - mdev
+ * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+ * in Linux TCP", USENIX 2002, pp. 49-62).
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (hc->tx_mdev >> 2);
+ }
+ hc->tx_mdev += m;
+
+ if (hc->tx_mdev > hc->tx_mdev_max) {
+ hc->tx_mdev_max = hc->tx_mdev;
+ if (hc->tx_mdev_max > hc->tx_rttvar)
+ hc->tx_rttvar = hc->tx_mdev_max;
+ }
+
+ /*
+ * Decay RTTVAR at most once per flight, exploiting that
+ * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
+ * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
+ * GAR is a useful bound for FlightSize = pipe.
+ * AWL is probably too low here, as it over-estimates pipe.
+ */
+ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+ if (hc->tx_mdev_max < hc->tx_rttvar)
+ hc->tx_rttvar -= (hc->tx_rttvar -
+ hc->tx_mdev_max) >> 2;
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ hc->tx_mdev_max = tcp_rto_min(sk);
+ }
+ }
+
+ /*
+ * Set RTO from SRTT and RTTVAR
+ * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+ * This agrees with RFC 4341, 5:
+ * "Because DCCP does not retransmit data, DCCP does not require
+ * TCP's recommended minimum timeout of one second".
+ */
+ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
+
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
+}
+
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+ unsigned int *maxincr)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
+
+ if (hc->tx_cwnd < dp->dccps_l_seq_win &&
+ r_seq_used < dp->dccps_r_seq_win) {
+ if (hc->tx_cwnd < hc->tx_ssthresh) {
+ if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
+ hc->tx_cwnd += 1;
+ *maxincr -= 1;
+ hc->tx_packets_acked = 0;
+ }
+ } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
+ hc->tx_cwnd += 1;
+ hc->tx_packets_acked = 0;
+ }
+ }
+
+ /*
+ * Adjust the local sequence window and the ack ratio to allow about
+ * 5 times the number of packets in the network (RFC 4340 7.5.2)
+ */
+ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
+ else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
+
+ if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
+ else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
+
+ /*
+ * FIXME: RTT is sampled several times per acknowledgment (for each
+ * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+ * This causes the RTT to be over-estimated, since the older entries
+ * in the Ack Vector have earlier sending times.
+ * The cleanest solution is to not use the ccid2s_sent field at all
+ * and instead use DCCP timestamps: requires changes in other places.
+ */
+ ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
+}
+
+static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
+ ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
+ return;
+ }
+
+ hc->tx_last_cong = ccid2_time_stamp;
+
+ hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
+ hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
+
+ ccid2_check_l_ack_ratio(sk);
+}
+
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ switch (option) {
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+ option - DCCPO_ACK_VECTOR_0);
+ }
+ return 0;
+}
+
+static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+ struct dccp_ackvec_parsed *avp;
+ u64 ackno, seqno;
+ struct ccid2_seq *seqp;
+ int done = 0;
+ unsigned int maxincr = 0;
+
+ /* check reverse path congestion */
+ seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ /* XXX this whole "algorithm" is broken. Need to fix it to keep track
+ * of the seqnos of the dupacks so that rpseq and rpdupack are correct
+ * -sorbo.
+ */
+ /* need to bootstrap */
+ if (hc->tx_rpdupack == -1) {
+ hc->tx_rpdupack = 0;
+ hc->tx_rpseq = seqno;
+ } else {
+ /* check if packet is consecutive */
+ if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
+ hc->tx_rpseq = seqno;
+ /* it's a later packet */
+ else if (after48(seqno, hc->tx_rpseq)) {
+ hc->tx_rpdupack++;
+
+ /* check if we got enough dupacks */
+ if (hc->tx_rpdupack >= NUMDUPACK) {
+ hc->tx_rpdupack = -1; /* XXX lame */
+ hc->tx_rpseq = 0;
+#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
+ /*
+ * FIXME: Ack Congestion Control is broken; in
+ * the current state instabilities occurred with
+ * Ack Ratios greater than 1; causing hang-ups
+ * and long RTO timeouts. This needs to be fixed
+ * before opening up dynamic changes. -- gerrit
+ */
+ ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+#endif
+ }
+ }
+ }
+
+ /* check forward path congestion */
+ if (dccp_packet_without_ack(skb))
+ return;
+
+ /* still didn't send out new data packets */
+ if (hc->tx_seqh == hc->tx_seqt)
+ goto done;
+
+ ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+ if (after48(ackno, hc->tx_high_ack))
+ hc->tx_high_ack = ackno;
+
+ seqp = hc->tx_seqt;
+ while (before48(seqp->ccid2s_seq, ackno)) {
+ seqp = seqp->ccid2s_next;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
+ break;
+ }
+ }
+
+ /*
+ * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
+ * packets per acknowledgement. Rounding up avoids that cwnd is not
+ * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
+ */
+ if (hc->tx_cwnd < hc->tx_ssthresh)
+ maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
+
+ /* go through all ack vectors */
+ list_for_each_entry(avp, &hc->tx_av_chunks, node) {
+ /* go through this ack vector */
+ for (; avp->len--; avp->vec++) {
+ u64 ackno_end_rl = SUB48(ackno,
+ dccp_ackvec_runlen(avp->vec));
+
+ ccid2_pr_debug("ackvec %llu |%u,%u|\n",
+ (unsigned long long)ackno,
+ dccp_ackvec_state(avp->vec) >> 6,
+ dccp_ackvec_runlen(avp->vec));
+ /* if the seqno we are analyzing is larger than the
+ * current ackno, then move towards the tail of our
+ * seqnos.
+ */
+ while (after48(seqp->ccid2s_seq, ackno)) {
+ if (seqp == hc->tx_seqt) {
+ done = 1;
+ break;
+ }
+ seqp = seqp->ccid2s_prev;
+ }
+ if (done)
+ break;
+
+ /* check all seqnos in the range of the vector
+ * run length
+ */
+ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
+ const u8 state = dccp_ackvec_state(avp->vec);
+
+ /* new packet received or marked */
+ if (state != DCCPAV_NOT_RECEIVED &&
+ !seqp->ccid2s_acked) {
+ if (state == DCCPAV_ECN_MARKED)
+ ccid2_congestion_event(sk,
+ seqp);
+ else
+ ccid2_new_ack(sk, seqp,
+ &maxincr);
+
+ seqp->ccid2s_acked = 1;
+ ccid2_pr_debug("Got ack for %llu\n",
+ (unsigned long long)seqp->ccid2s_seq);
+ hc->tx_pipe--;
+ }
+ if (seqp == hc->tx_seqt) {
+ done = 1;
+ break;
+ }
+ seqp = seqp->ccid2s_prev;
+ }
+ if (done)
+ break;
+
+ ackno = SUB48(ackno_end_rl, 1);
+ }
+ if (done)
+ break;
+ }
+
+ /* The state about what is acked should be correct now
+ * Check for NUMDUPACK
+ */
+ seqp = hc->tx_seqt;
+ while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
+ seqp = seqp->ccid2s_next;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
+ break;
+ }
+ }
+ done = 0;
+ while (1) {
+ if (seqp->ccid2s_acked) {
+ done++;
+ if (done == NUMDUPACK)
+ break;
+ }
+ if (seqp == hc->tx_seqt)
+ break;
+ seqp = seqp->ccid2s_prev;
+ }
+
+ /* If there are at least 3 acknowledgements, anything unacknowledged
+ * below the last sequence number is considered lost
+ */
+ if (done == NUMDUPACK) {
+ struct ccid2_seq *last_acked = seqp;
+
+ /* check for lost packets */
+ while (1) {
+ if (!seqp->ccid2s_acked) {
+ ccid2_pr_debug("Packet lost: %llu\n",
+ (unsigned long long)seqp->ccid2s_seq);
+ /* XXX need to traverse from tail -> head in
+ * order to detect multiple congestion events in
+ * one ack vector.
+ */
+ ccid2_congestion_event(sk, seqp);
+ hc->tx_pipe--;
+ }
+ if (seqp == hc->tx_seqt)
+ break;
+ seqp = seqp->ccid2s_prev;
+ }
+
+ hc->tx_seqt = last_acked;
+ }
+
+ /* trim acked packets in tail */
+ while (hc->tx_seqt != hc->tx_seqh) {
+ if (!hc->tx_seqt->ccid2s_acked)
+ break;
+
+ hc->tx_seqt = hc->tx_seqt->ccid2s_next;
+ }
+
+ /* restart RTO timer if not all outstanding data has been acked */
+ if (hc->tx_pipe == 0)
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+ else
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+done:
+ /* check if incoming Acks allow pending packets to be sent */
+ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
+}
+
+static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 max_ratio;
+
+ /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
+ hc->tx_ssthresh = ~0U;
+
+ /* Use larger initial windows (RFC 4341, section 5). */
+ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+ hc->tx_expected_wnd = hc->tx_cwnd;
+
+ /* Make sure that Ack Ratio is enabled and within bounds. */
+ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
+ if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
+ dp->dccps_l_ack_ratio = max_ratio;
+
+ /* XXX init ~ to window size... */
+ if (ccid2_hc_tx_alloc_seq(hc))
+ return -ENOMEM;
+
+ hc->tx_rto = DCCP_TIMEOUT_INIT;
+ hc->tx_rpdupack = -1;
+ hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
+ hc->tx_cwnd_used = 0;
+ setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
+ (unsigned long)sk);
+ INIT_LIST_HEAD(&hc->tx_av_chunks);
+ return 0;
+}
+
+static void ccid2_hc_tx_exit(struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ int i;
+
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+
+ for (i = 0; i < hc->tx_seqbufc; i++)
+ kfree(hc->tx_seqbuf[i]);
+ hc->tx_seqbufc = 0;
+}
+
+static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
+
+ if (!dccp_data_packet(skb))
+ return;
+
+ if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
+ dccp_send_ack(sk);
+ hc->rx_num_data_pkts = 0;
+ }
+}
+
+struct ccid_operations ccid2_ops = {
+ .ccid_id = DCCPC_CCID2,
+ .ccid_name = "TCP-like",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
+ .ccid_hc_tx_init = ccid2_hc_tx_init,
+ .ccid_hc_tx_exit = ccid2_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
+ .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+ .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
+ .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
+};
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+module_param(ccid2_debug, bool, 0644);
+MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
+#endif
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
new file mode 100644
index 000000000..18c97543e
--- /dev/null
+++ b/net/dccp/ccids/ccid2.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID2_H_
+#define _DCCP_CCID2_H_
+
+#include <linux/timer.h>
+#include <linux/types.h>
+#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp tcp_time_stamp
+
+/* NUMDUPACK parameter from RFC 4341, p. 6 */
+#define NUMDUPACK 3
+
+struct ccid2_seq {
+ u64 ccid2s_seq;
+ u32 ccid2s_sent;
+ int ccid2s_acked;
+ struct ccid2_seq *ccid2s_prev;
+ struct ccid2_seq *ccid2s_next;
+};
+
+#define CCID2_SEQBUF_LEN 1024
+#define CCID2_SEQBUF_MAX 128
+
+/*
+ * Multiple of congestion window to keep the sequence window at
+ * (RFC 4340 7.5.2)
+ */
+#define CCID2_WIN_CHANGE_FACTOR 5
+
+/**
+ * struct ccid2_hc_tx_sock - CCID2 TX half connection
+ * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
+ * @tx_srtt: smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev: smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max: maximum of @mdev during one flight
+ * @tx_rttvar: moving average/maximum of @mdev_max
+ * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq: to decay RTTVAR at most once per flight
+ * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
+ * @tx_expected_wnd: moving average of @tx_cwnd_used
+ * @tx_cwnd_stamp: to track idle periods in CWV
+ * @tx_lsndtime: last time (in jiffies) a data packet was sent
+ * @tx_rpseq: last consecutive seqno
+ * @tx_rpdupack: dupacks since rpseq
+ * @tx_av_chunks: list of Ack Vectors received on current skb
+ */
+struct ccid2_hc_tx_sock {
+ u32 tx_cwnd;
+ u32 tx_ssthresh;
+ u32 tx_pipe;
+ u32 tx_packets_acked;
+ struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
+ int tx_seqbufc;
+ struct ccid2_seq *tx_seqh;
+ struct ccid2_seq *tx_seqt;
+
+ /* RTT measurement: variables/principles are the same as in TCP */
+ u32 tx_srtt,
+ tx_mdev,
+ tx_mdev_max,
+ tx_rttvar,
+ tx_rto;
+ u64 tx_rtt_seq:48;
+ struct timer_list tx_rtotimer;
+
+ /* Congestion Window validation (optional, RFC 2861) */
+ u32 tx_cwnd_used,
+ tx_expected_wnd,
+ tx_cwnd_stamp,
+ tx_lsndtime;
+
+ u64 tx_rpseq;
+ int tx_rpdupack;
+ u32 tx_last_cong;
+ u64 tx_high_ack;
+ struct list_head tx_av_chunks;
+};
+
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+ return hc->tx_pipe >= hc->tx_cwnd;
+}
+
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ * This is based on the numbers specified in RFC 5681, 3.1.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+ return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
+}
+
+/**
+ * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
+ * @rx_num_data_pkts: number of data packets received since last feedback
+ */
+struct ccid2_hc_rx_sock {
+ u32 rx_num_data_pkts;
+};
+
+static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+{
+ return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+}
+
+static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
+{
+ return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+}
+#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 000000000..119c04317
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,870 @@
+/*
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include "../dccp.h"
+#include "ccid3.h"
+
+#include <asm/unaligned.h>
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static bool ccid3_debug;
+#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+/*
+ * Transmitter Half-Connection Routines
+ */
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+ static const char *const ccid3_state_names[] = {
+ [TFRC_SSTATE_NO_SENT] = "NO_SENT",
+ [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+ [TFRC_SSTATE_FBACK] = "FBACK",
+ };
+
+ return ccid3_state_names[state];
+}
+#endif
+
+static void ccid3_hc_tx_set_state(struct sock *sk,
+ enum ccid3_hc_tx_states state)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ enum ccid3_hc_tx_states oldstate = hc->tx_state;
+
+ ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+ dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+ ccid3_tx_state_name(state));
+ WARN_ON(state == oldstate);
+ hc->tx_state = state;
+}
+
+/*
+ * Compute the initial sending rate X_init in the manner of RFC 3390:
+ *
+ * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
+ *
+ * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
+ * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
+ * For consistency with other parts of the code, X_init is scaled by 2^6.
+ */
+static inline u64 rfc3390_initial_rate(struct sock *sk)
+{
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
+
+ return scaled_div(w_init << 6, hc->tx_rtt);
+}
+
+/**
+ * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
+ */
+static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
+{
+ hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
+
+ DCCP_BUG_ON(hc->tx_t_ipi == 0);
+ ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+ hc->tx_s, (unsigned int)(hc->tx_x >> 6));
+}
+
+static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
+{
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
+
+ return delta / hc->tx_rtt;
+}
+
+/**
+ * ccid3_hc_tx_update_x - Update allowed sending rate X
+ * @stamp: most recent time if available - can be left NULL.
+ *
+ * This function tracks draft rfc3448bis, check there for latest details.
+ *
+ * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
+ * fine-grained resolution of sending rates. This requires scaling by 2^6
+ * throughout the code. Only X_calc is unscaled (in bytes/second).
+ *
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ __u64 min_rate = 2 * hc->tx_x_recv;
+ const __u64 old_x = hc->tx_x;
+ ktime_t now = stamp ? *stamp : ktime_get_real();
+
+ /*
+ * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
+ * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
+ * a sender is idle if it has not sent anything over a 2-RTT-period.
+ * For consistency with X and X_recv, min_rate is also scaled by 2^6.
+ */
+ if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
+ min_rate = rfc3390_initial_rate(sk);
+ min_rate = max(min_rate, 2 * hc->tx_x_recv);
+ }
+
+ if (hc->tx_p > 0) {
+
+ hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
+ hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+
+ } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
+
+ hc->tx_x = min(2 * hc->tx_x, min_rate);
+ hc->tx_x = max(hc->tx_x,
+ scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
+ hc->tx_t_ld = now;
+ }
+
+ if (hc->tx_x != old_x) {
+ ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
+ "X_recv=%u\n", (unsigned int)(old_x >> 6),
+ (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6));
+
+ ccid3_update_send_interval(hc);
+ }
+}
+
+/**
+ * ccid3_hc_tx_update_s - Track the mean packet size `s'
+ * @len: DCCP packet payload size in bytes
+ *
+ * cf. RFC 4342, 5.3 and RFC 3448, 4.1
+ */
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
+{
+ const u16 old_s = hc->tx_s;
+
+ hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
+
+ if (hc->tx_s != old_s)
+ ccid3_update_send_interval(hc);
+}
+
+/*
+ * Update Window Counter using the algorithm from [RFC 4342, 8.1].
+ * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
+ */
+static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
+ ktime_t now)
+{
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
+ quarter_rtts = (4 * delta) / hc->tx_rtt;
+
+ if (quarter_rtts > 0) {
+ hc->tx_t_last_win_count = now;
+ hc->tx_last_win_count += min(quarter_rtts, 5U);
+ hc->tx_last_win_count &= 0xF; /* mod 16 */
+ }
+}
+
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ unsigned long t_nfb = USEC_PER_SEC / 5;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ /* XXX: set some sensible MIB */
+ goto restart_timer;
+ }
+
+ ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
+ ccid3_tx_state_name(hc->tx_state));
+
+ /* Ignore and do not restart after leaving the established state */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+ goto out;
+
+ /* Reset feedback state to "no feedback received" */
+ if (hc->tx_state == TFRC_SSTATE_FBACK)
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
+ /*
+ * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+ * RTO is 0 if and only if no feedback has been received yet.
+ */
+ if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
+
+ /* halve send rate directly */
+ hc->tx_x = max(hc->tx_x / 2,
+ (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+ ccid3_update_send_interval(hc);
+ } else {
+ /*
+ * Modify the cached value of X_recv
+ *
+ * If (X_calc > 2 * X_recv)
+ * X_recv = max(X_recv / 2, s / (2 * t_mbi));
+ * Else
+ * X_recv = X_calc / 4;
+ *
+ * Note that X_recv is scaled by 2^6 while X_calc is not
+ */
+ if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
+ hc->tx_x_recv =
+ max(hc->tx_x_recv / 2,
+ (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
+ else {
+ hc->tx_x_recv = hc->tx_x_calc;
+ hc->tx_x_recv <<= 4;
+ }
+ ccid3_hc_tx_update_x(sk, NULL);
+ }
+ ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
+ (unsigned long long)hc->tx_x);
+
+ /*
+ * Set new timeout for the nofeedback timer.
+ * See comments in packet_recv() regarding the value of t_RTO.
+ */
+ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
+ t_nfb = TFRC_INITIAL_TIMEOUT;
+ else
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
+
+restart_timer:
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(t_nfb));
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/**
+ * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ *
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
+ */
+static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ ktime_t now = ktime_get_real();
+ s64 delay;
+
+ /*
+ * This function is called only for Data and DataAck packets. Sending
+ * zero-sized Data(Ack)s is theoretically possible, but for congestion
+ * control this case is pathological - ignore it.
+ */
+ if (unlikely(skb->len == 0))
+ return -EBADMSG;
+
+ if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
+ usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
+ hc->tx_last_win_count = 0;
+ hc->tx_t_last_win_count = now;
+
+ /* Set t_0 for initial packet */
+ hc->tx_t_nom = now;
+
+ hc->tx_s = skb->len;
+
+ /*
+ * Use initial RTT sample when available: recommended by erratum
+ * to RFC 4342. This implements the initialisation procedure of
+ * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
+ */
+ if (dp->dccps_syn_rtt) {
+ ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
+ hc->tx_rtt = dp->dccps_syn_rtt;
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
+ } else {
+ /*
+ * Sender does not have RTT sample:
+ * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+ * is needed in several parts (e.g. window counter);
+ * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+ */
+ hc->tx_rtt = DCCP_FALLBACK_RTT;
+ hc->tx_x = hc->tx_s;
+ hc->tx_x <<= 6;
+ }
+ ccid3_update_send_interval(hc);
+
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
+ } else {
+ delay = ktime_us_delta(hc->tx_t_nom, now);
+ ccid3_pr_debug("delay=%ld\n", (long)delay);
+ /*
+ * Scheduling of packet transmissions (RFC 5348, 8.3)
+ *
+ * if (t_now > t_nom - delta)
+ * // send the packet now
+ * else
+ * // send the packet in (t_nom - t_now) milliseconds.
+ */
+ if (delay >= TFRC_T_DELTA)
+ return (u32)delay / USEC_PER_MSEC;
+
+ ccid3_hc_tx_update_win_count(hc, now);
+ }
+
+ /* prepare to send now (add options etc.) */
+ dp->dccps_hc_tx_insert_options = 1;
+ DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
+
+ /* set the nominal send time for the next following packet */
+ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
+ return CCID_PACKET_SEND_AT_ONCE;
+}
+
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+
+ ccid3_hc_tx_update_s(hc, len);
+
+ if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
+ DCCP_CRIT("packet history - out of memory!");
+}
+
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_hist_entry *acked;
+ ktime_t now;
+ unsigned long t_nfb;
+ u32 r_sample;
+
+ /* we are only interested in ACKs */
+ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+ DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+ return;
+ /*
+ * Locate the acknowledged packet in the TX history.
+ *
+ * Returning "entry not found" here can for instance happen when
+ * - the host has not sent out anything (e.g. a passive server),
+ * - the Ack is outdated (packet with higher Ack number was received),
+ * - it is a bogus Ack (for a packet not sent on this connection).
+ */
+ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+ if (acked == NULL)
+ return;
+ /* For the sake of RTT sampling, ignore/remove all older entries */
+ tfrc_tx_hist_purge(&acked->next);
+
+ /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+ now = ktime_get_real();
+ r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
+ hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
+ /*
+ * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
+ */
+ if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+
+ if (hc->tx_t_rto == 0) {
+ /*
+ * Initial feedback packet: Larger Initial Windows (4.2)
+ */
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
+
+ ccid3_update_send_interval(hc);
+
+ goto done_computing_x;
+ } else if (hc->tx_p == 0) {
+ /*
+ * First feedback after nofeedback timer expiry (4.3)
+ */
+ goto done_computing_x;
+ }
+ }
+
+ /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
+ if (hc->tx_p > 0)
+ hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
+ ccid3_hc_tx_update_x(sk, &now);
+
+done_computing_x:
+ ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
+ "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
+ dccp_role(sk), sk, hc->tx_rtt, r_sample,
+ hc->tx_s, hc->tx_p, hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6),
+ (unsigned int)(hc->tx_x >> 6));
+
+ /* unschedule no feedback timer */
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
+
+ /*
+ * As we have calculated new ipi, delta, t_nom it is possible
+ * that we now can send a packet, so wake up dccp_wait_for_ccid
+ */
+ sk->sk_write_space(sk);
+
+ /*
+ * Update timeout interval for the nofeedback timer. In order to control
+ * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+ * tunable RTAX_RTO_MIN value as the lower bound.
+ */
+ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+ USEC_PER_SEC/HZ * tcp_rto_min(sk));
+ /*
+ * Schedule no feedback timer to expire in
+ * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
+ */
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
+
+ ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
+ "expire in %lu jiffies (%luus)\n",
+ dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
+
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(t_nfb));
+}
+
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ __be32 opt_val;
+
+ switch (option) {
+ case TFRC_OPT_RECEIVE_RATE:
+ case TFRC_OPT_LOSS_EVENT_RATE:
+ /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+ if (packet_type == DCCP_PKT_DATA)
+ break;
+ if (unlikely(optlen != 4)) {
+ DCCP_WARN("%s(%p), invalid len %d for %u\n",
+ dccp_role(sk), sk, optlen, option);
+ return -EINVAL;
+ }
+ opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+ if (option == TFRC_OPT_RECEIVE_RATE) {
+ /* Receive Rate is kept in units of 64 bytes/second */
+ hc->tx_x_recv = opt_val;
+ hc->tx_x_recv <<= 6;
+
+ ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
+ } else {
+ /* Update the fixpoint Loss Event Rate fraction */
+ hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+ ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
+ }
+ }
+ return 0;
+}
+
+static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+ struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
+
+ hc->tx_state = TFRC_SSTATE_NO_SENT;
+ hc->tx_hist = NULL;
+ setup_timer(&hc->tx_no_feedback_timer,
+ ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
+ return 0;
+}
+
+static void ccid3_hc_tx_exit(struct sock *sk)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
+ tfrc_tx_hist_purge(&hc->tx_hist);
+}
+
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+ info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+ info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
+}
+
+static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_info tfrc;
+ const void *val;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_CCID_TX_INFO:
+ if (len < sizeof(tfrc))
+ return -EINVAL;
+ memset(&tfrc, 0, sizeof(tfrc));
+ tfrc.tfrctx_x = hc->tx_x;
+ tfrc.tfrctx_x_recv = hc->tx_x_recv;
+ tfrc.tfrctx_x_calc = hc->tx_x_calc;
+ tfrc.tfrctx_rtt = hc->tx_rtt;
+ tfrc.tfrctx_p = hc->tx_p;
+ tfrc.tfrctx_rto = hc->tx_t_rto;
+ tfrc.tfrctx_ipi = hc->tx_t_ipi;
+ len = sizeof(tfrc);
+ val = &tfrc;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen) || copy_to_user(optval, val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Receiver Half-Connection Routines
+ */
+
+/* CCID3 feedback types */
+enum ccid3_fback_type {
+ CCID3_FBACK_NONE = 0,
+ CCID3_FBACK_INITIAL,
+ CCID3_FBACK_PERIODIC,
+ CCID3_FBACK_PARAM_CHANGE
+};
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+ static const char *const ccid3_rx_state_names[] = {
+ [TFRC_RSTATE_NO_DATA] = "NO_DATA",
+ [TFRC_RSTATE_DATA] = "DATA",
+ };
+
+ return ccid3_rx_state_names[state];
+}
+#endif
+
+static void ccid3_hc_rx_set_state(struct sock *sk,
+ enum ccid3_hc_rx_states state)
+{
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ enum ccid3_hc_rx_states oldstate = hc->rx_state;
+
+ ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+ dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
+ ccid3_rx_state_name(state));
+ WARN_ON(state == oldstate);
+ hc->rx_state = state;
+}
+
+static void ccid3_hc_rx_send_feedback(struct sock *sk,
+ const struct sk_buff *skb,
+ enum ccid3_fback_type fbtype)
+{
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ ktime_t now = ktime_get_real();
+ s64 delta = 0;
+
+ switch (fbtype) {
+ case CCID3_FBACK_INITIAL:
+ hc->rx_x_recv = 0;
+ hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
+ break;
+ case CCID3_FBACK_PARAM_CHANGE:
+ /*
+ * When parameters change (new loss or p > p_prev), we do not
+ * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
+ * need to reuse the previous value of X_recv. However, when
+ * X_recv was 0 (due to early loss), this would kill X down to
+ * s/t_mbi (i.e. one packet in 64 seconds).
+ * To avoid such drastic reduction, we approximate X_recv as
+ * the number of bytes since last feedback.
+ * This is a safe fallback, since X is bounded above by X_calc.
+ */
+ if (hc->rx_x_recv > 0)
+ break;
+ /* fall through */
+ case CCID3_FBACK_PERIODIC:
+ delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
+ if (delta <= 0)
+ DCCP_BUG("delta (%ld) <= 0", (long)delta);
+ else
+ hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
+ break;
+ default:
+ return;
+ }
+
+ ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
+ hc->rx_x_recv, hc->rx_pinv);
+
+ hc->rx_tstamp_last_feedback = now;
+ hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
+ hc->rx_bytes_recv = 0;
+
+ dp->dccps_hc_rx_insert_options = 1;
+ dccp_send_ack(sk);
+}
+
+static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ __be32 x_recv, pinv;
+
+ if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
+ return 0;
+
+ if (dccp_packet_without_ack(skb))
+ return 0;
+
+ x_recv = htonl(hc->rx_x_recv);
+ pinv = htonl(hc->rx_pinv);
+
+ if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
+ &pinv, sizeof(pinv)) ||
+ dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
+ &x_recv, sizeof(x_recv)))
+ return -1;
+
+ return 0;
+}
+
+/**
+ * ccid3_first_li - Implements [RFC 5348, 6.3.1]
+ *
+ * Determine the length of the first loss interval via inverse lookup.
+ * Assume that X_recv can be computed by the throughput equation
+ * s
+ * X_recv = --------
+ * R * fval
+ * Find some p such that f(p) = fval; return 1/p (scaled).
+ */
+static u32 ccid3_first_li(struct sock *sk)
+{
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ u32 x_recv, p, delta;
+ u64 fval;
+
+ if (hc->rx_rtt == 0) {
+ DCCP_WARN("No RTT estimate available, using fallback RTT\n");
+ hc->rx_rtt = DCCP_FALLBACK_RTT;
+ }
+
+ delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback));
+ x_recv = scaled_div32(hc->rx_bytes_recv, delta);
+ if (x_recv == 0) { /* would also trigger divide-by-zero */
+ DCCP_WARN("X_recv==0\n");
+ if (hc->rx_x_recv == 0) {
+ DCCP_BUG("stored value of X_recv is zero");
+ return ~0U;
+ }
+ x_recv = hc->rx_x_recv;
+ }
+
+ fval = scaled_div(hc->rx_s, hc->rx_rtt);
+ fval = scaled_div32(fval, x_recv);
+ p = tfrc_calc_x_reverse_lookup(fval);
+
+ ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
+ "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
+
+ return p == 0 ? ~0U : scaled_div(1, p);
+}
+
+static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
+ const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
+ const bool is_data_packet = dccp_data_packet(skb);
+
+ if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
+ if (is_data_packet) {
+ const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+ do_feedback = CCID3_FBACK_INITIAL;
+ ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+ hc->rx_s = payload;
+ /*
+ * Not necessary to update rx_bytes_recv here,
+ * since X_recv = 0 for the first feedback packet (cf.
+ * RFC 3448, 6.3) -- gerrit
+ */
+ }
+ goto update_records;
+ }
+
+ if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
+ return; /* done receiving */
+
+ if (is_data_packet) {
+ const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+ /*
+ * Update moving-average of s and the sum of received payload bytes
+ */
+ hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
+ hc->rx_bytes_recv += payload;
+ }
+
+ /*
+ * Perform loss detection and handle pending losses
+ */
+ if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
+ skb, ndp, ccid3_first_li, sk)) {
+ do_feedback = CCID3_FBACK_PARAM_CHANGE;
+ goto done_receiving;
+ }
+
+ if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
+ return; /* done receiving */
+
+ /*
+ * Handle data packets: RTT sampling and monitoring p
+ */
+ if (unlikely(!is_data_packet))
+ goto update_records;
+
+ if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
+ const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
+ /*
+ * Empty loss history: no loss so far, hence p stays 0.
+ * Sample RTT values, since an RTT estimate is required for the
+ * computation of p when the first loss occurs; RFC 3448, 6.3.1.
+ */
+ if (sample != 0)
+ hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
+
+ } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
+ /*
+ * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
+ * has decreased (resp. p has increased), send feedback now.
+ */
+ do_feedback = CCID3_FBACK_PARAM_CHANGE;
+ }
+
+ /*
+ * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
+ */
+ if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
+ do_feedback = CCID3_FBACK_PERIODIC;
+
+update_records:
+ tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
+
+done_receiving:
+ if (do_feedback)
+ ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
+}
+
+static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+ struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
+
+ hc->rx_state = TFRC_RSTATE_NO_DATA;
+ tfrc_lh_init(&hc->rx_li_hist);
+ return tfrc_rx_hist_alloc(&hc->rx_hist);
+}
+
+static void ccid3_hc_rx_exit(struct sock *sk)
+{
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+
+ tfrc_rx_hist_purge(&hc->rx_hist);
+ tfrc_lh_cleanup(&hc->rx_li_hist);
+}
+
+static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
+{
+ info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
+}
+
+static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ struct tfrc_rx_info rx_info;
+ const void *val;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_CCID_RX_INFO:
+ if (len < sizeof(rx_info))
+ return -EINVAL;
+ rx_info.tfrcrx_x_recv = hc->rx_x_recv;
+ rx_info.tfrcrx_rtt = hc->rx_rtt;
+ rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
+ len = sizeof(rx_info);
+ val = &rx_info;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen) || copy_to_user(optval, val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+struct ccid_operations ccid3_ops = {
+ .ccid_id = DCCPC_CCID3,
+ .ccid_name = "TCP-Friendly Rate Control",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
+ .ccid_hc_tx_init = ccid3_hc_tx_init,
+ .ccid_hc_tx_exit = ccid3_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
+ .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
+ .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
+ .ccid_hc_rx_init = ccid3_hc_rx_init,
+ .ccid_hc_rx_exit = ccid3_hc_rx_exit,
+ .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
+ .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
+ .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
+ .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
+ .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
+ .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
+};
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+module_param(ccid3_debug, bool, 0644);
+MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
+#endif
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 000000000..1a9933c29
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID3_H_
+#define _DCCP_CCID3_H_
+
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/tfrc.h>
+#include "lib/tfrc.h"
+#include "../ccid.h"
+
+/* Two seconds as per RFC 5348, 4.2 */
+#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
+
+/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
+#define TFRC_T_MBI 64
+
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
+#endif
+
+enum ccid3_options {
+ TFRC_OPT_LOSS_EVENT_RATE = 192,
+ TFRC_OPT_LOSS_INTERVALS = 193,
+ TFRC_OPT_RECEIVE_RATE = 194,
+};
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+ TFRC_SSTATE_NO_SENT = 1,
+ TFRC_SSTATE_NO_FBACK,
+ TFRC_SSTATE_FBACK,
+};
+
+/**
+ * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
+ * @tx_x: Current sending rate in 64 * bytes per second
+ * @tx_x_recv: Receive rate in 64 * bytes per second
+ * @tx_x_calc: Calculated rate in bytes per second
+ * @tx_rtt: Estimate of current round trip time in usecs
+ * @tx_p: Current loss event rate (0-1) scaled by 1000000
+ * @tx_s: Packet size in bytes
+ * @tx_t_rto: Nofeedback Timer setting in usecs
+ * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @tx_state: Sender state, one of %ccid3_hc_tx_states
+ * @tx_last_win_count: Last window counter sent
+ * @tx_t_last_win_count: Timestamp of earliest packet
+ * with last_win_count value sent
+ * @tx_no_feedback_timer: Handle to no feedback timer
+ * @tx_t_ld: Time last doubled during slow start
+ * @tx_t_nom: Nominal send time of next packet
+ * @tx_hist: Packet history
+ */
+struct ccid3_hc_tx_sock {
+ u64 tx_x;
+ u64 tx_x_recv;
+ u32 tx_x_calc;
+ u32 tx_rtt;
+ u32 tx_p;
+ u32 tx_t_rto;
+ u32 tx_t_ipi;
+ u16 tx_s;
+ enum ccid3_hc_tx_states tx_state:8;
+ u8 tx_last_win_count;
+ ktime_t tx_t_last_win_count;
+ struct timer_list tx_no_feedback_timer;
+ ktime_t tx_t_ld;
+ ktime_t tx_t_nom;
+ struct tfrc_tx_hist_entry *tx_hist;
+};
+
+static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
+{
+ struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+ BUG_ON(hctx == NULL);
+ return hctx;
+}
+
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+ TFRC_RSTATE_NO_DATA = 1,
+ TFRC_RSTATE_DATA,
+};
+
+/**
+ * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
+ * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
+ * @rx_state: Receiver state, one of %ccid3_hc_rx_states
+ * @rx_bytes_recv: Total sum of DCCP payload bytes
+ * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ * @rx_rtt: Receiver estimate of RTT
+ * @rx_tstamp_last_feedback: Time at which last feedback was sent
+ * @rx_hist: Packet history (loss detection + RTT sampling)
+ * @rx_li_hist: Loss Interval database
+ * @rx_s: Received packet size in bytes
+ * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+ */
+struct ccid3_hc_rx_sock {
+ u8 rx_last_counter:4;
+ enum ccid3_hc_rx_states rx_state:8;
+ u32 rx_bytes_recv;
+ u32 rx_x_recv;
+ u32 rx_rtt;
+ ktime_t rx_tstamp_last_feedback;
+ struct tfrc_rx_hist rx_hist;
+ struct tfrc_loss_hist rx_li_hist;
+ u16 rx_s;
+#define rx_pinv rx_li_hist.i_mean
+};
+
+static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+{
+ struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+ BUG_ON(hcrx == NULL);
+ return hcrx;
+}
+
+#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 000000000..57f9fd78c
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <net/sock.h>
+#include "tfrc.h"
+
+static struct kmem_cache *tfrc_lh_slab __read_mostly;
+/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
+static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
+
+/* implements LIFO semantics on the array */
+static inline u8 LIH_INDEX(const u8 ctr)
+{
+ return LIH_SIZE - 1 - (ctr % LIH_SIZE);
+}
+
+/* the `counter' index always points at the next entry to be populated */
+static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
+{
+ return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL;
+}
+
+/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
+static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
+{
+ BUG_ON(i >= lh->counter);
+ return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
+}
+
+/*
+ * On-demand allocation and de-allocation of entries
+ */
+static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
+{
+ if (lh->ring[LIH_INDEX(lh->counter)] == NULL)
+ lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab,
+ GFP_ATOMIC);
+ return lh->ring[LIH_INDEX(lh->counter)];
+}
+
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
+{
+ if (!tfrc_lh_is_initialised(lh))
+ return;
+
+ for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
+ if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
+ kmem_cache_free(tfrc_lh_slab,
+ lh->ring[LIH_INDEX(lh->counter)]);
+ lh->ring[LIH_INDEX(lh->counter)] = NULL;
+ }
+}
+
+static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
+{
+ u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
+ int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
+
+ if (k <= 0)
+ return;
+
+ for (i = 0; i <= k; i++) {
+ i_i = tfrc_lh_get_interval(lh, i);
+
+ if (i < k) {
+ i_tot0 += i_i * tfrc_lh_weights[i];
+ w_tot += tfrc_lh_weights[i];
+ }
+ if (i > 0)
+ i_tot1 += i_i * tfrc_lh_weights[i-1];
+ }
+
+ lh->i_mean = max(i_tot0, i_tot1) / w_tot;
+}
+
+/**
+ * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
+ * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
+ */
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
+{
+ struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
+ u32 old_i_mean = lh->i_mean;
+ s64 len;
+
+ if (cur == NULL) /* not initialised */
+ return 0;
+
+ len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
+
+ if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
+ return 0;
+
+ if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
+ /*
+ * Implements RFC 4342, 10.2:
+ * If a packet S (skb) exists whose seqno comes `after' the one
+ * starting the current loss interval (cur) and if the modulo-16
+ * distance from C(cur) to C(S) is greater than 4, consider all
+ * subsequent packets as belonging to a new loss interval. This
+ * test is necessary since CCVal may wrap between intervals.
+ */
+ cur->li_is_closed = 1;
+
+ if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
+ return 0;
+
+ cur->li_length = len;
+ tfrc_lh_calc_i_mean(lh);
+
+ return lh->i_mean < old_i_mean;
+}
+
+/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
+static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
+ struct tfrc_rx_hist_entry *new_loss)
+{
+ return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 &&
+ (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
+}
+
+/**
+ * tfrc_lh_interval_add - Insert new record into the Loss Interval database
+ * @lh: Loss Interval database
+ * @rh: Receive history containing a fresh loss event
+ * @calc_first_li: Caller-dependent routine to compute length of first interval
+ * @sk: Used by @calc_first_li in caller-specific way (subtyping)
+ *
+ * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
+ */
+int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
+ u32 (*calc_first_li)(struct sock *), struct sock *sk)
+{
+ struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
+
+ if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
+ return 0;
+
+ new = tfrc_lh_demand_next(lh);
+ if (unlikely(new == NULL)) {
+ DCCP_CRIT("Cannot allocate/add loss record.");
+ return 0;
+ }
+
+ new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
+ new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
+ new->li_is_closed = 0;
+
+ if (++lh->counter == 1)
+ lh->i_mean = new->li_length = (*calc_first_li)(sk);
+ else {
+ cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
+ new->li_length = dccp_delta_seqno(new->li_seqno,
+ tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
+ if (lh->counter > (2*LIH_SIZE))
+ lh->counter -= LIH_SIZE;
+
+ tfrc_lh_calc_i_mean(lh);
+ }
+ return 1;
+}
+
+int __init tfrc_li_init(void)
+{
+ tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
+ sizeof(struct tfrc_loss_interval), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ return tfrc_lh_slab == NULL ? -ENOBUFS : 0;
+}
+
+void tfrc_li_exit(void)
+{
+ if (tfrc_lh_slab != NULL) {
+ kmem_cache_destroy(tfrc_lh_slab);
+ tfrc_lh_slab = NULL;
+ }
+}
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 000000000..57f631a86
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,73 @@
+#ifndef _DCCP_LI_HIST_
+#define _DCCP_LI_HIST_
+/*
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/*
+ * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
+ * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
+ */
+#define NINTERVAL 8
+#define LIH_SIZE (NINTERVAL + 1)
+
+/**
+ * tfrc_loss_interval - Loss history record for TFRC-based protocols
+ * @li_seqno: Highest received seqno before the start of loss
+ * @li_ccval: The CCVal belonging to @li_seqno
+ * @li_is_closed: Whether @li_seqno is older than 1 RTT
+ * @li_length: Loss interval sequence length
+ */
+struct tfrc_loss_interval {
+ u64 li_seqno:48,
+ li_ccval:4,
+ li_is_closed:1;
+ u32 li_length;
+};
+
+/**
+ * tfrc_loss_hist - Loss record database
+ * @ring: Circular queue managed in LIFO manner
+ * @counter: Current count of entries (can be more than %LIH_SIZE)
+ * @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
+ */
+struct tfrc_loss_hist {
+ struct tfrc_loss_interval *ring[LIH_SIZE];
+ u8 counter;
+ u32 i_mean;
+};
+
+static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
+{
+ memset(lh, 0, sizeof(struct tfrc_loss_hist));
+}
+
+static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
+{
+ return lh->counter > 0;
+}
+
+static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
+{
+ return min(lh->counter, (u8)LIH_SIZE);
+}
+
+struct tfrc_rx_hist;
+
+int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+ u32 (*first_li)(struct sock *), struct sock *);
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
+
+#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 000000000..08df7a3ac
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "packet_history.h"
+#include "../../dccp.h"
+
+/*
+ * Transmitter History Routines
+ */
+static struct kmem_cache *tfrc_tx_hist_slab;
+
+int __init tfrc_tx_packet_history_init(void)
+{
+ tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
+ sizeof(struct tfrc_tx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
+}
+
+void tfrc_tx_packet_history_exit(void)
+{
+ if (tfrc_tx_hist_slab != NULL) {
+ kmem_cache_destroy(tfrc_tx_hist_slab);
+ tfrc_tx_hist_slab = NULL;
+ }
+}
+
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
+{
+ struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
+
+ if (entry == NULL)
+ return -ENOBUFS;
+ entry->seqno = seqno;
+ entry->stamp = ktime_get_real();
+ entry->next = *headp;
+ *headp = entry;
+ return 0;
+}
+
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
+{
+ struct tfrc_tx_hist_entry *head = *headp;
+
+ while (head != NULL) {
+ struct tfrc_tx_hist_entry *next = head->next;
+
+ kmem_cache_free(tfrc_tx_hist_slab, head);
+ head = next;
+ }
+
+ *headp = NULL;
+}
+
+/*
+ * Receiver History Routines
+ */
+static struct kmem_cache *tfrc_rx_hist_slab;
+
+int __init tfrc_rx_packet_history_init(void)
+{
+ tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
+ sizeof(struct tfrc_rx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
+}
+
+void tfrc_rx_packet_history_exit(void)
+{
+ if (tfrc_rx_hist_slab != NULL) {
+ kmem_cache_destroy(tfrc_rx_hist_slab);
+ tfrc_rx_hist_slab = NULL;
+ }
+}
+
+static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
+ const struct sk_buff *skb,
+ const u64 ndp)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+
+ entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ entry->tfrchrx_ccval = dh->dccph_ccval;
+ entry->tfrchrx_type = dh->dccph_type;
+ entry->tfrchrx_ndp = ndp;
+ entry->tfrchrx_tstamp = ktime_get_real();
+}
+
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
+ const struct sk_buff *skb,
+ const u64 ndp)
+{
+ struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
+
+ tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
+}
+
+/* has the packet contained in skb been seen before? */
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
+{
+ const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq;
+ int i;
+
+ if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0)
+ return 1;
+
+ for (i = 1; i <= h->loss_count; i++)
+ if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq)
+ return 1;
+
+ return 0;
+}
+
+static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
+{
+ const u8 idx_a = tfrc_rx_hist_index(h, a),
+ idx_b = tfrc_rx_hist_index(h, b);
+ struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
+
+ h->ring[idx_a] = h->ring[idx_b];
+ h->ring[idx_b] = tmp;
+}
+
+/*
+ * Private helper functions for loss detection.
+ *
+ * In the descriptions, `Si' refers to the sequence number of entry number i,
+ * whose NDP count is `Ni' (lower case is used for variables).
+ * Note: All __xxx_loss functions expect that a test against duplicates has been
+ * performed already: the seqno of the skb must not be less than the seqno
+ * of loss_prev; and it must not equal that of any valid history entry.
+ */
+static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
+{
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
+ h->loss_count = 1;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
+ }
+}
+
+static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
+{
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
+ h->loss_count = 2;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
+ return;
+ }
+
+ /* S0 < S2 < S1 */
+
+ if (dccp_loss_free(s0, s2, n2)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s2, s1, n1)) {
+ /* hole is filled: S0, S2, and S1 are consecutive */
+ h->loss_count = 0;
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ } else
+ /* gap between S2 and S1: just update loss_prev */
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
+
+ } else { /* gap between S0 and S2 */
+ /*
+ * Reorder history to insert S2 between S0 and S1
+ */
+ tfrc_rx_hist_swap(h, 0, 3);
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
+ h->loss_count = 2;
+ }
+}
+
+/* return 1 if a new loss event has been identified */
+static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
+{
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+ s3 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
+ h->loss_count = 3;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
+ return 1;
+ }
+
+ /* S3 < S2 */
+
+ if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
+ /*
+ * Reorder history to insert S3 between S1 and S2
+ */
+ tfrc_rx_hist_swap(h, 2, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
+ h->loss_count = 3;
+ return 1;
+ }
+
+ /* S0 < S3 < S1 */
+
+ if (dccp_loss_free(s0, s3, n3)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s3, s1, n1)) {
+ /* hole between S0 and S1 filled by S3 */
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s1, s2, n2)) {
+ /* entire hole filled by S0, S3, S1, S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 2);
+ h->loss_count = 0;
+ } else {
+ /* gap remains between S1 and S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ h->loss_count = 1;
+ }
+
+ } else /* gap exists between S3 and S1, loss_count stays at 2 */
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
+
+ return 0;
+ }
+
+ /*
+ * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
+ * Reorder history to insert S3 between S0 and S1.
+ */
+ tfrc_rx_hist_swap(h, 0, 3);
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
+ h->loss_count = 3;
+
+ return 1;
+}
+
+/* recycle RX history records to continue loss detection if necessary */
+static void __three_after_loss(struct tfrc_rx_hist *h)
+{
+ /*
+ * At this stage we know already that there is a gap between S0 and S1
+ * (since S0 was the highest sequence number received before detecting
+ * the loss). To recycle the loss record, it is thus only necessary to
+ * check for other possible gaps between S1/S2 and between S2/S3.
+ */
+ u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+ s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
+ n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s1, s2, n2)) {
+
+ if (dccp_loss_free(s2, s3, n3)) {
+ /* no gap between S2 and S3: entire hole is filled */
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ h->loss_count = 0;
+ } else {
+ /* gap between S2 and S3 */
+ h->loss_start = tfrc_rx_hist_index(h, 2);
+ h->loss_count = 1;
+ }
+
+ } else { /* gap between S1 and S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ h->loss_count = 2;
+ }
+}
+
+/**
+ * tfrc_rx_handle_loss - Loss detection and further processing
+ * @h: The non-empty RX history object
+ * @lh: Loss Intervals database to update
+ * @skb: Currently received packet
+ * @ndp: The NDP count belonging to @skb
+ * @calc_first_li: Caller-dependent computation of first loss interval in @lh
+ * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
+ *
+ * Chooses action according to pending loss, updates LI database when a new
+ * loss was detected, and does required post-processing. Returns 1 when caller
+ * should send feedback, 0 otherwise.
+ * Since it also takes care of reordering during loss detection and updates the
+ * records accordingly, the caller should not perform any more RX history
+ * operations when loss_count is greater than 0 after calling this function.
+ */
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+ struct tfrc_loss_hist *lh,
+ struct sk_buff *skb, const u64 ndp,
+ u32 (*calc_first_li)(struct sock *), struct sock *sk)
+{
+ int is_new_loss = 0;
+
+ if (h->loss_count == 0) {
+ __do_track_loss(h, skb, ndp);
+ } else if (h->loss_count == 1) {
+ __one_after_loss(h, skb, ndp);
+ } else if (h->loss_count != 2) {
+ DCCP_BUG("invalid loss_count %d", h->loss_count);
+ } else if (__two_after_loss(h, skb, ndp)) {
+ /*
+ * Update Loss Interval database and recycle RX records
+ */
+ is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
+ __three_after_loss(h);
+ }
+ return is_new_loss;
+}
+
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
+{
+ int i;
+
+ for (i = 0; i <= TFRC_NDUPACK; i++) {
+ h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+ if (h->ring[i] == NULL)
+ goto out_free;
+ }
+
+ h->loss_count = h->loss_start = 0;
+ return 0;
+
+out_free:
+ while (i-- != 0) {
+ kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+ h->ring[i] = NULL;
+ }
+ return -ENOBUFS;
+}
+
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
+{
+ int i;
+
+ for (i = 0; i <= TFRC_NDUPACK; ++i)
+ if (h->ring[i] != NULL) {
+ kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+ h->ring[i] = NULL;
+ }
+}
+
+/**
+ * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
+{
+ return h->ring[0];
+}
+
+/**
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
+{
+ return h->ring[h->rtt_sample_prev];
+}
+
+/**
+ * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
+ * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
+ * to compute a sample with given data - calling function should check this.
+ */
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
+{
+ u32 sample = 0,
+ delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+
+ if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
+ if (h->rtt_sample_prev == 2) { /* previous candidate stored */
+ sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+ if (sample)
+ sample = 4 / sample *
+ ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
+ else /*
+ * FIXME: This condition is in principle not
+ * possible but occurs when CCID is used for
+ * two-way data traffic. I have tried to trace
+ * it, but the cause does not seem to be here.
+ */
+ DCCP_BUG("please report to dccp@vger.kernel.org"
+ " => prev = %u, last = %u",
+ tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+ } else if (delta_v < 1) {
+ h->rtt_sample_prev = 1;
+ goto keep_ref_for_next_time;
+ }
+
+ } else if (delta_v == 4) /* optimal match */
+ sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
+ else { /* suboptimal match */
+ h->rtt_sample_prev = 2;
+ goto keep_ref_for_next_time;
+ }
+
+ if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
+ DCCP_WARN("RTT sample %u too large, using max\n", sample);
+ sample = DCCP_SANE_RTT_MAX;
+ }
+
+ h->rtt_sample_prev = 0; /* use current entry as next reference */
+keep_ref_for_next_time:
+
+ return sample;
+}
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 000000000..ee362b0b6
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,155 @@
+/*
+ * Packet RX/TX history data structures and routines for TFRC-based protocols.
+ *
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DCCP_PKT_HIST_
+#define _DCCP_PKT_HIST_
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "tfrc.h"
+
+/**
+ * tfrc_tx_hist_entry - Simple singly-linked TX history list
+ * @next: next oldest entry (LIFO order)
+ * @seqno: sequence number of this entry
+ * @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+ struct tfrc_tx_hist_entry *next;
+ u64 seqno;
+ ktime_t stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+ tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+ while (head != NULL && head->seqno != seqno)
+ head = head->next;
+ return head;
+}
+
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
+
+/* Subtraction a-b modulo-16, respects circular wrap-around */
+#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
+
+/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
+#define TFRC_NDUPACK 3
+
+/**
+ * tfrc_rx_hist_entry - Store information about a single received packet
+ * @tfrchrx_seqno: DCCP packet sequence number
+ * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
+ * @tfrchrx_ndp: the NDP count (if any) of the packet
+ * @tfrchrx_tstamp: actual receive time of packet
+ */
+struct tfrc_rx_hist_entry {
+ u64 tfrchrx_seqno:48,
+ tfrchrx_ccval:4,
+ tfrchrx_type:4;
+ u64 tfrchrx_ndp:48;
+ ktime_t tfrchrx_tstamp;
+};
+
+/**
+ * tfrc_rx_hist - RX history structure for TFRC-based protocols
+ * @ring: Packet history for RTT sampling and loss detection
+ * @loss_count: Number of entries in circular history
+ * @loss_start: Movable index (for loss detection)
+ * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
+ */
+struct tfrc_rx_hist {
+ struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
+ u8 loss_count:2,
+ loss_start:2;
+#define rtt_sample_prev loss_start
+};
+
+/**
+ * tfrc_rx_hist_index - index to reach n-th entry after loss_start
+ */
+static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
+{
+ return (h->loss_start + n) & TFRC_NDUPACK;
+}
+
+/**
+ * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
+{
+ return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
+}
+
+/**
+ * tfrc_rx_hist_entry - return the n-th history entry after loss_start
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
+{
+ return h->ring[tfrc_rx_hist_index(h, n)];
+}
+
+/**
+ * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
+{
+ return h->ring[h->loss_start];
+}
+
+/* indicate whether previously a packet was detected missing */
+static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
+{
+ return h->loss_count > 0;
+}
+
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
+ const u64 ndp);
+
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
+
+struct tfrc_loss_hist;
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
+ struct sk_buff *skb, const u64 ndp,
+ u32 (*first_li)(struct sock *sk), struct sock *sk);
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
+
+#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
new file mode 100644
index 000000000..62b5828ac
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -0,0 +1,45 @@
+/*
+ * TFRC library initialisation
+ *
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+#include <linux/moduleparam.h>
+#include "tfrc.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+bool tfrc_debug;
+module_param(tfrc_debug, bool, 0644);
+MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
+#endif
+
+int __init tfrc_lib_init(void)
+{
+ int rc = tfrc_li_init();
+
+ if (rc)
+ goto out;
+
+ rc = tfrc_tx_packet_history_init();
+ if (rc)
+ goto out_free_loss_intervals;
+
+ rc = tfrc_rx_packet_history_init();
+ if (rc)
+ goto out_free_tx_history;
+ return 0;
+
+out_free_tx_history:
+ tfrc_tx_packet_history_exit();
+out_free_loss_intervals:
+ tfrc_li_exit();
+out:
+ return rc;
+}
+
+void tfrc_lib_exit(void)
+{
+ tfrc_rx_packet_history_exit();
+ tfrc_tx_packet_history_exit();
+ tfrc_li_exit();
+}
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 000000000..40ee7d62b
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,77 @@
+#ifndef _TFRC_H_
+#define _TFRC_H_
+/*
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/math64.h>
+#include "../../dccp.h"
+
+/* internal includes that this library exports: */
+#include "loss_interval.h"
+#include "packet_history.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+extern bool tfrc_debug;
+#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
+#else
+#define tfrc_pr_debug(format, a...)
+#endif
+
+/* integer-arithmetic divisions of type (a * 1000000)/b */
+static inline u64 scaled_div(u64 a, u64 b)
+{
+ BUG_ON(b == 0);
+ return div64_u64(a * 1000000, b);
+}
+
+static inline u32 scaled_div32(u64 a, u64 b)
+{
+ u64 result = scaled_div(a, b);
+
+ if (result > UINT_MAX) {
+ DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+ (unsigned long long)a, (unsigned long long)b);
+ return UINT_MAX;
+ }
+ return result;
+}
+
+/**
+ * tfrc_ewma - Exponentially weighted moving average
+ * @weight: Weight to be used as damping factor, in units of 1/10
+ */
+static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
+{
+ return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
+}
+
+u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
+
+int tfrc_tx_packet_history_init(void);
+void tfrc_tx_packet_history_exit(void);
+int tfrc_rx_packet_history_init(void);
+void tfrc_rx_packet_history_exit(void);
+
+int tfrc_li_init(void);
+void tfrc_li_exit(void);
+
+#ifdef CONFIG_IP_DCCP_TFRC_LIB
+int tfrc_lib_init(void);
+void tfrc_lib_exit(void);
+#else
+#define tfrc_lib_init() (0)
+#define tfrc_lib_exit()
+#endif
+#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 000000000..88ef98285
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include "../../dccp.h"
+#include "tfrc.h"
+
+#define TFRC_CALC_X_ARRSIZE 500
+#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
+#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
+
+/*
+ TFRC TCP Reno Throughput Equation Lookup Table for f(p)
+
+ The following two-column lookup table implements a part of the TCP throughput
+ equation from [RFC 3448, sec. 3.1]:
+
+ s
+ X_calc = --------------------------------------------------------------
+ R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
+
+ Where:
+ X is the transmit rate in bytes/second
+ s is the packet size in bytes
+ R is the round trip time in seconds
+ p is the loss event rate, between 0 and 1.0, of the number of loss
+ events as a fraction of the number of packets transmitted
+ t_RTO is the TCP retransmission timeout value in seconds
+ b is the number of packets acknowledged by a single TCP ACK
+
+ We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
+
+ s
+ X_calc = -------------------------------------------------------
+ R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
+
+ which we can break down into:
+
+ s
+ X_calc = ---------
+ R * f(p)
+
+ where f(p) is given for 0 < p <= 1 by:
+
+ f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
+
+ Since this is kernel code, floating-point arithmetic is avoided in favour of
+ integer arithmetic. This means that nearly all fractional parameters are
+ scaled by 1000000:
+ * the parameters p and R
+ * the return result f(p)
+ The lookup table therefore actually tabulates the following function g(q):
+
+ g(q) = 1000000 * f(q/1000000)
+
+ Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
+ granularity for the practically more relevant case of small values of p (up to
+ 5%), the second column is used; the first one ranges up to 100%. This split
+ corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
+ determines the smallest resolution possible with this lookup table:
+
+ TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
+
+ The entire table is generated by:
+ for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
+ lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
+ lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
+ }
+
+ With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
+ lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
+ lookup[M][0] = g(1000000) = 1000000 * f(100%)
+ lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
+ lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
+
+ In summary, the two columns represent f(p) for the following ranges:
+ * The first column is for 0.002 <= p <= 1.0
+ * The second column is for 0.0001 <= p <= 0.05
+ Where the columns overlap, the second (finer-grained) is given preference,
+ i.e. the first column is used only for p >= 0.05.
+ */
+static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
+ { 37172, 8172 },
+ { 53499, 11567 },
+ { 66664, 14180 },
+ { 78298, 16388 },
+ { 89021, 18339 },
+ { 99147, 20108 },
+ { 108858, 21738 },
+ { 118273, 23260 },
+ { 127474, 24693 },
+ { 136520, 26052 },
+ { 145456, 27348 },
+ { 154316, 28589 },
+ { 163130, 29783 },
+ { 171919, 30935 },
+ { 180704, 32049 },
+ { 189502, 33130 },
+ { 198328, 34180 },
+ { 207194, 35202 },
+ { 216114, 36198 },
+ { 225097, 37172 },
+ { 234153, 38123 },
+ { 243294, 39055 },
+ { 252527, 39968 },
+ { 261861, 40864 },
+ { 271305, 41743 },
+ { 280866, 42607 },
+ { 290553, 43457 },
+ { 300372, 44293 },
+ { 310333, 45117 },
+ { 320441, 45929 },
+ { 330705, 46729 },
+ { 341131, 47518 },
+ { 351728, 48297 },
+ { 362501, 49066 },
+ { 373460, 49826 },
+ { 384609, 50577 },
+ { 395958, 51320 },
+ { 407513, 52054 },
+ { 419281, 52780 },
+ { 431270, 53499 },
+ { 443487, 54211 },
+ { 455940, 54916 },
+ { 468635, 55614 },
+ { 481581, 56306 },
+ { 494785, 56991 },
+ { 508254, 57671 },
+ { 521996, 58345 },
+ { 536019, 59014 },
+ { 550331, 59677 },
+ { 564939, 60335 },
+ { 579851, 60988 },
+ { 595075, 61636 },
+ { 610619, 62279 },
+ { 626491, 62918 },
+ { 642700, 63553 },
+ { 659253, 64183 },
+ { 676158, 64809 },
+ { 693424, 65431 },
+ { 711060, 66050 },
+ { 729073, 66664 },
+ { 747472, 67275 },
+ { 766266, 67882 },
+ { 785464, 68486 },
+ { 805073, 69087 },
+ { 825103, 69684 },
+ { 845562, 70278 },
+ { 866460, 70868 },
+ { 887805, 71456 },
+ { 909606, 72041 },
+ { 931873, 72623 },
+ { 954614, 73202 },
+ { 977839, 73778 },
+ { 1001557, 74352 },
+ { 1025777, 74923 },
+ { 1050508, 75492 },
+ { 1075761, 76058 },
+ { 1101544, 76621 },
+ { 1127867, 77183 },
+ { 1154739, 77741 },
+ { 1182172, 78298 },
+ { 1210173, 78852 },
+ { 1238753, 79405 },
+ { 1267922, 79955 },
+ { 1297689, 80503 },
+ { 1328066, 81049 },
+ { 1359060, 81593 },
+ { 1390684, 82135 },
+ { 1422947, 82675 },
+ { 1455859, 83213 },
+ { 1489430, 83750 },
+ { 1523671, 84284 },
+ { 1558593, 84817 },
+ { 1594205, 85348 },
+ { 1630518, 85878 },
+ { 1667543, 86406 },
+ { 1705290, 86932 },
+ { 1743770, 87457 },
+ { 1782994, 87980 },
+ { 1822973, 88501 },
+ { 1863717, 89021 },
+ { 1905237, 89540 },
+ { 1947545, 90057 },
+ { 1990650, 90573 },
+ { 2034566, 91087 },
+ { 2079301, 91600 },
+ { 2124869, 92111 },
+ { 2171279, 92622 },
+ { 2218543, 93131 },
+ { 2266673, 93639 },
+ { 2315680, 94145 },
+ { 2365575, 94650 },
+ { 2416371, 95154 },
+ { 2468077, 95657 },
+ { 2520707, 96159 },
+ { 2574271, 96660 },
+ { 2628782, 97159 },
+ { 2684250, 97658 },
+ { 2740689, 98155 },
+ { 2798110, 98651 },
+ { 2856524, 99147 },
+ { 2915944, 99641 },
+ { 2976382, 100134 },
+ { 3037850, 100626 },
+ { 3100360, 101117 },
+ { 3163924, 101608 },
+ { 3228554, 102097 },
+ { 3294263, 102586 },
+ { 3361063, 103073 },
+ { 3428966, 103560 },
+ { 3497984, 104045 },
+ { 3568131, 104530 },
+ { 3639419, 105014 },
+ { 3711860, 105498 },
+ { 3785467, 105980 },
+ { 3860253, 106462 },
+ { 3936229, 106942 },
+ { 4013410, 107422 },
+ { 4091808, 107902 },
+ { 4171435, 108380 },
+ { 4252306, 108858 },
+ { 4334431, 109335 },
+ { 4417825, 109811 },
+ { 4502501, 110287 },
+ { 4588472, 110762 },
+ { 4675750, 111236 },
+ { 4764349, 111709 },
+ { 4854283, 112182 },
+ { 4945564, 112654 },
+ { 5038206, 113126 },
+ { 5132223, 113597 },
+ { 5227627, 114067 },
+ { 5324432, 114537 },
+ { 5422652, 115006 },
+ { 5522299, 115474 },
+ { 5623389, 115942 },
+ { 5725934, 116409 },
+ { 5829948, 116876 },
+ { 5935446, 117342 },
+ { 6042439, 117808 },
+ { 6150943, 118273 },
+ { 6260972, 118738 },
+ { 6372538, 119202 },
+ { 6485657, 119665 },
+ { 6600342, 120128 },
+ { 6716607, 120591 },
+ { 6834467, 121053 },
+ { 6953935, 121514 },
+ { 7075025, 121976 },
+ { 7197752, 122436 },
+ { 7322131, 122896 },
+ { 7448175, 123356 },
+ { 7575898, 123815 },
+ { 7705316, 124274 },
+ { 7836442, 124733 },
+ { 7969291, 125191 },
+ { 8103877, 125648 },
+ { 8240216, 126105 },
+ { 8378321, 126562 },
+ { 8518208, 127018 },
+ { 8659890, 127474 },
+ { 8803384, 127930 },
+ { 8948702, 128385 },
+ { 9095861, 128840 },
+ { 9244875, 129294 },
+ { 9395760, 129748 },
+ { 9548529, 130202 },
+ { 9703198, 130655 },
+ { 9859782, 131108 },
+ { 10018296, 131561 },
+ { 10178755, 132014 },
+ { 10341174, 132466 },
+ { 10505569, 132917 },
+ { 10671954, 133369 },
+ { 10840345, 133820 },
+ { 11010757, 134271 },
+ { 11183206, 134721 },
+ { 11357706, 135171 },
+ { 11534274, 135621 },
+ { 11712924, 136071 },
+ { 11893673, 136520 },
+ { 12076536, 136969 },
+ { 12261527, 137418 },
+ { 12448664, 137867 },
+ { 12637961, 138315 },
+ { 12829435, 138763 },
+ { 13023101, 139211 },
+ { 13218974, 139658 },
+ { 13417071, 140106 },
+ { 13617407, 140553 },
+ { 13819999, 140999 },
+ { 14024862, 141446 },
+ { 14232012, 141892 },
+ { 14441465, 142339 },
+ { 14653238, 142785 },
+ { 14867346, 143230 },
+ { 15083805, 143676 },
+ { 15302632, 144121 },
+ { 15523842, 144566 },
+ { 15747453, 145011 },
+ { 15973479, 145456 },
+ { 16201939, 145900 },
+ { 16432847, 146345 },
+ { 16666221, 146789 },
+ { 16902076, 147233 },
+ { 17140429, 147677 },
+ { 17381297, 148121 },
+ { 17624696, 148564 },
+ { 17870643, 149007 },
+ { 18119154, 149451 },
+ { 18370247, 149894 },
+ { 18623936, 150336 },
+ { 18880241, 150779 },
+ { 19139176, 151222 },
+ { 19400759, 151664 },
+ { 19665007, 152107 },
+ { 19931936, 152549 },
+ { 20201564, 152991 },
+ { 20473907, 153433 },
+ { 20748982, 153875 },
+ { 21026807, 154316 },
+ { 21307399, 154758 },
+ { 21590773, 155199 },
+ { 21876949, 155641 },
+ { 22165941, 156082 },
+ { 22457769, 156523 },
+ { 22752449, 156964 },
+ { 23049999, 157405 },
+ { 23350435, 157846 },
+ { 23653774, 158287 },
+ { 23960036, 158727 },
+ { 24269236, 159168 },
+ { 24581392, 159608 },
+ { 24896521, 160049 },
+ { 25214642, 160489 },
+ { 25535772, 160929 },
+ { 25859927, 161370 },
+ { 26187127, 161810 },
+ { 26517388, 162250 },
+ { 26850728, 162690 },
+ { 27187165, 163130 },
+ { 27526716, 163569 },
+ { 27869400, 164009 },
+ { 28215234, 164449 },
+ { 28564236, 164889 },
+ { 28916423, 165328 },
+ { 29271815, 165768 },
+ { 29630428, 166208 },
+ { 29992281, 166647 },
+ { 30357392, 167087 },
+ { 30725779, 167526 },
+ { 31097459, 167965 },
+ { 31472452, 168405 },
+ { 31850774, 168844 },
+ { 32232445, 169283 },
+ { 32617482, 169723 },
+ { 33005904, 170162 },
+ { 33397730, 170601 },
+ { 33792976, 171041 },
+ { 34191663, 171480 },
+ { 34593807, 171919 },
+ { 34999428, 172358 },
+ { 35408544, 172797 },
+ { 35821174, 173237 },
+ { 36237335, 173676 },
+ { 36657047, 174115 },
+ { 37080329, 174554 },
+ { 37507197, 174993 },
+ { 37937673, 175433 },
+ { 38371773, 175872 },
+ { 38809517, 176311 },
+ { 39250924, 176750 },
+ { 39696012, 177190 },
+ { 40144800, 177629 },
+ { 40597308, 178068 },
+ { 41053553, 178507 },
+ { 41513554, 178947 },
+ { 41977332, 179386 },
+ { 42444904, 179825 },
+ { 42916290, 180265 },
+ { 43391509, 180704 },
+ { 43870579, 181144 },
+ { 44353520, 181583 },
+ { 44840352, 182023 },
+ { 45331092, 182462 },
+ { 45825761, 182902 },
+ { 46324378, 183342 },
+ { 46826961, 183781 },
+ { 47333531, 184221 },
+ { 47844106, 184661 },
+ { 48358706, 185101 },
+ { 48877350, 185541 },
+ { 49400058, 185981 },
+ { 49926849, 186421 },
+ { 50457743, 186861 },
+ { 50992759, 187301 },
+ { 51531916, 187741 },
+ { 52075235, 188181 },
+ { 52622735, 188622 },
+ { 53174435, 189062 },
+ { 53730355, 189502 },
+ { 54290515, 189943 },
+ { 54854935, 190383 },
+ { 55423634, 190824 },
+ { 55996633, 191265 },
+ { 56573950, 191706 },
+ { 57155606, 192146 },
+ { 57741621, 192587 },
+ { 58332014, 193028 },
+ { 58926806, 193470 },
+ { 59526017, 193911 },
+ { 60129666, 194352 },
+ { 60737774, 194793 },
+ { 61350361, 195235 },
+ { 61967446, 195677 },
+ { 62589050, 196118 },
+ { 63215194, 196560 },
+ { 63845897, 197002 },
+ { 64481179, 197444 },
+ { 65121061, 197886 },
+ { 65765563, 198328 },
+ { 66414705, 198770 },
+ { 67068508, 199213 },
+ { 67726992, 199655 },
+ { 68390177, 200098 },
+ { 69058085, 200540 },
+ { 69730735, 200983 },
+ { 70408147, 201426 },
+ { 71090343, 201869 },
+ { 71777343, 202312 },
+ { 72469168, 202755 },
+ { 73165837, 203199 },
+ { 73867373, 203642 },
+ { 74573795, 204086 },
+ { 75285124, 204529 },
+ { 76001380, 204973 },
+ { 76722586, 205417 },
+ { 77448761, 205861 },
+ { 78179926, 206306 },
+ { 78916102, 206750 },
+ { 79657310, 207194 },
+ { 80403571, 207639 },
+ { 81154906, 208084 },
+ { 81911335, 208529 },
+ { 82672880, 208974 },
+ { 83439562, 209419 },
+ { 84211402, 209864 },
+ { 84988421, 210309 },
+ { 85770640, 210755 },
+ { 86558080, 211201 },
+ { 87350762, 211647 },
+ { 88148708, 212093 },
+ { 88951938, 212539 },
+ { 89760475, 212985 },
+ { 90574339, 213432 },
+ { 91393551, 213878 },
+ { 92218133, 214325 },
+ { 93048107, 214772 },
+ { 93883493, 215219 },
+ { 94724314, 215666 },
+ { 95570590, 216114 },
+ { 96422343, 216561 },
+ { 97279594, 217009 },
+ { 98142366, 217457 },
+ { 99010679, 217905 },
+ { 99884556, 218353 },
+ { 100764018, 218801 },
+ { 101649086, 219250 },
+ { 102539782, 219698 },
+ { 103436128, 220147 },
+ { 104338146, 220596 },
+ { 105245857, 221046 },
+ { 106159284, 221495 },
+ { 107078448, 221945 },
+ { 108003370, 222394 },
+ { 108934074, 222844 },
+ { 109870580, 223294 },
+ { 110812910, 223745 },
+ { 111761087, 224195 },
+ { 112715133, 224646 },
+ { 113675069, 225097 },
+ { 114640918, 225548 },
+ { 115612702, 225999 },
+ { 116590442, 226450 },
+ { 117574162, 226902 },
+ { 118563882, 227353 },
+ { 119559626, 227805 },
+ { 120561415, 228258 },
+ { 121569272, 228710 },
+ { 122583219, 229162 },
+ { 123603278, 229615 },
+ { 124629471, 230068 },
+ { 125661822, 230521 },
+ { 126700352, 230974 },
+ { 127745083, 231428 },
+ { 128796039, 231882 },
+ { 129853241, 232336 },
+ { 130916713, 232790 },
+ { 131986475, 233244 },
+ { 133062553, 233699 },
+ { 134144966, 234153 },
+ { 135233739, 234608 },
+ { 136328894, 235064 },
+ { 137430453, 235519 },
+ { 138538440, 235975 },
+ { 139652876, 236430 },
+ { 140773786, 236886 },
+ { 141901190, 237343 },
+ { 143035113, 237799 },
+ { 144175576, 238256 },
+ { 145322604, 238713 },
+ { 146476218, 239170 },
+ { 147636442, 239627 },
+ { 148803298, 240085 },
+ { 149976809, 240542 },
+ { 151156999, 241000 },
+ { 152343890, 241459 },
+ { 153537506, 241917 },
+ { 154737869, 242376 },
+ { 155945002, 242835 },
+ { 157158929, 243294 },
+ { 158379673, 243753 },
+ { 159607257, 244213 },
+ { 160841704, 244673 },
+ { 162083037, 245133 },
+ { 163331279, 245593 },
+ { 164586455, 246054 },
+ { 165848586, 246514 },
+ { 167117696, 246975 },
+ { 168393810, 247437 },
+ { 169676949, 247898 },
+ { 170967138, 248360 },
+ { 172264399, 248822 },
+ { 173568757, 249284 },
+ { 174880235, 249747 },
+ { 176198856, 250209 },
+ { 177524643, 250672 },
+ { 178857621, 251136 },
+ { 180197813, 251599 },
+ { 181545242, 252063 },
+ { 182899933, 252527 },
+ { 184261908, 252991 },
+ { 185631191, 253456 },
+ { 187007807, 253920 },
+ { 188391778, 254385 },
+ { 189783129, 254851 },
+ { 191181884, 255316 },
+ { 192588065, 255782 },
+ { 194001698, 256248 },
+ { 195422805, 256714 },
+ { 196851411, 257181 },
+ { 198287540, 257648 },
+ { 199731215, 258115 },
+ { 201182461, 258582 },
+ { 202641302, 259050 },
+ { 204107760, 259518 },
+ { 205581862, 259986 },
+ { 207063630, 260454 },
+ { 208553088, 260923 },
+ { 210050262, 261392 },
+ { 211555174, 261861 },
+ { 213067849, 262331 },
+ { 214588312, 262800 },
+ { 216116586, 263270 },
+ { 217652696, 263741 },
+ { 219196666, 264211 },
+ { 220748520, 264682 },
+ { 222308282, 265153 },
+ { 223875978, 265625 },
+ { 225451630, 266097 },
+ { 227035265, 266569 },
+ { 228626905, 267041 },
+ { 230226576, 267514 },
+ { 231834302, 267986 },
+ { 233450107, 268460 },
+ { 235074016, 268933 },
+ { 236706054, 269407 },
+ { 238346244, 269881 },
+ { 239994613, 270355 },
+ { 241651183, 270830 },
+ { 243315981, 271305 }
+};
+
+/* return largest index i such that fval <= lookup[i][small] */
+static inline u32 tfrc_binsearch(u32 fval, u8 small)
+{
+ u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
+
+ while (low < high) {
+ try = (low + high) / 2;
+ if (fval <= tfrc_calc_x_lookup[try][small])
+ high = try;
+ else
+ low = try + 1;
+ }
+ return high;
+}
+
+/**
+ * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
+ * @s: packet size in bytes
+ * @R: RTT scaled by 1000000 (i.e., microseconds)
+ * @p: loss ratio estimate scaled by 1000000
+ *
+ * Returns X_calc in bytes per second (not scaled).
+ */
+u32 tfrc_calc_x(u16 s, u32 R, u32 p)
+{
+ u16 index;
+ u32 f;
+ u64 result;
+
+ /* check against invalid parameters and divide-by-zero */
+ BUG_ON(p > 1000000); /* p must not exceed 100% */
+ BUG_ON(p == 0); /* f(0) = 0, divide by zero */
+ if (R == 0) { /* possible divide by zero */
+ DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
+ return ~0U;
+ }
+
+ if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
+ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
+ DCCP_WARN("Value of p (%d) below resolution. "
+ "Substituting %d\n", p, TFRC_SMALLEST_P);
+ index = 0;
+ } else /* 0.0001 <= p <= 0.05 */
+ index = p/TFRC_SMALLEST_P - 1;
+
+ f = tfrc_calc_x_lookup[index][1];
+
+ } else { /* 0.05 < p <= 1.00 */
+ index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
+
+ f = tfrc_calc_x_lookup[index][0];
+ }
+
+ /*
+ * Compute X = s/(R*f(p)) in bytes per second.
+ * Since f(p) and R are both scaled by 1000000, we need to multiply by
+ * 1000000^2. To avoid overflow, the result is computed in two stages.
+ * This works under almost all reasonable operational conditions, for a
+ * wide range of parameters. Yet, should some strange combination of
+ * parameters result in overflow, the use of scaled_div32 will catch
+ * this and return UINT_MAX - which is a logically adequate consequence.
+ */
+ result = scaled_div(s, R);
+ return scaled_div32(result, f);
+}
+
+/**
+ * tfrc_calc_x_reverse_lookup - try to find p given f(p)
+ * @fvalue: function value to match, scaled by 1000000
+ *
+ * Returns closest match for p, also scaled by 1000000
+ */
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
+{
+ int index;
+
+ if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
+ return 0;
+
+ /* Error cases. */
+ if (fvalue < tfrc_calc_x_lookup[0][1]) {
+ DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+ return TFRC_SMALLEST_P;
+ }
+ if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
+ DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
+ return 1000000;
+ }
+
+ if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
+ index = tfrc_binsearch(fvalue, 1);
+ return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
+ }
+
+ /* else ... it must be in the coarse-grained column */
+ index = tfrc_binsearch(fvalue, 0);
+ return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
+}
+
+/**
+ * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+ if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
+ return 0;
+ if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
+ return 1000000;
+ return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 000000000..bebc735f5
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,499 @@
+#ifndef _DCCP_H
+#define _DCCP_H
+/*
+ * net/dccp/dccp.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/ktime.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include "ackvec.h"
+
+/*
+ * DCCP - specific warning and debugging macros.
+ */
+#define DCCP_WARN(fmt, ...) \
+ net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__)
+#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
+ __FILE__, __LINE__, __func__)
+#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
+#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
+ DCCP_BUG("\"%s\" holds (exception!)", \
+ __stringify(cond)); \
+ } while (0)
+
+#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
+ printk(fmt, ##args); \
+ } while(0)
+#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
+ "%s: " fmt, __func__, ##a)
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern bool dccp_debug;
+#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
+#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
+#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
+#else
+#define dccp_pr_debug(format, a...)
+#define dccp_pr_debug_cat(format, a...)
+#define dccp_debug(format, a...)
+#endif
+
+extern struct inet_hashinfo dccp_hashinfo;
+
+extern struct percpu_counter dccp_orphan_count;
+
+void dccp_time_wait(struct sock *sk, int state, int timeo);
+
+/*
+ * Set safe upper bounds for header and option length. Since Data Offset is 8
+ * bits (RFC 4340, sec. 5.1), the total header length can never be more than
+ * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
+ * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
+ * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
+ * Hence a safe upper bound for the maximum option length is 1020-28 = 992
+ */
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
+#define DCCP_MAX_PACKET_HDR 28
+#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
+#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
+
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
+
+#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
+ * state, about 60 seconds */
+
+/* RFC 1122, 4.2.3.1 initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
+
+/*
+ * The maximum back-off value for retransmissions. This is needed for
+ * - retransmitting client-Requests (sec. 8.1.1),
+ * - retransmitting Close/CloseReq when closing (sec. 8.3),
+ * - feature-negotiation retransmission (sec. 6.6.3),
+ * - Acks in client-PARTOPEN state (sec. 8.1.5).
+ */
+#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
+
+/*
+ * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
+ */
+#define DCCP_SANE_RTT_MIN 100
+#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
+#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
+
+/* sysctl variables for DCCP */
+extern int sysctl_dccp_request_retries;
+extern int sysctl_dccp_retries1;
+extern int sysctl_dccp_retries2;
+extern int sysctl_dccp_tx_qlen;
+extern int sysctl_dccp_sync_ratelimit;
+
+/*
+ * 48-bit sequence number arithmetic (signed and unsigned)
+ */
+#define INT48_MIN 0x800000000000LL /* 2^47 */
+#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */
+#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */
+#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
+#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x)))
+#define ADD48(a, b) (((a) + (b)) & UINT48_MAX)
+#define SUB48(a, b) ADD48((a), COMPLEMENT48(b))
+
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+ *seqno = value & UINT48_MAX;
+}
+
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+ *seqno = ADD48(*seqno, 1);
+}
+
+/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */
+static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
+{
+ u64 delta = SUB48(seqno2, seqno1);
+
+ return TO_SIGNED48(delta);
+}
+
+/* is seq1 < seq2 ? */
+static inline int before48(const u64 seq1, const u64 seq2)
+{
+ return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
+}
+
+/* is seq1 > seq2 ? */
+#define after48(seq1, seq2) before48(seq2, seq1)
+
+/* is seq2 <= seq1 <= seq3 ? */
+static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
+{
+ return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
+}
+
+static inline u64 max48(const u64 seq1, const u64 seq2)
+{
+ return after48(seq1, seq2) ? seq1 : seq2;
+}
+
+/**
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1: last known sequence number before the loss ('hole')
+ * @s2: first sequence number seen after the 'hole'
+ * @ndp: NDP count on packet with sequence number @s2
+ */
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
+{
+ s64 delta = dccp_delta_seqno(s1, s2);
+
+ WARN_ON(delta < 0);
+ delta -= ndp + 1;
+
+ return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+ return dccp_loss_count(s1, s2, ndp) == 0;
+}
+
+enum {
+ DCCP_MIB_NUM = 0,
+ DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
+ DCCP_MIB_ESTABRESETS, /* EstabResets */
+ DCCP_MIB_CURRESTAB, /* CurrEstab */
+ DCCP_MIB_OUTSEGS, /* OutSegs */
+ DCCP_MIB_OUTRSTS,
+ DCCP_MIB_ABORTONTIMEOUT,
+ DCCP_MIB_TIMEOUTS,
+ DCCP_MIB_ABORTFAILED,
+ DCCP_MIB_PASSIVEOPENS,
+ DCCP_MIB_ATTEMPTFAILS,
+ DCCP_MIB_OUTDATAGRAMS,
+ DCCP_MIB_INERRS,
+ DCCP_MIB_OPTMANDATORYERROR,
+ DCCP_MIB_INVALIDOPT,
+ __DCCP_MIB_MAX
+};
+
+#define DCCP_MIB_MAX __DCCP_MIB_MAX
+struct dccp_mib {
+ unsigned long mibs[DCCP_MIB_MAX];
+};
+
+DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
+#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
+
+/*
+ * Checksumming routines
+ */
+static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
+{
+ const struct dccp_hdr* dh = dccp_hdr(skb);
+
+ if (dh->dccph_cscov == 0)
+ return skb->len;
+ return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
+}
+
+static inline void dccp_csum_outgoing(struct sk_buff *skb)
+{
+ unsigned int cov = dccp_csum_coverage(skb);
+
+ if (cov >= skb->len)
+ dccp_hdr(skb)->dccph_cscov = 0;
+
+ skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
+}
+
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+
+int dccp_retransmit_skb(struct sock *sk);
+
+void dccp_send_ack(struct sock *sk);
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk);
+
+void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
+
+/*
+ * TX Packet Dequeueing Interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+bool dccp_qpolicy_full(struct sock *sk);
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+struct sk_buff *dccp_qpolicy_top(struct sock *sk);
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
+void dccp_write_xmit(struct sock *sk);
+void dccp_write_space(struct sock *sk);
+void dccp_flush_write_queue(struct sock *sk, long *time_budget);
+
+void dccp_init_xmit_timers(struct sock *sk);
+static inline void dccp_clear_xmit_timers(struct sock *sk)
+{
+ inet_csk_clear_xmit_timers(sk);
+}
+
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+
+const char *dccp_packet_name(const int type);
+
+void dccp_set_state(struct sock *sk, const int state);
+void dccp_done(struct sock *sk);
+
+int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+ struct sk_buff const *skb);
+
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
+
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req);
+
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned int len);
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len);
+
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
+void dccp_destroy_sock(struct sock *sk);
+
+void dccp_close(struct sock *sk, long timeout);
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req);
+
+int dccp_connect(struct sock *sk);
+int dccp_disconnect(struct sock *sk, int flags);
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+#ifdef CONFIG_COMPAT
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+#endif
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len);
+void dccp_shutdown(struct sock *sk, int how);
+int inet_dccp_listen(struct socket *sock, int backlog);
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+void dccp_req_err(struct sock *sk, u64 seq);
+
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
+void dccp_send_close(struct sock *sk, const int active);
+int dccp_invalid_packet(struct sk_buff *skb);
+u32 dccp_sample_rtt(struct sock *sk, long delta);
+
+static inline int dccp_bad_service_code(const struct sock *sk,
+ const __be32 service)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dp->dccps_service == service)
+ return 0;
+ return !dccp_list_has_service(dp->dccps_service_list, service);
+}
+
+/**
+ * dccp_skb_cb - DCCP per-packet control information
+ * @dccpd_type: one of %dccp_pkt_type (or unknown)
+ * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
+ * @dccpd_reset_code: one of %dccp_reset_codes
+ * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
+ * @dccpd_opt_len: total length of all options (5.8) in the packet
+ * @dccpd_seq: sequence number
+ * @dccpd_ack_seq: acknowledgment number subheader field value
+ *
+ * This is used for transmission as well as for reception.
+ */
+struct dccp_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
+ __u8 dccpd_type:4;
+ __u8 dccpd_ccval:4;
+ __u8 dccpd_reset_code,
+ dccpd_reset_data[3];
+ __u16 dccpd_opt_len;
+ __u64 dccpd_seq;
+ __u64 dccpd_ack_seq;
+};
+
+#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+
+/* RFC 4340, sec. 7.7 */
+static inline int dccp_non_data_packet(const struct sk_buff *skb)
+{
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+ return type == DCCP_PKT_ACK ||
+ type == DCCP_PKT_CLOSE ||
+ type == DCCP_PKT_CLOSEREQ ||
+ type == DCCP_PKT_RESET ||
+ type == DCCP_PKT_SYNC ||
+ type == DCCP_PKT_SYNCACK;
+}
+
+/* RFC 4340, sec. 7.7 */
+static inline int dccp_data_packet(const struct sk_buff *skb)
+{
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+ return type == DCCP_PKT_DATA ||
+ type == DCCP_PKT_DATAACK ||
+ type == DCCP_PKT_REQUEST ||
+ type == DCCP_PKT_RESPONSE;
+}
+
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+{
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+ return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
+}
+
+#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
+
+static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
+{
+ struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
+ sizeof(*dh));
+ dh->dccph_seq2 = 0;
+ dh->dccph_seq = htons((gss >> 32) & 0xfffff);
+ dhx->dccph_seq_low = htonl(gss & 0xffffffff);
+}
+
+static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
+ const u64 gsr)
+{
+ dhack->dccph_reserved1 = 0;
+ dhack->dccph_ack_nr_high = htons(gsr >> 32);
+ dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
+}
+
+static inline void dccp_update_gsr(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (after48(seq, dp->dccps_gsr))
+ dp->dccps_gsr = seq;
+ /* Sequence validity window depends on remote Sequence Window (7.5.1) */
+ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+ /*
+ * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+ * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+ * always > 32, so for the first W/W' packets in the lifetime of a
+ * connection we always have to adjust SWL.
+ * A second reason why we are doing this is that the window depends on
+ * the feature-remote value of Sequence Window: nothing stops the peer
+ * from updating this value while we are busy adjusting SWL for the
+ * first W packets (we would have to count from scratch again then).
+ * Therefore it is safer to always make sure that the Sequence Window
+ * is not artificially extended by a peer who grows SWL downwards by
+ * continually updating the feature-remote Sequence-Window.
+ * If sequence numbers wrap it is bad luck. But that will take a while
+ * (48 bit), and this measure prevents Sequence-number attacks.
+ */
+ if (before48(dp->dccps_swl, dp->dccps_isr))
+ dp->dccps_swl = dp->dccps_isr;
+ dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
+}
+
+static inline void dccp_update_gss(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_gss = seq;
+ /* Ack validity window depends on local Sequence Window value (7.5.1) */
+ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+ /* Adjust AWL so that it is not below ISS - see comment above for SWL */
+ if (before48(dp->dccps_awl, dp->dccps_iss))
+ dp->dccps_awl = dp->dccps_iss;
+ dp->dccps_awh = dp->dccps_gss;
+}
+
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+ !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
+}
+
+static inline int dccp_ack_pending(const struct sock *sk)
+{
+ return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
+}
+
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
+int dccp_feat_finalise_settings(struct dccp_sock *dp);
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+ struct sk_buff *skb);
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
+void dccp_feat_list_purge(struct list_head *fn_list);
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
+u32 dccp_timestamp(void);
+void dccp_timestamping_init(void);
+int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+ const void *value, unsigned char len);
+
+#ifdef CONFIG_SYSCTL
+int dccp_sysctl_init(void);
+void dccp_sysctl_exit(void);
+#else
+static inline int dccp_sysctl_init(void)
+{
+ return 0;
+}
+
+static inline void dccp_sysctl_exit(void)
+{
+}
+#endif
+
+#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 000000000..5a45f8de5
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,87 @@
+/*
+ * net/dccp/diag.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
+
+ if (dp->dccps_hc_rx_ackvec != NULL)
+ info->tcpi_options |= TCPI_OPT_SACK;
+
+ if (dp->dccps_hc_rx_ccid != NULL)
+ ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+
+ if (dp->dccps_hc_tx_ccid != NULL)
+ ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+}
+
+static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ r->idiag_rqueue = r->idiag_wqueue = 0;
+
+ if (_info != NULL)
+ dccp_get_info(sk, _info);
+}
+
+static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
+}
+
+static int dccp_diag_dump_one(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler dccp_diag_handler = {
+ .dump = dccp_diag_dump,
+ .dump_one = dccp_diag_dump_one,
+ .idiag_get_info = dccp_diag_get_info,
+ .idiag_type = IPPROTO_DCCP,
+};
+
+static int __init dccp_diag_init(void)
+{
+ return inet_diag_register(&dccp_diag_handler);
+}
+
+static void __exit dccp_diag_fini(void)
+{
+ inet_diag_unregister(&dccp_diag_handler);
+}
+
+module_init(dccp_diag_init);
+module_exit(dccp_diag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP inet_diag handler");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
new file mode 100644
index 000000000..1704948e6
--- /dev/null
+++ b/net/dccp/feat.c
@@ -0,0 +1,1561 @@
+/*
+ * net/dccp/feat.c
+ *
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ * Rewrote from scratch, some bits from earlier code by
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *
+ * ASSUMPTIONS
+ * -----------
+ * o Feature negotiation is coordinated with connection setup (as in TCP), wild
+ * changes of parameters of an established connection are not supported.
+ * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
+ * o All currently known SP features have 1-byte quantities. If in the future
+ * extensions of RFCs 4340..42 define features with item lengths larger than
+ * one byte, a feature-specific extension of the code will be required.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "ccid.h"
+#include "feat.h"
+
+/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
+unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
+int sysctl_dccp_rx_ccid __read_mostly = 2,
+ sysctl_dccp_tx_ccid __read_mostly = 2;
+
+/*
+ * Feature activation handlers.
+ *
+ * These all use an u64 argument, to provide enough room for NN/SP features. At
+ * this stage the negotiated values have been checked to be within their range.
+ */
+static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid *new_ccid = ccid_new(ccid, sk, rx);
+
+ if (new_ccid == NULL)
+ return -ENOMEM;
+
+ if (rx) {
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ dp->dccps_hc_rx_ccid = new_ccid;
+ } else {
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_tx_ccid = new_ccid;
+ }
+ return 0;
+}
+
+static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (rx) {
+ dp->dccps_r_seq_win = seq_win;
+ /* propagate changes to update SWL/SWH */
+ dccp_update_gsr(sk, dp->dccps_gsr);
+ } else {
+ dp->dccps_l_seq_win = seq_win;
+ /* propagate changes to update AWL */
+ dccp_update_gss(sk, dp->dccps_gss);
+ }
+ return 0;
+}
+
+static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
+{
+ if (rx)
+ dccp_sk(sk)->dccps_r_ack_ratio = ratio;
+ else
+ dccp_sk(sk)->dccps_l_ack_ratio = ratio;
+ return 0;
+}
+
+static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (rx) {
+ if (enable && dp->dccps_hc_rx_ackvec == NULL) {
+ dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
+ if (dp->dccps_hc_rx_ackvec == NULL)
+ return -ENOMEM;
+ } else if (!enable) {
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ }
+ }
+ return 0;
+}
+
+static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
+{
+ if (!rx)
+ dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
+ return 0;
+}
+
+/*
+ * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
+ * `rx' holds when the sending peer informs about his partial coverage via a
+ * ChangeR() option. In the other case, we are the sender and the receiver
+ * announces its coverage via ChangeL() options. The policy here is to honour
+ * such communication by enabling the corresponding partial coverage - but only
+ * if it has not been set manually before; the warning here means that all
+ * packets will be dropped.
+ */
+static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (rx)
+ dp->dccps_pcrlen = cscov;
+ else {
+ if (dp->dccps_pcslen == 0)
+ dp->dccps_pcslen = cscov;
+ else if (cscov > dp->dccps_pcslen)
+ DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
+ dp->dccps_pcslen, (u8)cscov);
+ }
+ return 0;
+}
+
+static const struct {
+ u8 feat_num; /* DCCPF_xxx */
+ enum dccp_feat_type rxtx; /* RX or TX */
+ enum dccp_feat_type reconciliation; /* SP or NN */
+ u8 default_value; /* as in 6.4 */
+ int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
+/*
+ * Lookup table for location and type of features (from RFC 4340/4342)
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | Feature | Location | Reconc. | Initial | Section |
+ * | | RX | TX | SP | NN | Value | Reference |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | DCCPF_CCID | | X | X | | 2 | 10 |
+ * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
+ * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
+ * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
+ * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
+ * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
+ * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
+ * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
+ * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
+ * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+ { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
+ { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
+ { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
+ { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
+ { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
+ { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
+ { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
+ { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+};
+#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
+
+/**
+ * dccp_feat_index - Hash function to map feature number into array position
+ * Returns consecutive array index or -1 if the feature is not understood.
+ */
+static int dccp_feat_index(u8 feat_num)
+{
+ /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
+ if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
+ return feat_num - 1;
+
+ /*
+ * Other features: add cases for new feature types here after adding
+ * them to the above table.
+ */
+ switch (feat_num) {
+ case DCCPF_SEND_LEV_RATE:
+ return DCCP_FEAT_SUPPORTED_MAX - 1;
+ }
+ return -1;
+}
+
+static u8 dccp_feat_type(u8 feat_num)
+{
+ int idx = dccp_feat_index(feat_num);
+
+ if (idx < 0)
+ return FEAT_UNKNOWN;
+ return dccp_feat_table[idx].reconciliation;
+}
+
+static int dccp_feat_default_value(u8 feat_num)
+{
+ int idx = dccp_feat_index(feat_num);
+ /*
+ * There are no default values for unknown features, so encountering a
+ * negative index here indicates a serious problem somewhere else.
+ */
+ DCCP_BUG_ON(idx < 0);
+
+ return idx < 0 ? 0 : dccp_feat_table[idx].default_value;
+}
+
+/*
+ * Debugging and verbose-printing section
+ */
+static const char *dccp_feat_fname(const u8 feat)
+{
+ static const char *const feature_names[] = {
+ [DCCPF_RESERVED] = "Reserved",
+ [DCCPF_CCID] = "CCID",
+ [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
+ [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
+ [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
+ [DCCPF_ACK_RATIO] = "Ack Ratio",
+ [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
+ [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
+ [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
+ [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
+ };
+ if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+ return feature_names[DCCPF_RESERVED];
+
+ if (feat == DCCPF_SEND_LEV_RATE)
+ return "Send Loss Event Rate";
+ if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+ return "CCID-specific";
+
+ return feature_names[feat];
+}
+
+static const char *const dccp_feat_sname[] = {
+ "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
+};
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_feat_oname(const u8 opt)
+{
+ switch (opt) {
+ case DCCPO_CHANGE_L: return "Change_L";
+ case DCCPO_CONFIRM_L: return "Confirm_L";
+ case DCCPO_CHANGE_R: return "Change_R";
+ case DCCPO_CONFIRM_R: return "Confirm_R";
+ }
+ return NULL;
+}
+
+static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
+{
+ u8 i, type = dccp_feat_type(feat_num);
+
+ if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
+ dccp_pr_debug_cat("(NULL)");
+ else if (type == FEAT_SP)
+ for (i = 0; i < val->sp.len; i++)
+ dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
+ else if (type == FEAT_NN)
+ dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
+ else
+ dccp_pr_debug_cat("unknown type %u", type);
+}
+
+static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
+{
+ u8 type = dccp_feat_type(feat_num);
+ dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
+
+ if (type == FEAT_NN)
+ fval.nn = dccp_decode_value_var(list, len);
+ dccp_feat_printval(feat_num, &fval);
+}
+
+static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
+{
+ dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
+ dccp_feat_fname(entry->feat_num));
+ dccp_feat_printval(entry->feat_num, &entry->val);
+ dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
+ entry->needs_confirm ? "(Confirm pending)" : "");
+}
+
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
+ dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
+ dccp_feat_printvals(feat, val, len); \
+ dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
+
+#define dccp_feat_print_fnlist(fn_list) { \
+ const struct dccp_feat_entry *___entry; \
+ \
+ dccp_pr_debug("List Dump:\n"); \
+ list_for_each_entry(___entry, fn_list, node) \
+ dccp_feat_print_entry(___entry); \
+}
+#else /* ! CONFIG_IP_DCCP_DEBUG */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
+#define dccp_feat_print_fnlist(fn_list)
+#endif
+
+static int __dccp_feat_activate(struct sock *sk, const int idx,
+ const bool is_local, dccp_feat_val const *fval)
+{
+ bool rx;
+ u64 val;
+
+ if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
+ return -1;
+ if (dccp_feat_table[idx].activation_hdlr == NULL)
+ return 0;
+
+ if (fval == NULL) {
+ val = dccp_feat_table[idx].default_value;
+ } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
+ if (fval->sp.vec == NULL) {
+ /*
+ * This can happen when an empty Confirm is sent
+ * for an SP (i.e. known) feature. In this case
+ * we would be using the default anyway.
+ */
+ DCCP_CRIT("Feature #%d undefined: using default", idx);
+ val = dccp_feat_table[idx].default_value;
+ } else {
+ val = fval->sp.vec[0];
+ }
+ } else {
+ val = fval->nn;
+ }
+
+ /* Location is RX if this is a local-RX or remote-TX feature */
+ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
+
+ dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
+ dccp_feat_fname(dccp_feat_table[idx].feat_num),
+ fval ? "" : "default ", (unsigned long long)val);
+
+ return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
+}
+
+/**
+ * dccp_feat_activate - Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ *
+ * For general use this function is preferable over __dccp_feat_activate().
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+ dccp_feat_val const *fval)
+{
+ return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
+/* Test for "Req'd" feature (RFC 4340, 6.4) */
+static inline int dccp_feat_must_be_understood(u8 feat_num)
+{
+ return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
+ feat_num == DCCPF_SEQUENCE_WINDOW;
+}
+
+/* copy constructor, fval must not already contain allocated memory */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+ fval->sp.len = len;
+ if (fval->sp.len > 0) {
+ fval->sp.vec = kmemdup(val, len, gfp_any());
+ if (fval->sp.vec == NULL) {
+ fval->sp.len = 0;
+ return -ENOBUFS;
+ }
+ }
+ return 0;
+}
+
+static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
+{
+ if (unlikely(val == NULL))
+ return;
+ if (dccp_feat_type(feat_num) == FEAT_SP)
+ kfree(val->sp.vec);
+ memset(val, 0, sizeof(*val));
+}
+
+static struct dccp_feat_entry *
+ dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+{
+ struct dccp_feat_entry *new;
+ u8 type = dccp_feat_type(original->feat_num);
+
+ if (type == FEAT_UNKNOWN)
+ return NULL;
+
+ new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+ if (new == NULL)
+ return NULL;
+
+ if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
+ original->val.sp.vec,
+ original->val.sp.len)) {
+ kfree(new);
+ return NULL;
+ }
+ return new;
+}
+
+static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
+{
+ if (entry != NULL) {
+ dccp_feat_val_destructor(entry->feat_num, &entry->val);
+ kfree(entry);
+ }
+}
+
+/*
+ * List management functions
+ *
+ * Feature negotiation lists rely on and maintain the following invariants:
+ * - each feat_num in the list is known, i.e. we know its type and default value
+ * - each feat_num/is_local combination is unique (old entries are overwritten)
+ * - SP values are always freshly allocated
+ * - list is sorted in increasing order of feature number (faster lookup)
+ */
+static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
+ u8 feat_num, bool is_local)
+{
+ struct dccp_feat_entry *entry;
+
+ list_for_each_entry(entry, fn_list, node) {
+ if (entry->feat_num == feat_num && entry->is_local == is_local)
+ return entry;
+ else if (entry->feat_num > feat_num)
+ break;
+ }
+ return NULL;
+}
+
+/**
+ * dccp_feat_entry_new - Central list update routine (called by all others)
+ * @head: list to add to
+ * @feat: feature number
+ * @local: whether the local (1) or remote feature with number @feat is meant
+ *
+ * This is the only constructor and serves to ensure the above invariants.
+ */
+static struct dccp_feat_entry *
+ dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
+{
+ struct dccp_feat_entry *entry;
+
+ list_for_each_entry(entry, head, node)
+ if (entry->feat_num == feat && entry->is_local == local) {
+ dccp_feat_val_destructor(entry->feat_num, &entry->val);
+ return entry;
+ } else if (entry->feat_num > feat) {
+ head = &entry->node;
+ break;
+ }
+
+ entry = kmalloc(sizeof(*entry), gfp_any());
+ if (entry != NULL) {
+ entry->feat_num = feat;
+ entry->is_local = local;
+ list_add_tail(&entry->node, head);
+ }
+ return entry;
+}
+
+/**
+ * dccp_feat_push_change - Add/overwrite a Change option in the list
+ * @fn_list: feature-negotiation list to update
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @mandatory: whether to use Mandatory feature negotiation options
+ * @fval: pointer to NN/SP value to be inserted (will be copied)
+ */
+static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
+ u8 mandatory, dccp_feat_val *fval)
+{
+ struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+ if (new == NULL)
+ return -ENOMEM;
+
+ new->feat_num = feat;
+ new->is_local = local;
+ new->state = FEAT_INITIALISING;
+ new->needs_confirm = false;
+ new->empty_confirm = false;
+ new->val = *fval;
+ new->needs_mandatory = mandatory;
+
+ return 0;
+}
+
+/**
+ * dccp_feat_push_confirm - Add a Confirm entry to the FN list
+ * @fn_list: feature-negotiation list to add to
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is being confirmed
+ * @fval: pointer to NN/SP value to be inserted or NULL
+ *
+ * Returns 0 on success, a Reset code for further processing otherwise.
+ */
+static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
+ dccp_feat_val *fval)
+{
+ struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+ if (new == NULL)
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ new->feat_num = feat;
+ new->is_local = local;
+ new->state = FEAT_STABLE; /* transition in 6.6.2 */
+ new->needs_confirm = true;
+ new->empty_confirm = (fval == NULL);
+ new->val.nn = 0; /* zeroes the whole structure */
+ if (!new->empty_confirm)
+ new->val = *fval;
+ new->needs_mandatory = false;
+
+ return 0;
+}
+
+static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
+{
+ return dccp_feat_push_confirm(fn_list, feat, local, NULL);
+}
+
+static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
+{
+ list_del(&entry->node);
+ dccp_feat_entry_destructor(entry);
+}
+
+void dccp_feat_list_purge(struct list_head *fn_list)
+{
+ struct dccp_feat_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, fn_list, node)
+ dccp_feat_entry_destructor(entry);
+ INIT_LIST_HEAD(fn_list);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
+
+/* generate @to as full clone of @from - @to must not contain any nodes */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+ struct dccp_feat_entry *entry, *new;
+
+ INIT_LIST_HEAD(to);
+ list_for_each_entry(entry, from, node) {
+ new = dccp_feat_clone_entry(entry);
+ if (new == NULL)
+ goto cloning_failed;
+ list_add_tail(&new->node, to);
+ }
+ return 0;
+
+cloning_failed:
+ dccp_feat_list_purge(to);
+ return -ENOMEM;
+}
+
+/**
+ * dccp_feat_valid_nn_length - Enforce length constraints on NN options
+ * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
+ * incoming options are accepted as long as their values are valid.
+ */
+static u8 dccp_feat_valid_nn_length(u8 feat_num)
+{
+ if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
+ return 2;
+ if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
+ return 6;
+ return 0;
+}
+
+static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
+{
+ switch (feat_num) {
+ case DCCPF_ACK_RATIO:
+ return val <= DCCPF_ACK_RATIO_MAX;
+ case DCCPF_SEQUENCE_WINDOW:
+ return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
+ }
+ return 0; /* feature unknown - so we can't tell */
+}
+
+/* check that SP values are within the ranges defined in RFC 4340 */
+static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
+{
+ switch (feat_num) {
+ case DCCPF_CCID:
+ return val == DCCPC_CCID2 || val == DCCPC_CCID3;
+ /* Type-check Boolean feature values: */
+ case DCCPF_SHORT_SEQNOS:
+ case DCCPF_ECN_INCAPABLE:
+ case DCCPF_SEND_ACK_VECTOR:
+ case DCCPF_SEND_NDP_COUNT:
+ case DCCPF_DATA_CHECKSUM:
+ case DCCPF_SEND_LEV_RATE:
+ return val < 2;
+ case DCCPF_MIN_CSUM_COVER:
+ return val < 16;
+ }
+ return 0; /* feature unknown */
+}
+
+static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
+{
+ if (sp_list == NULL || sp_len < 1)
+ return 0;
+ while (sp_len--)
+ if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
+ return 0;
+ return 1;
+}
+
+/**
+ * dccp_feat_insert_opts - Generate FN options from current list state
+ * @skb: next sk_buff to be sent to the peer
+ * @dp: for client during handshake and general negotiation
+ * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
+ */
+int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ struct dccp_feat_entry *pos, *next;
+ u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
+ bool rpt;
+
+ /* put entries into @skb in the order they appear in the list */
+ list_for_each_entry_safe_reverse(pos, next, fn, node) {
+ opt = dccp_feat_genopt(pos);
+ type = dccp_feat_type(pos->feat_num);
+ rpt = false;
+
+ if (pos->empty_confirm) {
+ len = 0;
+ ptr = NULL;
+ } else {
+ if (type == FEAT_SP) {
+ len = pos->val.sp.len;
+ ptr = pos->val.sp.vec;
+ rpt = pos->needs_confirm;
+ } else if (type == FEAT_NN) {
+ len = dccp_feat_valid_nn_length(pos->feat_num);
+ ptr = nn_in_nbo;
+ dccp_encode_value_var(pos->val.nn, ptr, len);
+ } else {
+ DCCP_BUG("unknown feature %u", pos->feat_num);
+ return -1;
+ }
+ }
+ dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
+
+ if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
+ return -1;
+ if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
+ return -1;
+
+ if (skb->sk->sk_state == DCCP_OPEN &&
+ (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
+ /*
+ * Confirms don't get retransmitted (6.6.3) once the
+ * connection is in state OPEN
+ */
+ dccp_feat_list_pop(pos);
+ } else {
+ /*
+ * Enter CHANGING after transmitting the Change
+ * option (6.6.2).
+ */
+ if (pos->state == FEAT_INITIALISING)
+ pos->state = FEAT_CHANGING;
+ }
+ }
+ return 0;
+}
+
+/**
+ * __feat_register_nn - Register new NN value on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an NN feature from %dccp_feature_numbers
+ * @mandatory: use Mandatory option if 1
+ * @nn_val: value to register (restricted to 4 bytes)
+ *
+ * Note that NN features are local by definition (RFC 4340, 6.3.2).
+ */
+static int __feat_register_nn(struct list_head *fn, u8 feat,
+ u8 mandatory, u64 nn_val)
+{
+ dccp_feat_val fval = { .nn = nn_val };
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ /* Don't bother with default values, they will be activated anyway. */
+ if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
+ return 0;
+
+ return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
+}
+
+/**
+ * __feat_register_sp - Register new SP value/list on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an SP feature from %dccp_feature_numbers
+ * @is_local: whether the local (1) or the remote (0) @feat is meant
+ * @mandatory: use Mandatory option if 1
+ * @sp_val: SP value followed by optional preference list
+ * @sp_len: length of @sp_val in bytes
+ */
+static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
+ u8 mandatory, u8 const *sp_val, u8 sp_len)
+{
+ dccp_feat_val fval;
+
+ if (dccp_feat_type(feat) != FEAT_SP ||
+ !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
+ return -EINVAL;
+
+ /* Avoid negotiating alien CCIDs by only advertising supported ones */
+ if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+ return -EOPNOTSUPP;
+
+ if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
+ return -ENOMEM;
+
+ return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
+}
+
+/**
+ * dccp_feat_register_sp - Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len)
+{ /* any changes must be registered before establishing the connection */
+ if (sk->sk_state != DCCP_CLOSED)
+ return -EISCONN;
+ if (dccp_feat_type(feat) != FEAT_SP)
+ return -EINVAL;
+ return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
+ 0, list, len);
+}
+
+/**
+ * dccp_feat_nn_get - Query current/pending value of NN feature
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ *
+ * For a known NN feature, returns value currently being negotiated, or
+ * current (confirmed) value if no negotiation is going on.
+ */
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
+{
+ if (dccp_feat_type(feat) == FEAT_NN) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *entry;
+
+ entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
+ if (entry != NULL)
+ return entry->val.nn;
+
+ switch (feat) {
+ case DCCPF_ACK_RATIO:
+ return dp->dccps_l_ack_ratio;
+ case DCCPF_SEQUENCE_WINDOW:
+ return dp->dccps_l_seq_win;
+ }
+ }
+ DCCP_BUG("attempt to look up unsupported feature %u", feat);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
+
+/**
+ * dccp_feat_signal_nn_change - Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ *
+ * This function is used to communicate NN updates out-of-band.
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ dccp_feat_val fval = { .nn = nn_val };
+ struct dccp_feat_entry *entry;
+
+ if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+ return 0;
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ if (nn_val == dccp_feat_nn_get(sk, feat))
+ return 0; /* already set or negotiation under way */
+
+ entry = dccp_feat_list_lookup(fn, feat, 1);
+ if (entry != NULL) {
+ dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
+ (unsigned long long)entry->val.nn,
+ (unsigned long long)nn_val);
+ dccp_feat_list_pop(entry);
+ }
+
+ inet_csk_schedule_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, 0, &fval);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
+
+/*
+ * Tracking features whose value depend on the choice of CCID
+ *
+ * This is designed with an extension in mind so that a list walk could be done
+ * before activating any features. However, the existing framework was found to
+ * work satisfactorily up until now, the automatic verification is left open.
+ * When adding new CCIDs, add a corresponding dependency table here.
+ */
+static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
+{
+ static const struct ccid_dependency ccid2_dependencies[2][2] = {
+ /*
+ * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
+ * feature and Send Ack Vector is an RX feature, `is_local'
+ * needs to be reversed.
+ */
+ { /* Dependencies of the receiver-side (remote) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ },
+ { /* Dependencies of the sender-side (local) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ static const struct ccid_dependency ccid3_dependencies[2][5] = {
+ { /*
+ * Dependencies of the receiver-side CCID3
+ */
+ { /* locally disable Ack Vectors */
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* see below why Send Loss Event Rate is on */
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* NDP Count is needed as per RFC 4342, 6.1.1 */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 },
+ },
+ { /*
+ * CCID3 at the TX side: we request that the HC-receiver
+ * will not send Ack Vectors (they will be ignored, so
+ * Mandatory is not set); we enable Send Loss Event Rate
+ * (Mandatory since the implementation does not support
+ * the Loss Intervals option of RFC 4342, 8.6).
+ * The last two options are for peer's information only.
+ */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = false,
+ .val = 0
+ },
+ {
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* this CCID does not support Ack Ratio */
+ .dependent_feat = DCCPF_ACK_RATIO,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* tell receiver we are sending NDP counts */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ switch (ccid) {
+ case DCCPC_CCID2:
+ return ccid2_dependencies[is_local];
+ case DCCPC_CCID3:
+ return ccid3_dependencies[is_local];
+ default:
+ return NULL;
+ }
+}
+
+/**
+ * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
+ * @fn: feature-negotiation list to update
+ * @id: CCID number to track
+ * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ *
+ * This function needs to be called after registering all other features.
+ */
+static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
+{
+ const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
+ int i, rc = (table == NULL);
+
+ for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
+ if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
+ rc = __feat_register_sp(fn, table[i].dependent_feat,
+ table[i].is_local,
+ table[i].is_mandatory,
+ &table[i].val, 1);
+ else
+ rc = __feat_register_nn(fn, table[i].dependent_feat,
+ table[i].is_mandatory,
+ table[i].val);
+ return rc;
+}
+
+/**
+ * dccp_feat_finalise_settings - Finalise settings before starting negotiation
+ * @dp: client or listening socket (settings will be inherited)
+ *
+ * This is called after all registrations (socket initialisation, sysctls, and
+ * sockopt calls), and before sending the first packet containing Change options
+ * (ie. client-Request or server-Response), to ensure internal consistency.
+ */
+int dccp_feat_finalise_settings(struct dccp_sock *dp)
+{
+ struct list_head *fn = &dp->dccps_featneg;
+ struct dccp_feat_entry *entry;
+ int i = 2, ccids[2] = { -1, -1 };
+
+ /*
+ * Propagating CCIDs:
+ * 1) not useful to propagate CCID settings if this host advertises more
+ * than one CCID: the choice of CCID may still change - if this is
+ * the client, or if this is the server and the client sends
+ * singleton CCID values.
+ * 2) since is that propagate_ccid changes the list, we defer changing
+ * the sorted list until after the traversal.
+ */
+ list_for_each_entry(entry, fn, node)
+ if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
+ ccids[entry->is_local] = entry->val.sp.vec[0];
+ while (i--)
+ if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
+ return -1;
+ dccp_feat_print_fnlist(fn);
+ return 0;
+}
+
+/**
+ * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
+ * It is the server which resolves the dependencies once the CCID has been
+ * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
+ */
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
+{
+ struct list_head *fn = &dreq->dreq_featneg;
+ struct dccp_feat_entry *entry;
+ u8 is_local, ccid;
+
+ for (is_local = 0; is_local <= 1; is_local++) {
+ entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
+
+ if (entry != NULL && !entry->empty_confirm)
+ ccid = entry->val.sp.vec[0];
+ else
+ ccid = dccp_feat_default_value(DCCPF_CCID);
+
+ if (dccp_feat_propagate_ccid(fn, ccid, is_local))
+ return -1;
+ }
+ return 0;
+}
+
+/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
+static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+{
+ u8 c, s;
+
+ for (s = 0; s < slen; s++)
+ for (c = 0; c < clen; c++)
+ if (servlist[s] == clilist[c])
+ return servlist[s];
+ return -1;
+}
+
+/**
+ * dccp_feat_prefer - Move preferred entry to the start of array
+ * Reorder the @array_len elements in @array so that @preferred_value comes
+ * first. Returns >0 to indicate that @preferred_value does occur in @array.
+ */
+static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
+{
+ u8 i, does_occur = 0;
+
+ if (array != NULL) {
+ for (i = 0; i < array_len; i++)
+ if (array[i] == preferred_value) {
+ array[i] = array[0];
+ does_occur++;
+ }
+ if (does_occur)
+ array[0] = preferred_value;
+ }
+ return does_occur;
+}
+
+/**
+ * dccp_feat_reconcile - Reconcile SP preference lists
+ * @fv: SP list to reconcile into
+ * @arr: received SP preference list
+ * @len: length of @arr in bytes
+ * @is_server: whether this side is the server (and @fv is the server's list)
+ * @reorder: whether to reorder the list in @fv after reconciling with @arr
+ * When successful, > 0 is returned and the reconciled list is in @fval.
+ * A value of 0 means that negotiation failed (no shared entry).
+ */
+static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
+ bool is_server, bool reorder)
+{
+ int rc;
+
+ if (!fv->sp.vec || !arr) {
+ DCCP_CRIT("NULL feature value or array");
+ return 0;
+ }
+
+ if (is_server)
+ rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
+ else
+ rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
+
+ if (!reorder)
+ return rc;
+ if (rc < 0)
+ return 0;
+
+ /*
+ * Reorder list: used for activating features and in dccp_insert_fn_opt.
+ */
+ return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
+}
+
+/**
+ * dccp_feat_change_recv - Process incoming ChangeL/R options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether the Change was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is the server (1) or the client (0)
+ */
+static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len, const bool server)
+{
+ u8 defval, type = dccp_feat_type(feat);
+ const bool local = (opt == DCCPO_CHANGE_R);
+ struct dccp_feat_entry *entry;
+ dccp_feat_val fval;
+
+ if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
+ goto unknown_feature_or_value;
+
+ dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+ /*
+ * Negotiation of NN features: Change R is invalid, so there is no
+ * simultaneous negotiation; hence we do not look up in the list.
+ */
+ if (type == FEAT_NN) {
+ if (local || len > sizeof(fval.nn))
+ goto unknown_feature_or_value;
+
+ /* 6.3.2: "The feature remote MUST accept any valid value..." */
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto unknown_feature_or_value;
+
+ return dccp_feat_push_confirm(fn, feat, local, &fval);
+ }
+
+ /*
+ * Unidirectional/simultaneous negotiation of SP features (6.3.1)
+ */
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL) {
+ /*
+ * No particular preferences have been registered. We deal with
+ * this situation by assuming that all valid values are equally
+ * acceptable, and apply the following checks:
+ * - if the peer's list is a singleton, we accept a valid value;
+ * - if we are the server, we first try to see if the peer (the
+ * client) advertises the default value. If yes, we use it,
+ * otherwise we accept the preferred value;
+ * - else if we are the client, we use the first list element.
+ */
+ if (dccp_feat_clone_sp_val(&fval, val, 1))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ if (len > 1 && server) {
+ defval = dccp_feat_default_value(feat);
+ if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
+ fval.sp.vec[0] = defval;
+ } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
+ kfree(fval.sp.vec);
+ goto unknown_feature_or_value;
+ }
+
+ /* Treat unsupported CCIDs like invalid values */
+ if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
+ kfree(fval.sp.vec);
+ goto not_valid_or_not_known;
+ }
+
+ return dccp_feat_push_confirm(fn, feat, local, &fval);
+
+ } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
+ return 0;
+ }
+
+ if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
+ entry->empty_confirm = false;
+ } else if (is_mandatory) {
+ return DCCP_RESET_CODE_MANDATORY_ERROR;
+ } else if (entry->state == FEAT_INITIALISING) {
+ /*
+ * Failed simultaneous negotiation (server only): try to `save'
+ * the connection by checking whether entry contains the default
+ * value for @feat. If yes, send an empty Confirm to signal that
+ * the received Change was not understood - which implies using
+ * the default value.
+ * If this also fails, we use Reset as the last resort.
+ */
+ WARN_ON(!server);
+ defval = dccp_feat_default_value(feat);
+ if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
+ return DCCP_RESET_CODE_OPTION_ERROR;
+ entry->empty_confirm = true;
+ }
+ entry->needs_confirm = true;
+ entry->needs_mandatory = false;
+ entry->state = FEAT_STABLE;
+ return 0;
+
+unknown_feature_or_value:
+ if (!is_mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
+
+not_valid_or_not_known:
+ return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_confirm_recv - Process received Confirm options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is server (1) or client (0)
+ */
+static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len, const bool server)
+{
+ u8 *plist, plen, type = dccp_feat_type(feat);
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
+
+ dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+ if (entry == NULL) { /* nothing queued: ignore or handle error */
+ if (is_mandatory && type == FEAT_UNKNOWN)
+ return DCCP_RESET_CODE_MANDATORY_ERROR;
+
+ if (!local && type == FEAT_NN) /* 6.3.2 */
+ goto confirmation_failed;
+ return 0;
+ }
+
+ if (entry->state != FEAT_CHANGING) /* 6.6.2 */
+ return 0;
+
+ if (len == 0) {
+ if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
+ goto confirmation_failed;
+ /*
+ * Empty Confirm during connection setup: this means reverting
+ * to the `old' value, which in this case is the default. Since
+ * we handle default values automatically when no other values
+ * have been set, we revert to the old value by removing this
+ * entry from the list.
+ */
+ dccp_feat_list_pop(entry);
+ return 0;
+ }
+
+ if (type == FEAT_NN) {
+ if (len > sizeof(entry->val.nn))
+ goto confirmation_failed;
+
+ if (entry->val.nn == dccp_decode_value_var(val, len))
+ goto confirmation_succeeded;
+
+ DCCP_WARN("Bogus Confirm for non-existing value\n");
+ goto confirmation_failed;
+ }
+
+ /*
+ * Parsing SP Confirms: the first element of @val is the preferred
+ * SP value which the peer confirms, the remainder depends on @len.
+ * Note that only the confirmed value need to be a valid SP value.
+ */
+ if (!dccp_feat_is_valid_sp_val(feat, *val))
+ goto confirmation_failed;
+
+ if (len == 1) { /* peer didn't supply a preference list */
+ plist = val;
+ plen = len;
+ } else { /* preferred value + preference list */
+ plist = val + 1;
+ plen = len - 1;
+ }
+
+ /* Check whether the peer got the reconciliation right (6.6.8) */
+ if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
+ DCCP_WARN("Confirm selected the wrong value %u\n", *val);
+ return DCCP_RESET_CODE_OPTION_ERROR;
+ }
+ entry->val.sp.vec[0] = *val;
+
+confirmation_succeeded:
+ entry->state = FEAT_STABLE;
+ return 0;
+
+confirmation_failed:
+ DCCP_WARN("Confirmation failed\n");
+ return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_handle_nn_established - Fast-path reception of NN options
+ * @sk: socket of an established DCCP connection
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat: NN number, one of %dccp_feature_numbers
+ * @val: NN value
+ * @len: length of @val in bytes
+ *
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ * - cleanup after receiving the Confirm;
+ * - values are directly activated after successful parsing;
+ * - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
+ */
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry;
+ u8 type = dccp_feat_type(feat);
+ dccp_feat_val fval;
+
+ dccp_feat_print_opt(opt, feat, val, len, mandatory);
+
+ /* Ignore non-mandatory unknown and non-NN features */
+ if (type == FEAT_UNKNOWN) {
+ if (local && !mandatory)
+ return 0;
+ goto fast_path_unknown;
+ } else if (type != FEAT_NN) {
+ return 0;
+ }
+
+ /*
+ * We don't accept empty Confirms, since in fast-path feature
+ * negotiation the values are enabled immediately after sending
+ * the Change option.
+ * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+ */
+ if (len == 0 || len > sizeof(fval.nn))
+ goto fast_path_unknown;
+
+ if (opt == DCCPO_CHANGE_L) {
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto fast_path_unknown;
+
+ if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+ dccp_feat_activate(sk, feat, local, &fval))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ /* set the `Ack Pending' flag to piggyback a Confirm */
+ inet_csk_schedule_ack(sk);
+
+ } else if (opt == DCCPO_CONFIRM_R) {
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL || entry->state != FEAT_CHANGING)
+ return 0;
+
+ fval.nn = dccp_decode_value_var(val, len);
+ /*
+ * Just ignore a value that doesn't match our current value.
+ * If the option changes twice within two RTTs, then at least
+ * one CONFIRM will be received for the old value after a
+ * new CHANGE was sent.
+ */
+ if (fval.nn != entry->val.nn)
+ return 0;
+
+ /* Only activate after receiving the Confirm option (6.6.1). */
+ dccp_feat_activate(sk, feat, local, &fval);
+
+ /* It has been confirmed - so remove the entry */
+ dccp_feat_list_pop(entry);
+
+ } else {
+ DCCP_WARN("Received illegal option %u\n", opt);
+ goto fast_path_failed;
+ }
+ return 0;
+
+fast_path_unknown:
+ if (!mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
+
+fast_path_failed:
+ return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_parse_options - Process Feature-Negotiation Options
+ * @sk: for general use and used by the client during connection setup
+ * @dreq: used by the server during connection setup
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: value contents of @opt
+ * @len: length of @val in bytes
+ *
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+ u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ bool server = false;
+
+ switch (sk->sk_state) {
+ /*
+ * Negotiation during connection setup
+ */
+ case DCCP_LISTEN:
+ server = true; /* fall through */
+ case DCCP_REQUESTING:
+ switch (opt) {
+ case DCCPO_CHANGE_L:
+ case DCCPO_CHANGE_R:
+ return dccp_feat_change_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ case DCCPO_CONFIRM_R:
+ case DCCPO_CONFIRM_L:
+ return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ }
+ break;
+ /*
+ * Support for exchanging NN options on an established connection.
+ */
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+ val, len);
+ }
+ return 0; /* ignore FN options in all other states */
+}
+
+/**
+ * dccp_feat_init - Seed feature negotiation with host-specific defaults
+ * This initialises global defaults, depending on the value of the sysctls.
+ * These can later be overridden by registering changes via setsockopt calls.
+ * The last link in the chain is finalise_settings, to make sure that between
+ * here and the start of actual feature negotiation no inconsistencies enter.
+ *
+ * All features not appearing below use either defaults or are otherwise
+ * later adjusted through dccp_feat_finalise_settings().
+ */
+int dccp_feat_init(struct sock *sk)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ u8 on = 1, off = 0;
+ int rc;
+ struct {
+ u8 *val;
+ u8 len;
+ } tx, rx;
+
+ /* Non-negotiable (NN) features */
+ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
+ sysctl_dccp_sequence_window);
+ if (rc)
+ return rc;
+
+ /* Server-priority (SP) features */
+
+ /* Advertise that short seqnos are not supported (7.6.1) */
+ rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
+ if (rc)
+ return rc;
+
+ /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
+ rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+ if (rc)
+ return rc;
+
+ /*
+ * We advertise the available list of CCIDs and reorder according to
+ * preferences, to avoid failure resulting from negotiating different
+ * singleton values (which always leads to failure).
+ * These settings can still (later) be overridden via sockopts.
+ */
+ if (ccid_get_builtin_ccids(&tx.val, &tx.len) ||
+ ccid_get_builtin_ccids(&rx.val, &rx.len))
+ return -ENOBUFS;
+
+ if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
+ !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
+ goto free_ccid_lists;
+
+ rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
+ if (rc)
+ goto free_ccid_lists;
+
+ rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
+
+free_ccid_lists:
+ kfree(tx.val);
+ kfree(rx.val);
+ return rc;
+}
+
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *cur, *next;
+ int idx;
+ dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
+ [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
+ };
+
+ list_for_each_entry(cur, fn_list, node) {
+ /*
+ * An empty Confirm means that either an unknown feature type
+ * or an invalid value was present. In the first case there is
+ * nothing to activate, in the other the default value is used.
+ */
+ if (cur->empty_confirm)
+ continue;
+
+ idx = dccp_feat_index(cur->feat_num);
+ if (idx < 0) {
+ DCCP_BUG("Unknown feature %u", cur->feat_num);
+ goto activation_failed;
+ }
+ if (cur->state != FEAT_STABLE) {
+ DCCP_CRIT("Negotiation of %s %s failed in state %s",
+ cur->is_local ? "local" : "remote",
+ dccp_feat_fname(cur->feat_num),
+ dccp_feat_sname[cur->state]);
+ goto activation_failed;
+ }
+ fvals[idx][cur->is_local] = &cur->val;
+ }
+
+ /*
+ * Activate in decreasing order of index, so that the CCIDs are always
+ * activated as the last feature. This avoids the case where a CCID
+ * relies on the initialisation of one or more features that it depends
+ * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
+ */
+ for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
+ if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
+ __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
+ DCCP_CRIT("Could not activate %d", idx);
+ goto activation_failed;
+ }
+
+ /* Clean up Change options which have been confirmed already */
+ list_for_each_entry_safe(cur, next, fn_list, node)
+ if (!cur->needs_confirm)
+ dccp_feat_list_pop(cur);
+
+ dccp_pr_debug("Activation OK\n");
+ return 0;
+
+activation_failed:
+ /*
+ * We clean up everything that may have been allocated, since
+ * it is difficult to track at which stage negotiation failed.
+ * This is ok, since all allocation functions below are robust
+ * against NULL arguments.
+ */
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ return -1;
+}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
new file mode 100644
index 000000000..0e75cebb2
--- /dev/null
+++ b/net/dccp/feat.h
@@ -0,0 +1,137 @@
+#ifndef _DCCP_FEAT_H
+#define _DCCP_FEAT_H
+/*
+ * net/dccp/feat.h
+ *
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include "dccp.h"
+
+/*
+ * Known limit values
+ */
+/* Ack Ratio takes 2-byte integer values (11.3) */
+#define DCCPF_ACK_RATIO_MAX 0xFFFF
+/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
+#define DCCPF_SEQ_WMIN 32
+#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
+
+enum dccp_feat_type {
+ FEAT_AT_RX = 1, /* located at RX side of half-connection */
+ FEAT_AT_TX = 2, /* located at TX side of half-connection */
+ FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
+ FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
+ FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
+};
+
+enum dccp_feat_state {
+ FEAT_DEFAULT = 0, /* using default values from 6.4 */
+ FEAT_INITIALISING, /* feature is being initialised */
+ FEAT_CHANGING, /* Change sent but not confirmed yet */
+ FEAT_UNSTABLE, /* local modification in state CHANGING */
+ FEAT_STABLE /* both ends (think they) agree */
+};
+
+/**
+ * dccp_feat_val - Container for SP or NN feature values
+ * @nn: single NN value
+ * @sp.vec: single SP value plus optional preference list
+ * @sp.len: length of @sp.vec in bytes
+ */
+typedef union {
+ u64 nn;
+ struct {
+ u8 *vec;
+ u8 len;
+ } sp;
+} dccp_feat_val;
+
+/**
+ * struct feat_entry - Data structure to perform feature negotiation
+ * @val: feature's current value (SP features may have preference list)
+ * @state: feature's current state
+ * @feat_num: one of %dccp_feature_numbers
+ * @needs_mandatory: whether Mandatory options should be sent
+ * @needs_confirm: whether to send a Confirm instead of a Change
+ * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
+ * @is_local: feature location (1) or feature-remote (0)
+ * @node: list pointers, entries arranged in FIFO order
+ */
+struct dccp_feat_entry {
+ dccp_feat_val val;
+ enum dccp_feat_state state:8;
+ u8 feat_num;
+
+ bool needs_mandatory,
+ needs_confirm,
+ empty_confirm,
+ is_local;
+
+ struct list_head node;
+};
+
+static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
+{
+ if (entry->needs_confirm)
+ return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
+ return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
+}
+
+/**
+ * struct ccid_dependency - Track changes resulting from choosing a CCID
+ * @dependent_feat: one of %dccp_feature_numbers
+ * @is_local: local (1) or remote (0) @dependent_feat
+ * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
+ * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
+ */
+struct ccid_dependency {
+ u8 dependent_feat;
+ bool is_local:1,
+ is_mandatory:1;
+ u8 val;
+};
+
+/*
+ * Sysctls to seed defaults for feature negotiation
+ */
+extern unsigned long sysctl_dccp_sequence_window;
+extern int sysctl_dccp_rx_ccid;
+extern int sysctl_dccp_tx_ccid;
+
+int dccp_feat_init(struct sock *sk);
+void dccp_feat_initialise_sysctls(void);
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len);
+int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+ u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
+int dccp_feat_clone_list(struct list_head const *, struct list_head *);
+
+/*
+ * Encoding variable-length options and their maximum length.
+ *
+ * This affects NN options (SP options are all u8) and other variable-length
+ * options (see table 3 in RFC 4340). The limit is currently given the Sequence
+ * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
+ * options consume less than 6 bytes (timestamps are 4 bytes).
+ * When updating this constant (e.g. due to new internet drafts / RFCs), make
+ * sure that you also update all code which refers to it.
+ */
+#define DCCP_OPTVAL_MAXLEN 6
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
+
+int dccp_insert_option_mandatory(struct sk_buff *skb);
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
+ bool repeat_first);
+#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 000000000..3bd14e885
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,732 @@
+/*
+ * net/dccp/input.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
+int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
+
+static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ sk->sk_data_ready(sk);
+}
+
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+{
+ /*
+ * On receiving Close/CloseReq, both RD/WR shutdown are performed.
+ * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
+ * receiving the closing segment, but there is no guarantee that such
+ * data will be processed at all.
+ */
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sock_set_flag(sk, SOCK_DONE);
+ dccp_enqueue_skb(sk, skb);
+}
+
+static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+{
+ int queued = 0;
+
+ switch (sk->sk_state) {
+ /*
+ * We ignore Close when received in one of the following states:
+ * - CLOSED (may be a late or duplicate packet)
+ * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
+ * - RESPOND (already handled by dccp_check_req)
+ */
+ case DCCP_CLOSING:
+ /*
+ * Simultaneous-close: receiving a Close after sending one. This
+ * can happen if both client and server perform active-close and
+ * will result in an endless ping-pong of crossing and retrans-
+ * mitted Close packets, which only terminates when one of the
+ * nodes times out (min. 64 seconds). Quicker convergence can be
+ * achieved when one of the nodes acts as tie-breaker.
+ * This is ok as both ends are done with data transfer and each
+ * end is just waiting for the other to acknowledge termination.
+ */
+ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
+ break;
+ /* fall through */
+ case DCCP_REQUESTING:
+ case DCCP_ACTIVE_CLOSEREQ:
+ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+ dccp_done(sk);
+ break;
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ /* Give waiting application a chance to read pending data */
+ queued = 1;
+ dccp_fin(sk, skb);
+ dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
+ /* fall through */
+ case DCCP_PASSIVE_CLOSE:
+ /*
+ * Retransmitted Close: we have already enqueued the first one.
+ */
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ }
+ return queued;
+}
+
+static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+ int queued = 0;
+
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == CloseReq)
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+ return queued;
+ }
+
+ /* Step 13: process relevant Client states < CLOSEREQ */
+ switch (sk->sk_state) {
+ case DCCP_REQUESTING:
+ dccp_send_close(sk, 0);
+ dccp_set_state(sk, DCCP_CLOSING);
+ break;
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ /* Give waiting application a chance to read pending data */
+ queued = 1;
+ dccp_fin(sk, skb);
+ dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
+ /* fall through */
+ case DCCP_PASSIVE_CLOSEREQ:
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ }
+ return queued;
+}
+
+static u16 dccp_reset_code_convert(const u8 code)
+{
+ const u16 error_code[] = {
+ [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
+ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
+ [DCCP_RESET_CODE_ABORTED] = ECONNRESET,
+
+ [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED,
+ [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
+ [DCCP_RESET_CODE_TOO_BUSY] = EUSERS,
+ [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
+
+ [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG,
+ [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR,
+ [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC,
+ [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ,
+ [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP,
+ };
+
+ return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
+}
+
+static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
+{
+ u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
+
+ sk->sk_err = err;
+
+ /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
+ dccp_fin(sk, skb);
+
+ if (err && !sock_flag(sk, SOCK_DEAD))
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+}
+
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
+
+ if (av == NULL)
+ return;
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvec_input(av, skb);
+}
+
+static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ /* Don't deliver to RX CCID when node has shut down read end. */
+ if (!(sk->sk_shutdown & RCV_SHUTDOWN))
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ /*
+ * Until the TX queue has been drained, we can not honour SHUT_WR, since
+ * we need received feedback as input to adjust congestion control.
+ */
+ if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+}
+
+static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
+ ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+
+ /*
+ * Step 5: Prepare sequence numbers for Sync
+ * If P.type == Sync or P.type == SyncAck,
+ * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
+ * / * P is valid, so update sequence number variables
+ * accordingly. After this update, P will pass the tests
+ * in Step 6. A SyncAck is generated if necessary in
+ * Step 15 * /
+ * Update S.GSR, S.SWL, S.SWH
+ * Otherwise,
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_SYNC ||
+ dh->dccph_type == DCCP_PKT_SYNCACK) {
+ if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
+ dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
+ dccp_update_gsr(sk, seqno);
+ else
+ return -1;
+ }
+
+ /*
+ * Step 6: Check sequence numbers
+ * Let LSWL = S.SWL and LAWL = S.AWL
+ * If P.type == CloseReq or P.type == Close or P.type == Reset,
+ * LSWL := S.GSR + 1, LAWL := S.GAR
+ * If LSWL <= P.seqno <= S.SWH
+ * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
+ * Update S.GSR, S.SWL, S.SWH
+ * If P.type != Sync,
+ * Update S.GAR
+ */
+ lswl = dp->dccps_swl;
+ lawl = dp->dccps_awl;
+
+ if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
+ dh->dccph_type == DCCP_PKT_CLOSE ||
+ dh->dccph_type == DCCP_PKT_RESET) {
+ lswl = ADD48(dp->dccps_gsr, 1);
+ lawl = dp->dccps_gar;
+ }
+
+ if (between48(seqno, lswl, dp->dccps_swh) &&
+ (ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
+ between48(ackno, lawl, dp->dccps_awh))) {
+ dccp_update_gsr(sk, seqno);
+
+ if (dh->dccph_type != DCCP_PKT_SYNC &&
+ ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+ after48(ackno, dp->dccps_gar))
+ dp->dccps_gar = ackno;
+ } else {
+ unsigned long now = jiffies;
+ /*
+ * Step 6: Check sequence numbers
+ * Otherwise,
+ * If P.type == Reset,
+ * Send Sync packet acknowledging S.GSR
+ * Otherwise,
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ *
+ * These Syncs are rate-limited as per RFC 4340, 7.5.4:
+ * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second.
+ */
+ if (time_before(now, (dp->dccps_rate_last +
+ sysctl_dccp_sync_ratelimit)))
+ return -1;
+
+ DCCP_WARN("Step 6 failed for %s packet, "
+ "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+ "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+ "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
+ (unsigned long long) lswl, (unsigned long long) seqno,
+ (unsigned long long) dp->dccps_swh,
+ (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
+ : "exists",
+ (unsigned long long) lawl, (unsigned long long) ackno,
+ (unsigned long long) dp->dccps_awh);
+
+ dp->dccps_rate_last = now;
+
+ if (dh->dccph_type == DCCP_PKT_RESET)
+ seqno = dp->dccps_gsr;
+ dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ switch (dccp_hdr(skb)->dccph_type) {
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_DATA:
+ /*
+ * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
+ * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
+ * - sk_receive_queue is full, use Code 2, "Receive Buffer"
+ */
+ dccp_enqueue_skb(sk, skb);
+ return 0;
+ case DCCP_PKT_ACK:
+ goto discard;
+ case DCCP_PKT_RESET:
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ dccp_rcv_reset(sk, skb);
+ return 0;
+ case DCCP_PKT_CLOSEREQ:
+ if (dccp_rcv_closereq(sk, skb))
+ return 0;
+ goto discard;
+ case DCCP_PKT_CLOSE:
+ if (dccp_rcv_close(sk, skb))
+ return 0;
+ goto discard;
+ case DCCP_PKT_REQUEST:
+ /* Step 7
+ * or (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state >= OPEN and P.type == Request
+ * and P.seqno >= S.OSR)
+ * or (S.state >= OPEN and P.type == Response
+ * and P.seqno >= S.OSR)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if (dp->dccps_role != DCCP_ROLE_LISTEN)
+ goto send_sync;
+ goto check_seq;
+ case DCCP_PKT_RESPONSE:
+ if (dp->dccps_role != DCCP_ROLE_CLIENT)
+ goto send_sync;
+check_seq:
+ if (dccp_delta_seqno(dp->dccps_osr,
+ DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
+send_sync:
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNC);
+ }
+ break;
+ case DCCP_PKT_SYNC:
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNCACK);
+ /*
+ * From RFC 4340, sec. 5.7
+ *
+ * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
+ * MAY have non-zero-length application data areas, whose
+ * contents receivers MUST ignore.
+ */
+ goto discard;
+ }
+
+ DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len)
+{
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
+
+ return __dccp_rcv_established(sk, skb, dh, len);
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_established);
+
+static int dccp_rcv_request_sent_state_process(struct sock *sk,
+ struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned int len)
+{
+ /*
+ * Step 4: Prepare sequence numbers in REQUEST
+ * If S.state == REQUEST,
+ * If (P.type == Response or P.type == Reset)
+ * and S.AWL <= P.ackno <= S.AWH,
+ * / * Set sequence number variables corresponding to the
+ * other endpoint, so P will pass the tests in Step 6 * /
+ * Set S.GSR, S.ISR, S.SWL, S.SWH
+ * / * Response processing continues in Step 10; Reset
+ * processing continues in Step 9 * /
+ */
+ if (dh->dccph_type == DCCP_PKT_RESPONSE) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ long tstamp = dccp_timestamp();
+
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dp->dccps_awl, dp->dccps_awh)) {
+ dccp_pr_debug("invalid ackno: S.AWL=%llu, "
+ "P.ackno=%llu, S.AWH=%llu\n",
+ (unsigned long long)dp->dccps_awl,
+ (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long)dp->dccps_awh);
+ goto out_invalid_packet;
+ }
+
+ /*
+ * If option processing (Step 8) failed, return 1 here so that
+ * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
+ * the option type and is set in dccp_parse_options().
+ */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
+ dp->dccps_options_received.dccpor_timestamp_echo));
+
+ /* Stop the REQUEST timer */
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ WARN_ON(sk->sk_send_head == NULL);
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+
+ /*
+ * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+ * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+ * is done as part of activating the feature values below, since
+ * these settings depend on the local/remote Sequence Window
+ * features, which were undefined or not confirmed until now.
+ */
+ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+
+ /*
+ * Step 10: Process REQUEST state (second part)
+ * If S.state == REQUEST,
+ * / * If we get here, P is a valid Response from the
+ * server (see Step 4), and we should move to
+ * PARTOPEN state. PARTOPEN means send an Ack,
+ * don't send Data packets, retransmit Acks
+ * periodically, and always include any Init Cookie
+ * from the Response * /
+ * S.state := PARTOPEN
+ * Set PARTOPEN timer
+ * Continue with S.state == PARTOPEN
+ * / * Step 12 will send the Ack completing the
+ * three-way handshake * /
+ */
+ dccp_set_state(sk, DCCP_PARTOPEN);
+
+ /*
+ * If feature negotiation was successful, activate features now;
+ * an activation failure means that this host could not activate
+ * one ore more features (e.g. insufficient memory), which would
+ * leave at least one feature in an undefined state.
+ */
+ if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
+ goto unable_to_proceed;
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+
+ if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
+ icsk->icsk_accept_queue.rskq_defer_accept) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+ * It may be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever, that I was not able
+ * to stand against the temptation 8) --ANK
+ */
+ /*
+ * OK, in DCCP we can as well do a similar trick, its
+ * even in the draft, but there is no need for us to
+ * schedule an ack here, as dccp_sendmsg does this for
+ * us, also stated in the draft. -acme
+ */
+ __kfree_skb(skb);
+ return 0;
+ }
+ dccp_send_ack(sk);
+ return -1;
+ }
+
+out_invalid_packet:
+ /* dccp_v4_do_rcv will send a reset */
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+ return 1;
+
+unable_to_proceed:
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
+ /*
+ * We mark this socket as no longer usable, so that the loop in
+ * dccp_sendmsg() terminates and the application gets notified.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ sk->sk_err = ECOMM;
+ return 1;
+}
+
+static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
+ struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
+ int queued = 0;
+
+ switch (dh->dccph_type) {
+ case DCCP_PKT_RESET:
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ break;
+ case DCCP_PKT_DATA:
+ if (sk->sk_state == DCCP_RESPOND)
+ break;
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_ACK:
+ /*
+ * FIXME: we should be resetting the PARTOPEN (DELACK) timer
+ * here but only if we haven't used the DELACK timer for
+ * something else, like sending a delayed ack for a TIMESTAMP
+ * echo, etc, for now were not clearing it, sending an extra
+ * ACK when there is nothing else to do in DELACK is not a big
+ * deal after all.
+ */
+
+ /* Stop the PARTOPEN timer */
+ if (sk->sk_state == DCCP_PARTOPEN)
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(sample)) {
+ long delta = dccp_timestamp() - sample;
+
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+ }
+
+ dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dccp_set_state(sk, DCCP_OPEN);
+
+ if (dh->dccph_type == DCCP_PKT_DATAACK ||
+ dh->dccph_type == DCCP_PKT_DATA) {
+ __dccp_rcv_established(sk, skb, dh, len);
+ queued = 1; /* packet was queued
+ (by __dccp_rcv_established) */
+ }
+ break;
+ }
+
+ return queued;
+}
+
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const int old_state = sk->sk_state;
+ int queued = 0;
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
+ * Cookies Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ if (dh->dccph_type == DCCP_PKT_REQUEST) {
+ if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+ skb) < 0)
+ return 1;
+ goto discard;
+ }
+ if (dh->dccph_type == DCCP_PKT_RESET)
+ goto discard;
+
+ /* Caller (dccp_v4_do_rcv) will send Reset */
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
+ } else if (sk->sk_state == DCCP_CLOSED) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
+ }
+
+ /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
+ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
+ goto discard;
+
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_RESPONSE) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+ goto discard;
+ }
+
+ /* Step 8: Process options */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_RESET) {
+ dccp_rcv_reset(sk, skb);
+ return 0;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
+ if (dccp_rcv_closereq(sk, skb))
+ return 0;
+ goto discard;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
+ if (dccp_rcv_close(sk, skb))
+ return 0;
+ goto discard;
+ }
+
+ switch (sk->sk_state) {
+ case DCCP_REQUESTING:
+ queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
+ if (queued >= 0)
+ return queued;
+
+ __kfree_skb(skb);
+ return 0;
+
+ case DCCP_PARTOPEN:
+ /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
+ /* fall through */
+ case DCCP_RESPOND:
+ queued = dccp_rcv_respond_partopen_state_process(sk, skb,
+ dh, len);
+ break;
+ }
+
+ if (dh->dccph_type == DCCP_PKT_ACK ||
+ dh->dccph_type == DCCP_PKT_DATAACK) {
+ switch (old_state) {
+ case DCCP_PARTOPEN:
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ break;
+ }
+ } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
+ goto discard;
+ }
+
+ if (!queued) {
+discard:
+ __kfree_skb(skb);
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
+
+/**
+ * dccp_sample_rtt - Validate and finalise computation of RTT sample
+ * @delta: number of microseconds between packet and acknowledgment
+ *
+ * The routine is kept generic to work in different contexts. It should be
+ * called immediately when the ACK used for the RTT sample arrives.
+ */
+u32 dccp_sample_rtt(struct sock *sk, long delta)
+{
+ /* dccpor_elapsed_time is either zeroed out or set and > 0 */
+ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
+
+ if (unlikely(delta <= 0)) {
+ DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
+ return DCCP_SANE_RTT_MIN;
+ }
+ if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
+ DCCP_WARN("RTT sample %ld too large, using max\n", delta);
+ return DCCP_SANE_RTT_MAX;
+ }
+
+ return delta;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 000000000..ccf4c5629
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1091 @@
+/*
+ * net/dccp/ipv4.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+#include <net/timewait_sock.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+/*
+ * The per-net dccp.v4_ctl_sk socket is used for responding to
+ * the Out-of-the-blue (OOTB) packets. A control sock will be created
+ * for this socket at the initialization time.
+ */
+
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ __be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ int err;
+ struct ip_options_rcu *inet_opt;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt != NULL && inet_opt->opt.srr) {
+ if (daddr == 0)
+ return -EINVAL;
+ nexthop = inet_opt->opt.faddr;
+ }
+
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (inet_opt == NULL || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
+
+ if (inet->inet_saddr == 0)
+ inet->inet_saddr = fl4->saddr;
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
+ inet->inet_dport = usin->sin_port;
+ sk_daddr_set(sk, daddr);
+
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ /*
+ * Socket identity is still unknown (sport may be zero).
+ * However we set state to DCCP_REQUESTING and not releasing socket
+ * lock select source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = inet_hash_connect(&dccp_death_row, sk);
+ if (err != 0)
+ goto failure;
+
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto failure;
+ }
+ /* OK, now commit destination to socket. */
+ sk_setup_caps(sk, &rt->dst);
+
+ dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ inet->inet_dport);
+ inet->inet_id = dp->dccps_iss ^ jiffies;
+
+ err = dccp_connect(sk);
+ rt = NULL;
+ if (err != 0)
+ goto failure;
+out:
+ return err;
+failure:
+ /*
+ * This unhashes the socket and releases the local port, if necessary.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->inet_dport = 0;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(dccp_v4_connect);
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void dccp_do_pmtu_discovery(struct sock *sk,
+ const struct iphdr *iph,
+ u32 mtu)
+{
+ struct dst_entry *dst;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
+ * send out by Linux are always < 576bytes so they should go through
+ * unfragmented).
+ */
+ if (sk->sk_state == DCCP_LISTEN)
+ return;
+
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
+ return;
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst);
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+ dccp_sync_mss(sk, mtu);
+
+ /*
+ * From RFC 4340, sec. 14.1:
+ *
+ * DCCP-Sync packets are the best choice for upward
+ * probing, since DCCP-Sync probes do not risk application
+ * data loss.
+ */
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+ } /* else let the usual retransmit timer handle it */
+}
+
+static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
+void dccp_req_err(struct sock *sk, u64 seq)
+ {
+ struct request_sock *req = inet_reqsk(sk);
+ struct net *net = sock_net(sk);
+
+ /*
+ * ICMPs are not backlogged, hence we cannot get an established
+ * socket here.
+ */
+ WARN_ON(req->sk);
+
+ if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ reqsk_put(req);
+ } else {
+ /*
+ * Still in RESPOND, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ }
+}
+EXPORT_SYMBOL(dccp_req_err);
+
+/*
+ * This routine is called by the ICMP module when it gets some sort of error
+ * condition. If err < 0 then the socket should be closed and the error
+ * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
+ * After adjustment header points to the first 8 bytes of the tcp header. We
+ * need to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When someone else
+ * accesses the socket the ICMP is just dropped and for some paths there is no
+ * check at all. A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+static void dccp_v4_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (struct iphdr *)skb->data;
+ const u8 offset = iph->ihl << 2;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct dccp_sock *dp;
+ struct inet_sock *inet;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct sock *sk;
+ __u64 seq;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return;
+ }
+
+ sk = __inet_lookup_established(net, &dccp_hashinfo,
+ iph->daddr, dh->dccph_dport,
+ iph->saddr, ntohs(dh->dccph_sport),
+ inet_iif(skb));
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+ seq = dccp_hdr_seq(dh);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV)
+ return dccp_req_err(sk, seq);
+
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ dp = dccp_sk(sk);
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ switch (type) {
+ case ICMP_REDIRECT:
+ dccp_do_redirect(skb, sk);
+ goto out;
+ case ICMP_SOURCE_QUENCH:
+ /* Just silently ignore these. */
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ if (!sock_owned_by_user(sk))
+ dccp_do_pmtu_discovery(sk, iph, info);
+ goto out;
+ }
+
+ err = icmp_err_convert[code].errno;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (sk->sk_state) {
+ case DCCP_REQUESTING:
+ case DCCP_RESPOND:
+ if (!sock_owned_by_user(sk)) {
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+
+ dccp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+ * rfc1122 4.2.3.9 allows to consider as hard errors
+ * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+ * but it is obsoleted by pmtu discovery).
+ *
+ * Note, that in modern internet, where routing is unreliable
+ * and in each dark corner broken firewalls sit, sending random
+ * errors ordered by their masters even this two messages finally lose
+ * their original sense (even Linux sends invalid PORT_UNREACHs)
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else /* Only an error on timeout */
+ sk->sk_err_soft = err;
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
+ __be32 src, __be32 dst)
+{
+ return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
+}
+
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dccp_csum_outgoing(skb);
+ dh->dccph_checksum = dccp_v4_csum_finish(skb,
+ inet->inet_saddr,
+ inet->inet_daddr);
+}
+EXPORT_SYMBOL_GPL(dccp_v4_send_check);
+
+static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
+{
+ return secure_dccp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ dccp_hdr(skb)->dccph_dport,
+ dccp_hdr(skb)->dccph_sport);
+}
+
+/*
+ * The three way handshake has completed - we got a valid ACK or DATAACK -
+ * now create the new socket.
+ *
+ * This is the equivalent of TCP's tcp_v4_syn_recv_sock
+ */
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
+ struct sock *newsk;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto exit_nonewsk;
+
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+ sk_daddr_set(newsk, ireq->ir_rmt_addr);
+ sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ newinet->inet_opt = ireq->opt;
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
+ newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->inet_id = jiffies;
+
+ if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
+ goto put_and_exit;
+
+ sk_setup_caps(newsk, dst);
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ __inet_hash_nolisten(newsk, NULL);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
+exit:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto exit;
+}
+EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
+
+static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct sock *nsk;
+ /* Find possible connection requests. */
+ struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport,
+ iph->saddr, iph->daddr);
+ if (req) {
+ nsk = dccp_check_req(sk, skb, req);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
+ nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
+ iph->saddr, dh->dccph_sport,
+ iph->daddr, dh->dccph_dport,
+ inet_iif(skb));
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+ }
+
+ return sk;
+}
+
+static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct rtable *rt;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .flowi4_oif = inet_iif(skb),
+ .daddr = iph->saddr,
+ .saddr = iph->daddr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .fl4_sport = dccp_hdr(skb)->dccph_dport,
+ .fl4_dport = dccp_hdr(skb)->dccph_sport,
+ };
+
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+
+ return &rt->dst;
+}
+
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
+{
+ int err = -1;
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+
+ dst = inet_csk_route_req(sk, &fl4, req);
+ if (dst == NULL)
+ goto out;
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
+ ireq->opt);
+ err = net_xmit_eval(err);
+ }
+
+out:
+ dst_release(dst);
+ return err;
+}
+
+static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+{
+ int err;
+ const struct iphdr *rxiph;
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v4_ctl_sk;
+
+ /* Never send a reset in response to a reset. */
+ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
+ return;
+
+ dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
+ if (dst == NULL)
+ return;
+
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
+ if (skb == NULL)
+ goto out;
+
+ rxiph = ip_hdr(rxskb);
+ dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
+ rxiph->daddr);
+ skb_dst_set(skb, dst_clone(dst));
+
+ bh_lock_sock(ctl_sk);
+ err = ip_build_and_send_pkt(skb, ctl_sk,
+ rxiph->daddr, rxiph->saddr, NULL);
+ bh_unlock_sock(ctl_sk);
+
+ if (net_xmit_eval(err) == 0) {
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ }
+out:
+ dst_release(dst);
+}
+
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree(inet_rsk(req)->opt);
+}
+
+void dccp_syn_ack_timeout(const struct request_sock *req)
+{
+}
+EXPORT_SYMBOL(dccp_syn_ack_timeout);
+
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct dccp_request_sock),
+ .rtx_syn_ack = dccp_v4_send_response,
+ .send_ack = dccp_reqsk_send_ack,
+ .destructor = dccp_v4_reqsk_destructor,
+ .send_reset = dccp_v4_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
+};
+
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+ /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return 0; /* discard, don't send a reset here */
+
+ if (dccp_bad_service_code(sk, service)) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ /*
+ * Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = inet_reqsk_alloc(&dccp_request_sock_ops, sk);
+ if (req == NULL)
+ goto drop;
+
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
+
+ dreq = dccp_rsk(req);
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop_and_free;
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ ireq = inet_rsk(req);
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->ireq_family = AF_INET;
+ ireq->ir_iif = sk->sk_bound_dev_if;
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
+ */
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
+ dreq->dreq_iss = dccp_v4_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
+ dreq->dreq_service = service;
+
+ if (dccp_v4_send_response(sk, req))
+ goto drop_and_free;
+
+ inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ return -1;
+}
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+ }
+
+ /*
+ * Step 3: Process LISTEN state
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v4_hnd_req(sk, skb);
+
+ if (nsk == NULL)
+ goto discard;
+
+ if (nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ dccp_v4_ctl_send_reset(sk, skb);
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+
+/**
+ * dccp_invalid_packet - check for malformed packets
+ * Implements RFC 4340, 8.5: Step 1: Check header basics
+ * Packets that fail these checks are ignored and do not receive Resets.
+ */
+int dccp_invalid_packet(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+ unsigned int cscov;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return 1;
+
+ /* If the packet is shorter than 12 bytes, drop packet and return */
+ if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
+ DCCP_WARN("pskb_may_pull failed\n");
+ return 1;
+ }
+
+ dh = dccp_hdr(skb);
+
+ /* If P.type is not understood, drop packet and return */
+ if (dh->dccph_type >= DCCP_PKT_INVALID) {
+ DCCP_WARN("invalid packet type\n");
+ return 1;
+ }
+
+ /*
+ * If P.Data Offset is too small for packet type, drop packet and return
+ */
+ if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+ DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff);
+ return 1;
+ }
+ /*
+ * If P.Data Offset is too too large for packet, drop packet and return
+ */
+ if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
+ DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff);
+ return 1;
+ }
+
+ /*
+ * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
+ * has short sequence numbers), drop packet and return
+ */
+ if ((dh->dccph_type < DCCP_PKT_DATA ||
+ dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
+ DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
+ dccp_packet_name(dh->dccph_type));
+ return 1;
+ }
+
+ /*
+ * If P.CsCov is too large for the packet size, drop packet and return.
+ * This must come _before_ checksumming (not as RFC 4340 suggests).
+ */
+ cscov = dccp_csum_coverage(skb);
+ if (cscov > skb->len) {
+ DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
+ dh->dccph_cscov, skb->len);
+ return 1;
+ }
+
+ /* If header checksum is incorrect, drop packet and return.
+ * (This step is completed in the AF-dependent functions.) */
+ skb->csum = skb_checksum(skb, 0, cscov, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_invalid_packet);
+
+/* this is called when real data arrives */
+static int dccp_v4_rcv(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+ const struct iphdr *iph;
+ struct sock *sk;
+ int min_cov;
+
+ /* Step 1: Check header basics */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ iph = ip_hdr(skb);
+ /* Step 1: If header checksum is incorrect, drop packet and return */
+ if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
+ DCCP_WARN("dropped packet with invalid checksum\n");
+ goto discard_it;
+ }
+
+ dh = dccp_hdr(skb);
+
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
+ dccp_packet_name(dh->dccph_type),
+ &iph->saddr, ntohs(dh->dccph_sport),
+ &iph->daddr, ntohs(dh->dccph_dport),
+ (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+ if (dccp_packet_without_ack(skb)) {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ dccp_pr_debug_cat("\n");
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+ dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ }
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport);
+ /*
+ * Step 2:
+ * If no socket ...
+ */
+ if (sk == NULL) {
+ dccp_pr_debug("failed to look up flow ID in table and "
+ "get corresponding socket\n");
+ goto no_dccp_socket;
+ }
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+ inet_twsk_put(inet_twsk(sk));
+ goto no_dccp_socket;
+ }
+
+ /*
+ * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+ * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+ * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+ */
+ min_cov = dccp_sk(sk)->dccps_pcrlen;
+ if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
+ dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+ dh->dccph_cscov, min_cov);
+ /* FIXME: "Such packets SHOULD be reported using Data Dropped
+ * options (Section 11.7) with Drop Code 0, Protocol
+ * Constraints." */
+ goto discard_and_relse;
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+ nf_reset(skb);
+
+ return sk_receive_skb(sk, skb, 1);
+
+no_dccp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v4_ctl_send_reset(sk, skb);
+ }
+
+discard_it:
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+}
+
+static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v4_conn_request,
+ .syn_recv_sock = dccp_v4_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+ .bind_conflict = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+static int dccp_v4_init_sock(struct sock *sk)
+{
+ static __u8 dccp_v4_ctl_sock_initialized;
+ int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
+
+ if (err == 0) {
+ if (unlikely(!dccp_v4_ctl_sock_initialized))
+ dccp_v4_ctl_sock_initialized = 1;
+ inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
+ }
+
+ return err;
+}
+
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct inet_timewait_sock),
+};
+
+static struct proto dccp_v4_prot = {
+ .name = "DCCP",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v4_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v4_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v4_do_rcv,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .accept = inet_csk_accept,
+ .get_port = inet_csk_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .rsk_prot = &dccp_request_sock_ops,
+ .twsk_prot = &dccp_timewait_sock_ops,
+ .h.hashinfo = &dccp_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_dccp_setsockopt,
+ .compat_getsockopt = compat_dccp_getsockopt,
+#endif
+};
+
+static const struct net_protocol dccp_v4_protocol = {
+ .handler = dccp_v4_rcv,
+ .err_handler = dccp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
+};
+
+static const struct proto_ops inet_dccp_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+ .poll = dccp_poll,
+ .ioctl = inet_ioctl,
+ /* FIXME: work on inet_listen to rename it to sock_common_listen */
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw dccp_v4_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v4_prot,
+ .ops = &inet_dccp_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int __net_init dccp_v4_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v4_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
+}
+
+static struct pernet_operations dccp_v4_ops = {
+ .init = dccp_v4_init_net,
+ .exit = dccp_v4_exit_net,
+};
+
+static int __init dccp_v4_init(void)
+{
+ int err = proto_register(&dccp_v4_prot, 1);
+
+ if (err != 0)
+ goto out;
+
+ err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+ if (err != 0)
+ goto out_proto_unregister;
+
+ inet_register_protosw(&dccp_v4_protosw);
+
+ err = register_pernet_subsys(&dccp_v4_ops);
+ if (err)
+ goto out_destroy_ctl_sock;
+out:
+ return err;
+out_destroy_ctl_sock:
+ inet_unregister_protosw(&dccp_v4_protosw);
+ inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+out_proto_unregister:
+ proto_unregister(&dccp_v4_prot);
+ goto out;
+}
+
+static void __exit dccp_v4_exit(void)
+{
+ unregister_pernet_subsys(&dccp_v4_ops);
+ inet_unregister_protosw(&dccp_v4_protosw);
+ inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+ proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_v4_init);
+module_exit(dccp_v4_exit);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 000000000..5165571f3
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1147 @@
+/*
+ * DCCP over IPv6
+ * Linux INET6 implementation
+ *
+ * Based on net/dccp6/ipv6.c
+ *
+ * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/xfrm.h>
+
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+
+#include "dccp.h"
+#include "ipv6.h"
+#include "feat.h"
+
+/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
+
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
+
+/* add pseudo-header to DCCP checksum stored in skb->csum */
+static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
+}
+
+static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dccp_csum_outgoing(skb);
+ dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
+}
+
+static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
+{
+ return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ dccp_hdr(skb)->dccph_dport,
+ dccp_hdr(skb)->dccph_sport );
+
+}
+
+static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct dccp_sock *dp;
+ struct ipv6_pinfo *np;
+ struct sock *sk;
+ int err;
+ __u64 seq;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ sk = __inet6_lookup_established(net, &dccp_hashinfo,
+ &hdr->daddr, dh->dccph_dport,
+ &hdr->saddr, ntohs(dh->dccph_sport),
+ inet6_iif(skb));
+
+ if (!sk) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+ seq = dccp_hdr_seq(dh);
+ if (sk->sk_state == DCCP_NEW_SYN_RECV)
+ return dccp_req_err(sk, seq);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ dp = dccp_sk(sk);
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ np = inet6_sk(sk);
+
+ if (type == NDISC_REDIRECT) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ goto out;
+ }
+
+ if (type == ICMPV6_PKT_TOOBIG) {
+ struct dst_entry *dst = NULL;
+
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+
+ if (sock_owned_by_user(sk))
+ goto out;
+ if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
+ goto out;
+
+ dst = inet6_csk_update_pmtu(sk, ntohl(info));
+ if (!dst)
+ goto out;
+
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
+ dccp_sync_mss(sk, dst_mtu(dst));
+ goto out;
+ }
+
+ icmpv6_err_convert(type, code, &err);
+
+ /* Might be for an request_sock */
+ switch (sk->sk_state) {
+ case DCCP_REQUESTING:
+ case DCCP_RESPOND: /* Cannot happen.
+ It can, it SYNs are crossed. --ANK */
+ if (!sock_owned_by_user(sk)) {
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+ /*
+ * Wake people up to see the error
+ * (see connect in sock.c)
+ */
+ sk->sk_error_report(sk);
+ dccp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ if (!sock_owned_by_user(sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else
+ sk->sk_err_soft = err;
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+ int err = -1;
+ struct dst_entry *dst;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = ireq->ir_iif;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto done;
+ }
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = dccp_v6_csum_finish(skb,
+ &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ err = net_xmit_eval(err);
+ }
+
+done:
+ dst_release(dst);
+ return err;
+}
+
+static void dccp_v6_reqsk_destructor(struct request_sock *req)
+{
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree_skb(inet_rsk(req)->pktopts);
+}
+
+static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+{
+ const struct ipv6hdr *rxip6h;
+ struct sk_buff *skb;
+ struct flowi6 fl6;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v6_ctl_sk;
+ struct dst_entry *dst;
+
+ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (!ipv6_unicast_destination(rxskb))
+ return;
+
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
+ if (skb == NULL)
+ return;
+
+ rxip6h = ipv6_hdr(rxskb);
+ dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
+ &rxip6h->daddr);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = rxip6h->saddr;
+ fl6.saddr = rxip6h->daddr;
+
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.flowi6_oif = inet6_iif(rxskb);
+ fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+ fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+ security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
+
+ /* sk = NULL, but it is safe for now. RST socket required. */
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(skb, dst);
+ ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ return;
+ }
+
+ kfree_skb(skb);
+}
+
+static struct request_sock_ops dccp6_request_sock_ops = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct dccp6_request_sock),
+ .rtx_syn_ack = dccp_v6_send_response,
+ .send_ack = dccp_reqsk_send_ack,
+ .destructor = dccp_v6_reqsk_destructor,
+ .send_reset = dccp_v6_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
+};
+
+static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct request_sock *req;
+ struct sock *nsk;
+
+ req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
+ &iph->daddr, inet6_iif(skb));
+ if (req) {
+ nsk = dccp_check_req(sk, skb, req);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
+ nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
+ &iph->saddr, dh->dccph_sport,
+ &iph->daddr, ntohs(dh->dccph_dport),
+ inet6_iif(skb));
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+ }
+
+ return sk;
+}
+
+static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ struct inet_request_sock *ireq;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ return 0; /* discard, don't send a reset here */
+
+ if (dccp_bad_service_code(sk, service)) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * There are no SYN attacks on IPv6, yet...
+ */
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk);
+ if (req == NULL)
+ goto drop;
+
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
+
+ dreq = dccp_rsk(req);
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop_and_free;
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ ireq = inet_rsk(req);
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+ ireq->ireq_family = AF_INET6;
+
+ if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ ireq->pktopts = skb;
+ }
+ ireq->ir_iif = sk->sk_bound_dev_if;
+
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = inet6_iif(skb);
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
+ */
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
+ dreq->dreq_iss = dccp_v6_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
+ dreq->dreq_service = service;
+
+ if (dccp_v6_send_response(sk, req))
+ goto drop_and_free;
+
+ inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ return -1;
+}
+
+static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct inet_sock *newinet;
+ struct dccp6_sock *newdp6;
+ struct sock *newsk;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ /*
+ * v6 mapped
+ */
+ newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+ if (newsk == NULL)
+ return NULL;
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ newnp->saddr = newsk->sk_v6_rcv_saddr;
+
+ inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
+ newsk->sk_backlog_rcv = dccp_v4_do_rcv;
+ newnp->pktoptions = NULL;
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, dccp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
+ */
+
+ /* It is tricky place. Until this moment IPv4 tcp
+ worked with IPv6 icsk.icsk_af_ops.
+ Sync it now.
+ */
+ dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+ return newsk;
+ }
+
+
+ if (sk_acceptq_is_full(sk))
+ goto out_overflow;
+
+ if (dst == NULL) {
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst))
+ goto out;
+ }
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto out_nonewsk;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, dccp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
+
+ __ip6_dst_store(newsk, dst, NULL, NULL);
+ newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
+ NETIF_F_TSO);
+ newdp6 = (struct dccp6_sock *)newsk;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = &newdp6->inet6;
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newnp->saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
+
+ /* Now IPv6 options...
+
+ First: no IPv4 options.
+ */
+ newinet->inet_opt = NULL;
+
+ /* Clone RX bits */
+ newnp->rxopt.all = np->rxopt.all;
+
+ /* Clone pktoptions received with SYN */
+ newnp->pktoptions = NULL;
+ if (ireq->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ newnp->opt = NULL;
+ newnp->mcast_oif = inet6_iif(skb);
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+
+ /*
+ * Clone native IPv6 options from listening socket (if any)
+ *
+ * Yes, keeping reference count would be much more clever, but we make
+ * one more one thing there: reattach optmem to newsk.
+ */
+ if (np->opt != NULL)
+ newnp->opt = ipv6_dup_options(newsk, np->opt);
+
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ if (newnp->opt != NULL)
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+ newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto out;
+ }
+ __inet_hash(newsk, NULL);
+
+ return newsk;
+
+out_overflow:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+ dst_release(dst);
+out:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return NULL;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *opt_skb = NULL;
+
+ /* Imagine: socket is IPv6. IPv4 packet arrives,
+ goes to IPv4 receive handler and backlogged.
+ From backlog it always goes here. Kerboom...
+ Fortunately, dccp_rcv_established and rcv_established
+ handle them correctly, but it is not case with
+ dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
+ */
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return dccp_v4_do_rcv(sk, skb);
+
+ if (sk_filter(sk, skb))
+ goto discard;
+
+ /*
+ * socket locking is here for SMP purposes as backlog rcv is currently
+ * called with bh processing disabled.
+ */
+
+ /* Do Stevens' IPV6_PKTOPTIONS.
+
+ Yes, guys, it is the only place in our code, where we
+ may make it not affecting IPv4.
+ The rest of code is protocol independent,
+ and I do not like idea to uglify IPv4.
+
+ Actually, all the idea behind IPV6_PKTOPTIONS
+ looks not very well thought. For now we latch
+ options, received in the last packet, enqueued
+ by tcp. Feel free to propose better solution.
+ --ANK (980728)
+ */
+ if (np->rxopt.all)
+ /*
+ * FIXME: Add handling of IPV6_PKTOPTIONS skb. See the comments below
+ * (wrt ipv6_pktopions) and net/ipv6/tcp_ipv6.c for an example.
+ */
+ opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ if (opt_skb) {
+ /* XXX This is where we would goto ipv6_pktoptions. */
+ __kfree_skb(opt_skb);
+ }
+ return 0;
+ }
+
+ /*
+ * Step 3: Process LISTEN state
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v6_hnd_req(sk, skb);
+
+ if (nsk == NULL)
+ goto discard;
+ /*
+ * Queue it on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket..
+ */
+ if (nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ if (opt_skb != NULL)
+ __kfree_skb(opt_skb);
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
+ goto reset;
+ if (opt_skb) {
+ /* XXX This is where we would goto ipv6_pktoptions. */
+ __kfree_skb(opt_skb);
+ }
+ return 0;
+
+reset:
+ dccp_v6_ctl_send_reset(sk, skb);
+discard:
+ if (opt_skb != NULL)
+ __kfree_skb(opt_skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+static int dccp_v6_rcv(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+ struct sock *sk;
+ int min_cov;
+
+ /* Step 1: Check header basics */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ /* Step 1: If header checksum is incorrect, drop packet and return. */
+ if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr)) {
+ DCCP_WARN("dropped packet with invalid checksum\n");
+ goto discard_it;
+ }
+
+ dh = dccp_hdr(skb);
+
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ if (dccp_packet_without_ack(skb))
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ else
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport,
+ inet6_iif(skb));
+ /*
+ * Step 2:
+ * If no socket ...
+ */
+ if (sk == NULL) {
+ dccp_pr_debug("failed to look up flow ID in table and "
+ "get corresponding socket\n");
+ goto no_dccp_socket;
+ }
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+ inet_twsk_put(inet_twsk(sk));
+ goto no_dccp_socket;
+ }
+
+ /*
+ * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+ * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+ * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+ */
+ min_cov = dccp_sk(sk)->dccps_pcrlen;
+ if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
+ dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+ dh->dccph_cscov, min_cov);
+ /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
+ goto discard_and_relse;
+ }
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+ return sk_receive_skb(sk, skb, 1) ? -1 : 0;
+
+no_dccp_socket:
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v6_ctl_send_reset(sk, skb);
+ }
+
+discard_it:
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int addr_type;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (np->sndflow) {
+ fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
+ struct ip6_flowlabel *flowlabel;
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ fl6_sock_release(flowlabel);
+ }
+ }
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+ if (ipv6_addr_any(&usin->sin6_addr))
+ usin->sin6_addr.s6_addr[15] = 1;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ return -ENETUNREACH;
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ /* If interface is set while binding, indices
+ * must coincide.
+ */
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id)
+ return -EINVAL;
+
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ return -EINVAL;
+ }
+
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
+
+ /*
+ * DCCP over IPv4
+ */
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
+ struct sockaddr_in sin;
+
+ SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+ icsk->icsk_af_ops = &dccp_ipv6_mapped;
+ sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+ err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &dccp_ipv6_af_ops;
+ sk->sk_backlog_rcv = dccp_v6_do_rcv;
+ goto failure;
+ }
+ np->saddr = sk->sk_v6_rcv_saddr;
+ return err;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
+
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto failure;
+ }
+
+ if (saddr == NULL) {
+ saddr = &fl6.saddr;
+ sk->sk_v6_rcv_saddr = *saddr;
+ }
+
+ /* set the source address */
+ np->saddr = *saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+ __ip6_dst_store(sk, dst, NULL, NULL);
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt != NULL)
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
+
+ inet->inet_dport = usin->sin6_port;
+
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = inet6_hash_connect(&dccp_death_row, sk);
+ if (err)
+ goto late_failure;
+
+ dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport);
+ err = dccp_connect(sk);
+ if (err)
+ goto late_failure;
+
+ return 0;
+
+late_failure:
+ dccp_set_state(sk, DCCP_CLOSED);
+ __sk_dst_reset(sk);
+failure:
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ return err;
+}
+
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+ .queue_xmit = inet6_csk_xmit,
+ .send_check = dccp_v6_send_check,
+ .rebuild_header = inet6_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+ .bind_conflict = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+/*
+ * DCCP over IPv4 via INET6 API
+ */
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int dccp_v6_init_sock(struct sock *sk)
+{
+ static __u8 dccp_v6_ctl_sock_initialized;
+ int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
+
+ if (err == 0) {
+ if (unlikely(!dccp_v6_ctl_sock_initialized))
+ dccp_v6_ctl_sock_initialized = 1;
+ inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+ }
+
+ return err;
+}
+
+static void dccp_v6_destroy_sock(struct sock *sk)
+{
+ dccp_destroy_sock(sk);
+ inet6_destroy_sock(sk);
+}
+
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
+};
+
+static struct proto dccp_v6_prot = {
+ .name = "DCCPv6",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v6_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v6_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v6_do_rcv,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .accept = inet_csk_accept,
+ .get_port = inet_csk_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v6_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .rsk_prot = &dccp6_request_sock_ops,
+ .twsk_prot = &dccp6_timewait_sock_ops,
+ .h.hashinfo = &dccp_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_dccp_setsockopt,
+ .compat_getsockopt = compat_dccp_getsockopt,
+#endif
+};
+
+static const struct inet6_protocol dccp_v6_protocol = {
+ .handler = dccp_v6_rcv,
+ .err_handler = dccp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static const struct proto_ops inet6_dccp_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet6_getname,
+ .poll = dccp_poll,
+ .ioctl = inet6_ioctl,
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw dccp_v6_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v6_prot,
+ .ops = &inet6_dccp_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int __net_init dccp_v6_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v6_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
+}
+
+static struct pernet_operations dccp_v6_ops = {
+ .init = dccp_v6_init_net,
+ .exit = dccp_v6_exit_net,
+};
+
+static int __init dccp_v6_init(void)
+{
+ int err = proto_register(&dccp_v6_prot, 1);
+
+ if (err != 0)
+ goto out;
+
+ err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ if (err != 0)
+ goto out_unregister_proto;
+
+ inet6_register_protosw(&dccp_v6_protosw);
+
+ err = register_pernet_subsys(&dccp_v6_ops);
+ if (err != 0)
+ goto out_destroy_ctl_sock;
+out:
+ return err;
+
+out_destroy_ctl_sock:
+ inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ inet6_unregister_protosw(&dccp_v6_protosw);
+out_unregister_proto:
+ proto_unregister(&dccp_v6_prot);
+ goto out;
+}
+
+static void __exit dccp_v6_exit(void)
+{
+ unregister_pernet_subsys(&dccp_v6_ops);
+ inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ inet6_unregister_protosw(&dccp_v6_protosw);
+ proto_unregister(&dccp_v6_prot);
+}
+
+module_init(dccp_v6_init);
+module_exit(dccp_v6_exit);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 000000000..af259e15e
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,34 @@
+#ifndef _DCCP_IPV6_H
+#define _DCCP_IPV6_H
+/*
+ * net/dccp/ipv6.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/ipv6.h>
+
+struct dccp6_sock {
+ struct dccp_sock dccp;
+ /*
+ * ipv6_pinfo has to be the last member of dccp6_sock,
+ * see inet6_sk_generic.
+ */
+ struct ipv6_pinfo inet6;
+};
+
+struct dccp6_request_sock {
+ struct dccp_request_sock dccp;
+};
+
+struct dccp6_timewait_sock {
+ struct inet_timewait_sock inet;
+};
+
+#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 000000000..30addee2d
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,261 @@
+/*
+ * net/dccp/minisocks.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/inet_timewait_sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+struct inet_timewait_death_row dccp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .hashinfo = &dccp_hashinfo,
+};
+
+EXPORT_SYMBOL_GPL(dccp_death_row);
+
+void dccp_time_wait(struct sock *sk, int state, int timeo)
+{
+ struct inet_timewait_sock *tw;
+
+ tw = inet_twsk_alloc(sk, &dccp_death_row, state);
+
+ if (tw != NULL) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == PF_INET6) {
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_ipv6only = sk->sk_ipv6only;
+ }
+#endif
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+
+ /* Get the TIME_WAIT timeout firing. */
+ if (timeo < rto)
+ timeo = rto;
+
+ tw->tw_timeout = DCCP_TIMEWAIT_LEN;
+ if (state == DCCP_TIME_WAIT)
+ timeo = DCCP_TIMEWAIT_LEN;
+
+ inet_twsk_schedule(tw, timeo);
+ inet_twsk_put(tw);
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ DCCP_WARN("time wait bucket table overflow\n");
+ }
+
+ dccp_done(sk);
+}
+
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ */
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+
+ if (newsk != NULL) {
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+ struct dccp_sock *newdp = dccp_sk(newsk);
+
+ newdp->dccps_role = DCCP_ROLE_SERVER;
+ newdp->dccps_hc_rx_ackvec = NULL;
+ newdp->dccps_service_list = NULL;
+ newdp->dccps_service = dreq->dreq_service;
+ newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
+ newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
+ newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
+
+ INIT_LIST_HEAD(&newdp->dccps_featneg);
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR from packet (or Init Cookies)
+ *
+ * Setting AWL/AWH and SWL/SWH happens as part of the feature
+ * activation below, as these windows all depend on the local
+ * and remote Sequence Window feature values (7.5.2).
+ */
+ newdp->dccps_iss = dreq->dreq_iss;
+ newdp->dccps_gss = dreq->dreq_gss;
+ newdp->dccps_gar = newdp->dccps_iss;
+ newdp->dccps_isr = dreq->dreq_isr;
+ newdp->dccps_gsr = dreq->dreq_gsr;
+
+ /*
+ * Activate features: initialise CCIDs, sequence windows etc.
+ */
+ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
+ }
+ dccp_init_xmit_timers(newsk);
+
+ DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+ }
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
+
+/*
+ * Process an incoming packet for RESPOND sockets represented
+ * as an request_sock.
+ */
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
+{
+ struct sock *child = NULL;
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+
+ /* Check for retransmitted REQUEST */
+ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
+ dccp_pr_debug("Retransmitted REQUEST\n");
+ dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
+ /*
+ * Send another RESPONSE packet
+ * To protect against Request floods, increment retrans
+ * counter (backoff, monitored by dccp_response_timer).
+ */
+ inet_rtx_syn_ack(sk, req);
+ }
+ /* Network Duplicate, discard packet */
+ return NULL;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
+ dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
+ goto drop;
+
+ /* Invalid ACK */
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dreq->dreq_iss, dreq->dreq_gss)) {
+ dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
+ "dreq_iss=%llu, dreq_gss=%llu\n",
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long) dreq->dreq_iss,
+ (unsigned long long) dreq->dreq_gss);
+ goto drop;
+ }
+
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ goto listen_overflow;
+
+ inet_csk_reqsk_queue_drop(sk, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+out:
+ return child;
+listen_overflow:
+ dccp_pr_debug("listen_overflow!\n");
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+drop:
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+ req->rsk_ops->send_reset(sk, skb);
+
+ inet_csk_reqsk_queue_drop(sk, req);
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_check_req);
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ */
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ const int state = child->sk_state;
+
+ if (!sock_owned_by_user(child)) {
+ ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
+ skb->len);
+
+ /* Wakeup parent, send SIGIO */
+ if (state == DCCP_RESPOND && child->sk_state != state)
+ parent->sk_data_ready(parent);
+ } else {
+ /* Alas, it is possible again, because we do lookup
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+ __sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(dccp_child_process);
+
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk)
+{
+ DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
+
+int dccp_reqsk_init(struct request_sock *req,
+ struct dccp_sock const *dp, struct sk_buff const *skb)
+{
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+
+ inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
+ inet_rsk(req)->acked = 0;
+ dreq->dreq_timestamp_echo = 0;
+
+ /* inherit feature negotiation options from listening socket */
+ return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 000000000..9bce31886
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,609 @@
+/*
+ * net/dccp/options.c
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+u64 dccp_decode_value_var(const u8 *bf, const u8 len)
+{
+ u64 value = 0;
+
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ value += ((u64)*bf++) << 40;
+ if (len > 4)
+ value += ((u64)*bf++) << 32;
+ if (len > 3)
+ value += ((u64)*bf++) << 24;
+ if (len > 2)
+ value += ((u64)*bf++) << 16;
+ if (len > 1)
+ value += ((u64)*bf++) << 8;
+ if (len > 0)
+ value += *bf;
+
+ return value;
+}
+
+/**
+ * dccp_parse_options - Parse DCCP options present in @skb
+ * @sk: client|server|listening dccp socket (when @dreq != NULL)
+ * @dreq: request socket to use during connection setup, or NULL
+ */
+int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+ unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+ unsigned char *opt_ptr = options;
+ const unsigned char *opt_end = (unsigned char *)dh +
+ (dh->dccph_doff * 4);
+ struct dccp_options_received *opt_recv = &dp->dccps_options_received;
+ unsigned char opt, len;
+ unsigned char *uninitialized_var(value);
+ u32 elapsed_time;
+ __be32 opt_val;
+ int rc;
+ int mandatory = 0;
+
+ memset(opt_recv, 0, sizeof(*opt_recv));
+
+ opt = len = 0;
+ while (opt_ptr != opt_end) {
+ opt = *opt_ptr++;
+ len = 0;
+ value = NULL;
+
+ /* Check if this isn't a single byte option */
+ if (opt > DCCPO_MAX_RESERVED) {
+ if (opt_ptr == opt_end)
+ goto out_nonsensical_length;
+
+ len = *opt_ptr++;
+ if (len < 2)
+ goto out_nonsensical_length;
+ /*
+ * Remove the type and len fields, leaving
+ * just the value size
+ */
+ len -= 2;
+ value = opt_ptr;
+ opt_ptr += len;
+
+ if (opt_ptr > opt_end)
+ goto out_nonsensical_length;
+ }
+
+ /*
+ * CCID-specific options are ignored during connection setup, as
+ * negotiation may still be in progress (see RFC 4340, 10.3).
+ * The same applies to Ack Vectors, as these depend on the CCID.
+ */
+ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
+ opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
+ goto ignore_option;
+
+ switch (opt) {
+ case DCCPO_PADDING:
+ break;
+ case DCCPO_MANDATORY:
+ if (mandatory)
+ goto out_invalid_option;
+ if (pkt_type != DCCP_PKT_DATA)
+ mandatory = 1;
+ break;
+ case DCCPO_NDP_COUNT:
+ if (len > 6)
+ goto out_invalid_option;
+
+ opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
+ dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
+ (unsigned long long)opt_recv->dccpor_ndp);
+ break;
+ case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
+ if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
+ break;
+ if (len == 0)
+ goto out_invalid_option;
+ rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
+ *value, value + 1, len - 1);
+ if (rc)
+ goto out_featneg_failed;
+ break;
+ case DCCPO_TIMESTAMP:
+ if (len != 4)
+ goto out_invalid_option;
+ /*
+ * RFC 4340 13.1: "The precise time corresponding to
+ * Timestamp Value zero is not specified". We use
+ * zero to indicate absence of a meaningful timestamp.
+ */
+ opt_val = get_unaligned((__be32 *)value);
+ if (unlikely(opt_val == 0)) {
+ DCCP_WARN("Timestamp with zero value\n");
+ break;
+ }
+
+ if (dreq != NULL) {
+ dreq->dreq_timestamp_echo = ntohl(opt_val);
+ dreq->dreq_timestamp_time = dccp_timestamp();
+ } else {
+ opt_recv->dccpor_timestamp =
+ dp->dccps_timestamp_echo = ntohl(opt_val);
+ dp->dccps_timestamp_time = dccp_timestamp();
+ }
+ dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
+ dccp_role(sk), ntohl(opt_val),
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /* schedule an Ack in case this sender is quiescent */
+ inet_csk_schedule_ack(sk);
+ break;
+ case DCCPO_TIMESTAMP_ECHO:
+ if (len != 4 && len != 6 && len != 8)
+ goto out_invalid_option;
+
+ opt_val = get_unaligned((__be32 *)value);
+ opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
+
+ dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
+ "ackno=%llu", dccp_role(sk),
+ opt_recv->dccpor_timestamp_echo,
+ len + 2,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+
+ value += 4;
+
+ if (len == 4) { /* no elapsed time included */
+ dccp_pr_debug_cat("\n");
+ break;
+ }
+
+ if (len == 6) { /* 2-byte elapsed time */
+ __be16 opt_val2 = get_unaligned((__be16 *)value);
+ elapsed_time = ntohs(opt_val2);
+ } else { /* 4-byte elapsed time */
+ opt_val = get_unaligned((__be32 *)value);
+ elapsed_time = ntohl(opt_val);
+ }
+
+ dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
+
+ /* Give precedence to the biggest ELAPSED_TIME */
+ if (elapsed_time > opt_recv->dccpor_elapsed_time)
+ opt_recv->dccpor_elapsed_time = elapsed_time;
+ break;
+ case DCCPO_ELAPSED_TIME:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
+ break;
+
+ if (len == 2) {
+ __be16 opt_val2 = get_unaligned((__be16 *)value);
+ elapsed_time = ntohs(opt_val2);
+ } else if (len == 4) {
+ opt_val = get_unaligned((__be32 *)value);
+ elapsed_time = ntohl(opt_val);
+ } else {
+ goto out_invalid_option;
+ }
+
+ if (elapsed_time > opt_recv->dccpor_elapsed_time)
+ opt_recv->dccpor_elapsed_time = elapsed_time;
+
+ dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
+ dccp_role(sk), elapsed_time);
+ break;
+ case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
+ if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
+ pkt_type, opt, value, len))
+ goto out_invalid_option;
+ break;
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ break;
+ /*
+ * Ack vectors are processed by the TX CCID if it is
+ * interested. The RX CCID need not parse Ack Vectors,
+ * since it is only interested in clearing old state.
+ * Fall through.
+ */
+ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
+ if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
+ pkt_type, opt, value, len))
+ goto out_invalid_option;
+ break;
+ default:
+ DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
+ "implemented, ignoring", sk, opt, len);
+ break;
+ }
+ignore_option:
+ if (opt != DCCPO_MANDATORY)
+ mandatory = 0;
+ }
+
+ /* mandatory was the last byte in option list -> reset connection */
+ if (mandatory)
+ goto out_invalid_option;
+
+out_nonsensical_length:
+ /* RFC 4340, 5.8: ignore option and all remaining option space */
+ return 0;
+
+out_invalid_option:
+ DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+ rc = DCCP_RESET_CODE_OPTION_ERROR;
+out_featneg_failed:
+ DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
+ DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
+ return -1;
+}
+
+EXPORT_SYMBOL_GPL(dccp_parse_options);
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
+{
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ *to++ = (value & 0xFF0000000000ull) >> 40;
+ if (len > 4)
+ *to++ = (value & 0xFF00000000ull) >> 32;
+ if (len > 3)
+ *to++ = (value & 0xFF000000) >> 24;
+ if (len > 2)
+ *to++ = (value & 0xFF0000) >> 16;
+ if (len > 1)
+ *to++ = (value & 0xFF00) >> 8;
+ if (len > 0)
+ *to++ = (value & 0xFF);
+}
+
+static inline u8 dccp_ndp_len(const u64 ndp)
+{
+ if (likely(ndp <= 0xFF))
+ return 1;
+ return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
+}
+
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+ const void *value, const unsigned char len)
+{
+ unsigned char *to;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
+ return -1;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
+
+ to = skb_push(skb, len + 2);
+ *to++ = option;
+ *to++ = len + 2;
+
+ memcpy(to, value, len);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option);
+
+static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ u64 ndp = dp->dccps_ndp_count;
+
+ if (dccp_non_data_packet(skb))
+ ++dp->dccps_ndp_count;
+ else
+ dp->dccps_ndp_count = 0;
+
+ if (ndp > 0) {
+ unsigned char *ptr;
+ const int ndp_len = dccp_ndp_len(ndp);
+ const int len = ndp_len + 2;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ return -1;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ ptr = skb_push(skb, len);
+ *ptr++ = DCCPO_NDP_COUNT;
+ *ptr++ = len;
+ dccp_encode_value_var(ndp, ptr, ndp_len);
+ }
+
+ return 0;
+}
+
+static inline int dccp_elapsed_time_len(const u32 elapsed_time)
+{
+ return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
+}
+
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
+{
+ __be32 now = htonl(dccp_timestamp());
+ /* yes this will overflow but that is the point as we want a
+ * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
+
+ return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+}
+
+static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
+ struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ __be32 tstamp_echo;
+ unsigned char *to;
+ u32 elapsed_time, elapsed_time_len, len;
+
+ if (dreq != NULL) {
+ elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
+ tstamp_echo = htonl(dreq->dreq_timestamp_echo);
+ dreq->dreq_timestamp_echo = 0;
+ } else {
+ elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
+ tstamp_echo = htonl(dp->dccps_timestamp_echo);
+ dp->dccps_timestamp_echo = 0;
+ }
+
+ elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+ len = 6 + elapsed_time_len;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ return -1;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ *to++ = DCCPO_TIMESTAMP_ECHO;
+ *to++ = len;
+
+ memcpy(to, &tstamp_echo, 4);
+ to += 4;
+
+ if (elapsed_time_len == 2) {
+ const __be16 var16 = htons((u16)elapsed_time);
+ memcpy(to, &var16, 2);
+ } else if (elapsed_time_len == 4) {
+ const __be32 var32 = htonl(elapsed_time);
+ memcpy(to, &var32, 4);
+ }
+
+ return 0;
+}
+
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const u16 buflen = dccp_ackvec_buflen(av);
+ /* Figure out how many options do we need to represent the ackvec */
+ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+ u16 len = buflen + 2 * nr_opts;
+ u8 i, nonce = 0;
+ const unsigned char *tail, *from;
+ unsigned char *to;
+
+ if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+ dccp_packet_name(dcb->dccpd_type));
+ return -1;
+ }
+ /*
+ * Since Ack Vectors are variable-length, we can not always predict
+ * their size. To catch exception cases where the space is running out
+ * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+ */
+ if (len > DCCPAV_MIN_OPTLEN &&
+ len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+ DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+ "MPS=%u ==> reduce payload size?\n", len, skb->len,
+ dcb->dccpd_opt_len, dp->dccps_mss_cache);
+ dp->dccps_sync_scheduled = 1;
+ return 0;
+ }
+ dcb->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ len = buflen;
+ from = av->av_buf + av->av_buf_head;
+ tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+ for (i = 0; i < nr_opts; ++i) {
+ int copylen = len;
+
+ if (len > DCCP_SINGLE_OPT_MAXLEN)
+ copylen = DCCP_SINGLE_OPT_MAXLEN;
+
+ /*
+ * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+ * its type; ack_nonce is the sum of all individual buf_nonce's.
+ */
+ nonce ^= av->av_buf_nonce[i];
+
+ *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+ *to++ = copylen + 2;
+
+ /* Check if buf_head wraps */
+ if (from + copylen > tail) {
+ const u16 tailsize = tail - from;
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ copylen -= tailsize;
+ from = av->av_buf;
+ }
+
+ memcpy(to, from, copylen);
+ from += copylen;
+ to += copylen;
+ len -= copylen;
+ }
+ /*
+ * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+ */
+ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
+ return -ENOBUFS;
+ return 0;
+}
+
+/**
+ * dccp_insert_option_mandatory - Mandatory option (5.8.2)
+ * Note that since we are using skb_push, this function needs to be called
+ * _after_ inserting the option it is supposed to influence (stack order).
+ */
+int dccp_insert_option_mandatory(struct sk_buff *skb)
+{
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
+ return -1;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len++;
+ *skb_push(skb, 1) = DCCPO_MANDATORY;
+ return 0;
+}
+
+/**
+ * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
+ * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
+ * @feat: one out of %dccp_feature_numbers
+ * @val: NN value or SP array (preferred element first) to copy
+ * @len: true length of @val in bytes (excluding first element repetition)
+ * @repeat_first: whether to copy the first element of @val twice
+ *
+ * The last argument is used to construct Confirm options, where the preferred
+ * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
+ * lists are kept such that the preferred entry is always first, so we only need
+ * to copy twice, and avoid the overhead of cloning into a bigger array.
+ */
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+ u8 *val, u8 len, bool repeat_first)
+{
+ u8 tot_len, *to;
+
+ /* take the `Feature' field and possible repetition into account */
+ if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
+ DCCP_WARN("length %u for feature %u too large\n", len, feat);
+ return -1;
+ }
+
+ if (unlikely(val == NULL || len == 0))
+ len = repeat_first = false;
+ tot_len = 3 + repeat_first + len;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("packet too small for feature %d option!\n", feat);
+ return -1;
+ }
+ DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
+
+ to = skb_push(skb, tot_len);
+ *to++ = type;
+ *to++ = tot_len;
+ *to++ = feat;
+
+ if (repeat_first)
+ *to++ = *val;
+ if (len)
+ memcpy(to, val, len);
+ return 0;
+}
+
+/* The length of all options needs to be a multiple of 4 (5.8) */
+static void dccp_insert_option_padding(struct sk_buff *skb)
+{
+ int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
+
+ if (padding != 0) {
+ padding = 4 - padding;
+ memset(skb_push(skb, padding), 0, padding);
+ DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
+ }
+}
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+ if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
+ return -1;
+
+ if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
+
+ /* Feature Negotiation */
+ if (dccp_feat_insert_opts(dp, NULL, skb))
+ return -1;
+
+ if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
+ /*
+ * Obtain RTT sample from Request/Response exchange.
+ * This is currently used for TFRC initialisation.
+ */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
+ } else if (dccp_ackvec_pending(sk) &&
+ dccp_insert_option_ackvec(sk, skb)) {
+ return -1;
+ }
+ }
+
+ if (dp->dccps_hc_rx_insert_options) {
+ if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
+ return -1;
+ dp->dccps_hc_rx_insert_options = 0;
+ }
+
+ if (dp->dccps_timestamp_echo != 0 &&
+ dccp_insert_option_timestamp_echo(dp, NULL, skb))
+ return -1;
+
+ dccp_insert_option_padding(skb);
+ return 0;
+}
+
+int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
+{
+ DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+ if (dccp_feat_insert_opts(NULL, dreq, skb))
+ return -1;
+
+ /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
+ if (dreq->dreq_timestamp_echo != 0 &&
+ dccp_insert_option_timestamp_echo(NULL, dreq, skb))
+ return -1;
+
+ dccp_insert_option_padding(skb);
+ return 0;
+}
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 000000000..0248e8a34
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,698 @@
+/*
+ * net/dccp/output.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/inet_sock.h>
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+static inline void dccp_event_ack_sent(struct sock *sk)
+{
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
+static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+ skb_set_owner_w(skb, sk);
+ WARN_ON(sk->sk_send_head);
+ sk->sk_send_head = skb;
+ return skb_clone(sk->sk_send_head, gfp_any());
+}
+
+/*
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the DCCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ */
+static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (likely(skb != NULL)) {
+ struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ struct dccp_hdr *dh;
+ /* XXX For now we're using only 48 bits sequence numbers */
+ const u32 dccp_header_size = sizeof(*dh) +
+ sizeof(struct dccp_hdr_ext) +
+ dccp_packet_hdr_len(dcb->dccpd_type);
+ int err, set_ack = 1;
+ u64 ackno = dp->dccps_gsr;
+ /*
+ * Increment GSS here already in case the option code needs it.
+ * Update GSS for real only if option processing below succeeds.
+ */
+ dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
+
+ switch (dcb->dccpd_type) {
+ case DCCP_PKT_DATA:
+ set_ack = 0;
+ /* fall through */
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_RESET:
+ break;
+
+ case DCCP_PKT_REQUEST:
+ set_ack = 0;
+ /* Use ISS on the first (non-retransmitted) Request. */
+ if (icsk->icsk_retransmits == 0)
+ dcb->dccpd_seq = dp->dccps_iss;
+ /* fall through */
+
+ case DCCP_PKT_SYNC:
+ case DCCP_PKT_SYNCACK:
+ ackno = dcb->dccpd_ack_seq;
+ /* fall through */
+ default:
+ /*
+ * Set owner/destructor: some skbs are allocated via
+ * alloc_skb (e.g. when retransmission may happen).
+ * Only Data, DataAck, and Reset packets should come
+ * through here with skb->sk set.
+ */
+ WARN_ON(skb->sk);
+ skb_set_owner_w(skb, sk);
+ break;
+ }
+
+ if (dccp_insert_options(sk, skb)) {
+ kfree_skb(skb);
+ return -EPROTO;
+ }
+
+
+ /* Build DCCP header and checksum it. */
+ dh = dccp_zeroed_hdr(skb, dccp_header_size);
+ dh->dccph_type = dcb->dccpd_type;
+ dh->dccph_sport = inet->inet_sport;
+ dh->dccph_dport = inet->inet_dport;
+ dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
+ dh->dccph_ccval = dcb->dccpd_ccval;
+ dh->dccph_cscov = dp->dccps_pcslen;
+ /* XXX For now we're using only 48 bits sequence numbers */
+ dh->dccph_x = 1;
+
+ dccp_update_gss(sk, dcb->dccpd_seq);
+ dccp_hdr_set_seq(dh, dp->dccps_gss);
+ if (set_ack)
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
+
+ switch (dcb->dccpd_type) {
+ case DCCP_PKT_REQUEST:
+ dccp_hdr_request(skb)->dccph_req_service =
+ dp->dccps_service;
+ /*
+ * Limit Ack window to ISS <= P.ackno <= GSS, so that
+ * only Responses to Requests we sent are considered.
+ */
+ dp->dccps_awl = dp->dccps_iss;
+ break;
+ case DCCP_PKT_RESET:
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ dcb->dccpd_reset_code;
+ break;
+ }
+
+ icsk->icsk_af_ops->send_check(sk, skb);
+
+ if (set_ack)
+ dccp_event_ack_sent(sk);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+ return net_xmit_eval(err);
+ }
+ return -ENOBUFS;
+}
+
+/**
+ * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
+ * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
+ * since the RX CCID is restricted to feedback packets (Acks), which are small
+ * in comparison with the data traffic. A value of 0 means "no current CCMPS".
+ */
+static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
+{
+ const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
+
+ if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
+ return 0;
+ return tx_ccid->ccid_ops->ccid_ccmps;
+}
+
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 ccmps = dccp_determine_ccmps(dp);
+ u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+
+ /* Account for header lengths and IPv4/v6 option overhead */
+ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
+ sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
+
+ /*
+ * Leave enough headroom for common DCCP header options.
+ * This only considers options which may appear on DCCP-Data packets, as
+ * per table 3 in RFC 4340, 5.8. When running out of space for other
+ * options (eg. Ack Vector which can take up to 255 bytes), it is better
+ * to schedule a separate Ack. Thus we leave headroom for the following:
+ * - 1 byte for Slow Receiver (11.6)
+ * - 6 bytes for Timestamp (13.1)
+ * - 10 bytes for Timestamp Echo (13.3)
+ * - 8 bytes for NDP count (7.7, when activated)
+ * - 6 bytes for Data Checksum (9.3)
+ * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
+ */
+ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+ (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
+
+ /* And store cached results */
+ icsk->icsk_pmtu_cookie = pmtu;
+ dp->dccps_mss_cache = cur_mps;
+
+ return cur_mps;
+}
+
+EXPORT_SYMBOL_GPL(dccp_sync_mss);
+
+void dccp_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+
+ rcu_read_unlock();
+}
+
+/**
+ * dccp_wait_for_ccid - Await CCID send permission
+ * @sk: socket to wait for
+ * @delay: timeout in jiffies
+ *
+ * This is used by CCIDs which need to delay the send time in process context.
+ */
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
+{
+ DEFINE_WAIT(wait);
+ long remaining;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending++;
+ release_sock(sk);
+
+ remaining = schedule_timeout(delay);
+
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (signal_pending(current) || sk->sk_err)
+ return -1;
+ return remaining;
+}
+
+/**
+ * dccp_xmit_packet - Send data packet under control of CCID
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+ int err, len;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb = dccp_qpolicy_pop(sk);
+
+ if (unlikely(skb == NULL))
+ return;
+ len = skb->len;
+
+ if (sk->sk_state == DCCP_PARTOPEN) {
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else if (dccp_ack_pending(sk)) {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+ }
+
+ err = dccp_transmit_skb(sk, skb);
+ if (err)
+ dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+ /*
+ * Register this one as sent even if an error occurred. To the remote
+ * end a local packet drop is indistinguishable from network loss, i.e.
+ * any local drop will eventually be reported via receiver feedback.
+ */
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+ /*
+ * If the CCID needs to transfer additional header options out-of-band
+ * (e.g. Ack Vectors or feature-negotiation options), it activates this
+ * flag to schedule a Sync. The Sync will automatically incorporate all
+ * currently pending header options, thus clearing the backlog.
+ */
+ if (dp->dccps_sync_scheduled)
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+}
+
+/**
+ * dccp_flush_write_queue - Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+ long delay, rc;
+
+ while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ /*
+ * If the CCID determines when to send, the next sending
+ * time is unknown or the CCID may not even send again
+ * (e.g. remote host crashes or lost Ack packets).
+ */
+ DCCP_WARN("CCID did not manage to send all packets\n");
+ return;
+ case CCID_PACKET_DELAY:
+ delay = msecs_to_jiffies(rc);
+ if (delay > *time_budget)
+ return;
+ rc = dccp_wait_for_ccid(sk, delay);
+ if (rc < 0)
+ return;
+ *time_budget -= (delay - rc);
+ /* check again if we can send now */
+ break;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ skb_dequeue(&sk->sk_write_queue);
+ kfree_skb(skb);
+ dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+ }
+ }
+}
+
+void dccp_write_xmit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = dccp_qpolicy_top(sk))) {
+ int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ return;
+ case CCID_PACKET_DELAY:
+ sk_reset_timer(sk, &dp->dccps_xmit_timer,
+ jiffies + msecs_to_jiffies(rc));
+ return;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ dccp_qpolicy_drop(sk, skb);
+ dccp_pr_debug("packet discarded due to err=%d\n", rc);
+ }
+ }
+}
+
+/**
+ * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
+ * There are only four retransmittable packet types in DCCP:
+ * - Request in client-REQUEST state (sec. 8.1.1),
+ * - CloseReq in server-CLOSEREQ state (sec. 8.3),
+ * - Close in node-CLOSING state (sec. 8.3),
+ * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
+ * This function expects sk->sk_send_head to contain the original skb.
+ */
+int dccp_retransmit_skb(struct sock *sk)
+{
+ WARN_ON(sk->sk_send_head == NULL);
+
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ /* this count is used to distinguish original and retransmitted skb */
+ inet_csk(sk)->icsk_retransmits++;
+
+ return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
+}
+
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct dccp_hdr *dh;
+ struct dccp_request_sock *dreq;
+ const u32 dccp_header_size = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_response);
+ struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+
+ skb_dst_set(skb, dst_clone(dst));
+
+ dreq = dccp_rsk(req);
+ if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
+ dccp_inc_seqno(&dreq->dreq_gss);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
+ DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
+
+ /* Resolve feature dependencies resulting from choice of CCID */
+ if (dccp_feat_server_ccid_dependencies(dreq))
+ goto response_failed;
+
+ if (dccp_insert_options_rsk(dreq, skb))
+ goto response_failed;
+
+ /* Build and checksum header */
+ dh = dccp_zeroed_hdr(skb, dccp_header_size);
+
+ dh->dccph_sport = htons(inet_rsk(req)->ir_num);
+ dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
+ dh->dccph_doff = (dccp_header_size +
+ DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+ dh->dccph_type = DCCP_PKT_RESPONSE;
+ dh->dccph_x = 1;
+ dccp_hdr_set_seq(dh, dreq->dreq_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
+ dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
+
+ dccp_csum_outgoing(skb);
+
+ /* We use `acked' to remember that a Response was already sent. */
+ inet_rsk(req)->acked = 1;
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ return skb;
+response_failed:
+ kfree_skb(skb);
+ return NULL;
+}
+
+EXPORT_SYMBOL_GPL(dccp_make_response);
+
+/* answer offending packet in @rcv_skb with Reset from control socket @ctl */
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
+{
+ struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
+ const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct dccp_hdr_reset *dhr;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ skb_reserve(skb, sk->sk_prot->max_header);
+
+ /* Swap the send and the receive. */
+ dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
+
+ dhr = dccp_hdr_reset(skb);
+ dhr->dccph_reset_code = dcb->dccpd_reset_code;
+
+ switch (dcb->dccpd_reset_code) {
+ case DCCP_RESET_CODE_PACKET_ERROR:
+ dhr->dccph_reset_data[0] = rxdh->dccph_type;
+ break;
+ case DCCP_RESET_CODE_OPTION_ERROR: /* fall through */
+ case DCCP_RESET_CODE_MANDATORY_ERROR:
+ memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
+ break;
+ }
+ /*
+ * From RFC 4340, 8.3.1:
+ * If P.ackno exists, set R.seqno := P.ackno + 1.
+ * Else set R.seqno := 0.
+ */
+ if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
+
+ dccp_csum_outgoing(skb);
+ return skb;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
+
+/* send Reset on established socket, to close or abort the connection */
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+ struct sk_buff *skb;
+ /*
+ * FIXME: what if rebuild_header fails?
+ * Should we be doing a rebuild_header here?
+ */
+ int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
+
+ if (err != 0)
+ return err;
+
+ skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOBUFS;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
+ DCCP_SKB_CB(skb)->dccpd_reset_code = code;
+
+ return dccp_transmit_skb(sk, skb);
+}
+
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+int dccp_connect(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk->sk_err = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ dccp_sync_mss(sk, dst_mtu(dst));
+
+ /* do not connect if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dccp_sk(sk)))
+ return -EPROTO;
+
+ /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+ dp->dccps_gar = dp->dccps_iss;
+
+ skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
+ if (unlikely(skb == NULL))
+ return -ENOBUFS;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
+
+ dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
+ DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the REQUEST until an answer. */
+ icsk->icsk_retransmits = 0;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, DCCP_RTO_MAX);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_connect);
+
+void dccp_send_ack(struct sock *sk)
+{
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state != DCCP_CLOSED) {
+ struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
+ GFP_ATOMIC);
+
+ if (skb == NULL) {
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX,
+ DCCP_RTO_MAX);
+ return;
+ }
+
+ /* Reserve space for headers */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
+ dccp_transmit_skb(sk, skb);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_ack);
+
+#if 0
+/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
+void dccp_send_delayed_ack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ /*
+ * FIXME: tune this timer. elapsed time fixes the skew, so no problem
+ * with using 2s, and active senders also piggyback the ACK into a
+ * DATAACK packet, so this is really for quiescent senders.
+ */
+ unsigned long timeout = jiffies + 2 * HZ;
+
+ /* Use new timeout only if there wasn't a older one earlier. */
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ *
+ * FIXME: check the "about to expire" part
+ */
+ if (icsk->icsk_ack.blocked) {
+ dccp_send_ack(sk);
+ return;
+ }
+
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
+ }
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+#endif
+
+void dccp_send_sync(struct sock *sk, const u64 ackno,
+ const enum dccp_pkt_type pkt_type)
+{
+ /*
+ * We are not putting this on the write queue, so
+ * dccp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
+
+ if (skb == NULL) {
+ /* FIXME: how to make sure the sync is sent? */
+ DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
+
+ /*
+ * Clear the flag in case the Sync was scheduled for out-of-band data,
+ * such as carrying a long Ack Vector.
+ */
+ dccp_sk(sk)->dccps_sync_scheduled = 0;
+
+ dccp_transmit_skb(sk, skb);
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_sync);
+
+/*
+ * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
+ * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
+ * any circumstances.
+ */
+void dccp_send_close(struct sock *sk, const int active)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+ const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
+
+ skb = alloc_skb(sk->sk_prot->max_header, prio);
+ if (skb == NULL)
+ return;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
+ else
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
+
+ if (active) {
+ skb = dccp_skb_entail(sk, skb);
+ /*
+ * Retransmission timer for active-close: RFC 4340, 8.3 requires
+ * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
+ * state can be left. The initial timeout is 2 RTTs.
+ * Since RTT measurement is done by the CCIDs, there is no easy
+ * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
+ * is too low (200ms); we use a high value to avoid unnecessary
+ * retransmissions when the link RTT is > 0.2 seconds.
+ * FIXME: Let main module sample RTTs and use that instead.
+ */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
+ }
+ dccp_transmit_skb(sk, skb);
+}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
new file mode 100644
index 000000000..d8346d0ea
--- /dev/null
+++ b/net/dccp/probe.c
@@ -0,0 +1,202 @@
+/*
+ * dccp_probe - Observe the DCCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for DCCP from Stephen Hemminger's code
+ * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/dccp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+
+#include "dccp.h"
+#include "ccid.h"
+#include "ccids/ccid3.h"
+
+static int port;
+
+static int bufsize = 64 * 1024;
+
+static const char procname[] = "dccpprobe";
+
+static struct {
+ struct kfifo fifo;
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ struct timespec tstart;
+} dccpw;
+
+static void printl(const char *fmt, ...)
+{
+ va_list args;
+ int len;
+ struct timespec now;
+ char tbuf[256];
+
+ va_start(args, fmt);
+ getnstimeofday(&now);
+
+ now = timespec_sub(now, dccpw.tstart);
+
+ len = sprintf(tbuf, "%lu.%06lu ",
+ (unsigned long) now.tv_sec,
+ (unsigned long) now.tv_nsec / NSEC_PER_USEC);
+ len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+ va_end(args);
+
+ kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+ wake_up(&dccpw.wait);
+}
+
+static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct ccid3_hc_tx_sock *hc = NULL;
+
+ if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+ hc = ccid3_hc_tx_sk(sk);
+
+ if (port == 0 || ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port) {
+ if (hc)
+ printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport), size,
+ hc->tx_s, hc->tx_rtt, hc->tx_p,
+ hc->tx_x_calc, hc->tx_x_recv >> 6,
+ hc->tx_x >> 6, hc->tx_t_ipi);
+ else
+ printl("%pI4:%u %pI4:%u %d\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ size);
+ }
+
+ jprobe_return();
+ return 0;
+}
+
+static struct jprobe dccp_send_probe = {
+ .kp = {
+ .symbol_name = "dccp_sendmsg",
+ },
+ .entry = jdccp_sendmsg,
+};
+
+static int dccpprobe_open(struct inode *inode, struct file *file)
+{
+ kfifo_reset(&dccpw.fifo);
+ getnstimeofday(&dccpw.tstart);
+ return 0;
+}
+
+static ssize_t dccpprobe_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int error = 0, cnt = 0;
+ unsigned char *tbuf;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ tbuf = vmalloc(len);
+ if (!tbuf)
+ return -ENOMEM;
+
+ error = wait_event_interruptible(dccpw.wait,
+ kfifo_len(&dccpw.fifo) != 0);
+ if (error)
+ goto out_free;
+
+ cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+ error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
+
+out_free:
+ vfree(tbuf);
+
+ return error ? error : cnt;
+}
+
+static const struct file_operations dccpprobe_fops = {
+ .owner = THIS_MODULE,
+ .open = dccpprobe_open,
+ .read = dccpprobe_read,
+ .llseek = noop_llseek,
+};
+
+static __init int dccpprobe_init(void)
+{
+ int ret = -ENOMEM;
+
+ init_waitqueue_head(&dccpw.wait);
+ spin_lock_init(&dccpw.lock);
+ if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
+ return ret;
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
+ goto err0;
+
+ ret = register_jprobe(&dccp_send_probe);
+ if (ret) {
+ ret = request_module("dccp");
+ if (!ret)
+ ret = register_jprobe(&dccp_send_probe);
+ }
+
+ if (ret)
+ goto err1;
+
+ pr_info("DCCP watch registered (port=%d)\n", port);
+ return 0;
+err1:
+ remove_proc_entry(procname, init_net.proc_net);
+err0:
+ kfifo_free(&dccpw.fifo);
+ return ret;
+}
+module_init(dccpprobe_init);
+
+static __exit void dccpprobe_exit(void)
+{
+ kfifo_free(&dccpw.fifo);
+ remove_proc_entry(procname, init_net.proc_net);
+ unregister_jprobe(&dccp_send_probe);
+
+}
+module_exit(dccpprobe_exit);
+
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
+MODULE_DESCRIPTION("DCCP snooper");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 000000000..52a940165
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,1253 @@
+/*
+ * net/dccp/proto.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/checksum.h>
+
+#include <net/inet_sock.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+#include <asm/ioctls.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/poll.h>
+
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+
+EXPORT_SYMBOL_GPL(dccp_statistics);
+
+struct percpu_counter dccp_orphan_count;
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
+struct inet_hashinfo dccp_hashinfo;
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+/* the maximum queue length for tx in packets. 0 is no limit */
+int sysctl_dccp_tx_qlen __read_mostly = 5;
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+ static const char *const dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
+ [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES)
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state];
+}
+#endif
+
+void dccp_set_state(struct sock *sk, const int state)
+{
+ const int oldstate = sk->sk_state;
+
+ dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk,
+ dccp_state_name(oldstate), dccp_state_name(state));
+ WARN_ON(state == oldstate);
+
+ switch (state) {
+ case DCCP_OPEN:
+ if (oldstate != DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+ /* Client retransmits all Confirm options until entering OPEN */
+ if (oldstate == DCCP_PARTOPEN)
+ dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
+ break;
+
+ case DCCP_CLOSED:
+ if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
+ oldstate == DCCP_CLOSING)
+ DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash != NULL &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(sk);
+ /* fall through */
+ default:
+ if (oldstate == DCCP_OPEN)
+ DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+}
+
+EXPORT_SYMBOL_GPL(dccp_set_state);
+
+static void dccp_finish_passive_close(struct sock *sk)
+{
+ switch (sk->sk_state) {
+ case DCCP_PASSIVE_CLOSE:
+ /* Node (client or server) has received Close packet. */
+ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+ dccp_set_state(sk, DCCP_CLOSED);
+ break;
+ case DCCP_PASSIVE_CLOSEREQ:
+ /*
+ * Client received CloseReq. We set the `active' flag so that
+ * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
+ */
+ dccp_send_close(sk, 1);
+ dccp_set_state(sk, DCCP_CLOSING);
+ }
+}
+
+void dccp_done(struct sock *sk)
+{
+ dccp_set_state(sk, DCCP_CLOSED);
+ dccp_clear_xmit_timers(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ else
+ inet_csk_destroy_sock(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_done);
+
+const char *dccp_packet_name(const int type)
+{
+ static const char *const dccp_packet_names[] = {
+ [DCCP_PKT_REQUEST] = "REQUEST",
+ [DCCP_PKT_RESPONSE] = "RESPONSE",
+ [DCCP_PKT_DATA] = "DATA",
+ [DCCP_PKT_ACK] = "ACK",
+ [DCCP_PKT_DATAACK] = "DATAACK",
+ [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PKT_CLOSE] = "CLOSE",
+ [DCCP_PKT_RESET] = "RESET",
+ [DCCP_PKT_SYNC] = "SYNC",
+ [DCCP_PKT_SYNCACK] = "SYNCACK",
+ };
+
+ if (type >= DCCP_NR_PKT_TYPES)
+ return "INVALID";
+ else
+ return dccp_packet_names[type];
+}
+
+EXPORT_SYMBOL_GPL(dccp_packet_name);
+
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_rto = DCCP_TIMEOUT_INIT;
+ icsk->icsk_syn_retries = sysctl_dccp_request_retries;
+ sk->sk_state = DCCP_CLOSED;
+ sk->sk_write_space = dccp_write_space;
+ icsk->icsk_sync_mss = dccp_sync_mss;
+ dp->dccps_mss_cache = 536;
+ dp->dccps_rate_last = jiffies;
+ dp->dccps_role = DCCP_ROLE_UNDEFINED;
+ dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
+ dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
+
+ dccp_init_xmit_timers(sk);
+
+ INIT_LIST_HEAD(&dp->dccps_featneg);
+ /* control socket doesn't need feat nego */
+ if (likely(ctl_sock_initialized))
+ return dccp_feat_init(sk);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_init_sock);
+
+void dccp_destroy_sock(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ /*
+ * DCCP doesn't use sk_write_queue, just sk_send_head
+ * for retransmissions
+ */
+ if (sk->sk_send_head != NULL) {
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ /* Clean up a referenced DCCP bind bucket. */
+ if (inet_csk(sk)->icsk_bind_hash != NULL)
+ inet_put_port(sk);
+
+ kfree(dp->dccps_service_list);
+ dp->dccps_service_list = NULL;
+
+ if (dp->dccps_hc_rx_ackvec != NULL) {
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ }
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+
+ /* clean up feature negotiation state */
+ dccp_feat_list_purge(&dp->dccps_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_destroy_sock);
+
+static inline int dccp_listen_start(struct sock *sk, int backlog)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_role = DCCP_ROLE_LISTEN;
+ /* do not start to listen if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dp))
+ return -EPROTO;
+ return inet_csk_listen_start(sk, backlog);
+}
+
+static inline int dccp_need_reset(int state)
+{
+ return state != DCCP_CLOSED && state != DCCP_LISTEN &&
+ state != DCCP_REQUESTING;
+}
+
+int dccp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = 0;
+ const int old_state = sk->sk_state;
+
+ if (old_state != DCCP_CLOSED)
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /*
+ * This corresponds to the ABORT function of RFC793, sec. 3.8
+ * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
+ */
+ if (old_state == DCCP_LISTEN) {
+ inet_csk_listen_stop(sk);
+ } else if (dccp_need_reset(old_state)) {
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == DCCP_REQUESTING)
+ sk->sk_err = ECONNRESET;
+
+ dccp_clear_xmit_timers(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_write_queue);
+ if (sk->sk_send_head != NULL) {
+ __kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ inet->inet_dport = 0;
+
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ icsk->icsk_backoff = 0;
+ inet_csk_delack_init(sk);
+ __sk_dst_reset(sk);
+
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
+/*
+ * Wait for a DCCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == DCCP_LISTEN)
+ return inet_csk_listen_poll(sk);
+
+ /* Socket is not locked. We are protected from async events
+ by poll logic and correct handling of state changes
+ made by another threads is impossible in any case.
+ */
+
+ mask = 0;
+ if (sk->sk_err)
+ mask = POLLERR;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+ /* Connected? */
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+ if (atomic_read(&sk->sk_rmem_alloc) > 0)
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_is_writeable(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost.
+ */
+ if (sk_stream_is_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ }
+ return mask;
+}
+
+EXPORT_SYMBOL_GPL(dccp_poll);
+
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ int rc = -ENOTCONN;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == DCCP_LISTEN)
+ goto out;
+
+ switch (cmd) {
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ unsigned long amount = 0;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount of this packet since
+ * that is all that will be read.
+ */
+ amount = skb->len;
+ }
+ rc = put_user(amount, (int __user *)arg);
+ }
+ break;
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
+static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
+ char __user *optval, unsigned int optlen)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_service_list *sl = NULL;
+
+ if (service == DCCP_SERVICE_INVALID_VALUE ||
+ optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
+ return -EINVAL;
+
+ if (optlen > sizeof(service)) {
+ sl = kmalloc(optlen, GFP_KERNEL);
+ if (sl == NULL)
+ return -ENOMEM;
+
+ sl->dccpsl_nr = optlen / sizeof(u32) - 1;
+ if (copy_from_user(sl->dccpsl_list,
+ optval + sizeof(service),
+ optlen - sizeof(service)) ||
+ dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
+ kfree(sl);
+ return -EFAULT;
+ }
+ }
+
+ lock_sock(sk);
+ dp->dccps_service = service;
+
+ kfree(dp->dccps_service_list);
+
+ dp->dccps_service_list = sl;
+ release_sock(sk);
+ return 0;
+}
+
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+{
+ u8 *list, len;
+ int i, rc;
+
+ if (cscov < 0 || cscov > 15)
+ return -EINVAL;
+ /*
+ * Populate a list of permissible values, in the range cscov...15. This
+ * is necessary since feature negotiation of single values only works if
+ * both sides incidentally choose the same value. Since the list starts
+ * lowest-value first, negotiation will pick the smallest shared value.
+ */
+ if (cscov == 0)
+ return 0;
+ len = 16 - cscov;
+
+ list = kmalloc(len, GFP_KERNEL);
+ if (list == NULL)
+ return -ENOBUFS;
+
+ for (i = 0; i < len; i++)
+ list[i] = cscov++;
+
+ rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+ if (rc == 0) {
+ if (rx)
+ dccp_sk(sk)->dccps_pcrlen = cscov;
+ else
+ dccp_sk(sk)->dccps_pcslen = cscov;
+ }
+ kfree(list);
+ return rc;
+}
+
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+ char __user *optval, unsigned int optlen)
+{
+ u8 *val;
+ int rc = 0;
+
+ if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+ return -EINVAL;
+
+ val = memdup_user(optval, optlen);
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+
+ lock_sock(sk);
+ if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+
+ if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
+ release_sock(sk);
+
+ kfree(val);
+ return rc;
+}
+
+static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ int val, err = 0;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CHANGE_L:
+ case DCCP_SOCKOPT_CHANGE_R:
+ DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CCID:
+ case DCCP_SOCKOPT_RX_CCID:
+ case DCCP_SOCKOPT_TX_CCID:
+ return dccp_setsockopt_ccid(sk, optname, optval, optlen);
+ }
+
+ if (optlen < (int)sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ if (optname == DCCP_SOCKOPT_SERVICE)
+ return dccp_setsockopt_service(sk, val, optval, optlen);
+
+ lock_sock(sk);
+ switch (optname) {
+ case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+ if (dp->dccps_role != DCCP_ROLE_SERVER)
+ err = -EOPNOTSUPP;
+ else
+ dp->dccps_server_timewait = (val != 0);
+ break;
+ case DCCP_SOCKOPT_SEND_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, false);
+ break;
+ case DCCP_SOCKOPT_RECV_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, true);
+ break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ if (sk->sk_state != DCCP_CLOSED)
+ err = -EISCONN;
+ else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+ err = -EINVAL;
+ else
+ dp->dccps_qpolicy = val;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ if (val < 0)
+ err = -EINVAL;
+ else
+ dp->dccps_tx_qlen = val;
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+
+ return err;
+}
+
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_DCCP)
+ return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+ optname, optval,
+ optlen);
+ return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_DCCP)
+ return inet_csk_compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
+#endif
+
+static int dccp_getsockopt_service(struct sock *sk, int len,
+ __be32 __user *optval,
+ int __user *optlen)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const struct dccp_service_list *sl;
+ int err = -ENOENT, slen = 0, total_len = sizeof(u32);
+
+ lock_sock(sk);
+ if ((sl = dp->dccps_service_list) != NULL) {
+ slen = sl->dccpsl_nr * sizeof(u32);
+ total_len += slen;
+ }
+
+ err = -EINVAL;
+ if (total_len > len)
+ goto out;
+
+ err = 0;
+ if (put_user(total_len, optlen) ||
+ put_user(dp->dccps_service, optval) ||
+ (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
+ err = -EFAULT;
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct dccp_sock *dp;
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < (int)sizeof(int))
+ return -EINVAL;
+
+ dp = dccp_sk(sk);
+
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_SERVICE:
+ return dccp_getsockopt_service(sk, len,
+ (__be32 __user *)optval, optlen);
+ case DCCP_SOCKOPT_GET_CUR_MPS:
+ val = dp->dccps_mss_cache;
+ break;
+ case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+ return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+ case DCCP_SOCKOPT_TX_CCID:
+ val = ccid_get_current_tx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
+ case DCCP_SOCKOPT_RX_CCID:
+ val = ccid_get_current_rx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
+ case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+ val = dp->dccps_server_timewait;
+ break;
+ case DCCP_SOCKOPT_SEND_CSCOV:
+ val = dp->dccps_pcslen;
+ break;
+ case DCCP_SOCKOPT_RECV_CSCOV:
+ val = dp->dccps_pcrlen;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ val = dp->dccps_qpolicy;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ val = dp->dccps_tx_qlen;
+ break;
+ case 128 ... 191:
+ return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
+ len, (u32 __user *)optval, optlen);
+ case 192 ... 255:
+ return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
+ len, (u32 __user *)optval, optlen);
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ len = sizeof(val);
+ if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_DCCP)
+ return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+ optname, optval,
+ optlen);
+ return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_DCCP)
+ return inet_csk_compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
+#endif
+
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct cmsghdr *cmsg;
+
+ /*
+ * Assign an (opaque) qpolicy priority value to skb->priority.
+ *
+ * We are overloading this skb field for use with the qpolicy subystem.
+ * The skb->priority is normally used for the SO_PRIORITY option, which
+ * is initialised from sk_priority. Since the assignment of sk_priority
+ * to skb->priority happens later (on layer 3), we overload this field
+ * for use with queueing priorities as long as the skb is on layer 4.
+ * The default priority value (if nothing is set) is 0.
+ */
+ skb->priority = 0;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_DCCP)
+ continue;
+
+ if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+ !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+ return -EINVAL;
+
+ switch (cmsg->cmsg_type) {
+ case DCCP_SCM_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+ return -EINVAL;
+ skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const int flags = msg->msg_flags;
+ const int noblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb;
+ int rc, size;
+ long timeo;
+
+ if (len > dp->dccps_mss_cache)
+ return -EMSGSIZE;
+
+ lock_sock(sk);
+
+ if (dccp_qpolicy_full(sk)) {
+ rc = -EAGAIN;
+ goto out_release;
+ }
+
+ timeo = sock_sndtimeo(sk, noblock);
+
+ /*
+ * We have to use sk_stream_wait_connect here to set sk_write_pending,
+ * so that the trick in dccp_rcv_request_sent_state_process.
+ */
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+ if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_release;
+
+ size = sk->sk_prot->max_header + len;
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ lock_sock(sk);
+ if (skb == NULL)
+ goto out_release;
+
+ skb_reserve(skb, sk->sk_prot->max_header);
+ rc = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (rc != 0)
+ goto out_discard;
+
+ rc = dccp_msghdr_parse(msg, skb);
+ if (rc != 0)
+ goto out_discard;
+
+ dccp_qpolicy_push(sk, skb);
+ /*
+ * The xmit_timer is set if the TX CCID is rate-based and will expire
+ * when congestion control permits to release further packets into the
+ * network. Window-based CCIDs do not use this timer.
+ */
+ if (!timer_pending(&dp->dccps_xmit_timer))
+ dccp_write_xmit(sk);
+out_release:
+ release_sock(sk);
+ return rc ? : len;
+out_discard:
+ kfree_skb(skb);
+ goto out_release;
+}
+
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
+int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
+{
+ const struct dccp_hdr *dh;
+ long timeo;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ len = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ do {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ if (skb == NULL)
+ goto verify_sock_status;
+
+ dh = dccp_hdr(skb);
+
+ switch (dh->dccph_type) {
+ case DCCP_PKT_DATA:
+ case DCCP_PKT_DATAACK:
+ goto found_ok_skb;
+
+ case DCCP_PKT_CLOSE:
+ case DCCP_PKT_CLOSEREQ:
+ if (!(flags & MSG_PEEK))
+ dccp_finish_passive_close(sk);
+ /* fall through */
+ case DCCP_PKT_RESET:
+ dccp_pr_debug("found fin (%s) ok!\n",
+ dccp_packet_name(dh->dccph_type));
+ len = 0;
+ goto found_fin_ok;
+ default:
+ dccp_pr_debug("packet_type=%s\n",
+ dccp_packet_name(dh->dccph_type));
+ sk_eat_skb(sk, skb);
+ }
+verify_sock_status:
+ if (sock_flag(sk, SOCK_DONE)) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_err) {
+ len = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ len = -ENOTCONN;
+ break;
+ }
+ len = 0;
+ break;
+ }
+
+ if (!timeo) {
+ len = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ len = sock_intr_errno(timeo);
+ break;
+ }
+
+ sk_wait_data(sk, &timeo);
+ continue;
+ found_ok_skb:
+ if (len > skb->len)
+ len = skb->len;
+ else if (len < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+
+ if (skb_copy_datagram_msg(skb, 0, msg, len)) {
+ /* Exception. Bailout! */
+ len = -EFAULT;
+ break;
+ }
+ if (flags & MSG_TRUNC)
+ len = skb->len;
+ found_fin_ok:
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (1);
+out:
+ release_sock(sk);
+ return len;
+}
+
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+int inet_dccp_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != DCCP_LISTEN) {
+ /*
+ * FIXME: here it probably should be sk->sk_prot->listen_start
+ * see tcp_listen_start
+ */
+ err = dccp_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
+
+static void dccp_terminate_connection(struct sock *sk)
+{
+ u8 next_state = DCCP_CLOSED;
+
+ switch (sk->sk_state) {
+ case DCCP_PASSIVE_CLOSE:
+ case DCCP_PASSIVE_CLOSEREQ:
+ dccp_finish_passive_close(sk);
+ break;
+ case DCCP_PARTOPEN:
+ dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ /* fall through */
+ case DCCP_OPEN:
+ dccp_send_close(sk, 1);
+
+ if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
+ !dccp_sk(sk)->dccps_server_timewait)
+ next_state = DCCP_ACTIVE_CLOSEREQ;
+ else
+ next_state = DCCP_CLOSING;
+ /* fall through */
+ default:
+ dccp_set_state(sk, next_state);
+ }
+}
+
+void dccp_close(struct sock *sk, long timeout)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+ u32 data_was_unread = 0;
+ int state;
+
+ lock_sock(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /* Special case. */
+ inet_csk_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ sk_stop_timer(sk, &dp->dccps_xmit_timer);
+
+ /*
+ * We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ *reader process may not have drained the data yet!
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ data_was_unread += skb->len;
+ __kfree_skb(skb);
+ }
+
+ if (data_was_unread) {
+ /* Unread data was tossed, send an appropriate Reset Code */
+ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ dccp_set_state(sk, DCCP_CLOSED);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (sk->sk_state != DCCP_CLOSED) {
+ /*
+ * Normal connection termination. May need to wait if there are
+ * still packets in the TX queue that are delayed by the CCID.
+ */
+ dccp_flush_write_queue(sk, &timeout);
+ dccp_terminate_connection(sk);
+ }
+
+ /*
+ * Flush write queue. This may be necessary in several cases:
+ * - we have been closed by the peer but still have application data;
+ * - abortive termination (unread data or zero linger time),
+ * - normal termination but queue could not be flushed within time limit
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ state = sk->sk_state;
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /*
+ * It is the last release_sock in its life. It will remove backlog.
+ */
+ release_sock(sk);
+ /*
+ * Now socket is owned by kernel and we acquire BH lock
+ * to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ WARN_ON(sock_owned_by_user(sk));
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ /* Have we already been destroyed by a softirq or backlog? */
+ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ if (sk->sk_state == DCCP_CLOSED)
+ inet_csk_destroy_sock(sk);
+
+ /* Otherwise, socket is reprieved until protocol close. */
+
+out:
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_close);
+
+void dccp_shutdown(struct sock *sk, int how)
+{
+ dccp_pr_debug("called shutdown(%x)\n", how);
+}
+
+EXPORT_SYMBOL_GPL(dccp_shutdown);
+
+static inline int __init dccp_mib_init(void)
+{
+ dccp_statistics = alloc_percpu(struct dccp_mib);
+ if (!dccp_statistics)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void dccp_mib_exit(void)
+{
+ free_percpu(dccp_statistics);
+}
+
+static int thash_entries;
+module_param(thash_entries, int, 0444);
+MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+bool dccp_debug;
+module_param(dccp_debug, bool, 0644);
+MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+
+EXPORT_SYMBOL_GPL(dccp_debug);
+#endif
+
+static int __init dccp_init(void)
+{
+ unsigned long goal;
+ int ehash_order, bhash_order, i;
+ int rc;
+
+ BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
+ rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
+ if (rc)
+ goto out_fail;
+ rc = -ENOBUFS;
+ inet_hashinfo_init(&dccp_hashinfo);
+ dccp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("dccp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!dccp_hashinfo.bind_bucket_cachep)
+ goto out_free_percpu;
+
+ /*
+ * Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ if (totalram_pages >= (128 * 1024))
+ goal = totalram_pages >> (21 - PAGE_SHIFT);
+ else
+ goal = totalram_pages >> (23 - PAGE_SHIFT);
+
+ if (thash_entries)
+ goal = (thash_entries *
+ sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
+ for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
+ ;
+ do {
+ unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
+ sizeof(struct inet_ehash_bucket);
+
+ while (hash_size & (hash_size - 1))
+ hash_size--;
+ dccp_hashinfo.ehash_mask = hash_size - 1;
+ dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
+ } while (!dccp_hashinfo.ehash && --ehash_order > 0);
+
+ if (!dccp_hashinfo.ehash) {
+ DCCP_CRIT("Failed to allocate DCCP established hash table");
+ goto out_free_bind_bucket_cachep;
+ }
+
+ for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+
+ if (inet_ehash_locks_alloc(&dccp_hashinfo))
+ goto out_free_dccp_ehash;
+
+ bhash_order = ehash_order;
+
+ do {
+ dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
+ sizeof(struct inet_bind_hashbucket);
+ if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
+ bhash_order > 0)
+ continue;
+ dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
+ } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
+
+ if (!dccp_hashinfo.bhash) {
+ DCCP_CRIT("Failed to allocate DCCP bind hash table");
+ goto out_free_dccp_locks;
+ }
+
+ for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&dccp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+ }
+
+ rc = dccp_mib_init();
+ if (rc)
+ goto out_free_dccp_bhash;
+
+ rc = dccp_ackvec_init();
+ if (rc)
+ goto out_free_dccp_mib;
+
+ rc = dccp_sysctl_init();
+ if (rc)
+ goto out_ackvec_exit;
+
+ rc = ccid_initialize_builtins();
+ if (rc)
+ goto out_sysctl_exit;
+
+ dccp_timestamping_init();
+
+ return 0;
+
+out_sysctl_exit:
+ dccp_sysctl_exit();
+out_ackvec_exit:
+ dccp_ackvec_exit();
+out_free_dccp_mib:
+ dccp_mib_exit();
+out_free_dccp_bhash:
+ free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+out_free_dccp_locks:
+ inet_ehash_locks_free(&dccp_hashinfo);
+out_free_dccp_ehash:
+ free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+out_free_bind_bucket_cachep:
+ kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+out_free_percpu:
+ percpu_counter_destroy(&dccp_orphan_count);
+out_fail:
+ dccp_hashinfo.bhash = NULL;
+ dccp_hashinfo.ehash = NULL;
+ dccp_hashinfo.bind_bucket_cachep = NULL;
+ return rc;
+}
+
+static void __exit dccp_fini(void)
+{
+ ccid_cleanup_builtins();
+ dccp_mib_exit();
+ free_pages((unsigned long)dccp_hashinfo.bhash,
+ get_order(dccp_hashinfo.bhash_size *
+ sizeof(struct inet_bind_hashbucket)));
+ free_pages((unsigned long)dccp_hashinfo.ehash,
+ get_order((dccp_hashinfo.ehash_mask + 1) *
+ sizeof(struct inet_ehash_bucket)));
+ inet_ehash_locks_free(&dccp_hashinfo);
+ kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+ dccp_ackvec_exit();
+ dccp_sysctl_exit();
+ percpu_counter_destroy(&dccp_orphan_count);
+}
+
+module_init(dccp_init);
+module_exit(dccp_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 000000000..63c30bfa4
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/qpolicy.c
+ *
+ * Policy-based packet dequeueing interface for DCCP.
+ *
+ * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ * Simple Dequeueing Policy:
+ * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+ skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_tx_qlen &&
+ sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+ return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ * Priority-based Dequeueing Policy:
+ * If tx_qlen is different from 0 and the queue has reached its upper bound
+ * of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *best = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (best == NULL || skb->priority > best->priority)
+ best = skb;
+ return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *worst = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (worst == NULL || skb->priority < worst->priority)
+ worst = skb;
+ return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+ if (qpolicy_simple_full(sk))
+ dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+ return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top: peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+ void (*push) (struct sock *sk, struct sk_buff *skb);
+ bool (*full) (struct sock *sk);
+ struct sk_buff* (*top) (struct sock *sk);
+ __be32 params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+ [DCCPQ_POLICY_SIMPLE] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_simple_full,
+ .top = qpolicy_simple_top,
+ .params = 0,
+ },
+ [DCCPQ_POLICY_PRIO] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_prio_full,
+ .top = qpolicy_prio_best_skb,
+ .params = DCCP_SCM_PRIORITY,
+ },
+};
+
+/*
+ * Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+ qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ skb_unlink(skb, &sk->sk_write_queue);
+ kfree_skb(skb);
+ }
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+ struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+ if (skb != NULL) {
+ /* Clear any skb fields that we used internally */
+ skb->priority = 0;
+ skb_unlink(skb, &sk->sk_write_queue);
+ }
+ return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+ /* check if exactly one bit is set */
+ if (!param || (param & (param - 1)))
+ return false;
+ return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
new file mode 100644
index 000000000..53731e454
--- /dev/null
+++ b/net/dccp/sysctl.c
@@ -0,0 +1,118 @@
+/*
+ * net/dccp/sysctl.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include "dccp.h"
+#include "feat.h"
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+/* Boundary values */
+static int zero = 0,
+ one = 1,
+ u8_max = 0xFF;
+static unsigned long seqw_min = DCCPF_SEQ_WMIN,
+ seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
+
+static struct ctl_table dccp_default_table[] = {
+ {
+ .procname = "seq_window",
+ .data = &sysctl_dccp_sequence_window,
+ .maxlen = sizeof(sysctl_dccp_sequence_window),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
+ .extra2 = &seqw_max,
+ },
+ {
+ .procname = "rx_ccid",
+ .data = &sysctl_dccp_rx_ccid,
+ .maxlen = sizeof(sysctl_dccp_rx_ccid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
+ },
+ {
+ .procname = "tx_ccid",
+ .data = &sysctl_dccp_tx_ccid,
+ .maxlen = sizeof(sysctl_dccp_tx_ccid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
+ },
+ {
+ .procname = "request_retries",
+ .data = &sysctl_dccp_request_retries,
+ .maxlen = sizeof(sysctl_dccp_request_retries),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "retries1",
+ .data = &sysctl_dccp_retries1,
+ .maxlen = sizeof(sysctl_dccp_retries1),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "retries2",
+ .data = &sysctl_dccp_retries2,
+ .maxlen = sizeof(sysctl_dccp_retries2),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "tx_qlen",
+ .data = &sysctl_dccp_tx_qlen,
+ .maxlen = sizeof(sysctl_dccp_tx_qlen),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sync_ratelimit",
+ .data = &sysctl_dccp_sync_ratelimit,
+ .maxlen = sizeof(sysctl_dccp_sync_ratelimit),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+
+ { }
+};
+
+static struct ctl_table_header *dccp_table_header;
+
+int __init dccp_sysctl_init(void)
+{
+ dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
+ dccp_default_table);
+
+ return dccp_table_header != NULL ? 0 : -ENOMEM;
+}
+
+void dccp_sysctl_exit(void)
+{
+ if (dccp_table_header != NULL) {
+ unregister_net_sysctl_table(dccp_table_header);
+ dccp_table_header = NULL;
+ }
+}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 000000000..3ef7acef3
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,271 @@
+/*
+ * net/dccp/timer.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+
+#include "dccp.h"
+
+/* sysctl variables governing numbers of retransmission attempts */
+int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
+int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
+int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
+
+static void dccp_write_err(struct sock *sk)
+{
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ dccp_done(sk);
+ DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int dccp_write_timeout(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int retry_until;
+
+ if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
+ if (icsk->icsk_retransmits != 0)
+ dst_negative_advice(sk);
+ retry_until = icsk->icsk_syn_retries ?
+ : sysctl_dccp_request_retries;
+ } else {
+ if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
+ black hole detection. :-(
+
+ It is place to make it. It is not made. I do not want
+ to make it. It is disguisting. It does not work in any
+ case. Let me to cite the same draft, which requires for
+ us to implement this:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
+ dst_negative_advice(sk);
+ }
+
+ retry_until = sysctl_dccp_retries2;
+ /*
+ * FIXME: see tcp_write_timout and tcp_out_of_resources
+ */
+ }
+
+ if (icsk->icsk_retransmits >= retry_until) {
+ /* Has it gone just too far? */
+ dccp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * The DCCP retransmit timer.
+ */
+static void dccp_retransmit_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /*
+ * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
+ * sent, no need to retransmit, this sock is dead.
+ */
+ if (dccp_write_timeout(sk))
+ return;
+
+ /*
+ * We want to know the number of packets retransmitted, not the
+ * total number of retransmissions of clones of original packets.
+ */
+ if (icsk->icsk_retransmits == 0)
+ DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+
+ if (dccp_retransmit_skb(sk) != 0) {
+ /*
+ * Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (--icsk->icsk_retransmits == 0)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ DCCP_RTO_MAX);
+ return;
+ }
+
+ icsk->icsk_backoff++;
+
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
+ DCCP_RTO_MAX);
+ if (icsk->icsk_retransmits > sysctl_dccp_retries1)
+ __sk_dst_reset(sk);
+}
+
+static void dccp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int event = 0;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later */
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ jiffies + (HZ / 20));
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
+ goto out;
+
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ icsk->icsk_timeout);
+ goto out;
+ }
+
+ event = icsk->icsk_pending;
+ icsk->icsk_pending = 0;
+
+ switch (event) {
+ case ICSK_TIME_RETRANS:
+ dccp_retransmit_timer(sk);
+ break;
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static void dccp_keepalive_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ pr_err("dccp should not use a keepalive timer !\n");
+ sock_put(sk);
+}
+
+/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
+static void dccp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ icsk->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ jiffies + TCP_DELACK_MIN);
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk->icsk_ack.timeout);
+ goto out;
+ }
+
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+ icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ dccp_send_ack(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/**
+ * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
+ else
+ dccp_write_xmit(sk);
+ bh_unlock_sock(sk);
+}
+
+static void dccp_write_xmit_timer(unsigned long data)
+{
+ dccp_write_xmitlet(data);
+ sock_put((struct sock *)data);
+}
+
+void dccp_init_xmit_timers(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+ setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+ (unsigned long)sk);
+ inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+ &dccp_keepalive_timer);
+}
+
+static ktime_t dccp_timestamp_seed;
+/**
+ * dccp_timestamp - 10s of microseconds time source
+ * Returns the number of 10s of microseconds since loading DCCP. This is native
+ * DCCP time difference format (RFC 4340, sec. 13).
+ * Please note: This will wrap around about circa every 11.9 hours.
+ */
+u32 dccp_timestamp(void)
+{
+ u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
+
+ do_div(delta, 10);
+ return delta;
+}
+EXPORT_SYMBOL_GPL(dccp_timestamp);
+
+void __init dccp_timestamping_init(void)
+{
+ dccp_timestamp_seed = ktime_get_real();
+}
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
new file mode 100644
index 000000000..f3393e154
--- /dev/null
+++ b/net/decnet/Kconfig
@@ -0,0 +1,43 @@
+#
+# DECnet configuration
+#
+config DECNET
+ tristate "DECnet Support"
+ ---help---
+ The DECnet networking protocol was used in many products made by
+ Digital (now Compaq). It provides reliable stream and sequenced
+ packet communications over which run a variety of services similar
+ to those which run over TCP/IP.
+
+ To find some tools to use with the kernel layer support, please
+ look at Patrick Caulfield's web site:
+ <http://linux-decnet.sourceforge.net/>.
+
+ More detailed documentation is available in
+ <file:Documentation/networking/decnet.txt>.
+
+ Be sure to say Y to "/proc file system support" and "Sysctl support"
+ below when using DECnet, since you will need sysctl support to aid
+ in configuration at run time.
+
+ The DECnet code is also available as a module ( = code which can be
+ inserted in and removed from the running kernel whenever you want).
+ The module is called decnet.
+
+config DECNET_ROUTER
+ bool "DECnet: router support"
+ depends on DECNET
+ select FIB_RULES
+ ---help---
+ Add support for turning your DECnet Endnode into a level 1 or 2
+ router. This is an experimental, but functional option. If you
+ do say Y here, then make sure that you also say Y to "Kernel/User
+ network link driver", "Routing messages" and "Network packet
+ filtering". The first two are required to allow configuration via
+ rtnetlink (you will need Alexey Kuznetsov's iproute2 package
+ from <ftp://ftp.tux.org/pub/net/ip-routing/>). The "Network packet
+ filtering" option will be required for the forthcoming routing daemon
+ to work.
+
+ See <file:Documentation/networking/decnet.txt> for more information.
+
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
new file mode 100644
index 000000000..e44003af7
--- /dev/null
+++ b/net/decnet/Makefile
@@ -0,0 +1,10 @@
+
+obj-$(CONFIG_DECNET) += decnet.o
+
+decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \
+ dn_route.o dn_dev.o dn_neigh.o dn_timer.o
+decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
+decnet-y += sysctl_net_decnet.o
+
+obj-$(CONFIG_NETFILTER) += netfilter/
+
diff --git a/net/decnet/README b/net/decnet/README
new file mode 100644
index 000000000..60e7ec88c
--- /dev/null
+++ b/net/decnet/README
@@ -0,0 +1,8 @@
+ Linux DECnet Project
+ ======================
+
+The documentation for this kernel subsystem is available in the
+Documentation/networking subdirectory of this distribution and also
+on line at http://www.chygwyn.com/DECnet/
+
+Steve Whitehouse <SteveW@ACM.org>
diff --git a/net/decnet/TODO b/net/decnet/TODO
new file mode 100644
index 000000000..ebb5ac69d
--- /dev/null
+++ b/net/decnet/TODO
@@ -0,0 +1,41 @@
+Steve's quick list of things that need finishing off:
+[they are in no particular order and range from the trivial to the long winded]
+
+ o Proper timeouts on each neighbour (in routing mode) rather than
+ just the 60 second On-Ethernet cache value.
+
+ o Support for X.25 linklayer
+
+ o Support for DDCMP link layer
+
+ o The DDCMP device itself
+
+ o PPP support (rfc1762)
+
+ o Lots of testing with real applications
+
+ o Verify errors etc. against POSIX 1003.1g (draft)
+
+ o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
+ [maybe this should be done at socket level... the control data in the
+ send/recvmsg() calls should simply be a vector of set/getsockopt()
+ calls]
+
+ o check MSG_CTRUNC is set where it should be.
+
+ o Find all the commonality between DECnet and IPv4 routing code and extract
+ it into a small library of routines. [probably a project for 2.7.xx]
+
+ o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
+ we have a half-way house scheme which seems to work reasonably well, but
+ the full scheme is still worth implementing, its not not top of my list
+ right now.
+
+ o Add session control message flow control
+
+ o Add NSP message flow control
+
+ o DECnet sendpages() function
+
+ o AIO for DECnet
+
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
new file mode 100644
index 000000000..754484b3c
--- /dev/null
+++ b/net/decnet/af_decnet.c
@@ -0,0 +1,2416 @@
+
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Socket Layer Interface
+ *
+ * Authors: Eduardo Marcelo Serrat <emserrat@geocities.com>
+ * Patrick Caulfield <patrick@pandh.demon.co.uk>
+ *
+ * Changes:
+ * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's
+ * version of the code. Original copyright preserved
+ * below.
+ * Steve Whitehouse: Some bug fixes, cleaning up some code to make it
+ * compatible with my routing layer.
+ * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick
+ * Caulfield.
+ * Steve Whitehouse: Further bug fixes, checking module code still works
+ * with new routing layer.
+ * Steve Whitehouse: Additional set/get_sockopt() calls.
+ * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new
+ * code.
+ * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like
+ * way. Didn't manage it entirely, but its better.
+ * Steve Whitehouse: ditto for sendmsg().
+ * Steve Whitehouse: A selection of bug fixes to various things.
+ * Steve Whitehouse: Added TIOCOUTQ ioctl.
+ * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username.
+ * Steve Whitehouse: Fixes to connect() error returns.
+ * Patrick Caulfield: Fixes to delayed acceptance logic.
+ * David S. Miller: New socket locking
+ * Steve Whitehouse: Socket list hashing/locking
+ * Arnaldo C. Melo: use capable, not suser
+ * Steve Whitehouse: Removed unused code. Fix to use sk->allocation
+ * when required.
+ * Patrick Caulfield: /proc/net/decnet now has object name/number
+ * Steve Whitehouse: Fixed local port allocation, hashed sk list
+ * Matthew Wilcox: Fixes for dn_ioctl()
+ * Steve Whitehouse: New connect/accept logic to allow timeouts and
+ * prepare for sendpage etc.
+ */
+
+
+/******************************************************************************
+ (c) 1995-1998 E.M. Serrat emserrat@geocities.com
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+HISTORY:
+
+Version Kernel Date Author/Comments
+------- ------ ---- ---------------
+Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat
+ (emserrat@geocities.com)
+
+ First Development of DECnet Socket La-
+ yer for Linux. Only supports outgoing
+ connections.
+
+Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield
+ (patrick@pandh.demon.co.uk)
+
+ Port to new kernel development version.
+
+Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat
+ (emserrat@geocities.com)
+ _
+ Added support for incoming connections
+ so we can start developing server apps
+ on Linux.
+ -
+ Module Support
+Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat
+ (emserrat@geocities.com)
+ _
+ Added support for X11R6.4. Now we can
+ use DECnet transport for X on Linux!!!
+ -
+Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat
+ (emserrat@geocities.com)
+ Removed bugs on flow control
+ Removed bugs on incoming accessdata
+ order
+ -
+Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
+ dn_recvmsg fixes
+
+ Patrick J. Caulfield
+ dn_bind fixes
+*******************************************************************************/
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/flow.h>
+#include <asm/ioctls.h>
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/jiffies.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+
+struct dn_sock {
+ struct sock sk;
+ struct dn_scp scp;
+};
+
+static void dn_keepalive(struct sock *sk);
+
+#define DN_SK_HASH_SHIFT 8
+#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT)
+#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
+
+
+static const struct proto_ops dn_proto_ops;
+static DEFINE_RWLOCK(dn_hash_lock);
+static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
+static struct hlist_head dn_wild_sk;
+static atomic_long_t decnet_memory_allocated;
+
+static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
+static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
+
+static struct hlist_head *dn_find_list(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->addr.sdn_flags & SDF_WILD)
+ return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL;
+
+ return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK];
+}
+
+/*
+ * Valid ports are those greater than zero and not already in use.
+ */
+static int check_port(__le16 port)
+{
+ struct sock *sk;
+
+ if (port == 0)
+ return -1;
+
+ sk_for_each(sk, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) {
+ struct dn_scp *scp = DN_SK(sk);
+ if (scp->addrloc == port)
+ return -1;
+ }
+ return 0;
+}
+
+static unsigned short port_alloc(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+static unsigned short port = 0x2000;
+ unsigned short i_port = port;
+
+ while(check_port(cpu_to_le16(++port)) != 0) {
+ if (port == i_port)
+ return 0;
+ }
+
+ scp->addrloc = cpu_to_le16(port);
+
+ return 1;
+}
+
+/*
+ * Since this is only ever called from user
+ * level, we don't need a write_lock() version
+ * of this.
+ */
+static int dn_hash_sock(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct hlist_head *list;
+ int rv = -EUSERS;
+
+ BUG_ON(sk_hashed(sk));
+
+ write_lock_bh(&dn_hash_lock);
+
+ if (!scp->addrloc && !port_alloc(sk))
+ goto out;
+
+ rv = -EADDRINUSE;
+ if ((list = dn_find_list(sk)) == NULL)
+ goto out;
+
+ sk_add_node(sk, list);
+ rv = 0;
+out:
+ write_unlock_bh(&dn_hash_lock);
+ return rv;
+}
+
+static void dn_unhash_sock(struct sock *sk)
+{
+ write_lock(&dn_hash_lock);
+ sk_del_node_init(sk);
+ write_unlock(&dn_hash_lock);
+}
+
+static void dn_unhash_sock_bh(struct sock *sk)
+{
+ write_lock_bh(&dn_hash_lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&dn_hash_lock);
+}
+
+static struct hlist_head *listen_hash(struct sockaddr_dn *addr)
+{
+ int i;
+ unsigned int hash = addr->sdn_objnum;
+
+ if (hash == 0) {
+ hash = addr->sdn_objnamel;
+ for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) {
+ hash ^= addr->sdn_objname[i];
+ hash ^= (hash << 3);
+ }
+ }
+
+ return &dn_sk_hash[hash & DN_SK_HASH_MASK];
+}
+
+/*
+ * Called to transform a socket from bound (i.e. with a local address)
+ * into a listening socket (doesn't need a local port number) and rehashes
+ * based upon the object name/number.
+ */
+static void dn_rehash_sock(struct sock *sk)
+{
+ struct hlist_head *list;
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->addr.sdn_flags & SDF_WILD)
+ return;
+
+ write_lock_bh(&dn_hash_lock);
+ sk_del_node_init(sk);
+ DN_SK(sk)->addrloc = 0;
+ list = listen_hash(&DN_SK(sk)->addr);
+ sk_add_node(sk, list);
+ write_unlock_bh(&dn_hash_lock);
+}
+
+int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type)
+{
+ int len = 2;
+
+ *buf++ = type;
+
+ switch (type) {
+ case 0:
+ *buf++ = sdn->sdn_objnum;
+ break;
+ case 1:
+ *buf++ = 0;
+ *buf++ = le16_to_cpu(sdn->sdn_objnamel);
+ memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
+ len = 3 + le16_to_cpu(sdn->sdn_objnamel);
+ break;
+ case 2:
+ memset(buf, 0, 5);
+ buf += 5;
+ *buf++ = le16_to_cpu(sdn->sdn_objnamel);
+ memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
+ len = 7 + le16_to_cpu(sdn->sdn_objnamel);
+ break;
+ }
+
+ return len;
+}
+
+/*
+ * On reception of usernames, we handle types 1 and 0 for destination
+ * addresses only. Types 2 and 4 are used for source addresses, but the
+ * UIC, GIC are ignored and they are both treated the same way. Type 3
+ * is never used as I've no idea what its purpose might be or what its
+ * format is.
+ */
+int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt)
+{
+ unsigned char type;
+ int size = len;
+ int namel = 12;
+
+ sdn->sdn_objnum = 0;
+ sdn->sdn_objnamel = cpu_to_le16(0);
+ memset(sdn->sdn_objname, 0, DN_MAXOBJL);
+
+ if (len < 2)
+ return -1;
+
+ len -= 2;
+ *fmt = *data++;
+ type = *data++;
+
+ switch (*fmt) {
+ case 0:
+ sdn->sdn_objnum = type;
+ return 2;
+ case 1:
+ namel = 16;
+ break;
+ case 2:
+ len -= 4;
+ data += 4;
+ break;
+ case 4:
+ len -= 8;
+ data += 8;
+ break;
+ default:
+ return -1;
+ }
+
+ len -= 1;
+
+ if (len < 0)
+ return -1;
+
+ sdn->sdn_objnamel = cpu_to_le16(*data++);
+ len -= le16_to_cpu(sdn->sdn_objnamel);
+
+ if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel))
+ return -1;
+
+ memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel));
+
+ return size - len;
+}
+
+struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr)
+{
+ struct hlist_head *list = listen_hash(addr);
+ struct sock *sk;
+
+ read_lock(&dn_hash_lock);
+ sk_for_each(sk, list) {
+ struct dn_scp *scp = DN_SK(sk);
+ if (sk->sk_state != TCP_LISTEN)
+ continue;
+ if (scp->addr.sdn_objnum) {
+ if (scp->addr.sdn_objnum != addr->sdn_objnum)
+ continue;
+ } else {
+ if (addr->sdn_objnum)
+ continue;
+ if (scp->addr.sdn_objnamel != addr->sdn_objnamel)
+ continue;
+ if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0)
+ continue;
+ }
+ sock_hold(sk);
+ read_unlock(&dn_hash_lock);
+ return sk;
+ }
+
+ sk = sk_head(&dn_wild_sk);
+ if (sk) {
+ if (sk->sk_state == TCP_LISTEN)
+ sock_hold(sk);
+ else
+ sk = NULL;
+ }
+
+ read_unlock(&dn_hash_lock);
+ return sk;
+}
+
+struct sock *dn_find_by_skb(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct sock *sk;
+ struct dn_scp *scp;
+
+ read_lock(&dn_hash_lock);
+ sk_for_each(sk, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) {
+ scp = DN_SK(sk);
+ if (cb->src != dn_saddr2dn(&scp->peer))
+ continue;
+ if (cb->dst_port != scp->addrloc)
+ continue;
+ if (scp->addrrem && (cb->src_port != scp->addrrem))
+ continue;
+ sock_hold(sk);
+ goto found;
+ }
+ sk = NULL;
+found:
+ read_unlock(&dn_hash_lock);
+ return sk;
+}
+
+
+
+static void dn_destruct(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ skb_queue_purge(&scp->data_xmit_queue);
+ skb_queue_purge(&scp->other_xmit_queue);
+ skb_queue_purge(&scp->other_receive_queue);
+
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+}
+
+static int dn_memory_pressure;
+
+static void dn_enter_memory_pressure(struct sock *sk)
+{
+ if (!dn_memory_pressure) {
+ dn_memory_pressure = 1;
+ }
+}
+
+static struct proto dn_proto = {
+ .name = "NSP",
+ .owner = THIS_MODULE,
+ .enter_memory_pressure = dn_enter_memory_pressure,
+ .memory_pressure = &dn_memory_pressure,
+ .memory_allocated = &decnet_memory_allocated,
+ .sysctl_mem = sysctl_decnet_mem,
+ .sysctl_wmem = sysctl_decnet_wmem,
+ .sysctl_rmem = sysctl_decnet_rmem,
+ .max_header = DN_MAX_NSP_DATA_HEADER + 64,
+ .obj_size = sizeof(struct dn_sock),
+};
+
+static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp)
+{
+ struct dn_scp *scp;
+ struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto);
+
+ if (!sk)
+ goto out;
+
+ if (sock)
+ sock->ops = &dn_proto_ops;
+ sock_init_data(sock, sk);
+
+ sk->sk_backlog_rcv = dn_nsp_backlog_rcv;
+ sk->sk_destruct = dn_destruct;
+ sk->sk_no_check_tx = 1;
+ sk->sk_family = PF_DECnet;
+ sk->sk_protocol = 0;
+ sk->sk_allocation = gfp;
+ sk->sk_sndbuf = sysctl_decnet_wmem[1];
+ sk->sk_rcvbuf = sysctl_decnet_rmem[1];
+
+ /* Initialization of DECnet Session Control Port */
+ scp = DN_SK(sk);
+ scp->state = DN_O; /* Open */
+ scp->numdat = 1; /* Next data seg to tx */
+ scp->numoth = 1; /* Next oth data to tx */
+ scp->ackxmt_dat = 0; /* Last data seg ack'ed */
+ scp->ackxmt_oth = 0; /* Last oth data ack'ed */
+ scp->ackrcv_dat = 0; /* Highest data ack recv*/
+ scp->ackrcv_oth = 0; /* Last oth data ack rec*/
+ scp->flowrem_sw = DN_SEND;
+ scp->flowloc_sw = DN_SEND;
+ scp->flowrem_dat = 0;
+ scp->flowrem_oth = 1;
+ scp->flowloc_dat = 0;
+ scp->flowloc_oth = 1;
+ scp->services_rem = 0;
+ scp->services_loc = 1 | NSP_FC_NONE;
+ scp->info_rem = 0;
+ scp->info_loc = 0x03; /* NSP version 4.1 */
+ scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */
+ scp->nonagle = 0;
+ scp->multi_ireq = 1;
+ scp->accept_mode = ACC_IMMED;
+ scp->addr.sdn_family = AF_DECnet;
+ scp->peer.sdn_family = AF_DECnet;
+ scp->accessdata.acc_accl = 5;
+ memcpy(scp->accessdata.acc_acc, "LINUX", 5);
+
+ scp->max_window = NSP_MAX_WINDOW;
+ scp->snd_window = NSP_MIN_WINDOW;
+ scp->nsp_srtt = NSP_INITIAL_SRTT;
+ scp->nsp_rttvar = NSP_INITIAL_RTTVAR;
+ scp->nsp_rxtshift = 0;
+
+ skb_queue_head_init(&scp->data_xmit_queue);
+ skb_queue_head_init(&scp->other_xmit_queue);
+ skb_queue_head_init(&scp->other_receive_queue);
+
+ scp->persist = 0;
+ scp->persist_fxn = NULL;
+ scp->keepalive = 10 * HZ;
+ scp->keepalive_fxn = dn_keepalive;
+
+ init_timer(&scp->delack_timer);
+ scp->delack_pending = 0;
+ scp->delack_fxn = dn_nsp_delayed_ack;
+
+ dn_start_slow_timer(sk);
+out:
+ return sk;
+}
+
+/*
+ * Keepalive timer.
+ * FIXME: Should respond to SO_KEEPALIVE etc.
+ */
+static void dn_keepalive(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ /*
+ * By checking the other_data transmit queue is empty
+ * we are double checking that we are not sending too
+ * many of these keepalive frames.
+ */
+ if (skb_queue_empty(&scp->other_xmit_queue))
+ dn_nsp_send_link(sk, DN_NOCHANGE, 0);
+}
+
+
+/*
+ * Timer for shutdown/destroyed sockets.
+ * When socket is dead & no packets have been sent for a
+ * certain amount of time, they are removed by this
+ * routine. Also takes care of sending out DI & DC
+ * frames at correct times.
+ */
+int dn_destroy_timer(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ scp->persist = dn_nsp_persist(sk);
+
+ switch (scp->state) {
+ case DN_DI:
+ dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
+ if (scp->nsp_rxtshift >= decnet_di_count)
+ scp->state = DN_CN;
+ return 0;
+
+ case DN_DR:
+ dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
+ if (scp->nsp_rxtshift >= decnet_dr_count)
+ scp->state = DN_DRC;
+ return 0;
+
+ case DN_DN:
+ if (scp->nsp_rxtshift < decnet_dn_count) {
+ /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */
+ dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
+ GFP_ATOMIC);
+ return 0;
+ }
+ }
+
+ scp->persist = (HZ * decnet_time_wait);
+
+ if (sk->sk_socket)
+ return 0;
+
+ if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) {
+ dn_unhash_sock(sk);
+ sock_put(sk);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void dn_destroy_sock(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ scp->nsp_rxtshift = 0; /* reset back off */
+
+ if (sk->sk_socket) {
+ if (sk->sk_socket->state != SS_UNCONNECTED)
+ sk->sk_socket->state = SS_DISCONNECTING;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+
+ switch (scp->state) {
+ case DN_DN:
+ dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
+ sk->sk_allocation);
+ scp->persist_fxn = dn_destroy_timer;
+ scp->persist = dn_nsp_persist(sk);
+ break;
+ case DN_CR:
+ scp->state = DN_DR;
+ goto disc_reject;
+ case DN_RUN:
+ scp->state = DN_DI;
+ case DN_DI:
+ case DN_DR:
+disc_reject:
+ dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation);
+ case DN_NC:
+ case DN_NR:
+ case DN_RJ:
+ case DN_DIC:
+ case DN_CN:
+ case DN_DRC:
+ case DN_CI:
+ case DN_CD:
+ scp->persist_fxn = dn_destroy_timer;
+ scp->persist = dn_nsp_persist(sk);
+ break;
+ default:
+ printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n");
+ case DN_O:
+ dn_stop_slow_timer(sk);
+
+ dn_unhash_sock_bh(sk);
+ sock_put(sk);
+
+ break;
+ }
+}
+
+char *dn_addr2asc(__u16 addr, char *buf)
+{
+ unsigned short node, area;
+
+ node = addr & 0x03ff;
+ area = addr >> 10;
+ sprintf(buf, "%hd.%hd", area, node);
+
+ return buf;
+}
+
+
+
+static int dn_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_SEQPACKET:
+ if (protocol != DNPROTO_NSP)
+ return -EPROTONOSUPPORT;
+ break;
+ case SOCK_STREAM:
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+
+ if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL)
+ return -ENOBUFS;
+
+ sk->sk_protocol = protocol;
+
+ return 0;
+}
+
+
+static int
+dn_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock_orphan(sk);
+ sock_hold(sk);
+ lock_sock(sk);
+ dn_destroy_sock(sk);
+ release_sock(sk);
+ sock_put(sk);
+ }
+
+ return 0;
+}
+
+static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr;
+ struct net_device *dev, *ldev;
+ int rv;
+
+ if (addr_len != sizeof(struct sockaddr_dn))
+ return -EINVAL;
+
+ if (saddr->sdn_family != AF_DECnet)
+ return -EINVAL;
+
+ if (le16_to_cpu(saddr->sdn_nodeaddrl) && (le16_to_cpu(saddr->sdn_nodeaddrl) != 2))
+ return -EINVAL;
+
+ if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL)
+ return -EINVAL;
+
+ if (saddr->sdn_flags & ~SDF_WILD)
+ return -EINVAL;
+
+ if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum ||
+ (saddr->sdn_flags & SDF_WILD)))
+ return -EACCES;
+
+ if (!(saddr->sdn_flags & SDF_WILD)) {
+ if (le16_to_cpu(saddr->sdn_nodeaddrl)) {
+ rcu_read_lock();
+ ldev = NULL;
+ for_each_netdev_rcu(&init_net, dev) {
+ if (!dev->dn_ptr)
+ continue;
+ if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) {
+ ldev = dev;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ldev == NULL)
+ return -EADDRNOTAVAIL;
+ }
+ }
+
+ rv = -EINVAL;
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ memcpy(&scp->addr, saddr, addr_len);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ rv = dn_hash_sock(sk);
+ if (rv)
+ sock_set_flag(sk, SOCK_ZAPPED);
+ }
+ release_sock(sk);
+
+ return rv;
+}
+
+
+static int dn_auto_bind(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ int rv;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ scp->addr.sdn_flags = 0;
+ scp->addr.sdn_objnum = 0;
+
+ /*
+ * This stuff is to keep compatibility with Eduardo's
+ * patch. I hope I can dispense with it shortly...
+ */
+ if ((scp->accessdata.acc_accl != 0) &&
+ (scp->accessdata.acc_accl <= 12)) {
+
+ scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl);
+ memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel));
+
+ scp->accessdata.acc_accl = 0;
+ memset(scp->accessdata.acc_acc, 0, 40);
+ }
+ /* End of compatibility stuff */
+
+ scp->addr.sdn_add.a_len = cpu_to_le16(2);
+ rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr);
+ if (rv == 0) {
+ rv = dn_hash_sock(sk);
+ if (rv)
+ sock_set_flag(sk, SOCK_ZAPPED);
+ }
+
+ return rv;
+}
+
+static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ if (scp->state != DN_CR)
+ return -EINVAL;
+
+ scp->state = DN_CC;
+ scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk));
+ dn_send_conn_conf(sk, allocation);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ for(;;) {
+ release_sock(sk);
+ if (scp->state == DN_CC)
+ *timeo = schedule_timeout(*timeo);
+ lock_sock(sk);
+ err = 0;
+ if (scp->state == DN_RUN)
+ break;
+ err = sock_error(sk);
+ if (err)
+ break;
+ err = sock_intr_errno(*timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!*timeo)
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (err == 0) {
+ sk->sk_socket->state = SS_CONNECTED;
+ } else if (scp->state != DN_CC) {
+ sk->sk_socket->state = SS_UNCONNECTED;
+ }
+ return err;
+}
+
+static int dn_wait_run(struct sock *sk, long *timeo)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ DEFINE_WAIT(wait);
+ int err = 0;
+
+ if (scp->state == DN_RUN)
+ goto out;
+
+ if (!*timeo)
+ return -EALREADY;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ for(;;) {
+ release_sock(sk);
+ if (scp->state == DN_CI || scp->state == DN_CC)
+ *timeo = schedule_timeout(*timeo);
+ lock_sock(sk);
+ err = 0;
+ if (scp->state == DN_RUN)
+ break;
+ err = sock_error(sk);
+ if (err)
+ break;
+ err = sock_intr_errno(*timeo);
+ if (signal_pending(current))
+ break;
+ err = -ETIMEDOUT;
+ if (!*timeo)
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+out:
+ if (err == 0) {
+ sk->sk_socket->state = SS_CONNECTED;
+ } else if (scp->state != DN_CI && scp->state != DN_CC) {
+ sk->sk_socket->state = SS_UNCONNECTED;
+ }
+ return err;
+}
+
+static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
+{
+ struct socket *sock = sk->sk_socket;
+ struct dn_scp *scp = DN_SK(sk);
+ int err = -EISCONN;
+ struct flowidn fld;
+ struct dst_entry *dst;
+
+ if (sock->state == SS_CONNECTED)
+ goto out;
+
+ if (sock->state == SS_CONNECTING) {
+ err = 0;
+ if (scp->state == DN_RUN) {
+ sock->state = SS_CONNECTED;
+ goto out;
+ }
+ err = -ECONNREFUSED;
+ if (scp->state != DN_CI && scp->state != DN_CC) {
+ sock->state = SS_UNCONNECTED;
+ goto out;
+ }
+ return dn_wait_run(sk, timeo);
+ }
+
+ err = -EINVAL;
+ if (scp->state != DN_O)
+ goto out;
+
+ if (addr == NULL || addrlen != sizeof(struct sockaddr_dn))
+ goto out;
+ if (addr->sdn_family != AF_DECnet)
+ goto out;
+ if (addr->sdn_flags & SDF_WILD)
+ goto out;
+
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ err = dn_auto_bind(sk->sk_socket);
+ if (err)
+ goto out;
+ }
+
+ memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn));
+
+ err = -EHOSTUNREACH;
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_oif = sk->sk_bound_dev_if;
+ fld.daddr = dn_saddr2dn(&scp->peer);
+ fld.saddr = dn_saddr2dn(&scp->addr);
+ dn_sk_ports_copy(&fld, scp);
+ fld.flowidn_proto = DNPROTO_NSP;
+ if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0)
+ goto out;
+ dst = __sk_dst_get(sk);
+ sk->sk_route_caps = dst->dev->features;
+ sock->state = SS_CONNECTING;
+ scp->state = DN_CI;
+ scp->segsize_loc = dst_metric_advmss(dst);
+
+ dn_nsp_send_conninit(sk, NSP_CI);
+ err = -EINPROGRESS;
+ if (*timeo) {
+ err = dn_wait_run(sk, timeo);
+ }
+out:
+ return err;
+}
+
+static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags)
+{
+ struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr;
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ lock_sock(sk);
+ err = __dn_connect(sk, addr, addrlen, &timeo, 0);
+ release_sock(sk);
+
+ return err;
+}
+
+static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ switch (scp->state) {
+ case DN_RUN:
+ return 0;
+ case DN_CR:
+ return dn_confirm_accept(sk, timeo, sk->sk_allocation);
+ case DN_CI:
+ case DN_CC:
+ return dn_wait_run(sk, timeo);
+ case DN_O:
+ return __dn_connect(sk, addr, addrlen, timeo, flags);
+ }
+
+ return -EINVAL;
+}
+
+
+static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc)
+{
+ unsigned char *ptr = skb->data;
+
+ acc->acc_userl = *ptr++;
+ memcpy(&acc->acc_user, ptr, acc->acc_userl);
+ ptr += acc->acc_userl;
+
+ acc->acc_passl = *ptr++;
+ memcpy(&acc->acc_pass, ptr, acc->acc_passl);
+ ptr += acc->acc_passl;
+
+ acc->acc_accl = *ptr++;
+ memcpy(&acc->acc_acc, ptr, acc->acc_accl);
+
+ skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3);
+
+}
+
+static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt)
+{
+ unsigned char *ptr = skb->data;
+ u16 len = *ptr++; /* yes, it's 8bit on the wire */
+
+ BUG_ON(len > 16); /* we've checked the contents earlier */
+ opt->opt_optl = cpu_to_le16(len);
+ opt->opt_status = 0;
+ memcpy(opt->opt_data, ptr, len);
+ skb_pull(skb, len + 1);
+}
+
+static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
+{
+ DEFINE_WAIT(wait);
+ struct sk_buff *skb = NULL;
+ int err = 0;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ for(;;) {
+ release_sock(sk);
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb == NULL) {
+ *timeo = schedule_timeout(*timeo);
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ }
+ lock_sock(sk);
+ if (skb != NULL)
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(*timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!*timeo)
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+
+ return skb == NULL ? ERR_PTR(err) : skb;
+}
+
+static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk, *newsk;
+ struct sk_buff *skb = NULL;
+ struct dn_skb_cb *cb;
+ unsigned char menuver;
+ int err = 0;
+ unsigned char type;
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ struct dst_entry *dst;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb == NULL) {
+ skb = dn_wait_for_connect(sk, &timeo);
+ if (IS_ERR(skb)) {
+ release_sock(sk);
+ return PTR_ERR(skb);
+ }
+ }
+
+ cb = DN_SKB_CB(skb);
+ sk->sk_ack_backlog--;
+ newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation);
+ if (newsk == NULL) {
+ release_sock(sk);
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ release_sock(sk);
+
+ dst = skb_dst(skb);
+ sk_dst_set(newsk, dst);
+ skb_dst_set(skb, NULL);
+
+ DN_SK(newsk)->state = DN_CR;
+ DN_SK(newsk)->addrrem = cb->src_port;
+ DN_SK(newsk)->services_rem = cb->services;
+ DN_SK(newsk)->info_rem = cb->info;
+ DN_SK(newsk)->segsize_rem = cb->segsize;
+ DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode;
+
+ if (DN_SK(newsk)->segsize_rem < 230)
+ DN_SK(newsk)->segsize_rem = 230;
+
+ if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
+ DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd;
+
+ newsk->sk_state = TCP_LISTEN;
+ memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn));
+
+ /*
+ * If we are listening on a wild socket, we don't want
+ * the newly created socket on the wrong hash queue.
+ */
+ DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD;
+
+ skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type));
+ skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type));
+ *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
+ *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
+
+ menuver = *skb->data;
+ skb_pull(skb, 1);
+
+ if (menuver & DN_MENUVER_ACC)
+ dn_access_copy(skb, &(DN_SK(newsk)->accessdata));
+
+ if (menuver & DN_MENUVER_USR)
+ dn_user_copy(skb, &(DN_SK(newsk)->conndata_in));
+
+ if (menuver & DN_MENUVER_PRX)
+ DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY;
+
+ if (menuver & DN_MENUVER_UIC)
+ DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY;
+
+ kfree_skb(skb);
+
+ memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out),
+ sizeof(struct optdata_dn));
+ memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out),
+ sizeof(struct optdata_dn));
+
+ lock_sock(newsk);
+ err = dn_hash_sock(newsk);
+ if (err == 0) {
+ sock_reset_flag(newsk, SOCK_ZAPPED);
+ dn_send_conn_ack(newsk);
+
+ /*
+ * Here we use sk->sk_allocation since although the conn conf is
+ * for the newsk, the context is the old socket.
+ */
+ if (DN_SK(newsk)->accept_mode == ACC_IMMED)
+ err = dn_confirm_accept(newsk, &timeo,
+ sk->sk_allocation);
+ }
+ release_sock(newsk);
+ return err;
+}
+
+
+static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer)
+{
+ struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr;
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+
+ *uaddr_len = sizeof(struct sockaddr_dn);
+
+ lock_sock(sk);
+
+ if (peer) {
+ if ((sock->state != SS_CONNECTED &&
+ sock->state != SS_CONNECTING) &&
+ scp->accept_mode == ACC_IMMED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn));
+ } else {
+ memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn));
+ }
+
+ release_sock(sk);
+
+ return 0;
+}
+
+
+static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ int mask = datagram_poll(file, sock, wait);
+
+ if (!skb_queue_empty(&scp->other_receive_queue))
+ mask |= POLLRDBAND;
+
+ return mask;
+}
+
+static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ int err = -EOPNOTSUPP;
+ long amount = 0;
+ struct sk_buff *skb;
+ int val;
+
+ switch(cmd)
+ {
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ return dn_dev_ioctl(cmd, (void __user *)arg);
+
+ case SIOCATMARK:
+ lock_sock(sk);
+ val = !skb_queue_empty(&scp->other_receive_queue);
+ if (scp->state != DN_RUN)
+ val = -ENOTCONN;
+ release_sock(sk);
+ return val;
+
+ case TIOCOUTQ:
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ err = put_user(amount, (int __user *)arg);
+ break;
+
+ case TIOCINQ:
+ lock_sock(sk);
+ skb = skb_peek(&scp->other_receive_queue);
+ if (skb) {
+ amount = skb->len;
+ } else {
+ skb_queue_walk(&sk->sk_receive_queue, skb)
+ amount += skb->len;
+ }
+ release_sock(sk);
+ err = put_user(amount, (int __user *)arg);
+ break;
+
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+
+ return err;
+}
+
+static int dn_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+
+ if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN))
+ goto out;
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = TCP_LISTEN;
+ err = 0;
+ dn_rehash_sock(sk);
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+
+static int dn_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ int err = -ENOTCONN;
+
+ lock_sock(sk);
+
+ if (sock->state == SS_UNCONNECTED)
+ goto out;
+
+ err = 0;
+ if (sock->state == SS_DISCONNECTING)
+ goto out;
+
+ err = -EINVAL;
+ if (scp->state == DN_O)
+ goto out;
+
+ if (how != SHUT_RDWR)
+ goto out;
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ dn_destroy_sock(sk);
+ err = 0;
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ lock_sock(sk);
+ err = __dn_setsockopt(sock, level, optname, optval, optlen, 0);
+ release_sock(sk);
+
+ return err;
+}
+
+static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ long timeo;
+ union {
+ struct optdata_dn opt;
+ struct accessdata_dn acc;
+ int mode;
+ unsigned long win;
+ int val;
+ unsigned char services;
+ unsigned char info;
+ } u;
+ int err;
+
+ if (optlen && !optval)
+ return -EINVAL;
+
+ if (optlen > sizeof(u))
+ return -EINVAL;
+
+ if (copy_from_user(&u, optval, optlen))
+ return -EFAULT;
+
+ switch (optname) {
+ case DSO_CONDATA:
+ if (sock->state == SS_CONNECTED)
+ return -EISCONN;
+ if ((scp->state != DN_O) && (scp->state != DN_CR))
+ return -EINVAL;
+
+ if (optlen != sizeof(struct optdata_dn))
+ return -EINVAL;
+
+ if (le16_to_cpu(u.opt.opt_optl) > 16)
+ return -EINVAL;
+
+ memcpy(&scp->conndata_out, &u.opt, optlen);
+ break;
+
+ case DSO_DISDATA:
+ if (sock->state != SS_CONNECTED &&
+ scp->accept_mode == ACC_IMMED)
+ return -ENOTCONN;
+
+ if (optlen != sizeof(struct optdata_dn))
+ return -EINVAL;
+
+ if (le16_to_cpu(u.opt.opt_optl) > 16)
+ return -EINVAL;
+
+ memcpy(&scp->discdata_out, &u.opt, optlen);
+ break;
+
+ case DSO_CONACCESS:
+ if (sock->state == SS_CONNECTED)
+ return -EISCONN;
+ if (scp->state != DN_O)
+ return -EINVAL;
+
+ if (optlen != sizeof(struct accessdata_dn))
+ return -EINVAL;
+
+ if ((u.acc.acc_accl > DN_MAXACCL) ||
+ (u.acc.acc_passl > DN_MAXACCL) ||
+ (u.acc.acc_userl > DN_MAXACCL))
+ return -EINVAL;
+
+ memcpy(&scp->accessdata, &u.acc, optlen);
+ break;
+
+ case DSO_ACCEPTMODE:
+ if (sock->state == SS_CONNECTED)
+ return -EISCONN;
+ if (scp->state != DN_O)
+ return -EINVAL;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER))
+ return -EINVAL;
+
+ scp->accept_mode = (unsigned char)u.mode;
+ break;
+
+ case DSO_CONACCEPT:
+ if (scp->state != DN_CR)
+ return -EINVAL;
+ timeo = sock_rcvtimeo(sk, 0);
+ err = dn_confirm_accept(sk, &timeo, sk->sk_allocation);
+ return err;
+
+ case DSO_CONREJECT:
+ if (scp->state != DN_CR)
+ return -EINVAL;
+
+ scp->state = DN_DR;
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation);
+ break;
+
+ default:
+#ifdef CONFIG_NETFILTER
+ return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen);
+#endif
+ case DSO_LINKINFO:
+ case DSO_STREAM:
+ case DSO_SEQPACKET:
+ return -ENOPROTOOPT;
+
+ case DSO_MAXWINDOW:
+ if (optlen != sizeof(unsigned long))
+ return -EINVAL;
+ if (u.win > NSP_MAX_WINDOW)
+ u.win = NSP_MAX_WINDOW;
+ if (u.win == 0)
+ return -EINVAL;
+ scp->max_window = u.win;
+ if (scp->snd_window > u.win)
+ scp->snd_window = u.win;
+ break;
+
+ case DSO_NODELAY:
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ if (scp->nonagle == 2)
+ return -EINVAL;
+ scp->nonagle = (u.val == 0) ? 0 : 1;
+ /* if (scp->nonagle == 1) { Push pending frames } */
+ break;
+
+ case DSO_CORK:
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ if (scp->nonagle == 1)
+ return -EINVAL;
+ scp->nonagle = (u.val == 0) ? 0 : 2;
+ /* if (scp->nonagle == 0) { Push pending frames } */
+ break;
+
+ case DSO_SERVICES:
+ if (optlen != sizeof(unsigned char))
+ return -EINVAL;
+ if ((u.services & ~NSP_FC_MASK) != 0x01)
+ return -EINVAL;
+ if ((u.services & NSP_FC_MASK) == NSP_FC_MASK)
+ return -EINVAL;
+ scp->services_loc = u.services;
+ break;
+
+ case DSO_INFO:
+ if (optlen != sizeof(unsigned char))
+ return -EINVAL;
+ if (u.info & 0xfc)
+ return -EINVAL;
+ scp->info_loc = u.info;
+ break;
+ }
+
+ return 0;
+}
+
+static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ lock_sock(sk);
+ err = __dn_getsockopt(sock, level, optname, optval, optlen, 0);
+ release_sock(sk);
+
+ return err;
+}
+
+static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ struct linkinfo_dn link;
+ unsigned int r_len;
+ void *r_data = NULL;
+ unsigned int val;
+
+ if(get_user(r_len , optlen))
+ return -EFAULT;
+
+ switch (optname) {
+ case DSO_CONDATA:
+ if (r_len > sizeof(struct optdata_dn))
+ r_len = sizeof(struct optdata_dn);
+ r_data = &scp->conndata_in;
+ break;
+
+ case DSO_DISDATA:
+ if (r_len > sizeof(struct optdata_dn))
+ r_len = sizeof(struct optdata_dn);
+ r_data = &scp->discdata_in;
+ break;
+
+ case DSO_CONACCESS:
+ if (r_len > sizeof(struct accessdata_dn))
+ r_len = sizeof(struct accessdata_dn);
+ r_data = &scp->accessdata;
+ break;
+
+ case DSO_ACCEPTMODE:
+ if (r_len > sizeof(unsigned char))
+ r_len = sizeof(unsigned char);
+ r_data = &scp->accept_mode;
+ break;
+
+ case DSO_LINKINFO:
+ if (r_len > sizeof(struct linkinfo_dn))
+ r_len = sizeof(struct linkinfo_dn);
+
+ memset(&link, 0, sizeof(link));
+
+ switch (sock->state) {
+ case SS_CONNECTING:
+ link.idn_linkstate = LL_CONNECTING;
+ break;
+ case SS_DISCONNECTING:
+ link.idn_linkstate = LL_DISCONNECTING;
+ break;
+ case SS_CONNECTED:
+ link.idn_linkstate = LL_RUNNING;
+ break;
+ default:
+ link.idn_linkstate = LL_INACTIVE;
+ }
+
+ link.idn_segsize = scp->segsize_rem;
+ r_data = &link;
+ break;
+
+ default:
+#ifdef CONFIG_NETFILTER
+ {
+ int ret, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ ret = nf_getsockopt(sk, PF_DECnet, optname, optval, &len);
+ if (ret >= 0)
+ ret = put_user(len, optlen);
+ return ret;
+ }
+#endif
+ case DSO_STREAM:
+ case DSO_SEQPACKET:
+ case DSO_CONACCEPT:
+ case DSO_CONREJECT:
+ return -ENOPROTOOPT;
+
+ case DSO_MAXWINDOW:
+ if (r_len > sizeof(unsigned long))
+ r_len = sizeof(unsigned long);
+ r_data = &scp->max_window;
+ break;
+
+ case DSO_NODELAY:
+ if (r_len > sizeof(int))
+ r_len = sizeof(int);
+ val = (scp->nonagle == 1);
+ r_data = &val;
+ break;
+
+ case DSO_CORK:
+ if (r_len > sizeof(int))
+ r_len = sizeof(int);
+ val = (scp->nonagle == 2);
+ r_data = &val;
+ break;
+
+ case DSO_SERVICES:
+ if (r_len > sizeof(unsigned char))
+ r_len = sizeof(unsigned char);
+ r_data = &scp->services_rem;
+ break;
+
+ case DSO_INFO:
+ if (r_len > sizeof(unsigned char))
+ r_len = sizeof(unsigned char);
+ r_data = &scp->info_rem;
+ break;
+ }
+
+ if (r_data) {
+ if (copy_to_user(optval, r_data, r_len))
+ return -EFAULT;
+ if (put_user(r_len, optlen))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+
+static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target)
+{
+ struct sk_buff *skb;
+ int len = 0;
+
+ if (flags & MSG_OOB)
+ return !skb_queue_empty(q) ? 1 : 0;
+
+ skb_queue_walk(q, skb) {
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ len += skb->len;
+
+ if (cb->nsp_flags & 0x40) {
+ /* SOCK_SEQPACKET reads to EOM */
+ if (sk->sk_type == SOCK_SEQPACKET)
+ return 1;
+ /* so does SOCK_STREAM unless WAITALL is specified */
+ if (!(flags & MSG_WAITALL))
+ return 1;
+ }
+
+ /* minimum data length for read exceeded */
+ if (len >= target)
+ return 1;
+ }
+
+ return 0;
+}
+
+
+static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff_head *queue = &sk->sk_receive_queue;
+ size_t target = size > 1 ? 1 : 0;
+ size_t copied = 0;
+ int rv = 0;
+ struct sk_buff *skb, *n;
+ struct dn_skb_cb *cb = NULL;
+ unsigned char eor = 0;
+ long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ lock_sock(sk);
+
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ rv = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ rv = 0;
+ goto out;
+ }
+
+ rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+ if (rv)
+ goto out;
+
+ if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
+ rv = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (flags & MSG_OOB)
+ queue = &scp->other_receive_queue;
+
+ if (flags & MSG_WAITALL)
+ target = size;
+
+
+ /*
+ * See if there is data ready to read, sleep if there isn't
+ */
+ for(;;) {
+ DEFINE_WAIT(wait);
+
+ if (sk->sk_err)
+ goto out;
+
+ if (!skb_queue_empty(&scp->other_receive_queue)) {
+ if (!(flags & MSG_OOB)) {
+ msg->msg_flags |= MSG_OOB;
+ if (!scp->other_report) {
+ scp->other_report = 1;
+ goto out;
+ }
+ }
+ }
+
+ if (scp->state != DN_RUN)
+ goto out;
+
+ if (signal_pending(current)) {
+ rv = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ if (dn_data_ready(sk, queue, flags, target))
+ break;
+
+ if (flags & MSG_DONTWAIT) {
+ rv = -EWOULDBLOCK;
+ goto out;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ finish_wait(sk_sleep(sk), &wait);
+ }
+
+ skb_queue_walk_safe(queue, skb, n) {
+ unsigned int chunk = skb->len;
+ cb = DN_SKB_CB(skb);
+
+ if ((chunk + copied) > size)
+ chunk = size - copied;
+
+ if (memcpy_to_msg(msg, skb->data, chunk)) {
+ rv = -EFAULT;
+ break;
+ }
+ copied += chunk;
+
+ if (!(flags & MSG_PEEK))
+ skb_pull(skb, chunk);
+
+ eor = cb->nsp_flags & 0x40;
+
+ if (skb->len == 0) {
+ skb_unlink(skb, queue);
+ kfree_skb(skb);
+ /*
+ * N.B. Don't refer to skb or cb after this point
+ * in loop.
+ */
+ if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) {
+ scp->flowloc_sw = DN_SEND;
+ dn_nsp_send_link(sk, DN_SEND, 0);
+ }
+ }
+
+ if (eor) {
+ if (sk->sk_type == SOCK_SEQPACKET)
+ break;
+ if (!(flags & MSG_WAITALL))
+ break;
+ }
+
+ if (flags & MSG_OOB)
+ break;
+
+ if (copied >= target)
+ break;
+ }
+
+ rv = copied;
+
+
+ if (eor && (sk->sk_type == SOCK_SEQPACKET))
+ msg->msg_flags |= MSG_EOR;
+
+out:
+ if (rv == 0)
+ rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk);
+
+ if ((rv >= 0) && msg->msg_name) {
+ __sockaddr_check_size(sizeof(struct sockaddr_dn));
+ memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn));
+ msg->msg_namelen = sizeof(struct sockaddr_dn);
+ }
+
+ release_sock(sk);
+
+ return rv;
+}
+
+
+static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags)
+{
+ unsigned char fctype = scp->services_rem & NSP_FC_MASK;
+ if (skb_queue_len(queue) >= scp->snd_window)
+ return 1;
+ if (fctype != NSP_FC_NONE) {
+ if (flags & MSG_OOB) {
+ if (scp->flowrem_oth == 0)
+ return 1;
+ } else {
+ if (scp->flowrem_dat == 0)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * The DECnet spec requires that the "routing layer" accepts packets which
+ * are at least 230 bytes in size. This excludes any headers which the NSP
+ * layer might add, so we always assume that we'll be using the maximal
+ * length header on data packets. The variation in length is due to the
+ * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't
+ * make much practical difference.
+ */
+unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu)
+{
+ unsigned int mss = 230 - DN_MAX_NSP_DATA_HEADER;
+ if (dev) {
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+ mtu -= LL_RESERVED_SPACE(dev);
+ if (dn_db->use_long)
+ mtu -= 21;
+ else
+ mtu -= 6;
+ mtu -= DN_MAX_NSP_DATA_HEADER;
+ } else {
+ /*
+ * 21 = long header, 16 = guess at MAC header length
+ */
+ mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16);
+ }
+ if (mtu > mss)
+ mss = mtu;
+ return mss;
+}
+
+static inline unsigned int dn_current_mss(struct sock *sk, int flags)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct dn_scp *scp = DN_SK(sk);
+ int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem);
+
+ /* Other data messages are limited to 16 bytes per packet */
+ if (flags & MSG_OOB)
+ return 16;
+
+ /* This works out the maximum size of segment we can send out */
+ if (dst) {
+ u32 mtu = dst_mtu(dst);
+ mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now);
+ }
+
+ return mss_now;
+}
+
+/*
+ * N.B. We get the timeout wrong here, but then we always did get it
+ * wrong before and this is another step along the road to correcting
+ * it. It ought to get updated each time we pass through the routine,
+ * but in practise it probably doesn't matter too much for now.
+ */
+static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
+ unsigned long datalen, int noblock,
+ int *errcode)
+{
+ struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
+ noblock, errcode);
+ if (skb) {
+ skb->protocol = htons(ETH_P_DNA_RT);
+ skb->pkt_type = PACKET_OUTGOING;
+ }
+ return skb;
+}
+
+static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ size_t mss;
+ struct sk_buff_head *queue = &scp->data_xmit_queue;
+ int flags = msg->msg_flags;
+ int err = 0;
+ size_t sent = 0;
+ int addr_len = msg->msg_namelen;
+ DECLARE_SOCKADDR(struct sockaddr_dn *, addr, msg->msg_name);
+ struct sk_buff *skb = NULL;
+ struct dn_skb_cb *cb;
+ size_t len;
+ unsigned char fctype;
+ long timeo;
+
+ if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
+ return -EOPNOTSUPP;
+
+ if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
+ return -EINVAL;
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ /*
+ * The only difference between stream sockets and sequenced packet
+ * sockets is that the stream sockets always behave as if MSG_EOR
+ * has been set.
+ */
+ if (sock->type == SOCK_STREAM) {
+ if (flags & MSG_EOR) {
+ err = -EINVAL;
+ goto out;
+ }
+ flags |= MSG_EOR;
+ }
+
+
+ err = dn_check_state(sk, addr, addr_len, &timeo, flags);
+ if (err)
+ goto out_err;
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ err = -EPIPE;
+ if (!(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ goto out_err;
+ }
+
+ if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
+ dst_negative_advice(sk);
+
+ mss = scp->segsize_rem;
+ fctype = scp->services_rem & NSP_FC_MASK;
+
+ mss = dn_current_mss(sk, flags);
+
+ if (flags & MSG_OOB) {
+ queue = &scp->other_xmit_queue;
+ if (size > mss) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+ }
+
+ scp->persist_fxn = dn_nsp_xmit_timeout;
+
+ while(sent < size) {
+ err = sock_error(sk);
+ if (err)
+ goto out;
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ /*
+ * Calculate size that we wish to send.
+ */
+ len = size - sent;
+
+ if (len > mss)
+ len = mss;
+
+ /*
+ * Wait for queue size to go down below the window
+ * size.
+ */
+ if (dn_queue_too_long(scp, queue, flags)) {
+ DEFINE_WAIT(wait);
+
+ if (flags & MSG_DONTWAIT) {
+ err = -EWOULDBLOCK;
+ goto out;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_wait_event(sk, &timeo,
+ !dn_queue_too_long(scp, queue, flags));
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ finish_wait(sk_sleep(sk), &wait);
+ continue;
+ }
+
+ /*
+ * Get a suitably sized skb.
+ * 64 is a bit of a hack really, but its larger than any
+ * link-layer headers and has served us well as a good
+ * guess as to their real length.
+ */
+ skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
+ flags & MSG_DONTWAIT, &err);
+
+ if (err)
+ break;
+
+ if (!skb)
+ continue;
+
+ cb = DN_SKB_CB(skb);
+
+ skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
+
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (flags & MSG_OOB) {
+ cb->nsp_flags = 0x30;
+ if (fctype != NSP_FC_NONE)
+ scp->flowrem_oth--;
+ } else {
+ cb->nsp_flags = 0x00;
+ if (scp->seg_total == 0)
+ cb->nsp_flags |= 0x20;
+
+ scp->seg_total += len;
+
+ if (((sent + len) == size) && (flags & MSG_EOR)) {
+ cb->nsp_flags |= 0x40;
+ scp->seg_total = 0;
+ if (fctype == NSP_FC_SCMC)
+ scp->flowrem_dat--;
+ }
+ if (fctype == NSP_FC_SRC)
+ scp->flowrem_dat--;
+ }
+
+ sent += len;
+ dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB);
+ skb = NULL;
+
+ scp->persist = dn_nsp_persist(sk);
+
+ }
+out:
+
+ kfree_skb(skb);
+
+ release_sock(sk);
+
+ return sent ? sent : err;
+
+out_err:
+ err = sk_stream_error(sk, flags, err);
+ release_sock(sk);
+ return err;
+}
+
+static int dn_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ dn_dev_up(dev);
+ break;
+ case NETDEV_DOWN:
+ dn_dev_down(dev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dn_dev_notifier = {
+ .notifier_call = dn_device_event,
+};
+
+static struct packet_type dn_dix_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_DNA_RT),
+ .func = dn_route_rcv,
+};
+
+#ifdef CONFIG_PROC_FS
+struct dn_iter_state {
+ int bucket;
+};
+
+static struct sock *dn_socket_get_first(struct seq_file *seq)
+{
+ struct dn_iter_state *state = seq->private;
+ struct sock *n = NULL;
+
+ for(state->bucket = 0;
+ state->bucket < DN_SK_HASH_SIZE;
+ ++state->bucket) {
+ n = sk_head(&dn_sk_hash[state->bucket]);
+ if (n)
+ break;
+ }
+
+ return n;
+}
+
+static struct sock *dn_socket_get_next(struct seq_file *seq,
+ struct sock *n)
+{
+ struct dn_iter_state *state = seq->private;
+
+ n = sk_next(n);
+try_again:
+ if (n)
+ goto out;
+ if (++state->bucket >= DN_SK_HASH_SIZE)
+ goto out;
+ n = sk_head(&dn_sk_hash[state->bucket]);
+ goto try_again;
+out:
+ return n;
+}
+
+static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct sock *sk = dn_socket_get_first(seq);
+
+ if (sk) {
+ while(*pos && (sk = dn_socket_get_next(seq, sk)))
+ --*pos;
+ }
+ return *pos ? NULL : sk;
+}
+
+static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos)
+{
+ void *rc;
+ read_lock_bh(&dn_hash_lock);
+ rc = socket_get_idx(seq, &pos);
+ if (!rc) {
+ read_unlock_bh(&dn_hash_lock);
+ }
+ return rc;
+}
+
+static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return *pos ? dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ void *rc;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = dn_socket_get_idx(seq, 0);
+ goto out;
+ }
+
+ rc = dn_socket_get_next(seq, v);
+ if (rc)
+ goto out;
+ read_unlock_bh(&dn_hash_lock);
+out:
+ ++*pos;
+ return rc;
+}
+
+static void dn_socket_seq_stop(struct seq_file *seq, void *v)
+{
+ if (v && v != SEQ_START_TOKEN)
+ read_unlock_bh(&dn_hash_lock);
+}
+
+#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126)
+
+static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf)
+{
+ int i;
+
+ switch (le16_to_cpu(dn->sdn_objnamel)) {
+ case 0:
+ sprintf(buf, "%d", dn->sdn_objnum);
+ break;
+ default:
+ for (i = 0; i < le16_to_cpu(dn->sdn_objnamel); i++) {
+ buf[i] = dn->sdn_objname[i];
+ if (IS_NOT_PRINTABLE(buf[i]))
+ buf[i] = '.';
+ }
+ buf[i] = 0;
+ }
+}
+
+static char *dn_state2asc(unsigned char state)
+{
+ switch (state) {
+ case DN_O:
+ return "OPEN";
+ case DN_CR:
+ return " CR";
+ case DN_DR:
+ return " DR";
+ case DN_DRC:
+ return " DRC";
+ case DN_CC:
+ return " CC";
+ case DN_CI:
+ return " CI";
+ case DN_NR:
+ return " NR";
+ case DN_NC:
+ return " NC";
+ case DN_CD:
+ return " CD";
+ case DN_RJ:
+ return " RJ";
+ case DN_RUN:
+ return " RUN";
+ case DN_DI:
+ return " DI";
+ case DN_DIC:
+ return " DIC";
+ case DN_DN:
+ return " DN";
+ case DN_CL:
+ return " CL";
+ case DN_CN:
+ return " CN";
+ }
+
+ return "????";
+}
+
+static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ char buf1[DN_ASCBUF_LEN];
+ char buf2[DN_ASCBUF_LEN];
+ char local_object[DN_MAXOBJL+3];
+ char remote_object[DN_MAXOBJL+3];
+
+ dn_printable_object(&scp->addr, local_object);
+ dn_printable_object(&scp->peer, remote_object);
+
+ seq_printf(seq,
+ "%6s/%04X %04d:%04d %04d:%04d %01d %-16s "
+ "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n",
+ dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1),
+ scp->addrloc,
+ scp->numdat,
+ scp->numoth,
+ scp->ackxmt_dat,
+ scp->ackxmt_oth,
+ scp->flowloc_sw,
+ local_object,
+ dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2),
+ scp->addrrem,
+ scp->numdat_rcv,
+ scp->numoth_rcv,
+ scp->ackrcv_dat,
+ scp->ackrcv_oth,
+ scp->flowrem_sw,
+ remote_object,
+ dn_state2asc(scp->state),
+ ((scp->accept_mode == ACC_IMMED) ? "IMMED" : "DEFER"));
+}
+
+static int dn_socket_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Local Remote\n");
+ } else {
+ dn_socket_format_entry(seq, v);
+ }
+ return 0;
+}
+
+static const struct seq_operations dn_socket_seq_ops = {
+ .start = dn_socket_seq_start,
+ .next = dn_socket_seq_next,
+ .stop = dn_socket_seq_stop,
+ .show = dn_socket_seq_show,
+};
+
+static int dn_socket_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &dn_socket_seq_ops,
+ sizeof(struct dn_iter_state));
+}
+
+static const struct file_operations dn_socket_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dn_socket_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif
+
+static const struct net_proto_family dn_family_ops = {
+ .family = AF_DECnet,
+ .create = dn_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops dn_proto_ops = {
+ .family = AF_DECnet,
+ .owner = THIS_MODULE,
+ .release = dn_release,
+ .bind = dn_bind,
+ .connect = dn_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = dn_accept,
+ .getname = dn_getname,
+ .poll = dn_poll,
+ .ioctl = dn_ioctl,
+ .listen = dn_listen,
+ .shutdown = dn_shutdown,
+ .setsockopt = dn_setsockopt,
+ .getsockopt = dn_getsockopt,
+ .sendmsg = dn_sendmsg,
+ .recvmsg = dn_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+MODULE_DESCRIPTION("The Linux DECnet Network Protocol");
+MODULE_AUTHOR("Linux DECnet Project Team");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_DECnet);
+
+static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
+
+static int __init decnet_init(void)
+{
+ int rc;
+
+ printk(banner);
+
+ rc = proto_register(&dn_proto, 1);
+ if (rc != 0)
+ goto out;
+
+ dn_neigh_init();
+ dn_dev_init();
+ dn_route_init();
+ dn_fib_init();
+
+ sock_register(&dn_family_ops);
+ dev_add_pack(&dn_dix_packet_type);
+ register_netdevice_notifier(&dn_dev_notifier);
+
+ proc_create("decnet", S_IRUGO, init_net.proc_net, &dn_socket_seq_fops);
+ dn_register_sysctl();
+out:
+ return rc;
+
+}
+module_init(decnet_init);
+
+/*
+ * Prevent DECnet module unloading until its fixed properly.
+ * Requires an audit of the code to check for memory leaks and
+ * initialisation problems etc.
+ */
+#if 0
+static void __exit decnet_exit(void)
+{
+ sock_unregister(AF_DECnet);
+ rtnl_unregister_all(PF_DECnet);
+ dev_remove_pack(&dn_dix_packet_type);
+
+ dn_unregister_sysctl();
+
+ unregister_netdevice_notifier(&dn_dev_notifier);
+
+ dn_route_cleanup();
+ dn_dev_cleanup();
+ dn_neigh_cleanup();
+ dn_fib_cleanup();
+
+ remove_proc_entry("decnet", init_net.proc_net);
+
+ proto_unregister(&dn_proto);
+
+ rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */
+}
+module_exit(decnet_exit);
+#endif
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
new file mode 100644
index 000000000..b2c26b081
--- /dev/null
+++ b/net/decnet/dn_dev.c
@@ -0,0 +1,1446 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Device Layer
+ *
+ * Authors: Steve Whitehouse <SteveW@ACM.org>
+ * Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ * Steve Whitehouse : Devices now see incoming frames so they
+ * can mark on who it came from.
+ * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour
+ * can now have a device specific setup func.
+ * Steve Whitehouse : Added /proc/sys/net/decnet/conf/<dev>/
+ * Steve Whitehouse : Fixed bug which sometimes killed timer
+ * Steve Whitehouse : Multiple ifaddr support
+ * Steve Whitehouse : SIOCGIFCONF is now a compile time option
+ * Steve Whitehouse : /proc/sys/net/decnet/conf/<sys>/forwarding
+ * Steve Whitehouse : Removed timer1 - it's a user space issue now
+ * Patrick Caulfield : Fixed router hello message format
+ * Steve Whitehouse : Got rid of constant sizes for blksize for
+ * devices. All mtu based now.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <asm/uaccess.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/netlink.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+#include <net/dn_neigh.h>
+#include <net/dn_fib.h>
+
+#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
+
+static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
+static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00};
+static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00};
+static unsigned char dn_eco_version[3] = {0x02,0x00,0x00};
+
+extern struct neigh_table dn_neigh_table;
+
+/*
+ * decnet_address is kept in network order.
+ */
+__le16 decnet_address = 0;
+
+static DEFINE_SPINLOCK(dndev_lock);
+static struct net_device *decnet_default_device;
+static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
+
+static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
+static void dn_dev_delete(struct net_device *dev);
+static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa);
+
+static int dn_eth_up(struct net_device *);
+static void dn_eth_down(struct net_device *);
+static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa);
+static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa);
+
+static struct dn_dev_parms dn_dev_list[] = {
+{
+ .type = ARPHRD_ETHER, /* Ethernet */
+ .mode = DN_DEV_BCAST,
+ .state = DN_DEV_S_RU,
+ .t2 = 1,
+ .t3 = 10,
+ .name = "ethernet",
+ .up = dn_eth_up,
+ .down = dn_eth_down,
+ .timer3 = dn_send_brd_hello,
+},
+{
+ .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
+ .mode = DN_DEV_BCAST,
+ .state = DN_DEV_S_RU,
+ .t2 = 1,
+ .t3 = 10,
+ .name = "ipgre",
+ .timer3 = dn_send_brd_hello,
+},
+#if 0
+{
+ .type = ARPHRD_X25, /* Bog standard X.25 */
+ .mode = DN_DEV_UCAST,
+ .state = DN_DEV_S_DS,
+ .t2 = 1,
+ .t3 = 120,
+ .name = "x25",
+ .timer3 = dn_send_ptp_hello,
+},
+#endif
+#if 0
+{
+ .type = ARPHRD_PPP, /* DECnet over PPP */
+ .mode = DN_DEV_BCAST,
+ .state = DN_DEV_S_RU,
+ .t2 = 1,
+ .t3 = 10,
+ .name = "ppp",
+ .timer3 = dn_send_brd_hello,
+},
+#endif
+{
+ .type = ARPHRD_DDCMP, /* DECnet over DDCMP */
+ .mode = DN_DEV_UCAST,
+ .state = DN_DEV_S_DS,
+ .t2 = 1,
+ .t3 = 120,
+ .name = "ddcmp",
+ .timer3 = dn_send_ptp_hello,
+},
+{
+ .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */
+ .mode = DN_DEV_BCAST,
+ .state = DN_DEV_S_RU,
+ .t2 = 1,
+ .t3 = 10,
+ .name = "loopback",
+ .timer3 = dn_send_brd_hello,
+}
+};
+
+#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list)
+
+#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x)
+
+#ifdef CONFIG_SYSCTL
+
+static int min_t2[] = { 1 };
+static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */
+static int min_t3[] = { 1 };
+static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */
+
+static int min_priority[1];
+static int max_priority[] = { 127 }; /* From DECnet spec */
+
+static int dn_forwarding_proc(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
+static struct dn_dev_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table dn_dev_vars[5];
+} dn_dev_sysctl = {
+ NULL,
+ {
+ {
+ .procname = "forwarding",
+ .data = (void *)DN_DEV_PARMS_OFFSET(forwarding),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = dn_forwarding_proc,
+ },
+ {
+ .procname = "priority",
+ .data = (void *)DN_DEV_PARMS_OFFSET(priority),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_priority,
+ .extra2 = &max_priority
+ },
+ {
+ .procname = "t2",
+ .data = (void *)DN_DEV_PARMS_OFFSET(t2),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t2,
+ .extra2 = &max_t2
+ },
+ {
+ .procname = "t3",
+ .data = (void *)DN_DEV_PARMS_OFFSET(t3),
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t3,
+ .extra2 = &max_t3
+ },
+ {0}
+ },
+};
+
+static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
+{
+ struct dn_dev_sysctl_table *t;
+ int i;
+
+ char path[sizeof("net/decnet/conf/") + IFNAMSIZ];
+
+ t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL);
+ if (t == NULL)
+ return;
+
+ for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) {
+ long offset = (long)t->dn_dev_vars[i].data;
+ t->dn_dev_vars[i].data = ((char *)parms) + offset;
+ }
+
+ snprintf(path, sizeof(path), "net/decnet/conf/%s",
+ dev? dev->name : parms->name);
+
+ t->dn_dev_vars[0].extra1 = (void *)dev;
+
+ t->sysctl_header = register_net_sysctl(&init_net, path, t->dn_dev_vars);
+ if (t->sysctl_header == NULL)
+ kfree(t);
+ else
+ parms->sysctl = t;
+}
+
+static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
+{
+ if (parms->sysctl) {
+ struct dn_dev_sysctl_table *t = parms->sysctl;
+ parms->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+ }
+}
+
+static int dn_forwarding_proc(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+#ifdef CONFIG_DECNET_ROUTER
+ struct net_device *dev = table->extra1;
+ struct dn_dev *dn_db;
+ int err;
+ int tmp, old;
+
+ if (table->extra1 == NULL)
+ return -EINVAL;
+
+ dn_db = rcu_dereference_raw(dev->dn_ptr);
+ old = dn_db->parms.forwarding;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if ((err >= 0) && write) {
+ if (dn_db->parms.forwarding < 0)
+ dn_db->parms.forwarding = 0;
+ if (dn_db->parms.forwarding > 2)
+ dn_db->parms.forwarding = 2;
+ /*
+ * What an ugly hack this is... its works, just. It
+ * would be nice if sysctl/proc were just that little
+ * bit more flexible so I don't have to write a special
+ * routine, or suffer hacks like this - SJW
+ */
+ tmp = dn_db->parms.forwarding;
+ dn_db->parms.forwarding = old;
+ if (dn_db->parms.down)
+ dn_db->parms.down(dev);
+ dn_db->parms.forwarding = tmp;
+ if (dn_db->parms.up)
+ dn_db->parms.up(dev);
+ }
+
+ return err;
+#else
+ return -EINVAL;
+#endif
+}
+
+#else /* CONFIG_SYSCTL */
+static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
+{
+}
+static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
+static inline __u16 mtu2blksize(struct net_device *dev)
+{
+ u32 blksize = dev->mtu;
+ if (blksize > 0xffff)
+ blksize = 0xffff;
+
+ if (dev->type == ARPHRD_ETHER ||
+ dev->type == ARPHRD_PPP ||
+ dev->type == ARPHRD_IPGRE ||
+ dev->type == ARPHRD_LOOPBACK)
+ blksize -= 2;
+
+ return (__u16)blksize;
+}
+
+static struct dn_ifaddr *dn_dev_alloc_ifa(void)
+{
+ struct dn_ifaddr *ifa;
+
+ ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
+
+ return ifa;
+}
+
+static void dn_dev_free_ifa(struct dn_ifaddr *ifa)
+{
+ kfree_rcu(ifa, rcu);
+}
+
+static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy)
+{
+ struct dn_ifaddr *ifa1 = rtnl_dereference(*ifap);
+ unsigned char mac_addr[6];
+ struct net_device *dev = dn_db->dev;
+
+ ASSERT_RTNL();
+
+ *ifap = ifa1->ifa_next;
+
+ if (dn_db->dev->type == ARPHRD_ETHER) {
+ if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) {
+ dn_dn2eth(mac_addr, ifa1->ifa_local);
+ dev_mc_del(dev, mac_addr);
+ }
+ }
+
+ dn_ifaddr_notify(RTM_DELADDR, ifa1);
+ blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
+ if (destroy) {
+ dn_dev_free_ifa(ifa1);
+
+ if (dn_db->ifa_list == NULL)
+ dn_dev_delete(dn_db->dev);
+ }
+}
+
+static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
+{
+ struct net_device *dev = dn_db->dev;
+ struct dn_ifaddr *ifa1;
+ unsigned char mac_addr[6];
+
+ ASSERT_RTNL();
+
+ /* Check for duplicates */
+ for (ifa1 = rtnl_dereference(dn_db->ifa_list);
+ ifa1 != NULL;
+ ifa1 = rtnl_dereference(ifa1->ifa_next)) {
+ if (ifa1->ifa_local == ifa->ifa_local)
+ return -EEXIST;
+ }
+
+ if (dev->type == ARPHRD_ETHER) {
+ if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) {
+ dn_dn2eth(mac_addr, ifa->ifa_local);
+ dev_mc_add(dev, mac_addr);
+ }
+ }
+
+ ifa->ifa_next = dn_db->ifa_list;
+ rcu_assign_pointer(dn_db->ifa_list, ifa);
+
+ dn_ifaddr_notify(RTM_NEWADDR, ifa);
+ blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+ struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+ int rv;
+
+ if (dn_db == NULL) {
+ int err;
+ dn_db = dn_dev_create(dev, &err);
+ if (dn_db == NULL)
+ return err;
+ }
+
+ ifa->ifa_dev = dn_db;
+
+ if (dev->flags & IFF_LOOPBACK)
+ ifa->ifa_scope = RT_SCOPE_HOST;
+
+ rv = dn_dev_insert_ifa(dn_db, ifa);
+ if (rv)
+ dn_dev_free_ifa(ifa);
+ return rv;
+}
+
+
+int dn_dev_ioctl(unsigned int cmd, void __user *arg)
+{
+ char buffer[DN_IFREQ_SIZE];
+ struct ifreq *ifr = (struct ifreq *)buffer;
+ struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr;
+ struct dn_dev *dn_db;
+ struct net_device *dev;
+ struct dn_ifaddr *ifa = NULL;
+ struct dn_ifaddr __rcu **ifap = NULL;
+ int ret = 0;
+
+ if (copy_from_user(ifr, arg, DN_IFREQ_SIZE))
+ return -EFAULT;
+ ifr->ifr_name[IFNAMSIZ-1] = 0;
+
+ dev_load(&init_net, ifr->ifr_name);
+
+ switch (cmd) {
+ case SIOCGIFADDR:
+ break;
+ case SIOCSIFADDR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EACCES;
+ if (sdn->sdn_family != AF_DECnet)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rtnl_lock();
+
+ if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) {
+ for (ifap = &dn_db->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
+ ifap = &ifa->ifa_next)
+ if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0)
+ break;
+ }
+
+ if (ifa == NULL && cmd != SIOCSIFADDR) {
+ ret = -EADDRNOTAVAIL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case SIOCGIFADDR:
+ *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCSIFADDR:
+ if (!ifa) {
+ if ((ifa = dn_dev_alloc_ifa()) == NULL) {
+ ret = -ENOBUFS;
+ break;
+ }
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ if (ifa->ifa_local == dn_saddr2dn(sdn))
+ break;
+ dn_dev_del_ifa(dn_db, ifap, 0);
+ }
+
+ ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn);
+
+ ret = dn_dev_set_ifa(dev, ifa);
+ }
+done:
+ rtnl_unlock();
+
+ return ret;
+rarok:
+ if (copy_to_user(arg, ifr, DN_IFREQ_SIZE))
+ ret = -EFAULT;
+ goto done;
+}
+
+struct net_device *dn_dev_get_default(void)
+{
+ struct net_device *dev;
+
+ spin_lock(&dndev_lock);
+ dev = decnet_default_device;
+ if (dev) {
+ if (dev->dn_ptr)
+ dev_hold(dev);
+ else
+ dev = NULL;
+ }
+ spin_unlock(&dndev_lock);
+
+ return dev;
+}
+
+int dn_dev_set_default(struct net_device *dev, int force)
+{
+ struct net_device *old = NULL;
+ int rv = -EBUSY;
+ if (!dev->dn_ptr)
+ return -ENODEV;
+
+ spin_lock(&dndev_lock);
+ if (force || decnet_default_device == NULL) {
+ old = decnet_default_device;
+ decnet_default_device = dev;
+ rv = 0;
+ }
+ spin_unlock(&dndev_lock);
+
+ if (old)
+ dev_put(old);
+ return rv;
+}
+
+static void dn_dev_check_default(struct net_device *dev)
+{
+ spin_lock(&dndev_lock);
+ if (dev == decnet_default_device) {
+ decnet_default_device = NULL;
+ } else {
+ dev = NULL;
+ }
+ spin_unlock(&dndev_lock);
+
+ if (dev)
+ dev_put(dev);
+}
+
+/*
+ * Called with RTNL
+ */
+static struct dn_dev *dn_dev_by_index(int ifindex)
+{
+ struct net_device *dev;
+ struct dn_dev *dn_dev = NULL;
+
+ dev = __dev_get_by_index(&init_net, ifindex);
+ if (dev)
+ dn_dev = rtnl_dereference(dev->dn_ptr);
+
+ return dn_dev;
+}
+
+static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = {
+ [IFA_ADDRESS] = { .type = NLA_U16 },
+ [IFA_LOCAL] = { .type = NLA_U16 },
+ [IFA_LABEL] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [IFA_FLAGS] = { .type = NLA_U32 },
+};
+
+static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct dn_dev *dn_db;
+ struct ifaddrmsg *ifm;
+ struct dn_ifaddr *ifa;
+ struct dn_ifaddr __rcu **ifap;
+ int err = -EINVAL;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!net_eq(net, &init_net))
+ goto errout;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -ENODEV;
+ ifm = nlmsg_data(nlh);
+ if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL)
+ goto errout;
+
+ err = -EADDRNOTAVAIL;
+ for (ifap = &dn_db->ifa_list;
+ (ifa = rtnl_dereference(*ifap)) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (tb[IFA_LOCAL] &&
+ nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2))
+ continue;
+
+ if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
+ continue;
+
+ dn_dev_del_ifa(dn_db, ifap, 1);
+ return 0;
+ }
+
+errout:
+ return err;
+}
+
+static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct net_device *dev;
+ struct dn_dev *dn_db;
+ struct ifaddrmsg *ifm;
+ struct dn_ifaddr *ifa;
+ int err;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!net_eq(net, &init_net))
+ return -EINVAL;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFA_LOCAL] == NULL)
+ return -EINVAL;
+
+ ifm = nlmsg_data(nlh);
+ if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL)
+ return -ENODEV;
+
+ if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) {
+ dn_db = dn_dev_create(dev, &err);
+ if (!dn_db)
+ return err;
+ }
+
+ if ((ifa = dn_dev_alloc_ifa()) == NULL)
+ return -ENOBUFS;
+
+ if (tb[IFA_ADDRESS] == NULL)
+ tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+ ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]);
+ ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]);
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ ifa->ifa_dev = dn_db;
+
+ if (tb[IFA_LABEL])
+ nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ err = dn_dev_insert_ifa(dn_db, ifa);
+ if (err)
+ dn_dev_free_ifa(ifa);
+
+ return err;
+}
+
+static inline size_t dn_ifaddr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(2) /* IFA_ADDRESS */
+ + nla_total_size(2) /* IFA_LOCAL */
+ + nla_total_size(4); /* IFA_FLAGS */
+}
+
+static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ u32 ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_DECnet;
+ ifm->ifa_prefixlen = 16;
+ ifm->ifa_flags = ifa_flags;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+
+ if ((ifa->ifa_address &&
+ nla_put_le16(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_le16(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa_flags))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
+}
+
+static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int idx, dn_idx = 0, skip_ndevs, skip_naddr;
+ struct net_device *dev;
+ struct dn_dev *dn_db;
+ struct dn_ifaddr *ifa;
+
+ if (!net_eq(net, &init_net))
+ return 0;
+
+ skip_ndevs = cb->args[0];
+ skip_naddr = cb->args[1];
+
+ idx = 0;
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (idx < skip_ndevs)
+ goto cont;
+ else if (idx > skip_ndevs) {
+ /* Only skip over addresses for first dev dumped
+ * in this iteration (idx == skip_ndevs) */
+ skip_naddr = 0;
+ }
+
+ if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL)
+ goto cont;
+
+ for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa;
+ ifa = rcu_dereference(ifa->ifa_next), dn_idx++) {
+ if (dn_idx < skip_naddr)
+ continue;
+
+ if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWADDR,
+ NLM_F_MULTI) < 0)
+ goto done;
+ }
+cont:
+ idx++;
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ cb->args[1] = dn_idx;
+
+ return skb->len;
+}
+
+static int dn_dev_get_first(struct net_device *dev, __le16 *addr)
+{
+ struct dn_dev *dn_db;
+ struct dn_ifaddr *ifa;
+ int rv = -ENODEV;
+
+ rcu_read_lock();
+ dn_db = rcu_dereference(dev->dn_ptr);
+ if (dn_db == NULL)
+ goto out;
+
+ ifa = rcu_dereference(dn_db->ifa_list);
+ if (ifa != NULL) {
+ *addr = ifa->ifa_local;
+ rv = 0;
+ }
+out:
+ rcu_read_unlock();
+ return rv;
+}
+
+/*
+ * Find a default address to bind to.
+ *
+ * This is one of those areas where the initial VMS concepts don't really
+ * map onto the Linux concepts, and since we introduced multiple addresses
+ * per interface we have to cope with slightly odd ways of finding out what
+ * "our address" really is. Mostly it's not a problem; for this we just guess
+ * a sensible default. Eventually the routing code will take care of all the
+ * nasties for us I hope.
+ */
+int dn_dev_bind_default(__le16 *addr)
+{
+ struct net_device *dev;
+ int rv;
+ dev = dn_dev_get_default();
+last_chance:
+ if (dev) {
+ rv = dn_dev_get_first(dev, addr);
+ dev_put(dev);
+ if (rv == 0 || dev == init_net.loopback_dev)
+ return rv;
+ }
+ dev = init_net.loopback_dev;
+ dev_hold(dev);
+ goto last_chance;
+}
+
+static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+ struct endnode_hello_message *msg;
+ struct sk_buff *skb = NULL;
+ __le16 *pktlen;
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+ if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL)
+ return;
+
+ skb->dev = dev;
+
+ msg = (struct endnode_hello_message *)skb_put(skb,sizeof(*msg));
+
+ msg->msgflg = 0x0D;
+ memcpy(msg->tiver, dn_eco_version, 3);
+ dn_dn2eth(msg->id, ifa->ifa_local);
+ msg->iinfo = DN_RT_INFO_ENDN;
+ msg->blksize = cpu_to_le16(mtu2blksize(dev));
+ msg->area = 0x00;
+ memset(msg->seed, 0, 8);
+ memcpy(msg->neighbor, dn_hiord, ETH_ALEN);
+
+ if (dn_db->router) {
+ struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
+ dn_dn2eth(msg->neighbor, dn->addr);
+ }
+
+ msg->timer = cpu_to_le16((unsigned short)dn_db->parms.t3);
+ msg->mpd = 0x00;
+ msg->datalen = 0x02;
+ memset(msg->data, 0xAA, 2);
+
+ pktlen = (__le16 *)skb_push(skb,2);
+ *pktlen = cpu_to_le16(skb->len - 2);
+
+ skb_reset_network_header(skb);
+
+ dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id);
+}
+
+
+#define DRDELAY (5 * HZ)
+
+static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa)
+{
+ /* First check time since device went up */
+ if (time_before(jiffies, dn_db->uptime + DRDELAY))
+ return 0;
+
+ /* If there is no router, then yes... */
+ if (!dn_db->router)
+ return 1;
+
+ /* otherwise only if we have a higher priority or.. */
+ if (dn->priority < dn_db->parms.priority)
+ return 1;
+
+ /* if we have equal priority and a higher node number */
+ if (dn->priority != dn_db->parms.priority)
+ return 0;
+
+ if (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local))
+ return 1;
+
+ return 0;
+}
+
+static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+ int n;
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+ struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
+ struct sk_buff *skb;
+ size_t size;
+ unsigned char *ptr;
+ unsigned char *i1, *i2;
+ __le16 *pktlen;
+ char *src;
+
+ if (mtu2blksize(dev) < (26 + 7))
+ return;
+
+ n = mtu2blksize(dev) - 26;
+ n /= 7;
+
+ if (n > 32)
+ n = 32;
+
+ size = 2 + 26 + 7 * n;
+
+ if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb->dev = dev;
+ ptr = skb_put(skb, size);
+
+ *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH;
+ *ptr++ = 2; /* ECO */
+ *ptr++ = 0;
+ *ptr++ = 0;
+ dn_dn2eth(ptr, ifa->ifa_local);
+ src = ptr;
+ ptr += ETH_ALEN;
+ *ptr++ = dn_db->parms.forwarding == 1 ?
+ DN_RT_INFO_L1RT : DN_RT_INFO_L2RT;
+ *((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev));
+ ptr += 2;
+ *ptr++ = dn_db->parms.priority; /* Priority */
+ *ptr++ = 0; /* Area: Reserved */
+ *((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3);
+ ptr += 2;
+ *ptr++ = 0; /* MPD: Reserved */
+ i1 = ptr++;
+ memset(ptr, 0, 7); /* Name: Reserved */
+ ptr += 7;
+ i2 = ptr++;
+
+ n = dn_neigh_elist(dev, ptr, n);
+
+ *i2 = 7 * n;
+ *i1 = 8 + *i2;
+
+ skb_trim(skb, (27 + *i2));
+
+ pktlen = (__le16 *)skb_push(skb, 2);
+ *pktlen = cpu_to_le16(skb->len - 2);
+
+ skb_reset_network_header(skb);
+
+ if (dn_am_i_a_router(dn, dn_db, ifa)) {
+ struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
+ if (skb2) {
+ dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src);
+ }
+ }
+
+ dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
+}
+
+static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+ if (dn_db->parms.forwarding == 0)
+ dn_send_endnode_hello(dev, ifa);
+ else
+ dn_send_router_hello(dev, ifa);
+}
+
+static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+ int tdlen = 16;
+ int size = dev->hard_header_len + 2 + 4 + tdlen;
+ struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC);
+ int i;
+ unsigned char *ptr;
+ char src[ETH_ALEN];
+
+ if (skb == NULL)
+ return ;
+
+ skb->dev = dev;
+ skb_push(skb, dev->hard_header_len);
+ ptr = skb_put(skb, 2 + 4 + tdlen);
+
+ *ptr++ = DN_RT_PKT_HELO;
+ *((__le16 *)ptr) = ifa->ifa_local;
+ ptr += 2;
+ *ptr++ = tdlen;
+
+ for(i = 0; i < tdlen; i++)
+ *ptr++ = 0252;
+
+ dn_dn2eth(src, ifa->ifa_local);
+ dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
+}
+
+static int dn_eth_up(struct net_device *dev)
+{
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+ if (dn_db->parms.forwarding == 0)
+ dev_mc_add(dev, dn_rt_all_end_mcast);
+ else
+ dev_mc_add(dev, dn_rt_all_rt_mcast);
+
+ dn_db->use_long = 1;
+
+ return 0;
+}
+
+static void dn_eth_down(struct net_device *dev)
+{
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+ if (dn_db->parms.forwarding == 0)
+ dev_mc_del(dev, dn_rt_all_end_mcast);
+ else
+ dev_mc_del(dev, dn_rt_all_rt_mcast);
+}
+
+static void dn_dev_set_timer(struct net_device *dev);
+
+static void dn_dev_timer_func(unsigned long arg)
+{
+ struct net_device *dev = (struct net_device *)arg;
+ struct dn_dev *dn_db;
+ struct dn_ifaddr *ifa;
+
+ rcu_read_lock();
+ dn_db = rcu_dereference(dev->dn_ptr);
+ if (dn_db->t3 <= dn_db->parms.t2) {
+ if (dn_db->parms.timer3) {
+ for (ifa = rcu_dereference(dn_db->ifa_list);
+ ifa;
+ ifa = rcu_dereference(ifa->ifa_next)) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+ dn_db->parms.timer3(dev, ifa);
+ }
+ }
+ dn_db->t3 = dn_db->parms.t3;
+ } else {
+ dn_db->t3 -= dn_db->parms.t2;
+ }
+ rcu_read_unlock();
+ dn_dev_set_timer(dev);
+}
+
+static void dn_dev_set_timer(struct net_device *dev)
+{
+ struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+ if (dn_db->parms.t2 > dn_db->parms.t3)
+ dn_db->parms.t2 = dn_db->parms.t3;
+
+ dn_db->timer.data = (unsigned long)dev;
+ dn_db->timer.function = dn_dev_timer_func;
+ dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ);
+
+ add_timer(&dn_db->timer);
+}
+
+static struct dn_dev *dn_dev_create(struct net_device *dev, int *err)
+{
+ int i;
+ struct dn_dev_parms *p = dn_dev_list;
+ struct dn_dev *dn_db;
+
+ for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) {
+ if (p->type == dev->type)
+ break;
+ }
+
+ *err = -ENODEV;
+ if (i == DN_DEV_LIST_SIZE)
+ return NULL;
+
+ *err = -ENOBUFS;
+ if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL)
+ return NULL;
+
+ memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms));
+
+ rcu_assign_pointer(dev->dn_ptr, dn_db);
+ dn_db->dev = dev;
+ init_timer(&dn_db->timer);
+
+ dn_db->uptime = jiffies;
+
+ dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table);
+ if (!dn_db->neigh_parms) {
+ RCU_INIT_POINTER(dev->dn_ptr, NULL);
+ kfree(dn_db);
+ return NULL;
+ }
+
+ if (dn_db->parms.up) {
+ if (dn_db->parms.up(dev) < 0) {
+ neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
+ dev->dn_ptr = NULL;
+ kfree(dn_db);
+ return NULL;
+ }
+ }
+
+ dn_dev_sysctl_register(dev, &dn_db->parms);
+
+ dn_dev_set_timer(dev);
+
+ *err = 0;
+ return dn_db;
+}
+
+
+/*
+ * This processes a device up event. We only start up
+ * the loopback device & ethernet devices with correct
+ * MAC addresses automatically. Others must be started
+ * specifically.
+ *
+ * FIXME: How should we configure the loopback address ? If we could dispense
+ * with using decnet_address here and for autobind, it will be one less thing
+ * for users to worry about setting up.
+ */
+
+void dn_dev_up(struct net_device *dev)
+{
+ struct dn_ifaddr *ifa;
+ __le16 addr = decnet_address;
+ int maybe_default = 0;
+ struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+
+ if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
+ return;
+
+ /*
+ * Need to ensure that loopback device has a dn_db attached to it
+ * to allow creation of neighbours against it, even though it might
+ * not have a local address of its own. Might as well do the same for
+ * all autoconfigured interfaces.
+ */
+ if (dn_db == NULL) {
+ int err;
+ dn_db = dn_dev_create(dev, &err);
+ if (dn_db == NULL)
+ return;
+ }
+
+ if (dev->type == ARPHRD_ETHER) {
+ if (memcmp(dev->dev_addr, dn_hiord, 4) != 0)
+ return;
+ addr = dn_eth2dn(dev->dev_addr);
+ maybe_default = 1;
+ }
+
+ if (addr == 0)
+ return;
+
+ if ((ifa = dn_dev_alloc_ifa()) == NULL)
+ return;
+
+ ifa->ifa_local = ifa->ifa_address = addr;
+ ifa->ifa_flags = 0;
+ ifa->ifa_scope = RT_SCOPE_UNIVERSE;
+ strcpy(ifa->ifa_label, dev->name);
+
+ dn_dev_set_ifa(dev, ifa);
+
+ /*
+ * Automagically set the default device to the first automatically
+ * configured ethernet card in the system.
+ */
+ if (maybe_default) {
+ dev_hold(dev);
+ if (dn_dev_set_default(dev, 0))
+ dev_put(dev);
+ }
+}
+
+static void dn_dev_delete(struct net_device *dev)
+{
+ struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+
+ if (dn_db == NULL)
+ return;
+
+ del_timer_sync(&dn_db->timer);
+ dn_dev_sysctl_unregister(&dn_db->parms);
+ dn_dev_check_default(dev);
+ neigh_ifdown(&dn_neigh_table, dev);
+
+ if (dn_db->parms.down)
+ dn_db->parms.down(dev);
+
+ dev->dn_ptr = NULL;
+
+ neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
+ neigh_ifdown(&dn_neigh_table, dev);
+
+ if (dn_db->router)
+ neigh_release(dn_db->router);
+ if (dn_db->peer)
+ neigh_release(dn_db->peer);
+
+ kfree(dn_db);
+}
+
+void dn_dev_down(struct net_device *dev)
+{
+ struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+ struct dn_ifaddr *ifa;
+
+ if (dn_db == NULL)
+ return;
+
+ while ((ifa = rtnl_dereference(dn_db->ifa_list)) != NULL) {
+ dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0);
+ dn_dev_free_ifa(ifa);
+ }
+
+ dn_dev_delete(dev);
+}
+
+void dn_dev_init_pkt(struct sk_buff *skb)
+{
+}
+
+void dn_dev_veri_pkt(struct sk_buff *skb)
+{
+}
+
+void dn_dev_hello(struct sk_buff *skb)
+{
+}
+
+void dn_dev_devices_off(void)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev)
+ dn_dev_down(dev);
+ rtnl_unlock();
+
+}
+
+void dn_dev_devices_on(void)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ if (dev->flags & IFF_UP)
+ dn_dev_up(dev);
+ }
+ rtnl_unlock();
+}
+
+int register_dnaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&dnaddr_chain, nb);
+}
+
+int unregister_dnaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
+}
+
+#ifdef CONFIG_PROC_FS
+static inline int is_dn_dev(struct net_device *dev)
+{
+ return dev->dn_ptr != NULL;
+}
+
+static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ int i;
+ struct net_device *dev;
+
+ rcu_read_lock();
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ i = 1;
+ for_each_netdev_rcu(&init_net, dev) {
+ if (!is_dn_dev(dev))
+ continue;
+
+ if (i++ == *pos)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net_device *dev;
+
+ ++*pos;
+
+ dev = v;
+ if (v == SEQ_START_TOKEN)
+ dev = net_device_entry(&init_net.dev_base_head);
+
+ for_each_netdev_continue_rcu(&init_net, dev) {
+ if (!is_dn_dev(dev))
+ continue;
+
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void dn_dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static char *dn_type2asc(char type)
+{
+ switch (type) {
+ case DN_DEV_BCAST:
+ return "B";
+ case DN_DEV_UCAST:
+ return "U";
+ case DN_DEV_MPOINT:
+ return "M";
+ }
+
+ return "?";
+}
+
+static int dn_dev_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n");
+ else {
+ struct net_device *dev = v;
+ char peer_buf[DN_ASCBUF_LEN];
+ char router_buf[DN_ASCBUF_LEN];
+ struct dn_dev *dn_db = rcu_dereference(dev->dn_ptr);
+
+ seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu"
+ " %04hu %03d %02x %-10s %-7s %-7s\n",
+ dev->name ? dev->name : "???",
+ dn_type2asc(dn_db->parms.mode),
+ 0, 0,
+ dn_db->t3, dn_db->parms.t3,
+ mtu2blksize(dev),
+ dn_db->parms.priority,
+ dn_db->parms.state, dn_db->parms.name,
+ dn_db->router ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "",
+ dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : "");
+ }
+ return 0;
+}
+
+static const struct seq_operations dn_dev_seq_ops = {
+ .start = dn_dev_seq_start,
+ .next = dn_dev_seq_next,
+ .stop = dn_dev_seq_stop,
+ .show = dn_dev_seq_show,
+};
+
+static int dn_dev_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &dn_dev_seq_ops);
+}
+
+static const struct file_operations dn_dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dn_dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int addr[2];
+module_param_array(addr, int, NULL, 0444);
+MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node");
+
+void __init dn_dev_init(void)
+{
+ if (addr[0] > 63 || addr[0] < 0) {
+ printk(KERN_ERR "DECnet: Area must be between 0 and 63");
+ return;
+ }
+
+ if (addr[1] > 1023 || addr[1] < 0) {
+ printk(KERN_ERR "DECnet: Node must be between 0 and 1023");
+ return;
+ }
+
+ decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]);
+
+ dn_dev_devices_on();
+
+ rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, NULL);
+ rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, NULL);
+ rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, NULL);
+
+ proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops);
+
+#ifdef CONFIG_SYSCTL
+ {
+ int i;
+ for(i = 0; i < DN_DEV_LIST_SIZE; i++)
+ dn_dev_sysctl_register(NULL, &dn_dev_list[i]);
+ }
+#endif /* CONFIG_SYSCTL */
+}
+
+void __exit dn_dev_cleanup(void)
+{
+#ifdef CONFIG_SYSCTL
+ {
+ int i;
+ for(i = 0; i < DN_DEV_LIST_SIZE; i++)
+ dn_dev_sysctl_unregister(&dn_dev_list[i]);
+ }
+#endif /* CONFIG_SYSCTL */
+
+ remove_proc_entry("decnet_dev", init_net.proc_net);
+
+ dn_dev_devices_off();
+}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
new file mode 100644
index 000000000..df4803437
--- /dev/null
+++ b/net/decnet/dn_fib.c
@@ -0,0 +1,791 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Routing Forwarding Information Base (Glue/Info List)
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ * Alexey Kuznetsov : SMP locking changes
+ * Steve Whitehouse : Rewrote it... Well to be more correct, I
+ * copied most of it from the ipv4 fib code.
+ * Steve Whitehouse : Updated it in style and fixed a few bugs
+ * which were fixed in the ipv4 code since
+ * this code was copied from it.
+ *
+ */
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/uaccess.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+
+#define RT_MIN_TABLE 1
+
+#define for_fib_info() { struct dn_fib_info *fi;\
+ for(fi = dn_fib_info_list; fi; fi = fi->fib_next)
+#define endfor_fib_info() }
+
+#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
+ for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\
+ for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define endfor_nexthops(fi) }
+
+static DEFINE_SPINLOCK(dn_fib_multipath_lock);
+static struct dn_fib_info *dn_fib_info_list;
+static DEFINE_SPINLOCK(dn_fib_info_lock);
+
+static struct
+{
+ int error;
+ u8 scope;
+} dn_fib_props[RTN_MAX+1] = {
+ [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
+ [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE },
+ [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST },
+ [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+ [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+ [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+ [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE },
+ [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
+ [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE },
+ [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE },
+ [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
+ [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+};
+
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force);
+static int dn_fib_sync_up(struct net_device *dev);
+
+void dn_fib_free_info(struct dn_fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ printk(KERN_DEBUG "DECnet: BUG! Attempt to free alive dn_fib_info\n");
+ return;
+ }
+
+ change_nexthops(fi) {
+ if (nh->nh_dev)
+ dev_put(nh->nh_dev);
+ nh->nh_dev = NULL;
+ } endfor_nexthops(fi);
+ kfree(fi);
+}
+
+void dn_fib_release_info(struct dn_fib_info *fi)
+{
+ spin_lock(&dn_fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
+ if (fi->fib_next)
+ fi->fib_next->fib_prev = fi->fib_prev;
+ if (fi->fib_prev)
+ fi->fib_prev->fib_next = fi->fib_next;
+ if (fi == dn_fib_info_list)
+ dn_fib_info_list = fi->fib_next;
+ fi->fib_dead = 1;
+ dn_fib_info_put(fi);
+ }
+ spin_unlock(&dn_fib_info_lock);
+}
+
+static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi)
+{
+ const struct dn_fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+ nh->nh_weight != onh->nh_weight ||
+ ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi)
+{
+ for_fib_info() {
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 &&
+ ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0))
+ return fi;
+ } endfor_fib_info();
+ return NULL;
+}
+
+static int dn_fib_count_nhs(const struct nlattr *attr)
+{
+ struct rtnexthop *nhp = nla_data(attr);
+ int nhs = 0, nhlen = nla_len(attr);
+
+ while(nhlen >= (int)sizeof(struct rtnexthop)) {
+ if ((nhlen -= nhp->rtnh_len) < 0)
+ return 0;
+ nhs++;
+ nhp = RTNH_NEXT(nhp);
+ }
+
+ return nhs;
+}
+
+static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
+ const struct rtmsg *r)
+{
+ struct rtnexthop *nhp = nla_data(attr);
+ int nhlen = nla_len(attr);
+
+ change_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+
+ nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+ nh->nh_oif = nhp->rtnh_ifindex;
+ nh->nh_weight = nhp->rtnh_hops + 1;
+
+ if (attrlen) {
+ struct nlattr *gw_attr;
+
+ gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
+ nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0;
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+
+ return 0;
+}
+
+
+static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh)
+{
+ int err;
+
+ if (nh->nh_gw) {
+ struct flowidn fld;
+ struct dn_fib_res res;
+
+ if (nh->nh_flags&RTNH_F_ONLINK) {
+ struct net_device *dev;
+
+ if (r->rtm_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
+ return -ENODEV;
+ if (!(dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(dev);
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+
+ memset(&fld, 0, sizeof(fld));
+ fld.daddr = nh->nh_gw;
+ fld.flowidn_oif = nh->nh_oif;
+ fld.flowidn_scope = r->rtm_scope + 1;
+
+ if (fld.flowidn_scope < RT_SCOPE_LINK)
+ fld.flowidn_scope = RT_SCOPE_LINK;
+
+ if ((err = dn_fib_lookup(&fld, &res)) != 0)
+ return err;
+
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = DN_FIB_RES_OIF(res);
+ nh->nh_dev = DN_FIB_RES_DEV(res);
+ if (nh->nh_dev == NULL)
+ goto out;
+ dev_hold(nh->nh_dev);
+ err = -ENETDOWN;
+ if (!(nh->nh_dev->flags & IFF_UP))
+ goto out;
+ err = 0;
+out:
+ dn_fib_res_put(&res);
+ return err;
+ } else {
+ struct net_device *dev;
+
+ if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ return -EINVAL;
+
+ dev = __dev_get_by_index(&init_net, nh->nh_oif);
+ if (dev == NULL || dev->dn_ptr == NULL)
+ return -ENODEV;
+ if (!(dev->flags&IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(nh->nh_dev);
+ nh->nh_scope = RT_SCOPE_HOST;
+ }
+
+ return 0;
+}
+
+
+struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[],
+ const struct nlmsghdr *nlh, int *errp)
+{
+ int err;
+ struct dn_fib_info *fi = NULL;
+ struct dn_fib_info *ofi;
+ int nhs = 1;
+
+ if (r->rtm_type > RTN_MAX)
+ goto err_inval;
+
+ if (dn_fib_props[r->rtm_type].scope > r->rtm_scope)
+ goto err_inval;
+
+ if (attrs[RTA_MULTIPATH] &&
+ (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0)
+ goto err_inval;
+
+ fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL);
+ err = -ENOBUFS;
+ if (fi == NULL)
+ goto failure;
+
+ fi->fib_protocol = r->rtm_protocol;
+ fi->fib_nhs = nhs;
+ fi->fib_flags = r->rtm_flags;
+
+ if (attrs[RTA_PRIORITY])
+ fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]);
+
+ if (attrs[RTA_METRICS]) {
+ struct nlattr *attr;
+ int rem;
+
+ nla_for_each_nested(attr, attrs[RTA_METRICS], rem) {
+ int type = nla_type(attr);
+
+ if (type) {
+ if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
+ nla_len(attr) < 4)
+ goto err_inval;
+
+ fi->fib_metrics[type-1] = nla_get_u32(attr);
+ }
+ }
+ }
+
+ if (attrs[RTA_PREFSRC])
+ fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]);
+
+ if (attrs[RTA_MULTIPATH]) {
+ if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0)
+ goto failure;
+
+ if (attrs[RTA_OIF] &&
+ fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF]))
+ goto err_inval;
+
+ if (attrs[RTA_GATEWAY] &&
+ fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY]))
+ goto err_inval;
+ } else {
+ struct dn_fib_nh *nh = fi->fib_nh;
+
+ if (attrs[RTA_OIF])
+ nh->nh_oif = nla_get_u32(attrs[RTA_OIF]);
+
+ if (attrs[RTA_GATEWAY])
+ nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
+
+ nh->nh_flags = r->rtm_flags;
+ nh->nh_weight = 1;
+ }
+
+ if (r->rtm_type == RTN_NAT) {
+ if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF])
+ goto err_inval;
+
+ fi->fib_nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
+ goto link_it;
+ }
+
+ if (dn_fib_props[r->rtm_type].error) {
+ if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH])
+ goto err_inval;
+
+ goto link_it;
+ }
+
+ if (r->rtm_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (r->rtm_scope == RT_SCOPE_HOST) {
+ struct dn_fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (nh->nh_dev == NULL)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ if ((err = dn_fib_check_nh(r, fi, nh)) != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] ||
+ fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST]))
+ if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+link_it:
+ if ((ofi = dn_fib_find_info(fi)) != NULL) {
+ fi->fib_dead = 1;
+ dn_fib_free_info(fi);
+ ofi->fib_treeref++;
+ return ofi;
+ }
+
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ spin_lock(&dn_fib_info_lock);
+ fi->fib_next = dn_fib_info_list;
+ fi->fib_prev = NULL;
+ if (dn_fib_info_list)
+ dn_fib_info_list->fib_prev = fi;
+ dn_fib_info_list = fi;
+ spin_unlock(&dn_fib_info_lock);
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ *errp = err;
+ if (fi) {
+ fi->fib_dead = 1;
+ dn_fib_free_info(fi);
+ }
+
+ return NULL;
+}
+
+int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res)
+{
+ int err = dn_fib_props[type].error;
+
+ if (err == 0) {
+ if (fi->fib_flags & RTNH_F_DEAD)
+ return 1;
+
+ res->fi = fi;
+
+ switch (type) {
+ case RTN_NAT:
+ DN_FIB_RES_RESET(*res);
+ atomic_inc(&fi->fib_clntref);
+ return 0;
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (!fld->flowidn_oif ||
+ fld->flowidn_oif == nh->nh_oif)
+ break;
+ }
+ if (nhsel < fi->fib_nhs) {
+ res->nh_sel = nhsel;
+ atomic_inc(&fi->fib_clntref);
+ return 0;
+ }
+ endfor_nexthops(fi);
+ res->fi = NULL;
+ return 1;
+ default:
+ net_err_ratelimited("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n",
+ type);
+ res->fi = NULL;
+ return -EINVAL;
+ }
+ }
+ return err;
+}
+
+void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res)
+{
+ struct dn_fib_info *fi = res->fi;
+ int w;
+
+ spin_lock_bh(&dn_fib_multipath_lock);
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ power += nh->nh_weight;
+ nh->nh_power = nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+ if (power < 0) {
+ spin_unlock_bh(&dn_fib_multipath_lock);
+ res->nh_sel = 0;
+ return;
+ }
+ }
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+ if ((w -= nh->nh_power) <= 0) {
+ nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ spin_unlock_bh(&dn_fib_multipath_lock);
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+ res->nh_sel = 0;
+ spin_unlock_bh(&dn_fib_multipath_lock);
+}
+
+static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table)
+{
+ if (attrs[RTA_TABLE])
+ table = nla_get_u32(attrs[RTA_TABLE]);
+
+ return table;
+}
+
+static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct dn_fib_table *tb;
+ struct rtmsg *r = nlmsg_data(nlh);
+ struct nlattr *attrs[RTA_MAX+1];
+ int err;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!net_eq(net, &init_net))
+ return -EINVAL;
+
+ err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
+
+ tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0);
+ if (!tb)
+ return -ESRCH;
+
+ return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb));
+}
+
+static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct dn_fib_table *tb;
+ struct rtmsg *r = nlmsg_data(nlh);
+ struct nlattr *attrs[RTA_MAX+1];
+ int err;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!net_eq(net, &init_net))
+ return -EINVAL;
+
+ err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
+
+ tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1);
+ if (!tb)
+ return -ENOBUFS;
+
+ return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb));
+}
+
+static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
+{
+ struct dn_fib_table *tb;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtmsg rtm;
+ } req;
+ struct {
+ struct nlattr hdr;
+ __le16 dst;
+ } dst_attr = {
+ .dst = dst,
+ };
+ struct {
+ struct nlattr hdr;
+ __le16 prefsrc;
+ } prefsrc_attr = {
+ .prefsrc = ifa->ifa_local,
+ };
+ struct {
+ struct nlattr hdr;
+ u32 oif;
+ } oif_attr = {
+ .oif = ifa->ifa_dev->dev->ifindex,
+ };
+ struct nlattr *attrs[RTA_MAX+1] = {
+ [RTA_DST] = (struct nlattr *) &dst_attr,
+ [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr,
+ [RTA_OIF] = (struct nlattr *) &oif_attr,
+ };
+
+ memset(&req.rtm, 0, sizeof(req.rtm));
+
+ if (type == RTN_UNICAST)
+ tb = dn_fib_get_table(RT_MIN_TABLE, 1);
+ else
+ tb = dn_fib_get_table(RT_TABLE_LOCAL, 1);
+
+ if (tb == NULL)
+ return;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = cmd;
+ req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+ req.nlh.nlmsg_pid = 0;
+ req.nlh.nlmsg_seq = 0;
+
+ req.rtm.rtm_dst_len = dst_len;
+ req.rtm.rtm_table = tb->n;
+ req.rtm.rtm_protocol = RTPROT_KERNEL;
+ req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+ req.rtm.rtm_type = type;
+
+ if (cmd == RTM_NEWROUTE)
+ tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL);
+ else
+ tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL);
+}
+
+static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa)
+{
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
+
+#if 0
+ if (!(dev->flags&IFF_UP))
+ return;
+ /* In the future, we will want to add default routes here */
+
+#endif
+}
+
+static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
+{
+ int found_it = 0;
+ struct net_device *dev;
+ struct dn_dev *dn_db;
+ struct dn_ifaddr *ifa2;
+
+ ASSERT_RTNL();
+
+ /* Scan device list */
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ dn_db = rcu_dereference(dev->dn_ptr);
+ if (dn_db == NULL)
+ continue;
+ for (ifa2 = rcu_dereference(dn_db->ifa_list);
+ ifa2 != NULL;
+ ifa2 = rcu_dereference(ifa2->ifa_next)) {
+ if (ifa2->ifa_local == ifa->ifa_local) {
+ found_it = 1;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ if (found_it == 0) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
+
+ if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+ if (dn_fib_sync_down(ifa->ifa_local, NULL, 0))
+ dn_fib_flush();
+ }
+ }
+}
+
+static void dn_fib_disable_addr(struct net_device *dev, int force)
+{
+ if (dn_fib_sync_down(0, dev, force))
+ dn_fib_flush();
+ dn_rt_cache_flush(0);
+ neigh_ifdown(&dn_neigh_table, dev);
+}
+
+static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr;
+
+ switch (event) {
+ case NETDEV_UP:
+ dn_fib_add_ifaddr(ifa);
+ dn_fib_sync_up(ifa->ifa_dev->dev);
+ dn_rt_cache_flush(-1);
+ break;
+ case NETDEV_DOWN:
+ dn_fib_del_ifaddr(ifa);
+ if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ dn_fib_disable_addr(ifa->ifa_dev->dev, 1);
+ } else {
+ dn_rt_cache_flush(-1);
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+
+ if (force)
+ scope = -1;
+
+ for_fib_info() {
+ /*
+ * This makes no sense for DECnet.... we will almost
+ * certainly have more than one local address the same
+ * over all our interfaces. It needs thinking about
+ * some more.
+ */
+ if (local && fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ } else if (dev && fi->fib_nhs) {
+ int dead = 0;
+
+ change_nexthops(fi) {
+ if (nh->nh_flags&RTNH_F_DEAD)
+ dead++;
+ else if (nh->nh_dev == dev &&
+ nh->nh_scope != scope) {
+ spin_lock_bh(&dn_fib_multipath_lock);
+ nh->nh_flags |= RTNH_F_DEAD;
+ fi->fib_power -= nh->nh_power;
+ nh->nh_power = 0;
+ spin_unlock_bh(&dn_fib_multipath_lock);
+ dead++;
+ }
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ } endfor_fib_info();
+ return ret;
+}
+
+
+static int dn_fib_sync_up(struct net_device *dev)
+{
+ int ret = 0;
+
+ if (!(dev->flags&IFF_UP))
+ return 0;
+
+ for_fib_info() {
+ int alive = 0;
+
+ change_nexthops(fi) {
+ if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ continue;
+ if (nh->nh_dev != dev || dev->dn_ptr == NULL)
+ continue;
+ alive++;
+ spin_lock_bh(&dn_fib_multipath_lock);
+ nh->nh_power = 0;
+ nh->nh_flags &= ~RTNH_F_DEAD;
+ spin_unlock_bh(&dn_fib_multipath_lock);
+ } endfor_nexthops(fi);
+
+ if (alive > 0) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ } endfor_fib_info();
+ return ret;
+}
+
+static struct notifier_block dn_fib_dnaddr_notifier = {
+ .notifier_call = dn_fib_dnaddr_event,
+};
+
+void __exit dn_fib_cleanup(void)
+{
+ dn_fib_table_cleanup();
+ dn_fib_rules_cleanup();
+
+ unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier);
+}
+
+
+void __init dn_fib_init(void)
+{
+ dn_fib_table_init();
+ dn_fib_rules_init();
+
+ register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
+
+ rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, NULL);
+}
+
+
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
new file mode 100644
index 000000000..4507b188f
--- /dev/null
+++ b/net/decnet/dn_neigh.c
@@ -0,0 +1,616 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Neighbour Functions (Adjacency Database and
+ * On-Ethernet Cache)
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ * Steve Whitehouse : Fixed router listing routine
+ * Steve Whitehouse : Added error_report functions
+ * Steve Whitehouse : Added default router detection
+ * Steve Whitehouse : Hop counts in outgoing messages
+ * Steve Whitehouse : Fixed src/dst in outgoing messages so
+ * forwarding now stands a good chance of
+ * working.
+ * Steve Whitehouse : Fixed neighbour states (for now anyway).
+ * Steve Whitehouse : Made error_report functions dummies. This
+ * is not the right place to return skbs.
+ * Steve Whitehouse : Convert to seq_file
+ *
+ */
+
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+#include <linux/netfilter_decnet.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/atomic.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_neigh.h>
+#include <net/dn_route.h>
+
+static int dn_neigh_construct(struct neighbour *);
+static void dn_neigh_error_report(struct neighbour *, struct sk_buff *);
+static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb);
+
+/*
+ * Operations for adding the link layer header.
+ */
+static const struct neigh_ops dn_neigh_ops = {
+ .family = AF_DECnet,
+ .error_report = dn_neigh_error_report,
+ .output = dn_neigh_output,
+ .connected_output = dn_neigh_output,
+};
+
+static u32 dn_neigh_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
+{
+ return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]);
+}
+
+static bool dn_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq16(neigh, pkey);
+}
+
+struct neigh_table dn_neigh_table = {
+ .family = PF_DECnet,
+ .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)),
+ .key_len = sizeof(__le16),
+ .protocol = cpu_to_be16(ETH_P_DNA_RT),
+ .hash = dn_neigh_hash,
+ .key_eq = dn_key_eq,
+ .constructor = dn_neigh_construct,
+ .id = "dn_neigh_cache",
+ .parms ={
+ .tbl = &dn_neigh_table,
+ .reachable_time = 30 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 0,
+ [NEIGH_VAR_UCAST_PROBES] = 0,
+ [NEIGH_VAR_APP_PROBES] = 0,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024,
+ [NEIGH_VAR_PROXY_QLEN] = 0,
+ [NEIGH_VAR_ANYCAST_DELAY] = 0,
+ [NEIGH_VAR_PROXY_DELAY] = 0,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+
+static int dn_neigh_construct(struct neighbour *neigh)
+{
+ struct net_device *dev = neigh->dev;
+ struct dn_neigh *dn = (struct dn_neigh *)neigh;
+ struct dn_dev *dn_db;
+ struct neigh_parms *parms;
+
+ rcu_read_lock();
+ dn_db = rcu_dereference(dev->dn_ptr);
+ if (dn_db == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ parms = dn_db->neigh_parms;
+ if (!parms) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+ rcu_read_unlock();
+
+ neigh->ops = &dn_neigh_ops;
+ neigh->nud_state = NUD_NOARP;
+ neigh->output = neigh->ops->connected_output;
+
+ if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT))
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK))
+ dn_dn2eth(neigh->ha, dn->addr);
+ else {
+ net_dbg_ratelimited("Trying to create neigh for hw %d\n",
+ dev->type);
+ return -EINVAL;
+ }
+
+ /*
+ * Make an estimate of the remote block size by assuming that its
+ * two less then the device mtu, which it true for ethernet (and
+ * other things which support long format headers) since there is
+ * an extra length field (of 16 bits) which isn't part of the
+ * ethernet headers and which the DECnet specs won't admit is part
+ * of the DECnet routing headers either.
+ *
+ * If we over estimate here its no big deal, the NSP negotiations
+ * will prevent us from sending packets which are too large for the
+ * remote node to handle. In any case this figure is normally updated
+ * by a hello message in most cases.
+ */
+ dn->blksize = dev->mtu - 2;
+
+ return 0;
+}
+
+static void dn_neigh_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ printk(KERN_DEBUG "dn_neigh_error_report: called\n");
+ kfree_skb(skb);
+}
+
+static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct dn_route *rt = (struct dn_route *)dst;
+ struct net_device *dev = neigh->dev;
+ char mac_addr[ETH_ALEN];
+ unsigned int seq;
+ int err;
+
+ dn_dn2eth(mac_addr, rt->rt_local_src);
+ do {
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, mac_addr, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
+ if (err >= 0)
+ err = dev_queue_xmit(skb);
+ else {
+ kfree_skb(skb);
+ err = -EINVAL;
+ }
+ return err;
+}
+
+static int dn_neigh_output_packet(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct dn_route *rt = (struct dn_route *)dst;
+ struct neighbour *neigh = rt->n;
+
+ return neigh->output(neigh, skb);
+}
+
+/*
+ * For talking to broadcast devices: Ethernet & PPP
+ */
+static int dn_long_output(struct neighbour *neigh, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net_device *dev = neigh->dev;
+ int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3;
+ unsigned char *data;
+ struct dn_long_packet *lp;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+
+ if (skb_headroom(skb) < headroom) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+ if (skb2 == NULL) {
+ net_crit_ratelimited("dn_long_output: no memory\n");
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ consume_skb(skb);
+ skb = skb2;
+ net_info_ratelimited("dn_long_output: Increasing headroom\n");
+ }
+
+ data = skb_push(skb, sizeof(struct dn_long_packet) + 3);
+ lp = (struct dn_long_packet *)(data+3);
+
+ *((__le16 *)data) = cpu_to_le16(skb->len - 2);
+ *(data + 2) = 1 | DN_RT_F_PF; /* Padding */
+
+ lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS));
+ lp->d_area = lp->d_subarea = 0;
+ dn_dn2eth(lp->d_id, cb->dst);
+ lp->s_area = lp->s_subarea = 0;
+ dn_dn2eth(lp->s_id, cb->src);
+ lp->nl2 = 0;
+ lp->visit_ct = cb->hops & 0x3f;
+ lp->s_class = 0;
+ lp->pt = 0;
+
+ skb_reset_network_header(skb);
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
+ NULL, neigh->dev, dn_neigh_output_packet);
+}
+
+/*
+ * For talking to pointopoint and multidrop devices: DDCMP and X.25
+ */
+static int dn_short_output(struct neighbour *neigh, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net_device *dev = neigh->dev;
+ int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
+ struct dn_short_packet *sp;
+ unsigned char *data;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+
+ if (skb_headroom(skb) < headroom) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+ if (skb2 == NULL) {
+ net_crit_ratelimited("dn_short_output: no memory\n");
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ consume_skb(skb);
+ skb = skb2;
+ net_info_ratelimited("dn_short_output: Increasing headroom\n");
+ }
+
+ data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
+ *((__le16 *)data) = cpu_to_le16(skb->len - 2);
+ sp = (struct dn_short_packet *)(data+2);
+
+ sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
+ sp->dstnode = cb->dst;
+ sp->srcnode = cb->src;
+ sp->forward = cb->hops & 0x3f;
+
+ skb_reset_network_header(skb);
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
+ NULL, neigh->dev, dn_neigh_output_packet);
+}
+
+/*
+ * For talking to DECnet phase III nodes
+ * Phase 3 output is the same as short output, execpt that
+ * it clears the area bits before transmission.
+ */
+static int dn_phase3_output(struct neighbour *neigh, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net_device *dev = neigh->dev;
+ int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
+ struct dn_short_packet *sp;
+ unsigned char *data;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+ if (skb_headroom(skb) < headroom) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+ if (skb2 == NULL) {
+ net_crit_ratelimited("dn_phase3_output: no memory\n");
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ consume_skb(skb);
+ skb = skb2;
+ net_info_ratelimited("dn_phase3_output: Increasing headroom\n");
+ }
+
+ data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
+ *((__le16 *)data) = cpu_to_le16(skb->len - 2);
+ sp = (struct dn_short_packet *)(data + 2);
+
+ sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
+ sp->dstnode = cb->dst & cpu_to_le16(0x03ff);
+ sp->srcnode = cb->src & cpu_to_le16(0x03ff);
+ sp->forward = cb->hops & 0x3f;
+
+ skb_reset_network_header(skb);
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, sk, skb,
+ NULL, neigh->dev, dn_neigh_output_packet);
+}
+
+int dn_to_neigh_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct dn_route *rt = (struct dn_route *) dst;
+ struct neighbour *neigh = rt->n;
+ struct dn_neigh *dn = (struct dn_neigh *)neigh;
+ struct dn_dev *dn_db;
+ bool use_long;
+
+ rcu_read_lock();
+ dn_db = rcu_dereference(neigh->dev->dn_ptr);
+ if (dn_db == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ use_long = dn_db->use_long;
+ rcu_read_unlock();
+
+ if (dn->flags & DN_NDFLAG_P3)
+ return dn_phase3_output(neigh, sk, skb);
+ if (use_long)
+ return dn_long_output(neigh, sk, skb);
+ else
+ return dn_short_output(neigh, sk, skb);
+}
+
+/*
+ * Unfortunately, the neighbour code uses the device in its hash
+ * function, so we don't get any advantage from it. This function
+ * basically does a neigh_lookup(), but without comparing the device
+ * field. This is required for the On-Ethernet cache
+ */
+
+/*
+ * Pointopoint link receives a hello message
+ */
+void dn_neigh_pointopoint_hello(struct sk_buff *skb)
+{
+ kfree_skb(skb);
+}
+
+/*
+ * Ethernet router hello message received
+ */
+int dn_neigh_router_hello(struct sock *sk, struct sk_buff *skb)
+{
+ struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data;
+
+ struct neighbour *neigh;
+ struct dn_neigh *dn;
+ struct dn_dev *dn_db;
+ __le16 src;
+
+ src = dn_eth2dn(msg->id);
+
+ neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
+
+ dn = (struct dn_neigh *)neigh;
+
+ if (neigh) {
+ write_lock(&neigh->lock);
+
+ neigh->used = jiffies;
+ dn_db = rcu_dereference(neigh->dev->dn_ptr);
+
+ if (!(neigh->nud_state & NUD_PERMANENT)) {
+ neigh->updated = jiffies;
+
+ if (neigh->dev->type == ARPHRD_ETHER)
+ memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
+
+ dn->blksize = le16_to_cpu(msg->blksize);
+ dn->priority = msg->priority;
+
+ dn->flags &= ~DN_NDFLAG_P3;
+
+ switch (msg->iinfo & DN_RT_INFO_TYPE) {
+ case DN_RT_INFO_L1RT:
+ dn->flags &=~DN_NDFLAG_R2;
+ dn->flags |= DN_NDFLAG_R1;
+ break;
+ case DN_RT_INFO_L2RT:
+ dn->flags |= DN_NDFLAG_R2;
+ }
+ }
+
+ /* Only use routers in our area */
+ if ((le16_to_cpu(src)>>10) == (le16_to_cpu((decnet_address))>>10)) {
+ if (!dn_db->router) {
+ dn_db->router = neigh_clone(neigh);
+ } else {
+ if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
+ neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+ }
+ }
+ write_unlock(&neigh->lock);
+ neigh_release(neigh);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * Endnode hello message received
+ */
+int dn_neigh_endnode_hello(struct sock *sk, struct sk_buff *skb)
+{
+ struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
+ struct neighbour *neigh;
+ struct dn_neigh *dn;
+ __le16 src;
+
+ src = dn_eth2dn(msg->id);
+
+ neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
+
+ dn = (struct dn_neigh *)neigh;
+
+ if (neigh) {
+ write_lock(&neigh->lock);
+
+ neigh->used = jiffies;
+
+ if (!(neigh->nud_state & NUD_PERMANENT)) {
+ neigh->updated = jiffies;
+
+ if (neigh->dev->type == ARPHRD_ETHER)
+ memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
+ dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2);
+ dn->blksize = le16_to_cpu(msg->blksize);
+ dn->priority = 0;
+ }
+
+ write_unlock(&neigh->lock);
+ neigh_release(neigh);
+ }
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static char *dn_find_slot(char *base, int max, int priority)
+{
+ int i;
+ unsigned char *min = NULL;
+
+ base += 6; /* skip first id */
+
+ for(i = 0; i < max; i++) {
+ if (!min || (*base < *min))
+ min = base;
+ base += 7; /* find next priority */
+ }
+
+ if (!min)
+ return NULL;
+
+ return (*min < priority) ? (min - 6) : NULL;
+}
+
+struct elist_cb_state {
+ struct net_device *dev;
+ unsigned char *ptr;
+ unsigned char *rs;
+ int t, n;
+};
+
+static void neigh_elist_cb(struct neighbour *neigh, void *_info)
+{
+ struct elist_cb_state *s = _info;
+ struct dn_neigh *dn;
+
+ if (neigh->dev != s->dev)
+ return;
+
+ dn = (struct dn_neigh *) neigh;
+ if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2)))
+ return;
+
+ if (s->t == s->n)
+ s->rs = dn_find_slot(s->ptr, s->n, dn->priority);
+ else
+ s->t++;
+ if (s->rs == NULL)
+ return;
+
+ dn_dn2eth(s->rs, dn->addr);
+ s->rs += 6;
+ *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 0x80 : 0x0;
+ *(s->rs) |= dn->priority;
+ s->rs++;
+}
+
+int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n)
+{
+ struct elist_cb_state state;
+
+ state.dev = dev;
+ state.t = 0;
+ state.n = n;
+ state.ptr = ptr;
+ state.rs = ptr;
+
+ neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state);
+
+ return state.t;
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+static inline void dn_neigh_format_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ struct dn_neigh *dn = (struct dn_neigh *) n;
+ char buf[DN_ASCBUF_LEN];
+
+ read_lock(&n->lock);
+ seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n",
+ dn_addr2asc(le16_to_cpu(dn->addr), buf),
+ (dn->flags&DN_NDFLAG_R1) ? "1" : "-",
+ (dn->flags&DN_NDFLAG_R2) ? "2" : "-",
+ (dn->flags&DN_NDFLAG_P3) ? "3" : "-",
+ dn->n.nud_state,
+ atomic_read(&dn->n.refcnt),
+ dn->blksize,
+ (dn->n.dev) ? dn->n.dev->name : "?");
+ read_unlock(&n->lock);
+}
+
+static int dn_neigh_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Addr Flags State Use Blksize Dev\n");
+ } else {
+ dn_neigh_format_entry(seq, v);
+ }
+
+ return 0;
+}
+
+static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return neigh_seq_start(seq, pos, &dn_neigh_table,
+ NEIGH_SEQ_NEIGH_ONLY);
+}
+
+static const struct seq_operations dn_neigh_seq_ops = {
+ .start = dn_neigh_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = dn_neigh_seq_show,
+};
+
+static int dn_neigh_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &dn_neigh_seq_ops,
+ sizeof(struct neigh_seq_state));
+}
+
+static const struct file_operations dn_neigh_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dn_neigh_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+void __init dn_neigh_init(void)
+{
+ neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table);
+ proc_create("decnet_neigh", S_IRUGO, init_net.proc_net,
+ &dn_neigh_seq_fops);
+}
+
+void __exit dn_neigh_cleanup(void)
+{
+ remove_proc_entry("decnet_neigh", init_net.proc_net);
+ neigh_table_clear(NEIGH_DN_TABLE, &dn_neigh_table);
+}
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
new file mode 100644
index 000000000..a321eac9f
--- /dev/null
+++ b/net/decnet/dn_nsp_in.c
@@ -0,0 +1,917 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Network Services Protocol (Input)
+ *
+ * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *
+ * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
+ * original dn_nsp.c.
+ * Steve Whitehouse: Updated to work with my new routing architecture.
+ * Steve Whitehouse: Add changes from Eduardo Serrat's patches.
+ * Steve Whitehouse: Put all ack handling code in a common routine.
+ * Steve Whitehouse: Put other common bits into dn_nsp_rx()
+ * Steve Whitehouse: More checks on skb->len to catch bogus packets
+ * Fixed various race conditions and possible nasties.
+ * Steve Whitehouse: Now handles returned conninit frames.
+ * David S. Miller: New socket locking
+ * Steve Whitehouse: Fixed lockup when socket filtering was enabled.
+ * Paul Koning: Fix to push CC sockets into RUN when acks are
+ * received.
+ * Steve Whitehouse:
+ * Patrick Caulfield: Checking conninits for correctness & sending of error
+ * responses.
+ * Steve Whitehouse: Added backlog congestion level return codes.
+ * Patrick Caulfield:
+ * Steve Whitehouse: Added flow control support (outbound)
+ * Steve Whitehouse: Prepare for nonlinear skbs
+ */
+
+/******************************************************************************
+ (c) 1995-1998 E.M. Serrat emserrat@geocities.com
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/termios.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_decnet.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+extern int decnet_log_martians;
+
+static void dn_log_martian(struct sk_buff *skb, const char *msg)
+{
+ if (decnet_log_martians) {
+ char *devname = skb->dev ? skb->dev->name : "???";
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ net_info_ratelimited("DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n",
+ msg, devname,
+ le16_to_cpu(cb->src),
+ le16_to_cpu(cb->dst),
+ le16_to_cpu(cb->src_port),
+ le16_to_cpu(cb->dst_port));
+ }
+}
+
+/*
+ * For this function we've flipped the cross-subchannel bit
+ * if the message is an otherdata or linkservice message. Thus
+ * we can use it to work out what to update.
+ */
+static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned short type = ((ack >> 12) & 0x0003);
+ int wakeup = 0;
+
+ switch (type) {
+ case 0: /* ACK - Data */
+ if (dn_after(ack, scp->ackrcv_dat)) {
+ scp->ackrcv_dat = ack & 0x0fff;
+ wakeup |= dn_nsp_check_xmit_queue(sk, skb,
+ &scp->data_xmit_queue,
+ ack);
+ }
+ break;
+ case 1: /* NAK - Data */
+ break;
+ case 2: /* ACK - OtherData */
+ if (dn_after(ack, scp->ackrcv_oth)) {
+ scp->ackrcv_oth = ack & 0x0fff;
+ wakeup |= dn_nsp_check_xmit_queue(sk, skb,
+ &scp->other_xmit_queue,
+ ack);
+ }
+ break;
+ case 3: /* NAK - OtherData */
+ break;
+ }
+
+ if (wakeup && !sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+}
+
+/*
+ * This function is a universal ack processor.
+ */
+static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth)
+{
+ __le16 *ptr = (__le16 *)skb->data;
+ int len = 0;
+ unsigned short ack;
+
+ if (skb->len < 2)
+ return len;
+
+ if ((ack = le16_to_cpu(*ptr)) & 0x8000) {
+ skb_pull(skb, 2);
+ ptr++;
+ len += 2;
+ if ((ack & 0x4000) == 0) {
+ if (oth)
+ ack ^= 0x2000;
+ dn_ack(sk, skb, ack);
+ }
+ }
+
+ if (skb->len < 2)
+ return len;
+
+ if ((ack = le16_to_cpu(*ptr)) & 0x8000) {
+ skb_pull(skb, 2);
+ len += 2;
+ if ((ack & 0x4000) == 0) {
+ if (oth)
+ ack ^= 0x2000;
+ dn_ack(sk, skb, ack);
+ }
+ }
+
+ return len;
+}
+
+
+/**
+ * dn_check_idf - Check an image data field format is correct.
+ * @pptr: Pointer to pointer to image data
+ * @len: Pointer to length of image data
+ * @max: The maximum allowed length of the data in the image data field
+ * @follow_on: Check that this many bytes exist beyond the end of the image data
+ *
+ * Returns: 0 if ok, -1 on error
+ */
+static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on)
+{
+ unsigned char *ptr = *pptr;
+ unsigned char flen = *ptr++;
+
+ (*len)--;
+ if (flen > max)
+ return -1;
+ if ((flen + follow_on) > *len)
+ return -1;
+
+ *len -= flen;
+ *pptr = ptr + flen;
+ return 0;
+}
+
+/*
+ * Table of reason codes to pass back to node which sent us a badly
+ * formed message, plus text messages for the log. A zero entry in
+ * the reason field means "don't reply" otherwise a disc init is sent with
+ * the specified reason code.
+ */
+static struct {
+ unsigned short reason;
+ const char *text;
+} ci_err_table[] = {
+ { 0, "CI: Truncated message" },
+ { NSP_REASON_ID, "CI: Destination username error" },
+ { NSP_REASON_ID, "CI: Destination username type" },
+ { NSP_REASON_US, "CI: Source username error" },
+ { 0, "CI: Truncated at menuver" },
+ { 0, "CI: Truncated before access or user data" },
+ { NSP_REASON_IO, "CI: Access data format error" },
+ { NSP_REASON_IO, "CI: User data format error" }
+};
+
+/*
+ * This function uses a slightly different lookup method
+ * to find its sockets, since it searches on object name/number
+ * rather than port numbers. Various tests are done to ensure that
+ * the incoming data is in the correct format before it is queued to
+ * a socket.
+ */
+static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data;
+ struct sockaddr_dn dstaddr;
+ struct sockaddr_dn srcaddr;
+ unsigned char type = 0;
+ int dstlen;
+ int srclen;
+ unsigned char *ptr;
+ int len;
+ int err = 0;
+ unsigned char menuver;
+
+ memset(&dstaddr, 0, sizeof(struct sockaddr_dn));
+ memset(&srcaddr, 0, sizeof(struct sockaddr_dn));
+
+ /*
+ * 1. Decode & remove message header
+ */
+ cb->src_port = msg->srcaddr;
+ cb->dst_port = msg->dstaddr;
+ cb->services = msg->services;
+ cb->info = msg->info;
+ cb->segsize = le16_to_cpu(msg->segsize);
+
+ if (!pskb_may_pull(skb, sizeof(*msg)))
+ goto err_out;
+
+ skb_pull(skb, sizeof(*msg));
+
+ len = skb->len;
+ ptr = skb->data;
+
+ /*
+ * 2. Check destination end username format
+ */
+ dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type);
+ err++;
+ if (dstlen < 0)
+ goto err_out;
+
+ err++;
+ if (type > 1)
+ goto err_out;
+
+ len -= dstlen;
+ ptr += dstlen;
+
+ /*
+ * 3. Check source end username format
+ */
+ srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type);
+ err++;
+ if (srclen < 0)
+ goto err_out;
+
+ len -= srclen;
+ ptr += srclen;
+ err++;
+ if (len < 1)
+ goto err_out;
+
+ menuver = *ptr;
+ ptr++;
+ len--;
+
+ /*
+ * 4. Check that optional data actually exists if menuver says it does
+ */
+ err++;
+ if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1))
+ goto err_out;
+
+ /*
+ * 5. Check optional access data format
+ */
+ err++;
+ if (menuver & DN_MENUVER_ACC) {
+ if (dn_check_idf(&ptr, &len, 39, 1))
+ goto err_out;
+ if (dn_check_idf(&ptr, &len, 39, 1))
+ goto err_out;
+ if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0))
+ goto err_out;
+ }
+
+ /*
+ * 6. Check optional user data format
+ */
+ err++;
+ if (menuver & DN_MENUVER_USR) {
+ if (dn_check_idf(&ptr, &len, 16, 0))
+ goto err_out;
+ }
+
+ /*
+ * 7. Look up socket based on destination end username
+ */
+ return dn_sklist_find_listener(&dstaddr);
+err_out:
+ dn_log_martian(skb, ci_err_table[err].text);
+ *reason = ci_err_table[err].reason;
+ return NULL;
+}
+
+
+static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb)
+{
+ if (sk_acceptq_is_full(sk)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ sk->sk_ack_backlog++;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_state_change(sk);
+}
+
+static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned char *ptr;
+
+ if (skb->len < 4)
+ goto out;
+
+ ptr = skb->data;
+ cb->services = *ptr++;
+ cb->info = *ptr++;
+ cb->segsize = le16_to_cpu(*(__le16 *)ptr);
+
+ if ((scp->state == DN_CI) || (scp->state == DN_CD)) {
+ scp->persist = 0;
+ scp->addrrem = cb->src_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ scp->state = DN_RUN;
+ scp->services_rem = cb->services;
+ scp->info_rem = cb->info;
+ scp->segsize_rem = cb->segsize;
+
+ if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
+ scp->max_window = decnet_no_fc_max_cwnd;
+
+ if (skb->len > 0) {
+ u16 dlen = *skb->data;
+ if ((dlen <= 16) && (dlen <= skb->len)) {
+ scp->conndata_in.opt_optl = cpu_to_le16(dlen);
+ skb_copy_from_linear_data_offset(skb, 1,
+ scp->conndata_in.opt_data, dlen);
+ }
+ }
+ dn_nsp_send_link(sk, DN_NOCHANGE, 0);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ }
+
+out:
+ kfree_skb(skb);
+}
+
+static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->state == DN_CI) {
+ scp->state = DN_CD;
+ scp->persist = 0;
+ }
+
+ kfree_skb(skb);
+}
+
+static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ unsigned short reason;
+
+ if (skb->len < 2)
+ goto out;
+
+ reason = le16_to_cpu(*(__le16 *)skb->data);
+ skb_pull(skb, 2);
+
+ scp->discdata_in.opt_status = cpu_to_le16(reason);
+ scp->discdata_in.opt_optl = 0;
+ memset(scp->discdata_in.opt_data, 0, 16);
+
+ if (skb->len > 0) {
+ u16 dlen = *skb->data;
+ if ((dlen <= 16) && (dlen <= skb->len)) {
+ scp->discdata_in.opt_optl = cpu_to_le16(dlen);
+ skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen);
+ }
+ }
+
+ scp->addrrem = cb->src_port;
+ sk->sk_state = TCP_CLOSE;
+
+ switch (scp->state) {
+ case DN_CI:
+ case DN_CD:
+ scp->state = DN_RJ;
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case DN_RUN:
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ scp->state = DN_DN;
+ break;
+ case DN_DI:
+ scp->state = DN_DIC;
+ break;
+ }
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ if (sk->sk_socket->state != SS_UNCONNECTED)
+ sk->sk_socket->state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+ }
+
+ /*
+ * It appears that its possible for remote machines to send disc
+ * init messages with no port identifier if we are in the CI and
+ * possibly also the CD state. Obviously we shouldn't reply with
+ * a message if we don't know what the end point is.
+ */
+ if (scp->addrrem) {
+ dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
+ }
+ scp->persist_fxn = dn_destroy_timer;
+ scp->persist = dn_nsp_persist(sk);
+
+out:
+ kfree_skb(skb);
+}
+
+/*
+ * disc_conf messages are also called no_resources or no_link
+ * messages depending upon the "reason" field.
+ */
+static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned short reason;
+
+ if (skb->len != 2)
+ goto out;
+
+ reason = le16_to_cpu(*(__le16 *)skb->data);
+
+ sk->sk_state = TCP_CLOSE;
+
+ switch (scp->state) {
+ case DN_CI:
+ scp->state = DN_NR;
+ break;
+ case DN_DR:
+ if (reason == NSP_REASON_DC)
+ scp->state = DN_DRC;
+ if (reason == NSP_REASON_NL)
+ scp->state = DN_CN;
+ break;
+ case DN_DI:
+ scp->state = DN_DIC;
+ break;
+ case DN_RUN:
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ case DN_CC:
+ scp->state = DN_CN;
+ }
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ if (sk->sk_socket->state != SS_UNCONNECTED)
+ sk->sk_socket->state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+ }
+
+ scp->persist_fxn = dn_destroy_timer;
+ scp->persist = dn_nsp_persist(sk);
+
+out:
+ kfree_skb(skb);
+}
+
+static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned short segnum;
+ unsigned char lsflags;
+ signed char fcval;
+ int wake_up = 0;
+ char *ptr = skb->data;
+ unsigned char fctype = scp->services_rem & NSP_FC_MASK;
+
+ if (skb->len != 4)
+ goto out;
+
+ segnum = le16_to_cpu(*(__le16 *)ptr);
+ ptr += 2;
+ lsflags = *(unsigned char *)ptr++;
+ fcval = *ptr;
+
+ /*
+ * Here we ignore erronous packets which should really
+ * should cause a connection abort. It is not critical
+ * for now though.
+ */
+ if (lsflags & 0xf8)
+ goto out;
+
+ if (seq_next(scp->numoth_rcv, segnum)) {
+ seq_add(&scp->numoth_rcv, 1);
+ switch(lsflags & 0x04) { /* FCVAL INT */
+ case 0x00: /* Normal Request */
+ switch(lsflags & 0x03) { /* FCVAL MOD */
+ case 0x00: /* Request count */
+ if (fcval < 0) {
+ unsigned char p_fcval = -fcval;
+ if ((scp->flowrem_dat > p_fcval) &&
+ (fctype == NSP_FC_SCMC)) {
+ scp->flowrem_dat -= p_fcval;
+ }
+ } else if (fcval > 0) {
+ scp->flowrem_dat += fcval;
+ wake_up = 1;
+ }
+ break;
+ case 0x01: /* Stop outgoing data */
+ scp->flowrem_sw = DN_DONTSEND;
+ break;
+ case 0x02: /* Ok to start again */
+ scp->flowrem_sw = DN_SEND;
+ dn_nsp_output(sk);
+ wake_up = 1;
+ }
+ break;
+ case 0x04: /* Interrupt Request */
+ if (fcval > 0) {
+ scp->flowrem_oth += fcval;
+ wake_up = 1;
+ }
+ break;
+ }
+ if (wake_up && !sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ }
+
+ dn_nsp_send_oth_ack(sk);
+
+out:
+ kfree_skb(skb);
+}
+
+/*
+ * Copy of sock_queue_rcv_skb (from sock.h) without
+ * bh_lock_sock() (its already held when this is called) which
+ * also allows data and other data to be queued to a socket.
+ */
+static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue)
+{
+ int err;
+
+ /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+ number of warnings when compiling with -W --ANK
+ */
+ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+ (unsigned int)sk->sk_rcvbuf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_filter(sk, skb);
+ if (err)
+ goto out;
+
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(queue, skb);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+out:
+ return err;
+}
+
+static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned short segnum;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ int queued = 0;
+
+ if (skb->len < 2)
+ goto out;
+
+ cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data);
+ skb_pull(skb, 2);
+
+ if (seq_next(scp->numoth_rcv, segnum)) {
+
+ if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) {
+ seq_add(&scp->numoth_rcv, 1);
+ scp->other_report = 0;
+ queued = 1;
+ }
+ }
+
+ dn_nsp_send_oth_ack(sk);
+out:
+ if (!queued)
+ kfree_skb(skb);
+}
+
+static void dn_nsp_data(struct sock *sk, struct sk_buff *skb)
+{
+ int queued = 0;
+ unsigned short segnum;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (skb->len < 2)
+ goto out;
+
+ cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data);
+ skb_pull(skb, 2);
+
+ if (seq_next(scp->numdat_rcv, segnum)) {
+ if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) {
+ seq_add(&scp->numdat_rcv, 1);
+ queued = 1;
+ }
+
+ if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) {
+ scp->flowloc_sw = DN_DONTSEND;
+ dn_nsp_send_link(sk, DN_DONTSEND, 0);
+ }
+ }
+
+ dn_nsp_send_data_ack(sk);
+out:
+ if (!queued)
+ kfree_skb(skb);
+}
+
+/*
+ * If one of our conninit messages is returned, this function
+ * deals with it. It puts the socket into the NO_COMMUNICATION
+ * state.
+ */
+static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->state == DN_CI) {
+ scp->state = DN_NC;
+ sk->sk_state = TCP_CLOSE;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ }
+
+ kfree_skb(skb);
+}
+
+static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ int ret = NET_RX_DROP;
+
+ /* Must not reply to returned packets */
+ if (cb->rt_flags & DN_RT_F_RTS)
+ goto out;
+
+ if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) {
+ switch (cb->nsp_flags & 0x70) {
+ case 0x10:
+ case 0x60: /* (Retransmitted) Connect Init */
+ dn_nsp_return_disc(skb, NSP_DISCINIT, reason);
+ ret = NET_RX_SUCCESS;
+ break;
+ case 0x20: /* Connect Confirm */
+ dn_nsp_return_disc(skb, NSP_DISCCONF, reason);
+ ret = NET_RX_SUCCESS;
+ break;
+ }
+ }
+
+out:
+ kfree_skb(skb);
+ return ret;
+}
+
+static int dn_nsp_rx_packet(struct sock *sk2, struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct sock *sk = NULL;
+ unsigned char *ptr = (unsigned char *)skb->data;
+ unsigned short reason = NSP_REASON_NL;
+
+ if (!pskb_may_pull(skb, 2))
+ goto free_out;
+
+ skb_reset_transport_header(skb);
+ cb->nsp_flags = *ptr++;
+
+ if (decnet_debug_level & 2)
+ printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags);
+
+ if (cb->nsp_flags & 0x83)
+ goto free_out;
+
+ /*
+ * Filter out conninits and useless packet types
+ */
+ if ((cb->nsp_flags & 0x0c) == 0x08) {
+ switch (cb->nsp_flags & 0x70) {
+ case 0x00: /* NOP */
+ case 0x70: /* Reserved */
+ case 0x50: /* Reserved, Phase II node init */
+ goto free_out;
+ case 0x10:
+ case 0x60:
+ if (unlikely(cb->rt_flags & DN_RT_F_RTS))
+ goto free_out;
+ sk = dn_find_listener(skb, &reason);
+ goto got_it;
+ }
+ }
+
+ if (!pskb_may_pull(skb, 3))
+ goto free_out;
+
+ /*
+ * Grab the destination address.
+ */
+ cb->dst_port = *(__le16 *)ptr;
+ cb->src_port = 0;
+ ptr += 2;
+
+ /*
+ * If not a connack, grab the source address too.
+ */
+ if (pskb_may_pull(skb, 5)) {
+ cb->src_port = *(__le16 *)ptr;
+ ptr += 2;
+ skb_pull(skb, 5);
+ }
+
+ /*
+ * Returned packets...
+ * Swap src & dst and look up in the normal way.
+ */
+ if (unlikely(cb->rt_flags & DN_RT_F_RTS)) {
+ __le16 tmp = cb->dst_port;
+ cb->dst_port = cb->src_port;
+ cb->src_port = tmp;
+ tmp = cb->dst;
+ cb->dst = cb->src;
+ cb->src = tmp;
+ }
+
+ /*
+ * Find the socket to which this skb is destined.
+ */
+ sk = dn_find_by_skb(skb);
+got_it:
+ if (sk != NULL) {
+ struct dn_scp *scp = DN_SK(sk);
+
+ /* Reset backoff */
+ scp->nsp_rxtshift = 0;
+
+ /*
+ * We linearize everything except data segments here.
+ */
+ if (cb->nsp_flags & ~0x60) {
+ if (unlikely(skb_linearize(skb)))
+ goto free_out;
+ }
+
+ return sk_receive_skb(sk, skb, 0);
+ }
+
+ return dn_nsp_no_socket(skb, reason);
+
+free_out:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+int dn_nsp_rx(struct sk_buff *skb)
+{
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN, NULL, skb,
+ skb->dev, NULL,
+ dn_nsp_rx_packet);
+}
+
+/*
+ * This is the main receive routine for sockets. It is called
+ * from the above when the socket is not busy, and also from
+ * sock_release() when there is a backlog queued up.
+ */
+int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+ if (cb->rt_flags & DN_RT_F_RTS) {
+ if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68)
+ dn_returned_conn_init(sk, skb);
+ else
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ /*
+ * Control packet.
+ */
+ if ((cb->nsp_flags & 0x0c) == 0x08) {
+ switch (cb->nsp_flags & 0x70) {
+ case 0x10:
+ case 0x60:
+ dn_nsp_conn_init(sk, skb);
+ break;
+ case 0x20:
+ dn_nsp_conn_conf(sk, skb);
+ break;
+ case 0x30:
+ dn_nsp_disc_init(sk, skb);
+ break;
+ case 0x40:
+ dn_nsp_disc_conf(sk, skb);
+ break;
+ }
+
+ } else if (cb->nsp_flags == 0x24) {
+ /*
+ * Special for connacks, 'cos they don't have
+ * ack data or ack otherdata info.
+ */
+ dn_nsp_conn_ack(sk, skb);
+ } else {
+ int other = 1;
+
+ /* both data and ack frames can kick a CC socket into RUN */
+ if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) {
+ scp->state = DN_RUN;
+ sk->sk_state = TCP_ESTABLISHED;
+ sk->sk_state_change(sk);
+ }
+
+ if ((cb->nsp_flags & 0x1c) == 0)
+ other = 0;
+ if (cb->nsp_flags == 0x04)
+ other = 0;
+
+ /*
+ * Read out ack data here, this applies equally
+ * to data, other data, link serivce and both
+ * ack data and ack otherdata.
+ */
+ dn_process_ack(sk, skb, other);
+
+ /*
+ * If we've some sort of data here then call a
+ * suitable routine for dealing with it, otherwise
+ * the packet is an ack and can be discarded.
+ */
+ if ((cb->nsp_flags & 0x0c) == 0) {
+
+ if (scp->state != DN_RUN)
+ goto free_out;
+
+ switch (cb->nsp_flags) {
+ case 0x10: /* LS */
+ dn_nsp_linkservice(sk, skb);
+ break;
+ case 0x30: /* OD */
+ dn_nsp_otherdata(sk, skb);
+ break;
+ default:
+ dn_nsp_data(sk, skb);
+ }
+
+ } else { /* Ack, chuck it out here */
+free_out:
+ kfree_skb(skb);
+ }
+ }
+
+ return NET_RX_SUCCESS;
+}
+
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
new file mode 100644
index 000000000..1aaa51ebb
--- /dev/null
+++ b/net/decnet/dn_nsp_out.c
@@ -0,0 +1,718 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Network Services Protocol (Output)
+ *
+ * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *
+ * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
+ * original dn_nsp.c.
+ * Steve Whitehouse: Updated to work with my new routing architecture.
+ * Steve Whitehouse: Added changes from Eduardo Serrat's patches.
+ * Steve Whitehouse: Now conninits have the "return" bit set.
+ * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL!
+ * Moved output state machine into one function
+ * Steve Whitehouse: New output state machine
+ * Paul Koning: Connect Confirm message fix.
+ * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets.
+ * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean
+ * Steve Whitehouse: Moved dn_nsp_send() in here from route.h
+ */
+
+/******************************************************************************
+ (c) 1995-1998 E.M. Serrat emserrat@geocities.com
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/termios.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/if_packet.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+
+static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
+
+static void dn_nsp_send(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct dn_scp *scp = DN_SK(sk);
+ struct dst_entry *dst;
+ struct flowidn fld;
+
+ skb_reset_transport_header(skb);
+ scp->stamp = jiffies;
+
+ dst = sk_dst_check(sk, 0);
+ if (dst) {
+try_again:
+ skb_dst_set(skb, dst);
+ dst_output(skb);
+ return;
+ }
+
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_oif = sk->sk_bound_dev_if;
+ fld.saddr = dn_saddr2dn(&scp->addr);
+ fld.daddr = dn_saddr2dn(&scp->peer);
+ dn_sk_ports_copy(&fld, scp);
+ fld.flowidn_proto = DNPROTO_NSP;
+ if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) {
+ dst = sk_dst_get(sk);
+ sk->sk_route_caps = dst->dev->features;
+ goto try_again;
+ }
+
+ sk->sk_err = EHOSTUNREACH;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+}
+
+
+/*
+ * If sk == NULL, then we assume that we are supposed to be making
+ * a routing layer skb. If sk != NULL, then we are supposed to be
+ * creating an skb for the NSP layer.
+ *
+ * The eventual aim is for each socket to have a cached header size
+ * for its outgoing packets, and to set hdr from this when sk != NULL.
+ */
+struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
+{
+ struct sk_buff *skb;
+ int hdr = 64;
+
+ if ((skb = alloc_skb(size + hdr, pri)) == NULL)
+ return NULL;
+
+ skb->protocol = htons(ETH_P_DNA_RT);
+ skb->pkt_type = PACKET_OUTGOING;
+
+ if (sk)
+ skb_set_owner_w(skb, sk);
+
+ skb_reserve(skb, hdr);
+
+ return skb;
+}
+
+/*
+ * Calculate persist timer based upon the smoothed round
+ * trip time and the variance. Backoff according to the
+ * nsp_backoff[] array.
+ */
+unsigned long dn_nsp_persist(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
+
+ t *= nsp_backoff[scp->nsp_rxtshift];
+
+ if (t < HZ) t = HZ;
+ if (t > (600*HZ)) t = (600*HZ);
+
+ if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT)
+ scp->nsp_rxtshift++;
+
+ /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */
+
+ return t;
+}
+
+/*
+ * This is called each time we get an estimate for the rtt
+ * on the link.
+ */
+static void dn_nsp_rtt(struct sock *sk, long rtt)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ long srtt = (long)scp->nsp_srtt;
+ long rttvar = (long)scp->nsp_rttvar;
+ long delta;
+
+ /*
+ * If the jiffies clock flips over in the middle of timestamp
+ * gathering this value might turn out negative, so we make sure
+ * that is it always positive here.
+ */
+ if (rtt < 0)
+ rtt = -rtt;
+ /*
+ * Add new rtt to smoothed average
+ */
+ delta = ((rtt << 3) - srtt);
+ srtt += (delta >> 3);
+ if (srtt >= 1)
+ scp->nsp_srtt = (unsigned long)srtt;
+ else
+ scp->nsp_srtt = 1;
+
+ /*
+ * Add new rtt varience to smoothed varience
+ */
+ delta >>= 1;
+ rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2);
+ if (rttvar >= 1)
+ scp->nsp_rttvar = (unsigned long)rttvar;
+ else
+ scp->nsp_rttvar = 1;
+
+ /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */
+}
+
+/**
+ * dn_nsp_clone_and_send - Send a data packet by cloning it
+ * @skb: The packet to clone and transmit
+ * @gfp: memory allocation flag
+ *
+ * Clone a queued data or other data packet and transmit it.
+ *
+ * Returns: The number of times the packet has been sent previously
+ */
+static inline unsigned int dn_nsp_clone_and_send(struct sk_buff *skb,
+ gfp_t gfp)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct sk_buff *skb2;
+ int ret = 0;
+
+ if ((skb2 = skb_clone(skb, gfp)) != NULL) {
+ ret = cb->xmit_count;
+ cb->xmit_count++;
+ cb->stamp = jiffies;
+ skb2->sk = skb->sk;
+ dn_nsp_send(skb2);
+ }
+
+ return ret;
+}
+
+/**
+ * dn_nsp_output - Try and send something from socket queues
+ * @sk: The socket whose queues are to be investigated
+ *
+ * Try and send the packet on the end of the data and other data queues.
+ * Other data gets priority over data, and if we retransmit a packet we
+ * reduce the window by dividing it in two.
+ *
+ */
+void dn_nsp_output(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff *skb;
+ unsigned int reduce_win = 0;
+
+ /*
+ * First we check for otherdata/linkservice messages
+ */
+ if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL)
+ reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
+
+ /*
+ * If we may not send any data, we don't.
+ * If we are still trying to get some other data down the
+ * channel, we don't try and send any data.
+ */
+ if (reduce_win || (scp->flowrem_sw != DN_SEND))
+ goto recalc_window;
+
+ if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL)
+ reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
+
+ /*
+ * If we've sent any frame more than once, we cut the
+ * send window size in half. There is always a minimum
+ * window size of one available.
+ */
+recalc_window:
+ if (reduce_win) {
+ scp->snd_window >>= 1;
+ if (scp->snd_window < NSP_MIN_WINDOW)
+ scp->snd_window = NSP_MIN_WINDOW;
+ }
+}
+
+int dn_nsp_xmit_timeout(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ dn_nsp_output(sk);
+
+ if (!skb_queue_empty(&scp->data_xmit_queue) ||
+ !skb_queue_empty(&scp->other_xmit_queue))
+ scp->persist = dn_nsp_persist(sk);
+
+ return 0;
+}
+
+static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
+{
+ unsigned char *ptr = skb_push(skb, len);
+
+ BUG_ON(len < 5);
+
+ *ptr++ = msgflag;
+ *((__le16 *)ptr) = scp->addrrem;
+ ptr += 2;
+ *((__le16 *)ptr) = scp->addrloc;
+ ptr += 2;
+ return (__le16 __force *)ptr;
+}
+
+static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ unsigned short acknum = scp->numdat_rcv & 0x0FFF;
+ unsigned short ackcrs = scp->numoth_rcv & 0x0FFF;
+ __le16 *ptr;
+
+ BUG_ON(hlen < 9);
+
+ scp->ackxmt_dat = acknum;
+ scp->ackxmt_oth = ackcrs;
+ acknum |= 0x8000;
+ ackcrs |= 0x8000;
+
+ /* If this is an "other data/ack" message, swap acknum and ackcrs */
+ if (other) {
+ unsigned short tmp = acknum;
+ acknum = ackcrs;
+ ackcrs = tmp;
+ }
+
+ /* Set "cross subchannel" bit in ackcrs */
+ ackcrs |= 0x2000;
+
+ ptr = dn_mk_common_header(scp, skb, msgflag, hlen);
+
+ *ptr++ = cpu_to_le16(acknum);
+ *ptr++ = cpu_to_le16(ackcrs);
+
+ return ptr;
+}
+
+static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
+
+ if (unlikely(oth)) {
+ cb->segnum = scp->numoth;
+ seq_add(&scp->numoth, 1);
+ } else {
+ cb->segnum = scp->numdat;
+ seq_add(&scp->numdat, 1);
+ }
+ *(ptr++) = cpu_to_le16(cb->segnum);
+
+ return ptr;
+}
+
+void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
+ gfp_t gfp, int oth)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
+
+ cb->xmit_count = 0;
+ dn_nsp_mk_data_header(sk, skb, oth);
+
+ /*
+ * Slow start: If we have been idle for more than
+ * one RTT, then reset window to min size.
+ */
+ if ((jiffies - scp->stamp) > t)
+ scp->snd_window = NSP_MIN_WINDOW;
+
+ if (oth)
+ skb_queue_tail(&scp->other_xmit_queue, skb);
+ else
+ skb_queue_tail(&scp->data_xmit_queue, skb);
+
+ if (scp->flowrem_sw != DN_SEND)
+ return;
+
+ dn_nsp_clone_and_send(skb, gfp);
+}
+
+
+int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff *skb2, *n, *ack = NULL;
+ int wakeup = 0;
+ int try_retrans = 0;
+ unsigned long reftime = cb->stamp;
+ unsigned long pkttime;
+ unsigned short xmit_count;
+ unsigned short segnum;
+
+ skb_queue_walk_safe(q, skb2, n) {
+ struct dn_skb_cb *cb2 = DN_SKB_CB(skb2);
+
+ if (dn_before_or_equal(cb2->segnum, acknum))
+ ack = skb2;
+
+ /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? "ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */
+
+ if (ack == NULL)
+ continue;
+
+ /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */
+
+ /* Does _last_ packet acked have xmit_count > 1 */
+ try_retrans = 0;
+ /* Remember to wake up the sending process */
+ wakeup = 1;
+ /* Keep various statistics */
+ pkttime = cb2->stamp;
+ xmit_count = cb2->xmit_count;
+ segnum = cb2->segnum;
+ /* Remove and drop ack'ed packet */
+ skb_unlink(ack, q);
+ kfree_skb(ack);
+ ack = NULL;
+
+ /*
+ * We don't expect to see acknowledgements for packets we
+ * haven't sent yet.
+ */
+ WARN_ON(xmit_count == 0);
+
+ /*
+ * If the packet has only been sent once, we can use it
+ * to calculate the RTT and also open the window a little
+ * further.
+ */
+ if (xmit_count == 1) {
+ if (dn_equal(segnum, acknum))
+ dn_nsp_rtt(sk, (long)(pkttime - reftime));
+
+ if (scp->snd_window < scp->max_window)
+ scp->snd_window++;
+ }
+
+ /*
+ * Packet has been sent more than once. If this is the last
+ * packet to be acknowledged then we want to send the next
+ * packet in the send queue again (assumes the remote host does
+ * go-back-N error control).
+ */
+ if (xmit_count > 1)
+ try_retrans = 1;
+ }
+
+ if (try_retrans)
+ dn_nsp_output(sk);
+
+ return wakeup;
+}
+
+void dn_nsp_send_data_ack(struct sock *sk)
+{
+ struct sk_buff *skb = NULL;
+
+ if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, 9);
+ dn_mk_ack_header(sk, skb, 0x04, 9, 0);
+ dn_nsp_send(skb);
+}
+
+void dn_nsp_send_oth_ack(struct sock *sk)
+{
+ struct sk_buff *skb = NULL;
+
+ if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, 9);
+ dn_mk_ack_header(sk, skb, 0x14, 9, 1);
+ dn_nsp_send(skb);
+}
+
+
+void dn_send_conn_ack (struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff *skb = NULL;
+ struct nsp_conn_ack_msg *msg;
+
+ if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL)
+ return;
+
+ msg = (struct nsp_conn_ack_msg *)skb_put(skb, 3);
+ msg->msgflg = 0x24;
+ msg->dstaddr = scp->addrrem;
+
+ dn_nsp_send(skb);
+}
+
+void dn_nsp_delayed_ack(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->ackxmt_oth != scp->numoth_rcv)
+ dn_nsp_send_oth_ack(sk);
+
+ if (scp->ackxmt_dat != scp->numdat_rcv)
+ dn_nsp_send_data_ack(sk);
+}
+
+static int dn_nsp_retrans_conn_conf(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->state == DN_CC)
+ dn_send_conn_conf(sk, GFP_ATOMIC);
+
+ return 0;
+}
+
+void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff *skb = NULL;
+ struct nsp_conn_init_msg *msg;
+ __u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
+
+ if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL)
+ return;
+
+ msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg));
+ msg->msgflg = 0x28;
+ msg->dstaddr = scp->addrrem;
+ msg->srcaddr = scp->addrloc;
+ msg->services = scp->services_loc;
+ msg->info = scp->info_loc;
+ msg->segsize = cpu_to_le16(scp->segsize_loc);
+
+ *skb_put(skb,1) = len;
+
+ if (len > 0)
+ memcpy(skb_put(skb, len), scp->conndata_out.opt_data, len);
+
+
+ dn_nsp_send(skb);
+
+ scp->persist = dn_nsp_persist(sk);
+ scp->persist_fxn = dn_nsp_retrans_conn_conf;
+}
+
+
+static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
+ unsigned short reason, gfp_t gfp,
+ struct dst_entry *dst,
+ int ddl, unsigned char *dd, __le16 rem, __le16 loc)
+{
+ struct sk_buff *skb = NULL;
+ int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0);
+ unsigned char *msg;
+
+ if ((dst == NULL) || (rem == 0)) {
+ net_dbg_ratelimited("DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n",
+ le16_to_cpu(rem), dst);
+ return;
+ }
+
+ if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL)
+ return;
+
+ msg = skb_put(skb, size);
+ *msg++ = msgflg;
+ *(__le16 *)msg = rem;
+ msg += 2;
+ *(__le16 *)msg = loc;
+ msg += 2;
+ *(__le16 *)msg = cpu_to_le16(reason);
+ msg += 2;
+ if (msgflg == NSP_DISCINIT)
+ *msg++ = ddl;
+
+ if (ddl) {
+ memcpy(msg, dd, ddl);
+ }
+
+ /*
+ * This doesn't go via the dn_nsp_send() function since we need
+ * to be able to send disc packets out which have no socket
+ * associations.
+ */
+ skb_dst_set(skb, dst_clone(dst));
+ dst_output(skb);
+}
+
+
+void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
+ unsigned short reason, gfp_t gfp)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ int ddl = 0;
+
+ if (msgflg == NSP_DISCINIT)
+ ddl = le16_to_cpu(scp->discdata_out.opt_optl);
+
+ if (reason == 0)
+ reason = le16_to_cpu(scp->discdata_out.opt_status);
+
+ dn_nsp_do_disc(sk, msgflg, reason, gfp, __sk_dst_get(sk), ddl,
+ scp->discdata_out.opt_data, scp->addrrem, scp->addrloc);
+}
+
+
+void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
+ unsigned short reason)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ int ddl = 0;
+ gfp_t gfp = GFP_ATOMIC;
+
+ dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl,
+ NULL, cb->src_port, cb->dst_port);
+}
+
+
+void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct sk_buff *skb;
+ unsigned char *ptr;
+ gfp_t gfp = GFP_ATOMIC;
+
+ if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL)
+ return;
+
+ skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
+ ptr = skb_put(skb, 2);
+ DN_SKB_CB(skb)->nsp_flags = 0x10;
+ *ptr++ = lsflags;
+ *ptr = fcval;
+
+ dn_nsp_queue_xmit(sk, skb, gfp, 1);
+
+ scp->persist = dn_nsp_persist(sk);
+ scp->persist_fxn = dn_nsp_xmit_timeout;
+}
+
+static int dn_nsp_retrans_conninit(struct sock *sk)
+{
+ struct dn_scp *scp = DN_SK(sk);
+
+ if (scp->state == DN_CI)
+ dn_nsp_send_conninit(sk, NSP_RCI);
+
+ return 0;
+}
+
+void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
+{
+ struct dn_scp *scp = DN_SK(sk);
+ struct nsp_conn_init_msg *msg;
+ unsigned char aux;
+ unsigned char menuver;
+ struct dn_skb_cb *cb;
+ unsigned char type = 1;
+ gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
+ struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation);
+
+ if (!skb)
+ return;
+
+ cb = DN_SKB_CB(skb);
+ msg = (struct nsp_conn_init_msg *)skb_put(skb,sizeof(*msg));
+
+ msg->msgflg = msgflg;
+ msg->dstaddr = 0x0000; /* Remote Node will assign it*/
+
+ msg->srcaddr = scp->addrloc;
+ msg->services = scp->services_loc; /* Requested flow control */
+ msg->info = scp->info_loc; /* Version Number */
+ msg->segsize = cpu_to_le16(scp->segsize_loc); /* Max segment size */
+
+ if (scp->peer.sdn_objnum)
+ type = 0;
+
+ skb_put(skb, dn_sockaddr2username(&scp->peer,
+ skb_tail_pointer(skb), type));
+ skb_put(skb, dn_sockaddr2username(&scp->addr,
+ skb_tail_pointer(skb), 2));
+
+ menuver = DN_MENUVER_ACC | DN_MENUVER_USR;
+ if (scp->peer.sdn_flags & SDF_PROXY)
+ menuver |= DN_MENUVER_PRX;
+ if (scp->peer.sdn_flags & SDF_UICPROXY)
+ menuver |= DN_MENUVER_UIC;
+
+ *skb_put(skb, 1) = menuver; /* Menu Version */
+
+ aux = scp->accessdata.acc_userl;
+ *skb_put(skb, 1) = aux;
+ if (aux > 0)
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
+
+ aux = scp->accessdata.acc_passl;
+ *skb_put(skb, 1) = aux;
+ if (aux > 0)
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
+
+ aux = scp->accessdata.acc_accl;
+ *skb_put(skb, 1) = aux;
+ if (aux > 0)
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
+
+ aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
+ *skb_put(skb, 1) = aux;
+ if (aux > 0)
+ memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux);
+
+ scp->persist = dn_nsp_persist(sk);
+ scp->persist_fxn = dn_nsp_retrans_conninit;
+
+ cb->rt_flags = DN_RT_F_RQR;
+
+ dn_nsp_send(skb);
+}
+
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
new file mode 100644
index 000000000..03227ffd1
--- /dev/null
+++ b/net/decnet/dn_route.c
@@ -0,0 +1,1939 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Routing Functions (Endnode and Router)
+ *
+ * Authors: Steve Whitehouse <SteveW@ACM.org>
+ * Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ * Steve Whitehouse : Fixes to allow "intra-ethernet" and
+ * "return-to-sender" bits on outgoing
+ * packets.
+ * Steve Whitehouse : Timeouts for cached routes.
+ * Steve Whitehouse : Use dst cache for input routes too.
+ * Steve Whitehouse : Fixed error values in dn_send_skb.
+ * Steve Whitehouse : Rework routing functions to better fit
+ * DECnet routing design
+ * Alexey Kuznetsov : New SMP locking
+ * Steve Whitehouse : More SMP locking changes & dn_cache_dump()
+ * Steve Whitehouse : Prerouting NF hook, now really is prerouting.
+ * Fixed possible skb leak in rtnetlink funcs.
+ * Steve Whitehouse : Dave Miller's dynamic hash table sizing and
+ * Alexey Kuznetsov's finer grained locking
+ * from ipv4/route.c.
+ * Steve Whitehouse : Routing is now starting to look like a
+ * sensible set of code now, mainly due to
+ * my copying the IPv4 routing code. The
+ * hooks here are modified and will continue
+ * to evolve for a while.
+ * Steve Whitehouse : Real SMP at last :-) Also new netfilter
+ * stuff. Look out raw sockets your days
+ * are numbered!
+ * Steve Whitehouse : Added return-to-sender functions. Added
+ * backlog congestion level return codes.
+ * Steve Whitehouse : Fixed bug where routes were set up with
+ * no ref count on net devices.
+ * Steve Whitehouse : RCU for the route cache
+ * Steve Whitehouse : Preparations for the flow cache
+ * Steve Whitehouse : Prepare for nonlinear skbs
+ */
+
+/******************************************************************************
+ (c) 1995-1998 E.M. Serrat emserrat@geocities.com
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/in_route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/string.h>
+#include <linux/netfilter_decnet.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <linux/export.h>
+#include <asm/errno.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_nsp.h>
+#include <net/dn_route.h>
+#include <net/dn_neigh.h>
+#include <net/dn_fib.h>
+
+struct dn_rt_hash_bucket
+{
+ struct dn_route __rcu *chain;
+ spinlock_t lock;
+};
+
+extern struct neigh_table dn_neigh_table;
+
+
+static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00};
+
+static const int dn_rt_min_delay = 2 * HZ;
+static const int dn_rt_max_delay = 10 * HZ;
+static const int dn_rt_mtu_expires = 10 * 60 * HZ;
+
+static unsigned long dn_rt_deadline;
+
+static int dn_dst_gc(struct dst_ops *ops);
+static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
+static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
+static unsigned int dn_dst_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
+static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
+static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
+static void dn_dst_link_failure(struct sk_buff *);
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb , u32 mtu);
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
+static int dn_route_input(struct sk_buff *);
+static void dn_run_flush(unsigned long dummy);
+
+static struct dn_rt_hash_bucket *dn_rt_hash_table;
+static unsigned int dn_rt_hash_mask;
+
+static struct timer_list dn_route_timer;
+static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0);
+int decnet_dst_gc_interval = 2;
+
+static struct dst_ops dn_dst_ops = {
+ .family = PF_DECnet,
+ .gc_thresh = 128,
+ .gc = dn_dst_gc,
+ .check = dn_dst_check,
+ .default_advmss = dn_dst_default_advmss,
+ .mtu = dn_dst_mtu,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = dn_dst_destroy,
+ .ifdown = dn_dst_ifdown,
+ .negative_advice = dn_dst_negative_advice,
+ .link_failure = dn_dst_link_failure,
+ .update_pmtu = dn_dst_update_pmtu,
+ .redirect = dn_dst_redirect,
+ .neigh_lookup = dn_dst_neigh_lookup,
+};
+
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+ struct dn_route *rt = (struct dn_route *) dst;
+
+ if (rt->n)
+ neigh_release(rt->n);
+ dst_destroy_metrics_generic(dst);
+}
+
+static void dn_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how)
+{
+ if (how) {
+ struct dn_route *rt = (struct dn_route *) dst;
+ struct neighbour *n = rt->n;
+
+ if (n && n->dev == dev) {
+ n->dev = dev_net(dev)->loopback_dev;
+ dev_hold(n->dev);
+ dev_put(dev);
+ }
+ }
+}
+
+static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
+{
+ __u16 tmp = (__u16 __force)(src ^ dst);
+ tmp ^= (tmp >> 3);
+ tmp ^= (tmp >> 5);
+ tmp ^= (tmp >> 10);
+ return dn_rt_hash_mask & (unsigned int)tmp;
+}
+
+static inline void dnrt_free(struct dn_route *rt)
+{
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static inline void dnrt_drop(struct dn_route *rt)
+{
+ dst_release(&rt->dst);
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static void dn_dst_check_expire(unsigned long dummy)
+{
+ int i;
+ struct dn_route *rt;
+ struct dn_route __rcu **rtp;
+ unsigned long now = jiffies;
+ unsigned long expire = 120 * HZ;
+
+ for (i = 0; i <= dn_rt_hash_mask; i++) {
+ rtp = &dn_rt_hash_table[i].chain;
+
+ spin_lock(&dn_rt_hash_table[i].lock);
+ while ((rt = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
+ if (atomic_read(&rt->dst.__refcnt) ||
+ (now - rt->dst.lastuse) < expire) {
+ rtp = &rt->dst.dn_next;
+ continue;
+ }
+ *rtp = rt->dst.dn_next;
+ rt->dst.dn_next = NULL;
+ dnrt_free(rt);
+ }
+ spin_unlock(&dn_rt_hash_table[i].lock);
+
+ if ((jiffies - now) > 0)
+ break;
+ }
+
+ mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ);
+}
+
+static int dn_dst_gc(struct dst_ops *ops)
+{
+ struct dn_route *rt;
+ struct dn_route __rcu **rtp;
+ int i;
+ unsigned long now = jiffies;
+ unsigned long expire = 10 * HZ;
+
+ for (i = 0; i <= dn_rt_hash_mask; i++) {
+
+ spin_lock_bh(&dn_rt_hash_table[i].lock);
+ rtp = &dn_rt_hash_table[i].chain;
+
+ while ((rt = rcu_dereference_protected(*rtp,
+ lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
+ if (atomic_read(&rt->dst.__refcnt) ||
+ (now - rt->dst.lastuse) < expire) {
+ rtp = &rt->dst.dn_next;
+ continue;
+ }
+ *rtp = rt->dst.dn_next;
+ rt->dst.dn_next = NULL;
+ dnrt_drop(rt);
+ break;
+ }
+ spin_unlock_bh(&dn_rt_hash_table[i].lock);
+ }
+
+ return 0;
+}
+
+/*
+ * The decnet standards don't impose a particular minimum mtu, what they
+ * do insist on is that the routing layer accepts a datagram of at least
+ * 230 bytes long. Here we have to subtract the routing header length from
+ * 230 to get the minimum acceptable mtu. If there is no neighbour, then we
+ * assume the worst and use a long header size.
+ *
+ * We update both the mtu and the advertised mss (i.e. the segment size we
+ * advertise to the other end).
+ */
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct dn_route *rt = (struct dn_route *) dst;
+ struct neighbour *n = rt->n;
+ u32 min_mtu = 230;
+ struct dn_dev *dn;
+
+ dn = n ? rcu_dereference_raw(n->dev->dn_ptr) : NULL;
+
+ if (dn && dn->use_long == 0)
+ min_mtu -= 6;
+ else
+ min_mtu -= 21;
+
+ if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) {
+ if (!(dst_metric_locked(dst, RTAX_MTU))) {
+ dst_metric_set(dst, RTAX_MTU, mtu);
+ dst_set_expires(dst, dn_rt_mtu_expires);
+ }
+ if (!(dst_metric_locked(dst, RTAX_ADVMSS))) {
+ u32 mss = mtu - DN_MAX_NSP_DATA_HEADER;
+ u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS);
+ if (!existing_mss || existing_mss > mss)
+ dst_metric_set(dst, RTAX_ADVMSS, mss);
+ }
+ }
+}
+
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+/*
+ * When a route has been marked obsolete. (e.g. routing cache flush)
+ */
+static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie)
+{
+ return NULL;
+}
+
+static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst)
+{
+ dst_release(dst);
+ return NULL;
+}
+
+static void dn_dst_link_failure(struct sk_buff *skb)
+{
+}
+
+static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2)
+{
+ return ((fl1->daddr ^ fl2->daddr) |
+ (fl1->saddr ^ fl2->saddr) |
+ (fl1->flowidn_mark ^ fl2->flowidn_mark) |
+ (fl1->flowidn_scope ^ fl2->flowidn_scope) |
+ (fl1->flowidn_oif ^ fl2->flowidn_oif) |
+ (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0;
+}
+
+static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_route **rp)
+{
+ struct dn_route *rth;
+ struct dn_route __rcu **rthp;
+ unsigned long now = jiffies;
+
+ rthp = &dn_rt_hash_table[hash].chain;
+
+ spin_lock_bh(&dn_rt_hash_table[hash].lock);
+ while ((rth = rcu_dereference_protected(*rthp,
+ lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
+ if (compare_keys(&rth->fld, &rt->fld)) {
+ /* Put it first */
+ *rthp = rth->dst.dn_next;
+ rcu_assign_pointer(rth->dst.dn_next,
+ dn_rt_hash_table[hash].chain);
+ rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
+
+ dst_use(&rth->dst, now);
+ spin_unlock_bh(&dn_rt_hash_table[hash].lock);
+
+ dnrt_drop(rt);
+ *rp = rth;
+ return 0;
+ }
+ rthp = &rth->dst.dn_next;
+ }
+
+ rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain);
+ rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
+
+ dst_use(&rt->dst, now);
+ spin_unlock_bh(&dn_rt_hash_table[hash].lock);
+ *rp = rt;
+ return 0;
+}
+
+static void dn_run_flush(unsigned long dummy)
+{
+ int i;
+ struct dn_route *rt, *next;
+
+ for (i = 0; i < dn_rt_hash_mask; i++) {
+ spin_lock_bh(&dn_rt_hash_table[i].lock);
+
+ if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL)
+ goto nothing_to_declare;
+
+ for(; rt; rt = next) {
+ next = rcu_dereference_raw(rt->dst.dn_next);
+ RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+ dst_free((struct dst_entry *)rt);
+ }
+
+nothing_to_declare:
+ spin_unlock_bh(&dn_rt_hash_table[i].lock);
+ }
+}
+
+static DEFINE_SPINLOCK(dn_rt_flush_lock);
+
+void dn_rt_cache_flush(int delay)
+{
+ unsigned long now = jiffies;
+ int user_mode = !in_interrupt();
+
+ if (delay < 0)
+ delay = dn_rt_min_delay;
+
+ spin_lock_bh(&dn_rt_flush_lock);
+
+ if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) {
+ long tmo = (long)(dn_rt_deadline - now);
+
+ if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay)
+ tmo = 0;
+
+ if (delay > tmo)
+ delay = tmo;
+ }
+
+ if (delay <= 0) {
+ spin_unlock_bh(&dn_rt_flush_lock);
+ dn_run_flush(0);
+ return;
+ }
+
+ if (dn_rt_deadline == 0)
+ dn_rt_deadline = now + dn_rt_max_delay;
+
+ dn_rt_flush_timer.expires = now + delay;
+ add_timer(&dn_rt_flush_timer);
+ spin_unlock_bh(&dn_rt_flush_lock);
+}
+
+/**
+ * dn_return_short - Return a short packet to its sender
+ * @skb: The packet to return
+ *
+ */
+static int dn_return_short(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb;
+ unsigned char *ptr;
+ __le16 *src;
+ __le16 *dst;
+
+ /* Add back headers */
+ skb_push(skb, skb->data - skb_network_header(skb));
+
+ if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
+ return NET_RX_DROP;
+
+ cb = DN_SKB_CB(skb);
+ /* Skip packet length and point to flags */
+ ptr = skb->data + 2;
+ *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
+
+ dst = (__le16 *)ptr;
+ ptr += 2;
+ src = (__le16 *)ptr;
+ ptr += 2;
+ *ptr = 0; /* Zero hop count */
+
+ swap(*src, *dst);
+
+ skb->pkt_type = PACKET_OUTGOING;
+ dn_rt_finish_output(skb, NULL, NULL);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * dn_return_long - Return a long packet to its sender
+ * @skb: The long format packet to return
+ *
+ */
+static int dn_return_long(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb;
+ unsigned char *ptr;
+ unsigned char *src_addr, *dst_addr;
+ unsigned char tmp[ETH_ALEN];
+
+ /* Add back all headers */
+ skb_push(skb, skb->data - skb_network_header(skb));
+
+ if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
+ return NET_RX_DROP;
+
+ cb = DN_SKB_CB(skb);
+ /* Ignore packet length and point to flags */
+ ptr = skb->data + 2;
+
+ /* Skip padding */
+ if (*ptr & DN_RT_F_PF) {
+ char padlen = (*ptr & ~DN_RT_F_PF);
+ ptr += padlen;
+ }
+
+ *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
+ ptr += 2;
+ dst_addr = ptr;
+ ptr += 8;
+ src_addr = ptr;
+ ptr += 6;
+ *ptr = 0; /* Zero hop count */
+
+ /* Swap source and destination */
+ memcpy(tmp, src_addr, ETH_ALEN);
+ memcpy(src_addr, dst_addr, ETH_ALEN);
+ memcpy(dst_addr, tmp, ETH_ALEN);
+
+ skb->pkt_type = PACKET_OUTGOING;
+ dn_rt_finish_output(skb, dst_addr, src_addr);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * dn_route_rx_packet - Try and find a route for an incoming packet
+ * @skb: The packet to find a route for
+ *
+ * Returns: result of input function if route is found, error code otherwise
+ */
+static int dn_route_rx_packet(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb;
+ int err;
+
+ if ((err = dn_route_input(skb)) == 0)
+ return dst_input(skb);
+
+ cb = DN_SKB_CB(skb);
+ if (decnet_debug_level & 4) {
+ char *devname = skb->dev ? skb->dev->name : "???";
+
+ printk(KERN_DEBUG
+ "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n",
+ (int)cb->rt_flags, devname, skb->len,
+ le16_to_cpu(cb->src), le16_to_cpu(cb->dst),
+ err, skb->pkt_type);
+ }
+
+ if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) {
+ switch (cb->rt_flags & DN_RT_PKT_MSK) {
+ case DN_RT_PKT_SHORT:
+ return dn_return_short(skb);
+ case DN_RT_PKT_LONG:
+ return dn_return_long(skb);
+ }
+ }
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int dn_route_rx_long(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ unsigned char *ptr = skb->data;
+
+ if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */
+ goto drop_it;
+
+ skb_pull(skb, 20);
+ skb_reset_transport_header(skb);
+
+ /* Destination info */
+ ptr += 2;
+ cb->dst = dn_eth2dn(ptr);
+ if (memcmp(ptr, dn_hiord_addr, 4) != 0)
+ goto drop_it;
+ ptr += 6;
+
+
+ /* Source info */
+ ptr += 2;
+ cb->src = dn_eth2dn(ptr);
+ if (memcmp(ptr, dn_hiord_addr, 4) != 0)
+ goto drop_it;
+ ptr += 6;
+ /* Other junk */
+ ptr++;
+ cb->hops = *ptr++; /* Visit Count */
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ dn_route_rx_packet);
+
+drop_it:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+
+
+static int dn_route_rx_short(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ unsigned char *ptr = skb->data;
+
+ if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */
+ goto drop_it;
+
+ skb_pull(skb, 5);
+ skb_reset_transport_header(skb);
+
+ cb->dst = *(__le16 *)ptr;
+ ptr += 2;
+ cb->src = *(__le16 *)ptr;
+ ptr += 2;
+ cb->hops = *ptr & 0x3f;
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ dn_route_rx_packet);
+
+drop_it:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static int dn_route_discard(struct sock *sk, struct sk_buff *skb)
+{
+ /*
+ * I know we drop the packet here, but thats considered success in
+ * this case
+ */
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+static int dn_route_ptp_hello(struct sock *sk, struct sk_buff *skb)
+{
+ dn_dev_hello(skb);
+ dn_neigh_pointopoint_hello(skb);
+ return NET_RX_SUCCESS;
+}
+
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dn_skb_cb *cb;
+ unsigned char flags = 0;
+ __u16 len = le16_to_cpu(*(__le16 *)skb->data);
+ struct dn_dev *dn = rcu_dereference(dev->dn_ptr);
+ unsigned char padlen = 0;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto dump_it;
+
+ if (dn == NULL)
+ goto dump_it;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out;
+
+ if (!pskb_may_pull(skb, 3))
+ goto dump_it;
+
+ skb_pull(skb, 2);
+
+ if (len > skb->len)
+ goto dump_it;
+
+ skb_trim(skb, len);
+
+ flags = *skb->data;
+
+ cb = DN_SKB_CB(skb);
+ cb->stamp = jiffies;
+ cb->iif = dev->ifindex;
+
+ /*
+ * If we have padding, remove it.
+ */
+ if (flags & DN_RT_F_PF) {
+ padlen = flags & ~DN_RT_F_PF;
+ if (!pskb_may_pull(skb, padlen + 1))
+ goto dump_it;
+ skb_pull(skb, padlen);
+ flags = *skb->data;
+ }
+
+ skb_reset_network_header(skb);
+
+ /*
+ * Weed out future version DECnet
+ */
+ if (flags & DN_RT_F_VER)
+ goto dump_it;
+
+ cb->rt_flags = flags;
+
+ if (decnet_debug_level & 1)
+ printk(KERN_DEBUG
+ "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n",
+ (int)flags, (dev) ? dev->name : "???", len, skb->len,
+ padlen);
+
+ if (flags & DN_RT_PKT_CNTL) {
+ if (unlikely(skb_linearize(skb)))
+ goto dump_it;
+
+ switch (flags & DN_RT_CNTL_MSK) {
+ case DN_RT_PKT_INIT:
+ dn_dev_init_pkt(skb);
+ break;
+ case DN_RT_PKT_VERI:
+ dn_dev_veri_pkt(skb);
+ break;
+ }
+
+ if (dn->parms.state != DN_DEV_S_RU)
+ goto dump_it;
+
+ switch (flags & DN_RT_CNTL_MSK) {
+ case DN_RT_PKT_HELO:
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+ NULL, skb, skb->dev, NULL,
+ dn_route_ptp_hello);
+
+ case DN_RT_PKT_L1RT:
+ case DN_RT_PKT_L2RT:
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE,
+ NULL, skb, skb->dev, NULL,
+ dn_route_discard);
+ case DN_RT_PKT_ERTH:
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+ NULL, skb, skb->dev, NULL,
+ dn_neigh_router_hello);
+
+ case DN_RT_PKT_EEDH:
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+ NULL, skb, skb->dev, NULL,
+ dn_neigh_endnode_hello);
+ }
+ } else {
+ if (dn->parms.state != DN_DEV_S_RU)
+ goto dump_it;
+
+ skb_pull(skb, 1); /* Pull flags */
+
+ switch (flags & DN_RT_PKT_MSK) {
+ case DN_RT_PKT_LONG:
+ return dn_route_rx_long(skb);
+ case DN_RT_PKT_SHORT:
+ return dn_route_rx_short(skb);
+ }
+ }
+
+dump_it:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
+
+static int dn_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct dn_route *rt = (struct dn_route *)dst;
+ struct net_device *dev = dst->dev;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+ int err = -EINVAL;
+
+ if (rt->n == NULL)
+ goto error;
+
+ skb->dev = dev;
+
+ cb->src = rt->rt_saddr;
+ cb->dst = rt->rt_daddr;
+
+ /*
+ * Always set the Intra-Ethernet bit on all outgoing packets
+ * originated on this node. Only valid flag from upper layers
+ * is return-to-sender-requested. Set hop count to 0 too.
+ */
+ cb->rt_flags &= ~DN_RT_F_RQR;
+ cb->rt_flags |= DN_RT_F_IE;
+ cb->hops = 0;
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, sk, skb,
+ NULL, dev,
+ dn_to_neigh_output);
+
+error:
+ net_dbg_ratelimited("dn_output: This should not happen\n");
+
+ kfree_skb(skb);
+
+ return err;
+}
+
+static int dn_forward(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct dst_entry *dst = skb_dst(skb);
+ struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr);
+ struct dn_route *rt;
+ int header_len;
+#ifdef CONFIG_NETFILTER
+ struct net_device *dev = skb->dev;
+#endif
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ /* Ensure that we have enough space for headers */
+ rt = (struct dn_route *)skb_dst(skb);
+ header_len = dn_db->use_long ? 21 : 6;
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len))
+ goto drop;
+
+ /*
+ * Hop count exceeded.
+ */
+ if (++cb->hops > 30)
+ goto drop;
+
+ skb->dev = rt->dst.dev;
+
+ /*
+ * If packet goes out same interface it came in on, then set
+ * the Intra-Ethernet bit. This has no effect for short
+ * packets, so we don't need to test for them here.
+ */
+ cb->rt_flags &= ~DN_RT_F_IE;
+ if (rt->rt_flags & RTCF_DOREDIRECT)
+ cb->rt_flags |= DN_RT_F_IE;
+
+ return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, NULL, skb,
+ dev, skb->dev,
+ dn_to_neigh_output);
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Used to catch bugs. This should never normally get
+ * called.
+ */
+static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+ net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n",
+ le16_to_cpu(cb->src), le16_to_cpu(cb->dst));
+
+ kfree_skb(skb);
+
+ return NET_RX_DROP;
+}
+
+static int dn_rt_bug(struct sk_buff *skb)
+{
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+ net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n",
+ le16_to_cpu(cb->src), le16_to_cpu(cb->dst));
+
+ kfree_skb(skb);
+
+ return NET_RX_DROP;
+}
+
+static unsigned int dn_dst_default_advmss(const struct dst_entry *dst)
+{
+ return dn_mss_from_pmtu(dst->dev, dst_mtu(dst));
+}
+
+static unsigned int dn_dst_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
+}
+
+static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return __neigh_lookup_errno(&dn_neigh_table, daddr, dst->dev);
+}
+
+static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
+{
+ struct dn_fib_info *fi = res->fi;
+ struct net_device *dev = rt->dst.dev;
+ unsigned int mss_metric;
+ struct neighbour *n;
+
+ if (fi) {
+ if (DN_FIB_RES_GW(*res) &&
+ DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = DN_FIB_RES_GW(*res);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ }
+ rt->rt_type = res->type;
+
+ if (dev != NULL && rt->n == NULL) {
+ n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
+ if (IS_ERR(n))
+ return PTR_ERR(n);
+ rt->n = n;
+ }
+
+ if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
+ dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
+ mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
+ if (mss_metric) {
+ unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
+ if (mss_metric > mss)
+ dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
+ }
+ return 0;
+}
+
+static inline int dn_match_addr(__le16 addr1, __le16 addr2)
+{
+ __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2);
+ int match = 16;
+ while(tmp) {
+ tmp >>= 1;
+ match--;
+ }
+ return match;
+}
+
+static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope)
+{
+ __le16 saddr = 0;
+ struct dn_dev *dn_db;
+ struct dn_ifaddr *ifa;
+ int best_match = 0;
+ int ret;
+
+ rcu_read_lock();
+ dn_db = rcu_dereference(dev->dn_ptr);
+ for (ifa = rcu_dereference(dn_db->ifa_list);
+ ifa != NULL;
+ ifa = rcu_dereference(ifa->ifa_next)) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!daddr) {
+ saddr = ifa->ifa_local;
+ break;
+ }
+ ret = dn_match_addr(daddr, ifa->ifa_local);
+ if (ret > best_match)
+ saddr = ifa->ifa_local;
+ if (best_match == 0)
+ saddr = ifa->ifa_local;
+ }
+ rcu_read_unlock();
+
+ return saddr;
+}
+
+static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
+{
+ return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope);
+}
+
+static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res)
+{
+ __le16 mask = dnet_make_mask(res->prefixlen);
+ return (daddr&~mask)|res->fi->fib_nh->nh_gw;
+}
+
+static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard)
+{
+ struct flowidn fld = {
+ .daddr = oldflp->daddr,
+ .saddr = oldflp->saddr,
+ .flowidn_scope = RT_SCOPE_UNIVERSE,
+ .flowidn_mark = oldflp->flowidn_mark,
+ .flowidn_iif = LOOPBACK_IFINDEX,
+ .flowidn_oif = oldflp->flowidn_oif,
+ };
+ struct dn_route *rt = NULL;
+ struct net_device *dev_out = NULL, *dev;
+ struct neighbour *neigh = NULL;
+ unsigned int hash;
+ unsigned int flags = 0;
+ struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST };
+ int err;
+ int free_res = 0;
+ __le16 gateway = 0;
+
+ if (decnet_debug_level & 16)
+ printk(KERN_DEBUG
+ "dn_route_output_slow: dst=%04x src=%04x mark=%d"
+ " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr),
+ le16_to_cpu(oldflp->saddr),
+ oldflp->flowidn_mark, LOOPBACK_IFINDEX,
+ oldflp->flowidn_oif);
+
+ /* If we have an output interface, verify its a DECnet device */
+ if (oldflp->flowidn_oif) {
+ dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif);
+ err = -ENODEV;
+ if (dev_out && dev_out->dn_ptr == NULL) {
+ dev_put(dev_out);
+ dev_out = NULL;
+ }
+ if (dev_out == NULL)
+ goto out;
+ }
+
+ /* If we have a source address, verify that its a local address */
+ if (oldflp->saddr) {
+ err = -EADDRNOTAVAIL;
+
+ if (dev_out) {
+ if (dn_dev_islocal(dev_out, oldflp->saddr))
+ goto source_ok;
+ dev_put(dev_out);
+ goto out;
+ }
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (!dev->dn_ptr)
+ continue;
+ if (!dn_dev_islocal(dev, oldflp->saddr))
+ continue;
+ if ((dev->flags & IFF_LOOPBACK) &&
+ oldflp->daddr &&
+ !dn_dev_islocal(dev, oldflp->daddr))
+ continue;
+
+ dev_out = dev;
+ break;
+ }
+ rcu_read_unlock();
+ if (dev_out == NULL)
+ goto out;
+ dev_hold(dev_out);
+source_ok:
+ ;
+ }
+
+ /* No destination? Assume its local */
+ if (!fld.daddr) {
+ fld.daddr = fld.saddr;
+
+ err = -EADDRNOTAVAIL;
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = init_net.loopback_dev;
+ dev_hold(dev_out);
+ if (!fld.daddr) {
+ fld.daddr =
+ fld.saddr = dnet_select_source(dev_out, 0,
+ RT_SCOPE_HOST);
+ if (!fld.daddr)
+ goto out;
+ }
+ fld.flowidn_oif = LOOPBACK_IFINDEX;
+ res.type = RTN_LOCAL;
+ goto make_route;
+ }
+
+ if (decnet_debug_level & 16)
+ printk(KERN_DEBUG
+ "dn_route_output_slow: initial checks complete."
+ " dst=%04x src=%04x oif=%d try_hard=%d\n",
+ le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr),
+ fld.flowidn_oif, try_hard);
+
+ /*
+ * N.B. If the kernel is compiled without router support then
+ * dn_fib_lookup() will evaluate to non-zero so this if () block
+ * will always be executed.
+ */
+ err = -ESRCH;
+ if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) {
+ struct dn_dev *dn_db;
+ if (err != -ESRCH)
+ goto out;
+ /*
+ * Here the fallback is basically the standard algorithm for
+ * routing in endnodes which is described in the DECnet routing
+ * docs
+ *
+ * If we are not trying hard, look in neighbour cache.
+ * The result is tested to ensure that if a specific output
+ * device/source address was requested, then we honour that
+ * here
+ */
+ if (!try_hard) {
+ neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr);
+ if (neigh) {
+ if ((oldflp->flowidn_oif &&
+ (neigh->dev->ifindex != oldflp->flowidn_oif)) ||
+ (oldflp->saddr &&
+ (!dn_dev_islocal(neigh->dev,
+ oldflp->saddr)))) {
+ neigh_release(neigh);
+ neigh = NULL;
+ } else {
+ if (dev_out)
+ dev_put(dev_out);
+ if (dn_dev_islocal(neigh->dev, fld.daddr)) {
+ dev_out = init_net.loopback_dev;
+ res.type = RTN_LOCAL;
+ } else {
+ dev_out = neigh->dev;
+ }
+ dev_hold(dev_out);
+ goto select_source;
+ }
+ }
+ }
+
+ /* Not there? Perhaps its a local address */
+ if (dev_out == NULL)
+ dev_out = dn_dev_get_default();
+ err = -ENODEV;
+ if (dev_out == NULL)
+ goto out;
+ dn_db = rcu_dereference_raw(dev_out->dn_ptr);
+ /* Possible improvement - check all devices for local addr */
+ if (dn_dev_islocal(dev_out, fld.daddr)) {
+ dev_put(dev_out);
+ dev_out = init_net.loopback_dev;
+ dev_hold(dev_out);
+ res.type = RTN_LOCAL;
+ goto select_source;
+ }
+ /* Not local either.... try sending it to the default router */
+ neigh = neigh_clone(dn_db->router);
+ BUG_ON(neigh && neigh->dev != dev_out);
+
+ /* Ok then, we assume its directly connected and move on */
+select_source:
+ if (neigh)
+ gateway = ((struct dn_neigh *)neigh)->addr;
+ if (gateway == 0)
+ gateway = fld.daddr;
+ if (fld.saddr == 0) {
+ fld.saddr = dnet_select_source(dev_out, gateway,
+ res.type == RTN_LOCAL ?
+ RT_SCOPE_HOST :
+ RT_SCOPE_LINK);
+ if (fld.saddr == 0 && res.type != RTN_LOCAL)
+ goto e_addr;
+ }
+ fld.flowidn_oif = dev_out->ifindex;
+ goto make_route;
+ }
+ free_res = 1;
+
+ if (res.type == RTN_NAT)
+ goto e_inval;
+
+ if (res.type == RTN_LOCAL) {
+ if (!fld.saddr)
+ fld.saddr = fld.daddr;
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = init_net.loopback_dev;
+ dev_hold(dev_out);
+ fld.flowidn_oif = dev_out->ifindex;
+ if (res.fi)
+ dn_fib_info_put(res.fi);
+ res.fi = NULL;
+ goto make_route;
+ }
+
+ if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+ dn_fib_select_multipath(&fld, &res);
+
+ /*
+ * We could add some logic to deal with default routes here and
+ * get rid of some of the special casing above.
+ */
+
+ if (!fld.saddr)
+ fld.saddr = DN_FIB_RES_PREFSRC(res);
+
+ if (dev_out)
+ dev_put(dev_out);
+ dev_out = DN_FIB_RES_DEV(res);
+ dev_hold(dev_out);
+ fld.flowidn_oif = dev_out->ifindex;
+ gateway = DN_FIB_RES_GW(res);
+
+make_route:
+ if (dev_out->flags & IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
+ rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
+ if (rt == NULL)
+ goto e_nobufs;
+
+ memset(&rt->fld, 0, sizeof(rt->fld));
+ rt->fld.saddr = oldflp->saddr;
+ rt->fld.daddr = oldflp->daddr;
+ rt->fld.flowidn_oif = oldflp->flowidn_oif;
+ rt->fld.flowidn_iif = 0;
+ rt->fld.flowidn_mark = oldflp->flowidn_mark;
+
+ rt->rt_saddr = fld.saddr;
+ rt->rt_daddr = fld.daddr;
+ rt->rt_gateway = gateway ? gateway : fld.daddr;
+ rt->rt_local_src = fld.saddr;
+
+ rt->rt_dst_map = fld.daddr;
+ rt->rt_src_map = fld.saddr;
+
+ rt->n = neigh;
+ neigh = NULL;
+
+ rt->dst.lastuse = jiffies;
+ rt->dst.output = dn_output;
+ rt->dst.input = dn_rt_bug;
+ rt->rt_flags = flags;
+ if (flags & RTCF_LOCAL)
+ rt->dst.input = dn_nsp_rx;
+
+ err = dn_rt_set_next_hop(rt, &res);
+ if (err)
+ goto e_neighbour;
+
+ hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ dn_insert_route(rt, hash, (struct dn_route **)pprt);
+
+done:
+ if (neigh)
+ neigh_release(neigh);
+ if (free_res)
+ dn_fib_res_put(&res);
+ if (dev_out)
+ dev_put(dev_out);
+out:
+ return err;
+
+e_addr:
+ err = -EADDRNOTAVAIL;
+ goto done;
+e_inval:
+ err = -EINVAL;
+ goto done;
+e_nobufs:
+ err = -ENOBUFS;
+ goto done;
+e_neighbour:
+ dst_free(&rt->dst);
+ goto e_nobufs;
+}
+
+
+/*
+ * N.B. The flags may be moved into the flowi at some future stage.
+ */
+static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags)
+{
+ unsigned int hash = dn_hash(flp->saddr, flp->daddr);
+ struct dn_route *rt = NULL;
+
+ if (!(flags & MSG_TRYHARD)) {
+ rcu_read_lock_bh();
+ for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
+ rt = rcu_dereference_bh(rt->dst.dn_next)) {
+ if ((flp->daddr == rt->fld.daddr) &&
+ (flp->saddr == rt->fld.saddr) &&
+ (flp->flowidn_mark == rt->fld.flowidn_mark) &&
+ dn_is_output_route(rt) &&
+ (rt->fld.flowidn_oif == flp->flowidn_oif)) {
+ dst_use(&rt->dst, jiffies);
+ rcu_read_unlock_bh();
+ *pprt = &rt->dst;
+ return 0;
+ }
+ }
+ rcu_read_unlock_bh();
+ }
+
+ return dn_route_output_slow(pprt, flp, flags);
+}
+
+static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags)
+{
+ int err;
+
+ err = __dn_route_output_key(pprt, flp, flags);
+ if (err == 0 && flp->flowidn_proto) {
+ *pprt = xfrm_lookup(&init_net, *pprt,
+ flowidn_to_flowi(flp), NULL, 0);
+ if (IS_ERR(*pprt)) {
+ err = PTR_ERR(*pprt);
+ *pprt = NULL;
+ }
+ }
+ return err;
+}
+
+int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, struct sock *sk, int flags)
+{
+ int err;
+
+ err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
+ if (err == 0 && fl->flowidn_proto) {
+ *pprt = xfrm_lookup(&init_net, *pprt,
+ flowidn_to_flowi(fl), sk, 0);
+ if (IS_ERR(*pprt)) {
+ err = PTR_ERR(*pprt);
+ *pprt = NULL;
+ }
+ }
+ return err;
+}
+
+static int dn_route_input_slow(struct sk_buff *skb)
+{
+ struct dn_route *rt = NULL;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ struct net_device *in_dev = skb->dev;
+ struct net_device *out_dev = NULL;
+ struct dn_dev *dn_db;
+ struct neighbour *neigh = NULL;
+ unsigned int hash;
+ int flags = 0;
+ __le16 gateway = 0;
+ __le16 local_src = 0;
+ struct flowidn fld = {
+ .daddr = cb->dst,
+ .saddr = cb->src,
+ .flowidn_scope = RT_SCOPE_UNIVERSE,
+ .flowidn_mark = skb->mark,
+ .flowidn_iif = skb->dev->ifindex,
+ };
+ struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE };
+ int err = -EINVAL;
+ int free_res = 0;
+
+ dev_hold(in_dev);
+
+ if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL)
+ goto out;
+
+ /* Zero source addresses are not allowed */
+ if (fld.saddr == 0)
+ goto out;
+
+ /*
+ * In this case we've just received a packet from a source
+ * outside ourselves pretending to come from us. We don't
+ * allow it any further to prevent routing loops, spoofing and
+ * other nasties. Loopback packets already have the dst attached
+ * so this only affects packets which have originated elsewhere.
+ */
+ err = -ENOTUNIQ;
+ if (dn_dev_islocal(in_dev, cb->src))
+ goto out;
+
+ err = dn_fib_lookup(&fld, &res);
+ if (err) {
+ if (err != -ESRCH)
+ goto out;
+ /*
+ * Is the destination us ?
+ */
+ if (!dn_dev_islocal(in_dev, cb->dst))
+ goto e_inval;
+
+ res.type = RTN_LOCAL;
+ } else {
+ __le16 src_map = fld.saddr;
+ free_res = 1;
+
+ out_dev = DN_FIB_RES_DEV(res);
+ if (out_dev == NULL) {
+ net_crit_ratelimited("Bug in dn_route_input_slow() No output device\n");
+ goto e_inval;
+ }
+ dev_hold(out_dev);
+
+ if (res.r)
+ src_map = fld.saddr; /* no NAT support for now */
+
+ gateway = DN_FIB_RES_GW(res);
+ if (res.type == RTN_NAT) {
+ fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res);
+ dn_fib_res_put(&res);
+ free_res = 0;
+ if (dn_fib_lookup(&fld, &res))
+ goto e_inval;
+ free_res = 1;
+ if (res.type != RTN_UNICAST)
+ goto e_inval;
+ flags |= RTCF_DNAT;
+ gateway = fld.daddr;
+ }
+ fld.saddr = src_map;
+ }
+
+ switch(res.type) {
+ case RTN_UNICAST:
+ /*
+ * Forwarding check here, we only check for forwarding
+ * being turned off, if you want to only forward intra
+ * area, its up to you to set the routing tables up
+ * correctly.
+ */
+ if (dn_db->parms.forwarding == 0)
+ goto e_inval;
+
+ if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+ dn_fib_select_multipath(&fld, &res);
+
+ /*
+ * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT
+ * flag as a hint to set the intra-ethernet bit when
+ * forwarding. If we've got NAT in operation, we don't do
+ * this optimisation.
+ */
+ if (out_dev == in_dev && !(flags & RTCF_NAT))
+ flags |= RTCF_DOREDIRECT;
+
+ local_src = DN_FIB_RES_PREFSRC(res);
+
+ case RTN_BLACKHOLE:
+ case RTN_UNREACHABLE:
+ break;
+ case RTN_LOCAL:
+ flags |= RTCF_LOCAL;
+ fld.saddr = cb->dst;
+ fld.daddr = cb->src;
+
+ /* Routing tables gave us a gateway */
+ if (gateway)
+ goto make_route;
+
+ /* Packet was intra-ethernet, so we know its on-link */
+ if (cb->rt_flags & DN_RT_F_IE) {
+ gateway = cb->src;
+ goto make_route;
+ }
+
+ /* Use the default router if there is one */
+ neigh = neigh_clone(dn_db->router);
+ if (neigh) {
+ gateway = ((struct dn_neigh *)neigh)->addr;
+ goto make_route;
+ }
+
+ /* Close eyes and pray */
+ gateway = cb->src;
+ goto make_route;
+ default:
+ goto e_inval;
+ }
+
+make_route:
+ rt = dst_alloc(&dn_dst_ops, out_dev, 0, DST_OBSOLETE_NONE, DST_HOST);
+ if (rt == NULL)
+ goto e_nobufs;
+
+ memset(&rt->fld, 0, sizeof(rt->fld));
+ rt->rt_saddr = fld.saddr;
+ rt->rt_daddr = fld.daddr;
+ rt->rt_gateway = fld.daddr;
+ if (gateway)
+ rt->rt_gateway = gateway;
+ rt->rt_local_src = local_src ? local_src : rt->rt_saddr;
+
+ rt->rt_dst_map = fld.daddr;
+ rt->rt_src_map = fld.saddr;
+
+ rt->fld.saddr = cb->src;
+ rt->fld.daddr = cb->dst;
+ rt->fld.flowidn_oif = 0;
+ rt->fld.flowidn_iif = in_dev->ifindex;
+ rt->fld.flowidn_mark = fld.flowidn_mark;
+
+ rt->n = neigh;
+ rt->dst.lastuse = jiffies;
+ rt->dst.output = dn_rt_bug_sk;
+ switch (res.type) {
+ case RTN_UNICAST:
+ rt->dst.input = dn_forward;
+ break;
+ case RTN_LOCAL:
+ rt->dst.output = dn_output;
+ rt->dst.input = dn_nsp_rx;
+ rt->dst.dev = in_dev;
+ flags |= RTCF_LOCAL;
+ break;
+ default:
+ case RTN_UNREACHABLE:
+ case RTN_BLACKHOLE:
+ rt->dst.input = dst_discard;
+ }
+ rt->rt_flags = flags;
+
+ err = dn_rt_set_next_hop(rt, &res);
+ if (err)
+ goto e_neighbour;
+
+ hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+ dn_insert_route(rt, hash, &rt);
+ skb_dst_set(skb, &rt->dst);
+
+done:
+ if (neigh)
+ neigh_release(neigh);
+ if (free_res)
+ dn_fib_res_put(&res);
+ dev_put(in_dev);
+ if (out_dev)
+ dev_put(out_dev);
+out:
+ return err;
+
+e_inval:
+ err = -EINVAL;
+ goto done;
+
+e_nobufs:
+ err = -ENOBUFS;
+ goto done;
+
+e_neighbour:
+ dst_free(&rt->dst);
+ goto done;
+}
+
+static int dn_route_input(struct sk_buff *skb)
+{
+ struct dn_route *rt;
+ struct dn_skb_cb *cb = DN_SKB_CB(skb);
+ unsigned int hash = dn_hash(cb->src, cb->dst);
+
+ if (skb_dst(skb))
+ return 0;
+
+ rcu_read_lock();
+ for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
+ rt = rcu_dereference(rt->dst.dn_next)) {
+ if ((rt->fld.saddr == cb->src) &&
+ (rt->fld.daddr == cb->dst) &&
+ (rt->fld.flowidn_oif == 0) &&
+ (rt->fld.flowidn_mark == skb->mark) &&
+ (rt->fld.flowidn_iif == cb->iif)) {
+ dst_use(&rt->dst, jiffies);
+ rcu_read_unlock();
+ skb_dst_set(skb, (struct dst_entry *)rt);
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ return dn_route_input_slow(skb);
+}
+
+static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, int nowait, unsigned int flags)
+{
+ struct dn_route *rt = (struct dn_route *)skb_dst(skb);
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ long expires;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ r->rtm_family = AF_DECnet;
+ r->rtm_dst_len = 16;
+ r->rtm_src_len = 0;
+ r->rtm_tos = 0;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN) < 0 ||
+ nla_put_le16(skb, RTA_DST, rt->rt_daddr) < 0)
+ goto errout;
+
+ if (rt->fld.saddr) {
+ r->rtm_src_len = 16;
+ if (nla_put_le16(skb, RTA_SRC, rt->fld.saddr) < 0)
+ goto errout;
+ }
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex) < 0)
+ goto errout;
+
+ /*
+ * Note to self - change this if input routes reverse direction when
+ * they deal only with inputs and not with replies like they do
+ * currently.
+ */
+ if (nla_put_le16(skb, RTA_PREFSRC, rt->rt_local_src) < 0)
+ goto errout;
+
+ if (rt->rt_daddr != rt->rt_gateway &&
+ nla_put_le16(skb, RTA_GATEWAY, rt->rt_gateway) < 0)
+ goto errout;
+
+ if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+ goto errout;
+
+ expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
+ rt->dst.error) < 0)
+ goto errout;
+
+ if (dn_is_input_route(rt) &&
+ nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0)
+ goto errout;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = {
+ [RTA_DST] = { .type = NLA_U16 },
+ [RTA_SRC] = { .type = NLA_U16 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+ [RTA_GATEWAY] = { .type = NLA_U16 },
+ [RTA_PRIORITY] = { .type = NLA_U32 },
+ [RTA_PREFSRC] = { .type = NLA_U16 },
+ [RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .type = NLA_NESTED },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_MARK] = { .type = NLA_U32 },
+};
+
+/*
+ * This is called by both endnodes and routers now.
+ */
+static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct rtmsg *rtm = nlmsg_data(nlh);
+ struct dn_route *rt = NULL;
+ struct dn_skb_cb *cb;
+ int err;
+ struct sk_buff *skb;
+ struct flowidn fld;
+ struct nlattr *tb[RTA_MAX+1];
+
+ if (!net_eq(net, &init_net))
+ return -EINVAL;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy);
+ if (err < 0)
+ return err;
+
+ memset(&fld, 0, sizeof(fld));
+ fld.flowidn_proto = DNPROTO_NSP;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOBUFS;
+ skb_reset_mac_header(skb);
+ cb = DN_SKB_CB(skb);
+
+ if (tb[RTA_SRC])
+ fld.saddr = nla_get_le16(tb[RTA_SRC]);
+
+ if (tb[RTA_DST])
+ fld.daddr = nla_get_le16(tb[RTA_DST]);
+
+ if (tb[RTA_IIF])
+ fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]);
+
+ if (fld.flowidn_iif) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(&init_net, fld.flowidn_iif);
+ if (!dev || !dev->dn_ptr) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ skb->protocol = htons(ETH_P_DNA_RT);
+ skb->dev = dev;
+ cb->src = fld.saddr;
+ cb->dst = fld.daddr;
+ local_bh_disable();
+ err = dn_route_input(skb);
+ local_bh_enable();
+ memset(cb, 0, sizeof(struct dn_skb_cb));
+ rt = (struct dn_route *)skb_dst(skb);
+ if (!err && -rt->dst.error)
+ err = rt->dst.error;
+ } else {
+ if (tb[RTA_OIF])
+ fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]);
+
+ err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0);
+ }
+
+ skb->dev = NULL;
+ if (err)
+ goto out_free;
+ skb_dst_set(skb, &rt->dst);
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
+ rt->rt_flags |= RTCF_NOTIFY;
+
+ err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
+ if (err < 0) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+
+ return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).portid);
+
+out_free:
+ kfree_skb(skb);
+ return err;
+}
+
+/*
+ * For routers, this is called from dn_fib_dump, but for endnodes its
+ * called directly from the rtnetlink dispatch table.
+ */
+int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct dn_route *rt;
+ int h, s_h;
+ int idx, s_idx;
+ struct rtmsg *rtm;
+
+ if (!net_eq(net, &init_net))
+ return 0;
+
+ if (nlmsg_len(cb->nlh) < sizeof(struct rtmsg))
+ return -EINVAL;
+
+ rtm = nlmsg_data(cb->nlh);
+ if (!(rtm->rtm_flags & RTM_F_CLONED))
+ return 0;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ for(h = 0; h <= dn_rt_hash_mask; h++) {
+ if (h < s_h)
+ continue;
+ if (h > s_h)
+ s_idx = 0;
+ rcu_read_lock_bh();
+ for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
+ rt;
+ rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
+ if (idx < s_idx)
+ continue;
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ 1, NLM_F_MULTI) < 0) {
+ skb_dst_drop(skb);
+ rcu_read_unlock_bh();
+ goto done;
+ }
+ skb_dst_drop(skb);
+ }
+ rcu_read_unlock_bh();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+struct dn_rt_cache_iter_state {
+ int bucket;
+};
+
+static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq)
+{
+ struct dn_route *rt = NULL;
+ struct dn_rt_cache_iter_state *s = seq->private;
+
+ for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) {
+ rcu_read_lock_bh();
+ rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
+ if (rt)
+ break;
+ rcu_read_unlock_bh();
+ }
+ return rt;
+}
+
+static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt)
+{
+ struct dn_rt_cache_iter_state *s = seq->private;
+
+ rt = rcu_dereference_bh(rt->dst.dn_next);
+ while (!rt) {
+ rcu_read_unlock_bh();
+ if (--s->bucket < 0)
+ break;
+ rcu_read_lock_bh();
+ rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
+ }
+ return rt;
+}
+
+static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct dn_route *rt = dn_rt_cache_get_first(seq);
+
+ if (rt) {
+ while(*pos && (rt = dn_rt_cache_get_next(seq, rt)))
+ --*pos;
+ }
+ return *pos ? NULL : rt;
+}
+
+static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dn_route *rt = dn_rt_cache_get_next(seq, v);
+ ++*pos;
+ return rt;
+}
+
+static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+ if (v)
+ rcu_read_unlock_bh();
+}
+
+static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+ struct dn_route *rt = v;
+ char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
+
+ seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
+ rt->dst.dev ? rt->dst.dev->name : "*",
+ dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
+ dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
+ atomic_read(&rt->dst.__refcnt),
+ rt->dst.__use, 0);
+ return 0;
+}
+
+static const struct seq_operations dn_rt_cache_seq_ops = {
+ .start = dn_rt_cache_seq_start,
+ .next = dn_rt_cache_seq_next,
+ .stop = dn_rt_cache_seq_stop,
+ .show = dn_rt_cache_seq_show,
+};
+
+static int dn_rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &dn_rt_cache_seq_ops,
+ sizeof(struct dn_rt_cache_iter_state));
+}
+
+static const struct file_operations dn_rt_cache_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = dn_rt_cache_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+void __init dn_route_init(void)
+{
+ int i, goal, order;
+
+ dn_dst_ops.kmem_cachep =
+ kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ dst_entries_init(&dn_dst_ops);
+ setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
+ dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
+ add_timer(&dn_route_timer);
+
+ goal = totalram_pages >> (26 - PAGE_SHIFT);
+
+ for(order = 0; (1UL << order) < goal; order++)
+ /* NOTHING */;
+
+ /*
+ * Only want 1024 entries max, since the table is very, very unlikely
+ * to be larger than that.
+ */
+ while(order && ((((1UL << order) * PAGE_SIZE) /
+ sizeof(struct dn_rt_hash_bucket)) >= 2048))
+ order--;
+
+ do {
+ dn_rt_hash_mask = (1UL << order) * PAGE_SIZE /
+ sizeof(struct dn_rt_hash_bucket);
+ while(dn_rt_hash_mask & (dn_rt_hash_mask - 1))
+ dn_rt_hash_mask--;
+ dn_rt_hash_table = (struct dn_rt_hash_bucket *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (dn_rt_hash_table == NULL && --order > 0);
+
+ if (!dn_rt_hash_table)
+ panic("Failed to allocate DECnet route cache hash table\n");
+
+ printk(KERN_INFO
+ "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n",
+ dn_rt_hash_mask,
+ (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024);
+
+ dn_rt_hash_mask--;
+ for(i = 0; i <= dn_rt_hash_mask; i++) {
+ spin_lock_init(&dn_rt_hash_table[i].lock);
+ dn_rt_hash_table[i].chain = NULL;
+ }
+
+ dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
+
+ proc_create("decnet_cache", S_IRUGO, init_net.proc_net,
+ &dn_rt_cache_seq_fops);
+
+#ifdef CONFIG_DECNET_ROUTER
+ rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
+ dn_fib_dump, NULL);
+#else
+ rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
+ dn_cache_dump, NULL);
+#endif
+}
+
+void __exit dn_route_cleanup(void)
+{
+ del_timer(&dn_route_timer);
+ dn_run_flush(0);
+
+ remove_proc_entry("decnet_cache", init_net.proc_net);
+ dst_entries_destroy(&dn_dst_ops);
+}
+
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
new file mode 100644
index 000000000..9d66a0f72
--- /dev/null
+++ b/net/decnet/dn_rules.c
@@ -0,0 +1,257 @@
+
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Routing Forwarding Information Base (Rules)
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c
+ *
+ *
+ * Changes:
+ * Steve Whitehouse <steve@chygwyn.com>
+ * Updated for Thomas Graf's generic rules
+ *
+ */
+#include <linux/net.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+static struct fib_rules_ops *dn_fib_rules_ops;
+
+struct dn_fib_rule
+{
+ struct fib_rule common;
+ unsigned char dst_len;
+ unsigned char src_len;
+ __le16 src;
+ __le16 srcmask;
+ __le16 dst;
+ __le16 dstmask;
+ __le16 srcmap;
+ u8 flags;
+};
+
+
+int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res)
+{
+ struct fib_lookup_arg arg = {
+ .result = res,
+ };
+ int err;
+
+ err = fib_rules_lookup(dn_fib_rules_ops,
+ flowidn_to_flowi(flp), 0, &arg);
+ res->r = arg.rule;
+
+ return err;
+}
+
+static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowidn *fld = &flp->u.dn;
+ int err = -EAGAIN;
+ struct dn_fib_table *tbl;
+
+ switch(rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+
+ case FR_ACT_UNREACHABLE:
+ err = -ENETUNREACH;
+ goto errout;
+
+ case FR_ACT_PROHIBIT:
+ err = -EACCES;
+ goto errout;
+
+ case FR_ACT_BLACKHOLE:
+ default:
+ err = -EINVAL;
+ goto errout;
+ }
+
+ tbl = dn_fib_get_table(rule->table, 0);
+ if (tbl == NULL)
+ goto errout;
+
+ err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result);
+ if (err > 0)
+ err = -EAGAIN;
+errout:
+ return err;
+}
+
+static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+ struct flowidn *fld = &fl->u.dn;
+ __le16 daddr = fld->daddr;
+ __le16 saddr = fld->saddr;
+
+ if (((saddr ^ r->src) & r->srcmask) ||
+ ((daddr ^ r->dst) & r->dstmask))
+ return 0;
+
+ return 1;
+}
+
+static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ int err = -EINVAL;
+ struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+ if (frh->tos)
+ goto errout;
+
+ if (rule->table == RT_TABLE_UNSPEC) {
+ if (rule->action == FR_ACT_TO_TBL) {
+ struct dn_fib_table *table;
+
+ table = dn_fib_empty_table();
+ if (table == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ rule->table = table->n;
+ }
+ }
+
+ if (frh->src_len)
+ r->src = nla_get_le16(tb[FRA_SRC]);
+
+ if (frh->dst_len)
+ r->dst = nla_get_le16(tb[FRA_DST]);
+
+ r->src_len = frh->src_len;
+ r->srcmask = dnet_make_mask(r->src_len);
+ r->dst_len = frh->dst_len;
+ r->dstmask = dnet_make_mask(r->dst_len);
+ err = 0;
+errout:
+ return err;
+}
+
+static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+ if (frh->src_len && (r->src_len != frh->src_len))
+ return 0;
+
+ if (frh->dst_len && (r->dst_len != frh->dst_len))
+ return 0;
+
+ if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC])))
+ return 0;
+
+ if (frh->dst_len && (r->dst != nla_get_le16(tb[FRA_DST])))
+ return 0;
+
+ return 1;
+}
+
+unsigned int dnet_addr_type(__le16 addr)
+{
+ struct flowidn fld = { .daddr = addr };
+ struct dn_fib_res res;
+ unsigned int ret = RTN_UNICAST;
+ struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
+
+ res.r = NULL;
+
+ if (tb) {
+ if (!tb->lookup(tb, &fld, &res)) {
+ ret = res.type;
+ dn_fib_res_put(&res);
+ }
+ }
+ return ret;
+}
+
+static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+ frh->dst_len = r->dst_len;
+ frh->src_len = r->src_len;
+ frh->tos = 0;
+
+ if ((r->dst_len &&
+ nla_put_le16(skb, FRA_DST, r->dst)) ||
+ (r->src_len &&
+ nla_put_le16(skb, FRA_SRC, r->src)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ dn_rt_cache_flush(-1);
+}
+
+static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
+ .family = AF_DECnet,
+ .rule_size = sizeof(struct dn_fib_rule),
+ .addr_size = sizeof(u16),
+ .action = dn_fib_rule_action,
+ .match = dn_fib_rule_match,
+ .configure = dn_fib_rule_configure,
+ .compare = dn_fib_rule_compare,
+ .fill = dn_fib_rule_fill,
+ .default_pref = fib_default_rule_pref,
+ .flush_cache = dn_fib_rule_flush_cache,
+ .nlgroup = RTNLGRP_DECnet_RULE,
+ .policy = dn_fib_rule_policy,
+ .owner = THIS_MODULE,
+ .fro_net = &init_net,
+};
+
+void __init dn_fib_rules_init(void)
+{
+ dn_fib_rules_ops =
+ fib_rules_register(&dn_fib_rules_ops_template, &init_net);
+ BUG_ON(IS_ERR(dn_fib_rules_ops));
+ BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff,
+ RT_TABLE_MAIN, 0));
+}
+
+void __exit dn_fib_rules_cleanup(void)
+{
+ rtnl_lock();
+ fib_rules_unregister(dn_fib_rules_ops);
+ rtnl_unlock();
+ rcu_barrier();
+}
+
+
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
new file mode 100644
index 000000000..1540b506e
--- /dev/null
+++ b/net/decnet/dn_table.c
@@ -0,0 +1,926 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Routing Forwarding Information Base (Routing Tables)
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ * Mostly copied from the IPv4 routing code
+ *
+ *
+ * Changes:
+ *
+ */
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/route.h> /* RTF_xxx */
+#include <net/neighbour.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+
+struct dn_zone
+{
+ struct dn_zone *dz_next;
+ struct dn_fib_node **dz_hash;
+ int dz_nent;
+ int dz_divisor;
+ u32 dz_hashmask;
+#define DZ_HASHMASK(dz) ((dz)->dz_hashmask)
+ int dz_order;
+ __le16 dz_mask;
+#define DZ_MASK(dz) ((dz)->dz_mask)
+};
+
+struct dn_hash
+{
+ struct dn_zone *dh_zones[17];
+ struct dn_zone *dh_zone_list;
+};
+
+#define dz_key_0(key) ((key).datum = 0)
+
+#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
+ for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define endfor_nexthops(fi) }
+
+#define DN_MAX_DIVISOR 1024
+#define DN_S_ZOMBIE 1
+#define DN_S_ACCESSED 2
+
+#define DN_FIB_SCAN(f, fp) \
+for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
+
+#define DN_FIB_SCAN_KEY(f, fp, key) \
+for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
+
+#define RT_TABLE_MIN 1
+#define DN_FIB_TABLE_HASHSZ 256
+static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
+static DEFINE_RWLOCK(dn_fib_tables_lock);
+
+static struct kmem_cache *dn_hash_kmem __read_mostly;
+static int dn_fib_hash_zombies;
+
+static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
+{
+ u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order);
+ h ^= (h >> 10);
+ h ^= (h >> 6);
+ h &= DZ_HASHMASK(dz);
+ return *(dn_fib_idx_t *)&h;
+}
+
+static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz)
+{
+ dn_fib_key_t k;
+ k.datum = dst & DZ_MASK(dz);
+ return k;
+}
+
+static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz)
+{
+ return &dz->dz_hash[dn_hash(key, dz).datum];
+}
+
+static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz)
+{
+ return dz->dz_hash[dn_hash(key, dz).datum];
+}
+
+static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b)
+{
+ return a.datum == b.datum;
+}
+
+static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b)
+{
+ return a.datum <= b.datum;
+}
+
+static inline void dn_rebuild_zone(struct dn_zone *dz,
+ struct dn_fib_node **old_ht,
+ int old_divisor)
+{
+ struct dn_fib_node *f, **fp, *next;
+ int i;
+
+ for(i = 0; i < old_divisor; i++) {
+ for(f = old_ht[i]; f; f = next) {
+ next = f->fn_next;
+ for(fp = dn_chain_p(f->fn_key, dz);
+ *fp && dn_key_leq((*fp)->fn_key, f->fn_key);
+ fp = &(*fp)->fn_next)
+ /* NOTHING */;
+ f->fn_next = *fp;
+ *fp = f;
+ }
+ }
+}
+
+static void dn_rehash_zone(struct dn_zone *dz)
+{
+ struct dn_fib_node **ht, **old_ht;
+ int old_divisor, new_divisor;
+ u32 new_hashmask;
+
+ old_divisor = dz->dz_divisor;
+
+ switch (old_divisor) {
+ case 16:
+ new_divisor = 256;
+ new_hashmask = 0xFF;
+ break;
+ default:
+ printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n",
+ old_divisor);
+ case 256:
+ new_divisor = 1024;
+ new_hashmask = 0x3FF;
+ break;
+ }
+
+ ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL);
+ if (ht == NULL)
+ return;
+
+ write_lock_bh(&dn_fib_tables_lock);
+ old_ht = dz->dz_hash;
+ dz->dz_hash = ht;
+ dz->dz_hashmask = new_hashmask;
+ dz->dz_divisor = new_divisor;
+ dn_rebuild_zone(dz, old_ht, old_divisor);
+ write_unlock_bh(&dn_fib_tables_lock);
+ kfree(old_ht);
+}
+
+static void dn_free_node(struct dn_fib_node *f)
+{
+ dn_fib_release_info(DN_FIB_INFO(f));
+ kmem_cache_free(dn_hash_kmem, f);
+}
+
+
+static struct dn_zone *dn_new_zone(struct dn_hash *table, int z)
+{
+ int i;
+ struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL);
+ if (!dz)
+ return NULL;
+
+ if (z) {
+ dz->dz_divisor = 16;
+ dz->dz_hashmask = 0x0F;
+ } else {
+ dz->dz_divisor = 1;
+ dz->dz_hashmask = 0;
+ }
+
+ dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL);
+ if (!dz->dz_hash) {
+ kfree(dz);
+ return NULL;
+ }
+
+ dz->dz_order = z;
+ dz->dz_mask = dnet_make_mask(z);
+
+ for(i = z + 1; i <= 16; i++)
+ if (table->dh_zones[i])
+ break;
+
+ write_lock_bh(&dn_fib_tables_lock);
+ if (i>16) {
+ dz->dz_next = table->dh_zone_list;
+ table->dh_zone_list = dz;
+ } else {
+ dz->dz_next = table->dh_zones[i]->dz_next;
+ table->dh_zones[i]->dz_next = dz;
+ }
+ table->dh_zones[z] = dz;
+ write_unlock_bh(&dn_fib_tables_lock);
+ return dz;
+}
+
+
+static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi)
+{
+ struct rtnexthop *nhp;
+ int nhlen;
+
+ if (attrs[RTA_PRIORITY] &&
+ nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority)
+ return 1;
+
+ if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) {
+ if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) &&
+ (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw))
+ return 0;
+ return 1;
+ }
+
+ if (!attrs[RTA_MULTIPATH])
+ return 0;
+
+ nhp = nla_data(attrs[RTA_MULTIPATH]);
+ nhlen = nla_len(attrs[RTA_MULTIPATH]);
+
+ for_nexthops(fi) {
+ int attrlen = nhlen - sizeof(struct rtnexthop);
+ __le16 gw;
+
+ if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+ return -EINVAL;
+ if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+ return 1;
+ if (attrlen) {
+ struct nlattr *gw_attr;
+
+ gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
+ gw = gw_attr ? nla_get_le16(gw_attr) : 0;
+
+ if (gw && gw != nh->nh_gw)
+ return 1;
+ }
+ nhp = RTNH_NEXT(nhp);
+ } endfor_nexthops(fi);
+
+ return 0;
+}
+
+static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(2) /* RTA_DST */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+
+ /* space for nested metrics */
+ payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+ if (fi->fib_nhs) {
+ /* Also handles the special case fib_nhs == 1 */
+
+ /* each nexthop is packed in an attribute */
+ size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+
+ /* may contain a gateway attribute */
+ nhsize += nla_total_size(4);
+
+ /* all nexthops are packed in a nested attribute */
+ payload += nla_total_size(fi->fib_nhs * nhsize);
+ }
+
+ return payload;
+}
+
+static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 tb_id, u8 type, u8 scope, void *dst, int dst_len,
+ struct dn_fib_info *fi, unsigned int flags)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_DECnet;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = tb_id;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = scope;
+ rtm->rtm_type = type;
+ rtm->rtm_protocol = fi->fib_protocol;
+
+ if (nla_put_u32(skb, RTA_TABLE, tb_id) < 0)
+ goto errout;
+
+ if (rtm->rtm_dst_len &&
+ nla_put(skb, RTA_DST, 2, dst) < 0)
+ goto errout;
+
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority) < 0)
+ goto errout;
+
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto errout;
+
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw &&
+ nla_put_le16(skb, RTA_GATEWAY, fi->fib_nh->nh_gw) < 0)
+ goto errout;
+
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif) < 0)
+ goto errout;
+ }
+
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *nhp;
+ struct nlattr *mp_head;
+
+ if (!(mp_head = nla_nest_start(skb, RTA_MULTIPATH)))
+ goto errout;
+
+ for_nexthops(fi) {
+ if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp))))
+ goto errout;
+
+ nhp->rtnh_flags = nh->nh_flags & 0xFF;
+ nhp->rtnh_hops = nh->nh_weight - 1;
+ nhp->rtnh_ifindex = nh->nh_oif;
+
+ if (nh->nh_gw &&
+ nla_put_le16(skb, RTA_GATEWAY, nh->nh_gw) < 0)
+ goto errout;
+
+ nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp;
+ } endfor_nexthops(fi);
+
+ nla_nest_end(skb, mp_head);
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+
+static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
+ struct nlmsghdr *nlh, struct netlink_skb_parms *req)
+{
+ struct sk_buff *skb;
+ u32 portid = req ? req->portid : 0;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = dn_fib_dump_info(skb, portid, nlh->nlmsg_seq, event, tb_id,
+ f->fn_type, f->fn_scope, &f->fn_key, z,
+ DN_FIB_INFO(f), 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, &init_net, portid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
+}
+
+static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct dn_fib_table *tb,
+ struct dn_zone *dz,
+ struct dn_fib_node *f)
+{
+ int i, s_i;
+
+ s_i = cb->args[4];
+ for(i = 0; f; i++, f = f->fn_next) {
+ if (i < s_i)
+ continue;
+ if (f->fn_state & DN_S_ZOMBIE)
+ continue;
+ if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->n,
+ (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
+ f->fn_scope, &f->fn_key, dz->dz_order,
+ f->fn_info, NLM_F_MULTI) < 0) {
+ cb->args[4] = i;
+ return -1;
+ }
+ }
+ cb->args[4] = i;
+ return skb->len;
+}
+
+static __inline__ int dn_hash_dump_zone(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct dn_fib_table *tb,
+ struct dn_zone *dz)
+{
+ int h, s_h;
+
+ s_h = cb->args[3];
+ for(h = 0; h < dz->dz_divisor; h++) {
+ if (h < s_h)
+ continue;
+ if (h > s_h)
+ memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0]));
+ if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL)
+ continue;
+ if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) {
+ cb->args[3] = h;
+ return -1;
+ }
+ }
+ cb->args[3] = h;
+ return skb->len;
+}
+
+static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int m, s_m;
+ struct dn_zone *dz;
+ struct dn_hash *table = (struct dn_hash *)tb->data;
+
+ s_m = cb->args[2];
+ read_lock(&dn_fib_tables_lock);
+ for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) {
+ if (m < s_m)
+ continue;
+ if (m > s_m)
+ memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
+
+ if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) {
+ cb->args[2] = m;
+ read_unlock(&dn_fib_tables_lock);
+ return -1;
+ }
+ }
+ read_unlock(&dn_fib_tables_lock);
+ cb->args[2] = m;
+
+ return skb->len;
+}
+
+int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int h, s_h;
+ unsigned int e = 0, s_e;
+ struct dn_fib_table *tb;
+ int dumped = 0;
+
+ if (!net_eq(net, &init_net))
+ return 0;
+
+ if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return dn_cache_dump(skb, cb);
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) {
+ e = 0;
+ hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) {
+ if (e < s_e)
+ goto next;
+ if (dumped)
+ memset(&cb->args[2], 0, sizeof(cb->args) -
+ 2 * sizeof(cb->args[0]));
+ if (tb->dump(tb, skb, cb) < 0)
+ goto out;
+ dumped = 1;
+next:
+ e++;
+ }
+ }
+out:
+ cb->args[1] = e;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct dn_hash *table = (struct dn_hash *)tb->data;
+ struct dn_fib_node *new_f, *f, **fp, **del_fp;
+ struct dn_zone *dz;
+ struct dn_fib_info *fi;
+ int z = r->rtm_dst_len;
+ int type = r->rtm_type;
+ dn_fib_key_t key;
+ int err;
+
+ if (z > 16)
+ return -EINVAL;
+
+ dz = table->dh_zones[z];
+ if (!dz && !(dz = dn_new_zone(table, z)))
+ return -ENOBUFS;
+
+ dz_key_0(key);
+ if (attrs[RTA_DST]) {
+ __le16 dst = nla_get_le16(attrs[RTA_DST]);
+ if (dst & ~DZ_MASK(dz))
+ return -EINVAL;
+ key = dz_key(dst, dz);
+ }
+
+ if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL)
+ return err;
+
+ if (dz->dz_nent > (dz->dz_divisor << 2) &&
+ dz->dz_divisor > DN_MAX_DIVISOR &&
+ (z==16 || (1<<z) > dz->dz_divisor))
+ dn_rehash_zone(dz);
+
+ fp = dn_chain_p(key, dz);
+
+ DN_FIB_SCAN(f, fp) {
+ if (dn_key_leq(key, f->fn_key))
+ break;
+ }
+
+ del_fp = NULL;
+
+ if (f && (f->fn_state & DN_S_ZOMBIE) &&
+ dn_key_eq(f->fn_key, key)) {
+ del_fp = fp;
+ fp = &f->fn_next;
+ f = *fp;
+ goto create;
+ }
+
+ DN_FIB_SCAN_KEY(f, fp, key) {
+ if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority)
+ break;
+ }
+
+ if (f && dn_key_eq(f->fn_key, key) &&
+ fi->fib_priority == DN_FIB_INFO(f)->fib_priority) {
+ struct dn_fib_node **ins_fp;
+
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+
+ if (n->nlmsg_flags & NLM_F_REPLACE) {
+ del_fp = fp;
+ fp = &f->fn_next;
+ f = *fp;
+ goto replace;
+ }
+
+ ins_fp = fp;
+ err = -EEXIST;
+
+ DN_FIB_SCAN_KEY(f, fp, key) {
+ if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority)
+ break;
+ if (f->fn_type == type &&
+ f->fn_scope == r->rtm_scope &&
+ DN_FIB_INFO(f) == fi)
+ goto out;
+ }
+
+ if (!(n->nlmsg_flags & NLM_F_APPEND)) {
+ fp = ins_fp;
+ f = *fp;
+ }
+ }
+
+create:
+ err = -ENOENT;
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
+ goto out;
+
+replace:
+ err = -ENOBUFS;
+ new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL);
+ if (new_f == NULL)
+ goto out;
+
+ new_f->fn_key = key;
+ new_f->fn_type = type;
+ new_f->fn_scope = r->rtm_scope;
+ DN_FIB_INFO(new_f) = fi;
+
+ new_f->fn_next = f;
+ write_lock_bh(&dn_fib_tables_lock);
+ *fp = new_f;
+ write_unlock_bh(&dn_fib_tables_lock);
+ dz->dz_nent++;
+
+ if (del_fp) {
+ f = *del_fp;
+ write_lock_bh(&dn_fib_tables_lock);
+ *del_fp = f->fn_next;
+ write_unlock_bh(&dn_fib_tables_lock);
+
+ if (!(f->fn_state & DN_S_ZOMBIE))
+ dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
+ if (f->fn_state & DN_S_ACCESSED)
+ dn_rt_cache_flush(-1);
+ dn_free_node(f);
+ dz->dz_nent--;
+ } else {
+ dn_rt_cache_flush(-1);
+ }
+
+ dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req);
+
+ return 0;
+out:
+ dn_fib_release_info(fi);
+ return err;
+}
+
+
+static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
+ struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+ struct dn_hash *table = (struct dn_hash*)tb->data;
+ struct dn_fib_node **fp, **del_fp, *f;
+ int z = r->rtm_dst_len;
+ struct dn_zone *dz;
+ dn_fib_key_t key;
+ int matched;
+
+
+ if (z > 16)
+ return -EINVAL;
+
+ if ((dz = table->dh_zones[z]) == NULL)
+ return -ESRCH;
+
+ dz_key_0(key);
+ if (attrs[RTA_DST]) {
+ __le16 dst = nla_get_le16(attrs[RTA_DST]);
+ if (dst & ~DZ_MASK(dz))
+ return -EINVAL;
+ key = dz_key(dst, dz);
+ }
+
+ fp = dn_chain_p(key, dz);
+
+ DN_FIB_SCAN(f, fp) {
+ if (dn_key_eq(f->fn_key, key))
+ break;
+ if (dn_key_leq(key, f->fn_key))
+ return -ESRCH;
+ }
+
+ matched = 0;
+ del_fp = NULL;
+ DN_FIB_SCAN_KEY(f, fp, key) {
+ struct dn_fib_info *fi = DN_FIB_INFO(f);
+
+ if (f->fn_state & DN_S_ZOMBIE)
+ return -ESRCH;
+
+ matched++;
+
+ if (del_fp == NULL &&
+ (!r->rtm_type || f->fn_type == r->rtm_type) &&
+ (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
+ (!r->rtm_protocol ||
+ fi->fib_protocol == r->rtm_protocol) &&
+ dn_fib_nh_match(r, n, attrs, fi) == 0)
+ del_fp = fp;
+ }
+
+ if (del_fp) {
+ f = *del_fp;
+ dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
+
+ if (matched != 1) {
+ write_lock_bh(&dn_fib_tables_lock);
+ *del_fp = f->fn_next;
+ write_unlock_bh(&dn_fib_tables_lock);
+
+ if (f->fn_state & DN_S_ACCESSED)
+ dn_rt_cache_flush(-1);
+ dn_free_node(f);
+ dz->dz_nent--;
+ } else {
+ f->fn_state |= DN_S_ZOMBIE;
+ if (f->fn_state & DN_S_ACCESSED) {
+ f->fn_state &= ~DN_S_ACCESSED;
+ dn_rt_cache_flush(-1);
+ }
+ if (++dn_fib_hash_zombies > 128)
+ dn_fib_flush();
+ }
+
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table)
+{
+ int found = 0;
+ struct dn_fib_node *f;
+
+ while((f = *fp) != NULL) {
+ struct dn_fib_info *fi = DN_FIB_INFO(f);
+
+ if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) {
+ write_lock_bh(&dn_fib_tables_lock);
+ *fp = f->fn_next;
+ write_unlock_bh(&dn_fib_tables_lock);
+
+ dn_free_node(f);
+ found++;
+ continue;
+ }
+ fp = &f->fn_next;
+ }
+
+ return found;
+}
+
+static int dn_fib_table_flush(struct dn_fib_table *tb)
+{
+ struct dn_hash *table = (struct dn_hash *)tb->data;
+ struct dn_zone *dz;
+ int found = 0;
+
+ dn_fib_hash_zombies = 0;
+ for(dz = table->dh_zone_list; dz; dz = dz->dz_next) {
+ int i;
+ int tmp = 0;
+ for(i = dz->dz_divisor-1; i >= 0; i--)
+ tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table);
+ dz->dz_nent -= tmp;
+ found += tmp;
+ }
+
+ return found;
+}
+
+static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res)
+{
+ int err;
+ struct dn_zone *dz;
+ struct dn_hash *t = (struct dn_hash *)tb->data;
+
+ read_lock(&dn_fib_tables_lock);
+ for(dz = t->dh_zone_list; dz; dz = dz->dz_next) {
+ struct dn_fib_node *f;
+ dn_fib_key_t k = dz_key(flp->daddr, dz);
+
+ for(f = dz_chain(k, dz); f; f = f->fn_next) {
+ if (!dn_key_eq(k, f->fn_key)) {
+ if (dn_key_leq(k, f->fn_key))
+ break;
+ else
+ continue;
+ }
+
+ f->fn_state |= DN_S_ACCESSED;
+
+ if (f->fn_state&DN_S_ZOMBIE)
+ continue;
+
+ if (f->fn_scope < flp->flowidn_scope)
+ continue;
+
+ err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res);
+
+ if (err == 0) {
+ res->type = f->fn_type;
+ res->scope = f->fn_scope;
+ res->prefixlen = dz->dz_order;
+ goto out;
+ }
+ if (err < 0)
+ goto out;
+ }
+ }
+ err = 1;
+out:
+ read_unlock(&dn_fib_tables_lock);
+ return err;
+}
+
+
+struct dn_fib_table *dn_fib_get_table(u32 n, int create)
+{
+ struct dn_fib_table *t;
+ unsigned int h;
+
+ if (n < RT_TABLE_MIN)
+ return NULL;
+
+ if (n > RT_TABLE_MAX)
+ return NULL;
+
+ h = n & (DN_FIB_TABLE_HASHSZ - 1);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist) {
+ if (t->n == n) {
+ rcu_read_unlock();
+ return t;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!create)
+ return NULL;
+
+ if (in_interrupt()) {
+ net_dbg_ratelimited("DECnet: BUG! Attempt to create routing table from interrupt\n");
+ return NULL;
+ }
+
+ t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash),
+ GFP_KERNEL);
+ if (t == NULL)
+ return NULL;
+
+ t->n = n;
+ t->insert = dn_fib_table_insert;
+ t->delete = dn_fib_table_delete;
+ t->lookup = dn_fib_table_lookup;
+ t->flush = dn_fib_table_flush;
+ t->dump = dn_fib_table_dump;
+ hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]);
+
+ return t;
+}
+
+struct dn_fib_table *dn_fib_empty_table(void)
+{
+ u32 id;
+
+ for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++)
+ if (dn_fib_get_table(id, 0) == NULL)
+ return dn_fib_get_table(id, 1);
+ return NULL;
+}
+
+void dn_fib_flush(void)
+{
+ int flushed = 0;
+ struct dn_fib_table *tb;
+ unsigned int h;
+
+ for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
+ hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist)
+ flushed += tb->flush(tb);
+ }
+
+ if (flushed)
+ dn_rt_cache_flush(-1);
+}
+
+void __init dn_fib_table_init(void)
+{
+ dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
+ sizeof(struct dn_fib_info),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+}
+
+void __exit dn_fib_table_cleanup(void)
+{
+ struct dn_fib_table *t;
+ struct hlist_node *next;
+ unsigned int h;
+
+ write_lock(&dn_fib_tables_lock);
+ for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
+ hlist_for_each_entry_safe(t, next, &dn_fib_table_hash[h],
+ hlist) {
+ hlist_del(&t->hlist);
+ kfree(t);
+ }
+ }
+ write_unlock(&dn_fib_tables_lock);
+}
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
new file mode 100644
index 000000000..1d330fd43
--- /dev/null
+++ b/net/decnet/dn_timer.c
@@ -0,0 +1,103 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Socket Timer Functions
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ * Steve Whitehouse : Made keepalive timer part of the same
+ * timer idea.
+ * Steve Whitehouse : Added checks for sk->sock_readers
+ * David S. Miller : New socket locking
+ * Steve Whitehouse : Timer grabs socket ref.
+ */
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <linux/atomic.h>
+#include <linux/jiffies.h>
+#include <net/flow.h>
+#include <net/dn.h>
+
+/*
+ * Slow timer is for everything else (n * 500mS)
+ */
+
+#define SLOW_INTERVAL (HZ/2)
+
+static void dn_slow_timer(unsigned long arg);
+
+void dn_start_slow_timer(struct sock *sk)
+{
+ setup_timer(&sk->sk_timer, dn_slow_timer, (unsigned long)sk);
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
+}
+
+void dn_stop_slow_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+static void dn_slow_timer(unsigned long arg)
+{
+ struct sock *sk = (struct sock *)arg;
+ struct dn_scp *scp = DN_SK(sk);
+
+ bh_lock_sock(sk);
+
+ if (sock_owned_by_user(sk)) {
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 10);
+ goto out;
+ }
+
+ /*
+ * The persist timer is the standard slow timer used for retransmits
+ * in both connection establishment and disconnection as well as
+ * in the RUN state. The different states are catered for by changing
+ * the function pointer in the socket. Setting the timer to a value
+ * of zero turns it off. We allow the persist_fxn to turn the
+ * timer off in a permant way by returning non-zero, so that
+ * timer based routines may remove sockets. This is why we have a
+ * sock_hold()/sock_put() around the timer to prevent the socket
+ * going away in the middle.
+ */
+ if (scp->persist && scp->persist_fxn) {
+ if (scp->persist <= SLOW_INTERVAL) {
+ scp->persist = 0;
+
+ if (scp->persist_fxn(sk))
+ goto out;
+ } else {
+ scp->persist -= SLOW_INTERVAL;
+ }
+ }
+
+ /*
+ * Check for keepalive timeout. After the other timer 'cos if
+ * the previous timer caused a retransmit, we don't need to
+ * do this. scp->stamp is the last time that we sent a packet.
+ * The keepalive function sends a link service packet to the
+ * other end. If it remains unacknowledged, the standard
+ * socket timers will eventually shut the socket down. Each
+ * time we do this, scp->stamp will be updated, thus
+ * we won't try and send another until scp->keepalive has passed
+ * since the last successful transmission.
+ */
+ if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) {
+ if (time_after_eq(jiffies, scp->stamp + scp->keepalive))
+ scp->keepalive_fxn(sk);
+ }
+
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig
new file mode 100644
index 000000000..8d7c109d5
--- /dev/null
+++ b/net/decnet/netfilter/Kconfig
@@ -0,0 +1,16 @@
+#
+# DECnet netfilter configuration
+#
+
+menu "DECnet: Netfilter Configuration"
+ depends on DECNET && NETFILTER
+ depends on NETFILTER_ADVANCED
+
+config DECNET_NF_GRABULATOR
+ tristate "Routing message grabulator (for userland routing daemon)"
+ help
+ Enable this module if you want to use the userland DECnet routing
+ daemon. You will also need to enable routing support for DECnet
+ unless you just want to monitor routing messages from other nodes.
+
+endmenu
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
new file mode 100644
index 000000000..255c1ae9d
--- /dev/null
+++ b/net/decnet/netfilter/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for DECnet netfilter modules
+#
+
+obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
+
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
new file mode 100644
index 000000000..af34fc9bd
--- /dev/null
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -0,0 +1,159 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet Routing Message Grabulator
+ *
+ * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/
+ * This code may be copied under the GPL v.2 or at your option
+ * any later version.
+ *
+ * Author: Steven Whitehouse <steve@chygwyn.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <net/netlink.h>
+#include <linux/netfilter_decnet.h>
+
+#include <net/sock.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+
+static struct sock *dnrmg = NULL;
+
+
+static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp)
+{
+ struct sk_buff *skb = NULL;
+ size_t size;
+ sk_buff_data_t old_tail;
+ struct nlmsghdr *nlh;
+ unsigned char *ptr;
+ struct nf_dn_rtmsg *rtm;
+
+ size = NLMSG_ALIGN(rt_skb->len) +
+ NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
+ skb = nlmsg_new(size, GFP_ATOMIC);
+ if (!skb) {
+ *errp = -ENOMEM;
+ return NULL;
+ }
+ old_tail = skb->tail;
+ nlh = nlmsg_put(skb, 0, 0, 0, size, 0);
+ if (!nlh) {
+ kfree_skb(skb);
+ *errp = -ENOMEM;
+ return NULL;
+ }
+ rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh);
+ rtm->nfdn_ifindex = rt_skb->dev->ifindex;
+ ptr = NFDN_RTMSG(rtm);
+ skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len);
+ nlh->nlmsg_len = skb->tail - old_tail;
+ return skb;
+}
+
+static void dnrmg_send_peer(struct sk_buff *skb)
+{
+ struct sk_buff *skb2;
+ int status = 0;
+ int group = 0;
+ unsigned char flags = *skb->data;
+
+ switch (flags & DN_RT_CNTL_MSK) {
+ case DN_RT_PKT_L1RT:
+ group = DNRNG_NLGRP_L1;
+ break;
+ case DN_RT_PKT_L2RT:
+ group = DNRNG_NLGRP_L2;
+ break;
+ default:
+ return;
+ }
+
+ skb2 = dnrmg_build_message(skb, &status);
+ if (skb2 == NULL)
+ return;
+ NETLINK_CB(skb2).dst_group = group;
+ netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
+}
+
+
+static unsigned int dnrmg_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ dnrmg_send_peer(skb);
+ return NF_ACCEPT;
+}
+
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ RCV_SKB_FAIL(-EPERM);
+
+ /* Eventually we might send routing messages too */
+
+ RCV_SKB_FAIL(-EINVAL);
+}
+
+static struct nf_hook_ops dnrmg_ops __read_mostly = {
+ .hook = dnrmg_hook,
+ .pf = NFPROTO_DECNET,
+ .hooknum = NF_DN_ROUTE,
+ .priority = NF_DN_PRI_DNRTMSG,
+};
+
+static int __init dn_rtmsg_init(void)
+{
+ int rv = 0;
+ struct netlink_kernel_cfg cfg = {
+ .groups = DNRNG_NLGRP_MAX,
+ .input = dnrmg_receive_user_skb,
+ };
+
+ dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, &cfg);
+ if (dnrmg == NULL) {
+ printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
+ return -ENOMEM;
+ }
+
+ rv = nf_register_hook(&dnrmg_ops);
+ if (rv) {
+ netlink_kernel_release(dnrmg);
+ }
+
+ return rv;
+}
+
+static void __exit dn_rtmsg_fini(void)
+{
+ nf_unregister_hook(&dnrmg_ops);
+ netlink_kernel_release(dnrmg);
+}
+
+
+MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
+MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
+
+module_init(dn_rtmsg_init);
+module_exit(dn_rtmsg_fini);
+
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
new file mode 100644
index 000000000..5325b541c
--- /dev/null
+++ b/net/decnet/sysctl_net_decnet.c
@@ -0,0 +1,372 @@
+/*
+ * DECnet An implementation of the DECnet protocol suite for the LINUX
+ * operating system. DECnet is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * DECnet sysctl support functions
+ *
+ * Author: Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ * Steve Whitehouse - C99 changes and default device handling
+ * Steve Whitehouse - Memory buffer settings, like the tcp ones
+ *
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+
+#include <asm/uaccess.h>
+
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+
+int decnet_debug_level;
+int decnet_time_wait = 30;
+int decnet_dn_count = 1;
+int decnet_di_count = 3;
+int decnet_dr_count = 3;
+int decnet_log_martians = 1;
+int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
+
+/* Reasonable defaults, I hope, based on tcp's defaults */
+long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
+#ifdef CONFIG_SYSCTL
+extern int decnet_dst_gc_interval;
+static int min_decnet_time_wait[] = { 5 };
+static int max_decnet_time_wait[] = { 600 };
+static int min_state_count[] = { 1 };
+static int max_state_count[] = { NSP_MAXRXTSHIFT };
+static int min_decnet_dst_gc_interval[] = { 1 };
+static int max_decnet_dst_gc_interval[] = { 60 };
+static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW };
+static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW };
+static char node_name[7] = "???";
+
+static struct ctl_table_header *dn_table_header = NULL;
+
+/*
+ * ctype.h :-)
+ */
+#define ISNUM(x) (((x) >= '0') && ((x) <= '9'))
+#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z'))
+#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z'))
+#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x))
+#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x))
+
+static void strip_it(char *str)
+{
+ for(;;) {
+ switch (*str) {
+ case ' ':
+ case '\n':
+ case '\r':
+ case ':':
+ *str = 0;
+ /* Fallthrough */
+ case 0:
+ return;
+ }
+ str++;
+ }
+}
+
+/*
+ * Simple routine to parse an ascii DECnet address
+ * into a network order address.
+ */
+static int parse_addr(__le16 *addr, char *str)
+{
+ __u16 area, node;
+
+ while(*str && !ISNUM(*str)) str++;
+
+ if (*str == 0)
+ return -1;
+
+ area = (*str++ - '0');
+ if (ISNUM(*str)) {
+ area *= 10;
+ area += (*str++ - '0');
+ }
+
+ if (*str++ != '.')
+ return -1;
+
+ if (!ISNUM(*str))
+ return -1;
+
+ node = *str++ - '0';
+ if (ISNUM(*str)) {
+ node *= 10;
+ node += (*str++ - '0');
+ }
+ if (ISNUM(*str)) {
+ node *= 10;
+ node += (*str++ - '0');
+ }
+ if (ISNUM(*str)) {
+ node *= 10;
+ node += (*str++ - '0');
+ }
+
+ if ((node > 1023) || (area > 63))
+ return -1;
+
+ if (INVALID_END_CHAR(*str))
+ return -1;
+
+ *addr = cpu_to_le16((area << 10) | node);
+
+ return 0;
+}
+
+static int dn_node_address_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ char addr[DN_ASCBUF_LEN];
+ size_t len;
+ __le16 dnaddr;
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
+
+ if (copy_from_user(addr, buffer, len))
+ return -EFAULT;
+
+ addr[len] = 0;
+ strip_it(addr);
+
+ if (parse_addr(&dnaddr, addr))
+ return -EINVAL;
+
+ dn_dev_devices_off();
+
+ decnet_address = dnaddr;
+
+ dn_dev_devices_on();
+
+ *ppos += len;
+
+ return 0;
+ }
+
+ dn_addr2asc(le16_to_cpu(decnet_address), addr);
+ len = strlen(addr);
+ addr[len++] = '\n';
+
+ if (len > *lenp) len = *lenp;
+
+ if (copy_to_user(buffer, addr, len))
+ return -EFAULT;
+
+ *lenp = len;
+ *ppos += len;
+
+ return 0;
+}
+
+static int dn_def_dev_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ struct net_device *dev;
+ char devname[17];
+
+ if (!*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (*lenp > 16)
+ return -E2BIG;
+
+ if (copy_from_user(devname, buffer, *lenp))
+ return -EFAULT;
+
+ devname[*lenp] = 0;
+ strip_it(devname);
+
+ dev = dev_get_by_name(&init_net, devname);
+ if (dev == NULL)
+ return -ENODEV;
+
+ if (dev->dn_ptr == NULL) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ if (dn_dev_set_default(dev, 1)) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+ *ppos += *lenp;
+
+ return 0;
+ }
+
+ dev = dn_dev_get_default();
+ if (dev == NULL) {
+ *lenp = 0;
+ return 0;
+ }
+
+ strcpy(devname, dev->name);
+ dev_put(dev);
+ len = strlen(devname);
+ devname[len++] = '\n';
+
+ if (len > *lenp) len = *lenp;
+
+ if (copy_to_user(buffer, devname, len))
+ return -EFAULT;
+
+ *lenp = len;
+ *ppos += len;
+
+ return 0;
+}
+
+static struct ctl_table dn_table[] = {
+ {
+ .procname = "node_address",
+ .maxlen = 7,
+ .mode = 0644,
+ .proc_handler = dn_node_address_handler,
+ },
+ {
+ .procname = "node_name",
+ .data = node_name,
+ .maxlen = 7,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "default_device",
+ .maxlen = 16,
+ .mode = 0644,
+ .proc_handler = dn_def_dev_handler,
+ },
+ {
+ .procname = "time_wait",
+ .data = &decnet_time_wait,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_decnet_time_wait,
+ .extra2 = &max_decnet_time_wait
+ },
+ {
+ .procname = "dn_count",
+ .data = &decnet_dn_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_state_count,
+ .extra2 = &max_state_count
+ },
+ {
+ .procname = "di_count",
+ .data = &decnet_di_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_state_count,
+ .extra2 = &max_state_count
+ },
+ {
+ .procname = "dr_count",
+ .data = &decnet_dr_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_state_count,
+ .extra2 = &max_state_count
+ },
+ {
+ .procname = "dst_gc_interval",
+ .data = &decnet_dst_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_decnet_dst_gc_interval,
+ .extra2 = &max_decnet_dst_gc_interval
+ },
+ {
+ .procname = "no_fc_max_cwnd",
+ .data = &decnet_no_fc_max_cwnd,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_decnet_no_fc_max_cwnd,
+ .extra2 = &max_decnet_no_fc_max_cwnd
+ },
+ {
+ .procname = "decnet_mem",
+ .data = &sysctl_decnet_mem,
+ .maxlen = sizeof(sysctl_decnet_mem),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax
+ },
+ {
+ .procname = "decnet_rmem",
+ .data = &sysctl_decnet_rmem,
+ .maxlen = sizeof(sysctl_decnet_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "decnet_wmem",
+ .data = &sysctl_decnet_wmem,
+ .maxlen = sizeof(sysctl_decnet_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "debug",
+ .data = &decnet_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+void dn_register_sysctl(void)
+{
+ dn_table_header = register_net_sysctl(&init_net, "net/decnet", dn_table);
+}
+
+void dn_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(dn_table_header);
+}
+
+#else /* CONFIG_SYSCTL */
+void dn_unregister_sysctl(void)
+{
+}
+void dn_register_sysctl(void)
+{
+}
+
+#endif
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
new file mode 100644
index 000000000..50d49f7e0
--- /dev/null
+++ b/net/dns_resolver/Kconfig
@@ -0,0 +1,27 @@
+#
+# Configuration for DNS Resolver
+#
+config DNS_RESOLVER
+ tristate "DNS Resolver support"
+ depends on NET && KEYS
+ help
+ Saying Y here will include support for the DNS Resolver key type
+ which can be used to make upcalls to perform DNS lookups in
+ userspace.
+
+ DNS Resolver is used to query DNS server for information. Examples
+ being resolving a UNC hostname element to an IP address for CIFS or
+ performing a DNS query for AFSDB records so that AFS can locate a
+ cell's volume location database servers.
+
+ DNS Resolver is used by the CIFS and AFS modules, and would support
+ SMB2 later. DNS Resolver is supported by the userspace upcall
+ helper "/sbin/dns.resolver" via /etc/request-key.conf.
+
+ See <file:Documentation/networking/dns_resolver.txt> for further
+ information.
+
+ To compile this as a module, choose M here: the module will be called
+ dnsresolver.
+
+ If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
new file mode 100644
index 000000000..d5c13c2eb
--- /dev/null
+++ b/net/dns_resolver/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux DNS Resolver.
+#
+
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o
+
+dns_resolver-y := dns_key.o dns_query.o
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
new file mode 100644
index 000000000..31cd4fd75
--- /dev/null
+++ b/net/dns_resolver/dns_key.c
@@ -0,0 +1,319 @@
+/* Key type used to cache DNS lookups made by the kernel
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ * Copyright (c) 2007 Igor Mammedov
+ * Author(s): Igor Mammedov (niallain@gmail.com)
+ * Steve French (sfrench@us.ibm.com)
+ * Wang Lei (wang840925@gmail.com)
+ * David Howells (dhowells@redhat.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/keyctl.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("DNS Resolver");
+MODULE_AUTHOR("Wang Lei");
+MODULE_LICENSE("GPL");
+
+unsigned int dns_resolver_debug;
+module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
+
+const struct cred *dns_resolver_cache;
+
+#define DNS_ERRORNO_OPTION "dnserror"
+
+/*
+ * Preparse instantiation data for a dns_resolver key.
+ *
+ * The data must be a NUL-terminated string, with the NUL char accounted in
+ * datalen.
+ *
+ * If the data contains a '#' characters, then we take the clause after each
+ * one to be an option of the form 'key=value'. The actual data of interest is
+ * the string leading up to the first '#'. For instance:
+ *
+ * "ip1,ip2,...#foo=bar"
+ */
+static int
+dns_resolver_preparse(struct key_preparsed_payload *prep)
+{
+ struct user_key_payload *upayload;
+ unsigned long derrno;
+ int ret;
+ int datalen = prep->datalen, result_len = 0;
+ const char *data = prep->data, *end, *opt;
+
+ kenter("'%*.*s',%u", datalen, datalen, data, datalen);
+
+ if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+ return -EINVAL;
+ datalen--;
+
+ /* deal with any options embedded in the data */
+ end = data + datalen;
+ opt = memchr(data, '#', datalen);
+ if (!opt) {
+ /* no options: the entire data is the result */
+ kdebug("no options");
+ result_len = datalen;
+ } else {
+ const char *next_opt;
+
+ result_len = opt - data;
+ opt++;
+ kdebug("options: '%s'", opt);
+ do {
+ const char *eq;
+ int opt_len, opt_nlen, opt_vlen, tmp;
+
+ next_opt = memchr(opt, '#', end - opt) ?: end;
+ opt_len = next_opt - opt;
+ if (!opt_len) {
+ printk(KERN_WARNING
+ "Empty option to dns_resolver key\n");
+ return -EINVAL;
+ }
+
+ eq = memchr(opt, '=', opt_len) ?: end;
+ opt_nlen = eq - opt;
+ eq++;
+ opt_vlen = next_opt - eq; /* will be -1 if no value */
+
+ tmp = opt_vlen >= 0 ? opt_vlen : 0;
+ kdebug("option '%*.*s' val '%*.*s'",
+ opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+
+ /* see if it's an error number representing a DNS error
+ * that's to be recorded as the result in this key */
+ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
+ memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
+ kdebug("dns error number option");
+ if (opt_vlen <= 0)
+ goto bad_option_value;
+
+ ret = kstrtoul(eq, 10, &derrno);
+ if (ret < 0)
+ goto bad_option_value;
+
+ if (derrno < 1 || derrno > 511)
+ goto bad_option_value;
+
+ kdebug("dns error no. = %lu", derrno);
+ prep->type_data[0] = ERR_PTR(-derrno);
+ continue;
+ }
+
+ bad_option_value:
+ printk(KERN_WARNING
+ "Option '%*.*s' to dns_resolver key:"
+ " bad/missing value\n",
+ opt_nlen, opt_nlen, opt);
+ return -EINVAL;
+ } while (opt = next_opt + 1, opt < end);
+ }
+
+ /* don't cache the result if we're caching an error saying there's no
+ * result */
+ if (prep->type_data[0]) {
+ kleave(" = 0 [h_error %ld]", PTR_ERR(prep->type_data[0]));
+ return 0;
+ }
+
+ kdebug("store result");
+ prep->quotalen = result_len;
+
+ upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL);
+ if (!upayload) {
+ kleave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ upayload->datalen = result_len;
+ memcpy(upayload->data, data, result_len);
+ upayload->data[result_len] = '\0';
+
+ prep->payload[0] = upayload;
+ kleave(" = 0");
+ return 0;
+}
+
+/*
+ * Clean up the preparse data
+ */
+static void dns_resolver_free_preparse(struct key_preparsed_payload *prep)
+{
+ pr_devel("==>%s()\n", __func__);
+
+ kfree(prep->payload[0]);
+}
+
+/*
+ * The description is of the form "[<type>:]<domain_name>"
+ *
+ * The domain name may be a simple name or an absolute domain name (which
+ * should end with a period). The domain name is case-independent.
+ */
+static bool dns_resolver_cmp(const struct key *key,
+ const struct key_match_data *match_data)
+{
+ int slen, dlen, ret = 0;
+ const char *src = key->description, *dsp = match_data->raw_data;
+
+ kenter("%s,%s", src, dsp);
+
+ if (!src || !dsp)
+ goto no_match;
+
+ if (strcasecmp(src, dsp) == 0)
+ goto matched;
+
+ slen = strlen(src);
+ dlen = strlen(dsp);
+ if (slen <= 0 || dlen <= 0)
+ goto no_match;
+ if (src[slen - 1] == '.')
+ slen--;
+ if (dsp[dlen - 1] == '.')
+ dlen--;
+ if (slen != dlen || strncasecmp(src, dsp, slen) != 0)
+ goto no_match;
+
+matched:
+ ret = 1;
+no_match:
+ kleave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Preparse the match criterion.
+ */
+static int dns_resolver_match_preparse(struct key_match_data *match_data)
+{
+ match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE;
+ match_data->cmp = dns_resolver_cmp;
+ return 0;
+}
+
+/*
+ * Describe a DNS key
+ */
+static void dns_resolver_describe(const struct key *key, struct seq_file *m)
+{
+ int err = key->type_data.x[0];
+
+ seq_puts(m, key->description);
+ if (key_is_instantiated(key)) {
+ if (err)
+ seq_printf(m, ": %d", err);
+ else
+ seq_printf(m, ": %u", key->datalen);
+ }
+}
+
+/*
+ * read the DNS data
+ * - the key's semaphore is read-locked
+ */
+static long dns_resolver_read(const struct key *key,
+ char __user *buffer, size_t buflen)
+{
+ if (key->type_data.x[0])
+ return key->type_data.x[0];
+
+ return user_read(key, buffer, buflen);
+}
+
+struct key_type key_type_dns_resolver = {
+ .name = "dns_resolver",
+ .preparse = dns_resolver_preparse,
+ .free_preparse = dns_resolver_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .match_preparse = dns_resolver_match_preparse,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = dns_resolver_describe,
+ .read = dns_resolver_read,
+};
+
+static int __init init_dns_resolver(void)
+{
+ struct cred *cred;
+ struct key *keyring;
+ int ret;
+
+ /* create an override credential set with a special thread keyring in
+ * which DNS requests are cached
+ *
+ * this is used to prevent malicious redirections from being installed
+ * with add_key().
+ */
+ cred = prepare_kernel_cred(NULL);
+ if (!cred)
+ return -ENOMEM;
+
+ keyring = keyring_alloc(".dns_resolver",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(keyring)) {
+ ret = PTR_ERR(keyring);
+ goto failed_put_cred;
+ }
+
+ ret = register_key_type(&key_type_dns_resolver);
+ if (ret < 0)
+ goto failed_put_key;
+
+ /* instruct request_key() to use this special keyring as a cache for
+ * the results it looks up */
+ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);
+ cred->thread_keyring = keyring;
+ cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+ dns_resolver_cache = cred;
+
+ kdebug("DNS resolver keyring: %d\n", key_serial(keyring));
+ return 0;
+
+failed_put_key:
+ key_put(keyring);
+failed_put_cred:
+ put_cred(cred);
+ return ret;
+}
+
+static void __exit exit_dns_resolver(void)
+{
+ key_revoke(dns_resolver_cache->thread_keyring);
+ unregister_key_type(&key_type_dns_resolver);
+ put_cred(dns_resolver_cache);
+}
+
+module_init(init_dns_resolver)
+module_exit(exit_dns_resolver)
+MODULE_LICENSE("GPL");
+
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
new file mode 100644
index 000000000..39d2c39bd
--- /dev/null
+++ b/net/dns_resolver/dns_query.c
@@ -0,0 +1,167 @@
+/* Upcall routine, designed to work as a key type and working through
+ * /sbin/request-key to contact userspace when handling DNS queries.
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ * Copyright (c) 2007 Igor Mammedov
+ * Author(s): Igor Mammedov (niallain@gmail.com)
+ * Steve French (sfrench@us.ibm.com)
+ * Wang Lei (wang840925@gmail.com)
+ * David Howells (dhowells@redhat.com)
+ *
+ * The upcall wrapper used to make an arbitrary DNS query.
+ *
+ * This function requires the appropriate userspace tool dns.upcall to be
+ * installed and something like the following lines should be added to the
+ * /etc/request-key.conf file:
+ *
+ * create dns_resolver * * /sbin/dns.upcall %k
+ *
+ * For example to use this module to query AFSDB RR:
+ *
+ * create dns_resolver afsdb:* * /sbin/dns.afsdb %k
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dns_resolver.h>
+#include <linux/err.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+
+#include "internal.h"
+
+/**
+ * dns_query - Query the DNS
+ * @type: Query type (or NULL for straight host->IP lookup)
+ * @name: Name to look up
+ * @namelen: Length of name
+ * @options: Request options (or NULL if no options)
+ * @_result: Where to place the returned data.
+ * @_expiry: Where to store the result expiry time (or NULL)
+ *
+ * The data will be returned in the pointer at *result, and the caller is
+ * responsible for freeing it.
+ *
+ * The description should be of the form "[<query_type>:]<domain_name>", and
+ * the options need to be appropriate for the query type requested. If no
+ * query_type is given, then the query is a straight hostname to IP address
+ * lookup.
+ *
+ * The DNS resolution lookup is performed by upcalling to userspace by way of
+ * requesting a key of type dns_resolver.
+ *
+ * Returns the size of the result on success, -ve error code otherwise.
+ */
+int dns_query(const char *type, const char *name, size_t namelen,
+ const char *options, char **_result, time_t *_expiry)
+{
+ struct key *rkey;
+ struct user_key_payload *upayload;
+ const struct cred *saved_cred;
+ size_t typelen, desclen;
+ char *desc, *cp;
+ int ret, len;
+
+ kenter("%s,%*.*s,%zu,%s",
+ type, (int)namelen, (int)namelen, name, namelen, options);
+
+ if (!name || namelen == 0 || !_result)
+ return -EINVAL;
+
+ /* construct the query key description as "[<type>:]<name>" */
+ typelen = 0;
+ desclen = 0;
+ if (type) {
+ typelen = strlen(type);
+ if (typelen < 1)
+ return -EINVAL;
+ desclen += typelen + 1;
+ }
+
+ if (!namelen)
+ namelen = strnlen(name, 256);
+ if (namelen < 3 || namelen > 255)
+ return -EINVAL;
+ desclen += namelen + 1;
+
+ desc = kmalloc(desclen, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ cp = desc;
+ if (type) {
+ memcpy(cp, type, typelen);
+ cp += typelen;
+ *cp++ = ':';
+ }
+ memcpy(cp, name, namelen);
+ cp += namelen;
+ *cp = '\0';
+
+ if (!options)
+ options = "";
+ kdebug("call request_key(,%s,%s)", desc, options);
+
+ /* make the upcall, using special credentials to prevent the use of
+ * add_key() to preinstall malicious redirections
+ */
+ saved_cred = override_creds(dns_resolver_cache);
+ rkey = request_key(&key_type_dns_resolver, desc, options);
+ revert_creds(saved_cred);
+ kfree(desc);
+ if (IS_ERR(rkey)) {
+ ret = PTR_ERR(rkey);
+ goto out;
+ }
+
+ down_read(&rkey->sem);
+ set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
+ rkey->perm |= KEY_USR_VIEW;
+
+ ret = key_validate(rkey);
+ if (ret < 0)
+ goto put;
+
+ /* If the DNS server gave an error, return that to the caller */
+ ret = rkey->type_data.x[0];
+ if (ret)
+ goto put;
+
+ upayload = rcu_dereference_protected(rkey->payload.data,
+ lockdep_is_held(&rkey->sem));
+ len = upayload->datalen;
+
+ ret = -ENOMEM;
+ *_result = kmalloc(len + 1, GFP_KERNEL);
+ if (!*_result)
+ goto put;
+
+ memcpy(*_result, upayload->data, len);
+ (*_result)[len] = '\0';
+
+ if (_expiry)
+ *_expiry = rkey->expiry;
+
+ ret = len;
+put:
+ up_read(&rkey->sem);
+ key_put(rkey);
+out:
+ kleave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 000000000..7af1ed39c
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 Wang Lei
+ * Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ * Internal DNS Rsolver stuff
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing
+ */
+extern unsigned int dns_resolver_debug;
+
+#define kdebug(FMT, ...) \
+do { \
+ if (unlikely(dns_resolver_debug)) \
+ printk(KERN_DEBUG "[%-6.6s] "FMT"\n", \
+ current->comm, ##__VA_ARGS__); \
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
new file mode 100644
index 000000000..ff7736f7f
--- /dev/null
+++ b/net/dsa/Kconfig
@@ -0,0 +1,41 @@
+config HAVE_NET_DSA
+ def_bool y
+ depends on NETDEVICES && !S390
+
+# Drivers must select NET_DSA and the appropriate tagging format
+
+config NET_DSA
+ tristate "Distributed Switch Architecture"
+ depends on HAVE_NET_DSA && NET_SWITCHDEV
+ select PHYLIB
+ ---help---
+ Say Y if you want to enable support for the hardware switches supported
+ by the Distributed Switch Architecture.
+
+if NET_DSA
+
+config NET_DSA_HWMON
+ bool "Distributed Switch Architecture HWMON support"
+ default y
+ depends on HWMON && !(NET_DSA=y && HWMON=m)
+ ---help---
+ Say Y if you want to expose thermal sensor data on switches supported
+ by the Distributed Switch Architecture.
+
+ Some of those switches contain thermal sensors. This data is available
+ via the hwmon sysfs interface and exposes the onboard sensors.
+
+# tagging formats
+config NET_DSA_TAG_BRCM
+ bool
+
+config NET_DSA_TAG_DSA
+ bool
+
+config NET_DSA_TAG_EDSA
+ bool
+
+config NET_DSA_TAG_TRAILER
+ bool
+
+endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
new file mode 100644
index 000000000..da06ed1df
--- /dev/null
+++ b/net/dsa/Makefile
@@ -0,0 +1,9 @@
+# the core
+obj-$(CONFIG_NET_DSA) += dsa_core.o
+dsa_core-y += dsa.o slave.o
+
+# tagging formats
+dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
+dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
+dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
new file mode 100644
index 000000000..392e29a02
--- /dev/null
+++ b/net/dsa/dsa.c
@@ -0,0 +1,939 @@
+/*
+ * net/dsa/dsa.c - Hardware switch handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/hwmon.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/dsa.h>
+#include <linux/of.h>
+#include <linux/of_mdio.h>
+#include <linux/of_platform.h>
+#include <linux/of_net.h>
+#include <linux/sysfs.h>
+#include "dsa_priv.h"
+
+char dsa_driver_version[] = "0.1";
+
+
+/* switch driver registration ***********************************************/
+static DEFINE_MUTEX(dsa_switch_drivers_mutex);
+static LIST_HEAD(dsa_switch_drivers);
+
+void register_switch_driver(struct dsa_switch_driver *drv)
+{
+ mutex_lock(&dsa_switch_drivers_mutex);
+ list_add_tail(&drv->list, &dsa_switch_drivers);
+ mutex_unlock(&dsa_switch_drivers_mutex);
+}
+EXPORT_SYMBOL_GPL(register_switch_driver);
+
+void unregister_switch_driver(struct dsa_switch_driver *drv)
+{
+ mutex_lock(&dsa_switch_drivers_mutex);
+ list_del_init(&drv->list);
+ mutex_unlock(&dsa_switch_drivers_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_switch_driver);
+
+static struct dsa_switch_driver *
+dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name)
+{
+ struct dsa_switch_driver *ret;
+ struct list_head *list;
+ char *name;
+
+ ret = NULL;
+ name = NULL;
+
+ mutex_lock(&dsa_switch_drivers_mutex);
+ list_for_each(list, &dsa_switch_drivers) {
+ struct dsa_switch_driver *drv;
+
+ drv = list_entry(list, struct dsa_switch_driver, list);
+
+ name = drv->probe(host_dev, sw_addr);
+ if (name != NULL) {
+ ret = drv;
+ break;
+ }
+ }
+ mutex_unlock(&dsa_switch_drivers_mutex);
+
+ *_name = name;
+
+ return ret;
+}
+
+/* hwmon support ************************************************************/
+
+#ifdef CONFIG_NET_DSA_HWMON
+
+static ssize_t temp1_input_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dsa_switch *ds = dev_get_drvdata(dev);
+ int temp, ret;
+
+ ret = ds->drv->get_temp(ds, &temp);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%d\n", temp * 1000);
+}
+static DEVICE_ATTR_RO(temp1_input);
+
+static ssize_t temp1_max_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dsa_switch *ds = dev_get_drvdata(dev);
+ int temp, ret;
+
+ ret = ds->drv->get_temp_limit(ds, &temp);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%d\n", temp * 1000);
+}
+
+static ssize_t temp1_max_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct dsa_switch *ds = dev_get_drvdata(dev);
+ int temp, ret;
+
+ ret = kstrtoint(buf, 0, &temp);
+ if (ret < 0)
+ return ret;
+
+ ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000));
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+static DEVICE_ATTR_RW(temp1_max);
+
+static ssize_t temp1_max_alarm_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dsa_switch *ds = dev_get_drvdata(dev);
+ bool alarm;
+ int ret;
+
+ ret = ds->drv->get_temp_alarm(ds, &alarm);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%d\n", alarm);
+}
+static DEVICE_ATTR_RO(temp1_max_alarm);
+
+static struct attribute *dsa_hwmon_attrs[] = {
+ &dev_attr_temp1_input.attr, /* 0 */
+ &dev_attr_temp1_max.attr, /* 1 */
+ &dev_attr_temp1_max_alarm.attr, /* 2 */
+ NULL
+};
+
+static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct dsa_switch *ds = dev_get_drvdata(dev);
+ struct dsa_switch_driver *drv = ds->drv;
+ umode_t mode = attr->mode;
+
+ if (index == 1) {
+ if (!drv->get_temp_limit)
+ mode = 0;
+ else if (!drv->set_temp_limit)
+ mode &= ~S_IWUSR;
+ } else if (index == 2 && !drv->get_temp_alarm) {
+ mode = 0;
+ }
+ return mode;
+}
+
+static const struct attribute_group dsa_hwmon_group = {
+ .attrs = dsa_hwmon_attrs,
+ .is_visible = dsa_hwmon_attrs_visible,
+};
+__ATTRIBUTE_GROUPS(dsa_hwmon);
+
+#endif /* CONFIG_NET_DSA_HWMON */
+
+/* basic switch operations **************************************************/
+static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)
+{
+ struct dsa_switch_driver *drv = ds->drv;
+ struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_chip_data *pd = ds->pd;
+ bool valid_name_found = false;
+ int index = ds->index;
+ int i, ret;
+
+ /*
+ * Validate supplied switch configuration.
+ */
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ char *name;
+
+ name = pd->port_names[i];
+ if (name == NULL)
+ continue;
+
+ if (!strcmp(name, "cpu")) {
+ if (dst->cpu_switch != -1) {
+ netdev_err(dst->master_netdev,
+ "multiple cpu ports?!\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ dst->cpu_switch = index;
+ dst->cpu_port = i;
+ } else if (!strcmp(name, "dsa")) {
+ ds->dsa_port_mask |= 1 << i;
+ } else {
+ ds->phys_port_mask |= 1 << i;
+ }
+ valid_name_found = true;
+ }
+
+ if (!valid_name_found && i == DSA_MAX_PORTS) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Make the built-in MII bus mask match the number of ports,
+ * switch drivers can override this later
+ */
+ ds->phys_mii_mask = ds->phys_port_mask;
+
+ /*
+ * If the CPU connects to this switch, set the switch tree
+ * tagging protocol to the preferred tagging format of this
+ * switch.
+ */
+ if (dst->cpu_switch == index) {
+ switch (ds->tag_protocol) {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+ case DSA_TAG_PROTO_DSA:
+ dst->rcv = dsa_netdev_ops.rcv;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+ case DSA_TAG_PROTO_EDSA:
+ dst->rcv = edsa_netdev_ops.rcv;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+ case DSA_TAG_PROTO_TRAILER:
+ dst->rcv = trailer_netdev_ops.rcv;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_BRCM
+ case DSA_TAG_PROTO_BRCM:
+ dst->rcv = brcm_netdev_ops.rcv;
+ break;
+#endif
+ case DSA_TAG_PROTO_NONE:
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ dst->tag_protocol = ds->tag_protocol;
+ }
+
+ /*
+ * Do basic register setup.
+ */
+ ret = drv->setup(ds);
+ if (ret < 0)
+ goto out;
+
+ ret = drv->set_addr(ds, dst->master_netdev->dev_addr);
+ if (ret < 0)
+ goto out;
+
+ ds->slave_mii_bus = mdiobus_alloc();
+ if (ds->slave_mii_bus == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dsa_slave_mii_bus_init(ds);
+
+ ret = mdiobus_register(ds->slave_mii_bus);
+ if (ret < 0)
+ goto out_free;
+
+
+ /*
+ * Create network devices for physical switch ports.
+ */
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ if (!(ds->phys_port_mask & (1 << i)))
+ continue;
+
+ ret = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+ if (ret < 0) {
+ netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s)\n",
+ index, i, pd->port_names[i]);
+ ret = 0;
+ }
+ }
+
+#ifdef CONFIG_NET_DSA_HWMON
+ /* If the switch provides a temperature sensor,
+ * register with hardware monitoring subsystem.
+ * Treat registration error as non-fatal and ignore it.
+ */
+ if (drv->get_temp) {
+ const char *netname = netdev_name(dst->master_netdev);
+ char hname[IFNAMSIZ + 1];
+ int i, j;
+
+ /* Create valid hwmon 'name' attribute */
+ for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) {
+ if (isalnum(netname[i]))
+ hname[j++] = netname[i];
+ }
+ hname[j] = '\0';
+ scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d",
+ hname, index);
+ ds->hwmon_dev = hwmon_device_register_with_groups(NULL,
+ ds->hwmon_name, ds, dsa_hwmon_groups);
+ if (IS_ERR(ds->hwmon_dev))
+ ds->hwmon_dev = NULL;
+ }
+#endif /* CONFIG_NET_DSA_HWMON */
+
+ return ret;
+
+out_free:
+ mdiobus_free(ds->slave_mii_bus);
+out:
+ kfree(ds);
+ return ret;
+}
+
+static struct dsa_switch *
+dsa_switch_setup(struct dsa_switch_tree *dst, int index,
+ struct device *parent, struct device *host_dev)
+{
+ struct dsa_chip_data *pd = dst->pd->chip + index;
+ struct dsa_switch_driver *drv;
+ struct dsa_switch *ds;
+ int ret;
+ char *name;
+
+ /*
+ * Probe for switch model.
+ */
+ drv = dsa_switch_probe(host_dev, pd->sw_addr, &name);
+ if (drv == NULL) {
+ netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n",
+ index);
+ return ERR_PTR(-EINVAL);
+ }
+ netdev_info(dst->master_netdev, "[%d]: detected a %s switch\n",
+ index, name);
+
+
+ /*
+ * Allocate and initialise switch state.
+ */
+ ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL);
+ if (ds == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ds->dst = dst;
+ ds->index = index;
+ ds->pd = pd;
+ ds->drv = drv;
+ ds->tag_protocol = drv->tag_protocol;
+ ds->master_dev = host_dev;
+
+ ret = dsa_switch_setup_one(ds, parent);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return ds;
+}
+
+static void dsa_switch_destroy(struct dsa_switch *ds)
+{
+#ifdef CONFIG_NET_DSA_HWMON
+ if (ds->hwmon_dev)
+ hwmon_device_unregister(ds->hwmon_dev);
+#endif
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int dsa_switch_suspend(struct dsa_switch *ds)
+{
+ int i, ret = 0;
+
+ /* Suspend slave network devices */
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ if (!dsa_is_port_initialized(ds, i))
+ continue;
+
+ ret = dsa_slave_suspend(ds->ports[i]);
+ if (ret)
+ return ret;
+ }
+
+ if (ds->drv->suspend)
+ ret = ds->drv->suspend(ds);
+
+ return ret;
+}
+
+static int dsa_switch_resume(struct dsa_switch *ds)
+{
+ int i, ret = 0;
+
+ if (ds->drv->resume)
+ ret = ds->drv->resume(ds);
+
+ if (ret)
+ return ret;
+
+ /* Resume slave network devices */
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ if (!dsa_is_port_initialized(ds, i))
+ continue;
+
+ ret = dsa_slave_resume(ds->ports[i]);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+#endif
+
+
+/* link polling *************************************************************/
+static void dsa_link_poll_work(struct work_struct *ugly)
+{
+ struct dsa_switch_tree *dst;
+ int i;
+
+ dst = container_of(ugly, struct dsa_switch_tree, link_poll_work);
+
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL && ds->drv->poll_link != NULL)
+ ds->drv->poll_link(ds);
+ }
+
+ mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
+}
+
+static void dsa_link_poll_timer(unsigned long _dst)
+{
+ struct dsa_switch_tree *dst = (void *)_dst;
+
+ schedule_work(&dst->link_poll_work);
+}
+
+
+/* platform driver init and cleanup *****************************************/
+static int dev_is_class(struct device *dev, void *class)
+{
+ if (dev->class != NULL && !strcmp(dev->class->name, class))
+ return 1;
+
+ return 0;
+}
+
+static struct device *dev_find_class(struct device *parent, char *class)
+{
+ if (dev_is_class(parent, class)) {
+ get_device(parent);
+ return parent;
+ }
+
+ return device_find_child(parent, class, dev_is_class);
+}
+
+struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
+{
+ struct device *d;
+
+ d = dev_find_class(dev, "mdio_bus");
+ if (d != NULL) {
+ struct mii_bus *bus;
+
+ bus = to_mii_bus(d);
+ put_device(d);
+
+ return bus;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
+
+static struct net_device *dev_to_net_device(struct device *dev)
+{
+ struct device *d;
+
+ d = dev_find_class(dev, "net");
+ if (d != NULL) {
+ struct net_device *nd;
+
+ nd = to_net_dev(d);
+ dev_hold(nd);
+ put_device(d);
+
+ return nd;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_OF
+static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
+ struct dsa_chip_data *cd,
+ int chip_index, int port_index,
+ struct device_node *link)
+{
+ const __be32 *reg;
+ int link_sw_addr;
+ struct device_node *parent_sw;
+ int len;
+
+ parent_sw = of_get_parent(link);
+ if (!parent_sw)
+ return -EINVAL;
+
+ reg = of_get_property(parent_sw, "reg", &len);
+ if (!reg || (len != sizeof(*reg) * 2))
+ return -EINVAL;
+
+ /*
+ * Get the destination switch number from the second field of its 'reg'
+ * property, i.e. for "reg = <0x19 1>" sw_addr is '1'.
+ */
+ link_sw_addr = be32_to_cpup(reg + 1);
+
+ if (link_sw_addr >= pd->nr_chips)
+ return -EINVAL;
+
+ /* First time routing table allocation */
+ if (!cd->rtable) {
+ cd->rtable = kmalloc_array(pd->nr_chips, sizeof(s8),
+ GFP_KERNEL);
+ if (!cd->rtable)
+ return -ENOMEM;
+
+ /* default to no valid uplink/downlink */
+ memset(cd->rtable, -1, pd->nr_chips * sizeof(s8));
+ }
+
+ cd->rtable[link_sw_addr] = port_index;
+
+ return 0;
+}
+
+static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
+{
+ int i;
+ int port_index;
+
+ for (i = 0; i < pd->nr_chips; i++) {
+ port_index = 0;
+ while (port_index < DSA_MAX_PORTS) {
+ kfree(pd->chip[i].port_names[port_index]);
+ port_index++;
+ }
+ kfree(pd->chip[i].rtable);
+ }
+ kfree(pd->chip);
+}
+
+static int dsa_of_probe(struct device *dev)
+{
+ struct device_node *np = dev->of_node;
+ struct device_node *child, *mdio, *ethernet, *port, *link;
+ struct mii_bus *mdio_bus;
+ struct net_device *ethernet_dev;
+ struct dsa_platform_data *pd;
+ struct dsa_chip_data *cd;
+ const char *port_name;
+ int chip_index, port_index;
+ const unsigned int *sw_addr, *port_reg;
+ u32 eeprom_len;
+ int ret;
+
+ mdio = of_parse_phandle(np, "dsa,mii-bus", 0);
+ if (!mdio)
+ return -EINVAL;
+
+ mdio_bus = of_mdio_find_bus(mdio);
+ if (!mdio_bus)
+ return -EPROBE_DEFER;
+
+ ethernet = of_parse_phandle(np, "dsa,ethernet", 0);
+ if (!ethernet)
+ return -EINVAL;
+
+ ethernet_dev = of_find_net_device_by_node(ethernet);
+ if (!ethernet_dev)
+ return -EPROBE_DEFER;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ dev->platform_data = pd;
+ pd->of_netdev = ethernet_dev;
+ pd->nr_chips = of_get_available_child_count(np);
+ if (pd->nr_chips > DSA_MAX_SWITCHES)
+ pd->nr_chips = DSA_MAX_SWITCHES;
+
+ pd->chip = kcalloc(pd->nr_chips, sizeof(struct dsa_chip_data),
+ GFP_KERNEL);
+ if (!pd->chip) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ chip_index = -1;
+ for_each_available_child_of_node(np, child) {
+ chip_index++;
+ cd = &pd->chip[chip_index];
+
+ cd->of_node = child;
+ cd->host_dev = &mdio_bus->dev;
+
+ sw_addr = of_get_property(child, "reg", NULL);
+ if (!sw_addr)
+ continue;
+
+ cd->sw_addr = be32_to_cpup(sw_addr);
+ if (cd->sw_addr > PHY_MAX_ADDR)
+ continue;
+
+ if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
+ cd->eeprom_len = eeprom_len;
+
+ for_each_available_child_of_node(child, port) {
+ port_reg = of_get_property(port, "reg", NULL);
+ if (!port_reg)
+ continue;
+
+ port_index = be32_to_cpup(port_reg);
+
+ port_name = of_get_property(port, "label", NULL);
+ if (!port_name)
+ continue;
+
+ cd->port_dn[port_index] = port;
+
+ cd->port_names[port_index] = kstrdup(port_name,
+ GFP_KERNEL);
+ if (!cd->port_names[port_index]) {
+ ret = -ENOMEM;
+ goto out_free_chip;
+ }
+
+ link = of_parse_phandle(port, "link", 0);
+
+ if (!strcmp(port_name, "dsa") && link &&
+ pd->nr_chips > 1) {
+ ret = dsa_of_setup_routing_table(pd, cd,
+ chip_index, port_index, link);
+ if (ret)
+ goto out_free_chip;
+ }
+
+ if (port_index == DSA_MAX_PORTS)
+ break;
+ }
+ }
+
+ return 0;
+
+out_free_chip:
+ dsa_of_free_platform_data(pd);
+out_free:
+ kfree(pd);
+ dev->platform_data = NULL;
+ return ret;
+}
+
+static void dsa_of_remove(struct device *dev)
+{
+ struct dsa_platform_data *pd = dev->platform_data;
+
+ if (!dev->of_node)
+ return;
+
+ dsa_of_free_platform_data(pd);
+ kfree(pd);
+}
+#else
+static inline int dsa_of_probe(struct device *dev)
+{
+ return 0;
+}
+
+static inline void dsa_of_remove(struct device *dev)
+{
+}
+#endif
+
+static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
+ struct device *parent, struct dsa_platform_data *pd)
+{
+ int i;
+
+ dst->pd = pd;
+ dst->master_netdev = dev;
+ dst->cpu_switch = -1;
+ dst->cpu_port = -1;
+
+ for (i = 0; i < pd->nr_chips; i++) {
+ struct dsa_switch *ds;
+
+ ds = dsa_switch_setup(dst, i, parent, pd->chip[i].host_dev);
+ if (IS_ERR(ds)) {
+ netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n",
+ i, PTR_ERR(ds));
+ continue;
+ }
+
+ dst->ds[i] = ds;
+ if (ds->drv->poll_link != NULL)
+ dst->link_poll_needed = 1;
+ }
+
+ /*
+ * If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+ dev->dsa_ptr = (void *)dst;
+
+ if (dst->link_poll_needed) {
+ INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
+ init_timer(&dst->link_poll_timer);
+ dst->link_poll_timer.data = (unsigned long)dst;
+ dst->link_poll_timer.function = dsa_link_poll_timer;
+ dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
+ add_timer(&dst->link_poll_timer);
+ }
+}
+
+static int dsa_probe(struct platform_device *pdev)
+{
+ struct dsa_platform_data *pd = pdev->dev.platform_data;
+ struct net_device *dev;
+ struct dsa_switch_tree *dst;
+ int ret;
+
+ pr_notice_once("Distributed Switch Architecture driver version %s\n",
+ dsa_driver_version);
+
+ if (pdev->dev.of_node) {
+ ret = dsa_of_probe(&pdev->dev);
+ if (ret)
+ return ret;
+
+ pd = pdev->dev.platform_data;
+ }
+
+ if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL))
+ return -EINVAL;
+
+ if (pd->of_netdev) {
+ dev = pd->of_netdev;
+ dev_hold(dev);
+ } else {
+ dev = dev_to_net_device(pd->netdev);
+ }
+ if (dev == NULL) {
+ ret = -EPROBE_DEFER;
+ goto out;
+ }
+
+ if (dev->dsa_ptr != NULL) {
+ dev_put(dev);
+ ret = -EEXIST;
+ goto out;
+ }
+
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (dst == NULL) {
+ dev_put(dev);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ platform_set_drvdata(pdev, dst);
+
+ dsa_setup_dst(dst, dev, &pdev->dev, pd);
+
+ return 0;
+
+out:
+ dsa_of_remove(&pdev->dev);
+
+ return ret;
+}
+
+static void dsa_remove_dst(struct dsa_switch_tree *dst)
+{
+ int i;
+
+ if (dst->link_poll_needed)
+ del_timer_sync(&dst->link_poll_timer);
+
+ flush_work(&dst->link_poll_work);
+
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL)
+ dsa_switch_destroy(ds);
+ }
+}
+
+static int dsa_remove(struct platform_device *pdev)
+{
+ struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+
+ dsa_remove_dst(dst);
+ dsa_of_remove(&pdev->dev);
+
+ return 0;
+}
+
+static void dsa_shutdown(struct platform_device *pdev)
+{
+}
+
+static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+
+ if (unlikely(dst == NULL)) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ return dst->rcv(skb, dev, pt, orig_dev);
+}
+
+static struct packet_type dsa_pack_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_XDSA),
+ .func = dsa_switch_rcv,
+};
+
+static struct notifier_block dsa_netdevice_nb __read_mostly = {
+ .notifier_call = dsa_slave_netdevice_event,
+};
+
+#ifdef CONFIG_PM_SLEEP
+static int dsa_suspend(struct device *d)
+{
+ struct platform_device *pdev = to_platform_device(d);
+ struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ int i, ret = 0;
+
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL)
+ ret = dsa_switch_suspend(ds);
+ }
+
+ return ret;
+}
+
+static int dsa_resume(struct device *d)
+{
+ struct platform_device *pdev = to_platform_device(d);
+ struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ int i, ret = 0;
+
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL)
+ ret = dsa_switch_resume(ds);
+ }
+
+ return ret;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
+
+static const struct of_device_id dsa_of_match_table[] = {
+ { .compatible = "brcm,bcm7445-switch-v4.0" },
+ { .compatible = "marvell,dsa", },
+ {}
+};
+MODULE_DEVICE_TABLE(of, dsa_of_match_table);
+
+static struct platform_driver dsa_driver = {
+ .probe = dsa_probe,
+ .remove = dsa_remove,
+ .shutdown = dsa_shutdown,
+ .driver = {
+ .name = "dsa",
+ .of_match_table = dsa_of_match_table,
+ .pm = &dsa_pm_ops,
+ },
+};
+
+static int __init dsa_init_module(void)
+{
+ int rc;
+
+ register_netdevice_notifier(&dsa_netdevice_nb);
+
+ rc = platform_driver_register(&dsa_driver);
+ if (rc)
+ return rc;
+
+ dev_add_pack(&dsa_pack_type);
+
+ return 0;
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{
+ unregister_netdevice_notifier(&dsa_netdevice_nb);
+ dev_remove_pack(&dsa_pack_type);
+ platform_driver_unregister(&dsa_driver);
+}
+module_exit(dsa_cleanup_module);
+
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
+MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:dsa");
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
new file mode 100644
index 000000000..d5f1f9b86
--- /dev/null
+++ b/net/dsa/dsa_priv.h
@@ -0,0 +1,78 @@
+/*
+ * net/dsa/dsa_priv.h - Hardware switch handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __DSA_PRIV_H
+#define __DSA_PRIV_H
+
+#include <linux/phy.h>
+#include <linux/netdevice.h>
+
+struct dsa_device_ops {
+ netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev);
+ int (*rcv)(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+};
+
+struct dsa_slave_priv {
+ /*
+ * The linux network interface corresponding to this
+ * switch port.
+ */
+ struct net_device *dev;
+ netdev_tx_t (*xmit)(struct sk_buff *skb,
+ struct net_device *dev);
+
+ /*
+ * Which switch this port is a part of, and the port index
+ * for this port.
+ */
+ struct dsa_switch *parent;
+ u8 port;
+
+ /*
+ * The phylib phy_device pointer for the PHY connected
+ * to this port.
+ */
+ struct phy_device *phy;
+ phy_interface_t phy_interface;
+ int old_link;
+ int old_pause;
+ int old_duplex;
+
+ struct net_device *bridge_dev;
+};
+
+/* dsa.c */
+extern char dsa_driver_version[];
+
+/* slave.c */
+extern const struct dsa_device_ops notag_netdev_ops;
+void dsa_slave_mii_bus_init(struct dsa_switch *ds);
+int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
+ int port, char *name);
+int dsa_slave_suspend(struct net_device *slave_dev);
+int dsa_slave_resume(struct net_device *slave_dev);
+int dsa_slave_netdevice_event(struct notifier_block *unused,
+ unsigned long event, void *ptr);
+
+/* tag_dsa.c */
+extern const struct dsa_device_ops dsa_netdev_ops;
+
+/* tag_edsa.c */
+extern const struct dsa_device_ops edsa_netdev_ops;
+
+/* tag_trailer.c */
+extern const struct dsa_device_ops trailer_netdev_ops;
+
+/* tag_brcm.c */
+extern const struct dsa_device_ops brcm_netdev_ops;
+
+
+#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
new file mode 100644
index 000000000..827cda560
--- /dev/null
+++ b/net/dsa/slave.c
@@ -0,0 +1,966 @@
+/*
+ * net/dsa/slave.c - Slave device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <net/rtnetlink.h>
+#include <net/switchdev.h>
+#include <linux/if_bridge.h>
+#include "dsa_priv.h"
+
+/* slave mii_bus handling ***************************************************/
+static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ if (ds->phys_mii_mask & (1 << addr))
+ return ds->drv->phy_read(ds, addr, reg);
+
+ return 0xffff;
+}
+
+static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ if (ds->phys_mii_mask & (1 << addr))
+ return ds->drv->phy_write(ds, addr, reg, val);
+
+ return 0;
+}
+
+void dsa_slave_mii_bus_init(struct dsa_switch *ds)
+{
+ ds->slave_mii_bus->priv = (void *)ds;
+ ds->slave_mii_bus->name = "dsa slave smi";
+ ds->slave_mii_bus->read = dsa_slave_phy_read;
+ ds->slave_mii_bus->write = dsa_slave_phy_write;
+ snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x",
+ ds->index, ds->pd->sw_addr);
+ ds->slave_mii_bus->parent = ds->master_dev;
+ ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
+}
+
+
+/* slave device handling ****************************************************/
+static int dsa_slave_get_iflink(const struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ return p->parent->dst->master_netdev->ifindex;
+}
+
+static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p)
+{
+ return !!p->bridge_dev;
+}
+
+static int dsa_slave_open(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct net_device *master = p->parent->dst->master_netdev;
+ struct dsa_switch *ds = p->parent;
+ u8 stp_state = dsa_port_is_bridged(p) ?
+ BR_STATE_BLOCKING : BR_STATE_FORWARDING;
+ int err;
+
+ if (!(master->flags & IFF_UP))
+ return -ENETDOWN;
+
+ if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) {
+ err = dev_uc_add(master, dev->dev_addr);
+ if (err < 0)
+ goto out;
+ }
+
+ if (dev->flags & IFF_ALLMULTI) {
+ err = dev_set_allmulti(master, 1);
+ if (err < 0)
+ goto del_unicast;
+ }
+ if (dev->flags & IFF_PROMISC) {
+ err = dev_set_promiscuity(master, 1);
+ if (err < 0)
+ goto clear_allmulti;
+ }
+
+ if (ds->drv->port_enable) {
+ err = ds->drv->port_enable(ds, p->port, p->phy);
+ if (err)
+ goto clear_promisc;
+ }
+
+ if (ds->drv->port_stp_update)
+ ds->drv->port_stp_update(ds, p->port, stp_state);
+
+ if (p->phy)
+ phy_start(p->phy);
+
+ return 0;
+
+clear_promisc:
+ if (dev->flags & IFF_PROMISC)
+ dev_set_promiscuity(master, 0);
+clear_allmulti:
+ if (dev->flags & IFF_ALLMULTI)
+ dev_set_allmulti(master, -1);
+del_unicast:
+ if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
+ dev_uc_del(master, dev->dev_addr);
+out:
+ return err;
+}
+
+static int dsa_slave_close(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct net_device *master = p->parent->dst->master_netdev;
+ struct dsa_switch *ds = p->parent;
+
+ if (p->phy)
+ phy_stop(p->phy);
+
+ dev_mc_unsync(master, dev);
+ dev_uc_unsync(master, dev);
+ if (dev->flags & IFF_ALLMULTI)
+ dev_set_allmulti(master, -1);
+ if (dev->flags & IFF_PROMISC)
+ dev_set_promiscuity(master, -1);
+
+ if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
+ dev_uc_del(master, dev->dev_addr);
+
+ if (ds->drv->port_disable)
+ ds->drv->port_disable(ds, p->port, p->phy);
+
+ if (ds->drv->port_stp_update)
+ ds->drv->port_stp_update(ds, p->port, BR_STATE_DISABLED);
+
+ return 0;
+}
+
+static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct net_device *master = p->parent->dst->master_netdev;
+
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1);
+}
+
+static void dsa_slave_set_rx_mode(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct net_device *master = p->parent->dst->master_netdev;
+
+ dev_mc_sync(master, dev);
+ dev_uc_sync(master, dev);
+}
+
+static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct net_device *master = p->parent->dst->master_netdev;
+ struct sockaddr *addr = a;
+ int err;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ if (!(dev->flags & IFF_UP))
+ goto out;
+
+ if (!ether_addr_equal(addr->sa_data, master->dev_addr)) {
+ err = dev_uc_add(master, addr->sa_data);
+ if (err < 0)
+ return err;
+ }
+
+ if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
+ dev_uc_del(master, dev->dev_addr);
+
+out:
+ ether_addr_copy(dev->dev_addr, addr->sa_data);
+
+ return 0;
+}
+
+static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid, u16 nlm_flags)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+ if (ds->drv->fdb_add)
+ ret = ds->drv->fdb_add(ds, p->port, addr, vid);
+
+ return ret;
+}
+
+static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+ if (ds->drv->fdb_del)
+ ret = ds->drv->fdb_del(ds, p->port, addr, vid);
+
+ return ret;
+}
+
+static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
+ const unsigned char *addr, u16 vid,
+ bool is_static,
+ u32 portid, u32 seq, int type,
+ unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = NTF_EXT_LEARNED;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dev->ifindex;
+ ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ if (vid && nla_put_u16(skb, NDA_VLAN, vid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Dump information about entries, in response to GETNEIGH */
+static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev,
+ struct net_device *filter_dev, int idx)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ unsigned char addr[ETH_ALEN] = { 0 };
+ int ret;
+
+ if (!ds->drv->fdb_getnext)
+ return -EOPNOTSUPP;
+
+ for (; ; idx++) {
+ bool is_static;
+
+ ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static);
+ if (ret < 0)
+ break;
+
+ if (idx < cb->args[0])
+ continue;
+
+ ret = dsa_slave_fill_info(dev, skb, addr, 0,
+ is_static,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, NLM_F_MULTI);
+ if (ret < 0)
+ break;
+ }
+
+ return idx;
+}
+
+static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ if (p->phy != NULL)
+ return phy_mii_ioctl(p->phy, ifr, cmd);
+
+ return -EOPNOTSUPP;
+}
+
+/* Return a bitmask of all ports being currently bridged within a given bridge
+ * device. Note that on leave, the mask will still return the bitmask of ports
+ * currently bridged, prior to port removal, and this is exactly what we want.
+ */
+static u32 dsa_slave_br_port_mask(struct dsa_switch *ds,
+ struct net_device *bridge)
+{
+ struct dsa_slave_priv *p;
+ unsigned int port;
+ u32 mask = 0;
+
+ for (port = 0; port < DSA_MAX_PORTS; port++) {
+ if (!dsa_is_port_initialized(ds, port))
+ continue;
+
+ p = netdev_priv(ds->ports[port]);
+
+ if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT &&
+ p->bridge_dev == bridge)
+ mask |= 1 << port;
+ }
+
+ return mask;
+}
+
+static int dsa_slave_stp_update(struct net_device *dev, u8 state)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+ if (ds->drv->port_stp_update)
+ ret = ds->drv->port_stp_update(ds, p->port, state);
+
+ return ret;
+}
+
+static int dsa_slave_bridge_port_join(struct net_device *dev,
+ struct net_device *br)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+ p->bridge_dev = br;
+
+ if (ds->drv->port_join_bridge)
+ ret = ds->drv->port_join_bridge(ds, p->port,
+ dsa_slave_br_port_mask(ds, br));
+
+ return ret;
+}
+
+static int dsa_slave_bridge_port_leave(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+
+ if (ds->drv->port_leave_bridge)
+ ret = ds->drv->port_leave_bridge(ds, p->port,
+ dsa_slave_br_port_mask(ds, p->bridge_dev));
+
+ p->bridge_dev = NULL;
+
+ /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
+ * so allow it to be in BR_STATE_FORWARDING to be kept functional
+ */
+ dsa_slave_stp_update(dev, BR_STATE_FORWARDING);
+
+ return ret;
+}
+
+static int dsa_slave_parent_id_get(struct net_device *dev,
+ struct netdev_phys_item_id *psid)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ psid->id_len = sizeof(ds->index);
+ memcpy(&psid->id, &ds->index, psid->id_len);
+
+ return 0;
+}
+
+static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ return p->xmit(skb, dev);
+}
+
+static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ skb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(skb);
+
+ return NETDEV_TX_OK;
+}
+
+
+/* ethtool operations *******************************************************/
+static int
+dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ int err;
+
+ err = -EOPNOTSUPP;
+ if (p->phy != NULL) {
+ err = phy_read_status(p->phy);
+ if (err == 0)
+ err = phy_ethtool_gset(p->phy, cmd);
+ }
+
+ return err;
+}
+
+static int
+dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ if (p->phy != NULL)
+ return phy_ethtool_sset(p->phy, cmd);
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
+ strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version));
+ strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+ strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
+}
+
+static int dsa_slave_get_regs_len(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->get_regs_len)
+ return ds->drv->get_regs_len(ds, p->port);
+
+ return -EOPNOTSUPP;
+}
+
+static void
+dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->get_regs)
+ ds->drv->get_regs(ds, p->port, regs, _p);
+}
+
+static int dsa_slave_nway_reset(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ if (p->phy != NULL)
+ return genphy_restart_aneg(p->phy);
+
+ return -EOPNOTSUPP;
+}
+
+static u32 dsa_slave_get_link(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ if (p->phy != NULL) {
+ genphy_update_link(p->phy);
+ return p->phy->link;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_slave_get_eeprom_len(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->pd->eeprom_len)
+ return ds->pd->eeprom_len;
+
+ if (ds->drv->get_eeprom_len)
+ return ds->drv->get_eeprom_len(ds);
+
+ return 0;
+}
+
+static int dsa_slave_get_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->get_eeprom)
+ return ds->drv->get_eeprom(ds, eeprom, data);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_slave_set_eeprom(struct net_device *dev,
+ struct ethtool_eeprom *eeprom, u8 *data)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->set_eeprom)
+ return ds->drv->set_eeprom(ds, eeprom, data);
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_strings(struct net_device *dev,
+ uint32_t stringset, uint8_t *data)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (stringset == ETH_SS_STATS) {
+ int len = ETH_GSTRING_LEN;
+
+ strncpy(data, "tx_packets", len);
+ strncpy(data + len, "tx_bytes", len);
+ strncpy(data + 2 * len, "rx_packets", len);
+ strncpy(data + 3 * len, "rx_bytes", len);
+ if (ds->drv->get_strings != NULL)
+ ds->drv->get_strings(ds, p->port, data + 4 * len);
+ }
+}
+
+static void dsa_slave_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ data[0] = p->dev->stats.tx_packets;
+ data[1] = p->dev->stats.tx_bytes;
+ data[2] = p->dev->stats.rx_packets;
+ data[3] = p->dev->stats.rx_bytes;
+ if (ds->drv->get_ethtool_stats != NULL)
+ ds->drv->get_ethtool_stats(ds, p->port, data + 4);
+}
+
+static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (sset == ETH_SS_STATS) {
+ int count;
+
+ count = 4;
+ if (ds->drv->get_sset_count != NULL)
+ count += ds->drv->get_sset_count(ds);
+
+ return count;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->get_wol)
+ ds->drv->get_wol(ds, p->port, w);
+}
+
+static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret = -EOPNOTSUPP;
+
+ if (ds->drv->set_wol)
+ ret = ds->drv->set_wol(ds, p->port, w);
+
+ return ret;
+}
+
+static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret;
+
+ if (!ds->drv->set_eee)
+ return -EOPNOTSUPP;
+
+ ret = ds->drv->set_eee(ds, p->port, p->phy, e);
+ if (ret)
+ return ret;
+
+ if (p->phy)
+ ret = phy_ethtool_set_eee(p->phy, e);
+
+ return ret;
+}
+
+static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ int ret;
+
+ if (!ds->drv->get_eee)
+ return -EOPNOTSUPP;
+
+ ret = ds->drv->get_eee(ds, p->port, e);
+ if (ret)
+ return ret;
+
+ if (p->phy)
+ ret = phy_ethtool_get_eee(p->phy, e);
+
+ return ret;
+}
+
+static const struct ethtool_ops dsa_slave_ethtool_ops = {
+ .get_settings = dsa_slave_get_settings,
+ .set_settings = dsa_slave_set_settings,
+ .get_drvinfo = dsa_slave_get_drvinfo,
+ .get_regs_len = dsa_slave_get_regs_len,
+ .get_regs = dsa_slave_get_regs,
+ .nway_reset = dsa_slave_nway_reset,
+ .get_link = dsa_slave_get_link,
+ .get_eeprom_len = dsa_slave_get_eeprom_len,
+ .get_eeprom = dsa_slave_get_eeprom,
+ .set_eeprom = dsa_slave_set_eeprom,
+ .get_strings = dsa_slave_get_strings,
+ .get_ethtool_stats = dsa_slave_get_ethtool_stats,
+ .get_sset_count = dsa_slave_get_sset_count,
+ .set_wol = dsa_slave_set_wol,
+ .get_wol = dsa_slave_get_wol,
+ .set_eee = dsa_slave_set_eee,
+ .get_eee = dsa_slave_get_eee,
+};
+
+static const struct net_device_ops dsa_slave_netdev_ops = {
+ .ndo_open = dsa_slave_open,
+ .ndo_stop = dsa_slave_close,
+ .ndo_start_xmit = dsa_slave_xmit,
+ .ndo_change_rx_flags = dsa_slave_change_rx_flags,
+ .ndo_set_rx_mode = dsa_slave_set_rx_mode,
+ .ndo_set_mac_address = dsa_slave_set_mac_address,
+ .ndo_fdb_add = dsa_slave_fdb_add,
+ .ndo_fdb_del = dsa_slave_fdb_del,
+ .ndo_fdb_dump = dsa_slave_fdb_dump,
+ .ndo_do_ioctl = dsa_slave_ioctl,
+ .ndo_get_iflink = dsa_slave_get_iflink,
+};
+
+static const struct swdev_ops dsa_slave_swdev_ops = {
+ .swdev_parent_id_get = dsa_slave_parent_id_get,
+ .swdev_port_stp_update = dsa_slave_stp_update,
+};
+
+static void dsa_slave_adjust_link(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+ unsigned int status_changed = 0;
+
+ if (p->old_link != p->phy->link) {
+ status_changed = 1;
+ p->old_link = p->phy->link;
+ }
+
+ if (p->old_duplex != p->phy->duplex) {
+ status_changed = 1;
+ p->old_duplex = p->phy->duplex;
+ }
+
+ if (p->old_pause != p->phy->pause) {
+ status_changed = 1;
+ p->old_pause = p->phy->pause;
+ }
+
+ if (ds->drv->adjust_link && status_changed)
+ ds->drv->adjust_link(ds, p->port, p->phy);
+
+ if (status_changed)
+ phy_print_status(p->phy);
+}
+
+static int dsa_slave_fixed_link_update(struct net_device *dev,
+ struct fixed_phy_status *status)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ if (ds->drv->fixed_link_update)
+ ds->drv->fixed_link_update(ds, p->port, status);
+
+ return 0;
+}
+
+/* slave device setup *******************************************************/
+static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
+ struct net_device *slave_dev,
+ int addr)
+{
+ struct dsa_switch *ds = p->parent;
+
+ p->phy = ds->slave_mii_bus->phy_map[addr];
+ if (!p->phy)
+ return -ENODEV;
+
+ /* Use already configured phy mode */
+ p->phy_interface = p->phy->interface;
+ phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
+ p->phy_interface);
+
+ return 0;
+}
+
+static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
+ struct net_device *slave_dev)
+{
+ struct dsa_switch *ds = p->parent;
+ struct dsa_chip_data *cd = ds->pd;
+ struct device_node *phy_dn, *port_dn;
+ bool phy_is_fixed = false;
+ u32 phy_flags = 0;
+ int mode, ret;
+
+ port_dn = cd->port_dn[p->port];
+ mode = of_get_phy_mode(port_dn);
+ if (mode < 0)
+ mode = PHY_INTERFACE_MODE_NA;
+ p->phy_interface = mode;
+
+ phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+ if (of_phy_is_fixed_link(port_dn)) {
+ /* In the case of a fixed PHY, the DT node associated
+ * to the fixed PHY is the Port DT node
+ */
+ ret = of_phy_register_fixed_link(port_dn);
+ if (ret) {
+ netdev_err(slave_dev, "failed to register fixed PHY\n");
+ return ret;
+ }
+ phy_is_fixed = true;
+ phy_dn = port_dn;
+ }
+
+ if (ds->drv->get_phy_flags)
+ phy_flags = ds->drv->get_phy_flags(ds, p->port);
+
+ if (phy_dn) {
+ ret = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
+ /* If this PHY address is part of phys_mii_mask, which means
+ * that we need to divert reads and writes to/from it, then we
+ * want to bind this device using the slave MII bus created by
+ * DSA to make that happen.
+ */
+ if (!phy_is_fixed && ret >= 0 &&
+ (ds->phys_mii_mask & (1 << ret))) {
+ ret = dsa_slave_phy_connect(p, slave_dev, ret);
+ if (ret)
+ return ret;
+ } else {
+ p->phy = of_phy_connect(slave_dev, phy_dn,
+ dsa_slave_adjust_link,
+ phy_flags,
+ p->phy_interface);
+ }
+ }
+
+ if (p->phy && phy_is_fixed)
+ fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update);
+
+ /* We could not connect to a designated PHY, so use the switch internal
+ * MDIO bus instead
+ */
+ if (!p->phy) {
+ ret = dsa_slave_phy_connect(p, slave_dev, p->port);
+ if (ret)
+ return ret;
+ } else {
+ netdev_info(slave_dev, "attached PHY at address %d [%s]\n",
+ p->phy->addr, p->phy->drv->name);
+ }
+
+ return 0;
+}
+
+int dsa_slave_suspend(struct net_device *slave_dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(slave_dev);
+
+ netif_device_detach(slave_dev);
+
+ if (p->phy) {
+ phy_stop(p->phy);
+ p->old_pause = -1;
+ p->old_link = -1;
+ p->old_duplex = -1;
+ phy_suspend(p->phy);
+ }
+
+ return 0;
+}
+
+int dsa_slave_resume(struct net_device *slave_dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(slave_dev);
+
+ netif_device_attach(slave_dev);
+
+ if (p->phy) {
+ phy_resume(p->phy);
+ phy_start(p->phy);
+ }
+
+ return 0;
+}
+
+int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
+ int port, char *name)
+{
+ struct net_device *master = ds->dst->master_netdev;
+ struct net_device *slave_dev;
+ struct dsa_slave_priv *p;
+ int ret;
+
+ slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name,
+ NET_NAME_UNKNOWN, ether_setup);
+ if (slave_dev == NULL)
+ return -ENOMEM;
+
+ slave_dev->features = master->vlan_features;
+ slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
+ eth_hw_addr_inherit(slave_dev, master);
+ slave_dev->tx_queue_len = 0;
+ slave_dev->netdev_ops = &dsa_slave_netdev_ops;
+ slave_dev->swdev_ops = &dsa_slave_swdev_ops;
+
+ SET_NETDEV_DEV(slave_dev, parent);
+ slave_dev->dev.of_node = ds->pd->port_dn[port];
+ slave_dev->vlan_features = master->vlan_features;
+
+ p = netdev_priv(slave_dev);
+ p->dev = slave_dev;
+ p->parent = ds;
+ p->port = port;
+
+ switch (ds->dst->tag_protocol) {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+ case DSA_TAG_PROTO_DSA:
+ p->xmit = dsa_netdev_ops.xmit;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+ case DSA_TAG_PROTO_EDSA:
+ p->xmit = edsa_netdev_ops.xmit;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+ case DSA_TAG_PROTO_TRAILER:
+ p->xmit = trailer_netdev_ops.xmit;
+ break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_BRCM
+ case DSA_TAG_PROTO_BRCM:
+ p->xmit = brcm_netdev_ops.xmit;
+ break;
+#endif
+ default:
+ p->xmit = dsa_slave_notag_xmit;
+ break;
+ }
+
+ p->old_pause = -1;
+ p->old_link = -1;
+ p->old_duplex = -1;
+
+ ret = dsa_slave_phy_setup(p, slave_dev);
+ if (ret) {
+ free_netdev(slave_dev);
+ return ret;
+ }
+
+ ds->ports[port] = slave_dev;
+ ret = register_netdev(slave_dev);
+ if (ret) {
+ netdev_err(master, "error %d registering interface %s\n",
+ ret, slave_dev->name);
+ phy_disconnect(p->phy);
+ ds->ports[port] = NULL;
+ free_netdev(slave_dev);
+ return ret;
+ }
+
+ netif_carrier_off(slave_dev);
+
+ return 0;
+}
+
+static bool dsa_slave_dev_check(struct net_device *dev)
+{
+ return dev->netdev_ops == &dsa_slave_netdev_ops;
+}
+
+static int dsa_slave_master_changed(struct net_device *dev)
+{
+ struct net_device *master = netdev_master_upper_dev_get(dev);
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ int err = 0;
+
+ if (master && master->rtnl_link_ops &&
+ !strcmp(master->rtnl_link_ops->kind, "bridge"))
+ err = dsa_slave_bridge_port_join(dev, master);
+ else if (dsa_port_is_bridged(p))
+ err = dsa_slave_bridge_port_leave(dev);
+
+ return err;
+}
+
+int dsa_slave_netdevice_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev;
+ int err = 0;
+
+ switch (event) {
+ case NETDEV_CHANGEUPPER:
+ dev = netdev_notifier_info_to_dev(ptr);
+ if (!dsa_slave_dev_check(dev))
+ goto out;
+
+ err = dsa_slave_master_changed(dev);
+ if (err)
+ netdev_warn(dev, "failed to reflect master change\n");
+
+ break;
+ }
+
+out:
+ return NOTIFY_DONE;
+}
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
new file mode 100644
index 000000000..83d3572cd
--- /dev/null
+++ b/net/dsa/tag_brcm.c
@@ -0,0 +1,171 @@
+/*
+ * Broadcom tag support
+ *
+ * Copyright (C) 2014 Broadcom Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+/* This tag length is 4 bytes, older ones were 6 bytes, we do not
+ * handle them
+ */
+#define BRCM_TAG_LEN 4
+
+/* Tag is constructed and desconstructed using byte by byte access
+ * because the tag is placed after the MAC Source Address, which does
+ * not make it 4-bytes aligned, so this might cause unaligned accesses
+ * on most systems where this is used.
+ */
+
+/* Ingress and egress opcodes */
+#define BRCM_OPCODE_SHIFT 5
+#define BRCM_OPCODE_MASK 0x7
+
+/* Ingress fields */
+/* 1st byte in the tag */
+#define BRCM_IG_TC_SHIFT 2
+#define BRCM_IG_TC_MASK 0x7
+/* 2nd byte in the tag */
+#define BRCM_IG_TE_MASK 0x3
+#define BRCM_IG_TS_SHIFT 7
+/* 3rd byte in the tag */
+#define BRCM_IG_DSTMAP2_MASK 1
+#define BRCM_IG_DSTMAP1_MASK 0xff
+
+/* Egress fields */
+
+/* 2nd byte in the tag */
+#define BRCM_EG_CID_MASK 0xff
+
+/* 3rd byte in the tag */
+#define BRCM_EG_RC_MASK 0xff
+#define BRCM_EG_RC_RSVD (3 << 6)
+#define BRCM_EG_RC_EXCEPTION (1 << 5)
+#define BRCM_EG_RC_PROT_SNOOP (1 << 4)
+#define BRCM_EG_RC_PROT_TERM (1 << 3)
+#define BRCM_EG_RC_SWITCH (1 << 2)
+#define BRCM_EG_RC_MAC_LEARN (1 << 1)
+#define BRCM_EG_RC_MIRROR (1 << 0)
+#define BRCM_EG_TC_SHIFT 5
+#define BRCM_EG_TC_MASK 0x7
+#define BRCM_EG_PID_MASK 0x1f
+
+static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ u8 *brcm_tag;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
+ goto out_free;
+
+ skb_push(skb, BRCM_TAG_LEN);
+
+ memmove(skb->data, skb->data + BRCM_TAG_LEN, 2 * ETH_ALEN);
+
+ /* Build the tag after the MAC Source Address */
+ brcm_tag = skb->data + 2 * ETH_ALEN;
+
+ /* Set the ingress opcode, traffic class, tag enforcment is
+ * deprecated
+ */
+ brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) |
+ ((skb->priority << BRCM_IG_TC_SHIFT) & BRCM_IG_TC_MASK);
+ brcm_tag[1] = 0;
+ brcm_tag[2] = 0;
+ if (p->port == 8)
+ brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
+ brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK;
+
+ /* Queue the SKB for transmission on the parent interface, but
+ * do not modify its EtherType
+ */
+ skb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(skb);
+
+ return NETDEV_TX_OK;
+
+out_free:
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ int source_port;
+ u8 *brcm_tag;
+
+ if (unlikely(dst == NULL))
+ goto out_drop;
+
+ ds = dst->ds[0];
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN)))
+ goto out_drop;
+
+ /* skb->data points to the EtherType, the tag is right before it */
+ brcm_tag = skb->data - 2;
+
+ /* The opcode should never be different than 0b000 */
+ if (unlikely((brcm_tag[0] >> BRCM_OPCODE_SHIFT) & BRCM_OPCODE_MASK))
+ goto out_drop;
+
+ /* We should never see a reserved reason code without knowing how to
+ * handle it
+ */
+ WARN_ON(brcm_tag[2] & BRCM_EG_RC_RSVD);
+
+ /* Locate which port this is coming from */
+ source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
+
+ /* Validate port against switch setup, either the port is totally */
+ if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ goto out_drop;
+
+ /* Remove Broadcom tag and update checksum */
+ skb_pull_rcsum(skb, BRCM_TAG_LEN);
+
+ /* Move the Ethernet DA and SA */
+ memmove(skb->data - ETH_HLEN,
+ skb->data - ETH_HLEN - BRCM_TAG_LEN,
+ 2 * ETH_ALEN);
+
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ds->ports[source_port];
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops brcm_netdev_ops = {
+ .xmit = brcm_tag_xmit,
+ .rcv = brcm_tag_rcv,
+};
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
new file mode 100644
index 000000000..2dab27063
--- /dev/null
+++ b/net/dsa/tag_dsa.c
@@ -0,0 +1,189 @@
+/*
+ * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN 4
+
+static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ u8 *dsa_header;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ /*
+ * Convert the outermost 802.1q tag to a DSA tag for tagged
+ * packets, or insert a DSA tag between the addresses and
+ * the ethertype field for untagged packets.
+ */
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ if (skb_cow_head(skb, 0) < 0)
+ goto out_free;
+
+ /*
+ * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+ */
+ dsa_header = skb->data + 2 * ETH_ALEN;
+ dsa_header[0] = 0x60 | p->parent->index;
+ dsa_header[1] = p->port << 3;
+
+ /*
+ * Move CFI field from byte 2 to byte 1.
+ */
+ if (dsa_header[2] & 0x10) {
+ dsa_header[1] |= 0x01;
+ dsa_header[2] &= ~0x10;
+ }
+ } else {
+ if (skb_cow_head(skb, DSA_HLEN) < 0)
+ goto out_free;
+ skb_push(skb, DSA_HLEN);
+
+ memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+ /*
+ * Construct untagged FROM_CPU DSA tag.
+ */
+ dsa_header = skb->data + 2 * ETH_ALEN;
+ dsa_header[0] = 0x40 | p->parent->index;
+ dsa_header[1] = p->port << 3;
+ dsa_header[2] = 0x00;
+ dsa_header[3] = 0x00;
+ }
+
+ skb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(skb);
+
+ return NETDEV_TX_OK;
+
+out_free:
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ u8 *dsa_header;
+ int source_device;
+ int source_port;
+
+ if (unlikely(dst == NULL))
+ goto out_drop;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+ goto out_drop;
+
+ /*
+ * The ethertype field is part of the DSA header.
+ */
+ dsa_header = skb->data - 2;
+
+ /*
+ * Check that frame type is either TO_CPU or FORWARD.
+ */
+ if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
+ goto out_drop;
+
+ /*
+ * Determine source device and port.
+ */
+ source_device = dsa_header[0] & 0x1f;
+ source_port = (dsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
+ if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ goto out_drop;
+
+ /*
+ * Convert the DSA header to an 802.1q header if the 'tagged'
+ * bit in the DSA header is set. If the 'tagged' bit is clear,
+ * delete the DSA header entirely.
+ */
+ if (dsa_header[0] & 0x20) {
+ u8 new_header[4];
+
+ /*
+ * Insert 802.1q ethertype and copy the VLAN-related
+ * fields, but clear the bit that will hold CFI (since
+ * DSA uses that bit location for another purpose).
+ */
+ new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+ new_header[1] = ETH_P_8021Q & 0xff;
+ new_header[2] = dsa_header[2] & ~0x10;
+ new_header[3] = dsa_header[3];
+
+ /*
+ * Move CFI bit from its place in the DSA header to
+ * its 802.1q-designated place.
+ */
+ if (dsa_header[1] & 0x01)
+ new_header[2] |= 0x10;
+
+ /*
+ * Update packet checksum if skb is CHECKSUM_COMPLETE.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __wsum c = skb->csum;
+ c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+ c = csum_sub(c, csum_partial(dsa_header + 2, 2, 0));
+ skb->csum = c;
+ }
+
+ memcpy(dsa_header, new_header, DSA_HLEN);
+ } else {
+ /*
+ * Remove DSA tag and update checksum.
+ */
+ skb_pull_rcsum(skb, DSA_HLEN);
+ memmove(skb->data - ETH_HLEN,
+ skb->data - ETH_HLEN - DSA_HLEN,
+ 2 * ETH_ALEN);
+ }
+
+ skb->dev = ds->ports[source_port];
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops dsa_netdev_ops = {
+ .xmit = dsa_xmit,
+ .rcv = dsa_rcv,
+};
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
new file mode 100644
index 000000000..9aeda596f
--- /dev/null
+++ b/net/dsa/tag_edsa.c
@@ -0,0 +1,208 @@
+/*
+ * net/dsa/tag_edsa.c - Ethertype DSA tagging
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN 4
+#define EDSA_HLEN 8
+
+static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ u8 *edsa_header;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ /*
+ * Convert the outermost 802.1q tag to a DSA tag and prepend
+ * a DSA ethertype field is the packet is tagged, or insert
+ * a DSA ethertype plus DSA tag between the addresses and the
+ * current ethertype field if the packet is untagged.
+ */
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ if (skb_cow_head(skb, DSA_HLEN) < 0)
+ goto out_free;
+ skb_push(skb, DSA_HLEN);
+
+ memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+ /*
+ * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+ */
+ edsa_header = skb->data + 2 * ETH_ALEN;
+ edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+ edsa_header[1] = ETH_P_EDSA & 0xff;
+ edsa_header[2] = 0x00;
+ edsa_header[3] = 0x00;
+ edsa_header[4] = 0x60 | p->parent->index;
+ edsa_header[5] = p->port << 3;
+
+ /*
+ * Move CFI field from byte 6 to byte 5.
+ */
+ if (edsa_header[6] & 0x10) {
+ edsa_header[5] |= 0x01;
+ edsa_header[6] &= ~0x10;
+ }
+ } else {
+ if (skb_cow_head(skb, EDSA_HLEN) < 0)
+ goto out_free;
+ skb_push(skb, EDSA_HLEN);
+
+ memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN);
+
+ /*
+ * Construct untagged FROM_CPU DSA tag.
+ */
+ edsa_header = skb->data + 2 * ETH_ALEN;
+ edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+ edsa_header[1] = ETH_P_EDSA & 0xff;
+ edsa_header[2] = 0x00;
+ edsa_header[3] = 0x00;
+ edsa_header[4] = 0x40 | p->parent->index;
+ edsa_header[5] = p->port << 3;
+ edsa_header[6] = 0x00;
+ edsa_header[7] = 0x00;
+ }
+
+ skb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(skb);
+
+ return NETDEV_TX_OK;
+
+out_free:
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ u8 *edsa_header;
+ int source_device;
+ int source_port;
+
+ if (unlikely(dst == NULL))
+ goto out_drop;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+ goto out_drop;
+
+ /*
+ * Skip the two null bytes after the ethertype.
+ */
+ edsa_header = skb->data + 2;
+
+ /*
+ * Check that frame type is either TO_CPU or FORWARD.
+ */
+ if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
+ goto out_drop;
+
+ /*
+ * Determine source device and port.
+ */
+ source_device = edsa_header[0] & 0x1f;
+ source_port = (edsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
+ if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ goto out_drop;
+
+ /*
+ * If the 'tagged' bit is set, convert the DSA tag to a 802.1q
+ * tag and delete the ethertype part. If the 'tagged' bit is
+ * clear, delete the ethertype and the DSA tag parts.
+ */
+ if (edsa_header[0] & 0x20) {
+ u8 new_header[4];
+
+ /*
+ * Insert 802.1q ethertype and copy the VLAN-related
+ * fields, but clear the bit that will hold CFI (since
+ * DSA uses that bit location for another purpose).
+ */
+ new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+ new_header[1] = ETH_P_8021Q & 0xff;
+ new_header[2] = edsa_header[2] & ~0x10;
+ new_header[3] = edsa_header[3];
+
+ /*
+ * Move CFI bit from its place in the DSA header to
+ * its 802.1q-designated place.
+ */
+ if (edsa_header[1] & 0x01)
+ new_header[2] |= 0x10;
+
+ skb_pull_rcsum(skb, DSA_HLEN);
+
+ /*
+ * Update packet checksum if skb is CHECKSUM_COMPLETE.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __wsum c = skb->csum;
+ c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+ c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0));
+ skb->csum = c;
+ }
+
+ memcpy(edsa_header, new_header, DSA_HLEN);
+
+ memmove(skb->data - ETH_HLEN,
+ skb->data - ETH_HLEN - DSA_HLEN,
+ 2 * ETH_ALEN);
+ } else {
+ /*
+ * Remove DSA tag and update checksum.
+ */
+ skb_pull_rcsum(skb, EDSA_HLEN);
+ memmove(skb->data - ETH_HLEN,
+ skb->data - ETH_HLEN - EDSA_HLEN,
+ 2 * ETH_ALEN);
+ }
+
+ skb->dev = ds->ports[source_port];
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops edsa_netdev_ops = {
+ .xmit = edsa_xmit,
+ .rcv = edsa_rcv,
+};
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
new file mode 100644
index 000000000..e268f9db8
--- /dev/null
+++ b/net/dsa/tag_trailer.c
@@ -0,0 +1,117 @@
+/*
+ * net/dsa/tag_trailer.c - Trailer tag format handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct sk_buff *nskb;
+ int padlen;
+ u8 *trailer;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ /*
+ * We have to make sure that the trailer ends up as the very
+ * last 4 bytes of the packet. This means that we have to pad
+ * the packet to the minimum ethernet frame size, if necessary,
+ * before adding the trailer.
+ */
+ padlen = 0;
+ if (skb->len < 60)
+ padlen = 60 - skb->len;
+
+ nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
+ if (nskb == NULL) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+ skb_reserve(nskb, NET_IP_ALIGN);
+
+ skb_reset_mac_header(nskb);
+ skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
+ skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
+ skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
+ kfree_skb(skb);
+
+ if (padlen) {
+ u8 *pad = skb_put(nskb, padlen);
+ memset(pad, 0, padlen);
+ }
+
+ trailer = skb_put(nskb, 4);
+ trailer[0] = 0x80;
+ trailer[1] = 1 << p->port;
+ trailer[2] = 0x10;
+ trailer[3] = 0x00;
+
+ nskb->dev = p->parent->dst->master_netdev;
+ dev_queue_xmit(nskb);
+
+ return NETDEV_TX_OK;
+}
+
+static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
+ u8 *trailer;
+ int source_port;
+
+ if (unlikely(dst == NULL))
+ goto out_drop;
+ ds = dst->ds[0];
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ if (skb_linearize(skb))
+ goto out_drop;
+
+ trailer = skb_tail_pointer(skb) - 4;
+ if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
+ (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00)
+ goto out_drop;
+
+ source_port = trailer[1] & 7;
+ if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+ goto out_drop;
+
+ pskb_trim_rcsum(skb, skb->len - 4);
+
+ skb->dev = ds->ports[source_port];
+ skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ skb->dev->stats.rx_packets++;
+ skb->dev->stats.rx_bytes += skb->len;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+out_drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+const struct dsa_device_ops trailer_netdev_ops = {
+ .xmit = trailer_xmit,
+ .rcv = trailer_rcv,
+};
diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile
new file mode 100644
index 000000000..323177505
--- /dev/null
+++ b/net/ethernet/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Linux Ethernet layer.
+#
+
+obj-y += eth.o
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
new file mode 100644
index 000000000..d8aa0a221
--- /dev/null
+++ b/net/ethernet/eth.c
@@ -0,0 +1,484 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Ethernet-type device handling.
+ *
+ * Version: @(#)eth.c 1.0.7 05/25/93
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Florian La Roche, <rzsfl@rz.uni-sb.de>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * Fixes:
+ * Mr Linux : Arp problems
+ * Alan Cox : Generic queue tidyup (very tiny here)
+ * Alan Cox : eth_header ntohs should be htons
+ * Alan Cox : eth_rebuild_header missing an htons and
+ * minor other things.
+ * Tegge : Arp bug fixes.
+ * Florian : Removed many unnecessary functions, code cleanup
+ * and changes for new arp and skbuff.
+ * Alan Cox : Redid header building to reflect new format.
+ * Alan Cox : ARP only when compiled with CONFIG_INET
+ * Greg Page : 802.2 and SNAP stuff.
+ * Alan Cox : MAC layer pointers/new format.
+ * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding.
+ * Alan Cox : Protect against forwarding explosions with
+ * older network drivers and IFF_ALLMULTI.
+ * Christer Weinigel : Better rebuild header message.
+ * Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup().
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <net/dst.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <net/dsa.h>
+#include <linux/uaccess.h>
+
+__setup("ether=", netdev_boot_setup);
+
+/**
+ * eth_header - create the Ethernet header
+ * @skb: buffer to alter
+ * @dev: source device
+ * @type: Ethernet type field
+ * @daddr: destination address (NULL leave destination address)
+ * @saddr: source address (NULL use device source address)
+ * @len: packet length (<= skb->len)
+ *
+ *
+ * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length
+ * in here instead.
+ */
+int eth_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+
+ if (type != ETH_P_802_3 && type != ETH_P_802_2)
+ eth->h_proto = htons(type);
+ else
+ eth->h_proto = htons(len);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (!saddr)
+ saddr = dev->dev_addr;
+ memcpy(eth->h_source, saddr, ETH_ALEN);
+
+ if (daddr) {
+ memcpy(eth->h_dest, daddr, ETH_ALEN);
+ return ETH_HLEN;
+ }
+
+ /*
+ * Anyway, the loopback-device should never use this function...
+ */
+
+ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
+ eth_zero_addr(eth->h_dest);
+ return ETH_HLEN;
+ }
+
+ return -ETH_HLEN;
+}
+EXPORT_SYMBOL(eth_header);
+
+/**
+ * eth_get_headlen - determine the the length of header for an ethernet frame
+ * @data: pointer to start of frame
+ * @len: total length of frame
+ *
+ * Make a best effort attempt to pull the length for all of the headers for
+ * a given frame in a linear buffer.
+ */
+u32 eth_get_headlen(void *data, unsigned int len)
+{
+ const struct ethhdr *eth = (const struct ethhdr *)data;
+ struct flow_keys keys;
+
+ /* this should never happen, but better safe than sorry */
+ if (len < sizeof(*eth))
+ return len;
+
+ /* parse any remaining L2/L3 headers, check for L4 */
+ if (!__skb_flow_dissect(NULL, &keys, data,
+ eth->h_proto, sizeof(*eth), len))
+ return max_t(u32, keys.thoff, sizeof(*eth));
+
+ /* parse for any L4 headers */
+ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
+}
+EXPORT_SYMBOL(eth_get_headlen);
+
+/**
+ * eth_type_trans - determine the packet's protocol ID.
+ * @skb: received socket data
+ * @dev: receiving network device
+ *
+ * The rule here is that we
+ * assume 802.3 if the type field is short enough to be a length.
+ * This is normal practice and works for any 'now in use' protocol.
+ */
+__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned short _service_access_point;
+ const unsigned short *sap;
+ const struct ethhdr *eth;
+
+ skb->dev = dev;
+ skb_reset_mac_header(skb);
+ skb_pull_inline(skb, ETH_HLEN);
+ eth = eth_hdr(skb);
+
+ if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
+ if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+ }
+ else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
+ dev->dev_addr)))
+ skb->pkt_type = PACKET_OTHERHOST;
+
+ /*
+ * Some variants of DSA tagging don't have an ethertype field
+ * at all, so we check here whether one of those tagging
+ * variants has been configured on the receiving interface,
+ * and if so, set skb->protocol without looking at the packet.
+ */
+ if (unlikely(netdev_uses_dsa(dev)))
+ return htons(ETH_P_XDSA);
+
+ if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN))
+ return eth->h_proto;
+
+ /*
+ * This is a magic hack to spot IPX packets. Older Novell breaks
+ * the protocol design and runs IPX over 802.3 without an 802.2 LLC
+ * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
+ * won't work for fault tolerant netware but does for the rest.
+ */
+ sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point);
+ if (sap && *sap == 0xFFFF)
+ return htons(ETH_P_802_3);
+
+ /*
+ * Real 802.2 LLC
+ */
+ return htons(ETH_P_802_2);
+}
+EXPORT_SYMBOL(eth_type_trans);
+
+/**
+ * eth_header_parse - extract hardware address from packet
+ * @skb: packet to extract header from
+ * @haddr: destination buffer
+ */
+int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ const struct ethhdr *eth = eth_hdr(skb);
+ memcpy(haddr, eth->h_source, ETH_ALEN);
+ return ETH_ALEN;
+}
+EXPORT_SYMBOL(eth_header_parse);
+
+/**
+ * eth_header_cache - fill cache entry from neighbour
+ * @neigh: source neighbour
+ * @hh: destination cache entry
+ * @type: Ethernet type field
+ *
+ * Create an Ethernet header template from the neighbour.
+ */
+int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
+{
+ struct ethhdr *eth;
+ const struct net_device *dev = neigh->dev;
+
+ eth = (struct ethhdr *)
+ (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth))));
+
+ if (type == htons(ETH_P_802_3))
+ return -1;
+
+ eth->h_proto = type;
+ memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
+ memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
+ hh->hh_len = ETH_HLEN;
+ return 0;
+}
+EXPORT_SYMBOL(eth_header_cache);
+
+/**
+ * eth_header_cache_update - update cache entry
+ * @hh: destination cache entry
+ * @dev: network device
+ * @haddr: new hardware address
+ *
+ * Called by Address Resolution module to notify changes in address.
+ */
+void eth_header_cache_update(struct hh_cache *hh,
+ const struct net_device *dev,
+ const unsigned char *haddr)
+{
+ memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
+ haddr, ETH_ALEN);
+}
+EXPORT_SYMBOL(eth_header_cache_update);
+
+/**
+ * eth_prepare_mac_addr_change - prepare for mac change
+ * @dev: network device
+ * @p: socket address
+ */
+int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
+ return -EBUSY;
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+ return 0;
+}
+EXPORT_SYMBOL(eth_prepare_mac_addr_change);
+
+/**
+ * eth_commit_mac_addr_change - commit mac change
+ * @dev: network device
+ * @p: socket address
+ */
+void eth_commit_mac_addr_change(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+}
+EXPORT_SYMBOL(eth_commit_mac_addr_change);
+
+/**
+ * eth_mac_addr - set new Ethernet hardware address
+ * @dev: network device
+ * @p: socket address
+ *
+ * Change hardware address of device.
+ *
+ * This doesn't change hardware matching, so needs to be overridden
+ * for most real devices.
+ */
+int eth_mac_addr(struct net_device *dev, void *p)
+{
+ int ret;
+
+ ret = eth_prepare_mac_addr_change(dev, p);
+ if (ret < 0)
+ return ret;
+ eth_commit_mac_addr_change(dev, p);
+ return 0;
+}
+EXPORT_SYMBOL(eth_mac_addr);
+
+/**
+ * eth_change_mtu - set new MTU size
+ * @dev: network device
+ * @new_mtu: new Maximum Transfer Unit
+ *
+ * Allow changing MTU size. Needs to be overridden for devices
+ * supporting jumbo frames.
+ */
+int eth_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL(eth_change_mtu);
+
+int eth_validate_addr(struct net_device *dev)
+{
+ if (!is_valid_ether_addr(dev->dev_addr))
+ return -EADDRNOTAVAIL;
+
+ return 0;
+}
+EXPORT_SYMBOL(eth_validate_addr);
+
+const struct header_ops eth_header_ops ____cacheline_aligned = {
+ .create = eth_header,
+ .parse = eth_header_parse,
+ .cache = eth_header_cache,
+ .cache_update = eth_header_cache_update,
+};
+
+/**
+ * ether_setup - setup Ethernet network device
+ * @dev: network device
+ *
+ * Fill in the fields of the device structure with Ethernet-generic values.
+ */
+void ether_setup(struct net_device *dev)
+{
+ dev->header_ops = &eth_header_ops;
+ dev->type = ARPHRD_ETHER;
+ dev->hard_header_len = ETH_HLEN;
+ dev->mtu = ETH_DATA_LEN;
+ dev->addr_len = ETH_ALEN;
+ dev->tx_queue_len = 50; /* Ethernet wants good latency. Use FreeBSD defaults. */
+ dev->flags = IFF_BROADCAST|IFF_MULTICAST;
+ dev->priv_flags |= IFF_TX_SKB_SHARING;
+
+ eth_broadcast_addr(dev->broadcast);
+
+}
+EXPORT_SYMBOL(ether_setup);
+
+/**
+ * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this Ethernet device
+ * @txqs: The number of TX queues this device has.
+ * @rxqs: The number of RX queues this device has.
+ *
+ * Fill in the fields of the device structure with Ethernet-generic
+ * values. Basically does everything except registering the device.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size (sizeof_priv). A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+
+struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
+ unsigned int rxqs)
+{
+ return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
+ ether_setup, txqs, rxqs);
+}
+EXPORT_SYMBOL(alloc_etherdev_mqs);
+
+ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
+{
+ return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
+}
+EXPORT_SYMBOL(sysfs_format_mac);
+
+struct sk_buff **eth_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff *p, **pp = NULL;
+ struct ethhdr *eh, *eh2;
+ unsigned int hlen, off_eth;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_eth = skb_gro_offset(skb);
+ hlen = off_eth + sizeof(*eh);
+ eh = skb_gro_header_fast(skb, off_eth);
+ if (skb_gro_header_hard(skb, hlen)) {
+ eh = skb_gro_header_slow(skb, hlen, off_eth);
+ if (unlikely(!eh))
+ goto out;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ eh2 = (struct ethhdr *)(p->data + off_eth);
+ if (compare_ether_header(eh, eh2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = eh->h_proto;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL) {
+ flush = 1;
+ goto out_unlock;
+ }
+
+ skb_gro_pull(skb, sizeof(*eh));
+ skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+EXPORT_SYMBOL(eth_gro_receive);
+
+int eth_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff);
+ __be16 type = eh->h_proto;
+ struct packet_offload *ptype;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_mac_header(skb, nhoff);
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = ptype->callbacks.gro_complete(skb, nhoff +
+ sizeof(struct ethhdr));
+
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(eth_gro_complete);
+
+static struct packet_offload eth_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_TEB),
+ .callbacks = {
+ .gro_receive = eth_gro_receive,
+ .gro_complete = eth_gro_complete,
+ },
+};
+
+static int __init eth_offload_init(void)
+{
+ dev_add_offload(&eth_packet_offload);
+
+ return 0;
+}
+
+fs_initcall(eth_offload_init);
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
new file mode 100644
index 000000000..0d3d70905
--- /dev/null
+++ b/net/hsr/Kconfig
@@ -0,0 +1,27 @@
+#
+# IEC 62439-3 High-availability Seamless Redundancy
+#
+
+config HSR
+ tristate "High-availability Seamless Redundancy (HSR)"
+ ---help---
+ If you say Y here, then your Linux box will be able to act as a
+ DANH ("Doubly attached node implementing HSR"). For this to work,
+ your Linux box needs (at least) two physical Ethernet interfaces,
+ and it must be connected as a node in a ring network together with
+ other HSR capable nodes.
+
+ All Ethernet frames sent over the hsr device will be sent in both
+ directions on the ring (over both slave ports), giving a redundant,
+ instant fail-over network. Each HSR node in the ring acts like a
+ bridge for HSR frames, but filters frames that have been forwarded
+ earlier.
+
+ This code is a "best effort" to comply with the HSR standard as
+ described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have
+ been made.
+
+ You need to perform any and all necessary tests yourself before
+ relying on this code in a safety critical system!
+
+ If unsure, say N.
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
new file mode 100644
index 000000000..9ae972a82
--- /dev/null
+++ b/net/hsr/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for HSR
+#
+
+obj-$(CONFIG_HSR) += hsr.o
+
+hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \
+ hsr_netlink.o hsr_slave.o hsr_forward.o
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
new file mode 100644
index 000000000..44d27469a
--- /dev/null
+++ b/net/hsr/hsr_device.c
@@ -0,0 +1,503 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * This file contains device methods for creating, using and destroying
+ * virtual HSR devices.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include "hsr_device.h"
+#include "hsr_slave.h"
+#include "hsr_framereg.h"
+#include "hsr_main.h"
+#include "hsr_forward.h"
+
+
+static bool is_admin_up(struct net_device *dev)
+{
+ return dev && (dev->flags & IFF_UP);
+}
+
+static bool is_slave_up(struct net_device *dev)
+{
+ return dev && is_admin_up(dev) && netif_oper_up(dev);
+}
+
+static void __hsr_set_operstate(struct net_device *dev, int transition)
+{
+ write_lock_bh(&dev_base_lock);
+ if (dev->operstate != transition) {
+ dev->operstate = transition;
+ write_unlock_bh(&dev_base_lock);
+ netdev_state_change(dev);
+ } else {
+ write_unlock_bh(&dev_base_lock);
+ }
+}
+
+static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
+{
+ if (!is_admin_up(master->dev)) {
+ __hsr_set_operstate(master->dev, IF_OPER_DOWN);
+ return;
+ }
+
+ if (has_carrier)
+ __hsr_set_operstate(master->dev, IF_OPER_UP);
+ else
+ __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN);
+}
+
+static bool hsr_check_carrier(struct hsr_port *master)
+{
+ struct hsr_port *port;
+ bool has_carrier;
+
+ has_carrier = false;
+
+ rcu_read_lock();
+ hsr_for_each_port(master->hsr, port)
+ if ((port->type != HSR_PT_MASTER) && is_slave_up(port->dev)) {
+ has_carrier = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (has_carrier)
+ netif_carrier_on(master->dev);
+ else
+ netif_carrier_off(master->dev);
+
+ return has_carrier;
+}
+
+
+static void hsr_check_announce(struct net_device *hsr_dev,
+ unsigned char old_operstate)
+{
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(hsr_dev);
+
+ if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) {
+ /* Went up */
+ hsr->announce_count = 0;
+ hsr->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ add_timer(&hsr->announce_timer);
+ }
+
+ if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP))
+ /* Went down */
+ del_timer(&hsr->announce_timer);
+}
+
+void hsr_check_carrier_and_operstate(struct hsr_priv *hsr)
+{
+ struct hsr_port *master;
+ unsigned char old_operstate;
+ bool has_carrier;
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ /* netif_stacked_transfer_operstate() cannot be used here since
+ * it doesn't set IF_OPER_LOWERLAYERDOWN (?)
+ */
+ old_operstate = master->dev->operstate;
+ has_carrier = hsr_check_carrier(master);
+ hsr_set_operstate(master, has_carrier);
+ hsr_check_announce(master->dev, old_operstate);
+}
+
+int hsr_get_max_mtu(struct hsr_priv *hsr)
+{
+ unsigned int mtu_max;
+ struct hsr_port *port;
+
+ mtu_max = ETH_DATA_LEN;
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port)
+ if (port->type != HSR_PT_MASTER)
+ mtu_max = min(port->dev->mtu, mtu_max);
+ rcu_read_unlock();
+
+ if (mtu_max < HSR_HLEN)
+ return 0;
+ return mtu_max - HSR_HLEN;
+}
+
+
+static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *master;
+
+ hsr = netdev_priv(dev);
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+
+ if (new_mtu > hsr_get_max_mtu(hsr)) {
+ netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
+ HSR_HLEN);
+ return -EINVAL;
+ }
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+static int hsr_dev_open(struct net_device *dev)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ char designation;
+
+ hsr = netdev_priv(dev);
+ designation = '\0';
+
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ designation = 'A';
+ break;
+ case HSR_PT_SLAVE_B:
+ designation = 'B';
+ break;
+ default:
+ designation = '?';
+ }
+ if (!is_slave_up(port->dev))
+ netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n",
+ designation, port->dev->name);
+ }
+ rcu_read_unlock();
+
+ if (designation == '\0')
+ netdev_warn(dev, "No slave devices configured\n");
+
+ return 0;
+}
+
+
+static int hsr_dev_close(struct net_device *dev)
+{
+ /* Nothing to do here. */
+ return 0;
+}
+
+
+static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
+ netdev_features_t features)
+{
+ netdev_features_t mask;
+ struct hsr_port *port;
+
+ mask = features;
+
+ /* Mask out all features that, if supported by one device, should be
+ * enabled for all devices (see NETIF_F_ONE_FOR_ALL).
+ *
+ * Anything that's off in mask will not be enabled - so only things
+ * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL,
+ * may become enabled.
+ */
+ features &= ~NETIF_F_ONE_FOR_ALL;
+ hsr_for_each_port(hsr, port)
+ features = netdev_increment_features(features,
+ port->dev->features,
+ mask);
+
+ return features;
+}
+
+static netdev_features_t hsr_fix_features(struct net_device *dev,
+ netdev_features_t features)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+
+ return hsr_features_recompute(hsr, features);
+}
+
+
+static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
+ struct hsr_port *master;
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ skb->dev = master->dev;
+ hsr_forward_skb(skb, master);
+
+ return NETDEV_TX_OK;
+}
+
+
+static const struct header_ops hsr_header_ops = {
+ .create = eth_header,
+ .parse = eth_header_parse,
+};
+
+
+/* HSR:2010 supervision frames should be padded so that the whole frame,
+ * including headers and FCS, is 64 bytes (without VLAN).
+ */
+static int hsr_pad(int size)
+{
+ const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN;
+
+ if (size >= min_size)
+ return size;
+ return min_size;
+}
+
+static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
+{
+ struct sk_buff *skb;
+ int hlen, tlen;
+ struct hsr_sup_tag *hsr_stag;
+ struct hsr_sup_payload *hsr_sp;
+ unsigned long irqflags;
+
+ hlen = LL_RESERVED_SPACE(master->dev);
+ tlen = master->dev->needed_tailroom;
+ skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen,
+ GFP_ATOMIC);
+
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, hlen);
+
+ skb->dev = master->dev;
+ skb->protocol = htons(ETH_P_PRP);
+ skb->priority = TC_PRIO_CONTROL;
+
+ if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ master->hsr->sup_multicast_addr,
+ skb->dev->dev_addr, skb->len) <= 0)
+ goto out;
+ skb_reset_mac_header(skb);
+
+ hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag));
+
+ set_hsr_stag_path(hsr_stag, 0xf);
+ set_hsr_stag_HSR_Ver(hsr_stag, 0);
+
+ spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags);
+ hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
+ master->hsr->sequence_nr++;
+ spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags);
+
+ hsr_stag->HSR_TLV_Type = type;
+ hsr_stag->HSR_TLV_Length = 12;
+
+ /* Payload: MacAddressA */
+ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp));
+ ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr);
+
+ hsr_forward_skb(skb, master);
+ return;
+
+out:
+ WARN_ON_ONCE("HSR: Could not send supervision frame\n");
+ kfree_skb(skb);
+}
+
+
+/* Announce (supervision frame) timer function
+ */
+static void hsr_announce(unsigned long data)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *master;
+
+ hsr = (struct hsr_priv *) data;
+
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+
+ if (hsr->announce_count < 3) {
+ send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE);
+ hsr->announce_count++;
+ } else {
+ send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK);
+ }
+
+ if (hsr->announce_count < 3)
+ hsr->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ else
+ hsr->announce_timer.expires = jiffies +
+ msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+
+ if (is_admin_up(master->dev))
+ add_timer(&hsr->announce_timer);
+
+ rcu_read_unlock();
+}
+
+
+/* According to comments in the declaration of struct net_device, this function
+ * is "Called from unregister, can be used to call free_netdev". Ok then...
+ */
+static void hsr_dev_destroy(struct net_device *hsr_dev)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+
+ hsr = netdev_priv(hsr_dev);
+
+ rtnl_lock();
+ hsr_for_each_port(hsr, port)
+ hsr_del_port(port);
+ rtnl_unlock();
+
+ del_timer_sync(&hsr->prune_timer);
+ del_timer_sync(&hsr->announce_timer);
+
+ synchronize_rcu();
+ free_netdev(hsr_dev);
+}
+
+static const struct net_device_ops hsr_device_ops = {
+ .ndo_change_mtu = hsr_dev_change_mtu,
+ .ndo_open = hsr_dev_open,
+ .ndo_stop = hsr_dev_close,
+ .ndo_start_xmit = hsr_dev_xmit,
+ .ndo_fix_features = hsr_fix_features,
+};
+
+static struct device_type hsr_type = {
+ .name = "hsr",
+};
+
+void hsr_dev_setup(struct net_device *dev)
+{
+ random_ether_addr(dev->dev_addr);
+
+ ether_setup(dev);
+ dev->header_ops = &hsr_header_ops;
+ dev->netdev_ops = &hsr_device_ops;
+ SET_NETDEV_DEVTYPE(dev, &hsr_type);
+ dev->tx_queue_len = 0;
+
+ dev->destructor = hsr_dev_destroy;
+
+ dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
+ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
+ NETIF_F_HW_VLAN_CTAG_TX;
+
+ dev->features = dev->hw_features;
+
+ /* Prevent recursive tx locking */
+ dev->features |= NETIF_F_LLTX;
+ /* VLAN on top of HSR needs testing and probably some work on
+ * hsr_header_create() etc.
+ */
+ dev->features |= NETIF_F_VLAN_CHALLENGED;
+ /* Not sure about this. Taken from bridge code. netdev_features.h says
+ * it means "Does not change network namespaces".
+ */
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+
+/* Return true if dev is a HSR master; return false otherwise.
+ */
+inline bool is_hsr_master(struct net_device *dev)
+{
+ return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
+}
+
+/* Default multicast address for HSR Supervision frames */
+static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
+ 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
+};
+
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ unsigned char multicast_spec)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ int res;
+
+ hsr = netdev_priv(hsr_dev);
+ INIT_LIST_HEAD(&hsr->ports);
+ INIT_LIST_HEAD(&hsr->node_db);
+ INIT_LIST_HEAD(&hsr->self_node_db);
+
+ ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr);
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr,
+ slave[1]->dev_addr);
+ if (res < 0)
+ return res;
+
+ spin_lock_init(&hsr->seqnr_lock);
+ /* Overflow soon to find bugs easier: */
+ hsr->sequence_nr = HSR_SEQNR_START;
+
+ init_timer(&hsr->announce_timer);
+ hsr->announce_timer.function = hsr_announce;
+ hsr->announce_timer.data = (unsigned long) hsr;
+
+ init_timer(&hsr->prune_timer);
+ hsr->prune_timer.function = hsr_prune_nodes;
+ hsr->prune_timer.data = (unsigned long) hsr;
+
+ ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
+ hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+
+ /* FIXME: should I modify the value of these?
+ *
+ * - hsr_dev->flags - i.e.
+ * IFF_MASTER/SLAVE?
+ * - hsr_dev->priv_flags - i.e.
+ * IFF_EBRIDGE?
+ * IFF_TX_SKB_SHARING?
+ * IFF_HSR_MASTER/SLAVE?
+ */
+
+ /* Make sure the 1st call to netif_carrier_on() gets through */
+ netif_carrier_off(hsr_dev);
+
+ res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER);
+ if (res)
+ return res;
+
+ res = register_netdevice(hsr_dev);
+ if (res)
+ goto fail;
+
+ res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A);
+ if (res)
+ goto fail;
+ res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B);
+ if (res)
+ goto fail;
+
+ hsr->prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD);
+ add_timer(&hsr->prune_timer);
+
+ return 0;
+
+fail:
+ hsr_for_each_port(hsr, port)
+ hsr_del_port(port);
+
+ return res;
+}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
new file mode 100644
index 000000000..108a5d59d
--- /dev/null
+++ b/net/hsr/hsr_device.h
@@ -0,0 +1,25 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_DEVICE_H
+#define __HSR_DEVICE_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_dev_setup(struct net_device *dev);
+int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
+ unsigned char multicast_spec);
+void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
+bool is_hsr_master(struct net_device *dev);
+int hsr_get_max_mtu(struct hsr_priv *hsr);
+
+#endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
new file mode 100644
index 000000000..7871ed6d3
--- /dev/null
+++ b/net/hsr/hsr_forward.c
@@ -0,0 +1,368 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#include "hsr_forward.h"
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+
+struct hsr_node;
+
+struct hsr_frame_info {
+ struct sk_buff *skb_std;
+ struct sk_buff *skb_hsr;
+ struct hsr_port *port_rcv;
+ struct hsr_node *node_src;
+ u16 sequence_nr;
+ bool is_supervision;
+ bool is_vlan;
+ bool is_local_dest;
+ bool is_local_exclusive;
+};
+
+
+/* The uses I can see for these HSR supervision frames are:
+ * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type =
+ * 22") to reset any sequence_nr counters belonging to that node. Useful if
+ * the other node's counter has been reset for some reason.
+ * --
+ * Or not - resetting the counter and bridging the frame would create a
+ * loop, unfortunately.
+ *
+ * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck
+ * frame is received from a particular node, we know something is wrong.
+ * We just register these (as with normal frames) and throw them away.
+ *
+ * 3) Allow different MAC addresses for the two slave interfaces, using the
+ * MacAddressA field.
+ */
+static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
+{
+ struct hsr_ethhdr_sp *hdr;
+
+ WARN_ON_ONCE(!skb_mac_header_was_set(skb));
+ hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb);
+
+ if (!ether_addr_equal(hdr->ethhdr.h_dest,
+ hsr->sup_multicast_addr))
+ return false;
+
+ if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f)
+ return false;
+ if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
+ (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+ return false;
+ if (hdr->hsr_sup.HSR_TLV_Length != 12)
+ return false;
+
+ return true;
+}
+
+
+static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in,
+ struct hsr_frame_info *frame)
+{
+ struct sk_buff *skb;
+ int copylen;
+ unsigned char *dst, *src;
+
+ skb_pull(skb_in, HSR_HLEN);
+ skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC);
+ skb_push(skb_in, HSR_HLEN);
+ if (skb == NULL)
+ return NULL;
+
+ skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start -= HSR_HLEN;
+
+ copylen = 2*ETH_ALEN;
+ if (frame->is_vlan)
+ copylen += VLAN_HLEN;
+ src = skb_mac_header(skb_in);
+ dst = skb_mac_header(skb);
+ memcpy(dst, src, copylen);
+
+ skb->protocol = eth_hdr(skb)->h_proto;
+ return skb;
+}
+
+static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ if (!frame->skb_std)
+ frame->skb_std = create_stripped_skb(frame->skb_hsr, frame);
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+}
+
+
+static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+ int lane_id;
+ int lsdu_size;
+
+ if (port->type == HSR_PT_SLAVE_A)
+ lane_id = 0;
+ else
+ lane_id = 1;
+
+ lsdu_size = skb->len - 14;
+ if (frame->is_vlan)
+ lsdu_size -= 4;
+
+ hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+
+ set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id);
+ set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
+ hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
+ hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
+ hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP);
+}
+
+static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
+ struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ int movelen;
+ unsigned char *dst, *src;
+ struct sk_buff *skb;
+
+ /* Create the new skb with enough headroom to fit the HSR tag */
+ skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+ skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->csum_start += HSR_HLEN;
+
+ movelen = ETH_HLEN;
+ if (frame->is_vlan)
+ movelen += VLAN_HLEN;
+
+ src = skb_mac_header(skb);
+ dst = skb_push(skb, HSR_HLEN);
+ memmove(dst, src, movelen);
+ skb_reset_mac_header(skb);
+
+ hsr_fill_tag(skb, frame, port);
+
+ return skb;
+}
+
+/* If the original frame was an HSR tagged frame, just clone it to be sent
+ * unchanged. Otherwise, create a private frame especially tagged for 'port'.
+ */
+static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ if (frame->skb_hsr)
+ return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+
+ if ((port->type != HSR_PT_SLAVE_A) && (port->type != HSR_PT_SLAVE_B)) {
+ WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port");
+ return NULL;
+ }
+
+ return create_tagged_skb(frame->skb_std, frame, port);
+}
+
+
+static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
+ struct hsr_node *node_src)
+{
+ bool was_multicast_frame;
+ int res;
+
+ was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST);
+ hsr_addr_subst_source(node_src, skb);
+ skb_pull(skb, ETH_HLEN);
+ res = netif_rx(skb);
+ if (res == NET_RX_DROP) {
+ dev->stats.rx_dropped++;
+ } else {
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
+ if (was_multicast_frame)
+ dev->stats.multicast++;
+ }
+}
+
+static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port,
+ struct hsr_frame_info *frame)
+{
+ if (frame->port_rcv->type == HSR_PT_MASTER) {
+ hsr_addr_subst_dest(frame->node_src, skb, port);
+
+ /* Address substitution (IEC62439-3 pp 26, 50): replace mac
+ * address of outgoing frame with that of the outgoing slave's.
+ */
+ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr);
+ }
+ return dev_queue_xmit(skb);
+}
+
+
+/* Forward the frame through all devices except:
+ * - Back through the receiving device
+ * - If it's a HSR frame: through a device where it has passed before
+ * - To the local HSR master only if the frame is directly addressed to it, or
+ * a non-supervision multicast or broadcast frame.
+ *
+ * HSR slave devices should insert a HSR tag into the frame, or forward the
+ * frame unchanged if it's already tagged. Interlink devices should strip HSR
+ * tags if they're of the non-HSR type (but only after duplicate discard). The
+ * master device always strips HSR tags.
+ */
+static void hsr_forward_do(struct hsr_frame_info *frame)
+{
+ struct hsr_port *port;
+ struct sk_buff *skb;
+
+ hsr_for_each_port(frame->port_rcv->hsr, port) {
+ /* Don't send frame back the way it came */
+ if (port == frame->port_rcv)
+ continue;
+
+ /* Don't deliver locally unless we should */
+ if ((port->type == HSR_PT_MASTER) && !frame->is_local_dest)
+ continue;
+
+ /* Deliver frames directly addressed to us to master only */
+ if ((port->type != HSR_PT_MASTER) && frame->is_local_exclusive)
+ continue;
+
+ /* Don't send frame over port where it has been sent before */
+ if (hsr_register_frame_out(port, frame->node_src,
+ frame->sequence_nr))
+ continue;
+
+ if (frame->is_supervision && (port->type == HSR_PT_MASTER)) {
+ hsr_handle_sup_frame(frame->skb_hsr,
+ frame->node_src,
+ frame->port_rcv);
+ continue;
+ }
+
+ if (port->type != HSR_PT_MASTER)
+ skb = frame_get_tagged_skb(frame, port);
+ else
+ skb = frame_get_stripped_skb(frame, port);
+ if (skb == NULL) {
+ /* FIXME: Record the dropped frame? */
+ continue;
+ }
+
+ skb->dev = port->dev;
+ if (port->type == HSR_PT_MASTER)
+ hsr_deliver_master(skb, port->dev, frame->node_src);
+ else
+ hsr_xmit(skb, port, frame);
+ }
+}
+
+
+static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ struct net_device *master_dev;
+
+ master_dev = hsr_port_get_hsr(hsr, HSR_PT_MASTER)->dev;
+
+ if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) {
+ frame->is_local_exclusive = true;
+ skb->pkt_type = PACKET_HOST;
+ } else {
+ frame->is_local_exclusive = false;
+ }
+
+ if ((skb->pkt_type == PACKET_HOST) ||
+ (skb->pkt_type == PACKET_MULTICAST) ||
+ (skb->pkt_type == PACKET_BROADCAST)) {
+ frame->is_local_dest = true;
+ } else {
+ frame->is_local_dest = false;
+ }
+}
+
+
+static int hsr_fill_frame_info(struct hsr_frame_info *frame,
+ struct sk_buff *skb, struct hsr_port *port)
+{
+ struct ethhdr *ethhdr;
+ unsigned long irqflags;
+
+ frame->is_supervision = is_supervision_frame(port->hsr, skb);
+ frame->node_src = hsr_get_node(&port->hsr->node_db, skb,
+ frame->is_supervision);
+ if (frame->node_src == NULL)
+ return -1; /* Unknown node and !is_supervision, or no mem */
+
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
+ frame->is_vlan = false;
+ if (ethhdr->h_proto == htons(ETH_P_8021Q)) {
+ frame->is_vlan = true;
+ /* FIXME: */
+ WARN_ONCE(1, "HSR: VLAN not yet supported");
+ }
+ if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ frame->skb_std = NULL;
+ frame->skb_hsr = skb;
+ frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
+ } else {
+ frame->skb_std = skb;
+ frame->skb_hsr = NULL;
+ /* Sequence nr for the master node */
+ spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags);
+ frame->sequence_nr = port->hsr->sequence_nr;
+ port->hsr->sequence_nr++;
+ spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags);
+ }
+
+ frame->port_rcv = port;
+ check_local_dest(port->hsr, skb, frame);
+
+ return 0;
+}
+
+/* Must be called holding rcu read lock (because of the port parameter) */
+void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port)
+{
+ struct hsr_frame_info frame;
+
+ if (skb_mac_header(skb) != skb->data) {
+ WARN_ONCE(1, "%s:%d: Malformed frame (port_src %s)\n",
+ __FILE__, __LINE__, port->dev->name);
+ goto out_drop;
+ }
+
+ if (hsr_fill_frame_info(&frame, skb, port) < 0)
+ goto out_drop;
+ hsr_register_frame_in(frame.node_src, port, frame.sequence_nr);
+ hsr_forward_do(&frame);
+
+ if (frame.skb_hsr != NULL)
+ kfree_skb(frame.skb_hsr);
+ if (frame.skb_std != NULL)
+ kfree_skb(frame.skb_std);
+ return;
+
+out_drop:
+ port->dev->stats.tx_dropped++;
+ kfree_skb(skb);
+}
diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h
new file mode 100644
index 000000000..5c5bc4b6b
--- /dev/null
+++ b/net/hsr/hsr_forward.h
@@ -0,0 +1,20 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_FORWARD_H
+#define __HSR_FORWARD_H
+
+#include <linux/netdevice.h>
+#include "hsr_main.h"
+
+void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port);
+
+#endif /* __HSR_FORWARD_H */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
new file mode 100644
index 000000000..bace124d1
--- /dev/null
+++ b/net/hsr/hsr_framereg.c
@@ -0,0 +1,490 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * The HSR spec says never to forward the same frame twice on the same
+ * interface. A frame is identified by its source MAC address and its HSR
+ * sequence number. This code keeps track of senders and their sequence numbers
+ * to allow filtering of duplicate frames, and to detect HSR ring errors.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+#include "hsr_netlink.h"
+
+
+struct hsr_node {
+ struct list_head mac_list;
+ unsigned char MacAddressA[ETH_ALEN];
+ unsigned char MacAddressB[ETH_ALEN];
+ /* Local slave through which AddrB frames are received from this node */
+ enum hsr_port_type AddrB_port;
+ unsigned long time_in[HSR_PT_PORTS];
+ bool time_in_stale[HSR_PT_PORTS];
+ u16 seq_out[HSR_PT_PORTS];
+ struct rcu_head rcu_head;
+};
+
+
+/* TODO: use hash lists for mac addresses (linux/jhash.h)? */
+
+
+/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
+ * false otherwise.
+ */
+static bool seq_nr_after(u16 a, u16 b)
+{
+ /* Remove inconsistency where
+ * seq_nr_after(a, b) == seq_nr_before(a, b)
+ */
+ if ((int) b - a == 32768)
+ return false;
+
+ return (((s16) (b - a)) < 0);
+}
+#define seq_nr_before(a, b) seq_nr_after((b), (a))
+#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b)))
+#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
+
+
+bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
+{
+ struct hsr_node *node;
+
+ node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node,
+ mac_list);
+ if (!node) {
+ WARN_ONCE(1, "HSR: No self node\n");
+ return false;
+ }
+
+ if (ether_addr_equal(addr, node->MacAddressA))
+ return true;
+ if (ether_addr_equal(addr, node->MacAddressB))
+ return true;
+
+ return false;
+}
+
+/* Search for mac entry. Caller must hold rcu read lock.
+ */
+static struct hsr_node *find_node_by_AddrA(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ struct hsr_node *node;
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->MacAddressA, addr))
+ return node;
+ }
+
+ return NULL;
+}
+
+
+/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize
+ * frames from self that's been looped over the HSR ring.
+ */
+int hsr_create_self_node(struct list_head *self_node_db,
+ unsigned char addr_a[ETH_ALEN],
+ unsigned char addr_b[ETH_ALEN])
+{
+ struct hsr_node *node, *oldnode;
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ ether_addr_copy(node->MacAddressA, addr_a);
+ ether_addr_copy(node->MacAddressB, addr_b);
+
+ rcu_read_lock();
+ oldnode = list_first_or_null_rcu(self_node_db,
+ struct hsr_node, mac_list);
+ if (oldnode) {
+ list_replace_rcu(&oldnode->mac_list, &node->mac_list);
+ rcu_read_unlock();
+ synchronize_rcu();
+ kfree(oldnode);
+ } else {
+ rcu_read_unlock();
+ list_add_tail_rcu(&node->mac_list, self_node_db);
+ }
+
+ return 0;
+}
+
+
+/* Allocate an hsr_node and add it to node_db. 'addr' is the node's AddressA;
+ * seq_out is used to initialize filtering of outgoing duplicate frames
+ * originating from the newly added node.
+ */
+struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
+ u16 seq_out)
+{
+ struct hsr_node *node;
+ unsigned long now;
+ int i;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC);
+ if (!node)
+ return NULL;
+
+ ether_addr_copy(node->MacAddressA, addr);
+
+ /* We are only interested in time diffs here, so use current jiffies
+ * as initialization. (0 could trigger an spurious ring error warning).
+ */
+ now = jiffies;
+ for (i = 0; i < HSR_PT_PORTS; i++)
+ node->time_in[i] = now;
+ for (i = 0; i < HSR_PT_PORTS; i++)
+ node->seq_out[i] = seq_out;
+
+ list_add_tail_rcu(&node->mac_list, node_db);
+
+ return node;
+}
+
+/* Get the hsr_node from which 'skb' was sent.
+ */
+struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+ bool is_sup)
+{
+ struct hsr_node *node;
+ struct ethhdr *ethhdr;
+ u16 seq_out;
+
+ if (!skb_mac_header_was_set(skb))
+ return NULL;
+
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
+
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->MacAddressA, ethhdr->h_source))
+ return node;
+ if (ether_addr_equal(node->MacAddressB, ethhdr->h_source))
+ return node;
+ }
+
+ if (!is_sup)
+ return NULL; /* Only supervision frame may create node entry */
+
+ if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ /* Use the existing sequence_nr from the tag as starting point
+ * for filtering duplicate frames.
+ */
+ seq_out = hsr_get_skb_sequence_nr(skb) - 1;
+ } else {
+ WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
+ seq_out = 0;
+ }
+
+ return hsr_add_node(node_db, ethhdr->h_source, seq_out);
+}
+
+/* Use the Supervision frame's info about an eventual MacAddressB for merging
+ * nodes that has previously had their MacAddressB registered as a separate
+ * node.
+ */
+void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
+ struct hsr_port *port_rcv)
+{
+ struct hsr_node *node_real;
+ struct hsr_sup_payload *hsr_sp;
+ struct list_head *node_db;
+ int i;
+
+ skb_pull(skb, sizeof(struct hsr_ethhdr_sp));
+ hsr_sp = (struct hsr_sup_payload *) skb->data;
+
+ if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA))
+ /* Not sent from MacAddressB of a PICS_SUBS capable node */
+ goto done;
+
+ /* Merge node_curr (registered on MacAddressB) into node_real */
+ node_db = &port_rcv->hsr->node_db;
+ node_real = find_node_by_AddrA(node_db, hsr_sp->MacAddressA);
+ if (!node_real)
+ /* No frame received from AddrA of this node yet */
+ node_real = hsr_add_node(node_db, hsr_sp->MacAddressA,
+ HSR_SEQNR_START - 1);
+ if (!node_real)
+ goto done; /* No mem */
+ if (node_real == node_curr)
+ /* Node has already been merged */
+ goto done;
+
+ ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source);
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ if (!node_curr->time_in_stale[i] &&
+ time_after(node_curr->time_in[i], node_real->time_in[i])) {
+ node_real->time_in[i] = node_curr->time_in[i];
+ node_real->time_in_stale[i] = node_curr->time_in_stale[i];
+ }
+ if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i]))
+ node_real->seq_out[i] = node_curr->seq_out[i];
+ }
+ node_real->AddrB_port = port_rcv->type;
+
+ list_del_rcu(&node_curr->mac_list);
+ kfree_rcu(node_curr, rcu_head);
+
+done:
+ skb_push(skb, sizeof(struct hsr_ethhdr_sp));
+}
+
+
+/* 'skb' is a frame meant for this host, that is to be passed to upper layers.
+ *
+ * If the frame was sent by a node's B interface, replace the source
+ * address with that node's "official" address (MacAddressA) so that upper
+ * layers recognize where it came from.
+ */
+void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb)
+{
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return;
+ }
+
+ memcpy(&eth_hdr(skb)->h_source, node->MacAddressA, ETH_ALEN);
+}
+
+/* 'skb' is a frame meant for another host.
+ * 'port' is the outgoing interface
+ *
+ * Substitute the target (dest) MAC address if necessary, so the it matches the
+ * recipient interface MAC address, regardless of whether that is the
+ * recipient's A or B interface.
+ * This is needed to keep the packets flowing through switches that learn on
+ * which "side" the different interfaces are.
+ */
+void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
+ struct hsr_port *port)
+{
+ struct hsr_node *node_dst;
+
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: Mac header not set\n", __func__);
+ return;
+ }
+
+ if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest))
+ return;
+
+ node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest);
+ if (!node_dst) {
+ WARN_ONCE(1, "%s: Unknown node\n", __func__);
+ return;
+ }
+ if (port->type != node_dst->AddrB_port)
+ return;
+
+ ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->MacAddressB);
+}
+
+
+void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
+ u16 sequence_nr)
+{
+ /* Don't register incoming frames without a valid sequence number. This
+ * ensures entries of restarted nodes gets pruned so that they can
+ * re-register and resume communications.
+ */
+ if (seq_nr_before(sequence_nr, node->seq_out[port->type]))
+ return;
+
+ node->time_in[port->type] = jiffies;
+ node->time_in_stale[port->type] = false;
+}
+
+/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid
+ * ethhdr->h_source address and skb->mac_header set.
+ *
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise, or
+ * negative error code on error
+ */
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
+ u16 sequence_nr)
+{
+ if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
+ return 1;
+
+ node->seq_out[port->type] = sequence_nr;
+ return 0;
+}
+
+
+static struct hsr_port *get_late_port(struct hsr_priv *hsr,
+ struct hsr_node *node)
+{
+ if (node->time_in_stale[HSR_PT_SLAVE_A])
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (node->time_in_stale[HSR_PT_SLAVE_B])
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+
+ if (time_after(node->time_in[HSR_PT_SLAVE_B],
+ node->time_in[HSR_PT_SLAVE_A] +
+ msecs_to_jiffies(MAX_SLAVE_DIFF)))
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (time_after(node->time_in[HSR_PT_SLAVE_A],
+ node->time_in[HSR_PT_SLAVE_B] +
+ msecs_to_jiffies(MAX_SLAVE_DIFF)))
+ return hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+
+ return NULL;
+}
+
+
+/* Remove stale sequence_nr records. Called by timer every
+ * HSR_LIFE_CHECK_INTERVAL (two seconds or so).
+ */
+void hsr_prune_nodes(unsigned long data)
+{
+ struct hsr_priv *hsr;
+ struct hsr_node *node;
+ struct hsr_port *port;
+ unsigned long timestamp;
+ unsigned long time_a, time_b;
+
+ hsr = (struct hsr_priv *) data;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(node, &hsr->node_db, mac_list) {
+ /* Shorthand */
+ time_a = node->time_in[HSR_PT_SLAVE_A];
+ time_b = node->time_in[HSR_PT_SLAVE_B];
+
+ /* Check for timestamps old enough to risk wrap-around */
+ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2))
+ node->time_in_stale[HSR_PT_SLAVE_A] = true;
+ if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2))
+ node->time_in_stale[HSR_PT_SLAVE_B] = true;
+
+ /* Get age of newest frame from node.
+ * At least one time_in is OK here; nodes get pruned long
+ * before both time_ins can get stale
+ */
+ timestamp = time_a;
+ if (node->time_in_stale[HSR_PT_SLAVE_A] ||
+ (!node->time_in_stale[HSR_PT_SLAVE_B] &&
+ time_after(time_b, time_a)))
+ timestamp = time_b;
+
+ /* Warn of ring error only as long as we get frames at all */
+ if (time_is_after_jiffies(timestamp +
+ msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) {
+ rcu_read_lock();
+ port = get_late_port(hsr, node);
+ if (port != NULL)
+ hsr_nl_ringerror(hsr, node->MacAddressA, port);
+ rcu_read_unlock();
+ }
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->MacAddressA);
+ list_del_rcu(&node->mac_list);
+ /* Note that we need to free this entry later: */
+ kfree_rcu(node, rcu_head);
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
+ unsigned char addr[ETH_ALEN])
+{
+ struct hsr_node *node;
+
+ if (!_pos) {
+ node = list_first_or_null_rcu(&hsr->node_db,
+ struct hsr_node, mac_list);
+ if (node)
+ ether_addr_copy(addr, node->MacAddressA);
+ return node;
+ }
+
+ node = _pos;
+ list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) {
+ ether_addr_copy(addr, node->MacAddressA);
+ return node;
+ }
+
+ return NULL;
+}
+
+
+int hsr_get_node_data(struct hsr_priv *hsr,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq)
+{
+ struct hsr_node *node;
+ struct hsr_port *port;
+ unsigned long tdiff;
+
+
+ rcu_read_lock();
+ node = find_node_by_AddrA(&hsr->node_db, addr);
+ if (!node) {
+ rcu_read_unlock();
+ return -ENOENT; /* No such entry */
+ }
+
+ ether_addr_copy(addr_b, node->MacAddressB);
+
+ tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A];
+ if (node->time_in_stale[HSR_PT_SLAVE_A])
+ *if1_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if1_age = INT_MAX;
+#endif
+ else
+ *if1_age = jiffies_to_msecs(tdiff);
+
+ tdiff = jiffies - node->time_in[HSR_PT_SLAVE_B];
+ if (node->time_in_stale[HSR_PT_SLAVE_B])
+ *if2_age = INT_MAX;
+#if HZ <= MSEC_PER_SEC
+ else if (tdiff > msecs_to_jiffies(INT_MAX))
+ *if2_age = INT_MAX;
+#endif
+ else
+ *if2_age = jiffies_to_msecs(tdiff);
+
+ /* Present sequence numbers as if they were incoming on interface */
+ *if1_seq = node->seq_out[HSR_PT_SLAVE_B];
+ *if2_seq = node->seq_out[HSR_PT_SLAVE_A];
+
+ if (node->AddrB_port != HSR_PT_NONE) {
+ port = hsr_port_get_hsr(hsr, node->AddrB_port);
+ *addr_b_ifindex = port->dev->ifindex;
+ } else {
+ *addr_b_ifindex = -1;
+ }
+
+ rcu_read_unlock();
+
+ return 0;
+}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
new file mode 100644
index 000000000..438b40f98
--- /dev/null
+++ b/net/hsr/hsr_framereg.h
@@ -0,0 +1,54 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_FRAMEREG_H
+#define __HSR_FRAMEREG_H
+
+#include "hsr_main.h"
+
+struct hsr_node;
+
+struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
+ u16 seq_out);
+struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
+ bool is_sup);
+void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
+ struct hsr_port *port);
+bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr);
+
+void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb);
+void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
+ struct hsr_port *port);
+
+void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
+ u16 sequence_nr);
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
+ u16 sequence_nr);
+
+void hsr_prune_nodes(unsigned long data);
+
+int hsr_create_self_node(struct list_head *self_node_db,
+ unsigned char addr_a[ETH_ALEN],
+ unsigned char addr_b[ETH_ALEN]);
+
+void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
+ unsigned char addr[ETH_ALEN]);
+
+int hsr_get_node_data(struct hsr_priv *hsr,
+ const unsigned char *addr,
+ unsigned char addr_b[ETH_ALEN],
+ unsigned int *addr_b_ifindex,
+ int *if1_age,
+ u16 *if1_seq,
+ int *if2_age,
+ u16 *if2_seq);
+
+#endif /* __HSR_FRAMEREG_H */
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
new file mode 100644
index 000000000..cd37d0011
--- /dev/null
+++ b/net/hsr/hsr_main.c
@@ -0,0 +1,136 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/timer.h>
+#include <linux/etherdevice.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_netlink.h"
+#include "hsr_framereg.h"
+#include "hsr_slave.h"
+
+
+static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev;
+ struct hsr_port *port, *master;
+ struct hsr_priv *hsr;
+ int mtu_max;
+ int res;
+
+ dev = netdev_notifier_info_to_dev(ptr);
+ port = hsr_port_get_rtnl(dev);
+ if (port == NULL) {
+ if (!is_hsr_master(dev))
+ return NOTIFY_DONE; /* Not an HSR device */
+ hsr = netdev_priv(dev);
+ port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (port == NULL) {
+ /* Resend of notification concerning removed device? */
+ return NOTIFY_DONE;
+ }
+ } else {
+ hsr = port->hsr;
+ }
+
+ switch (event) {
+ case NETDEV_UP: /* Administrative state DOWN */
+ case NETDEV_DOWN: /* Administrative state UP */
+ case NETDEV_CHANGE: /* Link (carrier) state changes */
+ hsr_check_carrier_and_operstate(hsr);
+ break;
+ case NETDEV_CHANGEADDR:
+ if (port->type == HSR_PT_MASTER) {
+ /* This should not happen since there's no
+ * ndo_set_mac_address() for HSR devices - i.e. not
+ * supported.
+ */
+ break;
+ }
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+
+ if (port->type == HSR_PT_SLAVE_A) {
+ ether_addr_copy(master->dev->dev_addr, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev);
+ }
+
+ /* Make sure we recognize frames from ourselves in hsr_rcv() */
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ res = hsr_create_self_node(&hsr->self_node_db,
+ master->dev->dev_addr,
+ port ?
+ port->dev->dev_addr :
+ master->dev->dev_addr);
+ if (res)
+ netdev_warn(master->dev,
+ "Could not update HSR node address.\n");
+ break;
+ case NETDEV_CHANGEMTU:
+ if (port->type == HSR_PT_MASTER)
+ break; /* Handled in ndo_change_mtu() */
+ mtu_max = hsr_get_max_mtu(port->hsr);
+ master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
+ master->dev->mtu = mtu_max;
+ break;
+ case NETDEV_UNREGISTER:
+ hsr_del_port(port);
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ /* HSR works only on Ethernet devices. Refuse slave to change
+ * its type.
+ */
+ return NOTIFY_BAD;
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
+{
+ struct hsr_port *port;
+
+ hsr_for_each_port(hsr, port)
+ if (port->type == pt)
+ return port;
+ return NULL;
+}
+
+static struct notifier_block hsr_nb = {
+ .notifier_call = hsr_netdev_notify, /* Slave event notifications */
+};
+
+
+static int __init hsr_init(void)
+{
+ int res;
+
+ BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN);
+
+ register_netdevice_notifier(&hsr_nb);
+ res = hsr_netlink_init();
+
+ return res;
+}
+
+static void __exit hsr_exit(void)
+{
+ unregister_netdevice_notifier(&hsr_nb);
+ hsr_netlink_exit();
+}
+
+module_init(hsr_init);
+module_exit(hsr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
new file mode 100644
index 000000000..5a9c69962
--- /dev/null
+++ b/net/hsr/hsr_main.h
@@ -0,0 +1,183 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_PRIVATE_H
+#define __HSR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/list.h>
+
+
+/* Time constants as specified in the HSR specification (IEC-62439-3 2010)
+ * Table 8.
+ * All values in milliseconds.
+ */
+#define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */
+#define HSR_NODE_FORGET_TIME 60000 /* ms */
+#define HSR_ANNOUNCE_INTERVAL 100 /* ms */
+
+
+/* By how much may slave1 and slave2 timestamps of latest received frame from
+ * each node differ before we notify of communication problem?
+ */
+#define MAX_SLAVE_DIFF 3000 /* ms */
+#define HSR_SEQNR_START (USHRT_MAX - 1024)
+
+
+/* How often shall we check for broken ring and remove node entries older than
+ * HSR_NODE_FORGET_TIME?
+ */
+#define PRUNE_PERIOD 3000 /* ms */
+
+
+#define HSR_TLV_ANNOUNCE 22
+#define HSR_TLV_LIFE_CHECK 23
+
+
+/* HSR Tag.
+ * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB,
+ * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest,
+ * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr,
+ * encapsulated protocol } instead.
+ *
+ * Field names as defined in the IEC:2010 standard for HSR.
+ */
+struct hsr_tag {
+ __be16 path_and_LSDU_size;
+ __be16 sequence_nr;
+ __be16 encap_proto;
+} __packed;
+
+#define HSR_HLEN 6
+
+/* The helper functions below assumes that 'path' occupies the 4 most
+ * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
+ * equivalently, the 4 most significant bits of HSR tag byte 14).
+ *
+ * This is unclear in the IEC specification; its definition of MAC addresses
+ * indicates the spec is written with the least significant bit first (to the
+ * left). This, however, would mean that the LSDU field would be split in two
+ * with the path field in-between, which seems strange. I'm guessing the MAC
+ * address definition is in error.
+ */
+static inline u16 get_hsr_tag_path(struct hsr_tag *ht)
+{
+ return ntohs(ht->path_and_LSDU_size) >> 12;
+}
+
+static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht)
+{
+ return ntohs(ht->path_and_LSDU_size) & 0x0FFF;
+}
+
+static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path)
+{
+ ht->path_and_LSDU_size = htons(
+ (ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
+}
+
+static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size)
+{
+ ht->path_and_LSDU_size = htons(
+ (ntohs(ht->path_and_LSDU_size) & 0xF000) |
+ (LSDU_size & 0x0FFF));
+}
+
+struct hsr_ethhdr {
+ struct ethhdr ethhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+
+/* HSR Supervision Frame data types.
+ * Field names as defined in the IEC:2010 standard for HSR.
+ */
+struct hsr_sup_tag {
+ __be16 path_and_HSR_Ver;
+ __be16 sequence_nr;
+ __u8 HSR_TLV_Type;
+ __u8 HSR_TLV_Length;
+} __packed;
+
+struct hsr_sup_payload {
+ unsigned char MacAddressA[ETH_ALEN];
+} __packed;
+
+static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst)
+{
+ return get_hsr_tag_path((struct hsr_tag *) hst);
+}
+
+static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst)
+{
+ return get_hsr_tag_LSDU_size((struct hsr_tag *) hst);
+}
+
+static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path)
+{
+ set_hsr_tag_path((struct hsr_tag *) hst, path);
+}
+
+static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
+{
+ set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
+}
+
+struct hsr_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+
+enum hsr_port_type {
+ HSR_PT_NONE = 0, /* Must be 0, used by framereg */
+ HSR_PT_SLAVE_A,
+ HSR_PT_SLAVE_B,
+ HSR_PT_INTERLINK,
+ HSR_PT_MASTER,
+ HSR_PT_PORTS, /* This must be the last item in the enum */
+};
+
+struct hsr_port {
+ struct list_head port_list;
+ struct net_device *dev;
+ struct hsr_priv *hsr;
+ enum hsr_port_type type;
+};
+
+struct hsr_priv {
+ struct rcu_head rcu_head;
+ struct list_head ports;
+ struct list_head node_db; /* Known HSR nodes */
+ struct list_head self_node_db; /* MACs of slaves */
+ struct timer_list announce_timer; /* Supervision frame dispatch */
+ struct timer_list prune_timer;
+ int announce_count;
+ u16 sequence_nr;
+ spinlock_t seqnr_lock; /* locking for sequence_nr */
+ unsigned char sup_multicast_addr[ETH_ALEN];
+};
+
+#define hsr_for_each_port(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list)
+
+struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);
+
+/* Caller must ensure skb is a valid HSR frame */
+static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+
+ hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+ return ntohs(hsr_ethhdr->hsr_tag.sequence_nr);
+}
+
+#endif /* __HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
new file mode 100644
index 000000000..a2c7e4c0a
--- /dev/null
+++ b/net/hsr/hsr_netlink.c
@@ -0,0 +1,493 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Routines for handling Netlink messages for HSR.
+ */
+
+#include "hsr_netlink.h"
+#include <linux/kernel.h>
+#include <net/rtnetlink.h>
+#include <net/genetlink.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_framereg.h"
+
+static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
+ [IFLA_HSR_SLAVE1] = { .type = NLA_U32 },
+ [IFLA_HSR_SLAVE2] = { .type = NLA_U32 },
+ [IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 },
+ [IFLA_HSR_SUPERVISION_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
+};
+
+
+/* Here, it seems a netdevice has already been allocated for us, and the
+ * hsr_dev_setup routine has been executed. Nice!
+ */
+static int hsr_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_device *link[2];
+ unsigned char multicast_spec;
+
+ if (!data) {
+ netdev_info(dev, "HSR: No slave devices specified\n");
+ return -EINVAL;
+ }
+ if (!data[IFLA_HSR_SLAVE1]) {
+ netdev_info(dev, "HSR: Slave1 device not specified\n");
+ return -EINVAL;
+ }
+ link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1]));
+ if (!data[IFLA_HSR_SLAVE2]) {
+ netdev_info(dev, "HSR: Slave2 device not specified\n");
+ return -EINVAL;
+ }
+ link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2]));
+
+ if (!link[0] || !link[1])
+ return -ENODEV;
+ if (link[0] == link[1])
+ return -EINVAL;
+
+ if (!data[IFLA_HSR_MULTICAST_SPEC])
+ multicast_spec = 0;
+ else
+ multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
+
+ return hsr_dev_finalize(dev, link, multicast_spec);
+}
+
+static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ int res;
+
+ hsr = netdev_priv(dev);
+
+ res = 0;
+
+ rcu_read_lock();
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ res = nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex);
+ rcu_read_unlock();
+ if (res)
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ res = nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex);
+ rcu_read_unlock();
+ if (res)
+ goto nla_put_failure;
+
+ if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
+ hsr->sup_multicast_addr) ||
+ nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops hsr_link_ops __read_mostly = {
+ .kind = "hsr",
+ .maxtype = IFLA_HSR_MAX,
+ .policy = hsr_policy,
+ .priv_size = sizeof(struct hsr_priv),
+ .setup = hsr_dev_setup,
+ .newlink = hsr_newlink,
+ .fill_info = hsr_fill_info,
+};
+
+
+
+/* attribute policy */
+/* NLA_BINARY missing in libnl; use NLA_UNSPEC in userspace instead. */
+static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
+ [HSR_A_NODE_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [HSR_A_NODE_ADDR_B] = { .type = NLA_BINARY, .len = ETH_ALEN },
+ [HSR_A_IFINDEX] = { .type = NLA_U32 },
+ [HSR_A_IF1_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF2_AGE] = { .type = NLA_U32 },
+ [HSR_A_IF1_SEQ] = { .type = NLA_U16 },
+ [HSR_A_IF2_SEQ] = { .type = NLA_U16 },
+};
+
+static struct genl_family hsr_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "HSR",
+ .version = 1,
+ .maxattr = HSR_A_MAX,
+};
+
+static const struct genl_multicast_group hsr_mcgrps[] = {
+ { .name = "hsr-network", },
+};
+
+
+
+/* This is called if for some node with MAC address addr, we only get frames
+ * over one of the slave interfaces. This would indicate an open network ring
+ * (i.e. a link has failed somewhere).
+ */
+void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
+ struct hsr_port *port)
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ struct hsr_port *master;
+ int res;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_RING_ERROR);
+ if (!msg_head)
+ goto nla_put_failure;
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb, HSR_A_IFINDEX, port->dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_warn(master->dev, "Could not send HSR ring error message\n");
+ rcu_read_unlock();
+}
+
+/* This is called when we haven't heard from the node with MAC address addr for
+ * some time (just before the node is removed from the node table/list).
+ */
+void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN])
+{
+ struct sk_buff *skb;
+ void *msg_head;
+ struct hsr_port *master;
+ int res;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ goto fail;
+
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_NODE_DOWN);
+ if (!msg_head)
+ goto nla_put_failure;
+
+
+ res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, msg_head);
+ genlmsg_multicast(&hsr_genl_family, skb, 0, 0, GFP_ATOMIC);
+
+ return;
+
+nla_put_failure:
+ kfree_skb(skb);
+
+fail:
+ rcu_read_lock();
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_warn(master->dev, "Could not send HSR node down\n");
+ rcu_read_unlock();
+}
+
+
+/* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table
+ * about the status of a specific node in the network, defined by its MAC
+ * address.
+ *
+ * Input: hsr ifindex, node mac address
+ * Output: hsr ifindex, node mac address (copied from request),
+ * age of latest frame from node over slave 1, slave 2 [ms]
+ */
+static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
+{
+ /* For receiving */
+ struct nlattr *na;
+ struct net_device *hsr_dev;
+
+ /* For sending */
+ struct sk_buff *skb_out;
+ void *msg_head;
+ struct hsr_priv *hsr;
+ struct hsr_port *port;
+ unsigned char hsr_node_addr_b[ETH_ALEN];
+ int hsr_node_if1_age;
+ u16 hsr_node_if1_seq;
+ int hsr_node_if2_age;
+ u16 hsr_node_if2_seq;
+ int addr_b_ifindex;
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+ na = info->attrs[HSR_A_NODE_ADDR];
+ if (!na)
+ goto invalid;
+
+ hsr_dev = __dev_get_by_index(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto invalid;
+ if (!is_hsr_master(hsr_dev))
+ goto invalid;
+
+
+ /* Send reply */
+
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_STATUS);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ hsr = netdev_priv(hsr_dev);
+ res = hsr_get_node_data(hsr,
+ (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]),
+ hsr_node_addr_b,
+ &addr_b_ifindex,
+ &hsr_node_if1_age,
+ &hsr_node_if1_seq,
+ &hsr_node_if2_age,
+ &hsr_node_if2_seq);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN,
+ nla_data(info->attrs[HSR_A_NODE_ADDR]));
+ if (res < 0)
+ goto nla_put_failure;
+
+ if (addr_b_ifindex > -1) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN,
+ hsr_node_addr_b);
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, addr_b_ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IF1_AGE, hsr_node_if1_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ rcu_read_lock();
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
+ port->dev->ifindex);
+ rcu_read_unlock();
+ if (res < 0)
+ goto nla_put_failure;
+
+ res = nla_put_u32(skb_out, HSR_A_IF2_AGE, hsr_node_if2_age);
+ if (res < 0)
+ goto nla_put_failure;
+ res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
+ if (res < 0)
+ goto nla_put_failure;
+ rcu_read_lock();
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
+ port->dev->ifindex);
+ rcu_read_unlock();
+ if (res < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+ return 0;
+
+nla_put_failure:
+ kfree_skb(skb_out);
+ /* Fall through */
+
+fail:
+ return res;
+}
+
+/* Get a list of MacAddressA of all nodes known to this node (including self).
+ */
+static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
+{
+ /* For receiving */
+ struct nlattr *na;
+ struct net_device *hsr_dev;
+
+ /* For sending */
+ struct sk_buff *skb_out;
+ void *msg_head;
+ struct hsr_priv *hsr;
+ void *pos;
+ unsigned char addr[ETH_ALEN];
+ int res;
+
+ if (!info)
+ goto invalid;
+
+ na = info->attrs[HSR_A_IFINDEX];
+ if (!na)
+ goto invalid;
+
+ hsr_dev = __dev_get_by_index(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ if (!hsr_dev)
+ goto invalid;
+ if (!is_hsr_master(hsr_dev))
+ goto invalid;
+
+
+ /* Send reply */
+
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb_out) {
+ res = -ENOMEM;
+ goto fail;
+ }
+
+ msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_LIST);
+ if (!msg_head) {
+ res = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+
+ hsr = netdev_priv(hsr_dev);
+
+ rcu_read_lock();
+ pos = hsr_get_next_node(hsr, NULL, addr);
+ while (pos) {
+ res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
+ if (res < 0) {
+ rcu_read_unlock();
+ goto nla_put_failure;
+ }
+ pos = hsr_get_next_node(hsr, pos, addr);
+ }
+ rcu_read_unlock();
+
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
+
+ return 0;
+
+invalid:
+ netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL);
+ return 0;
+
+nla_put_failure:
+ kfree_skb(skb_out);
+ /* Fall through */
+
+fail:
+ return res;
+}
+
+
+static const struct genl_ops hsr_ops[] = {
+ {
+ .cmd = HSR_C_GET_NODE_STATUS,
+ .flags = 0,
+ .policy = hsr_genl_policy,
+ .doit = hsr_get_node_status,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = HSR_C_GET_NODE_LIST,
+ .flags = 0,
+ .policy = hsr_genl_policy,
+ .doit = hsr_get_node_list,
+ .dumpit = NULL,
+ },
+};
+
+int __init hsr_netlink_init(void)
+{
+ int rc;
+
+ rc = rtnl_link_register(&hsr_link_ops);
+ if (rc)
+ goto fail_rtnl_link_register;
+
+ rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops,
+ hsr_mcgrps);
+ if (rc)
+ goto fail_genl_register_family;
+
+ return 0;
+
+fail_genl_register_family:
+ rtnl_link_unregister(&hsr_link_ops);
+fail_rtnl_link_register:
+
+ return rc;
+}
+
+void __exit hsr_netlink_exit(void)
+{
+ genl_unregister_family(&hsr_genl_family);
+ rtnl_link_unregister(&hsr_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("hsr");
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h
new file mode 100644
index 000000000..3f6b95b5b
--- /dev/null
+++ b/net/hsr/hsr_netlink.h
@@ -0,0 +1,31 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_NETLINK_H
+#define __HSR_NETLINK_H
+
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <uapi/linux/hsr_netlink.h>
+
+struct hsr_priv;
+struct hsr_port;
+
+int __init hsr_netlink_init(void);
+void __exit hsr_netlink_exit(void);
+
+void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
+ struct hsr_port *port);
+void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]);
+void hsr_nl_framedrop(int dropcount, int dev_idx);
+void hsr_nl_linkdown(int dev_idx);
+
+#endif /* __HSR_NETLINK_H */
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
new file mode 100644
index 000000000..7d37366cc
--- /dev/null
+++ b/net/hsr/hsr_slave.c
@@ -0,0 +1,200 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#include "hsr_slave.h"
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include "hsr_main.h"
+#include "hsr_device.h"
+#include "hsr_forward.h"
+#include "hsr_framereg.h"
+
+
+static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct hsr_port *port;
+
+ if (!skb_mac_header_was_set(skb)) {
+ WARN_ONCE(1, "%s: skb invalid", __func__);
+ return RX_HANDLER_PASS;
+ }
+
+ rcu_read_lock(); /* hsr->node_db, hsr->ports */
+ port = hsr_port_get_rcu(skb->dev);
+
+ if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
+ /* Directly kill frames sent by ourselves */
+ kfree_skb(skb);
+ goto finish_consume;
+ }
+
+ if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP))
+ goto finish_pass;
+
+ skb_push(skb, ETH_HLEN);
+
+ hsr_forward_skb(skb, port);
+
+finish_consume:
+ rcu_read_unlock(); /* hsr->node_db, hsr->ports */
+ return RX_HANDLER_CONSUMED;
+
+finish_pass:
+ rcu_read_unlock(); /* hsr->node_db, hsr->ports */
+ return RX_HANDLER_PASS;
+}
+
+bool hsr_port_exists(const struct net_device *dev)
+{
+ return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame;
+}
+
+
+static int hsr_check_dev_ok(struct net_device *dev)
+{
+ /* Don't allow HSR on non-ethernet like devices */
+ if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) ||
+ (dev->addr_len != ETH_ALEN)) {
+ netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n");
+ return -EINVAL;
+ }
+
+ /* Don't allow enslaving hsr devices */
+ if (is_hsr_master(dev)) {
+ netdev_info(dev, "Cannot create trees of HSR devices.\n");
+ return -EINVAL;
+ }
+
+ if (hsr_port_exists(dev)) {
+ netdev_info(dev, "This device is already a HSR slave.\n");
+ return -EINVAL;
+ }
+
+ if (dev->priv_flags & IFF_802_1Q_VLAN) {
+ netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
+ return -EINVAL;
+ }
+
+ if (dev->priv_flags & IFF_DONT_BRIDGE) {
+ netdev_info(dev, "This device does not support bridging.\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* HSR over bonded devices has not been tested, but I'm not sure it
+ * won't work...
+ */
+
+ return 0;
+}
+
+
+/* Setup device to be added to the HSR bridge. */
+static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port)
+{
+ int res;
+
+ dev_hold(dev);
+ res = dev_set_promiscuity(dev, 1);
+ if (res)
+ goto fail_promiscuity;
+
+ /* FIXME:
+ * What does net device "adjacency" mean? Should we do
+ * res = netdev_master_upper_dev_link(port->dev, port->hsr->dev); ?
+ */
+
+ res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
+ if (res)
+ goto fail_rx_handler;
+ dev_disable_lro(dev);
+
+ return 0;
+
+fail_rx_handler:
+ dev_set_promiscuity(dev, -1);
+fail_promiscuity:
+ dev_put(dev);
+
+ return res;
+}
+
+int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
+ enum hsr_port_type type)
+{
+ struct hsr_port *port, *master;
+ int res;
+
+ if (type != HSR_PT_MASTER) {
+ res = hsr_check_dev_ok(dev);
+ if (res)
+ return res;
+ }
+
+ port = hsr_port_get_hsr(hsr, type);
+ if (port != NULL)
+ return -EBUSY; /* This port already exists */
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (port == NULL)
+ return -ENOMEM;
+
+ if (type != HSR_PT_MASTER) {
+ res = hsr_portdev_setup(dev, port);
+ if (res)
+ goto fail_dev_setup;
+ }
+
+ port->hsr = hsr;
+ port->dev = dev;
+ port->type = type;
+
+ list_add_tail_rcu(&port->port_list, &hsr->ports);
+ synchronize_rcu();
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
+
+ return 0;
+
+fail_dev_setup:
+ kfree(port);
+ return res;
+}
+
+void hsr_del_port(struct hsr_port *port)
+{
+ struct hsr_priv *hsr;
+ struct hsr_port *master;
+
+ hsr = port->hsr;
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ list_del_rcu(&port->port_list);
+
+ if (port != master) {
+ if (master != NULL) {
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
+ }
+ netdev_rx_handler_unregister(port->dev);
+ dev_set_promiscuity(port->dev, -1);
+ }
+
+ /* FIXME?
+ * netdev_upper_dev_unlink(port->dev, port->hsr->dev);
+ */
+
+ synchronize_rcu();
+
+ if (port != master)
+ dev_put(port->dev);
+}
diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h
new file mode 100644
index 000000000..3ccfbf71c
--- /dev/null
+++ b/net/hsr/hsr_slave.h
@@ -0,0 +1,38 @@
+/* Copyright 2011-2014 Autronica Fire and Security AS
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Author(s):
+ * 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ */
+
+#ifndef __HSR_SLAVE_H
+#define __HSR_SLAVE_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include "hsr_main.h"
+
+int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
+ enum hsr_port_type pt);
+void hsr_del_port(struct hsr_port *port);
+bool hsr_port_exists(const struct net_device *dev);
+
+static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev)
+{
+ ASSERT_RTNL();
+ return hsr_port_exists(dev) ?
+ rtnl_dereference(dev->rx_handler_data) : NULL;
+}
+
+static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev)
+{
+ return hsr_port_exists(dev) ?
+ rcu_dereference(dev->rx_handler_data) : NULL;
+}
+
+#endif /* __HSR_SLAVE_H */
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
new file mode 100644
index 000000000..e50f69da7
--- /dev/null
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -0,0 +1,72 @@
+#ifndef __IEEE802154_6LOWPAN_I_H__
+#define __IEEE802154_6LOWPAN_I_H__
+
+#include <linux/list.h>
+
+#include <net/ieee802154_netdev.h>
+#include <net/inet_frag.h>
+
+struct lowpan_create_arg {
+ u16 tag;
+ u16 d_size;
+ const struct ieee802154_addr *src;
+ const struct ieee802154_addr *dst;
+};
+
+/* Equivalent of ipv4 struct ip
+ */
+struct lowpan_frag_queue {
+ struct inet_frag_queue q;
+
+ u16 tag;
+ u16 d_size;
+ struct ieee802154_addr saddr;
+ struct ieee802154_addr daddr;
+};
+
+static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
+{
+ switch (a->mode) {
+ case IEEE802154_ADDR_LONG:
+ return (((__force u64)a->extended_addr) >> 32) ^
+ (((__force u64)a->extended_addr) & 0xffffffff);
+ case IEEE802154_ADDR_SHORT:
+ return (__force u32)(a->short_addr);
+ default:
+ return 0;
+ }
+}
+
+struct lowpan_dev_record {
+ struct net_device *ldev;
+ struct list_head list;
+};
+
+/* private device info */
+struct lowpan_dev_info {
+ struct net_device *real_dev; /* real WPAN device ptr */
+ struct mutex dev_list_mtx; /* mutex for list ops */
+ u16 fragment_tag;
+};
+
+static inline struct
+lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
+{
+ return netdev_priv(dev);
+}
+
+extern struct list_head lowpan_devices;
+
+int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
+void lowpan_net_frag_exit(void);
+int lowpan_net_frag_init(void);
+
+void lowpan_rx_init(void);
+void lowpan_rx_exit(void);
+
+int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len);
+netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev);
+
+#endif /* __IEEE802154_6LOWPAN_I_H__ */
diff --git a/net/ieee802154/6lowpan/Kconfig b/net/ieee802154/6lowpan/Kconfig
new file mode 100644
index 000000000..d24f985b0
--- /dev/null
+++ b/net/ieee802154/6lowpan/Kconfig
@@ -0,0 +1,5 @@
+config IEEE802154_6LOWPAN
+ tristate "6lowpan support over IEEE 802.15.4"
+ depends on 6LOWPAN
+ ---help---
+ IPv6 compression over IEEE 802.15.4.
diff --git a/net/ieee802154/6lowpan/Makefile b/net/ieee802154/6lowpan/Makefile
new file mode 100644
index 000000000..6bfb270a8
--- /dev/null
+++ b/net/ieee802154/6lowpan/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_IEEE802154_6LOWPAN) += ieee802154_6lowpan.o
+
+ieee802154_6lowpan-y := core.o rx.o reassembly.o tx.o
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
new file mode 100644
index 000000000..0ae5822ef
--- /dev/null
+++ b/net/ieee802154/6lowpan/core.c
@@ -0,0 +1,306 @@
+/* Copyright 2011, Siemens AG
+ * written by Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+/* Based on patches from Jon Smirl <jonsmirl@gmail.com>
+ * Copyright (c) 2011 Jon Smirl <jonsmirl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Jon's code is based on 6lowpan implementation for Contiki which is:
+ * Copyright (c) 2008, Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Institute nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ieee802154.h>
+
+#include <net/ipv6.h>
+
+#include "6lowpan_i.h"
+
+LIST_HEAD(lowpan_devices);
+static int lowpan_open_count;
+
+static __le16 lowpan_get_pan_id(const struct net_device *dev)
+{
+ struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
+
+ return ieee802154_mlme_ops(real_dev)->get_pan_id(real_dev);
+}
+
+static __le16 lowpan_get_short_addr(const struct net_device *dev)
+{
+ struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
+
+ return ieee802154_mlme_ops(real_dev)->get_short_addr(real_dev);
+}
+
+static u8 lowpan_get_dsn(const struct net_device *dev)
+{
+ struct net_device *real_dev = lowpan_dev_info(dev)->real_dev;
+
+ return ieee802154_mlme_ops(real_dev)->get_dsn(real_dev);
+}
+
+static struct header_ops lowpan_header_ops = {
+ .create = lowpan_header_create,
+};
+
+static struct lock_class_key lowpan_tx_busylock;
+static struct lock_class_key lowpan_netdev_xmit_lock_key;
+
+static void lowpan_set_lockdep_class_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock,
+ &lowpan_netdev_xmit_lock_key);
+}
+
+static int lowpan_dev_init(struct net_device *dev)
+{
+ netdev_for_each_tx_queue(dev, lowpan_set_lockdep_class_one, NULL);
+ dev->qdisc_tx_busylock = &lowpan_tx_busylock;
+ return 0;
+}
+
+static const struct net_device_ops lowpan_netdev_ops = {
+ .ndo_init = lowpan_dev_init,
+ .ndo_start_xmit = lowpan_xmit,
+};
+
+static struct ieee802154_mlme_ops lowpan_mlme = {
+ .get_pan_id = lowpan_get_pan_id,
+ .get_short_addr = lowpan_get_short_addr,
+ .get_dsn = lowpan_get_dsn,
+};
+
+static void lowpan_setup(struct net_device *dev)
+{
+ dev->addr_len = IEEE802154_ADDR_LEN;
+ memset(dev->broadcast, 0xff, IEEE802154_ADDR_LEN);
+ dev->type = ARPHRD_6LOWPAN;
+ /* Frame Control + Sequence Number + Address fields + Security Header */
+ dev->hard_header_len = 2 + 1 + 20 + 14;
+ dev->needed_tailroom = 2; /* FCS */
+ dev->mtu = IPV6_MIN_MTU;
+ dev->tx_queue_len = 0;
+ dev->flags = IFF_BROADCAST | IFF_MULTICAST;
+ dev->watchdog_timeo = 0;
+
+ dev->netdev_ops = &lowpan_netdev_ops;
+ dev->header_ops = &lowpan_header_ops;
+ dev->ml_priv = &lowpan_mlme;
+ dev->destructor = free_netdev;
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != IEEE802154_ADDR_LEN)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int lowpan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_device *real_dev;
+ struct lowpan_dev_record *entry;
+ int ret;
+
+ ASSERT_RTNL();
+
+ pr_debug("adding new link\n");
+
+ if (!tb[IFLA_LINK] ||
+ !net_eq(dev_net(dev), &init_net))
+ return -EINVAL;
+ /* find and hold real wpan device */
+ real_dev = dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK]));
+ if (!real_dev)
+ return -ENODEV;
+ if (real_dev->type != ARPHRD_IEEE802154) {
+ dev_put(real_dev);
+ return -EINVAL;
+ }
+
+ lowpan_dev_info(dev)->real_dev = real_dev;
+ mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ dev_put(real_dev);
+ lowpan_dev_info(dev)->real_dev = NULL;
+ return -ENOMEM;
+ }
+
+ entry->ldev = dev;
+
+ /* Set the lowpan hardware address to the wpan hardware address. */
+ memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN);
+
+ mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
+ INIT_LIST_HEAD(&entry->list);
+ list_add_tail(&entry->list, &lowpan_devices);
+ mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
+
+ ret = register_netdevice(dev);
+ if (ret >= 0) {
+ if (!lowpan_open_count)
+ lowpan_rx_init();
+ lowpan_open_count++;
+ }
+
+ return ret;
+}
+
+static void lowpan_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev);
+ struct net_device *real_dev = lowpan_dev->real_dev;
+ struct lowpan_dev_record *entry, *tmp;
+
+ ASSERT_RTNL();
+
+ lowpan_open_count--;
+ if (!lowpan_open_count)
+ lowpan_rx_exit();
+
+ mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx);
+ list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
+ if (entry->ldev == dev) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ }
+ mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx);
+
+ mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx);
+
+ unregister_netdevice_queue(dev, head);
+
+ dev_put(real_dev);
+}
+
+static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
+ .kind = "lowpan",
+ .priv_size = sizeof(struct lowpan_dev_info),
+ .setup = lowpan_setup,
+ .newlink = lowpan_newlink,
+ .dellink = lowpan_dellink,
+ .validate = lowpan_validate,
+};
+
+static inline int __init lowpan_netlink_init(void)
+{
+ return rtnl_link_register(&lowpan_link_ops);
+}
+
+static inline void lowpan_netlink_fini(void)
+{
+ rtnl_link_unregister(&lowpan_link_ops);
+}
+
+static int lowpan_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ LIST_HEAD(del_list);
+ struct lowpan_dev_record *entry, *tmp;
+
+ if (dev->type != ARPHRD_IEEE802154)
+ goto out;
+
+ if (event == NETDEV_UNREGISTER) {
+ list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) {
+ if (lowpan_dev_info(entry->ldev)->real_dev == dev)
+ lowpan_dellink(entry->ldev, &del_list);
+ }
+
+ unregister_netdevice_many(&del_list);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lowpan_dev_notifier = {
+ .notifier_call = lowpan_device_event,
+};
+
+static int __init lowpan_init_module(void)
+{
+ int err = 0;
+
+ err = lowpan_net_frag_init();
+ if (err < 0)
+ goto out;
+
+ err = lowpan_netlink_init();
+ if (err < 0)
+ goto out_frag;
+
+ err = register_netdevice_notifier(&lowpan_dev_notifier);
+ if (err < 0)
+ goto out_pack;
+
+ return 0;
+
+out_pack:
+ lowpan_netlink_fini();
+out_frag:
+ lowpan_net_frag_exit();
+out:
+ return err;
+}
+
+static void __exit lowpan_cleanup_module(void)
+{
+ lowpan_netlink_fini();
+
+ lowpan_net_frag_exit();
+
+ unregister_netdevice_notifier(&lowpan_dev_notifier);
+}
+
+module_init(lowpan_init_module);
+module_exit(lowpan_cleanup_module);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("lowpan");
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
new file mode 100644
index 000000000..f46e4d130
--- /dev/null
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -0,0 +1,585 @@
+/* 6LoWPAN fragment reassembly
+ *
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/ipv6/reassembly.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "6LoWPAN: " fmt
+
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/ieee802154_netdev.h>
+#include <net/6lowpan.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include "6lowpan_i.h"
+
+static const char lowpan_frags_cache_name[] = "lowpan-frags";
+
+struct lowpan_frag_info {
+ u16 d_tag;
+ u16 d_size;
+ u8 d_offset;
+};
+
+static struct lowpan_frag_info *lowpan_cb(struct sk_buff *skb)
+{
+ return (struct lowpan_frag_info *)skb->cb;
+}
+
+static struct inet_frags lowpan_frags;
+
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
+ struct sk_buff *prev, struct net_device *dev);
+
+static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
+ const struct ieee802154_addr *saddr,
+ const struct ieee802154_addr *daddr)
+{
+ net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
+ return jhash_3words(ieee802154_addr_hash(saddr),
+ ieee802154_addr_hash(daddr),
+ (__force u32)(tag + (d_size << 16)),
+ lowpan_frags.rnd);
+}
+
+static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
+{
+ const struct lowpan_frag_queue *fq;
+
+ fq = container_of(q, struct lowpan_frag_queue, q);
+ return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
+}
+
+static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
+{
+ const struct lowpan_frag_queue *fq;
+ const struct lowpan_create_arg *arg = a;
+
+ fq = container_of(q, struct lowpan_frag_queue, q);
+ return fq->tag == arg->tag && fq->d_size == arg->d_size &&
+ ieee802154_addr_equal(&fq->saddr, arg->src) &&
+ ieee802154_addr_equal(&fq->daddr, arg->dst);
+}
+
+static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
+{
+ const struct lowpan_create_arg *arg = a;
+ struct lowpan_frag_queue *fq;
+
+ fq = container_of(q, struct lowpan_frag_queue, q);
+
+ fq->tag = arg->tag;
+ fq->d_size = arg->d_size;
+ fq->saddr = *arg->src;
+ fq->daddr = *arg->dst;
+}
+
+static void lowpan_frag_expire(unsigned long data)
+{
+ struct frag_queue *fq;
+ struct net *net;
+
+ fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+ net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags);
+
+ spin_lock(&fq->q.lock);
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto out;
+
+ inet_frag_kill(&fq->q, &lowpan_frags);
+out:
+ spin_unlock(&fq->q.lock);
+ inet_frag_put(&fq->q, &lowpan_frags);
+}
+
+static inline struct lowpan_frag_queue *
+fq_find(struct net *net, const struct lowpan_frag_info *frag_info,
+ const struct ieee802154_addr *src,
+ const struct ieee802154_addr *dst)
+{
+ struct inet_frag_queue *q;
+ struct lowpan_create_arg arg;
+ unsigned int hash;
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ arg.tag = frag_info->d_tag;
+ arg.d_size = frag_info->d_size;
+ arg.src = src;
+ arg.dst = dst;
+
+ hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst);
+
+ q = inet_frag_find(&ieee802154_lowpan->frags,
+ &lowpan_frags, &arg, hash);
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
+ return container_of(q, struct lowpan_frag_queue, q);
+}
+
+static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
+ struct sk_buff *skb, const u8 frag_type)
+{
+ struct sk_buff *prev, *next;
+ struct net_device *dev;
+ int end, offset;
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto err;
+
+ offset = lowpan_cb(skb)->d_offset << 3;
+ end = lowpan_cb(skb)->d_size;
+
+ /* Is this the final fragment? */
+ if (offset + skb->len == end) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
+ goto err;
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
+ } else {
+ if (end > fq->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->q.flags & INET_FRAG_LAST_IN)
+ goto err;
+ fq->q.len = end;
+ }
+ }
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = fq->q.fragments_tail;
+ if (!prev || lowpan_cb(prev)->d_offset < lowpan_cb(skb)->d_offset) {
+ next = NULL;
+ goto found;
+ }
+ prev = NULL;
+ for (next = fq->q.fragments; next != NULL; next = next->next) {
+ if (lowpan_cb(next)->d_offset >= lowpan_cb(skb)->d_offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+found:
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (!next)
+ fq->q.fragments_tail = skb;
+ if (prev)
+ prev->next = skb;
+ else
+ fq->q.fragments = skb;
+
+ dev = skb->dev;
+ if (dev)
+ skb->dev = NULL;
+
+ fq->q.stamp = skb->tstamp;
+ if (frag_type == LOWPAN_DISPATCH_FRAG1) {
+ /* Calculate uncomp. 6lowpan header to estimate full size */
+ fq->q.meat += lowpan_uncompress_size(skb, NULL);
+ fq->q.flags |= INET_FRAG_FIRST_IN;
+ } else {
+ fq->q.meat += skb->len;
+ }
+ add_frag_mem_limit(&fq->q, skb->truesize);
+
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ int res;
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ res = lowpan_frag_reasm(fq, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return res;
+ }
+
+ return -1;
+err:
+ kfree_skb(skb);
+ return -1;
+}
+
+/* Check if this packet is complete.
+ * Returns NULL on failure by any reason, and pointer
+ * to current nexthdr field in reassembled frame.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
+ */
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
+ struct net_device *dev)
+{
+ struct sk_buff *fp, *head = fq->q.fragments;
+ int sum_truesize;
+
+ inet_frag_kill(&fq->q, &lowpan_frags);
+
+ /* Make the one we just received the head. */
+ if (prev) {
+ head = prev->next;
+ fp = skb_clone(head, GFP_ATOMIC);
+
+ if (!fp)
+ goto out_oom;
+
+ fp->next = head->next;
+ if (!fp->next)
+ fq->q.fragments_tail = fp;
+ prev->next = fp;
+
+ skb_morph(head, fq->q.fragments);
+ head->next = fq->q.fragments->next;
+
+ consume_skb(fq->q.fragments);
+ fq->q.fragments = head;
+ }
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_oom;
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments.
+ */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_oom;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = head->data_len - plen;
+ clone->data_len = clone->len;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ add_frag_mem_limit(&fq->q, clone->truesize);
+ }
+
+ WARN_ON(head == NULL);
+
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
+ }
+ sub_frag_mem_limit(&fq->q, sum_truesize);
+
+ head->next = NULL;
+ head->dev = dev;
+ head->tstamp = fq->q.stamp;
+
+ fq->q.fragments = NULL;
+ fq->q.fragments_tail = NULL;
+
+ return 1;
+out_oom:
+ net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n");
+ return -1;
+}
+
+static int lowpan_get_frag_info(struct sk_buff *skb, const u8 frag_type,
+ struct lowpan_frag_info *frag_info)
+{
+ bool fail;
+ u8 pattern = 0, low = 0;
+ __be16 d_tag = 0;
+
+ fail = lowpan_fetch_skb(skb, &pattern, 1);
+ fail |= lowpan_fetch_skb(skb, &low, 1);
+ frag_info->d_size = (pattern & 7) << 8 | low;
+ fail |= lowpan_fetch_skb(skb, &d_tag, 2);
+ frag_info->d_tag = ntohs(d_tag);
+
+ if (frag_type == LOWPAN_DISPATCH_FRAGN) {
+ fail |= lowpan_fetch_skb(skb, &frag_info->d_offset, 1);
+ } else {
+ skb_reset_network_header(skb);
+ frag_info->d_offset = 0;
+ }
+
+ if (unlikely(fail))
+ return -EIO;
+
+ return 0;
+}
+
+int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type)
+{
+ struct lowpan_frag_queue *fq;
+ struct net *net = dev_net(skb->dev);
+ struct lowpan_frag_info *frag_info = lowpan_cb(skb);
+ struct ieee802154_addr source, dest;
+ int err;
+
+ source = mac_cb(skb)->source;
+ dest = mac_cb(skb)->dest;
+
+ err = lowpan_get_frag_info(skb, frag_type, frag_info);
+ if (err < 0)
+ goto err;
+
+ if (frag_info->d_size > IPV6_MIN_MTU) {
+ net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n");
+ goto err;
+ }
+
+ fq = fq_find(net, frag_info, &source, &dest);
+ if (fq != NULL) {
+ int ret;
+
+ spin_lock(&fq->q.lock);
+ ret = lowpan_frag_queue(fq, skb, frag_type);
+ spin_unlock(&fq->q.lock);
+
+ inet_frag_put(&fq->q, &lowpan_frags);
+ return ret;
+ }
+
+err:
+ kfree_skb(skb);
+ return -1;
+}
+EXPORT_SYMBOL(lowpan_frag_rcv);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table lowpan_frags_ns_ctl_table[] = {
+ {
+ .procname = "6lowpanfrag_high_thresh",
+ .data = &init_net.ieee802154_lowpan.frags.high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
+ },
+ {
+ .procname = "6lowpanfrag_low_thresh",
+ .data = &init_net.ieee802154_lowpan.frags.low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
+ },
+ {
+ .procname = "6lowpanfrag_time",
+ .data = &init_net.ieee802154_lowpan.frags.timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+/* secret interval has been deprecated */
+static int lowpan_frags_secret_interval_unused;
+static struct ctl_table lowpan_frags_ctl_table[] = {
+ {
+ .procname = "6lowpanfrag_secret_interval",
+ .data = &lowpan_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ table = lowpan_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table),
+ GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
+
+ table[0].data = &ieee802154_lowpan->frags.high_thresh;
+ table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
+ table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh;
+ table[1].data = &ieee802154_lowpan->frags.low_thresh;
+ table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
+ table[2].data = &ieee802154_lowpan->frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+ }
+
+ hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table);
+ if (hdr == NULL)
+ goto err_reg;
+
+ ieee802154_lowpan->sysctl.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct ctl_table_header *lowpan_ctl_header;
+
+static int __init lowpan_frags_sysctl_register(void)
+{
+ lowpan_ctl_header = register_net_sysctl(&init_net,
+ "net/ieee802154/6lowpan",
+ lowpan_frags_ctl_table);
+ return lowpan_ctl_header == NULL ? -ENOMEM : 0;
+}
+
+static void lowpan_frags_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(lowpan_ctl_header);
+}
+#else
+static inline int lowpan_frags_ns_sysctl_register(struct net *net)
+{
+ return 0;
+}
+
+static inline void lowpan_frags_ns_sysctl_unregister(struct net *net)
+{
+}
+
+static inline int __init lowpan_frags_sysctl_register(void)
+{
+ return 0;
+}
+
+static inline void lowpan_frags_sysctl_unregister(void)
+{
+}
+#endif
+
+static int __net_init lowpan_frags_init_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+ ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+ ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+
+ inet_frags_init_net(&ieee802154_lowpan->frags);
+
+ return lowpan_frags_ns_sysctl_register(net);
+}
+
+static void __net_exit lowpan_frags_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ lowpan_frags_ns_sysctl_unregister(net);
+ inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+}
+
+static struct pernet_operations lowpan_frags_ops = {
+ .init = lowpan_frags_init_net,
+ .exit = lowpan_frags_exit_net,
+};
+
+int __init lowpan_net_frag_init(void)
+{
+ int ret;
+
+ ret = lowpan_frags_sysctl_register();
+ if (ret)
+ return ret;
+
+ ret = register_pernet_subsys(&lowpan_frags_ops);
+ if (ret)
+ goto err_pernet;
+
+ lowpan_frags.hashfn = lowpan_hashfn;
+ lowpan_frags.constructor = lowpan_frag_init;
+ lowpan_frags.destructor = NULL;
+ lowpan_frags.skb_free = NULL;
+ lowpan_frags.qsize = sizeof(struct frag_queue);
+ lowpan_frags.match = lowpan_frag_match;
+ lowpan_frags.frag_expire = lowpan_frag_expire;
+ lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+ ret = inet_frags_init(&lowpan_frags);
+ if (ret)
+ goto err_pernet;
+
+ return ret;
+err_pernet:
+ lowpan_frags_sysctl_unregister();
+ return ret;
+}
+
+void lowpan_net_frag_exit(void)
+{
+ inet_frags_fini(&lowpan_frags);
+ lowpan_frags_sysctl_unregister();
+ unregister_pernet_subsys(&lowpan_frags_ops);
+}
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
new file mode 100644
index 000000000..4be1d289a
--- /dev/null
+++ b/net/ieee802154/6lowpan/rx.c
@@ -0,0 +1,171 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/if_arp.h>
+
+#include <net/6lowpan.h>
+#include <net/ieee802154_netdev.h>
+
+#include "6lowpan_i.h"
+
+static int lowpan_give_skb_to_devices(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct lowpan_dev_record *entry;
+ struct sk_buff *skb_cp;
+ int stat = NET_RX_SUCCESS;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->pkt_type = PACKET_HOST;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &lowpan_devices, list)
+ if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) {
+ skb_cp = skb_copy(skb, GFP_ATOMIC);
+ if (!skb_cp) {
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NET_RX_DROP;
+ }
+
+ skb_cp->dev = entry->ldev;
+ stat = netif_rx(skb_cp);
+ if (stat == NET_RX_DROP)
+ break;
+ }
+ rcu_read_unlock();
+
+ consume_skb(skb);
+
+ return stat;
+}
+
+static int
+iphc_decompress(struct sk_buff *skb, const struct ieee802154_hdr *hdr)
+{
+ u8 iphc0, iphc1;
+ struct ieee802154_addr_sa sa, da;
+ void *sap, *dap;
+
+ raw_dump_table(__func__, "raw skb data dump", skb->data, skb->len);
+ /* at least two bytes will be used for the encoding */
+ if (skb->len < 2)
+ return -EINVAL;
+
+ if (lowpan_fetch_skb_u8(skb, &iphc0))
+ return -EINVAL;
+
+ if (lowpan_fetch_skb_u8(skb, &iphc1))
+ return -EINVAL;
+
+ ieee802154_addr_to_sa(&sa, &hdr->source);
+ ieee802154_addr_to_sa(&da, &hdr->dest);
+
+ if (sa.addr_type == IEEE802154_ADDR_SHORT)
+ sap = &sa.short_addr;
+ else
+ sap = &sa.hwaddr;
+
+ if (da.addr_type == IEEE802154_ADDR_SHORT)
+ dap = &da.short_addr;
+ else
+ dap = &da.hwaddr;
+
+ return lowpan_header_decompress(skb, skb->dev, sap, sa.addr_type,
+ IEEE802154_ADDR_LEN, dap, da.addr_type,
+ IEEE802154_ADDR_LEN, iphc0, iphc1);
+}
+
+static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct ieee802154_hdr hdr;
+ int ret;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto drop;
+
+ if (!netif_running(dev))
+ goto drop_skb;
+
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop_skb;
+
+ if (dev->type != ARPHRD_IEEE802154)
+ goto drop_skb;
+
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
+ goto drop_skb;
+
+ /* check that it's our buffer */
+ if (skb->data[0] == LOWPAN_DISPATCH_IPV6) {
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+ return lowpan_give_skb_to_devices(skb, NULL);
+ } else {
+ switch (skb->data[0] & 0xe0) {
+ case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */
+ ret = iphc_decompress(skb, &hdr);
+ if (ret < 0)
+ goto drop_skb;
+
+ return lowpan_give_skb_to_devices(skb, NULL);
+ case LOWPAN_DISPATCH_FRAG1: /* first fragment header */
+ ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1);
+ if (ret == 1) {
+ ret = iphc_decompress(skb, &hdr);
+ if (ret < 0)
+ goto drop_skb;
+
+ return lowpan_give_skb_to_devices(skb, NULL);
+ } else if (ret == -1) {
+ return NET_RX_DROP;
+ } else {
+ return NET_RX_SUCCESS;
+ }
+ case LOWPAN_DISPATCH_FRAGN: /* next fragments headers */
+ ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAGN);
+ if (ret == 1) {
+ ret = iphc_decompress(skb, &hdr);
+ if (ret < 0)
+ goto drop_skb;
+
+ return lowpan_give_skb_to_devices(skb, NULL);
+ } else if (ret == -1) {
+ return NET_RX_DROP;
+ } else {
+ return NET_RX_SUCCESS;
+ }
+ default:
+ break;
+ }
+ }
+
+drop_skb:
+ kfree_skb(skb);
+drop:
+ return NET_RX_DROP;
+}
+
+static struct packet_type lowpan_packet_type = {
+ .type = htons(ETH_P_IEEE802154),
+ .func = lowpan_rcv,
+};
+
+void lowpan_rx_init(void)
+{
+ dev_add_pack(&lowpan_packet_type);
+}
+
+void lowpan_rx_exit(void)
+{
+ dev_remove_pack(&lowpan_packet_type);
+}
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
new file mode 100644
index 000000000..2349070bd
--- /dev/null
+++ b/net/ieee802154/6lowpan/tx.c
@@ -0,0 +1,271 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <net/6lowpan.h>
+#include <net/ieee802154_netdev.h>
+
+#include "6lowpan_i.h"
+
+/* don't save pan id, it's intra pan */
+struct lowpan_addr {
+ u8 mode;
+ union {
+ /* IPv6 needs big endian here */
+ __be64 extended_addr;
+ __be16 short_addr;
+ } u;
+};
+
+struct lowpan_addr_info {
+ struct lowpan_addr daddr;
+ struct lowpan_addr saddr;
+};
+
+static inline struct
+lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb)
+{
+ WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct lowpan_addr_info));
+ return (struct lowpan_addr_info *)(skb->data -
+ sizeof(struct lowpan_addr_info));
+}
+
+int lowpan_header_create(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *_daddr,
+ const void *_saddr, unsigned int len)
+{
+ const u8 *saddr = _saddr;
+ const u8 *daddr = _daddr;
+ struct lowpan_addr_info *info;
+
+ /* TODO:
+ * if this package isn't ipv6 one, where should it be routed?
+ */
+ if (type != ETH_P_IPV6)
+ return 0;
+
+ if (!saddr)
+ saddr = dev->dev_addr;
+
+ raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8);
+ raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8);
+
+ info = lowpan_skb_priv(skb);
+
+ /* TODO: Currently we only support extended_addr */
+ info->daddr.mode = IEEE802154_ADDR_LONG;
+ memcpy(&info->daddr.u.extended_addr, daddr,
+ sizeof(info->daddr.u.extended_addr));
+ info->saddr.mode = IEEE802154_ADDR_LONG;
+ memcpy(&info->saddr.u.extended_addr, saddr,
+ sizeof(info->daddr.u.extended_addr));
+
+ return 0;
+}
+
+static struct sk_buff*
+lowpan_alloc_frag(struct sk_buff *skb, int size,
+ const struct ieee802154_hdr *master_hdr)
+{
+ struct net_device *real_dev = lowpan_dev_info(skb->dev)->real_dev;
+ struct sk_buff *frag;
+ int rc;
+
+ frag = alloc_skb(real_dev->hard_header_len +
+ real_dev->needed_tailroom + size,
+ GFP_ATOMIC);
+
+ if (likely(frag)) {
+ frag->dev = real_dev;
+ frag->priority = skb->priority;
+ skb_reserve(frag, real_dev->hard_header_len);
+ skb_reset_network_header(frag);
+ *mac_cb(frag) = *mac_cb(skb);
+
+ rc = dev_hard_header(frag, real_dev, 0, &master_hdr->dest,
+ &master_hdr->source, size);
+ if (rc < 0) {
+ kfree_skb(frag);
+ return ERR_PTR(rc);
+ }
+ } else {
+ frag = ERR_PTR(-ENOMEM);
+ }
+
+ return frag;
+}
+
+static int
+lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr,
+ u8 *frag_hdr, int frag_hdrlen,
+ int offset, int len)
+{
+ struct sk_buff *frag;
+
+ raw_dump_inline(__func__, " fragment header", frag_hdr, frag_hdrlen);
+
+ frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr);
+ if (IS_ERR(frag))
+ return -PTR_ERR(frag);
+
+ memcpy(skb_put(frag, frag_hdrlen), frag_hdr, frag_hdrlen);
+ memcpy(skb_put(frag, len), skb_network_header(skb) + offset, len);
+
+ raw_dump_table(__func__, " fragment dump", frag->data, frag->len);
+
+ return dev_queue_xmit(frag);
+}
+
+static int
+lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *dev,
+ const struct ieee802154_hdr *wpan_hdr)
+{
+ u16 dgram_size, dgram_offset;
+ __be16 frag_tag;
+ u8 frag_hdr[5];
+ int frag_cap, frag_len, payload_cap, rc;
+ int skb_unprocessed, skb_offset;
+
+ dgram_size = lowpan_uncompress_size(skb, &dgram_offset) -
+ skb->mac_len;
+ frag_tag = htons(lowpan_dev_info(dev)->fragment_tag);
+ lowpan_dev_info(dev)->fragment_tag++;
+
+ frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07);
+ frag_hdr[1] = dgram_size & 0xff;
+ memcpy(frag_hdr + 2, &frag_tag, sizeof(frag_tag));
+
+ payload_cap = ieee802154_max_payload(wpan_hdr);
+
+ frag_len = round_down(payload_cap - LOWPAN_FRAG1_HEAD_SIZE -
+ skb_network_header_len(skb), 8);
+
+ skb_offset = skb_network_header_len(skb);
+ skb_unprocessed = skb->len - skb->mac_len - skb_offset;
+
+ rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
+ LOWPAN_FRAG1_HEAD_SIZE, 0,
+ frag_len + skb_network_header_len(skb));
+ if (rc) {
+ pr_debug("%s unable to send FRAG1 packet (tag: %d)",
+ __func__, ntohs(frag_tag));
+ goto err;
+ }
+
+ frag_hdr[0] &= ~LOWPAN_DISPATCH_FRAG1;
+ frag_hdr[0] |= LOWPAN_DISPATCH_FRAGN;
+ frag_cap = round_down(payload_cap - LOWPAN_FRAGN_HEAD_SIZE, 8);
+
+ do {
+ dgram_offset += frag_len;
+ skb_offset += frag_len;
+ skb_unprocessed -= frag_len;
+ frag_len = min(frag_cap, skb_unprocessed);
+
+ frag_hdr[4] = dgram_offset >> 3;
+
+ rc = lowpan_xmit_fragment(skb, wpan_hdr, frag_hdr,
+ LOWPAN_FRAGN_HEAD_SIZE, skb_offset,
+ frag_len);
+ if (rc) {
+ pr_debug("%s unable to send a FRAGN packet. (tag: %d, offset: %d)\n",
+ __func__, ntohs(frag_tag), skb_offset);
+ goto err;
+ }
+ } while (skb_unprocessed > frag_cap);
+
+ consume_skb(skb);
+ return NET_XMIT_SUCCESS;
+
+err:
+ kfree_skb(skb);
+ return rc;
+}
+
+static int lowpan_header(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee802154_addr sa, da;
+ struct ieee802154_mac_cb *cb = mac_cb_init(skb);
+ struct lowpan_addr_info info;
+ void *daddr, *saddr;
+
+ memcpy(&info, lowpan_skb_priv(skb), sizeof(info));
+
+ /* TODO: Currently we only support extended_addr */
+ daddr = &info.daddr.u.extended_addr;
+ saddr = &info.saddr.u.extended_addr;
+
+ lowpan_header_compress(skb, dev, ETH_P_IPV6, daddr, saddr, skb->len);
+
+ cb->type = IEEE802154_FC_TYPE_DATA;
+
+ /* prepare wpan address data */
+ sa.mode = IEEE802154_ADDR_LONG;
+ sa.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+ sa.extended_addr = ieee802154_devaddr_from_raw(saddr);
+
+ /* intra-PAN communications */
+ da.pan_id = sa.pan_id;
+
+ /* if the destination address is the broadcast address, use the
+ * corresponding short address
+ */
+ if (lowpan_is_addr_broadcast((const u8 *)daddr)) {
+ da.mode = IEEE802154_ADDR_SHORT;
+ da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+ cb->ackreq = false;
+ } else {
+ da.mode = IEEE802154_ADDR_LONG;
+ da.extended_addr = ieee802154_devaddr_from_raw(daddr);
+ cb->ackreq = true;
+ }
+
+ return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
+ ETH_P_IPV6, (void *)&da, (void *)&sa, 0);
+}
+
+netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee802154_hdr wpan_hdr;
+ int max_single, ret;
+
+ pr_debug("package xmit\n");
+
+ /* We must take a copy of the skb before we modify/replace the ipv6
+ * header as the header could be used elsewhere
+ */
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_XMIT_DROP;
+
+ ret = lowpan_header(skb, dev);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ if (ieee802154_hdr_peek(skb, &wpan_hdr) < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ max_single = ieee802154_max_payload(&wpan_hdr);
+
+ if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) {
+ skb->dev = lowpan_dev_info(dev)->real_dev;
+ return dev_queue_xmit(skb);
+ } else {
+ netdev_tx_t rc;
+
+ pr_debug("frame is too big, fragmentation is needed\n");
+ rc = lowpan_xmit_fragmented(skb, dev, &wpan_hdr);
+
+ return rc < 0 ? NET_XMIT_DROP : rc;
+ }
+}
diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig
new file mode 100644
index 000000000..1370d5b00
--- /dev/null
+++ b/net/ieee802154/Kconfig
@@ -0,0 +1,25 @@
+menuconfig IEEE802154
+ tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support"
+ ---help---
+ IEEE Std 802.15.4 defines a low data rate, low power and low
+ complexity short range wireless personal area networks. It was
+ designed to organise networks of sensors, switches, etc automation
+ devices. Maximum allowed data rate is 250 kb/s and typical personal
+ operating space around 10m.
+
+ Say Y here to compile LR-WPAN support into the kernel or say M to
+ compile it as modules.
+
+if IEEE802154
+
+config IEEE802154_SOCKET
+ tristate "IEEE 802.15.4 socket interface"
+ default y
+ ---help---
+ Socket interface for IEEE 802.15.4. Contains DGRAM sockets interface
+ for 802.15.4 dataframes. Also RAW socket interface to build MAC
+ header from userspace.
+
+source "net/ieee802154/6lowpan/Kconfig"
+
+endif
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
new file mode 100644
index 000000000..4adfd4d54
--- /dev/null
+++ b/net/ieee802154/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_IEEE802154) += ieee802154.o
+obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o
+obj-y += 6lowpan/
+
+ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
+ header_ops.o sysfs.o nl802154.o trace.o
+ieee802154_socket-y := socket.o
+
+CFLAGS_trace.o := -I$(src)
+
+ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
new file mode 100644
index 000000000..2ee00e8a0
--- /dev/null
+++ b/net/ieee802154/core.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+
+#include <net/cfg802154.h>
+#include <net/rtnetlink.h>
+
+#include "ieee802154.h"
+#include "nl802154.h"
+#include "sysfs.h"
+#include "core.h"
+
+/* name for sysfs, %d is appended */
+#define PHY_NAME "phy"
+
+/* RCU-protected (and RTNL for writers) */
+LIST_HEAD(cfg802154_rdev_list);
+int cfg802154_rdev_list_generation;
+
+static int wpan_phy_match(struct device *dev, const void *data)
+{
+ return !strcmp(dev_name(dev), (const char *)data);
+}
+
+struct wpan_phy *wpan_phy_find(const char *str)
+{
+ struct device *dev;
+
+ if (WARN_ON(!str))
+ return NULL;
+
+ dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match);
+ if (!dev)
+ return NULL;
+
+ return container_of(dev, struct wpan_phy, dev);
+}
+EXPORT_SYMBOL(wpan_phy_find);
+
+struct wpan_phy_iter_data {
+ int (*fn)(struct wpan_phy *phy, void *data);
+ void *data;
+};
+
+static int wpan_phy_iter(struct device *dev, void *_data)
+{
+ struct wpan_phy_iter_data *wpid = _data;
+ struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev);
+
+ return wpid->fn(phy, wpid->data);
+}
+
+int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data),
+ void *data)
+{
+ struct wpan_phy_iter_data wpid = {
+ .fn = fn,
+ .data = data,
+ };
+
+ return class_for_each_device(&wpan_phy_class, NULL,
+ &wpid, wpan_phy_iter);
+}
+EXPORT_SYMBOL(wpan_phy_for_each);
+
+struct cfg802154_registered_device *
+cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx)
+{
+ struct cfg802154_registered_device *result = NULL, *rdev;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ if (rdev->wpan_phy_idx == wpan_phy_idx) {
+ result = rdev;
+ break;
+ }
+ }
+
+ return result;
+}
+
+struct wpan_phy *
+wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
+{
+ static atomic_t wpan_phy_counter = ATOMIC_INIT(0);
+ struct cfg802154_registered_device *rdev;
+ size_t alloc_size;
+
+ alloc_size = sizeof(*rdev) + priv_size;
+ rdev = kzalloc(alloc_size, GFP_KERNEL);
+ if (!rdev)
+ return NULL;
+
+ rdev->ops = ops;
+
+ rdev->wpan_phy_idx = atomic_inc_return(&wpan_phy_counter);
+
+ if (unlikely(rdev->wpan_phy_idx < 0)) {
+ /* ugh, wrapped! */
+ atomic_dec(&wpan_phy_counter);
+ kfree(rdev);
+ return NULL;
+ }
+
+ /* atomic_inc_return makes it start at 1, make it start at 0 */
+ rdev->wpan_phy_idx--;
+
+ mutex_init(&rdev->wpan_phy.pib_lock);
+
+ INIT_LIST_HEAD(&rdev->wpan_dev_list);
+ device_initialize(&rdev->wpan_phy.dev);
+ dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx);
+
+ rdev->wpan_phy.dev.class = &wpan_phy_class;
+ rdev->wpan_phy.dev.platform_data = rdev;
+
+ init_waitqueue_head(&rdev->dev_wait);
+
+ return &rdev->wpan_phy;
+}
+EXPORT_SYMBOL(wpan_phy_new);
+
+int wpan_phy_register(struct wpan_phy *phy)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy);
+ int ret;
+
+ rtnl_lock();
+ ret = device_add(&phy->dev);
+ if (ret) {
+ rtnl_unlock();
+ return ret;
+ }
+
+ list_add_rcu(&rdev->list, &cfg802154_rdev_list);
+ cfg802154_rdev_list_generation++;
+
+ /* TODO phy registered lock */
+ rtnl_unlock();
+
+ /* TODO nl802154 phy notify */
+
+ return 0;
+}
+EXPORT_SYMBOL(wpan_phy_register);
+
+void wpan_phy_unregister(struct wpan_phy *phy)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy);
+
+ wait_event(rdev->dev_wait, ({
+ int __count;
+ rtnl_lock();
+ __count = rdev->opencount;
+ rtnl_unlock();
+ __count == 0; }));
+
+ rtnl_lock();
+ /* TODO nl802154 phy notify */
+ /* TODO phy registered lock */
+
+ WARN_ON(!list_empty(&rdev->wpan_dev_list));
+
+ /* First remove the hardware from everywhere, this makes
+ * it impossible to find from userspace.
+ */
+ list_del_rcu(&rdev->list);
+ synchronize_rcu();
+
+ cfg802154_rdev_list_generation++;
+
+ device_del(&phy->dev);
+
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(wpan_phy_unregister);
+
+void wpan_phy_free(struct wpan_phy *phy)
+{
+ put_device(&phy->dev);
+}
+EXPORT_SYMBOL(wpan_phy_free);
+
+void cfg802154_dev_free(struct cfg802154_registered_device *rdev)
+{
+ kfree(rdev);
+}
+
+static void
+cfg802154_update_iface_num(struct cfg802154_registered_device *rdev,
+ int iftype, int num)
+{
+ ASSERT_RTNL();
+
+ rdev->num_running_ifaces += num;
+}
+
+static int cfg802154_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct cfg802154_registered_device *rdev;
+
+ if (!wpan_dev)
+ return NOTIFY_DONE;
+
+ rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy);
+
+ /* TODO WARN_ON unspec type */
+
+ switch (state) {
+ /* TODO NETDEV_DEVTYPE */
+ case NETDEV_REGISTER:
+ dev->features |= NETIF_F_NETNS_LOCAL;
+ wpan_dev->identifier = ++rdev->wpan_dev_id;
+ list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list);
+ rdev->devlist_generation++;
+
+ wpan_dev->netdev = dev;
+ break;
+ case NETDEV_DOWN:
+ cfg802154_update_iface_num(rdev, wpan_dev->iftype, -1);
+
+ rdev->opencount--;
+ wake_up(&rdev->dev_wait);
+ break;
+ case NETDEV_UP:
+ cfg802154_update_iface_num(rdev, wpan_dev->iftype, 1);
+
+ rdev->opencount++;
+ break;
+ case NETDEV_UNREGISTER:
+ /* It is possible to get NETDEV_UNREGISTER
+ * multiple times. To detect that, check
+ * that the interface is still on the list
+ * of registered interfaces, and only then
+ * remove and clean it up.
+ */
+ if (!list_empty(&wpan_dev->list)) {
+ list_del_rcu(&wpan_dev->list);
+ rdev->devlist_generation++;
+ }
+ /* synchronize (so that we won't find this netdev
+ * from other code any more) and then clear the list
+ * head so that the above code can safely check for
+ * !list_empty() to avoid double-cleanup.
+ */
+ synchronize_rcu();
+ INIT_LIST_HEAD(&wpan_dev->list);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cfg802154_netdev_notifier = {
+ .notifier_call = cfg802154_netdev_notifier_call,
+};
+
+static int __init wpan_phy_class_init(void)
+{
+ int rc;
+
+ rc = wpan_phy_sysfs_init();
+ if (rc)
+ goto err;
+
+ rc = register_netdevice_notifier(&cfg802154_netdev_notifier);
+ if (rc)
+ goto err_nl;
+
+ rc = ieee802154_nl_init();
+ if (rc)
+ goto err_notifier;
+
+ rc = nl802154_init();
+ if (rc)
+ goto err_ieee802154_nl;
+
+ return 0;
+
+err_ieee802154_nl:
+ ieee802154_nl_exit();
+
+err_notifier:
+ unregister_netdevice_notifier(&cfg802154_netdev_notifier);
+err_nl:
+ wpan_phy_sysfs_exit();
+err:
+ return rc;
+}
+subsys_initcall(wpan_phy_class_init);
+
+static void __exit wpan_phy_class_exit(void)
+{
+ nl802154_exit();
+ ieee802154_nl_exit();
+ unregister_netdevice_notifier(&cfg802154_netdev_notifier);
+ wpan_phy_sysfs_exit();
+}
+module_exit(wpan_phy_class_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
+MODULE_AUTHOR("Dmitry Eremin-Solenikov");
+
diff --git a/net/ieee802154/core.h b/net/ieee802154/core.h
new file mode 100644
index 000000000..f3e95580c
--- /dev/null
+++ b/net/ieee802154/core.h
@@ -0,0 +1,46 @@
+#ifndef __IEEE802154_CORE_H
+#define __IEEE802154_CORE_H
+
+#include <net/cfg802154.h>
+
+struct cfg802154_registered_device {
+ const struct cfg802154_ops *ops;
+ struct list_head list;
+
+ /* wpan_phy index, internal only */
+ int wpan_phy_idx;
+
+ /* also protected by devlist_mtx */
+ int opencount;
+ wait_queue_head_t dev_wait;
+
+ /* protected by RTNL only */
+ int num_running_ifaces;
+
+ /* associated wpan interfaces, protected by rtnl or RCU */
+ struct list_head wpan_dev_list;
+ int devlist_generation, wpan_dev_id;
+
+ /* must be last because of the way we do wpan_phy_priv(),
+ * and it should at least be aligned to NETDEV_ALIGN
+ */
+ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN);
+};
+
+static inline struct cfg802154_registered_device *
+wpan_phy_to_rdev(struct wpan_phy *wpan_phy)
+{
+ BUG_ON(!wpan_phy);
+ return container_of(wpan_phy, struct cfg802154_registered_device,
+ wpan_phy);
+}
+
+extern struct list_head cfg802154_rdev_list;
+extern int cfg802154_rdev_list_generation;
+
+/* free object */
+void cfg802154_dev_free(struct cfg802154_registered_device *rdev);
+struct cfg802154_registered_device *
+cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx);
+
+#endif /* __IEEE802154_CORE_H */
diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c
new file mode 100644
index 000000000..a051b6993
--- /dev/null
+++ b/net/ieee802154/header_ops.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (C) 2014 Fraunhofer ITWM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
+ */
+
+#include <linux/ieee802154.h>
+
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+
+static int
+ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr,
+ bool omit_pan)
+{
+ int pos = 0;
+
+ if (addr->mode == IEEE802154_ADDR_NONE)
+ return 0;
+
+ if (!omit_pan) {
+ memcpy(buf + pos, &addr->pan_id, 2);
+ pos += 2;
+ }
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_SHORT:
+ memcpy(buf + pos, &addr->short_addr, 2);
+ pos += 2;
+ break;
+
+ case IEEE802154_ADDR_LONG:
+ memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return pos;
+}
+
+static int
+ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr)
+{
+ int pos = 5;
+
+ memcpy(buf, hdr, 1);
+ memcpy(buf + 1, &hdr->frame_counter, 4);
+
+ switch (hdr->key_id_mode) {
+ case IEEE802154_SCF_KEY_IMPLICIT:
+ return pos;
+
+ case IEEE802154_SCF_KEY_INDEX:
+ break;
+
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ memcpy(buf + pos, &hdr->short_src, 4);
+ pos += 4;
+ break;
+
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+ }
+
+ buf[pos++] = hdr->key_id;
+
+ return pos;
+}
+
+int
+ieee802154_hdr_push(struct sk_buff *skb, const struct ieee802154_hdr *hdr)
+{
+ u8 buf[MAC802154_FRAME_HARD_HEADER_LEN];
+ int pos = 2;
+ int rc;
+ struct ieee802154_hdr_fc fc = hdr->fc;
+
+ buf[pos++] = hdr->seq;
+
+ fc.dest_addr_mode = hdr->dest.mode;
+
+ rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false);
+ if (rc < 0)
+ return -EINVAL;
+ pos += rc;
+
+ fc.source_addr_mode = hdr->source.mode;
+
+ if (hdr->source.pan_id == hdr->dest.pan_id &&
+ hdr->dest.mode != IEEE802154_ADDR_NONE)
+ fc.intra_pan = true;
+
+ rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc.intra_pan);
+ if (rc < 0)
+ return -EINVAL;
+ pos += rc;
+
+ if (fc.security_enabled) {
+ fc.version = 1;
+
+ rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec);
+ if (rc < 0)
+ return -EINVAL;
+
+ pos += rc;
+ }
+
+ memcpy(buf, &fc, 2);
+
+ memcpy(skb_push(skb, pos), buf, pos);
+
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_push);
+
+static int
+ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan,
+ struct ieee802154_addr *addr)
+{
+ int pos = 0;
+
+ addr->mode = mode;
+
+ if (mode == IEEE802154_ADDR_NONE)
+ return 0;
+
+ if (!omit_pan) {
+ memcpy(&addr->pan_id, buf + pos, 2);
+ pos += 2;
+ }
+
+ if (mode == IEEE802154_ADDR_SHORT) {
+ memcpy(&addr->short_addr, buf + pos, 2);
+ return pos + 2;
+ } else {
+ memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN);
+ return pos + IEEE802154_ADDR_LEN;
+ }
+}
+
+static int ieee802154_hdr_addr_len(int mode, bool omit_pan)
+{
+ int pan_len = omit_pan ? 0 : 2;
+
+ switch (mode) {
+ case IEEE802154_ADDR_NONE: return 0;
+ case IEEE802154_ADDR_SHORT: return 2 + pan_len;
+ case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len;
+ default: return -EINVAL;
+ }
+}
+
+static int
+ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr)
+{
+ int pos = 5;
+
+ memcpy(hdr, buf, 1);
+ memcpy(&hdr->frame_counter, buf + 1, 4);
+
+ switch (hdr->key_id_mode) {
+ case IEEE802154_SCF_KEY_IMPLICIT:
+ return pos;
+
+ case IEEE802154_SCF_KEY_INDEX:
+ break;
+
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ memcpy(&hdr->short_src, buf + pos, 4);
+ pos += 4;
+ break;
+
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN);
+ pos += IEEE802154_ADDR_LEN;
+ break;
+ }
+
+ hdr->key_id = buf[pos++];
+
+ return pos;
+}
+
+static int ieee802154_sechdr_lengths[4] = {
+ [IEEE802154_SCF_KEY_IMPLICIT] = 5,
+ [IEEE802154_SCF_KEY_INDEX] = 6,
+ [IEEE802154_SCF_KEY_SHORT_INDEX] = 10,
+ [IEEE802154_SCF_KEY_HW_INDEX] = 14,
+};
+
+static int ieee802154_hdr_sechdr_len(u8 sc)
+{
+ return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)];
+}
+
+static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr)
+{
+ int dlen, slen;
+
+ dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false);
+ slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode,
+ hdr->fc.intra_pan);
+
+ if (slen < 0 || dlen < 0)
+ return -EINVAL;
+
+ return 3 + dlen + slen + hdr->fc.security_enabled;
+}
+
+static int
+ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr)
+{
+ int pos = 0;
+
+ pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode,
+ false, &hdr->dest);
+ pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode,
+ hdr->fc.intra_pan, &hdr->source);
+
+ if (hdr->fc.intra_pan)
+ hdr->source.pan_id = hdr->dest.pan_id;
+
+ return pos;
+}
+
+int
+ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ int pos = 3, rc;
+
+ if (!pskb_may_pull(skb, 3))
+ return -EINVAL;
+
+ memcpy(hdr, skb->data, 3);
+
+ rc = ieee802154_hdr_minlen(hdr);
+ if (rc < 0 || !pskb_may_pull(skb, rc))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr);
+
+ if (hdr->fc.security_enabled) {
+ int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]);
+
+ if (!pskb_may_pull(skb, want))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec);
+ }
+
+ skb_pull(skb, pos);
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_pull);
+
+int
+ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ const u8 *buf = skb_mac_header(skb);
+ int pos = 3, rc;
+
+ if (buf + 3 > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ memcpy(hdr, buf, 3);
+
+ rc = ieee802154_hdr_minlen(hdr);
+ if (rc < 0 || buf + rc > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_addrs(buf + pos, hdr);
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs);
+
+int
+ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ const u8 *buf = skb_mac_header(skb);
+ int pos;
+
+ pos = ieee802154_hdr_peek_addrs(skb, hdr);
+ if (pos < 0)
+ return -EINVAL;
+
+ if (hdr->fc.security_enabled) {
+ u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos));
+ int want = pos + ieee802154_sechdr_lengths[key_id_mode];
+
+ if (buf + want > skb_tail_pointer(skb))
+ return -EINVAL;
+
+ pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec);
+ }
+
+ return pos;
+}
+EXPORT_SYMBOL_GPL(ieee802154_hdr_peek);
+
+int ieee802154_max_payload(const struct ieee802154_hdr *hdr)
+{
+ int hlen = ieee802154_hdr_minlen(hdr);
+
+ if (hdr->fc.security_enabled) {
+ hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1;
+ hlen += ieee802154_sechdr_authtag_len(&hdr->sec);
+ }
+
+ return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ieee802154_max_payload);
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
new file mode 100644
index 000000000..a5d7515b7
--- /dev/null
+++ b/net/ieee802154/ieee802154.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef IEEE_802154_LOCAL_H
+#define IEEE_802154_LOCAL_H
+
+int __init ieee802154_nl_init(void);
+void ieee802154_nl_exit(void);
+
+#define IEEE802154_OP(_cmd, _func) \
+ { \
+ .cmd = _cmd, \
+ .policy = ieee802154_policy, \
+ .doit = _func, \
+ .dumpit = NULL, \
+ .flags = GENL_ADMIN_PERM, \
+ }
+
+#define IEEE802154_DUMP(_cmd, _func, _dump) \
+ { \
+ .cmd = _cmd, \
+ .policy = ieee802154_policy, \
+ .doit = _func, \
+ .dumpit = _dump, \
+ }
+
+struct genl_info;
+
+struct sk_buff *ieee802154_nl_create(int flags, u8 req);
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group);
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+ int flags, u8 req);
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info);
+
+extern struct genl_family nl802154_family;
+
+/* genetlink ops/groups */
+int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb);
+int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info);
+
+enum ieee802154_mcgrp_ids {
+ IEEE802154_COORD_MCGRP,
+ IEEE802154_BEACON_MCGRP,
+};
+
+int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb);
+int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info);
+
+int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_keys(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_devs(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_devkeys(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info);
+int ieee802154_llsec_dump_seclevels(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+#endif
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
new file mode 100644
index 000000000..c8133c07c
--- /dev/null
+++ b/net/ieee802154/netlink.c
@@ -0,0 +1,152 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <net/genetlink.h>
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+
+static unsigned int ieee802154_seq_num;
+static DEFINE_SPINLOCK(ieee802154_seq_lock);
+
+struct genl_family nl802154_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = IEEE802154_NL_NAME,
+ .version = 1,
+ .maxattr = IEEE802154_ATTR_MAX,
+};
+
+/* Requests to userspace */
+struct sk_buff *ieee802154_nl_create(int flags, u8 req)
+{
+ void *hdr;
+ struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ unsigned long f;
+
+ if (!msg)
+ return NULL;
+
+ spin_lock_irqsave(&ieee802154_seq_lock, f);
+ hdr = genlmsg_put(msg, 0, ieee802154_seq_num++,
+ &nl802154_family, flags, req);
+ spin_unlock_irqrestore(&ieee802154_seq_lock, f);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ return msg;
+}
+
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC);
+}
+
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+ int flags, u8 req)
+{
+ void *hdr;
+ struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+
+ if (!msg)
+ return NULL;
+
+ hdr = genlmsg_put_reply(msg, info,
+ &nl802154_family, flags, req);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ return msg;
+}
+
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ void *hdr = genlmsg_data(nlmsg_data(nlh));
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+}
+
+static const struct genl_ops ieee8021154_ops[] = {
+ /* see nl-phy.c */
+ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
+ ieee802154_dump_phy),
+ IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface),
+ IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface),
+ /* see nl-mac.c */
+ IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req),
+ IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp),
+ IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req),
+ IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req),
+ IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req),
+ IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface,
+ ieee802154_dump_iface),
+ IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams),
+ IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams),
+ IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL,
+ ieee802154_llsec_dump_keys),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL,
+ ieee802154_llsec_dump_devs),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, NULL,
+ ieee802154_llsec_dump_devkeys),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey),
+ IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL,
+ ieee802154_llsec_dump_seclevels),
+ IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL,
+ ieee802154_llsec_add_seclevel),
+ IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL,
+ ieee802154_llsec_del_seclevel),
+};
+
+static const struct genl_multicast_group ieee802154_mcgrps[] = {
+ [IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, },
+ [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
+};
+
+int __init ieee802154_nl_init(void)
+{
+ return genl_register_family_with_ops_groups(&nl802154_family,
+ ieee8021154_ops,
+ ieee802154_mcgrps);
+}
+
+void ieee802154_nl_exit(void)
+{
+ genl_unregister_family(&nl802154_family);
+}
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
new file mode 100644
index 000000000..2b4955d7a
--- /dev/null
+++ b/net/ieee802154/nl-mac.c
@@ -0,0 +1,1349 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/ieee802154.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include <linux/nl802154.h>
+#include <linux/export.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154.h"
+
+static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr)
+{
+ return nla_put_u64(msg, type, swab64((__force u64)hwaddr));
+}
+
+static __le64 nla_get_hwaddr(const struct nlattr *nla)
+{
+ return ieee802154_devaddr_from_raw(nla_data(nla));
+}
+
+static int nla_put_shortaddr(struct sk_buff *msg, int type, __le16 addr)
+{
+ return nla_put_u16(msg, type, le16_to_cpu(addr));
+}
+
+static __le16 nla_get_shortaddr(const struct nlattr *nla)
+{
+ return cpu_to_le16(nla_get_u16(nla));
+}
+
+static int ieee802154_nl_start_confirm(struct net_device *dev, u8 status)
+{
+ struct sk_buff *msg;
+
+ pr_debug("%s\n", __func__);
+
+ msg = ieee802154_nl_create(0, IEEE802154_START_CONF);
+ if (!msg)
+ return -ENOBUFS;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+ dev->dev_addr) ||
+ nla_put_u8(msg, IEEE802154_ATTR_STATUS, status))
+ goto nla_put_failure;
+ return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid,
+ u32 seq, int flags, struct net_device *dev)
+{
+ void *hdr;
+ struct wpan_phy *phy;
+ struct ieee802154_mlme_ops *ops;
+ __le16 short_addr, pan_id;
+
+ pr_debug("%s\n", __func__);
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+ IEEE802154_LIST_IFACE);
+ if (!hdr)
+ goto out;
+
+ ops = ieee802154_mlme_ops(dev);
+ phy = dev->ieee802154_ptr->wpan_phy;
+ BUG_ON(!phy);
+ get_device(&phy->dev);
+
+ short_addr = ops->get_short_addr(dev);
+ pan_id = ops->get_pan_id(dev);
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+ dev->dev_addr) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, pan_id))
+ goto nla_put_failure;
+
+ if (ops->get_mac_params) {
+ struct ieee802154_mac_params params;
+
+ rtnl_lock();
+ ops->get_mac_params(dev, &params);
+ rtnl_unlock();
+
+ if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER,
+ params.transmit_power) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE,
+ params.cca.mode) ||
+ nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL,
+ params.cca_ed_level) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES,
+ params.csma_retries) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE,
+ params.min_be) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CSMA_MAX_BE,
+ params.max_be) ||
+ nla_put_s8(msg, IEEE802154_ATTR_FRAME_RETRIES,
+ params.frame_retries))
+ goto nla_put_failure;
+ }
+
+ wpan_phy_put(phy);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ wpan_phy_put(phy);
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+/* Requests from userspace */
+static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
+{
+ struct net_device *dev;
+
+ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+ char name[IFNAMSIZ + 1];
+
+ nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
+ sizeof(name));
+ dev = dev_get_by_name(&init_net, name);
+ } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) {
+ dev = dev_get_by_index(&init_net,
+ nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX]));
+ } else {
+ return NULL;
+ }
+
+ if (!dev)
+ return NULL;
+
+ /* Check on mtu is currently a hacked solution because lowpan
+ * and wpan have the same ARPHRD type.
+ */
+ if (dev->type != ARPHRD_IEEE802154 || dev->mtu != IEEE802154_MTU) {
+ dev_put(dev);
+ return NULL;
+ }
+
+ return dev;
+}
+
+int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ u8 page;
+ int ret = -EOPNOTSUPP;
+
+ if (!info->attrs[IEEE802154_ATTR_CHANNEL] ||
+ !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+ (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] &&
+ !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) ||
+ !info->attrs[IEEE802154_ATTR_CAPABILITY])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_req)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) {
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]);
+ } else {
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+ }
+ addr.pan_id = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+ if (info->attrs[IEEE802154_ATTR_PAGE])
+ page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+ else
+ page = 0;
+
+ ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]),
+ page,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ int ret = -EOPNOTSUPP;
+
+ if (!info->attrs[IEEE802154_ATTR_STATUS] ||
+ !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->assoc_resp)
+ goto out;
+
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
+ addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+
+ ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr,
+ nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
+ nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+ int ret = -EOPNOTSUPP;
+
+ if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] &&
+ !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) ||
+ !info->attrs[IEEE802154_ATTR_REASON])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->disassoc_req)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) {
+ addr.mode = IEEE802154_ADDR_LONG;
+ addr.extended_addr = nla_get_hwaddr(
+ info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]);
+ } else {
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]);
+ }
+ addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+
+ ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
+ nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+/* PANid, channel, beacon_order = 15, superframe_order = 15,
+ * PAN_coordinator, battery_life_extension = 0,
+ * coord_realignment = 0, security_enable = 0
+*/
+int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ struct ieee802154_addr addr;
+
+ u8 channel, bcn_ord, sf_ord;
+ u8 page;
+ int pan_coord, blx, coord_realign;
+ int ret = -EBUSY;
+
+ if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+ !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_CHANNEL] ||
+ !info->attrs[IEEE802154_ATTR_BCN_ORD] ||
+ !info->attrs[IEEE802154_ATTR_SF_ORD] ||
+ !info->attrs[IEEE802154_ATTR_PAN_COORD] ||
+ !info->attrs[IEEE802154_ATTR_BAT_EXT] ||
+ !info->attrs[IEEE802154_ATTR_COORD_REALIGN]
+ )
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (netif_running(dev))
+ goto out;
+
+ if (!ieee802154_mlme_ops(dev)->start_req) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ addr.mode = IEEE802154_ADDR_SHORT;
+ addr.short_addr = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+ addr.pan_id = nla_get_shortaddr(
+ info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+ channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]);
+ bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]);
+ sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]);
+ pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]);
+ blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]);
+ coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]);
+
+ if (info->attrs[IEEE802154_ATTR_PAGE])
+ page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+ else
+ page = 0;
+
+ if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) {
+ ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS);
+ dev_put(dev);
+ return -EINVAL;
+ }
+
+ rtnl_lock();
+ ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page,
+ bcn_ord, sf_ord, pan_coord, blx, coord_realign);
+ rtnl_unlock();
+
+ /* FIXME: add validation for unused parameters to be sane
+ * for SoftMAC
+ */
+ ieee802154_nl_start_confirm(dev, IEEE802154_SUCCESS);
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev;
+ int ret = -EOPNOTSUPP;
+ u8 type;
+ u32 channels;
+ u8 duration;
+ u8 page;
+
+ if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] ||
+ !info->attrs[IEEE802154_ATTR_CHANNELS] ||
+ !info->attrs[IEEE802154_ATTR_DURATION])
+ return -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+ if (!ieee802154_mlme_ops(dev)->scan_req)
+ goto out;
+
+ type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]);
+ channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]);
+ duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]);
+
+ if (info->attrs[IEEE802154_ATTR_PAGE])
+ page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+ else
+ page = 0;
+
+ ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels,
+ page, duration);
+
+out:
+ dev_put(dev);
+ return ret;
+}
+
+int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ /* Request for interface name, index, type, IEEE address,
+ * PAN Id, short address
+ */
+ struct sk_buff *msg;
+ struct net_device *dev = NULL;
+ int rc = -ENOBUFS;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq,
+ 0, dev);
+ if (rc < 0)
+ goto out_free;
+
+ dev_put(dev);
+
+ return genlmsg_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ dev_put(dev);
+ return rc;
+}
+
+int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx;
+ int s_idx = cb->args[0];
+
+ pr_debug("%s\n", __func__);
+
+ idx = 0;
+ for_each_netdev(net, dev) {
+ /* Check on mtu is currently a hacked solution because lowpan
+ * and wpan have the same ARPHRD type.
+ */
+ if (idx < s_idx || dev->type != ARPHRD_IEEE802154 ||
+ dev->mtu != IEEE802154_MTU)
+ goto cont;
+
+ if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, dev) < 0)
+ break;
+cont:
+ idx++;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = NULL;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_mac_params params;
+ struct wpan_phy *phy;
+ int rc = -EINVAL;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ ops = ieee802154_mlme_ops(dev);
+
+ if (!ops->get_mac_params || !ops->set_mac_params) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (netif_running(dev)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ if (!info->attrs[IEEE802154_ATTR_LBT_ENABLED] &&
+ !info->attrs[IEEE802154_ATTR_CCA_MODE] &&
+ !info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_RETRIES] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] &&
+ !info->attrs[IEEE802154_ATTR_CSMA_MAX_BE] &&
+ !info->attrs[IEEE802154_ATTR_FRAME_RETRIES])
+ goto out;
+
+ phy = dev->ieee802154_ptr->wpan_phy;
+ get_device(&phy->dev);
+
+ rtnl_lock();
+ ops->get_mac_params(dev, &params);
+
+ if (info->attrs[IEEE802154_ATTR_TXPOWER])
+ params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]);
+
+ if (info->attrs[IEEE802154_ATTR_LBT_ENABLED])
+ params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]);
+
+ if (info->attrs[IEEE802154_ATTR_CCA_MODE])
+ params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]);
+
+ if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL])
+ params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]);
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES])
+ params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]);
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_MIN_BE])
+ params.min_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]);
+
+ if (info->attrs[IEEE802154_ATTR_CSMA_MAX_BE])
+ params.max_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]);
+
+ if (info->attrs[IEEE802154_ATTR_FRAME_RETRIES])
+ params.frame_retries = nla_get_s8(info->attrs[IEEE802154_ATTR_FRAME_RETRIES]);
+
+ rc = ops->set_mac_params(dev, &params);
+ rtnl_unlock();
+
+ wpan_phy_put(phy);
+ dev_put(dev);
+
+ return 0;
+
+out:
+ dev_put(dev);
+ return rc;
+}
+
+static int
+ieee802154_llsec_parse_key_id(struct genl_info *info,
+ struct ieee802154_llsec_key_id *desc)
+{
+ memset(desc, 0, sizeof(*desc));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE])
+ return -EINVAL;
+
+ desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]);
+
+ if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) {
+ if (!info->attrs[IEEE802154_ATTR_PAN_ID] &&
+ !(info->attrs[IEEE802154_ATTR_SHORT_ADDR] ||
+ info->attrs[IEEE802154_ATTR_HW_ADDR]))
+ return -EINVAL;
+
+ desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]);
+
+ if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) {
+ desc->device_addr.mode = IEEE802154_ADDR_SHORT;
+ desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]);
+ } else {
+ desc->device_addr.mode = IEEE802154_ADDR_LONG;
+ desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ }
+ }
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID])
+ return -EINVAL;
+
+ if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT])
+ return -EINVAL;
+
+ if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED])
+ return -EINVAL;
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT)
+ desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]);
+
+ switch (desc->mode) {
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ {
+ u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]);
+
+ desc->short_source = cpu_to_le32(source);
+ break;
+ }
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ desc->extended_source = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]);
+ break;
+ }
+
+ return 0;
+}
+
+static int
+ieee802154_llsec_fill_key_id(struct sk_buff *msg,
+ const struct ieee802154_llsec_key_id *desc)
+{
+ if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) {
+ if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID,
+ desc->device_addr.pan_id))
+ return -EMSGSIZE;
+
+ if (desc->device_addr.mode == IEEE802154_ADDR_SHORT &&
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
+ desc->device_addr.short_addr))
+ return -EMSGSIZE;
+
+ if (desc->device_addr.mode == IEEE802154_ADDR_LONG &&
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR,
+ desc->device_addr.extended_addr))
+ return -EMSGSIZE;
+ }
+
+ if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT &&
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX &&
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT,
+ le32_to_cpu(desc->short_source)))
+ return -EMSGSIZE;
+
+ if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX &&
+ nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED,
+ desc->extended_source))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct net_device *dev = NULL;
+ int rc = -ENOBUFS;
+ struct ieee802154_mlme_ops *ops;
+ void *hdr;
+ struct ieee802154_llsec_params params;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ ops = ieee802154_mlme_ops(dev);
+ if (!ops->llsec) {
+ rc = -EOPNOTSUPP;
+ goto out_dev;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0,
+ IEEE802154_LLSEC_GETPARAMS);
+ if (!hdr)
+ goto out_free;
+
+ rc = ops->llsec->get_params(dev, &params);
+ if (rc < 0)
+ goto out_free;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ be32_to_cpu(params.frame_counter)) ||
+ ieee802154_llsec_fill_key_id(msg, &params.out_key))
+ goto out_free;
+
+ dev_put(dev);
+
+ return ieee802154_nl_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ dev_put(dev);
+ return rc;
+}
+
+int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = NULL;
+ int rc = -EINVAL;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_llsec_params params;
+ int changed = 0;
+
+ pr_debug("%s\n", __func__);
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL])
+ goto out;
+
+ ops = ieee802154_mlme_ops(dev);
+ if (!ops->llsec) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] &&
+ nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7)
+ goto out;
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) {
+ params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]);
+ changed |= IEEE802154_LLSEC_PARAM_ENABLED;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) {
+ if (ieee802154_llsec_parse_key_id(info, &params.out_key))
+ goto out;
+
+ changed |= IEEE802154_LLSEC_PARAM_OUT_KEY;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) {
+ params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]);
+ changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) {
+ u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+
+ params.frame_counter = cpu_to_be32(fc);
+ changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER;
+ }
+
+ rc = ops->llsec->set_params(dev, &params, changed);
+
+ dev_put(dev);
+
+ return rc;
+out:
+ dev_put(dev);
+ return rc;
+}
+
+struct llsec_dump_data {
+ struct sk_buff *skb;
+ int s_idx, s_idx2;
+ int portid;
+ int nlmsg_seq;
+ struct net_device *dev;
+ struct ieee802154_mlme_ops *ops;
+ struct ieee802154_llsec_table *table;
+};
+
+static int
+ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb,
+ int (*step)(struct llsec_dump_data *))
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ struct llsec_dump_data data;
+ int idx = 0;
+ int first_dev = cb->args[0];
+ int rc;
+
+ for_each_netdev(net, dev) {
+ /* Check on mtu is currently a hacked solution because lowpan
+ * and wpan have the same ARPHRD type.
+ */
+ if (idx < first_dev || dev->type != ARPHRD_IEEE802154 ||
+ dev->mtu != IEEE802154_MTU)
+ goto skip;
+
+ data.ops = ieee802154_mlme_ops(dev);
+ if (!data.ops->llsec)
+ goto skip;
+
+ data.skb = skb;
+ data.s_idx = cb->args[1];
+ data.s_idx2 = cb->args[2];
+ data.dev = dev;
+ data.portid = NETLINK_CB(cb->skb).portid;
+ data.nlmsg_seq = cb->nlh->nlmsg_seq;
+
+ data.ops->llsec->lock_table(dev);
+ data.ops->llsec->get_table(data.dev, &data.table);
+ rc = step(&data);
+ data.ops->llsec->unlock_table(dev);
+
+ if (rc < 0)
+ break;
+
+skip:
+ idx++;
+ }
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static int
+ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info,
+ int (*fn)(struct net_device*, struct genl_info*))
+{
+ struct net_device *dev = NULL;
+ int rc = -EINVAL;
+
+ dev = ieee802154_nl_get_dev(info);
+ if (!dev)
+ return -ENODEV;
+
+ if (!ieee802154_mlme_ops(dev)->llsec)
+ rc = -EOPNOTSUPP;
+ else
+ rc = fn(dev, info);
+
+ dev_put(dev);
+ return rc;
+}
+
+static int
+ieee802154_llsec_parse_key(struct genl_info *info,
+ struct ieee802154_llsec_key *key)
+{
+ u8 frames;
+ u32 commands[256 / 32];
+
+ memset(key, 0, sizeof(*key));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES])
+ return -EINVAL;
+
+ frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]);
+ if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) &&
+ !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS])
+ return -EINVAL;
+
+ if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) {
+ nla_memcpy(commands,
+ info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS],
+ 256 / 8);
+
+ if (commands[0] || commands[1] || commands[2] || commands[3] ||
+ commands[4] || commands[5] || commands[6] ||
+ commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1))
+ return -EINVAL;
+
+ key->cmd_frame_ids = commands[7];
+ }
+
+ key->frame_types = frames;
+
+ nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES],
+ IEEE802154_LLSEC_KEY_SIZE);
+
+ return 0;
+}
+
+static int llsec_add_key(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_key key;
+ struct ieee802154_llsec_key_id id;
+
+ if (ieee802154_llsec_parse_key(info, &key) ||
+ ieee802154_llsec_parse_key_id(info, &id))
+ return -EINVAL;
+
+ return ops->llsec->add_key(dev, &id, &key);
+}
+
+int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_key);
+}
+
+static int llsec_remove_key(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_key_id id;
+
+ if (ieee802154_llsec_parse_key_id(info, &id))
+ return -EINVAL;
+
+ return ops->llsec->del_key(dev, &id);
+}
+
+int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_remove_key);
+}
+
+static int
+ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_key_entry *key,
+ const struct net_device *dev)
+{
+ void *hdr;
+ u32 commands[256 / 32];
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_KEY);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ ieee802154_llsec_fill_key_id(msg, &key->id) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES,
+ key->key->frame_types))
+ goto nla_put_failure;
+
+ if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) {
+ memset(commands, 0, sizeof(commands));
+ commands[7] = key->key->cmd_frame_ids;
+ if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS,
+ sizeof(commands), commands))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES,
+ IEEE802154_LLSEC_KEY_SIZE, key->key->key))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_keys(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_key_entry *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->keys, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_key(data->skb, data->portid,
+ data->nlmsg_seq, pos, data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys);
+}
+
+static int
+llsec_parse_dev(struct genl_info *info,
+ struct ieee802154_llsec_device *dev)
+{
+ memset(dev, 0, sizeof(*dev));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] ||
+ !info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] ||
+ (!!info->attrs[IEEE802154_ATTR_PAN_ID] !=
+ !!info->attrs[IEEE802154_ATTR_SHORT_ADDR]))
+ return -EINVAL;
+
+ if (info->attrs[IEEE802154_ATTR_PAN_ID]) {
+ dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]);
+ dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]);
+ } else {
+ dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF);
+ }
+
+ dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+ dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]);
+ dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]);
+
+ if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int llsec_add_dev(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device desc;
+
+ if (llsec_parse_dev(info, &desc))
+ return -EINVAL;
+
+ return ops->llsec->add_dev(dev, &desc);
+}
+
+int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_dev);
+}
+
+static int llsec_del_dev(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR])
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+
+ return ops->llsec->del_dev(dev, devaddr);
+}
+
+int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_dev);
+}
+
+static int
+ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_device *desc,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_DEV);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) ||
+ nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR,
+ desc->short_addr) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ desc->frame_counter) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
+ desc->seclevel_exempt) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_devs(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_device *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->devices, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_dev(data->skb, data->portid,
+ data->nlmsg_seq, pos, data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs);
+}
+
+static int llsec_add_devkey(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device_key key;
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] ||
+ !info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ ieee802154_llsec_parse_key_id(info, &key.key_id))
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+ key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]);
+
+ return ops->llsec->add_devkey(dev, devaddr, &key);
+}
+
+int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey);
+}
+
+static int llsec_del_devkey(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_device_key key;
+ __le64 devaddr;
+
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR] ||
+ ieee802154_llsec_parse_key_id(info, &key.key_id))
+ return -EINVAL;
+
+ devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
+
+ return ops->llsec->del_devkey(dev, devaddr, &key);
+}
+
+int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey);
+}
+
+static int
+ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq,
+ __le64 devaddr,
+ const struct ieee802154_llsec_device_key *devkey,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_DEVKEY);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) ||
+ nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
+ devkey->frame_counter) ||
+ ieee802154_llsec_fill_key_id(msg, &devkey->key_id))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_devkeys(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_device *dpos;
+ struct ieee802154_llsec_device_key *kpos;
+ int rc = 0, idx = 0, idx2;
+
+ list_for_each_entry(dpos, &data->table->devices, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ idx2 = 0;
+
+ list_for_each_entry(kpos, &dpos->keys, list) {
+ if (idx2++ < data->s_idx2)
+ continue;
+
+ if (ieee802154_nl_fill_devkey(data->skb, data->portid,
+ data->nlmsg_seq,
+ dpos->hwaddr, kpos,
+ data->dev)) {
+ return rc = -EMSGSIZE;
+ }
+
+ data->s_idx2++;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_devkeys(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys);
+}
+
+static int
+llsec_parse_seclevel(struct genl_info *info,
+ struct ieee802154_llsec_seclevel *sl)
+{
+ memset(sl, 0, sizeof(*sl));
+
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] ||
+ !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE])
+ return -EINVAL;
+
+ sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]);
+ if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) {
+ if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID])
+ return -EINVAL;
+
+ sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]);
+ }
+
+ sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]);
+ sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]);
+
+ return 0;
+}
+
+static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_seclevel sl;
+
+ if (llsec_parse_seclevel(info, &sl))
+ return -EINVAL;
+
+ return ops->llsec->add_seclevel(dev, &sl);
+}
+
+int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info)
+{
+ if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) !=
+ (NLM_F_CREATE | NLM_F_EXCL))
+ return -EINVAL;
+
+ return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel);
+}
+
+static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct ieee802154_llsec_seclevel sl;
+
+ if (llsec_parse_seclevel(info, &sl))
+ return -EINVAL;
+
+ return ops->llsec->del_seclevel(dev, &sl);
+}
+
+int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info)
+{
+ return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel);
+}
+
+static int
+ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq,
+ const struct ieee802154_llsec_seclevel *sl,
+ const struct net_device *dev)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI,
+ IEEE802154_LLSEC_LIST_SECLEVEL);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
+ nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, sl->sec_levels) ||
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE,
+ sl->device_override))
+ goto nla_put_failure;
+
+ if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD &&
+ nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID,
+ sl->cmd_frame_id))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+out:
+ return -EMSGSIZE;
+}
+
+static int llsec_iter_seclevels(struct llsec_dump_data *data)
+{
+ struct ieee802154_llsec_seclevel *pos;
+ int rc = 0, idx = 0;
+
+ list_for_each_entry(pos, &data->table->security_levels, list) {
+ if (idx++ < data->s_idx)
+ continue;
+
+ if (ieee802154_nl_fill_seclevel(data->skb, data->portid,
+ data->nlmsg_seq, pos,
+ data->dev)) {
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ data->s_idx++;
+ }
+
+ return rc;
+}
+
+int ieee802154_llsec_dump_seclevels(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels);
+}
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
new file mode 100644
index 000000000..346c6665d
--- /dev/null
+++ b/net/ieee802154/nl-phy.c
@@ -0,0 +1,349 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/if_arp.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/cfg802154.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/rtnetlink.h> /* for rtnl_{un,}lock */
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+#include "rdev-ops.h"
+#include "core.h"
+
+static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
+ u32 seq, int flags, struct wpan_phy *phy)
+{
+ void *hdr;
+ int i, pages = 0;
+ uint32_t *buf = kzalloc(32 * sizeof(uint32_t), GFP_KERNEL);
+
+ pr_debug("%s\n", __func__);
+
+ if (!buf)
+ return -EMSGSIZE;
+
+ hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+ IEEE802154_LIST_PHY);
+ if (!hdr)
+ goto out;
+
+ mutex_lock(&phy->pib_lock);
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
+ nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
+ goto nla_put_failure;
+ for (i = 0; i < 32; i++) {
+ if (phy->channels_supported[i])
+ buf[pages++] = phy->channels_supported[i] | (i << 27);
+ }
+ if (pages &&
+ nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
+ pages * sizeof(uint32_t), buf))
+ goto nla_put_failure;
+ mutex_unlock(&phy->pib_lock);
+ kfree(buf);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ mutex_unlock(&phy->pib_lock);
+ genlmsg_cancel(msg, hdr);
+out:
+ kfree(buf);
+ return -EMSGSIZE;
+}
+
+int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info)
+{
+ /* Request for interface name, index, type, IEEE address,
+ * PAN Id, short address
+ */
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ int rc = -ENOBUFS;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+
+ phy = wpan_phy_find(name);
+ if (!phy)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out_dev;
+
+ rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq,
+ 0, phy);
+ if (rc < 0)
+ goto out_free;
+
+ wpan_phy_put(phy);
+
+ return genlmsg_reply(msg, info);
+out_free:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+ return rc;
+}
+
+struct dump_phy_data {
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+ int idx, s_idx;
+};
+
+static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data)
+{
+ int rc;
+ struct dump_phy_data *data = _data;
+
+ pr_debug("%s\n", __func__);
+
+ if (data->idx++ < data->s_idx)
+ return 0;
+
+ rc = ieee802154_nl_fill_phy(data->skb,
+ NETLINK_CB(data->cb->skb).portid,
+ data->cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ phy);
+
+ if (rc < 0) {
+ data->idx--;
+ return rc;
+ }
+
+ return 0;
+}
+
+int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct dump_phy_data data = {
+ .cb = cb,
+ .skb = skb,
+ .s_idx = cb->args[0],
+ .idx = 0,
+ };
+
+ pr_debug("%s\n", __func__);
+
+ wpan_phy_for_each(ieee802154_dump_phy_iter, &data);
+
+ cb->args[0] = data.idx;
+
+ return skb->len;
+}
+
+int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ const char *devname;
+ int rc = -ENOBUFS;
+ struct net_device *dev;
+ int type = __IEEE802154_DEV_INVALID;
+ unsigned char name_assign_type;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+
+ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+ devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+ if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
+ != '\0')
+ return -EINVAL; /* phy name should be null-terminated */
+ name_assign_type = NET_NAME_USER;
+ } else {
+ devname = "wpan%d";
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ if (strlen(devname) >= IFNAMSIZ)
+ return -ENAMETOOLONG;
+
+ phy = wpan_phy_find(name);
+ if (!phy)
+ return -ENODEV;
+
+ msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE);
+ if (!msg)
+ goto out_dev;
+
+ if (info->attrs[IEEE802154_ATTR_HW_ADDR] &&
+ nla_len(info->attrs[IEEE802154_ATTR_HW_ADDR]) !=
+ IEEE802154_ADDR_LEN) {
+ rc = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) {
+ type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]);
+ if (type >= __IEEE802154_DEV_MAX) {
+ rc = -EINVAL;
+ goto nla_put_failure;
+ }
+ }
+
+ dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname,
+ name_assign_type, type);
+ if (IS_ERR(dev)) {
+ rc = PTR_ERR(dev);
+ goto nla_put_failure;
+ }
+ dev_hold(dev);
+
+ if (info->attrs[IEEE802154_ATTR_HW_ADDR]) {
+ struct sockaddr addr;
+
+ addr.sa_family = ARPHRD_IEEE802154;
+ nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR],
+ IEEE802154_ADDR_LEN);
+
+ /* strangely enough, some callbacks (inetdev_event) from
+ * dev_set_mac_address require RTNL_LOCK
+ */
+ rtnl_lock();
+ rc = dev_set_mac_address(dev, &addr);
+ rtnl_unlock();
+ if (rc)
+ goto dev_unregister;
+ }
+
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name))
+ goto nla_put_failure;
+ dev_put(dev);
+
+ wpan_phy_put(phy);
+
+ return ieee802154_nl_reply(msg, info);
+
+dev_unregister:
+ rtnl_lock(); /* del_iface must be called with RTNL lock */
+ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev);
+ dev_put(dev);
+ rtnl_unlock();
+nla_put_failure:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+ return rc;
+}
+
+int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct wpan_phy *phy;
+ const char *name;
+ int rc;
+ struct net_device *dev;
+
+ pr_debug("%s\n", __func__);
+
+ if (!info->attrs[IEEE802154_ATTR_DEV_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+ if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
+ return -EINVAL; /* name should be null-terminated */
+
+ dev = dev_get_by_name(genl_info_net(info), name);
+ if (!dev)
+ return -ENODEV;
+
+ phy = dev->ieee802154_ptr->wpan_phy;
+ BUG_ON(!phy);
+ get_device(&phy->dev);
+
+ rc = -EINVAL;
+ /* phy name is optional, but should be checked if it's given */
+ if (info->attrs[IEEE802154_ATTR_PHY_NAME]) {
+ struct wpan_phy *phy2;
+
+ const char *pname =
+ nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+ if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1]
+ != '\0')
+ /* name should be null-terminated */
+ goto out_dev;
+
+ phy2 = wpan_phy_find(pname);
+ if (!phy2)
+ goto out_dev;
+
+ if (phy != phy2) {
+ wpan_phy_put(phy2);
+ goto out_dev;
+ }
+ }
+
+ rc = -ENOBUFS;
+
+ msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE);
+ if (!msg)
+ goto out_dev;
+
+ rtnl_lock();
+ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev);
+
+ /* We don't have device anymore */
+ dev_put(dev);
+ dev = NULL;
+
+ rtnl_unlock();
+
+ if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
+ nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, name))
+ goto nla_put_failure;
+ wpan_phy_put(phy);
+
+ return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+out_dev:
+ wpan_phy_put(phy);
+ if (dev)
+ dev_put(dev);
+
+ return rc;
+}
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
new file mode 100644
index 000000000..f3c12f6a4
--- /dev/null
+++ b/net/ieee802154/nl802154.c
@@ -0,0 +1,999 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/wireless/nl80211.c
+ */
+
+#include <linux/rtnetlink.h>
+
+#include <net/cfg802154.h>
+#include <net/genetlink.h>
+#include <net/mac802154.h>
+#include <net/netlink.h>
+#include <net/nl802154.h>
+#include <net/sock.h>
+
+#include "nl802154.h"
+#include "rdev-ops.h"
+#include "core.h"
+
+static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+
+static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+
+/* the netlink family */
+static struct genl_family nl802154_fam = {
+ .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
+ .name = NL802154_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL802154_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl802154_pre_doit,
+ .post_doit = nl802154_post_doit,
+};
+
+/* multicast groups */
+enum nl802154_multicast_groups {
+ NL802154_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group nl802154_mcgrps[] = {
+ [NL802154_MCGRP_CONFIG] = { .name = "config", },
+};
+
+/* returns ERR_PTR values */
+static struct wpan_dev *
+__cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *result = NULL;
+ bool have_ifidx = attrs[NL802154_ATTR_IFINDEX];
+ bool have_wpan_dev_id = attrs[NL802154_ATTR_WPAN_DEV];
+ u64 wpan_dev_id;
+ int wpan_phy_idx = -1;
+ int ifidx = -1;
+
+ ASSERT_RTNL();
+
+ if (!have_ifidx && !have_wpan_dev_id)
+ return ERR_PTR(-EINVAL);
+
+ if (have_ifidx)
+ ifidx = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]);
+ if (have_wpan_dev_id) {
+ wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]);
+ wpan_phy_idx = wpan_dev_id >> 32;
+ }
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ struct wpan_dev *wpan_dev;
+
+ /* TODO netns compare */
+
+ if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx)
+ continue;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (have_ifidx && wpan_dev->netdev &&
+ wpan_dev->netdev->ifindex == ifidx) {
+ result = wpan_dev;
+ break;
+ }
+ if (have_wpan_dev_id &&
+ wpan_dev->identifier == (u32)wpan_dev_id) {
+ result = wpan_dev;
+ break;
+ }
+ }
+
+ if (result)
+ break;
+ }
+
+ if (result)
+ return result;
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct cfg802154_registered_device *
+__cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg802154_registered_device *rdev = NULL, *tmp;
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ if (!attrs[NL802154_ATTR_WPAN_PHY] &&
+ !attrs[NL802154_ATTR_IFINDEX] &&
+ !attrs[NL802154_ATTR_WPAN_DEV])
+ return ERR_PTR(-EINVAL);
+
+ if (attrs[NL802154_ATTR_WPAN_PHY])
+ rdev = cfg802154_rdev_by_wpan_phy_idx(
+ nla_get_u32(attrs[NL802154_ATTR_WPAN_PHY]));
+
+ if (attrs[NL802154_ATTR_WPAN_DEV]) {
+ u64 wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]);
+ struct wpan_dev *wpan_dev;
+ bool found = false;
+
+ tmp = cfg802154_rdev_by_wpan_phy_idx(wpan_dev_id >> 32);
+ if (tmp) {
+ /* make sure wpan_dev exists */
+ list_for_each_entry(wpan_dev, &tmp->wpan_dev_list, list) {
+ if (wpan_dev->identifier != (u32)wpan_dev_id)
+ continue;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ tmp = NULL;
+
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+ rdev = tmp;
+ }
+ }
+
+ if (attrs[NL802154_ATTR_IFINDEX]) {
+ int ifindex = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]);
+
+ netdev = __dev_get_by_index(netns, ifindex);
+ if (netdev) {
+ if (netdev->ieee802154_ptr)
+ tmp = wpan_phy_to_rdev(
+ netdev->ieee802154_ptr->wpan_phy);
+ else
+ tmp = NULL;
+
+ /* not wireless device -- return error */
+ if (!tmp)
+ return ERR_PTR(-EINVAL);
+
+ /* mismatch -- return error */
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+
+ rdev = tmp;
+ }
+ }
+
+ if (!rdev)
+ return ERR_PTR(-ENODEV);
+
+ /* TODO netns compare */
+
+ return rdev;
+}
+
+/* This function returns a pointer to the driver
+ * that the genl_info item that is passed refers to.
+ *
+ * The result of this can be a PTR_ERR and hence must
+ * be checked with IS_ERR() for errors.
+ */
+static struct cfg802154_registered_device *
+cfg802154_get_dev_from_info(struct net *netns, struct genl_info *info)
+{
+ return __cfg802154_rdev_from_attrs(netns, info->attrs);
+}
+
+/* policy for the attributes */
+static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
+ [NL802154_ATTR_WPAN_PHY] = { .type = NLA_U32 },
+ [NL802154_ATTR_WPAN_PHY_NAME] = { .type = NLA_NUL_STRING,
+ .len = 20-1 },
+
+ [NL802154_ATTR_IFINDEX] = { .type = NLA_U32 },
+ [NL802154_ATTR_IFTYPE] = { .type = NLA_U32 },
+ [NL802154_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
+
+ [NL802154_ATTR_WPAN_DEV] = { .type = NLA_U64 },
+
+ [NL802154_ATTR_PAGE] = { .type = NLA_U8, },
+ [NL802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+
+ [NL802154_ATTR_TX_POWER] = { .type = NLA_S8, },
+
+ [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, },
+ [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, },
+
+ [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, },
+
+ [NL802154_ATTR_PAN_ID] = { .type = NLA_U16, },
+ [NL802154_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 },
+ [NL802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
+
+ [NL802154_ATTR_MIN_BE] = { .type = NLA_U8, },
+ [NL802154_ATTR_MAX_BE] = { .type = NLA_U8, },
+ [NL802154_ATTR_MAX_CSMA_BACKOFFS] = { .type = NLA_U8, },
+
+ [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, },
+
+ [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, },
+};
+
+/* message building helper */
+static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd)
+{
+ /* since there is no private header just add the generic one */
+ return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd);
+}
+
+static int
+nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ struct nlattr *nl_page;
+ unsigned long page;
+
+ nl_page = nla_nest_start(msg, NL802154_ATTR_CHANNELS_SUPPORTED);
+ if (!nl_page)
+ return -ENOBUFS;
+
+ for (page = 0; page <= IEEE802154_MAX_PAGE; page++) {
+ if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL,
+ rdev->wpan_phy.channels_supported[page]))
+ return -ENOBUFS;
+ }
+ nla_nest_end(msg, nl_page);
+
+ return 0;
+}
+
+static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
+ enum nl802154_commands cmd,
+ struct sk_buff *msg, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
+ nla_put_string(msg, NL802154_ATTR_WPAN_PHY_NAME,
+ wpan_phy_name(&rdev->wpan_phy)) ||
+ nla_put_u32(msg, NL802154_ATTR_GENERATION,
+ cfg802154_rdev_list_generation))
+ goto nla_put_failure;
+
+ if (cmd != NL802154_CMD_NEW_WPAN_PHY)
+ goto finish;
+
+ /* DUMP PHY PIB */
+
+ /* current channel settings */
+ if (nla_put_u8(msg, NL802154_ATTR_PAGE,
+ rdev->wpan_phy.current_page) ||
+ nla_put_u8(msg, NL802154_ATTR_CHANNEL,
+ rdev->wpan_phy.current_channel))
+ goto nla_put_failure;
+
+ /* supported channels array */
+ if (nl802154_send_wpan_phy_channels(rdev, msg))
+ goto nla_put_failure;
+
+ /* cca mode */
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE,
+ rdev->wpan_phy.cca.mode))
+ goto nla_put_failure;
+
+ if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+ if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT,
+ rdev->wpan_phy.cca.opt))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_s8(msg, NL802154_ATTR_TX_POWER,
+ rdev->wpan_phy.transmit_power))
+ goto nla_put_failure;
+
+finish:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+struct nl802154_dump_wpan_phy_state {
+ s64 filter_wpan_phy;
+ long start;
+
+};
+
+static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nl802154_dump_wpan_phy_state *state)
+{
+ struct nlattr **tb = nl802154_fam.attrbuf;
+ int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
+ tb, nl802154_fam.maxattr, nl802154_policy);
+
+ /* TODO check if we can handle error here,
+ * we have no backward compatibility
+ */
+ if (ret)
+ return 0;
+
+ if (tb[NL802154_ATTR_WPAN_PHY])
+ state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]);
+ if (tb[NL802154_ATTR_WPAN_DEV])
+ state->filter_wpan_phy = nla_get_u64(tb[NL802154_ATTR_WPAN_DEV]) >> 32;
+ if (tb[NL802154_ATTR_IFINDEX]) {
+ struct net_device *netdev;
+ struct cfg802154_registered_device *rdev;
+ int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]);
+
+ /* TODO netns */
+ netdev = __dev_get_by_index(&init_net, ifidx);
+ if (!netdev)
+ return -ENODEV;
+ if (netdev->ieee802154_ptr) {
+ rdev = wpan_phy_to_rdev(
+ netdev->ieee802154_ptr->wpan_phy);
+ state->filter_wpan_phy = rdev->wpan_phy_idx;
+ }
+ }
+
+ return 0;
+}
+
+static int
+nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0, ret;
+ struct nl802154_dump_wpan_phy_state *state = (void *)cb->args[0];
+ struct cfg802154_registered_device *rdev;
+
+ rtnl_lock();
+ if (!state) {
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ state->filter_wpan_phy = -1;
+ ret = nl802154_dump_wpan_phy_parse(skb, cb, state);
+ if (ret) {
+ kfree(state);
+ rtnl_unlock();
+ return ret;
+ }
+ cb->args[0] = (long)state;
+ }
+
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ /* TODO net ns compare */
+ if (++idx <= state->start)
+ continue;
+ if (state->filter_wpan_phy != -1 &&
+ state->filter_wpan_phy != rdev->wpan_phy_idx)
+ continue;
+ /* attempt to fit multiple wpan_phy data chunks into the skb */
+ ret = nl802154_send_wpan_phy(rdev,
+ NL802154_CMD_NEW_WPAN_PHY,
+ skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (ret < 0) {
+ if ((ret == -ENOBUFS || ret == -EMSGSIZE) &&
+ !skb->len && cb->min_dump_alloc < 4096) {
+ cb->min_dump_alloc = 4096;
+ rtnl_unlock();
+ return 1;
+ }
+ idx--;
+ break;
+ }
+ break;
+ }
+ rtnl_unlock();
+
+ state->start = idx;
+
+ return skb->len;
+}
+
+static int nl802154_dump_wpan_phy_done(struct netlink_callback *cb)
+{
+ kfree((void *)cb->args[0]);
+ return 0;
+}
+
+static int nl802154_get_wpan_phy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, msg,
+ info->snd_portid, info->snd_seq, 0) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev)
+{
+ return (u64)wpan_dev->identifier |
+ ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32);
+}
+
+static int
+nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ struct net_device *dev = wpan_dev->netdev;
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags,
+ NL802154_CMD_NEW_INTERFACE);
+ if (!hdr)
+ return -1;
+
+ if (dev &&
+ (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_string(msg, NL802154_ATTR_IFNAME, dev->name)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) ||
+ nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) ||
+ nla_put_u64(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev)) ||
+ nla_put_u32(msg, NL802154_ATTR_GENERATION,
+ rdev->devlist_generation ^
+ (cfg802154_rdev_list_generation << 2)))
+ goto nla_put_failure;
+
+ /* address settings */
+ if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR,
+ wpan_dev->extended_addr) ||
+ nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR,
+ wpan_dev->short_addr) ||
+ nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id))
+ goto nla_put_failure;
+
+ /* ARET handling */
+ if (nla_put_s8(msg, NL802154_ATTR_MAX_FRAME_RETRIES,
+ wpan_dev->frame_retries) ||
+ nla_put_u8(msg, NL802154_ATTR_MAX_BE, wpan_dev->max_be) ||
+ nla_put_u8(msg, NL802154_ATTR_MAX_CSMA_BACKOFFS,
+ wpan_dev->csma_retries) ||
+ nla_put_u8(msg, NL802154_ATTR_MIN_BE, wpan_dev->min_be))
+ goto nla_put_failure;
+
+ /* listen before transmit */
+ if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int wp_idx = 0;
+ int if_idx = 0;
+ int wp_start = cb->args[0];
+ int if_start = cb->args[1];
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *wpan_dev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg802154_rdev_list, list) {
+ /* TODO netns compare */
+ if (wp_idx < wp_start) {
+ wp_idx++;
+ continue;
+ }
+ if_idx = 0;
+
+ list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
+ if (if_idx < if_start) {
+ if_idx++;
+ continue;
+ }
+ if (nl802154_send_iface(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wpan_dev) < 0) {
+ goto out;
+ }
+ if_idx++;
+ }
+
+ wp_idx++;
+ }
+out:
+ rtnl_unlock();
+
+ cb->args[0] = wp_idx;
+ cb->args[1] = if_idx;
+
+ return skb->len;
+}
+
+static int nl802154_get_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_dev *wdev = info->user_ptr[1];
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl802154_send_iface(msg, info->snd_portid, info->snd_seq, 0,
+ rdev, wdev) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ enum nl802154_iftype type = NL802154_IFTYPE_UNSPEC;
+ __le64 extended_addr = cpu_to_le64(0x0000000000000000ULL);
+
+ /* TODO avoid failing a new interface
+ * creation due to pending removal?
+ */
+
+ if (!info->attrs[NL802154_ATTR_IFNAME])
+ return -EINVAL;
+
+ if (info->attrs[NL802154_ATTR_IFTYPE]) {
+ type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]);
+ if (type > NL802154_IFTYPE_MAX)
+ return -EINVAL;
+ }
+
+ /* TODO add nla_get_le64 to netlink */
+ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR])
+ extended_addr = (__force __le64)nla_get_u64(
+ info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+
+ if (!rdev->ops->add_virtual_intf)
+ return -EOPNOTSUPP;
+
+ return rdev_add_virtual_intf(rdev,
+ nla_data(info->attrs[NL802154_ATTR_IFNAME]),
+ NET_NAME_USER, type, extended_addr);
+}
+
+static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_dev *wpan_dev = info->user_ptr[1];
+
+ if (!rdev->ops->del_virtual_intf)
+ return -EOPNOTSUPP;
+
+ /* If we remove a wpan device without a netdev then clear
+ * user_ptr[1] so that nl802154_post_doit won't dereference it
+ * to check if it needs to do dev_put(). Otherwise it crashes
+ * since the wpan_dev has been freed, unlike with a netdev where
+ * we need the dev_put() for the netdev to really be freed.
+ */
+ if (!wpan_dev->netdev)
+ info->user_ptr[1] = NULL;
+
+ return rdev_del_virtual_intf(rdev, wpan_dev);
+}
+
+static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ u8 channel, page;
+
+ if (!info->attrs[NL802154_ATTR_PAGE] ||
+ !info->attrs[NL802154_ATTR_CHANNEL])
+ return -EINVAL;
+
+ page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]);
+ channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]);
+
+ /* check 802.15.4 constraints */
+ if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL)
+ return -EINVAL;
+
+ return rdev_set_channel(rdev, page, channel);
+}
+
+static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct wpan_phy_cca cca;
+
+ if (!info->attrs[NL802154_ATTR_CCA_MODE])
+ return -EINVAL;
+
+ cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]);
+ /* checking 802.15.4 constraints */
+ if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX)
+ return -EINVAL;
+
+ if (cca.mode == NL802154_CCA_ENERGY_CARRIER) {
+ if (!info->attrs[NL802154_ATTR_CCA_OPT])
+ return -EINVAL;
+
+ cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]);
+ if (cca.opt > NL802154_CCA_OPT_ATTR_MAX)
+ return -EINVAL;
+ }
+
+ return rdev_set_cca_mode(rdev, &cca);
+}
+
+static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ __le16 pan_id;
+
+ /* conflict here while tx/rx calls */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ /* don't change address fields on monitor */
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EINVAL;
+
+ if (!info->attrs[NL802154_ATTR_PAN_ID])
+ return -EINVAL;
+
+ pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
+
+ return rdev_set_pan_id(rdev, wpan_dev, pan_id);
+}
+
+static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ __le16 short_addr;
+
+ /* conflict here while tx/rx calls */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ /* don't change address fields on monitor */
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EINVAL;
+
+ if (!info->attrs[NL802154_ATTR_SHORT_ADDR])
+ return -EINVAL;
+
+ short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+
+ return rdev_set_short_addr(rdev, wpan_dev, short_addr);
+}
+
+static int
+nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ u8 min_be, max_be;
+
+ /* should be set on netif open inside phy settings */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MIN_BE] ||
+ !info->attrs[NL802154_ATTR_MAX_BE])
+ return -EINVAL;
+
+ min_be = nla_get_u8(info->attrs[NL802154_ATTR_MIN_BE]);
+ max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]);
+
+ /* check 802.15.4 constraints */
+ if (max_be < 3 || max_be > 8 || min_be > max_be)
+ return -EINVAL;
+
+ return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be);
+}
+
+static int
+nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ u8 max_csma_backoffs;
+
+ /* conflict here while other running iface settings */
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS])
+ return -EINVAL;
+
+ max_csma_backoffs = nla_get_u8(
+ info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]);
+
+ /* check 802.15.4 constraints */
+ if (max_csma_backoffs > 5)
+ return -EINVAL;
+
+ return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs);
+}
+
+static int
+nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ s8 max_frame_retries;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES])
+ return -EINVAL;
+
+ max_frame_retries = nla_get_s8(
+ info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]);
+
+ /* check 802.15.4 constraints */
+ if (max_frame_retries < -1 || max_frame_retries > 7)
+ return -EINVAL;
+
+ return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries);
+}
+
+static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ bool mode;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!info->attrs[NL802154_ATTR_LBT_MODE])
+ return -EINVAL;
+
+ mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]);
+ return rdev_set_lbt_mode(rdev, wpan_dev, mode);
+}
+
+#define NL802154_FLAG_NEED_WPAN_PHY 0x01
+#define NL802154_FLAG_NEED_NETDEV 0x02
+#define NL802154_FLAG_NEED_RTNL 0x04
+#define NL802154_FLAG_CHECK_NETDEV_UP 0x08
+#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\
+ NL802154_FLAG_CHECK_NETDEV_UP)
+#define NL802154_FLAG_NEED_WPAN_DEV 0x10
+#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\
+ NL802154_FLAG_CHECK_NETDEV_UP)
+
+static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev;
+ struct wpan_dev *wpan_dev;
+ struct net_device *dev;
+ bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL;
+
+ if (rtnl)
+ rtnl_lock();
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) {
+ rdev = cfg802154_get_dev_from_info(genl_info_net(info), info);
+ if (IS_ERR(rdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(rdev);
+ }
+ info->user_ptr[0] = rdev;
+ } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV ||
+ ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) {
+ ASSERT_RTNL();
+ wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info),
+ info->attrs);
+ if (IS_ERR(wpan_dev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(wpan_dev);
+ }
+
+ dev = wpan_dev->netdev;
+ rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy);
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) {
+ if (!dev) {
+ if (rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ info->user_ptr[1] = dev;
+ } else {
+ info->user_ptr[1] = wpan_dev;
+ }
+
+ if (dev) {
+ if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP &&
+ !netif_running(dev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+
+ dev_hold(dev);
+ }
+
+ info->user_ptr[0] = rdev;
+ }
+
+ return 0;
+}
+
+static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ if (info->user_ptr[1]) {
+ if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) {
+ struct wpan_dev *wpan_dev = info->user_ptr[1];
+
+ if (wpan_dev->netdev)
+ dev_put(wpan_dev->netdev);
+ } else {
+ dev_put(info->user_ptr[1]);
+ }
+ }
+
+ if (ops->internal_flags & NL802154_FLAG_NEED_RTNL)
+ rtnl_unlock();
+}
+
+static const struct genl_ops nl802154_ops[] = {
+ {
+ .cmd = NL802154_CMD_GET_WPAN_PHY,
+ .doit = nl802154_get_wpan_phy,
+ .dumpit = nl802154_dump_wpan_phy,
+ .done = nl802154_dump_wpan_phy_done,
+ .policy = nl802154_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_GET_INTERFACE,
+ .doit = nl802154_get_interface,
+ .dumpit = nl802154_dump_interface,
+ .policy = nl802154_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_NEW_INTERFACE,
+ .doit = nl802154_new_interface,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DEL_INTERFACE,
+ .doit = nl802154_del_interface,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_CHANNEL,
+ .doit = nl802154_set_channel,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_CCA_MODE,
+ .doit = nl802154_set_cca_mode,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_PAN_ID,
+ .doit = nl802154_set_pan_id,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_SHORT_ADDR,
+ .doit = nl802154_set_short_addr,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_BACKOFF_EXPONENT,
+ .doit = nl802154_set_backoff_exponent,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS,
+ .doit = nl802154_set_max_csma_backoffs,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES,
+ .doit = nl802154_set_max_frame_retries,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_LBT_MODE,
+ .doit = nl802154_set_lbt_mode,
+ .policy = nl802154_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+};
+
+/* initialisation/exit functions */
+int nl802154_init(void)
+{
+ return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops,
+ nl802154_mcgrps);
+}
+
+void nl802154_exit(void)
+{
+ genl_unregister_family(&nl802154_fam);
+}
diff --git a/net/ieee802154/nl802154.h b/net/ieee802154/nl802154.h
new file mode 100644
index 000000000..3846a89d0
--- /dev/null
+++ b/net/ieee802154/nl802154.h
@@ -0,0 +1,7 @@
+#ifndef __IEEE802154_NL802154_H
+#define __IEEE802154_NL802154_H
+
+int nl802154_init(void);
+void nl802154_exit(void);
+
+#endif /* __IEEE802154_NL802154_H */
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
new file mode 100644
index 000000000..35c432668
--- /dev/null
+++ b/net/ieee802154/nl_policy.c
@@ -0,0 +1,78 @@
+/*
+ * nl802154.h
+ *
+ * Copyright (C) 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/nl802154.h>
+
+#define NLA_HW_ADDR NLA_U64
+
+const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
+ [IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, },
+ [IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_PHY_NAME] = { .type = NLA_STRING, },
+
+ [IEEE802154_ATTR_STATUS] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_PAGE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_SRC_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_SRC_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_SRC_PAN_ID] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_DEST_SHORT_ADDR] = { .type = NLA_U16, },
+ [IEEE802154_ATTR_DEST_HW_ADDR] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_DEST_PAN_ID] = { .type = NLA_U16, },
+
+ [IEEE802154_ATTR_CAPABILITY] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_REASON] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SCAN_TYPE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_DURATION] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_ED_LIST] = { .len = 27 },
+ [IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, },
+
+ [IEEE802154_ATTR_TXPOWER] = { .type = NLA_S8, },
+ [IEEE802154_ATTR_LBT_ENABLED] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CCA_MODE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, },
+ [IEEE802154_ATTR_CSMA_RETRIES] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CSMA_MIN_BE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_CSMA_MAX_BE] = { .type = NLA_U8, },
+
+ [IEEE802154_ATTR_FRAME_RETRIES] = { .type = NLA_S8, },
+
+ [IEEE802154_ATTR_LLSEC_ENABLED] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_SECLEVEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_MODE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT] = { .type = NLA_U32, },
+ [IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED] = { .type = NLA_HW_ADDR, },
+ [IEEE802154_ATTR_LLSEC_KEY_ID] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_FRAME_COUNTER] = { .type = NLA_U32 },
+ [IEEE802154_ATTR_LLSEC_KEY_BYTES] = { .len = 16, },
+ [IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS] = { .len = 258 / 8 },
+ [IEEE802154_ATTR_LLSEC_FRAME_TYPE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_CMD_FRAME_ID] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_SECLEVELS] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, },
+};
+
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
new file mode 100644
index 000000000..7b5a9dd94
--- /dev/null
+++ b/net/ieee802154/rdev-ops.h
@@ -0,0 +1,155 @@
+#ifndef __CFG802154_RDEV_OPS
+#define __CFG802154_RDEV_OPS
+
+#include <net/cfg802154.h>
+
+#include "core.h"
+#include "trace.h"
+
+static inline struct net_device *
+rdev_add_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
+ const char *name,
+ unsigned char name_assign_type,
+ int type)
+{
+ return rdev->ops->add_virtual_intf_deprecated(&rdev->wpan_phy, name,
+ name_assign_type, type);
+}
+
+static inline void
+rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev,
+ struct net_device *dev)
+{
+ rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev);
+}
+
+static inline int
+rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name,
+ unsigned char name_assign_type,
+ enum nl802154_iftype type, __le64 extended_addr)
+{
+ int ret;
+
+ trace_802154_rdev_add_virtual_intf(&rdev->wpan_phy, name, type,
+ extended_addr);
+ ret = rdev->ops->add_virtual_intf(&rdev->wpan_phy, name,
+ name_assign_type, type,
+ extended_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_del_virtual_intf(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev)
+{
+ int ret;
+
+ trace_802154_rdev_del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ ret = rdev->ops->del_virtual_intf(&rdev->wpan_phy, wpan_dev);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_channel(struct cfg802154_registered_device *rdev, u8 page, u8 channel)
+{
+ int ret;
+
+ trace_802154_rdev_set_channel(&rdev->wpan_phy, page, channel);
+ ret = rdev->ops->set_channel(&rdev->wpan_phy, page, channel);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_cca_mode(struct cfg802154_registered_device *rdev,
+ const struct wpan_phy_cca *cca)
+{
+ int ret;
+
+ trace_802154_rdev_set_cca_mode(&rdev->wpan_phy, cca);
+ ret = rdev->ops->set_cca_mode(&rdev->wpan_phy, cca);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_pan_id(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le16 pan_id)
+{
+ int ret;
+
+ trace_802154_rdev_set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ ret = rdev->ops->set_pan_id(&rdev->wpan_phy, wpan_dev, pan_id);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_short_addr(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, __le16 short_addr)
+{
+ int ret;
+
+ trace_802154_rdev_set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ ret = rdev->ops->set_short_addr(&rdev->wpan_phy, wpan_dev, short_addr);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_backoff_exponent(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 min_be, u8 max_be)
+{
+ int ret;
+
+ trace_802154_rdev_set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ min_be, max_be);
+ ret = rdev->ops->set_backoff_exponent(&rdev->wpan_phy, wpan_dev,
+ min_be, max_be);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_max_csma_backoffs(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 max_csma_backoffs)
+{
+ int ret;
+
+ trace_802154_rdev_set_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ ret = rdev->ops->set_max_csma_backoffs(&rdev->wpan_phy, wpan_dev,
+ max_csma_backoffs);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_max_frame_retries(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, s8 max_frame_retries)
+{
+ int ret;
+
+ trace_802154_rdev_set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ max_frame_retries);
+ ret = rdev->ops->set_max_frame_retries(&rdev->wpan_phy, wpan_dev,
+ max_frame_retries);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_lbt_mode(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, bool mode)
+{
+ int ret;
+
+ trace_802154_rdev_set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ ret = rdev->ops->set_lbt_mode(&rdev->wpan_phy, wpan_dev, mode);
+ trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+ return ret;
+}
+
+#endif /* __CFG802154_RDEV_OPS */
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
new file mode 100644
index 000000000..627a25376
--- /dev/null
+++ b/net/ieee802154/socket.c
@@ -0,0 +1,1128 @@
+/*
+ * IEEE802154.4 socket interface
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ */
+
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if.h>
+#include <linux/termios.h> /* For TIOCOUTQ/INQ */
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/route.h>
+
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+
+/* Utility function for families */
+static struct net_device*
+ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr)
+{
+ struct net_device *dev = NULL;
+ struct net_device *tmp;
+ __le16 pan_id, short_addr;
+ u8 hwaddr[IEEE802154_ADDR_LEN];
+
+ switch (addr->mode) {
+ case IEEE802154_ADDR_LONG:
+ ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr);
+ rcu_read_lock();
+ dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ break;
+ case IEEE802154_ADDR_SHORT:
+ if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) ||
+ addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) ||
+ addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST))
+ break;
+
+ rtnl_lock();
+
+ for_each_netdev(net, tmp) {
+ if (tmp->type != ARPHRD_IEEE802154)
+ continue;
+
+ pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp);
+ short_addr =
+ ieee802154_mlme_ops(tmp)->get_short_addr(tmp);
+
+ if (pan_id == addr->pan_id &&
+ short_addr == addr->short_addr) {
+ dev = tmp;
+ dev_hold(dev);
+ break;
+ }
+ }
+
+ rtnl_unlock();
+ break;
+ default:
+ pr_warn("Unsupported ieee802154 address type: %d\n",
+ addr->mode);
+ break;
+ }
+
+ return dev;
+}
+
+static int ieee802154_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, 0);
+ }
+ return 0;
+}
+
+static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+
+ return sk->sk_prot->sendmsg(sk, msg, len);
+}
+
+static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+ return sock_no_bind(sock, uaddr, addr_len);
+}
+
+static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+
+static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
+ unsigned int cmd)
+{
+ struct ifreq ifr;
+ int ret = -ENOIOCTLCMD;
+ struct net_device *dev;
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+ dev_load(sock_net(sk), ifr.ifr_name);
+ dev = dev_get_by_name(sock_net(sk), ifr.ifr_name);
+
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
+ ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
+
+ if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ ret = -EFAULT;
+ dev_put(dev);
+
+ return ret;
+}
+
+static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ return sock_get_timestamp(sk, (struct timeval __user *)arg);
+ case SIOCGSTAMPNS:
+ return sock_get_timestampns(sk, (struct timespec __user *)arg);
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg,
+ cmd);
+ default:
+ if (!sk->sk_prot->ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->ioctl(sk, cmd, arg);
+ }
+}
+
+/* RAW Sockets (802.15.4 created in userspace) */
+static HLIST_HEAD(raw_head);
+static DEFINE_RWLOCK(raw_lock);
+
+static void raw_hash(struct sock *sk)
+{
+ write_lock_bh(&raw_lock);
+ sk_add_node(sk, &raw_head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&raw_lock);
+}
+
+static void raw_unhash(struct sock *sk)
+{
+ write_lock_bh(&raw_lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&raw_lock);
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len)
+{
+ struct ieee802154_addr addr;
+ struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr;
+ int err = 0;
+ struct net_device *dev = NULL;
+
+ if (len < sizeof(*uaddr))
+ return -EINVAL;
+
+ uaddr = (struct sockaddr_ieee802154 *)_uaddr;
+ if (uaddr->family != AF_IEEE802154)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ ieee802154_addr_from_sa(&addr, &uaddr->addr);
+ dev = ieee802154_get_dev(sock_net(sk), &addr);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (dev->type != ARPHRD_IEEE802154) {
+ err = -ENODEV;
+ goto out_put;
+ }
+
+ sk->sk_bound_dev_if = dev->ifindex;
+ sk_dst_reset(sk);
+
+out_put:
+ dev_put(dev);
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int raw_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ return -ENOTSUPP;
+}
+
+static int raw_disconnect(struct sock *sk, int flags)
+{
+ return 0;
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct net_device *dev;
+ unsigned int mtu;
+ struct sk_buff *skb;
+ int hlen, tlen;
+ int err;
+
+ if (msg->msg_flags & MSG_OOB) {
+ pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+ return -EOPNOTSUPP;
+ }
+
+ lock_sock(sk);
+ if (!sk->sk_bound_dev_if)
+ dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+ else
+ dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if);
+ release_sock(sk);
+
+ if (!dev) {
+ pr_debug("no dev\n");
+ err = -ENXIO;
+ goto out;
+ }
+
+ mtu = dev->mtu;
+ pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+ if (size > mtu) {
+ pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+ err = -EINVAL;
+ goto out_dev;
+ }
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk, hlen + tlen + size,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto out_dev;
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto out_skb;
+
+ skb->dev = dev;
+ skb->sk = sk;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ dev_put(dev);
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ return err ?: size;
+
+out_skb:
+ kfree_skb(skb);
+out_dev:
+ dev_put(dev);
+out:
+ return err;
+}
+
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sk_buff *skb;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk;
+
+ read_lock(&raw_lock);
+ sk_for_each(sk, &raw_head) {
+ bh_lock_sock(sk);
+ if (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == dev->ifindex) {
+ struct sk_buff *clone;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ raw_rcv_skb(sk, clone);
+ }
+ bh_unlock_sock(sk);
+ }
+ read_unlock(&raw_lock);
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct proto ieee802154_raw_prot = {
+ .name = "IEEE-802.15.4-RAW",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sock),
+ .close = raw_close,
+ .bind = raw_bind,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .hash = raw_hash,
+ .unhash = raw_unhash,
+ .connect = raw_connect,
+ .disconnect = raw_disconnect,
+ .getsockopt = raw_getsockopt,
+ .setsockopt = raw_setsockopt,
+};
+
+static const struct proto_ops ieee802154_raw_ops = {
+ .family = PF_IEEE802154,
+ .owner = THIS_MODULE,
+ .release = ieee802154_sock_release,
+ .bind = ieee802154_sock_bind,
+ .connect = ieee802154_sock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = ieee802154_sock_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = ieee802154_sock_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* DGRAM Sockets (802.15.4 dataframes) */
+static HLIST_HEAD(dgram_head);
+static DEFINE_RWLOCK(dgram_lock);
+
+struct dgram_sock {
+ struct sock sk;
+
+ struct ieee802154_addr src_addr;
+ struct ieee802154_addr dst_addr;
+
+ unsigned int bound:1;
+ unsigned int connected:1;
+ unsigned int want_ack:1;
+ unsigned int secen:1;
+ unsigned int secen_override:1;
+ unsigned int seclevel:3;
+ unsigned int seclevel_override:1;
+};
+
+static inline struct dgram_sock *dgram_sk(const struct sock *sk)
+{
+ return container_of(sk, struct dgram_sock, sk);
+}
+
+static void dgram_hash(struct sock *sk)
+{
+ write_lock_bh(&dgram_lock);
+ sk_add_node(sk, &dgram_head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&dgram_lock);
+}
+
+static void dgram_unhash(struct sock *sk)
+{
+ write_lock_bh(&dgram_lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&dgram_lock);
+}
+
+static int dgram_init(struct sock *sk)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ ro->want_ack = 1;
+ return 0;
+}
+
+static void dgram_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
+{
+ struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
+ struct ieee802154_addr haddr;
+ struct dgram_sock *ro = dgram_sk(sk);
+ int err = -EINVAL;
+ struct net_device *dev;
+
+ lock_sock(sk);
+
+ ro->bound = 0;
+
+ if (len < sizeof(*addr))
+ goto out;
+
+ if (addr->family != AF_IEEE802154)
+ goto out;
+
+ ieee802154_addr_from_sa(&haddr, &addr->addr);
+ dev = ieee802154_get_dev(sock_net(sk), &haddr);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (dev->type != ARPHRD_IEEE802154) {
+ err = -ENODEV;
+ goto out_put;
+ }
+
+ ro->src_addr = haddr;
+
+ ro->bound = 1;
+ err = 0;
+out_put:
+ dev_put(dev);
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ unsigned long amount;
+
+ amount = 0;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb) {
+ /* We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ amount = skb->len - ieee802154_hdr_length(skb);
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+/* FIXME: autobind */
+static int dgram_connect(struct sock *sk, struct sockaddr *uaddr,
+ int len)
+{
+ struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
+ struct dgram_sock *ro = dgram_sk(sk);
+ int err = 0;
+
+ if (len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->family != AF_IEEE802154)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (!ro->bound) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+
+ ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr);
+ ro->connected = 1;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int dgram_disconnect(struct sock *sk, int flags)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ lock_sock(sk);
+ ro->connected = 0;
+ release_sock(sk);
+
+ return 0;
+}
+
+static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct net_device *dev;
+ unsigned int mtu;
+ struct sk_buff *skb;
+ struct ieee802154_mac_cb *cb;
+ struct dgram_sock *ro = dgram_sk(sk);
+ struct ieee802154_addr dst_addr;
+ int hlen, tlen;
+ int err;
+
+ if (msg->msg_flags & MSG_OOB) {
+ pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+ return -EOPNOTSUPP;
+ }
+
+ if (!ro->connected && !msg->msg_name)
+ return -EDESTADDRREQ;
+ else if (ro->connected && msg->msg_name)
+ return -EISCONN;
+
+ if (!ro->bound)
+ dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+ else
+ dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr);
+
+ if (!dev) {
+ pr_debug("no dev\n");
+ err = -ENXIO;
+ goto out;
+ }
+ mtu = dev->mtu;
+ pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+ if (size > mtu) {
+ pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+ err = -EMSGSIZE;
+ goto out_dev;
+ }
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk, hlen + tlen + size,
+ msg->msg_flags & MSG_DONTWAIT,
+ &err);
+ if (!skb)
+ goto out_dev;
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+
+ cb = mac_cb_init(skb);
+ cb->type = IEEE802154_FC_TYPE_DATA;
+ cb->ackreq = ro->want_ack;
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_ieee802154*,
+ daddr, msg->msg_name);
+
+ ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
+ } else {
+ dst_addr = ro->dst_addr;
+ }
+
+ cb->secen = ro->secen;
+ cb->secen_override = ro->secen_override;
+ cb->seclevel = ro->seclevel;
+ cb->seclevel_override = ro->seclevel_override;
+
+ err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &dst_addr,
+ ro->bound ? &ro->src_addr : NULL, size);
+ if (err < 0)
+ goto out_skb;
+
+ err = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (err < 0)
+ goto out_skb;
+
+ skb->dev = dev;
+ skb->sk = sk;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ dev_put(dev);
+
+ err = dev_queue_xmit(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ return err ?: size;
+
+out_skb:
+ kfree_skb(skb);
+out_dev:
+ dev_put(dev);
+out:
+ return err;
+}
+
+static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* FIXME: skip headers if necessary ?! */
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (saddr) {
+ /* Clear the implicit padding in struct sockaddr_ieee802154
+ * (16 bits between 'family' and 'addr') and in struct
+ * ieee802154_addr_sa (16 bits at the end of the structure).
+ */
+ memset(saddr, 0, sizeof(*saddr));
+
+ saddr->family = AF_IEEE802154;
+ ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source);
+ *addr_len = sizeof(*saddr);
+ }
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+static inline bool
+ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr,
+ struct dgram_sock *ro)
+{
+ if (!ro->bound)
+ return true;
+
+ if (ro->src_addr.mode == IEEE802154_ADDR_LONG &&
+ hw_addr == ro->src_addr.extended_addr)
+ return true;
+
+ if (ro->src_addr.mode == IEEE802154_ADDR_SHORT &&
+ pan_id == ro->src_addr.pan_id &&
+ short_addr == ro->src_addr.short_addr)
+ return true;
+
+ return false;
+}
+
+static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb)
+{
+ struct sock *sk, *prev = NULL;
+ int ret = NET_RX_SUCCESS;
+ __le16 pan_id, short_addr;
+ __le64 hw_addr;
+
+ /* Data frame processing */
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+ short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev);
+ hw_addr = ieee802154_devaddr_from_raw(dev->dev_addr);
+
+ read_lock(&dgram_lock);
+ sk_for_each(sk, &dgram_head) {
+ if (ieee802154_match_sock(hw_addr, pan_id, short_addr,
+ dgram_sk(sk))) {
+ if (prev) {
+ struct sk_buff *clone;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ dgram_rcv_skb(prev, clone);
+ }
+
+ prev = sk;
+ }
+ }
+
+ if (prev) {
+ dgram_rcv_skb(prev, skb);
+ } else {
+ kfree_skb(skb);
+ ret = NET_RX_DROP;
+ }
+ read_unlock(&dgram_lock);
+
+ return ret;
+}
+
+static int dgram_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+
+ int val, len;
+
+ if (level != SOL_IEEE802154)
+ return -EOPNOTSUPP;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ switch (optname) {
+ case WPAN_WANTACK:
+ val = ro->want_ack;
+ break;
+ case WPAN_SECURITY:
+ if (!ro->secen_override)
+ val = WPAN_SECURITY_DEFAULT;
+ else if (ro->secen)
+ val = WPAN_SECURITY_ON;
+ else
+ val = WPAN_SECURITY_OFF;
+ break;
+ case WPAN_SECURITY_LEVEL:
+ if (!ro->seclevel_override)
+ val = WPAN_SECURITY_LEVEL_DEFAULT;
+ else
+ val = ro->seclevel;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int dgram_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct dgram_sock *ro = dgram_sk(sk);
+ struct net *net = sock_net(sk);
+ int val;
+ int err = 0;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case WPAN_WANTACK:
+ ro->want_ack = !!val;
+ break;
+ case WPAN_SECURITY:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
+ !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ break;
+ }
+
+ switch (val) {
+ case WPAN_SECURITY_DEFAULT:
+ ro->secen_override = 0;
+ break;
+ case WPAN_SECURITY_ON:
+ ro->secen_override = 1;
+ ro->secen = 1;
+ break;
+ case WPAN_SECURITY_OFF:
+ ro->secen_override = 1;
+ ro->secen = 0;
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ break;
+ case WPAN_SECURITY_LEVEL:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
+ !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ break;
+ }
+
+ if (val < WPAN_SECURITY_LEVEL_DEFAULT ||
+ val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) {
+ err = -EINVAL;
+ } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) {
+ ro->seclevel_override = 0;
+ } else {
+ ro->seclevel_override = 1;
+ ro->seclevel = val;
+ }
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static struct proto ieee802154_dgram_prot = {
+ .name = "IEEE-802.15.4-MAC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct dgram_sock),
+ .init = dgram_init,
+ .close = dgram_close,
+ .bind = dgram_bind,
+ .sendmsg = dgram_sendmsg,
+ .recvmsg = dgram_recvmsg,
+ .hash = dgram_hash,
+ .unhash = dgram_unhash,
+ .connect = dgram_connect,
+ .disconnect = dgram_disconnect,
+ .ioctl = dgram_ioctl,
+ .getsockopt = dgram_getsockopt,
+ .setsockopt = dgram_setsockopt,
+};
+
+static const struct proto_ops ieee802154_dgram_ops = {
+ .family = PF_IEEE802154,
+ .owner = THIS_MODULE,
+ .release = ieee802154_sock_release,
+ .bind = ieee802154_sock_bind,
+ .connect = ieee802154_sock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = ieee802154_sock_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = ieee802154_sock_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* Create a socket. Initialise the socket, blank the addresses
+ * set the state.
+ */
+static int ieee802154_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct sock *sk;
+ int rc;
+ struct proto *proto;
+ const struct proto_ops *ops;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_RAW:
+ proto = &ieee802154_raw_prot;
+ ops = &ieee802154_raw_ops;
+ break;
+ case SOCK_DGRAM:
+ proto = &ieee802154_dgram_prot;
+ ops = &ieee802154_dgram_ops;
+ break;
+ default:
+ rc = -ESOCKTNOSUPPORT;
+ goto out;
+ }
+
+ rc = -ENOMEM;
+ sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto);
+ if (!sk)
+ goto out;
+ rc = 0;
+
+ sock->ops = ops;
+
+ sock_init_data(sock, sk);
+ /* FIXME: sk->sk_destruct */
+ sk->sk_family = PF_IEEE802154;
+
+ /* Checksums on by default */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ if (sk->sk_prot->hash)
+ sk->sk_prot->hash(sk);
+
+ if (sk->sk_prot->init) {
+ rc = sk->sk_prot->init(sk);
+ if (rc)
+ sk_common_release(sk);
+ }
+out:
+ return rc;
+}
+
+static const struct net_proto_family ieee802154_family_ops = {
+ .family = PF_IEEE802154,
+ .create = ieee802154_create,
+ .owner = THIS_MODULE,
+};
+
+static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (!netif_running(dev))
+ goto drop;
+ pr_debug("got frame, type %d, dev %p\n", dev->type, dev);
+#ifdef DEBUG
+ print_hex_dump_bytes("ieee802154_rcv ",
+ DUMP_PREFIX_NONE, skb->data, skb->len);
+#endif
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ ieee802154_raw_deliver(dev, skb);
+
+ if (dev->type != ARPHRD_IEEE802154)
+ goto drop;
+
+ if (skb->pkt_type != PACKET_OTHERHOST)
+ return ieee802154_dgram_deliver(dev, skb);
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct packet_type ieee802154_packet_type = {
+ .type = htons(ETH_P_IEEE802154),
+ .func = ieee802154_rcv,
+};
+
+static int __init af_ieee802154_init(void)
+{
+ int rc = -EINVAL;
+
+ rc = proto_register(&ieee802154_raw_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&ieee802154_dgram_prot, 1);
+ if (rc)
+ goto err_dgram;
+
+ /* Tell SOCKET that we are alive */
+ rc = sock_register(&ieee802154_family_ops);
+ if (rc)
+ goto err_sock;
+ dev_add_pack(&ieee802154_packet_type);
+
+ rc = 0;
+ goto out;
+
+err_sock:
+ proto_unregister(&ieee802154_dgram_prot);
+err_dgram:
+ proto_unregister(&ieee802154_raw_prot);
+out:
+ return rc;
+}
+
+static void __exit af_ieee802154_remove(void)
+{
+ dev_remove_pack(&ieee802154_packet_type);
+ sock_unregister(PF_IEEE802154);
+ proto_unregister(&ieee802154_dgram_prot);
+ proto_unregister(&ieee802154_raw_prot);
+}
+
+module_init(af_ieee802154_init);
+module_exit(af_ieee802154_remove);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IEEE802154);
diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c
new file mode 100644
index 000000000..133b42806
--- /dev/null
+++ b/net/ieee802154/sysfs.c
@@ -0,0 +1,79 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/wireless/sysfs.c
+ */
+
+#include <linux/device.h>
+
+#include <net/cfg802154.h>
+
+#include "core.h"
+#include "sysfs.h"
+
+static inline struct cfg802154_registered_device *
+dev_to_rdev(struct device *dev)
+{
+ return container_of(dev, struct cfg802154_registered_device,
+ wpan_phy.dev);
+}
+
+#define SHOW_FMT(name, fmt, member) \
+static ssize_t name ## _show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
+} \
+static DEVICE_ATTR_RO(name)
+
+SHOW_FMT(index, "%d", wpan_phy_idx);
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct wpan_phy *wpan_phy = &dev_to_rdev(dev)->wpan_phy;
+
+ return sprintf(buf, "%s\n", dev_name(&wpan_phy->dev));
+}
+static DEVICE_ATTR_RO(name);
+
+static void wpan_phy_release(struct device *dev)
+{
+ struct cfg802154_registered_device *rdev = dev_to_rdev(dev);
+
+ cfg802154_dev_free(rdev);
+}
+
+static struct attribute *pmib_attrs[] = {
+ &dev_attr_index.attr,
+ &dev_attr_name.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmib);
+
+struct class wpan_phy_class = {
+ .name = "ieee802154",
+ .dev_release = wpan_phy_release,
+ .dev_groups = pmib_groups,
+};
+
+int wpan_phy_sysfs_init(void)
+{
+ return class_register(&wpan_phy_class);
+}
+
+void wpan_phy_sysfs_exit(void)
+{
+ class_unregister(&wpan_phy_class);
+}
diff --git a/net/ieee802154/sysfs.h b/net/ieee802154/sysfs.h
new file mode 100644
index 000000000..aa42e39ec
--- /dev/null
+++ b/net/ieee802154/sysfs.h
@@ -0,0 +1,9 @@
+#ifndef __IEEE802154_SYSFS_H
+#define __IEEE802154_SYSFS_H
+
+int wpan_phy_sysfs_init(void);
+void wpan_phy_sysfs_exit(void);
+
+extern struct class wpan_phy_class;
+
+#endif /* __IEEE802154_SYSFS_H */
diff --git a/net/ieee802154/trace.c b/net/ieee802154/trace.c
new file mode 100644
index 000000000..95f997fad
--- /dev/null
+++ b/net/ieee802154/trace.c
@@ -0,0 +1,7 @@
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
new file mode 100644
index 000000000..5ac25eb6e
--- /dev/null
+++ b/net/ieee802154/trace.h
@@ -0,0 +1,247 @@
+/* Based on net/wireless/tracing.h */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cfg802154
+
+#if !defined(__RDEV_CFG802154_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __RDEV_CFG802154_OPS_TRACE
+
+#include <linux/tracepoint.h>
+
+#include <net/cfg802154.h>
+
+#define MAXNAME 32
+#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME)
+#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \
+ wpan_phy_name(wpan_phy), \
+ MAXNAME)
+#define WPAN_PHY_PR_FMT "%s"
+#define WPAN_PHY_PR_ARG __entry->wpan_phy_name
+
+#define WPAN_DEV_ENTRY __field(u32, identifier)
+#define WPAN_DEV_ASSIGN (__entry->identifier) = (!IS_ERR_OR_NULL(wpan_dev) \
+ ? wpan_dev->identifier : 0)
+#define WPAN_DEV_PR_FMT "wpan_dev(%u)"
+#define WPAN_DEV_PR_ARG (__entry->identifier)
+
+#define WPAN_CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \
+ __field(enum nl802154_cca_opts, cca_opt)
+#define WPAN_CCA_ASSIGN \
+ do { \
+ (__entry->cca_mode) = cca->mode; \
+ (__entry->cca_opt) = cca->opt; \
+ } while (0)
+#define WPAN_CCA_PR_FMT "cca_mode: %d, cca_opt: %d"
+#define WPAN_CCA_PR_ARG __entry->cca_mode, __entry->cca_opt
+
+#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
+
+/*************************************************************
+ * rdev->ops traces *
+ *************************************************************/
+
+TRACE_EVENT(802154_rdev_add_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, char *name,
+ enum nl802154_iftype type, __le64 extended_addr),
+ TP_ARGS(wpan_phy, name, type, extended_addr),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __string(vir_intf_name, name ? name : "<noname>")
+ __field(enum nl802154_iftype, type)
+ __field(__le64, extended_addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __assign_str(vir_intf_name, name ? name : "<noname>");
+ __entry->type = type;
+ __entry->extended_addr = extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", virtual intf name: %s, type: %d, ea %llx",
+ WPAN_PHY_PR_ARG, __get_str(vir_intf_name), __entry->type,
+ __le64_to_cpu(__entry->extended_addr))
+);
+
+TRACE_EVENT(802154_rdev_del_virtual_intf,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG)
+);
+
+TRACE_EVENT(802154_rdev_set_channel,
+ TP_PROTO(struct wpan_phy *wpan_phy, u8 page, u8 channel),
+ TP_ARGS(wpan_phy, page, channel),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, page)
+ __field(u8, channel)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->page = page;
+ __entry->channel = channel;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", page: %d, channel: %d", WPAN_PHY_PR_ARG,
+ __entry->page, __entry->channel)
+);
+
+TRACE_EVENT(802154_rdev_set_cca_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca),
+ TP_ARGS(wpan_phy, cca),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_CCA_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_CCA_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_CCA_PR_FMT, WPAN_PHY_PR_ARG,
+ WPAN_CCA_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(802154_le16_template,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le16, le16arg)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->le16arg = le16arg;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", pan id: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+DEFINE_EVENT(802154_le16_template, 802154_rdev_set_pan_id,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg)
+);
+
+DEFINE_EVENT_PRINT(802154_le16_template, 802154_rdev_set_short_addr,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 le16arg),
+ TP_ARGS(wpan_phy, wpan_dev, le16arg),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", sa: 0x%04x",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG,
+ __le16_to_cpu(__entry->le16arg))
+);
+
+TRACE_EVENT(802154_rdev_set_backoff_exponent,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 min_be, u8 max_be),
+ TP_ARGS(wpan_phy, wpan_dev, min_be, max_be),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, min_be)
+ __field(u8, max_be)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->min_be = min_be;
+ __entry->max_be = max_be;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", min be: %d, max_be: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->min_be, __entry->max_be)
+);
+
+TRACE_EVENT(802154_rdev_set_csma_backoffs,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ u8 max_csma_backoffs),
+ TP_ARGS(wpan_phy, wpan_dev, max_csma_backoffs),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(u8, max_csma_backoffs)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_csma_backoffs = max_csma_backoffs;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max csma backoffs: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_csma_backoffs)
+);
+
+TRACE_EVENT(802154_rdev_set_max_frame_retries,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ s8 max_frame_retries),
+ TP_ARGS(wpan_phy, wpan_dev, max_frame_retries),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(s8, max_frame_retries)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->max_frame_retries = max_frame_retries;
+ ),
+
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", max frame retries: %d", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, __entry->max_frame_retries)
+);
+
+TRACE_EVENT(802154_rdev_set_lbt_mode,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool mode),
+ TP_ARGS(wpan_phy, wpan_dev, mode),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(bool, mode)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->mode = mode;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT
+ ", lbt mode: %s", WPAN_PHY_PR_ARG,
+ WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode))
+);
+
+TRACE_EVENT(802154_rdev_return_int,
+ TP_PROTO(struct wpan_phy *wpan_phy, int ret),
+ TP_ARGS(wpan_phy, ret),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", returned: %d", WPAN_PHY_PR_ARG,
+ __entry->ret)
+);
+
+#endif /* !__RDEV_CFG802154_OPS_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000..b295af069
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,696 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+ bool "IP: multicasting"
+ help
+ This is code for addressing several networked computers at once,
+ enlarging your kernel by about 2 KB. You need multicasting if you
+ intend to participate in the MBONE, a high bandwidth network on top
+ of the Internet which carries audio and video broadcasts. More
+ information about the MBONE is on the WWW at
+ <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
+
+config IP_ADVANCED_ROUTER
+ bool "IP: advanced router"
+ ---help---
+ If you intend to run your Linux box mostly as a router, i.e. as a
+ computer that forwards and redistributes network packets, say Y; you
+ will then be presented with several options that allow more precise
+ control about the routing process.
+
+ The answer to this question won't directly affect the kernel:
+ answering N will just cause the configurator to skip all the
+ questions about advanced routing.
+
+ Note that your box can only act as a router if you enable IP
+ forwarding in your kernel; you can do that by saying Y to "/proc
+ file system support" and "Sysctl support" below and executing the
+ line
+
+ echo "1" > /proc/sys/net/ipv4/ip_forward
+
+ at boot time after the /proc file system has been mounted.
+
+ If you turn on IP forwarding, you should consider the rp_filter, which
+ automatically rejects incoming packets if the routing table entry
+ for their source address doesn't match the network interface they're
+ arriving on. This has security advantages because it prevents the
+ so-called IP spoofing, however it can pose problems if you use
+ asymmetric routing (packets from you to a host take a different path
+ than packets from that host to you) or if you operate a non-routing
+ host which has several IP addresses on different interfaces. To turn
+ rp_filter on use:
+
+ echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
+ If unsure, say N here.
+
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
+ depends on IP_ADVANCED_ROUTER
+ ---help---
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
+
+config IP_MULTIPLE_TABLES
+ bool "IP: policy routing"
+ depends on IP_ADVANCED_ROUTER
+ select FIB_RULES
+ ---help---
+ Normally, a router decides what to do with a received packet based
+ solely on the packet's final destination address. If you say Y here,
+ the Linux router will also be able to take the packet's source
+ address into account. Furthermore, the TOS (Type-Of-Service) field
+ of the packet can be used for routing decisions as well.
+
+ If you are interested in this, please see the preliminary
+ documentation at <http://www.compendium.com.ar/policy-routing.txt>
+ and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+ You will need supporting software from
+ <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+ If unsure, say N.
+
+config IP_ROUTE_MULTIPATH
+ bool "IP: equal cost multipath"
+ depends on IP_ADVANCED_ROUTER
+ help
+ Normally, the routing tables specify a single action to be taken in
+ a deterministic manner for a given packet. If you say Y here
+ however, it becomes possible to attach several actions to a packet
+ pattern, in effect specifying several alternative paths to travel
+ for those packets. The router considers all these paths to be of
+ equal "cost" and chooses one of them in a non-deterministic fashion
+ if a matching packet arrives.
+
+config IP_ROUTE_VERBOSE
+ bool "IP: verbose route monitoring"
+ depends on IP_ADVANCED_ROUTER
+ help
+ If you say Y here, which is recommended, then the kernel will print
+ verbose messages regarding the routing, for example warnings about
+ received packets which look strange and could be evidence of an
+ attack or a misconfigured system somewhere. The information is
+ handled by the klogd daemon which is responsible for kernel messages
+ ("man klogd").
+
+config IP_ROUTE_CLASSID
+ bool
+
+config IP_PNP
+ bool "IP: kernel level autoconfiguration"
+ help
+ This enables automatic configuration of IP addresses of devices and
+ of the routing table during kernel boot, based on either information
+ supplied on the kernel command line or by BOOTP or RARP protocols.
+ You need to say Y only for diskless machines requiring network
+ access to boot (in which case you want to say Y to "Root file system
+ on NFS" as well), because all other machines configure the network
+ in their startup scripts.
+
+config IP_PNP_DHCP
+ bool "IP: DHCP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the DHCP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does DHCP itself, providing all necessary information on the kernel
+ command line, you can say N here.
+
+ If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+ must be operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+ bool "IP: BOOTP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the BOOTP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does BOOTP itself, providing all necessary information on the kernel
+ command line, you can say N here. If unsure, say Y. Note that if you
+ want to use BOOTP, a BOOTP server must be operating on your network.
+ Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+ bool "IP: RARP support"
+ depends on IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the RARP protocol (an
+ older protocol which is being obsoleted by BOOTP and DHCP), say Y
+ here. Note that if you want to use RARP, a RARP server must be
+ operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config NET_IPIP
+ tristate "IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ encapsulation of IP within IP, which sounds kind of pointless, but
+ can be useful if you want to make your (or some other) machine
+ appear on a different network than it physically is, or to use
+ mobile-IP facilities (allowing laptops to seamlessly move between
+ networks without changing their IP addresses).
+
+ Saying Y to this option will produce two modules ( = code which can
+ be inserted in and removed from the running kernel whenever you
+ want). Most people won't need this and can say N.
+
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
+config NET_IP_TUNNEL
+ tristate
+ default n
+
+config NET_IPGRE
+ tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ select NET_IP_TUNNEL
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+config NET_IPGRE_BROADCAST
+ bool "IP: broadcast GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE
+ help
+ One application of GRE/IP is to construct a broadcast WAN (Wide Area
+ Network), which looks like a normal Ethernet LAN (Local Area
+ Network), but can be distributed all over the Internet. If you want
+ to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+ bool "IP: multicast routing"
+ depends on IP_MULTICAST
+ help
+ This is used if you want your machine to act as a router for IP
+ packets that have several destination addresses. It is needed on the
+ MBONE, a high bandwidth network on top of the Internet which carries
+ audio and video broadcasts. In order to do that, you would most
+ likely run the program mrouted. If you haven't heard about it, you
+ don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
+config IP_PIMSM_V1
+ bool "IP: PIM-SM version 1 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM (Protocol Independent
+ Multicast) version 1. This multicast routing protocol is used widely
+ because Cisco supports it. You need special software to use it
+ (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+ information about PIM.
+
+ Say Y if you want to use PIM-SM v1. Note that you can say N here if
+ you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+ bool "IP: PIM-SM version 2 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM version 2. In order to use
+ this, you need an experimental routing daemon supporting it (pimd or
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
+config SYN_COOKIES
+ bool "IP: TCP syncookie support"
+ ---help---
+ Normal TCP/IP networking is open to an attack known as "SYN
+ flooding". This denial-of-service attack prevents legitimate remote
+ users from being able to connect to your computer during an ongoing
+ attack and requires very little work from the attacker, who can
+ operate from anywhere on the Internet.
+
+ SYN cookies provide protection against this type of attack. If you
+ say Y here, the TCP/IP stack will use a cryptographic challenge
+ protocol known as "SYN cookies" to enable legitimate users to
+ continue to connect, even when your machine is under attack. There
+ is no need for the legitimate users to change their TCP/IP software;
+ SYN cookies work transparently to them. For technical information
+ about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+ If you are SYN flooded, the source address reported by the kernel is
+ likely to have been forged by the attacker; it is only reported as
+ an aid in tracing the packets to their actual source and should not
+ be taken as absolute truth.
+
+ SYN cookies may prevent correct error reporting on clients when the
+ server is really overloaded. If this happens frequently better turn
+ them off.
+
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
+ "Sysctl support" below and executing the command
+
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+ after the /proc file system has been mounted.
+
+ If unsure, say N.
+
+config NET_IPVTI
+ tristate "Virtual (secure) IP: tunneling"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ depends on INET_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
+config NET_UDP_TUNNEL
+ tristate
+ select NET_IP_TUNNEL
+ default n
+
+config NET_FOU
+ tristate "IP: Foo (IP protocols) over UDP"
+ select XFRM
+ select NET_UDP_TUNNEL
+ ---help---
+ Foo over UDP allows any IP protocol to be directly encapsulated
+ over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
+ network mechanisms and optimizations for UDP (such as ECMP
+ and RSS) can be leveraged to provide better service.
+
+config NET_FOU_IP_TUNNELS
+ bool "IP: FOU encapsulation of IP tunnels"
+ depends on NET_IPIP || NET_IPGRE || IPV6_SIT
+ select NET_FOU
+ ---help---
+ Allow configuration of FOU or GUE encapsulation for IP tunnels.
+ When this option is enabled IP tunnels can be configured to use
+ FOU or GUE encapsulation.
+
+config GENEVE
+ tristate "Generic Network Virtualization Encapsulation (Geneve)"
+ depends on INET
+ select NET_UDP_TUNNEL
+ ---help---
+ This allows one to create Geneve virtual interfaces that provide
+ Layer 2 Networks over Layer 3 Networks. Geneve is often used
+ to tunnel virtual network infrastructure in virtualized environments.
+ For more information see:
+ http://tools.ietf.org/html/draft-gross-geneve-01
+
+ To compile this driver as a module, choose M here: the module
+
+
+config INET_AH
+ tristate "IP: AH transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET_ESP
+ tristate "IP: ESP transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_AUTHENC
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_CBC
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET_IPCOMP
+ tristate "IP: IPComp transformation"
+ select INET_XFRM_TUNNEL
+ select XFRM_IPCOMP
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config INET_XFRM_TUNNEL
+ tristate
+ select INET_TUNNEL
+ default n
+
+config INET_TUNNEL
+ tristate
+ default n
+
+config INET_XFRM_MODE_TRANSPORT
+ tristate "IP: IPsec transport mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec transport mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+ tristate "IP: IPsec tunnel mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec tunnel mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_BEET
+ tristate "IP: IPsec BEET mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec BEET mode.
+
+ If unsure, say Y.
+
+config INET_LRO
+ tristate "Large Receive Offload (ipv4/tcp)"
+ default y
+ ---help---
+ Support for Large Receive Offload (ipv4/tcp).
+
+ If unsure, say Y.
+
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
+ default y
+ ---help---
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
+ If unsure, say Y.
+
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for UDP socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
+menuconfig TCP_CONG_ADVANCED
+ bool "TCP: advanced congestion control"
+ ---help---
+ Support for selection of various TCP congestion control
+ modules.
+
+ Nearly all users can safely say no here, and a safe default
+ selection will be made (CUBIC with new Reno as a fallback).
+
+ If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ default m
+ ---help---
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default y
+ ---help---
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+ tristate "H-TCP"
+ default m
+ ---help---
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+ tristate "High Speed TCP"
+ default n
+ ---help---
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
+
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+ tristate "Scalable TCP"
+ default n
+ ---help---
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+ tristate "TCP Low Priority"
+ default n
+ ---help---
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+ tristate "TCP Veno"
+ default n
+ ---help---
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+
+config TCP_CONG_YEAH
+ tristate "YeAH TCP"
+ select TCP_CONG_VEGAS
+ default n
+ ---help---
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. It's design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
+
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+ tristate "TCP Illinois"
+ default n
+ ---help---
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
+
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+config TCP_CONG_DCTCP
+ tristate "DataCenter TCP (DCTCP)"
+ default n
+ ---help---
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
+
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
+
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
+
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+
+choice
+ prompt "Default TCP congestion control"
+ default DEFAULT_CUBIC
+ help
+ Select the TCP congestion control that will be used by default
+ for all connections.
+
+ config DEFAULT_BIC
+ bool "Bic" if TCP_CONG_BIC=y
+
+ config DEFAULT_CUBIC
+ bool "Cubic" if TCP_CONG_CUBIC=y
+
+ config DEFAULT_HTCP
+ bool "Htcp" if TCP_CONG_HTCP=y
+
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
+ config DEFAULT_VEGAS
+ bool "Vegas" if TCP_CONG_VEGAS=y
+
+ config DEFAULT_YEAH
+ bool "YeAH" if TCP_CONG_YEAH=y
+
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
+ config DEFAULT_WESTWOOD
+ bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+ config DEFAULT_DCTCP
+ bool "DCTCP" if TCP_CONG_DCTCP=y
+
+ config DEFAULT_RENO
+ bool "Reno"
+endchoice
+
+endif
+
+config TCP_CONG_CUBIC
+ tristate
+ depends on !TCP_CONG_ADVANCED
+ default y
+
+config DEFAULT_TCP_CONG
+ string
+ default "bic" if DEFAULT_BIC
+ default "cubic" if DEFAULT_CUBIC
+ default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
+ default "vegas" if DEFAULT_VEGAS
+ default "yeah" if DEFAULT_YEAH
+ default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
+ default "reno" if DEFAULT_RENO
+ default "dctcp" if DEFAULT_DCTCP
+ default "cubic"
+
+config TCP_MD5SIG
+ bool "TCP: MD5 Signature Option support (RFC2385)"
+ select CRYPTO
+ select CRYPTO_MD5
+ ---help---
+ RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+ Its main (only?) use is to protect BGP sessions between core routers
+ on the Internet.
+
+ If unsure, say N.
+
+config TCP_STEALTH
+ bool "TCP: Stealth TCP socket support"
+ default n
+ ---help---
+ This option enables support for stealth TCP sockets. If you do not
+ know what this means, you do not need it.
+
+ If unsure, say N.
+
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000..518c04ed6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,62 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y := route.o inetpeer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
+ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+
+obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+gre-y := gre_demux.o
+obj-$(CONFIG_NET_FOU) += fou.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
+obj-$(CONFIG_NET_IPVTI) += ip_vti.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
+obj-$(CONFIG_INET_LRO) += inet_lro.o
+obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_GENEVE) += geneve.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000..a5aa54ea6
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1838 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ * piggy,
+ * Karl Knutson : Socket protocol table
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock
+ * structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state
+ * moved to close when you look carefully.
+ * With this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets.
+ * Note that FreeBSD at least was broken
+ * in this respect so be careful with
+ * compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for
+ * compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * specifically application requested.
+ * Alan Cox : New buffering throughout IP. Used
+ * dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense
+ * interpretation of listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/secure_seq.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+/* New destruction routine */
+
+void inet_sock_destruct(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ sk_mem_reclaim(sk);
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+ pr_err("Attempt to release TCP socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ dst_release(sk->sk_rx_dst);
+ sk_refcnt_debug_dec(sk);
+}
+EXPORT_SYMBOL(inet_sock_destruct);
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ struct inet_sock *inet;
+ /* We may need to bind the socket. */
+ lock_sock(sk);
+ inet = inet_sk(sk);
+ if (!inet->inet_num) {
+ if (sk->sk_prot->get_port(sk, 0)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ inet->inet_sport = htons(inet->inet_num);
+ }
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ /* Check special setups for testing purpose to enable TFO w/o
+ * requiring TCP_FASTOPEN sockopt.
+ * Note that only TCP sockets (SOCK_STREAM) will reach here.
+ * Also fastopenq may already been allocated because this
+ * socket was in TCP_LISTEN state previously but was
+ * shutdown() (rather than close()).
+ */
+ if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+ !inet_csk(sk)->icsk_accept_queue.fastopenq) {
+ if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+ err = fastopen_init_queue(sk, backlog);
+ else if ((sysctl_tcp_fastopen &
+ TFO_SERVER_WO_SOCKOPT2) != 0)
+ err = fastopen_init_queue(sk,
+ ((uint)sysctl_tcp_fastopen) >> 16);
+ else
+ err = 0;
+ if (err)
+ goto out;
+
+ tcp_fastopen_init_key_once(true);
+ }
+ err = inet_csk_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_listen);
+
+/*
+ * Create an inet socket.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct inet_protosw *answer;
+ struct inet_sock *inet;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ int try_loading_module = 0;
+ int err;
+
+ sock->state = SS_UNCONNECTED;
+
+ /* Look for the requested type/protocol pair. */
+lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
+ rcu_read_lock();
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+
+ err = 0;
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ err = -EPROTONOSUPPORT;
+ }
+
+ if (unlikely(err)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
+ err = -EPERM;
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ WARN_ON(!answer_prot->slab);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+ if (!sk)
+ goto out;
+
+ err = 0;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
+ inet = inet_sk(sk);
+ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+ inet->nodefrag = 0;
+
+ if (SOCK_RAW == sock->type) {
+ inet->inet_num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+
+ inet->inet_id = 0;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+ inet->uc_ttl = -1;
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_all = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+ inet->rcv_tos = 0;
+
+ sk_refcnt_debug_inc(sk);
+
+ if (inet->inet_num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares.
+ */
+ inet->inet_sport = htons(inet->inet_num);
+ /* Add to protocol hash chains. */
+ sk->sk_prot->hash(sk);
+ }
+
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err)
+ sk_common_release(sk);
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(inet_release);
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ unsigned short snum;
+ int chk_addr_ret;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ err = sk->sk_prot->bind(sk, uaddr, addr_len);
+ goto out;
+ }
+ err = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ /* Not specified by any standard per-se, however it breaks too
+ * many applications when removed. It is unfortunate since
+ * allowing applications to make a non-local bind solves
+ * several problems with systems using dynamic addressing.
+ * (ie. your servers still start up even if your ISDN link
+ * is temporarily down)
+ */
+ err = -EADDRNOTAVAIL;
+ if (!net->ipv4.sysctl_ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ snum = ntohs(addr->sin_port);
+ err = -EACCES;
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ goto out;
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
+ goto out_release_sock;
+
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out_release_sock;
+ }
+
+ if (inet->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sk_dst_reset(sk);
+ err = 0;
+out_release_sock:
+ release_sock(sk);
+out:
+ return err;
+}
+EXPORT_SYMBOL(inet_bind);
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->sk_prot->disconnect(sk, flags);
+
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ return -EAGAIN;
+ return sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_dgram_connect);
+
+static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending += writebias;
+
+ /* Basic assumption: if someone sets sk->sk_err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
+ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ sk->sk_write_pending -= writebias;
+ return timeo;
+}
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ /* Fall out of switch with err, set for this state */
+ break;
+ case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ err = sk->sk_prot->connect(sk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ sock->state = SS_CONNECTING;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
+ tcp_sk(sk)->fastopen_req &&
+ tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
+ goto out;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+ * and error was received after socket entered established state.
+ * Hence, it is handled normally after connect() return successfully.
+ */
+
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ return err;
+
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
+ goto out;
+}
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ int err;
+
+ lock_sock(sock->sk);
+ err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+ release_sock(sock->sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk;
+ int err = -EINVAL;
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+ if (!sk2)
+ goto do_err;
+
+ lock_sock(sk2);
+
+ sock_rps_record_flow(sk2);
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+
+ sock_graft(sk2, newsock);
+
+ newsock->state = SS_CONNECTED;
+ err = 0;
+ release_sock(sk2);
+do_err:
+ return err;
+}
+EXPORT_SYMBOL(inet_accept);
+
+
+/*
+ * This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1))
+ return -ENOTCONN;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ } else {
+ __be32 addr = inet->inet_rcv_saddr;
+ if (!addr)
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+EXPORT_SYMBOL(inet_getname);
+
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
+ return -EAGAIN;
+
+ return sk->sk_prot->sendmsg(sk, msg, size);
+}
+EXPORT_SYMBOL(inet_sendmsg);
+
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* We may need to bind the socket. */
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
+ return -EAGAIN;
+
+ if (sk->sk_prot->sendpage)
+ return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+ return sock_no_sendpage(sock, page, offset, size, flags);
+}
+EXPORT_SYMBOL(inet_sendpage);
+
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ sock_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
+
+int inet_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+ 1->2 bit 2 snds.
+ 2->3 */
+ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ /* Hack to wake up other listeners, who can poll for
+ POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ default:
+ sk->sk_shutdown |= how;
+ if (sk->sk_prot->shutdown)
+ sk->sk_prot->shutdown(sk, how);
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_shutdown);
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ * loads the devconfigure module does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ struct net *net = sock_net(sk);
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(net, cmd, (void __user *)arg);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ return err;
+}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = -ENOIOCTLCMD;
+
+ if (sk->sk_prot->compat_ioctl)
+ err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+ return err;
+}
+#endif
+
+const struct proto_ops inet_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ .poll = tcp_poll,
+ .ioctl = inet_ioctl,
+ .listen = inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_stream_ops);
+
+const struct proto_ops inet_dgram_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = udp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_dgram_ops);
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static const struct proto_ops inet_sockraw_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+
+static const struct net_proto_family inet_family_ops = {
+ .family = PF_INET,
+ .create = inet_create,
+ .owner = THIS_MODULE,
+};
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+ },
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
+ }
+};
+
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
+
+void inet_register_protosw(struct inet_protosw *p)
+{
+ struct list_head *lh;
+ struct inet_protosw *answer;
+ int protocol = p->protocol;
+ struct list_head *last_perm;
+
+ spin_lock_bh(&inetsw_lock);
+
+ if (p->type >= SOCK_MAX)
+ goto out_illegal;
+
+ /* If we are trying to override a permanent protocol, bail. */
+ answer = NULL;
+ last_perm = &inetsw[p->type];
+ list_for_each(lh, &inetsw[p->type]) {
+ answer = list_entry(lh, struct inet_protosw, list);
+
+ /* Check only the non-wild match. */
+ if (INET_PROTOSW_PERMANENT & answer->flags) {
+ if (protocol == answer->protocol)
+ break;
+ last_perm = lh;
+ }
+
+ answer = NULL;
+ }
+ if (answer)
+ goto out_permanent;
+
+ /* Add the new entry after the last permanent entry if any, so that
+ * the new entry does not override a permanent entry when matched with
+ * a wild-card protocol. But it is allowed to override any existing
+ * non-permanent entry. This means that when we remove this entry, the
+ * system automatically returns to the old behavior.
+ */
+ list_add_rcu(&p->list, last_perm);
+out:
+ spin_unlock_bh(&inetsw_lock);
+
+ return;
+
+out_permanent:
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
+ goto out;
+
+out_illegal:
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
+ p->type);
+ goto out;
+}
+EXPORT_SYMBOL(inet_register_protosw);
+
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+ if (INET_PROTOSW_PERMANENT & p->flags) {
+ pr_err("Attempt to unregister permanent protocol %d\n",
+ p->protocol);
+ } else {
+ spin_lock_bh(&inetsw_lock);
+ list_del_rcu(&p->list);
+ spin_unlock_bh(&inetsw_lock);
+
+ synchronize_net();
+ }
+}
+EXPORT_SYMBOL(inet_unregister_protosw);
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr __read_mostly;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 old_saddr = inet->inet_saddr;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 new_saddr;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* Query new route. */
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ sk_setup_caps(sk, &rt->dst);
+
+ new_saddr = fl4->saddr;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ if (sysctl_ip_dynaddr > 1) {
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
+ }
+
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
+
+ /*
+ * XXX The only one ugly spot where we need to
+ * XXX really change the sockets identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
+ */
+ __sk_prot_rehash(sk);
+ return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ __be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ daddr = inet->inet_daddr;
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ err = PTR_ERR(rt);
+
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+ /*
+ * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+ */
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = inet_sk_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ unsigned int offset = 0;
+ bool udpfrag, encap;
+ struct iphdr *iph;
+ int proto;
+ int nhoff;
+ int ihl;
+ int id;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ 0)))
+ goto out;
+
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ goto out;
+
+ iph = ip_hdr(skb);
+ ihl = iph->ihl * 4;
+ if (ihl < sizeof(*iph))
+ goto out;
+
+ id = ntohs(iph->id);
+ proto = iph->protocol;
+
+ /* Warning: after this point, iph might be no longer valid */
+ if (unlikely(!pskb_may_pull(skb, ihl)))
+ goto out;
+ __skb_pull(skb, ihl);
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features &= skb->dev->hw_enc_features;
+ SKB_GSO_CB(skb)->encap_level += ihl;
+
+ skb_reset_transport_header(skb);
+
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ if (IS_ERR_OR_NULL(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
+ if (udpfrag) {
+ iph->id = htons(id);
+ iph->frag_off = htons(offset >> 3);
+ if (skb->next)
+ iph->frag_off |= htons(IP_MF);
+ offset += skb->len - nhoff - ihl;
+ } else {
+ iph->id = htons(id++);
+ }
+ iph->tot_len = htons(skb->len - nhoff);
+ ip_send_check(iph);
+ if (encap)
+ skb_reset_inner_headers(skb);
+ skb->network_header = (u8 *)iph - skb->head;
+ } while ((skb = skb->next));
+
+out:
+ return segs;
+}
+
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
+ int flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
+
+ proto = iph->protocol;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ if (*(u8 *)iph != 0x45)
+ goto out_unlock;
+
+ if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+ goto out_unlock;
+
+ id = ntohl(*(__be32 *)&iph->id);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
+ id >>= 16;
+
+ for (p = *head; p; p = p->next) {
+ struct iphdr *iph2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct iphdr *)(p->data + off);
+ /* The above works because, with the exception of the top
+ * (inner most) layer, we only aggregate pkts with the same
+ * hdr length so all the hdrs we'll need to verify will start
+ * at the same offset.
+ */
+ if ((iph->protocol ^ iph2->protocol) |
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* All fields must match except length and checksum. */
+ NAPI_GRO_CB(p)->flush |=
+ (iph->ttl ^ iph2->ttl) |
+ (iph->tos ^ iph2->tos) |
+ ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+
+ /* Save the IP ID check to be included later when we get to
+ * the transport layer so only the inner most IP ID is checked.
+ * This is because some GSO/TSO implementations do not
+ * correctly increment the IP ID for the outer hdrs.
+ */
+ NAPI_GRO_CB(p)->flush_id =
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+ NAPI_GRO_CB(p)->flush |= flush;
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_set_network_header(skb, off);
+ /* The above will be needed by the transport layer if there is one
+ * immediately following this IP hdr.
+ */
+
+ /* Note : No need to call skb_gro_postpull_rcsum() here,
+ * as we already checked checksum over ipv4 header was 0
+ */
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ if (sk->sk_family == AF_INET)
+ return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
+#endif
+ return -EINVAL;
+}
+
+static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ __be16 newlen = htons(skb->len - nhoff);
+ struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ int proto = iph->protocol;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_network_header(skb, nhoff);
+
+ csum_replace2(&iph->check, iph->tot_len, newlen);
+ iph->tot_len = newlen;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ /* Only need to add sizeof(*iph) to get to the next hdr below
+ * because any hdr with option will have been flushed in
+ * inet_gro_receive().
+ */
+ err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+ unsigned short type, unsigned char protocol,
+ struct net *net)
+{
+ struct socket *sock;
+ int rc = sock_create_kern(family, type, protocol, &sock);
+
+ if (rc == 0) {
+ *sk = sock->sk;
+ (*sk)->sk_allocation = GFP_ATOMIC;
+ /*
+ * Unhash it so that IP input processing does not even see it,
+ * we do not wish this socket to see incoming packets.
+ */
+ (*sk)->sk_prot->unhash(*sk);
+
+ sk_change_net(*sk, net);
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+unsigned long snmp_fold_field(void __percpu *mib, int offt)
+{
+ unsigned long res = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt);
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
+{
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *bhptr;
+ struct u64_stats_sync *syncp;
+ u64 v;
+ unsigned int start;
+
+ bhptr = per_cpu_ptr(mib, cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_irq(syncp);
+ v = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_irq(syncp, start));
+
+ res += v;
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+#ifdef CONFIG_IP_MULTICAST
+static const struct net_protocol igmp_protocol = {
+ .handler = igmp_rcv,
+ .netns_ok = 1,
+};
+#endif
+
+static const struct net_protocol tcp_protocol = {
+ .early_demux = tcp_v4_early_demux,
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
+};
+
+static const struct net_protocol udp_protocol = {
+ .early_demux = udp_v4_early_demux,
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol icmp_protocol = {
+ .handler = icmp_rcv,
+ .err_handler = icmp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+ int i;
+
+ net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
+ if (!net->mib.tcp_statistics)
+ goto err_tcp_mib;
+ net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ip_statistics)
+ goto err_ip_mib;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet_stats;
+ af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
+ u64_stats_init(&af_inet_stats->syncp);
+ }
+
+ net->mib.net_statistics = alloc_percpu(struct linux_mib);
+ if (!net->mib.net_statistics)
+ goto err_net_mib;
+ net->mib.udp_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_statistics)
+ goto err_udp_mib;
+ net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_statistics)
+ goto err_udplite_mib;
+ net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
+ if (!net->mib.icmp_statistics)
+ goto err_icmp_mib;
+ net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpmsg_statistics)
+ goto err_icmpmsg_mib;
+
+ tcp_mib_init(net);
+ return 0;
+
+err_icmpmsg_mib:
+ free_percpu(net->mib.icmp_statistics);
+err_icmp_mib:
+ free_percpu(net->mib.udplite_statistics);
+err_udplite_mib:
+ free_percpu(net->mib.udp_statistics);
+err_udp_mib:
+ free_percpu(net->mib.net_statistics);
+err_net_mib:
+ free_percpu(net->mib.ip_statistics);
+err_ip_mib:
+ free_percpu(net->mib.tcp_statistics);
+err_tcp_mib:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+ kfree(net->mib.icmpmsg_statistics);
+ free_percpu(net->mib.icmp_statistics);
+ free_percpu(net->mib.udplite_statistics);
+ free_percpu(net->mib.udp_statistics);
+ free_percpu(net->mib.net_statistics);
+ free_percpu(net->mib.ip_statistics);
+ free_percpu(net->mib.tcp_statistics);
+}
+
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+ .init = ipv4_mib_init_net,
+ .exit = ipv4_mib_exit_net,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+static __net_init int inet_init_net(struct net *net)
+{
+ /*
+ * Set defaults for local port range
+ */
+ seqlock_init(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = 32768;
+ net->ipv4.ip_local_ports.range[1] = 61000;
+
+ seqlock_init(&net->ipv4.ping_group_range.lock);
+ /*
+ * Sane defaults - nobody may create ping sockets.
+ * Boot scripts should set this to distro-specific group.
+ */
+ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
+ net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);
+ return 0;
+}
+
+static __net_exit void inet_exit_net(struct net *net)
+{
+}
+
+static __net_initdata struct pernet_operations af_inet_ops = {
+ .init = inet_init_net,
+ .exit = inet_exit_net,
+};
+
+static int __init init_inet_pernet_ops(void)
+{
+ return register_pernet_subsys(&af_inet_ops);
+}
+
+static int ipv4_proc_init(void);
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_offload ip_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static const struct net_offload ipip_offload = {
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+};
+
+static int __init ipv4_offload_init(void)
+{
+ /*
+ * Add offloads
+ */
+ if (udpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (tcpv4_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+
+ dev_add_offload(&ip_packet_offload);
+ inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+ return 0;
+}
+
+fs_initcall(ipv4_offload_init);
+
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .func = ip_rcv,
+};
+
+static int __init inet_init(void)
+{
+ struct inet_protosw *q;
+ struct list_head *r;
+ int rc = -EINVAL;
+
+ sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
+
+ rc = proto_register(&tcp_prot, 1);
+ if (rc)
+ goto out;
+
+ rc = proto_register(&udp_prot, 1);
+ if (rc)
+ goto out_unregister_tcp_proto;
+
+ rc = proto_register(&raw_prot, 1);
+ if (rc)
+ goto out_unregister_udp_proto;
+
+ rc = proto_register(&ping_prot, 1);
+ if (rc)
+ goto out_unregister_raw_proto;
+
+ /*
+ * Tell SOCKET that we are alive...
+ */
+
+ (void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+ ip_static_sysctl_init();
+#endif
+
+ /*
+ * Add all the base protocols.
+ */
+
+ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+ pr_crit("%s: Cannot add ICMP protocol\n", __func__);
+ if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+ pr_crit("%s: Cannot add UDP protocol\n", __func__);
+ if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+ pr_crit("%s: Cannot add TCP protocol\n", __func__);
+#ifdef CONFIG_IP_MULTICAST
+ if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+ pr_crit("%s: Cannot add IGMP protocol\n", __func__);
+#endif
+
+ /* Register the socket-side information for inet_create. */
+ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+ inet_register_protosw(q);
+
+ /*
+ * Set the ARP module up
+ */
+
+ arp_init();
+
+ /*
+ * Set the IP module up
+ */
+
+ ip_init();
+
+ tcp_v4_init();
+
+ /* Setup TCP slab cache for open requests. */
+ tcp_init();
+
+ /* Setup UDP memory threshold */
+ udp_init();
+
+ /* Add UDP-Lite (RFC 3828) */
+ udplite4_register();
+
+ ping_init();
+
+ /*
+ * Set the ICMP layer up
+ */
+
+ if (icmp_init() < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ /*
+ * Initialise the multicast router
+ */
+#if defined(CONFIG_IP_MROUTE)
+ if (ip_mr_init())
+ pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
+#endif
+
+ if (init_inet_pernet_ops())
+ pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
+ /*
+ * Initialise per-cpu ipv4 mibs
+ */
+
+ if (init_ipv4_mibs())
+ pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
+
+ ipv4_proc_init();
+
+ ipfrag_init();
+
+ dev_add_pack(&ip_packet_type);
+
+ rc = 0;
+out:
+ return rc;
+out_unregister_raw_proto:
+ proto_unregister(&raw_prot);
+out_unregister_udp_proto:
+ proto_unregister(&udp_prot);
+out_unregister_tcp_proto:
+ proto_unregister(&tcp_prot);
+ goto out;
+}
+
+fs_initcall(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+static int __init ipv4_proc_init(void)
+{
+ int rc = 0;
+
+ if (raw_proc_init())
+ goto out_raw;
+ if (tcp4_proc_init())
+ goto out_tcp;
+ if (udp4_proc_init())
+ goto out_udp;
+ if (ping_proc_init())
+ goto out_ping;
+ if (ip_misc_proc_init())
+ goto out_misc;
+out:
+ return rc;
+out_misc:
+ ping_proc_exit();
+out_ping:
+ udp4_proc_exit();
+out_udp:
+ tcp4_proc_exit();
+out_tcp:
+ raw_proc_exit();
+out_raw:
+ rc = -ENOMEM;
+ goto out;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000..ac9a32ec3
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,589 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash) +
+ (crypto_ahash_alignmask(ahash) &
+ ~(crypto_tfm_ctx_alignment() - 1));
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+ unsigned int offset)
+{
+ return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
+{
+ unsigned char *optptr = (unsigned char *)(iph+1);
+ int l = iph->ihl*4 - sizeof(struct iphdr);
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return 0;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen<2 || optlen>l)
+ return -EINVAL;
+ switch (*optptr) {
+ case IPOPT_SEC:
+ case 0x85: /* Some "Extended Security" crap. */
+ case IPOPT_CIPSO:
+ case IPOPT_RA:
+ case 0x80|21: /* RFC1770 */
+ break;
+ case IPOPT_LSRR:
+ case IPOPT_SSRR:
+ if (optlen < 6)
+ return -EINVAL;
+ memcpy(daddr, optptr+optlen-4, 4);
+ /* Fall through */
+ default:
+ memset(optptr, 0, optlen);
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+ return 0;
+}
+
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+ u8 *icv;
+ struct iphdr *iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct iphdr *top_iph = ip_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+
+ iph = AH_SKB_CB(skb)->tmp;
+ icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ int nfrags;
+ int ihl;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *top_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ skb_push(skb, -skb_network_offset(skb));
+ ah = ip_auth_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
+ if (!iph)
+ goto out;
+ seqhi = (__be32 *)((char *)iph + ihl);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ top_iph = ip_hdr(skb);
+
+ iph->tos = top_iph->tos;
+ iph->ttl = top_iph->ttl;
+ iph->frag_off = top_iph->frag_off;
+
+ if (top_iph->ihl != 5) {
+ iph->daddr = top_iph->daddr;
+ memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+ if (err)
+ goto out_free;
+ }
+
+ ah->nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
+
+ top_iph->tos = 0;
+ top_iph->tot_len = htons(skb->len);
+ top_iph->frag_off = 0;
+ top_iph->ttl = 0;
+ top_iph->check = 0;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+ ah->reserved = 0;
+ ah->spi = x->id.spi;
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+out_free:
+ kfree(iph);
+out:
+ return err;
+}
+
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ struct iphdr *work_iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
+static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int ah_hlen;
+ int ihl;
+ int nexthdr;
+ int nfrags;
+ u8 *auth_data;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *work_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ if (!pskb_may_pull(skb, sizeof(*ah)))
+ goto out;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ nexthdr = ah->nexthdr;
+ ah_hlen = (ah->hdrlen + 2) << 2;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, ah_hlen))
+ goto out;
+
+ /* We are going to _remove_ AH header to keep sockets happy,
+ * so... Later this can change. */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph)
+ goto out;
+
+ seqhi = (__be32 *)((char *)work_iph + ihl);
+ auth_data = ah_tmp_auth(seqhi, seqhi_len);
+ icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, iph, ihl);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ iph->ttl = 0;
+ iph->tos = 0;
+ iph->frag_off = 0;
+ iph->check = 0;
+ if (ihl > sizeof(*iph)) {
+ __be32 dummy;
+ err = ip_clear_mutable_options(iph, &dummy);
+ if (err)
+ goto out_free;
+ }
+
+ skb_push(skb, ihl);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ goto out_free;
+ }
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out_free;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr;
+
+out_free:
+ kfree (work_iph);
+out:
+ return err;
+}
+
+static int ah4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ ah->spi, IPPROTO_AH, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int ah_init_state(struct xfrm_state *x)
+{
+ struct ah_data *ahp = NULL;
+ struct xfrm_algo_desc *aalg_desc;
+ struct crypto_ahash *ahash;
+
+ if (!x->aalg)
+ goto error;
+
+ if (x->encap)
+ goto error;
+
+ ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+ if (!ahp)
+ return -ENOMEM;
+
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash))
+ goto error;
+
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
+ goto error;
+
+ /*
+ * Lookup the algorithm description maintained by xfrm_algo,
+ * verify crypto transform properties, and store information
+ * we need for AH processing. This lookup cannot fail here
+ * after a successful crypto_alloc_ahash().
+ */
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_ahash_digestsize(ahash)) {
+ pr_info("%s: %s digestsize %u != %hu\n",
+ __func__, x->aalg->alg_name,
+ crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
+ goto error;
+ }
+
+ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ x->props.header_len += sizeof(struct iphdr);
+ x->data = ahp;
+
+ return 0;
+
+error:
+ if (ahp) {
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+ }
+ return -EINVAL;
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+ struct ah_data *ahp = x->data;
+
+ if (!ahp)
+ return;
+
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+}
+
+static int ah4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ah_type =
+{
+ .description = "AH4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = ah_init_state,
+ .destructor = ah_destroy,
+ .input = ah_input,
+ .output = ah_output
+};
+
+static struct xfrm4_protocol ah4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ah4_rcv_cb,
+ .err_handler = ah4_err,
+ .priority = 0,
+};
+
+static int __init ah4_init(void)
+{
+ if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ah_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ah4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000..933a92820
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1375 @@
+/* linux/net/ipv4/arp.c
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility.
+ * with BSD based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * eg intelligent arp probing and
+ * generation
+ * of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ * Krzysztof Halasa: Added Frame Relay ARP support.
+ * Arnaldo C. Melo : convert /proc/net/arp to seq_file
+ * Shmulik Hen: Split arp_send to arp_create and
+ * arp_xmit so intermediate drivers like
+ * bonding can change the skb before
+ * sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+#include <linux/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ * Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static const struct neigh_ops arp_generic_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_connected_output,
+};
+
+static const struct neigh_ops arp_hh_ops = {
+ .family = AF_INET,
+ .solicit = arp_solicit,
+ .error_report = arp_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_resolve_output,
+};
+
+static const struct neigh_ops arp_direct_ops = {
+ .family = AF_INET,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
+};
+
+struct neigh_table arp_tbl = {
+ .family = AF_INET,
+ .key_len = 4,
+ .protocol = cpu_to_be16(ETH_P_IP),
+ .hash = arp_hash,
+ .key_eq = arp_key_eq,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .reachable_time = 30 * HZ,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ [NEIGH_VAR_LOCKTIME] = 1 * HZ,
+ },
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+EXPORT_SYMBOL(arp_tbl);
+
+int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ ip_eth_mc_map(addr, haddr);
+ return 0;
+ case ARPHRD_INFINIBAND:
+ ip_ib_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ case ARPHRD_IPGRE:
+ ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+ return 0;
+ default:
+ if (dir) {
+ memcpy(haddr, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
+{
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq32(neigh, pkey);
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+ __be32 addr = *(__be32 *)neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev;
+ struct neigh_parms *parms;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ neigh->type = inet_addr_type(dev_net(dev), addr);
+
+ parms = in_dev->arp_parms;
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+ rcu_read_unlock();
+
+ if (!dev->header_ops) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh_direct_output;
+ } else {
+ /* Good devices (checked by reading texts, but only Ethernet is
+ tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+ ARPHRD_IPDDP will also work, if author repairs it.
+ I did not it, because this driver does not work even
+ in old paradigm.
+ */
+
+ if (neigh->type == RTN_MULTICAST) {
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+
+ if (dev->header_ops->cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+
+ if (neigh->nud_state & NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ __be32 saddr = 0;
+ u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+ struct net_device *dev = neigh->dev;
+ __be32 target = *(__be32 *)neigh->primary_key;
+ int probes = atomic_read(&neigh->probes);
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return;
+ }
+ switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+ default:
+ case 0: /* By default announce any local IP */
+ if (skb && inet_addr_type(dev_net(dev),
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
+ saddr = ip_hdr(skb)->saddr;
+ break;
+ case 1: /* Restrict announcements of saddr in same subnet */
+ if (!skb)
+ break;
+ saddr = ip_hdr(skb)->saddr;
+ if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+ /* saddr should be known to target */
+ if (inet_addr_onlink(in_dev, target, saddr))
+ break;
+ }
+ saddr = 0;
+ break;
+ case 2: /* Avoid secondary IPs, get a primary/preferred one */
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!saddr)
+ saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID))
+ pr_debug("trying to ucast probe in NUD_INVALID\n");
+ neigh_ha_snapshot(dst_ha, neigh, dev);
+ dst_hw = dst_ha;
+ } else {
+ probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
+ if (probes < 0) {
+ neigh_app_ns(neigh);
+ return;
+ }
+ }
+
+ arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+ dst_hw, dev->dev_addr, NULL);
+}
+
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
+{
+ struct net *net = dev_net(in_dev->dev);
+ int scope;
+
+ switch (IN_DEV_ARP_IGNORE(in_dev)) {
+ case 0: /* Reply, the tip is already validated */
+ return 0;
+ case 1: /* Reply only if tip is configured on the incoming interface */
+ sip = 0;
+ scope = RT_SCOPE_HOST;
+ break;
+ case 2: /*
+ * Reply only if tip is configured on the incoming interface
+ * and is in same subnet as sip
+ */
+ scope = RT_SCOPE_HOST;
+ break;
+ case 3: /* Do not reply for scope host addresses */
+ sip = 0;
+ scope = RT_SCOPE_LINK;
+ in_dev = NULL;
+ break;
+ case 4: /* Reserved */
+ case 5:
+ case 6:
+ case 7:
+ return 0;
+ case 8: /* Do not reply */
+ return 1;
+ default:
+ return 0;
+ }
+ return !inet_confirm_addr(net, in_dev, sip, tip, scope);
+}
+
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+{
+ struct rtable *rt;
+ int flag = 0;
+ /*unsigned long now; */
+ struct net *net = dev_net(dev);
+
+ rt = ip_route_output(net, sip, tip, 0, 0);
+ if (IS_ERR(rt))
+ return 1;
+ if (rt->dst.dev != dev) {
+ NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+ flag = 1;
+ }
+ ip_rt_put(rt);
+ return flag;
+}
+
+/*
+ * Check if we can use proxy ARP for this path
+ */
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt)
+{
+ struct in_device *out_dev;
+ int imi, omi = -1;
+
+ if (rt->dst.dev == dev)
+ return 0;
+
+ if (!IN_DEV_PROXY_ARP(in_dev))
+ return 0;
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
+ return 1;
+ if (imi == -1)
+ return 0;
+
+ /* place to check for proxy_arp for routes */
+
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
+ omi = IN_DEV_MEDIUM_ID(out_dev);
+
+ return omi != imi && omi != -1;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt,
+ __be32 sip, __be32 tip)
+{
+ /* Private VLAN is only concerned about the same ethernet segment */
+ if (rt->dst.dev != dev)
+ return 0;
+
+ /* Don't reply on self probes (often done by windowz boxes)*/
+ if (sip == tip)
+ return 0;
+
+ if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create an arp packet. If dest_hw is not set, we create a broadcast
+ * message.
+ */
+struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+ arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ARP);
+ if (!src_hw)
+ src_hw = dev->dev_addr;
+ if (!dest_hw)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
+ goto out;
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+ * DIX code for the protocol. Make these device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_AX25)
+ case ARPHRD_AX25:
+ arp->ar_hrd = htons(ARPHRD_AX25);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_NETROM)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = htons(ARPHRD_NETROM);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#if IS_ENABLED(CONFIG_FDDI)
+ case ARPHRD_FDDI:
+ arp->ar_hrd = htons(ARPHRD_ETHER);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr = (unsigned char *)(arp + 1);
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &src_ip, 4);
+ arp_ptr += 4;
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ if (target_hw)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ }
+ memcpy(arp_ptr, &dest_ip, 4);
+
+ return skb;
+
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(arp_create);
+
+/*
+ * Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, NULL, skb,
+ NULL, skb->dev, dev_queue_xmit_sk);
+}
+EXPORT_SYMBOL(arp_xmit);
+
+/*
+ * Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw, const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+
+ /*
+ * No arp on this interface.
+ */
+
+ if (dev->flags&IFF_NOARP)
+ return;
+
+ skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+ dest_hw, src_hw, target_hw);
+ if (!skb)
+ return;
+
+ arp_xmit(skb);
+}
+EXPORT_SYMBOL(arp_send);
+
+/*
+ * Process an arp request.
+ */
+
+static int arp_process(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ struct rtable *rt;
+ unsigned char *sha;
+ __be32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct neighbour *n;
+ struct net *net = dev_net(dev);
+ bool is_garp = false;
+
+ /* arp_rcv below verifies the ARP header and verifies the device
+ * is ARP'able.
+ */
+
+ if (!in_dev)
+ goto out;
+
+ arp = arp_hdr(skb);
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != htons(ETH_P_IP) ||
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ /*
+ * ETHERNET, and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) devices will accept ARP
+ * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ * This is the case also of FDDI, where the RFC 1390 says that
+ * FDDI devices should accept ARP hardware of (1) Ethernet,
+ * however, to be more robust, we'll accept both 1 (Ethernet)
+ * or 6 (IEEE 802.2)
+ */
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+ }
+
+ /* Understand only these message types */
+
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields
+ */
+ arp_ptr = (unsigned char *)(arp + 1);
+ sha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ switch (dev_type) {
+#if IS_ENABLED(CONFIG_FIREWIRE_NET)
+ case ARPHRD_IEEE1394:
+ break;
+#endif
+ default:
+ arp_ptr += dev->addr_len;
+ }
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, delete it.
+ */
+ if (ipv4_is_multicast(tip) ||
+ (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
+ goto out;
+
+/*
+ * Special case: We must set Frame Relay source Q.922 address
+ */
+ if (dev_type == ARPHRD_DLCI)
+ sha = dev->broadcast;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ inet_addr_type(net, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+ dev->dev_addr, sha);
+ goto out;
+ }
+
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+
+ rt = skb_rtable(skb);
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) {
+ int dont_send;
+
+ dont_send = arp_ignore(in_dev, sip, tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send = arp_filter(sip, tip, dev);
+ if (!dont_send) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ neigh_release(n);
+ }
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) {
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n)
+ neigh_release(n);
+
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+ skb->pkt_type == PACKET_HOST ||
+ NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ } else {
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
+ return 0;
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ /* Unsolicited ARP is not accepted by default.
+ It is possible, that this option should be enabled for some
+ devices (strip is candidate)
+ */
+ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
+ inet_addr_type(net, sip) == RTN_UNICAST;
+
+ if (!n &&
+ ((arp->ar_op == htons(ARPOP_REPLY) &&
+ inet_addr_type(net, sip) == RTN_UNICAST) || is_garp))
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
+ }
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override;
+
+ /* If several different ARP replies follows back-to-back,
+ use the FIRST one. It is possible, if several proxy
+ agents are active. Taking the first reply prevents
+ arp trashing and chooses the fastest router.
+ */
+ override = time_after(jiffies,
+ n->updated +
+ NEIGH_VAR(n->parms, LOCKTIME)) ||
+ is_garp;
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_release(n);
+ }
+
+out:
+ consume_skb(skb);
+ return 0;
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_process(NULL, skb);
+}
+
+
+/*
+ * Receive an arp request from the device layer.
+ */
+
+static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct arphdr *arp;
+
+ /* do not tweak dropwatch on an ARP we will ignore */
+ if (dev->flags & IFF_NOARP ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ skb->pkt_type == PACKET_LOOPBACK)
+ goto consumeskb;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out_of_mem;
+
+ /* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ goto freeskb;
+
+ arp = arp_hdr(skb);
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
+ goto freeskb;
+
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+ return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, NULL, skb,
+ dev, NULL, arp_process);
+
+consumeskb:
+ consume_skb(skb);
+ return 0;
+freeskb:
+ kfree_skb(skb);
+out_of_mem:
+ return 0;
+}
+
+/*
+ * User level interface (ioctl)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
+{
+ if (!dev) {
+ IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ return 0;
+ }
+ return -ENXIO;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask && mask != htonl(0xFFFFFFFF))
+ return -EINVAL;
+ if (!dev && (r->arp_flags & ATF_COM)) {
+ dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+ r->arp_ha.sa_data);
+ if (!dev)
+ return -ENODEV;
+ }
+ if (mask) {
+ if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
+ return -ENOBUFS;
+ return 0;
+ }
+
+ return arp_req_set_proxy(net, dev, 1);
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
+ struct neighbour *neigh;
+ int err;
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_set_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ if (r->arp_flags & ATF_PERM)
+ r->arp_flags |= ATF_COM;
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_FDDI)
+ case ARPHRD_FDDI:
+ /*
+ * According to RFC 1390, FDDI devices should accept ARP
+ * hardware types of 1 (Ethernet). However, to be more
+ * robust, we'll accept hardware types of either 1 (Ethernet)
+ * or 6 (IEEE 802.2).
+ */
+ if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+ r->arp_ha.sa_family != ARPHRD_ETHER &&
+ r->arp_ha.sa_family != ARPHRD_IEEE802)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ if (r->arp_ha.sa_family != dev->type)
+ return -EINVAL;
+ break;
+ }
+
+ neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+ err = PTR_ERR(neigh);
+ if (!IS_ERR(neigh)) {
+ unsigned int state = NUD_STALE;
+ if (r->arp_flags & ATF_PERM)
+ state = NUD_PERMANENT;
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
+ r->arp_ha.sa_data : NULL, state,
+ NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+ return err;
+}
+
+static unsigned int arp_state_to_flags(struct neighbour *neigh)
+{
+ if (neigh->nud_state&NUD_PERMANENT)
+ return ATF_PERM | ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ return ATF_COM;
+ else
+ return 0;
+}
+
+/*
+ * Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ struct neighbour *neigh;
+ int err = -ENXIO;
+
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ if (neigh) {
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+ r->arp_ha.sa_family = dev->type;
+ strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+ neigh_release(neigh);
+ err = 0;
+ }
+ return err;
+}
+
+static int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+ struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ int err = -ENXIO;
+
+ if (neigh) {
+ if (neigh->nud_state & ~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_ADMIN);
+ neigh_release(neigh);
+ }
+
+ return err;
+}
+
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+ __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+ if (mask == htonl(0xFFFFFFFF))
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+
+ if (mask)
+ return -EINVAL;
+
+ return arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+ struct net_device *dev)
+{
+ __be32 ip;
+
+ if (r->arp_flags & ATF_PUBL)
+ return arp_req_delete_public(net, r, dev);
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
+ return arp_invalidate(dev, ip);
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct arpreq r;
+ struct net_device *dev = NULL;
+
+ switch (cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK))
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+ htonl(0xFFFFFFFFUL);
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (!dev)
+ goto out;
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch (cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(net, &r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(net, &r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ break;
+ }
+out:
+ rtnl_unlock();
+ if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
+
+ switch (event) {
+ case NETDEV_CHANGEADDR:
+ neigh_changeaddr(&arp_tbl, dev);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_CHANGE:
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ neigh_changeaddr(&arp_tbl, dev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {
+ .notifier_call = arp_netdev_event,
+};
+
+/* Note, that it is not on notifier chain.
+ It is necessary, that this routine was called after route cache will be
+ flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
+ .func = arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if IS_ENABLED(CONFIG_AX25)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+ char c, *s;
+ int n;
+
+ for (n = 0, s = buf; n < 6; n++) {
+ c = (a->ax25_call[n] >> 1) & 0x7F;
+
+ if (c != ' ')
+ *s++ = c;
+ }
+
+ *s++ = '-';
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
+ *s++ = '1';
+ n -= 10;
+ }
+
+ *s++ = n + '0';
+ *s++ = '\0';
+
+ if (*buf == '\0' || *buf == '-')
+ return "*";
+
+ return buf;
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ char hbuffer[HBUFFERLEN];
+ int k, j;
+ char tbuf[16];
+ struct net_device *dev = n->dev;
+ int hatype = dev->type;
+
+ read_lock(&n->lock);
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
+#if IS_ENABLED(CONFIG_AX25)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((ax25_address *)n->ha, hbuffer);
+ else {
+#endif
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
+ hbuffer[k++] = ':';
+ }
+ if (k != 0)
+ --k;
+ hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
+ }
+#endif
+ sprintf(tbuf, "%pI4", n->primary_key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+ read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+ struct pneigh_entry *n)
+{
+ struct net_device *dev = n->dev;
+ int hatype = dev ? dev->type : 0;
+ char tbuf[16];
+
+ sprintf(tbuf, "%pI4", n->key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+ dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "IP address HW type Flags "
+ "HW address Mask Device\n");
+ } else {
+ struct neigh_seq_state *state = seq->private;
+
+ if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+ arp_format_pneigh_entry(seq, v);
+ else
+ arp_format_neigh_entry(seq, v);
+ }
+
+ return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* Don't want to confuse "arp -a" w/ magic entries,
+ * so we tell the generic iterator to skip NUD_NOARP.
+ */
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static const struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
+}
+
+static const struct file_operations arp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+static int __net_init arp_net_init(struct net *net)
+{
+ if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit arp_net_exit(struct net *net)
+{
+ remove_proc_entry("arp", net->proc_net);
+}
+
+static struct pernet_operations arp_net_ops = {
+ .init = arp_net_init,
+ .exit = arp_net_exit,
+};
+
+static int __init arp_proc_init(void)
+{
+ return register_pernet_subsys(&arp_net_ops);
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 000000000..bdb2a07ec
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,2357 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+/* List of available DOI definitions */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static LIST_HEAD(cipso_v4_doi_list);
+
+/* Label mapping cache */
+int cipso_v4_cache_enabled = 1;
+int cipso_v4_cache_bucketsize = 10;
+#define CIPSO_V4_CACHE_BUCKETBITS 7
+#define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS)
+#define CIPSO_V4_CACHE_REORDERLIMIT 10
+struct cipso_v4_map_cache_bkt {
+ spinlock_t lock;
+ u32 size;
+ struct list_head list;
+};
+
+struct cipso_v4_map_cache_entry {
+ u32 hash;
+ unsigned char *key;
+ size_t key_len;
+
+ struct netlbl_lsm_cache *lsm_data;
+
+ u32 activity;
+ struct list_head list;
+};
+
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache;
+
+/* Restricted bitmap (tag #1) flags */
+int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_strictvalid = 1;
+
+/*
+ * Protocol Constants
+ */
+
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX 40
+
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN 6
+
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN 4
+
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN 4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN 4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5). You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX 8
+
+/* Base length of the local tag (non-standard tag).
+ * Tag definition (may change between kernel versions)
+ *
+ * 0 8 16 24 32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN 6
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end. Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
+ u32 bitmap_len,
+ u32 offset,
+ u8 state)
+{
+ u32 bit_spot;
+ u32 byte_offset;
+ unsigned char bitmask;
+ unsigned char byte;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_offset = offset / 8;
+ byte = bitmap[byte_offset];
+ bit_spot = offset;
+ bitmask = 0x80 >> (offset % 8);
+
+ while (bit_spot < bitmap_len) {
+ if ((state && (byte & bitmask) == bitmask) ||
+ (state == 0 && (byte & bitmask) == 0))
+ return bit_spot;
+
+ bit_spot++;
+ bitmask >>= 1;
+ if (bitmask == 0) {
+ byte = bitmap[++byte_offset];
+ bitmask = 0x80;
+ }
+ }
+
+ return -1;
+}
+
+/**
+ * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask. Returns zero on success, negative values
+ * on error.
+ */
+static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
+ u32 bit,
+ u8 state)
+{
+ u32 byte_spot;
+ u8 bitmask;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_spot = bit / 8;
+ bitmask = 0x80 >> (bit % 8);
+ if (state)
+ bitmap[byte_spot] |= bitmask;
+ else
+ bitmap[byte_spot] &= ~bitmask;
+}
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+ if (entry->lsm_data)
+ netlbl_secattr_cache_free(entry->lsm_data);
+ kfree(entry->key);
+ kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function. Returns a 32-bit hash value.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Initializes the CIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file. Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init cipso_v4_cache_init(void)
+{
+ u32 iter;
+
+ cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
+ sizeof(struct cipso_v4_map_cache_bkt),
+ GFP_KERNEL);
+ if (!cipso_v4_cache)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&cipso_v4_cache[iter].lock);
+ cipso_v4_cache[iter].size = 0;
+ INIT_LIST_HEAD(&cipso_v4_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+ struct cipso_v4_map_cache_entry *entry, *tmp_entry;
+ u32 iter;
+
+ for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+ spin_lock_bh(&cipso_v4_cache[iter].lock);
+ list_for_each_entry_safe(entry,
+ tmp_entry,
+ &cipso_v4_cache[iter].list, list) {
+ list_del(&entry->list);
+ cipso_v4_cache_entry_free(entry);
+ }
+ cipso_v4_cache[iter].size = 0;
+ spin_unlock_bh(&cipso_v4_cache[iter].lock);
+ }
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key. If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes. The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ * 1. The cache entry's activity counter is incremented
+ * 2. The previous (higher ranking) entry's activity counter is decremented
+ * 3. If the difference between the two activity counters is geater than
+ * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+ u32 key_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ u32 bkt;
+ struct cipso_v4_map_cache_entry *entry;
+ struct cipso_v4_map_cache_entry *prev_entry = NULL;
+ u32 hash;
+
+ if (!cipso_v4_cache_enabled)
+ return -ENOENT;
+
+ hash = cipso_v4_map_cache_hash(key, key_len);
+ bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+ spin_lock_bh(&cipso_v4_cache[bkt].lock);
+ list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
+ if (entry->hash == hash &&
+ entry->key_len == key_len &&
+ memcmp(entry->key, key, key_len) == 0) {
+ entry->activity += 1;
+ atomic_inc(&entry->lsm_data->refcount);
+ secattr->cache = entry->lsm_data;
+ secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+ if (!prev_entry) {
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+ return 0;
+ }
+
+ if (prev_entry->activity > 0)
+ prev_entry->activity -= 1;
+ if (entry->activity > prev_entry->activity &&
+ entry->activity - prev_entry->activity >
+ CIPSO_V4_CACHE_REORDERLIMIT) {
+ __list_del(entry->list.prev, entry->list.next);
+ __list_add(&entry->list,
+ prev_entry->list.prev,
+ &prev_entry->list);
+ }
+
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+ return 0;
+ }
+ prev_entry = entry;
+ }
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+ return -ENOENT;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache. Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first. It is important to note that there is
+ * currently no checking for duplicate keys. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int cipso_v4_cache_add(const unsigned char *cipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ u32 bkt;
+ struct cipso_v4_map_cache_entry *entry = NULL;
+ struct cipso_v4_map_cache_entry *old_entry = NULL;
+ u32 cipso_ptr_len;
+
+ if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ return 0;
+
+ cipso_ptr_len = cipso_ptr[1];
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+ entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
+ if (!entry->key) {
+ ret_val = -ENOMEM;
+ goto cache_add_failure;
+ }
+ entry->key_len = cipso_ptr_len;
+ entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
+ atomic_inc(&secattr->cache->refcount);
+ entry->lsm_data = secattr->cache;
+
+ bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+ spin_lock_bh(&cipso_v4_cache[bkt].lock);
+ if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ list_add(&entry->list, &cipso_v4_cache[bkt].list);
+ cipso_v4_cache[bkt].size += 1;
+ } else {
+ old_entry = list_entry(cipso_v4_cache[bkt].list.prev,
+ struct cipso_v4_map_cache_entry, list);
+ list_del(&old_entry->list);
+ list_add(&entry->list, &cipso_v4_cache[bkt].list);
+ cipso_v4_cache_entry_free(old_entry);
+ }
+ spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+ return 0;
+
+cache_add_failure:
+ if (entry)
+ cipso_v4_cache_entry_free(entry);
+ return ret_val;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+ struct cipso_v4_doi *iter;
+
+ list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+ if (iter->doi == doi && atomic_read(&iter->refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see cipso_ipv4.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ u32 iter;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+ goto doi_add_return;
+ for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
+ switch (doi_def->tags[iter]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ case CIPSO_V4_TAG_ENUM:
+ if (doi_def->type != CIPSO_V4_MAP_PASS)
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+ goto doi_add_return;
+ break;
+ case CIPSO_V4_TAG_INVALID:
+ if (iter == 0)
+ goto doi_add_return;
+ break;
+ default:
+ goto doi_add_return;
+ }
+ }
+
+ atomic_set(&doi_def->refcount, 1);
+
+ spin_lock(&cipso_v4_doi_list_lock);
+ if (cipso_v4_doi_search(doi_def->doi)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
+ list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+ switch (doi_type) {
+ case CIPSO_V4_MAP_TRANS:
+ type_str = "trans";
+ break;
+ case CIPSO_V4_MAP_PASS:
+ type_str = "pass";
+ break;
+ case CIPSO_V4_MAP_LOCAL:
+ type_str = "local";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " cipso_doi=%u cipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_TRANS:
+ kfree(doi_def->map.std->lvl.cipso);
+ kfree(doi_def->map.std->lvl.local);
+ kfree(doi_def->map.std->cat.cipso);
+ kfree(doi_def->map.std->cat.local);
+ break;
+ }
+ kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+ struct cipso_v4_doi *doi_def;
+
+ doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+ cipso_v4_doi_free(doi_def);
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct cipso_v4_doi *doi_def;
+ struct audit_buffer *audit_buf;
+
+ spin_lock(&cipso_v4_doi_list_lock);
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -ENOENT;
+ goto doi_remove_return;
+ }
+ if (!atomic_dec_and_test(&doi_def->refcount)) {
+ spin_unlock(&cipso_v4_doi_list_lock);
+ ret_val = -EBUSY;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " cipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * rcu_read_lock() is held while accessing the returned definition and the DOI
+ * definition reference count is decremented when the caller is done.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+ struct cipso_v4_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!atomic_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!atomic_dec_and_test(&doi_def->refcount))
+ return;
+ spin_lock(&cipso_v4_doi_list_lock);
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&cipso_v4_doi_list_lock);
+
+ cipso_v4_cache_invalidate();
+ call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+}
+
+/**
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+int cipso_v4_doi_walk(u32 *skip_cnt,
+ int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 doi_cnt = 0;
+ struct cipso_v4_doi *iter_doi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
+ if (atomic_read(&iter_doi->refcount) > 0) {
+ if (doi_cnt++ < *skip_cnt)
+ continue;
+ ret_val = callback(iter_doi, cb_arg);
+ if (ret_val < 0) {
+ doi_cnt--;
+ goto doi_walk_return;
+ }
+ }
+
+doi_walk_return:
+ rcu_read_unlock();
+ *skip_cnt = doi_cnt;
+ return ret_val;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+ return 0;
+ break;
+ }
+
+ return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition. Returns zero on success,
+ * negative values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+ u32 host_lvl,
+ u32 *net_lvl)
+{
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ *net_lvl = host_lvl;
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ if (host_lvl < doi_def->map.std->lvl.local_size &&
+ doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
+ *net_lvl = doi_def->map.std->lvl.local[host_lvl];
+ return 0;
+ }
+ return -EPERM;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO level to the correct local MLS
+ * level using the given DOI definition. Returns zero on success, negative
+ * values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+ u32 net_lvl,
+ u32 *host_lvl)
+{
+ struct cipso_v4_std_map_tbl *map_tbl;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ *host_lvl = net_lvl;
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ map_tbl = doi_def->map.std;
+ if (net_lvl < map_tbl->lvl.cipso_size &&
+ map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
+ *host_lvl = doi_def->map.std->lvl.cipso[net_lvl];
+ return 0;
+ }
+ return -EPERM;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *bitmap,
+ u32 bitmap_len)
+{
+ int cat = -1;
+ u32 bitmap_len_bits = bitmap_len * 8;
+ u32 cipso_cat_size;
+ u32 *cipso_array;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ return 0;
+ case CIPSO_V4_MAP_TRANS:
+ cipso_cat_size = doi_def->map.std->cat.cipso_size;
+ cipso_array = doi_def->map.std->cat.cipso;
+ for (;;) {
+ cat = cipso_v4_bitmap_walk(bitmap,
+ bitmap_len_bits,
+ cat + 1,
+ 1);
+ if (cat < 0)
+ break;
+ if (cat >= cipso_cat_size ||
+ cipso_array[cat] >= CIPSO_V4_INV_CAT)
+ return -EFAULT;
+ }
+
+ if (cat == -1)
+ return 0;
+ break;
+ }
+
+ return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition. Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int host_spot = -1;
+ u32 net_spot = CIPSO_V4_INV_CAT;
+ u32 net_spot_max = 0;
+ u32 net_clen_bits = net_cat_len * 8;
+ u32 host_cat_size = 0;
+ u32 *host_cat_array = NULL;
+
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+ host_cat_size = doi_def->map.std->cat.local_size;
+ host_cat_array = doi_def->map.std->cat.local;
+ }
+
+ for (;;) {
+ host_spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ host_spot + 1);
+ if (host_spot < 0)
+ break;
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ net_spot = host_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
+ if (host_spot >= host_cat_size)
+ return -EPERM;
+ net_spot = host_cat_array[host_spot];
+ if (net_spot >= CIPSO_V4_INV_CAT)
+ return -EPERM;
+ break;
+ }
+ if (net_spot >= net_clen_bits)
+ return -ENOSPC;
+ cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+
+ if (net_spot > net_spot_max)
+ net_spot_max = net_spot;
+ }
+
+ if (++net_spot_max % 8)
+ return net_spot_max / 8 + 1;
+ return net_spot_max / 8;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ int net_spot = -1;
+ u32 host_spot = CIPSO_V4_INV_CAT;
+ u32 net_clen_bits = net_cat_len * 8;
+ u32 net_cat_size = 0;
+ u32 *net_cat_array = NULL;
+
+ if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+ net_cat_size = doi_def->map.std->cat.cipso_size;
+ net_cat_array = doi_def->map.std->cat.cipso;
+ }
+
+ for (;;) {
+ net_spot = cipso_v4_bitmap_walk(net_cat,
+ net_clen_bits,
+ net_spot + 1,
+ 1);
+ if (net_spot < 0) {
+ if (net_spot == -2)
+ return -EFAULT;
+ return 0;
+ }
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_PASS:
+ host_spot = net_spot;
+ break;
+ case CIPSO_V4_MAP_TRANS:
+ if (net_spot >= net_cat_size)
+ return -EPERM;
+ host_spot = net_cat_array[net_spot];
+ if (host_spot >= CIPSO_V4_INV_CAT)
+ return -EPERM;
+ break;
+ }
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ host_spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *enumcat,
+ u32 enumcat_len)
+{
+ u16 cat;
+ int cat_prev = -1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < enumcat_len; iter += 2) {
+ cat = get_unaligned_be16(&enumcat[iter]);
+ if (cat <= cat_prev)
+ return -EFAULT;
+ cat_prev = cat;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int cat = -1;
+ u32 cat_iter = 0;
+
+ for (;;) {
+ cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1);
+ if (cat < 0)
+ break;
+ if ((cat_iter + 2) > net_cat_len)
+ return -ENOSPC;
+
+ *((__be16 *)&net_cat[cat_iter]) = htons(cat);
+ cat_iter += 2;
+ }
+
+ return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 iter;
+
+ for (iter = 0; iter < net_cat_len; iter += 2) {
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ get_unaligned_be16(&net_cat[iter]),
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+ const unsigned char *rngcat,
+ u32 rngcat_len)
+{
+ u16 cat_high;
+ u16 cat_low;
+ u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1;
+ u32 iter;
+
+ if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01)
+ return -EFAULT;
+
+ for (iter = 0; iter < rngcat_len; iter += 4) {
+ cat_high = get_unaligned_be16(&rngcat[iter]);
+ if ((iter + 4) <= rngcat_len)
+ cat_low = get_unaligned_be16(&rngcat[iter + 2]);
+ else
+ cat_low = 0;
+
+ if (cat_high > cat_prev)
+ return -EFAULT;
+
+ cat_prev = cat_low;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition. Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int iter = -1;
+ u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+ u32 array_cnt = 0;
+ u32 cat_size = 0;
+
+ /* make sure we don't overflow the 'array[]' variable */
+ if (net_cat_len >
+ (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+ return -ENOSPC;
+
+ for (;;) {
+ iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1);
+ if (iter < 0)
+ break;
+ cat_size += (iter == 0 ? 0 : sizeof(u16));
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+
+ iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter);
+ if (iter < 0)
+ return -EFAULT;
+ cat_size += sizeof(u16);
+ if (cat_size > net_cat_len)
+ return -ENOSPC;
+ array[array_cnt++] = iter;
+ }
+
+ for (iter = 0; array_cnt > 0;) {
+ *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]);
+ iter += 2;
+ array_cnt--;
+ if (array[array_cnt] != 0) {
+ *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]);
+ iter += 2;
+ }
+ }
+
+ return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 net_iter;
+ u16 cat_low;
+ u16 cat_high;
+
+ for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) {
+ cat_high = get_unaligned_be16(&net_cat[net_iter]);
+ if ((net_iter + 4) <= net_cat_len)
+ cat_low = get_unaligned_be16(&net_cat[net_iter + 2]);
+ else
+ cat_low = 0;
+
+ ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat,
+ cat_low,
+ cat_high,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return 0;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @len: the total tag length in bytes, not including this header
+ * @buf: the CIPSO option buffer
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buffer.
+ *
+ */
+static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+ unsigned char *buf,
+ u32 len)
+{
+ buf[0] = IPOPT_CIPSO;
+ buf[1] = CIPSO_V4_HDR_LEN + len;
+ *(__be32 *)&buf[2] = htonl(doi_def->doi);
+}
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps. Returns the size of
+ * the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_rbm_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ /* This will send packets using the "optimized" format when
+ * possible as specified in section 3.4.2.6 of the
+ * CIPSO draft. */
+ if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ tag_len = 14;
+ else
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_RBITMAP;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr. Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_enum_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_ENUM;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr. Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_enum_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ int ret_val;
+ u32 tag_len;
+ u32 level;
+
+ if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+ return -EPERM;
+
+ ret_val = cipso_v4_map_lvl_hton(doi_def,
+ secattr->attr.mls.lvl,
+ &level);
+ if (ret_val != 0)
+ return ret_val;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = cipso_v4_map_cat_rng_hton(doi_def,
+ secattr,
+ &buffer[4],
+ buffer_len - 4);
+ if (ret_val < 0)
+ return ret_val;
+
+ tag_len = 4 + ret_val;
+ } else
+ tag_len = 4;
+
+ buffer[0] = CIPSO_V4_TAG_RANGE;
+ buffer[1] = tag_len;
+ buffer[3] = level;
+
+ return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr. Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u8 tag_len = tag[1];
+ u32 level;
+
+ ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+ if (ret_val != 0)
+ return ret_val;
+ secattr->attr.mls.lvl = level;
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (tag_len > 4) {
+ ret_val = cipso_v4_map_cat_rng_ntoh(doi_def,
+ &tag[4],
+ tag_len - 4,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ return ret_val;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag. Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *buffer,
+ u32 buffer_len)
+{
+ if (!(secattr->flags & NETLBL_SECATTR_SECID))
+ return -EPERM;
+
+ buffer[0] = CIPSO_V4_TAG_LOCAL;
+ buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+ *(u32 *)&buffer[2] = secattr->attr.secid;
+
+ return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negatives values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+ const unsigned char *tag,
+ struct netlbl_lsm_secattr *secattr)
+{
+ secattr->attr.secid = *(u32 *)&tag[2];
+ secattr->flags |= NETLBL_SECATTR_SECID;
+
+ return 0;
+}
+
+/**
+ * cipso_v4_optptr - Find the CIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CIPSO option. Returns a pointer
+ * to the start of the CIPSO option on success, NULL if one if not found.
+ *
+ */
+unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ int optlen;
+ int taglen;
+
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+ taglen = optptr[1];
+ optlen -= taglen;
+ optptr += taglen;
+ }
+
+ return NULL;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details. If the option is valid then a zero value is returned and
+ * the value of @option is unchanged. If the option is invalid then a
+ * non-zero value is returned and @option is adjusted to point to the
+ * offending portion of the option. From the IETF draft ...
+ *
+ * "If any field within the CIPSO options, such as the DOI identifier, is not
+ * recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ * (type 12) is generated and returned. The ICMP code field is set to 'bad
+ * parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ * that is unrecognized."
+ *
+ */
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
+{
+ unsigned char *opt = *option;
+ unsigned char *tag;
+ unsigned char opt_iter;
+ unsigned char err_offset = 0;
+ u8 opt_len;
+ u8 tag_len;
+ struct cipso_v4_doi *doi_def = NULL;
+ u32 tag_iter;
+
+ /* caller already checks for length values that are too large */
+ opt_len = opt[1];
+ if (opt_len < 8) {
+ err_offset = 1;
+ goto validate_return;
+ }
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+ if (!doi_def) {
+ err_offset = 2;
+ goto validate_return_locked;
+ }
+
+ opt_iter = CIPSO_V4_HDR_LEN;
+ tag = opt + opt_iter;
+ while (opt_iter < opt_len) {
+ for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+ if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+ ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+
+ tag_len = tag[1];
+ if (tag_len > (opt_len - opt_iter)) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ switch (tag[0]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ /* We are already going to do all the verification
+ * necessary at the socket layer so from our point of
+ * view it is safe to turn these checks off (and less
+ * work), however, the CIPSO draft says we should do
+ * all the CIPSO validations here but it doesn't
+ * really specify _exactly_ what we need to validate
+ * ... so, just make it a sysctl tunable. */
+ if (cipso_v4_rbm_strictvalid) {
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
+ cipso_v4_map_cat_rbm_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ }
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+ cipso_v4_map_cat_enum_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+
+ if (cipso_v4_map_lvl_valid(doi_def,
+ tag[3]) < 0) {
+ err_offset = opt_iter + 3;
+ goto validate_return_locked;
+ }
+ if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+ cipso_v4_map_cat_rng_valid(doi_def,
+ &tag[4],
+ tag_len - 4) < 0) {
+ err_offset = opt_iter + 4;
+ goto validate_return_locked;
+ }
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ /* This is a non-standard tag that we only allow for
+ * local connections, so if the incoming interface is
+ * not the loopback device drop the packet. Further,
+ * there is no legitimate reason for setting this from
+ * userspace so reject it if skb is NULL. */
+ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) {
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+ if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+ err_offset = opt_iter + 1;
+ goto validate_return_locked;
+ }
+ break;
+ default:
+ err_offset = opt_iter;
+ goto validate_return_locked;
+ }
+
+ tag += tag_len;
+ opt_iter += tag_len;
+ }
+
+validate_return_locked:
+ rcu_read_unlock();
+validate_return:
+ *option = opt + err_offset;
+ return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct response for a bad packet
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host. From the IETF draft ...
+ *
+ * "If the contents of the CIPSO [option] are valid but the security label is
+ * outside of the configured host or port label range, the datagram is
+ * discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ * returned. The code field of the ICMP is set to 'communication with
+ * destination network administratively prohibited' (code 9) or to
+ * 'communication with destination host administratively prohibited'
+ * (code 10). The value of the code is dependent on whether the originator
+ * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The
+ * recipient of the ICMP message MUST be able to handle either value. The
+ * same procedure is performed if a CIPSO [option] can not be added to an
+ * IP packet because it is too large to fit in the IP options area."
+ *
+ * "If the error is triggered by receipt of an ICMP message, the message is
+ * discarded and no response is permitted (consistent with general ICMP
+ * processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
+ return;
+
+ if (gateway)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+}
+
+/**
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function. Returns the length of the option on success and
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 iter;
+
+ if (buf_len <= CIPSO_V4_HDR_LEN)
+ return -ENOSPC;
+
+ /* XXX - This code assumes only one tag per CIPSO option which isn't
+ * really a good assumption to make but since we only support the MAC
+ * tags right now it is a safe assumption. */
+ iter = 0;
+ do {
+ memset(buf, 0, buf_len);
+ switch (doi_def->tags[iter]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ ret_val = cipso_v4_gentag_rbm(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_gentag_enum(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_gentag_rng(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_gentag_loc(doi_def,
+ secattr,
+ &buf[CIPSO_V4_HDR_LEN],
+ buf_len - CIPSO_V4_HDR_LEN);
+ break;
+ default:
+ return -EPERM;
+ }
+
+ iter++;
+ } while (ret_val < 0 &&
+ iter < CIPSO_V4_TAG_MAXCNT &&
+ doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
+ if (ret_val < 0)
+ return ret_val;
+ cipso_v4_gentag_hdr(doi_def, buf, ret_val);
+ return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *old, *opt = NULL;
+ struct inet_sock *sk_inet;
+ struct inet_connection_sock *sk_conn;
+
+ /* In the case of sock_create_lite(), the sock->sk field is not
+ * defined yet but it is not a problem as the only users of these
+ * "lite" PF_INET sockets are functions which do an accept() call
+ * afterwards so we will label the socket as part of the accept(). */
+ if (!sk)
+ return 0;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto socket_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (!opt) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ sk_inet = inet_sk(sk);
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ if (sk_inet->is_icsk) {
+ sk_conn = inet_csk(sk);
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ return 0;
+
+socket_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *opt = NULL;
+ struct inet_request_sock *req_inet;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (!buf) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto req_setattr_failure;
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (!opt) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ req_inet = inet_rsk(req);
+ opt = xchg(&req_inet->opt, opt);
+ if (opt)
+ kfree_rcu(opt, rcu);
+
+ return 0;
+
+req_setattr_failure:
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+{
+ int hdr_delta = 0;
+ struct ip_options_rcu *opt = *opt_ptr;
+
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+ u8 cipso_len;
+ u8 cipso_off;
+ unsigned char *cipso_ptr;
+ int iter;
+ int optlen_new;
+
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
+ cipso_len = cipso_ptr[1];
+
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
+
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ opt->opt.optlen - cipso_off - cipso_len);
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length */
+ iter = 0;
+ optlen_new = 0;
+ while (iter < opt->opt.optlen)
+ if (opt->opt.__data[iter] != IPOPT_NOP) {
+ iter += opt->opt.__data[iter + 1];
+ optlen_new = iter;
+ } else
+ iter++;
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
+ } else {
+ /* only the cipso option was present on the socket so we can
+ * remove the entire option struct */
+ *opt_ptr = NULL;
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
+ }
+
+ return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+ int hdr_delta;
+ struct ip_options_rcu *opt;
+ struct inet_sock *sk_inet;
+
+ sk_inet = inet_sk(sk);
+ opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+ if (!opt || opt->opt.cipso == 0)
+ return;
+
+ hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+ if (sk_inet->is_icsk && hdr_delta > 0) {
+ struct inet_connection_sock *sk_conn = inet_csk(sk);
+ sk_conn->icsk_ext_hdr_len -= hdr_delta;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+ struct ip_options_rcu *opt;
+ struct inet_request_sock *req_inet;
+
+ req_inet = inet_rsk(req);
+ opt = req_inet->opt;
+ if (!opt || opt->opt.cipso == 0)
+ return;
+
+ cipso_v4_delopt(&req_inet->opt);
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @cipso and return the security attributes in @secattr. Returns zero
+ * on success and negative values on failure.
+ *
+ */
+int cipso_v4_getattr(const unsigned char *cipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ u32 doi;
+ struct cipso_v4_doi *doi_def;
+
+ if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+ return 0;
+
+ doi = get_unaligned_be32(&cipso[2]);
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_search(doi);
+ if (!doi_def)
+ goto getattr_return;
+ /* XXX - This code assumes only one tag per CIPSO option which isn't
+ * really a good assumption to make but since we only support the MAC
+ * tags right now it is a safe assumption. */
+ switch (cipso[6]) {
+ case CIPSO_V4_TAG_RBITMAP:
+ ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_ENUM:
+ ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_RANGE:
+ ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
+ break;
+ case CIPSO_V4_TAG_LOCAL:
+ ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr);
+ break;
+ }
+ if (ret_val == 0)
+ secattr->type = NETLBL_NLTYPE_CIPSOV4;
+
+getattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ struct ip_options_rcu *opt;
+ int res = -ENOMSG;
+
+ rcu_read_lock();
+ opt = rcu_dereference(inet_sk(sk)->inet_opt);
+ if (opt && opt->opt.cipso)
+ res = cipso_v4_getattr(opt->opt.__data +
+ opt->opt.cipso -
+ sizeof(struct iphdr),
+ secattr);
+ rcu_read_unlock();
+ return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+ u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+ u32 opt_len;
+ int len_delta;
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+ buf_len = ret_val;
+ opt_len = (buf_len + 3) & ~3;
+
+ /* we overwrite any existing options to ensure that we have enough
+ * room for the CIPSO option, the reason is that we _need_ to guarantee
+ * that the security label is applied to the packet - we do the same
+ * thing when using the socket options and it hasn't caused a problem,
+ * if we need to we can always revisit this choice later */
+
+ len_delta = opt_len - opt->optlen;
+ /* if we don't ensure enough headroom we could panic on the skb_push()
+ * call below so make sure we have enough, we are also "mangling" the
+ * packet so we should probably do a copy-on-write call anyway */
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta > 0) {
+ /* we assume that the header + opt->optlen have already been
+ * "pushed" in ip_options_build() or similar */
+ iph = ip_hdr(skb);
+ skb_push(skb, len_delta);
+ memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ } else if (len_delta < 0) {
+ iph = ip_hdr(skb);
+ memset(iph + 1, IPOPT_NOP, opt->optlen);
+ } else
+ iph = ip_hdr(skb);
+
+ if (opt->optlen > 0)
+ memset(opt, 0, sizeof(*opt));
+ opt->optlen = opt_len;
+ opt->cipso = sizeof(struct iphdr);
+ opt->is_changed = 1;
+
+ /* we have to do the following because we are being called from a
+ * netfilter hook which means the packet already has had the header
+ * fields populated and the checksum calculated - yes this means we
+ * are doing more work than needed but we do it to keep the core
+ * stack clean and tidy */
+ memcpy(iph + 1, buf, buf_len);
+ if (opt_len > buf_len)
+ memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+ if (len_delta != 0) {
+ iph->ihl = 5 + (opt_len >> 2);
+ iph->tot_len = htons(skb->len);
+ }
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char *cipso_ptr;
+
+ if (opt->cipso == 0)
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ /* the easiest thing to do is just replace the cipso option with noop
+ * options since we don't change the size of the packet, although we
+ * still need to recalculate the checksum */
+
+ iph = ip_hdr(skb);
+ cipso_ptr = (unsigned char *)iph + opt->cipso;
+ memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+ opt->cipso = 0;
+ opt->is_changed = 1;
+
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use. Returns zero on success
+ * and negative values on failure.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+ int ret_val;
+
+ ret_val = cipso_v4_cache_init();
+ if (ret_val != 0)
+ panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+ ret_val);
+
+ return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000..90c0e8386
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,123 @@
+/*
+ * common UDP/RAW code
+ * Linux INET implementation
+ *
+ * Authors:
+ * Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 saddr;
+ int oif;
+ int err;
+
+
+ if (addr_len < sizeof(*usin))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ sk_dst_reset(sk);
+
+ lock_sock(sk);
+
+ oif = sk->sk_bound_dev_if;
+ saddr = inet->inet_saddr;
+ if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+ if (!oif)
+ oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ }
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+ RT_CONN_FLAGS(sk), oif,
+ sk->sk_protocol,
+ inet->inet_sport, usin->sin_port, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+ ip_rt_put(rt);
+ err = -EACCES;
+ goto out;
+ }
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr; /* Update source address */
+ if (!inet->inet_rcv_saddr) {
+ inet->inet_rcv_saddr = fl4->saddr;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
+ sk->sk_state = TCP_ESTABLISHED;
+ inet_set_txhash(sk);
+ inet->inet_id = jiffies;
+
+ sk_dst_set(sk, &rt->dst);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(ip4_datagram_connect);
+
+/* Because UDP xmit path can manipulate sk_dst_cache without holding
+ * socket lock, we need to use sk_dst_set() here,
+ * even if we own the socket lock.
+ */
+void ip4_datagram_release_cb(struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ rcu_read_unlock();
+ return;
+ }
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+
+ dst = !IS_ERR(rt) ? &rt->dst : NULL;
+ sk_dst_set(sk, dst);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000..419d23c53
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,2386 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ * Additional Authors:
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
+ * lists.
+ * Cyrus Durgin: updated for kmod
+ * Matthias Andree: in devinet_ioctl, compare label and
+ * address (4.4BSD alias style support),
+ * fall back to comparing just the label
+ * if no match found.
+ */
+
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_addr.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+#include <linux/netconf.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/addrconf.h>
+
+#include "fib_lookup.h"
+
+static struct ipv4_devconf ipv4_devconf = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ },
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
+ [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ },
+};
+
+#define IPV4_DEVCONF_DFLT(net, attr) \
+ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
+ [IFA_LOCAL] = { .type = NLA_U32 },
+ [IFA_ADDRESS] = { .type = NLA_U32 },
+ [IFA_BROADCAST] = { .type = NLA_U32 },
+ [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .type = NLA_U32 },
+};
+
+#define IN4_ADDR_HSIZE_SHIFT 8
+#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
+
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ net_hash_mix(net);
+
+ return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ u32 hash = inet_addr_hash(net, ifa->ifa_local);
+
+ ASSERT_RTNL();
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ ASSERT_RTNL();
+ hlist_del_init_rcu(&ifa->hash);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ u32 hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
+ if (ifa->ifa_local == addr) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ result = dev;
+ break;
+ }
+ }
+ if (!result) {
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res = { 0 };
+ struct fib_table *local;
+
+ /* Fallback to FIB local table so that communication
+ * over loopback subnets work.
+ */
+ local = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local &&
+ !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+ res.type == RTN_LOCAL)
+ result = FIB_RES_DEV(res);
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
+static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
+
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+#ifdef CONFIG_SYSCTL
+static int devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ return 0;
+}
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+ return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+ struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+ if (ifa->ifa_dev)
+ in_dev_put(ifa->ifa_dev);
+ kfree(ifa);
+}
+
+static void inet_free_ifa(struct in_ifaddr *ifa)
+{
+ call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ WARN_ON(idev->ifa_list);
+ WARN_ON(idev->mc_list);
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead)
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
+ kfree(idev);
+}
+EXPORT_SYMBOL(in_dev_finish_destroy);
+
+static struct in_device *inetdev_init(struct net_device *dev)
+{
+ struct in_device *in_dev;
+ int err = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
+ if (!in_dev)
+ goto out;
+ memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+ sizeof(in_dev->cnf));
+ in_dev->cnf.sysctl = NULL;
+ in_dev->dev = dev;
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
+ goto out_kfree;
+ if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
+ dev_disable_lro(dev);
+ /* Reference in_dev->dev */
+ dev_hold(dev);
+ /* Account for reference dev->ip_ptr (below) */
+ in_dev_hold(in_dev);
+
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
+
+ /* we can receive as soon as ip_ptr is set -- do this last */
+ rcu_assign_pointer(dev->ip_ptr, in_dev);
+out:
+ return in_dev ?: ERR_PTR(err);
+out_kfree:
+ kfree(in_dev);
+ in_dev = NULL;
+ goto out;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+ in_dev_put(idev);
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ dev = in_dev->dev;
+
+ in_dev->dead = 1;
+
+ ip_mc_destroy_dev(in_dev);
+
+ while ((ifa = in_dev->ifa_list) != NULL) {
+ inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+ inet_free_ifa(ifa);
+ }
+
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
+
+ devinet_sysctl_unregister(in_dev);
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ arp_ifdown(dev);
+
+ call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
+{
+ rcu_read_lock();
+ for_primary_ifa(in_dev) {
+ if (inet_ifa_match(a, ifa)) {
+ if (!b || inet_ifa_match(b, ifa)) {
+ rcu_read_unlock();
+ return 1;
+ }
+ }
+ } endfor_ifa(in_dev);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
+{
+ struct in_ifaddr *promote = NULL;
+ struct in_ifaddr *ifa, *ifa1 = *ifap;
+ struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *prev_prom = NULL;
+ int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+
+ ASSERT_RTNL();
+
+ /* 1. Deleting primary ifaddr forces deletion all secondaries
+ * unless alias promotion is set
+ **/
+
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ ifa1->ifa_scope <= ifa->ifa_scope)
+ last_prim = ifa;
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ prev_prom = ifa;
+ continue;
+ }
+
+ if (!do_promote) {
+ inet_hash_remove(ifa);
+ *ifap1 = ifa->ifa_next;
+
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ } else {
+ promote = ifa;
+ break;
+ }
+ }
+ }
+
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
+
+ /* 3. Announce address deletion */
+
+ /* Send message first, then call notifier.
+ At first sight, FIB update triggered by notifier
+ will refer to already deleted ifaddr, that could confuse
+ netlink listeners. It is not true: look, gated sees
+ that route deleted and if it still thinks that ifaddr
+ is valid, it will try to restore deleted routes... Grr.
+ So that, this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+ if (promote) {
+ struct in_ifaddr *next_sec = promote->ifa_next;
+
+ if (prev_prom) {
+ prev_prom->ifa_next = promote->ifa_next;
+ promote->ifa_next = last_prim->ifa_next;
+ last_prim->ifa_next = promote;
+ }
+
+ promote->ifa_flags &= ~IFA_F_SECONDARY;
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_UP, promote);
+ for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa))
+ continue;
+ fib_add_ifaddr(ifa);
+ }
+
+ }
+ if (destroy)
+ inet_free_ifa(ifa1);
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy)
+{
+ __inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
+}
+
+static void check_lifetime(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
+
+static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ ASSERT_RTNL();
+
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+ ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+ prandom_seed((__force u32) ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ *ifap = ifa;
+
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+ /* Send message first, then call notifier.
+ Notifier will trigger FIB update, so that
+ listeners of netlink will know about new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+ return __inet_insert_ifa(ifa, NULL, 0);
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ inet_free_ifa(ifa);
+ return -ENOBUFS;
+ }
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ if (ifa->ifa_dev != in_dev) {
+ WARN_ON(ifa->ifa_dev);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ }
+ if (ipv4_is_loopback(ifa->ifa_local))
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ return inet_insert_ifa(ifa);
+}
+
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+ struct in_device *in_dev = NULL;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (dev)
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ rcu_read_unlock();
+ return in_dev;
+}
+EXPORT_SYMBOL(inetdev_by_index);
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+ __be32 mask)
+{
+ ASSERT_RTNL();
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+ return ifa;
+ } endfor_ifa(in_dev);
+ return NULL;
+}
+
+static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+{
+ struct ip_mreqn mreq = {
+ .imr_multiaddr.s_addr = ifa->ifa_address,
+ .imr_ifindex = ifa->ifa_dev->dev->ifindex,
+ };
+ int ret;
+
+ ASSERT_RTNL();
+
+ lock_sock(sk);
+ if (join)
+ ret = ip_mc_join_group(sk, &mreq);
+ else
+ ret = ip_mc_leave_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct in_device *in_dev;
+ struct ifaddrmsg *ifm;
+ struct in_ifaddr *ifa, **ifap;
+ int err = -EINVAL;
+
+ ASSERT_RTNL();
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ ifm = nlmsg_data(nlh);
+ in_dev = inetdev_by_index(net, ifm->ifa_index);
+ if (!in_dev) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (tb[IFA_LOCAL] &&
+ ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
+ continue;
+
+ if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
+ continue;
+
+ if (tb[IFA_ADDRESS] &&
+ (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+ !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa)))
+ continue;
+
+ if (ipv4_is_multicast(ifa->ifa_address))
+ ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
+ __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
+ return 0;
+ }
+
+ err = -EADDRNOTAVAIL;
+errout:
+ return err;
+}
+
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+static void check_lifetime(struct work_struct *work)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct in_ifaddr *ifa;
+ struct hlist_node *n;
+ int i;
+
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ bool change_needed = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ change_needed = true;
+ } else if (ifa->ifa_preferred_lft ==
+ INFINITY_LIFE_TIME) {
+ continue;
+ } else if (age >= ifa->ifa_preferred_lft) {
+ if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ, next))
+ next = ifa->ifa_tstamp +
+ ifa->ifa_valid_lft * HZ;
+
+ if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ change_needed = true;
+ } else if (time_before(ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ,
+ next)) {
+ next = ifa->ifa_tstamp +
+ ifa->ifa_preferred_lft * HZ;
+ }
+ }
+ rcu_read_unlock();
+ if (!change_needed)
+ continue;
+ rtnl_lock();
+ hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+ unsigned long age;
+
+ if (ifa->ifa_flags & IFA_F_PERMANENT)
+ continue;
+
+ /* We try to batch several events at once. */
+ age = (now - ifa->ifa_tstamp +
+ ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_valid_lft) {
+ struct in_ifaddr **ifap;
+
+ for (ifap = &ifa->ifa_dev->ifa_list;
+ *ifap != NULL; ifap = &(*ifap)->ifa_next) {
+ if (*ifap == ifa) {
+ inet_del_ifa(ifa->ifa_dev,
+ ifap, 1);
+ break;
+ }
+ }
+ } else if (ifa->ifa_preferred_lft !=
+ INFINITY_LIFE_TIME &&
+ age >= ifa->ifa_preferred_lft &&
+ !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+ }
+ rtnl_unlock();
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ now = jiffies;
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
+
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+ next_sched - now);
+}
+
+static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
+ __u32 prefered_lft)
+{
+ unsigned long timeout;
+
+ ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout))
+ ifa->ifa_valid_lft = timeout;
+ else
+ ifa->ifa_flags |= IFA_F_PERMANENT;
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa->ifa_flags |= IFA_F_DEPRECATED;
+ ifa->ifa_preferred_lft = timeout;
+ }
+ ifa->ifa_tstamp = jiffies;
+ if (!ifa->ifa_cstamp)
+ ifa->ifa_cstamp = ifa->ifa_tstamp;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
+ __u32 *pvalid_lft, __u32 *pprefered_lft)
+{
+ struct nlattr *tb[IFA_MAX+1];
+ struct in_ifaddr *ifa;
+ struct ifaddrmsg *ifm;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ ifm = nlmsg_data(nlh);
+ err = -EINVAL;
+ if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
+ goto errout;
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ err = -ENODEV;
+ if (!dev)
+ goto errout;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ err = -ENOBUFS;
+ if (!in_dev)
+ goto errout;
+
+ ifa = inet_alloc_ifa();
+ if (!ifa)
+ /*
+ * A potential indev allocation can be left alive, it stays
+ * assigned to its device and is destroy with it.
+ */
+ goto errout;
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ in_dev_hold(in_dev);
+
+ if (!tb[IFA_ADDRESS])
+ tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+ INIT_HLIST_NODE(&ifa->hash);
+ ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+ ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+ ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
+ ifm->ifa_flags;
+ ifa->ifa_scope = ifm->ifa_scope;
+ ifa->ifa_dev = in_dev;
+
+ ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
+ ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
+
+ if (tb[IFA_BROADCAST])
+ ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
+
+ if (tb[IFA_LABEL])
+ nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ err = -EINVAL;
+ goto errout_free;
+ }
+ *pvalid_lft = ci->ifa_valid;
+ *pprefered_lft = ci->ifa_prefered;
+ }
+
+ return ifa;
+
+errout_free:
+ inet_free_ifa(ifa);
+errout:
+ return ERR_PTR(err);
+}
+
+static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap;
+
+ if (!ifa->ifa_local)
+ return NULL;
+
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa) &&
+ ifa1->ifa_local == ifa->ifa_local)
+ return ifa1;
+ }
+ return NULL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct in_ifaddr *ifa;
+ struct in_ifaddr *ifa_existing;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
+
+ ASSERT_RTNL();
+
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
+ if (IS_ERR(ifa))
+ return PTR_ERR(ifa);
+
+ ifa_existing = find_matching_ifa(ifa);
+ if (!ifa_existing) {
+ /* It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
+ true, ifa);
+
+ if (ret < 0) {
+ inet_free_ifa(ifa);
+ return ret;
+ }
+ }
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ } else {
+ inet_free_ifa(ifa);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ ifa = ifa_existing;
+ set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ cancel_delayed_work(&check_lifetime_work);
+ queue_delayed_work(system_power_efficient_wq,
+ &check_lifetime_work, 0);
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+ }
+ return 0;
+}
+
+/*
+ * Determine a default network mask, based on the IP address.
+ */
+
+static int inet_abc_len(__be32 addr)
+{
+ int rc = -1; /* Something else, probably a multicast. */
+
+ if (ipv4_is_zeronet(addr))
+ rc = 0;
+ else {
+ __u32 haddr = ntohl(addr);
+
+ if (IN_CLASSA(haddr))
+ rc = 8;
+ else if (IN_CLASSB(haddr))
+ rc = 16;
+ else if (IN_CLASSC(haddr))
+ rc = 24;
+ }
+
+ return rc;
+}
+
+
+int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct ifreq ifr;
+ struct sockaddr_in sin_orig;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ struct in_device *in_dev;
+ struct in_ifaddr **ifap = NULL;
+ struct in_ifaddr *ifa = NULL;
+ struct net_device *dev;
+ char *colon;
+ int ret = -EFAULT;
+ int tryaddrmatch = 0;
+
+ /*
+ * Fetch the caller's info block into kernel space
+ */
+
+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ goto out;
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+ /* save original address for comparison */
+ memcpy(&sin_orig, sin, sizeof(*sin));
+
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ dev_load(net, ifr.ifr_name);
+
+ switch (cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ /* Note that these ioctls will not sleep,
+ so that we do not impose a lock.
+ One day we will be forced to put shlock here (I mean SMP)
+ */
+ tryaddrmatch = (sin_orig.sin_family == AF_INET);
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ break;
+
+ case SIOCSIFFLAGS:
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ ret = -EINVAL;
+ if (sin->sin_family != AF_INET)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rtnl_lock();
+
+ ret = -ENODEV;
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
+ goto done;
+
+ if (colon)
+ *colon = ':';
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ if (tryaddrmatch) {
+ /* Matthias Andree */
+ /* compare label and address (4.4BSD style) */
+ /* note: we only do this for a limited set of ioctls
+ and only if the original address family was AF_INET.
+ This is checked above. */
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next) {
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+ sin_orig.sin_addr.s_addr ==
+ ifa->ifa_local) {
+ break; /* found */
+ }
+ }
+ }
+ /* we didn't get a match, maybe the application is
+ 4.3BSD-style and passed in junk so we fall back to
+ comparing just the label */
+ if (!ifa) {
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ ifap = &ifa->ifa_next)
+ if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+ break;
+ }
+ }
+
+ ret = -EADDRNOTAVAIL;
+ if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+ goto done;
+
+ switch (cmd) {
+ case SIOCGIFADDR: /* Get interface address */
+ sin->sin_addr.s_addr = ifa->ifa_local;
+ goto rarok;
+
+ case SIOCGIFBRDADDR: /* Get the broadcast address */
+ sin->sin_addr.s_addr = ifa->ifa_broadcast;
+ goto rarok;
+
+ case SIOCGIFDSTADDR: /* Get the destination address */
+ sin->sin_addr.s_addr = ifa->ifa_address;
+ goto rarok;
+
+ case SIOCGIFNETMASK: /* Get the netmask for the interface */
+ sin->sin_addr.s_addr = ifa->ifa_mask;
+ goto rarok;
+
+ case SIOCSIFFLAGS:
+ if (colon) {
+ ret = -EADDRNOTAVAIL;
+ if (!ifa)
+ break;
+ ret = 0;
+ if (!(ifr.ifr_flags & IFF_UP))
+ inet_del_ifa(in_dev, ifap, 1);
+ break;
+ }
+ ret = dev_change_flags(dev, ifr.ifr_flags);
+ break;
+
+ case SIOCSIFADDR: /* Set interface address (and family) */
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+
+ if (!ifa) {
+ ret = -ENOBUFS;
+ ifa = inet_alloc_ifa();
+ if (!ifa)
+ break;
+ INIT_HLIST_NODE(&ifa->hash);
+ if (colon)
+ memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+ else
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ } else {
+ ret = 0;
+ if (ifa->ifa_local == sin->sin_addr.s_addr)
+ break;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = 0;
+ ifa->ifa_scope = 0;
+ }
+
+ ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+ if (!(dev->flags & IFF_POINTOPOINT)) {
+ ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ if ((dev->flags & IFF_BROADCAST) &&
+ ifa->ifa_prefixlen < 31)
+ ifa->ifa_broadcast = ifa->ifa_address |
+ ~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
+ }
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ ret = inet_set_ifa(dev, ifa);
+ break;
+
+ case SIOCSIFBRDADDR: /* Set the broadcast address */
+ ret = 0;
+ if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_broadcast = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ }
+ break;
+
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ ret = 0;
+ if (ifa->ifa_address == sin->sin_addr.s_addr)
+ break;
+ ret = -EINVAL;
+ if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+ break;
+ ret = 0;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_address = sin->sin_addr.s_addr;
+ inet_insert_ifa(ifa);
+ break;
+
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+
+ /*
+ * The mask we set must be legal.
+ */
+ ret = -EINVAL;
+ if (bad_mask(sin->sin_addr.s_addr, 0))
+ break;
+ ret = 0;
+ if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+ __be32 old_mask = ifa->ifa_mask;
+ inet_del_ifa(in_dev, ifap, 0);
+ ifa->ifa_mask = sin->sin_addr.s_addr;
+ ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+ /* See if current broadcast address matches
+ * with current netmask, then recalculate
+ * the broadcast address. Otherwise it's a
+ * funny address, so don't touch it since
+ * the user seems to know what (s)he's doing...
+ */
+ if ((dev->flags & IFF_BROADCAST) &&
+ (ifa->ifa_prefixlen < 31) &&
+ (ifa->ifa_broadcast ==
+ (ifa->ifa_local|~old_mask))) {
+ ifa->ifa_broadcast = (ifa->ifa_local |
+ ~sin->sin_addr.s_addr);
+ }
+ inet_insert_ifa(ifa);
+ }
+ break;
+ }
+done:
+ rtnl_unlock();
+out:
+ return ret;
+rarok:
+ rtnl_unlock();
+ ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+ goto out;
+}
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct in_ifaddr *ifa;
+ struct ifreq ifr;
+ int done = 0;
+
+ if (!in_dev)
+ goto out;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (!buf) {
+ done += sizeof(ifr);
+ continue;
+ }
+ if (len < (int) sizeof(ifr))
+ break;
+ memset(&ifr, 0, sizeof(struct ifreq));
+ strcpy(ifr.ifr_name, ifa->ifa_label);
+
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+ ifa->ifa_local;
+
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+ done = -EFAULT;
+ break;
+ }
+ buf += sizeof(struct ifreq);
+ len -= sizeof(struct ifreq);
+ done += sizeof(struct ifreq);
+ }
+out:
+ return done;
+}
+
+__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+{
+ __be32 addr = 0;
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto no_in_dev;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr)
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+
+ if (addr)
+ goto out_unlock;
+no_in_dev:
+
+ /* Not loopback addresses on loopback should be preferred
+ in this case. It is important that lo is the first interface
+ in dev_base list.
+ */
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock;
+ }
+ } endfor_ifa(in_dev);
+ }
+out_unlock:
+ rcu_read_unlock();
+ return addr;
+}
+EXPORT_SYMBOL(inet_select_addr);
+
+static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
+ __be32 local, int scope)
+{
+ int same = 0;
+ __be32 addr = 0;
+
+ for_ifa(in_dev) {
+ if (!addr &&
+ (local == ifa->ifa_local || !local) &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ if (same)
+ break;
+ }
+ if (!same) {
+ same = (!local || inet_ifa_match(local, ifa)) &&
+ (!dst || inet_ifa_match(dst, ifa));
+ if (same && addr) {
+ if (local || !dst)
+ break;
+ /* Is the selected addr into dst subnet? */
+ if (inet_ifa_match(addr, ifa))
+ break;
+ /* No, then can we use new local src? */
+ if (ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ /* search for large dst subnet for addr */
+ same = 0;
+ }
+ }
+ } endfor_ifa(in_dev);
+
+ return same ? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - net: netns to check, cannot be NULL
+ * - in_dev: only on this interface, NULL=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
+ __be32 dst, __be32 local, int scope)
+{
+ __be32 addr = 0;
+ struct net_device *dev;
+
+ if (in_dev)
+ return confirm_addr_indev(in_dev, dst, local, scope);
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
+ addr = confirm_addr_indev(in_dev, dst, local, scope);
+ if (addr)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return addr;
+}
+EXPORT_SYMBOL(inet_confirm_addr);
+
+/*
+ * Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_notifier);
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+ struct in_ifaddr *ifa;
+ int named = 0;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ char old[IFNAMSIZ], *dot;
+
+ memcpy(old, ifa->ifa_label, IFNAMSIZ);
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (named++ == 0)
+ goto skip;
+ dot = strchr(old, ':');
+ if (!dot) {
+ sprintf(old, ":%d", named);
+ dot = old;
+ }
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
+ strcat(ifa->ifa_label, dot);
+ else
+ strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+skip:
+ rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+ }
+}
+
+static bool inetdev_valid_mtu(unsigned int mtu)
+{
+ return mtu >= 68;
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+ struct in_device *in_dev)
+
+{
+ struct in_ifaddr *ifa;
+
+ for (ifa = in_dev->ifa_list; ifa;
+ ifa = ifa->ifa_next) {
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ifa->ifa_local, dev,
+ ifa->ifa_local, NULL,
+ dev->dev_addr, NULL);
+ }
+}
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+ ASSERT_RTNL();
+
+ if (!in_dev) {
+ if (event == NETDEV_REGISTER) {
+ in_dev = inetdev_init(dev);
+ if (IS_ERR(in_dev))
+ return notifier_from_errno(PTR_ERR(in_dev));
+ if (dev->flags & IFF_LOOPBACK) {
+ IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+ IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+ }
+ } else if (event == NETDEV_CHANGEMTU) {
+ /* Re-enabling IP */
+ if (inetdev_valid_mtu(dev->mtu))
+ in_dev = inetdev_init(dev);
+ }
+ goto out;
+ }
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ pr_debug("%s: bug\n", __func__);
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
+ break;
+ case NETDEV_UP:
+ if (!inetdev_valid_mtu(dev->mtu))
+ break;
+ if (dev->flags & IFF_LOOPBACK) {
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
+ INFINITY_LIFE_TIME);
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ inet_insert_ifa(ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
+ /* Send gratuitous ARP to notify of link change */
+ inetdev_send_gratuitous_arp(dev, in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ ip_mc_unmap(in_dev);
+ break;
+ case NETDEV_POST_TYPE_CHANGE:
+ ip_mc_remap(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (inetdev_valid_mtu(dev->mtu))
+ break;
+ /* disable IP when MTU is not enough */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ /* Do not notify about label change, this event is
+ * not interesting to applications using netlink.
+ */
+ inetdev_changename(dev, in_dev);
+
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+ .notifier_call = inetdev_event,
+};
+
+static size_t inet_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(4) /* IFA_ADDRESS */
+ + nla_total_size(4) /* IFA_LOCAL */
+ + nla_total_size(4) /* IFA_BROADCAST */
+ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
+}
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+ u32 preferred, valid;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+ ifm->ifa_flags = ifa->ifa_flags;
+ ifm->ifa_scope = ifa->ifa_scope;
+ ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+
+ if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ preferred = ifa->ifa_preferred_lft;
+ valid = ifa->ifa_valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+ if ((ifa->ifa_address &&
+ nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
+ (ifa->ifa_local &&
+ nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
+ (ifa->ifa_broadcast &&
+ nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
+ (ifa->ifa_label[0] &&
+ nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
+ nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ preferred, valid))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR, NLM_F_MULTI) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
+
+ return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 portid)
+{
+ struct sk_buff *skb;
+ u32 seq = nlh ? nlh->nlmsg_seq : 0;
+ int err = -ENOBUFS;
+ struct net *net;
+
+ net = dev_net(ifa->ifa_dev->dev);
+ skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+}
+
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+
+ if (!in_dev)
+ return 0;
+
+ return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+ struct nlattr *nla;
+ int i;
+
+ if (!in_dev)
+ return -ENODATA;
+
+ nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+ if (!nla)
+ return -EMSGSIZE;
+
+ for (i = 0; i < IPV4_DEVCONF_MAX; i++)
+ ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+
+ return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+ [IFLA_INET_CONF] = { .type = NLA_NESTED },
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla)
+{
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int err, rem;
+
+ if (dev && !__in_dev_get_rtnl(dev))
+ return -EAFNOSUPPORT;
+
+ err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
+ int cfgid = nla_type(a);
+
+ if (nla_len(a) < 4)
+ return -EINVAL;
+
+ if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ struct nlattr *a, *tb[IFLA_INET_MAX+1];
+ int rem;
+
+ if (!in_dev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+ BUG();
+
+ if (tb[IFLA_INET_CONF]) {
+ nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
+ ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
+ }
+
+ return 0;
+}
+
+static int inet_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+
+ /* type -1 is used for ALL */
+ if (type == -1 || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_RP_FILTER)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+ if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv4_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ /* type -1 is used for ALL */
+ if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ nla_put_s32(skb, NETCONFA_RP_FILTER,
+ IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+ goto nla_put_failure;
+ if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
+ struct ipv4_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ RTM_NEWNETCONF, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+};
+
+static int inet_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct netconfmsg *ncm;
+ struct sk_buff *skb;
+ struct ipv4_devconf *devconf;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ int ifindex;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ err = EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv4.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv4.devconf_dflt;
+ break;
+ default:
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ goto errout;
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ goto errout;
+ devconf = &in_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ -1);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
+static int inet_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ if (inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && !test_bit(i, in_dev->cnf.state))
+ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+ }
+ rcu_read_unlock();
+}
+
+/* called with RTNL locked */
+static void inet_forward_change(struct net *net)
+{
+ struct net_device *dev;
+ int on = IPV4_DEVCONF_ALL(net, FORWARDING);
+
+ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
+ IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev;
+ if (on)
+ dev_disable_lro(dev);
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
+ IN_DEV_CONF_SET(in_dev, FORWARDING, on);
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
+{
+ if (cnf == net->ipv4.devconf_dflt)
+ return NETCONFA_IFINDEX_DEFAULT;
+ else if (cnf == net->ipv4.devconf_all)
+ return NETCONFA_IFINDEX_ALL;
+ else {
+ struct in_device *idev
+ = container_of(cnf, struct in_device, cnf);
+ return idev->dev->ifindex;
+ }
+}
+
+static int devinet_conf_proc(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_value = *(int *)ctl->data;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ int new_value = *(int *)ctl->data;
+
+ if (write) {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct net *net = ctl->extra2;
+ int i = (int *)ctl->data - cnf->data;
+ int ifindex;
+
+ set_bit(i, cnf->state);
+
+ if (cnf == net->ipv4.devconf_dflt)
+ devinet_copy_dflt_conf(net, i);
+ if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
+ i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
+ if ((new_value == 0) && (old_value != 0))
+ rt_cache_flush(net);
+
+ if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
+ ifindex, cnf);
+ }
+ if (i == IPV4_DEVCONF_PROXY_ARP - 1 &&
+ new_value != old_value) {
+ ifindex = devinet_conf_ifindex(net, cnf);
+ inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ ifindex, cnf);
+ }
+ }
+
+ return ret;
+}
+
+static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+ struct net *net = ctl->extra2;
+
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+ if (!rtnl_trylock()) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
+ return restart_syscall();
+ }
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net);
+ } else {
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ if (*valp)
+ dev_disable_lro(idev->dev);
+ inet_netconf_notify_devconf(net,
+ NETCONFA_FORWARDING,
+ idev->dev->ifindex,
+ cnf);
+ }
+ rtnl_unlock();
+ rt_cache_flush(net);
+ } else
+ inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt);
+ }
+
+ return ret;
+}
+
+static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
+
+ if (write && *valp != val)
+ rt_cache_flush(net);
+
+ return ret;
+}
+
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
+ { \
+ .procname = name, \
+ .data = ipv4_devconf.data + \
+ IPV4_DEVCONF_ ## attr - 1, \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ .extra1 = &ipv4_devconf, \
+ }
+
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
+
+static struct devinet_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+} devinet_sysctl = {
+ .devinet_vars = {
+ DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+ devinet_sysctl_forward),
+ DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+ DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+ "accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+ DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+ DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+ DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+ DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+ DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+ DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv2_unsolicited_report_interval"),
+ DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+ "igmpv3_unsolicited_report_interval"),
+
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+ "promote_secondaries"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
+ "route_localnet"),
+ },
+};
+
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+ struct ipv4_devconf *p)
+{
+ int i;
+ struct devinet_sysctl_table *t;
+ char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
+
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out;
+
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+ t->devinet_vars[i].extra1 = p;
+ t->devinet_vars[i].extra2 = net;
+ }
+
+ snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
+
+ t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
+ if (!t->sysctl_header)
+ goto free;
+
+ p->sysctl = t;
+ return 0;
+
+free:
+ kfree(t);
+out:
+ return -ENOBUFS;
+}
+
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+{
+ struct devinet_sysctl_table *t = cnf->sysctl;
+
+ if (!t)
+ return;
+
+ cnf->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+}
+
+static int devinet_sysctl_register(struct in_device *idev)
+{
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ if (err)
+ return err;
+ err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ &idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->arp_parms);
+ return err;
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{
+ __devinet_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = {
+ {
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ IPV4_DEVCONF_FORWARDING - 1],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net,
+ },
+ { },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net)
+{
+ int err;
+ struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table_header *forw_hdr;
+#endif
+
+ err = -ENOMEM;
+ all = &ipv4_devconf;
+ dflt = &ipv4_devconf_dflt;
+
+ if (!net_eq(net, &init_net)) {
+ all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
+
+ dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (!tbl)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
+ }
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all", all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default", dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
+ if (!forw_hdr)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:
+ __devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+ __devinet_sysctl_unregister(all);
+err_reg_all:
+ if (tbl != ctl_forward_entry)
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ if (dflt != &ipv4_devconf_dflt)
+ kfree(dflt);
+err_alloc_dflt:
+ if (all != &ipv4_devconf)
+ kfree(all);
+err_alloc_all:
+ return err;
+}
+
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.forw_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+ __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ kfree(tbl);
+#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops __read_mostly = {
+ .family = AF_INET,
+ .fill_link_af = inet_fill_link_af,
+ .get_link_af_size = inet_get_link_af_size,
+ .validate_link_af = inet_validate_link_af,
+ .set_link_af = inet_set_link_af,
+};
+
+void __init devinet_init(void)
+{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
+ register_pernet_subsys(&devinet_ops);
+
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+
+ queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+
+ rtnl_af_register(&inet_af_ops);
+
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
+ rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
+ inet_netconf_dump_devconf, NULL);
+}
+
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000..30b544f02
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,731 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+{
+ unsigned int len;
+
+ len = seqhilen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+ struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_givcrypt_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_givcrypt_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+ struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct aead_givcrypt_request *req;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ struct sk_buff *trailer;
+ void *tmp;
+ u8 *iv;
+ u8 *tail;
+ int blksize;
+ int clen;
+ int alen;
+ int plen;
+ int tfclen;
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+
+ /* skb is pure payload to encrypt */
+
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ plen = clen - skb->len - tfclen;
+
+ err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_givreq(aead, iv);
+ asg = esp_givreq_sg(aead, req);
+ sg = asg + sglists;
+
+ /* Fill padding... */
+ tail = skb_tail_pointer(trailer);
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = *skb_mac_header(skb);
+ pskb_put(skb, trailer, clen - skb->len + alen);
+
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ /* this is non-NULL only with UDP Encapsulation */
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh;
+ __be32 *udpdata32;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ uh = (struct udphdr *)esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(skb->len - skb_transport_offset(skb));
+ uh->check = 0;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = (struct ip_esp_hdr *)(uh + 1);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ break;
+ }
+
+ *skb_mac_header(skb) = IPPROTO_UDP;
+ }
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+ clen + alen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+ aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+ aead_givcrypt_set_assoc(req, asg, assoclen);
+ aead_givcrypt_set_giv(req, esph->enc_data,
+ XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_givencrypt(req);
+ if (err == -EINPROGRESS)
+ goto error;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+
+ kfree(tmp);
+
+error:
+ return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err)
+{
+ const struct iphdr *iph;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen = crypto_aead_authsize(aead);
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int elen = skb->len - hlen;
+ int ihl;
+ u8 nexthdr[2];
+ int padlen;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+ BUG();
+
+ err = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen)
+ goto out;
+
+ /* ... check padding bits here. Silly. :-) */
+
+ iph = ip_hdr(skb);
+ ihl = iph->ihl * 4;
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertize the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (iph->saddr != x->props.saddr.a4 ||
+ uh->source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ ipaddr.a4 = iph->saddr;
+ km_new_mapping(x, &ipaddr, uh->source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ pskb_trim(skb, skb->len - alen - padlen - 2);
+ __skb_pull(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr[1];
+
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
+
+out:
+ return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+ void *tmp;
+ u8 *iv;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ int err = -EINVAL;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ goto out;
+
+ if (elen <= 0)
+ goto out;
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ err = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ asg = esp_req_sg(aead, req);
+ sg = asg + sglists;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr *)skb->data;
+
+ /* Get ivec. This can be wrong, check against another impls. */
+ iv = esph->enc_data;
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ aead_request_set_crypt(req, sg, sg, elen, iv);
+ aead_request_set_assoc(req, asg, assoclen);
+
+ err = crypto_aead_decrypt(req);
+ if (err == -EINPROGRESS)
+ goto out;
+
+ err = esp_input_done2(skb, err);
+
+out:
+ return err;
+}
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+{
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ unsigned int net_adj;
+
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ case XFRM_MODE_BEET:
+ net_adj = sizeof(struct iphdr);
+ break;
+ case XFRM_MODE_TUNNEL:
+ net_adj = 0;
+ break;
+ default:
+ BUG();
+ }
+
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
+}
+
+static int esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+ struct crypto_aead *aead = x->data;
+
+ if (!aead)
+ return;
+
+ crypto_free_aead(aead);
+}
+
+static int esp_init_aead(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ int err;
+
+ aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+error:
+ return err;
+}
+
+static int esp_init_authenc(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -EINVAL;
+ if (!x->ealg)
+ goto error;
+
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
+ goto error;
+
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+ crypto_aead_authsize(aead)) {
+ pr_info("ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_aead_authsize(aead),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
+ goto free_key;
+ }
+
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err)
+ goto free_key;
+ }
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key);
+
+error:
+ return err;
+}
+
+static int esp_init_state(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ x->data = NULL;
+
+ if (x->aead)
+ err = esp_init_aead(x);
+ else
+ err = esp_init_authenc(x);
+
+ if (err)
+ goto error;
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ x->props.header_len += sizeof(struct iphdr);
+ else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+ x->props.header_len += IPV4_BEET_PHMAXLEN;
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+ break;
+ }
+ }
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
+
+error:
+ return err;
+}
+
+static int esp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type esp_type =
+{
+ .description = "ESP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = esp_init_state,
+ .destructor = esp_destroy,
+ .get_mtu = esp4_get_mtu,
+ .input = esp_input,
+ .output = esp_output
+};
+
+static struct xfrm4_protocol esp4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = esp4_rcv_cb,
+ .err_handler = esp4_err,
+ .priority = 0,
+};
+
+static int __init esp4_init(void)
+{
+ if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&esp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit esp4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000..872494e6e
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,1251 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/xfrm.h>
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+static int __net_init fib4_rules_init(struct net *net)
+{
+ struct fib_table *local_table, *main_table;
+
+ main_table = fib_trie_table(RT_TABLE_MAIN, NULL);
+ if (!main_table)
+ return -ENOMEM;
+
+ local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+ if (!local_table)
+ goto fail;
+
+ hlist_add_head_rcu(&local_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+ hlist_add_head_rcu(&main_table->tb_hlist,
+ &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+ return 0;
+
+fail:
+ fib_free_table(main_table);
+ return -ENOMEM;
+}
+#else
+
+struct fib_table *fib_new_table(struct net *net, u32 id)
+{
+ struct fib_table *tb, *alias = NULL;
+ unsigned int h;
+
+ if (id == 0)
+ id = RT_TABLE_MAIN;
+ tb = fib_get_table(net, id);
+ if (tb)
+ return tb;
+
+ if (id == RT_TABLE_LOCAL)
+ alias = fib_new_table(net, RT_TABLE_MAIN);
+
+ tb = fib_trie_table(id, alias);
+ if (!tb)
+ return NULL;
+
+ switch (id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, tb);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, tb);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, tb);
+ break;
+ default:
+ break;
+ }
+
+ h = id & (FIB_TABLE_HASHSZ - 1);
+ hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
+ return tb;
+}
+
+/* caller must hold either rtnl or rcu read lock */
+struct fib_table *fib_get_table(struct net *net, u32 id)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ if (id == 0)
+ id = RT_TABLE_MAIN;
+ h = id & (FIB_TABLE_HASHSZ - 1);
+
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ if (tb->tb_id == id)
+ return tb;
+ }
+ return NULL;
+}
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+static void fib_replace_table(struct net *net, struct fib_table *old,
+ struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (new->tb_id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, new);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, new);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, new);
+ break;
+ default:
+ break;
+ }
+
+#endif
+ /* replace the old table in the hlist */
+ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+int fib_unmerge(struct net *net)
+{
+ struct fib_table *old, *new;
+
+ /* attempt to fetch local table if it has been allocated */
+ old = fib_get_table(net, RT_TABLE_LOCAL);
+ if (!old)
+ return 0;
+
+ new = fib_trie_unmerge(old);
+ if (!new)
+ return -ENOMEM;
+
+ /* replace merged table with clean table */
+ if (new != old) {
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+ }
+
+ return 0;
+}
+
+static void fib_flush(struct net *net)
+{
+ int flushed = 0;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
+ flushed += fib_table_flush(tb);
+ }
+
+ if (flushed)
+ rt_cache_flush(net);
+}
+
+void fib_flush_external(struct net *net)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist)
+ fib_table_flush_external(tb);
+ }
+}
+
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * on_dev is NULL then all interfaces are taken into consideration.
+ */
+static inline unsigned int __inet_dev_addr_type(struct net *net,
+ const struct net_device *dev,
+ __be32 addr)
+{
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res;
+ unsigned int ret = RTN_BROADCAST;
+ struct fib_table *local_table;
+
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+ return RTN_BROADCAST;
+ if (ipv4_is_multicast(addr))
+ return RTN_MULTICAST;
+
+ rcu_read_lock();
+
+ local_table = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local_table) {
+ ret = RTN_UNICAST;
+ if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+ if (!dev || dev == res.fi->fib_dev)
+ ret = res.type;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
+{
+ return __inet_dev_addr_type(net, dev, addr);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+__be32 fib_compute_spec_dst(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct net *net;
+ int scope;
+
+ rt = skb_rtable(skb);
+ if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
+ RTCF_LOCAL)
+ return ip_hdr(skb)->daddr;
+
+ in_dev = __in_dev_get_rcu(dev);
+ BUG_ON(!in_dev);
+
+ net = dev_net(dev);
+
+ scope = RT_SCOPE_UNIVERSE;
+ if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ fl4.daddr = ip_hdr(skb)->saddr;
+ fl4.saddr = 0;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_scope = scope;
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ if (!fib_lookup(net, &fl4, &res))
+ return FIB_RES_PREFSRC(net, res);
+ } else {
+ scope = RT_SCOPE_LINK;
+ }
+
+ return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ */
+static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ int rpf, struct in_device *idev, u32 *itag)
+{
+ int ret, no_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct net *net;
+ bool dev_match;
+
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ no_addr = idev->ifa_list == NULL;
+
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
+
+ net = dev_net(dev);
+ if (fib_lookup(net, &fl4, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST &&
+ (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
+ goto e_inval;
+ if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
+ goto last_resort;
+ fib_combine_itag(itag, &res);
+ dev_match = false;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match) {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ return ret;
+ }
+ if (no_addr)
+ goto last_resort;
+ if (rpf == 1)
+ goto e_rpf;
+ fl4.flowi4_oif = dev->ifindex;
+
+ ret = 0;
+ if (fib_lookup(net, &fl4, &res) == 0) {
+ if (res.type == RTN_UNICAST)
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ return ret;
+
+last_resort:
+ if (rpf)
+ goto e_rpf;
+ *itag = 0;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+e_rpf:
+ return -EXDEV;
+}
+
+/* Ignore rp_filter for packets protected by IPsec. */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
+ u8 tos, int oif, struct net_device *dev,
+ struct in_device *idev, u32 *itag)
+{
+ int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+
+ if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+ IN_DEV_ACCEPT_LOCAL(idev) &&
+ (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+ *itag = 0;
+ return 0;
+ }
+ return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+}
+
+static inline __be32 sk_extract_addr(struct sockaddr *addr)
+{
+ return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+}
+
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+ struct nlattr *nla;
+
+ nla = (struct nlattr *) ((char *) mx + len);
+ nla->nla_type = type;
+ nla->nla_len = nla_attr_size(4);
+ *(u32 *) nla_data(nla) = value;
+
+ return len + nla_total_size(4);
+}
+
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
+ struct fib_config *cfg)
+{
+ __be32 addr;
+ int plen;
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (rt->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /*
+ * Check mask for validity:
+ * a) it must be contiguous.
+ * b) destination must have all host bits clear.
+ * c) if application forgot to set correct family (AF_INET),
+ * reject request unless it is absolutely clear i.e.
+ * both family and mask are zero.
+ */
+ plen = 32;
+ addr = sk_extract_addr(&rt->rt_dst);
+ if (!(rt->rt_flags & RTF_HOST)) {
+ __be32 mask = sk_extract_addr(&rt->rt_genmask);
+
+ if (rt->rt_genmask.sa_family != AF_INET) {
+ if (mask || rt->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+
+ if (bad_mask(mask, addr))
+ return -EINVAL;
+
+ plen = inet_mask_len(mask);
+ }
+
+ cfg->fc_dst_len = plen;
+ cfg->fc_dst = addr;
+
+ if (cmd != SIOCDELRT) {
+ cfg->fc_nlflags = NLM_F_CREATE;
+ cfg->fc_protocol = RTPROT_BOOT;
+ }
+
+ if (rt->rt_metric)
+ cfg->fc_priority = rt->rt_metric - 1;
+
+ if (rt->rt_flags & RTF_REJECT) {
+ cfg->fc_scope = RT_SCOPE_HOST;
+ cfg->fc_type = RTN_UNREACHABLE;
+ return 0;
+ }
+
+ cfg->fc_scope = RT_SCOPE_NOWHERE;
+ cfg->fc_type = RTN_UNICAST;
+
+ if (rt->rt_dev) {
+ char *colon;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+
+ if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+
+ devname[IFNAMSIZ-1] = 0;
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+ dev = __dev_get_by_name(net, devname);
+ if (!dev)
+ return -ENODEV;
+ cfg->fc_oif = dev->ifindex;
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return -ENODEV;
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (!ifa)
+ return -ENODEV;
+ cfg->fc_prefsrc = ifa->ifa_local;
+ }
+ }
+
+ addr = sk_extract_addr(&rt->rt_gateway);
+ if (rt->rt_gateway.sa_family == AF_INET && addr) {
+ cfg->fc_gw = addr;
+ if (rt->rt_flags & RTF_GATEWAY &&
+ inet_addr_type(net, addr) == RTN_UNICAST)
+ cfg->fc_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ return -EINVAL;
+
+ if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+ cfg->fc_scope = RT_SCOPE_LINK;
+
+ if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+ struct nlattr *mx;
+ int len = 0;
+
+ mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
+ if (!mx)
+ return -ENOMEM;
+
+ if (rt->rt_flags & RTF_MTU)
+ len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+ if (rt->rt_flags & RTF_WINDOW)
+ len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+ if (rt->rt_flags & RTF_IRTT)
+ len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+ cfg->fc_mx = mx;
+ cfg->fc_mx_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
+ */
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct fib_config cfg;
+ struct rtentry rt;
+ int err;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&rt, arg, sizeof(rt)))
+ return -EFAULT;
+
+ rtnl_lock();
+ err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
+ if (err == 0) {
+ struct fib_table *tb;
+
+ if (cmd == SIOCDELRT) {
+ tb = fib_get_table(net, cfg.fc_table);
+ if (tb)
+ err = fib_table_delete(tb, &cfg);
+ else
+ err = -ESRCH;
+ } else {
+ tb = fib_new_table(net, cfg.fc_table);
+ if (tb)
+ err = fib_table_insert(tb, &cfg);
+ else
+ err = -ENOBUFS;
+ }
+
+ /* allocated by rtentry_to_fib_config() */
+ kfree(cfg.fc_mx);
+ }
+ rtnl_unlock();
+ return err;
+ }
+ return -EINVAL;
+}
+
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_SRC] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+ [RTA_GATEWAY] = { .type = NLA_U32 },
+ [RTA_PRIORITY] = { .type = NLA_U32 },
+ [RTA_PREFSRC] = { .type = NLA_U32 },
+ [RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_FLOW] = { .type = NLA_U32 },
+};
+
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct fib_config *cfg)
+{
+ struct nlattr *attr;
+ int err, remaining;
+ struct rtmsg *rtm;
+
+ err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ memset(cfg, 0, sizeof(*cfg));
+
+ rtm = nlmsg_data(nlh);
+ cfg->fc_dst_len = rtm->rtm_dst_len;
+ cfg->fc_tos = rtm->rtm_tos;
+ cfg->fc_table = rtm->rtm_table;
+ cfg->fc_protocol = rtm->rtm_protocol;
+ cfg->fc_scope = rtm->rtm_scope;
+ cfg->fc_type = rtm->rtm_type;
+ cfg->fc_flags = rtm->rtm_flags;
+ cfg->fc_nlflags = nlh->nlmsg_flags;
+
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (cfg->fc_type > RTN_MAX) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
+ switch (nla_type(attr)) {
+ case RTA_DST:
+ cfg->fc_dst = nla_get_be32(attr);
+ break;
+ case RTA_OIF:
+ cfg->fc_oif = nla_get_u32(attr);
+ break;
+ case RTA_GATEWAY:
+ cfg->fc_gw = nla_get_be32(attr);
+ break;
+ case RTA_PRIORITY:
+ cfg->fc_priority = nla_get_u32(attr);
+ break;
+ case RTA_PREFSRC:
+ cfg->fc_prefsrc = nla_get_be32(attr);
+ break;
+ case RTA_METRICS:
+ cfg->fc_mx = nla_data(attr);
+ cfg->fc_mx_len = nla_len(attr);
+ break;
+ case RTA_MULTIPATH:
+ cfg->fc_mp = nla_data(attr);
+ cfg->fc_mp_len = nla_len(attr);
+ break;
+ case RTA_FLOW:
+ cfg->fc_flow = nla_get_u32(attr);
+ break;
+ case RTA_TABLE:
+ cfg->fc_table = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ return 0;
+errout:
+ return err;
+}
+
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_config cfg;
+ struct fib_table *tb;
+ int err;
+
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
+ if (err < 0)
+ goto errout;
+
+ tb = fib_get_table(net, cfg.fc_table);
+ if (!tb) {
+ err = -ESRCH;
+ goto errout;
+ }
+
+ err = fib_table_delete(tb, &cfg);
+errout:
+ return err;
+}
+
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fib_config cfg;
+ struct fib_table *tb;
+ int err;
+
+ err = rtm_to_fib_config(net, skb, nlh, &cfg);
+ if (err < 0)
+ goto errout;
+
+ tb = fib_new_table(net, cfg.fc_table);
+ if (!tb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = fib_table_insert(tb, &cfg);
+errout:
+ return err;
+}
+
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int h, s_h;
+ unsigned int e = 0, s_e;
+ struct fib_table *tb;
+ struct hlist_head *head;
+ int dumped = 0;
+
+ if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+ return skb->len;
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ rcu_read_lock();
+
+ for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+ e = 0;
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ if (e < s_e)
+ goto next;
+ if (dumped)
+ memset(&cb->args[2], 0, sizeof(cb->args) -
+ 2 * sizeof(cb->args[0]));
+ if (fib_table_dump(tb, skb, cb) < 0)
+ goto out;
+ dumped = 1;
+next:
+ e++;
+ }
+ }
+out:
+ rcu_read_unlock();
+
+ cb->args[1] = e;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+/* Prepare and feed intra-kernel routing request.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
+ */
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+ struct net *net = dev_net(ifa->ifa_dev->dev);
+ struct fib_table *tb;
+ struct fib_config cfg = {
+ .fc_protocol = RTPROT_KERNEL,
+ .fc_type = type,
+ .fc_dst = dst,
+ .fc_dst_len = dst_len,
+ .fc_prefsrc = ifa->ifa_local,
+ .fc_oif = ifa->ifa_dev->dev->ifindex,
+ .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+ .fc_nlinfo = {
+ .nl_net = net,
+ },
+ };
+
+ if (type == RTN_UNICAST)
+ tb = fib_new_table(net, RT_TABLE_MAIN);
+ else
+ tb = fib_new_table(net, RT_TABLE_LOCAL);
+
+ if (!tb)
+ return;
+
+ cfg.fc_table = tb->tb_id;
+
+ if (type != RTN_LOCAL)
+ cfg.fc_scope = RT_SCOPE_LINK;
+ else
+ cfg.fc_scope = RT_SCOPE_HOST;
+
+ if (cmd == RTM_NEWROUTE)
+ fib_table_insert(tb, &cfg);
+ else
+ fib_table_delete(tb, &cfg);
+}
+
+void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ __be32 mask = ifa->ifa_mask;
+ __be32 addr = ifa->ifa_local;
+ __be32 prefix = ifa->ifa_address & mask;
+
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (!prim) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
+ return;
+ }
+ }
+
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+ if (!(dev->flags & IFF_UP))
+ return;
+
+ /* Add broadcast address, if it is explicitly assigned. */
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
+
+ /* Add network specific broadcasts, when it takes a sense */
+ if (ifa->ifa_prefixlen < 31) {
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
+ }
+ }
+}
+
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned int ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
+
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (!prim) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
+ return;
+ }
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
+ subnet = 1;
+ }
+
+ /* Deletion is more complicated than add.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
+ */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
+ }
+
+ if (!(ok & BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ }
+ if (!(ok & LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+ /* Check, that this local address finally disappeared. */
+ if (gone &&
+ inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ /* And the last, but not the least thing.
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ fib_flush(dev_net(dev));
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
+{
+
+ struct fib_result res;
+ struct flowi4 fl4 = {
+ .flowi4_mark = frn->fl_mark,
+ .daddr = frn->fl_addr,
+ .flowi4_tos = frn->fl_tos,
+ .flowi4_scope = frn->fl_scope,
+ };
+ struct fib_table *tb;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(net, frn->tb_id_in);
+
+ frn->err = -ENOENT;
+ if (tb) {
+ local_bh_disable();
+
+ frn->tb_id = tb->tb_id;
+ frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+
+ if (!frn->err) {
+ frn->prefixlen = res.prefixlen;
+ frn->nh_sel = res.nh_sel;
+ frn->type = res.type;
+ frn->scope = res.scope;
+ }
+ local_bh_enable();
+ }
+
+ rcu_read_unlock();
+}
+
+static void nl_fib_input(struct sk_buff *skb)
+{
+ struct net *net;
+ struct fib_result_nl *frn;
+ struct nlmsghdr *nlh;
+ u32 portid;
+
+ net = sock_net(skb->sk);
+ nlh = nlmsg_hdr(skb);
+ if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(*frn))
+ return;
+
+ skb = netlink_skb_clone(skb, GFP_KERNEL);
+ if (!skb)
+ return;
+ nlh = nlmsg_hdr(skb);
+
+ frn = (struct fib_result_nl *) nlmsg_data(nlh);
+ nl_fib_lookup(net, frn);
+
+ portid = NETLINK_CB(skb).portid; /* netlink portid */
+ NETLINK_CB(skb).portid = 0; /* from kernel */
+ NETLINK_CB(skb).dst_group = 0; /* unicast */
+ netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
+}
+
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+ struct sock *sk;
+ struct netlink_kernel_cfg cfg = {
+ .input = nl_fib_input,
+ };
+
+ sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
+ if (!sk)
+ return -EAFNOSUPPORT;
+ net->ipv4.fibnl = sk;
+ return 0;
+}
+
+static void nl_fib_lookup_exit(struct net *net)
+{
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
+}
+
+static void fib_disable_ip(struct net_device *dev, int force)
+{
+ if (fib_sync_down_dev(dev, force))
+ fib_flush(dev_net(dev));
+ rt_cache_flush(dev_net(dev));
+ arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
+
+ switch (event) {
+ case NETDEV_UP:
+ fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(dev_net(dev));
+ break;
+ case NETDEV_DOWN:
+ fib_del_ifaddr(ifa, NULL);
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ if (!ifa->ifa_dev->ifa_list) {
+ /* Last address was deleted from this interface.
+ * Disable IP.
+ */
+ fib_disable_ip(dev, 1);
+ } else {
+ rt_cache_flush(dev_net(dev));
+ }
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_UNREGISTER) {
+ fib_disable_ip(dev, 2);
+ rt_flush_dev(dev);
+ return NOTIFY_DONE;
+ }
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ for_ifa(in_dev) {
+ fib_add_ifaddr(ifa);
+ } endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ fib_sync_up(dev);
+#endif
+ atomic_inc(&net->ipv4.dev_addr_genid);
+ rt_cache_flush(net);
+ break;
+ case NETDEV_DOWN:
+ fib_disable_ip(dev, 0);
+ break;
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ rt_cache_flush(net);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+ .notifier_call = fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+ .notifier_call = fib_netdev_event,
+};
+
+static int __net_init ip_fib_net_init(struct net *net)
+{
+ int err;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv4.fib_table_hash)
+ return -ENOMEM;
+
+ err = fib4_rules_init(net);
+ if (err < 0)
+ goto fail;
+ return 0;
+
+fail:
+ kfree(net->ipv4.fib_table_hash);
+ return err;
+}
+
+static void ip_fib_net_exit(struct net *net)
+{
+ unsigned int i;
+
+ rtnl_lock();
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+ RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+#endif
+ for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[i];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+ hlist_del(&tb->tb_hlist);
+ fib_table_flush(tb);
+ fib_free_table(tb);
+ }
+ }
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ fib4_rules_exit(net);
+#endif
+ rtnl_unlock();
+ kfree(net->ipv4.fib_table_hash);
+}
+
+static int __net_init fib_net_init(struct net *net)
+{
+ int error;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ net->ipv4.fib_num_tclassid_users = 0;
+#endif
+ error = ip_fib_net_init(net);
+ if (error < 0)
+ goto out;
+ error = nl_fib_lookup_init(net);
+ if (error < 0)
+ goto out_nlfl;
+ error = fib_proc_init(net);
+ if (error < 0)
+ goto out_proc;
+out:
+ return error;
+
+out_proc:
+ nl_fib_lookup_exit(net);
+out_nlfl:
+ ip_fib_net_exit(net);
+ goto out;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+ ip_fib_net_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+};
+
+void __init ip_fib_init(void)
+{
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+
+ register_pernet_subsys(&fib_net_ops);
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+ fib_trie_init();
+}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000..c6211ed60
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,52 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+ struct hlist_node fa_list;
+ struct fib_info *fa_info;
+ u8 fa_tos;
+ u8 fa_type;
+ u8 fa_state;
+ u8 fa_slen;
+ u32 tb_id;
+ struct rcu_head rcu;
+};
+
+#define FA_S_ACCESSED 0x01
+
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
+/* Exported by fib_semantics.c */
+void fib_release_info(struct fib_info *);
+struct fib_info *fib_create_info(struct fib_config *cfg);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ /* we used to play games with refcounts, but we now use RCU */
+ res->fi = fi;
+}
+
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000..56151982f
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,369 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Thomas Graf <tgraf@suug.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip_fib.h>
+#include <net/fib_rules.h>
+
+struct fib4_rule {
+ struct fib_rule common;
+ u8 dst_len;
+ u8 src_len;
+ u8 tos;
+ __be32 src;
+ __be32 srcmask;
+ __be32 dst;
+ __be32 dstmask;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ u32 tclassid;
+#endif
+};
+
+int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+{
+ struct fib_lookup_arg arg = {
+ .result = res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+ int err;
+
+ err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (arg.rule)
+ res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
+ else
+ res->tclassid = 0;
+#endif
+
+ if (err == -ESRCH)
+ err = -ENETUNREACH;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__fib_lookup);
+
+static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ int err = -EAGAIN;
+ struct fib_table *tbl;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (tbl)
+ err = fib_table_lookup(tbl, &flp->u.ip4,
+ (struct fib_result *)arg->result,
+ arg->flags);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+ struct fib_result *result = (struct fib_result *) arg->result;
+ struct net_device *dev = NULL;
+
+ if (result->fi)
+ dev = result->fi->fib_dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (result->prefixlen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ fib_info_put(result->fi);
+ return true;
+}
+
+static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ struct fib4_rule *r = (struct fib4_rule *) rule;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ __be32 daddr = fl4->daddr;
+ __be32 saddr = fl4->saddr;
+
+ if (((saddr ^ r->src) & r->srcmask) ||
+ ((daddr ^ r->dst) & r->dstmask))
+ return 0;
+
+ if (r->tos && (r->tos != fl4->flowi4_tos))
+ return 0;
+
+ return 1;
+}
+
+static struct fib_table *fib_empty_table(struct net *net)
+{
+ u32 id;
+
+ for (id = 1; id <= RT_TABLE_MAX; id++)
+ if (!fib_get_table(net, id))
+ return fib_new_table(net, id);
+ return NULL;
+}
+
+static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
+ FRA_GENERIC_POLICY,
+ [FRA_FLOW] = { .type = NLA_U32 },
+};
+
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct net *net = sock_net(skb->sk);
+ int err = -EINVAL;
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (frh->tos & ~IPTOS_TOS_MASK)
+ goto errout;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+ if (rule->table == RT_TABLE_UNSPEC) {
+ if (rule->action == FR_ACT_TO_TBL) {
+ struct fib_table *table;
+
+ table = fib_empty_table(net);
+ if (!table) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ rule->table = table->tb_id;
+ }
+ }
+
+ if (frh->src_len)
+ rule4->src = nla_get_in_addr(tb[FRA_SRC]);
+
+ if (frh->dst_len)
+ rule4->dst = nla_get_in_addr(tb[FRA_DST]);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW]) {
+ rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+ if (rule4->tclassid)
+ net->ipv4.fib_num_tclassid_users++;
+ }
+#endif
+
+ rule4->src_len = frh->src_len;
+ rule4->srcmask = inet_make_mask(rule4->src_len);
+ rule4->dst_len = frh->dst_len;
+ rule4->dstmask = inet_make_mask(rule4->dst_len);
+ rule4->tos = frh->tos;
+
+ net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int fib4_rule_delete(struct fib_rule *rule)
+{
+ struct net *net = rule->fr_net;
+ int err;
+
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (((struct fib4_rule *)rule)->tclassid)
+ net->ipv4.fib_num_tclassid_users--;
+#endif
+ net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+errout:
+ return err;
+}
+
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ if (frh->src_len && (rule4->src_len != frh->src_len))
+ return 0;
+
+ if (frh->dst_len && (rule4->dst_len != frh->dst_len))
+ return 0;
+
+ if (frh->tos && (rule4->tos != frh->tos))
+ return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
+ return 0;
+#endif
+
+ if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC])))
+ return 0;
+
+ if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST])))
+ return 0;
+
+ return 1;
+}
+
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ frh->dst_len = rule4->dst_len;
+ frh->src_len = rule4->src_len;
+ frh->tos = rule4->tos;
+
+ if ((rule4->dst_len &&
+ nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
+ (rule4->src_len &&
+ nla_put_in_addr(skb, FRA_SRC, rule4->src)))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid &&
+ nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
+ goto nla_put_failure;
+#endif
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+{
+ return nla_total_size(4) /* dst */
+ + nla_total_size(4) /* src */
+ + nla_total_size(4); /* flow */
+}
+
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ rt_cache_flush(ops->fro_net);
+}
+
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
+ .family = AF_INET,
+ .rule_size = sizeof(struct fib4_rule),
+ .addr_size = sizeof(u32),
+ .action = fib4_rule_action,
+ .suppress = fib4_rule_suppress,
+ .match = fib4_rule_match,
+ .configure = fib4_rule_configure,
+ .delete = fib4_rule_delete,
+ .compare = fib4_rule_compare,
+ .fill = fib4_rule_fill,
+ .default_pref = fib_default_rule_pref,
+ .nlmsg_payload = fib4_rule_nlmsg_payload,
+ .flush_cache = fib4_rule_flush_cache,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = fib4_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+ int err;
+
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+ if (err < 0)
+ return err;
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+int __net_init fib4_rules_init(struct net *net)
+{
+ int err;
+ struct fib_rules_ops *ops;
+
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ err = fib_default_rules_init(ops);
+ if (err < 0)
+ goto fail;
+ net->ipv4.rules_ops = ops;
+ net->ipv4.fib_has_custom_rules = false;
+ return 0;
+
+fail:
+ /* also cleans all rules already added */
+ fib_rules_unregister(ops);
+ return err;
+}
+
+void __net_exit fib4_rules_exit(struct net *net)
+{
+ fib_rules_unregister(net->ipv4.rules_ops);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000..8d695b665
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1328 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/netlink.h>
+#include <net/nexthop.h>
+
+#include "fib_lookup.h"
+
+static DEFINE_SPINLOCK(fib_info_lock);
+static struct hlist_head *fib_info_hash;
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_info_hash_size;
+static unsigned int fib_info_cnt;
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hope, that gcc will optimize it to get rid of dummy loop */
+
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
+
+
+const struct fib_prop fib_props[RTN_MAX + 1] = {
+ [RTN_UNSPEC] = {
+ .error = 0,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+ [RTN_UNICAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_LOCAL] = {
+ .error = 0,
+ .scope = RT_SCOPE_HOST,
+ },
+ [RTN_BROADCAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ },
+ [RTN_ANYCAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_LINK,
+ },
+ [RTN_MULTICAST] = {
+ .error = 0,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_BLACKHOLE] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_UNREACHABLE] = {
+ .error = -EHOSTUNREACH,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_PROHIBIT] = {
+ .error = -EACCES,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_THROW] = {
+ .error = -EAGAIN,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
+ [RTN_NAT] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+ [RTN_XRESOLVE] = {
+ .error = -EINVAL,
+ .scope = RT_SCOPE_NOWHERE,
+ },
+};
+
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_free(&rt->dst);
+}
+
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+ struct fnhe_hash_bucket *hash;
+ int i;
+
+ hash = rcu_dereference_protected(nh->nh_exceptions, 1);
+ if (!hash)
+ return;
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = rcu_dereference_protected(hash[i].chain, 1);
+ while (fnhe) {
+ struct fib_nh_exception *next;
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+ fnhe = next;
+ }
+ }
+ kfree(hash);
+}
+
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt)
+ dst_free(&rt->dst);
+ }
+ free_percpu(rtp);
+}
+
+/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ free_nh_exceptions(nexthop_nh);
+ rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+ rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+ } endfor_nexthops(fi);
+
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
+ kfree(fi);
+}
+
+void free_fib_info(struct fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ pr_warn("Freeing alive fib_info %p\n", fi);
+ return;
+ }
+ fib_info_cnt--;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users--;
+ } endfor_nexthops(fi);
+#endif
+ call_rcu(&fi->rcu, free_fib_info_rcu);
+}
+
+void fib_release_info(struct fib_info *fi)
+{
+ spin_lock_bh(&fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
+ hlist_del(&fi->fib_hash);
+ if (fi->fib_prefsrc)
+ hlist_del(&fi->fib_lhash);
+ change_nexthops(fi) {
+ if (!nexthop_nh->nh_dev)
+ continue;
+ hlist_del(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ fi->fib_dead = 1;
+ fib_info_put(fi);
+ }
+ spin_unlock_bh(&fib_info_lock);
+}
+
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+ const struct fib_nh *onh = ofi->fib_nh;
+
+ for_nexthops(fi) {
+ if (nh->nh_oif != onh->nh_oif ||
+ nh->nh_gw != onh->nh_gw ||
+ nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+ return -1;
+ onh++;
+ } endfor_nexthops(fi);
+ return 0;
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+ (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
+ unsigned int val = fi->fib_nhs;
+
+ val ^= (fi->fib_protocol << 8) | fi->fib_scope;
+ val ^= (__force u32)fi->fib_prefsrc;
+ val ^= fi->fib_priority;
+ for_nexthops(fi) {
+ val ^= fib_devindex_hashfn(nh->nh_oif);
+ } endfor_nexthops(fi)
+
+ return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+ struct hlist_head *head;
+ struct fib_info *fi;
+ unsigned int hash;
+
+ hash = fib_info_hashfn(nfi);
+ head = &fib_info_hash[hash];
+
+ hlist_for_each_entry(fi, head, fib_hash) {
+ if (!net_eq(fi->fib_net, nfi->fib_net))
+ continue;
+ if (fi->fib_nhs != nfi->fib_nhs)
+ continue;
+ if (nfi->fib_protocol == fi->fib_protocol &&
+ nfi->fib_scope == fi->fib_scope &&
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+ (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ return fi;
+ }
+
+ return NULL;
+}
+
+/* Check, that the gateway is already configured.
+ * Used only by redirect accept routine.
+ */
+int ip_fib_check_default(__be32 gw, struct net_device *dev)
+{
+ struct hlist_head *head;
+ struct fib_nh *nh;
+ unsigned int hash;
+
+ spin_lock(&fib_info_lock);
+
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, head, nh_hash) {
+ if (nh->nh_dev == dev &&
+ nh->nh_gw == gw &&
+ !(nh->nh_flags & RTNH_F_DEAD)) {
+ spin_unlock(&fib_info_lock);
+ return 0;
+ }
+ }
+
+ spin_unlock(&fib_info_lock);
+
+ return -1;
+}
+
+static inline size_t fib_nlmsg_size(struct fib_info *fi)
+{
+ size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_DST */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + nla_total_size(4) /* RTA_PREFSRC */
+ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+
+ /* space for nested metrics */
+ payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+ if (fi->fib_nhs) {
+ /* Also handles the special case fib_nhs == 1 */
+
+ /* each nexthop is packed in an attribute */
+ size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+
+ /* may contain flow and gateway attribute */
+ nhsize += 2 * nla_total_size(4);
+
+ /* all nexthops are packed in a nested attribute */
+ payload += nla_total_size(fi->fib_nhs * nhsize);
+ }
+
+ return payload;
+}
+
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+ int dst_len, u32 tb_id, const struct nl_info *info,
+ unsigned int nlm_flags)
+{
+ struct sk_buff *skb;
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = fib_dump_info(skb, info->portid, seq, event, tb_id,
+ fa->fa_type, key, dst_len,
+ fa->fa_tos, fa->fa_info, nlm_flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+}
+
+static int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort, int *last_idx,
+ int dflt)
+{
+ struct neighbour *n;
+ int state = NUD_NONE;
+
+ n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (n) {
+ state = n->nud_state;
+ neigh_release(n);
+ }
+ if (state == NUD_REACHABLE)
+ return 0;
+ if ((state & NUD_VALID) && order != dflt)
+ return 0;
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
+ *last_resort = fi;
+ *last_idx = order;
+ }
+ return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
+{
+ int nhs = 0;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ nhs++;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ /* leftover implies invalid nexthop configuration, discard it */
+ return remaining > 0 ? 0 : nhs;
+}
+
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+ int remaining, struct fib_config *cfg)
+{
+ change_nexthops(fi) {
+ int attrlen;
+
+ if (!rtnh_ok(rtnh, remaining))
+ return -EINVAL;
+
+ nexthop_nh->nh_flags =
+ (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
+ nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nla = nla_find(attrs, attrlen, RTA_FLOW);
+ nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ if (nexthop_nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
+#endif
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } endfor_nexthops(fi);
+
+ return 0;
+}
+
+#endif
+
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ struct rtnexthop *rtnh;
+ int remaining;
+#endif
+
+ if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
+ return 1;
+
+ if (cfg->fc_oif || cfg->fc_gw) {
+ if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
+ (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
+ return 0;
+ return 1;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (!cfg->fc_mp)
+ return 0;
+
+ rtnh = cfg->fc_mp;
+ remaining = cfg->fc_mp_len;
+
+ for_nexthops(fi) {
+ int attrlen;
+
+ if (!rtnh_ok(rtnh, remaining))
+ return -EINVAL;
+
+ if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+ return 1;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla && nla_get_in_addr(nla) != nh->nh_gw)
+ return 1;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nla = nla_find(attrs, attrlen, RTA_FLOW);
+ if (nla && nla_get_u32(nla) != nh->nh_tclassid)
+ return 1;
+#endif
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } endfor_nexthops(fi);
+#endif
+ return 0;
+}
+
+
+/*
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
+ */
+static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
+ struct fib_nh *nh)
+{
+ int err;
+ struct net *net;
+ struct net_device *dev;
+
+ net = cfg->fc_nlinfo.nl_net;
+ if (nh->nh_gw) {
+ struct fib_result res;
+
+ if (nh->nh_flags & RTNH_F_ONLINK) {
+
+ if (cfg->fc_scope >= RT_SCOPE_LINK)
+ return -EINVAL;
+ if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
+ return -EINVAL;
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
+ return -ENODEV;
+ if (!(dev->flags & IFF_UP))
+ return -ENETDOWN;
+ nh->nh_dev = dev;
+ dev_hold(dev);
+ nh->nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ rcu_read_lock();
+ {
+ struct flowi4 fl4 = {
+ .daddr = nh->nh_gw,
+ .flowi4_scope = cfg->fc_scope + 1,
+ .flowi4_oif = nh->nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+ err = fib_lookup(net, &fl4, &res);
+ if (err) {
+ rcu_read_unlock();
+ return err;
+ }
+ }
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+ goto out;
+ nh->nh_scope = res.scope;
+ nh->nh_oif = FIB_RES_OIF(res);
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
+ goto out;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+ } else {
+ struct in_device *in_dev;
+
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
+ return -EINVAL;
+
+ rcu_read_lock();
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->nh_oif);
+ if (!in_dev)
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
+ nh->nh_dev = in_dev->dev;
+ dev_hold(nh->nh_dev);
+ nh->nh_scope = RT_SCOPE_HOST;
+ err = 0;
+ }
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static inline unsigned int fib_laddr_hashfn(__be32 val)
+{
+ unsigned int mask = (fib_info_hash_size - 1);
+
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
+}
+
+static struct hlist_head *fib_info_hash_alloc(int bytes)
+{
+ if (bytes <= PAGE_SIZE)
+ return kzalloc(bytes, GFP_KERNEL);
+ else
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
+}
+
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
+{
+ if (!hash)
+ return;
+
+ if (bytes <= PAGE_SIZE)
+ kfree(hash);
+ else
+ free_pages((unsigned long) hash, get_order(bytes));
+}
+
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
+{
+ struct hlist_head *old_info_hash, *old_laddrhash;
+ unsigned int old_size = fib_info_hash_size;
+ unsigned int i, bytes;
+
+ spin_lock_bh(&fib_info_lock);
+ old_info_hash = fib_info_hash;
+ old_laddrhash = fib_info_laddrhash;
+ fib_info_hash_size = new_size;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &fib_info_hash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, head, fib_hash) {
+ struct hlist_head *dest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_hash);
+
+ new_hash = fib_info_hashfn(fi);
+ dest = &new_info_hash[new_hash];
+ hlist_add_head(&fi->fib_hash, dest);
+ }
+ }
+ fib_info_hash = new_info_hash;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &fib_info_laddrhash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
+ struct hlist_head *ldest;
+ unsigned int new_hash;
+
+ hlist_del(&fi->fib_lhash);
+
+ new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+ ldest = &new_laddrhash[new_hash];
+ hlist_add_head(&fi->fib_lhash, ldest);
+ }
+ }
+ fib_info_laddrhash = new_laddrhash;
+
+ spin_unlock_bh(&fib_info_lock);
+
+ bytes = old_size * sizeof(struct hlist_head *);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
+}
+
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+{
+ nh->nh_saddr = inet_select_addr(nh->nh_dev,
+ nh->nh_gw,
+ nh->nh_parent->fib_scope);
+ nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+
+ return nh->nh_saddr;
+}
+
+struct fib_info *fib_create_info(struct fib_config *cfg)
+{
+ int err;
+ struct fib_info *fi = NULL;
+ struct fib_info *ofi;
+ int nhs = 1;
+ struct net *net = cfg->fc_nlinfo.nl_net;
+
+ if (cfg->fc_type > RTN_MAX)
+ goto err_inval;
+
+ /* Fast check to catch the most weird cases */
+ if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
+ goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (cfg->fc_mp) {
+ nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
+ if (nhs == 0)
+ goto err_inval;
+ }
+#endif
+
+ err = -ENOBUFS;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
+ struct hlist_head *new_info_hash;
+ struct hlist_head *new_laddrhash;
+ unsigned int bytes;
+
+ if (!new_size)
+ new_size = 16;
+ bytes = new_size * sizeof(struct hlist_head *);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
+ if (!new_info_hash || !new_laddrhash) {
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
+ } else
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
+
+ if (!fib_info_hash_size)
+ goto failure;
+ }
+
+ fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+ if (!fi)
+ goto failure;
+ fib_info_cnt++;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
+
+ fi->fib_net = net;
+ fi->fib_protocol = cfg->fc_protocol;
+ fi->fib_scope = cfg->fc_scope;
+ fi->fib_flags = cfg->fc_flags;
+ fi->fib_priority = cfg->fc_priority;
+ fi->fib_prefsrc = cfg->fc_prefsrc;
+ fi->fib_type = cfg->fc_type;
+
+ fi->fib_nhs = nhs;
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+ if (!nexthop_nh->nh_pcpu_rth_output)
+ goto failure;
+ } endfor_nexthops(fi)
+
+ if (cfg->fc_mx) {
+ struct nlattr *nla;
+ int remaining;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+
+ if (type) {
+ u32 val;
+
+ if (type > RTAX_MAX)
+ goto err_inval;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err_inval;
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ fi->fib_metrics[type - 1] = val;
+ }
+ }
+ }
+
+ if (cfg->fc_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
+ if (err != 0)
+ goto failure;
+ if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
+ goto err_inval;
+ if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
+ goto err_inval;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
+ goto err_inval;
+#endif
+#else
+ goto err_inval;
+#endif
+ } else {
+ struct fib_nh *nh = fi->fib_nh;
+
+ nh->nh_oif = cfg->fc_oif;
+ nh->nh_gw = cfg->fc_gw;
+ nh->nh_flags = cfg->fc_flags;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ fi->fib_net->ipv4.fib_num_tclassid_users++;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->nh_weight = 1;
+#endif
+ }
+
+ if (fib_props[cfg->fc_type].error) {
+ if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
+ goto err_inval;
+ goto link_it;
+ } else {
+ switch (cfg->fc_type) {
+ case RTN_UNICAST:
+ case RTN_LOCAL:
+ case RTN_BROADCAST:
+ case RTN_ANYCAST:
+ case RTN_MULTICAST:
+ break;
+ default:
+ goto err_inval;
+ }
+ }
+
+ if (cfg->fc_scope > RT_SCOPE_HOST)
+ goto err_inval;
+
+ if (cfg->fc_scope == RT_SCOPE_HOST) {
+ struct fib_nh *nh = fi->fib_nh;
+
+ /* Local address is added. */
+ if (nhs != 1 || nh->nh_gw)
+ goto err_inval;
+ nh->nh_scope = RT_SCOPE_NOWHERE;
+ nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
+ err = -ENODEV;
+ if (!nh->nh_dev)
+ goto failure;
+ } else {
+ change_nexthops(fi) {
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
+ goto failure;
+ } endfor_nexthops(fi)
+ }
+
+ if (fi->fib_prefsrc) {
+ if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+ fi->fib_prefsrc != cfg->fc_dst)
+ if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
+ goto err_inval;
+ }
+
+ change_nexthops(fi) {
+ fib_info_update_nh_saddr(net, nexthop_nh);
+ } endfor_nexthops(fi)
+
+link_it:
+ ofi = fib_find_info(fi);
+ if (ofi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ ofi->fib_treeref++;
+ return ofi;
+ }
+
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ spin_lock_bh(&fib_info_lock);
+ hlist_add_head(&fi->fib_hash,
+ &fib_info_hash[fib_info_hashfn(fi)]);
+ if (fi->fib_prefsrc) {
+ struct hlist_head *head;
+
+ head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ hlist_add_head(&fi->fib_lhash, head);
+ }
+ change_nexthops(fi) {
+ struct hlist_head *head;
+ unsigned int hash;
+
+ if (!nexthop_nh->nh_dev)
+ continue;
+ hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ spin_unlock_bh(&fib_info_lock);
+ return fi;
+
+err_inval:
+ err = -EINVAL;
+
+failure:
+ if (fi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ }
+
+ return ERR_PTR(err);
+}
+
+int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
+ struct fib_info *fi, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_INET;
+ rtm->rtm_dst_len = dst_len;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = tos;
+ if (tb_id < 256)
+ rtm->rtm_table = tb_id;
+ else
+ rtm->rtm_table = RT_TABLE_COMPAT;
+ if (nla_put_u32(skb, RTA_TABLE, tb_id))
+ goto nla_put_failure;
+ rtm->rtm_type = type;
+ rtm->rtm_flags = fi->fib_flags;
+ rtm->rtm_scope = fi->fib_scope;
+ rtm->rtm_protocol = fi->fib_protocol;
+
+ if (rtm->rtm_dst_len &&
+ nla_put_in_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (fi->fib_priority &&
+ nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
+ goto nla_put_failure;
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto nla_put_failure;
+
+ if (fi->fib_prefsrc &&
+ nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
+ goto nla_put_failure;
+ if (fi->fib_nhs == 1) {
+ if (fi->fib_nh->nh_gw &&
+ nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
+ goto nla_put_failure;
+ if (fi->fib_nh->nh_oif &&
+ nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fi->fib_nh[0].nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
+ goto nla_put_failure;
+#endif
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fi->fib_nhs > 1) {
+ struct rtnexthop *rtnh;
+ struct nlattr *mp;
+
+ mp = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ for_nexthops(fi) {
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+ rtnh->rtnh_hops = nh->nh_weight - 1;
+ rtnh->rtnh_ifindex = nh->nh_oif;
+
+ if (nh->nh_gw &&
+ nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nh->nh_tclassid &&
+ nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
+ goto nla_put_failure;
+#endif
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
+ } endfor_nexthops(fi);
+
+ nla_nest_end(skb, mp);
+ }
+#endif
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/*
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
+ */
+int fib_sync_down_addr(struct net *net, __be32 local)
+{
+ int ret = 0;
+ unsigned int hash = fib_laddr_hashfn(local);
+ struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct fib_info *fi;
+
+ if (!fib_info_laddrhash || local == 0)
+ return 0;
+
+ hlist_for_each_entry(fi, head, fib_lhash) {
+ if (!net_eq(fi->fib_net, net))
+ continue;
+ if (fi->fib_prefsrc == local) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+ return ret;
+}
+
+int fib_sync_down_dev(struct net_device *dev, int force)
+{
+ int ret = 0;
+ int scope = RT_SCOPE_NOWHERE;
+ struct fib_info *prev_fi = NULL;
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct fib_nh *nh;
+
+ if (force)
+ scope = -1;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int dead;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+ prev_fi = fi;
+ dead = 0;
+ change_nexthops(fi) {
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+ dead++;
+ else if (nexthop_nh->nh_dev == dev &&
+ nexthop_nh->nh_scope != scope) {
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ spin_lock_bh(&fib_multipath_lock);
+ fi->fib_power -= nexthop_nh->nh_power;
+ nexthop_nh->nh_power = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+#endif
+ dead++;
+ }
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (force > 1 && nexthop_nh->nh_dev == dev) {
+ dead = fi->fib_nhs;
+ break;
+ }
+#endif
+ } endfor_nexthops(fi)
+ if (dead == fi->fib_nhs) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct hlist_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (next_fi->fib_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (!fi) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || !fi) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
+ */
+int fib_sync_up(struct net_device *dev)
+{
+ struct fib_info *prev_fi;
+ unsigned int hash;
+ struct hlist_head *head;
+ struct fib_nh *nh;
+ int ret;
+
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ prev_fi = NULL;
+ hash = fib_devindex_hashfn(dev->ifindex);
+ head = &fib_info_devhash[hash];
+ ret = 0;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ struct fib_info *fi = nh->nh_parent;
+ int alive;
+
+ BUG_ON(!fi->fib_nhs);
+ if (nh->nh_dev != dev || fi == prev_fi)
+ continue;
+
+ prev_fi = fi;
+ alive = 0;
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ alive++;
+ continue;
+ }
+ if (!nexthop_nh->nh_dev ||
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
+ continue;
+ if (nexthop_nh->nh_dev != dev ||
+ !__in_dev_get_rtnl(dev))
+ continue;
+ alive++;
+ spin_lock_bh(&fib_multipath_lock);
+ nexthop_nh->nh_power = 0;
+ nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
+ spin_unlock_bh(&fib_multipath_lock);
+ } endfor_nexthops(fi)
+
+ if (alive > 0) {
+ fi->fib_flags &= ~RTNH_F_DEAD;
+ ret++;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
+ */
+void fib_select_multipath(struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+ int w;
+
+ spin_lock_bh(&fib_multipath_lock);
+ if (fi->fib_power <= 0) {
+ int power = 0;
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+ power += nexthop_nh->nh_weight;
+ nexthop_nh->nh_power = nexthop_nh->nh_weight;
+ }
+ } endfor_nexthops(fi);
+ fi->fib_power = power;
+ if (power <= 0) {
+ spin_unlock_bh(&fib_multipath_lock);
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ return;
+ }
+ }
+
+
+ /* w should be random number [0..fi->fib_power-1],
+ * it is pretty bad approximation.
+ */
+
+ w = jiffies % fi->fib_power;
+
+ change_nexthops(fi) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
+ nexthop_nh->nh_power) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
+ nexthop_nh->nh_power--;
+ fi->fib_power--;
+ res->nh_sel = nhsel;
+ spin_unlock_bh(&fib_multipath_lock);
+ return;
+ }
+ }
+ } endfor_nexthops(fi);
+
+ /* Race condition: route has just become dead. */
+ res->nh_sel = 0;
+ spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 000000000..09b62e17d
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2659 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ * & Swedish University of Agricultural Sciences.
+ *
+ * Jens Laas <jens.laas@data.slu.se> Swedish University of
+ * Agricultural Sciences.
+ *
+ * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally described in:
+ *
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
+ *
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ *
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <davem@davemloft.net>
+ * Stephen Hemminger <shemminger@osdl.org>
+ * Paul E. McKenney <paulmck@us.ibm.com>
+ * Patrick McHardy <kaber@trash.net>
+ */
+
+#define VERSION "0.409"
+
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/switchdev.h>
+#include "fib_lookup.h"
+
+#define MAX_STAT_DEPTH 32
+
+#define KEYLENGTH (8*sizeof(t_key))
+#define KEY_MAX ((t_key)~0)
+
+typedef unsigned int t_key;
+
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
+
+struct key_vector {
+ t_key key;
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned char slen;
+ union {
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ struct key_vector __rcu *tnode[0];
+ };
+};
+
+struct tnode {
+ struct rcu_head rcu;
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
+};
+
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+struct trie_use_stats {
+ unsigned int gets;
+ unsigned int backtrack;
+ unsigned int semantic_match_passed;
+ unsigned int semantic_match_miss;
+ unsigned int null_node_hit;
+ unsigned int resize_node_skipped;
+};
+#endif
+
+struct trie_stat {
+ unsigned int totdepth;
+ unsigned int maxdepth;
+ unsigned int tnodes;
+ unsigned int leaves;
+ unsigned int nullpointers;
+ unsigned int prefixes;
+ unsigned int nodesizes[MAX_STAT_DEPTH];
+};
+
+struct trie {
+ struct key_vector kv[1];
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats;
+#endif
+};
+
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
+static size_t tnode_free_size;
+
+/*
+ * synchronize_rcu after call_rcu for that many pages; it should be especially
+ * useful before resizing the root node with PREEMPT_NONE configs; the value was
+ * obtained experimentally, aiming to avoid visible slowdown.
+ */
+static const int sync_pages = 128;
+
+static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
+
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+ return container_of(kv, struct tnode, kv[0]);
+}
+
+/* caller must hold RTNL */
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
+
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
+
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
+{
+ if (n)
+ rcu_assign_pointer(tn_info(n)->parent, tp);
+}
+
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
+
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
+ */
+static inline unsigned long child_length(const struct key_vector *tn)
+{
+ return (1ul << tn->bits) & ~(1ul);
+}
+
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
+
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
+{
+ unsigned long index = key ^ kv->key;
+
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
+
+ return index >> kv->pos;
+}
+
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * at this point.
+ */
+
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 30;
+
+static void __alias_free_mem(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+#define TNODE_KMALLOC_MAX \
+ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+
+static void __node_free_rcu(struct rcu_head *head)
+{
+ struct tnode *n = container_of(head, struct tnode, rcu);
+
+ if (!n->tn_bits)
+ kmem_cache_free(trie_leaf_kmem, n);
+ else if (n->tn_bits <= TNODE_KMALLOC_MAX)
+ kfree(n);
+ else
+ vfree(n);
+}
+
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
+
+static struct tnode *tnode_alloc(int bits)
+{
+ size_t size;
+
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
+
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, GFP_KERNEL);
+ else
+ return vzalloc(size);
+}
+
+static inline void empty_child_inc(struct key_vector *n)
+{
+ ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+}
+
+static inline void empty_child_dec(struct key_vector *n)
+{
+ tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+}
+
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
+{
+ struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct key_vector *l = kv->kv;
+
+ if (!kv)
+ return NULL;
+
+ /* initialize key vector */
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
+
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
+
+ return l;
+}
+
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
+{
+ struct tnode *tnode = tnode_alloc(bits);
+ unsigned int shift = pos + bits;
+ struct key_vector *tn = tnode->kv;
+
+ /* verify bits and pos their msb bits clear and values are valid */
+ BUG_ON(!bits || (shift > KEYLENGTH));
+
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
+ if (!tnode)
+ return NULL;
+
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
+
+ return tn;
+}
+
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
+ * and no bits are skipped. See discussion in dyntree paper p. 6
+ */
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
+{
+ return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
+}
+
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
+{
+ struct key_vector *chi = get_child(tn, i);
+ int isfull, wasfull;
+
+ BUG_ON(i >= child_length(tn));
+
+ /* update emptyChildren, overflow into fullChildren */
+ if (!n && chi)
+ empty_child_inc(tn);
+ if (n && !chi)
+ empty_child_dec(tn);
+
+ /* update fullChildren */
+ wasfull = tnode_full(tn, chi);
+ isfull = tnode_full(tn, n);
+
+ if (wasfull && !isfull)
+ tn_info(tn)->full_children--;
+ else if (!wasfull && isfull)
+ tn_info(tn)->full_children++;
+
+ if (n && (tn->slen < n->slen))
+ tn->slen = n->slen;
+
+ rcu_assign_pointer(tn->tnode[i], n);
+}
+
+static void update_children(struct key_vector *tn)
+{
+ unsigned long i;
+
+ /* update all of the child parent pointers */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
+
+ if (!inode)
+ continue;
+
+ /* Either update the children of a tnode that
+ * already belongs to us or update the child
+ * to point to ourselves.
+ */
+ if (node_parent(inode) == tn)
+ update_children(inode);
+ else
+ node_set_parent(inode, tn);
+ }
+}
+
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
+{
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
+ else
+ put_child(tp, get_index(key, tp), n);
+}
+
+static inline void tnode_free_init(struct key_vector *tn)
+{
+ tn_info(tn)->rcu.next = NULL;
+}
+
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
+{
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
+}
+
+static void tnode_free(struct key_vector *tn)
+{
+ struct callback_head *head = &tn_info(tn)->rcu;
+
+ while (head) {
+ head = head->next;
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
+ node_free(tn);
+
+ tn = container_of(head, struct tnode, rcu)->kv;
+ }
+
+ if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+ tnode_free_size = 0;
+ synchronize_rcu();
+ }
+}
+
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
+{
+ struct key_vector *tp = node_parent(oldtnode);
+ unsigned long i;
+
+ /* setup the parent pointer out of and back into this node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, tn->key, tn);
+
+ /* update all of the child parent pointers */
+ update_children(tn);
+
+ /* all pointers should be clean so we are done */
+ tnode_free(oldtnode);
+
+ /* resize children now that oldtnode is freed */
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
+
+ /* resize child node */
+ if (tnode_full(tn, inode))
+ tn = resize(t, inode);
+ }
+
+ return tp;
+}
+
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *tn;
+ unsigned long i;
+ t_key m;
+
+ pr_debug("In inflate\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
+ if (!tn)
+ goto notnode;
+
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
+ unsigned long j, k;
+
+ /* An empty child */
+ if (!inode)
+ continue;
+
+ /* A leaf or an internal node with skipped bits */
+ if (!tnode_full(oldtnode, inode)) {
+ put_child(tn, get_index(inode->key, tn), inode);
+ continue;
+ }
+
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
+
+ /* An internal node with two children */
+ if (inode->bits == 1) {
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
+ continue;
+ }
+
+ /* We will replace this node 'inode' with two new
+ * ones, 'node0' and 'node1', each with half of the
+ * original children. The two new nodes will have
+ * a position one bit further down the key and this
+ * means that the "significant" part of their keys
+ * (see the discussion near the top of this file)
+ * will differ by one bit, which will be "0" in
+ * node0's key and "1" in node1's key. Since we are
+ * moving the key position by one step, the bit that
+ * we are moving away from - the bit at position
+ * (tn->pos) - is the one that will differ between
+ * node0 and node1. So... we synthesize that bit in the
+ * two new keys.
+ */
+ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+ if (!node1)
+ goto nomem;
+ node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
+
+ tnode_free_append(tn, node1);
+ if (!node0)
+ goto nomem;
+ tnode_free_append(tn, node0);
+
+ /* populate child pointers in new nodes */
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ }
+
+ /* link new nodes to parent */
+ NODE_INIT_PARENT(node1, tn);
+ NODE_INIT_PARENT(node0, tn);
+
+ /* link parent to nodes */
+ put_child(tn, 2 * i + 1, node1);
+ put_child(tn, 2 * i, node0);
+ }
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
+
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *tn;
+ unsigned long i;
+
+ pr_debug("In halve\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
+ if (!tn)
+ goto notnode;
+
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
+ */
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
+
+ /* At least one of the children is empty */
+ if (!node1 || !node0) {
+ put_child(tn, i / 2, node1 ? : node0);
+ continue;
+ }
+
+ /* Two nonempty children */
+ inode = tnode_new(node0->key, oldtnode->pos, 1);
+ if (!inode)
+ goto nomem;
+ tnode_free_append(tn, inode);
+
+ /* initialize pointers out of node */
+ put_child(inode, 1, node1);
+ put_child(inode, 0, node0);
+ NODE_INIT_PARENT(inode, tn);
+
+ /* link parent to node */
+ put_child(tn, i / 2, inode);
+ }
+
+ /* setup the parent pointers into and out of this node */
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
+}
+
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
+{
+ struct key_vector *n, *tp;
+ unsigned long i;
+
+ /* scan the tnode looking for that one child that might still exist */
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
+
+ /* compress one level */
+ tp = node_parent(oldtnode);
+ put_child_root(tp, oldtnode->key, n);
+ node_set_parent(n, tp);
+
+ /* drop dead node */
+ node_free(oldtnode);
+
+ return tp;
+}
+
+static unsigned char update_suffix(struct key_vector *tn)
+{
+ unsigned char slen = tn->pos;
+ unsigned long stride, i;
+
+ /* search though the list of children looking for nodes that might
+ * have a suffix greater than the one we currently have. This is
+ * why we start with a stride of 2 since a stride of 1 would
+ * represent the nodes with suffix length equal to tn->pos
+ */
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
+
+ if (!n || (n->slen <= slen))
+ continue;
+
+ /* update stride and slen based on new value */
+ stride <<= (n->slen - slen);
+ slen = n->slen;
+ i &= ~(stride - 1);
+
+ /* if slen covers all but the last bit we can stop here
+ * there will be nothing longer than that since only node
+ * 0 and 1 << (bits - 1) could have that as their suffix
+ * length.
+ */
+ if ((slen + 1) >= (tn->pos + tn->bits))
+ break;
+ }
+
+ tn->slen = slen;
+
+ return slen;
+}
+
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * child_length(tn)
+ *
+ */
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
+
+ /* if bits == KEYLENGTH then pos = 0, and will fail below */
+
+ return (used > 1) && tn->pos && ((50 * used) >= threshold);
+}
+
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
+
+ /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
+
+ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+}
+
+static inline bool should_collapse(struct key_vector *tn)
+{
+ unsigned long used = child_length(tn);
+
+ used -= tn_info(tn)->empty_children;
+
+ /* account for bits == KEYLENGTH case */
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
+ used -= KEY_MAX;
+
+ /* One child or none, time to drop us from the trie */
+ return used < 2;
+}
+
+#define MAX_WORK 10
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
+{
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
+ int max_work = MAX_WORK;
+
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
+
+ /* track the tnode via the pointer from the parent instead of
+ * doing it ourselves. This way we can let RCU fully do its
+ * thing without us interfering
+ */
+ BUG_ON(tn != get_child(tp, cindex));
+
+ /* Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
+ */
+ while (should_inflate(tp, tn) && max_work) {
+ tp = inflate(t, tn);
+ if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->resize_node_skipped);
+#endif
+ break;
+ }
+
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
+
+ /* update parent in case inflate failed */
+ tp = node_parent(tn);
+
+ /* Return if at least one inflate is run */
+ if (max_work != MAX_WORK)
+ return tp;
+
+ /* Halve as long as the number of empty children in this
+ * node is above threshold.
+ */
+ while (should_halve(tp, tn) && max_work) {
+ tp = halve(t, tn);
+ if (!tp) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->resize_node_skipped);
+#endif
+ break;
+ }
+
+ max_work--;
+ tn = get_child(tp, cindex);
+ }
+
+ /* Only one child remains */
+ if (should_collapse(tn))
+ return collapse(t, tn);
+
+ /* update parent in case halve failed */
+ tp = node_parent(tn);
+
+ /* Return if at least one deflate was run */
+ if (max_work != MAX_WORK)
+ return tp;
+
+ /* push the suffix length to the parent node */
+ if (tn->slen > tn->pos) {
+ unsigned char slen = update_suffix(tn);
+
+ if (slen > tp->slen)
+ tp->slen = slen;
+ }
+
+ return tp;
+}
+
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
+{
+ while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ if (update_suffix(tp) > l->slen)
+ break;
+ tp = node_parent(tp);
+ }
+}
+
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
+{
+ /* if this is a new leaf then tn will be NULL and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ while (tn->slen < l->slen) {
+ tn->slen = l->slen;
+ tn = node_parent(tn);
+ }
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
+{
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
+
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
+
+ if (!n)
+ break;
+
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the bits in the cindex. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
+ */
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
+ break;
+ }
+
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
+
+ *tp = pn;
+
+ return n;
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ u8 tos, u32 prio, u32 tb_id)
+{
+ struct fib_alias *fa;
+
+ if (!fah)
+ return NULL;
+
+ hlist_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
+ break;
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
+ if (fa->fa_tos > tos)
+ continue;
+ if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ return fa;
+ }
+
+ return NULL;
+}
+
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
+{
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
+}
+
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
+{
+ struct key_vector *n, *l;
+
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
+
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
+
+ /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+ *
+ * Add a new tnode here
+ * first tnode need some special handling
+ * leaves us in position for handling as case 3
+ */
+ if (n) {
+ struct key_vector *tn;
+
+ tn = tnode_new(key, __fls(key ^ n->key), 1);
+ if (!tn)
+ goto notnode;
+
+ /* initialize routes out of node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child(tn, get_index(key, tn) ^ 1, n);
+
+ /* start adding routes into the node */
+ put_child_root(tp, key, tn);
+ node_set_parent(n, tn);
+
+ /* parent now has a NULL spot where the leaf can go */
+ tp = tn;
+ }
+
+ /* Case 3: n is NULL, and will just insert a new leaf */
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
+ } else {
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
+ }
+
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
+ }
+
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ leaf_push_suffix(tp, l);
+ }
+
+ return 0;
+}
+
+/* Caller must hold RTNL. */
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct fib_alias *fa, *new_fa;
+ struct key_vector *l, *tp;
+ struct fib_info *fi;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
+ int err;
+
+ if (plen > KEYLENGTH)
+ return -EINVAL;
+
+ key = ntohl(cfg->fc_dst);
+
+ pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
+ if ((plen < KEYLENGTH) && (key << plen))
+ return -EINVAL;
+
+ fi = fib_create_info(cfg);
+ if (IS_ERR(fi)) {
+ err = PTR_ERR(fi);
+ goto err;
+ }
+
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ tb->tb_id) : NULL;
+
+ /* Now fa, if non-NULL, points to the first fib alias
+ * with the same keys [prefix,tos,priority], if such key already
+ * exists or to the node before which we will insert new one.
+ *
+ * If fa is NULL, we will need to allocate a new one and
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
+ */
+
+ if (fa && fa->fa_tos == tos &&
+ fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_first, *fa_match;
+
+ err = -EEXIST;
+ if (cfg->fc_nlflags & NLM_F_EXCL)
+ goto out;
+
+ /* We have 2 goals:
+ * 1. Find exact match for type, scope, fib_info to avoid
+ * duplicate routes
+ * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+ */
+ fa_match = NULL;
+ fa_first = fa;
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == cfg->fc_type &&
+ fa->fa_info == fi) {
+ fa_match = fa;
+ break;
+ }
+ }
+
+ if (cfg->fc_nlflags & NLM_F_REPLACE) {
+ struct fib_info *fi_drop;
+ u8 state;
+
+ fa = fa_first;
+ if (fa_match) {
+ if (fa == fa_match)
+ err = 0;
+ goto out;
+ }
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ fi_drop = fa->fa_info;
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = cfg->fc_type;
+ state = fa->fa_state;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+ new_fa->tb_id = tb->tb_id;
+
+ err = netdev_switch_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ alias_free_mem_rcu(fa);
+
+ fib_release_info(fi_drop);
+ if (state & FA_S_ACCESSED)
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+ tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+
+ goto succeeded;
+ }
+ /* Error if we find a perfect match which
+ * uses the same scope, type, and nexthop
+ * information.
+ */
+ if (fa_match)
+ goto out;
+
+ if (!(cfg->fc_nlflags & NLM_F_APPEND))
+ fa = fa_first;
+ }
+ err = -ENOENT;
+ if (!(cfg->fc_nlflags & NLM_F_CREATE))
+ goto out;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ new_fa->fa_info = fi;
+ new_fa->fa_tos = tos;
+ new_fa->fa_type = cfg->fc_type;
+ new_fa->fa_state = 0;
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+
+ /* (Optionally) offload fib entry to switch hardware. */
+ err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ goto out_free_new_fa;
+ }
+
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_sw_fib_del;
+
+ if (!plen)
+ tb->tb_num_default++;
+
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
+ &cfg->fc_nlinfo, 0);
+succeeded:
+ return 0;
+
+out_sw_fib_del:
+ netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
+out_free_new_fa:
+ kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+ fib_release_info(fi);
+err:
+ return err;
+}
+
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
+{
+ t_key prefix = n->key;
+
+ return (key ^ prefix) & (prefix | -prefix);
+}
+
+/* should be called with rcu_read_lock */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ const t_key key = ntohl(flp->daddr);
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
+ t_key cindex;
+
+ pn = t->kv;
+ cindex = 0;
+
+ n = get_child_rcu(pn, cindex);
+ if (!n)
+ return -EAGAIN;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->gets);
+#endif
+
+ /* Step 1: Travel to the longest prefix match in the trie */
+ for (;;) {
+ index = get_cindex(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if (index >= (1ul << bits))
+ * we have a mismatch in skip bits and failed
+ * else
+ * we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
+ */
+ if (index >= (1ul << n->bits))
+ break;
+
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* only record pn and cindex if we are going to be chopping
+ * bits later. Otherwise we are just wasting cycles.
+ */
+ if (n->slen > n->pos) {
+ pn = n;
+ cindex = index;
+ }
+
+ n = get_child_rcu(n, index);
+ if (unlikely(!n))
+ goto backtrace;
+ }
+
+ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
+ for (;;) {
+ /* record the pointer where our next node pointer is stored */
+ struct key_vector __rcu **cptr = n->tnode;
+
+ /* This test verifies that none of the bits that differ
+ * between the key and the prefix exist in the region of
+ * the lsb and higher in the prefix.
+ */
+ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
+ goto backtrace;
+
+ /* exit out and process leaf */
+ if (unlikely(IS_LEAF(n)))
+ break;
+
+ /* Don't bother recording parent info. Since we are in
+ * prefix match mode we will have to come back to wherever
+ * we started this traversal anyway
+ */
+
+ while ((n = rcu_dereference(*cptr)) == NULL) {
+backtrace:
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ if (!n)
+ this_cpu_inc(stats->null_node_hit);
+#endif
+ /* If we are at cindex 0 there are no more bits for
+ * us to strip at this level so we must ascend back
+ * up one level to see if there are any more bits to
+ * be stripped there.
+ */
+ while (!cindex) {
+ t_key pkey = pn->key;
+
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn))
+ return -EAGAIN;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->backtrack);
+#endif
+ /* Get Child's index */
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn);
+ }
+
+ /* strip the least significant bit from the cindex */
+ cindex &= cindex - 1;
+
+ /* grab pointer for next child node */
+ cptr = &pn->tnode[cindex];
+ }
+ }
+
+found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
+
+ /* Step 3: Process the leaf, if that fails fall back to backtracing */
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+
+ if ((index >= (1ul << fa->fa_slen)) &&
+ ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
+ continue;
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ }
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_miss);
+#endif
+ goto backtrace;
+}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
+
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
+{
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
+
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
+
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->leaf)) {
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
+ trie_rebalance(t, tp);
+ return;
+ }
+
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
+
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ leaf_pull_suffix(tp, l);
+}
+
+/* Caller must hold RTNL. */
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct fib_alias *fa, *fa_to_delete;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
+
+ if (plen > KEYLENGTH)
+ return -EINVAL;
+
+ key = ntohl(cfg->fc_dst);
+
+ if ((plen < KEYLENGTH) && (key << plen))
+ return -EINVAL;
+
+ l = fib_find_node(t, &tp, key);
+ if (!l)
+ return -ESRCH;
+
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+ if (!fa)
+ return -ESRCH;
+
+ pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+
+ fa_to_delete = NULL;
+ hlist_for_each_entry_from(fa, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
+ break;
+
+ if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+ (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+ fa->fa_info->fib_scope == cfg->fc_scope) &&
+ (!cfg->fc_prefsrc ||
+ fi->fib_prefsrc == cfg->fc_prefsrc) &&
+ (!cfg->fc_protocol ||
+ fi->fib_protocol == cfg->fc_protocol) &&
+ fib_nh_match(cfg, fi) == 0) {
+ fa_to_delete = fa;
+ break;
+ }
+ }
+
+ if (!fa_to_delete)
+ return -ESRCH;
+
+ netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
+
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
+
+ if (!plen)
+ tb->tb_num_default--;
+
+ fib_remove_alias(t, tp, l, fa_to_delete);
+
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
+ return 0;
+}
+
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
+{
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
+
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = key ? get_index(key, pn) : 0;
+
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
+ }
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
+}
+
+static void fib_trie_free(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
+
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ }
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
+}
+
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
+{
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
+
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
+ continue;
+
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ memcpy(new_fa, fa, sizeof(*fa));
+
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key))
+ goto out;
+ }
+
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
+ }
+
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
+
+ return NULL;
+}
+
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+
+ /* record local slen */
+ slen = fa->fa_slen;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
+ continue;
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+}
+
+/* Caller must hold RTNL. */
+int fib_table_flush(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
+ int found = 0;
+
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+
+ pr_debug("trie_flush found=%d\n", found);
+ return found;
+}
+
+static void __trie_free_rcu(struct rcu_head *head)
+{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie *t = (struct trie *)tb->tb_data;
+
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+ kfree(tb);
+}
+
+void fib_free_table(struct fib_table *tb)
+{
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ __be32 xkey = htonl(l->key);
+ struct fib_alias *fa;
+ int i, s_i;
+
+ s_i = cb->args[4];
+ i = 0;
+
+ /* rcu_read_lock is hold by caller */
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (i < s_i) {
+ i++;
+ continue;
+ }
+
+ if (tb->tb_id != fa->tb_id) {
+ i++;
+ continue;
+ }
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ xkey,
+ KEYLENGTH - fa->fa_slen,
+ fa->fa_tos,
+ fa->fa_info, NLM_F_MULTI) < 0) {
+ cb->args[4] = i;
+ return -1;
+ }
+ i++;
+ }
+
+ cb->args[4] = i;
+ return skb->len;
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ /* Dump starting at last key.
+ * Note: 0.0.0.0/0 (ie default) is first key.
+ */
+ int count = cb->args[2];
+ t_key key = cb->args[3];
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ cb->args[3] = key;
+ cb->args[2] = count;
+ return -1;
+ }
+
+ ++count;
+ key = l->key + 1;
+
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
+ }
+
+ cb->args[3] = key;
+ cb->args[2] = count;
+
+ return skb->len;
+}
+
+void __init fib_trie_init(void)
+{
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_PANIC, NULL);
+
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ LEAF_SIZE,
+ 0, SLAB_PANIC, NULL);
+}
+
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
+{
+ struct fib_table *tb;
+ struct trie *t;
+ size_t sz = sizeof(*tb);
+
+ if (!alias)
+ sz += sizeof(struct trie);
+
+ tb = kzalloc(sz, GFP_KERNEL);
+ if (!tb)
+ return NULL;
+
+ tb->tb_id = id;
+ tb->tb_default = -1;
+ tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
+
+ if (alias)
+ return tb;
+
+ t = (struct trie *) tb->tb_data;
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats = alloc_percpu(struct trie_use_stats);
+ if (!t->stats) {
+ kfree(tb);
+ tb = NULL;
+ }
+#endif
+
+ return tb;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator */
+struct fib_trie_iter {
+ struct seq_net_private p;
+ struct fib_table *tb;
+ struct key_vector *tnode;
+ unsigned int index;
+ unsigned int depth;
+};
+
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
+{
+ unsigned long cindex = iter->index;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
+
+ pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+ iter->tnode, iter->index, iter->depth);
+
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
+ if (IS_LEAF(n)) {
+ iter->tnode = pn;
+ iter->index = cindex;
+ } else {
+ /* push down one level */
+ iter->tnode = n;
+ iter->index = 0;
+ ++iter->depth;
+ }
+
+ return n;
+ }
+
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ --iter->depth;
+ }
+
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
+ return NULL;
+}
+
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
+{
+ struct key_vector *n, *pn = t->kv;
+
+ if (!t)
+ return NULL;
+
+ n = rcu_dereference(pn->tnode[0]);
+ if (!n)
+ return NULL;
+
+ if (IS_TNODE(n)) {
+ iter->tnode = n;
+ iter->index = 0;
+ iter->depth = 1;
+ } else {
+ iter->tnode = pn;
+ iter->index = 0;
+ iter->depth = 0;
+ }
+
+ return n;
+}
+
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
+{
+ struct key_vector *n;
+ struct fib_trie_iter iter;
+
+ memset(s, 0, sizeof(*s));
+
+ rcu_read_lock();
+ for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
+ if (IS_LEAF(n)) {
+ struct fib_alias *fa;
+
+ s->leaves++;
+ s->totdepth += iter.depth;
+ if (iter.depth > s->maxdepth)
+ s->maxdepth = iter.depth;
+
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
+ ++s->prefixes;
+ } else {
+ s->tnodes++;
+ if (n->bits < MAX_STAT_DEPTH)
+ s->nodesizes[n->bits]++;
+ s->nullpointers += tn_info(n)->empty_children;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This outputs /proc/net/fib_triestats
+ */
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
+{
+ unsigned int i, max, pointers, bytes, avdepth;
+
+ if (stat->leaves)
+ avdepth = stat->totdepth*100 / stat->leaves;
+ else
+ avdepth = 0;
+
+ seq_printf(seq, "\tAver depth: %u.%02d\n",
+ avdepth / 100, avdepth % 100);
+ seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
+
+ seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
+ bytes = LEAF_SIZE * stat->leaves;
+
+ seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
+
+ seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
+ bytes += TNODE_SIZE(0) * stat->tnodes;
+
+ max = MAX_STAT_DEPTH;
+ while (max > 0 && stat->nodesizes[max-1] == 0)
+ max--;
+
+ pointers = 0;
+ for (i = 1; i < max; i++)
+ if (stat->nodesizes[i] != 0) {
+ seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
+ pointers += (1<<i) * stat->nodesizes[i];
+ }
+ seq_putc(seq, '\n');
+ seq_printf(seq, "\tPointers: %u\n", pointers);
+
+ bytes += sizeof(struct key_vector *) * pointers;
+ seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
+}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+static void trie_show_usage(struct seq_file *seq,
+ const struct trie_use_stats __percpu *stats)
+{
+ struct trie_use_stats s = { 0 };
+ int cpu;
+
+ /* loop through all of the CPUs and gather up the stats */
+ for_each_possible_cpu(cpu) {
+ const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+ s.gets += pcpu->gets;
+ s.backtrack += pcpu->backtrack;
+ s.semantic_match_passed += pcpu->semantic_match_passed;
+ s.semantic_match_miss += pcpu->semantic_match_miss;
+ s.null_node_hit += pcpu->null_node_hit;
+ s.resize_node_skipped += pcpu->resize_node_skipped;
+ }
+
+ seq_printf(seq, "\nCounters:\n---------\n");
+ seq_printf(seq, "gets = %u\n", s.gets);
+ seq_printf(seq, "backtracks = %u\n", s.backtrack);
+ seq_printf(seq, "semantic match passed = %u\n",
+ s.semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
+}
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+ if (tb->tb_id == RT_TABLE_LOCAL)
+ seq_puts(seq, "Local:\n");
+ else if (tb->tb_id == RT_TABLE_MAIN)
+ seq_puts(seq, "Main:\n");
+ else
+ seq_printf(seq, "Id %d:\n", tb->tb_id);
+}
+
+
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = (struct net *)seq->private;
+ unsigned int h;
+
+ seq_printf(seq,
+ "Basic info: size of leaf:"
+ " %Zd bytes, size of tnode: %Zd bytes.\n",
+ LEAF_SIZE, TNODE_SIZE(0));
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct trie_stat stat;
+
+ if (!t)
+ continue;
+
+ fib_table_print(seq, tb);
+
+ trie_collect_stats(t, &stat);
+ trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ trie_show_usage(seq, t->stats);
+#endif
+ }
+ }
+
+ return 0;
+}
+
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, fib_triestat_seq_show);
+}
+
+static const struct file_operations fib_triestat_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_triestat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ loff_t idx = 0;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ struct key_vector *n;
+
+ for (n = fib_trie_get_first(iter,
+ (struct trie *) tb->tb_data);
+ n; n = fib_trie_get_next(iter))
+ if (pos == idx++) {
+ iter->tb = tb;
+ return n;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return fib_trie_get_idx(seq, *pos);
+}
+
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct fib_trie_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct fib_table *tb = iter->tb;
+ struct hlist_node *tb_node;
+ unsigned int h;
+ struct key_vector *n;
+
+ ++*pos;
+ /* next node in same table */
+ n = fib_trie_get_next(iter);
+ if (n)
+ return n;
+
+ /* walk rest of this hash chain */
+ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
+ while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+ tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+
+ /* new hash chain */
+ while (++h < FIB_TABLE_HASHSZ) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+ if (n)
+ goto found;
+ }
+ }
+ return NULL;
+
+found:
+ iter->tb = tb;
+ return n;
+}
+
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static void seq_indent(struct seq_file *seq, int n)
+{
+ while (n-- > 0)
+ seq_puts(seq, " ");
+}
+
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
+{
+ switch (s) {
+ case RT_SCOPE_UNIVERSE: return "universe";
+ case RT_SCOPE_SITE: return "site";
+ case RT_SCOPE_LINK: return "link";
+ case RT_SCOPE_HOST: return "host";
+ case RT_SCOPE_NOWHERE: return "nowhere";
+ default:
+ snprintf(buf, len, "scope=%d", s);
+ return buf;
+ }
+}
+
+static const char *const rtn_type_names[__RTN_MAX] = {
+ [RTN_UNSPEC] = "UNSPEC",
+ [RTN_UNICAST] = "UNICAST",
+ [RTN_LOCAL] = "LOCAL",
+ [RTN_BROADCAST] = "BROADCAST",
+ [RTN_ANYCAST] = "ANYCAST",
+ [RTN_MULTICAST] = "MULTICAST",
+ [RTN_BLACKHOLE] = "BLACKHOLE",
+ [RTN_UNREACHABLE] = "UNREACHABLE",
+ [RTN_PROHIBIT] = "PROHIBIT",
+ [RTN_THROW] = "THROW",
+ [RTN_NAT] = "NAT",
+ [RTN_XRESOLVE] = "XRESOLVE",
+};
+
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
+{
+ if (t < __RTN_MAX && rtn_type_names[t])
+ return rtn_type_names[t];
+ snprintf(buf, len, "type %u", t);
+ return buf;
+}
+
+/* Pretty print the trie */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+ const struct fib_trie_iter *iter = seq->private;
+ struct key_vector *n = v;
+
+ if (IS_TRIE(node_parent_rcu(n)))
+ fib_table_print(seq, iter->tb);
+
+ if (IS_TNODE(n)) {
+ __be32 prf = htonl(n->key);
+
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
+ &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
+ } else {
+ __be32 val = htonl(n->key);
+ struct fib_alias *fa;
+
+ seq_indent(seq, iter->depth);
+ seq_printf(seq, " |-- %pI4\n", &val);
+
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
+ }
+ }
+
+ return 0;
+}
+
+static const struct seq_operations fib_trie_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter));
+}
+
+static const struct file_operations fib_trie_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_trie_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct fib_route_iter {
+ struct seq_net_private p;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
+ loff_t pos;
+ t_key key;
+};
+
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
+{
+ struct fib_table *tb = iter->main_tb;
+ struct key_vector *l, **tp = &iter->tnode;
+ struct trie *t;
+ t_key key;
+
+ /* use cache location of next-to-find key */
+ if (iter->pos > 0 && pos >= iter->pos) {
+ pos -= iter->pos;
+ key = iter->key;
+ } else {
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ key = 0;
+ }
+
+ while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ key = l->key + 1;
+ iter->pos++;
+
+ if (pos-- <= 0)
+ break;
+
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
+ }
+
+ if (l)
+ iter->key = key; /* remember it */
+ else
+ iter->pos = 0; /* forget it */
+
+ return l;
+}
+
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb;
+ struct trie *t;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+ if (!tb)
+ return NULL;
+
+ iter->main_tb = tb;
+
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ iter->key = 0;
+
+ return SEQ_START_TOKEN;
+}
+
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct key_vector *l = NULL;
+ t_key key = iter->key;
+
+ ++*pos;
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key + 1;
+ iter->pos++;
+ } else {
+ iter->pos = 0;
+ }
+
+ return l;
+}
+
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+{
+ unsigned int flags = 0;
+
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
+ if (fi && fi->fib_nh->nh_gw)
+ flags |= RTF_GATEWAY;
+ if (mask == htonl(0xFFFFFFFF))
+ flags |= RTF_HOST;
+ flags |= RTF_UP;
+ return flags;
+}
+
+/*
+ * This outputs /proc/net/route.
+ * The format of the file is not supposed to be changed
+ * and needs to be same as fib_hash output to avoid breaking
+ * legacy utilities
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ return 0;
+ }
+
+ prefix = htonl(l->key);
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
+ continue;
+
+ if (fa->tb_id != tb->tb_id)
+ continue;
+
+ seq_setwidth(seq, 127);
+
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_pad(seq, '\n');
+ }
+
+ return 0;
+}
+
+static const struct seq_operations fib_route_seq_ops = {
+ .start = fib_route_seq_start,
+ .next = fib_route_seq_next,
+ .stop = fib_route_seq_stop,
+ .show = fib_route_seq_show,
+};
+
+static int fib_route_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &fib_route_seq_ops,
+ sizeof(struct fib_route_iter));
+}
+
+static const struct file_operations fib_route_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_route_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+int __net_init fib_proc_init(struct net *net)
+{
+ if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
+ goto out1;
+
+ if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ &fib_triestat_fops))
+ goto out2;
+
+ if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
+ goto out3;
+
+ return 0;
+
+out3:
+ remove_proc_entry("fib_triestat", net->proc_net);
+out2:
+ remove_proc_entry("fib_trie", net->proc_net);
+out1:
+ return -ENOMEM;
+}
+
+void __net_exit fib_proc_exit(struct net *net)
+{
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ remove_proc_entry("route", net->proc_net);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
new file mode 100644
index 000000000..34968cd5c
--- /dev/null
+++ b/net/ipv4/fou.c
@@ -0,0 +1,999 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/genetlink.h>
+#include <net/gue.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/xfrm.h>
+#include <uapi/linux/fou.h>
+#include <uapi/linux/genetlink.h>
+
+struct fou {
+ struct socket *sock;
+ u8 protocol;
+ u8 flags;
+ __be16 port;
+ u16 type;
+ struct udp_offload udp_offloads;
+ struct list_head list;
+};
+
+#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
+
+struct fou_cfg {
+ u16 type;
+ u8 protocol;
+ u8 flags;
+ struct udp_port_cfg udp_config;
+};
+
+static unsigned int fou_net_id;
+
+struct fou_net {
+ struct list_head fou_list;
+ struct mutex fou_lock;
+};
+
+static inline struct fou *fou_from_sock(struct sock *sk)
+{
+ return sk->sk_user_data;
+}
+
+static void fou_recv_pull(struct sk_buff *skb, size_t len)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ /* Remove 'len' bytes from the packet (UDP header and
+ * FOU header if present).
+ */
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ __skb_pull(skb, len);
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+ skb_reset_transport_header(skb);
+}
+
+static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+
+ if (!fou)
+ return 1;
+
+ fou_recv_pull(skb, sizeof(struct udphdr));
+
+ return -fou->protocol;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+ void *data, size_t hdrlen, u8 ipproto,
+ bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ skb_remcsum_process(skb, (void *)guehdr + hdrlen,
+ start, offset, nopartial);
+
+ return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+ /* No support yet */
+ kfree_skb(skb);
+ return 0;
+}
+
+static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct fou *fou = fou_from_sock(sk);
+ size_t len, optlen, hdrlen;
+ struct guehdr *guehdr;
+ void *data;
+ u16 doffset = 0;
+
+ if (!fou)
+ return 1;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (!pskb_may_pull(skb, len))
+ goto drop;
+
+ /* guehdr may change after pull */
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
+ goto drop;
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+
+ /* Pull csum through the guehdr now . This can be used if
+ * there is a remote checksum offload.
+ */
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_remcsum(skb, guehdr, data + doffset,
+ hdrlen, guehdr->proto_ctype,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+ if (!guehdr)
+ goto drop;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ if (unlikely(guehdr->control))
+ return gue_control_message(skb, guehdr);
+
+ __skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+ skb_reset_transport_header(skb);
+
+ return -guehdr->proto_ctype;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct sk_buff **fou_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ const struct net_offload **offloads;
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return pp;
+}
+
+static int fou_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ const struct net_offload *ops;
+ u8 proto = NAPI_GRO_CB(skb)->proto;
+ int err = -ENOSYS;
+ const struct net_offload **offloads;
+
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+ struct guehdr *guehdr, void *data,
+ size_t hdrlen, u8 ipproto,
+ struct gro_remcsum *grc, bool nopartial)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+
+ if (skb->remcsum_offload)
+ return NULL;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ /* Pull checksum that will be written */
+ if (skb_gro_header_hard(skb, off + plen)) {
+ guehdr = skb_gro_header_slow(skb, off + plen, off);
+ if (!guehdr)
+ return NULL;
+ }
+
+ skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
+ start, offset, grc, nopartial);
+
+ skb->remcsum_offload = 1;
+
+ return guehdr;
+}
+
+static struct sk_buff **gue_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct guehdr *guehdr;
+ size_t len, optlen, hdrlen, off;
+ void *data;
+ u16 doffset = 0;
+ int flush = 1;
+ struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+ struct gro_remcsum grc;
+
+ skb_gro_remcsum_init(&grc);
+
+ off = skb_gro_offset(skb);
+ len = off + sizeof(*guehdr);
+
+ guehdr = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
+
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
+
+ if (unlikely(guehdr->control) || guehdr->version != 0 ||
+ validate_gue_flags(guehdr, optlen))
+ goto out;
+
+ hdrlen = sizeof(*guehdr) + optlen;
+
+ /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+ * this is needed if there is a remote checkcsum offload.
+ */
+ skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_gro_remcsum(skb, off, guehdr,
+ data + doffset, hdrlen,
+ guehdr->proto_ctype, &grc,
+ !!(fou->flags &
+ FOU_F_REMCSUM_NOPARTIAL));
+ if (!guehdr)
+ goto out;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
+ }
+
+ skb_gro_pull(skb, hdrlen);
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct guehdr *guehdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ guehdr2 = (struct guehdr *)(p->data + off);
+
+ /* Compare base GUE header to be equal (covers
+ * hlen, version, proto_ctype, and flags.
+ */
+ if (guehdr->word != guehdr2->word) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Compare optional fields are the same. */
+ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
+ guehdr->hlen << 2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[guehdr->proto_ctype]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ goto out_unlock;
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_remcsum_cleanup(skb, &grc);
+
+ return pp;
+}
+
+static int gue_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ const struct net_offload **offloads;
+ struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
+ const struct net_offload *ops;
+ unsigned int guehlen;
+ u8 proto;
+ int err = -ENOENT;
+
+ proto = guehdr->proto_ctype;
+
+ guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[proto]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int fou_add_to_port_list(struct net *net, struct fou *fou)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (fou->port == fout->port) {
+ mutex_unlock(&fn->fou_lock);
+ return -EALREADY;
+ }
+ }
+
+ list_add(&fou->list, &fn->fou_list);
+ mutex_unlock(&fn->fou_lock);
+
+ return 0;
+}
+
+static void fou_release(struct fou *fou)
+{
+ struct socket *sock = fou->sock;
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_family == AF_INET)
+ udp_del_offload(&fou->udp_offloads);
+ list_del(&fou->list);
+ udp_tunnel_sock_release(sock);
+
+ kfree(fou);
+}
+
+static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = fou_udp_recv;
+ fou->protocol = cfg->protocol;
+ fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+ fou->udp_offloads.ipproto = cfg->protocol;
+
+ return 0;
+}
+
+static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
+{
+ udp_sk(sk)->encap_rcv = gue_udp_recv;
+ fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
+ fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
+ fou->udp_offloads.port = cfg->udp_config.local_udp_port;
+
+ return 0;
+}
+
+static int fou_create(struct net *net, struct fou_cfg *cfg,
+ struct socket **sockp)
+{
+ struct socket *sock = NULL;
+ struct fou *fou = NULL;
+ struct sock *sk;
+ int err;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &cfg->udp_config, &sock);
+ if (err < 0)
+ goto error;
+
+ /* Allocate FOU port structure */
+ fou = kzalloc(sizeof(*fou), GFP_KERNEL);
+ if (!fou) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ sk = sock->sk;
+
+ fou->flags = cfg->flags;
+ fou->port = cfg->udp_config.local_udp_port;
+
+ /* Initial for fou type */
+ switch (cfg->type) {
+ case FOU_ENCAP_DIRECT:
+ err = fou_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ case FOU_ENCAP_GUE:
+ err = gue_encap_init(sk, fou, cfg);
+ if (err)
+ goto error;
+ break;
+ default:
+ err = -EINVAL;
+ goto error;
+ }
+
+ fou->type = cfg->type;
+
+ udp_sk(sk)->encap_type = 1;
+ udp_encap_enable();
+
+ sk->sk_user_data = fou;
+ fou->sock = sock;
+
+ inet_inc_convert_csum(sk);
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ if (cfg->udp_config.family == AF_INET) {
+ err = udp_add_offload(&fou->udp_offloads);
+ if (err)
+ goto error;
+ }
+
+ err = fou_add_to_port_list(net, fou);
+ if (err)
+ goto error;
+
+ if (sockp)
+ *sockp = sock;
+
+ return 0;
+
+error:
+ kfree(fou);
+ if (sock)
+ udp_tunnel_sock_release(sock);
+
+ return err;
+}
+
+static int fou_destroy(struct net *net, struct fou_cfg *cfg)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ __be16 port = cfg->udp_config.local_udp_port;
+ int err = -EINVAL;
+ struct fou *fou;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fou, &fn->fou_list, list) {
+ if (fou->port == port) {
+ fou_release(fou);
+ err = 0;
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ return err;
+}
+
+static struct genl_family fou_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = FOU_GENL_NAME,
+ .version = FOU_GENL_VERSION,
+ .maxattr = FOU_ATTR_MAX,
+ .netnsok = true,
+};
+
+static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
+ [FOU_ATTR_PORT] = { .type = NLA_U16, },
+ [FOU_ATTR_AF] = { .type = NLA_U8, },
+ [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+ [FOU_ATTR_TYPE] = { .type = NLA_U8, },
+ [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+};
+
+static int parse_nl_config(struct genl_info *info,
+ struct fou_cfg *cfg)
+{
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->udp_config.family = AF_INET;
+
+ if (info->attrs[FOU_ATTR_AF]) {
+ u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
+ cfg->udp_config.family = family;
+ }
+
+ if (info->attrs[FOU_ATTR_PORT]) {
+ __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
+
+ cfg->udp_config.local_udp_port = port;
+ }
+
+ if (info->attrs[FOU_ATTR_IPPROTO])
+ cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
+
+ if (info->attrs[FOU_ATTR_TYPE])
+ cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
+
+ if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
+ cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
+
+ return 0;
+}
+
+static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_create(net, &cfg, NULL);
+}
+
+static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_cfg cfg;
+ int err;
+
+ err = parse_nl_config(info, &cfg);
+ if (err)
+ return err;
+
+ return fou_destroy(net, &cfg);
+}
+
+static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
+{
+ if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
+ nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+ nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
+ nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
+ return -1;
+
+ if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
+ if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
+ return -1;
+ return 0;
+}
+
+static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (fou_fill_info(fou, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct sk_buff *msg;
+ struct fou_cfg cfg;
+ struct fou *fout;
+ __be16 port;
+ int ret;
+
+ ret = parse_nl_config(info, &cfg);
+ if (ret)
+ return ret;
+ port = cfg.udp_config.local_udp_port;
+ if (port == 0)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = -ESRCH;
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (port == fout->port) {
+ ret = fou_dump_info(fout, info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ break;
+ }
+ }
+ mutex_unlock(&fn->fou_lock);
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fout;
+ int idx = 0, ret;
+
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry(fout, &fn->fou_list, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, FOU_CMD_GET);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fn->fou_lock);
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static const struct genl_ops fou_nl_ops[] = {
+ {
+ .cmd = FOU_CMD_ADD,
+ .doit = fou_nl_cmd_add_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_DEL,
+ .doit = fou_nl_cmd_rm_port,
+ .policy = fou_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_GET,
+ .doit = fou_nl_cmd_get_port,
+ .dumpit = fou_nl_dump,
+ .policy = fou_nl_policy,
+ },
+};
+
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+ return sizeof(struct udphdr);
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+ size_t len;
+ bool need_priv = false;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+ if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+ len += GUE_PLEN_REMCSUM;
+ need_priv = true;
+ }
+
+ len += need_priv ? GUE_LEN_PRIV : 0;
+
+ return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ struct guehdr *guehdr;
+ size_t hdrlen, optlen = 0;
+ __be16 sport;
+ void *data;
+ bool need_priv = false;
+
+ if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ csum = false;
+ optlen += GUE_PLEN_REMCSUM;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+ need_priv = true;
+ }
+
+ optlen += need_priv ? GUE_LEN_PRIV : 0;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Get source port (based on flow hash) before skb_push */
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ guehdr = (struct guehdr *)skb->data;
+
+ guehdr->control = 0;
+ guehdr->version = 0;
+ guehdr->hlen = optlen >> 2;
+ guehdr->flags = 0;
+ guehdr->proto_ctype = *protocol;
+
+ data = &guehdr[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+
+ guehdr->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd = data;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ }
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static const struct ip_tunnel_encap_ops fou_iptun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou_build_header,
+};
+
+static const struct ip_tunnel_encap_ops gue_iptun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue_build_header,
+};
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou ops\n");
+ return ret;
+ }
+
+ ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue ops\n");
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
+static __net_init int fou_init_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+
+ INIT_LIST_HEAD(&fn->fou_list);
+ mutex_init(&fn->fou_lock);
+ return 0;
+}
+
+static __net_exit void fou_exit_net(struct net *net)
+{
+ struct fou_net *fn = net_generic(net, fou_net_id);
+ struct fou *fou, *next;
+
+ /* Close all the FOU sockets */
+ mutex_lock(&fn->fou_lock);
+ list_for_each_entry_safe(fou, next, &fn->fou_list, list)
+ fou_release(fou);
+ mutex_unlock(&fn->fou_lock);
+}
+
+static struct pernet_operations fou_net_ops = {
+ .init = fou_init_net,
+ .exit = fou_exit_net,
+ .id = &fou_net_id,
+ .size = sizeof(struct fou_net),
+};
+
+static int __init fou_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&fou_net_ops);
+ if (ret)
+ goto exit;
+
+ ret = genl_register_family_with_ops(&fou_nl_family,
+ fou_nl_ops);
+ if (ret < 0)
+ goto unregister;
+
+ ret = ip_tunnel_encap_add_fou_ops();
+ if (ret == 0)
+ return 0;
+
+ genl_unregister_family(&fou_nl_family);
+unregister:
+ unregister_pernet_device(&fou_net_ops);
+exit:
+ return ret;
+}
+
+static void __exit fou_fini(void)
+{
+ ip_tunnel_encap_del_fou_ops();
+ genl_unregister_family(&fou_nl_family);
+ unregister_pernet_device(&fou_net_ops);
+}
+
+module_init(fou_init);
+module_exit(fou_fini);
+MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
new file mode 100644
index 000000000..8986e63f3
--- /dev/null
+++ b/net/ipv4/geneve.c
@@ -0,0 +1,453 @@
+/*
+ * Geneve: Generic Network Virtualization Encapsulation
+ *
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ethtool.h>
+#include <linux/mutex.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/geneve.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+
+/* Protects sock_list and refcounts. */
+static DEFINE_MUTEX(geneve_mutex);
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+ struct list_head sock_list;
+};
+
+static int geneve_net_id;
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+ return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+static struct geneve_sock *geneve_find_sock(struct net *net,
+ sa_family_t family, __be16 port)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+
+ list_for_each_entry(gs, &gn->sock_list, list) {
+ if (inet_sk(gs->sock->sk)->inet_sport == port &&
+ inet_sk(gs->sock->sk)->sk.sk_family == family)
+ return gs;
+ }
+
+ return NULL;
+}
+
+static void geneve_build_header(struct genevehdr *geneveh,
+ __be16 tun_flags, u8 vni[3],
+ u8 options_len, u8 *options)
+{
+ geneveh->ver = GENEVE_VER;
+ geneveh->opt_len = options_len / 4;
+ geneveh->oam = !!(tun_flags & TUNNEL_OAM);
+ geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+ geneveh->rsvd1 = 0;
+ memcpy(geneveh->vni, vni, 3);
+ geneveh->proto_type = htons(ETH_P_TEB);
+ geneveh->rsvd2 = 0;
+
+ memcpy(geneveh->options, options, options_len);
+}
+
+/* Transmit a fully formatted Geneve frame.
+ *
+ * When calling this function. The skb->data should point
+ * to the geneve header which is fully formed.
+ *
+ * This function will add other UDP tunnel headers.
+ */
+int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
+ struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
+ __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
+ __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+ bool csum, bool xnet)
+{
+ struct genevehdr *gnvh;
+ int min_headroom;
+ int err;
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb))
+ return -ENOMEM;
+
+ skb = udp_tunnel_handle_offloads(skb, csum);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+ geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
+ tos, ttl, df, src_port, dst_port, xnet,
+ !csum);
+}
+EXPORT_SYMBOL_GPL(geneve_xmit_skb);
+
+static int geneve_hlen(struct genevehdr *gh)
+{
+ return sizeof(*gh) + gh->opt_len * 4;
+}
+
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+{
+ struct sk_buff *p, **pp = NULL;
+ struct genevehdr *gh, *gh2;
+ unsigned int hlen, gh_len, off_gnv;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_gnv = skb_gro_offset(skb);
+ hlen = off_gnv + sizeof(*gh);
+ gh = skb_gro_header_fast(skb, off_gnv);
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ if (gh->ver != GENEVE_VER || gh->oam)
+ goto out;
+ gh_len = geneve_hlen(gh);
+
+ hlen = off_gnv + gh_len;
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ gh2 = (struct genevehdr *)(p->data + off_gnv);
+ if (gh->opt_len != gh2->opt_len ||
+ memcmp(gh, gh2, gh_len)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype) {
+ flush = 1;
+ goto out_unlock;
+ }
+
+ skb_gro_pull(skb, gh_len);
+ skb_gro_postpull_rcsum(skb, gh, gh_len);
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+{
+ struct genevehdr *gh;
+ struct packet_offload *ptype;
+ __be16 type;
+ int gh_len;
+ int err = -ENOSYS;
+
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ gh = (struct genevehdr *)(skb->data + nhoff);
+ gh_len = geneve_hlen(gh);
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
+
+ rcu_read_unlock();
+ return err;
+}
+
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+ int err;
+
+ if (sa_family == AF_INET) {
+ err = udp_add_offload(&gs->udp_offloads);
+ if (err)
+ pr_warn("geneve: udp_add_offload failed with status %d\n",
+ err);
+ }
+}
+
+static void geneve_notify_del_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+
+ if (sa_family == AF_INET)
+ udp_del_offload(&gs->udp_offloads);
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct genevehdr *geneveh;
+ struct geneve_sock *gs;
+ int opts_len;
+
+ /* Need Geneve and inner Ethernet header to be present */
+ if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
+ goto error;
+
+ /* Return packets with reserved bits set */
+ geneveh = geneve_hdr(skb);
+
+ if (unlikely(geneveh->ver != GENEVE_VER))
+ goto error;
+
+ if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
+ goto error;
+
+ opts_len = geneveh->opt_len * 4;
+ if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
+ htons(ETH_P_TEB)))
+ goto drop;
+
+ gs = rcu_dereference_sk_user_data(sk);
+ if (!gs)
+ goto drop;
+
+ gs->rcv(gs, skb);
+ return 0;
+
+drop:
+ /* Consume bad packet */
+ kfree_skb(skb);
+ return 0;
+
+error:
+ /* Let the UDP layer deal with the skb */
+ return 1;
+}
+
+static struct socket *geneve_create_sock(struct net *net, bool ipv6,
+ __be16 port)
+{
+ struct socket *sock;
+ struct udp_port_cfg udp_conf;
+ int err;
+
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+ if (ipv6) {
+ udp_conf.family = AF_INET6;
+ } else {
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ }
+
+ udp_conf.local_udp_port = port;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ return sock;
+}
+
+/* Create new listen socket if needed */
+static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool ipv6)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+ struct socket *sock;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
+
+ gs = kzalloc(sizeof(*gs), GFP_KERNEL);
+ if (!gs)
+ return ERR_PTR(-ENOMEM);
+
+ sock = geneve_create_sock(net, ipv6, port);
+ if (IS_ERR(sock)) {
+ kfree(gs);
+ return ERR_CAST(sock);
+ }
+
+ gs->sock = sock;
+ gs->refcnt = 1;
+ gs->rcv = rcv;
+ gs->rcv_data = data;
+
+ /* Initialize the geneve udp offloads structure */
+ gs->udp_offloads.port = port;
+ gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
+ gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
+ geneve_notify_add_rx_port(gs);
+
+ /* Mark socket as an encapsulation socket */
+ tunnel_cfg.sk_user_data = gs;
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+ tunnel_cfg.encap_destroy = NULL;
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
+
+ list_add(&gs->list, &gn->sock_list);
+
+ return gs;
+}
+
+struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
+ geneve_rcv_t *rcv, void *data,
+ bool no_share, bool ipv6)
+{
+ struct geneve_sock *gs;
+
+ mutex_lock(&geneve_mutex);
+
+ gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
+ if (gs) {
+ if (!no_share && gs->rcv == rcv)
+ gs->refcnt++;
+ else
+ gs = ERR_PTR(-EBUSY);
+ } else {
+ gs = geneve_socket_create(net, port, rcv, data, ipv6);
+ }
+
+ mutex_unlock(&geneve_mutex);
+
+ return gs;
+}
+EXPORT_SYMBOL_GPL(geneve_sock_add);
+
+void geneve_sock_release(struct geneve_sock *gs)
+{
+ mutex_lock(&geneve_mutex);
+
+ if (--gs->refcnt)
+ goto unlock;
+
+ list_del(&gs->list);
+ geneve_notify_del_rx_port(gs);
+ udp_tunnel_sock_release(gs->sock);
+ kfree_rcu(gs, rcu);
+
+unlock:
+ mutex_unlock(&geneve_mutex);
+}
+EXPORT_SYMBOL_GPL(geneve_sock_release);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+ INIT_LIST_HEAD(&gn->sock_list);
+
+ return 0;
+}
+
+static struct pernet_operations geneve_net_ops = {
+ .init = geneve_init_net,
+ .id = &geneve_net_id,
+ .size = sizeof(struct geneve_net),
+};
+
+static int __init geneve_init_module(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&geneve_net_ops);
+ if (rc)
+ return rc;
+
+ pr_info("Geneve driver\n");
+
+ return 0;
+}
+module_init(geneve_init_module);
+
+static void __exit geneve_cleanup_module(void)
+{
+ unregister_pernet_subsys(&geneve_net_ops);
+}
+module_exit(geneve_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
+MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("geneve");
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
new file mode 100644
index 000000000..4a7b5b2a1
--- /dev/null
+++ b/net/ipv4/gre_demux.c
@@ -0,0 +1,367 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/if.h>
+#include <linux/icmp.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ int ret;
+
+ if (version >= GREPROTO_MAX)
+ return -EINVAL;
+
+ ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
+ 0 : -EBUSY;
+
+ if (ret)
+ return ret;
+
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(tpi->flags);
+ greh->protocol = tpi->proto;
+
+ if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (tpi->flags&TUNNEL_SEQ) {
+ *ptr = tpi->seq;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_KEY) {
+ *ptr = tpi->key;
+ ptr--;
+ }
+ if (tpi->flags&TUNNEL_CSUM &&
+ !(skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(gre_build_header);
+
+static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = ip_gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else
+ tpi->key = 0;
+
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else
+ tpi->seq = 0;
+
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IP
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = htons(ETH_P_IP);
+ if ((*(u8 *)options & 0xF0) != 0x40) {
+ hdr_len += 4;
+ if (!pskb_may_pull(skb, hdr_len))
+ return -EINVAL;
+ }
+ }
+
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+}
+
+static int gre_cisco_rcv(struct sk_buff *skb)
+{
+ struct tnl_ptk_info tpi;
+ int i;
+ bool csum_err = false;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
+ /* Looped back packet, drop it! */
+ if (rt_is_output_route(skb_rtable(skb)))
+ goto drop;
+ }
+#endif
+
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ goto drop;
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+ int ret;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+ ret = proto->handler(skb, &tpi);
+ if (ret == PACKET_RCVD) {
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void gre_cisco_err(struct sk_buff *skb, u32 info)
+{
+ /* All the routers (except for Linux) return only
+ * 8 bytes of packet payload. It means, that precise relaying of
+ * ICMP in the real Internet is absolutely infeasible.
+ *
+ * Moreover, Cisco "wise men" put GRE key to the third word
+ * in GRE header. It makes impossible maintaining even soft
+ * state for keyed
+ * GRE tunnels with enabled checksum. Tell them "thank you".
+ *
+ * Well, I wonder, rfc1812 was written by Cisco employee,
+ * what the hell these idiots break standards established
+ * by themselves???
+ */
+
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct tnl_ptk_info tpi;
+ bool csum_err = false;
+ int i;
+
+ if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (!csum_err) /* ignore csum errors. */
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ return;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
+ IPPROTO_GRE, 0);
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
+ struct gre_cisco_protocol *proto;
+
+ proto = rcu_dereference(gre_cisco_proto_list[i]);
+ if (!proto)
+ continue;
+
+ if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
+ goto out;
+
+ }
+out:
+ rcu_read_unlock();
+}
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+
+ if (ver >= GREPROTO_MAX)
+ return;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (proto && proto->err_handler)
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static const struct gre_protocol ipgre_protocol = {
+ .handler = gre_cisco_rcv,
+ .err_handler = gre_cisco_err,
+};
+
+int gre_cisco_register(struct gre_cisco_protocol *newp)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[newp->priority];
+
+ return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_register);
+
+int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
+{
+ struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
+ &gre_cisco_proto_list[del_proto->priority];
+ int ret;
+
+ ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
+
+ if (ret)
+ return ret;
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gre_cisco_unregister);
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver\n");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("can't add protocol\n");
+ goto err;
+ }
+
+ if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
+ pr_info("%s: can't add ipgre handler\n", __func__);
+ goto err_gre;
+ }
+
+ return 0;
+err_gre:
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+err:
+ return -EAGAIN;
+}
+
+static void __exit gre_exit(void)
+{
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
new file mode 100644
index 000000000..5aa46d4b4
--- /dev/null
+++ b/net/ipv4/gre_offload.c
@@ -0,0 +1,268 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * GRE GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t enc_features;
+ int ghl;
+ struct gre_base_hdr *greh;
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ __be16 protocol = skb->protocol;
+ int tnl_hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP)))
+ goto out;
+
+ if (!skb->encapsulation)
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ if (unlikely(ghl < sizeof(*greh)))
+ goto out;
+
+ csum = !!(greh->flags & GRE_CSUM);
+ if (csum)
+ skb->encap_hdr_csum = 1;
+
+ /* setup inner skb. */
+ skb->protocol = greh->protocol;
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & features;
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ goto out;
+ }
+
+ skb = segs;
+ tnl_hlen = skb_tnl_header_len(skb);
+ do {
+ __skb_push(skb, ghl);
+ if (csum) {
+ __be32 *pcsum;
+
+ if (skb_has_shared_frag(skb)) {
+ int err;
+
+ err = __skb_linearize(skb);
+ if (err) {
+ kfree_skb_list(segs);
+ segs = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ skb_reset_transport_header(skb);
+
+ greh = (struct gre_base_hdr *)
+ skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ }
+ __skb_push(skb, tnl_hlen - ghl);
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static struct sk_buff **gre_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ const struct gre_base_hdr *greh;
+ unsigned int hlen, grehlen;
+ unsigned int off;
+ int flush = 1;
+ struct packet_offload *ptype;
+ __be16 type;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*greh);
+ greh = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
+ }
+
+ /* Only support version 0 and K (key), C (csum) flags. Note that
+ * although the support for the S (seq#) flag can be added easily
+ * for GRO, this is problematic for GSO hence can not be enabled
+ * here because a GRO pkt may end up in the forwarding path, thus
+ * requiring GSO support to break it up correctly.
+ */
+ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
+ goto out;
+
+ type = greh->protocol;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out_unlock;
+
+ grehlen = GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ hlen = off + grehlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ greh = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out_unlock;
+ }
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
+ if (skb_gro_checksum_simple_validate(skb))
+ goto out_unlock;
+
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ const struct gre_base_hdr *greh2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ /* The following checks are needed to ensure only pkts
+ * from the same tunnel are considered for aggregation.
+ * The criteria for "the same tunnel" includes:
+ * 1) same version (we only support version 0 here)
+ * 2) same protocol (we only support ETH_P_IP for now)
+ * 3) same set of flags
+ * 4) same key if the key field is present.
+ */
+ greh2 = (struct gre_base_hdr *)(p->data + off);
+
+ if (greh2->flags != greh->flags ||
+ greh2->protocol != greh->protocol) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ if (greh->flags & GRE_KEY) {
+ /* compare keys */
+ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+ }
+
+ skb_gro_pull(skb, grehlen);
+
+ /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
+ skb_gro_postpull_rcsum(skb, greh, grehlen);
+
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int gre_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
+ struct packet_offload *ptype;
+ unsigned int grehlen = sizeof(*greh);
+ int err = -ENOENT;
+ __be16 type;
+
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
+
+ type = greh->protocol;
+ if (greh->flags & GRE_KEY)
+ grehlen += GRE_HEADER_SECTION;
+
+ if (greh->flags & GRE_CSUM)
+ grehlen += GRE_HEADER_SECTION;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
+
+ rcu_read_unlock();
+
+ skb_set_inner_mac_header(skb, nhoff + grehlen);
+
+ return err;
+}
+
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_segment = gre_gso_segment,
+ .gro_receive = gre_gro_receive,
+ .gro_complete = gre_gro_complete,
+ },
+};
+
+static int __init gre_offload_init(void)
+{
+ return inet_add_offload(&gre_offload, IPPROTO_GRE);
+}
+device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000..f5203fba6
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1218 @@
+/*
+ * NET3: Implementation of the ICMP protocol layer.
+ *
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Some of the function names and the icmp unreach table for this
+ * module were derived from [icmp.c 1.0.11 06/02/93] by
+ * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ * Other than that this module is a complete rewrite.
+ *
+ * Fixes:
+ * Clemens Fruhwirth : introduce global icmp rate limiting
+ * with icmp type masking ability instead
+ * of broken per type icmp timeouts.
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Multicast ping reply as self.
+ * Alan Cox : Fix atomicity lockup in ip_build_xmit
+ * call.
+ * Alan Cox : Added 216,128 byte paths to the MTU
+ * code.
+ * Martin Mares : RFC1812 checks.
+ * Martin Mares : Can be configured to follow redirects
+ * if acting as a router _without_ a
+ * routing protocol (RFC 1812).
+ * Martin Mares : Echo requests may be configured to
+ * be ignored (RFC 1812).
+ * Martin Mares : Limitation of ICMP error message
+ * transmit rate (RFC 1812).
+ * Martin Mares : TOS and Precedence set correctly
+ * (RFC 1812).
+ * Martin Mares : Now copying as much data from the
+ * original packet as we can without
+ * exceeding 576 bytes (RFC 1812).
+ * Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : RFC1191 correction for 4.2BSD based
+ * path MTU bug.
+ * Thomas Quinot : ICMP Dest Unreach codes up to 15 are
+ * valid (RFC 1812).
+ * Andi Kleen : Check all packet lengths properly
+ * and moved all kfree_skb() up to
+ * icmp_rcv.
+ * Andi Kleen : Move the rate limit bookkeeping
+ * into the dest entry and use a token
+ * bucket filter (thanks to ANK). Make
+ * the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted
+ * at all.
+ * Tristan Greaves : Added sysctl option to ignore bogus
+ * broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ * - Should use skb_pull() instead of all the manual checking.
+ * This would also greatly simply some upper layer error handlers. --AK
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/ip_fib.h>
+
+/*
+ * Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+ struct sk_buff *skb;
+ int offset;
+ int data_len;
+
+ struct {
+ struct icmphdr icmph;
+ __be32 times[3];
+ } data;
+ int head_len;
+ struct ip_options_data replyopts;
+};
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
+
+const struct icmp_err icmp_err_convert[] = {
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
+ .fatal = 0,
+ },
+ {
+ .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
+ .fatal = 1,
+ },
+ {
+ .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
+ .fatal = 1,
+ },
+ {
+ .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
+ .fatal = 0,
+ },
+ {
+ .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
+ .fatal = 0,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
+ .fatal = 1,
+ },
+ {
+ .errno = ENONET, /* ICMP_HOST_ISOLATED */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
+ .fatal = 1,
+ },
+ {
+ .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
+ .fatal = 0,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
+ .fatal = 1,
+ },
+ {
+ .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
+ .fatal = 1,
+ },
+};
+EXPORT_SYMBOL(icmp_err_convert);
+
+/*
+ * ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+ bool (*handler)(struct sk_buff *skb);
+ short error; /* This ICMP is classed as an error message */
+};
+
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ * The ICMP socket(s). This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
+ */
+static struct sock *icmp_sk(struct net *net)
+{
+ return *this_cpu_ptr(net->ipv4.icmp_sk);
+}
+
+static inline struct sock *icmp_xmit_lock(struct net *net)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+
+ sk = icmp_sk(net);
+
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
+ /* This can happen if the output path signals a
+ * dst_link_failure() for an outgoing ICMP packet.
+ */
+ local_bh_enable();
+ return NULL;
+ }
+ return sk;
+}
+
+static inline void icmp_xmit_unlock(struct sock *sk)
+{
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
+int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
+int sysctl_icmp_msgs_burst __read_mostly = 50;
+
+static struct {
+ spinlock_t lock;
+ u32 credit;
+ u32 stamp;
+} icmp_global = {
+ .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
+};
+
+/**
+ * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ *
+ * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Returns false if we reached the limit and can not send another packet.
+ * Note: called with BH disabled
+ */
+bool icmp_global_allow(void)
+{
+ u32 credit, delta, incr = 0, now = (u32)jiffies;
+ bool rc = false;
+
+ /* Check if token bucket is empty and cannot be refilled
+ * without taking the spinlock.
+ */
+ if (!icmp_global.credit) {
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+ }
+
+ spin_lock(&icmp_global.lock);
+ delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (delta >= HZ / 50) {
+ incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
+ if (incr)
+ icmp_global.stamp = now;
+ }
+ credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
+ if (credit) {
+ credit--;
+ rc = true;
+ }
+ icmp_global.credit = credit;
+ spin_unlock(&icmp_global.lock);
+ return rc;
+}
+EXPORT_SYMBOL(icmp_global_allow);
+
+/*
+ * Send an ICMP frame.
+ */
+
+static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+ struct flowi4 *fl4, int type, int code)
+{
+ struct dst_entry *dst = &rt->dst;
+ bool rc = true;
+
+ if (type > NR_ICMP_TYPES)
+ goto out;
+
+ /* Don't limit PMTU discovery. */
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ goto out;
+
+ /* No rate limit on loopback */
+ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ goto out;
+
+ /* Limit if icmp type is enabled in ratemask. */
+ if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ goto out;
+
+ rc = false;
+ if (icmp_global_allow()) {
+ struct inet_peer *peer;
+
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1);
+ rc = inet_peer_xrlim_allow(peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ if (peer)
+ inet_putpeer(peer);
+ }
+out:
+ return rc;
+}
+
+/*
+ * Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+void icmp_out_count(struct net *net, unsigned char type)
+{
+ ICMPMSGOUT_INC_STATS(net, type);
+ ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
+}
+
+/*
+ * Checksum each fragment, and on the first include the headers and final
+ * checksum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ __wsum csum;
+
+ csum = skb_copy_and_csum_bits(icmp_param->skb,
+ icmp_param->offset + offset,
+ to, len, 0);
+
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (icmp_pointers[icmp_param->data.icmph.type].error)
+ nf_ct_attach(skb, icmp_param->skb);
+ return 0;
+}
+
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+ struct flowi4 *fl4,
+ struct ipcm_cookie *ipc, struct rtable **rt)
+{
+ struct sock *sk;
+ struct sk_buff *skb;
+
+ sk = icmp_sk(dev_net((*rt)->dst.dev));
+ if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0) {
+ ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+ ip_flush_pending_frames(sk);
+ } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ struct icmphdr *icmph = icmp_hdr(skb);
+ __wsum csum = 0;
+ struct sk_buff *skb1;
+
+ skb_queue_walk(&sk->sk_write_queue, skb1) {
+ csum = csum_add(csum, skb1->csum);
+ }
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len, csum);
+ icmph->checksum = csum_fold(csum);
+ skb->ip_summed = CHECKSUM_NONE;
+ ip_push_pending_frames(sk, fl4);
+ }
+}
+
+/*
+ * Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+ struct ipcm_cookie ipc;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ struct flowi4 fl4;
+ struct sock *sk;
+ struct inet_sock *inet;
+ __be32 daddr, saddr;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
+
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
+ return;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ return;
+ inet = inet_sk(sk);
+
+ icmp_param->data.icmph.checksum = 0;
+
+ inet->tos = ip_hdr(skb)->tos;
+ sk->sk_mark = mark;
+ daddr = ipc.addr = ip_hdr(skb)->saddr;
+ saddr = fib_compute_spec_dst(skb);
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (icmp_param->replyopts.opt.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts.opt;
+ if (ipc.opt->opt.srr)
+ daddr = icmp_param->replyopts.opt.opt.faddr;
+ }
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_mark = mark;
+ fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_proto = IPPROTO_ICMP;
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ goto out_unlock;
+ if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
+ icmp_param->data.icmph.code))
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock(sk);
+}
+
+static struct rtable *icmp_route_lookup(struct net *net,
+ struct flowi4 *fl4,
+ struct sk_buff *skb_in,
+ const struct iphdr *iph,
+ __be32 saddr, u8 tos, u32 mark,
+ int type, int code,
+ struct icmp_bxm *param)
+{
+ struct rtable *rt, *rt2;
+ struct flowi4 fl4_dec;
+ int err;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = (param->replyopts.opt.opt.srr ?
+ param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->saddr = saddr;
+ fl4->flowi4_mark = mark;
+ fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_proto = IPPROTO_ICMP;
+ fl4->fl4_icmp_type = type;
+ fl4->fl4_icmp_code = code;
+ security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ rt = __ip_route_output_key(net, fl4);
+ if (IS_ERR(rt))
+ return rt;
+
+ /* No need to clone since we're just using its address. */
+ rt2 = rt;
+
+ rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ if (!IS_ERR(rt)) {
+ if (rt != rt2)
+ return rt;
+ } else if (PTR_ERR(rt) == -EPERM) {
+ rt = NULL;
+ } else
+ return rt;
+
+ err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ if (err)
+ goto relookup_failed;
+
+ if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+ rt2 = __ip_route_output_key(net, &fl4_dec);
+ if (IS_ERR(rt2))
+ err = PTR_ERR(rt2);
+ } else {
+ struct flowi4 fl4_2 = {};
+ unsigned long orefdst;
+
+ fl4_2.daddr = fl4_dec.saddr;
+ rt2 = ip_route_output_key(net, &fl4_2);
+ if (IS_ERR(rt2)) {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ /* Ugh! */
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
+ err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+ RT_TOS(tos), rt2->dst.dev);
+
+ dst_release(&rt2->dst);
+ rt2 = skb_rtable(skb_in);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ }
+
+ if (err)
+ goto relookup_failed;
+
+ rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+ flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(rt2)) {
+ dst_release(&rt->dst);
+ memcpy(fl4, &fl4_dec, sizeof(*fl4));
+ rt = rt2;
+ } else if (PTR_ERR(rt2) == -EPERM) {
+ if (rt)
+ dst_release(&rt->dst);
+ return rt2;
+ } else {
+ err = PTR_ERR(rt2);
+ goto relookup_failed;
+ }
+ return rt;
+
+relookup_failed:
+ if (rt)
+ return rt;
+ return ERR_PTR(err);
+}
+
+/*
+ * Send an ICMP message in response to a situation
+ *
+ * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
+ * MAY send more (we do).
+ * MUST NOT change this header information.
+ * MUST NOT reply to a multicast/broadcast IP address.
+ * MUST NOT reply to a multicast/broadcast MAC address.
+ * MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct iphdr *iph;
+ int room;
+ struct icmp_bxm *icmp_param;
+ struct rtable *rt = skb_rtable(skb_in);
+ struct ipcm_cookie ipc;
+ struct flowi4 fl4;
+ __be32 saddr;
+ u8 tos;
+ u32 mark;
+ struct net *net;
+ struct sock *sk;
+
+ if (!rt)
+ goto out;
+ net = dev_net(rt->dst.dev);
+
+ /*
+ * Find the original header. It is expected to be valid, of course.
+ * Check this, icmp_send is called from the most obscure devices
+ * sometimes.
+ */
+ iph = ip_hdr(skb_in);
+
+ if ((u8 *)iph < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(*iph)) >
+ skb_tail_pointer(skb_in))
+ goto out;
+
+ /*
+ * No replies to physical multicast/broadcast
+ */
+ if (skb_in->pkt_type != PACKET_HOST)
+ goto out;
+
+ /*
+ * Now check at the protocol level
+ */
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto out;
+
+ /*
+ * Only reply to fragment 0. We byte re-order the constant
+ * mask for efficiency.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ goto out;
+
+ /*
+ * If we send an ICMP error to an ICMP error a mess would result..
+ */
+ if (icmp_pointers[type].error) {
+ /*
+ * We are an error, check if we are replying to an
+ * ICMP error
+ */
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 _inner_type, *itp;
+
+ itp = skb_header_pointer(skb_in,
+ skb_network_header(skb_in) +
+ (iph->ihl << 2) +
+ offsetof(struct icmphdr,
+ type) -
+ skb_in->data,
+ sizeof(_inner_type),
+ &_inner_type);
+ if (!itp)
+ goto out;
+
+ /*
+ * Assume any unknown ICMP type is an error. This
+ * isn't specified by the RFC, but think about it..
+ */
+ if (*itp > NR_ICMP_TYPES ||
+ icmp_pointers[*itp].error)
+ goto out;
+ }
+ }
+
+ icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+ if (!icmp_param)
+ return;
+
+ sk = icmp_xmit_lock(net);
+ if (!sk)
+ goto out_free;
+
+ /*
+ * Construct source address and options.
+ */
+
+ saddr = iph->daddr;
+ if (!(rt->rt_flags & RTCF_LOCAL)) {
+ struct net_device *dev = NULL;
+
+ rcu_read_lock();
+ if (rt_is_input_route(rt) &&
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+ dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+
+ if (dev)
+ saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ else
+ saddr = 0;
+ rcu_read_unlock();
+ }
+
+ tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ IPTOS_PREC_INTERNETCONTROL) :
+ iph->tos;
+ mark = IP4_REPLY_MARK(net, skb_in->mark);
+
+ if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
+ goto out_unlock;
+
+
+ /*
+ * Prepare data for ICMP header.
+ */
+
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
+ inet_sk(sk)->tos = tos;
+ sk->sk_mark = mark;
+ ipc.addr = iph->saddr;
+ ipc.opt = &icmp_param->replyopts.opt;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
+ type, code, icmp_param);
+ if (IS_ERR(rt))
+ goto out_unlock;
+
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+ goto ende;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+
+ room = dst_mtu(&rt->dst);
+ if (room > 576)
+ room = 576;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen;
+ room -= sizeof(struct icmphdr);
+
+ icmp_param->data_len = skb_in->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
+
+ icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ende:
+ ip_rt_put(rt);
+out_unlock:
+ icmp_xmit_unlock(sk);
+out_free:
+ kfree(icmp_param);
+out:;
+}
+EXPORT_SYMBOL(icmp_send);
+
+
+static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ /* Checkin full IP header plus 8 bytes of protocol to
+ * avoid additional coding at protocol handlers.
+ */
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return;
+ }
+
+ raw_icmp_error(skb, protocol, info);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, info);
+}
+
+static bool icmp_tag_validation(int proto)
+{
+ bool ok;
+
+ rcu_read_lock();
+ ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
+ rcu_read_unlock();
+ return ok;
+}
+
+/*
+ * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
+ * ICMP_PARAMETERPROB.
+ */
+
+static bool icmp_unreach(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ struct icmphdr *icmph;
+ struct net *net;
+ u32 info = 0;
+
+ net = dev_net(skb_dst(skb)->dev);
+
+ /*
+ * Incomplete header ?
+ * Only checks for the IP header, there should be an
+ * additional check for longer headers in upper levels.
+ */
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out_err;
+
+ icmph = icmp_hdr(skb);
+ iph = (const struct iphdr *)skb->data;
+
+ if (iph->ihl < 5) /* Mangled header, drop. */
+ goto out_err;
+
+ if (icmph->type == ICMP_DEST_UNREACH) {
+ switch (icmph->code & 15) {
+ case ICMP_NET_UNREACH:
+ case ICMP_HOST_UNREACH:
+ case ICMP_PROT_UNREACH:
+ case ICMP_PORT_UNREACH:
+ break;
+ case ICMP_FRAG_NEEDED:
+ /* for documentation of the ip_no_pmtu_disc
+ * values please see
+ * Documentation/networking/ip-sysctl.txt
+ */
+ switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ default:
+ net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
+ &iph->daddr);
+ break;
+ case 2:
+ goto out;
+ case 3:
+ if (!icmp_tag_validation(iph->protocol))
+ goto out;
+ /* fall through */
+ case 0:
+ info = ntohs(icmph->un.frag.mtu);
+ }
+ break;
+ case ICMP_SR_FAILED:
+ net_dbg_ratelimited("%pI4: Source Route Failed\n",
+ &iph->daddr);
+ break;
+ default:
+ break;
+ }
+ if (icmph->code > NR_ICMP_UNREACH)
+ goto out;
+ } else if (icmph->type == ICMP_PARAMETERPROB)
+ info = ntohl(icmph->un.gateway) >> 24;
+
+ /*
+ * Throw it at our lower layers
+ *
+ * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+ * header.
+ * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+ * transport layer.
+ * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+ * transport layer.
+ */
+
+ /*
+ * Check the other end isn't violating RFC 1122. Some routers send
+ * bogus responses to broadcast frames. If you see this message
+ * first check your netmask matches at both ends, if it does then
+ * get the other vendor to fix their kit.
+ */
+
+ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+ net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
+ &ip_hdr(skb)->saddr,
+ icmph->type, icmph->code,
+ &iph->daddr, skb->dev->name);
+ goto out;
+ }
+
+ icmp_socket_deliver(skb, info);
+
+out:
+ return true;
+out_err:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return false;
+}
+
+
+/*
+ * Handle ICMP_REDIRECT.
+ */
+
+static bool icmp_redirect(struct sk_buff *skb)
+{
+ if (skb->len < sizeof(struct iphdr)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ return false;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
+ /* there aught to be a stat */
+ return false;
+ }
+
+ icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ return true;
+}
+
+/*
+ * Handle ICMP_ECHO ("ping") requests.
+ *
+ * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ * requests.
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ * included in the reply.
+ * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ * echo requests, MUST have default=NOT.
+ * See also WRT handling of options once they are done and working.
+ */
+
+static bool icmp_echo(struct sk_buff *skb)
+{
+ struct net *net;
+
+ net = dev_net(skb_dst(skb)->dev);
+ if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
+ struct icmp_bxm icmp_param;
+
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_reply(&icmp_param, skb);
+ }
+ /* should there be an ICMP stat for ignored echos? */
+ return true;
+}
+
+/*
+ * Handle ICMP Timestamp requests.
+ * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ * SHOULD be in the kernel for minimum random latency.
+ * MUST be accurate to a few minutes.
+ * MUST be updated at least at 15Hz.
+ */
+static bool icmp_timestamp(struct sk_buff *skb)
+{
+ struct timespec tv;
+ struct icmp_bxm icmp_param;
+ /*
+ * Too short.
+ */
+ if (skb->len < 4)
+ goto out_err;
+
+ /*
+ * Fill in the current time as ms since midnight UT:
+ */
+ getnstimeofday(&tv);
+ icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
+ tv.tv_nsec / NSEC_PER_MSEC);
+ icmp_param.data.times[2] = icmp_param.data.times[1];
+ if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
+ BUG();
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param.data.icmph.code = 0;
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = 0;
+ icmp_param.head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(&icmp_param, skb);
+ return true;
+
+out_err:
+ ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ return false;
+}
+
+static bool icmp_discard(struct sk_buff *skb)
+{
+ /* pretend it was a success */
+ return true;
+}
+
+/*
+ * Deal with incoming ICMP packets.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+ struct icmphdr *icmph;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = dev_net(rt->dst.dev);
+ bool success;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
+
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+ goto drop;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*icmph));
+
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ skb_set_network_header(skb, nh);
+ }
+
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+
+ if (skb_checksum_simple_validate(skb))
+ goto csum_error;
+
+ if (!pskb_pull(skb, sizeof(*icmph)))
+ goto error;
+
+ icmph = icmp_hdr(skb);
+
+ ICMPMSGIN_INC_STATS_BH(net, icmph->type);
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES)
+ goto error;
+
+
+ /*
+ * Parse the ICMP message
+ */
+
+ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ /*
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+ * silently ignored (we let user decide with a sysctl).
+ * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+ * discarded if to broadcast/multicast.
+ */
+ if ((icmph->type == ICMP_ECHO ||
+ icmph->type == ICMP_TIMESTAMP) &&
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+ goto error;
+ }
+ if (icmph->type != ICMP_ECHO &&
+ icmph->type != ICMP_TIMESTAMP &&
+ icmph->type != ICMP_ADDRESS &&
+ icmph->type != ICMP_ADDRESSREPLY) {
+ goto error;
+ }
+ }
+
+ success = icmp_pointers[icmph->type].handler(skb);
+
+ if (success) {
+ consume_skb(skb);
+ return 0;
+ }
+
+drop:
+ kfree_skb(skb);
+ return 0;
+csum_error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
+error:
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ goto drop;
+}
+
+void icmp_err(struct sk_buff *skb, u32 info)
+{
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ int offset = iph->ihl<<2;
+ struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ int type = icmp_hdr(skb)->type;
+ int code = icmp_hdr(skb)->code;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Use ping_err to handle all icmp errors except those
+ * triggered by ICMP_ECHOREPLY which sent from kernel.
+ */
+ if (icmph->type != ICMP_ECHOREPLY) {
+ ping_err(skb, offset, info);
+ return;
+ }
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ else if (type == ICMP_REDIRECT)
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+}
+
+/*
+ * This table is the definition of how we handle ICMP.
+ */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = {
+ .handler = ping_rcv,
+ },
+ [1] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [2] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_DEST_UNREACH] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_SOURCE_QUENCH] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_REDIRECT] = {
+ .handler = icmp_redirect,
+ .error = 1,
+ },
+ [6] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [7] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_ECHO] = {
+ .handler = icmp_echo,
+ },
+ [9] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [10] = {
+ .handler = icmp_discard,
+ .error = 1,
+ },
+ [ICMP_TIME_EXCEEDED] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_PARAMETERPROB] = {
+ .handler = icmp_unreach,
+ .error = 1,
+ },
+ [ICMP_TIMESTAMP] = {
+ .handler = icmp_timestamp,
+ },
+ [ICMP_TIMESTAMPREPLY] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REQUEST] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_INFO_REPLY] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESS] = {
+ .handler = icmp_discard,
+ },
+ [ICMP_ADDRESSREPLY] = {
+ .handler = icmp_discard,
+ },
+};
+
+static void __net_exit icmp_sk_exit(struct net *net)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+ free_percpu(net->ipv4.icmp_sk);
+ net->ipv4.icmp_sk = NULL;
+}
+
+static int __net_init icmp_sk_init(struct net *net)
+{
+ int i, err;
+
+ net->ipv4.icmp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.icmp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, net);
+ if (err < 0)
+ goto fail;
+
+ *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
+
+ /* Control parameters for ECHO replies. */
+ net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+ net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
+
+ /* Control parameter - ignore bogus broadcast responses? */
+ net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+ /*
+ * Configurable global rate limit.
+ *
+ * ratelimit defines tokens/packet consumed for dst->rate_token
+ * bucket ratemask defines which icmp types are ratelimited by
+ * setting it's bit position.
+ *
+ * default:
+ * dest unreachable (3), source quench (4),
+ * time exceeded (11), parameter problem (12)
+ */
+
+ net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+ net->ipv4.sysctl_icmp_ratemask = 0x1818;
+ net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(i)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
+ free_percpu(net->ipv4.icmp_sk);
+ return err;
+}
+
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+ .init = icmp_sk_init,
+ .exit = icmp_sk_exit,
+};
+
+int __init icmp_init(void)
+{
+ return register_pernet_subsys(&icmp_sk_ops);
+}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000..a3a697f5f
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2800 @@
+/*
+ * Linux NET3: Internet Group Management Protocol [IGMP]
+ *
+ * This code implements the IGMP protocol as defined in RFC1112. There has
+ * been a further revision of this protocol since which is now supported.
+ *
+ * If you have trouble with this module be careful what gcc you have used,
+ * the older version didn't come out right using gcc 2.5.8, the newer one
+ * seems to fall out with gcc 2.6.2.
+ *
+ * Authors:
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *
+ * Alan Cox : Added lots of __inline__ to optimise
+ * the memory usage of all the tiny little
+ * functions.
+ * Alan Cox : Dumped the header building experiment.
+ * Alan Cox : Minor tweaks ready for multicast routing
+ * and extended IGMP protocol.
+ * Alan Cox : Removed a load of inline directives. Gcc 2.5.8
+ * writes utterly bogus code otherwise (sigh)
+ * fixed IGMP loopback to behave in the manner
+ * desired by mrouted, fixed the fact it has been
+ * broken since 1.3.6 and cleaned up a few minor
+ * points.
+ *
+ * Chih-Jen Chang : Tried to revise IGMP to Version 2
+ * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ * The enhancements are mainly based on Steve Deering's
+ * ipmulti-3.5 source code.
+ * Chih-Jen Chang : Added the igmp_get_mrouter_info and
+ * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
+ * the mrouted version on that device.
+ * Chih-Jen Chang : Added the max_resp_time parameter to
+ * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
+ * to identify the multicast router version
+ * and do what the IGMP version 2 specified.
+ * Chih-Jen Chang : Added a timer to revert to IGMP V2 router
+ * Tsu-Sheng Tsao if the specified time expired.
+ * Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
+ * Alan Cox : Use GFP_ATOMIC in the right places.
+ * Christian Daudt : igmp timer wasn't set for local group
+ * memberships but was being deleted,
+ * which caused a "del_timer() called
+ * from %p with timer not initialized\n"
+ * message (960131).
+ * Christian Daudt : removed del_timer from
+ * igmp_timer_expire function (960205).
+ * Christian Daudt : igmp_heard_report now only calls
+ * igmp_timer_expire if tm->running is
+ * true (960216).
+ * Malcolm Beattie : ttl comparison wrong in igmp_rcv made
+ * igmp_heard_query never trigger. Expiry
+ * miscalculation fixed in igmp_heard_query
+ * and random() made to return unsigned to
+ * prevent negative expiry times.
+ * Alexey Kuznetsov: Wrong group leaving behaviour, backport
+ * fix from pending 2.1.x patches.
+ * Alan Cox: Forget to enable FDDI support earlier.
+ * Alexey Kuznetsov: Fixed leaving groups on device down.
+ * Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
+ * David L Stevens: IGMPv3 support, with help from
+ * Vinay Kulkarni
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <linux/pkt_sched.h>
+
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/inet_common.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS 20
+#define IP_MAX_MSF 10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
+#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
+#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
+#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
+#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
+
+
+#define IGMP_INITIAL_REPORT_DELAY (1)
+
+/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs!
+ * IGMP specs require to report membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict to specs provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
+ ((in_dev)->mr_v1_seen && \
+ time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) \
+ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
+ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
+ ((in_dev)->mr_v2_seen && \
+ time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static int unsolicited_report_interval(struct in_device *in_dev)
+{
+ int interval_ms, interval_jiffies;
+
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV2_UNSOLICITED_REPORT_INTERVAL);
+ else /* v3 */
+ interval_ms = IN_DEV_CONF_GET(
+ in_dev,
+ IGMPV3_UNSOLICITED_REPORT_INTERVAL);
+
+ interval_jiffies = msecs_to_jiffies(interval_ms);
+
+ /* _timer functions can't handle a delay of 0 jiffies so ensure
+ * we always return a positive value.
+ */
+ if (interval_jiffies <= 0)
+ interval_jiffies = 1;
+ return interval_jiffies;
+}
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta);
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+ if (atomic_dec_and_test(&im->refcnt)) {
+ in_dev_put(im->interface);
+ kfree_rcu(im, rcu);
+ }
+}
+
+#define for_each_pmc_rcu(in_dev, pmc) \
+ for (pmc = rcu_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rcu_dereference(pmc->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc) \
+ for (pmc = rtnl_dereference(in_dev->mc_list); \
+ pmc != NULL; \
+ pmc = rtnl_dereference(pmc->next_rcu))
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ * Timer management
+ */
+
+static void igmp_stop_timer(struct ip_mc_list *im)
+{
+ spin_lock_bh(&im->lock);
+ if (del_timer(&im->timer))
+ atomic_dec(&im->refcnt);
+ im->tm_running = 0;
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ spin_unlock_bh(&im->lock);
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+ int tv = prandom_u32() % max_delay;
+
+ im->tm_running = 1;
+ if (!mod_timer(&im->timer, jiffies+tv+2))
+ atomic_inc(&im->refcnt);
+}
+
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+ int tv = prandom_u32() % in_dev->mr_maxdelay;
+
+ in_dev->mr_gq_running = 1;
+ if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+ int tv = prandom_u32() % delay;
+
+ if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
+ in_dev_hold(in_dev);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+ spin_lock_bh(&im->lock);
+ im->unsolicit_count = 0;
+ if (del_timer(&im->timer)) {
+ if ((long)(im->timer.expires-jiffies) < max_delay) {
+ add_timer(&im->timer);
+ im->tm_running = 1;
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ atomic_dec(&im->refcnt);
+ }
+ igmp_start_timer(im, max_delay);
+ spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ * Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+{
+ switch (type) {
+ case IGMPV3_MODE_IS_INCLUDE:
+ case IGMPV3_MODE_IS_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (!(pmc->gsquery && !psf->sf_gsresp)) {
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return 1;
+ /* don't include if this source is excluded
+ * in all filters
+ */
+ if (psf->sf_count[MCAST_INCLUDE])
+ return type == IGMPV3_MODE_IS_INCLUDE;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ }
+ return 0;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ return psf->sf_count[MCAST_INCLUDE] != 0;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return 0;
+ if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+ psf->sf_count[MCAST_INCLUDE])
+ return 0;
+ return pmc->sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ if (gdeleted || !psf->sf_crcount)
+ return 0;
+ return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ if (pmc->sfmode == MCAST_INCLUDE)
+ return gdeleted || (psf->sf_crcount && sdeleted);
+ return psf->sf_crcount && !gdeleted && !sdeleted;
+ }
+ return 0;
+}
+
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+ struct ip_sf_list *psf;
+ int scount = 0;
+
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+ continue;
+ scount++;
+ }
+ return scount;
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
+{
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct iphdr *pip;
+ struct igmpv3_report *pig;
+ struct net *net = dev_net(dev);
+ struct flowi4 fl4;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ unsigned int size = mtu;
+
+ while (1) {
+ skb = alloc_skb(size + hlen + tlen,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (skb)
+ break;
+ size >>= 1;
+ if (size < 256)
+ return NULL;
+ }
+ skb->priority = TC_PRIO_CONTROL;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = dev;
+
+ skb->reserved_tailroom = skb_end_offset(skb) -
+ min(mtu, skb_end_offset(skb));
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+ pip = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
+
+ pip->version = 4;
+ pip->ihl = (sizeof(struct iphdr)+4)>>2;
+ pip->tos = 0xc0;
+ pip->frag_off = htons(IP_DF);
+ pip->ttl = 1;
+ pip->daddr = fl4.daddr;
+ pip->saddr = fl4.saddr;
+ pip->protocol = IPPROTO_IGMP;
+ pip->tot_len = 0; /* filled in later */
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&pip[1])[0] = IPOPT_RA;
+ ((u8 *)&pip[1])[1] = 4;
+ ((u8 *)&pip[1])[2] = 0;
+ ((u8 *)&pip[1])[3] = 0;
+
+ skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
+ skb_put(skb, sizeof(*pig));
+ pig = igmpv3_report_hdr(skb);
+ pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ pig->resv1 = 0;
+ pig->csum = 0;
+ pig->resv2 = 0;
+ pig->ngrec = 0;
+ return skb;
+}
+
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+ struct igmphdr *pig = igmp_hdr(skb);
+ const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
+
+ pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
+
+ return ip_local_out(skb);
+}
+
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+ return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, struct igmpv3_grec **ppgr)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr;
+
+ if (!skb)
+ skb = igmpv3_newpack(dev, dev->mtu);
+ if (!skb)
+ return NULL;
+ pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ pgr->grec_type = type;
+ pgr->grec_auxwords = 0;
+ pgr->grec_nsrcs = 0;
+ pgr->grec_mca = pmc->multiaddr;
+ pih = igmpv3_report_hdr(skb);
+ pih->ngrec = htons(ntohs(pih->ngrec)+1);
+ *ppgr = pgr;
+ return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+ int type, int gdeleted, int sdeleted)
+{
+ struct net_device *dev = pmc->interface->dev;
+ struct igmpv3_report *pih;
+ struct igmpv3_grec *pgr = NULL;
+ struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ int scount, stotal, first, isquery, truncate;
+
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ return skb;
+
+ isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+ type == IGMPV3_MODE_IS_EXCLUDE;
+ truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+ type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+ stotal = scount = 0;
+
+ psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+ if (!*psf_list)
+ goto empty_source;
+
+ pih = skb ? igmpv3_report_hdr(skb) : NULL;
+
+ /* EX and TO_EX get a fresh packet, if needed */
+ if (truncate) {
+ if (pih && pih->ngrec &&
+ AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ }
+ }
+ first = 1;
+ psf_prev = NULL;
+ for (psf = *psf_list; psf; psf = psf_next) {
+ __be32 *psrc;
+
+ psf_next = psf->sf_next;
+
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ psf_prev = psf;
+ continue;
+ }
+
+ /* clear marks on query responses */
+ if (isquery)
+ psf->sf_gsresp = 0;
+
+ if (AVAILABLE(skb) < sizeof(__be32) +
+ first*sizeof(struct igmpv3_grec)) {
+ if (truncate && !first)
+ break; /* truncate these */
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+ if (skb)
+ igmpv3_sendpack(skb);
+ skb = igmpv3_newpack(dev, dev->mtu);
+ first = 1;
+ scount = 0;
+ }
+ if (first) {
+ skb = add_grhead(skb, pmc, type, &pgr);
+ first = 0;
+ }
+ if (!skb)
+ return NULL;
+ psrc = (__be32 *)skb_put(skb, sizeof(__be32));
+ *psrc = psf->sf_inaddr;
+ scount++; stotal++;
+ if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+ psf->sf_crcount--;
+ if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *psf_list = psf->sf_next;
+ kfree(psf);
+ continue;
+ }
+ }
+ psf_prev = psf;
+ }
+
+empty_source:
+ if (!stotal) {
+ if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+ type == IGMPV3_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->crcount || isquery) {
+ /* make sure we have room for group header */
+ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
+ igmpv3_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ }
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+
+ if (isquery)
+ pmc->gsquery = 0; /* clear query state on report */
+ return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+ struct sk_buff *skb = NULL;
+ int type;
+
+ if (!pmc) {
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (pmc->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ rcu_read_unlock();
+ } else {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ type = IGMPV3_MODE_IS_EXCLUDE;
+ else
+ type = IGMPV3_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ spin_unlock_bh(&pmc->lock);
+ }
+ if (!skb)
+ return 0;
+ return igmpv3_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+ struct ip_sf_list *psf_prev, *psf_next, *psf;
+
+ psf_prev = NULL;
+ for (psf = *ppsf; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ if (psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *ppsf = psf->sf_next;
+ kfree(psf);
+ } else
+ psf_prev = psf;
+ }
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+ struct sk_buff *skb = NULL;
+ int type, dtype;
+
+ rcu_read_lock();
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+
+ /* deleted MCA's */
+ pmc_prev = NULL;
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
+ pmc_next = pmc->next;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1);
+ }
+ if (pmc->crcount) {
+ if (pmc->sfmode == MCAST_EXCLUDE) {
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 1, 0);
+ }
+ pmc->crcount--;
+ if (pmc->crcount == 0) {
+ igmpv3_clear_zeros(&pmc->tomb);
+ igmpv3_clear_zeros(&pmc->sources);
+ }
+ }
+ if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+ if (pmc_prev)
+ pmc_prev->next = pmc_next;
+ else
+ in_dev->mc_tomb = pmc_next;
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ } else
+ pmc_prev = pmc;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ /* change recs */
+ for_each_pmc_rcu(in_dev, pmc) {
+ spin_lock_bh(&pmc->lock);
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ type = IGMPV3_BLOCK_OLD_SOURCES;
+ dtype = IGMPV3_ALLOW_NEW_SOURCES;
+ } else {
+ type = IGMPV3_ALLOW_NEW_SOURCES;
+ dtype = IGMPV3_BLOCK_OLD_SOURCES;
+ }
+ skb = add_grec(skb, pmc, type, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
+
+ /* filter mode changes */
+ if (pmc->crcount) {
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ type = IGMPV3_CHANGE_TO_EXCLUDE;
+ else
+ type = IGMPV3_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0);
+ pmc->crcount--;
+ }
+ spin_unlock_bh(&pmc->lock);
+ }
+ rcu_read_unlock();
+
+ if (!skb)
+ return;
+ (void) igmpv3_sendpack(skb);
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+ int type)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct igmphdr *ih;
+ struct rtable *rt;
+ struct net_device *dev = in_dev->dev;
+ struct net *net = dev_net(dev);
+ __be32 group = pmc ? pmc->multiaddr : 0;
+ struct flowi4 fl4;
+ __be32 dst;
+ int hlen, tlen;
+
+ if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+ return igmpv3_send_report(in_dev, pmc);
+ else if (type == IGMP_HOST_LEAVE_MESSAGE)
+ dst = IGMP_ALL_ROUTER;
+ else
+ dst = group;
+
+ rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+ 0, 0,
+ IPPROTO_IGMP, 0, dev->ifindex);
+ if (IS_ERR(rt))
+ return -1;
+
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
+ if (!skb) {
+ ip_rt_put(rt);
+ return -1;
+ }
+ skb->priority = TC_PRIO_CONTROL;
+
+ skb_dst_set(skb, &rt->dst);
+
+ skb_reserve(skb, hlen);
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, sizeof(struct iphdr) + 4);
+
+ iph->version = 4;
+ iph->ihl = (sizeof(struct iphdr)+4)>>2;
+ iph->tos = 0xc0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = 1;
+ iph->daddr = dst;
+ iph->saddr = fl4.saddr;
+ iph->protocol = IPPROTO_IGMP;
+ ip_select_ident(net, skb, NULL);
+ ((u8 *)&iph[1])[0] = IPOPT_RA;
+ ((u8 *)&iph[1])[1] = 4;
+ ((u8 *)&iph[1])[2] = 0;
+ ((u8 *)&iph[1])[3] = 0;
+
+ ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih->type = type;
+ ih->code = 0;
+ ih->csum = 0;
+ ih->group = group;
+ ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+ return ip_local_out(skb);
+}
+
+static void igmp_gq_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ in_dev->mr_gq_running = 0;
+ igmpv3_send_report(in_dev, NULL);
+ in_dev_put(in_dev);
+}
+
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+ struct in_device *in_dev = (struct in_device *)data;
+
+ igmpv3_send_cr(in_dev);
+ if (in_dev->mr_ifc_count) {
+ in_dev->mr_ifc_count--;
+ igmp_ifc_start_timer(in_dev,
+ unsolicited_report_interval(in_dev));
+ }
+ in_dev_put(in_dev);
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+ return;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+ struct ip_mc_list *im = (struct ip_mc_list *)data;
+ struct in_device *in_dev = im->interface;
+
+ spin_lock(&im->lock);
+ im->tm_running = 0;
+
+ if (im->unsolicit_count) {
+ im->unsolicit_count--;
+ igmp_start_timer(im, unsolicited_report_interval(in_dev));
+ }
+ im->reporter = 1;
+ spin_unlock(&im->lock);
+
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+
+ ip_ma_put(im);
+}
+
+/* mark EXCLUDE-mode sources */
+static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++) {
+ /* skip inactive filters */
+ if (psf->sf_count[MCAST_INCLUDE] ||
+ pmc->sfcount[MCAST_EXCLUDE] !=
+ psf->sf_count[MCAST_EXCLUDE])
+ break;
+ if (srcs[i] == psf->sf_inaddr) {
+ scount++;
+ break;
+ }
+ }
+ }
+ pmc->gsquery = 0;
+ if (scount == nsrcs) /* all sources excluded */
+ return 0;
+ return 1;
+}
+
+static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+ struct ip_sf_list *psf;
+ int i, scount;
+
+ if (pmc->sfmode == MCAST_EXCLUDE)
+ return igmp_xmarksources(pmc, nsrcs, srcs);
+
+ /* mark INCLUDE-mode sources */
+ scount = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++)
+ if (srcs[i] == psf->sf_inaddr) {
+ psf->sf_gsresp = 1;
+ scount++;
+ break;
+ }
+ }
+ if (!scount) {
+ pmc->gsquery = 0;
+ return 0;
+ }
+ pmc->gsquery = 1;
+ return 1;
+}
+
+/* return true if packet was dropped */
+static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
+{
+ struct ip_mc_list *im;
+
+ /* Timers are only set for non-local groups */
+
+ if (group == IGMP_ALL_HOSTS)
+ return false;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == group) {
+ igmp_stop_timer(im);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/* return true if packet was dropped */
+static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+ int len)
+{
+ struct igmphdr *ih = igmp_hdr(skb);
+ struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
+ struct ip_mc_list *im;
+ __be32 group = ih->group;
+ int max_delay;
+ int mark = 0;
+
+
+ if (len == 8) {
+ if (ih->code == 0) {
+ /* Alas, old v1 router presents here. */
+
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+ in_dev->mr_v1_seen = jiffies +
+ IGMP_V1_ROUTER_PRESENT_TIMEOUT;
+ group = 0;
+ } else {
+ /* v2 router present */
+ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+ in_dev->mr_v2_seen = jiffies +
+ IGMP_V2_ROUTER_PRESENT_TIMEOUT;
+ }
+ /* cancel the interface change timer */
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ /* clear deleted report items */
+ igmpv3_clear_delrec(in_dev);
+ } else if (len < 12) {
+ return true; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ } else { /* v3 */
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+ return true;
+
+ ih3 = igmpv3_query_hdr(skb);
+ if (ih3->nsrcs) {
+ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ + ntohs(ih3->nsrcs)*sizeof(__be32)))
+ return true;
+ ih3 = igmpv3_query_hdr(skb);
+ }
+
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+ if (!max_delay)
+ max_delay = 1; /* can't mod w/ 0 */
+ in_dev->mr_maxdelay = max_delay;
+ if (ih3->qrv)
+ in_dev->mr_qrv = ih3->qrv;
+ if (!group) { /* general query */
+ if (ih3->nsrcs)
+ return true; /* no sources allowed */
+ igmp_gq_start_timer(in_dev);
+ return false;
+ }
+ /* mark sources to include, if group & source-specific */
+ mark = ih3->nsrcs != 0;
+ }
+
+ /*
+ * - Start the timers in all of our membership records
+ * that the query applies to for the interface on
+ * which the query arrived excl. those that belong
+ * to a "local" group (224.0.0.X)
+ * - For timers already running check if they need to
+ * be reset.
+ * - Use the igmp->igmp_code field as the maximum
+ * delay possible
+ */
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, im) {
+ int changed;
+
+ if (group && group != im->multiaddr)
+ continue;
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+ spin_lock_bh(&im->lock);
+ if (im->tm_running)
+ im->gsquery = im->gsquery && mark;
+ else
+ im->gsquery = mark;
+ changed = !im->gsquery ||
+ igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+ spin_unlock_bh(&im->lock);
+ if (changed)
+ igmp_mod_timer(im, max_delay);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/* called in rcu_read_lock() section */
+int igmp_rcv(struct sk_buff *skb)
+{
+ /* This basically follows the spec line by line -- see RFC1112 */
+ struct igmphdr *ih;
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ int len = skb->len;
+ bool dropped = true;
+
+ if (!in_dev)
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+ goto drop;
+
+ if (skb_checksum_simple_validate(skb))
+ goto drop;
+
+ ih = igmp_hdr(skb);
+ switch (ih->type) {
+ case IGMP_HOST_MEMBERSHIP_QUERY:
+ dropped = igmp_heard_query(in_dev, skb, len);
+ break;
+ case IGMP_HOST_MEMBERSHIP_REPORT:
+ case IGMPV2_HOST_MEMBERSHIP_REPORT:
+ /* Is it our report looped back? */
+ if (rt_is_output_route(skb_rtable(skb)))
+ break;
+ /* don't rely on MC router hearing unicast reports */
+ if (skb->pkt_type == PACKET_MULTICAST ||
+ skb->pkt_type == PACKET_BROADCAST)
+ dropped = igmp_heard_report(in_dev, ih->group);
+ break;
+ case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+ return pim_rcv_v1(skb);
+#endif
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
+ case IGMP_DVMRP:
+ case IGMP_TRACE:
+ case IGMP_HOST_LEAVE_MESSAGE:
+ case IGMP_MTRACE:
+ case IGMP_MTRACE_RESP:
+ break;
+ default:
+ break;
+ }
+
+drop:
+ if (dropped)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ return 0;
+}
+
+#endif
+
+
+/*
+ * Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+ We will get multicast token leakage, when IFF_MULTICAST
+ is changed. This check should be done in ndo_set_rx_mode
+ routine. Something sort of:
+ if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+ --ANK
+ */
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_add(dev, buf);
+}
+
+/*
+ * Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
+{
+ char buf[MAX_ADDR_LEN];
+ struct net_device *dev = in_dev->dev;
+
+ if (arp_mc_map(addr, buf, dev, 0) == 0)
+ dev_mc_del(dev, buf);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+ struct ip_mc_list *pmc;
+
+ /* this is an "ip_mc_list" for convenience; only the fields below
+ * are actually used. In particular, the refcnt and users are not
+ * used for management of the delete list. Using the same structure
+ * for deleted items allows change reports to use common code with
+ * non-deleted or query-response MCA's.
+ */
+ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+ if (!pmc)
+ return;
+ spin_lock_bh(&im->lock);
+ pmc->interface = im->interface;
+ in_dev_hold(in_dev);
+ pmc->multiaddr = im->multiaddr;
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->sfmode = im->sfmode;
+ if (pmc->sfmode == MCAST_INCLUDE) {
+ struct ip_sf_list *psf;
+
+ pmc->tomb = im->tomb;
+ pmc->sources = im->sources;
+ im->tomb = im->sources = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = pmc->crcount;
+ }
+ spin_unlock_bh(&im->lock);
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc->next = in_dev->mc_tomb;
+ in_dev->mc_tomb = pmc;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+{
+ struct ip_mc_list *pmc, *pmc_prev;
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc_prev = NULL;
+ for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
+ if (pmc->multiaddr == multiaddr)
+ break;
+ pmc_prev = pmc;
+ }
+ if (pmc) {
+ if (pmc_prev)
+ pmc_prev->next = pmc->next;
+ else
+ in_dev->mc_tomb = pmc->next;
+ }
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+ if (pmc) {
+ for (psf = pmc->tomb; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+}
+
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc, *nextpmc;
+
+ spin_lock_bh(&in_dev->mc_tomb_lock);
+ pmc = in_dev->mc_tomb;
+ in_dev->mc_tomb = NULL;
+ spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+ for (; pmc; pmc = nextpmc) {
+ nextpmc = pmc->next;
+ ip_mc_clear_src(pmc);
+ in_dev_put(pmc->interface);
+ kfree(pmc);
+ }
+ /* clear dead sources, too */
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ struct ip_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&pmc->lock);
+ psf = pmc->tomb;
+ pmc->tomb = NULL;
+ spin_unlock_bh(&pmc->lock);
+ for (; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ }
+ rcu_read_unlock();
+}
+#endif
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ int reporter;
+#endif
+
+ if (im->loaded) {
+ im->loaded = 0;
+ ip_mc_filter_del(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ reporter = im->reporter;
+ igmp_stop_timer(im);
+
+ if (!in_dev->dead) {
+ if (IGMP_V1_SEEN(in_dev))
+ return;
+ if (IGMP_V2_SEEN(in_dev)) {
+ if (reporter)
+ igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+ return;
+ }
+ /* IGMPv3 */
+ igmpv3_add_delrec(in_dev, im);
+
+ igmp_ifc_event(in_dev);
+ }
+#endif
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+ struct in_device *in_dev = im->interface;
+
+ if (im->loaded == 0) {
+ im->loaded = 1;
+ ip_mc_filter_add(in_dev, im->multiaddr);
+ }
+
+#ifdef CONFIG_IP_MULTICAST
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ return;
+
+ if (in_dev->dead)
+ return;
+ if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+ spin_lock_bh(&im->lock);
+ igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ /* else, v3 */
+
+ im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ * Multicast list managers
+ */
+
+static u32 ip_mc_hash(const struct ip_mc_list *im)
+{
+ return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
+}
+
+static void ip_mc_hash_add(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash;
+ u32 hash;
+
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ rcu_assign_pointer(mc_hash[hash], im);
+ return;
+ }
+
+ /* do not use a hash table for small number of items */
+ if (in_dev->mc_count < 4)
+ return;
+
+ mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
+ GFP_KERNEL);
+ if (!mc_hash)
+ return;
+
+ for_each_pmc_rtnl(in_dev, im) {
+ hash = ip_mc_hash(im);
+ im->next_hash = mc_hash[hash];
+ RCU_INIT_POINTER(mc_hash[hash], im);
+ }
+
+ rcu_assign_pointer(in_dev->mc_hash, mc_hash);
+}
+
+static void ip_mc_hash_remove(struct in_device *in_dev,
+ struct ip_mc_list *im)
+{
+ struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
+ struct ip_mc_list *aux;
+
+ if (!mc_hash)
+ return;
+ mc_hash += ip_mc_hash(im);
+ while ((aux = rtnl_dereference(*mc_hash)) != im)
+ mc_hash = &aux->next_hash;
+ *mc_hash = im->next_hash;
+}
+
+
+/*
+ * A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+ struct ip_mc_list *im;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == addr) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ goto out;
+ }
+ }
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ im->users = 1;
+ im->interface = in_dev;
+ in_dev_hold(in_dev);
+ im->multiaddr = addr;
+ /* initial mode is (EX, empty) */
+ im->sfmode = MCAST_EXCLUDE;
+ im->sfcount[MCAST_EXCLUDE] = 1;
+ atomic_set(&im->refcnt, 1);
+ spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+ setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+ im->unsolicit_count = sysctl_igmp_qrv;
+#endif
+
+ im->next_rcu = in_dev->mc_list;
+ in_dev->mc_count++;
+ rcu_assign_pointer(in_dev->mc_list, im);
+
+ ip_mc_hash_add(in_dev, im);
+
+#ifdef CONFIG_IP_MULTICAST
+ igmpv3_del_delrec(in_dev, im->multiaddr);
+#endif
+ igmp_group_added(im);
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+out:
+ return;
+}
+EXPORT_SYMBOL(ip_mc_inc_group);
+
+/*
+ * Resend IGMP JOIN report; used by netdev notifier.
+ */
+static void ip_mc_rejoin_groups(struct in_device *in_dev)
+{
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_mc_list *im;
+ int type;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == IGMP_ALL_HOSTS)
+ continue;
+
+ /* a failover is happening and switches
+ * must be notified immediately
+ */
+ if (IGMP_V1_SEEN(in_dev))
+ type = IGMP_HOST_MEMBERSHIP_REPORT;
+ else if (IGMP_V2_SEEN(in_dev))
+ type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+ else
+ type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+ igmp_send_report(in_dev, im, type);
+ }
+#endif
+}
+
+/*
+ * A socket has left a multicast group on device dev
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+{
+ struct ip_mc_list *i;
+ struct ip_mc_list __rcu **ip;
+
+ ASSERT_RTNL();
+
+ for (ip = &in_dev->mc_list;
+ (i = rtnl_dereference(*ip)) != NULL;
+ ip = &i->next_rcu) {
+ if (i->multiaddr == addr) {
+ if (--i->users == 0) {
+ ip_mc_hash_remove(in_dev, i);
+ *ip = i->next_rcu;
+ in_dev->mc_count--;
+ igmp_group_dropped(i);
+ ip_mc_clear_src(i);
+
+ if (!in_dev->dead)
+ ip_rt_multicast_event(in_dev);
+
+ ip_ma_put(i);
+ return;
+ }
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL(ip_mc_dec_group);
+
+/* Device changing type */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+}
+
+void ip_mc_remap(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_dropped(pmc);
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_ifc_count = 0;
+ if (del_timer(&in_dev->mr_ifc_timer))
+ __in_dev_put(in_dev);
+ in_dev->mr_gq_running = 0;
+ if (del_timer(&in_dev->mr_gq_timer))
+ __in_dev_put(in_dev);
+ igmpv3_clear_delrec(in_dev);
+#endif
+
+ ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+ ASSERT_RTNL();
+
+#ifdef CONFIG_IP_MULTICAST
+ setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
+ (unsigned long)in_dev);
+ setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
+ (unsigned long)in_dev);
+ in_dev->mr_qrv = sysctl_igmp_qrv;
+#endif
+
+ spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+ struct ip_mc_list *pmc;
+
+ ASSERT_RTNL();
+
+#ifdef CONFIG_IP_MULTICAST
+ in_dev->mr_qrv = sysctl_igmp_qrv;
+#endif
+ ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+ for_each_pmc_rtnl(in_dev, pmc)
+ igmp_group_added(pmc);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+ struct ip_mc_list *i;
+
+ ASSERT_RTNL();
+
+ /* Deactivate timers */
+ ip_mc_down(in_dev);
+
+ while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
+ in_dev->mc_list = i->next_rcu;
+ in_dev->mc_count--;
+
+ /* We've dropped the groups in ip_mc_down already */
+ ip_mc_clear_src(i);
+ ip_ma_put(i);
+ }
+}
+
+/* RTNL is locked */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
+{
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
+
+ if (imr->imr_ifindex) {
+ idev = inetdev_by_index(net, imr->imr_ifindex);
+ return idev;
+ }
+ if (imr->imr_address.s_addr) {
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
+ if (!dev)
+ return NULL;
+ }
+
+ if (!dev) {
+ struct rtable *rt = ip_route_output(net,
+ imr->imr_multiaddr.s_addr,
+ 0, 0, 0);
+ if (!IS_ERR(rt)) {
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ }
+ if (dev) {
+ imr->imr_ifindex = dev->ifindex;
+ idev = __in_dev_get_rtnl(dev);
+ }
+ return idev;
+}
+
+/*
+ * Join a socket to a group
+ */
+int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
+#ifdef CONFIG_IP_MULTICAST
+int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
+#endif
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+ __be32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+ int rv = 0;
+
+ psf_prev = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf || psf->sf_count[sfmode] == 0) {
+ /* source filter not found, or count wrong => bug */
+ return -ESRCH;
+ }
+ psf->sf_count[sfmode]--;
+ if (psf->sf_count[sfmode] == 0) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct in_device *in_dev = pmc->interface;
+#endif
+
+ /* no more filters for this source */
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+ if (psf->sf_oldin &&
+ !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+ psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ psf->sf_next = pmc->tomb;
+ pmc->tomb = psf;
+ rv = 1;
+ } else
+#endif
+ kfree(psf);
+ }
+ return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x) do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int changerec = 0;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ rcu_read_unlock();
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ if (!delta) {
+ err = -EINVAL;
+ if (!pmc->sfcount[sfmode])
+ goto out_unlock;
+ pmc->sfcount[sfmode]--;
+ }
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+ changerec |= rv > 0;
+ if (!err && rv < 0)
+ err = rv;
+ }
+ if (pmc->sfmode == MCAST_EXCLUDE &&
+ pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+ pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+#endif
+
+ /* filter mode change */
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(pmc->interface);
+ } else if (sf_setstate(pmc) || changerec) {
+ igmp_ifc_event(pmc->interface);
+#endif
+ }
+out_unlock:
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+ __be32 *psfsrc)
+{
+ struct ip_sf_list *psf, *psf_prev;
+
+ psf_prev = NULL;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == *psfsrc)
+ break;
+ psf_prev = psf;
+ }
+ if (!psf) {
+ psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ if (!psf)
+ return -ENOBUFS;
+ psf->sf_inaddr = *psfsrc;
+ if (psf_prev) {
+ psf_prev->sf_next = psf;
+ } else
+ pmc->sources = psf;
+ }
+ psf->sf_count[sfmode]++;
+ if (psf->sf_count[sfmode] == 1) {
+ ip_rt_multicast_event(pmc->interface);
+ }
+ return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ psf->sf_oldin = mca_xcount ==
+ psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *dpsf;
+ int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+ int qrv = pmc->interface->mr_qrv;
+ int new_in, rv;
+
+ rv = 0;
+ for (psf = pmc->sources; psf; psf = psf->sf_next) {
+ if (pmc->sfcount[MCAST_EXCLUDE]) {
+ new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+ if (new_in) {
+ if (!psf->sf_oldin) {
+ struct ip_sf_list *prev = NULL;
+
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
+ if (dpsf->sf_inaddr == psf->sf_inaddr)
+ break;
+ prev = dpsf;
+ }
+ if (dpsf) {
+ if (prev)
+ prev->sf_next = dpsf->sf_next;
+ else
+ pmc->tomb = dpsf->sf_next;
+ kfree(dpsf);
+ }
+ psf->sf_crcount = qrv;
+ rv++;
+ }
+ } else if (psf->sf_oldin) {
+
+ psf->sf_crcount = 0;
+ /*
+ * add or update "delete" records if an active filter
+ * is now inactive
+ */
+ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
+ if (dpsf->sf_inaddr == psf->sf_inaddr)
+ break;
+ if (!dpsf) {
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ if (!dpsf)
+ continue;
+ *dpsf = *psf;
+ /* pmc->lock held by callers */
+ dpsf->sf_next = pmc->tomb;
+ pmc->tomb = dpsf;
+ }
+ dpsf->sf_crcount = qrv;
+ rv++;
+ }
+ }
+ return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+ int sfcount, __be32 *psfsrc, int delta)
+{
+ struct ip_mc_list *pmc;
+ int isexclude;
+ int i, err;
+
+ if (!in_dev)
+ return -ENODEV;
+ rcu_read_lock();
+ for_each_pmc_rcu(in_dev, pmc) {
+ if (*pmca == pmc->multiaddr)
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->lock);
+ rcu_read_unlock();
+
+#ifdef CONFIG_IP_MULTICAST
+ sf_markstate(pmc);
+#endif
+ isexclude = pmc->sfmode == MCAST_EXCLUDE;
+ if (!delta)
+ pmc->sfcount[sfmode]++;
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
+ if (err)
+ break;
+ }
+ if (err) {
+ int j;
+
+ if (!delta)
+ pmc->sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+ } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+ struct ip_sf_list *psf;
+ in_dev = pmc->interface;
+#endif
+
+ /* filter mode change */
+ if (pmc->sfcount[MCAST_EXCLUDE])
+ pmc->sfmode = MCAST_EXCLUDE;
+ else if (pmc->sfcount[MCAST_INCLUDE])
+ pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+ /* else no filters; keep old mode for reports */
+
+ pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = pmc->crcount;
+ for (psf = pmc->sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ igmp_ifc_event(in_dev);
+ } else if (sf_setstate(pmc)) {
+ igmp_ifc_event(in_dev);
+#endif
+ }
+ spin_unlock_bh(&pmc->lock);
+ return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+ struct ip_sf_list *psf, *nextpsf;
+
+ for (psf = pmc->tomb; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->tomb = NULL;
+ for (psf = pmc->sources; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+/* Join a multicast group
+ */
+
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ __be32 addr = imr->imr_multiaddr.s_addr;
+ struct ip_mc_socklist *iml, *i;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ int ifindex;
+ int count = 0;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ in_dev = ip_mc_find_dev(net, imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ err = -EADDRINUSE;
+ ifindex = imr->imr_ifindex;
+ for_each_pmc_rtnl(inet, i) {
+ if (i->multi.imr_multiaddr.s_addr == addr &&
+ i->multi.imr_ifindex == ifindex)
+ goto done;
+ count++;
+ }
+ err = -ENOBUFS;
+ if (count >= sysctl_igmp_max_memberships)
+ goto done;
+ iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+ if (!iml)
+ goto done;
+
+ memcpy(&iml->multi, imr, sizeof(*imr));
+ iml->next_rcu = inet->mc_list;
+ iml->sflist = NULL;
+ iml->sfmode = MCAST_EXCLUDE;
+ rcu_assign_pointer(inet->mc_list, iml);
+ ip_mc_inc_group(in_dev, addr);
+ err = 0;
+done:
+ return err;
+}
+EXPORT_SYMBOL(ip_mc_join_group);
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+ struct in_device *in_dev)
+{
+ struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
+ int err;
+
+ if (!psf) {
+ /* any-source empty exclude case */
+ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, 0, NULL, 0);
+ }
+ err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+ iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psf, rcu);
+ return err;
+}
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+ struct ip_mc_socklist __rcu **imlp;
+ struct in_device *in_dev;
+ struct net *net = sock_net(sk);
+ __be32 group = imr->imr_multiaddr.s_addr;
+ u32 ifindex;
+ int ret = -EADDRNOTAVAIL;
+
+ ASSERT_RTNL();
+
+ in_dev = ip_mc_find_dev(net, imr);
+ if (!in_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ ifindex = imr->imr_ifindex;
+ for (imlp = &inet->mc_list;
+ (iml = rtnl_dereference(*imlp)) != NULL;
+ imlp = &iml->next_rcu) {
+ if (iml->multi.imr_multiaddr.s_addr != group)
+ continue;
+ if (ifindex) {
+ if (iml->multi.imr_ifindex != ifindex)
+ continue;
+ } else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
+ iml->multi.imr_address.s_addr)
+ continue;
+
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+
+ *imlp = iml->next_rcu;
+
+ ip_mc_dec_group(in_dev, group);
+
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
+ return 0;
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ip_mc_leave_group);
+
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+ ip_mreq_source *mreqs, int ifindex)
+{
+ int err;
+ struct ip_mreqn imr;
+ __be32 addr = mreqs->imr_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+ int leavegroup = 0;
+ int i, j, rv;
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+ imr.imr_address.s_addr = mreqs->imr_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ /* if a source filter was set, must be the same mode as before */
+ if (pmc->sflist) {
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
+ goto done;
+ }
+ } else if (pmc->sfmode != omode) {
+ /* allow mode switches for empty-set filters */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+ NULL, 0);
+ pmc->sfmode = omode;
+ }
+
+ psl = rtnl_dereference(pmc->sflist);
+ if (!add) {
+ if (!psl)
+ goto done; /* err = -EADDRNOTAVAIL */
+ rv = !0;
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__be32));
+ if (rv == 0)
+ break;
+ }
+ if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ /* update the interface filter */
+ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+
+ for (j = i+1; j < psl->sl_count; j++)
+ psl->sl_addr[j-1] = psl->sl_addr[j];
+ psl->sl_count--;
+ err = 0;
+ goto done;
+ }
+ /* else, add a new source to the filter */
+
+ if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ if (!psl || psl->sl_count == psl->sl_max) {
+ struct ip_sf_socklist *newpsl;
+ int count = IP_SFBLOCK;
+
+ if (psl)
+ count += psl->sl_max;
+ newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = count;
+ newpsl->sl_count = count - IP_SFBLOCK;
+ if (psl) {
+ for (i = 0; i < psl->sl_count; i++)
+ newpsl->sl_addr[i] = psl->sl_addr[i];
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
+ }
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ psl = newpsl;
+ }
+ rv = 1; /* > 0 for insert logic below if sl_count is 0 */
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+ sizeof(__be32));
+ if (rv == 0)
+ break;
+ }
+ if (rv == 0) /* address already there is an error */
+ goto done;
+ for (j = psl->sl_count-1; j >= i; j--)
+ psl->sl_addr[j+1] = psl->sl_addr[j];
+ psl->sl_addr[i] = mreqs->imr_sourceaddr;
+ psl->sl_count++;
+ err = 0;
+ /* update the interface list */
+ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+ &mreqs->imr_sourceaddr, 1);
+done:
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
+ return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+ int err = 0;
+ struct ip_mreqn imr;
+ __be32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
+ int leavegroup = 0;
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+ if (msf->imsf_fmode != MCAST_INCLUDE &&
+ msf->imsf_fmode != MCAST_EXCLUDE)
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = ifindex;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ if (msf->imsf_numsrc) {
+ newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
+ GFP_KERNEL);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+ memcpy(newpsl->sl_addr, msf->imsf_slist,
+ msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+ if (err) {
+ sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ goto done;
+ }
+ } else {
+ newpsl = NULL;
+ (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+ msf->imsf_fmode, 0, NULL, 0);
+ }
+ psl = rtnl_dereference(pmc->sflist);
+ if (psl) {
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
+ } else
+ (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+ 0, NULL, 0);
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ pmc->sfmode = msf->imsf_fmode;
+ err = 0;
+done:
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
+ return err;
+}
+
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+ struct ip_msfilter __user *optval, int __user *optlen)
+{
+ int err, len, count, copycount;
+ struct ip_mreqn imr;
+ __be32 addr = msf->imsf_multiaddr;
+ struct ip_mc_socklist *pmc;
+ struct in_device *in_dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+
+ imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+ imr.imr_address.s_addr = msf->imsf_interface;
+ imr.imr_ifindex = 0;
+ in_dev = ip_mc_find_dev(net, &imr);
+
+ if (!in_dev) {
+ err = -ENODEV;
+ goto done;
+ }
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+ pmc->multi.imr_ifindex == imr.imr_ifindex)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ msf->imsf_fmode = pmc->sfmode;
+ psl = rtnl_dereference(pmc->sflist);
+ rtnl_unlock();
+ if (!psl) {
+ len = 0;
+ count = 0;
+ } else {
+ count = psl->sl_count;
+ }
+ copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+ len = copycount * sizeof(psl->sl_addr[0]);
+ msf->imsf_numsrc = count;
+ if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ if (len &&
+ copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ return -EFAULT;
+ return 0;
+done:
+ rtnl_unlock();
+ return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+ struct group_filter __user *optval, int __user *optlen)
+{
+ int err, i, count, copycount;
+ struct sockaddr_in *psin;
+ __be32 addr;
+ struct ip_mc_socklist *pmc;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_sf_socklist *psl;
+
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ addr = psin->sin_addr.s_addr;
+ if (!ipv4_is_multicast(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rtnl(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == addr &&
+ pmc->multi.imr_ifindex == gsf->gf_interface)
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ gsf->gf_fmode = pmc->sfmode;
+ psl = rtnl_dereference(pmc->sflist);
+ rtnl_unlock();
+ count = psl ? psl->sl_count : 0;
+ copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ gsf->gf_numsrc = count;
+ if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ for (i = 0; i < copycount; i++) {
+ struct sockaddr_storage ss;
+
+ psin = (struct sockaddr_in *)&ss;
+ memset(&ss, 0, sizeof(ss));
+ psin->sin_family = AF_INET;
+ psin->sin_addr.s_addr = psl->sl_addr[i];
+ if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ return -EFAULT;
+ }
+ return 0;
+done:
+ rtnl_unlock();
+ return err;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>
+ */
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *pmc;
+ struct ip_sf_socklist *psl;
+ int i;
+ int ret;
+
+ ret = 1;
+ if (!ipv4_is_multicast(loc_addr))
+ goto out;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(inet, pmc) {
+ if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+ pmc->multi.imr_ifindex == dif)
+ break;
+ }
+ ret = inet->mc_all;
+ if (!pmc)
+ goto unlock;
+ psl = rcu_dereference(pmc->sflist);
+ ret = (pmc->sfmode == MCAST_EXCLUDE);
+ if (!psl)
+ goto unlock;
+
+ for (i = 0; i < psl->sl_count; i++) {
+ if (psl->sl_addr[i] == rmt_addr)
+ break;
+ }
+ ret = 0;
+ if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+ goto unlock;
+ if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+ goto unlock;
+ ret = 1;
+unlock:
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+/*
+ * A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+ struct net *net = sock_net(sk);
+
+ if (!inet->mc_list)
+ return;
+
+ rtnl_lock();
+ while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
+ struct in_device *in_dev;
+
+ inet->mc_list = iml->next_rcu;
+ in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
+ (void) ip_mc_leave_src(sk, iml, in_dev);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ kfree_rcu(iml, rcu);
+ }
+ rtnl_unlock();
+}
+
+/* called with rcu_read_lock() */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+{
+ struct ip_mc_list *im;
+ struct ip_mc_list __rcu **mc_hash;
+ struct ip_sf_list *psf;
+ int rv = 0;
+
+ mc_hash = rcu_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
+
+ for (im = rcu_dereference(mc_hash[hash]);
+ im != NULL;
+ im = rcu_dereference(im->next_hash)) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ } else {
+ for_each_pmc_rcu(in_dev, im) {
+ if (im->multiaddr == mc_addr)
+ break;
+ }
+ }
+ if (im && proto == IPPROTO_IGMP) {
+ rv = 1;
+ } else if (im) {
+ if (src_addr) {
+ for (psf = im->sources; psf; psf = psf->sf_next) {
+ if (psf->sf_inaddr == src_addr)
+ break;
+ }
+ if (psf)
+ rv = psf->sf_count[MCAST_INCLUDE] ||
+ psf->sf_count[MCAST_EXCLUDE] !=
+ im->sfcount[MCAST_EXCLUDE];
+ else
+ rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ } else
+ rv = 1; /* unspecified source; tentatively allow */
+ }
+ return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct in_device *in_dev;
+};
+
+#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_mc_list *im = NULL;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ state->in_dev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rcu(state->dev);
+ if (!in_dev)
+ continue;
+ im = rcu_dereference(in_dev->mc_list);
+ if (im) {
+ state->in_dev = in_dev;
+ break;
+ }
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ im = rcu_dereference(im->next_rcu);
+ while (!im) {
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->in_dev = NULL;
+ break;
+ }
+ state->in_dev = __in_dev_get_rcu(state->dev);
+ if (!state->in_dev)
+ continue;
+ im = rcu_dereference(state->in_dev->mc_list);
+ }
+ return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_mc_list *im = igmp_mc_get_first(seq);
+ if (im)
+ while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_mc_list *im;
+ if (v == SEQ_START_TOKEN)
+ im = igmp_mc_get_first(seq);
+ else
+ im = igmp_mc_get_next(seq, v);
+ ++*pos;
+ return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+ state->in_dev = NULL;
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
+ else {
+ struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+ char *querier;
+ long delta;
+
+#ifdef CONFIG_IP_MULTICAST
+ querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+ IGMP_V2_SEEN(state->in_dev) ? "V2" :
+ "V3";
+#else
+ querier = "NONE";
+#endif
+
+ if (rcu_access_pointer(state->in_dev->mc_list) == im) {
+ seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+ state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
+ }
+
+ delta = im->timer.expires - jiffies;
+ seq_printf(seq,
+ "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
+ im->multiaddr, im->users,
+ im->tm_running,
+ im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
+ im->reporter);
+ }
+ return 0;
+}
+
+static const struct seq_operations igmp_mc_seq_ops = {
+ .start = igmp_mc_seq_start,
+ .next = igmp_mc_seq_next,
+ .stop = igmp_mc_seq_stop,
+ .show = igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp_mc_seq_ops,
+ sizeof(struct igmp_mc_iter_state));
+}
+
+static const struct file_operations igmp_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct igmp_mcf_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct in_device *idev;
+ struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_sf_list *psf = NULL;
+ struct ip_mc_list *im = NULL;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ state->idev = NULL;
+ state->im = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct in_device *idev;
+ idev = __in_dev_get_rcu(state->dev);
+ if (unlikely(!idev))
+ continue;
+ im = rcu_dereference(idev->mc_list);
+ if (likely(im)) {
+ spin_lock_bh(&im->lock);
+ psf = im->sources;
+ if (likely(psf)) {
+ state->im = im;
+ state->idev = idev;
+ break;
+ }
+ spin_unlock_bh(&im->lock);
+ }
+ }
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ psf = psf->sf_next;
+ while (!psf) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = state->im->next;
+ while (!state->im) {
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->idev = NULL;
+ goto out;
+ }
+ state->idev = __in_dev_get_rcu(state->dev);
+ if (!state->idev)
+ continue;
+ state->im = rcu_dereference(state->idev->mc_list);
+ }
+ if (!state->im)
+ break;
+ spin_lock_bh(&state->im->lock);
+ psf = state->im->sources;
+ }
+out:
+ return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+ if (psf)
+ while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+ --pos;
+ return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_sf_list *psf;
+ if (v == SEQ_START_TOKEN)
+ psf = igmp_mcf_get_first(seq);
+ else
+ psf = igmp_mcf_get_next(seq, v);
+ ++*pos;
+ return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+ if (likely(state->im)) {
+ spin_unlock_bh(&state->im->lock);
+ state->im = NULL;
+ }
+ state->idev = NULL;
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
+ } else {
+ seq_printf(seq,
+ "%3d %6.6s 0x%08x "
+ "0x%08x %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
+ ntohl(state->im->multiaddr),
+ ntohl(psf->sf_inaddr),
+ psf->sf_count[MCAST_INCLUDE],
+ psf->sf_count[MCAST_EXCLUDE]);
+ }
+ return 0;
+}
+
+static const struct seq_operations igmp_mcf_seq_ops = {
+ .start = igmp_mcf_seq_start,
+ .next = igmp_mcf_seq_next,
+ .stop = igmp_mcf_seq_stop,
+ .show = igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp_mcf_seq_ops,
+ sizeof(struct igmp_mcf_iter_state));
+}
+
+static const struct file_operations igmp_mcf_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp_mcf_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init igmp_net_init(struct net *net)
+{
+ struct proc_dir_entry *pde;
+ int err;
+
+ pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
+ if (!pde)
+ goto out_igmp;
+ pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+ &igmp_mcf_seq_fops);
+ if (!pde)
+ goto out_mcfilter;
+ err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+ SOCK_DGRAM, 0, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+ err);
+ goto out_sock;
+ }
+
+ return 0;
+
+out_sock:
+ remove_proc_entry("mcfilter", net->proc_net);
+out_mcfilter:
+ remove_proc_entry("igmp", net->proc_net);
+out_igmp:
+ return -ENOMEM;
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
+{
+ remove_proc_entry("mcfilter", net->proc_net);
+ remove_proc_entry("igmp", net->proc_net);
+ inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
+}
+
+static struct pernet_operations igmp_net_ops = {
+ .init = igmp_net_init,
+ .exit = igmp_net_exit,
+};
+#endif
+
+static int igmp_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct in_device *in_dev;
+
+ switch (event) {
+ case NETDEV_RESEND_IGMP:
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev)
+ ip_mc_rejoin_groups(in_dev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block igmp_notifier = {
+ .notifier_call = igmp_netdev_event,
+};
+
+int __init igmp_mc_init(void)
+{
+#if defined(CONFIG_PROC_FS)
+ int err;
+
+ err = register_pernet_subsys(&igmp_net_ops);
+ if (err)
+ return err;
+ err = register_netdevice_notifier(&igmp_notifier);
+ if (err)
+ goto reg_notif_fail;
+ return 0;
+
+reg_notif_fail:
+ unregister_pernet_subsys(&igmp_net_ops);
+ return err;
+#else
+ return register_netdevice_notifier(&igmp_notifier);
+#endif
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 000000000..8976ca423
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,978 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET connection oriented protocols.
+ *
+ * Authors: See the TCP sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#include <net/tcp.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+void inet_get_local_port_range(struct net *net, int *low, int *high)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+
+ *low = net->ipv4.ip_local_ports.range[0];
+ *high = net->ipv4.ip_local_ports.range[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+}
+EXPORT_SYMBOL(inet_get_local_port_range);
+
+int inet_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb, bool relax)
+{
+ struct sock *sk2;
+ int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
+
+ /*
+ * Unlike other sk lookup places we do not check
+ * for sk_net here, since _all_ the socks listed
+ * in tb->owners list belong to the same net - the
+ * one this bucket belongs to.
+ */
+
+ sk_for_each_bound(sk2, &tb->owners) {
+ if (sk != sk2 &&
+ !inet_v6_ipv6only(sk2) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2))))) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+
+ if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
+ sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
+ break;
+ }
+ }
+ }
+ return sk2 != NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret, attempts = 5;
+ struct net *net = sock_net(sk);
+ int smallest_size = -1, smallest_rover;
+ kuid_t uid = sock_i_uid(sk);
+
+ local_bh_disable();
+ if (!snum) {
+ int remaining, rover, low, high;
+
+again:
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ smallest_rover = rover = prandom_u32() % remaining + low;
+
+ smallest_size = -1;
+ do {
+ if (inet_is_local_reserved_port(net, rover))
+ goto next_nolock;
+ head = &hashinfo->bhash[inet_bhashfn(net, rover,
+ hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == rover) {
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ uid_eq(tb->fastuid, uid))) &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_rover = rover;
+ if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
+ !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = smallest_rover;
+ goto tb_found;
+ }
+ }
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
+ snum = rover;
+ goto tb_found;
+ }
+ goto next;
+ }
+ break;
+ next:
+ spin_unlock(&head->lock);
+ next_nolock:
+ if (++rover > high)
+ rover = low;
+ } while (--remaining > 0);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
+ ret = 1;
+ if (remaining <= 0) {
+ if (smallest_size != -1) {
+ snum = smallest_rover;
+ goto have_snum;
+ }
+ goto fail;
+ }
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold it's mutex.
+ */
+ snum = rover;
+ } else {
+have_snum:
+ head = &hashinfo->bhash[inet_bhashfn(net, snum,
+ hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE)
+ goto success;
+
+ if (((tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size == -1) {
+ goto success;
+ } else {
+ ret = 1;
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock(&head->lock);
+ goto again;
+ }
+
+ goto fail_unlock;
+ }
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+ net, head, snum)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ if (sk->sk_reuseport) {
+ tb->fastreuseport = 1;
+ tb->fastuid = uid;
+ } else
+ tb->fastreuseport = 0;
+ } else {
+ if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+ if (tb->fastreuseport &&
+ (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ tb->fastreuseport = 0;
+ }
+success:
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+ timeo = schedule_timeout(timeo);
+ sched_annotate_sleep();
+ lock_sock(sk);
+ err = 0;
+ if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *req;
+ struct sock *newsk;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out_err;
+
+ /* Find already established connection */
+ if (reqsk_queue_empty(queue)) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* If this is a non blocking socket don't sleep */
+ error = -EAGAIN;
+ if (!timeo)
+ goto out_err;
+
+ error = inet_csk_wait_for_connect(sk, timeo);
+ if (error)
+ goto out_err;
+ }
+ req = reqsk_queue_remove(queue);
+ newsk = req->sk;
+
+ sk_acceptq_removed(sk);
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_rsk(req)->tfo_listener &&
+ queue->fastopenq) {
+ spin_lock_bh(&queue->fastopenq->lock);
+ if (tcp_rsk(req)->tfo_listener) {
+ /* We are still waiting for the final ACK from 3WHS
+ * so can't free req now. Instead, we set req->sk to
+ * NULL to signify that the child socket is taken
+ * so reqsk_fastopen_remove() will free the req
+ * when 3WHS finishes (or is aborted).
+ */
+ req->sk = NULL;
+ req = NULL;
+ }
+ spin_unlock_bh(&queue->fastopenq->lock);
+ }
+out:
+ release_sock(sk);
+ if (req)
+ reqsk_put(req);
+ return newsk;
+out_err:
+ newsk = NULL;
+ req = NULL;
+ *err = error;
+ goto out;
+}
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+ void (*retransmit_handler)(unsigned long),
+ void (*delack_handler)(unsigned long),
+ void (*keepalive_handler)(unsigned long))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
+ (unsigned long)sk);
+ setup_timer(&icsk->icsk_delack_timer, delack_handler,
+ (unsigned long)sk);
+ setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+ icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry *inet_csk_route_req(struct sock *sk,
+ struct flowi4 *fl4,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct ip_options_rcu *opt = ireq->opt;
+ struct rtable *rt;
+
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+ struct sock *newsk,
+ const struct request_sock *req)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct net *net = read_pnet(&ireq->ireq_net);
+ struct inet_sock *newinet = inet_sk(newsk);
+ struct ip_options_rcu *opt;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ fl4 = &newinet->cork.fl.u.ip4;
+
+ rcu_read_lock();
+ opt = rcu_dereference(newinet->inet_opt);
+ flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ sk->sk_protocol, inet_sk_flowi_flags(sk),
+ (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, ireq->ir_rmt_port,
+ htons(ireq->ir_num));
+ security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto route_err;
+ rcu_read_unlock();
+ return &rt->dst;
+
+route_err:
+ ip_rt_put(rt);
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
+ const u32 rnd, const u32 synq_hsize)
+{
+ return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) true
+#endif
+
+/* Note: this is temporary :
+ * req sock will no longer be in listener hash table
+*/
+struct request_sock *inet_csk_search_req(struct sock *sk,
+ const __be16 rport,
+ const __be32 raddr,
+ const __be32 laddr)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req;
+ u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries);
+
+ spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+ for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (ireq->ir_rmt_port == rport &&
+ ireq->ir_rmt_addr == raddr &&
+ ireq->ir_loc_addr == laddr &&
+ AF_INET_FAMILY(req->rsk_ops->family)) {
+ atomic_inc(&req->rsk_refcnt);
+ WARN_ON(req->sk);
+ break;
+ }
+ }
+ spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return req;
+}
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+
+/* Decide when to expire the request and when to resend SYN-ACK */
+static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
+ const int max_retries,
+ const u8 rskq_defer_accept,
+ int *expire, int *resend)
+{
+ if (!rskq_defer_accept) {
+ *expire = req->num_timeout >= thresh;
+ *resend = 1;
+ return;
+ }
+ *expire = req->num_timeout >= thresh &&
+ (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
+ /*
+ * Do not resend while waiting for data after ACK,
+ * start to resend on end of deferring period to give
+ * last chance for data or ACK to create established socket.
+ */
+ *resend = !inet_rsk(req)->acked ||
+ req->num_timeout >= rskq_defer_accept - 1;
+}
+
+int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
+{
+ int err = req->rsk_ops->rtx_syn_ack(parent, req);
+
+ if (!err)
+ req->num_retrans++;
+ return err;
+}
+EXPORT_SYMBOL(inet_rtx_syn_ack);
+
+/* return true if req was found in the syn_table[] */
+static bool reqsk_queue_unlink(struct request_sock_queue *queue,
+ struct request_sock *req)
+{
+ struct listen_sock *lopt = queue->listen_opt;
+ struct request_sock **prev;
+ bool found = false;
+
+ spin_lock(&queue->syn_wait_lock);
+
+ for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
+ prev = &(*prev)->dl_next) {
+ if (*prev == req) {
+ *prev = req->dl_next;
+ found = true;
+ break;
+ }
+ }
+
+ spin_unlock(&queue->syn_wait_lock);
+ if (del_timer(&req->rsk_timer))
+ reqsk_put(req);
+ return found;
+}
+
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ reqsk_put(req);
+ }
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
+static void reqsk_timer_handler(unsigned long data)
+{
+ struct request_sock *req = (struct request_sock *)data;
+ struct sock *sk_listener = req->rsk_listener;
+ struct inet_connection_sock *icsk = inet_csk(sk_listener);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct listen_sock *lopt = queue->listen_opt;
+ int qlen, expire = 0, resend = 0;
+ int max_retries, thresh;
+ u8 defer_accept;
+
+ if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
+ reqsk_put(req);
+ return;
+ }
+
+ max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ thresh = max_retries;
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 1 second, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ qlen = listen_sock_qlen(lopt);
+ if (qlen >> (lopt->max_qlen_log - 1)) {
+ int young = listen_sock_young(lopt) << 1;
+
+ while (thresh > 2) {
+ if (qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+ defer_accept = READ_ONCE(queue->rskq_defer_accept);
+ if (defer_accept)
+ max_retries = defer_accept;
+ syn_ack_recalc(req, thresh, max_retries, defer_accept,
+ &expire, &resend);
+ req->rsk_ops->syn_ack_timeout(req);
+ if (!expire &&
+ (!resend ||
+ !inet_rtx_syn_ack(sk_listener, req) ||
+ inet_rsk(req)->acked)) {
+ unsigned long timeo;
+
+ if (req->num_timeout++ == 0)
+ atomic_inc(&lopt->young_dec);
+ timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+ return;
+ }
+ inet_csk_reqsk_queue_drop(sk_listener, req);
+ reqsk_put(req);
+}
+
+void reqsk_queue_hash_req(struct request_sock_queue *queue,
+ u32 hash, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct listen_sock *lopt = queue->listen_opt;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+ smp_wmb();
+ atomic_set(&req->rsk_refcnt, 2);
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ req->rsk_hash = hash;
+
+ spin_lock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
+ lopt->syn_table[hash] = req;
+ spin_unlock(&queue->syn_wait_lock);
+
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+}
+EXPORT_SYMBOL(reqsk_queue_hash_req);
+
+/**
+ * inet_csk_clone_lock - clone an inet socket, and lock its clone
+ * @sk: the socket to clone
+ * @req: request_sock
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_clone_lock(const struct sock *sk,
+ const struct request_sock *req,
+ const gfp_t priority)
+{
+ struct sock *newsk = sk_clone_lock(sk, priority);
+
+ if (newsk) {
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+ newsk->sk_state = TCP_SYN_RECV;
+ newicsk->icsk_bind_hash = NULL;
+
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
+ inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
+ inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
+ newsk->sk_write_space = sk_stream_write_space;
+
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+
+ security_inet_csk_clone(newsk, req);
+ }
+ return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+ WARN_ON(sk->sk_state != TCP_CLOSE);
+ WARN_ON(!sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ WARN_ON(!sk_unhashed(sk));
+
+ /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ percpu_counter_dec(sk->sk_prot->orphan_count);
+ sock_put(sk);
+}
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+/* This function allows to force a closure of a socket after the call to
+ * tcp/dccp_create_openreq_child().
+ */
+void inet_csk_prepare_forced_close(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ /* sk_clone_lock locked the socket and set refcnt to 2 */
+ bh_unlock_sock(sk);
+ sock_put(sk);
+
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_sk(sk)->inet_num = 0;
+}
+EXPORT_SYMBOL(inet_csk_prepare_forced_close);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+ if (rc != 0)
+ return rc;
+
+ sk->sk_max_ack_backlog = 0;
+ sk->sk_ack_backlog = 0;
+ inet_csk_delack_init(sk);
+
+ /* There is race window here: we announce ourselves listening,
+ * but this transition is still not validated by get_port().
+ * It is OK, because this socket enters to hash table only
+ * after validation is complete.
+ */
+ sk->sk_state = TCP_LISTEN;
+ if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ inet->inet_sport = htons(inet->inet_num);
+
+ sk_dst_reset(sk);
+ sk->sk_prot->hash(sk);
+
+ return 0;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ __reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ return -EADDRINUSE;
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct request_sock *acc_req;
+ struct request_sock *req;
+
+ /* make all the listen_opt local to us */
+ acc_req = reqsk_queue_yank_acceptq(queue);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ reqsk_queue_destroy(queue);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ WARN_ON(sock_owned_by_user(child));
+ sock_hold(child);
+
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ reqsk_put(req);
+ }
+ if (queue->fastopenq) {
+ /* Free all the reqs queued in rskq_rst_head. */
+ spin_lock_bh(&queue->fastopenq->lock);
+ acc_req = queue->fastopenq->rskq_rst_head;
+ queue->fastopenq->rskq_rst_head = NULL;
+ spin_unlock_bh(&queue->fastopenq->lock);
+ while ((req = acc_req) != NULL) {
+ acc_req = req->dl_next;
+ reqsk_put(req);
+ }
+ }
+ WARN_ON(sk->sk_ack_backlog);
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ const struct inet_sock *inet = inet_sk(sk);
+
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ sin->sin_port = inet->inet_dport;
+}
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
+
+#ifdef CONFIG_COMPAT
+int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_af_ops->compat_getsockopt)
+ return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
+
+int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_af_ops->compat_setsockopt)
+ return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
+#endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ fl4 = &fl->u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+ inet->inet_saddr, inet->inet_dport,
+ inet->inet_sport, sk->sk_protocol,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ rt = NULL;
+ if (rt)
+ sk_setup_caps(sk, &rt->dst);
+ rcu_read_unlock();
+
+ return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!dst) {
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+ if (!dst)
+ goto out;
+ }
+ dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+ dst = __sk_dst_check(sk, 0);
+ if (!dst)
+ dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+ return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 000000000..4d32262c7
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,1165 @@
+/*
+ * inet_diag.c Module for monitoring INET transport protocols sockets.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netlink.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+ const __be32 *saddr;
+ const __be32 *daddr;
+ u16 sport;
+ u16 dport;
+ u16 family;
+ u16 userlocks;
+};
+
+static DEFINE_MUTEX(inet_diag_table_mutex);
+
+static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
+{
+ if (!inet_diag_table[proto])
+ request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET, proto);
+
+ mutex_lock(&inet_diag_table_mutex);
+ if (!inet_diag_table[proto])
+ return ERR_PTR(-ENOENT);
+
+ return inet_diag_table[proto];
+}
+
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
+{
+ mutex_unlock(&inet_diag_table_mutex);
+}
+
+static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+ r->idiag_family = sk->sk_family;
+
+ r->id.idiag_sport = htons(sk->sk_num);
+ r->id.idiag_dport = sk->sk_dport;
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = sk->sk_daddr;
+ }
+}
+
+static size_t inet_sk_attr_size(void)
+{
+ return nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ + nla_total_size(1) /* INET_DIAG_TOS */
+ + nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
+ struct sk_buff *skb, const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct tcp_congestion_ops *ca_ops;
+ const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ struct nlattr *attr;
+ void *info = NULL;
+
+ handler = inet_diag_table[req->sdiag_protocol];
+ BUG_ON(!handler);
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ BUG_ON(!sk_fullsock(sk));
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (r->idiag_family == AF_INET6) {
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
+ }
+#endif
+
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
+ struct inet_diag_meminfo minfo = {
+ .idiag_rmem = sk_rmem_alloc_get(sk),
+ .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_tmem = sk_wmem_alloc_get(sk),
+ };
+
+ if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
+ goto errout;
+ }
+
+ if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
+ if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
+ goto errout;
+
+ if (!icsk) {
+ handler->idiag_get_info(sk, r, NULL);
+ goto out;
+ }
+
+#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ if (ext & (1 << (INET_DIAG_INFO - 1))) {
+ attr = nla_reserve(skb, INET_DIAG_INFO,
+ sizeof(struct tcp_info));
+ if (!attr)
+ goto errout;
+
+ info = nla_data(attr);
+ }
+
+ if (ext & (1 << (INET_DIAG_CONG - 1))) {
+ int err = 0;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops)
+ err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+ rcu_read_unlock();
+ if (err < 0)
+ goto errout;
+ }
+
+ handler->idiag_get_info(sk, r, info);
+
+ if (sk->sk_state < TCP_TIME_WAIT) {
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ rcu_read_lock();
+ ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ext, &attr, &info);
+ rcu_read_unlock();
+ if (sz && nla_put(skb, attr, sz, &info) < 0)
+ goto errout;
+ }
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
+
+static int inet_csk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
+ user_ns, portid, seq, nlmsg_flags, unlh);
+}
+
+static int inet_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+
+ tmo = tw->tw_timer.expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = tw->tw_substate;
+ r->idiag_timer = 3;
+ r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+ r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ const struct inet_diag_req_v2 *r,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return inet_req_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
+ return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
+ nlmsg_flags, unlh);
+}
+
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ err = -EINVAL;
+ if (req->sdiag_family == AF_INET)
+ sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = inet6_lookup(net, hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ else
+ goto out_nosk;
+
+ err = -ENOENT;
+ if (!sk)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+out_nosk:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
+
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ const struct inet_diag_handler *handler;
+ int err;
+
+ handler = inet_diag_lock_handler(req->sdiag_protocol);
+ if (IS_ERR(handler))
+ err = PTR_ERR(handler);
+ else
+ err = handler->dump_one(in_skb, nlh, req);
+ inet_diag_unlock_handler(handler);
+
+ return err;
+}
+
+static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
+{
+ int words = bits >> 5;
+
+ bits &= 0x1f;
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2))
+ return 0;
+ }
+ if (bits) {
+ __be32 w1, w2;
+ __be32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits));
+
+ if ((w1 ^ w2) & mask)
+ return 0;
+ }
+
+ return 1;
+}
+
+static int inet_diag_bc_run(const struct nlattr *_bc,
+ const struct inet_diag_entry *entry)
+{
+ const void *bc = nla_data(_bc);
+ int len = nla_len(_bc);
+
+ while (len > 0) {
+ int yes = 1;
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_NOP:
+ break;
+ case INET_DIAG_BC_JMP:
+ yes = 0;
+ break;
+ case INET_DIAG_BC_S_GE:
+ yes = entry->sport >= op[1].no;
+ break;
+ case INET_DIAG_BC_S_LE:
+ yes = entry->sport <= op[1].no;
+ break;
+ case INET_DIAG_BC_D_GE:
+ yes = entry->dport >= op[1].no;
+ break;
+ case INET_DIAG_BC_D_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_AUTO:
+ yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+ break;
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND: {
+ const struct inet_diag_hostcond *cond;
+ const __be32 *addr;
+
+ cond = (const struct inet_diag_hostcond *)(op + 1);
+ if (cond->port != -1 &&
+ cond->port != (op->code == INET_DIAG_BC_S_COND ?
+ entry->sport : entry->dport)) {
+ yes = 0;
+ break;
+ }
+
+ if (op->code == INET_DIAG_BC_S_COND)
+ addr = entry->saddr;
+ else
+ addr = entry->daddr;
+
+ if (cond->family != AF_UNSPEC &&
+ cond->family != entry->family) {
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3,
+ cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
+ if (bitstring_match(addr, cond->addr,
+ cond->prefix_len))
+ break;
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return len == 0;
+}
+
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry->daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry->saddr = &sk->sk_rcv_saddr;
+ entry->daddr = &sk->sk_daddr;
+ }
+}
+
+int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
+
+ if (!bc)
+ return 1;
+
+ entry.family = sk->sk_family;
+ entry_fill_addrs(&entry, sk);
+ entry.sport = inet->inet_num;
+ entry.dport = ntohs(inet->inet_dport);
+ entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+
+ return inet_diag_bc_run(bc, &entry);
+}
+EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+ while (len >= 0) {
+ const struct inet_diag_bc_op *op = bc;
+
+ if (cc > len)
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4 || op->yes & 3)
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+
+/* Validate an inet_diag_hostcond. */
+static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ struct inet_diag_hostcond *cond;
+ int addr_len;
+
+ /* Check hostcond space. */
+ *min_len += sizeof(struct inet_diag_hostcond);
+ if (len < *min_len)
+ return false;
+ cond = (struct inet_diag_hostcond *)(op + 1);
+
+ /* Check address family and address length. */
+ switch (cond->family) {
+ case AF_UNSPEC:
+ addr_len = 0;
+ break;
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+ default:
+ return false;
+ }
+ *min_len += addr_len;
+ if (len < *min_len)
+ return false;
+
+ /* Check prefix length (in bits) vs address length (in bytes). */
+ if (cond->prefix_len > 8 * addr_len)
+ return false;
+
+ return true;
+}
+
+/* Validate a port comparison operator. */
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
+{
+ /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
+ *min_len += sizeof(struct inet_diag_bc_op);
+ if (len < *min_len)
+ return false;
+ return true;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const void *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ int min_len = sizeof(struct inet_diag_bc_op);
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND:
+ if (!valid_hostcond(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_S_GE:
+ case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_GE:
+ case INET_DIAG_BC_D_LE:
+ if (!valid_port_comparison(bc, len, &min_len))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_JMP:
+ case INET_DIAG_BC_NOP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (op->code != INET_DIAG_BC_NOP) {
+ if (op->no < min_len || op->no > len + 4 || op->no & 3)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ }
+
+ if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+ return -EINVAL;
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_csk_diag_dump(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_csk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ const struct nlattr *bc)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
+ int j, s_j, reqnum, s_reqnum;
+ struct listen_sock *lopt;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--;
+
+ entry.family = sk->sk_family;
+
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ if (!lopt || !listen_sock_qlen(lopt))
+ goto out;
+
+ if (bc) {
+ entry.sport = inet->inet_num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < lopt->nr_table_entries; j++) {
+ struct request_sock *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.idiag_dport != ireq->ir_rmt_port &&
+ r->id.idiag_dport)
+ continue;
+
+ if (bc) {
+ /* Note: entry.sport and entry.userlocks are already set */
+ entry_fill_addrs(&entry, req_to_sk(req));
+ entry.dport = ntohs(ireq->ir_rmt_port);
+
+ if (!inet_diag_bc_run(bc, &entry))
+ continue;
+ }
+
+ err = inet_req_diag_fill(req_to_sk(req), skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb->nlh);
+ if (err < 0) {
+ cb->args[3] = j + 1;
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0;
+ }
+
+out:
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return err;
+}
+
+void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ struct net *net = sock_net(skb->sk);
+ int i, num, s_i, s_num;
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+
+ for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ num = 0;
+ ilb = &hashinfo->listening_hash[i];
+ spin_lock_bh(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!(r->idiag_states & TCPF_LISTEN) ||
+ r->id.idiag_dport ||
+ cb->args[3] > 0)
+ goto syn_recv;
+
+ if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->idiag_states & TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
+ spin_unlock_bh(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+ spin_unlock_bh(&ilb->lock);
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ num = 0;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state, res;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_substate : sk->sk_state;
+ if (!(r->idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_normal;
+
+ res = sk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh);
+ if (res < 0) {
+ spin_unlock_bh(lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ spin_unlock_bh(lock);
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
+
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
+{
+ const struct inet_diag_handler *handler;
+ int err = 0;
+
+ handler = inet_diag_lock_handler(r->sdiag_protocol);
+ if (!IS_ERR(handler))
+ handler->dump(skb, cb, r, bc);
+ else
+ err = PTR_ERR(handler);
+ inet_diag_unlock_handler(handler);
+
+ return err ? : skb->len;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct nlattr *bc = NULL;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+}
+
+static int inet_diag_type2proto(int type)
+{
+ switch (type) {
+ case TCPDIAG_GETSOCK:
+ return IPPROTO_TCP;
+ case DCCPDIAG_GETSOCK:
+ return IPPROTO_DCCP;
+ default:
+ return 0;
+ }
+}
+
+static int inet_diag_dump_compat(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct inet_diag_req_v2 req;
+ struct nlattr *bc = NULL;
+
+ req.sdiag_family = AF_UNSPEC; /* compatibility */
+ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ if (nlmsg_attrlen(cb->nlh, hdrlen))
+ bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+
+ return __inet_diag_dump(skb, cb, &req, bc);
+}
+
+static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh)
+{
+ struct inet_diag_req *rc = nlmsg_data(nlh);
+ struct inet_diag_req_v2 req;
+
+ req.sdiag_family = rc->idiag_family;
+ req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+ req.idiag_ext = rc->idiag_ext;
+ req.idiag_states = rc->idiag_states;
+ req.id = rc->id;
+
+ return inet_diag_get_exact(in_skb, nlh, &req);
+}
+
+static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int hdrlen = sizeof(struct inet_diag_req);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
+ nlmsg_len(nlh) < hdrlen)
+ return -EINVAL;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(nlh, hdrlen)) {
+ struct nlattr *attr;
+
+ attr = nlmsg_find_attr(nlh, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (!attr ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
+ }
+ }
+
+ return inet_diag_get_exact_compat(skb, nlh);
+}
+
+static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (nlmsg_attrlen(h, hdrlen)) {
+ struct nlattr *attr;
+
+ attr = nlmsg_find_attr(h, hdrlen,
+ INET_DIAG_REQ_BYTECODE);
+ if (!attr ||
+ nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
+ inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+ return -EINVAL;
+ }
+ {
+ struct netlink_dump_control c = {
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+
+ return inet_diag_get_exact(skb, h, nlmsg_data(h));
+}
+
+static const struct sock_diag_handler inet_diag_handler = {
+ .family = AF_INET,
+ .dump = inet_diag_handler_dump,
+};
+
+static const struct sock_diag_handler inet6_diag_handler = {
+ .family = AF_INET6,
+ .dump = inet_diag_handler_dump,
+};
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+ int err = -EINVAL;
+
+ if (type >= IPPROTO_MAX)
+ goto out;
+
+ mutex_lock(&inet_diag_table_mutex);
+ err = -EEXIST;
+ if (!inet_diag_table[type]) {
+ inet_diag_table[type] = h;
+ err = 0;
+ }
+ mutex_unlock(&inet_diag_table_mutex);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+
+ if (type >= IPPROTO_MAX)
+ return;
+
+ mutex_lock(&inet_diag_table_mutex);
+ inet_diag_table[type] = NULL;
+ mutex_unlock(&inet_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+ const int inet_diag_table_size = (IPPROTO_MAX *
+ sizeof(struct inet_diag_handler *));
+ int err = -ENOMEM;
+
+ inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
+ if (!inet_diag_table)
+ goto out;
+
+ err = sock_diag_register(&inet_diag_handler);
+ if (err)
+ goto out_free_nl;
+
+ err = sock_diag_register(&inet6_diag_handler);
+ if (err)
+ goto out_free_inet;
+
+ sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+out:
+ return err;
+
+out_free_inet:
+ sock_diag_unregister(&inet_diag_handler);
+out_free_nl:
+ kfree(inet_diag_table);
+ goto out;
+}
+
+static void __exit inet_diag_exit(void)
+{
+ sock_diag_unregister(&inet6_diag_handler);
+ sock_diag_unregister(&inet_diag_handler);
+ sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+ kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
new file mode 100644
index 000000000..5e346a082
--- /dev/null
+++ b/net/ipv4/inet_fragment.c
@@ -0,0 +1,463 @@
+/*
+ * inet fragments management
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Pavel Emelyanov <xemul@openvz.org>
+ * Started as consolidation of ipv4/ip_fragment.c,
+ * ipv6/reassembly. and ipv6 nf conntrack reassembly
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+#define INETFRAGS_EVICT_BUCKETS 128
+#define INETFRAGS_EVICT_MAX 512
+
+/* don't rebuild inetfrag table with new secret more often than this */
+#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
+ */
+const u8 ip_frag_ecn_table[16] = {
+ /* at least one fragment had CE, and others ECT_0 or ECT_1 */
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
+
+ /* invalid combinations : drop frame */
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+EXPORT_SYMBOL(ip_frag_ecn_table);
+
+static unsigned int
+inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+{
+ return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+}
+
+static bool inet_frag_may_rebuild(struct inet_frags *f)
+{
+ return time_after(jiffies,
+ f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+}
+
+static void inet_frag_secret_rebuild(struct inet_frags *f)
+{
+ int i;
+
+ write_seqlock_bh(&f->rnd_seqlock);
+
+ if (!inet_frag_may_rebuild(f))
+ goto out;
+
+ get_random_bytes(&f->rnd, sizeof(u32));
+
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *q;
+ struct hlist_node *n;
+
+ hb = &f->hash[i];
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
+ unsigned int hval = inet_frag_hashfn(f, q);
+
+ if (hval != i) {
+ struct inet_frag_bucket *hb_dest;
+
+ hlist_del(&q->list);
+
+ /* Relink to new hash chain. */
+ hb_dest = &f->hash[hval];
+
+ /* This is the only place where we take
+ * another chain_lock while already holding
+ * one. As this will not run concurrently,
+ * we cannot deadlock on hb_dest lock below, if its
+ * already locked it will be released soon since
+ * other caller cannot be waiting for hb lock
+ * that we've taken above.
+ */
+ spin_lock_nested(&hb_dest->chain_lock,
+ SINGLE_DEPTH_NESTING);
+ hlist_add_head(&q->list, &hb_dest->chain);
+ spin_unlock(&hb_dest->chain_lock);
+ }
+ }
+ spin_unlock(&hb->chain_lock);
+ }
+
+ f->rebuild = false;
+ f->last_rebuild_jiffies = jiffies;
+out:
+ write_sequnlock_bh(&f->rnd_seqlock);
+}
+
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+ return q->net->low_thresh == 0 ||
+ frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+ struct inet_frag_queue *fq;
+ struct hlist_node *n;
+ unsigned int evicted = 0;
+ HLIST_HEAD(expired);
+
+evict_again:
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+ if (!inet_fragq_should_evict(fq))
+ continue;
+
+ if (!del_timer(&fq->timer)) {
+ /* q expiring right now thus increment its refcount so
+ * it won't be freed under us and wait until the timer
+ * has finished executing then destroy it
+ */
+ atomic_inc(&fq->refcnt);
+ spin_unlock(&hb->chain_lock);
+ del_timer_sync(&fq->timer);
+ inet_frag_put(fq, f);
+ goto evict_again;
+ }
+
+ fq->flags |= INET_FRAG_EVICTED;
+ hlist_del(&fq->list);
+ hlist_add_head(&fq->list, &expired);
+ ++evicted;
+ }
+
+ spin_unlock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &expired, list)
+ f->frag_expire((unsigned long) fq);
+
+ return evicted;
+}
+
+static void inet_frag_worker(struct work_struct *work)
+{
+ unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+ unsigned int i, evicted = 0;
+ struct inet_frags *f;
+
+ f = container_of(work, struct inet_frags, frags_work);
+
+ BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+ local_bh_disable();
+
+ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ evicted += inet_evict_bucket(f, &f->hash[i]);
+ i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+ if (evicted > INETFRAGS_EVICT_MAX)
+ break;
+ }
+
+ f->next_bucket = i;
+
+ local_bh_enable();
+
+ if (f->rebuild && inet_frag_may_rebuild(f))
+ inet_frag_secret_rebuild(f);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+ if (unlikely(!work_pending(&f->frags_work)))
+ schedule_work(&f->frags_work);
+}
+
+int inet_frags_init(struct inet_frags *f)
+{
+ int i;
+
+ INIT_WORK(&f->frags_work, inet_frag_worker);
+
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb = &f->hash[i];
+
+ spin_lock_init(&hb->chain_lock);
+ INIT_HLIST_HEAD(&hb->chain);
+ }
+
+ seqlock_init(&f->rnd_seqlock);
+ f->last_rebuild_jiffies = 0;
+ f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+ NULL);
+ if (!f->frags_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(inet_frags_init);
+
+void inet_frags_init_net(struct netns_frags *nf)
+{
+ init_frag_mem_limit(nf);
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
+void inet_frags_fini(struct inet_frags *f)
+{
+ cancel_work_sync(&f->frags_work);
+ kmem_cache_destroy(f->frags_cachep);
+}
+EXPORT_SYMBOL(inet_frags_fini);
+
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+ unsigned int seq;
+ int i;
+
+ nf->low_thresh = 0;
+ local_bh_disable();
+
+evict_again:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+ inet_evict_bucket(f, &f->hash[i]);
+
+ if (read_seqretry(&f->rnd_seqlock, seq))
+ goto evict_again;
+
+ local_bh_enable();
+
+ percpu_counter_destroy(&nf->mem);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
+
+static struct inet_frag_bucket *
+get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+__acquires(hb->chain_lock)
+{
+ struct inet_frag_bucket *hb;
+ unsigned int seq, hash;
+
+ restart:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ hash = inet_frag_hashfn(f, fq);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ if (read_seqretry(&f->rnd_seqlock, seq)) {
+ spin_unlock(&hb->chain_lock);
+ goto restart;
+ }
+
+ return hb;
+}
+
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ struct inet_frag_bucket *hb;
+
+ hb = get_frag_bucket_locked(fq, f);
+ if (!(fq->flags & INET_FRAG_EVICTED))
+ hlist_del(&fq->list);
+ spin_unlock(&hb->chain_lock);
+}
+
+void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ if (del_timer(&fq->timer))
+ atomic_dec(&fq->refcnt);
+
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
+ fq_unlink(fq, f);
+ atomic_dec(&fq->refcnt);
+ fq->flags |= INET_FRAG_COMPLETE;
+ }
+}
+EXPORT_SYMBOL(inet_frag_kill);
+
+static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
+ struct sk_buff *skb)
+{
+ if (f->skb_free)
+ f->skb_free(skb);
+ kfree_skb(skb);
+}
+
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+{
+ struct sk_buff *fp;
+ struct netns_frags *nf;
+ unsigned int sum, sum_truesize = 0;
+
+ WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
+ WARN_ON(del_timer(&q->timer) != 0);
+
+ /* Release all fragment data. */
+ fp = q->fragments;
+ nf = q->net;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ frag_kfree_skb(nf, f, fp);
+ fp = xp;
+ }
+ sum = sum_truesize + f->qsize;
+ sub_frag_mem_limit(q, sum);
+
+ if (f->destructor)
+ f->destructor(q);
+ kmem_cache_free(f->frags_cachep, q);
+}
+EXPORT_SYMBOL(inet_frag_destroy);
+
+static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+ struct inet_frag_queue *qp_in,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
+ struct inet_frag_queue *qp;
+
+#ifdef CONFIG_SMP
+ /* With SMP race we have to recheck hash table, because
+ * such entry could have been created on other cpu before
+ * we acquired hash bucket lock.
+ */
+ hlist_for_each_entry(qp, &hb->chain, list) {
+ if (qp->net == nf && f->match(qp, arg)) {
+ atomic_inc(&qp->refcnt);
+ spin_unlock(&hb->chain_lock);
+ qp_in->flags |= INET_FRAG_COMPLETE;
+ inet_frag_put(qp_in, f);
+ return qp;
+ }
+ }
+#endif
+ qp = qp_in;
+ if (!mod_timer(&qp->timer, jiffies + nf->timeout))
+ atomic_inc(&qp->refcnt);
+
+ atomic_inc(&qp->refcnt);
+ hlist_add_head(&qp->list, &hb->chain);
+
+ spin_unlock(&hb->chain_lock);
+
+ return qp;
+}
+
+static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_queue *q;
+
+ if (frag_mem_limit(nf) > nf->high_thresh) {
+ inet_frag_schedule_worker(f);
+ return NULL;
+ }
+
+ q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
+ if (!q)
+ return NULL;
+
+ q->net = nf;
+ f->constructor(q, arg);
+ add_frag_mem_limit(q, f->qsize);
+
+ setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
+ spin_lock_init(&q->lock);
+ atomic_set(&q->refcnt, 1);
+
+ return q;
+}
+
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+ struct inet_frags *f,
+ void *arg)
+{
+ struct inet_frag_queue *q;
+
+ q = inet_frag_alloc(nf, f, arg);
+ if (!q)
+ return NULL;
+
+ return inet_frag_intern(nf, q, f, arg);
+}
+
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+ struct inet_frags *f, void *key,
+ unsigned int hash)
+{
+ struct inet_frag_bucket *hb;
+ struct inet_frag_queue *q;
+ int depth = 0;
+
+ if (frag_mem_limit(nf) > nf->low_thresh)
+ inet_frag_schedule_worker(f);
+
+ hash &= (INETFRAGS_HASHSZ - 1);
+ hb = &f->hash[hash];
+
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry(q, &hb->chain, list) {
+ if (q->net == nf && f->match(q, key)) {
+ atomic_inc(&q->refcnt);
+ spin_unlock(&hb->chain_lock);
+ return q;
+ }
+ depth++;
+ }
+ spin_unlock(&hb->chain_lock);
+
+ if (depth <= INETFRAGS_MAXDEPTH)
+ return inet_frag_create(nf, f, key);
+
+ if (inet_frag_may_rebuild(f)) {
+ if (!f->rebuild)
+ f->rebuild = true;
+ inet_frag_schedule_worker(f);
+ }
+
+ return ERR_PTR(-ENOBUFS);
+}
+EXPORT_SYMBOL(inet_frag_find);
+
+void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
+ const char *prefix)
+{
+ static const char msg[] = "inet_frag_find: Fragment hash bucket"
+ " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
+ ". Dropping fragment.\n";
+
+ if (PTR_ERR(q) == -ENOBUFS)
+ net_dbg_ratelimited("%s%s", prefix, msg);
+}
+EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 000000000..c6fb80bd5
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,618 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/secure_seq.h>
+#include <net/ip.h>
+
+static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 inet_ehash_secret __read_mostly;
+
+ net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
+}
+
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+u32 sk_ehashfn(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return inet6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+ return inet_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+}
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ const unsigned short snum)
+{
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+ if (tb) {
+ write_pnet(&tb->ib_net, net);
+ tb->port = snum;
+ tb->fastreuse = 0;
+ tb->fastreuseport = 0;
+ tb->num_owners = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(cachep, tb);
+ }
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+ atomic_inc(&hashinfo->bsockets);
+
+ inet_sk(sk)->inet_num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ tb->num_owners++;
+ inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
+ hashinfo->bhash_size);
+ struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ atomic_dec(&hashinfo->bsockets);
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+ tb->num_owners--;
+ inet_csk(sk)->icsk_bind_hash = NULL;
+ inet_sk(sk)->inet_num = 0;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct sock *sk)
+{
+ local_bh_disable();
+ __inet_put_port(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(inet_put_port);
+
+int __inet_inherit_port(struct sock *sk, struct sock *child)
+{
+ struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+ unsigned short port = inet_sk(child)->inet_num;
+ const int bhash = inet_bhashfn(sock_net(sk), port,
+ table->bhash_size);
+ struct inet_bind_hashbucket *head = &table->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ if (tb->port != port) {
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), sock_net(sk)) &&
+ tb->port == port)
+ break;
+ }
+ if (!tb) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ sock_net(sk), head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ }
+ }
+ inet_bind_hash(child, tb, port);
+ spin_unlock(&head->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ const unsigned short hnum, const __be32 daddr,
+ const int dif)
+{
+ int score = -1;
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
+ !ipv6_only_sock(sk)) {
+ __be32 rcv_saddr = inet->inet_rcv_saddr;
+ score = sk->sk_family == PF_INET ? 2 : 1;
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+ }
+ return score;
+}
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+
+
+struct sock *__inet_lookup_listener(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, __be16 sport,
+ const __be32 daddr, const unsigned short hnum,
+ const int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned int hash = inet_lhashfn(net, hnum);
+ struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
+
+ rcu_read_lock();
+begin:
+ result = NULL;
+ hiscore = 0;
+ sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ score = compute_score(sk, net, hnum, daddr, dif);
+ if (score > hiscore) {
+ result = sk;
+ hiscore = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (reciprocal_scale(phash, matches) == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, hnum, daddr,
+ dif) < hiscore)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* All sockets share common refcount, but have different destructors */
+void sock_gen_put(struct sock *sk)
+{
+ if (!atomic_dec_and_test(&sk->sk_refcnt))
+ return;
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_free(inet_twsk(sk));
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk_free(inet_reqsk(sk));
+ else
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sock_gen_put);
+
+void sock_edemux(struct sk_buff *skb)
+{
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
+struct sock *__inet_lookup_established(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif)
+{
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ struct sock *sk;
+ const struct hlist_nulls_node *node;
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ unsigned int slot = hash & hashinfo->ehash_mask;
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+ rcu_read_lock();
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (likely(INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+ if (unlikely(!INET_MATCH(sk, net, acookie,
+ saddr, daddr, ports, dif))) {
+ sock_gen_put(sk);
+ goto begin;
+ }
+ goto found;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
+ rcu_read_unlock();
+ return sk;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 daddr = inet->inet_rcv_saddr;
+ __be32 saddr = inet->inet_daddr;
+ int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+ struct net *net = sock_net(sk);
+ unsigned int hash = inet_ehashfn(net, daddr, lport,
+ saddr, inet->inet_dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_nulls_node *node;
+ struct inet_timewait_sock *tw = NULL;
+ int twrefcnt = 0;
+
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(INET_MATCH(sk2, net, acookie,
+ saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
+ goto not_unique;
+ }
+ }
+
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
+ sk->sk_hash = hash;
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ twrefcnt = inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ if (twrefcnt)
+ inet_twsk_put(tw);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ if (twp) {
+ *twp = tw;
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw);
+
+ inet_twsk_put(tw);
+ }
+ return 0;
+
+not_unique:
+ spin_unlock(lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+ inet->inet_daddr,
+ inet->inet_dport);
+}
+
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct hlist_nulls_head *list;
+ struct inet_ehash_bucket *head;
+ spinlock_t *lock;
+ int twrefcnt = 0;
+
+ WARN_ON(!sk_unhashed(sk));
+
+ sk->sk_hash = sk_ehashfn(sk);
+ head = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ list = &head->chain;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock(lock);
+ __sk_nulls_add_node_rcu(sk, list);
+ if (tw) {
+ WARN_ON(sk->sk_hash != tw->tw_hash);
+ twrefcnt = inet_twsk_unhash(tw);
+ }
+ spin_unlock(lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return twrefcnt;
+}
+EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+
+int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ struct inet_listen_hashbucket *ilb;
+
+ if (sk->sk_state != TCP_LISTEN)
+ return __inet_hash_nolisten(sk, tw);
+
+ WARN_ON(!sk_unhashed(sk));
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+
+ spin_lock(&ilb->lock);
+ __sk_nulls_add_node_rcu(sk, &ilb->head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ spin_unlock(&ilb->lock);
+ return 0;
+}
+EXPORT_SYMBOL(__inet_hash);
+
+void inet_hash(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ local_bh_disable();
+ __inet_hash(sk, NULL);
+ local_bh_enable();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+void inet_unhash(struct sock *sk)
+{
+ struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ spinlock_t *lock;
+ int done;
+
+ if (sk_unhashed(sk))
+ return;
+
+ if (sk->sk_state == TCP_LISTEN)
+ lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+ else
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ done = __sk_nulls_del_node_init_rcu(sk);
+ if (done)
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
+
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk, u32 port_offset,
+ int (*check_established)(struct inet_timewait_death_row *,
+ struct sock *, __u16, struct inet_timewait_sock **))
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ const unsigned short snum = inet_sk(sk)->inet_num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+ struct net *net = sock_net(sk);
+ int twrefcnt = 1;
+
+ if (!snum) {
+ int i, remaining, low, high, port;
+ static u32 hint;
+ u32 offset = hint + port_offset;
+ struct inet_timewait_sock *tw = NULL;
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+
+ local_bh_disable();
+ for (i = 1; i <= remaining; i++) {
+ port = low + (i + offset) % remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) &&
+ tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
+ goto next_port;
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ hint += i;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ twrefcnt += __inet_hash_nolisten(sk, tw);
+ }
+ if (tw)
+ twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+ spin_unlock(&head->lock);
+
+ if (tw) {
+ inet_twsk_deschedule(tw);
+ while (twrefcnt) {
+ twrefcnt--;
+ inet_twsk_put(tw);
+ }
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ __inet_hash_nolisten(sk, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = check_established(death_row, sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+ __inet_check_established);
+}
+EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+ int i;
+
+ atomic_set(&h->bsockets, 0);
+ for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+ spin_lock_init(&h->listening_hash[i].lock);
+ INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+ i + LISTENING_NULLS_BASE);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
new file mode 100644
index 000000000..f17ea49b2
--- /dev/null
+++ b/net/ipv4/inet_lro.c
@@ -0,0 +1,374 @@
+/*
+ * linux/net/ipv4/inet_lro.c
+ *
+ * Large Receive Offload (ipv4 / tcp)
+ *
+ * (C) Copyright IBM Corp. 2007
+ *
+ * Authors:
+ * Jan-Bernd Themann <themann@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+#include <net/checksum.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
+MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
+
+#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
+#define IP_HDR_LEN(iph) (iph->ihl << 2)
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+ (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+#define LRO_MAX_PG_HLEN 64
+
+#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
+
+/*
+ * Basic tcp checks whether packet is suitable for LRO
+ */
+
+static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
+ int len, const struct net_lro_desc *lro_desc)
+{
+ /* check ip header: don't aggregate padded frames */
+ if (ntohs(iph->tot_len) != len)
+ return -1;
+
+ if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
+ return -1;
+
+ if (iph->ihl != IPH_LEN_WO_OPTIONS)
+ return -1;
+
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+ tcph->rst || tcph->syn || tcph->fin)
+ return -1;
+
+ if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+ return -1;
+
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+ tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ return -1;
+
+ /* check tcp options (only timestamp allowed) */
+ if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+ __be32 *topt = (__be32 *)(tcph + 1);
+
+ if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return -1;
+
+ /* timestamp should be in right order */
+ topt++;
+ if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
+ ntohl(*topt)))
+ return -1;
+
+ /* timestamp reply should not be zero */
+ topt++;
+ if (*topt == 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
+{
+ struct iphdr *iph = lro_desc->iph;
+ struct tcphdr *tcph = lro_desc->tcph;
+ __be32 *p;
+ __wsum tcp_hdr_csum;
+
+ tcph->ack_seq = lro_desc->tcp_ack;
+ tcph->window = lro_desc->tcp_window;
+
+ if (lro_desc->tcp_saw_tstamp) {
+ p = (__be32 *)(tcph + 1);
+ *(p+2) = lro_desc->tcp_rcv_tsecr;
+ }
+
+ csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
+ iph->tot_len = htons(lro_desc->ip_tot_len);
+
+ tcph->check = 0;
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
+ lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
+ tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ lro_desc->ip_tot_len -
+ IP_HDR_LEN(iph), IPPROTO_TCP,
+ lro_desc->data_csum);
+}
+
+static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
+{
+ __wsum tcp_csum;
+ __wsum tcp_hdr_csum;
+ __wsum tcp_ps_hdr_csum;
+
+ tcp_csum = ~csum_unfold(tcph->check);
+ tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
+
+ tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ len + TCP_HDR_LEN(tcph),
+ IPPROTO_TCP, 0);
+
+ return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
+ tcp_ps_hdr_csum);
+}
+
+static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ int nr_frags;
+ __be32 *ptr;
+ u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ lro_desc->parent = skb;
+ lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
+ lro_desc->iph = iph;
+ lro_desc->tcph = tcph;
+ lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+ lro_desc->tcp_ack = tcph->ack_seq;
+ lro_desc->tcp_window = tcph->window;
+
+ lro_desc->pkt_aggr_cnt = 1;
+ lro_desc->ip_tot_len = ntohs(iph->tot_len);
+
+ if (tcph->doff == 8) {
+ ptr = (__be32 *)(tcph+1);
+ lro_desc->tcp_saw_tstamp = 1;
+ lro_desc->tcp_rcv_tsval = *(ptr+1);
+ lro_desc->tcp_rcv_tsecr = *(ptr+2);
+ }
+
+ lro_desc->mss = tcp_data_len;
+ lro_desc->active = 1;
+
+ lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
+ tcp_data_len);
+}
+
+static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
+{
+ memset(lro_desc, 0, sizeof(struct net_lro_desc));
+}
+
+static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
+ struct tcphdr *tcph, int tcp_data_len)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ __be32 *topt;
+
+ lro_desc->pkt_aggr_cnt++;
+ lro_desc->ip_tot_len += tcp_data_len;
+ lro_desc->tcp_next_seq += tcp_data_len;
+ lro_desc->tcp_window = tcph->window;
+ lro_desc->tcp_ack = tcph->ack_seq;
+
+ /* don't update tcp_rcv_tsval, would not work with PAWS */
+ if (lro_desc->tcp_saw_tstamp) {
+ topt = (__be32 *) (tcph + 1);
+ lro_desc->tcp_rcv_tsecr = *(topt + 2);
+ }
+
+ lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
+ lro_tcp_data_csum(iph, tcph,
+ tcp_data_len),
+ parent->len);
+
+ parent->len += tcp_data_len;
+ parent->data_len += tcp_data_len;
+ if (tcp_data_len > lro_desc->mss)
+ lro_desc->mss = tcp_data_len;
+}
+
+static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+ skb_pull(skb, (skb->len - tcp_data_len));
+ parent->truesize += skb->truesize;
+
+ if (lro_desc->last_skb)
+ lro_desc->last_skb->next = skb;
+ else
+ skb_shinfo(parent)->frag_list = skb;
+
+ lro_desc->last_skb = skb;
+}
+
+
+static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ if ((lro_desc->iph->saddr != iph->saddr) ||
+ (lro_desc->iph->daddr != iph->daddr) ||
+ (lro_desc->tcph->source != tcph->source) ||
+ (lro_desc->tcph->dest != tcph->dest))
+ return -1;
+ return 0;
+}
+
+static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_arr,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct net_lro_desc *lro_desc = NULL;
+ struct net_lro_desc *tmp;
+ int max_desc = lro_mgr->max_desc;
+ int i;
+
+ for (i = 0; i < max_desc; i++) {
+ tmp = &lro_arr[i];
+ if (tmp->active)
+ if (!lro_check_tcp_conn(tmp, iph, tcph)) {
+ lro_desc = tmp;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < max_desc; i++) {
+ if (!lro_arr[i].active) {
+ lro_desc = &lro_arr[i];
+ goto out;
+ }
+ }
+
+ LRO_INC_STATS(lro_mgr, no_desc);
+out:
+ return lro_desc;
+}
+
+static void lro_flush(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_desc)
+{
+ if (lro_desc->pkt_aggr_cnt > 1)
+ lro_update_tcp_ip_header(lro_desc);
+
+ skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
+
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
+
+ LRO_INC_STATS(lro_mgr, flushed);
+ lro_clear_desc(lro_desc);
+}
+
+static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
+ void *priv)
+{
+ struct net_lro_desc *lro_desc;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ u64 flags;
+ int vlan_hdr_len = 0;
+
+ if (!lro_mgr->get_skb_header ||
+ lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
+ goto out;
+
+ if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+ goto out;
+
+ lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+ if (!lro_desc)
+ goto out;
+
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ vlan_hdr_len = VLAN_HLEN;
+
+ if (!lro_desc->active) { /* start new lro session */
+ if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
+ goto out;
+
+ skb->ip_summed = lro_mgr->ip_summed_aggr;
+ lro_init_desc(lro_desc, skb, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+ return 0;
+ }
+
+ if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+ goto out2;
+
+ if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
+ goto out2;
+
+ lro_add_packet(lro_desc, skb, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+
+ if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
+ lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+ lro_flush(lro_mgr, lro_desc);
+
+ return 0;
+
+out2: /* send aggregated SKBs to stack */
+ lro_flush(lro_mgr, lro_desc);
+
+out:
+ return 1;
+}
+
+void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+ struct sk_buff *skb,
+ void *priv)
+{
+ if (__lro_proc_skb(lro_mgr, skb, priv)) {
+ if (lro_mgr->features & LRO_F_NAPI)
+ netif_receive_skb(skb);
+ else
+ netif_rx(skb);
+ }
+}
+EXPORT_SYMBOL(lro_receive_skb);
+
+void lro_flush_all(struct net_lro_mgr *lro_mgr)
+{
+ int i;
+ struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
+
+ for (i = 0; i < lro_mgr->max_desc; i++) {
+ if (lro_desc[i].active)
+ lro_flush(lro_mgr, &lro_desc[i]);
+ }
+}
+EXPORT_SYMBOL(lro_flush_all);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 000000000..00ec8d5d7
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,328 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic TIME_WAIT sockets functions
+ *
+ * From code orinally in TCP
+ */
+
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+
+/**
+ * inet_twsk_unhash - unhash a timewait socket from established hash
+ * @tw: timewait socket
+ *
+ * unhash a timewait socket from established hash, if hashed.
+ * ehash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+ if (hlist_nulls_unhashed(&tw->tw_node))
+ return 0;
+
+ hlist_nulls_del_rcu(&tw->tw_node);
+ sk_nulls_node_init(&tw->tw_node);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/**
+ * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ * @tw: timewait socket
+ * @hashinfo: hashinfo pointer
+ *
+ * unhash a timewait socket from bind hash, if hashed.
+ * bind hash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
+ return 0;
+
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/* Must be called with locally disabled BHs. */
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ struct inet_bind_hashbucket *bhead;
+ int refcnt;
+ /* Unlink from established hashes. */
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ spin_lock(lock);
+ refcnt = inet_twsk_unhash(tw);
+ spin_unlock(lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+ hashinfo->bhash_size)];
+
+ spin_lock(&bhead->lock);
+ refcnt += inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead->lock);
+
+ BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
+ atomic_sub(refcnt, &tw->tw_refcnt);
+ atomic_dec(&tw->tw_dr->tw_count);
+ inet_twsk_put(tw);
+}
+
+void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+ struct module *owner = tw->tw_prot->owner;
+ twsk_destructor((struct sock *)tw);
+#ifdef SOCK_REFCNT_DEBUG
+ pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
+#endif
+ kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+ module_put(owner);
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+ if (atomic_dec_and_test(&tw->tw_refcnt))
+ inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+ struct hlist_nulls_head *list)
+{
+ hlist_nulls_add_head_rcu(&tw->tw_node, list);
+}
+
+static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+ struct hlist_head *list)
+{
+ hlist_add_head(&tw->tw_bind_node, list);
+}
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ struct inet_hashinfo *hashinfo)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ struct inet_bind_hashbucket *bhead;
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+ hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ WARN_ON(!icsk->icsk_bind_hash);
+ inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ spin_lock(lock);
+
+ /*
+ * Step 2: Hash TW into tcp ehash chain.
+ * Notes :
+ * - tw_refcnt is set to 3 because :
+ * - We have one reference from bhash chain.
+ * - We have one reference from ehash chain.
+ * We can use atomic_set() because prior spin_lock()/spin_unlock()
+ * committed into memory all tw fields.
+ */
+ atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
+ inet_twsk_add_node_rcu(tw, &ehead->chain);
+
+ /* Step 3: Remove SK from hash chain */
+ if (__sk_nulls_del_node_init_rcu(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+void tw_timer_handler(unsigned long data)
+{
+ struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+
+ if (tw->tw_kill)
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+ else
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+ inet_twsk_kill(tw);
+}
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+ struct inet_timewait_death_row *dr,
+ const int state)
+{
+ struct inet_timewait_sock *tw;
+
+ if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ return NULL;
+
+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ GFP_ATOMIC);
+ if (tw) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ kmemcheck_annotate_bitfield(tw, flags);
+
+ tw->tw_dr = dr;
+ /* Give us an identity. */
+ tw->tw_daddr = inet->inet_daddr;
+ tw->tw_rcv_saddr = inet->inet_rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_tos = inet->tos;
+ tw->tw_num = inet->inet_num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->inet_sport;
+ tw->tw_dport = inet->inet_dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_hash = sk->sk_hash;
+ tw->tw_ipv6only = 0;
+ tw->tw_transparent = inet->transparent;
+ tw->tw_prot = sk->sk_prot_creator;
+ atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+ twsk_net_set(tw, sock_net(sk));
+ setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
+ /*
+ * Because we use RCU lookups, we should not set tw_refcnt
+ * to a non null value before everything is setup for this
+ * timewait socket.
+ */
+ atomic_set(&tw->tw_refcnt, 0);
+
+ __module_get(tw->tw_prot->owner);
+ }
+
+ return tw;
+}
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw)
+{
+ if (del_timer_sync(&tw->tw_timer))
+ inet_twsk_kill(tw);
+}
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
+{
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous seqments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux. following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responces to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+
+ tw->tw_kill = timeo <= 4*HZ;
+ if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
+ atomic_inc(&tw->tw_refcnt);
+ atomic_inc(&tw->tw_dr->tw_count);
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
+ struct inet_timewait_death_row *twdr, int family)
+{
+ struct inet_timewait_sock *tw;
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ unsigned int slot;
+
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ cond_resched();
+ rcu_read_lock();
+restart:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_state != TCP_TIME_WAIT)
+ continue;
+ tw = inet_twsk(sk);
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
+ continue;
+
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+ continue;
+
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
+ local_bh_disable();
+ inet_twsk_deschedule(tw);
+ local_bh_enable();
+ inet_twsk_put(tw);
+ goto restart_rcu;
+ }
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000..241afd743
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,558 @@
+/*
+ * INETPEER - A storage for permanent information about peers
+ *
+ * This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ * Authors: Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <net/ip.h>
+#include <net/inetpeer.h>
+#include <net/secure_seq.h>
+
+/*
+ * Theory of operations.
+ * We keep one entry for each peer IP address. The nodes contains long-living
+ * information about the peer which doesn't depend on routes.
+ *
+ * Nodes are removed only when reference counter goes to 0.
+ * When it's happened the node may be removed when a sufficient amount of
+ * time has been passed since its last use. The less-recently-used entry can
+ * also be removed if the pool is overloaded i.e. if the total amount of
+ * entries is greater-or-equal than the threshold.
+ *
+ * Node pool is organised as an AVL tree.
+ * Such an implementation has been chosen not just for fun. It's a way to
+ * prevent easy and efficient DoS attacks by creating hash collisions. A huge
+ * amount of long living nodes in a single hash slot would significantly delay
+ * lookups performed with disabled BHs.
+ *
+ * Serialisation issues.
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
+ * AND reference count being 0.
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
+ * avl_left, avl_right, avl_parent, avl_height: pool lock
+ * refcnt: atomically against modifications on other CPU;
+ * usually under some other lock to prevent node disappearing
+ * daddr: unchangeable
+ */
+
+static struct kmem_cache *peer_cachep __read_mostly;
+
+static LIST_HEAD(gc_list);
+static const int gc_delay = 60 * HZ;
+static struct delayed_work gc_work;
+static DEFINE_SPINLOCK(gc_lock);
+
+#define node_height(x) x->avl_height
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+ .avl_left = peer_avl_empty_rcu,
+ .avl_right = peer_avl_empty_rcu,
+ .avl_height = 0
+};
+
+void inet_peer_base_init(struct inet_peer_base *bp)
+{
+ bp->root = peer_avl_empty_rcu;
+ seqlock_init(&bp->lock);
+ bp->total = 0;
+}
+EXPORT_SYMBOL_GPL(inet_peer_base_init);
+
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+/* Exported for sysctl_net_ipv4. */
+int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
+ * aggressively at this stage */
+int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
+
+static void inetpeer_gc_worker(struct work_struct *work)
+{
+ struct inet_peer *p, *n, *c;
+ struct list_head list;
+
+ spin_lock_bh(&gc_lock);
+ list_replace_init(&gc_list, &list);
+ spin_unlock_bh(&gc_lock);
+
+ if (list_empty(&list))
+ return;
+
+ list_for_each_entry_safe(p, n, &list, gc_list) {
+
+ if (need_resched())
+ cond_resched();
+
+ c = rcu_dereference_protected(p->avl_left, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_left = peer_avl_empty_rcu;
+ }
+
+ c = rcu_dereference_protected(p->avl_right, 1);
+ if (c != peer_avl_empty) {
+ list_add_tail(&c->gc_list, &list);
+ p->avl_right = peer_avl_empty_rcu;
+ }
+
+ n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
+
+ if (!atomic_read(&p->refcnt)) {
+ list_del(&p->gc_list);
+ kmem_cache_free(peer_cachep, p);
+ }
+ }
+
+ if (list_empty(&list))
+ return;
+
+ spin_lock_bh(&gc_lock);
+ list_splice(&list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
+
+/* Called from ip_output.c:ip_init */
+void __init inet_initpeers(void)
+{
+ struct sysinfo si;
+
+ /* Use the straight interface to information about memory. */
+ si_meminfo(&si);
+ /* The values below were suggested by Alexey Kuznetsov
+ * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
+ * myself. --SAW
+ */
+ if (si.totalram <= (32768*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+ if (si.totalram <= (16384*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 1; /* about 512KB */
+ if (si.totalram <= (8192*1024)/PAGE_SIZE)
+ inet_peer_threshold >>= 2; /* about 128KB */
+
+ peer_cachep = kmem_cache_create("inet_peer_cache",
+ sizeof(struct inet_peer),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ NULL);
+
+ INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
+}
+
+static int addr_compare(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ int i, n = (a->family == AF_INET ? 1 : 4);
+
+ for (i = 0; i < n; i++) {
+ if (a->addr.a6[i] == b->addr.a6[i])
+ continue;
+ if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
+ return -1;
+ return 1;
+ }
+
+ return 0;
+}
+
+#define rcu_deref_locked(X, BASE) \
+ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
+/*
+ * Called with local BH disabled and the pool lock held.
+ */
+#define lookup(_daddr, _stack, _base) \
+({ \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ \
+ stackptr = _stack; \
+ *stackptr++ = &_base->root; \
+ for (u = rcu_deref_locked(_base->root, _base); \
+ u != peer_avl_empty;) { \
+ int cmp = addr_compare(_daddr, &u->daddr); \
+ if (cmp == 0) \
+ break; \
+ if (cmp == -1) \
+ v = &u->avl_left; \
+ else \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = rcu_deref_locked(*v, _base); \
+ } \
+ u; \
+})
+
+/*
+ * Called with rcu_read_lock()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+ struct inet_peer_base *base)
+{
+ struct inet_peer *u = rcu_dereference(base->root);
+ int count = 0;
+
+ while (u != peer_avl_empty) {
+ int cmp = addr_compare(daddr, &u->daddr);
+ if (cmp == 0) {
+ /* Before taking a reference, check if this entry was
+ * deleted (refcnt=-1)
+ */
+ if (!atomic_add_unless(&u->refcnt, 1, -1))
+ u = NULL;
+ return u;
+ }
+ if (cmp == -1)
+ u = rcu_dereference(u->avl_left);
+ else
+ u = rcu_dereference(u->avl_right);
+ if (unlikely(++count == PEER_MAXDEPTH))
+ break;
+ }
+ return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup_rightempty(start, base) \
+({ \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ *stackptr++ = &start->avl_left; \
+ v = &start->avl_left; \
+ for (u = rcu_deref_locked(*v, base); \
+ u->avl_right != peer_avl_empty_rcu;) { \
+ v = &u->avl_right; \
+ *stackptr++ = v; \
+ u = rcu_deref_locked(*v, base); \
+ } \
+ u; \
+})
+
+/* Called with local BH disabled and the pool lock held.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for more detail description of the ideas.
+ */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+ struct inet_peer __rcu ***stackend,
+ struct inet_peer_base *base)
+{
+ struct inet_peer __rcu **nodep;
+ struct inet_peer *node, *l, *r;
+ int lh, rh;
+
+ while (stackend > stack) {
+ nodep = *--stackend;
+ node = rcu_deref_locked(*nodep, base);
+ l = rcu_deref_locked(node->avl_left, base);
+ r = rcu_deref_locked(node->avl_right, base);
+ lh = node_height(l);
+ rh = node_height(r);
+ if (lh > rh + 1) { /* l: RH+2 */
+ struct inet_peer *ll, *lr, *lrl, *lrr;
+ int lrh;
+ ll = rcu_deref_locked(l->avl_left, base);
+ lr = rcu_deref_locked(l->avl_right, base);
+ lrh = node_height(lr);
+ if (lrh <= node_height(ll)) { /* ll: RH+1 */
+ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
+ node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
+ RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
+ l->avl_height = node->avl_height + 1;
+ RCU_INIT_POINTER(*nodep, l);
+ } else { /* ll: RH, lr: RH+1 */
+ lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+ lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
+ node->avl_height = rh + 1; /* node: RH+1 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
+ RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
+ l->avl_height = rh + 1; /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
+ lr->avl_height = rh + 2;
+ RCU_INIT_POINTER(*nodep, lr);
+ }
+ } else if (rh > lh + 1) { /* r: LH+2 */
+ struct inet_peer *rr, *rl, *rlr, *rll;
+ int rlh;
+ rr = rcu_deref_locked(r->avl_right, base);
+ rl = rcu_deref_locked(r->avl_left, base);
+ rlh = node_height(rl);
+ if (rlh <= node_height(rr)) { /* rr: LH+1 */
+ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
+ node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
+ RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
+ r->avl_height = node->avl_height + 1;
+ RCU_INIT_POINTER(*nodep, r);
+ } else { /* rr: RH, rl: RH+1 */
+ rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+ rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
+ node->avl_height = lh + 1; /* node: LH+1 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
+ RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
+ r->avl_height = lh + 1; /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
+ rl->avl_height = lh + 2;
+ RCU_INIT_POINTER(*nodep, rl);
+ }
+ } else {
+ node->avl_height = (lh > rh ? lh : rh) + 1;
+ }
+ }
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define link_to_pool(n, base) \
+do { \
+ n->avl_height = 1; \
+ n->avl_left = peer_avl_empty_rcu; \
+ n->avl_right = peer_avl_empty_rcu; \
+ /* lockless readers can catch us now */ \
+ rcu_assign_pointer(**--stackptr, n); \
+ peer_avl_rebalance(stack, stackptr, base); \
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
+
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+{
+ struct inet_peer __rcu ***stackptr, ***delp;
+
+ if (lookup(&p->daddr, stack, base) != p)
+ BUG();
+ delp = stackptr - 1; /* *delp[0] == p */
+ if (p->avl_left == peer_avl_empty_rcu) {
+ *delp[0] = p->avl_right;
+ --stackptr;
+ } else {
+ /* look for a node to insert instead of p */
+ struct inet_peer *t;
+ t = lookup_rightempty(p, base);
+ BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+ **--stackptr = t->avl_left;
+ /* t is removed, t->daddr > x->daddr for any
+ * x in p->avl_left subtree.
+ * Put t in the old place of p. */
+ RCU_INIT_POINTER(*delp[0], t);
+ t->avl_left = p->avl_left;
+ t->avl_right = p->avl_right;
+ t->avl_height = p->avl_height;
+ BUG_ON(delp[1] != &p->avl_left);
+ delp[1] = &t->avl_left; /* was &p->avl_left */
+ }
+ peer_avl_rebalance(stack, stackptr, base);
+ base->total--;
+ call_rcu(&p->rcu, inetpeer_free_rcu);
+}
+
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+ struct inet_peer __rcu ***stackptr)
+{
+ struct inet_peer *p, *gchead = NULL;
+ __u32 delta, ttl;
+ int cnt = 0;
+
+ if (base->total >= inet_peer_threshold)
+ ttl = 0; /* be aggressive */
+ else
+ ttl = inet_peer_maxttl
+ - (inet_peer_maxttl - inet_peer_minttl) / HZ *
+ base->total / inet_peer_threshold * HZ;
+ stackptr--; /* last stack slot is peer_avl_empty */
+ while (stackptr > stack) {
+ stackptr--;
+ p = rcu_deref_locked(**stackptr, base);
+ if (atomic_read(&p->refcnt) == 0) {
+ smp_rmb();
+ delta = (__u32)jiffies - p->dtime;
+ if (delta >= ttl &&
+ atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+ p->gc_next = gchead;
+ gchead = p;
+ }
+ }
+ }
+ while ((p = gchead) != NULL) {
+ gchead = p->gc_next;
+ cnt++;
+ unlink_from_pool(p, base, stack);
+ }
+ return cnt;
+}
+
+struct inet_peer *inet_getpeer(struct inet_peer_base *base,
+ const struct inetpeer_addr *daddr,
+ int create)
+{
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
+ struct inet_peer *p;
+ unsigned int sequence;
+ int invalidated, gccnt = 0;
+
+ /* Attempt a lockless lookup first.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ rcu_read_lock();
+ sequence = read_seqbegin(&base->lock);
+ p = lookup_rcu(daddr, base);
+ invalidated = read_seqretry(&base->lock, sequence);
+ rcu_read_unlock();
+
+ if (p)
+ return p;
+
+ /* If no writer did a change during our lookup, we can return early. */
+ if (!create && !invalidated)
+ return NULL;
+
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ write_seqlock_bh(&base->lock);
+relookup:
+ p = lookup(daddr, stack, base);
+ if (p != peer_avl_empty) {
+ atomic_inc(&p->refcnt);
+ write_sequnlock_bh(&base->lock);
+ return p;
+ }
+ if (!gccnt) {
+ gccnt = inet_peer_gc(base, stack, stackptr);
+ if (gccnt && create)
+ goto relookup;
+ }
+ p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+ if (p) {
+ p->daddr = *daddr;
+ atomic_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ /* 60*HZ is arbitrary, but chosen enough high so that the first
+ * calculation of tokens is at its maximum.
+ */
+ p->rate_last = jiffies - 60*HZ;
+ INIT_LIST_HEAD(&p->gc_list);
+
+ /* Link the node. */
+ link_to_pool(p, base);
+ base->total++;
+ }
+ write_sequnlock_bh(&base->lock);
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(inet_getpeer);
+
+void inet_putpeer(struct inet_peer *p)
+{
+ p->dtime = (__u32)jiffies;
+ smp_mb__before_atomic();
+ atomic_dec(&p->refcnt);
+}
+EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
+
+static void inetpeer_inval_rcu(struct rcu_head *head)
+{
+ struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
+
+ spin_lock_bh(&gc_lock);
+ list_add_tail(&p->gc_list, &gc_list);
+ spin_unlock_bh(&gc_lock);
+
+ schedule_delayed_work(&gc_work, gc_delay);
+}
+
+void inetpeer_invalidate_tree(struct inet_peer_base *base)
+{
+ struct inet_peer *root;
+
+ write_seqlock_bh(&base->lock);
+
+ root = rcu_deref_locked(base->root, base);
+ if (root != peer_avl_empty) {
+ base->root = peer_avl_empty_rcu;
+ base->total = 0;
+ call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
+ }
+
+ write_sequnlock_bh(&base->lock);
+}
+EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000..367448494
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,159 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP forwarding functionality.
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip_input.c for
+ * history.
+ * Dave Gregorich : NULL ip_rt_put fix for multicast
+ * routing.
+ * Jos Vos : Add call_out_firewall before sending,
+ * use output device for accounting.
+ * Jos Vos : Call forward firewall after routing
+ * (always use output device).
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+ return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+ skb->ignore_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+
+static int ip_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ skb_sender_cpu_clear(skb);
+ return dst_output_sk(sk, skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+ u32 mtu;
+ struct iphdr *iph; /* Our header */
+ struct rtable *rt; /* Route we use */
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ /* that should never happen */
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (unlikely(skb->sk))
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ goto drop;
+
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return NET_RX_SUCCESS;
+
+ skb_forward_csum(skb);
+
+ /*
+ * According to the RFC, we must first decrease the TTL field. If
+ * that reaches zero, we must reply an ICMP control message telling
+ * that the packet's lifetime expired.
+ */
+ if (ip_hdr(skb)->ttl <= 1)
+ goto too_many_hops;
+
+ if (!xfrm4_route_forward(skb))
+ goto drop;
+
+ rt = skb_rtable(skb);
+
+ if (opt->is_strictroute && rt->rt_uses_gateway)
+ goto sr_failed;
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
+ IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ goto drop;
+ }
+
+ /* We are about to mangle packet. Copy it! */
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
+ goto drop;
+ iph = ip_hdr(skb);
+
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+ */
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
+ !skb_sec_path(skb))
+ ip_rt_send_redirect(skb);
+
+ skb->priority = rt_tos2priority(iph->tos);
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, rt->dst.dev, ip_forward_finish);
+
+sr_failed:
+ /*
+ * Strict routing permits no gatewaying
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+ goto drop;
+
+too_many_hops:
+ /* Tell the sender its packet died... */
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+ icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000..cc1da6d9c
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,871 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP fragmentation functionality.
+ *
+ * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * Fixes:
+ * Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
+ * xxxx : Overlapfrag bug.
+ * Ultima : ip_expire() kernel panic.
+ * Bill Hawes : Frag accounting and evictor fixes.
+ * John McDonald : 0 length frag bug.
+ * Alexey Kuznetsov: SMP races, threading, cleanup.
+ * Patrick McHardy : LRU queue of frag heads for evictor.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/inet_frag.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+static int sysctl_ipfrag_max_dist __read_mostly = 64;
+static const char ip_frag_cache_name[] = "ip4-frags";
+
+struct ipfrag_skb_cb
+{
+ struct inet_skb_parm h;
+ int offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct inet_frag_queue q;
+
+ u32 user;
+ __be32 saddr;
+ __be32 daddr;
+ __be16 id;
+ u8 protocol;
+ u8 ecn; /* RFC3168 support */
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
+};
+
+static u8 ip4_frag_ecn(u8 tos)
+{
+ return 1 << (tos & INET_ECN_MASK);
+}
+
+static struct inet_frags ip4_frags;
+
+int ip_frag_mem(struct net *net)
+{
+ return sum_frag_mem_limit(&net->ipv4.frags);
+}
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+ struct net_device *dev);
+
+struct ip4_create_arg {
+ struct iphdr *iph;
+ u32 user;
+};
+
+static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
+{
+ net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
+ return jhash_3words((__force u32)id << 16 | prot,
+ (__force u32)saddr, (__force u32)daddr,
+ ip4_frags.rnd);
+}
+
+static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
+{
+ const struct ipq *ipq;
+
+ ipq = container_of(q, struct ipq, q);
+ return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
+}
+
+static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
+{
+ const struct ipq *qp;
+ const struct ip4_create_arg *arg = a;
+
+ qp = container_of(q, struct ipq, q);
+ return qp->id == arg->iph->id &&
+ qp->saddr == arg->iph->saddr &&
+ qp->daddr == arg->iph->daddr &&
+ qp->protocol == arg->iph->protocol &&
+ qp->user == arg->user;
+}
+
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
+{
+ struct ipq *qp = container_of(q, struct ipq, q);
+ struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
+ frags);
+ struct net *net = container_of(ipv4, struct net, ipv4);
+
+ const struct ip4_create_arg *arg = a;
+
+ qp->protocol = arg->iph->protocol;
+ qp->id = arg->iph->id;
+ qp->ecn = ip4_frag_ecn(arg->iph->tos);
+ qp->saddr = arg->iph->saddr;
+ qp->daddr = arg->iph->daddr;
+ qp->user = arg->user;
+ qp->peer = sysctl_ipfrag_max_dist ?
+ inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
+}
+
+static void ip4_frag_free(struct inet_frag_queue *q)
+{
+ struct ipq *qp;
+
+ qp = container_of(q, struct ipq, q);
+ if (qp->peer)
+ inet_putpeer(qp->peer);
+}
+
+
+/* Destruction primitives. */
+
+static void ipq_put(struct ipq *ipq)
+{
+ inet_frag_put(&ipq->q, &ip4_frags);
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+ inet_frag_kill(&ipq->q, &ip4_frags);
+}
+
+/*
+ * Oops, a fragment queue timed out. Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+ struct ipq *qp;
+ struct net *net;
+
+ qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ net = container_of(qp->q.net, struct net, ipv4.frags);
+
+ spin_lock(&qp->q.lock);
+
+ if (qp->q.flags & INET_FRAG_COMPLETE)
+ goto out;
+
+ ipq_kill(qp);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+
+ if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+ struct sk_buff *head = qp->q.fragments;
+ const struct iphdr *iph;
+ int err;
+
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+ goto out;
+
+ rcu_read_lock();
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (!head->dev)
+ goto out_rcu_unlock;
+
+ /* skb has no dst, perform route lookup again */
+ iph = ip_hdr(head);
+ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
+ if (err)
+ goto out_rcu_unlock;
+
+ /* Only an end host needs to send an ICMP
+ * "Fragment Reassembly Timeout" message, per RFC792.
+ */
+ if (qp->user == IP_DEFRAG_AF_PACKET ||
+ ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
+ (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
+ (skb_rtable(head)->rt_type != RTN_LOCAL)))
+ goto out_rcu_unlock;
+
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+ rcu_read_unlock();
+ }
+out:
+ spin_unlock(&qp->q.lock);
+ ipq_put(qp);
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+{
+ struct inet_frag_queue *q;
+ struct ip4_create_arg arg;
+ unsigned int hash;
+
+ arg.iph = iph;
+ arg.user = user;
+
+ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+
+ q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
+ return container_of(q, struct ipq, q);
+}
+
+/* Is the fragment too far ahead to be part of ipq? */
+static int ip_frag_too_far(struct ipq *qp)
+{
+ struct inet_peer *peer = qp->peer;
+ unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int start, end;
+
+ int rc;
+
+ if (!peer || !max)
+ return 0;
+
+ start = qp->rid;
+ end = atomic_inc_return(&peer->rid);
+ qp->rid = end;
+
+ rc = qp->q.fragments && (end - start) > max;
+
+ if (rc) {
+ struct net *net;
+
+ net = container_of(qp->q.net, struct net, ipv4.frags);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ }
+
+ return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+ struct sk_buff *fp;
+ unsigned int sum_truesize = 0;
+
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ atomic_inc(&qp->q.refcnt);
+ return -ETIMEDOUT;
+ }
+
+ fp = qp->q.fragments;
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ sub_frag_mem_limit(&qp->q, sum_truesize);
+
+ qp->q.flags = 0;
+ qp->q.len = 0;
+ qp->q.meat = 0;
+ qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
+ qp->iif = 0;
+ qp->ecn = 0;
+
+ return 0;
+}
+
+/* Add new segment to existing queue. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *next;
+ struct net_device *dev;
+ int flags, offset;
+ int ihl, end;
+ int err = -ENOENT;
+ u8 ecn;
+
+ if (qp->q.flags & INET_FRAG_COMPLETE)
+ goto err;
+
+ if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ unlikely(ip_frag_too_far(qp)) &&
+ unlikely(err = ip_frag_reinit(qp))) {
+ ipq_kill(qp);
+ goto err;
+ }
+
+ ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
+ offset = ntohs(ip_hdr(skb)->frag_off);
+ flags = offset & ~IP_OFFSET;
+ offset &= IP_OFFSET;
+ offset <<= 3; /* offset is in 8-byte chunks */
+ ihl = ip_hdrlen(skb);
+
+ /* Determine the position of this fragment. */
+ end = offset + skb->len - ihl;
+ err = -EINVAL;
+
+ /* Is this the final fragment? */
+ if ((flags & IP_MF) == 0) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < qp->q.len ||
+ ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
+ goto err;
+ qp->q.flags |= INET_FRAG_LAST_IN;
+ qp->q.len = end;
+ } else {
+ if (end&7) {
+ end &= ~7;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ if (end > qp->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (qp->q.flags & INET_FRAG_LAST_IN)
+ goto err;
+ qp->q.len = end;
+ }
+ }
+ if (end == offset)
+ goto err;
+
+ err = -ENOMEM;
+ if (!pskb_pull(skb, ihl))
+ goto err;
+
+ err = pskb_trim_rcsum(skb, end - offset);
+ if (err)
+ goto err;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = qp->q.fragments_tail;
+ if (!prev || FRAG_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
+ prev = NULL;
+ for (next = qp->q.fragments; next != NULL; next = next->next) {
+ if (FRAG_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+found:
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ err = -EINVAL;
+ if (end <= offset)
+ goto err;
+ err = -ENOMEM;
+ if (!pskb_pull(skb, i))
+ goto err;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ err = -ENOMEM;
+
+ while (next && FRAG_CB(next)->offset < end) {
+ int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ if (!pskb_pull(next, i))
+ goto err;
+ FRAG_CB(next)->offset += i;
+ qp->q.meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+ /* Old fragment is completely overridden with
+ * new one drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ qp->q.fragments = next;
+
+ qp->q.meat -= free_it->len;
+ sub_frag_mem_limit(&qp->q, free_it->truesize);
+ kfree_skb(free_it);
+ }
+ }
+
+ FRAG_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (!next)
+ qp->q.fragments_tail = skb;
+ if (prev)
+ prev->next = skb;
+ else
+ qp->q.fragments = skb;
+
+ dev = skb->dev;
+ if (dev) {
+ qp->iif = dev->ifindex;
+ skb->dev = NULL;
+ }
+ qp->q.stamp = skb->tstamp;
+ qp->q.meat += skb->len;
+ qp->ecn |= ecn;
+ add_frag_mem_limit(&qp->q, skb->truesize);
+ if (offset == 0)
+ qp->q.flags |= INET_FRAG_FIRST_IN;
+
+ if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len + ihl > qp->q.max_size)
+ qp->q.max_size = skb->len + ihl;
+
+ if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ qp->q.meat == qp->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = ip_frag_reasm(qp, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return err;
+ }
+
+ skb_dst_drop(skb);
+ return -EINPROGRESS;
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+ struct net_device *dev)
+{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct iphdr *iph;
+ struct sk_buff *fp, *head = qp->q.fragments;
+ int len;
+ int ihlen;
+ int err;
+ int sum_truesize;
+ u8 ecn;
+
+ ipq_kill(qp);
+
+ ecn = ip_frag_ecn_table[qp->ecn];
+ if (unlikely(ecn == 0xff)) {
+ err = -EINVAL;
+ goto out_fail;
+ }
+ /* Make the one we just received the head. */
+ if (prev) {
+ head = prev->next;
+ fp = skb_clone(head, GFP_ATOMIC);
+ if (!fp)
+ goto out_nomem;
+
+ fp->next = head->next;
+ if (!fp->next)
+ qp->q.fragments_tail = fp;
+ prev->next = fp;
+
+ skb_morph(head, qp->q.fragments);
+ head->next = qp->q.fragments->next;
+
+ consume_skb(qp->q.fragments);
+ qp->q.fragments = head;
+ }
+
+ WARN_ON(!head);
+ WARN_ON(FRAG_CB(head)->offset != 0);
+
+ /* Allocate a new buffer for the datagram. */
+ ihlen = ip_hdrlen(head);
+ len = ihlen + qp->q.len;
+
+ err = -E2BIG;
+ if (len > 65535)
+ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_nomem;
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_nomem;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(&qp->q, clone->truesize);
+ }
+
+ skb_push(head, head->data - skb_network_header(head));
+
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
+ }
+ sub_frag_mem_limit(&qp->q, sum_truesize);
+
+ head->next = NULL;
+ head->dev = dev;
+ head->tstamp = qp->q.stamp;
+ IPCB(head)->frag_max_size = qp->q.max_size;
+
+ iph = ip_hdr(head);
+ /* max_size != 0 implies at least one fragment had IP_DF set */
+ iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
+ iph->tot_len = htons(len);
+ iph->tos |= ecn;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+ qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
+ return 0;
+
+out_nomem:
+ net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+ err = -ENOMEM;
+ goto out_fail;
+out_oversize:
+ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
+out_fail:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ return err;
+}
+
+/* Process an incoming IP datagram fragment. */
+int ip_defrag(struct sk_buff *skb, u32 user)
+{
+ struct ipq *qp;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+
+ /* Lookup (or create) queue header */
+ qp = ip_find(net, ip_hdr(skb), user);
+ if (qp) {
+ int ret;
+
+ spin_lock(&qp->q.lock);
+
+ ret = ip_frag_queue(qp, skb);
+
+ spin_unlock(&qp->q.lock);
+ ipq_put(qp);
+ return ret;
+ }
+
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(ip_defrag);
+
+struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+{
+ struct iphdr iph;
+ int netoff;
+ u32 len;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return skb;
+
+ netoff = skb_network_offset(skb);
+
+ if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
+ return skb;
+
+ if (iph.ihl < 5 || iph.version != 4)
+ return skb;
+
+ len = ntohs(iph.tot_len);
+ if (skb->len < netoff + len || len < (iph.ihl * 4))
+ return skb;
+
+ if (ip_is_fragment(&iph)) {
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb) {
+ if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
+ return skb;
+ if (pskb_trim_rcsum(skb, netoff + len))
+ return skb;
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ if (ip_defrag(skb, user))
+ return NULL;
+ skb_clear_hash(skb);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(ip_check_defrag);
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table ip4_frags_ns_ctl_table[] = {
+ {
+ .procname = "ipfrag_high_thresh",
+ .data = &init_net.ipv4.frags.high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ipv4.frags.low_thresh
+ },
+ {
+ .procname = "ipfrag_low_thresh",
+ .data = &init_net.ipv4.frags.low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ipv4.frags.high_thresh
+ },
+ {
+ .procname = "ipfrag_time",
+ .data = &init_net.ipv4.frags.timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
+static struct ctl_table ip4_frags_ctl_table[] = {
+ {
+ .procname = "ipfrag_secret_interval",
+ .data = &ip4_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ipfrag_max_dist",
+ .data = &sysctl_ipfrag_max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero
+ },
+ { }
+};
+
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = ip4_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->ipv4.frags.high_thresh;
+ table[0].extra1 = &net->ipv4.frags.low_thresh;
+ table[0].extra2 = &init_net.ipv4.frags.high_thresh;
+ table[1].data = &net->ipv4.frags.low_thresh;
+ table[1].extra2 = &net->ipv4.frags.high_thresh;
+ table[2].data = &net->ipv4.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ipv4.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.frags_hdr);
+ kfree(table);
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
+}
+#else
+static int ip4_frags_ns_ctl_register(struct net *net)
+{
+ return 0;
+}
+
+static void ip4_frags_ns_ctl_unregister(struct net *net)
+{
+}
+
+static void __init ip4_frags_ctl_register(void)
+{
+}
+#endif
+
+static int __net_init ipv4_frags_init_net(struct net *net)
+{
+ /* Fragment cache limits.
+ *
+ * The fragment memory accounting code, (tries to) account for
+ * the real memory usage, by measuring both the size of frag
+ * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
+ * and the SKB's truesize.
+ *
+ * A 64K fragment consumes 129736 bytes (44*2944)+200
+ * (1500 truesize == 2944, sizeof(struct ipq) == 200)
+ *
+ * We will commit 4MB at one time. Should we cross that limit
+ * we will prune down to 3MB, making room for approx 8 big 64K
+ * fragments 8x128k.
+ */
+ net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
+ net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ /*
+ * Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival
+ * by TTL.
+ */
+ net->ipv4.frags.timeout = IP_FRAG_TIME;
+
+ inet_frags_init_net(&net->ipv4.frags);
+
+ return ip4_frags_ns_ctl_register(net);
+}
+
+static void __net_exit ipv4_frags_exit_net(struct net *net)
+{
+ ip4_frags_ns_ctl_unregister(net);
+ inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+}
+
+static struct pernet_operations ip4_frags_ops = {
+ .init = ipv4_frags_init_net,
+ .exit = ipv4_frags_exit_net,
+};
+
+void __init ipfrag_init(void)
+{
+ ip4_frags_ctl_register();
+ register_pernet_subsys(&ip4_frags_ops);
+ ip4_frags.hashfn = ip4_hashfn;
+ ip4_frags.constructor = ip4_frag_init;
+ ip4_frags.destructor = ip4_frag_free;
+ ip4_frags.skb_free = NULL;
+ ip4_frags.qsize = sizeof(struct ipq);
+ ip4_frags.match = ip4_frag_match;
+ ip4_frags.frag_expire = ip_expire;
+ ip4_frags.frags_cache_name = ip_frag_cache_name;
+ if (inet_frags_init(&ip4_frags))
+ panic("IP: failed to allocate ip4_frags cache\n");
+}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000..5fd706473
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,926 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+ Problems & solutions
+ --------------------
+
+ 1. The most important issue is detecting local dead loops.
+ They would cause complete host lockup in transmit, which
+ would be "resolved" by stack overflow or, if queueing is enabled,
+ with infinite looping in net_bh.
+
+ We cannot track such dead loops during route installation,
+ it is infeasible task. The most general solutions would be
+ to keep skb->encapsulation counter (sort of local ttl),
+ and silently drop packet when it expires. It is a good
+ solution, but it supposes maintaining new variable in ALL
+ skb, even if no tunneling is used.
+
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
+
+ 2. Networking dead loops would not kill routers, but would really
+ kill network. IP hop limit plays role of "t->recursion" in this case,
+ if we copy it from packet being encapsulated to upper header.
+ It is very good solution, but it introduces two problems:
+
+ - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+ do not work over tunnels.
+ - traceroute does not work. I planned to relay ICMP from tunnel,
+ so that this problem would be solved and traceroute output
+ would even more informative. This idea appeared to be wrong:
+ only Linux complies to rfc1812 now (yes, guys, Linux is the only
+ true router now :-)), all routers (at least, in neighbourhood of mine)
+ return only 8 bytes of payload. It is the end.
+
+ Hence, if we want that OSPF worked or traceroute said something reasonable,
+ we should search for another solution.
+
+ One of them is to parse packet trying to detect inner encapsulation
+ made by our node. It is difficult or even impossible, especially,
+ taking into account fragmentation. TO be short, ttl is not solution at all.
+
+ Current solution: The solution was UNEXPECTEDLY SIMPLE.
+ We force DF flag on tunnels with preconfigured hop limit,
+ that is ALL. :-) Well, it does not remove the problem completely,
+ but exponential growth of network traffic is changed to linear
+ (branches, that exceed pmtu are pruned) and tunnel mtu
+ rapidly degrades to value <68, where looping stops.
+ Yes, it is not good if there exists a router in the loop,
+ which does not force DF, even when encapsulating packets have DF set.
+ But it is not our problem! Nobody could accuse us, we made
+ all that we could make. Even if it is your gated who injected
+ fatal route to network, even if it were you who configured
+ fatal static route: you are innocent. :-)
+
+ Alexey Kuznetsov.
+ */
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static int ipgre_tunnel_init(struct net_device *dev);
+
+static int ipgre_net_id __read_mostly;
+static int gre_tap_net_id __read_mostly;
+
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
+
+ /* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+
+ Moreover, Cisco "wise men" put GRE key to the third word
+ in GRE header. It makes impossible maintaining even soft
+ state for keyed GRE tunnels with enabled checksum. Tell
+ them "thank you".
+
+ Well, I wonder, rfc1812 was written by Cisco employee,
+ what the hell these idiots break standards established
+ by themselves???
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return PACKET_RCVD;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ case ICMP_PORT_UNREACH:
+ /* Impossible event. */
+ return PACKET_RCVD;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return PACKET_RCVD;
+ break;
+
+ case ICMP_REDIRECT:
+ break;
+ }
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return PACKET_REJECT;
+
+ if (t->parms.iph.daddr == 0 ||
+ ipv4_is_multicast(t->parms.iph.daddr))
+ return PACKET_RCVD;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ return PACKET_RCVD;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+ return PACKET_RCVD;
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->saddr, iph->daddr, tpi->key);
+
+ if (tunnel) {
+ skb_pop_mac_header(skb);
+ ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
+ return PACKET_RCVD;
+ }
+ return PACKET_REJECT;
+}
+
+static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params,
+ __be16 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct tnl_ptk_info tpi;
+
+ tpi.flags = tunnel->parms.o_flags;
+ tpi.proto = proto;
+ tpi.key = tunnel->parms.o_key;
+ if (tunnel->parms.o_flags & TUNNEL_SEQ)
+ tunnel->o_seqno++;
+ tpi.seq = htonl(tunnel->o_seqno);
+
+ /* Push GRE header. */
+ gre_build_header(skb, &tpi, tunnel->tun_hlen);
+
+ skb_set_inner_protocol(skb, tpi.proto);
+
+ ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
+}
+
+static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tnl_params;
+
+ if (dev->header_ops) {
+ /* Need space for new headers */
+ if (skb_cow_head(skb, dev->needed_headroom -
+ (tunnel->hlen + sizeof(struct iphdr))))
+ goto free_skb;
+
+ tnl_params = (const struct iphdr *)skb->data;
+
+ /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
+ * to gre header.
+ */
+ skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ skb_reset_mac_header(skb);
+ } else {
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ tnl_params = &tunnel->parms.iph;
+ }
+
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
+
+ __gre_xmit(skb, dev, tnl_params, skb->protocol);
+
+ return NETDEV_TX_OK;
+
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ goto out;
+
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto free_skb;
+
+ __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
+
+ return NETDEV_TX_OK;
+
+free_skb:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static int ipgre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
+{
+ int err;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+ ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+ return -EINVAL;
+ }
+ p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
+ p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+}
+
+/* Nice toy. Unfortunately, useless in real life :-)
+ It allows to construct virtual multiprotocol broadcast "LAN"
+ over the Internet, provided multicast routing is tuned.
+
+
+ I have no idea was this bicycle invented before me,
+ so that I had to set ARPHRD_IPGRE to a random value.
+ I have an impression, that Cisco could make something similar,
+ but this feature is apparently missing in IOS<=11.2(8).
+
+ I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+ with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+ ping -t 255 224.66.66.66
+
+ If nobody answers, mbone does not work.
+
+ ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+ ip addr add 10.66.66.<somewhat>/24 dev Universe
+ ifconfig Universe up
+ ifconfig Universe add fe80::<Your_real_addr>/10
+ ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+ ftp 10.66.66.66
+ ...
+ ftp fec0:6666:6666::193.233.7.65
+ ...
+ */
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct iphdr *iph;
+ struct gre_base_hdr *greh;
+
+ iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ greh = (struct gre_base_hdr *)(iph+1);
+ greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->protocol = htons(type);
+
+ memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+
+ /* Set the source hardware address. */
+ if (saddr)
+ memcpy(&iph->saddr, saddr, 4);
+ if (daddr)
+ memcpy(&iph->daddr, daddr, 4);
+ if (iph->daddr)
+ return t->hlen + sizeof(*iph);
+
+ return -(t->hlen + sizeof(*iph));
+}
+
+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
+ memcpy(haddr, &iph->saddr, 4);
+ return 4;
+}
+
+static const struct header_ops ipgre_header_ops = {
+ .create = ipgre_header,
+ .parse = ipgre_header_parse,
+};
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+static int ipgre_open(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (ipv4_is_multicast(t->parms.iph.daddr)) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ rt = ip_route_output_gre(t->net, &fl4,
+ t->parms.iph.daddr,
+ t->parms.iph.saddr,
+ t->parms.o_key,
+ RT_TOS(t->parms.iph.tos),
+ t->parms.link);
+ if (IS_ERR(rt))
+ return -EADDRNOTAVAIL;
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+ if (!__in_dev_get_rtnl(dev))
+ return -EADDRNOTAVAIL;
+ t->mlink = dev->ifindex;
+ ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
+ }
+ return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
+ struct in_device *in_dev;
+ in_dev = inetdev_by_index(t->net, t->mlink);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+ }
+ return 0;
+}
+#endif
+
+static const struct net_device_ops ipgre_netdev_ops = {
+ .ndo_init = ipgre_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ .ndo_open = ipgre_open,
+ .ndo_stop = ipgre_close,
+#endif
+ .ndo_start_xmit = ipgre_xmit,
+ .ndo_do_ioctl = ipgre_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+#define GRE_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM)
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ipgre_netdev_ops;
+ dev->type = ARPHRD_IPGRE;
+ ip_tunnel_setup(dev, ipgre_net_id);
+}
+
+static void __gre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel;
+ int t_hlen;
+
+ tunnel = netdev_priv(dev);
+ tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+ dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ __gre_tunnel_init(dev);
+
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->flags = IFF_NOARP;
+ netif_keep_dst(dev);
+ dev->addr_len = 4;
+
+ if (iph->daddr) {
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ if (!iph->saddr)
+ return -EINVAL;
+ dev->flags = IFF_BROADCAST;
+ dev->header_ops = &ipgre_header_ops;
+ }
+#endif
+ } else
+ dev->header_ops = &ipgre_header_ops;
+
+ return ip_tunnel_init(dev);
+}
+
+static struct gre_cisco_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
+ .priority = 0,
+};
+
+static int __net_init ipgre_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
+}
+
+static void __net_exit ipgre_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_link_ops);
+}
+
+static struct pernet_operations ipgre_net_ops = {
+ .init = ipgre_init_net,
+ .exit = ipgre_exit_net,
+ .id = &ipgre_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be32 daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ goto out;
+
+ if (data[IFLA_GRE_REMOTE]) {
+ memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+ if (!daddr)
+ return -EINVAL;
+ }
+
+out:
+ return ipgre_tunnel_validate(tb, data);
+}
+
+static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_GRE;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_GRE_LINK])
+ parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+ if (data[IFLA_GRE_IFLAGS])
+ parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+
+ if (data[IFLA_GRE_OFLAGS])
+ parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+
+ if (data[IFLA_GRE_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+ if (data[IFLA_GRE_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+ if (data[IFLA_GRE_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
+
+ if (data[IFLA_GRE_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
+
+ if (data[IFLA_GRE_TTL])
+ parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+ if (data[IFLA_GRE_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+ if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipgre_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_GRE_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_GRE_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int gre_tap_init(struct net_device *dev)
+{
+ __gre_tunnel_init(dev);
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+ return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_tap_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, gre_tap_net_id);
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipgre_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipgre_netlink_parms(data, tb, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_TOS */
+ nla_total_size(1) +
+ /* IFLA_GRE_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+ t->encap.type) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+ t->encap.sport) ||
+ nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+ t->encap.dport) ||
+ nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+ t->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_TOS] = { .type = NLA_U8 },
+ [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+ .kind = "gre",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tunnel_setup,
+ .validate = ipgre_tunnel_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+ .kind = "gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tap_setup,
+ .validate = ipgre_tap_validate,
+ .newlink = ipgre_newlink,
+ .changelink = ipgre_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+ ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __init ipgre_init(void)
+{
+ int err;
+
+ pr_info("GRE over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&ipgre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+ goto pnet_tap_faied;
+
+ err = gre_cisco_register(&ipgre_protocol);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
+ }
+
+ err = rtnl_link_register(&ipgre_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ err = rtnl_link_register(&ipgre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+ return 0;
+
+tap_ops_failed:
+ rtnl_link_unregister(&ipgre_link_ops);
+rtnl_link_failed:
+ gre_cisco_unregister(&ipgre_protocol);
+add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_faied:
+ unregister_pernet_device(&ipgre_net_ops);
+ return err;
+}
+
+static void __exit ipgre_fini(void)
+{
+ rtnl_link_unregister(&ipgre_tap_ops);
+ rtnl_link_unregister(&ipgre_link_ops);
+ gre_cisco_unregister(&ipgre_protocol);
+ unregister_pernet_device(&ipgre_tap_net_ops);
+ unregister_pernet_device(&ipgre_net_ops);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_NETDEV("gre0");
+MODULE_ALIAS_NETDEV("gretap0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000..2db4c8773
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,467 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) module.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ * Alan Cox : Commented a couple of minor bits of surplus code
+ * Alan Cox : Undefining IP_FORWARD doesn't include the code
+ * (just stops a compiler warning).
+ * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ * are junked rather than corrupting things.
+ * Alan Cox : Frames to bad broadcast subnets are dumped
+ * We used to process them non broadcast and
+ * boy could that cause havoc.
+ * Alan Cox : ip_forward sets the free flag on the
+ * new frame it queues. Still crap because
+ * it copies the frame but at least it
+ * doesn't eat memory too.
+ * Alan Cox : Generic queue code and memory fixes.
+ * Fred Van Kempen : IP fragment support (borrowed from NET2E)
+ * Gerhard Koerting: Forward fragmented frames correctly.
+ * Gerhard Koerting: Fixes to my fix of the above 8-).
+ * Gerhard Koerting: IP interface addressing fix.
+ * Linus Torvalds : More robustness checks
+ * Alan Cox : Even more checks: Still not as robust as it ought to be
+ * Alan Cox : Save IP header pointer for later
+ * Alan Cox : ip option setting
+ * Alan Cox : Use ip_tos/ip_ttl settings
+ * Alan Cox : Fragmentation bogosity removed
+ * (Thanks to Mark.Bush@prg.ox.ac.uk)
+ * Dmitry Gorodchanin : Send of a raw packet crash fix.
+ * Alan Cox : Silly ip bug when an overlength
+ * fragment turns up. Now frees the
+ * queue.
+ * Linus Torvalds/ : Memory leakage on fragmentation
+ * Alan Cox : handling.
+ * Gerhard Koerting: Forwarding uses IP priority hints
+ * Teemu Rantanen : Fragment problems.
+ * Alan Cox : General cleanup, comments and reformat
+ * Alan Cox : SNMP statistics
+ * Alan Cox : BSD address rule semantics. Also see
+ * UDP as there is a nasty checksum issue
+ * if you do things the wrong way.
+ * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
+ * Alan Cox : IP options adjust sk->priority.
+ * Pedro Roque : Fix mtu/length error in ip_forward.
+ * Alan Cox : Avoid ip_chk_addr when possible.
+ * Richard Underwood : IP multicasting.
+ * Alan Cox : Cleaned up multicast handlers.
+ * Alan Cox : RAW sockets demultiplex in the BSD style.
+ * Gunther Mayer : Fix the SNMP reporting typo
+ * Alan Cox : Always in group 224.0.0.1
+ * Pauline Middelink : Fast ip_checksum update when forwarding
+ * Masquerading support.
+ * Alan Cox : Multicast loopback error for 224.0.0.1
+ * Alan Cox : IP_MULTICAST_LOOP option.
+ * Alan Cox : Use notifiers.
+ * Bjorn Ekwall : Removed ip_csum (from slhc.c too)
+ * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
+ * Stefan Becker : Send out ICMP HOST REDIRECT
+ * Arnt Gulbrandsen : ip_build_xmit
+ * Alan Cox : Per socket routing cache
+ * Alan Cox : Fixed routing cache, added header cache.
+ * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
+ * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Alan Cox : Incoming IP option handling.
+ * Alan Cox : Set saddr on raw output frames as per BSD.
+ * Alan Cox : Stopped broadcast source route explosions.
+ * Alan Cox : Can disable source routing
+ * Takeshi Sone : Masquerading didn't work.
+ * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
+ * Alan Cox : Memory leaks, tramples, misc debugging.
+ * Alan Cox : Fixed multicast (by popular demand 8))
+ * Alan Cox : Fixed forwarding (by even more popular demand 8))
+ * Alan Cox : Fixed SNMP statistics [I think]
+ * Gerhard Koerting : IP fragmentation forwarding fix
+ * Alan Cox : Device lock against page fault.
+ * Alan Cox : IP_HDRINCL facility.
+ * Werner Almesberger : Zero fragment bug
+ * Alan Cox : RAW IP frame length bug
+ * Alan Cox : Outgoing firewall on build_xmit
+ * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
+ * Alan Cox : Multicast routing hooks
+ * Jos Vos : Do accounting *before* call_in_firewall
+ * Willy Konynenberg : Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ * and could be made very efficient with the addition of some virtual memory hacks to permit
+ * the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ * Output fragmentation wants updating along with the buffer management to use a single
+ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ * fragmentation anyway.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inet_ecn.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ * Process Router Attention IP option (RFC 2113)
+ */
+bool ip_call_ra_chain(struct sk_buff *skb)
+{
+ struct ip_ra_chain *ra;
+ u8 protocol = ip_hdr(skb)->protocol;
+ struct sock *last = NULL;
+ struct net_device *dev = skb->dev;
+
+ for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+ struct sock *sk = ra->sk;
+
+ /* If socket is bound to an interface, only report
+ * the packet if it came from that interface.
+ */
+ if (sk && inet_sk(sk)->inet_num == protocol &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == dev->ifindex) &&
+ net_eq(sock_net(sk), dev_net(dev))) {
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ return true;
+ }
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ raw_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ raw_rcv(last, skb);
+ return true;
+ }
+ return false;
+}
+
+static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ {
+ int protocol = ip_hdr(skb)->protocol;
+ const struct net_protocol *ipprot;
+ int raw;
+
+ resubmit:
+ raw = raw_local_deliver(skb, protocol);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ int ret;
+
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb(skb);
+ goto out;
+ }
+ nf_reset(skb);
+ }
+ ret = ipprot->handler(skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
+ }
+ kfree_skb(skb);
+ } else {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
+ }
+ }
+ }
+ out:
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+ /*
+ * Reassemble IP fragments.
+ */
+
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ return 0;
+ }
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, NULL, skb,
+ skb->dev, NULL,
+ ip_local_deliver_finish);
+}
+
+static inline bool ip_rcv_options(struct sk_buff *skb)
+{
+ struct ip_options *opt;
+ const struct iphdr *iph;
+ struct net_device *dev = skb->dev;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ iph = ip_hdr(skb);
+ opt = &(IPCB(skb)->opt);
+ opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+
+ if (ip_options_compile(dev_net(dev), opt, skb)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_info_ratelimited("source route option %pI4 -> %pI4\n",
+ &iph->saddr,
+ &iph->daddr);
+ goto drop;
+ }
+ }
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return false;
+drop:
+ return true;
+}
+
+int sysctl_ip_early_demux __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_ip_early_demux);
+
+static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+
+ if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+ const struct net_protocol *ipprot;
+ int protocol = iph->protocol;
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot && ipprot->early_demux) {
+ ipprot->early_demux(skb);
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ }
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
+ if (!skb_dst(skb)) {
+ int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev);
+ if (unlikely(err)) {
+ if (err == -EXDEV)
+ NET_INC_STATS_BH(dev_net(skb->dev),
+ LINUX_MIB_IPRPFILTER);
+ goto drop;
+ }
+ }
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (unlikely(skb_dst(skb)->tclassid)) {
+ struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
+ u32 idx = skb_dst(skb)->tclassid;
+ st[idx&0xFF].o_packets++;
+ st[idx&0xFF].o_bytes += skb->len;
+ st[(idx>>16)&0xFF].i_packets++;
+ st[(idx>>16)&0xFF].i_bytes += skb->len;
+ }
+#endif
+
+ if (iph->ihl > 5 && ip_rcv_options(skb))
+ goto drop;
+
+ rt = skb_rtable(skb);
+ if (rt->rt_type == RTN_MULTICAST) {
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
+ skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST)
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
+ skb->len);
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct iphdr *iph;
+ u32 len;
+
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+
+ IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+ BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+ BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+ IP_ADD_STATS_BH(dev_net(dev),
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto csum_error;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
+ goto inhdr_error;
+
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ skb->transport_header = skb->network_header + iph->ihl*4;
+
+ /* Remove any debris in the socket control block */
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
+
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ dev, NULL,
+ ip_rcv_finish);
+
+csum_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
+inhdr_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+ kfree_skb(skb);
+out:
+ return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000..bd2467923
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,663 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The options processing module for ip.c
+ *
+ * Authors: A.N.Kuznetsov
+ *
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/cipso_ipv4.h>
+#include <net/ip_fib.h>
+
+/*
+ * Write options to IP header, record destination address to
+ * source route option, address of outgoing interface
+ * (we should already know it, so that this function is allowed be
+ * called only after routing decision) and timestamp,
+ * if we originate this datagram.
+ *
+ * daddr is real destination address, next hop is recorded in IP header.
+ * saddr is address of outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+ __be32 daddr, struct rtable *rt, int is_frag)
+{
+ unsigned char *iph = skb_network_header(skb);
+
+ memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+ memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ opt = &(IPCB(skb)->opt);
+
+ if (opt->srr)
+ memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+ if (!is_frag) {
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
+ if (opt->ts_needtime) {
+ struct timespec tv;
+ __be32 midtime;
+ getnstimeofday(&tv);
+ midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+ memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+ }
+ return;
+ }
+ if (opt->rr) {
+ memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ }
+ if (opt->ts) {
+ memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+ opt->ts = 0;
+ opt->ts_needaddr = opt->ts_needtime = 0;
+ }
+}
+
+/*
+ * Provided (sopt, skb) points to received options,
+ * build in dopt compiled option set appropriate for answering.
+ * i.e. invert SRR option, copy anothers,
+ * and grab room in RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
+
+int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
+ const struct ip_options *sopt)
+{
+ unsigned char *sptr, *dptr;
+ int soffset, doffset;
+ int optlen;
+
+ memset(dopt, 0, sizeof(struct ip_options));
+
+ if (sopt->optlen == 0)
+ return 0;
+
+ sptr = skb_network_header(skb);
+ dptr = dopt->__data;
+
+ if (sopt->rr) {
+ optlen = sptr[sopt->rr+1];
+ soffset = sptr[sopt->rr+2];
+ dopt->rr = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->rr, optlen);
+ if (sopt->rr_needaddr && soffset <= optlen) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dptr[2] = soffset + 4;
+ dopt->rr_needaddr = 1;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->ts) {
+ optlen = sptr[sopt->ts+1];
+ soffset = sptr[sopt->ts+2];
+ dopt->ts = dopt->optlen + sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->ts, optlen);
+ if (soffset <= optlen) {
+ if (sopt->ts_needaddr) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ dopt->ts_needaddr = 1;
+ soffset += 4;
+ }
+ if (sopt->ts_needtime) {
+ if (soffset + 3 > optlen)
+ return -EINVAL;
+ if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ dopt->ts_needtime = 1;
+ soffset += 4;
+ } else {
+ dopt->ts_needtime = 0;
+
+ if (soffset + 7 <= optlen) {
+ __be32 addr;
+
+ memcpy(&addr, dptr+soffset-1, 4);
+ if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
+ dopt->ts_needtime = 1;
+ soffset += 8;
+ }
+ }
+ }
+ }
+ dptr[2] = soffset;
+ }
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ if (sopt->srr) {
+ unsigned char *start = sptr+sopt->srr;
+ __be32 faddr;
+
+ optlen = start[1];
+ soffset = start[2];
+ doffset = 0;
+ if (soffset > optlen)
+ soffset = optlen + 1;
+ soffset -= 4;
+ if (soffset > 3) {
+ memcpy(&faddr, &start[soffset-1], 4);
+ for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
+ memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+ /*
+ * RFC1812 requires to fix illegal source routes.
+ */
+ if (memcmp(&ip_hdr(skb)->saddr,
+ &start[soffset + 3], 4) == 0)
+ doffset -= 4;
+ }
+ if (doffset > 3) {
+ __be32 daddr = fib_compute_spec_dst(skb);
+
+ memcpy(&start[doffset-1], &daddr, 4);
+ dopt->faddr = faddr;
+ dptr[0] = start[0];
+ dptr[1] = doffset+3;
+ dptr[2] = 4;
+ dptr += doffset+3;
+ dopt->srr = dopt->optlen + sizeof(struct iphdr);
+ dopt->optlen += doffset+3;
+ dopt->is_strictroute = sopt->is_strictroute;
+ }
+ }
+ if (sopt->cipso) {
+ optlen = sptr[sopt->cipso+1];
+ dopt->cipso = dopt->optlen+sizeof(struct iphdr);
+ memcpy(dptr, sptr+sopt->cipso, optlen);
+ dptr += optlen;
+ dopt->optlen += optlen;
+ }
+ while (dopt->optlen & 3) {
+ *dptr++ = IPOPT_END;
+ dopt->optlen++;
+ }
+ return 0;
+}
+
+/*
+ * Options "fragmenting", just fill options not
+ * allowed in fragments with NOOPs.
+ * Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff *skb)
+{
+ unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int l = opt->optlen;
+ int optlen;
+
+ while (l > 0) {
+ switch (*optptr) {
+ case IPOPT_END:
+ return;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ optlen = optptr[1];
+ if (optlen < 2 || optlen > l)
+ return;
+ if (!IPOPT_COPIED(*optptr))
+ memset(optptr, IPOPT_NOOP, optlen);
+ l -= optlen;
+ optptr += optlen;
+ }
+ opt->ts = 0;
+ opt->rr = 0;
+ opt->rr_needaddr = 0;
+ opt->ts_needaddr = 0;
+ opt->ts_needtime = 0;
+}
+
+/* helper used by ip_options_compile() to call fib_compute_spec_dst()
+ * at most one time.
+ */
+static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
+{
+ if (*spec_dst == htonl(INADDR_ANY))
+ *spec_dst = fib_compute_spec_dst(skb);
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb)
+{
+ __be32 spec_dst = htonl(INADDR_ANY);
+ unsigned char *pp_ptr = NULL;
+ struct rtable *rt = NULL;
+ unsigned char *optptr;
+ unsigned char *iph;
+ int optlen, l;
+
+ if (skb) {
+ rt = skb_rtable(skb);
+ optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+ } else
+ optptr = opt->__data;
+ iph = optptr - sizeof(struct iphdr);
+
+ for (l = opt->optlen; l > 0; ) {
+ switch (*optptr) {
+ case IPOPT_END:
+ for (optptr++, l--; l > 0; optptr++, l--) {
+ if (*optptr != IPOPT_END) {
+ *optptr = IPOPT_END;
+ opt->is_changed = 1;
+ }
+ }
+ goto eol;
+ case IPOPT_NOOP:
+ l--;
+ optptr++;
+ continue;
+ }
+ if (unlikely(l < 2)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ optlen = optptr[1];
+ if (optlen < 2 || optlen > l) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ switch (*optptr) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ /* NB: cf RFC-1812 5.2.4.1 */
+ if (opt->srr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (!skb) {
+ if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ memcpy(&opt->faddr, &optptr[3], 4);
+ if (optlen > 7)
+ memmove(&optptr[3], &optptr[7], optlen-7);
+ }
+ opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+ opt->srr = optptr - iph;
+ break;
+ case IPOPT_RR:
+ if (opt->rr) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 3) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 4) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ opt->is_changed = 1;
+ }
+ optptr[2] += 4;
+ opt->rr_needaddr = 1;
+ }
+ opt->rr = optptr - iph;
+ break;
+ case IPOPT_TIMESTAMP:
+ if (opt->ts) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] < 5) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (optptr[2] <= optlen) {
+ unsigned char *timeptr = NULL;
+ if (optptr[2]+3 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ switch (optptr[3]&0xF) {
+ case IPOPT_TS_TSONLY:
+ if (skb)
+ timeptr = &optptr[optptr[2]-1];
+ opt->ts_needtime = 1;
+ optptr[2] += 4;
+ break;
+ case IPOPT_TS_TSANDADDR:
+ if (optptr[2]+7 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ if (rt) {
+ spec_dst_fill(&spec_dst, skb);
+ memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
+ timeptr = &optptr[optptr[2]+3];
+ }
+ opt->ts_needaddr = 1;
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ case IPOPT_TS_PRESPEC:
+ if (optptr[2]+7 > optlen) {
+ pp_ptr = optptr + 2;
+ goto error;
+ }
+ {
+ __be32 addr;
+ memcpy(&addr, &optptr[optptr[2]-1], 4);
+ if (inet_addr_type(net, addr) == RTN_UNICAST)
+ break;
+ if (skb)
+ timeptr = &optptr[optptr[2]+3];
+ }
+ opt->ts_needtime = 1;
+ optptr[2] += 8;
+ break;
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ break;
+ }
+ if (timeptr) {
+ struct timespec tv;
+ u32 midtime;
+ getnstimeofday(&tv);
+ midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
+ put_unaligned_be32(midtime, timeptr);
+ opt->is_changed = 1;
+ }
+ } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+ unsigned int overflow = optptr[3]>>4;
+ if (overflow == 15) {
+ pp_ptr = optptr + 3;
+ goto error;
+ }
+ if (skb) {
+ optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+ opt->is_changed = 1;
+ }
+ }
+ opt->ts = optptr - iph;
+ break;
+ case IPOPT_RA:
+ if (optlen < 4) {
+ pp_ptr = optptr + 1;
+ goto error;
+ }
+ if (optptr[2] == 0 && optptr[3] == 0)
+ opt->router_alert = optptr - iph;
+ break;
+ case IPOPT_CIPSO:
+ if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ opt->cipso = optptr - iph;
+ if (cipso_v4_validate(skb, &optptr)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ case IPOPT_SEC:
+ case IPOPT_SID:
+ default:
+ if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ pp_ptr = optptr;
+ goto error;
+ }
+ break;
+ }
+ l -= optlen;
+ optptr += optlen;
+ }
+
+eol:
+ if (!pp_ptr)
+ return 0;
+
+error:
+ if (skb) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ip_options_compile);
+
+/*
+ * Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options *opt)
+{
+ if (opt->srr) {
+ unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
+ memmove(optptr+7, optptr+3, optptr[1]-7);
+ memcpy(optptr+3, &opt->faddr, 4);
+ }
+ if (opt->rr_needaddr) {
+ unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ if (opt->ts) {
+ unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ if (opt->ts_needtime) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ optptr[2] -= 4;
+ }
+ if (opt->ts_needaddr) {
+ optptr[2] -= 4;
+ memset(&optptr[optptr[2]-1], 0, 4);
+ }
+ }
+}
+
+static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
+{
+ return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+}
+
+static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
+ struct ip_options_rcu *opt, int optlen)
+{
+ while (optlen & 3)
+ opt->opt.__data[optlen++] = IPOPT_END;
+ opt->opt.optlen = optlen;
+ if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
+ kfree(opt);
+ return -EINVAL;
+ }
+ kfree(*optp);
+ *optp = opt;
+ return 0;
+}
+
+int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
+ unsigned char __user *data, int optlen)
+{
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ return ip_options_get_finish(net, optp, opt, optlen);
+}
+
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+ unsigned char *data, int optlen)
+{
+ struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen)
+ memcpy(opt->opt.__data, data, optlen);
+ return ip_options_get_finish(net, optp, opt, optlen);
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ unsigned char *optptr;
+ struct rtable *rt = skb_rtable(skb);
+ unsigned char *raw = skb_network_header(skb);
+
+ if (opt->rr_needaddr) {
+ optptr = (unsigned char *)raw + opt->rr;
+ ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
+ opt->is_changed = 1;
+ }
+ if (opt->srr_is_hit) {
+ int srrptr, srrspace;
+
+ optptr = raw + opt->srr;
+
+ for ( srrptr = optptr[2], srrspace = optptr[1];
+ srrptr <= srrspace;
+ srrptr += 4
+ ) {
+ if (srrptr + 3 > srrspace)
+ break;
+ if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
+ break;
+ }
+ if (srrptr + 3 <= srrspace) {
+ opt->is_changed = 1;
+ ip_hdr(skb)->daddr = opt->nexthop;
+ ip_rt_get_source(&optptr[srrptr-1], skb, rt);
+ optptr[2] = srrptr+4;
+ } else {
+ net_crit_ratelimited("%s(): Argh! Destination lost!\n",
+ __func__);
+ }
+ if (opt->ts_needaddr) {
+ optptr = raw + opt->ts;
+ ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
+ opt->is_changed = 1;
+ }
+ }
+ if (opt->is_changed) {
+ opt->is_changed = 0;
+ ip_send_check(ip_hdr(skb));
+ }
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+ int srrspace, srrptr;
+ __be32 nexthop;
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned char *optptr = skb_network_header(skb) + opt->srr;
+ struct rtable *rt = skb_rtable(skb);
+ struct rtable *rt2;
+ unsigned long orefdst;
+ int err;
+
+ if (!rt)
+ return 0;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return -EINVAL;
+ if (rt->rt_type == RTN_UNICAST) {
+ if (!opt->is_strictroute)
+ return 0;
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+ return -EINVAL;
+ }
+ if (rt->rt_type != RTN_LOCAL)
+ return -EINVAL;
+
+ for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+ if (srrptr + 3 > srrspace) {
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+ return -EINVAL;
+ }
+ memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+ orefdst = skb->_skb_refdst;
+ skb_dst_set(skb, NULL);
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ rt2 = skb_rtable(skb);
+ if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+ skb_dst_drop(skb);
+ skb->_skb_refdst = orefdst;
+ return -EINVAL;
+ }
+ refdst_drop(orefdst);
+ if (rt2->rt_type != RTN_LOCAL)
+ break;
+ /* Superfast 8) loopback forward */
+ iph->daddr = nexthop;
+ opt->is_changed = 1;
+ }
+ if (srrptr <= srrspace) {
+ opt->srr_is_hit = 1;
+ opt->nexthop = nexthop;
+ opt->is_changed = 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000..c65b93a7b
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1591 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The Internet Protocol (IP) output module.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Donald Becker, <becker@super.org>
+ * Alan Cox, <Alan.Cox@linux.org>
+ * Richard Underwood
+ * Stefan Becker, <stefanb@yello.ping.de>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * See ip_input.c for original log
+ *
+ * Fixes:
+ * Alan Cox : Missing nonblock feature in ip_build_xmit.
+ * Mike Kilburn : htons() missing in ip_build_xmit.
+ * Bradford Johnson: Fix faulty handling of some frames when
+ * no route is found.
+ * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
+ * (in case if packet not accepted by
+ * output firewall rules)
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Andi Kleen : Replace ip_reply with ip_send_reply.
+ * Andi Kleen : Split fast and slow ip_build_xmit path
+ * for decreased register pressure on x86
+ * and more readibility.
+ * Marc Boucher : When call_out_firewall returns FW_QUEUE,
+ * silently drop skb instead of failing with -EPERM.
+ * Detlev Wengorz : Copy protocol for fragments.
+ * Hirokazu Takahashi: HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi: sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+#include <linux/tcp.h>
+
+int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+
+/* Generate a checksum for an outgoing IP datagram. */
+void ip_send_check(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+EXPORT_SYMBOL(ip_send_check);
+
+int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb, NULL,
+ skb_dst(skb)->dev, dst_output_sk);
+}
+
+int __ip_local_out(struct sk_buff *skb)
+{
+ return __ip_local_out_sk(skb->sk, skb);
+}
+
+int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = __ip_local_out(skb);
+ if (likely(err == 1))
+ err = dst_output_sk(sk, skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_local_out_sk);
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+ int ttl = inet->uc_ttl;
+
+ if (ttl < 0)
+ ttl = ip4_dst_hoplimit(dst);
+ return ttl;
+}
+
+/*
+ * Add an ip header to a skbuff and send it out.
+ *
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph;
+
+ /* Build the IP header. */
+ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = inet->tos;
+ if (ip_dont_fragment(sk, &rt->dst))
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
+ iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
+ iph->saddr = saddr;
+ iph->protocol = sk->sk_protocol;
+ ip_select_ident(sock_net(sk), skb, sk);
+
+ if (opt && opt->opt.optlen) {
+ iph->ihl += opt->opt.optlen>>2;
+ ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ }
+
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ /* Send it out. */
+ return ip_local_out(skb);
+}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
+static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = (struct rtable *)dst;
+ struct net_device *dev = dst->dev;
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ u32 nexthop;
+
+ if (rt->rt_type == RTN_MULTICAST) {
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+ } else if (rt->rt_type == RTN_BROADCAST)
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+ if (!skb2) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+ consume_skb(skb);
+ skb = skb2;
+ }
+
+ rcu_read_lock_bh();
+ nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
+ neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ if (!IS_ERR(neigh)) {
+ int res = dst_neigh_output(dst, neigh, skb);
+
+ rcu_read_unlock_bh();
+ return res;
+ }
+ rcu_read_unlock_bh();
+
+ net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
+ __func__);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
+{
+ netdev_features_t features;
+ struct sk_buff *segs;
+ int ret = 0;
+
+ /* common case: locally created skb or seglen is <= mtu */
+ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
+ skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
+ return ip_finish_output2(sk, skb);
+
+ /* Slowpath - GSO segment length is exceeding the dst MTU.
+ *
+ * This can happen in two cases:
+ * 1) TCP GRO packet, DF bit not set
+ * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
+ * from host network stack.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = ip_fragment(sk, segs, ip_finish_output2);
+
+ if (err && ret == 0)
+ ret = err;
+ segs = nskb;
+ } while (segs);
+
+ return ret;
+}
+
+static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
+{
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+ /* Policy lookup after SNAT yielded a new policy */
+ if (skb_dst(skb)->xfrm) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output_sk(sk, skb);
+ }
+#endif
+ if (skb_is_gso(skb))
+ return ip_finish_output_gso(sk, skb);
+
+ if (skb->len > ip_skb_dst_mtu(skb))
+ return ip_fragment(sk, skb, ip_finish_output2);
+
+ return ip_finish_output2(sk, skb);
+}
+
+int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+
+ /*
+ * If the indicated interface is up and running, send the packet.
+ */
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ /*
+ * Multicasts are looped back for other local users
+ */
+
+ if (rt->rt_flags&RTCF_MULTICAST) {
+ if (sk_mc_loop(sk)
+#ifdef CONFIG_IP_MROUTE
+ /* Small optimization: do not loopback not local frames,
+ which returned after forwarding; they will be dropped
+ by ip_mr_input in any case.
+ Note, that local frames are looped back to be delivered
+ to local recipients.
+
+ This check is duplicated in ip_mr_input at the moment.
+ */
+ &&
+ ((rt->rt_flags & RTCF_LOCAL) ||
+ !(IPCB(skb)->flags & IPSKB_FORWARDED))
+#endif
+ ) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
+ }
+
+ /* Multicasts with ttl 0 must not go beyond the host */
+
+ if (ip_hdr(skb)->ttl == 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ if (rt->rt_flags&RTCF_BROADCAST) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, newskb,
+ NULL, newskb->dev, dev_loopback_xmit);
+ }
+
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb, NULL,
+ skb->dev, ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+int ip_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+
+ IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
+ NULL, dev,
+ ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+/*
+ * copy saddr and daddr, possibly using 64bit load/stores
+ * Equivalent to :
+ * iph->saddr = fl4->saddr;
+ * iph->daddr = fl4->daddr;
+ */
+static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
+{
+ BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
+ offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
+ memcpy(&iph->saddr, &fl4->saddr,
+ sizeof(fl4->saddr) + sizeof(fl4->daddr));
+}
+
+/* Note: skb->sk can be different from sk, in case of tunnels */
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ struct iphdr *iph;
+ int res;
+
+ /* Skip all of this if the packet is already routed,
+ * f.e. by something like SCTP.
+ */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ fl4 = &fl->u.ip4;
+ rt = skb_rtable(skb);
+ if (rt)
+ goto packet_routed;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *)__sk_dst_check(sk, 0);
+ if (!rt) {
+ __be32 daddr;
+
+ /* Use correct destination address if we have options. */
+ daddr = inet->inet_daddr;
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport,
+ inet->inet_sport,
+ sk->sk_protocol,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
+ sk_setup_caps(sk, &rt->dst);
+ }
+ skb_dst_set_noref(skb, &rt->dst);
+
+packet_routed:
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
+ goto no_route;
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
+ iph->frag_off = htons(IP_DF);
+ else
+ iph->frag_off = 0;
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
+ iph->protocol = sk->sk_protocol;
+ ip_copy_addrs(iph, fl4);
+
+ /* Transport layer set skb->h.foo itself. */
+
+ if (inet_opt && inet_opt->opt.optlen) {
+ iph->ihl += inet_opt->opt.optlen >> 2;
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ }
+
+ ip_select_ident_segs(sock_net(sk), skb, sk,
+ skb_shinfo(skb)->gso_segs ?: 1);
+
+ /* TODO : should we use skb->sk here instead of sk ? */
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ res = ip_local_out(skb);
+ rcu_read_unlock();
+ return res;
+
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+ to->pkt_type = from->pkt_type;
+ to->priority = from->priority;
+ to->protocol = from->protocol;
+ skb_dst_drop(to);
+ skb_dst_copy(to, from);
+ to->dev = from->dev;
+ to->mark = from->mark;
+
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+#endif
+ nf_copy(to, from);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+ to->ipvs_property = from->ipvs_property;
+#endif
+ skb_copy_secmark(to, from);
+}
+
+/*
+ * This IP datagram is too large to be sent in one piece. Break it up into
+ * smaller pieces (each of size equal to IP header plus
+ * a block of the data of the original IP data part) that will yet fit in a
+ * single device frame, and queue such a frame for sending.
+ */
+
+int ip_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ struct iphdr *iph;
+ int ptr;
+ struct net_device *dev;
+ struct sk_buff *skb2;
+ unsigned int mtu, hlen, left, len, ll_rs;
+ int offset;
+ __be16 not_last_frag;
+ struct rtable *rt = skb_rtable(skb);
+ int err = 0;
+
+ dev = rt->dst.dev;
+
+ /*
+ * Point into the IP datagram header.
+ */
+
+ iph = ip_hdr(skb);
+
+ mtu = ip_skb_dst_mtu(skb);
+ if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
+ (IPCB(skb)->frag_max_size &&
+ IPCB(skb)->frag_max_size > mtu))) {
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /*
+ * Setup starting values.
+ */
+
+ hlen = iph->ihl * 4;
+ mtu = mtu - hlen; /* Size of data space */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge)
+ mtu -= nf_bridge_mtu_reduction(skb);
+#endif
+ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+
+ /* When frag_list is given, use it. First, check its validity:
+ * some transformers could create wrong frag_list or break existing
+ * one, it is not prohibited. In this case fall back to copying.
+ *
+ * LATER: this step can be merged to real generation of fragments,
+ * we can switch to copy when see the first bad fragment.
+ */
+ if (skb_has_frag_list(skb)) {
+ struct sk_buff *frag, *frag2;
+ int first_len = skb_pagelen(skb);
+
+ if (first_len - hlen > mtu ||
+ ((first_len - hlen) & 7) ||
+ ip_is_fragment(iph) ||
+ skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ /* Correct geometry. */
+ if (frag->len > mtu ||
+ ((frag->len & 7) && frag->next) ||
+ skb_headroom(frag) < hlen)
+ goto slow_path_clean;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag))
+ goto slow_path_clean;
+
+ BUG_ON(frag->sk);
+ if (skb->sk) {
+ frag->sk = skb->sk;
+ frag->destructor = sock_wfree;
+ }
+ skb->truesize -= frag->truesize;
+ }
+
+ /* Everything is OK. Generate! */
+
+ err = 0;
+ offset = 0;
+ frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down. */
+ if (frag) {
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iph = ip_hdr(frag);
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ if (offset == 0)
+ ip_options_fragment(frag);
+ offset += skb->len - hlen;
+ iph->frag_off = htons(offset>>3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+ }
+
+ err = output(sk, skb);
+
+ if (!err)
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ if (err || !frag)
+ break;
+
+ skb = frag;
+ frag = skb->next;
+ skb->next = NULL;
+ }
+
+ if (err == 0) {
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ return 0;
+ }
+
+ while (frag) {
+ skb = frag->next;
+ kfree_skb(frag);
+ frag = skb;
+ }
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
+ }
+
+slow_path:
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
+ goto fail;
+ iph = ip_hdr(skb);
+
+ left = skb->len - hlen; /* Space per frame */
+ ptr = hlen; /* Where to start from */
+
+ ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
+
+ /*
+ * Fragment the datagram.
+ */
+
+ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ not_last_frag = iph->frag_off & htons(IP_MF);
+
+ /*
+ * Keep copying data until we run out.
+ */
+
+ while (left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
+ if (!skb2) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, ll_rs);
+ skb_put(skb2, len + hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
+ BUG();
+ left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((offset >> 3));
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (offset == 0)
+ ip_options_fragment(skb);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (left > 0 || not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+ iph->tot_len = htons(len + hlen);
+
+ ip_send_check(iph);
+
+ err = output(sk, skb2);
+ if (err)
+ goto fail;
+
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+ }
+ consume_skb(skb);
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+ return err;
+
+fail:
+ kfree_skb(skb);
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ return err;
+}
+EXPORT_SYMBOL(ip_fragment);
+
+int
+ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+ struct msghdr *msg = from;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (copy_from_iter(to, len, &msg->msg_iter) != len)
+ return -EFAULT;
+ } else {
+ __wsum csum = 0;
+ if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len)
+ return -EFAULT;
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ip_generic_getfrag);
+
+static inline __wsum
+csum_page(struct page *page, int offset, int copy)
+{
+ char *kaddr;
+ __wsum csum;
+ kaddr = kmap(page);
+ csum = csum_partial(kaddr + offset, copy, 0);
+ kunmap(page);
+ return csum;
+}
+
+static inline int ip_ufo_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int hh_len, int fragheaderlen,
+ int transhdrlen, int maxfraglen, unsigned int flags)
+{
+ struct sk_buff *skb;
+ int err;
+
+ /* There is support for UDP fragmentation offload by network
+ * device, so create one single skb packet containing complete
+ * udp datagram
+ */
+ skb = skb_peek_tail(queue);
+ if (!skb) {
+ skb = sock_alloc_send_skb(sk,
+ hh_len + fragheaderlen + transhdrlen + 20,
+ (flags & MSG_DONTWAIT), &err);
+
+ if (!skb)
+ return err;
+
+ /* reserve space for Hardware header */
+ skb_reserve(skb, hh_len);
+
+ /* create space for UDP/IP header */
+ skb_put(skb, fragheaderlen + transhdrlen);
+
+ /* initialize network header pointer */
+ skb_reset_network_header(skb);
+
+ /* initialize protocol header pointer */
+ skb->transport_header = skb->network_header + fragheaderlen;
+
+ skb->csum = 0;
+
+ __skb_queue_tail(queue, skb);
+ } else if (skb_is_gso(skb)) {
+ goto append;
+ }
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ /* specify the length of each IP datagram fragment */
+ skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+append:
+ return skb_append_datato_frags(sk, skb, getfrag, from,
+ (length - transhdrlen));
+}
+
+static int __ip_append_data(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+
+ struct ip_options *opt = cork->opt;
+ int hh_len;
+ int exthdrlen;
+ int mtu;
+ int copy;
+ int err;
+ int offset = 0;
+ unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
+ int csummode = CHECKSUM_NONE;
+ struct rtable *rt = (struct rtable *)cork->dst;
+ u32 tskey = 0;
+
+ skb = skb_peek_tail(queue);
+
+ exthdrlen = !skb ? rt->dst.header_len : 0;
+ mtu = cork->fragsize;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+
+ if (cork->length + length > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
+ return -EMSGSIZE;
+ }
+
+ /*
+ * transhdrlen > 0 means that this is the first fragment and we wish
+ * it won't be fragmented in the future.
+ */
+ if (transhdrlen &&
+ length + fragheaderlen <= mtu &&
+ rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ !exthdrlen)
+ csummode = CHECKSUM_PARTIAL;
+
+ cork->length += length;
+ if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
+ (sk->sk_type == SOCK_DGRAM)) {
+ err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+ hh_len, fragheaderlen, transhdrlen,
+ maxfraglen, flags);
+ if (err)
+ goto error;
+ return 0;
+ }
+
+ /* So, what's going on in the loop below?
+ *
+ * We use calculated fragment length to generate chained skb,
+ * each of segments is IP fragment ready for sending to network after
+ * adding appropriate IP header.
+ */
+
+ if (!skb)
+ goto alloc_new_skb;
+
+ while (length > 0) {
+ /* Check if the remaining data fits into current packet. */
+ copy = mtu - skb->len;
+ if (copy < length)
+ copy = maxfraglen - skb->len;
+ if (copy <= 0) {
+ char *data;
+ unsigned int datalen;
+ unsigned int fraglen;
+ unsigned int fraggap;
+ unsigned int alloclen;
+ struct sk_buff *skb_prev;
+alloc_new_skb:
+ skb_prev = skb;
+ if (skb_prev)
+ fraggap = skb_prev->len - maxfraglen;
+ else
+ fraggap = 0;
+
+ /*
+ * If remaining data exceeds the mtu,
+ * we know we need more fragment(s).
+ */
+ datalen = length + fraggap;
+ if (datalen > mtu - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen;
+ fraglen = datalen + fragheaderlen;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else
+ alloclen = fraglen;
+
+ alloclen += exthdrlen;
+
+ /* The last fragment gets additional space at tail.
+ * Note, with MSG_MORE we overallocate on fragments,
+ * because we have no idea what fragment will be
+ * the last.
+ */
+ if (datalen == length + fraggap)
+ alloclen += rt->dst.trailer_len;
+
+ if (transhdrlen) {
+ skb = sock_alloc_send_skb(sk,
+ alloclen + hh_len + 15,
+ (flags & MSG_DONTWAIT), &err);
+ } else {
+ skb = NULL;
+ if (atomic_read(&sk->sk_wmem_alloc) <=
+ 2 * sk->sk_sndbuf)
+ skb = sock_wmalloc(sk,
+ alloclen + hh_len + 15, 1,
+ sk->sk_allocation);
+ if (unlikely(!skb))
+ err = -ENOBUFS;
+ }
+ if (!skb)
+ goto error;
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+
+ /*
+ * Find where to start putting bytes.
+ */
+ data = skb_put(skb, fraglen + exthdrlen);
+ skb_set_network_header(skb, exthdrlen);
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ data += fragheaderlen + exthdrlen;
+
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data + transhdrlen, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ data += fraggap;
+ pskb_trim_unique(skb_prev, maxfraglen);
+ }
+
+ copy = datalen - transhdrlen - fraggap;
+ if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset += copy;
+ length -= datalen - fraggap;
+ transhdrlen = 0;
+ exthdrlen = 0;
+ csummode = CHECKSUM_NONE;
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(queue, skb);
+ continue;
+ }
+
+ if (copy > length)
+ copy = length;
+
+ if (!(rt->dst.dev->features&NETIF_F_SG)) {
+ unsigned int off;
+
+ off = skb->len;
+ if (getfrag(from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
+ __skb_trim(skb, off);
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ int i = skb_shinfo(skb)->nr_frags;
+
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto error;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
+ }
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (getfrag(from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
+ }
+ offset += copy;
+ length -= copy;
+ }
+
+ return 0;
+
+error_efault:
+ err = -EFAULT;
+error:
+ cork->length -= length;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+ struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+ struct ip_options_rcu *opt;
+ struct rtable *rt;
+
+ /*
+ * setup for corking.
+ */
+ opt = ipc->opt;
+ if (opt) {
+ if (!cork->opt) {
+ cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+ sk->sk_allocation);
+ if (unlikely(!cork->opt))
+ return -ENOBUFS;
+ }
+ memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+ cork->flags |= IPCORK_OPT;
+ cork->addr = ipc->addr;
+ }
+ rt = *rtp;
+ if (unlikely(!rt))
+ return -EFAULT;
+ /*
+ * We steal reference to this route, caller should not release it
+ */
+ *rtp = NULL;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ cork->dst = &rt->dst;
+ cork->length = 0;
+ cork->ttl = ipc->ttl;
+ cork->tos = ipc->tos;
+ cork->priority = ipc->priority;
+ cork->tx_flags = ipc->tx_flags;
+
+ return 0;
+}
+
+/*
+ * ip_append_data() and ip_append_page() can make one large IP datagram
+ * from many pieces of data. Each pieces will be holded on the socket
+ * until ip_push_pending_frames() is called. Each piece can be a page
+ * or non-page data.
+ *
+ * Not only UDP, other transport protocols - e.g. raw sockets - can use
+ * this interface potentially.
+ *
+ * LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
+ if (err)
+ return err;
+ } else {
+ transhdrlen = 0;
+ }
+
+ return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+ sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags);
+}
+
+ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+ struct rtable *rt;
+ struct ip_options *opt = NULL;
+ struct inet_cork *cork;
+ int hh_len;
+ int mtu;
+ int len;
+ int err;
+ unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
+
+ if (inet->hdrincl)
+ return -EPERM;
+
+ if (flags&MSG_PROBE)
+ return 0;
+
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return -EINVAL;
+
+ cork = &inet->cork.base;
+ rt = (struct rtable *)cork->dst;
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
+
+ if (!(rt->dst.dev->features&NETIF_F_SG))
+ return -EOPNOTSUPP;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+ mtu = cork->fragsize;
+
+ fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+
+ if (cork->length + size > maxnonfragsize - fragheaderlen) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ mtu - (opt ? opt->optlen : 0));
+ return -EMSGSIZE;
+ }
+
+ skb = skb_peek_tail(&sk->sk_write_queue);
+ if (!skb)
+ return -EINVAL;
+
+ cork->length += size;
+ if ((size + skb->len > mtu) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO)) {
+ skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ }
+
+ while (size > 0) {
+ int i;
+
+ if (skb_is_gso(skb))
+ len = size;
+ else {
+
+ /* Check if the remaining data fits into current packet. */
+ len = mtu - skb->len;
+ if (len < size)
+ len = maxfraglen - skb->len;
+ }
+ if (len <= 0) {
+ struct sk_buff *skb_prev;
+ int alloclen;
+
+ skb_prev = skb;
+ fraggap = skb_prev->len - maxfraglen;
+
+ alloclen = fragheaderlen + hh_len + fraggap + 15;
+ skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ err = -ENOBUFS;
+ goto error;
+ }
+
+ /*
+ * Fill in the control structures
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->csum = 0;
+ skb_reserve(skb, hh_len);
+
+ /*
+ * Find where to start putting bytes.
+ */
+ skb_put(skb, fragheaderlen + fraggap);
+ skb_reset_network_header(skb);
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(skb_prev,
+ maxfraglen,
+ skb_transport_header(skb),
+ fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ pskb_trim_unique(skb_prev, maxfraglen);
+ }
+
+ /*
+ * Put the packet on the pending queue.
+ */
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ continue;
+ }
+
+ i = skb_shinfo(skb)->nr_frags;
+ if (len > size)
+ len = size;
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
+ } else if (i < MAX_SKB_FRAGS) {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, len);
+ } else {
+ err = -EMSGSIZE;
+ goto error;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE) {
+ __wsum csum;
+ csum = csum_page(page, offset, len);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+ atomic_add(len, &sk->sk_wmem_alloc);
+ offset += len;
+ size -= len;
+ }
+ return 0;
+
+error:
+ cork->length -= size;
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+static void ip_cork_release(struct inet_cork *cork)
+{
+ cork->flags &= ~IPCORK_OPT;
+ kfree(cork->opt);
+ cork->opt = NULL;
+ dst_release(cork->dst);
+ cork->dst = NULL;
+}
+
+/*
+ * Combined all pending IP fragments on the socket as one IP datagram
+ * and push them out.
+ */
+struct sk_buff *__ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ struct ip_options *opt = NULL;
+ struct rtable *rt = (struct rtable *)cork->dst;
+ struct iphdr *iph;
+ __be16 df = 0;
+ __u8 ttl;
+
+ skb = __skb_dequeue(queue);
+ if (!skb)
+ goto out;
+ tail_skb = &(skb_shinfo(skb)->frag_list);
+
+ /* move skb->data to ip header from ext header */
+ if (skb->data < skb_network_header(skb))
+ __skb_pull(skb, skb_network_offset(skb));
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+ __skb_pull(tmp_skb, skb_network_header_len(skb));
+ *tail_skb = tmp_skb;
+ tail_skb = &(tmp_skb->next);
+ skb->len += tmp_skb->len;
+ skb->data_len += tmp_skb->len;
+ skb->truesize += tmp_skb->truesize;
+ tmp_skb->destructor = NULL;
+ tmp_skb->sk = NULL;
+ }
+
+ /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+ * to fragment the frame generated here. No matter, what transforms
+ * how transforms change size of the packet, it will come out.
+ */
+ skb->ignore_df = ip_sk_ignore_df(sk);
+
+ /* DF bit is set when we want to see DF on outgoing frames.
+ * If ignore_df is set too, we still allow to fragment this frame
+ * locally. */
+ if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ (skb->len <= dst_mtu(&rt->dst) &&
+ ip_dont_fragment(sk, &rt->dst)))
+ df = htons(IP_DF);
+
+ if (cork->flags & IPCORK_OPT)
+ opt = cork->opt;
+
+ if (cork->ttl != 0)
+ ttl = cork->ttl;
+ else if (rt->rt_type == RTN_MULTICAST)
+ ttl = inet->mc_ttl;
+ else
+ ttl = ip_select_ttl(inet, &rt->dst);
+
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->frag_off = df;
+ iph->ttl = ttl;
+ iph->protocol = sk->sk_protocol;
+ ip_copy_addrs(iph, fl4);
+ ip_select_ident(net, skb, sk);
+
+ if (opt) {
+ iph->ihl += opt->optlen>>2;
+ ip_options_build(skb, opt, cork->addr, rt, 0);
+ }
+
+ skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ /*
+ * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+ * on dst refcount
+ */
+ cork->dst = NULL;
+ skb_dst_set(skb, &rt->dst);
+
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
+
+ ip_cork_release(cork);
+out:
+ return skb;
+}
+
+int ip_send_skb(struct net *net, struct sk_buff *skb)
+{
+ int err;
+
+ err = ip_local_out(skb);
+ if (err) {
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+ }
+
+ return err;
+}
+
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+ struct sk_buff *skb;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ return 0;
+
+ /* Netfilter gets whole the not fragmented skb. */
+ return ip_send_skb(sock_net(sk), skb);
+}
+
+/*
+ * Throw away all pending data on the socket.
+ */
+static void __ip_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue_tail(queue)) != NULL)
+ kfree_skb(skb);
+
+ ip_cork_release(cork);
+}
+
+void ip_flush_pending_frames(struct sock *sk)
+{
+ __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+ struct flowi4 *fl4,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ struct ipcm_cookie *ipc, struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_cork cork;
+ struct sk_buff_head queue;
+ int err;
+
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ cork.flags = 0;
+ cork.addr = 0;
+ cork.opt = NULL;
+ err = ip_setup_cork(sk, &cork, ipc, rtp);
+ if (err)
+ return ERR_PTR(err);
+
+ err = __ip_append_data(sk, fl4, &queue, &cork,
+ &current->task_frag, getfrag,
+ from, length, transhdrlen, flags);
+ if (err) {
+ __ip_flush_pending_frames(sk, &queue, &cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip_make_skb(sk, fl4, &queue, &cork);
+}
+
+/*
+ * Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+ int len, int odd, struct sk_buff *skb)
+{
+ __wsum csum;
+
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ return 0;
+}
+
+/*
+ * Generic function to send a packet as reply to another packet.
+ * Used to send some TCP resets/acks so far.
+ */
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+ const struct ip_options *sopt,
+ __be32 daddr, __be32 saddr,
+ const struct ip_reply_arg *arg,
+ unsigned int len)
+{
+ struct ip_options_data replyopts;
+ struct ipcm_cookie ipc;
+ struct flowi4 fl4;
+ struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
+ struct sk_buff *nskb;
+ int err;
+
+ if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
+ return;
+
+ ipc.addr = daddr;
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ if (replyopts.opt.opt.optlen) {
+ ipc.opt = &replyopts.opt;
+
+ if (replyopts.opt.opt.srr)
+ daddr = replyopts.opt.opt.faddr;
+ }
+
+ flowi4_init_output(&fl4, arg->bound_dev_if,
+ IP4_REPLY_MARK(net, skb->mark),
+ RT_TOS(arg->tos),
+ RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
+ ip_reply_arg_flowi_flags(arg),
+ daddr, saddr,
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return;
+
+ inet_sk(sk)->tos = arg->tos;
+
+ sk->sk_priority = skb->priority;
+ sk->sk_protocol = ip_hdr(skb)->protocol;
+ sk->sk_bound_dev_if = arg->bound_dev_if;
+ sk->sk_sndbuf = sysctl_wmem_default;
+ err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
+ len, 0, &ipc, &rt, MSG_DONTWAIT);
+ if (unlikely(err)) {
+ ip_flush_pending_frames(sk);
+ goto out;
+ }
+
+ nskb = skb_peek(&sk->sk_write_queue);
+ if (nskb) {
+ if (arg->csumoffset >= 0)
+ *((__sum16 *)skb_transport_header(nskb) +
+ arg->csumoffset) = csum_fold(csum_add(nskb->csum,
+ arg->csum));
+ nskb->ip_summed = CHECKSUM_NONE;
+ skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
+ ip_push_pending_frames(sk, &fl4);
+ }
+out:
+ ip_rt_put(rt);
+}
+
+void __init ip_init(void)
+{
+ ip_rt_init();
+ inet_initpeers();
+
+#if defined(CONFIG_IP_MULTICAST)
+ igmp_mc_init();
+#endif
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000..6ddde8999
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1544 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The IP to API glue.
+ *
+ * Authors: see ip.c
+ *
+ * Fixes:
+ * Many : Split from ip.c , see ip.c for history.
+ * Martin Mares : TOS setting fixed.
+ * Alan Cox : Fixed a couple of oopses in Martin's
+ * TOS tweaks.
+ * Mike McLagan : Routing by source
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp_states.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/inet_ecn.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+#include <net/checksum.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/ip_fib.h>
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+/*
+ * SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct in_pktinfo info = *PKTINFO_SKB_CB(skb);
+
+ info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
+
+ put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+ int ttl = ip_hdr(skb)->ttl;
+ put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+ put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen,
+ ip_hdr(skb) + 1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+
+ if (IPCB(skb)->opt.optlen == 0)
+ return;
+
+ if (ip_options_echo(opt, skb)) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+ ip_options_undo(opt);
+
+ put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
+ int offset)
+{
+ __wsum csum = skb->csum;
+
+ if (skb->ip_summed != CHECKSUM_COMPLETE)
+ return;
+
+ if (offset != 0)
+ csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+
+ put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
+}
+
+static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
+{
+ char *secdata;
+ u32 seclen, secid;
+ int err;
+
+ err = security_socket_getpeersec_dgram(NULL, skb, &secid);
+ if (err)
+ return;
+
+ err = security_secid_to_secctx(secid, &secdata, &seclen);
+ if (err)
+ return;
+
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
+ security_release_secctx(secdata, seclen);
+}
+
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct sockaddr_in sin;
+ const struct iphdr *iph = ip_hdr(skb);
+ __be16 *ports = (__be16 *)skb_transport_header(skb);
+
+ if (skb_transport_offset(skb) + 4 > skb->len)
+ return;
+
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = iph->daddr;
+ sin.sin_port = ports[1];
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
+
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
+ int offset)
+{
+ struct inet_sock *inet = inet_sk(skb->sk);
+ unsigned int flags = inet->cmsg_flags;
+
+ /* Ordered by supposed usage frequency */
+ if (flags & IP_CMSG_PKTINFO) {
+ ip_cmsg_recv_pktinfo(msg, skb);
+
+ flags &= ~IP_CMSG_PKTINFO;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TTL) {
+ ip_cmsg_recv_ttl(msg, skb);
+
+ flags &= ~IP_CMSG_TTL;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_TOS) {
+ ip_cmsg_recv_tos(msg, skb);
+
+ flags &= ~IP_CMSG_TOS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_RECVOPTS) {
+ ip_cmsg_recv_opts(msg, skb);
+
+ flags &= ~IP_CMSG_RECVOPTS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_RETOPTS) {
+ ip_cmsg_recv_retopts(msg, skb);
+
+ flags &= ~IP_CMSG_RETOPTS;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_PASSSEC) {
+ ip_cmsg_recv_security(msg, skb);
+
+ flags &= ~IP_CMSG_PASSSEC;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_ORIGDSTADDR) {
+ ip_cmsg_recv_dstaddr(msg, skb);
+
+ flags &= ~IP_CMSG_ORIGDSTADDR;
+ if (!flags)
+ return;
+ }
+
+ if (flags & IP_CMSG_CHECKSUM)
+ ip_cmsg_recv_checksum(msg, skb, offset);
+}
+EXPORT_SYMBOL(ip_cmsg_recv_offset);
+
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+ bool allow_ipv6)
+{
+ int err, val;
+ struct cmsghdr *cmsg;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (allow_ipv6 &&
+ cmsg->cmsg_level == SOL_IPV6 &&
+ cmsg->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *src_info;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info)))
+ return -EINVAL;
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
+ return -EINVAL;
+ ipc->oif = src_info->ipi6_ifindex;
+ ipc->addr = src_info->ipi6_addr.s6_addr32[3];
+ continue;
+ }
+#endif
+ if (cmsg->cmsg_level != SOL_IP)
+ continue;
+ switch (cmsg->cmsg_type) {
+ case IP_RETOPTS:
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
+ err < 40 ? err : 40);
+ if (err)
+ return err;
+ break;
+ case IP_PKTINFO:
+ {
+ struct in_pktinfo *info;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+ return -EINVAL;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+ ipc->oif = info->ipi_ifindex;
+ ipc->addr = info->ipi_spec_dst.s_addr;
+ break;
+ }
+ case IP_TTL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->ttl = val;
+ break;
+ case IP_TOS:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ ipc->tos = val;
+ ipc->priority = rt_tos2priority(ipc->tos);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+
+/* Special input handler for packets caught by router alert option.
+ They are selected only by protocol field, and then processed likely
+ local ones; but only if someone wants them! Otherwise, router
+ not running rsvpd will kill RSVP.
+
+ It is user level problem, what it will make with them.
+ I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+ but receiver should be enough clever f.e. to forward mtrace requests,
+ sent to multicast group to reach destination designated router.
+ */
+struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
+
+ sock_put(ra->saved_sk);
+ kfree(ra);
+}
+
+int ip_ra_control(struct sock *sk, unsigned char on,
+ void (*destructor)(struct sock *))
+{
+ struct ip_ra_chain *ra, *new_ra;
+ struct ip_ra_chain __rcu **rap;
+
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
+ return -EINVAL;
+
+ new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+ spin_lock_bh(&ip_ra_lock);
+ for (rap = &ip_ra_chain;
+ (ra = rcu_dereference_protected(*rap,
+ lockdep_is_held(&ip_ra_lock))) != NULL;
+ rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (on) {
+ spin_unlock_bh(&ip_ra_lock);
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+ /* dont let ip_call_ra_chain() use sk again */
+ ra->sk = NULL;
+ RCU_INIT_POINTER(*rap, ra->next);
+ spin_unlock_bh(&ip_ra_lock);
+
+ if (ra->destructor)
+ ra->destructor(sk);
+ /*
+ * Delay sock_put(sk) and kfree(ra) after one rcu grace
+ * period. This guarantee ip_call_ra_chain() dont need
+ * to mess with socket refcounts.
+ */
+ ra->saved_sk = sk;
+ call_rcu(&ra->rcu, ip_ra_destroy_rcu);
+ return 0;
+ }
+ }
+ if (!new_ra) {
+ spin_unlock_bh(&ip_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->destructor = destructor;
+
+ RCU_INIT_POINTER(new_ra->next, ra);
+ rcu_assign_pointer(*rap, new_ra);
+ sock_hold(sk);
+ spin_unlock_bh(&ip_ra_lock);
+
+ return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload)
+{
+ struct sock_exterr_skb *serr;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+ serr->ee.ee_type = icmp_hdr(skb)->type;
+ serr->ee.ee_code = icmp_hdr(skb)->code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) -
+ skb_network_header(skb);
+ serr->port = port;
+
+ if (skb_pull(skb, payload - skb->data)) {
+ skb_reset_transport_header(skb);
+ if (sock_queue_err_skb(sk, skb) == 0)
+ return;
+ }
+ kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct iphdr *iph;
+ struct sk_buff *skb;
+
+ if (!inet->recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_put(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->daddr = daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+ serr->port = port;
+
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv4 supports cmsg on all imcp errors and some timestamps
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+ struct sk_buff *skb,
+ int ee_origin)
+{
+ struct in_pktinfo *info;
+
+ if (ee_origin == SO_EE_ORIGIN_ICMP)
+ return true;
+
+ if (ee_origin == SO_EE_ORIGIN_LOCAL)
+ return false;
+
+ /* Support IP_PKTINFO on tstamp packets if requested, to correlate
+ * timestamp with egress dev. Not possible for packets without dev
+ * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+ */
+ if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) ||
+ (!skb->dev))
+ return false;
+
+ info = PKTINFO_SKB_CB(skb);
+ info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr;
+ info->ipi_ifindex = skb->dev->ifindex;
+ return true;
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ WARN_ON_ONCE(sk->sk_family == AF_INET6);
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+
+ if (sin && ipv4_datagram_support_addr(serr)) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+ serr->addr_offset);
+ sin->sin_port = serr->port;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ memset(sin, 0, sizeof(*sin));
+
+ if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ if (inet_sk(sk)->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+
+ put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+
+/*
+ * Socket option code for IP. This is the end of the line after any
+ * TCP,UDP etc options on an IP socket.
+ */
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_MSFILTER:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_UNBLOCK_SOURCE:
+ return true;
+ }
+ return false;
+}
+
+static int do_ip_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val = 0, err;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
+
+ switch (optname) {
+ case IP_PKTINFO:
+ case IP_RECVTTL:
+ case IP_RECVOPTS:
+ case IP_RECVTOS:
+ case IP_RETOPTS:
+ case IP_TOS:
+ case IP_TTL:
+ case IP_HDRINCL:
+ case IP_MTU_DISCOVER:
+ case IP_RECVERR:
+ case IP_ROUTER_ALERT:
+ case IP_FREEBIND:
+ case IP_PASSSEC:
+ case IP_TRANSPARENT:
+ case IP_MINTTL:
+ case IP_NODEFRAG:
+ case IP_UNICAST_IF:
+ case IP_MULTICAST_TTL:
+ case IP_MULTICAST_ALL:
+ case IP_MULTICAST_LOOP:
+ case IP_RECVORIGDSTADDR:
+ case IP_CHECKSUM:
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ } else if (optlen >= sizeof(char)) {
+ unsigned char ucval;
+
+ if (get_user(ucval, (unsigned char __user *) optval))
+ return -EFAULT;
+ val = (int) ucval;
+ }
+ }
+
+ /* If optlen==0, it is equivalent to val == 0 */
+
+ if (ip_mroute_opt(optname))
+ return ip_mroute_setsockopt(sk, optname, optval, optlen);
+
+ err = 0;
+ if (needs_rtnl)
+ rtnl_lock();
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ struct ip_options_rcu *old, *opt = NULL;
+
+ if (optlen > 40)
+ goto e_inval;
+ err = ip_options_get_from_user(sock_net(sk), &opt,
+ optval, optlen);
+ if (err)
+ break;
+ old = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet->is_icsk) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == PF_INET ||
+ (!((1 << sk->sk_state) &
+ (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet->inet_daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (old)
+ icsk->icsk_ext_hdr_len -= old->opt.optlen;
+ if (opt)
+ icsk->icsk_ext_hdr_len += opt->opt.optlen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if IS_ENABLED(CONFIG_IPV6)
+ }
+#endif
+ }
+ rcu_assign_pointer(inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+ break;
+ }
+ case IP_PKTINFO:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PKTINFO;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+ break;
+ case IP_RECVTTL:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TTL;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TTL;
+ break;
+ case IP_RECVTOS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_TOS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_TOS;
+ break;
+ case IP_RECVOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ break;
+ case IP_RETOPTS:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RETOPTS;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+ break;
+ case IP_PASSSEC:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_PASSSEC;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
+ break;
+ case IP_RECVORIGDSTADDR:
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+ break;
+ case IP_CHECKSUM:
+ if (val) {
+ if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
+ inet_inc_convert_csum(sk);
+ inet->cmsg_flags |= IP_CMSG_CHECKSUM;
+ }
+ } else {
+ if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
+ inet_dec_convert_csum(sk);
+ inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
+ }
+ }
+ break;
+ case IP_TOS: /* This sets both TOS and Precedence */
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= inet->tos & INET_ECN_MASK;
+ }
+ if (inet->tos != val) {
+ inet->tos = val;
+ sk->sk_priority = rt_tos2priority(val);
+ sk_dst_reset(sk);
+ }
+ break;
+ case IP_TTL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val != -1 && (val < 1 || val > 255))
+ goto e_inval;
+ inet->uc_ttl = val;
+ break;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->hdrincl = val ? 1 : 0;
+ break;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->nodefrag = val ? 1 : 0;
+ break;
+ case IP_MTU_DISCOVER:
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+ goto e_inval;
+ inet->pmtudisc = val;
+ break;
+ case IP_RECVERR:
+ inet->recverr = !!val;
+ if (!val)
+ skb_queue_purge(&sk->sk_error_queue);
+ break;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ if (optlen < 1)
+ goto e_inval;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->mc_ttl = val;
+ break;
+ case IP_MULTICAST_LOOP:
+ if (optlen < 1)
+ goto e_inval;
+ inet->mc_loop = !!val;
+ break;
+ case IP_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ inet->uc_index = 0;
+ err = 0;
+ break;
+ }
+
+ dev = dev_get_by_index(sock_net(sk), ifindex);
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if)
+ break;
+
+ inet->uc_index = ifindex;
+ err = 0;
+ break;
+ }
+ case IP_MULTICAST_IF:
+ {
+ struct ip_mreqn mreq;
+ struct net_device *dev = NULL;
+
+ if (sk->sk_type == SOCK_STREAM)
+ goto e_inval;
+ /*
+ * Check the arguments are allowable
+ */
+
+ if (optlen < sizeof(struct in_addr))
+ goto e_inval;
+
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (optlen >= sizeof(struct ip_mreq)) {
+ if (copy_from_user(&mreq, optval,
+ sizeof(struct ip_mreq)))
+ break;
+ } else if (optlen >= sizeof(struct in_addr)) {
+ if (copy_from_user(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
+ break;
+ }
+ }
+
+ if (!mreq.imr_ifindex) {
+ if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
+ inet->mc_index = 0;
+ inet->mc_addr = 0;
+ err = 0;
+ break;
+ }
+ dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
+ if (dev)
+ mreq.imr_ifindex = dev->ifindex;
+ } else
+ dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
+
+
+ err = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ err = -EINVAL;
+ if (sk->sk_bound_dev_if &&
+ mreq.imr_ifindex != sk->sk_bound_dev_if)
+ break;
+
+ inet->mc_index = mreq.imr_ifindex;
+ inet->mc_addr = mreq.imr_address.s_addr;
+ err = 0;
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ {
+ struct ip_mreqn mreq;
+
+ err = -EPROTO;
+ if (inet_sk(sk)->is_icsk)
+ break;
+
+ if (optlen < sizeof(struct ip_mreq))
+ goto e_inval;
+ err = -EFAULT;
+ if (optlen >= sizeof(struct ip_mreqn)) {
+ if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ break;
+ } else {
+ memset(&mreq, 0, sizeof(mreq));
+ if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
+ break;
+ }
+
+ if (optname == IP_ADD_MEMBERSHIP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter *msf;
+
+ if (optlen < IP_MSFILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ msf = kmalloc(optlen, GFP_KERNEL);
+ if (!msf) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(msf, optval, optlen)) {
+ kfree(msf);
+ break;
+ }
+ /* numsrc >= (1G-4) overflow in 32 bits */
+ if (msf->imsf_numsrc >= 0x3ffffffcU ||
+ msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ kfree(msf);
+ err = -ENOBUFS;
+ break;
+ }
+ if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+ kfree(msf);
+ err = -EINVAL;
+ break;
+ }
+ err = ip_mc_msfilter(sk, msf, 0);
+ kfree(msf);
+ break;
+ }
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ {
+ struct ip_mreq_source mreqs;
+ int omode, add;
+
+ if (optlen != sizeof(struct ip_mreq_source))
+ goto e_inval;
+ if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (optname == IP_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == IP_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+ struct ip_mreqn mreq;
+
+ mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+ mreq.imr_address.s_addr = mreqs.imr_interface;
+ mreq.imr_ifindex = 0;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err && err != -EADDRINUSE)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs, 0);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct group_req greq;
+ struct sockaddr_in *psin;
+ struct ip_mreqn mreq;
+
+ if (optlen < sizeof(struct group_req))
+ goto e_inval;
+ err = -EFAULT;
+ if (copy_from_user(&greq, optval, sizeof(greq)))
+ break;
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ goto e_inval;
+ memset(&mreq, 0, sizeof(mreq));
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ err = ip_mc_join_group(sk, &mreq);
+ else
+ err = ip_mc_leave_group(sk, &mreq);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add;
+
+ if (optlen != sizeof(struct group_source_req))
+ goto e_inval;
+ if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+ err = -EFAULT;
+ break;
+ }
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ break;
+ }
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group(sk, &mreq);
+ if (err && err != -EADDRINUSE)
+ break;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ err = ip_mc_source(add, omode, sk, &mreqs,
+ greqs.gsr_interface);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct sockaddr_in *psin;
+ struct ip_msfilter *msf = NULL;
+ struct group_filter *gsf = NULL;
+ int msize, i, ifindex;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ err = -ENOBUFS;
+ break;
+ }
+ gsf = kmalloc(optlen, GFP_KERNEL);
+ if (!gsf) {
+ err = -ENOBUFS;
+ break;
+ }
+ err = -EFAULT;
+ if (copy_from_user(gsf, optval, optlen))
+ goto mc_msf_out;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+ err = -EINVAL;
+ goto mc_msf_out;
+ }
+ msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+ msf = kmalloc(msize, GFP_KERNEL);
+ if (!msf) {
+ err = -ENOBUFS;
+ goto mc_msf_out;
+ }
+ ifindex = gsf->gf_interface;
+ psin = (struct sockaddr_in *)&gsf->gf_group;
+ if (psin->sin_family != AF_INET) {
+ err = -EADDRNOTAVAIL;
+ goto mc_msf_out;
+ }
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = gsf->gf_fmode;
+ msf->imsf_numsrc = gsf->gf_numsrc;
+ err = -EADDRNOTAVAIL;
+ for (i = 0; i < gsf->gf_numsrc; ++i) {
+ psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+ if (psin->sin_family != AF_INET)
+ goto mc_msf_out;
+ msf->imsf_slist[i] = psin->sin_addr.s_addr;
+ }
+ kfree(gsf);
+ gsf = NULL;
+
+ err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+ kfree(msf);
+ kfree(gsf);
+ break;
+ }
+ case IP_MULTICAST_ALL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val != 0 && val != 1)
+ goto e_inval;
+ inet->mc_all = val;
+ break;
+ case IP_ROUTER_ALERT:
+ err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ break;
+
+ case IP_FREEBIND:
+ if (optlen < 1)
+ goto e_inval;
+ inet->freebind = !!val;
+ break;
+
+ case IP_IPSEC_POLICY:
+ case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ break;
+ err = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ case IP_TRANSPARENT:
+ if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (optlen < 1)
+ goto e_inval;
+ inet->transparent = !!val;
+ break;
+
+ case IP_MINTTL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->min_ttl = val;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
+
+e_inval:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+}
+
+/**
+ * ipv4_pktinfo_prepare - transfer some info from rtable to skb
+ * @sk: socket
+ * @skb: buffer
+ *
+ * To support IP_CMSG_PKTINFO option, we store rt_iif and specific
+ * destination in skb->cb[] before dst drop.
+ * This way, receiver doesn't make cache line misses to read rtable.
+ */
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+{
+ struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
+ bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ ipv6_sk_rxinfo(sk);
+
+ if (prepare && skb_rtable(skb)) {
+ pktinfo->ipi_ifindex = inet_iif(skb);
+ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
+ } else {
+ pktinfo->ipi_ifindex = 0;
+ pktinfo->ipi_spec_dst.s_addr = 0;
+ }
+ skb_dst_drop(skb);
+}
+
+int ip_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
+ err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+ optname != IP_IPSEC_POLICY &&
+ optname != IP_XFRM_POLICY &&
+ !ip_mroute_opt(optname)) {
+ lock_sock(sk);
+ err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ip_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ip_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level != SOL_IP)
+ return -ENOPROTOOPT;
+
+ if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
+ return compat_mc_setsockopt(sk, level, optname, optval, optlen,
+ ip_setsockopt);
+
+ err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+ optname != IP_IPSEC_POLICY &&
+ optname != IP_XFRM_POLICY &&
+ !ip_mroute_opt(optname)) {
+ lock_sock(sk);
+ err = compat_nf_setsockopt(sk, PF_INET, optname,
+ optval, optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ip_setsockopt);
+#endif
+
+/*
+ * Get the options. Note for future reference. The GET of IP options gets
+ * the _received_ ones. The set sets the _sent_ ones.
+ */
+
+static int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen, unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int val;
+ int len;
+
+ if (level != SOL_IP)
+ return -EOPNOTSUPP;
+
+ if (ip_mroute_opt(optname))
+ return ip_mroute_getsockopt(sk, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case IP_OPTIONS:
+ {
+ unsigned char optbuf[sizeof(struct ip_options)+40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ opt->optlen = 0;
+ if (inet_opt)
+ memcpy(optbuf, &inet_opt->opt,
+ sizeof(struct ip_options) +
+ inet_opt->opt.optlen);
+ release_sock(sk);
+
+ if (opt->optlen == 0)
+ return put_user(0, optlen);
+
+ ip_options_undo(opt);
+
+ len = min_t(unsigned int, len, opt->optlen);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, opt->__data, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_PKTINFO:
+ val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ break;
+ case IP_RECVTTL:
+ val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+ break;
+ case IP_RECVTOS:
+ val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+ break;
+ case IP_RECVOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ break;
+ case IP_RETOPTS:
+ val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ break;
+ case IP_PASSSEC:
+ val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
+ break;
+ case IP_RECVORIGDSTADDR:
+ val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+ break;
+ case IP_CHECKSUM:
+ val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
+ break;
+ case IP_TOS:
+ val = inet->tos;
+ break;
+ case IP_TTL:
+ val = (inet->uc_ttl == -1 ?
+ sysctl_ip_default_ttl :
+ inet->uc_ttl);
+ break;
+ case IP_HDRINCL:
+ val = inet->hdrincl;
+ break;
+ case IP_NODEFRAG:
+ val = inet->nodefrag;
+ break;
+ case IP_MTU_DISCOVER:
+ val = inet->pmtudisc;
+ break;
+ case IP_MTU:
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst_mtu(dst);
+ dst_release(dst);
+ }
+ if (!val) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+ break;
+ }
+ case IP_RECVERR:
+ val = inet->recverr;
+ break;
+ case IP_MULTICAST_TTL:
+ val = inet->mc_ttl;
+ break;
+ case IP_MULTICAST_LOOP:
+ val = inet->mc_loop;
+ break;
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) inet->uc_index);
+ break;
+ case IP_MULTICAST_IF:
+ {
+ struct in_addr addr;
+ len = min_t(unsigned int, len, sizeof(struct in_addr));
+ addr.s_addr = inet->mc_addr;
+ release_sock(sk);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &addr, len))
+ return -EFAULT;
+ return 0;
+ }
+ case IP_MSFILTER:
+ {
+ struct ip_msfilter msf;
+ int err;
+
+ if (len < IP_MSFILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_msfget(sk, &msf,
+ (struct ip_msfilter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct group_filter gsf;
+ int err;
+
+ if (len < GROUP_FILTER_SIZE(0)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+ release_sock(sk);
+ return -EFAULT;
+ }
+ err = ip_mc_gsfget(sk, &gsf,
+ (struct group_filter __user *)optval,
+ optlen);
+ release_sock(sk);
+ return err;
+ }
+ case IP_MULTICAST_ALL:
+ val = inet->mc_all;
+ break;
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ release_sock(sk);
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = (__force void *) optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = flags;
+
+ if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+ info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
+ info.ipi_ifindex = inet->mc_index;
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TTL) {
+ int hlim = inet->mc_ttl;
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet->cmsg_flags & IP_CMSG_TOS) {
+ int tos = inet->rcv_tos;
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
+ case IP_FREEBIND:
+ val = inet->freebind;
+ break;
+ case IP_TRANSPARENT:
+ val = inet->transparent;
+ break;
+ case IP_MINTTL:
+ val = inet->min_ttl;
+ break;
+ default:
+ release_sock(sk);
+ return -ENOPROTOOPT;
+ }
+ release_sock(sk);
+
+ if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
+ unsigned char ucval = (unsigned char)val;
+ len = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ucval, 1))
+ return -EFAULT;
+ } else {
+ len = min_t(unsigned int, sizeof(int), len);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+int ip_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, int __user *optlen)
+{
+ int err;
+
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+ !ip_mroute_opt(optname)) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = nf_getsockopt(sk, PF_INET, optname, optval,
+ &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ return err;
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ip_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ip_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int err;
+
+ if (optname == MCAST_MSFILTER)
+ return compat_mc_getsockopt(sk, level, optname, optval, optlen,
+ ip_getsockopt);
+
+ err = do_ip_getsockopt(sk, level, optname, optval, optlen,
+ MSG_CMSG_COMPAT);
+
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+ !ip_mroute_opt(optname)) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ return err;
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ip_getsockopt);
+#endif
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
new file mode 100644
index 000000000..4c2c3ba4b
--- /dev/null
+++ b/net/ipv4/ip_tunnel.c
@@ -0,0 +1,1191 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/udp.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
+{
+ return hash_32((__force u32)key ^ (__force u32)remote,
+ IP_TNL_HASH_BITS);
+}
+
+static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
+ struct dst_entry *dst, __be32 saddr)
+{
+ struct dst_entry *old_dst;
+
+ dst_clone(dst);
+ old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
+ dst_release(old_dst);
+ idst->saddr = saddr;
+}
+
+static noinline void tunnel_dst_set(struct ip_tunnel *t,
+ struct dst_entry *dst, __be32 saddr)
+{
+ __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
+}
+
+static void tunnel_dst_reset(struct ip_tunnel *t)
+{
+ tunnel_dst_set(t, NULL, 0);
+}
+
+void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
+}
+EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
+
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
+ u32 cookie, __be32 *saddr)
+{
+ struct ip_tunnel_dst *idst;
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ idst = raw_cpu_ptr(t->dst_cache);
+ dst = rcu_dereference(idst->dst);
+ if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ dst = NULL;
+ if (dst) {
+ if (!dst->obsolete || dst->ops->check(dst, cookie)) {
+ *saddr = idst->saddr;
+ } else {
+ tunnel_dst_reset(t);
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return (struct rtable *)dst;
+}
+
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+ __be16 flags, __be32 key)
+{
+ if (p->i_flags & TUNNEL_KEY) {
+ if (flags & TUNNEL_KEY)
+ return key == p->i_key;
+ else
+ /* key expected, none present */
+ return false;
+ } else
+ return !(flags & TUNNEL_KEY);
+}
+
+/* Fallback tunnel: no source, no destination, no key, no options
+
+ Tunnel hash table:
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ Given src, dst and key, find appropriate for input tunnel.
+*/
+struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
+ int link, __be16 flags,
+ __be32 remote, __be32 local,
+ __be32 key)
+{
+ unsigned int hash;
+ struct ip_tunnel *t, *cand = NULL;
+ struct hlist_head *head;
+
+ hash = ip_tunnel_hash(key, remote);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local != t->parms.iph.saddr ||
+ remote != t->parms.iph.daddr ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else
+ cand = t;
+ }
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (remote != t->parms.iph.daddr ||
+ t->parms.iph.saddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ hash = ip_tunnel_hash(key, 0);
+ head = &itn->tunnels[hash];
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
+ (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
+ continue;
+
+ if (!(t->dev->flags & IFF_UP))
+ continue;
+
+ if (!ip_tunnel_key_match(&t->parms, flags, key))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+ if (flags & TUNNEL_NO_KEY)
+ goto skip_key_lookup;
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (t->parms.i_key != key ||
+ t->parms.iph.saddr != 0 ||
+ t->parms.iph.daddr != 0 ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->parms.link == link)
+ return t;
+ else if (!cand)
+ cand = t;
+ }
+
+skip_key_lookup:
+ if (cand)
+ return cand;
+
+ if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
+ return netdev_priv(itn->fb_tunnel_dev);
+
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
+
+static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ unsigned int h;
+ __be32 remote;
+ __be32 i_key = parms->i_key;
+
+ if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
+ remote = parms->iph.daddr;
+ else
+ remote = 0;
+
+ if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ i_key = 0;
+
+ h = ip_tunnel_hash(i_key, remote);
+ return &itn->tunnels[h];
+}
+
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ struct hlist_head *head = ip_bucket(itn, &t->parms);
+
+ hlist_add_head_rcu(&t->hash_node, head);
+}
+
+static void ip_tunnel_del(struct ip_tunnel *t)
+{
+ hlist_del_init_rcu(&t->hash_node);
+}
+
+static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms,
+ int type)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ __be32 key = parms->i_key;
+ __be16 flags = parms->i_flags;
+ int link = parms->link;
+ struct ip_tunnel *t = NULL;
+ struct hlist_head *head = ip_bucket(itn, parms);
+
+ hlist_for_each_entry_rcu(t, head, hash_node) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ link == t->parms.link &&
+ type == t->dev->type &&
+ ip_tunnel_key_match(&t->parms, flags, key))
+ break;
+ }
+ return t;
+}
+
+static struct net_device *__ip_tunnel_create(struct net *net,
+ const struct rtnl_link_ops *ops,
+ struct ip_tunnel_parm *parms)
+{
+ int err;
+ struct ip_tunnel *tunnel;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else {
+ if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
+ err = -E2BIG;
+ goto failed;
+ }
+ strlcpy(name, ops->kind, IFNAMSIZ);
+ strncat(name, "%d", 2);
+ }
+
+ ASSERT_RTNL();
+ dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
+ if (!dev) {
+ err = -ENOMEM;
+ goto failed;
+ }
+ dev_net_set(dev, net);
+
+ dev->rtnl_link_ops = ops;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms = *parms;
+ tunnel->net = net;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto failed_free;
+
+ return dev;
+
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ RT_TOS(iph->tos), tunnel->parms.link);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+static struct ip_tunnel *ip_tunnel_create(struct net *net,
+ struct ip_tunnel_net *itn,
+ struct ip_tunnel_parm *parms)
+{
+ struct ip_tunnel *nt;
+ struct net_device *dev;
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ if (IS_ERR(dev))
+ return ERR_CAST(dev);
+
+ dev->mtu = ip_tunnel_bind_dev(dev);
+
+ nt = netdev_priv(dev);
+ ip_tunnel_add(itn, nt);
+ return nt;
+}
+
+int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi, bool log_ecn_error)
+{
+ struct pcpu_sw_netstats *tstats;
+ const struct iphdr *iph = ip_hdr(skb);
+ int err;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+ if (ipv4_is_multicast(iph->daddr)) {
+ tunnel->dev->stats.multicast++;
+ skb->pkt_type = PACKET_BROADCAST;
+ }
+#endif
+
+ if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
+ ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ if (tunnel->parms.i_flags&TUNNEL_SEQ) {
+ if (!(tpi->flags&TUNNEL_SEQ) ||
+ (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = ntohl(tpi->seq) + 1;
+ }
+
+ skb_reset_network_header(skb);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
+
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ } else {
+ skb->dev = tunnel->dev;
+ }
+
+ gro_cells_receive(&tunnel->gro_cells, skb);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
+
+static int ip_encap_hlen(struct ip_tunnel_encap *e)
+{
+ const struct ip_tunnel_encap_ops *ops;
+ int hlen = -EINVAL;
+
+ if (e->type == TUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (e->type >= MAX_IPTUN_ENCAP_OPS)
+ return -EINVAL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[e->type]);
+ if (likely(ops && ops->encap_hlen))
+ hlen = ops->encap_hlen(e);
+ rcu_read_unlock();
+
+ return hlen;
+}
+
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+
+int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ return !cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
+
+int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
+ unsigned int num)
+{
+ int ret;
+
+ if (num >= MAX_IPTUN_ENCAP_OPS)
+ return -ERANGE;
+
+ ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
+ &iptun_encaps[num],
+ ops, NULL) == ops) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
+
+int ip_tunnel_encap_setup(struct ip_tunnel *t,
+ struct ip_tunnel_encap *ipencap)
+{
+ int hlen;
+
+ memset(&t->encap, 0, sizeof(t->encap));
+
+ hlen = ip_encap_hlen(ipencap);
+ if (hlen < 0)
+ return hlen;
+
+ t->encap.type = ipencap->type;
+ t->encap.sport = ipencap->sport;
+ t->encap.dport = ipencap->dport;
+ t->encap.flags = ipencap->flags;
+
+ t->encap_hlen = hlen;
+ t->hlen = t->encap_hlen + t->tun_hlen;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
+
+int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ const struct ip_tunnel_encap_ops *ops;
+ int ret = -EINVAL;
+
+ if (t->encap.type == TUNNEL_ENCAP_NONE)
+ return 0;
+
+ if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+ return -EINVAL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(iptun_encaps[t->encap.type]);
+ if (likely(ops && ops->build_header))
+ ret = ops->build_header(skb, &t->encap, protocol, fl4);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(ip_tunnel_encap);
+
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ struct rtable *rt, __be16 df)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int mtu;
+
+ if (df)
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+ - sizeof(struct iphdr) - tunnel->hlen;
+ else
+ mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (!skb_is_gso(skb) &&
+ (df & htons(IP_DF)) && mtu < pkt_size) {
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ return -E2BIG;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+ if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+ mtu >= IPV6_MIN_MTU) {
+ if ((tunnel->parms.iph.daddr &&
+ !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ rt6->rt6i_dst.plen == 128) {
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+ }
+ }
+
+ if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+ mtu < pkt_size) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -E2BIG;
+ }
+ }
+#endif
+ return 0;
+}
+
+void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ const struct iphdr *tnl_params, u8 protocol)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *inner_iph;
+ struct flowi4 fl4;
+ u8 tos, ttl;
+ __be16 df;
+ struct rtable *rt; /* Route to the other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst;
+ int err;
+ bool connected;
+
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ connected = (tunnel->parms.iph.daddr != 0);
+
+ dst = tnl_params->daddr;
+ if (dst == 0) {
+ /* NBMA tunnel */
+
+ if (!skb_dst(skb)) {
+ dev->stats.tx_fifo_errors++;
+ goto tx_error;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ rt = skb_rtable(skb);
+ dst = rt_nexthop(rt, inner_iph->daddr);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct in6_addr *addr6;
+ struct neighbour *neigh;
+ bool do_tx_error_icmp;
+ int addr_type;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_error;
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+ do_tx_error_icmp = true;
+ else {
+ do_tx_error_icmp = false;
+ dst = addr6->s6_addr32[3];
+ }
+ neigh_release(neigh);
+ if (do_tx_error_icmp)
+ goto tx_error_icmp;
+ }
+#endif
+ else
+ goto tx_error;
+
+ connected = false;
+ }
+
+ tos = tnl_params->tos;
+ if (tos & 0x1) {
+ tos &= ~0x1;
+ if (skb->protocol == htons(ETH_P_IP)) {
+ tos = inner_iph->tos;
+ connected = false;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ connected = false;
+ }
+ }
+
+ init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
+ tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
+
+ if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+ goto tx_error;
+
+ rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (connected)
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ }
+
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = tnl_params->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+#endif
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+
+ df = tnl_params->frag_off;
+ if (skb->protocol == htons(ETH_P_IP))
+ df |= (inner_iph->frag_off&htons(IP_DF));
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return;
+ }
+
+ err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
+ tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6)
+tx_error_icmp:
+ dst_link_failure(skb);
+#endif
+tx_error:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
+
+static void ip_tunnel_update(struct ip_tunnel_net *itn,
+ struct ip_tunnel *t,
+ struct net_device *dev,
+ struct ip_tunnel_parm *p,
+ bool set_mtu)
+{
+ ip_tunnel_del(t);
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(dev->broadcast, &p->iph.daddr, 4);
+ }
+ ip_tunnel_add(itn, t);
+
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ t->parms.iph.frag_off = p->iph.frag_off;
+
+ if (t->parms.link != p->link) {
+ int mtu;
+
+ t->parms.link = p->link;
+ mtu = ip_tunnel_bind_dev(dev);
+ if (set_mtu)
+ dev->mtu = mtu;
+ }
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(dev);
+}
+
+int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
+
+ BUG_ON(!itn->fb_tunnel_dev);
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == itn->fb_tunnel_dev) {
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+ memcpy(p, &t->parms, sizeof(*p));
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ if (!(p->i_flags & VTI_ISVTI)) {
+ if (!(p->i_flags & TUNNEL_KEY))
+ p->i_key = 0;
+ if (!(p->o_flags & TUNNEL_KEY))
+ p->o_key = 0;
+ }
+
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+
+ if (cmd == SIOCADDTUNNEL) {
+ if (!t) {
+ t = ip_tunnel_create(net, itn, p);
+ err = PTR_ERR_OR_ZERO(t);
+ break;
+ }
+
+ err = -EEXIST;
+ break;
+ }
+ if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+ err = -EINVAL;
+ break;
+ }
+
+ t = netdev_priv(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+ ip_tunnel_update(itn, t, dev, p, true);
+ } else {
+ err = -ENOENT;
+ }
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == itn->fb_tunnel_dev) {
+ err = -ENOENT;
+ t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(itn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ gro_cells_destroy(&tunnel->gro_cells);
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+
+ if (itn->fb_tunnel_dev != dev) {
+ ip_tunnel_del(netdev_priv(dev));
+ unregister_netdevice_queue(dev, head);
+ }
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
+
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return tunnel->net;
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
+int ip_tunnel_get_iflink(const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ return tunnel->parms.link;
+}
+EXPORT_SYMBOL(ip_tunnel_get_iflink);
+
+int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+ struct ip_tunnel_parm parms;
+ unsigned int i;
+
+ for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&itn->tunnels[i]);
+
+ if (!ops) {
+ itn->fb_tunnel_dev = NULL;
+ return 0;
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ if (devname)
+ strlcpy(parms.name, devname, IFNAMSIZ);
+
+ rtnl_lock();
+ itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ if (!IS_ERR(itn->fb_tunnel_dev)) {
+ itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
+ ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ }
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+ struct rtnl_link_ops *ops)
+{
+ struct net *net = dev_net(itn->fb_tunnel_dev);
+ struct net_device *dev, *aux;
+ int h;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ struct hlist_node *n;
+ struct hlist_head *thead = &itn->tunnels[h];
+
+ hlist_for_each_entry_safe(t, n, thead, hash_node)
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, head);
+ }
+}
+
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list, ops);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+
+int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (ip_tunnel_find(itn, p, dev->type))
+ return -EEXIST;
+
+ nt->net = net;
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
+
+int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *t;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
+
+ if (dev == itn->fb_tunnel_dev)
+ return -EINVAL;
+
+ t = ip_tunnel_find(itn, p, dev->type);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = tunnel;
+
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned int nflags = 0;
+
+ if (ipv4_is_multicast(p->iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p->iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
+ }
+
+ ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
+
+int ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ int err;
+
+ dev->destructor = ip_tunnel_dev_free;
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
+ err = gro_cells_init(&tunnel->gro_cells, dev);
+ if (err) {
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_init);
+
+void ip_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ /* fb_tunnel_dev will be unregisted in net-exit call. */
+ if (itn->fb_tunnel_dev != dev)
+ ip_tunnel_del(netdev_priv(dev));
+
+ ip_tunnel_dst_reset_all(tunnel);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
+
+/* Do least required initialization, rest of init is done in tunnel_init call */
+void ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ tunnel->ip_tnl_net_id = net_id;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
new file mode 100644
index 000000000..ce63ab21b
--- /dev/null
+++ b/net/ipv4/ip_tunnel_core.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+ int pkt_len = skb->len;
+ struct iphdr *iph;
+ int err;
+
+ skb_scrub_packet(skb, xnet);
+
+ skb_clear_hash(skb);
+ skb_dst_set(skb, &rt->dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = dst;
+ iph->saddr = src;
+ iph->ttl = ttl;
+ __ip_select_ident(dev_net(rt->dst.dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
+
+ err = ip_local_out_sk(sk, skb);
+ if (unlikely(net_xmit_eval(err)))
+ pkt_len = 0;
+ return pkt_len;
+}
+EXPORT_SYMBOL_GPL(iptunnel_xmit);
+
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+{
+ if (unlikely(!pskb_may_pull(skb, hdr_len)))
+ return -ENOMEM;
+
+ skb_pull_rcsum(skb, hdr_len);
+
+ if (inner_proto == htons(ETH_P_TEB)) {
+ struct ethhdr *eh;
+
+ if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+ return -ENOMEM;
+
+ eh = (struct ethhdr *)skb->data;
+ if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
+ skb->protocol = eh->h_proto;
+ else
+ skb->protocol = htons(ETH_P_802_2);
+
+ } else {
+ skb->protocol = inner_proto;
+ }
+
+ nf_reset(skb);
+ secpath_reset(skb);
+ skb_clear_hash_if_not_l4(skb);
+ skb_dst_drop(skb);
+ skb->vlan_tci = 0;
+ skb_set_queue_mapping(skb, 0);
+ skb->pkt_type = PACKET_HOST;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+
+struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
+ bool csum_help,
+ int gso_type_mask)
+{
+ int err;
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= gso_type_mask;
+ return skb;
+ }
+
+ /* If packet is not gso and we are resolving any partial checksum,
+ * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
+ * on the outer header without confusing devices that implement
+ * NETIF_F_IP_CSUM with encapsulation.
+ */
+ if (csum_help)
+ skb->encapsulation = 0;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
+
+/* Often modified stats are per cpu, other are shared (netdev->stats) */
+struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *tstats =
+ per_cpu_ptr(dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ tot->multicast = dev->stats.multicast;
+
+ tot->rx_crc_errors = dev->stats.rx_crc_errors;
+ tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
+ tot->rx_length_errors = dev->stats.rx_length_errors;
+ tot->rx_frame_errors = dev->stats.rx_frame_errors;
+ tot->rx_errors = dev->stats.rx_errors;
+
+ tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
+ tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
+ tot->tx_dropped = dev->stats.tx_dropped;
+ tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
+ tot->tx_errors = dev->stats.tx_errors;
+
+ tot->collisions = dev->stats.collisions;
+
+ return tot;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
new file mode 100644
index 000000000..0c152087c
--- /dev/null
+++ b/net/ipv4/ip_vti.c
@@ -0,0 +1,599 @@
+/*
+ * Linux NET3: IP/IP protocol decoder modified to support
+ * virtual tunnel interface
+ *
+ * Authors:
+ * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+#include <linux/icmpv6.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static struct rtnl_link_ops vti_link_ops __read_mostly;
+
+static int vti_net_id __read_mostly;
+static int vti_tunnel_init(struct net_device *dev);
+
+static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+ }
+
+ return -EINVAL;
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int vti_rcv(struct sk_buff *skb)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+}
+
+static int vti_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+ u32 orig_mark = skb->mark;
+ int ret;
+
+ if (!tunnel)
+ return 1;
+
+ dev = tunnel->dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ x = xfrm_input_state(skb);
+ family = x->inner_mode->afinfo->family;
+
+ skb->mark = be32_to_cpu(tunnel->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
+ skb->dev = dev;
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)&dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)&src;
+
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET)
+ return false;
+
+ if (!dst)
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
+ return false;
+
+ return true;
+}
+
+static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
+ struct flowi *fl)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parms = &tunnel->parms;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *tdev; /* Device to other host */
+ int err;
+
+ if (!dst) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+
+ dst_hold(dst);
+ dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+
+ if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
+ dev->stats.tx_carrier_errors++;
+ dst_release(dst);
+ goto tx_error_icmp;
+ }
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ dst_release(dst);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = skb_dst(skb)->dev;
+
+ err = dst_output(skb);
+ if (net_xmit_eval(err) == 0)
+ err = skb->len;
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+/* This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ default:
+ dev->stats.tx_errors++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
+
+ return vti_xmit(skb, dev, &fl);
+}
+
+static int vti4_err(struct sk_buff *skb, u32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip_tunnel *tunnel;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ int protocol = iph->protocol;
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!tunnel)
+ return -1;
+
+ mark = be32_to_cpu(tunnel->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int
+vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5)
+ return -EINVAL;
+ }
+
+ if (!(p.i_flags & GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags & GRE_KEY))
+ p.o_key = 0;
+
+ p.i_flags = VTI_ISVTI;
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (cmd != SIOCDELTUNNEL) {
+ p.i_flags |= GRE_KEY;
+ p.o_flags |= GRE_KEY;
+ }
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+}
+
+static const struct net_device_ops vti_netdev_ops = {
+ .ndo_init = vti_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = vti_tunnel_xmit,
+ .ndo_do_ioctl = vti_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+static void vti_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti_netdev_ops;
+ dev->type = ARPHRD_TUNNEL;
+ ip_tunnel_setup(dev, vti_net_id);
+}
+
+static int vti_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ memcpy(dev->dev_addr, &iph->saddr, 4);
+ memcpy(dev->broadcast, &iph->daddr, 4);
+
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+ dev->mtu = ETH_DATA_LEN;
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_LLTX;
+ netif_keep_dst(dev);
+
+ return ip_tunnel_init(dev);
+}
+
+static void __net_init vti_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+}
+
+static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
+ .handler = vti_rcv,
+ .input_handler = vti_input,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 100,
+};
+
+static int __net_init vti_init_net(struct net *net)
+{
+ int err;
+ struct ip_tunnel_net *itn;
+
+ err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
+ if (err)
+ return err;
+ itn = net_generic(net, vti_net_id);
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
+ return 0;
+}
+
+static void __net_exit vti_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ ip_tunnel_delete_net(itn, &vti_link_ops);
+}
+
+static struct pernet_operations vti_net_ops = {
+ .init = vti_init_net,
+ .exit = vti_exit_net,
+ .id = &vti_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0;
+}
+
+static void vti_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_IPIP;
+
+ if (!data)
+ return;
+
+ parms->i_flags = VTI_ISVTI;
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);
+
+}
+
+static int vti_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm parms;
+
+ vti_netlink_parms(data, &parms);
+ return ip_tunnel_newlink(dev, tb, &parms);
+}
+
+static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+
+ vti_netlink_parms(data, &p);
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t vti_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ nla_put_u32(skb, IFLA_VTI_LINK, p->link);
+ nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
+ nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
+ nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr);
+ nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr);
+
+ return 0;
+}
+
+static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+};
+
+static struct rtnl_link_ops vti_link_ops __read_mostly = {
+ .kind = "vti",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = vti_tunnel_setup,
+ .validate = vti_tunnel_validate,
+ .newlink = vti_newlink,
+ .changelink = vti_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = vti_get_size,
+ .fill_info = vti_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static int __init vti_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPv4 over IPsec tunneling driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&vti_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "tunnel protocols";
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&vti_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return err;
+
+rtnl_link_failed:
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+ pr_err("vti init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit vti_fini(void)
+{
+ rtnl_link_unregister(&vti_link_ops);
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti_net_ops);
+}
+
+module_init(vti_init);
+module_exit(vti_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti");
+MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000..d97f4f278
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,204 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ * - Tunable compression parameters.
+ * - Compression stats.
+ * - Adaptive compression.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+
+static int ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ __be32 spi;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ spi = htonl(ntohs(ipch->cpi));
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET);
+ if (!x)
+ return 0;
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+/* We always hold one tunnel user reference to indicate a tunnel */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ struct xfrm_state *t;
+
+ t = xfrm_state_alloc(net);
+ if (!t)
+ goto out;
+
+ t->id.proto = IPPROTO_IPIP;
+ t->id.spi = x->props.saddr.a4;
+ t->id.daddr.a4 = x->id.daddr.a4;
+ memcpy(&t->sel, &x->sel, sizeof(t->sel));
+ t->props.family = AF_INET;
+ t->props.mode = x->props.mode;
+ t->props.saddr.a4 = x->props.saddr.a4;
+ t->props.flags = x->props.flags;
+ t->props.extra_flags = x->props.extra_flags;
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
+
+ if (xfrm_init_state(t))
+ goto error;
+
+ atomic_set(&t->tunnel_users, 1);
+out:
+ return t;
+
+error:
+ t->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(t);
+ t = NULL;
+ goto out;
+}
+
+/*
+ * Must be protected by xfrm_cfg_mutex. State and tunnel user references are
+ * always incremented on success.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ int err = 0;
+ struct xfrm_state *t;
+ u32 mark = x->mark.v & x->mark.m;
+
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
+ x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+ if (!t) {
+ t = ipcomp_tunnel_create(x);
+ if (!t) {
+ err = -EINVAL;
+ goto out;
+ }
+ xfrm_state_insert(t);
+ xfrm_state_hold(t);
+ }
+ x->tunnel = t;
+ atomic_inc(&t->tunnel_users);
+out:
+ return err;
+}
+
+static int ipcomp4_init_state(struct xfrm_state *x)
+{
+ int err = -EINVAL;
+
+ x->props.header_len = 0;
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct iphdr);
+ break;
+ default:
+ goto out;
+ }
+
+ err = ipcomp_init_state(x);
+ if (err)
+ goto out;
+
+ if (x->props.mode == XFRM_MODE_TUNNEL) {
+ err = ipcomp_tunnel_attach(x);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ipcomp_type = {
+ .description = "IPCOMP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_COMP,
+ .init_state = ipcomp4_init_state,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output
+};
+
+static struct xfrm4_protocol ipcomp4_protocol = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = ipcomp4_rcv_cb,
+ .err_handler = ipcomp4_err,
+ .priority = 0,
+};
+
+static int __init ipcomp4_init(void)
+{
+ if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+ if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000..8e7328c6a
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1693 @@
+/*
+ * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ * user-supplied information to configure own IP address and routes.
+ *
+ * Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Derived from network configuration code in fs/nfs/nfsroot.c,
+ * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ * BOOTP rewritten to construct and analyse packets itself instead
+ * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ * -- MJ, December 1998
+ *
+ * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ * initialization scheme.
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ * DHCP support added. To users this looks like a whole separate
+ * protocol, but we know it's just a bag on the side of BOOTP.
+ * -- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ * Ported DHCP support from 2.2.16 to 2.4.0-test4
+ * -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ * Merged changes from 2.2.19 into 2.4.3
+ * -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ * Multiple Nameservers in /proc/net/pnp
+ * -- Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
+#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
+#define CONF_SEND_RETRIES 6 /* Send six requests per open */
+#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers
+ - '3' from resolv.h */
+
+#define NONE cpu_to_be32(INADDR_NONE)
+#define ANY cpu_to_be32(INADDR_ANY)
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars. If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */
+
+static int ic_enable __initdata; /* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+ | IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+ | IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+ | IC_RARP
+#endif
+ ;
+
+static int ic_host_name_set __initdata; /* Host name set by us? */
+
+__be32 ic_myaddr = NONE; /* My IP address */
+static __be32 ic_netmask = NONE; /* Netmask for local subnet */
+__be32 ic_gateway = NONE; /* Gateway IP address */
+
+__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+
+__be32 ic_servaddr = NONE; /* Boot server IP address */
+
+__be32 root_server_addr = NONE; /* Address of NFS server */
+u8 root_server_path[256] = { 0, }; /* Path to mount as root */
+
+/* vendor class identifier */
+static char vendor_class_identifier[253] __initdata;
+
+/* Persistent data: */
+
+static int ic_proto_used; /* Protocol used, if any */
+static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64]; /* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata;
+
+/* MTU for boot device */
+static int ic_dev_mtu __initdata;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata; /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */
+#endif
+
+
+/*
+ * Network devices
+ */
+
+struct ic_device {
+ struct ic_device *next;
+ struct net_device *dev;
+ unsigned short flags;
+ short able;
+ __be32 xid;
+};
+
+static struct ic_device *ic_first_dev __initdata; /* List of open device */
+static struct net_device *ic_dev __initdata; /* Selected device */
+
+static bool __init ic_is_init_dev(struct net_device *dev)
+{
+ if (dev->flags & IFF_LOOPBACK)
+ return false;
+ return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5));
+}
+
+static int __init ic_open_devs(void)
+{
+ struct ic_device *d, **last;
+ struct net_device *dev;
+ unsigned short oflags;
+ unsigned long start, next_msg;
+
+ last = &ic_first_dev;
+ rtnl_lock();
+
+ /* bring loopback and DSA master network devices up first */
+ for_each_netdev(&init_net, dev) {
+ if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
+ continue;
+ if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ pr_err("IP-Config: Failed to open %s\n", dev->name);
+ }
+
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
+ int able = 0;
+ if (dev->mtu >= 364)
+ able |= IC_BOOTP;
+ else
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ dev->name, dev->mtu);
+ if (!(dev->flags & IFF_NOARP))
+ able |= IC_RARP;
+ able &= ic_proto_enabled;
+ if (ic_proto_enabled && !able)
+ continue;
+ oflags = dev->flags;
+ if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ pr_err("IP-Config: Failed to open %s\n",
+ dev->name);
+ continue;
+ }
+ if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ d->dev = dev;
+ *last = d;
+ last = &d->next;
+ d->flags = oflags;
+ d->able = able;
+ if (able & IC_BOOTP)
+ get_random_bytes(&d->xid, sizeof(__be32));
+ else
+ d->xid = 0;
+ ic_proto_have_if |= able;
+ DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid));
+ }
+ }
+
+ /* no point in waiting if we could not bring up at least one device */
+ if (!ic_first_dev)
+ goto have_carrier;
+
+ /* wait for a carrier on at least one device */
+ start = jiffies;
+ next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ while (time_before(jiffies, start +
+ msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
+ int wait, elapsed;
+
+ for_each_netdev(&init_net, dev)
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+ goto have_carrier;
+
+ msleep(1);
+
+ if (time_before(jiffies, next_msg))
+ continue;
+
+ elapsed = jiffies_to_msecs(jiffies - start);
+ wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ pr_info("Waiting up to %d more seconds for network.\n", wait);
+ next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ }
+have_carrier:
+ rtnl_unlock();
+
+ *last = NULL;
+
+ if (!ic_first_dev) {
+ if (user_dev_name[0])
+ pr_err("IP-Config: Device `%s' not found\n",
+ user_dev_name);
+ else
+ pr_err("IP-Config: No network devices available\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+ struct ic_device *d, *next;
+ struct net_device *dev;
+
+ rtnl_lock();
+ next = ic_first_dev;
+ while ((d = next)) {
+ next = d->next;
+ dev = d->dev;
+ if (dev != ic_dev && !netdev_uses_dsa(dev)) {
+ DBG(("IP-Config: Downing %s\n", dev->name));
+ dev_change_flags(dev, d->flags);
+ }
+ kfree(d);
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+{
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = addr;
+ sin->sin_port = port;
+}
+
+static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+ int res;
+
+ mm_segment_t oldfs = get_fs();
+ set_fs(get_ds());
+ res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg);
+ set_fs(oldfs);
+ return res;
+}
+
+/*
+ * Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+ struct ifreq ir;
+ struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+ int err;
+
+ memset(&ir, 0, sizeof(ir));
+ strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+ set_sockaddr(sin, ic_myaddr, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface address (%d)\n",
+ err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_netmask, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface netmask (%d)\n",
+ err);
+ return -1;
+ }
+ set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+ if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+ pr_err("IP-Config: Unable to set interface broadcast address (%d)\n",
+ err);
+ return -1;
+ }
+ /* Handle the case where we need non-standard MTU on the boot link (a network
+ * using jumbo frames, for instance). If we can't set the mtu, don't error
+ * out, we'll try to muddle along.
+ */
+ if (ic_dev_mtu != 0) {
+ strcpy(ir.ifr_name, ic_dev->name);
+ ir.ifr_mtu = ic_dev_mtu;
+ if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
+ pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
+ ic_dev_mtu, err);
+ }
+ return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+ /* No need to setup device routes, only the default route... */
+
+ if (ic_gateway != NONE) {
+ struct rtentry rm;
+ int err;
+
+ memset(&rm, 0, sizeof(rm));
+ if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+ pr_err("IP-Config: Gateway not on directly connected network\n");
+ return -1;
+ }
+ set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+ set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+ rm.rt_flags = RTF_UP | RTF_GATEWAY;
+ if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+ pr_err("IP-Config: Cannot add default route (%d)\n",
+ err);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+ /*
+ * At this point we have no userspace running so need not
+ * claim locks on system_utsname
+ */
+
+ if (!ic_host_name_set)
+ sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
+
+ if (root_server_addr == NONE)
+ root_server_addr = ic_servaddr;
+
+ if (ic_netmask == NONE) {
+ if (IN_CLASSA(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSA_NET);
+ else if (IN_CLASSB(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSB_NET);
+ else if (IN_CLASSC(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSC_NET);
+ else {
+ pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
+ &ic_myaddr);
+ return -1;
+ }
+ printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+ }
+
+ return 0;
+}
+
+/*
+ * RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type rarp_packet_type __initdata = {
+ .type = cpu_to_be16(ETH_P_RARP),
+ .func = ic_rarp_recv,
+};
+
+static inline void __init ic_rarp_init(void)
+{
+ dev_add_pack(&rarp_packet_type);
+}
+
+static inline void __init ic_rarp_cleanup(void)
+{
+ dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ * Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct arphdr *rarp;
+ unsigned char *rarp_ptr;
+ __be32 sip, tip;
+ unsigned char *tha; /* t for "target" */
+ struct ic_device *d;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+ goto drop;
+
+ /* Basic sanity checks can be done without the lock. */
+ rarp = (struct arphdr *)skb_transport_header(skb);
+
+ /* If this test doesn't pass, it's not IP, or we should
+ * ignore it anyway.
+ */
+ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+ goto drop;
+
+ /* If it's not a RARP reply, delete it. */
+ if (rarp->ar_op != htons(ARPOP_RREPLY))
+ goto drop;
+
+ /* If it's not Ethernet, delete it. */
+ if (rarp->ar_pro != htons(ETH_P_IP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ goto drop;
+
+ /* OK, it is all there and looks valid, process... */
+ rarp = (struct arphdr *)skb_transport_header(skb);
+ rarp_ptr = (unsigned char *) (rarp + 1);
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Extract variable-width fields */
+ rarp_ptr += dev->addr_len;
+ memcpy(&sip, rarp_ptr, 4);
+ rarp_ptr += 4;
+ tha = rarp_ptr;
+ rarp_ptr += dev->addr_len;
+ memcpy(&tip, rarp_ptr, 4);
+
+ /* Discard packets which are not meant for us. */
+ if (memcmp(tha, dev->dev_addr, dev->addr_len))
+ goto drop_unlock;
+
+ /* Discard packets which are not from specified server. */
+ if (ic_servaddr != NONE && ic_servaddr != sip)
+ goto drop_unlock;
+
+ /* We have a winner! */
+ ic_dev = dev;
+ if (ic_myaddr == NONE)
+ ic_myaddr = tip;
+ ic_servaddr = sip;
+ ic_addrservaddr = sip;
+ ic_got_reply = IC_RARP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+ return 0;
+}
+
+
+/*
+ * Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+ struct net_device *dev = d->dev;
+ arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+ dev->dev_addr, dev->dev_addr);
+}
+#endif
+
+/*
+ * Predefine Nameservers
+ */
+static inline void __init ic_nameservers_predef(void)
+{
+ int i;
+
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ ic_nameservers[i] = NONE;
+}
+
+/*
+ * DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt { /* BOOTP packet format */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 op; /* 1=request, 2=reply */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ u8 hops; /* Used only by gateways */
+ __be32 xid; /* Transaction ID */
+ __be16 secs; /* Seconds since we started */
+ __be16 flags; /* Just what it says */
+ __be32 client_ip; /* Client's IP address if known */
+ __be32 your_ip; /* Assigned IP address */
+ __be32 server_ip; /* (Next, e.g. NFS) Server's IP address */
+ __be32 relay_ip; /* IP address of BOOTP relay */
+ u8 hw_addr[16]; /* Client's HW address */
+ u8 serv_name[64]; /* Server host name */
+ u8 boot_file[128]; /* Name of boot file */
+ u8 exten[312]; /* DHCP options / BOOTP vendor extensions */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST 1
+#define BOOTP_REPLY 2
+
+/* DHCP message types */
+#define DHCPDISCOVER 1
+#define DHCPOFFER 2
+#define DHCPREQUEST 3
+#define DHCPDECLINE 4
+#define DHCPACK 5
+#define DHCPNAK 6
+#define DHCPRELEASE 7
+#define DHCPINFORM 8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type bootp_packet_type __initdata = {
+ .type = cpu_to_be16(ETH_P_IP),
+ .func = ic_bootp_recv,
+};
+
+static __be32 ic_dev_xid; /* Device under configuration */
+
+/*
+ * Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+static void __init
+ic_dhcp_init_options(u8 *options)
+{
+ u8 mt = ((ic_servaddr == NONE)
+ ? DHCPDISCOVER : DHCPREQUEST);
+ u8 *e = options;
+ int len;
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Sending message type %d\n", mt);
+#endif
+
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+
+ *e++ = 53; /* DHCP message type */
+ *e++ = 1;
+ *e++ = mt;
+
+ if (mt == DHCPREQUEST) {
+ *e++ = 54; /* Server ID (IP address) */
+ *e++ = 4;
+ memcpy(e, &ic_servaddr, 4);
+ e += 4;
+
+ *e++ = 50; /* Requested IP address */
+ *e++ = 4;
+ memcpy(e, &ic_myaddr, 4);
+ e += 4;
+ }
+
+ /* always? */
+ {
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 26, /* MTU */
+ 40, /* NIS domain name */
+ };
+
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ }
+
+ *e++ = 255; /* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+ memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
+ e += 4;
+ *e++ = 1; /* Subnet mask request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 3; /* Default gateway request */
+ *e++ = 4;
+ e += 4;
+ *e++ = 5; /* Name server request */
+ *e++ = 8;
+ e += 8;
+ *e++ = 12; /* Host name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 40; /* NIS Domain name request */
+ *e++ = 32;
+ e += 32;
+ *e++ = 17; /* Boot path */
+ *e++ = 40;
+ e += 40;
+
+ *e++ = 57; /* set extension buffer size for reply */
+ *e++ = 2;
+ *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */
+ *e++ = 150;
+
+ *e++ = 255; /* End of the list */
+}
+
+
+/*
+ * Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void __init ic_bootp_init(void)
+{
+ ic_nameservers_predef();
+
+ dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ * DHCP/BOOTP cleanup.
+ */
+static inline void __init ic_bootp_cleanup(void)
+{
+ dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ * Send DHCP/BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+ struct net_device *dev = d->dev;
+ struct sk_buff *skb;
+ struct bootp_pkt *b;
+ struct iphdr *h;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ /* Allocate packet */
+ skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15,
+ GFP_KERNEL);
+ if (!skb)
+ return;
+ skb_reserve(skb, hlen);
+ b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+ memset(b, 0, sizeof(struct bootp_pkt));
+
+ /* Construct IP header */
+ skb_reset_network_header(skb);
+ h = ip_hdr(skb);
+ h->version = 4;
+ h->ihl = 5;
+ h->tot_len = htons(sizeof(struct bootp_pkt));
+ h->frag_off = htons(IP_DF);
+ h->ttl = 64;
+ h->protocol = IPPROTO_UDP;
+ h->daddr = htonl(INADDR_BROADCAST);
+ h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+ /* Construct UDP header */
+ b->udph.source = htons(68);
+ b->udph.dest = htons(67);
+ b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+ /* Construct DHCP/BOOTP header */
+ b->op = BOOTP_REQUEST;
+ if (dev->type < 256) /* check for false types */
+ b->htype = dev->type;
+ else if (dev->type == ARPHRD_FDDI)
+ b->htype = ARPHRD_ETHER;
+ else {
+ printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ b->htype = dev->type; /* can cause undefined behavior */
+ }
+
+ /* server_ip and your_ip address are both already zero per RFC2131 */
+ b->hlen = dev->addr_len;
+ memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+ b->secs = htons(jiffies_diff / HZ);
+ b->xid = d->xid;
+
+ /* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP)
+ ic_dhcp_init_options(b->exten);
+ else
+#endif
+ ic_bootp_init_ext(b->exten);
+
+ /* Chain packet down the line... */
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IP);
+ if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+ dev->broadcast, dev->dev_addr, skb->len) < 0) {
+ kfree_skb(skb);
+ printk("E");
+ return;
+ }
+
+ if (dev_queue_xmit(skb) < 0)
+ printk("E");
+}
+
+
+/*
+ * Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+ if (!len)
+ return 0;
+ if (len > max-1)
+ len = max-1;
+ memcpy(dest, src, len);
+ dest[len] = '\0';
+ return 1;
+}
+
+
+/*
+ * Process BOOTP extensions.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+ u8 servers;
+ int i;
+ __be16 mtu;
+
+#ifdef IPCONFIG_DEBUG
+ u8 *c;
+
+ printk("DHCP/BOOTP: Got extension %d:",*ext);
+ for (c=ext+2; c<ext+2+ext[1]; c++)
+ printk(" %02x", *c);
+ printk("\n");
+#endif
+
+ switch (*ext++) {
+ case 1: /* Subnet mask */
+ if (ic_netmask == NONE)
+ memcpy(&ic_netmask, ext+1, 4);
+ break;
+ case 3: /* Default gateway */
+ if (ic_gateway == NONE)
+ memcpy(&ic_gateway, ext+1, 4);
+ break;
+ case 6: /* DNS server */
+ servers= *ext/4;
+ if (servers > CONF_NAMESERVERS_MAX)
+ servers = CONF_NAMESERVERS_MAX;
+ for (i = 0; i < servers; i++) {
+ if (ic_nameservers[i] == NONE)
+ memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+ }
+ break;
+ case 12: /* Host name */
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ break;
+ case 15: /* Domain name (DNS) */
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ break;
+ case 17: /* Root path */
+ if (!root_server_path[0])
+ ic_bootp_string(root_server_path, ext+1, *ext,
+ sizeof(root_server_path));
+ break;
+ case 26: /* Interface MTU */
+ memcpy(&mtu, ext+1, sizeof(mtu));
+ ic_dev_mtu = ntohs(mtu);
+ break;
+ case 40: /* NIS Domain name (_not_ DNS) */
+ ic_bootp_string(utsname()->domainname, ext+1, *ext,
+ __NEW_UTS_LEN);
+ break;
+ }
+}
+
+
+/*
+ * Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct bootp_pkt *b;
+ struct iphdr *h;
+ struct ic_device *d;
+ int len, ext_len;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ /* Perform verifications before taking the lock. */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ goto drop;
+
+ b = (struct bootp_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+ goto drop;
+
+ /* Fragments are not supported */
+ if (ip_is_fragment(h)) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n");
+ goto drop;
+ }
+
+ if (skb->len < ntohs(h->tot_len))
+ goto drop;
+
+ if (ip_fast_csum((char *) h, h->ihl))
+ goto drop;
+
+ if (b->udph.source != htons(67) || b->udph.dest != htons(68))
+ goto drop;
+
+ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+ goto drop;
+
+ len = ntohs(b->udph.len) - sizeof(struct udphdr);
+ ext_len = len - (sizeof(*b) -
+ sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ sizeof(b->exten));
+ if (ext_len < 0)
+ goto drop;
+
+ /* Ok the front looks good, make sure we can get at the rest. */
+ if (!pskb_may_pull(skb, skb->len))
+ goto drop;
+
+ b = (struct bootp_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ /* One reply at a time, please. */
+ spin_lock(&ic_recv_lock);
+
+ /* If we already have a reply, just drop the packet */
+ if (ic_got_reply)
+ goto drop_unlock;
+
+ /* Find the ic_device that the packet arrived on */
+ d = ic_first_dev;
+ while (d && d->dev != dev)
+ d = d->next;
+ if (!d)
+ goto drop_unlock; /* should never happen */
+
+ /* Is it a reply to our BOOTP request? */
+ if (b->op != BOOTP_REPLY ||
+ b->xid != d->xid) {
+ net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
+ b->op, b->xid);
+ goto drop_unlock;
+ }
+
+ /* Is it a reply for the device we are configuring? */
+ if (b->xid != ic_dev_xid) {
+ net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
+ goto drop_unlock;
+ }
+
+ /* Parse extensions */
+ if (ext_len >= 4 &&
+ !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+ u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+ u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+ if (ic_proto_enabled & IC_USE_DHCP) {
+ __be32 server_id = NONE;
+ int mt = 0;
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext >= end)
+ break;
+ switch (*opt) {
+ case 53: /* Message type */
+ if (opt[1])
+ mt = opt[2];
+ break;
+ case 54: /* Server ID (IP address) */
+ if (opt[1] >= 4)
+ memcpy(&server_id, opt + 2, 4);
+ break;
+ }
+ }
+
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Got message type %d\n", mt);
+#endif
+
+ switch (mt) {
+ case DHCPOFFER:
+ /* While in the process of accepting one offer,
+ * ignore all others.
+ */
+ if (ic_myaddr != NONE)
+ goto drop_unlock;
+
+ /* Let's accept that offer. */
+ ic_myaddr = b->your_ip;
+ ic_servaddr = server_id;
+#ifdef IPCONFIG_DEBUG
+ printk("DHCP: Offered address %pI4 by server %pI4\n",
+ &ic_myaddr, &b->iph.saddr);
+#endif
+ /* The DHCP indicated server address takes
+ * precedence over the bootp header one if
+ * they are different.
+ */
+ if ((server_id != NONE) &&
+ (b->server_ip != server_id))
+ b->server_ip = ic_servaddr;
+ break;
+
+ case DHCPACK:
+ if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+ goto drop_unlock;
+
+ /* Yeah! */
+ break;
+
+ default:
+ /* Urque. Forget it*/
+ ic_myaddr = NONE;
+ ic_servaddr = NONE;
+ goto drop_unlock;
+ }
+
+ ic_dhcp_msgtype = mt;
+
+ }
+#endif /* IPCONFIG_DHCP */
+
+ ext = &b->exten[4];
+ while (ext < end && *ext != 0xff) {
+ u8 *opt = ext++;
+ if (*opt == 0) /* Padding */
+ continue;
+ ext += *ext + 1;
+ if (ext < end)
+ ic_do_bootp_ext(opt);
+ }
+ }
+
+ /* We have a winner! */
+ ic_dev = dev;
+ ic_myaddr = b->your_ip;
+ ic_servaddr = b->server_ip;
+ ic_addrservaddr = b->iph.saddr;
+ if (ic_gateway == NONE && b->relay_ip)
+ ic_gateway = b->relay_ip;
+ if (ic_nameservers[0] == NONE)
+ ic_nameservers[0] = ic_servaddr;
+ ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+ /* Show's over. Nothing to see here. */
+ spin_unlock(&ic_recv_lock);
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+
+ return 0;
+}
+
+
+#endif
+
+
+/*
+ * Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+ int retries;
+ struct ic_device *d;
+ unsigned long start_jiffies, timeout, jiff;
+ int do_bootp = ic_proto_have_if & IC_BOOTP;
+ int do_rarp = ic_proto_have_if & IC_RARP;
+
+ /*
+ * If none of DHCP/BOOTP/RARP was selected, return with an error.
+ * This routine gets only called when some pieces of information
+ * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
+ */
+ if (!ic_proto_enabled) {
+ pr_err("IP-Config: Incomplete network configuration information\n");
+ return -1;
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+ pr_err("DHCP/BOOTP: No suitable device found\n");
+#endif
+#ifdef IPCONFIG_RARP
+ if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+ pr_err("RARP: No suitable device found\n");
+#endif
+
+ if (!ic_proto_have_if)
+ /* Error message already printed */
+ return -1;
+
+ /*
+ * Setup protocols
+ */
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_init();
+#endif
+
+ /*
+ * Send requests and wait, until we get an answer. This loop
+ * seems to be a terrible waste of CPU time, but actually there is
+ * only one process running at all, so we don't need to use any
+ * scheduler functions.
+ * [Actually we could now, but the nothing else running note still
+ * applies.. - AC]
+ */
+ pr_notice("Sending %s%s%s requests .",
+ do_bootp
+ ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+ (do_bootp && do_rarp) ? " and " : "",
+ do_rarp ? "RARP" : "");
+
+ start_jiffies = jiffies;
+ d = ic_first_dev;
+ retries = CONF_SEND_RETRIES;
+ get_random_bytes(&timeout, sizeof(timeout));
+ timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
+ for (;;) {
+#ifdef IPCONFIG_BOOTP
+ /* Track the device we are configuring */
+ ic_dev_xid = d->xid;
+
+ if (do_bootp && (d->able & IC_BOOTP))
+ ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp && (d->able & IC_RARP))
+ ic_rarp_send_if(d);
+#endif
+
+ jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
+ while (time_before(jiffies, jiff) && !ic_got_reply)
+ schedule_timeout_uninterruptible(1);
+#ifdef IPCONFIG_DHCP
+ /* DHCP isn't done until we get a DHCPACK. */
+ if ((ic_got_reply & IC_BOOTP) &&
+ (ic_proto_enabled & IC_USE_DHCP) &&
+ ic_dhcp_msgtype != DHCPACK) {
+ ic_got_reply = 0;
+ pr_cont(",");
+ continue;
+ }
+#endif /* IPCONFIG_DHCP */
+
+ if (ic_got_reply) {
+ pr_cont(" OK\n");
+ break;
+ }
+
+ if ((d = d->next))
+ continue;
+
+ if (! --retries) {
+ pr_cont(" timed out!\n");
+ break;
+ }
+
+ d = ic_first_dev;
+
+ timeout = timeout CONF_TIMEOUT_MULT;
+ if (timeout > CONF_TIMEOUT_MAX)
+ timeout = CONF_TIMEOUT_MAX;
+
+ pr_cont(".");
+ }
+
+#ifdef IPCONFIG_BOOTP
+ if (do_bootp)
+ ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+ if (do_rarp)
+ ic_rarp_cleanup();
+#endif
+
+ if (!ic_got_reply) {
+ ic_myaddr = NONE;
+ return -1;
+ }
+
+ printk("IP-Config: Got %s answer from %pI4, ",
+ ((ic_got_reply & IC_RARP) ? "RARP"
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ &ic_addrservaddr);
+ pr_cont("my address is %pI4\n", &ic_myaddr);
+
+ return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ if (ic_proto_used & IC_PROTO)
+ seq_printf(seq, "#PROTO: %s\n",
+ (ic_proto_used & IC_RARP) ? "RARP"
+ : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
+ else
+ seq_puts(seq, "#MANUAL\n");
+
+ if (ic_domain[0])
+ seq_printf(seq,
+ "domain %s\n", ic_domain);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ if (ic_nameservers[i] != NONE)
+ seq_printf(seq, "nameserver %pI4\n",
+ &ic_nameservers[i]);
+ }
+ if (ic_servaddr != NONE)
+ seq_printf(seq, "bootserver %pI4\n",
+ &ic_servaddr);
+ return 0;
+}
+
+static int pnp_seq_open(struct inode *indoe, struct file *file)
+{
+ return single_open(file, pnp_seq_show, NULL);
+}
+
+static const struct file_operations pnp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pnp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Extract IP address from the parameter string if needed. Note that we
+ * need to have root_server_addr set _before_ IPConfig gets called as it
+ * can override it.
+ */
+__be32 __init root_nfs_parse_addr(char *name)
+{
+ __be32 addr;
+ int octets = 0;
+ char *cp, *cq;
+
+ cp = cq = name;
+ while (octets < 4) {
+ while (*cp >= '0' && *cp <= '9')
+ cp++;
+ if (cp == cq || cp - cq > 3)
+ break;
+ if (*cp == '.' || octets == 3)
+ octets++;
+ if (octets < 4)
+ cp++;
+ cq = cp;
+ }
+ if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+ if (*cp == ':')
+ *cp++ = '\0';
+ addr = in_aton(name);
+ memmove(name, cp, strlen(cp) + 1);
+ } else
+ addr = NONE;
+
+ return addr;
+}
+
+#define DEVICE_WAIT_MAX 12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+ int i;
+
+ for (i = 0; i < DEVICE_WAIT_MAX; i++) {
+ struct net_device *dev;
+ int found = 0;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ if (ic_is_init_dev(dev)) {
+ found = 1;
+ break;
+ }
+ }
+ rtnl_unlock();
+ if (found)
+ return 0;
+ ssleep(1);
+ }
+ return -ENODEV;
+}
+
+/*
+ * IP Autoconfig dispatcher.
+ */
+
+static int __init ip_auto_config(void)
+{
+ __be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+ int retries = CONF_OPEN_RETRIES;
+#endif
+ int err;
+ unsigned int i;
+
+#ifdef CONFIG_PROC_FS
+ proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
+#endif /* CONFIG_PROC_FS */
+
+ if (!ic_enable)
+ return 0;
+
+ DBG(("IP-Config: Entered.\n"));
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+ /* Wait for devices to appear */
+ err = wait_for_devices();
+ if (err)
+ return err;
+
+ /* Setup all network devices */
+ err = ic_open_devs();
+ if (err)
+ return err;
+
+ /* Give drivers a chance to settle */
+ msleep(CONF_POST_OPEN);
+
+ /*
+ * If the config information is insufficient (e.g., our IP address or
+ * IP address of the boot server is missing or we have multiple network
+ * interfaces and no default was set), use BOOTP or RARP to get the
+ * missing values.
+ */
+ if (ic_myaddr == NONE ||
+#ifdef CONFIG_ROOT_NFS
+ (root_server_addr == NONE &&
+ ic_servaddr == NONE &&
+ ROOT_DEV == Root_NFS) ||
+#endif
+ ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+ if (ic_dynamic() < 0) {
+ ic_close_devs();
+
+ /*
+ * I don't know why, but sometimes the
+ * eepro100 driver (at least) gets upset and
+ * doesn't work the first time it's opened.
+ * But then if you close it and reopen it, it
+ * works just fine. So we need to try that at
+ * least once before giving up.
+ *
+ * Also, if the root will be NFS-mounted, we
+ * have nowhere to go if DHCP fails. So we
+ * just have to keep trying forever.
+ *
+ * -- Chip
+ */
+#ifdef CONFIG_ROOT_NFS
+ if (ROOT_DEV == Root_NFS) {
+ pr_err("IP-Config: Retrying forever (NFS root)...\n");
+ goto try_try_again;
+ }
+#endif
+
+ if (--retries) {
+ pr_err("IP-Config: Reopening network devices...\n");
+ goto try_try_again;
+ }
+
+ /* Oh, well. At least we tried. */
+ pr_err("IP-Config: Auto-configuration of network failed\n");
+ return -1;
+ }
+#else /* !DYNAMIC */
+ pr_err("IP-Config: Incomplete network configuration information\n");
+ ic_close_devs();
+ return -1;
+#endif /* IPCONFIG_DYNAMIC */
+ } else {
+ /* Device selected manually or only one device -> use it */
+ ic_dev = ic_first_dev->dev;
+ }
+
+ addr = root_nfs_parse_addr(root_server_path);
+ if (root_server_addr == NONE)
+ root_server_addr = addr;
+
+ /*
+ * Use defaults wherever applicable.
+ */
+ if (ic_defaults() < 0)
+ return -1;
+
+ /*
+ * Close all network devices except the device we've
+ * autoconfigured and set up routes.
+ */
+ ic_close_devs();
+ if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+ return -1;
+
+ /*
+ * Record which protocol was actually used.
+ */
+#ifdef IPCONFIG_DYNAMIC
+ ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
+#endif
+
+#ifndef IPCONFIG_SILENT
+ /*
+ * Clue in the operator.
+ */
+ pr_info("IP-Config: Complete:\n");
+
+ pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
+ ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+ &ic_myaddr, &ic_netmask, &ic_gateway);
+ pr_info(" host=%s, domain=%s, nis-domain=%s\n",
+ utsname()->nodename, ic_domain, utsname()->domainname);
+ pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s",
+ &ic_servaddr, &root_server_addr, root_server_path);
+ if (ic_dev_mtu)
+ pr_cont(", mtu=%d", ic_dev_mtu);
+ for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE) {
+ pr_info(" nameserver%u=%pI4",
+ i, &ic_nameservers[i]);
+ break;
+ }
+ for (i++; i < CONF_NAMESERVERS_MAX; i++)
+ if (ic_nameservers[i] != NONE)
+ pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
+ pr_cont("\n");
+#endif /* !SILENT */
+
+ return 0;
+}
+
+late_initcall(ip_auto_config);
+
+
+/*
+ * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
+ */
+static int __init ic_proto_name(char *name)
+{
+ if (!strcmp(name, "on") || !strcmp(name, "any")) {
+ return 1;
+ }
+ if (!strcmp(name, "off") || !strcmp(name, "none")) {
+ return 0;
+ }
+#ifdef CONFIG_IP_PNP_DHCP
+ else if (!strcmp(name, "dhcp")) {
+ ic_proto_enabled &= ~IC_RARP;
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+ else if (!strcmp(name, "bootp")) {
+ ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+ else if (!strcmp(name, "rarp")) {
+ ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
+ return 1;
+ }
+#endif
+#ifdef IPCONFIG_DYNAMIC
+ else if (!strcmp(name, "both")) {
+ ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static int __init ip_auto_config_setup(char *addrs)
+{
+ char *cp, *ip, *dp;
+ int num = 0;
+
+ ic_set_manually = 1;
+ ic_enable = 1;
+
+ /*
+ * If any dhcp, bootp etc options are set, leave autoconfig on
+ * and skip the below static IP processing.
+ */
+ if (ic_proto_name(addrs))
+ return 1;
+
+ /* If no static IP is given, turn off autoconfig and bail. */
+ if (*addrs == 0 ||
+ strcmp(addrs, "off") == 0 ||
+ strcmp(addrs, "none") == 0) {
+ ic_enable = 0;
+ return 1;
+ }
+
+ ic_nameservers_predef();
+
+ /* Parse string for static IP assignment. */
+ ip = addrs;
+ while (ip && *ip) {
+ if ((cp = strchr(ip, ':')))
+ *cp++ = '\0';
+ if (strlen(ip) > 0) {
+ DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ switch (num) {
+ case 0:
+ if ((ic_myaddr = in_aton(ip)) == ANY)
+ ic_myaddr = NONE;
+ break;
+ case 1:
+ if ((ic_servaddr = in_aton(ip)) == ANY)
+ ic_servaddr = NONE;
+ break;
+ case 2:
+ if ((ic_gateway = in_aton(ip)) == ANY)
+ ic_gateway = NONE;
+ break;
+ case 3:
+ if ((ic_netmask = in_aton(ip)) == ANY)
+ ic_netmask = NONE;
+ break;
+ case 4:
+ if ((dp = strchr(ip, '.'))) {
+ *dp++ = '\0';
+ strlcpy(utsname()->domainname, dp,
+ sizeof(utsname()->domainname));
+ }
+ strlcpy(utsname()->nodename, ip,
+ sizeof(utsname()->nodename));
+ ic_host_name_set = 1;
+ break;
+ case 5:
+ strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ break;
+ case 6:
+ if (ic_proto_name(ip) == 0 &&
+ ic_myaddr == NONE) {
+ ic_enable = 0;
+ }
+ break;
+ case 7:
+ if (CONF_NAMESERVERS_MAX >= 1) {
+ ic_nameservers[0] = in_aton(ip);
+ if (ic_nameservers[0] == ANY)
+ ic_nameservers[0] = NONE;
+ }
+ break;
+ case 8:
+ if (CONF_NAMESERVERS_MAX >= 2) {
+ ic_nameservers[1] = in_aton(ip);
+ if (ic_nameservers[1] == ANY)
+ ic_nameservers[1] = NONE;
+ }
+ break;
+ }
+ }
+ ip = cp;
+ num++;
+ }
+
+ return 1;
+}
+__setup("ip=", ip_auto_config_setup);
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+ return ip_auto_config_setup(addrs);
+}
+__setup("nfsaddrs=", nfsaddrs_config_setup);
+
+static int __init vendor_class_identifier_setup(char *addrs)
+{
+ if (strlcpy(vendor_class_identifier, addrs,
+ sizeof(vendor_class_identifier))
+ >= sizeof(vendor_class_identifier))
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ vendor_class_identifier);
+ return 1;
+}
+__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 000000000..ff96396eb
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,569 @@
+/*
+ * Linux NET3: IP/IP protocol decoder.
+ *
+ * Authors:
+ * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+ *
+ * Fixes:
+ * Alan Cox : Merged and made usable non modular (its so tiny its silly as
+ * a module taking up 2 pages).
+ * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ * to keep ip_forward happy.
+ * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
+ * David Woodhouse : Perform some basic ICMP handling.
+ * IPIP Routing without decapsulation.
+ * Carlos Picoto : GRE over IP support
+ * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ * I do not want to merge them together.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+ The purpose of this driver is to provide an IP tunnel through
+ which you can tunnel network traffic transparently across subnets.
+
+ This was written by looking at Nick Holloway's dummy driver
+ Thanks for the great code!
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
+
+ Minor tweaks:
+ Cleaned up the code a little and added some pre-1.3.0 tweaks.
+ dev->hard_header/hard_header_len changed to use no headers.
+ Comments/bracketing tweaked.
+ Made the tunnels use dev->name not tunnel: when error reporting.
+ Added tx_dropped stat
+
+ -Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95
+
+ Reworked:
+ Changed to tunnel to destination gateway in addition to the
+ tunnel's pointopoint address
+ Almost completely rewritten
+ Note: There is currently no firewall or ICMP handling done.
+
+ -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+ When the tunnel_xmit() function is called, the skb contains the
+ packet to be sent (plus a great deal of extra info), and dev
+ contains the tunnel device that _we_ are.
+
+ When we are passed a packet, we are expected to fill in the
+ source address with our source IP address.
+
+ What is the proper way to allocate, copy and free a buffer?
+ After you allocate it, it is a "0 length" chunk of memory
+ starting at zero. If you want to add headers to the buffer
+ later, you'll have to call "skb_reserve(skb, amount)" with
+ the amount of memory you want reserved. Then, you call
+ "skb_put(skb, amount)" with the amount of space you want in
+ the buffer. skb_put() returns a pointer to the top (#0) of
+ that buffer. skb->len is set to the amount of space you have
+ "allocated" with skb_put(). You can then write up to skb->len
+ bytes to that buffer. If you need more, you can call skb_put()
+ again with the additional amount of space you need. You can
+ find out how much more space you can allocate by calling
+ "skb_tailroom(skb)".
+ Now, to add header space, call "skb_push(skb, header_len)".
+ This creates space at the beginning of the buffer and returns
+ a pointer to this new space. If later you need to strip a
+ header from a buffer, call "skb_pull(skb, header_len)".
+ skb_headroom() will return how much space is left at the top
+ of the buffer (before the main data). Remember, this headroom
+ space must be reserved before the skb_put() function is called.
+ */
+
+/*
+ This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static int ipip_net_id __read_mostly;
+
+static int ipip_tunnel_init(struct net_device *dev);
+static struct rtnl_link_ops ipip_link_ops __read_mostly;
+
+static int ipip_err(struct sk_buff *skb, u32 info)
+{
+
+/* All the routers (except for Linux) return only
+ 8 bytes of packet payload. It means, that precise relaying of
+ ICMP in the real Internet is absolutely infeasible.
+ */
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_tunnel *t;
+ int err;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+
+ err = -ENOENT;
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->daddr, iph->saddr, 0);
+ if (!t)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+ IPPROTO_IPIP, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (t->parms.iph.daddr == 0)
+ goto out;
+
+ err = 0;
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+
+out:
+ return err;
+}
+
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ iph->saddr, iph->daddr, 0);
+ if (tunnel) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi.proto))
+ goto drop;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ }
+
+ return -1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto out;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
+ ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
+ return NETDEV_TX_OK;
+
+tx_error:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+
+static int
+ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ return -EFAULT;
+
+ if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
+ if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ return -EINVAL;
+ }
+
+ p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ err = ip_tunnel_ioctl(dev, &p, cmd);
+ if (err)
+ return err;
+
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static const struct net_device_ops ipip_netdev_ops = {
+ .ndo_init = ipip_tunnel_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = ipip_tunnel_xmit,
+ .ndo_do_ioctl = ipip_tunnel_ioctl,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+#define IPIP_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
+
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &ipip_netdev_ops;
+
+ dev->type = ARPHRD_TUNNEL;
+ dev->flags = IFF_NOARP;
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_LLTX;
+ netif_keep_dst(dev);
+
+ dev->features |= IPIP_FEATURES;
+ dev->hw_features |= IPIP_FEATURES;
+ ip_tunnel_setup(dev, ipip_net_id);
+}
+
+static int ipip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+ tunnel->tun_hlen = 0;
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ tunnel->parms.iph.protocol = IPPROTO_IPIP;
+ return ip_tunnel_init(dev);
+}
+
+static void ipip_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPIP;
+ parms->iph.ihl = 5;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
+
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipip_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+static int ipip_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipip_netlink_parms(data, &p);
+ return ip_tunnel_newlink(dev, tb, &p);
+}
+
+static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+
+ if (ipip_netlink_encap_parms(data, &ipencap)) {
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
+
+ if (err < 0)
+ return err;
+ }
+
+ ipip_netlink_parms(data, &p);
+
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ return ip_tunnel_changelink(dev, tb, &p);
+}
+
+static size_t ipip_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipip_link_ops __read_mostly = {
+ .kind = "ipip",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip_tunnel_setup,
+ .newlink = ipip_newlink,
+ .changelink = ipip_changelink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipip_get_size,
+ .fill_info = ipip_fill_info,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+ .priority = 1,
+};
+
+static int __net_init ipip_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
+}
+
+static void __net_exit ipip_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ ip_tunnel_delete_net(itn, &ipip_link_ops);
+}
+
+static struct pernet_operations ipip_net_ops = {
+ .init = ipip_init_net,
+ .exit = ipip_exit_net,
+ .id = &ipip_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+static int __init ipip_init(void)
+{
+ int err;
+
+ pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&ipip_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: can't register tunnel\n", __func__);
+ goto xfrm_tunnel_failed;
+ }
+ err = rtnl_link_register(&ipip_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+out:
+ return err;
+
+rtnl_link_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&ipip_net_ops);
+ goto out;
+}
+
+static void __exit ipip_fini(void)
+{
+ rtnl_link_unregister(&ipip_link_ops);
+ if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
+ pr_info("%s: can't deregister tunnel\n", __func__);
+
+ unregister_pernet_device(&ipip_net_ops);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ipip");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 000000000..3a2c0162c
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,2792 @@
+/*
+ * IP multicast routing support for mrouted 3.6/3.8
+ *
+ * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Linux Consultancy and Custom Driver Development
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Michael Chastain : Incorrect size of copying.
+ * Alan Cox : Added the cache manager code
+ * Alan Cox : Fixed the clone/copy bug and device race.
+ * Mike McLagan : Routing by source
+ * Malcolm Beattie : Buffer handling fixes.
+ * Alexey Kuznetsov : Double buffer free and other fixes.
+ * SVR Anand : Fixed several multicast bugs and problems.
+ * Alexey Kuznetsov : Status, optimisations and more.
+ * Brad Parker : Better behaviour on mrouted upcall
+ * overflow.
+ * Carlos Picoto : PIMv1 Support
+ * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
+ * Relax this requirement to work with older peers.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/export.h>
+#include <net/ip_tunnels.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+#include <linux/netconf.h>
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM 1
+#endif
+
+struct mr_table {
+ struct list_head list;
+ possible_net_t net;
+ u32 id;
+ struct sock __rcu *mroute_sk;
+ struct timer_list ipmr_expire_timer;
+ struct list_head mfc_unres_queue;
+ struct list_head mfc_cache_array[MFC_LINES];
+ struct vif_device vif_table[MAXVIFS];
+ int maxvif;
+ atomic_t cache_resolve_queue_len;
+ bool mroute_do_assert;
+ bool mroute_do_pim;
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+ int mroute_reg_vif_num;
+#endif
+};
+
+struct ipmr_rule {
+ struct fib_rule common;
+};
+
+struct ipmr_result {
+ struct mr_table *mrt;
+};
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ * Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ * Multicast router control variables
+ */
+
+#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
+ */
+
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local);
+static int ipmr_cache_report(struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert);
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd);
+static void mroute_clean_tables(struct mr_table *mrt);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+#define ipmr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ ipmr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
+{
+ int err;
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+ flowi4_to_flowi(flp4), 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
+
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ipmr_result *res = arg->result;
+ struct mr_table *mrt;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ mrt = ipmr_get_table(rule->fr_net, rule->table);
+ if (!mrt)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ return 1;
+}
+
+static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+ return 0;
+}
+
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
+
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
+
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .rule_size = sizeof(struct ipmr_rule),
+ .addr_size = sizeof(u32),
+ .action = ipmr_rule_action,
+ .match = ipmr_rule_match,
+ .configure = ipmr_rule_configure,
+ .compare = ipmr_rule_compare,
+ .default_pref = fib_default_rule_pref,
+ .fill = ipmr_rule_fill,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = ipmr_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr_table *mrt;
+ int err;
+
+ ops = fib_rules_register(&ipmr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ goto err2;
+
+ net->ipv4.mr_rules_ops = ops;
+ return 0;
+
+err2:
+ ipmr_free_table(mrt);
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ struct mr_table *mrt, *next;
+
+ rtnl_lock();
+ list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+ list_del(&mrt->list);
+ ipmr_free_table(mrt);
+ }
+ fib_rules_unregister(net->ipv4.mr_rules_ops);
+ rtnl_unlock();
+}
+#else
+#define ipmr_for_each_table(mrt, net) \
+ for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ return net->ipv4.mrt;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+ struct mr_table **mrt)
+{
+ *mrt = net->ipv4.mrt;
+ return 0;
+}
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ return net->ipv4.mrt ? 0 : -ENOMEM;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ rtnl_lock();
+ ipmr_free_table(net->ipv4.mrt);
+ net->ipv4.mrt = NULL;
+ rtnl_unlock();
+}
+#endif
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+ unsigned int i;
+
+ mrt = ipmr_get_table(net, id);
+ if (mrt)
+ return mrt;
+
+ mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+ if (!mrt)
+ return NULL;
+ write_pnet(&mrt->net, net);
+ mrt->id = id;
+
+ /* Forwarding cache */
+ for (i = 0; i < MFC_LINES; i++)
+ INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
+
+ INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+ setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
+ (unsigned long)mrt);
+
+#ifdef CONFIG_IP_PIMSM
+ mrt->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+ return mrt;
+}
+
+static void ipmr_free_table(struct mr_table *mrt)
+{
+ del_timer_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt);
+ kfree(mrt);
+}
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
+{
+ struct net *net = dev_net(dev);
+
+ dev_close(dev);
+
+ dev = __dev_get_by_name(net, "tunl0");
+ if (dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct ifreq ifr;
+ struct ip_tunnel_parm p;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+ if (ops->ndo_do_ioctl) {
+ mm_segment_t oldfs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
+ set_fs(oldfs);
+ }
+ }
+}
+
+static
+struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+{
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(net, "tunl0");
+
+ if (dev) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+ struct ifreq ifr;
+ struct ip_tunnel_parm p;
+ struct in_device *in_dev;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+ if (ops->ndo_do_ioctl) {
+ mm_segment_t oldfs = get_fs();
+
+ set_fs(KERNEL_DS);
+ err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+ set_fs(oldfs);
+ } else {
+ err = -EOPNOTSUPP;
+ }
+ dev = NULL;
+
+ if (err == 0 &&
+ (dev = __dev_get_by_name(net, p.name)) != NULL) {
+ dev->flags |= IFF_MULTICAST;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ goto failure;
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+ if (dev_open(dev))
+ goto failure;
+ dev_hold(dev);
+ }
+ }
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct flowi4 fl4 = {
+ .flowi4_oif = dev->ifindex,
+ .flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi4_mark = skb->mark,
+ };
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ dev->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+ return 0;
+}
+
+static const struct net_device_ops reg_vif_netdev_ops = {
+ .ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
+};
+
+static void reg_vif_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->netdev_ops = &reg_vif_netdev_ops;
+ dev->destructor = free_netdev;
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ struct net_device *dev;
+ struct in_device *in_dev;
+ char name[IFNAMSIZ];
+
+ if (mrt->id == RT_TABLE_DEFAULT)
+ sprintf(name, "pimreg");
+ else
+ sprintf(name, "pimreg%u", mrt->id);
+
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
+
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ if (register_netdevice(dev)) {
+ free_netdev(dev);
+ return NULL;
+ }
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ goto failure;
+ }
+
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+ rcu_read_unlock();
+
+ if (dev_open(dev))
+ goto failure;
+
+ dev_hold(dev);
+
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+#endif
+
+/**
+ * vif_delete - Delete a VIF entry
+ * @notify: Set to 1, if the caller is a notifier_call
+ */
+
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+ struct list_head *head)
+{
+ struct vif_device *v;
+ struct net_device *dev;
+ struct in_device *in_dev;
+
+ if (vifi < 0 || vifi >= mrt->maxvif)
+ return -EADDRNOTAVAIL;
+
+ v = &mrt->vif_table[vifi];
+
+ write_lock_bh(&mrt_lock);
+ dev = v->dev;
+ v->dev = NULL;
+
+ if (!dev) {
+ write_unlock_bh(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ }
+
+#ifdef CONFIG_IP_PIMSM
+ if (vifi == mrt->mroute_reg_vif_num)
+ mrt->mroute_reg_vif_num = -1;
+#endif
+
+ if (vifi + 1 == mrt->maxvif) {
+ int tmp;
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
+ if (VIF_EXISTS(mrt, tmp))
+ break;
+ }
+ mrt->maxvif = tmp+1;
+ }
+
+ write_unlock_bh(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in_dev->cnf);
+ ip_rt_multicast_event(in_dev);
+ }
+
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
+ unregister_netdevice_queue(dev, head);
+
+ dev_put(dev);
+ return 0;
+}
+
+static void ipmr_cache_free_rcu(struct rcu_head *head)
+{
+ struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
+ kmem_cache_free(mrt_cachep, c);
+}
+
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ * and reporting error to netlink readers.
+ */
+
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ struct nlmsgerr *e;
+
+ atomic_dec(&mrt->cache_resolve_queue_len);
+
+ while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ e = nlmsg_data(nlh);
+ e->error = -ETIMEDOUT;
+ memset(&e->msg, 0, sizeof(e->msg));
+
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ kfree_skb(skb);
+ }
+ }
+
+ ipmr_cache_free(c);
+}
+
+
+/* Timer process for the unresolved queue. */
+
+static void ipmr_expire_process(unsigned long arg)
+{
+ struct mr_table *mrt = (struct mr_table *)arg;
+ unsigned long now;
+ unsigned long expires;
+ struct mfc_cache *c, *next;
+
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
+ return;
+ }
+
+ if (list_empty(&mrt->mfc_unres_queue))
+ goto out;
+
+ now = jiffies;
+ expires = 10*HZ;
+
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ if (time_after(c->mfc_un.unres.expires, now)) {
+ unsigned long interval = c->mfc_un.unres.expires - now;
+ if (interval < expires)
+ expires = interval;
+ continue;
+ }
+
+ list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, c);
+ }
+
+ if (!list_empty(&mrt->mfc_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+
+out:
+ spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
+ unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXVIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (VIF_EXISTS(mrt, vifi) &&
+ ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
+ }
+ }
+}
+
+static int vif_add(struct net *net, struct mr_table *mrt,
+ struct vifctl *vifc, int mrtsock)
+{
+ int vifi = vifc->vifc_vifi;
+ struct vif_device *v = &mrt->vif_table[vifi];
+ struct net_device *dev;
+ struct in_device *in_dev;
+ int err;
+
+ /* Is vif busy ? */
+ if (VIF_EXISTS(mrt, vifi))
+ return -EADDRINUSE;
+
+ switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+ case VIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (mrt->mroute_reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ipmr_reg_vif(net, mrt);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ unregister_netdevice(dev);
+ dev_put(dev);
+ return err;
+ }
+ break;
+#endif
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(net, vifc);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ ipmr_del_tunnel(dev, vifc);
+ dev_put(dev);
+ return err;
+ }
+ break;
+
+ case VIFF_USE_IFINDEX:
+ case 0:
+ if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+ dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+ if (dev && !__in_dev_get_rtnl(dev)) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ } else {
+ dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+ }
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ dev_put(dev);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
+ &in_dev->cnf);
+ ip_rt_multicast_event(in_dev);
+
+ /* Fill in the VIF structures */
+
+ v->rate_limit = vifc->vifc_rate_limit;
+ v->local = vifc->vifc_lcl_addr.s_addr;
+ v->remote = vifc->vifc_rmt_addr.s_addr;
+ v->flags = vifc->vifc_flags;
+ if (!mrtsock)
+ v->flags |= VIFF_STATIC;
+ v->threshold = vifc->vifc_threshold;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
+ v->link = dev_get_iflink(dev);
+
+ /* And finish update writing critical data */
+ write_lock_bh(&mrt_lock);
+ v->dev = dev;
+#ifdef CONFIG_IP_PIMSM
+ if (v->flags & VIFF_REGISTER)
+ mrt->mroute_reg_vif_num = vifi;
+#endif
+ if (vifi+1 > mrt->maxvif)
+ mrt->maxvif = vifi+1;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+}
+
+/* called with rcu_read_lock() */
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
+ __be32 origin,
+ __be32 mcastgrp)
+{
+ int line = MFC_HASH(mcastgrp, origin);
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
+ return c;
+ }
+ return NULL;
+}
+
+/* Look for a (*,*,oif) entry */
+static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
+ int vifi)
+{
+ int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
+ struct mfc_cache *c;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY) &&
+ c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
+ __be32 mcastgrp, int vifi)
+{
+ int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY));
+ struct mfc_cache *c, *proxy;
+
+ if (mcastgrp == htonl(INADDR_ANY))
+ goto skip;
+
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == mcastgrp) {
+ if (c->mfc_un.res.ttls[vifi] < 255)
+ return c;
+
+ /* It's ok if the vifi is part of the static tree */
+ proxy = ipmr_cache_find_any_parent(mrt,
+ c->mfc_parent);
+ if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+ return c;
+ }
+
+skip:
+ return ipmr_cache_find_any_parent(mrt, vifi);
+}
+
+/*
+ * Allocate a multicast cache entry
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+
+ if (c)
+ c->mfc_un.res.minvif = MAXVIFS;
+ return c;
+}
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+ struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+
+ if (c) {
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ }
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+ struct mfc_cache *uc, struct mfc_cache *c)
+{
+ struct sk_buff *skb;
+ struct nlmsgerr *e;
+
+ /* Play the pending entries through our router */
+
+ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+ if (ip_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+ if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ e = nlmsg_data(nlh);
+ e->error = -EMSGSIZE;
+ memset(&e->msg, 0, sizeof(e->msg));
+ }
+
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else {
+ ip_mr_forward(net, mrt, skb, c, 0);
+ }
+ }
+}
+
+/*
+ * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ * expects the following bizarre scheme.
+ *
+ * Called under mrt_lock.
+ */
+
+static int ipmr_cache_report(struct mr_table *mrt,
+ struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+ struct sk_buff *skb;
+ const int ihl = ip_hdrlen(pkt);
+ struct igmphdr *igmp;
+ struct igmpmsg *msg;
+ struct sock *mroute_sk;
+ int ret;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT)
+ skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+ else
+#endif
+ skb = alloc_skb(128, GFP_ATOMIC);
+
+ if (!skb)
+ return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+ if (assert == IGMPMSG_WHOLEPKT) {
+ /* Ugly, but we have no choice with this interface.
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
+ */
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
+ msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_mbz = 0;
+ msg->im_vif = mrt->mroute_reg_vif_num;
+ ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
+ sizeof(struct iphdr));
+ } else
+#endif
+ {
+
+ /* Copy the IP header */
+
+ skb_set_network_header(skb, skb->len);
+ skb_put(skb, ihl);
+ skb_copy_to_linear_data(skb, pkt->data, ihl);
+ ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ msg->im_vif = vifi;
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+
+ /* Add our header */
+
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp->type =
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ skb->transport_header = skb->network_header;
+ }
+
+ rcu_read_lock();
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* Deliver to mrouted */
+
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
+ rcu_read_unlock();
+ if (ret < 0) {
+ net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/*
+ * Queue a packet for resolution. It gets locked cache entry!
+ */
+
+static int
+ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
+{
+ bool found = false;
+ int err;
+ struct mfc_cache *c;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+ if (c->mfc_mcastgrp == iph->daddr &&
+ c->mfc_origin == iph->saddr) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ /* Create a new entry if allowable */
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+ (c = ipmr_cache_alloc_unres()) == NULL) {
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ /* Fill in the new cache entry */
+
+ c->mfc_parent = -1;
+ c->mfc_origin = iph->saddr;
+ c->mfc_mcastgrp = iph->daddr;
+
+ /* Reflect first query at mrouted. */
+
+ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+ if (err < 0) {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+ */
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ipmr_cache_free(c);
+ kfree_skb(skb);
+ return err;
+ }
+
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->list, &mrt->mfc_unres_queue);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+ mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
+ }
+
+ /* See if we can append the packet */
+
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ } else {
+ skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+ err = 0;
+ }
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
+}
+
+/*
+ * MFC cache manipulation by user space mroute daemon
+ */
+
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
+{
+ int line;
+ struct mfc_cache *c, *next;
+
+ line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+ struct mfcctl *mfc, int mrtsock, int parent)
+{
+ bool found = false;
+ int line;
+ struct mfc_cache *uc, *c;
+
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
+
+ line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
+ (parent == -1 || parent == c->mfc_parent)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ write_lock_bh(&mrt_lock);
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+ write_unlock_bh(&mrt_lock);
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+ }
+
+ if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
+ !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+ return -EINVAL;
+
+ c = ipmr_cache_alloc();
+ if (!c)
+ return -ENOMEM;
+
+ c->mfc_origin = mfc->mfcc_origin.s_addr;
+ c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+
+ list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
+
+ /*
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+ found = false;
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+ if (uc->mfc_origin == c->mfc_origin &&
+ uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+ list_del(&uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
+ break;
+ }
+ }
+ if (list_empty(&mrt->mfc_unres_queue))
+ del_timer(&mrt->ipmr_expire_timer);
+ spin_unlock_bh(&mfc_unres_lock);
+
+ if (found) {
+ ipmr_cache_resolve(net, mrt, uc, c);
+ ipmr_cache_free(uc);
+ }
+ mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct mr_table *mrt)
+{
+ int i;
+ LIST_HEAD(list);
+ struct mfc_cache *c, *next;
+
+ /* Shut down all active vif entries */
+
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (!(mrt->vif_table[i].flags & VIFF_STATIC))
+ vif_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
+
+ /* Wipe the cache */
+
+ for (i = 0; i < MFC_LINES; i++) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
+ if (c->mfc_flags & MFC_STATIC)
+ continue;
+ list_del_rcu(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_cache_free(c);
+ }
+ }
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ mroute_netlink_event(mrt, c, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, c);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ }
+}
+
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
+static void mrtsock_destruct(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ rtnl_lock();
+ ipmr_for_each_table(mrt, net) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+ mroute_clean_tables(mrt);
+ }
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+{
+ int ret, parent = 0;
+ struct vifctl vif;
+ struct mfcctl mfc;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT_INIT) {
+ if (sk != rcu_access_pointer(mrt->mroute_sk) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EACCES;
+ }
+
+ switch (optname) {
+ case MRT_INIT:
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ rtnl_lock();
+ if (rtnl_dereference(mrt->mroute_sk)) {
+ rtnl_unlock();
+ return -EADDRINUSE;
+ }
+
+ ret = ip_ra_control(sk, 1, mrtsock_destruct);
+ if (ret == 0) {
+ rcu_assign_pointer(mrt->mroute_sk, sk);
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+ inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all);
+ }
+ rtnl_unlock();
+ return ret;
+ case MRT_DONE:
+ if (sk != rcu_access_pointer(mrt->mroute_sk))
+ return -EACCES;
+ return ip_ra_control(sk, 0, NULL);
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ if (optlen != sizeof(vif))
+ return -EINVAL;
+ if (copy_from_user(&vif, optval, sizeof(vif)))
+ return -EFAULT;
+ if (vif.vifc_vifi >= MAXVIFS)
+ return -ENFILE;
+ rtnl_lock();
+ if (optname == MRT_ADD_VIF) {
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
+ } else {
+ ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
+ }
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ parent = -1;
+ case MRT_ADD_MFC_PROXY:
+ case MRT_DEL_MFC_PROXY:
+ if (optlen != sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_user(&mfc, optval, sizeof(mfc)))
+ return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mfcc_parent;
+ rtnl_lock();
+ if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
+ ret = ipmr_mfc_delete(mrt, &mfc, parent);
+ else
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk),
+ parent);
+ rtnl_unlock();
+ return ret;
+ /*
+ * Control PIM assert.
+ */
+ case MRT_ASSERT:
+ {
+ int v;
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ mrt->mroute_do_assert = v;
+ return 0;
+ }
+#ifdef CONFIG_IP_PIMSM
+ case MRT_PIM:
+ {
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ v = !!v;
+
+ rtnl_lock();
+ ret = 0;
+ if (v != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = v;
+ mrt->mroute_do_assert = v;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ case MRT_TABLE:
+ {
+ u32 v;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (get_user(v, (u32 __user *)optval))
+ return -EFAULT;
+
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 1000000000)
+ return -EINVAL;
+
+ rtnl_lock();
+ ret = 0;
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ else
+ raw_sk(sk)->ipmr_table = v;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+ /*
+ * Spurious command, or MRT_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
+{
+ int olr;
+ int val;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
+ return -EOPNOTSUPP;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT_VERSION &&
+#ifdef CONFIG_IP_PIMSM
+ optname != MRT_PIM &&
+#endif
+ optname != MRT_ASSERT)
+ return -ENOPROTOOPT;
+
+ if (get_user(olr, optlen))
+ return -EFAULT;
+
+ olr = min_t(unsigned int, olr, sizeof(int));
+ if (olr < 0)
+ return -EINVAL;
+
+ if (put_user(olr, optlen))
+ return -EFAULT;
+ if (optname == MRT_VERSION)
+ val = 0x0305;
+#ifdef CONFIG_IP_PIMSM
+ else if (optname == MRT_PIM)
+ val = mrt->mroute_do_pim;
+#endif
+ else
+ val = mrt->mroute_do_assert;
+ if (copy_to_user(optval, &val, olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+
+int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+ struct sioc_sg_req sr;
+ struct sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req {
+ struct in_addr src;
+ struct in_addr grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_vif_req {
+ vifi_t vifi; /* Which iface */
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req sr;
+ struct compat_sioc_vif_req vr;
+ struct vif_device *vif;
+ struct mfc_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETVIFCNT:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.vifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ rcu_read_unlock();
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct vif_device *v;
+ int ct;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ ipmr_for_each_table(mrt, net) {
+ v = &mrt->vif_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (v->dev == dev)
+ vif_delete(mrt, ct, 1, NULL);
+ }
+ }
+ return NOTIFY_DONE;
+}
+
+
+static struct notifier_block ip_mr_notifier = {
+ .notifier_call = ipmr_device_event,
+};
+
+/*
+ * Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
+ */
+
+static void ip_encap(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr)
+{
+ struct iphdr *iph;
+ const struct iphdr *old_iph = ip_hdr(skb);
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb->transport_header = skb->network_header;
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ iph->version = 4;
+ iph->tos = old_iph->tos;
+ iph->ttl = old_iph->ttl;
+ iph->frag_off = 0;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->protocol = IPPROTO_IPIP;
+ iph->ihl = 5;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(net, skb, NULL);
+ ip_send_check(iph);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ nf_reset(skb);
+}
+
+static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
+
+ if (unlikely(opt->optlen))
+ ip_forward_options(skb);
+
+ return dst_output_sk(sk, skb);
+}
+
+/*
+ * Processing handlers for ipmr_forward
+ */
+
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *c, int vifi)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct vif_device *vif = &mrt->vif_table[vifi];
+ struct net_device *dev;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int encap = 0;
+
+ if (!vif->dev)
+ goto out_free;
+
+#ifdef CONFIG_IP_PIMSM
+ if (vif->flags & VIFF_REGISTER) {
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+ vif->dev->stats.tx_bytes += skb->len;
+ vif->dev->stats.tx_packets++;
+ ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
+ goto out_free;
+ }
+#endif
+
+ if (vif->flags & VIFF_TUNNEL) {
+ rt = ip_route_output_ports(net, &fl4, NULL,
+ vif->remote, vif->local,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
+ goto out_free;
+ encap = sizeof(struct iphdr);
+ } else {
+ rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(iph->tos), vif->link);
+ if (IS_ERR(rt))
+ goto out_free;
+ }
+
+ dev = rt->dst.dev;
+
+ if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ /* Do not fragment multicasts. Alas, IPv4 does not
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
+ */
+
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+
+ if (skb_cow(skb, encap)) {
+ ip_rt_put(rt);
+ goto out_free;
+ }
+
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ ip_decrease_ttl(ip_hdr(skb));
+
+ /* FIXME: forward and output firewalls used to be called here.
+ * What do we do with netfilter? -- RR
+ */
+ if (vif->flags & VIFF_TUNNEL) {
+ ip_encap(net, skb, vif->local, vif->remote);
+ /* FIXME: extra output firewall step used to be here. --RR */
+ vif->dev->stats.tx_packets++;
+ vif->dev->stats.tx_bytes += skb->len;
+ }
+
+ IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+ /*
+ * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ * not only before forwarding, but after forwarding on all output
+ * interfaces. It is clear, if mrouter runs a multicasting
+ * program, it should receive packets not depending to what interface
+ * program is joined.
+ * If we will not make it, the program will have to join on all
+ * interfaces. On the other hand, multihoming host (or router, but
+ * not mrouter) cannot join to more than one interface - it will
+ * result in receiving multiple packets.
+ */
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, NULL, skb,
+ skb->dev, dev,
+ ipmr_forward_finish);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
+static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+{
+ int ct;
+
+ for (ct = mrt->maxvif-1; ct >= 0; ct--) {
+ if (mrt->vif_table[ct].dev == dev)
+ break;
+ }
+ return ct;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+
+static void ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local)
+{
+ int psend = -1;
+ int vif, ct;
+ int true_vifi = ipmr_find_vif(mrt, skb->dev);
+
+ vif = cache->mfc_parent;
+ cache->mfc_un.res.pkt++;
+ cache->mfc_un.res.bytes += skb->len;
+
+ if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+ struct mfc_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incomming
+ * interface is part of the static tree.
+ */
+ cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (mrt->vif_table[vif].dev != skb->dev) {
+ if (rt_is_output_route(skb_rtable(skb))) {
+ /* It is our own packet, looped back.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
+ */
+ goto dont_forward;
+ }
+
+ cache->mfc_un.res.wrong_if++;
+
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
+ /* pimsm uses asserts, when switching from RPT to SPT,
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
+ */
+ (mrt->mroute_do_pim ||
+ cache->mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+ cache->mfc_un.res.last_assert = jiffies;
+ ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ }
+ goto dont_forward;
+ }
+
+forward:
+ mrt->vif_table[vif].pkt_in++;
+ mrt->vif_table[vif].bytes_in += skb->len;
+
+ /*
+ * Forward the frame
+ */
+ if (cache->mfc_origin == htonl(INADDR_ANY) &&
+ cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mfc_parent &&
+ ip_hdr(skb)->ttl >
+ cache->mfc_un.res.ttls[cache->mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = cache->mfc_un.res.maxvif - 1;
+ ct >= cache->mfc_un.res.minvif; ct--) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+ ct != true_vifi) &&
+ ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (skb2)
+ ipmr_queue_xmit(net, mrt, skb2, cache,
+ psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (skb2)
+ ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+ } else {
+ ipmr_queue_xmit(net, mrt, skb, cache, psend);
+ return;
+ }
+ }
+
+dont_forward:
+ if (!local)
+ kfree_skb(skb);
+}
+
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = (rt_is_output_route(rt) ?
+ skb->dev->ifindex : 0),
+ .flowi4_iif = (rt_is_output_route(rt) ?
+ LOOPBACK_IFINDEX :
+ skb->dev->ifindex),
+ .flowi4_mark = skb->mark,
+ };
+ struct mr_table *mrt;
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl4, &mrt);
+ if (err)
+ return ERR_PTR(err);
+ return mrt;
+}
+
+/*
+ * Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+ struct mfc_cache *cache;
+ struct net *net = dev_net(skb->dev);
+ int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ struct mr_table *mrt;
+
+ /* Packet is looped back after forward, it should not be
+ * forwarded second time, but still can be delivered locally.
+ */
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
+ goto dont_forward;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt)) {
+ kfree_skb(skb);
+ return PTR_ERR(mrt);
+ }
+ if (!local) {
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
+ }
+ }
+
+ /* already under rcu_read_lock() */
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
+
+ /*
+ * No usable cache entry
+ */
+ if (!cache) {
+ int vif;
+
+ if (local) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ ip_local_deliver(skb);
+ if (!skb2)
+ return -ENOBUFS;
+ skb = skb2;
+ }
+
+ read_lock(&mrt_lock);
+ vif = ipmr_find_vif(mrt, skb->dev);
+ if (vif >= 0) {
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+ read_unlock(&mrt_lock);
+
+ return err2;
+ }
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ read_lock(&mrt_lock);
+ ip_mr_forward(net, mrt, skb, cache, local);
+ read_unlock(&mrt_lock);
+
+ if (local)
+ return ip_local_deliver(skb);
+
+ return 0;
+
+dont_forward:
+ if (local)
+ return ip_local_deliver(skb);
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
+{
+ struct net_device *reg_dev = NULL;
+ struct iphdr *encap;
+
+ encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+ /*
+ * Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
+ */
+ if (!ipv4_is_multicast(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + pimlen > skb->len)
+ return 1;
+
+ read_lock(&mrt_lock);
+ if (mrt->mroute_reg_vif_num >= 0)
+ reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+ read_unlock(&mrt_lock);
+
+ if (!reg_dev)
+ return 1;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return NET_RX_SUCCESS;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ */
+
+int pim_rcv_v1(struct sk_buff *skb)
+{
+ struct igmphdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+ goto drop;
+
+ pim = igmp_hdr(skb);
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto drop;
+ if (!mrt->mroute_do_pim ||
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
+ goto drop;
+
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
+ }
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff *skb)
+{
+ struct pimreghdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+ goto drop;
+
+ pim = (struct pimreghdr *)skb_transport_header(skb);
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+ csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto drop;
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+ kfree_skb(skb);
+ }
+ return 0;
+}
+#endif
+
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm)
+{
+ int ct;
+ struct rtnexthop *nhp;
+ struct nlattr *mp_attr;
+ struct rta_mfc_stats mfcs;
+
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent >= MAXVIFS)
+ return -ENOENT;
+
+ if (VIF_EXISTS(mrt, c->mfc_parent) &&
+ nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ return -EMSGSIZE;
+
+ if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
+ return -EMSGSIZE;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+ if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+ nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+
+ nla_nest_end(skb, mp_attr);
+
+ mfcs.mfcs_packets = c->mfc_un.res.pkt;
+ mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+ mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ return -EMSGSIZE;
+
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+}
+
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr,
+ struct rtmsg *rtm, int nowait)
+{
+ struct mfc_cache *cache;
+ struct mr_table *mrt;
+ int err;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, saddr, daddr);
+ if (!cache && skb->dev) {
+ int vif = ipmr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, daddr, vif);
+ }
+ if (!cache) {
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+ struct net_device *dev;
+ int vif = -1;
+
+ if (nowait) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
+ read_lock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ skb_push(skb2, sizeof(struct iphdr));
+ skb_reset_network_header(skb2);
+ iph = ip_hdr(skb2);
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+ iph->version = 0;
+ err = ipmr_cache_unresolved(mrt, vif, skb2);
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
+ cache->mfc_flags |= MFC_NOTIFY;
+ err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return err;
+}
+
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mfc_cache *c, int cmd,
+ int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IPMR;
+ rtm->rtm_dst_len = 32;
+ rtm->rtm_src_len = 32;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (c->mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
+ rtm->rtm_flags = 0;
+
+ if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
+ nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
+ goto nla_put_failure;
+ err = __ipmr_fill_mroute(mrt, skb, c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static size_t mroute_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_SRC */
+ + nla_total_size(4) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+}
+
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mr_table *mrt;
+ struct mfc_cache *mfc;
+ unsigned int t = 0, s_t;
+ unsigned int h = 0, s_h;
+ unsigned int e = 0, s_e;
+
+ s_t = cb->args[0];
+ s_h = cb->args[1];
+ s_e = cb->args[2];
+
+ rcu_read_lock();
+ ipmr_for_each_table(mrt, net) {
+ if (t < s_t)
+ goto next_table;
+ if (t > s_t)
+ s_h = 0;
+ for (h = s_h; h < MFC_LINES; h++) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
+next_entry:
+ e++;
+ }
+ e = s_e = 0;
+ }
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0) {
+ spin_unlock_bh(&mfc_unres_lock);
+ goto done;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ e = s_e = 0;
+ s_h = 0;
+next_table:
+ t++;
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[2] = e;
+ cb->args[1] = h;
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+ */
+struct ipmr_vif_iter {
+ struct seq_net_private p;
+ struct mr_table *mrt;
+ int ct;
+};
+
+static struct vif_device *ipmr_vif_seq_idx(struct net *net,
+ struct ipmr_vif_iter *iter,
+ loff_t pos)
+{
+ struct mr_table *mrt = iter->mrt;
+
+ for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ if (pos-- == 0)
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(mrt_lock)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ iter->mrt = mrt;
+
+ read_lock(&mrt_lock);
+ return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = iter->mrt;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ipmr_vif_seq_idx(net, iter, 0);
+
+ while (++iter->ct < mrt->maxvif) {
+ if (!VIF_EXISTS(mrt, iter->ct))
+ continue;
+ return &mrt->vif_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(mrt_lock)
+{
+ read_unlock(&mrt_lock);
+}
+
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct mr_table *mrt = iter->mrt;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
+ } else {
+ const struct vif_device *vif = v;
+ const char *name = vif->dev ? vif->dev->name : "none";
+
+ seq_printf(seq,
+ "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ vif - mrt->vif_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags, vif->local, vif->remote);
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_vif_seq_ops = {
+ .start = ipmr_vif_seq_start,
+ .next = ipmr_vif_seq_next,
+ .stop = ipmr_vif_seq_stop,
+ .show = ipmr_vif_seq_show,
+};
+
+static int ipmr_vif_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipmr_vif_seq_ops,
+ sizeof(struct ipmr_vif_iter));
+}
+
+static const struct file_operations ipmr_vif_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_vif_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct ipmr_mfc_iter {
+ struct seq_net_private p;
+ struct mr_table *mrt;
+ struct list_head *cache;
+ int ct;
+};
+
+
+static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
+ struct ipmr_mfc_iter *it, loff_t pos)
+{
+ struct mr_table *mrt = it->mrt;
+ struct mfc_cache *mfc;
+
+ rcu_read_lock();
+ for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ list_for_each_entry_rcu(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ }
+ rcu_read_unlock();
+
+ spin_lock_bh(&mfc_unres_lock);
+ it->cache = &mrt->mfc_unres_queue;
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ it->cache = NULL;
+ return NULL;
+}
+
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ it->mrt = mrt;
+ it->cache = NULL;
+ it->ct = 0;
+ return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct mfc_cache *mfc = v;
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = it->mrt;
+
+ ++*pos;
+
+ if (v == SEQ_START_TOKEN)
+ return ipmr_mfc_seq_idx(net, seq->private, 0);
+
+ if (mfc->list.next != it->cache)
+ return list_entry(mfc->list.next, struct mfc_cache, list);
+
+ if (it->cache == &mrt->mfc_unres_queue)
+ goto end_of_list;
+
+ BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
+
+ while (++it->ct < MFC_LINES) {
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ if (list_empty(it->cache))
+ continue;
+ return list_first_entry(it->cache, struct mfc_cache, list);
+ }
+
+ /* exhausted cache_array, show unresolved */
+ rcu_read_unlock();
+ it->cache = &mrt->mfc_unres_queue;
+ it->ct = 0;
+
+ spin_lock_bh(&mfc_unres_lock);
+ if (!list_empty(it->cache))
+ return list_first_entry(it->cache, struct mfc_cache, list);
+
+end_of_list:
+ spin_unlock_bh(&mfc_unres_lock);
+ it->cache = NULL;
+
+ return NULL;
+}
+
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct mr_table *mrt = it->mrt;
+
+ if (it->cache == &mrt->mfc_unres_queue)
+ spin_unlock_bh(&mfc_unres_lock);
+ else if (it->cache == &mrt->mfc_cache_array[it->ct])
+ rcu_read_unlock();
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+ int n;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Group Origin Iif Pkts Bytes Wrong Oifs\n");
+ } else {
+ const struct mfc_cache *mfc = v;
+ const struct ipmr_mfc_iter *it = seq->private;
+ const struct mr_table *mrt = it->mrt;
+
+ seq_printf(seq, "%08X %08X %-3hd",
+ (__force u32) mfc->mfc_mcastgrp,
+ (__force u32) mfc->mfc_origin,
+ mfc->mfc_parent);
+
+ if (it->cache != &mrt->mfc_unres_queue) {
+ seq_printf(seq, " %8lu %8lu %8lu",
+ mfc->mfc_un.res.pkt,
+ mfc->mfc_un.res.bytes,
+ mfc->mfc_un.res.wrong_if);
+ for (n = mfc->mfc_un.res.minvif;
+ n < mfc->mfc_un.res.maxvif; n++) {
+ if (VIF_EXISTS(mrt, n) &&
+ mfc->mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d",
+ n, mfc->mfc_un.res.ttls[n]);
+ }
+ } else {
+ /* unresolved mfc_caches don't contain
+ * pkt, bytes and wrong_if values
+ */
+ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+ }
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_mfc_seq_ops = {
+ .start = ipmr_mfc_seq_start,
+ .next = ipmr_mfc_seq_next,
+ .stop = ipmr_mfc_seq_stop,
+ .show = ipmr_mfc_seq_show,
+};
+
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
+ sizeof(struct ipmr_mfc_iter));
+}
+
+static const struct file_operations ipmr_mfc_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_mfc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static const struct net_protocol pim_protocol = {
+ .handler = pim_rcv,
+ .netns_ok = 1,
+};
+#endif
+
+
+/*
+ * Setup for IP multicast routing
+ */
+static int __net_init ipmr_net_init(struct net *net)
+{
+ int err;
+
+ err = ipmr_rules_init(net);
+ if (err < 0)
+ goto fail;
+
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
+ goto proc_vif_fail;
+ if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
+ goto proc_cache_fail;
+#endif
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+proc_vif_fail:
+ ipmr_rules_exit(net);
+#endif
+fail:
+ return err;
+}
+
+static void __net_exit ipmr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_mr_cache", net->proc_net);
+ remove_proc_entry("ip_mr_vif", net->proc_net);
+#endif
+ ipmr_rules_exit(net);
+}
+
+static struct pernet_operations ipmr_net_ops = {
+ .init = ipmr_net_init,
+ .exit = ipmr_net_exit,
+};
+
+int __init ip_mr_init(void)
+{
+ int err;
+
+ mrt_cachep = kmem_cache_create("ip_mrt_cache",
+ sizeof(struct mfc_cache),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ NULL);
+ if (!mrt_cachep)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&ipmr_net_ops);
+ if (err)
+ goto reg_pernet_fail;
+
+ err = register_netdevice_notifier(&ip_mr_notifier);
+ if (err)
+ goto reg_notif_fail;
+#ifdef CONFIG_IP_PIMSM_V2
+ if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
+ pr_err("%s: can't add PIM protocol\n", __func__);
+ err = -EAGAIN;
+ goto add_proto_fail;
+ }
+#endif
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
+ NULL, ipmr_rtm_dumproute, NULL);
+ return 0;
+
+#ifdef CONFIG_IP_PIMSM_V2
+add_proto_fail:
+ unregister_netdevice_notifier(&ip_mr_notifier);
+#endif
+reg_notif_fail:
+ unregister_pernet_subsys(&ipmr_net_ops);
+reg_pernet_fail:
+ kmem_cache_destroy(mrt_cachep);
+ return err;
+}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 000000000..65de0684e
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,207 @@
+/*
+ * IPv4 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi4 fl4 = {};
+ __be32 saddr = iph->saddr;
+ __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+ unsigned int hh_len;
+
+ if (addr_type == RTN_UNSPEC)
+ addr_type = inet_addr_type(net, saddr);
+ if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+ flags |= FLOWI_FLAG_ANYSRC;
+ else
+ saddr = 0;
+
+ /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+ * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
+ */
+ fl4.daddr = iph->daddr;
+ fl4.saddr = saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_flags = flags;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+
+ if (skb_dst(skb)->error)
+ return skb_dst(skb)->error;
+
+#ifdef CONFIG_XFRM
+ if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+ struct dst_entry *dst = skb_dst(skb);
+ skb_dst_set(skb, NULL);
+ dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ skb_dst_set(skb, dst);
+ }
+#endif
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+ __be32 daddr;
+ __be32 saddr;
+ u_int8_t tos;
+ u_int32_t mark;
+};
+
+static void nf_ip_saveroute(const struct sk_buff *skb,
+ struct nf_queue_entry *entry)
+{
+ struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ rt_info->tos = iph->tos;
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ rt_info->mark = skb->mark;
+ }
+}
+
+static int nf_ip_reroute(struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
+{
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(skb, RTN_UNSPEC);
+ }
+ return 0;
+}
+
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u_int8_t protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ !csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - dataoff, protocol,
+ skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ if (protocol == 0)
+ skb->csum = 0;
+ else
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb->len - dataoff,
+ protocol, 0);
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u_int8_t protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+ skb->len - dataoff, 0);
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+}
+
+static int nf_ip_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict __always_unused)
+{
+ struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+ *dst = &rt->dst;
+ return 0;
+}
+
+static const struct nf_afinfo nf_ip_afinfo = {
+ .family = AF_INET,
+ .checksum = nf_ip_checksum,
+ .checksum_partial = nf_ip_checksum_partial,
+ .route = nf_ip_route,
+ .saveroute = nf_ip_saveroute,
+ .reroute = nf_ip_reroute,
+ .route_key_size = sizeof(struct ip_rt_info),
+};
+
+static int __init ipv4_netfilter_init(void)
+{
+ return nf_register_afinfo(&nf_ip_afinfo);
+}
+
+static void __exit ipv4_netfilter_fini(void)
+{
+ nf_unregister_afinfo(&nf_ip_afinfo);
+}
+
+module_init(ipv4_netfilter_init);
+module_exit(ipv4_netfilter_fini);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 000000000..fb20f3631
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,406 @@
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+ depends on INET && NETFILTER
+
+config NF_DEFRAG_IPV4
+ tristate
+ default n
+
+config NF_CONNTRACK_IPV4
+ tristate "IPv4 connection tracking support (required for NAT)"
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ select NF_DEFRAG_IPV4
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv4 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_PROC_COMPAT
+ bool "proc/sysctl compatibility with old connection tracking"
+ depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
+ default y
+ help
+ This option enables /proc and sysctl compatibility with the old
+ layer 3 dependent connection tracking. This is needed to keep
+ old programs that have not been adapted to the new names working.
+
+ If unsure, say Y.
+
+if NF_TABLES
+
+config NF_TABLES_IPV4
+ tristate "IPv4 nf_tables support"
+ help
+ This option enables the IPv4 support for nf_tables.
+
+if NF_TABLES_IPV4
+
+config NFT_CHAIN_ROUTE_IPV4
+ tristate "IPv4 nf_tables route chain support"
+ help
+ This option enables the "route" chain for IPv4 in nf_tables. This
+ chain type is used to force packet re-routing after mangling header
+ fields such as the source, destination, type of service and
+ the packet mark.
+
+config NFT_REJECT_IPV4
+ select NF_REJECT_IPV4
+ default NFT_REJECT
+ tristate
+
+endif # NF_TABLES_IPV4
+
+config NF_TABLES_ARP
+ tristate "ARP nf_tables support"
+ help
+ This option enables the ARP support for nf_tables.
+
+endif # NF_TABLES
+
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
+config NF_NAT_IPV4
+ tristate "IPv4 NAT"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ help
+ The IPv4 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
+if NF_NAT_IPV4
+
+config NFT_CHAIN_NAT_IPV4
+ depends on NF_TABLES_IPV4
+ tristate "IPv4 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv4 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NF_NAT_MASQUERADE_IPV4
+ tristate "IPv4 masquerade support"
+ help
+ This is the kernel functionality to provide NAT in the masquerade
+ flavour (automatic source address selection).
+
+config NFT_MASQ_IPV4
+ tristate "IPv4 masquerading support for nf_tables"
+ depends on NF_TABLES_IPV4
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV4
+ help
+ This is the expression that provides IPv4 masquerading support for
+ nf_tables.
+
+config NFT_REDIR_IPV4
+ tristate "IPv4 redirect support for nf_tables"
+ depends on NF_TABLES_IPV4
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv4 redirect support for
+ nf_tables.
+
+config NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support"
+ depends on NF_CONNTRACK_SNMP
+ depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
+ ---help---
+
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
+
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_NAT_PROTO_GRE
+ tristate
+ depends on NF_CT_PROTO_GRE
+
+config NF_NAT_PPTP
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_PPTP
+ select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_H323
+
+endif # NF_NAT_IPV4
+
+config IP_NF_IPTABLES
+ tristate "IP tables support (required for filtering/masq/NAT)"
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_XTABLES
+ help
+ iptables is a general, extensible packet identification framework.
+ The packet filtering and full NAT (masquerading, port forwarding,
+ etc) subsystems now use this: say `Y' or `M' here if you want to use
+ either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_IPTABLES
+
+# The matches.
+config IP_NF_MATCH_AH
+ tristate '"ah" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of SPIs
+ inside AH header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_ECN
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_ECN.
+
+config IP_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
+ ---help---
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
+
+ To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ipt_rpfilter.
+
+config IP_NF_MATCH_TTL
+ tristate '"ttl" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_HL
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+ tristate "Packet filtering"
+ default m if NETFILTER_ADVANCED=n
+ help
+ Packet filtering defines a table `filter', which has a series of
+ rules for simple packet filtering at local input, forwarding and
+ local output. See the man page for iptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP_NF_FILTER
+ select NF_REJECT_IPV4
+ default m if NETFILTER_ADVANCED=n
+ help
+ The REJECT target allows a filtering rule to specify that an ICMP
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# NAT + specific targets: nf_conntrack
+config IP_NF_NAT
+ tristate "iptables NAT support"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ select NF_NAT_IPV4
+ select NETFILTER_XT_NAT
+ help
+ This enables the `nat' table in iptables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_NAT
+
+config IP_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ select NF_NAT_MASQUERADE_IPV4
+ default m if NETFILTER_ADVANCED=n
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_NETMAP
+ tristate "NETMAP target support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_NETMAP
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_NETMAP.
+
+config IP_NF_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_REDIRECT
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_REDIRECT.
+
+endif # IP_NF_NAT
+
+# mangle + specific targets
+config IP_NF_MANGLE
+ tristate "Packet mangling"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `mangle' table to iptables: see the man page for
+ iptables(8). This table is used for various packet alterations
+ which can effect how the packet is routed.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CLUSTERIP
+ tristate "CLUSTERIP target support"
+ depends on IP_NF_MANGLE
+ depends on NF_CONNTRACK_IPV4
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_MARK
+ help
+ The CLUSTERIP target allows you to build load-balancing clusters of
+ network servers without having a dedicated load-balancing
+ router/server/switch.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_ECN
+ tristate "ECN target support"
+ depends on IP_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a `ECN' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to remove the ECN bits from the IPv4 header of
+ an IP packet. This is particularly useful, if you need to work around
+ existing ECN blackholes on the internet, but don't want to disable
+ ECN support in general.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_TTL
+ tristate '"TTL" target support'
+ depends on NETFILTER_ADVANCED && IP_NF_MANGLE
+ select NETFILTER_XT_TARGET_HL
+ ---help---
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
+
+# raw + specific targets
+config IP_NF_RAW
+ tristate 'raw table support (required for NOTRACK/TRACE)'
+ help
+ This option adds a `raw' table to iptables. This table is the very
+ first in the netfilter framework and hooks in at the PREROUTING
+ and OUTPUT chains.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+# security table for MAC policy
+config IP_NF_SECURITY
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
+
+endif # IP_NF_IPTABLES
+
+# ARP tables
+config IP_NF_ARPTABLES
+ tristate "ARP tables support"
+ select NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ help
+ arptables is a general, extensible packet identification framework.
+ The ARP packet filtering and mangling (manipulation)subsystems
+ use this: say Y or M here if you want to use either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_NF_ARPTABLES
+
+config IP_NF_ARPFILTER
+ tristate "ARP packet filtering"
+ help
+ ARP packet filtering defines a table `filter', which has a series of
+ rules for simple ARP packet filtering at local input and
+ local output. On a bridge, you can also specify filtering rules
+ for forwarded ARP packets. See the man page for arptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+ tristate "ARP payload mangling"
+ help
+ Allows altering the ARP packet payload: source and destination
+ hardware and network addresses.
+
+endif # IP_NF_ARPTABLES
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000..7fe6c7035
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,72 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
+ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
+ifeq ($(CONFIG_PROC_FS),y)
+nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
+endif
+endif
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
+
+nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
+
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+
+# logging
+obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
+obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
+
+# NAT helpers (nf_conntrack)
+obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
+obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
+obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
+
+obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
+obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
+obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
+obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
+obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the three instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
+obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000..a61200754
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1926 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net>
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x) WARN_ON(!(x))
+#else
+#define ARP_NF_ASSERT(x)
+#endif
+
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(arpt, ARPT);
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+ const char *hdr_addr, int len)
+{
+ int i, ret;
+
+ if (len > ARPT_DEV_ADDR_LEN_MAX)
+ len = ARPT_DEV_ADDR_LEN_MAX;
+
+ ret = 0;
+ for (i = 0; i < len; i++)
+ ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
+
+ return ret != 0;
+}
+
+/*
+ * Unfortunately, _b and _mask are not aligned to an int (or long int)
+ * Some arches dont care, unrolling the loop is a win on them.
+ * For other arches, we only have a 16bit alignement.
+ */
+static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ unsigned long ret = ifname_compare_aligned(_a, _b, _mask);
+#else
+ unsigned long ret = 0;
+ const u16 *a = (const u16 *)_a;
+ const u16 *b = (const u16 *)_b;
+ const u16 *mask = (const u16 *)_mask;
+ int i;
+
+ for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
+ ret |= (a[i] ^ b[i]) & mask[i];
+#endif
+ return ret;
+}
+
+/* Returns whether packet matches rule or not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+ struct net_device *dev,
+ const char *indev,
+ const char *outdev,
+ const struct arpt_arp *arpinfo)
+{
+ const char *arpptr = (char *)(arphdr + 1);
+ const char *src_devaddr, *tgt_devaddr;
+ __be32 src_ipaddr, tgt_ipaddr;
+ long ret;
+
+#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
+
+ if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+ ARPT_INV_ARPOP)) {
+ dprintf("ARP operation field mismatch.\n");
+ dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+ arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+ ARPT_INV_ARPHRD)) {
+ dprintf("ARP hardware address format mismatch.\n");
+ dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+ arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+ ARPT_INV_ARPPRO)) {
+ dprintf("ARP protocol address format mismatch.\n");
+ dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+ arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ return 0;
+ }
+
+ if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+ ARPT_INV_ARPHLN)) {
+ dprintf("ARP hardware address length mismatch.\n");
+ dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+ arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ return 0;
+ }
+
+ src_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&src_ipaddr, arpptr, sizeof(u32));
+ arpptr += sizeof(u32);
+ tgt_devaddr = arpptr;
+ arpptr += dev->addr_len;
+ memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+ if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+ ARPT_INV_SRCDEVADDR) ||
+ FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+ ARPT_INV_TGTDEVADDR)) {
+ dprintf("Source or target device address mismatch.\n");
+
+ return 0;
+ }
+
+ if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+ ARPT_INV_SRCIP) ||
+ FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+ ARPT_INV_TGTIP)) {
+ dprintf("Source or target IP address mismatch.\n");
+
+ dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+ &src_ipaddr,
+ &arpinfo->smsk.s_addr,
+ &arpinfo->src.s_addr,
+ arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+ &tgt_ipaddr,
+ &arpinfo->tmsk.s_addr,
+ &arpinfo->tgt.s_addr,
+ arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Look for ifname matches. */
+ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, arpinfo->iniface,
+ arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+ return 0;
+ }
+
+ ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
+
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, arpinfo->outiface,
+ arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+ return 0;
+ }
+
+ return 1;
+#undef FWINV
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+ if (arp->flags & ~ARPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ arp->flags & ~ARPT_F_MASK);
+ return 0;
+ }
+ if (arp->invflags & ~ARPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ arp->invflags & ~ARPT_INV_MASK);
+ return 0;
+ }
+
+ return 1;
+}
+
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ net_err_ratelimited("arp_tables: error: '%s'\n",
+ (const char *)par->targinfo);
+
+ return NF_DROP;
+}
+
+static inline const struct xt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+ return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct arpt_entry *)(base + offset);
+}
+
+static inline __pure
+struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+unsigned int arpt_do_table(struct sk_buff *skb,
+ unsigned int hook,
+ const struct nf_hook_state *state,
+ struct xt_table *table)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ unsigned int verdict = NF_DROP;
+ const struct arphdr *arp;
+ struct arpt_entry *e, *back;
+ const char *indev, *outdev;
+ void *table_base;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
+
+ if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+ return NF_DROP;
+
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
+ table_base = private->entries[smp_processor_id()];
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+ back = get_entry(table_base, private->underflow[hook]);
+
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.hooknum = hook;
+ acpar.family = NFPROTO_ARP;
+ acpar.hotdrop = false;
+
+ arp = arp_hdr(skb);
+ do {
+ const struct xt_entry_target *t;
+
+ if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
+ e = arpt_next_entry(e);
+ continue;
+ }
+
+ ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+
+ t = arpt_get_target_c(e);
+
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
+ }
+ e = back;
+ back = get_entry(table_base, back->comefrom);
+ continue;
+ }
+ if (table_base + v
+ != arpt_next_entry(e)) {
+ /* Save old back ptr in next entry */
+ struct arpt_entry *next = arpt_next_entry(e);
+ next->comefrom = (void *)back - table_base;
+
+ /* set back pointer to next entry */
+ back = next;
+ }
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ /* Targets which reenter must return
+ * abs. verdicts
+ */
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.kernel.target->target(skb, &acpar);
+
+ /* Target might have changed stuff. */
+ arp = arp_hdr(skb);
+
+ if (verdict == XT_CONTINUE)
+ e = arpt_next_entry(e);
+ else
+ /* Verdict */
+ break;
+ } while (!acpar.hotdrop);
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else
+ return verdict;
+}
+
+/* All zeroes == unconditional rule. */
+static inline bool unconditional(const struct arpt_arp *arp)
+{
+ static const struct arpt_arp uncond;
+
+ return memcmp(arp, &uncond, sizeof(uncond)) == 0;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops. Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ * to 0 as we leave), and comefrom to save source hook bitmask.
+ */
+ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct arpt_entry *e
+ = (struct arpt_entry *)(entry0 + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ const struct xt_standard_target *t
+ = (void *)arpt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
+
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+ pr_notice("arptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom
+ |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if ((e->target_offset == sizeof(struct arpt_entry) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->arp)) ||
+ visited) {
+ unsigned int oldpos, size;
+
+ if ((strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("mark_source_chains: bad "
+ "negative verdict (%i)\n",
+ t->verdict);
+ return 0;
+ }
+
+ /* Return: backtrack through the last
+ * big jump.
+ */
+ do {
+ e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct arpt_entry *)
+ (entry0 + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct arpt_entry *)
+ (entry0 + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
+ if (newpos > newinfo->size -
+ sizeof(struct arpt_entry)) {
+ duprintf("mark_source_chains: "
+ "bad verdict (%i)\n",
+ newpos);
+ return 0;
+ }
+
+ /* This a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct arpt_entry *)
+ (entry0 + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static inline int check_entry(const struct arpt_entry *e, const char *name)
+{
+ const struct xt_entry_target *t;
+
+ if (!arp_checkentry(&e->arp)) {
+ duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
+ return -EINVAL;
+
+ t = arpt_get_target_c(e);
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline int check_target(struct arpt_entry *e, const char *name)
+{
+ struct xt_entry_target *t = arpt_get_target(e);
+ int ret;
+ struct xt_tgchk_param par = {
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_ARP,
+ };
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
+ if (ret < 0) {
+ duprintf("arp_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ return ret;
+ }
+ return 0;
+}
+
+static inline int
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ int ret;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ t = arpt_get_target(e);
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+ ret = PTR_ERR(target);
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ ret = check_target(e, name);
+ if (ret)
+ goto err;
+ return 0;
+err:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static bool check_underflow(const struct arpt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(&e->arp))
+ return false;
+ t = arpt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+ struct xt_table_info *newinfo,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int valid_hooks)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e)) {
+ pr_err("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
+ return -EINVAL;
+ }
+ newinfo->underflow[h] = underflows[h];
+ }
+ }
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ e->comefrom = 0;
+ return 0;
+}
+
+static inline void cleanup_entry(struct arpt_entry *e)
+{
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+
+ t = arpt_get_target(e);
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_ARP;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ */
+static int translate_table(struct xt_table_info *newinfo, void *entry0,
+ const struct arpt_replace *repl)
+{
+ struct arpt_entry *iter;
+ unsigned int i;
+ int ret = 0;
+
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(arpt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+ if (ret != 0)
+ return ret;
+
+ if (i != repl->num_entries) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, repl->num_entries);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(repl->valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, repl->hook_entry[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, repl->underflow[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
+ duprintf("Looping hook\n");
+ return -ELOOP;
+ }
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
+
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter);
+ }
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i) {
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
+ }
+
+ return ret;
+}
+
+static void get_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct arpt_entry *iter;
+ unsigned int cpu;
+ unsigned int i;
+
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+ i = 0;
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i;
+ }
+ }
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+ unsigned int countersize;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ * (other than comefrom, which userspace doesn't care
+ * about).
+ */
+ countersize = sizeof(struct xt_counters) * private->number;
+ counters = vzalloc(countersize);
+
+ if (counters == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ get_counters(private, counters);
+
+ return counters;
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct arpt_entry *e;
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+ int ret = 0;
+ void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ const struct xt_entry_target *t;
+
+ e = (struct arpt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct arpt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ t = arpt_get_target_c(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct xt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(NFPROTO_ARP, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(NFPROTO_ARP, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct arpt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - base;
+
+ t = arpt_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct arpt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct arpt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct arpt_entry *iter;
+ void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct arpt_getinfo)) {
+ duprintf("length %u != %Zu\n", *len,
+ sizeof(struct arpt_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(NFPROTO_ARP);
+#endif
+ t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+ "arptable_%s", name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct arpt_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
+ if (compat) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(NFPROTO_ARP);
+#endif
+ return ret;
+}
+
+static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
+ const int *len)
+{
+ int ret;
+ struct arpt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct arpt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %Zu\n", *len,
+ sizeof(struct arpt_get_entries) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+
+ duprintf("t->private->number = %u\n",
+ private->number);
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int __do_replace(struct net *net, const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info *newinfo,
+ unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ void *loc_cpu_old_entry;
+ struct arpt_entry *iter;
+
+ ret = 0;
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+ "arptable_%s", name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters, and synchronize with replace */
+ get_counters(oldinfo, counters);
+
+ /* Decrease module usage counts and free resource */
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter);
+
+ xt_free_table_info(oldinfo);
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+ }
+ vfree(counters);
+ xt_table_unlock(t);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+ vfree(counters);
+ out:
+ return ret;
+}
+
+static int do_replace(struct net *net, const void __user *user,
+ unsigned int len)
+{
+ int ret;
+ struct arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("arp_tables: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
+{
+ unsigned int i, curcpu;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ const char *name;
+ int size;
+ void *ptmp;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+ unsigned int addend;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
+
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len - size);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = xt_find_table_lock(net, NFPROTO_ARP, name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+ local_bh_disable();
+ private = t->private;
+ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ /* Choose the copy that is on our node */
+ curcpu = smp_processor_id();
+ loc_cpu_entry = private->entries[curcpu];
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
+ unlock_up_free:
+ local_bh_enable();
+ xt_table_unlock(t);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static inline void compat_release_entry(struct compat_arpt_entry *e)
+{
+ struct xt_entry_target *t;
+
+ t = compat_arpt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static inline int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ const char *name)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_arpt_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct arpt_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+ entry_offset = (void *)e - (void *)base;
+
+ t = compat_arpt_get_target(e);
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = PTR_ERR(target);
+ goto out;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+ if (ret)
+ goto release_target;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+ return 0;
+
+release_target:
+ module_put(t->u.kernel.target->me);
+out:
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ struct arpt_entry *de;
+ unsigned int origsize;
+ int ret, h;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct arpt_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct arpt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct arpt_entry);
+ *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_arpt_get_target(e);
+ target = t->u.kernel.target;
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static int translate_compat_table(const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_arpt_entry *iter0;
+ struct arpt_entry *iter1;
+ unsigned int size;
+ int ret = 0;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = check_target(iter1, name);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(arpt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
+ j -= i;
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1);
+ }
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ xt_compat_unlock(NFPROTO_ARP);
+ goto out;
+}
+
+struct compat_arpt_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_ARP_NUMHOOKS];
+ u32 underflow[NF_ARP_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters;
+ struct compat_arpt_entry entries[0];
+};
+
+static int compat_do_replace(struct net *net, void __user *user,
+ unsigned int len)
+{
+ int ret;
+ struct compat_arpt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct arpt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = compat_do_replace(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 1);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+ compat_uint_t *size,
+ struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_arpt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ int ret;
+
+ origsize = *size;
+ ce = (struct compat_arpt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_arpt_entry);
+ *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+
+ target_offset = e->target_offset - (origsize - *size);
+
+ t = arpt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_entries_to_user(unsigned int total_size,
+ struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct arpt_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+ vfree(counters);
+ return ret;
+}
+
+struct compat_arpt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_arpt_entry entrytable[0];
+};
+
+static int compat_get_entries(struct net *net,
+ struct compat_arpt_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_arpt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(NFPROTO_ARP);
+ t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ xt_compat_flush_offsets(NFPROTO_ARP);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ xt_compat_unlock(NFPROTO_ARP);
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *, int, void __user *, int *);
+
+static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
+ int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 1);
+ break;
+ case ARPT_SO_GET_ENTRIES:
+ ret = compat_get_entries(sock_net(sk), user, len);
+ break;
+ default:
+ ret = do_arpt_get_ctl(sk, cmd, user, len);
+ }
+ return ret;
+}
+#endif
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_SET_REPLACE:
+ ret = do_replace(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 0);
+ break;
+
+ default:
+ duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case ARPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 0);
+ break;
+
+ case ARPT_SO_GET_ENTRIES:
+ ret = get_entries(sock_net(sk), user, len);
+ break;
+
+ case ARPT_SO_GET_REVISION_TARGET: {
+ struct xt_get_revision rev;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ rev.name[sizeof(rev.name)-1] = 0;
+
+ try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
+ rev.revision, 1, &ret),
+ "arpt_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct xt_table *arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl)
+{
+ int ret;
+ struct xt_table_info *newinfo;
+ struct xt_table_info bootstrap = {0};
+ void *loc_cpu_entry;
+ struct xt_table *new_table;
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+ ret = translate_table(newinfo, loc_cpu_entry, repl);
+ duprintf("arpt_register_table: translate table gives %d\n", ret);
+ if (ret != 0)
+ goto out_free;
+
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ ret = PTR_ERR(new_table);
+ goto out_free;
+ }
+ return new_table;
+
+out_free:
+ xt_free_table_info(newinfo);
+out:
+ return ERR_PTR(ret);
+}
+
+void arpt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_ARP,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = arpt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_ARP,
+ },
+};
+
+static struct nf_sockopt_ops arpt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = ARPT_BASE_CTL,
+ .set_optmax = ARPT_SO_SET_MAX+1,
+ .set = do_arpt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_arpt_set_ctl,
+#endif
+ .get_optmin = ARPT_BASE_CTL,
+ .get_optmax = ARPT_SO_GET_MAX+1,
+ .get = do_arpt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_arpt_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static int __net_init arp_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_ARP);
+}
+
+static void __net_exit arp_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_ARP);
+}
+
+static struct pernet_operations arp_tables_net_ops = {
+ .init = arp_tables_net_init,
+ .exit = arp_tables_net_exit,
+};
+
+static int __init arp_tables_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&arp_tables_net_ops);
+ if (ret < 0)
+ goto err1;
+
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+ if (ret < 0)
+ goto err2;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&arpt_sockopts);
+ if (ret < 0)
+ goto err4;
+
+ printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n");
+ return 0;
+
+err4:
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+err2:
+ unregister_pernet_subsys(&arp_tables_net_ops);
+err1:
+ return ret;
+}
+
+static void __exit arp_tables_fini(void)
+{
+ nf_unregister_sockopt(&arpt_sockopts);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+ unregister_pernet_subsys(&arp_tables_net_ops);
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+
+module_init(arp_tables_init);
+module_exit(arp_tables_fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 000000000..a5e52a9f0
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,91 @@
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+static unsigned int
+target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct arpt_mangle *mangle = par->targinfo;
+ const struct arphdr *arp;
+ unsigned char *arpptr;
+ int pln, hln;
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ arp = arp_hdr(skb);
+ arpptr = skb_network_header(skb) + sizeof(*arp);
+ pln = arp->ar_pln;
+ hln = arp->ar_hln;
+ /* We assume that pln and hln were checked in the match */
+ if (mangle->flags & ARPT_MANGLE_SDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, mangle->src_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_SIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_s.src_ip, pln);
+ }
+ arpptr += pln;
+ if (mangle->flags & ARPT_MANGLE_TDEV) {
+ if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+ (arpptr + hln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, mangle->tgt_devaddr, hln);
+ }
+ arpptr += hln;
+ if (mangle->flags & ARPT_MANGLE_TIP) {
+ if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+ (arpptr + pln > skb_tail_pointer(skb)))
+ return NF_DROP;
+ memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
+ }
+ return mangle->target;
+}
+
+static int checkentry(const struct xt_tgchk_param *par)
+{
+ const struct arpt_mangle *mangle = par->targinfo;
+
+ if (mangle->flags & ~ARPT_MANGLE_MASK ||
+ !(mangle->flags & ARPT_MANGLE_MASK))
+ return -EINVAL;
+
+ if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
+ mangle->target != XT_CONTINUE)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target arpt_mangle_reg __read_mostly = {
+ .name = "mangle",
+ .family = NFPROTO_ARP,
+ .target = target,
+ .targetsize = sizeof(struct arpt_mangle),
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init arpt_mangle_init(void)
+{
+ return xt_register_target(&arpt_mangle_reg);
+}
+
+static void __exit arpt_mangle_fini(void)
+{
+ xt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(arpt_mangle_init);
+module_exit(arpt_mangle_fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000..93876d031
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,91 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+ (1 << NF_ARP_FORWARD))
+
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_ARP,
+ .priority = NF_IP_PRI_FILTER,
+};
+
+/* The work comes in here from netfilter.c */
+static unsigned int
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net = dev_net(state->in ? state->in : state->out);
+
+ return arpt_do_table(skb, ops->hooknum, state,
+ net->ipv4.arptable_filter);
+}
+
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
+
+static int __net_init arptable_filter_net_init(struct net *net)
+{
+ struct arpt_replace *repl;
+
+ repl = arpt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.arptable_filter =
+ arpt_register_table(net, &packet_filter, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+}
+
+static void __net_exit arptable_filter_net_exit(struct net *net)
+{
+ arpt_unregister_table(net->ipv4.arptable_filter);
+}
+
+static struct pernet_operations arptable_filter_net_ops = {
+ .init = arptable_filter_net_init,
+ .exit = arptable_filter_net_exit,
+};
+
+static int __init arptable_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&arptable_filter_net_ops);
+ if (ret < 0)
+ return ret;
+
+ arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops)) {
+ ret = PTR_ERR(arpfilter_ops);
+ goto cleanup_table;
+ }
+ return ret;
+
+cleanup_table:
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+ return ret;
+}
+
+static void __exit arptable_filter_fini(void)
+{
+ xt_hook_unlink(&packet_filter, arpfilter_ops);
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+}
+
+module_init(arptable_filter_init);
+module_exit(arptable_filter_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000..2d0e265fe
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,2282 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) pr_info(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) pr_info(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) WARN_ON(!(x))
+#else
+#define IP_NF_ASSERT(x)
+#endif
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
+/* Returns whether matches rule or not. */
+/* Performance critical - called for every packet */
+static inline bool
+ip_packet_match(const struct iphdr *ip,
+ const char *indev,
+ const char *outdev,
+ const struct ipt_ip *ipinfo,
+ int isfrag)
+{
+ unsigned long ret;
+
+#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
+
+ if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+ IPT_INV_SRCIP) ||
+ FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+ IPT_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+
+ dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+ &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
+ ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+ &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
+ ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+ return false;
+ }
+
+ ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
+
+ if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, ipinfo->iniface,
+ ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+ return false;
+ }
+
+ ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
+
+ if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, ipinfo->outiface,
+ ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+ return false;
+ }
+
+ /* Check specific protocol */
+ if (ipinfo->proto &&
+ FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+ dprintf("Packet protocol %hi does not match %hi.%s\n",
+ ip->protocol, ipinfo->proto,
+ ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+ return false;
+ }
+
+ /* If we have a fragment rule but the packet is not a fragment
+ * then we return zero */
+ if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+ dprintf("Fragment rule but not fragment.%s\n",
+ ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+ip_checkentry(const struct ipt_ip *ip)
+{
+ if (ip->flags & ~IPT_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ ip->flags & ~IPT_F_MASK);
+ return false;
+ }
+ if (ip->invflags & ~IPT_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ ip->invflags & ~IPT_INV_MASK);
+ return false;
+ }
+ return true;
+}
+
+static unsigned int
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
+
+ return NF_DROP;
+}
+
+/* Performance critical */
+static inline struct ipt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct ipt_entry *)(base + offset);
+}
+
+/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ipt_ip *ip)
+{
+ static const struct ipt_ip uncond;
+
+ return memcmp(ip, &uncond, sizeof(uncond)) == 0;
+#undef FWINV
+}
+
+/* for const-correctness */
+static inline const struct xt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+ return ipt_get_target((struct ipt_entry *)e);
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+ NF_IP_TRACE_COMMENT_RULE,
+ NF_IP_TRACE_COMMENT_RETURN,
+ NF_IP_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {
+ [NF_IP_TRACE_COMMENT_RULE] = "rule",
+ [NF_IP_TRACE_COMMENT_RETURN] = "return",
+ [NF_IP_TRACE_COMMENT_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 4,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
+ const char *hookname, const char **chainname,
+ const char **comment, unsigned int *rulenum)
+{
+ const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
+
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+ /* Head of user chain: ERROR target with chainname */
+ *chainname = t->target.data;
+ (*rulenum) = 0;
+ } else if (s == e) {
+ (*rulenum)++;
+
+ if (s->target_offset == sizeof(struct ipt_entry) &&
+ strcmp(t->target.u.kernel.target->name,
+ XT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0 &&
+ unconditional(&s->ip)) {
+ /* Tail of chains: STANDARD target (return/policy) */
+ *comment = *chainname == hookname
+ ? comments[NF_IP_TRACE_COMMENT_POLICY]
+ : comments[NF_IP_TRACE_COMMENT_RETURN];
+ }
+ return 1;
+ } else
+ (*rulenum)++;
+
+ return 0;
+}
+
+static void trace_packet(const struct sk_buff *skb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *tablename,
+ const struct xt_table_info *private,
+ const struct ipt_entry *e)
+{
+ const void *table_base;
+ const struct ipt_entry *root;
+ const char *hookname, *chainname, *comment;
+ const struct ipt_entry *iter;
+ unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
+
+ table_base = private->entries[smp_processor_id()];
+ root = get_entry(table_base, private->hook_entry[hook]);
+
+ hookname = chainname = hooknames[hook];
+ comment = comments[NF_IP_TRACE_COMMENT_RULE];
+
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
+
+ nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline __pure
+struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+unsigned int
+ipt_do_table(struct sk_buff *skb,
+ unsigned int hook,
+ const struct nf_hook_state *state,
+ struct xt_table *table)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ const struct iphdr *ip;
+ /* Initializing verdict to NF_DROP keeps gcc happy. */
+ unsigned int verdict = NF_DROP;
+ const char *indev, *outdev;
+ const void *table_base;
+ struct ipt_entry *e, **jumpstack;
+ unsigned int *stackptr, origptr, cpu;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
+
+ /* Initialization */
+ ip = ip_hdr(skb);
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know, ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.thoff = ip_hdrlen(skb);
+ acpar.hotdrop = false;
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.family = NFPROTO_IPV4;
+ acpar.hooknum = hook;
+
+ IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = table->private;
+ cpu = smp_processor_id();
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
+ table_base = private->entries[cpu];
+ jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
+ stackptr = per_cpu_ptr(private->stackptr, cpu);
+ origptr = *stackptr;
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+
+ pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
+ table->name, hook, origptr,
+ get_entry(table_base, private->underflow[hook]));
+
+ do {
+ const struct xt_entry_target *t;
+ const struct xt_entry_match *ematch;
+
+ IP_NF_ASSERT(e);
+ if (!ip_packet_match(ip, indev, outdev,
+ &e->ip, acpar.fragoff)) {
+ no_match:
+ e = ipt_next_entry(e);
+ continue;
+ }
+
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
+ goto no_match;
+ }
+
+ ADD_COUNTER(e->counters, skb->len, 1);
+
+ t = ipt_get_target(e);
+ IP_NF_ASSERT(t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+ /* The packet is traced: log it */
+ if (unlikely(skb->nf_trace))
+ trace_packet(skb, hook, state->in, state->out,
+ table->name, private, e);
+#endif
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
+ }
+ if (*stackptr <= origptr) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ pr_debug("Underflow (this is normal) "
+ "to %p\n", e);
+ } else {
+ e = jumpstack[--*stackptr];
+ pr_debug("Pulled %p out from pos %u\n",
+ e, *stackptr);
+ e = ipt_next_entry(e);
+ }
+ continue;
+ }
+ if (table_base + v != ipt_next_entry(e) &&
+ !(e->ip.flags & IPT_F_GOTO)) {
+ if (*stackptr >= private->stacksize) {
+ verdict = NF_DROP;
+ break;
+ }
+ jumpstack[(*stackptr)++] = e;
+ pr_debug("Pushed %p into pos %u\n",
+ e, *stackptr - 1);
+ }
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+
+ verdict = t->u.kernel.target->target(skb, &acpar);
+ /* Target might have changed stuff. */
+ ip = ip_hdr(skb);
+ if (verdict == XT_CONTINUE)
+ e = ipt_next_entry(e);
+ else
+ /* Verdict */
+ break;
+ } while (!acpar.hotdrop);
+ pr_debug("Exiting %s; resetting sp from %u to %u\n",
+ __func__, *stackptr, origptr);
+ *stackptr = origptr;
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+#ifdef DEBUG_ALLOW_ALL
+ return NF_ACCEPT;
+#else
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else return verdict;
+#endif
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ there are loops. Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ to 0 as we leave), and comefrom to save source hook bitmask */
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ const struct xt_standard_target *t
+ = (void *)ipt_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
+
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+ pr_err("iptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if ((e->target_offset == sizeof(struct ipt_entry) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->ip)) ||
+ visited) {
+ unsigned int oldpos, size;
+
+ if ((strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("mark_source_chains: bad "
+ "negative verdict (%i)\n",
+ t->verdict);
+ return 0;
+ }
+
+ /* Return: backtrack through the last
+ big jump. */
+ do {
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+ if (e->comefrom
+ & (1 << NF_INET_NUMHOOKS)) {
+ duprintf("Back unset "
+ "on hook %u "
+ "rule %u\n",
+ hook, pos);
+ }
+#endif
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct ipt_entry *)
+ (entry0 + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct ipt_entry *)
+ (entry0 + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
+ if (newpos > newinfo->size -
+ sizeof(struct ipt_entry)) {
+ duprintf("mark_source_chains: "
+ "bad verdict (%i)\n",
+ newpos);
+ return 0;
+ }
+ /* This a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct ipt_entry *)
+ (entry0 + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+ struct xt_mtdtor_param par;
+
+ par.net = net;
+ par.match = m->u.kernel.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_IPV4;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
+}
+
+static int
+check_entry(const struct ipt_entry *e, const char *name)
+{
+ const struct xt_entry_target *t;
+
+ if (!ip_checkentry(&e->ip)) {
+ duprintf("ip check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ if (e->target_offset + sizeof(struct xt_entry_target) >
+ e->next_offset)
+ return -EINVAL;
+
+ t = ipt_get_target_c(e);
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ const struct ipt_ip *ip = par->entryinfo;
+ int ret;
+
+ par->match = m->u.kernel.match;
+ par->matchinfo = m->data;
+
+ ret = xt_check_match(par, m->u.match_size - sizeof(*m),
+ ip->proto, ip->invflags & IPT_INV_PROTO);
+ if (ret < 0) {
+ duprintf("check failed for `%s'.\n", par->match->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ struct xt_match *match;
+ int ret;
+
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+
+ ret = check_match(m, par);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ module_put(m->u.kernel.match->me);
+ return ret;
+}
+
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_target *t = ipt_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_IPV4,
+ };
+ int ret;
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
+ if (ret < 0) {
+ duprintf("check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+ unsigned int size)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ int ret;
+ unsigned int j;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ip;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV4;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ t = ipt_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+ ret = PTR_ERR(target);
+ goto cleanup_matches;
+ }
+ t->u.kernel.target = target;
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto err;
+ return 0;
+ err:
+ module_put(t->u.kernel.target->me);
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static bool check_underflow(const struct ipt_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(&e->ip))
+ return false;
+ t = ipt_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static int
+check_entry_size_and_hooks(struct ipt_entry *e,
+ struct xt_table_info *newinfo,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int valid_hooks)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e)) {
+ pr_err("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
+ return -EINVAL;
+ }
+ newinfo->underflow[h] = underflows[h];
+ }
+ }
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ e->comefrom = 0;
+ return 0;
+}
+
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
+{
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
+ t = ipt_get_target(e);
+
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_IPV4;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ newinfo) */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ipt_replace *repl)
+{
+ struct ipt_entry *iter;
+ unsigned int i;
+ int ret = 0;
+
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ return ret;
+ ++i;
+ if (strcmp(ipt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+
+ if (i != repl->num_entries) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, repl->num_entries);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(repl->valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, repl->hook_entry[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, repl->underflow[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
+ return -ELOOP;
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
+
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i) {
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
+ }
+
+ return ret;
+}
+
+static void
+get_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ipt_entry *iter;
+ unsigned int cpu;
+ unsigned int i;
+
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+ i = 0;
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i; /* macro does multi eval of i */
+ }
+ }
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+ unsigned int countersize;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ (other than comefrom, which userspace doesn't care
+ about). */
+ countersize = sizeof(struct xt_counters) * private->number;
+ counters = vzalloc(countersize);
+
+ if (counters == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ get_counters(private, counters);
+
+ return counters;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct ipt_entry *e;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ int ret = 0;
+ const void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ unsigned int i;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
+
+ e = (struct ipt_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct ipt_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ for (i = sizeof(struct ipt_entry);
+ i < e->target_offset;
+ i += m->u.match_size) {
+ m = (void *)e + i;
+
+ if (copy_to_user(userptr + off + i
+ + offsetof(struct xt_entry_match,
+ u.user.name),
+ m->u.kernel.match->name,
+ strlen(m->u.kernel.match->name)+1)
+ != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ t = ipt_get_target_c(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct xt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(AF_INET, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(AF_INET, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ipt_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_match *ematch;
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+ entry_offset = (void *)e - base;
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ipt_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ipt_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct ipt_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct ipt_entry *iter;
+ void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct ipt_getinfo)) {
+ duprintf("length %u != %zu\n", *len,
+ sizeof(struct ipt_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(AF_INET);
+#endif
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+ "iptable_%s", name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct ipt_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
+ if (compat) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(AF_INET);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(AF_INET);
+#endif
+ return ret;
+}
+
+static int
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+ const int *len)
+{
+ int ret;
+ struct ipt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct ipt_get_entries) + get.size) {
+ duprintf("get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ duprintf("t->private->number = %u\n", private->number);
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ void *loc_cpu_old_entry;
+ struct ipt_entry *iter;
+
+ ret = 0;
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+ "iptable_%s", name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters, and synchronize with replace */
+ get_counters(oldinfo, counters);
+
+ /* Decrease module usage counts and free resource */
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter, net);
+
+ xt_free_table_info(oldinfo);
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+ }
+ vfree(counters);
+ xt_table_unlock(t);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+ vfree(counters);
+ out:
+ return ret;
+}
+
+static int
+do_replace(struct net *net, const void __user *user, unsigned int len)
+{
+ int ret;
+ struct ipt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
+{
+ unsigned int i, curcpu;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ const char *name;
+ int size;
+ void *ptmp;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+ unsigned int addend;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
+
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len - size);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = xt_find_table_lock(net, AF_INET, name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+ local_bh_disable();
+ private = t->private;
+ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ /* Choose the copy that is on our node */
+ curcpu = smp_processor_id();
+ loc_cpu_entry = private->entries[curcpu];
+ addend = xt_write_recseq_begin();
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
+ unlock_up_free:
+ local_bh_enable();
+ xt_table_unlock(t);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters; /* struct xt_counters * */
+ struct compat_ipt_entry entries[0];
+};
+
+static int
+compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
+ unsigned int *size, struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_ipt_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
+
+ origsize = *size;
+ ce = (struct compat_ipt_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_ipt_entry);
+ *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ target_offset = e->target_offset - (origsize - *size);
+ t = ipt_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+ const char *name,
+ const struct ipt_ip *ip,
+ unsigned int hookmask,
+ int *size)
+{
+ struct xt_match *match;
+
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("compat_check_calc_match: `%s' not found\n",
+ m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+ *size += xt_compat_match_offset(match);
+ return 0;
+}
+
+static void compat_release_entry(struct compat_ipt_entry *e)
+{
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
+ t = compat_ipt_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ const char *name)
+{
+ struct xt_entry_match *ematch;
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ unsigned int j;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_ipt_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct ipt_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+ entry_offset = (void *)e - (void *)base;
+ j = 0;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, name,
+ &e->ip, e->comefrom, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
+
+ t = compat_ipt_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = PTR_ERR(target);
+ goto release_matches;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+ if (ret)
+ goto out;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+ return 0;
+
+out:
+ module_put(t->u.kernel.target->me);
+release_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
+ }
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ struct ipt_entry *de;
+ unsigned int origsize;
+ int ret, h;
+ struct xt_entry_match *ematch;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct ipt_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct ipt_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct ipt_entry);
+ *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_from_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_ipt_get_target(e);
+ target = t->u.kernel.target;
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static int
+compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_match *ematch;
+ struct xt_mtchk_param mtpar;
+ unsigned int j;
+ int ret = 0;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ip;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV4;
+ xt_ematch_foreach(ematch, e) {
+ ret = check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto cleanup_matches;
+ return 0;
+
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static int
+translate_compat_table(struct net *net,
+ const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_ipt_entry *iter0;
+ struct ipt_entry *iter1;
+ unsigned int size;
+ int ret;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
+ xt_compat_flush_offsets(AF_INET);
+ xt_compat_unlock(AF_INET);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = compat_check_entry(iter1, net, name);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(ipt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
+ j -= i;
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1, net);
+ }
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(AF_INET);
+ xt_compat_unlock(AF_INET);
+ goto out;
+}
+
+static int
+compat_do_replace(struct net *net, void __user *user, unsigned int len)
+{
+ int ret;
+ struct compat_ipt_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ipt_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = compat_do_replace(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 1);
+ break;
+
+ default:
+ duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct compat_ipt_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_ipt_entry entrytable[0];
+};
+
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ const void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct ipt_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+
+ vfree(counters);
+ return ret;
+}
+
+static int
+compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_ipt_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+
+ if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(AF_INET);
+ t = xt_find_table_lock(net, AF_INET, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ xt_compat_flush_offsets(AF_INET);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ xt_compat_unlock(AF_INET);
+ return ret;
+}
+
+static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
+
+static int
+compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 1);
+ break;
+ case IPT_SO_GET_ENTRIES:
+ ret = compat_get_entries(sock_net(sk), user, len);
+ break;
+ default:
+ ret = do_ipt_get_ctl(sk, cmd, user, len);
+ }
+ return ret;
+}
+#endif
+
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = do_replace(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 0);
+ break;
+
+ default:
+ duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IPT_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 0);
+ break;
+
+ case IPT_SO_GET_ENTRIES:
+ ret = get_entries(sock_net(sk), user, len);
+ break;
+
+ case IPT_SO_GET_REVISION_MATCH:
+ case IPT_SO_GET_REVISION_TARGET: {
+ struct xt_get_revision rev;
+ int target;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ rev.name[sizeof(rev.name)-1] = 0;
+
+ if (cmd == IPT_SO_GET_REVISION_TARGET)
+ target = 1;
+ else
+ target = 0;
+
+ try_then_request_module(xt_find_revision(AF_INET, rev.name,
+ rev.revision,
+ target, &ret),
+ "ipt_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct xt_table *ipt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct ipt_replace *repl)
+{
+ int ret;
+ struct xt_table_info *newinfo;
+ struct xt_table_info bootstrap = {0};
+ void *loc_cpu_entry;
+ struct xt_table *new_table;
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* choose the copy on our node/cpu, but dont care about preemption */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+ if (ret != 0)
+ goto out_free;
+
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ ret = PTR_ERR(new_table);
+ goto out_free;
+ }
+
+ return new_table;
+
+out_free:
+ xt_free_table_info(newinfo);
+out:
+ return ERR_PTR(ret);
+}
+
+void ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline bool
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+ u_int8_t type, u_int8_t code,
+ bool invert)
+{
+ return ((test_type == 0xFF) ||
+ (type == test_type && code >= min_code && code <= max_code))
+ ^ invert;
+}
+
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmphdr *ic;
+ struct icmphdr _icmph;
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (ic == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil ICMP tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+}
+
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV4,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = ipt_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV4,
+ },
+};
+
+static struct nf_sockopt_ops ipt_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IPT_BASE_CTL,
+ .set_optmax = IPT_SO_SET_MAX+1,
+ .set = do_ipt_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_ipt_set_ctl,
+#endif
+ .get_optmin = IPT_BASE_CTL,
+ .get_optmax = IPT_SO_GET_MAX+1,
+ .get = do_ipt_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_ipt_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static struct xt_match ipt_builtin_mt[] __read_mostly = {
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ },
+};
+
+static int __net_init ip_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_IPV4);
+}
+
+static void __net_exit ip_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_IPV4);
+}
+
+static struct pernet_operations ip_tables_net_ops = {
+ .init = ip_tables_net_init,
+ .exit = ip_tables_net_exit,
+};
+
+static int __init ip_tables_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_tables_net_ops);
+ if (ret < 0)
+ goto err1;
+
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+ if (ret < 0)
+ goto err2;
+ ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+ if (ret < 0)
+ goto err4;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ipt_sockopts);
+ if (ret < 0)
+ goto err5;
+
+ pr_info("(C) 2000-2006 Netfilter Core Team\n");
+ return 0;
+
+err5:
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+err4:
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+err2:
+ unregister_pernet_subsys(&ip_tables_net_ops);
+err1:
+ return ret;
+}
+
+static void __exit ip_tables_fini(void)
+{
+ nf_unregister_sockopt(&ipt_sockopts);
+
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+ unregister_pernet_subsys(&ip_tables_net_ops);
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_do_table);
+module_init(ip_tables_init);
+module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 000000000..771ab3d01
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,794 @@
+/* Cluster IP hashmark target
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+
+#define CLUSTERIP_VERSION "0.8"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
+
+struct clusterip_config {
+ struct list_head list; /* list of all configs */
+ atomic_t refcount; /* reference count */
+ atomic_t entries; /* number of entries/rules
+ * referencing us */
+
+ __be32 clusterip; /* the IP address */
+ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
+ struct net_device *dev; /* device */
+ u_int16_t num_total_nodes; /* total number of nodes */
+ unsigned long local_nodes; /* node number array */
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde; /* proc dir entry */
+#endif
+ enum clusterip_hashmode hash_mode; /* which hashing mode */
+ u_int32_t hash_initval; /* hash initialization */
+ struct rcu_head rcu;
+};
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
+
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+ struct list_head configs;
+ /* lock protects the configs list */
+ spinlock_t lock;
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *procdir;
+#endif
+};
+
+static inline void
+clusterip_config_get(struct clusterip_config *c)
+{
+ atomic_inc(&c->refcount);
+}
+
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct clusterip_config, rcu));
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c)
+{
+ if (atomic_dec_and_test(&c->refcount))
+ call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
+}
+
+/* decrease the count of entries using/referencing this config. If last
+ * entry(rule) is removed, remove the config from lists, but don't free it
+ * yet, since proc-files could still be holding references */
+static inline void
+clusterip_config_entry_put(struct clusterip_config *c)
+{
+ struct net *net = dev_net(c->dev);
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ local_bh_disable();
+ if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&cn->lock);
+ local_bh_enable();
+
+ dev_mc_del(c->dev, c->clustermac);
+ dev_put(c->dev);
+
+ /* In case anyone still accesses the file, the open/close
+ * functions are also incrementing the refcount on their own,
+ * so it's safe to remove the entry even if it's in use. */
+#ifdef CONFIG_PROC_FS
+ proc_remove(c->pde);
+#endif
+ return;
+ }
+ local_bh_enable();
+}
+
+static struct clusterip_config *
+__clusterip_config_find(struct net *net, __be32 clusterip)
+{
+ struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ list_for_each_entry_rcu(c, &cn->configs, list) {
+ if (c->clusterip == clusterip)
+ return c;
+ }
+
+ return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
+{
+ struct clusterip_config *c;
+
+ rcu_read_lock_bh();
+ c = __clusterip_config_find(net, clusterip);
+ if (c) {
+ if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ c = NULL;
+ else if (entry)
+ atomic_inc(&c->entries);
+ }
+ rcu_read_unlock_bh();
+
+ return c;
+}
+
+static void
+clusterip_config_init_nodelist(struct clusterip_config *c,
+ const struct ipt_clusterip_tgt_info *i)
+{
+ int n;
+
+ for (n = 0; n < i->num_local_nodes; n++)
+ set_bit(i->local_nodes[n] - 1, &c->local_nodes);
+}
+
+static struct clusterip_config *
+clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
+ struct net_device *dev)
+{
+ struct clusterip_config *c;
+ struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
+
+ c = kzalloc(sizeof(*c), GFP_ATOMIC);
+ if (!c)
+ return NULL;
+
+ c->dev = dev;
+ c->clusterip = ip;
+ memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+ c->num_total_nodes = i->num_total_nodes;
+ clusterip_config_init_nodelist(c, i);
+ c->hash_mode = i->hash_mode;
+ c->hash_initval = i->hash_initval;
+ atomic_set(&c->refcount, 1);
+ atomic_set(&c->entries, 1);
+
+#ifdef CONFIG_PROC_FS
+ {
+ char buffer[16];
+
+ /* create proc dir entry */
+ sprintf(buffer, "%pI4", &ip);
+ c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
+ cn->procdir,
+ &clusterip_proc_fops, c);
+ if (!c->pde) {
+ kfree(c);
+ return NULL;
+ }
+ }
+#endif
+
+ spin_lock_bh(&cn->lock);
+ list_add_rcu(&c->list, &cn->configs);
+ spin_unlock_bh(&cn->lock);
+
+ return c;
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
+ return 1;
+
+ /* check if we already have this number in our bitfield */
+ if (test_and_set_bit(nodenum - 1, &c->local_nodes))
+ return 1;
+
+ return 0;
+}
+
+static bool
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+ if (nodenum == 0 ||
+ nodenum > c->num_total_nodes)
+ return true;
+
+ if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
+ return false;
+
+ return true;
+}
+#endif
+
+static inline u_int32_t
+clusterip_hashfn(const struct sk_buff *skb,
+ const struct clusterip_config *config)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned long hashval;
+ u_int16_t sport = 0, dport = 0;
+ int poff;
+
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0) {
+ const u_int16_t *ports;
+ u16 _ports[2];
+
+ ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+ if (ports) {
+ sport = ports[0];
+ dport = ports[1];
+ }
+ } else {
+ net_info_ratelimited("unknown protocol %u\n", iph->protocol);
+ }
+
+ switch (config->hash_mode) {
+ case CLUSTERIP_HASHMODE_SIP:
+ hashval = jhash_1word(ntohl(iph->saddr),
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT:
+ hashval = jhash_2words(ntohl(iph->saddr), sport,
+ config->hash_initval);
+ break;
+ case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+ hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+ config->hash_initval);
+ break;
+ default:
+ /* to make gcc happy */
+ hashval = 0;
+ /* This cannot happen, unless the check function wasn't called
+ * at rule load time */
+ pr_info("unknown mode %u\n", config->hash_mode);
+ BUG();
+ break;
+ }
+
+ /* node numbers are 1..n, not 0..n */
+ return reciprocal_scale(hashval, config->num_total_nodes) + 1;
+}
+
+static inline int
+clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
+{
+ return test_bit(hash - 1, &config->local_nodes);
+}
+
+/***********************************************************************
+ * IPTABLES TARGET
+ ***********************************************************************/
+
+static unsigned int
+clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t hash;
+
+ /* don't need to clusterip_config_get() here, since refcount
+ * is only decremented by destroy() - and ip_tables guarantees
+ * that the ->target() function isn't called after ->destroy() */
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_DROP;
+
+ /* special case: ICMP error handling. conntrack distinguishes between
+ * error messages (RELATED) and information requests (see below) */
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
+ (ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY))
+ return XT_CONTINUE;
+
+ /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+ * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+ * on, which all have an ID field [relevant for hashing]. */
+
+ hash = clusterip_hashfn(skb, cipinfo->config);
+
+ switch (ctinfo) {
+ case IP_CT_NEW:
+ ct->mark = hash;
+ break;
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ /* FIXME: we don't handle expectations at the moment.
+ * They can arrive on a different node than
+ * the master connection (e.g. FTP passive mode) */
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+ default: /* Prevent gcc warnings */
+ break;
+ }
+
+#ifdef DEBUG
+ nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+ pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
+ if (!clusterip_responsible(cipinfo->config, hash)) {
+ pr_debug("not responsible\n");
+ return NF_DROP;
+ }
+ pr_debug("responsible\n");
+
+ /* despite being received via linklayer multicast, this is
+ * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+ skb->pkt_type = PACKET_HOST;
+
+ return XT_CONTINUE;
+}
+
+static int clusterip_tg_check(const struct xt_tgchk_param *par)
+{
+ struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+ struct clusterip_config *config;
+ int ret;
+
+ if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+ cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+ pr_info("unknown mode %u\n", cipinfo->hash_mode);
+ return -EINVAL;
+
+ }
+ if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
+ e->ip.dst.s_addr == 0) {
+ pr_info("Please specify destination IP\n");
+ return -EINVAL;
+ }
+
+ /* FIXME: further sanity checks */
+
+ config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
+ if (!config) {
+ if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+ pr_info("no config found for %pI4, need 'new'\n",
+ &e->ip.dst.s_addr);
+ return -EINVAL;
+ } else {
+ struct net_device *dev;
+
+ if (e->ip.iniface[0] == '\0') {
+ pr_info("Please specify an interface name\n");
+ return -EINVAL;
+ }
+
+ dev = dev_get_by_name(par->net, e->ip.iniface);
+ if (!dev) {
+ pr_info("no such interface %s\n",
+ e->ip.iniface);
+ return -ENOENT;
+ }
+
+ config = clusterip_config_init(cipinfo,
+ e->ip.dst.s_addr, dev);
+ if (!config) {
+ dev_put(dev);
+ return -ENOMEM;
+ }
+ dev_mc_add(config->dev, config->clustermac);
+ }
+ }
+ cipinfo->config = config;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+
+ if (!par->net->xt.clusterip_deprecated_warning) {
+ pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
+ "use xt_cluster instead\n");
+ par->net->xt.clusterip_deprecated_warning = true;
+ }
+
+ return ret;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+
+ /* if no more entries are referencing the config, remove it
+ * from the list and destroy the proc entry */
+ clusterip_config_entry_put(cipinfo->config);
+
+ clusterip_config_put(cipinfo->config);
+
+ nf_ct_l3proto_module_put(par->family);
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_clusterip_tgt_info
+{
+ u_int32_t flags;
+ u_int8_t clustermac[6];
+ u_int16_t num_total_nodes;
+ u_int16_t num_local_nodes;
+ u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
+ u_int32_t hash_mode;
+ u_int32_t hash_initval;
+ compat_uptr_t config;
+};
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target clusterip_tg_reg __read_mostly = {
+ .name = "CLUSTERIP",
+ .family = NFPROTO_IPV4,
+ .target = clusterip_tg,
+ .checkentry = clusterip_tg_check,
+ .destroy = clusterip_tg_destroy,
+ .targetsize = sizeof(struct ipt_clusterip_tgt_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
+#endif /* CONFIG_COMPAT */
+ .me = THIS_MODULE
+};
+
+
+/***********************************************************************
+ * ARP MANGLING CODE
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+ u_int8_t src_hw[ETH_ALEN];
+ __be32 src_ip;
+ u_int8_t dst_hw[ETH_ALEN];
+ __be32 dst_ip;
+} __packed;
+
+#ifdef DEBUG
+static void arp_print(struct arp_payload *payload)
+{
+#define HBUFFERLEN 30
+ char hbuffer[HBUFFERLEN];
+ int j,k;
+
+ for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+ hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
+ hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
+ hbuffer[k++]=':';
+ }
+ hbuffer[--k]='\0';
+
+ pr_debug("src %pI4@%s, dst %pI4\n",
+ &payload->src_ip, hbuffer, &payload->dst_ip);
+}
+#endif
+
+static unsigned int
+arp_mangle(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct arphdr *arp = arp_hdr(skb);
+ struct arp_payload *payload;
+ struct clusterip_config *c;
+ struct net *net = dev_net(state->in ? state->in : state->out);
+
+ /* we don't care about non-ethernet and non-ipv4 ARP */
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+ return NF_ACCEPT;
+
+ /* we only want to mangle arp requests and replies */
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ return NF_ACCEPT;
+
+ payload = (void *)(arp+1);
+
+ /* if there is no clusterip configuration for the arp reply's
+ * source ip, we don't want to mangle it */
+ c = clusterip_config_find_get(net, payload->src_ip, 0);
+ if (!c)
+ return NF_ACCEPT;
+
+ /* normally the linux kernel always replies to arp queries of
+ * addresses on different interfacs. However, in the CLUSTERIP case
+ * this wouldn't work, since we didn't subscribe the mcast group on
+ * other interfaces */
+ if (c->dev != state->out) {
+ pr_debug("not mangling arp reply on different "
+ "interface: cip'%s'-skb'%s'\n",
+ c->dev->name, state->out->name);
+ clusterip_config_put(c);
+ return NF_ACCEPT;
+ }
+
+ /* mangle reply hardware address */
+ memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef DEBUG
+ pr_debug("mangled arp reply: ");
+ arp_print(payload);
+#endif
+
+ clusterip_config_put(c);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops __read_mostly = {
+ .hook = arp_mangle,
+ .pf = NFPROTO_ARP,
+ .hooknum = NF_ARP_OUT,
+ .priority = -1
+};
+
+/***********************************************************************
+ * PROC DIR HANDLING
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+struct clusterip_seq_position {
+ unsigned int pos; /* position */
+ unsigned int weight; /* number of bits set == size */
+ unsigned int bit; /* current bit */
+ unsigned long val; /* current value */
+};
+
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct clusterip_config *c = s->private;
+ unsigned int weight;
+ u_int32_t local_nodes;
+ struct clusterip_seq_position *idx;
+
+ /* FIXME: possible race */
+ local_nodes = c->local_nodes;
+ weight = hweight32(local_nodes);
+ if (*pos >= weight)
+ return NULL;
+
+ idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
+ if (!idx)
+ return ERR_PTR(-ENOMEM);
+
+ idx->pos = *pos;
+ idx->weight = weight;
+ idx->bit = ffs(local_nodes);
+ idx->val = local_nodes;
+ clear_bit(idx->bit - 1, &idx->val);
+
+ return idx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct clusterip_seq_position *idx = v;
+
+ *pos = ++idx->pos;
+ if (*pos >= idx->weight) {
+ kfree(v);
+ return NULL;
+ }
+ idx->bit = ffs(idx->val);
+ clear_bit(idx->bit - 1, &idx->val);
+ return idx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+ if (!IS_ERR(v))
+ kfree(v);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+ struct clusterip_seq_position *idx = v;
+
+ if (idx->pos != 0)
+ seq_putc(s, ',');
+
+ seq_printf(s, "%u", idx->bit);
+
+ if (idx->pos == idx->weight - 1)
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations clusterip_seq_ops = {
+ .start = clusterip_seq_start,
+ .next = clusterip_seq_next,
+ .stop = clusterip_seq_stop,
+ .show = clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &clusterip_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ struct clusterip_config *c = PDE_DATA(inode);
+
+ sf->private = c;
+
+ clusterip_config_get(c);
+ }
+
+ return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+ struct clusterip_config *c = PDE_DATA(inode);
+ int ret;
+
+ ret = seq_release(inode, file);
+
+ if (!ret)
+ clusterip_config_put(c);
+
+ return ret;
+}
+
+static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *ofs)
+{
+ struct clusterip_config *c = PDE_DATA(file_inode(file));
+#define PROC_WRITELEN 10
+ char buffer[PROC_WRITELEN+1];
+ unsigned long nodenum;
+ int rc;
+
+ if (size > PROC_WRITELEN)
+ return -EIO;
+ if (copy_from_user(buffer, input, size))
+ return -EFAULT;
+ buffer[size] = 0;
+
+ if (*buffer == '+') {
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
+ if (clusterip_add_node(c, nodenum))
+ return -ENOMEM;
+ } else if (*buffer == '-') {
+ rc = kstrtoul(buffer+1, 10, &nodenum);
+ if (rc)
+ return rc;
+ if (clusterip_del_node(c, nodenum))
+ return -ENOENT;
+ } else
+ return -EIO;
+
+ return size;
+}
+
+static const struct file_operations clusterip_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = clusterip_proc_open,
+ .read = seq_read,
+ .write = clusterip_proc_write,
+ .llseek = seq_lseek,
+ .release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int clusterip_net_init(struct net *net)
+{
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+ INIT_LIST_HEAD(&cn->configs);
+
+ spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+ cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+ if (!cn->procdir) {
+ pr_err("Unable to proc dir entry\n");
+ return -ENOMEM;
+ }
+#endif /* CONFIG_PROC_FS */
+
+ return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+ proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+ .init = clusterip_net_init,
+ .exit = clusterip_net_exit,
+ .id = &clusterip_net_id,
+ .size = sizeof(struct clusterip_net),
+};
+
+static int __init clusterip_tg_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&clusterip_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_target(&clusterip_tg_reg);
+ if (ret < 0)
+ goto cleanup_subsys;
+
+ ret = nf_register_hook(&cip_arp_ops);
+ if (ret < 0)
+ goto cleanup_target;
+
+ pr_info("ClusterIP Version %s loaded successfully\n",
+ CLUSTERIP_VERSION);
+
+ return 0;
+
+cleanup_target:
+ xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+ unregister_pernet_subsys(&clusterip_net_ops);
+ return ret;
+}
+
+static void __exit clusterip_tg_exit(void)
+{
+ pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+
+ nf_unregister_hook(&cip_arp_ops);
+ xt_unregister_target(&clusterip_tg_reg);
+ unregister_pernet_subsys(&clusterip_net_ops);
+
+ /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+ rcu_barrier_bh();
+}
+
+module_init(clusterip_tg_init);
+module_exit(clusterip_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 000000000..4bf3dc49a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,138 @@
+/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ECN.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
+
+/* set ECT codepoint from IP header.
+ * return false if there was an error. */
+static inline bool
+set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
+ __u8 oldtos;
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return false;
+ iph = ip_hdr(skb);
+ oldtos = iph->tos;
+ iph->tos &= ~IPT_ECN_IP_MASK;
+ iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
+ csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+ }
+ return true;
+}
+
+/* Return false if there was an error. */
+static inline bool
+set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
+{
+ struct tcphdr _tcph, *tcph;
+ __be16 oldval;
+
+ /* Not enough header? */
+ tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return false;
+
+ if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+ tcph->ece == einfo->proto.tcp.ece) &&
+ (!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr))
+ return true;
+
+ if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ return false;
+ tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
+
+ oldval = ((__be16 *)tcph)[6];
+ if (einfo->operation & IPT_ECN_OP_SET_ECE)
+ tcph->ece = einfo->proto.tcp.ece;
+ if (einfo->operation & IPT_ECN_OP_SET_CWR)
+ tcph->cwr = einfo->proto.tcp.cwr;
+
+ inet_proto_csum_replace2(&tcph->check, skb,
+ oldval, ((__be16 *)tcph)[6], 0);
+ return true;
+}
+
+static unsigned int
+ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_ECN_info *einfo = par->targinfo;
+
+ if (einfo->operation & IPT_ECN_OP_SET_IP)
+ if (!set_ect_ip(skb, einfo))
+ return NF_DROP;
+
+ if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+ ip_hdr(skb)->protocol == IPPROTO_TCP)
+ if (!set_ect_tcp(skb, einfo))
+ return NF_DROP;
+
+ return XT_CONTINUE;
+}
+
+static int ecn_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_ECN_info *einfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (einfo->operation & IPT_ECN_OP_MASK) {
+ pr_info("unsupported ECN operation %x\n", einfo->operation);
+ return -EINVAL;
+ }
+ if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
+ pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
+ return -EINVAL;
+ }
+ if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+ (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+ pr_info("cannot use TCP operations on a non-tcp rule\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target ecn_tg_reg __read_mostly = {
+ .name = "ECN",
+ .family = NFPROTO_IPV4,
+ .target = ecn_tg,
+ .targetsize = sizeof(struct ipt_ECN_info),
+ .table = "mangle",
+ .checkentry = ecn_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init ecn_tg_init(void)
+{
+ return xt_register_target(&ecn_tg_reg);
+}
+
+static void __exit ecn_tg_exit(void)
+{
+ xt_unregister_target(&ecn_tg_reg);
+}
+
+module_init(ecn_tg_init);
+module_exit(ecn_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000..da7f02a0b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,91 @@
+/* Masquerade. Simple mapping which alters range to a local IP address
+ (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
+
+/* FIXME: Multiple targets. --RR */
+static int masquerade_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static unsigned int
+masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_nat_range range;
+ const struct nf_nat_ipv4_multi_range_compat *mr;
+
+ mr = par->targinfo;
+ range.flags = mr->range[0].flags;
+ range.min_proto = mr->range[0].min;
+ range.max_proto = mr->range[0].max;
+
+ return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+}
+
+static struct xt_target masquerade_tg_reg __read_mostly = {
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV4,
+ .target = masquerade_tg,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init masquerade_tg_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&masquerade_tg_reg);
+
+ if (ret == 0)
+ nf_nat_masquerade_ipv4_register_notifier();
+
+ return ret;
+}
+
+static void __exit masquerade_tg_exit(void)
+{
+ xt_unregister_target(&masquerade_tg_reg);
+ nf_nat_masquerade_ipv4_unregister_notifier();
+}
+
+module_init(masquerade_tg_init);
+module_exit(masquerade_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000..87907d4bd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,113 @@
+/*
+ * This is a module which is used for rejecting packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include <linux/netfilter_bridge.h>
+#endif
+
+#include <net/netfilter/ipv4/nf_reject.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
+
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipt_reject_info *reject = par->targinfo;
+ int hook = par->hooknum;
+
+ switch (reject->with) {
+ case IPT_ICMP_NET_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
+ break;
+ case IPT_ICMP_HOST_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
+ break;
+ case IPT_ICMP_PROT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
+ break;
+ case IPT_ICMP_PORT_UNREACHABLE:
+ nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
+ break;
+ case IPT_ICMP_NET_PROHIBITED:
+ nf_send_unreach(skb, ICMP_NET_ANO, hook);
+ break;
+ case IPT_ICMP_HOST_PROHIBITED:
+ nf_send_unreach(skb, ICMP_HOST_ANO, hook);
+ break;
+ case IPT_ICMP_ADMIN_PROHIBITED:
+ nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
+ break;
+ case IPT_TCP_RESET:
+ nf_send_reset(skb, hook);
+ case IPT_ICMP_ECHOREPLY:
+ /* Doesn't happen. */
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static int reject_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_reject_info *rejinfo = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+ pr_info("ECHOREPLY no longer supported.\n");
+ return -EINVAL;
+ } else if (rejinfo->with == IPT_TCP_RESET) {
+ /* Must specify that it's a TCP packet */
+ if (e->ip.proto != IPPROTO_TCP ||
+ (e->ip.invflags & XT_INV_PROTO)) {
+ pr_info("TCP_RESET invalid for non-tcp\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static struct xt_target reject_tg_reg __read_mostly = {
+ .name = "REJECT",
+ .family = NFPROTO_IPV4,
+ .target = reject_tg,
+ .targetsize = sizeof(struct ipt_reject_info),
+ .table = "filter",
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init reject_tg_init(void)
+{
+ return xt_register_target(&reject_tg_reg);
+}
+
+static void __exit reject_tg_exit(void)
+{
+ xt_unregister_target(&reject_tg_reg);
+}
+
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 000000000..e9e677930
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct iphdr *
+synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = sysctl_ip_default_ttl;
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nskb->nfct = nfct;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ return true;
+}
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack(skb, th, &opts);
+ return NF_DROP;
+
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ return NF_DROP;
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (synproxy == NULL)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ ntohl(th->seq) + 1))
+ this_cpu_inc(snet->stats->cookie_retrans);
+
+ return NF_DROP;
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->tsoff = opts.tsval - synproxy->its;
+
+ opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+ XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(snet, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (e->ip.proto != IPPROTO_TCP ||
+ e->ip.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV4,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg4,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg4_check,
+ .destroy = synproxy_tg4_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+static int __init synproxy_tg4_init(void)
+{
+ int err;
+
+ err = nf_register_hooks(ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err < 0)
+ goto err1;
+
+ err = xt_register_target(&synproxy_tg4_reg);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+err1:
+ return err;
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+ xt_unregister_target(&synproxy_tg4_reg);
+ nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000..14a2aa8b8
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,91 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r=(spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+ const struct ipt_ah *ahinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil AH tinygram.\n");
+ par->hotdrop = true;
+ return 0;
+ }
+
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+
+static int ah_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ipt_ah *ahinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ah_mt_reg __read_mostly = {
+ .name = "ah",
+ .family = NFPROTO_IPV4,
+ .match = ah_mt,
+ .matchsize = sizeof(struct ipt_ah),
+ .proto = IPPROTO_AH,
+ .checkentry = ah_mt_check,
+ .me = THIS_MODULE,
+};
+
+static int __init ah_mt_init(void)
+{
+ return xt_register_match(&ah_mt_reg);
+}
+
+static void __exit ah_mt_exit(void)
+{
+ xt_unregister_match(&ah_mt_reg);
+}
+
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
new file mode 100644
index 000000000..4bfaedf9b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ip_fib.h>
+#include <net/route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
+
+/* don't try to find route from mcast/bcast/zeronet */
+static __be32 rpfilter_get_saddr(__be32 addr)
+{
+ if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
+ ipv4_is_zeronet(addr))
+ return 0;
+ return addr;
+}
+
+static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
+ const struct net_device *dev, u8 flags)
+{
+ struct fib_result res;
+ bool dev_match;
+ struct net *net = dev_net(dev);
+ int ret __maybe_unused;
+
+ if (fib_lookup(net, fl4, &res))
+ return false;
+
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
+ return false;
+ }
+ dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match || flags & XT_RPFILTER_LOOSE)
+ return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
+ return dev_match;
+}
+
+static bool rpfilter_is_local(const struct sk_buff *skb)
+{
+ const struct rtable *rt = skb_rtable(skb);
+ return rt && (rt->rt_flags & RTCF_LOCAL);
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info;
+ const struct iphdr *iph;
+ struct flowi4 flow;
+ bool invert;
+
+ info = par->matchinfo;
+ invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_local(skb))
+ return true ^ invert;
+
+ iph = ip_hdr(skb);
+ if (ipv4_is_multicast(iph->daddr)) {
+ if (ipv4_is_zeronet(iph->saddr))
+ return ipv4_is_local_multicast(iph->daddr) ^ invert;
+ }
+ flow.flowi4_iif = LOOPBACK_IFINDEX;
+ flow.daddr = iph->saddr;
+ flow.saddr = rpfilter_get_saddr(iph->daddr);
+ flow.flowi4_oif = 0;
+ flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+ if (info->flags & options) {
+ pr_info("unknown options encountered");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info("match only valid in the \'raw\' "
+ "or \'mangle\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV4,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000..a0f3beca5
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,109 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_FILTER,
+};
+
+static unsigned int
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_filter);
+}
+
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+static bool forward = true;
+module_param(forward, bool, 0000);
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ipt_standard *)repl->entries)[1].target.verdict =
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+
+ net->ipv4.iptable_filter =
+ ipt_register_table(net, &packet_filter, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+}
+
+static void __net_exit iptable_filter_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_filter);
+}
+
+static struct pernet_operations iptable_filter_net_ops = {
+ .init = iptable_filter_net_init,
+ .exit = iptable_filter_net_exit,
+};
+
+static int __init iptable_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_filter_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops)) {
+ ret = PTR_ERR(filter_ops);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_filter_fini(void)
+{
+ xt_hook_unlink(&packet_filter, filter_ops);
+ unregister_pernet_subsys(&iptable_filter_net_ops);
+}
+
+module_init(iptable_filter_init);
+module_exit(iptable_filter_fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000..62cbb8c5f
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,147 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
+
+static const struct xt_table packet_mangler = {
+ .name = "mangle",
+ .valid_hooks = MANGLE_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_MANGLE,
+};
+
+static unsigned int
+ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+{
+ struct net_device *out = state->out;
+ unsigned int ret;
+ const struct iphdr *iph;
+ u_int8_t tos;
+ __be32 saddr, daddr;
+ u_int32_t mark;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ /* Save things which could affect route */
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, state,
+ dev_net(out)->ipv4.iptable_mangle);
+ /* Reroute for ANY change. */
+ if (ret != NF_DROP && ret != NF_STOLEN) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+
+ return ret;
+}
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_mangle_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (ops->hooknum == NF_INET_LOCAL_OUT)
+ return ipt_mangle_out(skb, state);
+ if (ops->hooknum == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->out)->ipv4.iptable_mangle);
+ /* PREROUTING/INPUT/FORWARD: */
+ return ipt_do_table(skb, ops->hooknum, state,
+ dev_net(state->in)->ipv4.iptable_mangle);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
+
+static int __net_init iptable_mangle_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_mangle =
+ ipt_register_table(net, &packet_mangler, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+}
+
+static void __net_exit iptable_mangle_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_mangle);
+}
+
+static struct pernet_operations iptable_mangle_net_ops = {
+ .init = iptable_mangle_net_init,
+ .exit = iptable_mangle_net_exit,
+};
+
+static int __init iptable_mangle_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_mangle_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_mangle_fini(void)
+{
+ xt_hook_unlink(&packet_mangler, mangle_ops);
+ unregister_pernet_subsys(&iptable_mangle_net_ops);
+}
+
+module_init(iptable_mangle_init);
+module_exit(iptable_mangle_fini);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
new file mode 100644
index 000000000..0d4d9cdf9
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -0,0 +1,154 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv4_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+};
+
+static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.nat_table);
+}
+
+static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_fn(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_in(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_out(ops, skb, state, iptable_nat_do_chain);
+}
+
+static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_local_fn(ops, skb, state, iptable_nat_do_chain);
+}
+
+static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = iptable_nat_ipv4_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = iptable_nat_ipv4_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = iptable_nat_ipv4_local_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = iptable_nat_ipv4_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+static int __net_init iptable_nat_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations iptable_nat_net_ops = {
+ .init = iptable_nat_net_init,
+ .exit = iptable_nat_net_exit,
+};
+
+static int __init iptable_nat_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&iptable_nat_net_ops);
+ if (err < 0)
+ goto err1;
+
+ err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+err1:
+ return err;
+}
+
+static void __exit iptable_nat_exit(void)
+{
+ nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+}
+
+module_init(iptable_nat_init);
+module_exit(iptable_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 000000000..0356e6da4
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,89 @@
+/*
+ * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_RAW,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state, net->ipv4.iptable_raw);
+}
+
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int __net_init iptable_raw_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_raw);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_raw =
+ ipt_register_table(net, &packet_raw, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+}
+
+static void __net_exit iptable_raw_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_raw);
+}
+
+static struct pernet_operations iptable_raw_net_ops = {
+ .init = iptable_raw_net_init,
+ .exit = iptable_raw_net_exit,
+};
+
+static int __init iptable_raw_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_raw_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops)) {
+ ret = PTR_ERR(rawtable_ops);
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+ }
+
+ return ret;
+}
+
+static void __exit iptable_raw_fini(void)
+{
+ xt_hook_unlink(&packet_raw, rawtable_ops);
+ unregister_pernet_subsys(&iptable_raw_net_ops);
+}
+
+module_init(iptable_raw_init);
+module_exit(iptable_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
new file mode 100644
index 000000000..4bce3980c
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -0,0 +1,109 @@
+/*
+ * "security" table
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("iptables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT)
+
+static const struct xt_table security_table = {
+ .name = "security",
+ .valid_hooks = SECURITY_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_SECURITY,
+};
+
+static unsigned int
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net;
+
+ if (ops->hooknum == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* Somebody is playing with raw sockets. */
+ return NF_ACCEPT;
+
+ net = dev_net(state->in ? state->in : state->out);
+ return ipt_do_table(skb, ops->hooknum, state,
+ net->ipv4.iptable_security);
+}
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int __net_init iptable_security_net_init(struct net *net)
+{
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_security =
+ ipt_register_table(net, &security_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+}
+
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+ ipt_unregister_table(net, net->ipv4.iptable_security);
+}
+
+static struct pernet_operations iptable_security_net_ops = {
+ .init = iptable_security_net_init,
+ .exit = iptable_security_net_exit,
+};
+
+static int __init iptable_security_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&iptable_security_net_ops);
+ if (ret < 0)
+ return ret;
+
+ sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops)) {
+ ret = PTR_ERR(sectbl_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+cleanup_table:
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ return ret;
+}
+
+static void __exit iptable_security_fini(void)
+{
+ xt_hook_unlink(&security_table, sectbl_ops);
+ unregister_pernet_subsys(&iptable_security_net_ops);
+}
+
+module_init(iptable_security_init);
+module_exit(iptable_security_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
new file mode 100644
index 000000000..30ad9554b
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -0,0 +1,542 @@
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/nf_log.h>
+
+static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const __be32 *ap;
+ __be32 _addrs[2];
+ ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
+ sizeof(u_int32_t) * 2, _addrs);
+ if (ap == NULL)
+ return false;
+
+ tuple->src.u3.ip = ap[0];
+ tuple->dst.u3.ip = ap[1];
+
+ return true;
+}
+
+static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u3.ip = orig->dst.u3.ip;
+ tuple->dst.u3.ip = orig->src.u3.ip;
+
+ return true;
+}
+
+static void ipv4_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "src=%pI4 dst=%pI4 ",
+ &tuple->src.u3.ip, &tuple->dst.u3.ip);
+}
+
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ unsigned int *dataoff, u_int8_t *protonum)
+{
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (iph == NULL)
+ return -NF_ACCEPT;
+
+ /* Conntrack defragments packets, we might still see fragments
+ * inside ICMP packets though. */
+ if (iph->frag_off & htons(IP_OFFSET))
+ return -NF_ACCEPT;
+
+ *dataoff = nhoff + (iph->ihl << 2);
+ *protonum = iph->protocol;
+
+ /* Check bogus IP headers */
+ if (*dataoff > skb->len) {
+ pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
+ "nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(dev_net(state->in), PF_INET, ops->hooknum, skb);
+}
+
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+ return nf_conntrack_in(dev_net(state->out), PF_INET, ops->hooknum, skb);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
+ {
+ .hook = ipv4_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = ipv4_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table ip_ct_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_count",
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_buckets",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_checksum",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_log_invalid",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &log_invalid_proto_min,
+ .extra2 = &log_invalid_proto_max,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+
+ memset(&tuple, 0, sizeof(tuple));
+ tuple.src.u3.ip = inet->inet_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.ip = inet->inet_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.src.l3num = PF_INET;
+ tuple.dst.protonum = sk->sk_protocol;
+
+ /* We only do TCP and SCTP at the moment: is there a better way? */
+ if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) {
+ pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+ pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ if (h) {
+ struct sockaddr_in sin;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+ &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ nf_ct_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+ &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
+ [CTA_IP_V4_SRC] = { .type = NLA_U32 },
+ [CTA_IP_V4_DST] = { .type = NLA_U32 },
+};
+
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+
+ return 0;
+}
+
+static int ipv4_nlattr_tuple_size(void)
+{
+ return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);
+}
+#endif
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST+1,
+ .get = getorigdst,
+ .owner = THIS_MODULE,
+};
+
+static int ipv4_init_net(struct net *net)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ struct nf_ip_net *in = &net->ct.nf_ct_proto;
+ in->ctl_table = kmemdup(ip_ct_sysctl_table,
+ sizeof(ip_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!in->ctl_table)
+ return -ENOMEM;
+
+ in->ctl_table[0].data = &nf_conntrack_max;
+ in->ctl_table[1].data = &net->ct.count;
+ in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[3].data = &net->ct.sysctl_checksum;
+ in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
+#endif
+ return 0;
+}
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
+ .l3proto = PF_INET,
+ .name = "ipv4",
+ .pkt_to_tuple = ipv4_pkt_to_tuple,
+ .invert_tuple = ipv4_invert_tuple,
+ .print_tuple = ipv4_print_tuple,
+ .get_l4proto = ipv4_get_l4proto,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = ipv4_tuple_to_nlattr,
+ .nlattr_tuple_size = ipv4_nlattr_tuple_size,
+ .nlattr_to_tuple = ipv4_nlattr_to_tuple,
+ .nla_policy = ipv4_nla_policy,
+#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ .ctl_table_path = "net/ipv4/netfilter",
+#endif
+ .init_net = ipv4_init_net,
+ .me = THIS_MODULE,
+};
+
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("ip_conntrack");
+MODULE_LICENSE("GPL");
+
+static int ipv4_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_tcp4: pernet registration failed\n");
+ goto out_tcp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udp4: pernet registration failed\n");
+ goto out_udp;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_icmp4: pernet registration failed\n");
+ goto out_icmp;
+ }
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: pernet registration failed\n");
+ goto out_ipv4;
+ }
+ return 0;
+out_ipv4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+out_icmp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+out_udp:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+out_tcp:
+ return ret;
+}
+
+static void ipv4_net_exit(struct net *net)
+{
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+}
+
+static struct pernet_operations ipv4_net_ops = {
+ .init = ipv4_net_init,
+ .exit = ipv4_net_exit,
+};
+
+static int __init nf_conntrack_l3proto_ipv4_init(void)
+{
+ int ret = 0;
+
+ need_conntrack();
+ nf_defrag_ipv4_enable();
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret < 0) {
+ printk(KERN_ERR "Unable to register netfilter socket option\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&ipv4_net_ops);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
+ goto cleanup_sockopt;
+ }
+
+ ret = nf_register_hooks(ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+ goto cleanup_pernet;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
+ goto cleanup_hooks;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
+ goto cleanup_tcp4;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
+ goto cleanup_udp4;
+ }
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
+ goto cleanup_icmpv4;
+ }
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ ret = nf_conntrack_ipv4_compat_init();
+ if (ret < 0)
+ goto cleanup_proto;
+#endif
+ return ret;
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_proto:
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+#endif
+ cleanup_icmpv4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp4:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ cleanup_hooks:
+ nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ cleanup_pernet:
+ unregister_pernet_subsys(&ipv4_net_ops);
+ cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst);
+ return ret;
+}
+
+static void __exit nf_conntrack_l3proto_ipv4_fini(void)
+{
+ synchronize_net();
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ nf_conntrack_ipv4_compat_fini();
+#endif
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+ unregister_pernet_subsys(&ipv4_net_ops);
+ nf_unregister_sockopt(&so_getorigdst);
+}
+
+module_init(nf_conntrack_l3proto_ipv4_init);
+module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
new file mode 100644
index 000000000..f0dfe92a0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -0,0 +1,469 @@
+/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
+#include <linux/export.h>
+
+struct ct_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+ struct hlist_nulls_node *n;
+
+ for (st->bucket = 0;
+ st->bucket < net->ct.htable_size;
+ st->bucket++) {
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ if (!is_a_nulls(n))
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+ struct hlist_nulls_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
+ while (is_a_nulls(head)) {
+ if (likely(get_nulls_value(head) == st->bucket)) {
+ if (++st->bucket >= net->ct.htable_size)
+ return NULL;
+ }
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_nulls_node *head = ct_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_get_next(s, v);
+}
+
+static void ct_seq_stop(struct seq_file *s, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return;
+
+ seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+}
+#else
+static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+}
+#endif
+
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_tuple_hash *hash = v;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+
+ NF_CT_ASSERT(ct);
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
+
+
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ goto release;
+ if (nf_ct_l3num(ct) != AF_INET)
+ goto release;
+
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+ NF_CT_ASSERT(l3proto);
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ NF_CT_ASSERT(l4proto);
+
+ ret = -ENOSPC;
+ seq_printf(s, "%-8s %u %ld ",
+ l4proto->name, nf_ct_protonum(ct),
+ timer_pending(&ct->timeout)
+ ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+
+ if (l4proto->print_conntrack)
+ l4proto->print_conntrack(s, ct);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+ goto release;
+
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
+ seq_printf(s, "[UNREPLIED] ");
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+ goto release;
+
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ seq_printf(s, "[ASSURED] ");
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ seq_printf(s, "mark=%u ", ct->mark);
+#endif
+
+ ct_show_secctx(s, ct);
+
+ seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ ret = 0;
+release:
+ nf_ct_put(ct);
+ return ret;
+}
+
+static const struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_seq_ops,
+ sizeof(struct ct_iter_state));
+}
+
+static const struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/* expects */
+struct ct_expect_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+ struct hlist_node *n;
+
+ for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ if (n)
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ struct hlist_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_next_rcu(head));
+ while (head == NULL) {
+ if (++st->bucket >= nf_ct_expect_hsize)
+ return NULL;
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head = ct_expect_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_expect_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_expect_get_idx(seq, *pos);
+}
+
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_expect_get_next(seq, v);
+}
+
+static void exp_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_expect *exp;
+ const struct hlist_node *n = v;
+
+ exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
+
+ if (exp->tuple.src.l3num != AF_INET)
+ return 0;
+
+ if (exp->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&exp->timeout)
+ ? (long)(exp->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+
+ seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
+
+ print_tuple(s, &exp->tuple,
+ __nf_ct_l3proto_find(exp->tuple.src.l3num),
+ __nf_ct_l4proto_find(exp->tuple.src.l3num,
+ exp->tuple.dst.protonum));
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &exp_seq_ops,
+ sizeof(struct ct_expect_iter_state));
+}
+
+static const struct file_operations ip_exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ const struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ nr_conntracks,
+ st->searched,
+ st->found,
+ st->new,
+ st->invalid,
+ st->ignore,
+ st->delete,
+ st->delete_list,
+ st->insert,
+ st->insert_failed,
+ st->drop,
+ st->early_drop,
+ st->error,
+
+ st->expect_new,
+ st->expect_create,
+ st->expect_delete,
+ st->search_restart
+ );
+ return 0;
+}
+
+static const struct seq_operations ct_cpu_seq_ops = {
+ .start = ct_cpu_seq_start,
+ .next = ct_cpu_seq_next,
+ .stop = ct_cpu_seq_stop,
+ .show = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ct_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ct_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init ip_conntrack_net_init(struct net *net)
+{
+ struct proc_dir_entry *proc, *proc_exp, *proc_stat;
+
+ proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
+ if (!proc)
+ goto err1;
+
+ proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
+ &ip_exp_file_ops);
+ if (!proc_exp)
+ goto err2;
+
+ proc_stat = proc_create("ip_conntrack", S_IRUGO,
+ net->proc_net_stat, &ct_cpu_seq_fops);
+ if (!proc_stat)
+ goto err3;
+ return 0;
+
+err3:
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+err2:
+ remove_proc_entry("ip_conntrack", net->proc_net);
+err1:
+ return -ENOMEM;
+}
+
+static void __net_exit ip_conntrack_net_exit(struct net *net)
+{
+ remove_proc_entry("ip_conntrack", net->proc_net_stat);
+ remove_proc_entry("ip_conntrack_expect", net->proc_net);
+ remove_proc_entry("ip_conntrack", net->proc_net);
+}
+
+static struct pernet_operations ip_conntrack_net_ops = {
+ .init = ip_conntrack_net_init,
+ .exit = ip_conntrack_net_exit,
+};
+
+int __init nf_conntrack_ipv4_compat_init(void)
+{
+ return register_pernet_subsys(&ip_conntrack_net_ops);
+}
+
+void __exit nf_conntrack_ipv4_compat_fini(void)
+{
+ unregister_pernet_subsys(&ip_conntrack_net_ops);
+}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
new file mode 100644
index 000000000..80d5554b9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -0,0 +1,428 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
+
+static inline struct nf_icmp_net *icmp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp;
+}
+
+static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct icmphdr *hp;
+ struct icmphdr _hdr;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->dst.u.icmp.type = hp->type;
+ tuple->src.u.icmp.id = hp->un.echo.id;
+ tuple->dst.u.icmp.code = hp->code;
+
+ return true;
+}
+
+/* Add 1; spaces filled with 0. */
+static const u_int8_t invmap[] = {
+ [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+ [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+ [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+ [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+ [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+ [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+ [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+ [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
+};
+
+static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ if (orig->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[orig->dst.u.icmp.type])
+ return false;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void icmp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+static unsigned int *icmp_get_timeouts(struct net *net)
+{
+ return &icmp_pernet(net)->timeout;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
+{
+ /* Do not immediately delete the connection after the first
+ successful reply to avoid excessive conntrackd traffic
+ and also to handle correctly ICMP echo reply duplicates. */
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ static const u_int8_t valid_new[] = {
+ [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1
+ };
+
+ if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
+ !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
+ /* Can't create a new ICMP `conn' with this. */
+ pr_debug("icmp: can't create new conn with type %u\n",
+ ct->tuplehash[0].tuple.dst.u.icmp.type);
+ nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
+ return false;
+ }
+ return true;
+}
+
+/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
+static int
+icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct nf_conntrack_tuple innertuple, origtuple;
+ const struct nf_conntrack_l4proto *innerproto;
+ const struct nf_conntrack_tuple_hash *h;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+
+ NF_CT_ASSERT(skb->nfct == NULL);
+
+ /* Are they talking about one of our connections? */
+ if (!nf_ct_get_tuplepr(skb,
+ skb_network_offset(skb) + ip_hdrlen(skb)
+ + sizeof(struct icmphdr),
+ PF_INET, &origtuple)) {
+ pr_debug("icmp_error_message: failed to get tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple,
+ &nf_conntrack_l3proto_ipv4, innerproto)) {
+ pr_debug("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = nf_conntrack_find_get(net, zone, &innertuple);
+ if (!h) {
+ pr_debug("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv */
+static int
+icmp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+{
+ const struct icmphdr *icmph;
+ struct icmphdr _ih;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
+ if (icmph == NULL) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
+ NULL, "nf_ct_icmp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* See ip_conntrack_proto_tcp.c */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_ip_checksum(skb, hooknum, dataoff, 0)) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: bad HW ICMP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ if (LOG_INVALID(net, IPPROTO_ICMP))
+ nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: invalid ICMP type ");
+ return -NF_ACCEPT;
+ }
+
+ /* Need to track icmp error message? */
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB &&
+ icmph->type != ICMP_REDIRECT)
+ return NF_ACCEPT;
+
+ return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int icmp_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *t)
+{
+ if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 },
+};
+
+static int icmp_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *tuple)
+{
+ if (!tb[CTA_PROTO_ICMP_TYPE] ||
+ !tb[CTA_PROTO_ICMP_CODE] ||
+ !tb[CTA_PROTO_ICMP_ID])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
+
+ if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type])
+ return -EINVAL;
+
+ return 0;
+}
+
+static int icmp_nlattr_tuple_size(void)
+{
+ return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_icmp_net *in = icmp_pernet(net);
+
+ if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
+ } else {
+ /* Set default ICMP timeout. */
+ *timeout = in->timeout;
+ }
+ return 0;
+}
+
+static int
+icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
+ [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table icmp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_icmp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table icmp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_icmp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(icmp_sysctl_table,
+ sizeof(icmp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &in->timeout;
+#endif
+ return 0;
+}
+
+static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
+ sizeof(icmp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &in->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int icmp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_icmp_net *in = icmp_pernet(net);
+ struct nf_proto_net *pn = &in->pn;
+
+ in->timeout = nf_ct_icmp_timeout;
+
+ ret = icmp_kmemdup_compat_sysctl_table(pn, in);
+ if (ret < 0)
+ return ret;
+
+ ret = icmp_kmemdup_sysctl_table(pn, in);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *icmp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_ICMP,
+ .name = "icmp",
+ .pkt_to_tuple = icmp_pkt_to_tuple,
+ .invert_tuple = icmp_invert_tuple,
+ .print_tuple = icmp_print_tuple,
+ .packet = icmp_packet,
+ .get_timeouts = icmp_get_timeouts,
+ .new = icmp_new,
+ .error = icmp_error,
+ .destroy = NULL,
+ .me = NULL,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = icmp_tuple_to_nlattr,
+ .nlattr_tuple_size = icmp_nlattr_tuple_size,
+ .nlattr_to_tuple = icmp_nlattr_to_tuple,
+ .nla_policy = icmp_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = icmp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = icmp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = icmp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = icmp_init_net,
+ .get_net_proto = icmp_get_net_proto,
+};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 000000000..c88b7d434
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,129 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ int err;
+
+ skb_orphan(skb);
+
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
+
+ if (!err) {
+ ip_send_check(ip_hdr(skb));
+ skb->ignore_df = 1;
+ }
+
+ return err;
+}
+
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (skb->nfct)
+ zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge &&
+ skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP_DEFRAG_CONNTRACK_IN + zone;
+ else
+ return IP_DEFRAG_CONNTRACK_OUT + zone;
+}
+
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (sk && (sk->sk_family == PF_INET) &&
+ inet->nodefrag)
+ return NF_ACCEPT;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ return NF_ACCEPT;
+#endif
+#endif
+ /* Gather fragments. */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ enum ip_defrag_users user =
+ nf_ct_defrag_user(ops->hooknum, skb);
+
+ if (nf_ct_ipv4_gather_frags(skb, user))
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops ipv4_defrag_ops[] = {
+ {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static int __init nf_defrag_init(void)
+{
+ return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
+}
+
+void nf_defrag_ipv4_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000..e7ad950cf
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,161 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Based on code from ebt_log from:
+ *
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+static void dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(1) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (ap == NULL) {
+ nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix);
+ dump_arp_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_arp_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_arp_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_arp_logger);
+}
+
+static struct pernet_operations nf_log_arp_net_ops = {
+ .init = nf_log_arp_net_init,
+ .exit = nf_log_arp_net_exit,
+};
+
+static int __init nf_log_arp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_arp_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ nf_log_unregister(&nf_arp_logger);
+}
+
+module_init(nf_log_arp_init);
+module_exit(nf_log_arp_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter ARP packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(3, 0);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000..076aadda0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,397 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & XT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ const unsigned char *op;
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph;
+ const struct icmphdr *ich;
+ static const size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(m, info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ nf_log_buf_add(m, ":%02x", *p);
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv4_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv4_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_ipv4_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+}
+
+static struct pernet_operations nf_log_ipv4_net_ops = {
+ .init = nf_log_ipv4_net_init,
+ .exit = nf_log_ipv4_net_exit,
+};
+
+static int __init nf_log_ipv4_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+}
+
+module_init(nf_log_ipv4_init);
+module_exit(nf_log_ipv4_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
new file mode 100644
index 000000000..574f7ebba
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -0,0 +1,631 @@
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/****************************************************************************/
+static int set_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ unsigned int addroff, __be32 ip, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct {
+ __be32 ip;
+ __be16 port;
+ } __attribute__ ((__packed__)) buf;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+
+ buf.ip = ip;
+ buf.port = port;
+ addroff += dataoff;
+
+ if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
+ return -1;
+ }
+
+ /* Relocate data pointer */
+ th = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return -1;
+ *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
+ } else {
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, addroff, sizeof(buf),
+ (char *) &buf, sizeof(buf))) {
+ net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
+ return -1;
+ }
+ /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+ * or pull everything in a linear buffer, so we can safely
+ * use the skb pointers now */
+ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+{
+ return set_addr(skb, protoff, data, dataoff,
+ taddr->unicastAddress.iPAddress.network,
+ addr->ip, port);
+}
+
+/****************************************************************************/
+static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
+ if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GW->GK */
+
+ /* Fix for Gnomemeeting */
+ if (i > 0 &&
+ get_h225_addr(ct, *data, &taddr[0],
+ &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
+ i = 0;
+
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.dst.u3,
+ info->sig_port[!dir]);
+ } else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
+ port == info->sig_port[dir]) {
+ /* GK->GW */
+ pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, port,
+ &ct->tuplehash[!dir].tuple.src.u3.ip,
+ info->sig_port[!dir]);
+ return set_h225_addr(skb, protoff, data, 0,
+ &taddr[i],
+ &ct->tuplehash[!dir].
+ tuple.src.u3,
+ info->sig_port[!dir]);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
+ addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+ port == ct->tuplehash[dir].tuple.src.u.udp.port) {
+ pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
+ &addr.ip, ntohs(port),
+ &ct->tuplehash[!dir].tuple.dst.u3.ip,
+ ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
+ return set_h225_addr(skb, protoff, data, 0, &taddr[i],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.
+ dst.u.udp.port);
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ __be16 port, __be16 rtp_port,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int i;
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->expectfn = nf_nat_follow_master;
+ rtp_exp->dir = !dir;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->expectfn = nf_nat_follow_master;
+ rtcp_exp->dir = !dir;
+
+ /* Lookup existing expects */
+ for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+ if (info->rtp_port[i][dir] == rtp_port) {
+ /* Expected */
+
+ /* Use allocated ports first. This will refresh
+ * the expects */
+ rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(ntohs(info->rtp_port[i][dir]) + 1);
+ break;
+ } else if (info->rtp_port[i][dir] == 0) {
+ /* Not expected */
+ break;
+ }
+ }
+
+ /* Run out of expectations */
+ if (i >= H323_RTP_CHANNEL_MAX) {
+ net_notice_ratelimited("nf_nat_h323: out of expectations\n");
+ return 0;
+ }
+
+ /* Try to get a pair of ports. */
+ for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ nated_port != 0; nated_port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == 0) {
+ rtcp_exp->tuple.dst.u.udp.port =
+ htons(nated_port + 1);
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ nated_port = 0;
+ break;
+ }
+ } else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons((port & htons(1)) ? nated_port + 1 :
+ nated_port)) == 0) {
+ /* Save ports */
+ info->rtp_port[i][dir] = rtp_port;
+ info->rtp_port[i][!dir] = htons(nated_port);
+ } else {
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
+ &rtp_exp->tuple.src.u3.ip,
+ ntohs(rtp_exp->tuple.src.u.udp.port),
+ &rtp_exp->tuple.dst.u3.ip,
+ ntohs(rtp_exp->tuple.dst.u.udp.port));
+ pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
+ &rtcp_exp->tuple.src.u3.ip,
+ ntohs(rtcp_exp->tuple.src.u.udp.port),
+ &rtcp_exp->tuple.dst.u3.ip,
+ ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h245_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) < 0) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = nf_nat_follow_master;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+ } else {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces nf_conntrack_q931_expect()
+ * which was set by nf_conntrack_h323.c.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range range;
+
+ if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */
+ nf_nat_follow_master(new, this);
+ return;
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr =
+ new->master->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int idx,
+ __be16 port, struct nf_conntrack_expect *exp)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port = ntohs(port);
+ union nf_inet_addr addr;
+
+ /* Set expectations for NAT */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_q931_expect;
+ exp->dir = !dir;
+
+ /* Check existing expects */
+ if (info->sig_port[dir] == port)
+ nated_port = ntohs(info->sig_port[!dir]);
+
+ /* Try to get same port: if not, try to change it. */
+ for (; nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ /* Save ports */
+ info->sig_port[dir] = port;
+ info->sig_port[!dir] = htons(nated_port);
+
+ /* Fix for Gnomemeeting */
+ if (idx > 0 &&
+ get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+ (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
+ set_h225_addr(skb, protoff, data, 0, &taddr[0],
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
+ }
+ } else {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+/****************************************************************************/
+static void ip_nat_callforwarding_expect(struct nf_conn *new,
+ struct nf_conntrack_expect *this)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ new->tuplehash[!this->dir].tuple.src.u3;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = this->saved_proto;
+ range.min_addr = range.max_addr = this->saved_addr;
+ nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ u_int16_t nated_port;
+
+ /* Set expectations for NAT */
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->expectfn = ip_nat_callforwarding_expect;
+ exp->dir = !dir;
+
+ /* Try to get same port: if not, try to change it. */
+ for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(nated_port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
+ }
+
+ if (nated_port == 0) { /* No port available */
+ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
+ return 0;
+ }
+
+ /* Modify signal */
+ if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ htons(nated_port)) == 0) {
+ nf_ct_unexpect_related(exp);
+ return -1;
+ }
+
+ /* Success */
+ pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
+ &exp->tuple.src.u3.ip,
+ ntohs(exp->tuple.src.u.tcp.port),
+ &exp->tuple.dst.u3.ip,
+ ntohs(exp->tuple.dst.u.tcp.port));
+
+ return 0;
+}
+
+static struct nf_ct_helper_expectfn q931_nat = {
+ .name = "Q.931",
+ .expectfn = ip_nat_q931_expect,
+};
+
+static struct nf_ct_helper_expectfn callforwarding_nat = {
+ .name = "callforwarding",
+ .expectfn = ip_nat_callforwarding_expect,
+};
+
+/****************************************************************************/
+static int __init init(void)
+{
+ BUG_ON(set_h245_addr_hook != NULL);
+ BUG_ON(set_h225_addr_hook != NULL);
+ BUG_ON(set_sig_addr_hook != NULL);
+ BUG_ON(set_ras_addr_hook != NULL);
+ BUG_ON(nat_rtp_rtcp_hook != NULL);
+ BUG_ON(nat_t120_hook != NULL);
+ BUG_ON(nat_h245_hook != NULL);
+ BUG_ON(nat_callforwarding_hook != NULL);
+ BUG_ON(nat_q931_hook != NULL);
+
+ RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
+ RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
+ RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
+ RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
+ RCU_INIT_POINTER(nat_t120_hook, nat_t120);
+ RCU_INIT_POINTER(nat_h245_hook, nat_h245);
+ RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
+ RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+ nf_ct_helper_expectfn_register(&q931_nat);
+ nf_ct_helper_expectfn_register(&callforwarding_nat);
+ return 0;
+}
+
+/****************************************************************************/
+static void __exit fini(void)
+{
+ RCU_INIT_POINTER(set_h245_addr_hook, NULL);
+ RCU_INIT_POINTER(set_h225_addr_hook, NULL);
+ RCU_INIT_POINTER(set_sig_addr_hook, NULL);
+ RCU_INIT_POINTER(set_ras_addr_hook, NULL);
+ RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
+ RCU_INIT_POINTER(nat_t120_hook, NULL);
+ RCU_INIT_POINTER(nat_h245_hook, NULL);
+ RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
+ RCU_INIT_POINTER(nat_q931_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&q931_nat);
+ nf_ct_helper_expectfn_unregister(&callforwarding_nat);
+ synchronize_rcu();
+}
+
+/****************************************************************************/
+module_init(init);
+module_exit(fini);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_h323");
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
new file mode 100644
index 000000000..e59cc05c0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -0,0 +1,481 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi4 *fl4 = &fl->u.ip4;
+
+ if (ct->status & statusbit) {
+ fl4->daddr = t->dst.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl4->saddr = t->src.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_sport = t->src.u.all;
+ }
+}
+#endif /* CONFIG_XFRM */
+
+static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range *range)
+{
+ return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
+ ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
+}
+
+static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
+ __be16 dport)
+{
+ return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
+}
+
+static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph;
+ unsigned int hdroff;
+
+ if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+ return false;
+
+ iph = (void *)skb->data + iphdroff;
+ hdroff = iphdroff + iph->ihl * 4;
+
+ if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
+ target, maniptype))
+ return false;
+ iph = (void *)skb->data + iphdroff;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+ iph->saddr = target->src.u3.ip;
+ } else {
+ csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+ iph->daddr = target->dst.u3.ip;
+ }
+ return true;
+}
+
+static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+ __be32 oldip, newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = iph->saddr;
+ newip = t->src.u3.ip;
+ } else {
+ oldip = iph->daddr;
+ newip = t->dst.u3.ip;
+ }
+ inet_proto_csum_replace4(check, skb, oldip, newip, 1);
+}
+
+static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, proto,
+ csum_partial(data, datalen,
+ 0));
+ if (proto == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_NAT_V4_MINIP]) {
+ range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V4_MAXIP])
+ range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
+ else
+ range->max_addr.ip = range->min_addr.ip;
+
+ return 0;
+}
+#endif
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
+ .l3proto = NFPROTO_IPV4,
+ .in_range = nf_nat_ipv4_in_range,
+ .secure_port = nf_nat_ipv4_secure_port,
+ .manip_pkt = nf_nat_ipv4_manip_pkt,
+ .csum_update = nf_nat_ipv4_csum_update,
+ .csum_recalc = nf_nat_ipv4_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
+#endif
+#ifdef CONFIG_XFRM
+ .decode_session = nf_nat_ipv4_decode_session,
+#endif
+};
+
+int nf_nat_icmp_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum)
+{
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ unsigned int hdrlen = ip_hdrlen(skb);
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+ if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp.type == ICMP_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
+ if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
+ l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Reloading "inside" here since manip_pkt may reallocate */
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp.checksum = 0;
+ inside->icmp.checksum =
+ csum_fold(skb_checksum(skb, hdrlen,
+ skb->len - hdrlen, 0));
+ }
+
+ /* Change outer to look like the reply to an incoming packet */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+unsigned int
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ /* maniptype == SRC for postrouting. */
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ * and local-out, and nf_nat_out protects post-routing.
+ */
+ NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibilty to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ ops->hooknum))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = do_chain(ops, skb, state, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ break;
+
+ ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat,
+ state->out))
+ goto oif_changed;
+ }
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ goto oif_changed;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
+
+unsigned int
+nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ unsigned int ret;
+ __be32 daddr = ip_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ daddr != ip_hdr(skb)->daddr)
+ skb_dst_drop(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
+
+unsigned int
+nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
+
+unsigned int
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv4_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ err = ip_route_me_harder(skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
+static int __init nf_nat_l3proto_ipv4_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
+ if (err < 0)
+ goto err2;
+ return err;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_l3proto_ipv4_exit(void)
+{
+ nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
+
+module_init(nf_nat_l3proto_ipv4_init);
+module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
new file mode 100644
index 000000000..c6eb42100
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -0,0 +1,153 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+ const struct nf_nat_range *range,
+ const struct net_device *out)
+{
+ struct nf_conn *ct;
+ struct nf_conn_nat *nat;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_range newrange;
+ const struct rtable *rt;
+ __be32 newsrc, nh;
+
+ NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ nat = nfct_nat(ct);
+
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
+ return NF_ACCEPT;
+
+ rt = skb_rtable(skb);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
+ if (!newsrc) {
+ pr_info("%s ate my IP address\n", out->name);
+ return NF_DROP;
+ }
+
+ nat->masq_index = out->ifindex;
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newsrc;
+ newrange.max_addr.ip = newsrc;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
+
+static int device_cmp(struct nf_conn *i, void *ifindex)
+{
+ const struct nf_conn_nat *nat = nfct_nat(i);
+
+ if (!nat)
+ return 0;
+ if (nf_ct_l3num(i) != NFPROTO_IPV4)
+ return 0;
+ return nat->masq_index == (int)(long)ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN) {
+ /* Device was downed. Search entire table for
+ * conntracks which were associated with that device,
+ * and forget them.
+ */
+ NF_CT_ASSERT(dev->ifindex != 0);
+
+ nf_ct_iterate_cleanup(net, device_cmp,
+ (void *)(long)dev->ifindex, 0, 0);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, dev);
+ return masq_device_event(this, event, &info);
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+
+void nf_nat_masquerade_ipv4_register_notifier(void)
+{
+ /* check if the notifier was already set */
+ if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
+ return;
+
+ /* Register for device down reports */
+ register_netdevice_notifier(&masq_dev_notifier);
+ /* Register IP address change reports */
+ register_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
+
+void nf_nat_masquerade_ipv4_unregister_notifier(void)
+{
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
+ return;
+
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
new file mode 100644
index 000000000..657d2307f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -0,0 +1,311 @@
+/*
+ * nf_nat_pptp.c
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * (needs netfilter tuple reservation)
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_NAT_PPTP_VERSION "3.0"
+
+#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+MODULE_ALIAS("ip_nat_pptp");
+
+static void pptp_nat_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct net *net = nf_ct_net(ct);
+ const struct nf_conn *master = ct->master;
+ struct nf_conntrack_expect *other_exp;
+ struct nf_conntrack_tuple t;
+ const struct nf_ct_pptp_master *ct_pptp_info;
+ const struct nf_nat_pptp *nat_pptp_info;
+ struct nf_nat_range range;
+
+ ct_pptp_info = nfct_help_data(master);
+ nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
+
+ /* And here goes the grand finale of corrosion... */
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ pr_debug("we are PNS->PAC\n");
+ /* therefore, build tuple for PAC->PNS */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = ct_pptp_info->pac_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ } else {
+ pr_debug("we are PAC->PNS\n");
+ /* build tuple for PNS->PAC */
+ t.src.l3num = AF_INET;
+ t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
+ t.src.u.gre.key = nat_pptp_info->pns_call_id;
+ t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+ t.dst.u.gre.key = nat_pptp_info->pac_call_id;
+ t.dst.protonum = IPPROTO_GRE;
+ }
+
+ pr_debug("trying to unexpect other dir: ");
+ nf_ct_dump_tuple_ip(&t);
+ other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
+ if (other_exp) {
+ nf_ct_unexpect_related(other_exp);
+ nf_ct_expect_put(other_exp);
+ pr_debug("success\n");
+ } else {
+ pr_debug("not found!\n");
+ }
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
+ if (exp->dir == IP_CT_DIR_REPLY) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto = range.max_proto = exp->saved_proto;
+ }
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+/* outbound packets == from PNS to PAC */
+static int
+pptp_outbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+
+{
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_nat_pptp *nat_pptp_info;
+ u_int16_t msg;
+ __be16 new_callid;
+ unsigned int cid_off;
+
+ ct_pptp_info = nfct_help_data(ct);
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+ new_callid = ct_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+ /* FIXME: ideally we would want to reserve a call ID
+ * here. current netfilter NAT core is not able to do
+ * this :( For now we use TCP source port. This breaks
+ * multiple calls within one control session */
+
+ /* save original call ID in nat_info */
+ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+ /* don't use tcph->source since we are at a DSTmanip
+ * hook (e.g. PREROUTING) and pkt is not mangled yet */
+ new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ /* save new call ID in ct info */
+ ct_pptp_info->pns_call_id = new_callid;
+ break;
+ case PPTP_IN_CALL_REPLY:
+ cid_off = offsetof(union pptp_ctrl_union, icack.callID);
+ break;
+ case PPTP_CALL_CLEAR_REQUEST:
+ cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+ break;
+ default:
+ pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+ pptp_msg_name[0]);
+ /* fall through */
+ case PPTP_SET_LINK_INFO:
+ /* only need to NAT in case PAC is behind NAT box */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+ * down to here */
+ pr_debug("altering call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+
+ /* mangle packet */
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ cid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_callid), (char *)&new_callid,
+ sizeof(new_callid)) == 0)
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static void
+pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
+ struct nf_conntrack_expect *expect_reply)
+{
+ const struct nf_conn *ct = expect_orig->master;
+ struct nf_ct_pptp_master *ct_pptp_info;
+ struct nf_nat_pptp *nat_pptp_info;
+
+ ct_pptp_info = nfct_help_data(ct);
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+ /* save original PAC call ID in nat_info */
+ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+ /* alter expectation for PNS->PAC direction */
+ expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+ expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+ expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
+ expect_orig->dir = IP_CT_DIR_ORIGINAL;
+
+ /* alter expectation for PAC->PNS direction */
+ expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+ expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+ expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ expect_reply->dir = IP_CT_DIR_REPLY;
+}
+
+/* inbound packets == from PAC to PNS */
+static int
+pptp_inbound_pkt(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq)
+{
+ const struct nf_nat_pptp *nat_pptp_info;
+ u_int16_t msg;
+ __be16 new_pcid;
+ unsigned int pcid_off;
+
+ nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+ new_pcid = nat_pptp_info->pns_call_id;
+
+ switch (msg = ntohs(ctlh->messageType)) {
+ case PPTP_OUT_CALL_REPLY:
+ pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
+ break;
+ case PPTP_IN_CALL_CONNECT:
+ pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
+ break;
+ case PPTP_IN_CALL_REQUEST:
+ /* only need to nat in case PAC is behind NAT box */
+ return NF_ACCEPT;
+ case PPTP_WAN_ERROR_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
+ break;
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
+ break;
+ case PPTP_SET_LINK_INFO:
+ pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
+ break;
+ default:
+ pr_debug("unknown inbound packet %s\n",
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+ pptp_msg_name[0]);
+ /* fall through */
+ case PPTP_START_SESSION_REQUEST:
+ case PPTP_START_SESSION_REPLY:
+ case PPTP_STOP_SESSION_REQUEST:
+ case PPTP_STOP_SESSION_REPLY:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* no need to alter packet */
+ return NF_ACCEPT;
+ }
+
+ /* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
+ * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */
+
+ /* mangle packet */
+ pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+ ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+
+ if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
+ pcid_off + sizeof(struct pptp_pkt_hdr) +
+ sizeof(struct PptpControlHeader),
+ sizeof(new_pcid), (char *)&new_pcid,
+ sizeof(new_pcid)) == 0)
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+
+static int __init nf_nat_helper_pptp_init(void)
+{
+ nf_nat_need_gre();
+
+ BUG_ON(nf_nat_pptp_hook_outbound != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
+
+ BUG_ON(nf_nat_pptp_hook_inbound != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
+
+ BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+
+ BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
+ return 0;
+}
+
+static void __exit nf_nat_helper_pptp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ synchronize_rcu();
+}
+
+module_init(nf_nat_helper_pptp_init);
+module_exit(nf_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
new file mode 100644
index 000000000..9414923f1
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -0,0 +1,149 @@
+/*
+ * nf_nat_proto_gre.c
+ *
+ * NAT protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
+
+/* generate unique tuple ... */
+static void
+gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u_int16_t key;
+ __be16 *keyptr;
+ unsigned int min, i, range_size;
+
+ /* If there is no master conntrack we are not PPTP,
+ do not change tuples */
+ if (!ct->master)
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ keyptr = &tuple->src.u.gre.key;
+ else
+ keyptr = &tuple->dst.u.gre.key;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ pr_debug("%p: NATing GRE PPTP\n", ct);
+ min = 1;
+ range_size = 0xffff;
+ } else {
+ min = ntohs(range->min_proto.gre.key);
+ range_size = ntohs(range->max_proto.gre.key) - min + 1;
+ }
+
+ pr_debug("min = %u, range_size = %u\n", min, range_size);
+
+ for (i = 0; ; ++key) {
+ *keyptr = htons(min + key % range_size);
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+
+ pr_debug("%p: no NAT mapping\n", ct);
+ return;
+}
+
+/* manipulate a GRE packet according to maniptype */
+static bool
+gre_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct gre_hdr *greh;
+ struct gre_hdr_pptp *pgreh;
+
+ /* pgreh includes two optional 32bit fields which are not required
+ * to be there. That's where the magic '8' comes from */
+ if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ return false;
+
+ greh = (void *)skb->data + hdroff;
+ pgreh = (struct gre_hdr_pptp *)greh;
+
+ /* we only have destination manip of a packet, since 'source key'
+ * is not present in the packet itself */
+ if (maniptype != NF_NAT_MANIP_DST)
+ return true;
+ switch (greh->version) {
+ case GRE_VERSION_1701:
+ /* We do not currently NAT any GREv0 packets.
+ * Try to behave like "nf_nat_proto_unknown" */
+ break;
+ case GRE_VERSION_PPTP:
+ pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
+ pgreh->call_id = tuple->dst.u.gre.key;
+ break;
+ default:
+ pr_debug("can't nat unknown GRE version\n");
+ return false;
+ }
+ return true;
+}
+
+static const struct nf_nat_l4proto gre = {
+ .l4proto = IPPROTO_GRE,
+ .manip_pkt = gre_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = gre_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_gre_init(void)
+{
+ return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
+}
+
+static void __exit nf_nat_proto_gre_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
+}
+
+module_init(nf_nat_proto_gre_init);
+module_exit(nf_nat_proto_gre_fini);
+
+void nf_nat_need_gre(void)
+{
+ return;
+}
+EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
new file mode 100644
index 000000000..4557b4ab8
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -0,0 +1,83 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool
+icmp_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
+ ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
+}
+
+static void
+icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u_int16_t id;
+ unsigned int range_size;
+ unsigned int i;
+
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+ /* If no range specified... */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xFFFF;
+
+ for (i = 0; ; ++id) {
+ tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
+ (id % range_size));
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+ return;
+}
+
+static bool
+icmp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmphdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmphdr *)(skb->data + hdroff);
+ inet_proto_csum_replace2(&hdr->checksum, skb,
+ hdr->un.echo.id, tuple->src.u.icmp.id, 0);
+ hdr->un.echo.id = tuple->src.u.icmp.id;
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
+ .l4proto = IPPROTO_ICMP,
+ .manip_pkt = icmp_manip_pkt,
+ .in_range = icmp_in_range,
+ .unique_tuple = icmp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
new file mode 100644
index 000000000..7c6766713
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -0,0 +1,1313 @@
+/*
+ * nf_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified. The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+#define NOCT1(n) (*(u8 *)(n))
+
+static int debug;
+static DEFINE_SPINLOCK(snmp_lock);
+
+/*
+ * Application layer address mapping mimics the NAT mapping, but
+ * only for the first octet in this case (a more flexible system
+ * can be implemented if needed).
+ */
+struct oct1_map
+{
+ u_int8_t from;
+ u_int8_t to;
+};
+
+
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI 0 /* Universal */
+#define ASN1_APL 1 /* Application */
+#define ASN1_CTX 2 /* Context */
+#define ASN1_PRV 3 /* Private */
+
+/* Tag */
+#define ASN1_EOC 0 /* End Of Contents */
+#define ASN1_BOL 1 /* Boolean */
+#define ASN1_INT 2 /* Integer */
+#define ASN1_BTS 3 /* Bit String */
+#define ASN1_OTS 4 /* Octet String */
+#define ASN1_NUL 5 /* Null */
+#define ASN1_OJI 6 /* Object Identifier */
+#define ASN1_OJD 7 /* Object Description */
+#define ASN1_EXT 8 /* External */
+#define ASN1_SEQ 16 /* Sequence */
+#define ASN1_SET 17 /* Set */
+#define ASN1_NUMSTR 18 /* Numerical String */
+#define ASN1_PRNSTR 19 /* Printable String */
+#define ASN1_TEXSTR 20 /* Teletext String */
+#define ASN1_VIDSTR 21 /* Video String */
+#define ASN1_IA5STR 22 /* IA5 String */
+#define ASN1_UNITIM 23 /* Universal Time */
+#define ASN1_GENTIM 24 /* General Time */
+#define ASN1_GRASTR 25 /* Graphical String */
+#define ASN1_VISSTR 26 /* Visible String */
+#define ASN1_GENSTR 27 /* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI 0 /* Primitive */
+#define ASN1_CON 1 /* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR 0
+#define ASN1_ERR_DEC_EMPTY 2
+#define ASN1_ERR_DEC_EOC_MISMATCH 3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
+#define ASN1_ERR_DEC_BADVALUE 5
+
+/*
+ * ASN.1 context.
+ */
+struct asn1_ctx
+{
+ int error; /* Error condition */
+ unsigned char *pointer; /* Octet just to be decoded */
+ unsigned char *begin; /* First octet */
+ unsigned char *end; /* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr
+{
+ unsigned char *data;
+ unsigned int len;
+};
+
+static void asn1_open(struct asn1_ctx *ctx,
+ unsigned char *buf,
+ unsigned int len)
+{
+ ctx->begin = buf;
+ ctx->end = buf + len;
+ ctx->pointer = buf;
+ ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+ if (ctx->pointer >= ctx->end) {
+ ctx->error = ASN1_ERR_DEC_EMPTY;
+ return 0;
+ }
+ *ch = *(ctx->pointer)++;
+ return 1;
+}
+
+static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+ unsigned char ch;
+
+ *tag = 0;
+
+ do
+ {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *tag <<= 7;
+ *tag |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned char ch;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *cls = (ch & 0xC0) >> 6;
+ *con = (ch & 0x20) >> 5;
+ *tag = (ch & 0x1F);
+
+ if (*tag == 0x1F) {
+ if (!asn1_tag_decode(ctx, tag))
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
+ unsigned int *def,
+ unsigned int *len)
+{
+ unsigned char ch, cnt;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch == 0x80)
+ *def = 0;
+ else {
+ *def = 1;
+
+ if (ch < 0x80)
+ *len = ch;
+ else {
+ cnt = ch & 0x7F;
+ *len = 0;
+
+ while (cnt > 0) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+ *len <<= 8;
+ *len |= ch;
+ cnt--;
+ }
+ }
+ }
+
+ /* don't trust len bigger than ctx buffer */
+ if (*len > ctx->end - ctx->pointer)
+ return 0;
+
+ return 1;
+}
+
+static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
+ unsigned char **eoc,
+ unsigned int *cls,
+ unsigned int *con,
+ unsigned int *tag)
+{
+ unsigned int def, len;
+
+ if (!asn1_id_decode(ctx, cls, con, tag))
+ return 0;
+
+ def = len = 0;
+ if (!asn1_length_decode(ctx, &def, &len))
+ return 0;
+
+ /* primitive shall be definite, indefinite shall be constructed */
+ if (*con == ASN1_PRI && !def)
+ return 0;
+
+ if (def)
+ *eoc = ctx->pointer + len;
+ else
+ *eoc = NULL;
+ return 1;
+}
+
+static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ unsigned char ch;
+
+ if (eoc == NULL) {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ if (ch != 0x00) {
+ ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+ return 0;
+ }
+ return 1;
+ } else {
+ if (ctx->pointer != eoc) {
+ ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+ return 0;
+ }
+ return 1;
+ }
+}
+
+static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+ ctx->pointer = eoc;
+ return 1;
+}
+
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = (signed char) ch;
+ len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned int *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned int)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long *integer)
+{
+ unsigned char ch;
+ unsigned int len;
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer = ch;
+ if (ch == 0) len = 0;
+ else len = 1;
+
+ while (ctx->pointer < eoc) {
+ if (++len > sizeof (unsigned long)) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ return 0;
+ }
+
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *integer <<= 8;
+ *integer |= ch;
+ }
+ return 1;
+}
+
+static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned char **octets,
+ unsigned int *len)
+{
+ unsigned char *ptr;
+
+ *len = 0;
+
+ *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+ if (*octets == NULL)
+ return 0;
+
+ ptr = *octets;
+ while (ctx->pointer < eoc) {
+ if (!asn1_octet_decode(ctx, ptr++)) {
+ kfree(*octets);
+ *octets = NULL;
+ return 0;
+ }
+ (*len)++;
+ }
+ return 1;
+}
+
+static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
+ unsigned long *subid)
+{
+ unsigned char ch;
+
+ *subid = 0;
+
+ do {
+ if (!asn1_octet_decode(ctx, &ch))
+ return 0;
+
+ *subid <<= 7;
+ *subid |= ch & 0x7F;
+ } while ((ch & 0x80) == 0x80);
+ return 1;
+}
+
+static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
+ unsigned char *eoc,
+ unsigned long **oid,
+ unsigned int *len)
+{
+ unsigned long subid;
+ unsigned long *optr;
+ size_t size;
+
+ size = eoc - ctx->pointer + 1;
+
+ /* first subid actually encodes first two subids */
+ if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ return 0;
+
+ *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+ if (*oid == NULL)
+ return 0;
+
+ optr = *oid;
+
+ if (!asn1_subid_decode(ctx, &subid)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+ if (subid < 40) {
+ optr[0] = 0;
+ optr[1] = subid;
+ } else if (subid < 80) {
+ optr[0] = 1;
+ optr[1] = subid - 40;
+ } else {
+ optr[0] = 2;
+ optr[1] = subid - 80;
+ }
+
+ *len = 2;
+ optr += 2;
+
+ while (ctx->pointer < eoc) {
+ if (++(*len) > size) {
+ ctx->error = ASN1_ERR_DEC_BADVALUE;
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+
+ if (!asn1_subid_decode(ctx, optr++)) {
+ kfree(*oid);
+ *oid = NULL;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * SNMP decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* SNMP Versions */
+#define SNMP_V1 0
+#define SNMP_V2C 1
+#define SNMP_V2 2
+#define SNMP_V3 3
+
+/* Default Sizes */
+#define SNMP_SIZE_COMM 256
+#define SNMP_SIZE_OBJECTID 128
+#define SNMP_SIZE_BUFCHR 256
+#define SNMP_SIZE_BUFINT 128
+#define SNMP_SIZE_SMALLOBJECTID 16
+
+/* Requests */
+#define SNMP_PDU_GET 0
+#define SNMP_PDU_NEXT 1
+#define SNMP_PDU_RESPONSE 2
+#define SNMP_PDU_SET 3
+#define SNMP_PDU_TRAP1 4
+#define SNMP_PDU_BULK 5
+#define SNMP_PDU_INFORM 6
+#define SNMP_PDU_TRAP2 7
+
+/* Errors */
+#define SNMP_NOERROR 0
+#define SNMP_TOOBIG 1
+#define SNMP_NOSUCHNAME 2
+#define SNMP_BADVALUE 3
+#define SNMP_READONLY 4
+#define SNMP_GENERROR 5
+#define SNMP_NOACCESS 6
+#define SNMP_WRONGTYPE 7
+#define SNMP_WRONGLENGTH 8
+#define SNMP_WRONGENCODING 9
+#define SNMP_WRONGVALUE 10
+#define SNMP_NOCREATION 11
+#define SNMP_INCONSISTENTVALUE 12
+#define SNMP_RESOURCEUNAVAILABLE 13
+#define SNMP_COMMITFAILED 14
+#define SNMP_UNDOFAILED 15
+#define SNMP_AUTHORIZATIONERROR 16
+#define SNMP_NOTWRITABLE 17
+#define SNMP_INCONSISTENTNAME 18
+
+/* General SNMP V1 Traps */
+#define SNMP_TRAP_COLDSTART 0
+#define SNMP_TRAP_WARMSTART 1
+#define SNMP_TRAP_LINKDOWN 2
+#define SNMP_TRAP_LINKUP 3
+#define SNMP_TRAP_AUTFAILURE 4
+#define SNMP_TRAP_EQPNEIGHBORLOSS 5
+#define SNMP_TRAP_ENTSPECIFIC 6
+
+/* SNMPv1 Types */
+#define SNMP_NULL 0
+#define SNMP_INTEGER 1 /* l */
+#define SNMP_OCTETSTR 2 /* c */
+#define SNMP_DISPLAYSTR 2 /* c */
+#define SNMP_OBJECTID 3 /* ul */
+#define SNMP_IPADDR 4 /* uc */
+#define SNMP_COUNTER 5 /* ul */
+#define SNMP_GAUGE 6 /* ul */
+#define SNMP_TIMETICKS 7 /* ul */
+#define SNMP_OPAQUE 8 /* c */
+
+/* Additional SNMPv2 Types */
+#define SNMP_UINTEGER 5 /* ul */
+#define SNMP_BITSTR 9 /* uc */
+#define SNMP_NSAP 10 /* uc */
+#define SNMP_COUNTER64 11 /* ul */
+#define SNMP_NOSUCHOBJECT 12
+#define SNMP_NOSUCHINSTANCE 13
+#define SNMP_ENDOFMIBVIEW 14
+
+union snmp_syntax
+{
+ unsigned char uc[0]; /* 8 bit unsigned */
+ char c[0]; /* 8 bit signed */
+ unsigned long ul[0]; /* 32 bit unsigned */
+ long l[0]; /* 32 bit signed */
+};
+
+struct snmp_object
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned short type;
+ unsigned int syntax_len;
+ union snmp_syntax syntax;
+};
+
+struct snmp_request
+{
+ unsigned long id;
+ unsigned int error_status;
+ unsigned int error_index;
+};
+
+struct snmp_v1_trap
+{
+ unsigned long *id;
+ unsigned int id_len;
+ unsigned long ip_address; /* pointer */
+ unsigned int general;
+ unsigned int specific;
+ unsigned long time;
+};
+
+/* SNMP types */
+#define SNMP_IPA 0
+#define SNMP_CNT 1
+#define SNMP_GGE 2
+#define SNMP_TIT 3
+#define SNMP_OPQ 4
+#define SNMP_C64 6
+
+/* SNMP errors */
+#define SERR_NSO 0
+#define SERR_NSI 1
+#define SERR_EOM 2
+
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ __sum16 *check);
+struct snmp_cnv
+{
+ unsigned int class;
+ unsigned int tag;
+ int syntax;
+};
+
+static const struct snmp_cnv snmp_conv[] = {
+ {ASN1_UNI, ASN1_NUL, SNMP_NULL},
+ {ASN1_UNI, ASN1_INT, SNMP_INTEGER},
+ {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
+ {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
+ {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
+ {ASN1_APL, SNMP_IPA, SNMP_IPADDR},
+ {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */
+ {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */
+ {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
+ {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
+
+ /* SNMPv2 data types and errors */
+ {ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
+ {ASN1_APL, SNMP_C64, SNMP_COUNTER64},
+ {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
+ {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
+ {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
+ {0, 0, -1}
+};
+
+static unsigned char snmp_tag_cls2syntax(unsigned int tag,
+ unsigned int cls,
+ unsigned short *syntax)
+{
+ const struct snmp_cnv *cnv;
+
+ cnv = snmp_conv;
+
+ while (cnv->syntax != -1) {
+ if (cnv->tag == tag && cnv->class == cls) {
+ *syntax = cnv->syntax;
+ return 1;
+ }
+ cnv++;
+ }
+ return 0;
+}
+
+static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
+ struct snmp_object **obj)
+{
+ unsigned int cls, con, tag, len, idlen;
+ unsigned short type;
+ unsigned char *eoc, *end, *p;
+ unsigned long *lp, *id;
+ unsigned long ul;
+ long l;
+
+ *obj = NULL;
+ id = NULL;
+
+ if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &id, &idlen))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
+ kfree(id);
+ return 0;
+ }
+
+ if (con != ASN1_PRI) {
+ kfree(id);
+ return 0;
+ }
+
+ type = 0;
+ if (!snmp_tag_cls2syntax(tag, cls, &type)) {
+ kfree(id);
+ return 0;
+ }
+
+ l = 0;
+ switch (type) {
+ case SNMP_INTEGER:
+ len = sizeof(long);
+ if (!asn1_long_decode(ctx, end, &l)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.l[0] = l;
+ break;
+ case SNMP_OCTETSTR:
+ case SNMP_OPAQUE:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.c, p, len);
+ kfree(p);
+ break;
+ case SNMP_NULL:
+ case SNMP_NOSUCHOBJECT:
+ case SNMP_NOSUCHINSTANCE:
+ case SNMP_ENDOFMIBVIEW:
+ len = 0;
+ *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ if (!asn1_null_decode(ctx, end)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ break;
+ case SNMP_OBJECTID:
+ if (!asn1_oid_decode(ctx, end, &lp, &len)) {
+ kfree(id);
+ return 0;
+ }
+ len *= sizeof(unsigned long);
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(lp);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.ul, lp, len);
+ kfree(lp);
+ break;
+ case SNMP_IPADDR:
+ if (!asn1_octets_decode(ctx, end, &p, &len)) {
+ kfree(id);
+ return 0;
+ }
+ if (len != 4) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(p);
+ kfree(id);
+ return 0;
+ }
+ memcpy((*obj)->syntax.uc, p, len);
+ kfree(p);
+ break;
+ case SNMP_COUNTER:
+ case SNMP_GAUGE:
+ case SNMP_TIMETICKS:
+ len = sizeof(unsigned long);
+ if (!asn1_ulong_decode(ctx, end, &ul)) {
+ kfree(id);
+ return 0;
+ }
+ *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+ if (*obj == NULL) {
+ kfree(id);
+ return 0;
+ }
+ (*obj)->syntax.ul[0] = ul;
+ break;
+ default:
+ kfree(id);
+ return 0;
+ }
+
+ (*obj)->syntax_len = len;
+ (*obj)->type = type;
+ (*obj)->id = id;
+ (*obj)->id_len = idlen;
+
+ if (!asn1_eoc_decode(ctx, eoc)) {
+ kfree(id);
+ kfree(*obj);
+ *obj = NULL;
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
+ struct snmp_request *request)
+{
+ unsigned int cls, con, tag;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_ulong_decode(ctx, end, &request->id))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_status))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+
+ if (!asn1_uint_decode(ctx, end, &request->error_index))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Fast checksum update for possibly oddly-aligned UDP byte, from the
+ * code example in the draft.
+ */
+static void fast_csum(__sum16 *csum,
+ const unsigned char *optr,
+ const unsigned char *nptr,
+ int offset)
+{
+ unsigned char s[4];
+
+ if (offset & 1) {
+ s[0] = ~0;
+ s[1] = ~*optr;
+ s[2] = 0;
+ s[3] = *nptr;
+ } else {
+ s[0] = ~*optr;
+ s[1] = ~0;
+ s[2] = *nptr;
+ s[3] = 0;
+ }
+
+ *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
+}
+
+/*
+ * Mangle IP address.
+ * - begin points to the start of the snmp messgae
+ * - addr points to the start of the address
+ */
+static inline void mangle_address(unsigned char *begin,
+ unsigned char *addr,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ if (map->from == NOCT1(addr)) {
+ u_int32_t old;
+
+ if (debug)
+ memcpy(&old, addr, sizeof(old));
+
+ *addr = map->to;
+
+ /* Update UDP checksum if being used */
+ if (*check) {
+ fast_csum(check,
+ &map->from, &map->to, addr - begin);
+
+ }
+
+ if (debug)
+ printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n",
+ &old, addr);
+ }
+}
+
+static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
+ struct snmp_v1_trap *trap,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ unsigned int cls, con, tag, len;
+ unsigned char *end;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+ return 0;
+
+ if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
+ return 0;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_id_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
+ goto err_id_free;
+
+ if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
+ goto err_id_free;
+
+ /* IPv4 only */
+ if (len != 4)
+ goto err_addr_free;
+
+ mangle_address(ctx->begin, ctx->pointer - 4, map, check);
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->general))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ goto err_addr_free;
+
+ if (!asn1_uint_decode(ctx, end, &trap->specific))
+ goto err_addr_free;
+
+ if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+ goto err_addr_free;
+
+ if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
+ (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
+ goto err_addr_free;
+
+ if (!asn1_ulong_decode(ctx, end, &trap->time))
+ goto err_addr_free;
+
+ return 1;
+
+err_addr_free:
+ kfree((unsigned long *)trap->ip_address);
+
+err_id_free:
+ kfree(trap->id);
+
+ return 0;
+}
+
+/*****************************************************************************
+ *
+ * Misc. routines
+ *
+ *****************************************************************************/
+
+static void hex_dump(const unsigned char *buf, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i && !(i % 16))
+ printk("\n");
+ printk("%02x ", *(buf + i));
+ }
+ printk("\n");
+}
+
+/*
+ * Parse and mangle SNMP message according to mapping.
+ * (And this is the fucking 'basic' method).
+ */
+static int snmp_parse_mangle(unsigned char *msg,
+ u_int16_t len,
+ const struct oct1_map *map,
+ __sum16 *check)
+{
+ unsigned char *eoc, *end;
+ unsigned int cls, con, tag, vers, pdutype;
+ struct asn1_ctx ctx;
+ struct asn1_octstr comm;
+ struct snmp_object *obj;
+
+ if (debug > 1)
+ hex_dump(msg, len);
+
+ asn1_open(&ctx, msg, len);
+
+ /*
+ * Start of SNMP message.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ /*
+ * Version 1 or 2 handled.
+ */
+ if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+ return 0;
+ if (!asn1_uint_decode (&ctx, end, &vers))
+ return 0;
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+ if (vers > 1)
+ return 1;
+
+ /*
+ * Community.
+ */
+ if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
+ return 0;
+ if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
+ return 0;
+ if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
+ return 0;
+ if (debug > 1) {
+ unsigned int i;
+
+ printk(KERN_DEBUG "bsalg: community: ");
+ for (i = 0; i < comm.len; i++)
+ printk("%c", comm.data[i]);
+ printk("\n");
+ }
+ kfree(comm.data);
+
+ /*
+ * PDU type
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
+ return 0;
+ if (cls != ASN1_CTX || con != ASN1_CON)
+ return 0;
+ if (debug > 1) {
+ static const unsigned char *const pdus[] = {
+ [SNMP_PDU_GET] = "get",
+ [SNMP_PDU_NEXT] = "get-next",
+ [SNMP_PDU_RESPONSE] = "response",
+ [SNMP_PDU_SET] = "set",
+ [SNMP_PDU_TRAP1] = "trapv1",
+ [SNMP_PDU_BULK] = "bulk",
+ [SNMP_PDU_INFORM] = "inform",
+ [SNMP_PDU_TRAP2] = "trapv2"
+ };
+
+ if (pdutype > SNMP_PDU_TRAP2)
+ printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+ else
+ printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+ }
+ if (pdutype != SNMP_PDU_RESPONSE &&
+ pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
+ return 1;
+
+ /*
+ * Request header or v1 trap
+ */
+ if (pdutype == SNMP_PDU_TRAP1) {
+ struct snmp_v1_trap trap;
+ unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+
+ if (ret) {
+ kfree(trap.id);
+ kfree((unsigned long *)trap.ip_address);
+ } else
+ return ret;
+
+ } else {
+ struct snmp_request req;
+
+ if (!snmp_request_decode(&ctx, &req))
+ return 0;
+
+ if (debug > 1)
+ printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+ "error_index=%u\n", req.id, req.error_status,
+ req.error_index);
+ }
+
+ /*
+ * Loop through objects, look for IP addresses to mangle.
+ */
+ if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+ return 0;
+
+ if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+ return 0;
+
+ while (!asn1_eoc_decode(&ctx, eoc)) {
+ unsigned int i;
+
+ if (!snmp_object_decode(&ctx, &obj)) {
+ if (obj) {
+ kfree(obj->id);
+ kfree(obj);
+ }
+ return 0;
+ }
+
+ if (debug > 1) {
+ printk(KERN_DEBUG "bsalg: object: ");
+ for (i = 0; i < obj->id_len; i++) {
+ if (i > 0)
+ printk(".");
+ printk("%lu", obj->id[i]);
+ }
+ printk(": type=%u\n", obj->type);
+
+ }
+
+ if (obj->type == SNMP_IPADDR)
+ mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+
+ kfree(obj->id);
+ kfree(obj);
+ }
+
+ if (!asn1_eoc_decode(&ctx, eoc))
+ return 0;
+
+ return 1;
+}
+
+/*****************************************************************************
+ *
+ * NAT routines.
+ *
+ *****************************************************************************/
+
+/*
+ * SNMP translation routine.
+ */
+static int snmp_translate(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+ u_int16_t udplen = ntohs(udph->len);
+ u_int16_t paylen = udplen - sizeof(struct udphdr);
+ int dir = CTINFO2DIR(ctinfo);
+ struct oct1_map map;
+
+ /*
+ * Determine mappping for application layer addresses based
+ * on NAT manipulations for the packet.
+ */
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ /* SNAT traps */
+ map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+ } else {
+ /* DNAT replies */
+ map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip);
+ map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip);
+ }
+
+ if (map.from == map.to)
+ return NF_ACCEPT;
+
+ if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
+ paylen, &map, &udph->check)) {
+ net_warn_ratelimited("bsalg: parser failed\n");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted */
+static int help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int ret;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+ /* SNMP replies and originating SNMP traps get mangled */
+ if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+ if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* No NAT? */
+ if (!(ct->status & IPS_NAT_MASK))
+ return NF_ACCEPT;
+
+ /*
+ * Make sure the packet length is ok. So far, we were only guaranteed
+ * to have a valid length IP header plus 8 bytes, which means we have
+ * enough room for a UDP header. Just verify the UDP length field so we
+ * can mess around with the payload.
+ */
+ if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+ net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+ &iph->saddr, &iph->daddr);
+ return NF_DROP;
+ }
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ spin_lock_bh(&snmp_lock);
+ ret = snmp_translate(ct, ctinfo, skb);
+ spin_unlock_bh(&snmp_lock);
+ return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+ .max_expected = 0,
+ .timeout = 180,
+};
+
+static struct nf_conntrack_helper snmp_helper __read_mostly = {
+ .me = THIS_MODULE,
+ .help = help,
+ .expect_policy = &snmp_exp_policy,
+ .name = "snmp",
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+ .me = THIS_MODULE,
+ .help = help,
+ .expect_policy = &snmp_exp_policy,
+ .name = "snmp_trap",
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+};
+
+/*****************************************************************************
+ *
+ * Module stuff.
+ *
+ *****************************************************************************/
+
+static int __init nf_nat_snmp_basic_init(void)
+{
+ int ret = 0;
+
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_snmp_hook, help);
+
+ ret = nf_conntrack_helper_register(&snmp_trap_helper);
+ if (ret < 0) {
+ nf_conntrack_helper_unregister(&snmp_helper);
+ return ret;
+ }
+ return ret;
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_snmp_hook, NULL);
+ nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
+
+module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
new file mode 100644
index 000000000..3262e41ff
--- /dev/null
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -0,0 +1,192 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
+{
+ const struct tcphdr *oth;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
+ sizeof(struct tcphdr), _oth);
+ if (oth == NULL)
+ return NULL;
+
+ /* No RST for RST. */
+ if (oth->rst)
+ return NULL;
+
+ /* Check checksum */
+ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
+ return NULL;
+
+ return oth;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
+
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
+{
+ struct iphdr *niph, *oiph = ip_hdr(oldskb);
+
+ skb_reset_network_header(nskb);
+ niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+ niph->version = 4;
+ niph->ihl = sizeof(struct iphdr) / 4;
+ niph->tos = 0;
+ niph->id = 0;
+ niph->frag_off = htons(IP_DF);
+ niph->protocol = protocol;
+ niph->check = 0;
+ niph->saddr = oiph->daddr;
+ niph->daddr = oiph->saddr;
+ niph->ttl = ttl;
+
+ nskb->protocol = htons(ETH_P_IP);
+
+ return niph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
+
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
+{
+ struct iphdr *niph = ip_hdr(nskb);
+ struct tcphdr *tcph;
+
+ skb_reset_transport_header(nskb);
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ memset(tcph, 0, sizeof(*tcph));
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+
+ if (oth->ack) {
+ tcph->seq = oth->ack_seq;
+ } else {
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ oldskb->len - ip_hdrlen(oldskb) -
+ (oth->doff << 2));
+ tcph->ack = 1;
+ }
+
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+/* Send RST reply */
+void nf_send_reset(struct sk_buff *oldskb, int hook)
+{
+ struct sk_buff *nskb;
+ const struct iphdr *oiph;
+ struct iphdr *niph;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return;
+
+ if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return;
+
+ oiph = ip_hdr(oldskb);
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ /* ip_route_me_harder expects skb->dst to be set */
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip4_dst_hoplimit(skb_dst(nskb)));
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+
+ if (ip_route_me_harder(nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ /* "Never happens" */
+ if (nskb->len > dst_mtu(skb_dst(nskb)))
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (oldskb->nf_bridge) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+
+ nskb->dev = nf_bridge_get_physindev(oldskb);
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ goto free_nskb;
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip_local_out(nskb);
+
+ return;
+
+ free_nskb:
+ kfree_skb(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset);
+
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+ struct iphdr *iph = ip_hdr(skb_in);
+ u8 proto;
+
+ if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ if (skb_csum_unnecessary(skb_in)) {
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+ return;
+ }
+
+ if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
+ proto = iph->protocol;
+ else
+ proto = 0;
+
+ if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
new file mode 100644
index 000000000..8412268bb
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int
+nft_do_chain_arp(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static struct nft_af_info nft_af_arp __read_mostly = {
+ .family = NFPROTO_ARP,
+ .nhooks = NF_ARP_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_ARP_IN] = nft_do_chain_arp,
+ [NF_ARP_OUT] = nft_do_chain_arp,
+ [NF_ARP_FORWARD] = nft_do_chain_arp,
+ },
+};
+
+static int nf_tables_arp_init_net(struct net *net)
+{
+ net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.arp== NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
+
+ if (nft_register_afinfo(net, net->nft.arp) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.arp);
+ return -ENOMEM;
+}
+
+static void nf_tables_arp_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.arp);
+ kfree(net->nft.arp);
+}
+
+static struct pernet_operations nf_tables_arp_net_ops = {
+ .init = nf_tables_arp_init_net,
+ .exit = nf_tables_arp_exit_net,
+};
+
+static const struct nf_chain_type filter_arp = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_ARP,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_ARP_IN) |
+ (1 << NF_ARP_OUT) |
+ (1 << NF_ARP_FORWARD),
+};
+
+static int __init nf_tables_arp_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_arp);
+ ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_arp);
+
+ return ret;
+}
+
+static void __exit nf_tables_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_arp_net_ops);
+ nft_unregister_chain_type(&filter_arp);
+}
+
+module_init(nf_tables_arp_init);
+module_exit(nf_tables_arp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
new file mode 100644
index 000000000..aa180d3a6
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+
+static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (unlikely(skb->len < sizeof(struct iphdr) ||
+ ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
+ if (net_ratelimit())
+ pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
+ "packet\n");
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain_ipv4(ops, skb, state);
+}
+
+struct nft_af_info nft_af_ipv4 __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
+ [NF_INET_LOCAL_OUT] = nft_ipv4_output,
+ [NF_INET_FORWARD] = nft_do_chain_ipv4,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
+ },
+};
+EXPORT_SYMBOL_GPL(nft_af_ipv4);
+
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+ net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.ipv4 == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+ if (nft_register_afinfo(net, net->nft.ipv4) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.ipv4);
+ return -ENOMEM;
+}
+
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.ipv4);
+ kfree(net->nft.ipv4);
+}
+
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+ .init = nf_tables_ipv4_init_net,
+ .exit = nf_tables_ipv4_exit_net,
+};
+
+static const struct nf_chain_type filter_ipv4 = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_ipv4_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_ipv4);
+ ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_ipv4);
+
+ return ret;
+}
+
+static void __exit nf_tables_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
+ nft_unregister_chain_type(&filter_ipv4);
+}
+
+module_init(nf_tables_ipv4_init);
+module_exit(nf_tables_ipv4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET);
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
new file mode 100644
index 000000000..bf5c30ae1
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_in(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_out(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv4_local_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static const struct nf_chain_type nft_chain_nat_ipv4 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_ipv4_in,
+ [NF_INET_POST_ROUTING] = nft_nat_ipv4_out,
+ [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn,
+ [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn,
+ },
+};
+
+static int __init nft_chain_nat_init(void)
+{
+ int err;
+
+ err = nft_register_chain_type(&nft_chain_nat_ipv4);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_nat_ipv4);
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
new file mode 100644
index 000000000..e335b0afd
--- /dev/null
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ unsigned int ret;
+ struct nft_pktinfo pkt;
+ u32 mark;
+ __be32 saddr, daddr;
+ u_int8_t tos;
+ const struct iphdr *iph;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
+ return NF_ACCEPT;
+
+ nft_set_pktinfo_ipv4(&pkt, ops, skb, state);
+
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = nft_do_chain(&pkt, ops);
+ if (ret != NF_DROP && ret != NF_QUEUE) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos)
+ if (ip_route_me_harder(skb, RTN_UNSPEC))
+ ret = NF_DROP;
+ }
+ return ret;
+}
+
+static const struct nf_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
+};
+
+static int __init nft_chain_route_init(void)
+{
+ return nft_register_chain_type(&nft_chain_route_ipv4);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
new file mode 100644
index 000000000..40e414c4c
--- /dev/null
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_masq.h>
+#include <net/netfilter/ipv4/nf_nat_masquerade.h>
+
+static void nft_masq_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_masq *priv = nft_expr_priv(expr);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
+
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
+ &range, pkt->out);
+}
+
+static struct nft_expr_type nft_masq_ipv4_type;
+static const struct nft_expr_ops nft_masq_ipv4_ops = {
+ .type = &nft_masq_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_ipv4_eval,
+ .init = nft_masq_init,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+};
+
+static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "masq",
+ .ops = &nft_masq_ipv4_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_ipv4_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_masq_ipv4_type);
+ if (ret < 0)
+ return ret;
+
+ nf_nat_masquerade_ipv4_register_notifier();
+
+ return ret;
+}
+
+static void __exit nft_masq_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_masq_ipv4_type);
+ nf_nat_masquerade_ipv4_unregister_notifier();
+}
+
+module_init(nft_masq_ipv4_module_init);
+module_exit(nft_masq_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
new file mode 100644
index 000000000..d8d795df9
--- /dev/null
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
+#include <net/netfilter/nft_redir.h>
+
+static void nft_redir_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_ipv4_multi_range_compat mr;
+
+ memset(&mr, 0, sizeof(mr));
+ if (priv->sreg_proto_min) {
+ mr.range[0].min.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ mr.range[0].max.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ mr.range[0].flags |= priv->flags;
+
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr,
+ pkt->ops->hooknum);
+}
+
+static struct nft_expr_type nft_redir_ipv4_type;
+static const struct nft_expr_ops nft_redir_ipv4_ops = {
+ .type = &nft_redir_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_ipv4_eval,
+ .init = nft_redir_init,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+};
+
+static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "redir",
+ .ops = &nft_redir_ipv4_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_redir_ipv4_type);
+}
+
+static void __exit nft_redir_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_redir_ipv4_type);
+}
+
+module_init(nft_redir_ipv4_module_init);
+module_exit(nft_redir_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
new file mode 100644
index 000000000..b07e58b51
--- /dev/null
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/nft_reject.h>
+
+static void nft_reject_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach(pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ break;
+ default:
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv4_type;
+static const struct nft_expr_ops nft_reject_ipv4_ops = {
+ .type = &nft_reject_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv4_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "reject",
+ .ops = &nft_reject_ipv4_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv4_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv4_type);
+}
+
+static void __exit nft_reject_ipv4_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv4_type);
+}
+
+module_init(nft_reject_ipv4_module_init);
+module_exit(nft_reject_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 000000000..05ff44b75
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,1223 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
+ * Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#endif
+
+struct ping_table {
+ struct hlist_nulls_head hash[PING_HTABLE_SIZE];
+ rwlock_t lock;
+};
+
+static struct ping_table ping_table;
+struct pingv6_ops pingv6_ops;
+EXPORT_SYMBOL_GPL(pingv6_ops);
+
+static u16 ping_port_rover;
+
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
+{
+ u32 res = (num + net_hash_mix(net)) & mask;
+
+ pr_debug("hash(%u) = %u\n", num, res);
+ return res;
+}
+EXPORT_SYMBOL_GPL(ping_hash);
+
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+ struct net *net, unsigned int num)
+{
+ return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
+}
+
+int ping_get_port(struct sock *sk, unsigned short ident)
+{
+ struct hlist_nulls_node *node;
+ struct hlist_nulls_head *hlist;
+ struct inet_sock *isk, *isk2;
+ struct sock *sk2 = NULL;
+
+ isk = inet_sk(sk);
+ write_lock_bh(&ping_table.lock);
+ if (ident == 0) {
+ u32 i;
+ u16 result = ping_port_rover + 1;
+
+ for (i = 0; i < (1L << 16); i++, result++) {
+ if (!result)
+ result++; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, sock_net(sk),
+ result);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ if (isk2->inet_num == result)
+ goto next_port;
+ }
+
+ /* found */
+ ping_port_rover = ident = result;
+ break;
+next_port:
+ ;
+ }
+ if (i >= (1L << 16))
+ goto fail;
+ } else {
+ hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ ping_portaddr_for_each_entry(sk2, node, hlist) {
+ isk2 = inet_sk(sk2);
+
+ /* BUG? Why is this reuse and not reuseaddr? ping.c
+ * doesn't turn off SO_REUSEADDR, and it doesn't expect
+ * that other ping processes can steal its packets.
+ */
+ if ((isk2->inet_num == ident) &&
+ (sk2 != sk) &&
+ (!sk2->sk_reuse || !sk->sk_reuse))
+ goto fail;
+ }
+ }
+
+ pr_debug("found port/ident = %d\n", ident);
+ isk->inet_num = ident;
+ if (sk_unhashed(sk)) {
+ pr_debug("was not hashed\n");
+ sock_hold(sk);
+ hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ }
+ write_unlock_bh(&ping_table.lock);
+ return 0;
+
+fail:
+ write_unlock_bh(&ping_table.lock);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ping_get_port);
+
+void ping_hash(struct sock *sk)
+{
+ pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+ BUG(); /* "Please do not press this button again." */
+}
+
+void ping_unhash(struct sock *sk)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+ if (sk_hashed(sk)) {
+ write_lock_bh(&ping_table.lock);
+ hlist_nulls_del(&sk->sk_nulls_node);
+ sk_nulls_node_init(&sk->sk_nulls_node);
+ sock_put(sk);
+ isk->inet_num = 0;
+ isk->inet_sport = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&ping_table.lock);
+ }
+}
+EXPORT_SYMBOL_GPL(ping_unhash);
+
+static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
+{
+ struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+ struct sock *sk = NULL;
+ struct inet_sock *isk;
+ struct hlist_nulls_node *hnode;
+ int dif = skb->dev->ifindex;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
+ (int)ident, &ip_hdr(skb)->daddr, dif);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
+ (int)ident, &ipv6_hdr(skb)->daddr, dif);
+#endif
+ }
+
+ read_lock_bh(&ping_table.lock);
+
+ ping_portaddr_for_each_entry(sk, hnode, hslot) {
+ isk = inet_sk(sk);
+
+ pr_debug("iterate\n");
+ if (isk->inet_num != ident)
+ continue;
+
+ if (skb->protocol == htons(ETH_P_IP) &&
+ sk->sk_family == AF_INET) {
+ pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk,
+ (int) isk->inet_num, &isk->inet_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (isk->inet_rcv_saddr &&
+ isk->inet_rcv_saddr != ip_hdr(skb)->daddr)
+ continue;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6) &&
+ sk->sk_family == AF_INET6) {
+
+ pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk,
+ (int) isk->inet_num,
+ &sk->sk_v6_rcv_saddr,
+ sk->sk_bound_dev_if);
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr,
+ &ipv6_hdr(skb)->daddr))
+ continue;
+#endif
+ } else {
+ continue;
+ }
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
+ sock_hold(sk);
+ goto exit;
+ }
+
+ sk = NULL;
+exit:
+ read_unlock_bh(&ping_table.lock);
+
+ return sk;
+}
+
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+ kgid_t *high)
+{
+ kgid_t *data = net->ipv4.ping_group_range.range;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&net->ipv4.ping_group_range.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq));
+}
+
+
+int ping_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ kgid_t group = current_egid();
+ struct group_info *group_info;
+ int i, j, count;
+ kgid_t low, high;
+ int ret = 0;
+
+ if (sk->sk_family == AF_INET6)
+ sk->sk_ipv6only = 1;
+
+ inet_get_ping_group_range_net(net, &low, &high);
+ if (gid_lte(low, group) && gid_lte(group, high))
+ return 0;
+
+ group_info = get_current_groups();
+ count = group_info->ngroups;
+ for (i = 0; i < group_info->nblocks; i++) {
+ int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
+ for (j = 0; j < cp_count; j++) {
+ kgid_t gid = group_info->blocks[i][j];
+ if (gid_lte(low, gid) && gid_lte(gid, high))
+ goto out_release_group;
+ }
+
+ count -= cp_count;
+ }
+
+ ret = -EACCES;
+
+out_release_group:
+ put_group_info(group_info);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ping_init_sock);
+
+void ping_close(struct sock *sk, long timeout)
+{
+ pr_debug("ping_close(sk=%p,sk->num=%u)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num);
+ pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+ sk_common_release(sk);
+}
+EXPORT_SYMBOL_GPL(ping_close);
+
+/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
+static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
+ struct sockaddr *uaddr, int addr_len) {
+ struct net *net = sock_net(sk);
+ if (sk->sk_family == AF_INET) {
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int chk_addr_ret;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin_family != AF_INET &&
+ !(addr->sin_family == AF_UNSPEC &&
+ addr->sin_addr.s_addr == htonl(INADDR_ANY)))
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
+ sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
+ chk_addr_ret = RTN_LOCAL;
+
+ if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
+ isk->freebind == 0 && isk->transparent == 0 &&
+ chk_addr_ret != RTN_LOCAL) ||
+ chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST)
+ return -EADDRNOTAVAIL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+ int addr_type, scoped, has_addr;
+ struct net_device *dev = NULL;
+
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n",
+ sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port));
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+ scoped = __ipv6_addr_needs_scope_id(addr_type);
+ if ((addr_type != IPV6_ADDR_ANY &&
+ !(addr_type & IPV6_ADDR_UNICAST)) ||
+ (scoped && !addr->sin6_scope_id))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (addr->sin6_scope_id) {
+ dev = dev_get_by_index_rcu(net, addr->sin6_scope_id);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
+ has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
+ scoped);
+ rcu_read_unlock();
+
+ if (!(isk->freebind || isk->transparent || has_addr ||
+ addr_type == IPV6_ADDR_ANY))
+ return -EADDRNOTAVAIL;
+
+ if (scoped)
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+#endif
+ } else {
+ return -EAFNOSUPPORT;
+ }
+ return 0;
+}
+
+static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+{
+ if (saddr->sa_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) saddr;
+ isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (saddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr;
+#endif
+ }
+}
+
+static void ping_clear_saddr(struct sock *sk, int dif)
+{
+ sk->sk_bound_dev_if = dif;
+ if (sk->sk_family == AF_INET) {
+ struct inet_sock *isk = inet_sk(sk);
+ isk->inet_rcv_saddr = isk->inet_saddr = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
+ memset(&np->saddr, 0, sizeof(np->saddr));
+#endif
+ }
+}
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ unsigned short snum;
+ int err;
+ int dif = sk->sk_bound_dev_if;
+
+ err = ping_check_bind_addr(sk, isk, uaddr, addr_len);
+ if (err)
+ return err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (isk->inet_num != 0)
+ goto out;
+
+ err = -EADDRINUSE;
+ ping_set_saddr(sk, uaddr);
+ snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
+ if (ping_get_port(sk, snum) != 0) {
+ ping_clear_saddr(sk, dif);
+ goto out;
+ }
+
+ pr_debug("after bind(): num = %d, dif = %d\n",
+ (int)isk->inet_num,
+ (int)sk->sk_bound_dev_if);
+
+ err = 0;
+ if (sk->sk_family == AF_INET && isk->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+#endif
+
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ isk->inet_sport = htons(isk->inet_num);
+ isk->inet_daddr = 0;
+ isk->inet_dport = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr));
+#endif
+
+ sk_dst_reset(sk);
+out:
+ release_sock(sk);
+ pr_debug("ping_v4_bind -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_bind);
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int family, int type, int code)
+{
+ return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+void ping_err(struct sk_buff *skb, int offset, u32 info)
+{
+ int family;
+ struct icmphdr *icmph;
+ struct inet_sock *inet_sock;
+ int type;
+ int code;
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ int harderr;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ family = AF_INET;
+ type = icmp_hdr(skb)->type;
+ code = icmp_hdr(skb)->code;
+ icmph = (struct icmphdr *)(skb->data + offset);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ family = AF_INET6;
+ type = icmp6_hdr(skb)->icmp6_type;
+ code = icmp6_hdr(skb)->icmp6_code;
+ icmph = (struct icmphdr *) (skb->data + offset);
+ } else {
+ BUG();
+ }
+
+ /* We assume the packet has already been checked by icmp_unreach */
+
+ if (!ping_supported(family, icmph->type, icmph->code))
+ return;
+
+ pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n",
+ skb->protocol, type, code, ntohs(icmph->un.echo.id),
+ ntohs(icmph->un.echo.sequence));
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (!sk) {
+ pr_debug("no socket, dropping\n");
+ return; /* No socket for error */
+ }
+ pr_debug("err on socket %p\n", sk);
+
+ err = 0;
+ harderr = 0;
+ inet_sock = inet_sk(sk);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ /* This is not a real error but ping wants to see it.
+ * Report it with some fake errno.
+ */
+ err = EREMOTEIO;
+ break;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ /* See ICMP_SOURCE_QUENCH */
+ ipv4_sk_redirect(skb, sk);
+ err = EREMOTEIO;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ harderr = pingv6_ops.icmpv6_err_convert(type, code, &err);
+#endif
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if ((family == AF_INET && !inet_sock->recverr) ||
+ (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
+ if (family == AF_INET) {
+ ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+ info, (u8 *)icmph);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ pingv6_ops.ipv6_icmp_error(sk, skb, err, 0,
+ info, (u8 *)icmph);
+#endif
+ }
+ }
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(ping_err);
+
+/*
+ * Copy and checksum an ICMP Echo packet from user space into a buffer
+ * starting from the payload.
+ */
+
+int ping_getfrag(void *from, char *to,
+ int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+ struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+
+ if (offset == 0) {
+ fraglen -= sizeof(struct icmphdr);
+ if (fraglen < 0)
+ BUG();
+ if (csum_and_copy_from_iter(to + sizeof(struct icmphdr),
+ fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter) != fraglen)
+ return -EFAULT;
+ } else if (offset < sizeof(struct icmphdr)) {
+ BUG();
+ } else {
+ if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter) != fraglen)
+ return -EFAULT;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* For IPv6, checksum each skb as we go along, as expected by
+ * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in
+ * wcheck, it will be finalized in ping_v4_push_pending_frames.
+ */
+ if (pfh->family == AF_INET6) {
+ skb->csum = pfh->wcheck;
+ skb->ip_summed = CHECKSUM_NONE;
+ pfh->wcheck = 0;
+ }
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_getfrag);
+
+static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+ struct flowi4 *fl4)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+ pfh->wcheck = csum_partial((char *)&pfh->icmph,
+ sizeof(struct icmphdr), pfh->wcheck);
+ pfh->icmph.checksum = csum_fold(pfh->wcheck);
+ memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+ skb->ip_summed = CHECKSUM_NONE;
+ return ip_push_pending_frames(sk, fl4);
+}
+
+int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
+ void *user_icmph, size_t icmph_len) {
+ u8 type, code;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /*
+ * Fetch the ICMP header provided by the userland.
+ * iovec is modified! The ICMP header is consumed.
+ */
+ if (memcpy_from_msg(user_icmph, msg, icmph_len))
+ return -EFAULT;
+
+ if (family == AF_INET) {
+ type = ((struct icmphdr *) user_icmph)->type;
+ code = ((struct icmphdr *) user_icmph)->code;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ type = ((struct icmp6hdr *) user_icmph)->icmp6_type;
+ code = ((struct icmp6hdr *) user_icmph)->icmp6_code;
+#endif
+ } else {
+ BUG();
+ }
+
+ if (!ping_supported(family, type, code))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct net *net = sock_net(sk);
+ struct flowi4 fl4;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct icmphdr user_icmph;
+ struct pingfakehdr pfh;
+ struct rtable *rt = NULL;
+ struct ip_options_data opt_copy;
+ int free = 0;
+ __be32 saddr, daddr, faddr;
+ u8 tos;
+ int err;
+
+ pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+ err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+ daddr = usin->sin_addr.s_addr;
+ /* no remote port */
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->inet_daddr;
+ /* no remote port */
+ }
+
+ ipc.addr = inet->inet_saddr;
+ ipc.opt = NULL;
+ ipc.oif = sk->sk_bound_dev_if;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ }
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ faddr = ipc.opt->opt.faddr;
+ }
+ tos = get_rttos(&ipc, inet);
+ if (sock_flag(sk, SOCK_LOCALROUTE) ||
+ (msg->msg_flags & MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->opt.is_strictroute)) {
+ tos |= RTO_ONLINK;
+ }
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (!ipc.addr)
+ ipc.addr = fl4.daddr;
+
+ lock_sock(sk);
+
+ pfh.icmph.type = user_icmph.type; /* already checked */
+ pfh.icmph.code = user_icmph.code; /* ditto */
+ pfh.icmph.checksum = 0;
+ pfh.icmph.un.echo.id = inet->inet_sport;
+ pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+ pfh.msg = msg;
+ pfh.wcheck = 0;
+ pfh.family = AF_INET;
+
+ err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+ 0, &ipc, &rt, msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else
+ err = ping_v4_push_pending_frames(sk, &pfh, &fl4);
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err) {
+ icmp_out_count(sock_net(sk), user_icmph.type);
+ return len;
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
+{
+ struct inet_sock *isk = inet_sk(sk);
+ int family = sk->sk_family;
+ struct sk_buff *skb;
+ int copied, err;
+
+ pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+ err = -EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE)
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ /* Don't bother checking the checksum */
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address and add cmsg data. */
+ if (family == AF_INET) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = 0 /* skb->h.uh->source */;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+
+ if (isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *ip6 = ipv6_hdr(skb);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ip6->saddr;
+ sin6->sin6_flowinfo = 0;
+ if (np->sndflow)
+ sin6->sin6_flowinfo = ip6_flowinfo(ip6);
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ *addr_len = sizeof(*sin6);
+ }
+
+ if (inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6) &&
+ inet6_sk(sk)->rxopt.all)
+ pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+#endif
+ } else {
+ BUG();
+ }
+
+ err = copied;
+
+done:
+ skb_free_datagram(sk, skb);
+out:
+ pr_debug("ping_recvmsg -> %d\n", err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ping_recvmsg);
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+ inet_sk(sk), inet_sk(sk)->inet_num, skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ pr_debug("ping_queue_rcv_skb -> failed\n");
+ return -1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+
+
+/*
+ * All we need to do is get the socket.
+ */
+
+bool ping_rcv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct net *net = dev_net(skb->dev);
+ struct icmphdr *icmph = icmp_hdr(skb);
+
+ /* We assume the packet has already been checked by icmp_rcv */
+
+ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+ skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+ /* Push ICMP header back */
+ skb_push(skb, skb->data - (u8 *)icmph);
+
+ sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
+ if (sk) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ pr_debug("rcv on socket %p\n", sk);
+ if (skb2)
+ ping_queue_rcv_skb(sk, skb2);
+ sock_put(sk);
+ return true;
+ }
+ pr_debug("no socket, dropping\n");
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(ping_rcv);
+
+struct proto ping_prot = {
+ .name = "PING",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .sendmsg = ping_v4_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = ping_hash,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .obj_size = sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+ struct sock *sk;
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+ ++state->bucket) {
+ struct hlist_nulls_node *node;
+ struct hlist_nulls_head *hslot;
+
+ hslot = &ping_table.hash[state->bucket];
+
+ if (hlist_nulls_empty(hslot))
+ continue;
+
+ sk_nulls_for_each(sk, node, hslot) {
+ if (net_eq(sock_net(sk), net) &&
+ sk->sk_family == state->family)
+ goto found;
+ }
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct ping_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ do {
+ sk = sk_nulls_next(sk);
+ } while (sk && (!net_eq(sock_net(sk), net)));
+
+ if (!sk)
+ return ping_get_first(seq, state->bucket + 1);
+ return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = ping_get_first(seq, 0);
+
+ if (sk)
+ while (pos && (sk = ping_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+{
+ struct ping_iter_state *state = seq->private;
+ state->bucket = 0;
+ state->family = family;
+
+ read_lock_bh(&ping_table.lock);
+
+ return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL_GPL(ping_seq_start);
+
+static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET);
+}
+
+void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = ping_get_idx(seq, 0);
+ else
+ sk = ping_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(ping_seq_next);
+
+void ping_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock_bh(&ping_table.lock);
+}
+EXPORT_SYMBOL_GPL(ping_seq_stop);
+
+static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
+
+static int ping_v4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct ping_iter_state *state = seq->private;
+
+ ping_v4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct seq_operations ping_v4_seq_ops = {
+ .show = ping_v4_seq_show,
+ .start = ping_v4_seq_start,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+};
+
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+ struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
+ return seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct ping_iter_state));
+}
+
+const struct file_operations ping_seq_fops = {
+ .open = ping_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+EXPORT_SYMBOL_GPL(ping_seq_fops);
+
+static struct ping_seq_afinfo ping_v4_seq_afinfo = {
+ .name = "icmp",
+ .family = AF_INET,
+ .seq_fops = &ping_seq_fops,
+ .seq_ops = {
+ .start = ping_v4_seq_start,
+ .show = ping_v4_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+ },
+};
+
+int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+ struct proc_dir_entry *p;
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ping_proc_register);
+
+void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL_GPL(ping_proc_unregister);
+
+static int __net_init ping_v4_proc_init_net(struct net *net)
+{
+ return ping_proc_register(net, &ping_v4_seq_afinfo);
+}
+
+static void __net_exit ping_v4_proc_exit_net(struct net *net)
+{
+ ping_proc_unregister(net, &ping_v4_seq_afinfo);
+}
+
+static struct pernet_operations ping_v4_net_ops = {
+ .init = ping_v4_proc_init_net,
+ .exit = ping_v4_proc_exit_net,
+};
+
+int __init ping_proc_init(void)
+{
+ return register_pernet_subsys(&ping_v4_net_ops);
+}
+
+void ping_proc_exit(void)
+{
+ unregister_pernet_subsys(&ping_v4_net_ops);
+}
+
+#endif
+
+void __init ping_init(void)
+{
+ int i;
+
+ for (i = 0; i < PING_HTABLE_SIZE; i++)
+ INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
+ rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 000000000..e1f3b911d
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,540 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * This file implements the various access functions for the
+ * PROC file system. It is mainly used for debugging and
+ * statistics.
+ *
+ * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ * Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ * Alan Cox : UDP sockets show the rxqueue/txqueue
+ * using hint flag for the netinfo.
+ * Pauline Middelink : identd support
+ * Alan Cox : Make /proc safer.
+ * Erik Schoenfelder : /proc/net/snmp
+ * Alan Cox : Handle dead sockets properly.
+ * Gerhard Koerting : Show both timers
+ * Alan Cox : Allow inode to be NULL (kernel socket)
+ * Andi Kleen : Add support for open_requests and
+ * split functions for more readibility.
+ * Andi Kleen : Add support for /proc/net/netstat
+ * Arnaldo C. Melo : Convert to seq_file
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/bottom_half.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+/*
+ * Report socket allocation statistics [mea@utu.fi]
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq->private;
+ unsigned int frag_mem;
+ int orphans, sockets;
+
+ local_bh_disable();
+ orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+ sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
+ local_bh_enable();
+
+ socket_seq_show(seq);
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+ sock_prot_inuse_get(net, &tcp_prot), orphans,
+ atomic_read(&tcp_death_row.tw_count), sockets,
+ proto_memory_allocated(&tcp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %ld\n",
+ sock_prot_inuse_get(net, &udp_prot),
+ proto_memory_allocated(&udp_prot));
+ seq_printf(seq, "UDPLITE: inuse %d\n",
+ sock_prot_inuse_get(net, &udplite_prot));
+ seq_printf(seq, "RAW: inuse %d\n",
+ sock_prot_inuse_get(net, &raw_prot));
+ frag_mem = ip_frag_mem(net);
+ seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+ return 0;
+}
+
+static int sockstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, sockstat_seq_show);
+}
+
+static const struct file_operations sockstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = sockstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+/* snmp items */
+static const struct snmp_mib snmp4_ipstats_list[] = {
+ SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
+ SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+ SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+ SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+ SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+ SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+ SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+ SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+ SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+ SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+ SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+ SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+ SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+ SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+ SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+ SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+ SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+ SNMP_MIB_SENTINEL
+};
+
+/* Following items are displayed in /proc/net/netstat */
+static const struct snmp_mib snmp4_ipextstats_list[] = {
+ SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
+ SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+ SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+ SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+ SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
+ SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
+ SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
+ SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
+ SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+ SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+ SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+ SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* Non RFC4293 fields */
+ SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct {
+ const char *name;
+ int index;
+} icmpmibmap[] = {
+ { "DestUnreachs", ICMP_DEST_UNREACH },
+ { "TimeExcds", ICMP_TIME_EXCEEDED },
+ { "ParmProbs", ICMP_PARAMETERPROB },
+ { "SrcQuenchs", ICMP_SOURCE_QUENCH },
+ { "Redirects", ICMP_REDIRECT },
+ { "Echos", ICMP_ECHO },
+ { "EchoReps", ICMP_ECHOREPLY },
+ { "Timestamps", ICMP_TIMESTAMP },
+ { "TimestampReps", ICMP_TIMESTAMPREPLY },
+ { "AddrMasks", ICMP_ADDRESS },
+ { "AddrMaskReps", ICMP_ADDRESSREPLY },
+ { NULL, 0 }
+};
+
+
+static const struct snmp_mib snmp4_tcp_list[] = {
+ SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+ SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+ SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+ SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
+ SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+ SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+ SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+ SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+ SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+ SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+ SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+ SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+ SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_udp_list[] = {
+ SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+ SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+ SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+ SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+ SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_net_list[] = {
+ SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+ SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+ SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+ SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+ SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+ SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+ SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+ SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+ SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+ SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+ SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+ SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+ SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+ SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+ SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+ SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+ SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+ SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+ SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+ SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
+ SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
+ SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
+ SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
+ SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+ SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
+ SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+ SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+ SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+ SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+ SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+ SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
+ SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+ SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+ SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+ SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+ SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+ SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+ SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+ SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+ SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+ SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+ SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+ SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+ SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
+ SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+ SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+ SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
+ SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
+ SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+ SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
+ SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+ SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+ SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+ SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+ SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+ SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+ SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+ SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+ SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+ SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+ SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+ SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+ SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
+ SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
+ SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+ SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+ SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+ SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+ SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+ SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+ SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+ SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+ SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
+ SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
+ SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
+ SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+ SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
+ SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
+ SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
+ SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
+ SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
+ SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+ SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+ SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+ SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
+ SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+ SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
+ SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
+ SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
+ SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
+ SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
+ SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT),
+ SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
+ SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
+ SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+ SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+ SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+ SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+ SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+ SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+ SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
+ SNMP_MIB_SENTINEL
+};
+
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+ unsigned short *type, int count)
+{
+ int j;
+
+ if (count) {
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %sType%u",
+ type[j] & 0x100 ? "Out" : "In",
+ type[j] & 0xff);
+ seq_puts(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %lu", vals[j]);
+ }
+}
+
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE 16
+
+ int i, count;
+ unsigned short type[PERLINE];
+ unsigned long vals[PERLINE], val;
+ struct net *net = seq->private;
+
+ count = 0;
+ for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+ val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
+ if (val) {
+ type[count] = i;
+ vals[count++] = val;
+ }
+ if (count == PERLINE) {
+ icmpmsg_put_line(seq, vals, type, count);
+ count = 0;
+ }
+ }
+ icmpmsg_put_line(seq, vals, type, count);
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+ int i;
+ struct net *net = seq->private;
+ atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
+
+ seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " In%s", icmpmibmap[i].name);
+ seq_puts(seq, " OutMsgs OutErrors");
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ seq_printf(seq, "\nIcmp: %lu %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + icmpmibmap[i].index));
+ seq_printf(seq, " %lu %lu",
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ for (i = 0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
+}
+
+/*
+ * Called from the PROCfs module. This outputs /proc/net/snmp.
+ */
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+ struct net *net = seq->private;
+
+ seq_puts(seq, "Ip: Forwarding DefaultTTL");
+
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+ seq_printf(seq, "\nIp: %d %d",
+ IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
+ sysctl_ip_default_ttl);
+
+ BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+ seq_puts(seq, "\nTcp:");
+ for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+ /* MaxConn field is signed, RFC 2012 */
+ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+ seq_printf(seq, " %ld",
+ snmp_fold_field(net->mib.tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ else
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ }
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.udp_statistics,
+ snmp4_udp_list[i].entry));
+
+ /* the UDP and UDP-Lite MIBs are the same */
+ seq_puts(seq, "\nUdpLite:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+ seq_puts(seq, "\nUdpLite:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.udplite_statistics,
+ snmp4_udp_list[i].entry));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int snmp_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, snmp_seq_show);
+}
+
+static const struct file_operations snmp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = snmp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+
+
+/*
+ * Output /proc/net/netstat
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+ struct net *net = seq->private;
+
+ seq_puts(seq, "TcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_net_list[i].name);
+
+ seq_puts(seq, "\nTcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
+
+ seq_puts(seq, "\nIpExt:");
+ for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
+
+ seq_puts(seq, "\nIpExt:");
+ for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int netstat_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, netstat_seq_show);
+}
+
+static const struct file_operations netstat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = netstat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static __net_init int ip_proc_init_net(struct net *net)
+{
+ if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+ &sockstat_seq_fops))
+ goto out_sockstat;
+ if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
+ goto out_netstat;
+ if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
+ goto out_snmp;
+
+ return 0;
+
+out_snmp:
+ remove_proc_entry("netstat", net->proc_net);
+out_netstat:
+ remove_proc_entry("sockstat", net->proc_net);
+out_sockstat:
+ return -ENOMEM;
+}
+
+static __net_exit void ip_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("snmp", net->proc_net);
+ remove_proc_entry("netstat", net->proc_net);
+ remove_proc_entry("sockstat", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations ip_proc_ops = {
+ .init = ip_proc_init_net,
+ .exit = ip_proc_exit_net,
+};
+
+int __init ip_misc_proc_init(void)
+{
+ return register_pernet_subsys(&ip_proc_ops);
+}
+
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 000000000..4b7c0ec65
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,79 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * INET protocol dispatch tables.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : Ahah! udp icmp errors don't work because
+ * udp_err is never called!
+ * Alan Cox : Added new fields for init and ready for
+ * proper fragmentation (_NO_ 4K limits!)
+ * Richard Colella : Hang on hash collision
+ * Vince Laviano : Modified inet_del_protocol() to correctly
+ * maintain copy bit.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_offloads);
+
+int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ if (!prot->netns_ok) {
+ pr_err("Protocol %u is not namespace aware, cannot register.\n",
+ protocol);
+ return -EINVAL;
+ }
+
+ return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_protocol);
+
+int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_add_offload);
+
+int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_protocol);
+
+int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_del_offload);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 000000000..561cd4b8f
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,1100 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * RAW - implementation of IP "raw" sockets.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() fixed up
+ * Alan Cox : ICMP error handling
+ * Alan Cox : EMSGSIZE if you send too big a packet
+ * Alan Cox : Now uses generic datagrams and shared
+ * skbuff library. No more peek crashes,
+ * no more backlogs
+ * Alan Cox : Checks sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram/skb_copy_datagram
+ * Alan Cox : Raw passes ip options too
+ * Alan Cox : Setsocketopt added
+ * Alan Cox : Fixed error return for broadcasts
+ * Alan Cox : Removed wake_up calls
+ * Alan Cox : Use ttl/tos
+ * Alan Cox : Cleaned up old debugging
+ * Alan Cox : Use new kernel side addresses
+ * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets.
+ * Alan Cox : BSD style RAW socket demultiplexing.
+ * Alan Cox : Beginnings of mrouted support.
+ * Alan Cox : Added IP_HDRINCL option.
+ * Alan Cox : Skip broadcast check if BSDism set.
+ * David S. Miller : New socket lookup architecture.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/igmp.h>
+#include <net/net_namespace.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/tcp_states.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {
+ struct msghdr *msg;
+ union {
+ struct icmphdr icmph;
+ char c[1];
+ } hdr;
+ int hlen;
+};
+
+static struct raw_hashinfo raw_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
+};
+
+void raw_hash_sk(struct sock *sk)
+{
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+ struct hlist_head *head;
+
+ head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+
+ write_lock_bh(&h->lock);
+ sk_add_node(sk, head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
+
+void raw_unhash_sk(struct sock *sk)
+{
+ struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
+
+ write_lock_bh(&h->lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
+
+static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+ unsigned short num, __be32 raddr, __be32 laddr, int dif)
+{
+ sk_for_each_from(sk) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ goto found; /* gotcha */
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct icmphdr _hdr;
+ const struct icmphdr *hdr;
+
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 1;
+
+ if (hdr->type < 32) {
+ __u32 data = raw_sk(sk)->filter.data;
+
+ return ((1U << hdr->type) & data) != 0;
+ }
+
+ /* Do not block unknown ICMP types */
+ return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
+{
+ struct sock *sk;
+ struct hlist_head *head;
+ int delivered = 0;
+ struct net *net;
+
+ read_lock(&raw_v4_hashinfo.lock);
+ head = &raw_v4_hashinfo.ht[hash];
+ if (hlist_empty(head))
+ goto out;
+
+ net = dev_net(skb->dev);
+ sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+
+ while (sk) {
+ delivered = 1;
+ if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+ ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+ skb->dev->ifindex)) {
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+ /* Not releasing hash table! */
+ if (clone)
+ raw_rcv(sk, clone);
+ }
+ sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+ }
+out:
+ read_unlock(&raw_v4_hashinfo.lock);
+ return delivered;
+}
+
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+ int hash;
+ struct sock *raw_sk;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
+ */
+ if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
+ raw_sk = NULL;
+
+ return raw_sk != NULL;
+
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ int err = 0;
+ int harderr = 0;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+ ipv4_sk_update_pmtu(skb, sk, info);
+ else if (type == ICMP_REDIRECT) {
+ ipv4_sk_redirect(skb, sk);
+ return;
+ }
+
+ /* Report error on raw socket, if:
+ 1. User requested ip_recverr.
+ 2. Socket is connected (otherwise the error indication
+ is useless without ip_recverr and error is hard.
+ */
+ if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ return;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ err = EHOSTUNREACH;
+ if (code > NR_ICMP_UNREACH)
+ break;
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
+ if (code == ICMP_FRAG_NEEDED) {
+ harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ err = EMSGSIZE;
+ }
+ }
+
+ if (inet->recverr) {
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ u8 *payload = skb->data + (iph->ihl << 2);
+
+ if (inet->hdrincl)
+ payload = skb->data;
+ ip_icmp_error(sk, skb, err, 0, info, payload);
+ }
+
+ if (inet->recverr || harderr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ }
+}
+
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+ int hash;
+ struct sock *raw_sk;
+ const struct iphdr *iph;
+ struct net *net;
+
+ hash = protocol & (RAW_HTABLE_SIZE - 1);
+
+ read_lock(&raw_v4_hashinfo.lock);
+ raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
+ if (raw_sk) {
+ iph = (const struct iphdr *)skb->data;
+ net = dev_net(skb->dev);
+
+ while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
+ iph->daddr, iph->saddr,
+ skb->dev->ifindex)) != NULL) {
+ raw_err(raw_sk, skb, info);
+ raw_sk = sk_next(raw_sk);
+ iph = (const struct iphdr *)skb->data;
+ }
+ }
+ read_unlock(&raw_v4_hashinfo.lock);
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ /* Charge it to the socket. */
+
+ ipv4_pktinfo_prepare(sk, skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return NET_RX_SUCCESS;
+}
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ nf_reset(skb);
+
+ skb_push(skb, skb->data - skb_network_header(skb));
+
+ raw_rcv_skb(sk, skb);
+ return 0;
+}
+
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+ struct msghdr *msg, size_t length,
+ struct rtable **rtp,
+ unsigned int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+ struct iphdr *iph;
+ struct sk_buff *skb;
+ unsigned int iphlen;
+ int err;
+ struct rtable *rt = *rtp;
+ int hlen, tlen;
+
+ if (length > rt->dst.dev->mtu) {
+ ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+ rt->dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ if (flags&MSG_PROBE)
+ goto out;
+
+ hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ tlen = rt->dst.dev->needed_tailroom;
+ skb = sock_alloc_send_skb(sk,
+ length + hlen + tlen + 15,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto error;
+ skb_reserve(skb, hlen);
+
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ skb_dst_set(skb, &rt->dst);
+ *rtp = NULL;
+
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ skb_put(skb, length);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
+ skb->transport_header = skb->network_header;
+ err = -EFAULT;
+ if (memcpy_from_msg(iph, msg, length))
+ goto error_free;
+
+ iphlen = iph->ihl * 4;
+
+ /*
+ * We don't want to modify the ip header, but we do need to
+ * be sure that it won't cause problems later along the network
+ * stack. Specifically we want to make sure that iph->ihl is a
+ * sane value. If ihl points beyond the length of the buffer passed
+ * in, reject the frame as invalid
+ */
+ err = -EINVAL;
+ if (iphlen > length)
+ goto error_free;
+
+ if (iphlen >= sizeof(*iph)) {
+ if (!iph->saddr)
+ iph->saddr = fl4->saddr;
+ iph->check = 0;
+ iph->tot_len = htons(length);
+ if (!iph->id)
+ ip_select_ident(net, skb, NULL);
+
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ }
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
+
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, rt->dst.dev, dst_output_sk);
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ goto error;
+out:
+ return 0;
+
+error_free:
+ kfree_skb(skb);
+error:
+ IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+ if (err == -ENOBUFS && !inet->recverr)
+ err = 0;
+ return err;
+}
+
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
+{
+ int err;
+
+ if (fl4->flowi4_proto != IPPROTO_ICMP)
+ return 0;
+
+ /* We only need the first two bytes. */
+ rfv->hlen = 2;
+
+ err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen);
+ if (err)
+ return err;
+
+ fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+ fl4->fl4_icmp_code = rfv->hdr.icmph.code;
+
+ return 0;
+}
+
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw_frag_vec *rfv = from;
+
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->hdr.c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->hdr.c + offset,
+ to, copy, 0),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
+ }
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ int free = 0;
+ __be32 daddr;
+ __be32 saddr;
+ u8 tos;
+ int err;
+ struct ip_options_data opt_copy;
+ struct raw_frag_vec rfv;
+
+ err = -EMSGSIZE;
+ if (len > 0xFFFF)
+ goto out;
+
+ /*
+ * Check the flags.
+ */
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */
+ goto out; /* compatibility */
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (msg->msg_namelen) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(*usin))
+ goto out;
+ if (usin->sin_family != AF_INET) {
+ pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n",
+ __func__, current->comm);
+ err = -EAFNOSUPPORT;
+ if (usin->sin_family)
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ /* ANK: I did not forget to get protocol from port field.
+ * I just do not know, who uses this weirdness.
+ * IP_HDRINCL is much more convenient.
+ */
+ } else {
+ err = -EDESTADDRREQ;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ daddr = inet->inet_daddr;
+ }
+
+ ipc.addr = inet->inet_saddr;
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+ ipc.oif = sk->sk_bound_dev_if;
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ if (err)
+ goto out;
+ if (ipc.opt)
+ free = 1;
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = daddr;
+
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ if (ipc.opt) {
+ err = -EINVAL;
+ /* Linux does not mangle headers on raw sockets,
+ * so that IP options + IP_HDRINCL is non-sense.
+ */
+ if (inet->hdrincl)
+ goto done;
+ if (ipc.opt->opt.srr) {
+ if (!daddr)
+ goto done;
+ daddr = ipc.opt->opt.faddr;
+ }
+ }
+ tos = get_rtconn_flags(&ipc, sk);
+ if (msg->msg_flags & MSG_DONTROUTE)
+ tos |= RTO_ONLINK;
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk) |
+ (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ daddr, saddr, 0, 0);
+
+ if (!inet->hdrincl) {
+ rfv.msg = msg;
+ rfv.hlen = 0;
+
+ err = raw_probe_proto_opt(&rfv, &fl4);
+ if (err)
+ goto done;
+ }
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto done;
+ }
+
+ err = -EACCES;
+ if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+ goto done;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ if (inet->hdrincl)
+ err = raw_send_hdrinc(sk, &fl4, msg, len,
+ &rt, msg->msg_flags);
+
+ else {
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (!ipc.addr)
+ ipc.addr = fl4.daddr;
+ lock_sock(sk);
+ err = ip_append_data(sk, &fl4, raw_getfrag,
+ &rfv, len, 0,
+ &ipc, &rt, msg->msg_flags);
+ if (err)
+ ip_flush_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE)) {
+ err = ip_push_pending_frames(sk, &fl4);
+ if (err == -ENOBUFS && !inet->recverr)
+ err = 0;
+ }
+ release_sock(sk);
+ }
+done:
+ if (free)
+ kfree(ipc.opt);
+ ip_rt_put(rt);
+
+out:
+ if (err < 0)
+ return err;
+ return len;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+ /*
+ * Raw sockets may have direct kernel references. Kill them.
+ */
+ ip_ra_control(sk, 0, NULL);
+
+ sk_common_release(sk);
+}
+
+static void raw_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip_flush_pending_frames(sk);
+ release_sock(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ int ret = -EINVAL;
+ int chk_addr_ret;
+
+ if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
+ goto out;
+ chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+ ret = -EADDRNOTAVAIL;
+ if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ goto out;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+ sk_dst_reset(sk);
+ ret = 0;
+out: return ret;
+}
+
+/*
+ * This should be easy, if there is something there
+ * we return it, otherwise we block.
+ */
+
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (flags & MSG_ERRQUEUE) {
+ err = ip_recv_error(sk, msg, len, addr_len);
+ goto out;
+ }
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ if (err)
+ return err;
+ return copied;
+}
+
+static int raw_init(struct sock *sk)
+{
+ struct raw_sock *rp = raw_sk(sk);
+
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
+ memset(&rp->filter, 0, sizeof(rp->filter));
+ return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+{
+ if (optlen > sizeof(struct icmp_filter))
+ optlen = sizeof(struct icmp_filter);
+ if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+ return -EFAULT;
+ return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{
+ int len, ret = -EFAULT;
+
+ if (get_user(len, optlen))
+ goto out;
+ ret = -EINVAL;
+ if (len < 0)
+ goto out;
+ if (len > sizeof(struct icmp_filter))
+ len = sizeof(struct icmp_filter);
+ ret = -EFAULT;
+ if (put_user(len, optlen) ||
+ copy_to_user(optval, &raw_sk(sk)->filter, len))
+ goto out;
+ ret = 0;
+out: return ret;
+}
+
+static int do_raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_seticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_RAW)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_RAW)
+ return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (optname == ICMP_FILTER) {
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+ return -EOPNOTSUPP;
+ else
+ return raw_geticmpfilter(sk, optval, optlen);
+ }
+ return -ENOPROTOOPT;
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_RAW)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_RAW)
+ return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ: {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb->len;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IP_MROUTE
+ return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
+struct proto raw_prot = {
+ .name = "RAW",
+ .owner = THIS_MODULE,
+ .close = raw_close,
+ .destroy = raw_destroy,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = raw_ioctl,
+ .init = raw_init,
+ .setsockopt = raw_setsockopt,
+ .getsockopt = raw_getsockopt,
+ .sendmsg = raw_sendmsg,
+ .recvmsg = raw_recvmsg,
+ .bind = raw_bind,
+ .backlog_rcv = raw_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = raw_hash_sk,
+ .unhash = raw_unhash_sk,
+ .obj_size = sizeof(struct raw_sock),
+ .h.raw_hash = &raw_v4_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_raw_setsockopt,
+ .compat_getsockopt = compat_raw_getsockopt,
+ .compat_ioctl = compat_raw_ioctl,
+#endif
+};
+
+#ifdef CONFIG_PROC_FS
+static struct sock *raw_get_first(struct seq_file *seq)
+{
+ struct sock *sk;
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+ ++state->bucket) {
+ sk_for_each(sk, &state->h->ht[state->bucket])
+ if (sock_net(sk) == seq_file_net(seq))
+ goto found;
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ do {
+ sk = sk_next(sk);
+try_again:
+ ;
+ } while (sk && sock_net(sk) != seq_file_net(seq));
+
+ if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
+ sk = sk_head(&state->h->ht[state->bucket]);
+ goto try_again;
+ }
+ return sk;
+}
+
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = raw_get_first(seq);
+
+ if (sk)
+ while (pos && (sk = raw_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_lock(&state->h->lock);
+ return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+EXPORT_SYMBOL_GPL(raw_seq_start);
+
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = raw_get_first(seq);
+ else
+ sk = raw_get_next(seq, v);
+ ++*pos;
+ return sk;
+}
+EXPORT_SYMBOL_GPL(raw_seq_next);
+
+void raw_seq_stop(struct seq_file *seq, void *v)
+{
+ struct raw_iter_state *state = raw_seq_private(seq);
+
+ read_unlock(&state->h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
+
+static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr,
+ src = inet->inet_rcv_saddr;
+ __u16 destp = 0,
+ srcp = inet->inet_num;
+
+ seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
+ i, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+}
+
+static int raw_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops\n");
+ else
+ raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
+ return 0;
+}
+
+static const struct seq_operations raw_seq_ops = {
+ .start = raw_seq_start,
+ .next = raw_seq_next,
+ .stop = raw_seq_stop,
+ .show = raw_seq_show,
+};
+
+int raw_seq_open(struct inode *ino, struct file *file,
+ struct raw_hashinfo *h, const struct seq_operations *ops)
+{
+ int err;
+ struct raw_iter_state *i;
+
+ err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state));
+ if (err < 0)
+ return err;
+
+ i = raw_seq_private((struct seq_file *)file->private_data);
+ i->h = h;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(raw_seq_open);
+
+static int raw_v4_seq_open(struct inode *inode, struct file *file)
+{
+ return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops);
+}
+
+static const struct file_operations raw_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = raw_v4_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static __net_init int raw_init_net(struct net *net)
+{
+ if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void raw_exit_net(struct net *net)
+{
+ remove_proc_entry("raw", net->proc_net);
+}
+
+static __net_initdata struct pernet_operations raw_net_ops = {
+ .init = raw_init_net,
+ .exit = raw_exit_net,
+};
+
+int __init raw_proc_init(void)
+{
+ return register_pernet_subsys(&raw_net_ops);
+}
+
+void __init raw_proc_exit(void)
+{
+ unregister_pernet_subsys(&raw_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 000000000..f45f2a12f
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,2800 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * ROUTE - implementation of the IP router.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ * Alan Cox : Verify area fixes.
+ * Alan Cox : cli() protects routing changes
+ * Rui Oliveira : ICMP routing table updates
+ * (rco@di.uminho.pt) Routing table insertion and update
+ * Linus Torvalds : Rewrote bits to be sensible
+ * Alan Cox : Added BSD route gw semantics
+ * Alan Cox : Super /proc >4K
+ * Alan Cox : MTU in route table
+ * Alan Cox : MSS actually. Also added the window
+ * clamper.
+ * Sam Lantinga : Fixed route matching in rt_del()
+ * Alan Cox : Routing cache support.
+ * Alan Cox : Removed compatibility cruft.
+ * Alan Cox : RTF_REJECT support.
+ * Alan Cox : TCP irtt support.
+ * Jonathan Naylor : Added Metric support.
+ * Miquel van Smoorenburg : BSD API fixes.
+ * Miquel van Smoorenburg : Metrics.
+ * Alan Cox : Use __u32 properly
+ * Alan Cox : Aligned routing errors more closely with BSD
+ * our system is still very different.
+ * Alan Cox : Faster /proc handling
+ * Alexey Kuznetsov : Massive rework to support tree based routing,
+ * routing caches and better behaviour.
+ *
+ * Olaf Erb : irtt wasn't being copied right.
+ * Bjorn Ekwall : Kerneld route support.
+ * Alan Cox : Multicast fixed (I hope)
+ * Pavel Krauz : Limited broadcast fixed
+ * Mike McLagan : Routing by source
+ * Alexey Kuznetsov : End of old history. Split to fib.c and
+ * route.c and rewritten from scratch.
+ * Andi Kleen : Load-limit warning messages.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
+ * Robert Olsson : Added rt_cache statistics
+ * Arnaldo C. Melo : Convert proc stuff to seq_file
+ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/netevent.h>
+#include <net/rtnetlink.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#include <linux/kmemleak.h>
+#endif
+#include <net/secure_seq.h>
+
+#define RT_FL_TOS(oldflp4) \
+ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+static int ip_rt_max_size;
+static int ip_rt_redirect_number __read_mostly = 9;
+static int ip_rt_redirect_load __read_mostly = HZ / 50;
+static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost __read_mostly = HZ;
+static int ip_rt_error_burst __read_mostly = 5 * HZ;
+static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
+static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
+static int ip_rt_min_advmss __read_mostly = 256;
+
+/*
+ * Interface to generic destination cache.
+ */
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int ipv4_mtu(const struct dst_entry *dst);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void ipv4_link_failure(struct sk_buff *skb);
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu);
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static void ipv4_dst_destroy(struct dst_entry *dst);
+
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ WARN_ON(1);
+ return NULL;
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr);
+
+static struct dst_ops ipv4_dst_ops = {
+ .family = AF_INET,
+ .check = ipv4_dst_check,
+ .default_advmss = ipv4_default_advmss,
+ .mtu = ipv4_mtu,
+ .cow_metrics = ipv4_cow_metrics,
+ .destroy = ipv4_dst_destroy,
+ .negative_advice = ipv4_negative_advice,
+ .link_failure = ipv4_link_failure,
+ .update_pmtu = ip_rt_update_pmtu,
+ .redirect = ip_do_redirect,
+ .local_out = __ip_local_out,
+ .neigh_lookup = ipv4_neigh_lookup,
+};
+
+#define ECN_OR_COST(class) TC_PRIO_##class
+
+const __u8 ip_tos2prio[16] = {
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(BESTEFFORT),
+ TC_PRIO_BESTEFFORT,
+ ECN_OR_COST(BESTEFFORT),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_BULK,
+ ECN_OR_COST(BULK),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE,
+ ECN_OR_COST(INTERACTIVE),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK),
+ TC_PRIO_INTERACTIVE_BULK,
+ ECN_OR_COST(INTERACTIVE_BULK)
+};
+EXPORT_SYMBOL(ip_tos2prio);
+
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+
+#ifdef CONFIG_PROC_FS
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+ return SEQ_START_TOKEN;
+}
+
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%-127s\n",
+ "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+ "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+ "HHUptod\tSpecDst");
+ return 0;
+}
+
+static const struct seq_operations rt_cache_seq_ops = {
+ .start = rt_cache_seq_start,
+ .next = rt_cache_seq_next,
+ .stop = rt_cache_seq_stop,
+ .show = rt_cache_seq_show,
+};
+
+static int rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rt_cache_seq_ops);
+}
+
+static const struct file_operations rt_cache_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cache_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(rt_cache_stat, cpu);
+ }
+ return NULL;
+}
+
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu+1;
+ return &per_cpu(rt_cache_stat, cpu);
+ }
+ return NULL;
+
+}
+
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct rt_cache_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ return 0;
+ }
+
+ seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
+ " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+ dst_entries_get_slow(&ipv4_dst_ops),
+ 0, /* st->in_hit */
+ st->in_slow_tot,
+ st->in_slow_mc,
+ st->in_no_route,
+ st->in_brd,
+ st->in_martian_dst,
+ st->in_martian_src,
+
+ 0, /* st->out_hit */
+ st->out_slow_tot,
+ st->out_slow_mc,
+
+ 0, /* st->gc_total */
+ 0, /* st->gc_ignored */
+ 0, /* st->gc_goal_miss */
+ 0, /* st->gc_dst_overflow */
+ 0, /* st->in_hlist_search */
+ 0 /* st->out_hlist_search */
+ );
+ return 0;
+}
+
+static const struct seq_operations rt_cpu_seq_ops = {
+ .start = rt_cpu_seq_start,
+ .next = rt_cpu_seq_next,
+ .stop = rt_cpu_seq_stop,
+ .show = rt_cpu_seq_show,
+};
+
+
+static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rt_cpu_seq_ops);
+}
+
+static const struct file_operations rt_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+ struct ip_rt_acct *dst, *src;
+ unsigned int i, j;
+
+ dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+ for (j = 0; j < 256; j++) {
+ dst[j].o_bytes += src[j].o_bytes;
+ dst[j].o_packets += src[j].o_packets;
+ dst[j].i_bytes += src[j].i_bytes;
+ dst[j].i_packets += src[j].i_packets;
+ }
+ }
+
+ seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+ kfree(dst);
+ return 0;
+}
+
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rt_acct_proc_show, NULL);
+}
+
+static const struct file_operations rt_acct_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_acct_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static int __net_init ip_rt_do_proc_init(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+ &rt_cache_seq_fops);
+ if (!pde)
+ goto err1;
+
+ pde = proc_create("rt_cache", S_IRUGO,
+ net->proc_net_stat, &rt_cpu_seq_fops);
+ if (!pde)
+ goto err2;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+ if (!pde)
+ goto err3;
+#endif
+ return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+err3:
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+ remove_proc_entry("rt_cache", net->proc_net);
+err1:
+ return -ENOMEM;
+}
+
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
+{
+ remove_proc_entry("rt_cache", net->proc_net_stat);
+ remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ remove_proc_entry("rt_acct", net->proc_net);
+#endif
+}
+
+static struct pernet_operations ip_rt_proc_ops __net_initdata = {
+ .init = ip_rt_do_proc_init,
+ .exit = ip_rt_do_proc_exit,
+};
+
+static int __init ip_rt_proc_init(void)
+{
+ return register_pernet_subsys(&ip_rt_proc_ops);
+}
+
+#else
+static inline int ip_rt_proc_init(void)
+{
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+static inline bool rt_is_expired(const struct rtable *rth)
+{
+ return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+}
+
+void rt_cache_flush(struct net *net)
+{
+ rt_genid_bump_ipv4(net);
+}
+
+static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ struct net_device *dev = dst->dev;
+ const __be32 *pkey = daddr;
+ const struct rtable *rt;
+ struct neighbour *n;
+
+ rt = (const struct rtable *) dst;
+ if (rt->rt_gateway)
+ pkey = (const __be32 *) &rt->rt_gateway;
+ else if (skb)
+ pkey = &ip_hdr(skb)->daddr;
+
+ n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
+ if (n)
+ return n;
+ return neigh_create(&arp_tbl, pkey, dev);
+}
+
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+ atomic_t id;
+ u32 stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes hard for an attacker
+ * to infer how many packets were sent between two points in time.
+ */
+u32 ip_idents_reserve(u32 hash, int segs)
+{
+ struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+ u32 old = ACCESS_ONCE(bucket->stamp32);
+ u32 now = (u32)jiffies;
+ u32 delta = 0;
+
+ if (old != now && cmpxchg(&bucket->stamp32, old, now) == old)
+ delta = prandom_u32_max(now - old);
+
+ return atomic_add_return(segs + delta, &bucket->id) - segs;
+}
+EXPORT_SYMBOL(ip_idents_reserve);
+
+void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
+{
+ static u32 ip_idents_hashrnd __read_mostly;
+ u32 hash, id;
+
+ net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+
+ hash = jhash_3words((__force u32)iph->daddr,
+ (__force u32)iph->saddr,
+ iph->protocol ^ net_hash_mix(net),
+ ip_idents_hashrnd);
+ id = ip_idents_reserve(hash, segs);
+ iph->id = htons(id);
+}
+EXPORT_SYMBOL(__ip_select_ident);
+
+static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct iphdr *iph,
+ int oif, u8 tos,
+ u8 prot, u32 mark, int flow_flags)
+{
+ if (sk) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ oif = sk->sk_bound_dev_if;
+ mark = sk->sk_mark;
+ tos = RT_CONN_FLAGS(sk);
+ prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+ }
+ flowi4_init_output(fl4, oif, mark, tos,
+ RT_SCOPE_UNIVERSE, prot,
+ flow_flags,
+ iph->daddr, iph->saddr, 0, 0);
+}
+
+static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
+ const struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+
+ __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+}
+
+static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ip_options_rcu *inet_opt;
+ __be32 daddr = inet->inet_daddr;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ daddr, inet->inet_saddr, 0, 0);
+ rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (skb)
+ build_skb_flow_key(fl4, skb, sk);
+ else
+ build_sk_flow_key(fl4, sk);
+}
+
+static inline void rt_free(struct rtable *rt)
+{
+ call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
+
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+ struct rtable *rt;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ rt_free(rt);
+ }
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ rt_free(rt);
+ }
+}
+
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+{
+ struct fib_nh_exception *fnhe, *oldest;
+
+ oldest = rcu_dereference(hash->chain);
+ for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ oldest = fnhe;
+ }
+ fnhe_flush_routes(oldest);
+ return oldest;
+}
+
+static inline u32 fnhe_hashfun(__be32 daddr)
+{
+ static u32 fnhe_hashrnd __read_mostly;
+ u32 hval;
+
+ net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
+ hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
+ return hash_32(hval, FNHE_HASH_SHIFT);
+}
+
+static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
+{
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ rt->dst.expires = fnhe->fnhe_expires;
+
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = fnhe->fnhe_gw;
+ rt->rt_uses_gateway = 1;
+ }
+}
+
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+ u32 pmtu, unsigned long expires)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe;
+ struct rtable *rt;
+ unsigned int i;
+ int depth;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference(nh->nh_exceptions);
+ if (!hash) {
+ hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
+ if (!hash)
+ goto out_unlock;
+ rcu_assign_pointer(nh->nh_exceptions, hash);
+ }
+
+ hash += hval;
+
+ depth = 0;
+ for (fnhe = rcu_dereference(hash->chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ break;
+ depth++;
+ }
+
+ if (fnhe) {
+ if (gw)
+ fnhe->fnhe_gw = gw;
+ if (pmtu) {
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = max(1UL, expires);
+ }
+ /* Update all cached dsts too */
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ } else {
+ if (depth > FNHE_RECLAIM_DEPTH)
+ fnhe = fnhe_oldest(hash);
+ else {
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+ rcu_assign_pointer(hash->chain, fnhe);
+ }
+ fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
+ fnhe->fnhe_daddr = daddr;
+ fnhe->fnhe_gw = gw;
+ fnhe->fnhe_pmtu = pmtu;
+ fnhe->fnhe_expires = expires;
+
+ /* Exception created; mark the cached routes for the nexthop
+ * stale, so anyone caching it rechecks if this exception
+ * applies to them.
+ */
+ rt = rcu_dereference(nh->nh_rth_input);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+
+ for_each_possible_cpu(i) {
+ struct rtable __rcu **prt;
+ prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+ rt = rcu_dereference(*prt);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ }
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+
+out_unlock:
+ spin_unlock_bh(&fnhe_lock);
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+ bool kill_route)
+{
+ __be32 new_gw = icmp_hdr(skb)->un.gateway;
+ __be32 old_gw = ip_hdr(skb)->saddr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ struct fib_result res;
+ struct neighbour *n;
+ struct net *net;
+
+ switch (icmp_hdr(skb)->code & 7) {
+ case ICMP_REDIR_NET:
+ case ICMP_REDIR_NETTOS:
+ case ICMP_REDIR_HOST:
+ case ICMP_REDIR_HOSTTOS:
+ break;
+
+ default:
+ return;
+ }
+
+ if (rt->rt_gateway != old_gw)
+ return;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return;
+
+ net = dev_net(dev);
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+ ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+ ipv4_is_zeronet(new_gw))
+ goto reject_redirect;
+
+ if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+ goto reject_redirect;
+ } else {
+ if (inet_addr_type(net, new_gw) != RTN_UNICAST)
+ goto reject_redirect;
+ }
+
+ n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
+ if (!IS_ERR(n)) {
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ } else {
+ if (fib_lookup(net, fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ 0, 0);
+ }
+ if (kill_route)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+ }
+ neigh_release(n);
+ }
+ return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev)) {
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ __be32 daddr = iph->daddr;
+ __be32 saddr = iph->saddr;
+
+ net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
+ " Advised path = %pI4 -> %pI4\n",
+ &old_gw, dev->name, &new_gw,
+ &saddr, &daddr);
+ }
+#endif
+ ;
+}
+
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ int oif = skb->dev->ifindex;
+ u8 tos = RT_TOS(iph->tos);
+ u8 prot = iph->protocol;
+ u32 mark = skb->mark;
+
+ rt = (struct rtable *) dst;
+
+ __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
+ __ip_do_redirect(rt, skb, &fl4, true);
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *)dst;
+ struct dst_entry *ret = dst;
+
+ if (rt) {
+ if (dst->obsolete > 0) {
+ ip_rt_put(rt);
+ ret = NULL;
+ } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires) {
+ ip_rt_put(rt);
+ ret = NULL;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Algorithm:
+ * 1. The first ip_rt_redirect_number redirects are sent
+ * with exponential backoff, then we stop sending them at all,
+ * assuming that the host ignores our redirects.
+ * 2. If we did not see packets requiring redirects
+ * during ip_rt_redirect_silence, we assume that the host
+ * forgot redirected route and start to send redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct inet_peer *peer;
+ struct net *net;
+ int log_martians;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+ rcu_read_unlock();
+ return;
+ }
+ log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+ rcu_read_unlock();
+
+ net = dev_net(rt->dst.dev);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+ rt_nexthop(rt, ip_hdr(skb)->daddr));
+ return;
+ }
+
+ /* No redirected packets during ip_rt_redirect_silence;
+ * reset the algorithm.
+ */
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
+
+ /* Too many ignored redirects; do not send anything
+ * set dst.rate_last to the last seen redirected packet.
+ */
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
+ goto out_put_peer;
+ }
+
+ /* Check for load limit; set rate_last to the latest sent
+ * redirect.
+ */
+ if (peer->rate_tokens == 0 ||
+ time_after(jiffies,
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
+ __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (log_martians &&
+ peer->rate_tokens == ip_rt_redirect_number)
+ net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
+ &ip_hdr(skb)->saddr, inet_iif(skb),
+ &ip_hdr(skb)->daddr, &gw);
+#endif
+ }
+out_put_peer:
+ inet_putpeer(peer);
+}
+
+static int ip_error(struct sk_buff *skb)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
+ unsigned long now;
+ struct net *net;
+ bool send;
+ int code;
+
+ /* IP on this device is disabled. */
+ if (!in_dev)
+ goto out;
+
+ net = dev_net(rt->dst.dev);
+ if (!IN_DEV_FORWARD(in_dev)) {
+ switch (rt->dst.error) {
+ case EHOSTUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+ break;
+
+ case ENETUNREACH:
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ }
+ goto out;
+ }
+
+ switch (rt->dst.error) {
+ case EINVAL:
+ default:
+ goto out;
+ case EHOSTUNREACH:
+ code = ICMP_HOST_UNREACH;
+ break;
+ case ENETUNREACH:
+ code = ICMP_NET_UNREACH;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ break;
+ case EACCES:
+ code = ICMP_PKT_FILTERED;
+ break;
+ }
+
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
+ inet_putpeer(peer);
+ }
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+
+out: kfree_skb(skb);
+ return 0;
+}
+
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+{
+ struct dst_entry *dst = &rt->dst;
+ struct fib_result res;
+
+ if (dst_metric_locked(dst, RTAX_MTU))
+ return;
+
+ if (ipv4_mtu(dst) < mtu)
+ return;
+
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
+
+ if (rt->rt_pmtu == mtu &&
+ time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
+ return;
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+ struct fib_nh *nh = &FIB_RES_NH(res);
+
+ update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+ jiffies + ip_rt_mtu_expires);
+ }
+ rcu_read_unlock();
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct flowi4 fl4;
+
+ ip_rt_build_flow_key(&fl4, sk, skb);
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
+void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
+ int oif, u32 mark, u8 protocol, int flow_flags)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ if (!mark)
+ mark = IP4_REPLY_MARK(net, skb->mark);
+
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
+
+static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ if (!fl4.flowi4_mark)
+ fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
+
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_rt_update_pmtu(rt, &fl4, mtu);
+ ip_rt_put(rt);
+ }
+}
+
+void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ struct dst_entry *odst = NULL;
+ bool new = false;
+
+ bh_lock_sock(sk);
+
+ if (!ip_sk_accept_pmtu(sk))
+ goto out;
+
+ odst = sk_dst_get(sk);
+
+ if (sock_owned_by_user(sk) || !odst) {
+ __ipv4_sk_update_pmtu(skb, sk, mtu);
+ goto out;
+ }
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+
+ rt = (struct rtable *)odst;
+ if (odst->obsolete && !odst->ops->check(odst, 0)) {
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
+
+ if (!dst_check(&rt->dst, 0)) {
+ if (new)
+ dst_release(&rt->dst);
+
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ if (IS_ERR(rt))
+ goto out;
+
+ new = true;
+ }
+
+ if (new)
+ sk_dst_set(sk, &rt->dst);
+
+out:
+ bh_unlock_sock(sk);
+ dst_release(odst);
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
+
+void ipv4_redirect(struct sk_buff *skb, struct net *net,
+ int oif, u32 mark, u8 protocol, int flow_flags)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, NULL, iph, oif,
+ RT_TOS(iph->tos), protocol, mark, flow_flags);
+ rt = __ip_route_output_key(net, &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_redirect);
+
+void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ const struct iphdr *iph = (const struct iphdr *) skb->data;
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+ rt = __ip_route_output_key(sock_net(sk), &fl4);
+ if (!IS_ERR(rt)) {
+ __ip_do_redirect(rt, skb, &fl4, false);
+ ip_rt_put(rt);
+ }
+}
+EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ /* All IPV4 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ *
+ * When a PMTU/redirect information update invalidates a route,
+ * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
+ * DST_OBSOLETE_DEAD by dst_free().
+ */
+ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+ return NULL;
+ return dst;
+}
+
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+ struct rtable *rt;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = skb_rtable(skb);
+ if (rt)
+ dst_set_expires(&rt->dst, 0);
+}
+
+static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+{
+ pr_debug("%s: %pI4 -> %pI4, %s\n",
+ __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ skb->dev ? skb->dev->name : "?");
+ kfree_skb(skb);
+ WARN_ON(1);
+ return 0;
+}
+
+/*
+ We do not cache source address of outgoing interface,
+ because it is used only by IP RR, TS and SRR options,
+ so that it out of fast path.
+
+ BTW remember: "addr" is allowed to be not aligned
+ in IP options!
+ */
+
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
+{
+ __be32 src;
+
+ if (rt_is_output_route(rt))
+ src = ip_hdr(skb)->saddr;
+ else {
+ struct fib_result res;
+ struct flowi4 fl4;
+ struct iphdr *iph;
+
+ iph = ip_hdr(skb);
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_oif = rt->dst.dev->ifindex;
+ fl4.flowi4_iif = skb->dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+ src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+ else
+ src = inet_select_addr(rt->dst.dev,
+ rt_nexthop(rt, iph->daddr),
+ RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
+ memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->dst.tclassid & 0xFFFF))
+ rt->dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->dst.tclassid & 0xFFFF0000))
+ rt->dst.tclassid |= tag & 0xFFFF0000;
+}
+#endif
+
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+ unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+ if (advmss == 0) {
+ advmss = max_t(unsigned int, dst->dev->mtu - 40,
+ ip_rt_min_advmss);
+ if (advmss > 65535 - 40)
+ advmss = 65535 - 40;
+ }
+ return advmss;
+}
+
+static unsigned int ipv4_mtu(const struct dst_entry *dst)
+{
+ const struct rtable *rt = (const struct rtable *) dst;
+ unsigned int mtu = rt->rt_pmtu;
+
+ if (!mtu || time_after_eq(jiffies, rt->dst.expires))
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ return mtu;
+
+ mtu = dst->dev->mtu;
+
+ if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+ if (rt->rt_uses_gateway && mtu > 576)
+ mtu = 576;
+ }
+
+ return min_t(unsigned int, mtu, IP_MAX_MTU);
+}
+
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
+ struct fib_nh_exception *fnhe;
+ u32 hval;
+
+ if (!hash)
+ return NULL;
+
+ hval = fnhe_hashfun(daddr);
+
+ for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ if (fnhe->fnhe_daddr == daddr)
+ return fnhe;
+ }
+ return NULL;
+}
+
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ __be32 daddr)
+{
+ bool ret = false;
+
+ spin_lock_bh(&fnhe_lock);
+
+ if (daddr == fnhe->fnhe_daddr) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
+ int genid = fnhe_genid(dev_net(rt->dst.dev));
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
+
+ if (fnhe->fnhe_genid != genid) {
+ fnhe->fnhe_genid = genid;
+ fnhe->fnhe_gw = 0;
+ fnhe->fnhe_pmtu = 0;
+ fnhe->fnhe_expires = 0;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
+ }
+ fill_route_from_fnhe(rt, fnhe);
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+
+ if (!(rt->dst.flags & DST_NOCACHE)) {
+ rcu_assign_pointer(*porig, rt);
+ if (orig)
+ rt_free(orig);
+ ret = true;
+ }
+
+ fnhe->fnhe_stamp = jiffies;
+ }
+ spin_unlock_bh(&fnhe_lock);
+
+ return ret;
+}
+
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+ struct rtable *orig, *prev, **p;
+ bool ret = true;
+
+ if (rt_is_input_route(rt)) {
+ p = (struct rtable **)&nh->nh_rth_input;
+ } else {
+ p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ orig = *p;
+
+ prev = cmpxchg(p, orig, rt);
+ if (prev == orig) {
+ if (orig)
+ rt_free(orig);
+ } else
+ ret = false;
+
+ return ret;
+}
+
+struct uncached_list {
+ spinlock_t lock;
+ struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+ struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
+
+ rt->rt_uncached_list = ul;
+
+ spin_lock_bh(&ul->lock);
+ list_add_tail(&rt->rt_uncached, &ul->head);
+ spin_unlock_bh(&ul->lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (!list_empty(&rt->rt_uncached)) {
+ struct uncached_list *ul = rt->rt_uncached_list;
+
+ spin_lock_bh(&ul->lock);
+ list_del(&rt->rt_uncached);
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct rtable *rt;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ spin_lock_bh(&ul->lock);
+ list_for_each_entry(rt, &ul->head, rt_uncached) {
+ if (rt->dst.dev != dev)
+ continue;
+ rt->dst.dev = net->loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(dev);
+ }
+ spin_unlock_bh(&ul->lock);
+ }
+}
+
+static bool rt_cache_valid(const struct rtable *rt)
+{
+ return rt &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ !rt_is_expired(rt);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res,
+ struct fib_nh_exception *fnhe,
+ struct fib_info *fi, u16 type, u32 itag)
+{
+ bool cached = false;
+
+ if (fi) {
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
+ rt->rt_gateway = nh->nh_gw;
+ rt->rt_uses_gateway = 1;
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rt->dst.tclassid = nh->nh_tclassid;
+#endif
+ if (unlikely(fnhe))
+ cached = rt_bind_exception(rt, fnhe, daddr);
+ else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_cache_route(nh, rt);
+ if (unlikely(!cached)) {
+ /* Routes we intend to cache in nexthop exception or
+ * FIB nexthop have the DST_NOCACHE bit clear.
+ * However, if we are unsuccessful at storing this
+ * route into the cache we really need to set it.
+ */
+ rt->dst.flags |= DST_NOCACHE;
+ if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+ rt_add_uncached_list(rt);
+ }
+ } else
+ rt_add_uncached_list(rt);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, res->tclassid);
+#endif
+ set_class_tag(rt, itag);
+#endif
+}
+
+static struct rtable *rt_dst_alloc(struct net_device *dev,
+ bool nopolicy, bool noxfrm, bool will_cache)
+{
+ return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0));
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, int our)
+{
+ struct rtable *rth;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ u32 itag = 0;
+ int err;
+
+ /* Primary sanity checks. */
+
+ if (!in_dev)
+ return -EINVAL;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(saddr))
+ goto e_inval;
+
+ if (ipv4_is_zeronet(saddr)) {
+ if (!ipv4_is_local_multicast(daddr))
+ goto e_inval;
+ } else {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
+ if (err < 0)
+ goto e_err;
+ }
+ rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ if (!rth)
+ goto e_nobufs;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
+#endif
+ rth->dst.output = ip_rt_bug;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev));
+ rth->rt_flags = RTCF_MULTICAST;
+ rth->rt_type = RTN_MULTICAST;
+ rth->rt_is_input= 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ if (our) {
+ rth->dst.input= ip_local_deliver;
+ rth->rt_flags |= RTCF_LOCAL;
+ }
+
+#ifdef CONFIG_IP_MROUTE
+ if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
+ rth->dst.input = ip_mr_input;
+#endif
+ RT_CACHE_STAT_INC(in_slow_mc);
+
+ skb_dst_set(skb, &rth->dst);
+ return 0;
+
+e_nobufs:
+ return -ENOBUFS;
+e_inval:
+ return -EINVAL;
+e_err:
+ return err;
+}
+
+
+static void ip_handle_martian_source(struct net_device *dev,
+ struct in_device *in_dev,
+ struct sk_buff *skb,
+ __be32 daddr,
+ __be32 saddr)
+{
+ RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+ /*
+ * RFC1812 recommendation, if source is martian,
+ * the only hint is MAC header.
+ */
+ pr_warn("martian source %pI4 from %pI4, on dev %s\n",
+ &daddr, &saddr, dev->name);
+ if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
+ print_hex_dump(KERN_WARNING, "ll header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb_mac_header(skb),
+ dev->hard_header_len, true);
+ }
+ }
+#endif
+}
+
+/* called in rcu_read_lock() section */
+static int __mkroute_input(struct sk_buff *skb,
+ const struct fib_result *res,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
+{
+ struct fib_nh_exception *fnhe;
+ struct rtable *rth;
+ int err;
+ struct in_device *out_dev;
+ unsigned int flags = 0;
+ bool do_cache;
+ u32 itag = 0;
+
+ /* get a working reference to the output device */
+ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+ if (!out_dev) {
+ net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
+ return -EINVAL;
+ }
+
+ err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+ in_dev->dev, in_dev, &itag);
+ if (err < 0) {
+ ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+ saddr);
+
+ goto cleanup;
+ }
+
+ do_cache = res->fi && !itag;
+ if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
+ skb->protocol == htons(ETH_P_IP) &&
+ (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+
+ if (skb->protocol != htons(ETH_P_IP)) {
+ /* Not IP (i.e. ARP). Do not create route, if it is
+ * invalid for proxy arp. DNAT routes are always valid.
+ *
+ * Proxy arp feature have been extended to allow, ARP
+ * replies back to the same interface, to support
+ * Private VLAN switch technologies. See arp.c.
+ */
+ if (out_dev == in_dev &&
+ IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
+ err = -EINVAL;
+ goto cleanup;
+ }
+ }
+
+ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ if (do_cache) {
+ if (fnhe)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ goto out;
+ }
+ }
+
+ rth = rt_dst_alloc(out_dev->dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ if (!rth) {
+ err = -ENOBUFS;
+ goto cleanup;
+ }
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
+ rth->rt_flags = flags;
+ rth->rt_type = res->type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
+
+ rth->dst.input = ip_forward;
+ rth->dst.output = ip_output;
+
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ skb_dst_set(skb, &rth->dst);
+out:
+ err = 0;
+ cleanup:
+ return err;
+}
+
+static int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ const struct flowi4 *fl4,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi && res->fi->fib_nhs > 1)
+ fib_select_multipath(res);
+#endif
+
+ /* create a routing cache entry */
+ return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+}
+
+/*
+ * NOTE. We drop all the packets that has local source
+ * addresses, because every properly looped back packet
+ * must have correct destination already attached by output routine.
+ *
+ * Such approach solves two big problems:
+ * 1. Not simplex devices are handled properly.
+ * 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ struct fib_result res;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct flowi4 fl4;
+ unsigned int flags = 0;
+ u32 itag = 0;
+ struct rtable *rth;
+ int err = -EINVAL;
+ struct net *net = dev_net(dev);
+ bool do_cache;
+
+ /* IP on this device is disabled. */
+
+ if (!in_dev)
+ goto out;
+
+ /* Check for the most weird martians, which can be not detected
+ by fib_lookup.
+ */
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ goto martian_source;
+
+ res.fi = NULL;
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
+ goto brd_input;
+
+ /* Accept zero addresses only to limited broadcast;
+ * I even do not know to fix it or not. Waiting for complains :-)
+ */
+ if (ipv4_is_zeronet(saddr))
+ goto martian_source;
+
+ if (ipv4_is_zeronet(daddr))
+ goto martian_destination;
+
+ /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+ * and call it once if daddr or/and saddr are loopback addresses
+ */
+ if (ipv4_is_loopback(daddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_destination;
+ } else if (ipv4_is_loopback(saddr)) {
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+ }
+
+ /*
+ * Now we are ready to route packet.
+ */
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = dev->ifindex;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+ err = fib_lookup(net, &fl4, &res);
+ if (err != 0) {
+ if (!IN_DEV_FORWARD(in_dev))
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
+
+ if (res.type == RTN_BROADCAST)
+ goto brd_input;
+
+ if (res.type == RTN_LOCAL) {
+ err = fib_validate_source(skb, saddr, daddr, tos,
+ 0, dev, in_dev, &itag);
+ if (err < 0)
+ goto martian_source_keep_err;
+ goto local_input;
+ }
+
+ if (!IN_DEV_FORWARD(in_dev)) {
+ err = -EHOSTUNREACH;
+ goto no_route;
+ }
+ if (res.type != RTN_UNICAST)
+ goto martian_destination;
+
+ err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+out: return err;
+
+brd_input:
+ if (skb->protocol != htons(ETH_P_IP))
+ goto e_inval;
+
+ if (!ipv4_is_zeronet(saddr)) {
+ err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
+ in_dev, &itag);
+ if (err < 0)
+ goto martian_source_keep_err;
+ }
+ flags |= RTCF_BROADCAST;
+ res.type = RTN_BROADCAST;
+ RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+ do_cache = false;
+ if (res.fi) {
+ if (!itag) {
+ rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ err = 0;
+ goto out;
+ }
+ do_cache = true;
+ }
+ }
+
+ rth = rt_dst_alloc(net->loopback_dev,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ if (!rth)
+ goto e_nobufs;
+
+ rth->dst.input= ip_local_deliver;
+ rth->dst.output= ip_rt_bug;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ rth->dst.tclassid = itag;
+#endif
+
+ rth->rt_genid = rt_genid_ipv4(net);
+ rth->rt_flags = flags|RTCF_LOCAL;
+ rth->rt_type = res.type;
+ rth->rt_is_input = 1;
+ rth->rt_iif = 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+ RT_CACHE_STAT_INC(in_slow_tot);
+ if (res.type == RTN_UNREACHABLE) {
+ rth->dst.input= ip_error;
+ rth->dst.error= -err;
+ rth->rt_flags &= ~RTCF_LOCAL;
+ }
+ if (do_cache) {
+ if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+ rth->dst.flags |= DST_NOCACHE;
+ rt_add_uncached_list(rth);
+ }
+ }
+ skb_dst_set(skb, &rth->dst);
+ err = 0;
+ goto out;
+
+no_route:
+ RT_CACHE_STAT_INC(in_no_route);
+ res.type = RTN_UNREACHABLE;
+ res.fi = NULL;
+ goto local_input;
+
+ /*
+ * Do not cache martian addresses: they should be logged (RFC1812)
+ */
+martian_destination:
+ RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+ if (IN_DEV_LOG_MARTIANS(in_dev))
+ net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
+ &daddr, &saddr, dev->name);
+#endif
+
+e_inval:
+ err = -EINVAL;
+ goto out;
+
+e_nobufs:
+ err = -ENOBUFS;
+ goto out;
+
+martian_source:
+ err = -EINVAL;
+martian_source_keep_err:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ goto out;
+}
+
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev)
+{
+ int res;
+
+ rcu_read_lock();
+
+ /* Multicast recognition logic is moved from route cache to here.
+ The problem was that too many Ethernet cards have broken/missing
+ hardware multicast filters :-( As result the host on multicasting
+ network acquires a lot of useless route cache entries, sort of
+ SDR messages from all the world. Now we try to get rid of them.
+ Really, provided software IP multicast filter is organized
+ reasonably (at least, hashed), it does not result in a slowdown
+ comparing with route cache reject entries.
+ Note, that multicast routers are not affected, because
+ route cache entry is created eventually.
+ */
+ if (ipv4_is_multicast(daddr)) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev) {
+ int our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
+ if (our
+#ifdef CONFIG_IP_MROUTE
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
+#endif
+ ) {
+ int res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
+ rcu_read_unlock();
+ return res;
+ }
+ }
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* called with rcu_read_lock() */
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi4 *fl4, int orig_oif,
+ struct net_device *dev_out,
+ unsigned int flags)
+{
+ struct fib_info *fi = res->fi;
+ struct fib_nh_exception *fnhe;
+ struct in_device *in_dev;
+ u16 type = res->type;
+ struct rtable *rth;
+ bool do_cache;
+
+ in_dev = __in_dev_get_rcu(dev_out);
+ if (!in_dev)
+ return ERR_PTR(-EINVAL);
+
+ if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
+ if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+ return ERR_PTR(-EINVAL);
+
+ if (ipv4_is_lbcast(fl4->daddr))
+ type = RTN_BROADCAST;
+ else if (ipv4_is_multicast(fl4->daddr))
+ type = RTN_MULTICAST;
+ else if (ipv4_is_zeronet(fl4->daddr))
+ return ERR_PTR(-EINVAL);
+
+ if (dev_out->flags & IFF_LOOPBACK)
+ flags |= RTCF_LOCAL;
+
+ do_cache = true;
+ if (type == RTN_BROADCAST) {
+ flags |= RTCF_BROADCAST | RTCF_LOCAL;
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
+ if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+ fl4->flowi4_proto))
+ flags &= ~RTCF_LOCAL;
+ else
+ do_cache = false;
+ /* If multicast route do not exist use
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
+ */
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
+ }
+
+ fnhe = NULL;
+ do_cache &= fi != NULL;
+ if (do_cache) {
+ struct rtable __rcu **prth;
+ struct fib_nh *nh = &FIB_RES_NH(*res);
+
+ fnhe = find_exception(nh, fl4->daddr);
+ if (fnhe)
+ prth = &fnhe->fnhe_rth_output;
+ else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
+ rth = rcu_dereference(*prth);
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ return rth;
+ }
+ }
+
+add:
+ rth = rt_dst_alloc(dev_out,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM),
+ do_cache);
+ if (!rth)
+ return ERR_PTR(-ENOBUFS);
+
+ rth->dst.output = ip_output;
+
+ rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
+ rth->rt_flags = flags;
+ rth->rt_type = type;
+ rth->rt_is_input = 0;
+ rth->rt_iif = orig_oif ? : 0;
+ rth->rt_pmtu = 0;
+ rth->rt_gateway = 0;
+ rth->rt_uses_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
+
+ RT_CACHE_STAT_INC(out_slow_tot);
+
+ if (flags & RTCF_LOCAL)
+ rth->dst.input = ip_local_deliver;
+ if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+ if (flags & RTCF_LOCAL &&
+ !(dev_out->flags & IFF_LOOPBACK)) {
+ rth->dst.output = ip_mc_output;
+ RT_CACHE_STAT_INC(out_slow_mc);
+ }
+#ifdef CONFIG_IP_MROUTE
+ if (type == RTN_MULTICAST) {
+ if (IN_DEV_MFORWARD(in_dev) &&
+ !ipv4_is_local_multicast(fl4->daddr)) {
+ rth->dst.input = ip_mr_input;
+ rth->dst.output = ip_mc_output;
+ }
+ }
+#endif
+ }
+
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+
+ return rth;
+}
+
+/*
+ * Major route resolver routine.
+ */
+
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+{
+ struct net_device *dev_out = NULL;
+ __u8 tos = RT_FL_TOS(fl4);
+ unsigned int flags = 0;
+ struct fib_result res;
+ struct rtable *rth;
+ int orig_oif;
+
+ res.tclassid = 0;
+ res.fi = NULL;
+ res.table = NULL;
+
+ orig_oif = fl4->flowi4_oif;
+
+ fl4->flowi4_iif = LOOPBACK_IFINDEX;
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+ rcu_read_lock();
+ if (fl4->saddr) {
+ rth = ERR_PTR(-EINVAL);
+ if (ipv4_is_multicast(fl4->saddr) ||
+ ipv4_is_lbcast(fl4->saddr) ||
+ ipv4_is_zeronet(fl4->saddr))
+ goto out;
+
+ /* I removed check for oif == dev_out->oif here.
+ It was wrong for two reasons:
+ 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ is assigned to multiple interfaces.
+ 2. Moreover, we are allowed to send packets with saddr
+ of another iface. --ANK
+ */
+
+ if (fl4->flowi4_oif == 0 &&
+ (ipv4_is_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr))) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ dev_out = __ip_dev_find(net, fl4->saddr, false);
+ if (!dev_out)
+ goto out;
+
+ /* Special hack: user can direct multicasts
+ and limited broadcast via necessary interface
+ without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ This hack is not just for fun, it allows
+ vic,vat and friends to work.
+ They bind socket to loopback, set ttl to zero
+ and expect that it will work.
+ From the viewpoint of routing cache they are broken,
+ because we are not allowed to build multicast path
+ with loopback source addr (look, routing cache
+ cannot know, that ttl is zero, so that packet
+ will not leave this host and route is valid).
+ Luckily, this hack is good workaround.
+ */
+
+ fl4->flowi4_oif = dev_out->ifindex;
+ goto make_route;
+ }
+
+ if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+ /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+ if (!__ip_dev_find(net, fl4->saddr, false))
+ goto out;
+ }
+ }
+
+
+ if (fl4->flowi4_oif) {
+ dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+ rth = ERR_PTR(-ENODEV);
+ if (!dev_out)
+ goto out;
+
+ /* RACE: Check return value of inet_select_addr instead. */
+ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+ if (ipv4_is_local_multicast(fl4->daddr) ||
+ ipv4_is_lbcast(fl4->daddr)) {
+ if (!fl4->saddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ goto make_route;
+ }
+ if (!fl4->saddr) {
+ if (ipv4_is_multicast(fl4->daddr))
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ fl4->flowi4_scope);
+ else if (!fl4->daddr)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_HOST);
+ }
+ }
+
+ if (!fl4->daddr) {
+ fl4->daddr = fl4->saddr;
+ if (!fl4->daddr)
+ fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
+ dev_out = net->loopback_dev;
+ fl4->flowi4_oif = LOOPBACK_IFINDEX;
+ res.type = RTN_LOCAL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+ if (fib_lookup(net, fl4, &res)) {
+ res.fi = NULL;
+ res.table = NULL;
+ if (fl4->flowi4_oif) {
+ /* Apparently, routing tables are wrong. Assume,
+ that the destination is on link.
+
+ WHY? DW.
+ Because we are allowed to send to iface
+ even if it has NO routes and NO assigned
+ addresses. When oif is specified, routing
+ tables are looked up with only one purpose:
+ to catch if destination is gatewayed, rather than
+ direct. Moreover, if MSG_DONTROUTE is set,
+ we send packet, ignoring both routing tables
+ and ifaddr state. --ANK
+
+
+ We could make it even if oif is unknown,
+ likely IPv6, but we do not.
+ */
+
+ if (fl4->saddr == 0)
+ fl4->saddr = inet_select_addr(dev_out, 0,
+ RT_SCOPE_LINK);
+ res.type = RTN_UNICAST;
+ goto make_route;
+ }
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+
+ if (res.type == RTN_LOCAL) {
+ if (!fl4->saddr) {
+ if (res.fi->fib_prefsrc)
+ fl4->saddr = res.fi->fib_prefsrc;
+ else
+ fl4->saddr = fl4->daddr;
+ }
+ dev_out = net->loopback_dev;
+ fl4->flowi4_oif = dev_out->ifindex;
+ flags |= RTCF_LOCAL;
+ goto make_route;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+ fib_select_multipath(&res);
+ else
+#endif
+ if (!res.prefixlen &&
+ res.table->tb_num_default > 1 &&
+ res.type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(&res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, res);
+
+ dev_out = FIB_RES_DEV(res);
+ fl4->flowi4_oif = dev_out->ifindex;
+
+
+make_route:
+ rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+
+out:
+ rcu_read_unlock();
+ return rth;
+}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
+
+static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
+}
+
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
+ unsigned long old)
+{
+ return NULL;
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+ .family = AF_INET,
+ .check = ipv4_blackhole_dst_check,
+ .mtu = ipv4_blackhole_mtu,
+ .default_advmss = ipv4_default_advmss,
+ .update_pmtu = ipv4_rt_blackhole_update_pmtu,
+ .redirect = ipv4_rt_blackhole_redirect,
+ .cow_metrics = ipv4_rt_blackhole_cow_metrics,
+ .neigh_lookup = ipv4_neigh_lookup,
+};
+
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+ struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *rt;
+
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+ if (rt) {
+ struct dst_entry *new = &rt->dst;
+
+ new->__use = 1;
+ new->input = dst_discard;
+ new->output = dst_discard_sk;
+
+ new->dev = ort->dst.dev;
+ if (new->dev)
+ dev_hold(new->dev);
+
+ rt->rt_is_input = ort->rt_is_input;
+ rt->rt_iif = ort->rt_iif;
+ rt->rt_pmtu = ort->rt_pmtu;
+
+ rt->rt_genid = rt_genid_ipv4(net);
+ rt->rt_flags = ort->rt_flags;
+ rt->rt_type = ort->rt_type;
+ rt->rt_gateway = ort->rt_gateway;
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
+
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
+ dst_free(new);
+ }
+
+ dst_release(dst_orig);
+
+ return rt ? &rt->dst : ERR_PTR(-ENOMEM);
+}
+
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+ struct sock *sk)
+{
+ struct rtable *rt = __ip_route_output_key(net, flp4);
+
+ if (IS_ERR(rt))
+ return rt;
+
+ if (flp4->flowi4_proto)
+ rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0);
+
+ return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, int event, int nowait, unsigned int flags)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned long expires = 0;
+ u32 error;
+ u32 metrics[RTAX_MAX];
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 0;
+ r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
+ goto nla_put_failure;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
+ if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
+ r->rtm_flags |= RTCF_DOREDIRECT;
+
+ if (nla_put_in_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ if (src) {
+ r->rtm_src_len = 32;
+ if (nla_put_in_addr(skb, RTA_SRC, src))
+ goto nla_put_failure;
+ }
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rt->dst.tclassid &&
+ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
+ goto nla_put_failure;
+#endif
+ if (!rt_is_input_route(rt) &&
+ fl4->saddr != src) {
+ if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
+ goto nla_put_failure;
+ }
+ if (rt->rt_uses_gateway &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
+ goto nla_put_failure;
+
+ expires = rt->dst.expires;
+ if (expires) {
+ unsigned long now = jiffies;
+
+ if (time_before(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+ }
+
+ memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
+ if (rt->rt_pmtu && expires)
+ metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+ if (rtnetlink_put_metrics(skb, metrics) < 0)
+ goto nla_put_failure;
+
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
+
+ error = rt->dst.error;
+
+ if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ error = err;
+ }
+ }
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct rtmsg *rtm;
+ struct nlattr *tb[RTA_MAX+1];
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __be32 dst = 0;
+ __be32 src = 0;
+ u32 iif;
+ int err;
+ int mark;
+ struct sk_buff *skb;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+ if (err < 0)
+ goto errout;
+
+ rtm = nlmsg_data(nlh);
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ /* Reserve room for dummy headers, this skb can pass
+ through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
+ ip_hdr(skb)->protocol = IPPROTO_ICMP;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+ src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+ dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
+ iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+ mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst;
+ fl4.saddr = src;
+ fl4.flowi4_tos = rtm->rtm_tos;
+ fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+ fl4.flowi4_mark = mark;
+
+ if (iif) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_index(net, iif);
+ if (!dev) {
+ err = -ENODEV;
+ goto errout_free;
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->dev = dev;
+ skb->mark = mark;
+ local_bh_disable();
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ local_bh_enable();
+
+ rt = skb_rtable(skb);
+ if (err == 0 && rt->dst.error)
+ err = -rt->dst.error;
+ } else {
+ rt = ip_route_output_key(net, &fl4);
+
+ err = 0;
+ if (IS_ERR(rt))
+ err = PTR_ERR(rt);
+ }
+
+ if (err)
+ goto errout_free;
+
+ skb_dst_set(skb, &rt->dst);
+ if (rtm->rtm_flags & RTM_F_NOTIFY)
+ rt->rt_flags |= RTCF_NOTIFY;
+
+ err = rt_fill_info(net, dst, src, &fl4, skb,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ RTM_NEWROUTE, 0, 0);
+ if (err < 0)
+ goto errout_free;
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+
+errout_free:
+ kfree_skb(skb);
+ goto errout;
+}
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+ rt_cache_flush(dev_net(in_dev->dev));
+}
+
+#ifdef CONFIG_SYSCTL
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+
+static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net = (struct net *)__ctl->extra1;
+
+ if (write) {
+ rt_cache_flush(net);
+ fnhe_genid_bump(net);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static struct ctl_table ipv4_route_table[] = {
+ {
+ .procname = "gc_thresh",
+ .data = &ipv4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_size",
+ .data = &ip_rt_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ /* Deprecated. Use gc_min_interval_ms */
+
+ .procname = "gc_min_interval",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_min_interval_ms",
+ .data = &ip_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "gc_timeout",
+ .data = &ip_rt_gc_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_interval",
+ .data = &ip_rt_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "redirect_load",
+ .data = &ip_rt_redirect_load,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "redirect_number",
+ .data = &ip_rt_redirect_number,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "redirect_silence",
+ .data = &ip_rt_redirect_silence,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "error_cost",
+ .data = &ip_rt_error_cost,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "error_burst",
+ .data = &ip_rt_error_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "gc_elasticity",
+ .data = &ip_rt_gc_elasticity,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mtu_expires",
+ .data = &ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "min_pmtu",
+ .data = &ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "min_adv_mss",
+ .data = &ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static struct ctl_table ipv4_route_flush_table[] = {
+ {
+ .procname = "flush",
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = ipv4_sysctl_rtcache_flush,
+ },
+ { },
+};
+
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = ipv4_route_flush_table;
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+ if (!tbl)
+ goto err_dup;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ }
+ tbl[0].extra1 = net;
+
+ net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
+ if (!net->ipv4.route_hdr)
+ goto err_reg;
+ return 0;
+
+err_reg:
+ if (tbl != ipv4_route_flush_table)
+ kfree(tbl);
+err_dup:
+ return -ENOMEM;
+}
+
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.route_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.route_hdr);
+ BUG_ON(tbl == ipv4_route_flush_table);
+ kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+ .init = sysctl_route_net_init,
+ .exit = sysctl_route_net_exit,
+};
+#endif
+
+static __net_init int rt_genid_init(struct net *net)
+{
+ atomic_set(&net->ipv4.rt_genid, 0);
+ atomic_set(&net->fnhe_genid, 0);
+ get_random_bytes(&net->ipv4.dev_addr_genid,
+ sizeof(net->ipv4.dev_addr_genid));
+ return 0;
+}
+
+static __net_initdata struct pernet_operations rt_genid_ops = {
+ .init = rt_genid_init,
+};
+
+static int __net_init ipv4_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv4.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv4_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv4.peers;
+
+ net->ipv4.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
+}
+
+static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
+ .init = ipv4_inetpeer_init,
+ .exit = ipv4_inetpeer_exit,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+int __init ip_rt_init(void)
+{
+ int rc = 0;
+ int cpu;
+
+ ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+ if (!ip_idents)
+ panic("IP: failed to allocate ip_idents\n");
+
+ prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
+ for_each_possible_cpu(cpu) {
+ struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+
+ INIT_LIST_HEAD(&ul->head);
+ spin_lock_init(&ul->lock);
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
+ if (!ip_rt_acct)
+ panic("IP: failed to allocate ip_rt_acct\n");
+#endif
+
+ ipv4_dst_ops.kmem_cachep =
+ kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
+ ipv4_dst_ops.gc_thresh = ~0;
+ ip_rt_max_size = INT_MAX;
+
+ devinet_init();
+ ip_fib_init();
+
+ if (ip_rt_proc_init())
+ pr_err("Unable to create route proc files\n");
+#ifdef CONFIG_XFRM
+ xfrm_init();
+ xfrm4_init();
+#endif
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&sysctl_route_ops);
+#endif
+ register_pernet_subsys(&rt_genid_ops);
+ register_pernet_subsys(&ipv4_inetpeer_ops);
+ return rc;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+ register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
+}
+#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 000000000..df849e5a1
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,401 @@
+/*
+ * Syncookies implementation for the Linux kernel
+ *
+ * Copyright (C) 1997 Andi Kleen
+ * Based on ideas by D.J.Bernstein and Eric Schenk.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+extern int sysctl_tcp_syncookies;
+
+static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK
+ * stores TCP options:
+ *
+ * MSB LSB
+ * | 31 ... 6 | 5 | 4 | 3 2 1 0 |
+ * | Timestamp | ECN | SACK | WScale |
+ *
+ * When we receive a valid cookie-ACK, we look at the echoed tsval (if
+ * any) to figure out which TCP options we should use for the rebuilt
+ * connection.
+ *
+ * A WScale setting of '0xf' (which is an invalid scaling value)
+ * means that original syn did not include the TCP window scaling option.
+ */
+#define TS_OPT_WSCALE_MASK 0xf
+#define TS_OPT_SACK BIT(4)
+#define TS_OPT_ECN BIT(5)
+/* There is no TS_OPT_TIMESTAMP:
+ * if ACK contains timestamp option, we already know it was
+ * requested/supported by the syn/synack exchange.
+ */
+#define TSBITS 6
+#define TSMASK (((__u32)1 << TSBITS) - 1)
+
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
+ ipv4_cookie_scratch);
+
+static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+ u32 count, int c)
+{
+ __u32 *tmp;
+
+ net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
+
+ tmp = this_cpu_ptr(ipv4_cookie_scratch);
+ memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
+ tmp[0] = (__force u32)saddr;
+ tmp[1] = (__force u32)daddr;
+ tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
+ tmp[3] = count;
+ sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+ return tmp[17];
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we encode
+ * tcp options in the lower bits of the timestamp value that will be
+ * sent in the syn-ack.
+ * Since subsequent timestamps use the normal tcp_time_stamp value, we
+ * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
+ */
+__u32 cookie_init_timestamp(struct request_sock *req)
+{
+ struct inet_request_sock *ireq;
+ u32 ts, ts_now = tcp_time_stamp;
+ u32 options = 0;
+
+ ireq = inet_rsk(req);
+
+ options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
+ if (ireq->sack_ok)
+ options |= TS_OPT_SACK;
+ if (ireq->ecn_ok)
+ options |= TS_OPT_ECN;
+
+ ts = ts_now & ~TSMASK;
+ ts |= options;
+ if (ts > ts_now) {
+ ts >>= TSBITS;
+ ts--;
+ ts <<= TSBITS;
+ ts |= options;
+ }
+ return ts;
+}
+
+
+static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
+ __be16 dport, __u32 sseq, __u32 data)
+{
+ /*
+ * Compute the secure sequence number.
+ * The output should be:
+ * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
+ * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
+ * Where sseq is their sequence number and count increases every
+ * minute by 1.
+ * As an extra hack, we add a small "data" value that encodes the
+ * MSS into the second hash value.
+ */
+ u32 count = tcp_cookie_time();
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+/*
+ * This retrieves the small "data" value from the syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range. This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be less than
+ * MAX_SYNCOOKIE_AGE minutes in the past.
+ * The return value (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, __u32 sseq)
+{
+ u32 diff, count = tcp_cookie_time();
+
+ /* Strip away the layers from the cookie */
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK; /* Leaving the data behind */
+}
+
+/*
+ * MSS Values are chosen based on the 2011 paper
+ * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson.
+ * Values ..
+ * .. lower than 536 are rare (< 0.2%)
+ * .. between 537 and 1299 account for less than < 1.5% of observed values
+ * .. in the 1300-1349 range account for about 15 to 20% of observed mss values
+ * .. exceeding 1460 are very rare (< 0.04%)
+ *
+ * 1460 is the single most frequently announced mss value (30 to 46% depending
+ * on monitor location). Table must be sorted.
+ */
+static __u16 const msstab[] = {
+ 536,
+ 1300,
+ 1440, /* 1440, 1452: PPPoE */
+ 1460,
+};
+
+/*
+ * Generate a syncookie. mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ u16 *mssp)
+{
+ int mssind;
+ const __u16 mss = *mssp;
+
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+ *mssp = msstab[mssind];
+
+ return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
+ th->source, th->dest, ntohl(th->seq),
+ mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
+
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mssp)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ tcp_synq_overflow(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+ return __cookie_v4_init_sequence(iph, th, mssp);
+}
+
+/*
+ * Check if a ack sequence number is a valid syncookie.
+ * Return the decoded mss if it is, or 0 if not.
+ */
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
+ u32 cookie)
+{
+ __u32 seq = ntohl(th->seq) - 1;
+ __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+ th->source, th->dest, seq);
+
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
+}
+EXPORT_SYMBOL_GPL(__cookie_v4_check);
+
+static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *child;
+
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ if (child) {
+ atomic_set(&req->rsk_refcnt, 1);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ } else {
+ reqsk_free(req);
+ }
+ return child;
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we stored
+ * additional tcp options in the timestamp.
+ * This extracts these options from the timestamp echo.
+ *
+ * return false if we decode a tcp option that is disabled
+ * on the host.
+ */
+bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
+{
+ /* echoed timestamp, lowest bits contain options */
+ u32 options = tcp_opt->rcv_tsecr;
+
+ if (!tcp_opt->saw_tstamp) {
+ tcp_clear_options(tcp_opt);
+ return true;
+ }
+
+ if (!sysctl_tcp_timestamps)
+ return false;
+
+ tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
+
+ if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+ return false;
+
+ if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
+ return true; /* no window scaling */
+
+ tcp_opt->wscale_ok = 1;
+ tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
+
+ return sysctl_tcp_window_scaling != 0;
+}
+EXPORT_SYMBOL(cookie_timestamp_decode);
+
+bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
+ const struct net *net, const struct dst_entry *dst)
+{
+ bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
+
+ if (!ecn_ok)
+ return false;
+
+ if (net->ipv4.sysctl_tcp_ecn)
+ return true;
+
+ return dst_feature(dst, RTAX_FEATURE_ECN);
+}
+EXPORT_SYMBOL(cookie_ecn_ok);
+
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+ struct tcp_options_received tcp_opt;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ struct sock *ret = sk;
+ struct request_sock *req;
+ int mss;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+ struct flowi4 fl4;
+
+ if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ goto out;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ goto out;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
+ if (mss == 0) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
+
+ if (!cookie_timestamp_decode(&tcp_opt))
+ goto out;
+
+ ret = NULL;
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ if (!req)
+ goto out;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = cookie;
+ req->mss = mss;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+ ireq->snd_wscale = tcp_opt.snd_wscale;
+ ireq->sack_ok = tcp_opt.sack_ok;
+ ireq->wscale_ok = tcp_opt.wscale_ok;
+ ireq->tstamp_ok = tcp_opt.saw_tstamp;
+ req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+ treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->tfo_listener = false;
+
+ ireq->ir_iif = sk->sk_bound_dev_if;
+
+ /* We throwed the options of the initial SYN away, so we hope
+ * the ACK carries the same options again (see RFC1122 4.2.3.8)
+ */
+ ireq->opt = tcp_v4_save_options(skb);
+
+ if (security_inet_conn_request(sk, skb, req)) {
+ reqsk_free(req);
+ goto out;
+ }
+
+ req->num_retrans = 0;
+
+ /*
+ * We need to lookup the route here to get at the correct
+ * window size. We should better make sure that the window size
+ * hasn't changed since we received the original syn, but I see
+ * no easy way to do this.
+ */
+ flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+ inet_sk_flowi_flags(sk),
+ opt->srr ? opt->faddr : ireq->ir_rmt_addr,
+ ireq->ir_loc_addr, th->source, th->dest);
+ security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_key(sock_net(sk), &fl4);
+ if (IS_ERR(rt)) {
+ reqsk_free(req);
+ goto out;
+ }
+
+ /* Try to redo what tcp_v4_send_synack did. */
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(&rt->dst, RTAX_INITRWND));
+
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
+
+ ret = get_cookie_sock(sk, skb, req, &rt->dst);
+ /* ip_queue_xmit() depends on our flow being setup
+ * Normal sockets get it right from inet_csk_route_child_sock()
+ */
+ if (ret)
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out: return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 000000000..c3852a7ff
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,970 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/seqlock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/swap.h>
+#include <net/snmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/cipso_ipv4.h>
+#include <net/inet_frag.h>
+#include <net/ping.h>
+#include <net/tcp_memcontrol.h>
+
+static int zero;
+static int one = 1;
+static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
+static int tcp_retr1_max = 255;
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int tcp_syn_retries_min = 1;
+static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+
+/* Update system visible IP port range */
+static void set_local_port_range(struct net *net, int range[2])
+{
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ net->ipv4.ip_local_ports.range[0] = range[0];
+ net->ipv4.ip_local_ports.range[1] = range[1];
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_local_port_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ip_local_ports.range);
+ int ret;
+ int range[2];
+ struct ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &ip_local_port_range_min,
+ .extra2 = &ip_local_port_range_max,
+ };
+
+ inet_get_local_port_range(net, &range[0], &range[1]);
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (range[1] < range[0])
+ ret = -EINVAL;
+ else
+ set_local_port_range(net, range);
+ }
+
+ return ret;
+}
+
+
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ unsigned int seq;
+ do {
+ seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+
+ *low = data[0];
+ *high = data[1];
+ } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+}
+
+/* Update system visible IP port range */
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+{
+ kgid_t *data = table->data;
+ struct net *net =
+ container_of(table->data, struct net, ipv4.ping_group_range.range);
+ write_seqlock(&net->ipv4.ip_local_ports.lock);
+ data[0] = low;
+ data[1] = high;
+ write_sequnlock(&net->ipv4.ip_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int ret;
+ gid_t urange[2];
+ kgid_t low, high;
+ struct ctl_table tmp = {
+ .data = &urange,
+ .maxlen = sizeof(urange),
+ .mode = table->mode,
+ .extra1 = &ip_ping_group_range_min,
+ .extra2 = &ip_ping_group_range_max,
+ };
+
+ inet_get_ping_group_range_table(table, &low, &high);
+ urange[0] = from_kgid_munged(user_ns, low);
+ urange[1] = from_kgid_munged(user_ns, high);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ low = make_kgid(user_ns, urange[0]);
+ high = make_kgid(user_ns, urange[1]);
+ if (!gid_valid(low) || !gid_valid(high) ||
+ (urange[1] < urange[0]) || gid_lt(high, low)) {
+ low = make_kgid(&init_user_ns, 1);
+ high = make_kgid(&init_user_ns, 0);
+ }
+ set_ping_group_range(table, low, high);
+ }
+
+ return ret;
+}
+
+static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char val[TCP_CA_NAME_MAX];
+ struct ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
+ int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+ tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+ return ret;
+}
+
+static int proc_allowed_congestion_control(struct ctl_table *ctl,
+ int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_allowed_congestion_control(tbl.data);
+ kfree(tbl.data);
+ return ret;
+}
+
+static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+ struct tcp_fastopen_context *ctxt;
+ int ret;
+ u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+ if (!tbl.data)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ ctxt = rcu_dereference(tcp_fastopen_ctx);
+ if (ctxt)
+ memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+ else
+ memset(user_key, 0, sizeof(user_key));
+ rcu_read_unlock();
+
+ snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+ user_key[0], user_key[1], user_key[2], user_key[3]);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ /* Generate a dummy secret but don't publish it. This
+ * is needed so we don't regenerate a new key on the
+ * first invocation of tcp_fastopen_cookie_gen
+ */
+ tcp_fastopen_init_key_once(false);
+ tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+ }
+
+bad_key:
+ pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+ user_key[0], user_key[1], user_key[2], user_key[3],
+ (char *)tbl.data, ret);
+ kfree(tbl.data);
+ return ret;
+}
+
+static struct ctl_table ipv4_table[] = {
+ {
+ .procname = "tcp_timestamps",
+ .data = &sysctl_tcp_timestamps,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_window_scaling",
+ .data = &sysctl_tcp_window_scaling,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_sack",
+ .data = &sysctl_tcp_sack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_retrans_collapse",
+ .data = &sysctl_tcp_retrans_collapse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_default_ttl",
+ .data = &sysctl_ip_default_ttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
+ },
+ {
+ .procname = "tcp_syn_retries",
+ .data = &sysctl_tcp_syn_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
+ },
+ {
+ .procname = "tcp_synack_retries",
+ .data = &sysctl_tcp_synack_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_orphans",
+ .data = &sysctl_tcp_max_orphans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_tw_buckets",
+ .data = &tcp_death_row.sysctl_max_tw_buckets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_early_demux",
+ .data = &sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_dynaddr",
+ .data = &sysctl_ip_dynaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_time",
+ .data = &sysctl_tcp_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_keepalive_probes",
+ .data = &sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_intvl",
+ .data = &sysctl_tcp_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_retries1",
+ .data = &sysctl_tcp_retries1,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .procname = "tcp_retries2",
+ .data = &sysctl_tcp_retries2,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &sysctl_tcp_fin_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .procname = "tcp_syncookies",
+ .data = &sysctl_tcp_syncookies,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "tcp_fastopen",
+ .data = &sysctl_tcp_fastopen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fastopen_key",
+ .mode = 0600,
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ .proc_handler = proc_tcp_fastopen_key,
+ },
+ {
+ .procname = "tcp_tw_recycle",
+ .data = &tcp_death_row.sysctl_tw_recycle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_abort_on_overflow",
+ .data = &sysctl_tcp_abort_on_overflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_stdurg",
+ .data = &sysctl_tcp_stdurg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_rfc1337",
+ .data = &sysctl_tcp_rfc1337,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_syn_backlog",
+ .data = &sysctl_max_syn_backlog,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_memberships",
+ .data = &sysctl_igmp_max_memberships,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_msf",
+ .data = &sysctl_igmp_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .procname = "igmp_qrv",
+ .data = &sysctl_igmp_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+#endif
+ {
+ .procname = "inet_peer_threshold",
+ .data = &inet_peer_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "inet_peer_minttl",
+ .data = &inet_peer_minttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "inet_peer_maxttl",
+ .data = &inet_peer_maxttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_orphan_retries",
+ .data = &sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fack",
+ .data = &sysctl_tcp_fack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_reordering",
+ .data = &sysctl_tcp_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_max_reordering",
+ .data = &sysctl_tcp_max_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dsack",
+ .data = &sysctl_tcp_dsack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_mem",
+ .maxlen = sizeof(sysctl_tcp_mem),
+ .data = &sysctl_tcp_mem,
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_wmem",
+ .data = &sysctl_tcp_wmem,
+ .maxlen = sizeof(sysctl_tcp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_notsent_lowat",
+ .data = &sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(sysctl_tcp_notsent_lowat),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_rmem",
+ .data = &sysctl_tcp_rmem,
+ .maxlen = sizeof(sysctl_tcp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "tcp_app_win",
+ .data = &sysctl_tcp_app_win,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_adv_win_scale",
+ .data = &sysctl_tcp_adv_win_scale,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_adv_win_scale_min,
+ .extra2 = &tcp_adv_win_scale_max,
+ },
+ {
+ .procname = "tcp_tw_reuse",
+ .data = &sysctl_tcp_tw_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_frto",
+ .data = &sysctl_tcp_frto,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_low_latency",
+ .data = &sysctl_tcp_low_latency,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_no_metrics_save",
+ .data = &sysctl_tcp_nometrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_moderate_rcvbuf",
+ .data = &sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_tso_win_divisor",
+ .data = &sysctl_tcp_tso_win_divisor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_congestion_control",
+ .mode = 0644,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = proc_tcp_congestion_control,
+ },
+ {
+ .procname = "tcp_workaround_signed_windows",
+ .data = &sysctl_tcp_workaround_signed_windows,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_limit_output_bytes",
+ .data = &sysctl_tcp_limit_output_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_challenge_ack_limit",
+ .data = &sysctl_tcp_challenge_ack_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_slow_start_after_idle",
+ .data = &sysctl_tcp_slow_start_after_idle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "cipso_cache_enable",
+ .data = &cipso_v4_cache_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_cache_bucket_size",
+ .data = &cipso_v4_cache_bucketsize,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_rbm_optfmt",
+ .data = &cipso_v4_rbm_optfmt,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cipso_rbm_strictvalid",
+ .data = &cipso_v4_rbm_strictvalid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
+ {
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0644,
+ .proc_handler = proc_allowed_congestion_control,
+ },
+ {
+ .procname = "tcp_thin_linear_timeouts",
+ .data = &sysctl_tcp_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_thin_dupack",
+ .data = &sysctl_tcp_thin_dupack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_early_retrans",
+ .data = &sysctl_tcp_early_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &four,
+ },
+ {
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
+ .procname = "tcp_autocorking",
+ .data = &sysctl_tcp_autocorking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "tcp_invalid_ratelimit",
+ .data = &sysctl_tcp_invalid_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "icmp_msgs_per_sec",
+ .data = &sysctl_icmp_msgs_per_sec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "icmp_msgs_burst",
+ .data = &sysctl_icmp_msgs_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+ {
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+ { }
+};
+
+static struct ctl_table ipv4_net_table[] = {
+ {
+ .procname = "icmp_echo_ignore_all",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_echo_ignore_broadcasts",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_ignore_bogus_error_responses",
+ .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_errors_use_inbound_ifaddr",
+ .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "icmp_ratelimit",
+ .data = &init_net.ipv4.sysctl_icmp_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "icmp_ratemask",
+ .data = &init_net.ipv4.sysctl_icmp_ratemask,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ping_group_range",
+ .data = &init_net.ipv4.ping_group_range.range,
+ .maxlen = sizeof(gid_t)*2,
+ .mode = 0644,
+ .proc_handler = ipv4_ping_group_range,
+ },
+ {
+ .procname = "tcp_ecn",
+ .data = &init_net.ipv4.sysctl_tcp_ecn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_local_port_range",
+ .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
+ .data = &init_net.ipv4.ip_local_ports.range,
+ .mode = 0644,
+ .proc_handler = ipv4_local_port_range,
+ },
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = &init_net.ipv4.sysctl_local_reserved_ports,
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "ip_no_pmtu_disc",
+ .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_forward_use_pmtu",
+ .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_nonlocal_bind",
+ .data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv4.sysctl_fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_fwmark_accept",
+ .data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_mtu_probing",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_base_mss",
+ .data = &init_net.ipv4.sysctl_tcp_base_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static __net_init int ipv4_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = ipv4_net_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
+ }
+
+ net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!net->ipv4.ipv4_hdr)
+ goto err_reg;
+
+ net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!net->ipv4.sysctl_local_reserved_ports)
+ goto err_ports;
+
+ return 0;
+
+err_ports:
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static __net_exit void ipv4_sysctl_exit_net(struct net *net)
+{
+ struct ctl_table *table;
+
+ kfree(net->ipv4.sysctl_local_reserved_ports);
+ table = net->ipv4.ipv4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+ kfree(table);
+}
+
+static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
+ .init = ipv4_sysctl_init_net,
+ .exit = ipv4_sysctl_exit_net,
+};
+
+static __init int sysctl_ipv4_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (register_pernet_subsys(&ipv4_sysctl_ops)) {
+ unregister_net_sysctl_table(hdr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 000000000..ca6faeb44
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,3225 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ * Alan Cox : Numerous verify_area() calls
+ * Alan Cox : Set the ACK bit on a reset
+ * Alan Cox : Stopped it crashing if it closed while
+ * sk->inuse=1 and was trying to connect
+ * (tcp_err()).
+ * Alan Cox : All icmp error handling was broken
+ * pointers passed where wrong and the
+ * socket was looked up backwards. Nobody
+ * tested any icmp error code obviously.
+ * Alan Cox : tcp_err() now handled properly. It
+ * wakes people on errors. poll
+ * behaves and the icmp error race
+ * has gone by moving it into sock.c
+ * Alan Cox : tcp_send_reset() fixed to work for
+ * everything not just packets for
+ * unknown sockets.
+ * Alan Cox : tcp option processing.
+ * Alan Cox : Reset tweaked (still not 100%) [Had
+ * syn rule wrong]
+ * Herp Rosmanith : More reset fixes
+ * Alan Cox : No longer acks invalid rst frames.
+ * Acking any kind of RST is right out.
+ * Alan Cox : Sets an ignore me flag on an rst
+ * receive otherwise odd bits of prattle
+ * escape still
+ * Alan Cox : Fixed another acking RST frame bug.
+ * Should stop LAN workplace lockups.
+ * Alan Cox : Some tidyups using the new skb list
+ * facilities
+ * Alan Cox : sk->keepopen now seems to work
+ * Alan Cox : Pulls options out correctly on accepts
+ * Alan Cox : Fixed assorted sk->rqueue->next errors
+ * Alan Cox : PSH doesn't end a TCP read. Switched a
+ * bit to skb ops.
+ * Alan Cox : Tidied tcp_data to avoid a potential
+ * nasty.
+ * Alan Cox : Added some better commenting, as the
+ * tcp is hard to follow
+ * Alan Cox : Removed incorrect check for 20 * psh
+ * Michael O'Reilly : ack < copied bug fix.
+ * Johannes Stille : Misc tcp fixes (not all in yet).
+ * Alan Cox : FIN with no memory -> CRASH
+ * Alan Cox : Added socket option proto entries.
+ * Also added awareness of them to accept.
+ * Alan Cox : Added TCP options (SOL_TCP)
+ * Alan Cox : Switched wakeup calls to callbacks,
+ * so the kernel can layer network
+ * sockets.
+ * Alan Cox : Use ip_tos/ip_ttl settings.
+ * Alan Cox : Handle FIN (more) properly (we hope).
+ * Alan Cox : RST frames sent on unsynchronised
+ * state ack error.
+ * Alan Cox : Put in missing check for SYN bit.
+ * Alan Cox : Added tcp_select_window() aka NET2E
+ * window non shrink trick.
+ * Alan Cox : Added a couple of small NET2E timer
+ * fixes
+ * Charles Hedrick : TCP fixes
+ * Toomas Tamm : TCP window fixes
+ * Alan Cox : Small URG fix to rlogin ^C ack fight
+ * Charles Hedrick : Rewrote most of it to actually work
+ * Linus : Rewrote tcp_read() and URG handling
+ * completely
+ * Gerhard Koerting: Fixed some missing timer handling
+ * Matthew Dillon : Reworked TCP machine states as per RFC
+ * Gerhard Koerting: PC/TCP workarounds
+ * Adam Caldwell : Assorted timer/timing errors
+ * Matthew Dillon : Fixed another RST bug
+ * Alan Cox : Move to kernel side addressing changes.
+ * Alan Cox : Beginning work on TCP fastpathing
+ * (not yet usable)
+ * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
+ * Alan Cox : TCP fast path debugging
+ * Alan Cox : Window clamping
+ * Michael Riepe : Bug in tcp_check()
+ * Matt Dillon : More TCP improvements and RST bug fixes
+ * Matt Dillon : Yet more small nasties remove from the
+ * TCP code (Be very nice to this man if
+ * tcp finally works 100%) 8)
+ * Alan Cox : BSD accept semantics.
+ * Alan Cox : Reset on closedown bug.
+ * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
+ * Michael Pall : Handle poll() after URG properly in
+ * all cases.
+ * Michael Pall : Undo the last fix in tcp_read_urg()
+ * (multi URG PUSH broke rlogin).
+ * Michael Pall : Fix the multi URG PUSH problem in
+ * tcp_readable(), poll() after URG
+ * works now.
+ * Michael Pall : recv(...,MSG_OOB) never blocks in the
+ * BSD api.
+ * Alan Cox : Changed the semantics of sk->socket to
+ * fix a race and a signal problem with
+ * accept() and async I/O.
+ * Alan Cox : Relaxed the rules on tcp_sendto().
+ * Yury Shevchuk : Really fixed accept() blocking problem.
+ * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
+ * clients/servers which listen in on
+ * fixed ports.
+ * Alan Cox : Cleaned the above up and shrank it to
+ * a sensible code size.
+ * Alan Cox : Self connect lockup fix.
+ * Alan Cox : No connect to multicast.
+ * Ross Biro : Close unaccepted children on master
+ * socket close.
+ * Alan Cox : Reset tracing code.
+ * Alan Cox : Spurious resets on shutdown.
+ * Alan Cox : Giant 15 minute/60 second timer error
+ * Alan Cox : Small whoops in polling before an
+ * accept.
+ * Alan Cox : Kept the state trace facility since
+ * it's handy for debugging.
+ * Alan Cox : More reset handler fixes.
+ * Alan Cox : Started rewriting the code based on
+ * the RFC's for other useful protocol
+ * references see: Comer, KA9Q NOS, and
+ * for a reference on the difference
+ * between specifications and how BSD
+ * works see the 4.4lite source.
+ * A.N.Kuznetsov : Don't time wait on completion of tidy
+ * close.
+ * Linus Torvalds : Fin/Shutdown & copied_seq changes.
+ * Linus Torvalds : Fixed BSD port reuse to work first syn
+ * Alan Cox : Reimplemented timers as per the RFC
+ * and using multiple timers for sanity.
+ * Alan Cox : Small bug fixes, and a lot of new
+ * comments.
+ * Alan Cox : Fixed dual reader crash by locking
+ * the buffers (much like datagram.c)
+ * Alan Cox : Fixed stuck sockets in probe. A probe
+ * now gets fed up of retrying without
+ * (even a no space) answer.
+ * Alan Cox : Extracted closing code better
+ * Alan Cox : Fixed the closing state machine to
+ * resemble the RFC.
+ * Alan Cox : More 'per spec' fixes.
+ * Jorge Cwik : Even faster checksumming.
+ * Alan Cox : tcp_data() doesn't ack illegal PSH
+ * only frames. At least one pc tcp stack
+ * generates them.
+ * Alan Cox : Cache last socket.
+ * Alan Cox : Per route irtt.
+ * Matt Day : poll()->select() match BSD precisely on error
+ * Alan Cox : New buffers
+ * Marc Tamsky : Various sk->prot->retransmits and
+ * sk->retransmits misupdating fixed.
+ * Fixed tcp_write_timeout: stuck close,
+ * and TCP syn retries gets used now.
+ * Mark Yarvis : In tcp_read_wakeup(), don't send an
+ * ack if state is TCP_CLOSED.
+ * Alan Cox : Look up device on a retransmit - routes may
+ * change. Doesn't yet cope with MSS shrink right
+ * but it's a start!
+ * Marc Tamsky : Closing in closing fixes.
+ * Mike Shaver : RFC1122 verifications.
+ * Alan Cox : rcv_saddr errors.
+ * Alan Cox : Block double connect().
+ * Alan Cox : Small hooks for enSKIP.
+ * Alexey Kuznetsov: Path MTU discovery.
+ * Alan Cox : Support soft errors.
+ * Alan Cox : Fix MTU discovery pathological case
+ * when the remote claims no mtu!
+ * Marc Tamsky : TCP_CLOSE fix.
+ * Colin (G3TNE) : Send a reset on syn ack replies in
+ * window but wrong (fixes NT lpd problems)
+ * Pedro Roque : Better TCP window handling, delayed ack.
+ * Joerg Reuter : No modification of locked buffers in
+ * tcp_do_retransmit()
+ * Eric Schenk : Changed receiver side silly window
+ * avoidance algorithm to BSD style
+ * algorithm. This doubles throughput
+ * against machines running Solaris,
+ * and seems to result in general
+ * improvement.
+ * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * Keith Owens : Do proper merging with partial SKB's in
+ * tcp_do_sendmsg to avoid burstiness.
+ * Eric Schenk : Fix fast close down bug with
+ * shutdown() followed by close().
+ * Andi Kleen : Make poll agree with SIGIO
+ * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
+ * lingertime == 0 (RFC 793 ABORT Call)
+ * Hirokazu Takahashi : Use copy_from_user() instead of
+ * csum_and_copy_from_user() if possible.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ *
+ * Description of States:
+ *
+ * TCP_SYN_SENT sent a connection request, waiting for ack
+ *
+ * TCP_SYN_RECV received a connection request, sent ack,
+ * waiting for final ack in three-way handshake.
+ *
+ * TCP_ESTABLISHED connection established
+ *
+ * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
+ * transmission of remaining buffered data
+ *
+ * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
+ * to shutdown
+ *
+ * TCP_CLOSING both sides have shutdown but we still have
+ * data we have to finish sending
+ *
+ * TCP_TIME_WAIT timeout to catch resent junk before entering
+ * closed, can only be entered from FIN_WAIT2
+ * or CLOSING. Required because the other end
+ * may not have gotten our last ACK causing it
+ * to retransmit the data packet (which we ignore)
+ *
+ * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
+ * us to finish writing our data and to shutdown
+ * (we have to close() to move on to LAST_ACK)
+ *
+ * TCP_LAST_ACK out side has shutdown after remote has
+ * shutdown. There may still be data in our
+ * buffer that we have to finish sending
+ *
+ * TCP_CLOSE socket is finished
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/inet_diag.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/random.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <net/busy_poll.h>
+
+int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
+int sysctl_tcp_autocorking __read_mostly = 1;
+
+struct percpu_counter tcp_orphan_count;
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
+long sysctl_tcp_mem[3] __read_mostly;
+int sysctl_tcp_wmem[3] __read_mostly;
+int sysctl_tcp_rmem[3] __read_mostly;
+
+EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_SYMBOL(sysctl_tcp_rmem);
+EXPORT_SYMBOL(sysctl_tcp_wmem);
+
+atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+EXPORT_SYMBOL(tcp_memory_allocated);
+
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated;
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+ struct pipe_inode_info *pipe;
+ size_t len;
+ unsigned int flags;
+};
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+int tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+ if (!tcp_memory_pressure) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+ tcp_memory_pressure = 1;
+ }
+}
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+/* Convert seconds to retransmits based on initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+ u8 res = 0;
+
+ if (seconds > 0) {
+ int period = timeout;
+
+ res = 1;
+ while (seconds > period && res < 255) {
+ res++;
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return res;
+}
+
+/* Convert retransmits to seconds based on initial and max timeout */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+ int period = 0;
+
+ if (retrans > 0) {
+ period = timeout;
+ while (--retrans) {
+ timeout <<= 1;
+ if (timeout > rto_max)
+ timeout = rto_max;
+ period += timeout;
+ }
+ }
+ return period;
+}
+
+/* Address-family independent initialization for a tcp_sock.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+void tcp_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ __skb_queue_head_init(&tp->out_of_order_queue);
+ tcp_init_xmit_timers(sk);
+ tcp_prequeue_init(tp);
+ INIT_LIST_HEAD(&tp->tsq_node);
+
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ tp->snd_cwnd = TCP_INIT_CWND;
+
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
+ * initialization of these values.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = TCP_MSS_DEFAULT;
+ u64_stats_init(&tp->syncp);
+
+ tp->reordering = sysctl_tcp_reordering;
+ tcp_enable_early_retrans(tp);
+ tcp_assign_congestion_control(sk);
+
+ tp->tsoffset = 0;
+
+ sk->sk_state = TCP_CLOSE;
+
+ sk->sk_write_space = sk_stream_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ icsk->icsk_sync_mss = tcp_sync_mss;
+
+ sk->sk_sndbuf = sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+ local_bh_disable();
+ sock_update_memcg(sk);
+ sk_sockets_allocated_inc(sk);
+ local_bh_enable();
+}
+EXPORT_SYMBOL(tcp_init_sock);
+
+static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ if (sk->sk_tsflags) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ sock_tx_timestamp(sk, &shinfo->tx_flags);
+ if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+ }
+}
+
+/*
+ * Wait for a TCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ sock_rps_record_flow(sk);
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == TCP_LISTEN)
+ return inet_csk_listen_poll(sk);
+
+ /* Socket is not locked. We are protected from async events
+ * by poll logic and correct handling of state changes
+ * made by other threads is impossible in any case.
+ */
+
+ mask = 0;
+
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+ * socket the read side is more interesting.
+ *
+ * Some poll() documentation says that POLLHUP is incompatible
+ * with the POLLOUT/POLLWR flags, so somebody should check this
+ * all. But careful, it tends to be safer to return too many
+ * bits than too few, and you can easily break real applications
+ * if you don't tell them that something has hung up!
+ *
+ * Check-me.
+ *
+ * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+ * our fs/select.c). It means that after we received EOF,
+ * poll always returns immediately, making impossible poll() on write()
+ * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+ * if and only if shutdown has been made in both directions.
+ * Actually, it is interesting to look how Solaris and DUX
+ * solve this dilemma. I would prefer, if POLLHUP were maskable,
+ * then we could set it on SND_SHUTDOWN. BTW examples given
+ * in Stevens' books assume exactly this behaviour, it explains
+ * why POLLHUP is incompatible with POLLOUT. --ANK
+ *
+ * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+ * blocking on fresh not-connected or disconnected socket. --ANK
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+ /* Connected or passive Fast Open socket? */
+ if (sk->sk_state != TCP_SYN_SENT &&
+ (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ int target = sock_rcvlowat(sk, 0, INT_MAX);
+
+ if (tp->urg_seq == tp->copied_seq &&
+ !sock_flag(sk, SOCK_URGINLINE) &&
+ tp->urg_data)
+ target++;
+
+ /* Potential race condition. If read of tp below will
+ * escape above sk->sk_state, we can be illegally awaken
+ * in SYN_* states. */
+ if (tp->rcv_nxt - tp->copied_seq >= target)
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_is_writeable(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost. Memory barrier
+ * pairs with the input side.
+ */
+ smp_mb__after_atomic();
+ if (sk_stream_is_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ } else
+ mask |= POLLOUT | POLLWRNORM;
+
+ if (tp->urg_data & TCP_URG_VALID)
+ mask |= POLLPRI;
+ }
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR;
+
+ return mask;
+}
+EXPORT_SYMBOL(tcp_poll);
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int answ;
+ bool slow;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ slow = lock_sock_fast(sk);
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else if (sock_flag(sk, SOCK_URGINLINE) ||
+ !tp->urg_data ||
+ before(tp->urg_seq, tp->copied_seq) ||
+ !before(tp->urg_seq, tp->rcv_nxt)) {
+
+ answ = tp->rcv_nxt - tp->copied_seq;
+
+ /* Subtract 1, if FIN was received */
+ if (answ && sock_flag(sk, SOCK_DONE))
+ answ--;
+ } else
+ answ = tp->urg_seq - tp->copied_seq;
+ unlock_sock_fast(sk, slow);
+ break;
+ case SIOCATMARK:
+ answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ break;
+ case SIOCOUTQ:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_una;
+ break;
+ case SIOCOUTQNSD:
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ answ = 0;
+ else
+ answ = tp->write_seq - tp->snd_nxt;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return put_user(answ, (int __user *)arg);
+}
+EXPORT_SYMBOL(tcp_ioctl);
+
+static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ tp->pushed_seq = tp->write_seq;
+}
+
+static inline bool forced_push(const struct tcp_sock *tp)
+{
+ return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
+}
+
+static void skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ skb->csum = 0;
+ tcb->seq = tcb->end_seq = tp->write_seq;
+ tcb->tcp_flags = TCPHDR_ACK;
+ tcb->sacked = 0;
+ __skb_header_release(skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ if (tp->nonagle & TCP_NAGLE_PUSH)
+ tp->nonagle &= ~TCP_NAGLE_PUSH;
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
+{
+ if (flags & MSG_OOB)
+ tp->snd_up = tp->write_seq;
+}
+
+/* If a not yet filled skb is pushed, do not send it if
+ * we have data packets in Qdisc or NIC queues :
+ * Because TX completion will happen shortly, it gives a chance
+ * to coalesce future sendmsg() payload into this skb, without
+ * need for a timer, and with no latency trade off.
+ * As packets containing data payload have a bigger truesize
+ * than pure acks (dataless) packets, the last checks prevent
+ * autocorking if we only have an ACK in Qdisc/NIC queues,
+ * or if TX completion was delayed after we processed ACK packet.
+ */
+static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
+ int size_goal)
+{
+ return skb->len < size_goal &&
+ sysctl_tcp_autocorking &&
+ skb != tcp_write_queue_head(sk) &&
+ atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+}
+
+static void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_send_head(sk))
+ return;
+
+ skb = tcp_write_queue_tail(sk);
+ if (!(flags & MSG_MORE) || forced_push(tp))
+ tcp_mark_push(tp, skb);
+
+ tcp_mark_urg(tp, flags);
+
+ if (tcp_should_autocork(sk, skb, size_goal)) {
+
+ /* avoid atomic op if TSQ_THROTTLED bit is already set */
+ if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+ return;
+ }
+
+ if (flags & MSG_MORE)
+ nonagle = TCP_NAGLE_CORK;
+
+ __tcp_push_pending_frames(sk, mss_now, nonagle);
+}
+
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct tcp_splice_state *tss = rd_desc->arg.data;
+ int ret;
+
+ ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
+ tss->flags);
+ if (ret > 0)
+ rd_desc->count -= ret;
+ return ret;
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
+
+ return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ * tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Will read pages from given socket and fill them into a pipe.
+ *
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ long timeo;
+ ssize_t spliced;
+ int ret;
+
+ sock_rps_record_flow(sk);
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ ret = spliced = 0;
+
+ lock_sock(sk);
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __tcp_splice_read(sk, &tss);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ if (!sock_flag(sk, SOCK_DONE))
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+ sk_wait_data(sk, &timeo);
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+EXPORT_SYMBOL(tcp_splice_read);
+
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+{
+ struct sk_buff *skb;
+
+ /* The TCP header must be at least 32-bit aligned. */
+ size = ALIGN(size, 4);
+
+ skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ if (skb) {
+ if (sk_wmem_schedule(sk, skb->truesize)) {
+ skb_reserve(skb, sk->sk_prot->max_header);
+ /*
+ * Make sure that we have exactly size bytes
+ * available to the caller, no more, no less.
+ */
+ skb->reserved_tailroom = skb->end - skb->tail - size;
+ return skb;
+ }
+ __kfree_skb(skb);
+ } else {
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ }
+ return NULL;
+}
+
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 new_size_goal, size_goal;
+
+ if (!large_allowed || !sk_can_gso(sk))
+ return mss_now;
+
+ /* Note : tcp_tso_autosize() will eventually split this later */
+ new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
+ new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+ /* We try hard to avoid divides here */
+ size_goal = tp->gso_segs * mss_now;
+ if (unlikely(new_size_goal < size_goal ||
+ new_size_goal >= size_goal + mss_now)) {
+ tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+ sk->sk_gso_max_segs);
+ size_goal = tp->gso_segs * mss_now;
+ }
+
+ return max(size_goal, mss_now);
+}
+
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
+
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+
+ return mss_now;
+}
+
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+ int err;
+ ssize_t copied;
+ long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_err;
+ }
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+ while (size > 0) {
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+ int copy, i;
+ bool can_coalesce;
+
+ if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ skb_entail(sk, skb);
+ copy = size_goal;
+ }
+
+ if (copy > size)
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, page, offset);
+ if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+ }
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk->sk_wmem_queued += copy;
+ sk_mem_charge(sk, copy);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ if (!copied)
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+ copied += copy;
+ offset += copy;
+ if (!(size -= copy)) {
+ tcp_tx_timestamp(sk, skb);
+ goto out;
+ }
+
+ if (skb->len < size_goal || (flags & MSG_OOB))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ }
+
+out:
+ if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+ return copied;
+
+do_error:
+ if (copied)
+ goto out;
+out_err:
+ return sk_stream_error(sk, flags, err);
+}
+
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ ssize_t res;
+
+ if (!(sk->sk_route_caps & NETIF_F_SG) ||
+ !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
+
+ lock_sock(sk);
+ res = do_tcp_sendpages(sk, page, offset, size, flags);
+ release_sock(sk);
+ return res;
+}
+EXPORT_SYMBOL(tcp_sendpage);
+
+static inline int select_size(const struct sock *sk, bool sg)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int tmp = tp->mss_cache;
+
+ if (sg) {
+ if (sk_can_gso(sk)) {
+ /* Small frames wont use a full page:
+ * Payload will immediately follow tcp header.
+ */
+ tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
+ } else {
+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+
+ if (tmp >= pgbreak &&
+ tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+ tmp = pgbreak;
+ }
+ }
+
+ return tmp;
+}
+
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+ if (tp->fastopen_req) {
+ kfree(tp->fastopen_req);
+ tp->fastopen_req = NULL;
+ }
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
+ int *copied, size_t size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err, flags;
+
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ return -EOPNOTSUPP;
+ if (tp->fastopen_req)
+ return -EALREADY; /* Another Fast Open is in progress */
+
+ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+ sk->sk_allocation);
+ if (unlikely(!tp->fastopen_req))
+ return -ENOBUFS;
+ tp->fastopen_req->data = msg;
+ tp->fastopen_req->size = size;
+
+ flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+ err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ msg->msg_namelen, flags);
+ *copied = tp->fastopen_req->copied;
+ tcp_free_fastopen_req(tp);
+ return err;
+}
+
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int flags, err, copied = 0;
+ int mss_now = 0, size_goal, copied_syn = 0;
+ bool sg;
+ long timeo;
+
+ lock_sock(sk);
+
+ flags = msg->msg_flags;
+ if (flags & MSG_FASTOPEN) {
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ if (err == -EINPROGRESS && copied_syn > 0)
+ goto out;
+ else if (err)
+ goto out_err;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+ /* Wait for a connection to finish. One exception is TCP Fast Open
+ * (passive side) where data is allowed to be sent before a connection
+ * is fully established.
+ */
+ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+ !tcp_passive_fastopen(sk)) {
+ if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto do_error;
+ }
+
+ if (unlikely(tp->repair)) {
+ if (tp->repair_queue == TCP_RECV_QUEUE) {
+ copied = tcp_send_rcvq(sk, msg, size);
+ goto out_nopush;
+ }
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out_err;
+
+ /* 'common' sending to sendq */
+ }
+
+ /* This should be in poll */
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
+ /* Ok commence sending. */
+ copied = 0;
+
+ err = -EPIPE;
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto out_err;
+
+ sg = !!(sk->sk_route_caps & NETIF_F_SG);
+
+ while (msg_data_left(msg)) {
+ int copy = 0;
+ int max = size_goal;
+
+ skb = tcp_write_queue_tail(sk);
+ if (tcp_send_head(sk)) {
+ if (skb->ip_summed == CHECKSUM_NONE)
+ max = mss_now;
+ copy = max - skb->len;
+ }
+
+ if (copy <= 0) {
+new_segment:
+ /* Allocate new segment. If the interface is SG,
+ * allocate skb fitting to single page.
+ */
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
+ skb = sk_stream_alloc_skb(sk,
+ select_size(sk, sg),
+ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
+
+ /*
+ * Check whether we can use HW checksum.
+ */
+ if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_entail(sk, skb);
+ copy = size_goal;
+ max = size_goal;
+
+ /* All packets are restored as if they have
+ * already been sent. skb_mstamp isn't set to
+ * avoid wrong rtt estimation.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
+ }
+
+ /* Try to append data to the end of skb. */
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+
+ /* Where to copy to? */
+ if (skb_availroom(skb) > 0) {
+ /* We have some space in skb head. Superb! */
+ copy = min_t(int, copy, skb_availroom(skb));
+ err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
+ if (err)
+ goto do_fault;
+ } else {
+ bool merge = true;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS || !sg) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ merge = false;
+ }
+
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto do_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+ pfrag->offset += copy;
+ }
+
+ if (!copied)
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+
+ tp->write_seq += copy;
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ copied += copy;
+ if (!msg_data_left(msg)) {
+ tcp_tx_timestamp(sk, skb);
+ goto out;
+ }
+
+ if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
+ continue;
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
+ continue;
+
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ if (copied)
+ tcp_push(sk, flags & ~MSG_MORE, mss_now,
+ TCP_NAGLE_PUSH, size_goal);
+
+ if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ goto do_error;
+
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+ }
+
+out:
+ if (copied)
+ tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
+out_nopush:
+ release_sock(sk);
+ return copied + copied_syn;
+
+do_fault:
+ if (!skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ /* It is the one place in all of TCP, except connection
+ * reset, where we can be unlinking the send_head.
+ */
+ tcp_check_send_head(sk, skb);
+ sk_wmem_free_skb(sk, skb);
+ }
+
+do_error:
+ if (copied + copied_syn)
+ goto out;
+out_err:
+ err = sk_stream_error(sk, flags, err);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_sendmsg);
+
+/*
+ * Handle reading urgent data. BSD has very simple semantics for
+ * this, no blocking and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* No URG data to read. */
+ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
+ tp->urg_data == TCP_URG_READ)
+ return -EINVAL; /* Yes this is right ! */
+
+ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+ return -ENOTCONN;
+
+ if (tp->urg_data & TCP_URG_VALID) {
+ int err = 0;
+ char c = tp->urg_data;
+
+ if (!(flags & MSG_PEEK))
+ tp->urg_data = TCP_URG_READ;
+
+ /* Read urgent data. */
+ msg->msg_flags |= MSG_OOB;
+
+ if (len > 0) {
+ if (!(flags & MSG_TRUNC))
+ err = memcpy_to_msg(msg, &c, 1);
+ len = 1;
+ } else
+ msg->msg_flags |= MSG_TRUNC;
+
+ return err ? -EFAULT : len;
+ }
+
+ if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+ return 0;
+
+ /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
+ * the available implementations agree in this case:
+ * this call should never block, independent of the
+ * blocking state of the socket.
+ * Mike <pall@rz.uni-karlsruhe.de>
+ */
+ return -EAGAIN;
+}
+
+static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct sk_buff *skb;
+ int copied = 0, err = 0;
+
+ /* XXX -- need to support SO_PEEK_OFF */
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+ if (err)
+ break;
+
+ copied += skb->len;
+ }
+
+ return err ?: copied;
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far, it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool time_to_ack = false;
+
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+
+ if (inet_csk_ack_scheduled(sk)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ /* Delayed ACKs frequently hit locked sockets during bulk
+ * receive. */
+ if (icsk->icsk_ack.blocked ||
+ /* Once-per-two-segments ACK was not sent by tcp_input.c */
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+ /*
+ * If this read emptied read buffer, we send ACK, if
+ * connection is not bidirectional, user drained
+ * receive buffer and there was a small segment
+ * in queue.
+ */
+ (copied > 0 &&
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+ !icsk->icsk_ack.pingpong)) &&
+ !atomic_read(&sk->sk_rmem_alloc)))
+ time_to_ack = true;
+ }
+
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
+ *
+ * Even if window raised up to infinity, do not send window open ACK
+ * in states, where we will not receive more. It is useless.
+ */
+ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ __u32 rcv_window_now = tcp_receive_window(tp);
+
+ /* Optimize, __tcp_select_window() is not cheap. */
+ if (2*rcv_window_now <= tp->window_clamp) {
+ __u32 new_window = __tcp_select_window(sk);
+
+ /* Send ACK now, if this read freed lots of space
+ * in our buffer. Certainly, new_window is new window.
+ * We can advertise it now, if it is not less than current one.
+ * "Lots" means "at least twice" here.
+ */
+ if (new_window && new_window >= 2 * rcv_window_now)
+ time_to_ack = true;
+ }
+ }
+ if (time_to_ack)
+ tcp_send_ack(sk);
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
+
+ /* RX process wants to run with disabled BHs, though it is not
+ * necessary */
+ local_bh_disable();
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk_backlog_rcv(sk, skb);
+ local_bh_enable();
+
+ /* Clear memory counter. */
+ tp->ucopy.memory = 0;
+}
+
+static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{
+ struct sk_buff *skb;
+ u32 offset;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ offset--;
+ if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
+ *off = offset;
+ return skb;
+ }
+ /* This looks weird, but this can happen if TCP collapsing
+ * splitted a fat GRO packet, while we released socket lock
+ * in skb_splice_bits()
+ */
+ sk_eat_skb(sk, skb);
+ }
+ return NULL;
+}
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ * Note:
+ * - It is assumed that the socket was locked by the caller.
+ * - The routine does not block.
+ * - At present, there is no support for reading OOB data
+ * or for 'peeking' the socket using this routine
+ * (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ u32 offset;
+ int copied = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ if (offset < skb->len) {
+ int used;
+ size_t len;
+
+ len = skb->len - offset;
+ /* Stop reading if we hit a patch of urgent data */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - seq;
+ if (urg_offset < len)
+ len = urg_offset;
+ if (!len)
+ break;
+ }
+ used = recv_actor(desc, skb, offset, len);
+ if (used <= 0) {
+ if (!copied)
+ copied = used;
+ break;
+ } else if (used <= len) {
+ seq += used;
+ copied += used;
+ offset += used;
+ }
+ /* If recv_actor drops the lock (e.g. TCP splice
+ * receive) the skb pointer might be invalid when
+ * getting here: tcp_collapse might have deleted it
+ * while aggregating skbs from the socket queue.
+ */
+ skb = tcp_recv_skb(sk, seq - 1, &offset);
+ if (!skb)
+ break;
+ /* TCP coalescing might have appended data to the skb.
+ * Try to splice more frags
+ */
+ if (offset + 1 != skb->len)
+ continue;
+ }
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ sk_eat_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ sk_eat_skb(sk, skb);
+ if (!desc->count)
+ break;
+ tp->copied_seq = seq;
+ }
+ tp->copied_seq = seq;
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (copied > 0) {
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, copied);
+ }
+ return copied;
+}
+EXPORT_SYMBOL(tcp_read_sock);
+
+/*
+ * This routine copies from a sock struct into the user buffer.
+ *
+ * Technical note: in 2.3 we work on _locked_ socket, so that
+ * tricks with *seq access order and skb->users are not required.
+ * Probably, code can be easily improved even more.
+ */
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int copied = 0;
+ u32 peek_seq;
+ u32 *seq;
+ unsigned long used;
+ int err;
+ int target; /* Read at least this many bytes */
+ long timeo;
+ struct task_struct *user_recv = NULL;
+ struct sk_buff *skb;
+ u32 urg_hole = 0;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ (sk->sk_state == TCP_ESTABLISHED))
+ sk_busy_loop(sk, nonblock);
+
+ lock_sock(sk);
+
+ err = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* Urgent data needs to be handled specially. */
+ if (flags & MSG_OOB)
+ goto recv_urg;
+
+ if (unlikely(tp->repair)) {
+ err = -EPERM;
+ if (!(flags & MSG_PEEK))
+ goto out;
+
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ goto recv_sndq;
+
+ err = -EINVAL;
+ if (tp->repair_queue == TCP_NO_QUEUE)
+ goto out;
+
+ /* 'common' recv queue MSG_PEEK-ing */
+ }
+
+ seq = &tp->copied_seq;
+ if (flags & MSG_PEEK) {
+ peek_seq = tp->copied_seq;
+ seq = &peek_seq;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ do {
+ u32 offset;
+
+ /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+ if (tp->urg_data && tp->urg_seq == *seq) {
+ if (copied)
+ break;
+ if (signal_pending(current)) {
+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+ }
+
+ /* Next get a buffer. */
+
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ /* Now that we have two receive queues this
+ * shouldn't happen.
+ */
+ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+ "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+ flags))
+ break;
+
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ offset--;
+ if (offset < skb->len)
+ goto found_ok_skb;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto found_fin_ok;
+ WARN(!(flags & MSG_PEEK),
+ "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+ }
+
+ /* Well, if we have backlog, try to process it now yet. */
+
+ if (copied >= target && !sk->sk_backlog.tail)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ tcp_cleanup_rbuf(sk, copied);
+
+ if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+ /* Install new reader */
+ if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
+ user_recv = current;
+ tp->ucopy.task = user_recv;
+ tp->ucopy.msg = msg;
+ }
+
+ tp->ucopy.len = len;
+
+ WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+ !(flags & (MSG_PEEK | MSG_TRUNC)));
+
+ /* Ugly... If prequeue is not empty, we have to
+ * process it before releasing socket, otherwise
+ * order will be broken at second iteration.
+ * More elegant solution is required!!!
+ *
+ * Look: we have the following (pseudo)queues:
+ *
+ * 1. packets in flight
+ * 2. backlog
+ * 3. prequeue
+ * 4. receive_queue
+ *
+ * Each queue can be processed only if the next ones
+ * are empty. At this point we have empty receive_queue.
+ * But prequeue _can_ be not empty after 2nd iteration,
+ * when we jumped to start of loop because backlog
+ * processing added something to receive_queue.
+ * We cannot release_sock(), because backlog contains
+ * packets arrived _after_ prequeued ones.
+ *
+ * Shortly, algorithm is clear --- to process all
+ * the queues in order. We could make it more directly,
+ * requeueing packets from backlog to prequeue, if
+ * is not empty. It is more elegant, but eats cycles,
+ * unfortunately.
+ */
+ if (!skb_queue_empty(&tp->ucopy.prequeue))
+ goto do_prequeue;
+
+ /* __ Set realtime policy in scheduler __ */
+ }
+
+ if (copied >= target) {
+ /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else
+ sk_wait_data(sk, &timeo);
+
+ if (user_recv) {
+ int chunk;
+
+ /* __ Restore normal policy in scheduler __ */
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+
+ if (tp->rcv_nxt == tp->copied_seq &&
+ !skb_queue_empty(&tp->ucopy.prequeue)) {
+do_prequeue:
+ tcp_prequeue_process(sk);
+
+ if ((chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+ }
+ if ((flags & MSG_PEEK) &&
+ (peek_seq - copied - urg_hole != tp->copied_seq)) {
+ net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
+ peek_seq = tp->copied_seq;
+ }
+ continue;
+
+ found_ok_skb:
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ /* Do we have urgent data here? */
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sock_flag(sk, SOCK_URGINLINE)) {
+ ++*seq;
+ urg_hole++;
+ offset++;
+ used--;
+ if (!used)
+ goto skip_copy;
+ }
+ } else
+ used = urg_offset;
+ }
+ }
+
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_msg(skb, offset, msg, used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+
+ tcp_rcv_space_adjust(sk);
+
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+ tp->urg_data = 0;
+ tcp_fast_path_check(sk);
+ }
+ if (used + offset < skb->len)
+ continue;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto found_fin_ok;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ continue;
+
+ found_fin_ok:
+ /* Process the FIN. */
+ ++*seq;
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (len > 0);
+
+ if (user_recv) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+ int chunk;
+
+ tp->ucopy.len = copied > 0 ? len : 0;
+
+ tcp_prequeue_process(sk);
+
+ if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+ NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ len -= chunk;
+ copied += chunk;
+ }
+ }
+
+ tp->ucopy.task = NULL;
+ tp->ucopy.len = 0;
+ }
+
+ /* According to UNIX98, msg_name/msg_namelen are ignored
+ * on connected socket. I was just happy when found this 8) --ANK
+ */
+
+ /* Clean up data we have read: This will do ACK frames. */
+ tcp_cleanup_rbuf(sk, copied);
+
+ release_sock(sk);
+ return copied;
+
+out:
+ release_sock(sk);
+ return err;
+
+recv_urg:
+ err = tcp_recv_urg(sk, msg, len, flags);
+ goto out;
+
+recv_sndq:
+ err = tcp_peek_sndq(sk, msg, len);
+ goto out;
+}
+EXPORT_SYMBOL(tcp_recvmsg);
+
+void tcp_set_state(struct sock *sk, int state)
+{
+ int oldstate = sk->sk_state;
+
+ switch (state) {
+ case TCP_ESTABLISHED:
+ if (oldstate != TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
+
+ case TCP_CLOSE:
+ if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(sk);
+ /* fall through */
+ default:
+ if (oldstate == TCP_ESTABLISHED)
+ TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+
+#ifdef STATE_TRACE
+ SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_set_state);
+
+/*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+ * states. A shutdown() may have already sent the FIN, or we may be
+ * closed.
+ */
+
+static const unsigned char new_state[16] = {
+ /* current state: new state: action: */
+ [0 /* (Invalid) */] = TCP_CLOSE,
+ [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_SYN_SENT] = TCP_CLOSE,
+ [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+ [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
+ [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
+ [TCP_TIME_WAIT] = TCP_CLOSE,
+ [TCP_CLOSE] = TCP_CLOSE,
+ [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
+ [TCP_LAST_ACK] = TCP_LAST_ACK,
+ [TCP_LISTEN] = TCP_CLOSE,
+ [TCP_CLOSING] = TCP_CLOSING,
+ [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */
+};
+
+static int tcp_close_state(struct sock *sk)
+{
+ int next = (int)new_state[sk->sk_state];
+ int ns = next & TCP_STATE_MASK;
+
+ tcp_set_state(sk, ns);
+
+ return next & TCP_ACTION_FIN;
+}
+
+/*
+ * Shutdown the sending side of a connection. Much like close except
+ * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+ /* We need to grab some memory, and put together a FIN,
+ * and then put it into the queue to be sent.
+ * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+ */
+ if (!(how & SEND_SHUTDOWN))
+ return;
+
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+ TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk))
+ tcp_send_fin(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_shutdown);
+
+bool tcp_check_oom(struct sock *sk, int shift)
+{
+ bool too_many_orphans, out_of_socket_memory;
+
+ too_many_orphans = tcp_too_many_orphans(sk, shift);
+ out_of_socket_memory = tcp_out_of_memory(sk);
+
+ if (too_many_orphans)
+ net_info_ratelimited("too many orphaned sockets\n");
+ if (out_of_socket_memory)
+ net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
+ return too_many_orphans || out_of_socket_memory;
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+ int data_was_unread = 0;
+ int state;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* Special case. */
+ inet_csk_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ /* We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ * reader process may not have drained the data yet!
+ */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ len--;
+ data_was_unread += len;
+ __kfree_skb(skb);
+ }
+
+ sk_mem_reclaim(sk);
+
+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+ if (sk->sk_state == TCP_CLOSE)
+ goto adjudge_to_death;
+
+ /* As outlined in RFC 2525, section 2.17, we send a RST here because
+ * data was lost. To witness the awful effects of the old behavior of
+ * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+ * GET in an FTP client, suspend the process, wait for the client to
+ * advertise a zero window, then kill -9 the FTP client, wheee...
+ * Note: timeout is always zero in such a case.
+ */
+ if (unlikely(tcp_sk(sk)->repair)) {
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (data_was_unread) {
+ /* Unread data was tossed, zap the connection. */
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, sk->sk_allocation);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ } else if (tcp_close_state(sk)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
+
+ /* RED-PEN. Formally speaking, we have broken TCP state
+ * machine. State transitions:
+ *
+ * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+ *
+ * are legal only when FIN has been sent (i.e. in window),
+ * rather than queued out of window. Purists blame.
+ *
+ * F.e. "RFC state" is ESTABLISHED,
+ * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+ *
+ * The visible declinations are that sometimes
+ * we enter time-wait state, when it is not required really
+ * (harmless), do not send active resets, when they are
+ * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+ * they look as CLOSING or LAST_ACK for Linux)
+ * Probably, I missed some more holelets.
+ * --ANK
+ * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+ * in a single packet! (May consider it later but will
+ * probably need API support or TCP_CORK SYN-ACK until
+ * data is written and socket is closed.)
+ */
+ tcp_send_fin(sk);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ state = sk->sk_state;
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /* It is the last release_sock in its life. It will remove backlog. */
+ release_sock(sk);
+
+
+ /* Now socket is owned by kernel and we acquire BH lock
+ to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ WARN_ON(sock_owned_by_user(sk));
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ /* Have we already been destroyed by a softirq or backlog? */
+ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ /* This is a (useful) BSD violating of the RFC. There is a
+ * problem with TCP as specified in that the other end could
+ * keep a socket open forever with no application left this end.
+ * We use a 1 minute timeout (about the same as BSD) then kill
+ * our end. If they send after that then tough - BUT: long enough
+ * that we won't make the old 4*rto = almost no time - whoops
+ * reset mistake.
+ *
+ * Nope, it was not mistake. It is really desired behaviour
+ * f.e. on http servers, when such sockets are useless, but
+ * consume significant resources. Let's do it with special
+ * linger2 option. --ANK
+ */
+
+ if (sk->sk_state == TCP_FIN_WAIT2) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->linger2 < 0) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONLINGER);
+ } else {
+ const int tmo = tcp_fin_time(sk);
+
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk,
+ tmo - TCP_TIMEWAIT_LEN);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ }
+ if (sk->sk_state != TCP_CLOSE) {
+ sk_mem_reclaim(sk);
+ if (tcp_check_oom(sk, 0)) {
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPABORTONMEMORY);
+ }
+ }
+
+ if (sk->sk_state == TCP_CLOSE) {
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ /* We could get here with a non-NULL req if the socket is
+ * aborted (e.g., closed with unread data) before 3WHS
+ * finishes.
+ */
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
+ inet_csk_destroy_sock(sk);
+ }
+ /* Otherwise, socket is reprieved until protocol close. */
+
+out:
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+EXPORT_SYMBOL(tcp_close);
+
+/* These states need RST on ABORT according to RFC793 */
+
+static inline bool tcp_need_reset(int state)
+{
+ return (1 << state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = 0;
+ int old_state = sk->sk_state;
+
+ if (old_state != TCP_CLOSE)
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* ABORT function of RFC793 */
+ if (old_state == TCP_LISTEN) {
+ inet_csk_listen_stop(sk);
+ } else if (unlikely(tp->repair)) {
+ sk->sk_err = ECONNABORTED;
+ } else if (tcp_need_reset(old_state) ||
+ (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
+ * states
+ */
+ tcp_send_active_reset(sk, gfp_any());
+ sk->sk_err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->sk_err = ECONNRESET;
+
+ tcp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ tcp_write_queue_purge(sk);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ inet->inet_dport = 0;
+
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->srtt_us = 0;
+ if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq = 1;
+ icsk->icsk_backoff = 0;
+ tp->snd_cwnd = 2;
+ icsk->icsk_probes_out = 0;
+ tp->packets_out = 0;
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd_cnt = 0;
+ tp->window_clamp = 0;
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ tcp_clear_retrans(tp);
+ inet_csk_delack_init(sk);
+ tcp_init_send_head(sk);
+ memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
+ __sk_dst_reset(sk);
+
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_disconnect);
+
+void tcp_sock_destruct(struct sock *sk)
+{
+ inet_sock_destruct(sk);
+
+ kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
+}
+
+static inline bool tcp_can_repair_sock(const struct sock *sk)
+{
+ return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+}
+
+static int tcp_repair_options_est(struct tcp_sock *tp,
+ struct tcp_repair_opt __user *optbuf, unsigned int len)
+{
+ struct tcp_repair_opt opt;
+
+ while (len >= sizeof(opt)) {
+ if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ return -EFAULT;
+
+ optbuf++;
+ len -= sizeof(opt);
+
+ switch (opt.opt_code) {
+ case TCPOPT_MSS:
+ tp->rx_opt.mss_clamp = opt.opt_val;
+ break;
+ case TCPOPT_WINDOW:
+ {
+ u16 snd_wscale = opt.opt_val & 0xFFFF;
+ u16 rcv_wscale = opt.opt_val >> 16;
+
+ if (snd_wscale > 14 || rcv_wscale > 14)
+ return -EFBIG;
+
+ tp->rx_opt.snd_wscale = snd_wscale;
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rx_opt.wscale_ok = 1;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt.opt_val != 0)
+ return -EINVAL;
+
+ tp->rx_opt.tstamp_ok = 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len)
+{
+ struct scatterlist sg[2];
+ struct crypto_hash *tfm;
+ struct hash_desc desc;
+ __be16 h[MD5_DIGEST_WORDS * 2];
+ int i;
+ int err = 0;
+
+ tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ err = -PTR_ERR(tfm);
+ goto out;
+ }
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(sg, 2);
+ sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES);
+ sg_set_buf(&sg[1], payload, len);
+
+ if (crypto_hash_digest(&desc, sg, MD5_MESSAGE_BYTES + len, (u8 *)h)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ *hash = be16_to_cpu(h[0]);
+ for (i = 1; i < MD5_DIGEST_WORDS * 2; i++)
+ *hash ^= be16_to_cpu(h[i]);
+
+out:
+ crypto_free_hash(tfm);
+ return err;
+}
+#endif
+
+/*
+ * Socket option code for TCP.
+ */
+static int do_tcp_setsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, unsigned int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int val;
+ int err = 0;
+
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
+ char name[TCP_CA_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_user(name, optval,
+ min_t(long, TCP_CA_NAME_MAX-1, optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name);
+ release_sock(sk);
+ return err;
+ }
+#ifdef CONFIG_TCP_STEALTH
+ case TCP_STEALTH: {
+ u8 secret[MD5_MESSAGE_BYTES] = { 0 };
+
+ val = copy_from_user(secret, optval,
+ min_t(unsigned int, optlen,
+ MD5_MESSAGE_BYTES));
+ if (val != 0)
+ return -EFAULT;
+
+ lock_sock(sk);
+ memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES);
+ tp->stealth.mode = TCP_STEALTH_MODE_AUTH;
+ tp->stealth.mstamp.v64 = 0;
+ tp->stealth.saw_tsval = false;
+ release_sock(sk);
+ return err;
+ }
+ case TCP_STEALTH_INTEGRITY: {
+ u8 *payload;
+
+ lock_sock(sk);
+
+ if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ err = -EOPNOTSUPP;
+ goto stealth_integrity_out_1;
+ }
+
+ if (optlen < 1 || optlen > USHRT_MAX) {
+ err = -EINVAL;
+ goto stealth_integrity_out_1;
+ }
+
+ payload = vmalloc(optlen);
+ if (!payload) {
+ err = -ENOMEM;
+ goto stealth_integrity_out_1;
+ }
+
+ val = copy_from_user(payload, optval, optlen);
+ if (val != 0) {
+ err = -EFAULT;
+ goto stealth_integrity_out_2;
+ }
+
+ err = tcp_stealth_integrity(&tp->stealth.integrity_hash,
+ tp->stealth.secret, payload,
+ optlen);
+ if (err)
+ goto stealth_integrity_out_2;
+
+ tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY;
+
+stealth_integrity_out_2:
+ vfree(payload);
+stealth_integrity_out_1:
+ release_sock(sk);
+ return err;
+ }
+#endif
+ default:
+ /* fallthru */
+ break;
+ }
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ /* Values greater than interface MTU won't take effect. However
+ * at the point when this call is done we typically don't yet
+ * know which interface is going to be used */
+ if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
+ tp->rx_opt.user_mss = val;
+ break;
+
+ case TCP_NODELAY:
+ if (val) {
+ /* TCP_NODELAY is weaker than TCP_CORK, so that
+ * this option on corked socket is remembered, but
+ * it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make
+ * an explicit push, which overrides even TCP_CORK
+ * for currently queued segments.
+ */
+ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_OFF;
+ }
+ break;
+
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_lto = val;
+ break;
+
+ case TCP_THIN_DUPACK:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else {
+ tp->thin_dupack = val;
+ if (tp->thin_dupack)
+ tcp_disable_early_retrans(tp);
+ }
+ break;
+
+ case TCP_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ err = -EPERM;
+ else if (val == 1) {
+ tp->repair = 1;
+ sk->sk_reuse = SK_FORCE_REUSE;
+ tp->repair_queue = TCP_NO_QUEUE;
+ } else if (val == 0) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
+ tcp_send_window_probe(sk);
+ } else
+ err = -EINVAL;
+
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (!tp->repair)
+ err = -EPERM;
+ else if (val < TCP_QUEUES_NR)
+ tp->repair_queue = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (sk->sk_state != TCP_CLOSE)
+ err = -EPERM;
+ else if (tp->repair_queue == TCP_SEND_QUEUE)
+ tp->write_seq = val;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ tp->rcv_nxt = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case TCP_REPAIR_OPTIONS:
+ if (!tp->repair)
+ err = -EINVAL;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ err = tcp_repair_options_est(tp,
+ (struct tcp_repair_opt __user *)optval,
+ optlen);
+ else
+ err = -EPERM;
+ break;
+
+ case TCP_CORK:
+ /* When set indicates to always queue non-full frames.
+ * Later the user clears this option and we transmit
+ * any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly
+ * filled frames when the user (for example) must write
+ * out headers with a write() call first and then use
+ * sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is
+ * stronger than TCP_NODELAY.
+ */
+ if (val) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle&TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ }
+ break;
+
+ case TCP_KEEPIDLE:
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ err = -EINVAL;
+ else {
+ tp->keepalive_time = val * HZ;
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))) {
+ u32 elapsed = keepalive_time_elapsed(tp);
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ inet_csk_reset_keepalive_timer(sk, elapsed);
+ }
+ }
+ break;
+ case TCP_KEEPINTVL:
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ err = -EINVAL;
+ else
+ tp->keepalive_intvl = val * HZ;
+ break;
+ case TCP_KEEPCNT:
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ err = -EINVAL;
+ else
+ tp->keepalive_probes = val;
+ break;
+ case TCP_SYNCNT:
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ err = -EINVAL;
+ else
+ icsk->icsk_syn_retries = val;
+ break;
+
+ case TCP_LINGER2:
+ if (val < 0)
+ tp->linger2 = -1;
+ else if (val > sysctl_tcp_fin_timeout / HZ)
+ tp->linger2 = 0;
+ else
+ tp->linger2 = val * HZ;
+ break;
+
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ icsk->icsk_accept_queue.rskq_defer_accept =
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
+ break;
+
+ case TCP_WINDOW_CLAMP:
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE) {
+ err = -EINVAL;
+ break;
+ }
+ tp->window_clamp = 0;
+ } else
+ tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+ SOCK_MIN_RCVBUF / 2 : val;
+ break;
+
+ case TCP_QUICKACK:
+ if (!val) {
+ icsk->icsk_ack.pingpong = 1;
+ } else {
+ icsk->icsk_ack.pingpong = 0;
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ tcp_cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ icsk->icsk_ack.pingpong = 1;
+ }
+ }
+ break;
+
+#ifdef CONFIG_TCP_MD5SIG
+ case TCP_MD5SIG:
+ /* Read the IP->Key mappings from userspace */
+ err = tp->af_specific->md5_parse(sk, optval, optlen);
+ break;
+#endif
+ case TCP_USER_TIMEOUT:
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ err = -EINVAL;
+ else
+ icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ break;
+
+ case TCP_FASTOPEN:
+ if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+ TCPF_LISTEN))) {
+ tcp_fastopen_init_key_once(true);
+
+ err = fastopen_init_queue(sk, val);
+ } else {
+ err = -EINVAL;
+ }
+ break;
+ case TCP_TIMESTAMP:
+ if (!tp->repair)
+ err = -EPERM;
+ else
+ tp->tsoffset = val - tcp_time_stamp;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
+#ifdef CONFIG_TCP_STEALTH
+ case TCP_STEALTH_INTEGRITY_LEN:
+ if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ err = -EOPNOTSUPP;
+ } else if (val < 1 || val > USHRT_MAX) {
+ err = -EINVAL;
+ } else {
+ tp->stealth.integrity_len = val;
+ tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN;
+ }
+ break;
+#endif
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ unsigned int optlen)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (level != SOL_TCP)
+ return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(tcp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level != SOL_TCP)
+ return inet_csk_compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_setsockopt);
+#endif
+
+/* Return information about state of tcp endpoint in API format. */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 now = tcp_time_stamp;
+ unsigned int start;
+ u32 rate;
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_ca_state = icsk->icsk_ca_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
+
+ if (tp->rx_opt.tstamp_ok)
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tcp_is_sack(tp))
+ info->tcpi_options |= TCPI_OPT_SACK;
+ if (tp->rx_opt.wscale_ok) {
+ info->tcpi_options |= TCPI_OPT_WSCALE;
+ info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+ info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+ }
+
+ if (tp->ecn_flags & TCP_ECN_OK)
+ info->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+ if (tp->syn_data_acked)
+ info->tcpi_options |= TCPI_OPT_SYN_DATA;
+
+ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_snd_mss = tp->mss_cache;
+ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ } else {
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+ }
+ info->tcpi_lost = tp->lost_out;
+ info->tcpi_retrans = tp->retrans_out;
+ info->tcpi_fackets = tp->fackets_out;
+
+ info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
+ info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
+
+ info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
+ info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+ info->tcpi_rtt = tp->srtt_us >> 3;
+ info->tcpi_rttvar = tp->mdev_us >> 2;
+ info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_advmss = tp->advmss;
+ info->tcpi_reordering = tp->reordering;
+
+ info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+ info->tcpi_rcv_space = tp->rcvq_space.space;
+
+ info->tcpi_total_retrans = tp->total_retrans;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ rate = READ_ONCE(sk->sk_max_pacing_rate);
+ info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tp->syncp);
+ info->tcpi_bytes_acked = tp->bytes_acked;
+ info->tcpi_bytes_received = tp->bytes_received;
+ } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+}
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+static int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval, int __user *optlen)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_MAXSEG:
+ val = tp->mss_cache;
+ if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ val = tp->rx_opt.user_mss;
+ if (tp->repair)
+ val = tp->rx_opt.mss_clamp;
+ break;
+ case TCP_NODELAY:
+ val = !!(tp->nonagle&TCP_NAGLE_OFF);
+ break;
+ case TCP_CORK:
+ val = !!(tp->nonagle&TCP_NAGLE_CORK);
+ break;
+ case TCP_KEEPIDLE:
+ val = keepalive_time_when(tp) / HZ;
+ break;
+ case TCP_KEEPINTVL:
+ val = keepalive_intvl_when(tp) / HZ;
+ break;
+ case TCP_KEEPCNT:
+ val = keepalive_probes(tp);
+ break;
+ case TCP_SYNCNT:
+ val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ break;
+ case TCP_LINGER2:
+ val = tp->linger2;
+ if (val >= 0)
+ val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ break;
+ case TCP_DEFER_ACCEPT:
+ val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
+ TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ break;
+ case TCP_WINDOW_CLAMP:
+ val = tp->window_clamp;
+ break;
+ case TCP_INFO: {
+ struct tcp_info info;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ tcp_get_info(sk, &info);
+
+ len = min_t(unsigned int, len, sizeof(info));
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_CC_INFO: {
+ const struct tcp_congestion_ops *ca_ops;
+ union tcp_cc_info info;
+ size_t sz = 0;
+ int attr;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ ca_ops = icsk->icsk_ca_ops;
+ if (ca_ops && ca_ops->get_info)
+ sz = ca_ops->get_info(sk, ~0U, &attr, &info);
+
+ len = min_t(unsigned int, len, sz);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+ return 0;
+ }
+ case TCP_QUICKACK:
+ val = !icsk->icsk_ack.pingpong;
+ break;
+
+ case TCP_CONGESTION:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ return -EFAULT;
+ return 0;
+
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ val = tp->thin_lto;
+ break;
+ case TCP_THIN_DUPACK:
+ val = tp->thin_dupack;
+ break;
+
+ case TCP_REPAIR:
+ val = tp->repair;
+ break;
+
+ case TCP_REPAIR_QUEUE:
+ if (tp->repair)
+ val = tp->repair_queue;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_QUEUE_SEQ:
+ if (tp->repair_queue == TCP_SEND_QUEUE)
+ val = tp->write_seq;
+ else if (tp->repair_queue == TCP_RECV_QUEUE)
+ val = tp->rcv_nxt;
+ else
+ return -EINVAL;
+ break;
+
+ case TCP_USER_TIMEOUT:
+ val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ break;
+
+ case TCP_FASTOPEN:
+ if (icsk->icsk_accept_queue.fastopenq)
+ val = icsk->icsk_accept_queue.fastopenq->max_qlen;
+ else
+ val = 0;
+ break;
+
+ case TCP_TIMESTAMP:
+ val = tcp_time_stamp + tp->tsoffset;
+ break;
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (level != SOL_TCP)
+ return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(tcp_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level != SOL_TCP)
+ return inet_csk_compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_getsockopt);
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
+static DEFINE_MUTEX(tcp_md5sig_mutex);
+static bool tcp_md5sig_pool_populated = false;
+
+static void __tcp_alloc_md5sig_pool(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
+ struct crypto_hash *hash;
+
+ hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(hash))
+ return;
+ per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
+ }
+ }
+ /* before setting tcp_md5sig_pool_populated, we must commit all writes
+ * to memory. See smp_rmb() in tcp_get_md5sig_pool()
+ */
+ smp_wmb();
+ tcp_md5sig_pool_populated = true;
+}
+
+bool tcp_alloc_md5sig_pool(void)
+{
+ if (unlikely(!tcp_md5sig_pool_populated)) {
+ mutex_lock(&tcp_md5sig_mutex);
+
+ if (!tcp_md5sig_pool_populated)
+ __tcp_alloc_md5sig_pool();
+
+ mutex_unlock(&tcp_md5sig_mutex);
+ }
+ return tcp_md5sig_pool_populated;
+}
+EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
+
+
+/**
+ * tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ * We use percpu structure, so if we succeed, we exit with preemption
+ * and BH disabled, to make sure another thread or softirq handling
+ * wont try to get same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+{
+ local_bh_disable();
+
+ if (tcp_md5sig_pool_populated) {
+ /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+ smp_rmb();
+ return this_cpu_ptr(&tcp_md5sig_pool);
+ }
+ local_bh_enable();
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
+
+int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
+ const struct tcphdr *th)
+{
+ struct scatterlist sg;
+ struct tcphdr hdr;
+ int err;
+
+ /* We are not allowed to change tcphdr, make a local copy */
+ memcpy(&hdr, th, sizeof(hdr));
+ hdr.check = 0;
+
+ /* options aren't included in the hash */
+ sg_init_one(&sg, &hdr, sizeof(hdr));
+ err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
+ return err;
+}
+EXPORT_SYMBOL(tcp_md5_hash_header);
+
+int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
+ const struct sk_buff *skb, unsigned int header_len)
+{
+ struct scatterlist sg;
+ const struct tcphdr *tp = tcp_hdr(skb);
+ struct hash_desc *desc = &hp->md5_desc;
+ unsigned int i;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+
+ sg_init_table(&sg, 1);
+
+ sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
+ if (crypto_hash_update(desc, &sg, head_data_len))
+ return 1;
+
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const struct skb_frag_struct *f = &shi->frags[i];
+ unsigned int offset = f->page_offset;
+ struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+
+ sg_set_page(&sg, page, skb_frag_size(f),
+ offset_in_page(offset));
+ if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
+ return 1;
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_hash_skb_data);
+
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, key->key, key->keylen);
+ return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
+}
+EXPORT_SYMBOL(tcp_md5_hash_key);
+
+#endif
+
+void tcp_done(struct sock *sk)
+{
+ struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
+ if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_clear_xmit_timers(sk);
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ else
+ inet_csk_destroy_sock(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_done);
+
+extern struct tcp_congestion_ops tcp_reno;
+
+static __initdata unsigned long thash_entries;
+static int __init set_thash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &thash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+static void __init tcp_init_mem(void)
+{
+ unsigned long limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_tcp_mem[0] = limit / 4 * 3;
+ sysctl_tcp_mem[1] = limit;
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+}
+
+void __init tcp_init(void)
+{
+ unsigned long limit;
+ int max_rshare, max_wshare, cnt;
+ unsigned int i;
+
+ sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
+
+ percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+ percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ tcp_hashinfo.ehash =
+ alloc_large_system_hash("TCP established",
+ sizeof(struct inet_ehash_bucket),
+ thash_entries,
+ 17, /* one slot per 128 KB of memory */
+ 0,
+ NULL,
+ &tcp_hashinfo.ehash_mask,
+ 0,
+ thash_entries ? 0 : 512 * 1024);
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+
+ if (inet_ehash_locks_alloc(&tcp_hashinfo))
+ panic("TCP: failed to alloc ehash_locks");
+ tcp_hashinfo.bhash =
+ alloc_large_system_hash("TCP bind",
+ sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_mask + 1,
+ 17, /* one slot per 128 KB of memory */
+ 0,
+ &tcp_hashinfo.bhash_size,
+ NULL,
+ 0,
+ 64 * 1024);
+ tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+ for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ }
+
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+
+ tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+ sysctl_tcp_max_orphans = cnt / 2;
+ sysctl_max_syn_backlog = max(128, cnt / 256);
+
+ tcp_init_mem();
+ /* Set per-socket limits to no more than 1/128 the pressure threshold */
+ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
+ max_wshare = min(4UL*1024*1024, limit);
+ max_rshare = min(6UL*1024*1024, limit);
+
+ sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ sysctl_tcp_wmem[1] = 16*1024;
+ sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+
+ sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+ sysctl_tcp_rmem[1] = 87380;
+ sysctl_tcp_rmem[2] = max(87380, max_rshare);
+
+ pr_info("Hash tables configured (established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+
+ tcp_metrics_init();
+ BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+ tcp_tasklet_init();
+}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000..c037644ea
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,239 @@
+/*
+ * Binary Increase Congestion control for TCP
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
+ * "Binary Increase Congestion Control for Fast, Long Distance
+ * Networks" in InfoComm 2004
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+
+static int fast_convergence = 1;
+static int max_increment = 16;
+static int low_window = 14;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh;
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 epoch_start; /* beginning of an epoch */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
+ if (initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) /* record the beginning of an epoch */
+ ca->epoch_start = tcp_time_stamp;
+
+ /* start off normal */
+ if (cwnd <= low_window) {
+ ca->cnt = cwnd;
+ return;
+ }
+
+ /* binary increase */
+ if (cwnd < ca->last_max_cwnd) {
+ __u32 dist = (ca->last_max_cwnd - cwnd)
+ / BICTCP_B;
+
+ if (dist > max_increment)
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ else if (dist <= 1U)
+ /* binary search increase */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else
+ /* binary search increase */
+ ca->cnt = cwnd / dist;
+ } else {
+ /* slow start AMD linear increase */
+ if (cwnd < ca->last_max_cwnd + BICTCP_B)
+ /* slow start */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+ /* slow start */
+ ca->cnt = (cwnd * (BICTCP_B-1))
+ / (cwnd - ca->last_max_cwnd);
+ else
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ }
+
+ /* if in slow start or link utilization is very low */
+ if (ca->last_max_cwnd == 0) {
+ if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+ ca->cnt = 20;
+ }
+
+ ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+ if (ca->cnt == 0) /* cannot be zero */
+ ca->cnt = 1;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ bictcp_update(ca, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ }
+}
+
+/*
+ * behave like Reno until low_window is reached,
+ * then increase congestion window slowly
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ if (tp->snd_cwnd <= low_window)
+ return max(tp->snd_cwnd >> 1U, 2U);
+ else
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tp->snd_cwnd, ca->loss_cwnd);
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss)
+ bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ca->delayed_ack += cnt;
+ }
+}
+
+static struct tcp_congestion_ops bictcp __read_mostly = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "bic",
+};
+
+static int __init bictcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&bictcp);
+}
+
+static void __exit bictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&bictcp);
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000..84be008c9
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,441 @@
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/jhash.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+
+ return NULL;
+}
+
+/* Must be called with rcu lock held */
+static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name)
+{
+ const struct tcp_congestion_ops *ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("tcp_%s", name);
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ }
+#endif
+ return ca;
+}
+
+/* Simple linear search, not much in here. */
+struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (e->key == key)
+ return e;
+ }
+
+ return NULL;
+}
+
+/*
+ * Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret = 0;
+
+ /* all algorithms must implement ssthresh and cong_avoid ops */
+ if (!ca->ssthresh || !ca->cong_avoid) {
+ pr_err("%s does not implement required ops\n", ca->name);
+ return -EINVAL;
+ }
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
+ pr_notice("%s already registered or non-unique key\n",
+ ca->name);
+ ret = -EEXIST;
+ } else {
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ pr_debug("%s registered\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function. Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+ spin_lock(&tcp_cong_list_lock);
+ list_del_rcu(&ca->list);
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module gets removed entirely.
+ *
+ * A try_module_get() should fail by now as our module is
+ * in "going" state since no refs are held anymore and
+ * module_exit() handler being called.
+ */
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+u32 tcp_ca_get_key_by_name(const char *name)
+{
+ const struct tcp_congestion_ops *ca;
+ u32 key;
+
+ might_sleep();
+
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ key = ca ? ca->key : TCP_CA_UNSPEC;
+ rcu_read_unlock();
+
+ return key;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
+
+char *tcp_ca_get_name_by_key(u32 key, char *buffer)
+{
+ const struct tcp_congestion_ops *ca;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(key);
+ if (ca)
+ ret = strncpy(buffer, ca->name,
+ TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
+
+/* Assign choice of congestion control. */
+void tcp_assign_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (likely(try_module_get(ca->owner))) {
+ icsk->icsk_ca_ops = ca;
+ goto out;
+ }
+ /* Fallback to next available. The last really
+ * guaranteed fallback is Reno from this list.
+ */
+ }
+out:
+ rcu_read_unlock();
+
+ /* Clear out private data before diag gets it and
+ * the ca has not been initialized.
+ */
+ if (ca->get_info)
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+void tcp_init_congestion_control(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
+static void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ icsk->icsk_ca_setsockopt = 1;
+
+ if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
+/* Manage refcounts on socket close. */
+void tcp_cleanup_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ module_put(icsk->icsk_ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control */
+int tcp_set_default_congestion_control(const char *name)
+{
+ struct tcp_congestion_ops *ca;
+ int ret = -ENOENT;
+
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+ if (!ca && capable(CAP_NET_ADMIN)) {
+ spin_unlock(&tcp_cong_list_lock);
+
+ request_module("tcp_%s", name);
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+ }
+#endif
+
+ if (ca) {
+ ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
+ list_move(&ca->list, &tcp_cong_list);
+ ret = 0;
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+
+/* Set default value from kernel configuration at bootup */
+static int __init tcp_congestion_default(void)
+{
+ return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+}
+late_initcall(tcp_congestion_default);
+
+/* Build string with list of available congestion control values */
+void tcp_get_available_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+ }
+ rcu_read_unlock();
+}
+
+/* Get current default congestion control */
+void tcp_get_default_congestion_control(char *name)
+{
+ struct tcp_congestion_ops *ca;
+ /* We will always have reno... */
+ BUG_ON(list_empty(&tcp_cong_list));
+
+ rcu_read_lock();
+ ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+ strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+}
+
+/* Built list of non-restricted congestion control values */
+void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
+{
+ struct tcp_congestion_ops *ca;
+ size_t offs = 0;
+
+ *buf = '\0';
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
+ continue;
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ca->name);
+ }
+ rcu_read_unlock();
+}
+
+/* Change list of non-restricted congestion control */
+int tcp_set_allowed_congestion_control(char *val)
+{
+ struct tcp_congestion_ops *ca;
+ char *saved_clone, *clone, *name;
+ int ret = 0;
+
+ saved_clone = clone = kstrdup(val, GFP_USER);
+ if (!clone)
+ return -ENOMEM;
+
+ spin_lock(&tcp_cong_list_lock);
+ /* pass 1 check for bad entries */
+ while ((name = strsep(&clone, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ if (!ca) {
+ ret = -ENOENT;
+ goto out;
+ }
+ }
+
+ /* pass 2 clear old values */
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list)
+ ca->flags &= ~TCP_CONG_NON_RESTRICTED;
+
+ /* pass 3 mark as allowed */
+ while ((name = strsep(&val, " ")) && *name) {
+ ca = tcp_ca_find(name);
+ WARN_ON(!ca);
+ if (ca)
+ ca->flags |= TCP_CONG_NON_RESTRICTED;
+ }
+out:
+ spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
+
+ return ret;
+}
+
+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct sock *sk, const char *name)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ int err = 0;
+
+ if (icsk->icsk_ca_dst_locked)
+ return -EPERM;
+
+ rcu_read_lock();
+ ca = __tcp_ca_find_autoload(name);
+ /* No change asking for existing value */
+ if (ca == icsk->icsk_ca_ops) {
+ icsk->icsk_ca_setsockopt = 1;
+ goto out;
+ }
+ if (!ca)
+ err = -ENOENT;
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
+ err = -EPERM;
+ else if (!try_module_get(ca->owner))
+ err = -EBUSY;
+ else
+ tcp_reinit_congestion_control(sk, ca);
+ out:
+ rcu_read_unlock();
+ return err;
+}
+
+/* Slow start is used when congestion window is no greater than the slow start
+ * threshold. We base on RFC2581 and also handle stretch ACKs properly.
+ * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
+ * something better;) a packet is only considered (s)acked in its entirety to
+ * defend the ACK attacks described in the RFC. Slow start processes a stretch
+ * ACK of degree N as if N acks of degree 1 are received back to back except
+ * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
+ * returns the leftover acks to adjust cwnd in congestion avoidance mode.
+ */
+u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+{
+ u32 cwnd = tp->snd_cwnd + acked;
+
+ if (cwnd > tp->snd_ssthresh)
+ cwnd = tp->snd_ssthresh + 1;
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+ return acked;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
+ * for every packet that was ACKed.
+ */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+{
+ /* If credits accumulated at a higher w, apply them gently now. */
+ if (tp->snd_cwnd_cnt >= w) {
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd++;
+ }
+
+ tp->snd_cwnd_cnt += acked;
+ if (tp->snd_cwnd_cnt >= w) {
+ u32 delta = tp->snd_cwnd_cnt / w;
+
+ tp->snd_cwnd_cnt -= delta * w;
+ tp->snd_cwnd += delta;
+ }
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ /* In dangerous area, increase slowly. */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Slow start threshold is half the congestion window (min 2) */
+u32 tcp_reno_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd >> 1U, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+struct tcp_congestion_ops tcp_reno = {
+ .flags = TCP_CONG_NON_RESTRICTED,
+ .name = "reno",
+ .owner = THIS_MODULE,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+};
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 000000000..06d3d665a
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,504 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of CUBIC TCP in
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ * in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ * Sangtae Ha and Injong Rhee,
+ * "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
+ * Available from:
+ * http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+/* Two methods of hybrid slow start */
+#define HYSTART_ACK_TRAIN 0x1
+#define HYSTART_DELAY 0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES 8
+#define HYSTART_DELAY_MIN (4U<<3)
+#define HYSTART_DELAY_MAX (16U<<3)
+#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+static int fast_convergence __read_mostly = 1;
+static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;
+static int bic_scale __read_mostly = 41;
+static int tcp_friendliness __read_mostly = 1;
+
+static int hystart __read_mostly = 1;
+static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta __read_mostly = 2;
+
+static u32 cube_rtt_scale __read_mostly;
+static u32 beta_scale __read_mostly;
+static u64 cube_factor __read_mostly;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+module_param(hystart, int, 0644);
+MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
+module_param(hystart_detect, int, 0644);
+MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
+ " 1: packet-train 2: delay 3: both packet-train and delay");
+module_param(hystart_low_window, int, 0644);
+MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 bic_origin_point;/* origin point of bic function */
+ u32 bic_K; /* time to origin point
+ from the beginning of the current epoch */
+ u32 delay_min; /* min delay (msec << 3) */
+ u32 epoch_start; /* beginning of an epoch */
+ u32 ack_cnt; /* number of acks */
+ u32 tcp_cwnd; /* estimated tcp cwnd */
+ u16 unused;
+ u8 sample_cnt; /* number of samples to decide curr_rtt */
+ u8 found; /* the exit point is found? */
+ u32 round_start; /* beginning of each round */
+ u32 end_seq; /* end_seq of the round */
+ u32 last_ack; /* last time when the ACK spacing is close */
+ u32 curr_rtt; /* the minimum rtt of current round */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->bic_origin_point = 0;
+ ca->bic_K = 0;
+ ca->delay_min = 0;
+ ca->epoch_start = 0;
+ ca->ack_cnt = 0;
+ ca->tcp_cwnd = 0;
+ ca->found = 0;
+}
+
+static inline u32 bictcp_clock(void)
+{
+#if HZ < 1000
+ return ktime_to_ms(ktime_get_real());
+#else
+ return jiffies_to_msecs(jiffies);
+#endif
+}
+
+static inline void bictcp_hystart_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->round_start = ca->last_ack = bictcp_clock();
+ ca->end_seq = tp->snd_nxt;
+ ca->curr_rtt = 0;
+ ca->sample_cnt = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+ ca->loss_cwnd = 0;
+
+ if (hystart)
+ bictcp_hystart_reset(sk);
+
+ if (!hystart && initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ */
+static u32 cubic_root(u64 a)
+{
+ u32 x, b, shift;
+ /*
+ * cbrt(x) MSB values for x MSB values in [0..63].
+ * Precomputed then refined by hand - Willy Tarreau
+ *
+ * For x in [0..63],
+ * v = cbrt(x << 18) - 1
+ * cbrt(x) = (v[x] + 10) >> 6
+ */
+ static const u8 v[] = {
+ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
+ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+ };
+
+ b = fls64(a);
+ if (b < 7) {
+ /* a in [0..63] */
+ return ((u32)v[(u32)a] + 35) >> 6;
+ }
+
+ b = ((b * 84) >> 8) - 1;
+ shift = (a >> (b * 3));
+
+ x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
+
+ /*
+ * Newton-Raphson iteration
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k k
+ */
+ x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
+ x = ((x * 341) >> 10);
+ return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
+{
+ u32 delta, bic_target, max_cnt;
+ u64 offs, t;
+
+ ca->ack_cnt += acked; /* count the number of ACKed packets */
+
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ /* The CUBIC function can update ca->cnt at most once per jiffy.
+ * On all cwnd reduction events, ca->epoch_start is set to 0,
+ * which will force a recalculation of ca->cnt.
+ */
+ if (ca->epoch_start && tcp_time_stamp == ca->last_time)
+ goto tcp_friendliness;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) {
+ ca->epoch_start = tcp_time_stamp; /* record beginning */
+ ca->ack_cnt = acked; /* start counting */
+ ca->tcp_cwnd = cwnd; /* syn with cubic */
+
+ if (ca->last_max_cwnd <= cwnd) {
+ ca->bic_K = 0;
+ ca->bic_origin_point = cwnd;
+ } else {
+ /* Compute new K based on
+ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+ */
+ ca->bic_K = cubic_root(cube_factor
+ * (ca->last_max_cwnd - cwnd));
+ ca->bic_origin_point = ca->last_max_cwnd;
+ }
+ }
+
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
+ * (so time^3 is done by using 64 bit)
+ * and without the support of division of 64bit numbers
+ * (so all divisions are done by using 32 bit)
+ * also NOTE the unit of those veriables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
+ * rtt = (srtt >> 3) / HZ
+ * !!! The following code does not have overflow problems,
+ * if the cwnd < 1 million packets !!!
+ */
+
+ t = (s32)(tcp_time_stamp - ca->epoch_start);
+ t += msecs_to_jiffies(ca->delay_min >> 3);
+ /* change the unit from HZ to bictcp_HZ */
+ t <<= BICTCP_HZ;
+ do_div(t, HZ);
+
+ if (t < ca->bic_K) /* t - K */
+ offs = ca->bic_K - t;
+ else
+ offs = t - ca->bic_K;
+
+ /* c/rtt * (t-K)^3 */
+ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
+
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
+ ca->cnt = cwnd / (bic_target - cwnd);
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
+ }
+
+ /*
+ * The initial growth of cubic function may be too conservative
+ * when the available bandwidth is still unknown.
+ */
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+ ca->cnt = 20; /* increase cwnd 5% per RTT */
+
+tcp_friendliness:
+ /* TCP Friendly */
+ if (tcp_friendliness) {
+ u32 scale = beta_scale;
+
+ delta = (cwnd * scale) >> 3;
+ while (ca->ack_cnt > delta) { /* update tcp cwnd */
+ ca->ack_cnt -= delta;
+ ca->tcp_cwnd++;
+ }
+
+ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
+ delta = ca->tcp_cwnd - cwnd;
+ max_cnt = cwnd / delta;
+ if (ca->cnt > max_cnt)
+ ca->cnt = max_cnt;
+ }
+ }
+
+ /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+ * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+ */
+ ca->cnt = max(ca->cnt, 2U);
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (hystart && after(ack, ca->end_seq))
+ bictcp_hystart_reset(sk);
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ bictcp_update(ca, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss) {
+ bictcp_reset(inet_csk_ca(sk));
+ bictcp_hystart_reset(sk);
+ }
+}
+
+static void hystart_update(struct sock *sk, u32 delay)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (ca->found & hystart_detect)
+ return;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ u32 now = bictcp_clock();
+
+ /* first detection parameter - ack-train detection */
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ ca->last_ack = now;
+ if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
+ ca->found |= HYSTART_ACK_TRAIN;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ /* obtain the minimum delay of more than sampling packets */
+ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
+ if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
+
+ ca->sample_cnt++;
+ } else {
+ if (ca->curr_rtt > ca->delay_min +
+ HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
+ ca->found |= HYSTART_DELAY;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 delay;
+
+ /* Some calls are for duplicates without timetamps */
+ if (rtt_us < 0)
+ return;
+
+ /* Discard delay samples right after fast recovery */
+ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+ return;
+
+ delay = (rtt_us << 3) / USEC_PER_MSEC;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+ tp->snd_cwnd >= hystart_low_window)
+ hystart_update(sk, delay);
+}
+
+static struct tcp_congestion_ops cubictcp __read_mostly = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "cubic",
+};
+
+static int __init cubictcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+ /* Precompute a bunch of the scaling factors that are used per-packet
+ * based on SRTT of 100ms
+ */
+
+ beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+ / (BICTCP_BETA_SCALE - beta);
+
+ cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
+
+ /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+ * so K = cubic_root( (wmax-cwnd)*rtt/c )
+ * the unit of K is bictcp_HZ=2^10, not HZ
+ *
+ * c = bic_scale >> 10
+ * rtt = 100ms
+ *
+ * the following code has been designed and tested for
+ * cwnd < 1 million packets
+ * RTT < 100 seconds
+ * HZ < 1,000,00 (corresponding to 10 nano-second)
+ */
+
+ /* 1/c * 2^2*bictcp_HZ * srtt */
+ cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+ /* divide by bic_scale and by constant Srtt (100ms) */
+ do_div(cube_factor, bic_scale * 10);
+
+ return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&cubictcp);
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
new file mode 100644
index 000000000..4c41c1287
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.c
@@ -0,0 +1,345 @@
+/* DataCenter TCP (DCTCP) congestion control.
+ *
+ * http://simula.stanford.edu/~alizade/Site/DCTCP.html
+ *
+ * This is an implementation of DCTCP over Reno, an enhancement to the
+ * TCP congestion control algorithm designed for data centers. DCTCP
+ * leverages Explicit Congestion Notification (ECN) in the network to
+ * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
+ * the following three data center transport requirements:
+ *
+ * - High burst tolerance (incast due to partition/aggregate)
+ * - Low latency (short flows, queries)
+ * - High throughput (continuous data updates, large file transfers)
+ * with commodity shallow buffered switches
+ *
+ * The algorithm is described in detail in the following two papers:
+ *
+ * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
+ * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
+ * "Data Center TCP (DCTCP)", Data Center Networks session
+ * Proc. ACM SIGCOMM, New Delhi, 2010.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ *
+ * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
+ * "Analysis of DCTCP: Stability, Convergence, and Fairness"
+ * Proc. ACM SIGMETRICS, San Jose, 2011.
+ * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
+ *
+ * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <dborkman@redhat.com>
+ * Florian Westphal <fw@strlen.de>
+ * Glenn Judd <glenn.judd@morganstanley.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <net/tcp.h>
+#include <linux/inet_diag.h>
+
+#define DCTCP_MAX_ALPHA 1024U
+
+struct dctcp {
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_snd_una;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state;
+ u32 delayed_ack_reserved;
+};
+
+static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
+module_param(dctcp_shift_g, uint, 0644);
+MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
+
+static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
+module_param(dctcp_alpha_on_init, uint, 0644);
+MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
+
+static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
+module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
+MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
+ "parameter for clamping alpha on loss");
+
+static struct tcp_congestion_ops dctcp_reno;
+
+static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
+{
+ ca->next_seq = tp->snd_nxt;
+
+ ca->acked_bytes_ecn = 0;
+ ca->acked_bytes_total = 0;
+}
+
+static void dctcp_init(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((tp->ecn_flags & TCP_ECN_OK) ||
+ (sk->sk_state == TCP_LISTEN ||
+ sk->sk_state == TCP_CLOSE)) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ ca->prior_snd_una = tp->snd_una;
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+
+ ca->delayed_ack_reserved = 0;
+ ca->ce_state = 0;
+
+ dctcp_reset(tp, ca);
+ return;
+ }
+
+ /* No ECN support? Fall back to Reno. Also need to clear
+ * ECT from sk since it is set during 3WHS for DCTCP.
+ */
+ inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
+ INET_ECN_dontxmit(sk);
+}
+
+static u32 dctcp_ssthresh(struct sock *sk)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+
+static void dctcp_ce_state_0_to_1(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=0 to CE=1 and delayed
+ * ACK has not sent yet.
+ */
+ if (!ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=0. */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_ce_state_1_to_0(struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* State has changed from CE=1 to CE=0 and delayed
+ * ACK has not sent yet.
+ */
+ if (ca->ce_state && ca->delayed_ack_reserved) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+ tmp_rcv_nxt = tp->rcv_nxt;
+
+ /* Generate previous ack with CE=1. */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = ca->prior_rcv_nxt;
+
+ tcp_send_ack(sk);
+
+ /* Recover current rcv_nxt. */
+ tp->rcv_nxt = tmp_rcv_nxt;
+ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void dctcp_update_alpha(struct sock *sk, u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
+
+ /* If ack did not advance snd_una, count dupack as MSS size.
+ * If ack did update window, do not count it at all.
+ */
+ if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ if (acked_bytes) {
+ ca->acked_bytes_total += acked_bytes;
+ ca->prior_snd_una = tp->snd_una;
+
+ if (flags & CA_ACK_ECE)
+ ca->acked_bytes_ecn += acked_bytes;
+ }
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ /* For avoiding denominator == 1. */
+ if (ca->acked_bytes_total == 0)
+ ca->acked_bytes_total = 1;
+
+ /* alpha = (1 - g) * alpha + g * F */
+ ca->dctcp_alpha = ca->dctcp_alpha -
+ (ca->dctcp_alpha >> dctcp_shift_g) +
+ (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
+ ca->acked_bytes_total;
+
+ if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
+ /* Clamp dctcp_alpha to max. */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+
+ dctcp_reset(tp, ca);
+ }
+}
+
+static void dctcp_state(struct sock *sk, u8 new_state)
+{
+ if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* If this extension is enabled, we clamp dctcp_alpha to
+ * max on packet loss; the motivation is that dctcp_alpha
+ * is an indicator to the extend of congestion and packet
+ * loss is an indicator of extreme congestion; setting
+ * this in practice turned out to be beneficial, and
+ * effectively assumes total congestion which reduces the
+ * window by half.
+ */
+ ca->dctcp_alpha = DCTCP_MAX_ALPHA;
+ }
+}
+
+static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_DELAYED_ACK:
+ if (!ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 1;
+ break;
+ case CA_EVENT_NON_DELAYED_ACK:
+ if (ca->delayed_ack_reserved)
+ ca->delayed_ack_reserved = 0;
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ dctcp_ce_state_0_to_1(sk);
+ break;
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ce_state_1_to_0(sk);
+ break;
+ case CA_EVENT_DELAYED_ACK:
+ case CA_EVENT_NON_DELAYED_ACK:
+ dctcp_update_ack_reserved(sk, ev);
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Fill it also in case of VEGASINFO due to req struct limits.
+ * We can still correctly retrieve it later.
+ */
+ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
+ ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ memset(info, 0, sizeof(struct tcp_dctcp_info));
+ if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
+ info->dctcp.dctcp_enabled = 1;
+ info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
+ info->dctcp.dctcp_alpha = ca->dctcp_alpha;
+ info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
+ info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
+ }
+
+ *attr = INET_DIAG_DCTCPINFO;
+ return sizeof(*info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops dctcp __read_mostly = {
+ .init = dctcp_init,
+ .in_ack_event = dctcp_update_alpha,
+ .cwnd_event = dctcp_cwnd_event,
+ .ssthresh = dctcp_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .set_state = dctcp_state,
+ .get_info = dctcp_get_info,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .owner = THIS_MODULE,
+ .name = "dctcp",
+};
+
+static struct tcp_congestion_ops dctcp_reno __read_mostly = {
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .get_info = dctcp_get_info,
+ .owner = THIS_MODULE,
+ .name = "dctcp-reno",
+};
+
+static int __init dctcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&dctcp);
+}
+
+static void __exit dctcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&dctcp);
+}
+
+module_init(dctcp_register);
+module_exit(dctcp_unregister);
+
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 000000000..79b34a0f4
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,68 @@
+/*
+ * tcp_diag.c Module for monitoring TCP transport protocols sockets.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_info *info = _info;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ r->idiag_rqueue = sk->sk_ack_backlog;
+ r->idiag_wqueue = sk->sk_max_ack_backlog;
+ } else {
+ r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ }
+ if (info)
+ tcp_get_info(sk, info);
+}
+
+static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+}
+
+static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler tcp_diag_handler = {
+ .dump = tcp_diag_dump,
+ .dump_one = tcp_diag_dump_one,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_type = IPPROTO_TCP,
+};
+
+static int __init tcp_diag_init(void)
+{
+ return inet_diag_register(&tcp_diag_handler);
+}
+
+static void __exit tcp_diag_exit(void)
+{
+ inet_diag_unregister(&tcp_diag_handler);
+}
+
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 000000000..f9c0fb84e
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,313 @@
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+void tcp_fastopen_init_key_once(bool publish)
+{
+ static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+ /* tcp_fastopen_reset_cipher publishes the new context
+ * atomically, so we allow this race happening here.
+ *
+ * All call sites of tcp_fastopen_cookie_gen also check
+ * for a valid cookie, so this is an acceptable risk.
+ */
+ if (net_get_random_once(key, sizeof(key)) && publish)
+ tcp_fastopen_reset_cipher(key, sizeof(key));
+}
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+ struct tcp_fastopen_context *ctx =
+ container_of(head, struct tcp_fastopen_context, rcu);
+ crypto_free_cipher(ctx->tfm);
+ kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+ int err;
+ struct tcp_fastopen_context *ctx, *octx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+ if (IS_ERR(ctx->tfm)) {
+ err = PTR_ERR(ctx->tfm);
+error: kfree(ctx);
+ pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+ return err;
+ }
+ err = crypto_cipher_setkey(ctx->tfm, key, len);
+ if (err) {
+ pr_err("TCP: TFO cipher key error: %d\n", err);
+ crypto_free_cipher(ctx->tfm);
+ goto error;
+ }
+ memcpy(ctx->key, key, len);
+
+ spin_lock(&tcp_fastopen_ctx_lock);
+
+ octx = rcu_dereference_protected(tcp_fastopen_ctx,
+ lockdep_is_held(&tcp_fastopen_ctx_lock));
+ rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+ spin_unlock(&tcp_fastopen_ctx_lock);
+
+ if (octx)
+ call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+ return err;
+}
+
+static bool __tcp_fastopen_cookie_gen(const void *path,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+ bool ok = false;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(tcp_fastopen_ctx);
+ if (ctx) {
+ crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ ok = true;
+ }
+ rcu_read_unlock();
+ return ok;
+}
+
+/* Generate the fastopen cookie by doing aes128 encryption on both
+ * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
+ * addresses. For the longer IPv6 addresses use CBC-MAC.
+ *
+ * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
+ */
+static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ if (req->rsk_ops->family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(syn);
+
+ __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
+ return __tcp_fastopen_cookie_gen(path, foc);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (req->rsk_ops->family == AF_INET6) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(syn);
+ struct tcp_fastopen_cookie tmp;
+
+ if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+ struct in6_addr *buf = (struct in6_addr *) tmp.val;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
+ return __tcp_fastopen_cookie_gen(buf, foc);
+ }
+ }
+#endif
+ return false;
+}
+
+static bool tcp_fastopen_create_child(struct sock *sk,
+ struct sk_buff *skb,
+ struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct tcp_sock *tp;
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+ struct sock *child;
+ u32 end_seq;
+
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
+
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (!child)
+ return false;
+
+ spin_lock(&queue->fastopenq->lock);
+ queue->fastopenq->qlen++;
+ spin_unlock(&queue->fastopenq->lock);
+
+ /* Initialize the child socket. Have to fix some values to take
+ * into account the child is a Fast Open socket and is created
+ * only out of the bits carried in the SYN packet.
+ */
+ tp = tcp_sk(child);
+
+ tp->fastopen_rsk = req;
+ tcp_rsk(req)->tfo_listener = true;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never
+ * scaled. So correct it appropriately.
+ */
+ tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
+
+ /* Activate the retrans timer so that SYNACK can be retransmitted.
+ * The request socket is not added to the SYN table of the parent
+ * because it's been added to the accept queue directly.
+ */
+ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+
+ atomic_set(&req->rsk_refcnt, 1);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, child);
+
+ /* Now finish processing the fastopen child socket. */
+ inet_csk(child)->icsk_af_ops->rebuild_header(child);
+ tcp_init_congestion_control(child);
+ tcp_mtup_init(child);
+ tcp_init_metrics(child);
+ tcp_init_buffer_space(child);
+
+ /* Queue the data carried in the SYN packet. We need to first
+ * bump skb's refcnt because the caller will attempt to free it.
+ * Note that IPv6 might also have used skb_get() trick
+ * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
+ * So we need to eventually get a clone of the packet,
+ * before inserting it in sk_receive_queue.
+ *
+ * XXX (TFO) - we honor a zero-payload TFO request for now,
+ * (any reason not to?) but no need to queue the skb since
+ * there is no data. How about SYN+FIN?
+ */
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+ if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
+ struct sk_buff *skb2;
+
+ if (unlikely(skb_shared(skb)))
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ else
+ skb2 = skb_get(skb);
+
+ if (likely(skb2)) {
+ skb_dst_drop(skb2);
+ __skb_pull(skb2, tcp_hdrlen(skb));
+ skb_set_owner_r(skb2, child);
+ __skb_queue_tail(&child->sk_receive_queue, skb2);
+ tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
+ } else {
+ end_seq = TCP_SKB_CB(skb)->seq + 1;
+ }
+ }
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(child);
+ sock_put(child);
+ WARN_ON(!req->sk);
+ return true;
+}
+
+static bool tcp_fastopen_queue_check(struct sock *sk)
+{
+ struct fastopen_queue *fastopenq;
+
+ /* Make sure the listener has enabled fastopen, and we don't
+ * exceed the max # of pending TFO requests allowed before trying
+ * to validating the cookie in order to avoid burning CPU cycles
+ * unnecessarily.
+ *
+ * XXX (TFO) - The implication of checking the max_qlen before
+ * processing a cookie request is that clients can't differentiate
+ * between qlen overflow causing Fast Open to be disabled
+ * temporarily vs a server not supporting Fast Open at all.
+ */
+ fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
+ if (!fastopenq || fastopenq->max_qlen == 0)
+ return false;
+
+ if (fastopenq->qlen >= fastopenq->max_qlen) {
+ struct request_sock *req1;
+ spin_lock(&fastopenq->lock);
+ req1 = fastopenq->rskq_rst_head;
+ if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
+ spin_unlock(&fastopenq->lock);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ return false;
+ }
+ fastopenq->rskq_rst_head = req1->dl_next;
+ fastopenq->qlen--;
+ spin_unlock(&fastopenq->lock);
+ reqsk_put(req1);
+ }
+ return true;
+}
+
+/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
+ * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
+ * cookie request (foc->len == 0).
+ */
+bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct dst_entry *dst)
+{
+ struct tcp_fastopen_cookie valid_foc = { .len = -1 };
+ bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+
+ if (foc->len == 0) /* Client requests a cookie */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+
+ if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+ (syn_data || foc->len >= 0) &&
+ tcp_fastopen_queue_check(sk))) {
+ foc->len = -1;
+ return false;
+ }
+
+ if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+ goto fastopen;
+
+ if (foc->len >= 0 && /* Client presents or requests a cookie */
+ tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+ foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
+ foc->len == valid_foc.len &&
+ !memcmp(foc->val, valid_foc.val, foc->len)) {
+ /* Cookie is valid. Create a (full) child socket to accept
+ * the data in SYN before returning a SYN-ACK to ack the
+ * data. If we fail to create the socket, fall back and
+ * ack the ISN only but includes the same cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to send
+ * data in SYN_RECV state.
+ */
+fastopen:
+ if (tcp_fastopen_create_child(sk, skb, dst, req)) {
+ foc->len = -1;
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ return true;
+ }
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else if (foc->len > 0) /* Client presents an invalid cookie */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ return false;
+}
+EXPORT_SYMBOL(tcp_try_fastopen);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000..882c08aae
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,185 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* From AIMD tables from RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+ unsigned int cwnd;
+ unsigned int md;
+} hstcp_aimd_vals[] = {
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+ u32 ai;
+};
+
+static void hstcp_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ ca->ai = 0;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ /* Update AIMD parameters.
+ *
+ * We want to guarantee that:
+ * hstcp_aimd_vals[ca->ai-1].cwnd <
+ * snd_cwnd <=
+ * hstcp_aimd_vals[ca->ai].cwnd
+ */
+ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ ca->ai < HSTCP_AIMD_MAX - 1)
+ ca->ai++;
+ } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+ while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+ ca->ai--;
+ }
+
+ /* Do additive increase */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ /* cwnd = cwnd + a(w) / cwnd */
+ tp->snd_cwnd_cnt += ca->ai + 1;
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd_cnt -= tp->snd_cwnd;
+ tp->snd_cwnd++;
+ }
+ }
+ }
+}
+
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ /* Do multiplicative decrease */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
+ .init = hstcp_init,
+ .ssthresh = hstcp_ssthresh,
+ .cong_avoid = hstcp_cong_avoid,
+
+ .owner = THIS_MODULE,
+ .name = "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000..58469fff6
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,317 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ * "H-TCP: TCP for high-speed and long-distance networks"
+ * Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
+#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
+#define BETA_MAX 102 /* 0.8 with shift << 7 */
+
+static int use_rtt_scaling __read_mostly = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch __read_mostly = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+ u32 alpha; /* Fixed point arith, << 7 */
+ u8 beta; /* Fixed point arith, << 7 */
+ u8 modeswitch; /* Delay modeswitch
+ until we had at least one congestion event */
+ u16 pkts_acked;
+ u32 packetcount;
+ u32 minRTT;
+ u32 maxRTT;
+ u32 last_cong; /* Time since last congestion event end */
+ u32 undo_last_cong;
+
+ u32 undo_maxRTT;
+ u32 undo_old_maxB;
+
+ /* Bandwidth estimation */
+ u32 minB;
+ u32 maxB;
+ u32 old_maxB;
+ u32 Bi;
+ u32 lasttime;
+};
+
+static inline u32 htcp_cong_time(const struct htcp *ca)
+{
+ return jiffies - ca->last_cong;
+}
+
+static inline u32 htcp_ccount(const struct htcp *ca)
+{
+ return htcp_cong_time(ca) / ca->minRTT;
+}
+
+static inline void htcp_reset(struct htcp *ca)
+{
+ ca->undo_last_cong = ca->last_cong;
+ ca->undo_maxRTT = ca->maxRTT;
+ ca->undo_old_maxB = ca->old_maxB;
+
+ ca->last_cong = jiffies;
+}
+
+static u32 htcp_cwnd_undo(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = ca->undo_last_cong;
+ ca->maxRTT = ca->undo_maxRTT;
+ ca->old_maxB = ca->undo_old_maxB;
+ ca->undo_last_cong = 0;
+ }
+
+ return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
+}
+
+static inline void measure_rtt(struct sock *sk, u32 srtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ /* keep track of minimum RTT seen so far, minRTT is zero at first */
+ if (ca->minRTT > srtt || !ca->minRTT)
+ ca->minRTT = srtt;
+
+ /* max RTT */
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ if (ca->maxRTT < ca->minRTT)
+ ca->maxRTT = ca->minRTT;
+ if (ca->maxRTT < srtt &&
+ srtt <= ca->maxRTT + msecs_to_jiffies(20))
+ ca->maxRTT = srtt;
+ }
+}
+
+static void measure_achieved_throughput(struct sock *sk,
+ u32 pkts_acked, s32 rtt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+
+ if (icsk->icsk_ca_state == TCP_CA_Open)
+ ca->pkts_acked = pkts_acked;
+
+ if (rtt > 0)
+ measure_rtt(sk, usecs_to_jiffies(rtt));
+
+ if (!use_bandwidth_switch)
+ return;
+
+ /* achieved throughput calculations */
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ return;
+ }
+
+ ca->packetcount += pkts_acked;
+
+ if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ now - ca->lasttime >= ca->minRTT &&
+ ca->minRTT > 0) {
+ __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
+
+ if (htcp_ccount(ca) <= 3) {
+ /* just after backoff */
+ ca->minB = ca->maxB = ca->Bi = cur_Bi;
+ } else {
+ ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
+ if (ca->Bi > ca->maxB)
+ ca->maxB = ca->Bi;
+ if (ca->minB > ca->maxB)
+ ca->minB = ca->maxB;
+ }
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ }
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+ if (use_bandwidth_switch) {
+ u32 maxB = ca->maxB;
+ u32 old_maxB = ca->old_maxB;
+
+ ca->old_maxB = ca->maxB;
+ if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 0;
+ return;
+ }
+ }
+
+ if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) {
+ ca->beta = (minRTT << 7) / maxRTT;
+ if (ca->beta < BETA_MIN)
+ ca->beta = BETA_MIN;
+ else if (ca->beta > BETA_MAX)
+ ca->beta = BETA_MAX;
+ } else {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 1;
+ }
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+ u32 minRTT = ca->minRTT;
+ u32 factor = 1;
+ u32 diff = htcp_cong_time(ca);
+
+ if (diff > HZ) {
+ diff -= HZ;
+ factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
+ }
+
+ if (use_rtt_scaling && minRTT) {
+ u32 scale = (HZ << 3) / (10 * minRTT);
+
+ /* clamping ratio to interval [0.5,10]<<3 */
+ scale = min(max(scale, 1U << 2), 10U << 3);
+ factor = (factor << 3) / scale;
+ if (!factor)
+ factor = 1;
+ }
+
+ ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
+ if (!ca->alpha)
+ ca->alpha = ALPHA_BASE;
+}
+
+/*
+ * After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from a consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 minRTT = ca->minRTT;
+ u32 maxRTT = ca->maxRTT;
+
+ htcp_beta_update(ca, minRTT, maxRTT);
+ htcp_alpha_update(ca);
+
+ /* add slowly fading memory for maxRTT to accommodate routing changes */
+ if (minRTT > 0 && maxRTT > minRTT)
+ ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
+}
+
+static u32 htcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct htcp *ca = inet_csk_ca(sk);
+
+ htcp_param_update(sk);
+ return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+}
+
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else {
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+ */
+ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ htcp_alpha_update(ca);
+ } else
+ tp->snd_cwnd_cnt += ca->pkts_acked;
+
+ ca->pkts_acked = 1;
+ }
+}
+
+static void htcp_init(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+
+ memset(ca, 0, sizeof(struct htcp));
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_MIN;
+ ca->pkts_acked = 1;
+ ca->last_cong = jiffies;
+}
+
+static void htcp_state(struct sock *sk, u8 new_state)
+{
+ switch (new_state) {
+ case TCP_CA_Open:
+ {
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (ca->undo_last_cong) {
+ ca->last_cong = jiffies;
+ ca->undo_last_cong = 0;
+ }
+ }
+ break;
+ case TCP_CA_CWR:
+ case TCP_CA_Recovery:
+ case TCP_CA_Loss:
+ htcp_reset(inet_csk_ca(sk));
+ break;
+ }
+}
+
+static struct tcp_congestion_ops htcp __read_mostly = {
+ .init = htcp_init,
+ .ssthresh = htcp_recalc_ssthresh,
+ .cong_avoid = htcp_cong_avoid,
+ .set_state = htcp_state,
+ .undo_cwnd = htcp_cwnd_undo,
+ .pkts_acked = measure_achieved_throughput,
+ .owner = THIS_MODULE,
+ .name = "htcp",
+};
+
+static int __init htcp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
+ BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+ return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000..f963b274f
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,192 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ * for Heterogeneous Networks",
+ * International Journal on satellite Communications,
+ * September 2004
+ * Daniele Lacamera
+ * root at danielinux.net
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+ bool hybla_en;
+ u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+ u32 rho; /* Rho parameter, integer part */
+ u32 rho2; /* Rho * Rho, integer part */
+ u32 rho_3ls; /* Rho parameter, <<3 */
+ u32 rho2_7ls; /* Rho^2, <<7 */
+ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param (struct sock *sk)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho_3ls = max_t(u32,
+ tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
+ 8U);
+ ca->rho = ca->rho_3ls >> 3;
+ ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+ ca->rho2 = ca->rho2_7ls >> 7;
+}
+
+static void hybla_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho = 0;
+ ca->rho2 = 0;
+ ca->rho_3ls = 0;
+ ca->rho2_7ls = 0;
+ ca->snd_cwnd_cents = 0;
+ ca->hybla_en = true;
+ tp->snd_cwnd = 2;
+ tp->snd_cwnd_clamp = 65535;
+
+ /* 1st Rho measurement based on initial srtt */
+ hybla_recalc_param(sk);
+
+ /* set minimum rtt as this is the 1st ever seen */
+ ca->minrtt_us = tp->srtt_us;
+ tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct sock *sk, u8 ca_state)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+ static const u32 fractions[] = {
+ 128, 139, 152, 165, 181, 197, 215, 234,
+ };
+
+ return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ * o Recalc Hybla parameters if min_rtt has changed
+ * o Give cwnd a new value based on the model proposed
+ * o remember increments <1
+ */
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+ u32 increment, odd, rho_fractions;
+ int is_slowstart = 0;
+
+ /* Recalculate rho only if this srtt is the lowest */
+ if (tp->srtt_us < ca->minrtt_us) {
+ hybla_recalc_param(sk);
+ ca->minrtt_us = tp->srtt_us;
+ }
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (!ca->hybla_en) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ if (ca->rho == 0)
+ hybla_recalc_param(sk);
+
+ rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /*
+ * slow start
+ * INC = 2^RHO - 1
+ * This is done by splitting the rho parameter
+ * into 2 parts: an integer part and a fraction part.
+ * Inrement<<7 is estimated by doing:
+ * [2^(int+fract)]<<7
+ * that is equal to:
+ * (2^int) * [(2^fract) <<7]
+ * 2^int is straightly computed as 1<<int,
+ * while we will use hybla_slowstart_fraction_increment() to
+ * calculate 2^fract in a <<7 value.
+ */
+ is_slowstart = 1;
+ increment = ((1 << min(ca->rho, 16U)) *
+ hybla_fraction(rho_fractions)) - 128;
+ } else {
+ /*
+ * congestion avoidance
+ * INC = RHO^2 / W
+ * as long as increment is estimated as (rho<<7)/window
+ * it already is <<7 and we can easily count its fractions.
+ */
+ increment = ca->rho2_7ls / tp->snd_cwnd;
+ if (increment < 128)
+ tp->snd_cwnd_cnt++;
+ }
+
+ odd = increment % 128;
+ tp->snd_cwnd += increment >> 7;
+ ca->snd_cwnd_cents += odd;
+
+ /* check when fractions goes >=128 and increase cwnd by 1. */
+ while (ca->snd_cwnd_cents >= 128) {
+ tp->snd_cwnd++;
+ ca->snd_cwnd_cents -= 128;
+ tp->snd_cwnd_cnt = 0;
+ }
+ /* check when cwnd has not been incremented for a while */
+ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ }
+ /* clamp down slowstart cwnd to ssthresh value. */
+ if (is_slowstart)
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
+ .init = hybla_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = hybla_cong_avoid,
+ .set_state = hybla_state,
+
+ .owner = THIS_MODULE,
+ .name = "hybla"
+};
+
+static int __init hybla_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 000000000..f71002e4d
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,355 @@
+/*
+ * TCP Illinois congestion control.
+ * Home page:
+ * http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ *
+ * The algorithm is described in:
+ * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
+ * for High-Speed Networks"
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
+ *
+ * Implemented from description in paper and ns-2 simulation.
+ * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <asm/div64.h>
+#include <net/tcp.h>
+
+#define ALPHA_SHIFT 7
+#define ALPHA_SCALE (1u<<ALPHA_SHIFT)
+#define ALPHA_MIN ((3*ALPHA_SCALE)/10) /* ~0.3 */
+#define ALPHA_MAX (10*ALPHA_SCALE) /* 10.0 */
+#define ALPHA_BASE ALPHA_SCALE /* 1.0 */
+#define RTT_MAX (U32_MAX / ALPHA_MAX) /* 3.3 secs */
+
+#define BETA_SHIFT 6
+#define BETA_SCALE (1u<<BETA_SHIFT)
+#define BETA_MIN (BETA_SCALE/8) /* 0.125 */
+#define BETA_MAX (BETA_SCALE/2) /* 0.5 */
+#define BETA_BASE BETA_MAX
+
+static int win_thresh __read_mostly = 15;
+module_param(win_thresh, int, 0);
+MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
+
+static int theta __read_mostly = 5;
+module_param(theta, int, 0);
+MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
+
+/* TCP Illinois Parameters */
+struct illinois {
+ u64 sum_rtt; /* sum of rtt's measured within last rtt */
+ u16 cnt_rtt; /* # of rtts measured within last rtt */
+ u32 base_rtt; /* min of all rtt in usec */
+ u32 max_rtt; /* max of all rtt in usec */
+ u32 end_seq; /* right edge of current RTT */
+ u32 alpha; /* Additive increase */
+ u32 beta; /* Muliplicative decrease */
+ u16 acked; /* # packets acked by current ACK */
+ u8 rtt_above; /* average rtt has gone above threshold */
+ u8 rtt_low; /* # of rtts measurements below threshold */
+};
+
+static void rtt_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->end_seq = tp->snd_nxt;
+ ca->cnt_rtt = 0;
+ ca->sum_rtt = 0;
+
+ /* TODO: age max_rtt? */
+}
+
+static void tcp_illinois_init(struct sock *sk)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->alpha = ALPHA_MAX;
+ ca->beta = BETA_BASE;
+ ca->base_rtt = 0x7fffffff;
+ ca->max_rtt = 0;
+
+ ca->acked = 0;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+
+ rtt_reset(sk);
+}
+
+/* Measure RTT for each ack. */
+static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ ca->acked = pkts_acked;
+
+ /* dup ack, no rtt sample */
+ if (rtt < 0)
+ return;
+
+ /* ignore bogus values, this prevents wraparound in alpha math */
+ if (rtt > RTT_MAX)
+ rtt = RTT_MAX;
+
+ /* keep track of minimum RTT seen so far */
+ if (ca->base_rtt > rtt)
+ ca->base_rtt = rtt;
+
+ /* and max */
+ if (ca->max_rtt < rtt)
+ ca->max_rtt = rtt;
+
+ ++ca->cnt_rtt;
+ ca->sum_rtt += rtt;
+}
+
+/* Maximum queuing delay */
+static inline u32 max_delay(const struct illinois *ca)
+{
+ return ca->max_rtt - ca->base_rtt;
+}
+
+/* Average queuing delay */
+static inline u32 avg_delay(const struct illinois *ca)
+{
+ u64 t = ca->sum_rtt;
+
+ do_div(t, ca->cnt_rtt);
+ return t - ca->base_rtt;
+}
+
+/*
+ * Compute value of alpha used for additive increase.
+ * If small window then use 1.0, equivalent to Reno.
+ *
+ * For larger windows, adjust based on average delay.
+ * A. If average delay is at minimum (we are uncongested),
+ * then use large alpha (10.0) to increase faster.
+ * B. If average delay is at maximum (getting congested)
+ * then use small alpha (0.3)
+ *
+ * The result is a convex window growth curve.
+ */
+static u32 alpha(struct illinois *ca, u32 da, u32 dm)
+{
+ u32 d1 = dm / 100; /* Low threshold */
+
+ if (da <= d1) {
+ /* If never got out of low delay zone, then use max */
+ if (!ca->rtt_above)
+ return ALPHA_MAX;
+
+ /* Wait for 5 good RTT's before allowing alpha to go alpha max.
+ * This prevents one good RTT from causing sudden window increase.
+ */
+ if (++ca->rtt_low < theta)
+ return ca->alpha;
+
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ return ALPHA_MAX;
+ }
+
+ ca->rtt_above = 1;
+
+ /*
+ * Based on:
+ *
+ * (dm - d1) amin amax
+ * k1 = -------------------
+ * amax - amin
+ *
+ * (dm - d1) amin
+ * k2 = ---------------- - d1
+ * amax - amin
+ *
+ * k1
+ * alpha = ----------
+ * k2 + da
+ */
+
+ dm -= d1;
+ da -= d1;
+ return (dm * ALPHA_MAX) /
+ (dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
+}
+
+/*
+ * Beta used for multiplicative decrease.
+ * For small window sizes returns same value as Reno (0.5)
+ *
+ * If delay is small (10% of max) then beta = 1/8
+ * If delay is up to 80% of max then beta = 1/2
+ * In between is a linear function
+ */
+static u32 beta(u32 da, u32 dm)
+{
+ u32 d2, d3;
+
+ d2 = dm / 10;
+ if (da <= d2)
+ return BETA_MIN;
+
+ d3 = (8 * dm) / 10;
+ if (da >= d3 || d3 <= d2)
+ return BETA_MAX;
+
+ /*
+ * Based on:
+ *
+ * bmin d3 - bmax d2
+ * k3 = -------------------
+ * d3 - d2
+ *
+ * bmax - bmin
+ * k4 = -------------
+ * d3 - d2
+ *
+ * b = k3 + k4 da
+ */
+ return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
+ / (d3 - d2);
+}
+
+/* Update alpha and beta values once per RTT */
+static void update_params(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (tp->snd_cwnd < win_thresh) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ } else if (ca->cnt_rtt > 0) {
+ u32 dm = max_delay(ca);
+ u32 da = avg_delay(ca);
+
+ ca->alpha = alpha(ca, da, dm);
+ ca->beta = beta(da, dm);
+ }
+
+ rtt_reset(sk);
+}
+
+/*
+ * In case of loss, reset to default values
+ */
+static void tcp_illinois_state(struct sock *sk, u8 new_state)
+{
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (new_state == TCP_CA_Loss) {
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_BASE;
+ ca->rtt_low = 0;
+ ca->rtt_above = 0;
+ rtt_reset(sk);
+ }
+}
+
+/*
+ * Increase window in response to successful acknowledgment.
+ */
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ if (after(ack, ca->end_seq))
+ update_params(sk);
+
+ /* RFC2861 only increase cwnd if fully utilized */
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In slow start */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+
+ else {
+ u32 delta;
+
+ /* snd_cwnd_cnt is # of packets since last cwnd increment */
+ tp->snd_cwnd_cnt += ca->acked;
+ ca->acked = 1;
+
+ /* This is close approximation of:
+ * tp->snd_cwnd += alpha/tp->snd_cwnd
+ */
+ delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
+ if (delta >= tp->snd_cwnd) {
+ tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
+ (u32)tp->snd_cwnd_clamp);
+ tp->snd_cwnd_cnt = 0;
+ }
+ }
+}
+
+static u32 tcp_illinois_ssthresh(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct illinois *ca = inet_csk_ca(sk);
+
+ /* Multiplicative decrease */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static size_t tcp_illinois_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct illinois *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = ca->cnt_rtt;
+ info->vegas.tcpv_minrtt = ca->base_rtt;
+ info->vegas.tcpv_rtt = 0;
+
+ if (info->vegas.tcpv_rttcnt > 0) {
+ u64 t = ca->sum_rtt;
+
+ do_div(t, info->vegas.tcpv_rttcnt);
+ info->vegas.tcpv_rtt = t;
+ }
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
+ .init = tcp_illinois_init,
+ .ssthresh = tcp_illinois_ssthresh,
+ .cong_avoid = tcp_illinois_cong_avoid,
+ .set_state = tcp_illinois_state,
+ .get_info = tcp_illinois_info,
+ .pkts_acked = tcp_illinois_acked,
+
+ .owner = THIS_MODULE,
+ .name = "illinois",
+};
+
+static int __init tcp_illinois_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_illinois);
+}
+
+static void __exit tcp_illinois_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_illinois);
+}
+
+module_init(tcp_illinois_register);
+module_exit(tcp_illinois_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Illinois");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 000000000..8e5d1bcbd
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,6299 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ * Pedro Roque : Fast Retransmit/Recovery.
+ * Two receive queues.
+ * Retransmit queue handled by TCP.
+ * Better retransmit timer handling.
+ * New congestion avoidance.
+ * Header prediction.
+ * Variable renaming.
+ *
+ * Eric : Fast Retransmit.
+ * Randy Scott : MSS option defines.
+ * Eric Schenk : Fixes to slow start algorithm.
+ * Eric Schenk : Yet another double ACK bug.
+ * Eric Schenk : Delayed ACK bug fixes.
+ * Eric Schenk : Floyd style fast retrans war avoidance.
+ * David S. Miller : Don't allow zero congestion window.
+ * Eric Schenk : Fix retransmitter so that it sends
+ * next packet on ack of previous packet.
+ * Andi Kleen : Moved open_request checking here
+ * and process RSTs for open_requests.
+ * Andi Kleen : Better prune_queue, and other fixes.
+ * Andrey Savochkin: Fix RTT measurements in the presence of
+ * timestamps.
+ * Andrey Savochkin: Check sequence numbers correctly when
+ * removing SACKs due to in sequence incoming
+ * data segments.
+ * Andi Kleen: Make sure we never ack data there is not
+ * enough room for. Also make this condition
+ * a fatal error if it might still happen.
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * connections with MSS<min(MTU,ann. MSS)
+ * work without delayed acks.
+ * Andi Kleen: Process packets with PSH set in the
+ * fast path.
+ * J Hadi Salim: ECN support
+ * Andrei Gurtov,
+ * Pasi Sarolahti,
+ * Panu Kuhlberg: Experimental audit of TCP (re)transmission
+ * engine. Lots of bugs are found.
+ * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/kernel.h>
+#include <linux/prefetch.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+#include <linux/errqueue.h>
+
+int sysctl_tcp_timestamps __read_mostly = 1;
+#ifdef CONFIG_TCP_STEALTH
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
+#endif
+int sysctl_tcp_window_scaling __read_mostly = 1;
+int sysctl_tcp_sack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_max_reordering __read_mostly = 300;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
+int sysctl_tcp_dsack __read_mostly = 1;
+int sysctl_tcp_app_win __read_mostly = 31;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challenge_ack_limit = 100;
+
+int sysctl_tcp_stdurg __read_mostly;
+int sysctl_tcp_rfc1337 __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+int sysctl_tcp_frto __read_mostly = 2;
+
+int sysctl_tcp_thin_dupack __read_mostly;
+
+int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+
+#define FLAG_DATA 0x01 /* Incoming frame contained data. */
+#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
+#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
+#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
+#define FLAG_DATA_SACKED 0x20 /* New SACK. */
+#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
+#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+
+#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+/* Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ unsigned int len;
+
+ icsk->icsk_ack.last_seg_size = 0;
+
+ /* skb->len may jitter because of SACKs, even if peer
+ * sends good full-sized frames.
+ */
+ len = skb_shinfo(skb)->gso_size ? : skb->len;
+ if (len >= icsk->icsk_ack.rcv_mss) {
+ icsk->icsk_ack.rcv_mss = len;
+ } else {
+ /* Otherwise, we make more careful check taking into account,
+ * that SACKs block is variable.
+ *
+ * "len" is invariant segment length, including TCP header.
+ */
+ len += skb->data - skb_transport_header(skb);
+ if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
+ /* If PSH is not set, packet should be
+ * full sized, provided peer TCP is not badly broken.
+ * This observation (if it is correct 8)) allows
+ * to handle super-low mtu links fairly.
+ */
+ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+ !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
+ /* Subtract also invariant (if peer is RFC compliant),
+ * tcp header plus fixed timestamp option length.
+ * Resulting "len" is MSS free of SACK jitter.
+ */
+ len -= tcp_sk(sk)->tcp_header_len;
+ icsk->icsk_ack.last_seg_size = len;
+ if (len == lss) {
+ icsk->icsk_ack.rcv_mss = len;
+ return;
+ }
+ }
+ if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ }
+}
+
+static void tcp_incr_quickack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+
+ if (quickacks == 0)
+ quickacks = 2;
+ if (quickacks > icsk->icsk_ack.quick)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+}
+
+static void tcp_enter_quickack_mode(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+}
+
+/* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static inline bool tcp_in_quickack_mode(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+}
+
+static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tcp_hdr(skb)->cwr)
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
+{
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ /* Funny extension: if ECT is not set on a segment,
+ * and we already seen ECT on a previous segment,
+ * it is probably a retransmit.
+ */
+ if (tp->ecn_flags & TCP_ECN_SEEN)
+ tcp_enter_quickack_mode((struct sock *)tp);
+ break;
+ case INET_ECN_CE:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ /* Better not delay acks, sender can have a very low cwnd */
+ tcp_enter_quickack_mode((struct sock *)tp);
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ }
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ default:
+ if (tcp_ca_needs_ecn((struct sock *)tp))
+ tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+ tp->ecn_flags |= TCP_ECN_SEEN;
+ break;
+ }
+}
+
+static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+ if (tp->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(tp, skb);
+}
+
+static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+{
+ if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
+ return true;
+ return false;
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_sndbuf_expand(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int sndmem, per_mss;
+ u32 nr_segs;
+
+ /* Worst case is non GSO/TSO : each frame consumes one skb
+ * and skb->head is kmalloced using power of two area of memory
+ */
+ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+ MAX_TCP_HEADER +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ per_mss = roundup_pow_of_two(per_mss) +
+ SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+
+ /* Fast Recovery (RFC 5681 3.2) :
+ * Cubic needs 1.7 factor, rounded to 2 to include
+ * extra cushion (application might react slowly to POLLOUT)
+ */
+ sndmem = 2 * nr_segs * per_mss;
+
+ if (sk->sk_sndbuf < sndmem)
+ sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All tcp_full_space() is split to two parts: "network" buffer, allocated
+ * forward and advertised in receiver window (tp->rcv_wnd) and
+ * "application buffer", required to isolate scheduling/application
+ * latencies from network.
+ * window_clamp is maximal advertised window. It can be less than
+ * tcp_full_space(), in this case tcp_full_space() - window_clamp
+ * is reserved for "application" buffer. The less window_clamp is
+ * the smoother our behaviour from viewpoint of network, but the lower
+ * throughput and the higher sensitivity of the connection to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ * requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ * of receiver window. Check #2.
+ *
+ * The scheme does not work when sender sends good segments opening
+ * window and then starts to feed us spaghetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
+
+/* Slow part of check#2. */
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ /* Optimize this! */
+ int truesize = tcp_win_from_space(skb->truesize) >> 1;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
+
+ while (tp->rcv_ssthresh <= window) {
+ if (truesize <= skb->len)
+ return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
+
+ truesize >>= 1;
+ window >>= 1;
+ }
+ return 0;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check #1 */
+ if (tp->rcv_ssthresh < tp->window_clamp &&
+ (int)tp->rcv_ssthresh < tcp_space(sk) &&
+ !sk_under_memory_pressure(sk)) {
+ int incr;
+
+ /* Check #2. Increase window, if skb with such overhead
+ * will fit to rcvbuf in future.
+ */
+ if (tcp_win_from_space(skb->truesize) <= skb->len)
+ incr = 2 * tp->advmss;
+ else
+ incr = __tcp_grow_window(sk, skb);
+
+ if (incr) {
+ incr = max_t(int, incr, 2 * skb->len);
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+ tp->window_clamp);
+ inet_csk(sk)->icsk_ack.quick |= 1;
+ }
+ }
+}
+
+/* 3. Tuning rcvbuf, when connection enters established state. */
+static void tcp_fixup_rcvbuf(struct sock *sk)
+{
+ u32 mss = tcp_sk(sk)->advmss;
+ int rcvmem;
+
+ rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
+ tcp_default_init_rwnd(mss);
+
+ /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
+ * Allow enough cushion so that sender is not limited by our window
+ */
+ if (sysctl_tcp_moderate_rcvbuf)
+ rcvmem <<= 2;
+
+ if (sk->sk_rcvbuf < rcvmem)
+ sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
+}
+
+/* 4. Try to fixup all. It is made immediately after connection enters
+ * established state.
+ */
+void tcp_init_buffer_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int maxwin;
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ tcp_fixup_rcvbuf(sk);
+ if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
+ tcp_sndbuf_expand(sk);
+
+ tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.time = tcp_time_stamp;
+ tp->rcvq_space.seq = tp->copied_seq;
+
+ maxwin = tcp_full_space(sk);
+
+ if (tp->window_clamp >= maxwin) {
+ tp->window_clamp = maxwin;
+
+ if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+ tp->window_clamp = max(maxwin -
+ (maxwin >> sysctl_tcp_app_win),
+ 4 * tp->advmss);
+ }
+
+ /* Force reservation of one segment. */
+ if (sysctl_tcp_app_win &&
+ tp->window_clamp > 2 * tp->advmss &&
+ tp->window_clamp + tp->advmss > maxwin)
+ tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* 5. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_ack.quick = 0;
+
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !sk_under_memory_pressure(sk) &&
+ sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
+ }
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
+}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is an our guess about MSS used by the peer.
+ * We haven't any direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACKing less frequently than needed.
+ * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+ hint = min(hint, tp->rcv_wnd / 2);
+ hint = min(hint, TCP_MSS_DEFAULT);
+ hint = max(hint, TCP_MIN_MSS);
+
+ inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ *
+ * More detail on this code can be found at
+ * <http://staff.psc.edu/jheffner/>,
+ * though this reference is out of date. A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+ u32 new_sample = tp->rcv_rtt_est.rtt;
+ long m = sample;
+
+ if (m == 0)
+ m = 1;
+
+ if (new_sample != 0) {
+ /* If we sample in larger samples in the non-timestamp
+ * case, we could grossly overestimate the RTT especially
+ * with chatty applications or bulk transfer apps which
+ * are stalled on filesystem I/O.
+ *
+ * Also, since we are only going for a minimum in the
+ * non-timestamp case, we do not smooth things out
+ * else with timestamps disabled convergence takes too
+ * long.
+ */
+ if (!win_dep) {
+ m -= (new_sample >> 3);
+ new_sample += m;
+ } else {
+ m <<= 3;
+ if (m < new_sample)
+ new_sample = m;
+ }
+ } else {
+ /* No previous measure. */
+ new_sample = m << 3;
+ }
+
+ if (tp->rcv_rtt_est.rtt != new_sample)
+ tp->rcv_rtt_est.rtt = new_sample;
+}
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+ if (tp->rcv_rtt_est.time == 0)
+ goto new_measure;
+ if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+ return;
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
+
+new_measure:
+ tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+ tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tp->rx_opt.rcv_tsecr &&
+ (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
+ tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int time;
+ int copied;
+
+ time = tcp_time_stamp - tp->rcvq_space.time;
+ if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+ return;
+
+ /* Number of bytes copied to user in last RTT */
+ copied = tp->copied_seq - tp->rcvq_space.seq;
+ if (copied <= tp->rcvq_space.space)
+ goto new_measure;
+
+ /* A bit of theory :
+ * copied = bytes received in previous RTT, our base window
+ * To cope with packet losses, we need a 2x factor
+ * To cope with slow start, and sender growing its cwin by 100 %
+ * every RTT, we need a 4x factor, because the ACK we are sending
+ * now is for the next RTT, not the current one :
+ * <prev RTT . ><current RTT .. ><next RTT .... >
+ */
+
+ if (sysctl_tcp_moderate_rcvbuf &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvwin, rcvmem, rcvbuf;
+
+ /* minimal window to cope with packet losses, assuming
+ * steady state. Add some cushion because of small variations.
+ */
+ rcvwin = (copied << 1) + 16 * tp->advmss;
+
+ /* If rate increased by 25%,
+ * assume slow start, rcvwin = 3 * copied
+ * If rate increased by 50%,
+ * assume sender can use 2x growth, rcvwin = 4 * copied
+ */
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
+ if (copied >=
+ tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
+ rcvwin <<= 1;
+ else
+ rcvwin += (rcvwin >> 1);
+ }
+
+ rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+ while (tcp_win_from_space(rcvmem) < tp->advmss)
+ rcvmem += 128;
+
+ rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ sk->sk_rcvbuf = rcvbuf;
+
+ /* Make the window clamp follow along. */
+ tp->window_clamp = rcvwin;
+ }
+ }
+ tp->rcvq_space.space = copied;
+
+new_measure:
+ tp->rcvq_space.seq = tp->copied_seq;
+ tp->rcvq_space.time = tcp_time_stamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. The means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 now;
+
+ inet_csk_schedule_ack(sk);
+
+ tcp_measure_rcv_mss(sk, skb);
+
+ tcp_rcv_rtt_measure(tp);
+
+ now = tcp_time_stamp;
+
+ if (!icsk->icsk_ack.ato) {
+ /* The _first_ data packet received, initialize
+ * delayed ACK engine.
+ */
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ } else {
+ int m = now - icsk->icsk_ack.lrcvtime;
+
+ if (m <= TCP_ATO_MIN / 2) {
+ /* The fastest case is the first. */
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+ } else if (m < icsk->icsk_ack.ato) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+ if (icsk->icsk_ack.ato > icsk->icsk_rto)
+ icsk->icsk_ack.ato = icsk->icsk_rto;
+ } else if (m > icsk->icsk_rto) {
+ /* Too long gap. Apparently sender failed to
+ * restart window, so that we send ACKs quickly.
+ */
+ tcp_incr_quickack(sk);
+ sk_mem_reclaim(sk);
+ }
+ }
+ icsk->icsk_ack.lrcvtime = now;
+
+ tcp_ecn_check_ce(tp, skb);
+
+ if (skb->len >= 128)
+ tcp_grow_window(sk, skb);
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ long m = mrtt_us; /* RTT */
+ u32 srtt = tp->srtt_us;
+
+ /* The following amusing code comes from Jacobson's
+ * article in SIGCOMM '88. Note that rtt and mdev
+ * are scaled versions of rtt and mean deviation.
+ * This is designed to be as fast as possible
+ * m stands for "measurement".
+ *
+ * On a 1990 paper the rto value is changed to:
+ * RTO = rtt + 4 * mdev
+ *
+ * Funny. This algorithm seems to be very broken.
+ * These formulae increase RTO, when it should be decreased, increase
+ * too slowly, when it should be increased quickly, decrease too quickly
+ * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
+ * does not matter how to _calculate_ it. Seems, it was trap
+ * that VJ failed to avoid. 8)
+ */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (m < 0) {
+ m = -m; /* m is now abs(error) */
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
+ /* This is similar to one of Eifel findings.
+ * Eifel blocks mdev updates when rtt decreases.
+ * This solution is a bit different: we use finer gain
+ * for mdev in this case (alpha*beta).
+ * Like Eifel it also prevents growth of rto,
+ * but also it limits too fast rto decreases,
+ * happening in pure Eifel.
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (tp->mdev_us >> 2); /* similar update on mdev */
+ }
+ tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (tp->mdev_us > tp->mdev_max_us) {
+ tp->mdev_max_us = tp->mdev_us;
+ if (tp->mdev_max_us > tp->rttvar_us)
+ tp->rttvar_us = tp->mdev_max_us;
+ }
+ if (after(tp->snd_una, tp->rtt_seq)) {
+ if (tp->mdev_max_us < tp->rttvar_us)
+ tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
+ tp->rtt_seq = tp->snd_nxt;
+ tp->mdev_max_us = tcp_rto_min_us(sk);
+ }
+ } else {
+ /* no previous measure. */
+ srtt = m << 3; /* take the measured time to be rtt */
+ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
+ tp->mdev_max_us = tp->rttvar_us;
+ tp->rtt_seq = tp->snd_nxt;
+ }
+ tp->srtt_us = max(1U, srtt);
+}
+
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ if (likely(tp->srtt_us))
+ do_div(rate, tp->srtt_us);
+
+ /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
+ * without any lock. We want to make sure compiler wont store
+ * intermediate values in this location.
+ */
+ ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
+ sk->sk_max_pacing_rate);
+}
+
+/* Calculate rto without backoff. This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static void tcp_set_rto(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ /* Old crap is replaced with new one. 8)
+ *
+ * More seriously:
+ * 1. If rtt variance happened to be less 50msec, it is hallucination.
+ * It cannot be less due to utterly erratic ACK generation made
+ * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+ * to do with delayed acks, because at cwnd>2 true delack timeout
+ * is invisible. Actually, Linux-2.4 also generates erratic
+ * ACKs in some circumstances.
+ */
+ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
+
+ /* 2. Fixups made earlier cannot be right.
+ * If we do not estimate RTO correctly without them,
+ * all the algo is pure shit and should be replaced
+ * with correct one. It is exactly, which we pretend to do.
+ */
+
+ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+ * guarantees that rto is higher.
+ */
+ tcp_bound_rto(sk);
+}
+
+__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
+{
+ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+ if (!cwnd)
+ cwnd = TCP_INIT_CWND;
+ return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+}
+
+/*
+ * Packet counting of FACK is based on in-order assumptions, therefore TCP
+ * disables it when reordering is detected
+ */
+void tcp_disable_fack(struct tcp_sock *tp)
+{
+ /* RFC3517 uses different metric in lost marker => reset on change */
+ if (tcp_is_fack(tp))
+ tp->lost_skb_hint = NULL;
+ tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
+}
+
+/* Take a notice that peer is sending D-SACKs */
+static void tcp_dsack_seen(struct tcp_sock *tp)
+{
+ tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+}
+
+static void tcp_update_reordering(struct sock *sk, const int metric,
+ const int ts)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (metric > tp->reordering) {
+ int mib_idx;
+
+ tp->reordering = min(sysctl_tcp_max_reordering, metric);
+
+ /* This exciting event is worth to be remembered. 8) */
+ if (ts)
+ mib_idx = LINUX_MIB_TCPTSREORDER;
+ else if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENOREORDER;
+ else if (tcp_is_fack(tp))
+ mib_idx = LINUX_MIB_TCPFACKREORDER;
+ else
+ mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+#if FASTRETRANS_DEBUG > 1
+ pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+ tp->reordering,
+ tp->fackets_out,
+ tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+ tcp_disable_fack(tp);
+ }
+
+ if (metric > 0)
+ tcp_disable_early_retrans(tp);
+}
+
+/* This must be called before lost_out is incremented */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (!tp->retransmit_skb_hint ||
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ tp->retransmit_skb_hint = skb;
+
+ if (!tp->lost_out ||
+ after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+ tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+}
+
+static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tcp_verify_retransmit_hint(tp, skb);
+
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ }
+}
+
+static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ tcp_verify_retransmit_hint(tp, skb);
+
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ }
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag InFlight Description
+ * 0 1 - orig segment is in flight.
+ * S 0 - nothing flies, orig reached receiver.
+ * L 0 - nothing flies, orig lost by net.
+ * R 2 - both orig and retransmit are in flight.
+ * L|R 1 - orig is lost, retransmit is in flight.
+ * S|R 1 - orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ * but it is equivalent to plain S and code short-curcuits it to S.
+ * L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of two flavors:
+ * A. Scoreboard estimator decided the packet is lost.
+ * A'. Reno "three dupacks" marks head of queue lost.
+ * A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking SND.NXT at the moment, when the
+ * segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note, that state diagram turns out to be commutative,
+ * so that we are allowed not to be bothered by order of our actions,
+ * when multiple events arrive simultaneously. (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * Reordering metric is maximal distance, which a packet can be displaced
+ * in packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ * ever retransmitted -> reordering. Alas, we cannot use it
+ * when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ * for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself reporting
+ * SACK reneging when it should advance SND.UNA. Such SACK block this is
+ * perfectly valid, however, in light of RFC2018 which explicitly states
+ * that "SACK block MUST reflect the newest segment. Even if the newest
+ * segment is going to be discarded ...", not that it looks very clever
+ * in case of head skb. Due to potentional receiver driven attacks, we
+ * choose to avoid immediate execution of a walk in write queue due to
+ * reneging and defer head skb's loss recovery to standard loss recovery
+ * procedure that will eventually trigger (nothing forbids us doing this).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ * <- outs wnd -> <- wrapzone ->
+ * u e n u_w e_w s n_w
+ * | | | | | | |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->| |<--------...
+ * ...---- >2^31 ------>| |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, D-SACK block must not to go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in anyway and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
+ */
+static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
+ u32 start_seq, u32 end_seq)
+{
+ /* Too far in future, or reversed (interpretation is ambiguous) */
+ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+ return false;
+
+ /* Nasty start_seq wrap-around check (see comments above) */
+ if (!before(start_seq, tp->snd_nxt))
+ return false;
+
+ /* In outstanding window? ...This is valid exit for D-SACKs too.
+ * start_seq == snd_una is non-sensical (see comments above)
+ */
+ if (after(start_seq, tp->snd_una))
+ return true;
+
+ if (!is_dsack || !tp->undo_marker)
+ return false;
+
+ /* ...Then it's D-SACK, and must reside below snd_una completely */
+ if (after(end_seq, tp->snd_una))
+ return false;
+
+ if (!before(start_seq, tp->undo_marker))
+ return true;
+
+ /* Too old */
+ if (!after(end_seq, tp->undo_marker))
+ return false;
+
+ /* Undo_marker boundary crossing (overestimates a lot). Known already:
+ * start_seq < undo_marker and end_seq >= undo_marker.
+ */
+ return !before(start_seq, end_seq - tp->max_window);
+}
+
+/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
+ * for reordering! Ugly, but should help.
+ *
+ * Search retransmitted skbs from write_queue that were sent when snd_nxt was
+ * less than what is now known to be received by the other end (derived from
+ * highest SACK block). Also calculate the lowest snd_nxt among the remaining
+ * retransmitted skbs to avoid some costly processing per ACKs.
+ */
+static void tcp_mark_lost_retrans(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt = 0;
+ u32 new_low_seq = tp->snd_nxt;
+ u32 received_upto = tcp_highest_sack_seq(tp);
+
+ if (!tcp_is_fack(tp) || !tp->retrans_out ||
+ !after(received_upto, tp->lost_retrans_low) ||
+ icsk->icsk_ca_state != TCP_CA_Recovery)
+ return;
+
+ tcp_for_write_queue(skb, sk) {
+ u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ if (skb == tcp_send_head(sk))
+ break;
+ if (cnt == tp->retrans_out)
+ break;
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ continue;
+
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* TODO: We would like to get rid of tcp_is_fack(tp) only
+ * constraint here (see above) but figuring out that at
+ * least tp->reordering SACK blocks reside between ack_seq
+ * and received_upto is not easy task to do cheaply with
+ * the available datastructures.
+ *
+ * Whether FACK should check here for tp->reordering segs
+ * in-between one could argue for either way (it would be
+ * rather simple to implement as we could count fack_count
+ * during the walk and do tp->fackets_out - fack_count).
+ */
+ if (after(received_upto, ack_seq)) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+ } else {
+ if (before(ack_seq, new_low_seq))
+ new_low_seq = ack_seq;
+ cnt += tcp_skb_pcount(skb);
+ }
+ }
+
+ if (tp->retrans_out)
+ tp->lost_retrans_low = new_low_seq;
+}
+
+static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+ u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
+ bool dup_sack = false;
+
+ if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
+ dup_sack = true;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ } else if (num_sacks > 1) {
+ u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+ u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
+
+ if (!after(end_seq_0, end_seq_1) &&
+ !before(start_seq_0, start_seq_1)) {
+ dup_sack = true;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPDSACKOFORECV);
+ }
+ }
+
+ /* D-SACK for already forgotten data... Do dumb counting. */
+ if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+ !after(end_seq_0, prior_snd_una) &&
+ after(end_seq_0, tp->undo_marker))
+ tp->undo_retrans--;
+
+ return dup_sack;
+}
+
+struct tcp_sacktag_state {
+ int reord;
+ int fack_count;
+ long rtt_us; /* RTT measured by SACKing never-retransmitted data */
+ int flag;
+};
+
+/* Check if skb is fully within the SACK block. In presence of GSO skbs,
+ * the incoming SACK may not exactly match but we can find smaller MSS
+ * aligned portion of it that matches. Therefore we might need to fragment
+ * which may fail and creates some hassle (caller must handle error case
+ * returns).
+ *
+ * FIXME: this could be merged to shift decision code
+ */
+static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq)
+{
+ int err;
+ bool in_sack;
+ unsigned int pkt_len;
+ unsigned int mss;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (tcp_skb_pcount(skb) > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+ mss = tcp_skb_mss(skb);
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack) {
+ pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ pkt_len = mss;
+ } else {
+ pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+ if (pkt_len < mss)
+ return -EINVAL;
+ }
+
+ /* Round if necessary so that SACKs cover only full MSSes
+ * and/or the remaining small portion (if present)
+ */
+ if (pkt_len > mss) {
+ unsigned int new_len = (pkt_len / mss) * mss;
+ if (!in_sack && new_len < pkt_len) {
+ new_len += mss;
+ if (new_len >= skb->len)
+ return 0;
+ }
+ pkt_len = new_len;
+ }
+ err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+ if (err < 0)
+ return err;
+ }
+
+ return in_sack;
+}
+
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+ struct tcp_sacktag_state *state, u8 sacked,
+ u32 start_seq, u32 end_seq,
+ int dup_sack, int pcount,
+ const struct skb_mstamp *xmit_time)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int fack_count = state->fack_count;
+
+ /* Account D-SACK for retransmitted packet. */
+ if (dup_sack && (sacked & TCPCB_RETRANS)) {
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
+ after(end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+ if (sacked & TCPCB_SACKED_ACKED)
+ state->reord = min(fack_count, state->reord);
+ }
+
+ /* Nothing to do; acked frame is about to be dropped (was ACKed). */
+ if (!after(end_seq, tp->snd_una))
+ return sacked;
+
+ if (!(sacked & TCPCB_SACKED_ACKED)) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* If the segment is not tagged as lost,
+ * we do not clear RETRANS, believing
+ * that retransmission is still in flight.
+ */
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ tp->lost_out -= pcount;
+ tp->retrans_out -= pcount;
+ }
+ } else {
+ if (!(sacked & TCPCB_RETRANS)) {
+ /* New sack for not retransmitted frame,
+ * which was in hole. It is reordering.
+ */
+ if (before(start_seq,
+ tcp_highest_sack_seq(tp)))
+ state->reord = min(fack_count,
+ state->reord);
+ if (!after(end_seq, tp->high_seq))
+ state->flag |= FLAG_ORIG_SACK_ACKED;
+ /* Pick the earliest sequence sacked for RTT */
+ if (state->rtt_us < 0) {
+ struct skb_mstamp now;
+
+ skb_mstamp_get(&now);
+ state->rtt_us = skb_mstamp_us_delta(&now,
+ xmit_time);
+ }
+ }
+
+ if (sacked & TCPCB_LOST) {
+ sacked &= ~TCPCB_LOST;
+ tp->lost_out -= pcount;
+ }
+ }
+
+ sacked |= TCPCB_SACKED_ACKED;
+ state->flag |= FLAG_DATA_SACKED;
+ tp->sacked_out += pcount;
+
+ fack_count += pcount;
+
+ /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
+ if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
+ before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
+ tp->lost_cnt_hint += pcount;
+
+ if (fack_count > tp->fackets_out)
+ tp->fackets_out = fack_count;
+ }
+
+ /* D-SACK. We can detect redundant retransmission in S|R and plain R
+ * frames and clear it. undo_retrans is decreased above, L|R frames
+ * are accounted above as well.
+ */
+ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+ sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= pcount;
+ }
+
+ return sacked;
+}
+
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ unsigned int pcount, int shifted, int mss,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+ u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
+ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
+
+ BUG_ON(!pcount);
+
+ /* Adjust counters and hints for the newly sacked sequence
+ * range but discard the return value since prev is already
+ * marked. We must tag the range first because the seq
+ * advancement below implicitly advances
+ * tcp_highest_sack_seq() when skb is highest_sack.
+ */
+ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+ start_seq, end_seq, dup_sack, pcount,
+ &skb->skb_mstamp);
+
+ if (skb == tp->lost_skb_hint)
+ tp->lost_cnt_hint += pcount;
+
+ TCP_SKB_CB(prev)->end_seq += shifted;
+ TCP_SKB_CB(skb)->seq += shifted;
+
+ tcp_skb_pcount_add(prev, pcount);
+ BUG_ON(tcp_skb_pcount(skb) < pcount);
+ tcp_skb_pcount_add(skb, -pcount);
+
+ /* When we're adding to gso_segs == 1, gso_size will be zero,
+ * in theory this shouldn't be necessary but as long as DSACK
+ * code can come after this skb later on it's better to keep
+ * setting gso_size to something.
+ */
+ if (!skb_shinfo(prev)->gso_size) {
+ skb_shinfo(prev)->gso_size = mss;
+ skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+ }
+
+ /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+ if (tcp_skb_pcount(skb) <= 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ }
+
+ /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+ if (skb->len > 0) {
+ BUG_ON(!tcp_skb_pcount(skb));
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ return false;
+ }
+
+ /* Whole SKB was eaten :-) */
+
+ if (skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = prev;
+ if (skb == tp->lost_skb_hint) {
+ tp->lost_skb_hint = prev;
+ tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+ }
+
+ TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ TCP_SKB_CB(prev)->end_seq++;
+
+ if (skb == tcp_highest_sack(sk))
+ tcp_advance_highest_sack(sk, skb);
+
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+ return true;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_skb_seglen(const struct sk_buff *skb)
+{
+ return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(const struct sk_buff *skb)
+{
+ return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *prev;
+ int mss;
+ int pcount = 0;
+ int len;
+ int in_sack;
+
+ if (!sk_can_gso(sk))
+ goto fallback;
+
+ /* Normally R but no L won't result in plain S */
+ if (!dup_sack &&
+ (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+ goto fallback;
+ if (!skb_can_shift(skb))
+ goto fallback;
+ /* This frame is about to be dropped (was ACKed). */
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ goto fallback;
+
+ /* Can only happen with delayed DSACK + discard craziness */
+ if (unlikely(skb == tcp_write_queue_head(sk)))
+ goto fallback;
+ prev = tcp_write_queue_prev(sk, skb);
+
+ if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+ goto fallback;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (in_sack) {
+ len = skb->len;
+ pcount = tcp_skb_pcount(skb);
+ mss = tcp_skb_seglen(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+ } else {
+ if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+ goto noop;
+ /* CHECKME: This is non-MSS split case only?, this will
+ * cause skipped skbs due to advancing loop btw, original
+ * has that feature too
+ */
+ if (tcp_skb_pcount(skb) <= 1)
+ goto noop;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+ if (!in_sack) {
+ /* TODO: head merge to next could be attempted here
+ * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+ * though it might not be worth of the additional hassle
+ *
+ * ...we can probably just fallback to what was done
+ * previously. We could try merging non-SACKed ones
+ * as well but it probably isn't going to buy off
+ * because later SACKs might again split them, and
+ * it would make skb timestamp tracking considerably
+ * harder problem.
+ */
+ goto fallback;
+ }
+
+ len = end_seq - TCP_SKB_CB(skb)->seq;
+ BUG_ON(len < 0);
+ BUG_ON(len > skb->len);
+
+ /* MSS boundaries should be honoured or else pcount will
+ * severely break even though it makes things bit trickier.
+ * Optimize common case to avoid most of the divides
+ */
+ mss = tcp_skb_mss(skb);
+
+ /* TODO: Fix DSACKs to not fragment already SACKed and we can
+ * drop this restriction as unnecessary
+ */
+ if (mss != tcp_skb_seglen(prev))
+ goto fallback;
+
+ if (len == mss) {
+ pcount = 1;
+ } else if (len < mss) {
+ goto noop;
+ } else {
+ pcount = len / mss;
+ len = pcount * mss;
+ }
+ }
+
+ /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+ goto fallback;
+
+ if (!skb_shift(prev, skb, len))
+ goto fallback;
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+ goto out;
+
+ /* Hole filled allows collapsing with the next as well, this is very
+ * useful when hole on every nth skb pattern happens
+ */
+ if (prev == tcp_write_queue_tail(sk))
+ goto out;
+ skb = tcp_write_queue_next(sk, prev);
+
+ if (!skb_can_shift(skb) ||
+ (skb == tcp_send_head(sk)) ||
+ ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+ (mss != tcp_skb_seglen(skb)))
+ goto out;
+
+ len = skb->len;
+ if (skb_shift(prev, skb, len)) {
+ pcount += tcp_skb_pcount(skb);
+ tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+ }
+
+out:
+ state->fack_count += pcount;
+ return prev;
+
+noop:
+ return skb;
+
+fallback:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ return NULL;
+}
+
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 start_seq, u32 end_seq,
+ bool dup_sack_in)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *tmp;
+
+ tcp_for_write_queue_from(skb, sk) {
+ int in_sack = 0;
+ bool dup_sack = dup_sack_in;
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* queue is in-order => we can short-circuit the walk early */
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ if (next_dup &&
+ before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ next_dup->start_seq,
+ next_dup->end_seq);
+ if (in_sack > 0)
+ dup_sack = true;
+ }
+
+ /* skb reference here is a bit tricky to get right, since
+ * shifting can eat and free both this skb and the next,
+ * so not even _safe variant of the loop is enough.
+ */
+ if (in_sack <= 0) {
+ tmp = tcp_shift_skb_data(sk, skb, state,
+ start_seq, end_seq, dup_sack);
+ if (tmp) {
+ if (tmp != skb) {
+ skb = tmp;
+ continue;
+ }
+
+ in_sack = 0;
+ } else {
+ in_sack = tcp_match_skb_to_sack(sk, skb,
+ start_seq,
+ end_seq);
+ }
+ }
+
+ if (unlikely(in_sack < 0))
+ break;
+
+ if (in_sack) {
+ TCP_SKB_CB(skb)->sacked =
+ tcp_sacktag_one(sk,
+ state,
+ TCP_SKB_CB(skb)->sacked,
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ dup_sack,
+ tcp_skb_pcount(skb),
+ &skb->skb_mstamp);
+
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tcp_highest_sack_seq(tp)))
+ tcp_advance_highest_sack(sk, skb);
+ }
+
+ state->fack_count += tcp_skb_pcount(skb);
+ }
+ return skb;
+}
+
+/* Avoid all extra work that is being done by sacktag while walking in
+ * a normal way
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+ struct tcp_sacktag_state *state,
+ u32 skip_to_seq)
+{
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+ break;
+
+ state->fack_count += tcp_skb_pcount(skb);
+ }
+ return skb;
+}
+
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+ struct sock *sk,
+ struct tcp_sack_block *next_dup,
+ struct tcp_sacktag_state *state,
+ u32 skip_to_seq)
+{
+ if (!next_dup)
+ return skb;
+
+ if (before(next_dup->start_seq, skip_to_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+ skb = tcp_sacktag_walk(skb, sk, NULL, state,
+ next_dup->start_seq, next_dup->end_seq,
+ 1);
+ }
+
+ return skb;
+}
+
+static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
+{
+ return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+}
+
+static int
+tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_snd_una, long *sack_rtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const unsigned char *ptr = (skb_transport_header(ack_skb) +
+ TCP_SKB_CB(ack_skb)->sacked);
+ struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+ struct tcp_sack_block sp[TCP_NUM_SACKS];
+ struct tcp_sack_block *cache;
+ struct tcp_sacktag_state state;
+ struct sk_buff *skb;
+ int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
+ int used_sacks;
+ bool found_dup_sack = false;
+ int i, j;
+ int first_sack_index;
+
+ state.flag = 0;
+ state.reord = tp->packets_out;
+ state.rtt_us = -1L;
+
+ if (!tp->sacked_out) {
+ if (WARN_ON(tp->fackets_out))
+ tp->fackets_out = 0;
+ tcp_highest_sack_reset(sk);
+ }
+
+ found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
+ num_sacks, prior_snd_una);
+ if (found_dup_sack)
+ state.flag |= FLAG_DSACKING_ACK;
+
+ /* Eliminate too old ACKs, but take into
+ * account more or less fresh ones, they can
+ * contain valid SACK info.
+ */
+ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+ return 0;
+
+ if (!tp->packets_out)
+ goto out;
+
+ used_sacks = 0;
+ first_sack_index = 0;
+ for (i = 0; i < num_sacks; i++) {
+ bool dup_sack = !i && found_dup_sack;
+
+ sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+ sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
+
+ if (!tcp_is_sackblock_valid(tp, dup_sack,
+ sp[used_sacks].start_seq,
+ sp[used_sacks].end_seq)) {
+ int mib_idx;
+
+ if (dup_sack) {
+ if (!tp->undo_marker)
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
+ } else {
+ /* Don't count olds caused by ACK reordering */
+ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+ !after(sp[used_sacks].end_seq, tp->snd_una))
+ continue;
+ mib_idx = LINUX_MIB_TCPSACKDISCARD;
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ if (i == 0)
+ first_sack_index = -1;
+ continue;
+ }
+
+ /* Ignore very old stuff early */
+ if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ continue;
+
+ used_sacks++;
+ }
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = used_sacks - 1; i > 0; i--) {
+ for (j = 0; j < i; j++) {
+ if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+ swap(sp[j], sp[j + 1]);
+
+ /* Track where the first SACK block goes to */
+ if (j == first_sack_index)
+ first_sack_index = j + 1;
+ }
+ }
+ }
+
+ skb = tcp_write_queue_head(sk);
+ state.fack_count = 0;
+ i = 0;
+
+ if (!tp->sacked_out) {
+ /* It's already past, so skip checking against it */
+ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+ } else {
+ cache = tp->recv_sack_cache;
+ /* Skip empty blocks in at head of the cache */
+ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+ !cache->end_seq)
+ cache++;
+ }
+
+ while (i < used_sacks) {
+ u32 start_seq = sp[i].start_seq;
+ u32 end_seq = sp[i].end_seq;
+ bool dup_sack = (found_dup_sack && (i == first_sack_index));
+ struct tcp_sack_block *next_dup = NULL;
+
+ if (found_dup_sack && ((i + 1) == first_sack_index))
+ next_dup = &sp[i + 1];
+
+ /* Skip too early cached blocks */
+ while (tcp_sack_cache_ok(tp, cache) &&
+ !before(start_seq, cache->end_seq))
+ cache++;
+
+ /* Can skip some work by looking recv_sack_cache? */
+ if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+ after(end_seq, cache->start_seq)) {
+
+ /* Head todo? */
+ if (before(start_seq, cache->start_seq)) {
+ skb = tcp_sacktag_skip(skb, sk, &state,
+ start_seq);
+ skb = tcp_sacktag_walk(skb, sk, next_dup,
+ &state,
+ start_seq,
+ cache->start_seq,
+ dup_sack);
+ }
+
+ /* Rest of the block already fully processed? */
+ if (!after(end_seq, cache->end_seq))
+ goto advance_sp;
+
+ skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+ &state,
+ cache->end_seq);
+
+ /* ...tail remains todo... */
+ if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+ /* ...but better entrypoint exists! */
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ state.fack_count = tp->fackets_out;
+ cache++;
+ goto walk;
+ }
+
+ skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+ /* Check overlap against next cached too (past this one already) */
+ cache++;
+ continue;
+ }
+
+ if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+ skb = tcp_highest_sack(sk);
+ if (!skb)
+ break;
+ state.fack_count = tp->fackets_out;
+ }
+ skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+
+walk:
+ skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+ start_seq, end_seq, dup_sack);
+
+advance_sp:
+ i++;
+ }
+
+ /* Clear the head of the cache sack blocks so we can skip it next time */
+ for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+ tp->recv_sack_cache[i].start_seq = 0;
+ tp->recv_sack_cache[i].end_seq = 0;
+ }
+ for (j = 0; j < used_sacks; j++)
+ tp->recv_sack_cache[i++] = sp[j];
+
+ if ((state.reord < tp->fackets_out) &&
+ ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
+ tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+
+ tcp_mark_lost_retrans(sk);
+ tcp_verify_left_out(tp);
+out:
+
+#if FASTRETRANS_DEBUG > 0
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ WARN_ON((int)tcp_packets_in_flight(tp) < 0);
+#endif
+ *sack_rtt_us = state.rtt_us;
+ return state.flag;
+}
+
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns false if sacked_out adjustement wasn't necessary.
+ */
+static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
+{
+ u32 holes;
+
+ holes = max(tp->lost_out, 1U);
+ holes = min(holes, tp->packets_out);
+
+ if ((tp->sacked_out + holes) > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
+ return true;
+ }
+ return false;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (tcp_limit_reno_sacked(tp))
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
+}
+
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->sacked_out++;
+ tcp_check_reno_reordering(sk, 0);
+ tcp_verify_left_out(tp);
+}
+
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
+
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (acked > 0) {
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
+ if (acked - 1 >= tp->sacked_out)
+ tp->sacked_out = 0;
+ else
+ tp->sacked_out -= acked - 1;
+ }
+ tcp_check_reno_reordering(sk, acked);
+ tcp_verify_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out = 0;
+}
+
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
+ tp->undo_marker = 0;
+ tp->undo_retrans = -1;
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
+}
+
+static inline void tcp_init_undo(struct tcp_sock *tp)
+{
+ tp->undo_marker = tp->snd_una;
+ /* Retransmission still in flight may cause DSACKs later. */
+ tp->undo_retrans = tp->retrans_out ? : -1;
+}
+
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+ * and reset tags completely, otherwise preserve SACKs. If receiver
+ * dropped its ofo queue, we will know this due to reneging detection.
+ */
+void tcp_enter_loss(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ bool new_recovery = false;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+
+ /* Reduce ssthresh if it has not yet been made inside this window. */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
+ !after(tp->high_seq, tp->snd_una) ||
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ new_recovery = true;
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_LOSS);
+ tcp_init_undo(tp);
+ }
+ tp->snd_cwnd = 1;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ tp->retrans_out = 0;
+ tp->lost_out = 0;
+
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
+
+ skb = tcp_write_queue_head(sk);
+ is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ tp->sacked_out = 0;
+ tp->fackets_out = 0;
+ }
+ tcp_clear_all_retrans_hints(tp);
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+
+ TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
+ tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+ }
+ }
+ tcp_verify_left_out(tp);
+
+ /* Timeout in disordered state after receiving substantial DUPACKs
+ * suggests that the degree of reordering is over-estimated.
+ */
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
+ tp->sacked_out >= sysctl_tcp_reordering)
+ tp->reordering = min_t(unsigned int, tp->reordering,
+ sysctl_tcp_reordering);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ tp->high_seq = tp->snd_nxt;
+ tcp_ecn_queue_cwr(tp);
+
+ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+ * loss recovery is underway except recurring timeout(s) on
+ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = sysctl_tcp_frto &&
+ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
+}
+
+/* If ACK arrived pointing to a remembered SACK, it means that our
+ * remembered SACKs do not reflect real state of receiver i.e.
+ * receiver _host_ is heavily congested (or buggy).
+ *
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
+ */
+static bool tcp_check_sack_reneging(struct sock *sk, int flag)
+{
+ if (flag & FLAG_SACK_RENEGING) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+ msecs_to_jiffies(10));
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ delay, TCP_RTO_MAX);
+ return true;
+ }
+ return false;
+}
+
+static inline int tcp_fackets_out(const struct tcp_sock *tp)
+{
+ return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
+}
+
+/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
+ * counter when SACK is enabled (without SACK, sacked_out is used for
+ * that purpose).
+ *
+ * Instead, with FACK TCP uses fackets_out that includes both SACKed
+ * segments up to the highest received SACK block so far and holes in
+ * between them.
+ *
+ * With reordering, holes may still be in flight, so RFC3517 recovery
+ * uses pure sacked_out (total number of SACKed segments) even though
+ * it violates the RFC that uses duplicate ACKs, often these are equal
+ * but when e.g. out-of-window ACKs or packet duplication occurs,
+ * they differ. Since neither occurs due to loss, TCP should really
+ * ignore them.
+ */
+static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
+{
+ return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
+}
+
+static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay;
+
+ /* Delay early retransmit and entering fast recovery for
+ * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
+ * available, or RTO is scheduled to fire first.
+ */
+ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+ (flag & FLAG_ECE) || !tp->srtt_us)
+ return false;
+
+ delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
+ msecs_to_jiffies(2));
+
+ if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
+ return false;
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* Linux NewReno/SACK/FACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open" Normal state, no dubious events, fast path.
+ * "Disorder" In all the respects it is "Open",
+ * but requires a bit more attention. It is entered when
+ * we see some SACKs or dupacks. It is split of "Open"
+ * mainly to move some processing from fast path to slow one.
+ * "CWR" CWND was reduced due to some Congestion Notification event.
+ * It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery" CWND was reduced, we are fast-retransmitting.
+ * "Loss" CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when arrived ACK is unusual, namely:
+ * * SACK
+ * * Duplicate ACK.
+ * * ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ * in_flight = packets_out - left_out + retrans_out
+ *
+ * packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ * retrans_out is number of retransmitted segments.
+ *
+ * left_out is number of segments left network, but not ACKed yet.
+ *
+ * left_out = sacked_out + lost_out
+ *
+ * sacked_out: Packets, which arrived to receiver out of order
+ * and hence not ACKed. With SACKs this number is simply
+ * amount of SACKed data. Even without SACKs
+ * it is easy to give pretty reliable estimate of this number,
+ * counting duplicate ACKs.
+ *
+ * lost_out: Packets lost by network. TCP has no explicit
+ * "loss notification" feedback from network (for now).
+ * It means that this number can be only _guessed_.
+ * Actually, it is the heuristics to predict lossage that
+ * distinguishes different algorithms.
+ *
+ * F.e. after RTO, when all the queue is considered as lost,
+ * lost_out = packets_out and in_flight = retrans_out.
+ *
+ * Essentially, we have now two algorithms counting
+ * lost packets.
+ *
+ * FACK: It is the simplest heuristics. As soon as we decided
+ * that something is lost, we decide that _all_ not SACKed
+ * packets until the most forward SACK are lost. I.e.
+ * lost_out = fackets_out - sacked_out and left_out = fackets_out.
+ * It is absolutely correct estimate, if network does not reorder
+ * packets. And it loses any connection to reality when reordering
+ * takes place. We use FACK by default until reordering
+ * is suspected on the path to this destination.
+ *
+ * NewReno: when Recovery is entered, we assume that one segment
+ * is lost (classic Reno). While we are in Recovery and
+ * a partial ACK arrives, we assume that one more packet
+ * is lost (NewReno). This heuristics are the same in NewReno
+ * and SACK.
+ *
+ * Imagine, that's all! Forget about all this shamanism about CWND inflation
+ * deflation etc. CWND is real congestion window, never inflated, changes
+ * only according to classic VJ rules.
+ *
+ * Really tricky (and requiring careful tuning) part of algorithm
+ * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that hole is caused by loss, rather than by a reorder.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
+
+/* This function decides, when we should leave Disordered state
+ * and enter Recovery phase, reducing congestion window.
+ *
+ * Main question: may we further continue forward transmission
+ * with the same cwnd?
+ */
+static bool tcp_time_to_recover(struct sock *sk, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 packets_out;
+
+ /* Trick#1: The loss is proven. */
+ if (tp->lost_out)
+ return true;
+
+ /* Not-A-Trick#2 : Classic rule... */
+ if (tcp_dupack_heuristics(tp) > tp->reordering)
+ return true;
+
+ /* Trick#4: It is still not OK... But will it be useful to delay
+ * recovery more?
+ */
+ packets_out = tp->packets_out;
+ if (packets_out <= tp->reordering &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ !tcp_may_send_now(sk)) {
+ /* We have nothing to send. This connection is limited
+ * either by receiver window or by application.
+ */
+ return true;
+ }
+
+ /* If a thin stream is detected, retransmit after first
+ * received dupack. Employ only if SACK is supported in order
+ * to avoid possible corner-case series of spurious retransmissions
+ * Use only if there are no unsent data.
+ */
+ if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+ tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+ tcp_is_sack(tp) && !tcp_send_head(sk))
+ return true;
+
+ /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious
+ * retransmissions due to small network reorderings, we implement
+ * Mitigation A.3 in the RFC and delay the retransmission for a short
+ * interval if appropriate.
+ */
+ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
+ (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
+ !tcp_may_send_now(sk))
+ return !tcp_pause_early_retransmit(sk, flag);
+
+ return false;
+}
+
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed seqments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
+ */
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int cnt, oldcnt;
+ int err;
+ unsigned int mss;
+ /* Use SACK to deduce losses of new sequences sent during recovery */
+ const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
+
+ WARN_ON(packets > tp->packets_out);
+ if (tp->lost_skb_hint) {
+ skb = tp->lost_skb_hint;
+ cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
+ } else {
+ skb = tcp_write_queue_head(sk);
+ cnt = 0;
+ }
+
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ /* TODO: do this better */
+ /* this is not the most efficient way to do this... */
+ tp->lost_skb_hint = skb;
+ tp->lost_cnt_hint = cnt;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
+ break;
+
+ oldcnt = cnt;
+ if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ cnt += tcp_skb_pcount(skb);
+
+ if (cnt > packets) {
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+ (oldcnt >= packets))
+ break;
+
+ mss = skb_shinfo(skb)->gso_size;
+ err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
+ mss, GFP_ATOMIC);
+ if (err < 0)
+ break;
+ cnt = packets;
+ }
+
+ tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
+ }
+ tcp_verify_left_out(tp);
+}
+
+/* Account newly detected lost packet(s) */
+
+static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_is_reno(tp)) {
+ tcp_mark_head_lost(sk, 1, 1);
+ } else if (tcp_is_fack(tp)) {
+ int lost = tp->fackets_out - tp->reordering;
+ if (lost <= 0)
+ lost = 1;
+ tcp_mark_head_lost(sk, lost, 0);
+ } else {
+ int sacked_upto = tp->sacked_out - tp->reordering;
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
+ }
+}
+
+/* CWND moderation, preventing bursts due to too big ACKs
+ * in dubious situations.
+ */
+static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+{
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + tcp_max_burst(tp));
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Nothing was retransmitted or returned timestamp is less
+ * than timestamp of the first retransmission.
+ */
+static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
+{
+ return !tp->retrans_stamp ||
+ (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+}
+
+/* Undo procedures. */
+
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static bool tcp_any_retrans_done(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (tp->retrans_out)
+ return true;
+
+ skb = tcp_write_queue_head(sk);
+ if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+ return true;
+
+ return false;
+}
+
+#if FASTRETRANS_DEBUG > 1
+static void DBGUNDO(struct sock *sk, const char *msg)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (sk->sk_family == AF_INET) {
+ pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+ msg,
+ &np->daddr, ntohs(inet->inet_dport),
+ tp->snd_cwnd, tcp_left_out(tp),
+ tp->snd_ssthresh, tp->prior_ssthresh,
+ tp->packets_out);
+ }
+#endif
+}
+#else
+#define DBGUNDO(x...) do { } while (0)
+#endif
+
+static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unmark_loss) {
+ struct sk_buff *skb;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ }
+ tp->lost_out = 0;
+ tcp_clear_all_retrans_hints(tp);
+ }
+
+ if (tp->prior_ssthresh) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->undo_cwnd)
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
+ else
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+
+ if (tp->prior_ssthresh > tp->snd_ssthresh) {
+ tp->snd_ssthresh = tp->prior_ssthresh;
+ tcp_ecn_withdraw_cwr(tp);
+ }
+ } else {
+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->undo_marker = 0;
+}
+
+static inline bool tcp_may_undo(const struct tcp_sock *tp)
+{
+ return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+/* People celebrate: "We love our President!" */
+static bool tcp_try_undo_recovery(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_may_undo(tp)) {
+ int mib_idx;
+
+ /* Happy end! We did not retransmit anything
+ * or our original transmission succeeded.
+ */
+ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwnd_reduction(sk, false);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPLOSSUNDO;
+ else
+ mib_idx = LINUX_MIB_TCPFULLUNDO;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ }
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno it is MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ tcp_moderate_cwnd(tp);
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+ return true;
+ }
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ return false;
+}
+
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static bool tcp_try_undo_dsack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && !tp->undo_retrans) {
+ DBGUNDO(sk, "D-SACK");
+ tcp_undo_cwnd_reduction(sk, false);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ return true;
+ }
+ return false;
+}
+
+/* Undo during loss recovery after partial ACK or using F-RTO. */
+static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (frto_undo || tcp_may_undo(tp)) {
+ tcp_undo_cwnd_reduction(sk, true);
+
+ DBGUNDO(sk, "partial loss");
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ if (frto_undo)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
+ inet_csk(sk)->icsk_retransmits = 0;
+ if (frto_undo || tcp_is_sack(tp))
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ return true;
+ }
+ return false;
+}
+
+/* The cwnd reduction in CWR and Recovery use the PRR algorithm
+ * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
+ * It computes the number of packets to send (sndcnt) based on packets newly
+ * delivered:
+ * 1) If the packets in flight is larger than ssthresh, PRR spreads the
+ * cwnd reductions across a full RTT.
+ * 2) If packets in flight is lower than ssthresh (such as due to excess
+ * losses and/or application stalls), do not perform any further cwnd
+ * reductions, but instead slow start up to ssthresh.
+ */
+static void tcp_init_cwnd_reduction(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
+ tp->snd_cwnd_cnt = 0;
+ tp->prior_cwnd = tp->snd_cwnd;
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tcp_ecn_queue_cwr(tp);
+}
+
+static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
+ int fast_rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int sndcnt = 0;
+ int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
+ int newly_acked_sacked = prior_unsacked -
+ (tp->packets_out - tp->sacked_out);
+
+ tp->prr_delivered += newly_acked_sacked;
+ if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
+ u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
+ tp->prior_cwnd - 1;
+ sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
+ } else {
+ sndcnt = min_t(int, delta,
+ max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked) + 1);
+ }
+
+ sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+}
+
+static inline void tcp_end_cwnd_reduction(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
+ (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
+}
+
+/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
+void tcp_enter_cwr(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->prior_ssthresh = 0;
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ tp->undo_marker = 0;
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ }
+}
+
+static void tcp_try_keep_open(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int state = TCP_CA_Open;
+
+ if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
+ state = TCP_CA_Disorder;
+
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
+ tp->high_seq = tp->snd_nxt;
+ }
+}
+
+static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_verify_left_out(tp);
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ if (flag & FLAG_ECE)
+ tcp_enter_cwr(sk);
+
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
+ tcp_try_keep_open(sk);
+ } else {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ }
+}
+
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+ icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* FIXME: breaks with very large cwnd */
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = tp->snd_cwnd *
+ tcp_mss_to_mtu(sk, tp->mss_cache) /
+ icsk->icsk_mtup.probe_size;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+
+ icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+ icsk->icsk_mtup.probe_size = 0;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery.
+ * The socket is already locked here.
+ */
+void tcp_simple_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int mss = tcp_current_mss(sk);
+ u32 prior_lost = tp->lost_out;
+
+ tcp_for_write_queue(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ if (tcp_skb_seglen(skb) > mss &&
+ !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ }
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ }
+ }
+
+ tcp_clear_retrans_hints_partial(tp);
+
+ if (prior_lost == tp->lost_out)
+ return;
+
+ if (tcp_is_reno(tp))
+ tcp_limit_reno_sacked(tp);
+
+ tcp_verify_left_out(tp);
+
+ /* Don't muck with the congestion window here.
+ * Reason is that we do not increase amount of _data_
+ * in network, but units changed and effective
+ * cwnd/ssthresh really reduced now.
+ */
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+EXPORT_SYMBOL(tcp_simple_retransmit);
+
+static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mib_idx;
+
+ if (tcp_is_reno(tp))
+ mib_idx = LINUX_MIB_TCPRENORECOVERY;
+ else
+ mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->prior_ssthresh = 0;
+ tcp_init_undo(tp);
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (!ece_ack)
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tcp_init_cwnd_reduction(sk);
+ }
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
+}
+
+/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+ * recovered or spurious. Otherwise retransmits more on partial ACKs.
+ */
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool recovered = !before(tp->snd_una, tp->high_seq);
+
+ if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ tcp_try_undo_loss(sk, false))
+ return;
+
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
+ /* Step 3.b. A timeout is spurious if not all data are
+ * lost, i.e., never-retransmitted data are (s)acked.
+ */
+ if ((flag & FLAG_ORIG_SACK_ACKED) &&
+ tcp_try_undo_loss(sk, true))
+ return;
+
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
+ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+ tp->high_seq = tp->snd_nxt;
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return; /* Step 2.b */
+ tp->frto = 0;
+ }
+ }
+
+ if (recovered) {
+ /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
+ tcp_try_undo_recovery(sk);
+ return;
+ }
+ if (tcp_is_reno(tp)) {
+ /* A Reno DUPACK means new data in F-RTO step 2.b above are
+ * delivered. Lower inflight to clock out (re)tranmissions.
+ */
+ if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ tcp_add_reno_sack(sk);
+ else if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* Undo during fast recovery after partial ACK. */
+static bool tcp_try_undo_partial(struct sock *sk, const int acked,
+ const int prior_unsacked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->undo_marker && tcp_packet_delayed(tp)) {
+ /* Plain luck! Hole if filled with delayed
+ * packet, rather than with a retransmit.
+ */
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+
+ /* We are getting evidence that the reordering degree is higher
+ * than we realized. If there are no retransmits out then we
+ * can undo. Otherwise we clock out new packets but do not
+ * mark more packets lost or retransmit more.
+ */
+ if (tp->retrans_out) {
+ tcp_cwnd_reduction(sk, prior_unsacked, 0);
+ return true;
+ }
+
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+
+ DBGUNDO(sk, "partial recovery");
+ tcp_undo_cwnd_reduction(sk, true);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ tcp_try_keep_open(sk);
+ return true;
+ }
+ return false;
+}
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it does CWND reduction, when packet loss is detected
+ * and changes state of machine.
+ *
+ * It does _not_ decide what to send, it is made in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void tcp_fastretrans_alert(struct sock *sk, const int acked,
+ const int prior_unsacked,
+ bool is_dupack, int flag)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+ (tcp_fackets_out(tp) > tp->reordering));
+ int fast_rexmit = 0;
+
+ if (WARN_ON(!tp->packets_out && tp->sacked_out))
+ tp->sacked_out = 0;
+ if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+ tp->fackets_out = 0;
+
+ /* Now state machine starts.
+ * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+ if (flag & FLAG_ECE)
+ tp->prior_ssthresh = 0;
+
+ /* B. In all the states check for reneging SACKs. */
+ if (tcp_check_sack_reneging(sk, flag))
+ return;
+
+ /* C. Check consistency of the current state. */
+ tcp_verify_left_out(tp);
+
+ /* D. Check state exit conditions. State can be terminated
+ * when high_seq is ACKed. */
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
+ WARN_ON(tp->retrans_out != 0);
+ tp->retrans_stamp = 0;
+ } else if (!before(tp->snd_una, tp->high_seq)) {
+ switch (icsk->icsk_ca_state) {
+ case TCP_CA_CWR:
+ /* CWR is to be held something *above* high_seq
+ * is ACKed for CWR bit to reach receiver. */
+ if (tp->snd_una != tp->high_seq) {
+ tcp_end_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
+ }
+ break;
+
+ case TCP_CA_Recovery:
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
+ if (tcp_try_undo_recovery(sk))
+ return;
+ tcp_end_cwnd_reduction(sk);
+ break;
+ }
+ }
+
+ /* E. Process state. */
+ switch (icsk->icsk_ca_state) {
+ case TCP_CA_Recovery:
+ if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+ if (tcp_is_reno(tp) && is_dupack)
+ tcp_add_reno_sack(sk);
+ } else {
+ if (tcp_try_undo_partial(sk, acked, prior_unsacked))
+ return;
+ /* Partial ACK arrived. Force fast retransmit. */
+ do_lost = tcp_is_reno(tp) ||
+ tcp_fackets_out(tp) > tp->reordering;
+ }
+ if (tcp_try_undo_dsack(sk)) {
+ tcp_try_keep_open(sk);
+ return;
+ }
+ break;
+ case TCP_CA_Loss:
+ tcp_process_loss(sk, flag, is_dupack);
+ if (icsk->icsk_ca_state != TCP_CA_Open)
+ return;
+ /* Fall through to processing in Open state. */
+ default:
+ if (tcp_is_reno(tp)) {
+ if (flag & FLAG_SND_UNA_ADVANCED)
+ tcp_reset_reno_sack(tp);
+ if (is_dupack)
+ tcp_add_reno_sack(sk);
+ }
+
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+ tcp_try_undo_dsack(sk);
+
+ if (!tcp_time_to_recover(sk, flag)) {
+ tcp_try_to_open(sk, flag, prior_unsacked);
+ return;
+ }
+
+ /* MTU probe failure: don't reduce cwnd */
+ if (icsk->icsk_ca_state < TCP_CA_CWR &&
+ icsk->icsk_mtup.probe_size &&
+ tp->snd_una == tp->mtu_probe.probe_seq_start) {
+ tcp_mtup_probe_failed(sk);
+ /* Restores the reduction we did in tcp_mtup_probe() */
+ tp->snd_cwnd++;
+ tcp_simple_retransmit(sk);
+ return;
+ }
+
+ /* Otherwise enter Recovery state */
+ tcp_enter_recovery(sk, (flag & FLAG_ECE));
+ fast_rexmit = 1;
+ }
+
+ if (do_lost)
+ tcp_update_scoreboard(sk, fast_rexmit);
+ tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
+ tcp_xmit_retransmit_queue(sk);
+}
+
+static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ long seq_rtt_us, long sack_rtt_us)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
+ * broken middle-boxes or peers may corrupt TS-ECR fields. But
+ * Karn's algorithm forbids taking RTT if some retransmitted data
+ * is acked (RFC6298).
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ seq_rtt_us = -1L;
+
+ if (seq_rtt_us < 0)
+ seq_rtt_us = sack_rtt_us;
+
+ /* RTTM Rule: A TSecr value received in a segment is used to
+ * update the averaged RTT measurement only if the segment
+ * acknowledges some new data, i.e., only if it advances the
+ * left edge of the send window.
+ * See draft-ietf-tcplw-high-performance-00, section 3.3.
+ */
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ flag & FLAG_ACKED)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+
+ if (seq_rtt_us < 0)
+ return false;
+
+ tcp_rtt_estimator(sk, seq_rtt_us);
+ tcp_set_rto(sk);
+
+ /* RFC6298: only reset backoff on valid RTT measurement. */
+ inet_csk(sk)->icsk_backoff = 0;
+ return true;
+}
+
+/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
+static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ long seq_rtt_us = -1L;
+
+ if (synack_stamp && !tp->total_retrans)
+ seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
+
+ /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
+ * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
+ */
+ if (!tp->srtt_us)
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
+}
+
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Restart timer after forward progress on connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+void tcp_rearm_rto(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If the retrans timer is currently being used by Fast Open
+ * for SYN-ACK retrans purpose, stay put.
+ */
+ if (tp->fastopen_rsk)
+ return;
+
+ if (!tp->packets_out) {
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ } else {
+ u32 rto = inet_csk(sk)->icsk_rto;
+ /* Offset the time elapsed after installing regular RTO */
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+ const u32 rto_time_stamp =
+ tcp_skb_timestamp(skb) + rto;
+ s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
+ /* delta may not be positive if the socket is locked
+ * when the retrans timer fires and is rescheduled.
+ */
+ if (delta > 0)
+ rto = delta;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX);
+ }
+}
+
+/* This function is called when the delayed ER timer fires. TCP enters
+ * fast recovery and performs fast-retransmit.
+ */
+void tcp_resume_early_retransmit(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_rearm_rto(sk);
+
+ /* Stop if ER is disabled after the delayed ER timer is scheduled */
+ if (!tp->do_early_retrans)
+ return;
+
+ tcp_enter_recovery(sk, false);
+ tcp_update_scoreboard(sk, 1);
+ tcp_xmit_retransmit_queue(sk);
+}
+
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 packets_acked;
+
+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
+
+ packets_acked = tcp_skb_pcount(skb);
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ return 0;
+ packets_acked -= tcp_skb_pcount(skb);
+
+ if (packets_acked) {
+ BUG_ON(tcp_skb_pcount(skb) == 0);
+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
+ }
+
+ return packets_acked;
+}
+
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+ u32 prior_snd_una)
+{
+ const struct skb_shared_info *shinfo;
+
+ /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+ if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ return;
+
+ shinfo = skb_shinfo(skb);
+ if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+ u32 prior_snd_una, long sack_rtt_us)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct skb_mstamp first_ackt, last_ackt, now;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ u32 reord = tp->packets_out;
+ bool fully_acked = true;
+ long ca_seq_rtt_us = -1L;
+ long seq_rtt_us = -1L;
+ struct sk_buff *skb;
+ u32 pkts_acked = 0;
+ bool rtt_update;
+ int flag = 0;
+
+ first_ackt.v64 = 0;
+
+ while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ u8 sacked = scb->sacked;
+ u32 acked_pcount;
+
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
+
+ /* Determine how many packets and what bytes were acked, tso and else */
+ if (after(scb->end_seq, tp->snd_una)) {
+ if (tcp_skb_pcount(skb) == 1 ||
+ !after(tp->snd_una, scb->seq))
+ break;
+
+ acked_pcount = tcp_tso_acked(sk, skb);
+ if (!acked_pcount)
+ break;
+
+ fully_acked = false;
+ } else {
+ /* Speedup tcp_unlink_write_queue() and next loop */
+ prefetchw(skb->next);
+ acked_pcount = tcp_skb_pcount(skb);
+ }
+
+ if (unlikely(sacked & TCPCB_RETRANS)) {
+ if (sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= acked_pcount;
+ flag |= FLAG_RETRANS_DATA_ACKED;
+ } else if (!(sacked & TCPCB_SACKED_ACKED)) {
+ last_ackt = skb->skb_mstamp;
+ WARN_ON_ONCE(last_ackt.v64 == 0);
+ if (!first_ackt.v64)
+ first_ackt = last_ackt;
+
+ reord = min(pkts_acked, reord);
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
+ }
+
+ if (sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= acked_pcount;
+ if (sacked & TCPCB_LOST)
+ tp->lost_out -= acked_pcount;
+
+ tp->packets_out -= acked_pcount;
+ pkts_acked += acked_pcount;
+
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
+ flag |= FLAG_DATA_ACKED;
+ } else {
+ flag |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
+ }
+
+ if (!fully_acked)
+ break;
+
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ if (unlikely(skb == tp->retransmit_skb_hint))
+ tp->retransmit_skb_hint = NULL;
+ if (unlikely(skb == tp->lost_skb_hint))
+ tp->lost_skb_hint = NULL;
+ }
+
+ if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+ tp->snd_up = tp->snd_una;
+
+ if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ flag |= FLAG_SACK_RENEGING;
+
+ skb_mstamp_get(&now);
+ if (likely(first_ackt.v64)) {
+ seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
+ ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+ }
+
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+
+ if (flag & FLAG_ACKED) {
+ const struct tcp_congestion_ops *ca_ops
+ = inet_csk(sk)->icsk_ca_ops;
+
+ tcp_rearm_rto(sk);
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
+ }
+
+ if (tcp_is_reno(tp)) {
+ tcp_remove_reno_sacks(sk, pkts_acked);
+ } else {
+ int delta;
+
+ /* Non-retransmitted hole got filled? That's reordering */
+ if (reord < prior_fackets)
+ tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+
+ delta = tcp_is_fack(tp) ? pkts_acked :
+ prior_sacked - tp->sacked_out;
+ tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
+ }
+
+ tp->fackets_out -= min(pkts_acked, tp->fackets_out);
+
+ if (ca_ops->pkts_acked) {
+ long rtt_us = min_t(ulong, ca_seq_rtt_us, sack_rtt_us);
+ ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+ }
+
+ } else if (skb && rtt_update && sack_rtt_us >= 0 &&
+ sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+ /* Do not re-arm RTO if the sack RTT is measured from data sent
+ * after when the head was last (re)transmitted. Otherwise the
+ * timeout may continue to extend in loss recovery.
+ */
+ tcp_rearm_rto(sk);
+ }
+
+#if FASTRETRANS_DEBUG > 0
+ WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->lost_out < 0);
+ WARN_ON((int)tp->retrans_out < 0);
+ if (!tp->packets_out && tcp_is_sack(tp)) {
+ icsk = inet_csk(sk);
+ if (tp->lost_out) {
+ pr_debug("Leak l=%u %d\n",
+ tp->lost_out, icsk->icsk_ca_state);
+ tp->lost_out = 0;
+ }
+ if (tp->sacked_out) {
+ pr_debug("Leak s=%u %d\n",
+ tp->sacked_out, icsk->icsk_ca_state);
+ tp->sacked_out = 0;
+ }
+ if (tp->retrans_out) {
+ pr_debug("Leak r=%u %d\n",
+ tp->retrans_out, icsk->icsk_ca_state);
+ tp->retrans_out = 0;
+ }
+ }
+#endif
+ return flag;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* Was it a usable window open? */
+
+ if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+ icsk->icsk_backoff = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+ /* Socket must be waked up by subsequent tcp_data_snd_check().
+ * This function is not for random using!
+ */
+ } else {
+ unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ when, TCP_RTO_MAX);
+ }
+}
+
+static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
+{
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
+}
+
+/* Decide wheather to run the increase function of congestion control. */
+static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+{
+ if (tcp_in_cwnd_reduction(sk))
+ return false;
+
+ /* If reordering is high then always grow cwnd whenever data is
+ * delivered regardless of its ordering. Otherwise stay conservative
+ * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
+ * new SACK or ECE mark may first advance cwnd here and later reduce
+ * cwnd in tcp_fastretrans_alert() based on more states.
+ */
+ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ return flag & FLAG_FORWARD_PROGRESS;
+
+ return flag & FLAG_DATA_ACKED;
+}
+
+/* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+static inline bool tcp_may_update_window(const struct tcp_sock *tp,
+ const u32 ack, const u32 ack_seq,
+ const u32 nwin)
+{
+ return after(ack, tp->snd_una) ||
+ after(ack_seq, tp->snd_wl1) ||
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
+}
+
+/* If we update tp->snd_una, also update tp->bytes_acked */
+static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
+{
+ u32 delta = ack - tp->snd_una;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_acked += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->snd_una = ack;
+}
+
+/* If we update tp->rcv_nxt, also update tp->bytes_received */
+static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
+{
+ u32 delta = seq - tp->rcv_nxt;
+
+ u64_stats_update_begin(&tp->syncp);
+ tp->bytes_received += delta;
+ u64_stats_update_end(&tp->syncp);
+ tp->rcv_nxt = seq;
+}
+
+/* Update our send window.
+ *
+ * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD. NetBSD's one is even worse.) is wrong.
+ */
+static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
+ u32 ack_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int flag = 0;
+ u32 nwin = ntohs(tcp_hdr(skb)->window);
+
+ if (likely(!tcp_hdr(skb)->syn))
+ nwin <<= tp->rx_opt.snd_wscale;
+
+ if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+ flag |= FLAG_WIN_UPDATE;
+ tcp_update_wl(tp, ack_seq);
+
+ if (tp->snd_wnd != nwin) {
+ tp->snd_wnd = nwin;
+
+ /* Note, it is the only place, where
+ * fast path is recovered for sending TCP.
+ */
+ tp->pred_flags = 0;
+ tcp_fast_path_check(sk);
+
+ if (nwin > tp->max_window) {
+ tp->max_window = nwin;
+ tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
+ }
+ }
+ }
+
+ tcp_snd_una_update(tp, ack);
+
+ return flag;
+}
+
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
+/* RFC 5961 7 [ACK Throttling] */
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
+{
+ /* unprotected vars, we dont care of overwrites */
+ static u32 challenge_timestamp;
+ static unsigned int challenge_count;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 now;
+
+ /* First check our per-socket dupack rate limit. */
+ if (tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
+ return;
+
+ /* Then check the check host-wide RFC 5961 rate limit. */
+ now = jiffies / HZ;
+ if (now != challenge_timestamp) {
+ challenge_timestamp = now;
+ challenge_count = 0;
+ }
+ if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ tcp_send_ack(sk);
+ }
+}
+
+static void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+ tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
+}
+
+static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+ /* PAWS bug workaround wrt. ACK frames, the PAWS discard
+ * extra check below makes sure this can only happen
+ * for pure ACK frames. -DaveM
+ *
+ * Not only, also it occurs for expired timestamps.
+ */
+
+ if (tcp_paws_check(&tp->rx_opt, 0))
+ tcp_store_ts_recent(tp);
+ }
+}
+
+/* This routine deals with acks during a TLP episode.
+ * We mark the end of a TLP episode on receiving TLP dupack or when
+ * ack is after tlp_high_seq.
+ * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+ */
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (before(ack, tp->tlp_high_seq))
+ return;
+
+ if (flag & FLAG_DSACKING_ACK) {
+ /* This DSACK means original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
+ } else if (after(ack, tp->tlp_high_seq)) {
+ /* ACK advances: there was a loss, so reduce cwnd. Reset
+ * tlp_high_seq in tcp_init_cwnd_reduction()
+ */
+ tcp_init_cwnd_reduction(sk);
+ tcp_set_ca_state(sk, TCP_CA_CWR);
+ tcp_end_cwnd_reduction(sk);
+ tcp_try_keep_open(sk);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
+ } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
+ /* Pure dupack: original and TLP probe arrived; no loss */
+ tp->tlp_high_seq = 0;
+ }
+}
+
+static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->in_ack_event)
+ icsk->icsk_ca_ops->in_ack_event(sk, flags);
+}
+
+/* This routine deals with incoming acks, but not outgoing ones. */
+static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_snd_una = tp->snd_una;
+ u32 ack_seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ bool is_dupack = false;
+ u32 prior_fackets;
+ int prior_packets = tp->packets_out;
+ const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ int acked = 0; /* Number of packets newly acked */
+ long sack_rtt_us = -1L;
+
+ /* We very likely will need to access write queue head. */
+ prefetchw(sk->sk_write_queue.next);
+
+ /* If the ack is older than previous acks
+ * then we can probably ignore it.
+ */
+ if (before(ack, prior_snd_una)) {
+ /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
+ if (before(ack, prior_snd_una - tp->max_window)) {
+ tcp_send_challenge_ack(sk, skb);
+ return -1;
+ }
+ goto old_ack;
+ }
+
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
+ if (after(ack, tp->snd_nxt))
+ goto invalid_ack;
+
+ if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
+ tcp_rearm_rto(sk);
+
+ if (after(ack, prior_snd_una)) {
+ flag |= FLAG_SND_UNA_ADVANCED;
+ icsk->icsk_retransmits = 0;
+ }
+
+ prior_fackets = tp->fackets_out;
+
+ /* ts_recent update must be made after we are sure that the packet
+ * is in window.
+ */
+ if (flag & FLAG_UPDATE_TS_RECENT)
+ tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+ if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ /* Window is constant, pure forward advance.
+ * No more checks are required.
+ * Note, we use the fact that SND.UNA>=SND.WL2.
+ */
+ tcp_update_wl(tp, ack_seq);
+ tcp_snd_una_update(tp, ack);
+ flag |= FLAG_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ } else {
+ u32 ack_ev_flags = CA_ACK_SLOWPATH;
+
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
+
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ flag |= FLAG_ECE;
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+
+ tcp_in_ack_event(sk, ack_ev_flags);
+ }
+
+ /* We passed data and got it acked, remove any soft error
+ * log. Something worked...
+ */
+ sk->sk_err_soft = 0;
+ icsk->icsk_probes_out = 0;
+ tp->rcv_tstamp = tcp_time_stamp;
+ if (!prior_packets)
+ goto no_queue;
+
+ /* See if we can take anything off of the retransmit queue. */
+ acked = tp->packets_out;
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ sack_rtt_us);
+ acked -= tp->packets_out;
+
+ /* Advance cwnd if state allows */
+ if (tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, acked);
+
+ if (tcp_ack_is_dubious(sk, flag)) {
+ is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ }
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+
+ if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
+ struct dst_entry *dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+ }
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+ tcp_schedule_loss_probe(sk);
+ tcp_update_pacing_rate(sk);
+ return 1;
+
+no_queue:
+ /* If data was DSACKed, see if we can undo a cwnd reduction. */
+ if (flag & FLAG_DSACKING_ACK)
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ /* If this ack opens up a zero window, clear backoff. It was
+ * being used to time the probes, and is probably far higher than
+ * it needs to be for normal retransmission.
+ */
+ if (tcp_send_head(sk))
+ tcp_ack_probe(sk);
+
+ if (tp->tlp_high_seq)
+ tcp_process_tlp_ack(sk, ack, flag);
+ return 1;
+
+invalid_ack:
+ SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return -1;
+
+old_ack:
+ /* If data was SACKed, tag it and see if we should send more data.
+ * If data was DSACKed, see if we can undo a cwnd reduction.
+ */
+ if (TCP_SKB_CB(skb)->sacked) {
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_rtt_us);
+ tcp_fastretrans_alert(sk, acked, prior_unsacked,
+ is_dupack, flag);
+ }
+
+ SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return 0;
+}
+
+static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
+ bool syn, struct tcp_fastopen_cookie *foc,
+ bool exp_opt)
+{
+ /* Valid only in SYN or SYN-ACK with an even length. */
+ if (!foc || !syn || len < 0 || (len & 1))
+ return;
+
+ if (len >= TCP_FASTOPEN_COOKIE_MIN &&
+ len <= TCP_FASTOPEN_COOKIE_MAX)
+ memcpy(foc->val, cookie, len);
+ else if (len != 0)
+ len = -1;
+ foc->len = len;
+ foc->exp = exp_opt;
+}
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ */
+void tcp_parse_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx, int estab,
+ struct tcp_fastopen_cookie *foc)
+{
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+ ptr = (const unsigned char *)(th + 1);
+ opt_rx->saw_tstamp = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+ u16 in_mss = get_unaligned_be16(ptr);
+ if (in_mss) {
+ if (opt_rx->user_mss &&
+ opt_rx->user_mss < in_mss)
+ in_mss = opt_rx->user_mss;
+ opt_rx->mss_clamp = in_mss;
+ }
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW && th->syn &&
+ !estab && sysctl_tcp_window_scaling) {
+ __u8 snd_wscale = *(__u8 *)ptr;
+ opt_rx->wscale_ok = 1;
+ if (snd_wscale > 14) {
+ net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
+ __func__,
+ snd_wscale);
+ snd_wscale = 14;
+ }
+ opt_rx->snd_wscale = snd_wscale;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if ((opsize == TCPOLEN_TIMESTAMP) &&
+ ((estab && opt_rx->tstamp_ok) ||
+ (!estab && sysctl_tcp_timestamps))) {
+ opt_rx->saw_tstamp = 1;
+ opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+ opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+ !estab && sysctl_tcp_sack) {
+ opt_rx->sack_ok = TCP_SACK_SEEN;
+ tcp_sack_reset(opt_rx);
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+ !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+ opt_rx->sack_ok) {
+ TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+ }
+ break;
+#ifdef CONFIG_TCP_MD5SIG
+ case TCPOPT_MD5SIG:
+ /*
+ * The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_do_rcv()).
+ */
+ break;
+#endif
+ case TCPOPT_FASTOPEN:
+ tcp_parse_fastopen_option(
+ opsize - TCPOLEN_FASTOPEN_BASE,
+ ptr, th->syn, foc, false);
+ break;
+
+ case TCPOPT_EXP:
+ /* Fast Open option shares code 254 using a
+ * 16 bits magic number.
+ */
+ if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FASTOPEN_MAGIC)
+ tcp_parse_fastopen_option(opsize -
+ TCPOLEN_EXP_FASTOPEN_BASE,
+ ptr + 2, th->syn, foc, true);
+ break;
+
+ }
+ ptr += opsize-2;
+ length -= opsize;
+ }
+ }
+}
+EXPORT_SYMBOL(tcp_parse_options);
+
+static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
+{
+ const __be32 *ptr = (const __be32 *)(th + 1);
+
+ if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ tp->rx_opt.saw_tstamp = 1;
+ ++ptr;
+ tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ if (*ptr)
+ tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
+ else
+ tp->rx_opt.rcv_tsecr = 0;
+ return true;
+ }
+ return false;
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ */
+static bool tcp_fast_parse_options(const struct sk_buff *skb,
+ const struct tcphdr *th, struct tcp_sock *tp)
+{
+ /* In the spirit of fast parsing, compare doff directly to constant
+ * values. Because equality is used, short doff can be ignored here.
+ */
+ if (th->doff == (sizeof(*th) / 4)) {
+ tp->rx_opt.saw_tstamp = 0;
+ return false;
+ } else if (tp->rx_opt.tstamp_ok &&
+ th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
+ if (tcp_parse_aligned_timestamp(tp, th))
+ return true;
+ }
+
+ tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ return true;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+/* Parse only the TSVal field of the TCP Timestamp option header.
+ */
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th)
+{
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
+
+ /* If the TCP option is too short, we can short cut */
+ if (length < TCPOLEN_TIMESTAMP)
+ return false;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return false;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ case TCPOPT_TIMESTAMP:
+ opsize = *ptr++;
+ if (opsize != TCPOLEN_TIMESTAMP || opsize > length)
+ return false;
+ *tsval = get_unaligned_be32(ptr);
+ return true;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ return false;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ return false;
+}
+EXPORT_SYMBOL(tcp_parse_tsval_option);
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * Parse MD5 Signature option
+ */
+const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+{
+ int length = (th->doff << 2) - sizeof(*th);
+ const u8 *ptr = (const u8 *)(th + 1);
+
+ /* If the TCP option is too short, we can short cut */
+ if (length < TCPOLEN_MD5SIG)
+ return NULL;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return NULL;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ return NULL;
+ if (opcode == TCPOPT_MD5SIG)
+ return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
+#endif
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we even are able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled fastly,
+ * but it guarantees that such events will be very rare and do not affect
+ * connection seriously. This doesn't look nice, but alas, PAWS is really
+ * buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
+ * states that events when retransmit arrives after original data are rare.
+ * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
+ * up to bandwidth of 18Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ u32 seq = TCP_SKB_CB(skb)->seq;
+ u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+ return (/* 1. Pure ACK with correct sequence number. */
+ (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+
+ /* 2. ... and duplicate ACK. */
+ ack == tp->snd_una &&
+
+ /* 3. ... and does not update window. */
+ !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+
+ /* 4. ... and sits in replay window. */
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
+}
+
+static inline bool tcp_paws_discard(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
+ !tcp_disordered_ack(sk, skb);
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits to the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. Peer still did not advance his SND.UNA when we
+ * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
+ * (borrowed from freebsd)
+ */
+
+static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+ return !before(end_seq, tp->rcv_wup) &&
+ !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+}
+
+/* When we get a reset we do this. */
+void tcp_reset(struct sock *sk)
+{
+ /* We want the right error as BSD sees it (and indeed as we do). */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ sk->sk_err = ECONNREFUSED;
+ break;
+ case TCP_CLOSE_WAIT:
+ sk->sk_err = EPIPE;
+ break;
+ case TCP_CLOSE:
+ return;
+ default:
+ sk->sk_err = ECONNRESET;
+ }
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+}
+
+/*
+ * Process the FIN bit. This now behaves as it is supposed to work
+ * and the FIN takes effect when it is validly part of sequence
+ * space. Not before when we get holes.
+ *
+ * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ * (and thence onto LAST-ACK and finally, CLOSE, we never enter
+ * TIME-WAIT)
+ *
+ * If we are in FINWAIT-1, a received FIN indicates simultaneous
+ * close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+static void tcp_fin(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst;
+
+ inet_csk_schedule_ack(sk);
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(sk, SOCK_DONE);
+
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ /* Move to CLOSE_WAIT */
+ tcp_set_state(sk, TCP_CLOSE_WAIT);
+ dst = __sk_dst_get(sk);
+ if (!dst || !dst_metric(dst, RTAX_QUICKACK))
+ inet_csk(sk)->icsk_ack.pingpong = 1;
+ break;
+
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ /* Received a retransmission of the FIN, do
+ * nothing.
+ */
+ break;
+ case TCP_LAST_ACK:
+ /* RFC793: Remain in the LAST-ACK state. */
+ break;
+
+ case TCP_FIN_WAIT1:
+ /* This case occurs when a simultaneous close
+ * happens, we must ack the received FIN and
+ * enter the CLOSING state.
+ */
+ tcp_send_ack(sk);
+ tcp_set_state(sk, TCP_CLOSING);
+ break;
+ case TCP_FIN_WAIT2:
+ /* Received a FIN -- send ACK and enter TIME_WAIT. */
+ tcp_send_ack(sk);
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ break;
+ default:
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
+ */
+ pr_err("%s: Impossible, sk->sk_state=%d\n",
+ __func__, sk->sk_state);
+ break;
+ }
+
+ /* It _is_ possible, that we have something out-of-order _after_ FIN.
+ * Probably, we should reset in this case. For now drop them.
+ */
+ __skb_queue_purge(&tp->out_of_order_queue);
+ if (tcp_is_sack(tp))
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+
+ /* Do not send POLL_HUP for half duplex close. */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ sk->sk_state == TCP_CLOSE)
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ }
+}
+
+static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+ u32 end_seq)
+{
+ if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
+ if (before(seq, sp->start_seq))
+ sp->start_seq = seq;
+ if (after(end_seq, sp->end_seq))
+ sp->end_seq = end_seq;
+ return true;
+ }
+ return false;
+}
+
+static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ int mib_idx;
+
+ if (before(seq, tp->rcv_nxt))
+ mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
+ else
+ mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ tp->rx_opt.dsack = 1;
+ tp->duplicate_sack[0].start_seq = seq;
+ tp->duplicate_sack[0].end_seq = end_seq;
+ }
+}
+
+static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->rx_opt.dsack)
+ tcp_dsack_set(sk, seq, end_seq);
+ else
+ tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+}
+
+static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_enter_quickack_mode(sk);
+
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+ end_seq = tp->rcv_nxt;
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
+ }
+ }
+
+ tcp_send_ack(sk);
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+ int this_sack;
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ struct tcp_sack_block *swalk = sp + 1;
+
+ /* See if the recent change to the first SACK eats into
+ * or hits the sequence space of other SACK blocks, if so coalesce.
+ */
+ for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
+ if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+ int i;
+
+ /* Zap SWALK, by moving every further SACK up by one slot.
+ * Decrease num_sacks.
+ */
+ tp->rx_opt.num_sacks--;
+ for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+ sp[i] = sp[i + 1];
+ continue;
+ }
+ this_sack++, swalk++;
+ }
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int cur_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ if (!cur_sacks)
+ goto new_sack;
+
+ for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
+ if (tcp_sack_extend(sp, seq, end_seq)) {
+ /* Rotate this_sack to the first one. */
+ for (; this_sack > 0; this_sack--, sp--)
+ swap(*sp, *(sp - 1));
+ if (cur_sacks > 1)
+ tcp_sack_maybe_coalesce(tp);
+ return;
+ }
+ }
+
+ /* Could not find an adjacent existing SACK, build a new one,
+ * put it at the front, and shift everyone else down. We
+ * always know there is at least one SACK present already here.
+ *
+ * If the sack array is full, forget about the last one.
+ */
+ if (this_sack >= TCP_NUM_SACKS) {
+ this_sack--;
+ tp->rx_opt.num_sacks--;
+ sp--;
+ }
+ for (; this_sack > 0; this_sack--, sp--)
+ *sp = *(sp - 1);
+
+new_sack:
+ /* Build the new head SACK, and we're done. */
+ sp->start_seq = seq;
+ sp->end_seq = end_seq;
+ tp->rx_opt.num_sacks++;
+}
+
+/* RCV.NXT advances, some SACKs should be eaten. */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int num_sacks = tp->rx_opt.num_sacks;
+ int this_sack;
+
+ /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+ if (skb_queue_empty(&tp->out_of_order_queue)) {
+ tp->rx_opt.num_sacks = 0;
+ return;
+ }
+
+ for (this_sack = 0; this_sack < num_sacks;) {
+ /* Check if the start of the sack is covered by RCV.NXT. */
+ if (!before(tp->rcv_nxt, sp->start_seq)) {
+ int i;
+
+ /* RCV.NXT must cover all the block! */
+ WARN_ON(before(tp->rcv_nxt, sp->end_seq));
+
+ /* Zap this SACK, by moving forward any other SACKS. */
+ for (i = this_sack+1; i < num_sacks; i++)
+ tp->selective_acks[i-1] = tp->selective_acks[i];
+ num_sacks--;
+ continue;
+ }
+ this_sack++;
+ sp++;
+ }
+ tp->rx_opt.num_sacks = num_sacks;
+}
+
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ * @fragstolen: pointer to boolean
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns true if caller should free @from instead of queueing it
+ */
+static bool tcp_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ int delta;
+
+ *fragstolen = false;
+
+ /* Its possible this segment overlaps with prior segment in queue */
+ if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
+ return false;
+
+ if (!skb_try_coalesce(to, from, fragstolen, &delta))
+ return false;
+
+ atomic_add(delta, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, delta);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+ TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+ TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
+ return true;
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 dsack_high = tp->rcv_nxt;
+ struct sk_buff *skb, *tail;
+ bool fragstolen, eaten;
+
+ while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+ if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+
+ if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+ __u32 dsack = dsack_high;
+ if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+ dsack_high = TCP_SKB_CB(skb)->end_seq;
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+ }
+
+ __skb_unlink(skb, &tp->out_of_order_queue);
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
+ __kfree_skb(skb);
+ continue;
+ }
+ SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ tail = skb_peek_tail(&sk->sk_receive_queue);
+ eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ if (!eaten)
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
+ }
+}
+
+static bool tcp_prune_ofo_queue(struct sock *sk);
+static int tcp_prune_queue(struct sock *sk);
+
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, skb, size)) {
+
+ if (tcp_prune_queue(sk) < 0)
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size)) {
+ if (!tcp_prune_ofo_queue(sk))
+ return -1;
+
+ if (!sk_rmem_schedule(sk, skb, size))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb1;
+ u32 seq, end_seq;
+
+ tcp_ecn_check_ce(tp, skb);
+
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+ inet_csk_schedule_ack(sk);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+ skb1 = skb_peek_tail(&tp->out_of_order_queue);
+ if (!skb1) {
+ /* Initial out of order segment, build 1 SACK. */
+ if (tcp_is_sack(tp)) {
+ tp->rx_opt.num_sacks = 1;
+ tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+ tp->selective_acks[0].end_seq =
+ TCP_SKB_CB(skb)->end_seq;
+ }
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ goto end;
+ }
+
+ seq = TCP_SKB_CB(skb)->seq;
+ end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (seq == TCP_SKB_CB(skb1)->end_seq) {
+ bool fragstolen;
+
+ if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ } else {
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+ skb = NULL;
+ }
+
+ if (!tp->rx_opt.num_sacks ||
+ tp->selective_acks[0].end_seq != seq)
+ goto add_sack;
+
+ /* Common case: data arrive in order after hole. */
+ tp->selective_acks[0].end_seq = end_seq;
+ goto end;
+ }
+
+ /* Find place to insert this segment. */
+ while (1) {
+ if (!after(TCP_SKB_CB(skb1)->seq, seq))
+ break;
+ if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+ skb1 = NULL;
+ break;
+ }
+ skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+ }
+
+ /* Do skb overlap to previous one? */
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ /* All the bits are present. Drop. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+ }
+ if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+ /* Partial overlap. */
+ tcp_dsack_set(sk, seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ } else {
+ if (skb_queue_is_first(&tp->out_of_order_queue,
+ skb1))
+ skb1 = NULL;
+ else
+ skb1 = skb_queue_prev(
+ &tp->out_of_order_queue,
+ skb1);
+ }
+ }
+ if (!skb1)
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ else
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+ /* And clean segments covered by new one as whole. */
+ while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+ skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
+ tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ __kfree_skb(skb1);
+ }
+
+add_sack:
+ if (tcp_is_sack(tp))
+ tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+ if (skb) {
+ tcp_grow_window(sk, skb);
+ skb_set_owner_r(skb, sk);
+ }
+}
+
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+ bool *fragstolen)
+{
+ int eaten;
+ struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
+
+ __skb_pull(skb, hdrlen);
+ eaten = (tail &&
+ tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
+ tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
+ if (!eaten) {
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ }
+ return eaten;
+}
+
+int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_buff *skb;
+ bool fragstolen;
+
+ if (size == 0)
+ return 0;
+
+ skb = alloc_skb(size, sk->sk_allocation);
+ if (!skb)
+ goto err;
+
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto err_free;
+
+ if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ goto err_free;
+
+ TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
+ TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
+
+ if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ WARN_ON_ONCE(fragstolen); /* should not happen */
+ __kfree_skb(skb);
+ }
+ return size;
+
+err_free:
+ kfree_skb(skb);
+err:
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_TCP_STEALTH
+static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 hash;
+ __be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1);
+ char *data = skb->data + th->doff * 4;
+ int len = skb->len - th->doff * 4;
+
+ if (len < tp->stealth.integrity_len)
+ return 1;
+
+ if (tcp_stealth_integrity(&hash, tp->stealth.secret, data,
+ tp->stealth.integrity_len))
+ return 1;
+
+ if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash))
+ return 1;
+
+ tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN;
+ return 0;
+}
+#endif
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int eaten = -1;
+ bool fragstolen = false;
+
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
+ goto drop;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+ __tcp_stealth_integrity_check(sk, skb)) {
+ tcp_reset(sk);
+ goto drop;
+ }
+#endif
+
+ skb_dst_drop(skb);
+ __skb_pull(skb, tcp_hdr(skb)->doff * 4);
+
+ tcp_ecn_accept_cwr(tp, skb);
+
+ tp->rx_opt.dsack = 0;
+
+ /* Queue data for delivery to the user.
+ * Packets in sequence go to the receive queue.
+ * Out of sequence packets to the out_of_order_queue.
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (tcp_receive_window(tp) == 0)
+ goto out_of_window;
+
+ /* Ok. In sequence. In window. */
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
+ sock_owned_by_user(sk) && !tp->urg_data) {
+ int chunk = min_t(unsigned int, skb->len,
+ tp->ucopy.len);
+
+ __set_current_state(TASK_RUNNING);
+
+ local_bh_enable();
+ if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ eaten = (chunk == skb->len);
+ tcp_rcv_space_adjust(sk);
+ }
+ local_bh_disable();
+ }
+
+ if (eaten <= 0) {
+queue_and_out:
+ if (eaten < 0 &&
+ tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ }
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ if (skb->len)
+ tcp_event_data_recv(sk, skb);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ tcp_ofo_queue(sk);
+
+ /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ * gap in queue is filled.
+ */
+ if (skb_queue_empty(&tp->out_of_order_queue))
+ inet_csk(sk)->icsk_ack.pingpong = 0;
+ }
+
+ if (tp->rx_opt.num_sacks)
+ tcp_sack_remove(tp);
+
+ tcp_fast_path_check(sk);
+
+ if (eaten > 0)
+ kfree_skb_partial(skb, fragstolen);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return;
+ }
+
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ /* A retransmit, 2nd most common case. Force an immediate ack. */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+ tcp_enter_quickack_mode(sk);
+ inet_csk_schedule_ack(sk);
+drop:
+ __kfree_skb(skb);
+ return;
+ }
+
+ /* Out of window. F.e. zero window probe. */
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ goto out_of_window;
+
+ tcp_enter_quickack_mode(sk);
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* Partial packet, seq < rcv_next < end_seq */
+ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+ tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq);
+
+ tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+
+ /* If window is closed, drop tail of packet. But after
+ * remembering D-SACK for its head made in previous line.
+ */
+ if (!tcp_receive_window(tp))
+ goto out_of_window;
+ goto queue_and_out;
+ }
+
+ tcp_data_queue_ofo(sk, skb);
+}
+
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+ struct sk_buff_head *list)
+{
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(list, skb))
+ next = skb_queue_next(list, skb);
+
+ __skb_unlink(skb, list);
+ __kfree_skb(skb);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+ return next;
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the list.
+ *
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ struct sk_buff *head, struct sk_buff *tail,
+ u32 start, u32 end)
+{
+ struct sk_buff *skb, *n;
+ bool end_of_skbs;
+
+ /* First, check that queue is collapsible and find
+ * the point where collapsing can be useful. */
+ skb = head;
+restart:
+ end_of_skbs = true;
+ skb_queue_walk_from_safe(list, skb, n) {
+ if (skb == tail)
+ break;
+ /* No new bits? It is possible on ofo queue. */
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ skb = tcp_collapse_one(sk, skb, list);
+ if (!skb)
+ break;
+ goto restart;
+ }
+
+ /* The first skb to collapse is:
+ * - not SYN/FIN and
+ * - bloated or contains data before "start" or
+ * overlaps to the next one.
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
+ (tcp_win_from_space(skb->truesize) > skb->len ||
+ before(TCP_SKB_CB(skb)->seq, start))) {
+ end_of_skbs = false;
+ break;
+ }
+
+ if (!skb_queue_is_last(list, skb)) {
+ struct sk_buff *next = skb_queue_next(list, skb);
+ if (next != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+ end_of_skbs = false;
+ break;
+ }
+ }
+
+ /* Decided to skip this, advance start seq. */
+ start = TCP_SKB_CB(skb)->end_seq;
+ }
+ if (end_of_skbs ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ return;
+
+ while (before(start, end)) {
+ int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
+ struct sk_buff *nskb;
+
+ nskb = alloc_skb(copy, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+ __skb_queue_before(list, skb, nskb);
+ skb_set_owner_r(nskb, sk);
+
+ /* Copy data, releasing collapsed skbs. */
+ while (copy > 0) {
+ int offset = start - TCP_SKB_CB(skb)->seq;
+ int size = TCP_SKB_CB(skb)->end_seq - start;
+
+ BUG_ON(offset < 0);
+ if (size > 0) {
+ size = min(copy, size);
+ if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+ BUG();
+ TCP_SKB_CB(nskb)->end_seq += size;
+ copy -= size;
+ start += size;
+ }
+ if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+ skb = tcp_collapse_one(sk, skb, list);
+ if (!skb ||
+ skb == tail ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ return;
+ }
+ }
+ }
+}
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+ struct sk_buff *head;
+ u32 start, end;
+
+ if (!skb)
+ return;
+
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ head = skb;
+
+ for (;;) {
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+ next = skb_queue_next(&tp->out_of_order_queue, skb);
+ skb = next;
+
+ /* Segment is terminated when we see gap or when
+ * we are at the end of all the queue. */
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
+ head = skb;
+ if (!skb)
+ break;
+ /* Start new segment */
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
+ } else {
+ if (before(TCP_SKB_CB(skb)->seq, start))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+ end = TCP_SKB_CB(skb)->end_seq;
+ }
+ }
+}
+
+/*
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+static bool tcp_prune_ofo_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool res = false;
+
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ sk_mem_reclaim(sk);
+ res = true;
+ }
+ return res;
+}
+
+/* Reduce allocated memory if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int tcp_prune_queue(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk);
+ else if (sk_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ skb_peek(&sk->sk_receive_queue),
+ NULL,
+ tp->copied_seq, tp->rcv_nxt);
+ sk_mem_reclaim(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* Collapsing did not help, destructive actions follow.
+ * This must not ever occur. */
+
+ tcp_prune_ofo_queue(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
+ /* If we are really being abused, tell the caller to silently
+ * drop receive data on the floor. It will get retransmitted
+ * and hopefully then we'll have sufficient space.
+ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+
+ /* Massive buffer overcommit. */
+ tp->pred_flags = 0;
+ return -1;
+}
+
+static bool tcp_should_expand_sndbuf(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If the user specified a specific send buffer setting, do
+ * not modify it.
+ */
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return false;
+
+ /* If we are under global TCP memory pressure, do not expand. */
+ if (sk_under_memory_pressure(sk))
+ return false;
+
+ /* If we are under soft global TCP memory pressure, do not expand. */
+ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
+
+ /* If we filled the congestion window, do not expand. */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return false;
+
+ return true;
+}
+
+/* When incoming ACK allowed to free some skb from write_queue,
+ * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
+ * on the exit from tcp input handler.
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
+ */
+static void tcp_new_space(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_should_expand_sndbuf(sk)) {
+ tcp_sndbuf_expand(sk);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ }
+
+ sk->sk_write_space(sk);
+}
+
+static void tcp_check_space(struct sock *sk)
+{
+ if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+ sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+ /* pairs with tcp_poll() */
+ smp_mb__after_atomic();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_new_space(sk);
+ }
+}
+
+static inline void tcp_data_snd_check(struct sock *sk)
+{
+ tcp_push_pending_frames(sk);
+ tcp_check_space(sk);
+}
+
+/*
+ * Check if sending an ack is needed.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* More than one full frame received... */
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+ /* ... and right edge of window advances far enough.
+ * (tcp_recvmsg() will send ACK otherwise). Or...
+ */
+ __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ /* We ACK each frame or... */
+ tcp_in_quickack_mode(sk) ||
+ /* We have out of order data. */
+ (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ /* Then ack it now */
+ tcp_send_ack(sk);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(sk);
+ }
+}
+
+static inline void tcp_ack_snd_check(struct sock *sk)
+{
+ if (!inet_csk_ack_scheduled(sk)) {
+ /* We sent a data segment already. */
+ return;
+ }
+ __tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ * This routine is only called when we have urgent data
+ * signaled. Its the 'slow' part of tcp_urg. It could be
+ * moved inline now as tcp_urg is only called from one
+ * place. We handle URGent data wrong. We have to - as
+ * BSD still doesn't use the correction from RFC961.
+ * For 1003.1g we should support a new option TCP_STDURG to permit
+ * either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 ptr = ntohs(th->urg_ptr);
+
+ if (ptr && !sysctl_tcp_stdurg)
+ ptr--;
+ ptr += ntohl(th->seq);
+
+ /* Ignore urgent data that we've already seen and read. */
+ if (after(tp->copied_seq, ptr))
+ return;
+
+ /* Do not replay urg ptr.
+ *
+ * NOTE: interesting situation not covered by specs.
+ * Misbehaving sender may send urg ptr, pointing to segment,
+ * which we already have in ofo queue. We are not able to fetch
+ * such data and will stay in TCP_URG_NOTYET until will be eaten
+ * by recvmsg(). Seems, we are not obliged to handle such wicked
+ * situations. But it is worth to think about possibility of some
+ * DoSes using some hypothetical application level deadlock.
+ */
+ if (before(ptr, tp->rcv_nxt))
+ return;
+
+ /* Do we already have a newer (or duplicate) urgent pointer? */
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
+ return;
+
+ /* Tell the world about our new urgent pointer. */
+ sk_send_sigurg(sk);
+
+ /* We may be adding urgent data when the last byte read was
+ * urgent. To do this requires some care. We cannot just ignore
+ * tp->copied_seq since we would read the last urgent byte again
+ * as data, nor can we alter copied_seq until this data arrives
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
+ *
+ * NOTE. Double Dutch. Rendering to plain English: author of comment
+ * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
+ * and expect that both A and B disappear from stream. This is _wrong_.
+ * Though this happens in BSD with high probability, this is occasional.
+ * Any application relying on this is buggy. Note also, that fix "works"
+ * only in this artificial test. Insert some normal data between A and B and we will
+ * decline of BSD again. Verdict: it is better to remove to trap
+ * buggy users.
+ */
+ if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+ !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ tp->copied_seq++;
+ if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ __kfree_skb(skb);
+ }
+ }
+
+ tp->urg_data = TCP_URG_NOTYET;
+ tp->urg_seq = ptr;
+
+ /* Disable header prediction. */
+ tp->pred_flags = 0;
+}
+
+/* This is the 'fast' part of urgent handling. */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check if we get a new urgent pointer - normally not. */
+ if (th->urg)
+ tcp_check_urg(sk, th);
+
+ /* Do we wait for any urgent data? - normally not... */
+ if (tp->urg_data == TCP_URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+ th->syn;
+
+ /* Is the urgent pointer pointing into this packet? */
+ if (ptr < skb->len) {
+ u8 tmp;
+ if (skb_copy_bits(skb, ptr, &tmp, 1))
+ BUG();
+ tp->urg_data = TCP_URG_VALID | tmp;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ }
+ }
+}
+
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int chunk = skb->len - hlen;
+ int err;
+
+ local_bh_enable();
+ if (skb_csum_unnecessary(skb))
+ err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
+ else
+ err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
+
+ if (!err) {
+ tp->ucopy.len -= chunk;
+ tp->copied_seq += chunk;
+ tcp_rcv_space_adjust(sk);
+ }
+
+ local_bh_disable();
+ return err;
+}
+
+static __sum16 __tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
+{
+ __sum16 result;
+
+ if (sock_owned_by_user(sk)) {
+ local_bh_enable();
+ result = __tcp_checksum_complete(skb);
+ local_bh_disable();
+ } else {
+ result = __tcp_checksum_complete(skb);
+ }
+ return result;
+}
+
+static inline bool tcp_checksum_complete_user(struct sock *sk,
+ struct sk_buff *skb)
+{
+ return !skb_csum_unnecessary(skb) &&
+ __tcp_checksum_complete_user(sk, skb);
+}
+
+/* Does PAWS and seqno based validation of an incoming segment, flags will
+ * play significant role here.
+ */
+static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, int syn_inerr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* RFC1323: H1. Apply PAWS check first. */
+ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ tcp_paws_discard(sk, skb)) {
+ if (!th->rst) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+ }
+ /* Reset is accepted even if it did not pass PAWS. */
+ }
+
+ /* Step 1: check sequence number */
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ /* RFC793, page 37: "In all states except SYN-SENT, all reset
+ * (RST) segments are validated by checking their SEQ-fields."
+ * And page 69: "If an incoming segment is not acceptable,
+ * an acknowledgment should be sent in reply (unless the RST
+ * bit is set, if so drop the segment and return)".
+ */
+ if (!th->rst) {
+ if (th->syn)
+ goto syn_challenge;
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSEQ,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ }
+ goto discard;
+ }
+
+ /* Step 2: check RST bit */
+ if (th->rst) {
+ /* RFC 5961 3.2 :
+ * If sequence number exactly matches RCV.NXT, then
+ * RESET the connection
+ * else
+ * Send a challenge ACK
+ */
+ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+ tcp_reset(sk);
+ else
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
+
+ /* step 3: check security and precedence [ignored] */
+
+ /* step 4: Check for a SYN
+ * RFC 5961 4.2 : Send a challenge ack
+ */
+ if (th->syn) {
+syn_challenge:
+ if (syn_inerr)
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
+
+ return true;
+
+discard:
+ __kfree_skb(skb);
+ return false;
+}
+
+/*
+ * TCP receive function for the ESTABLISHED state.
+ *
+ * It is split into a fast path and a slow path. The fast path is
+ * disabled when:
+ * - A zero window was announced from us - zero window probing
+ * is only handled properly in the slow path.
+ * - Out of order segments arrived.
+ * - Urgent data is expected.
+ * - There is no buffer space left
+ * - Unexpected TCP flags/window values/header lengths are received
+ * (detected by checking the TCP header against pred_flags)
+ * - Data is sent in both directions. Fast path only supports pure senders
+ * or pure receivers (this means either the sequence number or the ack
+ * value must stay constant)
+ * - Unexpected TCP option.
+ *
+ * When these conditions are not satisfied it drops into a standard
+ * receive procedure patterned after RFC793 to handle all cases.
+ * The first three cases are guaranteed by proper pred_flags setting,
+ * the rest is checked inline. Fast processing is turned on in
+ * tcp_data_queue when everything is OK.
+ */
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (unlikely(!sk->sk_rx_dst))
+ inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ /*
+ * Header prediction.
+ * The code loosely follows the one in the famous
+ * "30 instruction TCP receive" Van Jacobson mail.
+ *
+ * Van's trick is to deposit buffers into socket queue
+ * on a device interrupt, to call tcp_recv function
+ * on the receive process context and checksum and copy
+ * the buffer to user space. smart...
+ *
+ * Our current scheme is not silly either but we take the
+ * extra cost of the net_bh soft interrupt processing...
+ * We do checksum and copy also but from device to kernel.
+ */
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ /* pred_flags is 0xS?10 << 16 + snd_wnd
+ * if header_prediction is to be made
+ * 'S' will always be tp->tcp_header_len >> 2
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
+ * space for instance)
+ * PSH flag is ignored.
+ */
+
+ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ int tcp_header_len = tp->tcp_header_len;
+
+ /* Timestamp header prediction: tcp_header_len
+ * is automatically equal to th->doff*4 due to pred_flags
+ * match.
+ */
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ /* No? Slow path! */
+ if (!tcp_parse_aligned_timestamp(tp, th))
+ goto slow_path;
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ goto slow_path;
+
+ /* DO NOT update ts_recent here, if checksum fails
+ * and timestamp was corrupted part, it will result
+ * in a hung connection since we will drop all
+ * future packets due to the PAWS test.
+ */
+ }
+
+ if (len <= tcp_header_len) {
+ /* Bulk data transfer: sender */
+ if (len == tcp_header_len) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ /* We know that such packets are checksummed
+ * on entry.
+ */
+ tcp_ack(sk, skb, 0);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return;
+ } else { /* Header too small */
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+ }
+ } else {
+ int eaten = 0;
+ bool fragstolen = false;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (unlikely(tp->stealth.mode &
+ TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+ __tcp_stealth_integrity_check(sk, skb)) {
+ tcp_reset(sk);
+ goto discard;
+ }
+#endif
+
+ if (tp->ucopy.task == current &&
+ tp->copied_seq == tp->rcv_nxt &&
+ len - tcp_header_len <= tp->ucopy.len &&
+ sock_owned_by_user(sk)) {
+ __set_current_state(TASK_RUNNING);
+
+ if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) +
+ TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ __skb_pull(skb, tcp_header_len);
+ tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ eaten = 1;
+ }
+ }
+ if (!eaten) {
+ if (tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
+
+ /* Predicted packet is in window by definition.
+ * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+ * Hence, check seq<=rcv_wup reduces to:
+ */
+ if (tcp_header_len ==
+ (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+ tp->rcv_nxt == tp->rcv_wup)
+ tcp_store_ts_recent(tp);
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+ /* Bulk data transfer: receiver */
+ eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
+ &fragstolen);
+ }
+
+ tcp_event_data_recv(sk, skb);
+
+ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+ /* Well, only one small jumplet in fast path... */
+ tcp_ack(sk, skb, FLAG_DATA);
+ tcp_data_snd_check(sk);
+ if (!inet_csk_ack_scheduled(sk))
+ goto no_ack;
+ }
+
+ __tcp_ack_snd_check(sk, 0);
+no_ack:
+ if (eaten)
+ kfree_skb_partial(skb, fragstolen);
+ sk->sk_data_ready(sk);
+ return;
+ }
+ }
+
+slow_path:
+ if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ goto csum_error;
+
+ if (!th->ack && !th->rst && !th->syn)
+ goto discard;
+
+ /*
+ * Standard slow path.
+ */
+
+ if (!tcp_validate_incoming(sk, skb, th, 1))
+ return;
+
+step5:
+ if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
+ goto discard;
+
+ tcp_rcv_rtt_measure_ts(sk, skb);
+
+ /* Process urgent data. */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ tcp_data_queue(sk, skb);
+
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ return;
+
+csum_error:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+
+discard:
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(tcp_rcv_established);
+
+void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (skb) {
+ icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
+ security_inet_conn_established(sk, skb);
+ }
+
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+
+ tcp_init_metrics(sk);
+
+ tcp_init_congestion_control(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data
+ * packet.
+ */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_init_buffer_space(sk);
+
+ if (sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+ if (!tp->rx_opt.snd_wscale)
+ __tcp_fast_path_on(tp, tp->snd_wnd);
+ else
+ tp->pred_flags = 0;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+}
+
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+ struct tcp_fastopen_cookie *cookie)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+ u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
+ bool syn_drop = false;
+
+ if (mss == tp->rx_opt.user_mss) {
+ struct tcp_options_received opt;
+
+ /* Get original SYNACK MSS value if user MSS sets mss_clamp */
+ tcp_clear_options(&opt);
+ opt.user_mss = opt.mss_clamp = 0;
+ tcp_parse_options(synack, &opt, 0, NULL);
+ mss = opt.mss_clamp;
+ }
+
+ if (!tp->syn_fastopen) {
+ /* Ignore an unsolicited cookie */
+ cookie->len = -1;
+ } else if (tp->total_retrans) {
+ /* SYN timed out and the SYN-ACK neither has a cookie nor
+ * acknowledges data. Presumably the remote received only
+ * the retransmitted (regular) SYNs: either the original
+ * SYN-data or the corresponding SYN-ACK was dropped.
+ */
+ syn_drop = (cookie->len < 0 && data);
+ } else if (cookie->len < 0 && !tp->syn_data) {
+ /* We requested a cookie but didn't get it. If we did not use
+ * the (old) exp opt format then try so next time (try_exp=1).
+ * Otherwise we go back to use the RFC7413 opt (try_exp=2).
+ */
+ try_exp = tp->syn_fastopen_exp ? 2 : 1;
+ }
+
+ tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
+
+ if (data) { /* Retransmit unacked data in SYN */
+ tcp_for_write_queue_from(data, sk) {
+ if (data == tcp_send_head(sk) ||
+ __tcp_retransmit_skb(sk, data))
+ break;
+ }
+ tcp_rearm_rto(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ return true;
+ }
+ tp->syn_data_acked = tp->syn_data;
+ if (tp->syn_data_acked)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ return false;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int saved_clamp = tp->rx_opt.mss_clamp;
+
+ tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+ tp->rx_opt.rcv_tsecr -= tp->tsoffset;
+
+ if (th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ */
+ if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ goto reset_and_undo;
+
+ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+ tcp_time_stamp)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+ goto reset_and_undo;
+ }
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+ if (!th->syn)
+ goto discard_and_undo;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ */
+
+ tcp_ecn_rcv_synack(tp, th);
+
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+
+ if (!tp->rx_opt.wscale_ok) {
+ tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp, 65535U);
+ }
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+ tcp_store_ts_recent(tp);
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ if (tcp_is_sack(tp) && sysctl_tcp_fack)
+ tcp_enable_fack(tp);
+
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ /* Remember, tcp_poll() does not lock socket!
+ * Change state from SYN-SENT only after copied_seq
+ * is initialized. */
+ tp->copied_seq = tp->rcv_nxt;
+
+ smp_mb();
+
+ tcp_finish_connect(sk, skb);
+
+ if ((tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc))
+ return -1;
+
+ if (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+ icsk->icsk_ack.pingpong) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+ * It may be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever, that I was not able
+ * to stand against the temptation 8) --ANK
+ */
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+ tcp_enter_quickack_mode(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
+
+discard:
+ __kfree_skb(skb);
+ return 0;
+ } else {
+ tcp_send_ack(sk);
+ }
+ return -1;
+ }
+
+ /* No ACK in the segment */
+
+ if (th->rst) {
+ /* rfc793:
+ * "If the RST bit is set
+ *
+ * Otherwise (no ACK) drop the segment and return."
+ */
+
+ goto discard_and_undo;
+ }
+
+ /* PAWS check. */
+ if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+ tcp_paws_reject(&tp->rx_opt, 0))
+ goto discard_and_undo;
+
+ if (th->syn) {
+ /* We see SYN without ACK. It is attempt of
+ * simultaneous connect with crossed SYNs.
+ * Particularly, it can be connect to self.
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+
+ if (tp->rx_opt.saw_tstamp) {
+ tp->rx_opt.tstamp_ok = 1;
+ tcp_store_ts_recent(tp);
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ }
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = ntohs(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->max_window = tp->snd_wnd;
+
+ tcp_ecn_rcv_syn(tp, th);
+
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ tcp_send_synack(sk);
+#if 0
+ /* Note, we could accept data and URG from this segment.
+ * There are no obstacles to make this (except that we must
+ * either change tcp_recvmsg() to prevent it from returning data
+ * before 3WHS completes per RFC793, or employ TCP Fast Open).
+ *
+ * However, if we ignore data in ACKless segments sometimes,
+ * we have no reasons to accept it sometimes.
+ * Also, seems the code doing it in step6 of tcp_rcv_state_process
+ * is not flawless. So, discard packet for sanity.
+ * Uncomment this return to process the data.
+ */
+ return -1;
+#else
+ goto discard;
+#endif
+ }
+ /* "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ */
+
+discard_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ goto discard;
+
+reset_and_undo:
+ tcp_clear_options(&tp->rx_opt);
+ tp->rx_opt.mss_clamp = saved_clamp;
+ return 1;
+}
+
+/*
+ * This function implements the receiving procedure of RFC 793 for
+ * all states except ESTABLISHED and TIME_WAIT.
+ * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ * address independent.
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ int queued = 0;
+ bool acceptable;
+ u32 synack_stamp;
+
+ tp->rx_opt.saw_tstamp = 0;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ goto discard;
+
+ case TCP_LISTEN:
+ if (th->ack)
+ return 1;
+
+ if (th->rst)
+ goto discard;
+
+ if (th->syn) {
+ if (th->fin)
+ goto discard;
+ if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
+ return 1;
+
+ /* Now we have several options: In theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the syn, BSD accepts data with the
+ * syn up to the [to be] advertised window and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it, that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * future to drop through and process the data.
+ *
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
+ * But, this leaves one open to an easy denial of
+ * service attack, and SYN cookies can't defend
+ * against this problem. So, we drop the data
+ * in the interest of security over speed unless
+ * it's still in use.
+ */
+ kfree_skb(skb);
+ return 0;
+ }
+ goto discard;
+
+ case TCP_SYN_SENT:
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ if (queued >= 0)
+ return queued;
+
+ /* Do step6 onward by hand. */
+ tcp_urg(sk, skb, th);
+ __kfree_skb(skb);
+ tcp_data_snd_check(sk);
+ return 0;
+ }
+
+ req = tp->fastopen_rsk;
+ if (req) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+
+ if (!tcp_check_req(sk, skb, req, true))
+ goto discard;
+ }
+
+ if (!th->ack && !th->rst && !th->syn)
+ goto discard;
+
+ if (!tcp_validate_incoming(sk, skb, th, 0))
+ return 0;
+
+ /* step 5: check the ACK field */
+ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT) > 0;
+
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ if (!acceptable)
+ return 1;
+
+ /* Once we leave TCP_SYN_RECV, we no longer need req
+ * so release it.
+ */
+ if (req) {
+ synack_stamp = tcp_rsk(req)->snt_synack;
+ tp->total_retrans = req->num_retrans;
+ reqsk_fastopen_remove(sk, req, false);
+ } else {
+ synack_stamp = tp->lsndtime;
+ /* Make sure socket is routed, for correct metrics. */
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_congestion_control(sk);
+
+ tcp_mtup_init(sk);
+ tp->copied_seq = tp->rcv_nxt;
+ tcp_init_buffer_space(sk);
+ }
+ smp_mb();
+ tcp_set_state(sk, TCP_ESTABLISHED);
+ sk->sk_state_change(sk);
+
+ /* Note, that this wakeup is only for marginal crossed SYN case.
+ * Passively open sockets are not waked up, because
+ * sk->sk_sleep == NULL and sk->sk_socket == NULL.
+ */
+ if (sk->sk_socket)
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+
+ tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+ tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_synack_rtt_meas(sk, synack_stamp);
+
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+ if (req) {
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmitting any data sooner based on when they
+ * are sent out.
+ */
+ tcp_rearm_rto(sk);
+ } else
+ tcp_init_metrics(sk);
+
+ tcp_update_pacing_rate(sk);
+
+ /* Prevent spurious tcp_cwnd_restart() on first data packet */
+ tp->lsndtime = tcp_time_stamp;
+
+ tcp_initialize_rcv_mss(sk);
+ tcp_fast_path_on(tp);
+ break;
+
+ case TCP_FIN_WAIT1: {
+ struct dst_entry *dst;
+ int tmo;
+
+ /* If we enter the TCP_FIN_WAIT1 state and we are a
+ * Fast Open socket and this is the first acceptable
+ * ACK we have received, this would have acknowledged
+ * our SYNACK so stop the SYNACK timer.
+ */
+ if (req) {
+ /* Return RST if ack_seq is invalid.
+ * Note that RFC793 only says to generate a
+ * DUPACK for it but for TCP Fast Open it seems
+ * better to treat this case like TCP_SYN_RECV
+ * above.
+ */
+ if (!acceptable)
+ return 1;
+ /* We no longer need the request sock. */
+ reqsk_fastopen_remove(sk, req, false);
+ tcp_rearm_rto(sk);
+ }
+ if (tp->snd_una != tp->write_seq)
+ break;
+
+ tcp_set_state(sk, TCP_FIN_WAIT2);
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ dst = __sk_dst_get(sk);
+ if (dst)
+ dst_confirm(dst);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Wake up lingering close() */
+ sk->sk_state_change(sk);
+ break;
+ }
+
+ if (tp->linger2 < 0 ||
+ (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ return 1;
+ }
+
+ tmo = tcp_fin_time(sk);
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ } else if (th->fin || sock_owned_by_user(sk)) {
+ /* Bad case. We could lose such FIN otherwise.
+ * It is not a big problem, but it looks confusing
+ * and not so rare event. We still can lose it now,
+ * if it spins in bh_lock_sock(), but it is really
+ * marginal case.
+ */
+ inet_csk_reset_keepalive_timer(sk, tmo);
+ } else {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto discard;
+ }
+ break;
+ }
+
+ case TCP_CLOSING:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+ goto discard;
+ }
+ break;
+
+ case TCP_LAST_ACK:
+ if (tp->snd_una == tp->write_seq) {
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+ goto discard;
+ }
+ break;
+ }
+
+ /* step 6: check the URG bit */
+ tcp_urg(sk, skb, th);
+
+ /* step 7: process the segment text */
+ switch (sk->sk_state) {
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSING:
+ case TCP_LAST_ACK:
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ break;
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ /* RFC 793 says to queue data in these states,
+ * RFC 1122 says we MUST send a reset.
+ * BSD 4.4 also does reset.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ tcp_reset(sk);
+ return 1;
+ }
+ }
+ /* Fall through */
+ case TCP_ESTABLISHED:
+ tcp_data_queue(sk, skb);
+ queued = 1;
+ break;
+ }
+
+ /* tcp_data could move socket to TIME-WAIT */
+ if (sk->sk_state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
+
+ if (!queued) {
+discard:
+ __kfree_skb(skb);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (family == AF_INET)
+ net_dbg_ratelimited("drop open request from %pI4/%u\n",
+ &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ net_dbg_ratelimited("drop open request from %pI6/%u\n",
+ &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
+ *
+ * If we receive a SYN packet with these bits set, it means a
+ * network is playing bad games with TOS bits. In order to
+ * avoid possible false congestion notifications, we disable
+ * TCP ECN negotiation.
+ *
+ * Exception: tcp_ca wants ECN. This is required for DCTCP
+ * congestion control: Linux DCTCP asserts ECT on all packets,
+ * including SYN, which is most optimal solution; however,
+ * others, such as FreeBSD do not.
+ */
+static void tcp_ecn_create_request(struct request_sock *req,
+ const struct sk_buff *skb,
+ const struct sock *listen_sk,
+ const struct dst_entry *dst)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct net *net = sock_net(listen_sk);
+ bool th_ecn = th->ece && th->cwr;
+ bool ect, ecn_ok;
+
+ if (!th_ecn)
+ return;
+
+ ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+
+ if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+ inet_rsk(req)->ecn_ok = 1;
+}
+
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+/*
+ * Return true if a syncookie should be sent
+ */
+static bool tcp_syn_flood_action(struct sock *sk,
+ const struct sk_buff *skb,
+ const char *proto)
+{
+ const char *msg = "Dropping request";
+ bool want_cookie = false;
+ struct listen_sock *lopt;
+
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies) {
+ msg = "Sending cookies";
+ want_cookie = true;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ } else
+#endif
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+
+ lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
+ if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) {
+ lopt->synflood_warned = 1;
+ pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
+ }
+ return want_cookie;
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tmp_opt;
+ struct request_sock *req;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
+ __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
+ bool want_cookie = false, fastopen;
+ struct flowi fl;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int err;
+
+
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
+
+
+ /* Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ req = inet_reqsk_alloc(rsk_ops, sk);
+ if (!req)
+ goto drop;
+
+ tcp_rsk(req)->af_specific = af_ops;
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = af_ops->mss_clamp;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+ if (want_cookie && !tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
+ af_ops->init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ if (!want_cookie && !isn) {
+ /* VJ's idea. We save last timestamp seen
+ * from the destination in peer table, when entering
+ * state TIME-WAIT, and check against it before
+ * accepting new connection request.
+ *
+ * If "isn" is not zero, this request hit alive
+ * timewait bucket, so that all the necessary checks
+ * are made in the function processing timewait state.
+ */
+ if (tcp_death_row.sysctl_tw_recycle) {
+ bool strict;
+
+ dst = af_ops->route_req(sk, &fl, req, &strict);
+
+ if (dst && strict &&
+ !tcp_peer_is_proven(req, dst, true,
+ tmp_opt.saw_tstamp)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ goto drop_and_release;
+ }
+ }
+ /* Kill the following clause, if you dislike this way. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (sysctl_max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst, false,
+ tmp_opt.saw_tstamp)) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations,
+ * proven to be alive.
+ * It means that we continue to communicate
+ * to destinations, already remembered
+ * to the moment of synflood.
+ */
+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+ rsk_ops->family);
+ goto drop_and_release;
+ }
+
+ isn = af_ops->init_seq(skb);
+ }
+ if (!dst) {
+ dst = af_ops->route_req(sk, &fl, req, NULL);
+ if (!dst)
+ goto drop_and_free;
+ }
+
+ tcp_ecn_create_request(req, skb, sk, dst);
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+ if (!tmp_opt.tstamp_ok)
+ inet_rsk(req)->ecn_ok = 0;
+ }
+
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = af_ops->send_synack(sk, dst, &fl, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
+ if (err || want_cookie)
+ goto drop_and_free;
+
+ tcp_rsk(req)->tfo_listener = false;
+ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
+
+ return 0;
+
+drop_and_release:
+ dst_release(dst);
+drop_and_free:
+ reqsk_free(req);
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000..1e0ce4d7b
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2468 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * IPv4 specific functions
+ *
+ *
+ * code split from:
+ * linux/ipv4/tcp.c
+ * linux/ipv4/tcp_input.c
+ * linux/ipv4/tcp_output.c
+ *
+ * See tcp.c for author information
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ * David S. Miller : New socket lookup architecture.
+ * This code is dedicated to John Dyson.
+ * David S. Miller : Change semantics of established hash,
+ * half is devoted to TIME_WAIT sockets
+ * and the rest go in the other half.
+ * Andi Kleen : Add support for syncookies and fixed
+ * some bugs: ip options weren't passed to
+ * the TCP layer, missed a check for an
+ * ACK bit.
+ * Andi Kleen : Implemented fast path mtu discovery.
+ * Fixed many serious bugs in the
+ * request_sock handling and moved
+ * most of it into the af independent code.
+ * Added tail drop and some other bugfixes.
+ * Added new listen semantics.
+ * Mike McLagan : Routing by source
+ * Juan Jose Ciarlante: ip_dynaddr bits
+ * Andi Kleen: various fixes.
+ * Vitaly E. Lavrov : Transparent proxy revived after year
+ * coma.
+ * Andi Kleen : Fix new listen.
+ * Andi Kleen : Fix accept error reporting.
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <linux/bottom_half.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/timewait_sock.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+#include <net/tcp_memcontrol.h>
+#include <net/busy_poll.h>
+#include <net/secure_seq.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+int sysctl_tcp_tw_reuse __read_mostly;
+int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
+
+#ifdef CONFIG_TCP_MD5SIG
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
+#endif
+
+struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
+
+static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+{
+ return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
+}
+
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it is safe provided sequence
+ spaces do not overlap i.e. at data rates <= 80Mbit/sec.
+
+ Actually, the idea is close to VJ's one, only timestamp cache is
+ held not per host, but per port pair and TW bucket is used as state
+ holder.
+
+ If TW bucket has been already destroyed we fall back to VJ's scheme
+ and use initial timestamp retrieved from peer table.
+ */
+ if (tcptw->tw_ts_recent_stamp &&
+ (!twp || (sysctl_tcp_tw_reuse &&
+ get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ sock_hold(sktw);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
+/* This will initiate an outgoing connection. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ int err;
+ struct ip_options_rcu *inet_opt;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt && inet_opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ nexthop = inet_opt->opt.faddr;
+ }
+
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_TCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ return err;
+ }
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (!inet_opt || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
+
+ if (!inet->inet_saddr)
+ inet->inet_saddr = fl4->saddr;
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
+
+ if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
+ /* Reset inherited state */
+ tp->rx_opt.ts_recent = 0;
+ tp->rx_opt.ts_recent_stamp = 0;
+ if (likely(!tp->repair))
+ tp->write_seq = 0;
+ }
+
+ if (tcp_death_row.sysctl_tw_recycle &&
+ !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
+ tcp_fetch_timewait_stamp(sk, &rt->dst);
+
+ inet->inet_dport = usin->sin_port;
+ sk_daddr_set(sk, daddr);
+
+ inet_csk(sk)->icsk_ext_hdr_len = 0;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+
+ tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
+
+ /* Socket identity is still unknown (sport may be zero).
+ * However we set state to SYN-SENT and not releasing socket
+ * lock select source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ tcp_set_state(sk, TCP_SYN_SENT);
+ err = inet_hash_connect(&tcp_death_row, sk);
+ if (err)
+ goto failure;
+
+ inet_set_txhash(sk);
+
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto failure;
+ }
+ /* OK, now commit destination to socket. */
+ sk->sk_gso_type = SKB_GSO_TCPV4;
+ sk_setup_caps(sk, &rt->dst);
+
+#ifdef CONFIG_TCP_STEALTH
+ /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+ * early as possible and thus move taking the snapshot of tcp_time_stamp
+ * here.
+ */
+ skb_mstamp_get(&tp->stealth.mstamp);
+
+ if (!tp->write_seq && likely(!tp->repair) &&
+ unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+ tp->write_seq = tcp_stealth_sequence_number(sk,
+ &inet->inet_daddr,
+ sizeof(inet->inet_daddr),
+ usin->sin_port);
+#endif
+
+ if (!tp->write_seq && likely(!tp->repair))
+ tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port);
+
+ inet->inet_id = tp->write_seq ^ jiffies;
+
+ err = tcp_connect(sk);
+
+ rt = NULL;
+ if (err)
+ goto failure;
+
+ return 0;
+
+failure:
+ /*
+ * This unhashes the socket and releases the local port,
+ * if necessary.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->inet_dport = 0;
+ return err;
+}
+EXPORT_SYMBOL(tcp_v4_connect);
+
+/*
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
+ */
+void tcp_v4_mtu_reduced(struct sock *sk)
+{
+ struct dst_entry *dst;
+ struct inet_sock *inet = inet_sk(sk);
+ u32 mtu = tcp_sk(sk)->mtu_info;
+
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
+ return;
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst);
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
+ inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+ tcp_sync_mss(sk, mtu);
+
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ } /* else let the usual retransmit timer handle it */
+}
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
+
+static void do_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
+
+/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
+void tcp_req_err(struct sock *sk, u32 seq)
+{
+ struct request_sock *req = inet_reqsk(sk);
+ struct net *net = sock_net(sk);
+
+ /* ICMPs are not backlogged, hence we cannot get
+ * an established socket here.
+ */
+ WARN_ON(req->sk);
+
+ if (seq != tcp_rsk(req)->snt_isn) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ reqsk_put(req);
+ } else {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ }
+}
+EXPORT_SYMBOL(tcp_req_err);
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code. After adjustment
+ * header points to the first 8 bytes of the tcp header. We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+
+void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
+ struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
+ struct inet_connection_sock *icsk;
+ struct tcp_sock *tp;
+ struct inet_sock *inet;
+ const int type = icmp_hdr(icmp_skb)->type;
+ const int code = icmp_hdr(icmp_skb)->code;
+ struct sock *sk;
+ struct sk_buff *skb;
+ struct request_sock *fastopen;
+ __u32 seq, snd_una;
+ __u32 remaining;
+ int err;
+ struct net *net = dev_net(icmp_skb->dev);
+
+ sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
+ th->dest, iph->saddr, ntohs(th->source),
+ inet_iif(icmp_skb));
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return;
+ }
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+ seq = ntohl(th->seq);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_err(sk, seq);
+
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ * We do take care of PMTU discovery (RFC1191) special case :
+ * we can receive locally generated ICMP messages while socket is held.
+ */
+ if (sock_owned_by_user(sk)) {
+ if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ }
+ if (sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
+ icsk = inet_csk(sk);
+ tp = tcp_sk(sk);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
+ if (sk->sk_state != TCP_LISTEN &&
+ !between(seq, snd_una, tp->snd_nxt)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ switch (type) {
+ case ICMP_REDIRECT:
+ do_redirect(icmp_skb, sk);
+ goto out;
+ case ICMP_SOURCE_QUENCH:
+ /* Just silently ignore these. */
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ tp->mtu_info = info;
+ if (!sock_owned_by_user(sk)) {
+ tcp_v4_mtu_reduced(sk);
+ } else {
+ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+ sock_hold(sk);
+ }
+ goto out;
+ }
+
+ err = icmp_err_convert[code].errno;
+ /* check if icmp_skb allows revert of backoff
+ * (see draft-zimmermann-tcp-lcd) */
+ if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
+ break;
+ if (seq != tp->snd_una || !icsk->icsk_retransmits ||
+ !icsk->icsk_backoff || fastopen)
+ break;
+
+ if (sock_owned_by_user(sk))
+ break;
+
+ icsk->icsk_backoff--;
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
+ TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+ skb = tcp_write_queue_head(sk);
+ BUG_ON(!skb);
+
+ remaining = icsk->icsk_rto -
+ min(icsk->icsk_rto,
+ tcp_time_stamp - tcp_skb_timestamp(skb));
+
+ if (remaining) {
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ remaining, TCP_RTO_MAX);
+ } else {
+ /* RTO revert clocked out retransmission.
+ * Will retransmit now */
+ tcp_retransmit_timer(sk);
+ }
+
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && !fastopen->sk)
+ break;
+
+ if (!sock_owned_by_user(sk)) {
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ } else {
+ sk->sk_err_soft = err;
+ }
+ goto out;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+ * rfc1122 4.2.3.9 allows to consider as hard errors
+ * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+ * but it is obsoleted by pmtu discovery).
+ *
+ * Note, that in modern internet, where routing is unreliable
+ * and in each dark corner broken firewalls sit, sending random
+ * errors ordered by their masters even this two messages finally lose
+ * their original sense (even Linux sends invalid PORT_UNREACHs)
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+ sk->sk_err_soft = err;
+ }
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ } else {
+ th->check = tcp_v4_check(skb->len, saddr, daddr,
+ csum_partial(th,
+ th->doff << 2,
+ skb->csum));
+ }
+}
+
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_send_check);
+
+/*
+ * This routine will send an RST to the other tcp.
+ *
+ * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
+ * for reset.
+ * Answer: if a packet caused RST, it is not for a socket
+ * existing in our system, if it is matched to a socket,
+ * it is just duplicate segment or bug in other side's TCP.
+ * So that we build reply only basing on parameters
+ * arrived with segment.
+ * Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+ struct tcphdr th;
+#ifdef CONFIG_TCP_MD5SIG
+ __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
+#endif
+ } rep;
+ struct ip_reply_arg arg;
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *key;
+ const __u8 *hash_location = NULL;
+ unsigned char newhash[16];
+ int genhash;
+ struct sock *sk1 = NULL;
+#endif
+ struct net *net;
+
+ /* Never send a reset in response to a reset. */
+ if (th->rst)
+ return;
+
+ /* If sk not NULL, it means we did a successful lookup and incoming
+ * route had to be correct. prequeue might have dropped our dst.
+ */
+ if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
+ return;
+
+ /* Swap the send and the receive. */
+ memset(&rep, 0, sizeof(rep));
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = sizeof(struct tcphdr) / 4;
+ rep.th.rst = 1;
+
+ if (th->ack) {
+ rep.th.seq = th->ack_seq;
+ } else {
+ rep.th.ack = 1;
+ rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+ skb->len - (th->doff << 2));
+ }
+
+ memset(&arg, 0, sizeof(arg));
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+
+ net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+#ifdef CONFIG_TCP_MD5SIG
+ hash_location = tcp_parse_md5sig_option(th);
+ if (!sk && hash_location) {
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = __inet_lookup_listener(net,
+ &tcp_hashinfo, ip_hdr(skb)->saddr,
+ th->source, ip_hdr(skb)->daddr,
+ ntohs(th->source), inet_iif(skb));
+ /* don't send rst if it can't find key */
+ if (!sk1)
+ return;
+ rcu_read_lock();
+ key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ if (!key)
+ goto release_sk1;
+
+ genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ goto release_sk1;
+ } else {
+ key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr,
+ AF_INET) : NULL;
+ }
+
+ if (key) {
+ rep.opt[0] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ /* Update length and the length the header thinks exists */
+ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+ key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+#endif
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ /* When socket is gone, all binding information is lost.
+ * routing might fail in this case. No choice here, if we choose to force
+ * input interface, we will misroute in case of asymmetric route.
+ */
+ if (sk)
+ arg.bound_dev_if = sk->sk_bound_dev_if;
+
+ arg.tos = ip_hdr(skb)->tos;
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
+
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+
+#ifdef CONFIG_TCP_MD5SIG
+release_sk1:
+ if (sk1) {
+ rcu_read_unlock();
+ sock_put(sk1);
+ }
+#endif
+}
+
+/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
+ outside socket context is ugly, certainly. What can I do?
+ */
+
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+ u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key,
+ int reply_flags, u8 tos)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct {
+ struct tcphdr th;
+ __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
+#ifdef CONFIG_TCP_MD5SIG
+ + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+ ];
+ } rep;
+ struct ip_reply_arg arg;
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ memset(&rep.th, 0, sizeof(struct tcphdr));
+ memset(&arg, 0, sizeof(arg));
+
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+ if (tsecr) {
+ rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ rep.opt[1] = htonl(tsval);
+ rep.opt[2] = htonl(tsecr);
+ arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+ }
+
+ /* Swap the send and the receive. */
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ rep.th.seq = htonl(seq);
+ rep.th.ack_seq = htonl(ack);
+ rep.th.ack = 1;
+ rep.th.window = htons(win);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (key) {
+ int offset = (tsecr) ? 3 : 0;
+
+ rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+ rep.th.doff = arg.iov[0].iov_len/4;
+
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
+ key, ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr, &rep.th);
+ }
+#endif
+ arg.flags = reply_flags;
+ arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr, /* XXX */
+ arg.iov[0].iov_len, IPPROTO_TCP, 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+ if (oif)
+ arg.bound_dev_if = oif;
+ arg.tos = tos;
+ ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+ skb, &TCP_SKB_CB(skb)->header.h4.opt,
+ ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
+ &arg, arg.iov[0].iov_len);
+
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+}
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent,
+ tw->tw_bound_dev_if,
+ tcp_twsk_md5_key(tcptw),
+ tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ tw->tw_tos
+ );
+
+ inet_twsk_put(tw);
+}
+
+static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
+{
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp,
+ req->ts_recent,
+ 0,
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ AF_INET),
+ inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+ ip_hdr(skb)->tos);
+}
+
+/*
+ * Send a SYN-ACK after having received a SYN.
+ * This still operates on a request_sock only, not on a big
+ * socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct flowi4 fl4;
+ int err = -1;
+ struct sk_buff *skb;
+
+ /* First, grab a route. */
+ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+ return -1;
+
+ skb = tcp_make_synack(sk, dst, req, foc);
+
+ if (skb) {
+ __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+
+ skb_set_queue_mapping(skb, queue_mapping);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
+ ireq->opt);
+ err = net_xmit_eval(err);
+ }
+
+ return err;
+}
+
+/*
+ * IPv4 request_sock destructor.
+ */
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
+{
+ kfree(inet_rsk(req)->opt);
+}
+
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * RFC2385 MD5 checksumming requires a mapping of
+ * IP address->MD5 Key.
+ * We need to maintain these in the sk structure.
+ */
+
+/* Find the Key structure for an address. */
+struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ unsigned int size = sizeof(struct in_addr);
+ const struct tcp_md5sig_info *md5sig;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
+ if (!md5sig)
+ return NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6)
+ size = sizeof(struct in6_addr);
+#endif
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+ if (!memcmp(&key->addr, addr, size))
+ return key;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
+
+struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+ const struct sock *addr_sk)
+{
+ const union tcp_md5_addr *addr;
+
+ addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
+ return tcp_md5_do_lookup(sk, addr, AF_INET);
+}
+EXPORT_SYMBOL(tcp_v4_md5_lookup);
+
+/* This can be called on a newly created socket, from other files */
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
+{
+ /* Add Key to the list */
+ struct tcp_md5sig_key *key;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_info *md5sig;
+
+ key = tcp_md5_do_lookup(sk, addr, family);
+ if (key) {
+ /* Pre-existing entry - just update that one. */
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ return 0;
+ }
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info,
+ sock_owned_by_user(sk));
+ if (!md5sig) {
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
+ return -ENOMEM;
+
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ }
+
+ key = sock_kmalloc(sk, sizeof(*key), gfp);
+ if (!key)
+ return -ENOMEM;
+ if (!tcp_alloc_md5sig_pool()) {
+ sock_kfree_s(sk, key, sizeof(*key));
+ return -ENOMEM;
+ }
+
+ memcpy(key->key, newkey, newkeylen);
+ key->keylen = newkeylen;
+ key->family = family;
+ memcpy(&key->addr, addr,
+ (family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
+ hlist_add_head_rcu(&key->node, &md5sig->head);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_do_add);
+
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
+{
+ struct tcp_md5sig_key *key;
+
+ key = tcp_md5_do_lookup(sk, addr, family);
+ if (!key)
+ return -ENOENT;
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_md5_do_del);
+
+static void tcp_clear_md5_list(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ struct hlist_node *n;
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
+
+ hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
+ hlist_del_rcu(&key->node);
+ atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
+ kfree_rcu(key, rcu);
+ }
+}
+
+static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ struct tcp_md5sig cmd;
+ struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ return -EFAULT;
+
+ if (sin->sin_family != AF_INET)
+ return -EINVAL;
+
+ if (!cmd.tcpm_keylen)
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET);
+
+ if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+ return -EINVAL;
+
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
+ AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ GFP_KERNEL);
+}
+
+static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
+ __be32 daddr, __be32 saddr, int nbytes)
+{
+ struct tcp4_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = &hp->md5_blk.ip4;
+
+ /*
+ * 1. the TCP pseudo-header (in the order: source IP address,
+ * destination IP address, zero-padded protocol number, and
+ * segment length)
+ */
+ bp->saddr = saddr;
+ bp->daddr = daddr;
+ bp->pad = 0;
+ bp->protocol = IPPROTO_TCP;
+ bp->len = cpu_to_be16(nbytes);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+}
+
+static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
+{
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+ if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+
+int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 saddr, daddr;
+
+ if (sk) { /* valid for establish/request sockets */
+ saddr = sk->sk_rcv_saddr;
+ daddr = sk->sk_daddr;
+ } else {
+ const struct iphdr *iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ }
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+
+ if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+
+/* Called with rcu_read_lock() */
+static bool tcp_v4_inbound_md5_hash(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ /*
+ * This gets called for each TCP segment that arrives
+ * so we want to be efficient.
+ * We have 3 drop cases:
+ * o No MD5 hash and one expected.
+ * o MD5 hash and we're not expecting one.
+ * o MD5 hash and its wrong.
+ */
+ const __u8 *hash_location = NULL;
+ struct tcp_md5sig_key *hash_expected;
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+ int genhash;
+ unsigned char newhash[16];
+
+ hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
+ AF_INET);
+ hash_location = tcp_parse_md5sig_option(th);
+
+ /* We've parsed the options - do we have a hash? */
+ if (!hash_expected && !hash_location)
+ return false;
+
+ if (hash_expected && !hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ return true;
+ }
+
+ if (!hash_expected && hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ return true;
+ }
+
+ /* Okay, so this is hash_expected and hash_location -
+ * so we need to calculate the checksum.
+ */
+ genhash = tcp_v4_md5_hash_skb(newhash,
+ hash_expected,
+ NULL, skb);
+
+ if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest),
+ genhash ? " tcp_v4_calc_md5_hash failed"
+ : "");
+ return true;
+ }
+ return false;
+}
+#endif
+
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->no_srccheck = inet_sk(sk_listener)->transparent;
+ ireq->opt = tcp_v4_save_options(skb);
+}
+
+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict)
+{
+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+
+ if (strict) {
+ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
+ *strict = true;
+ else
+ *strict = false;
+ }
+
+ return dst;
+}
+
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct tcp_request_sock),
+ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v4_reqsk_send_ack,
+ .destructor = tcp_v4_reqsk_destructor,
+ .send_reset = tcp_v4_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+};
+
+static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .mss_clamp = TCP_MSS_DEFAULT,
+#ifdef CONFIG_TCP_MD5SIG
+ .req_md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+#endif
+ .init_req = tcp_v4_init_req,
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v4_init_sequence,
+#endif
+ .route_req = tcp_v4_route_req,
+ .init_seq = tcp_v4_init_sequence,
+ .send_synack = tcp_v4_send_synack,
+ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
+};
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ /* Never answer to SYNs send to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ return tcp_conn_request(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
+
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_v4_conn_request);
+
+
+/*
+ * The three way handshake has completed - we got a valid synack -
+ * now create the new socket.
+ */
+struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *key;
+#endif
+ struct ip_options_rcu *inet_opt;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (!newsk)
+ goto exit_nonewsk;
+
+ newsk->sk_gso_type = SKB_GSO_TCPV4;
+ inet_sk_rx_dst_set(newsk, skb);
+
+ newtp = tcp_sk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+ sk_daddr_set(newsk, ireq->ir_rmt_addr);
+ sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ inet_opt = ireq->opt;
+ rcu_assign_pointer(newinet->inet_opt, inet_opt);
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
+ newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->rcv_tos = ip_hdr(skb)->tos;
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ inet_set_txhash(newsk);
+ if (inet_opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ newinet->inet_id = newtp->write_seq ^ jiffies;
+
+ if (!dst) {
+ dst = inet_csk_route_child_sock(sk, newsk, req);
+ if (!dst)
+ goto put_and_exit;
+ } else {
+ /* syncookie case : see end of cookie_v4_check() */
+ }
+ sk_setup_caps(newsk, dst);
+
+ tcp_ca_openreq_child(newsk, dst);
+
+ tcp_sync_mss(newsk, dst_mtu(dst));
+ newtp->advmss = dst_metric_advmss(dst);
+ if (tcp_sk(sk)->rx_opt.user_mss &&
+ tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+ newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
+ tcp_initialize_rcv_mss(newsk);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Copy over the MD5 key from the original socket */
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET);
+ if (key) {
+ /*
+ * We're using one, so create a matching key
+ * on the newsk structure. If we fail to get
+ * memory, then we end up not copying the key
+ * across. Shucks.
+ */
+ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
+ AF_INET, key->key, key->keylen, GFP_ATOMIC);
+ sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ }
+#endif
+
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ __inet_hash_nolisten(newsk, NULL);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
+exit:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
+}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
+ struct request_sock *req;
+ struct sock *nsk;
+
+ req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
+ if (req) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
+
+ nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
+ th->source, iph->daddr, th->dest, inet_iif(skb));
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+ }
+
+#ifdef CONFIG_SYN_COOKIES
+ if (!th->syn)
+ sk = cookie_v4_check(sk, skb);
+#endif
+ return sk;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th = tcp_hdr(skb);
+ struct sock *rsk;
+
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (dst) {
+ if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ !dst->ops->check(dst, 0)) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
+ }
+ tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ return 0;
+ }
+
+ if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ goto csum_err;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+ unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) &&
+ tcp_stealth_do_auth(sk, skb)) {
+ rsk = sk;
+ goto reset;
+ }
+#endif
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ if (nsk != sk) {
+ sock_rps_save_rxhash(nsk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (tcp_child_process(sk, nsk, skb)) {
+ rsk = nsk;
+ goto reset;
+ }
+ return 0;
+ }
+ } else
+ sock_rps_save_rxhash(sk, skb);
+
+ if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+ rsk = sk;
+ goto reset;
+ }
+ return 0;
+
+reset:
+ tcp_v4_send_reset(rsk, skb);
+discard:
+ kfree_skb(skb);
+ /* Be careful here. If this function gets more complicated and
+ * gcc suffers from register pressure on the x86, sk (in %ebx)
+ * might be destroyed here. This current version compiles correctly,
+ * but you have been warned.
+ */
+ return 0;
+
+csum_err:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+
+void tcp_v4_early_demux(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ skb->skb_iif);
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk_fullsock(sk)) {
+ struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst &&
+ inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+}
+
+/* Packet is added to VJ-style prequeue for processing in process
+ * context, if a reader task is waiting. Apparently, this exciting
+ * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
+ * failed somewhere. Latency? Burstiness? Well, at least now we will
+ * see, why it failed. 8)8) --ANK
+ *
+ */
+bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (sysctl_tcp_low_latency || !tp->ucopy.task)
+ return false;
+
+ if (skb->len <= tcp_hdrlen(skb) &&
+ skb_queue_len(&tp->ucopy.prequeue) == 0)
+ return false;
+
+ /* Before escaping RCU protected region, we need to take care of skb
+ * dst. Prequeue is only enabled for established sockets.
+ * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
+ * Instead of doing full sk_rx_dst validity here, let's perform
+ * an optimistic check.
+ */
+ if (likely(sk->sk_rx_dst))
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
+ __skb_queue_tail(&tp->ucopy.prequeue, skb);
+ tp->ucopy.memory += skb->truesize;
+ if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ struct sk_buff *skb1;
+
+ BUG_ON(sock_owned_by_user(sk));
+
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ sk_backlog_rcv(sk, skb1);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPPREQUEUEDROPPED);
+ }
+
+ tp->ucopy.memory = 0;
+ } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
+ wake_up_interruptible_sync_poll(sk_sleep(sk),
+ POLLIN | POLLRDNORM | POLLRDBAND);
+ if (!inet_csk_ack_scheduled(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ (3 * tcp_rto_min(sk)) / 4,
+ TCP_RTO_MAX);
+ }
+ return true;
+}
+EXPORT_SYMBOL(tcp_prequeue);
+
+/*
+ * From tcp_input.c
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct sock *sk;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto discard_it;
+
+ /* Count it even if it's bad */
+ TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ goto discard_it;
+
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ goto bad_packet;
+ if (!pskb_may_pull(skb, th->doff * 4))
+ goto discard_it;
+
+ /* An explanation is required here, I think.
+ * Packet length and doff are validated by header prediction,
+ * provided case of th->doff==0 is eliminated.
+ * So, we defer the checks. */
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
+ goto csum_error;
+
+ th = tcp_hdr(skb);
+ iph = ip_hdr(skb);
+ /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
+ * barrier() makes sure compiler wont play fool^Waliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
+ sizeof(struct inet_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff * 4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+ if (!sk)
+ goto no_tcp_socket;
+
+process:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+#ifdef CONFIG_TCP_MD5SIG
+ /*
+ * We really want to reject the packet as early as possible
+ * if:
+ * o We're expecting an MD5'd packet and this is no MD5 tcp option
+ * o There is an MD5 option and we're not expecting one
+ */
+ if (tcp_v4_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+#endif
+
+ nf_reset(skb);
+
+ if (sk_filter(sk, skb))
+ goto discard_and_relse;
+
+ sk_incoming_cpu_update(sk);
+ skb->dev = NULL;
+
+ bh_lock_sock_nested(sk);
+ ret = 0;
+ if (!sock_owned_by_user(sk)) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v4_do_rcv(sk, skb);
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
+ bh_unlock_sock(sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+ return ret;
+
+no_tcp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+
+ if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+csum_error:
+ TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+bad_packet:
+ TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ } else {
+ tcp_v4_send_reset(NULL, skb);
+ }
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto discard_it;
+ }
+
+ if (skb->len < (th->doff << 2)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto bad_packet;
+ }
+ if (tcp_checksum_complete(skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto csum_error;
+ }
+ switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ case TCP_TW_SYN: {
+ struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
+ &tcp_hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, th->dest,
+ inet_iif(skb));
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk));
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ goto process;
+ }
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+ tcp_v4_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
+ goto no_tcp_socket;
+ case TCP_TW_SUCCESS:;
+ }
+ goto discard_it;
+}
+
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+ .twsk_destructor= tcp_twsk_destructor,
+};
+
+void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst) {
+ dst_hold(dst);
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ }
+}
+EXPORT_SYMBOL(inet_sk_rx_dst_set);
+
+const struct inet_connection_sock_af_ops ipv4_specific = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = tcp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
+ .conn_request = tcp_v4_conn_request,
+ .syn_recv_sock = tcp_v4_syn_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+ .bind_conflict = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
+};
+EXPORT_SYMBOL(ipv4_specific);
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+ .md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+ .md5_parse = tcp_v4_parse_md5_keys,
+};
+#endif
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_init_sock(sk);
+
+ icsk->icsk_af_ops = &ipv4_specific;
+
+#ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+#endif
+
+ return 0;
+}
+
+void tcp_v4_destroy_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_clear_xmit_timers(sk);
+
+ tcp_cleanup_congestion_control(sk);
+
+ /* Cleanup up the write buffer. */
+ tcp_write_queue_purge(sk);
+
+ /* Cleans up our, hopefully empty, out_of_order_queue. */
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Clean up the MD5 key list, if any */
+ if (tp->md5sig_info) {
+ tcp_clear_md5_list(sk);
+ kfree_rcu(tp->md5sig_info, rcu);
+ tp->md5sig_info = NULL;
+ }
+#endif
+
+ /* Clean prequeue, it must be empty really */
+ __skb_queue_purge(&tp->ucopy.prequeue);
+
+ /* Clean up a referenced TCP bind bucket. */
+ if (inet_csk(sk)->icsk_bind_hash)
+ inet_put_port(sk);
+
+ BUG_ON(tp->fastopen_rsk);
+
+ /* If socket is aborted during connect operation */
+ tcp_free_fastopen_req(tp);
+
+ sk_sockets_allocated_dec(sk);
+ sock_release_memcg(sk);
+}
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+/*
+ * Get next listener socket follow cur. If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ */
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+ struct inet_connection_sock *icsk;
+ struct hlist_nulls_node *node;
+ struct sock *sk = cur;
+ struct inet_listen_hashbucket *ilb;
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ if (!sk) {
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_nulls_head(&ilb->head);
+ st->offset = 0;
+ goto get_sk;
+ }
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ ++st->num;
+ ++st->offset;
+
+ if (st->state == TCP_SEQ_STATE_OPENREQ) {
+ struct request_sock *req = cur;
+
+ icsk = inet_csk(st->syn_wait_sk);
+ req = req->dl_next;
+ while (1) {
+ while (req) {
+ if (req->rsk_ops->family == st->family) {
+ cur = req;
+ goto out;
+ }
+ req = req->dl_next;
+ }
+ if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
+ break;
+get_req:
+ req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
+ }
+ sk = sk_nulls_next(st->syn_wait_sk);
+ st->state = TCP_SEQ_STATE_LISTENING;
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ } else {
+ icsk = inet_csk(sk);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue))
+ goto start_req;
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ sk = sk_nulls_next(sk);
+ }
+get_sk:
+ sk_nulls_for_each_from(sk, node) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_family == st->family) {
+ cur = sk;
+ goto out;
+ }
+ icsk = inet_csk(sk);
+ spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
+start_req:
+ st->uid = sock_i_uid(sk);
+ st->syn_wait_sk = sk;
+ st->state = TCP_SEQ_STATE_OPENREQ;
+ st->sbucket = 0;
+ goto get_req;
+ }
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ }
+ spin_unlock_bh(&ilb->lock);
+ st->offset = 0;
+ if (++st->bucket < INET_LHTABLE_SIZE) {
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_nulls_head(&ilb->head);
+ goto get_sk;
+ }
+ cur = NULL;
+out:
+ return cur;
+}
+
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ st->offset = 0;
+ rc = listening_get_next(seq, NULL);
+
+ while (rc && *pos) {
+ rc = listening_get_next(seq, rc);
+ --*pos;
+ }
+ return rc;
+}
+
+static inline bool empty_bucket(const struct tcp_iter_state *st)
+{
+ return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+}
+
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
+static void *established_get_first(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+ void *rc = NULL;
+
+ st->offset = 0;
+ for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+
+ /* Lockless fast path for the common case of empty buckets */
+ if (empty_bucket(st))
+ continue;
+
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+ if (sk->sk_family != st->family ||
+ !net_eq(sock_net(sk), net)) {
+ continue;
+ }
+ rc = sk;
+ goto out;
+ }
+ spin_unlock_bh(lock);
+ }
+out:
+ return rc;
+}
+
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+ struct sock *sk = cur;
+ struct hlist_nulls_node *node;
+ struct tcp_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ ++st->num;
+ ++st->offset;
+
+ sk = sk_nulls_next(sk);
+
+ sk_nulls_for_each_from(sk, node) {
+ if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+ return sk;
+ }
+
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ ++st->bucket;
+ return established_get_first(seq);
+}
+
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ rc = established_get_first(seq);
+
+ while (rc && pos) {
+ rc = established_get_next(seq, rc);
+ --pos;
+ }
+ return rc;
+}
+
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ void *rc;
+ struct tcp_iter_state *st = seq->private;
+
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_idx(seq, &pos);
+
+ if (!rc) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ rc = established_get_idx(seq, pos);
+ }
+
+ return rc;
+}
+
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ int offset = st->offset;
+ int orig_num = st->num;
+ void *rc = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ if (st->bucket >= INET_LHTABLE_SIZE)
+ break;
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_next(seq, NULL);
+ while (offset-- && rc)
+ rc = listening_get_next(seq, rc);
+ if (rc)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ /* Fallthrough */
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (st->bucket > tcp_hashinfo.ehash_mask)
+ break;
+ rc = established_get_first(seq);
+ while (offset-- && rc)
+ rc = established_get_next(seq, rc);
+ }
+
+ st->num = orig_num;
+
+ return rc;
+}
+
+static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ if (*pos && *pos == st->last_pos) {
+ rc = tcp_seek_last_pos(seq);
+ if (rc)
+ goto out;
+ }
+
+ st->state = TCP_SEQ_STATE_LISTENING;
+ st->num = 0;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+ st->last_pos = *pos;
+ return rc;
+}
+
+static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct tcp_iter_state *st = seq->private;
+ void *rc = NULL;
+
+ if (v == SEQ_START_TOKEN) {
+ rc = tcp_get_idx(seq, 0);
+ goto out;
+ }
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ rc = listening_get_next(seq, v);
+ if (!rc) {
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = established_get_first(seq);
+ }
+ break;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ rc = established_get_next(seq, v);
+ break;
+ }
+out:
+ ++*pos;
+ st->last_pos = *pos;
+ return rc;
+}
+
+static void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state *st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ if (v) {
+ struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+ spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ }
+ case TCP_SEQ_STATE_LISTENING:
+ if (v != SEQ_START_TOKEN)
+ spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ break;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (v)
+ spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ break;
+ }
+}
+
+int tcp_seq_open(struct inode *inode, struct file *file)
+{
+ struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
+ struct tcp_iter_state *s;
+ int err;
+
+ err = seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct tcp_iter_state));
+ if (err < 0)
+ return err;
+
+ s = ((struct seq_file *)file->private_data)->private;
+ s->family = afinfo->family;
+ s->last_pos = 0;
+ return 0;
+}
+EXPORT_SYMBOL(tcp_seq_open);
+
+int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+ int rc = 0;
+ struct proc_dir_entry *p;
+
+ afinfo->seq_ops.start = tcp_seq_start;
+ afinfo->seq_ops.next = tcp_seq_next;
+ afinfo->seq_ops.stop = tcp_seq_stop;
+
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ rc = -ENOMEM;
+ return rc;
+}
+EXPORT_SYMBOL(tcp_proc_register);
+
+void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(tcp_proc_unregister);
+
+static void get_openreq4(const struct request_sock *req,
+ struct seq_file *f, int i, kuid_t uid)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ long delta = req->rsk_timer.expires - jiffies;
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
+ i,
+ ireq->ir_loc_addr,
+ ireq->ir_num,
+ ireq->ir_rmt_addr,
+ ntohs(ireq->ir_rmt_port),
+ TCP_SYN_RECV,
+ 0, 0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ jiffies_delta_to_clock_t(delta),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(f), uid),
+ 0, /* non standard timer */
+ 0, /* open_requests have no inode */
+ 0,
+ req);
+}
+
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
+{
+ int timer_active;
+ unsigned long timer_expires;
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+ int rx_queue;
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ timer_active = 1;
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = icsk->icsk_timeout;
+ } else if (timer_pending(&sk->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sk->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = jiffies;
+ }
+
+ if (sk->sk_state == TCP_LISTEN)
+ rx_queue = sk->sk_ack_backlog;
+ else
+ /*
+ * because we dont lock socket, we might find a transient negative value
+ */
+ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+ "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
+ i, src, srcp, dest, destp, sk->sk_state,
+ tp->write_seq - tp->snd_una,
+ rx_queue,
+ timer_active,
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
+ icsk->icsk_retransmits,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+ icsk->icsk_probes_out,
+ sock_i_ino(sk),
+ atomic_read(&sk->sk_refcnt), sk,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+ tp->snd_cwnd,
+ sk->sk_state == TCP_LISTEN ?
+ (fastopenq ? fastopenq->max_qlen : 0) :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
+}
+
+static void get_timewait4_sock(const struct inet_timewait_sock *tw,
+ struct seq_file *f, int i)
+{
+ long delta = tw->tw_timer.expires - jiffies;
+ __be32 dest, src;
+ __u16 destp, srcp;
+
+ dest = tw->tw_daddr;
+ src = tw->tw_rcv_saddr;
+ destp = ntohs(tw->tw_dport);
+ srcp = ntohs(tw->tw_sport);
+
+ seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
+ i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ atomic_read(&tw->tw_refcnt), tw);
+}
+
+#define TMPSZ 150
+
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state *st;
+ struct sock *sk = v;
+
+ seq_setwidth(seq, TMPSZ - 1);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode");
+ goto out;
+ }
+ st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait4_sock(v, seq, st->num);
+ else
+ get_tcp4_sock(v, seq, st->num);
+ break;
+ case TCP_SEQ_STATE_OPENREQ:
+ get_openreq4(v, seq, st->num, st->uid);
+ break;
+ }
+out:
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct file_operations tcp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = tcp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+ .name = "tcp",
+ .family = AF_INET,
+ .seq_fops = &tcp_afinfo_seq_fops,
+ .seq_ops = {
+ .show = tcp4_seq_show,
+ },
+};
+
+static int __net_init tcp4_proc_init_net(struct net *net)
+{
+ return tcp_proc_register(net, &tcp4_seq_afinfo);
+}
+
+static void __net_exit tcp4_proc_exit_net(struct net *net)
+{
+ tcp_proc_unregister(net, &tcp4_seq_afinfo);
+}
+
+static struct pernet_operations tcp4_net_ops = {
+ .init = tcp4_proc_init_net,
+ .exit = tcp4_proc_exit_net,
+};
+
+int __init tcp4_proc_init(void)
+{
+ return register_pernet_subsys(&tcp4_net_ops);
+}
+
+void tcp4_proc_exit(void)
+{
+ unregister_pernet_subsys(&tcp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+struct proto tcp_prot = {
+ .name = "TCP",
+ .owner = THIS_MODULE,
+ .close = tcp_close,
+ .connect = tcp_v4_connect,
+ .disconnect = tcp_disconnect,
+ .accept = inet_csk_accept,
+ .ioctl = tcp_ioctl,
+ .init = tcp_v4_init_sock,
+ .destroy = tcp_v4_destroy_sock,
+ .shutdown = tcp_shutdown,
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
+ .backlog_rcv = tcp_v4_do_rcv,
+ .release_cb = tcp_release_cb,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = inet_csk_get_port,
+ .enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
+ .sockets_allocated = &tcp_sockets_allocated,
+ .orphan_count = &tcp_orphan_count,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
+ .sysctl_mem = sysctl_tcp_mem,
+ .sysctl_wmem = sysctl_tcp_wmem,
+ .sysctl_rmem = sysctl_tcp_rmem,
+ .max_header = MAX_TCP_HEADER,
+ .obj_size = sizeof(struct tcp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .twsk_prot = &tcp_timewait_sock_ops,
+ .rsk_prot = &tcp_request_sock_ops,
+ .h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_tcp_setsockopt,
+ .compat_getsockopt = compat_tcp_getsockopt,
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+ .init_cgroup = tcp_init_cgroup,
+ .destroy_cgroup = tcp_destroy_cgroup,
+ .proto_cgroup = tcp_proto_cgroup,
+#endif
+};
+EXPORT_SYMBOL(tcp_prot);
+
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+ free_percpu(net->ipv4.tcp_sk);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
+ int res, cpu;
+
+ net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+ if (!net->ipv4.tcp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, net);
+ if (res)
+ goto fail;
+ *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+ }
+ net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ return 0;
+
+fail:
+ tcp_sk_exit(net);
+
+ return res;
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static struct pernet_operations __net_initdata tcp_sk_ops = {
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
+};
+
+void __init tcp_v4_init(void)
+{
+ inet_hashinfo_init(&tcp_hashinfo);
+ if (register_pernet_subsys(&tcp_sk_ops))
+ panic("Failed to create the TCP control socket.\n");
+}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 000000000..1e70fa8fa
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,342 @@
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ * the excess network bandwidth as compared to the ``fair share`` of
+ * bandwidth as targeted by TCP.
+ *
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ * o We use newReno in most core CA handling. Only add some checking
+ * within cong_avoid.
+ * o Error correcting in remote HZ, therefore remote HZ will be keeped
+ * on checking and updating.
+ * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
+ * OWD have a similar meaning as RTT. Also correct the buggy formular.
+ * o Handle reaction for Early Congestion Indication (ECI) within
+ * pkts_acked, as mentioned within pseudo code.
+ * o OWD is handled in relative format, where local time stamp will in
+ * tcp_time_stamp format.
+ *
+ * Original Author:
+ * Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ * Available from:
+ * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * Original implementation for 2.4.19:
+ * http://www-ece.rice.edu/networks/TCP-LP/
+ *
+ * 2.6.x module Authors:
+ * Wong Hoi Sing, Edison <hswong3i@gmail.com>
+ * Hung Hing Lun, Mike <hlhung3i@gmail.com>
+ * SourceForge project page:
+ * http://tcp-lp-mod.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* resolution of owd */
+#define LP_RESOL 1000
+
+/**
+ * enum tcp_lp_state
+ * @LP_VALID_RHZ: is remote HZ valid?
+ * @LP_VALID_OWD: is OWD valid?
+ * @LP_WITHIN_THR: are we within threshold?
+ * @LP_WITHIN_INF: are we within inference?
+ *
+ * TCP-LP's state flags.
+ * We create this set of state flag mainly for debugging.
+ */
+enum tcp_lp_state {
+ LP_VALID_RHZ = (1 << 0),
+ LP_VALID_OWD = (1 << 1),
+ LP_WITHIN_THR = (1 << 3),
+ LP_WITHIN_INF = (1 << 4),
+};
+
+/**
+ * struct lp
+ * @flag: TCP-LP state flag
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: resrved max owd
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * We get the idea from original TCP-LP implementation where only left those we
+ * found are really useful.
+ */
+struct lp {
+ u32 flag;
+ u32 sowd;
+ u32 owd_min;
+ u32 owd_max;
+ u32 owd_max_rsv;
+ u32 remote_hz;
+ u32 remote_ref_time;
+ u32 local_ref_time;
+ u32 last_drop;
+ u32 inference;
+};
+
+/**
+ * tcp_lp_init
+ *
+ * Init all required variables.
+ * Clone the handling from Vegas module implementation.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+ struct lp *lp = inet_csk_ca(sk);
+
+ lp->flag = 0;
+ lp->sowd = 0;
+ lp->owd_min = 0xffffffff;
+ lp->owd_max = 0;
+ lp->owd_max_rsv = 0;
+ lp->remote_hz = 0;
+ lp->remote_ref_time = 0;
+ lp->local_ref_time = 0;
+ lp->last_drop = 0;
+ lp->inference = 0;
+}
+
+/**
+ * tcp_lp_cong_avoid
+ *
+ * Implementation of cong_avoid.
+ * Will only call newReno CA when away from inference.
+ * From TCP-LP's paper, this will be handled in additive increasement.
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct lp *lp = inet_csk_ca(sk);
+
+ if (!(lp->flag & LP_WITHIN_INF))
+ tcp_reno_cong_avoid(sk, ack, acked);
+}
+
+/**
+ * tcp_lp_remote_hz_estimator
+ *
+ * Estimate remote HZ.
+ * We keep on updating the estimated value, where original TCP-LP
+ * implementation only guest it for once and use forever.
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+ s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */
+ s64 m = 0;
+
+ /* not yet record reference time
+ * go away!! record it before come back!! */
+ if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+ goto out;
+
+ /* we can't calc remote HZ with no different!! */
+ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+ tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+ goto out;
+
+ m = HZ * (tp->rx_opt.rcv_tsval -
+ lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
+ lp->local_ref_time);
+ if (m < 0)
+ m = -m;
+
+ if (rhz > 0) {
+ m -= rhz >> 6; /* m is now error in remote HZ est */
+ rhz += m; /* 63/64 old + 1/64 new */
+ } else
+ rhz = m << 6;
+
+ out:
+ /* record time for successful remote HZ calc */
+ if ((rhz >> 6) > 0)
+ lp->flag |= LP_VALID_RHZ;
+ else
+ lp->flag &= ~LP_VALID_RHZ;
+
+ /* record reference time stamp */
+ lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+ lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+
+ return rhz >> 6;
+}
+
+/**
+ * tcp_lp_owd_calculator
+ *
+ * Calculate one way delay (in relative format).
+ * Original implement OWD as minus of remote time difference to local time
+ * difference directly. As this time difference just simply equal to RTT, when
+ * the network status is stable, remote RTT will equal to local RTT, and result
+ * OWD into zero.
+ * It seems to be a bug and so we fixed it.
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+ s64 owd = 0;
+
+ lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+
+ if (lp->flag & LP_VALID_RHZ) {
+ owd =
+ tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+ tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+ if (owd < 0)
+ owd = -owd;
+ }
+
+ if (owd > 0)
+ lp->flag |= LP_VALID_OWD;
+ else
+ lp->flag &= ~LP_VALID_OWD;
+
+ return owd;
+}
+
+/**
+ * tcp_lp_rtt_sample
+ *
+ * Implementation or rtt_sample.
+ * Will take the following action,
+ * 1. calc OWD,
+ * 2. record the min/max OWD,
+ * 3. calc smoothed OWD (SOWD).
+ * Most ideas come from the original TCP-LP implementation.
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
+{
+ struct lp *lp = inet_csk_ca(sk);
+ s64 mowd = tcp_lp_owd_calculator(sk);
+
+ /* sorry that we don't have valid data */
+ if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
+ return;
+
+ /* record the next min owd */
+ if (mowd < lp->owd_min)
+ lp->owd_min = mowd;
+
+ /* always forget the max of the max
+ * we just set owd_max as one below it */
+ if (mowd > lp->owd_max) {
+ if (mowd > lp->owd_max_rsv) {
+ if (lp->owd_max_rsv == 0)
+ lp->owd_max = mowd;
+ else
+ lp->owd_max = lp->owd_max_rsv;
+ lp->owd_max_rsv = mowd;
+ } else
+ lp->owd_max = mowd;
+ }
+
+ /* calc for smoothed owd */
+ if (lp->sowd != 0) {
+ mowd -= lp->sowd >> 3; /* m is now error in owd est */
+ lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */
+ } else
+ lp->sowd = mowd << 3; /* take the measured time be owd */
+}
+
+/**
+ * tcp_lp_pkts_acked
+ *
+ * Implementation of pkts_acked.
+ * Deal with active drop under Early Congestion Indication.
+ * Only drop to half and 1 will be handle, because we hope to use back
+ * newReno in increase case.
+ * We work it out by following the idea from TCP-LP's paper directly
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct lp *lp = inet_csk_ca(sk);
+
+ if (rtt_us > 0)
+ tcp_lp_rtt_sample(sk, rtt_us);
+
+ /* calc inference */
+ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
+ lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr);
+
+ /* test if within inference */
+ if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference))
+ lp->flag |= LP_WITHIN_INF;
+ else
+ lp->flag &= ~LP_WITHIN_INF;
+
+ /* test if within threshold */
+ if (lp->sowd >> 3 <
+ lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
+ lp->flag |= LP_WITHIN_THR;
+ else
+ lp->flag &= ~LP_WITHIN_THR;
+
+ pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+ tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+ lp->sowd >> 3);
+
+ if (lp->flag & LP_WITHIN_THR)
+ return;
+
+ /* FIXME: try to reset owd_min and owd_max here
+ * so decrease the chance the min/max is no longer suitable
+ * and will usually within threshold when whithin inference */
+ lp->owd_min = lp->sowd >> 3;
+ lp->owd_max = lp->sowd >> 2;
+ lp->owd_max_rsv = lp->sowd >> 2;
+
+ /* happened within inference
+ * drop snd_cwnd into 1 */
+ if (lp->flag & LP_WITHIN_INF)
+ tp->snd_cwnd = 1U;
+
+ /* happened after inference
+ * cut snd_cwnd into half */
+ else
+ tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+
+ /* record this drop time */
+ lp->last_drop = tcp_time_stamp;
+}
+
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
+ .init = tcp_lp_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_lp_cong_avoid,
+ .pkts_acked = tcp_lp_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "lp"
+};
+
+static int __init tcp_lp_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_lp);
+}
+
+static void __exit tcp_lp_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_lp);
+}
+
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
new file mode 100644
index 000000000..2379c1b4e
--- /dev/null
+++ b/net/ipv4/tcp_memcontrol.c
@@ -0,0 +1,233 @@
+#include <net/tcp.h>
+#include <net/tcp_memcontrol.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+
+int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+ /*
+ * The root cgroup does not use page_counters, but rather,
+ * rely on the data already collected by the network
+ * subsystem
+ */
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct page_counter *counter_parent = NULL;
+ struct cg_proto *cg_proto, *parent_cg;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return 0;
+
+ cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
+ cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
+ cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
+ cg_proto->memory_pressure = 0;
+ cg_proto->memcg = memcg;
+
+ parent_cg = tcp_prot.proto_cgroup(parent);
+ if (parent_cg)
+ counter_parent = &parent_cg->memory_allocated;
+
+ page_counter_init(&cg_proto->memory_allocated, counter_parent);
+ percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+void tcp_destroy_cgroup(struct mem_cgroup *memcg)
+{
+ struct cg_proto *cg_proto;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return;
+
+ percpu_counter_destroy(&cg_proto->sockets_allocated);
+
+ if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_dec(&memcg_socket_limit_enabled);
+
+}
+EXPORT_SYMBOL(tcp_destroy_cgroup);
+
+static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+ struct cg_proto *cg_proto;
+ int i;
+ int ret;
+
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return -EINVAL;
+
+ ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < 3; i++)
+ cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
+ sysctl_tcp_mem[i]);
+
+ if (nr_pages == PAGE_COUNTER_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value change, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
+
+ return 0;
+}
+
+enum {
+ RES_USAGE,
+ RES_LIMIT,
+ RES_MAX_USAGE,
+ RES_FAILCNT,
+};
+
+static DEFINE_MUTEX(tcp_limit_mutex);
+
+static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
+ int ret = 0;
+
+ buf = strstrip(buf);
+
+ switch (of_cft(of)->private) {
+ case RES_LIMIT:
+ /* see memcontrol.c */
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
+ if (ret)
+ break;
+ mutex_lock(&tcp_limit_mutex);
+ ret = tcp_update_limit(memcg, nr_pages);
+ mutex_unlock(&tcp_limit_mutex);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
+ u64 val;
+
+ switch (cft->private) {
+ case RES_LIMIT:
+ if (!cg_proto)
+ return PAGE_COUNTER_MAX;
+ val = cg_proto->memory_allocated.limit;
+ val *= PAGE_SIZE;
+ break;
+ case RES_USAGE:
+ if (!cg_proto)
+ val = atomic_long_read(&tcp_memory_allocated);
+ else
+ val = page_counter_read(&cg_proto->memory_allocated);
+ val *= PAGE_SIZE;
+ break;
+ case RES_FAILCNT:
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.failcnt;
+ break;
+ case RES_MAX_USAGE:
+ if (!cg_proto)
+ return 0;
+ val = cg_proto->memory_allocated.watermark;
+ val *= PAGE_SIZE;
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
+
+ memcg = mem_cgroup_from_css(of_css(of));
+ cg_proto = tcp_prot.proto_cgroup(memcg);
+ if (!cg_proto)
+ return nbytes;
+
+ switch (of_cft(of)->private) {
+ case RES_MAX_USAGE:
+ page_counter_reset_watermark(&cg_proto->memory_allocated);
+ break;
+ case RES_FAILCNT:
+ cg_proto->memory_allocated.failcnt = 0;
+ break;
+ }
+
+ return nbytes;
+}
+
+static struct cftype tcp_files[] = {
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .write = tcp_cgroup_write,
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_LIMIT,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .read_u64 = tcp_cgroup_read,
+ .private = RES_USAGE,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = RES_FAILCNT,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = RES_MAX_USAGE,
+ .write = tcp_cgroup_reset,
+ .read_u64 = tcp_cgroup_read,
+ },
+ { } /* terminate */
+};
+
+static int __init tcp_memcontrol_init(void)
+{
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
+ return 0;
+}
+__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
new file mode 100644
index 000000000..a51d63a43
--- /dev/null
+++ b/net/ipv4/tcp_metrics.c
@@ -0,0 +1,1198 @@
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/tcp.h>
+#include <linux/hash.h>
+#include <linux/tcp_metrics.h>
+#include <linux/vmalloc.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/genetlink.h>
+
+int sysctl_tcp_nometrics_save __read_mostly;
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash);
+
+struct tcp_fastopen_metrics {
+ u16 mss;
+ u16 syn_loss:10, /* Recurring Fast Open SYN losses */
+ try_exp:2; /* Request w/ exp. option (once) */
+ unsigned long last_syn_loss; /* Last Fast Open SYN loss */
+ struct tcp_fastopen_cookie cookie;
+};
+
+/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
+ * Kernel only stores RTT and RTTVAR in usec resolution
+ */
+#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
+
+struct tcp_metrics_block {
+ struct tcp_metrics_block __rcu *tcpm_next;
+ possible_net_t tcpm_net;
+ struct inetpeer_addr tcpm_saddr;
+ struct inetpeer_addr tcpm_daddr;
+ unsigned long tcpm_stamp;
+ u32 tcpm_ts;
+ u32 tcpm_ts_stamp;
+ u32 tcpm_lock;
+ u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
+ struct tcp_fastopen_metrics tcpm_fastopen;
+
+ struct rcu_head rcu_head;
+};
+
+static inline struct net *tm_net(struct tcp_metrics_block *tm)
+{
+ return read_pnet(&tm->tcpm_net);
+}
+
+static bool tcp_metric_locked(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_lock & (1 << idx);
+}
+
+static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx)
+{
+ return tm->tcpm_vals[idx];
+}
+
+static void tcp_metric_set(struct tcp_metrics_block *tm,
+ enum tcp_metric_index idx,
+ u32 val)
+{
+ tm->tcpm_vals[idx] = val;
+}
+
+static bool addr_same(const struct inetpeer_addr *a,
+ const struct inetpeer_addr *b)
+{
+ if (a->family != b->family)
+ return false;
+ if (a->family == AF_INET)
+ return a->addr.a4 == b->addr.a4;
+ return ipv6_addr_equal(&a->addr.in6, &b->addr.in6);
+}
+
+struct tcpm_hash_bucket {
+ struct tcp_metrics_block __rcu *chain;
+};
+
+static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
+static unsigned int tcp_metrics_hash_log __read_mostly;
+
+static DEFINE_SPINLOCK(tcp_metrics_lock);
+
+static void tcpm_suck_dst(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst,
+ bool fastopen_clear)
+{
+ u32 msval;
+ u32 val;
+
+ tm->tcpm_stamp = jiffies;
+
+ val = 0;
+ if (dst_metric_locked(dst, RTAX_RTT))
+ val |= 1 << TCP_METRIC_RTT;
+ if (dst_metric_locked(dst, RTAX_RTTVAR))
+ val |= 1 << TCP_METRIC_RTTVAR;
+ if (dst_metric_locked(dst, RTAX_SSTHRESH))
+ val |= 1 << TCP_METRIC_SSTHRESH;
+ if (dst_metric_locked(dst, RTAX_CWND))
+ val |= 1 << TCP_METRIC_CWND;
+ if (dst_metric_locked(dst, RTAX_REORDERING))
+ val |= 1 << TCP_METRIC_REORDERING;
+ tm->tcpm_lock = val;
+
+ msval = dst_metric_raw(dst, RTAX_RTT);
+ tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+
+ msval = dst_metric_raw(dst, RTAX_RTTVAR);
+ tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
+ tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
+ tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
+ tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tm->tcpm_ts = 0;
+ tm->tcpm_ts_stamp = 0;
+ if (fastopen_clear) {
+ tm->tcpm_fastopen.mss = 0;
+ tm->tcpm_fastopen.syn_loss = 0;
+ tm->tcpm_fastopen.try_exp = 0;
+ tm->tcpm_fastopen.cookie.exp = false;
+ tm->tcpm_fastopen.cookie.len = 0;
+ }
+}
+
+#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
+
+static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+{
+ if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ tcpm_suck_dst(tm, dst, false);
+}
+
+#define TCP_METRICS_RECLAIM_DEPTH 5
+#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+
+#define deref_locked(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
+static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
+ struct inetpeer_addr *saddr,
+ struct inetpeer_addr *daddr,
+ unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ struct net *net;
+ bool reclaim = false;
+
+ spin_lock_bh(&tcp_metrics_lock);
+ net = dev_net(dst->dev);
+
+ /* While waiting for the spin-lock the cache might have been populated
+ * with this entry and so we have to check again.
+ */
+ tm = __tcp_get_metrics(saddr, daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR) {
+ reclaim = true;
+ tm = NULL;
+ }
+ if (tm) {
+ tcpm_check_stamp(tm, dst);
+ goto out_unlock;
+ }
+
+ if (unlikely(reclaim)) {
+ struct tcp_metrics_block *oldest;
+
+ oldest = deref_locked(tcp_metrics_hash[hash].chain);
+ for (tm = deref_locked(oldest->tcpm_next); tm;
+ tm = deref_locked(tm->tcpm_next)) {
+ if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ oldest = tm;
+ }
+ tm = oldest;
+ } else {
+ tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ if (!tm)
+ goto out_unlock;
+ }
+ write_pnet(&tm->tcpm_net, net);
+ tm->tcpm_saddr = *saddr;
+ tm->tcpm_daddr = *daddr;
+
+ tcpm_suck_dst(tm, dst, true);
+
+ if (likely(!reclaim)) {
+ tm->tcpm_next = tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
+ }
+
+out_unlock:
+ spin_unlock_bh(&tcp_metrics_lock);
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
+{
+ if (tm)
+ return tm;
+ if (depth > TCP_METRICS_RECLAIM_DEPTH)
+ return TCP_METRICS_RECLAIM_PTR;
+ return NULL;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
+ const struct inetpeer_addr *daddr,
+ struct net *net, unsigned int hash)
+{
+ struct tcp_metrics_block *tm;
+ int depth = 0;
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, saddr) &&
+ addr_same(&tm->tcpm_daddr, daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ depth++;
+ }
+ return tcp_get_encode(tm, depth);
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ saddr.family = req->rsk_ops->family;
+ daddr.family = req->rsk_ops->family;
+ switch (daddr.family) {
+ case AF_INET:
+ saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
+ daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+ daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr;
+ hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
+ break;
+#endif
+ default:
+ return NULL;
+ }
+
+ net = dev_net(dst->dev);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ }
+ tcpm_check_stamp(tm, dst);
+ return tm;
+}
+
+static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (tw->tw_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (tw->tw_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = tw->tw_rcv_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = tw->tw_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ saddr.addr.in6 = tw->tw_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ daddr.addr.in6 = tw->tw_v6_daddr;
+ hash = ipv6_addr_hash(&tw->tw_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = twsk_net(tw);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_saddr, &saddr) &&
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
+ break;
+ }
+ return tm;
+}
+
+static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
+ struct dst_entry *dst,
+ bool create)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net;
+
+ if (sk->sk_family == AF_INET) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ saddr.family = AF_INET;
+ saddr.addr.a4 = inet_sk(sk)->inet_saddr;
+ daddr.family = AF_INET;
+ daddr.addr.a4 = inet_sk(sk)->inet_daddr;
+ hash = (__force unsigned int) daddr.addr.a4;
+ } else {
+ saddr.family = AF_INET6;
+ saddr.addr.in6 = sk->sk_v6_rcv_saddr;
+ daddr.family = AF_INET6;
+ daddr.addr.in6 = sk->sk_v6_daddr;
+ hash = ipv6_addr_hash(&sk->sk_v6_daddr);
+ }
+ }
+#endif
+ else
+ return NULL;
+
+ net = dev_net(dst->dev);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+
+ tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
+ if (tm == TCP_METRICS_RECLAIM_PTR)
+ tm = NULL;
+ if (!tm && create)
+ tm = tcpm_new(dst, &saddr, &daddr, hash);
+ else
+ tcpm_check_stamp(tm, dst);
+
+ return tm;
+}
+
+/* Save metrics learned by this TCP session. This function is called
+ * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
+ * or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ unsigned long rtt;
+ u32 val;
+ int m;
+
+ if (sysctl_tcp_nometrics_save || !dst)
+ return;
+
+ if (dst->flags & DST_HOST)
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ if (icsk->icsk_backoff || !tp->srtt_us) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time. Reset our
+ * results.
+ */
+ tm = tcp_get_metrics(sk, dst, false);
+ if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
+ tcp_metric_set(tm, TCP_METRIC_RTT, 0);
+ goto out_unlock;
+ } else
+ tm = tcp_get_metrics(sk, dst, true);
+
+ if (!tm)
+ goto out_unlock;
+
+ rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ m = rtt - tp->srtt_us;
+
+ /* If newly calculated rtt larger than stored one, store new
+ * one. Otherwise, use EWMA. Remember, rtt overestimation is
+ * always better than underestimation.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
+ if (m <= 0)
+ rtt = tp->srtt_us;
+ else
+ rtt -= (m >> 3);
+ tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
+ }
+
+ if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
+ unsigned long var;
+
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev_us)
+ m = tp->mdev_us;
+
+ var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
+ if (m >= var)
+ var = m;
+ else
+ var -= (var - m) >> 2;
+
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
+ }
+
+ if (tcp_in_initial_slowstart(tp)) {
+ /* Slow start still did not finish. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && (tp->snd_cwnd >> 1) > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_cwnd >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ if (tp->snd_cwnd > val)
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ tp->snd_cwnd);
+ }
+ } else if (tp->snd_cwnd > tp->snd_ssthresh &&
+ icsk->icsk_ca_state == TCP_CA_Open) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ }
+ } else {
+ /* Else slow start did not finish, cwnd is non-sense,
+ * ssthresh may be also invalid.
+ */
+ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
+ val = tcp_metric_get(tm, TCP_METRIC_CWND);
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ (val + tp->snd_ssthresh) >> 1);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val && tp->snd_ssthresh > val)
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ tp->snd_ssthresh);
+ }
+ if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val < tp->reordering &&
+ tp->reordering != sysctl_tcp_reordering)
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ tp->reordering);
+ }
+ }
+ tm->tcpm_stamp = jiffies;
+out_unlock:
+ rcu_read_unlock();
+}
+
+/* Initialize metrics on socket. */
+
+void tcp_init_metrics(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_metrics_block *tm;
+ u32 val, crtt = 0; /* cached RTT scaled by 8 */
+
+ if (!dst)
+ goto reset;
+
+ dst_confirm(dst);
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (!tm) {
+ rcu_read_unlock();
+ goto reset;
+ }
+
+ if (tcp_metric_locked(tm, TCP_METRIC_CWND))
+ tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
+
+ val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ if (val) {
+ tp->snd_ssthresh = val;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ } else {
+ /* ssthresh may have been reduced unnecessarily during.
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ }
+ val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
+ if (val && tp->reordering != val) {
+ tcp_disable_fack(tp);
+ tcp_disable_early_retrans(tp);
+ tp->reordering = val;
+ }
+
+ crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
+ rcu_read_unlock();
+reset:
+ /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
+ * to seed the RTO for later data packets because SYN packets are
+ * small. Use the per-dst cached values to seed the RTO but keep
+ * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
+ * Later the RTO will be updated immediately upon obtaining the first
+ * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
+ * influences the first RTO but not later RTT estimation.
+ *
+ * But if RTT is not available from the SYN (due to retransmits or
+ * syn cookies) or the cache, force a conservative 3secs timeout.
+ *
+ * A bit of theory. RTT is time passed after "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets force peer to delay ACKs and calculation is correct too.
+ * The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimate RTT. BUT! If peer tries to make some clever
+ * tricks sort of "quick acks" for time long enough to decrease RTT
+ * to low value, and then abruptly stops to do it and starts to delay
+ * ACKs, wait for troubles.
+ */
+ if (crtt > tp->srtt_us) {
+ /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
+ crtt /= 8 * USEC_PER_MSEC;
+ inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
+ } else if (tp->srtt_us == 0) {
+ /* RFC6298: 5.7 We've failed to get a valid RTT sample from
+ * 3WHS. This is most likely due to retransmission,
+ * including spurious one. Reset the RTO back to 3secs
+ * from the more aggressive 1sec to avoid more spurious
+ * retransmission.
+ */
+ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
+ tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
+
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
+ }
+ /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1)
+ tp->snd_cwnd = 1;
+ else
+ tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+ bool paws_check, bool timestamps)
+{
+ struct tcp_metrics_block *tm;
+ bool ret;
+
+ if (!dst)
+ return false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_req(req, dst);
+ if (paws_check) {
+ if (tm &&
+ (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
+ ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+ !timestamps))
+ ret = false;
+ else
+ ret = true;
+ } else {
+ if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
+ ret = true;
+ else
+ ret = false;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
+
+void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
+ tp->rx_opt.ts_recent = tm->tcpm_ts;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
+
+/* VJ's idea. Save last timestamp seen from this destination and hold
+ * it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter
+ * synchronized state.
+ */
+bool tcp_remember_stamp(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+ tm->tcpm_ts = tp->rx_opt.ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+ }
+ return ret;
+}
+
+bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+ struct tcp_metrics_block *tm;
+ bool ret = false;
+
+ rcu_read_lock();
+ tm = __tcp_get_metrics_tw(tw);
+ if (tm) {
+ const struct tcp_timewait_sock *tcptw;
+ struct sock *sk = (struct sock *) tw;
+
+ tcptw = tcp_twsk(sk);
+ if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
+ ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
+ tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+ tm->tcpm_ts = tcptw->tw_ts_recent;
+ }
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static DEFINE_SEQLOCK(fastopen_seqlock);
+
+void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie,
+ int *syn_loss, unsigned long *last_syn_loss)
+{
+ struct tcp_metrics_block *tm;
+
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ if (tfom->mss)
+ *mss = tfom->mss;
+ *cookie = tfom->cookie;
+ if (cookie->len <= 0 && tfom->try_exp == 1)
+ cookie->exp = true;
+ *syn_loss = tfom->syn_loss;
+ *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+ }
+ rcu_read_unlock();
+}
+
+void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost,
+ u16 try_exp)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_metrics_block *tm;
+
+ if (!dst)
+ return;
+ rcu_read_lock();
+ tm = tcp_get_metrics(sk, dst, true);
+ if (tm) {
+ struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
+
+ write_seqlock_bh(&fastopen_seqlock);
+ if (mss)
+ tfom->mss = mss;
+ if (cookie && cookie->len > 0)
+ tfom->cookie = *cookie;
+ else if (try_exp > tfom->try_exp &&
+ tfom->cookie.len <= 0 && !tfom->cookie.exp)
+ tfom->try_exp = try_exp;
+ if (syn_lost) {
+ ++tfom->syn_loss;
+ tfom->last_syn_loss = jiffies;
+ } else
+ tfom->syn_loss = 0;
+ write_sequnlock_bh(&fastopen_seqlock);
+ }
+ rcu_read_unlock();
+}
+
+static struct genl_family tcp_metrics_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = TCP_METRICS_GENL_NAME,
+ .version = TCP_METRICS_GENL_VERSION,
+ .maxattr = TCP_METRICS_ATTR_MAX,
+ .netnsok = true,
+};
+
+static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
+ [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr), },
+ /* Following attributes are not received for GET/DEL,
+ * we keep them for reference
+ */
+#if 0
+ [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
+ [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
+ [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
+ [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
+ [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
+ .len = TCP_FASTOPEN_COOKIE_MAX, },
+#endif
+};
+
+/* Add attributes, caller cancels its header on failure */
+static int tcp_metrics_fill_info(struct sk_buff *msg,
+ struct tcp_metrics_block *tm)
+{
+ struct nlattr *nest;
+ int i;
+
+ switch (tm->tcpm_daddr.family) {
+ case AF_INET:
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
+ tm->tcpm_daddr.addr.a4) < 0)
+ goto nla_put_failure;
+ if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
+ tm->tcpm_saddr.addr.a4) < 0)
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
+ &tm->tcpm_daddr.addr.in6) < 0)
+ goto nla_put_failure;
+ if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
+ &tm->tcpm_saddr.addr.in6) < 0)
+ goto nla_put_failure;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
+ jiffies - tm->tcpm_stamp) < 0)
+ goto nla_put_failure;
+ if (tm->tcpm_ts_stamp) {
+ if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
+ (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
+ goto nla_put_failure;
+ if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
+ tm->tcpm_ts) < 0)
+ goto nla_put_failure;
+ }
+
+ {
+ int n = 0;
+
+ nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
+ u32 val = tm->tcpm_vals[i];
+
+ if (!val)
+ continue;
+ if (i == TCP_METRIC_RTT) {
+ if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (i == TCP_METRIC_RTTVAR) {
+ if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
+ val) < 0)
+ goto nla_put_failure;
+ n++;
+ val = max(val / 1000, 1U);
+ }
+ if (nla_put_u32(msg, i + 1, val) < 0)
+ goto nla_put_failure;
+ n++;
+ }
+ if (n)
+ nla_nest_end(msg, nest);
+ else
+ nla_nest_cancel(msg, nest);
+ }
+
+ {
+ struct tcp_fastopen_metrics tfom_copy[1], *tfom;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&fastopen_seqlock);
+ tfom_copy[0] = tm->tcpm_fastopen;
+ } while (read_seqretry(&fastopen_seqlock, seq));
+
+ tfom = tfom_copy;
+ if (tfom->mss &&
+ nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
+ tfom->mss) < 0)
+ goto nla_put_failure;
+ if (tfom->syn_loss &&
+ (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
+ tfom->syn_loss) < 0 ||
+ nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
+ jiffies - tfom->last_syn_loss) < 0))
+ goto nla_put_failure;
+ if (tfom->cookie.len > 0 &&
+ nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
+ tfom->cookie.len, tfom->cookie.val) < 0)
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_dump_info(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct tcp_metrics_block *tm)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tcp_metrics_nl_family, NLM_F_MULTI,
+ TCP_METRICS_CMD_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (tcp_metrics_fill_info(skb, tm) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int tcp_metrics_nl_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ unsigned int row, s_row = cb->args[0];
+ int s_col = cb->args[1], col = s_col;
+
+ for (row = s_row; row < max_rows; row++, s_col = 0) {
+ struct tcp_metrics_block *tm;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
+
+ rcu_read_lock();
+ for (col = 0, tm = rcu_dereference(hb->chain); tm;
+ tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (!net_eq(tm_net(tm), net))
+ continue;
+ if (col < s_col)
+ continue;
+ if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = row;
+ cb->args[1] = col;
+ return skb->len;
+}
+
+static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional, int v4, int v6)
+{
+ struct nlattr *a;
+
+ a = info->attrs[v4];
+ if (a) {
+ addr->family = AF_INET;
+ addr->addr.a4 = nla_get_in_addr(a);
+ if (hash)
+ *hash = (__force unsigned int) addr->addr.a4;
+ return 0;
+ }
+ a = info->attrs[v6];
+ if (a) {
+ if (nla_len(a) != sizeof(struct in6_addr))
+ return -EINVAL;
+ addr->family = AF_INET6;
+ addr->addr.in6 = nla_get_in6_addr(a);
+ if (hash)
+ *hash = ipv6_addr_hash(&addr->addr.in6);
+ return 0;
+ }
+ return optional ? 1 : -EAFNOSUPPORT;
+}
+
+static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
+ unsigned int *hash, int optional)
+{
+ return __parse_nl_addr(info, addr, hash, optional,
+ TCP_METRICS_ATTR_ADDR_IPV4,
+ TCP_METRICS_ATTR_ADDR_IPV6);
+}
+
+static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
+{
+ return __parse_nl_addr(info, addr, NULL, 0,
+ TCP_METRICS_ATTR_SADDR_IPV4,
+ TCP_METRICS_ATTR_SADDR_IPV6);
+}
+
+static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcp_metrics_block *tm;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct sk_buff *msg;
+ struct net *net = genl_info_net(info);
+ void *reply;
+ int ret;
+ bool src = true;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 0);
+ if (ret < 0)
+ return ret;
+
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply)
+ goto nla_put_failure;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ ret = -ESRCH;
+ rcu_read_lock();
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
+ tm = rcu_dereference(tm->tcpm_next)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ ret = tcp_metrics_fill_info(msg, tm);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out_free;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ ret = -EMSGSIZE;
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static void tcp_metrics_flush_all(struct net *net)
+{
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash;
+ struct tcp_metrics_block *tm;
+ unsigned int row;
+
+ for (row = 0; row < max_rows; row++, hb++) {
+ struct tcp_metrics_block __rcu **pp;
+ spin_lock_bh(&tcp_metrics_lock);
+ pp = &hb->chain;
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ }
+}
+
+static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct tcpm_hash_bucket *hb;
+ struct tcp_metrics_block *tm;
+ struct tcp_metrics_block __rcu **pp;
+ struct inetpeer_addr saddr, daddr;
+ unsigned int hash;
+ struct net *net = genl_info_net(info);
+ int ret;
+ bool src = true, found = false;
+
+ ret = parse_nl_addr(info, &daddr, &hash, 1);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ tcp_metrics_flush_all(net);
+ return 0;
+ }
+ ret = parse_nl_saddr(info, &saddr);
+ if (ret < 0)
+ src = false;
+
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ hb = tcp_metrics_hash + hash;
+ pp = &hb->chain;
+ spin_lock_bh(&tcp_metrics_lock);
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (addr_same(&tm->tcpm_daddr, &daddr) &&
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ found = true;
+ } else {
+ pp = &tm->tcpm_next;
+ }
+ }
+ spin_unlock_bh(&tcp_metrics_lock);
+ if (!found)
+ return -ESRCH;
+ return 0;
+}
+
+static const struct genl_ops tcp_metrics_nl_ops[] = {
+ {
+ .cmd = TCP_METRICS_CMD_GET,
+ .doit = tcp_metrics_nl_cmd_get,
+ .dumpit = tcp_metrics_nl_dump,
+ .policy = tcp_metrics_nl_policy,
+ },
+ {
+ .cmd = TCP_METRICS_CMD_DEL,
+ .doit = tcp_metrics_nl_cmd_del,
+ .policy = tcp_metrics_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+static unsigned int tcpmhash_entries;
+static int __init set_tcpmhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &tcpmhash_entries);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("tcpmhash_entries=", set_tcpmhash_entries);
+
+static int __net_init tcp_net_metrics_init(struct net *net)
+{
+ size_t size;
+ unsigned int slots;
+
+ if (!net_eq(net, &init_net))
+ return 0;
+
+ slots = tcpmhash_entries;
+ if (!slots) {
+ if (totalram_pages >= 128 * 1024)
+ slots = 16 * 1024;
+ else
+ slots = 8 * 1024;
+ }
+
+ tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
+
+ tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!tcp_metrics_hash)
+ tcp_metrics_hash = vzalloc(size);
+
+ if (!tcp_metrics_hash)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit tcp_net_metrics_exit(struct net *net)
+{
+ tcp_metrics_flush_all(net);
+}
+
+static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
+ .init = tcp_net_metrics_init,
+ .exit = tcp_net_metrics_exit,
+};
+
+void __init tcp_metrics_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&tcp_net_metrics_ops);
+ if (ret < 0)
+ panic("Could not allocate the tcp_metrics hash table\n");
+
+ ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
+ tcp_metrics_nl_ops);
+ if (ret < 0)
+ panic("Could not register tcp_metrics generic netlink\n");
+}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 000000000..17e7339ee
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,830 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+int sysctl_tcp_syncookies __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_syncookies);
+
+int sysctl_tcp_abort_on_overflow __read_mostly;
+
+struct inet_timewait_death_row tcp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .hashinfo = &tcp_hashinfo,
+};
+EXPORT_SYMBOL_GPL(tcp_death_row);
+
+static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return true;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return true;
+ return seq == e_win && seq == end_seq;
+}
+
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+ const struct sk_buff *skb, int mib_idx)
+{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+ &tcptw->tw_last_oow_ack_time)) {
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK;
+ }
+
+ /* We are rate-limiting, so just release the tw sock and drop skb. */
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
+/*
+ * * Main purpose of TIME-WAIT state is to close connection gracefully,
+ * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
+ * (and, probably, tail of data) and one or more our ACKs are lost.
+ * * What is TIME-WAIT timeout? It is associated with maximal packet
+ * lifetime in the internet, which results in wrong conclusion, that
+ * it is set to catch "old duplicate segments" wandering out of their path.
+ * It is not quite correct. This timeout is calculated so that it exceeds
+ * maximal retransmission timeout enough to allow to lose one (or more)
+ * segments sent by peer and our ACKs. This time may be calculated from RTO.
+ * * When TIME-WAIT socket receives RST, it means that another end
+ * finally closed and we are allowed to kill TIME-WAIT too.
+ * * Second purpose of TIME-WAIT is catching old duplicate segments.
+ * Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) TW bucket
+ * is _not_ stateless. It means, that strictly speaking we must
+ * spinlock it. I do not want! Well, probability of misbehaviour
+ * is ridiculously low and, seems, we could use some mb() tricks
+ * to avoid misread sequence numbers, states etc. --ANK
+ *
+ * We don't need to initialize tmp_out.sack_ok as we don't use the results
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ const struct tcphdr *th)
+{
+ struct tcp_options_received tmp_opt;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ bool paws_reject = false;
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
+ tmp_opt.ts_recent = tcptw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
+ }
+
+ if (tw->tw_substate == TCP_FIN_WAIT2) {
+ /* Just repeat all the checks of tcp_rcv_state_process() */
+
+ /* Out of window, send ACK */
+ if (paws_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
+
+ if (th->rst)
+ goto kill;
+
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+ goto kill_with_rst;
+
+ /* Dup ACK? */
+ if (!th->ack ||
+ !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* New data or FIN. If new data arrive after half-duplex close,
+ * reset.
+ */
+ if (!th->fin ||
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
+kill_with_rst:
+ inet_twsk_deschedule(tw);
+ inet_twsk_put(tw);
+ return TCP_TW_RST;
+ }
+
+ /* FIN arrived, enter true time-wait state. */
+ tw->tw_substate = TCP_TIME_WAIT;
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ if (tmp_opt.saw_tstamp) {
+ tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ }
+
+ if (tcp_death_row.sysctl_tw_recycle &&
+ tcptw->tw_ts_recent_stamp &&
+ tcp_tw_remember_stamp(tw))
+ inet_twsk_schedule(tw, tw->tw_timeout);
+ else
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+ return TCP_TW_ACK;
+ }
+
+ /*
+ * Now real TIME-WAIT state.
+ *
+ * RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+
+ if (!paws_reject &&
+ (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+ /* In window segment, it may be only reset or bare ack. */
+
+ if (th->rst) {
+ /* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if (sysctl_tcp_rfc1337 == 0) {
+kill:
+ inet_twsk_deschedule(tw);
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+ }
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ if (tmp_opt.saw_tstamp) {
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = get_seconds();
+ }
+
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* Out of window segment.
+
+ All the segments are ACKed immediately.
+
+ The only exception is new SYN. We accept it, if it is
+ not old duplicate and we are not in danger to be killed
+ by delayed old duplicates. RFC check is that it has
+ newer sequence number works at rates <40Mbit/sec.
+ However, if paws works, it is reliable AND even more,
+ we even may relax silly seq space cutoff.
+
+ RED-PEN: we violate main RFC requirement, if this SYN will appear
+ old duplicate (i.e. we receive RST in reply to SYN-ACK),
+ we must return socket to time-wait state. It is not good,
+ but not fatal yet.
+ */
+
+ if (th->syn && !th->rst && !th->ack && !paws_reject &&
+ (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp &&
+ (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
+ if (isn == 0)
+ isn++;
+ TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+ return TCP_TW_SYN;
+ }
+
+ if (paws_reject)
+ NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+
+ if (!th->rst) {
+ /* In this case we must reset the TIMEWAIT timer.
+ *
+ * If it is ACKless SYN it may be both old duplicate
+ * and new good SYN with random sequence number <rcv_nxt.
+ * Do not reschedule in the last case.
+ */
+ if (paws_reject || th->ack)
+ inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
+
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
+ }
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+EXPORT_SYMBOL(tcp_timewait_state_process);
+
+/*
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ */
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw;
+ bool recycle_ok = false;
+
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ recycle_ok = tcp_remember_stamp(sk);
+
+ tw = inet_twsk_alloc(sk, &tcp_death_row, state);
+
+ if (tw) {
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ tw->tw_transparent = inet->transparent;
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ tcptw->tw_rcv_nxt = tp->rcv_nxt;
+ tcptw->tw_snd_nxt = tp->snd_nxt;
+ tcptw->tw_rcv_wnd = tcp_receive_window(tp);
+ tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ tcptw->tw_ts_offset = tp->tsoffset;
+ tcptw->tw_last_oow_ack_time = 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tw->tw_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ tw->tw_tclass = np->tclass;
+ tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
+ tw->tw_ipv6only = sk->sk_ipv6only;
+ }
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+ /*
+ * The timewait bucket does not have the key DB from the
+ * sock structure. We just make a quick copy of the
+ * md5 key being used (if indeed we are using one)
+ * so the timewait ack generating code has the key.
+ */
+ do {
+ struct tcp_md5sig_key *key;
+ tcptw->tw_md5_key = NULL;
+ key = tp->af_specific->md5_lookup(sk, sk);
+ if (key) {
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (tcptw->tw_md5_key && !tcp_alloc_md5sig_pool())
+ BUG();
+ }
+ } while (0);
+#endif
+
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+
+ /* Get the TIME_WAIT timeout firing. */
+ if (timeo < rto)
+ timeo = rto;
+
+ if (recycle_ok) {
+ tw->tw_timeout = rto;
+ } else {
+ tw->tw_timeout = TCP_TIMEWAIT_LEN;
+ if (state == TCP_TIME_WAIT)
+ timeo = TCP_TIMEWAIT_LEN;
+ }
+
+ inet_twsk_schedule(tw, timeo);
+ inet_twsk_put(tw);
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ }
+
+ tcp_update_metrics(sk);
+ tcp_done(sk);
+}
+
+void tcp_twsk_destructor(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+
+ if (twsk->tw_md5_key)
+ kfree_rcu(twsk->tw_md5_key, rcu);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+
+void tcp_openreq_init_rwin(struct request_sock *req,
+ struct sock *sk, struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+ int mss = dst_metric_advmss(dst);
+
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ /* Set this up on the first call only */
+ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ &req->rcv_wnd,
+ &req->window_clamp,
+ ireq->wscale_ok,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+ ireq->rcv_wscale = rcv_wscale;
+}
+EXPORT_SYMBOL(tcp_openreq_init_rwin);
+
+static void tcp_ecn_openreq_child(struct tcp_sock *tp,
+ const struct request_sock *req)
+{
+ tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+}
+
+void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+ bool ca_got_dst = false;
+
+ if (ca_key != TCP_CA_UNSPEC) {
+ const struct tcp_congestion_ops *ca;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ ca_got_dst = true;
+ }
+ rcu_read_unlock();
+ }
+
+ /* If no valid choice made yet, assign current system default ca. */
+ if (!ca_got_dst &&
+ (!icsk->icsk_ca_setsockopt ||
+ !try_module_get(icsk->icsk_ca_ops->owner)))
+ tcp_assign_congestion_control(sk);
+
+ tcp_set_ca_state(sk, TCP_CA_Open);
+}
+EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could lots of memory writes here. tp of listening
+ * socket contains all necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+{
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+
+ if (newsk) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+ struct tcp_sock *newtp = tcp_sk(newsk);
+
+ /* Now setup tcp_sock */
+ newtp->pred_flags = 0;
+
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+
+ tcp_prequeue_init(newtp);
+ INIT_LIST_HEAD(&newtp->tsq_node);
+
+ tcp_init_wl(newtp, treq->rcv_isn);
+
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+ newtp->packets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->sacked_out = 0;
+ newtp->fackets_out = 0;
+ newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_enable_early_retrans(newtp);
+ newtp->tlp_high_seq = 0;
+ newtp->lsndtime = treq->snt_synack;
+ newtp->last_oow_ack_time = 0;
+ newtp->total_retrans = req->num_retrans;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = TCP_INIT_CWND;
+ newtp->snd_cwnd_cnt = 0;
+
+ tcp_init_xmit_timers(newsk);
+ __skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;
+
+ newtp->rx_opt.dsack = 0;
+ newtp->rx_opt.num_sacks = 0;
+
+ newtp->urg_data = 0;
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
+
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
+ if (sysctl_tcp_fack)
+ tcp_enable_fack(newtp);
+ }
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_ssthresh = req->rcv_wnd;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
+ newtp->rx_opt.snd_wscale);
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = get_seconds();
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->tsoffset = 0;
+#ifdef CONFIG_TCP_MD5SIG
+ newtp->md5sig_info = NULL; /*XXX*/
+ if (newtp->af_specific->md5_lookup(sk, newsk))
+ newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ tcp_ecn_openreq_child(newtp, req);
+ newtp->fastopen_rsk = NULL;
+ newtp->syn_data_acked = 0;
+
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ }
+ return newsk;
+}
+EXPORT_SYMBOL(tcp_create_openreq_child);
+
+/*
+ * Process an incoming packet for SYN_RECV sockets represented as a
+ * request_sock. Normally sk is the listener socket but for TFO it
+ * points to the child socket.
+ *
+ * XXX (TFO) - The current impl contains a special check for ack
+ * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
+ *
+ * We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ bool fastopen)
+{
+ struct tcp_options_received tmp_opt;
+ struct sock *child;
+ const struct tcphdr *th = tcp_hdr(skb);
+ __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool paws_reject = false;
+
+ BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
+
+ tmp_opt.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+ if (tmp_opt.saw_tstamp) {
+ tmp_opt.ts_recent = req->ts_recent;
+ /* We do not store true stamp, but it is not required,
+ * it can be estimated (approximately)
+ * from another data.
+ */
+ tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+ }
+ }
+
+ /* Check for pure retransmitted SYN. */
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
+ flg == TCP_FLAG_SYN &&
+ !paws_reject) {
+ /*
+ * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+ * this case on figure 6 and figure 8, but formal
+ * protocol description says NOTHING.
+ * To be more exact, it says that we should send ACK,
+ * because this segment (at least, if it has no data)
+ * is out of window.
+ *
+ * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+ * describe SYN-RECV state. All the description
+ * is wrong, we cannot believe to it and should
+ * rely only on common sense and implementation
+ * experience.
+ *
+ * Enforce "SYN-ACK" according to figure 8, figure 6
+ * of RFC793, fixed by RFC1122.
+ *
+ * Note that even if there is new data in the SYN packet
+ * they will be thrown away too.
+ *
+ * Reset timer after retransmitting SYNACK, similar to
+ * the idea of fast retransmit in recovery.
+ */
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time) &&
+
+ !inet_rtx_syn_ack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer, expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
+ return NULL;
+ }
+
+ /* Further reproduces section "SEGMENT ARRIVES"
+ for state SYN-RECEIVED of RFC793.
+ It is broken, however, it does not work only
+ when SYNs are crossed.
+
+ You would think that SYN crossing is impossible here, since
+ we should have a SYN_SENT socket (from connect()) on our end,
+ but this is not true if the crossed SYNs were sent to both
+ ends by a malicious third party. We must defend against this,
+ and to do that we first verify the ACK (as per RFC793, page
+ 36) and reset if it is invalid. Is this a true full defense?
+ To convince ourselves, let us consider a way in which the ACK
+ test can still pass in this 'malicious crossed SYNs' case.
+ Malicious sender sends identical SYNs (and thus identical sequence
+ numbers) to both A and B:
+
+ A: gets SYN, seq=7
+ B: gets SYN, seq=7
+
+ By our good fortune, both A and B select the same initial
+ send sequence number of seven :-)
+
+ A: sends SYN|ACK, seq=7, ack_seq=8
+ B: sends SYN|ACK, seq=7, ack_seq=8
+
+ So we are now A eating this SYN|ACK, ACK test passes. So
+ does sequence test, SYN is truncated, and thus we consider
+ it a bare ACK.
+
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
+
+ Note: This case is both harmless, and rare. Possibility is about the
+ same as us discovering intelligent life on another plant tomorrow.
+
+ But generally, we should (RFC lies!) to accept ACK
+ from SYNACK both here and in tcp_rcv_state_process().
+ tcp_rcv_state_process() does not, hence, we do not too.
+
+ Note that the case is absolutely generic:
+ we cannot optimize anything here without
+ violating protocol. All the checks must be made
+ before attempt to create socket.
+ */
+
+ /* RFC793 page 36: "If the connection is in any non-synchronized state ...
+ * and the incoming segment acknowledges something not yet
+ * sent (the segment carries an unacceptable ACK) ...
+ * a reset is sent."
+ *
+ * Invalid ACK: reset will be sent by listening socket.
+ * Note that the ACK validity check for a Fast Open socket is done
+ * elsewhere and is checked directly against the child socket rather
+ * than req because user data may have been sent out.
+ */
+ if ((flg & TCP_FLAG_ACK) && !fastopen &&
+ (TCP_SKB_CB(skb)->ack_seq !=
+ tcp_rsk(req)->snt_isn + 1))
+ return sk;
+
+ /* Also, it would be not so bad idea to check rcv_tsecr, which
+ * is essentially ACK extension and too early or too late values
+ * should cause reset in unsynchronized states.
+ */
+
+ /* RFC793: "first check sequence number". */
+
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ /* Out of window: send ACK and drop. */
+ if (!(flg & TCP_FLAG_RST))
+ req->rsk_ops->send_ack(sk, skb, req);
+ if (paws_reject)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ return NULL;
+ }
+
+ /* In sequence, PAWS is OK. */
+
+ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+ req->ts_recent = tmp_opt.rcv_tsval;
+
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
+ /* Truncate SYN, it is out of window starting
+ at tcp_rsk(req)->rcv_isn + 1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
+
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ goto embryonic_reset;
+ }
+
+ /* ACK sequence verified above, just make sure ACK is
+ * set. If ACK not set, just silently drop the packet.
+ *
+ * XXX (TFO) - if we ever allow "data after SYN", the
+ * following check needs to be removed.
+ */
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
+
+ /* For Fast Open no more processing is needed (sk is the
+ * child socket).
+ */
+ if (fastopen)
+ return sk;
+
+ /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ inet_rsk(req)->acked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+ return NULL;
+ }
+
+ /* OK, ACK is valid, create big socket and
+ * feed this segment to it. It will repeat all
+ * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+ * ESTABLISHED STATE. If it will be dropped after
+ * socket is created, wait for troubles.
+ */
+ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+ if (!child)
+ goto listen_overflow;
+
+ inet_csk_reqsk_queue_drop(sk, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ /* Warning: caller must not call reqsk_put(req);
+ * child stole last reference on it.
+ */
+ return child;
+
+listen_overflow:
+ if (!sysctl_tcp_abort_on_overflow) {
+ inet_rsk(req)->acked = 1;
+ return NULL;
+ }
+
+embryonic_reset:
+ if (!(flg & TCP_FLAG_RST)) {
+ /* Received a bad SYN pkt - for TFO We try not to reset
+ * the local connection unless it's really necessary to
+ * avoid becoming vulnerable to outside attack aiming at
+ * resetting legit local connections.
+ */
+ req->rsk_ops->send_reset(sk, skb);
+ } else if (fastopen) { /* received a valid RST pkt */
+ reqsk_fastopen_remove(sk, req, true);
+ tcp_reset(sk);
+ }
+ if (!fastopen) {
+ inet_csk_reqsk_queue_drop(sk, req);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_check_req);
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ *
+ * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
+ * when entering. But other states are possible due to a race condition
+ * where after __inet_lookup_established() fails but before the listener
+ * locked is obtained, other packets cause the same connection to
+ * be created.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ int state = child->sk_state;
+
+ if (!sock_owned_by_user(child)) {
+ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
+ skb->len);
+ /* Wakeup parent, send SIGIO */
+ if (state == TCP_SYN_RECV && child->sk_state != state)
+ parent->sk_data_ready(parent);
+ } else {
+ /* Alas, it is possible again, because we do lookup
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+ __sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ return ret;
+}
+EXPORT_SYMBOL(tcp_child_process);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
new file mode 100644
index 000000000..3f7c2fca5
--- /dev/null
+++ b/net/ipv4/tcp_offload.c
@@ -0,0 +1,327 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TCPv4 GSO/GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+
+static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+ unsigned int seq, unsigned int mss)
+{
+ while (skb) {
+ if (before(ts_seq, seq + mss)) {
+ skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tskey = ts_seq;
+ return;
+ }
+
+ skb = skb->next;
+ seq += mss;
+ }
+}
+
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up checksum pseudo header, usually expect stack to
+ * have done this already.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+
+struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int sum_truesize = 0;
+ struct tcphdr *th;
+ unsigned int thlen;
+ unsigned int seq;
+ __be32 delta;
+ unsigned int oldlen;
+ unsigned int mss;
+ struct sk_buff *gso_skb = skb;
+ __sum16 newcheck;
+ bool ooo_okay, copy_destructor;
+
+ th = tcp_hdr(skb);
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ if (!pskb_may_pull(skb, thlen))
+ goto out;
+
+ oldlen = (u16)~skb->len;
+ __skb_pull(skb, thlen);
+
+ mss = tcp_skb_mss(skb);
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ 0) ||
+ !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ copy_destructor = gso_skb->destructor == tcp_wfree;
+ ooo_okay = gso_skb->ooo_okay;
+ /* All segments but the first should have ooo_okay cleared */
+ skb->ooo_okay = 0;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs))
+ goto out;
+
+ /* Only first segment might have ooo_okay set */
+ segs->ooo_okay = ooo_okay;
+
+ delta = htonl(oldlen + (thlen + mss));
+
+ skb = segs;
+ th = tcp_hdr(skb);
+ seq = ntohl(th->seq);
+
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+ tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
+ newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+
+ do {
+ th->fin = th->psh = 0;
+ th->check = newcheck;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+
+ seq += mss;
+ if (copy_destructor) {
+ skb->destructor = gso_skb->destructor;
+ skb->sk = gso_skb->sk;
+ sum_truesize += skb->truesize;
+ }
+ skb = skb->next;
+ th = tcp_hdr(skb);
+
+ th->seq = htonl(seq);
+ th->cwr = 0;
+ } while (skb->next);
+
+ /* Following permits TCP Small Queues to work well with GSO :
+ * The callback to TCP stack will be called at the time last frag
+ * is freed at TX completion, and not right now when gso_skb
+ * is freed by GSO engine
+ */
+ if (copy_destructor) {
+ swap(gso_skb->sk, skb->sk);
+ swap(gso_skb->destructor, skb->destructor);
+ sum_truesize += skb->truesize;
+ atomic_add(sum_truesize - gso_skb->truesize,
+ &skb->sk->sk_wmem_alloc);
+ }
+
+ delta = htonl(oldlen + (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+ (__force u32)delta));
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ th->check = gso_make_checksum(skb, ~th->check);
+out:
+ return segs;
+}
+
+struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th;
+ struct tcphdr *th2;
+ unsigned int len;
+ unsigned int thlen;
+ __be32 flags;
+ unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
+ int flush = 1;
+ int i;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+ goto out;
+
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ for (; (p = *head); head = &p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ goto found;
+ }
+
+ goto out_check_final;
+
+found:
+ /* Include the IP ID check below from the inner most IP hdr */
+ flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+ for (i = sizeof(*th); i < thlen; i += 4)
+ flush |= *(u32 *)((u8 *)th + i) ^
+ *(u32 *)((u8 *)th2 + i);
+
+ mss = tcp_skb_mss(p);
+
+ flush |= (len - 1) >= mss;
+ flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+
+ if (flush || skb_gro_receive(head, skb)) {
+ mss = 1;
+ goto out_check_final;
+ }
+
+ p = *head;
+ th2 = tcp_hdr(p);
+ tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+ flush = len < mss;
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
+
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = head;
+
+out:
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+ return pp;
+}
+
+int tcp_gro_complete(struct sk_buff *skb)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)th - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (th->cwr)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ inet_gro_compute_pseudo)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ return tcp_gro_receive(head, skb);
+}
+
+static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+ iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+
+ return tcp_gro_complete(skb);
+}
+
+static const struct net_offload tcpv4_offload = {
+ .callbacks = {
+ .gso_segment = tcp4_gso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+};
+
+int __init tcpv4_offload_init(void)
+{
+ return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000..d80faa151
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,3525 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes: Pedro Roque : Retransmit queue handled by TCP.
+ * : Fragmentation on mtu decrease
+ * : Segment collapse on retransmit
+ * : AF independence
+ *
+ * Linus Torvalds : send_delayed_ack
+ * David S. Miller : Charge memory using the right skb
+ * during syn/ack processing.
+ * David S. Miller : Output engine completely rewritten.
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
+ * Cacophonix Gaul : draft-minshall-nagle-01
+ * J Hadi Salim : ECN support
+ *
+ */
+
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <net/tcp.h>
+
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse __read_mostly = 1;
+
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
+
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume. Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor __read_mostly = 3;
+
+/* By default, RFC2861 behavior. */
+int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp);
+
+/* Account for new data that has been sent to the network. */
+static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int prior_packets = tp->packets_out;
+
+ tcp_advance_send_head(sk, skb);
+ tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+
+ tp->packets_out += tcp_skb_pcount(skb);
+ if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ tcp_rearm_rto(sk);
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
+ tcp_skb_pcount(skb));
+}
+
+/* SND.NXT, if window was not shrunk.
+ * If window has been shrunk, what should we make? It is not clear at all.
+ * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
+ * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
+ * invalid. OK, let's make this for now:
+ */
+static inline __u32 tcp_acceptable_seq(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+ return tp->snd_nxt;
+ else
+ return tcp_wnd_end(tp);
+}
+
+/* Calculate mss to advertise in SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ * attached devices, because some buggy hosts are confused by
+ * large MSS.
+ * 4. We do not make 3, we advertise MSS, calculated from first
+ * hop device mtu, but allow to raise it to ip_rt_min_advmss.
+ * This may be overridden via information stored in routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ * probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ int mss = tp->advmss;
+
+ if (dst) {
+ unsigned int metric = dst_metric_advmss(dst);
+
+ if (metric < mss) {
+ mss = metric;
+ tp->advmss = mss;
+ }
+ }
+
+ return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
+ * This is the first part of cwnd validation mechanism. */
+static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = tcp_time_stamp - tp->lsndtime;
+ u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+ u32 cwnd = tp->snd_cwnd;
+
+ tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
+
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ restart_cwnd = min(restart_cwnd, cwnd);
+
+ while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
+ cwnd >>= 1;
+ tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->snd_cwnd_used = 0;
+}
+
+/* Congestion state accounting after a packet has been sent. */
+static void tcp_event_data_sent(struct tcp_sock *tp,
+ struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const u32 now = tcp_time_stamp;
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (sysctl_tcp_slow_start_after_idle &&
+ (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
+ tcp_cwnd_restart(sk, __sk_dst_get(sk));
+
+ tp->lsndtime = now;
+
+ /* If it is a reply for ato after last received
+ * packet, enter pingpong mode.
+ */
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
+ (!dst || !dst_metric(dst, RTAX_QUICKACK)))
+ icsk->icsk_ack.pingpong = 1;
+}
+
+/* Account for an ACK we sent. */
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+{
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+
+u32 tcp_default_init_rwnd(u32 mss)
+{
+ /* Initial receive window should be twice of TCP_INIT_CWND to
+ * enable proper sending of new unsent data during fast recovery
+ * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
+ * limit when mss is larger than 1460.
+ */
+ u32 init_rwnd = TCP_INIT_CWND * 2;
+
+ if (mss > 1460)
+ init_rwnd = max((1460 * init_rwnd) / mss, 2U);
+ return init_rwnd;
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+ __u32 *rcv_wnd, __u32 *window_clamp,
+ int wscale_ok, __u8 *rcv_wscale,
+ __u32 init_rcv_wnd)
+{
+ unsigned int space = (__space < 0 ? 0 : __space);
+
+ /* If no clamp set the clamp to the max possible scaled window */
+ if (*window_clamp == 0)
+ (*window_clamp) = (65535 << 14);
+ space = min(*window_clamp, space);
+
+ /* Quantize space offering to a multiple of mss if possible. */
+ if (space > mss)
+ space = (space / mss) * mss;
+
+ /* NOTE: offering an initial window larger than 32767
+ * will break some buggy TCP stacks. If the admin tells us
+ * it is likely we could be speaking with such a buggy stack
+ * we will truncate our initial window offering to 32K-1
+ * unless the remote has sent us a window scaling option,
+ * which we interpret as a sign the remote TCP is not
+ * misinterpreting the window field as a signed quantity.
+ */
+ if (sysctl_tcp_workaround_signed_windows)
+ (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+ else
+ (*rcv_wnd) = space;
+
+ (*rcv_wscale) = 0;
+ if (wscale_ok) {
+ /* Set window scaling on max possible window
+ * See RFC1323 for an explanation of the limit to 14
+ */
+ space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = min_t(u32, space, *window_clamp);
+ while (space > 65535 && (*rcv_wscale) < 14) {
+ space >>= 1;
+ (*rcv_wscale)++;
+ }
+ }
+
+ if (mss > (1 << *rcv_wscale)) {
+ if (!init_rcv_wnd) /* Use default unless specified otherwise */
+ init_rcv_wnd = tcp_default_init_rwnd(mss);
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+ }
+
+ /* Set the clamp no higher than max representable value */
+ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
+EXPORT_SYMBOL(tcp_select_initial_window);
+
+/* Chose a new window to advertise, update state in tcp_sock for the
+ * socket, and return result with RFC1323 scaling applied. The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static u16 tcp_select_window(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 old_win = tp->rcv_wnd;
+ u32 cur_win = tcp_receive_window(tp);
+ u32 new_win = __tcp_select_window(sk);
+
+ /* Never shrink the offered window */
+ if (new_win < cur_win) {
+ /* Danger Will Robinson!
+ * Don't update rcv_wup/rcv_wnd here or else
+ * we will not be able to advertise a zero
+ * window in time. --DaveM
+ *
+ * Relax Will Robinson.
+ */
+ if (new_win == 0)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
+ tp->rcv_wnd = new_win;
+ tp->rcv_wup = tp->rcv_nxt;
+
+ /* Make sure we do not exceed the maximum possible
+ * scaled window.
+ */
+ if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
+ new_win = min(new_win, MAX_TCP_WINDOW);
+ else
+ new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+ /* RFC1323 scaling applied */
+ new_win >>= tp->rx_opt.rcv_wscale;
+
+ /* If we advertise zero window, disable fast path. */
+ if (new_win == 0) {
+ tp->pred_flags = 0;
+ if (old_win)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTOZEROWINDOWADV);
+ } else if (old_win == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ }
+
+ return new_win;
+}
+
+/* Packet ECN state for a SYN-ACK */
+static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
+ if (!(tp->ecn_flags & TCP_ECN_OK))
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+}
+
+/* Packet ECN state for a SYN. */
+static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+ tcp_ca_needs_ecn(sk);
+
+ if (!use_ecn) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+ use_ecn = true;
+ }
+
+ tp->ecn_flags = 0;
+
+ if (use_ecn) {
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
+ tp->ecn_flags = TCP_ECN_OK;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
+}
+
+static void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
+ struct sock *sk)
+{
+ if (inet_rsk(req)->ecn_ok) {
+ th->ece = 1;
+ if (tcp_ca_needs_ecn(sk))
+ INET_ECN_xmit(sk);
+ }
+}
+
+/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
+ * be sent.
+ */
+static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
+ int tcp_header_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->ecn_flags & TCP_ECN_OK) {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ tcp_hdr(skb)->cwr = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ }
+ } else if (!tcp_ca_needs_ecn(sk)) {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
+ }
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ tcp_hdr(skb)->ece = 1;
+ }
+}
+
+/* Constructs common control bits of non-data skb. If SYN/FIN is present,
+ * auto increment end seqno.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum = 0;
+
+ TCP_SKB_CB(skb)->tcp_flags = flags;
+ TCP_SKB_CB(skb)->sacked = 0;
+
+ tcp_skb_pcount_set(skb, 1);
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
+
+ TCP_SKB_CB(skb)->seq = seq;
+ if (flags & (TCPHDR_SYN | TCPHDR_FIN))
+ seq++;
+ TCP_SKB_CB(skb)->end_seq = seq;
+}
+
+static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+{
+ return tp->snd_una != tp->snd_up;
+}
+
+#define OPTION_SACK_ADVERTISE (1 << 0)
+#define OPTION_TS (1 << 1)
+#define OPTION_MD5 (1 << 2)
+#define OPTION_WSCALE (1 << 3)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+
+struct tcp_out_options {
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
+ u8 ws; /* window scale, 0 to disable */
+ u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
+ __u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+};
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
+ * TCP options, we learned this through the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance but from
+ * inter-operability perspective it seems that we're somewhat stuck with
+ * the ordering which we have been using if we want to keep working with
+ * those broken things (not that it currently hurts anybody as there isn't
+ * particular reason why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to a disaster
+ * (but it may well be that other scenarios fail similarly).
+ */
+static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+ struct tcp_out_options *opts)
+{
+ u16 options = opts->options; /* mungable copy */
+
+ if (unlikely(OPTION_MD5 & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
+ ptr += 4;
+ }
+
+ if (unlikely(opts->mss)) {
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+ }
+
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+ *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ options &= ~OPTION_SACK_ADVERTISE;
+ } else {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ }
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
+ }
+
+ if (unlikely(OPTION_WSCALE & options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->ws);
+ }
+
+ if (unlikely(opts->num_sack_blocks)) {
+ struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+ tp->duplicate_sack : tp->selective_acks;
+ int this_sack;
+
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+ TCPOLEN_SACK_PERBLOCK)));
+
+ for (this_sack = 0; this_sack < opts->num_sack_blocks;
+ ++this_sack) {
+ *ptr++ = htonl(sp[this_sack].start_seq);
+ *ptr++ = htonl(sp[this_sack].end_seq);
+ }
+
+ tp->rx_opt.dsack = 0;
+ }
+
+ if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+ struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
+
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
+
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
+ }
+ ptr += (len + 3) >> 2;
+ }
+}
+
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ */
+static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_md5sig_key **md5)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+
+#ifdef CONFIG_TCP_MD5SIG
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
+#else
+ *md5 = NULL;
+#endif
+
+ /* We always get an MSS option. The option bytes which will be seen in
+ * normal data packets should timestamps be used, must be in the MSS
+ * advertised. But we subtract them from tp->mss_cache so that
+ * calculations in tcp_sendmsg are simpler etc. So account for this
+ * fact here if necessary. If we don't do this correctly, as a
+ * receiver we won't recognize data packets as being full sized when we
+ * should, and thus we won't abide by the delayed ACK rules correctly.
+ * SACKs don't matter, we never delay an ACK when we have any of those
+ * going out. */
+ opts->mss = tcp_advertise_mss(sk);
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(sysctl_tcp_timestamps && !*md5)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(sysctl_tcp_window_scaling)) {
+ opts->ws = tp->rx_opt.rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(sysctl_tcp_sack)) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!(OPTION_TS & opts->options)))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+
+ if (fastopen && fastopen->cookie.len >= 0) {
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = &fastopen->cookie;
+ remaining -= need;
+ tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
+ }
+ }
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Set up TCP options for SYN-ACKs. */
+static unsigned int tcp_synack_options(struct sock *sk,
+ struct request_sock *req,
+ unsigned int mss, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ const struct tcp_md5sig_key *md5,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ unsigned int remaining = MAX_TCP_OPTION_SPACE;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ ireq->tstamp_ok &= !ireq->sack_ok;
+ }
+#endif
+
+ /* We always send an MSS option. */
+ opts->mss = mss;
+ remaining -= TCPOLEN_MSS_ALIGNED;
+
+ if (likely(ireq->wscale_ok)) {
+ opts->ws = ireq->rcv_wscale;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
+ }
+ if (likely(ireq->tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = tcp_skb_timestamp(skb);
+ opts->tsecr = req->ts_recent;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
+ }
+ if (likely(ireq->sack_ok)) {
+ opts->options |= OPTION_SACK_ADVERTISE;
+ if (unlikely(!ireq->tstamp_ok))
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
+ }
+ if (foc != NULL && foc->len >= 0) {
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
+ need = (need + 3) & ~3U; /* Align to 32 bits */
+ if (remaining >= need) {
+ opts->options |= OPTION_FAST_OPEN_COOKIE;
+ opts->fastopen_cookie = foc;
+ remaining -= need;
+ }
+ }
+
+ return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ */
+static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_out_options *opts,
+ struct tcp_md5sig_key **md5)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int size = 0;
+ unsigned int eff_sacks;
+
+ opts->options = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (unlikely(*md5)) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
+#else
+ *md5 = NULL;
+#endif
+
+ if (likely(tp->rx_opt.tstamp_ok)) {
+ opts->options |= OPTION_TS;
+ opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+ opts->tsecr = tp->rx_opt.ts_recent;
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ }
+
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
+ const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ }
+
+ return size;
+}
+
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+ struct tasklet_struct tasklet;
+ struct list_head head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+ if ((1 << sk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
+ TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+ tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+ 0, GFP_ATOMIC);
+}
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+ struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct list_head *q, *n;
+ struct tcp_sock *tp;
+ struct sock *sk;
+
+ local_irq_save(flags);
+ list_splice_init(&tsq->head, &list);
+ local_irq_restore(flags);
+
+ list_for_each_safe(q, n, &list) {
+ tp = list_entry(q, struct tcp_sock, tsq_node);
+ list_del(&tp->tsq_node);
+
+ sk = (struct sock *)tp;
+ bh_lock_sock(sk);
+
+ if (!sock_owned_by_user(sk)) {
+ tcp_tsq_handler(sk);
+ } else {
+ /* defer the work to tcp_release_cb() */
+ set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+ }
+ bh_unlock_sock(sk);
+
+ clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+ sk_free(sk);
+ }
+}
+
+#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
+ (1UL << TCP_WRITE_TIMER_DEFERRED) | \
+ (1UL << TCP_DELACK_TIMER_DEFERRED) | \
+ (1UL << TCP_MTU_REDUCED_DEFERRED))
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long flags, nflags;
+
+ /* perform an atomic operation only if at least one flag is set */
+ do {
+ flags = tp->tsq_flags;
+ if (!(flags & TCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~TCP_DEFERRED_ALL;
+ } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+
+ if (flags & (1UL << TCP_TSQ_DEFERRED))
+ tcp_tsq_handler(sk);
+
+ /* Here begins the tricky part :
+ * We are called from release_sock() with :
+ * 1) BH disabled
+ * 2) sk_lock.slock spinlock held
+ * 3) socket owned by us (sk->sk_lock.owned == 1)
+ *
+ * But following code is meant to be called from BH handlers,
+ * so we should keep BH disabled, but early release socket ownership
+ */
+ sock_release_ownership(sk);
+
+ if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+ tcp_write_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+ tcp_delack_timer_handler(sk);
+ __sock_put(sk);
+ }
+ if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
+ __sock_put(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+ INIT_LIST_HEAD(&tsq->head);
+ tasklet_init(&tsq->tasklet,
+ tcp_tasklet_func,
+ (unsigned long)tsq);
+ }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int wmem;
+
+ /* Keep one reference on sk_wmem_alloc.
+ * Will be released by sk_free() from here or tcp_tasklet_func()
+ */
+ wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+ /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+ * Wait until our queues (qdisc + devices) are drained.
+ * This gives :
+ * - less callbacks to tcp_write_xmit(), reducing stress (batches)
+ * - chance for incoming ACK (processed by another cpu maybe)
+ * to migrate this flow (skb->ooo_okay will be eventually set)
+ */
+ if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ goto out;
+
+ if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+ !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+ unsigned long flags;
+ struct tsq_tasklet *tsq;
+
+ /* queue this socket to tasklet queue */
+ local_irq_save(flags);
+ tsq = this_cpu_ptr(&tsq_tasklet);
+ list_add(&tp->tsq_node, &tsq->head);
+ tasklet_schedule(&tsq->tasklet);
+ local_irq_restore(flags);
+ return;
+ }
+out:
+ sk_free(sk);
+}
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg(). This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+ struct tcp_sock *tp;
+ struct tcp_skb_cb *tcb;
+ struct tcp_out_options opts;
+ unsigned int tcp_options_size, tcp_header_size;
+ struct tcp_md5sig_key *md5;
+ struct tcphdr *th;
+ int err;
+
+ BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+ if (clone_it) {
+ skb_mstamp_get(&skb->skb_mstamp);
+
+ if (unlikely(skb_cloned(skb)))
+ skb = pskb_copy(skb, gfp_mask);
+ else
+ skb = skb_clone(skb, gfp_mask);
+ if (unlikely(!skb))
+ return -ENOBUFS;
+ }
+
+ inet = inet_sk(sk);
+ tp = tcp_sk(sk);
+ tcb = TCP_SKB_CB(skb);
+ memset(&opts, 0, sizeof(opts));
+
+#ifdef TCP_STEALTH
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN &&
+ tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+ skb->skb_mstamp = tp->stealth.mstamp;
+ }
+#endif
+
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+ else
+ tcp_options_size = tcp_established_options(sk, skb, &opts,
+ &md5);
+ tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
+
+ /* if no packet is in qdisc/device queue, then allow XPS to select
+ * another queue. We can be called from tcp_tsq_handler()
+ * which holds one reference to sk_wmem_alloc.
+ *
+ * TODO: Ideally, in-flight pure ACK packets should not matter here.
+ * One way to get this would be to set skb->truesize = 2 on them.
+ */
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+ skb_set_hash_from_sk(skb, sk);
+ atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+
+ /* Build TCP header and checksum it. */
+ th = tcp_hdr(skb);
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
+ th->seq = htonl(tcb->seq);
+ th->ack_seq = htonl(tp->rcv_nxt);
+ *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
+ tcb->tcp_flags);
+
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
+ } else {
+ th->window = htons(tcp_select_window(sk));
+ }
+ th->check = 0;
+ th->urg_ptr = 0;
+
+ /* The urg_mode check is necessary during a below snd_una win probe */
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = htons(0xFFFF);
+ th->urg = 1;
+ }
+ }
+
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
+ tcp_ecn_send(sk, skb, tcp_header_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Calculate the MD5 hash, as we have all we need now */
+ if (md5) {
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ tp->af_specific->calc_md5_hash(opts.hash_location,
+ md5, sk, skb);
+ }
+#endif
+
+ icsk->icsk_af_ops->send_check(sk, skb);
+
+ if (likely(tcb->tcp_flags & TCPHDR_ACK))
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+
+ if (skb->len != tcp_header_size)
+ tcp_event_data_sent(tp, sk);
+
+ if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+ tcp_skb_pcount(skb));
+
+ /* OK, its time to fill skb_shinfo(skb)->gso_segs */
+ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
+
+ /* Our usage of tstamp should remain private */
+ skb->tstamp.tv64 = 0;
+
+ /* Cleanup our debris for IP stacks */
+ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
+ sizeof(struct inet6_skb_parm)));
+
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+
+ if (likely(err <= 0))
+ return err;
+
+ tcp_enter_cwr(sk);
+
+ return net_xmit_eval(err);
+}
+
+/* This routine just queues the buffer for sending.
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Advance write_seq and place onto the write_queue. */
+ tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ __skb_header_release(skb);
+ tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+}
+
+/* Initialize TSO segments for a packet. */
+static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ /* Make sure we own this skb before messing gso_size/gso_segs */
+ WARN_ON_ONCE(skb_cloned(skb));
+
+ if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
+ /* Avoid the costly divide in the normal
+ * non-TSO case.
+ */
+ tcp_skb_pcount_set(skb, 1);
+ shinfo->gso_size = 0;
+ shinfo->gso_type = 0;
+ } else {
+ tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
+ shinfo->gso_size = mss_now;
+ shinfo->gso_type = sk->sk_gso_type;
+ }
+}
+
+/* When a modification to fackets out becomes necessary, we need to check
+ * skb is counted to fackets_out or not.
+ */
+static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
+ int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->sacked_out || tcp_is_reno(tp))
+ return;
+
+ if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
+ tp->fackets_out -= decr;
+}
+
+/* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->packets_out -= decr;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ tp->lost_out -= decr;
+
+ /* Reno case is special. Sigh... */
+ if (tcp_is_reno(tp) && decr > 0)
+ tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+ tcp_adjust_fackets_out(sk, skb, decr);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ tp->lost_cnt_hint -= decr;
+
+ tcp_verify_left_out(tp);
+}
+
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+ struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+ u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ shinfo->tx_flags &= ~tsflags;
+ shinfo2->tx_flags |= tsflags;
+ swap(shinfo->tskey, shinfo2->tskey);
+ }
+}
+
+/* Function to create two new TCP segments. Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list. This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ */
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+ unsigned int mss_now, gfp_t gfp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+ int nsize, old_factor;
+ int nlen;
+ u8 flags;
+
+ if (WARN_ON(len > skb->len))
+ return -EINVAL;
+
+ nsize = skb_headlen(skb) - len;
+ if (nsize < 0)
+ nsize = 0;
+
+ if (skb_unclone(skb, gfp))
+ return -ENOMEM;
+
+ /* Get a new skb... force flag on. */
+ buff = sk_stream_alloc_skb(sk, nsize, gfp);
+ if (!buff)
+ return -ENOMEM; /* We'll just try again later. */
+
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
+ nlen = skb->len - len - nsize;
+ buff->truesize += nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
+ TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Copy and checksum data tail into the new buffer. */
+ buff->csum = csum_partial_copy_nocheck(skb->data + len,
+ skb_put(buff, nsize),
+ nsize, 0);
+
+ skb_trim(skb, len);
+
+ skb->csum = csum_block_sub(skb->csum, buff->csum, len);
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_split(skb, buff, len);
+ }
+
+ buff->ip_summed = skb->ip_summed;
+
+ buff->tstamp = skb->tstamp;
+ tcp_fragment_tstamp(skb, buff);
+
+ old_factor = tcp_skb_pcount(skb);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+ /* If this packet has been sent out already, we must
+ * adjust the various packet counters.
+ */
+ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+ int diff = old_factor - tcp_skb_pcount(skb) -
+ tcp_skb_pcount(buff);
+
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
+ }
+
+ /* Link BUFF into the send queue. */
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk);
+
+ return 0;
+}
+
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that pulled data not copied, but
+ * immediately discarded.
+ */
+static void __pskb_trim_head(struct sk_buff *skb, int len)
+{
+ struct skb_shared_info *shinfo;
+ int i, k, eat;
+
+ eat = min_t(int, len, skb_headlen(skb));
+ if (eat) {
+ __skb_pull(skb, eat);
+ len -= eat;
+ if (!len)
+ return;
+ }
+ eat = len;
+ k = 0;
+ shinfo = skb_shinfo(skb);
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ int size = skb_frag_size(&shinfo->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
+ } else {
+ shinfo->frags[k] = shinfo->frags[i];
+ if (eat) {
+ shinfo->frags[k].page_offset += eat;
+ skb_frag_size_sub(&shinfo->frags[k], eat);
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ shinfo->nr_frags = k;
+
+ skb_reset_tail_pointer(skb);
+ skb->data_len -= len;
+ skb->len = skb->data_len;
+}
+
+/* Remove acked data from a packet in the transmit queue. */
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
+
+ __pskb_trim_head(skb, len);
+
+ TCP_SKB_CB(skb)->seq += len;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb->truesize -= len;
+ sk->sk_wmem_queued -= len;
+ sk_mem_uncharge(sk, len);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+
+ /* Any change of skb->len requires recalculation of tso factor. */
+ if (tcp_skb_pcount(skb) > 1)
+ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+
+ return 0;
+}
+
+/* Calculate MSS not accounting any TCP options. */
+static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->rx_opt.mss_clamp)
+ mss_now = tp->rx_opt.mss_clamp;
+
+ /* Now subtract optional transport overhead */
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+ return mss_now;
+}
+
+/* Calculate MSS. Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ /* Subtract TCP options size, not including SACKs */
+ return __tcp_mtu_to_mss(sk, pmtu) -
+ (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int mtu;
+
+ mtu = mss +
+ tp->tcp_header_len +
+ icsk->icsk_ext_hdr_len +
+ icsk->icsk_af_ops->net_header_len;
+
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
+ return mtu;
+}
+
+/* MTU probing init per socket */
+void tcp_mtup_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
+
+ icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
+ icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+}
+EXPORT_SYMBOL(tcp_mtup_init);
+
+/* This function synchronize snd mss to current pmtu/exthdr set.
+
+ tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
+ for TCP options, but includes only bare TCP header.
+
+ tp->rx_opt.mss_clamp is mss negotiated at connection setup.
+ It is minimum of user_mss and mss received with SYN.
+ It also does not include TCP options.
+
+ inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
+
+ tp->mss_cache is current effective sending mss, including
+ all tcp options except for SACKs. It is evaluated,
+ taking into account current pmtu, but never exceeds
+ tp->rx_opt.mss_clamp.
+
+ NOTE1. rfc1122 clearly states that advertised MSS
+ DOES NOT include either tcp or ip options.
+
+ NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ are READ ONLY outside this function. --ANK (980731)
+ */
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ if (icsk->icsk_mtup.search_high > pmtu)
+ icsk->icsk_mtup.search_high = pmtu;
+
+ mss_now = tcp_mtu_to_mss(sk, pmtu);
+ mss_now = tcp_bound_to_half_wnd(tp, mss_now);
+
+ /* And store cached results */
+ icsk->icsk_pmtu_cookie = pmtu;
+ if (icsk->icsk_mtup.enabled)
+ mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
+ tp->mss_cache = mss_now;
+
+ return mss_now;
+}
+EXPORT_SYMBOL(tcp_sync_mss);
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events into account.
+ */
+unsigned int tcp_current_mss(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ u32 mss_now;
+ unsigned int header_len;
+ struct tcp_out_options opts;
+ struct tcp_md5sig_key *md5;
+
+ mss_now = tp->mss_cache;
+
+ if (dst) {
+ u32 mtu = dst_mtu(dst);
+ if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
+ mss_now = tcp_sync_mss(sk, mtu);
+ }
+
+ header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+ sizeof(struct tcphdr);
+ /* The mss_cache is sized based on tp->tcp_header_len, which assumes
+ * some common options. If this is an odd packet (because we have SACK
+ * blocks etc) then our calculated header_len will be different, and
+ * we have to adjust mss_now correspondingly */
+ if (header_len != tp->tcp_header_len) {
+ int delta = (int) header_len - tp->tcp_header_len;
+ mss_now -= delta;
+ }
+
+ return mss_now;
+}
+
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+static void tcp_cwnd_application_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
+ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ /* Limited by application or receiver window. */
+ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+ u32 win_used = max(tp->snd_cwnd_used, init_win);
+ if (win_used < tp->snd_cwnd) {
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ }
+ tp->snd_cwnd_used = 0;
+ }
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Track the maximum number of outstanding packets in each
+ * window, and remember whether we were cwnd-limited then.
+ */
+ if (!before(tp->snd_una, tp->max_packets_seq) ||
+ tp->packets_out > tp->max_packets_out) {
+ tp->max_packets_out = tp->packets_out;
+ tp->max_packets_seq = tp->snd_nxt;
+ tp->is_cwnd_limited = is_cwnd_limited;
+ }
+
+ if (tcp_is_cwnd_limited(sk)) {
+ /* Network is feed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if (sysctl_tcp_slow_start_after_idle &&
+ (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+ tcp_cwnd_application_limited(sk);
+ }
+}
+
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml, tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ * tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ * skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+ const struct sk_buff *skb)
+{
+ if (skb->len < tcp_skb_pcount(skb) * mss_now)
+ tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+ int nonagle)
+{
+ return partial &&
+ ((nonagle & TCP_NAGLE_CORK) ||
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+ u32 bytes, segs;
+
+ bytes = min(sk->sk_pacing_rate >> 10,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+ return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+ const struct sk_buff *skb,
+ unsigned int mss_now,
+ unsigned int max_segs,
+ int nonagle)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 partial, needed, window, max_len;
+
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+ max_len = mss_now * max_segs;
+
+ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
+ return max_len;
+
+ needed = min(skb->len, window);
+
+ if (max_len <= needed)
+ return max_len;
+
+ partial = needed % mss_now;
+ /* If last segment is not a full MSS, check if Nagle rules allow us
+ * to include this last segment in this skb.
+ * Otherwise, we'll split the skb at last MSS boundary
+ */
+ if (tcp_nagle_check(partial != 0, tp, nonagle))
+ return needed - partial;
+
+ return needed;
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ u32 in_flight, cwnd, halfcwnd;
+
+ /* Don't be strict about the congestion window for the final FIN. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+ tcp_skb_pcount(skb) == 1)
+ return 1;
+
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tp->snd_cwnd;
+ if (in_flight >= cwnd)
+ return 0;
+
+ /* For better scheduling, ensure we have at least
+ * 2 GSO packets in flight.
+ */
+ halfcwnd = max(cwnd >> 1, 1U);
+ return min(halfcwnd, cwnd - in_flight);
+}
+
+/* Initialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
+ unsigned int mss_now)
+{
+ int tso_segs = tcp_skb_pcount(skb);
+
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_skb_pcount(skb);
+ }
+ return tso_segs;
+}
+
+
+/* Return true if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ /* Nagle rule does not apply to frames, which sit in the middle of the
+ * write_queue (they have no chances to get new data).
+ *
+ * This is implemented in the callers, where they modify the 'nonagle'
+ * argument based upon the location of SKB in the send queue.
+ */
+ if (nonagle & TCP_NAGLE_PUSH)
+ return true;
+
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ return true;
+
+ if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
+ return true;
+
+ return false;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned int cur_mss)
+{
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (skb->len > cur_mss)
+ end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+ return !after(end_seq, tcp_wnd_end(tp));
+}
+
+/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
+ * should be put on the wire right now. If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cwnd_quota;
+
+ tcp_init_tso_segs(sk, skb, cur_mss);
+
+ if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+ return 0;
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
+ cwnd_quota = 0;
+
+ return cwnd_quota;
+}
+
+/* Test if sending is allowed right now. */
+bool tcp_may_send_now(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = tcp_send_head(sk);
+
+ return skb &&
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
+ (tcp_skb_is_last(sk, skb) ?
+ tp->nonagle : TCP_NAGLE_PUSH));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list. It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation. In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+ unsigned int mss_now, gfp_t gfp)
+{
+ struct sk_buff *buff;
+ int nlen = skb->len - len;
+ u8 flags;
+
+ /* All of a TSO frame must be composed of paged data. */
+ if (skb->len != skb->data_len)
+ return tcp_fragment(sk, skb, len, mss_now, gfp);
+
+ buff = sk_stream_alloc_skb(sk, 0, gfp);
+ if (unlikely(!buff))
+ return -ENOMEM;
+
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
+ buff->truesize += nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+ TCP_SKB_CB(buff)->tcp_flags = flags;
+
+ /* This packet was never sent out yet, so no SACK bits. */
+ TCP_SKB_CB(buff)->sacked = 0;
+
+ buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_split(skb, buff, len);
+ tcp_fragment_tstamp(skb, buff);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+ /* Link BUFF into the send queue. */
+ __skb_header_release(buff);
+ tcp_insert_write_queue_after(skb, buff, sk);
+
+ return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do. View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+ bool *is_cwnd_limited, u32 max_segs)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
+ int win_divisor;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ goto send_now;
+
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
+ goto send_now;
+
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
+ goto send_now;
+
+ in_flight = tcp_packets_in_flight(tp);
+
+ BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
+
+ send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ /* From in_flight test above, we know that cwnd > in_flight. */
+ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+ limit = min(send_win, cong_win);
+
+ /* If a full-sized TSO skb can be sent, do it. */
+ if (limit >= max_segs * tp->mss_cache)
+ goto send_now;
+
+ /* Middle in queue won't get any more data, full sendable already? */
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+ goto send_now;
+
+ win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+ if (win_divisor) {
+ u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+ /* If at least some fraction of a window is available,
+ * just use it.
+ */
+ chunk /= win_divisor;
+ if (limit >= chunk)
+ goto send_now;
+ } else {
+ /* Different approach, try not to defer past a single
+ * ACK. Receiver should ACK every other full sized
+ * frame, so if we have space for more than 3 frames
+ * then send now.
+ */
+ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
+ goto send_now;
+ }
+
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
+
+ if (cong_win < send_win && cong_win < skb->len)
+ *is_cwnd_limited = true;
+
+ return true;
+
+send_now:
+ return false;
+}
+
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
+/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets. This discovers routing
+ * changes resulting in larger path MTUs.
+ *
+ * Returns 0 if we should wait to probe (no cwnd available),
+ * 1 if a probe was sent,
+ * -1 otherwise
+ */
+static int tcp_mtu_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
+ int len;
+ int probe_size;
+ int size_needed;
+ int copy;
+ int mss_now;
+ int interval;
+
+ /* Not currently probing/verifying,
+ * not in recovery,
+ * have enough cwnd, and
+ * not SACKing (the variable headers throw things off) */
+ if (!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+ return -1;
+
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
+ mss_now = tcp_current_mss(sk);
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
+ size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
+ return -1;
+ }
+
+ /* Have enough data in the send queue to probe? */
+ if (tp->write_seq - tp->snd_nxt < size_needed)
+ return -1;
+
+ if (tp->snd_wnd < size_needed)
+ return -1;
+ if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+ return 0;
+
+ /* Do we need to wait to drain cwnd? With none in flight, don't stall */
+ if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+ if (!tcp_packets_in_flight(tp))
+ return -1;
+ else
+ return 0;
+ }
+
+ /* We're allowed to probe. Build it now. */
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ if (!nskb)
+ return -1;
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
+
+ skb = tcp_send_head(sk);
+
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+ TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
+ TCP_SKB_CB(nskb)->sacked = 0;
+ nskb->csum = 0;
+ nskb->ip_summed = skb->ip_summed;
+
+ tcp_insert_write_queue_before(nskb, skb, sk);
+
+ len = 0;
+ tcp_for_write_queue_from_safe(skb, next, sk) {
+ copy = min_t(int, skb->len, probe_size - len);
+ if (nskb->ip_summed)
+ skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+ else
+ nskb->csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy),
+ copy, nskb->csum);
+
+ if (skb->len <= copy) {
+ /* We've eaten all the data from this skb.
+ * Throw it away. */
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ } else {
+ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
+ ~(TCPHDR_FIN|TCPHDR_PSH);
+ if (!skb_shinfo(skb)->nr_frags) {
+ skb_pull(skb, copy);
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->csum = csum_partial(skb->data,
+ skb->len, 0);
+ } else {
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ }
+ TCP_SKB_CB(skb)->seq += copy;
+ }
+
+ len += copy;
+
+ if (len >= probe_size)
+ break;
+ }
+ tcp_init_tso_segs(sk, nskb, nskb->len);
+
+ /* We're ready to send. If this fails, the probe will
+ * be resegmented into mss-sized pieces by tcp_write_xmit().
+ */
+ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+ /* Decrement cwnd here because we are sending
+ * effectively two packets. */
+ tp->snd_cwnd--;
+ tcp_event_new_data_sent(sk, nskb);
+
+ icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+ tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+ tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+ return 1;
+ }
+
+ return -1;
+}
+
+/* This routine writes packets to the network. It advances the
+ * send_head. This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames between
+ * snd_up-64k-mss .. snd_up cannot be large. However, taking into
+ * account rare use of URG, this is not a big flaw.
+ *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
+ * Returns true, if no segments are in flight and we have queued segments,
+ * but cannot send anything now because of SWS or another problem.
+ */
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int tso_segs, sent_pkts;
+ int cwnd_quota;
+ int result;
+ bool is_cwnd_limited = false;
+ u32 max_segs;
+
+ sent_pkts = 0;
+
+ if (!push_one) {
+ /* Do MTU probing. */
+ result = tcp_mtu_probe(sk);
+ if (!result) {
+ return false;
+ } else if (result > 0) {
+ sent_pkts = 1;
+ }
+ }
+
+ max_segs = tcp_tso_autosize(sk, mss_now);
+ while ((skb = tcp_send_head(sk))) {
+ unsigned int limit;
+
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ BUG_ON(!tso_segs);
+
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp" is used as a start point for the retransmit timer */
+ skb_mstamp_get(&skb->skb_mstamp);
+ goto repair; /* Skip network transmission */
+ }
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota) {
+ is_cwnd_limited = true;
+ if (push_one == 2)
+ /* Force out a loss probe pkt. */
+ cwnd_quota = 1;
+ else
+ break;
+ }
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ break;
+
+ if (tso_segs == 1 || !max_segs) {
+ if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+ (tcp_skb_is_last(sk, skb) ?
+ nonagle : TCP_NAGLE_PUSH))))
+ break;
+ } else {
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+ max_segs))
+ break;
+ }
+
+ limit = mss_now;
+ if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+ min_t(unsigned int,
+ cwnd_quota,
+ max_segs),
+ nonagle);
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+ break;
+
+ /* TCP Small Queues :
+ * Control number of packets in qdisc/devices to two packets / or ~1 ms.
+ * This allows for :
+ * - better RTT estimation and ACK scheduling
+ * - faster recovery
+ * - high rates
+ * Alas, some drivers / subsystems require a fair amount
+ * of queued bytes to ensure line rate.
+ * One example is wifi aggregation (802.11 AMPDU)
+ */
+ limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+ limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
+
+ if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+ set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED, so we must
+ * test again the condition.
+ */
+ smp_mb__after_atomic();
+ if (atomic_read(&sk->sk_wmem_alloc) > limit)
+ break;
+ }
+
+ if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+ break;
+
+repair:
+ /* Advance the send_head. This one is sent out.
+ * This call will increment packets_out.
+ */
+ tcp_event_new_data_sent(sk, skb);
+
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts += tcp_skb_pcount(skb);
+
+ if (push_one)
+ break;
+ }
+
+ if (likely(sent_pkts)) {
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += sent_pkts;
+
+ /* Send one loss probe per tail loss episode. */
+ if (push_one != 2)
+ tcp_schedule_loss_probe(sk);
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+ return false;
+ }
+ return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout, tlp_time_stamp, rto_time_stamp;
+ u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+ return false;
+ /* No consecutive loss probes. */
+ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+ tcp_rearm_rto(sk);
+ return false;
+ }
+ /* Don't do any loss probe on a Fast Open connection before 3WHS
+ * finishes.
+ */
+ if (sk->sk_state == TCP_SYN_RECV)
+ return false;
+
+ /* TLP is only scheduled when next timer event is RTO. */
+ if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+ return false;
+
+ /* Schedule a loss probe in 2*RTT for SACK capable connections
+ * in Open state, that are either limited by cwnd or application.
+ */
+ if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
+ !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+ return false;
+
+ if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+ tcp_send_head(sk))
+ return false;
+
+ /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+ * for delayed ack when there's one outstanding packet.
+ */
+ timeout = rtt << 1;
+ if (tp->packets_out == 1)
+ timeout = max_t(u32, timeout,
+ (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+ timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+ /* If RTO is shorter, just schedule TLP in its place. */
+ tlp_time_stamp = tcp_time_stamp + timeout;
+ rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+ if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+ s32 delta = rto_time_stamp - tcp_time_stamp;
+ if (delta > 0)
+ timeout = delta;
+ }
+
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX);
+ return true;
+}
+
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ if (unlikely(skb_fclone_busy(sk, skb))) {
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
+ return false;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int pcount;
+ int mss = tcp_current_mss(sk);
+ int err = -1;
+
+ if (tcp_send_head(sk)) {
+ err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ goto rearm_timer;
+ }
+
+ /* At most one outstanding TLP retransmission. */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ /* Retransmit last segment. */
+ skb = tcp_write_queue_tail(sk);
+ if (WARN_ON(!skb))
+ goto rearm_timer;
+
+ if (skb_still_in_host_queue(sk, skb))
+ goto rearm_timer;
+
+ pcount = tcp_skb_pcount(skb);
+ if (WARN_ON(!pcount))
+ goto rearm_timer;
+
+ if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+ if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+ GFP_ATOMIC)))
+ goto rearm_timer;
+ skb = tcp_write_queue_tail(sk);
+ }
+
+ if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+ goto rearm_timer;
+
+ err = __tcp_retransmit_skb(sk, skb);
+
+ /* Record snd_nxt for loss detection. */
+ if (likely(!err))
+ tp->tlp_high_seq = tp->snd_nxt;
+
+rearm_timer:
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+
+ if (likely(!err))
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBES);
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ int nonagle)
+{
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and
+ * all will be happy.
+ */
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return;
+
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_atomic(sk, GFP_ATOMIC)))
+ tcp_check_probe_timer(sk);
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+ struct sk_buff *skb = tcp_send_head(sk);
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+ tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RECV.NEXT + RCV.WIN fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ * If the free space is less than the 1/4 of the maximum
+ * space available and the free space is less than 1/2 mss,
+ * then set the window to 0.
+ * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ * Otherwise, just prevent the window from shrinking
+ * and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ /* MSS for the peer's data. Previous versions used mss_clamp
+ * here. I don't know if the value based on our guesses
+ * of peer's MSS is better for the performance. It's more correct
+ * but may be worse for the performance because of rcv_mss
+ * fluctuations. --SAW 1998/11/1
+ */
+ int mss = icsk->icsk_ack.rcv_mss;
+ int free_space = tcp_space(sk);
+ int allowed_space = tcp_full_space(sk);
+ int full_space = min_t(int, tp->window_clamp, allowed_space);
+ int window;
+
+ if (mss > full_space)
+ mss = full_space;
+
+ if (free_space < (full_space >> 1)) {
+ icsk->icsk_ack.quick = 0;
+
+ if (sk_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+ 4U * tp->advmss);
+
+ /* free_space might become our new window, make sure we don't
+ * increase it due to wscale.
+ */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ /* if free space is less than mss estimate, or is below 1/16th
+ * of the maximum allowed, try to move to zero-window, else
+ * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
+ * new incoming data is dropped due to memory limits.
+ * With large window, mss test triggers way too late in order
+ * to announce zero window in time before rmem limit kicks in.
+ */
+ if (free_space < (allowed_space >> 4) || free_space < mss)
+ return 0;
+ }
+
+ if (free_space > tp->rcv_ssthresh)
+ free_space = tp->rcv_ssthresh;
+
+ /* Don't do rounding if we are using window scaling, since the
+ * scaled window will not line up with the MSS boundary anyway.
+ */
+ window = tp->rcv_wnd;
+ if (tp->rx_opt.rcv_wscale) {
+ window = free_space;
+
+ /* Advertise enough space so that it won't get scaled away.
+ * Import case: prevent zero window announcement if
+ * 1<<rcv_wscale > mss.
+ */
+ if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+ << tp->rx_opt.rcv_wscale);
+ } else {
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ if (window <= free_space - mss || window > free_space)
+ window = (free_space / mss) * mss;
+ else if (mss == full_space &&
+ free_space > window + (full_space >> 1))
+ window = free_space;
+ }
+
+ return window;
+}
+
+/* Collapses two adjacent SKB's during retransmission. */
+static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+ int skb_size, next_skb_size;
+
+ skb_size = skb->len;
+ next_skb_size = next_skb->len;
+
+ BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+
+ tcp_highest_sack_combine(sk, next_skb, skb);
+
+ tcp_unlink_write_queue(next_skb, sk);
+
+ skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
+ next_skb_size);
+
+ if (next_skb->ip_summed == CHECKSUM_PARTIAL)
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+
+ /* Update sequence range on original skb. */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+
+ /* All done, get rid of second SKB and account for it so
+ * packet counting does not break.
+ */
+ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+
+ /* changed transmit queue under us so clear hints */
+ tcp_clear_retrans_hints_partial(tp);
+ if (next_skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = skb;
+
+ tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
+ sk_wmem_free_skb(sk, next_skb);
+}
+
+/* Check if coalescing SKBs is legal. */
+static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+{
+ if (tcp_skb_pcount(skb) > 1)
+ return false;
+ /* TODO: SACK collapsing could be used to remove this condition */
+ if (skb_shinfo(skb)->nr_frags != 0)
+ return false;
+ if (skb_cloned(skb))
+ return false;
+ if (skb == tcp_send_head(sk))
+ return false;
+ /* Some heurestics for collapsing over SACK'd could be invented */
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ return false;
+
+ return true;
+}
+
+/* Collapse packets in the retransmit queue to make to create
+ * less packets on the wire. This is only done on retransmission.
+ */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+ int space)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = to, *tmp;
+ bool first = true;
+
+ if (!sysctl_tcp_retrans_collapse)
+ return;
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ return;
+
+ tcp_for_write_queue_from_safe(skb, tmp, sk) {
+ if (!tcp_can_collapse(sk, skb))
+ break;
+
+ space -= skb->len;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space < 0)
+ break;
+ /* Punt if not enough space exists in the first SKB for
+ * the data in the second
+ */
+ if (skb->len > skb_availroom(to))
+ break;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+ break;
+
+ tcp_collapse_retrans(sk, to);
+ }
+}
+
+/* This retransmits one SKB. Policy decisions and retransmit queue
+ * state updates are done by the caller. Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned int cur_mss;
+ int err;
+
+ /* Inconslusive MTU probe */
+ if (icsk->icsk_mtup.probe_size) {
+ icsk->icsk_mtup.probe_size = 0;
+ }
+
+ /* Do not sent more than we queued. 1/4 is reserved for possible
+ * copying overhead: fragmentation, tunneling, mangling etc.
+ */
+ if (atomic_read(&sk->sk_wmem_alloc) >
+ min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+ return -EAGAIN;
+
+ if (skb_still_in_host_queue(sk, skb))
+ return -EBUSY;
+
+ if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ BUG();
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ return -ENOMEM;
+ }
+
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ cur_mss = tcp_current_mss(sk);
+
+ /* If receiver has shrunk his window, and skb is out of
+ * new window, do not retransmit it. The exception is the
+ * case, when window is shrunk to zero. In this case
+ * our retransmit serves as a zero window probe.
+ */
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+ TCP_SKB_CB(skb)->seq != tp->snd_una)
+ return -EAGAIN;
+
+ if (skb->len > cur_mss) {
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+ return -ENOMEM; /* We'll try again later. */
+ } else {
+ int oldpcount = tcp_skb_pcount(skb);
+
+ if (unlikely(oldpcount > 1)) {
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
+ tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
+ }
+ }
+
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+ /* Make a copy, if the first transmission SKB clone we made
+ * is still in somebody's hands, else make a clone.
+ */
+
+ /* make sure skb->data is aligned on arches that require it
+ * and check if ack-trimming & collapsing extended the headroom
+ * beyond what csum_start can cover.
+ */
+ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
+ skb_headroom(skb) >= 0xFFFF)) {
+ struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } else {
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
+
+ if (likely(!err)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+ /* Update global TCP statistics. */
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans++;
+ }
+ return err;
+}
+
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = __tcp_retransmit_skb(sk, skb);
+
+ if (err == 0) {
+#if FASTRETRANS_DEBUG > 0
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+ net_dbg_ratelimited("retrans_out leaked\n");
+ }
+#endif
+ if (!tp->retrans_out)
+ tp->lost_retrans_low = tp->snd_nxt;
+ TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+ tp->retrans_out += tcp_skb_pcount(skb);
+
+ /* Save stamp of the first retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp(skb);
+
+ /* snd_nxt is stored to detect loss of retransmitted segment,
+ * see tcp_input.c tcp_sacktag_write_queue().
+ */
+ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+ } else if (err != -EBUSY) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ }
+
+ if (tp->undo_retrans < 0)
+ tp->undo_retrans = 0;
+ tp->undo_retrans += tcp_skb_pcount(skb);
+ return err;
+}
+
+/* Check if we forward retransmits are possible in the current
+ * window/congestion state.
+ */
+static bool tcp_can_forward_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Forward retransmissions are possible only during Recovery. */
+ if (icsk->icsk_ca_state != TCP_CA_Recovery)
+ return false;
+
+ /* No forward retransmissions in Reno are possible. */
+ if (tcp_is_reno(tp))
+ return false;
+
+ /* Yeah, we have to make difficult choice between forward transmission
+ * and retransmission... Both ways have their merits...
+ *
+ * For now we do not retransmit anything, while we have some new
+ * segments to send. In the other cases, follow rule 3 for
+ * NextSeg() specified in RFC3517.
+ */
+
+ if (tcp_may_send_now(sk))
+ return false;
+
+ return true;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged. It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessarily retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ struct sk_buff *hole = NULL;
+ u32 last_lost;
+ int mib_idx;
+ int fwd_rexmitting = 0;
+
+ if (!tp->packets_out)
+ return;
+
+ if (!tp->lost_out)
+ tp->retransmit_high = tp->snd_una;
+
+ if (tp->retransmit_skb_hint) {
+ skb = tp->retransmit_skb_hint;
+ last_lost = TCP_SKB_CB(skb)->end_seq;
+ if (after(last_lost, tp->retransmit_high))
+ last_lost = tp->retransmit_high;
+ } else {
+ skb = tcp_write_queue_head(sk);
+ last_lost = tp->snd_una;
+ }
+
+ tcp_for_write_queue_from(skb, sk) {
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+ if (skb == tcp_send_head(sk))
+ break;
+ /* we could do better than to assign each time */
+ if (!hole)
+ tp->retransmit_skb_hint = skb;
+
+ /* Assume this retransmit will generate
+ * only one packet for congestion window
+ * calculation purposes. This works because
+ * tcp_retransmit_skb() will chop up the
+ * packet to be MSS sized and all the
+ * packet counting works out.
+ */
+ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ return;
+
+ if (fwd_rexmitting) {
+begin_fwd:
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+ break;
+ mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
+
+ } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+ tp->retransmit_high = last_lost;
+ if (!tcp_can_forward_retransmit(sk))
+ break;
+ /* Backtrack if necessary to non-L'ed skb */
+ if (hole) {
+ skb = hole;
+ hole = NULL;
+ }
+ fwd_rexmitting = 1;
+ goto begin_fwd;
+
+ } else if (!(sacked & TCPCB_LOST)) {
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ hole = skb;
+ continue;
+
+ } else {
+ last_lost = TCP_SKB_CB(skb)->end_seq;
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ mib_idx = LINUX_MIB_TCPFASTRETRANS;
+ else
+ mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+ }
+
+ if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+ continue;
+
+ if (tcp_retransmit_skb(sk, skb))
+ return;
+
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+ if (tcp_in_cwnd_reduction(sk))
+ tp->prr_out += tcp_skb_pcount(skb);
+
+ if (skb == tcp_write_queue_head(sk))
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
+ }
+}
+
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
+ */
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
+ tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
+ } else {
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
+ }
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
+ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+ tcp_init_nondata_skb(skb, tp->write_seq,
+ TCPHDR_ACK | TCPHDR_FIN);
+ tcp_queue_skb(sk, skb);
+ }
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by RFC 2525, section 2.17. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+{
+ struct sk_buff *skb;
+
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = alloc_skb(MAX_TCP_HEADER, priority);
+ if (!skb) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ TCPHDR_ACK | TCPHDR_RST);
+ /* Send it off. */
+ if (tcp_transmit_skb(sk, skb, 0, priority))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
+}
+
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ skb = tcp_write_queue_head(sk);
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_debug("%s: wrong queue state\n", __func__);
+ return -EFAULT;
+ }
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
+ if (skb_cloned(skb)) {
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+ tcp_unlink_write_queue(skb, sk);
+ __skb_header_release(nskb);
+ __tcp_add_write_queue_head(sk, nskb);
+ sk_wmem_free_skb(sk, skb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
+ skb = nskb;
+ }
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
+ tcp_ecn_send_synack(sk, skb);
+ }
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+}
+
+/**
+ * tcp_make_synack - Prepare a SYN-ACK.
+ * sk: listener socket
+ * dst: dst entry attached to the SYNACK
+ * req: request_sock pointer
+ *
+ * Allocate one skb and build a SYNACK packet.
+ * @dst is consumed : Caller should not use it again.
+ */
+struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_out_options opts;
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th;
+ struct sk_buff *skb;
+ struct tcp_md5sig_key *md5 = NULL;
+ int tcp_header_size;
+ int mss;
+
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+ if (unlikely(!skb)) {
+ dst_release(dst);
+ return NULL;
+ }
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+
+ skb_dst_set(skb, dst);
+
+ mss = dst_metric_advmss(dst);
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+ mss = tp->rx_opt.user_mss;
+
+ memset(&opts, 0, sizeof(opts));
+#ifdef CONFIG_SYN_COOKIES
+ if (unlikely(req->cookie_ts))
+ skb->skb_mstamp.stamp_jiffies = cookie_init_timestamp(req);
+ else
+#endif
+ skb_mstamp_get(&skb->skb_mstamp);
+
+#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
+ md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+ foc) + sizeof(*th);
+
+ skb_push(skb, tcp_header_size);
+ skb_reset_transport_header(skb);
+
+ th = tcp_hdr(skb);
+ memset(th, 0, sizeof(struct tcphdr));
+ th->syn = 1;
+ th->ack = 1;
+ tcp_ecn_make_synack(req, th, sk);
+ th->source = htons(ireq->ir_num);
+ th->dest = ireq->ir_rmt_port;
+ /* Setting of flags are superfluous here for callers (and ECE is
+ * not even correctly set)
+ */
+ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
+ TCPHDR_SYN | TCPHDR_ACK);
+
+ th->seq = htonl(TCP_SKB_CB(skb)->seq);
+ /* XXX data is queued and acked as is. No buffer/window check */
+ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+ th->window = htons(min(req->rcv_wnd, 65535U));
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
+ th->doff = (tcp_header_size >> 2);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Okay, we have all we need - do the md5 hash if needed */
+ if (md5)
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
+ md5, req_to_sk(req), skb);
+ rcu_read_unlock();
+#endif
+
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
+ return skb;
+}
+EXPORT_SYMBOL(tcp_make_synack);
+
+static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_congestion_ops *ca;
+ u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
+
+ if (ca_key == TCP_CA_UNSPEC)
+ return;
+
+ rcu_read_lock();
+ ca = tcp_ca_find_key(ca_key);
+ if (likely(ca && try_module_get(ca->owner))) {
+ module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
+ icsk->icsk_ca_ops = ca;
+ }
+ rcu_read_unlock();
+}
+
+/* Do all connect socket setups that can be done AF independent. */
+static void tcp_connect_init(struct sock *sk)
+{
+ const struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u8 rcv_wscale;
+
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (tp->af_specific->md5_lookup(sk, sk))
+ tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+ /* If user gave his TCP_MAXSEG, record it to clamp */
+ if (tp->rx_opt.user_mss)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ tp->max_window = 0;
+ tcp_mtup_init(sk);
+ tcp_sync_mss(sk, dst_mtu(dst));
+
+ tcp_ca_dst_init(sk, dst);
+
+ if (!tp->window_clamp)
+ tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ tp->advmss = dst_metric_advmss(dst);
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
+ tp->advmss = tp->rx_opt.user_mss;
+
+ tcp_initialize_rcv_mss(sk);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ tp->window_clamp = tcp_full_space(sk);
+
+ tcp_select_initial_window(tcp_full_space(sk),
+ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+
+ tp->rx_opt.rcv_wscale = rcv_wscale;
+ tp->rcv_ssthresh = tp->rcv_wnd;
+
+ sk->sk_err = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+ tp->snd_wnd = 0;
+ tcp_init_wl(tp, 0);
+ tp->snd_una = tp->write_seq;
+ tp->snd_sml = tp->write_seq;
+ tp->snd_up = tp->write_seq;
+ tp->snd_nxt = tp->write_seq;
+
+ if (likely(!tp->repair))
+ tp->rcv_nxt = 0;
+ else
+ tp->rcv_tstamp = tcp_time_stamp;
+ tp->rcv_wup = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
+
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
+ tcp_clear_retrans(tp);
+}
+
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ tcb->end_seq += skb->len;
+ __skb_header_release(skb);
+ __tcp_add_write_queue_tail(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
+ tp->write_seq = tcb->end_seq;
+ tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_fastopen_request *fo = tp->fastopen_req;
+ int syn_loss = 0, space, err = 0, copied;
+ unsigned long last_syn_loss = 0;
+ struct sk_buff *syn_data;
+
+ tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
+ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+ &syn_loss, &last_syn_loss);
+ /* Recurring FO SYN losses: revert to regular handshake temporarily */
+ if (syn_loss > 1 &&
+ time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+ fo->cookie.len = -1;
+ goto fallback;
+ }
+
+ if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+ fo->cookie.len = -1;
+ else if (fo->cookie.len <= 0)
+ goto fallback;
+
+ /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+ * user-MSS. Reserve maximum option space for middleboxes that add
+ * private TCP options. The cost is reduced data space in SYN :(
+ */
+ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+ tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ MAX_TCP_OPTION_SPACE;
+
+ space = min_t(size_t, space, fo->size);
+
+ /* limit to order-0 allocations */
+ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+ syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
+ if (!syn_data)
+ goto fallback;
+ syn_data->ip_summed = CHECKSUM_PARTIAL;
+ memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
+ copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
+
+ /* No more data pending in inet_wait_for_connect() */
+ if (space == fo->size)
+ fo->data = NULL;
+ fo->copied = space;
+
+ tcp_connect_queue_skb(sk, syn_data);
+
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+ syn->skb_mstamp = syn_data->skb_mstamp;
+
+ /* Now full SYN+DATA was cloned and sent (or not),
+ * remove the SYN from the original skb (syn_data)
+ * we keep in write queue in case of a retransmit, as we
+ * also have the SYN packet (with no data) in the same queue.
+ */
+ TCP_SKB_CB(syn_data)->seq++;
+ TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
+ if (!err) {
+ tp->syn_data = (fo->copied > 0);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
+ goto done;
+ }
+
+fallback:
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+done:
+ fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
+ return err;
+}
+
+/* Build a SYN and send it off. */
+int tcp_connect(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *buff;
+ int err;
+
+ tcp_connect_init(sk);
+
+ if (unlikely(tp->repair)) {
+ tcp_finish_connect(sk, NULL);
+ return 0;
+ }
+
+ buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+ if (unlikely(!buff))
+ return -ENOBUFS;
+
+ tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+#ifdef CONFIG_TCP_STEALTH
+ /* The timetamp was already made at the time the ISN was generated
+ * as we need to know its value in the stealth_tcp_sequence_number()
+ * function.
+ */
+ tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies;
+#else
+ tp->retrans_stamp = tcp_time_stamp;
+#endif
+ tcp_connect_queue_skb(sk, buff);
+ tcp_ecn_send_syn(sk, buff);
+
+ /* Send off SYN; include data in Fast Open. */
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ if (err == -ECONNREFUSED)
+ return err;
+
+ /* We change tp->snd_nxt after the tcp_transmit_skb() call
+ * in order to make this packet get counted in tcpOutSegs.
+ */
+ tp->snd_nxt = tp->write_seq;
+ tp->pushed_seq = tp->write_seq;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the SYN until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_connect);
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int ato = icsk->icsk_ack.ato;
+ unsigned long timeout;
+
+ tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
+
+ if (ato > TCP_DELACK_MIN) {
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int max_ato = HZ / 2;
+
+ if (icsk->icsk_ack.pingpong ||
+ (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+ max_ato = TCP_DELACK_MAX;
+
+ /* Slow path, intersegment interval is "high". */
+
+ /* If some rtt estimate is known, use it to bound delayed ack.
+ * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
+ * directly.
+ */
+ if (tp->srtt_us) {
+ int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
+ TCP_DELACK_MIN);
+
+ if (rtt < max_ato)
+ max_ato = rtt;
+ }
+
+ ato = min(ato, max_ato);
+ }
+
+ /* Stay within the limit we were given */
+ timeout = jiffies + ato;
+
+ /* Use new timeout only if there wasn't a older one earlier. */
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ */
+ if (icsk->icsk_ack.blocked ||
+ time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ tcp_send_ack(sk);
+ return;
+ }
+
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
+ }
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window. */
+void tcp_send_ack(struct sock *sk)
+{
+ struct sk_buff *buff;
+
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state == TCP_CLOSE)
+ return;
+
+ tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
+
+ /* We are not putting this on the write queue, so
+ * tcp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ if (!buff) {
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
+ return;
+ }
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(buff, MAX_TCP_HEADER);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+
+ /* We do not want pure acks influencing TCP Small Queues or fq/pacing
+ * too much.
+ * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
+ * We also avoid tcp_wfree() overhead (cache line miss accessing
+ * tp->tsq_flags) by using regular sock_wfree()
+ */
+ skb_set_tcp_pure_ack(buff);
+
+ /* Send it off, this clears delayed acks for us. */
+ skb_mstamp_get(&buff->skb_mstamp);
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+}
+EXPORT_SYMBOL_GPL(tcp_send_ack);
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we make while urgent mode?
+ * 4.4BSD forces sending single byte of data. We cannot send
+ * out of window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: to send TWO zero-length segments in urgent mode:
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ /* We don't queue it, tcp_transmit_skb() sets ownership. */
+ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ if (!skb)
+ return -1;
+
+ /* Reserve space for headers and set control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+ /* Use a previous sequence. This should cause the other
+ * end to send an ack. Don't queue or clone SKB, just
+ * send it.
+ */
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ skb_mstamp_get(&skb->skb_mstamp);
+ return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+}
+
+void tcp_send_window_probe(struct sock *sk)
+{
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
+ tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
+/* Initiate keepalive or window probe from timer. */
+int tcp_write_wakeup(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (sk->sk_state == TCP_CLOSE)
+ return -1;
+
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ int err;
+ unsigned int mss = tcp_current_mss(sk);
+ unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+ if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+ tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+ /* We are probing the opening of a window
+ * but the window size is != 0
+ * must have been a result SWS avoidance ( sender )
+ */
+ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+ skb->len > mss) {
+ seg_size = min(seg_size, mss);
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+ return -1;
+ } else if (!tcp_skb_pcount(skb))
+ tcp_set_skb_tso_segs(sk, skb, mss);
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (!err)
+ tcp_event_new_data_sent(sk, skb);
+ return err;
+ } else {
+ if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+ tcp_xmit_probe_skb(sk, 1);
+ return tcp_xmit_probe_skb(sk, 0);
+ }
+}
+
+/* A window probe timeout has occurred. If window is not closed send
+ * a partial packet else a zero probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long probe_max;
+ int err;
+
+ err = tcp_write_wakeup(sk);
+
+ if (tp->packets_out || !tcp_send_head(sk)) {
+ /* Cancel probe timer, if it is not required. */
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
+ return;
+ }
+
+ if (err <= 0) {
+ if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ icsk->icsk_backoff++;
+ icsk->icsk_probes_out++;
+ probe_max = TCP_RTO_MAX;
+ } else {
+ /* If packet was not sent due to local congestion,
+ * do not backoff and do not remember icsk_probes_out.
+ * Let local senders to fight for local resources.
+ *
+ * Use accumulated backoff yet.
+ */
+ if (!icsk->icsk_probes_out)
+ icsk->icsk_probes_out = 1;
+ probe_max = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ inet_csk_rto_backoff(icsk, probe_max),
+ TCP_RTO_MAX);
+}
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+ struct flowi fl;
+ int res;
+
+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ }
+ return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 000000000..ebf5ff575
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,300 @@
+/*
+ * tcpprobe - Observe the TCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <net/tcp.h>
+
+MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
+MODULE_DESCRIPTION("TCP cwnd snooper");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.1");
+
+static int port __read_mostly;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static unsigned int bufsize __read_mostly = 4096;
+MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
+module_param(bufsize, uint, 0);
+
+static unsigned int fwmark __read_mostly;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
+static int full __read_mostly;
+MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
+module_param(full, int, 0);
+
+static const char procname[] = "tcpprobe";
+
+struct tcp_log {
+ ktime_t tstamp;
+ union {
+ struct sockaddr raw;
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } src, dst;
+ u16 length;
+ u32 snd_nxt;
+ u32 snd_una;
+ u32 snd_wnd;
+ u32 rcv_wnd;
+ u32 snd_cwnd;
+ u32 ssthresh;
+ u32 srtt;
+};
+
+static struct {
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ ktime_t start;
+ u32 lastcwnd;
+
+ unsigned long head, tail;
+ struct tcp_log *log;
+} tcp_probe;
+
+static inline int tcp_probe_used(void)
+{
+ return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
+}
+
+static inline int tcp_probe_avail(void)
+{
+ return bufsize - tcp_probe_used() - 1;
+}
+
+#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
+ do { \
+ si4.sin_family = AF_INET; \
+ si4.sin_port = inet->inet_##mem##port; \
+ si4.sin_addr.s_addr = inet->inet_##mem##addr; \
+ } while (0) \
+
+/*
+ * Hook inserted to be called before each receive packet.
+ * Note: arguments must match tcp_rcv_established()!
+ */
+static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct tcphdr *th, unsigned int len)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+
+ /* Only update if port or skb mark matches */
+ if (((port == 0 && fwmark == 0) ||
+ ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port ||
+ (fwmark > 0 && skb->mark == fwmark)) &&
+ (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
+
+ spin_lock(&tcp_probe.lock);
+ /* If log fills, just silently drop */
+ if (tcp_probe_avail() > 1) {
+ struct tcp_log *p = tcp_probe.log + tcp_probe.head;
+
+ p->tstamp = ktime_get();
+ switch (sk->sk_family) {
+ case AF_INET:
+ tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
+ tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
+ break;
+ case AF_INET6:
+ memset(&p->src.v6, 0, sizeof(p->src.v6));
+ memset(&p->dst.v6, 0, sizeof(p->dst.v6));
+#if IS_ENABLED(CONFIG_IPV6)
+ p->src.v6.sin6_family = AF_INET6;
+ p->src.v6.sin6_port = inet->inet_sport;
+ p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
+
+ p->dst.v6.sin6_family = AF_INET6;
+ p->dst.v6.sin6_port = inet->inet_dport;
+ p->dst.v6.sin6_addr = sk->sk_v6_daddr;
+#endif
+ break;
+ default:
+ BUG();
+ }
+
+ p->length = skb->len;
+ p->snd_nxt = tp->snd_nxt;
+ p->snd_una = tp->snd_una;
+ p->snd_cwnd = tp->snd_cwnd;
+ p->snd_wnd = tp->snd_wnd;
+ p->rcv_wnd = tp->rcv_wnd;
+ p->ssthresh = tcp_current_ssthresh(sk);
+ p->srtt = tp->srtt_us >> 3;
+
+ tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
+ }
+ tcp_probe.lastcwnd = tp->snd_cwnd;
+ spin_unlock(&tcp_probe.lock);
+
+ wake_up(&tcp_probe.wait);
+ }
+
+ jprobe_return();
+}
+
+static struct jprobe tcp_jprobe = {
+ .kp = {
+ .symbol_name = "tcp_rcv_established",
+ },
+ .entry = jtcp_rcv_established,
+};
+
+static int tcpprobe_open(struct inode *inode, struct file *file)
+{
+ /* Reset (empty) log */
+ spin_lock_bh(&tcp_probe.lock);
+ tcp_probe.head = tcp_probe.tail = 0;
+ tcp_probe.start = ktime_get();
+ spin_unlock_bh(&tcp_probe.lock);
+
+ return 0;
+}
+
+static int tcpprobe_sprint(char *tbuf, int n)
+{
+ const struct tcp_log *p
+ = tcp_probe.log + tcp_probe.tail;
+ struct timespec tv
+ = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
+
+ return scnprintf(tbuf, n,
+ "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
+ (unsigned long)tv.tv_sec,
+ (unsigned long)tv.tv_nsec,
+ &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
+ p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
+}
+
+static ssize_t tcpprobe_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int error = 0;
+ size_t cnt = 0;
+
+ if (!buf)
+ return -EINVAL;
+
+ while (cnt < len) {
+ char tbuf[256];
+ int width;
+
+ /* Wait for data in buffer */
+ error = wait_event_interruptible(tcp_probe.wait,
+ tcp_probe_used() > 0);
+ if (error)
+ break;
+
+ spin_lock_bh(&tcp_probe.lock);
+ if (tcp_probe.head == tcp_probe.tail) {
+ /* multiple readers race? */
+ spin_unlock_bh(&tcp_probe.lock);
+ continue;
+ }
+
+ width = tcpprobe_sprint(tbuf, sizeof(tbuf));
+
+ if (cnt + width < len)
+ tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
+
+ spin_unlock_bh(&tcp_probe.lock);
+
+ /* if record greater than space available
+ return partial buffer (so far) */
+ if (cnt + width >= len)
+ break;
+
+ if (copy_to_user(buf + cnt, tbuf, width))
+ return -EFAULT;
+ cnt += width;
+ }
+
+ return cnt == 0 ? error : cnt;
+}
+
+static const struct file_operations tcpprobe_fops = {
+ .owner = THIS_MODULE,
+ .open = tcpprobe_open,
+ .read = tcpprobe_read,
+ .llseek = noop_llseek,
+};
+
+static __init int tcpprobe_init(void)
+{
+ int ret = -ENOMEM;
+
+ /* Warning: if the function signature of tcp_rcv_established,
+ * has been changed, you also have to change the signature of
+ * jtcp_rcv_established, otherwise you end up right here!
+ */
+ BUILD_BUG_ON(__same_type(tcp_rcv_established,
+ jtcp_rcv_established) == 0);
+
+ init_waitqueue_head(&tcp_probe.wait);
+ spin_lock_init(&tcp_probe.lock);
+
+ if (bufsize == 0)
+ return -EINVAL;
+
+ bufsize = roundup_pow_of_two(bufsize);
+ tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
+ if (!tcp_probe.log)
+ goto err0;
+
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
+ goto err0;
+
+ ret = register_jprobe(&tcp_jprobe);
+ if (ret)
+ goto err1;
+
+ pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+ port, fwmark, bufsize);
+ return 0;
+ err1:
+ remove_proc_entry(procname, init_net.proc_net);
+ err0:
+ kfree(tcp_probe.log);
+ return ret;
+}
+module_init(tcpprobe_init);
+
+static __exit void tcpprobe_exit(void)
+{
+ remove_proc_entry(procname, init_net.proc_net);
+ unregister_jprobe(&tcp_jprobe);
+ kfree(tcp_probe.log);
+}
+module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000..333bcb241
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,62 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www.deneholme.net/tom/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the aer:
+ * .01 and and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_MD_SCALE 3
+
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+ else
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ 1);
+}
+
+static u32 tcp_scalable_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
+ .ssthresh = tcp_scalable_ssthresh,
+ .cong_avoid = tcp_scalable_cong_avoid,
+
+ .owner = THIS_MODULE,
+ .name = "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+ return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000..8c65dc147
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,652 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
+ * Corey Minyard <wf-rch!minyard@relay.EU.net>
+ * Florian La Roche, <flla@stud.uni-sb.de>
+ * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ * Linus Torvalds, <torvalds@cs.helsinki.fi>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ * Matthew Dillon, <dillon@apollo.west.oic.com>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
+int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
+int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
+int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
+int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_thin_linear_timeouts __read_mostly;
+
+static void tcp_write_err(struct sock *sk)
+{
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/* Do not allow orphaned sockets to eat all our resources.
+ * This is direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
+ * Criteria is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ * limit.
+ * 2. If we have strong memory pressure.
+ */
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int shift = 0;
+
+ /* If peer does not open window for long time, or did not transmit
+ * anything for long time, penalize it. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+ shift++;
+
+ /* If some dubious ICMP arrived, penalize even more. */
+ if (sk->sk_err_soft)
+ shift++;
+
+ if (tcp_check_oom(sk, shift)) {
+ /* Catch exceptional cases, when connection requires reset.
+ * 1. Last segment was sent recently. */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+ /* 2. Window is closed. */
+ (!tp->snd_wnd && !tp->packets_out))
+ do_reset = true;
+ if (do_reset)
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate maximal number or retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+ int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+
+ /* We know from an ICMP that something is wrong. */
+ if (sk->sk_err_soft && !alive)
+ retries = 0;
+
+ /* However, if socket sent something recently, select some safe
+ * number of retries. 8 corresponds to >100 seconds with minimal
+ * RTO of 200msec. */
+ if (retries == 0 && alive)
+ retries = 8;
+ return retries;
+}
+
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ /* Black hole detection */
+ if (net->ipv4.sysctl_tcp_mtu_probing) {
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ } else {
+ struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss;
+
+ mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+ mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+ mss = max(mss, 68 - tp->tcp_header_len);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
+}
+
+/* This function calculates a "timeout" which is equivalent to the timeout of a
+ * TCP connection after "boundary" unsuccessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+ unsigned int boundary,
+ unsigned int timeout,
+ bool syn_set)
+{
+ unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+
+ if (!inet_csk(sk)->icsk_retransmits)
+ return false;
+
+ start_ts = tcp_sk(sk)->retrans_stamp;
+ if (unlikely(!start_ts))
+ start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+
+ if (likely(timeout == 0)) {
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ }
+ return (tcp_time_stamp - start_ts) >= timeout;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int retry_until;
+ bool do_reset, syn_set = false;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if (icsk->icsk_retransmits) {
+ dst_negative_advice(sk);
+ if (tp->syn_fastopen || tp->syn_data)
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (tp->syn_data)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = true;
+ } else {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ /* Black hole detection */
+ tcp_mtu_probing(icsk, sk);
+
+ dst_negative_advice(sk);
+ }
+
+ retry_until = sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+
+ retry_until = tcp_orphan_retries(sk, alive);
+ do_reset = alive ||
+ !retransmits_timed_out(sk, retry_until, 0, 0);
+
+ if (tcp_out_of_resources(sk, do_reset))
+ return 1;
+ }
+ }
+
+ if (retransmits_timed_out(sk, retry_until,
+ syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
+ /* Has it gone just too far? */
+ tcp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
+void tcp_delack_timer_handler(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_mem_reclaim_partial(sk);
+
+ if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+ goto out;
+ }
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+ struct sk_buff *skb;
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
+
+ while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+ sk_backlog_rcv(sk, skb);
+
+ tp->ucopy.memory = 0;
+ }
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ tcp_send_ack(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ }
+
+out:
+ if (sk_under_memory_pressure(sk))
+ sk_mem_reclaim(sk);
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_delack_timer_handler(sk);
+ } else {
+ inet_csk(sk)->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static void tcp_probe_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ int max_probes;
+ u32 start_ts;
+
+ if (tp->packets_out || !tcp_send_head(sk)) {
+ icsk->icsk_probes_out = 0;
+ return;
+ }
+
+ /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
+ * long as the receiver continues to respond probes. We support this by
+ * default and reset icsk_probes_out with incoming ACKs. But if the
+ * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
+ * kill the socket when the retry count and the time exceeds the
+ * corresponding system limit. We also implement similar policy when
+ * we use RTO to probe window in tcp_retransmit_timer().
+ */
+ start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+ if (!start_ts)
+ skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
+ else if (icsk->icsk_user_timeout &&
+ (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
+ goto abort;
+
+ max_probes = sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
+ const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+
+ max_probes = tcp_orphan_retries(sk, alive);
+ if (!alive && icsk->icsk_backoff >= max_probes)
+ goto abort;
+ if (tcp_out_of_resources(sk, true))
+ return;
+ }
+
+ if (icsk->icsk_probes_out > max_probes) {
+abort: tcp_write_err(sk);
+ } else {
+ /* Only send another probe if we didn't close things up. */
+ tcp_send_probe0(sk);
+ }
+}
+
+/*
+ * Timer for Fast Open socket to retransmit SYNACK. Note that the
+ * sk here is the child socket, not the parent (listener) socket.
+ */
+static void tcp_fastopen_synack_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int max_retries = icsk->icsk_syn_retries ? :
+ sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ struct request_sock *req;
+
+ req = tcp_sk(sk)->fastopen_rsk;
+ req->rsk_ops->syn_ack_timeout(req);
+
+ if (req->num_timeout >= max_retries) {
+ tcp_write_err(sk);
+ return;
+ }
+ /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
+ * returned from rtx_syn_ack() to make it more persistent like
+ * regular retransmit because if the child socket has been accepted
+ * it's not good to give up too easily.
+ */
+ inet_rtx_syn_ack(sk, req);
+ req->num_timeout++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+}
+
+/*
+ * The TCP retransmit timer.
+ */
+
+void tcp_retransmit_timer(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (tp->fastopen_rsk) {
+ WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+ sk->sk_state != TCP_FIN_WAIT1);
+ tcp_fastopen_synack_timer(sk);
+ /* Before we receive ACK to our SYN-ACK don't retransmit
+ * anything else (e.g., data or FIN segments).
+ */
+ return;
+ }
+ if (!tp->packets_out)
+ goto out;
+
+ WARN_ON(tcp_write_queue_empty(sk));
+
+ tp->tlp_high_seq = 0;
+
+ if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+ !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+ /* Receiver dastardly shrinks window. Our retransmits
+ * become zero probes, but we should not timeout this
+ * connection. If the socket is an orphan, time it out,
+ * we cannot allow such beasts to hang infinitely.
+ */
+ struct inet_sock *inet = inet_sk(sk);
+ if (sk->sk_family == AF_INET) {
+ net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &inet->inet_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+ &sk->sk_v6_daddr,
+ ntohs(inet->inet_dport),
+ inet->inet_num,
+ tp->snd_una, tp->snd_nxt);
+ }
+#endif
+ if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+ tcp_write_err(sk);
+ goto out;
+ }
+ tcp_enter_loss(sk);
+ tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+ __sk_dst_reset(sk);
+ goto out_reset_timer;
+ }
+
+ if (tcp_write_timeout(sk))
+ goto out;
+
+ if (icsk->icsk_retransmits == 0) {
+ int mib_idx;
+
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
+ } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
+ mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
+ } else {
+ mib_idx = LINUX_MIB_TCPTIMEOUTS;
+ }
+ NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ }
+
+ tcp_enter_loss(sk);
+
+ if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+ /* Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (!icsk->icsk_retransmits)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
+ goto out;
+ }
+
+ /* Increase the timeout each time we retransmit. Note that
+ * we do not increase the rtt estimate. rto is initialized
+ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
+ * that doubling rto each time is the least we can get away with.
+ * In KA9Q, Karn uses this for the first few times, and then
+ * goes to quadratic. netBSD doubles, but only goes up to *64,
+ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
+ * defined in the protocol as the maximum possible RTT. I guess
+ * we'll have to use something other than TCP to talk to the
+ * University of Mars.
+ *
+ * PAWS allows us longer timeouts and large windows, so once
+ * implemented ftp to mars will work nicely. We will have to fix
+ * the 120 second clamps though!
+ */
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
+
+out_reset_timer:
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+ * exponential backoff behaviour to avoid continue hammering
+ * linear-timeout retransmissions into a black hole
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+ tcp_stream_is_thin(tp) &&
+ icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+ __sk_dst_reset(sk);
+
+out:;
+}
+
+void tcp_write_timer_handler(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int event;
+
+ if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
+ goto out;
+
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+ goto out;
+ }
+
+ event = icsk->icsk_pending;
+
+ switch (event) {
+ case ICSK_TIME_EARLY_RETRANS:
+ tcp_resume_early_retransmit(sk);
+ break;
+ case ICSK_TIME_LOSS_PROBE:
+ tcp_send_loss_probe(sk);
+ break;
+ case ICSK_TIME_RETRANS:
+ icsk->icsk_pending = 0;
+ tcp_retransmit_timer(sk);
+ break;
+ case ICSK_TIME_PROBE0:
+ icsk->icsk_pending = 0;
+ tcp_probe_timer(sk);
+ break;
+ }
+
+out:
+ sk_mem_reclaim(sk);
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ tcp_write_timer_handler(sk);
+ } else {
+ /* deleguate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void tcp_syn_ack_timeout(const struct request_sock *req)
+{
+ struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
+
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
+
+ if (val && !sock_flag(sk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ else if (!val)
+ inet_csk_delete_keepalive_timer(sk);
+}
+
+
+static void tcp_keepalive_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 elapsed;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_LISTEN) {
+ pr_err("Hmm... keepalive on a LISTEN ???\n");
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+ if (tp->linger2 >= 0) {
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ goto death;
+ }
+
+ if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ elapsed = keepalive_time_when(tp);
+
+ /* It is alive without keepalive 8) */
+ if (tp->packets_out || tcp_send_head(sk))
+ goto resched;
+
+ elapsed = keepalive_time_elapsed(tp);
+
+ if (elapsed >= keepalive_time_when(tp)) {
+ /* If the TCP_USER_TIMEOUT option is enabled, use that
+ * to determine when to timeout instead.
+ */
+ if ((icsk->icsk_user_timeout != 0 &&
+ elapsed >= icsk->icsk_user_timeout &&
+ icsk->icsk_probes_out > 0) ||
+ (icsk->icsk_user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_write_err(sk);
+ goto out;
+ }
+ if (tcp_write_wakeup(sk) <= 0) {
+ icsk->icsk_probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+ /* If keepalive was lost due to local congestion,
+ * try harder.
+ */
+ elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+ }
+ } else {
+ /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+ elapsed = keepalive_time_when(tp) - elapsed;
+ }
+
+ sk_mem_reclaim(sk);
+
+resched:
+ inet_csk_reset_keepalive_timer (sk, elapsed);
+ goto out;
+
+death:
+ tcp_done(sk);
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000..a6cea1d5e
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,337 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * Lawrence S. Brakmo and Larry L. Peterson.
+ * "TCP Vegas: End to end congestion avoidance on a global internet."
+ * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ * October 1995. Available from:
+ * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ * o We do not change the loss detection or recovery mechanisms of
+ * Linux in any way. Linux already recovers from losses quite well,
+ * using fine-grained timers, NewReno, and FACK.
+ * o To avoid the performance penalty imposed by increasing cwnd
+ * only every-other RTT during slow start, we increase during
+ * every RTT during slow start, just like Reno.
+ * o Largely to allow continuous cwnd growth during slow start,
+ * we use the rate at which ACKs come back as the "actual"
+ * rate, rather than the rate at which data is sent.
+ * o To speed convergence to the right rate, we set the cwnd
+ * to achieve the right ("actual") rate when we exit slow start.
+ * o To filter out the noise caused by delayed ACKs, we use the
+ * minimum RTT sample observed during the last RTT to calculate
+ * the actual rate.
+ * o When the sender re-starts from idle, it waits until it has
+ * received ACKs for an entire flight of new data before making
+ * a cwnd adjustment decision. The original Vegas implementation
+ * assumed senders never went idle.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+static int alpha = 2;
+static int beta = 4;
+static int gamma = 1;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static void vegas_enable(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ /* Begin taking Vegas samples next time we send something. */
+ vegas->doing_vegas_now = 1;
+
+ /* Set the beginning of the next send window. */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->doing_vegas_now = 0;
+}
+
+void tcp_vegas_init(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->baseRTT = 0x7fffffff;
+ vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_init);
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ * o min-filter RTT samples from within an RTT to get the current
+ * propagation delay + queuing delay (we are min-filtering to try to
+ * avoid the effects of delayed ACKs)
+ * o min-filter RTT samples from a much longer window (forever for now)
+ * to find the propagation delay (baseRTT)
+ */
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+ u32 vrtt;
+
+ if (rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = rtt_us + 1;
+
+ /* Filter to find propagation delay: */
+ if (vrtt < vegas->baseRTT)
+ vegas->baseRTT = vrtt;
+
+ /* Find the min RTT during the last RTT to find
+ * the current prop. delay + queuing delay:
+ */
+ vegas->minRTT = min(vegas->minRTT, vrtt);
+ vegas->cntRTT++;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
+
+void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+ if (ca_state == TCP_CA_Open)
+ vegas_enable(sk);
+ else
+ vegas_disable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_state);
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples. So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_CWND_RESTART ||
+ event == CA_EVENT_TX_START)
+ tcp_vegas_init(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
+
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+ return min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ if (!vegas->doing_vegas_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ if (after(ack, vegas->beg_snd_nxt)) {
+ /* Do the Vegas once-per-RTT cwnd adjustment. */
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (vegas->cntRTT <= 2) {
+ /* We don't have enough RTT samples to do the Vegas
+ * calculation, so we'll behave like Reno.
+ */
+ tcp_reno_cong_avoid(sk, ack, acked);
+ } else {
+ u32 rtt, diff;
+ u64 target_cwnd;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = vegas->minRTT;
+
+ /* Calculate the cwnd we should have, if we weren't
+ * going too fast.
+ *
+ * This is:
+ * (actual rate in segments) * baseRTT
+ */
+ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ do_div(target_cwnd, rtt);
+
+ /* Calculate the difference between the window we had,
+ * and the window we would like to have. This quantity
+ * is the "Diff" from the Arizona Vegas papers.
+ */
+ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
+
+ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Going too fast. Time to slow down
+ * and switch to congestion avoidance.
+ */
+
+ /* Set cwnd to match the actual rate
+ * exactly:
+ * cwnd = (actual rate) * baseRTT
+ * Then we add 1 because the integer
+ * truncation robs us of full link
+ * utilization.
+ */
+ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+ tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+
+ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Slow start. */
+ tcp_slow_start(tp, acked);
+ } else {
+ /* Congestion avoidance. */
+
+ /* Figure out where we would like cwnd
+ * to be.
+ */
+ if (diff > beta) {
+ /* The old window was too fast, so
+ * we slow down.
+ */
+ tp->snd_cwnd--;
+ tp->snd_ssthresh
+ = tcp_vegas_ssthresh(tp);
+ } else if (diff < alpha) {
+ /* We don't have enough extra packets
+ * in the network, so speed up.
+ */
+ tp->snd_cwnd++;
+ } else {
+ /* Sending just as fast as we
+ * should be.
+ */
+ }
+ }
+
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ }
+
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+ }
+ /* Use normal slow start */
+ else if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct vegas *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = ca->doing_vegas_now,
+ info->vegas.tcpv_rttcnt = ca->cntRTT,
+ info->vegas.tcpv_rtt = ca->baseRTT,
+ info->vegas.tcpv_minrtt = ca->minRTT,
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
+
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
+ .init = tcp_vegas_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_vegas_cong_avoid,
+ .pkts_acked = tcp_vegas_pkts_acked,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_vegas);
+ return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 000000000..ef9da5306
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,25 @@
+/*
+ * TCP Vegas congestion control interface
+ */
+#ifndef __TCP_VEGAS_H
+#define __TCP_VEGAS_H 1
+
+/* Vegas variables */
+struct vegas {
+ u32 beg_snd_nxt; /* right edge during last RTT */
+ u32 beg_snd_una; /* left edge during last RTT */
+ u32 beg_snd_cwnd; /* saves the size of the cwnd */
+ u8 doing_vegas_now;/* if true, do vegas for this RTT */
+ u16 cntRTT; /* # of RTTs measured within last RTT */
+ u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
+ u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+void tcp_vegas_init(struct sock *sk);
+void tcp_vegas_state(struct sock *sk, u8 ca_state);
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
+
+#endif /* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 000000000..112151eee
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,232 @@
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * C. P. Fu, S. C. Liew.
+ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ * IEEE Journal on Selected Areas in Communication,
+ * Feb. 2003.
+ * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+
+/* Veno variables */
+struct veno {
+ u8 doing_veno_now; /* if true, do veno for this rtt */
+ u16 cntrtt; /* # of rtts measured within last rtt */
+ u32 minrtt; /* min of rtts measured within last rtt (in usec) */
+ u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */
+ u32 inc; /* decide whether to increase cwnd */
+ u32 diff; /* calculate the diff rate */
+};
+
+/* There are several situations when we must "re-start" Veno:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ */
+static inline void veno_enable(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ /* turn on Veno */
+ veno->doing_veno_now = 1;
+
+ veno->minrtt = 0x7fffffff;
+}
+
+static inline void veno_disable(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ /* turn off Veno */
+ veno->doing_veno_now = 0;
+}
+
+static void tcp_veno_init(struct sock *sk)
+{
+ struct veno *veno = inet_csk_ca(sk);
+
+ veno->basertt = 0x7fffffff;
+ veno->inc = 1;
+ veno_enable(sk);
+}
+
+/* Do rtt sampling needed for Veno. */
+static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+ struct veno *veno = inet_csk_ca(sk);
+ u32 vrtt;
+
+ if (rtt_us < 0)
+ return;
+
+ /* Never allow zero rtt or baseRTT */
+ vrtt = rtt_us + 1;
+
+ /* Filter to find propagation delay: */
+ if (vrtt < veno->basertt)
+ veno->basertt = vrtt;
+
+ /* Find the min rtt during the last rtt to find
+ * the current prop. delay + queuing delay:
+ */
+ veno->minrtt = min(veno->minrtt, vrtt);
+ veno->cntrtt++;
+}
+
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+ if (ca_state == TCP_CA_Open)
+ veno_enable(sk);
+ else
+ veno_disable(sk);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Veno calculations
+ * until we get fresh rtt samples. So when we
+ * restart, we reset our Veno state to a clean
+ * state. After we get acks for this flight of
+ * packets, _then_ we can make Veno calculations
+ * again.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START)
+ tcp_veno_init(sk);
+}
+
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct veno *veno = inet_csk_ca(sk);
+
+ if (!veno->doing_veno_now) {
+ tcp_reno_cong_avoid(sk, ack, acked);
+ return;
+ }
+
+ /* limited by applications */
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* We do the Veno calculations only if we got enough rtt samples */
+ if (veno->cntrtt <= 2) {
+ /* We don't have enough rtt samples to do the Veno
+ * calculation, so we'll behave like Reno.
+ */
+ tcp_reno_cong_avoid(sk, ack, acked);
+ } else {
+ u64 target_cwnd;
+ u32 rtt;
+
+ /* We have enough rtt samples, so, using the Veno
+ * algorithm, we determine the state of the network.
+ */
+
+ rtt = veno->minrtt;
+
+ target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
+ target_cwnd <<= V_PARAM_SHIFT;
+ do_div(target_cwnd, rtt);
+
+ veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* Slow start. */
+ tcp_slow_start(tp, acked);
+ } else {
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ } else {
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd++;
+ veno->inc = 0;
+ } else
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+ }
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+ }
+ /* Wipe the slate clean for the next rtt. */
+ /* veno->cntrtt = 0; */
+ veno->minrtt = 0x7fffffff;
+}
+
+/* Veno MD phase */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct veno *veno = inet_csk_ca(sk);
+
+ if (veno->diff < beta)
+ /* in "non-congestive state", cut cwnd by 1/5 */
+ return max(tp->snd_cwnd * 4 / 5, 2U);
+ else
+ /* in "congestive state", cut cwnd by 1/2 */
+ return max(tp->snd_cwnd >> 1U, 2U);
+}
+
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
+ .init = tcp_veno_init,
+ .ssthresh = tcp_veno_ssthresh,
+ .cong_avoid = tcp_veno_cong_avoid,
+ .pkts_acked = tcp_veno_pkts_acked,
+ .set_state = tcp_veno_state,
+ .cwnd_event = tcp_veno_cwnd_event,
+
+ .owner = THIS_MODULE,
+ .name = "veno",
+};
+
+static int __init tcp_veno_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_veno);
+ return 0;
+}
+
+static void __exit tcp_veno_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_veno);
+}
+
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000..c10732e39
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,305 @@
+/*
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
+ *
+ * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, s. Mascolo
+ * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ * Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood structure */
+struct westwood {
+ u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
+ u32 bw_est; /* bandwidth estimate */
+ u32 rtt_win_sx; /* here starts a new evaluation... */
+ u32 bk;
+ u32 snd_una; /* used for evaluating the number of acked bytes */
+ u32 cumul_ack;
+ u32 accounted;
+ u32 rtt;
+ u32 rtt_min; /* minimum observed RTT */
+ u8 first_ack; /* flag which infers that this is the first ack */
+ u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/
+};
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
+#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_create
+ * This function initializes fields used in TCP Westwood+,
+ * it is called after the initial SYN, so the sequence numbers
+ * are correct but new passive connections we have no
+ * information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->bk = 0;
+ w->bw_ns_est = 0;
+ w->bw_est = 0;
+ w->accounted = 0;
+ w->cumul_ack = 0;
+ w->reset_rtt_min = 1;
+ w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
+ w->rtt_win_sx = tcp_time_stamp;
+ w->snd_una = tcp_sk(sk)->snd_una;
+ w->first_ack = 1;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+ return ((7 * a) + b) >> 3;
+}
+
+static void westwood_filter(struct westwood *w, u32 delta)
+{
+ /* If the filter is empty fill it with the first sample of bandwidth */
+ if (w->bw_ns_est == 0 && w->bw_est == 0) {
+ w->bw_ns_est = w->bk / delta;
+ w->bw_est = w->bw_ns_est;
+ } else {
+ w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+ w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+ }
+}
+
+/*
+ * @westwood_pkts_acked
+ * Called after processing group of packets.
+ * but all westwood needs is the last sample of srtt.
+ */
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+ struct westwood *w = inet_csk_ca(sk);
+
+ if (rtt > 0)
+ w->rtt = usecs_to_jiffies(rtt);
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth.
+ */
+static void westwood_update_window(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+ s32 delta = tcp_time_stamp - w->rtt_win_sx;
+
+ /* Initialize w->snd_una with the first acked sequence number in order
+ * to fix mismatch between tp->snd_una and w->snd_una for the first
+ * bandwidth sample
+ */
+ if (w->first_ack) {
+ w->snd_una = tcp_sk(sk)->snd_una;
+ w->first_ack = 0;
+ }
+
+ /*
+ * See if a RTT-window has passed.
+ * Be careful since if RTT is less than
+ * 50ms we don't filter but we continue 'building the sample'.
+ * This minimum limit was chosen since an estimation on small
+ * time intervals is better to avoid...
+ * Obviously on a LAN we reasonably will always have
+ * right_bound = left_bound + WESTWOOD_RTT_MIN
+ */
+ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
+ westwood_filter(w, delta);
+
+ w->bk = 0;
+ w->rtt_win_sx = tcp_time_stamp;
+ }
+}
+
+static inline void update_rtt_min(struct westwood *w)
+{
+ if (w->reset_rtt_min) {
+ w->rtt_min = w->rtt;
+ w->reset_rtt_min = 0;
+ } else
+ w->rtt_min = min(w->rtt, w->rtt_min);
+}
+
+/*
+ * @westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successful. In such case in fact update is
+ * straight forward and doesn't need any particular care.
+ */
+static inline void westwood_fast_bw(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+
+ w->bk += tp->snd_una - w->snd_una;
+ w->snd_una = tp->snd_una;
+ update_rtt_min(w);
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating bk in case of
+ * delayed or partial acks.
+ */
+static inline u32 westwood_acked_count(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->cumul_ack = tp->snd_una - w->snd_una;
+
+ /* If cumul_ack is 0 this is a dupack since it's not moving
+ * tp->snd_una.
+ */
+ if (!w->cumul_ack) {
+ w->accounted += tp->mss_cache;
+ w->cumul_ack = tp->mss_cache;
+ }
+
+ if (w->cumul_ack > tp->mss_cache) {
+ /* Partial or delayed ack */
+ if (w->accounted >= w->cumul_ack) {
+ w->accounted -= w->cumul_ack;
+ w->cumul_ack = tp->mss_cache;
+ } else {
+ w->cumul_ack -= w->accounted;
+ w->accounted = 0;
+ }
+ }
+
+ w->snd_una = tp->snd_una;
+
+ return w->cumul_ack;
+}
+
+/*
+ * TCP Westwood
+ * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
+ * so avoids ever returning 0.
+ */
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct westwood *w = inet_csk_ca(sk);
+
+ return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
+}
+
+static void tcp_westwood_ack(struct sock *sk, u32 ack_flags)
+{
+ if (ack_flags & CA_ACK_SLOWPATH) {
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+
+ update_rtt_min(w);
+ return;
+ }
+
+ westwood_fast_bw(sk);
+}
+
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ switch (event) {
+ case CA_EVENT_COMPLETE_CWR:
+ tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ break;
+ case CA_EVENT_LOSS:
+ tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ /* Update RTT_min when next ack arrives */
+ w->reset_rtt_min = 1;
+ break;
+ default:
+ /* don't care */
+ break;
+ }
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info)
+{
+ const struct westwood *ca = inet_csk_ca(sk);
+
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ info->vegas.tcpv_enabled = 1;
+ info->vegas.tcpv_rttcnt = 0;
+ info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt),
+ info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
+
+ *attr = INET_DIAG_VEGASINFO;
+ return sizeof(struct tcpvegas_info);
+ }
+ return 0;
+}
+
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
+ .init = tcp_westwood_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .cwnd_event = tcp_westwood_event,
+ .in_ack_event = tcp_westwood_ack,
+ .get_info = tcp_westwood_info,
+ .pkts_acked = tcp_westwood_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+ BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 000000000..17d356629
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,255 @@
+/*
+ *
+ * YeAH TCP
+ *
+ * For further details look at:
+ * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */
+#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */
+#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */
+#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */
+#define TCP_YEAH_PHY 8 /* maximum delta from base */
+#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */
+#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */
+
+#define TCP_SCALABLE_AI_CNT 100U
+
+/* YeAH variables */
+struct yeah {
+ struct vegas vegas; /* must be first */
+
+ /* YeAH */
+ u32 lastQ;
+ u32 doing_reno_now;
+
+ u32 reno_count;
+ u32 fast_count;
+
+ u32 pkts_acked;
+};
+
+static void tcp_yeah_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ tcp_vegas_init(sk);
+
+ yeah->doing_reno_now = 0;
+ yeah->lastQ = 0;
+
+ yeah->reno_count = 2;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ if (icsk->icsk_ca_state == TCP_CA_Open)
+ yeah->pkts_acked = pkts_acked;
+
+ tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+}
+
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp, acked);
+
+ else if (!yeah->doing_reno_now) {
+ /* Scalable */
+
+ tp->snd_cwnd_cnt += yeah->pkts_acked;
+ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ }
+
+ yeah->pkts_acked = 1;
+
+ } else {
+ /* Reno */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ }
+
+ /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
+ *
+ * These are so named because they represent the approximate values
+ * of snd_una and snd_nxt at the beginning of the current RTT. More
+ * precisely, they represent the amount of data sent during the RTT.
+ * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+ * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding
+ * bytes of data have been ACKed during the course of the RTT, giving
+ * an "actual" rate of:
+ *
+ * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration)
+ *
+ * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una,
+ * because delayed ACKs can cover more than one segment, so they
+ * don't line up yeahly with the boundaries of RTTs.
+ *
+ * Another unfortunate fact of life is that delayed ACKs delay the
+ * advance of the left edge of our send window, so that the number
+ * of bytes we send in an RTT is often less than our cwnd will allow.
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+ */
+
+ if (after(ack, yeah->vegas.beg_snd_nxt)) {
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (yeah->vegas.cntRTT > 2) {
+ u32 rtt, queue;
+ u64 bw;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = yeah->vegas.minRTT;
+
+ /* Compute excess number of packets above bandwidth
+ * Avoid doing full 64 bit divide.
+ */
+ bw = tp->snd_cwnd;
+ bw *= rtt - yeah->vegas.baseRTT;
+ do_div(bw, rtt);
+ queue = bw;
+
+ if (queue > TCP_YEAH_ALPHA ||
+ rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
+ if (queue > TCP_YEAH_ALPHA &&
+ tp->snd_cwnd > yeah->reno_count) {
+ u32 reduction = min(queue / TCP_YEAH_GAMMA ,
+ tp->snd_cwnd >> TCP_YEAH_EPSILON);
+
+ tp->snd_cwnd -= reduction;
+
+ tp->snd_cwnd = max(tp->snd_cwnd,
+ yeah->reno_count);
+
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+
+ if (yeah->reno_count <= 2)
+ yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
+ else
+ yeah->reno_count++;
+
+ yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
+ 0xffffffU);
+ } else {
+ yeah->fast_count++;
+
+ if (yeah->fast_count > TCP_YEAH_ZETA) {
+ yeah->reno_count = 2;
+ yeah->fast_count = 0;
+ }
+
+ yeah->doing_reno_now = 0;
+ }
+
+ yeah->lastQ = queue;
+ }
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
+ yeah->vegas.beg_snd_nxt = tp->snd_nxt;
+ yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+ /* Wipe the slate clean for the next RTT. */
+ yeah->vegas.cntRTT = 0;
+ yeah->vegas.minRTT = 0x7fffffff;
+ }
+}
+
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct yeah *yeah = inet_csk_ca(sk);
+ u32 reduction;
+
+ if (yeah->doing_reno_now < TCP_YEAH_RHO) {
+ reduction = yeah->lastQ;
+
+ reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
+
+ reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ } else
+ reduction = max(tp->snd_cwnd>>1, 2U);
+
+ yeah->fast_count = 0;
+ yeah->reno_count = max(yeah->reno_count>>1, 2U);
+
+ return tp->snd_cwnd - reduction;
+}
+
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
+ .init = tcp_yeah_init,
+ .ssthresh = tcp_yeah_ssthresh,
+ .cong_avoid = tcp_yeah_cong_avoid,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+ .pkts_acked = tcp_yeah_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "yeah",
+};
+
+static int __init tcp_yeah_register(void)
+{
+ BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_yeah);
+ return 0;
+}
+
+static void __exit tcp_yeah_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_yeah);
+}
+
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
new file mode 100644
index 000000000..0d0171830
--- /dev/null
+++ b/net/ipv4/tunnel4.c
@@ -0,0 +1,192 @@
+/* tunnel4.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
+static DEFINE_MUTEX(tunnel4_mutex);
+
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
+{
+ return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
+}
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ mutex_lock(&tunnel4_mutex);
+
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&tunnel4_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
+{
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+ int ret = -ENOENT;
+
+ mutex_lock(&tunnel4_mutex);
+
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ mutex_unlock(&tunnel4_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+static int tunnel4_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int tunnel64_rcv(struct sk_buff *skb)
+{
+ struct xfrm_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+
+static void tunnel4_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void tunnel64_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+#endif
+
+static const struct net_protocol tunnel4_protocol = {
+ .handler = tunnel4_rcv,
+ .err_handler = tunnel4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static const struct net_protocol tunnel64_protocol = {
+ .handler = tunnel64_rcv,
+ .err_handler = tunnel64_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+#endif
+
+static int __init tunnel4_init(void)
+{
+ if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
+ pr_err("tunnel64 init: can't add protocol\n");
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
+#endif
+ return 0;
+}
+
+static void __exit tunnel4_fini(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
+ pr_err("tunnel64 close: can't remove protocol\n");
+#endif
+ if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+ pr_err("tunnel4 close: can't remove protocol\n");
+}
+
+module_init(tunnel4_init);
+module_exit(tunnel4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 000000000..83aa604f9
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,2555 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The User Datagram Protocol (UDP).
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ * Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() calls
+ * Alan Cox : stopped close while in use off icmp
+ * messages. Not a fix but a botch that
+ * for udp at least is 'valid'.
+ * Alan Cox : Fixed icmp handling properly
+ * Alan Cox : Correct error for oversized datagrams
+ * Alan Cox : Tidied select() semantics.
+ * Alan Cox : udp_err() fixed properly, also now
+ * select and read wake correctly on errors
+ * Alan Cox : udp_send verify_area moved to avoid mem leak
+ * Alan Cox : UDP can count its memory
+ * Alan Cox : send to an unknown connection causes
+ * an ECONNREFUSED off the icmp, but
+ * does NOT close.
+ * Alan Cox : Switched to new sk_buff handlers. No more backlog!
+ * Alan Cox : Using generic datagram code. Even smaller and the PEEK
+ * bug no longer crashes it.
+ * Fred Van Kempen : Net2e support for sk->broadcast.
+ * Alan Cox : Uses skb_free_datagram
+ * Alan Cox : Added get/set sockopt support.
+ * Alan Cox : Broadcasting without option set returns EACCES.
+ * Alan Cox : No wakeup calls. Instead we now use the callbacks.
+ * Alan Cox : Use ip_tos and ip_ttl
+ * Alan Cox : SNMP Mibs
+ * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
+ * Matt Dillon : UDP length checks.
+ * Alan Cox : Smarter af_inet used properly.
+ * Alan Cox : Use new kernel side addressing.
+ * Alan Cox : Incorrect return on truncated datagram receive.
+ * Arnt Gulbrandsen : New udp_send and stuff
+ * Alan Cox : Cache last socket
+ * Alan Cox : Route cache
+ * Jon Peatfield : Minor efficiency fix to sendto().
+ * Mike Shaver : RFC1122 checks.
+ * Alan Cox : Nonblocking error fix.
+ * Willy Konynenberg : Transparent proxying support.
+ * Mike McLagan : Routing by source
+ * David S. Miller : New socket lookup architecture.
+ * Last socket cache retained as it
+ * does have a high hit rate.
+ * Olaf Kirch : Don't linearise iovec on sendmsg.
+ * Andi Kleen : Some cleanups, cache destination entry
+ * for connect.
+ * Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Melvin Smith : Check msg_name not msg_namelen in sendto(),
+ * return ENOTCONN for unconnected sockets (POSIX)
+ * Janos Farkas : don't deliver multi/broadcasts to a different
+ * bound-to-device socket
+ * Hirokazu Takahashi : HW checksumming for outgoing UDP
+ * datagrams.
+ * Hirokazu Takahashi : sendfile() on UDP works now.
+ * Arnaldo C. Melo : convert /proc/net/udp to seq_file
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * James Chapman : Add L2TP encapsulation type.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "UDP: " fmt
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/tcp_states.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/route.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <trace/events/udp.h>
+#include <linux/static_key.h>
+#include <trace/events/skb.h>
+#include <net/busy_poll.h>
+#include "udp_impl.h"
+
+struct udp_table udp_table __read_mostly;
+EXPORT_SYMBOL(udp_table);
+
+long sysctl_udp_mem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_mem);
+
+int sysctl_udp_rmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+
+int sysctl_udp_wmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
+atomic_long_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+ const struct udp_hslot *hslot,
+ unsigned long *bitmap,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2),
+ unsigned int log)
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each(sk2, node, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
+ if (!bitmap)
+ return 1;
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+ struct udp_hslot *hslot2,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+ int res = 0;
+
+ spin_lock(&hslot2->lock);
+ udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (!sk2->sk_reuseport || !sk->sk_reuseport ||
+ !uid_eq(uid, sock_i_uid(sk2))) &&
+ saddr_comp(sk, sk2)) {
+ res = 1;
+ break;
+ }
+ }
+ spin_unlock(&hslot2->lock);
+ return res;
+}
+
+/**
+ * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
+ *
+ * @sk: socket struct in question
+ * @snum: port number to look up
+ * @saddr_comp: AF-dependent comparison of bound local IP addresses
+ * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ * with NULL address
+ */
+int udp_lib_get_port(struct sock *sk, unsigned short snum,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2),
+ unsigned int hash2_nulladdr)
+{
+ struct udp_hslot *hslot, *hslot2;
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ int error = 1;
+ struct net *net = sock_net(sk);
+
+ if (!snum) {
+ int low, high, remaining;
+ unsigned int rand;
+ unsigned short first, last;
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+
+ rand = prandom_u32();
+ first = reciprocal_scale(rand, remaining) + low;
+ /*
+ * force rand to be an odd multiple of UDP_HTABLE_SIZE
+ */
+ rand = (rand | 1) * (udptable->mask + 1);
+ last = first + udptable->mask + 1;
+ do {
+ hslot = udp_hashslot(udptable, net, first);
+ bitmap_zero(bitmap, PORTS_PER_CHAIN);
+ spin_lock_bh(&hslot->lock);
+ udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+ saddr_comp, udptable->log);
+
+ snum = first;
+ /*
+ * Iterate on all possible values of snum for this hash.
+ * Using steps of an odd multiple of UDP_HTABLE_SIZE
+ * give us randomization and full range coverage.
+ */
+ do {
+ if (low <= snum && snum <= high &&
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_local_reserved_port(net, snum))
+ goto found;
+ snum += rand;
+ } while (snum != first);
+ spin_unlock_bh(&hslot->lock);
+ } while (++first != last);
+ goto fail;
+ } else {
+ hslot = udp_hashslot(udptable, net, snum);
+ spin_lock_bh(&hslot->lock);
+ if (hslot->count > 10) {
+ int exist;
+ unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+ slot2 &= udptable->mask;
+ hash2_nulladdr &= udptable->mask;
+
+ hslot2 = udp_hashslot2(udptable, slot2);
+ if (hslot->count < hslot2->count)
+ goto scan_primary_hash;
+
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ if (!exist && (hash2_nulladdr != slot2)) {
+ hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ }
+ if (exist)
+ goto fail_unlock;
+ else
+ goto found;
+ }
+scan_primary_hash:
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+ saddr_comp, 0))
+ goto fail_unlock;
+ }
+found:
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
+ if (sk_unhashed(sk)) {
+ sk_nulls_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
+ }
+ error = 0;
+fail_unlock:
+ spin_unlock_bh(&hslot->lock);
+fail:
+ return error;
+}
+EXPORT_SYMBOL(udp_lib_get_port);
+
+static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+{
+ struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
+
+ return (!ipv6_only_sock(sk2) &&
+ (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+ inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+}
+
+static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
+ unsigned int port)
+{
+ return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
+}
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+ unsigned int hash2_nulladdr =
+ udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
+ unsigned int hash2_partial =
+ udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+}
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, unsigned short hnum, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ ipv6_only_sock(sk))
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr) {
+ if (inet->inet_rcv_saddr != daddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+
+ return score;
+}
+
+/*
+ * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
+ */
+static inline int compute_score2(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ ipv6_only_sock(sk))
+ return -1;
+
+ inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr != daddr ||
+ inet->inet_num != hnum)
+ return -1;
+
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 4;
+ }
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 4;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score += 4;
+ }
+
+ return score;
+}
+
+static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
+{
+ static u32 udp_ehash_secret __read_mostly;
+
+ net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
+
+ return __inet_ehashfn(laddr, lport, faddr, fport,
+ udp_ehash_secret + net_hash_mix(net));
+}
+
+/* called with read_rcu_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif,
+ struct udp_hslot *hslot2, unsigned int slot2)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+begin:
+ result = NULL;
+ badness = 0;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ score = compute_score2(sk, net, saddr, sport,
+ daddr, hnum, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot2)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score2(result, net, saddr, sport,
+ daddr, hnum, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ return result;
+}
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ */
+struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+ __be16 sport, __be32 daddr, __be16 dport,
+ int dif, struct udp_table *udptable)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(dport);
+ unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+ rcu_read_lock();
+ if (hslot->count > 10) {
+ hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif,
+ hslot2, slot2);
+ if (!result) {
+ hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ htonl(INADDR_ANY), hnum, dif,
+ hslot2, slot2);
+ }
+ rcu_read_unlock();
+ return result;
+ }
+begin:
+ result = NULL;
+ badness = 0;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ score = compute_score(sk, net, saddr, hnum, sport,
+ daddr, dport, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+ daddr, dport, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
+
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport,
+ struct udp_table *udptable)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ iph->daddr, dport, inet_iif(skb),
+ udptable);
+}
+
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+ __be32 daddr, __be16 dport, int dif)
+{
+ return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+
+static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif, unsigned short hnum)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(sk) ||
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ return false;
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ return false;
+ return true;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header. We need
+ * to find the appropriate port.
+ */
+
+void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+{
+ struct inet_sock *inet;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct sock *sk;
+ int harderr;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
+ iph->saddr, uh->source, skb->dev->ifindex, udptable);
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return; /* No socket for error */
+ }
+
+ err = 0;
+ harderr = 0;
+ inet = inet_sk(sk);
+
+ switch (type) {
+ default:
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_SOURCE_QUENCH:
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ harderr = 1;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+ ipv4_sk_update_pmtu(skb, sk, info);
+ if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ err = EMSGSIZE;
+ harderr = 1;
+ break;
+ }
+ goto out;
+ }
+ err = EHOSTUNREACH;
+ if (code <= NR_ICMP_UNREACH) {
+ harderr = icmp_err_convert[code].fatal;
+ err = icmp_err_convert[code].errno;
+ }
+ break;
+ case ICMP_REDIRECT:
+ ipv4_sk_redirect(skb, sk);
+ goto out;
+ }
+
+ /*
+ * RFC1122: OK. Passes ICMP errors back to application, as per
+ * 4.1.3.3.
+ */
+ if (!inet->recverr) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else
+ ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+
+void udp_err(struct sk_buff *skb, u32 info)
+{
+ __udp4_lib_err(skb, info, &udp_table);
+}
+
+/*
+ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+void udp_flush_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+
+ if (up->pending) {
+ up->len = 0;
+ up->pending = 0;
+ ip_flush_pending_frames(sk);
+ }
+}
+EXPORT_SYMBOL(udp_flush_pending_frames);
+
+/**
+ * udp4_hwcsum - handle outgoing HW checksumming
+ * @skb: sk_buff containing the filled-in UDP header
+ * (checksum field must be zeroed out)
+ * @src: source IP address
+ * @dst: destination IP address
+ */
+void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ int hlen = len;
+ __wsum csum = 0;
+
+ if (!skb_has_frag_list(skb)) {
+ /*
+ * Only one fragment on the socket.
+ */
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~csum_tcpudp_magic(src, dst, len,
+ IPPROTO_UDP, 0);
+ } else {
+ struct sk_buff *frags;
+
+ /*
+ * HW-checksum won't work as there are two or more
+ * fragments on the socket so that all csums of sk_buffs
+ * should be together
+ */
+ skb_walk_frags(skb, frags) {
+ csum = csum_add(csum, frags->csum);
+ hlen -= frags->len;
+ }
+
+ csum = skb_checksum(skb, offset, hlen, csum);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+}
+EXPORT_SYMBOL_GPL(udp4_hwcsum);
+
+/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
+ */
+void udp_set_csum(bool nocheck, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v4_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v4_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
+EXPORT_SYMBOL(udp_set_csum);
+
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct udphdr *uh;
+ int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+ __wsum csum = 0;
+
+ /*
+ * Create a UDP header
+ */
+ uh = udp_hdr(skb);
+ uh->source = inet->inet_sport;
+ uh->dest = fl4->fl4_dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ if (is_udplite) /* UDP-Lite */
+ csum = udplite_csum(skb);
+
+ else if (sk->sk_no_check_tx) { /* UDP csum disabled */
+
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+
+ udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
+ goto send;
+
+ } else
+ csum = udp_csum(skb);
+
+ /* add protocol-dependent pseudo-header */
+ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
+ sk->sk_protocol, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+send:
+ err = ip_send_skb(sock_net(sk), skb);
+ if (err) {
+ if (err == -ENOBUFS && !inet->recverr) {
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ err = 0;
+ }
+ } else
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+int udp_push_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = ip_finish_skb(sk, fl4);
+ if (!skb)
+ goto out;
+
+ err = udp_send_skb(skb, fl4);
+
+out:
+ up->len = 0;
+ up->pending = 0;
+ return err;
+}
+EXPORT_SYMBOL(udp_push_pending_frames);
+
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ struct flowi4 fl4_stack;
+ struct flowi4 *fl4;
+ int ulen = len;
+ struct ipcm_cookie ipc;
+ struct rtable *rt = NULL;
+ int free = 0;
+ int connected = 0;
+ __be32 daddr, faddr, saddr;
+ __be16 dport;
+ u8 tos;
+ int err, is_udplite = IS_UDPLITE(sk);
+ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
+ struct ip_options_data opt_copy;
+
+ if (len > 0xFFFF)
+ return -EMSGSIZE;
+
+ /*
+ * Check the flags.
+ */
+
+ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
+ return -EOPNOTSUPP;
+
+ ipc.opt = NULL;
+ ipc.tx_flags = 0;
+ ipc.ttl = 0;
+ ipc.tos = -1;
+
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
+ fl4 = &inet->cork.fl.u.ip4;
+ if (up->pending) {
+ /*
+ * There are pending frames.
+ * The socket lock must be held while it's corked.
+ */
+ lock_sock(sk);
+ if (likely(up->pending)) {
+ if (unlikely(up->pending != AF_INET)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ goto do_append_data;
+ }
+ release_sock(sk);
+ }
+ ulen += sizeof(struct udphdr);
+
+ /*
+ * Get and verify the address.
+ */
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*usin))
+ return -EINVAL;
+ if (usin->sin_family != AF_INET) {
+ if (usin->sin_family != AF_UNSPEC)
+ return -EAFNOSUPPORT;
+ }
+
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ if (dport == 0)
+ return -EINVAL;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = inet->inet_daddr;
+ dport = inet->inet_dport;
+ /* Open fast path for connected socket.
+ Route will not be used, if at least one option is set.
+ */
+ connected = 1;
+ }
+ ipc.addr = inet->inet_saddr;
+
+ ipc.oif = sk->sk_bound_dev_if;
+
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
+ if (msg->msg_controllen) {
+ err = ip_cmsg_send(sock_net(sk), msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (err)
+ return err;
+ if (ipc.opt)
+ free = 1;
+ connected = 0;
+ }
+ if (!ipc.opt) {
+ struct ip_options_rcu *inet_opt;
+
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ if (inet_opt) {
+ memcpy(&opt_copy, inet_opt,
+ sizeof(*inet_opt) + inet_opt->opt.optlen);
+ ipc.opt = &opt_copy.opt;
+ }
+ rcu_read_unlock();
+ }
+
+ saddr = ipc.addr;
+ ipc.addr = faddr = daddr;
+
+ if (ipc.opt && ipc.opt->opt.srr) {
+ if (!daddr)
+ return -EINVAL;
+ faddr = ipc.opt->opt.faddr;
+ connected = 0;
+ }
+ tos = get_rttos(&ipc, inet);
+ if (sock_flag(sk, SOCK_LOCALROUTE) ||
+ (msg->msg_flags & MSG_DONTROUTE) ||
+ (ipc.opt && ipc.opt->opt.is_strictroute)) {
+ tos |= RTO_ONLINK;
+ connected = 0;
+ }
+
+ if (ipv4_is_multicast(daddr)) {
+ if (!ipc.oif)
+ ipc.oif = inet->mc_index;
+ if (!saddr)
+ saddr = inet->mc_addr;
+ connected = 0;
+ } else if (!ipc.oif)
+ ipc.oif = inet->uc_index;
+
+ if (connected)
+ rt = (struct rtable *)sk_dst_check(sk, 0);
+
+ if (!rt) {
+ struct net *net = sock_net(sk);
+
+ fl4 = &fl4_stack;
+ flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ RT_SCOPE_UNIVERSE, sk->sk_protocol,
+ inet_sk_flowi_flags(sk),
+ faddr, saddr, dport, inet->inet_sport);
+
+ security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ rt = ip_route_output_flow(net, fl4, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
+ if (err == -ENETUNREACH)
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+ goto out;
+ }
+
+ err = -EACCES;
+ if ((rt->rt_flags & RTCF_BROADCAST) &&
+ !sock_flag(sk, SOCK_BROADCAST))
+ goto out;
+ if (connected)
+ sk_dst_set(sk, dst_clone(&rt->dst));
+ }
+
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ saddr = fl4->saddr;
+ if (!ipc.addr)
+ daddr = ipc.addr = fl4->daddr;
+
+ /* Lockless fast path for the non-corking case. */
+ if (!corkreq) {
+ skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ msg->msg_flags);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_send_skb(skb, fl4);
+ goto out;
+ }
+
+ lock_sock(sk);
+ if (unlikely(up->pending)) {
+ /* The socket is already corked while preparing it. */
+ /* ... which is an evident application bug. --ANK */
+ release_sock(sk);
+
+ net_dbg_ratelimited("cork app bug 2\n");
+ err = -EINVAL;
+ goto out;
+ }
+ /*
+ * Now cork the socket to pend data.
+ */
+ fl4 = &inet->cork.fl.u.ip4;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->fl4_dport = dport;
+ fl4->fl4_sport = inet->inet_sport;
+ up->pending = AF_INET;
+
+do_append_data:
+ up->len += ulen;
+ err = ip_append_data(sk, fl4, getfrag, msg, ulen,
+ sizeof(struct udphdr), &ipc, &rt,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+ if (err)
+ udp_flush_pending_frames(sk);
+ else if (!corkreq)
+ err = udp_push_pending_frames(sk);
+ else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+ up->pending = 0;
+ release_sock(sk);
+
+out:
+ ip_rt_put(rt);
+ if (free)
+ kfree(ipc.opt);
+ if (!err)
+ return len;
+ /*
+ * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
+ * ENOBUFS might not be good (it's not tunable per se), but otherwise
+ * we don't have a good statistic (IpOutDiscards but it can be too many
+ * things). We could add another new stat but at least for now that
+ * seems like overkill.
+ */
+ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(&rt->dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+EXPORT_SYMBOL(udp_sendmsg);
+
+int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct udp_sock *up = udp_sk(sk);
+ int ret;
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
+ if (!up->pending) {
+ struct msghdr msg = { .msg_flags = flags|MSG_MORE };
+
+ /* Call udp_sendmsg to specify destination address which
+ * sendpage interface can't pass.
+ * This will succeed only when the socket is connected.
+ */
+ ret = udp_sendmsg(sk, &msg, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ lock_sock(sk);
+
+ if (unlikely(!up->pending)) {
+ release_sock(sk);
+
+ net_dbg_ratelimited("udp cork app bug 3\n");
+ return -EINVAL;
+ }
+
+ ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
+ page, offset, size, flags);
+ if (ret == -EOPNOTSUPP) {
+ release_sock(sk);
+ return sock_no_sendpage(sk->sk_socket, page, offset,
+ size, flags);
+ }
+ if (ret < 0) {
+ udp_flush_pending_frames(sk);
+ goto out;
+ }
+
+ up->len += size;
+ if (!(up->corkflag || (flags&MSG_MORE)))
+ ret = udp_push_pending_frames(sk);
+ if (!ret)
+ ret = size;
+out:
+ release_sock(sk);
+ return ret;
+}
+
+/**
+ * first_packet_length - return length of first packet in receive queue
+ * @sk: socket
+ *
+ * Drops all bad checksum frames, until a valid one is found.
+ * Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+ struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+ struct sk_buff *skb;
+ unsigned int res;
+
+ __skb_queue_head_init(&list_kill);
+
+ spin_lock_bh(&rcvq->lock);
+ while ((skb = skb_peek(rcvq)) != NULL &&
+ udp_lib_checksum_complete(skb)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ atomic_inc(&sk->sk_drops);
+ __skb_unlink(skb, rcvq);
+ __skb_queue_tail(&list_kill, skb);
+ }
+ res = skb ? skb->len : 0;
+ spin_unlock_bh(&rcvq->lock);
+
+ if (!skb_queue_empty(&list_kill)) {
+ bool slow = lock_sock_fast(sk);
+
+ __skb_queue_purge(&list_kill);
+ sk_mem_reclaim_partial(sk);
+ unlock_sock_fast(sk, slow);
+ }
+ return res;
+}
+
+/*
+ * IOCTL requests applicable to the UDP protocol
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+
+ case SIOCINQ:
+ {
+ unsigned int amount = first_packet_length(sk);
+
+ if (amount)
+ /*
+ * We will only return the amount
+ * of this packet since that is all
+ * that will be read.
+ */
+ amount -= sizeof(struct udphdr);
+
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(udp_ioctl);
+
+/*
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
+ */
+
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+ unsigned int ulen, copied;
+ int peeked, off = 0;
+ int err;
+ int is_udplite = IS_UDPLITE(sk);
+ bool slow;
+
+ if (flags & MSG_ERRQUEUE)
+ return ip_recv_error(sk, msg, len, addr_len);
+
+try_again:
+ skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &off, &err);
+ if (!skb)
+ goto out;
+
+ ulen = skb->len - sizeof(struct udphdr);
+ copied = len;
+ if (copied > ulen)
+ copied = ulen;
+ else if (copied < ulen)
+ msg->msg_flags |= MSG_TRUNC;
+
+ /*
+ * If checksum is needed at all, try to do it while copying the
+ * data. If the data is truncated, or if we only want a partial
+ * coverage checksum (UDP-Lite), do it before the copy.
+ */
+
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_lib_checksum_complete(skb))
+ goto csum_copy_err;
+ }
+
+ if (skb_csum_unnecessary(skb))
+ err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+ msg, copied);
+ else {
+ err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
+ msg);
+
+ if (err == -EINVAL)
+ goto csum_copy_err;
+ }
+
+ if (unlikely(err)) {
+ trace_kfree_skb(skb, udp_recvmsg);
+ if (!peeked) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ }
+ goto out_free;
+ }
+
+ if (!peeked)
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = udp_hdr(skb)->source;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
+
+ err = copied;
+ if (flags & MSG_TRUNC)
+ err = ulen;
+
+out_free:
+ skb_free_datagram_locked(sk, skb);
+out:
+ return err;
+
+csum_copy_err:
+ slow = lock_sock_fast(sk);
+ if (!skb_kill_datagram(sk, skb, flags)) {
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ }
+ unlock_sock_fast(sk, slow);
+
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
+ msg->msg_flags &= ~MSG_TRUNC;
+ goto try_again;
+}
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ /*
+ * 1003.1g - break association.
+ */
+
+ sk->sk_state = TCP_CLOSE;
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sock_rps_reset_rxhash(sk);
+ sk->sk_bound_dev_if = 0;
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+ sk->sk_prot->unhash(sk);
+ inet->inet_sport = 0;
+ }
+ sk_dst_reset(sk);
+ return 0;
+}
+EXPORT_SYMBOL(udp_disconnect);
+
+void udp_lib_unhash(struct sock *sk)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2;
+
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+
+ spin_lock_bh(&hslot->lock);
+ if (sk_nulls_del_node_init_rcu(sk)) {
+ hslot->count--;
+ inet_sk(sk)->inet_num = 0;
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ nhslot2 = udp_hashslot2(udptable, newhash);
+ udp_sk(sk)->udp_portaddr_hash = newhash;
+ if (hslot2 != nhslot2) {
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ /* we must lock primary chain too */
+ spin_lock_bh(&hslot->lock);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+ }
+ }
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+ u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ udp_lib_rehash(sk, new_hash);
+}
+
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ }
+
+ rc = sock_queue_rcv_skb(sk, skb);
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
+
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM)
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ kfree_skb(skb);
+ trace_udp_fail_queue_rcv_skb(rc, sk);
+ return -1;
+ }
+
+ return 0;
+
+}
+
+static struct static_key udp_encap_needed __read_mostly;
+void udp_encap_enable(void)
+{
+ if (!static_key_enabled(&udp_encap_needed))
+ static_key_slow_inc(&udp_encap_needed);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* returns:
+ * -1: error
+ * 0: success
+ * >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int rc;
+ int is_udplite = IS_UDPLITE(sk);
+
+ /*
+ * Charge it to the socket, dropping if the queue is full.
+ */
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto drop;
+ nf_reset(skb);
+
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+ * This is an encapsulation socket so pass the skb to
+ * the socket's udp_encap_rcv() hook. Otherwise, just
+ * fall through and pass this up the UDP socket.
+ * up->encap_rcv() returns the following value:
+ * =0 if skb was successfully passed to the encap
+ * handler or was discarded by it.
+ * >0 if skb should be passed on to UDP.
+ * <0 if skb should be resubmitted as proto -N
+ */
+
+ /* if we're overly short, let UDP handle it */
+ encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+ int ret;
+
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
+ if (ret <= 0) {
+ UDP_INC_STATS_BH(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
+ return -ret;
+ }
+ }
+
+ /* FALLTHROUGH -- it's a UDP Packet */
+ }
+
+ /*
+ * UDP-Lite specific tests, ignored on UDP sockets
+ */
+ if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+
+ /*
+ * MIB statistics other than incrementing the error count are
+ * disabled for the following two types of errors: these depend
+ * on the application settings, not on the functioning of the
+ * protocol stack as such.
+ *
+ * RFC 3828 here recommends (sec 3.3): "There should also be a
+ * way ... to ... at least let the receiving application block
+ * delivery of packets with coverage values less than a value
+ * provided by the application."
+ */
+ if (up->pcrlen == 0) { /* full coverage was set */
+ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
+ goto drop;
+ }
+ /* The next case involves violating the min. coverage requested
+ * by the receiver. This is subtle: if receiver wants x and x is
+ * greater than the buffersize/MTU then receiver will complain
+ * that it wants x while sender emits packets of smaller size y.
+ * Therefore the above ...()->partial_cov statement is essential.
+ */
+ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ goto drop;
+ }
+ }
+
+ if (rcu_access_pointer(sk->sk_filter) &&
+ udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ goto drop;
+ }
+
+ rc = 0;
+
+ ipv4_pktinfo_prepare(sk, skb);
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk))
+ rc = __udp_queue_rcv_skb(sk, skb);
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ goto drop;
+ }
+ bh_unlock_sock(sk);
+
+ return rc;
+
+csum_error:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+drop:
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return -1;
+}
+
+static void flush_stack(struct sock **stack, unsigned int count,
+ struct sk_buff *skb, unsigned int final)
+{
+ unsigned int i;
+ struct sk_buff *skb1 = NULL;
+ struct sock *sk;
+
+ for (i = 0; i < count; i++) {
+ sk = stack[i];
+ if (likely(!skb1))
+ skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb1) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ }
+
+ if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+ skb1 = NULL;
+
+ sock_put(sk);
+ }
+ if (unlikely(skb1))
+ kfree_skb(skb1);
+}
+
+/* For TCP sockets, sk_rx_dst is protected by socket lock
+ * For UDP, we use xchg() to guard against concurrent changes.
+ */
+static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+ struct dst_entry *old;
+
+ dst_hold(dst);
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+}
+
+/*
+ * Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context.
+ */
+static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+ struct udphdr *uh,
+ __be32 saddr, __be32 daddr,
+ struct udp_table *udptable,
+ int proto)
+{
+ struct sock *sk, *stack[256 / sizeof(struct sock *)];
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ int dif = skb->dev->ifindex;
+ unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ bool inner_flushed = false;
+
+ if (use_hash2) {
+ hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ udp_table.mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+ hslot = &udp_table.hash2[hash2];
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
+
+ spin_lock(&hslot->lock);
+ sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+ if (__udp_is_mcast_sock(net, sk,
+ uh->dest, daddr,
+ uh->source, saddr,
+ dif, hnum)) {
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ flush_stack(stack, count, skb, ~0);
+ inner_flushed = true;
+ count = 0;
+ }
+ stack[count++] = sk;
+ sock_hold(sk);
+ }
+ }
+
+ spin_unlock(&hslot->lock);
+
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
+ /*
+ * do the slow work with no lock held
+ */
+ if (count) {
+ flush_stack(stack, count, skb, count - 1);
+ } else {
+ if (!inner_flushed)
+ UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
+ consume_skb(skb);
+ }
+ return 0;
+}
+
+/* Initialize UDP checksum. If exited with zero value (success),
+ * CHECKSUM_UNNECESSARY means, that no more checks are required.
+ * Otherwise, csum completion requires chacksumming packet body,
+ * including udp header and folding it to skb->csum.
+ */
+static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
+ int proto)
+{
+ int err;
+
+ UDP_SKB_CB(skb)->partial_cov = 0;
+ UDP_SKB_CB(skb)->cscov = skb->len;
+
+ if (proto == IPPROTO_UDPLITE) {
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
+ }
+
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
+}
+
+/*
+ * All we need to do is get the socket, and then do a checksum.
+ */
+
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ int proto)
+{
+ struct sock *sk;
+ struct udphdr *uh;
+ unsigned short ulen;
+ struct rtable *rt = skb_rtable(skb);
+ __be32 saddr, daddr;
+ struct net *net = dev_net(skb->dev);
+
+ /*
+ * Validate the packet.
+ */
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto drop; /* No space for header. */
+
+ uh = udp_hdr(skb);
+ ulen = ntohs(uh->len);
+ saddr = ip_hdr(skb)->saddr;
+ daddr = ip_hdr(skb)->daddr;
+
+ if (ulen > skb->len)
+ goto short_packet;
+
+ if (proto == IPPROTO_UDP) {
+ /* UDP validates ulen. */
+ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
+ goto short_packet;
+ uh = udp_hdr(skb);
+ }
+
+ if (udp4_csum_init(skb, uh, proto))
+ goto csum_error;
+
+ sk = skb_steal_sock(skb);
+ if (sk) {
+ struct dst_entry *dst = skb_dst(skb);
+ int ret;
+
+ if (unlikely(sk->sk_rx_dst != dst))
+ udp_sk_rx_dst_set(sk, dst);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ }
+
+ if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ return __udp4_lib_mcast_deliver(net, skb, uh,
+ saddr, daddr, udptable, proto);
+
+ sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk) {
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_compute_pseudo);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+ }
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ nf_reset(skb);
+
+ /* No socket. Drop packet silently, if checksum is wrong */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ /*
+ * Hmm. We got an UDP packet to a port to which we
+ * don't wanna listen. Ignore it.
+ */
+ kfree_skb(skb);
+ return 0;
+
+short_packet:
+ net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source),
+ ulen, skb->len,
+ &daddr, ntohs(uh->dest));
+ goto drop;
+
+csum_error:
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ */
+ net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+ proto == IPPROTO_UDPLITE ? "Lite" : "",
+ &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
+ ulen);
+ UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+drop:
+ UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ kfree_skb(skb);
+ return 0;
+}
+
+/* We can only early demux multicast if there is a single matching socket.
+ * If more than one socket found returns NULL
+ */
+static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ struct udp_hslot *hslot = &udp_table.hash[slot];
+
+ /* Do not bother scanning a too big list */
+ if (hslot->count > 10)
+ return NULL;
+
+ rcu_read_lock();
+begin:
+ count = 0;
+ result = NULL;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum)) {
+ result = sk;
+ ++count;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (count != 1 ||
+ unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!__udp_is_mcast_sock(net, result,
+ loc_port, loc_addr,
+ rmt_port, rmt_addr,
+ dif, hnum))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+/* For unicast we should only early demux connected sockets or we can
+ * break forwarding setups. The chains here can be long so only check
+ * if the first socket is an exact match and if not move on.
+ */
+static struct sock *__udp4_lib_demux_lookup(struct net *net,
+ __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(loc_port);
+ unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
+ unsigned int slot2 = hash2 & udp_table.mask;
+ struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
+ const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
+ rcu_read_lock();
+ result = NULL;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr, ports, dif))
+ result = sk;
+ /* Only check first socket in chain */
+ break;
+ }
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(!INET_MATCH(sk, net, acookie,
+ rmt_addr, loc_addr,
+ ports, dif))) {
+ sock_put(result);
+ result = NULL;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+
+void udp_v4_early_demux(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph;
+ const struct udphdr *uh;
+ struct sock *sk;
+ struct dst_entry *dst;
+ int dif = skb->dev->ifindex;
+ int ours;
+
+ /* validate the packet */
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
+ return;
+
+ iph = ip_hdr(skb);
+ uh = udp_hdr(skb);
+
+ if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+ if (!in_dev)
+ return;
+
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return;
+ sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ } else if (skb->pkt_type == PACKET_HOST) {
+ sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
+ uh->source, iph->saddr, dif);
+ } else {
+ return;
+ }
+
+ if (!sk)
+ return;
+
+ skb->sk = sk;
+ skb->destructor = sock_efree;
+ dst = sk->sk_rx_dst;
+
+ if (dst)
+ dst = dst_check(dst, 0);
+ if (dst)
+ skb_dst_set_noref(skb, dst);
+}
+
+int udp_rcv(struct sk_buff *skb)
+{
+ return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+}
+
+void udp_destroy_sock(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ bool slow = lock_sock_fast(sk);
+ udp_flush_pending_frames(sk);
+ unlock_sock_fast(sk, slow);
+ if (static_key_false(&udp_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+}
+
+/*
+ * Socket option code for UDP
+ */
+int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen,
+ int (*push_pending_frames)(struct sock *))
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val, valbool;
+ int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ switch (optname) {
+ case UDP_CORK:
+ if (val != 0) {
+ up->corkflag = 1;
+ } else {
+ up->corkflag = 0;
+ lock_sock(sk);
+ push_pending_frames(sk);
+ release_sock(sk);
+ }
+ break;
+
+ case UDP_ENCAP:
+ switch (val) {
+ case 0:
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ up->encap_rcv = xfrm4_udp_encap_rcv;
+ /* FALLTHROUGH */
+ case UDP_ENCAP_L2TPINUDP:
+ up->encap_type = val;
+ udp_encap_enable();
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ up->no_check6_tx = valbool;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ up->no_check6_rx = valbool;
+ break;
+
+ /*
+ * UDP-Lite's partial checksum coverage (RFC 3828).
+ */
+ /* The sender sets actual checksum coverage length via this option.
+ * The case coverage > packet length is handled by send module. */
+ case UDPLITE_SEND_CSCOV:
+ if (!is_udplite) /* Disable the option on UDP sockets */
+ return -ENOPROTOOPT;
+ if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
+ val = 8;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ up->pcslen = val;
+ up->pcflag |= UDPLITE_SEND_CC;
+ break;
+
+ /* The receiver specifies a minimum checksum coverage value. To make
+ * sense, this should be set to at least 8 (as done below). If zero is
+ * used, this again means full checksum coverage. */
+ case UDPLITE_RECV_CSCOV:
+ if (!is_udplite) /* Disable the option on UDP sockets */
+ return -ENOPROTOOPT;
+ if (val != 0 && val < 8) /* Avoid silly minimal values. */
+ val = 8;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
+ up->pcrlen = val;
+ up->pcflag |= UDPLITE_RECV_CC;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(udp_lib_setsockopt);
+
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_push_pending_frames);
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_push_pending_frames);
+ return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case UDP_CORK:
+ val = up->corkflag;
+ break;
+
+ case UDP_ENCAP:
+ val = up->encap_type;
+ break;
+
+ case UDP_NO_CHECK6_TX:
+ val = up->no_check6_tx;
+ break;
+
+ case UDP_NO_CHECK6_RX:
+ val = up->no_check6_rx;
+ break;
+
+ /* The following two cannot be changed on UDP sockets, the return is
+ * always 0 (which corresponds to the full checksum coverage of UDP). */
+ case UDPLITE_SEND_CSCOV:
+ val = up->pcslen;
+ break;
+
+ case UDPLITE_RECV_CSCOV:
+ val = up->pcrlen;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(udp_lib_getsockopt);
+
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+/**
+ * udp_poll - wait for a UDP event.
+ * @file - file struct
+ * @sock - socket
+ * @wait - poll table
+ *
+ * This is same as datagram poll, except for the special case of
+ * blocking sockets. If application is using a blocking fd
+ * and a packet with checksum error is in the queue;
+ * then it could get return from select indicating data available
+ * but then block when reading it. Add special case code
+ * to work around these arguably broken applications.
+ */
+unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ unsigned int mask = datagram_poll(file, sock, wait);
+ struct sock *sk = sock->sk;
+
+ sock_rps_record_flow(sk);
+
+ /* Check for false positives due to checksum errors */
+ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
+ mask &= ~(POLLIN | POLLRDNORM);
+
+ return mask;
+
+}
+EXPORT_SYMBOL(udp_poll);
+
+struct proto udp_prot = {
+ .name = "UDP",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .sendpage = udp_sendpage,
+ .backlog_rcv = __udp_queue_rcv_skb,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
+ .get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
+ .obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udp_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udp_setsockopt,
+ .compat_getsockopt = compat_udp_getsockopt,
+#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udp_prot);
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+static struct sock *udp_get_first(struct seq_file *seq, int start)
+{
+ struct sock *sk;
+ struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = start; state->bucket <= state->udp_table->mask;
+ ++state->bucket) {
+ struct hlist_nulls_node *node;
+ struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_nulls_for_each(sk, node, &hslot->head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (sk->sk_family == state->family)
+ goto found;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct udp_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ do {
+ sk = sk_nulls_next(sk);
+ } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+
+ if (!sk) {
+ if (state->bucket <= state->udp_table->mask)
+ spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+ return udp_get_first(seq, state->bucket + 1);
+ }
+ return sk;
+}
+
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct sock *sk = udp_get_first(seq, 0);
+
+ if (sk)
+ while (pos && (sk = udp_get_next(seq, sk)) != NULL)
+ --pos;
+ return pos ? NULL : sk;
+}
+
+static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct udp_iter_state *state = seq->private;
+ state->bucket = MAX_UDP_PORTS;
+
+ return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+
+static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = udp_get_idx(seq, 0);
+ else
+ sk = udp_get_next(seq, v);
+
+ ++*pos;
+ return sk;
+}
+
+static void udp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct udp_iter_state *state = seq->private;
+
+ if (state->bucket <= state->udp_table->mask)
+ spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+}
+
+int udp_seq_open(struct inode *inode, struct file *file)
+{
+ struct udp_seq_afinfo *afinfo = PDE_DATA(inode);
+ struct udp_iter_state *s;
+ int err;
+
+ err = seq_open_net(inode, file, &afinfo->seq_ops,
+ sizeof(struct udp_iter_state));
+ if (err < 0)
+ return err;
+
+ s = ((struct seq_file *)file->private_data)->private;
+ s->family = afinfo->family;
+ s->udp_table = afinfo->udp_table;
+ return err;
+}
+EXPORT_SYMBOL(udp_seq_open);
+
+/* ------------------------------------------------------------------------ */
+int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+ struct proc_dir_entry *p;
+ int rc = 0;
+
+ afinfo->seq_ops.start = udp_seq_start;
+ afinfo->seq_ops.next = udp_seq_next;
+ afinfo->seq_ops.stop = udp_seq_stop;
+
+ p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+ afinfo->seq_fops, afinfo);
+ if (!p)
+ rc = -ENOMEM;
+ return rc;
+}
+EXPORT_SYMBOL(udp_proc_register);
+
+void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+ remove_proc_entry(afinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(udp_proc_unregister);
+
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, struct seq_file *f,
+ int bucket)
+{
+ struct inet_sock *inet = inet_sk(sp);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ bucket, src, srcp, dest, destp, sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ 0, sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
+
+int udp4_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops");
+ else {
+ struct udp_iter_state *state = seq->private;
+
+ udp4_format_sock(v, seq, state->bucket);
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct file_operations udp_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+/* ------------------------------------------------------------------------ */
+static struct udp_seq_afinfo udp4_seq_afinfo = {
+ .name = "udp",
+ .family = AF_INET,
+ .udp_table = &udp_table,
+ .seq_fops = &udp_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp4_seq_show,
+ },
+};
+
+static int __net_init udp4_proc_init_net(struct net *net)
+{
+ return udp_proc_register(net, &udp4_seq_afinfo);
+}
+
+static void __net_exit udp4_proc_exit_net(struct net *net)
+{
+ udp_proc_unregister(net, &udp4_seq_afinfo);
+}
+
+static struct pernet_operations udp4_net_ops = {
+ .init = udp4_proc_init_net,
+ .exit = udp4_proc_exit_net,
+};
+
+int __init udp4_proc_init(void)
+{
+ return register_pernet_subsys(&udp4_net_ops);
+}
+
+void udp4_proc_exit(void)
+{
+ unregister_pernet_subsys(&udp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtoul(str, 0, &uhash_entries);
+ if (ret)
+ return 0;
+
+ if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+ uhash_entries = UDP_HTABLE_SIZE_MIN;
+ return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
+
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+ unsigned int i;
+
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ UDP_HTABLE_SIZE_MIN,
+ 64 * 1024);
+
+ table->hash2 = table->hash + (table->mask + 1);
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ table->hash[i].count = 0;
+ spin_lock_init(&table->hash[i].lock);
+ }
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ table->hash2[i].count = 0;
+ spin_lock_init(&table->hash2[i].lock);
+ }
+}
+
+u32 udp_flow_hashrnd(void)
+{
+ static u32 hashrnd __read_mostly;
+
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+ return hashrnd;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ udp_table_init(&udp_table, "UDP");
+ limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_udp_mem[0] = limit / 4 * 3;
+ sysctl_udp_mem[1] = limit;
+ sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+ sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
new file mode 100644
index 000000000..b763c39ae
--- /dev/null
+++ b/net/ipv4/udp_diag.c
@@ -0,0 +1,222 @@
+/*
+ * udp_diag.c Module for monitoring UDP transport protocols sockets.
+ *
+ * Authors: Pavel Emelyanov, <xemul@parallels.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/sock_diag.h>
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc)
+{
+ if (!inet_diag_bc_sk(bc, sk))
+ return 0;
+
+ return inet_sk_diag_fill(sk, NULL, skb, req,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ int err = -EINVAL;
+ struct sock *sk;
+ struct sk_buff *rep;
+ struct net *net = sock_net(in_skb->sk);
+
+ if (req->sdiag_family == AF_INET)
+ sk = __udp4_lib_lookup(net,
+ req->id.idiag_src[0], req->id.idiag_sport,
+ req->id.idiag_dst[0], req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (req->sdiag_family == AF_INET6)
+ sk = __udp6_lib_lookup(net,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ req->id.idiag_if, tbl);
+#endif
+ else
+ goto out_nosk;
+
+ err = -ENOENT;
+ if (!sk)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) + 64,
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = inet_sk_diag_fill(sk, NULL, rep, req,
+ sk_user_ns(NETLINK_CB(in_skb).sk),
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, nlh);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(rep);
+ goto out;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ if (sk)
+ sock_put(sk);
+out_nosk:
+ return err;
+}
+
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ int num, s_num, slot, s_slot;
+ struct net *net = sock_net(skb->sk);
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ struct udp_hslot *hslot = &table->hash[slot];
+
+ num = 0;
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
+ spin_lock_bh(&hslot->lock);
+ sk_nulls_for_each(sk, node, &hslot->head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next;
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next;
+ if (r->id.idiag_dport != inet->inet_dport &&
+ r->id.idiag_dport)
+ goto next;
+
+ if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ spin_unlock_bh(&hslot->lock);
+ goto done;
+ }
+next:
+ num++;
+ }
+ spin_unlock_bh(&hslot->lock);
+ }
+done:
+ cb->args[0] = slot;
+ cb->args[1] = num;
+}
+
+static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ udp_dump(&udp_table, skb, cb, r, bc);
+}
+
+static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udp_table, in_skb, nlh, req);
+}
+
+static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *info)
+{
+ r->idiag_rqueue = sk_rmem_alloc_get(sk);
+ r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+static const struct inet_diag_handler udp_diag_handler = {
+ .dump = udp_diag_dump,
+ .dump_one = udp_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDP,
+};
+
+static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
+{
+ udp_dump(&udplite_table, skb, cb, r, bc);
+}
+
+static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ return udp_dump_one(&udplite_table, in_skb, nlh, req);
+}
+
+static const struct inet_diag_handler udplite_diag_handler = {
+ .dump = udplite_diag_dump,
+ .dump_one = udplite_diag_dump_one,
+ .idiag_get_info = udp_diag_get_info,
+ .idiag_type = IPPROTO_UDPLITE,
+};
+
+static int __init udp_diag_init(void)
+{
+ int err;
+
+ err = inet_diag_register(&udp_diag_handler);
+ if (err)
+ goto out;
+ err = inet_diag_register(&udplite_diag_handler);
+ if (err)
+ goto out_lite;
+out:
+ return err;
+out_lite:
+ inet_diag_unregister(&udp_diag_handler);
+ goto out;
+}
+
+static void __exit udp_diag_exit(void)
+{
+ inet_diag_unregister(&udplite_diag_handler);
+ inet_diag_unregister(&udp_diag_handler);
+}
+
+module_init(udp_diag_init);
+module_exit(udp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
new file mode 100644
index 000000000..7e0fe4bdd
--- /dev/null
+++ b/net/ipv4/udp_impl.h
@@ -0,0 +1,34 @@
+#ifndef _UDP4_IMPL_H
+#define _UDP4_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+
+int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+
+int udp_v4_get_port(struct sock *sk, unsigned short snum);
+
+int udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+
+#ifdef CONFIG_COMPAT
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+#endif
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len);
+int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+ int flags);
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udp_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+int udp4_seq_show(struct seq_file *seq, void *v);
+#endif
+#endif /* _UDP4_IMPL_H */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
new file mode 100644
index 000000000..f9386160c
--- /dev/null
+++ b/net/ipv4/udp_offload.c
@@ -0,0 +1,442 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * UDPv4 GSO support
+ */
+
+#include <linux/skbuff.h>
+#include <net/udp.h>
+#include <net/protocol.h>
+
+static DEFINE_SPINLOCK(udp_offload_lock);
+static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
+
+#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
+
+struct udp_offload_priv {
+ struct udp_offload *offload;
+ struct rcu_head rcu;
+ struct udp_offload_priv __rcu *next;
+};
+
+static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features),
+ __be16 new_protocol, bool is_ipv6)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum = !!(skb_shinfo(skb)->gso_type &
+ SKB_GSO_UDP_TUNNEL_CSUM);
+ bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
+ bool offload_csum = false, dont_encap = (need_csum || remcsum);
+
+ oldlen = (u16)~skb->len;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = new_protocol;
+ skb->encap_hdr_csum = need_csum;
+ skb->remcsum_offload = remcsum;
+
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum &&
+ (skb->dev->features &
+ (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM)));
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & features;
+ segs = gso_inner_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int len;
+ __be32 delta;
+
+ if (dont_encap) {
+ skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* Only set up inner headers if we might be offloading
+ * inner checksum.
+ */
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
+ uh = udp_hdr(skb);
+ uh->len = htons(len);
+
+ if (!need_csum)
+ continue;
+
+ delta = htonl(oldlen + len);
+
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ if (offload_csum) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ } else if (remcsum) {
+ /* Need to calculate checksum from scratch,
+ * inner checksums are never when doing
+ * remote_checksum_offload.
+ */
+
+ skb->csum = skb_checksum(skb, udp_offset,
+ skb->len - udp_offset,
+ 0);
+ uh->check = csum_fold(skb->csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ uh->check = gso_make_checksum(skb, ~uh->check);
+
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ __be16 protocol = skb->protocol;
+ const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
+ netdev_features_t features);
+
+ rcu_read_lock();
+
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = skb->inner_protocol;
+ gso_inner_segment = skb_mac_gso_segment;
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ offloads = is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[skb->inner_ipproto]);
+ if (!ops || !ops->callbacks.gso_segment)
+ goto out_unlock;
+ gso_inner_segment = ops->callbacks.gso_segment;
+ break;
+ default:
+ goto out_unlock;
+ }
+
+ segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment,
+ protocol, is_ipv6);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return segs;
+}
+
+static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ __wsum csum;
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (skb->encapsulation &&
+ (skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
+ segs = skb_udp_tunnel_segment(skb, features, false);
+ goto out;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
+ !(type & (SKB_GSO_UDP))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ segs = NULL;
+ goto out;
+ }
+
+ /* Do software UFO. Complete and fill in the UDP checksum as
+ * HW cannot do checksum of UDP packets sent as multiple
+ * IP fragments.
+ */
+
+ uh = udp_hdr(skb);
+ iph = ip_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Fragment the skb. IP headers of the fragments are updated in
+ * inet_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+out:
+ return segs;
+}
+
+int udp_add_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
+
+ if (!new_offload)
+ return -ENOMEM;
+
+ new_offload->offload = uo;
+
+ spin_lock(&udp_offload_lock);
+ new_offload->next = udp_offload_base;
+ rcu_assign_pointer(udp_offload_base, new_offload);
+ spin_unlock(&udp_offload_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(udp_add_offload);
+
+static void udp_offload_free_routine(struct rcu_head *head)
+{
+ struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
+ kfree(ou_priv);
+}
+
+void udp_del_offload(struct udp_offload *uo)
+{
+ struct udp_offload_priv __rcu **head = &udp_offload_base;
+ struct udp_offload_priv *uo_priv;
+
+ spin_lock(&udp_offload_lock);
+
+ uo_priv = udp_deref_protected(*head);
+ for (; uo_priv != NULL;
+ uo_priv = udp_deref_protected(*head)) {
+ if (uo_priv->offload == uo) {
+ rcu_assign_pointer(*head,
+ udp_deref_protected(uo_priv->next));
+ goto unlock;
+ }
+ head = &uo_priv->next;
+ }
+ pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
+unlock:
+ spin_unlock(&udp_offload_lock);
+ if (uo_priv)
+ call_rcu(&uo_priv->rcu, udp_offload_free_routine);
+}
+EXPORT_SYMBOL(udp_del_offload);
+
+struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
+ struct udphdr *uh)
+{
+ struct udp_offload_priv *uo_priv;
+ struct sk_buff *p, **pp = NULL;
+ struct udphdr *uh2;
+ unsigned int off = skb_gro_offset(skb);
+ int flush = 1;
+
+ if (NAPI_GRO_CB(skb)->udp_mark ||
+ (skb->ip_summed != CHECKSUM_PARTIAL &&
+ NAPI_GRO_CB(skb)->csum_cnt == 0 &&
+ !NAPI_GRO_CB(skb)->csum_valid))
+ goto out;
+
+ /* mark that this skb passed once through the udp gro layer */
+ NAPI_GRO_CB(skb)->udp_mark = 1;
+
+ rcu_read_lock();
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_receive)
+ goto unflush;
+ }
+ goto out_unlock;
+
+unflush:
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = (struct udphdr *)(p->data + off);
+
+ /* Match ports and either checksums are either both zero
+ * or nonzero.
+ */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+ (!uh->check ^ !uh2->check)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
+ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
+ pp = uo_priv->offload->callbacks.gro_receive(head, skb,
+ uo_priv->offload);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+ return pp;
+}
+
+static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_gro_compute_pseudo);
+skip:
+ NAPI_GRO_CB(skb)->is_ipv6 = 0;
+ return udp_gro_receive(head, skb, uh);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+int udp_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct udp_offload_priv *uo_priv;
+ __be16 newlen = htons(skb->len - nhoff);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ int err = -ENOSYS;
+
+ uh->len = newlen;
+
+ rcu_read_lock();
+
+ uo_priv = rcu_dereference(udp_offload_base);
+ for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
+ if (uo_priv->offload->port == uh->dest &&
+ uo_priv->offload->callbacks.gro_complete)
+ break;
+ }
+
+ if (uo_priv) {
+ NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
+ err = uo_priv->offload->callbacks.gro_complete(skb,
+ nhoff + sizeof(struct udphdr),
+ uo_priv->offload);
+ }
+
+ rcu_read_unlock();
+
+ if (skb->remcsum_offload)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+
+ skb->encapsulation = 1;
+ skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr));
+
+ return err;
+}
+
+static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ if (uh->check) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
+ iph->daddr, 0);
+ } else {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+ }
+
+ return udp_gro_complete(skb, nhoff);
+}
+
+static const struct net_offload udpv4_offload = {
+ .callbacks = {
+ .gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp4_gro_receive,
+ .gro_complete = udp4_gro_complete,
+ },
+};
+
+int __init udpv4_offload_init(void)
+{
+ return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000..6bb98cc19
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,108 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+
+int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ int err;
+ struct socket *sock = NULL;
+ struct sockaddr_in udp_addr;
+
+ err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->local_ip;
+ udp_addr.sin_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->peer_ip;
+ udp_addr.sin_port = cfg->peer_udp_port;
+ err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr), 0);
+ if (err < 0)
+ goto error;
+ }
+
+ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL(udp_sock_create4);
+
+void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+ struct udp_tunnel_sock_cfg *cfg)
+{
+ struct sock *sk = sock->sk;
+
+ /* Disable multicast loopback */
+ inet_sk(sk)->mc_loop = 0;
+
+ /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+ inet_inc_convert_csum(sk);
+
+ rcu_assign_sk_user_data(sk, cfg->sk_user_data);
+
+ udp_sk(sk)->encap_type = cfg->encap_type;
+ udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+
+ udp_tunnel_encap_enable(sock);
+}
+EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+ __be16 df, __be16 src_port, __be16 dst_port,
+ bool xnet, bool nocheck)
+{
+ struct udphdr *uh;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+ uh->len = htons(skb->len);
+
+ udp_set_csum(nocheck, skb, src, dst, skb->len);
+
+ return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
+ tos, ttl, df, xnet);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
+
+void udp_tunnel_sock_release(struct socket *sock)
+{
+ rcu_assign_sk_user_data(sock->sk, NULL);
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
new file mode 100644
index 000000000..3b3efbda4
--- /dev/null
+++ b/net/ipv4/udplite.c
@@ -0,0 +1,141 @@
+/*
+ * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828).
+ *
+ * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *
+ * Changes:
+ * Fixes:
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "UDPLite: " fmt
+
+#include <linux/export.h>
+#include "udp_impl.h"
+
+struct udp_table udplite_table __read_mostly;
+EXPORT_SYMBOL(udplite_table);
+
+static int udplite_rcv(struct sk_buff *skb)
+{
+ return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
+}
+
+static void udplite_err(struct sk_buff *skb, u32 info)
+{
+ __udp4_lib_err(skb, info, &udplite_table);
+}
+
+static const struct net_protocol udplite_protocol = {
+ .handler = udplite_rcv,
+ .err_handler = udplite_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+struct proto udplite_prot = {
+ .name = "UDP-Lite",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip4_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .init = udplite_sk_init,
+ .destroy = udp_destroy_sock,
+ .setsockopt = udp_setsockopt,
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
+ .sendpage = udp_sendpage,
+ .backlog_rcv = udp_queue_rcv_skb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .get_port = udp_v4_get_port,
+ .obj_size = sizeof(struct udp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udplite_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udp_setsockopt,
+ .compat_getsockopt = compat_udp_getsockopt,
+#endif
+ .clear_sk = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udplite_prot);
+
+static struct inet_protosw udplite4_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDPLITE,
+ .prot = &udplite_prot,
+ .ops = &inet_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+};
+
+#ifdef CONFIG_PROC_FS
+
+static const struct file_operations udplite_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct udp_seq_afinfo udplite4_seq_afinfo = {
+ .name = "udplite",
+ .family = AF_INET,
+ .udp_table = &udplite_table,
+ .seq_fops = &udplite_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp4_seq_show,
+ },
+};
+
+static int __net_init udplite4_proc_init_net(struct net *net)
+{
+ return udp_proc_register(net, &udplite4_seq_afinfo);
+}
+
+static void __net_exit udplite4_proc_exit_net(struct net *net)
+{
+ udp_proc_unregister(net, &udplite4_seq_afinfo);
+}
+
+static struct pernet_operations udplite4_net_ops = {
+ .init = udplite4_proc_init_net,
+ .exit = udplite4_proc_exit_net,
+};
+
+static __init int udplite4_proc_init(void)
+{
+ return register_pernet_subsys(&udplite4_net_ops);
+}
+#else
+static inline int udplite4_proc_init(void)
+{
+ return 0;
+}
+#endif
+
+void __init udplite4_register(void)
+{
+ udp_table_init(&udplite_table, "UDP-Lite");
+ if (proto_register(&udplite_prot, 1))
+ goto out_register_err;
+
+ if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0)
+ goto out_unregister_proto;
+
+ inet_register_protosw(&udplite4_protosw);
+
+ if (udplite4_proc_init())
+ pr_err("%s: Cannot register /proc!\n", __func__);
+ return;
+
+out_unregister_proto:
+ proto_unregister(&udplite_prot);
+out_register_err:
+ pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__);
+}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 000000000..60b032f58
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,158 @@
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ * Derek Atkins <derek@ihtfp.com>
+ * Add Encapsulation support
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return xfrm4_extract_header(skb);
+}
+
+static inline int xfrm4_rcv_encap_finish(struct sock *sk, struct sk_buff *skb)
+{
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ goto drop;
+ }
+ return dst_input(skb);
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+ if (!async)
+ return -iph->protocol;
+#endif
+
+ __skb_push(skb, skb->data - skb_network_header(skb));
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ xfrm4_rcv_encap_finish);
+ return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh;
+ struct iphdr *iph;
+ int iphlen, len;
+
+ __u8 *udpdata;
+ __be32 *udpdata32;
+ __u16 encap_type = up->encap_type;
+
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+
+ /* If this is a paged skb, make sure we pull up
+ * whatever data we need to look at. */
+ len = skb->len - sizeof(struct udphdr);
+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+ return 1;
+
+ /* Now we can get the pointers */
+ uh = udp_hdr(skb);
+ udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ udpdata32 = (__be32 *)udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ goto drop;
+ } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+ udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+ /* ESP Packet with Non-IKE marker */
+ len = sizeof(struct udphdr) + 2 * sizeof(u32);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
+
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+ * header and optional ESP marker bytes) and then modify the
+ * protocol to ESP, and then call into the transform receiver.
+ */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto drop;
+
+ /* Now we can update and verify the packet length... */
+ iph = ip_hdr(skb);
+ iphlen = iph->ihl << 2;
+ iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (skb->len < iphlen + len) {
+ /* packet is too small!?! */
+ goto drop;
+ }
+
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+
+ /* process ESP */
+ return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
+}
+EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
new file mode 100644
index 000000000..71acd0014
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -0,0 +1,156 @@
+/*
+ * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
+ * Miika Komu <miika@iki.fi>
+ * Herbert Xu <herbert@gondor.apana.org.au>
+ * Abhinav Pathak <abhinav.pathak@hiit.fi>
+ * Jeff Ahrenholz <ahrenholz@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static void xfrm4_beet_make_header(struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ iph->ihl = 5;
+ iph->version = 4;
+
+ iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+ iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+
+ iph->id = XFRM_MODE_SKB_CB(skb)->id;
+ iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
+ iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ */
+static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_beet_phdr *ph;
+ struct iphdr *top_iph;
+ int hdrlen, optlen;
+
+ hdrlen = 0;
+ optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+ if (unlikely(optlen))
+ hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+ skb_set_network_header(skb, -x->props.header_len -
+ hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+ if (x->sel.family != AF_INET6)
+ skb->network_header += IPV4_BEET_PHMAXLEN;
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+ xfrm4_beet_make_header(skb);
+
+ ph = (struct ip_beet_phdr *)
+ __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
+
+ top_iph = ip_hdr(skb);
+
+ if (unlikely(optlen)) {
+ BUG_ON(optlen < 0);
+
+ ph->padlen = 4 - (optlen & 4);
+ ph->hdrlen = optlen / 8;
+ ph->nexthdr = top_iph->protocol;
+ if (ph->padlen)
+ memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+ top_iph->protocol = IPPROTO_BEETPH;
+ top_iph->ihl = sizeof(struct iphdr) / 4;
+ }
+
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+
+ return 0;
+}
+
+static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ int optlen = 0;
+ int err = -EINVAL;
+
+ if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
+ struct ip_beet_phdr *ph;
+ int phlen;
+
+ if (!pskb_may_pull(skb, sizeof(*ph)))
+ goto out;
+
+ ph = (struct ip_beet_phdr *)skb->data;
+
+ phlen = sizeof(*ph) + ph->padlen;
+ optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
+ if (optlen < 0 || optlen & 3 || optlen > 250)
+ goto out;
+
+ XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
+
+ if (!pskb_may_pull(skb, phlen))
+ goto out;
+ __skb_pull(skb, phlen);
+ }
+
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ xfrm4_beet_make_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->ihl += optlen / 4;
+ iph->tot_len = htons(skb->len);
+ iph->daddr = x->sel.daddr.a4;
+ iph->saddr = x->sel.saddr.a4;
+ iph->check = 0;
+ iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+ err = 0;
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm4_beet_mode = {
+ .input2 = xfrm4_beet_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_beet_output,
+ .output = xfrm4_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_BEET,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_beet_init(void)
+{
+ return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);
+}
+
+static void __exit xfrm4_beet_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_beet_init);
+module_exit(xfrm4_beet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 000000000..fd840c7d7
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,80 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ */
+static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ int ihl = iph->ihl * 4;
+
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + ihl;
+ __skb_pull(skb, ihl);
+ memmove(skb_network_header(skb), iph, ihl);
+ return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is. skb->data shall point
+ * to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int ihl = skb->data - skb_transport_header(skb);
+
+ if (skb->transport_header != skb->network_header) {
+ memmove(skb_transport_header(skb),
+ skb_network_header(skb), ihl);
+ skb->network_header = skb->transport_header;
+ }
+ ip_hdr(skb)->tot_len = htons(skb->len + ihl);
+ skb_reset_transport_header(skb);
+ return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {
+ .input = xfrm4_transport_input,
+ .output = xfrm4_transport_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+ return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 000000000..35feda676
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,125 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+ struct iphdr *inner_iph = ipip_hdr(skb);
+
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+ IP_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct iphdr *top_iph;
+ int flags;
+
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+ top_iph = ip_hdr(skb);
+
+ top_iph->ihl = 5;
+ top_iph->version = 4;
+
+ top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+ /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
+ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+ top_iph->tos = 0;
+ else
+ top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
+ top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
+ XFRM_MODE_SKB_CB(skb)->tos);
+
+ flags = x->props.flags;
+ if (flags & XFRM_STATE_NOECN)
+ IP_ECN_clear(top_iph);
+
+ top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+ 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
+
+ top_iph->ttl = ip4_dst_hoplimit(dst->child);
+
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+ return 0;
+}
+
+static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = -EINVAL;
+
+ if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto out;
+
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
+ goto out;
+
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ ipip_ecn_decapsulate(skb);
+
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {
+ .input2 = xfrm4_mode_tunnel_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm4_mode_tunnel_output,
+ .output = xfrm4_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TUNNEL,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_mode_tunnel_init(void)
+{
+ return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
+}
+
+static void __exit xfrm4_mode_tunnel_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+ BUG_ON(err);
+}
+
+module_init(xfrm4_mode_tunnel_init);
+module_exit(xfrm4_mode_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 000000000..2878dbfff
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,111 @@
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+{
+ int mtu, ret = 0;
+
+ if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
+ goto out;
+
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
+ goto out;
+
+ mtu = dst_mtu(skb_dst(skb));
+ if (skb->len > mtu) {
+ if (skb->sk)
+ xfrm_local_error(skb, mtu);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ ret = -EMSGSIZE;
+ }
+out:
+ return ret;
+}
+
+int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm4_tunnel_check_size(skb);
+ if (err)
+ return err;
+
+ XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol;
+
+ return xfrm4_extract_header(skb);
+}
+
+int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm_inner_extract_output(x, skb);
+ if (err)
+ return err;
+
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+ skb->protocol = htons(ETH_P_IP);
+
+ return x->outer_mode->output2(x, skb);
+}
+EXPORT_SYMBOL(xfrm4_prepare_output);
+
+int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
+{
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+#ifdef CONFIG_NETFILTER
+ IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif
+
+ return xfrm_output(sk, skb);
+}
+
+static int __xfrm4_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+#ifdef CONFIG_NETFILTER
+ if (!x) {
+ IPCB(skb)->flags |= IPSKB_REROUTED;
+ return dst_output_sk(sk, skb);
+ }
+#endif
+
+ return x->outer_mode->afinfo->output_finish(sk, skb);
+}
+
+int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+{
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
+ NULL, skb_dst(skb)->dev, __xfrm4_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+void xfrm4_local_error(struct sk_buff *skb, u32 mtu)
+{
+ struct iphdr *hdr;
+
+ hdr = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
+ ip_local_error(skb->sk, EMSGSIZE, hdr->daddr,
+ inet_sk(skb->sk)->inet_dport, mtu);
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 000000000..bff69746e
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,332 @@
+/*
+ * xfrm4_policy.c
+ *
+ * Changes:
+ * Kazunori MIYAZAWA @USAGI
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/inetdevice.h>
+#include <linux/if_tunnel.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
+
+static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
+ int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct rtable *rt;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = daddr->a4;
+ fl4->flowi4_tos = tos;
+ if (saddr)
+ fl4->saddr = saddr->a4;
+
+ rt = __ip_route_output_key(net, fl4);
+ if (!IS_ERR(rt))
+ return &rt->dst;
+
+ return ERR_CAST(rt);
+}
+
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct flowi4 fl4;
+
+ return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+}
+
+static int xfrm4_get_saddr(struct net *net,
+ xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+ struct dst_entry *dst;
+ struct flowi4 fl4;
+
+ dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
+ if (IS_ERR(dst))
+ return -EHOSTUNREACH;
+
+ saddr->a4 = fl4.saddr;
+ dst_release(dst);
+ return 0;
+}
+
+static int xfrm4_get_tos(const struct flowi *fl)
+{
+ return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
+}
+
+static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
+{
+ return 0;
+}
+
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ const struct flowi *fl)
+{
+ struct rtable *rt = (struct rtable *)xdst->route;
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ xdst->u.rt.rt_iif = fl4->flowi4_iif;
+
+ xdst->u.dst.dev = dev;
+ dev_hold(dev);
+
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ xdst->u.rt.rt_is_input = rt->rt_is_input;
+ xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+ RTCF_LOCAL);
+ xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_gateway = rt->rt_gateway;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+
+ return 0;
+}
+
+static void
+_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ int oif = 0;
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
+
+ memset(fl4, 0, sizeof(struct flowi4));
+ fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
+
+ if (!ip_is_fragment(iph)) {
+ switch (iph->protocol) {
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
+ if (xprth + 4 < skb->data ||
+ pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ports = (__be16 *)xprth;
+
+ fl4->fl4_sport = ports[!!reverse];
+ fl4->fl4_dport = ports[!reverse];
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
+ u8 *icmp = xprth;
+
+ fl4->fl4_icmp_type = icmp[0];
+ fl4->fl4_icmp_code = icmp[1];
+ }
+ break;
+
+ case IPPROTO_ESP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be32 *ehdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ehdr[0];
+ }
+ break;
+
+ case IPPROTO_AH:
+ if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
+ __be32 *ah_hdr = (__be32 *)xprth;
+
+ fl4->fl4_ipsec_spi = ah_hdr[1];
+ }
+ break;
+
+ case IPPROTO_COMP:
+ if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+ __be16 *ipcomp_hdr = (__be16 *)xprth;
+
+ fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+ }
+ break;
+
+ case IPPROTO_GRE:
+ if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
+ __be16 *greflags = (__be16 *)xprth;
+ __be32 *gre_hdr = (__be32 *)xprth;
+
+ if (greflags[0] & GRE_KEY) {
+ if (greflags[0] & GRE_CSUM)
+ gre_hdr++;
+ fl4->fl4_gre_key = gre_hdr[1];
+ }
+ }
+ break;
+
+ default:
+ fl4->fl4_ipsec_spi = 0;
+ break;
+ }
+ }
+ fl4->flowi4_proto = iph->protocol;
+ fl4->daddr = reverse ? iph->saddr : iph->daddr;
+ fl4->saddr = reverse ? iph->daddr : iph->saddr;
+ fl4->flowi4_tos = iph->tos;
+}
+
+static inline int xfrm4_garbage_collect(struct dst_ops *ops)
+{
+ struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
+
+ xfrm4_policy_afinfo.garbage_collect(net);
+ return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu);
+}
+
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->redirect(path, sk, skb);
+}
+
+static void xfrm4_dst_destroy(struct dst_entry *dst)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+ dst_destroy_metrics_generic(dst);
+
+ xfrm_dst_destroy(xdst);
+}
+
+static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int unregister)
+{
+ if (!unregister)
+ return;
+
+ xfrm_dst_ifdown(dst, dev);
+}
+
+static struct dst_ops xfrm4_dst_ops = {
+ .family = AF_INET,
+ .gc = xfrm4_garbage_collect,
+ .update_pmtu = xfrm4_update_pmtu,
+ .redirect = xfrm4_redirect,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = xfrm4_dst_destroy,
+ .ifdown = xfrm4_dst_ifdown,
+ .local_out = __ip_local_out,
+ .gc_thresh = 32768,
+};
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+ .family = AF_INET,
+ .dst_ops = &xfrm4_dst_ops,
+ .dst_lookup = xfrm4_dst_lookup,
+ .get_saddr = xfrm4_get_saddr,
+ .decode_session = _decode_session4,
+ .get_tos = xfrm4_get_tos,
+ .init_path = xfrm4_init_path,
+ .fill_dst = xfrm4_fill_dst,
+ .blackhole_route = ipv4_blackhole_route,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm4_policy_table[] = {
+ {
+ .procname = "xfrm4_gc_thresh",
+ .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm4_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm4_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv4", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv4.xfrm4_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ if (!net->ipv4.xfrm4_hdr)
+ return;
+
+ table = net->ipv4.xfrm4_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv4.xfrm4_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations __net_initdata xfrm4_net_ops = {
+ .init = xfrm4_net_init,
+ .exit = xfrm4_net_exit,
+};
+#endif
+
+static void __init xfrm4_policy_init(void)
+{
+ xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+}
+
+void __init xfrm4_init(void)
+{
+ dst_entries_init(&xfrm4_dst_ops);
+
+ xfrm4_state_init();
+ xfrm4_policy_init();
+ xfrm4_protocol_init();
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&xfrm4_net_ops);
+#endif
+}
+
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
new file mode 100644
index 000000000..dccefa9d8
--- /dev/null
+++ b/net/ipv4/xfrm4_protocol.c
@@ -0,0 +1,301 @@
+/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/tunnel4.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
+static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm4_protocol_mutex);
+
+static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_handlers;
+ case IPPROTO_AH:
+ return &ah4_handlers;
+ case IPPROTO_COMP:
+ return &ipcomp4_handlers;
+ }
+
+ return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(protocol);
+
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->cb_handler(skb, err)) <= 0)
+ return ret;
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_cb);
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+ struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ if (!head)
+ goto out;
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+static int xfrm4_esp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(esp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ah_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ah4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm4_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+{
+ struct xfrm4_protocol *handler;
+
+ for_each_protocol_rcu(ipcomp4_handlers, handler)
+ if (!handler->err_handler(skb, info))
+ break;
+}
+
+static const struct net_protocol esp4_protocol = {
+ .handler = xfrm4_esp_rcv,
+ .err_handler = xfrm4_esp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ah4_protocol = {
+ .handler = xfrm4_ah_rcv,
+ .err_handler = xfrm4_ah_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+ .handler = xfrm4_ipcomp_rcv,
+ .err_handler = xfrm4_ipcomp_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+static struct xfrm_input_afinfo xfrm4_input_afinfo = {
+ .family = AF_INET,
+ .owner = THIS_MODULE,
+ .callback = xfrm4_rcv_cb,
+};
+
+static inline const struct net_protocol *netproto(unsigned char protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp4_protocol;
+ case IPPROTO_AH:
+ return &ah4_protocol;
+ case IPPROTO_COMP:
+ return &ipcomp4_protocol;
+ }
+
+ return NULL;
+}
+
+int xfrm4_protocol_register(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex)))
+ add_netproto = true;
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ if (add_netproto) {
+ if (inet_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_register);
+
+int xfrm4_protocol_deregister(struct xfrm4_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm4_protocol __rcu **pprev;
+ struct xfrm4_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm4_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm4_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm4_protocol_mutex))) {
+ if (inet_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm4_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm4_protocol_deregister);
+
+void __init xfrm4_protocol_init(void)
+{
+ xfrm_input_register_afinfo(&xfrm4_input_afinfo);
+}
+EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 000000000..542074c00
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,100 @@
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/export.h>
+
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+ if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
+ return 0;
+}
+
+static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ sel->daddr.a4 = fl4->daddr;
+ sel->saddr.a4 = fl4->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl4->flowi4_proto;
+ sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
+{
+ x->id = tmpl->id;
+ if (x->id.daddr.a4 == 0)
+ x->id.daddr.a4 = daddr->a4;
+ x->props.saddr = tmpl->saddr;
+ if (x->props.saddr.a4 == 0)
+ x->props.saddr.a4 = saddr->a4;
+ x->props.mode = tmpl->mode;
+ x->props.reqid = tmpl->reqid;
+ x->props.family = AF_INET;
+}
+
+int xfrm4_extract_header(struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
+ XFRM_MODE_SKB_CB(skb)->id = iph->id;
+ XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
+ XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
+ XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
+ XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
+ memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
+ sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
+
+ return 0;
+}
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {
+ .family = AF_INET,
+ .proto = IPPROTO_IPIP,
+ .eth_proto = htons(ETH_P_IP),
+ .owner = THIS_MODULE,
+ .init_flags = xfrm4_init_flags,
+ .init_tempsel = __xfrm4_init_tempsel,
+ .init_temprop = xfrm4_init_temprop,
+ .output = xfrm4_output,
+ .output_finish = xfrm4_output_finish,
+ .extract_input = xfrm4_extract_input,
+ .extract_output = xfrm4_extract_output,
+ .transport_finish = xfrm4_transport_finish,
+ .local_error = xfrm4_local_error,
+};
+
+void __init xfrm4_state_init(void)
+{
+ xfrm_state_register_afinfo(&xfrm4_state_afinfo);
+}
+
+#if 0
+void __exit xfrm4_state_fini(void)
+{
+ xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
+}
+#endif /* 0 */
+
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 000000000..06347dbd3
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,117 @@
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ skb_push(skb, -skb_network_offset(skb));
+ return 0;
+}
+
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return ip_hdr(skb)->protocol;
+}
+
+static int ipip_init_state(struct xfrm_state *x)
+{
+ if (x->props.mode != XFRM_MODE_TUNNEL)
+ return -EINVAL;
+
+ if (x->encap)
+ return -EINVAL;
+
+ x->props.header_len = sizeof(struct iphdr);
+
+ return 0;
+}
+
+static void ipip_destroy(struct xfrm_state *x)
+{
+}
+
+static const struct xfrm_type ipip_type = {
+ .description = "IPIP",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_IPIP,
+ .init_state = ipip_init_state,
+ .destructor = ipip_destroy,
+ .input = ipip_xfrm_rcv,
+ .output = ipip_output
+};
+
+static int xfrm_tunnel_rcv(struct sk_buff *skb)
+{
+ return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr);
+}
+
+static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
+{
+ return -ENOENT;
+}
+
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
+ .err_handler = xfrm_tunnel_err,
+ .priority = 3,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
+ .handler = xfrm_tunnel_rcv,
+ .err_handler = xfrm_tunnel_err,
+ .priority = 2,
+};
+#endif
+
+static int __init ipip_init(void)
+{
+ if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+
+ if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
+ pr_info("%s: can't add xfrm handler for AF_INET\n", __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
+ pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__);
+ xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
+ xfrm_unregister_type(&ipip_type, AF_INET);
+ return -EAGAIN;
+ }
+#endif
+ return 0;
+}
+
+static void __exit ipip_fini(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
+ pr_info("%s: can't remove xfrm handler for AF_INET6\n",
+ __func__);
+#endif
+ if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
+ pr_info("%s: can't remove xfrm handler for AF_INET\n",
+ __func__);
+ if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
new file mode 100644
index 000000000..438a73aa7
--- /dev/null
+++ b/net/ipv6/Kconfig
@@ -0,0 +1,261 @@
+#
+# IPv6 configuration
+#
+
+# IPv6 as module will cause a CRASH if you try to unload it
+menuconfig IPV6
+ tristate "The IPv6 protocol"
+ default m
+ ---help---
+ This is complemental support for the IP version 6.
+ You will still be able to do traditional IPv4 networking as well.
+
+ For general information about IPv6, see
+ <https://en.wikipedia.org/wiki/IPv6>.
+ For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
+ For specific information about IPv6 under Linux, read the HOWTO at
+ <http://www.bieringer.de/linux/IPv6/>.
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called ipv6.
+
+if IPV6
+
+config IPV6_ROUTER_PREF
+ bool "IPv6: Router Preference (RFC 4191) support"
+ ---help---
+ Router Preference is an optional extension to the Router
+ Advertisement message which improves the ability of hosts
+ to pick an appropriate router, especially when the hosts
+ are placed in a multi-homed network.
+
+ If unsure, say N.
+
+config IPV6_ROUTE_INFO
+ bool "IPv6: Route Information (RFC 4191) support"
+ depends on IPV6_ROUTER_PREF
+ ---help---
+ This is experimental support of Route Information.
+
+ If unsure, say N.
+
+config IPV6_OPTIMISTIC_DAD
+ bool "IPv6: Enable RFC 4429 Optimistic DAD"
+ ---help---
+ This is experimental support for optimistic Duplicate
+ Address Detection. It allows for autoconfigured addresses
+ to be used more quickly.
+
+ If unsure, say N.
+
+config INET6_AH
+ tristate "IPv6: AH transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET6_ESP
+ tristate "IPv6: ESP transformation"
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_AUTHENC
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_CBC
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET6_IPCOMP
+ tristate "IPv6: IPComp transformation"
+ select INET6_XFRM_TUNNEL
+ select XFRM_IPCOMP
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config IPV6_MIP6
+ tristate "IPv6: Mobility"
+ select XFRM
+ ---help---
+ Support for IPv6 Mobility described in RFC 3775.
+
+ If unsure, say N.
+
+config INET6_XFRM_TUNNEL
+ tristate
+ select INET6_TUNNEL
+ default n
+
+config INET6_TUNNEL
+ tristate
+ default n
+
+config INET6_XFRM_MODE_TRANSPORT
+ tristate "IPv6: IPsec transport mode"
+ default IPV6
+ select XFRM
+ ---help---
+ Support for IPsec transport mode.
+
+ If unsure, say Y.
+
+config INET6_XFRM_MODE_TUNNEL
+ tristate "IPv6: IPsec tunnel mode"
+ default IPV6
+ select XFRM
+ ---help---
+ Support for IPsec tunnel mode.
+
+ If unsure, say Y.
+
+config INET6_XFRM_MODE_BEET
+ tristate "IPv6: IPsec BEET mode"
+ default IPV6
+ select XFRM
+ ---help---
+ Support for IPsec BEET mode.
+
+ If unsure, say Y.
+
+config INET6_XFRM_MODE_ROUTEOPTIMIZATION
+ tristate "IPv6: MIPv6 route optimization mode"
+ select XFRM
+ ---help---
+ Support for MIPv6 route optimization mode.
+
+config IPV6_VTI
+tristate "Virtual (secure) IPv6: tunneling"
+ select IPV6_TUNNEL
+ select NET_IP_TUNNEL
+ depends on INET6_XFRM_MODE_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This can be used with xfrm mode tunnel to give
+ the notion of a secure tunnel for IPSEC and then use routing protocol
+ on top.
+
+config IPV6_SIT
+ tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)"
+ select INET_TUNNEL
+ select NET_IP_TUNNEL
+ select IPV6_NDISC_NODETYPE
+ default y
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This driver implements encapsulation of IPv6
+ into IPv4 packets. This is useful if you want to connect two IPv6
+ networks over an IPv4-only path.
+
+ Saying M here will produce a module called sit. If unsure, say Y.
+
+config IPV6_SIT_6RD
+ bool "IPv6: IPv6 Rapid Deployment (6RD)"
+ depends on IPV6_SIT
+ default n
+ ---help---
+ IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
+ mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
+ deploy IPv6 unicast service to IPv4 sites to which it provides
+ customer premise equipment. Like 6to4, it utilizes stateless IPv6 in
+ IPv4 encapsulation in order to transit IPv4-only network
+ infrastructure. Unlike 6to4, a 6rd service provider uses an IPv6
+ prefix of its own in place of the fixed 6to4 prefix.
+
+ With this option enabled, the SIT driver offers 6rd functionality by
+ providing additional ioctl API to configure the IPv6 Prefix for in
+ stead of static 2002::/16 for 6to4.
+
+ If unsure, say N.
+
+config IPV6_NDISC_NODETYPE
+ bool
+
+config IPV6_TUNNEL
+ tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
+ select INET6_TUNNEL
+ ---help---
+ Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
+ RFC 2473.
+
+ If unsure, say N.
+
+config IPV6_GRE
+ tristate "IPv6: GRE tunnel"
+ select IPV6_TUNNEL
+ select NET_IP_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv6 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+ Saying M here will produce a module called ip6_gre. If unsure, say N.
+
+config IPV6_MULTIPLE_TABLES
+ bool "IPv6: Multiple Routing Tables"
+ select FIB_RULES
+ ---help---
+ Support multiple routing tables.
+
+config IPV6_SUBTREES
+ bool "IPv6: source address based routing"
+ depends on IPV6_MULTIPLE_TABLES
+ ---help---
+ Enable routing by source address or prefix.
+
+ The destination address is still the primary routing key, so mixing
+ normal and source prefix specific routes in the same routing table
+ may sometimes lead to unintended routing behavior. This can be
+ avoided by defining different routing tables for the normal and
+ source prefix specific routes.
+
+ If unsure, say N.
+
+config IPV6_MROUTE
+ bool "IPv6: multicast routing"
+ depends on IPV6
+ ---help---
+ Experimental support for IPv6 multicast forwarding.
+ If unsure, say N.
+
+config IPV6_MROUTE_MULTIPLE_TABLES
+ bool "IPv6: multicast policy routing"
+ depends on IPV6_MROUTE
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
+config IPV6_PIMSM_V2
+ bool "IPv6: PIM-SM version 2 support"
+ depends on IPV6_MROUTE
+ ---help---
+ Support for IPv6 PIM multicast routing protocol PIM-SMv2.
+ If unsure, say N.
+
+endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
new file mode 100644
index 000000000..2e8c06108
--- /dev/null
+++ b/net/ipv6/Makefile
@@ -0,0 +1,51 @@
+#
+# Makefile for the Linux TCP/IP (INET6) layer.
+#
+
+obj-$(CONFIG_IPV6) += ipv6.o
+
+ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+ addrlabel.o \
+ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
+ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
+ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+
+ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+
+ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
+ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
+
+ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
+ xfrm6_output.o xfrm6_protocol.o
+ipv6-$(CONFIG_NETFILTER) += netfilter.o
+ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
+ipv6-$(CONFIG_PROC_FS) += proc.o
+ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+
+ipv6-objs += $(ipv6-y)
+
+obj-$(CONFIG_INET6_AH) += ah6.o
+obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
+obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
+obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
+obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
+obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
+obj-$(CONFIG_IPV6_MIP6) += mip6.o
+obj-$(CONFIG_NETFILTER) += netfilter/
+
+obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
+obj-$(CONFIG_IPV6_SIT) += sit.o
+obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
+
+obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
+obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
+
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
+
+ifneq ($(CONFIG_IPV6),)
+obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
+endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
new file mode 100644
index 000000000..37b70e82b
--- /dev/null
+++ b/net/ipv6/addrconf.c
@@ -0,0 +1,5859 @@
+/*
+ * IPv6 Address [auto]configuration
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *
+ * Janos Farkas : delete timer on ifdown
+ * <chexum@bankinf.banki.hu>
+ * Andi Kleen : kill double kfree on module
+ * unload.
+ * Maciej W. Rozycki : FDDI support
+ * sekiya@USAGI : Don't send too many RS
+ * packets.
+ * yoshfuji@USAGI : Fixed interval between DAD
+ * packets.
+ * YOSHIFUJI Hideaki @USAGI : improved accuracy of
+ * address validation timer.
+ * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041)
+ * support.
+ * Yuji SEKIYA @USAGI : Don't assign a same IPv6
+ * address on a same interface.
+ * YOSHIFUJI Hideaki @USAGI : ARCnet support
+ * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to
+ * seq_file.
+ * YOSHIFUJI Hideaki @USAGI : improved source address
+ * selection; consider scope,
+ * status etc.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/if_arcnet.h>
+#include <linux/if_infiniband.h>
+#include <linux/route.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/capability.h>
+#include <linux/delay.h>
+#include <linux/notifier.h>
+#include <linux/string.h>
+#include <linux/hash.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/af_ieee802154.h>
+#include <net/firewire.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/if_tunnel.h>
+#include <linux/rtnetlink.h>
+#include <linux/netconf.h>
+#include <linux/random.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+/* Set to 3 to get tracing... */
+#define ACONF_DEBUG 2
+
+#if ACONF_DEBUG >= 3
+#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0)
+#endif
+
+#define INFINITY_LIFE_TIME 0xFFFFFFFF
+
+#define IPV6_MAX_STRLEN \
+ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")
+
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+ return (cstamp - INITIAL_JIFFIES) * 100UL / HZ;
+}
+
+#ifdef CONFIG_SYSCTL
+static int addrconf_sysctl_register(struct inet6_dev *idev);
+static void addrconf_sysctl_unregister(struct inet6_dev *idev);
+#else
+static inline int addrconf_sysctl_register(struct inet6_dev *idev)
+{
+ return 0;
+}
+
+static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
+{
+}
+#endif
+
+static void __ipv6_regen_rndid(struct inet6_dev *idev);
+static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
+static void ipv6_regen_rndid(unsigned long data);
+
+static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
+static int ipv6_count_addresses(struct inet6_dev *idev);
+static int ipv6_generate_stable_address(struct in6_addr *addr,
+ u8 dad_count,
+ const struct inet6_dev *idev);
+
+/*
+ * Configured unicast address hash table
+ */
+static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(addrconf_hash_lock);
+
+static void addrconf_verify(void);
+static void addrconf_verify_rtnl(void);
+static void addrconf_verify_work(struct work_struct *);
+
+static struct workqueue_struct *addrconf_wq;
+static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
+
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
+
+static void addrconf_type_change(struct net_device *dev,
+ unsigned long event);
+static int addrconf_ifdown(struct net_device *dev, int how);
+
+static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+ int plen,
+ const struct net_device *dev,
+ u32 flags, u32 noflags);
+
+static void addrconf_dad_start(struct inet6_ifaddr *ifp);
+static void addrconf_dad_work(struct work_struct *w);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_run(struct inet6_dev *idev);
+static void addrconf_rs_timer(unsigned long data);
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+ struct prefix_info *pinfo);
+static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev);
+
+static struct ipv6_devconf ipv6_devconf __read_mostly = {
+ .forwarding = 0,
+ .hop_limit = IPV6_DEFAULT_HOPLIMIT,
+ .mtu6 = IPV6_MIN_MTU,
+ .accept_ra = 1,
+ .accept_redirects = 1,
+ .autoconf = 1,
+ .force_mld_version = 0,
+ .mldv1_unsolicited_report_interval = 10 * HZ,
+ .mldv2_unsolicited_report_interval = HZ,
+ .dad_transmits = 1,
+ .rtr_solicits = MAX_RTR_SOLICITATIONS,
+ .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
+ .use_tempaddr = 0,
+ .temp_valid_lft = TEMP_VALID_LIFETIME,
+ .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_max_retry = REGEN_MAX_RETRY,
+ .max_desync_factor = MAX_DESYNC_FACTOR,
+ .max_addresses = IPV6_MAX_ADDRESSES,
+ .accept_ra_defrtr = 1,
+ .accept_ra_from_local = 0,
+ .accept_ra_pinfo = 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ .accept_ra_rtr_pref = 1,
+ .rtr_probe_interval = 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_max_plen = 0,
+#endif
+#endif
+ .proxy_ndp = 0,
+ .accept_source_route = 0, /* we do not accept RH0 by default. */
+ .disable_ipv6 = 0,
+ .accept_dad = 1,
+ .suppress_frag_ndisc = 1,
+ .accept_ra_mtu = 1,
+ .stable_secret = {
+ .initialized = false,
+ }
+};
+
+static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
+ .forwarding = 0,
+ .hop_limit = IPV6_DEFAULT_HOPLIMIT,
+ .mtu6 = IPV6_MIN_MTU,
+ .accept_ra = 1,
+ .accept_redirects = 1,
+ .autoconf = 1,
+ .force_mld_version = 0,
+ .mldv1_unsolicited_report_interval = 10 * HZ,
+ .mldv2_unsolicited_report_interval = HZ,
+ .dad_transmits = 1,
+ .rtr_solicits = MAX_RTR_SOLICITATIONS,
+ .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL,
+ .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY,
+ .use_tempaddr = 0,
+ .temp_valid_lft = TEMP_VALID_LIFETIME,
+ .temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_max_retry = REGEN_MAX_RETRY,
+ .max_desync_factor = MAX_DESYNC_FACTOR,
+ .max_addresses = IPV6_MAX_ADDRESSES,
+ .accept_ra_defrtr = 1,
+ .accept_ra_from_local = 0,
+ .accept_ra_pinfo = 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ .accept_ra_rtr_pref = 1,
+ .rtr_probe_interval = 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ .accept_ra_rt_info_max_plen = 0,
+#endif
+#endif
+ .proxy_ndp = 0,
+ .accept_source_route = 0, /* we do not accept RH0 by default. */
+ .disable_ipv6 = 0,
+ .accept_dad = 1,
+ .suppress_frag_ndisc = 1,
+ .accept_ra_mtu = 1,
+ .stable_secret = {
+ .initialized = false,
+ },
+};
+
+/* Check if a valid qdisc is available */
+static inline bool addrconf_qdisc_ok(const struct net_device *dev)
+{
+ return !qdisc_tx_is_noop(dev);
+}
+
+static void addrconf_del_rs_timer(struct inet6_dev *idev)
+{
+ if (del_timer(&idev->rs_timer))
+ __in6_dev_put(idev);
+}
+
+static void addrconf_del_dad_work(struct inet6_ifaddr *ifp)
+{
+ if (cancel_delayed_work(&ifp->dad_work))
+ __in6_ifa_put(ifp);
+}
+
+static void addrconf_mod_rs_timer(struct inet6_dev *idev,
+ unsigned long when)
+{
+ if (!timer_pending(&idev->rs_timer))
+ in6_dev_hold(idev);
+ mod_timer(&idev->rs_timer, jiffies + when);
+}
+
+static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
+ unsigned long delay)
+{
+ if (!delayed_work_pending(&ifp->dad_work))
+ in6_ifa_hold(ifp);
+ mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
+}
+
+static int snmp6_alloc_dev(struct inet6_dev *idev)
+{
+ int i;
+
+ idev->stats.ipv6 = alloc_percpu(struct ipstats_mib);
+ if (!idev->stats.ipv6)
+ goto err_ip;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *addrconf_stats;
+ addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i);
+ u64_stats_init(&addrconf_stats->syncp);
+ }
+
+
+ idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
+ GFP_KERNEL);
+ if (!idev->stats.icmpv6dev)
+ goto err_icmp;
+ idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
+ GFP_KERNEL);
+ if (!idev->stats.icmpv6msgdev)
+ goto err_icmpmsg;
+
+ return 0;
+
+err_icmpmsg:
+ kfree(idev->stats.icmpv6dev);
+err_icmp:
+ free_percpu(idev->stats.ipv6);
+err_ip:
+ return -ENOMEM;
+}
+
+static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
+{
+ struct inet6_dev *ndev;
+ int err = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ if (dev->mtu < IPV6_MIN_MTU)
+ return ERR_PTR(-EINVAL);
+
+ ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
+ if (!ndev)
+ return ERR_PTR(err);
+
+ rwlock_init(&ndev->lock);
+ ndev->dev = dev;
+ INIT_LIST_HEAD(&ndev->addr_list);
+ setup_timer(&ndev->rs_timer, addrconf_rs_timer,
+ (unsigned long)ndev);
+ memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+ ndev->cnf.mtu6 = dev->mtu;
+ ndev->cnf.sysctl = NULL;
+ ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
+ if (!ndev->nd_parms) {
+ kfree(ndev);
+ return ERR_PTR(err);
+ }
+ if (ndev->cnf.forwarding)
+ dev_disable_lro(dev);
+ /* We refer to the device */
+ dev_hold(dev);
+
+ if (snmp6_alloc_dev(ndev) < 0) {
+ ADBG(KERN_WARNING
+ "%s: cannot allocate memory for statistics; dev=%s.\n",
+ __func__, dev->name);
+ neigh_parms_release(&nd_tbl, ndev->nd_parms);
+ dev_put(dev);
+ kfree(ndev);
+ return ERR_PTR(err);
+ }
+
+ if (snmp6_register_dev(ndev) < 0) {
+ ADBG(KERN_WARNING
+ "%s: cannot create /proc/net/dev_snmp6/%s\n",
+ __func__, dev->name);
+ goto err_release;
+ }
+
+ /* One reference from device. We must do this before
+ * we invoke __ipv6_regen_rndid().
+ */
+ in6_dev_hold(ndev);
+
+ if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+ ndev->cnf.accept_dad = -1;
+
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
+ pr_info("%s: Disabled Multicast RS\n", dev->name);
+ ndev->cnf.rtr_solicits = 0;
+ }
+#endif
+
+ INIT_LIST_HEAD(&ndev->tempaddr_list);
+ setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
+ if ((dev->flags&IFF_LOOPBACK) ||
+ dev->type == ARPHRD_TUNNEL ||
+ dev->type == ARPHRD_TUNNEL6 ||
+ dev->type == ARPHRD_SIT ||
+ dev->type == ARPHRD_NONE) {
+ ndev->cnf.use_tempaddr = -1;
+ } else {
+ in6_dev_hold(ndev);
+ ipv6_regen_rndid((unsigned long) ndev);
+ }
+
+ ndev->token = in6addr_any;
+
+ if (netif_running(dev) && addrconf_qdisc_ok(dev))
+ ndev->if_flags |= IF_READY;
+
+ ipv6_mc_init_dev(ndev);
+ ndev->tstamp = jiffies;
+ err = addrconf_sysctl_register(ndev);
+ if (err) {
+ ipv6_mc_destroy_dev(ndev);
+ del_timer(&ndev->regen_timer);
+ goto err_release;
+ }
+ /* protected by rtnl_lock */
+ rcu_assign_pointer(dev->ip6_ptr, ndev);
+
+ /* Join interface-local all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
+
+ /* Join all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+
+ /* Join all-router multicast group if forwarding is set */
+ if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+
+ return ndev;
+
+err_release:
+ neigh_parms_release(&nd_tbl, ndev->nd_parms);
+ ndev->dead = 1;
+ in6_dev_finish_destroy(ndev);
+ return ERR_PTR(err);
+}
+
+static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ idev = __in6_dev_get(dev);
+ if (!idev) {
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ return NULL;
+ }
+
+ if (dev->flags&IFF_UP)
+ ipv6_mc_up(idev);
+ return idev;
+}
+
+static int inet6_netconf_msgsize_devconf(int type)
+{
+ int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ + nla_total_size(4); /* NETCONFA_IFINDEX */
+
+ /* type -1 is used for ALL */
+ if (type == -1 || type == NETCONFA_FORWARDING)
+ size += nla_total_size(4);
+#ifdef CONFIG_IPV6_MROUTE
+ if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ size += nla_total_size(4);
+#endif
+ if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ size += nla_total_size(4);
+
+ return size;
+}
+
+static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
+ struct ipv6_devconf *devconf, u32 portid,
+ u32 seq, int event, unsigned int flags,
+ int type)
+{
+ struct nlmsghdr *nlh;
+ struct netconfmsg *ncm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ncm = nlmsg_data(nlh);
+ ncm->ncm_family = AF_INET6;
+
+ if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
+ goto nla_put_failure;
+
+ /* type -1 is used for ALL */
+ if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
+ goto nla_put_failure;
+#ifdef CONFIG_IPV6_MROUTE
+ if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_MC_FORWARDING,
+ devconf->mc_forwarding) < 0)
+ goto nla_put_failure;
+#endif
+ if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex,
+ struct ipv6_devconf *devconf)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
+ RTM_NEWNETCONF, 0, type);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_ATOMIC);
+ return;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err);
+}
+
+static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
+ [NETCONFA_IFINDEX] = { .len = sizeof(int) },
+ [NETCONFA_FORWARDING] = { .len = sizeof(int) },
+ [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) },
+};
+
+static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NETCONFA_MAX+1];
+ struct netconfmsg *ncm;
+ struct sk_buff *skb;
+ struct ipv6_devconf *devconf;
+ struct inet6_dev *in6_dev;
+ struct net_device *dev;
+ int ifindex;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
+ devconf_ipv6_policy);
+ if (err < 0)
+ goto errout;
+
+ err = EINVAL;
+ if (!tb[NETCONFA_IFINDEX])
+ goto errout;
+
+ ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
+ switch (ifindex) {
+ case NETCONFA_IFINDEX_ALL:
+ devconf = net->ipv6.devconf_all;
+ break;
+ case NETCONFA_IFINDEX_DEFAULT:
+ devconf = net->ipv6.devconf_dflt;
+ break;
+ default:
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ goto errout;
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev)
+ goto errout;
+ devconf = &in6_dev->cnf;
+ break;
+ }
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
+ -1);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
+static int inet6_netconf_dump_devconf(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto cont;
+
+ if (inet6_netconf_fill_devconf(skb, dev->ifindex,
+ &idev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ -1) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+ if (h == NETDEV_HASHENTRIES) {
+ if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+ if (h == NETDEV_HASHENTRIES + 1) {
+ if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ -1) < 0)
+ goto done;
+ else
+ h++;
+ }
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
+#ifdef CONFIG_SYSCTL
+static void dev_forward_change(struct inet6_dev *idev)
+{
+ struct net_device *dev;
+ struct inet6_ifaddr *ifa;
+
+ if (!idev)
+ return;
+ dev = idev->dev;
+ if (idev->cnf.forwarding)
+ dev_disable_lro(dev);
+ if (dev->flags & IFF_MULTICAST) {
+ if (idev->cnf.forwarding) {
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
+ ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
+ } else {
+ ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
+ ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
+ ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
+ }
+ }
+
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ if (ifa->flags&IFA_F_TENTATIVE)
+ continue;
+ if (idev->cnf.forwarding)
+ addrconf_join_anycast(ifa);
+ else
+ addrconf_leave_anycast(ifa);
+ }
+ inet6_netconf_notify_devconf(dev_net(dev), NETCONFA_FORWARDING,
+ dev->ifindex, &idev->cnf);
+}
+
+
+static void addrconf_forward_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ int changed = (!idev->cnf.forwarding) ^ (!newf);
+ idev->cnf.forwarding = newf;
+ if (changed)
+ dev_forward_change(idev);
+ }
+ }
+}
+
+static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
+{
+ struct net *net;
+ int old;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ net = (struct net *)table->extra2;
+ old = *p;
+ *p = newf;
+
+ if (p == &net->ipv6.devconf_dflt->forwarding) {
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ rtnl_unlock();
+ return 0;
+ }
+
+ if (p == &net->ipv6.devconf_all->forwarding) {
+ net->ipv6.devconf_dflt->forwarding = newf;
+ addrconf_forward_change(net, newf);
+ if ((!newf) ^ (!old))
+ inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ } else if ((!newf) ^ (!old))
+ dev_forward_change((struct inet6_dev *)table->extra1);
+ rtnl_unlock();
+
+ if (newf)
+ rt6_purge_dflt_routers(net);
+ return 1;
+}
+#endif
+
+/* Nobody refers to this ifaddr, destroy it */
+void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
+{
+ WARN_ON(!hlist_unhashed(&ifp->addr_lst));
+
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s\n", __func__);
+#endif
+
+ in6_dev_put(ifp->idev);
+
+ if (cancel_delayed_work(&ifp->dad_work))
+ pr_notice("delayed DAD work was pending while freeing ifa=%p\n",
+ ifp);
+
+ if (ifp->state != INET6_IFADDR_STATE_DEAD) {
+ pr_warn("Freeing alive inet6 address %p\n", ifp);
+ return;
+ }
+ ip6_rt_put(ifp->rt);
+
+ kfree_rcu(ifp, rcu);
+}
+
+static void
+ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
+{
+ struct list_head *p;
+ int ifp_scope = ipv6_addr_src_scope(&ifp->addr);
+
+ /*
+ * Each device address list is sorted in order of scope -
+ * global before linklocal.
+ */
+ list_for_each(p, &idev->addr_list) {
+ struct inet6_ifaddr *ifa
+ = list_entry(p, struct inet6_ifaddr, if_list);
+ if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
+ break;
+ }
+
+ list_add_tail(&ifp->if_list, p);
+}
+
+static u32 inet6_addr_hash(const struct in6_addr *addr)
+{
+ return hash_32(ipv6_addr_hash(addr), IN6_ADDR_HSIZE_SHIFT);
+}
+
+/* On success it returns ifp with increased reference count */
+
+static struct inet6_ifaddr *
+ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
+ const struct in6_addr *peer_addr, int pfxlen,
+ int scope, u32 flags, u32 valid_lft, u32 prefered_lft)
+{
+ struct inet6_ifaddr *ifa = NULL;
+ struct rt6_info *rt;
+ unsigned int hash;
+ int err = 0;
+ int addr_type = ipv6_addr_type(addr);
+
+ if (addr_type == IPV6_ADDR_ANY ||
+ addr_type & IPV6_ADDR_MULTICAST ||
+ (!(idev->dev->flags & IFF_LOOPBACK) &&
+ addr_type & IPV6_ADDR_LOOPBACK))
+ return ERR_PTR(-EADDRNOTAVAIL);
+
+ rcu_read_lock_bh();
+ if (idev->dead) {
+ err = -ENODEV; /*XXX*/
+ goto out2;
+ }
+
+ if (idev->cnf.disable_ipv6) {
+ err = -EACCES;
+ goto out2;
+ }
+
+ spin_lock(&addrconf_hash_lock);
+
+ /* Ignore adding duplicate addresses on an interface */
+ if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) {
+ ADBG("ipv6_add_addr: already assigned\n");
+ err = -EEXIST;
+ goto out;
+ }
+
+ ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+
+ if (!ifa) {
+ ADBG("ipv6_add_addr: malloc failed\n");
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ rt = addrconf_dst_alloc(idev, addr, false);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto out;
+ }
+
+ neigh_parms_data_state_setall(idev->nd_parms);
+
+ ifa->addr = *addr;
+ if (peer_addr)
+ ifa->peer_addr = *peer_addr;
+
+ spin_lock_init(&ifa->lock);
+ INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
+ INIT_HLIST_NODE(&ifa->addr_lst);
+ ifa->scope = scope;
+ ifa->prefix_len = pfxlen;
+ ifa->flags = flags | IFA_F_TENTATIVE;
+ ifa->valid_lft = valid_lft;
+ ifa->prefered_lft = prefered_lft;
+ ifa->cstamp = ifa->tstamp = jiffies;
+ ifa->tokenized = false;
+
+ ifa->rt = rt;
+
+ ifa->idev = idev;
+ in6_dev_hold(idev);
+ /* For caller */
+ in6_ifa_hold(ifa);
+
+ /* Add to big hash table */
+ hash = inet6_addr_hash(addr);
+
+ hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+ spin_unlock(&addrconf_hash_lock);
+
+ write_lock(&idev->lock);
+ /* Add to inet6_dev unicast addr list. */
+ ipv6_link_dev_addr(idev, ifa);
+
+ if (ifa->flags&IFA_F_TEMPORARY) {
+ list_add(&ifa->tmp_list, &idev->tempaddr_list);
+ in6_ifa_hold(ifa);
+ }
+
+ in6_ifa_hold(ifa);
+ write_unlock(&idev->lock);
+out2:
+ rcu_read_unlock_bh();
+
+ if (likely(err == 0))
+ inet6addr_notifier_call_chain(NETDEV_UP, ifa);
+ else {
+ kfree(ifa);
+ ifa = ERR_PTR(err);
+ }
+
+ return ifa;
+out:
+ spin_unlock(&addrconf_hash_lock);
+ goto out2;
+}
+
+enum cleanup_prefix_rt_t {
+ CLEANUP_PREFIX_RT_NOP, /* no cleanup action for prefix route */
+ CLEANUP_PREFIX_RT_DEL, /* delete the prefix route */
+ CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */
+};
+
+/*
+ * Check, whether the prefix for ifp would still need a prefix route
+ * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_*
+ * constants.
+ *
+ * 1) we don't purge prefix if address was not permanent.
+ * prefix is managed by its own lifetime.
+ * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE.
+ * 3) if there are no addresses, delete prefix.
+ * 4) if there are still other permanent address(es),
+ * corresponding prefix is still permanent.
+ * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE,
+ * don't purge the prefix, assume user space is managing it.
+ * 6) otherwise, update prefix lifetime to the
+ * longest valid lifetime among the corresponding
+ * addresses on the device.
+ * Note: subsequent RA will update lifetime.
+ **/
+static enum cleanup_prefix_rt_t
+check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
+{
+ struct inet6_ifaddr *ifa;
+ struct inet6_dev *idev = ifp->idev;
+ unsigned long lifetime;
+ enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL;
+
+ *expires = jiffies;
+
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ if (ifa == ifp)
+ continue;
+ if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr,
+ ifp->prefix_len))
+ continue;
+ if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE))
+ return CLEANUP_PREFIX_RT_NOP;
+
+ action = CLEANUP_PREFIX_RT_EXPIRE;
+
+ spin_lock(&ifa->lock);
+
+ lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
+ /*
+ * Note: Because this address is
+ * not permanent, lifetime <
+ * LONG_MAX / HZ here.
+ */
+ if (time_before(*expires, ifa->tstamp + lifetime * HZ))
+ *expires = ifa->tstamp + lifetime * HZ;
+ spin_unlock(&ifa->lock);
+ }
+
+ return action;
+}
+
+static void
+cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
+{
+ struct rt6_info *rt;
+
+ rt = addrconf_get_prefix_route(&ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev,
+ 0, RTF_GATEWAY | RTF_DEFAULT);
+ if (rt) {
+ if (del_rt)
+ ip6_del_rt(rt);
+ else {
+ if (!(rt->rt6i_flags & RTF_EXPIRES))
+ rt6_set_expires(rt, expires);
+ ip6_rt_put(rt);
+ }
+ }
+}
+
+
+/* This function wants to get referenced ifp and releases it before return */
+
+static void ipv6_del_addr(struct inet6_ifaddr *ifp)
+{
+ int state;
+ enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+ unsigned long expires;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&ifp->lock);
+ state = ifp->state;
+ ifp->state = INET6_IFADDR_STATE_DEAD;
+ spin_unlock_bh(&ifp->lock);
+
+ if (state == INET6_IFADDR_STATE_DEAD)
+ goto out;
+
+ spin_lock_bh(&addrconf_hash_lock);
+ hlist_del_init_rcu(&ifp->addr_lst);
+ spin_unlock_bh(&addrconf_hash_lock);
+
+ write_lock_bh(&ifp->idev->lock);
+
+ if (ifp->flags&IFA_F_TEMPORARY) {
+ list_del(&ifp->tmp_list);
+ if (ifp->ifpub) {
+ in6_ifa_put(ifp->ifpub);
+ ifp->ifpub = NULL;
+ }
+ __in6_ifa_put(ifp);
+ }
+
+ if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
+ action = check_cleanup_prefix_route(ifp, &expires);
+
+ list_del_init(&ifp->if_list);
+ __in6_ifa_put(ifp);
+
+ write_unlock_bh(&ifp->idev->lock);
+
+ addrconf_del_dad_work(ifp);
+
+ ipv6_ifa_notify(RTM_DELADDR, ifp);
+
+ inet6addr_notifier_call_chain(NETDEV_DOWN, ifp);
+
+ if (action != CLEANUP_PREFIX_RT_NOP) {
+ cleanup_prefix_route(ifp, expires,
+ action == CLEANUP_PREFIX_RT_DEL);
+ }
+
+ /* clean up prefsrc entries */
+ rt6_remove_prefsrc(ifp);
+out:
+ in6_ifa_put(ifp);
+}
+
+static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift)
+{
+ struct inet6_dev *idev = ifp->idev;
+ struct in6_addr addr, *tmpaddr;
+ unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+ unsigned long regen_advance;
+ int tmp_plen;
+ int ret = 0;
+ u32 addr_flags;
+ unsigned long now = jiffies;
+
+ write_lock_bh(&idev->lock);
+ if (ift) {
+ spin_lock_bh(&ift->lock);
+ memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
+ spin_unlock_bh(&ift->lock);
+ tmpaddr = &addr;
+ } else {
+ tmpaddr = NULL;
+ }
+retry:
+ in6_dev_hold(idev);
+ if (idev->cnf.use_tempaddr <= 0) {
+ write_unlock_bh(&idev->lock);
+ pr_info("%s: use_tempaddr is disabled\n", __func__);
+ in6_dev_put(idev);
+ ret = -1;
+ goto out;
+ }
+ spin_lock_bh(&ifp->lock);
+ if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
+ idev->cnf.use_tempaddr = -1; /*XXX*/
+ spin_unlock_bh(&ifp->lock);
+ write_unlock_bh(&idev->lock);
+ pr_warn("%s: regeneration time exceeded - disabled temporary address support\n",
+ __func__);
+ in6_dev_put(idev);
+ ret = -1;
+ goto out;
+ }
+ in6_ifa_hold(ifp);
+ memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
+ __ipv6_try_regen_rndid(idev, tmpaddr);
+ memcpy(&addr.s6_addr[8], idev->rndid, 8);
+ age = (now - ifp->tstamp) / HZ;
+ tmp_valid_lft = min_t(__u32,
+ ifp->valid_lft,
+ idev->cnf.temp_valid_lft + age);
+ tmp_prefered_lft = min_t(__u32,
+ ifp->prefered_lft,
+ idev->cnf.temp_prefered_lft + age -
+ idev->cnf.max_desync_factor);
+ tmp_plen = ifp->prefix_len;
+ tmp_tstamp = ifp->tstamp;
+ spin_unlock_bh(&ifp->lock);
+
+ regen_advance = idev->cnf.regen_max_retry *
+ idev->cnf.dad_transmits *
+ NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ write_unlock_bh(&idev->lock);
+
+ /* A temporary address is created only if this calculated Preferred
+ * Lifetime is greater than REGEN_ADVANCE time units. In particular,
+ * an implementation must not create a temporary address with a zero
+ * Preferred Lifetime.
+ * Use age calculation as in addrconf_verify to avoid unnecessary
+ * temporary addresses being generated.
+ */
+ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+ if (tmp_prefered_lft <= regen_advance + age) {
+ in6_ifa_put(ifp);
+ in6_dev_put(idev);
+ ret = -1;
+ goto out;
+ }
+
+ addr_flags = IFA_F_TEMPORARY;
+ /* set in addrconf_prefix_rcv() */
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ addr_flags |= IFA_F_OPTIMISTIC;
+
+ ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
+ ipv6_addr_scope(&addr), addr_flags,
+ tmp_valid_lft, tmp_prefered_lft);
+ if (IS_ERR(ift)) {
+ in6_ifa_put(ifp);
+ in6_dev_put(idev);
+ pr_info("%s: retry temporary address regeneration\n", __func__);
+ tmpaddr = &addr;
+ write_lock_bh(&idev->lock);
+ goto retry;
+ }
+
+ spin_lock_bh(&ift->lock);
+ ift->ifpub = ifp;
+ ift->cstamp = now;
+ ift->tstamp = tmp_tstamp;
+ spin_unlock_bh(&ift->lock);
+
+ addrconf_dad_start(ift);
+ in6_ifa_put(ift);
+ in6_dev_put(idev);
+out:
+ return ret;
+}
+
+/*
+ * Choose an appropriate source address (RFC3484)
+ */
+enum {
+ IPV6_SADDR_RULE_INIT = 0,
+ IPV6_SADDR_RULE_LOCAL,
+ IPV6_SADDR_RULE_SCOPE,
+ IPV6_SADDR_RULE_PREFERRED,
+#ifdef CONFIG_IPV6_MIP6
+ IPV6_SADDR_RULE_HOA,
+#endif
+ IPV6_SADDR_RULE_OIF,
+ IPV6_SADDR_RULE_LABEL,
+ IPV6_SADDR_RULE_PRIVACY,
+ IPV6_SADDR_RULE_ORCHID,
+ IPV6_SADDR_RULE_PREFIX,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
+ IPV6_SADDR_RULE_MAX
+};
+
+struct ipv6_saddr_score {
+ int rule;
+ int addr_type;
+ struct inet6_ifaddr *ifa;
+ DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX);
+ int scopedist;
+ int matchlen;
+};
+
+struct ipv6_saddr_dst {
+ const struct in6_addr *addr;
+ int ifindex;
+ int scope;
+ int label;
+ unsigned int prefs;
+};
+
+static inline int ipv6_saddr_preferred(int type)
+{
+ if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK))
+ return 1;
+ return 0;
+}
+
+static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev)
+{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic;
+#else
+ return false;
+#endif
+}
+
+static int ipv6_get_saddr_eval(struct net *net,
+ struct ipv6_saddr_score *score,
+ struct ipv6_saddr_dst *dst,
+ int i)
+{
+ int ret;
+
+ if (i <= score->rule) {
+ switch (i) {
+ case IPV6_SADDR_RULE_SCOPE:
+ ret = score->scopedist;
+ break;
+ case IPV6_SADDR_RULE_PREFIX:
+ ret = score->matchlen;
+ break;
+ default:
+ ret = !!test_bit(i, score->scorebits);
+ }
+ goto out;
+ }
+
+ switch (i) {
+ case IPV6_SADDR_RULE_INIT:
+ /* Rule 0: remember if hiscore is not ready yet */
+ ret = !!score->ifa;
+ break;
+ case IPV6_SADDR_RULE_LOCAL:
+ /* Rule 1: Prefer same address */
+ ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
+ break;
+ case IPV6_SADDR_RULE_SCOPE:
+ /* Rule 2: Prefer appropriate scope
+ *
+ * ret
+ * ^
+ * -1 | d 15
+ * ---+--+-+---> scope
+ * |
+ * | d is scope of the destination.
+ * B-d | \
+ * | \ <- smaller scope is better if
+ * B-15 | \ if scope is enough for destination.
+ * | ret = B - scope (-1 <= scope >= d <= 15).
+ * d-C-1 | /
+ * |/ <- greater is better
+ * -C / if scope is not enough for destination.
+ * /| ret = scope - C (-1 <= d < scope <= 15).
+ *
+ * d - C - 1 < B -15 (for all -1 <= d <= 15).
+ * C > d + 14 - B >= 15 + 14 - B = 29 - B.
+ * Assume B = 0 and we get C > 29.
+ */
+ ret = __ipv6_addr_src_scope(score->addr_type);
+ if (ret >= dst->scope)
+ ret = -ret;
+ else
+ ret -= 128; /* 30 is enough */
+ score->scopedist = ret;
+ break;
+ case IPV6_SADDR_RULE_PREFERRED:
+ {
+ /* Rule 3: Avoid deprecated and optimistic addresses */
+ u8 avoid = IFA_F_DEPRECATED;
+
+ if (!ipv6_use_optimistic_addr(score->ifa->idev))
+ avoid |= IFA_F_OPTIMISTIC;
+ ret = ipv6_saddr_preferred(score->addr_type) ||
+ !(score->ifa->flags & avoid);
+ break;
+ }
+#ifdef CONFIG_IPV6_MIP6
+ case IPV6_SADDR_RULE_HOA:
+ {
+ /* Rule 4: Prefer home address */
+ int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
+ ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
+ break;
+ }
+#endif
+ case IPV6_SADDR_RULE_OIF:
+ /* Rule 5: Prefer outgoing interface */
+ ret = (!dst->ifindex ||
+ dst->ifindex == score->ifa->idev->dev->ifindex);
+ break;
+ case IPV6_SADDR_RULE_LABEL:
+ /* Rule 6: Prefer matching label */
+ ret = ipv6_addr_label(net,
+ &score->ifa->addr, score->addr_type,
+ score->ifa->idev->dev->ifindex) == dst->label;
+ break;
+ case IPV6_SADDR_RULE_PRIVACY:
+ {
+ /* Rule 7: Prefer public address
+ * Note: prefer temporary address if use_tempaddr >= 2
+ */
+ int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
+ !!(dst->prefs & IPV6_PREFER_SRC_TMP) :
+ score->ifa->idev->cnf.use_tempaddr >= 2;
+ ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
+ break;
+ }
+ case IPV6_SADDR_RULE_ORCHID:
+ /* Rule 8-: Prefer ORCHID vs ORCHID or
+ * non-ORCHID vs non-ORCHID
+ */
+ ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
+ ipv6_addr_orchid(dst->addr));
+ break;
+ case IPV6_SADDR_RULE_PREFIX:
+ /* Rule 8: Use longest matching prefix */
+ ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
+ if (ret > score->ifa->prefix_len)
+ ret = score->ifa->prefix_len;
+ score->matchlen = ret;
+ break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+ /* Optimistic addresses still have lower precedence than other
+ * preferred addresses.
+ */
+ ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+ break;
+#endif
+ default:
+ ret = 0;
+ }
+
+ if (ret)
+ __set_bit(i, score->scorebits);
+ score->rule = i;
+out:
+ return ret;
+}
+
+int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
+ const struct in6_addr *daddr, unsigned int prefs,
+ struct in6_addr *saddr)
+{
+ struct ipv6_saddr_score scores[2],
+ *score = &scores[0], *hiscore = &scores[1];
+ struct ipv6_saddr_dst dst;
+ struct net_device *dev;
+ int dst_type;
+
+ dst_type = __ipv6_addr_type(daddr);
+ dst.addr = daddr;
+ dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
+ dst.scope = __ipv6_addr_src_scope(dst_type);
+ dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
+ dst.prefs = prefs;
+
+ hiscore->rule = -1;
+ hiscore->ifa = NULL;
+
+ rcu_read_lock();
+
+ for_each_netdev_rcu(net, dev) {
+ struct inet6_dev *idev;
+
+ /* Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ */
+ if (((dst_type & IPV6_ADDR_MULTICAST) ||
+ dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
+ dst.ifindex && dev->ifindex != dst.ifindex)
+ continue;
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+ int i;
+
+ /*
+ * - Tentative Address (RFC2462 section 5.4)
+ * - A tentative address is not considered
+ * "assigned to an interface" in the traditional
+ * sense, unless it is also flagged as optimistic.
+ * - Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+ (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+ continue;
+
+ score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+ if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+ score->addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
+ dev->name);
+ continue;
+ }
+
+ score->rule = -1;
+ bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+ for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+ int minihiscore, miniscore;
+
+ minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
+ miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
+
+ if (minihiscore > miniscore) {
+ if (i == IPV6_SADDR_RULE_SCOPE &&
+ score->scopedist > 0) {
+ /*
+ * special case:
+ * each remaining entry
+ * has too small (not enough)
+ * scope, because ifa entries
+ * are sorted by their scope
+ * values.
+ */
+ goto try_nextdev;
+ }
+ break;
+ } else if (minihiscore < miniscore) {
+ if (hiscore->ifa)
+ in6_ifa_put(hiscore->ifa);
+
+ in6_ifa_hold(score->ifa);
+
+ swap(hiscore, score);
+
+ /* restore our iterator */
+ score->ifa = hiscore->ifa;
+
+ break;
+ }
+ }
+ }
+try_nextdev:
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+
+ if (!hiscore->ifa)
+ return -EADDRNOTAVAIL;
+
+ *saddr = hiscore->ifa->addr;
+ in6_ifa_put(hiscore->ifa);
+ return 0;
+}
+EXPORT_SYMBOL(ipv6_dev_get_saddr);
+
+int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags)
+{
+ struct inet6_ifaddr *ifp;
+ int err = -EADDRNOTAVAIL;
+
+ list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope > IFA_LINK)
+ break;
+ if (ifp->scope == IFA_LINK &&
+ !(ifp->flags & banned_flags)) {
+ *addr = ifp->addr;
+ err = 0;
+ break;
+ }
+ }
+ return err;
+}
+
+int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
+ u32 banned_flags)
+{
+ struct inet6_dev *idev;
+ int err = -EADDRNOTAVAIL;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ err = __ipv6_get_lladdr(idev, addr, banned_flags);
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+static int ipv6_count_addresses(struct inet6_dev *idev)
+{
+ int cnt = 0;
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list)
+ cnt++;
+ read_unlock_bh(&idev->lock);
+ return cnt;
+}
+
+int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict)
+{
+ return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_chk_addr);
+
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict,
+ u32 banned_flags)
+{
+ struct inet6_ifaddr *ifp;
+ unsigned int hash = inet6_addr_hash(addr);
+ u32 ifp_flags;
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
+ if (!net_eq(dev_net(ifp->idev->dev), net))
+ continue;
+ /* Decouple optimistic from tentative for evaluation here.
+ * Ban optimistic addresses explicitly, when required.
+ */
+ ifp_flags = (ifp->flags&IFA_F_OPTIMISTIC)
+ ? (ifp->flags&~IFA_F_TENTATIVE)
+ : ifp->flags;
+ if (ipv6_addr_equal(&ifp->addr, addr) &&
+ !(ifp_flags&banned_flags) &&
+ (!dev || ifp->idev->dev == dev ||
+ !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
+ rcu_read_unlock_bh();
+ return 1;
+ }
+ }
+
+ rcu_read_unlock_bh();
+ return 0;
+}
+EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
+
+static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ unsigned int hash = inet6_addr_hash(addr);
+ struct inet6_ifaddr *ifp;
+
+ hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) {
+ if (!net_eq(dev_net(ifp->idev->dev), net))
+ continue;
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ if (!dev || ifp->idev->dev == dev)
+ return true;
+ }
+ }
+ return false;
+}
+
+/* Compares an address/prefix_len with addresses on device @dev.
+ * If one is found it returns true.
+ */
+bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
+ const unsigned int prefix_len, struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ bool ret = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
+ if (ret)
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(ipv6_chk_custom_prefix);
+
+int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ int onlink;
+
+ onlink = 0;
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ onlink = ipv6_prefix_equal(addr, &ifa->addr,
+ ifa->prefix_len);
+ if (onlink)
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return onlink;
+}
+EXPORT_SYMBOL(ipv6_chk_prefix);
+
+struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev, int strict)
+{
+ struct inet6_ifaddr *ifp, *result = NULL;
+ unsigned int hash = inet6_addr_hash(addr);
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) {
+ if (!net_eq(dev_net(ifp->idev->dev), net))
+ continue;
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ if (!dev || ifp->idev->dev == dev ||
+ !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
+ result = ifp;
+ in6_ifa_hold(ifp);
+ break;
+ }
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return result;
+}
+
+/* Gets referenced address, destroys ifaddr */
+
+static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
+{
+ if (ifp->flags&IFA_F_PERMANENT) {
+ spin_lock_bh(&ifp->lock);
+ addrconf_del_dad_work(ifp);
+ ifp->flags |= IFA_F_TENTATIVE;
+ if (dad_failed)
+ ifp->flags |= IFA_F_DADFAILED;
+ spin_unlock_bh(&ifp->lock);
+ if (dad_failed)
+ ipv6_ifa_notify(0, ifp);
+ in6_ifa_put(ifp);
+ } else if (ifp->flags&IFA_F_TEMPORARY) {
+ struct inet6_ifaddr *ifpub;
+ spin_lock_bh(&ifp->lock);
+ ifpub = ifp->ifpub;
+ if (ifpub) {
+ in6_ifa_hold(ifpub);
+ spin_unlock_bh(&ifp->lock);
+ ipv6_create_tempaddr(ifpub, ifp);
+ in6_ifa_put(ifpub);
+ } else {
+ spin_unlock_bh(&ifp->lock);
+ }
+ ipv6_del_addr(ifp);
+ } else {
+ ipv6_del_addr(ifp);
+ }
+}
+
+static int addrconf_dad_end(struct inet6_ifaddr *ifp)
+{
+ int err = -ENOENT;
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DAD) {
+ ifp->state = INET6_IFADDR_STATE_POSTDAD;
+ err = 0;
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ return err;
+}
+
+void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+{
+ struct in6_addr addr;
+ struct inet6_dev *idev = ifp->idev;
+ struct net *net = dev_net(ifp->idev->dev);
+
+ if (addrconf_dad_end(ifp)) {
+ in6_ifa_put(ifp);
+ return;
+ }
+
+ net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n",
+ ifp->idev->dev->name, &ifp->addr);
+
+ spin_lock_bh(&ifp->lock);
+
+ if (ifp->flags & IFA_F_STABLE_PRIVACY) {
+ int scope = ifp->scope;
+ u32 flags = ifp->flags;
+ struct in6_addr new_addr;
+ struct inet6_ifaddr *ifp2;
+ u32 valid_lft, preferred_lft;
+ int pfxlen = ifp->prefix_len;
+ int retries = ifp->stable_privacy_retry + 1;
+
+ if (retries > net->ipv6.sysctl.idgen_retries) {
+ net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
+ ifp->idev->dev->name);
+ goto errdad;
+ }
+
+ new_addr = ifp->addr;
+ if (ipv6_generate_stable_address(&new_addr, retries,
+ idev))
+ goto errdad;
+
+ valid_lft = ifp->valid_lft;
+ preferred_lft = ifp->prefered_lft;
+
+ spin_unlock_bh(&ifp->lock);
+
+ if (idev->cnf.max_addresses &&
+ ipv6_count_addresses(idev) >=
+ idev->cnf.max_addresses)
+ goto lock_errdad;
+
+ net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
+ ifp->idev->dev->name);
+
+ ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
+ scope, flags, valid_lft,
+ preferred_lft);
+ if (IS_ERR(ifp2))
+ goto lock_errdad;
+
+ spin_lock_bh(&ifp2->lock);
+ ifp2->stable_privacy_retry = retries;
+ ifp2->state = INET6_IFADDR_STATE_PREDAD;
+ spin_unlock_bh(&ifp2->lock);
+
+ addrconf_mod_dad_work(ifp2, net->ipv6.sysctl.idgen_delay);
+ in6_ifa_put(ifp2);
+lock_errdad:
+ spin_lock_bh(&ifp->lock);
+ } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ addr.s6_addr32[1] = 0;
+
+ if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
+ ipv6_addr_equal(&ifp->addr, &addr)) {
+ /* DAD failed for link-local based on MAC address */
+ idev->cnf.disable_ipv6 = 1;
+
+ pr_info("%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
+ }
+ }
+
+errdad:
+ /* transition from _POSTDAD to _ERRDAD */
+ ifp->state = INET6_IFADDR_STATE_ERRDAD;
+ spin_unlock_bh(&ifp->lock);
+
+ addrconf_mod_dad_work(ifp, 0);
+}
+
+/* Join to solicited addr multicast group.
+ * caller must hold RTNL */
+void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
+{
+ struct in6_addr maddr;
+
+ if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ return;
+
+ addrconf_addr_solict_mult(addr, &maddr);
+ ipv6_dev_mc_inc(dev, &maddr);
+}
+
+/* caller must hold RTNL */
+void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+ struct in6_addr maddr;
+
+ if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ return;
+
+ addrconf_addr_solict_mult(addr, &maddr);
+ __ipv6_dev_mc_dec(idev, &maddr);
+}
+
+/* caller must hold RTNL */
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
+{
+ struct in6_addr addr;
+
+ if (ifp->prefix_len >= 127) /* RFC 6164 */
+ return;
+ ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
+ if (ipv6_addr_any(&addr))
+ return;
+ __ipv6_dev_ac_inc(ifp->idev, &addr);
+}
+
+/* caller must hold RTNL */
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+{
+ struct in6_addr addr;
+
+ if (ifp->prefix_len >= 127) /* RFC 6164 */
+ return;
+ ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
+ if (ipv6_addr_any(&addr))
+ return;
+ __ipv6_dev_ac_dec(ifp->idev, &addr);
+}
+
+static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+ if (dev->addr_len != ETH_ALEN)
+ return -1;
+ memcpy(eui, dev->dev_addr, 3);
+ memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+ /*
+ * The zSeries OSA network cards can be shared among various
+ * OS instances, but the OSA cards have only one MAC address.
+ * This leads to duplicate address conflicts in conjunction
+ * with IPv6 if more than one instance uses the same card.
+ *
+ * The driver for these cards can deliver a unique 16-bit
+ * identifier for each instance sharing the same card. It is
+ * placed instead of 0xFFFE in the interface identifier. The
+ * "u" bit of the interface identifier is not inverted in this
+ * case. Hence the resulting interface identifier has local
+ * scope according to RFC2373.
+ */
+ if (dev->dev_id) {
+ eui[3] = (dev->dev_id >> 8) & 0xFF;
+ eui[4] = dev->dev_id & 0xFF;
+ } else {
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[0] ^= 2;
+ }
+ return 0;
+}
+
+static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
+{
+ if (dev->addr_len != IEEE802154_ADDR_LEN)
+ return -1;
+ memcpy(eui, dev->dev_addr, 8);
+ eui[0] ^= 2;
+ return 0;
+}
+
+static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
+{
+ union fwnet_hwaddr *ha;
+
+ if (dev->addr_len != FWNET_ALEN)
+ return -1;
+
+ ha = (union fwnet_hwaddr *)dev->dev_addr;
+
+ memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id));
+ eui[0] ^= 2;
+ return 0;
+}
+
+static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
+{
+ /* XXX: inherit EUI-64 from other interface -- yoshfuji */
+ if (dev->addr_len != ARCNET_ALEN)
+ return -1;
+ memset(eui, 0, 7);
+ eui[7] = *(u8 *)dev->dev_addr;
+ return 0;
+}
+
+static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
+{
+ if (dev->addr_len != INFINIBAND_ALEN)
+ return -1;
+ memcpy(eui, dev->dev_addr + 12, 8);
+ eui[0] |= 2;
+ return 0;
+}
+
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+{
+ if (addr == 0)
+ return -1;
+ eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) ||
+ ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) ||
+ ipv4_is_private_172(addr) || ipv4_is_test_192(addr) ||
+ ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) ||
+ ipv4_is_test_198(addr) || ipv4_is_multicast(addr) ||
+ ipv4_is_lbcast(addr)) ? 0x00 : 0x02;
+ eui[1] = 0;
+ eui[2] = 0x5E;
+ eui[3] = 0xFE;
+ memcpy(eui + 4, &addr, 4);
+ return 0;
+}
+
+static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
+{
+ if (dev->priv_flags & IFF_ISATAP)
+ return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+ return -1;
+}
+
+static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
+{
+ return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+}
+
+static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev)
+{
+ memcpy(eui, dev->perm_addr, 3);
+ memcpy(eui + 5, dev->perm_addr + 3, 3);
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[0] ^= 2;
+ return 0;
+}
+
+static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_FDDI:
+ return addrconf_ifid_eui48(eui, dev);
+ case ARPHRD_ARCNET:
+ return addrconf_ifid_arcnet(eui, dev);
+ case ARPHRD_INFINIBAND:
+ return addrconf_ifid_infiniband(eui, dev);
+ case ARPHRD_SIT:
+ return addrconf_ifid_sit(eui, dev);
+ case ARPHRD_IPGRE:
+ return addrconf_ifid_gre(eui, dev);
+ case ARPHRD_6LOWPAN:
+ case ARPHRD_IEEE802154:
+ return addrconf_ifid_eui64(eui, dev);
+ case ARPHRD_IEEE1394:
+ return addrconf_ifid_ieee1394(eui, dev);
+ case ARPHRD_TUNNEL6:
+ return addrconf_ifid_ip6tnl(eui, dev);
+ }
+ return -1;
+}
+
+static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
+{
+ int err = -1;
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope > IFA_LINK)
+ break;
+ if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) {
+ memcpy(eui, ifp->addr.s6_addr+8, 8);
+ err = 0;
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ return err;
+}
+
+/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
+static void __ipv6_regen_rndid(struct inet6_dev *idev)
+{
+regen:
+ get_random_bytes(idev->rndid, sizeof(idev->rndid));
+ idev->rndid[0] &= ~0x02;
+
+ /*
+ * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
+ * check if generated address is not inappropriate
+ *
+ * - Reserved subnet anycast (RFC 2526)
+ * 11111101 11....11 1xxxxxxx
+ * - ISATAP (RFC4214) 6.1
+ * 00-00-5E-FE-xx-xx-xx-xx
+ * - value 0
+ * - XXX: already assigned to an address on the device
+ */
+ if (idev->rndid[0] == 0xfd &&
+ (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
+ (idev->rndid[7]&0x80))
+ goto regen;
+ if ((idev->rndid[0]|idev->rndid[1]) == 0) {
+ if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
+ goto regen;
+ if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
+ goto regen;
+ }
+}
+
+static void ipv6_regen_rndid(unsigned long data)
+{
+ struct inet6_dev *idev = (struct inet6_dev *) data;
+ unsigned long expires;
+
+ rcu_read_lock_bh();
+ write_lock_bh(&idev->lock);
+
+ if (idev->dead)
+ goto out;
+
+ __ipv6_regen_rndid(idev);
+
+ expires = jiffies +
+ idev->cnf.temp_prefered_lft * HZ -
+ idev->cnf.regen_max_retry * idev->cnf.dad_transmits *
+ NEIGH_VAR(idev->nd_parms, RETRANS_TIME) -
+ idev->cnf.max_desync_factor * HZ;
+ if (time_before(expires, jiffies)) {
+ pr_warn("%s: too short regeneration interval; timer disabled for %s\n",
+ __func__, idev->dev->name);
+ goto out;
+ }
+
+ if (!mod_timer(&idev->regen_timer, expires))
+ in6_dev_hold(idev);
+
+out:
+ write_unlock_bh(&idev->lock);
+ rcu_read_unlock_bh();
+ in6_dev_put(idev);
+}
+
+static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
+{
+ if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
+ __ipv6_regen_rndid(idev);
+}
+
+/*
+ * Add prefix route.
+ */
+
+static void
+addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
+ unsigned long expires, u32 flags)
+{
+ struct fib6_config cfg = {
+ .fc_table = RT6_TABLE_PREFIX,
+ .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_ifindex = dev->ifindex,
+ .fc_expires = expires,
+ .fc_dst_len = plen,
+ .fc_flags = RTF_UP | flags,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
+ };
+
+ cfg.fc_dst = *pfx;
+
+ /* Prevent useless cloning on PtP SIT.
+ This thing is done here expecting that the whole
+ class of non-broadcast devices need not cloning.
+ */
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT))
+ cfg.fc_flags |= RTF_NONEXTHOP;
+#endif
+
+ ip6_route_add(&cfg);
+}
+
+
+static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+ int plen,
+ const struct net_device *dev,
+ u32 flags, u32 noflags)
+{
+ struct fib6_node *fn;
+ struct rt6_info *rt = NULL;
+ struct fib6_table *table;
+
+ table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+ if (!table)
+ return NULL;
+
+ read_lock_bh(&table->tb6_lock);
+ fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
+ if (!fn)
+ goto out;
+ for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if (rt->dst.dev->ifindex != dev->ifindex)
+ continue;
+ if ((rt->rt6i_flags & flags) != flags)
+ continue;
+ if ((rt->rt6i_flags & noflags) != 0)
+ continue;
+ dst_hold(&rt->dst);
+ break;
+ }
+out:
+ read_unlock_bh(&table->tb6_lock);
+ return rt;
+}
+
+
+/* Create "default" multicast route to the interface */
+
+static void addrconf_add_mroute(struct net_device *dev)
+{
+ struct fib6_config cfg = {
+ .fc_table = RT6_TABLE_LOCAL,
+ .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_ifindex = dev->ifindex,
+ .fc_dst_len = 8,
+ .fc_flags = RTF_UP,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ };
+
+ ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
+
+ ip6_route_add(&cfg);
+}
+
+static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ idev = ipv6_find_idev(dev);
+ if (!idev)
+ return ERR_PTR(-ENOBUFS);
+
+ if (idev->cnf.disable_ipv6)
+ return ERR_PTR(-EACCES);
+
+ /* Add default multicast route */
+ if (!(dev->flags & IFF_LOOPBACK))
+ addrconf_add_mroute(dev);
+
+ return idev;
+}
+
+static void manage_tempaddrs(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp,
+ __u32 valid_lft, __u32 prefered_lft,
+ bool create, unsigned long now)
+{
+ u32 flags;
+ struct inet6_ifaddr *ift;
+
+ read_lock_bh(&idev->lock);
+ /* update all temporary addresses in the list */
+ list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) {
+ int age, max_valid, max_prefered;
+
+ if (ifp != ift->ifpub)
+ continue;
+
+ /* RFC 4941 section 3.3:
+ * If a received option will extend the lifetime of a public
+ * address, the lifetimes of temporary addresses should
+ * be extended, subject to the overall constraint that no
+ * temporary addresses should ever remain "valid" or "preferred"
+ * for a time longer than (TEMP_VALID_LIFETIME) or
+ * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively.
+ */
+ age = (now - ift->cstamp) / HZ;
+ max_valid = idev->cnf.temp_valid_lft - age;
+ if (max_valid < 0)
+ max_valid = 0;
+
+ max_prefered = idev->cnf.temp_prefered_lft -
+ idev->cnf.max_desync_factor - age;
+ if (max_prefered < 0)
+ max_prefered = 0;
+
+ if (valid_lft > max_valid)
+ valid_lft = max_valid;
+
+ if (prefered_lft > max_prefered)
+ prefered_lft = max_prefered;
+
+ spin_lock(&ift->lock);
+ flags = ift->flags;
+ ift->valid_lft = valid_lft;
+ ift->prefered_lft = prefered_lft;
+ ift->tstamp = now;
+ if (prefered_lft > 0)
+ ift->flags &= ~IFA_F_DEPRECATED;
+
+ spin_unlock(&ift->lock);
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ift);
+ }
+
+ if ((create || list_empty(&idev->tempaddr_list)) &&
+ idev->cnf.use_tempaddr > 0) {
+ /* When a new public address is created as described
+ * in [ADDRCONF], also create a new temporary address.
+ * Also create a temporary address if it's enabled but
+ * no temporary address currently exists.
+ */
+ read_unlock_bh(&idev->lock);
+ ipv6_create_tempaddr(ifp, NULL);
+ } else {
+ read_unlock_bh(&idev->lock);
+ }
+}
+
+void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
+{
+ struct prefix_info *pinfo;
+ __u32 valid_lft;
+ __u32 prefered_lft;
+ int addr_type;
+ u32 addr_flags = 0;
+ struct inet6_dev *in6_dev;
+ struct net *net = dev_net(dev);
+
+ pinfo = (struct prefix_info *) opt;
+
+ if (len < sizeof(struct prefix_info)) {
+ ADBG("addrconf: prefix option too short\n");
+ return;
+ }
+
+ /*
+ * Validation checks ([ADDRCONF], page 19)
+ */
+
+ addr_type = ipv6_addr_type(&pinfo->prefix);
+
+ if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL))
+ return;
+
+ valid_lft = ntohl(pinfo->valid);
+ prefered_lft = ntohl(pinfo->prefered);
+
+ if (prefered_lft > valid_lft) {
+ net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n");
+ return;
+ }
+
+ in6_dev = in6_dev_get(dev);
+
+ if (!in6_dev) {
+ net_dbg_ratelimited("addrconf: device %s not configured\n",
+ dev->name);
+ return;
+ }
+
+ /*
+ * Two things going on here:
+ * 1) Add routes for on-link prefixes
+ * 2) Configure prefixes with the auto flag set
+ */
+
+ if (pinfo->onlink) {
+ struct rt6_info *rt;
+ unsigned long rt_expires;
+
+ /* Avoid arithmetic overflow. Really, we could
+ * save rt_expires in seconds, likely valid_lft,
+ * but it would require division in fib gc, that it
+ * not good.
+ */
+ if (HZ > USER_HZ)
+ rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
+ else
+ rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
+
+ if (addrconf_finite_timeout(rt_expires))
+ rt_expires *= HZ;
+
+ rt = addrconf_get_prefix_route(&pinfo->prefix,
+ pinfo->prefix_len,
+ dev,
+ RTF_ADDRCONF | RTF_PREFIX_RT,
+ RTF_GATEWAY | RTF_DEFAULT);
+
+ if (rt) {
+ /* Autoconf prefix route */
+ if (valid_lft == 0) {
+ ip6_del_rt(rt);
+ rt = NULL;
+ } else if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ rt6_set_expires(rt, jiffies + rt_expires);
+ } else {
+ rt6_clean_expires(rt);
+ }
+ } else if (valid_lft) {
+ clock_t expires = 0;
+ int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+ if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ flags |= RTF_EXPIRES;
+ expires = jiffies_to_clock_t(rt_expires);
+ }
+ addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
+ dev, expires, flags);
+ }
+ ip6_rt_put(rt);
+ }
+
+ /* Try to figure out our local address for this prefix */
+
+ if (pinfo->autoconf && in6_dev->cnf.autoconf) {
+ struct inet6_ifaddr *ifp;
+ struct in6_addr addr;
+ int create = 0, update_lft = 0;
+ bool tokenized = false;
+
+ if (pinfo->prefix_len == 64) {
+ memcpy(&addr, &pinfo->prefix, 8);
+
+ if (!ipv6_addr_any(&in6_dev->token)) {
+ read_lock_bh(&in6_dev->lock);
+ memcpy(addr.s6_addr + 8,
+ in6_dev->token.s6_addr + 8, 8);
+ read_unlock_bh(&in6_dev->lock);
+ tokenized = true;
+ } else if (in6_dev->addr_gen_mode ==
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ !ipv6_generate_stable_address(&addr, 0,
+ in6_dev)) {
+ addr_flags |= IFA_F_STABLE_PRIVACY;
+ goto ok;
+ } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
+ ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
+ in6_dev_put(in6_dev);
+ return;
+ }
+ goto ok;
+ }
+ net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
+ pinfo->prefix_len);
+ in6_dev_put(in6_dev);
+ return;
+
+ok:
+
+ ifp = ipv6_get_ifaddr(net, &addr, dev, 1);
+
+ if (!ifp && valid_lft) {
+ int max_addresses = in6_dev->cnf.max_addresses;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (in6_dev->cnf.optimistic_dad &&
+ !net->ipv6.devconf_all->forwarding && sllao)
+ addr_flags = IFA_F_OPTIMISTIC;
+#endif
+
+ /* Do not allow to create too much of autoconfigured
+ * addresses; this would be too easy way to crash kernel.
+ */
+ if (!max_addresses ||
+ ipv6_count_addresses(in6_dev) < max_addresses)
+ ifp = ipv6_add_addr(in6_dev, &addr, NULL,
+ pinfo->prefix_len,
+ addr_type&IPV6_ADDR_SCOPE_MASK,
+ addr_flags, valid_lft,
+ prefered_lft);
+
+ if (IS_ERR_OR_NULL(ifp)) {
+ in6_dev_put(in6_dev);
+ return;
+ }
+
+ update_lft = 0;
+ create = 1;
+ spin_lock_bh(&ifp->lock);
+ ifp->flags |= IFA_F_MANAGETEMPADDR;
+ ifp->cstamp = jiffies;
+ ifp->tokenized = tokenized;
+ spin_unlock_bh(&ifp->lock);
+ addrconf_dad_start(ifp);
+ }
+
+ if (ifp) {
+ u32 flags;
+ unsigned long now;
+ u32 stored_lft;
+
+ /* update lifetime (RFC2462 5.5.3 e) */
+ spin_lock_bh(&ifp->lock);
+ now = jiffies;
+ if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
+ stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
+ else
+ stored_lft = 0;
+ if (!update_lft && !create && stored_lft) {
+ const u32 minimum_lft = min_t(u32,
+ stored_lft, MIN_VALID_LIFETIME);
+ valid_lft = max(valid_lft, minimum_lft);
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = 1;
+ }
+
+ if (update_lft) {
+ ifp->valid_lft = valid_lft;
+ ifp->prefered_lft = prefered_lft;
+ ifp->tstamp = now;
+ flags = ifp->flags;
+ ifp->flags &= ~IFA_F_DEPRECATED;
+ spin_unlock_bh(&ifp->lock);
+
+ if (!(flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ifp);
+ } else
+ spin_unlock_bh(&ifp->lock);
+
+ manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
+ create, now);
+
+ in6_ifa_put(ifp);
+ addrconf_verify();
+ }
+ }
+ inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
+ in6_dev_put(in6_dev);
+}
+
+/*
+ * Set destination address.
+ * Special case for SIT interfaces where we create a new "virtual"
+ * device.
+ */
+int addrconf_set_dstaddr(struct net *net, void __user *arg)
+{
+ struct in6_ifreq ireq;
+ struct net_device *dev;
+ int err = -EINVAL;
+
+ rtnl_lock();
+
+ err = -EFAULT;
+ if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+ goto err_exit;
+
+ dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+
+ err = -ENODEV;
+ if (!dev)
+ goto err_exit;
+
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ if (dev->type == ARPHRD_SIT) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct ifreq ifr;
+ struct ip_tunnel_parm p;
+
+ err = -EADDRNOTAVAIL;
+ if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
+ goto err_exit;
+
+ memset(&p, 0, sizeof(p));
+ p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
+ p.iph.saddr = 0;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPV6;
+ p.iph.ttl = 64;
+ ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+ if (ops->ndo_do_ioctl) {
+ mm_segment_t oldfs = get_fs();
+
+ set_fs(KERNEL_DS);
+ err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+ set_fs(oldfs);
+ } else
+ err = -EOPNOTSUPP;
+
+ if (err == 0) {
+ err = -ENOBUFS;
+ dev = __dev_get_by_name(net, p.name);
+ if (!dev)
+ goto err_exit;
+ err = dev_open(dev);
+ }
+ }
+#endif
+
+err_exit:
+ rtnl_unlock();
+ return err;
+}
+
+static int ipv6_mc_config(struct sock *sk, bool join,
+ const struct in6_addr *addr, int ifindex)
+{
+ int ret;
+
+ ASSERT_RTNL();
+
+ lock_sock(sk);
+ if (join)
+ ret = ipv6_sock_mc_join(sk, ifindex, addr);
+ else
+ ret = ipv6_sock_mc_drop(sk, ifindex, addr);
+ release_sock(sk);
+
+ return ret;
+}
+
+/*
+ * Manual configuration of address on an interface
+ */
+static int inet6_addr_add(struct net *net, int ifindex,
+ const struct in6_addr *pfx,
+ const struct in6_addr *peer_pfx,
+ unsigned int plen, __u32 ifa_flags,
+ __u32 prefered_lft, __u32 valid_lft)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *idev;
+ struct net_device *dev;
+ unsigned long timeout;
+ clock_t expires;
+ int scope;
+ u32 flags;
+
+ ASSERT_RTNL();
+
+ if (plen > 128)
+ return -EINVAL;
+
+ /* check the lifetime */
+ if (!valid_lft || prefered_lft > valid_lft)
+ return -EINVAL;
+
+ if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64)
+ return -EINVAL;
+
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ idev = addrconf_add_dev(dev);
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
+
+ if (ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
+ true, pfx, ifindex);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ scope = ipv6_addr_scope(pfx);
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ expires = jiffies_to_clock_t(timeout * HZ);
+ valid_lft = timeout;
+ flags = RTF_EXPIRES;
+ } else {
+ expires = 0;
+ flags = 0;
+ ifa_flags |= IFA_F_PERMANENT;
+ }
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa_flags |= IFA_F_DEPRECATED;
+ prefered_lft = timeout;
+ }
+
+ ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
+ valid_lft, prefered_lft);
+
+ if (!IS_ERR(ifp)) {
+ if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
+ expires, flags);
+ }
+
+ /*
+ * Note that section 3.1 of RFC 4429 indicates
+ * that the Optimistic flag should not be set for
+ * manually configured addresses
+ */
+ addrconf_dad_start(ifp);
+ if (ifa_flags & IFA_F_MANAGETEMPADDR)
+ manage_tempaddrs(idev, ifp, valid_lft, prefered_lft,
+ true, jiffies);
+ in6_ifa_put(ifp);
+ addrconf_verify_rtnl();
+ return 0;
+ } else if (ifa_flags & IFA_F_MCAUTOJOIN) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk,
+ false, pfx, ifindex);
+ }
+
+ return PTR_ERR(ifp);
+}
+
+static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
+ const struct in6_addr *pfx, unsigned int plen)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *idev;
+ struct net_device *dev;
+
+ if (plen > 128)
+ return -EINVAL;
+
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return -ENXIO;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ifp->prefix_len == plen &&
+ ipv6_addr_equal(pfx, &ifp->addr)) {
+ in6_ifa_hold(ifp);
+ read_unlock_bh(&idev->lock);
+
+ if (!(ifp->flags & IFA_F_TEMPORARY) &&
+ (ifa_flags & IFA_F_MANAGETEMPADDR))
+ manage_tempaddrs(idev, ifp, 0, 0, false,
+ jiffies);
+ ipv6_del_addr(ifp);
+ addrconf_verify_rtnl();
+ if (ipv6_addr_is_multicast(pfx)) {
+ ipv6_mc_config(net->ipv6.mc_autojoin_sk,
+ false, pfx, dev->ifindex);
+ }
+ return 0;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ return -EADDRNOTAVAIL;
+}
+
+
+int addrconf_add_ifaddr(struct net *net, void __user *arg)
+{
+ struct in6_ifreq ireq;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+ return -EFAULT;
+
+ rtnl_lock();
+ err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
+ ireq.ifr6_prefixlen, IFA_F_PERMANENT,
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ rtnl_unlock();
+ return err;
+}
+
+int addrconf_del_ifaddr(struct net *net, void __user *arg)
+{
+ struct in6_ifreq ireq;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+ return -EFAULT;
+
+ rtnl_lock();
+ err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr,
+ ireq.ifr6_prefixlen);
+ rtnl_unlock();
+ return err;
+}
+
+static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
+ int plen, int scope)
+{
+ struct inet6_ifaddr *ifp;
+
+ ifp = ipv6_add_addr(idev, addr, NULL, plen,
+ scope, IFA_F_PERMANENT,
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ if (!IS_ERR(ifp)) {
+ spin_lock_bh(&ifp->lock);
+ ifp->flags &= ~IFA_F_TENTATIVE;
+ spin_unlock_bh(&ifp->lock);
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ in6_ifa_put(ifp);
+ }
+}
+
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+static void sit_add_v4_addrs(struct inet6_dev *idev)
+{
+ struct in6_addr addr;
+ struct net_device *dev;
+ struct net *net = dev_net(idev->dev);
+ int scope, plen;
+ u32 pflags = 0;
+
+ ASSERT_RTNL();
+
+ memset(&addr, 0, sizeof(struct in6_addr));
+ memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
+
+ if (idev->dev->flags&IFF_POINTOPOINT) {
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ scope = IFA_LINK;
+ plen = 64;
+ } else {
+ scope = IPV6_ADDR_COMPATv4;
+ plen = 96;
+ pflags |= RTF_NONEXTHOP;
+ }
+
+ if (addr.s6_addr32[3]) {
+ add_addr(idev, &addr, plen, scope);
+ addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags);
+ return;
+ }
+
+ for_each_netdev(net, dev) {
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev && (dev->flags & IFF_UP)) {
+ struct in_ifaddr *ifa;
+
+ int flag = scope;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+
+ addr.s6_addr32[3] = ifa->ifa_local;
+
+ if (ifa->ifa_scope == RT_SCOPE_LINK)
+ continue;
+ if (ifa->ifa_scope >= RT_SCOPE_HOST) {
+ if (idev->dev->flags&IFF_POINTOPOINT)
+ continue;
+ flag |= IFA_HOST;
+ }
+
+ add_addr(idev, &addr, plen, flag);
+ addrconf_prefix_route(&addr, plen, idev->dev, 0,
+ pflags);
+ }
+ }
+ }
+}
+#endif
+
+static void init_loopback(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct net_device *sp_dev;
+ struct inet6_ifaddr *sp_ifa;
+ struct rt6_info *sp_rt;
+
+ /* ::1 */
+
+ ASSERT_RTNL();
+
+ idev = ipv6_find_idev(dev);
+ if (!idev) {
+ pr_debug("%s: add_dev failed\n", __func__);
+ return;
+ }
+
+ add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
+
+ /* Add routes to other interface's IPv6 addresses */
+ for_each_netdev(dev_net(dev), sp_dev) {
+ if (!strcmp(sp_dev->name, dev->name))
+ continue;
+
+ idev = __in6_dev_get(sp_dev);
+ if (!idev)
+ continue;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(sp_ifa, &idev->addr_list, if_list) {
+
+ if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
+ continue;
+
+ if (sp_ifa->rt) {
+ /* This dst has been added to garbage list when
+ * lo device down, release this obsolete dst and
+ * reallocate a new router for ifa.
+ */
+ if (sp_ifa->rt->dst.obsolete > 0) {
+ ip6_rt_put(sp_ifa->rt);
+ sp_ifa->rt = NULL;
+ } else {
+ continue;
+ }
+ }
+
+ sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false);
+
+ /* Failure cases are ignored */
+ if (!IS_ERR(sp_rt)) {
+ sp_ifa->rt = sp_rt;
+ ip6_ins_rt(sp_rt);
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ }
+}
+
+static void addrconf_add_linklocal(struct inet6_dev *idev,
+ const struct in6_addr *addr, u32 flags)
+{
+ struct inet6_ifaddr *ifp;
+ u32 addr_flags = flags | IFA_F_PERMANENT;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ if (idev->cnf.optimistic_dad &&
+ !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
+ addr_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+ ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ if (!IS_ERR(ifp)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
+ addrconf_dad_start(ifp);
+ in6_ifa_put(ifp);
+ }
+}
+
+static bool ipv6_reserved_interfaceid(struct in6_addr address)
+{
+ if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0)
+ return true;
+
+ if (address.s6_addr32[2] == htonl(0x02005eff) &&
+ ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000)))
+ return true;
+
+ if (address.s6_addr32[2] == htonl(0xfdffffff) &&
+ ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80)))
+ return true;
+
+ return false;
+}
+
+static int ipv6_generate_stable_address(struct in6_addr *address,
+ u8 dad_count,
+ const struct inet6_dev *idev)
+{
+ static DEFINE_SPINLOCK(lock);
+ static __u32 digest[SHA_DIGEST_WORDS];
+ static __u32 workspace[SHA_WORKSPACE_WORDS];
+
+ static union {
+ char __data[SHA_MESSAGE_BYTES];
+ struct {
+ struct in6_addr secret;
+ __be32 prefix[2];
+ unsigned char hwaddr[MAX_ADDR_LEN];
+ u8 dad_count;
+ } __packed;
+ } data;
+
+ struct in6_addr secret;
+ struct in6_addr temp;
+ struct net *net = dev_net(idev->dev);
+
+ BUILD_BUG_ON(sizeof(data.__data) != sizeof(data));
+
+ if (idev->cnf.stable_secret.initialized)
+ secret = idev->cnf.stable_secret.secret;
+ else if (net->ipv6.devconf_dflt->stable_secret.initialized)
+ secret = net->ipv6.devconf_dflt->stable_secret.secret;
+ else
+ return -1;
+
+retry:
+ spin_lock_bh(&lock);
+
+ sha_init(digest);
+ memset(&data, 0, sizeof(data));
+ memset(workspace, 0, sizeof(workspace));
+ memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
+ data.prefix[0] = address->s6_addr32[0];
+ data.prefix[1] = address->s6_addr32[1];
+ data.secret = secret;
+ data.dad_count = dad_count;
+
+ sha_transform(digest, data.__data, workspace);
+
+ temp = *address;
+ temp.s6_addr32[2] = (__force __be32)digest[0];
+ temp.s6_addr32[3] = (__force __be32)digest[1];
+
+ spin_unlock_bh(&lock);
+
+ if (ipv6_reserved_interfaceid(temp)) {
+ dad_count++;
+ if (dad_count > dev_net(idev->dev)->ipv6.sysctl.idgen_retries)
+ return -1;
+ goto retry;
+ }
+
+ *address = temp;
+ return 0;
+}
+
+static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
+{
+ struct in6_addr addr;
+
+ ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
+
+ if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) {
+ if (!ipv6_generate_stable_address(&addr, 0, idev))
+ addrconf_add_linklocal(idev, &addr,
+ IFA_F_STABLE_PRIVACY);
+ else if (prefix_route)
+ addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) {
+ /* addrconf_add_linklocal also adds a prefix_route and we
+ * only need to care about prefix routes if ipv6_generate_eui64
+ * couldn't generate one.
+ */
+ if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
+ addrconf_add_linklocal(idev, &addr, 0);
+ else if (prefix_route)
+ addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ }
+}
+
+static void addrconf_dev_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ if ((dev->type != ARPHRD_ETHER) &&
+ (dev->type != ARPHRD_FDDI) &&
+ (dev->type != ARPHRD_ARCNET) &&
+ (dev->type != ARPHRD_INFINIBAND) &&
+ (dev->type != ARPHRD_IEEE802154) &&
+ (dev->type != ARPHRD_IEEE1394) &&
+ (dev->type != ARPHRD_TUNNEL6) &&
+ (dev->type != ARPHRD_6LOWPAN)) {
+ /* Alas, we support only Ethernet autoconfiguration. */
+ return;
+ }
+
+ idev = addrconf_add_dev(dev);
+ if (IS_ERR(idev))
+ return;
+
+ addrconf_addr_gen(idev, false);
+}
+
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+static void addrconf_sit_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ /*
+ * Configure the tunnel with one of our IPv4
+ * addresses... we should configure all of
+ * our v4 addrs in the tunnel
+ */
+
+ idev = ipv6_find_idev(dev);
+ if (!idev) {
+ pr_debug("%s: add_dev failed\n", __func__);
+ return;
+ }
+
+ if (dev->priv_flags & IFF_ISATAP) {
+ addrconf_addr_gen(idev, false);
+ return;
+ }
+
+ sit_add_v4_addrs(idev);
+
+ if (dev->flags&IFF_POINTOPOINT)
+ addrconf_add_mroute(dev);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+static void addrconf_gre_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ idev = ipv6_find_idev(dev);
+ if (!idev) {
+ pr_debug("%s: add_dev failed\n", __func__);
+ return;
+ }
+
+ addrconf_addr_gen(idev, true);
+}
+#endif
+
+static int addrconf_notify(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct inet6_dev *idev = __in6_dev_get(dev);
+ int run_pending = 0;
+ int err;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!idev && dev->mtu >= IPV6_MIN_MTU) {
+ idev = ipv6_add_dev(dev);
+ if (IS_ERR(idev))
+ return notifier_from_errno(PTR_ERR(idev));
+ }
+ break;
+
+ case NETDEV_UP:
+ case NETDEV_CHANGE:
+ if (dev->flags & IFF_SLAVE)
+ break;
+
+ if (idev && idev->cnf.disable_ipv6)
+ break;
+
+ if (event == NETDEV_UP) {
+ if (!addrconf_qdisc_ok(dev)) {
+ /* device is not ready yet. */
+ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
+ dev->name);
+ break;
+ }
+
+ if (!idev && dev->mtu >= IPV6_MIN_MTU)
+ idev = ipv6_add_dev(dev);
+
+ if (!IS_ERR_OR_NULL(idev)) {
+ idev->if_flags |= IF_READY;
+ run_pending = 1;
+ }
+ } else {
+ if (!addrconf_qdisc_ok(dev)) {
+ /* device is still not ready. */
+ break;
+ }
+
+ if (idev) {
+ if (idev->if_flags & IF_READY)
+ /* device is already configured. */
+ break;
+ idev->if_flags |= IF_READY;
+ }
+
+ pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
+ dev->name);
+
+ run_pending = 1;
+ }
+
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ case ARPHRD_SIT:
+ addrconf_sit_config(dev);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+ case ARPHRD_IPGRE:
+ addrconf_gre_config(dev);
+ break;
+#endif
+ case ARPHRD_LOOPBACK:
+ init_loopback(dev);
+ break;
+
+ default:
+ addrconf_dev_config(dev);
+ break;
+ }
+
+ if (!IS_ERR_OR_NULL(idev)) {
+ if (run_pending)
+ addrconf_dad_run(idev);
+
+ /*
+ * If the MTU changed during the interface down,
+ * when the interface up, the changed MTU must be
+ * reflected in the idev as well as routers.
+ */
+ if (idev->cnf.mtu6 != dev->mtu &&
+ dev->mtu >= IPV6_MIN_MTU) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ }
+ idev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+
+ /*
+ * If the changed mtu during down is lower than
+ * IPV6_MIN_MTU stop IPv6 on this interface.
+ */
+ if (dev->mtu < IPV6_MIN_MTU)
+ addrconf_ifdown(dev, 1);
+ }
+ break;
+
+ case NETDEV_CHANGEMTU:
+ if (idev && dev->mtu >= IPV6_MIN_MTU) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ break;
+ }
+
+ if (!idev && dev->mtu >= IPV6_MIN_MTU) {
+ idev = ipv6_add_dev(dev);
+ if (!IS_ERR(idev))
+ break;
+ }
+
+ /*
+ * if MTU under IPV6_MIN_MTU.
+ * Stop IPv6 on this interface.
+ */
+
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ /*
+ * Remove all addresses from this interface.
+ */
+ addrconf_ifdown(dev, event != NETDEV_DOWN);
+ break;
+
+ case NETDEV_CHANGENAME:
+ if (idev) {
+ snmp6_unregister_dev(idev);
+ addrconf_sysctl_unregister(idev);
+ err = addrconf_sysctl_register(idev);
+ if (err)
+ return notifier_from_errno(err);
+ err = snmp6_register_dev(idev);
+ if (err) {
+ addrconf_sysctl_unregister(idev);
+ return notifier_from_errno(err);
+ }
+ }
+ break;
+
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ addrconf_type_change(dev, event);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * addrconf module should be notified of a device going up
+ */
+static struct notifier_block ipv6_dev_notf = {
+ .notifier_call = addrconf_notify,
+};
+
+static void addrconf_type_change(struct net_device *dev, unsigned long event)
+{
+ struct inet6_dev *idev;
+ ASSERT_RTNL();
+
+ idev = __in6_dev_get(dev);
+
+ if (event == NETDEV_POST_TYPE_CHANGE)
+ ipv6_mc_remap(idev);
+ else if (event == NETDEV_PRE_TYPE_CHANGE)
+ ipv6_mc_unmap(idev);
+}
+
+static int addrconf_ifdown(struct net_device *dev, int how)
+{
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ int state, i;
+
+ ASSERT_RTNL();
+
+ rt6_ifdown(net, dev);
+ neigh_ifdown(&nd_tbl, dev);
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return -ENODEV;
+
+ /*
+ * Step 1: remove reference to ipv6 device from parent device.
+ * Do not dev_put!
+ */
+ if (how) {
+ idev->dead = 1;
+
+ /* protected by rtnl_lock */
+ RCU_INIT_POINTER(dev->ip6_ptr, NULL);
+
+ /* Step 1.5: remove snmp6 entry */
+ snmp6_unregister_dev(idev);
+
+ }
+
+ /* Step 2: clear hash table */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+ struct hlist_head *h = &inet6_addr_lst[i];
+
+ spin_lock_bh(&addrconf_hash_lock);
+restart:
+ hlist_for_each_entry_rcu(ifa, h, addr_lst) {
+ if (ifa->idev == idev) {
+ hlist_del_init_rcu(&ifa->addr_lst);
+ addrconf_del_dad_work(ifa);
+ goto restart;
+ }
+ }
+ spin_unlock_bh(&addrconf_hash_lock);
+ }
+
+ write_lock_bh(&idev->lock);
+
+ addrconf_del_rs_timer(idev);
+
+ /* Step 2: clear flags for stateless addrconf */
+ if (!how)
+ idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
+
+ if (how && del_timer(&idev->regen_timer))
+ in6_dev_put(idev);
+
+ /* Step 3: clear tempaddr list */
+ while (!list_empty(&idev->tempaddr_list)) {
+ ifa = list_first_entry(&idev->tempaddr_list,
+ struct inet6_ifaddr, tmp_list);
+ list_del(&ifa->tmp_list);
+ write_unlock_bh(&idev->lock);
+ spin_lock_bh(&ifa->lock);
+
+ if (ifa->ifpub) {
+ in6_ifa_put(ifa->ifpub);
+ ifa->ifpub = NULL;
+ }
+ spin_unlock_bh(&ifa->lock);
+ in6_ifa_put(ifa);
+ write_lock_bh(&idev->lock);
+ }
+
+ while (!list_empty(&idev->addr_list)) {
+ ifa = list_first_entry(&idev->addr_list,
+ struct inet6_ifaddr, if_list);
+ addrconf_del_dad_work(ifa);
+
+ list_del(&ifa->if_list);
+
+ write_unlock_bh(&idev->lock);
+
+ spin_lock_bh(&ifa->lock);
+ state = ifa->state;
+ ifa->state = INET6_IFADDR_STATE_DEAD;
+ spin_unlock_bh(&ifa->lock);
+
+ if (state != INET6_IFADDR_STATE_DEAD) {
+ __ipv6_ifa_notify(RTM_DELADDR, ifa);
+ inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
+ }
+ in6_ifa_put(ifa);
+
+ write_lock_bh(&idev->lock);
+ }
+
+ write_unlock_bh(&idev->lock);
+
+ /* Step 5: Discard anycast and multicast list */
+ if (how) {
+ ipv6_ac_destroy_dev(idev);
+ ipv6_mc_destroy_dev(idev);
+ } else {
+ ipv6_mc_down(idev);
+ }
+
+ idev->tstamp = jiffies;
+
+ /* Last: Shot the device (if unregistered) */
+ if (how) {
+ addrconf_sysctl_unregister(idev);
+ neigh_parms_release(&nd_tbl, idev->nd_parms);
+ neigh_ifdown(&nd_tbl, dev);
+ in6_dev_put(idev);
+ }
+ return 0;
+}
+
+static void addrconf_rs_timer(unsigned long data)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct net_device *dev = idev->dev;
+ struct in6_addr lladdr;
+
+ write_lock(&idev->lock);
+ if (idev->dead || !(idev->if_flags & IF_READY))
+ goto out;
+
+ if (!ipv6_accept_ra(idev))
+ goto out;
+
+ /* Announcement received after solicitation was sent */
+ if (idev->if_flags & IF_RA_RCVD)
+ goto out;
+
+ if (idev->rs_probes++ < idev->cnf.rtr_solicits) {
+ write_unlock(&idev->lock);
+ if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+ ndisc_send_rs(dev, &lladdr,
+ &in6addr_linklocal_allrouters);
+ else
+ goto put;
+
+ write_lock(&idev->lock);
+ /* The wait after the last probe can be shorter */
+ addrconf_mod_rs_timer(idev, (idev->rs_probes ==
+ idev->cnf.rtr_solicits) ?
+ idev->cnf.rtr_solicit_delay :
+ idev->cnf.rtr_solicit_interval);
+ } else {
+ /*
+ * Note: we do not support deprecated "all on-link"
+ * assumption any longer.
+ */
+ pr_debug("%s: no IPv6 routers present\n", idev->dev->name);
+ }
+
+out:
+ write_unlock(&idev->lock);
+put:
+ in6_dev_put(idev);
+}
+
+/*
+ * Duplicate Address Detection
+ */
+static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
+{
+ unsigned long rand_num;
+ struct inet6_dev *idev = ifp->idev;
+
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ rand_num = 0;
+ else
+ rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+
+ ifp->dad_probes = idev->cnf.dad_transmits;
+ addrconf_mod_dad_work(ifp, rand_num);
+}
+
+static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
+{
+ struct inet6_dev *idev = ifp->idev;
+ struct net_device *dev = idev->dev;
+
+ addrconf_join_solict(dev, &ifp->addr);
+
+ prandom_seed((__force u32) ifp->addr.s6_addr32[3]);
+
+ read_lock_bh(&idev->lock);
+ spin_lock(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DEAD)
+ goto out;
+
+ if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ idev->cnf.accept_dad < 1 ||
+ !(ifp->flags&IFA_F_TENTATIVE) ||
+ ifp->flags & IFA_F_NODAD) {
+ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+ spin_unlock(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+
+ addrconf_dad_completed(ifp);
+ return;
+ }
+
+ if (!(idev->if_flags & IF_READY)) {
+ spin_unlock(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+ /*
+ * If the device is not ready:
+ * - keep it tentative if it is a permanent address.
+ * - otherwise, kill it.
+ */
+ in6_ifa_hold(ifp);
+ addrconf_dad_stop(ifp, 0);
+ return;
+ }
+
+ /*
+ * Optimistic nodes can start receiving
+ * Frames right away
+ */
+ if (ifp->flags & IFA_F_OPTIMISTIC) {
+ ip6_ins_rt(ifp->rt);
+ if (ipv6_use_optimistic_addr(idev)) {
+ /* Because optimistic nodes can use this address,
+ * notify listeners. If DAD fails, RTM_DELADDR is sent.
+ */
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ }
+ }
+
+ addrconf_dad_kick(ifp);
+out:
+ spin_unlock(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+}
+
+static void addrconf_dad_start(struct inet6_ifaddr *ifp)
+{
+ bool begin_dad = false;
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state != INET6_IFADDR_STATE_DEAD) {
+ ifp->state = INET6_IFADDR_STATE_PREDAD;
+ begin_dad = true;
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ if (begin_dad)
+ addrconf_mod_dad_work(ifp, 0);
+}
+
+static void addrconf_dad_work(struct work_struct *w)
+{
+ struct inet6_ifaddr *ifp = container_of(to_delayed_work(w),
+ struct inet6_ifaddr,
+ dad_work);
+ struct inet6_dev *idev = ifp->idev;
+ struct in6_addr mcaddr;
+
+ enum {
+ DAD_PROCESS,
+ DAD_BEGIN,
+ DAD_ABORT,
+ } action = DAD_PROCESS;
+
+ rtnl_lock();
+
+ spin_lock_bh(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
+ action = DAD_BEGIN;
+ ifp->state = INET6_IFADDR_STATE_DAD;
+ } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
+ action = DAD_ABORT;
+ ifp->state = INET6_IFADDR_STATE_POSTDAD;
+ }
+ spin_unlock_bh(&ifp->lock);
+
+ if (action == DAD_BEGIN) {
+ addrconf_dad_begin(ifp);
+ goto out;
+ } else if (action == DAD_ABORT) {
+ addrconf_dad_stop(ifp, 1);
+ goto out;
+ }
+
+ if (!ifp->dad_probes && addrconf_dad_end(ifp))
+ goto out;
+
+ write_lock_bh(&idev->lock);
+ if (idev->dead || !(idev->if_flags & IF_READY)) {
+ write_unlock_bh(&idev->lock);
+ goto out;
+ }
+
+ spin_lock(&ifp->lock);
+ if (ifp->state == INET6_IFADDR_STATE_DEAD) {
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
+ goto out;
+ }
+
+ if (ifp->dad_probes == 0) {
+ /*
+ * DAD was successful
+ */
+
+ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
+
+ addrconf_dad_completed(ifp);
+
+ goto out;
+ }
+
+ ifp->dad_probes--;
+ addrconf_mod_dad_work(ifp,
+ NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&idev->lock);
+
+ /* send a neighbour solicitation for our addr */
+ addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
+ ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+out:
+ in6_ifa_put(ifp);
+ rtnl_unlock();
+}
+
+/* ifp->idev must be at least read locked */
+static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
+{
+ struct inet6_ifaddr *ifpiter;
+ struct inet6_dev *idev = ifp->idev;
+
+ list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) {
+ if (ifpiter->scope > IFA_LINK)
+ break;
+ if (ifp != ifpiter && ifpiter->scope == IFA_LINK &&
+ (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE|
+ IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) ==
+ IFA_F_PERMANENT)
+ return false;
+ }
+ return true;
+}
+
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+{
+ struct net_device *dev = ifp->idev->dev;
+ struct in6_addr lladdr;
+ bool send_rs, send_mld;
+
+ addrconf_del_dad_work(ifp);
+
+ /*
+ * Configure the address for reception. Now it is valid.
+ */
+
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+
+ /* If added prefix is link local and we are prepared to process
+ router advertisements, start sending router solicitations.
+ */
+
+ read_lock_bh(&ifp->idev->lock);
+ send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
+ send_rs = send_mld &&
+ ipv6_accept_ra(ifp->idev) &&
+ ifp->idev->cnf.rtr_solicits > 0 &&
+ (dev->flags&IFF_LOOPBACK) == 0;
+ read_unlock_bh(&ifp->idev->lock);
+
+ /* While dad is in progress mld report's source address is in6_addrany.
+ * Resend with proper ll now.
+ */
+ if (send_mld)
+ ipv6_mc_dad_complete(ifp->idev);
+
+ if (send_rs) {
+ /*
+ * If a host as already performed a random delay
+ * [...] as part of DAD [...] there is no need
+ * to delay again before sending the first RS
+ */
+ if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
+ return;
+ ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters);
+
+ write_lock_bh(&ifp->idev->lock);
+ spin_lock(&ifp->lock);
+ ifp->idev->rs_probes = 1;
+ ifp->idev->if_flags |= IF_RS_SENT;
+ addrconf_mod_rs_timer(ifp->idev,
+ ifp->idev->cnf.rtr_solicit_interval);
+ spin_unlock(&ifp->lock);
+ write_unlock_bh(&ifp->idev->lock);
+ }
+}
+
+static void addrconf_dad_run(struct inet6_dev *idev)
+{
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ spin_lock(&ifp->lock);
+ if (ifp->flags & IFA_F_TENTATIVE &&
+ ifp->state == INET6_IFADDR_STATE_DAD)
+ addrconf_dad_kick(ifp);
+ spin_unlock(&ifp->lock);
+ }
+ read_unlock_bh(&idev->lock);
+}
+
+#ifdef CONFIG_PROC_FS
+struct if6_iter_state {
+ struct seq_net_private p;
+ int bucket;
+ int offset;
+};
+
+static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
+{
+ struct inet6_ifaddr *ifa = NULL;
+ struct if6_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+ int p = 0;
+
+ /* initial bucket if pos is 0 */
+ if (pos == 0) {
+ state->bucket = 0;
+ state->offset = 0;
+ }
+
+ for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
+ hlist_for_each_entry_rcu_bh(ifa, &inet6_addr_lst[state->bucket],
+ addr_lst) {
+ if (!net_eq(dev_net(ifa->idev->dev), net))
+ continue;
+ /* sync with offset */
+ if (p < state->offset) {
+ p++;
+ continue;
+ }
+ state->offset++;
+ return ifa;
+ }
+
+ /* prepare for next bucket */
+ state->offset = 0;
+ p = 0;
+ }
+ return NULL;
+}
+
+static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
+ struct inet6_ifaddr *ifa)
+{
+ struct if6_iter_state *state = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ hlist_for_each_entry_continue_rcu_bh(ifa, addr_lst) {
+ if (!net_eq(dev_net(ifa->idev->dev), net))
+ continue;
+ state->offset++;
+ return ifa;
+ }
+
+ while (++state->bucket < IN6_ADDR_HSIZE) {
+ state->offset = 0;
+ hlist_for_each_entry_rcu_bh(ifa,
+ &inet6_addr_lst[state->bucket], addr_lst) {
+ if (!net_eq(dev_net(ifa->idev->dev), net))
+ continue;
+ state->offset++;
+ return ifa;
+ }
+ }
+
+ return NULL;
+}
+
+static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu_bh)
+{
+ rcu_read_lock_bh();
+ return if6_get_first(seq, *pos);
+}
+
+static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct inet6_ifaddr *ifa;
+
+ ifa = if6_get_next(seq, v);
+ ++*pos;
+ return ifa;
+}
+
+static void if6_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu_bh)
+{
+ rcu_read_unlock_bh();
+}
+
+static int if6_seq_show(struct seq_file *seq, void *v)
+{
+ struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
+ seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
+ &ifp->addr,
+ ifp->idev->dev->ifindex,
+ ifp->prefix_len,
+ ifp->scope,
+ (u8) ifp->flags,
+ ifp->idev->dev->name);
+ return 0;
+}
+
+static const struct seq_operations if6_seq_ops = {
+ .start = if6_seq_start,
+ .next = if6_seq_next,
+ .show = if6_seq_show,
+ .stop = if6_seq_stop,
+};
+
+static int if6_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &if6_seq_ops,
+ sizeof(struct if6_iter_state));
+}
+
+static const struct file_operations if6_fops = {
+ .owner = THIS_MODULE,
+ .open = if6_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init if6_proc_net_init(struct net *net)
+{
+ if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit if6_proc_net_exit(struct net *net)
+{
+ remove_proc_entry("if_inet6", net->proc_net);
+}
+
+static struct pernet_operations if6_proc_net_ops = {
+ .init = if6_proc_net_init,
+ .exit = if6_proc_net_exit,
+};
+
+int __init if6_proc_init(void)
+{
+ return register_pernet_subsys(&if6_proc_net_ops);
+}
+
+void if6_proc_exit(void)
+{
+ unregister_pernet_subsys(&if6_proc_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+/* Check if address is a home address configured on any interface. */
+int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
+{
+ int ret = 0;
+ struct inet6_ifaddr *ifp = NULL;
+ unsigned int hash = inet6_addr_hash(addr);
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[hash], addr_lst) {
+ if (!net_eq(dev_net(ifp->idev->dev), net))
+ continue;
+ if (ipv6_addr_equal(&ifp->addr, addr) &&
+ (ifp->flags & IFA_F_HOMEADDRESS)) {
+ ret = 1;
+ break;
+ }
+ }
+ rcu_read_unlock_bh();
+ return ret;
+}
+#endif
+
+/*
+ * Periodic address status verification
+ */
+
+static void addrconf_verify_rtnl(void)
+{
+ unsigned long now, next, next_sec, next_sched;
+ struct inet6_ifaddr *ifp;
+ int i;
+
+ ASSERT_RTNL();
+
+ rcu_read_lock_bh();
+ now = jiffies;
+ next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+ cancel_delayed_work(&addr_chk_work);
+
+ for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+restart:
+ hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) {
+ unsigned long age;
+
+ /* When setting preferred_lft to a value not zero or
+ * infinity, while valid_lft is infinity
+ * IFA_F_PERMANENT has a non-infinity life time.
+ */
+ if ((ifp->flags & IFA_F_PERMANENT) &&
+ (ifp->prefered_lft == INFINITY_LIFE_TIME))
+ continue;
+
+ spin_lock(&ifp->lock);
+ /* We try to batch several events at once. */
+ age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+ if (ifp->valid_lft != INFINITY_LIFE_TIME &&
+ age >= ifp->valid_lft) {
+ spin_unlock(&ifp->lock);
+ in6_ifa_hold(ifp);
+ ipv6_del_addr(ifp);
+ goto restart;
+ } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
+ spin_unlock(&ifp->lock);
+ continue;
+ } else if (age >= ifp->prefered_lft) {
+ /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
+ int deprecate = 0;
+
+ if (!(ifp->flags&IFA_F_DEPRECATED)) {
+ deprecate = 1;
+ ifp->flags |= IFA_F_DEPRECATED;
+ }
+
+ if ((ifp->valid_lft != INFINITY_LIFE_TIME) &&
+ (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)))
+ next = ifp->tstamp + ifp->valid_lft * HZ;
+
+ spin_unlock(&ifp->lock);
+
+ if (deprecate) {
+ in6_ifa_hold(ifp);
+
+ ipv6_ifa_notify(0, ifp);
+ in6_ifa_put(ifp);
+ goto restart;
+ }
+ } else if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE)) {
+ unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
+ ifp->idev->cnf.dad_transmits *
+ NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
+
+ if (age >= ifp->prefered_lft - regen_advance) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+ if (!ifp->regen_count && ifpub) {
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
+ ipv6_create_tempaddr(ifpub, ifp);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ goto restart;
+ }
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ spin_unlock(&ifp->lock);
+ } else {
+ /* ifp->prefered_lft <= ifp->valid_lft */
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+ spin_unlock(&ifp->lock);
+ }
+ }
+ }
+
+ next_sec = round_jiffies_up(next);
+ next_sched = next;
+
+ /* If rounded timeout is accurate enough, accept it. */
+ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+ next_sched = next_sec;
+
+ /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+ if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
+ next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
+
+ ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
+ now, next, next_sec, next_sched);
+ mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
+ rcu_read_unlock_bh();
+}
+
+static void addrconf_verify_work(struct work_struct *w)
+{
+ rtnl_lock();
+ addrconf_verify_rtnl();
+ rtnl_unlock();
+}
+
+static void addrconf_verify(void)
+{
+ mod_delayed_work(addrconf_wq, &addr_chk_work, 0);
+}
+
+static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
+ struct in6_addr **peer_pfx)
+{
+ struct in6_addr *pfx = NULL;
+
+ *peer_pfx = NULL;
+
+ if (addr)
+ pfx = nla_data(addr);
+
+ if (local) {
+ if (pfx && nla_memcmp(local, pfx, sizeof(*pfx)))
+ *peer_pfx = pfx;
+ pfx = nla_data(local);
+ }
+
+ return pfx;
+}
+
+static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
+ [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) },
+ [IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
+ [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
+ [IFA_FLAGS] = { .len = sizeof(u32) },
+};
+
+static int
+inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifaddrmsg *ifm;
+ struct nlattr *tb[IFA_MAX+1];
+ struct in6_addr *pfx, *peer_pfx;
+ u32 ifa_flags;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ if (err < 0)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!pfx)
+ return -EINVAL;
+
+ ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+
+ /* We ignore other flags so far. */
+ ifa_flags &= IFA_F_MANAGETEMPADDR;
+
+ return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
+ ifm->ifa_prefixlen);
+}
+
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
+ u32 prefered_lft, u32 valid_lft)
+{
+ u32 flags;
+ clock_t expires;
+ unsigned long timeout;
+ bool was_managetempaddr;
+ bool had_prefixroute;
+
+ ASSERT_RTNL();
+
+ if (!valid_lft || (prefered_lft > valid_lft))
+ return -EINVAL;
+
+ if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+ (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
+ return -EINVAL;
+
+ timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ expires = jiffies_to_clock_t(timeout * HZ);
+ valid_lft = timeout;
+ flags = RTF_EXPIRES;
+ } else {
+ expires = 0;
+ flags = 0;
+ ifa_flags |= IFA_F_PERMANENT;
+ }
+
+ timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ ifa_flags |= IFA_F_DEPRECATED;
+ prefered_lft = timeout;
+ }
+
+ spin_lock_bh(&ifp->lock);
+ was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR;
+ had_prefixroute = ifp->flags & IFA_F_PERMANENT &&
+ !(ifp->flags & IFA_F_NOPREFIXROUTE);
+ ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
+ IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
+ IFA_F_NOPREFIXROUTE);
+ ifp->flags |= ifa_flags;
+ ifp->tstamp = jiffies;
+ ifp->valid_lft = valid_lft;
+ ifp->prefered_lft = prefered_lft;
+
+ spin_unlock_bh(&ifp->lock);
+ if (!(ifp->flags&IFA_F_TENTATIVE))
+ ipv6_ifa_notify(0, ifp);
+
+ if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
+ expires, flags);
+ } else if (had_prefixroute) {
+ enum cleanup_prefix_rt_t action;
+ unsigned long rt_expires;
+
+ write_lock_bh(&ifp->idev->lock);
+ action = check_cleanup_prefix_route(ifp, &rt_expires);
+ write_unlock_bh(&ifp->idev->lock);
+
+ if (action != CLEANUP_PREFIX_RT_NOP) {
+ cleanup_prefix_route(ifp, rt_expires,
+ action == CLEANUP_PREFIX_RT_DEL);
+ }
+ }
+
+ if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
+ if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
+ valid_lft = prefered_lft = 0;
+ manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
+ !was_managetempaddr, jiffies);
+ }
+
+ addrconf_verify_rtnl();
+
+ return 0;
+}
+
+static int
+inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifaddrmsg *ifm;
+ struct nlattr *tb[IFA_MAX+1];
+ struct in6_addr *pfx, *peer_pfx;
+ struct inet6_ifaddr *ifa;
+ struct net_device *dev;
+ u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
+ u32 ifa_flags;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ if (err < 0)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+ if (!pfx)
+ return -EINVAL;
+
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ valid_lft = ci->ifa_valid;
+ preferred_lft = ci->ifa_prefered;
+ } else {
+ preferred_lft = INFINITY_LIFE_TIME;
+ valid_lft = INFINITY_LIFE_TIME;
+ }
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ if (!dev)
+ return -ENODEV;
+
+ ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+
+ /* We ignore other flags so far. */
+ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
+ IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN;
+
+ ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+ if (!ifa) {
+ /*
+ * It would be best to check for !NLM_F_CREATE here but
+ * userspace already relies on not having to provide this.
+ */
+ return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
+ ifm->ifa_prefixlen, ifa_flags,
+ preferred_lft, valid_lft);
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL ||
+ !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ err = -EEXIST;
+ else
+ err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+
+ in6_ifa_put(ifa);
+
+ return err;
+}
+
+static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags,
+ u8 scope, int ifindex)
+{
+ struct ifaddrmsg *ifm;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET6;
+ ifm->ifa_prefixlen = prefixlen;
+ ifm->ifa_flags = flags;
+ ifm->ifa_scope = scope;
+ ifm->ifa_index = ifindex;
+}
+
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+ unsigned long tstamp, u32 preferred, u32 valid)
+{
+ struct ifa_cacheinfo ci;
+
+ ci.cstamp = cstamp_delta(cstamp);
+ ci.tstamp = cstamp_delta(tstamp);
+ ci.ifa_prefered = preferred;
+ ci.ifa_valid = valid;
+
+ return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
+}
+
+static inline int rt_scope(int ifa_scope)
+{
+ if (ifa_scope & IFA_HOST)
+ return RT_SCOPE_HOST;
+ else if (ifa_scope & IFA_LINK)
+ return RT_SCOPE_LINK;
+ else if (ifa_scope & IFA_SITE)
+ return RT_SCOPE_SITE;
+ else
+ return RT_SCOPE_UNIVERSE;
+}
+
+static inline int inet6_ifaddr_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(16) /* IFA_LOCAL */
+ + nla_total_size(16) /* IFA_ADDRESS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo))
+ + nla_total_size(4) /* IFA_FLAGS */;
+}
+
+static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ u32 preferred, valid;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
+ ifa->idev->dev->ifindex);
+
+ if (!((ifa->flags&IFA_F_PERMANENT) &&
+ (ifa->prefered_lft == INFINITY_LIFE_TIME))) {
+ preferred = ifa->prefered_lft;
+ valid = ifa->valid_lft;
+ if (preferred != INFINITY_LIFE_TIME) {
+ long tval = (jiffies - ifa->tstamp)/HZ;
+ if (preferred > tval)
+ preferred -= tval;
+ else
+ preferred = 0;
+ if (valid != INFINITY_LIFE_TIME) {
+ if (valid > tval)
+ valid -= tval;
+ else
+ valid = 0;
+ }
+ }
+ } else {
+ preferred = INFINITY_LIFE_TIME;
+ valid = INFINITY_LIFE_TIME;
+ }
+
+ if (!ipv6_addr_any(&ifa->peer_addr)) {
+ if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
+ nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0)
+ goto error;
+ } else
+ if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
+ goto error;
+
+ if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
+ goto error;
+
+ if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0)
+ goto error;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+error:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
+ u32 portid, u32 seq, int event, u16 flags)
+{
+ struct nlmsghdr *nlh;
+ u8 scope = RT_SCOPE_UNIVERSE;
+ int ifindex = ifmca->idev->dev->ifindex;
+
+ if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
+ scope = RT_SCOPE_SITE;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
+ if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
+ put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ u8 scope = RT_SCOPE_UNIVERSE;
+ int ifindex = ifaca->aca_idev->dev->ifindex;
+
+ if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
+ scope = RT_SCOPE_SITE;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
+ if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
+ put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+enum addr_type_t {
+ UNICAST_ADDR,
+ MULTICAST_ADDR,
+ ANYCAST_ADDR,
+};
+
+/* called with rcu_read_lock() */
+static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
+ struct netlink_callback *cb, enum addr_type_t type,
+ int s_ip_idx, int *p_ip_idx)
+{
+ struct ifmcaddr6 *ifmca;
+ struct ifacaddr6 *ifaca;
+ int err = 1;
+ int ip_idx = *p_ip_idx;
+
+ read_lock_bh(&idev->lock);
+ switch (type) {
+ case UNICAST_ADDR: {
+ struct inet6_ifaddr *ifa;
+
+ /* unicast address incl. temp addr */
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ if (++ip_idx < s_ip_idx)
+ continue;
+ err = inet6_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+ break;
+ }
+ case MULTICAST_ADDR:
+ /* multicast address */
+ for (ifmca = idev->mc_list; ifmca;
+ ifmca = ifmca->next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ err = inet6_fill_ifmcaddr(skb, ifmca,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_GETMULTICAST,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+ }
+ break;
+ case ANYCAST_ADDR:
+ /* anycast address */
+ for (ifaca = idev->ac_list; ifaca;
+ ifaca = ifaca->aca_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ err = inet6_fill_ifacaddr(skb, ifaca,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_GETANYCAST,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ *p_ip_idx = ip_idx;
+ return err;
+}
+
+static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
+ enum addr_type_t type)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, ip_idx;
+ int s_idx, s_ip_idx;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq;
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ s_ip_idx = 0;
+ ip_idx = 0;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto cont;
+
+ if (in6_dump_addrs(idev, skb, cb, type,
+ s_ip_idx, &ip_idx) < 0)
+ goto done;
+cont:
+ idx++;
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
+
+ return skb->len;
+}
+
+static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ enum addr_type_t type = UNICAST_ADDR;
+
+ return inet6_dump_addr(skb, cb, type);
+}
+
+static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ enum addr_type_t type = MULTICAST_ADDR;
+
+ return inet6_dump_addr(skb, cb, type);
+}
+
+
+static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ enum addr_type_t type = ANYCAST_ADDR;
+
+ return inet6_dump_addr(skb, cb, type);
+}
+
+static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct ifaddrmsg *ifm;
+ struct nlattr *tb[IFA_MAX+1];
+ struct in6_addr *addr = NULL, *peer;
+ struct net_device *dev = NULL;
+ struct inet6_ifaddr *ifa;
+ struct sk_buff *skb;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+ if (err < 0)
+ goto errout;
+
+ addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
+ if (!addr) {
+ err = -EINVAL;
+ goto errout;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifa_index)
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+
+ ifa = ipv6_get_ifaddr(net, addr, dev, 1);
+ if (!ifa) {
+ err = -EADDRNOTAVAIL;
+ goto errout;
+ }
+
+ skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout_ifa;
+ }
+
+ err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWADDR, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout_ifa;
+ }
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout_ifa:
+ in6_ifa_put(ifa);
+errout:
+ return err;
+}
+
+static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
+{
+ struct sk_buff *skb;
+ struct net *net = dev_net(ifa->idev->dev);
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
+}
+
+static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
+ __s32 *array, int bytes)
+{
+ BUG_ON(bytes < (DEVCONF_MAX * 4));
+
+ memset(array, 0, bytes);
+ array[DEVCONF_FORWARDING] = cnf->forwarding;
+ array[DEVCONF_HOPLIMIT] = cnf->hop_limit;
+ array[DEVCONF_MTU6] = cnf->mtu6;
+ array[DEVCONF_ACCEPT_RA] = cnf->accept_ra;
+ array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects;
+ array[DEVCONF_AUTOCONF] = cnf->autoconf;
+ array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits;
+ array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
+ array[DEVCONF_RTR_SOLICIT_INTERVAL] =
+ jiffies_to_msecs(cnf->rtr_solicit_interval);
+ array[DEVCONF_RTR_SOLICIT_DELAY] =
+ jiffies_to_msecs(cnf->rtr_solicit_delay);
+ array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
+ array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
+ jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval);
+ array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
+ jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval);
+ array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
+ array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
+ array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft;
+ array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry;
+ array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
+ array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
+ array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+ array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
+ array[DEVCONF_RTR_PROBE_INTERVAL] =
+ jiffies_to_msecs(cnf->rtr_probe_interval);
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
+#endif
+#endif
+ array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
+ array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+ array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+ array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+#endif
+ array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
+ array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
+ array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
+ array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
+ array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
+ array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
+ array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
+ /* we omit DEVCONF_STABLE_SECRET for now */
+}
+
+static inline size_t inet6_ifla6_size(void)
+{
+ return nla_total_size(4) /* IFLA_INET6_FLAGS */
+ + nla_total_size(sizeof(struct ifla_cacheinfo))
+ + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
+ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
+ + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */
+}
+
+static inline size_t inet6_if_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ + nla_total_size(4) /* IFLA_MTU */
+ + nla_total_size(4) /* IFLA_LINK */
+ + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
+}
+
+static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
+ int items, int bytes)
+{
+ int i;
+ int pad = bytes - sizeof(u64) * items;
+ BUG_ON(pad < 0);
+
+ /* Use put_unaligned() because stats may not be aligned for u64. */
+ put_unaligned(items, &stats[0]);
+ for (i = 1; i < items; i++)
+ put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
+
+ memset(&stats[items], 0, pad);
+}
+
+static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
+ int items, int bytes, size_t syncpoff)
+{
+ int i;
+ int pad = bytes - sizeof(u64) * items;
+ BUG_ON(pad < 0);
+
+ /* Use put_unaligned() because stats may not be aligned for u64. */
+ put_unaligned(items, &stats[0]);
+ for (i = 1; i < items; i++)
+ put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+
+ memset(&stats[items], 0, pad);
+}
+
+static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
+ int bytes)
+{
+ switch (attrtype) {
+ case IFLA_INET6_STATS:
+ __snmp6_fill_stats64(stats, idev->stats.ipv6,
+ IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
+ break;
+ case IFLA_INET6_ICMP6STATS:
+ __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
+ break;
+ }
+}
+
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+{
+ struct nlattr *nla;
+ struct ifla_cacheinfo ci;
+
+ if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags))
+ goto nla_put_failure;
+ ci.max_reasm_len = IPV6_MAXPLEN;
+ ci.tstamp = cstamp_delta(idev->tstamp);
+ ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
+ ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME));
+ if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+ nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla));
+
+ /* XXX - MC not implemented */
+
+ nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
+
+ nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+
+ nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
+ if (!nla)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->addr_gen_mode))
+ goto nla_put_failure;
+
+ read_lock_bh(&idev->lock);
+ memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
+ read_unlock_bh(&idev->lock);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static size_t inet6_get_link_af_size(const struct net_device *dev)
+{
+ if (!__in6_dev_get(dev))
+ return 0;
+
+ return inet6_ifla6_size();
+}
+
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev)
+ return -ENODATA;
+
+ if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
+{
+ struct inet6_ifaddr *ifp;
+ struct net_device *dev = idev->dev;
+ bool update_rs = false;
+ struct in6_addr ll_addr;
+
+ ASSERT_RTNL();
+
+ if (!token)
+ return -EINVAL;
+ if (ipv6_addr_any(token))
+ return -EINVAL;
+ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
+ return -EINVAL;
+ if (!ipv6_accept_ra(idev))
+ return -EINVAL;
+ if (idev->cnf.rtr_solicits <= 0)
+ return -EINVAL;
+
+ write_lock_bh(&idev->lock);
+
+ BUILD_BUG_ON(sizeof(token->s6_addr) != 16);
+ memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8);
+
+ write_unlock_bh(&idev->lock);
+
+ if (!idev->dead && (idev->if_flags & IF_READY) &&
+ !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
+ IFA_F_OPTIMISTIC)) {
+
+ /* If we're not ready, then normal ifup will take care
+ * of this. Otherwise, we need to request our rs here.
+ */
+ ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters);
+ update_rs = true;
+ }
+
+ write_lock_bh(&idev->lock);
+
+ if (update_rs) {
+ idev->if_flags |= IF_RS_SENT;
+ idev->rs_probes = 1;
+ addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval);
+ }
+
+ /* Well, that's kinda nasty ... */
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ spin_lock(&ifp->lock);
+ if (ifp->tokenized) {
+ ifp->valid_lft = 0;
+ ifp->prefered_lft = 0;
+ }
+ spin_unlock(&ifp->lock);
+ }
+
+ write_unlock_bh(&idev->lock);
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+ addrconf_verify_rtnl();
+ return 0;
+}
+
+static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
+ [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 },
+ [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) },
+};
+
+static int inet6_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla)
+{
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+
+ if (dev && !__in6_dev_get(dev))
+ return -EAFNOSUPPORT;
+
+ return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy);
+}
+
+static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+ int err = -EINVAL;
+ struct inet6_dev *idev = __in6_dev_get(dev);
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+
+ if (!idev)
+ return -EAFNOSUPPORT;
+
+ if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0)
+ BUG();
+
+ if (tb[IFLA_INET6_TOKEN]) {
+ err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]));
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
+ u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
+
+ if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
+ mode != IN6_ADDR_GEN_MODE_NONE &&
+ mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY)
+ return -EINVAL;
+
+ if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ !idev->cnf.stable_secret.initialized &&
+ !dev_net(dev)->ipv6.devconf_dflt->stable_secret.initialized)
+ return -EINVAL;
+
+ idev->addr_gen_mode = mode;
+ err = 0;
+ }
+
+ return err;
+}
+
+static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 portid, u32 seq, int event, unsigned int flags)
+{
+ struct net_device *dev = idev->dev;
+ struct ifinfomsg *hdr;
+ struct nlmsghdr *nlh;
+ void *protoinfo;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ hdr = nlmsg_data(nlh);
+ hdr->ifi_family = AF_INET6;
+ hdr->__ifi_pad = 0;
+ hdr->ifi_type = dev->type;
+ hdr->ifi_index = dev->ifindex;
+ hdr->ifi_flags = dev_get_flags(dev);
+ hdr->ifi_change = 0;
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
+ (dev->addr_len &&
+ nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
+ nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ (dev->ifindex != dev_get_iflink(dev) &&
+ nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
+ goto nla_put_failure;
+ protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
+ if (!protoinfo)
+ goto nla_put_failure;
+
+ if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, protoinfo);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx = 0, s_idx;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rcu_read_lock();
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto cont;
+ if (inet6_fill_ifinfo(skb, idev,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWLINK, NLM_F_MULTI) < 0)
+ goto out;
+cont:
+ idx++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = idx;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
+{
+ struct sk_buff *skb;
+ struct net *net = dev_net(idev->dev);
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
+}
+
+static inline size_t inet6_prefix_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct prefixmsg))
+ + nla_total_size(sizeof(struct in6_addr))
+ + nla_total_size(sizeof(struct prefix_cacheinfo));
+}
+
+static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
+ struct prefix_info *pinfo, u32 portid, u32 seq,
+ int event, unsigned int flags)
+{
+ struct prefixmsg *pmsg;
+ struct nlmsghdr *nlh;
+ struct prefix_cacheinfo ci;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*pmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ pmsg = nlmsg_data(nlh);
+ pmsg->prefix_family = AF_INET6;
+ pmsg->prefix_pad1 = 0;
+ pmsg->prefix_pad2 = 0;
+ pmsg->prefix_ifindex = idev->dev->ifindex;
+ pmsg->prefix_len = pinfo->prefix_len;
+ pmsg->prefix_type = pinfo->type;
+ pmsg->prefix_pad3 = 0;
+ pmsg->prefix_flags = 0;
+ if (pinfo->onlink)
+ pmsg->prefix_flags |= IF_PREFIX_ONLINK;
+ if (pinfo->autoconf)
+ pmsg->prefix_flags |= IF_PREFIX_AUTOCONF;
+
+ if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix))
+ goto nla_put_failure;
+ ci.preferred_time = ntohl(pinfo->prefered);
+ ci.valid_time = ntohl(pinfo->valid);
+ if (nla_put(skb, PREFIX_CACHEINFO, sizeof(ci), &ci))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+ struct prefix_info *pinfo)
+{
+ struct sk_buff *skb;
+ struct net *net = dev_net(idev->dev);
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
+}
+
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+{
+ struct net *net = dev_net(ifp->idev->dev);
+
+ if (event)
+ ASSERT_RTNL();
+
+ inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);
+
+ switch (event) {
+ case RTM_NEWADDR:
+ /*
+ * If the address was optimistic
+ * we inserted the route at the start of
+ * our DAD process, so we don't need
+ * to do it again
+ */
+ if (!(ifp->rt->rt6i_node))
+ ip6_ins_rt(ifp->rt);
+ if (ifp->idev->cnf.forwarding)
+ addrconf_join_anycast(ifp);
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ addrconf_prefix_route(&ifp->peer_addr, 128,
+ ifp->idev->dev, 0, 0);
+ break;
+ case RTM_DELADDR:
+ if (ifp->idev->cnf.forwarding)
+ addrconf_leave_anycast(ifp);
+ addrconf_leave_solict(ifp->idev, &ifp->addr);
+ if (!ipv6_addr_any(&ifp->peer_addr)) {
+ struct rt6_info *rt;
+
+ rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
+ ifp->idev->dev, 0, 0);
+ if (rt && ip6_del_rt(rt))
+ dst_free(&rt->dst);
+ }
+ dst_hold(&ifp->rt->dst);
+
+ if (ip6_del_rt(ifp->rt))
+ dst_free(&ifp->rt->dst);
+
+ rt_genid_bump_ipv6(net);
+ break;
+ }
+ atomic_inc(&net->ipv6.dev_addr_genid);
+}
+
+static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+{
+ rcu_read_lock_bh();
+ if (likely(ifp->idev->dead == 0))
+ __ipv6_ifa_notify(event, ifp);
+ rcu_read_unlock_bh();
+}
+
+#ifdef CONFIG_SYSCTL
+
+static
+int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ /*
+ * ctl->data points to idev->cnf.forwarding, we should
+ * not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write)
+ ret = addrconf_fixup_forwarding(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
+static
+int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct inet6_dev *idev = ctl->extra1;
+ int min_mtu = IPV6_MIN_MTU;
+ struct ctl_table lctl;
+
+ lctl = *ctl;
+ lctl.extra1 = &min_mtu;
+ lctl.extra2 = idev ? &idev->dev->mtu : NULL;
+
+ return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
+}
+
+static void dev_disable_change(struct inet6_dev *idev)
+{
+ struct netdev_notifier_info info;
+
+ if (!idev || !idev->dev)
+ return;
+
+ netdev_notifier_info_init(&info, idev->dev);
+ if (idev->cnf.disable_ipv6)
+ addrconf_notify(NULL, NETDEV_DOWN, &info);
+ else
+ addrconf_notify(NULL, NETDEV_UP, &info);
+}
+
+static void addrconf_disable_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
+ idev->cnf.disable_ipv6 = newf;
+ if (changed)
+ dev_disable_change(idev);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
+{
+ struct net *net;
+ int old;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ net = (struct net *)table->extra2;
+ old = *p;
+ *p = newf;
+
+ if (p == &net->ipv6.devconf_dflt->disable_ipv6) {
+ rtnl_unlock();
+ return 0;
+ }
+
+ if (p == &net->ipv6.devconf_all->disable_ipv6) {
+ net->ipv6.devconf_dflt->disable_ipv6 = newf;
+ addrconf_disable_change(net, newf);
+ } else if ((!newf) ^ (!old))
+ dev_disable_change((struct inet6_dev *)table->extra1);
+
+ rtnl_unlock();
+ return 0;
+}
+
+static
+int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp;
+ loff_t pos = *ppos;
+ struct ctl_table lctl;
+ int ret;
+
+ /*
+ * ctl->data points to idev->cnf.disable_ipv6, we should
+ * not modify it until we get the rtnl lock.
+ */
+ lctl = *ctl;
+ lctl.data = &val;
+
+ ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+ if (write)
+ ret = addrconf_disable_ipv6(ctl, valp, val);
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
+static
+int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int ret;
+ int old, new;
+
+ old = *valp;
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ new = *valp;
+
+ if (write && old != new) {
+ struct net *net = ctl->extra2;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (valp == &net->ipv6.devconf_dflt->proxy_ndp)
+ inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ else if (valp == &net->ipv6.devconf_all->proxy_ndp)
+ inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ else {
+ struct inet6_dev *idev = ctl->extra1;
+
+ inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH,
+ idev->dev->ifindex,
+ &idev->cnf);
+ }
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+
+static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int err;
+ struct in6_addr addr;
+ char str[IPV6_MAX_STRLEN];
+ struct ctl_table lctl = *ctl;
+ struct net *net = ctl->extra2;
+ struct ipv6_stable_secret *secret = ctl->data;
+
+ if (&net->ipv6.devconf_all->stable_secret == ctl->data)
+ return -EIO;
+
+ lctl.maxlen = IPV6_MAX_STRLEN;
+ lctl.data = str;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (!write && !secret->initialized) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (!write) {
+ err = snprintf(str, sizeof(str), "%pI6",
+ &secret->secret);
+ if (err >= sizeof(str)) {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ err = proc_dostring(&lctl, write, buffer, lenp, ppos);
+ if (err || !write)
+ goto out;
+
+ if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) {
+ err = -EIO;
+ goto out;
+ }
+
+ secret->initialized = true;
+ secret->secret = addr;
+
+ if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) {
+ struct net_device *dev;
+
+ for_each_netdev(net, dev) {
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (idev) {
+ idev->addr_gen_mode =
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ }
+ }
+ } else {
+ struct inet6_dev *idev = ctl->extra1;
+
+ idev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ }
+
+out:
+ rtnl_unlock();
+
+ return err;
+}
+
+static struct addrconf_sysctl_table
+{
+ struct ctl_table_header *sysctl_header;
+ struct ctl_table addrconf_vars[DEVCONF_MAX+1];
+} addrconf_sysctl __read_mostly = {
+ .sysctl_header = NULL,
+ .addrconf_vars = {
+ {
+ .procname = "forwarding",
+ .data = &ipv6_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_forward,
+ },
+ {
+ .procname = "hop_limit",
+ .data = &ipv6_devconf.hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mtu",
+ .data = &ipv6_devconf.mtu6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_mtu,
+ },
+ {
+ .procname = "accept_ra",
+ .data = &ipv6_devconf.accept_ra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_redirects",
+ .data = &ipv6_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "autoconf",
+ .data = &ipv6_devconf.autoconf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "dad_transmits",
+ .data = &ipv6_devconf.dad_transmits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_solicitations",
+ .data = &ipv6_devconf.rtr_solicits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_solicitation_interval",
+ .data = &ipv6_devconf.rtr_solicit_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "router_solicitation_delay",
+ .data = &ipv6_devconf.rtr_solicit_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "force_mld_version",
+ .data = &ipv6_devconf.force_mld_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mldv1_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv1_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "mldv2_unsolicited_report_interval",
+ .data =
+ &ipv6_devconf.mldv2_unsolicited_report_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "use_tempaddr",
+ .data = &ipv6_devconf.use_tempaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_valid_lft",
+ .data = &ipv6_devconf.temp_valid_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "temp_prefered_lft",
+ .data = &ipv6_devconf.temp_prefered_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "regen_max_retry",
+ .data = &ipv6_devconf.regen_max_retry,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_desync_factor",
+ .data = &ipv6_devconf.max_desync_factor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_addresses",
+ .data = &ipv6_devconf.max_addresses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_defrtr",
+ .data = &ipv6_devconf.accept_ra_defrtr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_pinfo",
+ .data = &ipv6_devconf.accept_ra_pinfo,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ {
+ .procname = "accept_ra_rtr_pref",
+ .data = &ipv6_devconf.accept_ra_rtr_pref,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "router_probe_interval",
+ .data = &ipv6_devconf.rtr_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ {
+ .procname = "accept_ra_rt_info_max_plen",
+ .data = &ipv6_devconf.accept_ra_rt_info_max_plen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#endif
+ {
+ .procname = "proxy_ndp",
+ .data = &ipv6_devconf.proxy_ndp,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_proxy_ndp,
+ },
+ {
+ .procname = "accept_source_route",
+ .data = &ipv6_devconf.accept_source_route,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ {
+ .procname = "optimistic_dad",
+ .data = &ipv6_devconf.optimistic_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
+ .procname = "use_optimistic",
+ .data = &ipv6_devconf.use_optimistic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+ {
+ .procname = "mc_forwarding",
+ .data = &ipv6_devconf.mc_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "disable_ipv6",
+ .data = &ipv6_devconf.disable_ipv6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_disable,
+ },
+ {
+ .procname = "accept_dad",
+ .data = &ipv6_devconf.accept_dad,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "force_tllao",
+ .data = &ipv6_devconf.force_tllao,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ndisc_notify",
+ .data = &ipv6_devconf.ndisc_notify,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "suppress_frag_ndisc",
+ .data = &ipv6_devconf.suppress_frag_ndisc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "accept_ra_from_local",
+ .data = &ipv6_devconf.accept_ra_from_local,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "accept_ra_mtu",
+ .data = &ipv6_devconf.accept_ra_mtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "stable_secret",
+ .data = &ipv6_devconf.stable_secret,
+ .maxlen = IPV6_MAX_STRLEN,
+ .mode = 0600,
+ .proc_handler = addrconf_sysctl_stable_secret,
+ },
+ {
+ /* sentinel */
+ }
+ },
+};
+
+static int __addrconf_sysctl_register(struct net *net, char *dev_name,
+ struct inet6_dev *idev, struct ipv6_devconf *p)
+{
+ int i;
+ struct addrconf_sysctl_table *t;
+ char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
+
+ t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out;
+
+ for (i = 0; t->addrconf_vars[i].data; i++) {
+ t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
+ t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
+ t->addrconf_vars[i].extra2 = net;
+ }
+
+ snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
+
+ t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars);
+ if (!t->sysctl_header)
+ goto free;
+
+ p->sysctl = t;
+ return 0;
+
+free:
+ kfree(t);
+out:
+ return -ENOBUFS;
+}
+
+static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
+{
+ struct addrconf_sysctl_table *t;
+
+ if (!p->sysctl)
+ return;
+
+ t = p->sysctl;
+ p->sysctl = NULL;
+ unregister_net_sysctl_table(t->sysctl_header);
+ kfree(t);
+}
+
+static int addrconf_sysctl_register(struct inet6_dev *idev)
+{
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->nd_parms,
+ &ndisc_ifinfo_sysctl_change);
+ if (err)
+ return err;
+ err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ idev, &idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->nd_parms);
+
+ return err;
+}
+
+static void addrconf_sysctl_unregister(struct inet6_dev *idev)
+{
+ __addrconf_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->nd_parms);
+}
+
+
+#endif
+
+static int __net_init addrconf_init_net(struct net *net)
+{
+ int err = -ENOMEM;
+ struct ipv6_devconf *all, *dflt;
+
+ all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
+
+ dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
+
+ /* these will be inherited by all namespaces */
+ dflt->autoconf = ipv6_defaults.autoconf;
+ dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
+
+ dflt->stable_secret.initialized = false;
+ all->stable_secret.initialized = false;
+
+ net->ipv6.devconf_all = all;
+ net->ipv6.devconf_dflt = dflt;
+
+#ifdef CONFIG_SYSCTL
+ err = __addrconf_sysctl_register(net, "all", NULL, all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __addrconf_sysctl_register(net, "default", NULL, dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+#endif
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_dflt:
+ __addrconf_sysctl_unregister(all);
+err_reg_all:
+ kfree(dflt);
+#endif
+err_alloc_dflt:
+ kfree(all);
+err_alloc_all:
+ return err;
+}
+
+static void __net_exit addrconf_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+ __addrconf_sysctl_unregister(net->ipv6.devconf_dflt);
+ __addrconf_sysctl_unregister(net->ipv6.devconf_all);
+#endif
+ kfree(net->ipv6.devconf_dflt);
+ kfree(net->ipv6.devconf_all);
+}
+
+static struct pernet_operations addrconf_ops = {
+ .init = addrconf_init_net,
+ .exit = addrconf_exit_net,
+};
+
+static struct rtnl_af_ops inet6_ops __read_mostly = {
+ .family = AF_INET6,
+ .fill_link_af = inet6_fill_link_af,
+ .get_link_af_size = inet6_get_link_af_size,
+ .validate_link_af = inet6_validate_link_af,
+ .set_link_af = inet6_set_link_af,
+};
+
+/*
+ * Init / cleanup code
+ */
+
+int __init addrconf_init(void)
+{
+ struct inet6_dev *idev;
+ int i, err;
+
+ err = ipv6_addr_label_init();
+ if (err < 0) {
+ pr_crit("%s: cannot initialize default policy table: %d\n",
+ __func__, err);
+ goto out;
+ }
+
+ err = register_pernet_subsys(&addrconf_ops);
+ if (err < 0)
+ goto out_addrlabel;
+
+ addrconf_wq = create_workqueue("ipv6_addrconf");
+ if (!addrconf_wq) {
+ err = -ENOMEM;
+ goto out_nowq;
+ }
+
+ /* The addrconf netdev notifier requires that loopback_dev
+ * has it's ipv6 private information allocated and setup
+ * before it can bring up and give link-local addresses
+ * to other devices which are up.
+ *
+ * Unfortunately, loopback_dev is not necessarily the first
+ * entry in the global dev_base list of net devices. In fact,
+ * it is likely to be the very last entry on that list.
+ * So this causes the notifier registry below to try and
+ * give link-local addresses to all devices besides loopback_dev
+ * first, then loopback_dev, which cases all the non-loopback_dev
+ * devices to fail to get a link-local address.
+ *
+ * So, as a temporary fix, allocate the ipv6 structure for
+ * loopback_dev first by hand.
+ * Longer term, all of the dependencies ipv6 has upon the loopback
+ * device and it being up should be removed.
+ */
+ rtnl_lock();
+ idev = ipv6_add_dev(init_net.loopback_dev);
+ rtnl_unlock();
+ if (IS_ERR(idev)) {
+ err = PTR_ERR(idev);
+ goto errlo;
+ }
+
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet6_addr_lst[i]);
+
+ register_netdevice_notifier(&ipv6_dev_notf);
+
+ addrconf_verify();
+
+ rtnl_af_register(&inet6_ops);
+
+ err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
+ NULL);
+ if (err < 0)
+ goto errout;
+
+ /* Only the first call to __rtnl_register can fail */
+ __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
+ __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
+ __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
+ inet6_dump_ifaddr, NULL);
+ __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
+ inet6_dump_ifmcaddr, NULL);
+ __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
+ inet6_dump_ifacaddr, NULL);
+ __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
+ inet6_netconf_dump_devconf, NULL);
+
+ ipv6_addr_label_rtnl_register();
+
+ return 0;
+errout:
+ rtnl_af_unregister(&inet6_ops);
+ unregister_netdevice_notifier(&ipv6_dev_notf);
+errlo:
+ destroy_workqueue(addrconf_wq);
+out_nowq:
+ unregister_pernet_subsys(&addrconf_ops);
+out_addrlabel:
+ ipv6_addr_label_cleanup();
+out:
+ return err;
+}
+
+void addrconf_cleanup(void)
+{
+ struct net_device *dev;
+ int i;
+
+ unregister_netdevice_notifier(&ipv6_dev_notf);
+ unregister_pernet_subsys(&addrconf_ops);
+ ipv6_addr_label_cleanup();
+
+ rtnl_lock();
+
+ __rtnl_af_unregister(&inet6_ops);
+
+ /* clean dev list */
+ for_each_netdev(&init_net, dev) {
+ if (__in6_dev_get(dev) == NULL)
+ continue;
+ addrconf_ifdown(dev, 1);
+ }
+ addrconf_ifdown(init_net.loopback_dev, 2);
+
+ /*
+ * Check hash table.
+ */
+ spin_lock_bh(&addrconf_hash_lock);
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
+ spin_unlock_bh(&addrconf_hash_lock);
+ cancel_delayed_work(&addr_chk_work);
+ rtnl_unlock();
+
+ destroy_workqueue(addrconf_wq);
+}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
new file mode 100644
index 000000000..ca09bf49a
--- /dev/null
+++ b/net/ipv6/addrconf_core.c
@@ -0,0 +1,164 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+
+#include <linux/export.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip.h>
+
+/* if ipv6 module registers this function is used by xfrm to force all
+ * sockets to relookup their nodes - this is fairly expensive, be
+ * careful
+ */
+void (*__fib6_flush_trees)(struct net *);
+EXPORT_SYMBOL(__fib6_flush_trees);
+
+#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16)
+
+static inline unsigned int ipv6_addr_scope2type(unsigned int scope)
+{
+ switch (scope) {
+ case IPV6_ADDR_SCOPE_NODELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) |
+ IPV6_ADDR_LOOPBACK);
+ case IPV6_ADDR_SCOPE_LINKLOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) |
+ IPV6_ADDR_LINKLOCAL);
+ case IPV6_ADDR_SCOPE_SITELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) |
+ IPV6_ADDR_SITELOCAL);
+ }
+ return IPV6_ADDR_SCOPE_TYPE(scope);
+}
+
+int __ipv6_addr_type(const struct in6_addr *addr)
+{
+ __be32 st;
+
+ st = addr->s6_addr32[0];
+
+ /* Consider all addresses with the first three bits different of
+ 000 and 111 as unicasts.
+ */
+ if ((st & htonl(0xE0000000)) != htonl(0x00000000) &&
+ (st & htonl(0xE0000000)) != htonl(0xE0000000))
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));
+
+ if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
+ /* multicast */
+ /* addr-select 3.1 */
+ return (IPV6_ADDR_MULTICAST |
+ ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr)));
+ }
+
+ if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
+ return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */
+ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000))
+ return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */
+ if ((st & htonl(0xFE000000)) == htonl(0xFC000000))
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */
+
+ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) {
+ if (addr->s6_addr32[2] == 0) {
+ if (addr->s6_addr32[3] == 0)
+ return IPV6_ADDR_ANY;
+
+ if (addr->s6_addr32[3] == htonl(0x00000001))
+ return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */
+
+ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
+ }
+
+ if (addr->s6_addr32[2] == htonl(0x0000ffff))
+ return (IPV6_ADDR_MAPPED |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
+ }
+
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */
+}
+EXPORT_SYMBOL(__ipv6_addr_type);
+
+static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
+
+int register_inet6addr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(register_inet6addr_notifier);
+
+int unregister_inet6addr_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inet6addr_notifier);
+
+int inet6addr_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&inet6addr_chain, val, v);
+}
+EXPORT_SYMBOL(inet6addr_notifier_call_chain);
+
+const struct ipv6_stub *ipv6_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_stub);
+
+/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
+EXPORT_SYMBOL(in6addr_loopback);
+const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+EXPORT_SYMBOL(in6addr_any);
+const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allnodes);
+const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_linklocal_allrouters);
+const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
+const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
+const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
+EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
+
+static void snmp6_free_dev(struct inet6_dev *idev)
+{
+ kfree(idev->stats.icmpv6msgdev);
+ kfree(idev->stats.icmpv6dev);
+ free_percpu(idev->stats.ipv6);
+}
+
+static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
+{
+ struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);
+
+ snmp6_free_dev(idev);
+ kfree(idev);
+}
+
+/* Nobody refers to this device, we may destroy it. */
+
+void in6_dev_finish_destroy(struct inet6_dev *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ WARN_ON(!list_empty(&idev->addr_list));
+ WARN_ON(idev->mc_list);
+ WARN_ON(timer_pending(&idev->rs_timer));
+
+#ifdef NET_REFCNT_DEBUG
+ pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead) {
+ pr_warn("Freeing alive inet6 device %p\n", idev);
+ return;
+ }
+ call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
+}
+EXPORT_SYMBOL(in6_dev_finish_destroy);
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
new file mode 100644
index 000000000..882124ebb
--- /dev/null
+++ b/net/ipv6/addrlabel.c
@@ -0,0 +1,597 @@
+/*
+ * IPv6 Address Label subsystem
+ * for the IPv6 "Default" Source Address Selection
+ *
+ * Copyright (C)2007 USAGI/WIDE Project
+ */
+/*
+ * Author:
+ * YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/addrconf.h>
+#include <linux/if_addrlabel.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#if 0
+#define ADDRLABEL(x...) printk(x)
+#else
+#define ADDRLABEL(x...) do { ; } while (0)
+#endif
+
+/*
+ * Policy Table
+ */
+struct ip6addrlbl_entry {
+ possible_net_t lbl_net;
+ struct in6_addr prefix;
+ int prefixlen;
+ int ifindex;
+ int addrtype;
+ u32 label;
+ struct hlist_node list;
+ atomic_t refcnt;
+ struct rcu_head rcu;
+};
+
+static struct ip6addrlbl_table
+{
+ struct hlist_head head;
+ spinlock_t lock;
+ u32 seq;
+} ip6addrlbl_table;
+
+static inline
+struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
+{
+ return read_pnet(&lbl->lbl_net);
+}
+
+/*
+ * Default policy table (RFC6724 + extensions)
+ *
+ * prefix addr_type label
+ * -------------------------------------------------------------------------
+ * ::1/128 LOOPBACK 0
+ * ::/0 N/A 1
+ * 2002::/16 N/A 2
+ * ::/96 COMPATv4 3
+ * ::ffff:0:0/96 V4MAPPED 4
+ * fc00::/7 N/A 5 ULA (RFC 4193)
+ * 2001::/32 N/A 6 Teredo (RFC 4380)
+ * 2001:10::/28 N/A 7 ORCHID (RFC 4843)
+ * fec0::/10 N/A 11 Site-local
+ * (deprecated by RFC3879)
+ * 3ffe::/16 N/A 12 6bone
+ *
+ * Note: 0xffffffff is used if we do not have any policies.
+ * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724.
+ */
+
+#define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL
+
+static const __net_initconst struct ip6addrlbl_init_table
+{
+ const struct in6_addr *prefix;
+ int prefixlen;
+ u32 label;
+} ip6addrlbl_init_table[] = {
+ { /* ::/0 */
+ .prefix = &in6addr_any,
+ .label = 1,
+ }, { /* fc00::/7 */
+ .prefix = &(struct in6_addr){ { { 0xfc } } } ,
+ .prefixlen = 7,
+ .label = 5,
+ }, { /* fec0::/10 */
+ .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } },
+ .prefixlen = 10,
+ .label = 11,
+ }, { /* 2002::/16 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } },
+ .prefixlen = 16,
+ .label = 2,
+ }, { /* 3ffe::/16 */
+ .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } },
+ .prefixlen = 16,
+ .label = 12,
+ }, { /* 2001::/32 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } },
+ .prefixlen = 32,
+ .label = 6,
+ }, { /* 2001:10::/28 */
+ .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } },
+ .prefixlen = 28,
+ .label = 7,
+ }, { /* ::ffff:0:0 */
+ .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } },
+ .prefixlen = 96,
+ .label = 4,
+ }, { /* ::/96 */
+ .prefix = &in6addr_any,
+ .prefixlen = 96,
+ .label = 3,
+ }, { /* ::1/128 */
+ .prefix = &in6addr_loopback,
+ .prefixlen = 128,
+ .label = 0,
+ }
+};
+
+/* Object management */
+static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p)
+{
+ kfree(p);
+}
+
+static void ip6addrlbl_free_rcu(struct rcu_head *h)
+{
+ ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu));
+}
+
+static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p)
+{
+ return atomic_inc_not_zero(&p->refcnt);
+}
+
+static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
+{
+ if (atomic_dec_and_test(&p->refcnt))
+ call_rcu(&p->rcu, ip6addrlbl_free_rcu);
+}
+
+/* Find label */
+static bool __ip6addrlbl_match(struct net *net,
+ const struct ip6addrlbl_entry *p,
+ const struct in6_addr *addr,
+ int addrtype, int ifindex)
+{
+ if (!net_eq(ip6addrlbl_net(p), net))
+ return false;
+ if (p->ifindex && p->ifindex != ifindex)
+ return false;
+ if (p->addrtype && p->addrtype != addrtype)
+ return false;
+ if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen))
+ return false;
+ return true;
+}
+
+static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
+ const struct in6_addr *addr,
+ int type, int ifindex)
+{
+ struct ip6addrlbl_entry *p;
+ hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
+ if (__ip6addrlbl_match(net, p, addr, type, ifindex))
+ return p;
+ }
+ return NULL;
+}
+
+u32 ipv6_addr_label(struct net *net,
+ const struct in6_addr *addr, int type, int ifindex)
+{
+ u32 label;
+ struct ip6addrlbl_entry *p;
+
+ type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK;
+
+ rcu_read_lock();
+ p = __ipv6_addr_label(net, addr, type, ifindex);
+ label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT;
+ rcu_read_unlock();
+
+ ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n",
+ __func__, addr, type, ifindex, label);
+
+ return label;
+}
+
+/* allocate one entry */
+static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
+ const struct in6_addr *prefix,
+ int prefixlen, int ifindex,
+ u32 label)
+{
+ struct ip6addrlbl_entry *newp;
+ int addrtype;
+
+ ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n",
+ __func__, prefix, prefixlen, ifindex, (unsigned int)label);
+
+ addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
+
+ switch (addrtype) {
+ case IPV6_ADDR_MAPPED:
+ if (prefixlen > 96)
+ return ERR_PTR(-EINVAL);
+ if (prefixlen < 96)
+ addrtype = 0;
+ break;
+ case IPV6_ADDR_COMPATv4:
+ if (prefixlen != 96)
+ addrtype = 0;
+ break;
+ case IPV6_ADDR_LOOPBACK:
+ if (prefixlen != 128)
+ addrtype = 0;
+ break;
+ }
+
+ newp = kmalloc(sizeof(*newp), GFP_KERNEL);
+ if (!newp)
+ return ERR_PTR(-ENOMEM);
+
+ ipv6_addr_prefix(&newp->prefix, prefix, prefixlen);
+ newp->prefixlen = prefixlen;
+ newp->ifindex = ifindex;
+ newp->addrtype = addrtype;
+ newp->label = label;
+ INIT_HLIST_NODE(&newp->list);
+ write_pnet(&newp->lbl_net, net);
+ atomic_set(&newp->refcnt, 1);
+ return newp;
+}
+
+/* add a label */
+static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
+{
+ struct hlist_node *n;
+ struct ip6addrlbl_entry *last = NULL, *p = NULL;
+ int ret = 0;
+
+ ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
+ replace);
+
+ hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
+ if (p->prefixlen == newp->prefixlen &&
+ net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
+ p->ifindex == newp->ifindex &&
+ ipv6_addr_equal(&p->prefix, &newp->prefix)) {
+ if (!replace) {
+ ret = -EEXIST;
+ goto out;
+ }
+ hlist_replace_rcu(&p->list, &newp->list);
+ ip6addrlbl_put(p);
+ goto out;
+ } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
+ (p->prefixlen < newp->prefixlen)) {
+ hlist_add_before_rcu(&newp->list, &p->list);
+ goto out;
+ }
+ last = p;
+ }
+ if (last)
+ hlist_add_behind_rcu(&newp->list, &last->list);
+ else
+ hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
+out:
+ if (!ret)
+ ip6addrlbl_table.seq++;
+ return ret;
+}
+
+/* add a label */
+static int ip6addrlbl_add(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex, u32 label, int replace)
+{
+ struct ip6addrlbl_entry *newp;
+ int ret = 0;
+
+ ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+ __func__, prefix, prefixlen, ifindex, (unsigned int)label,
+ replace);
+
+ newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label);
+ if (IS_ERR(newp))
+ return PTR_ERR(newp);
+ spin_lock(&ip6addrlbl_table.lock);
+ ret = __ip6addrlbl_add(newp, replace);
+ spin_unlock(&ip6addrlbl_table.lock);
+ if (ret)
+ ip6addrlbl_free(newp);
+ return ret;
+}
+
+/* remove a label */
+static int __ip6addrlbl_del(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex)
+{
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int ret = -ESRCH;
+
+ ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
+ __func__, prefix, prefixlen, ifindex);
+
+ hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
+ if (p->prefixlen == prefixlen &&
+ net_eq(ip6addrlbl_net(p), net) &&
+ p->ifindex == ifindex &&
+ ipv6_addr_equal(&p->prefix, prefix)) {
+ hlist_del_rcu(&p->list);
+ ip6addrlbl_put(p);
+ ret = 0;
+ break;
+ }
+ }
+ return ret;
+}
+
+static int ip6addrlbl_del(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ int ifindex)
+{
+ struct in6_addr prefix_buf;
+ int ret;
+
+ ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
+ __func__, prefix, prefixlen, ifindex);
+
+ ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
+ spin_lock(&ip6addrlbl_table.lock);
+ ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
+ spin_unlock(&ip6addrlbl_table.lock);
+ return ret;
+}
+
+/* add default label */
+static int __net_init ip6addrlbl_net_init(struct net *net)
+{
+ int err = 0;
+ int i;
+
+ ADDRLABEL(KERN_DEBUG "%s\n", __func__);
+
+ for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
+ int ret = ip6addrlbl_add(net,
+ ip6addrlbl_init_table[i].prefix,
+ ip6addrlbl_init_table[i].prefixlen,
+ 0,
+ ip6addrlbl_init_table[i].label, 0);
+ /* XXX: should we free all rules when we catch an error? */
+ if (ret && (!err || err != -ENOMEM))
+ err = ret;
+ }
+ return err;
+}
+
+static void __net_exit ip6addrlbl_net_exit(struct net *net)
+{
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+
+ /* Remove all labels belonging to the exiting net */
+ spin_lock(&ip6addrlbl_table.lock);
+ hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
+ if (net_eq(ip6addrlbl_net(p), net)) {
+ hlist_del_rcu(&p->list);
+ ip6addrlbl_put(p);
+ }
+ }
+ spin_unlock(&ip6addrlbl_table.lock);
+}
+
+static struct pernet_operations ipv6_addr_label_ops = {
+ .init = ip6addrlbl_net_init,
+ .exit = ip6addrlbl_net_exit,
+};
+
+int __init ipv6_addr_label_init(void)
+{
+ spin_lock_init(&ip6addrlbl_table.lock);
+
+ return register_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+void ipv6_addr_label_cleanup(void)
+{
+ unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
+ [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), },
+ [IFAL_LABEL] = { .len = sizeof(u32), },
+};
+
+static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ifaddrlblmsg *ifal;
+ struct nlattr *tb[IFAL_MAX+1];
+ struct in6_addr *pfx;
+ u32 label;
+ int err = 0;
+
+ err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy);
+ if (err < 0)
+ return err;
+
+ ifal = nlmsg_data(nlh);
+
+ if (ifal->ifal_family != AF_INET6 ||
+ ifal->ifal_prefixlen > 128)
+ return -EINVAL;
+
+ if (!tb[IFAL_ADDRESS])
+ return -EINVAL;
+ pfx = nla_data(tb[IFAL_ADDRESS]);
+
+ if (!tb[IFAL_LABEL])
+ return -EINVAL;
+ label = nla_get_u32(tb[IFAL_LABEL]);
+ if (label == IPV6_ADDR_LABEL_DEFAULT)
+ return -EINVAL;
+
+ switch (nlh->nlmsg_type) {
+ case RTM_NEWADDRLABEL:
+ if (ifal->ifal_index &&
+ !__dev_get_by_index(net, ifal->ifal_index))
+ return -EINVAL;
+
+ err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen,
+ ifal->ifal_index, label,
+ nlh->nlmsg_flags & NLM_F_REPLACE);
+ break;
+ case RTM_DELADDRLABEL:
+ err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen,
+ ifal->ifal_index);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ }
+ return err;
+}
+
+static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
+ int prefixlen, int ifindex, u32 lseq)
+{
+ struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
+ ifal->ifal_family = AF_INET6;
+ ifal->ifal_prefixlen = prefixlen;
+ ifal->ifal_flags = 0;
+ ifal->ifal_index = ifindex;
+ ifal->ifal_seq = lseq;
+};
+
+static int ip6addrlbl_fill(struct sk_buff *skb,
+ struct ip6addrlbl_entry *p,
+ u32 lseq,
+ u32 portid, u32 seq, int event,
+ unsigned int flags)
+{
+ struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, event,
+ sizeof(struct ifaddrlblmsg), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq);
+
+ if (nla_put_in6_addr(skb, IFAL_ADDRESS, &p->prefix) < 0 ||
+ nla_put_u32(skb, IFAL_LABEL, p->label) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct ip6addrlbl_entry *p;
+ int idx = 0, s_idx = cb->args[0];
+ int err;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
+ if (idx >= s_idx &&
+ net_eq(ip6addrlbl_net(p), net)) {
+ err = ip6addrlbl_fill(skb, p,
+ ip6addrlbl_table.seq,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDRLABEL,
+ NLM_F_MULTI);
+ if (err < 0)
+ break;
+ }
+ idx++;
+ }
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static inline int ip6addrlbl_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+ + nla_total_size(16) /* IFAL_ADDRESS */
+ + nla_total_size(4); /* IFAL_LABEL */
+}
+
+static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct ifaddrlblmsg *ifal;
+ struct nlattr *tb[IFAL_MAX+1];
+ struct in6_addr *addr;
+ u32 lseq;
+ int err = 0;
+ struct ip6addrlbl_entry *p;
+ struct sk_buff *skb;
+
+ err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy);
+ if (err < 0)
+ return err;
+
+ ifal = nlmsg_data(nlh);
+
+ if (ifal->ifal_family != AF_INET6 ||
+ ifal->ifal_prefixlen != 128)
+ return -EINVAL;
+
+ if (ifal->ifal_index &&
+ !__dev_get_by_index(net, ifal->ifal_index))
+ return -EINVAL;
+
+ if (!tb[IFAL_ADDRESS])
+ return -EINVAL;
+ addr = nla_data(tb[IFAL_ADDRESS]);
+
+ rcu_read_lock();
+ p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+ if (p && ip6addrlbl_hold(p))
+ p = NULL;
+ lseq = ip6addrlbl_table.seq;
+ rcu_read_unlock();
+
+ if (!p) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
+ if (!skb) {
+ ip6addrlbl_put(p);
+ return -ENOBUFS;
+ }
+
+ err = ip6addrlbl_fill(skb, p, lseq,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ RTM_NEWADDRLABEL, 0);
+
+ ip6addrlbl_put(p);
+
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto out;
+ }
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ return err;
+}
+
+void __init ipv6_addr_label_rtnl_register(void)
+{
+ __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
+ NULL, NULL);
+ __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
+ NULL, NULL);
+ __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
+ ip6addrlbl_dump, NULL);
+}
+
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
new file mode 100644
index 000000000..eef63b394
--- /dev/null
+++ b/net/ipv6/af_inet6.c
@@ -0,0 +1,1030 @@
+/*
+ * PF_INET6 socket protocol family
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Adapted from linux/net/ipv4/af_inet.c
+ *
+ * Fixes:
+ * piggy, Karl Knutson : Socket protocol table
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * Arnaldo Melo : check proc_net_create return, cleanups
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/tcp.h>
+#include <net/ping.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <net/route.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#ifdef CONFIG_IPV6_TUNNEL
+#include <net/ip6_tunnel.h>
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/mroute6.h>
+
+MODULE_AUTHOR("Cast of dozens");
+MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
+MODULE_LICENSE("GPL");
+
+/* The inetsw6 table contains everything that inet6_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw6[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw6_lock);
+
+struct ipv6_params ipv6_defaults = {
+ .disable_ipv6 = 0,
+ .autoconf = 1,
+};
+
+static int disable_ipv6_mod;
+
+module_param_named(disable, disable_ipv6_mod, int, 0444);
+MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional");
+
+module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444);
+MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces");
+
+module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444);
+MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces");
+
+static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+{
+ const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
+
+ return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+}
+
+static int inet6_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct inet_sock *inet;
+ struct ipv6_pinfo *np;
+ struct sock *sk;
+ struct inet_protosw *answer;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ int try_loading_module = 0;
+ int err;
+
+ /* Look for the requested type/protocol pair. */
+lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
+ rcu_read_lock();
+ list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {
+
+ err = 0;
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ err = -EPROTONOSUPPORT;
+ }
+
+ if (err) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-10-proto-132-type-1
+ * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET6, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-10-proto-132
+ * (net-pf-PF_INET6-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET6, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
+ err = -EPERM;
+ if (sock->type == SOCK_RAW && !kern &&
+ !ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ WARN_ON(!answer_prot->slab);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
+ if (!sk)
+ goto out;
+
+ sock_init_data(sock, sk);
+
+ err = 0;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
+ inet = inet_sk(sk);
+ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+ if (SOCK_RAW == sock->type) {
+ inet->inet_num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_family = PF_INET6;
+ sk->sk_protocol = protocol;
+
+ sk->sk_backlog_rcv = answer->prot->backlog_rcv;
+
+ inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
+ np->hop_limit = -1;
+ np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
+ np->mc_loop = 1;
+ np->pmtudisc = IPV6_PMTUDISC_WANT;
+ sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+
+ /* Init the ipv4 part of the socket since we can have sockets
+ * using v6 API for ipv4.
+ */
+ inet->uc_ttl = -1;
+
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+ inet->rcv_tos = 0;
+
+ if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+ /*
+ * Increment only the relevant sk_prot->socks debug field, this changes
+ * the previous behaviour of incrementing both the equivalent to
+ * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
+ *
+ * This allows better debug granularity as we'll know exactly how many
+ * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
+ * transport protocol socks. -acme
+ */
+ sk_refcnt_debug_inc(sk);
+
+ if (inet->inet_num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically shares.
+ */
+ inet->inet_sport = htons(inet->inet_num);
+ sk->sk_prot->hash(sk);
+ }
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ __be32 v4addr = 0;
+ unsigned short snum;
+ int addr_type = 0;
+ int err = 0;
+
+ /* If the socket has its own bind function then use it. */
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+ if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM)
+ return -EINVAL;
+
+ snum = ntohs(addr->sin6_port);
+ if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Check if the address belongs to the host. */
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ int chk_addr_ret;
+
+ /* Binding to v4-mapped address on a v6-only socket
+ * makes no sense
+ */
+ if (sk->sk_ipv6only) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Reproduce AF_INET checks to make the bindings consistent */
+ v4addr = addr->sin6_addr.s6_addr32[3];
+ chk_addr_ret = inet_addr_type(net, v4addr);
+ if (!net->ipv4.sysctl_ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
+ v4addr != htonl(INADDR_ANY) &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+ } else {
+ if (addr_type != IPV6_ADDR_ANY) {
+ struct net_device *dev = NULL;
+
+ rcu_read_lock();
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ addr->sin6_scope_id) {
+ /* Override any existing binding, if another one
+ * is supplied by user.
+ */
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+ }
+
+ /* Binding to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ /* ipv4 addr of the socket is invalid. Only the
+ * unspecified and mapped address have a v4 equivalent.
+ */
+ v4addr = LOOPBACK4_IPV6;
+ if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ if (!(inet->freebind || inet->transparent) &&
+ !ipv6_chk_addr(net, &addr->sin6_addr,
+ dev, 0)) {
+ err = -EADDRNOTAVAIL;
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+ }
+ }
+
+ inet->inet_rcv_saddr = v4addr;
+ inet->inet_saddr = v4addr;
+
+ sk->sk_v6_rcv_saddr = addr->sin6_addr;
+
+ if (!(addr_type & IPV6_ADDR_MULTICAST))
+ np->saddr = addr->sin6_addr;
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ inet_reset_saddr(sk);
+ err = -EADDRINUSE;
+ goto out;
+ }
+
+ if (addr_type != IPV6_ADDR_ANY) {
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (addr_type != IPV6_ADDR_MAPPED)
+ sk->sk_ipv6only = 1;
+ }
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_dport = 0;
+ inet->inet_daddr = 0;
+out:
+ release_sock(sk);
+ return err;
+out_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+EXPORT_SYMBOL(inet6_bind);
+
+int inet6_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return -EINVAL;
+
+ /* Free mc lists */
+ ipv6_sock_mc_close(sk);
+
+ /* Free ac lists */
+ ipv6_sock_ac_close(sk);
+
+ return inet_release(sock);
+}
+EXPORT_SYMBOL(inet6_release);
+
+void inet6_destroy_sock(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct ipv6_txoptions *opt;
+
+ /* Release rx options */
+
+ skb = xchg(&np->pktoptions, NULL);
+ if (skb)
+ kfree_skb(skb);
+
+ skb = xchg(&np->rxpmtu, NULL);
+ if (skb)
+ kfree_skb(skb);
+
+ /* Free flowlabels */
+ fl6_free_socklist(sk);
+
+ /* Free tx options */
+
+ opt = xchg(&np->opt, NULL);
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+}
+EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+
+/*
+ * This does both peername and sockname.
+ */
+
+int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ sin->sin6_family = AF_INET6;
+ sin->sin6_flowinfo = 0;
+ sin->sin6_scope_id = 0;
+ if (peer) {
+ if (!inet->inet_dport)
+ return -ENOTCONN;
+ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1)
+ return -ENOTCONN;
+ sin->sin6_port = inet->inet_dport;
+ sin->sin6_addr = sk->sk_v6_daddr;
+ if (np->sndflow)
+ sin->sin6_flowinfo = np->flow_label;
+ } else {
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ sin->sin6_addr = np->saddr;
+ else
+ sin->sin6_addr = sk->sk_v6_rcv_saddr;
+
+ sin->sin6_port = inet->inet_sport;
+ }
+ sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
+ sk->sk_bound_dev_if);
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+EXPORT_SYMBOL(inet6_getname);
+
+int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ return sock_get_timestamp(sk, (struct timeval __user *)arg);
+
+ case SIOCGSTAMPNS:
+ return sock_get_timestampns(sk, (struct timespec __user *)arg);
+
+ case SIOCADDRT:
+ case SIOCDELRT:
+
+ return ipv6_route_ioctl(net, cmd, (void __user *)arg);
+
+ case SIOCSIFADDR:
+ return addrconf_add_ifaddr(net, (void __user *) arg);
+ case SIOCDIFADDR:
+ return addrconf_del_ifaddr(net, (void __user *) arg);
+ case SIOCSIFDSTADDR:
+ return addrconf_set_dstaddr(net, (void __user *) arg);
+ default:
+ if (!sk->sk_prot->ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->ioctl(sk, cmd, arg);
+ }
+ /*NOTREACHED*/
+ return 0;
+}
+EXPORT_SYMBOL(inet6_ioctl);
+
+const struct proto_ops inet6_stream_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_stream_connect, /* ok */
+ .socketpair = sock_no_socketpair, /* a do nothing */
+ .accept = inet_accept, /* ok */
+ .getname = inet6_getname,
+ .poll = tcp_poll, /* ok */
+ .ioctl = inet6_ioctl, /* must change */
+ .listen = inet_listen, /* ok */
+ .shutdown = inet_shutdown, /* ok */
+ .setsockopt = sock_common_setsockopt, /* ok */
+ .getsockopt = sock_common_getsockopt, /* ok */
+ .sendmsg = inet_sendmsg, /* ok */
+ .recvmsg = inet_recvmsg, /* ok */
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+const struct proto_ops inet6_dgram_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_dgram_connect, /* ok */
+ .socketpair = sock_no_socketpair, /* a do nothing */
+ .accept = sock_no_accept, /* a do nothing */
+ .getname = inet6_getname,
+ .poll = udp_poll, /* ok */
+ .ioctl = inet6_ioctl, /* must change */
+ .listen = sock_no_listen, /* ok */
+ .shutdown = inet_shutdown, /* ok */
+ .setsockopt = sock_common_setsockopt, /* ok */
+ .getsockopt = sock_common_getsockopt, /* ok */
+ .sendmsg = inet_sendmsg, /* ok */
+ .recvmsg = inet_recvmsg, /* ok */
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static const struct net_proto_family inet6_family_ops = {
+ .family = PF_INET6,
+ .create = inet6_create,
+ .owner = THIS_MODULE,
+};
+
+int inet6_register_protosw(struct inet_protosw *p)
+{
+ struct list_head *lh;
+ struct inet_protosw *answer;
+ struct list_head *last_perm;
+ int protocol = p->protocol;
+ int ret;
+
+ spin_lock_bh(&inetsw6_lock);
+
+ ret = -EINVAL;
+ if (p->type >= SOCK_MAX)
+ goto out_illegal;
+
+ /* If we are trying to override a permanent protocol, bail. */
+ answer = NULL;
+ ret = -EPERM;
+ last_perm = &inetsw6[p->type];
+ list_for_each(lh, &inetsw6[p->type]) {
+ answer = list_entry(lh, struct inet_protosw, list);
+
+ /* Check only the non-wild match. */
+ if (INET_PROTOSW_PERMANENT & answer->flags) {
+ if (protocol == answer->protocol)
+ break;
+ last_perm = lh;
+ }
+
+ answer = NULL;
+ }
+ if (answer)
+ goto out_permanent;
+
+ /* Add the new entry after the last permanent entry if any, so that
+ * the new entry does not override a permanent entry when matched with
+ * a wild-card protocol. But it is allowed to override any existing
+ * non-permanent entry. This means that when we remove this entry, the
+ * system automatically returns to the old behavior.
+ */
+ list_add_rcu(&p->list, last_perm);
+ ret = 0;
+out:
+ spin_unlock_bh(&inetsw6_lock);
+ return ret;
+
+out_permanent:
+ pr_err("Attempt to override permanent protocol %d\n", protocol);
+ goto out;
+
+out_illegal:
+ pr_err("Ignoring attempt to register invalid socket type %d\n",
+ p->type);
+ goto out;
+}
+EXPORT_SYMBOL(inet6_register_protosw);
+
+void
+inet6_unregister_protosw(struct inet_protosw *p)
+{
+ if (INET_PROTOSW_PERMANENT & p->flags) {
+ pr_err("Attempt to unregister permanent protocol %d\n",
+ p->protocol);
+ } else {
+ spin_lock_bh(&inetsw6_lock);
+ list_del_rcu(&p->list);
+ spin_unlock_bh(&inetsw6_lock);
+
+ synchronize_net();
+ }
+}
+EXPORT_SYMBOL(inet6_unregister_protosw);
+
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dst_entry *dst;
+
+ dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (!dst) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = sk->sk_protocol;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = np->saddr;
+ fl6.flowlabel = np->flow_label;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ sk->sk_route_caps = 0;
+ sk->sk_err_soft = -PTR_ERR(dst);
+ return PTR_ERR(dst);
+ }
+
+ __ip6_dst_store(sk, dst, NULL, NULL);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
+bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
+ const struct inet6_skb_parm *opt)
+{
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (np->rxopt.all) {
+ if ((opt->hop && (np->rxopt.bits.hopopts ||
+ np->rxopt.bits.ohopopts)) ||
+ (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
+ np->rxopt.bits.rxflow) ||
+ (opt->srcrt && (np->rxopt.bits.srcrt ||
+ np->rxopt.bits.osrcrt)) ||
+ ((opt->dst1 || opt->dst0) &&
+ (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+
+static struct packet_type ipv6_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IPV6),
+ .func = ipv6_rcv,
+};
+
+static int __init ipv6_packet_init(void)
+{
+ dev_add_pack(&ipv6_packet_type);
+ return 0;
+}
+
+static void ipv6_packet_cleanup(void)
+{
+ dev_remove_pack(&ipv6_packet_type);
+}
+
+static int __net_init ipv6_init_mibs(struct net *net)
+{
+ int i;
+
+ net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib);
+ if (!net->mib.udp_stats_in6)
+ return -ENOMEM;
+ net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib);
+ if (!net->mib.udplite_stats_in6)
+ goto err_udplite_mib;
+ net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib);
+ if (!net->mib.ipv6_statistics)
+ goto err_ip_mib;
+
+ for_each_possible_cpu(i) {
+ struct ipstats_mib *af_inet6_stats;
+ af_inet6_stats = per_cpu_ptr(net->mib.ipv6_statistics, i);
+ u64_stats_init(&af_inet6_stats->syncp);
+ }
+
+
+ net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib);
+ if (!net->mib.icmpv6_statistics)
+ goto err_icmp_mib;
+ net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib),
+ GFP_KERNEL);
+ if (!net->mib.icmpv6msg_statistics)
+ goto err_icmpmsg_mib;
+ return 0;
+
+err_icmpmsg_mib:
+ free_percpu(net->mib.icmpv6_statistics);
+err_icmp_mib:
+ free_percpu(net->mib.ipv6_statistics);
+err_ip_mib:
+ free_percpu(net->mib.udplite_stats_in6);
+err_udplite_mib:
+ free_percpu(net->mib.udp_stats_in6);
+ return -ENOMEM;
+}
+
+static void ipv6_cleanup_mibs(struct net *net)
+{
+ free_percpu(net->mib.udp_stats_in6);
+ free_percpu(net->mib.udplite_stats_in6);
+ free_percpu(net->mib.ipv6_statistics);
+ free_percpu(net->mib.icmpv6_statistics);
+ kfree(net->mib.icmpv6msg_statistics);
+}
+
+static int __net_init inet6_net_init(struct net *net)
+{
+ int err = 0;
+
+ net->ipv6.sysctl.bindv6only = 0;
+ net->ipv6.sysctl.icmpv6_time = 1*HZ;
+ net->ipv6.sysctl.flowlabel_consistency = 1;
+ net->ipv6.sysctl.auto_flowlabels = 0;
+ net->ipv6.sysctl.idgen_retries = 3;
+ net->ipv6.sysctl.idgen_delay = 1 * HZ;
+ atomic_set(&net->ipv6.fib6_sernum, 1);
+
+ err = ipv6_init_mibs(net);
+ if (err)
+ return err;
+#ifdef CONFIG_PROC_FS
+ err = udp6_proc_init(net);
+ if (err)
+ goto out;
+ err = tcp6_proc_init(net);
+ if (err)
+ goto proc_tcp6_fail;
+ err = ac6_proc_init(net);
+ if (err)
+ goto proc_ac6_fail;
+#endif
+ return err;
+
+#ifdef CONFIG_PROC_FS
+proc_ac6_fail:
+ tcp6_proc_exit(net);
+proc_tcp6_fail:
+ udp6_proc_exit(net);
+out:
+ ipv6_cleanup_mibs(net);
+ return err;
+#endif
+}
+
+static void __net_exit inet6_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ udp6_proc_exit(net);
+ tcp6_proc_exit(net);
+ ac6_proc_exit(net);
+#endif
+ ipv6_cleanup_mibs(net);
+}
+
+static struct pernet_operations inet6_net_ops = {
+ .init = inet6_net_init,
+ .exit = inet6_net_exit,
+};
+
+static const struct ipv6_stub ipv6_stub_impl = {
+ .ipv6_sock_mc_join = ipv6_sock_mc_join,
+ .ipv6_sock_mc_drop = ipv6_sock_mc_drop,
+ .ipv6_dst_lookup = ip6_dst_lookup,
+ .udpv6_encap_enable = udpv6_encap_enable,
+ .ndisc_send_na = ndisc_send_na,
+ .nd_tbl = &nd_tbl,
+};
+
+static int __init inet6_init(void)
+{
+ struct list_head *r;
+ int err = 0;
+
+ sock_skb_cb_check_size(sizeof(struct inet6_skb_parm));
+
+ /* Register the socket-side information for inet6_create. */
+ for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ if (disable_ipv6_mod) {
+ pr_info("Loaded, but administratively disabled, reboot required to enable\n");
+ goto out;
+ }
+
+ err = proto_register(&tcpv6_prot, 1);
+ if (err)
+ goto out;
+
+ err = proto_register(&udpv6_prot, 1);
+ if (err)
+ goto out_unregister_tcp_proto;
+
+ err = proto_register(&udplitev6_prot, 1);
+ if (err)
+ goto out_unregister_udp_proto;
+
+ err = proto_register(&rawv6_prot, 1);
+ if (err)
+ goto out_unregister_udplite_proto;
+
+ err = proto_register(&pingv6_prot, 1);
+ if (err)
+ goto out_unregister_ping_proto;
+
+ /* We MUST register RAW sockets before we create the ICMP6,
+ * IGMP6, or NDISC control sockets.
+ */
+ err = rawv6_init();
+ if (err)
+ goto out_unregister_raw_proto;
+
+ /* Register the family here so that the init calls below will
+ * be able to create sockets. (?? is this dangerous ??)
+ */
+ err = sock_register(&inet6_family_ops);
+ if (err)
+ goto out_sock_register_fail;
+
+ /*
+ * ipngwg API draft makes clear that the correct semantics
+ * for TCP and UDP is to consider one TCP and UDP instance
+ * in a host available by both INET and INET6 APIs and
+ * able to communicate via both network protocols.
+ */
+
+ err = register_pernet_subsys(&inet6_net_ops);
+ if (err)
+ goto register_pernet_fail;
+ err = icmpv6_init();
+ if (err)
+ goto icmp_fail;
+ err = ip6_mr_init();
+ if (err)
+ goto ipmr_fail;
+ err = ndisc_init();
+ if (err)
+ goto ndisc_fail;
+ err = igmp6_init();
+ if (err)
+ goto igmp_fail;
+
+ ipv6_stub = &ipv6_stub_impl;
+
+ err = ipv6_netfilter_init();
+ if (err)
+ goto netfilter_fail;
+ /* Create /proc/foo6 entries. */
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (raw6_proc_init())
+ goto proc_raw6_fail;
+ if (udplite6_proc_init())
+ goto proc_udplite6_fail;
+ if (ipv6_misc_proc_init())
+ goto proc_misc6_fail;
+ if (if6_proc_init())
+ goto proc_if6_fail;
+#endif
+ err = ip6_route_init();
+ if (err)
+ goto ip6_route_fail;
+ err = ndisc_late_init();
+ if (err)
+ goto ndisc_late_fail;
+ err = ip6_flowlabel_init();
+ if (err)
+ goto ip6_flowlabel_fail;
+ err = addrconf_init();
+ if (err)
+ goto addrconf_fail;
+
+ /* Init v6 extension headers. */
+ err = ipv6_exthdrs_init();
+ if (err)
+ goto ipv6_exthdrs_fail;
+
+ err = ipv6_frag_init();
+ if (err)
+ goto ipv6_frag_fail;
+
+ /* Init v6 transport protocols. */
+ err = udpv6_init();
+ if (err)
+ goto udpv6_fail;
+
+ err = udplitev6_init();
+ if (err)
+ goto udplitev6_fail;
+
+ err = tcpv6_init();
+ if (err)
+ goto tcpv6_fail;
+
+ err = ipv6_packet_init();
+ if (err)
+ goto ipv6_packet_fail;
+
+ err = pingv6_init();
+ if (err)
+ goto pingv6_fail;
+
+#ifdef CONFIG_SYSCTL
+ err = ipv6_sysctl_register();
+ if (err)
+ goto sysctl_fail;
+#endif
+out:
+ return err;
+
+#ifdef CONFIG_SYSCTL
+sysctl_fail:
+ pingv6_exit();
+#endif
+pingv6_fail:
+ ipv6_packet_cleanup();
+ipv6_packet_fail:
+ tcpv6_exit();
+tcpv6_fail:
+ udplitev6_exit();
+udplitev6_fail:
+ udpv6_exit();
+udpv6_fail:
+ ipv6_frag_exit();
+ipv6_frag_fail:
+ ipv6_exthdrs_exit();
+ipv6_exthdrs_fail:
+ addrconf_cleanup();
+addrconf_fail:
+ ip6_flowlabel_cleanup();
+ip6_flowlabel_fail:
+ ndisc_late_cleanup();
+ndisc_late_fail:
+ ip6_route_cleanup();
+ip6_route_fail:
+#ifdef CONFIG_PROC_FS
+ if6_proc_exit();
+proc_if6_fail:
+ ipv6_misc_proc_exit();
+proc_misc6_fail:
+ udplite6_proc_exit();
+proc_udplite6_fail:
+ raw6_proc_exit();
+proc_raw6_fail:
+#endif
+ ipv6_netfilter_fini();
+netfilter_fail:
+ igmp6_cleanup();
+igmp_fail:
+ ndisc_cleanup();
+ndisc_fail:
+ ip6_mr_cleanup();
+ipmr_fail:
+ icmpv6_cleanup();
+icmp_fail:
+ unregister_pernet_subsys(&inet6_net_ops);
+register_pernet_fail:
+ sock_unregister(PF_INET6);
+ rtnl_unregister_all(PF_INET6);
+out_sock_register_fail:
+ rawv6_exit();
+out_unregister_ping_proto:
+ proto_unregister(&pingv6_prot);
+out_unregister_raw_proto:
+ proto_unregister(&rawv6_prot);
+out_unregister_udplite_proto:
+ proto_unregister(&udplitev6_prot);
+out_unregister_udp_proto:
+ proto_unregister(&udpv6_prot);
+out_unregister_tcp_proto:
+ proto_unregister(&tcpv6_prot);
+ goto out;
+}
+module_init(inet6_init);
+
+MODULE_ALIAS_NETPROTO(PF_INET6);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
new file mode 100644
index 000000000..ed7d4e3f9
--- /dev/null
+++ b/net/ipv6/ah6.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (C)2002 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Authors
+ *
+ * Mitsuru KANDA @USAGI : IPv6 Support
+ * Kazunori MIYAZAWA @USAGI :
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * This file is derived from net/ipv4/ah.c.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <crypto/hash.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <net/ip6_route.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+#define IPV6HDR_BASELEN 8
+
+struct tmp_ext {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ struct in6_addr saddr;
+#endif
+ struct in6_addr daddr;
+ char hdrs[0];
+};
+
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash) +
+ (crypto_ahash_alignmask(ahash) &
+ ~(crypto_tfm_ctx_alignment() - 1));
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline struct tmp_ext *ah_tmp_ext(void *base)
+{
+ return base + IPV6HDR_BASELEN;
+}
+
+static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+ unsigned int offset)
+{
+ return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
+
+static bool zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
+{
+ u8 *opt = (u8 *)opthdr;
+ int len = ipv6_optlen(opthdr);
+ int off = 0;
+ int optlen = 0;
+
+ off += 2;
+ len -= 2;
+
+ while (len > 0) {
+
+ switch (opt[off]) {
+
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ default:
+ if (len < 2)
+ goto bad;
+ optlen = opt[off+1]+2;
+ if (len < optlen)
+ goto bad;
+ if (opt[off] & 0x20)
+ memset(&opt[off+2], 0, opt[off+1]);
+ break;
+ }
+
+ off += optlen;
+ len -= optlen;
+ }
+ if (len == 0)
+ return true;
+
+bad:
+ return false;
+}
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+/**
+ * ipv6_rearrange_destopt - rearrange IPv6 destination options header
+ * @iph: IPv6 header
+ * @destopt: destionation options header
+ */
+static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt)
+{
+ u8 *opt = (u8 *)destopt;
+ int len = ipv6_optlen(destopt);
+ int off = 0;
+ int optlen = 0;
+
+ off += 2;
+ len -= 2;
+
+ while (len > 0) {
+
+ switch (opt[off]) {
+
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ default:
+ if (len < 2)
+ goto bad;
+ optlen = opt[off+1]+2;
+ if (len < optlen)
+ goto bad;
+
+ /* Rearrange the source address in @iph and the
+ * addresses in home address option for final source.
+ * See 11.3.2 of RFC 3775 for details.
+ */
+ if (opt[off] == IPV6_TLV_HAO) {
+ struct in6_addr final_addr;
+ struct ipv6_destopt_hao *hao;
+
+ hao = (struct ipv6_destopt_hao *)&opt[off];
+ if (hao->length != sizeof(hao->addr)) {
+ net_warn_ratelimited("destopt hao: invalid header length: %u\n",
+ hao->length);
+ goto bad;
+ }
+ final_addr = hao->addr;
+ hao->addr = iph->saddr;
+ iph->saddr = final_addr;
+ }
+ break;
+ }
+
+ off += optlen;
+ len -= optlen;
+ }
+ /* Note: ok if len == 0 */
+bad:
+ return;
+}
+#else
+static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {}
+#endif
+
+/**
+ * ipv6_rearrange_rthdr - rearrange IPv6 routing header
+ * @iph: IPv6 header
+ * @rthdr: routing header
+ *
+ * Rearrange the destination address in @iph and the addresses in @rthdr
+ * so that they appear in the order they will at the final destination.
+ * See Appendix A2 of RFC 2402 for details.
+ */
+static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
+{
+ int segments, segments_left;
+ struct in6_addr *addrs;
+ struct in6_addr final_addr;
+
+ segments_left = rthdr->segments_left;
+ if (segments_left == 0)
+ return;
+ rthdr->segments_left = 0;
+
+ /* The value of rthdr->hdrlen has been verified either by the system
+ * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming
+ * packets. So we can assume that it is even and that segments is
+ * greater than or equal to segments_left.
+ *
+ * For the same reason we can assume that this option is of type 0.
+ */
+ segments = rthdr->hdrlen >> 1;
+
+ addrs = ((struct rt0_hdr *)rthdr)->addr;
+ final_addr = addrs[segments - 1];
+
+ addrs += segments - segments_left;
+ memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs));
+
+ addrs[0] = iph->daddr;
+ iph->daddr = final_addr;
+}
+
+static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
+{
+ union {
+ struct ipv6hdr *iph;
+ struct ipv6_opt_hdr *opth;
+ struct ipv6_rt_hdr *rth;
+ char *raw;
+ } exthdr = { .iph = iph };
+ char *end = exthdr.raw + len;
+ int nexthdr = iph->nexthdr;
+
+ exthdr.iph++;
+
+ while (exthdr.raw < end) {
+ switch (nexthdr) {
+ case NEXTHDR_DEST:
+ if (dir == XFRM_POLICY_OUT)
+ ipv6_rearrange_destopt(iph, exthdr.opth);
+ case NEXTHDR_HOP:
+ if (!zero_out_mutable_opts(exthdr.opth)) {
+ net_dbg_ratelimited("overrun %sopts\n",
+ nexthdr == NEXTHDR_HOP ?
+ "hop" : "dest");
+ return -EINVAL;
+ }
+ break;
+
+ case NEXTHDR_ROUTING:
+ ipv6_rearrange_rthdr(iph, exthdr.rth);
+ break;
+
+ default:
+ return 0;
+ }
+
+ nexthdr = exthdr.opth->nexthdr;
+ exthdr.raw += ipv6_optlen(exthdr.opth);
+ }
+
+ return 0;
+}
+
+static void ah6_output_done(struct crypto_async_request *base, int err)
+{
+ int extlen;
+ u8 *iph_base;
+ u8 *icv;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct ipv6hdr *top_iph = ipv6_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ struct tmp_ext *iph_ext;
+
+ extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+ if (extlen)
+ extlen += sizeof(*iph_ext);
+
+ iph_base = AH_SKB_CB(skb)->tmp;
+ iph_ext = ah_tmp_ext(iph_base);
+ icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen);
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+ memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
+
+ if (extlen) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ memcpy(&top_iph->saddr, iph_ext, extlen);
+#else
+ memcpy(&top_iph->daddr, iph_ext, extlen);
+#endif
+ }
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ int nfrags;
+ int extlen;
+ u8 *iph_base;
+ u8 *icv;
+ u8 nexthdr;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct ipv6hdr *top_iph;
+ struct ip_auth_hdr *ah;
+ struct ah_data *ahp;
+ struct tmp_ext *iph_ext;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+ nfrags = err;
+
+ skb_push(skb, -skb_network_offset(skb));
+ extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+ if (extlen)
+ extlen += sizeof(*iph_ext);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+ err = -ENOMEM;
+ iph_base = ah_alloc_tmp(ahash, nfrags + sglists, IPV6HDR_BASELEN +
+ extlen + seqhi_len);
+ if (!iph_base)
+ goto out;
+
+ iph_ext = ah_tmp_ext(iph_base);
+ seqhi = (__be32 *)((char *)iph_ext + extlen);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ ah = ip_auth_hdr(skb);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ top_iph = ipv6_hdr(skb);
+ top_iph->payload_len = htons(skb->len - sizeof(*top_iph));
+
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
+
+ /* When there are no extension headers, we only need to save the first
+ * 8 bytes of the base IP header.
+ */
+ memcpy(iph_base, top_iph, IPV6HDR_BASELEN);
+
+ if (extlen) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ memcpy(iph_ext, &top_iph->saddr, extlen);
+#else
+ memcpy(iph_ext, &top_iph->daddr, extlen);
+#endif
+ err = ipv6_clear_mutable_options(top_iph,
+ extlen - sizeof(*iph_ext) +
+ sizeof(*top_iph),
+ XFRM_POLICY_OUT);
+ if (err)
+ goto out_free;
+ }
+
+ ah->nexthdr = nexthdr;
+
+ top_iph->priority = 0;
+ top_iph->flow_lbl[0] = 0;
+ top_iph->flow_lbl[1] = 0;
+ top_iph->flow_lbl[2] = 0;
+ top_iph->hop_limit = 0;
+
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+ ah->reserved = 0;
+ ah->spi = x->id.spi;
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah6_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph_base;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+ memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
+
+ if (extlen) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ memcpy(&top_iph->saddr, iph_ext, extlen);
+#else
+ memcpy(&top_iph->daddr, iph_ext, extlen);
+#endif
+ }
+
+out_free:
+ kfree(iph_base);
+out:
+ return err;
+}
+
+static void ah6_input_done(struct crypto_async_request *base, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ u8 *work_iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int hdr_len = skb_network_header_len(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, hdr_len);
+ icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out;
+
+ err = ah->nexthdr;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, hdr_len);
+ __skb_pull(skb, ah_hlen + hdr_len);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
+
+
+static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ /*
+ * Before process AH
+ * [IPv6][Ext1][Ext2][AH][Dest][Payload]
+ * |<-------------->| hdr_len
+ *
+ * To erase AH:
+ * Keeping copy of cleared headers. After AH processing,
+ * Moving the pointer of skb->network_header by using skb_pull as long
+ * as AH header length. Then copy back the copy as long as hdr_len
+ * If destination header following AH exists, copy it into after [Ext2].
+ *
+ * |<>|[IPv6][Ext1][Ext2][Dest][Payload]
+ * There is offset of AH before IPv6 header after the process.
+ */
+
+ u8 *auth_data;
+ u8 *icv;
+ u8 *work_iph;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct ip_auth_hdr *ah;
+ struct ipv6hdr *ip6h;
+ struct ah_data *ahp;
+ u16 hdr_len;
+ u16 ah_hlen;
+ int nexthdr;
+ int nfrags;
+ int err = -ENOMEM;
+ int seqhi_len = 0;
+ __be32 *seqhi;
+ int sglists = 0;
+ struct scatterlist *seqhisg;
+
+ if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+ goto out;
+
+ /* We are going to _remove_ AH header to keep sockets happy,
+ * so... Later this can change. */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ hdr_len = skb_network_header_len(skb);
+ ah = (struct ip_auth_hdr *)skb->data;
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ nexthdr = ah->nexthdr;
+ ah_hlen = (ah->hdrlen + 2) << 2;
+
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+
+ if (!pskb_may_pull(skb, ah_hlen))
+ goto out;
+
+ err = skb_cow_data(skb, 0, &trailer);
+ if (err < 0)
+ goto out;
+ nfrags = err;
+
+ ah = (struct ip_auth_hdr *)skb->data;
+ ip6h = ipv6_hdr(skb);
+
+ skb_push(skb, hdr_len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists = 1;
+ seqhi_len = sizeof(*seqhi);
+ }
+
+ work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len +
+ ahp->icv_trunc_len + seqhi_len);
+ if (!work_iph)
+ goto out;
+
+ auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
+ seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
+ icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+ seqhisg = sg + nfrags;
+
+ memcpy(work_iph, ip6h, hdr_len);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+ if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN))
+ goto out_free;
+
+ ip6h->priority = 0;
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+ ip6h->hop_limit = 0;
+
+ sg_init_table(sg, nfrags + sglists);
+ skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ /* Attach seqhi sg right after packet payload */
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(seqhisg, seqhi, seqhi_len);
+ }
+
+ ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
+ ahash_request_set_callback(req, 0, ah6_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ goto out_free;
+ }
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
+ if (err)
+ goto out_free;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, hdr_len);
+ __skb_pull(skb, ah_hlen + hdr_len);
+
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
+
+ err = nexthdr;
+
+out_free:
+ kfree(work_iph);
+out:
+ return err;
+}
+
+static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
+ struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+offset);
+ struct xfrm_state *x;
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int ah6_init_state(struct xfrm_state *x)
+{
+ struct ah_data *ahp = NULL;
+ struct xfrm_algo_desc *aalg_desc;
+ struct crypto_ahash *ahash;
+
+ if (!x->aalg)
+ goto error;
+
+ if (x->encap)
+ goto error;
+
+ ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+ if (!ahp)
+ return -ENOMEM;
+
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash))
+ goto error;
+
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
+ goto error;
+
+ /*
+ * Lookup the algorithm description maintained by xfrm_algo,
+ * verify crypto transform properties, and store information
+ * we need for AH processing. This lookup cannot fail here
+ * after a successful crypto_alloc_hash().
+ */
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_ahash_digestsize(ahash)) {
+ pr_info("AH: %s digestsize %u != %hu\n",
+ x->aalg->alg_name, crypto_ahash_digestsize(ahash),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
+ goto error;
+ }
+
+ ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ switch (x->props.mode) {
+ case XFRM_MODE_BEET:
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ goto error;
+ }
+ x->data = ahp;
+
+ return 0;
+
+error:
+ if (ahp) {
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+ }
+ return -EINVAL;
+}
+
+static void ah6_destroy(struct xfrm_state *x)
+{
+ struct ah_data *ahp = x->data;
+
+ if (!ahp)
+ return;
+
+ crypto_free_ahash(ahp->ahash);
+ kfree(ahp);
+}
+
+static int ah6_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ah6_type = {
+ .description = "AH6",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = ah6_init_state,
+ .destructor = ah6_destroy,
+ .input = ah6_input,
+ .output = ah6_output,
+ .hdr_offset = xfrm6_find_1stfragopt,
+};
+
+static struct xfrm6_protocol ah6_protocol = {
+ .handler = xfrm6_rcv,
+ .cb_handler = ah6_rcv_cb,
+ .err_handler = ah6_err,
+ .priority = 0,
+};
+
+static int __init ah6_init(void)
+{
+ if (xfrm_register_type(&ah6_type, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+
+ if (xfrm6_protocol_register(&ah6_protocol, IPPROTO_AH) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ah6_type, AF_INET6);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void __exit ah6_fini(void)
+{
+ if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+
+ if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+
+}
+
+module_init(ah6_init);
+module_exit(ah6_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
new file mode 100644
index 000000000..514ac259f
--- /dev/null
+++ b/net/ipv6/anycast.c
@@ -0,0 +1,556 @@
+/*
+ * Anycast support for IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * David L Stevens (dlstevens@us.ibm.com)
+ *
+ * based heavily on net/ipv6/mcast.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/if_inet6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+
+#include <net/checksum.h>
+
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
+
+/*
+ * socket join an anycast group
+ */
+
+int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net_device *dev = NULL;
+ struct inet6_dev *idev;
+ struct ipv6_ac_socklist *pac;
+ struct net *net = sock_net(sk);
+ int ishost = !net->ipv6.devconf_all->forwarding;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (ipv6_addr_is_multicast(addr))
+ return -EINVAL;
+ if (ipv6_chk_addr(net, addr, NULL, 0))
+ return -EINVAL;
+
+ pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
+ if (!pac)
+ return -ENOMEM;
+ pac->acl_next = NULL;
+ pac->acl_addr = *addr;
+
+ if (ifindex == 0) {
+ struct rt6_info *rt;
+
+ rt = rt6_lookup(net, addr, NULL, 0, 0);
+ if (rt) {
+ dev = rt->dst.dev;
+ ip6_rt_put(rt);
+ } else if (ishost) {
+ err = -EADDRNOTAVAIL;
+ goto error;
+ } else {
+ /* router, no matching interface: just pick one */
+ dev = __dev_get_by_flags(net, IFF_UP,
+ IFF_UP | IFF_LOOPBACK);
+ }
+ } else
+ dev = __dev_get_by_index(net, ifindex);
+
+ if (!dev) {
+ err = -ENODEV;
+ goto error;
+ }
+
+ idev = __in6_dev_get(dev);
+ if (!idev) {
+ if (ifindex)
+ err = -ENODEV;
+ else
+ err = -EADDRNOTAVAIL;
+ goto error;
+ }
+ /* reset ishost, now that we have a specific device */
+ ishost = !idev->cnf.forwarding;
+
+ pac->acl_ifindex = dev->ifindex;
+
+ /* XXX
+ * For hosts, allow link-local or matching prefix anycasts.
+ * This obviates the need for propagating anycast routes while
+ * still allowing some non-router anycast participation.
+ */
+ if (!ipv6_chk_prefix(addr, dev)) {
+ if (ishost)
+ err = -EADDRNOTAVAIL;
+ if (err)
+ goto error;
+ }
+
+ err = __ipv6_dev_ac_inc(idev, addr);
+ if (!err) {
+ pac->acl_next = np->ipv6_ac_list;
+ np->ipv6_ac_list = pac;
+ pac = NULL;
+ }
+
+error:
+ if (pac)
+ sock_kfree_s(sk, pac, sizeof(*pac));
+ return err;
+}
+
+/*
+ * socket leave an anycast group
+ */
+int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net_device *dev;
+ struct ipv6_ac_socklist *pac, *prev_pac;
+ struct net *net = sock_net(sk);
+
+ ASSERT_RTNL();
+
+ prev_pac = NULL;
+ for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) {
+ if ((ifindex == 0 || pac->acl_ifindex == ifindex) &&
+ ipv6_addr_equal(&pac->acl_addr, addr))
+ break;
+ prev_pac = pac;
+ }
+ if (!pac)
+ return -ENOENT;
+ if (prev_pac)
+ prev_pac->acl_next = pac->acl_next;
+ else
+ np->ipv6_ac_list = pac->acl_next;
+
+ dev = __dev_get_by_index(net, pac->acl_ifindex);
+ if (dev)
+ ipv6_dev_ac_dec(dev, &pac->acl_addr);
+
+ sock_kfree_s(sk, pac, sizeof(*pac));
+ return 0;
+}
+
+void ipv6_sock_ac_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net_device *dev = NULL;
+ struct ipv6_ac_socklist *pac;
+ struct net *net = sock_net(sk);
+ int prev_index;
+
+ if (!np->ipv6_ac_list)
+ return;
+
+ rtnl_lock();
+ pac = np->ipv6_ac_list;
+ np->ipv6_ac_list = NULL;
+
+ prev_index = 0;
+ while (pac) {
+ struct ipv6_ac_socklist *next = pac->acl_next;
+
+ if (pac->acl_ifindex != prev_index) {
+ dev = __dev_get_by_index(net, pac->acl_ifindex);
+ prev_index = pac->acl_ifindex;
+ }
+ if (dev)
+ ipv6_dev_ac_dec(dev, &pac->acl_addr);
+ sock_kfree_s(sk, pac, sizeof(*pac));
+ pac = next;
+ }
+ rtnl_unlock();
+}
+
+static void aca_get(struct ifacaddr6 *aca)
+{
+ atomic_inc(&aca->aca_refcnt);
+}
+
+static void aca_put(struct ifacaddr6 *ac)
+{
+ if (atomic_dec_and_test(&ac->aca_refcnt)) {
+ in6_dev_put(ac->aca_idev);
+ dst_release(&ac->aca_rt->dst);
+ kfree(ac);
+ }
+}
+
+static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
+ const struct in6_addr *addr)
+{
+ struct inet6_dev *idev = rt->rt6i_idev;
+ struct ifacaddr6 *aca;
+
+ aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
+ if (!aca)
+ return NULL;
+
+ aca->aca_addr = *addr;
+ in6_dev_hold(idev);
+ aca->aca_idev = idev;
+ aca->aca_rt = rt;
+ aca->aca_users = 1;
+ /* aca_tstamp should be updated upon changes */
+ aca->aca_cstamp = aca->aca_tstamp = jiffies;
+ atomic_set(&aca->aca_refcnt, 1);
+
+ return aca;
+}
+
+/*
+ * device anycast group inc (add if not found)
+ */
+int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+ struct ifacaddr6 *aca;
+ struct rt6_info *rt;
+ int err;
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&idev->lock);
+ if (idev->dead) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ if (ipv6_addr_equal(&aca->aca_addr, addr)) {
+ aca->aca_users++;
+ err = 0;
+ goto out;
+ }
+ }
+
+ rt = addrconf_dst_alloc(idev, addr, true);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto out;
+ }
+ aca = aca_alloc(rt, addr);
+ if (!aca) {
+ ip6_rt_put(rt);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ aca->aca_next = idev->ac_list;
+ idev->ac_list = aca;
+
+ /* Hold this for addrconf_join_solict() below before we unlock,
+ * it is already exposed via idev->ac_list.
+ */
+ aca_get(aca);
+ write_unlock_bh(&idev->lock);
+
+ ip6_ins_rt(rt);
+
+ addrconf_join_solict(idev->dev, &aca->aca_addr);
+
+ aca_put(aca);
+ return 0;
+out:
+ write_unlock_bh(&idev->lock);
+ return err;
+}
+
+/*
+ * device anycast group decrement
+ */
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+ struct ifacaddr6 *aca, *prev_aca;
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&idev->lock);
+ prev_aca = NULL;
+ for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ if (ipv6_addr_equal(&aca->aca_addr, addr))
+ break;
+ prev_aca = aca;
+ }
+ if (!aca) {
+ write_unlock_bh(&idev->lock);
+ return -ENOENT;
+ }
+ if (--aca->aca_users > 0) {
+ write_unlock_bh(&idev->lock);
+ return 0;
+ }
+ if (prev_aca)
+ prev_aca->aca_next = aca->aca_next;
+ else
+ idev->ac_list = aca->aca_next;
+ write_unlock_bh(&idev->lock);
+ addrconf_leave_solict(idev, &aca->aca_addr);
+
+ dst_hold(&aca->aca_rt->dst);
+ ip6_del_rt(aca->aca_rt);
+
+ aca_put(aca);
+ return 0;
+}
+
+/* called with rtnl_lock() */
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev)
+ return -ENODEV;
+ return __ipv6_dev_ac_dec(idev, addr);
+}
+
+void ipv6_ac_destroy_dev(struct inet6_dev *idev)
+{
+ struct ifacaddr6 *aca;
+
+ write_lock_bh(&idev->lock);
+ while ((aca = idev->ac_list) != NULL) {
+ idev->ac_list = aca->aca_next;
+ write_unlock_bh(&idev->lock);
+
+ addrconf_leave_solict(idev, &aca->aca_addr);
+
+ dst_hold(&aca->aca_rt->dst);
+ ip6_del_rt(aca->aca_rt);
+
+ aca_put(aca);
+
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
+}
+
+/*
+ * check if the interface has this anycast address
+ * called with rcu_read_lock()
+ */
+static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr)
+{
+ struct inet6_dev *idev;
+ struct ifacaddr6 *aca;
+
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ for (aca = idev->ac_list; aca; aca = aca->aca_next)
+ if (ipv6_addr_equal(&aca->aca_addr, addr))
+ break;
+ read_unlock_bh(&idev->lock);
+ return aca != NULL;
+ }
+ return false;
+}
+
+/*
+ * check if given interface (or any, if dev==0) has this anycast address
+ */
+bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ bool found = false;
+
+ rcu_read_lock();
+ if (dev)
+ found = ipv6_chk_acast_dev(dev, addr);
+ else
+ for_each_netdev_rcu(net, dev)
+ if (ipv6_chk_acast_dev(dev, addr)) {
+ found = true;
+ break;
+ }
+ rcu_read_unlock();
+ return found;
+}
+
+/* check if this anycast address is link-local on given interface or
+ * is global
+ */
+bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ return ipv6_chk_acast_addr(net,
+ (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ?
+ dev : NULL),
+ addr);
+}
+
+#ifdef CONFIG_PROC_FS
+struct ac6_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+};
+
+#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private)
+
+static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
+{
+ struct ifacaddr6 *im = NULL;
+ struct ac6_iter_state *state = ac6_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+
+ state->idev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct inet6_dev *idev;
+ idev = __in6_dev_get(state->dev);
+ if (!idev)
+ continue;
+ read_lock_bh(&idev->lock);
+ im = idev->ac_list;
+ if (im) {
+ state->idev = idev;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ return im;
+}
+
+static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
+{
+ struct ac6_iter_state *state = ac6_seq_private(seq);
+
+ im = im->aca_next;
+ while (!im) {
+ if (likely(state->idev != NULL))
+ read_unlock_bh(&state->idev->lock);
+
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->idev = NULL;
+ break;
+ }
+ state->idev = __in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+ read_lock_bh(&state->idev->lock);
+ im = state->idev->ac_list;
+ }
+ return im;
+}
+
+static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ifacaddr6 *im = ac6_get_first(seq);
+ if (im)
+ while (pos && (im = ac6_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *ac6_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ac6_get_idx(seq, *pos);
+}
+
+static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ifacaddr6 *im = ac6_get_next(seq, v);
+
+ ++*pos;
+ return im;
+}
+
+static void ac6_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ struct ac6_iter_state *state = ac6_seq_private(seq);
+
+ if (likely(state->idev != NULL)) {
+ read_unlock_bh(&state->idev->lock);
+ state->idev = NULL;
+ }
+ rcu_read_unlock();
+}
+
+static int ac6_seq_show(struct seq_file *seq, void *v)
+{
+ struct ifacaddr6 *im = (struct ifacaddr6 *)v;
+ struct ac6_iter_state *state = ac6_seq_private(seq);
+
+ seq_printf(seq, "%-4d %-15s %pi6 %5d\n",
+ state->dev->ifindex, state->dev->name,
+ &im->aca_addr, im->aca_users);
+ return 0;
+}
+
+static const struct seq_operations ac6_seq_ops = {
+ .start = ac6_seq_start,
+ .next = ac6_seq_next,
+ .stop = ac6_seq_stop,
+ .show = ac6_seq_show,
+};
+
+static int ac6_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ac6_seq_ops,
+ sizeof(struct ac6_iter_state));
+}
+
+static const struct file_operations ac6_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ac6_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+int __net_init ac6_proc_init(struct net *net)
+{
+ if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ac6_proc_exit(struct net *net)
+{
+ remove_proc_entry("anycast6", net->proc_net);
+}
+#endif
+
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
new file mode 100644
index 000000000..62d908e64
--- /dev/null
+++ b/net/ipv6/datagram.c
@@ -0,0 +1,971 @@
+/*
+ * common UDP/RAW code
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/tcp_states.h>
+#include <net/dsfield.h>
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+static bool ipv6_mapped_addr_any(const struct in6_addr *a)
+{
+ return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
+}
+
+int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *daddr, *final_p, final;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ struct ip6_flowlabel *flowlabel = NULL;
+ struct ipv6_txoptions *opt;
+ int addr_type;
+ int err;
+
+ if (usin->sin6_family == AF_INET) {
+ if (__ipv6_only_sock(sk))
+ return -EAFNOSUPPORT;
+ err = ip4_datagram_connect(sk, uaddr, addr_len);
+ goto ipv4_connected;
+ }
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl6, 0, sizeof(fl6));
+ if (np->sndflow) {
+ fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ }
+ }
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ /*
+ * connect to self
+ */
+ usin->sin6_addr.s6_addr[15] = 0x01;
+ }
+
+ daddr = &usin->sin6_addr;
+
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ struct sockaddr_in sin;
+
+ if (__ipv6_only_sock(sk)) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = daddr->s6_addr32[3];
+ sin.sin_port = usin->sin6_port;
+
+ err = ip4_datagram_connect(sk,
+ (struct sockaddr *) &sin,
+ sizeof(sin));
+
+ipv4_connected:
+ if (err)
+ goto out;
+
+ ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr);
+
+ if (ipv6_addr_any(&np->saddr) ||
+ ipv6_mapped_addr_any(&np->saddr))
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+ ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) {
+ ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+ &sk->sk_v6_rcv_saddr);
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+
+ goto out;
+ }
+
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id) {
+ err = -EINVAL;
+ goto out;
+ }
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
+ sk->sk_bound_dev_if = np->mcast_oif;
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if) {
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ sk->sk_v6_daddr = *daddr;
+ np->flow_label = fl6.flowlabel;
+
+ inet->inet_dport = usin->sin6_port;
+
+ /*
+ * Check for a route to destination an obtain the
+ * destination cache for it.
+ */
+
+ fl6.flowi6_proto = sk->sk_protocol;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = inet->inet_dport;
+ fl6.fl6_sport = inet->inet_sport;
+
+ if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
+ fl6.flowi6_oif = np->mcast_oif;
+
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ opt = flowlabel ? flowlabel->opt : np->opt;
+ final_p = fl6_update_dst(&fl6, opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ err = 0;
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto out;
+ }
+
+ /* source address lookup done in ip6_dst_lookup */
+
+ if (ipv6_addr_any(&np->saddr))
+ np->saddr = fl6.saddr;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+ sk->sk_v6_rcv_saddr = fl6.saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
+
+ ip6_dst_store(sk, dst,
+ ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+ &sk->sk_v6_daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+ ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+ &np->saddr :
+#endif
+ NULL);
+
+ sk->sk_state = TCP_ESTABLISHED;
+ ip6_set_txhash(sk);
+out:
+ fl6_sock_release(flowlabel);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_connect);
+
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr);
+ if (sin6->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+ return ip6_datagram_connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only);
+
+void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct icmp6hdr *icmph = icmp6_hdr(skb);
+ struct sock_exterr_skb *serr;
+
+ if (!np->recverr)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb->protocol = htons(ETH_P_IPV6);
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6;
+ serr->ee.ee_type = icmph->icmp6_type;
+ serr->ee.ee_code = icmph->icmp6_code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) -
+ skb_network_header(skb);
+ serr->port = port;
+
+ __skb_pull(skb, payload - skb->data);
+ skb_reset_transport_header(skb);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct ipv6hdr *iph;
+ struct sk_buff *skb;
+
+ if (!np->recverr)
+ return;
+
+ skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb->protocol = htons(ETH_P_IPV6);
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
+ iph->daddr = fl6->daddr;
+
+ serr = SKB_EXT_ERR(skb);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = 0;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_info = info;
+ serr->ee.ee_data = 0;
+ serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+ serr->port = fl6->fl6_dport;
+
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
+
+ if (sock_queue_err_skb(sk, skb))
+ kfree_skb(skb);
+}
+
+void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *iph;
+ struct sk_buff *skb;
+ struct ip6_mtuinfo *mtu_info;
+
+ if (!np->rxopt.bits.rxpmtu)
+ return;
+
+ skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
+ iph->daddr = fl6->daddr;
+
+ mtu_info = IP6CBMTU(skb);
+
+ mtu_info->ip6m_mtu = mtu;
+ mtu_info->ip6m_addr.sin6_family = AF_INET6;
+ mtu_info->ip6m_addr.sin6_port = 0;
+ mtu_info->ip6m_addr.sin6_flowinfo = 0;
+ mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif;
+ mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr;
+
+ __skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+ skb_reset_transport_header(skb);
+
+ skb = xchg(&np->rxpmtu, skb);
+ kfree_skb(skb);
+}
+
+/* For some errors we have valid addr_offset even with zero payload and
+ * zero port. Also, addr_offset should be supported if port is set.
+ */
+static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr)
+{
+ return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port;
+}
+
+/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL.
+ *
+ * At one point, excluding local errors was a quick test to identify icmp/icmp6
+ * errors. This is no longer true, but the test remained, so the v6 stack,
+ * unlike v4, also honors cmsg requests on all wifi and timestamp errors.
+ *
+ * Timestamp code paths do not initialize the fields expected by cmsg:
+ * the PKTINFO fields in skb->cb[]. Fill those in here.
+ */
+static bool ip6_datagram_support_cmsg(struct sk_buff *skb,
+ struct sock_exterr_skb *serr)
+{
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6)
+ return true;
+
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL)
+ return false;
+
+ if (!skb->dev)
+ return false;
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ IP6CB(skb)->iif = skb->dev->ifindex;
+ else
+ PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex;
+
+ return true;
+}
+
+/*
+ * Handle MSG_ERRQUEUE
+ */
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
+ struct {
+ struct sock_extended_err ee;
+ struct sockaddr_in6 offender;
+ } errhdr;
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ serr = SKB_EXT_ERR(skb);
+
+ if (sin && ipv6_datagram_support_addr(serr)) {
+ const unsigned char *nh = skb_network_header(skb);
+ sin->sin6_family = AF_INET6;
+ sin->sin6_flowinfo = 0;
+ sin->sin6_port = serr->port;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
+ struct ipv6hdr, daddr);
+ sin->sin6_addr = ip6h->daddr;
+ if (np->sndflow)
+ sin->sin6_flowinfo = ip6_flowinfo(ip6h);
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
+ } else {
+ ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
+ &sin->sin6_addr);
+ sin->sin6_scope_id = 0;
+ }
+ *addr_len = sizeof(*sin);
+ }
+
+ memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+ sin = &errhdr.offender;
+ memset(sin, 0, sizeof(*sin));
+
+ if (ip6_datagram_support_cmsg(skb, serr)) {
+ sin->sin6_family = AF_INET6;
+ if (np->rxopt.all)
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ sin->sin6_addr = ipv6_hdr(skb)->saddr;
+ if (np->rxopt.all)
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ sin->sin6_scope_id =
+ ipv6_iface_scope_id(&sin->sin6_addr,
+ IP6CB(skb)->iif);
+ } else {
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+ &sin->sin6_addr);
+ if (inet_sk(sk)->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ }
+ }
+
+ put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr);
+
+ /* Now we could try to dump offended packet options */
+
+ msg->msg_flags |= MSG_ERRQUEUE;
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ipv6_recv_error);
+
+/*
+ * Handle IPV6_RECVPATHMTU
+ */
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
+ int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sk_buff *skb;
+ struct ip6_mtuinfo mtu_info;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name);
+ int err;
+ int copied;
+
+ err = -EAGAIN;
+ skb = xchg(&np->rxpmtu, NULL);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free_skb;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info));
+
+ if (sin) {
+ sin->sin6_family = AF_INET6;
+ sin->sin6_flowinfo = 0;
+ sin->sin6_port = 0;
+ sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id;
+ sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr;
+ *addr_len = sizeof(*sin);
+ }
+
+ put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info);
+
+ err = copied;
+
+out_free_skb:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+
+void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6);
+
+ if (np->rxopt.bits.rxinfo) {
+ struct in6_pktinfo src_info;
+
+ if (is_ipv6) {
+ src_info.ipi6_ifindex = IP6CB(skb)->iif;
+ src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
+ } else {
+ src_info.ipi6_ifindex =
+ PKTINFO_SKB_CB(skb)->ipi_ifindex;
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ &src_info.ipi6_addr);
+ }
+
+ if (src_info.ipi6_ifindex >= 0)
+ put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO,
+ sizeof(src_info), &src_info);
+ }
+}
+
+void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ unsigned char *nh = skb_network_header(skb);
+
+ if (np->rxopt.bits.rxhlim) {
+ int hlim = ipv6_hdr(skb)->hop_limit;
+ put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
+ }
+
+ if (np->rxopt.bits.rxtclass) {
+ int tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
+ }
+
+ if (np->rxopt.bits.rxflow) {
+ __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh);
+ if (flowinfo)
+ put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
+ }
+
+ /* HbH is allowed only once */
+ if (np->rxopt.bits.hopopts && opt->hop) {
+ u8 *ptr = nh + opt->hop;
+ put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
+ }
+
+ if (opt->lastopt &&
+ (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) {
+ /*
+ * Silly enough, but we need to reparse in order to
+ * report extension headers (except for HbH)
+ * in order.
+ *
+ * Also note that IPV6_RECVRTHDRDSTOPTS is NOT
+ * (and WILL NOT be) defined because
+ * IPV6_RECVDSTOPTS is more generic. --yoshfuji
+ */
+ unsigned int off = sizeof(struct ipv6hdr);
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+
+ while (off <= opt->lastopt) {
+ unsigned int len;
+ u8 *ptr = nh + off;
+
+ switch (nexthdr) {
+ case IPPROTO_DSTOPTS:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ if (np->rxopt.bits.dstopts)
+ put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr);
+ break;
+ case IPPROTO_ROUTING:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ if (np->rxopt.bits.srcrt)
+ put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr);
+ break;
+ case IPPROTO_AH:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 2) << 2;
+ break;
+ default:
+ nexthdr = ptr[0];
+ len = (ptr[1] + 1) << 3;
+ break;
+ }
+
+ off += len;
+ }
+ }
+
+ /* socket options in old style */
+ if (np->rxopt.bits.rxoinfo) {
+ struct in6_pktinfo src_info;
+
+ src_info.ipi6_ifindex = opt->iif;
+ src_info.ipi6_addr = ipv6_hdr(skb)->daddr;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+ }
+ if (np->rxopt.bits.rxohlim) {
+ int hlim = ipv6_hdr(skb)->hop_limit;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+ }
+ if (np->rxopt.bits.ohopopts && opt->hop) {
+ u8 *ptr = nh + opt->hop;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.odstopts && opt->dst0) {
+ u8 *ptr = nh + opt->dst0;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.osrcrt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt);
+ put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+ }
+ if (np->rxopt.bits.odstopts && opt->dst1) {
+ u8 *ptr = nh + opt->dst1;
+ put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.rxorigdstaddr) {
+ struct sockaddr_in6 sin6;
+ __be16 *ports = (__be16 *) skb_transport_header(skb);
+
+ if (skb_transport_offset(skb) + 4 <= skb->len) {
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ipv6_hdr(skb)->daddr;
+ sin6.sin6_port = ports[1];
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id =
+ ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr,
+ opt->iif);
+
+ put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+ }
+ }
+}
+
+void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
+
+int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
+ struct msghdr *msg, struct flowi6 *fl6,
+ struct ipv6_txoptions *opt,
+ int *hlimit, int *tclass, int *dontfrag)
+{
+ struct in6_pktinfo *src_info;
+ struct cmsghdr *cmsg;
+ struct ipv6_rt_hdr *rthdr;
+ struct ipv6_opt_hdr *hdr;
+ int len;
+ int err = 0;
+
+ for_each_cmsghdr(cmsg, msg) {
+ int addr_type;
+
+ if (!CMSG_OK(msg, cmsg)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ if (cmsg->cmsg_level != SOL_IPV6)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case IPV6_PKTINFO:
+ case IPV6_2292PKTINFO:
+ {
+ struct net_device *dev = NULL;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+
+ if (src_info->ipi6_ifindex) {
+ if (fl6->flowi6_oif &&
+ src_info->ipi6_ifindex != fl6->flowi6_oif)
+ return -EINVAL;
+ fl6->flowi6_oif = src_info->ipi6_ifindex;
+ }
+
+ addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
+
+ rcu_read_lock();
+ if (fl6->flowi6_oif) {
+ dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ } else if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (addr_type != IPV6_ADDR_ANY) {
+ int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+ if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
+ !ipv6_chk_addr(net, &src_info->ipi6_addr,
+ strict ? dev : NULL, 0) &&
+ !ipv6_chk_acast_addr_src(net, dev,
+ &src_info->ipi6_addr))
+ err = -EINVAL;
+ else
+ fl6->saddr = src_info->ipi6_addr;
+ }
+
+ rcu_read_unlock();
+
+ if (err)
+ goto exit_f;
+
+ break;
+ }
+
+ case IPV6_FLOWINFO:
+ if (cmsg->cmsg_len < CMSG_LEN(4)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
+ if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ }
+ fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
+ break;
+
+ case IPV6_2292HOPOPTS:
+ case IPV6_HOPOPTS:
+ if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ opt->opt_nflen += len;
+ opt->hopopt = hdr;
+ break;
+
+ case IPV6_2292DSTOPTS:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ if (opt->dst1opt) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ opt->opt_flen += len;
+ opt->dst1opt = hdr;
+ break;
+
+ case IPV6_DSTOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ if (cmsg->cmsg_type == IPV6_DSTOPTS) {
+ opt->opt_flen += len;
+ opt->dst1opt = hdr;
+ } else {
+ opt->opt_nflen += len;
+ opt->dst0opt = hdr;
+ }
+ break;
+
+ case IPV6_2292RTHDR:
+ case IPV6_RTHDR:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
+
+ switch (rthdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (rthdr->hdrlen != 2 ||
+ rthdr->segments_left != 1) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ break;
+#endif
+ default:
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ len = ((rthdr->hdrlen + 1) << 3);
+
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ /* segments left must also match */
+ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ opt->opt_nflen += len;
+ opt->srcrt = rthdr;
+
+ if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) {
+ int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
+
+ opt->opt_nflen += dsthdrlen;
+ opt->dst0opt = opt->dst1opt;
+ opt->dst1opt = NULL;
+ opt->opt_flen -= dsthdrlen;
+ }
+
+ break;
+
+ case IPV6_2292HOPLIMIT:
+ case IPV6_HOPLIMIT:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ *hlimit = *(int *)CMSG_DATA(cmsg);
+ if (*hlimit < -1 || *hlimit > 0xff) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ break;
+
+ case IPV6_TCLASS:
+ {
+ int tc;
+
+ err = -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ goto exit_f;
+
+ tc = *(int *)CMSG_DATA(cmsg);
+ if (tc < -1 || tc > 0xff)
+ goto exit_f;
+
+ err = 0;
+ *tclass = tc;
+
+ break;
+ }
+
+ case IPV6_DONTFRAG:
+ {
+ int df;
+
+ err = -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ goto exit_f;
+
+ df = *(int *)CMSG_DATA(cmsg);
+ if (df < 0 || df > 1)
+ goto exit_f;
+
+ err = 0;
+ *dontfrag = df;
+
+ break;
+ }
+ default:
+ net_dbg_ratelimited("invalid cmsg type: %d\n",
+ cmsg->cmsg_type);
+ err = -EINVAL;
+ goto exit_f;
+ }
+ }
+
+exit_f:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl);
+
+void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
+ __u16 srcp, __u16 destp, int bucket)
+{
+ const struct in6_addr *dest, *src;
+
+ dest = &sp->sk_v6_daddr;
+ src = &sp->sk_v6_rcv_saddr;
+ seq_printf(seq,
+ "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
+ bucket,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp,
+ sp->sk_state,
+ sk_wmem_alloc_get(sp),
+ sk_rmem_alloc_get(sp),
+ 0, 0L, 0,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ 0,
+ sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ atomic_read(&sp->sk_drops));
+}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
new file mode 100644
index 000000000..7c07ce36a
--- /dev/null
+++ b/net/ipv6/esp6.c
@@ -0,0 +1,673 @@
+/*
+ * Copyright (C)2002 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Authors
+ *
+ * Mitsuru KANDA @USAGI : IPv6 Support
+ * Kazunori MIYAZAWA @USAGI :
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * This file is derived from net/ipv4/esp.c
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <linux/icmpv6.h>
+
+struct esp_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the upper 32 bits of the sequence number are
+ * placed at the front, if present. Followed by the IV, the request and finally
+ * the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
+{
+ unsigned int len;
+
+ len = seqihlen;
+
+ len += crypto_aead_ivsize(aead);
+
+ if (len) {
+ len += crypto_aead_alignmask(aead) &
+ ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ }
+
+ len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+ return crypto_aead_ivsize(aead) ?
+ PTR_ALIGN((u8 *)tmp + seqhilen,
+ crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+ struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_givcrypt_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_givcrypt_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+ struct aead_request *req;
+
+ req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+ crypto_tfm_ctx_alignment());
+ aead_request_set_tfm(req, aead);
+ return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+ struct aead_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+ struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_aead_reqsize(aead),
+ __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
+static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead;
+ struct aead_givcrypt_request *req;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ struct sk_buff *trailer;
+ void *tmp;
+ int blksize;
+ int clen;
+ int alen;
+ int plen;
+ int tfclen;
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ u8 *iv;
+ u8 *tail;
+ __be32 *seqhi;
+
+ /* skb is pure payload to encrypt */
+ aead = x->data;
+ alen = crypto_aead_authsize(aead);
+
+ tfclen = 0;
+ if (x->tfcpad) {
+ struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+ u32 padto;
+
+ padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+ if (skb->len < padto)
+ tfclen = padto - skb->len;
+ }
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ clen = ALIGN(skb->len + 2 + tfclen, blksize);
+ plen = clen - skb->len - tfclen;
+
+ err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+ if (err < 0)
+ goto error;
+ nfrags = err;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_givreq(aead, iv);
+ asg = esp_givreq_sg(aead, req);
+ sg = asg + sglists;
+
+ /* Fill padding... */
+ tail = skb_tail_pointer(trailer);
+ if (tfclen) {
+ memset(tail, 0, tfclen);
+ tail += tfclen;
+ }
+ do {
+ int i;
+ for (i = 0; i < plen - 2; i++)
+ tail[i] = i + 1;
+ } while (0);
+ tail[plen - 2] = plen - 2;
+ tail[plen - 1] = *skb_mac_header(skb);
+ pskb_put(skb, trailer, clen - skb->len + alen);
+
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ esph->spi = x->id.spi;
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg,
+ esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+ clen + alen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+ aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+ aead_givcrypt_set_assoc(req, asg, assoclen);
+ aead_givcrypt_set_giv(req, esph->enc_data,
+ XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ err = crypto_aead_givencrypt(req);
+ if (err == -EINPROGRESS)
+ goto error;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+
+ kfree(tmp);
+
+error:
+ return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err)
+{
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct crypto_aead *aead = x->data;
+ int alen = crypto_aead_authsize(aead);
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+ int elen = skb->len - hlen;
+ int hdr_len = skb_network_header_len(skb);
+ int padlen;
+ u8 nexthdr[2];
+
+ kfree(ESP_SKB_CB(skb)->tmp);
+
+ if (unlikely(err))
+ goto out;
+
+ if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+ BUG();
+
+ err = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen) {
+ net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+ padlen + 2, elen - alen);
+ goto out;
+ }
+
+ /* ... check padding bits here. Silly. :-) */
+
+ pskb_trim(skb, skb->len - alen - padlen - 2);
+ __skb_pull(skb, hlen);
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ skb_reset_transport_header(skb);
+ else
+ skb_set_transport_header(skb, -hdr_len);
+
+ err = nexthdr[1];
+
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
+
+out:
+ return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+
+ xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_esp_hdr *esph;
+ struct crypto_aead *aead = x->data;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ int ret = 0;
+ void *tmp;
+ __be32 *seqhi;
+ u8 *iv;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (elen <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nfrags = skb_cow_data(skb, 0, &trailer);
+ if (nfrags < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = -ENOMEM;
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2;
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp;
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ asg = esp_req_sg(aead, req);
+ sg = asg + sglists;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr *)skb->data;
+
+ /* Get ivec. This can be wrong, check against another impls. */
+ iv = esph->enc_data;
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ aead_request_set_crypt(req, sg, sg, elen, iv);
+ aead_request_set_assoc(req, asg, assoclen);
+
+ ret = crypto_aead_decrypt(req);
+ if (ret == -EINPROGRESS)
+ goto out;
+
+ ret = esp_input_done2(skb, ret);
+
+out:
+ return ret;
+}
+
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
+{
+ struct crypto_aead *aead = x->data;
+ u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ unsigned int net_adj;
+
+ if (x->props.mode != XFRM_MODE_TUNNEL)
+ net_adj = sizeof(struct ipv6hdr);
+ else
+ net_adj = 0;
+
+ return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
+ net_adj) & ~(blksize - 1)) + net_adj - 2;
+}
+
+static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset);
+ struct xfrm_state *x;
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static void esp6_destroy(struct xfrm_state *x)
+{
+ struct crypto_aead *aead = x->data;
+
+ if (!aead)
+ return;
+
+ crypto_free_aead(aead);
+}
+
+static int esp_init_aead(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ int err;
+
+ aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ err = crypto_aead_setkey(aead, x->aead->alg_key,
+ (x->aead->alg_key_len + 7) / 8);
+ if (err)
+ goto error;
+
+ err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+ if (err)
+ goto error;
+
+error:
+ return err;
+}
+
+static int esp_init_authenc(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -EINVAL;
+ if (!x->ealg)
+ goto error;
+
+ err = -ENAMETOOLONG;
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ x->data = aead;
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
+ goto error;
+
+ p = key;
+ rta = (void *)p;
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+ p += (x->aalg->alg_key_len + 7) / 8;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+ crypto_aead_authsize(aead)) {
+ pr_info("ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_aead_authsize(aead),
+ aalg_desc->uinfo.auth.icv_fullbits / 8);
+ goto free_key;
+ }
+
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err)
+ goto free_key;
+ }
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key);
+
+error:
+ return err;
+}
+
+static int esp6_init_state(struct xfrm_state *x)
+{
+ struct crypto_aead *aead;
+ u32 align;
+ int err;
+
+ if (x->encap)
+ return -EINVAL;
+
+ x->data = NULL;
+
+ if (x->aead)
+ err = esp_init_aead(x);
+ else
+ err = esp_init_authenc(x);
+
+ if (err)
+ goto error;
+
+ aead = x->data;
+
+ x->props.header_len = sizeof(struct ip_esp_hdr) +
+ crypto_aead_ivsize(aead);
+ switch (x->props.mode) {
+ case XFRM_MODE_BEET:
+ if (x->sel.family != AF_INET6)
+ x->props.header_len += IPV4_BEET_PHMAXLEN +
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+ break;
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ goto error;
+ }
+
+ align = ALIGN(crypto_aead_blocksize(aead), 4);
+ x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
+
+error:
+ return err;
+}
+
+static int esp6_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type esp6_type = {
+ .description = "ESP6",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = esp6_init_state,
+ .destructor = esp6_destroy,
+ .get_mtu = esp6_get_mtu,
+ .input = esp6_input,
+ .output = esp6_output,
+ .hdr_offset = xfrm6_find_1stfragopt,
+};
+
+static struct xfrm6_protocol esp6_protocol = {
+ .handler = xfrm6_rcv,
+ .cb_handler = esp6_rcv_cb,
+ .err_handler = esp6_err,
+ .priority = 0,
+};
+
+static int __init esp6_init(void)
+{
+ if (xfrm_register_type(&esp6_type, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm6_protocol_register(&esp6_protocol, IPPROTO_ESP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&esp6_type, AF_INET6);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void __exit esp6_fini(void)
+{
+ if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(esp6_init);
+module_exit(esp6_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
new file mode 100644
index 000000000..a7bbbe455
--- /dev/null
+++ b/net/ipv6/exthdrs.c
@@ -0,0 +1,874 @@
+/*
+ * Extension Header handling for IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Andi Kleen <ak@muc.de>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* Changes:
+ * yoshfuji : ensure not to overrun while parsing
+ * tlv options.
+ * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
+ * YOSHIFUJI Hideaki @USAGI Register inbound extension header
+ * handlers as inet6_protocol{}.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/xfrm.h>
+#endif
+
+#include <linux/uaccess.h>
+
+/*
+ * Parsing tlv encoded headers.
+ *
+ * Parsing function "func" returns true, if parsing succeed
+ * and false, if it failed.
+ * It MUST NOT touch skb->h.
+ */
+
+struct tlvtype_proc {
+ int type;
+ bool (*func)(struct sk_buff *skb, int offset);
+};
+
+/*********************
+ Generic functions
+ *********************/
+
+/* An unknown option is detected, decide what to do */
+
+static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
+{
+ switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) {
+ case 0: /* ignore */
+ return true;
+
+ case 1: /* drop packet */
+ break;
+
+ case 3: /* Send ICMP if not a multicast address and drop packet */
+ /* Actually, it is redundant check. icmp_send
+ will recheck in any case.
+ */
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
+ break;
+ case 2: /* send ICMP PARM PROB regardless and drop packet */
+ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
+ return false;
+ }
+
+ kfree_skb(skb);
+ return false;
+}
+
+/* Parse tlv encoded option header (hop-by-hop or destination) */
+
+static bool ip6_parse_tlv(const struct tlvtype_proc *procs, struct sk_buff *skb)
+{
+ const struct tlvtype_proc *curr;
+ const unsigned char *nh = skb_network_header(skb);
+ int off = skb_network_header_len(skb);
+ int len = (skb_transport_header(skb)[1] + 1) << 3;
+ int padlen = 0;
+
+ if (skb_transport_offset(skb) + len > skb_headlen(skb))
+ goto bad;
+
+ off += 2;
+ len -= 2;
+
+ while (len > 0) {
+ int optlen = nh[off + 1] + 2;
+ int i;
+
+ switch (nh[off]) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ padlen++;
+ if (padlen > 7)
+ goto bad;
+ break;
+
+ case IPV6_TLV_PADN:
+ /* RFC 2460 states that the purpose of PadN is
+ * to align the containing header to multiples
+ * of 8. 7 is therefore the highest valid value.
+ * See also RFC 4942, Section 2.1.9.5.
+ */
+ padlen += optlen;
+ if (padlen > 7)
+ goto bad;
+ /* RFC 4942 recommends receiving hosts to
+ * actively check PadN payload to contain
+ * only zeroes.
+ */
+ for (i = 2; i < optlen; i++) {
+ if (nh[off + i] != 0)
+ goto bad;
+ }
+ break;
+
+ default: /* Other TLV code so scan list */
+ if (optlen > len)
+ goto bad;
+ for (curr = procs; curr->type >= 0; curr++) {
+ if (curr->type == nh[off]) {
+ /* type specific length/alignment
+ checks will be performed in the
+ func(). */
+ if (curr->func(skb, off) == false)
+ return false;
+ break;
+ }
+ }
+ if (curr->type < 0) {
+ if (ip6_tlvopt_unknown(skb, off) == 0)
+ return false;
+ }
+ padlen = 0;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+
+ if (len == 0)
+ return true;
+bad:
+ kfree_skb(skb);
+ return false;
+}
+
+/*****************************
+ Destination options header.
+ *****************************/
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
+{
+ struct ipv6_destopt_hao *hao;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct in6_addr tmp_addr;
+ int ret;
+
+ if (opt->dsthao) {
+ net_dbg_ratelimited("hao duplicated\n");
+ goto discard;
+ }
+ opt->dsthao = opt->dst1;
+ opt->dst1 = 0;
+
+ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff);
+
+ if (hao->length != 16) {
+ net_dbg_ratelimited("hao invalid option length = %d\n",
+ hao->length);
+ goto discard;
+ }
+
+ if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
+ net_dbg_ratelimited("hao is not an unicast addr: %pI6\n",
+ &hao->addr);
+ goto discard;
+ }
+
+ ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
+ (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
+ if (unlikely(ret < 0))
+ goto discard;
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto discard;
+
+ /* update all variable using below by copied skbuff */
+ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) +
+ optoff);
+ ipv6h = ipv6_hdr(skb);
+ }
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ tmp_addr = ipv6h->saddr;
+ ipv6h->saddr = hao->addr;
+ hao->addr = tmp_addr;
+
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp(skb);
+
+ return true;
+
+ discard:
+ kfree_skb(skb);
+ return false;
+}
+#endif
+
+static const struct tlvtype_proc tlvprocdestopt_lst[] = {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ {
+ .type = IPV6_TLV_HAO,
+ .func = ipv6_dest_hao,
+ },
+#endif
+ {-1, NULL}
+};
+
+static int ipv6_destopt_rcv(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ __u16 dstbuf;
+#endif
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
+ !pskb_may_pull(skb, (skb_transport_offset(skb) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+ IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+
+ opt->lastopt = opt->dst1 = skb_network_header_len(skb);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ dstbuf = opt->dst1;
+#endif
+
+ if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
+ skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
+ opt = IP6CB(skb);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ opt->nhoff = dstbuf;
+#else
+ opt->nhoff = opt->dst1;
+#endif
+ return 1;
+ }
+
+ IP6_INC_STATS_BH(dev_net(dst->dev),
+ ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+ return -1;
+}
+
+/********************************
+ Routing header.
+ ********************************/
+
+/* called with rcu_read_lock() */
+static int ipv6_rthdr_rcv(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct in6_addr *addr = NULL;
+ struct in6_addr daddr;
+ struct inet6_dev *idev;
+ int n, i;
+ struct ipv6_rt_hdr *hdr;
+ struct rt0_hdr *rthdr;
+ struct net *net = dev_net(skb->dev);
+ int accept_source_route = net->ipv6.devconf_all->accept_source_route;
+
+ idev = __in6_dev_get(skb->dev);
+ if (idev && accept_source_route > idev->cnf.accept_source_route)
+ accept_source_route = idev->cnf.accept_source_route;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
+ !pskb_may_pull(skb, (skb_transport_offset(skb) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+
+ hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
+
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
+ skb->pkt_type != PACKET_HOST) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ if (hdr->segments_left == 0) {
+ switch (hdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ /* Silently discard type 2 header unless it was
+ * processed by own
+ */
+ if (!addr) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+
+ opt->lastopt = opt->srcrt = skb_network_header_len(skb);
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->dst0 = opt->dst1;
+ opt->dst1 = 0;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+ return 1;
+ }
+
+ switch (hdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (accept_source_route < 0)
+ goto unknown_rh;
+ /* Silently discard invalid RTH type 2 */
+ if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ break;
+#endif
+ default:
+ goto unknown_rh;
+ }
+
+ /*
+ * This is the routing header forwarding algorithm from
+ * RFC 2460, page 16.
+ */
+
+ n = hdr->hdrlen >> 1;
+
+ if (hdr->segments_left > n) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ /* We are about to mangle packet header. Be careful!
+ Do not damage packets queued somewhere.
+ */
+ if (skb_cloned(skb)) {
+ /* the copy is a forwarded packet */
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
+ }
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_NONE;
+
+ i = n - --hdr->segments_left;
+
+ rthdr = (struct rt0_hdr *) hdr;
+ addr = rthdr->addr;
+ addr += i - 1;
+
+ switch (hdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
+ (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
+ IPPROTO_ROUTING) < 0) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+
+ if (ipv6_addr_is_multicast(addr)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INADDRERRORS);
+ kfree_skb(skb);
+ return -1;
+ }
+
+ daddr = *addr;
+ *addr = ipv6_hdr(skb)->daddr;
+ ipv6_hdr(skb)->daddr = daddr;
+
+ skb_dst_drop(skb);
+ ip6_route_input(skb);
+ if (skb_dst(skb)->error) {
+ skb_push(skb, skb->data - skb_network_header(skb));
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+ 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+ goto looped_back;
+ }
+
+ skb_push(skb, skb->data - skb_network_header(skb));
+ dst_input(skb);
+ return -1;
+
+unknown_rh:
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ (&hdr->type) - skb_network_header(skb));
+ return -1;
+}
+
+static const struct inet6_protocol rthdr_protocol = {
+ .handler = ipv6_rthdr_rcv,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol destopt_protocol = {
+ .handler = ipv6_destopt_rcv,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol nodata_protocol = {
+ .handler = dst_discard,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+int __init ipv6_exthdrs_init(void)
+{
+ int ret;
+
+ ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+ if (ret)
+ goto out;
+
+ ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+ if (ret)
+ goto out_rthdr;
+
+ ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE);
+ if (ret)
+ goto out_destopt;
+
+out:
+ return ret;
+out_destopt:
+ inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+out_rthdr:
+ inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+ goto out;
+};
+
+void ipv6_exthdrs_exit(void)
+{
+ inet6_del_protocol(&nodata_protocol, IPPROTO_NONE);
+ inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+ inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+}
+
+/**********************************
+ Hop-by-hop options.
+ **********************************/
+
+/*
+ * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input().
+ */
+static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb)
+{
+ return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev);
+}
+
+static inline struct net *ipv6_skb_net(struct sk_buff *skb)
+{
+ return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev);
+}
+
+/* Router Alert as of RFC 2711 */
+
+static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
+{
+ const unsigned char *nh = skb_network_header(skb);
+
+ if (nh[optoff + 1] == 2) {
+ IP6CB(skb)->flags |= IP6SKB_ROUTERALERT;
+ memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra));
+ return true;
+ }
+ net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n",
+ nh[optoff + 1]);
+ kfree_skb(skb);
+ return false;
+}
+
+/* Jumbo payload */
+
+static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
+{
+ const unsigned char *nh = skb_network_header(skb);
+ struct net *net = ipv6_skb_net(skb);
+ u32 pkt_len;
+
+ if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
+ net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+ nh[optoff+1]);
+ IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
+ if (pkt_len <= IPV6_MAXPLEN) {
+ IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
+ return false;
+ }
+ if (ipv6_hdr(skb)->payload_len) {
+ IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
+ return false;
+ }
+
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
+ IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+ IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ }
+
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ goto drop;
+
+ return true;
+
+drop:
+ kfree_skb(skb);
+ return false;
+}
+
+static const struct tlvtype_proc tlvprochopopt_lst[] = {
+ {
+ .type = IPV6_TLV_ROUTERALERT,
+ .func = ipv6_hop_ra,
+ },
+ {
+ .type = IPV6_TLV_JUMBO,
+ .func = ipv6_hop_jumbo,
+ },
+ { -1, }
+};
+
+int ipv6_parse_hopopts(struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+ /*
+ * skb_network_header(skb) is equal to skb->data, and
+ * skb_network_header_len(skb) is always equal to
+ * sizeof(struct ipv6hdr) by definition of
+ * hop-by-hop options.
+ */
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) ||
+ !pskb_may_pull(skb, (sizeof(struct ipv6hdr) +
+ ((skb_transport_header(skb)[1] + 1) << 3)))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ opt->hop = sizeof(struct ipv6hdr);
+ if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+ skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;
+ opt = IP6CB(skb);
+ opt->nhoff = sizeof(struct ipv6hdr);
+ return 1;
+ }
+ return -1;
+}
+
+/*
+ * Creating outbound headers.
+ *
+ * "build" functions work when skb is filled from head to tail (datagram)
+ * "push" functions work when headers are added from tail to head (tcp)
+ *
+ * In both cases we assume, that caller reserved enough room
+ * for headers.
+ */
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p)
+{
+ struct rt0_hdr *phdr, *ihdr;
+ int hops;
+
+ ihdr = (struct rt0_hdr *) opt;
+
+ phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
+ memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
+
+ hops = ihdr->rt_hdr.hdrlen >> 1;
+
+ if (hops > 1)
+ memcpy(phdr->addr, ihdr->addr + 1,
+ (hops - 1) * sizeof(struct in6_addr));
+
+ phdr->addr[hops - 1] = **addr_p;
+ *addr_p = ihdr->addr;
+
+ phdr->rt_hdr.nexthdr = *proto;
+ *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
+
+ memcpy(h, opt, ipv6_optlen(opt));
+ h->nexthdr = *proto;
+ *proto = type;
+}
+
+void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+ u8 *proto,
+ struct in6_addr **daddr)
+{
+ if (opt->srcrt) {
+ ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+ /*
+ * IPV6_RTHDRDSTOPTS is ignored
+ * unless IPV6_RTHDR is set (RFC3542).
+ */
+ if (opt->dst0opt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+ }
+ if (opt->hopopt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+}
+EXPORT_SYMBOL(ipv6_push_nfrag_opts);
+
+void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
+{
+ if (opt->dst1opt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+}
+
+struct ipv6_txoptions *
+ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
+{
+ struct ipv6_txoptions *opt2;
+
+ opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+ if (opt2) {
+ long dif = (char *)opt2 - (char *)opt;
+ memcpy(opt2, opt, opt->tot_len);
+ if (opt2->hopopt)
+ *((char **)&opt2->hopopt) += dif;
+ if (opt2->dst0opt)
+ *((char **)&opt2->dst0opt) += dif;
+ if (opt2->dst1opt)
+ *((char **)&opt2->dst1opt) += dif;
+ if (opt2->srcrt)
+ *((char **)&opt2->srcrt) += dif;
+ }
+ return opt2;
+}
+EXPORT_SYMBOL_GPL(ipv6_dup_options);
+
+static int ipv6_renew_option(void *ohdr,
+ struct ipv6_opt_hdr __user *newopt, int newoptlen,
+ int inherit,
+ struct ipv6_opt_hdr **hdr,
+ char **p)
+{
+ if (inherit) {
+ if (ohdr) {
+ memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
+ *hdr = (struct ipv6_opt_hdr *)*p;
+ *p += CMSG_ALIGN(ipv6_optlen(*hdr));
+ }
+ } else {
+ if (newopt) {
+ if (copy_from_user(*p, newopt, newoptlen))
+ return -EFAULT;
+ *hdr = (struct ipv6_opt_hdr *)*p;
+ if (ipv6_optlen(*hdr) > newoptlen)
+ return -EINVAL;
+ *p += CMSG_ALIGN(newoptlen);
+ }
+ }
+ return 0;
+}
+
+struct ipv6_txoptions *
+ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
+ int newtype,
+ struct ipv6_opt_hdr __user *newopt, int newoptlen)
+{
+ int tot_len = 0;
+ char *p;
+ struct ipv6_txoptions *opt2;
+ int err;
+
+ if (opt) {
+ if (newtype != IPV6_HOPOPTS && opt->hopopt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt));
+ if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt));
+ if (newtype != IPV6_RTHDR && opt->srcrt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt));
+ if (newtype != IPV6_DSTOPTS && opt->dst1opt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
+ }
+
+ if (newopt && newoptlen)
+ tot_len += CMSG_ALIGN(newoptlen);
+
+ if (!tot_len)
+ return NULL;
+
+ tot_len += sizeof(*opt2);
+ opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC);
+ if (!opt2)
+ return ERR_PTR(-ENOBUFS);
+
+ memset(opt2, 0, tot_len);
+
+ opt2->tot_len = tot_len;
+ p = (char *)(opt2 + 1);
+
+ err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
+ newtype != IPV6_HOPOPTS,
+ &opt2->hopopt, &p);
+ if (err)
+ goto out;
+
+ err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
+ newtype != IPV6_RTHDRDSTOPTS,
+ &opt2->dst0opt, &p);
+ if (err)
+ goto out;
+
+ err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
+ newtype != IPV6_RTHDR,
+ (struct ipv6_opt_hdr **)&opt2->srcrt, &p);
+ if (err)
+ goto out;
+
+ err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
+ newtype != IPV6_DSTOPTS,
+ &opt2->dst1opt, &p);
+ if (err)
+ goto out;
+
+ opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
+ (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
+ (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0);
+ opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
+
+ return opt2;
+out:
+ sock_kfree_s(sk, opt2, opt2->tot_len);
+ return ERR_PTR(err);
+}
+
+struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
+{
+ /*
+ * ignore the dest before srcrt unless srcrt is being included.
+ * --yoshfuji
+ */
+ if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt_space != opt) {
+ memcpy(opt_space, opt, sizeof(*opt_space));
+ opt = opt_space;
+ }
+ opt->opt_nflen -= ipv6_optlen(opt->dst0opt);
+ opt->dst0opt = NULL;
+ }
+
+ return opt;
+}
+EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+
+/**
+ * fl6_update_dst - update flowi destination address with info given
+ * by srcrt option, if any.
+ *
+ * @fl6: flowi6 for which daddr is to be updated
+ * @opt: struct ipv6_txoptions in which to look for srcrt opt
+ * @orig: copy of original daddr address if modified
+ *
+ * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * and initial value of fl6->daddr set in orig
+ */
+struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
+ const struct ipv6_txoptions *opt,
+ struct in6_addr *orig)
+{
+ if (!opt || !opt->srcrt)
+ return NULL;
+
+ *orig = fl6->daddr;
+ fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
+ return orig;
+}
+EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
new file mode 100644
index 000000000..5c5d23e59
--- /dev/null
+++ b/net/ipv6/exthdrs_core.c
@@ -0,0 +1,281 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+#include <linux/export.h>
+#include <net/ipv6.h>
+
+/*
+ * find out if nexthdr is a well-known extension header or a protocol
+ */
+
+bool ipv6_ext_hdr(u8 nexthdr)
+{
+ /*
+ * find out if nexthdr is an extension header or a protocol
+ */
+ return (nexthdr == NEXTHDR_HOP) ||
+ (nexthdr == NEXTHDR_ROUTING) ||
+ (nexthdr == NEXTHDR_FRAGMENT) ||
+ (nexthdr == NEXTHDR_AUTH) ||
+ (nexthdr == NEXTHDR_NONE) ||
+ (nexthdr == NEXTHDR_DEST);
+}
+EXPORT_SYMBOL(ipv6_ext_hdr);
+
+/*
+ * Skip any extension headers. This is used by the ICMP module.
+ *
+ * Note that strictly speaking this conflicts with RFC 2460 4.0:
+ * ...The contents and semantics of each extension header determine whether
+ * or not to proceed to the next header. Therefore, extension headers must
+ * be processed strictly in the order they appear in the packet; a
+ * receiver must not, for example, scan through a packet looking for a
+ * particular kind of extension header and process that header prior to
+ * processing all preceding ones.
+ *
+ * We do exactly this. This is a protocol bug. We can't decide after a
+ * seeing an unknown discard-with-error flavour TLV option if it's a
+ * ICMP error message or not (errors should never be send in reply to
+ * ICMP error messages).
+ *
+ * But I see no other way to do this. This might need to be reexamined
+ * when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses (probably truncated) exthdr set "hdr".
+ * "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * If it is not NULL *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
+ * - it may return pointer pointing beyond end of packet,
+ * if the last recognized header is truncated in the middle.
+ * - if packet is truncated, so that all parsed headers are skipped,
+ * it returns NULL.
+ * - First fragment header is skipped, not-first ones
+ * are considered as unparsable.
+ * - Reports the offset field of the final fragment header so it is
+ * possible to tell whether this is a first fragment, later fragment,
+ * or not fragmented.
+ * - ESP is unparsable for now and considered like
+ * normal payload protocol.
+ * - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
+ */
+
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
+ __be16 *frag_offp)
+{
+ u8 nexthdr = *nexthdrp;
+
+ *frag_offp = 0;
+
+ while (ipv6_ext_hdr(nexthdr)) {
+ struct ipv6_opt_hdr _hdr, *hp;
+ int hdrlen;
+
+ if (nexthdr == NEXTHDR_NONE)
+ return -1;
+ hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ return -1;
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ __be16 _frag_off, *fp;
+ fp = skb_header_pointer(skb,
+ start+offsetof(struct frag_hdr,
+ frag_off),
+ sizeof(_frag_off),
+ &_frag_off);
+ if (!fp)
+ return -1;
+
+ *frag_offp = *fp;
+ if (ntohs(*frag_offp) & ~0x7)
+ break;
+ hdrlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hp->hdrlen+2)<<2;
+ else
+ hdrlen = ipv6_optlen(hp);
+
+ nexthdr = hp->nexthdr;
+ start += hdrlen;
+ }
+
+ *nexthdrp = nexthdr;
+ return start;
+}
+EXPORT_SYMBOL(ipv6_skip_exthdr);
+
+int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+{
+ const unsigned char *nh = skb_network_header(skb);
+ int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
+ struct ipv6_opt_hdr *hdr;
+ int len;
+
+ if (offset + 2 > packet_len)
+ goto bad;
+ hdr = (struct ipv6_opt_hdr *)(nh + offset);
+ len = ((hdr->hdrlen + 1) << 3);
+
+ if (offset + len > packet_len)
+ goto bad;
+
+ offset += 2;
+ len -= 2;
+
+ while (len > 0) {
+ int opttype = nh[offset];
+ int optlen;
+
+ if (opttype == type)
+ return offset;
+
+ switch (opttype) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ default:
+ optlen = nh[offset + 1] + 2;
+ if (optlen > len)
+ goto bad;
+ break;
+ }
+ offset += optlen;
+ len -= optlen;
+ }
+ /* not_found */
+ bad:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(ipv6_find_tlv);
+
+/*
+ * find the offset to specified header or the protocol number of last header
+ * if target < 0. "last header" is transport protocol header, ESP, or
+ * "No next header".
+ *
+ * Note that *offset is used as input/output parameter. an if it is not zero,
+ * then it must be a valid offset to an inner IPv6 header. This can be used
+ * to explore inner IPv6 header, eg. ICMPv6 error messages.
+ *
+ * If target header is found, its offset is set in *offset and return protocol
+ * number. Otherwise, return -1.
+ *
+ * If the first fragment doesn't contain the final protocol header or
+ * NEXTHDR_NONE it is considered invalid.
+ *
+ * Note that non-1st fragment is special case that "the protocol number
+ * of last header" is "next header" field in Fragment header. In this case,
+ * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
+ * isn't NULL.
+ *
+ * if flags is not NULL and it's a fragment, then the frag flag
+ * IP6_FH_F_FRAG will be set. If it's an AH header, the
+ * IP6_FH_F_AUTH flag is set and target < 0, then this function will
+ * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this
+ * function will skip all those routing headers, where segements_left was 0.
+ */
+int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
+ int target, unsigned short *fragoff, int *flags)
+{
+ unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ unsigned int len;
+ bool found;
+
+ if (fragoff)
+ *fragoff = 0;
+
+ if (*offset) {
+ struct ipv6hdr _ip6, *ip6;
+
+ ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
+ if (!ip6 || (ip6->version != 6)) {
+ printk(KERN_ERR "IPv6 header not found\n");
+ return -EBADMSG;
+ }
+ start = *offset + sizeof(struct ipv6hdr);
+ nexthdr = ip6->nexthdr;
+ }
+ len = skb->len - start;
+
+ do {
+ struct ipv6_opt_hdr _hdr, *hp;
+ unsigned int hdrlen;
+ found = (nexthdr == target);
+
+ if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
+ if (target < 0 || found)
+ break;
+ return -ENOENT;
+ }
+
+ hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ return -EBADMSG;
+
+ if (nexthdr == NEXTHDR_ROUTING) {
+ struct ipv6_rt_hdr _rh, *rh;
+
+ rh = skb_header_pointer(skb, start, sizeof(_rh),
+ &_rh);
+ if (!rh)
+ return -EBADMSG;
+
+ if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
+ rh->segments_left == 0)
+ found = false;
+ }
+
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ unsigned short _frag_off;
+ __be16 *fp;
+
+ if (flags) /* Indicate that this is a fragment */
+ *flags |= IP6_FH_F_FRAG;
+ fp = skb_header_pointer(skb,
+ start+offsetof(struct frag_hdr,
+ frag_off),
+ sizeof(_frag_off),
+ &_frag_off);
+ if (!fp)
+ return -EBADMSG;
+
+ _frag_off = ntohs(*fp) & ~0x7;
+ if (_frag_off) {
+ if (target < 0 &&
+ ((!ipv6_ext_hdr(hp->nexthdr)) ||
+ hp->nexthdr == NEXTHDR_NONE)) {
+ if (fragoff)
+ *fragoff = _frag_off;
+ return hp->nexthdr;
+ }
+ return -ENOENT;
+ }
+ hdrlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH) {
+ if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
+ break;
+ hdrlen = (hp->hdrlen + 2) << 2;
+ } else
+ hdrlen = ipv6_optlen(hp);
+
+ if (!found) {
+ nexthdr = hp->nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
+ } while (!found);
+
+ *offset = start;
+ return nexthdr;
+}
+EXPORT_SYMBOL(ipv6_find_hdr);
+
diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c
new file mode 100644
index 000000000..447a7fbd1
--- /dev/null
+++ b/net/ipv6/exthdrs_offload.c
@@ -0,0 +1,41 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * IPV6 Extension Header GSO/GRO support
+ */
+#include <net/protocol.h>
+#include "ip6_offload.h"
+
+static const struct net_offload rthdr_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
+static const struct net_offload dstopt_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
+int __init ipv6_exthdrs_offload_init(void)
+{
+ int ret;
+
+ ret = inet6_add_offload(&rthdr_offload, IPPROTO_ROUTING);
+ if (ret)
+ goto out;
+
+ ret = inet6_add_offload(&dstopt_offload, IPPROTO_DSTOPTS);
+ if (ret)
+ goto out_rt;
+
+out:
+ return ret;
+
+out_rt:
+ inet_del_offload(&rthdr_offload, IPPROTO_ROUTING);
+ goto out;
+}
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
new file mode 100644
index 000000000..2367a16ea
--- /dev/null
+++ b/net/ipv6/fib6_rules.c
@@ -0,0 +1,337 @@
+/*
+ * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules
+ *
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2.
+ *
+ * Authors
+ * Thomas Graf <tgraf@suug.ch>
+ * Ville Nuorvala <vnuorval@tcs.hut.fi>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/export.h>
+
+#include <net/fib_rules.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/netlink.h>
+
+struct fib6_rule {
+ struct fib_rule common;
+ struct rt6key src;
+ struct rt6key dst;
+ u8 tclass;
+};
+
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+ int flags, pol_lookup_t lookup)
+{
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = lookup,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+
+ if (arg.result)
+ return arg.result;
+
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
+ return &net->ipv6.ip6_null_entry->dst;
+}
+
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct rt6_info *rt = NULL;
+ struct fib6_table *table;
+ struct net *net = rule->fr_net;
+ pol_lookup_t lookup = arg->lookup_ptr;
+ int err = 0;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ err = -ENETUNREACH;
+ rt = net->ipv6.ip6_null_entry;
+ goto discard_pkt;
+ default:
+ case FR_ACT_BLACKHOLE:
+ err = -EINVAL;
+ rt = net->ipv6.ip6_blk_hole_entry;
+ goto discard_pkt;
+ case FR_ACT_PROHIBIT:
+ err = -EACCES;
+ rt = net->ipv6.ip6_prohibit_entry;
+ goto discard_pkt;
+ }
+
+ table = fib6_get_table(net, rule->table);
+ if (!table) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ rt = lookup(net, table, flp6, flags);
+ if (rt != net->ipv6.ip6_null_entry) {
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /*
+ * If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net,
+ ip6_dst_idev(&rt->dst)->dev,
+ &flp6->daddr,
+ rt6_flags2srcprefs(flags),
+ &saddr))
+ goto again;
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr,
+ r->src.plen))
+ goto again;
+ flp6->saddr = saddr;
+ }
+ err = rt->dst.error;
+ goto out;
+ }
+again:
+ ip6_rt_put(rt);
+ err = -EAGAIN;
+ rt = NULL;
+ goto out;
+
+discard_pkt:
+ dst_hold(&rt->dst);
+out:
+ arg->result = rt;
+ return err;
+}
+
+static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+ struct rt6_info *rt = (struct rt6_info *) arg->result;
+ struct net_device *dev = NULL;
+
+ if (rt->rt6i_idev)
+ dev = rt->rt6i_idev->dev;
+
+ /* do not accept result if the route does
+ * not meet the required prefix length
+ */
+ if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
+ goto suppress_route;
+
+ /* do not accept result if the route uses a device
+ * belonging to a forbidden interface group
+ */
+ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+ goto suppress_route;
+
+ return false;
+
+suppress_route:
+ ip6_rt_put(rt);
+ return true;
+}
+
+static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ struct fib6_rule *r = (struct fib6_rule *) rule;
+ struct flowi6 *fl6 = &fl->u.ip6;
+
+ if (r->dst.plen &&
+ !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
+ return 0;
+
+ /*
+ * If FIB_RULE_FIND_SADDR is set and we do not have a
+ * source address for the traffic, we defer check for
+ * source address.
+ */
+ if (r->src.plen) {
+ if (flags & RT6_LOOKUP_F_HAS_SADDR) {
+ if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
+ r->src.plen))
+ return 0;
+ } else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
+ return 0;
+ }
+
+ if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+ return 0;
+
+ return 1;
+}
+
+static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ int err = -EINVAL;
+ struct net *net = sock_net(skb->sk);
+ struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+ if (rule->action == FR_ACT_TO_TBL) {
+ if (rule->table == RT6_TABLE_UNSPEC)
+ goto errout;
+
+ if (fib6_new_table(net, rule->table) == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+ }
+
+ if (frh->src_len)
+ rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]);
+
+ if (frh->dst_len)
+ rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]);
+
+ rule6->src.plen = frh->src_len;
+ rule6->dst.plen = frh->dst_len;
+ rule6->tclass = frh->tos;
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+ if (frh->src_len && (rule6->src.plen != frh->src_len))
+ return 0;
+
+ if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
+ return 0;
+
+ if (frh->tos && (rule6->tclass != frh->tos))
+ return 0;
+
+ if (frh->src_len &&
+ nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
+ return 0;
+
+ if (frh->dst_len &&
+ nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
+ return 0;
+
+ return 1;
+}
+
+static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+ frh->dst_len = rule6->dst.plen;
+ frh->src_len = rule6->src.plen;
+ frh->tos = rule6->tclass;
+
+ if ((rule6->dst.plen &&
+ nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
+ (rule6->src.plen &&
+ nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
+{
+ return 0x3FFF;
+}
+
+static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
+{
+ return nla_total_size(16) /* dst */
+ + nla_total_size(16); /* src */
+}
+
+static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
+ .family = AF_INET6,
+ .rule_size = sizeof(struct fib6_rule),
+ .addr_size = sizeof(struct in6_addr),
+ .action = fib6_rule_action,
+ .match = fib6_rule_match,
+ .suppress = fib6_rule_suppress,
+ .configure = fib6_rule_configure,
+ .compare = fib6_rule_compare,
+ .fill = fib6_rule_fill,
+ .default_pref = fib6_rule_default_pref,
+ .nlmsg_payload = fib6_rule_nlmsg_payload,
+ .nlgroup = RTNLGRP_IPV6_RULE,
+ .policy = fib6_rule_policy,
+ .owner = THIS_MODULE,
+ .fro_net = &init_net,
+};
+
+static int __net_init fib6_rules_net_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ int err = -ENOMEM;
+
+ ops = fib_rules_register(&fib6_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0);
+ if (err)
+ goto out_fib6_rules_ops;
+
+ err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0);
+ if (err)
+ goto out_fib6_rules_ops;
+
+ net->ipv6.fib6_rules_ops = ops;
+out:
+ return err;
+
+out_fib6_rules_ops:
+ fib_rules_unregister(ops);
+ goto out;
+}
+
+static void __net_exit fib6_rules_net_exit(struct net *net)
+{
+ rtnl_lock();
+ fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ rtnl_unlock();
+}
+
+static struct pernet_operations fib6_rules_net_ops = {
+ .init = fib6_rules_net_init,
+ .exit = fib6_rules_net_exit,
+};
+
+int __init fib6_rules_init(void)
+{
+ return register_pernet_subsys(&fib6_rules_net_ops);
+}
+
+
+void fib6_rules_cleanup(void)
+{
+ unregister_pernet_subsys(&fib6_rules_net_ops);
+}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
new file mode 100644
index 000000000..2c2b5d51f
--- /dev/null
+++ b/net/ipv6/icmp.c
@@ -0,0 +1,1018 @@
+/*
+ * Internet Control Message Protocol (ICMPv6)
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on net/ipv4/icmp.c
+ *
+ * RFC 1885
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *
+ * Andi Kleen : exception handling
+ * Andi Kleen add rate limits. never reply to a icmp.
+ * add more length checks and other fixes.
+ * yoshfuji : ensure to sent parameter problem for
+ * fragments.
+ * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit.
+ * Randy Dunlap and
+ * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support
+ * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmpv6.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/ping.h>
+#include <net/protocol.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+#include <net/dsfield.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * The ICMP socket(s). This is the most convenient way to flow control
+ * our ICMP output as well as maintain a clean interface throughout
+ * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
+ */
+static inline struct sock *icmpv6_sk(struct net *net)
+{
+ return net->ipv6.icmp_sk[smp_processor_id()];
+}
+
+static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
+ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
+ struct net *net = dev_net(skb->dev);
+
+ if (type == ICMPV6_PKT_TOOBIG)
+ ip6_update_pmtu(skb, net, info, 0, 0);
+ else if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0);
+
+ if (!(type & ICMPV6_INFOMSG_MASK))
+ if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
+ ping_err(skb, offset, info);
+}
+
+static int icmpv6_rcv(struct sk_buff *skb);
+
+static const struct inet6_protocol icmpv6_protocol = {
+ .handler = icmpv6_rcv,
+ .err_handler = icmpv6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+
+ sk = icmpv6_sk(net);
+ if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
+ /* This can happen if the output path (f.e. SIT or
+ * ip6ip6 tunnel) signals dst_link_failure() for an
+ * outgoing ICMP6 packet.
+ */
+ local_bh_enable();
+ return NULL;
+ }
+ return sk;
+}
+
+static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+{
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
+/*
+ * Figure out, may we reply to this packet with icmp error.
+ *
+ * We do not reply, if:
+ * - it was icmp error message.
+ * - it is truncated, so that it is known, that protocol is ICMPV6
+ * (i.e. in the middle of some exthdr)
+ *
+ * --ANK (980726)
+ */
+
+static bool is_ineligible(const struct sk_buff *skb)
+{
+ int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
+ int len = skb->len - ptr;
+ __u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+
+ if (len < 0)
+ return true;
+
+ ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off);
+ if (ptr < 0)
+ return false;
+ if (nexthdr == IPPROTO_ICMPV6) {
+ u8 _type, *tp;
+ tp = skb_header_pointer(skb,
+ ptr+offsetof(struct icmp6hdr, icmp6_type),
+ sizeof(_type), &_type);
+ if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Check the ICMP output rate limit
+ */
+static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+ struct flowi6 *fl6)
+{
+ struct net *net = sock_net(sk);
+ struct dst_entry *dst;
+ bool res = false;
+
+ /* Informational messages are not limited. */
+ if (type & ICMPV6_INFOMSG_MASK)
+ return true;
+
+ /* Do not limit pmtu discovery, it would break it. */
+ if (type == ICMPV6_PKT_TOOBIG)
+ return true;
+
+ /*
+ * Look up the output route.
+ * XXX: perhaps the expire for routing entries cloned by
+ * this lookup should be more aggressive (not longer than timeout).
+ */
+ dst = ip6_route_output(net, sk, fl6);
+ if (dst->error) {
+ IP6_INC_STATS(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_OUTNOROUTES);
+ } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
+ res = true;
+ } else {
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ int tmo = net->ipv6.sysctl.icmpv6_time;
+
+ /* Give more bandwidth to wider prefixes. */
+ if (rt->rt6i_dst.plen < 128)
+ tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
+
+ if (icmp_global_allow()) {
+ struct inet_peer *peer;
+
+ peer = inet_getpeer_v6(net->ipv6.peers,
+ &rt->rt6i_dst.addr, 1);
+ res = inet_peer_xrlim_allow(peer, tmo);
+ if (peer)
+ inet_putpeer(peer);
+ }
+ }
+ dst_release(dst);
+ return res;
+}
+
+/*
+ * an inline helper for the "simple" if statement below
+ * checks if parameter problem report is caused by an
+ * unrecognized IPv6 option that has the Option Type
+ * highest-order two bits set to 10
+ */
+
+static bool opt_unrec(struct sk_buff *skb, __u32 offset)
+{
+ u8 _optval, *op;
+
+ offset += skb_network_offset(skb);
+ op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval);
+ if (!op)
+ return true;
+ return (*op & 0xC0) == 0x80;
+}
+
+int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+ struct icmp6hdr *thdr, int len)
+{
+ struct sk_buff *skb;
+ struct icmp6hdr *icmp6h;
+ int err = 0;
+
+ skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ goto out;
+
+ icmp6h = icmp6_hdr(skb);
+ memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
+ icmp6h->icmp6_cksum = 0;
+
+ if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ skb->csum = csum_partial(icmp6h,
+ sizeof(struct icmp6hdr), skb->csum);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
+ skb->csum);
+ } else {
+ __wsum tmp_csum = 0;
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ tmp_csum = csum_add(tmp_csum, skb->csum);
+ }
+
+ tmp_csum = csum_partial(icmp6h,
+ sizeof(struct icmp6hdr), tmp_csum);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr,
+ &fl6->daddr,
+ len, fl6->flowi6_proto,
+ tmp_csum);
+ }
+ ip6_push_pending_frames(sk);
+out:
+ return err;
+}
+
+struct icmpv6_msg {
+ struct sk_buff *skb;
+ int offset;
+ uint8_t type;
+};
+
+static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+ struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
+ struct sk_buff *org_skb = msg->skb;
+ __wsum csum = 0;
+
+ csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
+ to, len, csum);
+ skb->csum = csum_block_add(skb->csum, csum, odd);
+ if (!(msg->type & ICMPV6_INFOMSG_MASK))
+ nf_ct_attach(skb, org_skb);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static void mip6_addr_swap(struct sk_buff *skb)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct ipv6_destopt_hao *hao;
+ struct in6_addr tmp;
+ int off;
+
+ if (opt->dsthao) {
+ off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
+ if (likely(off >= 0)) {
+ hao = (struct ipv6_destopt_hao *)
+ (skb_network_header(skb) + off);
+ tmp = iph->saddr;
+ iph->saddr = hao->addr;
+ hao->addr = tmp;
+ }
+ }
+}
+#else
+static inline void mip6_addr_swap(struct sk_buff *skb) {}
+#endif
+
+static struct dst_entry *icmpv6_route_lookup(struct net *net,
+ struct sk_buff *skb,
+ struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct dst_entry *dst, *dst2;
+ struct flowi6 fl2;
+ int err;
+
+ err = ip6_dst_lookup(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * We won't send icmp if the destination is known
+ * anycast.
+ */
+ if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+ net_dbg_ratelimited("icmp6_send: acast source\n");
+ dst_release(dst);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* No need to clone since we're just using its address. */
+ dst2 = dst;
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
+ if (!IS_ERR(dst)) {
+ if (dst != dst2)
+ return dst;
+ } else {
+ if (PTR_ERR(dst) == -EPERM)
+ dst = NULL;
+ else
+ return dst;
+ }
+
+ err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6);
+ if (err)
+ goto relookup_failed;
+
+ err = ip6_dst_lookup(sk, &dst2, &fl2);
+ if (err)
+ goto relookup_failed;
+
+ dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
+ if (!IS_ERR(dst2)) {
+ dst_release(dst);
+ dst = dst2;
+ } else {
+ err = PTR_ERR(dst2);
+ if (err == -EPERM) {
+ dst_release(dst);
+ return dst2;
+ } else
+ goto relookup_failed;
+ }
+
+relookup_failed:
+ if (dst)
+ return dst;
+ return ERR_PTR(err);
+}
+
+/*
+ * Send an ICMP message in response to a packet in error
+ */
+static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev = NULL;
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct sock *sk;
+ struct ipv6_pinfo *np;
+ const struct in6_addr *saddr = NULL;
+ struct dst_entry *dst;
+ struct icmp6hdr tmp_hdr;
+ struct flowi6 fl6;
+ struct icmpv6_msg msg;
+ int iif = 0;
+ int addr_type = 0;
+ int len;
+ int hlimit;
+ int err = 0;
+ u32 mark = IP6_REPLY_MARK(net, skb->mark);
+
+ if ((u8 *)hdr < skb->head ||
+ (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
+ return;
+
+ /*
+ * Make sure we respect the rules
+ * i.e. RFC 1885 2.4(e)
+ * Rule (e.1) is enforced by not using icmp6_send
+ * in any code that processes icmp errors.
+ */
+ addr_type = ipv6_addr_type(&hdr->daddr);
+
+ if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) ||
+ ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr))
+ saddr = &hdr->daddr;
+
+ /*
+ * Dest addr check
+ */
+
+ if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) {
+ if (type != ICMPV6_PKT_TOOBIG &&
+ !(type == ICMPV6_PARAMPROB &&
+ code == ICMPV6_UNK_OPTION &&
+ (opt_unrec(skb, info))))
+ return;
+
+ saddr = NULL;
+ }
+
+ addr_type = ipv6_addr_type(&hdr->saddr);
+
+ /*
+ * Source addr check
+ */
+
+ if (__ipv6_addr_needs_scope_id(addr_type))
+ iif = skb->dev->ifindex;
+
+ /*
+ * Must not send error if the source does not uniquely
+ * identify a single node (RFC2463 Section 2.4).
+ * We check unspecified / multicast addresses here,
+ * and anycast addresses will be checked later.
+ */
+ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
+ net_dbg_ratelimited("icmp6_send: addr_any/mcast source\n");
+ return;
+ }
+
+ /*
+ * Never answer to a ICMP packet.
+ */
+ if (is_ineligible(skb)) {
+ net_dbg_ratelimited("icmp6_send: no reply to icmp error\n");
+ return;
+ }
+
+ mip6_addr_swap(skb);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.daddr = hdr->saddr;
+ if (saddr)
+ fl6.saddr = *saddr;
+ fl6.flowi6_mark = mark;
+ fl6.flowi6_oif = iif;
+ fl6.fl6_icmp_type = type;
+ fl6.fl6_icmp_code = code;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ return;
+ sk->sk_mark = mark;
+ np = inet6_sk(sk);
+
+ if (!icmpv6_xrlim_allow(sk, type, &fl6))
+ goto out;
+
+ tmp_hdr.icmp6_type = type;
+ tmp_hdr.icmp6_code = code;
+ tmp_hdr.icmp6_cksum = 0;
+ tmp_hdr.icmp6_pointer = htonl(info);
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ dst = icmpv6_route_lookup(net, skb, sk, &fl6);
+ if (IS_ERR(dst))
+ goto out;
+
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ msg.skb = skb;
+ msg.offset = skb_network_offset(skb);
+ msg.type = type;
+
+ len = skb->len - msg.offset;
+ len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
+ if (len < 0) {
+ net_dbg_ratelimited("icmp: len problem\n");
+ goto out_dst_release;
+ }
+
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+
+ err = ip6_append_data(sk, icmpv6_getfrag, &msg,
+ len + sizeof(struct icmp6hdr),
+ sizeof(struct icmp6hdr), hlimit,
+ np->tclass, NULL, &fl6, (struct rt6_info *)dst,
+ MSG_DONTWAIT, np->dontfrag);
+ if (err) {
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
+ ip6_flush_pending_frames(sk);
+ } else {
+ err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+ len + sizeof(struct icmp6hdr));
+ }
+ rcu_read_unlock();
+out_dst_release:
+ dst_release(dst);
+out:
+ icmpv6_xmit_unlock(sk);
+}
+
+/* Slightly more convenient version of icmp6_send.
+ */
+void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+{
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos);
+ kfree_skb(skb);
+}
+
+static void icmpv6_echo_reply(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ struct inet6_dev *idev;
+ struct ipv6_pinfo *np;
+ const struct in6_addr *saddr = NULL;
+ struct icmp6hdr *icmph = icmp6_hdr(skb);
+ struct icmp6hdr tmp_hdr;
+ struct flowi6 fl6;
+ struct icmpv6_msg msg;
+ struct dst_entry *dst;
+ int err = 0;
+ int hlimit;
+ u8 tclass;
+ u32 mark = IP6_REPLY_MARK(net, skb->mark);
+
+ saddr = &ipv6_hdr(skb)->daddr;
+
+ if (!ipv6_unicast_destination(skb) &&
+ !(net->ipv6.sysctl.anycast_src_echo_reply &&
+ ipv6_anycast_destination(skb)))
+ saddr = NULL;
+
+ memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
+ tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.daddr = ipv6_hdr(skb)->saddr;
+ if (saddr)
+ fl6.saddr = *saddr;
+ fl6.flowi6_oif = skb->dev->ifindex;
+ fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+ fl6.flowi6_mark = mark;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ return;
+ sk->sk_mark = mark;
+ np = inet6_sk(sk);
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ err = ip6_dst_lookup(sk, &dst, &fl6);
+ if (err)
+ goto out;
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
+ if (IS_ERR(dst))
+ goto out;
+
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ idev = __in6_dev_get(skb->dev);
+
+ msg.skb = skb;
+ msg.offset = 0;
+ msg.type = ICMPV6_ECHO_REPLY;
+
+ tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
+ sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6,
+ (struct rt6_info *)dst, MSG_DONTWAIT,
+ np->dontfrag);
+
+ if (err) {
+ ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS);
+ ip6_flush_pending_frames(sk);
+ } else {
+ err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+ skb->len + sizeof(struct icmp6hdr));
+ }
+ dst_release(dst);
+out:
+ icmpv6_xmit_unlock(sk);
+}
+
+void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
+{
+ const struct inet6_protocol *ipprot;
+ int inner_offset;
+ __be16 frag_off;
+ u8 nexthdr;
+ struct net *net = dev_net(skb->dev);
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto out;
+
+ nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
+ if (ipv6_ext_hdr(nexthdr)) {
+ /* now skip over extension headers */
+ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+ if (inner_offset < 0)
+ goto out;
+ } else {
+ inner_offset = sizeof(struct ipv6hdr);
+ }
+
+ /* Checkin header including 8 bytes of inner protocol header. */
+ if (!pskb_may_pull(skb, inner_offset+8))
+ goto out;
+
+ /* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
+ Without this we will not able f.e. to make source routed
+ pmtu discovery.
+ Corresponding argument (opt) to notifiers is already added.
+ --ANK (980726)
+ */
+
+ ipprot = rcu_dereference(inet6_protos[nexthdr]);
+ if (ipprot && ipprot->err_handler)
+ ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
+
+ raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
+ return;
+
+out:
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+}
+
+/*
+ * Handle icmp messages
+ */
+
+static int icmpv6_rcv(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct inet6_dev *idev = __in6_dev_get(dev);
+ const struct in6_addr *saddr, *daddr;
+ struct icmp6hdr *hdr;
+ u8 type;
+ bool success = false;
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ struct sec_path *sp = skb_sec_path(skb);
+ int nh;
+
+ if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+ XFRM_STATE_ICMP))
+ goto drop_no_count;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
+ goto drop_no_count;
+
+ nh = skb_network_offset(skb);
+ skb_set_network_header(skb, sizeof(*hdr));
+
+ if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ goto drop_no_count;
+
+ skb_set_network_header(skb, nh);
+ }
+
+ ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS);
+
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+
+ if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
+ net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
+ saddr, daddr);
+ goto csum_error;
+ }
+
+ if (!pskb_pull(skb, sizeof(*hdr)))
+ goto discard_it;
+
+ hdr = icmp6_hdr(skb);
+
+ type = hdr->icmp6_type;
+
+ ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type);
+
+ switch (type) {
+ case ICMPV6_ECHO_REQUEST:
+ icmpv6_echo_reply(skb);
+ break;
+
+ case ICMPV6_ECHO_REPLY:
+ success = ping_rcv(skb);
+ break;
+
+ case ICMPV6_PKT_TOOBIG:
+ /* BUGGG_FUTURE: if packet contains rthdr, we cannot update
+ standard destination cache. Seems, only "advanced"
+ destination cache will allow to solve this problem
+ --ANK (980726)
+ */
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto discard_it;
+ hdr = icmp6_hdr(skb);
+
+ /*
+ * Drop through to notify
+ */
+
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_TIME_EXCEED:
+ case ICMPV6_PARAMPROB:
+ icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+ break;
+
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_REDIRECT:
+ ndisc_rcv(skb);
+ break;
+
+ case ICMPV6_MGM_QUERY:
+ igmp6_event_query(skb);
+ break;
+
+ case ICMPV6_MGM_REPORT:
+ igmp6_event_report(skb);
+ break;
+
+ case ICMPV6_MGM_REDUCTION:
+ case ICMPV6_NI_QUERY:
+ case ICMPV6_NI_REPLY:
+ case ICMPV6_MLD2_REPORT:
+ case ICMPV6_DHAAD_REQUEST:
+ case ICMPV6_DHAAD_REPLY:
+ case ICMPV6_MOBILE_PREFIX_SOL:
+ case ICMPV6_MOBILE_PREFIX_ADV:
+ break;
+
+ default:
+ /* informational */
+ if (type & ICMPV6_INFOMSG_MASK)
+ break;
+
+ net_dbg_ratelimited("icmpv6: msg of unknown type\n");
+
+ /*
+ * error of unknown type.
+ * must pass to upper level
+ */
+
+ icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+ }
+
+ /* until the v6 path can be better sorted assume failure and
+ * preserve the status quo behaviour for the rest of the paths to here
+ */
+ if (success)
+ consume_skb(skb);
+ else
+ kfree_skb(skb);
+
+ return 0;
+
+csum_error:
+ ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
+discard_it:
+ ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS);
+drop_no_count:
+ kfree_skb(skb);
+ return 0;
+}
+
+void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
+ u8 type,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ int oif)
+{
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->saddr = *saddr;
+ fl6->daddr = *daddr;
+ fl6->flowi6_proto = IPPROTO_ICMPV6;
+ fl6->fl6_icmp_type = type;
+ fl6->fl6_icmp_code = 0;
+ fl6->flowi6_oif = oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+}
+
+/*
+ * Special lock-class for __icmpv6_sk:
+ */
+static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
+
+static int __net_init icmpv6_sk_init(struct net *net)
+{
+ struct sock *sk;
+ int err, i, j;
+
+ net->ipv6.icmp_sk =
+ kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL);
+ if (!net->ipv6.icmp_sk)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ err = inet_ctl_sock_create(&sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
+ err);
+ goto fail;
+ }
+
+ net->ipv6.icmp_sk[i] = sk;
+
+ /*
+ * Split off their lock-class, because sk->sk_dst_lock
+ * gets used from softirqs, which is safe for
+ * __icmpv6_sk (because those never get directly used
+ * via userspace syscalls), but unsafe for normal sockets.
+ */
+ lockdep_set_class(&sk->sk_dst_lock,
+ &icmpv6_socket_sk_dst_lock_key);
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+ }
+ return 0;
+
+ fail:
+ for (j = 0; j < i; j++)
+ inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]);
+ kfree(net->ipv6.icmp_sk);
+ return err;
+}
+
+static void __net_exit icmpv6_sk_exit(struct net *net)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ inet_ctl_sock_destroy(net->ipv6.icmp_sk[i]);
+ }
+ kfree(net->ipv6.icmp_sk);
+}
+
+static struct pernet_operations icmpv6_sk_ops = {
+ .init = icmpv6_sk_init,
+ .exit = icmpv6_sk_exit,
+};
+
+int __init icmpv6_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&icmpv6_sk_ops);
+ if (err < 0)
+ return err;
+
+ err = -EAGAIN;
+ if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
+ goto fail;
+
+ err = inet6_register_icmp_sender(icmp6_send);
+ if (err)
+ goto sender_reg_err;
+ return 0;
+
+sender_reg_err:
+ inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
+fail:
+ pr_err("Failed to register ICMP6 protocol\n");
+ unregister_pernet_subsys(&icmpv6_sk_ops);
+ return err;
+}
+
+void icmpv6_cleanup(void)
+{
+ inet6_unregister_icmp_sender(icmp6_send);
+ unregister_pernet_subsys(&icmpv6_sk_ops);
+ inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
+}
+
+
+static const struct icmp6_err {
+ int err;
+ int fatal;
+} tab_unreach[] = {
+ { /* NOROUTE */
+ .err = ENETUNREACH,
+ .fatal = 0,
+ },
+ { /* ADM_PROHIBITED */
+ .err = EACCES,
+ .fatal = 1,
+ },
+ { /* Was NOT_NEIGHBOUR, now reserved */
+ .err = EHOSTUNREACH,
+ .fatal = 0,
+ },
+ { /* ADDR_UNREACH */
+ .err = EHOSTUNREACH,
+ .fatal = 0,
+ },
+ { /* PORT_UNREACH */
+ .err = ECONNREFUSED,
+ .fatal = 1,
+ },
+ { /* POLICY_FAIL */
+ .err = EACCES,
+ .fatal = 1,
+ },
+ { /* REJECT_ROUTE */
+ .err = EACCES,
+ .fatal = 1,
+ },
+};
+
+int icmpv6_err_convert(u8 type, u8 code, int *err)
+{
+ int fatal = 0;
+
+ *err = EPROTO;
+
+ switch (type) {
+ case ICMPV6_DEST_UNREACH:
+ fatal = 1;
+ if (code < ARRAY_SIZE(tab_unreach)) {
+ *err = tab_unreach[code].err;
+ fatal = tab_unreach[code].fatal;
+ }
+ break;
+
+ case ICMPV6_PKT_TOOBIG:
+ *err = EMSGSIZE;
+ break;
+
+ case ICMPV6_PARAMPROB:
+ *err = EPROTO;
+ fatal = 1;
+ break;
+
+ case ICMPV6_TIME_EXCEED:
+ *err = EHOSTUNREACH;
+ break;
+ }
+
+ return fatal;
+}
+EXPORT_SYMBOL(icmpv6_err_convert);
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table ipv6_icmp_table_template[] = {
+ {
+ .procname = "ratelimit",
+ .data = &init_net.ipv6.sysctl.icmpv6_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ { },
+};
+
+struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(ipv6_icmp_table_template,
+ sizeof(ipv6_icmp_table_template),
+ GFP_KERNEL);
+
+ if (table)
+ table[0].data = &net->ipv6.sysctl.icmpv6_time;
+
+ return table;
+}
+#endif
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 000000000..6927f3fb5
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,261 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET6 connection oriented protocols.
+ *
+ * Authors: See the TCPv6 sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/sock.h>
+#include <net/inet6_connection_sock.h>
+
+int inet6_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb, bool relax)
+{
+ const struct sock *sk2;
+ int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ kuid_t uid = sock_i_uid((struct sock *)sk);
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ /*
+ * See comment in inet_csk_bind_conflict about sock lookup
+ * vs net namespaces issues.
+ */
+ sk_for_each_bound(sk2, &tb->owners) {
+ if (sk != sk2 &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid,
+ sock_i_uid((struct sock *)sk2))))) {
+ if (ipv6_rcv_saddr_equal(sk, sk2))
+ break;
+ }
+ if (!relax && reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN &&
+ ipv6_rcv_saddr_equal(sk, sk2))
+ break;
+ }
+ }
+
+ return sk2 != NULL;
+}
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
+struct dst_entry *inet6_csk_route_req(struct sock *sk,
+ struct flowi6 *fl6,
+ const struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
+
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = IPPROTO_TCP;
+ fl6->daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(fl6, np->opt, &final);
+ fl6->saddr = ireq->ir_v6_loc_addr;
+ fl6->flowi6_oif = ireq->ir_iif;
+ fl6->flowi6_mark = ireq->ir_mark;
+ fl6->fl6_dport = ireq->ir_rmt_port;
+ fl6->fl6_sport = htons(ireq->ir_num);
+ security_req_classify_flow(req, flowi6_to_flowi(fl6));
+
+ dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ if (IS_ERR(dst))
+ return NULL;
+
+ return dst;
+}
+
+/*
+ * request_sock (formerly open request) hash tables.
+ */
+static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
+ const u32 rnd, const u32 synq_hsize)
+{
+ u32 c;
+
+ c = jhash_3words((__force u32)raddr->s6_addr32[0],
+ (__force u32)raddr->s6_addr32[1],
+ (__force u32)raddr->s6_addr32[2],
+ rnd);
+
+ c = jhash_2words((__force u32)raddr->s6_addr32[3],
+ (__force u32)rport,
+ c);
+
+ return c & (synq_hsize - 1);
+}
+
+struct request_sock *inet6_csk_search_req(struct sock *sk,
+ const __be16 rport,
+ const struct in6_addr *raddr,
+ const struct in6_addr *laddr,
+ const int iif)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req;
+ u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries);
+
+ spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+ for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (ireq->ir_rmt_port == rport &&
+ req->rsk_ops->family == AF_INET6 &&
+ ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) &&
+ ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) &&
+ (!ireq->ir_iif || ireq->ir_iif == iif)) {
+ atomic_inc(&req->rsk_refcnt);
+ WARN_ON(req->sk != NULL);
+ break;
+ }
+ }
+ spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return req;
+}
+EXPORT_SYMBOL_GPL(inet6_csk_search_req);
+
+void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+ struct request_sock *req,
+ const unsigned long timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
+ inet_rsk(req)->ir_rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = sk->sk_v6_daddr;
+ sin6->sin6_port = inet_sk(sk)->inet_dport;
+ /* We do not store received flowlabel for TCP */
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ sk->sk_bound_dev_if);
+}
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+static inline
+void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ __ip6_dst_store(sk, dst, daddr, saddr);
+}
+
+static inline
+struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
+{
+ return __sk_dst_check(sk, cookie);
+}
+
+static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *final_p, final;
+ struct dst_entry *dst;
+
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->daddr = sk->sk_v6_daddr;
+ fl6->saddr = np->saddr;
+ fl6->flowlabel = np->flow_label;
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
+ fl6->flowi6_mark = sk->sk_mark;
+ fl6->fl6_sport = inet->inet_sport;
+ fl6->fl6_dport = inet->inet_dport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+
+ final_p = fl6_update_dst(fl6, np->opt, &final);
+
+ dst = __inet6_csk_dst_check(sk, np->dst_cookie);
+ if (!dst) {
+ dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+
+ if (!IS_ERR(dst))
+ __inet6_csk_dst_store(sk, dst, NULL, NULL);
+ }
+ return dst;
+}
+
+int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int res;
+
+ dst = inet6_csk_route_socket(sk, &fl6);
+ if (IS_ERR(dst)) {
+ sk->sk_err_soft = -PTR_ERR(dst);
+ sk->sk_route_caps = 0;
+ kfree_skb(skb);
+ return PTR_ERR(dst);
+ }
+
+ rcu_read_lock();
+ skb_dst_set_noref(skb, dst);
+
+ /* Restore final destination back after routing done */
+ fl6.daddr = sk->sk_v6_daddr;
+
+ res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
+
+struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6);
+
+ if (IS_ERR(dst))
+ return NULL;
+ dst->ops->update_pmtu(dst, sk, NULL, mtu);
+
+ dst = inet6_csk_route_socket(sk, &fl6);
+ return IS_ERR(dst) ? NULL : dst;
+}
+EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 000000000..871641bc1
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,275 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET6 transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp, generalised here
+ * by Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/secure_seq.h>
+#include <net/ip.h>
+
+u32 inet6_ehashfn(const struct net *net,
+ const struct in6_addr *laddr, const u16 lport,
+ const struct in6_addr *faddr, const __be16 fport)
+{
+ static u32 inet6_ehash_secret __read_mostly;
+ static u32 ipv6_hash_secret __read_mostly;
+
+ u32 lhash, fhash;
+
+ net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
+ net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+ fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+
+ return __inet6_ehashfn(lhash, lport, fhash, fport,
+ inet6_ehash_secret + net_hash_mix(net));
+}
+
+/*
+ * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ *
+ * The sockhash lock must be held as a reader here.
+ */
+struct sock *__inet6_lookup_established(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum,
+ const int dif)
+{
+ struct sock *sk;
+ const struct hlist_nulls_node *node;
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ unsigned int slot = hash & hashinfo->ehash_mask;
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+
+ rcu_read_lock();
+begin:
+ sk_nulls_for_each_rcu(sk, node, &head->chain) {
+ if (sk->sk_hash != hash)
+ continue;
+ if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
+ continue;
+ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+ goto out;
+
+ if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) {
+ sock_gen_put(sk);
+ goto begin;
+ }
+ goto found;
+ }
+ if (get_nulls_value(node) != slot)
+ goto begin;
+out:
+ sk = NULL;
+found:
+ rcu_read_unlock();
+ return sk;
+}
+EXPORT_SYMBOL(__inet6_lookup_established);
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ const unsigned short hnum,
+ const struct in6_addr *daddr,
+ const int dif)
+{
+ int score = -1;
+
+ if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
+ sk->sk_family == PF_INET6) {
+
+ score = 1;
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+ score++;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score++;
+ }
+ }
+ return score;
+}
+
+struct sock *inet6_lookup_listener(struct net *net,
+ struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+ const __be16 sport, const struct in6_addr *daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *sk;
+ const struct hlist_nulls_node *node;
+ struct sock *result;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
+ unsigned int hash = inet_lhashfn(net, hnum);
+ struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+
+ rcu_read_lock();
+begin:
+ result = NULL;
+ hiscore = 0;
+ sk_nulls_for_each(sk, node, &ilb->head) {
+ score = compute_score(sk, net, hnum, daddr, dif);
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (reciprocal_scale(phash, matches) == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+ goto begin;
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, hnum, daddr,
+ dif) < hiscore)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+
+struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+ const struct in6_addr *saddr, const __be16 sport,
+ const struct in6_addr *daddr, const __be16 dport,
+ const int dif)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+ local_bh_enable();
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+ const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
+ const struct in6_addr *saddr = &sk->sk_v6_daddr;
+ const int dif = sk->sk_bound_dev_if;
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+ struct net *net = sock_net(sk);
+ const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
+ inet->inet_dport);
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_nulls_node *node;
+ struct inet_timewait_sock *tw = NULL;
+ int twrefcnt = 0;
+
+ spin_lock(lock);
+
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash)
+ continue;
+
+ if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) {
+ if (sk2->sk_state == TCP_TIME_WAIT) {
+ tw = inet_twsk(sk2);
+ if (twsk_unique(sk, sk2, twp))
+ break;
+ }
+ goto not_unique;
+ }
+ }
+
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity.
+ */
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
+ sk->sk_hash = hash;
+ WARN_ON(!sk_unhashed(sk));
+ __sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ twrefcnt = inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
+ spin_unlock(lock);
+ if (twrefcnt)
+ inet_twsk_put(tw);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ if (twp) {
+ *twp = tw;
+ } else if (tw) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw);
+
+ inet_twsk_put(tw);
+ }
+ return 0;
+
+not_unique:
+ spin_unlock(lock);
+ return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+ struct sock *sk)
+{
+ return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
+ __inet6_check_established);
+}
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
new file mode 100644
index 000000000..9a4d7322f
--- /dev/null
+++ b/net/ipv6/ip6_checksum.c
@@ -0,0 +1,124 @@
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <asm/checksum.h>
+
+#ifndef _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, unsigned short proto,
+ __wsum csum)
+{
+
+ int carry;
+ __u32 ulen;
+ __u32 uproto;
+ __u32 sum = (__force u32)csum;
+
+ sum += (__force u32)saddr->s6_addr32[0];
+ carry = (sum < (__force u32)saddr->s6_addr32[0]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[1];
+ carry = (sum < (__force u32)saddr->s6_addr32[1]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[2];
+ carry = (sum < (__force u32)saddr->s6_addr32[2]);
+ sum += carry;
+
+ sum += (__force u32)saddr->s6_addr32[3];
+ carry = (sum < (__force u32)saddr->s6_addr32[3]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[0];
+ carry = (sum < (__force u32)daddr->s6_addr32[0]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[1];
+ carry = (sum < (__force u32)daddr->s6_addr32[1]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[2];
+ carry = (sum < (__force u32)daddr->s6_addr32[2]);
+ sum += carry;
+
+ sum += (__force u32)daddr->s6_addr32[3];
+ carry = (sum < (__force u32)daddr->s6_addr32[3]);
+ sum += carry;
+
+ ulen = (__force u32)htonl((__u32) len);
+ sum += ulen;
+ carry = (sum < ulen);
+ sum += carry;
+
+ uproto = (__force u32)htonl(proto);
+ sum += uproto;
+ carry = (sum < uproto);
+ sum += carry;
+
+ return csum_fold((__force __wsum)sum);
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
+#endif
+
+int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
+{
+ int err;
+
+ UDP_SKB_CB(skb)->partial_cov = 0;
+ UDP_SKB_CB(skb)->cscov = skb->len;
+
+ if (proto == IPPROTO_UDPLITE) {
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
+ }
+
+ /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels)
+ * we accept a checksum of zero here. When we find the socket
+ * for the UDP packet we'll check if that socket allows zero checksum
+ * for IPv6 (set by socket option).
+ */
+ return skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
+}
+EXPORT_SYMBOL(udp6_csum_init);
+
+/* Function to set UDP checksum for an IPv6 UDP packet. This is intended
+ * for the simple case like when setting the checksum for a UDP tunnel.
+ */
+void udp6_set_csum(bool nocheck, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v6_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
+EXPORT_SYMBOL(udp6_set_csum);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
new file mode 100644
index 000000000..bde57b113
--- /dev/null
+++ b/net/ipv6/ip6_fib.c
@@ -0,0 +1,2060 @@
+/*
+ * Linux INET6 implementation
+ * Forwarding Information Database
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Yuji SEKIYA @USAGI: Support default route on router node;
+ * remove ip6_null_entry from the top of
+ * routing table.
+ * Ville Nuorvala: Fixed routing subtrees.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/route.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) pr_debug(x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
+static struct kmem_cache *fib6_node_kmem __read_mostly;
+
+struct fib6_cleaner {
+ struct fib6_walker w;
+ struct net *net;
+ int (*func)(struct rt6_info *, void *arg);
+ int sernum;
+ void *arg;
+};
+
+static DEFINE_RWLOCK(fib6_walker_lock);
+
+#ifdef CONFIG_IPV6_SUBTREES
+#define FWS_INIT FWS_S
+#else
+#define FWS_INIT FWS_L
+#endif
+
+static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
+static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
+static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk_continue(struct fib6_walker *w);
+
+/*
+ * A routing update causes an increase of the serial number on the
+ * affected subtree. This allows for cached routes to be asynchronously
+ * tested when modifications are made to the destination cache as a
+ * result of redirects, path MTU changes, etc.
+ */
+
+static void fib6_gc_timer_cb(unsigned long arg);
+
+static LIST_HEAD(fib6_walkers);
+#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+
+static void fib6_walker_link(struct fib6_walker *w)
+{
+ write_lock_bh(&fib6_walker_lock);
+ list_add(&w->lh, &fib6_walkers);
+ write_unlock_bh(&fib6_walker_lock);
+}
+
+static void fib6_walker_unlink(struct fib6_walker *w)
+{
+ write_lock_bh(&fib6_walker_lock);
+ list_del(&w->lh);
+ write_unlock_bh(&fib6_walker_lock);
+}
+
+static int fib6_new_sernum(struct net *net)
+{
+ int new, old;
+
+ do {
+ old = atomic_read(&net->ipv6.fib6_sernum);
+ new = old < INT_MAX ? old + 1 : 1;
+ } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
+ old, new) != old);
+ return new;
+}
+
+enum {
+ FIB6_NO_SERNUM_CHANGE = 0,
+};
+
+/*
+ * Auxiliary address test functions for the radix tree.
+ *
+ * These assume a 32bit processor (although it will work on
+ * 64bit processors)
+ */
+
+/*
+ * test bit
+ */
+#if defined(__LITTLE_ENDIAN)
+# define BITOP_BE32_SWIZZLE (0x1F & ~7)
+#else
+# define BITOP_BE32_SWIZZLE 0
+#endif
+
+static __be32 addr_bit_set(const void *token, int fn_bit)
+{
+ const __be32 *addr = token;
+ /*
+ * Here,
+ * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
+ * is optimized version of
+ * htonl(1 << ((~fn_bit)&0x1F))
+ * See include/asm-generic/bitops/le.h.
+ */
+ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
+ addr[fn_bit >> 5];
+}
+
+static struct fib6_node *node_alloc(void)
+{
+ struct fib6_node *fn;
+
+ fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+
+ return fn;
+}
+
+static void node_free(struct fib6_node *fn)
+{
+ kmem_cache_free(fib6_node_kmem, fn);
+}
+
+static void rt6_release(struct rt6_info *rt)
+{
+ if (atomic_dec_and_test(&rt->rt6i_ref))
+ dst_free(&rt->dst);
+}
+
+static void fib6_link_table(struct net *net, struct fib6_table *tb)
+{
+ unsigned int h;
+
+ /*
+ * Initialize table lock at a single place to give lockdep a key,
+ * tables aren't visible prior to being linked to the list.
+ */
+ rwlock_init(&tb->tb6_lock);
+
+ h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
+
+ /*
+ * No protection necessary, this is the only list mutatation
+ * operation, tables never disappear once they exist.
+ */
+ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+}
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
+{
+ struct fib6_table *table;
+
+ table = kzalloc(sizeof(*table), GFP_ATOMIC);
+ if (table) {
+ table->tb6_id = id;
+ table->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&table->tb6_peers);
+ }
+
+ return table;
+}
+
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
+{
+ struct fib6_table *tb;
+
+ if (id == 0)
+ id = RT6_TABLE_MAIN;
+ tb = fib6_get_table(net, id);
+ if (tb)
+ return tb;
+
+ tb = fib6_alloc_table(net, id);
+ if (tb)
+ fib6_link_table(net, tb);
+
+ return tb;
+}
+
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
+{
+ struct fib6_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ if (id == 0)
+ id = RT6_TABLE_MAIN;
+ h = id & (FIB6_TABLE_HASHSZ - 1);
+ rcu_read_lock();
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ if (tb->tb6_id == id) {
+ rcu_read_unlock();
+ return tb;
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void __net_init fib6_tables_init(struct net *net)
+{
+ fib6_link_table(net, net->ipv6.fib6_main_tbl);
+ fib6_link_table(net, net->ipv6.fib6_local_tbl);
+}
+#else
+
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
+{
+ return fib6_get_table(net, id);
+}
+
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
+{
+ return net->ipv6.fib6_main_tbl;
+}
+
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+ int flags, pol_lookup_t lookup)
+{
+ return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+}
+
+static void __net_init fib6_tables_init(struct net *net)
+{
+ fib6_link_table(net, net->ipv6.fib6_main_tbl);
+}
+
+#endif
+
+static int fib6_dump_node(struct fib6_walker *w)
+{
+ int res;
+ struct rt6_info *rt;
+
+ for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+ res = rt6_dump_route(rt, w->args);
+ if (res < 0) {
+ /* Frame is full, suspend walking */
+ w->leaf = rt;
+ return 1;
+ }
+ }
+ w->leaf = NULL;
+ return 0;
+}
+
+static void fib6_dump_end(struct netlink_callback *cb)
+{
+ struct fib6_walker *w = (void *)cb->args[2];
+
+ if (w) {
+ if (cb->args[4]) {
+ cb->args[4] = 0;
+ fib6_walker_unlink(w);
+ }
+ cb->args[2] = 0;
+ kfree(w);
+ }
+ cb->done = (void *)cb->args[3];
+ cb->args[1] = 3;
+}
+
+static int fib6_dump_done(struct netlink_callback *cb)
+{
+ fib6_dump_end(cb);
+ return cb->done ? cb->done(cb) : 0;
+}
+
+static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct fib6_walker *w;
+ int res;
+
+ w = (void *)cb->args[2];
+ w->root = &table->tb6_root;
+
+ if (cb->args[4] == 0) {
+ w->count = 0;
+ w->skip = 0;
+
+ read_lock_bh(&table->tb6_lock);
+ res = fib6_walk(w);
+ read_unlock_bh(&table->tb6_lock);
+ if (res > 0) {
+ cb->args[4] = 1;
+ cb->args[5] = w->root->fn_sernum;
+ }
+ } else {
+ if (cb->args[5] != w->root->fn_sernum) {
+ /* Begin at the root if the tree changed */
+ cb->args[5] = w->root->fn_sernum;
+ w->state = FWS_INIT;
+ w->node = w->root;
+ w->skip = w->count;
+ } else
+ w->skip = 0;
+
+ read_lock_bh(&table->tb6_lock);
+ res = fib6_walk_continue(w);
+ read_unlock_bh(&table->tb6_lock);
+ if (res <= 0) {
+ fib6_walker_unlink(w);
+ cb->args[4] = 0;
+ }
+ }
+
+ return res;
+}
+
+static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int h, s_h;
+ unsigned int e = 0, s_e;
+ struct rt6_rtnl_dump_arg arg;
+ struct fib6_walker *w;
+ struct fib6_table *tb;
+ struct hlist_head *head;
+ int res = 0;
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ w = (void *)cb->args[2];
+ if (!w) {
+ /* New dump:
+ *
+ * 1. hook callback destructor.
+ */
+ cb->args[3] = (long)cb->done;
+ cb->done = fib6_dump_done;
+
+ /*
+ * 2. allocate and initialize walker.
+ */
+ w = kzalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return -ENOMEM;
+ w->func = fib6_dump_node;
+ cb->args[2] = (long)w;
+ }
+
+ arg.skb = skb;
+ arg.cb = cb;
+ arg.net = net;
+ w->args = &arg;
+
+ rcu_read_lock();
+ for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
+ e = 0;
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ if (e < s_e)
+ goto next;
+ res = fib6_dump_table(tb, skb, cb);
+ if (res != 0)
+ goto out;
+next:
+ e++;
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[1] = e;
+ cb->args[0] = h;
+
+ res = res < 0 ? res : skb->len;
+ if (res <= 0)
+ fib6_dump_end(cb);
+ return res;
+}
+
+/*
+ * Routing Table
+ *
+ * return the appropriate node for a routing tree "add" operation
+ * by either creating and inserting or by returning an existing
+ * node.
+ */
+
+static struct fib6_node *fib6_add_1(struct fib6_node *root,
+ struct in6_addr *addr, int plen,
+ int offset, int allow_create,
+ int replace_required, int sernum)
+{
+ struct fib6_node *fn, *in, *ln;
+ struct fib6_node *pn = NULL;
+ struct rt6key *key;
+ int bit;
+ __be32 dir = 0;
+
+ RT6_TRACE("fib6_add_1\n");
+
+ /* insert node in tree */
+
+ fn = root;
+
+ do {
+ key = (struct rt6key *)((u8 *)fn->leaf + offset);
+
+ /*
+ * Prefix match
+ */
+ if (plen < fn->fn_bit ||
+ !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
+ if (!allow_create) {
+ if (replace_required) {
+ pr_warn("Can't replace route, no match found\n");
+ return ERR_PTR(-ENOENT);
+ }
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
+ }
+ goto insert_above;
+ }
+
+ /*
+ * Exact match ?
+ */
+
+ if (plen == fn->fn_bit) {
+ /* clean up an intermediate node */
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ rt6_release(fn->leaf);
+ fn->leaf = NULL;
+ }
+
+ fn->fn_sernum = sernum;
+
+ return fn;
+ }
+
+ /*
+ * We have more bits to go
+ */
+
+ /* Try to walk down on tree. */
+ fn->fn_sernum = sernum;
+ dir = addr_bit_set(addr, fn->fn_bit);
+ pn = fn;
+ fn = dir ? fn->right : fn->left;
+ } while (fn);
+
+ if (!allow_create) {
+ /* We should not create new node because
+ * NLM_F_REPLACE was specified without NLM_F_CREATE
+ * I assume it is safe to require NLM_F_CREATE when
+ * REPLACE flag is used! Later we may want to remove the
+ * check for replace_required, because according
+ * to netlink specification, NLM_F_CREATE
+ * MUST be specified if new route is created.
+ * That would keep IPv6 consistent with IPv4
+ */
+ if (replace_required) {
+ pr_warn("Can't replace route, no match found\n");
+ return ERR_PTR(-ENOENT);
+ }
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
+ }
+ /*
+ * We walked to the bottom of tree.
+ * Create new leaf node without children.
+ */
+
+ ln = node_alloc();
+
+ if (!ln)
+ return ERR_PTR(-ENOMEM);
+ ln->fn_bit = plen;
+
+ ln->parent = pn;
+ ln->fn_sernum = sernum;
+
+ if (dir)
+ pn->right = ln;
+ else
+ pn->left = ln;
+
+ return ln;
+
+
+insert_above:
+ /*
+ * split since we don't have a common prefix anymore or
+ * we have a less significant route.
+ * we've to insert an intermediate node on the list
+ * this new node will point to the one we need to create
+ * and the current
+ */
+
+ pn = fn->parent;
+
+ /* find 1st bit in difference between the 2 addrs.
+
+ See comment in __ipv6_addr_diff: bit may be an invalid value,
+ but if it is >= plen, the value is ignored in any case.
+ */
+
+ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
+
+ /*
+ * (intermediate)[in]
+ * / \
+ * (new leaf node)[ln] (old node)[fn]
+ */
+ if (plen > bit) {
+ in = node_alloc();
+ ln = node_alloc();
+
+ if (!in || !ln) {
+ if (in)
+ node_free(in);
+ if (ln)
+ node_free(ln);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * new intermediate node.
+ * RTN_RTINFO will
+ * be off since that an address that chooses one of
+ * the branches would not match less specific routes
+ * in the other branch
+ */
+
+ in->fn_bit = bit;
+
+ in->parent = pn;
+ in->leaf = fn->leaf;
+ atomic_inc(&in->leaf->rt6i_ref);
+
+ in->fn_sernum = sernum;
+
+ /* update parent pointer */
+ if (dir)
+ pn->right = in;
+ else
+ pn->left = in;
+
+ ln->fn_bit = plen;
+
+ ln->parent = in;
+ fn->parent = in;
+
+ ln->fn_sernum = sernum;
+
+ if (addr_bit_set(addr, bit)) {
+ in->right = ln;
+ in->left = fn;
+ } else {
+ in->left = ln;
+ in->right = fn;
+ }
+ } else { /* plen <= bit */
+
+ /*
+ * (new leaf node)[ln]
+ * / \
+ * (old node)[fn] NULL
+ */
+
+ ln = node_alloc();
+
+ if (!ln)
+ return ERR_PTR(-ENOMEM);
+
+ ln->fn_bit = plen;
+
+ ln->parent = pn;
+
+ ln->fn_sernum = sernum;
+
+ if (dir)
+ pn->right = ln;
+ else
+ pn->left = ln;
+
+ if (addr_bit_set(&key->addr, plen))
+ ln->right = fn;
+ else
+ ln->left = fn;
+
+ fn->parent = ln;
+ }
+ return ln;
+}
+
+static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
+{
+ return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+ RTF_GATEWAY;
+}
+
+static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
+{
+ int i;
+
+ for (i = 0; i < RTAX_MAX; i++) {
+ if (test_bit(i, mxc->mx_valid))
+ mp[i] = mxc->mx[i];
+ }
+}
+
+static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
+{
+ if (!mxc->mx)
+ return 0;
+
+ if (dst->flags & DST_HOST) {
+ u32 *mp = dst_metrics_write_ptr(dst);
+
+ if (unlikely(!mp))
+ return -ENOMEM;
+
+ fib6_copy_metrics(mp, mxc);
+ } else {
+ dst_init_metrics(dst, mxc->mx, false);
+
+ /* We've stolen mx now. */
+ mxc->mx = NULL;
+ }
+
+ return 0;
+}
+
+static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
+ struct net *net)
+{
+ if (atomic_read(&rt->rt6i_ref) != 1) {
+ /* This route is used as dummy address holder in some split
+ * nodes. It is not leaked, but it still holds other resources,
+ * which must be released in time. So, scan ascendant nodes
+ * and replace dummy references to this route with references
+ * to still alive ones.
+ */
+ while (fn) {
+ if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
+ fn->leaf = fib6_find_prefix(net, fn);
+ atomic_inc(&fn->leaf->rt6i_ref);
+ rt6_release(rt);
+ }
+ fn = fn->parent;
+ }
+ /* No more references are possible at this point. */
+ BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
+ }
+}
+
+/*
+ * Insert routing information in a node.
+ */
+
+static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
+ struct nl_info *info, struct mx6_config *mxc)
+{
+ struct rt6_info *iter = NULL;
+ struct rt6_info **ins;
+ struct rt6_info **fallback_ins = NULL;
+ int replace = (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_REPLACE));
+ int add = (!info->nlh ||
+ (info->nlh->nlmsg_flags & NLM_F_CREATE));
+ int found = 0;
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ int err;
+
+ ins = &fn->leaf;
+
+ for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
+ /*
+ * Search for duplicates
+ */
+
+ if (iter->rt6i_metric == rt->rt6i_metric) {
+ /*
+ * Same priority level
+ */
+ if (info->nlh &&
+ (info->nlh->nlmsg_flags & NLM_F_EXCL))
+ return -EEXIST;
+ if (replace) {
+ if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+ found++;
+ break;
+ }
+ if (rt_can_ecmp)
+ fallback_ins = fallback_ins ?: ins;
+ goto next_iter;
+ }
+
+ if (iter->dst.dev == rt->dst.dev &&
+ iter->rt6i_idev == rt->rt6i_idev &&
+ ipv6_addr_equal(&iter->rt6i_gateway,
+ &rt->rt6i_gateway)) {
+ if (rt->rt6i_nsiblings)
+ rt->rt6i_nsiblings = 0;
+ if (!(iter->rt6i_flags & RTF_EXPIRES))
+ return -EEXIST;
+ if (!(rt->rt6i_flags & RTF_EXPIRES))
+ rt6_clean_expires(iter);
+ else
+ rt6_set_expires(iter, rt->dst.expires);
+ return -EEXIST;
+ }
+ /* If we have the same destination and the same metric,
+ * but not the same gateway, then the route we try to
+ * add is sibling to this route, increment our counter
+ * of siblings, and later we will add our route to the
+ * list.
+ * Only static routes (which don't have flag
+ * RTF_EXPIRES) are used for ECMPv6.
+ *
+ * To avoid long list, we only had siblings if the
+ * route have a gateway.
+ */
+ if (rt_can_ecmp &&
+ rt6_qualify_for_ecmp(iter))
+ rt->rt6i_nsiblings++;
+ }
+
+ if (iter->rt6i_metric > rt->rt6i_metric)
+ break;
+
+next_iter:
+ ins = &iter->dst.rt6_next;
+ }
+
+ if (fallback_ins && !found) {
+ /* No ECMP-able route found, replace first non-ECMP one */
+ ins = fallback_ins;
+ iter = *ins;
+ found++;
+ }
+
+ /* Reset round-robin state, if necessary */
+ if (ins == &fn->leaf)
+ fn->rr_ptr = NULL;
+
+ /* Link this route to others same route. */
+ if (rt->rt6i_nsiblings) {
+ unsigned int rt6i_nsiblings;
+ struct rt6_info *sibling, *temp_sibling;
+
+ /* Find the first route that have the same metric */
+ sibling = fn->leaf;
+ while (sibling) {
+ if (sibling->rt6i_metric == rt->rt6i_metric &&
+ rt6_qualify_for_ecmp(sibling)) {
+ list_add_tail(&rt->rt6i_siblings,
+ &sibling->rt6i_siblings);
+ break;
+ }
+ sibling = sibling->dst.rt6_next;
+ }
+ /* For each sibling in the list, increment the counter of
+ * siblings. BUG() if counters does not match, list of siblings
+ * is broken!
+ */
+ rt6i_nsiblings = 0;
+ list_for_each_entry_safe(sibling, temp_sibling,
+ &rt->rt6i_siblings, rt6i_siblings) {
+ sibling->rt6i_nsiblings++;
+ BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
+ rt6i_nsiblings++;
+ }
+ BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+ }
+
+ /*
+ * insert node
+ */
+ if (!replace) {
+ if (!add)
+ pr_warn("NLM_F_CREATE should be set when creating new route\n");
+
+add:
+ err = fib6_commit_metrics(&rt->dst, mxc);
+ if (err)
+ return err;
+
+ rt->dst.rt6_next = iter;
+ *ins = rt;
+ rt->rt6i_node = fn;
+ atomic_inc(&rt->rt6i_ref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info);
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
+
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+ fn->fn_flags |= RTN_RTINFO;
+ }
+
+ } else {
+ int nsiblings;
+
+ if (!found) {
+ if (add)
+ goto add;
+ pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
+ return -ENOENT;
+ }
+
+ err = fib6_commit_metrics(&rt->dst, mxc);
+ if (err)
+ return err;
+
+ *ins = rt;
+ rt->rt6i_node = fn;
+ rt->dst.rt6_next = iter->dst.rt6_next;
+ atomic_inc(&rt->rt6i_ref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, info);
+ if (!(fn->fn_flags & RTN_RTINFO)) {
+ info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+ fn->fn_flags |= RTN_RTINFO;
+ }
+ nsiblings = iter->rt6i_nsiblings;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ rt6_release(iter);
+
+ if (nsiblings) {
+ /* Replacing an ECMP route, remove all siblings */
+ ins = &rt->dst.rt6_next;
+ iter = *ins;
+ while (iter) {
+ if (rt6_qualify_for_ecmp(iter)) {
+ *ins = iter->dst.rt6_next;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ rt6_release(iter);
+ nsiblings--;
+ } else {
+ ins = &iter->dst.rt6_next;
+ }
+ iter = *ins;
+ }
+ WARN_ON(nsiblings != 0);
+ }
+ }
+
+ return 0;
+}
+
+static void fib6_start_gc(struct net *net, struct rt6_info *rt)
+{
+ if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
+ (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+}
+
+void fib6_force_start_gc(struct net *net)
+{
+ if (!timer_pending(&net->ipv6.ip6_fib_timer))
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+}
+
+/*
+ * Add routing information to the routing tree.
+ * <destination addr>/<source addr>
+ * with source addr info in sub-trees
+ */
+
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+ struct nl_info *info, struct mx6_config *mxc)
+{
+ struct fib6_node *fn, *pn = NULL;
+ int err = -ENOMEM;
+ int allow_create = 1;
+ int replace_required = 0;
+ int sernum = fib6_new_sernum(info->nl_net);
+
+ if (info->nlh) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
+ allow_create = 0;
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
+ replace_required = 1;
+ }
+ if (!allow_create && !replace_required)
+ pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
+
+ fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
+ offsetof(struct rt6_info, rt6i_dst), allow_create,
+ replace_required, sernum);
+ if (IS_ERR(fn)) {
+ err = PTR_ERR(fn);
+ fn = NULL;
+ goto out;
+ }
+
+ pn = fn;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (rt->rt6i_src.plen) {
+ struct fib6_node *sn;
+
+ if (!fn->subtree) {
+ struct fib6_node *sfn;
+
+ /*
+ * Create subtree.
+ *
+ * fn[main tree]
+ * |
+ * sfn[subtree root]
+ * \
+ * sn[new leaf node]
+ */
+
+ /* Create subtree root node */
+ sfn = node_alloc();
+ if (!sfn)
+ goto st_failure;
+
+ sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
+ atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+ sfn->fn_flags = RTN_ROOT;
+ sfn->fn_sernum = sernum;
+
+ /* Now add the first leaf node to new subtree */
+
+ sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
+ rt->rt6i_src.plen,
+ offsetof(struct rt6_info, rt6i_src),
+ allow_create, replace_required, sernum);
+
+ if (IS_ERR(sn)) {
+ /* If it is failed, discard just allocated
+ root, and then (in st_failure) stale node
+ in main tree.
+ */
+ node_free(sfn);
+ err = PTR_ERR(sn);
+ goto st_failure;
+ }
+
+ /* Now link new subtree to main tree */
+ sfn->parent = fn;
+ fn->subtree = sfn;
+ } else {
+ sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
+ rt->rt6i_src.plen,
+ offsetof(struct rt6_info, rt6i_src),
+ allow_create, replace_required, sernum);
+
+ if (IS_ERR(sn)) {
+ err = PTR_ERR(sn);
+ goto st_failure;
+ }
+ }
+
+ if (!fn->leaf) {
+ fn->leaf = rt;
+ atomic_inc(&rt->rt6i_ref);
+ }
+ fn = sn;
+ }
+#endif
+
+ err = fib6_add_rt2node(fn, rt, info, mxc);
+ if (!err) {
+ fib6_start_gc(info->nl_net, rt);
+ if (!(rt->rt6i_flags & RTF_CACHE))
+ fib6_prune_clones(info->nl_net, pn);
+ }
+
+out:
+ if (err) {
+#ifdef CONFIG_IPV6_SUBTREES
+ /*
+ * If fib6_add_1 has cleared the old leaf pointer in the
+ * super-tree leaf node we have to find a new one for it.
+ */
+ if (pn != fn && pn->leaf == rt) {
+ pn->leaf = NULL;
+ atomic_dec(&rt->rt6i_ref);
+ }
+ if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
+ pn->leaf = fib6_find_prefix(info->nl_net, pn);
+#if RT6_DEBUG >= 2
+ if (!pn->leaf) {
+ WARN_ON(pn->leaf == NULL);
+ pn->leaf = info->nl_net->ipv6.ip6_null_entry;
+ }
+#endif
+ atomic_inc(&pn->leaf->rt6i_ref);
+ }
+#endif
+ dst_free(&rt->dst);
+ }
+ return err;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Subtree creation failed, probably main tree node
+ is orphan. If it is, shoot it.
+ */
+st_failure:
+ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
+ fib6_repair_tree(info->nl_net, fn);
+ dst_free(&rt->dst);
+ return err;
+#endif
+}
+
+/*
+ * Routing tree lookup
+ *
+ */
+
+struct lookup_args {
+ int offset; /* key offset on rt6_info */
+ const struct in6_addr *addr; /* search key */
+};
+
+static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
+{
+ struct fib6_node *fn;
+ __be32 dir;
+
+ if (unlikely(args->offset == 0))
+ return NULL;
+
+ /*
+ * Descend on a tree
+ */
+
+ fn = root;
+
+ for (;;) {
+ struct fib6_node *next;
+
+ dir = addr_bit_set(args->addr, fn->fn_bit);
+
+ next = dir ? fn->right : fn->left;
+
+ if (next) {
+ fn = next;
+ continue;
+ }
+ break;
+ }
+
+ while (fn) {
+ if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+ struct rt6key *key;
+
+ key = (struct rt6key *) ((u8 *) fn->leaf +
+ args->offset);
+
+ if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
+#ifdef CONFIG_IPV6_SUBTREES
+ if (fn->subtree) {
+ struct fib6_node *sfn;
+ sfn = fib6_lookup_1(fn->subtree,
+ args + 1);
+ if (!sfn)
+ goto backtrack;
+ fn = sfn;
+ }
+#endif
+ if (fn->fn_flags & RTN_RTINFO)
+ return fn;
+ }
+ }
+#ifdef CONFIG_IPV6_SUBTREES
+backtrack:
+#endif
+ if (fn->fn_flags & RTN_ROOT)
+ break;
+
+ fn = fn->parent;
+ }
+
+ return NULL;
+}
+
+struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct fib6_node *fn;
+ struct lookup_args args[] = {
+ {
+ .offset = offsetof(struct rt6_info, rt6i_dst),
+ .addr = daddr,
+ },
+#ifdef CONFIG_IPV6_SUBTREES
+ {
+ .offset = offsetof(struct rt6_info, rt6i_src),
+ .addr = saddr,
+ },
+#endif
+ {
+ .offset = 0, /* sentinel */
+ }
+ };
+
+ fn = fib6_lookup_1(root, daddr ? args : args + 1);
+ if (!fn || fn->fn_flags & RTN_TL_ROOT)
+ fn = root;
+
+ return fn;
+}
+
+/*
+ * Get node with specified destination prefix (and source prefix,
+ * if subtrees are used)
+ */
+
+
+static struct fib6_node *fib6_locate_1(struct fib6_node *root,
+ const struct in6_addr *addr,
+ int plen, int offset)
+{
+ struct fib6_node *fn;
+
+ for (fn = root; fn ; ) {
+ struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+
+ /*
+ * Prefix match
+ */
+ if (plen < fn->fn_bit ||
+ !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
+ return NULL;
+
+ if (plen == fn->fn_bit)
+ return fn;
+
+ /*
+ * We have more bits to go
+ */
+ if (addr_bit_set(addr, fn->fn_bit))
+ fn = fn->right;
+ else
+ fn = fn->left;
+ }
+ return NULL;
+}
+
+struct fib6_node *fib6_locate(struct fib6_node *root,
+ const struct in6_addr *daddr, int dst_len,
+ const struct in6_addr *saddr, int src_len)
+{
+ struct fib6_node *fn;
+
+ fn = fib6_locate_1(root, daddr, dst_len,
+ offsetof(struct rt6_info, rt6i_dst));
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (src_len) {
+ WARN_ON(saddr == NULL);
+ if (fn && fn->subtree)
+ fn = fib6_locate_1(fn->subtree, saddr, src_len,
+ offsetof(struct rt6_info, rt6i_src));
+ }
+#endif
+
+ if (fn && fn->fn_flags & RTN_RTINFO)
+ return fn;
+
+ return NULL;
+}
+
+
+/*
+ * Deletion
+ *
+ */
+
+static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
+{
+ if (fn->fn_flags & RTN_ROOT)
+ return net->ipv6.ip6_null_entry;
+
+ while (fn) {
+ if (fn->left)
+ return fn->left->leaf;
+ if (fn->right)
+ return fn->right->leaf;
+
+ fn = FIB6_SUBTREE(fn);
+ }
+ return NULL;
+}
+
+/*
+ * Called to trim the tree of intermediate nodes when possible. "fn"
+ * is the node we want to try and remove.
+ */
+
+static struct fib6_node *fib6_repair_tree(struct net *net,
+ struct fib6_node *fn)
+{
+ int children;
+ int nstate;
+ struct fib6_node *child, *pn;
+ struct fib6_walker *w;
+ int iter = 0;
+
+ for (;;) {
+ RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+ iter++;
+
+ WARN_ON(fn->fn_flags & RTN_RTINFO);
+ WARN_ON(fn->fn_flags & RTN_TL_ROOT);
+ WARN_ON(fn->leaf);
+
+ children = 0;
+ child = NULL;
+ if (fn->right)
+ child = fn->right, children |= 1;
+ if (fn->left)
+ child = fn->left, children |= 2;
+
+ if (children == 3 || FIB6_SUBTREE(fn)
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Subtree root (i.e. fn) may have one child */
+ || (children && fn->fn_flags & RTN_ROOT)
+#endif
+ ) {
+ fn->leaf = fib6_find_prefix(net, fn);
+#if RT6_DEBUG >= 2
+ if (!fn->leaf) {
+ WARN_ON(!fn->leaf);
+ fn->leaf = net->ipv6.ip6_null_entry;
+ }
+#endif
+ atomic_inc(&fn->leaf->rt6i_ref);
+ return fn->parent;
+ }
+
+ pn = fn->parent;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (FIB6_SUBTREE(pn) == fn) {
+ WARN_ON(!(fn->fn_flags & RTN_ROOT));
+ FIB6_SUBTREE(pn) = NULL;
+ nstate = FWS_L;
+ } else {
+ WARN_ON(fn->fn_flags & RTN_ROOT);
+#endif
+ if (pn->right == fn)
+ pn->right = child;
+ else if (pn->left == fn)
+ pn->left = child;
+#if RT6_DEBUG >= 2
+ else
+ WARN_ON(1);
+#endif
+ if (child)
+ child->parent = pn;
+ nstate = FWS_R;
+#ifdef CONFIG_IPV6_SUBTREES
+ }
+#endif
+
+ read_lock(&fib6_walker_lock);
+ FOR_WALKERS(w) {
+ if (!child) {
+ if (w->root == fn) {
+ w->root = w->node = NULL;
+ RT6_TRACE("W %p adjusted by delroot 1\n", w);
+ } else if (w->node == fn) {
+ RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+ w->node = pn;
+ w->state = nstate;
+ }
+ } else {
+ if (w->root == fn) {
+ w->root = child;
+ RT6_TRACE("W %p adjusted by delroot 2\n", w);
+ }
+ if (w->node == fn) {
+ w->node = child;
+ if (children&2) {
+ RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
+ } else {
+ RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
+ }
+ }
+ }
+ }
+ read_unlock(&fib6_walker_lock);
+
+ node_free(fn);
+ if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
+ return pn;
+
+ rt6_release(pn->leaf);
+ pn->leaf = NULL;
+ fn = pn;
+ }
+}
+
+static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
+ struct nl_info *info)
+{
+ struct fib6_walker *w;
+ struct rt6_info *rt = *rtp;
+ struct net *net = info->nl_net;
+
+ RT6_TRACE("fib6_del_route\n");
+
+ /* Unlink it */
+ *rtp = rt->dst.rt6_next;
+ rt->rt6i_node = NULL;
+ net->ipv6.rt6_stats->fib_rt_entries--;
+ net->ipv6.rt6_stats->fib_discarded_routes++;
+
+ /* Reset round-robin state, if necessary */
+ if (fn->rr_ptr == rt)
+ fn->rr_ptr = NULL;
+
+ /* Remove this entry from other siblings */
+ if (rt->rt6i_nsiblings) {
+ struct rt6_info *sibling, *next_sibling;
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->rt6i_siblings, rt6i_siblings)
+ sibling->rt6i_nsiblings--;
+ rt->rt6i_nsiblings = 0;
+ list_del_init(&rt->rt6i_siblings);
+ }
+
+ /* Adjust walkers */
+ read_lock(&fib6_walker_lock);
+ FOR_WALKERS(w) {
+ if (w->state == FWS_C && w->leaf == rt) {
+ RT6_TRACE("walker %p adjusted by delroute\n", w);
+ w->leaf = rt->dst.rt6_next;
+ if (!w->leaf)
+ w->state = FWS_U;
+ }
+ }
+ read_unlock(&fib6_walker_lock);
+
+ rt->dst.rt6_next = NULL;
+
+ /* If it was last route, expunge its radix tree node */
+ if (!fn->leaf) {
+ fn->fn_flags &= ~RTN_RTINFO;
+ net->ipv6.rt6_stats->fib_route_nodes--;
+ fn = fib6_repair_tree(net, fn);
+ }
+
+ fib6_purge_rt(rt, fn, net);
+
+ inet6_rt_notify(RTM_DELROUTE, rt, info);
+ rt6_release(rt);
+}
+
+int fib6_del(struct rt6_info *rt, struct nl_info *info)
+{
+ struct net *net = info->nl_net;
+ struct fib6_node *fn = rt->rt6i_node;
+ struct rt6_info **rtp;
+
+#if RT6_DEBUG >= 2
+ if (rt->dst.obsolete > 0) {
+ WARN_ON(fn);
+ return -ENOENT;
+ }
+#endif
+ if (!fn || rt == net->ipv6.ip6_null_entry)
+ return -ENOENT;
+
+ WARN_ON(!(fn->fn_flags & RTN_RTINFO));
+
+ if (!(rt->rt6i_flags & RTF_CACHE)) {
+ struct fib6_node *pn = fn;
+#ifdef CONFIG_IPV6_SUBTREES
+ /* clones of this route might be in another subtree */
+ if (rt->rt6i_src.plen) {
+ while (!(pn->fn_flags & RTN_ROOT))
+ pn = pn->parent;
+ pn = pn->parent;
+ }
+#endif
+ fib6_prune_clones(info->nl_net, pn);
+ }
+
+ /*
+ * Walk the leaf entries looking for ourself
+ */
+
+ for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
+ if (*rtp == rt) {
+ fib6_del_route(fn, rtp, info);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+/*
+ * Tree traversal function.
+ *
+ * Certainly, it is not interrupt safe.
+ * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
+ * It means, that we can modify tree during walking
+ * and use this function for garbage collection, clone pruning,
+ * cleaning tree when a device goes down etc. etc.
+ *
+ * It guarantees that every node will be traversed,
+ * and that it will be traversed only once.
+ *
+ * Callback function w->func may return:
+ * 0 -> continue walking.
+ * positive value -> walking is suspended (used by tree dumps,
+ * and probably by gc, if it will be split to several slices)
+ * negative value -> terminate walking.
+ *
+ * The function itself returns:
+ * 0 -> walk is complete.
+ * >0 -> walk is incomplete (i.e. suspended)
+ * <0 -> walk is terminated by an error.
+ */
+
+static int fib6_walk_continue(struct fib6_walker *w)
+{
+ struct fib6_node *fn, *pn;
+
+ for (;;) {
+ fn = w->node;
+ if (!fn)
+ return 0;
+
+ if (w->prune && fn != w->root &&
+ fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
+ w->state = FWS_C;
+ w->leaf = fn->leaf;
+ }
+ switch (w->state) {
+#ifdef CONFIG_IPV6_SUBTREES
+ case FWS_S:
+ if (FIB6_SUBTREE(fn)) {
+ w->node = FIB6_SUBTREE(fn);
+ continue;
+ }
+ w->state = FWS_L;
+#endif
+ case FWS_L:
+ if (fn->left) {
+ w->node = fn->left;
+ w->state = FWS_INIT;
+ continue;
+ }
+ w->state = FWS_R;
+ case FWS_R:
+ if (fn->right) {
+ w->node = fn->right;
+ w->state = FWS_INIT;
+ continue;
+ }
+ w->state = FWS_C;
+ w->leaf = fn->leaf;
+ case FWS_C:
+ if (w->leaf && fn->fn_flags & RTN_RTINFO) {
+ int err;
+
+ if (w->skip) {
+ w->skip--;
+ goto skip;
+ }
+
+ err = w->func(w);
+ if (err)
+ return err;
+
+ w->count++;
+ continue;
+ }
+skip:
+ w->state = FWS_U;
+ case FWS_U:
+ if (fn == w->root)
+ return 0;
+ pn = fn->parent;
+ w->node = pn;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (FIB6_SUBTREE(pn) == fn) {
+ WARN_ON(!(fn->fn_flags & RTN_ROOT));
+ w->state = FWS_L;
+ continue;
+ }
+#endif
+ if (pn->left == fn) {
+ w->state = FWS_R;
+ continue;
+ }
+ if (pn->right == fn) {
+ w->state = FWS_C;
+ w->leaf = w->node->leaf;
+ continue;
+ }
+#if RT6_DEBUG >= 2
+ WARN_ON(1);
+#endif
+ }
+ }
+}
+
+static int fib6_walk(struct fib6_walker *w)
+{
+ int res;
+
+ w->state = FWS_INIT;
+ w->node = w->root;
+
+ fib6_walker_link(w);
+ res = fib6_walk_continue(w);
+ if (res <= 0)
+ fib6_walker_unlink(w);
+ return res;
+}
+
+static int fib6_clean_node(struct fib6_walker *w)
+{
+ int res;
+ struct rt6_info *rt;
+ struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
+ struct nl_info info = {
+ .nl_net = c->net,
+ };
+
+ if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
+ w->node->fn_sernum != c->sernum)
+ w->node->fn_sernum = c->sernum;
+
+ if (!c->func) {
+ WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
+ w->leaf = NULL;
+ return 0;
+ }
+
+ for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+ res = c->func(rt, c->arg);
+ if (res < 0) {
+ w->leaf = rt;
+ res = fib6_del(rt, &info);
+ if (res) {
+#if RT6_DEBUG >= 2
+ pr_debug("%s: del failed: rt=%p@%p err=%d\n",
+ __func__, rt, rt->rt6i_node, res);
+#endif
+ continue;
+ }
+ return 0;
+ }
+ WARN_ON(res != 0);
+ }
+ w->leaf = rt;
+ return 0;
+}
+
+/*
+ * Convenient frontend to tree walker.
+ *
+ * func is called on each route.
+ * It may return -1 -> delete this route.
+ * 0 -> continue walking
+ *
+ * prune==1 -> only immediate children of node (certainly,
+ * ignoring pure split nodes) will be scanned.
+ */
+
+static void fib6_clean_tree(struct net *net, struct fib6_node *root,
+ int (*func)(struct rt6_info *, void *arg),
+ bool prune, int sernum, void *arg)
+{
+ struct fib6_cleaner c;
+
+ c.w.root = root;
+ c.w.func = fib6_clean_node;
+ c.w.prune = prune;
+ c.w.count = 0;
+ c.w.skip = 0;
+ c.func = func;
+ c.sernum = sernum;
+ c.arg = arg;
+ c.net = net;
+
+ fib6_walk(&c.w);
+}
+
+static void __fib6_clean_all(struct net *net,
+ int (*func)(struct rt6_info *, void *),
+ int sernum, void *arg)
+{
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ write_lock_bh(&table->tb6_lock);
+ fib6_clean_tree(net, &table->tb6_root,
+ func, false, sernum, arg);
+ write_unlock_bh(&table->tb6_lock);
+ }
+ }
+ rcu_read_unlock();
+}
+
+void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
+ void *arg)
+{
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+}
+
+static int fib6_prune_clone(struct rt6_info *rt, void *arg)
+{
+ if (rt->rt6i_flags & RTF_CACHE) {
+ RT6_TRACE("pruning clone %p\n", rt);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
+{
+ fib6_clean_tree(net, fn, fib6_prune_clone, true,
+ FIB6_NO_SERNUM_CHANGE, NULL);
+}
+
+static void fib6_flush_trees(struct net *net)
+{
+ int new_sernum = fib6_new_sernum(net);
+
+ __fib6_clean_all(net, NULL, new_sernum, NULL);
+}
+
+/*
+ * Garbage collection
+ */
+
+static struct fib6_gc_args
+{
+ int timeout;
+ int more;
+} gc_args;
+
+static int fib6_age(struct rt6_info *rt, void *arg)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * check addrconf expiration here.
+ * Routes are expired even if they are in use.
+ *
+ * Also age clones. Note, that clones are aged out
+ * only if they are not in use now.
+ */
+
+ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
+ if (time_after(now, rt->dst.expires)) {
+ RT6_TRACE("expiring %p\n", rt);
+ return -1;
+ }
+ gc_args.more++;
+ } else if (rt->rt6i_flags & RTF_CACHE) {
+ if (atomic_read(&rt->dst.__refcnt) == 0 &&
+ time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
+ RT6_TRACE("aging clone %p\n", rt);
+ return -1;
+ } else if (rt->rt6i_flags & RTF_GATEWAY) {
+ struct neighbour *neigh;
+ __u8 neigh_flags = 0;
+
+ neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
+ if (neigh) {
+ neigh_flags = neigh->flags;
+ neigh_release(neigh);
+ }
+ if (!(neigh_flags & NTF_ROUTER)) {
+ RT6_TRACE("purging route %p via non-router but gateway\n",
+ rt);
+ return -1;
+ }
+ }
+ gc_args.more++;
+ }
+
+ return 0;
+}
+
+static DEFINE_SPINLOCK(fib6_gc_lock);
+
+void fib6_run_gc(unsigned long expires, struct net *net, bool force)
+{
+ unsigned long now;
+
+ if (force) {
+ spin_lock_bh(&fib6_gc_lock);
+ } else if (!spin_trylock_bh(&fib6_gc_lock)) {
+ mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
+ return;
+ }
+ gc_args.timeout = expires ? (int)expires :
+ net->ipv6.sysctl.ip6_rt_gc_interval;
+
+ gc_args.more = icmp6_dst_gc();
+
+ fib6_clean_all(net, fib6_age, NULL);
+ now = jiffies;
+ net->ipv6.ip6_rt_last_gc = now;
+
+ if (gc_args.more)
+ mod_timer(&net->ipv6.ip6_fib_timer,
+ round_jiffies(now
+ + net->ipv6.sysctl.ip6_rt_gc_interval));
+ else
+ del_timer(&net->ipv6.ip6_fib_timer);
+ spin_unlock_bh(&fib6_gc_lock);
+}
+
+static void fib6_gc_timer_cb(unsigned long arg)
+{
+ fib6_run_gc(0, (struct net *)arg, true);
+}
+
+static int __net_init fib6_net_init(struct net *net)
+{
+ size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+
+ setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
+
+ net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
+ if (!net->ipv6.rt6_stats)
+ goto out_timer;
+
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
+ if (!net->ipv6.fib_table_hash)
+ goto out_rt6_stats;
+
+ net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
+ GFP_KERNEL);
+ if (!net->ipv6.fib6_main_tbl)
+ goto out_fib_table_hash;
+
+ net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+ net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
+ RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
+ GFP_KERNEL);
+ if (!net->ipv6.fib6_local_tbl)
+ goto out_fib6_main_tbl;
+ net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+ net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+ net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
+ RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+ inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
+#endif
+ fib6_tables_init(net);
+
+ return 0;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_fib6_main_tbl:
+ kfree(net->ipv6.fib6_main_tbl);
+#endif
+out_fib_table_hash:
+ kfree(net->ipv6.fib_table_hash);
+out_rt6_stats:
+ kfree(net->ipv6.rt6_stats);
+out_timer:
+ return -ENOMEM;
+}
+
+static void fib6_net_exit(struct net *net)
+{
+ rt6_ifdown(net, NULL);
+ del_timer_sync(&net->ipv6.ip6_fib_timer);
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
+ kfree(net->ipv6.fib6_local_tbl);
+#endif
+ inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
+ kfree(net->ipv6.fib6_main_tbl);
+ kfree(net->ipv6.fib_table_hash);
+ kfree(net->ipv6.rt6_stats);
+}
+
+static struct pernet_operations fib6_net_ops = {
+ .init = fib6_net_init,
+ .exit = fib6_net_exit,
+};
+
+int __init fib6_init(void)
+{
+ int ret = -ENOMEM;
+
+ fib6_node_kmem = kmem_cache_create("fib6_nodes",
+ sizeof(struct fib6_node),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!fib6_node_kmem)
+ goto out;
+
+ ret = register_pernet_subsys(&fib6_net_ops);
+ if (ret)
+ goto out_kmem_cache_create;
+
+ ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
+ NULL);
+ if (ret)
+ goto out_unregister_subsys;
+
+ __fib6_flush_trees = fib6_flush_trees;
+out:
+ return ret;
+
+out_unregister_subsys:
+ unregister_pernet_subsys(&fib6_net_ops);
+out_kmem_cache_create:
+ kmem_cache_destroy(fib6_node_kmem);
+ goto out;
+}
+
+void fib6_gc_cleanup(void)
+{
+ unregister_pernet_subsys(&fib6_net_ops);
+ kmem_cache_destroy(fib6_node_kmem);
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct ipv6_route_iter {
+ struct seq_net_private p;
+ struct fib6_walker w;
+ loff_t skip;
+ struct fib6_table *tbl;
+ int sernum;
+};
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct rt6_info *rt = v;
+ struct ipv6_route_iter *iter = seq->private;
+
+ seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+#else
+ seq_puts(seq, "00000000000000000000000000000000 00 ");
+#endif
+ if (rt->rt6i_flags & RTF_GATEWAY)
+ seq_printf(seq, "%pi6", &rt->rt6i_gateway);
+ else
+ seq_puts(seq, "00000000000000000000000000000000");
+
+ seq_printf(seq, " %08x %08x %08x %08x %8s\n",
+ rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
+ rt->dst.__use, rt->rt6i_flags,
+ rt->dst.dev ? rt->dst.dev->name : "");
+ iter->w.leaf = NULL;
+ return 0;
+}
+
+static int ipv6_route_yield(struct fib6_walker *w)
+{
+ struct ipv6_route_iter *iter = w->args;
+
+ if (!iter->skip)
+ return 1;
+
+ do {
+ iter->w.leaf = iter->w.leaf->dst.rt6_next;
+ iter->skip--;
+ if (!iter->skip && iter->w.leaf)
+ return 1;
+ } while (iter->w.leaf);
+
+ return 0;
+}
+
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
+{
+ memset(&iter->w, 0, sizeof(iter->w));
+ iter->w.func = ipv6_route_yield;
+ iter->w.root = &iter->tbl->tb6_root;
+ iter->w.state = FWS_INIT;
+ iter->w.node = iter->w.root;
+ iter->w.args = iter;
+ iter->sernum = iter->w.root->fn_sernum;
+ INIT_LIST_HEAD(&iter->w.lh);
+ fib6_walker_link(&iter->w);
+}
+
+static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
+ struct net *net)
+{
+ unsigned int h;
+ struct hlist_node *node;
+
+ if (tbl) {
+ h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
+ node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
+ } else {
+ h = 0;
+ node = NULL;
+ }
+
+ while (!node && h < FIB6_TABLE_HASHSZ) {
+ node = rcu_dereference_bh(
+ hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
+ }
+ return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
+}
+
+static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
+{
+ if (iter->sernum != iter->w.root->fn_sernum) {
+ iter->sernum = iter->w.root->fn_sernum;
+ iter->w.state = FWS_INIT;
+ iter->w.node = iter->w.root;
+ WARN_ON(iter->w.skip);
+ iter->w.skip = iter->w.count;
+ }
+}
+
+static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int r;
+ struct rt6_info *n;
+ struct net *net = seq_file_net(seq);
+ struct ipv6_route_iter *iter = seq->private;
+
+ if (!v)
+ goto iter_table;
+
+ n = ((struct rt6_info *)v)->dst.rt6_next;
+ if (n) {
+ ++*pos;
+ return n;
+ }
+
+iter_table:
+ ipv6_route_check_sernum(iter);
+ read_lock(&iter->tbl->tb6_lock);
+ r = fib6_walk_continue(&iter->w);
+ read_unlock(&iter->tbl->tb6_lock);
+ if (r > 0) {
+ if (v)
+ ++*pos;
+ return iter->w.leaf;
+ } else if (r < 0) {
+ fib6_walker_unlink(&iter->w);
+ return NULL;
+ }
+ fib6_walker_unlink(&iter->w);
+
+ iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
+ if (!iter->tbl)
+ return NULL;
+
+ ipv6_route_seq_setup_walk(iter);
+ goto iter_table;
+}
+
+static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU_BH)
+{
+ struct net *net = seq_file_net(seq);
+ struct ipv6_route_iter *iter = seq->private;
+
+ rcu_read_lock_bh();
+ iter->tbl = ipv6_route_seq_next_table(NULL, net);
+ iter->skip = *pos;
+
+ if (iter->tbl) {
+ ipv6_route_seq_setup_walk(iter);
+ return ipv6_route_seq_next(seq, NULL, pos);
+ } else {
+ return NULL;
+ }
+}
+
+static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
+{
+ struct fib6_walker *w = &iter->w;
+ return w->node && !(w->state == FWS_U && w->node == w->root);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU_BH)
+{
+ struct ipv6_route_iter *iter = seq->private;
+
+ if (ipv6_route_iter_active(iter))
+ fib6_walker_unlink(&iter->w);
+
+ rcu_read_unlock_bh();
+}
+
+static const struct seq_operations ipv6_route_seq_ops = {
+ .start = ipv6_route_seq_start,
+ .next = ipv6_route_seq_next,
+ .stop = ipv6_route_seq_stop,
+ .show = ipv6_route_seq_show
+};
+
+int ipv6_route_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter));
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
new file mode 100644
index 000000000..d49112501
--- /dev/null
+++ b/net/ipv6/ip6_flowlabel.c
@@ -0,0 +1,881 @@
+/*
+ * ip6_flowlabel.c IPv6 flowlabel manager.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/pid_namespace.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include <net/ipv6.h>
+#include <net/rawv6.h>
+#include <net/transp_v6.h>
+
+#include <asm/uaccess.h>
+
+#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified
+ in old IPv6 RFC. Well, it was reasonable value.
+ */
+#define FL_MAX_LINGER 150 /* Maximal linger timeout */
+
+/* FL hash table */
+
+#define FL_MAX_PER_SOCK 32
+#define FL_MAX_SIZE 4096
+#define FL_HASH_MASK 255
+#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK)
+
+static atomic_t fl_size = ATOMIC_INIT(0);
+static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
+
+static void ip6_fl_gc(unsigned long dummy);
+static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0);
+
+/* FL hash table lock: it protects only of GC */
+
+static DEFINE_SPINLOCK(ip6_fl_lock);
+
+/* Big socket sock */
+
+static DEFINE_SPINLOCK(ip6_sk_fl_lock);
+
+#define for_each_fl_rcu(hash, fl) \
+ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \
+ fl != NULL; \
+ fl = rcu_dereference_bh(fl->next))
+#define for_each_fl_continue_rcu(fl) \
+ for (fl = rcu_dereference_bh(fl->next); \
+ fl != NULL; \
+ fl = rcu_dereference_bh(fl->next))
+
+#define for_each_sk_fl_rcu(np, sfl) \
+ for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \
+ sfl != NULL; \
+ sfl = rcu_dereference_bh(sfl->next))
+
+static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
+{
+ struct ip6_flowlabel *fl;
+
+ for_each_fl_rcu(FL_HASH(label), fl) {
+ if (fl->label == label && net_eq(fl->fl_net, net))
+ return fl;
+ }
+ return NULL;
+}
+
+static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
+{
+ struct ip6_flowlabel *fl;
+
+ rcu_read_lock_bh();
+ fl = __fl_lookup(net, label);
+ if (fl && !atomic_inc_not_zero(&fl->users))
+ fl = NULL;
+ rcu_read_unlock_bh();
+ return fl;
+}
+
+
+static void fl_free(struct ip6_flowlabel *fl)
+{
+ if (fl) {
+ if (fl->share == IPV6_FL_S_PROCESS)
+ put_pid(fl->owner.pid);
+ kfree(fl->opt);
+ kfree_rcu(fl, rcu);
+ }
+}
+
+static void fl_release(struct ip6_flowlabel *fl)
+{
+ spin_lock_bh(&ip6_fl_lock);
+
+ fl->lastuse = jiffies;
+ if (atomic_dec_and_test(&fl->users)) {
+ unsigned long ttd = fl->lastuse + fl->linger;
+ if (time_after(ttd, fl->expires))
+ fl->expires = ttd;
+ ttd = fl->expires;
+ if (fl->opt && fl->share == IPV6_FL_S_EXCL) {
+ struct ipv6_txoptions *opt = fl->opt;
+ fl->opt = NULL;
+ kfree(opt);
+ }
+ if (!timer_pending(&ip6_fl_gc_timer) ||
+ time_after(ip6_fl_gc_timer.expires, ttd))
+ mod_timer(&ip6_fl_gc_timer, ttd);
+ }
+ spin_unlock_bh(&ip6_fl_lock);
+}
+
+static void ip6_fl_gc(unsigned long dummy)
+{
+ int i;
+ unsigned long now = jiffies;
+ unsigned long sched = 0;
+
+ spin_lock(&ip6_fl_lock);
+
+ for (i = 0; i <= FL_HASH_MASK; i++) {
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
+
+ flp = &fl_ht[i];
+ while ((fl = rcu_dereference_protected(*flp,
+ lockdep_is_held(&ip6_fl_lock))) != NULL) {
+ if (atomic_read(&fl->users) == 0) {
+ unsigned long ttd = fl->lastuse + fl->linger;
+ if (time_after(ttd, fl->expires))
+ fl->expires = ttd;
+ ttd = fl->expires;
+ if (time_after_eq(now, ttd)) {
+ *flp = fl->next;
+ fl_free(fl);
+ atomic_dec(&fl_size);
+ continue;
+ }
+ if (!sched || time_before(ttd, sched))
+ sched = ttd;
+ }
+ flp = &fl->next;
+ }
+ }
+ if (!sched && atomic_read(&fl_size))
+ sched = now + FL_MAX_LINGER;
+ if (sched) {
+ mod_timer(&ip6_fl_gc_timer, sched);
+ }
+ spin_unlock(&ip6_fl_lock);
+}
+
+static void __net_exit ip6_fl_purge(struct net *net)
+{
+ int i;
+
+ spin_lock_bh(&ip6_fl_lock);
+ for (i = 0; i <= FL_HASH_MASK; i++) {
+ struct ip6_flowlabel *fl;
+ struct ip6_flowlabel __rcu **flp;
+
+ flp = &fl_ht[i];
+ while ((fl = rcu_dereference_protected(*flp,
+ lockdep_is_held(&ip6_fl_lock))) != NULL) {
+ if (net_eq(fl->fl_net, net) &&
+ atomic_read(&fl->users) == 0) {
+ *flp = fl->next;
+ fl_free(fl);
+ atomic_dec(&fl_size);
+ continue;
+ }
+ flp = &fl->next;
+ }
+ }
+ spin_unlock_bh(&ip6_fl_lock);
+}
+
+static struct ip6_flowlabel *fl_intern(struct net *net,
+ struct ip6_flowlabel *fl, __be32 label)
+{
+ struct ip6_flowlabel *lfl;
+
+ fl->label = label & IPV6_FLOWLABEL_MASK;
+
+ spin_lock_bh(&ip6_fl_lock);
+ if (label == 0) {
+ for (;;) {
+ fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK;
+ if (fl->label) {
+ lfl = __fl_lookup(net, fl->label);
+ if (!lfl)
+ break;
+ }
+ }
+ } else {
+ /*
+ * we dropper the ip6_fl_lock, so this entry could reappear
+ * and we need to recheck with it.
+ *
+ * OTOH no need to search the active socket first, like it is
+ * done in ipv6_flowlabel_opt - sock is locked, so new entry
+ * with the same label can only appear on another sock
+ */
+ lfl = __fl_lookup(net, fl->label);
+ if (lfl) {
+ atomic_inc(&lfl->users);
+ spin_unlock_bh(&ip6_fl_lock);
+ return lfl;
+ }
+ }
+
+ fl->lastuse = jiffies;
+ fl->next = fl_ht[FL_HASH(fl->label)];
+ rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
+ atomic_inc(&fl_size);
+ spin_unlock_bh(&ip6_fl_lock);
+ return NULL;
+}
+
+
+
+/* Socket flowlabel lists */
+
+struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
+{
+ struct ipv6_fl_socklist *sfl;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ label &= IPV6_FLOWLABEL_MASK;
+
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl) {
+ struct ip6_flowlabel *fl = sfl->fl;
+ if (fl->label == label) {
+ fl->lastuse = jiffies;
+ atomic_inc(&fl->users);
+ rcu_read_unlock_bh();
+ return fl;
+ }
+ }
+ rcu_read_unlock_bh();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+
+void fl6_free_socklist(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_fl_socklist *sfl;
+
+ if (!rcu_access_pointer(np->ipv6_fl_list))
+ return;
+
+ spin_lock_bh(&ip6_sk_fl_lock);
+ while ((sfl = rcu_dereference_protected(np->ipv6_fl_list,
+ lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
+ np->ipv6_fl_list = sfl->next;
+ spin_unlock_bh(&ip6_sk_fl_lock);
+
+ fl_release(sfl->fl);
+ kfree_rcu(sfl, rcu);
+
+ spin_lock_bh(&ip6_sk_fl_lock);
+ }
+ spin_unlock_bh(&ip6_sk_fl_lock);
+}
+
+/* Service routines */
+
+
+/*
+ It is the only difficult place. flowlabel enforces equal headers
+ before and including routing header, however user may supply options
+ following rthdr.
+ */
+
+struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
+ struct ip6_flowlabel *fl,
+ struct ipv6_txoptions *fopt)
+{
+ struct ipv6_txoptions *fl_opt = fl->opt;
+
+ if (!fopt || fopt->opt_flen == 0)
+ return fl_opt;
+
+ if (fl_opt) {
+ opt_space->hopopt = fl_opt->hopopt;
+ opt_space->dst0opt = fl_opt->dst0opt;
+ opt_space->srcrt = fl_opt->srcrt;
+ opt_space->opt_nflen = fl_opt->opt_nflen;
+ } else {
+ if (fopt->opt_nflen == 0)
+ return fopt;
+ opt_space->hopopt = NULL;
+ opt_space->dst0opt = NULL;
+ opt_space->srcrt = NULL;
+ opt_space->opt_nflen = 0;
+ }
+ opt_space->dst1opt = fopt->dst1opt;
+ opt_space->opt_flen = fopt->opt_flen;
+ return opt_space;
+}
+EXPORT_SYMBOL_GPL(fl6_merge_options);
+
+static unsigned long check_linger(unsigned long ttl)
+{
+ if (ttl < FL_MIN_LINGER)
+ return FL_MIN_LINGER*HZ;
+ if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN))
+ return 0;
+ return ttl*HZ;
+}
+
+static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires)
+{
+ linger = check_linger(linger);
+ if (!linger)
+ return -EPERM;
+ expires = check_linger(expires);
+ if (!expires)
+ return -EPERM;
+
+ spin_lock_bh(&ip6_fl_lock);
+ fl->lastuse = jiffies;
+ if (time_before(fl->linger, linger))
+ fl->linger = linger;
+ if (time_before(expires, fl->linger))
+ expires = fl->linger;
+ if (time_before(fl->expires, fl->lastuse + expires))
+ fl->expires = fl->lastuse + expires;
+ spin_unlock_bh(&ip6_fl_lock);
+
+ return 0;
+}
+
+static struct ip6_flowlabel *
+fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
+ char __user *optval, int optlen, int *err_p)
+{
+ struct ip6_flowlabel *fl = NULL;
+ int olen;
+ int addr_type;
+ int err;
+
+ olen = optlen - CMSG_ALIGN(sizeof(*freq));
+ err = -EINVAL;
+ if (olen > 64 * 1024)
+ goto done;
+
+ err = -ENOMEM;
+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+ if (!fl)
+ goto done;
+
+ if (olen > 0) {
+ struct msghdr msg;
+ struct flowi6 flowi6;
+ int junk;
+
+ err = -ENOMEM;
+ fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);
+ if (!fl->opt)
+ goto done;
+
+ memset(fl->opt, 0, sizeof(*fl->opt));
+ fl->opt->tot_len = sizeof(*fl->opt) + olen;
+ err = -EFAULT;
+ if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
+ goto done;
+
+ msg.msg_controllen = olen;
+ msg.msg_control = (void *)(fl->opt+1);
+ memset(&flowi6, 0, sizeof(flowi6));
+
+ err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt,
+ &junk, &junk, &junk);
+ if (err)
+ goto done;
+ err = -EINVAL;
+ if (fl->opt->opt_flen)
+ goto done;
+ if (fl->opt->opt_nflen == 0) {
+ kfree(fl->opt);
+ fl->opt = NULL;
+ }
+ }
+
+ fl->fl_net = net;
+ fl->expires = jiffies;
+ err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
+ if (err)
+ goto done;
+ fl->share = freq->flr_share;
+ addr_type = ipv6_addr_type(&freq->flr_dst);
+ if ((addr_type & IPV6_ADDR_MAPPED) ||
+ addr_type == IPV6_ADDR_ANY) {
+ err = -EINVAL;
+ goto done;
+ }
+ fl->dst = freq->flr_dst;
+ atomic_set(&fl->users, 1);
+ switch (fl->share) {
+ case IPV6_FL_S_EXCL:
+ case IPV6_FL_S_ANY:
+ break;
+ case IPV6_FL_S_PROCESS:
+ fl->owner.pid = get_task_pid(current, PIDTYPE_PID);
+ break;
+ case IPV6_FL_S_USER:
+ fl->owner.uid = current_euid();
+ break;
+ default:
+ err = -EINVAL;
+ goto done;
+ }
+ return fl;
+
+done:
+ fl_free(fl);
+ *err_p = err;
+ return NULL;
+}
+
+static int mem_check(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_fl_socklist *sfl;
+ int room = FL_MAX_SIZE - atomic_read(&fl_size);
+ int count = 0;
+
+ if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
+ return 0;
+
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl)
+ count++;
+ rcu_read_unlock_bh();
+
+ if (room <= 0 ||
+ ((count >= FL_MAX_PER_SOCK ||
+ (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
+ !capable(CAP_NET_ADMIN)))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl,
+ struct ip6_flowlabel *fl)
+{
+ spin_lock_bh(&ip6_sk_fl_lock);
+ sfl->fl = fl;
+ sfl->next = np->ipv6_fl_list;
+ rcu_assign_pointer(np->ipv6_fl_list, sfl);
+ spin_unlock_bh(&ip6_sk_fl_lock);
+}
+
+int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
+ int flags)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_fl_socklist *sfl;
+
+ if (flags & IPV6_FL_F_REMOTE) {
+ freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK;
+ return 0;
+ }
+
+ if (np->repflow) {
+ freq->flr_label = np->flow_label;
+ return 0;
+ }
+
+ rcu_read_lock_bh();
+
+ for_each_sk_fl_rcu(np, sfl) {
+ if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+ spin_lock_bh(&ip6_fl_lock);
+ freq->flr_label = sfl->fl->label;
+ freq->flr_dst = sfl->fl->dst;
+ freq->flr_share = sfl->fl->share;
+ freq->flr_expires = (sfl->fl->expires - jiffies) / HZ;
+ freq->flr_linger = sfl->fl->linger / HZ;
+
+ spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock_bh();
+ return 0;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return -ENOENT;
+}
+
+int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
+{
+ int uninitialized_var(err);
+ struct net *net = sock_net(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_flowlabel_req freq;
+ struct ipv6_fl_socklist *sfl1 = NULL;
+ struct ipv6_fl_socklist *sfl;
+ struct ipv6_fl_socklist __rcu **sflp;
+ struct ip6_flowlabel *fl, *fl1 = NULL;
+
+
+ if (optlen < sizeof(freq))
+ return -EINVAL;
+
+ if (copy_from_user(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ switch (freq.flr_action) {
+ case IPV6_FL_A_PUT:
+ if (freq.flr_flags & IPV6_FL_F_REFLECT) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ if (!np->repflow)
+ return -ESRCH;
+ np->flow_label = 0;
+ np->repflow = 0;
+ return 0;
+ }
+ spin_lock_bh(&ip6_sk_fl_lock);
+ for (sflp = &np->ipv6_fl_list;
+ (sfl = rcu_dereference(*sflp)) != NULL;
+ sflp = &sfl->next) {
+ if (sfl->fl->label == freq.flr_label) {
+ if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
+ np->flow_label &= ~IPV6_FLOWLABEL_MASK;
+ *sflp = rcu_dereference(sfl->next);
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ fl_release(sfl->fl);
+ kfree_rcu(sfl, rcu);
+ return 0;
+ }
+ }
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ return -ESRCH;
+
+ case IPV6_FL_A_RENEW:
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl) {
+ if (sfl->fl->label == freq.flr_label) {
+ err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
+ rcu_read_unlock_bh();
+ return err;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ if (freq.flr_share == IPV6_FL_S_NONE &&
+ ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ fl = fl_lookup(net, freq.flr_label);
+ if (fl) {
+ err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
+ fl_release(fl);
+ return err;
+ }
+ }
+ return -ESRCH;
+
+ case IPV6_FL_A_GET:
+ if (freq.flr_flags & IPV6_FL_F_REFLECT) {
+ struct net *net = sock_net(sk);
+ if (net->ipv6.sysctl.flowlabel_consistency) {
+ net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
+ return -EPERM;
+ }
+
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+
+ np->repflow = 1;
+ return 0;
+ }
+
+ if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
+ return -EINVAL;
+
+ fl = fl_create(net, sk, &freq, optval, optlen, &err);
+ if (!fl)
+ return err;
+ sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+
+ if (freq.flr_label) {
+ err = -EEXIST;
+ rcu_read_lock_bh();
+ for_each_sk_fl_rcu(np, sfl) {
+ if (sfl->fl->label == freq.flr_label) {
+ if (freq.flr_flags&IPV6_FL_F_EXCL) {
+ rcu_read_unlock_bh();
+ goto done;
+ }
+ fl1 = sfl->fl;
+ atomic_inc(&fl1->users);
+ break;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ if (!fl1)
+ fl1 = fl_lookup(net, freq.flr_label);
+ if (fl1) {
+recheck:
+ err = -EEXIST;
+ if (freq.flr_flags&IPV6_FL_F_EXCL)
+ goto release;
+ err = -EPERM;
+ if (fl1->share == IPV6_FL_S_EXCL ||
+ fl1->share != fl->share ||
+ ((fl1->share == IPV6_FL_S_PROCESS) &&
+ (fl1->owner.pid == fl->owner.pid)) ||
+ ((fl1->share == IPV6_FL_S_USER) &&
+ uid_eq(fl1->owner.uid, fl->owner.uid)))
+ goto release;
+
+ err = -ENOMEM;
+ if (!sfl1)
+ goto release;
+ if (fl->linger > fl1->linger)
+ fl1->linger = fl->linger;
+ if ((long)(fl->expires - fl1->expires) > 0)
+ fl1->expires = fl->expires;
+ fl_link(np, sfl1, fl1);
+ fl_free(fl);
+ return 0;
+
+release:
+ fl_release(fl1);
+ goto done;
+ }
+ }
+ err = -ENOENT;
+ if (!(freq.flr_flags&IPV6_FL_F_CREATE))
+ goto done;
+
+ err = -ENOMEM;
+ if (!sfl1)
+ goto done;
+
+ err = mem_check(sk);
+ if (err != 0)
+ goto done;
+
+ fl1 = fl_intern(net, fl, freq.flr_label);
+ if (fl1)
+ goto recheck;
+
+ if (!freq.flr_label) {
+ if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
+ &fl->label, sizeof(fl->label))) {
+ /* Intentionally ignore fault. */
+ }
+ }
+
+ fl_link(np, sfl1, fl);
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+
+done:
+ fl_free(fl);
+ kfree(sfl1);
+ return err;
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct ip6fl_iter_state {
+ struct seq_net_private p;
+ struct pid_namespace *pid_ns;
+ int bucket;
+};
+
+#define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private)
+
+static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq)
+{
+ struct ip6_flowlabel *fl = NULL;
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+
+ for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) {
+ for_each_fl_rcu(state->bucket, fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
+ }
+ }
+ fl = NULL;
+out:
+ return fl;
+}
+
+static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl)
+{
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+
+ for_each_fl_continue_rcu(fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
+ }
+
+try_again:
+ if (++state->bucket <= FL_HASH_MASK) {
+ for_each_fl_rcu(state->bucket, fl) {
+ if (net_eq(fl->fl_net, net))
+ goto out;
+ }
+ goto try_again;
+ }
+ fl = NULL;
+
+out:
+ return fl;
+}
+
+static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip6_flowlabel *fl = ip6fl_get_first(seq);
+ if (fl)
+ while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL)
+ --pos;
+ return pos ? NULL : fl;
+}
+
+static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock_bh();
+ return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip6_flowlabel *fl;
+
+ if (v == SEQ_START_TOKEN)
+ fl = ip6fl_get_first(seq);
+ else
+ fl = ip6fl_get_next(seq, v);
+ ++*pos;
+ return fl;
+}
+
+static void ip6fl_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock_bh();
+}
+
+static int ip6fl_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n");
+ } else {
+ struct ip6_flowlabel *fl = v;
+ seq_printf(seq,
+ "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n",
+ (unsigned int)ntohl(fl->label),
+ fl->share,
+ ((fl->share == IPV6_FL_S_PROCESS) ?
+ pid_nr_ns(fl->owner.pid, state->pid_ns) :
+ ((fl->share == IPV6_FL_S_USER) ?
+ from_kuid_munged(seq_user_ns(seq), fl->owner.uid) :
+ 0)),
+ atomic_read(&fl->users),
+ fl->linger/HZ,
+ (long)(fl->expires - jiffies)/HZ,
+ &fl->dst,
+ fl->opt ? fl->opt->opt_nflen : 0);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip6fl_seq_ops = {
+ .start = ip6fl_seq_start,
+ .next = ip6fl_seq_next,
+ .stop = ip6fl_seq_stop,
+ .show = ip6fl_seq_show,
+};
+
+static int ip6fl_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct ip6fl_iter_state *state;
+ int err;
+
+ err = seq_open_net(inode, file, &ip6fl_seq_ops,
+ sizeof(struct ip6fl_iter_state));
+
+ if (!err) {
+ seq = file->private_data;
+ state = ip6fl_seq_private(seq);
+ rcu_read_lock();
+ state->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ rcu_read_unlock();
+ }
+ return err;
+}
+
+static int ip6fl_seq_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
+ put_pid_ns(state->pid_ns);
+ return seq_release_net(inode, file);
+}
+
+static const struct file_operations ip6fl_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ip6fl_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ip6fl_seq_release,
+};
+
+static int __net_init ip6_flowlabel_proc_init(struct net *net)
+{
+ if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net,
+ &ip6fl_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit ip6_flowlabel_proc_fini(struct net *net)
+{
+ remove_proc_entry("ip6_flowlabel", net->proc_net);
+}
+#else
+static inline int ip6_flowlabel_proc_init(struct net *net)
+{
+ return 0;
+}
+static inline void ip6_flowlabel_proc_fini(struct net *net)
+{
+}
+#endif
+
+static void __net_exit ip6_flowlabel_net_exit(struct net *net)
+{
+ ip6_fl_purge(net);
+ ip6_flowlabel_proc_fini(net);
+}
+
+static struct pernet_operations ip6_flowlabel_net_ops = {
+ .init = ip6_flowlabel_proc_init,
+ .exit = ip6_flowlabel_net_exit,
+};
+
+int ip6_flowlabel_init(void)
+{
+ return register_pernet_subsys(&ip6_flowlabel_net_ops);
+}
+
+void ip6_flowlabel_cleanup(void)
+{
+ del_timer(&ip6_fl_gc_timer);
+ unregister_pernet_subsys(&ip6_flowlabel_net_ops);
+}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
new file mode 100644
index 000000000..a38d3ac0f
--- /dev/null
+++ b/net/ipv6/ip6_gre.c
@@ -0,0 +1,1726 @@
+/*
+ * GRE over IPv6 protocol decoder.
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/hash.h>
+#include <linux/if_tunnel.h>
+#include <linux/ip6_tunnel.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/addrconf.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
+
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+#define HASH_SIZE_SHIFT 5
+#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+
+static int ip6gre_net_id __read_mostly;
+struct ip6gre_net {
+ struct ip6_tnl __rcu *tunnels[4][HASH_SIZE];
+
+ struct net_device *fb_tunnel_dev;
+};
+
+static struct rtnl_link_ops ip6gre_link_ops __read_mostly;
+static struct rtnl_link_ops ip6gre_tap_ops __read_mostly;
+static int ip6gre_tunnel_init(struct net_device *dev);
+static void ip6gre_tunnel_setup(struct net_device *dev);
+static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t);
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu);
+
+/* Tunnel hash table */
+
+/*
+ 4 hash tables:
+
+ 3: (remote,local)
+ 2: (remote,*)
+ 1: (*,local)
+ 0: (*,*)
+
+ We require exact key match i.e. if a key is present in packet
+ it will match only tunnel with the same key; if it is not present,
+ it will match only keyless tunnel.
+
+ All keysless packets, if not matched configured keyless tunnels
+ will match fallback tunnel.
+ */
+
+#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1))
+static u32 HASH_ADDR(const struct in6_addr *addr)
+{
+ u32 hash = ipv6_addr_hash(addr);
+
+ return hash_32(hash, HASH_SIZE_SHIFT);
+}
+
+#define tunnels_r_l tunnels[3]
+#define tunnels_r tunnels[2]
+#define tunnels_l tunnels[1]
+#define tunnels_wc tunnels[0]
+
+/* Given src, dst and key, find appropriate for input tunnel. */
+
+static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
+ const struct in6_addr *remote, const struct in6_addr *local,
+ __be32 key, __be16 gre_proto)
+{
+ struct net *net = dev_net(dev);
+ int link = dev->ifindex;
+ unsigned int h0 = HASH_ADDR(remote);
+ unsigned int h1 = HASH_KEY(key);
+ struct ip6_tnl *t, *cand = NULL;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+ ARPHRD_ETHER : ARPHRD_IP6GRE;
+ int score, cand_score = 4;
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
+ }
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
+ }
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
+ if ((!ipv6_addr_equal(local, &t->parms.laddr) &&
+ (!ipv6_addr_equal(local, &t->parms.raddr) ||
+ !ipv6_addr_is_multicast(local))) ||
+ key != t->parms.i_key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
+ }
+ }
+
+ for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
+ if (t->parms.i_key != key ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ continue;
+
+ score = 0;
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0)
+ return t;
+
+ if (score < cand_score) {
+ cand = t;
+ cand_score = score;
+ }
+ }
+
+ if (cand)
+ return cand;
+
+ dev = ign->fb_tunnel_dev;
+ if (dev->flags & IFF_UP)
+ return netdev_priv(dev);
+
+ return NULL;
+}
+
+static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign,
+ const struct __ip6_tnl_parm *p)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = HASH_KEY(p->i_key);
+ int prio = 0;
+
+ if (!ipv6_addr_any(local))
+ prio |= 1;
+ if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) {
+ prio |= 2;
+ h ^= HASH_ADDR(remote);
+ }
+
+ return &ign->tunnels[prio][h];
+}
+
+static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
+ const struct ip6_tnl *t)
+{
+ return __ip6gre_bucket(ign, &t->parms);
+}
+
+static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t);
+
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = ip6gre_bucket(ign, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static struct ip6_tnl *ip6gre_tunnel_find(struct net *net,
+ const struct __ip6_tnl_parm *parms,
+ int type)
+{
+ const struct in6_addr *remote = &parms->raddr;
+ const struct in6_addr *local = &parms->laddr;
+ __be32 key = parms->i_key;
+ int link = parms->link;
+ struct ip6_tnl *t;
+ struct ip6_tnl __rcu **tp;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ for (tp = __ip6gre_bucket(ign, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next)
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ key == t->parms.i_key &&
+ link == t->parms.link &&
+ type == t->dev->type)
+ break;
+
+ return t;
+}
+
+static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
+ const struct __ip6_tnl_parm *parms, int create)
+{
+ struct ip6_tnl *t, *nt;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE);
+ if (t && create)
+ return NULL;
+ if (t || !create)
+ return t;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else
+ strcpy(name, "ip6gre%d");
+
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ip6gre_tunnel_setup);
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ nt = netdev_priv(dev);
+ nt->parms = *parms;
+ dev->rtnl_link_ops = &ip6gre_link_ops;
+
+ nt->dev = dev;
+ nt->net = dev_net(dev);
+ ip6gre_tnl_link_config(nt, 1);
+
+ if (register_netdevice(dev) < 0)
+ goto failed_free;
+
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
+ dev_hold(dev);
+ ip6gre_tunnel_link(ign, nt);
+ return nt;
+
+failed_free:
+ free_netdev(dev);
+ return NULL;
+}
+
+static void ip6gre_tunnel_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
+
+ ip6gre_tunnel_unlink(ign, t);
+ dev_put(dev);
+}
+
+
+static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
+ __be16 *p = (__be16 *)(skb->data + offset);
+ int grehlen = offset + 4;
+ struct ip6_tnl *t;
+ __be16 flags;
+
+ flags = p[0];
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ return;
+ if (flags&GRE_KEY) {
+ grehlen += 4;
+ if (flags&GRE_CSUM)
+ grehlen += 4;
+ }
+ }
+
+ /* If only 8 bytes returned, keyed message will be dropped here */
+ if (!pskb_may_pull(skb, grehlen))
+ return;
+ ipv6h = (const struct ipv6hdr *)skb->data;
+ p = (__be16 *)(skb->data + offset);
+
+ t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
+ flags & GRE_KEY ?
+ *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
+ p[1]);
+ if (!t)
+ return;
+
+ switch (type) {
+ __u32 teli;
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 mtu;
+ case ICMPV6_DEST_UNREACH:
+ net_warn_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
+ break;
+ case ICMPV6_TIME_EXCEED:
+ if (code == ICMPV6_EXC_HOPLIMIT) {
+ net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
+ }
+ break;
+ case ICMPV6_PARAMPROB:
+ teli = 0;
+ if (code == ICMPV6_HDR_FIELD)
+ teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
+
+ if (teli && teli == be32_to_cpu(info) - 2) {
+ tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
+ if (tel->encap_limit == 0) {
+ net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
+ }
+ } else {
+ net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
+ }
+ break;
+ case ICMPV6_PKT_TOOBIG:
+ mtu = be32_to_cpu(info) - offset;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ t->dev->mtu = mtu;
+ break;
+ }
+
+ if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+}
+
+static int ip6gre_rcv(struct sk_buff *skb)
+{
+ const struct ipv6hdr *ipv6h;
+ u8 *h;
+ __be16 flags;
+ __sum16 csum = 0;
+ __be32 key = 0;
+ u32 seqno = 0;
+ struct ip6_tnl *tunnel;
+ int offset = 4;
+ __be16 gre_proto;
+ int err;
+
+ if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
+ goto drop;
+
+ ipv6h = ipv6_hdr(skb);
+ h = skb->data;
+ flags = *(__be16 *)h;
+
+ if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+ /* - Version must be 0.
+ - We do not support routing headers.
+ */
+ if (flags&(GRE_VERSION|GRE_ROUTING))
+ goto drop;
+
+ if (flags&GRE_CSUM) {
+ csum = skb_checksum_simple_validate(skb);
+ offset += 4;
+ }
+ if (flags&GRE_KEY) {
+ key = *(__be32 *)(h + offset);
+ offset += 4;
+ }
+ if (flags&GRE_SEQ) {
+ seqno = ntohl(*(__be32 *)(h + offset));
+ offset += 4;
+ }
+ }
+
+ gre_proto = *(__be16 *)(h + 2);
+
+ tunnel = ip6gre_tunnel_lookup(skb->dev,
+ &ipv6h->saddr, &ipv6h->daddr, key,
+ gre_proto);
+ if (tunnel) {
+ struct pcpu_sw_netstats *tstats;
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) {
+ tunnel->dev->stats.rx_dropped++;
+ goto drop;
+ }
+
+ skb->protocol = gre_proto;
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IPv6
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
+ skb->protocol = htons(ETH_P_IPV6);
+ if ((*(h + offset) & 0xF0) != 0x40)
+ offset += 4;
+ }
+
+ skb->mac_header = skb->network_header;
+ __pskb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
+
+ if (((flags&GRE_CSUM) && csum) ||
+ (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ if (tunnel->parms.i_flags&GRE_SEQ) {
+ if (!(flags&GRE_SEQ) ||
+ (tunnel->i_seqno &&
+ (s32)(seqno - tunnel->i_seqno) < 0)) {
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+ tunnel->i_seqno = seqno + 1;
+ }
+
+ /* Warning: All skb pointers will be invalidated! */
+ if (tunnel->dev->type == ARPHRD_ETHER) {
+ if (!pskb_may_pull(skb, ETH_HLEN)) {
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
+ goto drop;
+ }
+
+ ipv6h = ipv6_hdr(skb);
+ skb->protocol = eth_type_trans(skb, tunnel->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
+
+ __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+
+ skb_reset_network_header(skb);
+
+ err = IP6_ECN_decapsulate(ipv6h, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
+ &ipv6h->saddr,
+ ipv6_get_dsfield(ipv6h));
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ netif_rx(skb);
+
+ return 0;
+ }
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+struct ipv6_tel_txoption {
+ struct ipv6_txoptions ops;
+ __u8 dst_opt[8];
+};
+
+static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
+{
+ memset(opt, 0, sizeof(struct ipv6_tel_txoption));
+
+ opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
+ opt->dst_opt[3] = 1;
+ opt->dst_opt[4] = encap_limit;
+ opt->dst_opt[5] = IPV6_TLV_PADN;
+ opt->dst_opt[6] = 1;
+
+ opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt;
+ opt->ops.opt_nflen = 8;
+}
+
+static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
+ struct net_device *dev,
+ __u8 dsfield,
+ struct flowi6 *fl6,
+ int encap_limit,
+ __u32 *pmtu)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct net_device *tdev; /* Device to other host */
+ struct ipv6hdr *ipv6h; /* Our new IP header */
+ unsigned int max_headroom = 0; /* The extra header space needed */
+ int gre_hlen;
+ struct ipv6_tel_txoption opt;
+ int mtu;
+ struct dst_entry *dst = NULL, *ndst = NULL;
+ struct net_device_stats *stats = &tunnel->dev->stats;
+ int err = -1;
+ u8 proto;
+ struct sk_buff *new_skb;
+ __be16 protocol;
+
+ if (dev->type == ARPHRD_ETHER)
+ IPCB(skb)->flags = 0;
+
+ if (dev->header_ops && dev->type == ARPHRD_IP6GRE) {
+ gre_hlen = 0;
+ ipv6h = (struct ipv6hdr *)skb->data;
+ fl6->daddr = ipv6h->daddr;
+ } else {
+ gre_hlen = tunnel->hlen;
+ fl6->daddr = tunnel->parms.raddr;
+ }
+
+ if (!fl6->flowi6_mark)
+ dst = ip6_tnl_dst_check(tunnel);
+
+ if (!dst) {
+ ndst = ip6_route_output(net, NULL, fl6);
+
+ if (ndst->error)
+ goto tx_err_link_failure;
+ ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(ndst)) {
+ err = PTR_ERR(ndst);
+ ndst = NULL;
+ goto tx_err_link_failure;
+ }
+ dst = ndst;
+ }
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ tunnel->parms.name);
+ goto tx_err_dst_release;
+ }
+
+ mtu = dst_mtu(dst) - sizeof(*ipv6h);
+ if (encap_limit >= 0) {
+ max_headroom += 8;
+ mtu -= 8;
+ }
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+ if (skb->len > mtu) {
+ *pmtu = mtu;
+ err = -EMSGSIZE;
+ goto tx_err_dst_release;
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
+
+ max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len;
+
+ if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+ (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+ new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
+ if (!new_skb)
+ goto tx_err_dst_release;
+
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ consume_skb(skb);
+ skb = new_skb;
+ }
+
+ if (fl6->flowi6_mark) {
+ skb_dst_set(skb, dst);
+ ndst = NULL;
+ } else {
+ skb_dst_set_noref(skb, dst);
+ }
+
+ proto = NEXTHDR_GRE;
+ if (encap_limit >= 0) {
+ init_tel_txopt(&opt, encap_limit);
+ ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+ }
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb_push(skb, gre_hlen);
+ skb_reset_network_header(skb);
+ skb_set_transport_header(skb, sizeof(*ipv6h));
+
+ /*
+ * Push down and install the IP header.
+ */
+ ipv6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ipv6h->hop_limit = tunnel->parms.hop_limit;
+ ipv6h->nexthdr = proto;
+ ipv6h->saddr = fl6->saddr;
+ ipv6h->daddr = fl6->daddr;
+
+ ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags;
+ protocol = (dev->type == ARPHRD_ETHER) ?
+ htons(ETH_P_TEB) : skb->protocol;
+ ((__be16 *)(ipv6h + 1))[1] = protocol;
+
+ if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4);
+
+ if (tunnel->parms.o_flags&GRE_SEQ) {
+ ++tunnel->o_seqno;
+ *ptr = htonl(tunnel->o_seqno);
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_KEY) {
+ *ptr = tunnel->parms.o_key;
+ ptr--;
+ }
+ if (tunnel->parms.o_flags&GRE_CSUM) {
+ *ptr = 0;
+ *(__sum16 *)ptr = ip_compute_csum((void *)(ipv6h+1),
+ skb->len - sizeof(struct ipv6hdr));
+ }
+ }
+
+ skb_set_inner_protocol(skb, protocol);
+
+ ip6tunnel_xmit(NULL, skb, dev);
+ if (ndst)
+ ip6_tnl_dst_store(tunnel, ndst);
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(ndst);
+ return err;
+}
+
+static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ const struct iphdr *iph = ip_hdr(skb);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u8 dsfield;
+ __u32 mtu;
+ int err;
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_GRE;
+
+ dsfield = ipv4_get_dsfield(iph);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+ & IPV6_TCLASS_MASK;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+
+ err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ int encap_limit = -1;
+ __u16 offset;
+ struct flowi6 fl6;
+ __u8 dsfield;
+ __u32 mtu;
+ int err;
+
+ if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+ return -1;
+
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_GRE;
+
+ dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+
+ err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (err != 0) {
+ if (err == -EMSGSIZE)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * @t: the outgoing tunnel device
+ * @hdr: IPv6 header from the incoming packet
+ *
+ * Description:
+ * Avoid trivial tunneling loop by checking that tunnel exit-point
+ * doesn't match source of incoming packet.
+ *
+ * Return:
+ * 1 if conflict,
+ * 0 else
+ **/
+
+static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t,
+ const struct ipv6hdr *hdr)
+{
+ return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
+}
+
+static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u32 mtu;
+ int err;
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = skb->protocol;
+
+ err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu);
+
+ return err;
+}
+
+static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net_device_stats *stats = &t->dev->stats;
+ int ret;
+
+ if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
+ goto tx_err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ret = ip6gre_xmit_ipv4(skb, dev);
+ break;
+ case htons(ETH_P_IPV6):
+ ret = ip6gre_xmit_ipv6(skb, dev);
+ break;
+ default:
+ ret = ip6gre_xmit_other(skb, dev);
+ break;
+ }
+
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
+{
+ struct net_device *dev = t->dev;
+ struct __ip6_tnl_parm *p = &t->parms;
+ struct flowi6 *fl6 = &t->fl.u.ip6;
+ int addend = sizeof(struct ipv6hdr) + 4;
+
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+ }
+
+ /* Set up flowi template */
+ fl6->saddr = p->laddr;
+ fl6->daddr = p->raddr;
+ fl6->flowi6_oif = p->link;
+ fl6->flowlabel = 0;
+
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
+ fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
+ fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
+
+ if (p->flags&IP6_TNL_F_CAP_XMIT &&
+ p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ else
+ dev->flags &= ~IFF_POINTOPOINT;
+
+ /* Precalculate GRE options length */
+ if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+ if (t->parms.o_flags&GRE_CSUM)
+ addend += 4;
+ if (t->parms.o_flags&GRE_KEY)
+ addend += 4;
+ if (t->parms.o_flags&GRE_SEQ)
+ addend += 4;
+ }
+ t->hlen = addend;
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT) {
+ int strict = (ipv6_addr_type(&p->raddr) &
+ (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
+
+ struct rt6_info *rt = rt6_lookup(t->net,
+ &p->raddr, &p->laddr,
+ p->link, strict);
+
+ if (!rt)
+ return;
+
+ if (rt->dst.dev) {
+ dev->hard_header_len = rt->dst.dev->hard_header_len + addend;
+
+ if (set_mtu) {
+ dev->mtu = rt->dst.dev->mtu - addend;
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
+ if (dev->mtu < IPV6_MIN_MTU)
+ dev->mtu = IPV6_MIN_MTU;
+ }
+ }
+ ip6_rt_put(rt);
+ }
+}
+
+static int ip6gre_tnl_change(struct ip6_tnl *t,
+ const struct __ip6_tnl_parm *p, int set_mtu)
+{
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
+ t->parms.flags = p->flags;
+ t->parms.hop_limit = p->hop_limit;
+ t->parms.encap_limit = p->encap_limit;
+ t->parms.flowinfo = p->flowinfo;
+ t->parms.link = p->link;
+ t->parms.proto = p->proto;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ t->parms.i_flags = p->i_flags;
+ t->parms.o_flags = p->o_flags;
+ ip6_tnl_dst_reset(t);
+ ip6gre_tnl_link_config(t, set_mtu);
+ return 0;
+}
+
+static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p,
+ const struct ip6_tnl_parm2 *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->flags = u->flags;
+ p->hop_limit = u->hop_limit;
+ p->encap_limit = u->encap_limit;
+ p->flowinfo = u->flowinfo;
+ p->link = u->link;
+ p->i_key = u->i_key;
+ p->o_key = u->o_key;
+ p->i_flags = u->i_flags;
+ p->o_flags = u->o_flags;
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
+ const struct __ip6_tnl_parm *p)
+{
+ u->proto = IPPROTO_GRE;
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->flags = p->flags;
+ u->hop_limit = p->hop_limit;
+ u->encap_limit = p->encap_limit;
+ u->flowinfo = p->flowinfo;
+ u->link = p->link;
+ u->i_key = p->i_key;
+ u->o_key = p->o_key;
+ u->i_flags = p->i_flags;
+ u->o_flags = p->o_flags;
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
+static int ip6gre_tunnel_ioctl(struct net_device *dev,
+ struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip6_tnl_parm2 p;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == ign->fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+ memset(&p, 0, sizeof(p));
+ ip6gre_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))
+ goto done;
+
+ if (!(p.i_flags&GRE_KEY))
+ p.i_key = 0;
+ if (!(p.o_flags&GRE_KEY))
+ p.o_key = 0;
+
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL);
+
+ if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ t = netdev_priv(dev);
+
+ ip6gre_tunnel_unlink(ign, t);
+ synchronize_net();
+ ip6gre_tnl_change(t, &p1, 1);
+ ip6gre_tunnel_link(ign, t);
+ netdev_state_change(dev);
+ }
+ }
+
+ if (t) {
+ err = 0;
+
+ memset(&p, 0, sizeof(p));
+ ip6gre_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == ign->fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ ip6gre_tnl_parm_from_user(&p1, &p);
+ t = ip6gre_tunnel_locate(net, &p1, 0);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(ign->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen);
+ __be16 *p = (__be16 *)(ipv6h+1);
+
+ ip6_flow_hdr(ipv6h, 0,
+ ip6_make_flowlabel(dev_net(dev), skb,
+ t->fl.u.ip6.flowlabel, false));
+ ipv6h->hop_limit = t->parms.hop_limit;
+ ipv6h->nexthdr = NEXTHDR_GRE;
+ ipv6h->saddr = t->parms.laddr;
+ ipv6h->daddr = t->parms.raddr;
+
+ p[0] = t->parms.o_flags;
+ p[1] = htons(type);
+
+ /*
+ * Set the source hardware address.
+ */
+
+ if (saddr)
+ memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr));
+ if (daddr)
+ memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr));
+ if (!ipv6_addr_any(&ipv6h->daddr))
+ return t->hlen;
+
+ return -t->hlen;
+}
+
+static const struct header_ops ip6gre_header_ops = {
+ .create = ip6gre_header,
+};
+
+static const struct net_device_ops ip6gre_netdev_ops = {
+ .ndo_init = ip6gre_tunnel_init,
+ .ndo_uninit = ip6gre_tunnel_uninit,
+ .ndo_start_xmit = ip6gre_tunnel_xmit,
+ .ndo_do_ioctl = ip6gre_tunnel_ioctl,
+ .ndo_change_mtu = ip6gre_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+static void ip6gre_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+static void ip6gre_tunnel_setup(struct net_device *dev)
+{
+ struct ip6_tnl *t;
+
+ dev->netdev_ops = &ip6gre_netdev_ops;
+ dev->destructor = ip6gre_dev_free;
+
+ dev->type = ARPHRD_IP6GRE;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4;
+ dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4;
+ t = netdev_priv(dev);
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+ dev->flags |= IFF_NOARP;
+ dev->addr_len = sizeof(struct in6_addr);
+ netif_keep_dst(dev);
+}
+
+static int ip6gre_tunnel_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+
+ tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+
+ memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
+
+ if (ipv6_addr_any(&tunnel->parms.raddr))
+ dev->header_ops = &ip6gre_header_ops;
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ip6gre_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+
+ tunnel->hlen = sizeof(struct ipv6hdr) + 4;
+
+ dev_hold(dev);
+}
+
+
+static struct inet6_protocol ip6gre_protocol __read_mostly = {
+ .handler = ip6gre_rcv,
+ .err_handler = ip6gre_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
+{
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct net_device *dev, *aux;
+ int prio;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &ip6gre_link_ops ||
+ dev->rtnl_link_ops == &ip6gre_tap_ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (prio = 0; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip6_tnl *t;
+
+ t = rtnl_dereference(ign->tunnels[prio][h]);
+
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev,
+ head);
+ t = rtnl_dereference(t->next);
+ }
+ }
+ }
+}
+
+static int __net_init ip6gre_init_net(struct net *net)
+{
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ int err;
+
+ ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
+ NET_NAME_UNKNOWN,
+ ip6gre_tunnel_setup);
+ if (!ign->fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err_alloc_dev;
+ }
+ dev_net_set(ign->fb_tunnel_dev, net);
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+
+
+ ip6gre_fb_tunnel_init(ign->fb_tunnel_dev);
+ ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops;
+
+ err = register_netdev(ign->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+
+ rcu_assign_pointer(ign->tunnels_wc[0],
+ netdev_priv(ign->fb_tunnel_dev));
+ return 0;
+
+err_reg_dev:
+ ip6gre_dev_free(ign->fb_tunnel_dev);
+err_alloc_dev:
+ return err;
+}
+
+static void __net_exit ip6gre_exit_net(struct net *net)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip6gre_destroy_tunnels(net, &list);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations ip6gre_net_ops = {
+ .init = ip6gre_init_net,
+ .exit = ip6gre_exit_net,
+ .id = &ip6gre_net_id,
+ .size = sizeof(struct ip6gre_net),
+};
+
+static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ struct in6_addr daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ goto out;
+
+ if (data[IFLA_GRE_REMOTE]) {
+ daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]);
+ if (ipv6_addr_any(&daddr))
+ return -EINVAL;
+ }
+
+out:
+ return ip6gre_tunnel_validate(tb, data);
+}
+
+
+static void ip6gre_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_GRE_LINK])
+ parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+ if (data[IFLA_GRE_IFLAGS])
+ parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+
+ if (data[IFLA_GRE_OFLAGS])
+ parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+
+ if (data[IFLA_GRE_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+ if (data[IFLA_GRE_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+ if (data[IFLA_GRE_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]);
+
+ if (data[IFLA_GRE_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]);
+
+ if (data[IFLA_GRE_TTL])
+ parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]);
+
+ if (data[IFLA_GRE_ENCAP_LIMIT])
+ parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
+
+ if (data[IFLA_GRE_FLOWINFO])
+ parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]);
+
+ if (data[IFLA_GRE_FLAGS])
+ parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
+}
+
+static int ip6gre_tap_init(struct net_device *dev)
+{
+ struct ip6_tnl *tunnel;
+
+ tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+
+ ip6gre_tnl_link_config(tunnel, 1);
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static const struct net_device_ops ip6gre_tap_netdev_ops = {
+ .ndo_init = ip6gre_tap_init,
+ .ndo_uninit = ip6gre_tunnel_uninit,
+ .ndo_start_xmit = ip6gre_tunnel_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip6gre_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+static void ip6gre_tap_setup(struct net_device *dev)
+{
+
+ ether_setup(dev);
+
+ dev->netdev_ops = &ip6gre_tap_netdev_ops;
+ dev->destructor = ip6gre_dev_free;
+
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct ip6_tnl *nt;
+ struct net *net = dev_net(dev);
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ int err;
+
+ nt = netdev_priv(dev);
+ ip6gre_netlink_parms(data, &nt->parms);
+
+ if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
+ return -EEXIST;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ nt->dev = dev;
+ nt->net = dev_net(dev);
+ ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ dev_hold(dev);
+ ip6gre_tunnel_link(ign, nt);
+
+out:
+ return err;
+}
+
+static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip6_tnl *t, *nt = netdev_priv(dev);
+ struct net *net = nt->net;
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct __ip6_tnl_parm p;
+
+ if (dev == ign->fb_tunnel_dev)
+ return -EINVAL;
+
+ ip6gre_netlink_parms(data, &p);
+
+ t = ip6gre_tunnel_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else {
+ t = nt;
+
+ ip6gre_tunnel_unlink(ign, t);
+ ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link(ign, t);
+ netdev_state_change(dev);
+ }
+
+ return 0;
+}
+
+static void ip6gre_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+
+ if (dev != ign->fb_tunnel_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t ip6gre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_TOS */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_LIMIT */
+ nla_total_size(1) +
+ /* IFLA_GRE_FLOWINFO */
+ nla_total_size(4) +
+ /* IFLA_GRE_FLAGS */
+ nla_total_size(4) +
+ 0;
+}
+
+static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct __ip6_tnl_parm *p = &t->parms;
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
+ nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) ||
+ /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/
+ nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
+ nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
+ nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
+ [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
+ [IFLA_GRE_FLAGS] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
+ .kind = "ip6gre",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6gre_tunnel_setup,
+ .validate = ip6gre_tunnel_validate,
+ .newlink = ip6gre_newlink,
+ .changelink = ip6gre_changelink,
+ .dellink = ip6gre_dellink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
+ .kind = "ip6gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ip6gre_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6gre_tap_setup,
+ .validate = ip6gre_tap_validate,
+ .newlink = ip6gre_newlink,
+ .changelink = ip6gre_changelink,
+ .get_size = ip6gre_get_size,
+ .fill_info = ip6gre_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+static int __init ip6gre_init(void)
+{
+ int err;
+
+ pr_info("GRE over IPv6 tunneling driver\n");
+
+ err = register_pernet_device(&ip6gre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
+ }
+
+ err = rtnl_link_register(&ip6gre_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ err = rtnl_link_register(&ip6gre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+out:
+ return err;
+
+tap_ops_failed:
+ rtnl_link_unregister(&ip6gre_link_ops);
+rtnl_link_failed:
+ inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
+add_proto_failed:
+ unregister_pernet_device(&ip6gre_net_ops);
+ goto out;
+}
+
+static void __exit ip6gre_fini(void)
+{
+ rtnl_link_unregister(&ip6gre_tap_ops);
+ rtnl_link_unregister(&ip6gre_link_ops);
+ inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
+ unregister_pernet_device(&ip6gre_net_ops);
+}
+
+module_init(ip6gre_init);
+module_exit(ip6gre_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
+MODULE_ALIAS_RTNL_LINK("ip6gre");
+MODULE_ALIAS_RTNL_LINK("ip6gretap");
+MODULE_ALIAS_NETDEV("ip6gre0");
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
new file mode 100644
index 000000000..14dacc544
--- /dev/null
+++ b/net/ipv6/ip6_icmp.c
@@ -0,0 +1,47 @@
+#include <linux/export.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+
+#include <net/ipv6.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+static ip6_icmp_send_t __rcu *ip6_icmp_send;
+
+int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
+{
+ return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ?
+ 0 : -EBUSY;
+}
+EXPORT_SYMBOL(inet6_register_icmp_sender);
+
+int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
+{
+ int ret;
+
+ ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ?
+ 0 : -EINVAL;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet6_unregister_icmp_sender);
+
+void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+{
+ ip6_icmp_send_t *send;
+
+ rcu_read_lock();
+ send = rcu_dereference(ip6_icmp_send);
+
+ if (!send)
+ goto out;
+ send(skb, type, code, info);
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(icmpv6_send);
+#endif
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
new file mode 100644
index 000000000..f2e464eba
--- /dev/null
+++ b/net/ipv6/ip6_input.c
@@ -0,0 +1,363 @@
+/*
+ * IPv6 input
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Ian P. Morris <I.P.Morris@soton.ac.uk>
+ *
+ * Based in linux/net/ipv4/ip_input.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/* Changes
+ *
+ * Mitsuru KANDA @USAGI and
+ * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/inet_ecn.h>
+
+int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb)
+{
+ if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+ const struct inet6_protocol *ipprot;
+
+ ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
+ if (ipprot && ipprot->early_demux)
+ ipprot->early_demux(skb);
+ }
+ if (!skb_dst(skb))
+ ip6_route_input(skb);
+
+ return dst_input(skb);
+}
+
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ const struct ipv6hdr *hdr;
+ u32 pkt_len;
+ struct inet6_dev *idev;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ rcu_read_lock();
+
+ idev = __in6_dev_get(skb->dev);
+
+ IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len);
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
+ !idev || unlikely(idev->cnf.disable_ipv6)) {
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+
+ /*
+ * Store incoming device index. When the packet will
+ * be queued, we cannot refer to skb->dev anymore.
+ *
+ * BTW, when we send a packet for our own local address on a
+ * non-loopback interface (e.g. ethX), it is being delivered
+ * via the loopback interface (lo) here; skb->dev = loopback_dev.
+ * It, however, should be considered as if it is being
+ * arrived via the sending interface (ethX), because of the
+ * nature of scoping architecture. --yoshfuji
+ */
+ IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
+ goto err;
+
+ hdr = ipv6_hdr(skb);
+
+ if (hdr->version != 6)
+ goto err;
+
+ IP6_ADD_STATS_BH(dev_net(dev), idev,
+ IPSTATS_MIB_NOECTPKTS +
+ (ipv6_get_dsfield(hdr) & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+ /*
+ * RFC4291 2.5.3
+ * A packet received on an interface with a destination address
+ * of loopback must be dropped.
+ */
+ if (!(dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_loopback(&hdr->daddr))
+ goto err;
+
+ /* RFC4291 Errata ID: 3480
+ * Interface-Local scope spans only a single interface on a
+ * node and is useful only for loopback transmission of
+ * multicast. Packets with interface-local scope received
+ * from another node must be discarded.
+ */
+ if (!(skb->pkt_type == PACKET_LOOPBACK ||
+ dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_is_multicast(&hdr->daddr) &&
+ IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
+ goto err;
+
+ /* RFC4291 2.7
+ * Nodes must not originate a packet to a multicast address whose scope
+ * field contains the reserved value 0; if such a packet is received, it
+ * must be silently dropped.
+ */
+ if (ipv6_addr_is_multicast(&hdr->daddr) &&
+ IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0)
+ goto err;
+
+ /*
+ * RFC4291 2.7
+ * Multicast addresses must not be used as source addresses in IPv6
+ * packets or appear in any Routing header.
+ */
+ if (ipv6_addr_is_multicast(&hdr->saddr))
+ goto err;
+
+ skb->transport_header = skb->network_header + sizeof(*hdr);
+ IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ pkt_len = ntohs(hdr->payload_len);
+
+ /* pkt_len may be zero if Jumbo payload option is present */
+ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
+ IP6_INC_STATS_BH(net,
+ idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ }
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+ hdr = ipv6_hdr(skb);
+ }
+
+ if (hdr->nexthdr == NEXTHDR_HOP) {
+ if (ipv6_parse_hopopts(skb) < 0) {
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+ rcu_read_unlock();
+ return NET_RX_DROP;
+ }
+ }
+
+ rcu_read_unlock();
+
+ /* Must drop socket now because of tproxy. */
+ skb_orphan(skb);
+
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb,
+ dev, NULL,
+ ip6_rcv_finish);
+err:
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+/*
+ * Deliver the packet to the host
+ */
+
+
+static int ip6_input_finish(struct sock *sk, struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ const struct inet6_protocol *ipprot;
+ struct inet6_dev *idev;
+ unsigned int nhoff;
+ int nexthdr;
+ bool raw;
+
+ /*
+ * Parse extension headers
+ */
+
+ rcu_read_lock();
+resubmit:
+ idev = ip6_dst_idev(skb_dst(skb));
+ if (!pskb_pull(skb, skb_transport_offset(skb)))
+ goto discard;
+ nhoff = IP6CB(skb)->nhoff;
+ nexthdr = skb_network_header(skb)[nhoff];
+
+ raw = raw6_local_deliver(skb, nexthdr);
+ ipprot = rcu_dereference(inet6_protos[nexthdr]);
+ if (ipprot) {
+ int ret;
+
+ if (ipprot->flags & INET6_PROTO_FINAL) {
+ const struct ipv6hdr *hdr;
+
+ /* Free reference early: we don't need it any more,
+ and it may hold ip_conntrack module loaded
+ indefinitely. */
+ nf_reset(skb);
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ hdr = ipv6_hdr(skb);
+ if (ipv6_addr_is_multicast(&hdr->daddr) &&
+ !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
+ &hdr->saddr) &&
+ !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
+ goto discard;
+ }
+ if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
+ !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard;
+
+ ret = ipprot->handler(skb);
+ if (ret > 0)
+ goto resubmit;
+ else if (ret == 0)
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ IP6_INC_STATS_BH(net, idev,
+ IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_UNK_NEXTHDR, nhoff);
+ }
+ kfree_skb(skb);
+ } else {
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+
+discard:
+ IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return 0;
+}
+
+
+int ip6_input(struct sk_buff *skb)
+{
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, NULL, skb,
+ skb->dev, NULL,
+ ip6_input_finish);
+}
+
+int ip6_mc_input(struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ bool deliver;
+
+ IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev),
+ ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
+ skb->len);
+
+ hdr = ipv6_hdr(skb);
+ deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+
+#ifdef CONFIG_IPV6_MROUTE
+ /*
+ * IPv6 multicast router mode is now supported ;)
+ */
+ if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+ !(ipv6_addr_type(&hdr->daddr) &
+ (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
+ likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
+ /*
+ * Okay, we try to forward - split and duplicate
+ * packets.
+ */
+ struct sk_buff *skb2;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+ /* Check for MLD */
+ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
+ /* Check if this is a mld message */
+ u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ /* Check if the value of Router Alert
+ * is for MLD (0x0000).
+ */
+ if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) {
+ deliver = false;
+
+ if (!ipv6_ext_hdr(nexthdr)) {
+ /* BUG */
+ goto out;
+ }
+ offset = ipv6_skip_exthdr(skb, sizeof(*hdr),
+ &nexthdr, &frag_off);
+ if (offset < 0)
+ goto out;
+
+ if (!ipv6_is_mld(skb, nexthdr, offset))
+ goto out;
+
+ deliver = true;
+ }
+ /* unknown RA - process it normally */
+ }
+
+ if (deliver)
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ else {
+ skb2 = skb;
+ skb = NULL;
+ }
+
+ if (skb2) {
+ ip6_mr_input(skb2);
+ }
+ }
+out:
+#endif
+ if (likely(deliver))
+ ip6_input(skb);
+ else {
+ /* discard */
+ kfree_skb(skb);
+ }
+
+ return 0;
+}
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
new file mode 100644
index 000000000..e893cd186
--- /dev/null
+++ b/net/ipv6/ip6_offload.c
@@ -0,0 +1,317 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/printk.h>
+
+#include <net/protocol.h>
+#include <net/ipv6.h>
+
+#include "ip6_offload.h"
+
+static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
+{
+ const struct net_offload *ops = NULL;
+
+ for (;;) {
+ struct ipv6_opt_hdr *opth;
+ int len;
+
+ if (proto != NEXTHDR_HOP) {
+ ops = rcu_dereference(inet6_offloads[proto]);
+
+ if (unlikely(!ops))
+ break;
+
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+ }
+
+ if (unlikely(!pskb_may_pull(skb, 8)))
+ break;
+
+ opth = (void *)skb->data;
+ len = ipv6_optlen(opth);
+
+ if (unlikely(!pskb_may_pull(skb, len)))
+ break;
+
+ opth = (void *)skb->data;
+ proto = opth->nexthdr;
+ __skb_pull(skb, len);
+ }
+
+ return proto;
+}
+
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct ipv6hdr *ipv6h;
+ const struct net_offload *ops;
+ int proto;
+ struct frag_hdr *fptr;
+ unsigned int unfrag_ip6hlen;
+ u8 *prevhdr;
+ int offset = 0;
+ bool encap, udpfrag;
+ int nhoff;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ SKB_GSO_TCPV6 |
+ 0)))
+ goto out;
+
+ skb_reset_network_header(skb);
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
+ goto out;
+
+ encap = SKB_GSO_CB(skb)->encap_level > 0;
+ if (encap)
+ features &= skb->dev->hw_enc_features;
+ SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);
+
+ ipv6h = ipv6_hdr(skb);
+ __skb_pull(skb, sizeof(*ipv6h));
+ segs = ERR_PTR(-EPROTONOSUPPORT);
+
+ proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
+
+ if (skb->encapsulation &&
+ skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
+ udpfrag = proto == IPPROTO_UDP && encap;
+ else
+ udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment)) {
+ skb_reset_transport_header(skb);
+ segs = ops->callbacks.gso_segment(skb, features);
+ }
+
+ if (IS_ERR(segs))
+ goto out;
+
+ for (skb = segs; skb; skb = skb->next) {
+ ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
+ ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h));
+ skb->network_header = (u8 *)ipv6h - skb->head;
+
+ if (udpfrag) {
+ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
+ fptr->frag_off = htons(offset);
+ if (skb->next)
+ fptr->frag_off |= htons(IP6_MF);
+ offset += (ntohs(ipv6h->payload_len) -
+ sizeof(struct frag_hdr));
+ }
+ if (encap)
+ skb_reset_inner_headers(skb);
+ }
+
+out:
+ return segs;
+}
+
+/* Return the total length of all the extension hdrs, following the same
+ * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs.
+ */
+static int ipv6_exthdrs_len(struct ipv6hdr *iph,
+ const struct net_offload **opps)
+{
+ struct ipv6_opt_hdr *opth = (void *)iph;
+ int len = 0, proto, optlen = sizeof(*iph);
+
+ proto = iph->nexthdr;
+ for (;;) {
+ if (proto != NEXTHDR_HOP) {
+ *opps = rcu_dereference(inet6_offloads[proto]);
+ if (unlikely(!(*opps)))
+ break;
+ if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+ }
+ opth = (void *)opth + optlen;
+ optlen = ipv6_optlen(opth);
+ len += optlen;
+ proto = opth->nexthdr;
+ }
+ return len;
+}
+
+static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_offload *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct ipv6hdr *iph;
+ unsigned int nlen;
+ unsigned int hlen;
+ unsigned int off;
+ u16 flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
+
+ skb_set_network_header(skb, off);
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ flush += ntohs(iph->payload_len) != skb_gro_len(skb);
+
+ rcu_read_lock();
+ proto = iph->nexthdr;
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive) {
+ __pskb_pull(skb, skb_gro_offset(skb));
+ proto = ipv6_gso_pull_exthdrs(skb, proto);
+ skb_gro_pull(skb, -skb_transport_offset(skb));
+ skb_reset_transport_header(skb);
+ __skb_push(skb, skb_gro_offset(skb));
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out_unlock;
+
+ iph = ipv6_hdr(skb);
+ }
+
+ NAPI_GRO_CB(skb)->proto = proto;
+
+ flush--;
+ nlen = skb_network_header_len(skb);
+
+ for (p = *head; p; p = p->next) {
+ const struct ipv6hdr *iph2;
+ __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = (struct ipv6hdr *)(p->data + off);
+ first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+ /* All fields must match except length and Traffic Class.
+ * XXX skbs on the gro_list have all been parsed and pulled
+ * already so we don't need to compare nlen
+ * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
+ * memcmp() alone below is suffcient, right?
+ */
+ if ((first_word & htonl(0xF00FFFFF)) ||
+ memcmp(&iph->nexthdr, &iph2->nexthdr,
+ nlen - offsetof(struct ipv6hdr, nexthdr))) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ /* flush if Traffic Class fields are different */
+ NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+ NAPI_GRO_CB(p)->flush |= flush;
+
+ /* Clear flush_id, there's really no concept of ID in IPv6. */
+ NAPI_GRO_CB(p)->flush_id = 0;
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ skb_gro_postpull_rcsum(skb, iph, nlen);
+
+ pp = ops->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct net_offload *ops;
+ struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+ int err = -ENOSYS;
+
+ iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
+
+ rcu_read_lock();
+
+ nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ goto out_unlock;
+
+ err = ops->callbacks.gro_complete(skb, nhoff);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static struct packet_offload ipv6_packet_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IPV6),
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = ipv6_gro_receive,
+ .gro_complete = ipv6_gro_complete,
+ },
+};
+
+static const struct net_offload sit_offload = {
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = ipv6_gro_receive,
+ .gro_complete = ipv6_gro_complete,
+ },
+};
+
+static int __init ipv6_offload_init(void)
+{
+
+ if (tcpv6_offload_init() < 0)
+ pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+ if (udp_offload_init() < 0)
+ pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
+ if (ipv6_exthdrs_offload_init() < 0)
+ pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
+
+ dev_add_offload(&ipv6_packet_offload);
+
+ inet_add_offload(&sit_offload, IPPROTO_IPV6);
+
+ return 0;
+}
+
+fs_initcall(ipv6_offload_init);
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
new file mode 100644
index 000000000..2e155c651
--- /dev/null
+++ b/net/ipv6/ip6_offload.h
@@ -0,0 +1,18 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef __ip6_offload_h
+#define __ip6_offload_h
+
+int ipv6_exthdrs_offload_init(void);
+int udp_offload_init(void);
+int tcpv6_offload_init(void);
+
+#endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
new file mode 100644
index 000000000..bc09cb97b
--- /dev/null
+++ b/net/ipv6/ip6_output.c
@@ -0,0 +1,1762 @@
+/*
+ * IPv6 output functions
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on linux/net/ipv4/ip_output.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * A.N.Kuznetsov : airthmetics in fragmentation.
+ * extension headers are implemented.
+ * route changes now work.
+ * ip6_forward does not confuse sniffers.
+ * etc.
+ *
+ * H. von Brand : Added missing #include <linux/string.h>
+ * Imran Patel : frag id should be in NBO
+ * Kazunori MIYAZAWA @USAGI
+ * : add ip6_append_data and related functions
+ * for datagram xmit
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/tcp.h>
+#include <linux/route.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/rawv6.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/checksum.h>
+#include <linux/mroute6.h>
+
+static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst->dev;
+ struct neighbour *neigh;
+ struct in6_addr *nexthop;
+ int ret;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
+ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
+ if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
+ ((mroute6_socket(dev_net(dev), skb) &&
+ !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
+ ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr))) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+
+ /* Do not check for IFF_ALLMULTI; multicast routing
+ is not supported in any case.
+ */
+ if (newskb)
+ NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ sk, newskb, NULL, newskb->dev,
+ dev_loopback_xmit);
+
+ if (ipv6_hdr(skb)->hop_limit == 0) {
+ IP6_INC_STATS(dev_net(dev), idev,
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
+ skb->len);
+
+ if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
+ IPV6_ADDR_SCOPE_NODELOCAL &&
+ !(dev->flags & IFF_LOOPBACK)) {
+ kfree_skb(skb);
+ return 0;
+ }
+ }
+
+ rcu_read_lock_bh();
+ nexthop = rt6_nexthop((struct rt6_info *)dst);
+ neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
+ if (!IS_ERR(neigh)) {
+ ret = dst_neigh_output(dst, neigh, skb);
+ rcu_read_unlock_bh();
+ return ret;
+ }
+ rcu_read_unlock_bh();
+
+ IP6_INC_STATS(dev_net(dst->dev),
+ ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
+{
+ if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+ dst_allfrag(skb_dst(skb)) ||
+ (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
+ return ip6_fragment(sk, skb, ip6_finish_output2);
+ else
+ return ip6_finish_output2(sk, skb);
+}
+
+int ip6_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ if (unlikely(idev->cnf.disable_ipv6)) {
+ IP6_INC_STATS(dev_net(dev), idev,
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return 0;
+ }
+
+ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
+ NULL, dev,
+ ip6_finish_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+}
+
+/*
+ * xmit an sk_buff (used by TCP, SCTP and DCCP)
+ */
+
+int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
+ struct ipv6_txoptions *opt, int tclass)
+{
+ struct net *net = sock_net(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *first_hop = &fl6->daddr;
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *hdr;
+ u8 proto = fl6->flowi6_proto;
+ int seg_len = skb->len;
+ int hlimit = -1;
+ u32 mtu;
+
+ if (opt) {
+ unsigned int head_room;
+
+ /* First: exthdrs may take lots of space (~8K for now)
+ MAX_HEADER is not enough.
+ */
+ head_room = opt->opt_nflen + opt->opt_flen;
+ seg_len += head_room;
+ head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+
+ if (skb_headroom(skb) < head_room) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
+ if (!skb2) {
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+ consume_skb(skb);
+ skb = skb2;
+ skb_set_owner_w(skb, sk);
+ }
+ if (opt->opt_flen)
+ ipv6_push_frag_opts(skb, opt, &proto);
+ if (opt->opt_nflen)
+ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+ }
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
+
+ /*
+ * Fill in the IPv6 header
+ */
+ if (np)
+ hlimit = np->hop_limit;
+ if (hlimit < 0)
+ hlimit = ip6_dst_hoplimit(dst);
+
+ ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
+ np->autoflowlabel));
+
+ hdr->payload_len = htons(seg_len);
+ hdr->nexthdr = proto;
+ hdr->hop_limit = hlimit;
+
+ hdr->saddr = fl6->saddr;
+ hdr->daddr = *first_hop;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ mtu = dst_mtu(dst);
+ if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
+ IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUT, skb->len);
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, dst->dev, dst_output_sk);
+ }
+
+ skb->dev = dst->dev;
+ ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ip6_xmit);
+
+static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
+{
+ struct ip6_ra_chain *ra;
+ struct sock *last = NULL;
+
+ read_lock(&ip6_ra_lock);
+ for (ra = ip6_ra_chain; ra; ra = ra->next) {
+ struct sock *sk = ra->sk;
+ if (sk && ra->sel == sel &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == skb->dev->ifindex)) {
+ if (last) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ rawv6_rcv(last, skb2);
+ }
+ last = sk;
+ }
+ }
+
+ if (last) {
+ rawv6_rcv(last, skb);
+ read_unlock(&ip6_ra_lock);
+ return 1;
+ }
+ read_unlock(&ip6_ra_lock);
+ return 0;
+}
+
+static int ip6_forward_proxy_check(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ u8 nexthdr = hdr->nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ if (ipv6_ext_hdr(nexthdr)) {
+ offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
+ if (offset < 0)
+ return 0;
+ } else
+ offset = sizeof(struct ipv6hdr);
+
+ if (nexthdr == IPPROTO_ICMPV6) {
+ struct icmp6hdr *icmp6;
+
+ if (!pskb_may_pull(skb, (skb_network_header(skb) +
+ offset + 1 - skb->data)))
+ return 0;
+
+ icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+
+ switch (icmp6->icmp6_type) {
+ case NDISC_ROUTER_SOLICITATION:
+ case NDISC_ROUTER_ADVERTISEMENT:
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ case NDISC_REDIRECT:
+ /* For reaction involving unicast neighbor discovery
+ * message destined to the proxied address, pass it to
+ * input function.
+ */
+ return 1;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * The proxying router can't forward traffic sent to a link-local
+ * address, so signal the sender and discard the packet. This
+ * behavior is clarified by the MIPv6 specification.
+ */
+ if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
+ dst_link_failure(skb);
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+ skb_sender_cpu_clear(skb);
+ return dst_output_sk(sk, skb);
+}
+
+static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+{
+ unsigned int mtu;
+ struct inet6_dev *idev;
+
+ if (dst_metric_locked(dst, RTAX_MTU)) {
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+ if (mtu)
+ return mtu;
+ }
+
+ mtu = IPV6_MIN_MTU;
+ rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ if (idev)
+ mtu = idev->cnf.mtu6;
+ rcu_read_unlock();
+
+ return mtu;
+}
+
+static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
+ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
+ return true;
+
+ if (skb->ignore_df)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+int ip6_forward(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(dst->dev);
+ u32 mtu;
+
+ if (net->ipv6.devconf_all->forwarding == 0)
+ goto error;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ skb_forward_csum(skb);
+
+ /*
+ * We DO NOT make any processing on
+ * RA packets, pushing them to user level AS IS
+ * without ane WARRANTY that application will be able
+ * to interpret them. The reason is that we
+ * cannot make anything clever here.
+ *
+ * We are not end-node, so that if packet contains
+ * AH/ESP, we cannot make anything.
+ * Defragmentation also would be mistake, RA packets
+ * cannot be fragmented, because there is no warranty
+ * that different fragments will go along one path. --ANK
+ */
+ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
+ if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
+ return 0;
+ }
+
+ /*
+ * check and decrement ttl
+ */
+ if (hdr->hop_limit <= 1) {
+ /* Force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INHDRERRORS);
+
+ kfree_skb(skb);
+ return -ETIMEDOUT;
+ }
+
+ /* XXX: idev->cnf.proxy_ndp? */
+ if (net->ipv6.devconf_all->proxy_ndp &&
+ pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
+ int proxied = ip6_forward_proxy_check(skb);
+ if (proxied > 0)
+ return ip6_input(skb);
+ else if (proxied < 0) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ }
+
+ if (!xfrm6_route_forward(skb)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+ dst = skb_dst(skb);
+
+ /* IPv6 specs say nothing about it, but it is clear that we cannot
+ send redirects to source routed frames.
+ We don't send redirects to frames decapsulated from IPsec.
+ */
+ if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
+ struct in6_addr *target = NULL;
+ struct inet_peer *peer;
+ struct rt6_info *rt;
+
+ /*
+ * incoming and outgoing devices are the same
+ * send a redirect.
+ */
+
+ rt = (struct rt6_info *) dst;
+ if (rt->rt6i_flags & RTF_GATEWAY)
+ target = &rt->rt6i_gateway;
+ else
+ target = &hdr->daddr;
+
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+
+ /* Limit redirects both by destination (here)
+ and by source (inside ndisc_send_redirect)
+ */
+ if (inet_peer_xrlim_allow(peer, 1*HZ))
+ ndisc_send_redirect(skb, target);
+ if (peer)
+ inet_putpeer(peer);
+ } else {
+ int addrtype = ipv6_addr_type(&hdr->saddr);
+
+ /* This check is security critical. */
+ if (addrtype == IPV6_ADDR_ANY ||
+ addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
+ goto error;
+ if (addrtype & IPV6_ADDR_LINKLOCAL) {
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH,
+ ICMPV6_NOT_NEIGHBOUR, 0);
+ goto error;
+ }
+ }
+
+ mtu = ip6_dst_mtu_forward(dst);
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ if (ip6_pkt_too_big(skb, mtu)) {
+ /* Again, force OUTPUT device used as source address */
+ skb->dev = dst->dev;
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_INTOOBIGERRORS);
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ if (skb_cow(skb, dst->dev->hard_header_len)) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
+ IPSTATS_MIB_OUTDISCARDS);
+ goto drop;
+ }
+
+ hdr = ipv6_hdr(skb);
+
+ /* Mangling hops number delayed to point after skb COW */
+
+ hdr->hop_limit--;
+
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
+ skb->dev, dst->dev,
+ ip6_forward_finish);
+
+error:
+ IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+ to->pkt_type = from->pkt_type;
+ to->priority = from->priority;
+ to->protocol = from->protocol;
+ skb_dst_drop(to);
+ skb_dst_set(to, dst_clone(skb_dst(from)));
+ to->dev = from->dev;
+ to->mark = from->mark;
+
+#ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+#endif
+ nf_copy(to, from);
+ skb_copy_secmark(to, from);
+}
+
+int ip6_fragment(struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct sock *, struct sk_buff *))
+{
+ struct sk_buff *frag;
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+ inet6_sk(skb->sk) : NULL;
+ struct ipv6hdr *tmp_hdr;
+ struct frag_hdr *fh;
+ unsigned int mtu, hlen, left, len;
+ int hroom, troom;
+ __be32 frag_id = 0;
+ int ptr, offset = 0, err = 0;
+ u8 *prevhdr, nexthdr = 0;
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ nexthdr = *prevhdr;
+
+ mtu = ip6_skb_dst_mtu(skb);
+
+ /* We must not fragment if the socket is set to force MTU discovery
+ * or if the skb it not generated by a local socket.
+ */
+ if (unlikely(!skb->ignore_df && skb->len > mtu) ||
+ (IP6CB(skb)->frag_max_size &&
+ IP6CB(skb)->frag_max_size > mtu)) {
+ if (skb->sk && dst_allfrag(skb_dst(skb)))
+ sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
+
+ skb->dev = skb_dst(skb)->dev;
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ if (np && np->frag_size < mtu) {
+ if (np->frag_size)
+ mtu = np->frag_size;
+ }
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ if (skb_has_frag_list(skb)) {
+ int first_len = skb_pagelen(skb);
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu ||
+ ((first_len - hlen) & 7) ||
+ skb_cloned(skb))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ /* Correct geometry. */
+ if (frag->len > mtu ||
+ ((frag->len & 7) && frag->next) ||
+ skb_headroom(frag) < hlen)
+ goto slow_path_clean;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag))
+ goto slow_path_clean;
+
+ BUG_ON(frag->sk);
+ if (skb->sk) {
+ frag->sk = skb->sk;
+ frag->destructor = sock_wfree;
+ }
+ skb->truesize -= frag->truesize;
+ }
+
+ err = 0;
+ offset = 0;
+ frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+ /* BUILD HEADER */
+
+ *prevhdr = NEXTHDR_FRAGMENT;
+ tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!tmp_hdr) {
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGFAILS);
+ return -ENOMEM;
+ }
+
+ __skb_pull(skb, hlen);
+ fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), tmp_hdr, hlen);
+
+ ipv6_select_ident(net, fh, rt);
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ frag_id = fh->identification;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len -
+ sizeof(struct ipv6hdr));
+
+ dst_hold(&rt->dst);
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down. */
+ if (frag) {
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), tmp_hdr,
+ hlen);
+ offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = frag_id;
+ ipv6_hdr(frag)->payload_len =
+ htons(frag->len -
+ sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+ }
+
+ err = output(sk, skb);
+ if (!err)
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGCREATES);
+
+ if (err || !frag)
+ break;
+
+ skb = frag;
+ frag = skb->next;
+ skb->next = NULL;
+ }
+
+ kfree(tmp_hdr);
+
+ if (err == 0) {
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGOKS);
+ ip6_rt_put(rt);
+ return 0;
+ }
+
+ kfree_skb_list(frag);
+
+ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+ IPSTATS_MIB_FRAGFAILS);
+ ip6_rt_put(rt);
+ return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
+ }
+
+slow_path:
+ if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
+ skb_checksum_help(skb))
+ goto fail;
+
+ left = skb->len - hlen; /* Space per frame */
+ ptr = hlen; /* Where to start from */
+
+ /*
+ * Fragment the datagram.
+ */
+
+ *prevhdr = NEXTHDR_FRAGMENT;
+ hroom = LL_RESERVED_SPACE(rt->dst.dev);
+ troom = rt->dst.dev->needed_tailroom;
+
+ /*
+ * Keep copying data until we run out.
+ */
+ while (left > 0) {
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
+ hroom + troom, GFP_ATOMIC);
+ if (!frag) {
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGFAILS);
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, hroom);
+ skb_put(frag, len + hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
+ frag->transport_header = (frag->network_header + hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ if (!frag_id) {
+ ipv6_select_ident(net, fh, rt);
+ frag_id = fh->identification;
+ } else
+ fh->identification = frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
+ len));
+ left -= len;
+
+ fh->frag_off = htons(offset);
+ if (left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len -
+ sizeof(struct ipv6hdr));
+
+ ptr += len;
+ offset += len;
+
+ /*
+ * Put this fragment into the sending queue.
+ */
+ err = output(sk, frag);
+ if (err)
+ goto fail;
+
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGCREATES);
+ }
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGOKS);
+ consume_skb(skb);
+ return err;
+
+fail:
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_FRAGFAILS);
+ kfree_skb(skb);
+ return err;
+}
+
+static inline int ip6_rt_check(const struct rt6key *rt_key,
+ const struct in6_addr *fl_addr,
+ const struct in6_addr *addr_cache)
+{
+ return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+ (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
+}
+
+static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
+ struct dst_entry *dst,
+ const struct flowi6 *fl6)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct rt6_info *rt;
+
+ if (!dst)
+ goto out;
+
+ if (dst->ops->family != AF_INET6) {
+ dst_release(dst);
+ return NULL;
+ }
+
+ rt = (struct rt6_info *)dst;
+ /* Yes, checking route validity in not connected
+ * case is not very simple. Take into account,
+ * that we do not support routing by source, TOS,
+ * and MSG_DONTROUTE --ANK (980726)
+ *
+ * 1. ip6_rt_check(): If route was host route,
+ * check that cached destination is current.
+ * If it is network route, we still may
+ * check its validity using saved pointer
+ * to the last used address: daddr_cache.
+ * We do not want to save whole address now,
+ * (because main consumer of this service
+ * is tcp, which has not this problem),
+ * so that the last trick works only on connected
+ * sockets.
+ * 2. oif also should be the same.
+ */
+ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
+#ifdef CONFIG_IPV6_SUBTREES
+ ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
+#endif
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+ dst_release(dst);
+ dst = NULL;
+ }
+
+out:
+ return dst;
+}
+
+static int ip6_dst_lookup_tail(struct sock *sk,
+ struct dst_entry **dst, struct flowi6 *fl6)
+{
+ struct net *net = sock_net(sk);
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ struct neighbour *n;
+ struct rt6_info *rt;
+#endif
+ int err;
+
+ /* The correct way to handle this would be to do
+ * ip6_route_get_saddr, and then ip6_route_output; however,
+ * the route-specific preferred source forces the
+ * ip6_route_output call _before_ ip6_route_get_saddr.
+ *
+ * In source specific routing (no src=any default route),
+ * ip6_route_output will fail given src=any saddr, though, so
+ * that's why we try it again later.
+ */
+ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ struct rt6_info *rt;
+ bool had_dst = *dst != NULL;
+
+ if (!had_dst)
+ *dst = ip6_route_output(net, sk, fl6);
+ rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
+ err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+ sk ? inet6_sk(sk)->srcprefs : 0,
+ &fl6->saddr);
+ if (err)
+ goto out_err_release;
+
+ /* If we had an erroneous initial result, pretend it
+ * never existed and let the SA-enabled version take
+ * over.
+ */
+ if (!had_dst && (*dst)->error) {
+ dst_release(*dst);
+ *dst = NULL;
+ }
+ }
+
+ if (!*dst)
+ *dst = ip6_route_output(net, sk, fl6);
+
+ err = (*dst)->error;
+ if (err)
+ goto out_err_release;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ /*
+ * Here if the dst entry we've looked up
+ * has a neighbour entry that is in the INCOMPLETE
+ * state and the src address from the flow is
+ * marked as OPTIMISTIC, we release the found
+ * dst entry and replace it instead with the
+ * dst entry of the nexthop router
+ */
+ rt = (struct rt6_info *) *dst;
+ rcu_read_lock_bh();
+ n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
+ err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
+ rcu_read_unlock_bh();
+
+ if (err) {
+ struct inet6_ifaddr *ifp;
+ struct flowi6 fl_gw6;
+ int redirect;
+
+ ifp = ipv6_get_ifaddr(net, &fl6->saddr,
+ (*dst)->dev, 1);
+
+ redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
+ if (ifp)
+ in6_ifa_put(ifp);
+
+ if (redirect) {
+ /*
+ * We need to get the dst entry for the
+ * default router instead
+ */
+ dst_release(*dst);
+ memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
+ memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
+ *dst = ip6_route_output(net, sk, &fl_gw6);
+ err = (*dst)->error;
+ if (err)
+ goto out_err_release;
+ }
+ }
+#endif
+
+ return 0;
+
+out_err_release:
+ if (err == -ENETUNREACH)
+ IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
+ dst_release(*dst);
+ *dst = NULL;
+ return err;
+}
+
+/**
+ * ip6_dst_lookup - perform route lookup on flow
+ * @sk: socket which provides route info
+ * @dst: pointer to dst_entry * for result
+ * @fl6: flow to lookup
+ *
+ * This function performs a route lookup on the given flow.
+ *
+ * It returns zero on success, or a standard errno code on error.
+ */
+int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+{
+ *dst = NULL;
+ return ip6_dst_lookup_tail(sk, dst, fl6);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
+/**
+ * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ * @sk: socket which provides route info
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ *
+ * This function performs a route lookup on the given flow.
+ *
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
+ */
+struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ struct dst_entry *dst = NULL;
+ int err;
+
+ err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+ if (final_dst)
+ fl6->daddr = *final_dst;
+
+ return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
+
+/**
+ * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
+ * @sk: socket which provides the dst cache and route info
+ * @fl6: flow to lookup
+ * @final_dst: final destination address for ipsec lookup
+ *
+ * This function performs a route lookup on the given flow with the
+ * possibility of using the cached route in the socket if it is valid.
+ * It will take the socket dst lock when operating on the dst cache.
+ * As a result, this function can only be used in process context.
+ *
+ * It returns a valid dst pointer on success, or a pointer encoded
+ * error code.
+ */
+struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
+ int err;
+
+ dst = ip6_sk_dst_check(sk, dst, fl6);
+
+ err = ip6_dst_lookup_tail(sk, &dst, fl6);
+ if (err)
+ return ERR_PTR(err);
+ if (final_dst)
+ fl6->daddr = *final_dst;
+
+ return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
+
+static inline int ip6_ufo_append_data(struct sock *sk,
+ struct sk_buff_head *queue,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int hh_len, int fragheaderlen,
+ int transhdrlen, int mtu, unsigned int flags,
+ struct rt6_info *rt)
+
+{
+ struct sk_buff *skb;
+ struct frag_hdr fhdr;
+ int err;
+
+ /* There is support for UDP large send offload by network
+ * device, so create one single skb packet containing complete
+ * udp datagram
+ */
+ skb = skb_peek_tail(queue);
+ if (!skb) {
+ skb = sock_alloc_send_skb(sk,
+ hh_len + fragheaderlen + transhdrlen + 20,
+ (flags & MSG_DONTWAIT), &err);
+ if (!skb)
+ return err;
+
+ /* reserve space for Hardware header */
+ skb_reserve(skb, hh_len);
+
+ /* create space for UDP/IP header */
+ skb_put(skb, fragheaderlen + transhdrlen);
+
+ /* initialize network header pointer */
+ skb_reset_network_header(skb);
+
+ /* initialize protocol header pointer */
+ skb->transport_header = skb->network_header + fragheaderlen;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->csum = 0;
+
+ __skb_queue_tail(queue, skb);
+ } else if (skb_is_gso(skb)) {
+ goto append;
+ }
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ /* Specify the length of each IPv6 datagram fragment.
+ * It has to be a multiple of 8.
+ */
+ skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
+ sizeof(struct frag_hdr)) & ~7;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+ ipv6_select_ident(sock_net(sk), &fhdr, rt);
+ skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+
+append:
+ return skb_append_datato_frags(sk, skb, getfrag, from,
+ (length - transhdrlen));
+}
+
+static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
+ gfp_t gfp)
+{
+ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+
+static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
+ gfp_t gfp)
+{
+ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
+}
+
+static void ip6_append_data_mtu(unsigned int *mtu,
+ int *maxfraglen,
+ unsigned int fragheaderlen,
+ struct sk_buff *skb,
+ struct rt6_info *rt,
+ unsigned int orig_mtu)
+{
+ if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
+ if (!skb) {
+ /* first fragment, reserve header_len */
+ *mtu = orig_mtu - rt->dst.header_len;
+
+ } else {
+ /*
+ * this fragment is not first, the headers
+ * space is regarded as data space.
+ */
+ *mtu = orig_mtu;
+ }
+ *maxfraglen = ((*mtu - fragheaderlen) & ~7)
+ + fragheaderlen - sizeof(struct frag_hdr);
+ }
+}
+
+static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork,
+ int hlimit, int tclass, struct ipv6_txoptions *opt,
+ struct rt6_info *rt, struct flowi6 *fl6)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ unsigned int mtu;
+
+ /*
+ * setup for corking
+ */
+ if (opt) {
+ if (WARN_ON(v6_cork->opt))
+ return -EINVAL;
+
+ v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
+ if (unlikely(!v6_cork->opt))
+ return -ENOBUFS;
+
+ v6_cork->opt->tot_len = opt->tot_len;
+ v6_cork->opt->opt_flen = opt->opt_flen;
+ v6_cork->opt->opt_nflen = opt->opt_nflen;
+
+ v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
+ sk->sk_allocation);
+ if (opt->dst0opt && !v6_cork->opt->dst0opt)
+ return -ENOBUFS;
+
+ v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
+ sk->sk_allocation);
+ if (opt->dst1opt && !v6_cork->opt->dst1opt)
+ return -ENOBUFS;
+
+ v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
+ sk->sk_allocation);
+ if (opt->hopopt && !v6_cork->opt->hopopt)
+ return -ENOBUFS;
+
+ v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
+ sk->sk_allocation);
+ if (opt->srcrt && !v6_cork->opt->srcrt)
+ return -ENOBUFS;
+
+ /* need source address above miyazawa*/
+ }
+ dst_hold(&rt->dst);
+ cork->base.dst = &rt->dst;
+ cork->fl.u.ip6 = *fl6;
+ v6_cork->hop_limit = hlimit;
+ v6_cork->tclass = tclass;
+ if (rt->dst.flags & DST_XFRM_TUNNEL)
+ mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ rt->dst.dev->mtu : dst_mtu(&rt->dst);
+ else
+ mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ if (np->frag_size < mtu) {
+ if (np->frag_size)
+ mtu = np->frag_size;
+ }
+ cork->base.fragsize = mtu;
+ if (dst_allfrag(rt->dst.path))
+ cork->base.flags |= IPCORK_ALLFRAG;
+ cork->base.length = 0;
+
+ return 0;
+}
+
+static int __ip6_append_data(struct sock *sk,
+ struct flowi6 *fl6,
+ struct sk_buff_head *queue,
+ struct inet_cork *cork,
+ struct inet6_cork *v6_cork,
+ struct page_frag *pfrag,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ unsigned int flags, int dontfrag)
+{
+ struct sk_buff *skb, *skb_prev = NULL;
+ unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
+ int exthdrlen = 0;
+ int dst_exthdrlen = 0;
+ int hh_len;
+ int copy;
+ int err;
+ int offset = 0;
+ __u8 tx_flags = 0;
+ u32 tskey = 0;
+ struct rt6_info *rt = (struct rt6_info *)cork->dst;
+ struct ipv6_txoptions *opt = v6_cork->opt;
+ int csummode = CHECKSUM_NONE;
+
+ skb = skb_peek_tail(queue);
+ if (!skb) {
+ exthdrlen = opt ? opt->opt_flen : 0;
+ dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
+ }
+
+ mtu = cork->fragsize;
+ orig_mtu = mtu;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+ fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
+ (opt ? opt->opt_nflen : 0);
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
+ sizeof(struct frag_hdr);
+
+ if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+ unsigned int maxnonfragsize, headersize;
+
+ headersize = sizeof(struct ipv6hdr) +
+ (opt ? opt->opt_flen + opt->opt_nflen : 0) +
+ (dst_allfrag(&rt->dst) ?
+ sizeof(struct frag_hdr) : 0) +
+ rt->rt6i_nfheader_len;
+
+ if (ip6_sk_ignore_df(sk))
+ maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
+ else
+ maxnonfragsize = mtu;
+
+ /* dontfrag active */
+ if ((cork->length + length > mtu - headersize) && dontfrag &&
+ (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_RAW)) {
+ ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
+ sizeof(struct ipv6hdr));
+ goto emsgsize;
+ }
+
+ if (cork->length + length > maxnonfragsize - headersize) {
+emsgsize:
+ ipv6_local_error(sk, EMSGSIZE, fl6,
+ mtu - headersize +
+ sizeof(struct ipv6hdr));
+ return -EMSGSIZE;
+ }
+ }
+
+ if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
+ sock_tx_timestamp(sk, &tx_flags);
+ if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
+ }
+
+ /* If this is the first and only packet and device
+ * supports checksum offloading, let's use it.
+ * Use transhdrlen, same as IPv4, because partial
+ * sums only work when transhdrlen is set.
+ */
+ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
+ length + fragheaderlen < mtu &&
+ rt->dst.dev->features & NETIF_F_V6_CSUM &&
+ !exthdrlen)
+ csummode = CHECKSUM_PARTIAL;
+ /*
+ * Let's try using as much space as possible.
+ * Use MTU if total length of the message fits into the MTU.
+ * Otherwise, we need to reserve fragment header and
+ * fragment alignment (= 8-15 octects, in total).
+ *
+ * Note that we may need to "move" the data from the tail of
+ * of the buffer to the new fragment when we split
+ * the message.
+ *
+ * FIXME: It may be fragmented into multiple chunks
+ * at once if non-fragmentable extension headers
+ * are too large.
+ * --yoshfuji
+ */
+
+ cork->length += length;
+ if (((length > mtu) ||
+ (skb && skb_is_gso(skb))) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO) &&
+ (sk->sk_type == SOCK_DGRAM)) {
+ err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
+ hh_len, fragheaderlen,
+ transhdrlen, mtu, flags, rt);
+ if (err)
+ goto error;
+ return 0;
+ }
+
+ if (!skb)
+ goto alloc_new_skb;
+
+ while (length > 0) {
+ /* Check if the remaining data fits into current packet. */
+ copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+ if (copy < length)
+ copy = maxfraglen - skb->len;
+
+ if (copy <= 0) {
+ char *data;
+ unsigned int datalen;
+ unsigned int fraglen;
+ unsigned int fraggap;
+ unsigned int alloclen;
+alloc_new_skb:
+ /* There's no room in the current skb */
+ if (skb)
+ fraggap = skb->len - maxfraglen;
+ else
+ fraggap = 0;
+ /* update mtu and maxfraglen if necessary */
+ if (!skb || !skb_prev)
+ ip6_append_data_mtu(&mtu, &maxfraglen,
+ fragheaderlen, skb, rt,
+ orig_mtu);
+
+ skb_prev = skb;
+
+ /*
+ * If remaining data exceeds the mtu,
+ * we know we need more fragment(s).
+ */
+ datalen = length + fraggap;
+
+ if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+ datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else
+ alloclen = datalen + fragheaderlen;
+
+ alloclen += dst_exthdrlen;
+
+ if (datalen != length + fraggap) {
+ /*
+ * this is not the last fragment, the trailer
+ * space is regarded as data space.
+ */
+ datalen += rt->dst.trailer_len;
+ }
+
+ alloclen += rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
+ /*
+ * We just reserve space for fragment header.
+ * Note: this may be overallocation if the message
+ * (without MSG_MORE) fits into the MTU.
+ */
+ alloclen += sizeof(struct frag_hdr);
+
+ if (transhdrlen) {
+ skb = sock_alloc_send_skb(sk,
+ alloclen + hh_len,
+ (flags & MSG_DONTWAIT), &err);
+ } else {
+ skb = NULL;
+ if (atomic_read(&sk->sk_wmem_alloc) <=
+ 2 * sk->sk_sndbuf)
+ skb = sock_wmalloc(sk,
+ alloclen + hh_len, 1,
+ sk->sk_allocation);
+ if (unlikely(!skb))
+ err = -ENOBUFS;
+ }
+ if (!skb)
+ goto error;
+ /*
+ * Fill in the control structures
+ */
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ /* reserve for fragmentation and ipsec header */
+ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
+ dst_exthdrlen);
+
+ /* Only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = tx_flags;
+ tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+
+ /*
+ * Find where to start putting bytes
+ */
+ data = skb_put(skb, fraglen);
+ skb_set_network_header(skb, exthdrlen);
+ data += fragheaderlen;
+ skb->transport_header = (skb->network_header +
+ fragheaderlen);
+ if (fraggap) {
+ skb->csum = skb_copy_and_csum_bits(
+ skb_prev, maxfraglen,
+ data + transhdrlen, fraggap, 0);
+ skb_prev->csum = csum_sub(skb_prev->csum,
+ skb->csum);
+ data += fraggap;
+ pskb_trim_unique(skb_prev, maxfraglen);
+ }
+ copy = datalen - transhdrlen - fraggap;
+
+ if (copy < 0) {
+ err = -EINVAL;
+ kfree_skb(skb);
+ goto error;
+ } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ offset += copy;
+ length -= datalen - fraggap;
+ transhdrlen = 0;
+ exthdrlen = 0;
+ dst_exthdrlen = 0;
+
+ /*
+ * Put the packet on the pending queue
+ */
+ __skb_queue_tail(queue, skb);
+ continue;
+ }
+
+ if (copy > length)
+ copy = length;
+
+ if (!(rt->dst.dev->features&NETIF_F_SG)) {
+ unsigned int off;
+
+ off = skb->len;
+ if (getfrag(from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
+ __skb_trim(skb, off);
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ int i = skb_shinfo(skb)->nr_frags;
+
+ err = -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto error;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ err = -EMSGSIZE;
+ if (i == MAX_SKB_FRAGS)
+ goto error;
+
+ __skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, 0);
+ skb_shinfo(skb)->nr_frags = ++i;
+ get_page(pfrag->page);
+ }
+ copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ if (getfrag(from,
+ page_address(pfrag->page) + pfrag->offset,
+ offset, copy, skb->len, skb) < 0)
+ goto error_efault;
+
+ pfrag->offset += copy;
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ atomic_add(copy, &sk->sk_wmem_alloc);
+ }
+ offset += copy;
+ length -= copy;
+ }
+
+ return 0;
+
+error_efault:
+ err = -EFAULT;
+error:
+ cork->length -= length;
+ IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ return err;
+}
+
+int ip6_append_data(struct sock *sk,
+ int getfrag(void *from, char *to, int offset, int len,
+ int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen, int hlimit,
+ int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
+ struct rt6_info *rt, unsigned int flags, int dontfrag)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ int exthdrlen;
+ int err;
+
+ if (flags&MSG_PROBE)
+ return 0;
+ if (skb_queue_empty(&sk->sk_write_queue)) {
+ /*
+ * setup for corking
+ */
+ err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
+ tclass, opt, rt, fl6);
+ if (err)
+ return err;
+
+ exthdrlen = (opt ? opt->opt_flen : 0);
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
+ } else {
+ fl6 = &inet->cork.fl.u.ip6;
+ transhdrlen = 0;
+ }
+
+ return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+ &np->cork, sk_page_frag(sk), getfrag,
+ from, length, transhdrlen, flags, dontfrag);
+}
+EXPORT_SYMBOL_GPL(ip6_append_data);
+
+static void ip6_cork_release(struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
+{
+ if (v6_cork->opt) {
+ kfree(v6_cork->opt->dst0opt);
+ kfree(v6_cork->opt->dst1opt);
+ kfree(v6_cork->opt->hopopt);
+ kfree(v6_cork->opt->srcrt);
+ kfree(v6_cork->opt);
+ v6_cork->opt = NULL;
+ }
+
+ if (cork->base.dst) {
+ dst_release(cork->base.dst);
+ cork->base.dst = NULL;
+ cork->base.flags &= ~IPCORK_ALLFRAG;
+ }
+ memset(&cork->fl, 0, sizeof(cork->fl));
+}
+
+struct sk_buff *__ip6_make_skb(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
+{
+ struct sk_buff *skb, *tmp_skb;
+ struct sk_buff **tail_skb;
+ struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ struct ipv6hdr *hdr;
+ struct ipv6_txoptions *opt = v6_cork->opt;
+ struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
+ struct flowi6 *fl6 = &cork->fl.u.ip6;
+ unsigned char proto = fl6->flowi6_proto;
+
+ skb = __skb_dequeue(queue);
+ if (!skb)
+ goto out;
+ tail_skb = &(skb_shinfo(skb)->frag_list);
+
+ /* move skb->data to ip header from ext header */
+ if (skb->data < skb_network_header(skb))
+ __skb_pull(skb, skb_network_offset(skb));
+ while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+ __skb_pull(tmp_skb, skb_network_header_len(skb));
+ *tail_skb = tmp_skb;
+ tail_skb = &(tmp_skb->next);
+ skb->len += tmp_skb->len;
+ skb->data_len += tmp_skb->len;
+ skb->truesize += tmp_skb->truesize;
+ tmp_skb->destructor = NULL;
+ tmp_skb->sk = NULL;
+ }
+
+ /* Allow local fragmentation. */
+ skb->ignore_df = ip6_sk_ignore_df(sk);
+
+ *final_dst = fl6->daddr;
+ __skb_pull(skb, skb_network_header_len(skb));
+ if (opt && opt->opt_flen)
+ ipv6_push_frag_opts(skb, opt, &proto);
+ if (opt && opt->opt_nflen)
+ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
+
+ ip6_flow_hdr(hdr, v6_cork->tclass,
+ ip6_make_flowlabel(net, skb, fl6->flowlabel,
+ np->autoflowlabel));
+ hdr->hop_limit = v6_cork->hop_limit;
+ hdr->nexthdr = proto;
+ hdr->saddr = fl6->saddr;
+ hdr->daddr = *final_dst;
+
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+ if (proto == IPPROTO_ICMPV6) {
+ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
+ ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ }
+
+ ip6_cork_release(cork, v6_cork);
+out:
+ return skb;
+}
+
+int ip6_send_skb(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ int err;
+
+ err = ip6_local_out(skb);
+ if (err) {
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ IP6_INC_STATS(net, rt->rt6i_idev,
+ IPSTATS_MIB_OUTDISCARDS);
+ }
+
+ return err;
+}
+
+int ip6_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ skb = ip6_finish_skb(sk);
+ if (!skb)
+ return 0;
+
+ return ip6_send_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
+
+static void __ip6_flush_pending_frames(struct sock *sk,
+ struct sk_buff_head *queue,
+ struct inet_cork_full *cork,
+ struct inet6_cork *v6_cork)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue_tail(queue)) != NULL) {
+ if (skb_dst(skb))
+ IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ }
+
+ ip6_cork_release(cork, v6_cork);
+}
+
+void ip6_flush_pending_frames(struct sock *sk)
+{
+ __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
+ &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
+}
+EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
+
+struct sk_buff *ip6_make_skb(struct sock *sk,
+ int getfrag(void *from, char *to, int offset,
+ int len, int odd, struct sk_buff *skb),
+ void *from, int length, int transhdrlen,
+ int hlimit, int tclass,
+ struct ipv6_txoptions *opt, struct flowi6 *fl6,
+ struct rt6_info *rt, unsigned int flags,
+ int dontfrag)
+{
+ struct inet_cork_full cork;
+ struct inet6_cork v6_cork;
+ struct sk_buff_head queue;
+ int exthdrlen = (opt ? opt->opt_flen : 0);
+ int err;
+
+ if (flags & MSG_PROBE)
+ return NULL;
+
+ __skb_queue_head_init(&queue);
+
+ cork.base.flags = 0;
+ cork.base.addr = 0;
+ cork.base.opt = NULL;
+ v6_cork.opt = NULL;
+ err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
+ if (err)
+ return ERR_PTR(err);
+
+ if (dontfrag < 0)
+ dontfrag = inet6_sk(sk)->dontfrag;
+
+ err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+ &current->task_frag, getfrag, from,
+ length + exthdrlen, transhdrlen + exthdrlen,
+ flags, dontfrag);
+ if (err) {
+ __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+ return ERR_PTR(err);
+ }
+
+ return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
new file mode 100644
index 000000000..5cafd92c2
--- /dev/null
+++ b/net/ipv6/ip6_tunnel.c
@@ -0,0 +1,1946 @@
+/*
+ * IPv6 tunneling device
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Ville Nuorvala <vnuorval@tcs.hut.fi>
+ * Yasuyuki Kozakai <kozakai@linux-ipv6.org>
+ *
+ * Based on:
+ * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
+ *
+ * RFC 2473
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/etherdevice.h>
+
+#include <asm/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/xfrm.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+MODULE_AUTHOR("Ville Nuorvala");
+MODULE_DESCRIPTION("IPv6 tunneling device");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("ip6tnl");
+MODULE_ALIAS_NETDEV("ip6tnl0");
+
+#define HASH_SIZE_SHIFT 5
+#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+{
+ u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
+
+ return hash_32(hash, HASH_SIZE_SHIFT);
+}
+
+static int ip6_tnl_dev_init(struct net_device *dev);
+static void ip6_tnl_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops ip6_link_ops __read_mostly;
+
+static int ip6_tnl_net_id __read_mostly;
+struct ip6_tnl_net {
+ /* the IPv6 tunnel fallback device */
+ struct net_device *fb_tnl_dev;
+ /* lists for storing tunnels in use */
+ struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_wc[1];
+ struct ip6_tnl __rcu **tnls[2];
+};
+
+static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+{
+ struct pcpu_sw_netstats tmp, sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ unsigned int start;
+ const struct pcpu_sw_netstats *tstats =
+ per_cpu_ptr(dev->tstats, i);
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ tmp.rx_packets = tstats->rx_packets;
+ tmp.rx_bytes = tstats->rx_bytes;
+ tmp.tx_packets = tstats->tx_packets;
+ tmp.tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ sum.rx_packets += tmp.rx_packets;
+ sum.rx_bytes += tmp.rx_bytes;
+ sum.tx_packets += tmp.tx_packets;
+ sum.tx_bytes += tmp.tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ */
+
+struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
+{
+ struct dst_entry *dst = t->dst_cache;
+
+ if (dst && dst->obsolete &&
+ !dst->ops->check(dst, t->dst_cookie)) {
+ t->dst_cache = NULL;
+ dst_release(dst);
+ return NULL;
+ }
+
+ return dst;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_check);
+
+void ip6_tnl_dst_reset(struct ip6_tnl *t)
+{
+ dst_release(t->dst_cache);
+ t->dst_cache = NULL;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
+
+void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ dst_release(t->dst_cache);
+ t->dst_cache = dst;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_dst_store);
+
+/**
+ * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @remote: the address of the tunnel exit-point
+ * @local: the address of the tunnel entry-point
+ *
+ * Return:
+ * tunnel matching given end-points if found,
+ * else fallback tunnel if its device is up,
+ * else %NULL
+ **/
+
+#define for_each_ip6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+static struct ip6_tnl *
+ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
+{
+ unsigned int hash = HASH(remote, local);
+ struct ip6_tnl *t;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct in6_addr any;
+
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ memset(&any, 0, sizeof(any));
+ hash = HASH(&any, local);
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ hash = HASH(remote, &any);
+ for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ t = rcu_dereference(ip6n->tnls_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
+ return t;
+
+ return NULL;
+}
+
+/**
+ * ip6_tnl_bucket - get head of list matching given tunnel parameters
+ * @p: parameters containing tunnel end-points
+ *
+ * Description:
+ * ip6_tnl_bucket() returns the head of the list matching the
+ * &struct in6_addr entries laddr and raddr in @p.
+ *
+ * Return: head of IPv6 tunnel list
+ **/
+
+static struct ip6_tnl __rcu **
+ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = 0;
+ int prio = 0;
+
+ if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
+ prio = 1;
+ h = HASH(remote, local);
+ }
+ return &ip6n->tnls[prio][h];
+}
+
+/**
+ * ip6_tnl_link - add tunnel to hash table
+ * @t: tunnel to be added
+ **/
+
+static void
+ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
+
+ rcu_assign_pointer(t->next , rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+/**
+ * ip6_tnl_unlink - remove tunnel from hash table
+ * @t: tunnel to be removed
+ **/
+
+static void
+ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = ip6_tnl_bucket(ip6n, &t->parms);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static void ip6_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+static int ip6_tnl_create2(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ int err;
+
+ t = netdev_priv(dev);
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(t->parms.name, dev->name);
+ dev->rtnl_link_ops = &ip6_link_ops;
+
+ dev_hold(dev);
+ ip6_tnl_link(ip6n, t);
+ return 0;
+
+out:
+ return err;
+}
+
+/**
+ * ip6_tnl_create - create a new tunnel
+ * @p: tunnel parameters
+ * @pt: pointer to new tunnel
+ *
+ * Description:
+ * Create tunnel matching given parameters.
+ *
+ * Return:
+ * created tunnel or error pointer
+ **/
+
+static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
+{
+ struct net_device *dev;
+ struct ip6_tnl *t;
+ char name[IFNAMSIZ];
+ int err = -ENOMEM;
+
+ if (p->name[0])
+ strlcpy(name, p->name, IFNAMSIZ);
+ else
+ sprintf(name, "ip6tnl%%d");
+
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ip6_tnl_dev_setup);
+ if (!dev)
+ goto failed;
+
+ dev_net_set(dev, net);
+
+ t = netdev_priv(dev);
+ t->parms = *p;
+ t->net = dev_net(dev);
+ err = ip6_tnl_create2(dev);
+ if (err < 0)
+ goto failed_free;
+
+ return t;
+
+failed_free:
+ ip6_dev_free(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+/**
+ * ip6_tnl_locate - find or create tunnel matching given parameters
+ * @p: tunnel parameters
+ * @create: != 0 if allowed to create new tunnel if no match found
+ *
+ * Description:
+ * ip6_tnl_locate() first tries to locate an existing tunnel
+ * based on @parms. If this is unsuccessful, but @create is set a new
+ * tunnel device is created and registered for use.
+ *
+ * Return:
+ * matching tunnel or error pointer
+ **/
+
+static struct ip6_tnl *ip6_tnl_locate(struct net *net,
+ struct __ip6_tnl_parm *p, int create)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *t;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ for (tp = ip6_tnl_bucket(ip6n, p);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr)) {
+ if (create)
+ return ERR_PTR(-EEXIST);
+
+ return t;
+ }
+ }
+ if (!create)
+ return ERR_PTR(-ENODEV);
+ return ip6_tnl_create(net, p);
+}
+
+/**
+ * ip6_tnl_dev_uninit - tunnel device uninitializer
+ * @dev: the device to be destroyed
+ *
+ * Description:
+ * ip6_tnl_dev_uninit() removes tunnel from its list
+ **/
+
+static void
+ip6_tnl_dev_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
+ else
+ ip6_tnl_unlink(ip6n, t);
+ ip6_tnl_dst_reset(t);
+ dev_put(dev);
+}
+
+/**
+ * parse_tvl_tnl_enc_lim - handle encapsulation limit option
+ * @skb: received socket buffer
+ *
+ * Return:
+ * 0 if none was found,
+ * else index to encapsulation limit
+ **/
+
+__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
+{
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
+ __u8 nexthdr = ipv6h->nexthdr;
+ __u16 off = sizeof(*ipv6h);
+
+ while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
+ __u16 optlen = 0;
+ struct ipv6_opt_hdr *hdr;
+ if (raw + off + sizeof(*hdr) > skb->data &&
+ !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr)))
+ break;
+
+ hdr = (struct ipv6_opt_hdr *) (raw + off);
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
+ if (frag_hdr->frag_off)
+ break;
+ optlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH) {
+ optlen = (hdr->hdrlen + 2) << 2;
+ } else {
+ optlen = ipv6_optlen(hdr);
+ }
+ if (nexthdr == NEXTHDR_DEST) {
+ __u16 i = off + 2;
+ while (1) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+
+ /* No more room for encapsulation limit */
+ if (i + sizeof (*tel) > off + optlen)
+ break;
+
+ tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i];
+ /* return index of option if found and valid */
+ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
+ tel->length == 1)
+ return i;
+ /* else jump to next option */
+ if (tel->type)
+ i += tel->length + 2;
+ else
+ i++;
+ }
+ }
+ nexthdr = hdr->nexthdr;
+ off += optlen;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);
+
+/**
+ * ip6_tnl_err - tunnel error handler
+ *
+ * Description:
+ * ip6_tnl_err() should handle errors in the tunnel according
+ * to the specifications in RFC 2473.
+ **/
+
+static int
+ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
+ u8 *type, u8 *code, int *msg, __u32 *info, int offset)
+{
+ const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data;
+ struct ip6_tnl *t;
+ int rel_msg = 0;
+ u8 rel_type = ICMPV6_DEST_UNREACH;
+ u8 rel_code = ICMPV6_ADDR_UNREACH;
+ u8 tproto;
+ __u32 rel_info = 0;
+ __u16 len;
+ int err = -ENOENT;
+
+ /* If the packet doesn't contain the original IPv6 header we are
+ in trouble since we might need the source address for further
+ processing of the error. */
+
+ rcu_read_lock();
+ t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr);
+ if (!t)
+ goto out;
+
+ tproto = ACCESS_ONCE(t->parms.proto);
+ if (tproto != ipproto && tproto != 0)
+ goto out;
+
+ err = 0;
+
+ switch (*type) {
+ __u32 teli;
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 mtu;
+ case ICMPV6_DEST_UNREACH:
+ net_warn_ratelimited("%s: Path to destination invalid or inactive!\n",
+ t->parms.name);
+ rel_msg = 1;
+ break;
+ case ICMPV6_TIME_EXCEED:
+ if ((*code) == ICMPV6_EXC_HOPLIMIT) {
+ net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+ t->parms.name);
+ rel_msg = 1;
+ }
+ break;
+ case ICMPV6_PARAMPROB:
+ teli = 0;
+ if ((*code) == ICMPV6_HDR_FIELD)
+ teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
+
+ if (teli && teli == *info - 2) {
+ tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
+ if (tel->encap_limit == 0) {
+ net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
+ t->parms.name);
+ rel_msg = 1;
+ }
+ } else {
+ net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+ t->parms.name);
+ }
+ break;
+ case ICMPV6_PKT_TOOBIG:
+ mtu = *info - offset;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ t->dev->mtu = mtu;
+
+ len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len);
+ if (len > mtu) {
+ rel_type = ICMPV6_PKT_TOOBIG;
+ rel_code = 0;
+ rel_info = mtu;
+ rel_msg = 1;
+ }
+ break;
+ }
+
+ *type = rel_type;
+ *code = rel_code;
+ *info = rel_info;
+ *msg = rel_msg;
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int
+ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+ __u32 rel_info = ntohl(info);
+ int err;
+ struct sk_buff *skb2;
+ const struct iphdr *eiph;
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ if (err < 0)
+ return err;
+
+ if (rel_msg == 0)
+ return 0;
+
+ switch (rel_type) {
+ case ICMPV6_DEST_UNREACH:
+ if (rel_code != ICMPV6_ADDR_UNREACH)
+ return 0;
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_HOST_UNREACH;
+ break;
+ case ICMPV6_PKT_TOOBIG:
+ if (rel_code != 0)
+ return 0;
+ rel_type = ICMP_DEST_UNREACH;
+ rel_code = ICMP_FRAG_NEEDED;
+ break;
+ case NDISC_REDIRECT:
+ rel_type = ICMP_REDIRECT;
+ rel_code = ICMP_REDIR_HOST;
+ default:
+ return 0;
+ }
+
+ if (!pskb_may_pull(skb, offset + sizeof(struct iphdr)))
+ return 0;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ return 0;
+
+ skb_dst_drop(skb2);
+
+ skb_pull(skb2, offset);
+ skb_reset_network_header(skb2);
+ eiph = ip_hdr(skb2);
+
+ /* Try to guess incoming interface */
+ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
+ eiph->saddr, 0,
+ 0, 0,
+ IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
+ if (IS_ERR(rt))
+ goto out;
+
+ skb2->dev = rt->dst.dev;
+
+ /* route "incoming" packet */
+ if (rt->rt_flags & RTCF_LOCAL) {
+ ip_rt_put(rt);
+ rt = NULL;
+ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
+ eiph->daddr, eiph->saddr,
+ 0, 0,
+ IPPROTO_IPIP,
+ RT_TOS(eiph->tos), 0);
+ if (IS_ERR(rt) ||
+ rt->dst.dev->type != ARPHRD_TUNNEL) {
+ if (!IS_ERR(rt))
+ ip_rt_put(rt);
+ goto out;
+ }
+ skb_dst_set(skb2, &rt->dst);
+ } else {
+ ip_rt_put(rt);
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
+ skb2->dev) ||
+ skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
+ goto out;
+ }
+
+ /* change mtu on this route */
+ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
+ if (rel_info > dst_mtu(skb_dst(skb2)))
+ goto out;
+
+ skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info);
+ }
+ if (rel_type == ICMP_REDIRECT)
+ skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2);
+
+ icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
+
+out:
+ kfree_skb(skb2);
+ return 0;
+}
+
+static int
+ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+ __u32 rel_info = ntohl(info);
+ int err;
+
+ err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ if (err < 0)
+ return err;
+
+ if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) {
+ struct rt6_info *rt;
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb2)
+ return 0;
+
+ skb_dst_drop(skb2);
+ skb_pull(skb2, offset);
+ skb_reset_network_header(skb2);
+
+ /* Try to guess incoming interface */
+ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr,
+ NULL, 0, 0);
+
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
+
+ icmpv6_send(skb2, rel_type, rel_code, rel_info);
+
+ ip6_rt_put(rt);
+
+ kfree_skb(skb2);
+ }
+
+ return 0;
+}
+
+static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;
+
+ if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
+
+ return IP6_ECN_decapsulate(ipv6h, skb);
+}
+
+static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+ ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
+
+ return IP6_ECN_decapsulate(ipv6h, skb);
+}
+
+__u32 ip6_tnl_get_cap(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
+{
+ struct __ip6_tnl_parm *p = &t->parms;
+ int ltype = ipv6_addr_type(laddr);
+ int rtype = ipv6_addr_type(raddr);
+ __u32 flags = 0;
+
+ if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) {
+ flags = IP6_TNL_F_CAP_PER_PACKET;
+ } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+ rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+ !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
+ (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
+ if (ltype&IPV6_ADDR_UNICAST)
+ flags |= IP6_TNL_F_CAP_XMIT;
+ if (rtype&IPV6_ADDR_UNICAST)
+ flags |= IP6_TNL_F_CAP_RCV;
+ }
+ return flags;
+}
+EXPORT_SYMBOL(ip6_tnl_get_cap);
+
+/* called with rcu_read_lock() */
+int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
+{
+ struct __ip6_tnl_parm *p = &t->parms;
+ int ret = 0;
+ struct net *net = t->net;
+
+ if ((p->flags & IP6_TNL_F_CAP_RCV) ||
+ ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
+ (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) {
+ struct net_device *ldev = NULL;
+
+ if (p->link)
+ ldev = dev_get_by_index_rcu(net, p->link);
+
+ if ((ipv6_addr_is_multicast(laddr) ||
+ likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
+ likely(!ipv6_chk_addr(net, raddr, NULL, 0)))
+ ret = 1;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl);
+
+/**
+ * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
+ * @skb: received socket buffer
+ * @protocol: ethernet protocol ID
+ * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN
+ *
+ * Return: 0
+ **/
+
+static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
+ __u8 ipproto,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb))
+{
+ struct ip6_tnl *t;
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ u8 tproto;
+ int err;
+
+ rcu_read_lock();
+ t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ if (t) {
+ struct pcpu_sw_netstats *tstats;
+
+ tproto = ACCESS_ONCE(t->parms.proto);
+ if (tproto != ipproto && tproto != 0) {
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
+ t->dev->stats.rx_dropped++;
+ rcu_read_unlock();
+ goto discard;
+ }
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ skb->protocol = htons(protocol);
+ memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+
+ __skb_tunnel_rx(skb, t->dev, t->net);
+
+ err = dscp_ecn_decapsulate(t, ipv6h, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
+ &ipv6h->saddr,
+ ipv6_get_dsfield(ipv6h));
+ if (err > 1) {
+ ++t->dev->stats.rx_frame_errors;
+ ++t->dev->stats.rx_errors;
+ rcu_read_unlock();
+ goto discard;
+ }
+ }
+
+ tstats = this_cpu_ptr(t->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ netif_rx(skb);
+
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ return 1;
+
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int ip4ip6_rcv(struct sk_buff *skb)
+{
+ return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP,
+ ip4ip6_dscp_ecn_decapsulate);
+}
+
+static int ip6ip6_rcv(struct sk_buff *skb)
+{
+ return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6,
+ ip6ip6_dscp_ecn_decapsulate);
+}
+
+struct ipv6_tel_txoption {
+ struct ipv6_txoptions ops;
+ __u8 dst_opt[8];
+};
+
+static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
+{
+ memset(opt, 0, sizeof(struct ipv6_tel_txoption));
+
+ opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
+ opt->dst_opt[3] = 1;
+ opt->dst_opt[4] = encap_limit;
+ opt->dst_opt[5] = IPV6_TLV_PADN;
+ opt->dst_opt[6] = 1;
+
+ opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt;
+ opt->ops.opt_nflen = 8;
+}
+
+/**
+ * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * @t: the outgoing tunnel device
+ * @hdr: IPv6 header from the incoming packet
+ *
+ * Description:
+ * Avoid trivial tunneling loop by checking that tunnel exit-point
+ * doesn't match source of incoming packet.
+ *
+ * Return:
+ * 1 if conflict,
+ * 0 else
+ **/
+
+static inline bool
+ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
+{
+ return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
+}
+
+int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
+ const struct in6_addr *laddr,
+ const struct in6_addr *raddr)
+{
+ struct __ip6_tnl_parm *p = &t->parms;
+ int ret = 0;
+ struct net *net = t->net;
+
+ if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
+ ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
+ (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
+ struct net_device *ldev = NULL;
+
+ rcu_read_lock();
+ if (p->link)
+ ldev = dev_get_by_index_rcu(net, p->link);
+
+ if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
+ pr_warn("%s xmit: Local address not yet configured!\n",
+ p->name);
+ else if (!ipv6_addr_is_multicast(raddr) &&
+ unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
+ pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
+ p->name);
+ else
+ ret = 1;
+ rcu_read_unlock();
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl);
+
+/**
+ * ip6_tnl_xmit2 - encapsulate packet and send
+ * @skb: the outgoing socket buffer
+ * @dev: the outgoing tunnel device
+ * @dsfield: dscp code for outer header
+ * @fl: flow of tunneled packet
+ * @encap_limit: encapsulation limit
+ * @pmtu: Path MTU is stored if packet is too big
+ *
+ * Description:
+ * Build new header and do some sanity checks on the packet before sending
+ * it.
+ *
+ * Return:
+ * 0 on success
+ * -1 fail
+ * %-EMSGSIZE message too big. return mtu in this case.
+ **/
+
+static int ip6_tnl_xmit2(struct sk_buff *skb,
+ struct net_device *dev,
+ __u8 dsfield,
+ struct flowi6 *fl6,
+ int encap_limit,
+ __u32 *pmtu)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct net_device_stats *stats = &t->dev->stats;
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ipv6_tel_txoption opt;
+ struct dst_entry *dst = NULL, *ndst = NULL;
+ struct net_device *tdev;
+ int mtu;
+ unsigned int max_headroom = sizeof(struct ipv6hdr);
+ u8 proto;
+ int err = -1;
+
+ /* NBMA tunnel */
+ if (ipv6_addr_any(&t->parms.raddr)) {
+ struct in6_addr *addr6;
+ struct neighbour *neigh;
+ int addr_type;
+
+ if (!skb_dst(skb))
+ goto tx_err_link_failure;
+
+ neigh = dst_neigh_lookup(skb_dst(skb),
+ &ipv6_hdr(skb)->daddr);
+ if (!neigh)
+ goto tx_err_link_failure;
+
+ addr6 = (struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY)
+ addr6 = &ipv6_hdr(skb)->daddr;
+
+ memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
+ neigh_release(neigh);
+ } else if (!fl6->flowi6_mark)
+ dst = ip6_tnl_dst_check(t);
+
+ if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
+ goto tx_err_link_failure;
+
+ if (!dst) {
+ ndst = ip6_route_output(net, NULL, fl6);
+
+ if (ndst->error)
+ goto tx_err_link_failure;
+ ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0);
+ if (IS_ERR(ndst)) {
+ err = PTR_ERR(ndst);
+ ndst = NULL;
+ goto tx_err_link_failure;
+ }
+ dst = ndst;
+ }
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ t->parms.name);
+ goto tx_err_dst_release;
+ }
+ mtu = dst_mtu(dst) - sizeof(*ipv6h);
+ if (encap_limit >= 0) {
+ max_headroom += 8;
+ mtu -= 8;
+ }
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+ if (skb->len > mtu) {
+ *pmtu = mtu;
+ err = -EMSGSIZE;
+ goto tx_err_dst_release;
+ }
+
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom += LL_RESERVED_SPACE(tdev);
+
+ if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+ (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+ struct sk_buff *new_skb;
+
+ new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb)
+ goto tx_err_dst_release;
+
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ consume_skb(skb);
+ skb = new_skb;
+ }
+ if (fl6->flowi6_mark) {
+ skb_dst_set(skb, dst);
+ ndst = NULL;
+ } else {
+ skb_dst_set_noref(skb, dst);
+ }
+ skb->transport_header = skb->network_header;
+
+ proto = fl6->flowi6_proto;
+ if (encap_limit >= 0) {
+ init_tel_txopt(&opt, encap_limit);
+ ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+ }
+
+ if (likely(!skb->encapsulation)) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ipv6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+ ip6_make_flowlabel(net, skb, fl6->flowlabel, false));
+ ipv6h->hop_limit = t->parms.hop_limit;
+ ipv6h->nexthdr = proto;
+ ipv6h->saddr = fl6->saddr;
+ ipv6h->daddr = fl6->daddr;
+ ip6tunnel_xmit(NULL, skb, dev);
+ if (ndst)
+ ip6_tnl_dst_store(t, ndst);
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(ndst);
+ return err;
+}
+
+static inline int
+ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ const struct iphdr *iph = ip_hdr(skb);
+ int encap_limit = -1;
+ struct flowi6 fl6;
+ __u8 dsfield;
+ __u32 mtu;
+ u8 tproto;
+ int err;
+
+ tproto = ACCESS_ONCE(t->parms.proto);
+ if (tproto != IPPROTO_IPIP && tproto != 0)
+ return -1;
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPIP;
+
+ dsfield = ipv4_get_dsfield(iph);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+ & IPV6_TCLASS_MASK;
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+
+ err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
+ if (err == -EMSGSIZE)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int
+ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ int encap_limit = -1;
+ __u16 offset;
+ struct flowi6 fl6;
+ __u8 dsfield;
+ __u32 mtu;
+ u8 tproto;
+ int err;
+
+ tproto = ACCESS_ONCE(t->parms.proto);
+ if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
+ ip6_tnl_addr_conflict(t, ipv6h))
+ return -1;
+
+ offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
+ } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ encap_limit = t->parms.encap_limit;
+
+ memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_IPV6;
+
+ dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6.flowi6_mark = skb->mark;
+
+ err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+ if (err != 0) {
+ if (err == -EMSGSIZE)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ return -1;
+ }
+
+ return 0;
+}
+
+static netdev_tx_t
+ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net_device_stats *stats = &t->dev->stats;
+ int ret;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ret = ip4ip6_tnl_xmit(skb, dev);
+ break;
+ case htons(ETH_P_IPV6):
+ ret = ip6ip6_tnl_xmit(skb, dev);
+ break;
+ default:
+ goto tx_err;
+ }
+
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static void ip6_tnl_link_config(struct ip6_tnl *t)
+{
+ struct net_device *dev = t->dev;
+ struct __ip6_tnl_parm *p = &t->parms;
+ struct flowi6 *fl6 = &t->fl.u.ip6;
+
+ memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+
+ /* Set up flowi template */
+ fl6->saddr = p->laddr;
+ fl6->daddr = p->raddr;
+ fl6->flowi6_oif = p->link;
+ fl6->flowlabel = 0;
+
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
+ fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+ if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
+ fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
+
+ if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
+ dev->flags |= IFF_POINTOPOINT;
+ else
+ dev->flags &= ~IFF_POINTOPOINT;
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT) {
+ int strict = (ipv6_addr_type(&p->raddr) &
+ (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
+
+ struct rt6_info *rt = rt6_lookup(t->net,
+ &p->raddr, &p->laddr,
+ p->link, strict);
+
+ if (!rt)
+ return;
+
+ if (rt->dst.dev) {
+ dev->hard_header_len = rt->dst.dev->hard_header_len +
+ sizeof(struct ipv6hdr);
+
+ dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr);
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+
+ if (dev->mtu < IPV6_MIN_MTU)
+ dev->mtu = IPV6_MIN_MTU;
+ }
+ ip6_rt_put(rt);
+ }
+}
+
+/**
+ * ip6_tnl_change - update the tunnel parameters
+ * @t: tunnel to be changed
+ * @p: tunnel configuration parameters
+ *
+ * Description:
+ * ip6_tnl_change() updates the tunnel parameters
+ **/
+
+static int
+ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
+{
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
+ t->parms.flags = p->flags;
+ t->parms.hop_limit = p->hop_limit;
+ t->parms.encap_limit = p->encap_limit;
+ t->parms.flowinfo = p->flowinfo;
+ t->parms.link = p->link;
+ t->parms.proto = p->proto;
+ ip6_tnl_dst_reset(t);
+ ip6_tnl_link_config(t);
+ return 0;
+}
+
+static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ int err;
+
+ ip6_tnl_unlink(ip6n, t);
+ synchronize_net();
+ err = ip6_tnl_change(t, p);
+ ip6_tnl_link(ip6n, t);
+ netdev_state_change(t->dev);
+ return err;
+}
+
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+ /* for default tnl0 device allow to change only the proto */
+ t->parms.proto = p->proto;
+ netdev_state_change(t->dev);
+ return 0;
+}
+
+static void
+ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->flags = u->flags;
+ p->hop_limit = u->hop_limit;
+ p->encap_limit = u->encap_limit;
+ p->flowinfo = u->flowinfo;
+ p->link = u->link;
+ p->proto = u->proto;
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void
+ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
+{
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->flags = p->flags;
+ u->hop_limit = p->hop_limit;
+ u->encap_limit = p->encap_limit;
+ u->flowinfo = p->flowinfo;
+ u->link = p->link;
+ u->proto = p->proto;
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
+/**
+ * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ * @dev: virtual device associated with tunnel
+ * @ifr: parameters passed from userspace
+ * @cmd: command to be performed
+ *
+ * Description:
+ * ip6_tnl_ioctl() is used for managing IPv6 tunnels
+ * from userspace.
+ *
+ * The possible commands are the following:
+ * %SIOCGETTUNNEL: get tunnel parameters for device
+ * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
+ * %SIOCCHGTUNNEL: change tunnel parameters to those given
+ * %SIOCDELTUNNEL: delete tunnel
+ *
+ * The fallback device "ip6tnl0", created during module
+ * initialization, can be used for creating other tunnel devices.
+ *
+ * Return:
+ * 0 on success,
+ * %-EFAULT if unable to copy data to or from userspace,
+ * %-EPERM if current process hasn't %CAP_NET_ADMIN set
+ * %-EINVAL if passed tunnel parameters are invalid,
+ * %-EEXIST if changing a tunnel's parameters would cause a conflict
+ * %-ENODEV if attempting to change or delete a nonexisting device
+ **/
+
+static int
+ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip6_tnl_parm p;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == ip6n->fb_tnl_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, 0);
+ if (IS_ERR(t))
+ t = netdev_priv(dev);
+ } else {
+ memset(&p, 0, sizeof(p));
+ }
+ ip6_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) {
+ err = -EFAULT;
+ }
+ break;
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ break;
+ err = -EINVAL;
+ if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
+ p.proto != 0)
+ break;
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
+ if (cmd == SIOCCHGTUNNEL) {
+ if (!IS_ERR(t)) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else
+ t = netdev_priv(dev);
+ if (dev == ip6n->fb_tnl_dev)
+ err = ip6_tnl0_update(t, &p1);
+ else
+ err = ip6_tnl_update(t, &p1);
+ }
+ if (!IS_ERR(t)) {
+ err = 0;
+ ip6_tnl_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+
+ } else {
+ err = PTR_ERR(t);
+ }
+ break;
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+
+ if (dev == ip6n->fb_tnl_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ break;
+ err = -ENOENT;
+ ip6_tnl_parm_from_user(&p1, &p);
+ t = ip6_tnl_locate(net, &p1, 0);
+ if (IS_ERR(t))
+ break;
+ err = -EPERM;
+ if (t->dev == ip6n->fb_tnl_dev)
+ break;
+ dev = t->dev;
+ }
+ err = 0;
+ unregister_netdevice(dev);
+ break;
+ default:
+ err = -EINVAL;
+ }
+ return err;
+}
+
+/**
+ * ip6_tnl_change_mtu - change mtu manually for tunnel device
+ * @dev: virtual device associated with tunnel
+ * @new_mtu: the new mtu
+ *
+ * Return:
+ * 0 on success,
+ * %-EINVAL if mtu too small
+ **/
+
+static int
+ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip6_tnl *tnl = netdev_priv(dev);
+
+ if (tnl->parms.proto == IPPROTO_IPIP) {
+ if (new_mtu < 68)
+ return -EINVAL;
+ } else {
+ if (new_mtu < IPV6_MIN_MTU)
+ return -EINVAL;
+ }
+ if (new_mtu > 0xFFF8 - dev->hard_header_len)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+int ip6_tnl_get_iflink(const struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ return t->parms.link;
+}
+EXPORT_SYMBOL(ip6_tnl_get_iflink);
+
+static const struct net_device_ops ip6_tnl_netdev_ops = {
+ .ndo_init = ip6_tnl_dev_init,
+ .ndo_uninit = ip6_tnl_dev_uninit,
+ .ndo_start_xmit = ip6_tnl_xmit,
+ .ndo_do_ioctl = ip6_tnl_ioctl,
+ .ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_stats = ip6_get_stats,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+
+/**
+ * ip6_tnl_dev_setup - setup virtual tunnel device
+ * @dev: virtual device associated with tunnel
+ *
+ * Description:
+ * Initialize function pointers and device parameters
+ **/
+
+static void ip6_tnl_dev_setup(struct net_device *dev)
+{
+ struct ip6_tnl *t;
+
+ dev->netdev_ops = &ip6_tnl_netdev_ops;
+ dev->destructor = ip6_dev_free;
+
+ dev->type = ARPHRD_TUNNEL6;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
+ dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr);
+ t = netdev_priv(dev);
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ dev->mtu -= 8;
+ dev->flags |= IFF_NOARP;
+ dev->addr_len = sizeof(struct in6_addr);
+ netif_keep_dst(dev);
+ /* This perm addr will be used as interface identifier by IPv6 */
+ dev->addr_assign_type = NET_ADDR_RANDOM;
+ eth_random_addr(dev->perm_addr);
+}
+
+
+/**
+ * ip6_tnl_dev_init_gen - general initializer for all tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+
+static inline int
+ip6_tnl_dev_init_gen(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ t->dev = dev;
+ t->net = dev_net(dev);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * ip6_tnl_dev_init - initializer for all non fallback tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+
+static int ip6_tnl_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int err = ip6_tnl_dev_init_gen(dev);
+
+ if (err)
+ return err;
+ ip6_tnl_link_config(t);
+ return 0;
+}
+
+/**
+ * ip6_fb_tnl_dev_init - initializer for fallback tunnel device
+ * @dev: fallback device
+ *
+ * Return: 0
+ **/
+
+static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ t->parms.proto = IPPROTO_IPV6;
+ dev_hold(dev);
+
+ rcu_assign_pointer(ip6n->tnls_wc[0], t);
+ return 0;
+}
+
+static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (proto != IPPROTO_IPV6 &&
+ proto != IPPROTO_IPIP &&
+ proto != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ip6_tnl_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL])
+ parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]);
+
+ if (data[IFLA_IPTUN_ENCAP_LIMIT])
+ parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]);
+
+ if (data[IFLA_IPTUN_FLOWINFO])
+ parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]);
+
+ if (data[IFLA_IPTUN_FLAGS])
+ parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]);
+
+ if (data[IFLA_IPTUN_PROTO])
+ parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+}
+
+static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = dev_net(dev);
+ struct ip6_tnl *nt, *t;
+
+ nt = netdev_priv(dev);
+ ip6_tnl_netlink_parms(data, &nt->parms);
+
+ t = ip6_tnl_locate(net, &nt->parms, 0);
+ if (!IS_ERR(t))
+ return -EEXIST;
+
+ return ip6_tnl_create2(dev);
+}
+
+static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct __ip6_tnl_parm p;
+ struct net *net = t->net;
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ return -EINVAL;
+
+ ip6_tnl_netlink_parms(data, &p);
+
+ t = ip6_tnl_locate(net, &p, 0);
+ if (!IS_ERR(t)) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ return ip6_tnl_update(t, &p);
+}
+
+static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+ if (dev != ip6n->fb_tnl_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t ip6_tnl_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_ENCAP_LIMIT */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FLOWINFO */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_FLAGS */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
+ 0;
+}
+
+static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ struct __ip6_tnl_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) ||
+ nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) ||
+ nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) ||
+ nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) ||
+ nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+struct net *ip6_tnl_get_link_net(const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+
+ return tunnel->net;
+}
+EXPORT_SYMBOL(ip6_tnl_get_link_net);
+
+static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 },
+ [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ip6_link_ops __read_mostly = {
+ .kind = "ip6tnl",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ip6_tnl_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = ip6_tnl_dev_setup,
+ .validate = ip6_tnl_validate,
+ .newlink = ip6_tnl_newlink,
+ .changelink = ip6_tnl_changelink,
+ .dellink = ip6_tnl_dellink,
+ .get_size = ip6_tnl_get_size,
+ .fill_info = ip6_tnl_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
+ .handler = ip4ip6_rcv,
+ .err_handler = ip4ip6_err,
+ .priority = 1,
+};
+
+static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
+ .handler = ip6ip6_rcv,
+ .err_handler = ip6ip6_err,
+ .priority = 1,
+};
+
+static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
+{
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct net_device *dev, *aux;
+ int h;
+ struct ip6_tnl *t;
+ LIST_HEAD(list);
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &ip6_link_ops)
+ unregister_netdevice_queue(dev, &list);
+
+ for (h = 0; h < HASH_SIZE; h++) {
+ t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, &list);
+ t = rtnl_dereference(t->next);
+ }
+ }
+
+ unregister_netdevice_many(&list);
+}
+
+static int __net_init ip6_tnl_init_net(struct net *net)
+{
+ struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip6_tnl *t = NULL;
+ int err;
+
+ ip6n->tnls[0] = ip6n->tnls_wc;
+ ip6n->tnls[1] = ip6n->tnls_r_l;
+
+ err = -ENOMEM;
+ ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
+ NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
+
+ if (!ip6n->fb_tnl_dev)
+ goto err_alloc_dev;
+ dev_net_set(ip6n->fb_tnl_dev, net);
+ ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops;
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL;
+
+ err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ err = register_netdev(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ t = netdev_priv(ip6n->fb_tnl_dev);
+
+ strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
+ return 0;
+
+err_register:
+ ip6_dev_free(ip6n->fb_tnl_dev);
+err_alloc_dev:
+ return err;
+}
+
+static void __net_exit ip6_tnl_exit_net(struct net *net)
+{
+ rtnl_lock();
+ ip6_tnl_destroy_tunnels(net);
+ rtnl_unlock();
+}
+
+static struct pernet_operations ip6_tnl_net_ops = {
+ .init = ip6_tnl_init_net,
+ .exit = ip6_tnl_exit_net,
+ .id = &ip6_tnl_net_id,
+ .size = sizeof(struct ip6_tnl_net),
+};
+
+/**
+ * ip6_tunnel_init - register protocol and reserve needed resources
+ *
+ * Return: 0 on success
+ **/
+
+static int __init ip6_tunnel_init(void)
+{
+ int err;
+
+ err = register_pernet_device(&ip6_tnl_net_ops);
+ if (err < 0)
+ goto out_pernet;
+
+ err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET);
+ if (err < 0) {
+ pr_err("%s: can't register ip4ip6\n", __func__);
+ goto out_ip4ip6;
+ }
+
+ err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6);
+ if (err < 0) {
+ pr_err("%s: can't register ip6ip6\n", __func__);
+ goto out_ip6ip6;
+ }
+ err = rtnl_link_register(&ip6_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return 0;
+
+rtnl_link_failed:
+ xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
+out_ip6ip6:
+ xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
+out_ip4ip6:
+ unregister_pernet_device(&ip6_tnl_net_ops);
+out_pernet:
+ return err;
+}
+
+/**
+ * ip6_tunnel_cleanup - free resources and unregister protocol
+ **/
+
+static void __exit ip6_tunnel_cleanup(void)
+{
+ rtnl_link_unregister(&ip6_link_ops);
+ if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET))
+ pr_info("%s: can't deregister ip4ip6\n", __func__);
+
+ if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
+ pr_info("%s: can't deregister ip6ip6\n", __func__);
+
+ unregister_pernet_device(&ip6_tnl_net_ops);
+}
+
+module_init(ip6_tunnel_init);
+module_exit(ip6_tunnel_cleanup);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
new file mode 100644
index 000000000..bba8903e8
--- /dev/null
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -0,0 +1,106 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/in6.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+
+int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ struct sockaddr_in6 udp6_addr;
+ int err;
+ struct socket *sock = NULL;
+
+ err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
+ sizeof(udp6_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->peer_udp_port;
+ err = kernel_connect(sock,
+ (struct sockaddr *)&udp6_addr,
+ sizeof(udp6_addr), 0);
+ }
+ if (err < 0)
+ goto error;
+
+ udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
+ udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
+
+ *sockp = sock;
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(udp_sock_create6);
+
+int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev, struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be16 src_port,
+ __be16 dst_port, bool nocheck)
+{
+ struct udphdr *uh;
+ struct ipv6hdr *ip6h;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+
+ uh->len = htons(skb->len);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+ | IPSKB_REROUTED);
+ skb_dst_set(skb, dst);
+
+ udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
+
+ __skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6h->payload_len = htons(skb->len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = ttl;
+ ip6h->daddr = *daddr;
+ ip6h->saddr = *saddr;
+
+ ip6tunnel_xmit(sk, skb, dev);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
new file mode 100644
index 000000000..0224c032d
--- /dev/null
+++ b/net/ipv6/ip6_vti.c
@@ -0,0 +1,1194 @@
+/*
+ * IPv6 virtual tunneling interface
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv6/ip6_tunnel.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define HASH_SIZE_SHIFT 5
+#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
+
+static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+{
+ u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
+
+ return hash_32(hash, HASH_SIZE_SHIFT);
+}
+
+static int vti6_dev_init(struct net_device *dev);
+static void vti6_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops vti6_link_ops __read_mostly;
+
+static int vti6_net_id __read_mostly;
+struct vti6_net {
+ /* the vti6 tunnel fallback device */
+ struct net_device *fb_tnl_dev;
+ /* lists for storing tunnels in use */
+ struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_wc[1];
+ struct ip6_tnl __rcu **tnls[2];
+};
+
+#define for_each_vti6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/**
+ * vti6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @net: network namespace
+ * @remote: the address of the tunnel exit-point
+ * @local: the address of the tunnel entry-point
+ *
+ * Return:
+ * tunnel matching given end-points if found,
+ * else fallback tunnel if its device is up,
+ * else %NULL
+ **/
+static struct ip6_tnl *
+vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
+ const struct in6_addr *local)
+{
+ unsigned int hash = HASH(remote, local);
+ struct ip6_tnl *t;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct in6_addr any;
+
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ memset(&any, 0, sizeof(any));
+ hash = HASH(&any, local);
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ hash = HASH(remote, &any);
+ for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+ if (ipv6_addr_equal(remote, &t->parms.raddr) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+
+ t = rcu_dereference(ip6n->tnls_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
+ return t;
+
+ return NULL;
+}
+
+/**
+ * vti6_tnl_bucket - get head of list matching given tunnel parameters
+ * @p: parameters containing tunnel end-points
+ *
+ * Description:
+ * vti6_tnl_bucket() returns the head of the list matching the
+ * &struct in6_addr entries laddr and raddr in @p.
+ *
+ * Return: head of IPv6 tunnel list
+ **/
+static struct ip6_tnl __rcu **
+vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ unsigned int h = 0;
+ int prio = 0;
+
+ if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
+ prio = 1;
+ h = HASH(remote, local);
+ }
+ return &ip6n->tnls[prio][h];
+}
+
+static void
+vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms);
+
+ rcu_assign_pointer(t->next , rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static void
+vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
+{
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = vti6_tnl_bucket(ip6n, &t->parms);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static void vti6_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+static int vti6_tnl_create2(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ int err;
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(t->parms.name, dev->name);
+ dev->rtnl_link_ops = &vti6_link_ops;
+
+ dev_hold(dev);
+ vti6_tnl_link(ip6n, t);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
+{
+ struct net_device *dev;
+ struct ip6_tnl *t;
+ char name[IFNAMSIZ];
+ int err;
+
+ if (p->name[0])
+ strlcpy(name, p->name, IFNAMSIZ);
+ else
+ sprintf(name, "ip6_vti%%d");
+
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup);
+ if (!dev)
+ goto failed;
+
+ dev_net_set(dev, net);
+
+ t = netdev_priv(dev);
+ t->parms = *p;
+ t->net = dev_net(dev);
+
+ err = vti6_tnl_create2(dev);
+ if (err < 0)
+ goto failed_free;
+
+ return t;
+
+failed_free:
+ vti6_dev_free(dev);
+failed:
+ return NULL;
+}
+
+/**
+ * vti6_locate - find or create tunnel matching given parameters
+ * @net: network namespace
+ * @p: tunnel parameters
+ * @create: != 0 if allowed to create new tunnel if no match found
+ *
+ * Description:
+ * vti6_locate() first tries to locate an existing tunnel
+ * based on @parms. If this is unsuccessful, but @create is set a new
+ * tunnel device is created and registered for use.
+ *
+ * Return:
+ * matching tunnel or NULL
+ **/
+static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p,
+ int create)
+{
+ const struct in6_addr *remote = &p->raddr;
+ const struct in6_addr *local = &p->laddr;
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *t;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ for (tp = vti6_tnl_bucket(ip6n, p);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (ipv6_addr_equal(local, &t->parms.laddr) &&
+ ipv6_addr_equal(remote, &t->parms.raddr)) {
+ if (create)
+ return NULL;
+
+ return t;
+ }
+ }
+ if (!create)
+ return NULL;
+ return vti6_tnl_create(net, p);
+}
+
+/**
+ * vti6_dev_uninit - tunnel device uninitializer
+ * @dev: the device to be destroyed
+ *
+ * Description:
+ * vti6_dev_uninit() removes tunnel from its list
+ **/
+static void vti6_dev_uninit(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
+ else
+ vti6_tnl_unlink(ip6n, t);
+ dev_put(dev);
+}
+
+static int vti6_rcv(struct sk_buff *skb)
+{
+ struct ip6_tnl *t;
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+ rcu_read_lock();
+ t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ if (t) {
+ if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
+ t->dev->stats.rx_dropped++;
+ rcu_read_unlock();
+ goto discard;
+ }
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
+
+ rcu_read_unlock();
+
+ return xfrm6_rcv(skb);
+ }
+ rcu_read_unlock();
+ return -EINVAL;
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int vti6_rcv_cb(struct sk_buff *skb, int err)
+{
+ unsigned short family;
+ struct net_device *dev;
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_state *x;
+ struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
+ u32 orig_mark = skb->mark;
+ int ret;
+
+ if (!t)
+ return 1;
+
+ dev = t->dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ x = xfrm_input_state(skb);
+ family = x->inner_mode->afinfo->family;
+
+ skb->mark = be32_to_cpu(t->parms.i_key);
+ ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+ skb->mark = orig_mark;
+
+ if (!ret)
+ return -EPERM;
+
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
+ skb->dev = dev;
+
+ tstats = this_cpu_ptr(dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+/**
+ * vti6_addr_conflict - compare packet addresses to tunnel's own
+ * @t: the outgoing tunnel device
+ * @hdr: IPv6 header from the incoming packet
+ *
+ * Description:
+ * Avoid trivial tunneling loop by checking that tunnel exit-point
+ * doesn't match source of incoming packet.
+ *
+ * Return:
+ * 1 if conflict,
+ * 0 else
+ **/
+static inline bool
+vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
+{
+ return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
+}
+
+static bool vti6_state_check(const struct xfrm_state *x,
+ const struct in6_addr *dst,
+ const struct in6_addr *src)
+{
+ xfrm_address_t *daddr = (xfrm_address_t *)dst;
+ xfrm_address_t *saddr = (xfrm_address_t *)src;
+
+ /* if there is no transform then this tunnel is not functional.
+ * Or if the xfrm is not mode tunnel.
+ */
+ if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
+ x->props.family != AF_INET6)
+ return false;
+
+ if (ipv6_addr_any(dst))
+ return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6);
+
+ if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6))
+ return false;
+
+ return true;
+}
+
+/**
+ * vti6_xmit - send a packet
+ * @skb: the outgoing socket buffer
+ * @dev: the outgoing tunnel device
+ * @fl: the flow informations for the xfrm_lookup
+ **/
+static int
+vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net_device_stats *stats = &t->dev->stats;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *tdev;
+ struct xfrm_state *x;
+ int err = -1;
+ int mtu;
+
+ if (!dst)
+ goto tx_err_link_failure;
+
+ dst_hold(dst);
+ dst = xfrm_lookup(t->net, dst, fl, NULL, 0);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+
+ x = dst->xfrm;
+ if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr))
+ goto tx_err_link_failure;
+
+ if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr,
+ (const struct in6_addr *)&x->id.daddr))
+ goto tx_err_link_failure;
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ t->parms.name);
+ goto tx_err_dst_release;
+ }
+
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = skb_dst(skb)->dev;
+
+ mtu = dst_mtu(dst);
+ if (!skb->ignore_df && skb->len > mtu) {
+ skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ else
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+
+ return -EMSGSIZE;
+ }
+
+ err = dst_output(skb);
+ if (net_xmit_eval(err) == 0) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += skb->len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ stats->tx_errors++;
+ stats->tx_aborted_errors++;
+ }
+
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(dst);
+ return err;
+}
+
+static netdev_tx_t
+vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net_device_stats *stats = &t->dev->stats;
+ struct ipv6hdr *ipv6h;
+ struct flowi fl;
+ int ret;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IPV6):
+ ipv6h = ipv6_hdr(skb);
+
+ if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
+ vti6_addr_conflict(t, ipv6h))
+ goto tx_err;
+
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ default:
+ goto tx_err;
+ }
+
+ /* override mark with tunnel output key */
+ fl.flowi_mark = be32_to_cpu(t->parms.o_key);
+
+ ret = vti6_xmit(skb, dev, &fl);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __be32 spi;
+ __u32 mark;
+ struct xfrm_state *x;
+ struct ip6_tnl *t;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah;
+ struct ip_comp_hdr *ipch;
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ int protocol = iph->nexthdr;
+
+ t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr);
+ if (!t)
+ return -1;
+
+ mark = be32_to_cpu(t->parms.o_key);
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data + offset);
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data + offset);
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data + offset);
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static void vti6_link_config(struct ip6_tnl *t)
+{
+ struct net_device *dev = t->dev;
+ struct __ip6_tnl_parm *p = &t->parms;
+
+ memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+
+ p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
+ IP6_TNL_F_CAP_PER_PACKET);
+ p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
+
+ if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV)
+ dev->flags |= IFF_POINTOPOINT;
+ else
+ dev->flags &= ~IFF_POINTOPOINT;
+}
+
+/**
+ * vti6_tnl_change - update the tunnel parameters
+ * @t: tunnel to be changed
+ * @p: tunnel configuration parameters
+ *
+ * Description:
+ * vti6_tnl_change() updates the tunnel parameters
+ **/
+static int
+vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
+{
+ t->parms.laddr = p->laddr;
+ t->parms.raddr = p->raddr;
+ t->parms.link = p->link;
+ t->parms.i_key = p->i_key;
+ t->parms.o_key = p->o_key;
+ t->parms.proto = p->proto;
+ ip6_tnl_dst_reset(t);
+ vti6_link_config(t);
+ return 0;
+}
+
+static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+ struct net *net = dev_net(t->dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ int err;
+
+ vti6_tnl_unlink(ip6n, t);
+ synchronize_net();
+ err = vti6_tnl_change(t, p);
+ vti6_tnl_link(ip6n, t);
+ netdev_state_change(t->dev);
+ return err;
+}
+
+static void
+vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u)
+{
+ p->laddr = u->laddr;
+ p->raddr = u->raddr;
+ p->link = u->link;
+ p->i_key = u->i_key;
+ p->o_key = u->o_key;
+ p->proto = u->proto;
+
+ memcpy(p->name, u->name, sizeof(u->name));
+}
+
+static void
+vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
+{
+ u->laddr = p->laddr;
+ u->raddr = p->raddr;
+ u->link = p->link;
+ u->i_key = p->i_key;
+ u->o_key = p->o_key;
+ u->proto = p->proto;
+
+ memcpy(u->name, p->name, sizeof(u->name));
+}
+
+/**
+ * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * @dev: virtual device associated with tunnel
+ * @ifr: parameters passed from userspace
+ * @cmd: command to be performed
+ *
+ * Description:
+ * vti6_ioctl() is used for managing vti6 tunnels
+ * from userspace.
+ *
+ * The possible commands are the following:
+ * %SIOCGETTUNNEL: get tunnel parameters for device
+ * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
+ * %SIOCCHGTUNNEL: change tunnel parameters to those given
+ * %SIOCDELTUNNEL: delete tunnel
+ *
+ * The fallback device "ip6_vti0", created during module
+ * initialization, can be used for creating other tunnel devices.
+ *
+ * Return:
+ * 0 on success,
+ * %-EFAULT if unable to copy data to or from userspace,
+ * %-EPERM if current process hasn't %CAP_NET_ADMIN set
+ * %-EINVAL if passed tunnel parameters are invalid,
+ * %-EEXIST if changing a tunnel's parameters would cause a conflict
+ * %-ENODEV if attempting to change or delete a nonexisting device
+ **/
+static int
+vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip6_tnl_parm2 p;
+ struct __ip6_tnl_parm p1;
+ struct ip6_tnl *t = NULL;
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ if (dev == ip6n->fb_tnl_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, 0);
+ } else {
+ memset(&p, 0, sizeof(p));
+ }
+ if (!t)
+ t = netdev_priv(dev);
+ vti6_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+ break;
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ break;
+ err = -EINVAL;
+ if (p.proto != IPPROTO_IPV6 && p.proto != 0)
+ break;
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL);
+ if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else
+ t = netdev_priv(dev);
+
+ err = vti6_update(t, &p1);
+ }
+ if (t) {
+ err = 0;
+ vti6_parm_to_user(&p, &t->parms);
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ err = -EFAULT;
+
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+
+ if (dev == ip6n->fb_tnl_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ break;
+ err = -ENOENT;
+ vti6_parm_from_user(&p1, &p);
+ t = vti6_locate(net, &p1, 0);
+ if (!t)
+ break;
+ err = -EPERM;
+ if (t->dev == ip6n->fb_tnl_dev)
+ break;
+ dev = t->dev;
+ }
+ err = 0;
+ unregister_netdevice(dev);
+ break;
+ default:
+ err = -EINVAL;
+ }
+ return err;
+}
+
+/**
+ * vti6_tnl_change_mtu - change mtu manually for tunnel device
+ * @dev: virtual device associated with tunnel
+ * @new_mtu: the new mtu
+ *
+ * Return:
+ * 0 on success,
+ * %-EINVAL if mtu too small
+ **/
+static int vti6_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < IPV6_MIN_MTU)
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops vti6_netdev_ops = {
+ .ndo_init = vti6_dev_init,
+ .ndo_uninit = vti6_dev_uninit,
+ .ndo_start_xmit = vti6_tnl_xmit,
+ .ndo_do_ioctl = vti6_ioctl,
+ .ndo_change_mtu = vti6_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip6_tnl_get_iflink,
+};
+
+/**
+ * vti6_dev_setup - setup virtual tunnel device
+ * @dev: virtual device associated with tunnel
+ *
+ * Description:
+ * Initialize function pointers and device parameters
+ **/
+static void vti6_dev_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &vti6_netdev_ops;
+ dev->destructor = vti6_dev_free;
+
+ dev->type = ARPHRD_TUNNEL6;
+ dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr);
+ dev->mtu = ETH_DATA_LEN;
+ dev->flags |= IFF_NOARP;
+ dev->addr_len = sizeof(struct in6_addr);
+ netif_keep_dst(dev);
+}
+
+/**
+ * vti6_dev_init_gen - general initializer for all tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+static inline int vti6_dev_init_gen(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ t->dev = dev;
+ t->net = dev_net(dev);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * vti6_dev_init - initializer for all non fallback tunnel devices
+ * @dev: virtual device associated with tunnel
+ **/
+static int vti6_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ int err = vti6_dev_init_gen(dev);
+
+ if (err)
+ return err;
+ vti6_link_config(t);
+ return 0;
+}
+
+/**
+ * vti6_fb_tnl_dev_init - initializer for fallback tunnel device
+ * @dev: fallback device
+ *
+ * Return: 0
+ **/
+static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ t->parms.proto = IPPROTO_IPV6;
+ dev_hold(dev);
+
+ rcu_assign_pointer(ip6n->tnls_wc[0], t);
+ return 0;
+}
+
+static int vti6_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ return 0;
+}
+
+static void vti6_netlink_parms(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_VTI_LINK])
+ parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
+
+ if (data[IFLA_VTI_LOCAL])
+ parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]);
+
+ if (data[IFLA_VTI_REMOTE])
+ parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]);
+
+ if (data[IFLA_VTI_IKEY])
+ parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
+
+ if (data[IFLA_VTI_OKEY])
+ parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
+}
+
+static int vti6_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = dev_net(dev);
+ struct ip6_tnl *nt;
+
+ nt = netdev_priv(dev);
+ vti6_netlink_parms(data, &nt->parms);
+
+ nt->parms.proto = IPPROTO_IPV6;
+
+ if (vti6_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ return vti6_tnl_create2(dev);
+}
+
+static void vti6_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ if (dev != ip6n->fb_tnl_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static int vti6_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip6_tnl *t;
+ struct __ip6_tnl_parm p;
+ struct net *net = dev_net(dev);
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ if (dev == ip6n->fb_tnl_dev)
+ return -EINVAL;
+
+ vti6_netlink_parms(data, &p);
+
+ t = vti6_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ return vti6_update(t, &p);
+}
+
+static size_t vti6_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_VTI_LINK */
+ nla_total_size(4) +
+ /* IFLA_VTI_LOCAL */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_VTI_REMOTE */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_VTI_IKEY */
+ nla_total_size(4) +
+ /* IFLA_VTI_OKEY */
+ nla_total_size(4) +
+ 0;
+}
+
+static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip6_tnl *tunnel = netdev_priv(dev);
+ struct __ip6_tnl_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) ||
+ nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) ||
+ nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) ||
+ nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) ||
+ nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = {
+ [IFLA_VTI_LINK] = { .type = NLA_U32 },
+ [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) },
+ [IFLA_VTI_IKEY] = { .type = NLA_U32 },
+ [IFLA_VTI_OKEY] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops vti6_link_ops __read_mostly = {
+ .kind = "vti6",
+ .maxtype = IFLA_VTI_MAX,
+ .policy = vti6_policy,
+ .priv_size = sizeof(struct ip6_tnl),
+ .setup = vti6_dev_setup,
+ .validate = vti6_validate,
+ .newlink = vti6_newlink,
+ .dellink = vti6_dellink,
+ .changelink = vti6_changelink,
+ .get_size = vti6_get_size,
+ .fill_info = vti6_fill_info,
+ .get_link_net = ip6_tnl_get_link_net,
+};
+
+static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
+{
+ int h;
+ struct ip6_tnl *t;
+ LIST_HEAD(list);
+
+ for (h = 0; h < HASH_SIZE; h++) {
+ t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ while (t) {
+ unregister_netdevice_queue(t->dev, &list);
+ t = rtnl_dereference(t->next);
+ }
+ }
+
+ t = rtnl_dereference(ip6n->tnls_wc[0]);
+ unregister_netdevice_queue(t->dev, &list);
+ unregister_netdevice_many(&list);
+}
+
+static int __net_init vti6_init_net(struct net *net)
+{
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct ip6_tnl *t = NULL;
+ int err;
+
+ ip6n->tnls[0] = ip6n->tnls_wc;
+ ip6n->tnls[1] = ip6n->tnls_r_l;
+
+ err = -ENOMEM;
+ ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
+ NET_NAME_UNKNOWN, vti6_dev_setup);
+
+ if (!ip6n->fb_tnl_dev)
+ goto err_alloc_dev;
+ dev_net_set(ip6n->fb_tnl_dev, net);
+ ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops;
+
+ err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ err = register_netdev(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
+
+ t = netdev_priv(ip6n->fb_tnl_dev);
+
+ strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
+ return 0;
+
+err_register:
+ vti6_dev_free(ip6n->fb_tnl_dev);
+err_alloc_dev:
+ return err;
+}
+
+static void __net_exit vti6_exit_net(struct net *net)
+{
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+ rtnl_lock();
+ vti6_destroy_tunnels(ip6n);
+ rtnl_unlock();
+}
+
+static struct pernet_operations vti6_net_ops = {
+ .init = vti6_init_net,
+ .exit = vti6_exit_net,
+ .id = &vti6_net_id,
+ .size = sizeof(struct vti6_net),
+};
+
+static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
+ .handler = vti6_rcv,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 100,
+};
+
+/**
+ * vti6_tunnel_init - register protocol and reserve needed resources
+ *
+ * Return: 0 on success
+ **/
+static int __init vti6_tunnel_init(void)
+{
+ const char *msg;
+ int err;
+
+ msg = "tunnel device";
+ err = register_pernet_device(&vti6_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "tunnel protocols";
+ err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&vti6_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ return 0;
+
+rtnl_link_failed:
+ xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
+ xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ unregister_pernet_device(&vti6_net_ops);
+pernet_dev_failed:
+ pr_err("vti6 init: failed to register %s\n", msg);
+ return err;
+}
+
+/**
+ * vti6_tunnel_cleanup - free resources and unregister protocol
+ **/
+static void __exit vti6_tunnel_cleanup(void)
+{
+ rtnl_link_unregister(&vti6_link_ops);
+ xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
+ xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
+ xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
+ unregister_pernet_device(&vti6_net_ops);
+}
+
+module_init(vti6_tunnel_init);
+module_exit(vti6_tunnel_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("vti6");
+MODULE_ALIAS_NETDEV("ip6_vti0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("IPv6 virtual tunnel interface");
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
new file mode 100644
index 000000000..74ceb73c1
--- /dev/null
+++ b/net/ipv6/ip6mr.c
@@ -0,0 +1,2512 @@
+/*
+ * Linux IPv6 multicast routing support for BSD pim6sd
+ * Based on net/ipv4/ipmr.c.
+ *
+ * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
+ * LSIIT Laboratory, Strasbourg, France
+ * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
+ * 6WIND, Paris, France
+ * Copyright (C)2007,2008 USAGI/WIDE Project
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/mroute6.h>
+#include <linux/pim.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/export.h>
+#include <net/ip6_checksum.h>
+#include <linux/netconf.h>
+
+struct mr6_table {
+ struct list_head list;
+ possible_net_t net;
+ u32 id;
+ struct sock *mroute6_sk;
+ struct timer_list ipmr_expire_timer;
+ struct list_head mfc6_unres_queue;
+ struct list_head mfc6_cache_array[MFC6_LINES];
+ struct mif_device vif6_table[MAXMIFS];
+ int maxvif;
+ atomic_t cache_resolve_queue_len;
+ bool mroute_do_assert;
+ bool mroute_do_pim;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ int mroute_reg_vif_num;
+#endif
+};
+
+struct ip6mr_rule {
+ struct fib_rule common;
+};
+
+struct ip6mr_result {
+ struct mr6_table *mrt;
+};
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+ Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ * Multicast router control variables
+ */
+
+#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL)
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+ entries is changed only in process context and protected
+ with weak lock mrt_lock. Queue of unresolved entries is protected
+ with strong spinlock mfc_unres_lock.
+
+ In this case data path is free of exclusive locks at all.
+ */
+
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+static struct mr6_table *ip6mr_new_table(struct net *net, u32 id);
+static void ip6mr_free_table(struct mr6_table *mrt);
+
+static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+ struct sk_buff *skb, struct mfc6_cache *cache);
+static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+ mifi_t mifi, int assert);
+static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+ struct mfc6_cache *c, struct rtmsg *rtm);
+static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
+ int cmd);
+static int ip6mr_rtm_dumproute(struct sk_buff *skb,
+ struct netlink_callback *cb);
+static void mroute_clean_tables(struct mr6_table *mrt);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+#define ip6mr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
+
+static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+{
+ struct mr6_table *mrt;
+
+ ip6mr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
+
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+ struct mr6_table **mrt)
+{
+ int err;
+ struct ip6mr_result res;
+ struct fib_lookup_arg arg = {
+ .result = &res,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
+ flowi6_to_flowi(flp6), 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
+
+static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ip6mr_result *res = arg->result;
+ struct mr6_table *mrt;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ mrt = ip6mr_get_table(rule->fr_net, rule->table);
+ if (!mrt)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
+{
+ return 1;
+}
+
+static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+ return 0;
+}
+
+static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
+
+static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
+
+static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
+ .family = RTNL_FAMILY_IP6MR,
+ .rule_size = sizeof(struct ip6mr_rule),
+ .addr_size = sizeof(struct in6_addr),
+ .action = ip6mr_rule_action,
+ .match = ip6mr_rule_match,
+ .configure = ip6mr_rule_configure,
+ .compare = ip6mr_rule_compare,
+ .default_pref = fib_default_rule_pref,
+ .fill = ip6mr_rule_fill,
+ .nlgroup = RTNLGRP_IPV6_RULE,
+ .policy = ip6mr_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr6_table *mrt;
+ int err;
+
+ ops = fib_rules_register(&ip6mr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ INIT_LIST_HEAD(&net->ipv6.mr6_tables);
+
+ mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ if (!mrt) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0);
+ if (err < 0)
+ goto err2;
+
+ net->ipv6.mr6_rules_ops = ops;
+ return 0;
+
+err2:
+ ip6mr_free_table(mrt);
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
+
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+ struct mr6_table *mrt, *next;
+
+ rtnl_lock();
+ list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
+ list_del(&mrt->list);
+ ip6mr_free_table(mrt);
+ }
+ fib_rules_unregister(net->ipv6.mr6_rules_ops);
+ rtnl_unlock();
+}
+#else
+#define ip6mr_for_each_table(mrt, net) \
+ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
+
+static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+{
+ return net->ipv6.mrt6;
+}
+
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+ struct mr6_table **mrt)
+{
+ *mrt = net->ipv6.mrt6;
+ return 0;
+}
+
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+ net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT);
+ return net->ipv6.mrt6 ? 0 : -ENOMEM;
+}
+
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+ rtnl_lock();
+ ip6mr_free_table(net->ipv6.mrt6);
+ net->ipv6.mrt6 = NULL;
+ rtnl_unlock();
+}
+#endif
+
+static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
+{
+ struct mr6_table *mrt;
+ unsigned int i;
+
+ mrt = ip6mr_get_table(net, id);
+ if (mrt)
+ return mrt;
+
+ mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+ if (!mrt)
+ return NULL;
+ mrt->id = id;
+ write_pnet(&mrt->net, net);
+
+ /* Forwarding cache */
+ for (i = 0; i < MFC6_LINES; i++)
+ INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]);
+
+ INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
+
+ setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
+ (unsigned long)mrt);
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ mrt->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
+#endif
+ return mrt;
+}
+
+static void ip6mr_free_table(struct mr6_table *mrt)
+{
+ del_timer_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt);
+ kfree(mrt);
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct ipmr_mfc_iter {
+ struct seq_net_private p;
+ struct mr6_table *mrt;
+ struct list_head *cache;
+ int ct;
+};
+
+
+static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
+ struct ipmr_mfc_iter *it, loff_t pos)
+{
+ struct mr6_table *mrt = it->mrt;
+ struct mfc6_cache *mfc;
+
+ read_lock(&mrt_lock);
+ for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) {
+ it->cache = &mrt->mfc6_cache_array[it->ct];
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ }
+ read_unlock(&mrt_lock);
+
+ spin_lock_bh(&mfc_unres_lock);
+ it->cache = &mrt->mfc6_unres_queue;
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
+ return mfc;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ it->cache = NULL;
+ return NULL;
+}
+
+/*
+ * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif
+ */
+
+struct ipmr_vif_iter {
+ struct seq_net_private p;
+ struct mr6_table *mrt;
+ int ct;
+};
+
+static struct mif_device *ip6mr_vif_seq_idx(struct net *net,
+ struct ipmr_vif_iter *iter,
+ loff_t pos)
+{
+ struct mr6_table *mrt = iter->mrt;
+
+ for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+ if (!MIF_EXISTS(mrt, iter->ct))
+ continue;
+ if (pos-- == 0)
+ return &mrt->vif6_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(mrt_lock)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ iter->mrt = mrt;
+
+ read_lock(&mrt_lock);
+ return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr6_table *mrt = iter->mrt;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip6mr_vif_seq_idx(net, iter, 0);
+
+ while (++iter->ct < mrt->maxvif) {
+ if (!MIF_EXISTS(mrt, iter->ct))
+ continue;
+ return &mrt->vif6_table[iter->ct];
+ }
+ return NULL;
+}
+
+static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
+ __releases(mrt_lock)
+{
+ read_unlock(&mrt_lock);
+}
+
+static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipmr_vif_iter *iter = seq->private;
+ struct mr6_table *mrt = iter->mrt;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
+ } else {
+ const struct mif_device *vif = v;
+ const char *name = vif->dev ? vif->dev->name : "none";
+
+ seq_printf(seq,
+ "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
+ vif - mrt->vif6_table,
+ name, vif->bytes_in, vif->pkt_in,
+ vif->bytes_out, vif->pkt_out,
+ vif->flags);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip6mr_vif_seq_ops = {
+ .start = ip6mr_vif_seq_start,
+ .next = ip6mr_vif_seq_next,
+ .stop = ip6mr_vif_seq_stop,
+ .show = ip6mr_vif_seq_show,
+};
+
+static int ip6mr_vif_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip6mr_vif_seq_ops,
+ sizeof(struct ipmr_vif_iter));
+}
+
+static const struct file_operations ip6mr_vif_fops = {
+ .owner = THIS_MODULE,
+ .open = ip6mr_vif_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt)
+ return ERR_PTR(-ENOENT);
+
+ it->mrt = mrt;
+ return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
+ : SEQ_START_TOKEN;
+}
+
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct mfc6_cache *mfc = v;
+ struct ipmr_mfc_iter *it = seq->private;
+ struct net *net = seq_file_net(seq);
+ struct mr6_table *mrt = it->mrt;
+
+ ++*pos;
+
+ if (v == SEQ_START_TOKEN)
+ return ipmr_mfc_seq_idx(net, seq->private, 0);
+
+ if (mfc->list.next != it->cache)
+ return list_entry(mfc->list.next, struct mfc6_cache, list);
+
+ if (it->cache == &mrt->mfc6_unres_queue)
+ goto end_of_list;
+
+ BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]);
+
+ while (++it->ct < MFC6_LINES) {
+ it->cache = &mrt->mfc6_cache_array[it->ct];
+ if (list_empty(it->cache))
+ continue;
+ return list_first_entry(it->cache, struct mfc6_cache, list);
+ }
+
+ /* exhausted cache_array, show unresolved */
+ read_unlock(&mrt_lock);
+ it->cache = &mrt->mfc6_unres_queue;
+ it->ct = 0;
+
+ spin_lock_bh(&mfc_unres_lock);
+ if (!list_empty(it->cache))
+ return list_first_entry(it->cache, struct mfc6_cache, list);
+
+ end_of_list:
+ spin_unlock_bh(&mfc_unres_lock);
+ it->cache = NULL;
+
+ return NULL;
+}
+
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+ struct ipmr_mfc_iter *it = seq->private;
+ struct mr6_table *mrt = it->mrt;
+
+ if (it->cache == &mrt->mfc6_unres_queue)
+ spin_unlock_bh(&mfc_unres_lock);
+ else if (it->cache == mrt->mfc6_cache_array)
+ read_unlock(&mrt_lock);
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+ int n;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Group "
+ "Origin "
+ "Iif Pkts Bytes Wrong Oifs\n");
+ } else {
+ const struct mfc6_cache *mfc = v;
+ const struct ipmr_mfc_iter *it = seq->private;
+ struct mr6_table *mrt = it->mrt;
+
+ seq_printf(seq, "%pI6 %pI6 %-3hd",
+ &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
+ mfc->mf6c_parent);
+
+ if (it->cache != &mrt->mfc6_unres_queue) {
+ seq_printf(seq, " %8lu %8lu %8lu",
+ mfc->mfc_un.res.pkt,
+ mfc->mfc_un.res.bytes,
+ mfc->mfc_un.res.wrong_if);
+ for (n = mfc->mfc_un.res.minvif;
+ n < mfc->mfc_un.res.maxvif; n++) {
+ if (MIF_EXISTS(mrt, n) &&
+ mfc->mfc_un.res.ttls[n] < 255)
+ seq_printf(seq,
+ " %2d:%-3d",
+ n, mfc->mfc_un.res.ttls[n]);
+ }
+ } else {
+ /* unresolved mfc_caches don't contain
+ * pkt, bytes and wrong_if values
+ */
+ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+ }
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations ipmr_mfc_seq_ops = {
+ .start = ipmr_mfc_seq_start,
+ .next = ipmr_mfc_seq_next,
+ .stop = ipmr_mfc_seq_stop,
+ .show = ipmr_mfc_seq_show,
+};
+
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
+ sizeof(struct ipmr_mfc_iter));
+}
+
+static const struct file_operations ip6mr_mfc_fops = {
+ .owner = THIS_MODULE,
+ .open = ipmr_mfc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+
+static int pim6_rcv(struct sk_buff *skb)
+{
+ struct pimreghdr *pim;
+ struct ipv6hdr *encap;
+ struct net_device *reg_dev = NULL;
+ struct net *net = dev_net(skb->dev);
+ struct mr6_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+ int reg_vif_num;
+
+ if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+ goto drop;
+
+ pim = (struct pimreghdr *)skb_transport_header(skb);
+ if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
+ (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ sizeof(*pim), IPPROTO_PIM,
+ csum_partial((void *)pim, sizeof(*pim), 0)) &&
+ csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+ goto drop;
+
+ /* check if the inner packet is destined to mcast group */
+ encap = (struct ipv6hdr *)(skb_transport_header(skb) +
+ sizeof(*pim));
+
+ if (!ipv6_addr_is_multicast(&encap->daddr) ||
+ encap->payload_len == 0 ||
+ ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
+ goto drop;
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ goto drop;
+ reg_vif_num = mrt->mroute_reg_vif_num;
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+ reg_dev = mrt->vif6_table[reg_vif_num].dev;
+ if (reg_dev)
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+ if (!reg_dev)
+ goto drop;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ dev_put(reg_dev);
+ return 0;
+ drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static const struct inet6_protocol pim6_protocol = {
+ .handler = pim6_rcv,
+};
+
+/* Service routines creating virtual interfaces: PIMREG */
+
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct mr6_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_oif = dev->ifindex,
+ .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi6_mark = skb->mark,
+ };
+ int err;
+
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ dev->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int reg_vif_get_iflink(const struct net_device *dev)
+{
+ return 0;
+}
+
+static const struct net_device_ops reg_vif_netdev_ops = {
+ .ndo_start_xmit = reg_vif_xmit,
+ .ndo_get_iflink = reg_vif_get_iflink,
+};
+
+static void reg_vif_setup(struct net_device *dev)
+{
+ dev->type = ARPHRD_PIMREG;
+ dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8;
+ dev->flags = IFF_NOARP;
+ dev->netdev_ops = &reg_vif_netdev_ops;
+ dev->destructor = free_netdev;
+ dev->features |= NETIF_F_NETNS_LOCAL;
+}
+
+static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
+{
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ if (mrt->id == RT6_TABLE_DFLT)
+ sprintf(name, "pim6reg");
+ else
+ sprintf(name, "pim6reg%u", mrt->id);
+
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ if (register_netdevice(dev)) {
+ free_netdev(dev);
+ return NULL;
+ }
+
+ if (dev_open(dev))
+ goto failure;
+
+ dev_hold(dev);
+ return dev;
+
+failure:
+ /* allow the register to be completed before unregistering. */
+ rtnl_unlock();
+ rtnl_lock();
+
+ unregister_netdevice(dev);
+ return NULL;
+}
+#endif
+
+/*
+ * Delete a VIF entry
+ */
+
+static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+{
+ struct mif_device *v;
+ struct net_device *dev;
+ struct inet6_dev *in6_dev;
+
+ if (vifi < 0 || vifi >= mrt->maxvif)
+ return -EADDRNOTAVAIL;
+
+ v = &mrt->vif6_table[vifi];
+
+ write_lock_bh(&mrt_lock);
+ dev = v->dev;
+ v->dev = NULL;
+
+ if (!dev) {
+ write_unlock_bh(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ }
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (vifi == mrt->mroute_reg_vif_num)
+ mrt->mroute_reg_vif_num = -1;
+#endif
+
+ if (vifi + 1 == mrt->maxvif) {
+ int tmp;
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
+ if (MIF_EXISTS(mrt, tmp))
+ break;
+ }
+ mrt->maxvif = tmp + 1;
+ }
+
+ write_unlock_bh(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev) {
+ in6_dev->cnf.mc_forwarding--;
+ inet6_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in6_dev->cnf);
+ }
+
+ if (v->flags & MIFF_REGISTER)
+ unregister_netdevice_queue(dev, head);
+
+ dev_put(dev);
+ return 0;
+}
+
+static inline void ip6mr_cache_free(struct mfc6_cache *c)
+{
+ kmem_cache_free(mrt_cachep, c);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ and reporting error to netlink readers.
+ */
+
+static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+
+ atomic_dec(&mrt->cache_resolve_queue_len);
+
+ while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
+ if (ipv6_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT;
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else
+ kfree_skb(skb);
+ }
+
+ ip6mr_cache_free(c);
+}
+
+
+/* Timer process for all the unresolved queue. */
+
+static void ipmr_do_expire_process(struct mr6_table *mrt)
+{
+ unsigned long now = jiffies;
+ unsigned long expires = 10 * HZ;
+ struct mfc6_cache *c, *next;
+
+ list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) {
+ if (time_after(c->mfc_un.unres.expires, now)) {
+ /* not yet... */
+ unsigned long interval = c->mfc_un.unres.expires - now;
+ if (interval < expires)
+ expires = interval;
+ continue;
+ }
+
+ list_del(&c->list);
+ mr6_netlink_event(mrt, c, RTM_DELROUTE);
+ ip6mr_destroy_unres(mrt, c);
+ }
+
+ if (!list_empty(&mrt->mfc6_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+}
+
+static void ipmr_expire_process(unsigned long arg)
+{
+ struct mr6_table *mrt = (struct mr6_table *)arg;
+
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
+ return;
+ }
+
+ if (!list_empty(&mrt->mfc6_unres_queue))
+ ipmr_do_expire_process(mrt);
+
+ spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache,
+ unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXMIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
+
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (MIF_EXISTS(mrt, vifi) &&
+ ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
+ }
+ }
+}
+
+static int mif6_add(struct net *net, struct mr6_table *mrt,
+ struct mif6ctl *vifc, int mrtsock)
+{
+ int vifi = vifc->mif6c_mifi;
+ struct mif_device *v = &mrt->vif6_table[vifi];
+ struct net_device *dev;
+ struct inet6_dev *in6_dev;
+ int err;
+
+ /* Is vif busy ? */
+ if (MIF_EXISTS(mrt, vifi))
+ return -EADDRINUSE;
+
+ switch (vifc->mif6c_flags) {
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (mrt->mroute_reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ip6mr_reg_vif(net, mrt);
+ if (!dev)
+ return -ENOBUFS;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ unregister_netdevice(dev);
+ dev_put(dev);
+ return err;
+ }
+ break;
+#endif
+ case 0:
+ dev = dev_get_by_index(net, vifc->mif6c_pifi);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ err = dev_set_allmulti(dev, 1);
+ if (err) {
+ dev_put(dev);
+ return err;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev) {
+ in6_dev->cnf.mc_forwarding++;
+ inet6_netconf_notify_devconf(dev_net(dev),
+ NETCONFA_MC_FORWARDING,
+ dev->ifindex, &in6_dev->cnf);
+ }
+
+ /*
+ * Fill in the VIF structures
+ */
+ v->rate_limit = vifc->vifc_rate_limit;
+ v->flags = vifc->mif6c_flags;
+ if (!mrtsock)
+ v->flags |= VIFF_STATIC;
+ v->threshold = vifc->vifc_threshold;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (v->flags & MIFF_REGISTER)
+ v->link = dev_get_iflink(dev);
+
+ /* And finish update writing critical data */
+ write_lock_bh(&mrt_lock);
+ v->dev = dev;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (v->flags & MIFF_REGISTER)
+ mrt->mroute_reg_vif_num = vifi;
+#endif
+ if (vifi + 1 > mrt->maxvif)
+ mrt->maxvif = vifi + 1;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+}
+
+static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
+ const struct in6_addr *origin,
+ const struct in6_addr *mcastgrp)
+{
+ int line = MFC6_HASH(mcastgrp, origin);
+ struct mfc6_cache *c;
+
+ list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) {
+ if (ipv6_addr_equal(&c->mf6c_origin, origin) &&
+ ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp))
+ return c;
+ }
+ return NULL;
+}
+
+/* Look for a (*,*,oif) entry */
+static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt,
+ mifi_t mifi)
+{
+ int line = MFC6_HASH(&in6addr_any, &in6addr_any);
+ struct mfc6_cache *c;
+
+ list_for_each_entry(c, &mrt->mfc6_cache_array[line], list)
+ if (ipv6_addr_any(&c->mf6c_origin) &&
+ ipv6_addr_any(&c->mf6c_mcastgrp) &&
+ (c->mfc_un.res.ttls[mifi] < 255))
+ return c;
+
+ return NULL;
+}
+
+/* Look for a (*,G) entry */
+static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt,
+ struct in6_addr *mcastgrp,
+ mifi_t mifi)
+{
+ int line = MFC6_HASH(mcastgrp, &in6addr_any);
+ struct mfc6_cache *c, *proxy;
+
+ if (ipv6_addr_any(mcastgrp))
+ goto skip;
+
+ list_for_each_entry(c, &mrt->mfc6_cache_array[line], list)
+ if (ipv6_addr_any(&c->mf6c_origin) &&
+ ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) {
+ if (c->mfc_un.res.ttls[mifi] < 255)
+ return c;
+
+ /* It's ok if the mifi is part of the static tree */
+ proxy = ip6mr_cache_find_any_parent(mrt,
+ c->mf6c_parent);
+ if (proxy && proxy->mfc_un.res.ttls[mifi] < 255)
+ return c;
+ }
+
+skip:
+ return ip6mr_cache_find_any_parent(mrt, mifi);
+}
+
+/*
+ * Allocate a multicast cache entry
+ */
+static struct mfc6_cache *ip6mr_cache_alloc(void)
+{
+ struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+ if (!c)
+ return NULL;
+ c->mfc_un.res.minvif = MAXMIFS;
+ return c;
+}
+
+static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
+{
+ struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+ if (!c)
+ return NULL;
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10 * HZ;
+ return c;
+}
+
+/*
+ * A cache entry has gone into a resolved state from queued
+ */
+
+static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
+ struct mfc6_cache *uc, struct mfc6_cache *c)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Play the pending entries through our router
+ */
+
+ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+ if (ipv6_hdr(skb)->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
+
+ if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
+ nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
+ } else {
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
+ }
+ rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
+ } else
+ ip6_mr_forward(net, mrt, skb, c);
+ }
+}
+
+/*
+ * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd
+ * expects the following bizarre scheme.
+ *
+ * Called under mrt_lock.
+ */
+
+static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+ mifi_t mifi, int assert)
+{
+ struct sk_buff *skb;
+ struct mrt6msg *msg;
+ int ret;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (assert == MRT6MSG_WHOLEPKT)
+ skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
+ +sizeof(*msg));
+ else
+#endif
+ skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);
+
+ if (!skb)
+ return -ENOBUFS;
+
+ /* I suppose that internal messages
+ * do not require checksums */
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (assert == MRT6MSG_WHOLEPKT) {
+ /* Ugly, but we have no choice with this interface.
+ Duplicate old header, fix length etc.
+ And all this only to mangle msg->im6_msgtype and
+ to set msg->im6_mbz to "mbz" :-)
+ */
+ skb_push(skb, -skb_network_offset(pkt));
+
+ skb_push(skb, sizeof(*msg));
+ skb_reset_transport_header(skb);
+ msg = (struct mrt6msg *)skb_transport_header(skb);
+ msg->im6_mbz = 0;
+ msg->im6_msgtype = MRT6MSG_WHOLEPKT;
+ msg->im6_mif = mrt->mroute_reg_vif_num;
+ msg->im6_pad = 0;
+ msg->im6_src = ipv6_hdr(pkt)->saddr;
+ msg->im6_dst = ipv6_hdr(pkt)->daddr;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else
+#endif
+ {
+ /*
+ * Copy the IP header
+ */
+
+ skb_put(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));
+
+ /*
+ * Add our header
+ */
+ skb_put(skb, sizeof(*msg));
+ skb_reset_transport_header(skb);
+ msg = (struct mrt6msg *)skb_transport_header(skb);
+
+ msg->im6_mbz = 0;
+ msg->im6_msgtype = assert;
+ msg->im6_mif = mifi;
+ msg->im6_pad = 0;
+ msg->im6_src = ipv6_hdr(pkt)->saddr;
+ msg->im6_dst = ipv6_hdr(pkt)->daddr;
+
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ if (!mrt->mroute6_sk) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /*
+ * Deliver to user space multicast routing algorithms
+ */
+ ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+ if (ret < 0) {
+ net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
+ kfree_skb(skb);
+ }
+
+ return ret;
+}
+
+/*
+ * Queue a packet for resolution. It gets locked cache entry!
+ */
+
+static int
+ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
+{
+ bool found = false;
+ int err;
+ struct mfc6_cache *c;
+
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(c, &mrt->mfc6_unres_queue, list) {
+ if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
+ ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ /*
+ * Create a new entry if allowable
+ */
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+ (c = ip6mr_cache_alloc_unres()) == NULL) {
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ /*
+ * Fill in the new cache entry
+ */
+ c->mf6c_parent = -1;
+ c->mf6c_origin = ipv6_hdr(skb)->saddr;
+ c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;
+
+ /*
+ * Reflect first query at pim6sd
+ */
+ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE);
+ if (err < 0) {
+ /* If the report failed throw the cache entry
+ out - Brad Parker
+ */
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ip6mr_cache_free(c);
+ kfree_skb(skb);
+ return err;
+ }
+
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->list, &mrt->mfc6_unres_queue);
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ ipmr_do_expire_process(mrt);
+ }
+
+ /*
+ * See if we can append the packet
+ */
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ } else {
+ skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+ err = 0;
+ }
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
+}
+
+/*
+ * MFC6 cache manipulation by user space
+ */
+
+static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
+ int parent)
+{
+ int line;
+ struct mfc6_cache *c, *next;
+
+ line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
+
+ list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) {
+ if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
+ ipv6_addr_equal(&c->mf6c_mcastgrp,
+ &mfc->mf6cc_mcastgrp.sin6_addr) &&
+ (parent == -1 || parent == c->mf6c_parent)) {
+ write_lock_bh(&mrt_lock);
+ list_del(&c->list);
+ write_unlock_bh(&mrt_lock);
+
+ mr6_netlink_event(mrt, c, RTM_DELROUTE);
+ ip6mr_cache_free(c);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int ip6mr_device_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct mr6_table *mrt;
+ struct mif_device *v;
+ int ct;
+ LIST_HEAD(list);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ ip6mr_for_each_table(mrt, net) {
+ v = &mrt->vif6_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (v->dev == dev)
+ mif6_delete(mrt, ct, &list);
+ }
+ }
+ unregister_netdevice_many(&list);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ip6_mr_notifier = {
+ .notifier_call = ip6mr_device_event
+};
+
+/*
+ * Setup for IP multicast routing
+ */
+
+static int __net_init ip6mr_net_init(struct net *net)
+{
+ int err;
+
+ err = ip6mr_rules_init(net);
+ if (err < 0)
+ goto fail;
+
+#ifdef CONFIG_PROC_FS
+ err = -ENOMEM;
+ if (!proc_create("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_fops))
+ goto proc_vif_fail;
+ if (!proc_create("ip6_mr_cache", 0, net->proc_net, &ip6mr_mfc_fops))
+ goto proc_cache_fail;
+#endif
+
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+ remove_proc_entry("ip6_mr_vif", net->proc_net);
+proc_vif_fail:
+ ip6mr_rules_exit(net);
+#endif
+fail:
+ return err;
+}
+
+static void __net_exit ip6mr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip6_mr_cache", net->proc_net);
+ remove_proc_entry("ip6_mr_vif", net->proc_net);
+#endif
+ ip6mr_rules_exit(net);
+}
+
+static struct pernet_operations ip6mr_net_ops = {
+ .init = ip6mr_net_init,
+ .exit = ip6mr_net_exit,
+};
+
+int __init ip6_mr_init(void)
+{
+ int err;
+
+ mrt_cachep = kmem_cache_create("ip6_mrt_cache",
+ sizeof(struct mfc6_cache),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!mrt_cachep)
+ return -ENOMEM;
+
+ err = register_pernet_subsys(&ip6mr_net_ops);
+ if (err)
+ goto reg_pernet_fail;
+
+ err = register_netdevice_notifier(&ip6_mr_notifier);
+ if (err)
+ goto reg_notif_fail;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) {
+ pr_err("%s: can't add PIM protocol\n", __func__);
+ err = -EAGAIN;
+ goto add_proto_fail;
+ }
+#endif
+ rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
+ ip6mr_rtm_dumproute, NULL);
+ return 0;
+#ifdef CONFIG_IPV6_PIMSM_V2
+add_proto_fail:
+ unregister_netdevice_notifier(&ip6_mr_notifier);
+#endif
+reg_notif_fail:
+ unregister_pernet_subsys(&ip6mr_net_ops);
+reg_pernet_fail:
+ kmem_cache_destroy(mrt_cachep);
+ return err;
+}
+
+void ip6_mr_cleanup(void)
+{
+ rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE);
+#ifdef CONFIG_IPV6_PIMSM_V2
+ inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
+#endif
+ unregister_netdevice_notifier(&ip6_mr_notifier);
+ unregister_pernet_subsys(&ip6mr_net_ops);
+ kmem_cache_destroy(mrt_cachep);
+}
+
+static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
+ struct mf6cctl *mfc, int mrtsock, int parent)
+{
+ bool found = false;
+ int line;
+ struct mfc6_cache *uc, *c;
+ unsigned char ttls[MAXMIFS];
+ int i;
+
+ if (mfc->mf6cc_parent >= MAXMIFS)
+ return -ENFILE;
+
+ memset(ttls, 255, MAXMIFS);
+ for (i = 0; i < MAXMIFS; i++) {
+ if (IF_ISSET(i, &mfc->mf6cc_ifset))
+ ttls[i] = 1;
+
+ }
+
+ line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
+
+ list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) {
+ if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
+ ipv6_addr_equal(&c->mf6c_mcastgrp,
+ &mfc->mf6cc_mcastgrp.sin6_addr) &&
+ (parent == -1 || parent == mfc->mf6cc_parent)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ write_lock_bh(&mrt_lock);
+ c->mf6c_parent = mfc->mf6cc_parent;
+ ip6mr_update_thresholds(mrt, c, ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+ write_unlock_bh(&mrt_lock);
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+ }
+
+ if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) &&
+ !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
+ return -EINVAL;
+
+ c = ip6mr_cache_alloc();
+ if (!c)
+ return -ENOMEM;
+
+ c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
+ c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
+ c->mf6c_parent = mfc->mf6cc_parent;
+ ip6mr_update_thresholds(mrt, c, ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+
+ write_lock_bh(&mrt_lock);
+ list_add(&c->list, &mrt->mfc6_cache_array[line]);
+ write_unlock_bh(&mrt_lock);
+
+ /*
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
+ */
+ found = false;
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) {
+ if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
+ ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
+ list_del(&uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
+ break;
+ }
+ }
+ if (list_empty(&mrt->mfc6_unres_queue))
+ del_timer(&mrt->ipmr_expire_timer);
+ spin_unlock_bh(&mfc_unres_lock);
+
+ if (found) {
+ ip6mr_cache_resolve(net, mrt, uc, c);
+ ip6mr_cache_free(uc);
+ }
+ mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+ return 0;
+}
+
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct mr6_table *mrt)
+{
+ int i;
+ LIST_HEAD(list);
+ struct mfc6_cache *c, *next;
+
+ /*
+ * Shut down all active vif entries
+ */
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (!(mrt->vif6_table[i].flags & VIFF_STATIC))
+ mif6_delete(mrt, i, &list);
+ }
+ unregister_netdevice_many(&list);
+
+ /*
+ * Wipe the cache
+ */
+ for (i = 0; i < MFC6_LINES; i++) {
+ list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) {
+ if (c->mfc_flags & MFC_STATIC)
+ continue;
+ write_lock_bh(&mrt_lock);
+ list_del(&c->list);
+ write_unlock_bh(&mrt_lock);
+
+ mr6_netlink_event(mrt, c, RTM_DELROUTE);
+ ip6mr_cache_free(c);
+ }
+ }
+
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) {
+ list_del(&c->list);
+ mr6_netlink_event(mrt, c, RTM_DELROUTE);
+ ip6mr_destroy_unres(mrt, c);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ }
+}
+
+static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk)
+{
+ int err = 0;
+ struct net *net = sock_net(sk);
+
+ rtnl_lock();
+ write_lock_bh(&mrt_lock);
+ if (likely(mrt->mroute6_sk == NULL)) {
+ mrt->mroute6_sk = sk;
+ net->ipv6.devconf_all->mc_forwarding++;
+ inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ }
+ else
+ err = -EADDRINUSE;
+ write_unlock_bh(&mrt_lock);
+
+ rtnl_unlock();
+
+ return err;
+}
+
+int ip6mr_sk_done(struct sock *sk)
+{
+ int err = -EACCES;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ rtnl_lock();
+ ip6mr_for_each_table(mrt, net) {
+ if (sk == mrt->mroute6_sk) {
+ write_lock_bh(&mrt_lock);
+ mrt->mroute6_sk = NULL;
+ net->ipv6.devconf_all->mc_forwarding--;
+ inet6_netconf_notify_devconf(net,
+ NETCONFA_MC_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+ write_unlock_bh(&mrt_lock);
+
+ mroute_clean_tables(mrt);
+ err = 0;
+ break;
+ }
+ }
+ rtnl_unlock();
+
+ return err;
+}
+
+struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+{
+ struct mr6_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
+ .flowi6_oif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ return NULL;
+
+ return mrt->mroute6_sk;
+}
+
+/*
+ * Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
+ */
+
+int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+{
+ int ret, parent = 0;
+ struct mif6ctl vif;
+ struct mf6cctl mfc;
+ mifi_t mifi;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ if (optname != MRT6_INIT) {
+ if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EACCES;
+ }
+
+ switch (optname) {
+ case MRT6_INIT:
+ if (sk->sk_type != SOCK_RAW ||
+ inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ return ip6mr_sk_init(mrt, sk);
+
+ case MRT6_DONE:
+ return ip6mr_sk_done(sk);
+
+ case MRT6_ADD_MIF:
+ if (optlen < sizeof(vif))
+ return -EINVAL;
+ if (copy_from_user(&vif, optval, sizeof(vif)))
+ return -EFAULT;
+ if (vif.mif6c_mifi >= MAXMIFS)
+ return -ENFILE;
+ rtnl_lock();
+ ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk);
+ rtnl_unlock();
+ return ret;
+
+ case MRT6_DEL_MIF:
+ if (optlen < sizeof(mifi_t))
+ return -EINVAL;
+ if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
+ return -EFAULT;
+ rtnl_lock();
+ ret = mif6_delete(mrt, mifi, NULL);
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
+ case MRT6_ADD_MFC:
+ case MRT6_DEL_MFC:
+ parent = -1;
+ case MRT6_ADD_MFC_PROXY:
+ case MRT6_DEL_MFC_PROXY:
+ if (optlen < sizeof(mfc))
+ return -EINVAL;
+ if (copy_from_user(&mfc, optval, sizeof(mfc)))
+ return -EFAULT;
+ if (parent == 0)
+ parent = mfc.mf6cc_parent;
+ rtnl_lock();
+ if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY)
+ ret = ip6mr_mfc_delete(mrt, &mfc, parent);
+ else
+ ret = ip6mr_mfc_add(net, mrt, &mfc,
+ sk == mrt->mroute6_sk, parent);
+ rtnl_unlock();
+ return ret;
+
+ /*
+ * Control PIM assert (to activate pim will activate assert)
+ */
+ case MRT6_ASSERT:
+ {
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ mrt->mroute_do_assert = v;
+ return 0;
+ }
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MRT6_PIM:
+ {
+ int v;
+
+ if (optlen != sizeof(v))
+ return -EINVAL;
+ if (get_user(v, (int __user *)optval))
+ return -EFAULT;
+ v = !!v;
+ rtnl_lock();
+ ret = 0;
+ if (v != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = v;
+ mrt->mroute_do_assert = v;
+ }
+ rtnl_unlock();
+ return ret;
+ }
+
+#endif
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+ case MRT6_TABLE:
+ {
+ u32 v;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (get_user(v, (u32 __user *)optval))
+ return -EFAULT;
+ /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (v != RT_TABLE_DEFAULT && v >= 100000000)
+ return -EINVAL;
+ if (sk == mrt->mroute6_sk)
+ return -EBUSY;
+
+ rtnl_lock();
+ ret = 0;
+ if (!ip6mr_new_table(net, v))
+ ret = -ENOMEM;
+ raw6_sk(sk)->ip6mr_table = v;
+ rtnl_unlock();
+ return ret;
+ }
+#endif
+ /*
+ * Spurious command, or MRT6_VERSION which you cannot
+ * set.
+ */
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/*
+ * Getsock opt support for the multicast routing system.
+ */
+
+int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
+ int __user *optlen)
+{
+ int olr;
+ int val;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (optname) {
+ case MRT6_VERSION:
+ val = 0x0305;
+ break;
+#ifdef CONFIG_IPV6_PIMSM_V2
+ case MRT6_PIM:
+ val = mrt->mroute_do_pim;
+ break;
+#endif
+ case MRT6_ASSERT:
+ val = mrt->mroute_do_assert;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (get_user(olr, optlen))
+ return -EFAULT;
+
+ olr = min_t(int, olr, sizeof(int));
+ if (olr < 0)
+ return -EINVAL;
+
+ if (put_user(olr, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, olr))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The IP multicast ioctl support routines.
+ */
+
+int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+ struct sioc_sg_req6 sr;
+ struct sioc_mif_req6 vr;
+ struct mif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.mifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif6_table[vr.mifi];
+ if (MIF_EXISTS(mrt, vr.mifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sioc_sg_req6 {
+ struct sockaddr_in6 src;
+ struct sockaddr_in6 grp;
+ compat_ulong_t pktcnt;
+ compat_ulong_t bytecnt;
+ compat_ulong_t wrong_if;
+};
+
+struct compat_sioc_mif_req6 {
+ mifi_t mifi;
+ compat_ulong_t icount;
+ compat_ulong_t ocount;
+ compat_ulong_t ibytes;
+ compat_ulong_t obytes;
+};
+
+int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ struct compat_sioc_sg_req6 sr;
+ struct compat_sioc_mif_req6 vr;
+ struct mif_device *vif;
+ struct mfc6_cache *c;
+ struct net *net = sock_net(sk);
+ struct mr6_table *mrt;
+
+ mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ switch (cmd) {
+ case SIOCGETMIFCNT_IN6:
+ if (copy_from_user(&vr, arg, sizeof(vr)))
+ return -EFAULT;
+ if (vr.mifi >= mrt->maxvif)
+ return -EINVAL;
+ read_lock(&mrt_lock);
+ vif = &mrt->vif6_table[vr.mifi];
+ if (MIF_EXISTS(mrt, vr.mifi)) {
+ vr.icount = vif->pkt_in;
+ vr.ocount = vif->pkt_out;
+ vr.ibytes = vif->bytes_in;
+ vr.obytes = vif->bytes_out;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &vr, sizeof(vr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ case SIOCGETSGCNT_IN6:
+ if (copy_from_user(&sr, arg, sizeof(sr)))
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user(arg, &sr, sizeof(sr)))
+ return -EFAULT;
+ return 0;
+ }
+ read_unlock(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+static inline int ip6mr_forward2_finish(struct sock *sk, struct sk_buff *skb)
+{
+ IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTFORWDATAGRAMS);
+ IP6_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTOCTETS, skb->len);
+ return dst_output_sk(sk, skb);
+}
+
+/*
+ * Processing handlers for ip6mr_forward
+ */
+
+static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
+ struct sk_buff *skb, struct mfc6_cache *c, int vifi)
+{
+ struct ipv6hdr *ipv6h;
+ struct mif_device *vif = &mrt->vif6_table[vifi];
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ if (!vif->dev)
+ goto out_free;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+ if (vif->flags & MIFF_REGISTER) {
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+ vif->dev->stats.tx_bytes += skb->len;
+ vif->dev->stats.tx_packets++;
+ ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
+ goto out_free;
+ }
+#endif
+
+ ipv6h = ipv6_hdr(skb);
+
+ fl6 = (struct flowi6) {
+ .flowi6_oif = vif->link,
+ .daddr = ipv6h->daddr,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ goto out_free;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /*
+ * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ * not only before forwarding, but after forwarding on all output
+ * interfaces. It is clear, if mrouter runs a multicasting
+ * program, it should receive packets not depending to what interface
+ * program is joined.
+ * If we will not make it, the program will have to join on all
+ * interfaces. On the other hand, multihoming host (or router, but
+ * not mrouter) cannot join to more than one interface - it will
+ * result in receiving multiple packets.
+ */
+ dev = vif->dev;
+ skb->dev = dev;
+ vif->pkt_out++;
+ vif->bytes_out += skb->len;
+
+ /* We are about to write */
+ /* XXX: extension headers? */
+ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
+ goto out_free;
+
+ ipv6h = ipv6_hdr(skb);
+ ipv6h->hop_limit--;
+
+ IP6CB(skb)->flags |= IP6SKB_FORWARDED;
+
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
+ skb->dev, dev,
+ ip6mr_forward2_finish);
+
+out_free:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev)
+{
+ int ct;
+
+ for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
+ if (mrt->vif6_table[ct].dev == dev)
+ break;
+ }
+ return ct;
+}
+
+static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+ struct sk_buff *skb, struct mfc6_cache *cache)
+{
+ int psend = -1;
+ int vif, ct;
+ int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+
+ vif = cache->mf6c_parent;
+ cache->mfc_un.res.pkt++;
+ cache->mfc_un.res.bytes += skb->len;
+
+ if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) {
+ struct mfc6_cache *cache_proxy;
+
+ /* For an (*,G) entry, we only check that the incoming
+ * interface is part of the static tree.
+ */
+ cache_proxy = ip6mr_cache_find_any_parent(mrt, vif);
+ if (cache_proxy &&
+ cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+ goto forward;
+ }
+
+ /*
+ * Wrong interface: drop packet and (maybe) send PIM assert.
+ */
+ if (mrt->vif6_table[vif].dev != skb->dev) {
+ cache->mfc_un.res.wrong_if++;
+
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
+ /* pimsm uses asserts, when switching from RPT to SPT,
+ so that we cannot check that packet arrived on an oif.
+ It is bad, but otherwise we would need to move pretty
+ large chunk of pimd to kernel. Ough... --ANK
+ */
+ (mrt->mroute_do_pim ||
+ cache->mfc_un.res.ttls[true_vifi] < 255) &&
+ time_after(jiffies,
+ cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+ cache->mfc_un.res.last_assert = jiffies;
+ ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
+ }
+ goto dont_forward;
+ }
+
+forward:
+ mrt->vif6_table[vif].pkt_in++;
+ mrt->vif6_table[vif].bytes_in += skb->len;
+
+ /*
+ * Forward the frame
+ */
+ if (ipv6_addr_any(&cache->mf6c_origin) &&
+ ipv6_addr_any(&cache->mf6c_mcastgrp)) {
+ if (true_vifi >= 0 &&
+ true_vifi != cache->mf6c_parent &&
+ ipv6_hdr(skb)->hop_limit >
+ cache->mfc_un.res.ttls[cache->mf6c_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = cache->mf6c_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) {
+ /* For (*,G) entry, don't forward to the incoming interface */
+ if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) &&
+ ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ip6mr_forward2(net, mrt, skb2, cache, psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ ip6mr_forward2(net, mrt, skb, cache, psend);
+ return;
+ }
+
+dont_forward:
+ kfree_skb(skb);
+}
+
+
+/*
+ * Multicast packets for forwarding arrive here
+ */
+
+int ip6_mr_input(struct sk_buff *skb)
+{
+ struct mfc6_cache *cache;
+ struct net *net = dev_net(skb->dev);
+ struct mr6_table *mrt;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .flowi6_mark = skb->mark,
+ };
+ int err;
+
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ read_lock(&mrt_lock);
+ cache = ip6mr_cache_find(mrt,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+ if (!cache) {
+ int vif = ip6mr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt,
+ &ipv6_hdr(skb)->daddr,
+ vif);
+ }
+
+ /*
+ * No usable cache entry
+ */
+ if (!cache) {
+ int vif;
+
+ vif = ip6mr_find_vif(mrt, skb->dev);
+ if (vif >= 0) {
+ int err = ip6mr_cache_unresolved(mrt, vif, skb);
+ read_unlock(&mrt_lock);
+
+ return err;
+ }
+ read_unlock(&mrt_lock);
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ ip6_mr_forward(net, mrt, skb, cache);
+
+ read_unlock(&mrt_lock);
+
+ return 0;
+}
+
+
+static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+ struct mfc6_cache *c, struct rtmsg *rtm)
+{
+ int ct;
+ struct rtnexthop *nhp;
+ struct nlattr *mp_attr;
+ struct rta_mfc_stats mfcs;
+
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mf6c_parent >= MAXMIFS)
+ return -ENOENT;
+
+ if (MIF_EXISTS(mrt, c->mf6c_parent) &&
+ nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
+ return -EMSGSIZE;
+ mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
+ if (!mp_attr)
+ return -EMSGSIZE;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+ nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
+ if (!nhp) {
+ nla_nest_cancel(skb, mp_attr);
+ return -EMSGSIZE;
+ }
+
+ nhp->rtnh_flags = 0;
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+ nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex;
+ nhp->rtnh_len = sizeof(*nhp);
+ }
+ }
+
+ nla_nest_end(skb, mp_attr);
+
+ mfcs.mfcs_packets = c->mfc_un.res.pkt;
+ mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+ mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ return -EMSGSIZE;
+
+ rtm->rtm_type = RTN_MULTICAST;
+ return 1;
+}
+
+int ip6mr_get_route(struct net *net,
+ struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+{
+ int err;
+ struct mr6_table *mrt;
+ struct mfc6_cache *cache;
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ read_lock(&mrt_lock);
+ cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
+ if (!cache && skb->dev) {
+ int vif = ip6mr_find_vif(mrt, skb->dev);
+
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr,
+ vif);
+ }
+
+ if (!cache) {
+ struct sk_buff *skb2;
+ struct ipv6hdr *iph;
+ struct net_device *dev;
+ int vif;
+
+ if (nowait) {
+ read_unlock(&mrt_lock);
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
+ if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
+ read_unlock(&mrt_lock);
+ return -ENODEV;
+ }
+
+ /* really correct? */
+ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+ if (!skb2) {
+ read_unlock(&mrt_lock);
+ return -ENOMEM;
+ }
+
+ skb_reset_transport_header(skb2);
+
+ skb_put(skb2, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb2);
+
+ iph = ipv6_hdr(skb2);
+ iph->version = 0;
+ iph->priority = 0;
+ iph->flow_lbl[0] = 0;
+ iph->flow_lbl[1] = 0;
+ iph->flow_lbl[2] = 0;
+ iph->payload_len = 0;
+ iph->nexthdr = IPPROTO_NONE;
+ iph->hop_limit = 0;
+ iph->saddr = rt->rt6i_src.addr;
+ iph->daddr = rt->rt6i_dst.addr;
+
+ err = ip6mr_cache_unresolved(mrt, vif, skb2);
+ read_unlock(&mrt_lock);
+
+ return err;
+ }
+
+ if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ cache->mfc_flags |= MFC_NOTIFY;
+
+ err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
+ read_unlock(&mrt_lock);
+ return err;
+}
+
+static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mfc6_cache *c, int cmd,
+ int flags)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IP6MR;
+ rtm->rtm_dst_len = 128;
+ rtm->rtm_src_len = 128;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ if (nla_put_u32(skb, RTA_TABLE, mrt->id))
+ goto nla_put_failure;
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (c->mfc_flags & MFC_STATIC)
+ rtm->rtm_protocol = RTPROT_STATIC;
+ else
+ rtm->rtm_protocol = RTPROT_MROUTED;
+ rtm->rtm_flags = 0;
+
+ if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) ||
+ nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp))
+ goto nla_put_failure;
+ err = __ip6mr_fill_mroute(mrt, skb, c, rtm);
+ /* do not break the dump if cache is unresolved */
+ if (err < 0 && err != -ENOENT)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int mr6_msgsize(bool unresolved, int maxvif)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */
+ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */
+ ;
+
+ if (!unresolved)
+ len = len
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(0) /* RTA_MULTIPATH */
+ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+ /* RTA_MFC_STATS */
+ + nla_total_size(sizeof(struct rta_mfc_stats))
+ ;
+
+ return len;
+}
+
+static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
+ int cmd)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif),
+ GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
+ if (err < 0)
+ goto errout;
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC);
+ return;
+
+errout:
+ kfree_skb(skb);
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
+}
+
+static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mr6_table *mrt;
+ struct mfc6_cache *mfc;
+ unsigned int t = 0, s_t;
+ unsigned int h = 0, s_h;
+ unsigned int e = 0, s_e;
+
+ s_t = cb->args[0];
+ s_h = cb->args[1];
+ s_e = cb->args[2];
+
+ read_lock(&mrt_lock);
+ ip6mr_for_each_table(mrt, net) {
+ if (t < s_t)
+ goto next_table;
+ if (t > s_t)
+ s_h = 0;
+ for (h = s_h; h < MFC6_LINES; h++) {
+ list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ip6mr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0)
+ goto done;
+next_entry:
+ e++;
+ }
+ e = s_e = 0;
+ }
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (ip6mr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ mfc, RTM_NEWROUTE,
+ NLM_F_MULTI) < 0) {
+ spin_unlock_bh(&mfc_unres_lock);
+ goto done;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+ e = s_e = 0;
+ s_h = 0;
+next_table:
+ t++;
+ }
+done:
+ read_unlock(&mrt_lock);
+
+ cb->args[2] = e;
+ cb->args[1] = h;
+ cb->args[0] = t;
+
+ return skb->len;
+}
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
new file mode 100644
index 000000000..1b9316e13
--- /dev/null
+++ b/net/ipv6/ipcomp6.c
@@ -0,0 +1,230 @@
+/*
+ * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Author Mitsuru KANDA <mk@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * [Memo]
+ *
+ * Outbound:
+ * The compression of IP datagram MUST be done before AH/ESP processing,
+ * fragmentation, and the addition of Hop-by-Hop/Routing header.
+ *
+ * Inbound:
+ * The decompression of IP datagram MUST be done after the reassembly,
+ * AH/ESP processing.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ipcomp.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <net/ip6_route.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+
+static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ __be32 spi;
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct ip_comp_hdr *ipcomph =
+ (struct ip_comp_hdr *)(skb->data + offset);
+ struct xfrm_state *x;
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ spi = htonl(ntohs(ipcomph->cpi));
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, IPPROTO_COMP, AF_INET6);
+ if (!x)
+ return 0;
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0);
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ struct xfrm_state *t = NULL;
+
+ t = xfrm_state_alloc(net);
+ if (!t)
+ goto out;
+
+ t->id.proto = IPPROTO_IPV6;
+ t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
+ if (!t->id.spi)
+ goto error;
+
+ memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
+ memcpy(&t->sel, &x->sel, sizeof(t->sel));
+ t->props.family = AF_INET6;
+ t->props.mode = x->props.mode;
+ memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
+
+ if (xfrm_init_state(t))
+ goto error;
+
+ atomic_set(&t->tunnel_users, 1);
+
+out:
+ return t;
+
+error:
+ t->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(t);
+ t = NULL;
+ goto out;
+}
+
+static int ipcomp6_tunnel_attach(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ int err = 0;
+ struct xfrm_state *t = NULL;
+ __be32 spi;
+ u32 mark = x->mark.m & x->mark.v;
+
+ spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr);
+ if (spi)
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr,
+ spi, IPPROTO_IPV6, AF_INET6);
+ if (!t) {
+ t = ipcomp6_tunnel_create(x);
+ if (!t) {
+ err = -EINVAL;
+ goto out;
+ }
+ xfrm_state_insert(t);
+ xfrm_state_hold(t);
+ }
+ x->tunnel = t;
+ atomic_inc(&t->tunnel_users);
+
+out:
+ return err;
+}
+
+static int ipcomp6_init_state(struct xfrm_state *x)
+{
+ int err = -EINVAL;
+
+ x->props.header_len = 0;
+ switch (x->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ break;
+ case XFRM_MODE_TUNNEL:
+ x->props.header_len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ goto out;
+ }
+
+ err = ipcomp_init_state(x);
+ if (err)
+ goto out;
+
+ if (x->props.mode == XFRM_MODE_TUNNEL) {
+ err = ipcomp6_tunnel_attach(x);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+}
+
+static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
+{
+ return 0;
+}
+
+static const struct xfrm_type ipcomp6_type = {
+ .description = "IPCOMP6",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_COMP,
+ .init_state = ipcomp6_init_state,
+ .destructor = ipcomp_destroy,
+ .input = ipcomp_input,
+ .output = ipcomp_output,
+ .hdr_offset = xfrm6_find_1stfragopt,
+};
+
+static struct xfrm6_protocol ipcomp6_protocol = {
+ .handler = xfrm6_rcv,
+ .cb_handler = ipcomp6_rcv_cb,
+ .err_handler = ipcomp6_err,
+ .priority = 0,
+};
+
+static int __init ipcomp6_init(void)
+{
+ if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type\n", __func__);
+ return -EAGAIN;
+ }
+ if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit ipcomp6_fini(void)
+{
+ if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__);
+ if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ipcomp6_init);
+module_exit(ipcomp6_fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173");
+MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>");
+
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
new file mode 100644
index 000000000..63e695691
--- /dev/null
+++ b/net/ipv6/ipv6_sockglue.c
@@ -0,0 +1,1384 @@
+/*
+ * IPv6 BSD socket options interface
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on linux/net/ipv4/ip_sockglue.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * FIXME: Make the setsockopt code POSIX compliant: That is
+ *
+ * o Truncate getsockopt returns
+ * o Return an optlen of the truncated length if need be
+ *
+ * Changes:
+ * David L Stevens <dlstevens@us.ibm.com>:
+ * - added multicast source filtering API for MLDv2
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/mroute6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+
+#include <asm/uaccess.h>
+
+struct ip6_ra_chain *ip6_ra_chain;
+DEFINE_RWLOCK(ip6_ra_lock);
+
+int ip6_ra_control(struct sock *sk, int sel)
+{
+ struct ip6_ra_chain *ra, *new_ra, **rap;
+
+ /* RA packet may be delivered ONLY to IPPROTO_RAW socket */
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW)
+ return -ENOPROTOOPT;
+
+ new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+ write_lock_bh(&ip6_ra_lock);
+ for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
+ if (ra->sk == sk) {
+ if (sel >= 0) {
+ write_unlock_bh(&ip6_ra_lock);
+ kfree(new_ra);
+ return -EADDRINUSE;
+ }
+
+ *rap = ra->next;
+ write_unlock_bh(&ip6_ra_lock);
+
+ sock_put(sk);
+ kfree(ra);
+ return 0;
+ }
+ }
+ if (!new_ra) {
+ write_unlock_bh(&ip6_ra_lock);
+ return -ENOBUFS;
+ }
+ new_ra->sk = sk;
+ new_ra->sel = sel;
+ new_ra->next = ra;
+ *rap = new_ra;
+ sock_hold(sk);
+ write_unlock_bh(&ip6_ra_lock);
+ return 0;
+}
+
+static
+struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
+ struct ipv6_txoptions *opt)
+{
+ if (inet_sk(sk)->is_icsk) {
+ if (opt &&
+ !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
+ opt = xchg(&inet6_sk(sk)->opt, opt);
+ sk_dst_reset(sk);
+
+ return opt;
+}
+
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IPV6_ADD_MEMBERSHIP:
+ case IPV6_DROP_MEMBERSHIP:
+ case IPV6_JOIN_ANYCAST:
+ case IPV6_LEAVE_ANYCAST:
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
+
+static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ int val, valbool;
+ int retv = -ENOPROTOOPT;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
+
+ if (!optval)
+ val = 0;
+ else {
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ } else
+ val = 0;
+ }
+
+ valbool = (val != 0);
+
+ if (ip6_mroute_opt(optname))
+ return ip6_mroute_setsockopt(sk, optname, optval, optlen);
+
+ if (needs_rtnl)
+ rtnl_lock();
+ lock_sock(sk);
+
+ switch (optname) {
+
+ case IPV6_ADDRFORM:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val == PF_INET) {
+ struct ipv6_txoptions *opt;
+ struct sk_buff *pktopt;
+
+ if (sk->sk_type == SOCK_RAW)
+ break;
+
+ if (sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_UDPLITE) {
+ struct udp_sock *up = udp_sk(sk);
+ if (up->pending == AF_INET6) {
+ retv = -EBUSY;
+ break;
+ }
+ } else if (sk->sk_protocol != IPPROTO_TCP)
+ break;
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ retv = -ENOTCONN;
+ break;
+ }
+
+ if (ipv6_only_sock(sk) ||
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ retv = -EADDRNOTAVAIL;
+ break;
+ }
+
+ fl6_free_socklist(sk);
+ ipv6_sock_mc_close(sk);
+
+ /*
+ * Sock is moving from IPv6 to IPv4 (sk_prot), so
+ * remove it from the refcnt debug socks count in the
+ * original family...
+ */
+ sk_refcnt_debug_dec(sk);
+
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ local_bh_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_prot_inuse_add(net, &tcp_prot, 1);
+ local_bh_enable();
+ sk->sk_prot = &tcp_prot;
+ icsk->icsk_af_ops = &ipv4_specific;
+ sk->sk_socket->ops = &inet_stream_ops;
+ sk->sk_family = PF_INET;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ } else {
+ struct proto *prot = &udp_prot;
+
+ if (sk->sk_protocol == IPPROTO_UDPLITE)
+ prot = &udplite_prot;
+ local_bh_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_prot_inuse_add(net, prot, 1);
+ local_bh_enable();
+ sk->sk_prot = prot;
+ sk->sk_socket->ops = &inet_dgram_ops;
+ sk->sk_family = PF_INET;
+ }
+ opt = xchg(&np->opt, NULL);
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ pktopt = xchg(&np->pktoptions, NULL);
+ kfree_skb(pktopt);
+
+ sk->sk_destruct = inet_sock_destruct;
+ /*
+ * ... and add it to the refcnt debug socks count
+ * in the new family. -acme
+ */
+ sk_refcnt_debug_inc(sk);
+ module_put(THIS_MODULE);
+ retv = 0;
+ break;
+ }
+ goto e_inval;
+
+ case IPV6_V6ONLY:
+ if (optlen < sizeof(int) ||
+ inet_sk(sk)->inet_num)
+ goto e_inval;
+ sk->sk_ipv6only = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVPKTINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxinfo = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_2292PKTINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxoinfo = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVHOPLIMIT:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxhlim = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_2292HOPLIMIT:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxohlim = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVRTHDR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.srcrt = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_2292RTHDR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.osrcrt = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVHOPOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.hopopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_2292HOPOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.ohopopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVDSTOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.dstopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_2292DSTOPTS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.odstopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_TCLASS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val < -1 || val > 0xff)
+ goto e_inval;
+ /* RFC 3542, 6.5: default traffic class of 0x0 */
+ if (val == -1)
+ val = 0;
+ np->tclass = val;
+ retv = 0;
+ break;
+
+ case IPV6_RECVTCLASS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxtclass = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_FLOWINFO:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxflow = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVPATHMTU:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxpmtu = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_TRANSPARENT:
+ if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) &&
+ !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ retv = -EPERM;
+ break;
+ }
+ if (optlen < sizeof(int))
+ goto e_inval;
+ /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+ inet_sk(sk)->transparent = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxorigdstaddr = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_HOPOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ case IPV6_RTHDR:
+ case IPV6_DSTOPTS:
+ {
+ struct ipv6_txoptions *opt;
+
+ /* remove any sticky options header with a zero option
+ * length, per RFC3542.
+ */
+ if (optlen == 0)
+ optval = NULL;
+ else if (!optval)
+ goto e_inval;
+ else if (optlen < sizeof(struct ipv6_opt_hdr) ||
+ optlen & 0x7 || optlen > 8 * 255)
+ goto e_inval;
+
+ /* hop-by-hop / destination options are privileged option */
+ retv = -EPERM;
+ if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+ break;
+
+ opt = ipv6_renew_options(sk, np->opt, optname,
+ (struct ipv6_opt_hdr __user *)optval,
+ optlen);
+ if (IS_ERR(opt)) {
+ retv = PTR_ERR(opt);
+ break;
+ }
+
+ /* routing header option needs extra check */
+ retv = -EINVAL;
+ if (optname == IPV6_RTHDR && opt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = opt->srcrt;
+ switch (rthdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (rthdr->hdrlen != 2 ||
+ rthdr->segments_left != 1)
+ goto sticky_done;
+
+ break;
+#endif
+ default:
+ goto sticky_done;
+ }
+ }
+
+ retv = 0;
+ opt = ipv6_update_options(sk, opt);
+sticky_done:
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ break;
+ }
+
+ case IPV6_PKTINFO:
+ {
+ struct in6_pktinfo pkt;
+
+ if (optlen == 0)
+ goto e_inval;
+ else if (optlen < sizeof(struct in6_pktinfo) || !optval)
+ goto e_inval;
+
+ if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) {
+ retv = -EFAULT;
+ break;
+ }
+ if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+ goto e_inval;
+
+ np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
+ np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr;
+ retv = 0;
+ break;
+ }
+
+ case IPV6_2292PKTOPTIONS:
+ {
+ struct ipv6_txoptions *opt = NULL;
+ struct msghdr msg;
+ struct flowi6 fl6;
+ int junk;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+
+ if (optlen == 0)
+ goto update;
+
+ /* 1K is probably excessive
+ * 1K is surely not enough, 2K per standard header is 16K.
+ */
+ retv = -EINVAL;
+ if (optlen > 64*1024)
+ break;
+
+ opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
+ retv = -ENOBUFS;
+ if (!opt)
+ break;
+
+ memset(opt, 0, sizeof(*opt));
+ opt->tot_len = sizeof(*opt) + optlen;
+ retv = -EFAULT;
+ if (copy_from_user(opt+1, optval, optlen))
+ goto done;
+
+ msg.msg_controllen = optlen;
+ msg.msg_control = (void *)(opt+1);
+
+ retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk,
+ &junk, &junk);
+ if (retv)
+ goto done;
+update:
+ retv = 0;
+ opt = ipv6_update_options(sk, opt);
+done:
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ break;
+ }
+ case IPV6_UNICAST_HOPS:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val > 255 || val < -1)
+ goto e_inval;
+ np->hop_limit = val;
+ retv = 0;
+ break;
+
+ case IPV6_MULTICAST_HOPS:
+ if (sk->sk_type == SOCK_STREAM)
+ break;
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val > 255 || val < -1)
+ goto e_inval;
+ np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+ retv = 0;
+ break;
+
+ case IPV6_MULTICAST_LOOP:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val != valbool)
+ goto e_inval;
+ np->mc_loop = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_UNICAST_IF:
+ {
+ struct net_device *dev = NULL;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ goto e_inval;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (ifindex == 0) {
+ np->ucast_oif = 0;
+ retv = 0;
+ break;
+ }
+
+ dev = dev_get_by_index(net, ifindex);
+ retv = -EADDRNOTAVAIL;
+ if (!dev)
+ break;
+ dev_put(dev);
+
+ retv = -EINVAL;
+ if (sk->sk_bound_dev_if)
+ break;
+
+ np->ucast_oif = ifindex;
+ retv = 0;
+ break;
+ }
+
+ case IPV6_MULTICAST_IF:
+ if (sk->sk_type == SOCK_STREAM)
+ break;
+ if (optlen < sizeof(int))
+ goto e_inval;
+
+ if (val) {
+ struct net_device *dev;
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
+ goto e_inval;
+
+ dev = dev_get_by_index(net, val);
+ if (!dev) {
+ retv = -ENODEV;
+ break;
+ }
+ dev_put(dev);
+ }
+ np->mcast_oif = val;
+ retv = 0;
+ break;
+ case IPV6_ADD_MEMBERSHIP:
+ case IPV6_DROP_MEMBERSHIP:
+ {
+ struct ipv6_mreq mreq;
+
+ if (optlen < sizeof(struct ipv6_mreq))
+ goto e_inval;
+
+ retv = -EPROTO;
+ if (inet_sk(sk)->is_icsk)
+ break;
+
+ retv = -EFAULT;
+ if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ break;
+
+ if (optname == IPV6_ADD_MEMBERSHIP)
+ retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
+ else
+ retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
+ break;
+ }
+ case IPV6_JOIN_ANYCAST:
+ case IPV6_LEAVE_ANYCAST:
+ {
+ struct ipv6_mreq mreq;
+
+ if (optlen < sizeof(struct ipv6_mreq))
+ goto e_inval;
+
+ retv = -EFAULT;
+ if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ break;
+
+ if (optname == IPV6_JOIN_ANYCAST)
+ retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
+ else
+ retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
+ break;
+ }
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ {
+ struct group_req greq;
+ struct sockaddr_in6 *psin6;
+
+ if (optlen < sizeof(struct group_req))
+ goto e_inval;
+
+ retv = -EFAULT;
+ if (copy_from_user(&greq, optval, sizeof(struct group_req)))
+ break;
+ if (greq.gr_group.ss_family != AF_INET6) {
+ retv = -EADDRNOTAVAIL;
+ break;
+ }
+ psin6 = (struct sockaddr_in6 *)&greq.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ retv = ipv6_sock_mc_join(sk, greq.gr_interface,
+ &psin6->sin6_addr);
+ else
+ retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
+ &psin6->sin6_addr);
+ break;
+ }
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ {
+ struct group_source_req greqs;
+ int omode, add;
+
+ if (optlen < sizeof(struct group_source_req))
+ goto e_inval;
+ if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+ retv = -EFAULT;
+ break;
+ }
+ if (greqs.gsr_group.ss_family != AF_INET6 ||
+ greqs.gsr_source.ss_family != AF_INET6) {
+ retv = -EADDRNOTAVAIL;
+ break;
+ }
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct sockaddr_in6 *psin6;
+
+ psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
+ retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
+ &psin6->sin6_addr);
+ /* prior join w/ different source is ok */
+ if (retv && retv != -EADDRINUSE)
+ break;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ retv = ip6_mc_source(add, omode, sk, &greqs);
+ break;
+ }
+ case MCAST_MSFILTER:
+ {
+ struct group_filter *gsf;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ goto e_inval;
+ if (optlen > sysctl_optmem_max) {
+ retv = -ENOBUFS;
+ break;
+ }
+ gsf = kmalloc(optlen, GFP_KERNEL);
+ if (!gsf) {
+ retv = -ENOBUFS;
+ break;
+ }
+ retv = -EFAULT;
+ if (copy_from_user(gsf, optval, optlen)) {
+ kfree(gsf);
+ break;
+ }
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ if (gsf->gf_numsrc >= 0x1ffffffU ||
+ gsf->gf_numsrc > sysctl_mld_max_msf) {
+ kfree(gsf);
+ retv = -ENOBUFS;
+ break;
+ }
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+ kfree(gsf);
+ retv = -EINVAL;
+ break;
+ }
+ retv = ip6_mc_msfilter(sk, gsf);
+ kfree(gsf);
+
+ break;
+ }
+ case IPV6_ROUTER_ALERT:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ retv = ip6_ra_control(sk, val);
+ break;
+ case IPV6_MTU_DISCOVER:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+ goto e_inval;
+ np->pmtudisc = val;
+ retv = 0;
+ break;
+ case IPV6_MTU:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val && val < IPV6_MIN_MTU)
+ goto e_inval;
+ np->frag_size = val;
+ retv = 0;
+ break;
+ case IPV6_RECVERR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->recverr = valbool;
+ if (!val)
+ skb_queue_purge(&sk->sk_error_queue);
+ retv = 0;
+ break;
+ case IPV6_FLOWINFO_SEND:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->sndflow = valbool;
+ retv = 0;
+ break;
+ case IPV6_FLOWLABEL_MGR:
+ retv = ipv6_flowlabel_opt(sk, optval, optlen);
+ break;
+ case IPV6_IPSEC_POLICY:
+ case IPV6_XFRM_POLICY:
+ retv = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ break;
+ retv = xfrm_user_policy(sk, optname, optval, optlen);
+ break;
+
+ case IPV6_ADDR_PREFERENCES:
+ {
+ unsigned int pref = 0;
+ unsigned int prefmask = ~0;
+
+ if (optlen < sizeof(int))
+ goto e_inval;
+
+ retv = -EINVAL;
+
+ /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
+ switch (val & (IPV6_PREFER_SRC_PUBLIC|
+ IPV6_PREFER_SRC_TMP|
+ IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
+ case IPV6_PREFER_SRC_PUBLIC:
+ pref |= IPV6_PREFER_SRC_PUBLIC;
+ break;
+ case IPV6_PREFER_SRC_TMP:
+ pref |= IPV6_PREFER_SRC_TMP;
+ break;
+ case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
+ break;
+ case 0:
+ goto pref_skip_pubtmp;
+ default:
+ goto e_inval;
+ }
+
+ prefmask &= ~(IPV6_PREFER_SRC_PUBLIC|
+ IPV6_PREFER_SRC_TMP);
+pref_skip_pubtmp:
+
+ /* check HOME/COA conflicts */
+ switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) {
+ case IPV6_PREFER_SRC_HOME:
+ break;
+ case IPV6_PREFER_SRC_COA:
+ pref |= IPV6_PREFER_SRC_COA;
+ case 0:
+ goto pref_skip_coa;
+ default:
+ goto e_inval;
+ }
+
+ prefmask &= ~IPV6_PREFER_SRC_COA;
+pref_skip_coa:
+
+ /* check CGA/NONCGA conflicts */
+ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
+ case IPV6_PREFER_SRC_CGA:
+ case IPV6_PREFER_SRC_NONCGA:
+ case 0:
+ break;
+ default:
+ goto e_inval;
+ }
+
+ np->srcprefs = (np->srcprefs & prefmask) | pref;
+ retv = 0;
+
+ break;
+ }
+ case IPV6_MINHOPCOUNT:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ np->min_hopcount = val;
+ retv = 0;
+ break;
+ case IPV6_DONTFRAG:
+ np->dontfrag = valbool;
+ retv = 0;
+ break;
+ case IPV6_AUTOFLOWLABEL:
+ np->autoflowlabel = valbool;
+ retv = 0;
+ break;
+ }
+
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+
+ return retv;
+
+e_inval:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+}
+
+int ipv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+ return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+
+ if (level != SOL_IPV6)
+ return -ENOPROTOOPT;
+
+ err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
+ optname != IPV6_XFRM_POLICY) {
+ lock_sock(sk);
+ err = nf_setsockopt(sk, PF_INET6, optname, optval,
+ optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ipv6_setsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int err;
+
+ if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+ if (udp_prot.compat_setsockopt != NULL)
+ return udp_prot.compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+ }
+
+ if (level != SOL_IPV6)
+ return -ENOPROTOOPT;
+
+ if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
+ return compat_mc_setsockopt(sk, level, optname, optval, optlen,
+ ipv6_setsockopt);
+
+ err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
+ optname != IPV6_XFRM_POLICY) {
+ lock_sock(sk);
+ err = compat_nf_setsockopt(sk, PF_INET6, optname,
+ optval, optlen);
+ release_sock(sk);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ipv6_setsockopt);
+#endif
+
+static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
+ int optname, char __user *optval, int len)
+{
+ struct ipv6_opt_hdr *hdr;
+
+ if (!opt)
+ return 0;
+
+ switch (optname) {
+ case IPV6_HOPOPTS:
+ hdr = opt->hopopt;
+ break;
+ case IPV6_RTHDRDSTOPTS:
+ hdr = opt->dst0opt;
+ break;
+ case IPV6_RTHDR:
+ hdr = (struct ipv6_opt_hdr *)opt->srcrt;
+ break;
+ case IPV6_DSTOPTS:
+ hdr = opt->dst1opt;
+ break;
+ default:
+ return -EINVAL; /* should not happen */
+ }
+
+ if (!hdr)
+ return 0;
+
+ len = min_t(unsigned int, len, ipv6_optlen(hdr));
+ if (copy_to_user(optval, hdr, len))
+ return -EFAULT;
+ return len;
+}
+
+static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen, unsigned int flags)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ int len;
+ int val;
+
+ if (ip6_mroute_opt(optname))
+ return ip6_mroute_getsockopt(sk, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ switch (optname) {
+ case IPV6_ADDRFORM:
+ if (sk->sk_protocol != IPPROTO_UDP &&
+ sk->sk_protocol != IPPROTO_UDPLITE &&
+ sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+ val = sk->sk_family;
+ break;
+ case MCAST_MSFILTER:
+ {
+ struct group_filter gsf;
+ int err;
+
+ if (len < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
+ return -EFAULT;
+ if (gsf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gsf,
+ (struct group_filter __user *)optval, optlen);
+ release_sock(sk);
+ return err;
+ }
+
+ case IPV6_2292PKTOPTIONS:
+ {
+ struct msghdr msg;
+ struct sk_buff *skb;
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = flags;
+
+ lock_sock(sk);
+ skb = np->pktoptions;
+ if (skb)
+ ip6_datagram_recv_ctl(sk, &msg, skb);
+ release_sock(sk);
+ if (!skb) {
+ if (np->rxopt.bits.rxinfo) {
+ struct in6_pktinfo src_info;
+ src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+ np->sticky_pktinfo.ipi6_ifindex;
+ src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr;
+ put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
+ }
+ if (np->rxopt.bits.rxhlim) {
+ int hlim = np->mcast_hops;
+ put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
+ }
+ if (np->rxopt.bits.rxtclass) {
+ int tclass = (int)ip6_tclass(np->rcv_flowinfo);
+
+ put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
+ }
+ if (np->rxopt.bits.rxoinfo) {
+ struct in6_pktinfo src_info;
+ src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+ np->sticky_pktinfo.ipi6_ifindex;
+ src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr :
+ np->sticky_pktinfo.ipi6_addr;
+ put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+ }
+ if (np->rxopt.bits.rxohlim) {
+ int hlim = np->mcast_hops;
+ put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+ }
+ if (np->rxopt.bits.rxflow) {
+ __be32 flowinfo = np->rcv_flowinfo;
+
+ put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
+ }
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
+ case IPV6_MTU:
+ {
+ struct dst_entry *dst;
+
+ val = 0;
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
+ val = dst_mtu(dst);
+ rcu_read_unlock();
+ if (!val)
+ return -ENOTCONN;
+ break;
+ }
+
+ case IPV6_V6ONLY:
+ val = sk->sk_ipv6only;
+ break;
+
+ case IPV6_RECVPKTINFO:
+ val = np->rxopt.bits.rxinfo;
+ break;
+
+ case IPV6_2292PKTINFO:
+ val = np->rxopt.bits.rxoinfo;
+ break;
+
+ case IPV6_RECVHOPLIMIT:
+ val = np->rxopt.bits.rxhlim;
+ break;
+
+ case IPV6_2292HOPLIMIT:
+ val = np->rxopt.bits.rxohlim;
+ break;
+
+ case IPV6_RECVRTHDR:
+ val = np->rxopt.bits.srcrt;
+ break;
+
+ case IPV6_2292RTHDR:
+ val = np->rxopt.bits.osrcrt;
+ break;
+
+ case IPV6_HOPOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ case IPV6_RTHDR:
+ case IPV6_DSTOPTS:
+ {
+
+ lock_sock(sk);
+ len = ipv6_getsockopt_sticky(sk, np->opt,
+ optname, optval, len);
+ release_sock(sk);
+ /* check if ipv6_getsockopt_sticky() returns err code */
+ if (len < 0)
+ return len;
+ return put_user(len, optlen);
+ }
+
+ case IPV6_RECVHOPOPTS:
+ val = np->rxopt.bits.hopopts;
+ break;
+
+ case IPV6_2292HOPOPTS:
+ val = np->rxopt.bits.ohopopts;
+ break;
+
+ case IPV6_RECVDSTOPTS:
+ val = np->rxopt.bits.dstopts;
+ break;
+
+ case IPV6_2292DSTOPTS:
+ val = np->rxopt.bits.odstopts;
+ break;
+
+ case IPV6_TCLASS:
+ val = np->tclass;
+ break;
+
+ case IPV6_RECVTCLASS:
+ val = np->rxopt.bits.rxtclass;
+ break;
+
+ case IPV6_FLOWINFO:
+ val = np->rxopt.bits.rxflow;
+ break;
+
+ case IPV6_RECVPATHMTU:
+ val = np->rxopt.bits.rxpmtu;
+ break;
+
+ case IPV6_PATHMTU:
+ {
+ struct dst_entry *dst;
+ struct ip6_mtuinfo mtuinfo;
+
+ if (len < sizeof(mtuinfo))
+ return -EINVAL;
+
+ len = sizeof(mtuinfo);
+ memset(&mtuinfo, 0, sizeof(mtuinfo));
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
+ mtuinfo.ip6m_mtu = dst_mtu(dst);
+ rcu_read_unlock();
+ if (!mtuinfo.ip6m_mtu)
+ return -ENOTCONN;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &mtuinfo, len))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case IPV6_TRANSPARENT:
+ val = inet_sk(sk)->transparent;
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ val = np->rxopt.bits.rxorigdstaddr;
+ break;
+
+ case IPV6_UNICAST_HOPS:
+ case IPV6_MULTICAST_HOPS:
+ {
+ struct dst_entry *dst;
+
+ if (optname == IPV6_UNICAST_HOPS)
+ val = np->hop_limit;
+ else
+ val = np->mcast_hops;
+
+ if (val < 0) {
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst)
+ val = ip6_dst_hoplimit(dst);
+ rcu_read_unlock();
+ }
+
+ if (val < 0)
+ val = sock_net(sk)->ipv6.devconf_all->hop_limit;
+ break;
+ }
+
+ case IPV6_MULTICAST_LOOP:
+ val = np->mc_loop;
+ break;
+
+ case IPV6_MULTICAST_IF:
+ val = np->mcast_oif;
+ break;
+
+ case IPV6_UNICAST_IF:
+ val = (__force int)htonl((__u32) np->ucast_oif);
+ break;
+
+ case IPV6_MTU_DISCOVER:
+ val = np->pmtudisc;
+ break;
+
+ case IPV6_RECVERR:
+ val = np->recverr;
+ break;
+
+ case IPV6_FLOWINFO_SEND:
+ val = np->sndflow;
+ break;
+
+ case IPV6_FLOWLABEL_MGR:
+ {
+ struct in6_flowlabel_req freq;
+ int flags;
+
+ if (len < sizeof(freq))
+ return -EINVAL;
+
+ if (copy_from_user(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ if (freq.flr_action != IPV6_FL_A_GET)
+ return -EINVAL;
+
+ len = sizeof(freq);
+ flags = freq.flr_flags;
+
+ memset(&freq, 0, sizeof(freq));
+
+ val = ipv6_flowlabel_opt_get(sk, &freq, flags);
+ if (val < 0)
+ return val;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &freq, len))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ case IPV6_ADDR_PREFERENCES:
+ val = 0;
+
+ if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+ val |= IPV6_PREFER_SRC_TMP;
+ else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+ val |= IPV6_PREFER_SRC_PUBLIC;
+ else {
+ /* XXX: should we return system default? */
+ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
+ }
+
+ if (np->srcprefs & IPV6_PREFER_SRC_COA)
+ val |= IPV6_PREFER_SRC_COA;
+ else
+ val |= IPV6_PREFER_SRC_HOME;
+ break;
+
+ case IPV6_MINHOPCOUNT:
+ val = np->min_hopcount;
+ break;
+
+ case IPV6_DONTFRAG:
+ val = np->dontfrag;
+ break;
+
+ case IPV6_AUTOFLOWLABEL:
+ val = np->autoflowlabel;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+ len = min_t(unsigned int, sizeof(int), len);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+int ipv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int err;
+
+ if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+ return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+
+ if (level != SOL_IPV6)
+ return -ENOPROTOOPT;
+
+ err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = nf_getsockopt(sk, PF_INET6, optname, optval,
+ &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(ipv6_getsockopt);
+
+#ifdef CONFIG_COMPAT
+int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int err;
+
+ if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+ if (udp_prot.compat_getsockopt != NULL)
+ return udp_prot.compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+ }
+
+ if (level != SOL_IPV6)
+ return -ENOPROTOOPT;
+
+ if (optname == MCAST_MSFILTER)
+ return compat_mc_getsockopt(sk, level, optname, optval, optlen,
+ ipv6_getsockopt);
+
+ err = do_ipv6_getsockopt(sk, level, optname, optval, optlen,
+ MSG_CMSG_COMPAT);
+#ifdef CONFIG_NETFILTER
+ /* we need to exclude all possible ENOPROTOOPTs except default case */
+ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
+ int len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+ err = compat_nf_getsockopt(sk, PF_INET6,
+ optname, optval, &len);
+ release_sock(sk);
+ if (err >= 0)
+ err = put_user(len, optlen);
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(compat_ipv6_getsockopt);
+#endif
+
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
new file mode 100644
index 000000000..083b2927f
--- /dev/null
+++ b/net/ipv6/mcast.c
@@ -0,0 +1,2951 @@
+/*
+ * Multicast support for IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* Changes:
+ *
+ * yoshfuji : fix format of router-alert option
+ * YOSHIFUJI Hideaki @USAGI:
+ * Fixed source address for MLD message based on
+ * <draft-ietf-magma-mld-source-05.txt>.
+ * YOSHIFUJI Hideaki @USAGI:
+ * - Ignore Queries for invalid addresses.
+ * - MLD for link-local addresses.
+ * David L Stevens <dlstevens@us.ibm.com>:
+ * - MLDv2 support
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#include <net/mld.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/if_inet6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/inet_common.h>
+
+#include <net/ip6_checksum.h>
+
+/* Ensure that we have struct in6_addr aligned on 32bit word. */
+static void *__mld2_query_bugs[] __attribute__((__unused__)) = {
+ BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4),
+ BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4),
+ BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4)
+};
+
+static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
+
+static void igmp6_join_group(struct ifmcaddr6 *ma);
+static void igmp6_leave_group(struct ifmcaddr6 *ma);
+static void igmp6_timer_handler(unsigned long data);
+
+static void mld_gq_timer_expire(unsigned long data);
+static void mld_ifc_timer_expire(unsigned long data);
+static void mld_ifc_event(struct inet6_dev *idev);
+static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
+static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr);
+static void mld_clear_delrec(struct inet6_dev *idev);
+static bool mld_in_v1_mode(const struct inet6_dev *idev);
+static int sf_setstate(struct ifmcaddr6 *pmc);
+static void sf_markstate(struct ifmcaddr6 *pmc);
+static void ip6_mc_clear_src(struct ifmcaddr6 *pmc);
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
+ int delta);
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
+ int delta);
+static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
+ struct inet6_dev *idev);
+
+#define MLD_QRV_DEFAULT 2
+/* RFC3810, 9.2. Query Interval */
+#define MLD_QI_DEFAULT (125 * HZ)
+/* RFC3810, 9.3. Query Response Interval */
+#define MLD_QRI_DEFAULT (10 * HZ)
+
+/* RFC3810, 8.1 Query Version Distinctions */
+#define MLD_V1_QUERY_LEN 24
+#define MLD_V2_QUERY_LEN_MIN 28
+
+#define IPV6_MLD_MAX_MSF 64
+
+int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
+int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT;
+
+/*
+ * socket join on multicast group
+ */
+
+#define for_each_pmc_rcu(np, pmc) \
+ for (pmc = rcu_dereference(np->ipv6_mc_list); \
+ pmc != NULL; \
+ pmc = rcu_dereference(pmc->next))
+
+static int unsolicited_report_interval(struct inet6_dev *idev)
+{
+ int iv;
+
+ if (mld_in_v1_mode(idev))
+ iv = idev->cnf.mldv1_unsolicited_report_interval;
+ else
+ iv = idev->cnf.mldv2_unsolicited_report_interval;
+
+ return iv > 0 ? iv : 1;
+}
+
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ struct net_device *dev = NULL;
+ struct ipv6_mc_socklist *mc_lst;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!ipv6_addr_is_multicast(addr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(np, mc_lst) {
+ if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+ ipv6_addr_equal(&mc_lst->addr, addr)) {
+ rcu_read_unlock();
+ return -EADDRINUSE;
+ }
+ }
+ rcu_read_unlock();
+
+ mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
+
+ if (!mc_lst)
+ return -ENOMEM;
+
+ mc_lst->next = NULL;
+ mc_lst->addr = *addr;
+
+ if (ifindex == 0) {
+ struct rt6_info *rt;
+ rt = rt6_lookup(net, addr, NULL, 0, 0);
+ if (rt) {
+ dev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
+ } else
+ dev = __dev_get_by_index(net, ifindex);
+
+ if (!dev) {
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+ return -ENODEV;
+ }
+
+ mc_lst->ifindex = dev->ifindex;
+ mc_lst->sfmode = MCAST_EXCLUDE;
+ rwlock_init(&mc_lst->sflock);
+ mc_lst->sflist = NULL;
+
+ /*
+ * now add/increase the group membership on the device
+ */
+
+ err = ipv6_dev_mc_inc(dev, addr);
+
+ if (err) {
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+ return err;
+ }
+
+ mc_lst->next = np->ipv6_mc_list;
+ rcu_assign_pointer(np->ipv6_mc_list, mc_lst);
+
+ return 0;
+}
+EXPORT_SYMBOL(ipv6_sock_mc_join);
+
+/*
+ * socket leave on multicast group
+ */
+int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_mc_socklist *mc_lst;
+ struct ipv6_mc_socklist __rcu **lnk;
+ struct net *net = sock_net(sk);
+
+ ASSERT_RTNL();
+
+ if (!ipv6_addr_is_multicast(addr))
+ return -EINVAL;
+
+ for (lnk = &np->ipv6_mc_list;
+ (mc_lst = rtnl_dereference(*lnk)) != NULL;
+ lnk = &mc_lst->next) {
+ if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+ ipv6_addr_equal(&mc_lst->addr, addr)) {
+ struct net_device *dev;
+
+ *lnk = mc_lst->next;
+
+ dev = __dev_get_by_index(net, mc_lst->ifindex);
+ if (dev) {
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ (void) ip6_mc_leave_src(sk, mc_lst, idev);
+ if (idev)
+ __ipv6_dev_mc_dec(idev, &mc_lst->addr);
+ } else
+ (void) ip6_mc_leave_src(sk, mc_lst, NULL);
+
+ atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
+ kfree_rcu(mc_lst, rcu);
+ return 0;
+ }
+ }
+
+ return -EADDRNOTAVAIL;
+}
+EXPORT_SYMBOL(ipv6_sock_mc_drop);
+
+/* called with rcu_read_lock() */
+static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
+{
+ struct net_device *dev = NULL;
+ struct inet6_dev *idev = NULL;
+
+ if (ifindex == 0) {
+ struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0);
+
+ if (rt) {
+ dev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
+ } else
+ dev = dev_get_by_index_rcu(net, ifindex);
+
+ if (!dev)
+ return NULL;
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return NULL;
+ read_lock_bh(&idev->lock);
+ if (idev->dead) {
+ read_unlock_bh(&idev->lock);
+ return NULL;
+ }
+ return idev;
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_mc_socklist *mc_lst;
+ struct net *net = sock_net(sk);
+
+ if (!rcu_access_pointer(np->ipv6_mc_list))
+ return;
+
+ rtnl_lock();
+ while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
+ struct net_device *dev;
+
+ np->ipv6_mc_list = mc_lst->next;
+
+ dev = __dev_get_by_index(net, mc_lst->ifindex);
+ if (dev) {
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ (void) ip6_mc_leave_src(sk, mc_lst, idev);
+ if (idev)
+ __ipv6_dev_mc_dec(idev, &mc_lst->addr);
+ } else
+ (void) ip6_mc_leave_src(sk, mc_lst, NULL);
+
+ atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
+ kfree_rcu(mc_lst, rcu);
+
+ }
+ rtnl_unlock();
+}
+
+int ip6_mc_source(int add, int omode, struct sock *sk,
+ struct group_source_req *pgsr)
+{
+ struct in6_addr *source, *group;
+ struct ipv6_mc_socklist *pmc;
+ struct inet6_dev *idev;
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ struct ip6_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+ int i, j, rv;
+ int leavegroup = 0;
+ int pmclocked = 0;
+ int err;
+
+ source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
+ group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr;
+
+ if (!ipv6_addr_is_multicast(group))
+ return -EINVAL;
+
+ rcu_read_lock();
+ idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface);
+ if (!idev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ err = -EADDRNOTAVAIL;
+
+ for_each_pmc_rcu(inet6, pmc) {
+ if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
+ continue;
+ if (ipv6_addr_equal(&pmc->addr, group))
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ /* if a source filter was set, must be the same mode as before */
+ if (pmc->sflist) {
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
+ goto done;
+ }
+ } else if (pmc->sfmode != omode) {
+ /* allow mode switches for empty-set filters */
+ ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
+ ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+ pmc->sfmode = omode;
+ }
+
+ write_lock(&pmc->sflock);
+ pmclocked = 1;
+
+ psl = pmc->sflist;
+ if (!add) {
+ if (!psl)
+ goto done; /* err = -EADDRNOTAVAIL */
+ rv = !0;
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
+ if (rv == 0)
+ break;
+ }
+ if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ /* update the interface filter */
+ ip6_mc_del_src(idev, group, omode, 1, source, 1);
+
+ for (j = i+1; j < psl->sl_count; j++)
+ psl->sl_addr[j-1] = psl->sl_addr[j];
+ psl->sl_count--;
+ err = 0;
+ goto done;
+ }
+ /* else, add a new source to the filter */
+
+ if (psl && psl->sl_count >= sysctl_mld_max_msf) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ if (!psl || psl->sl_count == psl->sl_max) {
+ struct ip6_sf_socklist *newpsl;
+ int count = IP6_SFBLOCK;
+
+ if (psl)
+ count += psl->sl_max;
+ newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = count;
+ newpsl->sl_count = count - IP6_SFBLOCK;
+ if (psl) {
+ for (i = 0; i < psl->sl_count; i++)
+ newpsl->sl_addr[i] = psl->sl_addr[i];
+ sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+ }
+ pmc->sflist = psl = newpsl;
+ }
+ rv = 1; /* > 0 for insert logic below if sl_count is 0 */
+ for (i = 0; i < psl->sl_count; i++) {
+ rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
+ if (rv == 0) /* There is an error in the address. */
+ goto done;
+ }
+ for (j = psl->sl_count-1; j >= i; j--)
+ psl->sl_addr[j+1] = psl->sl_addr[j];
+ psl->sl_addr[i] = *source;
+ psl->sl_count++;
+ err = 0;
+ /* update the interface list */
+ ip6_mc_add_src(idev, group, omode, 1, source, 1);
+done:
+ if (pmclocked)
+ write_unlock(&pmc->sflock);
+ read_unlock_bh(&idev->lock);
+ rcu_read_unlock();
+ if (leavegroup)
+ err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
+ return err;
+}
+
+int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
+{
+ const struct in6_addr *group;
+ struct ipv6_mc_socklist *pmc;
+ struct inet6_dev *idev;
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ struct ip6_sf_socklist *newpsl, *psl;
+ struct net *net = sock_net(sk);
+ int leavegroup = 0;
+ int i, err;
+
+ group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
+
+ if (!ipv6_addr_is_multicast(group))
+ return -EINVAL;
+ if (gsf->gf_fmode != MCAST_INCLUDE &&
+ gsf->gf_fmode != MCAST_EXCLUDE)
+ return -EINVAL;
+
+ rcu_read_lock();
+ idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
+
+ if (!idev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ err = 0;
+
+ if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
+
+ for_each_pmc_rcu(inet6, pmc) {
+ if (pmc->ifindex != gsf->gf_interface)
+ continue;
+ if (ipv6_addr_equal(&pmc->addr, group))
+ break;
+ }
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
+ goto done;
+ }
+ if (gsf->gf_numsrc) {
+ newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
+ GFP_ATOMIC);
+ if (!newpsl) {
+ err = -ENOBUFS;
+ goto done;
+ }
+ newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
+ for (i = 0; i < newpsl->sl_count; ++i) {
+ struct sockaddr_in6 *psin6;
+
+ psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i];
+ newpsl->sl_addr[i] = psin6->sin6_addr;
+ }
+ err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
+ newpsl->sl_count, newpsl->sl_addr, 0);
+ if (err) {
+ sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+ goto done;
+ }
+ } else {
+ newpsl = NULL;
+ (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ }
+
+ write_lock(&pmc->sflock);
+ psl = pmc->sflist;
+ if (psl) {
+ (void) ip6_mc_del_src(idev, group, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+ } else
+ (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+ pmc->sflist = newpsl;
+ pmc->sfmode = gsf->gf_fmode;
+ write_unlock(&pmc->sflock);
+ err = 0;
+done:
+ read_unlock_bh(&idev->lock);
+ rcu_read_unlock();
+ if (leavegroup)
+ err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
+ return err;
+}
+
+int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
+ struct group_filter __user *optval, int __user *optlen)
+{
+ int err, i, count, copycount;
+ const struct in6_addr *group;
+ struct ipv6_mc_socklist *pmc;
+ struct inet6_dev *idev;
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ struct ip6_sf_socklist *psl;
+ struct net *net = sock_net(sk);
+
+ group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
+
+ if (!ipv6_addr_is_multicast(group))
+ return -EINVAL;
+
+ rcu_read_lock();
+ idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
+
+ if (!idev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ err = -EADDRNOTAVAIL;
+ /* changes to the ipv6_mc_list require the socket lock and
+ * rtnl lock. We have the socket lock and rcu read lock,
+ * so reading the list is safe.
+ */
+
+ for_each_pmc_rcu(inet6, pmc) {
+ if (pmc->ifindex != gsf->gf_interface)
+ continue;
+ if (ipv6_addr_equal(group, &pmc->addr))
+ break;
+ }
+ if (!pmc) /* must have a prior join */
+ goto done;
+ gsf->gf_fmode = pmc->sfmode;
+ psl = pmc->sflist;
+ count = psl ? psl->sl_count : 0;
+ read_unlock_bh(&idev->lock);
+ rcu_read_unlock();
+
+ copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ gsf->gf_numsrc = count;
+ if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+ copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+ return -EFAULT;
+ }
+ /* changes to psl require the socket lock, and a write lock
+ * on pmc->sflock. We have the socket lock so reading here is safe.
+ */
+ for (i = 0; i < copycount; i++) {
+ struct sockaddr_in6 *psin6;
+ struct sockaddr_storage ss;
+
+ psin6 = (struct sockaddr_in6 *)&ss;
+ memset(&ss, 0, sizeof(ss));
+ psin6->sin6_family = AF_INET6;
+ psin6->sin6_addr = psl->sl_addr[i];
+ if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ return -EFAULT;
+ }
+ return 0;
+done:
+ read_unlock_bh(&idev->lock);
+ rcu_read_unlock();
+ return err;
+}
+
+bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+ const struct in6_addr *src_addr)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_mc_socklist *mc;
+ struct ip6_sf_socklist *psl;
+ bool rv = true;
+
+ rcu_read_lock();
+ for_each_pmc_rcu(np, mc) {
+ if (ipv6_addr_equal(&mc->addr, mc_addr))
+ break;
+ }
+ if (!mc) {
+ rcu_read_unlock();
+ return true;
+ }
+ read_lock(&mc->sflock);
+ psl = mc->sflist;
+ if (!psl) {
+ rv = mc->sfmode == MCAST_EXCLUDE;
+ } else {
+ int i;
+
+ for (i = 0; i < psl->sl_count; i++) {
+ if (ipv6_addr_equal(&psl->sl_addr[i], src_addr))
+ break;
+ }
+ if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+ rv = false;
+ if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+ rv = false;
+ }
+ read_unlock(&mc->sflock);
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static void igmp6_group_added(struct ifmcaddr6 *mc)
+{
+ struct net_device *dev = mc->idev->dev;
+ char buf[MAX_ADDR_LEN];
+
+ if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ return;
+
+ spin_lock_bh(&mc->mca_lock);
+ if (!(mc->mca_flags&MAF_LOADED)) {
+ mc->mca_flags |= MAF_LOADED;
+ if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
+ dev_mc_add(dev, buf);
+ }
+ spin_unlock_bh(&mc->mca_lock);
+
+ if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
+ return;
+
+ if (mld_in_v1_mode(mc->idev)) {
+ igmp6_join_group(mc);
+ return;
+ }
+ /* else v2 */
+
+ mc->mca_crcount = mc->idev->mc_qrv;
+ mld_ifc_event(mc->idev);
+}
+
+static void igmp6_group_dropped(struct ifmcaddr6 *mc)
+{
+ struct net_device *dev = mc->idev->dev;
+ char buf[MAX_ADDR_LEN];
+
+ if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ return;
+
+ spin_lock_bh(&mc->mca_lock);
+ if (mc->mca_flags&MAF_LOADED) {
+ mc->mca_flags &= ~MAF_LOADED;
+ if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
+ dev_mc_del(dev, buf);
+ }
+
+ if (mc->mca_flags & MAF_NOREPORT)
+ goto done;
+ spin_unlock_bh(&mc->mca_lock);
+
+ if (!mc->idev->dead)
+ igmp6_leave_group(mc);
+
+ spin_lock_bh(&mc->mca_lock);
+ if (del_timer(&mc->mca_timer))
+ atomic_dec(&mc->mca_refcnt);
+done:
+ ip6_mc_clear_src(mc);
+ spin_unlock_bh(&mc->mca_lock);
+}
+
+/*
+ * deleted ifmcaddr6 manipulation
+ */
+static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
+{
+ struct ifmcaddr6 *pmc;
+
+ /* this is an "ifmcaddr6" for convenience; only the fields below
+ * are actually used. In particular, the refcnt and users are not
+ * used for management of the delete list. Using the same structure
+ * for deleted items allows change reports to use common code with
+ * non-deleted or query-response MCA's.
+ */
+ pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC);
+ if (!pmc)
+ return;
+
+ spin_lock_bh(&im->mca_lock);
+ spin_lock_init(&pmc->mca_lock);
+ pmc->idev = im->idev;
+ in6_dev_hold(idev);
+ pmc->mca_addr = im->mca_addr;
+ pmc->mca_crcount = idev->mc_qrv;
+ pmc->mca_sfmode = im->mca_sfmode;
+ if (pmc->mca_sfmode == MCAST_INCLUDE) {
+ struct ip6_sf_list *psf;
+
+ pmc->mca_tomb = im->mca_tomb;
+ pmc->mca_sources = im->mca_sources;
+ im->mca_tomb = im->mca_sources = NULL;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = pmc->mca_crcount;
+ }
+ spin_unlock_bh(&im->mca_lock);
+
+ spin_lock_bh(&idev->mc_lock);
+ pmc->next = idev->mc_tomb;
+ idev->mc_tomb = pmc;
+ spin_unlock_bh(&idev->mc_lock);
+}
+
+static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
+{
+ struct ifmcaddr6 *pmc, *pmc_prev;
+ struct ip6_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&idev->mc_lock);
+ pmc_prev = NULL;
+ for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) {
+ if (ipv6_addr_equal(&pmc->mca_addr, pmca))
+ break;
+ pmc_prev = pmc;
+ }
+ if (pmc) {
+ if (pmc_prev)
+ pmc_prev->next = pmc->next;
+ else
+ idev->mc_tomb = pmc->next;
+ }
+ spin_unlock_bh(&idev->mc_lock);
+
+ if (pmc) {
+ for (psf = pmc->mca_tomb; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ in6_dev_put(pmc->idev);
+ kfree(pmc);
+ }
+}
+
+static void mld_clear_delrec(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *pmc, *nextpmc;
+
+ spin_lock_bh(&idev->mc_lock);
+ pmc = idev->mc_tomb;
+ idev->mc_tomb = NULL;
+ spin_unlock_bh(&idev->mc_lock);
+
+ for (; pmc; pmc = nextpmc) {
+ nextpmc = pmc->next;
+ ip6_mc_clear_src(pmc);
+ in6_dev_put(pmc->idev);
+ kfree(pmc);
+ }
+
+ /* clear dead sources, too */
+ read_lock_bh(&idev->lock);
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ struct ip6_sf_list *psf, *psf_next;
+
+ spin_lock_bh(&pmc->mca_lock);
+ psf = pmc->mca_tomb;
+ pmc->mca_tomb = NULL;
+ spin_unlock_bh(&pmc->mca_lock);
+ for (; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ kfree(psf);
+ }
+ }
+ read_unlock_bh(&idev->lock);
+}
+
+static void mca_get(struct ifmcaddr6 *mc)
+{
+ atomic_inc(&mc->mca_refcnt);
+}
+
+static void ma_put(struct ifmcaddr6 *mc)
+{
+ if (atomic_dec_and_test(&mc->mca_refcnt)) {
+ in6_dev_put(mc->idev);
+ kfree(mc);
+ }
+}
+
+static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
+ const struct in6_addr *addr)
+{
+ struct ifmcaddr6 *mc;
+
+ mc = kzalloc(sizeof(*mc), GFP_ATOMIC);
+ if (!mc)
+ return NULL;
+
+ setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc);
+
+ mc->mca_addr = *addr;
+ mc->idev = idev; /* reference taken by caller */
+ mc->mca_users = 1;
+ /* mca_stamp should be updated upon changes */
+ mc->mca_cstamp = mc->mca_tstamp = jiffies;
+ atomic_set(&mc->mca_refcnt, 1);
+ spin_lock_init(&mc->mca_lock);
+
+ /* initial mode is (EX, empty) */
+ mc->mca_sfmode = MCAST_EXCLUDE;
+ mc->mca_sfcount[MCAST_EXCLUDE] = 1;
+
+ if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
+ IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+ mc->mca_flags |= MAF_NOREPORT;
+
+ return mc;
+}
+
+/*
+ * device multicast group inc (add if not found)
+ */
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+ struct ifmcaddr6 *mc;
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ /* we need to take a reference on idev */
+ idev = in6_dev_get(dev);
+
+ if (!idev)
+ return -EINVAL;
+
+ write_lock_bh(&idev->lock);
+ if (idev->dead) {
+ write_unlock_bh(&idev->lock);
+ in6_dev_put(idev);
+ return -ENODEV;
+ }
+
+ for (mc = idev->mc_list; mc; mc = mc->next) {
+ if (ipv6_addr_equal(&mc->mca_addr, addr)) {
+ mc->mca_users++;
+ write_unlock_bh(&idev->lock);
+ ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
+ NULL, 0);
+ in6_dev_put(idev);
+ return 0;
+ }
+ }
+
+ mc = mca_alloc(idev, addr);
+ if (!mc) {
+ write_unlock_bh(&idev->lock);
+ in6_dev_put(idev);
+ return -ENOMEM;
+ }
+
+ mc->next = idev->mc_list;
+ idev->mc_list = mc;
+
+ /* Hold this for the code below before we unlock,
+ * it is already exposed via idev->mc_list.
+ */
+ mca_get(mc);
+ write_unlock_bh(&idev->lock);
+
+ mld_del_delrec(idev, &mc->mca_addr);
+ igmp6_group_added(mc);
+ ma_put(mc);
+ return 0;
+}
+
+/*
+ * device multicast group del
+ */
+int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+ struct ifmcaddr6 *ma, **map;
+
+ ASSERT_RTNL();
+
+ write_lock_bh(&idev->lock);
+ for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) {
+ if (ipv6_addr_equal(&ma->mca_addr, addr)) {
+ if (--ma->mca_users == 0) {
+ *map = ma->next;
+ write_unlock_bh(&idev->lock);
+
+ igmp6_group_dropped(ma);
+
+ ma_put(ma);
+ return 0;
+ }
+ write_unlock_bh(&idev->lock);
+ return 0;
+ }
+ }
+ write_unlock_bh(&idev->lock);
+
+ return -ENOENT;
+}
+
+int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
+{
+ struct inet6_dev *idev;
+ int err;
+
+ ASSERT_RTNL();
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ err = -ENODEV;
+ else
+ err = __ipv6_dev_mc_dec(idev, addr);
+
+ return err;
+}
+
+/*
+ * check if the interface/address pair is valid
+ */
+bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
+ const struct in6_addr *src_addr)
+{
+ struct inet6_dev *idev;
+ struct ifmcaddr6 *mc;
+ bool rv = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ for (mc = idev->mc_list; mc; mc = mc->next) {
+ if (ipv6_addr_equal(&mc->mca_addr, group))
+ break;
+ }
+ if (mc) {
+ if (src_addr && !ipv6_addr_any(src_addr)) {
+ struct ip6_sf_list *psf;
+
+ spin_lock_bh(&mc->mca_lock);
+ for (psf = mc->mca_sources; psf; psf = psf->sf_next) {
+ if (ipv6_addr_equal(&psf->sf_addr, src_addr))
+ break;
+ }
+ if (psf)
+ rv = psf->sf_count[MCAST_INCLUDE] ||
+ psf->sf_count[MCAST_EXCLUDE] !=
+ mc->mca_sfcount[MCAST_EXCLUDE];
+ else
+ rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0;
+ spin_unlock_bh(&mc->mca_lock);
+ } else
+ rv = true; /* don't filter unspecified source */
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return rv;
+}
+
+static void mld_gq_start_timer(struct inet6_dev *idev)
+{
+ unsigned long tv = prandom_u32() % idev->mc_maxdelay;
+
+ idev->mc_gq_running = 1;
+ if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2))
+ in6_dev_hold(idev);
+}
+
+static void mld_gq_stop_timer(struct inet6_dev *idev)
+{
+ idev->mc_gq_running = 0;
+ if (del_timer(&idev->mc_gq_timer))
+ __in6_dev_put(idev);
+}
+
+static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
+{
+ unsigned long tv = prandom_u32() % delay;
+
+ if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2))
+ in6_dev_hold(idev);
+}
+
+static void mld_ifc_stop_timer(struct inet6_dev *idev)
+{
+ idev->mc_ifc_count = 0;
+ if (del_timer(&idev->mc_ifc_timer))
+ __in6_dev_put(idev);
+}
+
+static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
+{
+ unsigned long tv = prandom_u32() % delay;
+
+ if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2))
+ in6_dev_hold(idev);
+}
+
+static void mld_dad_stop_timer(struct inet6_dev *idev)
+{
+ if (del_timer(&idev->mc_dad_timer))
+ __in6_dev_put(idev);
+}
+
+/*
+ * IGMP handling (alias multicast ICMPv6 messages)
+ */
+
+static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
+{
+ unsigned long delay = resptime;
+
+ /* Do not start timer for these addresses */
+ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
+ IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+ return;
+
+ if (del_timer(&ma->mca_timer)) {
+ atomic_dec(&ma->mca_refcnt);
+ delay = ma->mca_timer.expires - jiffies;
+ }
+
+ if (delay >= resptime)
+ delay = prandom_u32() % resptime;
+
+ ma->mca_timer.expires = jiffies + delay;
+ if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ atomic_inc(&ma->mca_refcnt);
+ ma->mca_flags |= MAF_TIMER_RUNNING;
+}
+
+/* mark EXCLUDE-mode sources */
+static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
+ const struct in6_addr *srcs)
+{
+ struct ip6_sf_list *psf;
+ int i, scount;
+
+ scount = 0;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++) {
+ /* skip inactive filters */
+ if (psf->sf_count[MCAST_INCLUDE] ||
+ pmc->mca_sfcount[MCAST_EXCLUDE] !=
+ psf->sf_count[MCAST_EXCLUDE])
+ break;
+ if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
+ scount++;
+ break;
+ }
+ }
+ }
+ pmc->mca_flags &= ~MAF_GSQUERY;
+ if (scount == nsrcs) /* all sources excluded */
+ return false;
+ return true;
+}
+
+static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
+ const struct in6_addr *srcs)
+{
+ struct ip6_sf_list *psf;
+ int i, scount;
+
+ if (pmc->mca_sfmode == MCAST_EXCLUDE)
+ return mld_xmarksources(pmc, nsrcs, srcs);
+
+ /* mark INCLUDE-mode sources */
+
+ scount = 0;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (scount == nsrcs)
+ break;
+ for (i = 0; i < nsrcs; i++) {
+ if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
+ psf->sf_gsresp = 1;
+ scount++;
+ break;
+ }
+ }
+ }
+ if (!scount) {
+ pmc->mca_flags &= ~MAF_GSQUERY;
+ return false;
+ }
+ pmc->mca_flags |= MAF_GSQUERY;
+ return true;
+}
+
+static int mld_force_mld_version(const struct inet6_dev *idev)
+{
+ /* Normally, both are 0 here. If enforcement to a particular is
+ * being used, individual device enforcement will have a lower
+ * precedence over 'all' device (.../conf/all/force_mld_version).
+ */
+
+ if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0)
+ return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version;
+ else
+ return idev->cnf.force_mld_version;
+}
+
+static bool mld_in_v2_mode_only(const struct inet6_dev *idev)
+{
+ return mld_force_mld_version(idev) == 2;
+}
+
+static bool mld_in_v1_mode_only(const struct inet6_dev *idev)
+{
+ return mld_force_mld_version(idev) == 1;
+}
+
+static bool mld_in_v1_mode(const struct inet6_dev *idev)
+{
+ if (mld_in_v2_mode_only(idev))
+ return false;
+ if (mld_in_v1_mode_only(idev))
+ return true;
+ if (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen))
+ return true;
+
+ return false;
+}
+
+static void mld_set_v1_mode(struct inet6_dev *idev)
+{
+ /* RFC3810, relevant sections:
+ * - 9.1. Robustness Variable
+ * - 9.2. Query Interval
+ * - 9.3. Query Response Interval
+ * - 9.12. Older Version Querier Present Timeout
+ */
+ unsigned long switchback;
+
+ switchback = (idev->mc_qrv * idev->mc_qi) + idev->mc_qri;
+
+ idev->mc_v1_seen = jiffies + switchback;
+}
+
+static void mld_update_qrv(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.8. QRV (Querier's Robustness Variable)
+ * - 9.1. Robustness Variable
+ */
+
+ /* The value of the Robustness Variable MUST NOT be zero,
+ * and SHOULD NOT be one. Catch this here if we ever run
+ * into such a case in future.
+ */
+ const int min_qrv = min(MLD_QRV_DEFAULT, sysctl_mld_qrv);
+ WARN_ON(idev->mc_qrv == 0);
+
+ if (mlh2->mld2q_qrv > 0)
+ idev->mc_qrv = mlh2->mld2q_qrv;
+
+ if (unlikely(idev->mc_qrv < min_qrv)) {
+ net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n",
+ idev->mc_qrv, min_qrv);
+ idev->mc_qrv = min_qrv;
+ }
+}
+
+static void mld_update_qi(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.9. QQIC (Querier's Query Interval Code)
+ * - 9.2. Query Interval
+ * - 9.12. Older Version Querier Present Timeout
+ * (the [Query Interval] in the last Query received)
+ */
+ unsigned long mc_qqi;
+
+ if (mlh2->mld2q_qqic < 128) {
+ mc_qqi = mlh2->mld2q_qqic;
+ } else {
+ unsigned long mc_man, mc_exp;
+
+ mc_exp = MLDV2_QQIC_EXP(mlh2->mld2q_qqic);
+ mc_man = MLDV2_QQIC_MAN(mlh2->mld2q_qqic);
+
+ mc_qqi = (mc_man | 0x10) << (mc_exp + 3);
+ }
+
+ idev->mc_qi = mc_qqi * HZ;
+}
+
+static void mld_update_qri(struct inet6_dev *idev,
+ const struct mld2_query *mlh2)
+{
+ /* RFC3810, relevant sections:
+ * - 5.1.3. Maximum Response Code
+ * - 9.3. Query Response Interval
+ */
+ idev->mc_qri = msecs_to_jiffies(mldv2_mrc(mlh2));
+}
+
+static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
+ unsigned long *max_delay, bool v1_query)
+{
+ unsigned long mldv1_md;
+
+ /* Ignore v1 queries */
+ if (mld_in_v2_mode_only(idev))
+ return -EINVAL;
+
+ mldv1_md = ntohs(mld->mld_maxdelay);
+
+ /* When in MLDv1 fallback and a MLDv2 router start-up being
+ * unaware of current MLDv1 operation, the MRC == MRD mapping
+ * only works when the exponential algorithm is not being
+ * used (as MLDv1 is unaware of such things).
+ *
+ * According to the RFC author, the MLDv2 implementations
+ * he's aware of all use a MRC < 32768 on start up queries.
+ *
+ * Thus, should we *ever* encounter something else larger
+ * than that, just assume the maximum possible within our
+ * reach.
+ */
+ if (!v1_query)
+ mldv1_md = min(mldv1_md, MLDV1_MRD_MAX_COMPAT);
+
+ *max_delay = max(msecs_to_jiffies(mldv1_md), 1UL);
+
+ /* MLDv1 router present: we need to go into v1 mode *only*
+ * when an MLDv1 query is received as per section 9.12. of
+ * RFC3810! And we know from RFC2710 section 3.7 that MLDv1
+ * queries MUST be of exactly 24 octets.
+ */
+ if (v1_query)
+ mld_set_v1_mode(idev);
+
+ /* cancel MLDv2 report timer */
+ mld_gq_stop_timer(idev);
+ /* cancel the interface change timer */
+ mld_ifc_stop_timer(idev);
+ /* clear deleted report items */
+ mld_clear_delrec(idev);
+
+ return 0;
+}
+
+static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+ unsigned long *max_delay)
+{
+ *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL);
+
+ mld_update_qrv(idev, mld);
+ mld_update_qi(idev, mld);
+ mld_update_qri(idev, mld);
+
+ idev->mc_maxdelay = *max_delay;
+
+ return 0;
+}
+
+/* called with rcu_read_lock() */
+int igmp6_event_query(struct sk_buff *skb)
+{
+ struct mld2_query *mlh2 = NULL;
+ struct ifmcaddr6 *ma;
+ const struct in6_addr *group;
+ unsigned long max_delay;
+ struct inet6_dev *idev;
+ struct mld_msg *mld;
+ int group_type;
+ int mark = 0;
+ int len, err;
+
+ if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
+ return -EINVAL;
+
+ /* compute payload length excluding extension headers */
+ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
+ len -= skb_network_header_len(skb);
+
+ /* RFC3810 6.2
+ * Upon reception of an MLD message that contains a Query, the node
+ * checks if the source address of the message is a valid link-local
+ * address, if the Hop Limit is set to 1, and if the Router Alert
+ * option is present in the Hop-By-Hop Options header of the IPv6
+ * packet. If any of these checks fails, the packet is dropped.
+ */
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) ||
+ ipv6_hdr(skb)->hop_limit != 1 ||
+ !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
+ IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
+ return -EINVAL;
+
+ idev = __in6_dev_get(skb->dev);
+ if (!idev)
+ return 0;
+
+ mld = (struct mld_msg *)icmp6_hdr(skb);
+ group = &mld->mld_mca;
+ group_type = ipv6_addr_type(group);
+
+ if (group_type != IPV6_ADDR_ANY &&
+ !(group_type&IPV6_ADDR_MULTICAST))
+ return -EINVAL;
+
+ if (len < MLD_V1_QUERY_LEN) {
+ return -EINVAL;
+ } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) {
+ err = mld_process_v1(idev, mld, &max_delay,
+ len == MLD_V1_QUERY_LEN);
+ if (err < 0)
+ return err;
+ } else if (len >= MLD_V2_QUERY_LEN_MIN) {
+ int srcs_offset = sizeof(struct mld2_query) -
+ sizeof(struct icmp6hdr);
+
+ if (!pskb_may_pull(skb, srcs_offset))
+ return -EINVAL;
+
+ mlh2 = (struct mld2_query *)skb_transport_header(skb);
+
+ err = mld_process_v2(idev, mlh2, &max_delay);
+ if (err < 0)
+ return err;
+
+ if (group_type == IPV6_ADDR_ANY) { /* general query */
+ if (mlh2->mld2q_nsrcs)
+ return -EINVAL; /* no sources allowed */
+
+ mld_gq_start_timer(idev);
+ return 0;
+ }
+ /* mark sources to include, if group & source-specific */
+ if (mlh2->mld2q_nsrcs != 0) {
+ if (!pskb_may_pull(skb, srcs_offset +
+ ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
+ return -EINVAL;
+
+ mlh2 = (struct mld2_query *)skb_transport_header(skb);
+ mark = 1;
+ }
+ } else {
+ return -EINVAL;
+ }
+
+ read_lock_bh(&idev->lock);
+ if (group_type == IPV6_ADDR_ANY) {
+ for (ma = idev->mc_list; ma; ma = ma->next) {
+ spin_lock_bh(&ma->mca_lock);
+ igmp6_group_queried(ma, max_delay);
+ spin_unlock_bh(&ma->mca_lock);
+ }
+ } else {
+ for (ma = idev->mc_list; ma; ma = ma->next) {
+ if (!ipv6_addr_equal(group, &ma->mca_addr))
+ continue;
+ spin_lock_bh(&ma->mca_lock);
+ if (ma->mca_flags & MAF_TIMER_RUNNING) {
+ /* gsquery <- gsquery && mark */
+ if (!mark)
+ ma->mca_flags &= ~MAF_GSQUERY;
+ } else {
+ /* gsquery <- mark */
+ if (mark)
+ ma->mca_flags |= MAF_GSQUERY;
+ else
+ ma->mca_flags &= ~MAF_GSQUERY;
+ }
+ if (!(ma->mca_flags & MAF_GSQUERY) ||
+ mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
+ igmp6_group_queried(ma, max_delay);
+ spin_unlock_bh(&ma->mca_lock);
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+
+ return 0;
+}
+
+/* called with rcu_read_lock() */
+int igmp6_event_report(struct sk_buff *skb)
+{
+ struct ifmcaddr6 *ma;
+ struct inet6_dev *idev;
+ struct mld_msg *mld;
+ int addr_type;
+
+ /* Our own report looped back. Ignore it. */
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ return 0;
+
+ /* send our report if the MC router may not have heard this report */
+ if (skb->pkt_type != PACKET_MULTICAST &&
+ skb->pkt_type != PACKET_BROADCAST)
+ return 0;
+
+ if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
+ return -EINVAL;
+
+ mld = (struct mld_msg *)icmp6_hdr(skb);
+
+ /* Drop reports with not link local source */
+ addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
+ if (addr_type != IPV6_ADDR_ANY &&
+ !(addr_type&IPV6_ADDR_LINKLOCAL))
+ return -EINVAL;
+
+ idev = __in6_dev_get(skb->dev);
+ if (!idev)
+ return -ENODEV;
+
+ /*
+ * Cancel the timer for this group
+ */
+
+ read_lock_bh(&idev->lock);
+ for (ma = idev->mc_list; ma; ma = ma->next) {
+ if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) {
+ spin_lock(&ma->mca_lock);
+ if (del_timer(&ma->mca_timer))
+ atomic_dec(&ma->mca_refcnt);
+ ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING);
+ spin_unlock(&ma->mca_lock);
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ return 0;
+}
+
+static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
+ int gdeleted, int sdeleted)
+{
+ switch (type) {
+ case MLD2_MODE_IS_INCLUDE:
+ case MLD2_MODE_IS_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return false;
+ if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) {
+ if (pmc->mca_sfmode == MCAST_INCLUDE)
+ return true;
+ /* don't include if this source is excluded
+ * in all filters
+ */
+ if (psf->sf_count[MCAST_INCLUDE])
+ return type == MLD2_MODE_IS_INCLUDE;
+ return pmc->mca_sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ }
+ return false;
+ case MLD2_CHANGE_TO_INCLUDE:
+ if (gdeleted || sdeleted)
+ return false;
+ return psf->sf_count[MCAST_INCLUDE] != 0;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ if (gdeleted || sdeleted)
+ return false;
+ if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 ||
+ psf->sf_count[MCAST_INCLUDE])
+ return false;
+ return pmc->mca_sfcount[MCAST_EXCLUDE] ==
+ psf->sf_count[MCAST_EXCLUDE];
+ case MLD2_ALLOW_NEW_SOURCES:
+ if (gdeleted || !psf->sf_crcount)
+ return false;
+ return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted;
+ case MLD2_BLOCK_OLD_SOURCES:
+ if (pmc->mca_sfmode == MCAST_INCLUDE)
+ return gdeleted || (psf->sf_crcount && sdeleted);
+ return psf->sf_crcount && !gdeleted && !sdeleted;
+ }
+ return false;
+}
+
+static int
+mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
+{
+ struct ip6_sf_list *psf;
+ int scount = 0;
+
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+ continue;
+ scount++;
+ }
+ return scount;
+}
+
+static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ int proto, int len)
+{
+ struct ipv6hdr *hdr;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
+ skb_reset_network_header(skb);
+ skb_put(skb, sizeof(struct ipv6hdr));
+ hdr = ipv6_hdr(skb);
+
+ ip6_flow_hdr(hdr, 0, 0);
+
+ hdr->payload_len = htons(len);
+ hdr->nexthdr = proto;
+ hdr->hop_limit = inet6_sk(sk)->hop_limit;
+
+ hdr->saddr = *saddr;
+ hdr->daddr = *daddr;
+}
+
+static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
+{
+ struct net_device *dev = idev->dev;
+ struct net *net = dev_net(dev);
+ struct sock *sk = net->ipv6.igmp_sk;
+ struct sk_buff *skb;
+ struct mld2_report *pmr;
+ struct in6_addr addr_buf;
+ const struct in6_addr *saddr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ unsigned int size = mtu + hlen + tlen;
+ int err;
+ u8 ra[8] = { IPPROTO_ICMPV6, 0,
+ IPV6_TLV_ROUTERALERT, 2, 0, 0,
+ IPV6_TLV_PADN, 0 };
+
+ /* we assume size > sizeof(ra) here */
+ /* limit our allocations to order-0 page */
+ size = min_t(int, size, SKB_MAX_ORDER(0, 0));
+ skb = sock_alloc_send_skb(sk, size, 1, &err);
+
+ if (!skb)
+ return NULL;
+
+ skb->priority = TC_PRIO_CONTROL;
+ skb->reserved_tailroom = skb_end_offset(skb) -
+ min(mtu, skb_end_offset(skb));
+ skb_reserve(skb, hlen);
+
+ if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
+ /* <draft-ietf-magma-mld-source-05.txt>:
+ * use unspecified address as the source address
+ * when a valid link-local address is not available.
+ */
+ saddr = &in6addr_any;
+ } else
+ saddr = &addr_buf;
+
+ ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);
+
+ memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
+
+ skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
+ skb_put(skb, sizeof(*pmr));
+ pmr = (struct mld2_report *)skb_transport_header(skb);
+ pmr->mld2r_type = ICMPV6_MLD2_REPORT;
+ pmr->mld2r_resv1 = 0;
+ pmr->mld2r_cksum = 0;
+ pmr->mld2r_resv2 = 0;
+ pmr->mld2r_ngrec = 0;
+ return skb;
+}
+
+static void mld_sendpack(struct sk_buff *skb)
+{
+ struct ipv6hdr *pip6 = ipv6_hdr(skb);
+ struct mld2_report *pmr =
+ (struct mld2_report *)skb_transport_header(skb);
+ int payload_len, mldlen;
+ struct inet6_dev *idev;
+ struct net *net = dev_net(skb->dev);
+ int err;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+ payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) -
+ sizeof(*pip6);
+ mldlen = skb_tail_pointer(skb) - skb_transport_header(skb);
+ pip6->payload_len = htons(payload_len);
+
+ pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
+ IPPROTO_ICMPV6,
+ csum_partial(skb_transport_header(skb),
+ mldlen, 0));
+
+ icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+
+ err = 0;
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ }
+ skb_dst_set(skb, dst);
+ if (err)
+ goto err_out;
+
+ payload_len = skb->len;
+
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net->ipv6.igmp_sk, skb, NULL, skb->dev,
+ dst_output_sk);
+out:
+ if (!err) {
+ ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len);
+ } else {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ }
+
+ rcu_read_unlock();
+ return;
+
+err_out:
+ kfree_skb(skb);
+ goto out;
+}
+
+static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
+{
+ return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
+ int type, struct mld2_grec **ppgr)
+{
+ struct net_device *dev = pmc->idev->dev;
+ struct mld2_report *pmr;
+ struct mld2_grec *pgr;
+
+ if (!skb)
+ skb = mld_newpack(pmc->idev, dev->mtu);
+ if (!skb)
+ return NULL;
+ pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec));
+ pgr->grec_type = type;
+ pgr->grec_auxwords = 0;
+ pgr->grec_nsrcs = 0;
+ pgr->grec_mca = pmc->mca_addr; /* structure copy */
+ pmr = (struct mld2_report *)skb_transport_header(skb);
+ pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1);
+ *ppgr = pgr;
+ return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
+ int type, int gdeleted, int sdeleted, int crsend)
+{
+ struct inet6_dev *idev = pmc->idev;
+ struct net_device *dev = idev->dev;
+ struct mld2_report *pmr;
+ struct mld2_grec *pgr = NULL;
+ struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+ int scount, stotal, first, isquery, truncate;
+
+ if (pmc->mca_flags & MAF_NOREPORT)
+ return skb;
+
+ isquery = type == MLD2_MODE_IS_INCLUDE ||
+ type == MLD2_MODE_IS_EXCLUDE;
+ truncate = type == MLD2_MODE_IS_EXCLUDE ||
+ type == MLD2_CHANGE_TO_EXCLUDE;
+
+ stotal = scount = 0;
+
+ psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
+
+ if (!*psf_list)
+ goto empty_source;
+
+ pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;
+
+ /* EX and TO_EX get a fresh packet, if needed */
+ if (truncate) {
+ if (pmr && pmr->mld2r_ngrec &&
+ AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+ if (skb)
+ mld_sendpack(skb);
+ skb = mld_newpack(idev, dev->mtu);
+ }
+ }
+ first = 1;
+ psf_prev = NULL;
+ for (psf = *psf_list; psf; psf = psf_next) {
+ struct in6_addr *psrc;
+
+ psf_next = psf->sf_next;
+
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ psf_prev = psf;
+ continue;
+ }
+
+ /* clear marks on query responses */
+ if (isquery)
+ psf->sf_gsresp = 0;
+
+ if (AVAILABLE(skb) < sizeof(*psrc) +
+ first*sizeof(struct mld2_grec)) {
+ if (truncate && !first)
+ break; /* truncate these */
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+ if (skb)
+ mld_sendpack(skb);
+ skb = mld_newpack(idev, dev->mtu);
+ first = 1;
+ scount = 0;
+ }
+ if (first) {
+ skb = add_grhead(skb, pmc, type, &pgr);
+ first = 0;
+ }
+ if (!skb)
+ return NULL;
+ psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc));
+ *psrc = psf->sf_addr;
+ scount++; stotal++;
+ if ((type == MLD2_ALLOW_NEW_SOURCES ||
+ type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+ psf->sf_crcount--;
+ if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *psf_list = psf->sf_next;
+ kfree(psf);
+ continue;
+ }
+ }
+ psf_prev = psf;
+ }
+
+empty_source:
+ if (!stotal) {
+ if (type == MLD2_ALLOW_NEW_SOURCES ||
+ type == MLD2_BLOCK_OLD_SOURCES)
+ return skb;
+ if (pmc->mca_crcount || isquery || crsend) {
+ /* make sure we have room for group header */
+ if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) {
+ mld_sendpack(skb);
+ skb = NULL; /* add_grhead will get a new one */
+ }
+ skb = add_grhead(skb, pmc, type, &pgr);
+ }
+ }
+ if (pgr)
+ pgr->grec_nsrcs = htons(scount);
+
+ if (isquery)
+ pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */
+ return skb;
+}
+
+static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
+{
+ struct sk_buff *skb = NULL;
+ int type;
+
+ read_lock_bh(&idev->lock);
+ if (!pmc) {
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ if (pmc->mca_flags & MAF_NOREPORT)
+ continue;
+ spin_lock_bh(&pmc->mca_lock);
+ if (pmc->mca_sfcount[MCAST_EXCLUDE])
+ type = MLD2_MODE_IS_EXCLUDE;
+ else
+ type = MLD2_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
+ spin_unlock_bh(&pmc->mca_lock);
+ }
+ } else {
+ spin_lock_bh(&pmc->mca_lock);
+ if (pmc->mca_sfcount[MCAST_EXCLUDE])
+ type = MLD2_MODE_IS_EXCLUDE;
+ else
+ type = MLD2_MODE_IS_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
+ spin_unlock_bh(&pmc->mca_lock);
+ }
+ read_unlock_bh(&idev->lock);
+ if (skb)
+ mld_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void mld_clear_zeros(struct ip6_sf_list **ppsf)
+{
+ struct ip6_sf_list *psf_prev, *psf_next, *psf;
+
+ psf_prev = NULL;
+ for (psf = *ppsf; psf; psf = psf_next) {
+ psf_next = psf->sf_next;
+ if (psf->sf_crcount == 0) {
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ *ppsf = psf->sf_next;
+ kfree(psf);
+ } else
+ psf_prev = psf;
+ }
+}
+
+static void mld_send_cr(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next;
+ struct sk_buff *skb = NULL;
+ int type, dtype;
+
+ read_lock_bh(&idev->lock);
+ spin_lock(&idev->mc_lock);
+
+ /* deleted MCA's */
+ pmc_prev = NULL;
+ for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) {
+ pmc_next = pmc->next;
+ if (pmc->mca_sfmode == MCAST_INCLUDE) {
+ type = MLD2_BLOCK_OLD_SOURCES;
+ dtype = MLD2_BLOCK_OLD_SOURCES;
+ skb = add_grec(skb, pmc, type, 1, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 1, 1, 0);
+ }
+ if (pmc->mca_crcount) {
+ if (pmc->mca_sfmode == MCAST_EXCLUDE) {
+ type = MLD2_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 1, 0, 0);
+ }
+ pmc->mca_crcount--;
+ if (pmc->mca_crcount == 0) {
+ mld_clear_zeros(&pmc->mca_tomb);
+ mld_clear_zeros(&pmc->mca_sources);
+ }
+ }
+ if (pmc->mca_crcount == 0 && !pmc->mca_tomb &&
+ !pmc->mca_sources) {
+ if (pmc_prev)
+ pmc_prev->next = pmc_next;
+ else
+ idev->mc_tomb = pmc_next;
+ in6_dev_put(pmc->idev);
+ kfree(pmc);
+ } else
+ pmc_prev = pmc;
+ }
+ spin_unlock(&idev->mc_lock);
+
+ /* change recs */
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ spin_lock_bh(&pmc->mca_lock);
+ if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+ type = MLD2_BLOCK_OLD_SOURCES;
+ dtype = MLD2_ALLOW_NEW_SOURCES;
+ } else {
+ type = MLD2_ALLOW_NEW_SOURCES;
+ dtype = MLD2_BLOCK_OLD_SOURCES;
+ }
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
+ skb = add_grec(skb, pmc, dtype, 0, 1, 0); /* deleted sources */
+
+ /* filter mode changes */
+ if (pmc->mca_crcount) {
+ if (pmc->mca_sfmode == MCAST_EXCLUDE)
+ type = MLD2_CHANGE_TO_EXCLUDE;
+ else
+ type = MLD2_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0, 0);
+ pmc->mca_crcount--;
+ }
+ spin_unlock_bh(&pmc->mca_lock);
+ }
+ read_unlock_bh(&idev->lock);
+ if (!skb)
+ return;
+ (void) mld_sendpack(skb);
+}
+
+static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
+{
+ struct net *net = dev_net(dev);
+ struct sock *sk = net->ipv6.igmp_sk;
+ struct inet6_dev *idev;
+ struct sk_buff *skb;
+ struct mld_msg *hdr;
+ const struct in6_addr *snd_addr, *saddr;
+ struct in6_addr addr_buf;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ int err, len, payload_len, full_len;
+ u8 ra[8] = { IPPROTO_ICMPV6, 0,
+ IPV6_TLV_ROUTERALERT, 2, 0, 0,
+ IPV6_TLV_PADN, 0 };
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+
+ if (type == ICMPV6_MGM_REDUCTION)
+ snd_addr = &in6addr_linklocal_allrouters;
+ else
+ snd_addr = addr;
+
+ len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+ payload_len = len + sizeof(ra);
+ full_len = sizeof(struct ipv6hdr) + payload_len;
+
+ rcu_read_lock();
+ IP6_UPD_PO_STATS(net, __in6_dev_get(dev),
+ IPSTATS_MIB_OUT, full_len);
+ rcu_read_unlock();
+
+ skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err);
+
+ if (!skb) {
+ rcu_read_lock();
+ IP6_INC_STATS(net, __in6_dev_get(dev),
+ IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
+ return;
+ }
+ skb->priority = TC_PRIO_CONTROL;
+ skb_reserve(skb, hlen);
+
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
+ /* <draft-ietf-magma-mld-source-05.txt>:
+ * use unspecified address as the source address
+ * when a valid link-local address is not available.
+ */
+ saddr = &in6addr_any;
+ } else
+ saddr = &addr_buf;
+
+ ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len);
+
+ memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
+
+ hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg));
+ memset(hdr, 0, sizeof(struct mld_msg));
+ hdr->mld_type = type;
+ hdr->mld_mca = *addr;
+
+ hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len,
+ IPPROTO_ICMPV6,
+ csum_partial(hdr, len, 0));
+
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+
+ icmpv6_flow_init(sk, &fl6, type,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+ skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto err_out;
+ }
+
+ skb_dst_set(skb, dst);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, skb->dev, dst_output_sk);
+out:
+ if (!err) {
+ ICMP6MSGOUT_INC_STATS(net, idev, type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len);
+ } else
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+
+ rcu_read_unlock();
+ return;
+
+err_out:
+ kfree_skb(skb);
+ goto out;
+}
+
+static void mld_send_initial_cr(struct inet6_dev *idev)
+{
+ struct sk_buff *skb;
+ struct ifmcaddr6 *pmc;
+ int type;
+
+ if (mld_in_v1_mode(idev))
+ return;
+
+ skb = NULL;
+ read_lock_bh(&idev->lock);
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ spin_lock_bh(&pmc->mca_lock);
+ if (pmc->mca_sfcount[MCAST_EXCLUDE])
+ type = MLD2_CHANGE_TO_EXCLUDE;
+ else
+ type = MLD2_CHANGE_TO_INCLUDE;
+ skb = add_grec(skb, pmc, type, 0, 0, 1);
+ spin_unlock_bh(&pmc->mca_lock);
+ }
+ read_unlock_bh(&idev->lock);
+ if (skb)
+ mld_sendpack(skb);
+}
+
+void ipv6_mc_dad_complete(struct inet6_dev *idev)
+{
+ idev->mc_dad_count = idev->mc_qrv;
+ if (idev->mc_dad_count) {
+ mld_send_initial_cr(idev);
+ idev->mc_dad_count--;
+ if (idev->mc_dad_count)
+ mld_dad_start_timer(idev, idev->mc_maxdelay);
+ }
+}
+
+static void mld_dad_timer_expire(unsigned long data)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)data;
+
+ mld_send_initial_cr(idev);
+ if (idev->mc_dad_count) {
+ idev->mc_dad_count--;
+ if (idev->mc_dad_count)
+ mld_dad_start_timer(idev, idev->mc_maxdelay);
+ }
+ in6_dev_put(idev);
+}
+
+static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
+ const struct in6_addr *psfsrc)
+{
+ struct ip6_sf_list *psf, *psf_prev;
+ int rv = 0;
+
+ psf_prev = NULL;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
+ break;
+ psf_prev = psf;
+ }
+ if (!psf || psf->sf_count[sfmode] == 0) {
+ /* source filter not found, or count wrong => bug */
+ return -ESRCH;
+ }
+ psf->sf_count[sfmode]--;
+ if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+ struct inet6_dev *idev = pmc->idev;
+
+ /* no more filters for this source */
+ if (psf_prev)
+ psf_prev->sf_next = psf->sf_next;
+ else
+ pmc->mca_sources = psf->sf_next;
+ if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
+ !mld_in_v1_mode(idev)) {
+ psf->sf_crcount = idev->mc_qrv;
+ psf->sf_next = pmc->mca_tomb;
+ pmc->mca_tomb = psf;
+ rv = 1;
+ } else
+ kfree(psf);
+ }
+ return rv;
+}
+
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
+ int delta)
+{
+ struct ifmcaddr6 *pmc;
+ int changerec = 0;
+ int i, err;
+
+ if (!idev)
+ return -ENODEV;
+ read_lock_bh(&idev->lock);
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ if (ipv6_addr_equal(pmca, &pmc->mca_addr))
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock_bh(&idev->lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->mca_lock);
+ sf_markstate(pmc);
+ if (!delta) {
+ if (!pmc->mca_sfcount[sfmode]) {
+ spin_unlock_bh(&pmc->mca_lock);
+ read_unlock_bh(&idev->lock);
+ return -EINVAL;
+ }
+ pmc->mca_sfcount[sfmode]--;
+ }
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+ changerec |= rv > 0;
+ if (!err && rv < 0)
+ err = rv;
+ }
+ if (pmc->mca_sfmode == MCAST_EXCLUDE &&
+ pmc->mca_sfcount[MCAST_EXCLUDE] == 0 &&
+ pmc->mca_sfcount[MCAST_INCLUDE]) {
+ struct ip6_sf_list *psf;
+
+ /* filter mode change */
+ pmc->mca_sfmode = MCAST_INCLUDE;
+ pmc->mca_crcount = idev->mc_qrv;
+ idev->mc_ifc_count = pmc->mca_crcount;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ mld_ifc_event(pmc->idev);
+ } else if (sf_setstate(pmc) || changerec)
+ mld_ifc_event(pmc->idev);
+ spin_unlock_bh(&pmc->mca_lock);
+ read_unlock_bh(&idev->lock);
+ return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
+ const struct in6_addr *psfsrc)
+{
+ struct ip6_sf_list *psf, *psf_prev;
+
+ psf_prev = NULL;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
+ break;
+ psf_prev = psf;
+ }
+ if (!psf) {
+ psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ if (!psf)
+ return -ENOBUFS;
+
+ psf->sf_addr = *psfsrc;
+ if (psf_prev) {
+ psf_prev->sf_next = psf;
+ } else
+ pmc->mca_sources = psf;
+ }
+ psf->sf_count[sfmode]++;
+ return 0;
+}
+
+static void sf_markstate(struct ifmcaddr6 *pmc)
+{
+ struct ip6_sf_list *psf;
+ int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+ psf->sf_oldin = mca_xcount ==
+ psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ifmcaddr6 *pmc)
+{
+ struct ip6_sf_list *psf, *dpsf;
+ int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+ int qrv = pmc->idev->mc_qrv;
+ int new_in, rv;
+
+ rv = 0;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+ new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+ !psf->sf_count[MCAST_INCLUDE];
+ } else
+ new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+ if (new_in) {
+ if (!psf->sf_oldin) {
+ struct ip6_sf_list *prev = NULL;
+
+ for (dpsf = pmc->mca_tomb; dpsf;
+ dpsf = dpsf->sf_next) {
+ if (ipv6_addr_equal(&dpsf->sf_addr,
+ &psf->sf_addr))
+ break;
+ prev = dpsf;
+ }
+ if (dpsf) {
+ if (prev)
+ prev->sf_next = dpsf->sf_next;
+ else
+ pmc->mca_tomb = dpsf->sf_next;
+ kfree(dpsf);
+ }
+ psf->sf_crcount = qrv;
+ rv++;
+ }
+ } else if (psf->sf_oldin) {
+ psf->sf_crcount = 0;
+ /*
+ * add or update "delete" records if an active filter
+ * is now inactive
+ */
+ for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next)
+ if (ipv6_addr_equal(&dpsf->sf_addr,
+ &psf->sf_addr))
+ break;
+ if (!dpsf) {
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ if (!dpsf)
+ continue;
+ *dpsf = *psf;
+ /* pmc->mca_lock held by callers */
+ dpsf->sf_next = pmc->mca_tomb;
+ pmc->mca_tomb = dpsf;
+ }
+ dpsf->sf_crcount = qrv;
+ rv++;
+ }
+ }
+ return rv;
+}
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+ int sfmode, int sfcount, const struct in6_addr *psfsrc,
+ int delta)
+{
+ struct ifmcaddr6 *pmc;
+ int isexclude;
+ int i, err;
+
+ if (!idev)
+ return -ENODEV;
+ read_lock_bh(&idev->lock);
+ for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ if (ipv6_addr_equal(pmca, &pmc->mca_addr))
+ break;
+ }
+ if (!pmc) {
+ /* MCA not found?? bug */
+ read_unlock_bh(&idev->lock);
+ return -ESRCH;
+ }
+ spin_lock_bh(&pmc->mca_lock);
+
+ sf_markstate(pmc);
+ isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
+ if (!delta)
+ pmc->mca_sfcount[sfmode]++;
+ err = 0;
+ for (i = 0; i < sfcount; i++) {
+ err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
+ if (err)
+ break;
+ }
+ if (err) {
+ int j;
+
+ if (!delta)
+ pmc->mca_sfcount[sfmode]--;
+ for (j = 0; j < i; j++)
+ ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+ } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
+ struct ip6_sf_list *psf;
+
+ /* filter mode change */
+ if (pmc->mca_sfcount[MCAST_EXCLUDE])
+ pmc->mca_sfmode = MCAST_EXCLUDE;
+ else if (pmc->mca_sfcount[MCAST_INCLUDE])
+ pmc->mca_sfmode = MCAST_INCLUDE;
+ /* else no filters; keep old mode for reports */
+
+ pmc->mca_crcount = idev->mc_qrv;
+ idev->mc_ifc_count = pmc->mca_crcount;
+ for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ psf->sf_crcount = 0;
+ mld_ifc_event(idev);
+ } else if (sf_setstate(pmc))
+ mld_ifc_event(idev);
+ spin_unlock_bh(&pmc->mca_lock);
+ read_unlock_bh(&idev->lock);
+ return err;
+}
+
+static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
+{
+ struct ip6_sf_list *psf, *nextpsf;
+
+ for (psf = pmc->mca_tomb; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->mca_tomb = NULL;
+ for (psf = pmc->mca_sources; psf; psf = nextpsf) {
+ nextpsf = psf->sf_next;
+ kfree(psf);
+ }
+ pmc->mca_sources = NULL;
+ pmc->mca_sfmode = MCAST_EXCLUDE;
+ pmc->mca_sfcount[MCAST_INCLUDE] = 0;
+ pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+static void igmp6_join_group(struct ifmcaddr6 *ma)
+{
+ unsigned long delay;
+
+ if (ma->mca_flags & MAF_NOREPORT)
+ return;
+
+ igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+
+ delay = prandom_u32() % unsolicited_report_interval(ma->idev);
+
+ spin_lock_bh(&ma->mca_lock);
+ if (del_timer(&ma->mca_timer)) {
+ atomic_dec(&ma->mca_refcnt);
+ delay = ma->mca_timer.expires - jiffies;
+ }
+
+ if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ atomic_inc(&ma->mca_refcnt);
+ ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
+ spin_unlock_bh(&ma->mca_lock);
+}
+
+static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
+ struct inet6_dev *idev)
+{
+ int err;
+
+ /* callers have the socket lock and rtnl lock
+ * so no other readers or writers of iml or its sflist
+ */
+ if (!iml->sflist) {
+ /* any-source empty exclude case */
+ return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ }
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+ iml->sflist->sl_count, iml->sflist->sl_addr, 0);
+ sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
+ iml->sflist = NULL;
+ return err;
+}
+
+static void igmp6_leave_group(struct ifmcaddr6 *ma)
+{
+ if (mld_in_v1_mode(ma->idev)) {
+ if (ma->mca_flags & MAF_LAST_REPORTER)
+ igmp6_send(&ma->mca_addr, ma->idev->dev,
+ ICMPV6_MGM_REDUCTION);
+ } else {
+ mld_add_delrec(ma->idev, ma);
+ mld_ifc_event(ma->idev);
+ }
+}
+
+static void mld_gq_timer_expire(unsigned long data)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)data;
+
+ idev->mc_gq_running = 0;
+ mld_send_report(idev, NULL);
+ in6_dev_put(idev);
+}
+
+static void mld_ifc_timer_expire(unsigned long data)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)data;
+
+ mld_send_cr(idev);
+ if (idev->mc_ifc_count) {
+ idev->mc_ifc_count--;
+ if (idev->mc_ifc_count)
+ mld_ifc_start_timer(idev, idev->mc_maxdelay);
+ }
+ in6_dev_put(idev);
+}
+
+static void mld_ifc_event(struct inet6_dev *idev)
+{
+ if (mld_in_v1_mode(idev))
+ return;
+ idev->mc_ifc_count = idev->mc_qrv;
+ mld_ifc_start_timer(idev, 1);
+}
+
+
+static void igmp6_timer_handler(unsigned long data)
+{
+ struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+
+ if (mld_in_v1_mode(ma->idev))
+ igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+ else
+ mld_send_report(ma->idev, ma);
+
+ spin_lock(&ma->mca_lock);
+ ma->mca_flags |= MAF_LAST_REPORTER;
+ ma->mca_flags &= ~MAF_TIMER_RUNNING;
+ spin_unlock(&ma->mca_lock);
+ ma_put(ma);
+}
+
+/* Device changing type */
+
+void ipv6_mc_unmap(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *i;
+
+ /* Install multicast list, except for all-nodes (already installed) */
+
+ read_lock_bh(&idev->lock);
+ for (i = idev->mc_list; i; i = i->next)
+ igmp6_group_dropped(i);
+ read_unlock_bh(&idev->lock);
+}
+
+void ipv6_mc_remap(struct inet6_dev *idev)
+{
+ ipv6_mc_up(idev);
+}
+
+/* Device going down */
+
+void ipv6_mc_down(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *i;
+
+ /* Withdraw multicast list */
+
+ read_lock_bh(&idev->lock);
+ mld_ifc_stop_timer(idev);
+ mld_gq_stop_timer(idev);
+ mld_dad_stop_timer(idev);
+
+ for (i = idev->mc_list; i; i = i->next)
+ igmp6_group_dropped(i);
+ read_unlock_bh(&idev->lock);
+
+ mld_clear_delrec(idev);
+}
+
+static void ipv6_mc_reset(struct inet6_dev *idev)
+{
+ idev->mc_qrv = sysctl_mld_qrv;
+ idev->mc_qi = MLD_QI_DEFAULT;
+ idev->mc_qri = MLD_QRI_DEFAULT;
+ idev->mc_v1_seen = 0;
+ idev->mc_maxdelay = unsolicited_report_interval(idev);
+}
+
+/* Device going up */
+
+void ipv6_mc_up(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *i;
+
+ /* Install multicast list, except for all-nodes (already installed) */
+
+ read_lock_bh(&idev->lock);
+ ipv6_mc_reset(idev);
+ for (i = idev->mc_list; i; i = i->next)
+ igmp6_group_added(i);
+ read_unlock_bh(&idev->lock);
+}
+
+/* IPv6 device initialization. */
+
+void ipv6_mc_init_dev(struct inet6_dev *idev)
+{
+ write_lock_bh(&idev->lock);
+ spin_lock_init(&idev->mc_lock);
+ idev->mc_gq_running = 0;
+ setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire,
+ (unsigned long)idev);
+ idev->mc_tomb = NULL;
+ idev->mc_ifc_count = 0;
+ setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire,
+ (unsigned long)idev);
+ setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
+ (unsigned long)idev);
+ ipv6_mc_reset(idev);
+ write_unlock_bh(&idev->lock);
+}
+
+/*
+ * Device is about to be destroyed: clean up.
+ */
+
+void ipv6_mc_destroy_dev(struct inet6_dev *idev)
+{
+ struct ifmcaddr6 *i;
+
+ /* Deactivate timers */
+ ipv6_mc_down(idev);
+
+ /* Delete all-nodes address. */
+ /* We cannot call ipv6_dev_mc_dec() directly, our caller in
+ * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will
+ * fail.
+ */
+ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes);
+
+ if (idev->cnf.forwarding)
+ __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
+
+ write_lock_bh(&idev->lock);
+ while ((i = idev->mc_list) != NULL) {
+ idev->mc_list = i->next;
+ write_unlock_bh(&idev->lock);
+
+ igmp6_group_dropped(i);
+ ma_put(i);
+
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
+}
+
+#ifdef CONFIG_PROC_FS
+struct igmp6_mc_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+};
+
+#define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private)
+
+static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
+{
+ struct ifmcaddr6 *im = NULL;
+ struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+
+ state->idev = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct inet6_dev *idev;
+ idev = __in6_dev_get(state->dev);
+ if (!idev)
+ continue;
+ read_lock_bh(&idev->lock);
+ im = idev->mc_list;
+ if (im) {
+ state->idev = idev;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ return im;
+}
+
+static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im)
+{
+ struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+ im = im->next;
+ while (!im) {
+ if (likely(state->idev))
+ read_unlock_bh(&state->idev->lock);
+
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->idev = NULL;
+ break;
+ }
+ state->idev = __in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+ read_lock_bh(&state->idev->lock);
+ im = state->idev->mc_list;
+ }
+ return im;
+}
+
+static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ifmcaddr6 *im = igmp6_mc_get_first(seq);
+ if (im)
+ while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL)
+ --pos;
+ return pos ? NULL : im;
+}
+
+static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return igmp6_mc_get_idx(seq, *pos);
+}
+
+static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v);
+
+ ++*pos;
+ return im;
+}
+
+static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+ if (likely(state->idev)) {
+ read_unlock_bh(&state->idev->lock);
+ state->idev = NULL;
+ }
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
+{
+ struct ifmcaddr6 *im = (struct ifmcaddr6 *)v;
+ struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+ seq_printf(seq,
+ "%-4d %-15s %pi6 %5d %08X %ld\n",
+ state->dev->ifindex, state->dev->name,
+ &im->mca_addr,
+ im->mca_users, im->mca_flags,
+ (im->mca_flags&MAF_TIMER_RUNNING) ?
+ jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+ return 0;
+}
+
+static const struct seq_operations igmp6_mc_seq_ops = {
+ .start = igmp6_mc_seq_start,
+ .next = igmp6_mc_seq_next,
+ .stop = igmp6_mc_seq_stop,
+ .show = igmp6_mc_seq_show,
+};
+
+static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp6_mc_seq_ops,
+ sizeof(struct igmp6_mc_iter_state));
+}
+
+static const struct file_operations igmp6_mc_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp6_mc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+struct igmp6_mcf_iter_state {
+ struct seq_net_private p;
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ struct ifmcaddr6 *im;
+};
+
+#define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private)
+
+static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
+{
+ struct ip6_sf_list *psf = NULL;
+ struct ifmcaddr6 *im = NULL;
+ struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+ struct net *net = seq_file_net(seq);
+
+ state->idev = NULL;
+ state->im = NULL;
+ for_each_netdev_rcu(net, state->dev) {
+ struct inet6_dev *idev;
+ idev = __in6_dev_get(state->dev);
+ if (unlikely(idev == NULL))
+ continue;
+ read_lock_bh(&idev->lock);
+ im = idev->mc_list;
+ if (likely(im)) {
+ spin_lock_bh(&im->mca_lock);
+ psf = im->mca_sources;
+ if (likely(psf)) {
+ state->im = im;
+ state->idev = idev;
+ break;
+ }
+ spin_unlock_bh(&im->mca_lock);
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ return psf;
+}
+
+static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf)
+{
+ struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+
+ psf = psf->sf_next;
+ while (!psf) {
+ spin_unlock_bh(&state->im->mca_lock);
+ state->im = state->im->next;
+ while (!state->im) {
+ if (likely(state->idev))
+ read_unlock_bh(&state->idev->lock);
+
+ state->dev = next_net_device_rcu(state->dev);
+ if (!state->dev) {
+ state->idev = NULL;
+ goto out;
+ }
+ state->idev = __in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+ read_lock_bh(&state->idev->lock);
+ state->im = state->idev->mc_list;
+ }
+ if (!state->im)
+ break;
+ spin_lock_bh(&state->im->mca_lock);
+ psf = state->im->mca_sources;
+ }
+out:
+ return psf;
+}
+
+static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct ip6_sf_list *psf = igmp6_mcf_get_first(seq);
+ if (psf)
+ while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL)
+ --pos;
+ return pos ? NULL : psf;
+}
+
+static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip6_sf_list *psf;
+ if (v == SEQ_START_TOKEN)
+ psf = igmp6_mcf_get_first(seq);
+ else
+ psf = igmp6_mcf_get_next(seq, v);
+ ++*pos;
+ return psf;
+}
+
+static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+ if (likely(state->im)) {
+ spin_unlock_bh(&state->im->mca_lock);
+ state->im = NULL;
+ }
+ if (likely(state->idev)) {
+ read_unlock_bh(&state->idev->lock);
+ state->idev = NULL;
+ }
+ state->dev = NULL;
+ rcu_read_unlock();
+}
+
+static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
+{
+ struct ip6_sf_list *psf = (struct ip6_sf_list *)v;
+ struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Idx Device Multicast Address Source Address INC EXC\n");
+ } else {
+ seq_printf(seq,
+ "%3d %6.6s %pi6 %pi6 %6lu %6lu\n",
+ state->dev->ifindex, state->dev->name,
+ &state->im->mca_addr,
+ &psf->sf_addr,
+ psf->sf_count[MCAST_INCLUDE],
+ psf->sf_count[MCAST_EXCLUDE]);
+ }
+ return 0;
+}
+
+static const struct seq_operations igmp6_mcf_seq_ops = {
+ .start = igmp6_mcf_seq_start,
+ .next = igmp6_mcf_seq_next,
+ .stop = igmp6_mcf_seq_stop,
+ .show = igmp6_mcf_seq_show,
+};
+
+static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &igmp6_mcf_seq_ops,
+ sizeof(struct igmp6_mcf_iter_state));
+}
+
+static const struct file_operations igmp6_mcf_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = igmp6_mcf_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init igmp6_proc_init(struct net *net)
+{
+ int err;
+
+ err = -ENOMEM;
+ if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops))
+ goto out;
+ if (!proc_create("mcfilter6", S_IRUGO, net->proc_net,
+ &igmp6_mcf_seq_fops))
+ goto out_proc_net_igmp6;
+
+ err = 0;
+out:
+ return err;
+
+out_proc_net_igmp6:
+ remove_proc_entry("igmp6", net->proc_net);
+ goto out;
+}
+
+static void __net_exit igmp6_proc_exit(struct net *net)
+{
+ remove_proc_entry("mcfilter6", net->proc_net);
+ remove_proc_entry("igmp6", net->proc_net);
+}
+#else
+static inline int igmp6_proc_init(struct net *net)
+{
+ return 0;
+}
+static inline void igmp6_proc_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init igmp6_net_init(struct net *net)
+{
+ int err;
+
+ err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP6 control socket (err %d)\n",
+ err);
+ goto out;
+ }
+
+ inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+
+ err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n",
+ err);
+ goto out_sock_create;
+ }
+
+ err = igmp6_proc_init(net);
+ if (err)
+ goto out_sock_create_autojoin;
+
+ return 0;
+
+out_sock_create_autojoin:
+ inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
+out_sock_create:
+ inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+out:
+ return err;
+}
+
+static void __net_exit igmp6_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+ inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
+ igmp6_proc_exit(net);
+}
+
+static struct pernet_operations igmp6_net_ops = {
+ .init = igmp6_net_init,
+ .exit = igmp6_net_exit,
+};
+
+int __init igmp6_init(void)
+{
+ return register_pernet_subsys(&igmp6_net_ops);
+}
+
+void igmp6_cleanup(void)
+{
+ unregister_pernet_subsys(&igmp6_net_ops);
+}
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
new file mode 100644
index 000000000..b9779d441
--- /dev/null
+++ b/net/ipv6/mip6.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Authors:
+ * Noriaki TAKAMIYA @USAGI
+ * Masahide NAKAMURA @USAGI
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/time.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+#include <net/mip6.h>
+
+static inline unsigned int calc_padlen(unsigned int len, unsigned int n)
+{
+ return (n - len + 16) & 0x7;
+}
+
+static inline void *mip6_padn(__u8 *data, __u8 padlen)
+{
+ if (!data)
+ return NULL;
+ if (padlen == 1) {
+ data[0] = IPV6_TLV_PAD1;
+ } else if (padlen > 1) {
+ data[0] = IPV6_TLV_PADN;
+ data[1] = padlen - 2;
+ if (padlen > 2)
+ memset(data+2, 0, data[1]);
+ }
+ return data + padlen;
+}
+
+static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos)
+{
+ icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos);
+}
+
+static int mip6_mh_len(int type)
+{
+ int len = 0;
+
+ switch (type) {
+ case IP6_MH_TYPE_BRR:
+ len = 0;
+ break;
+ case IP6_MH_TYPE_HOTI:
+ case IP6_MH_TYPE_COTI:
+ case IP6_MH_TYPE_BU:
+ case IP6_MH_TYPE_BACK:
+ len = 1;
+ break;
+ case IP6_MH_TYPE_HOT:
+ case IP6_MH_TYPE_COT:
+ case IP6_MH_TYPE_BERROR:
+ len = 2;
+ break;
+ }
+ return len;
+}
+
+static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip6_mh _hdr;
+ const struct ip6_mh *mh;
+
+ mh = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_hdr), &_hdr);
+ if (!mh)
+ return -1;
+
+ if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len)
+ return -1;
+
+ if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) {
+ net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n",
+ mh->ip6mh_hdrlen,
+ mip6_mh_len(mh->ip6mh_type));
+ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) +
+ skb_network_header_len(skb));
+ return -1;
+ }
+
+ if (mh->ip6mh_proto != IPPROTO_NONE) {
+ net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n",
+ mh->ip6mh_proto);
+ mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) +
+ skb_network_header_len(skb));
+ return -1;
+ }
+
+ return 0;
+}
+
+struct mip6_report_rate_limiter {
+ spinlock_t lock;
+ struct timeval stamp;
+ int iif;
+ struct in6_addr src;
+ struct in6_addr dst;
+};
+
+static struct mip6_report_rate_limiter mip6_report_rl = {
+ .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock)
+};
+
+static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data;
+ int err = destopt->nexthdr;
+
+ spin_lock(&x->lock);
+ if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
+ !ipv6_addr_any((struct in6_addr *)x->coaddr))
+ err = -ENOENT;
+ spin_unlock(&x->lock);
+
+ return err;
+}
+
+/* Destination Option Header is inserted.
+ * IP Header's src address is replaced with Home Address Option in
+ * Destination Option Header.
+ */
+static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *iph;
+ struct ipv6_destopt_hdr *dstopt;
+ struct ipv6_destopt_hao *hao;
+ u8 nexthdr;
+ int len;
+
+ skb_push(skb, -skb_network_offset(skb));
+ iph = ipv6_hdr(skb);
+
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_DSTOPTS;
+
+ dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb);
+ dstopt->nexthdr = nexthdr;
+
+ hao = mip6_padn((char *)(dstopt + 1),
+ calc_padlen(sizeof(*dstopt), 6));
+
+ hao->type = IPV6_TLV_HAO;
+ BUILD_BUG_ON(sizeof(*hao) != 18);
+ hao->length = sizeof(*hao) - 2;
+
+ len = ((char *)hao - (char *)dstopt) + sizeof(*hao);
+
+ memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr));
+ spin_lock_bh(&x->lock);
+ memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+ spin_unlock_bh(&x->lock);
+
+ WARN_ON(len != x->props.header_len);
+ dstopt->hdrlen = (x->props.header_len >> 3) - 1;
+
+ return 0;
+}
+
+static inline int mip6_report_rl_allow(struct timeval *stamp,
+ const struct in6_addr *dst,
+ const struct in6_addr *src, int iif)
+{
+ int allow = 0;
+
+ spin_lock_bh(&mip6_report_rl.lock);
+ if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec ||
+ mip6_report_rl.stamp.tv_usec != stamp->tv_usec ||
+ mip6_report_rl.iif != iif ||
+ !ipv6_addr_equal(&mip6_report_rl.src, src) ||
+ !ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
+ mip6_report_rl.stamp.tv_sec = stamp->tv_sec;
+ mip6_report_rl.stamp.tv_usec = stamp->tv_usec;
+ mip6_report_rl.iif = iif;
+ mip6_report_rl.src = *src;
+ mip6_report_rl.dst = *dst;
+ allow = 1;
+ }
+ spin_unlock_bh(&mip6_report_rl.lock);
+ return allow;
+}
+
+static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
+ const struct flowi *fl)
+{
+ struct net *net = xs_net(x);
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ const struct flowi6 *fl6 = &fl->u.ip6;
+ struct ipv6_destopt_hao *hao = NULL;
+ struct xfrm_selector sel;
+ int offset;
+ struct timeval stamp;
+ int err = 0;
+
+ if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
+ fl6->fl6_mh_type <= IP6_MH_TYPE_MAX))
+ goto out;
+
+ if (likely(opt->dsthao)) {
+ offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
+ if (likely(offset >= 0))
+ hao = (struct ipv6_destopt_hao *)
+ (skb_network_header(skb) + offset);
+ }
+
+ skb_get_timestamp(skb, &stamp);
+
+ if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr,
+ hao ? &hao->addr : &ipv6_hdr(skb)->saddr,
+ opt->iif))
+ goto out;
+
+ memset(&sel, 0, sizeof(sel));
+ memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+ sizeof(sel.daddr));
+ sel.prefixlen_d = 128;
+ memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
+ sizeof(sel.saddr));
+ sel.prefixlen_s = 128;
+ sel.family = AF_INET6;
+ sel.proto = fl6->flowi6_proto;
+ sel.dport = xfrm_flowi_dport(fl, &fl6->uli);
+ if (sel.dport)
+ sel.dport_mask = htons(~0);
+ sel.sport = xfrm_flowi_sport(fl, &fl6->uli);
+ if (sel.sport)
+ sel.sport_mask = htons(~0);
+ sel.ifindex = fl6->flowi6_oif;
+
+ err = km_report(net, IPPROTO_DSTOPTS, &sel,
+ (hao ? (xfrm_address_t *)&hao->addr : NULL));
+
+ out:
+ return err;
+}
+
+static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
+ u8 **nexthdr)
+{
+ u16 offset = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr =
+ (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+ const unsigned char *nh = skb_network_header(skb);
+ unsigned int packet_len = skb_tail_pointer(skb) -
+ skb_network_header(skb);
+ int found_rhdr = 0;
+
+ *nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+ while (offset + 1 <= packet_len) {
+
+ switch (**nexthdr) {
+ case NEXTHDR_HOP:
+ break;
+ case NEXTHDR_ROUTING:
+ found_rhdr = 1;
+ break;
+ case NEXTHDR_DEST:
+ /*
+ * HAO MUST NOT appear more than once.
+ * XXX: It is better to try to find by the end of
+ * XXX: packet if HAO exists.
+ */
+ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
+ net_dbg_ratelimited("mip6: hao exists already, override\n");
+ return offset;
+ }
+
+ if (found_rhdr)
+ return offset;
+
+ break;
+ default:
+ return offset;
+ }
+
+ offset += ipv6_optlen(exthdr);
+ *nexthdr = &exthdr->nexthdr;
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+ }
+
+ return offset;
+}
+
+static int mip6_destopt_init_state(struct xfrm_state *x)
+{
+ if (x->id.spi) {
+ pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ return -EINVAL;
+ }
+ if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+ pr_info("%s: state's mode is not %u: %u\n",
+ __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ return -EINVAL;
+ }
+
+ x->props.header_len = sizeof(struct ipv6_destopt_hdr) +
+ calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) +
+ sizeof(struct ipv6_destopt_hao);
+ WARN_ON(x->props.header_len != 24);
+
+ return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for
+ * destination options header unlike IPsec protocols.
+ */
+static void mip6_destopt_destroy(struct xfrm_state *x)
+{
+}
+
+static const struct xfrm_type mip6_destopt_type = {
+ .description = "MIP6DESTOPT",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_DSTOPTS,
+ .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
+ .init_state = mip6_destopt_init_state,
+ .destructor = mip6_destopt_destroy,
+ .input = mip6_destopt_input,
+ .output = mip6_destopt_output,
+ .reject = mip6_destopt_reject,
+ .hdr_offset = mip6_destopt_offset,
+};
+
+static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
+ int err = rt2->rt_hdr.nexthdr;
+
+ spin_lock(&x->lock);
+ if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) &&
+ !ipv6_addr_any((struct in6_addr *)x->coaddr))
+ err = -ENOENT;
+ spin_unlock(&x->lock);
+
+ return err;
+}
+
+/* Routing Header type 2 is inserted.
+ * IP Header's dst address is replaced with Routing Header's Home Address.
+ */
+static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *iph;
+ struct rt2_hdr *rt2;
+ u8 nexthdr;
+
+ skb_push(skb, -skb_network_offset(skb));
+ iph = ipv6_hdr(skb);
+
+ nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ROUTING;
+
+ rt2 = (struct rt2_hdr *)skb_transport_header(skb);
+ rt2->rt_hdr.nexthdr = nexthdr;
+ rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1;
+ rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2;
+ rt2->rt_hdr.segments_left = 1;
+ memset(&rt2->reserved, 0, sizeof(rt2->reserved));
+
+ WARN_ON(rt2->rt_hdr.hdrlen != 2);
+
+ memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr));
+ spin_lock_bh(&x->lock);
+ memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+ spin_unlock_bh(&x->lock);
+
+ return 0;
+}
+
+static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
+ u8 **nexthdr)
+{
+ u16 offset = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr =
+ (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+ const unsigned char *nh = skb_network_header(skb);
+ unsigned int packet_len = skb_tail_pointer(skb) -
+ skb_network_header(skb);
+ int found_rhdr = 0;
+
+ *nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+ while (offset + 1 <= packet_len) {
+
+ switch (**nexthdr) {
+ case NEXTHDR_HOP:
+ break;
+ case NEXTHDR_ROUTING:
+ if (offset + 3 <= packet_len) {
+ struct ipv6_rt_hdr *rt;
+ rt = (struct ipv6_rt_hdr *)(nh + offset);
+ if (rt->type != 0)
+ return offset;
+ }
+ found_rhdr = 1;
+ break;
+ case NEXTHDR_DEST:
+ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+ return offset;
+
+ if (found_rhdr)
+ return offset;
+
+ break;
+ default:
+ return offset;
+ }
+
+ offset += ipv6_optlen(exthdr);
+ *nexthdr = &exthdr->nexthdr;
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+ }
+
+ return offset;
+}
+
+static int mip6_rthdr_init_state(struct xfrm_state *x)
+{
+ if (x->id.spi) {
+ pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ return -EINVAL;
+ }
+ if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+ pr_info("%s: state's mode is not %u: %u\n",
+ __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ return -EINVAL;
+ }
+
+ x->props.header_len = sizeof(struct rt2_hdr);
+
+ return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for routing
+ * header type 2 unlike IPsec protocols.
+ */
+static void mip6_rthdr_destroy(struct xfrm_state *x)
+{
+}
+
+static const struct xfrm_type mip6_rthdr_type = {
+ .description = "MIP6RT",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ROUTING,
+ .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
+ .init_state = mip6_rthdr_init_state,
+ .destructor = mip6_rthdr_destroy,
+ .input = mip6_rthdr_input,
+ .output = mip6_rthdr_output,
+ .hdr_offset = mip6_rthdr_offset,
+};
+
+static int __init mip6_init(void)
+{
+ pr_info("Mobile IPv6\n");
+
+ if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type(destopt)\n", __func__);
+ goto mip6_destopt_xfrm_fail;
+ }
+ if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
+ pr_info("%s: can't add xfrm type(rthdr)\n", __func__);
+ goto mip6_rthdr_xfrm_fail;
+ }
+ if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
+ pr_info("%s: can't add rawv6 mh filter\n", __func__);
+ goto mip6_rawv6_mh_fail;
+ }
+
+
+ return 0;
+
+ mip6_rawv6_mh_fail:
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ mip6_rthdr_xfrm_fail:
+ xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
+ mip6_destopt_xfrm_fail:
+ return -EAGAIN;
+}
+
+static void __exit mip6_fini(void)
+{
+ if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
+ pr_info("%s: can't remove rawv6 mh filter\n", __func__);
+ if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
+ pr_info("%s: can't remove xfrm type(rthdr)\n", __func__);
+ if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
+ pr_info("%s: can't remove xfrm type(destopt)\n", __func__);
+}
+
+module_init(mip6_init);
+module_exit(mip6_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS);
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
new file mode 100644
index 000000000..96f153c08
--- /dev/null
+++ b/net/ipv6/ndisc.c
@@ -0,0 +1,1819 @@
+/*
+ * Neighbour Discovery for IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Mike Shaver <shaver@ingenia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *
+ * Alexey I. Froloff : RFC6106 (DNSSL) support
+ * Pierre Ynard : export userland ND options
+ * through netlink (RDNSS support)
+ * Lars Fenneberg : fixed MTU setting on receipt
+ * of an RA.
+ * Janos Farkas : kmalloc failure checks
+ * Alexey Kuznetsov : state machine reworked
+ * and moved to net/core.
+ * Pekka Savola : RFC2461 validation
+ * YOSHIFUJI Hideaki @USAGI : Verify ND options properly
+ */
+
+#define pr_fmt(fmt) "ICMPv6: " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/sched.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/jhash.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/icmp.h>
+
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <net/flow.h>
+#include <net/ip6_checksum.h>
+#include <net/inet_common.h>
+#include <linux/proc_fs.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+/* Set to 3 to get tracing... */
+#define ND_DEBUG 1
+
+#define ND_PRINTK(val, level, fmt, ...) \
+do { \
+ if (val <= ND_DEBUG) \
+ net_##level##_ratelimited(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd);
+static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
+static int ndisc_constructor(struct neighbour *neigh);
+static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static int pndisc_constructor(struct pneigh_entry *n);
+static void pndisc_destructor(struct pneigh_entry *n);
+static void pndisc_redo(struct sk_buff *skb);
+
+static const struct neigh_ops ndisc_generic_ops = {
+ .family = AF_INET6,
+ .solicit = ndisc_solicit,
+ .error_report = ndisc_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_connected_output,
+};
+
+static const struct neigh_ops ndisc_hh_ops = {
+ .family = AF_INET6,
+ .solicit = ndisc_solicit,
+ .error_report = ndisc_error_report,
+ .output = neigh_resolve_output,
+ .connected_output = neigh_resolve_output,
+};
+
+
+static const struct neigh_ops ndisc_direct_ops = {
+ .family = AF_INET6,
+ .output = neigh_direct_output,
+ .connected_output = neigh_direct_output,
+};
+
+struct neigh_table nd_tbl = {
+ .family = AF_INET6,
+ .key_len = sizeof(struct in6_addr),
+ .protocol = cpu_to_be16(ETH_P_IPV6),
+ .hash = ndisc_hash,
+ .key_eq = ndisc_key_eq,
+ .constructor = ndisc_constructor,
+ .pconstructor = pndisc_constructor,
+ .pdestructor = pndisc_destructor,
+ .proxy_redo = pndisc_redo,
+ .id = "ndisc_cache",
+ .parms = {
+ .tbl = &nd_tbl,
+ .reachable_time = ND_REACHABLE_TIME,
+ .data = {
+ [NEIGH_VAR_MCAST_PROBES] = 3,
+ [NEIGH_VAR_UCAST_PROBES] = 3,
+ [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
+ [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+ [NEIGH_VAR_PROXY_QLEN] = 64,
+ [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
+ [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
+ },
+ },
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
+};
+
+static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
+{
+ int pad = ndisc_addr_option_pad(skb->dev->type);
+ int data_len = skb->dev->addr_len;
+ int space = ndisc_opt_addr_space(skb->dev);
+ u8 *opt = skb_put(skb, space);
+
+ opt[0] = type;
+ opt[1] = space>>3;
+
+ memset(opt + 2, 0, pad);
+ opt += pad;
+ space -= pad;
+
+ memcpy(opt+2, data, data_len);
+ data_len += 2;
+ opt += data_len;
+ space -= data_len;
+ if (space > 0)
+ memset(opt, 0, space);
+}
+
+static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
+ struct nd_opt_hdr *end)
+{
+ int type;
+ if (!cur || !end || cur >= end)
+ return NULL;
+ type = cur->nd_opt_type;
+ do {
+ cur = ((void *)cur) + (cur->nd_opt_len << 3);
+ } while (cur < end && cur->nd_opt_type != type);
+ return cur <= end && cur->nd_opt_type == type ? cur : NULL;
+}
+
+static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+{
+ return opt->nd_opt_type == ND_OPT_RDNSS ||
+ opt->nd_opt_type == ND_OPT_DNSSL;
+}
+
+static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+ struct nd_opt_hdr *end)
+{
+ if (!cur || !end || cur >= end)
+ return NULL;
+ do {
+ cur = ((void *)cur) + (cur->nd_opt_len << 3);
+ } while (cur < end && !ndisc_is_useropt(cur));
+ return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+}
+
+struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+ struct ndisc_options *ndopts)
+{
+ struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
+
+ if (!nd_opt || opt_len < 0 || !ndopts)
+ return NULL;
+ memset(ndopts, 0, sizeof(*ndopts));
+ while (opt_len) {
+ int l;
+ if (opt_len < sizeof(struct nd_opt_hdr))
+ return NULL;
+ l = nd_opt->nd_opt_len << 3;
+ if (opt_len < l || l == 0)
+ return NULL;
+ switch (nd_opt->nd_opt_type) {
+ case ND_OPT_SOURCE_LL_ADDR:
+ case ND_OPT_TARGET_LL_ADDR:
+ case ND_OPT_MTU:
+ case ND_OPT_REDIRECT_HDR:
+ if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+ ND_PRINTK(2, warn,
+ "%s: duplicated ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
+ } else {
+ ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+ }
+ break;
+ case ND_OPT_PREFIX_INFO:
+ ndopts->nd_opts_pi_end = nd_opt;
+ if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
+ ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+ break;
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ case ND_OPT_ROUTE_INFO:
+ ndopts->nd_opts_ri_end = nd_opt;
+ if (!ndopts->nd_opts_ri)
+ ndopts->nd_opts_ri = nd_opt;
+ break;
+#endif
+ default:
+ if (ndisc_is_useropt(nd_opt)) {
+ ndopts->nd_useropts_end = nd_opt;
+ if (!ndopts->nd_useropts)
+ ndopts->nd_useropts = nd_opt;
+ } else {
+ /*
+ * Unknown options must be silently ignored,
+ * to accommodate future extension to the
+ * protocol.
+ */
+ ND_PRINTK(2, notice,
+ "%s: ignored unsupported option; type=%d, len=%d\n",
+ __func__,
+ nd_opt->nd_opt_type,
+ nd_opt->nd_opt_len);
+ }
+ }
+ opt_len -= l;
+ nd_opt = ((void *)nd_opt) + l;
+ }
+ return ndopts;
+}
+
+int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
+{
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */
+ case ARPHRD_FDDI:
+ ipv6_eth_mc_map(addr, buf);
+ return 0;
+ case ARPHRD_ARCNET:
+ ipv6_arcnet_mc_map(addr, buf);
+ return 0;
+ case ARPHRD_INFINIBAND:
+ ipv6_ib_mc_map(addr, dev->broadcast, buf);
+ return 0;
+ case ARPHRD_IPGRE:
+ return ipv6_ipgre_mc_map(addr, dev->broadcast, buf);
+ default:
+ if (dir) {
+ memcpy(buf, dev->broadcast, dev->addr_len);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ndisc_mc_map);
+
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 *hash_rnd)
+{
+ return ndisc_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool ndisc_key_eq(const struct neighbour *n, const void *pkey)
+{
+ return neigh_key_eq128(n, pkey);
+}
+
+static int ndisc_constructor(struct neighbour *neigh)
+{
+ struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct inet6_dev *in6_dev;
+ struct neigh_parms *parms;
+ bool is_multicast = ipv6_addr_is_multicast(addr);
+
+ in6_dev = in6_dev_get(dev);
+ if (!in6_dev) {
+ return -EINVAL;
+ }
+
+ parms = in6_dev->nd_parms;
+ __neigh_parms_put(neigh->parms);
+ neigh->parms = neigh_parms_clone(parms);
+
+ neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
+ if (!dev->header_ops) {
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &ndisc_direct_ops;
+ neigh->output = neigh_direct_output;
+ } else {
+ if (is_multicast) {
+ neigh->nud_state = NUD_NOARP;
+ ndisc_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ if (dev->flags&IFF_LOOPBACK)
+ neigh->type = RTN_LOCAL;
+ } else if (dev->flags&IFF_POINTOPOINT) {
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+ if (dev->header_ops->cache)
+ neigh->ops = &ndisc_hh_ops;
+ else
+ neigh->ops = &ndisc_generic_ops;
+ if (neigh->nud_state&NUD_VALID)
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ in6_dev_put(in6_dev);
+ return 0;
+}
+
+static int pndisc_constructor(struct pneigh_entry *n)
+{
+ struct in6_addr *addr = (struct in6_addr *)&n->key;
+ struct in6_addr maddr;
+ struct net_device *dev = n->dev;
+
+ if (!dev || !__in6_dev_get(dev))
+ return -EINVAL;
+ addrconf_addr_solict_mult(addr, &maddr);
+ ipv6_dev_mc_inc(dev, &maddr);
+ return 0;
+}
+
+static void pndisc_destructor(struct pneigh_entry *n)
+{
+ struct in6_addr *addr = (struct in6_addr *)&n->key;
+ struct in6_addr maddr;
+ struct net_device *dev = n->dev;
+
+ if (!dev || !__in6_dev_get(dev))
+ return;
+ addrconf_addr_solict_mult(addr, &maddr);
+ ipv6_dev_mc_dec(dev, &maddr);
+}
+
+static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
+ int len)
+{
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
+ if (!skb) {
+ ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
+ __func__);
+ return NULL;
+ }
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
+ skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+
+ /* Manually assign socket ownership as we avoid calling
+ * sock_alloc_send_pskb() to bypass wmem buffer limits
+ */
+ skb_set_owner_w(skb, sk);
+
+ return skb;
+}
+
+static void ip6_nd_hdr(struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ int hop_limit, int len)
+{
+ struct ipv6hdr *hdr;
+
+ skb_push(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
+
+ ip6_flow_hdr(hdr, 0, 0);
+
+ hdr->payload_len = htons(len);
+ hdr->nexthdr = IPPROTO_ICMPV6;
+ hdr->hop_limit = hop_limit;
+
+ hdr->saddr = *saddr;
+ hdr->daddr = *daddr;
+}
+
+static void ndisc_send_skb(struct sk_buff *skb,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk = net->ipv6.ndisc_sk;
+ struct inet6_dev *idev;
+ int err;
+ struct icmp6hdr *icmp6h = icmp6_hdr(skb);
+ u8 type;
+
+ type = icmp6h->icmp6_type;
+
+ if (!dst) {
+ struct flowi6 fl6;
+
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr, skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ skb_dst_set(skb, dst);
+ }
+
+ icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len,
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ skb->len, 0));
+
+ ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, dst->dev,
+ dst_output_sk);
+ if (!err) {
+ ICMP6MSGOUT_INC_STATS(net, idev, type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ }
+
+ rcu_read_unlock();
+}
+
+void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+ const struct in6_addr *daddr,
+ const struct in6_addr *solicited_addr,
+ bool router, bool solicited, bool override, bool inc_opt)
+{
+ struct sk_buff *skb;
+ struct in6_addr tmpaddr;
+ struct inet6_ifaddr *ifp;
+ const struct in6_addr *src_addr;
+ struct nd_msg *msg;
+ int optlen = 0;
+
+ /* for anycast or proxy, solicited_addr != src_addr */
+ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1);
+ if (ifp) {
+ src_addr = solicited_addr;
+ if (ifp->flags & IFA_F_OPTIMISTIC)
+ override = false;
+ inc_opt |= ifp->idev->cnf.force_tllao;
+ in6_ifa_put(ifp);
+ } else {
+ if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
+ inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs,
+ &tmpaddr))
+ return;
+ src_addr = &tmpaddr;
+ }
+
+ if (!dev->addr_len)
+ inc_opt = 0;
+ if (inc_opt)
+ optlen += ndisc_opt_addr_space(dev);
+
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
+ return;
+
+ msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
+ *msg = (struct nd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
+ .icmp6_router = router,
+ .icmp6_solicited = solicited,
+ .icmp6_override = override,
+ },
+ .target = *solicited_addr,
+ };
+
+ if (inc_opt)
+ ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
+ dev->dev_addr);
+
+
+ ndisc_send_skb(skb, daddr, src_addr);
+}
+
+static void ndisc_send_unsol_na(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+
+ idev = in6_dev_get(dev);
+ if (!idev)
+ return;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &ifa->addr,
+ /*router=*/ !!idev->cnf.forwarding,
+ /*solicited=*/ false, /*override=*/ true,
+ /*inc_opt=*/ true);
+ }
+ read_unlock_bh(&idev->lock);
+
+ in6_dev_put(idev);
+}
+
+void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
+ const struct in6_addr *solicit,
+ const struct in6_addr *daddr, const struct in6_addr *saddr)
+{
+ struct sk_buff *skb;
+ struct in6_addr addr_buf;
+ int inc_opt = dev->addr_len;
+ int optlen = 0;
+ struct nd_msg *msg;
+
+ if (!saddr) {
+ if (ipv6_get_lladdr(dev, &addr_buf,
+ (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
+ return;
+ saddr = &addr_buf;
+ }
+
+ if (ipv6_addr_any(saddr))
+ inc_opt = false;
+ if (inc_opt)
+ optlen += ndisc_opt_addr_space(dev);
+
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
+ return;
+
+ msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
+ *msg = (struct nd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
+ },
+ .target = *solicit,
+ };
+
+ if (inc_opt)
+ ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
+ dev->dev_addr);
+
+ ndisc_send_skb(skb, daddr, saddr);
+}
+
+void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ struct sk_buff *skb;
+ struct rs_msg *msg;
+ int send_sllao = dev->addr_len;
+ int optlen = 0;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+ /*
+ * According to section 2.2 of RFC 4429, we must not
+ * send router solicitations with a sllao from
+ * optimistic addresses, but we may send the solicitation
+ * if we don't include the sllao. So here we check
+ * if our address is optimistic, and if so, we
+ * suppress the inclusion of the sllao.
+ */
+ if (send_sllao) {
+ struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr,
+ dev, 1);
+ if (ifp) {
+ if (ifp->flags & IFA_F_OPTIMISTIC) {
+ send_sllao = 0;
+ }
+ in6_ifa_put(ifp);
+ } else {
+ send_sllao = 0;
+ }
+ }
+#endif
+ if (send_sllao)
+ optlen += ndisc_opt_addr_space(dev);
+
+ skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!skb)
+ return;
+
+ msg = (struct rs_msg *)skb_put(skb, sizeof(*msg));
+ *msg = (struct rs_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_ROUTER_SOLICITATION,
+ },
+ };
+
+ if (send_sllao)
+ ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
+ dev->dev_addr);
+
+ ndisc_send_skb(skb, daddr, saddr);
+}
+
+
+static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ /*
+ * "The sender MUST return an ICMP
+ * destination unreachable"
+ */
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+/* Called with locked neigh: either read or both */
+
+static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+ struct in6_addr *saddr = NULL;
+ struct in6_addr mcaddr;
+ struct net_device *dev = neigh->dev;
+ struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
+ int probes = atomic_read(&neigh->probes);
+
+ if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
+ dev, 1,
+ IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
+ saddr = &ipv6_hdr(skb)->saddr;
+ probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID)) {
+ ND_PRINTK(1, dbg,
+ "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
+ __func__, target);
+ }
+ ndisc_send_ns(dev, neigh, target, target, saddr);
+ } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
+ neigh_app_ns(neigh);
+ } else {
+ addrconf_addr_solict_mult(target, &mcaddr);
+ ndisc_send_ns(dev, NULL, target, &mcaddr, saddr);
+ }
+}
+
+static int pndisc_is_router(const void *pkey,
+ struct net_device *dev)
+{
+ struct pneigh_entry *n;
+ int ret = -1;
+
+ read_lock_bh(&nd_tbl.lock);
+ n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
+ if (n)
+ ret = !!(n->flags & NTF_ROUTER);
+ read_unlock_bh(&nd_tbl.lock);
+
+ return ret;
+}
+
+static void ndisc_recv_ns(struct sk_buff *skb)
+{
+ struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+ const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+ const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+ u8 *lladdr = NULL;
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct nd_msg, opt));
+ struct ndisc_options ndopts;
+ struct net_device *dev = skb->dev;
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *idev = NULL;
+ struct neighbour *neigh;
+ int dad = ipv6_addr_any(saddr);
+ bool inc;
+ int is_router = -1;
+
+ if (skb->len < sizeof(struct nd_msg)) {
+ ND_PRINTK(2, warn, "NS: packet too short\n");
+ return;
+ }
+
+ if (ipv6_addr_is_multicast(&msg->target)) {
+ ND_PRINTK(2, warn, "NS: multicast target address\n");
+ return;
+ }
+
+ /*
+ * RFC2461 7.1.1:
+ * DAD has to be destined for solicited node multicast address.
+ */
+ if (dad && !ipv6_addr_is_solict_mult(daddr)) {
+ ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
+ return;
+ }
+
+ if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ ND_PRINTK(2, warn, "NS: invalid ND options\n");
+ return;
+ }
+
+ if (ndopts.nd_opts_src_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
+ if (!lladdr) {
+ ND_PRINTK(2, warn,
+ "NS: invalid link-layer address length\n");
+ return;
+ }
+
+ /* RFC2461 7.1.1:
+ * If the IP source address is the unspecified address,
+ * there MUST NOT be source link-layer address option
+ * in the message.
+ */
+ if (dad) {
+ ND_PRINTK(2, warn,
+ "NS: bad DAD packet (link-layer address option)\n");
+ return;
+ }
+ }
+
+ inc = ipv6_addr_is_multicast(daddr);
+
+ ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+ if (ifp) {
+
+ if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
+ if (dad) {
+ /*
+ * We are colliding with another node
+ * who is doing DAD
+ * so fail our DAD process
+ */
+ addrconf_dad_failure(ifp);
+ return;
+ } else {
+ /*
+ * This is not a dad solicitation.
+ * If we are an optimistic node,
+ * we should respond.
+ * Otherwise, we should ignore it.
+ */
+ if (!(ifp->flags & IFA_F_OPTIMISTIC))
+ goto out;
+ }
+ }
+
+ idev = ifp->idev;
+ } else {
+ struct net *net = dev_net(dev);
+
+ idev = in6_dev_get(dev);
+ if (!idev) {
+ /* XXX: count this drop? */
+ return;
+ }
+
+ if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
+ (idev->cnf.forwarding &&
+ (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) &&
+ (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
+ if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
+ skb->pkt_type != PACKET_HOST &&
+ inc &&
+ NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
+ /*
+ * for anycast or proxy,
+ * sender should delay its response
+ * by a random time between 0 and
+ * MAX_ANYCAST_DELAY_TIME seconds.
+ * (RFC2461) -- yoshfuji
+ */
+ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
+ if (n)
+ pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
+ goto out;
+ }
+ } else
+ goto out;
+ }
+
+ if (is_router < 0)
+ is_router = idev->cnf.forwarding;
+
+ if (dad) {
+ ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target,
+ !!is_router, false, (ifp != NULL), true);
+ goto out;
+ }
+
+ if (inc)
+ NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
+ else
+ NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);
+
+ /*
+ * update / create cache entry
+ * for the source address
+ */
+ neigh = __neigh_lookup(&nd_tbl, saddr, dev,
+ !inc || lladdr || !dev->addr_len);
+ if (neigh)
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE);
+ if (neigh || !dev->header_ops) {
+ ndisc_send_na(dev, neigh, saddr, &msg->target,
+ !!is_router,
+ true, (ifp != NULL && inc), inc);
+ if (neigh)
+ neigh_release(neigh);
+ }
+
+out:
+ if (ifp)
+ in6_ifa_put(ifp);
+ else
+ in6_dev_put(idev);
+}
+
+static void ndisc_recv_na(struct sk_buff *skb)
+{
+ struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+ struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+ const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+ u8 *lladdr = NULL;
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct nd_msg, opt));
+ struct ndisc_options ndopts;
+ struct net_device *dev = skb->dev;
+ struct inet6_ifaddr *ifp;
+ struct neighbour *neigh;
+
+ if (skb->len < sizeof(struct nd_msg)) {
+ ND_PRINTK(2, warn, "NA: packet too short\n");
+ return;
+ }
+
+ if (ipv6_addr_is_multicast(&msg->target)) {
+ ND_PRINTK(2, warn, "NA: target address is multicast\n");
+ return;
+ }
+
+ if (ipv6_addr_is_multicast(daddr) &&
+ msg->icmph.icmp6_solicited) {
+ ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
+ return;
+ }
+
+ if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+ ND_PRINTK(2, warn, "NS: invalid ND option\n");
+ return;
+ }
+ if (ndopts.nd_opts_tgt_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
+ if (!lladdr) {
+ ND_PRINTK(2, warn,
+ "NA: invalid link-layer address length\n");
+ return;
+ }
+ }
+ ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+ if (ifp) {
+ if (skb->pkt_type != PACKET_LOOPBACK
+ && (ifp->flags & IFA_F_TENTATIVE)) {
+ addrconf_dad_failure(ifp);
+ return;
+ }
+ /* What should we make now? The advertisement
+ is invalid, but ndisc specs say nothing
+ about it. It could be misconfiguration, or
+ an smart proxy agent tries to help us :-)
+
+ We should not print the error if NA has been
+ received from loopback - it is just our own
+ unsolicited advertisement.
+ */
+ if (skb->pkt_type != PACKET_LOOPBACK)
+ ND_PRINTK(1, warn,
+ "NA: someone advertises our address %pI6 on %s!\n",
+ &ifp->addr, ifp->idev->dev->name);
+ in6_ifa_put(ifp);
+ return;
+ }
+ neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
+
+ if (neigh) {
+ u8 old_flags = neigh->flags;
+ struct net *net = dev_net(dev);
+
+ if (neigh->nud_state & NUD_FAILED)
+ goto out;
+
+ /*
+ * Don't update the neighbor cache entry on a proxy NA from
+ * ourselves because either the proxied node is off link or it
+ * has already sent a NA to us.
+ */
+ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
+ net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp &&
+ pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
+ /* XXX: idev->cnf.proxy_ndp */
+ goto out;
+ }
+
+ neigh_update(neigh, lladdr,
+ msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+ (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+
+ if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
+ /*
+ * Change: router to host
+ */
+ rt6_clean_tohost(dev_net(dev), saddr);
+ }
+
+out:
+ neigh_release(neigh);
+ }
+}
+
+static void ndisc_recv_rs(struct sk_buff *skb)
+{
+ struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
+ unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
+ struct neighbour *neigh;
+ struct inet6_dev *idev;
+ const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+ struct ndisc_options ndopts;
+ u8 *lladdr = NULL;
+
+ if (skb->len < sizeof(*rs_msg))
+ return;
+
+ idev = __in6_dev_get(skb->dev);
+ if (!idev) {
+ ND_PRINTK(1, err, "RS: can't find in6 device\n");
+ return;
+ }
+
+ /* Don't accept RS if we're not in router mode */
+ if (!idev->cnf.forwarding)
+ goto out;
+
+ /*
+ * Don't update NCE if src = ::;
+ * this implies that the source node has no ip address assigned yet.
+ */
+ if (ipv6_addr_any(saddr))
+ goto out;
+
+ /* Parse ND options */
+ if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+ ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
+ goto out;
+ }
+
+ if (ndopts.nd_opts_src_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
+ skb->dev);
+ if (!lladdr)
+ goto out;
+ }
+
+ neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
+ if (neigh) {
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+ neigh_release(neigh);
+ }
+out:
+ return;
+}
+
+static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
+{
+ struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ struct nduseroptmsg *ndmsg;
+ struct net *net = dev_net(ra->dev);
+ int err;
+ int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
+ + (opt->nd_opt_len << 3));
+ size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr));
+
+ skb = nlmsg_new(msg_size, GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
+ if (!nlh) {
+ goto nla_put_failure;
+ }
+
+ ndmsg = nlmsg_data(nlh);
+ ndmsg->nduseropt_family = AF_INET6;
+ ndmsg->nduseropt_ifindex = ra->dev->ifindex;
+ ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
+ ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
+ ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;
+
+ memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);
+
+ if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_free(skb);
+ err = -EMSGSIZE;
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
+}
+
+static void ndisc_router_discovery(struct sk_buff *skb)
+{
+ struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
+ struct neighbour *neigh = NULL;
+ struct inet6_dev *in6_dev;
+ struct rt6_info *rt = NULL;
+ int lifetime;
+ struct ndisc_options ndopts;
+ int optlen;
+ unsigned int pref = 0;
+
+ __u8 *opt = (__u8 *)(ra_msg + 1);
+
+ optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
+ sizeof(struct ra_msg);
+
+ ND_PRINTK(2, info,
+ "RA: %s, dev: %s\n",
+ __func__, skb->dev->name);
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ ND_PRINTK(2, warn, "RA: source address is not link-local\n");
+ return;
+ }
+ if (optlen < 0) {
+ ND_PRINTK(2, warn, "RA: packet too short\n");
+ return;
+ }
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
+ ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
+ return;
+ }
+#endif
+
+ /*
+ * set the RA_RECV flag in the interface
+ */
+
+ in6_dev = __in6_dev_get(skb->dev);
+ if (!in6_dev) {
+ ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
+ skb->dev->name);
+ return;
+ }
+
+ if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+ ND_PRINTK(2, warn, "RA: invalid ND options\n");
+ return;
+ }
+
+ if (!ipv6_accept_ra(in6_dev)) {
+ ND_PRINTK(2, info,
+ "RA: %s, did not accept ra for dev: %s\n",
+ __func__, skb->dev->name);
+ goto skip_linkparms;
+ }
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ /* skip link-specific parameters from interior routers */
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
+ ND_PRINTK(2, info,
+ "RA: %s, nodetype is NODEFAULT, dev: %s\n",
+ __func__, skb->dev->name);
+ goto skip_linkparms;
+ }
+#endif
+
+ if (in6_dev->if_flags & IF_RS_SENT) {
+ /*
+ * flag that an RA was received after an RS was sent
+ * out on this interface.
+ */
+ in6_dev->if_flags |= IF_RA_RCVD;
+ }
+
+ /*
+ * Remember the managed/otherconf flags from most recently
+ * received RA message (RFC 2462) -- yoshfuji
+ */
+ in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
+ IF_RA_OTHERCONF)) |
+ (ra_msg->icmph.icmp6_addrconf_managed ?
+ IF_RA_MANAGED : 0) |
+ (ra_msg->icmph.icmp6_addrconf_other ?
+ IF_RA_OTHERCONF : 0);
+
+ if (!in6_dev->cnf.accept_ra_defrtr) {
+ ND_PRINTK(2, info,
+ "RA: %s, defrtr is false for dev: %s\n",
+ __func__, skb->dev->name);
+ goto skip_defrtr;
+ }
+
+ /* Do not accept RA with source-addr found on local machine unless
+ * accept_ra_from_local is set to true.
+ */
+ if (!in6_dev->cnf.accept_ra_from_local &&
+ ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
+ NULL, 0)) {
+ ND_PRINTK(2, info,
+ "RA from local address detected on dev: %s: default router ignored\n",
+ skb->dev->name);
+ goto skip_defrtr;
+ }
+
+ lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ pref = ra_msg->icmph.icmp6_router_pref;
+ /* 10b is handled as if it were 00b (medium) */
+ if (pref == ICMPV6_ROUTER_PREF_INVALID ||
+ !in6_dev->cnf.accept_ra_rtr_pref)
+ pref = ICMPV6_ROUTER_PREF_MEDIUM;
+#endif
+
+ rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
+
+ if (rt) {
+ neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ if (!neigh) {
+ ND_PRINTK(0, err,
+ "RA: %s got default router without neighbour\n",
+ __func__);
+ ip6_rt_put(rt);
+ return;
+ }
+ }
+ if (rt && lifetime == 0) {
+ ip6_del_rt(rt);
+ rt = NULL;
+ }
+
+ ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n",
+ rt, lifetime, skb->dev->name);
+ if (!rt && lifetime) {
+ ND_PRINTK(3, info, "RA: adding default router\n");
+
+ rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref);
+ if (!rt) {
+ ND_PRINTK(0, err,
+ "RA: %s failed to add default route\n",
+ __func__);
+ return;
+ }
+
+ neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+ if (!neigh) {
+ ND_PRINTK(0, err,
+ "RA: %s got default router without neighbour\n",
+ __func__);
+ ip6_rt_put(rt);
+ return;
+ }
+ neigh->flags |= NTF_ROUTER;
+ } else if (rt) {
+ rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ }
+
+ if (rt)
+ rt6_set_expires(rt, jiffies + (HZ * lifetime));
+ if (ra_msg->icmph.icmp6_hop_limit) {
+ /* Only set hop_limit on the interface if it is higher than
+ * the current hop_limit.
+ */
+ if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+ in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ } else {
+ ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+ }
+ if (rt)
+ dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+ ra_msg->icmph.icmp6_hop_limit);
+ }
+
+skip_defrtr:
+
+ /*
+ * Update Reachable Time and Retrans Timer
+ */
+
+ if (in6_dev->nd_parms) {
+ unsigned long rtime = ntohl(ra_msg->retrans_timer);
+
+ if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
+ rtime = (rtime*HZ)/1000;
+ if (rtime < HZ/10)
+ rtime = HZ/10;
+ NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
+ in6_dev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ }
+
+ rtime = ntohl(ra_msg->reachable_time);
+ if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
+ rtime = (rtime*HZ)/1000;
+
+ if (rtime < HZ/10)
+ rtime = HZ/10;
+
+ if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
+ NEIGH_VAR_SET(in6_dev->nd_parms,
+ BASE_REACHABLE_TIME, rtime);
+ NEIGH_VAR_SET(in6_dev->nd_parms,
+ GC_STALETIME, 3 * rtime);
+ in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
+ in6_dev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+ }
+ }
+ }
+
+skip_linkparms:
+
+ /*
+ * Process options.
+ */
+
+ if (!neigh)
+ neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
+ skb->dev, 1);
+ if (neigh) {
+ u8 *lladdr = NULL;
+ if (ndopts.nd_opts_src_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
+ skb->dev);
+ if (!lladdr) {
+ ND_PRINTK(2, warn,
+ "RA: invalid link-layer address length\n");
+ goto out;
+ }
+ }
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+ NEIGH_UPDATE_F_ISROUTER);
+ }
+
+ if (!ipv6_accept_ra(in6_dev)) {
+ ND_PRINTK(2, info,
+ "RA: %s, accept_ra is false for dev: %s\n",
+ __func__, skb->dev->name);
+ goto out;
+ }
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+ if (!in6_dev->cnf.accept_ra_from_local &&
+ ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
+ NULL, 0)) {
+ ND_PRINTK(2, info,
+ "RA from local address detected on dev: %s: router info ignored.\n",
+ skb->dev->name);
+ goto skip_routeinfo;
+ }
+
+ if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) {
+ struct nd_opt_hdr *p;
+ for (p = ndopts.nd_opts_ri;
+ p;
+ p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
+ struct route_info *ri = (struct route_info *)p;
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
+ ri->prefix_len == 0)
+ continue;
+#endif
+ if (ri->prefix_len == 0 &&
+ !in6_dev->cnf.accept_ra_defrtr)
+ continue;
+ if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
+ continue;
+ rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
+ &ipv6_hdr(skb)->saddr);
+ }
+ }
+
+skip_routeinfo:
+#endif
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ /* skip link-specific ndopts from interior routers */
+ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
+ ND_PRINTK(2, info,
+ "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
+ __func__, skb->dev->name);
+ goto out;
+ }
+#endif
+
+ if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
+ struct nd_opt_hdr *p;
+ for (p = ndopts.nd_opts_pi;
+ p;
+ p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
+ addrconf_prefix_rcv(skb->dev, (u8 *)p,
+ (p->nd_opt_len) << 3,
+ ndopts.nd_opts_src_lladdr != NULL);
+ }
+ }
+
+ if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) {
+ __be32 n;
+ u32 mtu;
+
+ memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
+ mtu = ntohl(n);
+
+ if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
+ ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
+ } else if (in6_dev->cnf.mtu6 != mtu) {
+ in6_dev->cnf.mtu6 = mtu;
+
+ if (rt)
+ dst_metric_set(&rt->dst, RTAX_MTU, mtu);
+
+ rt6_mtu_change(skb->dev, mtu);
+ }
+ }
+
+ if (ndopts.nd_useropts) {
+ struct nd_opt_hdr *p;
+ for (p = ndopts.nd_useropts;
+ p;
+ p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+ ndisc_ra_useropt(skb, p);
+ }
+ }
+
+ if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
+ ND_PRINTK(2, warn, "RA: invalid RA options\n");
+ }
+out:
+ ip6_rt_put(rt);
+ if (neigh)
+ neigh_release(neigh);
+}
+
+static void ndisc_redirect_rcv(struct sk_buff *skb)
+{
+ u8 *hdr;
+ struct ndisc_options ndopts;
+ struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
+ u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
+ offsetof(struct rd_msg, opt));
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+ switch (skb->ndisc_nodetype) {
+ case NDISC_NODETYPE_HOST:
+ case NDISC_NODETYPE_NODEFAULT:
+ ND_PRINTK(2, warn,
+ "Redirect: from host or unauthorized router\n");
+ return;
+ }
+#endif
+
+ if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ ND_PRINTK(2, warn,
+ "Redirect: source address is not link-local\n");
+ return;
+ }
+
+ if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts))
+ return;
+
+ if (!ndopts.nd_opts_rh) {
+ ip6_redirect_no_header(skb, dev_net(skb->dev),
+ skb->dev->ifindex, 0);
+ return;
+ }
+
+ hdr = (u8 *)ndopts.nd_opts_rh;
+ hdr += 8;
+ if (!pskb_pull(skb, hdr - skb_transport_header(skb)))
+ return;
+
+ icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
+}
+
+static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
+ struct sk_buff *orig_skb,
+ int rd_len)
+{
+ u8 *opt = skb_put(skb, rd_len);
+
+ memset(opt, 0, 8);
+ *(opt++) = ND_OPT_REDIRECT_HDR;
+ *(opt++) = (rd_len >> 3);
+ opt += 6;
+
+ memcpy(opt, ipv6_hdr(orig_skb), rd_len - 8);
+}
+
+void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
+{
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+ struct sock *sk = net->ipv6.ndisc_sk;
+ int optlen = 0;
+ struct inet_peer *peer;
+ struct sk_buff *buff;
+ struct rd_msg *msg;
+ struct in6_addr saddr_buf;
+ struct rt6_info *rt;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int rd_len;
+ u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+ bool ret;
+
+ if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
+ ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
+ dev->name);
+ return;
+ }
+
+ if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
+ ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+ ND_PRINTK(2, warn,
+ "Redirect: target address is not link-local unicast\n");
+ return;
+ }
+
+ icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
+ &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return;
+ }
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ return;
+
+ rt = (struct rt6_info *) dst;
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ ND_PRINTK(2, warn,
+ "Redirect: destination is not a neighbour\n");
+ goto release;
+ }
+ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
+ ret = inet_peer_xrlim_allow(peer, 1*HZ);
+ if (peer)
+ inet_putpeer(peer);
+ if (!ret)
+ goto release;
+
+ if (dev->addr_len) {
+ struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
+ if (!neigh) {
+ ND_PRINTK(2, warn,
+ "Redirect: no neigh for target address\n");
+ goto release;
+ }
+
+ read_lock_bh(&neigh->lock);
+ if (neigh->nud_state & NUD_VALID) {
+ memcpy(ha_buf, neigh->ha, dev->addr_len);
+ read_unlock_bh(&neigh->lock);
+ ha = ha_buf;
+ optlen += ndisc_opt_addr_space(dev);
+ } else
+ read_unlock_bh(&neigh->lock);
+
+ neigh_release(neigh);
+ }
+
+ rd_len = min_t(unsigned int,
+ IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen,
+ skb->len + 8);
+ rd_len &= ~0x7;
+ optlen += rd_len;
+
+ buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
+ if (!buff)
+ goto release;
+
+ msg = (struct rd_msg *)skb_put(buff, sizeof(*msg));
+ *msg = (struct rd_msg) {
+ .icmph = {
+ .icmp6_type = NDISC_REDIRECT,
+ },
+ .target = *target,
+ .dest = ipv6_hdr(skb)->daddr,
+ };
+
+ /*
+ * include target_address option
+ */
+
+ if (ha)
+ ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha);
+
+ /*
+ * build redirect option and copy skb over to the new packet.
+ */
+
+ if (rd_len)
+ ndisc_fill_redirect_hdr_option(buff, skb, rd_len);
+
+ skb_dst_set(buff, dst);
+ ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf);
+ return;
+
+release:
+ dst_release(dst);
+}
+
+static void pndisc_redo(struct sk_buff *skb)
+{
+ ndisc_recv_ns(skb);
+ kfree_skb(skb);
+}
+
+static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev)
+ return true;
+ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
+ idev->cnf.suppress_frag_ndisc) {
+ net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
+ return true;
+ }
+ return false;
+}
+
+int ndisc_rcv(struct sk_buff *skb)
+{
+ struct nd_msg *msg;
+
+ if (ndisc_suppress_frag_ndisc(skb))
+ return 0;
+
+ if (skb_linearize(skb))
+ return 0;
+
+ msg = (struct nd_msg *)skb_transport_header(skb);
+
+ __skb_push(skb, skb->data - skb_transport_header(skb));
+
+ if (ipv6_hdr(skb)->hop_limit != 255) {
+ ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
+ ipv6_hdr(skb)->hop_limit);
+ return 0;
+ }
+
+ if (msg->icmph.icmp6_code != 0) {
+ ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
+ msg->icmph.icmp6_code);
+ return 0;
+ }
+
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+ switch (msg->icmph.icmp6_type) {
+ case NDISC_NEIGHBOUR_SOLICITATION:
+ ndisc_recv_ns(skb);
+ break;
+
+ case NDISC_NEIGHBOUR_ADVERTISEMENT:
+ ndisc_recv_na(skb);
+ break;
+
+ case NDISC_ROUTER_SOLICITATION:
+ ndisc_recv_rs(skb);
+ break;
+
+ case NDISC_ROUTER_ADVERTISEMENT:
+ ndisc_router_discovery(skb);
+ break;
+
+ case NDISC_REDIRECT:
+ ndisc_redirect_rcv(skb);
+ break;
+ }
+
+ return 0;
+}
+
+static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+
+ switch (event) {
+ case NETDEV_CHANGEADDR:
+ neigh_changeaddr(&nd_tbl, dev);
+ fib6_run_gc(0, net, false);
+ idev = in6_dev_get(dev);
+ if (!idev)
+ break;
+ if (idev->cnf.ndisc_notify)
+ ndisc_send_unsol_na(dev);
+ in6_dev_put(idev);
+ break;
+ case NETDEV_DOWN:
+ neigh_ifdown(&nd_tbl, dev);
+ fib6_run_gc(0, net, false);
+ break;
+ case NETDEV_NOTIFY_PEERS:
+ ndisc_send_unsol_na(dev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block ndisc_netdev_notifier = {
+ .notifier_call = ndisc_netdev_event,
+};
+
+#ifdef CONFIG_SYSCTL
+static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
+ const char *func, const char *dev_name)
+{
+ static char warncomm[TASK_COMM_LEN];
+ static int warned;
+ if (strcmp(warncomm, current->comm) && warned < 5) {
+ strcpy(warncomm, current->comm);
+ pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
+ warncomm, func,
+ dev_name, ctl->procname,
+ dev_name, ctl->procname);
+ warned++;
+ }
+}
+
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net_device *dev = ctl->extra1;
+ struct inet6_dev *idev;
+ int ret;
+
+ if ((strcmp(ctl->procname, "retrans_time") == 0) ||
+ (strcmp(ctl->procname, "base_reachable_time") == 0))
+ ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
+
+ if (strcmp(ctl->procname, "retrans_time") == 0)
+ ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ else if (strcmp(ctl->procname, "base_reachable_time") == 0)
+ ret = neigh_proc_dointvec_jiffies(ctl, write,
+ buffer, lenp, ppos);
+
+ else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
+ (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
+ ret = neigh_proc_dointvec_ms_jiffies(ctl, write,
+ buffer, lenp, ppos);
+ else
+ ret = -1;
+
+ if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
+ if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
+ idev->nd_parms->reachable_time =
+ neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
+ idev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+ in6_dev_put(idev);
+ }
+ return ret;
+}
+
+
+#endif
+
+static int __net_init ndisc_net_init(struct net *net)
+{
+ struct ipv6_pinfo *np;
+ struct sock *sk;
+ int err;
+
+ err = inet_ctl_sock_create(&sk, PF_INET6,
+ SOCK_RAW, IPPROTO_ICMPV6, net);
+ if (err < 0) {
+ ND_PRINTK(0, err,
+ "NDISC: Failed to initialize the control socket (err %d)\n",
+ err);
+ return err;
+ }
+
+ net->ipv6.ndisc_sk = sk;
+
+ np = inet6_sk(sk);
+ np->hop_limit = 255;
+ /* Do not loopback ndisc messages */
+ np->mc_loop = 0;
+
+ return 0;
+}
+
+static void __net_exit ndisc_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
+}
+
+static struct pernet_operations ndisc_net_ops = {
+ .init = ndisc_net_init,
+ .exit = ndisc_net_exit,
+};
+
+int __init ndisc_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&ndisc_net_ops);
+ if (err)
+ return err;
+ /*
+ * Initialize the neighbour table
+ */
+ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl);
+
+#ifdef CONFIG_SYSCTL
+ err = neigh_sysctl_register(NULL, &nd_tbl.parms,
+ ndisc_ifinfo_sysctl_change);
+ if (err)
+ goto out_unregister_pernet;
+out:
+#endif
+ return err;
+
+#ifdef CONFIG_SYSCTL
+out_unregister_pernet:
+ unregister_pernet_subsys(&ndisc_net_ops);
+ goto out;
+#endif
+}
+
+int __init ndisc_late_init(void)
+{
+ return register_netdevice_notifier(&ndisc_netdev_notifier);
+}
+
+void ndisc_late_cleanup(void)
+{
+ unregister_netdevice_notifier(&ndisc_netdev_notifier);
+}
+
+void ndisc_cleanup(void)
+{
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_unregister(&nd_tbl.parms);
+#endif
+ neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
+ unregister_pernet_subsys(&ndisc_net_ops);
+}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 000000000..d958718b5
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,219 @@
+/*
+ * IPv6 specific functions of netfilter core
+ *
+ * Rusty Russell (C) 2000 -- This code is GPL.
+ * Patrick McHardy (C) 2006-2012
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/export.h>
+#include <net/addrconf.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/xfrm.h>
+#include <net/ip6_checksum.h>
+#include <net/netfilter/nf_queue.h>
+
+int ip6_route_me_harder(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ unsigned int hh_len;
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+ .flowi6_mark = skb->mark,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ };
+ int err;
+
+ dst = ip6_route_output(net, skb->sk, &fl6);
+ err = dst->error;
+ if (err) {
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+ net_dbg_ratelimited("ip6_route_me_harder: No more route\n");
+ dst_release(dst);
+ return err;
+ }
+
+ /* Drop old route. */
+ skb_dst_drop(skb);
+
+ skb_dst_set(skb, dst);
+
+#ifdef CONFIG_XFRM
+ if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
+ skb_dst_set(skb, NULL);
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ skb_dst_set(skb, dst);
+ }
+#endif
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip6_rt_info {
+ struct in6_addr daddr;
+ struct in6_addr saddr;
+ u_int32_t mark;
+};
+
+static void nf_ip6_saveroute(const struct sk_buff *skb,
+ struct nf_queue_entry *entry)
+{
+ struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ rt_info->mark = skb->mark;
+ }
+}
+
+static int nf_ip6_reroute(struct sk_buff *skb,
+ const struct nf_queue_entry *entry)
+{
+ struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+ !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
+ skb->mark != rt_info->mark)
+ return ip6_route_me_harder(skb);
+ }
+ return 0;
+}
+
+static int nf_ip6_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict)
+{
+ static const struct ipv6_pinfo fake_pinfo;
+ static const struct inet_sock fake_sk = {
+ /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */
+ .sk.sk_bound_dev_if = 1,
+ .pinet6 = (struct ipv6_pinfo *) &fake_pinfo,
+ };
+ const void *sk = strict ? &fake_sk : NULL;
+ struct dst_entry *result;
+ int err;
+
+ result = ip6_route_output(net, sk, &fl->u.ip6);
+ err = result->error;
+ if (err)
+ dst_release(result);
+ else
+ *dst = result;
+ return err;
+}
+
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u_int8_t protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff, protocol,
+ csum_sub(skb->csum,
+ skb_checksum(skb, 0,
+ dataoff, 0)))) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = ~csum_unfold(
+ csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0,
+ skb_checksum(skb, 0,
+ dataoff, 0))));
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u_int8_t protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __wsum hsum;
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip6_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ hsum = skb_checksum(skb, 0, dataoff, 0);
+ skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+ &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0, hsum)));
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+};
+
+static const struct nf_ipv6_ops ipv6ops = {
+ .chk_addr = ipv6_chk_addr,
+};
+
+static const struct nf_afinfo nf_ip6_afinfo = {
+ .family = AF_INET6,
+ .checksum = nf_ip6_checksum,
+ .checksum_partial = nf_ip6_checksum_partial,
+ .route = nf_ip6_route,
+ .saveroute = nf_ip6_saveroute,
+ .reroute = nf_ip6_reroute,
+ .route_key_size = sizeof(struct ip6_rt_info),
+};
+
+int __init ipv6_netfilter_init(void)
+{
+ RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops);
+ return nf_register_afinfo(&nf_ip6_afinfo);
+}
+
+/* This can be called from inet6_init() on errors, so it cannot
+ * be marked __exit. -DaveM
+ */
+void ipv6_netfilter_fini(void)
+{
+ RCU_INIT_POINTER(nf_ipv6_ops, NULL);
+ nf_unregister_afinfo(&nf_ip6_afinfo);
+}
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
new file mode 100644
index 000000000..ca6998345
--- /dev/null
+++ b/net/ipv6/netfilter/Kconfig
@@ -0,0 +1,323 @@
+#
+# IP netfilter configuration
+#
+
+menu "IPv6: Netfilter Configuration"
+ depends on INET && IPV6 && NETFILTER
+
+config NF_DEFRAG_IPV6
+ tristate
+ default n
+
+config NF_CONNTRACK_IPV6
+ tristate "IPv6 connection tracking support"
+ depends on INET && IPV6 && NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ select NF_DEFRAG_IPV6
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv6 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if NF_TABLES
+
+config NF_TABLES_IPV6
+ tristate "IPv6 nf_tables support"
+ help
+ This option enables the IPv6 support for nf_tables.
+
+if NF_TABLES_IPV6
+
+config NFT_CHAIN_ROUTE_IPV6
+ tristate "IPv6 nf_tables route chain support"
+ help
+ This option enables the "route" chain for IPv6 in nf_tables. This
+ chain type is used to force packet re-routing after mangling header
+ fields such as the source, destination, flowlabel, hop-limit and
+ the packet mark.
+
+config NFT_REJECT_IPV6
+ select NF_REJECT_IPV6
+ default NFT_REJECT
+ tristate
+
+endif # NF_TABLES_IPV6
+endif # NF_TABLES
+
+config NF_REJECT_IPV6
+ tristate "IPv6 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
+config NF_LOG_IPV6
+ tristate "IPv6 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_NAT_IPV6
+ tristate "IPv6 NAT"
+ depends on NF_CONNTRACK_IPV6
+ depends on NETFILTER_ADVANCED
+ select NF_NAT
+ help
+ The IPv6 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
+if NF_NAT_IPV6
+
+config NFT_CHAIN_NAT_IPV6
+ depends on NF_TABLES_IPV6
+ tristate "IPv6 nf_tables nat chain support"
+ help
+ This option enables the "nat" chain for IPv6 in nf_tables. This
+ chain type is used to perform Network Address Translation (NAT)
+ packet transformations such as the source, destination address and
+ source and destination ports.
+
+config NF_NAT_MASQUERADE_IPV6
+ tristate "IPv6 masquerade support"
+ help
+ This is the kernel functionality to provide NAT in the masquerade
+ flavour (automatic source address selection) for IPv6.
+
+config NFT_MASQ_IPV6
+ tristate "IPv6 masquerade support for nf_tables"
+ depends on NF_TABLES_IPV6
+ depends on NFT_MASQ
+ select NF_NAT_MASQUERADE_IPV6
+ help
+ This is the expression that provides IPv4 masquerading support for
+ nf_tables.
+
+config NFT_REDIR_IPV6
+ tristate "IPv6 redirect support for nf_tables"
+ depends on NF_TABLES_IPV6
+ depends on NFT_REDIR
+ select NF_NAT_REDIRECT
+ help
+ This is the expression that provides IPv4 redirect support for
+ nf_tables.
+
+endif # NF_NAT_IPV6
+
+config IP6_NF_IPTABLES
+ tristate "IP6 tables support (required for filtering)"
+ depends on INET && IPV6
+ select NETFILTER_XTABLES
+ default m if NETFILTER_ADVANCED=n
+ help
+ ip6tables is a general, extensible packet identification framework.
+ Currently only the packet filtering and packet mangling subsystem
+ for IPv6 use this, but connection tracking is going to follow.
+ Say 'Y' or 'M' here if you want to use either of those.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP6_NF_IPTABLES
+
+# The simple matches.
+config IP6_NF_MATCH_AH
+ tristate '"ah" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This module allows one to match AH packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_EUI64
+ tristate '"eui64" address check'
+ depends on NETFILTER_ADVANCED
+ help
+ This module performs checking on the IPv6 source address
+ Compares the last 64 bits with the EUI64 (delivered
+ from the MAC address) address
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_FRAG
+ tristate '"frag" Fragmentation header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ frag matching allows you to match packets based on the fragmentation
+ header of the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_OPTS
+ tristate '"hbh" hop-by-hop and "dst" opts header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This allows one to match packets based on the hop-by-hop
+ and destination options headers of a packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_HL
+ tristate '"hl" hoplimit match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MATCH_HL
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
+
+config IP6_NF_MATCH_IPV6HEADER
+ tristate '"ipv6header" IPv6 Extension Headers Match'
+ default m if NETFILTER_ADVANCED=n
+ help
+ This module allows one to match packets based upon
+ the ipv6 extension headers.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_MH
+ tristate '"mh" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This module allows one to match MH packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MATCH_RPFILTER
+ tristate '"rpfilter" reverse path filter match support'
+ depends on NETFILTER_ADVANCED && (IP6_NF_MANGLE || IP6_NF_RAW)
+ ---help---
+ This option allows you to match packets whose replies would
+ go out via the interface the packet came in.
+
+ To compile it as a module, choose M here. If unsure, say N.
+ The module will be called ip6t_rpfilter.
+
+config IP6_NF_MATCH_RT
+ tristate '"rt" Routing header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ rt matching allows you to match packets based on the routing
+ header of the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# The targets
+config IP6_NF_TARGET_HL
+ tristate '"HL" hoplimit target support'
+ depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
+ select NETFILTER_XT_TARGET_HL
+ ---help---
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
+
+config IP6_NF_FILTER
+ tristate "Packet filtering"
+ default m if NETFILTER_ADVANCED=n
+ help
+ Packet filtering defines a table `filter', which has a series of
+ rules for simple packet filtering at local input, forwarding and
+ local output. See the man page for iptables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP6_NF_FILTER
+ select NF_REJECT_IPV6
+ default m if NETFILTER_ADVANCED=n
+ help
+ The REJECT target allows a filtering rule to specify that an ICMPv6
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_TARGET_SYNPROXY
+ tristate "SYNPROXY target support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY target allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_MANGLE
+ tristate "Packet mangling"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `mangle' table to iptables: see the man page for
+ iptables(8). This table is used for various packet alterations
+ which can effect how the packet is routed.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_RAW
+ tristate 'raw table support (required for TRACE)'
+ help
+ This option adds a `raw' table to ip6tables. This table is the very
+ first in the netfilter framework and hooks in at the PREROUTING
+ and OUTPUT chains.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+# security table for MAC policy
+config IP6_NF_SECURITY
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
+
+config IP6_NF_NAT
+ tristate "ip6tables NAT support"
+ depends on NF_CONNTRACK_IPV6
+ depends on NETFILTER_ADVANCED
+ select NF_NAT
+ select NF_NAT_IPV6
+ select NETFILTER_XT_NAT
+ help
+ This enables the `nat' table in ip6tables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP6_NF_NAT
+
+config IP6_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ select NF_NAT_MASQUERADE_IPV6
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP6_NF_TARGET_NPT
+ tristate "NPT (Network Prefix translation) target support"
+ help
+ This option adds the `SNPT' and `DNPT' target, which perform
+ stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP6_NF_NAT
+
+endif # IP6_NF_IPTABLES
+
+endmenu
+
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
new file mode 100644
index 000000000..c36e0a549
--- /dev/null
+++ b/net/ipv6/netfilter/Makefile
@@ -0,0 +1,55 @@
+#
+# Makefile for the netfilter modules on top of IPv6.
+#
+
+# Link order matters here.
+obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
+obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
+obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
+obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
+obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
+obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
+
+# l3 independent conntrack
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
+
+nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
+obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
+
+# defrag
+nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
+obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
+
+# logging
+obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
+
+# reject
+obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
+
+# nf_tables
+obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
+obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
+obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
+obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
+
+# matches
+obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
+obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
+obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
+obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o
+obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
+obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o
+obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
+
+# targets
+obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
+obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
new file mode 100644
index 000000000..62f5b0d0b
--- /dev/null
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -0,0 +1,2296 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/in.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv6 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) pr_info(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) pr_info(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) WARN_ON(!(x))
+#else
+#define IP_NF_ASSERT(x)
+#endif
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+void *ip6t_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(ip6t, IP6T);
+}
+EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
+
+/*
+ We keep a set of rules for each CPU, so we can avoid write-locking
+ them in the softirq when updating the counters and therefore
+ only need to read-lock in the softirq; doing a write_lock_bh() in user
+ context stops packets coming through and allows user context to read
+ the counters or update the rules.
+
+ Hence the start of any table is given by get_table() below. */
+
+/* Returns whether matches rule or not. */
+/* Performance critical - called for every packet */
+static inline bool
+ip6_packet_match(const struct sk_buff *skb,
+ const char *indev,
+ const char *outdev,
+ const struct ip6t_ip6 *ip6info,
+ unsigned int *protoff,
+ int *fragoff, bool *hotdrop)
+{
+ unsigned long ret;
+ const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
+
+#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg)))
+
+ if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
+ &ip6info->src), IP6T_INV_SRCIP) ||
+ FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
+ &ip6info->dst), IP6T_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+/*
+ dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
+ ipinfo->smsk.s_addr, ipinfo->src.s_addr,
+ ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
+ ipinfo->dmsk.s_addr, ipinfo->dst.s_addr,
+ ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/
+ return false;
+ }
+
+ ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask);
+
+ if (FWINV(ret != 0, IP6T_INV_VIA_IN)) {
+ dprintf("VIA in mismatch (%s vs %s).%s\n",
+ indev, ip6info->iniface,
+ ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":"");
+ return false;
+ }
+
+ ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask);
+
+ if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) {
+ dprintf("VIA out mismatch (%s vs %s).%s\n",
+ outdev, ip6info->outiface,
+ ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":"");
+ return false;
+ }
+
+/* ... might want to do something with class and flowlabel here ... */
+
+ /* look for the desired protocol header */
+ if((ip6info->flags & IP6T_F_PROTO)) {
+ int protohdr;
+ unsigned short _frag_off;
+
+ protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL);
+ if (protohdr < 0) {
+ if (_frag_off == 0)
+ *hotdrop = true;
+ return false;
+ }
+ *fragoff = _frag_off;
+
+ dprintf("Packet protocol %hi ?= %s%hi.\n",
+ protohdr,
+ ip6info->invflags & IP6T_INV_PROTO ? "!":"",
+ ip6info->proto);
+
+ if (ip6info->proto == protohdr) {
+ if(ip6info->invflags & IP6T_INV_PROTO) {
+ return false;
+ }
+ return true;
+ }
+
+ /* We need match for the '-p all', too! */
+ if ((ip6info->proto != 0) &&
+ !(ip6info->invflags & IP6T_INV_PROTO))
+ return false;
+ }
+ return true;
+}
+
+/* should be ip6 safe */
+static bool
+ip6_checkentry(const struct ip6t_ip6 *ipv6)
+{
+ if (ipv6->flags & ~IP6T_F_MASK) {
+ duprintf("Unknown flag bits set: %08X\n",
+ ipv6->flags & ~IP6T_F_MASK);
+ return false;
+ }
+ if (ipv6->invflags & ~IP6T_INV_MASK) {
+ duprintf("Unknown invflag bits set: %08X\n",
+ ipv6->invflags & ~IP6T_INV_MASK);
+ return false;
+ }
+ return true;
+}
+
+static unsigned int
+ip6t_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);
+
+ return NF_DROP;
+}
+
+static inline struct ip6t_entry *
+get_entry(const void *base, unsigned int offset)
+{
+ return (struct ip6t_entry *)(base + offset);
+}
+
+/* All zeroes == unconditional rule. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ip6t_ip6 *ipv6)
+{
+ static const struct ip6t_ip6 uncond;
+
+ return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
+}
+
+static inline const struct xt_entry_target *
+ip6t_get_target_c(const struct ip6t_entry *e)
+{
+ return ip6t_get_target((struct ip6t_entry *)e);
+}
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+/* This cries for unification! */
+static const char *const hooknames[] = {
+ [NF_INET_PRE_ROUTING] = "PREROUTING",
+ [NF_INET_LOCAL_IN] = "INPUT",
+ [NF_INET_FORWARD] = "FORWARD",
+ [NF_INET_LOCAL_OUT] = "OUTPUT",
+ [NF_INET_POST_ROUTING] = "POSTROUTING",
+};
+
+enum nf_ip_trace_comments {
+ NF_IP6_TRACE_COMMENT_RULE,
+ NF_IP6_TRACE_COMMENT_RETURN,
+ NF_IP6_TRACE_COMMENT_POLICY,
+};
+
+static const char *const comments[] = {
+ [NF_IP6_TRACE_COMMENT_RULE] = "rule",
+ [NF_IP6_TRACE_COMMENT_RETURN] = "return",
+ [NF_IP6_TRACE_COMMENT_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_WARNING,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
+ const char *hookname, const char **chainname,
+ const char **comment, unsigned int *rulenum)
+{
+ const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
+
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+ /* Head of user chain: ERROR target with chainname */
+ *chainname = t->target.data;
+ (*rulenum) = 0;
+ } else if (s == e) {
+ (*rulenum)++;
+
+ if (s->target_offset == sizeof(struct ip6t_entry) &&
+ strcmp(t->target.u.kernel.target->name,
+ XT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0 &&
+ unconditional(&s->ipv6)) {
+ /* Tail of chains: STANDARD target (return/policy) */
+ *comment = *chainname == hookname
+ ? comments[NF_IP6_TRACE_COMMENT_POLICY]
+ : comments[NF_IP6_TRACE_COMMENT_RETURN];
+ }
+ return 1;
+ } else
+ (*rulenum)++;
+
+ return 0;
+}
+
+static void trace_packet(const struct sk_buff *skb,
+ unsigned int hook,
+ const struct net_device *in,
+ const struct net_device *out,
+ const char *tablename,
+ const struct xt_table_info *private,
+ const struct ip6t_entry *e)
+{
+ const void *table_base;
+ const struct ip6t_entry *root;
+ const char *hookname, *chainname, *comment;
+ const struct ip6t_entry *iter;
+ unsigned int rulenum = 0;
+ struct net *net = dev_net(in ? in : out);
+
+ table_base = private->entries[smp_processor_id()];
+ root = get_entry(table_base, private->hook_entry[hook]);
+
+ hookname = chainname = hooknames[hook];
+ comment = comments[NF_IP6_TRACE_COMMENT_RULE];
+
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
+
+ nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
+ "TRACE: %s:%s:%s:%u ",
+ tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline __pure struct ip6t_entry *
+ip6t_next_entry(const struct ip6t_entry *entry)
+{
+ return (void *)entry + entry->next_offset;
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+unsigned int
+ip6t_do_table(struct sk_buff *skb,
+ unsigned int hook,
+ const struct nf_hook_state *state,
+ struct xt_table *table)
+{
+ static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+ /* Initializing verdict to NF_DROP keeps gcc happy. */
+ unsigned int verdict = NF_DROP;
+ const char *indev, *outdev;
+ const void *table_base;
+ struct ip6t_entry *e, **jumpstack;
+ unsigned int *stackptr, origptr, cpu;
+ const struct xt_table_info *private;
+ struct xt_action_param acpar;
+ unsigned int addend;
+
+ /* Initialization */
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know, ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+ acpar.hotdrop = false;
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.family = NFPROTO_IPV6;
+ acpar.hooknum = hook;
+
+ IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+
+ local_bh_disable();
+ addend = xt_write_recseq_begin();
+ private = table->private;
+ /*
+ * Ensure we load private-> members after we've fetched the base
+ * pointer.
+ */
+ smp_read_barrier_depends();
+ cpu = smp_processor_id();
+ table_base = private->entries[cpu];
+ jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
+ stackptr = per_cpu_ptr(private->stackptr, cpu);
+ origptr = *stackptr;
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+
+ do {
+ const struct xt_entry_target *t;
+ const struct xt_entry_match *ematch;
+
+ IP_NF_ASSERT(e);
+ acpar.thoff = 0;
+ if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
+ &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
+ no_match:
+ e = ip6t_next_entry(e);
+ continue;
+ }
+
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
+ goto no_match;
+ }
+
+ ADD_COUNTER(e->counters, skb->len, 1);
+
+ t = ip6t_get_target_c(e);
+ IP_NF_ASSERT(t->u.kernel.target);
+
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
+ /* The packet is traced: log it */
+ if (unlikely(skb->nf_trace))
+ trace_packet(skb, hook, state->in, state->out,
+ table->name, private, e);
+#endif
+ /* Standard target? */
+ if (!t->u.kernel.target->target) {
+ int v;
+
+ v = ((struct xt_standard_target *)t)->verdict;
+ if (v < 0) {
+ /* Pop from stack? */
+ if (v != XT_RETURN) {
+ verdict = (unsigned int)(-v) - 1;
+ break;
+ }
+ if (*stackptr <= origptr)
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ else
+ e = ip6t_next_entry(jumpstack[--*stackptr]);
+ continue;
+ }
+ if (table_base + v != ip6t_next_entry(e) &&
+ !(e->ipv6.flags & IP6T_F_GOTO)) {
+ if (*stackptr >= private->stacksize) {
+ verdict = NF_DROP;
+ break;
+ }
+ jumpstack[(*stackptr)++] = e;
+ }
+
+ e = get_entry(table_base, v);
+ continue;
+ }
+
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+
+ verdict = t->u.kernel.target->target(skb, &acpar);
+ if (verdict == XT_CONTINUE)
+ e = ip6t_next_entry(e);
+ else
+ /* Verdict */
+ break;
+ } while (!acpar.hotdrop);
+
+ *stackptr = origptr;
+
+ xt_write_recseq_end(addend);
+ local_bh_enable();
+
+#ifdef DEBUG_ALLOW_ALL
+ return NF_ACCEPT;
+#else
+ if (acpar.hotdrop)
+ return NF_DROP;
+ else return verdict;
+#endif
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ there are loops. Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
+{
+ unsigned int hook;
+
+ /* No recursion; use packet counter to save back ptrs (reset
+ to 0 as we leave), and comefrom to save source hook bitmask */
+ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+ unsigned int pos = newinfo->hook_entry[hook];
+ struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos);
+
+ if (!(valid_hooks & (1 << hook)))
+ continue;
+
+ /* Set initial back pointer. */
+ e->counters.pcnt = pos;
+
+ for (;;) {
+ const struct xt_standard_target *t
+ = (void *)ip6t_get_target_c(e);
+ int visited = e->comefrom & (1 << hook);
+
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+ pr_err("iptables: loop hook %u pos %u %08X.\n",
+ hook, pos, e->comefrom);
+ return 0;
+ }
+ e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+ /* Unconditional return/END. */
+ if ((e->target_offset == sizeof(struct ip6t_entry) &&
+ (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 &&
+ unconditional(&e->ipv6)) || visited) {
+ unsigned int oldpos, size;
+
+ if ((strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0) &&
+ t->verdict < -NF_MAX_VERDICT - 1) {
+ duprintf("mark_source_chains: bad "
+ "negative verdict (%i)\n",
+ t->verdict);
+ return 0;
+ }
+
+ /* Return: backtrack through the last
+ big jump. */
+ do {
+ e->comefrom ^= (1<<NF_INET_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+ if (e->comefrom
+ & (1 << NF_INET_NUMHOOKS)) {
+ duprintf("Back unset "
+ "on hook %u "
+ "rule %u\n",
+ hook, pos);
+ }
+#endif
+ oldpos = pos;
+ pos = e->counters.pcnt;
+ e->counters.pcnt = 0;
+
+ /* We're at the start. */
+ if (pos == oldpos)
+ goto next;
+
+ e = (struct ip6t_entry *)
+ (entry0 + pos);
+ } while (oldpos == pos + e->next_offset);
+
+ /* Move along one */
+ size = e->next_offset;
+ e = (struct ip6t_entry *)
+ (entry0 + pos + size);
+ e->counters.pcnt = pos;
+ pos += size;
+ } else {
+ int newpos = t->verdict;
+
+ if (strcmp(t->target.u.user.name,
+ XT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
+ if (newpos > newinfo->size -
+ sizeof(struct ip6t_entry)) {
+ duprintf("mark_source_chains: "
+ "bad verdict (%i)\n",
+ newpos);
+ return 0;
+ }
+ /* This a jump; chase it. */
+ duprintf("Jump rule %u -> %u\n",
+ pos, newpos);
+ } else {
+ /* ... this is a fallthru */
+ newpos = pos + e->next_offset;
+ }
+ e = (struct ip6t_entry *)
+ (entry0 + newpos);
+ e->counters.pcnt = pos;
+ pos = newpos;
+ }
+ }
+ next:
+ duprintf("Finished chain %u\n", hook);
+ }
+ return 1;
+}
+
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+ struct xt_mtdtor_param par;
+
+ par.net = net;
+ par.match = m->u.kernel.match;
+ par.matchinfo = m->data;
+ par.family = NFPROTO_IPV6;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+ module_put(par.match->me);
+}
+
+static int
+check_entry(const struct ip6t_entry *e, const char *name)
+{
+ const struct xt_entry_target *t;
+
+ if (!ip6_checkentry(&e->ipv6)) {
+ duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+ return -EINVAL;
+ }
+
+ if (e->target_offset + sizeof(struct xt_entry_target) >
+ e->next_offset)
+ return -EINVAL;
+
+ t = ip6t_get_target_c(e);
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ const struct ip6t_ip6 *ipv6 = par->entryinfo;
+ int ret;
+
+ par->match = m->u.kernel.match;
+ par->matchinfo = m->data;
+
+ ret = xt_check_match(par, m->u.match_size - sizeof(*m),
+ ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
+ if (ret < 0) {
+ duprintf("ip_tables: check failed for `%s'.\n",
+ par.match->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+ struct xt_match *match;
+ int ret;
+
+ match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+
+ ret = check_match(m, par);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ module_put(m->u.kernel.match->me);
+ return ret;
+}
+
+static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
+{
+ struct xt_entry_target *t = ip6t_get_target(e);
+ struct xt_tgchk_param par = {
+ .net = net,
+ .table = name,
+ .entryinfo = e,
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ .hook_mask = e->comefrom,
+ .family = NFPROTO_IPV6,
+ };
+ int ret;
+
+ t = ip6t_get_target(e);
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO);
+ if (ret < 0) {
+ duprintf("ip_tables: check failed for `%s'.\n",
+ t->u.kernel.target->name);
+ return ret;
+ }
+ return 0;
+}
+
+static int
+find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
+ unsigned int size)
+{
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ int ret;
+ unsigned int j;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
+
+ ret = check_entry(e, name);
+ if (ret)
+ return ret;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ipv6;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV6;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ t = ip6t_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+ ret = PTR_ERR(target);
+ goto cleanup_matches;
+ }
+ t->u.kernel.target = target;
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto err;
+ return 0;
+ err:
+ module_put(t->u.kernel.target->me);
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static bool check_underflow(const struct ip6t_entry *e)
+{
+ const struct xt_entry_target *t;
+ unsigned int verdict;
+
+ if (!unconditional(&e->ipv6))
+ return false;
+ t = ip6t_get_target_c(e);
+ if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+ return false;
+ verdict = ((struct xt_standard_target *)t)->verdict;
+ verdict = -verdict - 1;
+ return verdict == NF_DROP || verdict == NF_ACCEPT;
+}
+
+static int
+check_entry_size_and_hooks(struct ip6t_entry *e,
+ struct xt_table_info *newinfo,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ unsigned int valid_hooks)
+{
+ unsigned int h;
+
+ if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) {
+ duprintf("Bad offset %p\n", e);
+ return -EINVAL;
+ }
+
+ if (e->next_offset
+ < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if (!(valid_hooks & (1 << h)))
+ continue;
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h]) {
+ if (!check_underflow(e)) {
+ pr_err("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
+ return -EINVAL;
+ }
+ newinfo->underflow[h] = underflows[h];
+ }
+ }
+
+ /* Clear counters and comefrom */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ e->comefrom = 0;
+ return 0;
+}
+
+static void cleanup_entry(struct ip6t_entry *e, struct net *net)
+{
+ struct xt_tgdtor_param par;
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
+ t = ip6t_get_target(e);
+
+ par.net = net;
+ par.target = t->u.kernel.target;
+ par.targinfo = t->data;
+ par.family = NFPROTO_IPV6;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ newinfo) */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ip6t_replace *repl)
+{
+ struct ip6t_entry *iter;
+ unsigned int i;
+ int ret = 0;
+
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = 0xFFFFFFFF;
+ newinfo->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_table: size %u\n", newinfo->size);
+ i = 0;
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ return ret;
+ ++i;
+ if (strcmp(ip6t_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+
+ if (i != repl->num_entries) {
+ duprintf("translate_table: %u not %u entries\n",
+ i, repl->num_entries);
+ return -EINVAL;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(repl->valid_hooks & (1 << i)))
+ continue;
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, repl->hook_entry[i]);
+ return -EINVAL;
+ }
+ if (newinfo->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, repl->underflow[i]);
+ return -EINVAL;
+ }
+ }
+
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
+ return -ELOOP;
+
+ /* Finally, each sanity check must pass */
+ i = 0;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
+
+ if (ret != 0) {
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i) {
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
+ }
+
+ return ret;
+}
+
+static void
+get_counters(const struct xt_table_info *t,
+ struct xt_counters counters[])
+{
+ struct ip6t_entry *iter;
+ unsigned int cpu;
+ unsigned int i;
+
+ for_each_possible_cpu(cpu) {
+ seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+ i = 0;
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqcount_begin(s);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqcount_retry(s, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
+ ++i;
+ }
+ }
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+ unsigned int countersize;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ (other than comefrom, which userspace doesn't care
+ about). */
+ countersize = sizeof(struct xt_counters) * private->number;
+ counters = vzalloc(countersize);
+
+ if (counters == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ get_counters(private, counters);
+
+ return counters;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+ const struct xt_table *table,
+ void __user *userptr)
+{
+ unsigned int off, num;
+ const struct ip6t_entry *e;
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ int ret = 0;
+ const void *loc_cpu_entry;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ /* FIXME: use iterator macros --RR */
+ /* ... then go back and fix counters and names */
+ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+ unsigned int i;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
+
+ e = (struct ip6t_entry *)(loc_cpu_entry + off);
+ if (copy_to_user(userptr + off
+ + offsetof(struct ip6t_entry, counters),
+ &counters[num],
+ sizeof(counters[num])) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
+ for (i = sizeof(struct ip6t_entry);
+ i < e->target_offset;
+ i += m->u.match_size) {
+ m = (void *)e + i;
+
+ if (copy_to_user(userptr + off + i
+ + offsetof(struct xt_entry_match,
+ u.user.name),
+ m->u.kernel.match->name,
+ strlen(m->u.kernel.match->name)+1)
+ != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ t = ip6t_get_target_c(e);
+ if (copy_to_user(userptr + off + e->target_offset
+ + offsetof(struct xt_entry_target,
+ u.user.name),
+ t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name)+1) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+ }
+
+ free_counters:
+ vfree(counters);
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+ int v = *(compat_int_t *)src;
+
+ if (v > 0)
+ v += xt_compat_calc_jump(AF_INET6, v);
+ memcpy(dst, &v, sizeof(v));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+ compat_int_t cv = *(int *)src;
+
+ if (cv > 0)
+ cv -= xt_compat_calc_jump(AF_INET6, cv);
+ return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ip6t_entry *e,
+ const struct xt_table_info *info,
+ const void *base, struct xt_table_info *newinfo)
+{
+ const struct xt_entry_match *ematch;
+ const struct xt_entry_target *t;
+ unsigned int entry_offset;
+ int off, i, ret;
+
+ off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+ entry_offset = (void *)e - base;
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ip6t_get_target_c(e);
+ off += xt_compat_target_offset(t->u.kernel.target);
+ newinfo->size -= off;
+ ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ if (info->hook_entry[i] &&
+ (e < (struct ip6t_entry *)(base + info->hook_entry[i])))
+ newinfo->hook_entry[i] -= off;
+ if (info->underflow[i] &&
+ (e < (struct ip6t_entry *)(base + info->underflow[i])))
+ newinfo->underflow[i] -= off;
+ }
+ return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+ struct xt_table_info *newinfo)
+{
+ struct ip6t_entry *iter;
+ void *loc_cpu_entry;
+ int ret;
+
+ if (!newinfo || !info)
+ return -EINVAL;
+
+ /* we dont care about newinfo->entries[] */
+ memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+ newinfo->initial_entries = 0;
+ loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET6, info->number);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
+{
+ char name[XT_TABLE_MAXNAMELEN];
+ struct xt_table *t;
+ int ret;
+
+ if (*len != sizeof(struct ip6t_getinfo)) {
+ duprintf("length %u != %zu\n", *len,
+ sizeof(struct ip6t_getinfo));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(name, user, sizeof(name)) != 0)
+ return -EFAULT;
+
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_lock(AF_INET6);
+#endif
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+ "ip6table_%s", name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct ip6t_getinfo info;
+ const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
+ if (compat) {
+ ret = compat_table_info(private, &tmp);
+ xt_compat_flush_offsets(AF_INET6);
+ private = &tmp;
+ }
+#endif
+ memset(&info, 0, sizeof(info));
+ info.valid_hooks = t->valid_hooks;
+ memcpy(info.hook_entry, private->hook_entry,
+ sizeof(info.hook_entry));
+ memcpy(info.underflow, private->underflow,
+ sizeof(info.underflow));
+ info.num_entries = private->number;
+ info.size = private->size;
+ strcpy(info.name, name);
+
+ if (copy_to_user(user, &info, *len) != 0)
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ xt_table_unlock(t);
+ module_put(t->me);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+ if (compat)
+ xt_compat_unlock(AF_INET6);
+#endif
+ return ret;
+}
+
+static int
+get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
+ const int *len)
+{
+ int ret;
+ struct ip6t_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+ if (*len != sizeof(struct ip6t_get_entries) + get.size) {
+ duprintf("get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ t = xt_find_table_lock(net, AF_INET6, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ struct xt_table_info *private = t->private;
+ duprintf("t->private->number = %u\n", private->number);
+ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ return ret;
+}
+
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+ struct xt_table_info *newinfo, unsigned int num_counters,
+ void __user *counters_ptr)
+{
+ int ret;
+ struct xt_table *t;
+ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+ const void *loc_cpu_old_entry;
+ struct ip6t_entry *iter;
+
+ ret = 0;
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+ "ip6table_%s", name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+ /* Update module usage count based on number of rules */
+ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+ oldinfo->number, oldinfo->initial_entries, newinfo->number);
+ if ((oldinfo->number > oldinfo->initial_entries) ||
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+ if ((oldinfo->number > oldinfo->initial_entries) &&
+ (newinfo->number <= oldinfo->initial_entries))
+ module_put(t->me);
+
+ /* Get the old counters, and synchronize with replace */
+ get_counters(oldinfo, counters);
+
+ /* Decrease module usage counts and free resource */
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter, net);
+
+ xt_free_table_info(oldinfo);
+ if (copy_to_user(counters_ptr, counters,
+ sizeof(struct xt_counters) * num_counters) != 0) {
+ /* Silent error, can't fail, new table is already in place */
+ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
+ }
+ vfree(counters);
+ xt_table_unlock(t);
+ return ret;
+
+ put_module:
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+ vfree(counters);
+ out:
+ return ret;
+}
+
+static int
+do_replace(struct net *net, const void __user *user, unsigned int len)
+{
+ int ret;
+ struct ip6t_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ip6t_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("ip_tables: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, tmp.counters);
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+do_add_counters(struct net *net, const void __user *user, unsigned int len,
+ int compat)
+{
+ unsigned int i, curcpu;
+ struct xt_counters_info tmp;
+ struct xt_counters *paddc;
+ unsigned int num_counters;
+ char *name;
+ int size;
+ void *ptmp;
+ struct xt_table *t;
+ const struct xt_table_info *private;
+ int ret = 0;
+ const void *loc_cpu_entry;
+ struct ip6t_entry *iter;
+ unsigned int addend;
+#ifdef CONFIG_COMPAT
+ struct compat_xt_counters_info compat_tmp;
+
+ if (compat) {
+ ptmp = &compat_tmp;
+ size = sizeof(struct compat_xt_counters_info);
+ } else
+#endif
+ {
+ ptmp = &tmp;
+ size = sizeof(struct xt_counters_info);
+ }
+
+ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ num_counters = compat_tmp.num_counters;
+ name = compat_tmp.name;
+ } else
+#endif
+ {
+ num_counters = tmp.num_counters;
+ name = tmp.name;
+ }
+
+ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+ paddc = vmalloc(len - size);
+ if (!paddc)
+ return -ENOMEM;
+
+ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+ t = xt_find_table_lock(net, AF_INET6, name);
+ if (IS_ERR_OR_NULL(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+ }
+
+
+ local_bh_disable();
+ private = t->private;
+ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+
+ i = 0;
+ /* Choose the copy that is on our node */
+ curcpu = smp_processor_id();
+ addend = xt_write_recseq_begin();
+ loc_cpu_entry = private->entries[curcpu];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
+ xt_write_recseq_end(addend);
+
+ unlock_up_free:
+ local_bh_enable();
+ xt_table_unlock(t);
+ module_put(t->me);
+ free:
+ vfree(paddc);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ip6t_replace {
+ char name[XT_TABLE_MAXNAMELEN];
+ u32 valid_hooks;
+ u32 num_entries;
+ u32 size;
+ u32 hook_entry[NF_INET_NUMHOOKS];
+ u32 underflow[NF_INET_NUMHOOKS];
+ u32 num_counters;
+ compat_uptr_t counters; /* struct xt_counters * */
+ struct compat_ip6t_entry entries[0];
+};
+
+static int
+compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
+ unsigned int *size, struct xt_counters *counters,
+ unsigned int i)
+{
+ struct xt_entry_target *t;
+ struct compat_ip6t_entry __user *ce;
+ u_int16_t target_offset, next_offset;
+ compat_uint_t origsize;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
+
+ origsize = *size;
+ ce = (struct compat_ip6t_entry __user *)*dstptr;
+ if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
+
+ *dstptr += sizeof(struct compat_ip6t_entry);
+ *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ target_offset = e->target_offset - (origsize - *size);
+ t = ip6t_get_target(e);
+ ret = xt_compat_target_to_user(t, dstptr, size);
+ if (ret)
+ return ret;
+ next_offset = e->next_offset - (origsize - *size);
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+ const char *name,
+ const struct ip6t_ip6 *ipv6,
+ unsigned int hookmask,
+ int *size)
+{
+ struct xt_match *match;
+
+ match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
+ duprintf("compat_check_calc_match: `%s' not found\n",
+ m->u.user.name);
+ return PTR_ERR(match);
+ }
+ m->u.kernel.match = match;
+ *size += xt_compat_match_offset(match);
+ return 0;
+}
+
+static void compat_release_entry(struct compat_ip6t_entry *e)
+{
+ struct xt_entry_target *t;
+ struct xt_entry_match *ematch;
+
+ /* Cleanup all matches */
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
+ t = compat_ip6t_get_target(e);
+ module_put(t->u.kernel.target->me);
+}
+
+static int
+check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
+ struct xt_table_info *newinfo,
+ unsigned int *size,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
+ const char *name)
+{
+ struct xt_entry_match *ematch;
+ struct xt_entry_target *t;
+ struct xt_target *target;
+ unsigned int entry_offset;
+ unsigned int j;
+ int ret, off, h;
+
+ duprintf("check_compat_entry_size_and_hooks %p\n", e);
+ if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) {
+ duprintf("Bad offset %p, limit = %p\n", e, limit);
+ return -EINVAL;
+ }
+
+ if (e->next_offset < sizeof(struct compat_ip6t_entry) +
+ sizeof(struct compat_xt_entry_target)) {
+ duprintf("checking: element %p size %u\n",
+ e, e->next_offset);
+ return -EINVAL;
+ }
+
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct ip6t_entry *)e, name);
+ if (ret)
+ return ret;
+
+ off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+ entry_offset = (void *)e - (void *)base;
+ j = 0;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, name,
+ &e->ipv6, e->comefrom, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
+
+ t = compat_ip6t_get_target(e);
+ target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
+ duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+ t->u.user.name);
+ ret = PTR_ERR(target);
+ goto release_matches;
+ }
+ t->u.kernel.target = target;
+
+ off += xt_compat_target_offset(target);
+ *size += off;
+ ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+ if (ret)
+ goto out;
+
+ /* Check hooks & underflows */
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)e - base == hook_entries[h])
+ newinfo->hook_entry[h] = hook_entries[h];
+ if ((unsigned char *)e - base == underflows[h])
+ newinfo->underflow[h] = underflows[h];
+ }
+
+ /* Clear counters and comefrom */
+ memset(&e->counters, 0, sizeof(e->counters));
+ e->comefrom = 0;
+ return 0;
+
+out:
+ module_put(t->u.kernel.target->me);
+release_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
+ }
+ return ret;
+}
+
+static int
+compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
+ unsigned int *size, const char *name,
+ struct xt_table_info *newinfo, unsigned char *base)
+{
+ struct xt_entry_target *t;
+ struct ip6t_entry *de;
+ unsigned int origsize;
+ int ret, h;
+ struct xt_entry_match *ematch;
+
+ ret = 0;
+ origsize = *size;
+ de = (struct ip6t_entry *)*dstptr;
+ memcpy(de, e, sizeof(struct ip6t_entry));
+ memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+ *dstptr += sizeof(struct ip6t_entry);
+ *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_from_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
+ de->target_offset = e->target_offset - (origsize - *size);
+ t = compat_ip6t_get_target(e);
+ xt_compat_target_from_user(t, dstptr, size);
+
+ de->next_offset = e->next_offset - (origsize - *size);
+ for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+ if ((unsigned char *)de - base < newinfo->hook_entry[h])
+ newinfo->hook_entry[h] -= origsize - *size;
+ if ((unsigned char *)de - base < newinfo->underflow[h])
+ newinfo->underflow[h] -= origsize - *size;
+ }
+ return ret;
+}
+
+static int compat_check_entry(struct ip6t_entry *e, struct net *net,
+ const char *name)
+{
+ unsigned int j;
+ int ret = 0;
+ struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
+
+ j = 0;
+ mtpar.net = net;
+ mtpar.table = name;
+ mtpar.entryinfo = &e->ipv6;
+ mtpar.hook_mask = e->comefrom;
+ mtpar.family = NFPROTO_IPV6;
+ xt_ematch_foreach(ematch, e) {
+ ret = check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
+
+ ret = check_target(e, net, name);
+ if (ret)
+ goto cleanup_matches;
+ return 0;
+
+ cleanup_matches:
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
+ return ret;
+}
+
+static int
+translate_compat_table(struct net *net,
+ const char *name,
+ unsigned int valid_hooks,
+ struct xt_table_info **pinfo,
+ void **pentry0,
+ unsigned int total_size,
+ unsigned int number,
+ unsigned int *hook_entries,
+ unsigned int *underflows)
+{
+ unsigned int i, j;
+ struct xt_table_info *newinfo, *info;
+ void *pos, *entry0, *entry1;
+ struct compat_ip6t_entry *iter0;
+ struct ip6t_entry *iter1;
+ unsigned int size;
+ int ret = 0;
+
+ info = *pinfo;
+ entry0 = *pentry0;
+ size = total_size;
+ info->number = number;
+
+ /* Init all hooks to impossible value. */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ info->hook_entry[i] = 0xFFFFFFFF;
+ info->underflow[i] = 0xFFFFFFFF;
+ }
+
+ duprintf("translate_compat_table: size %u\n", info->size);
+ j = 0;
+ xt_compat_lock(AF_INET6);
+ xt_compat_init_offsets(AF_INET6, number);
+ /* Walk through entries, checking offsets. */
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
+
+ ret = -EINVAL;
+ if (j != number) {
+ duprintf("translate_compat_table: %u not %u entries\n",
+ j, number);
+ goto out_unlock;
+ }
+
+ /* Check hooks all assigned */
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ /* Only hooks which are valid */
+ if (!(valid_hooks & (1 << i)))
+ continue;
+ if (info->hook_entry[i] == 0xFFFFFFFF) {
+ duprintf("Invalid hook entry %u %u\n",
+ i, hook_entries[i]);
+ goto out_unlock;
+ }
+ if (info->underflow[i] == 0xFFFFFFFF) {
+ duprintf("Invalid underflow %u %u\n",
+ i, underflows[i]);
+ goto out_unlock;
+ }
+ }
+
+ ret = -ENOMEM;
+ newinfo = xt_alloc_table_info(size);
+ if (!newinfo)
+ goto out_unlock;
+
+ newinfo->number = number;
+ for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+ newinfo->hook_entry[i] = info->hook_entry[i];
+ newinfo->underflow[i] = info->underflow[i];
+ }
+ entry1 = newinfo->entries[raw_smp_processor_id()];
+ pos = entry1;
+ size = total_size;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
+ xt_compat_flush_offsets(AF_INET6);
+ xt_compat_unlock(AF_INET6);
+ if (ret)
+ goto free_newinfo;
+
+ ret = -ELOOP;
+ if (!mark_source_chains(newinfo, valid_hooks, entry1))
+ goto free_newinfo;
+
+ i = 0;
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = compat_check_entry(iter1, net, name);
+ if (ret != 0)
+ break;
+ ++i;
+ if (strcmp(ip6t_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
+ }
+ if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
+ j -= i;
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1, net);
+ }
+ xt_free_table_info(newinfo);
+ return ret;
+ }
+
+ /* And one copy for every other CPU */
+ for_each_possible_cpu(i)
+ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+ memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+ *pinfo = newinfo;
+ *pentry0 = entry1;
+ xt_free_table_info(info);
+ return 0;
+
+free_newinfo:
+ xt_free_table_info(newinfo);
+out:
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ return ret;
+out_unlock:
+ xt_compat_flush_offsets(AF_INET6);
+ xt_compat_unlock(AF_INET6);
+ goto out;
+}
+
+static int
+compat_do_replace(struct net *net, void __user *user, unsigned int len)
+{
+ int ret;
+ struct compat_ip6t_replace tmp;
+ struct xt_table_info *newinfo;
+ void *loc_cpu_entry;
+ struct ip6t_entry *iter;
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ return -EFAULT;
+
+ /* overflow check */
+ if (tmp.size >= INT_MAX / num_possible_cpus())
+ return -ENOMEM;
+ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+ return -ENOMEM;
+ if (tmp.num_counters == 0)
+ return -EINVAL;
+
+ tmp.name[sizeof(tmp.name)-1] = 0;
+
+ newinfo = xt_alloc_table_info(tmp.size);
+ if (!newinfo)
+ return -ENOMEM;
+
+ /* choose the copy that is on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+ tmp.size) != 0) {
+ ret = -EFAULT;
+ goto free_newinfo;
+ }
+
+ ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
+ &newinfo, &loc_cpu_entry, tmp.size,
+ tmp.num_entries, tmp.hook_entry,
+ tmp.underflow);
+ if (ret != 0)
+ goto free_newinfo;
+
+ duprintf("compat_do_replace: Translated table\n");
+
+ ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+ tmp.num_counters, compat_ptr(tmp.counters));
+ if (ret)
+ goto free_newinfo_untrans;
+ return 0;
+
+ free_newinfo_untrans:
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+}
+
+static int
+compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
+ unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IP6T_SO_SET_REPLACE:
+ ret = compat_do_replace(sock_net(sk), user, len);
+ break;
+
+ case IP6T_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 1);
+ break;
+
+ default:
+ duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct compat_ip6t_get_entries {
+ char name[XT_TABLE_MAXNAMELEN];
+ compat_uint_t size;
+ struct compat_ip6t_entry entrytable[0];
+};
+
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+ void __user *userptr)
+{
+ struct xt_counters *counters;
+ const struct xt_table_info *private = table->private;
+ void __user *pos;
+ unsigned int size;
+ int ret = 0;
+ const void *loc_cpu_entry;
+ unsigned int i = 0;
+ struct ip6t_entry *iter;
+
+ counters = alloc_counters(table);
+ if (IS_ERR(counters))
+ return PTR_ERR(counters);
+
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ pos = userptr;
+ size = total_size;
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
+
+ vfree(counters);
+ return ret;
+}
+
+static int
+compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
+ int *len)
+{
+ int ret;
+ struct compat_ip6t_get_entries get;
+ struct xt_table *t;
+
+ if (*len < sizeof(get)) {
+ duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+ return -EFAULT;
+
+ if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) {
+ duprintf("compat_get_entries: %u != %zu\n",
+ *len, sizeof(get) + get.size);
+ return -EINVAL;
+ }
+
+ xt_compat_lock(AF_INET6);
+ t = xt_find_table_lock(net, AF_INET6, get.name);
+ if (!IS_ERR_OR_NULL(t)) {
+ const struct xt_table_info *private = t->private;
+ struct xt_table_info info;
+ duprintf("t->private->number = %u\n", private->number);
+ ret = compat_table_info(private, &info);
+ if (!ret && get.size == info.size) {
+ ret = compat_copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ } else if (!ret) {
+ duprintf("compat_get_entries: I've got %u not %u!\n",
+ private->size, get.size);
+ ret = -EAGAIN;
+ }
+ xt_compat_flush_offsets(AF_INET6);
+ module_put(t->me);
+ xt_table_unlock(t);
+ } else
+ ret = t ? PTR_ERR(t) : -ENOENT;
+
+ xt_compat_unlock(AF_INET6);
+ return ret;
+}
+
+static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
+
+static int
+compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IP6T_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 1);
+ break;
+ case IP6T_SO_GET_ENTRIES:
+ ret = compat_get_entries(sock_net(sk), user, len);
+ break;
+ default:
+ ret = do_ip6t_get_ctl(sk, cmd, user, len);
+ }
+ return ret;
+}
+#endif
+
+static int
+do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IP6T_SO_SET_REPLACE:
+ ret = do_replace(sock_net(sk), user, len);
+ break;
+
+ case IP6T_SO_SET_ADD_COUNTERS:
+ ret = do_add_counters(sock_net(sk), user, len, 0);
+ break;
+
+ default:
+ duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int
+do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ int ret;
+
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case IP6T_SO_GET_INFO:
+ ret = get_info(sock_net(sk), user, len, 0);
+ break;
+
+ case IP6T_SO_GET_ENTRIES:
+ ret = get_entries(sock_net(sk), user, len);
+ break;
+
+ case IP6T_SO_GET_REVISION_MATCH:
+ case IP6T_SO_GET_REVISION_TARGET: {
+ struct xt_get_revision rev;
+ int target;
+
+ if (*len != sizeof(rev)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ rev.name[sizeof(rev.name)-1] = 0;
+
+ if (cmd == IP6T_SO_GET_REVISION_TARGET)
+ target = 1;
+ else
+ target = 0;
+
+ try_then_request_module(xt_find_revision(AF_INET6, rev.name,
+ rev.revision,
+ target, &ret),
+ "ip6t_%s", rev.name);
+ break;
+ }
+
+ default:
+ duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+struct xt_table *ip6t_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct ip6t_replace *repl)
+{
+ int ret;
+ struct xt_table_info *newinfo;
+ struct xt_table_info bootstrap = {0};
+ void *loc_cpu_entry;
+ struct xt_table *new_table;
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* choose the copy on our node/cpu, but dont care about preemption */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+ if (ret != 0)
+ goto out_free;
+
+ new_table = xt_register_table(net, table, &bootstrap, newinfo);
+ if (IS_ERR(new_table)) {
+ ret = PTR_ERR(new_table);
+ goto out_free;
+ }
+ return new_table;
+
+out_free:
+ xt_free_table_info(newinfo);
+out:
+ return ERR_PTR(ret);
+}
+
+void ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ip6t_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline bool
+icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+ u_int8_t type, u_int8_t code,
+ bool invert)
+{
+ return (type == test_type && code >= min_code && code <= max_code)
+ ^ invert;
+}
+
+static bool
+icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmp6hdr *ic;
+ struct icmp6hdr _icmph;
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (ic == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ duprintf("Dropping evil ICMP tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp6_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->icmp6_type, ic->icmp6_code,
+ !!(icmpinfo->invflags&IP6T_ICMP_INV));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct xt_target ip6t_builtin_tg[] __read_mostly = {
+ {
+ .name = XT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV6,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
+#endif
+ },
+ {
+ .name = XT_ERROR_TARGET,
+ .target = ip6t_error,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV6,
+ },
+};
+
+static struct nf_sockopt_ops ip6t_sockopts = {
+ .pf = PF_INET6,
+ .set_optmin = IP6T_BASE_CTL,
+ .set_optmax = IP6T_SO_SET_MAX+1,
+ .set = do_ip6t_set_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_set = compat_do_ip6t_set_ctl,
+#endif
+ .get_optmin = IP6T_BASE_CTL,
+ .get_optmax = IP6T_SO_GET_MAX+1,
+ .get = do_ip6t_get_ctl,
+#ifdef CONFIG_COMPAT
+ .compat_get = compat_do_ip6t_get_ctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+static struct xt_match ip6t_builtin_mt[] __read_mostly = {
+ {
+ .name = "icmp6",
+ .match = icmp6_match,
+ .matchsize = sizeof(struct ip6t_icmp),
+ .checkentry = icmp6_checkentry,
+ .proto = IPPROTO_ICMPV6,
+ .family = NFPROTO_IPV6,
+ },
+};
+
+static int __net_init ip6_tables_net_init(struct net *net)
+{
+ return xt_proto_init(net, NFPROTO_IPV6);
+}
+
+static void __net_exit ip6_tables_net_exit(struct net *net)
+{
+ xt_proto_fini(net, NFPROTO_IPV6);
+}
+
+static struct pernet_operations ip6_tables_net_ops = {
+ .init = ip6_tables_net_init,
+ .exit = ip6_tables_net_exit,
+};
+
+static int __init ip6_tables_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip6_tables_net_ops);
+ if (ret < 0)
+ goto err1;
+
+ /* No one else will be downing sem now, so we won't sleep */
+ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+ if (ret < 0)
+ goto err2;
+ ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+ if (ret < 0)
+ goto err4;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ip6t_sockopts);
+ if (ret < 0)
+ goto err5;
+
+ pr_info("(C) 2000-2006 Netfilter Core Team\n");
+ return 0;
+
+err5:
+ xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+err4:
+ xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+err2:
+ unregister_pernet_subsys(&ip6_tables_net_ops);
+err1:
+ return ret;
+}
+
+static void __exit ip6_tables_fini(void)
+{
+ nf_unregister_sockopt(&ip6t_sockopts);
+
+ xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+ xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+ unregister_pernet_subsys(&ip6_tables_net_ops);
+}
+
+EXPORT_SYMBOL(ip6t_register_table);
+EXPORT_SYMBOL(ip6t_unregister_table);
+EXPORT_SYMBOL(ip6t_do_table);
+
+module_init(ip6_tables_init);
+module_exit(ip6_tables_fini);
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
new file mode 100644
index 000000000..7f9f45d82
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+
+static unsigned int
+masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out);
+}
+
+static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target masquerade_tg6_reg __read_mostly = {
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV6,
+ .checkentry = masquerade_tg6_checkentry,
+ .target = masquerade_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .me = THIS_MODULE,
+};
+
+static int __init masquerade_tg6_init(void)
+{
+ int err;
+
+ err = xt_register_target(&masquerade_tg6_reg);
+ if (err == 0)
+ nf_nat_masquerade_ipv6_register_notifier();
+
+ return err;
+}
+static void __exit masquerade_tg6_exit(void)
+{
+ nf_nat_masquerade_ipv6_unregister_notifier();
+ xt_unregister_target(&masquerade_tg6_reg);
+}
+
+module_init(masquerade_tg6_init);
+module_exit(masquerade_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: automatic address SNAT");
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
new file mode 100644
index 000000000..590f767db
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6t_NPT.h>
+#include <linux/netfilter/x_tables.h>
+
+static int ip6t_npt_checkentry(const struct xt_tgchk_param *par)
+{
+ struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct in6_addr pfx;
+ __wsum src_sum, dst_sum;
+
+ if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64)
+ return -EINVAL;
+
+ /* Ensure that LSB of prefix is zero */
+ ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len);
+ if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6))
+ return -EINVAL;
+ ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len);
+ if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6))
+ return -EINVAL;
+
+ src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0);
+ dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0);
+
+ npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum));
+ return 0;
+}
+
+static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
+ struct in6_addr *addr)
+{
+ unsigned int pfx_len;
+ unsigned int i, idx;
+ __be32 mask;
+ __sum16 sum;
+
+ pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len);
+ for (i = 0; i < pfx_len; i += 32) {
+ if (pfx_len - i >= 32)
+ mask = 0;
+ else
+ mask = htonl((1 << (i - pfx_len + 32)) - 1);
+
+ idx = i / 32;
+ addr->s6_addr32[idx] &= mask;
+ addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx];
+ }
+
+ if (pfx_len <= 48)
+ idx = 3;
+ else {
+ for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) {
+ if ((__force __sum16)addr->s6_addr16[idx] !=
+ CSUM_MANGLED_0)
+ break;
+ }
+ if (idx == ARRAY_SIZE(addr->s6_addr16))
+ return false;
+ }
+
+ sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]),
+ csum_unfold(npt->adjustment)));
+ if (sum == CSUM_MANGLED_0)
+ sum = 0;
+ *(__force __sum16 *)&addr->s6_addr16[idx] = sum;
+
+ return true;
+}
+
+static unsigned int
+ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ip6t_npt_tginfo *npt = par->targinfo;
+
+ if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
+ offsetof(struct ipv6hdr, saddr));
+ return NF_DROP;
+ }
+ return XT_CONTINUE;
+}
+
+static unsigned int
+ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ip6t_npt_tginfo *npt = par->targinfo;
+
+ if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
+ offsetof(struct ipv6hdr, daddr));
+ return NF_DROP;
+ }
+ return XT_CONTINUE;
+}
+
+static struct xt_target ip6t_npt_target_reg[] __read_mostly = {
+ {
+ .name = "SNPT",
+ .table = "mangle",
+ .target = ip6t_snpt_tg,
+ .targetsize = sizeof(struct ip6t_npt_tginfo),
+ .checkentry = ip6t_npt_checkentry,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNPT",
+ .table = "mangle",
+ .target = ip6t_dnpt_tg,
+ .targetsize = sizeof(struct ip6t_npt_tginfo),
+ .checkentry = ip6t_npt_checkentry,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init ip6t_npt_init(void)
+{
+ return xt_register_targets(ip6t_npt_target_reg,
+ ARRAY_SIZE(ip6t_npt_target_reg));
+}
+
+static void __exit ip6t_npt_exit(void)
+{
+ xt_unregister_targets(ip6t_npt_target_reg,
+ ARRAY_SIZE(ip6t_npt_target_reg));
+}
+
+module_init(ip6t_npt_init);
+module_exit(ip6t_npt_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ip6t_SNPT");
+MODULE_ALIAS("ip6t_DNPT");
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 000000000..12331efd4
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,119 @@
+/*
+ * IP6 tables REJECT target module
+ * Linux INET6 implementation
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Authors:
+ * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netdevice.h>
+#include <net/icmp.h>
+#include <net/flow.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+
+#include <net/netfilter/ipv6/nf_reject.h>
+
+MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
+MODULE_LICENSE("GPL");
+
+
+static unsigned int
+reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ip6t_reject_info *reject = par->targinfo;
+ struct net *net = dev_net((par->in != NULL) ? par->in : par->out);
+
+ pr_debug("%s: medium point\n", __func__);
+ switch (reject->with) {
+ case IP6T_ICMP6_NO_ROUTE:
+ nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
+ break;
+ case IP6T_ICMP6_ADM_PROHIBITED:
+ nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum);
+ break;
+ case IP6T_ICMP6_NOT_NEIGHBOUR:
+ nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum);
+ break;
+ case IP6T_ICMP6_ADDR_UNREACH:
+ nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum);
+ break;
+ case IP6T_ICMP6_PORT_UNREACH:
+ nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum);
+ break;
+ case IP6T_ICMP6_ECHOREPLY:
+ /* Do nothing */
+ break;
+ case IP6T_TCP_RESET:
+ nf_send_reset6(net, skb, par->hooknum);
+ break;
+ default:
+ net_info_ratelimited("case %u not handled yet\n", reject->with);
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static int reject_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_reject_info *rejinfo = par->targinfo;
+ const struct ip6t_entry *e = par->entryinfo;
+
+ if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
+ pr_info("ECHOREPLY is not supported.\n");
+ return -EINVAL;
+ } else if (rejinfo->with == IP6T_TCP_RESET) {
+ /* Must specify that it's a TCP packet */
+ if (!(e->ipv6.flags & IP6T_F_PROTO) ||
+ e->ipv6.proto != IPPROTO_TCP ||
+ (e->ipv6.invflags & XT_INV_PROTO)) {
+ pr_info("TCP_RESET illegal for non-tcp\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static struct xt_target reject_tg6_reg __read_mostly = {
+ .name = "REJECT",
+ .family = NFPROTO_IPV6,
+ .target = reject_tg6,
+ .targetsize = sizeof(struct ip6t_reject_info),
+ .table = "filter",
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT),
+ .checkentry = reject_tg6_check,
+ .me = THIS_MODULE
+};
+
+static int __init reject_tg6_init(void)
+{
+ return xt_register_target(&reject_tg6_reg);
+}
+
+static void __exit reject_tg6_exit(void)
+{
+ xt_unregister_target(&reject_tg6_reg);
+}
+
+module_init(reject_tg6_init);
+module_exit(reject_tg6_exit);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
new file mode 100644
index 000000000..6edb7b106
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct ipv6hdr *
+synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ struct ipv6hdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph));
+ ip6_flow_hdr(iph, 0, 0);
+ iph->hop_limit = 64; //XXX
+ iph->nexthdr = IPPROTO_TCP;
+ iph->saddr = *saddr;
+ iph->daddr = *daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct ipv6hdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ struct net *net = nf_ct_net((struct nf_conn *)nfct);
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = niph->saddr;
+ fl6.daddr = niph->daddr;
+ fl6.fl6_sport = nth->source;
+ fl6.fl6_dport = nth->dest;
+ security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst == NULL || dst->error) {
+ dst_release(dst);
+ goto free_nskb;
+ }
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ goto free_nskb;
+
+ skb_dst_set(nskb, dst);
+
+ if (nfct) {
+ nskb->nfct = nfct;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nfct);
+ }
+
+ ip6_local_out(nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (nskb == NULL)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = ntohs(htons(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ int mss;
+
+ mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss = mss;
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ return true;
+}
+
+static unsigned int
+synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_synproxy_info *info = par->targinfo;
+ struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+ struct synproxy_options opts = {};
+ struct tcphdr *th, _th;
+
+ if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+ return NF_DROP;
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!synproxy_parse_options(skb, par->thoff, th, &opts))
+ return NF_DROP;
+
+ if (th->syn && !(th->ack || th->fin || th->rst)) {
+ /* Initial SYN from client */
+ this_cpu_inc(snet->stats->syn_received);
+
+ if (th->ece && th->cwr)
+ opts.options |= XT_SYNPROXY_OPT_ECN;
+
+ opts.options &= info->options;
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, &opts);
+ else
+ opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM |
+ XT_SYNPROXY_OPT_ECN);
+
+ synproxy_send_client_synack(skb, th, &opts);
+ return NF_DROP;
+
+ } else if (th->ack && !(th->fin || th->rst || th->syn)) {
+ /* ACK from client */
+ synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ return NF_DROP;
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct synproxy_net *snet = synproxy_pernet(dev_net(nhs->in ? : nhs->out));
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ __be16 frag_off;
+ u8 nexthdr;
+ int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (synproxy == NULL)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (thoff < 0)
+ return NF_ACCEPT;
+
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+ * adjustments, they will get initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+
+ /* fall through */
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+ * number match the one of first SYN.
+ */
+ if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ ntohl(th->seq) + 1))
+ this_cpu_inc(snet->stats->cookie_retrans);
+
+ return NF_DROP;
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->tsoff = opts.tsval - synproxy->its;
+
+ opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+ XT_SYNPROXY_OPT_WSCALE |
+ XT_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(snet, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+
+static int synproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_entry *e = par->entryinfo;
+
+ if (!(e->ipv6.flags & IP6T_F_PROTO) ||
+ e->ipv6.proto != IPPROTO_TCP ||
+ e->ipv6.invflags & XT_INV_PROTO)
+ return -EINVAL;
+
+ return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg6_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
+ .target = synproxy_tg6,
+ .targetsize = sizeof(struct xt_synproxy_info),
+ .checkentry = synproxy_tg6_check,
+ .destroy = synproxy_tg6_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = {
+ {
+ .hook = ipv6_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv6_synproxy_hook,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+static int __init synproxy_tg6_init(void)
+{
+ int err;
+
+ err = nf_register_hooks(ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+ if (err < 0)
+ goto err1;
+
+ err = xt_register_target(&synproxy_tg6_reg);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops));
+err1:
+ return err;
+}
+
+static void __exit synproxy_tg6_exit(void)
+{
+ xt_unregister_target(&synproxy_tg6_reg);
+ nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops));
+}
+
+module_init(synproxy_tg6_init);
+module_exit(synproxy_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
new file mode 100644
index 000000000..04099ab7d
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -0,0 +1,121 @@
+/* Kernel module to match AH parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_ah.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r = (spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip_auth_hdr _ah;
+ const struct ip_auth_hdr *ah;
+ const struct ip6t_ah *ahinfo = par->matchinfo;
+ unsigned int ptr = 0;
+ unsigned int hdrlen = 0;
+ int err;
+
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL, NULL);
+ if (err < 0) {
+ if (err != -ENOENT)
+ par->hotdrop = true;
+ return false;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
+ if (ah == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ hdrlen = (ah->hdrlen + 2) << 2;
+
+ pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
+ pr_debug("RES %04X ", ah->reserved);
+ pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
+
+ pr_debug("IPv6 AH spi %02X ",
+ spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IP6T_AH_INV_SPI)));
+ pr_debug("len %02X %04X %02X ",
+ ahinfo->hdrlen, hdrlen,
+ (!ahinfo->hdrlen ||
+ (ahinfo->hdrlen == hdrlen) ^
+ !!(ahinfo->invflags & IP6T_AH_INV_LEN)));
+ pr_debug("res %02X %04X %02X\n",
+ ahinfo->hdrres, ah->reserved,
+ !(ahinfo->hdrres && ah->reserved));
+
+ return (ah != NULL) &&
+ spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ ntohl(ah->spi),
+ !!(ahinfo->invflags & IP6T_AH_INV_SPI)) &&
+ (!ahinfo->hdrlen ||
+ (ahinfo->hdrlen == hdrlen) ^
+ !!(ahinfo->invflags & IP6T_AH_INV_LEN)) &&
+ !(ahinfo->hdrres && ah->reserved);
+}
+
+static int ah_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_ah *ahinfo = par->matchinfo;
+
+ if (ahinfo->invflags & ~IP6T_AH_INV_MASK) {
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match ah_mt6_reg __read_mostly = {
+ .name = "ah",
+ .family = NFPROTO_IPV6,
+ .match = ah_mt6,
+ .matchsize = sizeof(struct ip6t_ah),
+ .checkentry = ah_mt6_check,
+ .me = THIS_MODULE,
+};
+
+static int __init ah_mt6_init(void)
+{
+ return xt_register_match(&ah_mt6_reg);
+}
+
+static void __exit ah_mt6_exit(void)
+{
+ xt_unregister_match(&ah_mt6_reg);
+}
+
+module_init(ah_mt6_init);
+module_exit(ah_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
new file mode 100644
index 000000000..aab070690
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -0,0 +1,74 @@
+/* Kernel module to match EUI64 address parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/if_ether.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+static bool
+eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ unsigned char eui64[8];
+
+ if (!(skb_mac_header(skb) >= skb->head &&
+ skb_mac_header(skb) + ETH_HLEN <= skb->data) &&
+ par->fragoff != 0) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ memset(eui64, 0, sizeof(eui64));
+
+ if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) {
+ if (ipv6_hdr(skb)->version == 0x6) {
+ memcpy(eui64, eth_hdr(skb)->h_source, 3);
+ memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3);
+ eui64[3] = 0xff;
+ eui64[4] = 0xfe;
+ eui64[0] ^= 0x02;
+
+ if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64,
+ sizeof(eui64)))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static struct xt_match eui64_mt6_reg __read_mostly = {
+ .name = "eui64",
+ .family = NFPROTO_IPV6,
+ .match = eui64_mt6,
+ .matchsize = sizeof(int),
+ .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD),
+ .me = THIS_MODULE,
+};
+
+static int __init eui64_mt6_init(void)
+{
+ return xt_register_match(&eui64_mt6_reg);
+}
+
+static void __exit eui64_mt6_exit(void)
+{
+ xt_unregister_match(&eui64_mt6_reg);
+}
+
+module_init(eui64_mt6_init);
+module_exit(eui64_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
new file mode 100644
index 000000000..3b5735e56
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -0,0 +1,136 @@
+/* Kernel module to match FRAG parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_frag.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 fragment match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/* Returns 1 if the id is matched by the range, 0 otherwise */
+static inline bool
+id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
+{
+ bool r;
+ pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ',
+ min, id, max);
+ r = (id >= min && id <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool
+frag_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct frag_hdr _frag;
+ const struct frag_hdr *fh;
+ const struct ip6t_frag *fraginfo = par->matchinfo;
+ unsigned int ptr = 0;
+ int err;
+
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL);
+ if (err < 0) {
+ if (err != -ENOENT)
+ par->hotdrop = true;
+ return false;
+ }
+
+ fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
+ if (fh == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ pr_debug("INFO %04X ", fh->frag_off);
+ pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
+ pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
+ pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF));
+ pr_debug("ID %u %08X\n", ntohl(fh->identification),
+ ntohl(fh->identification));
+
+ pr_debug("IPv6 FRAG id %02X ",
+ id_match(fraginfo->ids[0], fraginfo->ids[1],
+ ntohl(fh->identification),
+ !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)));
+ pr_debug("res %02X %02X%04X %02X ",
+ fraginfo->flags & IP6T_FRAG_RES, fh->reserved,
+ ntohs(fh->frag_off) & 0x6,
+ !((fraginfo->flags & IP6T_FRAG_RES) &&
+ (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
+ pr_debug("first %02X %02X %02X ",
+ fraginfo->flags & IP6T_FRAG_FST,
+ ntohs(fh->frag_off) & ~0x7,
+ !((fraginfo->flags & IP6T_FRAG_FST) &&
+ (ntohs(fh->frag_off) & ~0x7)));
+ pr_debug("mf %02X %02X %02X ",
+ fraginfo->flags & IP6T_FRAG_MF,
+ ntohs(fh->frag_off) & IP6_MF,
+ !((fraginfo->flags & IP6T_FRAG_MF) &&
+ !((ntohs(fh->frag_off) & IP6_MF))));
+ pr_debug("last %02X %02X %02X\n",
+ fraginfo->flags & IP6T_FRAG_NMF,
+ ntohs(fh->frag_off) & IP6_MF,
+ !((fraginfo->flags & IP6T_FRAG_NMF) &&
+ (ntohs(fh->frag_off) & IP6_MF)));
+
+ return (fh != NULL) &&
+ id_match(fraginfo->ids[0], fraginfo->ids[1],
+ ntohl(fh->identification),
+ !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) &&
+ !((fraginfo->flags & IP6T_FRAG_RES) &&
+ (fh->reserved || (ntohs(fh->frag_off) & 0x6))) &&
+ !((fraginfo->flags & IP6T_FRAG_FST) &&
+ (ntohs(fh->frag_off) & ~0x7)) &&
+ !((fraginfo->flags & IP6T_FRAG_MF) &&
+ !(ntohs(fh->frag_off) & IP6_MF)) &&
+ !((fraginfo->flags & IP6T_FRAG_NMF) &&
+ (ntohs(fh->frag_off) & IP6_MF));
+}
+
+static int frag_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_frag *fraginfo = par->matchinfo;
+
+ if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) {
+ pr_debug("unknown flags %X\n", fraginfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match frag_mt6_reg __read_mostly = {
+ .name = "frag",
+ .family = NFPROTO_IPV6,
+ .match = frag_mt6,
+ .matchsize = sizeof(struct ip6t_frag),
+ .checkentry = frag_mt6_check,
+ .me = THIS_MODULE,
+};
+
+static int __init frag_mt6_init(void)
+{
+ return xt_register_match(&frag_mt6_reg);
+}
+
+static void __exit frag_mt6_exit(void)
+{
+ xt_unregister_match(&frag_mt6_reg);
+}
+
+module_init(frag_mt6_init);
+module_exit(frag_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
new file mode 100644
index 000000000..01df142bb
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -0,0 +1,215 @@
+/* Kernel module to match Hop-by-Hop and Destination parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <asm/byteorder.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_opts.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+MODULE_ALIAS("ip6t_dst");
+
+/*
+ * (Type & 0xC0) >> 6
+ * 0 -> ignorable
+ * 1 -> must drop the packet
+ * 2 -> send ICMP PARM PROB regardless and drop packet
+ * 3 -> Send ICMP if not a multicast address and drop packet
+ * (Type & 0x20) >> 5
+ * 0 -> invariant
+ * 1 -> can change the routing
+ * (Type & 0x1F) Type
+ * 0 -> Pad1 (only 1 byte!)
+ * 1 -> PadN LENGTH info (total length = length + 2)
+ * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k )
+ * 5 -> RTALERT 2 x x
+ */
+
+static struct xt_match hbh_mt6_reg[] __read_mostly;
+
+static bool
+hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ipv6_opt_hdr _optsh;
+ const struct ipv6_opt_hdr *oh;
+ const struct ip6t_opts *optinfo = par->matchinfo;
+ unsigned int temp;
+ unsigned int ptr = 0;
+ unsigned int hdrlen = 0;
+ bool ret = false;
+ u8 _opttype;
+ u8 _optlen;
+ const u_int8_t *tp = NULL;
+ const u_int8_t *lp = NULL;
+ unsigned int optlen;
+ int err;
+
+ err = ipv6_find_hdr(skb, &ptr,
+ (par->match == &hbh_mt6_reg[0]) ?
+ NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL);
+ if (err < 0) {
+ if (err != -ENOENT)
+ par->hotdrop = true;
+ return false;
+ }
+
+ oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh);
+ if (oh == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ hdrlen = ipv6_optlen(oh);
+ if (skb->len - ptr < hdrlen) {
+ /* Packet smaller than it's length field */
+ return false;
+ }
+
+ pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
+
+ pr_debug("len %02X %04X %02X ",
+ optinfo->hdrlen, hdrlen,
+ (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ((optinfo->hdrlen == hdrlen) ^
+ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
+
+ ret = (oh != NULL) &&
+ (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ((optinfo->hdrlen == hdrlen) ^
+ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)));
+
+ ptr += 2;
+ hdrlen -= 2;
+ if (!(optinfo->flags & IP6T_OPTS_OPTS)) {
+ return ret;
+ } else {
+ pr_debug("Strict ");
+ pr_debug("#%d ", optinfo->optsnr);
+ for (temp = 0; temp < optinfo->optsnr; temp++) {
+ /* type field exists ? */
+ if (hdrlen < 1)
+ break;
+ tp = skb_header_pointer(skb, ptr, sizeof(_opttype),
+ &_opttype);
+ if (tp == NULL)
+ break;
+
+ /* Type check */
+ if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) {
+ pr_debug("Tbad %02X %02X\n", *tp,
+ (optinfo->opts[temp] & 0xFF00) >> 8);
+ return false;
+ } else {
+ pr_debug("Tok ");
+ }
+ /* Length check */
+ if (*tp) {
+ u16 spec_len;
+
+ /* length field exists ? */
+ if (hdrlen < 2)
+ break;
+ lp = skb_header_pointer(skb, ptr + 1,
+ sizeof(_optlen),
+ &_optlen);
+ if (lp == NULL)
+ break;
+ spec_len = optinfo->opts[temp] & 0x00FF;
+
+ if (spec_len != 0x00FF && spec_len != *lp) {
+ pr_debug("Lbad %02X %04X\n", *lp,
+ spec_len);
+ return false;
+ }
+ pr_debug("Lok ");
+ optlen = *lp + 2;
+ } else {
+ pr_debug("Pad1\n");
+ optlen = 1;
+ }
+
+ /* Step to the next */
+ pr_debug("len%04X\n", optlen);
+
+ if ((ptr > skb->len - optlen || hdrlen < optlen) &&
+ temp < optinfo->optsnr - 1) {
+ pr_debug("new pointer is too large!\n");
+ break;
+ }
+ ptr += optlen;
+ hdrlen -= optlen;
+ }
+ if (temp == optinfo->optsnr)
+ return ret;
+ else
+ return false;
+ }
+
+ return false;
+}
+
+static int hbh_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_opts *optsinfo = par->matchinfo;
+
+ if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
+ pr_debug("unknown flags %X\n", optsinfo->invflags);
+ return -EINVAL;
+ }
+
+ if (optsinfo->flags & IP6T_OPTS_NSTRICT) {
+ pr_debug("Not strict - not implemented");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match hbh_mt6_reg[] __read_mostly = {
+ {
+ /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */
+ .name = "hbh",
+ .family = NFPROTO_IPV6,
+ .match = hbh_mt6,
+ .matchsize = sizeof(struct ip6t_opts),
+ .checkentry = hbh_mt6_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "dst",
+ .family = NFPROTO_IPV6,
+ .match = hbh_mt6,
+ .matchsize = sizeof(struct ip6t_opts),
+ .checkentry = hbh_mt6_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init hbh_mt6_init(void)
+{
+ return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
+}
+
+static void __exit hbh_mt6_exit(void)
+{
+ xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
+}
+
+module_init(hbh_mt6_init);
+module_exit(hbh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
new file mode 100644
index 000000000..8b147440f
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -0,0 +1,153 @@
+/* ipv6header match - matches IPv6 packets based
+ on whether they contain certain headers */
+
+/* Original idea: Brad Chapman
+ * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_ipv6header.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 header types match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+static bool
+ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ip6t_ipv6header_info *info = par->matchinfo;
+ unsigned int temp;
+ int len;
+ u8 nexthdr;
+ unsigned int ptr;
+
+ /* Make sure this isn't an evil packet */
+
+ /* type of the 1st exthdr */
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ /* pointer to the 1st exthdr */
+ ptr = sizeof(struct ipv6hdr);
+ /* available length */
+ len = skb->len - ptr;
+ temp = 0;
+
+ while (ip6t_ext_hdr(nexthdr)) {
+ const struct ipv6_opt_hdr *hp;
+ struct ipv6_opt_hdr _hdr;
+ int hdrlen;
+
+ /* No more exthdr -> evaluate */
+ if (nexthdr == NEXTHDR_NONE) {
+ temp |= MASK_NONE;
+ break;
+ }
+ /* Is there enough space for the next ext header? */
+ if (len < (int)sizeof(struct ipv6_opt_hdr))
+ return false;
+ /* ESP -> evaluate */
+ if (nexthdr == NEXTHDR_ESP) {
+ temp |= MASK_ESP;
+ break;
+ }
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ BUG_ON(hp == NULL);
+
+ /* Calculate the header length */
+ if (nexthdr == NEXTHDR_FRAGMENT)
+ hdrlen = 8;
+ else if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hp->hdrlen + 2) << 2;
+ else
+ hdrlen = ipv6_optlen(hp);
+
+ /* set the flag */
+ switch (nexthdr) {
+ case NEXTHDR_HOP:
+ temp |= MASK_HOPOPTS;
+ break;
+ case NEXTHDR_ROUTING:
+ temp |= MASK_ROUTING;
+ break;
+ case NEXTHDR_FRAGMENT:
+ temp |= MASK_FRAGMENT;
+ break;
+ case NEXTHDR_AUTH:
+ temp |= MASK_AH;
+ break;
+ case NEXTHDR_DEST:
+ temp |= MASK_DSTOPTS;
+ break;
+ default:
+ return false;
+ }
+
+ nexthdr = hp->nexthdr;
+ len -= hdrlen;
+ ptr += hdrlen;
+ if (ptr > skb->len)
+ break;
+ }
+
+ if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP)
+ temp |= MASK_PROTO;
+
+ if (info->modeflag)
+ return !((temp ^ info->matchflags ^ info->invflags)
+ & info->matchflags);
+ else {
+ if (info->invflags)
+ return temp != info->matchflags;
+ else
+ return temp == info->matchflags;
+ }
+}
+
+static int ipv6header_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_ipv6header_info *info = par->matchinfo;
+
+ /* invflags is 0 or 0xff in hard mode */
+ if ((!info->modeflag) && info->invflags != 0x00 &&
+ info->invflags != 0xFF)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match ipv6header_mt6_reg __read_mostly = {
+ .name = "ipv6header",
+ .family = NFPROTO_IPV6,
+ .match = ipv6header_mt6,
+ .matchsize = sizeof(struct ip6t_ipv6header_info),
+ .checkentry = ipv6header_mt6_check,
+ .destroy = NULL,
+ .me = THIS_MODULE,
+};
+
+static int __init ipv6header_mt6_init(void)
+{
+ return xt_register_match(&ipv6header_mt6_reg);
+}
+
+static void __exit ipv6header_mt6_exit(void)
+{
+ xt_unregister_match(&ipv6header_mt6_reg);
+}
+
+module_init(ipv6header_mt6_init);
+module_exit(ipv6header_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
new file mode 100644
index 000000000..0c90c66b1
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C)2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com>
+ *
+ * Based on net/netfilter/xt_tcpudp.c
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/mip6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_mh.h>
+
+MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match");
+MODULE_LICENSE("GPL");
+
+/* Returns 1 if the type is matched by the range, 0 otherwise */
+static inline bool
+type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert)
+{
+ return (type >= min && type <= max) ^ invert;
+}
+
+static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip6_mh _mh;
+ const struct ip6_mh *mh;
+ const struct ip6t_mh *mhinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh);
+ if (mh == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ pr_debug("Dropping evil MH tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ if (mh->ip6mh_proto != IPPROTO_NONE) {
+ pr_debug("Dropping invalid MH Payload Proto: %u\n",
+ mh->ip6mh_proto);
+ par->hotdrop = true;
+ return false;
+ }
+
+ return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type,
+ !!(mhinfo->invflags & IP6T_MH_INV_TYPE));
+}
+
+static int mh_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_mh *mhinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0;
+}
+
+static struct xt_match mh_mt6_reg __read_mostly = {
+ .name = "mh",
+ .family = NFPROTO_IPV6,
+ .checkentry = mh_mt6_check,
+ .match = mh_mt6,
+ .matchsize = sizeof(struct ip6t_mh),
+ .proto = IPPROTO_MH,
+ .me = THIS_MODULE,
+};
+
+static int __init mh_mt6_init(void)
+{
+ return xt_register_match(&mh_mt6_reg);
+}
+
+static void __exit mh_mt6_exit(void)
+{
+ xt_unregister_match(&mh_mt6_reg);
+}
+
+module_init(mh_mt6_init);
+module_exit(mh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
new file mode 100644
index 000000000..790e0c6b1
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/route.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+#include <linux/netfilter/xt_rpfilter.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match");
+
+static bool rpfilter_addr_unicast(const struct in6_addr *addr)
+{
+ int addr_type = ipv6_addr_type(addr);
+ return addr_type & IPV6_ADDR_UNICAST;
+}
+
+static bool rpfilter_lookup_reverse6(const struct sk_buff *skb,
+ const struct net_device *dev, u8 flags)
+{
+ struct rt6_info *rt;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool ret = false;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
+ .flowi6_proto = iph->nexthdr,
+ .daddr = iph->saddr,
+ };
+ int lookup_flags;
+
+ if (rpfilter_addr_unicast(&iph->daddr)) {
+ memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr));
+ lookup_flags = RT6_LOOKUP_F_HAS_SADDR;
+ } else {
+ lookup_flags = 0;
+ }
+
+ fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+ if ((flags & XT_RPFILTER_LOOSE) == 0) {
+ fl6.flowi6_oif = dev->ifindex;
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ }
+
+ rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags);
+ if (rt->dst.error)
+ goto out;
+
+ if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST))
+ goto out;
+
+ if (rt->rt6i_flags & RTF_LOCAL) {
+ ret = flags & XT_RPFILTER_ACCEPT_LOCAL;
+ goto out;
+ }
+
+ if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE))
+ ret = true;
+ out:
+ ip6_rt_put(rt);
+ return ret;
+}
+
+static bool rpfilter_is_local(const struct sk_buff *skb)
+{
+ const struct rt6_info *rt = (const void *) skb_dst(skb);
+ return rt && (rt->rt6i_flags & RTF_LOCAL);
+}
+
+static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ int saddrtype;
+ struct ipv6hdr *iph;
+ bool invert = info->flags & XT_RPFILTER_INVERT;
+
+ if (rpfilter_is_local(skb))
+ return true ^ invert;
+
+ iph = ipv6_hdr(skb);
+ saddrtype = ipv6_addr_type(&iph->saddr);
+ if (unlikely(saddrtype == IPV6_ADDR_ANY))
+ return true ^ invert; /* not routable: forward path will drop it */
+
+ return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert;
+}
+
+static int rpfilter_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_rpfilter_info *info = par->matchinfo;
+ unsigned int options = ~XT_RPFILTER_OPTION_MASK;
+
+ if (info->flags & options) {
+ pr_info("unknown options encountered");
+ return -EINVAL;
+ }
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "raw") != 0) {
+ pr_info("match only valid in the \'raw\' "
+ "or \'mangle\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rpfilter_mt_reg __read_mostly = {
+ .name = "rpfilter",
+ .family = NFPROTO_IPV6,
+ .checkentry = rpfilter_check,
+ .match = rpfilter_mt,
+ .matchsize = sizeof(struct xt_rpfilter_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING),
+ .me = THIS_MODULE
+};
+
+static int __init rpfilter_mt_init(void)
+{
+ return xt_register_match(&rpfilter_mt_reg);
+}
+
+static void __exit rpfilter_mt_exit(void)
+{
+ xt_unregister_match(&rpfilter_mt_reg);
+}
+
+module_init(rpfilter_mt_init);
+module_exit(rpfilter_mt_exit);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
new file mode 100644
index 000000000..2c99b94ee
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -0,0 +1,225 @@
+/* Kernel module to match ROUTING parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <asm/byteorder.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_rt.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/* Returns 1 if the id is matched by the range, 0 otherwise */
+static inline bool
+segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
+{
+ bool r;
+ pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, id, max);
+ r = (id >= min && id <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ipv6_rt_hdr _route;
+ const struct ipv6_rt_hdr *rh;
+ const struct ip6t_rt *rtinfo = par->matchinfo;
+ unsigned int temp;
+ unsigned int ptr = 0;
+ unsigned int hdrlen = 0;
+ bool ret = false;
+ struct in6_addr _addr;
+ const struct in6_addr *ap;
+ int err;
+
+ err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL);
+ if (err < 0) {
+ if (err != -ENOENT)
+ par->hotdrop = true;
+ return false;
+ }
+
+ rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
+ if (rh == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ hdrlen = ipv6_optlen(rh);
+ if (skb->len - ptr < hdrlen) {
+ /* Pcket smaller than its length field */
+ return false;
+ }
+
+ pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
+ pr_debug("TYPE %04X ", rh->type);
+ pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
+
+ pr_debug("IPv6 RT segsleft %02X ",
+ segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+ rh->segments_left,
+ !!(rtinfo->invflags & IP6T_RT_INV_SGS)));
+ pr_debug("type %02X %02X %02X ",
+ rtinfo->rt_type, rh->type,
+ (!(rtinfo->flags & IP6T_RT_TYP) ||
+ ((rtinfo->rt_type == rh->type) ^
+ !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
+ pr_debug("len %02X %04X %02X ",
+ rtinfo->hdrlen, hdrlen,
+ !(rtinfo->flags & IP6T_RT_LEN) ||
+ ((rtinfo->hdrlen == hdrlen) ^
+ !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
+ pr_debug("res %02X %02X %02X ",
+ rtinfo->flags & IP6T_RT_RES,
+ ((const struct rt0_hdr *)rh)->reserved,
+ !((rtinfo->flags & IP6T_RT_RES) &&
+ (((const struct rt0_hdr *)rh)->reserved)));
+
+ ret = (rh != NULL) &&
+ (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+ rh->segments_left,
+ !!(rtinfo->invflags & IP6T_RT_INV_SGS))) &&
+ (!(rtinfo->flags & IP6T_RT_LEN) ||
+ ((rtinfo->hdrlen == hdrlen) ^
+ !!(rtinfo->invflags & IP6T_RT_INV_LEN))) &&
+ (!(rtinfo->flags & IP6T_RT_TYP) ||
+ ((rtinfo->rt_type == rh->type) ^
+ !!(rtinfo->invflags & IP6T_RT_INV_TYP)));
+
+ if (ret && (rtinfo->flags & IP6T_RT_RES)) {
+ const u_int32_t *rp;
+ u_int32_t _reserved;
+ rp = skb_header_pointer(skb,
+ ptr + offsetof(struct rt0_hdr,
+ reserved),
+ sizeof(_reserved),
+ &_reserved);
+
+ ret = (*rp == 0);
+ }
+
+ pr_debug("#%d ", rtinfo->addrnr);
+ if (!(rtinfo->flags & IP6T_RT_FST)) {
+ return ret;
+ } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
+ pr_debug("Not strict ");
+ if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
+ pr_debug("There isn't enough space\n");
+ return false;
+ } else {
+ unsigned int i = 0;
+
+ pr_debug("#%d ", rtinfo->addrnr);
+ for (temp = 0;
+ temp < (unsigned int)((hdrlen - 8) / 16);
+ temp++) {
+ ap = skb_header_pointer(skb,
+ ptr
+ + sizeof(struct rt0_hdr)
+ + temp * sizeof(_addr),
+ sizeof(_addr),
+ &_addr);
+
+ BUG_ON(ap == NULL);
+
+ if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
+ pr_debug("i=%d temp=%d;\n", i, temp);
+ i++;
+ }
+ if (i == rtinfo->addrnr)
+ break;
+ }
+ pr_debug("i=%d #%d\n", i, rtinfo->addrnr);
+ if (i == rtinfo->addrnr)
+ return ret;
+ else
+ return false;
+ }
+ } else {
+ pr_debug("Strict ");
+ if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
+ pr_debug("There isn't enough space\n");
+ return false;
+ } else {
+ pr_debug("#%d ", rtinfo->addrnr);
+ for (temp = 0; temp < rtinfo->addrnr; temp++) {
+ ap = skb_header_pointer(skb,
+ ptr
+ + sizeof(struct rt0_hdr)
+ + temp * sizeof(_addr),
+ sizeof(_addr),
+ &_addr);
+ BUG_ON(ap == NULL);
+
+ if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
+ break;
+ }
+ pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr);
+ if (temp == rtinfo->addrnr &&
+ temp == (unsigned int)((hdrlen - 8) / 16))
+ return ret;
+ else
+ return false;
+ }
+ }
+
+ return false;
+}
+
+static int rt_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_rt *rtinfo = par->matchinfo;
+
+ if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
+ pr_debug("unknown flags %X\n", rtinfo->invflags);
+ return -EINVAL;
+ }
+ if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) &&
+ (!(rtinfo->flags & IP6T_RT_TYP) ||
+ (rtinfo->rt_type != 0) ||
+ (rtinfo->invflags & IP6T_RT_INV_TYP))) {
+ pr_debug("`--rt-type 0' required before `--rt-0-*'");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match rt_mt6_reg __read_mostly = {
+ .name = "rt",
+ .family = NFPROTO_IPV6,
+ .match = rt_mt6,
+ .matchsize = sizeof(struct ip6t_rt),
+ .checkentry = rt_mt6_check,
+ .me = THIS_MODULE,
+};
+
+static int __init rt_mt6_init(void)
+{
+ return xt_register_match(&rt_mt6_reg);
+}
+
+static void __exit rt_mt6_exit(void)
+{
+ xt_unregister_match(&rt_mt6_reg);
+}
+
+module_init(rt_mt6_init);
+module_exit(rt_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
new file mode 100644
index 000000000..5c33d8abc
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -0,0 +1,105 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_filter = {
+ .name = "filter",
+ .valid_hooks = FILTER_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_FILTER,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net = dev_net(state->in ? state->in : state->out);
+
+ return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_filter);
+}
+
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+static bool forward = true;
+module_param(forward, bool, 0000);
+
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ip6t_standard *)repl->entries)[1].target.verdict =
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+
+ net->ipv6.ip6table_filter =
+ ip6t_register_table(net, &packet_filter, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter);
+}
+
+static void __net_exit ip6table_filter_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+}
+
+static struct pernet_operations ip6table_filter_net_ops = {
+ .init = ip6table_filter_net_init,
+ .exit = ip6table_filter_net_exit,
+};
+
+static int __init ip6table_filter_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip6table_filter_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
+ if (IS_ERR(filter_ops)) {
+ ret = PTR_ERR(filter_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+ cleanup_table:
+ unregister_pernet_subsys(&ip6table_filter_net_ops);
+ return ret;
+}
+
+static void __exit ip6table_filter_fini(void)
+{
+ xt_hook_unlink(&packet_filter, filter_ops);
+ unregister_pernet_subsys(&ip6table_filter_net_ops);
+}
+
+module_init(ip6table_filter_init);
+module_exit(ip6table_filter_fini);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
new file mode 100644
index 000000000..b551f5b79
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -0,0 +1,145 @@
+/*
+ * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6
+ *
+ * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+ (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_POST_ROUTING))
+
+static const struct xt_table packet_mangler = {
+ .name = "mangle",
+ .valid_hooks = MANGLE_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_MANGLE,
+};
+
+static unsigned int
+ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+{
+ unsigned int ret;
+ struct in6_addr saddr, daddr;
+ u_int8_t hop_limit;
+ u_int32_t flowlabel, mark;
+ int err;
+#if 0
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)) {
+ net_warn_ratelimited("ip6t_hook: happy cracking\n");
+ return NF_ACCEPT;
+ }
+#endif
+
+ /* save source/dest address, mark, hoplimit, flowlabel, priority, */
+ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+ memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+ mark = skb->mark;
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+
+ /* flowlabel and prio (includes version, which shouldn't change either */
+ flowlabel = *((u_int32_t *)ipv6_hdr(skb));
+
+ ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, state,
+ dev_net(state->out)->ipv6.ip6table_mangle);
+
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
+ !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) ||
+ skb->mark != mark ||
+ ipv6_hdr(skb)->hop_limit != hop_limit ||
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
+ err = ip6_route_me_harder(skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+
+ return ret;
+}
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (ops->hooknum == NF_INET_LOCAL_OUT)
+ return ip6t_mangle_out(skb, state);
+ if (ops->hooknum == NF_INET_POST_ROUTING)
+ return ip6t_do_table(skb, ops->hooknum, state,
+ dev_net(state->out)->ipv6.ip6table_mangle);
+ /* INPUT/FORWARD */
+ return ip6t_do_table(skb, ops->hooknum, state,
+ dev_net(state->in)->ipv6.ip6table_mangle);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
+static int __net_init ip6table_mangle_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv6.ip6table_mangle =
+ ip6t_register_table(net, &packet_mangler, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle);
+}
+
+static void __net_exit ip6table_mangle_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+}
+
+static struct pernet_operations ip6table_mangle_net_ops = {
+ .init = ip6table_mangle_net_init,
+ .exit = ip6table_mangle_net_exit,
+};
+
+static int __init ip6table_mangle_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip6table_mangle_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+ cleanup_table:
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ return ret;
+}
+
+static void __exit ip6table_mangle_fini(void)
+{
+ xt_hook_unlink(&packet_mangler, mangle_ops);
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+}
+
+module_init(ip6table_mangle_init);
+module_exit(ip6table_mangle_fini);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
new file mode 100644
index 000000000..c3a7f7af0
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT
+ * funded by Astaro.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv6_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+};
+
+static unsigned int ip6table_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_nat);
+}
+
+static unsigned int ip6table_nat_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_fn(ops, skb, state, ip6table_nat_do_chain);
+}
+
+static unsigned int ip6table_nat_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_in(ops, skb, state, ip6table_nat_do_chain);
+}
+
+static unsigned int ip6table_nat_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_out(ops, skb, state, ip6table_nat_do_chain);
+}
+
+static unsigned int ip6table_nat_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_local_fn(ops, skb, state, ip6table_nat_do_chain);
+}
+
+static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = ip6table_nat_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = ip6table_nat_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = ip6table_nat_local_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = ip6table_nat_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+};
+
+static int __net_init ip6table_nat_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat);
+}
+
+static void __net_exit ip6table_nat_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+}
+
+static struct pernet_operations ip6table_nat_net_ops = {
+ .init = ip6table_nat_net_init,
+ .exit = ip6table_nat_net_exit,
+};
+
+static int __init ip6table_nat_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&ip6table_nat_net_ops);
+ if (err < 0)
+ goto err1;
+
+ err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+err1:
+ return err;
+}
+
+static void __exit ip6table_nat_exit(void)
+{
+ nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+}
+
+module_init(ip6table_nat_init);
+module_exit(ip6table_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
new file mode 100644
index 000000000..0b33caad2
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -0,0 +1,85 @@
+/*
+ * IPv6 raw table, a port of the IPv4 raw table to IPv6
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_raw = {
+ .name = "raw",
+ .valid_hooks = RAW_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_RAW,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net = dev_net(state->in ? state->in : state->out);
+
+ return ip6t_do_table(skb, ops->hooknum, state, net->ipv6.ip6table_raw);
+}
+
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int __net_init ip6table_raw_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&packet_raw);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv6.ip6table_raw =
+ ip6t_register_table(net, &packet_raw, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw);
+}
+
+static void __net_exit ip6table_raw_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+}
+
+static struct pernet_operations ip6table_raw_net_ops = {
+ .init = ip6table_raw_net_init,
+ .exit = ip6table_raw_net_exit,
+};
+
+static int __init ip6table_raw_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip6table_raw_net_ops);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
+ if (IS_ERR(rawtable_ops)) {
+ ret = PTR_ERR(rawtable_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+ cleanup_table:
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+ return ret;
+}
+
+static void __exit ip6table_raw_fini(void)
+{
+ xt_hook_unlink(&packet_raw, rawtable_ops);
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+}
+
+module_init(ip6table_raw_init);
+module_exit(ip6table_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
new file mode 100644
index 000000000..fcef83c25
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -0,0 +1,101 @@
+/*
+ * "security" table for IPv6
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
+ (1 << NF_INET_FORWARD) | \
+ (1 << NF_INET_LOCAL_OUT)
+
+static const struct xt_table security_table = {
+ .name = "security",
+ .valid_hooks = SECURITY_VALID_HOOKS,
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+ .priority = NF_IP6_PRI_SECURITY,
+};
+
+static unsigned int
+ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct net *net = dev_net(state->in ? state->in : state->out);
+
+ return ip6t_do_table(skb, ops->hooknum, state,
+ net->ipv6.ip6table_security);
+}
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int __net_init ip6table_security_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv6.ip6table_security =
+ ip6t_register_table(net, &security_table, repl);
+ kfree(repl);
+ return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security);
+}
+
+static void __net_exit ip6table_security_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_security);
+}
+
+static struct pernet_operations ip6table_security_net_ops = {
+ .init = ip6table_security_net_init,
+ .exit = ip6table_security_net_exit,
+};
+
+static int __init ip6table_security_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip6table_security_net_ops);
+ if (ret < 0)
+ return ret;
+
+ sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
+ if (IS_ERR(sectbl_ops)) {
+ ret = PTR_ERR(sectbl_ops);
+ goto cleanup_table;
+ }
+
+ return ret;
+
+cleanup_table:
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+ return ret;
+}
+
+static void __exit ip6table_security_fini(void)
+{
+ xt_hook_unlink(&security_table, sectbl_ops);
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+}
+
+module_init(ip6table_security_init);
+module_exit(ip6table_security_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
new file mode 100644
index 000000000..4ba0c34c6
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_log.h>
+
+static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const u_int32_t *ap;
+ u_int32_t _addrs[8];
+
+ ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
+ sizeof(_addrs), _addrs);
+ if (ap == NULL)
+ return false;
+
+ memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
+
+ return true;
+}
+
+static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
+
+ return true;
+}
+
+static void ipv6_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "src=%pI6 dst=%pI6 ",
+ tuple->src.u3.ip6, tuple->dst.u3.ip6);
+}
+
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ unsigned int *dataoff, u_int8_t *protonum)
+{
+ unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+ __be16 frag_off;
+ int protoff;
+ u8 nexthdr;
+
+ if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+ &nexthdr, sizeof(nexthdr)) != 0) {
+ pr_debug("ip6_conntrack_core: can't get nexthdr\n");
+ return -NF_ACCEPT;
+ }
+ protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
+ /*
+ * (protoff == skb->len) means the packet has not data, just
+ * IPv6 and possibly extensions headers, but it is tracked anyway
+ */
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
+ return -NF_ACCEPT;
+ }
+
+ *dataoff = protoff;
+ *protonum = nexthdr;
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv6_helper(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+ enum ip_conntrack_info ctinfo;
+ __be16 frag_off;
+ int protoff;
+ u8 nexthdr;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+
+ return helper->help(skb, protoff, ct, ctinfo);
+}
+
+static unsigned int ipv6_confirm(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+ int protoff;
+ __be16 frag_off;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ goto out;
+ }
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(dev_net(state->in), PF_INET6, ops->hooknum, skb);
+}
+
+static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct ipv6hdr)) {
+ net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
+ return NF_ACCEPT;
+ }
+ return nf_conntrack_in(dev_net(state->out), PF_INET6, ops->hooknum, skb);
+}
+
+static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
+ {
+ .hook = ipv6_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_LAST,
+ },
+ {
+ .hook = ipv6_helper,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_LAST-1,
+ },
+};
+
+static int
+ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct sockaddr_in6 sin6;
+ struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
+ struct nf_conn *ct;
+
+ tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.in6 = sk->sk_v6_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.dst.protonum = sk->sk_protocol;
+
+ if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP)
+ return -ENOPROTOOPT;
+
+ if (*len < 0 || (unsigned int) *len < sizeof(sin6))
+ return -EINVAL;
+
+ h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+ if (!h) {
+ pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
+ &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ sin6.sin6_flowinfo = inet6->flow_label & IPV6_FLOWINFO_MASK;
+ memcpy(&sin6.sin6_addr,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
+ sizeof(sin6.sin6_addr));
+
+ nf_ct_put(ct);
+ sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr,
+ sk->sk_bound_dev_if);
+ return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
+ nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
+ [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 },
+ [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 },
+};
+
+static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
+ return -EINVAL;
+
+ t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+ t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+
+ return 0;
+}
+
+static int ipv6_nlattr_tuple_size(void)
+{
+ return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1);
+}
+#endif
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
+ .l3proto = PF_INET6,
+ .name = "ipv6",
+ .pkt_to_tuple = ipv6_pkt_to_tuple,
+ .invert_tuple = ipv6_invert_tuple,
+ .print_tuple = ipv6_print_tuple,
+ .get_l4proto = ipv6_get_l4proto,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = ipv6_tuple_to_nlattr,
+ .nlattr_tuple_size = ipv6_nlattr_tuple_size,
+ .nlattr_to_tuple = ipv6_nlattr_to_tuple,
+ .nla_policy = ipv6_nla_policy,
+#endif
+ .me = THIS_MODULE,
+};
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
+
+static struct nf_sockopt_ops so_getorigdst6 = {
+ .pf = NFPROTO_IPV6,
+ .get_optmin = IP6T_SO_ORIGINAL_DST,
+ .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
+ .get = ipv6_getorigdst,
+ .owner = THIS_MODULE,
+};
+
+static int ipv6_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_tcp6: pernet registration failed\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udp6: pernet registration failed\n");
+ goto cleanup_tcp6;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmpv6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_icmp6: pernet registration failed\n");
+ goto cleanup_udp6;
+ }
+ ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: pernet registration failed.\n");
+ goto cleanup_icmpv6;
+ }
+ return 0;
+ cleanup_icmpv6:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
+ cleanup_udp6:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
+ cleanup_tcp6:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
+ out:
+ return ret;
+}
+
+static void ipv6_net_exit(struct net *net)
+{
+ nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmpv6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp6);
+}
+
+static struct pernet_operations ipv6_net_ops = {
+ .init = ipv6_net_init,
+ .exit = ipv6_net_exit,
+};
+
+static int __init nf_conntrack_l3proto_ipv6_init(void)
+{
+ int ret = 0;
+
+ need_conntrack();
+ nf_defrag_ipv6_enable();
+
+ ret = nf_register_sockopt(&so_getorigdst6);
+ if (ret < 0) {
+ pr_err("Unable to register netfilter socket option\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&ipv6_net_ops);
+ if (ret < 0)
+ goto cleanup_sockopt;
+
+ ret = nf_register_hooks(ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: can't register pre-routing defrag "
+ "hook.\n");
+ goto cleanup_pernet;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: can't register tcp6 proto.\n");
+ goto cleanup_hooks;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: can't register udp6 proto.\n");
+ goto cleanup_tcp6;
+ }
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmpv6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: can't register icmpv6 proto.\n");
+ goto cleanup_udp6;
+ }
+
+ ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
+ goto cleanup_icmpv6;
+ }
+ return ret;
+
+ cleanup_icmpv6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+ cleanup_udp6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+ cleanup_tcp6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+ cleanup_hooks:
+ nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+ cleanup_pernet:
+ unregister_pernet_subsys(&ipv6_net_ops);
+ cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst6);
+ return ret;
+}
+
+static void __exit nf_conntrack_l3proto_ipv6_fini(void)
+{
+ synchronize_net();
+ nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+ nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+ unregister_pernet_subsys(&ipv6_net_ops);
+ nf_unregister_sockopt(&so_getorigdst6);
+}
+
+module_init(nf_conntrack_l3proto_ipv6_init);
+module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
new file mode 100644
index 000000000..90388d606
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ;
+
+static inline struct nf_icmp_net *icmpv6_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmpv6;
+}
+
+static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct icmp6hdr *hp;
+ struct icmp6hdr _hdr;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+ tuple->dst.u.icmp.type = hp->icmp6_type;
+ tuple->src.u.icmp.id = hp->icmp6_identifier;
+ tuple->dst.u.icmp.code = hp->icmp6_code;
+
+ return true;
+}
+
+/* Add 1; spaces filled with 0. */
+static const u_int8_t invmap[] = {
+ [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1,
+ [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1,
+ [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1,
+ [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1
+};
+
+static const u_int8_t noct_valid_new[] = {
+ [ICMPV6_MGM_QUERY - 130] = 1,
+ [ICMPV6_MGM_REPORT -130] = 1,
+ [ICMPV6_MGM_REDUCTION - 130] = 1,
+ [NDISC_ROUTER_SOLICITATION - 130] = 1,
+ [NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
+ [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
+ [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
+ [ICMPV6_MLD2_REPORT - 130] = 1
+};
+
+static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ int type = orig->dst.u.icmp.type - 128;
+ if (type < 0 || type >= sizeof(invmap) || !invmap[type])
+ return false;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void icmpv6_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+static unsigned int *icmpv6_get_timeouts(struct net *net)
+{
+ return &icmpv6_pernet(net)->timeout;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmpv6_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
+{
+ /* Do not immediately delete the connection after the first
+ successful reply to avoid excessive conntrackd traffic
+ and also to handle correctly ICMP echo reply duplicates. */
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ static const u_int8_t valid_new[] = {
+ [ICMPV6_ECHO_REQUEST - 128] = 1,
+ [ICMPV6_NI_QUERY - 128] = 1
+ };
+ int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
+
+ if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
+ /* Can't create a new ICMPv6 `conn' with this. */
+ pr_debug("icmpv6: can't create new conn with type %u\n",
+ type + 128);
+ nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
+ if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6))
+ nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL,
+ NULL, NULL,
+ "nf_ct_icmpv6: invalid new with type %d ",
+ type + 128);
+ return false;
+ }
+ return true;
+}
+
+static int
+icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int icmp6off,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct nf_conntrack_tuple intuple, origtuple;
+ const struct nf_conntrack_tuple_hash *h;
+ const struct nf_conntrack_l4proto *inproto;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+
+ NF_CT_ASSERT(skb->nfct == NULL);
+
+ /* Are they talking about one of our connections? */
+ if (!nf_ct_get_tuplepr(skb,
+ skb_network_offset(skb)
+ + sizeof(struct ipv6hdr)
+ + sizeof(struct icmp6hdr),
+ PF_INET6, &origtuple)) {
+ pr_debug("icmpv6_error: Can't get tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!nf_ct_invert_tuple(&intuple, &origtuple,
+ &nf_conntrack_l3proto_ipv6, inproto)) {
+ pr_debug("icmpv6_error: Can't invert tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = nf_conntrack_find_get(net, zone, &intuple);
+ if (!h) {
+ pr_debug("icmpv6_error: no match\n");
+ return -NF_ACCEPT;
+ } else {
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ }
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return NF_ACCEPT;
+}
+
+static int
+icmpv6_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+{
+ const struct icmp6hdr *icmp6h;
+ struct icmp6hdr _ih;
+ int type;
+
+ icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
+ if (icmp6h == NULL) {
+ if (LOG_INVALID(net, IPPROTO_ICMPV6))
+ nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
+ if (LOG_INVALID(net, IPPROTO_ICMPV6))
+ nf_log_packet(net, PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: ICMPv6 checksum failed ");
+ return -NF_ACCEPT;
+ }
+
+ type = icmp6h->icmp6_type - 130;
+ if (type >= 0 && type < sizeof(noct_valid_new) &&
+ noct_valid_new[type]) {
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+ return NF_ACCEPT;
+ }
+
+ /* is not error message ? */
+ if (icmp6h->icmp6_type >= 128)
+ return NF_ACCEPT;
+
+ return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+static int icmpv6_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *t)
+{
+ if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) ||
+ nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) ||
+ nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 },
+};
+
+static int icmpv6_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *tuple)
+{
+ if (!tb[CTA_PROTO_ICMPV6_TYPE] ||
+ !tb[CTA_PROTO_ICMPV6_CODE] ||
+ !tb[CTA_PROTO_ICMPV6_ID])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
+
+ if (tuple->dst.u.icmp.type < 128 ||
+ tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type - 128])
+ return -EINVAL;
+
+ return 0;
+}
+
+static int icmpv6_nlattr_tuple_size(void)
+{
+ return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_icmp_net *in = icmpv6_pernet(net);
+
+ if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
+ } else {
+ /* Set default ICMPv6 timeout. */
+ *timeout = in->timeout;
+ }
+ return 0;
+}
+
+static int
+icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = {
+ [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table icmpv6_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_icmpv6_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_icmp_net *in)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(icmpv6_sysctl_table,
+ sizeof(icmpv6_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &in->timeout;
+#endif
+ return 0;
+}
+
+static int icmpv6_init_net(struct net *net, u_int16_t proto)
+{
+ struct nf_icmp_net *in = icmpv6_pernet(net);
+ struct nf_proto_net *pn = &in->pn;
+
+ in->timeout = nf_ct_icmpv6_timeout;
+
+ return icmpv6_kmemdup_sysctl_table(pn, in);
+}
+
+static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.icmpv6.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_ICMPV6,
+ .name = "icmpv6",
+ .pkt_to_tuple = icmpv6_pkt_to_tuple,
+ .invert_tuple = icmpv6_invert_tuple,
+ .print_tuple = icmpv6_print_tuple,
+ .packet = icmpv6_packet,
+ .get_timeouts = icmpv6_get_timeouts,
+ .new = icmpv6_new,
+ .error = icmpv6_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = icmpv6_tuple_to_nlattr,
+ .nlattr_tuple_size = icmpv6_nlattr_tuple_size,
+ .nlattr_to_tuple = icmpv6_nlattr_to_tuple,
+ .nla_policy = icmpv6_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj,
+ .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = icmpv6_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = icmpv6_init_net,
+ .get_net_proto = icmpv6_get_net_proto,
+};
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
new file mode 100644
index 000000000..6f187c8d8
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -0,0 +1,697 @@
+/*
+ * IPv6 fragment reassembly for connection tracking
+ *
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on: net/ipv6/reassembly.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv6-nf: " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+#include <net/inet_frag.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/inet_ecn.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <linux/sysctl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static const char nf_frags_cache_name[] = "nf-frags";
+
+struct nf_ct_frag6_skb_cb
+{
+ struct inet6_skb_parm h;
+ int offset;
+ struct sk_buff *orig;
+};
+
+#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb))
+
+static struct inet_frags nf_frags;
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_frag6_timeout",
+ .data = &init_net.nf_frag.frags.timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_frag6_low_thresh",
+ .data = &init_net.nf_frag.frags.low_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.nf_frag.frags.high_thresh
+ },
+ {
+ .procname = "nf_conntrack_frag6_high_thresh",
+ .data = &init_net.nf_frag.frags.high_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.nf_frag.frags.low_thresh
+ },
+ { }
+};
+
+static int nf_ct_frag6_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = nf_ct_frag6_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table),
+ GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
+
+ table[0].data = &net->nf_frag.frags.timeout;
+ table[1].data = &net->nf_frag.frags.low_thresh;
+ table[1].extra2 = &net->nf_frag.frags.high_thresh;
+ table[2].data = &net->nf_frag.frags.high_thresh;
+ table[2].extra1 = &net->nf_frag.frags.low_thresh;
+ table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/netfilter", table);
+ if (hdr == NULL)
+ goto err_reg;
+
+ net->nf_frag.sysctl.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+#else
+static int nf_ct_frag6_sysctl_register(struct net *net)
+{
+ return 0;
+}
+static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
+{
+}
+#endif
+
+static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+{
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+}
+
+static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
+ return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+ (__force u32)id, nf_frags.rnd);
+}
+
+
+static unsigned int nf_hashfn(const struct inet_frag_queue *q)
+{
+ const struct frag_queue *nq;
+
+ nq = container_of(q, struct frag_queue, q);
+ return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
+}
+
+static void nf_skb_free(struct sk_buff *skb)
+{
+ if (NFCT_FRAG6_CB(skb)->orig)
+ kfree_skb(NFCT_FRAG6_CB(skb)->orig);
+}
+
+static void nf_ct_frag6_expire(unsigned long data)
+{
+ struct frag_queue *fq;
+ struct net *net;
+
+ fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+ net = container_of(fq->q.net, struct net, nf_frag.frags);
+
+ ip6_expire_frag_queue(net, fq, &nf_frags);
+}
+
+/* Creation primitives. */
+static inline struct frag_queue *fq_find(struct net *net, __be32 id,
+ u32 user, struct in6_addr *src,
+ struct in6_addr *dst, u8 ecn)
+{
+ struct inet_frag_queue *q;
+ struct ip6_create_arg arg;
+ unsigned int hash;
+
+ arg.id = id;
+ arg.user = user;
+ arg.src = src;
+ arg.dst = dst;
+ arg.ecn = ecn;
+
+ local_bh_disable();
+ hash = nf_hash_frag(id, src, dst);
+
+ q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
+ local_bh_enable();
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
+ return container_of(q, struct frag_queue, q);
+}
+
+
+static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
+ const struct frag_hdr *fhdr, int nhoff)
+{
+ struct sk_buff *prev, *next;
+ unsigned int payload_len;
+ int offset, end;
+ u8 ecn;
+
+ if (fq->q.flags & INET_FRAG_COMPLETE) {
+ pr_debug("Already completed\n");
+ goto err;
+ }
+
+ payload_len = ntohs(ipv6_hdr(skb)->payload_len);
+
+ offset = ntohs(fhdr->frag_off) & ~0x7;
+ end = offset + (payload_len -
+ ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
+
+ if ((unsigned int)end > IPV6_MAXPLEN) {
+ pr_debug("offset is too large.\n");
+ return -1;
+ }
+
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ const unsigned char *nh = skb_network_header(skb);
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(nh, (u8 *)(fhdr + 1) - nh,
+ 0));
+ }
+
+ /* Is this the final fragment? */
+ if (!(fhdr->frag_off & htons(IP6_MF))) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) {
+ pr_debug("already received last fragment\n");
+ goto err;
+ }
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
+ } else {
+ /* Check if the fragment is rounded to 8 bytes.
+ * Required by the RFC.
+ */
+ if (end & 0x7) {
+ /* RFC2460 says always send parameter problem in
+ * this case. -DaveM
+ */
+ pr_debug("end of fragment not rounded to 8 bytes.\n");
+ return -1;
+ }
+ if (end > fq->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->q.flags & INET_FRAG_LAST_IN) {
+ pr_debug("last packet already reached.\n");
+ goto err;
+ }
+ fq->q.len = end;
+ }
+ }
+
+ if (end == offset)
+ goto err;
+
+ /* Point into the IP datagram 'data' part. */
+ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
+ pr_debug("queue: message is too short.\n");
+ goto err;
+ }
+ if (pskb_trim_rcsum(skb, end - offset)) {
+ pr_debug("Can't trim\n");
+ goto err;
+ }
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = fq->q.fragments_tail;
+ if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
+ prev = NULL;
+ for (next = fq->q.fragments; next != NULL; next = next->next) {
+ if (NFCT_FRAG6_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+found:
+ /* RFC5722, Section 4:
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments, including those not yet received) MUST be silently
+ * discarded.
+ */
+
+ /* Check for overlap with preceding fragment. */
+ if (prev &&
+ (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset)
+ goto discard_fq;
+
+ /* Look for overlap with succeeding segment. */
+ if (next && NFCT_FRAG6_CB(next)->offset < end)
+ goto discard_fq;
+
+ NFCT_FRAG6_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (!next)
+ fq->q.fragments_tail = skb;
+ if (prev)
+ prev->next = skb;
+ else
+ fq->q.fragments = skb;
+
+ if (skb->dev) {
+ fq->iif = skb->dev->ifindex;
+ skb->dev = NULL;
+ }
+ fq->q.stamp = skb->tstamp;
+ fq->q.meat += skb->len;
+ fq->ecn |= ecn;
+ if (payload_len > fq->q.max_size)
+ fq->q.max_size = payload_len;
+ add_frag_mem_limit(&fq->q, skb->truesize);
+
+ /* The first fragment.
+ * nhoffset is obtained from the first fragment, of course.
+ */
+ if (offset == 0) {
+ fq->nhoffset = nhoff;
+ fq->q.flags |= INET_FRAG_FIRST_IN;
+ }
+
+ return 0;
+
+discard_fq:
+ inet_frag_kill(&fq->q, &nf_frags);
+err:
+ return -1;
+}
+
+/*
+ * Check if this packet is complete.
+ * Returns NULL on failure by any reason, and pointer
+ * to current nexthdr field in reassembled frame.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
+ */
+static struct sk_buff *
+nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
+{
+ struct sk_buff *fp, *op, *head = fq->q.fragments;
+ int payload_len;
+ u8 ecn;
+
+ inet_frag_kill(&fq->q, &nf_frags);
+
+ WARN_ON(head == NULL);
+ WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto out_fail;
+
+ /* Unfragmented part is taken from the first segment. */
+ payload_len = ((head->data - skb_network_header(head)) -
+ sizeof(struct ipv6hdr) + fq->q.len -
+ sizeof(struct frag_hdr));
+ if (payload_len > IPV6_MAXPLEN) {
+ pr_debug("payload len is too large.\n");
+ goto out_oversize;
+ }
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC)) {
+ pr_debug("skb is cloned but can't expand head");
+ goto out_oom;
+ }
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (clone == NULL)
+ goto out_oom;
+
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+
+ NFCT_FRAG6_CB(clone)->orig = NULL;
+ add_frag_mem_limit(&fq->q, clone->truesize);
+ }
+
+ /* We have to remove fragment header from datagram and to relocate
+ * header in order to calculate ICV correctly. */
+ skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
+ memmove(head->head + sizeof(struct frag_hdr), head->head,
+ (head->data - head->head) - sizeof(struct frag_hdr));
+ head->mac_header += sizeof(struct frag_hdr);
+ head->network_header += sizeof(struct frag_hdr);
+
+ skb_shinfo(head)->frag_list = head->next;
+ skb_reset_transport_header(head);
+ skb_push(head, head->data - skb_network_header(head));
+
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ }
+ sub_frag_mem_limit(&fq->q, head->truesize);
+
+ head->ignore_df = 1;
+ head->next = NULL;
+ head->dev = dev;
+ head->tstamp = fq->q.stamp;
+ ipv6_hdr(head)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
+ IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+
+ /* Yes, and fold redundant checksum back. 8) */
+ if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_partial(skb_network_header(head),
+ skb_network_header_len(head),
+ head->csum);
+
+ fq->q.fragments = NULL;
+ fq->q.fragments_tail = NULL;
+
+ /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+ fp = skb_shinfo(head)->frag_list;
+ if (fp && NFCT_FRAG6_CB(fp)->orig == NULL)
+ /* at above code, head skb is divided into two skbs. */
+ fp = fp->next;
+
+ op = NFCT_FRAG6_CB(head)->orig;
+ for (; fp; fp = fp->next) {
+ struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
+
+ op->next = orig;
+ op = orig;
+ NFCT_FRAG6_CB(fp)->orig = NULL;
+ }
+
+ return head;
+
+out_oversize:
+ net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
+ payload_len);
+ goto out_fail;
+out_oom:
+ net_dbg_ratelimited("nf_ct_frag6_reasm: no memory for reassembly\n");
+out_fail:
+ return NULL;
+}
+
+/*
+ * find the header just before Fragment Header.
+ *
+ * if success return 0 and set ...
+ * (*prevhdrp): the value of "Next Header Field" in the header
+ * just before Fragment Header.
+ * (*prevhoff): the offset of "Next Header Field" in the header
+ * just before Fragment Header.
+ * (*fhoff) : the offset of Fragment Header.
+ *
+ * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c
+ *
+ */
+static int
+find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
+{
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ const int netoff = skb_network_offset(skb);
+ u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
+ int start = netoff + sizeof(struct ipv6hdr);
+ int len = skb->len - start;
+ u8 prevhdr = NEXTHDR_IPV6;
+
+ while (nexthdr != NEXTHDR_FRAGMENT) {
+ struct ipv6_opt_hdr hdr;
+ int hdrlen;
+
+ if (!ipv6_ext_hdr(nexthdr)) {
+ return -1;
+ }
+ if (nexthdr == NEXTHDR_NONE) {
+ pr_debug("next header is none\n");
+ return -1;
+ }
+ if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+ pr_debug("too short\n");
+ return -1;
+ }
+ if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+ BUG();
+ if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hdr.hdrlen+2)<<2;
+ else
+ hdrlen = ipv6_optlen(&hdr);
+
+ prevhdr = nexthdr;
+ prev_nhoff = start;
+
+ nexthdr = hdr.nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
+
+ if (len < 0)
+ return -1;
+
+ *prevhdrp = prevhdr;
+ *prevhoff = prev_nhoff;
+ *fhoff = start;
+
+ return 0;
+}
+
+struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
+{
+ struct sk_buff *clone;
+ struct net_device *dev = skb->dev;
+ struct net *net = skb_dst(skb) ? dev_net(skb_dst(skb)->dev)
+ : dev_net(skb->dev);
+ struct frag_hdr *fhdr;
+ struct frag_queue *fq;
+ struct ipv6hdr *hdr;
+ int fhoff, nhoff;
+ u8 prevhdr;
+ struct sk_buff *ret_skb = NULL;
+
+ /* Jumbo payload inhibits frag. header */
+ if (ipv6_hdr(skb)->payload_len == 0) {
+ pr_debug("payload len = 0\n");
+ return skb;
+ }
+
+ if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
+ return skb;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone == NULL) {
+ pr_debug("Can't clone skb\n");
+ return skb;
+ }
+
+ NFCT_FRAG6_CB(clone)->orig = skb;
+
+ if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
+ pr_debug("message is too short.\n");
+ goto ret_orig;
+ }
+
+ skb_set_transport_header(clone, fhoff);
+ hdr = ipv6_hdr(clone);
+ fhdr = (struct frag_hdr *)skb_transport_header(clone);
+
+ fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+ ip6_frag_ecn(hdr));
+ if (fq == NULL) {
+ pr_debug("Can't find and can't create new queue\n");
+ goto ret_orig;
+ }
+
+ spin_lock_bh(&fq->q.lock);
+
+ if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+ spin_unlock_bh(&fq->q.lock);
+ pr_debug("Can't insert skb to queue\n");
+ inet_frag_put(&fq->q, &nf_frags);
+ goto ret_orig;
+ }
+
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ ret_skb = nf_ct_frag6_reasm(fq, dev);
+ if (ret_skb == NULL)
+ pr_debug("Can't reassemble fragmented packets\n");
+ }
+ spin_unlock_bh(&fq->q.lock);
+
+ inet_frag_put(&fq->q, &nf_frags);
+ return ret_skb;
+
+ret_orig:
+ kfree_skb(clone);
+ return skb;
+}
+
+void nf_ct_frag6_consume_orig(struct sk_buff *skb)
+{
+ struct sk_buff *s, *s2;
+
+ for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
+ s2 = s->next;
+ s->next = NULL;
+ consume_skb(s);
+ s = s2;
+ }
+}
+
+static int nf_ct_net_init(struct net *net)
+{
+ net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+ inet_frags_init_net(&net->nf_frag.frags);
+
+ return nf_ct_frag6_sysctl_register(net);
+}
+
+static void nf_ct_net_exit(struct net *net)
+{
+ nf_ct_frags6_sysctl_unregister(net);
+ inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+}
+
+static struct pernet_operations nf_ct_net_ops = {
+ .init = nf_ct_net_init,
+ .exit = nf_ct_net_exit,
+};
+
+int nf_ct_frag6_init(void)
+{
+ int ret = 0;
+
+ nf_frags.hashfn = nf_hashfn;
+ nf_frags.constructor = ip6_frag_init;
+ nf_frags.destructor = NULL;
+ nf_frags.skb_free = nf_skb_free;
+ nf_frags.qsize = sizeof(struct frag_queue);
+ nf_frags.match = ip6_frag_match;
+ nf_frags.frag_expire = nf_ct_frag6_expire;
+ nf_frags.frags_cache_name = nf_frags_cache_name;
+ ret = inet_frags_init(&nf_frags);
+ if (ret)
+ goto out;
+ ret = register_pernet_subsys(&nf_ct_net_ops);
+ if (ret)
+ inet_frags_fini(&nf_frags);
+
+out:
+ return ret;
+}
+
+void nf_ct_frag6_cleanup(void)
+{
+ unregister_pernet_subsys(&nf_ct_net_ops);
+ inet_frags_fini(&nf_frags);
+}
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 000000000..a45db0b47
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,138 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (skb->nfct)
+ zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge &&
+ skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP6_DEFRAG_CONNTRACK_IN + zone;
+ else
+ return IP6_DEFRAG_CONNTRACK_OUT + zone;
+
+}
+
+static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct sk_buff *reasm;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Previously seen (loopback)? */
+ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ return NF_ACCEPT;
+#endif
+
+ reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb));
+ /* queued */
+ if (reasm == NULL)
+ return NF_STOLEN;
+
+ /* error occurred or not fragmented */
+ if (reasm == skb)
+ return NF_ACCEPT;
+
+ nf_ct_frag6_consume_orig(reasm);
+
+ NF_HOOK_THRESH(NFPROTO_IPV6, ops->hooknum, state->sk, reasm,
+ state->in, state->out,
+ state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+
+ return NF_STOLEN;
+}
+
+static struct nf_hook_ops ipv6_defrag_ops[] = {
+ {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static int __init nf_defrag_init(void)
+{
+ int ret = 0;
+
+ ret = nf_ct_frag6_init();
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+ return ret;
+ }
+ ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't register hooks\n");
+ goto cleanup_frag6;
+ }
+ return ret;
+
+cleanup_frag6:
+ nf_ct_frag6_cleanup();
+ return ret;
+
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ nf_ct_frag6_cleanup();
+}
+
+void nf_defrag_ipv6_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
new file mode 100644
index 000000000..8dd869642
--- /dev/null
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -0,0 +1,429 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv6_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int ip6hoff,
+ int recurse)
+{
+ u_int8_t currenthdr;
+ int fragment;
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ unsigned int ptr;
+ unsigned int hdrlen = 0;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+ if (ih == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+ nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ ih->hop_limit,
+ (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+ fragment = 0;
+ ptr = ip6hoff + sizeof(struct ipv6hdr);
+ currenthdr = ih->nexthdr;
+ while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
+ struct ipv6_opt_hdr _hdr;
+ const struct ipv6_opt_hdr *hp;
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 48 "OPT (...) " */
+ if (logflags & XT_LOG_IPOPT)
+ nf_log_buf_add(m, "OPT ( ");
+
+ switch (currenthdr) {
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr _fhdr;
+ const struct frag_hdr *fh;
+
+ nf_log_buf_add(m, "FRAG:");
+ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+ &_fhdr);
+ if (fh == NULL) {
+ nf_log_buf_add(m, "TRUNCATED ");
+ return;
+ }
+
+ /* Max length: 6 "65535 " */
+ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+ /* Max length: 11 "INCOMPLETE " */
+ if (fh->frag_off & htons(0x0001))
+ nf_log_buf_add(m, "INCOMPLETE ");
+
+ nf_log_buf_add(m, "ID:%08x ",
+ ntohl(fh->identification));
+
+ if (ntohs(fh->frag_off) & 0xFFF8)
+ fragment = 1;
+
+ hdrlen = 8;
+
+ break;
+ }
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_HOPOPTS:
+ if (fragment) {
+ if (logflags & XT_LOG_IPOPT)
+ nf_log_buf_add(m, ")");
+ return;
+ }
+ hdrlen = ipv6_optlen(hp);
+ break;
+ /* Max Length */
+ case IPPROTO_AH:
+ if (logflags & XT_LOG_IPOPT) {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ /* Max length: 3 "AH " */
+ nf_log_buf_add(m, "AH ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+ &_ahdr);
+ if (ah == NULL) {
+ /*
+ * Max length: 26 "INCOMPLETE [65535
+ * bytes] )"
+ */
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+
+ }
+
+ hdrlen = (hp->hdrlen+2)<<2;
+ break;
+ case IPPROTO_ESP:
+ if (logflags & XT_LOG_IPOPT) {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 4 "ESP " */
+ nf_log_buf_add(m, "ESP ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ /*
+ * Max length: 26 "INCOMPLETE [65535 bytes] )"
+ */
+ eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+ &_esph);
+ if (eh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 16 "SPI=0xF1234567 )" */
+ nf_log_buf_add(m, "SPI=0x%x )",
+ ntohl(eh->spi));
+ }
+ return;
+ default:
+ /* Max length: 20 "Unknown Ext Hdr 255" */
+ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
+ return;
+ }
+ if (logflags & XT_LOG_IPOPT)
+ nf_log_buf_add(m, ") ");
+
+ currenthdr = hp->nexthdr;
+ ptr += hdrlen;
+ }
+
+ switch (currenthdr) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
+ ptr, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
+ return;
+ break;
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _icmp6h;
+ const struct icmp6hdr *ic;
+
+ /* Max length: 13 "PROTO=ICMPv6 " */
+ nf_log_buf_add(m, "PROTO=ICMPv6 ");
+
+ if (fragment)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+ if (ic == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ",
+ ic->icmp6_type, ic->icmp6_code);
+
+ switch (ic->icmp6_type) {
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ic->icmp6_identifier),
+ ntohs(ic->icmp6_sequence));
+ break;
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ break;
+
+ case ICMPV6_PARAMPROB:
+ /* Max length: 17 "POINTER=ffffffff " */
+ nf_log_buf_add(m, "POINTER=%08x ",
+ ntohl(ic->icmp6_pointer));
+ /* Fall through */
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_PKT_TOOBIG:
+ case ICMPV6_TIME_EXCEED:
+ /* Max length: 3+maxlen */
+ if (recurse) {
+ nf_log_buf_add(m, "[");
+ dump_ipv6_packet(m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohl(ic->icmp6_mtu));
+ }
+ }
+ break;
+ }
+ /* Max length: 10 "PROTO=255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", currenthdr);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && recurse)
+ nf_log_dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (recurse && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_ipv6_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int len = dev->hard_header_len;
+ unsigned int i;
+
+ if (dev->type == ARPHRD_SIT) {
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p != NULL) {
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < len; i++)
+ nf_log_buf_add(m, ":%02x", *p++);
+ }
+ nf_log_buf_add(m, " ");
+
+ if (dev->type == ARPHRD_SIT) {
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+ nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
+ &iph->daddr);
+ }
+ } else {
+ nf_log_buf_add(m, " ");
+ }
+}
+
+static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
+ loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv6_mac_header(m, loginfo, skb);
+
+ dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip6_logger __read_mostly = {
+ .name = "nf_log_ipv6",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip6_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv6_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_ipv6_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip6_logger);
+}
+
+static struct pernet_operations nf_log_ipv6_net_ops = {
+ .init = nf_log_ipv6_net_init,
+ .exit = nf_log_ipv6_net_exit,
+};
+
+static int __init nf_log_ipv6_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_ipv6_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+ if (ret < 0) {
+ pr_err("failed to register logger\n");
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ unregister_pernet_subsys(&nf_log_ipv6_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_ipv6_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_ipv6_net_ops);
+ nf_log_unregister(&nf_ip6_logger);
+}
+
+module_init(nf_log_ipv6_init);
+module_exit(nf_log_ipv6_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv6 packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
new file mode 100644
index 000000000..e76900e0a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of IPv6 NAT funded by Astaro.
+ */
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi6 *fl6 = &fl->u.ip6;
+
+ if (ct->status & statusbit) {
+ fl6->daddr = t->dst.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl6->saddr = t->src.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_sport = t->src.u.all;
+ }
+}
+#endif
+
+static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range *range)
+{
+ return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
+ ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
+}
+
+static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t,
+ __be16 dport)
+{
+ return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport);
+}
+
+static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct ipv6hdr *ipv6h;
+ __be16 frag_off;
+ int hdroff;
+ u8 nexthdr;
+
+ if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h)))
+ return false;
+
+ ipv6h = (void *)skb->data + iphdroff;
+ nexthdr = ipv6h->nexthdr;
+ hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h),
+ &nexthdr, &frag_off);
+ if (hdroff < 0)
+ goto manip_addr;
+
+ if ((frag_off & htons(~0x7)) == 0 &&
+ !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff,
+ target, maniptype))
+ return false;
+manip_addr:
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ipv6h->saddr = target->src.u3.in6;
+ else
+ ipv6h->daddr = target->dst.u3.in6;
+
+ return true;
+}
+
+static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff);
+ const struct in6_addr *oldip, *newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = &ipv6h->saddr;
+ newip = &t->src.u3.in6;
+ } else {
+ oldip = &ipv6h->daddr;
+ newip = &t->dst.u3.in6;
+ }
+ inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
+ newip->s6_addr32, 1);
+}
+
+static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt6i_flags & RTF_LOCAL) &&
+ (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ (data - (void *)skb->data);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto, 0);
+ } else {
+ *check = 0;
+ *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto,
+ csum_partial(data, datalen,
+ 0));
+ if (proto == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_NAT_V6_MINIP]) {
+ nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
+ sizeof(struct in6_addr));
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V6_MAXIP])
+ nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
+ sizeof(struct in6_addr));
+ else
+ range->max_addr = range->min_addr;
+
+ return 0;
+}
+#endif
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
+ .l3proto = NFPROTO_IPV6,
+ .secure_port = nf_nat_ipv6_secure_port,
+ .in_range = nf_nat_ipv6_in_range,
+ .manip_pkt = nf_nat_ipv6_manip_pkt,
+ .csum_update = nf_nat_ipv6_csum_update,
+ .csum_recalc = nf_nat_ipv6_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_ipv6_nlattr_to_range,
+#endif
+#ifdef CONFIG_XFRM
+ .decode_session = nf_nat_ipv6_decode_session,
+#endif
+};
+
+int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ unsigned int hdrlen)
+{
+ struct {
+ struct icmp6hdr icmp6;
+ struct ipv6hdr ip6;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+ if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp6.icmp6_type == NDISC_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr);
+ if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
+ l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp6.icmp6_cksum = 0;
+ inside->icmp6.icmp6_cksum =
+ csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ skb->len - hdrlen, IPPROTO_ICMPV6,
+ csum_partial(&inside->icmp6,
+ skb->len - hdrlen, 0));
+ }
+
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6);
+ if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
+
+unsigned int
+nf_nat_ipv6_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+ __be16 frag_off;
+ int hdrlen;
+ u8 nexthdr;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibilty to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+ ops->hooknum,
+ hdrlen))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = do_chain(ops, skb, state, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
+ break;
+
+ ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else {
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ goto oif_changed;
+ }
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, state->out))
+ goto oif_changed;
+ }
+
+ return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+
+oif_changed:
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
+
+unsigned int
+nf_nat_ipv6_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ unsigned int ret;
+ struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+ skb_dst_drop(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_in);
+
+unsigned int
+nf_nat_ipv6_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct ipv6hdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(skb, AF_INET6);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_out);
+
+unsigned int
+nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ unsigned int (*do_chain)(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct ipv6hdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv6_fn(ops, skb, state, do_chain);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+ &ct->tuplehash[!dir].tuple.src.u3)) {
+ err = ip6_route_me_harder(skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(skb, AF_INET6);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn);
+
+static int __init nf_nat_l3proto_ipv6_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6);
+ if (err < 0)
+ goto err2;
+ return err;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_l3proto_ipv6_exit(void)
+{
+ nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6);
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET6));
+
+module_init(nf_nat_l3proto_ipv6_init);
+module_exit(nf_nat_l3proto_ipv6_exit);
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
new file mode 100644
index 000000000..774560966
--- /dev/null
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+
+unsigned int
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+ const struct net_device *out)
+{
+ enum ip_conntrack_info ctinfo;
+ struct in6_addr src;
+ struct nf_conn *ct;
+ struct nf_nat_range newrange;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ if (ipv6_dev_get_saddr(dev_net(out), out,
+ &ipv6_hdr(skb)->daddr, 0, &src) < 0)
+ return NF_DROP;
+
+ nfct_nat(ct)->masq_index = out->ifindex;
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.in6 = src;
+ newrange.max_addr.in6 = src;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
+
+static int device_cmp(struct nf_conn *ct, void *ifindex)
+{
+ const struct nf_conn_nat *nat = nfct_nat(ct);
+
+ if (!nat)
+ return 0;
+ if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+ return 0;
+ return nat->masq_index == (int)(long)ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN)
+ nf_ct_iterate_cleanup(net, device_cmp,
+ (void *)(long)dev->ifindex, 0, 0);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = ptr;
+ struct netdev_notifier_info info;
+
+ netdev_notifier_info_init(&info, ifa->idev->dev);
+ return masq_device_event(this, event, &info);
+}
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
+
+void nf_nat_masquerade_ipv6_register_notifier(void)
+{
+ /* check if the notifier is already set */
+ if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
+ return;
+
+ register_netdevice_notifier(&masq_dev_notifier);
+ register_inet6addr_notifier(&masq_inet_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier);
+
+void nf_nat_masquerade_ipv6_unregister_notifier(void)
+{
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
+ return;
+
+ unregister_inet6addr_notifier(&masq_inet_notifier);
+ unregister_netdevice_notifier(&masq_dev_notifier);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
new file mode 100644
index 000000000..2205e8eee
--- /dev/null
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/icmpv6.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool
+icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
+ ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
+}
+
+static void
+icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u16 id;
+ unsigned int range_size;
+ unsigned int i;
+
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xffff;
+
+ for (i = 0; ; ++id) {
+ tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
+ (id % range_size));
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+}
+
+static bool
+icmpv6_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmp6hdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmp6hdr *)(skb->data + hdroff);
+ l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
+ tuple, maniptype);
+ if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST ||
+ hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
+ inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
+ hdr->icmp6_identifier,
+ tuple->src.u.icmp.id, 0);
+ hdr->icmp6_identifier = tuple->src.u.icmp.id;
+ }
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = {
+ .l4proto = IPPROTO_ICMPV6,
+ .manip_pkt = icmpv6_manip_pkt,
+ .in_range = icmpv6_in_range,
+ .unique_tuple = icmpv6_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
new file mode 100644
index 000000000..94b4c6dfb
--- /dev/null
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -0,0 +1,248 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_checksum.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook)
+{
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+ u8 proto;
+ __be16 frag_off;
+ int tcphoff;
+
+ proto = oip6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data),
+ &proto, &frag_off);
+
+ if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+ pr_debug("Cannot get TCP header.\n");
+ return NULL;
+ }
+
+ *otcplen = oldskb->len - tcphoff;
+
+ /* IP header checks: fragment, too short. */
+ if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) {
+ pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n",
+ proto, *otcplen);
+ return NULL;
+ }
+
+ otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr),
+ otcph);
+ if (otcph == NULL)
+ return NULL;
+
+ /* No RST for RST. */
+ if (otcph->rst) {
+ pr_debug("RST is set\n");
+ return NULL;
+ }
+
+ /* Check checksum. */
+ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
+ pr_debug("TCP checksum is invalid\n");
+ return NULL;
+ }
+
+ return otcph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);
+
+struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit)
+{
+ struct ipv6hdr *ip6h;
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+#define DEFAULT_TOS_VALUE 0x0U
+ const __u8 tclass = DEFAULT_TOS_VALUE;
+
+ skb_put(nskb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(nskb);
+ ip6h = ipv6_hdr(nskb);
+ ip6_flow_hdr(ip6h, tclass, 0);
+ ip6h->hop_limit = hoplimit;
+ ip6h->nexthdr = protocol;
+ ip6h->saddr = oip6h->daddr;
+ ip6h->daddr = oip6h->saddr;
+
+ nskb->protocol = htons(ETH_P_IPV6);
+
+ return ip6h;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);
+
+void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen)
+{
+ struct tcphdr *tcph;
+ int needs_ack;
+
+ skb_reset_transport_header(nskb);
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ /* Truncate to length (no data) */
+ tcph->doff = sizeof(struct tcphdr)/4;
+ tcph->source = oth->dest;
+ tcph->dest = oth->source;
+
+ if (oth->ack) {
+ needs_ack = 0;
+ tcph->seq = oth->ack_seq;
+ tcph->ack_seq = 0;
+ } else {
+ needs_ack = 1;
+ tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+ otcplen - (oth->doff<<2));
+ tcph->seq = 0;
+ }
+
+ /* Reset flags */
+ ((u_int8_t *)tcph)[13] = 0;
+ tcph->rst = 1;
+ tcph->ack = needs_ack;
+ tcph->window = 0;
+ tcph->urg_ptr = 0;
+ tcph->check = 0;
+
+ /* Adjust TCP checksum */
+ tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr,
+ &ipv6_hdr(nskb)->daddr,
+ sizeof(struct tcphdr), IPPROTO_TCP,
+ csum_partial(tcph,
+ sizeof(struct tcphdr), 0));
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
+
+void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+{
+ struct sk_buff *nskb;
+ struct tcphdr _otcph;
+ const struct tcphdr *otcph;
+ unsigned int otcplen, hh_len;
+ const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+ struct ipv6hdr *ip6h;
+ struct dst_entry *dst = NULL;
+ struct flowi6 fl6;
+
+ if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+ (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+ pr_debug("addr is not unicast.\n");
+ return;
+ }
+
+ otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook);
+ if (!otcph)
+ return;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = oip6h->daddr;
+ fl6.daddr = oip6h->saddr;
+ fl6.fl6_sport = otcph->dest;
+ fl6.fl6_dport = otcph->source;
+ security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst == NULL || dst->error) {
+ dst_release(dst);
+ return;
+ }
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ return;
+
+ hh_len = (dst->dev->hard_header_len + 15)&~15;
+ nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+ + sizeof(struct tcphdr) + dst->trailer_len,
+ GFP_ATOMIC);
+
+ if (!nskb) {
+ net_dbg_ratelimited("cannot alloc skb\n");
+ dst_release(dst);
+ return;
+ }
+
+ skb_dst_set(nskb, dst);
+
+ skb_reserve(nskb, hh_len + dst->header_len);
+ ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip6_dst_hoplimit(dst));
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
+
+ nf_ct_attach(nskb, oldskb);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip6_local_out for bridged traffic, the MAC source on
+ * the RST will be ours, instead of the destination's. This confuses
+ * some routers/firewalls, and they drop the packet. So we need to
+ * build the eth header using the original destination's MAC as the
+ * source, and send the RST packet directly.
+ */
+ if (oldskb->nf_bridge) {
+ struct ethhdr *oeth = eth_hdr(oldskb);
+
+ nskb->dev = nf_bridge_get_physindev(oldskb);
+ nskb->protocol = htons(ETH_P_IPV6);
+ ip6h->payload_len = htons(sizeof(struct tcphdr));
+ if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
+ oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ return;
+ dev_queue_xmit(nskb);
+ } else
+#endif
+ ip6_local_out(nskb);
+}
+EXPORT_SYMBOL_GPL(nf_send_reset6);
+
+static bool reject6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto;
+
+ if (skb->csum_bad)
+ return false;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ proto = ip6h->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
+ unsigned char code, unsigned int hooknum)
+{
+ if (!reject6_csum_ok(skb_in, hooknum))
+ return;
+
+ if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
+ skb_in->dev = net->loopback_dev;
+
+ icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach6);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
new file mode 100644
index 000000000..c8148ba76
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+static unsigned int nft_do_chain_ipv6(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ /* malformed packet, drop it */
+ if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0)
+ return NF_DROP;
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
+ if (net_ratelimit())
+ pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
+ "packet\n");
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain_ipv6(ops, skb, state);
+}
+
+struct nft_af_info nft_af_ipv6 __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 1,
+ .hooks = {
+ [NF_INET_LOCAL_IN] = nft_do_chain_ipv6,
+ [NF_INET_LOCAL_OUT] = nft_ipv6_output,
+ [NF_INET_FORWARD] = nft_do_chain_ipv6,
+ [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6,
+ [NF_INET_POST_ROUTING] = nft_do_chain_ipv6,
+ },
+};
+EXPORT_SYMBOL_GPL(nft_af_ipv6);
+
+static int nf_tables_ipv6_init_net(struct net *net)
+{
+ net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.ipv6 == NULL)
+ return -ENOMEM;
+
+ memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6));
+
+ if (nft_register_afinfo(net, net->nft.ipv6) < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(net->nft.ipv6);
+ return -ENOMEM;
+}
+
+static void nf_tables_ipv6_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.ipv6);
+ kfree(net->nft.ipv6);
+}
+
+static struct pernet_operations nf_tables_ipv6_net_ops = {
+ .init = nf_tables_ipv6_init_net,
+ .exit = nf_tables_ipv6_exit_net,
+};
+
+static const struct nf_chain_type filter_ipv6 = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_ipv6_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_ipv6);
+ ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_ipv6);
+
+ return ret;
+}
+
+static void __exit nf_tables_ipv6_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_ipv6_net_ops);
+ nft_unregister_chain_type(&filter_ipv6);
+}
+
+module_init(nf_tables_ipv6_init);
+module_exit(nf_tables_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(AF_INET6);
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
new file mode 100644
index 000000000..951bb458b
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ipv6.h>
+
+static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct nf_conn *ct)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo_ipv6(&pkt, ops, skb, state);
+
+ return nft_do_chain(&pkt, ops);
+}
+
+static unsigned int nft_nat_ipv6_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv6_in(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_in(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv6_out(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_out(ops, skb, state, nft_nat_do_chain);
+}
+
+static unsigned int nft_nat_ipv6_local_fn(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_nat_ipv6_local_fn(ops, skb, state, nft_nat_do_chain);
+}
+
+static const struct nf_chain_type nft_chain_nat_ipv6 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_ipv6_in,
+ [NF_INET_POST_ROUTING] = nft_nat_ipv6_out,
+ [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn,
+ [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn,
+ },
+};
+
+static int __init nft_chain_nat_ipv6_init(void)
+{
+ int err;
+
+ err = nft_register_chain_type(&nft_chain_nat_ipv6);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void __exit nft_chain_nat_ipv6_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_nat_ipv6);
+}
+
+module_init(nft_chain_nat_ipv6_init);
+module_exit(nft_chain_nat_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
new file mode 100644
index 000000000..0dafdaac5
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/route.h>
+
+static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ unsigned int ret;
+ struct nft_pktinfo pkt;
+ struct in6_addr saddr, daddr;
+ u_int8_t hop_limit;
+ u32 mark, flowlabel;
+
+ /* malformed packet, drop it */
+ if (nft_set_pktinfo_ipv6(&pkt, ops, skb, state) < 0)
+ return NF_DROP;
+
+ /* save source/dest address, mark, hoplimit, flowlabel, priority */
+ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+ memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+ mark = skb->mark;
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+
+ /* flowlabel and prio (includes version, which shouldn't change either */
+ flowlabel = *((u32 *)ipv6_hdr(skb));
+
+ ret = nft_do_chain(&pkt, ops);
+ if (ret != NF_DROP && ret != NF_QUEUE &&
+ (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+ memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+ skb->mark != mark ||
+ ipv6_hdr(skb)->hop_limit != hop_limit ||
+ flowlabel != *((u_int32_t *)ipv6_hdr(skb))))
+ return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP;
+
+ return ret;
+}
+
+static const struct nf_chain_type nft_chain_route_ipv6 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook,
+ },
+};
+
+static int __init nft_chain_route_init(void)
+{
+ return nft_register_chain_type(&nft_chain_route_ipv6);
+}
+
+static void __exit nft_chain_route_exit(void)
+{
+ nft_unregister_chain_type(&nft_chain_route_ipv6);
+}
+
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
new file mode 100644
index 000000000..cd1ac1637
--- /dev/null
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nft_masq.h>
+#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+
+static void nft_masq_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_masq *priv = nft_expr_priv(expr);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
+
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
+}
+
+static struct nft_expr_type nft_masq_ipv6_type;
+static const struct nft_expr_ops nft_masq_ipv6_ops = {
+ .type = &nft_masq_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_ipv6_eval,
+ .init = nft_masq_init,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+};
+
+static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "masq",
+ .ops = &nft_masq_ipv6_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_ipv6_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_masq_ipv6_type);
+ if (ret < 0)
+ return ret;
+
+ nf_nat_masquerade_ipv6_register_notifier();
+
+ return ret;
+}
+
+static void __exit nft_masq_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_masq_ipv6_type);
+ nf_nat_masquerade_ipv6_unregister_notifier();
+}
+
+module_init(nft_masq_ipv6_module_init);
+module_exit(nft_masq_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
new file mode 100644
index 000000000..effd393bd
--- /dev/null
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nft_redir.h>
+#include <net/netfilter/nf_nat_redirect.h>
+
+static void nft_redir_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min],
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max],
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ range.flags |= priv->flags;
+
+ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+ pkt->ops->hooknum);
+}
+
+static struct nft_expr_type nft_redir_ipv6_type;
+static const struct nft_expr_ops nft_redir_ipv6_ops = {
+ .type = &nft_redir_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_ipv6_eval,
+ .init = nft_redir_init,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+};
+
+static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "redir",
+ .ops = &nft_redir_ipv6_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_ipv6_module_init(void)
+{
+ return nft_register_expr(&nft_redir_ipv6_type);
+}
+
+static void __exit nft_redir_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_redir_ipv6_type);
+}
+
+module_init(nft_redir_ipv6_module_init);
+module_exit(nft_redir_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
new file mode 100644
index 000000000..d0d1540ec
--- /dev/null
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_ipv6_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach6(net, pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+ break;
+ default:
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static struct nft_expr_type nft_reject_ipv6_type;
+static const struct nft_expr_ops nft_reject_ipv6_ops = {
+ .type = &nft_reject_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_ipv6_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+};
+
+static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "reject",
+ .ops = &nft_reject_ipv6_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_ipv6_module_init(void)
+{
+ return nft_register_expr(&nft_reject_ipv6_type);
+}
+
+static void __exit nft_reject_ipv6_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_ipv6_type);
+}
+
+module_init(nft_reject_ipv6_module_init);
+module_exit(nft_reject_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject");
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
new file mode 100644
index 000000000..85892af57
--- /dev/null
+++ b/net/ipv6/output_core.c
@@ -0,0 +1,175 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static. These functions are needed by GSO/GRO implementation.
+ */
+#include <linux/export.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/addrconf.h>
+#include <net/secure_seq.h>
+
+static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
+ struct in6_addr *dst, struct in6_addr *src)
+{
+ u32 hash, id;
+
+ hash = __ipv6_addr_jhash(dst, hashrnd);
+ hash = __ipv6_addr_jhash(src, hash);
+ hash ^= net_hash_mix(net);
+
+ /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
+ * set the hight order instead thus minimizing possible future
+ * collisions.
+ */
+ id = ip_idents_reserve(hash, 1);
+ if (unlikely(!id))
+ id = 1 << 31;
+
+ return id;
+}
+
+/* This function exists only for tap drivers that must support broken
+ * clients requesting UFO without specifying an IPv6 fragment ID.
+ *
+ * This is similar to ipv6_select_ident() but we use an independent hash
+ * seed to limit information leakage.
+ *
+ * The network header must be set before calling this.
+ */
+void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
+{
+ static u32 ip6_proxy_idents_hashrnd __read_mostly;
+ struct in6_addr buf[2];
+ struct in6_addr *addrs;
+ u32 id;
+
+ addrs = skb_header_pointer(skb,
+ skb_network_offset(skb) +
+ offsetof(struct ipv6hdr, saddr),
+ sizeof(buf), buf);
+ if (!addrs)
+ return;
+
+ net_get_random_once(&ip6_proxy_idents_hashrnd,
+ sizeof(ip6_proxy_idents_hashrnd));
+
+ id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
+ &addrs[1], &addrs[0]);
+ skb_shinfo(skb)->ip6_frag_id = htonl(id);
+}
+EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
+
+void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
+ struct rt6_info *rt)
+{
+ static u32 ip6_idents_hashrnd __read_mostly;
+ u32 id;
+
+ net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
+
+ id = __ipv6_select_ident(net, ip6_idents_hashrnd, &rt->rt6i_dst.addr,
+ &rt->rt6i_src.addr);
+ fhdr->identification = htonl(id);
+}
+EXPORT_SYMBOL(ipv6_select_ident);
+
+int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
+{
+ u16 offset = sizeof(struct ipv6hdr);
+ struct ipv6_opt_hdr *exthdr =
+ (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+ unsigned int packet_len = skb_tail_pointer(skb) -
+ skb_network_header(skb);
+ int found_rhdr = 0;
+ *nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+ while (offset + 1 <= packet_len) {
+
+ switch (**nexthdr) {
+
+ case NEXTHDR_HOP:
+ break;
+ case NEXTHDR_ROUTING:
+ found_rhdr = 1;
+ break;
+ case NEXTHDR_DEST:
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+ break;
+#endif
+ if (found_rhdr)
+ return offset;
+ break;
+ default:
+ return offset;
+ }
+
+ offset += ipv6_optlen(exthdr);
+ *nexthdr = &exthdr->nexthdr;
+ exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
+ offset);
+ }
+
+ return offset;
+}
+EXPORT_SYMBOL(ip6_find_1stfragopt);
+
+#if IS_ENABLED(CONFIG_IPV6)
+int ip6_dst_hoplimit(struct dst_entry *dst)
+{
+ int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+ if (hoplimit == 0) {
+ struct net_device *dev = dst->dev;
+ struct inet6_dev *idev;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev)
+ hoplimit = idev->cnf.hop_limit;
+ else
+ hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
+ rcu_read_unlock();
+ }
+ return hoplimit;
+}
+EXPORT_SYMBOL(ip6_dst_hoplimit);
+#endif
+
+static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ int len;
+
+ len = skb->len - sizeof(struct ipv6hdr);
+ if (len > IPV6_MAXPLEN)
+ len = 0;
+ ipv6_hdr(skb)->payload_len = htons(len);
+ IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, skb_dst(skb)->dev, dst_output_sk);
+}
+
+int __ip6_local_out(struct sk_buff *skb)
+{
+ return __ip6_local_out_sk(skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(__ip6_local_out);
+
+int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = __ip6_local_out_sk(sk, skb);
+ if (likely(err == 1))
+ err = dst_output_sk(sk, skb);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip6_local_out_sk);
+
+int ip6_local_out(struct sk_buff *skb)
+{
+ return ip6_local_out_sk(skb->sk, skb);
+}
+EXPORT_SYMBOL_GPL(ip6_local_out);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
new file mode 100644
index 000000000..263a5164a
--- /dev/null
+++ b/net/ipv6/ping.c
@@ -0,0 +1,275 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * "Ping" sockets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/ping.c code.
+ *
+ * Authors: Lorenzo Colitti (IPv6 support)
+ * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6),
+ * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32)
+ *
+ */
+
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <net/ping.h>
+
+struct proto pingv6_prot = {
+ .name = "PINGv6",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .connect = ip6_datagram_connect_v6_only,
+ .disconnect = udp_disconnect,
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .sendmsg = ping_v6_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .hash = ping_hash,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .obj_size = sizeof(struct raw6_sock),
+};
+EXPORT_SYMBOL_GPL(pingv6_prot);
+
+static struct inet_protosw pingv6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMPV6,
+ .prot = &pingv6_prot,
+ .ops = &inet6_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
+
+/* Compatibility glue so we can support IPv6 when it's compiled as a module */
+static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
+ int *addr_len)
+{
+ return -EAFNOSUPPORT;
+}
+static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
+ struct sk_buff *skb)
+{
+}
+static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err)
+{
+ return -EAFNOSUPPORT;
+}
+static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload) {}
+static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, int strict)
+{
+ return 0;
+}
+
+int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct icmp6hdr user_icmph;
+ int addr_type;
+ struct in6_addr *daddr;
+ int iif = 0;
+ struct flowi6 fl6;
+ int err;
+ int hlimit;
+ struct dst_entry *dst;
+ struct rt6_info *rt;
+ struct pingfakehdr pfh;
+
+ pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+ err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
+ sizeof(user_icmph));
+ if (err)
+ return err;
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name);
+ if (msg->msg_namelen < sizeof(*u))
+ return -EINVAL;
+ if (u->sin6_family != AF_INET6) {
+ return -EAFNOSUPPORT;
+ }
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != u->sin6_scope_id) {
+ return -EINVAL;
+ }
+ daddr = &(u->sin6_addr);
+ iif = u->sin6_scope_id;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = &sk->sk_v6_daddr;
+ }
+
+ if (!iif)
+ iif = sk->sk_bound_dev_if;
+
+ addr_type = ipv6_addr_type(daddr);
+ if (__ipv6_addr_needs_scope_id(addr_type) && !iif)
+ return -EINVAL;
+ if (addr_type & IPV6_ADDR_MAPPED)
+ return -EINVAL;
+
+ /* TODO: use ip6_datagram_send_ctl to get options from cmsg */
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ fl6.flowi6_proto = IPPROTO_ICMPV6;
+ fl6.saddr = np->saddr;
+ fl6.daddr = *daddr;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_icmp_type = user_icmph.icmp6_type;
+ fl6.fl6_icmp_code = user_icmph.icmp6_code;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+ rt = (struct rt6_info *) dst;
+
+ np = inet6_sk(sk);
+ if (!np)
+ return -EBADF;
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ pfh.icmph.type = user_icmph.icmp6_type;
+ pfh.icmph.code = user_icmph.icmp6_code;
+ pfh.icmph.checksum = 0;
+ pfh.icmph.un.echo.id = inet->inet_sport;
+ pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
+ pfh.msg = msg;
+ pfh.wcheck = 0;
+ pfh.family = AF_INET6;
+
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ lock_sock(sk);
+ err = ip6_append_data(sk, ping_getfrag, &pfh, len,
+ 0, hlimit,
+ np->tclass, NULL, &fl6, rt,
+ MSG_DONTWAIT, np->dontfrag);
+
+ if (err) {
+ ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
+ ICMP6_MIB_OUTERRORS);
+ ip6_flush_pending_frames(sk);
+ } else {
+ err = icmpv6_push_pending_frames(sk, &fl6,
+ (struct icmp6hdr *) &pfh.icmph,
+ len);
+ }
+ release_sock(sk);
+
+ if (err)
+ return err;
+
+ return len;
+}
+
+#ifdef CONFIG_PROC_FS
+static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return ping_seq_start(seq, pos, AF_INET6);
+}
+
+static int ping_v6_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ int bucket = ((struct ping_iter_state *) seq->private)->bucket;
+ struct inet_sock *inet = inet_sk(v);
+ __u16 srcp = ntohs(inet->inet_sport);
+ __u16 destp = ntohs(inet->inet_dport);
+ ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
+ }
+ return 0;
+}
+
+static struct ping_seq_afinfo ping_v6_seq_afinfo = {
+ .name = "icmp6",
+ .family = AF_INET6,
+ .seq_fops = &ping_seq_fops,
+ .seq_ops = {
+ .start = ping_v6_seq_start,
+ .show = ping_v6_seq_show,
+ .next = ping_seq_next,
+ .stop = ping_seq_stop,
+ },
+};
+
+static int __net_init ping_v6_proc_init_net(struct net *net)
+{
+ return ping_proc_register(net, &ping_v6_seq_afinfo);
+}
+
+static void __net_init ping_v6_proc_exit_net(struct net *net)
+{
+ return ping_proc_unregister(net, &ping_v6_seq_afinfo);
+}
+
+static struct pernet_operations ping_v6_net_ops = {
+ .init = ping_v6_proc_init_net,
+ .exit = ping_v6_proc_exit_net,
+};
+#endif
+
+int __init pingv6_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ int ret = register_pernet_subsys(&ping_v6_net_ops);
+ if (ret)
+ return ret;
+#endif
+ pingv6_ops.ipv6_recv_error = ipv6_recv_error;
+ pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl;
+ pingv6_ops.ip6_datagram_recv_specific_ctl =
+ ip6_datagram_recv_specific_ctl;
+ pingv6_ops.icmpv6_err_convert = icmpv6_err_convert;
+ pingv6_ops.ipv6_icmp_error = ipv6_icmp_error;
+ pingv6_ops.ipv6_chk_addr = ipv6_chk_addr;
+ return inet6_register_protosw(&pingv6_protosw);
+}
+
+/* This never gets called because it's not possible to unload the ipv6 module,
+ * but just in case.
+ */
+void pingv6_exit(void)
+{
+ pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error;
+ pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl;
+ pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl;
+ pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert;
+ pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error;
+ pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr;
+#ifdef CONFIG_PROC_FS
+ unregister_pernet_subsys(&ping_v6_net_ops);
+#endif
+ inet6_unregister_protosw(&pingv6_protosw);
+}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
new file mode 100644
index 000000000..679253d0a
--- /dev/null
+++ b/net/ipv6/proc.c
@@ -0,0 +1,346 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * This file implements the various access functions for the
+ * PROC file system. This is very similar to the IPv4 version,
+ * except it reports the sockets in the INET6 address family.
+ *
+ * Authors: David S. Miller (davem@caip.rutgers.edu)
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+
+static int sockstat6_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq->private;
+ unsigned int frag_mem = ip6_frag_mem(net);
+
+ seq_printf(seq, "TCP6: inuse %d\n",
+ sock_prot_inuse_get(net, &tcpv6_prot));
+ seq_printf(seq, "UDP6: inuse %d\n",
+ sock_prot_inuse_get(net, &udpv6_prot));
+ seq_printf(seq, "UDPLITE6: inuse %d\n",
+ sock_prot_inuse_get(net, &udplitev6_prot));
+ seq_printf(seq, "RAW6: inuse %d\n",
+ sock_prot_inuse_get(net, &rawv6_prot));
+ seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+ return 0;
+}
+
+static int sockstat6_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, sockstat6_seq_show);
+}
+
+static const struct file_operations sockstat6_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = sockstat6_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static const struct snmp_mib snmp6_ipstats_list[] = {
+/* ipv6 mib according to RFC 2465 */
+ SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS),
+ SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+ SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS),
+ SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES),
+ SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+ SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+ SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+ SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS),
+ SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS),
+ SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+ SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS),
+ SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+ SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+ SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+ SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS),
+ SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS),
+ SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS),
+ SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS),
+ SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS),
+ SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES),
+ SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+ SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+ SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS),
+ SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS),
+ SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+ SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+ /* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */
+ SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+ SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+ SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+ SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp6_icmp6_list[] = {
+/* icmpv6 mib according to RFC 2466 */
+ SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
+ SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
+ SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
+ SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
+ SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
+ SNMP_MIB_SENTINEL
+};
+
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+static const char *const icmp6type2name[256] = {
+ [ICMPV6_DEST_UNREACH] = "DestUnreachs",
+ [ICMPV6_PKT_TOOBIG] = "PktTooBigs",
+ [ICMPV6_TIME_EXCEED] = "TimeExcds",
+ [ICMPV6_PARAMPROB] = "ParmProblems",
+ [ICMPV6_ECHO_REQUEST] = "Echos",
+ [ICMPV6_ECHO_REPLY] = "EchoReplies",
+ [ICMPV6_MGM_QUERY] = "GroupMembQueries",
+ [ICMPV6_MGM_REPORT] = "GroupMembResponses",
+ [ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
+ [ICMPV6_MLD2_REPORT] = "MLDv2Reports",
+ [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
+ [NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
+ [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
+ [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
+ [NDISC_REDIRECT] = "Redirects",
+};
+
+
+static const struct snmp_mib snmp6_udp6_list[] = {
+ SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
+ SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
+ SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
+ SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+ SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
+ SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp6_udplite6_list[] = {
+ SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS),
+ SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
+ SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
+ SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+ SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+ SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+ SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
+ SNMP_MIB_SENTINEL
+};
+
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
+{
+ char name[32];
+ int i;
+
+ /* print by name -- deprecated items */
+ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ int icmptype;
+ const char *p;
+
+ icmptype = i & 0xff;
+ p = icmp6type2name[icmptype];
+ if (!p) /* don't print un-named types here */
+ continue;
+ snprintf(name, sizeof(name), "Icmp6%s%s",
+ i & 0x100 ? "Out" : "In", p);
+ seq_printf(seq, "%-32s\t%lu\n", name,
+ atomic_long_read(smib + i));
+ }
+
+ /* print by number (nonzero only) - ICMPMsgStat format */
+ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ unsigned long val;
+
+ val = atomic_long_read(smib + i);
+ if (!val)
+ continue;
+ snprintf(name, sizeof(name), "Icmp6%sType%u",
+ i & 0x100 ? "Out" : "In", i & 0xff);
+ seq_printf(seq, "%-32s\t%lu\n", name, val);
+ }
+}
+
+/* can be called either with percpu mib (pcpumib != NULL),
+ * or shared one (smib != NULL)
+ */
+static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
+ atomic_long_t *smib,
+ const struct snmp_mib *itemlist)
+{
+ int i;
+ unsigned long val;
+
+ for (i = 0; itemlist[i].name; i++) {
+ val = pcpumib ?
+ snmp_fold_field(pcpumib, itemlist[i].entry) :
+ atomic_long_read(smib + itemlist[i].entry);
+ seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val);
+ }
+}
+
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
+ const struct snmp_mib *itemlist, size_t syncpoff)
+{
+ int i;
+
+ for (i = 0; itemlist[i].name; i++)
+ seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
+ snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+}
+
+static int snmp6_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = (struct net *)seq->private;
+
+ snmp6_seq_show_item64(seq, net->mib.ipv6_statistics,
+ snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_seq_show_item(seq, net->mib.icmpv6_statistics,
+ NULL, snmp6_icmp6_list);
+ snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
+ snmp6_seq_show_item(seq, net->mib.udp_stats_in6,
+ NULL, snmp6_udp6_list);
+ snmp6_seq_show_item(seq, net->mib.udplite_stats_in6,
+ NULL, snmp6_udplite6_list);
+ return 0;
+}
+
+static int snmp6_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, snmp6_seq_show);
+}
+
+static const struct file_operations snmp6_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = snmp6_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
+{
+ struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+
+ seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+ snmp6_seq_show_item64(seq, idev->stats.ipv6,
+ snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
+ snmp6_icmp6_list);
+ snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
+ return 0;
+}
+
+static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, snmp6_dev_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations snmp6_dev_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = snmp6_dev_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int snmp6_register_dev(struct inet6_dev *idev)
+{
+ struct proc_dir_entry *p;
+ struct net *net;
+
+ if (!idev || !idev->dev)
+ return -EINVAL;
+
+ net = dev_net(idev->dev);
+ if (!net->mib.proc_net_devsnmp6)
+ return -ENOENT;
+
+ p = proc_create_data(idev->dev->name, S_IRUGO,
+ net->mib.proc_net_devsnmp6,
+ &snmp6_dev_seq_fops, idev);
+ if (!p)
+ return -ENOMEM;
+
+ idev->stats.proc_dir_entry = p;
+ return 0;
+}
+
+int snmp6_unregister_dev(struct inet6_dev *idev)
+{
+ struct net *net = dev_net(idev->dev);
+ if (!net->mib.proc_net_devsnmp6)
+ return -ENOENT;
+ if (!idev->stats.proc_dir_entry)
+ return -EINVAL;
+ proc_remove(idev->stats.proc_dir_entry);
+ idev->stats.proc_dir_entry = NULL;
+ return 0;
+}
+
+static int __net_init ipv6_proc_init_net(struct net *net)
+{
+ if (!proc_create("sockstat6", S_IRUGO, net->proc_net,
+ &sockstat6_seq_fops))
+ return -ENOMEM;
+
+ if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops))
+ goto proc_snmp6_fail;
+
+ net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+ if (!net->mib.proc_net_devsnmp6)
+ goto proc_dev_snmp6_fail;
+ return 0;
+
+proc_dev_snmp6_fail:
+ remove_proc_entry("snmp6", net->proc_net);
+proc_snmp6_fail:
+ remove_proc_entry("sockstat6", net->proc_net);
+ return -ENOMEM;
+}
+
+static void __net_exit ipv6_proc_exit_net(struct net *net)
+{
+ remove_proc_entry("sockstat6", net->proc_net);
+ remove_proc_entry("dev_snmp6", net->proc_net);
+ remove_proc_entry("snmp6", net->proc_net);
+}
+
+static struct pernet_operations ipv6_proc_ops = {
+ .init = ipv6_proc_init_net,
+ .exit = ipv6_proc_exit_net,
+};
+
+int __init ipv6_misc_proc_init(void)
+{
+ return register_pernet_subsys(&ipv6_proc_ops);
+}
+
+void ipv6_misc_proc_exit(void)
+{
+ unregister_pernet_subsys(&ipv6_proc_ops);
+}
+
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
new file mode 100644
index 000000000..e3770abe6
--- /dev/null
+++ b/net/ipv6/protocol.c
@@ -0,0 +1,74 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET6 protocol dispatch tables.
+ *
+ * Authors: Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *
+ * Vince Laviano (vince@cs.stanford.edu) 16 May 2001
+ * - Removed unused variable 'inet6_protocol_base'
+ * - Modified inet6_del_protocol() to correctly maintain copy bit.
+ */
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet6_protos);
+
+int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_protocol);
+
+int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet6_del_protocol);
+#endif
+
+const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet6_offloads);
+
+int inet6_add_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ return !cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+ NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_offload);
+
+int inet6_del_offload(const struct net_offload *prot, unsigned char protocol)
+{
+ int ret;
+
+ ret = (cmpxchg((const struct net_offload **)&inet6_offloads[protocol],
+ prot, NULL) == prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet6_del_offload);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
new file mode 100644
index 000000000..8072bd413
--- /dev/null
+++ b/net/ipv6/raw.c
@@ -0,0 +1,1339 @@
+/*
+ * RAW sockets for IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Adapted from linux/net/ipv4/raw.c
+ *
+ * Fixes:
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance)
+ * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#include <net/transp_v6.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/tcp_states.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/mip6.h>
+#endif
+#include <linux/mroute6.h>
+
+#include <net/raw.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
+
+static struct raw_hashinfo raw_v6_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
+};
+
+static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+ unsigned short num, const struct in6_addr *loc_addr,
+ const struct in6_addr *rmt_addr, int dif)
+{
+ bool is_multicast = ipv6_addr_is_multicast(loc_addr);
+
+ sk_for_each_from(sk)
+ if (inet_sk(sk)->inet_num == num) {
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
+ continue;
+
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+ if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
+ goto found;
+ if (is_multicast &&
+ inet6_mc_check(sk, loc_addr, rmt_addr))
+ goto found;
+ continue;
+ }
+ goto found;
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
+{
+ struct icmp6hdr _hdr;
+ const struct icmp6hdr *hdr;
+
+ /* We require only the four bytes of the ICMPv6 header, not any
+ * additional bytes of message body in "struct icmp6hdr".
+ */
+ hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+ ICMPV6_HDRLEN, &_hdr);
+ if (hdr) {
+ const __u32 *data = &raw6_sk(sk)->filter.data[0];
+ unsigned int type = hdr->icmp6_type;
+
+ return (data[type >> 5] & (1U << (type & 31))) != 0;
+ }
+ return 1;
+}
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
+
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+int rawv6_mh_filter_register(mh_filter_t filter)
+{
+ rcu_assign_pointer(mh_filter, filter);
+ return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_register);
+
+int rawv6_mh_filter_unregister(mh_filter_t filter)
+{
+ RCU_INIT_POINTER(mh_filter, NULL);
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_unregister);
+
+#endif
+
+/*
+ * demultiplex raw sockets.
+ * (should consider queueing the skb in the sock receive_queue
+ * without calling rawv6.c)
+ *
+ * Caller owns SKB so we must make clones.
+ */
+static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+{
+ const struct in6_addr *saddr;
+ const struct in6_addr *daddr;
+ struct sock *sk;
+ bool delivered = false;
+ __u8 hash;
+ struct net *net;
+
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = saddr + 1;
+
+ hash = nexthdr & (RAW_HTABLE_SIZE - 1);
+
+ read_lock(&raw_v6_hashinfo.lock);
+ sk = sk_head(&raw_v6_hashinfo.ht[hash]);
+
+ if (!sk)
+ goto out;
+
+ net = dev_net(skb->dev);
+ sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb));
+
+ while (sk) {
+ int filtered;
+
+ delivered = true;
+ switch (nexthdr) {
+ case IPPROTO_ICMPV6:
+ filtered = icmpv6_filter(sk, skb);
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPPROTO_MH:
+ {
+ /* XXX: To validate MH only once for each packet,
+ * this is placed here. It should be after checking
+ * xfrm policy, however it doesn't. The checking xfrm
+ * policy is placed in rawv6_rcv() because it is
+ * required for each socket.
+ */
+ mh_filter_t *filter;
+
+ filter = rcu_dereference(mh_filter);
+ filtered = filter ? (*filter)(sk, skb) : 0;
+ break;
+ }
+#endif
+ default:
+ filtered = 0;
+ break;
+ }
+
+ if (filtered < 0)
+ break;
+ if (filtered == 0) {
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+ /* Not releasing hash table! */
+ if (clone) {
+ nf_reset(clone);
+ rawv6_rcv(sk, clone);
+ }
+ }
+ sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
+ inet6_iif(skb));
+ }
+out:
+ read_unlock(&raw_v6_hashinfo.lock);
+ return delivered;
+}
+
+bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
+{
+ struct sock *raw_sk;
+
+ raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
+ if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
+ raw_sk = NULL;
+
+ return raw_sk != NULL;
+}
+
+/* This cleans up af_inet6 a bit. -DaveM */
+static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+ __be32 v4addr = 0;
+ int addr_type;
+ int err;
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (addr->sin6_family != AF_INET6)
+ return -EINVAL;
+
+ addr_type = ipv6_addr_type(&addr->sin6_addr);
+
+ /* Raw sockets are IPv6 only */
+ if (addr_type == IPV6_ADDR_MAPPED)
+ return -EADDRNOTAVAIL;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ rcu_read_lock();
+ /* Check if the address belongs to the host. */
+ if (addr_type != IPV6_ADDR_ANY) {
+ struct net_device *dev = NULL;
+
+ if (__ipv6_addr_needs_scope_id(addr_type)) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ addr->sin6_scope_id) {
+ /* Override any existing binding, if another
+ * one is supplied by user.
+ */
+ sk->sk_bound_dev_if = addr->sin6_scope_id;
+ }
+
+ /* Binding to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ goto out_unlock;
+
+ err = -ENODEV;
+ dev = dev_get_by_index_rcu(sock_net(sk),
+ sk->sk_bound_dev_if);
+ if (!dev)
+ goto out_unlock;
+ }
+
+ /* ipv4 addr of the socket is invalid. Only the
+ * unspecified and mapped address have a v4 equivalent.
+ */
+ v4addr = LOOPBACK4_IPV6;
+ if (!(addr_type & IPV6_ADDR_MULTICAST)) {
+ err = -EADDRNOTAVAIL;
+ if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
+ dev, 0)) {
+ goto out_unlock;
+ }
+ }
+ }
+
+ inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+ sk->sk_v6_rcv_saddr = addr->sin6_addr;
+ if (!(addr_type & IPV6_ADDR_MULTICAST))
+ np->saddr = addr->sin6_addr;
+ err = 0;
+out_unlock:
+ rcu_read_unlock();
+out:
+ release_sock(sk);
+ return err;
+}
+
+static void rawv6_err(struct sock *sk, struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ int err;
+ int harderr;
+
+ /* Report error on raw socket, if:
+ 1. User requested recverr.
+ 2. Socket is connected (otherwise the error indication
+ is useless without recverr and error is hard.
+ */
+ if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+ return;
+
+ harderr = icmpv6_err_convert(type, code, &err);
+ if (type == ICMPV6_PKT_TOOBIG) {
+ ip6_sk_update_pmtu(skb, sk, info);
+ harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+ }
+ if (type == NDISC_REDIRECT) {
+ ip6_sk_redirect(skb, sk);
+ return;
+ }
+ if (np->recverr) {
+ u8 *payload = skb->data;
+ if (!inet->hdrincl)
+ payload += offset;
+ ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
+ }
+
+ if (np->recverr || harderr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ }
+}
+
+void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
+ u8 type, u8 code, int inner_offset, __be32 info)
+{
+ struct sock *sk;
+ int hash;
+ const struct in6_addr *saddr, *daddr;
+ struct net *net;
+
+ hash = nexthdr & (RAW_HTABLE_SIZE - 1);
+
+ read_lock(&raw_v6_hashinfo.lock);
+ sk = sk_head(&raw_v6_hashinfo.ht[hash]);
+ if (sk) {
+ /* Note: ipv6_hdr(skb) != skb->data */
+ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
+ saddr = &ip6h->saddr;
+ daddr = &ip6h->daddr;
+ net = dev_net(skb->dev);
+
+ while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
+ inet6_iif(skb)))) {
+ rawv6_err(sk, skb, NULL, type, code,
+ inner_offset, info);
+ sk = sk_next(sk);
+ }
+ }
+ read_unlock(&raw_v6_hashinfo.lock);
+}
+
+static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
+ skb_checksum_complete(skb)) {
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ /* Charge it to the socket. */
+ skb_dst_drop(skb);
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return 0;
+}
+
+/*
+ * This is next to useless...
+ * if we demultiplex in network layer we don't need the extra call
+ * just to queue the skb...
+ * maybe we could have the network decide upon a hint if it
+ * should call raw_rcv for demultiplexing
+ */
+int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct raw6_sock *rp = raw6_sk(sk);
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ if (!rp->checksum)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len, inet->inet_num, skb->csum))
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ if (!skb_csum_unnecessary(skb))
+ skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len,
+ inet->inet_num, 0));
+
+ if (inet->hdrincl) {
+ if (skb_checksum_complete(skb)) {
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+ }
+
+ rawv6_rcv_skb(sk, skb);
+ return 0;
+}
+
+
+/*
+ * This should be easy, if there is something there
+ * we return it, otherwise we block.
+ */
+
+static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ struct sk_buff *skb;
+ size_t copied;
+ int err;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (flags & MSG_ERRQUEUE)
+ return ipv6_recv_error(sk, msg, len, addr_len);
+
+ if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (skb_csum_unnecessary(skb)) {
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ } else if (msg->msg_flags&MSG_TRUNC) {
+ if (__skb_checksum_complete(skb))
+ goto csum_copy_err;
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ } else {
+ err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
+ if (err == -EINVAL)
+ goto csum_copy_err;
+ }
+ if (err)
+ goto out_free;
+
+ /* Copy the address. */
+ if (sin6) {
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = 0;
+ sin6->sin6_addr = ipv6_hdr(skb)->saddr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ *addr_len = sizeof(*sin6);
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (np->rxopt.all)
+ ip6_datagram_recv_ctl(sk, msg, skb);
+
+ err = copied;
+ if (flags & MSG_TRUNC)
+ err = skb->len;
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ return err;
+
+csum_copy_err:
+ skb_kill_datagram(sk, skb, flags);
+
+ /* Error for blocking case is chosen to masquerade
+ as some normal condition.
+ */
+ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+ goto out;
+}
+
+static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+ struct raw6_sock *rp)
+{
+ struct sk_buff *skb;
+ int err = 0;
+ int offset;
+ int len;
+ int total_len;
+ __wsum tmp_csum;
+ __sum16 csum;
+
+ if (!rp->checksum)
+ goto send;
+
+ skb = skb_peek(&sk->sk_write_queue);
+ if (!skb)
+ goto out;
+
+ offset = rp->offset;
+ total_len = inet_sk(sk)->cork.base.length;
+ if (offset >= total_len - 1) {
+ err = -EINVAL;
+ ip6_flush_pending_frames(sk);
+ goto out;
+ }
+
+ /* should be check HW csum miyazawa */
+ if (skb_queue_len(&sk->sk_write_queue) == 1) {
+ /*
+ * Only one fragment on the socket.
+ */
+ tmp_csum = skb->csum;
+ } else {
+ struct sk_buff *csum_skb = NULL;
+ tmp_csum = 0;
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ tmp_csum = csum_add(tmp_csum, skb->csum);
+
+ if (csum_skb)
+ continue;
+
+ len = skb->len - skb_transport_offset(skb);
+ if (offset >= len) {
+ offset -= len;
+ continue;
+ }
+
+ csum_skb = skb;
+ }
+
+ skb = csum_skb;
+ }
+
+ offset += skb_transport_offset(skb);
+ BUG_ON(skb_copy_bits(skb, offset, &csum, 2));
+
+ /* in case cksum was not initialized */
+ if (unlikely(csum))
+ tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));
+
+ csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ total_len, fl6->flowi6_proto, tmp_csum);
+
+ if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP)
+ csum = CSUM_MANGLED_0;
+
+ BUG_ON(skb_store_bits(skb, offset, &csum, 2));
+
+send:
+ err = ip6_push_pending_frames(sk);
+out:
+ return err;
+}
+
+static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
+ struct flowi6 *fl6, struct dst_entry **dstp,
+ unsigned int flags)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6hdr *iph;
+ struct sk_buff *skb;
+ int err;
+ struct rt6_info *rt = (struct rt6_info *)*dstp;
+ int hlen = LL_RESERVED_SPACE(rt->dst.dev);
+ int tlen = rt->dst.dev->needed_tailroom;
+
+ if (length > rt->dst.dev->mtu) {
+ ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
+ return -EMSGSIZE;
+ }
+ if (flags&MSG_PROBE)
+ goto out;
+
+ skb = sock_alloc_send_skb(sk,
+ length + hlen + tlen + 15,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto error;
+ skb_reserve(skb, hlen);
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+ skb_dst_set(skb, &rt->dst);
+ *dstp = NULL;
+
+ skb_put(skb, length);
+ skb_reset_network_header(skb);
+ iph = ipv6_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb->transport_header = skb->network_header;
+ err = memcpy_from_msg(iph, msg, length);
+ if (err)
+ goto error_fault;
+
+ IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
+ NULL, rt->dst.dev, dst_output_sk);
+ if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ goto error;
+out:
+ return 0;
+
+error_fault:
+ err = -EFAULT;
+ kfree_skb(skb);
+error:
+ IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ if (err == -ENOBUFS && !np->recverr)
+ err = 0;
+ return err;
+}
+
+struct raw6_frag_vec {
+ struct msghdr *msg;
+ int hlen;
+ char c[4];
+};
+
+static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6)
+{
+ int err = 0;
+ switch (fl6->flowi6_proto) {
+ case IPPROTO_ICMPV6:
+ rfv->hlen = 2;
+ err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+ if (!err) {
+ fl6->fl6_icmp_type = rfv->c[0];
+ fl6->fl6_icmp_code = rfv->c[1];
+ }
+ break;
+ case IPPROTO_MH:
+ rfv->hlen = 4;
+ err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
+ if (!err)
+ fl6->fl6_mh_type = rfv->c[2];
+ }
+ return err;
+}
+
+static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
+ struct sk_buff *skb)
+{
+ struct raw6_frag_vec *rfv = from;
+
+ if (offset < rfv->hlen) {
+ int copy = min(rfv->hlen - offset, len);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ memcpy(to, rfv->c + offset, copy);
+ else
+ skb->csum = csum_block_add(
+ skb->csum,
+ csum_partial_copy_nocheck(rfv->c + offset,
+ to, copy, 0),
+ odd);
+
+ odd = 0;
+ offset += copy;
+ to += copy;
+ len -= copy;
+
+ if (!len)
+ return 0;
+ }
+
+ offset -= rfv->hlen;
+
+ return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
+}
+
+static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct ipv6_txoptions opt_space;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ struct in6_addr *daddr, *final_p, final;
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct raw6_sock *rp = raw6_sk(sk);
+ struct ipv6_txoptions *opt = NULL;
+ struct ip6_flowlabel *flowlabel = NULL;
+ struct dst_entry *dst = NULL;
+ struct raw6_frag_vec rfv;
+ struct flowi6 fl6;
+ int addr_len = msg->msg_namelen;
+ int hlimit = -1;
+ int tclass = -1;
+ int dontfrag = -1;
+ u16 proto;
+ int err;
+
+ /* Rough check on arithmetic overflow,
+ better check is made in ip6_append_data().
+ */
+ if (len > INT_MAX)
+ return -EMSGSIZE;
+
+ /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /*
+ * Get and verify the address.
+ */
+ memset(&fl6, 0, sizeof(fl6));
+
+ fl6.flowi6_mark = sk->sk_mark;
+
+ if (sin6) {
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ /* port is the proto value [0..255] carried in nexthdr */
+ proto = ntohs(sin6->sin6_port);
+
+ if (!proto)
+ proto = inet->inet_num;
+ else if (proto != inet->inet_num)
+ return -EINVAL;
+
+ if (proto > 255)
+ return -EINVAL;
+
+ daddr = &sin6->sin6_addr;
+ if (np->sndflow) {
+ fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Otherwise it will be difficult to maintain
+ * sk->sk_dst_cache.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+ daddr = &sk->sk_v6_daddr;
+
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ sin6->sin6_scope_id &&
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
+ fl6.flowi6_oif = sin6->sin6_scope_id;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+
+ proto = inet->inet_num;
+ daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
+ }
+
+ if (fl6.flowi6_oif == 0)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+
+ if (msg->msg_controllen) {
+ opt = &opt_space;
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
+ opt->tot_len = sizeof(struct ipv6_txoptions);
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
+ &hlimit, &tclass, &dontfrag);
+ if (err < 0) {
+ fl6_sock_release(flowlabel);
+ return err;
+ }
+ if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ }
+ if (!(opt->opt_nflen|opt->opt_flen))
+ opt = NULL;
+ }
+ if (!opt)
+ opt = np->opt;
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = ipv6_fixup_options(&opt_space, opt);
+
+ fl6.flowi6_proto = proto;
+ rfv.msg = msg;
+ rfv.hlen = 0;
+ err = rawv6_probe_proto_opt(&rfv, &fl6);
+ if (err)
+ goto out;
+
+ if (!ipv6_addr_any(daddr))
+ fl6.daddr = *daddr;
+ else
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ fl6.saddr = np->saddr;
+
+ final_p = fl6_update_dst(&fl6, opt, &final);
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto out;
+ }
+ if (hlimit < 0)
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ if (tclass < 0)
+ tclass = np->tclass;
+
+ if (dontfrag < 0)
+ dontfrag = np->dontfrag;
+
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+
+back_from_confirm:
+ if (inet->hdrincl)
+ err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags);
+ else {
+ lock_sock(sk);
+ err = ip6_append_data(sk, raw6_getfrag, &rfv,
+ len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst,
+ msg->msg_flags, dontfrag);
+
+ if (err)
+ ip6_flush_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE))
+ err = rawv6_push_pending_frames(sk, &fl6, rp);
+ release_sock(sk);
+ }
+done:
+ dst_release(dst);
+out:
+ fl6_sock_release(flowlabel);
+ return err < 0 ? err : len;
+do_confirm:
+ dst_confirm(dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
+}
+
+static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen)
+{
+ switch (optname) {
+ case ICMPV6_FILTER:
+ if (optlen > sizeof(struct icmp6_filter))
+ optlen = sizeof(struct icmp6_filter);
+ if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ return 0;
+}
+
+static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int len;
+
+ switch (optname) {
+ case ICMPV6_FILTER:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+ if (len > sizeof(struct icmp6_filter))
+ len = sizeof(struct icmp6_filter);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &raw6_sk(sk)->filter, len))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ return 0;
+}
+
+
+static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct raw6_sock *rp = raw6_sk(sk);
+ int val;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ switch (optname) {
+ case IPV6_CHECKSUM:
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
+ level == IPPROTO_IPV6) {
+ /*
+ * RFC3542 tells that IPV6_CHECKSUM socket
+ * option in the IPPROTO_IPV6 level is not
+ * allowed on ICMPv6 sockets.
+ * If you want to set it, use IPPROTO_RAW
+ * level IPV6_CHECKSUM socket option
+ * (Linux extension).
+ */
+ return -EINVAL;
+ }
+
+ /* You may get strange result with a positive odd offset;
+ RFC2292bis agrees with me. */
+ if (val > 0 && (val&1))
+ return -EINVAL;
+ if (val < 0) {
+ rp->checksum = 0;
+ } else {
+ rp->checksum = 1;
+ rp->offset = val;
+ }
+
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int rawv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ switch (level) {
+ case SOL_RAW:
+ break;
+
+ case SOL_ICMPV6:
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+ return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
+ case SOL_IPV6:
+ if (optname == IPV6_CHECKSUM)
+ break;
+ default:
+ return ipv6_setsockopt(sk, level, optname, optval, optlen);
+ }
+
+ return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ switch (level) {
+ case SOL_RAW:
+ break;
+ case SOL_ICMPV6:
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+ return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
+ case SOL_IPV6:
+ if (optname == IPV6_CHECKSUM)
+ break;
+ default:
+ return compat_ipv6_setsockopt(sk, level, optname,
+ optval, optlen);
+ }
+ return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct raw6_sock *rp = raw6_sk(sk);
+ int val, len;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ switch (optname) {
+ case IPV6_CHECKSUM:
+ /*
+ * We allow getsockopt() for IPPROTO_IPV6-level
+ * IPV6_CHECKSUM socket option on ICMPv6 sockets
+ * since RFC3542 is silent about it.
+ */
+ if (rp->checksum == 0)
+ val = -1;
+ else
+ val = rp->offset;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ len = min_t(unsigned int, sizeof(int), len);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static int rawv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ switch (level) {
+ case SOL_RAW:
+ break;
+
+ case SOL_ICMPV6:
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+ return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
+ case SOL_IPV6:
+ if (optname == IPV6_CHECKSUM)
+ break;
+ default:
+ return ipv6_getsockopt(sk, level, optname, optval, optlen);
+ }
+
+ return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ switch (level) {
+ case SOL_RAW:
+ break;
+ case SOL_ICMPV6:
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+ return -EOPNOTSUPP;
+ return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
+ case SOL_IPV6:
+ if (optname == IPV6_CHECKSUM)
+ break;
+ default:
+ return compat_ipv6_getsockopt(sk, level, optname,
+ optval, optlen);
+ }
+ return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ: {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb_tail_pointer(skb) -
+ skb_transport_header(skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_ioctl(sk, cmd, (void __user *)arg);
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SIOCOUTQ:
+ case SIOCINQ:
+ return -ENOIOCTLCMD;
+ default:
+#ifdef CONFIG_IPV6_MROUTE
+ return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+ return -ENOIOCTLCMD;
+#endif
+ }
+}
+#endif
+
+static void rawv6_close(struct sock *sk, long timeout)
+{
+ if (inet_sk(sk)->inet_num == IPPROTO_RAW)
+ ip6_ra_control(sk, -1);
+ ip6mr_sk_done(sk);
+ sk_common_release(sk);
+}
+
+static void raw6_destroy(struct sock *sk)
+{
+ lock_sock(sk);
+ ip6_flush_pending_frames(sk);
+ release_sock(sk);
+
+ inet6_destroy_sock(sk);
+}
+
+static int rawv6_init_sk(struct sock *sk)
+{
+ struct raw6_sock *rp = raw6_sk(sk);
+
+ switch (inet_sk(sk)->inet_num) {
+ case IPPROTO_ICMPV6:
+ rp->checksum = 1;
+ rp->offset = 2;
+ break;
+ case IPPROTO_MH:
+ rp->checksum = 1;
+ rp->offset = 4;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct proto rawv6_prot = {
+ .name = "RAWv6",
+ .owner = THIS_MODULE,
+ .close = rawv6_close,
+ .destroy = raw6_destroy,
+ .connect = ip6_datagram_connect_v6_only,
+ .disconnect = udp_disconnect,
+ .ioctl = rawv6_ioctl,
+ .init = rawv6_init_sk,
+ .setsockopt = rawv6_setsockopt,
+ .getsockopt = rawv6_getsockopt,
+ .sendmsg = rawv6_sendmsg,
+ .recvmsg = rawv6_recvmsg,
+ .bind = rawv6_bind,
+ .backlog_rcv = rawv6_rcv_skb,
+ .hash = raw_hash_sk,
+ .unhash = raw_unhash_sk,
+ .obj_size = sizeof(struct raw6_sock),
+ .h.raw_hash = &raw_v6_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_rawv6_setsockopt,
+ .compat_getsockopt = compat_rawv6_getsockopt,
+ .compat_ioctl = compat_rawv6_ioctl,
+#endif
+};
+
+#ifdef CONFIG_PROC_FS
+static int raw6_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ struct sock *sp = v;
+ __u16 srcp = inet_sk(sp)->inet_num;
+ ip6_dgram_sock_seq_show(seq, v, srcp, 0,
+ raw_seq_private(seq)->bucket);
+ }
+ return 0;
+}
+
+static const struct seq_operations raw6_seq_ops = {
+ .start = raw_seq_start,
+ .next = raw_seq_next,
+ .stop = raw_seq_stop,
+ .show = raw6_seq_show,
+};
+
+static int raw6_seq_open(struct inode *inode, struct file *file)
+{
+ return raw_seq_open(inode, file, &raw_v6_hashinfo, &raw6_seq_ops);
+}
+
+static const struct file_operations raw6_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = raw6_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init raw6_init_net(struct net *net)
+{
+ if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit raw6_exit_net(struct net *net)
+{
+ remove_proc_entry("raw6", net->proc_net);
+}
+
+static struct pernet_operations raw6_net_ops = {
+ .init = raw6_init_net,
+ .exit = raw6_exit_net,
+};
+
+int __init raw6_proc_init(void)
+{
+ return register_pernet_subsys(&raw6_net_ops);
+}
+
+void raw6_proc_exit(void)
+{
+ unregister_pernet_subsys(&raw6_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+/* Same as inet6_dgram_ops, sans udp_poll. */
+static const struct proto_ops inet6_sockraw_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_dgram_connect, /* ok */
+ .socketpair = sock_no_socketpair, /* a do nothing */
+ .accept = sock_no_accept, /* a do nothing */
+ .getname = inet6_getname,
+ .poll = datagram_poll, /* ok */
+ .ioctl = inet6_ioctl, /* must change */
+ .listen = sock_no_listen, /* ok */
+ .shutdown = inet_shutdown, /* ok */
+ .setsockopt = sock_common_setsockopt, /* ok */
+ .getsockopt = sock_common_getsockopt, /* ok */
+ .sendmsg = inet_sendmsg, /* ok */
+ .recvmsg = sock_common_recvmsg, /* ok */
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw rawv6_protosw = {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &rawv6_prot,
+ .ops = &inet6_sockraw_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
+int __init rawv6_init(void)
+{
+ int ret;
+
+ ret = inet6_register_protosw(&rawv6_protosw);
+ if (ret)
+ goto out;
+out:
+ return ret;
+}
+
+void rawv6_exit(void)
+{
+ inet6_unregister_protosw(&rawv6_protosw);
+}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
new file mode 100644
index 000000000..8ffa2c8cc
--- /dev/null
+++ b/net/ipv6/reassembly.c
@@ -0,0 +1,772 @@
+/*
+ * IPv6 fragment reassembly
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on: net/ipv4/ip_fragment.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Fixes:
+ * Andi Kleen Make it work with multiple hosts.
+ * More RFC compliance.
+ *
+ * Horst von Brand Add missing #include <linux/string.h>
+ * Alexey Kuznetsov SMP races, threading, cleanup.
+ * Patrick McHardy LRU queue of frag heads for evictor.
+ * Mitsuru KANDA @USAGI Register inet6_protocol{}.
+ * David Stevens and
+ * YOSHIFUJI,H. @USAGI Always remove fragment header to
+ * calculate ICV correctly.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/inet_frag.h>
+#include <net/inet_ecn.h>
+
+static const char ip6_frag_cache_name[] = "ip6-frags";
+
+struct ip6frag_skb_cb {
+ struct inet6_skb_parm h;
+ int offset;
+};
+
+#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb))
+
+static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
+{
+ return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
+}
+
+static struct inet_frags ip6_frags;
+
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+ struct net_device *dev);
+
+/*
+ * callers should be careful not to use the hash value outside the ipfrag_lock
+ * as doing so could race with ipfrag_hash_rnd being recalculated.
+ */
+static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
+{
+ net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
+ return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+ (__force u32)id, ip6_frags.rnd);
+}
+
+static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
+{
+ const struct frag_queue *fq;
+
+ fq = container_of(q, struct frag_queue, q);
+ return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
+}
+
+bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
+{
+ const struct frag_queue *fq;
+ const struct ip6_create_arg *arg = a;
+
+ fq = container_of(q, struct frag_queue, q);
+ return fq->id == arg->id &&
+ fq->user == arg->user &&
+ ipv6_addr_equal(&fq->saddr, arg->src) &&
+ ipv6_addr_equal(&fq->daddr, arg->dst);
+}
+EXPORT_SYMBOL(ip6_frag_match);
+
+void ip6_frag_init(struct inet_frag_queue *q, const void *a)
+{
+ struct frag_queue *fq = container_of(q, struct frag_queue, q);
+ const struct ip6_create_arg *arg = a;
+
+ fq->id = arg->id;
+ fq->user = arg->user;
+ fq->saddr = *arg->src;
+ fq->daddr = *arg->dst;
+ fq->ecn = arg->ecn;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
+ struct inet_frags *frags)
+{
+ struct net_device *dev = NULL;
+
+ spin_lock(&fq->q.lock);
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto out;
+
+ inet_frag_kill(&fq->q, frags);
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, fq->iif);
+ if (!dev)
+ goto out_rcu_unlock;
+
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+
+ if (fq->q.flags & INET_FRAG_EVICTED)
+ goto out_rcu_unlock;
+
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+
+ /* Don't send error if the first segment did not arrive. */
+ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
+ goto out_rcu_unlock;
+
+ /* But use as source device on which LAST ARRIVED
+ * segment was received. And do not use fq->dev
+ * pointer directly, device might already disappeared.
+ */
+ fq->q.fragments->dev = dev;
+ icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+ rcu_read_unlock();
+out:
+ spin_unlock(&fq->q.lock);
+ inet_frag_put(&fq->q, frags);
+}
+EXPORT_SYMBOL(ip6_expire_frag_queue);
+
+static void ip6_frag_expire(unsigned long data)
+{
+ struct frag_queue *fq;
+ struct net *net;
+
+ fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+ net = container_of(fq->q.net, struct net, ipv6.frags);
+
+ ip6_expire_frag_queue(net, fq, &ip6_frags);
+}
+
+static struct frag_queue *
+fq_find(struct net *net, __be32 id, const struct in6_addr *src,
+ const struct in6_addr *dst, u8 ecn)
+{
+ struct inet_frag_queue *q;
+ struct ip6_create_arg arg;
+ unsigned int hash;
+
+ arg.id = id;
+ arg.user = IP6_DEFRAG_LOCAL_DELIVER;
+ arg.src = src;
+ arg.dst = dst;
+ arg.ecn = ecn;
+
+ hash = inet6_hash_frag(id, src, dst);
+
+ q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
+ if (IS_ERR_OR_NULL(q)) {
+ inet_frag_maybe_warn_overflow(q, pr_fmt());
+ return NULL;
+ }
+ return container_of(q, struct frag_queue, q);
+}
+
+static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+ struct frag_hdr *fhdr, int nhoff)
+{
+ struct sk_buff *prev, *next;
+ struct net_device *dev;
+ int offset, end;
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ u8 ecn;
+
+ if (fq->q.flags & INET_FRAG_COMPLETE)
+ goto err;
+
+ offset = ntohs(fhdr->frag_off) & ~0x7;
+ end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
+ ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
+
+ if ((unsigned int)end > IPV6_MAXPLEN) {
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((u8 *)&fhdr->frag_off -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ ecn = ip6_frag_ecn(ipv6_hdr(skb));
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ const unsigned char *nh = skb_network_header(skb);
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(nh, (u8 *)(fhdr + 1) - nh,
+ 0));
+ }
+
+ /* Is this the final fragment? */
+ if (!(fhdr->frag_off & htons(IP6_MF))) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->q.len ||
+ ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
+ goto err;
+ fq->q.flags |= INET_FRAG_LAST_IN;
+ fq->q.len = end;
+ } else {
+ /* Check if the fragment is rounded to 8 bytes.
+ * Required by the RFC.
+ */
+ if (end & 0x7) {
+ /* RFC2460 says always send parameter problem in
+ * this case. -DaveM
+ */
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ offsetof(struct ipv6hdr, payload_len));
+ return -1;
+ }
+ if (end > fq->q.len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->q.flags & INET_FRAG_LAST_IN)
+ goto err;
+ fq->q.len = end;
+ }
+ }
+
+ if (end == offset)
+ goto err;
+
+ /* Point into the IP datagram 'data' part. */
+ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
+ goto err;
+
+ if (pskb_trim_rcsum(skb, end - offset))
+ goto err;
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = fq->q.fragments_tail;
+ if (!prev || FRAG6_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
+ prev = NULL;
+ for (next = fq->q.fragments; next != NULL; next = next->next) {
+ if (FRAG6_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+found:
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ */
+
+ /* Check for overlap with preceding fragment. */
+ if (prev &&
+ (FRAG6_CB(prev)->offset + prev->len) > offset)
+ goto discard_fq;
+
+ /* Look for overlap with succeeding segment. */
+ if (next && FRAG6_CB(next)->offset < end)
+ goto discard_fq;
+
+ FRAG6_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (!next)
+ fq->q.fragments_tail = skb;
+ if (prev)
+ prev->next = skb;
+ else
+ fq->q.fragments = skb;
+
+ dev = skb->dev;
+ if (dev) {
+ fq->iif = dev->ifindex;
+ skb->dev = NULL;
+ }
+ fq->q.stamp = skb->tstamp;
+ fq->q.meat += skb->len;
+ fq->ecn |= ecn;
+ add_frag_mem_limit(&fq->q, skb->truesize);
+
+ /* The first fragment.
+ * nhoffset is obtained from the first fragment, of course.
+ */
+ if (offset == 0) {
+ fq->nhoffset = nhoff;
+ fq->q.flags |= INET_FRAG_FIRST_IN;
+ }
+
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ int res;
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ res = ip6_frag_reasm(fq, prev, dev);
+ skb->_skb_refdst = orefdst;
+ return res;
+ }
+
+ skb_dst_drop(skb);
+ return -1;
+
+discard_fq:
+ inet_frag_kill(&fq->q, &ip6_frags);
+err:
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -1;
+}
+
+/*
+ * Check if this packet is complete.
+ * Returns NULL on failure by any reason, and pointer
+ * to current nexthdr field in reassembled frame.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
+ */
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+ struct net_device *dev)
+{
+ struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+ struct sk_buff *fp, *head = fq->q.fragments;
+ int payload_len;
+ unsigned int nhoff;
+ int sum_truesize;
+ u8 ecn;
+
+ inet_frag_kill(&fq->q, &ip6_frags);
+
+ ecn = ip_frag_ecn_table[fq->ecn];
+ if (unlikely(ecn == 0xff))
+ goto out_fail;
+
+ /* Make the one we just received the head. */
+ if (prev) {
+ head = prev->next;
+ fp = skb_clone(head, GFP_ATOMIC);
+
+ if (!fp)
+ goto out_oom;
+
+ fp->next = head->next;
+ if (!fp->next)
+ fq->q.fragments_tail = fp;
+ prev->next = fp;
+
+ skb_morph(head, fq->q.fragments);
+ head->next = fq->q.fragments->next;
+
+ consume_skb(fq->q.fragments);
+ fq->q.fragments = head;
+ }
+
+ WARN_ON(head == NULL);
+ WARN_ON(FRAG6_CB(head)->offset != 0);
+
+ /* Unfragmented part is taken from the first segment. */
+ payload_len = ((head->data - skb_network_header(head)) -
+ sizeof(struct ipv6hdr) + fq->q.len -
+ sizeof(struct frag_hdr));
+ if (payload_len > IPV6_MAXPLEN)
+ goto out_oversize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_oom;
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_oom;
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(&fq->q, clone->truesize);
+ }
+
+ /* We have to remove fragment header from datagram and to relocate
+ * header in order to calculate ICV correctly. */
+ nhoff = fq->nhoffset;
+ skb_network_header(head)[nhoff] = skb_transport_header(head)[0];
+ memmove(head->head + sizeof(struct frag_hdr), head->head,
+ (head->data - head->head) - sizeof(struct frag_hdr));
+ head->mac_header += sizeof(struct frag_hdr);
+ head->network_header += sizeof(struct frag_hdr);
+
+ skb_reset_transport_header(head);
+ skb_push(head, head->data - skb_network_header(head));
+
+ sum_truesize = head->truesize;
+ for (fp = head->next; fp;) {
+ bool headstolen;
+ int delta;
+ struct sk_buff *next = fp->next;
+
+ sum_truesize += fp->truesize;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+
+ if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
+ kfree_skb_partial(fp, headstolen);
+ } else {
+ if (!skb_shinfo(head)->frag_list)
+ skb_shinfo(head)->frag_list = fp;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+ }
+ fp = next;
+ }
+ sub_frag_mem_limit(&fq->q, sum_truesize);
+
+ head->next = NULL;
+ head->dev = dev;
+ head->tstamp = fq->q.stamp;
+ ipv6_hdr(head)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
+ IP6CB(head)->nhoff = nhoff;
+ IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
+
+ /* Yes, and fold redundant checksum back. 8) */
+ if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_partial(skb_network_header(head),
+ skb_network_header_len(head),
+ head->csum);
+
+ rcu_read_lock();
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+ rcu_read_unlock();
+ fq->q.fragments = NULL;
+ fq->q.fragments_tail = NULL;
+ return 1;
+
+out_oversize:
+ net_dbg_ratelimited("ip6_frag_reasm: payload len = %d\n", payload_len);
+ goto out_fail;
+out_oom:
+ net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
+out_fail:
+ rcu_read_lock();
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ rcu_read_unlock();
+ return -1;
+}
+
+static int ipv6_frag_rcv(struct sk_buff *skb)
+{
+ struct frag_hdr *fhdr;
+ struct frag_queue *fq;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
+ goto fail_hdr;
+
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
+
+ /* Jumbo payload inhibits frag. header */
+ if (hdr->payload_len == 0)
+ goto fail_hdr;
+
+ if (!pskb_may_pull(skb, (skb_transport_offset(skb) +
+ sizeof(struct frag_hdr))))
+ goto fail_hdr;
+
+ hdr = ipv6_hdr(skb);
+ fhdr = (struct frag_hdr *)skb_transport_header(skb);
+
+ if (!(fhdr->frag_off & htons(0xFFF9))) {
+ /* It is not a fragmented frame */
+ skb->transport_header += sizeof(struct frag_hdr);
+ IP6_INC_STATS_BH(net,
+ ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS);
+
+ IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ return 1;
+ }
+
+ fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
+ ip6_frag_ecn(hdr));
+ if (fq) {
+ int ret;
+
+ spin_lock(&fq->q.lock);
+
+ ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+
+ spin_unlock(&fq->q.lock);
+ inet_frag_put(&fq->q, &ip6_frags);
+ return ret;
+ }
+
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
+ kfree_skb(skb);
+ return -1;
+
+fail_hdr:
+ IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
+ return -1;
+}
+
+static const struct inet6_protocol frag_protocol = {
+ .handler = ipv6_frag_rcv,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+
+static struct ctl_table ip6_frags_ns_ctl_table[] = {
+ {
+ .procname = "ip6frag_high_thresh",
+ .data = &init_net.ipv6.frags.high_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ipv6.frags.low_thresh
+ },
+ {
+ .procname = "ip6frag_low_thresh",
+ .data = &init_net.ipv6.frags.low_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ipv6.frags.high_thresh
+ },
+ {
+ .procname = "ip6frag_time",
+ .data = &init_net.ipv6.frags.timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+/* secret interval has been deprecated */
+static int ip6_frags_secret_interval_unused;
+static struct ctl_table ip6_frags_ctl_table[] = {
+ {
+ .procname = "ip6frag_secret_interval",
+ .data = &ip6_frags_secret_interval_unused,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = ip6_frags_ns_ctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->ipv6.frags.high_thresh;
+ table[0].extra1 = &net->ipv6.frags.low_thresh;
+ table[0].extra2 = &init_net.ipv6.frags.high_thresh;
+ table[1].data = &net->ipv6.frags.low_thresh;
+ table[1].extra2 = &net->ipv6.frags.high_thresh;
+ table[2].data = &net->ipv6.frags.timeout;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv6", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv6.sysctl.frags_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ipv6.sysctl.frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct ctl_table_header *ip6_ctl_header;
+
+static int ip6_frags_sysctl_register(void)
+{
+ ip6_ctl_header = register_net_sysctl(&init_net, "net/ipv6",
+ ip6_frags_ctl_table);
+ return ip6_ctl_header == NULL ? -ENOMEM : 0;
+}
+
+static void ip6_frags_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(ip6_ctl_header);
+}
+#else
+static int ip6_frags_ns_sysctl_register(struct net *net)
+{
+ return 0;
+}
+
+static void ip6_frags_ns_sysctl_unregister(struct net *net)
+{
+}
+
+static int ip6_frags_sysctl_register(void)
+{
+ return 0;
+}
+
+static void ip6_frags_sysctl_unregister(void)
+{
+}
+#endif
+
+static int __net_init ipv6_frags_init_net(struct net *net)
+{
+ net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+
+ inet_frags_init_net(&net->ipv6.frags);
+
+ return ip6_frags_ns_sysctl_register(net);
+}
+
+static void __net_exit ipv6_frags_exit_net(struct net *net)
+{
+ ip6_frags_ns_sysctl_unregister(net);
+ inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+}
+
+static struct pernet_operations ip6_frags_ops = {
+ .init = ipv6_frags_init_net,
+ .exit = ipv6_frags_exit_net,
+};
+
+int __init ipv6_frag_init(void)
+{
+ int ret;
+
+ ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ if (ret)
+ goto out;
+
+ ret = ip6_frags_sysctl_register();
+ if (ret)
+ goto err_sysctl;
+
+ ret = register_pernet_subsys(&ip6_frags_ops);
+ if (ret)
+ goto err_pernet;
+
+ ip6_frags.hashfn = ip6_hashfn;
+ ip6_frags.constructor = ip6_frag_init;
+ ip6_frags.destructor = NULL;
+ ip6_frags.skb_free = NULL;
+ ip6_frags.qsize = sizeof(struct frag_queue);
+ ip6_frags.match = ip6_frag_match;
+ ip6_frags.frag_expire = ip6_frag_expire;
+ ip6_frags.frags_cache_name = ip6_frag_cache_name;
+ ret = inet_frags_init(&ip6_frags);
+ if (ret)
+ goto err_pernet;
+out:
+ return ret;
+
+err_pernet:
+ ip6_frags_sysctl_unregister();
+err_sysctl:
+ inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ goto out;
+}
+
+void ipv6_frag_exit(void)
+{
+ inet_frags_fini(&ip6_frags);
+ ip6_frags_sysctl_unregister();
+ unregister_pernet_subsys(&ip6_frags_ops);
+ inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
new file mode 100644
index 000000000..c73ae5039
--- /dev/null
+++ b/net/ipv6/route.c
@@ -0,0 +1,3311 @@
+/*
+ * Linux INET6 implementation
+ * FIB front-end.
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* Changes:
+ *
+ * YOSHIFUJI Hideaki @USAGI
+ * reworked default router selection.
+ * - respect outgoing interface
+ * - select from (probably) reachable routers (i.e.
+ * routers in REACHABLE, STALE, DELAY or PROBE states).
+ * - always select the same router if it is (probably)
+ * reachable. otherwise, round-robin the list.
+ * Ville Nuorvala
+ * Fixed routing subtrees.
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/times.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/route.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/mroute6.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/snmp.h>
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/tcp.h>
+#include <linux/rtnetlink.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <net/nexthop.h>
+
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+enum rt6_nud_state {
+ RT6_NUD_FAIL_HARD = -3,
+ RT6_NUD_FAIL_PROBE = -2,
+ RT6_NUD_FAIL_DO_RR = -1,
+ RT6_NUD_SUCCEED = 1
+};
+
+static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
+ const struct in6_addr *dest);
+static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ip6_default_advmss(const struct dst_entry *dst);
+static unsigned int ip6_mtu(const struct dst_entry *dst);
+static struct dst_entry *ip6_negative_advice(struct dst_entry *);
+static void ip6_dst_destroy(struct dst_entry *);
+static void ip6_dst_ifdown(struct dst_entry *,
+ struct net_device *dev, int how);
+static int ip6_dst_gc(struct dst_ops *ops);
+
+static int ip6_pkt_discard(struct sk_buff *skb);
+static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_prohibit(struct sk_buff *skb);
+static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
+static void ip6_link_failure(struct sk_buff *skb);
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu);
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb);
+static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+static struct rt6_info *rt6_add_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr, int ifindex,
+ unsigned int pref);
+static struct rt6_info *rt6_get_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr, int ifindex);
+#endif
+
+static void rt6_bind_peer(struct rt6_info *rt, int create)
+{
+ struct inet_peer_base *base;
+ struct inet_peer *peer;
+
+ base = inetpeer_base_ptr(rt->_rt6i_peer);
+ if (!base)
+ return;
+
+ peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
+ if (peer) {
+ if (!rt6_set_peer(rt, peer))
+ inet_putpeer(peer);
+ }
+}
+
+static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create)
+{
+ if (rt6_has_peer(rt))
+ return rt6_peer_ptr(rt);
+
+ rt6_bind_peer(rt, create);
+ return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL);
+}
+
+static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt)
+{
+ return __rt6_get_peer(rt, 1);
+}
+
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!(rt->dst.flags & DST_HOST))
+ return dst_cow_metrics_generic(dst, old);
+
+ peer = rt6_get_peer_create(rt);
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer) ||
+ (old & DST_METRICS_FORCE_OVERWRITE))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ }
+ }
+ return p;
+}
+
+static inline const void *choose_neigh_daddr(struct rt6_info *rt,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ struct in6_addr *p = &rt->rt6i_gateway;
+
+ if (!ipv6_addr_any(p))
+ return (const void *) p;
+ else if (skb)
+ return &ipv6_hdr(skb)->daddr;
+ return daddr;
+}
+
+static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ struct neighbour *n;
+
+ daddr = choose_neigh_daddr(rt, skb, daddr);
+ n = __ipv6_neigh_lookup(dst->dev, daddr);
+ if (n)
+ return n;
+ return neigh_create(&nd_tbl, daddr, dst->dev);
+}
+
+static struct dst_ops ip6_dst_ops_template = {
+ .family = AF_INET6,
+ .gc = ip6_dst_gc,
+ .gc_thresh = 1024,
+ .check = ip6_dst_check,
+ .default_advmss = ip6_default_advmss,
+ .mtu = ip6_mtu,
+ .cow_metrics = ipv6_cow_metrics,
+ .destroy = ip6_dst_destroy,
+ .ifdown = ip6_dst_ifdown,
+ .negative_advice = ip6_negative_advice,
+ .link_failure = ip6_link_failure,
+ .update_pmtu = ip6_rt_update_pmtu,
+ .redirect = rt6_do_redirect,
+ .local_out = __ip6_local_out,
+ .neigh_lookup = ip6_neigh_lookup,
+};
+
+static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
+}
+
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+
+static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
+ unsigned long old)
+{
+ return NULL;
+}
+
+static struct dst_ops ip6_dst_blackhole_ops = {
+ .family = AF_INET6,
+ .destroy = ip6_dst_destroy,
+ .check = ip6_dst_check,
+ .mtu = ip6_blackhole_mtu,
+ .default_advmss = ip6_default_advmss,
+ .update_pmtu = ip6_rt_blackhole_update_pmtu,
+ .redirect = ip6_rt_blackhole_redirect,
+ .cow_metrics = ip6_rt_blackhole_cow_metrics,
+ .neigh_lookup = ip6_neigh_lookup,
+};
+
+static const u32 ip6_template_metrics[RTAX_MAX] = {
+ [RTAX_HOPLIMIT - 1] = 0,
+};
+
+static const struct rt6_info ip6_null_entry_template = {
+ .dst = {
+ .__refcnt = ATOMIC_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -ENETUNREACH,
+ .input = ip6_pkt_discard,
+ .output = ip6_pkt_discard_out,
+ },
+ .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .rt6i_protocol = RTPROT_KERNEL,
+ .rt6i_metric = ~(u32) 0,
+ .rt6i_ref = ATOMIC_INIT(1),
+};
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+static const struct rt6_info ip6_prohibit_entry_template = {
+ .dst = {
+ .__refcnt = ATOMIC_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -EACCES,
+ .input = ip6_pkt_prohibit,
+ .output = ip6_pkt_prohibit_out,
+ },
+ .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .rt6i_protocol = RTPROT_KERNEL,
+ .rt6i_metric = ~(u32) 0,
+ .rt6i_ref = ATOMIC_INIT(1),
+};
+
+static const struct rt6_info ip6_blk_hole_entry_template = {
+ .dst = {
+ .__refcnt = ATOMIC_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -EINVAL,
+ .input = dst_discard,
+ .output = dst_discard_sk,
+ },
+ .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .rt6i_protocol = RTPROT_KERNEL,
+ .rt6i_metric = ~(u32) 0,
+ .rt6i_ref = ATOMIC_INIT(1),
+};
+
+#endif
+
+/* allocate dst with ip6_dst_ops */
+static inline struct rt6_info *ip6_dst_alloc(struct net *net,
+ struct net_device *dev,
+ int flags,
+ struct fib6_table *table)
+{
+ struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
+ 0, DST_OBSOLETE_FORCE_CHK, flags);
+
+ if (rt) {
+ struct dst_entry *dst = &rt->dst;
+
+ memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+ rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
+ INIT_LIST_HEAD(&rt->rt6i_siblings);
+ }
+ return rt;
+}
+
+static void ip6_dst_destroy(struct dst_entry *dst)
+{
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ struct inet6_dev *idev = rt->rt6i_idev;
+ struct dst_entry *from = dst->from;
+
+ if (!(rt->dst.flags & DST_HOST))
+ dst_destroy_metrics_generic(dst);
+
+ if (idev) {
+ rt->rt6i_idev = NULL;
+ in6_dev_put(idev);
+ }
+
+ dst->from = NULL;
+ dst_release(from);
+
+ if (rt6_has_peer(rt)) {
+ struct inet_peer *peer = rt6_peer_ptr(rt);
+ inet_putpeer(peer);
+ }
+}
+
+static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int how)
+{
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ struct inet6_dev *idev = rt->rt6i_idev;
+ struct net_device *loopback_dev =
+ dev_net(dev)->loopback_dev;
+
+ if (dev != loopback_dev) {
+ if (idev && idev->dev == dev) {
+ struct inet6_dev *loopback_idev =
+ in6_dev_get(loopback_dev);
+ if (loopback_idev) {
+ rt->rt6i_idev = loopback_idev;
+ in6_dev_put(idev);
+ }
+ }
+ }
+}
+
+static bool rt6_check_expired(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_EXPIRES) {
+ if (time_after(jiffies, rt->dst.expires))
+ return true;
+ } else if (rt->dst.from) {
+ return rt6_check_expired((struct rt6_info *) rt->dst.from);
+ }
+ return false;
+}
+
+/* Multipath route selection:
+ * Hash based function using packet header and flowlabel.
+ * Adapted from fib_info_hashfn()
+ */
+static int rt6_info_hash_nhsfn(unsigned int candidate_count,
+ const struct flowi6 *fl6)
+{
+ unsigned int val = fl6->flowi6_proto;
+
+ val ^= ipv6_addr_hash(&fl6->daddr);
+ val ^= ipv6_addr_hash(&fl6->saddr);
+
+ /* Work only if this not encapsulated */
+ switch (fl6->flowi6_proto) {
+ case IPPROTO_UDP:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ val ^= (__force u16)fl6->fl6_sport;
+ val ^= (__force u16)fl6->fl6_dport;
+ break;
+
+ case IPPROTO_ICMPV6:
+ val ^= (__force u16)fl6->fl6_icmp_type;
+ val ^= (__force u16)fl6->fl6_icmp_code;
+ break;
+ }
+ /* RFC6438 recommands to use flowlabel */
+ val ^= (__force u32)fl6->flowlabel;
+
+ /* Perhaps, we need to tune, this function? */
+ val = val ^ (val >> 7) ^ (val >> 12);
+ return val % candidate_count;
+}
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+ struct flowi6 *fl6, int oif,
+ int strict)
+{
+ struct rt6_info *sibling, *next_sibling;
+ int route_choosen;
+
+ route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+ /* Don't change the route, if route_choosen == 0
+ * (siblings does not include ourself)
+ */
+ if (route_choosen)
+ list_for_each_entry_safe(sibling, next_sibling,
+ &match->rt6i_siblings, rt6i_siblings) {
+ route_choosen--;
+ if (route_choosen == 0) {
+ if (rt6_score_route(sibling, oif, strict) < 0)
+ break;
+ match = sibling;
+ break;
+ }
+ }
+ return match;
+}
+
+/*
+ * Route lookup. Any table->tb6_lock is implied.
+ */
+
+static inline struct rt6_info *rt6_device_match(struct net *net,
+ struct rt6_info *rt,
+ const struct in6_addr *saddr,
+ int oif,
+ int flags)
+{
+ struct rt6_info *local = NULL;
+ struct rt6_info *sprt;
+
+ if (!oif && ipv6_addr_any(saddr))
+ goto out;
+
+ for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+ struct net_device *dev = sprt->dst.dev;
+
+ if (oif) {
+ if (dev->ifindex == oif)
+ return sprt;
+ if (dev->flags & IFF_LOOPBACK) {
+ if (!sprt->rt6i_idev ||
+ sprt->rt6i_idev->dev->ifindex != oif) {
+ if (flags & RT6_LOOKUP_F_IFACE && oif)
+ continue;
+ if (local && (!oif ||
+ local->rt6i_idev->dev->ifindex == oif))
+ continue;
+ }
+ local = sprt;
+ }
+ } else {
+ if (ipv6_chk_addr(net, saddr, dev,
+ flags & RT6_LOOKUP_F_IFACE))
+ return sprt;
+ }
+ }
+
+ if (oif) {
+ if (local)
+ return local;
+
+ if (flags & RT6_LOOKUP_F_IFACE)
+ return net->ipv6.ip6_null_entry;
+ }
+out:
+ return rt;
+}
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+struct __rt6_probe_work {
+ struct work_struct work;
+ struct in6_addr target;
+ struct net_device *dev;
+};
+
+static void rt6_probe_deferred(struct work_struct *w)
+{
+ struct in6_addr mcaddr;
+ struct __rt6_probe_work *work =
+ container_of(w, struct __rt6_probe_work, work);
+
+ addrconf_addr_solict_mult(&work->target, &mcaddr);
+ ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
+ dev_put(work->dev);
+ kfree(work);
+}
+
+static void rt6_probe(struct rt6_info *rt)
+{
+ struct neighbour *neigh;
+ /*
+ * Okay, this does not seem to be appropriate
+ * for now, however, we need to check if it
+ * is really so; aka Router Reachability Probing.
+ *
+ * Router Reachability Probe MUST be rate-limited
+ * to no more than one per minute.
+ */
+ if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+ return;
+ rcu_read_lock_bh();
+ neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+ if (neigh) {
+ write_lock(&neigh->lock);
+ if (neigh->nud_state & NUD_VALID)
+ goto out;
+ }
+
+ if (!neigh ||
+ time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ struct __rt6_probe_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+
+ if (neigh && work)
+ __neigh_set_probe_once(neigh);
+
+ if (neigh)
+ write_unlock(&neigh->lock);
+
+ if (work) {
+ INIT_WORK(&work->work, rt6_probe_deferred);
+ work->target = rt->rt6i_gateway;
+ dev_hold(rt->dst.dev);
+ work->dev = rt->dst.dev;
+ schedule_work(&work->work);
+ }
+ } else {
+out:
+ write_unlock(&neigh->lock);
+ }
+ rcu_read_unlock_bh();
+}
+#else
+static inline void rt6_probe(struct rt6_info *rt)
+{
+}
+#endif
+
+/*
+ * Default Router Selection (RFC 2461 6.3.6)
+ */
+static inline int rt6_check_dev(struct rt6_info *rt, int oif)
+{
+ struct net_device *dev = rt->dst.dev;
+ if (!oif || dev->ifindex == oif)
+ return 2;
+ if ((dev->flags & IFF_LOOPBACK) &&
+ rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
+ return 1;
+ return 0;
+}
+
+static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
+{
+ struct neighbour *neigh;
+ enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
+
+ if (rt->rt6i_flags & RTF_NONEXTHOP ||
+ !(rt->rt6i_flags & RTF_GATEWAY))
+ return RT6_NUD_SUCCEED;
+
+ rcu_read_lock_bh();
+ neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+ if (neigh) {
+ read_lock(&neigh->lock);
+ if (neigh->nud_state & NUD_VALID)
+ ret = RT6_NUD_SUCCEED;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ else if (!(neigh->nud_state & NUD_FAILED))
+ ret = RT6_NUD_SUCCEED;
+ else
+ ret = RT6_NUD_FAIL_PROBE;
+#endif
+ read_unlock(&neigh->lock);
+ } else {
+ ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
+ RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
+ }
+ rcu_read_unlock_bh();
+
+ return ret;
+}
+
+static int rt6_score_route(struct rt6_info *rt, int oif,
+ int strict)
+{
+ int m;
+
+ m = rt6_check_dev(rt, oif);
+ if (!m && (strict & RT6_LOOKUP_F_IFACE))
+ return RT6_NUD_FAIL_HARD;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+#endif
+ if (strict & RT6_LOOKUP_F_REACHABLE) {
+ int n = rt6_check_neigh(rt);
+ if (n < 0)
+ return n;
+ }
+ return m;
+}
+
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+ int *mpri, struct rt6_info *match,
+ bool *do_rr)
+{
+ int m;
+ bool match_do_rr = false;
+
+ if (rt6_check_expired(rt))
+ goto out;
+
+ m = rt6_score_route(rt, oif, strict);
+ if (m == RT6_NUD_FAIL_DO_RR) {
+ match_do_rr = true;
+ m = 0; /* lowest valid score */
+ } else if (m == RT6_NUD_FAIL_HARD) {
+ goto out;
+ }
+
+ if (strict & RT6_LOOKUP_F_REACHABLE)
+ rt6_probe(rt);
+
+ /* note that m can be RT6_NUD_FAIL_PROBE at this point */
+ if (m > *mpri) {
+ *do_rr = match_do_rr;
+ *mpri = m;
+ match = rt;
+ }
+out:
+ return match;
+}
+
+static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+ struct rt6_info *rr_head,
+ u32 metric, int oif, int strict,
+ bool *do_rr)
+{
+ struct rt6_info *rt, *match;
+ int mpri = -1;
+
+ match = NULL;
+ for (rt = rr_head; rt && rt->rt6i_metric == metric;
+ rt = rt->dst.rt6_next)
+ match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
+ rt = rt->dst.rt6_next)
+ match = find_match(rt, oif, strict, &mpri, match, do_rr);
+
+ return match;
+}
+
+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+{
+ struct rt6_info *match, *rt0;
+ struct net *net;
+ bool do_rr = false;
+
+ rt0 = fn->rr_ptr;
+ if (!rt0)
+ fn->rr_ptr = rt0 = fn->leaf;
+
+ match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+ &do_rr);
+
+ if (do_rr) {
+ struct rt6_info *next = rt0->dst.rt6_next;
+
+ /* no entries matched; do round-robin */
+ if (!next || next->rt6i_metric != rt0->rt6i_metric)
+ next = fn->leaf;
+
+ if (next != rt0)
+ fn->rr_ptr = next;
+ }
+
+ net = dev_net(rt0->dst.dev);
+ return match ? match : net->ipv6.ip6_null_entry;
+}
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
+ const struct in6_addr *gwaddr)
+{
+ struct net *net = dev_net(dev);
+ struct route_info *rinfo = (struct route_info *) opt;
+ struct in6_addr prefix_buf, *prefix;
+ unsigned int pref;
+ unsigned long lifetime;
+ struct rt6_info *rt;
+
+ if (len < sizeof(struct route_info)) {
+ return -EINVAL;
+ }
+
+ /* Sanity check for prefix_len and length */
+ if (rinfo->length > 3) {
+ return -EINVAL;
+ } else if (rinfo->prefix_len > 128) {
+ return -EINVAL;
+ } else if (rinfo->prefix_len > 64) {
+ if (rinfo->length < 2) {
+ return -EINVAL;
+ }
+ } else if (rinfo->prefix_len > 0) {
+ if (rinfo->length < 1) {
+ return -EINVAL;
+ }
+ }
+
+ pref = rinfo->route_pref;
+ if (pref == ICMPV6_ROUTER_PREF_INVALID)
+ return -EINVAL;
+
+ lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
+
+ if (rinfo->length == 3)
+ prefix = (struct in6_addr *)rinfo->prefix;
+ else {
+ /* this function is safe */
+ ipv6_addr_prefix(&prefix_buf,
+ (struct in6_addr *)rinfo->prefix,
+ rinfo->prefix_len);
+ prefix = &prefix_buf;
+ }
+
+ if (rinfo->prefix_len == 0)
+ rt = rt6_get_dflt_router(gwaddr, dev);
+ else
+ rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
+ gwaddr, dev->ifindex);
+
+ if (rt && !lifetime) {
+ ip6_del_rt(rt);
+ rt = NULL;
+ }
+
+ if (!rt && lifetime)
+ rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
+ pref);
+ else if (rt)
+ rt->rt6i_flags = RTF_ROUTEINFO |
+ (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+
+ if (rt) {
+ if (!addrconf_finite_timeout(lifetime))
+ rt6_clean_expires(rt);
+ else
+ rt6_set_expires(rt, jiffies + HZ * lifetime);
+
+ ip6_rt_put(rt);
+ }
+ return 0;
+}
+#endif
+
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+ struct in6_addr *saddr)
+{
+ struct fib6_node *pn;
+ while (1) {
+ if (fn->fn_flags & RTN_TL_ROOT)
+ return NULL;
+ pn = fn->parent;
+ if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
+ fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+ else
+ fn = pn;
+ if (fn->fn_flags & RTN_RTINFO)
+ return fn;
+ }
+}
+
+static struct rt6_info *ip6_pol_route_lookup(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6, int flags)
+{
+ struct fib6_node *fn;
+ struct rt6_info *rt;
+
+ read_lock_bh(&table->tb6_lock);
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:
+ rt = fn->leaf;
+ rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+ if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+ rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
+ if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
+ }
+ dst_use(&rt->dst, jiffies);
+ read_unlock_bh(&table->tb6_lock);
+ return rt;
+
+}
+
+struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
+ int flags)
+{
+ return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
+}
+EXPORT_SYMBOL_GPL(ip6_route_lookup);
+
+struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
+ const struct in6_addr *saddr, int oif, int strict)
+{
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .daddr = *daddr,
+ };
+ struct dst_entry *dst;
+ int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
+
+ if (saddr) {
+ memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+ }
+
+ dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
+ if (dst->error == 0)
+ return (struct rt6_info *) dst;
+
+ dst_release(dst);
+
+ return NULL;
+}
+EXPORT_SYMBOL(rt6_lookup);
+
+/* ip6_ins_rt is called with FREE table->tb6_lock.
+ It takes new route entry, the addition fails by any reason the
+ route is freed. In any case, if caller does not hold it, it may
+ be destroyed.
+ */
+
+static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
+ struct mx6_config *mxc)
+{
+ int err;
+ struct fib6_table *table;
+
+ table = rt->rt6i_table;
+ write_lock_bh(&table->tb6_lock);
+ err = fib6_add(&table->tb6_root, rt, info, mxc);
+ write_unlock_bh(&table->tb6_lock);
+
+ return err;
+}
+
+int ip6_ins_rt(struct rt6_info *rt)
+{
+ struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
+ struct mx6_config mxc = { .mx = NULL, };
+
+ return __ip6_ins_rt(rt, &info, &mxc);
+}
+
+static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
+{
+ struct rt6_info *rt;
+
+ /*
+ * Clone the route.
+ */
+
+ rt = ip6_rt_copy(ort, daddr);
+
+ if (rt) {
+ if (ort->rt6i_dst.plen != 128 &&
+ ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+ rt->rt6i_flags |= RTF_ANYCAST;
+
+ rt->rt6i_flags |= RTF_CACHE;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (rt->rt6i_src.plen && saddr) {
+ rt->rt6i_src.addr = *saddr;
+ rt->rt6i_src.plen = 128;
+ }
+#endif
+ }
+
+ return rt;
+}
+
+static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
+ const struct in6_addr *daddr)
+{
+ struct rt6_info *rt = ip6_rt_copy(ort, daddr);
+
+ if (rt)
+ rt->rt6i_flags |= RTF_CACHE;
+ return rt;
+}
+
+static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
+ struct flowi6 *fl6, int flags)
+{
+ struct fib6_node *fn, *saved_fn;
+ struct rt6_info *rt, *nrt;
+ int strict = 0;
+ int attempts = 3;
+ int err;
+
+ strict |= flags & RT6_LOOKUP_F_IFACE;
+ if (net->ipv6.devconf_all->forwarding == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
+
+redo_fib6_lookup_lock:
+ read_lock_bh(&table->tb6_lock);
+
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ saved_fn = fn;
+
+redo_rt6_select:
+ rt = rt6_select(fn, oif, strict);
+ if (rt->rt6i_nsiblings)
+ rt = rt6_multipath_select(rt, fl6, oif, strict);
+ if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto redo_rt6_select;
+ else if (strict & RT6_LOOKUP_F_REACHABLE) {
+ /* also consider unreachable route */
+ strict &= ~RT6_LOOKUP_F_REACHABLE;
+ fn = saved_fn;
+ goto redo_rt6_select;
+ } else {
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ goto out2;
+ }
+ }
+
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+
+ if (rt->rt6i_flags & RTF_CACHE)
+ goto out2;
+
+ if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
+ nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
+ else if (!(rt->dst.flags & DST_HOST))
+ nrt = rt6_alloc_clone(rt, &fl6->daddr);
+ else
+ goto out2;
+
+ ip6_rt_put(rt);
+ rt = nrt ? : net->ipv6.ip6_null_entry;
+
+ dst_hold(&rt->dst);
+ if (nrt) {
+ err = ip6_ins_rt(nrt);
+ if (!err)
+ goto out2;
+ }
+
+ if (--attempts <= 0)
+ goto out2;
+
+ /*
+ * Race condition! In the gap, when table->tb6_lock was
+ * released someone could insert this route. Relookup.
+ */
+ ip6_rt_put(rt);
+ goto redo_fib6_lookup_lock;
+
+out2:
+ rt->dst.lastuse = jiffies;
+ rt->dst.__use++;
+
+ return rt;
+}
+
+static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
+ struct flowi6 *fl6, int flags)
+{
+ return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
+}
+
+static struct dst_entry *ip6_route_input_lookup(struct net *net,
+ struct net_device *dev,
+ struct flowi6 *fl6, int flags)
+{
+ if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
+ flags |= RT6_LOOKUP_F_IFACE;
+
+ return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
+}
+
+void ip6_route_input(struct sk_buff *skb)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_mark = skb->mark,
+ .flowi6_proto = iph->nexthdr,
+ };
+
+ skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
+}
+
+static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
+ struct flowi6 *fl6, int flags)
+{
+ return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
+}
+
+struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6)
+{
+ int flags = 0;
+
+ fl6->flowi6_iif = LOOPBACK_IFINDEX;
+
+ if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
+ flags |= RT6_LOOKUP_F_IFACE;
+
+ if (!ipv6_addr_any(&fl6->saddr))
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+ else if (sk)
+ flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+
+ return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
+}
+EXPORT_SYMBOL(ip6_route_output);
+
+struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+ struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
+ struct dst_entry *new = NULL;
+
+ rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
+ if (rt) {
+ new = &rt->dst;
+
+ memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
+ rt6_init_peer(rt, net->ipv6.peers);
+
+ new->__use = 1;
+ new->input = dst_discard;
+ new->output = dst_discard_sk;
+
+ if (dst_metrics_read_only(&ort->dst))
+ new->_metrics = ort->dst._metrics;
+ else
+ dst_copy_metrics(new, &ort->dst);
+ rt->rt6i_idev = ort->rt6i_idev;
+ if (rt->rt6i_idev)
+ in6_dev_hold(rt->rt6i_idev);
+
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ rt->rt6i_flags = ort->rt6i_flags;
+ rt->rt6i_metric = 0;
+
+ memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+ memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+
+ dst_free(new);
+ }
+
+ dst_release(dst_orig);
+ return new ? new : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Destination cache support functions
+ */
+
+static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ struct rt6_info *rt;
+
+ rt = (struct rt6_info *) dst;
+
+ /* All IPV6 dsts are created with ->obsolete set to the value
+ * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+ * into this function always.
+ */
+ if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
+ return NULL;
+
+ if (rt6_check_expired(rt))
+ return NULL;
+
+ return dst;
+}
+
+static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
+{
+ struct rt6_info *rt = (struct rt6_info *) dst;
+
+ if (rt) {
+ if (rt->rt6i_flags & RTF_CACHE) {
+ if (rt6_check_expired(rt)) {
+ ip6_del_rt(rt);
+ dst = NULL;
+ }
+ } else {
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ return dst;
+}
+
+static void ip6_link_failure(struct sk_buff *skb)
+{
+ struct rt6_info *rt;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
+
+ rt = (struct rt6_info *) skb_dst(skb);
+ if (rt) {
+ if (rt->rt6i_flags & RTF_CACHE) {
+ dst_hold(&rt->dst);
+ if (ip6_del_rt(rt))
+ dst_free(&rt->dst);
+ } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
+ rt->rt6i_node->fn_sernum = -1;
+ }
+ }
+}
+
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct rt6_info *rt6 = (struct rt6_info *)dst;
+
+ dst_confirm(dst);
+ if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
+ struct net *net = dev_net(dst->dev);
+
+ rt6->rt6i_flags |= RTF_MODIFIED;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ dst_metric_set(dst, RTAX_MTU, mtu);
+ rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
+ }
+}
+
+void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
+ int oif, u32 mark)
+{
+ const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
+ fl6.daddr = iph->daddr;
+ fl6.saddr = iph->saddr;
+ fl6.flowlabel = ip6_flowinfo(iph);
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (!dst->error)
+ ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
+ dst_release(dst);
+}
+EXPORT_SYMBOL_GPL(ip6_update_pmtu);
+
+void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
+{
+ ip6_update_pmtu(skb, sock_net(sk), mtu,
+ sk->sk_bound_dev_if, sk->sk_mark);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
+
+/* Handle redirects */
+struct ip6rd_flowi {
+ struct flowi6 fl6;
+ struct in6_addr gateway;
+};
+
+static struct rt6_info *__ip6_route_redirect(struct net *net,
+ struct fib6_table *table,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
+ struct rt6_info *rt;
+ struct fib6_node *fn;
+
+ /* Get the "current" route for this destination and
+ * check if the redirect has come from approriate router.
+ *
+ * RFC 4861 specifies that redirects should only be
+ * accepted if they come from the nexthop to the target.
+ * Due to the way the routes are chosen, this notion
+ * is a bit fuzzy and one might need to check all possible
+ * routes.
+ */
+
+ read_lock_bh(&table->tb6_lock);
+ fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:
+ for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if (rt6_check_expired(rt))
+ continue;
+ if (rt->dst.error)
+ break;
+ if (!(rt->rt6i_flags & RTF_GATEWAY))
+ continue;
+ if (fl6->flowi6_oif != rt->dst.dev->ifindex)
+ continue;
+ if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+ continue;
+ break;
+ }
+
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ else if (rt->dst.error) {
+ rt = net->ipv6.ip6_null_entry;
+ goto out;
+ }
+
+ if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
+ }
+
+out:
+ dst_hold(&rt->dst);
+
+ read_unlock_bh(&table->tb6_lock);
+
+ return rt;
+};
+
+static struct dst_entry *ip6_route_redirect(struct net *net,
+ const struct flowi6 *fl6,
+ const struct in6_addr *gateway)
+{
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct ip6rd_flowi rdfl;
+
+ rdfl.fl6 = *fl6;
+ rdfl.gateway = *gateway;
+
+ return fib6_rule_lookup(net, &rdfl.fl6,
+ flags, __ip6_route_redirect);
+}
+
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
+{
+ const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_iif = LOOPBACK_IFINDEX;
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = mark;
+ fl6.daddr = iph->daddr;
+ fl6.saddr = iph->saddr;
+ fl6.flowlabel = ip6_flowinfo(iph);
+
+ dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
+ rt6_do_redirect(dst, NULL, skb);
+ dst_release(dst);
+}
+EXPORT_SYMBOL_GPL(ip6_redirect);
+
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
+ u32 mark)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_iif = LOOPBACK_IFINDEX;
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = mark;
+ fl6.daddr = msg->dest;
+ fl6.saddr = iph->daddr;
+
+ dst = ip6_route_redirect(net, &fl6, &iph->saddr);
+ rt6_do_redirect(dst, NULL, skb);
+ dst_release(dst);
+}
+
+void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
+{
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_redirect);
+
+static unsigned int ip6_default_advmss(const struct dst_entry *dst)
+{
+ struct net_device *dev = dst->dev;
+ unsigned int mtu = dst_mtu(dst);
+ struct net *net = dev_net(dev);
+
+ mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+
+ if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
+ mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+
+ /*
+ * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
+ * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
+ * IPV6_MAXPLEN is also valid and means: "any MSS,
+ * rely only on pmtu discovery"
+ */
+ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
+ mtu = IPV6_MAXPLEN;
+ return mtu;
+}
+
+static unsigned int ip6_mtu(const struct dst_entry *dst)
+{
+ struct inet6_dev *idev;
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ if (mtu)
+ goto out;
+
+ mtu = IPV6_MIN_MTU;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ if (idev)
+ mtu = idev->cnf.mtu6;
+ rcu_read_unlock();
+
+out:
+ return min_t(unsigned int, mtu, IP6_MAX_MTU);
+}
+
+static struct dst_entry *icmp6_dst_gc_list;
+static DEFINE_SPINLOCK(icmp6_dst_lock);
+
+struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
+ struct flowi6 *fl6)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt;
+ struct inet6_dev *idev = in6_dev_get(dev);
+ struct net *net = dev_net(dev);
+
+ if (unlikely(!idev))
+ return ERR_PTR(-ENODEV);
+
+ rt = ip6_dst_alloc(net, dev, 0, NULL);
+ if (unlikely(!rt)) {
+ in6_dev_put(idev);
+ dst = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ rt->dst.flags |= DST_HOST;
+ rt->dst.output = ip6_output;
+ atomic_set(&rt->dst.__refcnt, 1);
+ rt->rt6i_gateway = fl6->daddr;
+ rt->rt6i_dst.addr = fl6->daddr;
+ rt->rt6i_dst.plen = 128;
+ rt->rt6i_idev = idev;
+ dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
+
+ spin_lock_bh(&icmp6_dst_lock);
+ rt->dst.next = icmp6_dst_gc_list;
+ icmp6_dst_gc_list = &rt->dst;
+ spin_unlock_bh(&icmp6_dst_lock);
+
+ fib6_force_start_gc(net);
+
+ dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
+
+out:
+ return dst;
+}
+
+int icmp6_dst_gc(void)
+{
+ struct dst_entry *dst, **pprev;
+ int more = 0;
+
+ spin_lock_bh(&icmp6_dst_lock);
+ pprev = &icmp6_dst_gc_list;
+
+ while ((dst = *pprev) != NULL) {
+ if (!atomic_read(&dst->__refcnt)) {
+ *pprev = dst->next;
+ dst_free(dst);
+ } else {
+ pprev = &dst->next;
+ ++more;
+ }
+ }
+
+ spin_unlock_bh(&icmp6_dst_lock);
+
+ return more;
+}
+
+static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
+ void *arg)
+{
+ struct dst_entry *dst, **pprev;
+
+ spin_lock_bh(&icmp6_dst_lock);
+ pprev = &icmp6_dst_gc_list;
+ while ((dst = *pprev) != NULL) {
+ struct rt6_info *rt = (struct rt6_info *) dst;
+ if (func(rt, arg)) {
+ *pprev = dst->next;
+ dst_free(dst);
+ } else {
+ pprev = &dst->next;
+ }
+ }
+ spin_unlock_bh(&icmp6_dst_lock);
+}
+
+static int ip6_dst_gc(struct dst_ops *ops)
+{
+ struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
+ int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
+ int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
+ unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ int entries;
+
+ entries = dst_entries_get_fast(ops);
+ if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
+ entries <= rt_max_size)
+ goto out;
+
+ net->ipv6.ip6_rt_gc_expire++;
+ fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
+ entries = dst_entries_get_slow(ops);
+ if (entries < ops->gc_thresh)
+ net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
+out:
+ net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
+ return entries > rt_max_size;
+}
+
+static int ip6_convert_metrics(struct mx6_config *mxc,
+ const struct fib6_config *cfg)
+{
+ struct nlattr *nla;
+ int remaining;
+ u32 *mp;
+
+ if (!cfg->fc_mx)
+ return 0;
+
+ mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (unlikely(!mp))
+ return -ENOMEM;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+
+ if (type) {
+ u32 val;
+
+ if (unlikely(type > RTAX_MAX))
+ goto err;
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp);
+ if (val == TCP_CA_UNSPEC)
+ goto err;
+ } else {
+ val = nla_get_u32(nla);
+ }
+
+ mp[type - 1] = val;
+ __set_bit(type - 1, mxc->mx_valid);
+ }
+ }
+
+ mxc->mx = mp;
+
+ return 0;
+ err:
+ kfree(mp);
+ return -EINVAL;
+}
+
+int ip6_route_add(struct fib6_config *cfg)
+{
+ int err;
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct rt6_info *rt = NULL;
+ struct net_device *dev = NULL;
+ struct inet6_dev *idev = NULL;
+ struct fib6_table *table;
+ struct mx6_config mxc = { .mx = NULL, };
+ int addr_type;
+
+ if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
+ return -EINVAL;
+#ifndef CONFIG_IPV6_SUBTREES
+ if (cfg->fc_src_len)
+ return -EINVAL;
+#endif
+ if (cfg->fc_ifindex) {
+ err = -ENODEV;
+ dev = dev_get_by_index(net, cfg->fc_ifindex);
+ if (!dev)
+ goto out;
+ idev = in6_dev_get(dev);
+ if (!idev)
+ goto out;
+ }
+
+ if (cfg->fc_metric == 0)
+ cfg->fc_metric = IP6_RT_PRIO_USER;
+
+ err = -ENOBUFS;
+ if (cfg->fc_nlinfo.nlh &&
+ !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
+ table = fib6_get_table(net, cfg->fc_table);
+ if (!table) {
+ pr_warn("NLM_F_CREATE should be specified when creating new route\n");
+ table = fib6_new_table(net, cfg->fc_table);
+ }
+ } else {
+ table = fib6_new_table(net, cfg->fc_table);
+ }
+
+ if (!table)
+ goto out;
+
+ rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
+
+ if (!rt) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (cfg->fc_flags & RTF_EXPIRES)
+ rt6_set_expires(rt, jiffies +
+ clock_t_to_jiffies(cfg->fc_expires));
+ else
+ rt6_clean_expires(rt);
+
+ if (cfg->fc_protocol == RTPROT_UNSPEC)
+ cfg->fc_protocol = RTPROT_BOOT;
+ rt->rt6i_protocol = cfg->fc_protocol;
+
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ rt->dst.input = ip6_mc_input;
+ else if (cfg->fc_flags & RTF_LOCAL)
+ rt->dst.input = ip6_input;
+ else
+ rt->dst.input = ip6_forward;
+
+ rt->dst.output = ip6_output;
+
+ ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+ rt->rt6i_dst.plen = cfg->fc_dst_len;
+ if (rt->rt6i_dst.plen == 128) {
+ rt->dst.flags |= DST_HOST;
+ dst_metrics_set_force_overwrite(&rt->dst);
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
+ rt->rt6i_src.plen = cfg->fc_src_len;
+#endif
+
+ rt->rt6i_metric = cfg->fc_metric;
+
+ /* We cannot add true routes via loopback here,
+ they would result in kernel looping; promote them to reject routes
+ */
+ if ((cfg->fc_flags & RTF_REJECT) ||
+ (dev && (dev->flags & IFF_LOOPBACK) &&
+ !(addr_type & IPV6_ADDR_LOOPBACK) &&
+ !(cfg->fc_flags & RTF_LOCAL))) {
+ /* hold loopback dev/idev if we haven't done so. */
+ if (dev != net->loopback_dev) {
+ if (dev) {
+ dev_put(dev);
+ in6_dev_put(idev);
+ }
+ dev = net->loopback_dev;
+ dev_hold(dev);
+ idev = in6_dev_get(dev);
+ if (!idev) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+ rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+ switch (cfg->fc_type) {
+ case RTN_BLACKHOLE:
+ rt->dst.error = -EINVAL;
+ rt->dst.output = dst_discard_sk;
+ rt->dst.input = dst_discard;
+ break;
+ case RTN_PROHIBIT:
+ rt->dst.error = -EACCES;
+ rt->dst.output = ip6_pkt_prohibit_out;
+ rt->dst.input = ip6_pkt_prohibit;
+ break;
+ case RTN_THROW:
+ default:
+ rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
+ : -ENETUNREACH;
+ rt->dst.output = ip6_pkt_discard_out;
+ rt->dst.input = ip6_pkt_discard;
+ break;
+ }
+ goto install_route;
+ }
+
+ if (cfg->fc_flags & RTF_GATEWAY) {
+ const struct in6_addr *gw_addr;
+ int gwa_type;
+
+ gw_addr = &cfg->fc_gateway;
+ rt->rt6i_gateway = *gw_addr;
+ gwa_type = ipv6_addr_type(gw_addr);
+
+ if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
+ struct rt6_info *grt;
+
+ /* IPv6 strictly inhibits using not link-local
+ addresses as nexthop address.
+ Otherwise, router will not able to send redirects.
+ It is very good, but in some (rare!) circumstances
+ (SIT, PtP, NBMA NOARP links) it is handy to allow
+ some exceptions. --ANK
+ */
+ err = -EINVAL;
+ if (!(gwa_type & IPV6_ADDR_UNICAST))
+ goto out;
+
+ grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+
+ err = -EHOSTUNREACH;
+ if (!grt)
+ goto out;
+ if (dev) {
+ if (dev != grt->dst.dev) {
+ ip6_rt_put(grt);
+ goto out;
+ }
+ } else {
+ dev = grt->dst.dev;
+ idev = grt->rt6i_idev;
+ dev_hold(dev);
+ in6_dev_hold(grt->rt6i_idev);
+ }
+ if (!(grt->rt6i_flags & RTF_GATEWAY))
+ err = 0;
+ ip6_rt_put(grt);
+
+ if (err)
+ goto out;
+ }
+ err = -EINVAL;
+ if (!dev || (dev->flags & IFF_LOOPBACK))
+ goto out;
+ }
+
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+ if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
+ err = -EINVAL;
+ goto out;
+ }
+ rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
+ rt->rt6i_prefsrc.plen = 128;
+ } else
+ rt->rt6i_prefsrc.plen = 0;
+
+ rt->rt6i_flags = cfg->fc_flags;
+
+install_route:
+ rt->dst.dev = dev;
+ rt->rt6i_idev = idev;
+ rt->rt6i_table = table;
+
+ cfg->fc_nlinfo.nl_net = dev_net(dev);
+
+ err = ip6_convert_metrics(&mxc, cfg);
+ if (err)
+ goto out;
+
+ err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
+
+ kfree(mxc.mx);
+ return err;
+out:
+ if (dev)
+ dev_put(dev);
+ if (idev)
+ in6_dev_put(idev);
+ if (rt)
+ dst_free(&rt->dst);
+ return err;
+}
+
+static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
+{
+ int err;
+ struct fib6_table *table;
+ struct net *net = dev_net(rt->dst.dev);
+
+ if (rt == net->ipv6.ip6_null_entry) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ table = rt->rt6i_table;
+ write_lock_bh(&table->tb6_lock);
+ err = fib6_del(rt, info);
+ write_unlock_bh(&table->tb6_lock);
+
+out:
+ ip6_rt_put(rt);
+ return err;
+}
+
+int ip6_del_rt(struct rt6_info *rt)
+{
+ struct nl_info info = {
+ .nl_net = dev_net(rt->dst.dev),
+ };
+ return __ip6_del_rt(rt, &info);
+}
+
+static int ip6_route_del(struct fib6_config *cfg)
+{
+ struct fib6_table *table;
+ struct fib6_node *fn;
+ struct rt6_info *rt;
+ int err = -ESRCH;
+
+ table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
+ if (!table)
+ return err;
+
+ read_lock_bh(&table->tb6_lock);
+
+ fn = fib6_locate(&table->tb6_root,
+ &cfg->fc_dst, cfg->fc_dst_len,
+ &cfg->fc_src, cfg->fc_src_len);
+
+ if (fn) {
+ for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if (cfg->fc_ifindex &&
+ (!rt->dst.dev ||
+ rt->dst.dev->ifindex != cfg->fc_ifindex))
+ continue;
+ if (cfg->fc_flags & RTF_GATEWAY &&
+ !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+ continue;
+ if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+ continue;
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ }
+ }
+ read_unlock_bh(&table->tb6_lock);
+
+ return err;
+}
+
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct netevent_redirect netevent;
+ struct rt6_info *rt, *nrt = NULL;
+ struct ndisc_options ndopts;
+ struct inet6_dev *in6_dev;
+ struct neighbour *neigh;
+ struct rd_msg *msg;
+ int optlen, on_link;
+ u8 *lladdr;
+
+ optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
+ optlen -= sizeof(*msg);
+
+ if (optlen < 0) {
+ net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
+ return;
+ }
+
+ msg = (struct rd_msg *)icmp6_hdr(skb);
+
+ if (ipv6_addr_is_multicast(&msg->dest)) {
+ net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
+ return;
+ }
+
+ on_link = 0;
+ if (ipv6_addr_equal(&msg->dest, &msg->target)) {
+ on_link = 1;
+ } else if (ipv6_addr_type(&msg->target) !=
+ (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+ net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
+ return;
+ }
+
+ in6_dev = __in6_dev_get(skb->dev);
+ if (!in6_dev)
+ return;
+ if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
+ return;
+
+ /* RFC2461 8.1:
+ * The IP source address of the Redirect MUST be the same as the current
+ * first-hop router for the specified ICMP Destination Address.
+ */
+
+ if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
+ net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
+ return;
+ }
+
+ lladdr = NULL;
+ if (ndopts.nd_opts_tgt_lladdr) {
+ lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
+ skb->dev);
+ if (!lladdr) {
+ net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
+ return;
+ }
+ }
+
+ rt = (struct rt6_info *) dst;
+ if (rt == net->ipv6.ip6_null_entry) {
+ net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
+ return;
+ }
+
+ /* Redirect received -> path was valid.
+ * Look, redirects are sent only in response to data packets,
+ * so that this nexthop apparently is reachable. --ANK
+ */
+ dst_confirm(&rt->dst);
+
+ neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
+ if (!neigh)
+ return;
+
+ /*
+ * We have finally decided to accept it.
+ */
+
+ neigh_update(neigh, lladdr, NUD_STALE,
+ NEIGH_UPDATE_F_WEAK_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE|
+ (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+ NEIGH_UPDATE_F_ISROUTER))
+ );
+
+ nrt = ip6_rt_copy(rt, &msg->dest);
+ if (!nrt)
+ goto out;
+
+ nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
+ if (on_link)
+ nrt->rt6i_flags &= ~RTF_GATEWAY;
+
+ nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
+
+ if (ip6_ins_rt(nrt))
+ goto out;
+
+ netevent.old = &rt->dst;
+ netevent.new = &nrt->dst;
+ netevent.daddr = &msg->dest;
+ netevent.neigh = neigh;
+ call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
+
+ if (rt->rt6i_flags & RTF_CACHE) {
+ rt = (struct rt6_info *) dst_clone(&rt->dst);
+ ip6_del_rt(rt);
+ }
+
+out:
+ neigh_release(neigh);
+}
+
+/*
+ * Misc support functions
+ */
+
+static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
+ const struct in6_addr *dest)
+{
+ struct net *net = dev_net(ort->dst.dev);
+ struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
+ ort->rt6i_table);
+
+ if (rt) {
+ rt->dst.input = ort->dst.input;
+ rt->dst.output = ort->dst.output;
+ rt->dst.flags |= DST_HOST;
+
+ rt->rt6i_dst.addr = *dest;
+ rt->rt6i_dst.plen = 128;
+ dst_copy_metrics(&rt->dst, &ort->dst);
+ rt->dst.error = ort->dst.error;
+ rt->rt6i_idev = ort->rt6i_idev;
+ if (rt->rt6i_idev)
+ in6_dev_hold(rt->rt6i_idev);
+ rt->dst.lastuse = jiffies;
+
+ if (ort->rt6i_flags & RTF_GATEWAY)
+ rt->rt6i_gateway = ort->rt6i_gateway;
+ else
+ rt->rt6i_gateway = *dest;
+ rt->rt6i_flags = ort->rt6i_flags;
+ rt6_set_from(rt, ort);
+ rt->rt6i_metric = 0;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+ memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
+ rt->rt6i_table = ort->rt6i_table;
+ }
+ return rt;
+}
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+static struct rt6_info *rt6_get_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr, int ifindex)
+{
+ struct fib6_node *fn;
+ struct rt6_info *rt = NULL;
+ struct fib6_table *table;
+
+ table = fib6_get_table(net, RT6_TABLE_INFO);
+ if (!table)
+ return NULL;
+
+ read_lock_bh(&table->tb6_lock);
+ fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
+ if (!fn)
+ goto out;
+
+ for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+ if (rt->dst.dev->ifindex != ifindex)
+ continue;
+ if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+ continue;
+ if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
+ continue;
+ dst_hold(&rt->dst);
+ break;
+ }
+out:
+ read_unlock_bh(&table->tb6_lock);
+ return rt;
+}
+
+static struct rt6_info *rt6_add_route_info(struct net *net,
+ const struct in6_addr *prefix, int prefixlen,
+ const struct in6_addr *gwaddr, int ifindex,
+ unsigned int pref)
+{
+ struct fib6_config cfg = {
+ .fc_table = RT6_TABLE_INFO,
+ .fc_metric = IP6_RT_PRIO_USER,
+ .fc_ifindex = ifindex,
+ .fc_dst_len = prefixlen,
+ .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
+ RTF_UP | RTF_PREF(pref),
+ .fc_nlinfo.portid = 0,
+ .fc_nlinfo.nlh = NULL,
+ .fc_nlinfo.nl_net = net,
+ };
+
+ cfg.fc_dst = *prefix;
+ cfg.fc_gateway = *gwaddr;
+
+ /* We should treat it as a default route if prefix length is 0. */
+ if (!prefixlen)
+ cfg.fc_flags |= RTF_DEFAULT;
+
+ ip6_route_add(&cfg);
+
+ return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
+}
+#endif
+
+struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
+{
+ struct rt6_info *rt;
+ struct fib6_table *table;
+
+ table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
+ if (!table)
+ return NULL;
+
+ read_lock_bh(&table->tb6_lock);
+ for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+ if (dev == rt->dst.dev &&
+ ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+ ipv6_addr_equal(&rt->rt6i_gateway, addr))
+ break;
+ }
+ if (rt)
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ return rt;
+}
+
+struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
+ struct net_device *dev,
+ unsigned int pref)
+{
+ struct fib6_config cfg = {
+ .fc_table = RT6_TABLE_DFLT,
+ .fc_metric = IP6_RT_PRIO_USER,
+ .fc_ifindex = dev->ifindex,
+ .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
+ RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
+ .fc_nlinfo.portid = 0,
+ .fc_nlinfo.nlh = NULL,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ };
+
+ cfg.fc_gateway = *gwaddr;
+
+ ip6_route_add(&cfg);
+
+ return rt6_get_dflt_router(gwaddr, dev);
+}
+
+void rt6_purge_dflt_routers(struct net *net)
+{
+ struct rt6_info *rt;
+ struct fib6_table *table;
+
+ /* NOTE: Keep consistent with rt6_get_dflt_router */
+ table = fib6_get_table(net, RT6_TABLE_DFLT);
+ if (!table)
+ return;
+
+restart:
+ read_lock_bh(&table->tb6_lock);
+ for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+ if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+ (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ ip6_del_rt(rt);
+ goto restart;
+ }
+ }
+ read_unlock_bh(&table->tb6_lock);
+}
+
+static void rtmsg_to_fib6_config(struct net *net,
+ struct in6_rtmsg *rtmsg,
+ struct fib6_config *cfg)
+{
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->fc_table = RT6_TABLE_MAIN;
+ cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
+ cfg->fc_metric = rtmsg->rtmsg_metric;
+ cfg->fc_expires = rtmsg->rtmsg_info;
+ cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
+ cfg->fc_src_len = rtmsg->rtmsg_src_len;
+ cfg->fc_flags = rtmsg->rtmsg_flags;
+
+ cfg->fc_nlinfo.nl_net = net;
+
+ cfg->fc_dst = rtmsg->rtmsg_dst;
+ cfg->fc_src = rtmsg->rtmsg_src;
+ cfg->fc_gateway = rtmsg->rtmsg_gateway;
+}
+
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ struct fib6_config cfg;
+ struct in6_rtmsg rtmsg;
+ int err;
+
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ err = copy_from_user(&rtmsg, arg,
+ sizeof(struct in6_rtmsg));
+ if (err)
+ return -EFAULT;
+
+ rtmsg_to_fib6_config(net, &rtmsg, &cfg);
+
+ rtnl_lock();
+ switch (cmd) {
+ case SIOCADDRT:
+ err = ip6_route_add(&cfg);
+ break;
+ case SIOCDELRT:
+ err = ip6_route_del(&cfg);
+ break;
+ default:
+ err = -EINVAL;
+ }
+ rtnl_unlock();
+
+ return err;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Drop the packet on the floor
+ */
+
+static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
+{
+ int type;
+ struct dst_entry *dst = skb_dst(skb);
+ switch (ipstats_mib_noroutes) {
+ case IPSTATS_MIB_INNOROUTES:
+ type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
+ if (type == IPV6_ADDR_ANY) {
+ IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ IPSTATS_MIB_INADDRERRORS);
+ break;
+ }
+ /* FALLTHROUGH */
+ case IPSTATS_MIB_OUTNOROUTES:
+ IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+ ipstats_mib_noroutes);
+ break;
+ }
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
+ kfree_skb(skb);
+ return 0;
+}
+
+static int ip6_pkt_discard(struct sk_buff *skb)
+{
+ return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
+}
+
+static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
+{
+ skb->dev = skb_dst(skb)->dev;
+ return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
+}
+
+static int ip6_pkt_prohibit(struct sk_buff *skb)
+{
+ return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
+}
+
+static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
+{
+ skb->dev = skb_dst(skb)->dev;
+ return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
+}
+
+/*
+ * Allocate a dst for local (unicast / anycast) address.
+ */
+
+struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
+ const struct in6_addr *addr,
+ bool anycast)
+{
+ struct net *net = dev_net(idev->dev);
+ struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
+ DST_NOCOUNT, NULL);
+ if (!rt)
+ return ERR_PTR(-ENOMEM);
+
+ in6_dev_hold(idev);
+
+ rt->dst.flags |= DST_HOST;
+ rt->dst.input = ip6_input;
+ rt->dst.output = ip6_output;
+ rt->rt6i_idev = idev;
+
+ rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
+ if (anycast)
+ rt->rt6i_flags |= RTF_ANYCAST;
+ else
+ rt->rt6i_flags |= RTF_LOCAL;
+
+ rt->rt6i_gateway = *addr;
+ rt->rt6i_dst.addr = *addr;
+ rt->rt6i_dst.plen = 128;
+ rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+
+ atomic_set(&rt->dst.__refcnt, 1);
+
+ return rt;
+}
+
+int ip6_route_get_saddr(struct net *net,
+ struct rt6_info *rt,
+ const struct in6_addr *daddr,
+ unsigned int prefs,
+ struct in6_addr *saddr)
+{
+ struct inet6_dev *idev =
+ rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
+ int err = 0;
+ if (rt && rt->rt6i_prefsrc.plen)
+ *saddr = rt->rt6i_prefsrc.addr;
+ else
+ err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
+ daddr, prefs, saddr);
+ return err;
+}
+
+/* remove deleted ip from prefsrc entries */
+struct arg_dev_net_ip {
+ struct net_device *dev;
+ struct net *net;
+ struct in6_addr *addr;
+};
+
+static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
+{
+ struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
+ struct net *net = ((struct arg_dev_net_ip *)arg)->net;
+ struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
+
+ if (((void *)rt->dst.dev == dev || !dev) &&
+ rt != net->ipv6.ip6_null_entry &&
+ ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+ /* remove prefsrc entry */
+ rt->rt6i_prefsrc.plen = 0;
+ }
+ return 0;
+}
+
+void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
+{
+ struct net *net = dev_net(ifp->idev->dev);
+ struct arg_dev_net_ip adni = {
+ .dev = ifp->idev->dev,
+ .net = net,
+ .addr = &ifp->addr,
+ };
+ fib6_clean_all(net, fib6_remove_prefsrc, &adni);
+}
+
+#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
+#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
+
+/* Remove routers and update dst entries when gateway turn into host. */
+static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
+{
+ struct in6_addr *gateway = (struct in6_addr *)arg;
+
+ if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
+ ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
+ ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+ return -1;
+ }
+ return 0;
+}
+
+void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
+{
+ fib6_clean_all(net, fib6_clean_tohost, gateway);
+}
+
+struct arg_dev_net {
+ struct net_device *dev;
+ struct net *net;
+};
+
+static int fib6_ifdown(struct rt6_info *rt, void *arg)
+{
+ const struct arg_dev_net *adn = arg;
+ const struct net_device *dev = adn->dev;
+
+ if ((rt->dst.dev == dev || !dev) &&
+ rt != adn->net->ipv6.ip6_null_entry)
+ return -1;
+
+ return 0;
+}
+
+void rt6_ifdown(struct net *net, struct net_device *dev)
+{
+ struct arg_dev_net adn = {
+ .dev = dev,
+ .net = net,
+ };
+
+ fib6_clean_all(net, fib6_ifdown, &adn);
+ icmp6_clean_all(fib6_ifdown, &adn);
+}
+
+struct rt6_mtu_change_arg {
+ struct net_device *dev;
+ unsigned int mtu;
+};
+
+static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
+ struct inet6_dev *idev;
+
+ /* In IPv6 pmtu discovery is not optional,
+ so that RTAX_MTU lock cannot disable it.
+ We still use this lock to block changes
+ caused by addrconf/ndisc.
+ */
+
+ idev = __in6_dev_get(arg->dev);
+ if (!idev)
+ return 0;
+
+ /* For administrative MTU increase, there is no way to discover
+ IPv6 PMTU increase, so PMTU increase should be updated here.
+ Since RFC 1981 doesn't include administrative MTU increase
+ update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ /*
+ If new MTU is less than route PMTU, this new MTU will be the
+ lowest MTU in the path, update the route PMTU to reflect PMTU
+ decreases; if new MTU is greater than route PMTU, and the
+ old MTU is the lowest MTU in the path, update the route PMTU
+ to reflect the increase. In this case if the other nodes' MTU
+ also have the lowest MTU, TOO BIG MESSAGE will be lead to
+ PMTU discouvery.
+ */
+ if (rt->dst.dev == arg->dev &&
+ !dst_metric_locked(&rt->dst, RTAX_MTU) &&
+ (dst_mtu(&rt->dst) >= arg->mtu ||
+ (dst_mtu(&rt->dst) < arg->mtu &&
+ dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
+ dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+ }
+ return 0;
+}
+
+void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
+{
+ struct rt6_mtu_change_arg arg = {
+ .dev = dev,
+ .mtu = mtu,
+ };
+
+ fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
+}
+
+static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
+ [RTA_OIF] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_PRIORITY] = { .type = NLA_U32 },
+ [RTA_METRICS] = { .type = NLA_NESTED },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+ [RTA_PREF] = { .type = NLA_U8 },
+};
+
+static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct fib6_config *cfg)
+{
+ struct rtmsg *rtm;
+ struct nlattr *tb[RTA_MAX+1];
+ unsigned int pref;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ rtm = nlmsg_data(nlh);
+ memset(cfg, 0, sizeof(*cfg));
+
+ cfg->fc_table = rtm->rtm_table;
+ cfg->fc_dst_len = rtm->rtm_dst_len;
+ cfg->fc_src_len = rtm->rtm_src_len;
+ cfg->fc_flags = RTF_UP;
+ cfg->fc_protocol = rtm->rtm_protocol;
+ cfg->fc_type = rtm->rtm_type;
+
+ if (rtm->rtm_type == RTN_UNREACHABLE ||
+ rtm->rtm_type == RTN_BLACKHOLE ||
+ rtm->rtm_type == RTN_PROHIBIT ||
+ rtm->rtm_type == RTN_THROW)
+ cfg->fc_flags |= RTF_REJECT;
+
+ if (rtm->rtm_type == RTN_LOCAL)
+ cfg->fc_flags |= RTF_LOCAL;
+
+ cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->fc_nlinfo.nlh = nlh;
+ cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
+
+ if (tb[RTA_GATEWAY]) {
+ cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
+ cfg->fc_flags |= RTF_GATEWAY;
+ }
+
+ if (tb[RTA_DST]) {
+ int plen = (rtm->rtm_dst_len + 7) >> 3;
+
+ if (nla_len(tb[RTA_DST]) < plen)
+ goto errout;
+
+ nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
+ }
+
+ if (tb[RTA_SRC]) {
+ int plen = (rtm->rtm_src_len + 7) >> 3;
+
+ if (nla_len(tb[RTA_SRC]) < plen)
+ goto errout;
+
+ nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
+ }
+
+ if (tb[RTA_PREFSRC])
+ cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
+
+ if (tb[RTA_OIF])
+ cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
+
+ if (tb[RTA_PRIORITY])
+ cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
+
+ if (tb[RTA_METRICS]) {
+ cfg->fc_mx = nla_data(tb[RTA_METRICS]);
+ cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
+ }
+
+ if (tb[RTA_TABLE])
+ cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
+
+ if (tb[RTA_MULTIPATH]) {
+ cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+ cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+ }
+
+ if (tb[RTA_PREF]) {
+ pref = nla_get_u8(tb[RTA_PREF]);
+ if (pref != ICMPV6_ROUTER_PREF_LOW &&
+ pref != ICMPV6_ROUTER_PREF_HIGH)
+ pref = ICMPV6_ROUTER_PREF_MEDIUM;
+ cfg->fc_flags |= RTF_PREF(pref);
+ }
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int ip6_route_multipath(struct fib6_config *cfg, int add)
+{
+ struct fib6_config r_cfg;
+ struct rtnexthop *rtnh;
+ int remaining;
+ int attrlen;
+ int err = 0, last_err = 0;
+
+ remaining = cfg->fc_mp_len;
+beginning:
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ /* Parse a Multipath Entry */
+ while (rtnh_ok(rtnh, remaining)) {
+ memcpy(&r_cfg, cfg, sizeof(*cfg));
+ if (rtnh->rtnh_ifindex)
+ r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
+ r_cfg.fc_flags |= RTF_GATEWAY;
+ }
+ }
+ err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+ if (err) {
+ last_err = err;
+ /* If we are trying to remove a route, do not stop the
+ * loop when ip6_route_del() fails (because next hop is
+ * already gone), we should try to remove all next hops.
+ */
+ if (add) {
+ /* If add fails, we should try to delete all
+ * next hops that have been already added.
+ */
+ add = 0;
+ remaining = cfg->fc_mp_len - remaining;
+ goto beginning;
+ }
+ }
+ /* Because each route is added like a single route we remove
+ * these flags after the first nexthop: if there is a collision,
+ * we have already failed to add the first nexthop:
+ * fib6_add_rt2node() has rejected it; when replacing, old
+ * nexthops have been replaced by first new, the rest should
+ * be added to it.
+ */
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+ NLM_F_REPLACE);
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return last_err;
+}
+
+static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct fib6_config cfg;
+ int err;
+
+ err = rtm_to_fib6_config(skb, nlh, &cfg);
+ if (err < 0)
+ return err;
+
+ if (cfg.fc_mp)
+ return ip6_route_multipath(&cfg, 0);
+ else
+ return ip6_route_del(&cfg);
+}
+
+static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct fib6_config cfg;
+ int err;
+
+ err = rtm_to_fib6_config(skb, nlh, &cfg);
+ if (err < 0)
+ return err;
+
+ if (cfg.fc_mp)
+ return ip6_route_multipath(&cfg, 1);
+ else
+ return ip6_route_add(&cfg);
+}
+
+static inline size_t rt6_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(16) /* RTA_SRC */
+ + nla_total_size(16) /* RTA_DST */
+ + nla_total_size(16) /* RTA_GATEWAY */
+ + nla_total_size(16) /* RTA_PREFSRC */
+ + nla_total_size(4) /* RTA_TABLE */
+ + nla_total_size(4) /* RTA_IIF */
+ + nla_total_size(4) /* RTA_OIF */
+ + nla_total_size(4) /* RTA_PRIORITY */
+ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
+ + nla_total_size(sizeof(struct rta_cacheinfo))
+ + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ + nla_total_size(1); /* RTA_PREF */
+}
+
+static int rt6_fill_node(struct net *net,
+ struct sk_buff *skb, struct rt6_info *rt,
+ struct in6_addr *dst, struct in6_addr *src,
+ int iif, int type, u32 portid, u32 seq,
+ int prefix, int nowait, unsigned int flags)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+ long expires;
+ u32 table;
+
+ if (prefix) { /* user wants prefix routes only */
+ if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
+ /* success since this is not a prefix route */
+ return 1;
+ }
+ }
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_INET6;
+ rtm->rtm_dst_len = rt->rt6i_dst.plen;
+ rtm->rtm_src_len = rt->rt6i_src.plen;
+ rtm->rtm_tos = 0;
+ if (rt->rt6i_table)
+ table = rt->rt6i_table->tb6_id;
+ else
+ table = RT6_TABLE_UNSPEC;
+ rtm->rtm_table = table;
+ if (nla_put_u32(skb, RTA_TABLE, table))
+ goto nla_put_failure;
+ if (rt->rt6i_flags & RTF_REJECT) {
+ switch (rt->dst.error) {
+ case -EINVAL:
+ rtm->rtm_type = RTN_BLACKHOLE;
+ break;
+ case -EACCES:
+ rtm->rtm_type = RTN_PROHIBIT;
+ break;
+ case -EAGAIN:
+ rtm->rtm_type = RTN_THROW;
+ break;
+ default:
+ rtm->rtm_type = RTN_UNREACHABLE;
+ break;
+ }
+ }
+ else if (rt->rt6i_flags & RTF_LOCAL)
+ rtm->rtm_type = RTN_LOCAL;
+ else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+ rtm->rtm_type = RTN_LOCAL;
+ else
+ rtm->rtm_type = RTN_UNICAST;
+ rtm->rtm_flags = 0;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ rtm->rtm_protocol = rt->rt6i_protocol;
+ if (rt->rt6i_flags & RTF_DYNAMIC)
+ rtm->rtm_protocol = RTPROT_REDIRECT;
+ else if (rt->rt6i_flags & RTF_ADDRCONF) {
+ if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
+ rtm->rtm_protocol = RTPROT_RA;
+ else
+ rtm->rtm_protocol = RTPROT_KERNEL;
+ }
+
+ if (rt->rt6i_flags & RTF_CACHE)
+ rtm->rtm_flags |= RTM_F_CLONED;
+
+ if (dst) {
+ if (nla_put_in6_addr(skb, RTA_DST, dst))
+ goto nla_put_failure;
+ rtm->rtm_dst_len = 128;
+ } else if (rtm->rtm_dst_len)
+ if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
+ goto nla_put_failure;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (src) {
+ if (nla_put_in6_addr(skb, RTA_SRC, src))
+ goto nla_put_failure;
+ rtm->rtm_src_len = 128;
+ } else if (rtm->rtm_src_len &&
+ nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
+ goto nla_put_failure;
+#endif
+ if (iif) {
+#ifdef CONFIG_IPV6_MROUTE
+ if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+ int err = ip6mr_get_route(net, skb, rtm, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nla_put_failure;
+ }
+ }
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, iif))
+ goto nla_put_failure;
+ } else if (dst) {
+ struct in6_addr saddr_buf;
+ if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
+ nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
+ goto nla_put_failure;
+ }
+
+ if (rt->rt6i_prefsrc.plen) {
+ struct in6_addr saddr_buf;
+ saddr_buf = rt->rt6i_prefsrc.addr;
+ if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
+ goto nla_put_failure;
+ }
+
+ if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+ goto nla_put_failure;
+
+ if (rt->rt6i_flags & RTF_GATEWAY) {
+ if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+ goto nla_put_failure;
+ }
+
+ if (rt->dst.dev &&
+ nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
+ goto nla_put_failure;
+
+ expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
+
+ if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+{
+ struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+ int prefix;
+
+ if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
+ prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
+ } else
+ prefix = 0;
+
+ return rt6_fill_node(arg->net,
+ arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
+ prefix, 0, NLM_F_MULTI);
+}
+
+static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[RTA_MAX+1];
+ struct rt6_info *rt;
+ struct sk_buff *skb;
+ struct rtmsg *rtm;
+ struct flowi6 fl6;
+ int err, iif = 0, oif = 0;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (tb[RTA_SRC]) {
+ if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
+ goto errout;
+
+ fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
+ }
+
+ if (tb[RTA_DST]) {
+ if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
+ goto errout;
+
+ fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
+ }
+
+ if (tb[RTA_IIF])
+ iif = nla_get_u32(tb[RTA_IIF]);
+
+ if (tb[RTA_OIF])
+ oif = nla_get_u32(tb[RTA_OIF]);
+
+ if (tb[RTA_MARK])
+ fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+
+ if (iif) {
+ struct net_device *dev;
+ int flags = 0;
+
+ dev = __dev_get_by_index(net, iif);
+ if (!dev) {
+ err = -ENODEV;
+ goto errout;
+ }
+
+ fl6.flowi6_iif = iif;
+
+ if (!ipv6_addr_any(&fl6.saddr))
+ flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+ rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
+ flags);
+ } else {
+ fl6.flowi6_oif = oif;
+
+ rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
+ }
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb) {
+ ip6_rt_put(rt);
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ /* Reserve room for dummy headers, this skb can pass
+ through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
+
+ skb_dst_set(skb, &rt->dst);
+
+ err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
+ RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, 0, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+errout:
+ return err;
+}
+
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+{
+ struct sk_buff *skb;
+ struct net *net = info->nl_net;
+ u32 seq;
+ int err;
+
+ err = -ENOBUFS;
+ seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+
+ skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
+ event, info->portid, seq, 0, 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
+static int ip6_route_dev_notify(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
+ net->ipv6.ip6_null_entry->dst.dev = dev;
+ net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.ip6_prohibit_entry->dst.dev = dev;
+ net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
+ net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
+ net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
+#endif
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * /proc
+ */
+
+#ifdef CONFIG_PROC_FS
+
+static const struct file_operations ipv6_route_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ipv6_route_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int rt6_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = (struct net *)seq->private;
+ seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
+ net->ipv6.rt6_stats->fib_nodes,
+ net->ipv6.rt6_stats->fib_route_nodes,
+ net->ipv6.rt6_stats->fib_rt_alloc,
+ net->ipv6.rt6_stats->fib_rt_entries,
+ net->ipv6.rt6_stats->fib_rt_cache,
+ dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
+ net->ipv6.rt6_stats->fib_discarded_routes);
+
+ return 0;
+}
+
+static int rt6_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, rt6_stats_seq_show);
+}
+
+static const struct file_operations rt6_stats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rt6_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SYSCTL
+
+static
+int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int delay;
+ if (!write)
+ return -EINVAL;
+
+ net = (struct net *)ctl->extra1;
+ delay = net->ipv6.sysctl.flush_delay;
+ proc_dointvec(ctl, write, buffer, lenp, ppos);
+ fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
+ return 0;
+}
+
+struct ctl_table ipv6_route_table_template[] = {
+ {
+ .procname = "flush",
+ .data = &init_net.ipv6.sysctl.flush_delay,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = ipv6_sysctl_rtcache_flush
+ },
+ {
+ .procname = "gc_thresh",
+ .data = &ip6_dst_ops_template.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_size",
+ .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "gc_min_interval",
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_timeout",
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_interval",
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "gc_elasticity",
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "mtu_expires",
+ .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "gc_min_interval_ms",
+ .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ { }
+};
+
+struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(ipv6_route_table_template,
+ sizeof(ipv6_route_table_template),
+ GFP_KERNEL);
+
+ if (table) {
+ table[0].data = &net->ipv6.sysctl.flush_delay;
+ table[0].extra1 = net;
+ table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
+ table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
+ table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
+ table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
+ table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
+ table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
+ table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
+ table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+ }
+
+ return table;
+}
+#endif
+
+static int __net_init ip6_route_net_init(struct net *net)
+{
+ int ret = -ENOMEM;
+
+ memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
+ sizeof(net->ipv6.ip6_dst_ops));
+
+ if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+ goto out_ip6_dst_ops;
+
+ net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
+ sizeof(*net->ipv6.ip6_null_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_null_entry)
+ goto out_ip6_dst_entries;
+ net->ipv6.ip6_null_entry->dst.path =
+ (struct dst_entry *)net->ipv6.ip6_null_entry;
+ net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+ ip6_template_metrics, true);
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
+ sizeof(*net->ipv6.ip6_prohibit_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_prohibit_entry)
+ goto out_ip6_null_entry;
+ net->ipv6.ip6_prohibit_entry->dst.path =
+ (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
+ net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+ ip6_template_metrics, true);
+
+ net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
+ sizeof(*net->ipv6.ip6_blk_hole_entry),
+ GFP_KERNEL);
+ if (!net->ipv6.ip6_blk_hole_entry)
+ goto out_ip6_prohibit_entry;
+ net->ipv6.ip6_blk_hole_entry->dst.path =
+ (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
+ net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+ ip6_template_metrics, true);
+#endif
+
+ net->ipv6.sysctl.flush_delay = 0;
+ net->ipv6.sysctl.ip6_rt_max_size = 4096;
+ net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
+ net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
+ net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
+ net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
+ net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
+ net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+
+ net->ipv6.ip6_rt_gc_expire = 30*HZ;
+
+ ret = 0;
+out:
+ return ret;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_ip6_prohibit_entry:
+ kfree(net->ipv6.ip6_prohibit_entry);
+out_ip6_null_entry:
+ kfree(net->ipv6.ip6_null_entry);
+#endif
+out_ip6_dst_entries:
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+out_ip6_dst_ops:
+ goto out;
+}
+
+static void __net_exit ip6_route_net_exit(struct net *net)
+{
+ kfree(net->ipv6.ip6_null_entry);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ kfree(net->ipv6.ip6_prohibit_entry);
+ kfree(net->ipv6.ip6_blk_hole_entry);
+#endif
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+}
+
+static int __net_init ip6_route_net_init_late(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
+ proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
+#endif
+ return 0;
+}
+
+static void __net_exit ip6_route_net_exit_late(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ipv6_route", net->proc_net);
+ remove_proc_entry("rt6_stats", net->proc_net);
+#endif
+}
+
+static struct pernet_operations ip6_route_net_ops = {
+ .init = ip6_route_net_init,
+ .exit = ip6_route_net_exit,
+};
+
+static int __net_init ipv6_inetpeer_init(struct net *net)
+{
+ struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
+
+ if (!bp)
+ return -ENOMEM;
+ inet_peer_base_init(bp);
+ net->ipv6.peers = bp;
+ return 0;
+}
+
+static void __net_exit ipv6_inetpeer_exit(struct net *net)
+{
+ struct inet_peer_base *bp = net->ipv6.peers;
+
+ net->ipv6.peers = NULL;
+ inetpeer_invalidate_tree(bp);
+ kfree(bp);
+}
+
+static struct pernet_operations ipv6_inetpeer_ops = {
+ .init = ipv6_inetpeer_init,
+ .exit = ipv6_inetpeer_exit,
+};
+
+static struct pernet_operations ip6_route_net_late_ops = {
+ .init = ip6_route_net_init_late,
+ .exit = ip6_route_net_exit_late,
+};
+
+static struct notifier_block ip6_route_dev_notifier = {
+ .notifier_call = ip6_route_dev_notify,
+ .priority = 0,
+};
+
+int __init ip6_route_init(void)
+{
+ int ret;
+
+ ret = -ENOMEM;
+ ip6_dst_ops_template.kmem_cachep =
+ kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ip6_dst_ops_template.kmem_cachep)
+ goto out;
+
+ ret = dst_entries_init(&ip6_dst_blackhole_ops);
+ if (ret)
+ goto out_kmem_cache;
+
+ ret = register_pernet_subsys(&ipv6_inetpeer_ops);
+ if (ret)
+ goto out_dst_entries;
+
+ ret = register_pernet_subsys(&ip6_route_net_ops);
+ if (ret)
+ goto out_register_inetpeer;
+
+ ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
+
+ /* Registering of the loopback is done before this portion of code,
+ * the loopback reference in rt6_info will not be taken, do it
+ * manually for init_net */
+ init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ #endif
+ ret = fib6_init();
+ if (ret)
+ goto out_register_subsys;
+
+ ret = xfrm6_init();
+ if (ret)
+ goto out_fib6_init;
+
+ ret = fib6_rules_init();
+ if (ret)
+ goto xfrm6_init;
+
+ ret = register_pernet_subsys(&ip6_route_net_late_ops);
+ if (ret)
+ goto fib6_rules_init;
+
+ ret = -ENOBUFS;
+ if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
+ __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
+ __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
+ goto out_register_late_subsys;
+
+ ret = register_netdevice_notifier(&ip6_route_dev_notifier);
+ if (ret)
+ goto out_register_late_subsys;
+
+out:
+ return ret;
+
+out_register_late_subsys:
+ unregister_pernet_subsys(&ip6_route_net_late_ops);
+fib6_rules_init:
+ fib6_rules_cleanup();
+xfrm6_init:
+ xfrm6_fini();
+out_fib6_init:
+ fib6_gc_cleanup();
+out_register_subsys:
+ unregister_pernet_subsys(&ip6_route_net_ops);
+out_register_inetpeer:
+ unregister_pernet_subsys(&ipv6_inetpeer_ops);
+out_dst_entries:
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
+out_kmem_cache:
+ kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
+ goto out;
+}
+
+void ip6_route_cleanup(void)
+{
+ unregister_netdevice_notifier(&ip6_route_dev_notifier);
+ unregister_pernet_subsys(&ip6_route_net_late_ops);
+ fib6_rules_cleanup();
+ xfrm6_fini();
+ fib6_gc_cleanup();
+ unregister_pernet_subsys(&ipv6_inetpeer_ops);
+ unregister_pernet_subsys(&ip6_route_net_ops);
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
+ kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
+}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
new file mode 100644
index 000000000..ac35a2859
--- /dev/null
+++ b/net/ipv6/sit.c
@@ -0,0 +1,1922 @@
+/*
+ * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Roger Venning <r.venning@telstra.com>: 6to4 support
+ * Nate Thompson <nate@thebog.net>: 6to4 support
+ * Fred Templin <fred.l.templin@boeing.com>: isatap support
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/ip_tunnels.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/dsfield.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
+
+ For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+#define HASH_SIZE 16
+#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+static int ipip6_tunnel_init(struct net_device *dev);
+static void ipip6_tunnel_setup(struct net_device *dev);
+static void ipip6_dev_free(struct net_device *dev);
+static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
+ __be32 *v4dst);
+static struct rtnl_link_ops sit_link_ops __read_mostly;
+
+static int sit_net_id __read_mostly;
+struct sit_net {
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
+
+ struct net_device *fb_tunnel_dev;
+};
+
+/*
+ * Must be invoked with rcu_read_lock
+ */
+static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
+ struct net_device *dev, __be32 remote, __be32 local)
+{
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
+ struct ip_tunnel *t;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
+ if (remote == t->parms.iph.daddr &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+ for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
+ if (local == t->parms.iph.saddr &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
+ (t->dev->flags & IFF_UP))
+ return t;
+ }
+ t = rcu_dereference(sitn->tunnels_wc[0]);
+ if (t && (t->dev->flags & IFF_UP))
+ return t;
+ return NULL;
+}
+
+static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
+ struct ip_tunnel_parm *parms)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ unsigned int h = 0;
+ int prio = 0;
+
+ if (remote) {
+ prio |= 2;
+ h ^= HASH(remote);
+ }
+ if (local) {
+ prio |= 1;
+ h ^= HASH(local);
+ }
+ return &sitn->tunnels[prio][h];
+}
+
+static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
+ struct ip_tunnel *t)
+{
+ return __ipip6_bucket(sitn, &t->parms);
+}
+
+static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
+{
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip6_bucket(sitn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
+ break;
+ }
+ }
+}
+
+static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
+{
+ struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
+
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
+ rcu_assign_pointer(*tp, t);
+}
+
+static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (t->dev == sitn->fb_tunnel_dev) {
+ ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
+ t->ip6rd.relay_prefix = 0;
+ t->ip6rd.prefixlen = 16;
+ t->ip6rd.relay_prefixlen = 0;
+ } else {
+ struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev);
+ memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd));
+ }
+#endif
+}
+
+static int ipip6_tunnel_create(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ int err;
+
+ memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+ memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
+
+ if ((__force u16)t->parms.i_flags & SIT_ISATAP)
+ dev->priv_flags |= IFF_ISATAP;
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ ipip6_tunnel_clone_6rd(dev, sitn);
+
+ dev->rtnl_link_ops = &sit_link_ops;
+
+ dev_hold(dev);
+
+ ipip6_tunnel_link(sitn, t);
+ return 0;
+
+out:
+ return err;
+}
+
+static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
+ struct ip_tunnel_parm *parms, int create)
+{
+ __be32 remote = parms->iph.daddr;
+ __be32 local = parms->iph.saddr;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ for (tp = __ipip6_bucket(sitn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
+ if (local == t->parms.iph.saddr &&
+ remote == t->parms.iph.daddr &&
+ parms->link == t->parms.link) {
+ if (create)
+ return NULL;
+ else
+ return t;
+ }
+ }
+ if (!create)
+ goto failed;
+
+ if (parms->name[0])
+ strlcpy(name, parms->name, IFNAMSIZ);
+ else
+ strcpy(name, "sit%d");
+
+ dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+ ipip6_tunnel_setup);
+ if (!dev)
+ return NULL;
+
+ dev_net_set(dev, net);
+
+ nt = netdev_priv(dev);
+
+ nt->parms = *parms;
+ if (ipip6_tunnel_create(dev) < 0)
+ goto failed_free;
+
+ return nt;
+
+failed_free:
+ ipip6_dev_free(dev);
+failed:
+ return NULL;
+}
+
+#define for_each_prl_rcu(start) \
+ for (prl = rcu_dereference(start); \
+ prl; \
+ prl = rcu_dereference(prl->next))
+
+static struct ip_tunnel_prl_entry *
+__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
+{
+ struct ip_tunnel_prl_entry *prl;
+
+ for_each_prl_rcu(t->prl)
+ if (prl->addr == addr)
+ break;
+ return prl;
+
+}
+
+static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
+ struct ip_tunnel_prl __user *a)
+{
+ struct ip_tunnel_prl kprl, *kp;
+ struct ip_tunnel_prl_entry *prl;
+ unsigned int cmax, c = 0, ca, len;
+ int ret = 0;
+
+ if (copy_from_user(&kprl, a, sizeof(kprl)))
+ return -EFAULT;
+ cmax = kprl.datalen / sizeof(kprl);
+ if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
+ cmax = 1;
+
+ /* For simple GET or for root users,
+ * we try harder to allocate.
+ */
+ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+ NULL;
+
+ rcu_read_lock();
+
+ ca = t->prl_count < cmax ? t->prl_count : cmax;
+
+ if (!kp) {
+ /* We don't try hard to allocate much memory for
+ * non-root users.
+ * For root users, retry allocating enough memory for
+ * the answer.
+ */
+ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+ if (!kp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ c = 0;
+ for_each_prl_rcu(t->prl) {
+ if (c >= cmax)
+ break;
+ if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
+ continue;
+ kp[c].addr = prl->addr;
+ kp[c].flags = prl->flags;
+ c++;
+ if (kprl.addr != htonl(INADDR_ANY))
+ break;
+ }
+out:
+ rcu_read_unlock();
+
+ len = sizeof(*kp) * c;
+ ret = 0;
+ if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
+ ret = -EFAULT;
+
+ kfree(kp);
+
+ return ret;
+}
+
+static int
+ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
+{
+ struct ip_tunnel_prl_entry *p;
+ int err = 0;
+
+ if (a->addr == htonl(INADDR_ANY))
+ return -EINVAL;
+
+ ASSERT_RTNL();
+
+ for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
+ if (p->addr == a->addr) {
+ if (chg) {
+ p->flags = a->flags;
+ goto out;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (chg) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL);
+ if (!p) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ p->next = t->prl;
+ p->addr = a->addr;
+ p->flags = a->flags;
+ t->prl_count++;
+ rcu_assign_pointer(t->prl, p);
+out:
+ return err;
+}
+
+static void prl_list_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_tunnel_prl_entry *p, *n;
+
+ p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
+ do {
+ n = rcu_dereference_protected(p->next, 1);
+ kfree(p);
+ p = n;
+ } while (p);
+}
+
+static int
+ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
+{
+ struct ip_tunnel_prl_entry *x;
+ struct ip_tunnel_prl_entry __rcu **p;
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (a && a->addr != htonl(INADDR_ANY)) {
+ for (p = &t->prl;
+ (x = rtnl_dereference(*p)) != NULL;
+ p = &x->next) {
+ if (x->addr == a->addr) {
+ *p = x->next;
+ kfree_rcu(x, rcu_head);
+ t->prl_count--;
+ goto out;
+ }
+ }
+ err = -ENXIO;
+ } else {
+ x = rtnl_dereference(t->prl);
+ if (x) {
+ t->prl_count = 0;
+ call_rcu(&x->rcu_head, prl_list_destroy_rcu);
+ t->prl = NULL;
+ }
+ }
+out:
+ return err;
+}
+
+static int
+isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
+{
+ struct ip_tunnel_prl_entry *p;
+ int ok = 1;
+
+ rcu_read_lock();
+ p = __ipip6_tunnel_locate_prl(t, iph->saddr);
+ if (p) {
+ if (p->flags & PRL_DEFAULT)
+ skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
+ else
+ skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
+ } else {
+ const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr;
+
+ if (ipv6_addr_is_isatap(addr6) &&
+ (addr6->s6_addr32[3] == iph->saddr) &&
+ ipv6_chk_prefix(addr6, t->dev))
+ skb->ndisc_nodetype = NDISC_NODETYPE_HOST;
+ else
+ ok = 0;
+ }
+ rcu_read_unlock();
+ return ok;
+}
+
+static void ipip6_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct sit_net *sitn = net_generic(tunnel->net, sit_net_id);
+
+ if (dev == sitn->fb_tunnel_dev) {
+ RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL);
+ } else {
+ ipip6_tunnel_unlink(sitn, tunnel);
+ ipip6_tunnel_del_prl(tunnel, NULL);
+ }
+ ip_tunnel_dst_reset_all(tunnel);
+ dev_put(dev);
+}
+
+/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
+ * if sufficient data bytes are available
+ */
+static int ipip6_err_gen_icmpv6_unreach(struct sk_buff *skb)
+{
+ int ihl = ((const struct iphdr *)skb->data)->ihl*4;
+ struct rt6_info *rt;
+ struct sk_buff *skb2;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(struct ipv6hdr) + 8))
+ return 1;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb2)
+ return 1;
+
+ skb_dst_drop(skb2);
+ skb_pull(skb2, ihl);
+ skb_reset_network_header(skb2);
+
+ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
+
+ if (rt && rt->dst.dev)
+ skb2->dev = rt->dst.dev;
+
+ icmpv6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
+
+ if (rt)
+ ip6_rt_put(rt);
+
+ kfree_skb(skb2);
+
+ return 0;
+}
+
+static int ipip6_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct ip_tunnel *t;
+ int err;
+
+ switch (type) {
+ default:
+ case ICMP_PARAMETERPROB:
+ return 0;
+
+ case ICMP_DEST_UNREACH:
+ switch (code) {
+ case ICMP_SR_FAILED:
+ /* Impossible event. */
+ return 0;
+ default:
+ /* All others are translated to HOST_UNREACH.
+ rfc2003 contains "deep thoughts" about NET_UNREACH,
+ I believe they are just ether pollution. --ANK
+ */
+ break;
+ }
+ break;
+ case ICMP_TIME_EXCEEDED:
+ if (code != ICMP_EXC_TTL)
+ return 0;
+ break;
+ case ICMP_REDIRECT:
+ break;
+ }
+
+ err = -ENOENT;
+
+ t = ipip6_tunnel_lookup(dev_net(skb->dev),
+ skb->dev,
+ iph->daddr,
+ iph->saddr);
+ if (!t)
+ goto out;
+
+ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+ ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+ t->parms.link, 0, IPPROTO_IPV6, 0);
+ err = 0;
+ goto out;
+ }
+ if (type == ICMP_REDIRECT) {
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+ IPPROTO_IPV6, 0);
+ err = 0;
+ goto out;
+ }
+
+ if (t->parms.iph.daddr == 0)
+ goto out;
+
+ err = 0;
+ if (!ipip6_err_gen_icmpv6_unreach(skb))
+ goto out;
+
+ if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+ goto out;
+
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+ t->err_count++;
+ else
+ t->err_count = 1;
+ t->err_time = jiffies;
+out:
+ return err;
+}
+
+static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr,
+ const struct in6_addr *v6addr)
+{
+ __be32 v4embed = 0;
+ if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed)
+ return true;
+ return false;
+}
+
+/* Checks if an address matches an address on the tunnel interface.
+ * Used to detect the NAT of proto 41 packets and let them pass spoofing test.
+ * Long story:
+ * This function is called after we considered the packet as spoofed
+ * in is_spoofed_6rd.
+ * We may have a router that is doing NAT for proto 41 packets
+ * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb
+ * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd
+ * function will return true, dropping the packet.
+ * But, we can still check if is spoofed against the IP
+ * addresses associated with the interface.
+ */
+static bool only_dnatted(const struct ip_tunnel *tunnel,
+ const struct in6_addr *v6dst)
+{
+ int prefix_len;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ prefix_len = tunnel->ip6rd.prefixlen + 32
+ - tunnel->ip6rd.relay_prefixlen;
+#else
+ prefix_len = 48;
+#endif
+ return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev);
+}
+
+/* Returns true if a packet is spoofed */
+static bool packet_is_spoofed(struct sk_buff *skb,
+ const struct iphdr *iph,
+ struct ip_tunnel *tunnel)
+{
+ const struct ipv6hdr *ipv6h;
+
+ if (tunnel->dev->priv_flags & IFF_ISATAP) {
+ if (!isatap_chksrc(skb, iph, tunnel))
+ return true;
+
+ return false;
+ }
+
+ if (tunnel->dev->flags & IFF_POINTOPOINT)
+ return false;
+
+ ipv6h = ipv6_hdr(skb);
+
+ if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) {
+ net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
+ }
+
+ if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr)))
+ return false;
+
+ if (only_dnatted(tunnel, &ipv6h->daddr))
+ return false;
+
+ net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
+ &iph->saddr, &ipv6h->saddr,
+ &iph->daddr, &ipv6h->daddr);
+ return true;
+}
+
+static int ipip6_rcv(struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct ip_tunnel *tunnel;
+ int err;
+
+ tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+ iph->saddr, iph->daddr);
+ if (tunnel) {
+ struct pcpu_sw_netstats *tstats;
+
+ if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
+ tunnel->parms.iph.protocol != 0)
+ goto out;
+
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ IPCB(skb)->flags = 0;
+ skb->protocol = htons(ETH_P_IPV6);
+
+ if (packet_is_spoofed(skb, iph, tunnel)) {
+ tunnel->dev->stats.rx_errors++;
+ goto out;
+ }
+
+ __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (log_ecn_error)
+ net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+ &iph->saddr, iph->tos);
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ goto out;
+ }
+ }
+
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ netif_rx(skb);
+
+ return 0;
+ }
+
+ /* no tunnel matched, let upstream know, ipsec may handle it */
+ return 1;
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+static const struct tnl_ptk_info tpi = {
+ /* no tunnel info required for ipip. */
+ .proto = htons(ETH_P_IP),
+};
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ iph = ip_hdr(skb);
+ tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+ iph->saddr, iph->daddr);
+ if (tunnel) {
+ if (tunnel->parms.iph.protocol != IPPROTO_IPIP &&
+ tunnel->parms.iph.protocol != 0)
+ goto drop;
+
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto drop;
+ if (iptunnel_pull_header(skb, 0, tpi.proto))
+ goto drop;
+ return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
+ }
+
+ return 1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function
+ * stores the embedded IPv4 address in v4dst and returns true.
+ */
+static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
+ __be32 *v4dst)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
+ tunnel->ip6rd.prefixlen)) {
+ unsigned int pbw0, pbi0;
+ int pbi1;
+ u32 d;
+
+ pbw0 = tunnel->ip6rd.prefixlen >> 5;
+ pbi0 = tunnel->ip6rd.prefixlen & 0x1f;
+
+ d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
+ tunnel->ip6rd.relay_prefixlen;
+
+ pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
+ if (pbi1 > 0)
+ d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >>
+ (32 - pbi1);
+
+ *v4dst = tunnel->ip6rd.relay_prefix | htonl(d);
+ return true;
+ }
+#else
+ if (v6dst->s6_addr16[0] == htons(0x2002)) {
+ /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
+ memcpy(v4dst, &v6dst->s6_addr16[1], 4);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline __be32 try_6rd(struct ip_tunnel *tunnel,
+ const struct in6_addr *v6dst)
+{
+ __be32 dst = 0;
+ check_6rd(tunnel, v6dst, &dst);
+ return dst;
+}
+
+/*
+ * This function assumes it is being called from dev_queue_xmit()
+ * and that skb is filled properly by that function.
+ */
+
+static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ u8 tos = tunnel->parms.iph.tos;
+ __be16 df = tiph->frag_off;
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ unsigned int max_headroom; /* The extra header space needed */
+ __be32 dst = tiph->daddr;
+ struct flowi4 fl4;
+ int mtu;
+ const struct in6_addr *addr6;
+ int addr_type;
+ u8 ttl;
+ int err;
+ u8 protocol = IPPROTO_IPV6;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto tx_error;
+
+ if (tos == 1)
+ tos = ipv6_get_dsfield(iph6);
+
+ /* ISATAP (RFC4214) - must come before 6to4 */
+ if (dev->priv_flags & IFF_ISATAP) {
+ struct neighbour *neigh = NULL;
+ bool do_tx_error = false;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ goto tx_error;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if ((addr_type & IPV6_ADDR_UNICAST) &&
+ ipv6_addr_is_isatap(addr6))
+ dst = addr6->s6_addr32[3];
+ else
+ do_tx_error = true;
+
+ neigh_release(neigh);
+ if (do_tx_error)
+ goto tx_error;
+ }
+
+ if (!dst)
+ dst = try_6rd(tunnel, &iph6->daddr);
+
+ if (!dst) {
+ struct neighbour *neigh = NULL;
+ bool do_tx_error = false;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ goto tx_error;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) != 0)
+ dst = addr6->s6_addr32[3];
+ else
+ do_tx_error = true;
+
+ neigh_release(neigh);
+ if (do_tx_error)
+ goto tx_error;
+ }
+
+ rt = ip_route_output_ports(tunnel->net, &fl4, NULL,
+ dst, tiph->saddr,
+ 0, 0,
+ IPPROTO_IPV6, RT_TOS(tos),
+ tunnel->parms.link);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ if (rt->rt_type != RTN_UNICAST) {
+ ip_rt_put(rt);
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ tdev = rt->dst.dev;
+
+ if (tdev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT);
+ if (IS_ERR(skb)) {
+ ip_rt_put(rt);
+ goto out;
+ }
+
+ if (df) {
+ mtu = dst_mtu(&rt->dst) - t_hlen;
+
+ if (mtu < 68) {
+ dev->stats.collisions++;
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ if (mtu < IPV6_MIN_MTU) {
+ mtu = IPV6_MIN_MTU;
+ df = 0;
+ }
+
+ if (tunnel->parms.iph.daddr && skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ if (skb->len > mtu && !skb_is_gso(skb)) {
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+ }
+
+ if (tunnel->err_count > 0) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+ tunnel->err_count--;
+ dst_link_failure(skb);
+ } else
+ tunnel->err_count = 0;
+ }
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen;
+
+ if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+ (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ dev_kfree_skb(skb);
+ skb = new_skb;
+ iph6 = ipv6_hdr(skb);
+ }
+ ttl = tiph->ttl;
+ if (ttl == 0)
+ ttl = iph6->hop_limit;
+ tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
+
+ if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+
+ err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr,
+ protocol, tos, ttl, df,
+ !net_eq(tunnel->net, dev_net(dev)));
+ iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ return NETDEV_TX_OK;
+
+tx_error_icmp:
+ dst_link_failure(skb);
+tx_error:
+ kfree_skb(skb);
+out:
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto out;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+
+ ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
+ return NETDEV_TX_OK;
+out:
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ipip_tunnel_xmit(skb, dev);
+ break;
+ case htons(ETH_P_IPV6):
+ ipip6_tunnel_xmit(skb, dev);
+ break;
+ default:
+ goto tx_err;
+ }
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+
+}
+
+static void ipip6_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel;
+ const struct iphdr *iph;
+ struct flowi4 fl4;
+
+ tunnel = netdev_priv(dev);
+ iph = &tunnel->parms.iph;
+
+ if (iph->daddr) {
+ struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4,
+ NULL,
+ iph->daddr, iph->saddr,
+ 0, 0,
+ IPPROTO_IPV6,
+ RT_TOS(iph->tos),
+ tunnel->parms.link);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt->dst.dev;
+ ip_rt_put(rt);
+ }
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+ dev->mtu = tdev->mtu - t_hlen;
+ if (dev->mtu < IPV6_MIN_MTU)
+ dev->mtu = IPV6_MIN_MTU;
+ }
+}
+
+static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
+{
+ struct net *net = t->net;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ ipip6_tunnel_unlink(sitn, t);
+ synchronize_net();
+ t->parms.iph.saddr = p->iph.saddr;
+ t->parms.iph.daddr = p->iph.daddr;
+ memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
+ memcpy(t->dev->broadcast, &p->iph.daddr, 4);
+ ipip6_tunnel_link(sitn, t);
+ t->parms.iph.ttl = p->iph.ttl;
+ t->parms.iph.tos = p->iph.tos;
+ if (t->parms.link != p->link) {
+ t->parms.link = p->link;
+ ipip6_tunnel_bind_dev(t->dev);
+ }
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(t->dev);
+}
+
+#ifdef CONFIG_IPV6_SIT_6RD
+static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
+ struct ip_tunnel_6rd *ip6rd)
+{
+ struct in6_addr prefix;
+ __be32 relay_prefix;
+
+ if (ip6rd->relay_prefixlen > 32 ||
+ ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64)
+ return -EINVAL;
+
+ ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen);
+ if (!ipv6_addr_equal(&prefix, &ip6rd->prefix))
+ return -EINVAL;
+ if (ip6rd->relay_prefixlen)
+ relay_prefix = ip6rd->relay_prefix &
+ htonl(0xffffffffUL <<
+ (32 - ip6rd->relay_prefixlen));
+ else
+ relay_prefix = 0;
+ if (relay_prefix != ip6rd->relay_prefix)
+ return -EINVAL;
+
+ t->ip6rd.prefix = prefix;
+ t->ip6rd.relay_prefix = relay_prefix;
+ t->ip6rd.prefixlen = ip6rd->prefixlen;
+ t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(t->dev);
+ return 0;
+}
+#endif
+
+static int
+ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ int err = 0;
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_prl prl;
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct net *net = t->net;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel_6rd ip6rd;
+#endif
+
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+#ifdef CONFIG_IPV6_SIT_6RD
+ case SIOCGET6RD:
+#endif
+ if (dev == sitn->fb_tunnel_dev) {
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ err = -EFAULT;
+ break;
+ }
+ t = ipip6_tunnel_locate(net, &p, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ }
+
+ err = -EFAULT;
+ if (cmd == SIOCGETTUNNEL) {
+ memcpy(&p, &t->parms, sizeof(p));
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
+ sizeof(p)))
+ goto done;
+#ifdef CONFIG_IPV6_SIT_6RD
+ } else {
+ ip6rd.prefix = t->ip6rd.prefix;
+ ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+ ip6rd.prefixlen = t->ip6rd.prefixlen;
+ ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
+ sizeof(ip6rd)))
+ goto done;
+#endif
+ }
+ err = 0;
+ break;
+
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+
+ err = -EINVAL;
+ if (p.iph.protocol != IPPROTO_IPV6 &&
+ p.iph.protocol != IPPROTO_IPIP &&
+ p.iph.protocol != 0)
+ goto done;
+ if (p.iph.version != 4 ||
+ p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ goto done;
+ if (p.iph.ttl)
+ p.iph.frag_off |= htons(IP_DF);
+
+ t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
+
+ if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+ if (t) {
+ if (t->dev != dev) {
+ err = -EEXIST;
+ break;
+ }
+ } else {
+ if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+ err = -EINVAL;
+ break;
+ }
+ t = netdev_priv(dev);
+ }
+
+ ipip6_tunnel_update(t, &p);
+ }
+
+ if (t) {
+ err = 0;
+ if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+ err = -EFAULT;
+ } else
+ err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+ break;
+
+ case SIOCDELTUNNEL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ if (dev == sitn->fb_tunnel_dev) {
+ err = -EFAULT;
+ if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ goto done;
+ err = -ENOENT;
+ t = ipip6_tunnel_locate(net, &p, 0);
+ if (!t)
+ goto done;
+ err = -EPERM;
+ if (t == netdev_priv(sitn->fb_tunnel_dev))
+ goto done;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ err = 0;
+ break;
+
+ case SIOCGETPRL:
+ err = -EINVAL;
+ if (dev == sitn->fb_tunnel_dev)
+ goto done;
+ err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
+ break;
+
+ case SIOCADDPRL:
+ case SIOCDELPRL:
+ case SIOCCHGPRL:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+ err = -EINVAL;
+ if (dev == sitn->fb_tunnel_dev)
+ goto done;
+ err = -EFAULT;
+ if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
+ goto done;
+
+ switch (cmd) {
+ case SIOCDELPRL:
+ err = ipip6_tunnel_del_prl(t, &prl);
+ break;
+ case SIOCADDPRL:
+ case SIOCCHGPRL:
+ err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+ break;
+ }
+ ip_tunnel_dst_reset_all(t);
+ netdev_state_change(dev);
+ break;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ case SIOCADD6RD:
+ case SIOCCHG6RD:
+ case SIOCDEL6RD:
+ err = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto done;
+
+ err = -EFAULT;
+ if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
+ sizeof(ip6rd)))
+ goto done;
+
+ if (cmd != SIOCDEL6RD) {
+ err = ipip6_tunnel_update_6rd(t, &ip6rd);
+ if (err < 0)
+ goto done;
+ } else
+ ipip6_tunnel_clone_6rd(dev, sitn);
+
+ err = 0;
+ break;
+#endif
+
+ default:
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops ipip6_netdev_ops = {
+ .ndo_init = ipip6_tunnel_init,
+ .ndo_uninit = ipip6_tunnel_uninit,
+ .ndo_start_xmit = sit_tunnel_xmit,
+ .ndo_do_ioctl = ipip6_tunnel_ioctl,
+ .ndo_change_mtu = ipip6_tunnel_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+};
+
+static void ipip6_dev_free(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ free_percpu(tunnel->dst_cache);
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
+#define SIT_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_GSO_SOFTWARE | \
+ NETIF_F_HW_CSUM)
+
+static void ipip6_tunnel_setup(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->netdev_ops = &ipip6_netdev_ops;
+ dev->destructor = ipip6_dev_free;
+
+ dev->type = ARPHRD_SIT;
+ dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ dev->mtu = ETH_DATA_LEN - t_hlen;
+ dev->flags = IFF_NOARP;
+ netif_keep_dst(dev);
+ dev->addr_len = 4;
+ dev->features |= NETIF_F_LLTX;
+ dev->features |= SIT_FEATURES;
+ dev->hw_features |= SIT_FEATURES;
+}
+
+static int ipip6_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+
+ ipip6_tunnel_bind_dev(dev);
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+ struct net *net = dev_net(dev);
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+
+ iph->version = 4;
+ iph->protocol = IPPROTO_IPV6;
+ iph->ihl = 5;
+ iph->ttl = 64;
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
+ if (!tunnel->dst_cache) {
+ free_percpu(dev->tstats);
+ return -ENOMEM;
+ }
+
+ dev_hold(dev);
+ rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
+ return 0;
+}
+
+static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ u8 proto;
+
+ if (!data || !data[IFLA_IPTUN_PROTO])
+ return 0;
+
+ proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ if (proto != IPPROTO_IPV6 &&
+ proto != IPPROTO_IPIP &&
+ proto != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ipip6_netlink_parms(struct nlattr *data[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.version = 4;
+ parms->iph.protocol = IPPROTO_IPV6;
+ parms->iph.ihl = 5;
+ parms->iph.ttl = 64;
+
+ if (!data)
+ return;
+
+ if (data[IFLA_IPTUN_LINK])
+ parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+ if (data[IFLA_IPTUN_LOCAL])
+ parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+
+ if (data[IFLA_IPTUN_REMOTE])
+ parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+ if (data[IFLA_IPTUN_TTL]) {
+ parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+ if (parms->iph.ttl)
+ parms->iph.frag_off = htons(IP_DF);
+ }
+
+ if (data[IFLA_IPTUN_TOS])
+ parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+ if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+ parms->iph.frag_off = htons(IP_DF);
+
+ if (data[IFLA_IPTUN_FLAGS])
+ parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+
+ if (data[IFLA_IPTUN_PROTO])
+ parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+}
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+static bool ipip6_netlink_encap_parms(struct nlattr *data[],
+ struct ip_tunnel_encap *ipencap)
+{
+ bool ret = false;
+
+ memset(ipencap, 0, sizeof(*ipencap));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+ ret = true;
+ ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+ ret = true;
+ ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+ ret = true;
+ ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+ }
+
+ if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+ ret = true;
+ ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IPV6_SIT_6RD
+/* This function returns true when 6RD attributes are present in the nl msg */
+static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
+ struct ip_tunnel_6rd *ip6rd)
+{
+ bool ret = false;
+ memset(ip6rd, 0, sizeof(*ip6rd));
+
+ if (!data)
+ return ret;
+
+ if (data[IFLA_IPTUN_6RD_PREFIX]) {
+ ret = true;
+ ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) {
+ ret = true;
+ ip6rd->relay_prefix =
+ nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_PREFIXLEN]) {
+ ret = true;
+ ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]);
+ }
+
+ if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) {
+ ret = true;
+ ip6rd->relay_prefixlen =
+ nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
+ }
+
+ return ret;
+}
+#endif
+
+static int ipip6_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = dev_net(dev);
+ struct ip_tunnel *nt;
+ struct ip_tunnel_encap ipencap;
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel_6rd ip6rd;
+#endif
+ int err;
+
+ nt = netdev_priv(dev);
+
+ if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ err = ip_tunnel_encap_setup(nt, &ipencap);
+ if (err < 0)
+ return err;
+ }
+
+ ipip6_netlink_parms(data, &nt->parms);
+
+ if (ipip6_tunnel_locate(net, &nt->parms, 0))
+ return -EEXIST;
+
+ err = ipip6_tunnel_create(dev);
+ if (err < 0)
+ return err;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ err = ipip6_tunnel_update_6rd(nt, &ip6rd);
+#endif
+
+ return err;
+}
+
+static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm p;
+ struct ip_tunnel_encap ipencap;
+ struct net *net = t->net;
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+#ifdef CONFIG_IPV6_SIT_6RD
+ struct ip_tunnel_6rd ip6rd;
+#endif
+ int err;
+
+ if (dev == sitn->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ err = ip_tunnel_encap_setup(t, &ipencap);
+ if (err < 0)
+ return err;
+ }
+
+ ipip6_netlink_parms(data, &p);
+
+ if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
+ return -EINVAL;
+
+ t = ipip6_tunnel_locate(net, &p, 0);
+
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
+ } else
+ t = netdev_priv(dev);
+
+ ipip6_tunnel_update(t, &p);
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ return ipip6_tunnel_update_6rd(t, &ip6rd);
+#endif
+
+ return 0;
+}
+
+static size_t ipip6_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_IPTUN_LINK */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_TTL */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_TOS */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_PROTO */
+ nla_total_size(1) +
+#ifdef CONFIG_IPV6_SIT_6RD
+ /* IFLA_IPTUN_6RD_PREFIX */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_6RD_RELAY_PREFIX */
+ nla_total_size(4) +
+ /* IFLA_IPTUN_6RD_PREFIXLEN */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */
+ nla_total_size(2) +
+#endif
+ /* IFLA_IPTUN_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_IPTUN_ENCAP_DPORT */
+ nla_total_size(2) +
+ 0;
+}
+
+static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_parm *parm = &tunnel->parms;
+
+ if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
+ nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
+ nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
+ nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
+ !!(parm->iph.frag_off & htons(IP_DF))) ||
+ nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
+ nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags))
+ goto nla_put_failure;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+ if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX,
+ &tunnel->ip6rd.prefix) ||
+ nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX,
+ tunnel->ip6rd.relay_prefix) ||
+ nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN,
+ tunnel->ip6rd.prefixlen) ||
+ nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN,
+ tunnel->ip6rd.relay_prefixlen))
+ goto nla_put_failure;
+#endif
+
+ if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
+ tunnel->encap.type) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
+ tunnel->encap.sport) ||
+ nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
+ tunnel->encap.dport) ||
+ nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
+ tunnel->encap.flags))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = {
+ [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
+ [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
+ [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
+ [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
+ [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
+ [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+#ifdef CONFIG_IPV6_SIT_6RD
+ [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 },
+ [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 },
+ [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 },
+#endif
+ [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
+ [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
+};
+
+static void ipip6_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct net *net = dev_net(dev);
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+
+ if (dev != sitn->fb_tunnel_dev)
+ unregister_netdevice_queue(dev, head);
+}
+
+static struct rtnl_link_ops sit_link_ops __read_mostly = {
+ .kind = "sit",
+ .maxtype = IFLA_IPTUN_MAX,
+ .policy = ipip6_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipip6_tunnel_setup,
+ .validate = ipip6_validate,
+ .newlink = ipip6_newlink,
+ .changelink = ipip6_changelink,
+ .get_size = ipip6_get_size,
+ .fill_info = ipip6_fill_info,
+ .dellink = ipip6_dellink,
+ .get_link_net = ip_tunnel_get_link_net,
+};
+
+static struct xfrm_tunnel sit_handler __read_mostly = {
+ .handler = ipip6_rcv,
+ .err_handler = ipip6_err,
+ .priority = 1,
+};
+
+static struct xfrm_tunnel ipip_handler __read_mostly = {
+ .handler = ipip_rcv,
+ .err_handler = ipip6_err,
+ .priority = 2,
+};
+
+static void __net_exit sit_destroy_tunnels(struct net *net,
+ struct list_head *head)
+{
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ struct net_device *dev, *aux;
+ int prio;
+
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &sit_link_ops)
+ unregister_netdevice_queue(dev, head);
+
+ for (prio = 1; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(sitn->tunnels[prio][h]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev,
+ head);
+ t = rtnl_dereference(t->next);
+ }
+ }
+ }
+}
+
+static int __net_init sit_init_net(struct net *net)
+{
+ struct sit_net *sitn = net_generic(net, sit_net_id);
+ struct ip_tunnel *t;
+ int err;
+
+ sitn->tunnels[0] = sitn->tunnels_wc;
+ sitn->tunnels[1] = sitn->tunnels_l;
+ sitn->tunnels[2] = sitn->tunnels_r;
+ sitn->tunnels[3] = sitn->tunnels_r_l;
+
+ sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
+ NET_NAME_UNKNOWN,
+ ipip6_tunnel_setup);
+ if (!sitn->fb_tunnel_dev) {
+ err = -ENOMEM;
+ goto err_alloc_dev;
+ }
+ dev_net_set(sitn->fb_tunnel_dev, net);
+ sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops;
+ /* FB netdevice is special: we have one, and only one per netns.
+ * Allowing to move it to another netns is clearly unsafe.
+ */
+ sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+
+ err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+ if (err)
+ goto err_dev_free;
+
+ ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
+ err = register_netdev(sitn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
+
+ t = netdev_priv(sitn->fb_tunnel_dev);
+
+ strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
+ return 0;
+
+err_reg_dev:
+ dev_put(sitn->fb_tunnel_dev);
+err_dev_free:
+ ipip6_dev_free(sitn->fb_tunnel_dev);
+err_alloc_dev:
+ return err;
+}
+
+static void __net_exit sit_exit_net(struct net *net)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ sit_destroy_tunnels(net, &list);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations sit_net_ops = {
+ .init = sit_init_net,
+ .exit = sit_exit_net,
+ .id = &sit_net_id,
+ .size = sizeof(struct sit_net),
+};
+
+static void __exit sit_cleanup(void)
+{
+ rtnl_link_unregister(&sit_link_ops);
+ xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+
+ unregister_pernet_device(&sit_net_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+static int __init sit_init(void)
+{
+ int err;
+
+ pr_info("IPv6 over IPv4 tunneling driver\n");
+
+ err = register_pernet_device(&sit_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&sit_handler, AF_INET6);
+ if (err < 0) {
+ pr_info("%s: can't register ip6ip4\n", __func__);
+ goto xfrm_tunnel_failed;
+ }
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ pr_info("%s: can't register ip4ip4\n", __func__);
+ goto xfrm_tunnel4_failed;
+ }
+ err = rtnl_link_register(&sit_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+out:
+ return err;
+
+rtnl_link_failed:
+ xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+xfrm_tunnel4_failed:
+ xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+xfrm_tunnel_failed:
+ unregister_pernet_device(&sit_net_ops);
+ goto out;
+}
+
+module_init(sit_init);
+module_exit(sit_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("sit");
+MODULE_ALIAS_NETDEV("sit0");
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
new file mode 100644
index 000000000..21bc2eb53
--- /dev/null
+++ b/net/ipv6/syncookies.c
@@ -0,0 +1,273 @@
+/*
+ * IPv6 Syncookies implementation for the Linux kernel
+ *
+ * Authors:
+ * Glenn Griffin <ggriffin.kernel@gmail.com>
+ *
+ * Based on IPv4 implementation by Andi Kleen
+ * linux/net/ipv4/syncookies.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/tcp.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+
+/* RFC 2460, Section 8.3:
+ * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
+ *
+ * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows
+ * using higher values than ipv4 tcp syncookies.
+ * The other values are chosen based on ethernet (1500 and 9k MTU), plus
+ * one that accounts for common encap (PPPoe) overhead. Table must be sorted.
+ */
+static __u16 const msstab[] = {
+ 1280 - 60, /* IPV6_MIN_MTU - 60 */
+ 1480 - 60,
+ 1500 - 60,
+ 9000 - 60,
+};
+
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *child;
+
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ if (child) {
+ atomic_set(&req->rsk_refcnt, 1);
+ inet_csk_reqsk_queue_add(sk, req, child);
+ } else {
+ reqsk_free(req);
+ }
+ return child;
+}
+
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
+ ipv6_cookie_scratch);
+
+static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
+ __be16 sport, __be16 dport, u32 count, int c)
+{
+ __u32 *tmp;
+
+ net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
+
+ tmp = this_cpu_ptr(ipv6_cookie_scratch);
+
+ /*
+ * we have 320 bits of information to hash, copy in the remaining
+ * 192 bits required for sha_transform, from the syncookie6_secret
+ * and overwrite the digest with the secret
+ */
+ memcpy(tmp + 10, syncookie6_secret[c], 44);
+ memcpy(tmp, saddr, 16);
+ memcpy(tmp + 4, daddr, 16);
+ tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
+ tmp[9] = count;
+ sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+ return tmp[17];
+}
+
+static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __be16 sport, __be16 dport, __u32 sseq,
+ __u32 data)
+{
+ u32 count = tcp_cookie_time();
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr,
+ const struct in6_addr *daddr, __be16 sport,
+ __be16 dport, __u32 sseq)
+{
+ __u32 diff, count = tcp_cookie_time();
+
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= MAX_SYNCOOKIE_AGE)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK;
+}
+
+u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
+ const struct tcphdr *th, __u16 *mssp)
+{
+ int mssind;
+ const __u16 mss = *mssp;
+
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+
+ *mssp = msstab[mssind];
+
+ return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
+ th->dest, ntohl(th->seq), mssind);
+}
+EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
+
+__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ tcp_synq_overflow(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+ return __cookie_v6_init_sequence(iph, th, mssp);
+}
+
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
+ __u32 cookie)
+{
+ __u32 seq = ntohl(th->seq) - 1;
+ __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+ th->source, th->dest, seq);
+
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
+}
+EXPORT_SYMBOL_GPL(__cookie_v6_check);
+
+struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tcp_opt;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ struct sock *ret = sk;
+ struct request_sock *req;
+ int mss;
+ struct dst_entry *dst;
+ __u8 rcv_wscale;
+
+ if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ goto out;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ goto out;
+
+ mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie);
+ if (mss == 0) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+ /* check for timestamp cookie support */
+ memset(&tcp_opt, 0, sizeof(tcp_opt));
+ tcp_parse_options(skb, &tcp_opt, 0, NULL);
+
+ if (!cookie_timestamp_decode(&tcp_opt))
+ goto out;
+
+ ret = NULL;
+ req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk);
+ if (!req)
+ goto out;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+ treq->tfo_listener = false;
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto out_free;
+
+ req->mss = mss;
+ ireq->ir_rmt_port = th->source;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+ if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ ireq->pktopts = skb;
+ }
+
+ ireq->ir_iif = sk->sk_bound_dev_if;
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = tcp_v6_iif(skb);
+
+ ireq->ir_mark = inet_request_mark(sk, skb);
+
+ req->num_retrans = 0;
+ ireq->snd_wscale = tcp_opt.snd_wscale;
+ ireq->sack_ok = tcp_opt.sack_ok;
+ ireq->wscale_ok = tcp_opt.wscale_ok;
+ ireq->tstamp_ok = tcp_opt.saw_tstamp;
+ req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+ treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = cookie;
+
+ /*
+ * We need to lookup the dst_entry to get the correct window size.
+ * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten
+ * me if there is a preferred way.
+ */
+ {
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = ireq->ir_mark;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = inet_sk(sk)->inet_sport;
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst))
+ goto out_free;
+ }
+
+ req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
+
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
+
+ ret = get_cookie_sock(sk, skb, req, dst);
+out:
+ return ret;
+out_free:
+ reqsk_free(req);
+ return NULL;
+}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
new file mode 100644
index 000000000..abcc79f64
--- /dev/null
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -0,0 +1,200 @@
+/*
+ * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem.
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table.
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/ndisc.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/inet_frag.h>
+
+static int one = 1;
+
+static struct ctl_table ipv6_table_template[] = {
+ {
+ .procname = "bindv6only",
+ .data = &init_net.ipv6.sysctl.bindv6only,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "anycast_src_echo_reply",
+ .data = &init_net.ipv6.sysctl.anycast_src_echo_reply,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "flowlabel_consistency",
+ .data = &init_net.ipv6.sysctl.flowlabel_consistency,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "auto_flowlabels",
+ .data = &init_net.ipv6.sysctl.auto_flowlabels,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "fwmark_reflect",
+ .data = &init_net.ipv6.sysctl.fwmark_reflect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "idgen_retries",
+ .data = &init_net.ipv6.sysctl.idgen_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "idgen_delay",
+ .data = &init_net.ipv6.sysctl.idgen_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+static struct ctl_table ipv6_rotable[] = {
+ {
+ .procname = "mld_max_msf",
+ .data = &sysctl_mld_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "mld_qrv",
+ .data = &sysctl_mld_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+ { }
+};
+
+static int __net_init ipv6_sysctl_net_init(struct net *net)
+{
+ struct ctl_table *ipv6_table;
+ struct ctl_table *ipv6_route_table;
+ struct ctl_table *ipv6_icmp_table;
+ int err;
+
+ err = -ENOMEM;
+ ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
+ GFP_KERNEL);
+ if (!ipv6_table)
+ goto out;
+ ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
+ ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
+ ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
+ ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
+ ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
+ ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
+ ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
+
+ ipv6_route_table = ipv6_route_sysctl_init(net);
+ if (!ipv6_route_table)
+ goto out_ipv6_table;
+
+ ipv6_icmp_table = ipv6_icmp_sysctl_init(net);
+ if (!ipv6_icmp_table)
+ goto out_ipv6_route_table;
+
+ net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table);
+ if (!net->ipv6.sysctl.hdr)
+ goto out_ipv6_icmp_table;
+
+ net->ipv6.sysctl.route_hdr =
+ register_net_sysctl(net, "net/ipv6/route", ipv6_route_table);
+ if (!net->ipv6.sysctl.route_hdr)
+ goto out_unregister_ipv6_table;
+
+ net->ipv6.sysctl.icmp_hdr =
+ register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table);
+ if (!net->ipv6.sysctl.icmp_hdr)
+ goto out_unregister_route_table;
+
+ err = 0;
+out:
+ return err;
+out_unregister_route_table:
+ unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
+out_unregister_ipv6_table:
+ unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
+out_ipv6_icmp_table:
+ kfree(ipv6_icmp_table);
+out_ipv6_route_table:
+ kfree(ipv6_route_table);
+out_ipv6_table:
+ kfree(ipv6_table);
+ goto out;
+}
+
+static void __net_exit ipv6_sysctl_net_exit(struct net *net)
+{
+ struct ctl_table *ipv6_table;
+ struct ctl_table *ipv6_route_table;
+ struct ctl_table *ipv6_icmp_table;
+
+ ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg;
+ ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg;
+ ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr);
+ unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr);
+ unregister_net_sysctl_table(net->ipv6.sysctl.hdr);
+
+ kfree(ipv6_table);
+ kfree(ipv6_route_table);
+ kfree(ipv6_icmp_table);
+}
+
+static struct pernet_operations ipv6_sysctl_net_ops = {
+ .init = ipv6_sysctl_net_init,
+ .exit = ipv6_sysctl_net_exit,
+};
+
+static struct ctl_table_header *ip6_header;
+
+int ipv6_sysctl_register(void)
+{
+ int err = -ENOMEM;
+
+ ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable);
+ if (!ip6_header)
+ goto out;
+
+ err = register_pernet_subsys(&ipv6_sysctl_net_ops);
+ if (err)
+ goto err_pernet;
+out:
+ return err;
+
+err_pernet:
+ unregister_net_sysctl_table(ip6_header);
+ goto out;
+}
+
+void ipv6_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(ip6_header);
+ unregister_pernet_subsys(&ipv6_sysctl_net_ops);
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
new file mode 100644
index 000000000..5193d0953
--- /dev/null
+++ b/net/ipv6/tcp_ipv6.c
@@ -0,0 +1,1965 @@
+/*
+ * TCP over IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on:
+ * linux/net/ipv4/tcp.c
+ * linux/net/ipv4/tcp_input.c
+ * linux/net/ipv4/tcp_output.c
+ *
+ * Fixes:
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bottom_half.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/jiffies.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
+#include <linux/ipsec.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+
+#include <net/tcp.h>
+#include <net/ndisc.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet6_connection_sock.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+#include <net/inet_ecn.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/snmp.h>
+#include <net/dsfield.h>
+#include <net/timewait_sock.h>
+#include <net/inet_common.h>
+#include <net/secure_seq.h>
+#include <net/tcp_memcontrol.h>
+#include <net/busy_poll.h>
+#include <net/secure_seq.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
+static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req);
+
+static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+static const struct inet_connection_sock_af_ops ipv6_mapped;
+static const struct inet_connection_sock_af_ops ipv6_specific;
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
+#else
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+ const struct in6_addr *addr)
+{
+ return NULL;
+}
+#endif
+
+static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst) {
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+
+ dst_hold(dst);
+ sk->sk_rx_dst = dst;
+ inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ if (rt->rt6i_node)
+ inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
+ }
+}
+
+static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+{
+ return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ tcp_hdr(skb)->dest,
+ tcp_hdr(skb)->source);
+}
+
+static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct rt6_info *rt;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int addr_type;
+ int err;
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (np->sndflow) {
+ fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ struct ip6_flowlabel *flowlabel;
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ fl6_sock_release(flowlabel);
+ }
+ }
+
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+
+ if (ipv6_addr_any(&usin->sin6_addr))
+ usin->sin6_addr.s6_addr[15] = 0x1;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ return -ENETUNREACH;
+
+ if (addr_type&IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ /* If interface is set while binding, indices
+ * must coincide.
+ */
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id)
+ return -EINVAL;
+
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ return -EINVAL;
+ }
+
+ if (tp->rx_opt.ts_recent_stamp &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
+ tp->rx_opt.ts_recent = 0;
+ tp->rx_opt.ts_recent_stamp = 0;
+ tp->write_seq = 0;
+ }
+
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
+
+ /*
+ * TCP over IPv4
+ */
+
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
+ struct sockaddr_in sin;
+
+ SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+ icsk->icsk_af_ops = &ipv6_mapped;
+ sk->sk_backlog_rcv = tcp_v4_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+ tp->af_specific = &tcp_sock_ipv6_mapped_specific;
+#endif
+
+ err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &ipv6_specific;
+ sk->sk_backlog_rcv = tcp_v6_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+ tp->af_specific = &tcp_sock_ipv6_specific;
+#endif
+ goto failure;
+ }
+ np->saddr = sk->sk_v6_rcv_saddr;
+
+ return err;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
+
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_mark = sk->sk_mark;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto failure;
+ }
+
+ if (!saddr) {
+ saddr = &fl6.saddr;
+ sk->sk_v6_rcv_saddr = *saddr;
+ }
+
+ /* set the source address */
+ np->saddr = *saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+ sk->sk_gso_type = SKB_GSO_TCPV6;
+ __ip6_dst_store(sk, dst, NULL, NULL);
+
+ rt = (struct rt6_info *) dst;
+ if (tcp_death_row.sysctl_tw_recycle &&
+ !tp->rx_opt.ts_recent_stamp &&
+ ipv6_addr_equal(&rt->rt6i_dst.addr, &sk->sk_v6_daddr))
+ tcp_fetch_timewait_stamp(sk, dst);
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt)
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
+
+ tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+
+ inet->inet_dport = usin->sin6_port;
+
+ tcp_set_state(sk, TCP_SYN_SENT);
+ err = inet6_hash_connect(&tcp_death_row, sk);
+ if (err)
+ goto late_failure;
+
+ ip6_set_txhash(sk);
+
+#ifdef CONFIG_TCP_STEALTH
+ /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+ * early as possible and thus move taking the snapshot of tcp_time_stamp
+ * here.
+ */
+ skb_mstamp_get(&tp->stealth.mstamp);
+
+ if (!tp->write_seq && likely(!tp->repair) &&
+ unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+ tp->write_seq = tcp_stealth_sequence_number(sk,
+ sk->sk_v6_daddr.s6_addr32,
+ sizeof(sk->sk_v6_daddr),
+ inet->inet_dport);
+#endif
+
+ if (!tp->write_seq && likely(!tp->repair))
+ tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport);
+
+ err = tcp_connect(sk);
+ if (err)
+ goto late_failure;
+
+ return 0;
+
+late_failure:
+ tcp_set_state(sk, TCP_CLOSE);
+ __sk_dst_reset(sk);
+failure:
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ return err;
+}
+
+static void tcp_v6_mtu_reduced(struct sock *sk)
+{
+ struct dst_entry *dst;
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ return;
+
+ dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+ if (!dst)
+ return;
+
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ tcp_sync_mss(sk, dst_mtu(dst));
+ tcp_simple_retransmit(sk);
+ }
+}
+
+static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+ const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+ struct net *net = dev_net(skb->dev);
+ struct request_sock *fastopen;
+ struct ipv6_pinfo *np;
+ struct tcp_sock *tp;
+ __u32 seq, snd_una;
+ struct sock *sk;
+ int err;
+
+ sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ &hdr->daddr, th->dest,
+ &hdr->saddr, ntohs(th->source),
+ skb->dev->ifindex);
+
+ if (!sk) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ inet_twsk_put(inet_twsk(sk));
+ return;
+ }
+ seq = ntohl(th->seq);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_err(sk, seq);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == TCP_CLOSE)
+ goto out;
+
+ if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
+ tp = tcp_sk(sk);
+ /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
+ fastopen = tp->fastopen_rsk;
+ snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
+ if (sk->sk_state != TCP_LISTEN &&
+ !between(seq, snd_una, tp->snd_nxt)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ np = inet6_sk(sk);
+
+ if (type == NDISC_REDIRECT) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ goto out;
+ }
+
+ if (type == ICMPV6_PKT_TOOBIG) {
+ /* We are not interested in TCP_LISTEN and open_requests
+ * (SYN-ACKs send out by Linux are always <576bytes so
+ * they should go through unfragmented).
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+
+ tp->mtu_info = ntohl(info);
+ if (!sock_owned_by_user(sk))
+ tcp_v6_mtu_reduced(sk);
+ else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
+ &tp->tsq_flags))
+ sock_hold(sk);
+ goto out;
+ }
+
+ icmpv6_err_convert(type, code, &err);
+
+ /* Might be for an request_sock */
+ switch (sk->sk_state) {
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ /* Only in fast or simultaneous open. If a fast open socket is
+ * is already accepted it is treated as a connected one below.
+ */
+ if (fastopen && !fastopen->sk)
+ break;
+
+ if (!sock_owned_by_user(sk)) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+
+ tcp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ if (!sock_owned_by_user(sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else
+ sk->sk_err_soft = err;
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+
+static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
+ struct request_sock *req,
+ u16 queue_mapping,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi6 *fl6 = &fl->u.ip6;
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ /* First, grab a route. */
+ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
+ goto done;
+
+ skb = tcp_make_synack(sk, dst, req, foc);
+
+ if (skb) {
+ __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+
+ fl6->daddr = ireq->ir_v6_rmt_addr;
+ if (np->repflow && ireq->pktopts)
+ fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
+
+ skb_set_queue_mapping(skb, queue_mapping);
+ err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ err = net_xmit_eval(err);
+ }
+
+done:
+ return err;
+}
+
+
+static void tcp_v6_reqsk_destructor(struct request_sock *req)
+{
+ kfree_skb(inet_rsk(req)->pktopts);
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+ const struct in6_addr *addr)
+{
+ return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6);
+}
+
+static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
+ const struct sock *addr_sk)
+{
+ return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
+}
+
+static int tcp_v6_parse_md5_keys(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ struct tcp_md5sig cmd;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ return -EFAULT;
+
+ if (sin6->sin6_family != AF_INET6)
+ return -EINVAL;
+
+ if (!cmd.tcpm_keylen) {
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
+ AF_INET);
+ return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
+ AF_INET6);
+ }
+
+ if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+ return -EINVAL;
+
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
+ AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
+
+ return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
+ AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
+}
+
+static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr, int nbytes)
+{
+ struct tcp6_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = &hp->md5_blk.ip6;
+ /* 1. TCP pseudo-header (RFC2460) */
+ bp->saddr = *saddr;
+ bp->daddr = *daddr;
+ bp->protocol = cpu_to_be32(IPPROTO_TCP);
+ bp->len = cpu_to_be32(nbytes);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+}
+
+static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+ const struct in6_addr *daddr, struct in6_addr *saddr,
+ const struct tcphdr *th)
+{
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+ if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+
+static int tcp_v6_md5_hash_skb(char *md5_hash,
+ const struct tcp_md5sig_key *key,
+ const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct in6_addr *saddr, *daddr;
+ struct tcp_md5sig_pool *hp;
+ struct hash_desc *desc;
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ if (sk) { /* valid for establish/request sockets */
+ saddr = &sk->sk_v6_rcv_saddr;
+ daddr = &sk->sk_v6_daddr;
+ } else {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ saddr = &ip6h->saddr;
+ daddr = &ip6h->daddr;
+ }
+
+ hp = tcp_get_md5sig_pool();
+ if (!hp)
+ goto clear_hash_noput;
+ desc = &hp->md5_desc;
+
+ if (crypto_hash_init(desc))
+ goto clear_hash;
+
+ if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
+ goto clear_hash;
+ if (tcp_md5_hash_header(hp, th))
+ goto clear_hash;
+ if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+ goto clear_hash;
+ if (tcp_md5_hash_key(hp, key))
+ goto clear_hash;
+ if (crypto_hash_final(desc, md5_hash))
+ goto clear_hash;
+
+ tcp_put_md5sig_pool();
+ return 0;
+
+clear_hash:
+ tcp_put_md5sig_pool();
+clear_hash_noput:
+ memset(md5_hash, 0, 16);
+ return 1;
+}
+
+static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
+{
+ const __u8 *hash_location = NULL;
+ struct tcp_md5sig_key *hash_expected;
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+ int genhash;
+ u8 newhash[16];
+
+ hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
+ hash_location = tcp_parse_md5sig_option(th);
+
+ /* We've parsed the options - do we have a hash? */
+ if (!hash_expected && !hash_location)
+ return false;
+
+ if (hash_expected && !hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ return true;
+ }
+
+ if (!hash_expected && hash_location) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ return true;
+ }
+
+ /* check the signature */
+ genhash = tcp_v6_md5_hash_skb(newhash,
+ hash_expected,
+ NULL, skb);
+
+ if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+ net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
+ genhash ? "failed" : "mismatch",
+ &ip6h->saddr, ntohs(th->source),
+ &ip6h->daddr, ntohs(th->dest));
+ return true;
+ }
+ return false;
+}
+#endif
+
+static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = tcp_v6_iif(skb);
+
+ if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
+ (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
+ np->rxopt.bits.rxinfo ||
+ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
+ np->rxopt.bits.rxohlim || np->repflow)) {
+ atomic_inc(&skb->users);
+ ireq->pktopts = skb;
+ }
+}
+
+static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict)
+{
+ if (strict)
+ *strict = true;
+ return inet6_csk_route_req(sk, &fl->u.ip6, req);
+}
+
+struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct tcp6_request_sock),
+ .rtx_syn_ack = tcp_rtx_synack,
+ .send_ack = tcp_v6_reqsk_send_ack,
+ .destructor = tcp_v6_reqsk_destructor,
+ .send_reset = tcp_v6_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
+};
+
+static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+ sizeof(struct ipv6hdr),
+#ifdef CONFIG_TCP_MD5SIG
+ .req_md5_lookup = tcp_v6_md5_lookup,
+ .calc_md5_hash = tcp_v6_md5_hash_skb,
+#endif
+ .init_req = tcp_v6_init_req,
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v6_init_sequence,
+#endif
+ .route_req = tcp_v6_route_req,
+ .init_seq = tcp_v6_init_sequence,
+ .send_synack = tcp_v6_send_synack,
+ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
+};
+
+static void tcp_v6_send_response(struct sock *sk, struct sk_buff *skb, u32 seq,
+ u32 ack, u32 win, u32 tsval, u32 tsecr,
+ int oif, struct tcp_md5sig_key *key, int rst,
+ u8 tclass, u32 label)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcphdr *t1;
+ struct sk_buff *buff;
+ struct flowi6 fl6;
+ struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ struct sock *ctl_sk = net->ipv6.tcp_sk;
+ unsigned int tot_len = sizeof(struct tcphdr);
+ struct dst_entry *dst;
+ __be32 *topt;
+
+ if (tsecr)
+ tot_len += TCPOLEN_TSTAMP_ALIGNED;
+#ifdef CONFIG_TCP_MD5SIG
+ if (key)
+ tot_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+ buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
+ GFP_ATOMIC);
+ if (!buff)
+ return;
+
+ skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+
+ t1 = (struct tcphdr *) skb_push(buff, tot_len);
+ skb_reset_transport_header(buff);
+
+ /* Swap the send and the receive. */
+ memset(t1, 0, sizeof(*t1));
+ t1->dest = th->source;
+ t1->source = th->dest;
+ t1->doff = tot_len / 4;
+ t1->seq = htonl(seq);
+ t1->ack_seq = htonl(ack);
+ t1->ack = !rst || !th->ack;
+ t1->rst = rst;
+ t1->window = htons(win);
+
+ topt = (__be32 *)(t1 + 1);
+
+ if (tsecr) {
+ *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ *topt++ = htonl(tsval);
+ *topt++ = htonl(tsecr);
+ }
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (key) {
+ *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ tcp_v6_md5_hash_hdr((__u8 *)topt, key,
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr, t1);
+ }
+#endif
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = ipv6_hdr(skb)->saddr;
+ fl6.saddr = ipv6_hdr(skb)->daddr;
+ fl6.flowlabel = label;
+
+ buff->ip_summed = CHECKSUM_PARTIAL;
+ buff->csum = 0;
+
+ __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
+
+ fl6.flowi6_proto = IPPROTO_TCP;
+ if (rt6_need_strict(&fl6.daddr) && !oif)
+ fl6.flowi6_oif = tcp_v6_iif(skb);
+ else
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
+ fl6.fl6_dport = t1->dest;
+ fl6.fl6_sport = t1->source;
+ security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+ /* Pass a socket to ip6_dst_lookup either it is for RST
+ * Underlying function will use this to retrieve the network
+ * namespace
+ */
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(buff, dst);
+ ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ if (rst)
+ TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+ return;
+ }
+
+ kfree_skb(buff);
+}
+
+static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ u32 seq = 0, ack_seq = 0;
+ struct tcp_md5sig_key *key = NULL;
+#ifdef CONFIG_TCP_MD5SIG
+ const __u8 *hash_location = NULL;
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ unsigned char newhash[16];
+ int genhash;
+ struct sock *sk1 = NULL;
+#endif
+ int oif;
+
+ if (th->rst)
+ return;
+
+ /* If sk not NULL, it means we did a successful lookup and incoming
+ * route had to be correct. prequeue might have dropped our dst.
+ */
+ if (!sk && !ipv6_unicast_destination(skb))
+ return;
+
+#ifdef CONFIG_TCP_MD5SIG
+ hash_location = tcp_parse_md5sig_option(th);
+ if (!sk && hash_location) {
+ /*
+ * active side is lost. Try to find listening socket through
+ * source port, and then find md5 key through listening socket.
+ * we are not loose security here:
+ * Incoming packet is checked with md5 hash with finding key,
+ * no RST generated if md5 hash doesn't match.
+ */
+ sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
+ &tcp_hashinfo, &ipv6h->saddr,
+ th->source, &ipv6h->daddr,
+ ntohs(th->source), tcp_v6_iif(skb));
+ if (!sk1)
+ return;
+
+ rcu_read_lock();
+ key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr);
+ if (!key)
+ goto release_sk1;
+
+ genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
+ if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ goto release_sk1;
+ } else {
+ key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL;
+ }
+#endif
+
+ if (th->ack)
+ seq = ntohl(th->ack_seq);
+ else
+ ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
+ (th->doff << 2);
+
+ oif = sk ? sk->sk_bound_dev_if : 0;
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+
+#ifdef CONFIG_TCP_MD5SIG
+release_sk1:
+ if (sk1) {
+ rcu_read_unlock();
+ sock_put(sk1);
+ }
+#endif
+}
+
+static void tcp_v6_send_ack(struct sock *sk, struct sk_buff *skb, u32 seq,
+ u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
+ struct tcp_md5sig_key *key, u8 tclass,
+ u32 label)
+{
+ tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
+ tclass, label);
+}
+
+static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+ tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcp_time_stamp + tcptw->tw_ts_offset,
+ tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+
+ inet_twsk_put(tw);
+}
+
+static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
+{
+ /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+ * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+ */
+ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
+ tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
+ 0, 0);
+}
+
+
+static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct request_sock *req;
+ struct sock *nsk;
+
+ /* Find possible connection requests. */
+ req = inet6_csk_search_req(sk, th->source,
+ &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
+ if (req) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ if (!nsk)
+ reqsk_put(req);
+ return nsk;
+ }
+ nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
+ &ipv6_hdr(skb)->saddr, th->source,
+ &ipv6_hdr(skb)->daddr, ntohs(th->dest),
+ tcp_v6_iif(skb));
+
+ if (nsk) {
+ if (nsk->sk_state != TCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put(inet_twsk(nsk));
+ return NULL;
+ }
+
+#ifdef CONFIG_SYN_COOKIES
+ if (!th->syn)
+ sk = cookie_v6_check(sk, skb);
+#endif
+ return sk;
+}
+
+static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb->protocol == htons(ETH_P_IP))
+ return tcp_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ return tcp_conn_request(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, skb);
+
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0; /* don't send reset */
+}
+
+static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct tcp6_sock *newtcp6sk;
+ struct inet_sock *newinet;
+ struct tcp_sock *newtp;
+ struct sock *newsk;
+#ifdef CONFIG_TCP_MD5SIG
+ struct tcp_md5sig_key *key;
+#endif
+ struct flowi6 fl6;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ /*
+ * v6 mapped
+ */
+
+ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst);
+
+ if (!newsk)
+ return NULL;
+
+ newtcp6sk = (struct tcp6_sock *)newsk;
+ inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+
+ newinet = inet_sk(newsk);
+ newnp = inet6_sk(newsk);
+ newtp = tcp_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ newnp->saddr = newsk->sk_v6_rcv_saddr;
+
+ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+ newsk->sk_backlog_rcv = tcp_v4_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+ newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
+#endif
+
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
+ newnp->pktoptions = NULL;
+ newnp->opt = NULL;
+ newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+ newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
+ if (np->repflow)
+ newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, tcp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
+ */
+
+ /* It is tricky place. Until this moment IPv4 tcp
+ worked with IPv6 icsk.icsk_af_ops.
+ Sync it now.
+ */
+ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+ return newsk;
+ }
+
+ ireq = inet_rsk(req);
+
+ if (sk_acceptq_is_full(sk))
+ goto out_overflow;
+
+ if (!dst) {
+ dst = inet6_csk_route_req(sk, &fl6, req);
+ if (!dst)
+ goto out;
+ }
+
+ newsk = tcp_create_openreq_child(sk, req, skb);
+ if (!newsk)
+ goto out_nonewsk;
+
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, tcp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
+
+ newsk->sk_gso_type = SKB_GSO_TCPV6;
+ __ip6_dst_store(newsk, dst, NULL, NULL);
+ inet6_sk_rx_dst_set(newsk, skb);
+
+ newtcp6sk = (struct tcp6_sock *)newsk;
+ inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+
+ newtp = tcp_sk(newsk);
+ newinet = inet_sk(newsk);
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newnp->saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
+
+ ip6_set_txhash(newsk);
+
+ /* Now IPv6 options...
+
+ First: no IPv4 options.
+ */
+ newinet->inet_opt = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
+
+ /* Clone RX bits */
+ newnp->rxopt.all = np->rxopt.all;
+
+ /* Clone pktoptions received with SYN */
+ newnp->pktoptions = NULL;
+ if (ireq->pktopts) {
+ newnp->pktoptions = skb_clone(ireq->pktopts,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
+ newnp->opt = NULL;
+ newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+ newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
+ if (np->repflow)
+ newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+
+ /* Clone native IPv6 options from listening socket (if any)
+
+ Yes, keeping reference count would be much more clever,
+ but we make one more one thing there: reattach optmem
+ to newsk.
+ */
+ if (np->opt)
+ newnp->opt = ipv6_dup_options(newsk, np->opt);
+
+ inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ if (newnp->opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+ newnp->opt->opt_flen);
+
+ tcp_ca_openreq_child(newsk, dst);
+
+ tcp_sync_mss(newsk, dst_mtu(dst));
+ newtp->advmss = dst_metric_advmss(dst);
+ if (tcp_sk(sk)->rx_opt.user_mss &&
+ tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+ newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
+ tcp_initialize_rcv_mss(newsk);
+
+ newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+ newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* Copy over the MD5 key from the original socket */
+ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr);
+ if (key) {
+ /* We're using one, so create a matching key
+ * on the newsk structure. If we fail to get
+ * memory, then we end up not copying the key
+ * across. Shucks.
+ */
+ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
+ AF_INET6, key->key, key->keylen,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
+ }
+#endif
+
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto out;
+ }
+ __inet_hash(newsk, NULL);
+
+ return newsk;
+
+out_overflow:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+ dst_release(dst);
+out:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return NULL;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcphdr *th = tcp_hdr(skb);
+ struct sk_buff *opt_skb = NULL;
+
+ /* Imagine: socket is IPv6. IPv4 packet arrives,
+ goes to IPv4 receive handler and backlogged.
+ From backlog it always goes here. Kerboom...
+ Fortunately, tcp_rcv_established and rcv_established
+ handle them correctly, but it is not case with
+ tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK
+ */
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return tcp_v4_do_rcv(sk, skb);
+
+ if (sk_filter(sk, skb))
+ goto discard;
+
+ /*
+ * socket locking is here for SMP purposes as backlog rcv
+ * is currently called with bh processing disabled.
+ */
+
+ /* Do Stevens' IPV6_PKTOPTIONS.
+
+ Yes, guys, it is the only place in our code, where we
+ may make it not affecting IPv4.
+ The rest of code is protocol independent,
+ and I do not like idea to uglify IPv4.
+
+ Actually, all the idea behind IPV6_PKTOPTIONS
+ looks not very well thought. For now we latch
+ options, received in the last packet, enqueued
+ by tcp. Feel free to propose better solution.
+ --ANK (980728)
+ */
+ if (np->rxopt.all)
+ opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
+
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (dst) {
+ if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
+ dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
+ dst_release(dst);
+ sk->sk_rx_dst = NULL;
+ }
+ }
+
+ tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ if (opt_skb)
+ goto ipv6_pktoptions;
+ return 0;
+ }
+
+ if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+ goto csum_err;
+
+#ifdef CONFIG_TCP_STEALTH
+ if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+ tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+ tcp_stealth_do_auth(sk, skb))
+ goto reset;
+#endif
+
+ if (sk->sk_state == TCP_LISTEN) {
+ struct sock *nsk = tcp_v6_hnd_req(sk, skb);
+ if (!nsk)
+ goto discard;
+
+ /*
+ * Queue it on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket..
+ */
+ if (nsk != sk) {
+ sock_rps_save_rxhash(nsk, skb);
+ sk_mark_napi_id(sk, skb);
+ if (tcp_child_process(sk, nsk, skb))
+ goto reset;
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ return 0;
+ }
+ } else
+ sock_rps_save_rxhash(sk, skb);
+
+ if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len))
+ goto reset;
+ if (opt_skb)
+ goto ipv6_pktoptions;
+ return 0;
+
+reset:
+ tcp_v6_send_reset(sk, skb);
+discard:
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ kfree_skb(skb);
+ return 0;
+csum_err:
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ goto discard;
+
+
+ipv6_pktoptions:
+ /* Do you ask, what is it?
+
+ 1. skb was enqueued by tcp.
+ 2. skb is added to tail of read queue, rather than out of order.
+ 3. socket is not in passive state.
+ 4. Finally, it really contains options, which user wants to receive.
+ */
+ tp = tcp_sk(sk);
+ if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+ if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
+ np->mcast_oif = tcp_v6_iif(opt_skb);
+ if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
+ np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+ if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
+ np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
+ if (np->repflow)
+ np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
+ if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
+ skb_set_owner_r(opt_skb, sk);
+ opt_skb = xchg(&np->pktoptions, opt_skb);
+ } else {
+ __kfree_skb(opt_skb);
+ opt_skb = xchg(&np->pktoptions, NULL);
+ }
+ }
+
+ kfree_skb(opt_skb);
+ return 0;
+}
+
+static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
+ const struct tcphdr *th)
+{
+ /* This is tricky: we move IP6CB at its correct location into
+ * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
+ * _decode_session6() uses IP6CB().
+ * barrier() makes sure compiler won't play aliasing games.
+ */
+ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
+ sizeof(struct inet6_skb_parm));
+ barrier();
+
+ TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+ TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+ skb->len - th->doff*4);
+ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
+ TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
+ TCP_SKB_CB(skb)->sacked = 0;
+}
+
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+ /* We need to move header back to the beginning if xfrm6_policy_check()
+ * and tcp_v6_fill_cb() are going to be called again.
+ */
+ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+ sizeof(struct inet6_skb_parm));
+}
+
+static int tcp_v6_rcv(struct sk_buff *skb)
+{
+ const struct tcphdr *th;
+ const struct ipv6hdr *hdr;
+ struct sock *sk;
+ int ret;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto discard_it;
+
+ /*
+ * Count it even if it's bad.
+ */
+ TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ goto discard_it;
+
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr)/4)
+ goto bad_packet;
+ if (!pskb_may_pull(skb, th->doff*4))
+ goto discard_it;
+
+ if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo))
+ goto csum_error;
+
+ th = tcp_hdr(skb);
+ hdr = ipv6_hdr(skb);
+
+ sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
+ inet6_iif(skb));
+ if (!sk)
+ goto no_tcp_socket;
+
+process:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ goto do_time_wait;
+
+ if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_and_relse;
+
+ tcp_v6_fill_cb(skb, hdr, th);
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (tcp_v6_inbound_md5_hash(sk, skb))
+ goto discard_and_relse;
+#endif
+
+ if (sk_filter(sk, skb))
+ goto discard_and_relse;
+
+ sk_incoming_cpu_update(sk);
+ skb->dev = NULL;
+
+ bh_lock_sock_nested(sk);
+ ret = 0;
+ if (!sock_owned_by_user(sk)) {
+ if (!tcp_prequeue(sk, skb))
+ ret = tcp_v6_do_rcv(sk, skb);
+ } else if (unlikely(sk_add_backlog(sk, skb,
+ sk->sk_rcvbuf + sk->sk_sndbuf))) {
+ bh_unlock_sock(sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+ return ret ? -1 : 0;
+
+no_tcp_socket:
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+
+ tcp_v6_fill_cb(skb, hdr, th);
+
+ if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+csum_error:
+ TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+bad_packet:
+ TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ } else {
+ tcp_v6_send_reset(NULL, skb);
+ }
+
+discard_it:
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto discard_it;
+ }
+
+ tcp_v6_fill_cb(skb, hdr, th);
+
+ if (skb->len < (th->doff<<2)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto bad_packet;
+ }
+ if (tcp_checksum_complete(skb)) {
+ inet_twsk_put(inet_twsk(sk));
+ goto csum_error;
+ }
+
+ switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ case TCP_TW_SYN:
+ {
+ struct sock *sk2;
+
+ sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+ &ipv6_hdr(skb)->saddr, th->source,
+ &ipv6_hdr(skb)->daddr,
+ ntohs(th->dest), tcp_v6_iif(skb));
+ if (sk2) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ inet_twsk_deschedule(tw);
+ inet_twsk_put(tw);
+ sk = sk2;
+ tcp_v6_restore_cb(skb);
+ goto process;
+ }
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+ tcp_v6_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
+ tcp_v6_restore_cb(skb);
+ goto no_tcp_socket;
+ case TCP_TW_SUCCESS:
+ ;
+ }
+ goto discard_it;
+}
+
+static void tcp_v6_early_demux(struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ const struct tcphdr *th;
+ struct sock *sk;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return;
+
+ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
+ return;
+
+ hdr = ipv6_hdr(skb);
+ th = tcp_hdr(skb);
+
+ if (th->doff < sizeof(struct tcphdr) / 4)
+ return;
+
+ /* Note : We use inet6_iif() here, not tcp_v6_iif() */
+ sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
+ &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ inet6_iif(skb));
+ if (sk) {
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+ if (sk_fullsock(sk)) {
+ struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+
+ if (dst)
+ dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+ if (dst &&
+ inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ skb_dst_set_noref(skb, dst);
+ }
+ }
+}
+
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .twsk_unique = tcp_twsk_unique,
+ .twsk_destructor = tcp_twsk_destructor,
+};
+
+static const struct inet_connection_sock_af_ops ipv6_specific = {
+ .queue_xmit = inet6_csk_xmit,
+ .send_check = tcp_v6_send_check,
+ .rebuild_header = inet6_sk_rebuild_header,
+ .sk_rx_dst_set = inet6_sk_rx_dst_set,
+ .conn_request = tcp_v6_conn_request,
+ .syn_recv_sock = tcp_v6_syn_recv_sock,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .net_frag_header_len = sizeof(struct frag_hdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+ .bind_conflict = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+ .mtu_reduced = tcp_v6_mtu_reduced,
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+ .md5_lookup = tcp_v6_md5_lookup,
+ .calc_md5_hash = tcp_v6_md5_hash_skb,
+ .md5_parse = tcp_v6_parse_md5_keys,
+};
+#endif
+
+/*
+ * TCP over IPv4 via INET6 API
+ */
+static const struct inet_connection_sock_af_ops ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = tcp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .sk_rx_dst_set = inet_sk_rx_dst_set,
+ .conn_request = tcp_v6_conn_request,
+ .syn_recv_sock = tcp_v6_syn_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+ .bind_conflict = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+ .md5_lookup = tcp_v4_md5_lookup,
+ .calc_md5_hash = tcp_v4_md5_hash_skb,
+ .md5_parse = tcp_v6_parse_md5_keys,
+};
+#endif
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
+static int tcp_v6_init_sock(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ tcp_init_sock(sk);
+
+ icsk->icsk_af_ops = &ipv6_specific;
+
+#ifdef CONFIG_TCP_MD5SIG
+ tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+#endif
+
+ return 0;
+}
+
+static void tcp_v6_destroy_sock(struct sock *sk)
+{
+ tcp_v4_destroy_sock(sk);
+ inet6_destroy_sock(sk);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCPv6 sock list dumping. */
+static void get_openreq6(struct seq_file *seq,
+ struct request_sock *req, int i, kuid_t uid)
+{
+ long ttd = req->rsk_timer.expires - jiffies;
+ const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
+ const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr;
+
+ if (ttd < 0)
+ ttd = 0;
+
+ seq_printf(seq,
+ "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n",
+ i,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3],
+ inet_rsk(req)->ir_num,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3],
+ ntohs(inet_rsk(req)->ir_rmt_port),
+ TCP_SYN_RECV,
+ 0, 0, /* could print option size, but that is af dependent. */
+ 1, /* timers active (only the expire timer) */
+ jiffies_to_clock_t(ttd),
+ req->num_timeout,
+ from_kuid_munged(seq_user_ns(seq), uid),
+ 0, /* non standard timer */
+ 0, /* open_requests have no inode */
+ 0, req);
+}
+
+static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
+{
+ const struct in6_addr *dest, *src;
+ __u16 destp, srcp;
+ int timer_active;
+ unsigned long timer_expires;
+ const struct inet_sock *inet = inet_sk(sp);
+ const struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
+ struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
+
+ dest = &sp->sk_v6_daddr;
+ src = &sp->sk_v6_rcv_saddr;
+ destp = ntohs(inet->inet_dport);
+ srcp = ntohs(inet->inet_sport);
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ timer_active = 1;
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = icsk->icsk_timeout;
+ } else if (timer_pending(&sp->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sp->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = jiffies;
+ }
+
+ seq_printf(seq,
+ "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
+ i,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp,
+ sp->sk_state,
+ tp->write_seq-tp->snd_una,
+ (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+ timer_active,
+ jiffies_delta_to_clock_t(timer_expires - jiffies),
+ icsk->icsk_retransmits,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ icsk->icsk_probes_out,
+ sock_i_ino(sp),
+ atomic_read(&sp->sk_refcnt), sp,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+ tp->snd_cwnd,
+ sp->sk_state == TCP_LISTEN ?
+ (fastopenq ? fastopenq->max_qlen : 0) :
+ (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
+ );
+}
+
+static void get_timewait6_sock(struct seq_file *seq,
+ struct inet_timewait_sock *tw, int i)
+{
+ long delta = tw->tw_timer.expires - jiffies;
+ const struct in6_addr *dest, *src;
+ __u16 destp, srcp;
+
+ dest = &tw->tw_v6_daddr;
+ src = &tw->tw_v6_rcv_saddr;
+ destp = ntohs(tw->tw_dport);
+ srcp = ntohs(tw->tw_sport);
+
+ seq_printf(seq,
+ "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+ "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+ i,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp,
+ tw->tw_substate, 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ atomic_read(&tw->tw_refcnt), tw);
+}
+
+static int tcp6_seq_show(struct seq_file *seq, void *v)
+{
+ struct tcp_iter_state *st;
+ struct sock *sk = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ " sl "
+ "local_address "
+ "remote_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt"
+ " uid timeout inode\n");
+ goto out;
+ }
+ st = seq->private;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ case TCP_SEQ_STATE_ESTABLISHED:
+ if (sk->sk_state == TCP_TIME_WAIT)
+ get_timewait6_sock(seq, v, st->num);
+ else
+ get_tcp6_sock(seq, v, st->num);
+ break;
+ case TCP_SEQ_STATE_OPENREQ:
+ get_openreq6(seq, v, st->num, st->uid);
+ break;
+ }
+out:
+ return 0;
+}
+
+static const struct file_operations tcp6_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = tcp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct tcp_seq_afinfo tcp6_seq_afinfo = {
+ .name = "tcp6",
+ .family = AF_INET6,
+ .seq_fops = &tcp6_afinfo_seq_fops,
+ .seq_ops = {
+ .show = tcp6_seq_show,
+ },
+};
+
+int __net_init tcp6_proc_init(struct net *net)
+{
+ return tcp_proc_register(net, &tcp6_seq_afinfo);
+}
+
+void tcp6_proc_exit(struct net *net)
+{
+ tcp_proc_unregister(net, &tcp6_seq_afinfo);
+}
+#endif
+
+static void tcp_v6_clear_sk(struct sock *sk, int size)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* we do not want to clear pinet6 field, because of RCU lookups */
+ sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
+
+ size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
+ memset(&inet->pinet6 + 1, 0, size);
+}
+
+struct proto tcpv6_prot = {
+ .name = "TCPv6",
+ .owner = THIS_MODULE,
+ .close = tcp_close,
+ .connect = tcp_v6_connect,
+ .disconnect = tcp_disconnect,
+ .accept = inet_csk_accept,
+ .ioctl = tcp_ioctl,
+ .init = tcp_v6_init_sock,
+ .destroy = tcp_v6_destroy_sock,
+ .shutdown = tcp_shutdown,
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
+ .backlog_rcv = tcp_v6_do_rcv,
+ .release_cb = tcp_release_cb,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = inet_csk_get_port,
+ .enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
+ .sockets_allocated = &tcp_sockets_allocated,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
+ .orphan_count = &tcp_orphan_count,
+ .sysctl_mem = sysctl_tcp_mem,
+ .sysctl_wmem = sysctl_tcp_wmem,
+ .sysctl_rmem = sysctl_tcp_rmem,
+ .max_header = MAX_TCP_HEADER,
+ .obj_size = sizeof(struct tcp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .twsk_prot = &tcp6_timewait_sock_ops,
+ .rsk_prot = &tcp6_request_sock_ops,
+ .h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_tcp_setsockopt,
+ .compat_getsockopt = compat_tcp_getsockopt,
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+ .proto_cgroup = tcp_proto_cgroup,
+#endif
+ .clear_sk = tcp_v6_clear_sk,
+};
+
+static const struct inet6_protocol tcpv6_protocol = {
+ .early_demux = tcp_v6_early_demux,
+ .handler = tcp_v6_rcv,
+ .err_handler = tcp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static struct inet_protosw tcpv6_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcpv6_prot,
+ .ops = &inet6_stream_ops,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
+};
+
+static int __net_init tcpv6_net_init(struct net *net)
+{
+ return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_TCP, net);
+}
+
+static void __net_exit tcpv6_net_exit(struct net *net)
+{
+ inet_ctl_sock_destroy(net->ipv6.tcp_sk);
+}
+
+static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
+}
+
+static struct pernet_operations tcpv6_net_ops = {
+ .init = tcpv6_net_init,
+ .exit = tcpv6_net_exit,
+ .exit_batch = tcpv6_net_exit_batch,
+};
+
+int __init tcpv6_init(void)
+{
+ int ret;
+
+ ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP);
+ if (ret)
+ goto out;
+
+ /* register inet6 protocol */
+ ret = inet6_register_protosw(&tcpv6_protosw);
+ if (ret)
+ goto out_tcpv6_protocol;
+
+ ret = register_pernet_subsys(&tcpv6_net_ops);
+ if (ret)
+ goto out_tcpv6_protosw;
+out:
+ return ret;
+
+out_tcpv6_protosw:
+ inet6_unregister_protosw(&tcpv6_protosw);
+out_tcpv6_protocol:
+ inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+ goto out;
+}
+
+void tcpv6_exit(void)
+{
+ unregister_pernet_subsys(&tcpv6_net_ops);
+ inet6_unregister_protosw(&tcpv6_protosw);
+ inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+}
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
new file mode 100644
index 000000000..d883c9204
--- /dev/null
+++ b/net/ipv6/tcpv6_offload.c
@@ -0,0 +1,78 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * TCPv6 GSO/GRO support
+ */
+#include <linux/skbuff.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/ip6_checksum.h>
+#include "ip6_offload.h"
+
+static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+ ip6_gro_compute_pseudo)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ return tcp_gro_receive(head, skb);
+}
+
+static int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
+ &iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+
+ return tcp_gro_complete(skb);
+}
+
+static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct tcphdr *th;
+
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
+ /* Set up pseudo header, usually expect stack to have done
+ * this.
+ */
+
+ th->check = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr);
+ }
+
+ return tcp_gso_segment(skb, features);
+}
+static const struct net_offload tcpv6_offload = {
+ .callbacks = {
+ .gso_segment = tcp6_gso_segment,
+ .gro_receive = tcp6_gro_receive,
+ .gro_complete = tcp6_gro_complete,
+ },
+};
+
+int __init tcpv6_offload_init(void)
+{
+ return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP);
+}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
new file mode 100644
index 000000000..3c758007b
--- /dev/null
+++ b/net/ipv6/tunnel6.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Authors Mitsuru KANDA <mk@linux-ipv6.org>
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ */
+
+#define pr_fmt(fmt) "IPv6: " fmt
+
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static DEFINE_MUTEX(tunnel6_mutex);
+
+int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
+{
+ struct xfrm6_tunnel __rcu **pprev;
+ struct xfrm6_tunnel *t;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ mutex_lock(&tunnel6_mutex);
+
+ for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel6_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&tunnel6_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm6_tunnel_register);
+
+int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
+{
+ struct xfrm6_tunnel __rcu **pprev;
+ struct xfrm6_tunnel *t;
+ int ret = -ENOENT;
+
+ mutex_lock(&tunnel6_mutex);
+
+ for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel6_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ mutex_unlock(&tunnel6_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm6_tunnel_deregister);
+
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+static int tunnel6_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int tunnel46_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnel46_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ break;
+}
+
+static const struct inet6_protocol tunnel6_protocol = {
+ .handler = tunnel6_rcv,
+ .err_handler = tunnel6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static const struct inet6_protocol tunnel46_protocol = {
+ .handler = tunnel46_rcv,
+ .err_handler = tunnel6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static int __init tunnel6_init(void)
+{
+ if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ return -EAGAIN;
+ }
+ if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ return -EAGAIN;
+ }
+ return 0;
+}
+
+static void __exit tunnel6_fini(void)
+{
+ if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP))
+ pr_err("%s: can't remove protocol\n", __func__);
+ if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
+ pr_err("%s: can't remove protocol\n", __func__);
+}
+
+module_init(tunnel6_init);
+module_exit(tunnel6_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
new file mode 100644
index 000000000..e51fc3eee
--- /dev/null
+++ b/net/ipv6/udp.c
@@ -0,0 +1,1579 @@
+/*
+ * UDP over IPv6
+ * Linux INET6 implementation
+ *
+ * Authors:
+ * Pedro Roque <roque@di.fc.ul.pt>
+ *
+ * Based on linux/ipv4/udp.c
+ *
+ * Fixes:
+ * Hideaki YOSHIFUJI : sin6_scope_id support
+ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
+ * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
+ * a single port at the same time.
+ * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
+ * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/raw.h>
+#include <net/tcp_states.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+#include <net/inet6_hashtables.h>
+#include <net/busy_poll.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <trace/events/skb.h>
+#include "udp_impl.h"
+
+static u32 udp6_ehashfn(const struct net *net,
+ const struct in6_addr *laddr,
+ const u16 lport,
+ const struct in6_addr *faddr,
+ const __be16 fport)
+{
+ static u32 udp6_ehash_secret __read_mostly;
+ static u32 udp_ipv6_hash_secret __read_mostly;
+
+ u32 lhash, fhash;
+
+ net_get_random_once(&udp6_ehash_secret,
+ sizeof(udp6_ehash_secret));
+ net_get_random_once(&udp_ipv6_hash_secret,
+ sizeof(udp_ipv6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+ fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
+
+ return __inet6_ehashfn(lhash, lport, fhash, fport,
+ udp_ipv6_hash_secret + net_hash_mix(net));
+}
+
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
+{
+ const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+ int sk2_ipv6only = inet_v6_ipv6only(sk2);
+ int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
+ return (!sk2_ipv6only &&
+ (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr ||
+ sk->sk_rcv_saddr == sk2->sk_rcv_saddr));
+
+ if (addr_type2 == IPV6_ADDR_ANY &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (addr_type == IPV6_ADDR_ANY &&
+ !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+ return 1;
+
+ return 0;
+}
+
+static u32 udp6_portaddr_hash(const struct net *net,
+ const struct in6_addr *addr6,
+ unsigned int port)
+{
+ unsigned int hash, mix = net_hash_mix(net);
+
+ if (ipv6_addr_any(addr6))
+ hash = jhash_1word(0, mix);
+ else if (ipv6_addr_v4mapped(addr6))
+ hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix);
+ else
+ hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix);
+
+ return hash ^ port;
+}
+
+int udp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+ unsigned int hash2_nulladdr =
+ udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+ unsigned int hash2_partial =
+ udp6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
+}
+
+static void udp_v6_rehash(struct sock *sk)
+{
+ u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+
+ udp_lib_rehash(sk, new_hash);
+}
+
+static inline int compute_score(struct sock *sk, struct net *net,
+ unsigned short hnum,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ sk->sk_family != PF_INET6)
+ return -1;
+
+ score = 0;
+ inet = inet_sk(sk);
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score++;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+ score++;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
+ return -1;
+ score++;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score++;
+ }
+
+ return score;
+}
+
+#define SCORE2_MAX (1 + 1 + 1)
+static inline int compute_score2(struct sock *sk, struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned short hnum, int dif)
+{
+ int score;
+ struct inet_sock *inet;
+
+ if (!net_eq(sock_net(sk), net) ||
+ udp_sk(sk)->udp_port_hash != hnum ||
+ sk->sk_family != PF_INET6)
+ return -1;
+
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
+ score = 0;
+ inet = inet_sk(sk);
+
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score++;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
+ return -1;
+ score++;
+ }
+
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ return -1;
+ score++;
+ }
+
+ return score;
+}
+
+/* called with read_rcu_lock() */
+static struct sock *udp6_lib_lookup2(struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, unsigned int hnum, int dif,
+ struct udp_hslot *hslot2, unsigned int slot2)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+begin:
+ result = NULL;
+ badness = -1;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ score = compute_score2(sk, net, saddr, sport,
+ daddr, hnum, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ } else if (score == SCORE2_MAX)
+ goto exact_match;
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot2)
+ goto begin;
+
+ if (result) {
+exact_match:
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score2(result, net, saddr, sport,
+ daddr, hnum, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ return result;
+}
+
+struct sock *__udp6_lib_lookup(struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif, struct udp_table *udptable)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(dport);
+ unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+ int score, badness, matches = 0, reuseport = 0;
+ u32 hash = 0;
+
+ rcu_read_lock();
+ if (hslot->count > 10) {
+ hash2 = udp6_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif,
+ hslot2, slot2);
+ if (!result) {
+ hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ &in6addr_any, hnum, dif,
+ hslot2, slot2);
+ }
+ rcu_read_unlock();
+ return result;
+ }
+begin:
+ result = NULL;
+ badness = -1;
+ sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+ score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ hash = udp6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == badness && reuseport) {
+ matches++;
+ if (reciprocal_scale(hash, matches) == 0)
+ result = sk;
+ hash = next_pseudo_random32(hash);
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ if (result) {
+ if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+ result = NULL;
+ else if (unlikely(compute_score(result, net, hnum, saddr, sport,
+ daddr, dport, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
+
+static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport,
+ struct udp_table *udptable)
+{
+ struct sock *sk;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ sk = skb_steal_sock(skb);
+ if (unlikely(sk))
+ return sk;
+ return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport,
+ &iph->daddr, dport, inet6_iif(skb),
+ udptable);
+}
+
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport, int dif)
+{
+ return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+/*
+ * This should be easy, if there is something there we
+ * return it, otherwise we block.
+ */
+
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct sk_buff *skb;
+ unsigned int ulen, copied;
+ int peeked, off = 0;
+ int err;
+ int is_udplite = IS_UDPLITE(sk);
+ int is_udp4;
+ bool slow;
+
+ if (flags & MSG_ERRQUEUE)
+ return ipv6_recv_error(sk, msg, len, addr_len);
+
+ if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
+
+try_again:
+ skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ &peeked, &off, &err);
+ if (!skb)
+ goto out;
+
+ ulen = skb->len - sizeof(struct udphdr);
+ copied = len;
+ if (copied > ulen)
+ copied = ulen;
+ else if (copied < ulen)
+ msg->msg_flags |= MSG_TRUNC;
+
+ is_udp4 = (skb->protocol == htons(ETH_P_IP));
+
+ /*
+ * If checksum is needed at all, try to do it while copying the
+ * data. If the data is truncated, or if we only want a partial
+ * coverage checksum (UDP-Lite), do it before the copy.
+ */
+
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_lib_checksum_complete(skb))
+ goto csum_copy_err;
+ }
+
+ if (skb_csum_unnecessary(skb))
+ err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+ msg, copied);
+ else {
+ err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg);
+ if (err == -EINVAL)
+ goto csum_copy_err;
+ }
+ if (unlikely(err)) {
+ trace_kfree_skb(skb, udpv6_recvmsg);
+ if (!peeked) {
+ atomic_inc(&sk->sk_drops);
+ if (is_udp4)
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS,
+ is_udplite);
+ else
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS,
+ is_udplite);
+ }
+ goto out_free;
+ }
+ if (!peeked) {
+ if (is_udp4)
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
+ else
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ /* Copy the address. */
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = udp_hdr(skb)->source;
+ sin6->sin6_flowinfo = 0;
+
+ if (is_udp4) {
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+ &sin6->sin6_addr);
+ sin6->sin6_scope_id = 0;
+ } else {
+ sin6->sin6_addr = ipv6_hdr(skb)->saddr;
+ sin6->sin6_scope_id =
+ ipv6_iface_scope_id(&sin6->sin6_addr,
+ inet6_iif(skb));
+ }
+ *addr_len = sizeof(*sin6);
+ }
+
+ if (np->rxopt.all)
+ ip6_datagram_recv_common_ctl(sk, msg, skb);
+
+ if (is_udp4) {
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ } else {
+ if (np->rxopt.all)
+ ip6_datagram_recv_specific_ctl(sk, msg, skb);
+ }
+
+ err = copied;
+ if (flags & MSG_TRUNC)
+ err = ulen;
+
+out_free:
+ skb_free_datagram_locked(sk, skb);
+out:
+ return err;
+
+csum_copy_err:
+ slow = lock_sock_fast(sk);
+ if (!skb_kill_datagram(sk, skb, flags)) {
+ if (is_udp4) {
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ } else {
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_CSUMERRORS, is_udplite);
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
+ }
+ }
+ unlock_sock_fast(sk, slow);
+
+ /* starting over for a new packet, but check if we need to yield */
+ cond_resched();
+ msg->msg_flags &= ~MSG_TRUNC;
+ goto try_again;
+}
+
+void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info,
+ struct udp_table *udptable)
+{
+ struct ipv6_pinfo *np;
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+ const struct in6_addr *saddr = &hdr->saddr;
+ const struct in6_addr *daddr = &hdr->daddr;
+ struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ struct sock *sk;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ sk = __udp6_lib_lookup(net, daddr, uh->dest,
+ saddr, uh->source, inet6_iif(skb), udptable);
+ if (!sk) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
+
+ if (type == ICMPV6_PKT_TOOBIG) {
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+ ip6_sk_update_pmtu(skb, sk, info);
+ }
+ if (type == NDISC_REDIRECT) {
+ ip6_sk_redirect(skb, sk);
+ goto out;
+ }
+
+ np = inet6_sk(sk);
+
+ if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
+ goto out;
+
+ if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
+ goto out;
+
+ if (np->recverr)
+ ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
+
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+out:
+ sock_put(sk);
+}
+
+static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+ sock_rps_save_rxhash(sk, skb);
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ }
+
+ rc = sock_queue_rcv_skb(sk, skb);
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
+
+ /* Note that an ENOMEM error is charged twice */
+ if (rc == -ENOMEM)
+ UDP6_INC_STATS_BH(sock_net(sk),
+ UDP_MIB_RCVBUFERRORS, is_udplite);
+ UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ kfree_skb(skb);
+ return -1;
+ }
+ return 0;
+}
+
+static __inline__ void udpv6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt, u8 type,
+ u8 code, int offset, __be32 info)
+{
+ __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+}
+
+static struct static_key udpv6_encap_needed __read_mostly;
+void udpv6_encap_enable(void)
+{
+ if (!static_key_enabled(&udpv6_encap_needed))
+ static_key_slow_inc(&udpv6_encap_needed);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_sock *up = udp_sk(sk);
+ int rc;
+ int is_udplite = IS_UDPLITE(sk);
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto drop;
+
+ if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+ * This is an encapsulation socket so pass the skb to
+ * the socket's udp_encap_rcv() hook. Otherwise, just
+ * fall through and pass this up the UDP socket.
+ * up->encap_rcv() returns the following value:
+ * =0 if skb was successfully passed to the encap
+ * handler or was discarded by it.
+ * >0 if skb should be passed on to UDP.
+ * <0 if skb should be resubmitted as proto -N
+ */
+
+ /* if we're overly short, let UDP handle it */
+ encap_rcv = ACCESS_ONCE(up->encap_rcv);
+ if (skb->len > sizeof(struct udphdr) && encap_rcv) {
+ int ret;
+
+ /* Verify checksum before giving to encap */
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ ret = encap_rcv(sk, skb);
+ if (ret <= 0) {
+ UDP_INC_STATS_BH(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
+ return -ret;
+ }
+ }
+
+ /* FALLTHROUGH -- it's a UDP Packet */
+ }
+
+ /*
+ * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
+ */
+ if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+
+ if (up->pcrlen == 0) { /* full coverage was set */
+ net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
+ UDP_SKB_CB(skb)->cscov, skb->len);
+ goto drop;
+ }
+ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
+ UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ goto drop;
+ }
+ }
+
+ if (rcu_access_pointer(sk->sk_filter)) {
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+ }
+
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
+ UDP6_INC_STATS_BH(sock_net(sk),
+ UDP_MIB_RCVBUFERRORS, is_udplite);
+ goto drop;
+ }
+
+ skb_dst_drop(skb);
+
+ bh_lock_sock(sk);
+ rc = 0;
+ if (!sock_owned_by_user(sk))
+ rc = __udpv6_queue_rcv_skb(sk, skb);
+ else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ bh_unlock_sock(sk);
+ goto drop;
+ }
+ bh_unlock_sock(sk);
+
+ return rc;
+
+csum_error:
+ UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+drop:
+ UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ atomic_inc(&sk->sk_drops);
+ kfree_skb(skb);
+ return -1;
+}
+
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+ __be16 loc_port, const struct in6_addr *loc_addr,
+ __be16 rmt_port, const struct in6_addr *rmt_addr,
+ int dif, unsigned short hnum)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ return false;
+
+ if (udp_sk(sk)->udp_port_hash != hnum ||
+ sk->sk_family != PF_INET6 ||
+ (inet->inet_dport && inet->inet_dport != rmt_port) ||
+ (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+ (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
+ return false;
+ if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+ return false;
+ return true;
+}
+
+static void flush_stack(struct sock **stack, unsigned int count,
+ struct sk_buff *skb, unsigned int final)
+{
+ struct sk_buff *skb1 = NULL;
+ struct sock *sk;
+ unsigned int i;
+
+ for (i = 0; i < count; i++) {
+ sk = stack[i];
+ if (likely(!skb1))
+ skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+ if (!skb1) {
+ atomic_inc(&sk->sk_drops);
+ UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ }
+
+ if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
+ skb1 = NULL;
+ sock_put(sk);
+ }
+ if (unlikely(skb1))
+ kfree_skb(skb1);
+}
+
+static void udp6_csum_zero_error(struct sk_buff *skb)
+{
+ /* RFC 2460 section 8.1 says that we SHOULD log
+ * this error. Well, it is reasonable.
+ */
+ net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n",
+ &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source),
+ &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest));
+}
+
+/*
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
+ */
+static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ struct udp_table *udptable, int proto)
+{
+ struct sock *sk, *stack[256 / sizeof(struct sock *)];
+ const struct udphdr *uh = udp_hdr(skb);
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ int dif = inet6_iif(skb);
+ unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+ bool inner_flushed = false;
+
+ if (use_hash2) {
+ hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+ udp_table.mask;
+ hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+ hslot = &udp_table.hash2[hash2];
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
+
+ spin_lock(&hslot->lock);
+ sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+ if (__udp_v6_is_mcast_sock(net, sk,
+ uh->dest, daddr,
+ uh->source, saddr,
+ dif, hnum) &&
+ /* If zero checksum and no_check is not on for
+ * the socket then skip it.
+ */
+ (uh->check || udp_sk(sk)->no_check6_rx)) {
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ flush_stack(stack, count, skb, ~0);
+ inner_flushed = true;
+ count = 0;
+ }
+ stack[count++] = sk;
+ sock_hold(sk);
+ }
+ }
+
+ spin_unlock(&hslot->lock);
+
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
+ if (count) {
+ flush_stack(stack, count, skb, count - 1);
+ } else {
+ if (!inner_flushed)
+ UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
+ consume_skb(skb);
+ }
+ return 0;
+}
+
+int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ int proto)
+{
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ struct udphdr *uh;
+ const struct in6_addr *saddr, *daddr;
+ u32 ulen = 0;
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto discard;
+
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+ uh = udp_hdr(skb);
+
+ ulen = ntohs(uh->len);
+ if (ulen > skb->len)
+ goto short_packet;
+
+ if (proto == IPPROTO_UDP) {
+ /* UDP validates ulen. */
+
+ /* Check for jumbo payload */
+ if (ulen == 0)
+ ulen = skb->len;
+
+ if (ulen < sizeof(*uh))
+ goto short_packet;
+
+ if (ulen < skb->len) {
+ if (pskb_trim_rcsum(skb, ulen))
+ goto short_packet;
+ saddr = &ipv6_hdr(skb)->saddr;
+ daddr = &ipv6_hdr(skb)->daddr;
+ uh = udp_hdr(skb);
+ }
+ }
+
+ if (udp6_csum_init(skb, uh, proto))
+ goto csum_error;
+
+ /*
+ * Multicast receive code
+ */
+ if (ipv6_addr_is_multicast(daddr))
+ return __udp6_lib_mcast_deliver(net, skb,
+ saddr, daddr, udptable, proto);
+
+ /* Unicast */
+
+ /*
+ * check socket cache ... must talk to Alan about his plans
+ * for sock caches... i'll skip this for now.
+ */
+ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk) {
+ int ret;
+
+ if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+ sock_put(sk);
+ udp6_csum_zero_error(skb);
+ goto csum_error;
+ }
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ ip6_compute_pseudo);
+
+ ret = udpv6_queue_rcv_skb(sk, skb);
+ sock_put(sk);
+
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+
+ return 0;
+ }
+
+ if (!uh->check) {
+ udp6_csum_zero_error(skb);
+ goto csum_error;
+ }
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard;
+
+ if (udp_lib_checksum_complete(skb))
+ goto csum_error;
+
+ UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+
+short_packet:
+ net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
+ proto == IPPROTO_UDPLITE ? "-Lite" : "",
+ saddr, ntohs(uh->source),
+ ulen, skb->len,
+ daddr, ntohs(uh->dest));
+ goto discard;
+csum_error:
+ UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+discard:
+ UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ kfree_skb(skb);
+ return 0;
+}
+
+static __inline__ int udpv6_rcv(struct sk_buff *skb)
+{
+ return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+}
+
+/*
+ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+static void udp_v6_flush_pending_frames(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+
+ if (up->pending == AF_INET)
+ udp_flush_pending_frames(sk);
+ else if (up->pending) {
+ up->len = 0;
+ up->pending = 0;
+ ip6_flush_pending_frames(sk);
+ }
+}
+
+/**
+ * udp6_hwcsum_outgoing - handle outgoing HW checksumming
+ * @sk: socket we are sending on
+ * @skb: sk_buff containing the filled-in UDP header
+ * (checksum field must be zeroed out)
+ */
+static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int len)
+{
+ unsigned int offset;
+ struct udphdr *uh = udp_hdr(skb);
+ struct sk_buff *frags = skb_shinfo(skb)->frag_list;
+ __wsum csum = 0;
+
+ if (!frags) {
+ /* Only one fragment on the socket. */
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
+ } else {
+ /*
+ * HW-checksum won't work as there are two or more
+ * fragments on the socket so that all csums of sk_buffs
+ * should be together
+ */
+ offset = skb_transport_offset(skb);
+ skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ do {
+ csum = csum_add(csum, frags->csum);
+ } while ((frags = frags->next));
+
+ uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP,
+ csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+}
+
+/*
+ * Sending
+ */
+
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+{
+ struct sock *sk = skb->sk;
+ struct udphdr *uh;
+ int err = 0;
+ int is_udplite = IS_UDPLITE(sk);
+ __wsum csum = 0;
+ int offset = skb_transport_offset(skb);
+ int len = skb->len - offset;
+
+ /*
+ * Create a UDP header
+ */
+ uh = udp_hdr(skb);
+ uh->source = fl6->fl6_sport;
+ uh->dest = fl6->fl6_dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ if (is_udplite)
+ csum = udplite_csum(skb);
+ else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+ udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
+ goto send;
+ } else
+ csum = udp_csum(skb);
+
+ /* add protocol-dependent pseudo-header */
+ uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+ len, fl6->flowi6_proto, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+send:
+ err = ip6_send_skb(skb);
+ if (err) {
+ if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ err = 0;
+ }
+ } else
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
+ return err;
+}
+
+static int udp_v6_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct udp_sock *up = udp_sk(sk);
+ struct flowi6 fl6;
+ int err = 0;
+
+ if (up->pending == AF_INET)
+ return udp_push_pending_frames(sk);
+
+ /* ip6_finish_skb will release the cork, so make a copy of
+ * fl6 here.
+ */
+ fl6 = inet_sk(sk)->cork.fl.u.ip6;
+
+ skb = ip6_finish_skb(sk);
+ if (!skb)
+ goto out;
+
+ err = udp_v6_send_skb(skb, &fl6);
+
+out:
+ up->len = 0;
+ up->pending = 0;
+ return err;
+}
+
+int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct ipv6_txoptions opt_space;
+ struct udp_sock *up = udp_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
+ struct in6_addr *daddr, *final_p, final;
+ struct ipv6_txoptions *opt = NULL;
+ struct ip6_flowlabel *flowlabel = NULL;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int addr_len = msg->msg_namelen;
+ int ulen = len;
+ int hlimit = -1;
+ int tclass = -1;
+ int dontfrag = -1;
+ int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int err;
+ int connected = 0;
+ int is_udplite = IS_UDPLITE(sk);
+ int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+
+ /* destination address check */
+ if (sin6) {
+ if (addr_len < offsetof(struct sockaddr, sa_data))
+ return -EINVAL;
+
+ switch (sin6->sin6_family) {
+ case AF_INET6:
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+ daddr = &sin6->sin6_addr;
+ break;
+ case AF_INET:
+ goto do_udp_sendmsg;
+ case AF_UNSPEC:
+ msg->msg_name = sin6 = NULL;
+ msg->msg_namelen = addr_len = 0;
+ daddr = NULL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ } else if (!up->pending) {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+ daddr = &sk->sk_v6_daddr;
+ } else
+ daddr = NULL;
+
+ if (daddr) {
+ if (ipv6_addr_v4mapped(daddr)) {
+ struct sockaddr_in sin;
+ sin.sin_family = AF_INET;
+ sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
+ sin.sin_addr.s_addr = daddr->s6_addr32[3];
+ msg->msg_name = &sin;
+ msg->msg_namelen = sizeof(sin);
+do_udp_sendmsg:
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+ return udp_sendmsg(sk, msg, len);
+ }
+ }
+
+ if (up->pending == AF_INET)
+ return udp_sendmsg(sk, msg, len);
+
+ /* Rough check on arithmetic overflow,
+ better check is made in ip6_append_data().
+ */
+ if (len > INT_MAX - sizeof(struct udphdr))
+ return -EMSGSIZE;
+
+ getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+ if (up->pending) {
+ /*
+ * There are pending frames.
+ * The socket lock must be held while it's corked.
+ */
+ lock_sock(sk);
+ if (likely(up->pending)) {
+ if (unlikely(up->pending != AF_INET6)) {
+ release_sock(sk);
+ return -EAFNOSUPPORT;
+ }
+ dst = NULL;
+ goto do_append_data;
+ }
+ release_sock(sk);
+ }
+ ulen += sizeof(struct udphdr);
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (sin6) {
+ if (sin6->sin6_port == 0)
+ return -EINVAL;
+
+ fl6.fl6_dport = sin6->sin6_port;
+ daddr = &sin6->sin6_addr;
+
+ if (np->sndflow) {
+ fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Otherwise it will be difficult to maintain
+ * sk->sk_dst_cache.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+ daddr = &sk->sk_v6_daddr;
+
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ sin6->sin6_scope_id &&
+ __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
+ fl6.flowi6_oif = sin6->sin6_scope_id;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+
+ fl6.fl6_dport = inet->inet_dport;
+ daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
+ connected = 1;
+ }
+
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
+ fl6.flowi6_mark = sk->sk_mark;
+
+ if (msg->msg_controllen) {
+ opt = &opt_space;
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
+ opt->tot_len = sizeof(*opt);
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
+ &hlimit, &tclass, &dontfrag);
+ if (err < 0) {
+ fl6_sock_release(flowlabel);
+ return err;
+ }
+ if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (!flowlabel)
+ return -EINVAL;
+ }
+ if (!(opt->opt_nflen|opt->opt_flen))
+ opt = NULL;
+ connected = 0;
+ }
+ if (!opt)
+ opt = np->opt;
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = ipv6_fixup_options(&opt_space, opt);
+
+ fl6.flowi6_proto = sk->sk_protocol;
+ if (!ipv6_addr_any(daddr))
+ fl6.daddr = *daddr;
+ else
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ fl6.saddr = np->saddr;
+ fl6.fl6_sport = inet->inet_sport;
+
+ final_p = fl6_update_dst(&fl6, opt, &final);
+ if (final_p)
+ connected = 0;
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
+ fl6.flowi6_oif = np->mcast_oif;
+ connected = 0;
+ } else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto out;
+ }
+
+ if (hlimit < 0)
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ if (tclass < 0)
+ tclass = np->tclass;
+
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
+ /* Lockless fast path for the non-corking case */
+ if (!corkreq) {
+ struct sk_buff *skb;
+
+ skb = ip6_make_skb(sk, getfrag, msg, ulen,
+ sizeof(struct udphdr), hlimit, tclass, opt,
+ &fl6, (struct rt6_info *)dst,
+ msg->msg_flags, dontfrag);
+ err = PTR_ERR(skb);
+ if (!IS_ERR_OR_NULL(skb))
+ err = udp_v6_send_skb(skb, &fl6);
+ goto release_dst;
+ }
+
+ lock_sock(sk);
+ if (unlikely(up->pending)) {
+ /* The socket is already corked while preparing it. */
+ /* ... which is an evident application bug. --ANK */
+ release_sock(sk);
+
+ net_dbg_ratelimited("udp cork app bug 2\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ up->pending = AF_INET6;
+
+do_append_data:
+ if (dontfrag < 0)
+ dontfrag = np->dontfrag;
+ up->len += ulen;
+ err = ip6_append_data(sk, getfrag, msg, ulen,
+ sizeof(struct udphdr), hlimit, tclass, opt, &fl6,
+ (struct rt6_info *)dst,
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag);
+ if (err)
+ udp_v6_flush_pending_frames(sk);
+ else if (!corkreq)
+ err = udp_v6_push_pending_frames(sk);
+ else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+ up->pending = 0;
+
+ if (err > 0)
+ err = np->recverr ? net_xmit_errno(err) : 0;
+ release_sock(sk);
+
+release_dst:
+ if (dst) {
+ if (connected) {
+ ip6_dst_store(sk, dst,
+ ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr) ?
+ &sk->sk_v6_daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+ ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+ &np->saddr :
+#endif
+ NULL);
+ } else {
+ dst_release(dst);
+ }
+ dst = NULL;
+ }
+
+out:
+ dst_release(dst);
+ fl6_sock_release(flowlabel);
+ if (!err)
+ return len;
+ /*
+ * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
+ * ENOBUFS might not be good (it's not tunable per se), but otherwise
+ * we don't have a good statistic (IpOutDiscards but it can be too many
+ * things). We could add another new stat but at least for now that
+ * seems like overkill.
+ */
+ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ UDP6_INC_STATS_USER(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
+ }
+ return err;
+
+do_confirm:
+ dst_confirm(dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
+}
+
+void udpv6_destroy_sock(struct sock *sk)
+{
+ struct udp_sock *up = udp_sk(sk);
+ lock_sock(sk);
+ udp_v6_flush_pending_frames(sk);
+ release_sock(sk);
+
+ if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = ACCESS_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+
+ inet6_destroy_sock(sk);
+}
+
+/*
+ * Socket option code for UDP
+ */
+int udpv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_v6_push_pending_frames);
+ return ipv6_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ udp_v6_push_pending_frames);
+ return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+int udpv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return ipv6_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ if (level == SOL_UDP || level == SOL_UDPLITE)
+ return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+ return compat_ipv6_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static const struct inet6_protocol udpv6_protocol = {
+ .handler = udpv6_rcv,
+ .err_handler = udpv6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+int udp6_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
+ } else {
+ int bucket = ((struct udp_iter_state *)seq->private)->bucket;
+ struct inet_sock *inet = inet_sk(v);
+ __u16 srcp = ntohs(inet->inet_sport);
+ __u16 destp = ntohs(inet->inet_dport);
+ ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
+ }
+ return 0;
+}
+
+static const struct file_operations udp6_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct udp_seq_afinfo udp6_seq_afinfo = {
+ .name = "udp6",
+ .family = AF_INET6,
+ .udp_table = &udp_table,
+ .seq_fops = &udp6_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp6_seq_show,
+ },
+};
+
+int __net_init udp6_proc_init(struct net *net)
+{
+ return udp_proc_register(net, &udp6_seq_afinfo);
+}
+
+void udp6_proc_exit(struct net *net) {
+ udp_proc_unregister(net, &udp6_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+void udp_v6_clear_sk(struct sock *sk, int size)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* we do not want to clear pinet6 field, because of RCU lookups */
+ sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6));
+
+ size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
+ memset(&inet->pinet6 + 1, 0, size);
+}
+
+/* ------------------------------------------------------------------------ */
+
+struct proto udpv6_prot = {
+ .name = "UDPv6",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip6_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = udpv6_destroy_sock,
+ .setsockopt = udpv6_setsockopt,
+ .getsockopt = udpv6_getsockopt,
+ .sendmsg = udpv6_sendmsg,
+ .recvmsg = udpv6_recvmsg,
+ .backlog_rcv = __udpv6_queue_rcv_skb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .rehash = udp_v6_rehash,
+ .get_port = udp_v6_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
+ .obj_size = sizeof(struct udp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udp_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udpv6_setsockopt,
+ .compat_getsockopt = compat_udpv6_getsockopt,
+#endif
+ .clear_sk = udp_v6_clear_sk,
+};
+
+static struct inet_protosw udpv6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udpv6_prot,
+ .ops = &inet6_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+};
+
+int __init udpv6_init(void)
+{
+ int ret;
+
+ ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
+ if (ret)
+ goto out;
+
+ ret = inet6_register_protosw(&udpv6_protosw);
+ if (ret)
+ goto out_udpv6_protocol;
+out:
+ return ret;
+
+out_udpv6_protocol:
+ inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+ goto out;
+}
+
+void udpv6_exit(void)
+{
+ inet6_unregister_protosw(&udpv6_protosw);
+ inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+}
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
new file mode 100644
index 000000000..0682c031c
--- /dev/null
+++ b/net/ipv6/udp_impl.h
@@ -0,0 +1,37 @@
+#ifndef _UDP6_IMPL_H
+#define _UDP6_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/transp_v6.h>
+
+int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
+void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+ __be32, struct udp_table *);
+
+int udp_v6_get_port(struct sock *sk, unsigned short snum);
+
+int udpv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int udpv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+#ifdef CONFIG_COMPAT
+int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+#endif
+int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len);
+int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+void udpv6_destroy_sock(struct sock *sk);
+
+void udp_v6_clear_sk(struct sock *sk, int size);
+
+#ifdef CONFIG_PROC_FS
+int udp6_seq_show(struct seq_file *seq, void *v);
+#endif
+#endif /* _UDP6_IMPL_H */
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
new file mode 100644
index 000000000..7441e1e63
--- /dev/null
+++ b/net/ipv6/udp_offload.c
@@ -0,0 +1,184 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET6 implementation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * UDPv6 GSO support
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+#include "ip6_offload.h"
+
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ unsigned int mss;
+ unsigned int unfrag_ip6hlen, unfrag_len;
+ struct frag_hdr *fptr;
+ u8 *packet_start, *prevhdr;
+ u8 nexthdr;
+ u8 frag_hdr_sz = sizeof(struct frag_hdr);
+ __wsum csum;
+ int tnl_hlen;
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
+
+ if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ int type = skb_shinfo(skb)->gso_type;
+
+ if (unlikely(type & ~(SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_UDP_TUNNEL |
+ SKB_GSO_UDP_TUNNEL_CSUM |
+ SKB_GSO_TUNNEL_REMCSUM |
+ SKB_GSO_GRE |
+ SKB_GSO_GRE_CSUM |
+ SKB_GSO_IPIP |
+ SKB_GSO_SIT) ||
+ !(type & (SKB_GSO_UDP))))
+ goto out;
+
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+ /* Set the IPv6 fragment id if not set yet */
+ if (!skb_shinfo(skb)->ip6_frag_id)
+ ipv6_proxy_select_ident(dev_net(skb->dev), skb);
+
+ segs = NULL;
+ goto out;
+ }
+
+ if (skb->encapsulation && skb_shinfo(skb)->gso_type &
+ (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
+ segs = skb_udp_tunnel_segment(skb, features, true);
+ else {
+ const struct ipv6hdr *ipv6h;
+ struct udphdr *uh;
+
+ if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+ goto out;
+
+ /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+ * do checksum of UDP packets sent as multiple IP fragments.
+ */
+
+ uh = udp_hdr(skb);
+ ipv6h = ipv6_hdr(skb);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
+ &ipv6h->daddr, csum);
+
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Check if there is enough headroom to insert fragment header. */
+ tnl_hlen = skb_tnl_header_len(skb);
+ if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
+ if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
+ goto out;
+ }
+
+ /* Find the unfragmentable header and shift it left by frag_hdr_sz
+ * bytes to insert fragment header.
+ */
+ unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ nexthdr = *prevhdr;
+ *prevhdr = NEXTHDR_FRAGMENT;
+ unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
+ unfrag_ip6hlen + tnl_hlen;
+ packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
+ memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
+
+ SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
+ skb->mac_header -= frag_hdr_sz;
+ skb->network_header -= frag_hdr_sz;
+
+ fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+ fptr->nexthdr = nexthdr;
+ fptr->reserved = 0;
+ if (!skb_shinfo(skb)->ip6_frag_id)
+ ipv6_proxy_select_ident(dev_net(skb->dev), skb);
+ fptr->identification = skb_shinfo(skb)->ip6_frag_id;
+
+ /* Fragment the skb. ipv6 header and the remaining fields of the
+ * fragment header are updated in ipv6_gso_segment()
+ */
+ segs = skb_segment(skb, features);
+ }
+
+out:
+ return segs;
+}
+
+static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+
+ if (unlikely(!uh))
+ goto flush;
+
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (NAPI_GRO_CB(skb)->flush)
+ goto skip;
+
+ if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
+ ip6_gro_compute_pseudo))
+ goto flush;
+ else if (uh->check)
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ ip6_gro_compute_pseudo);
+
+skip:
+ NAPI_GRO_CB(skb)->is_ipv6 = 1;
+ return udp_gro_receive(head, skb, uh);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+
+ if (uh->check) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
+ &ipv6h->daddr, 0);
+ } else {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+ }
+
+ return udp_gro_complete(skb, nhoff);
+}
+
+static const struct net_offload udpv6_offload = {
+ .callbacks = {
+ .gso_segment = udp6_ufo_fragment,
+ .gro_receive = udp6_gro_receive,
+ .gro_complete = udp6_gro_complete,
+ },
+};
+
+int __init udp_offload_init(void)
+{
+ return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
+}
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
new file mode 100644
index 000000000..9cf097e20
--- /dev/null
+++ b/net/ipv6/udplite.c
@@ -0,0 +1,139 @@
+/*
+ * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6.
+ * See also net/ipv4/udplite.c
+ *
+ * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *
+ * Changes:
+ * Fixes:
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/export.h>
+#include "udp_impl.h"
+
+static int udplitev6_rcv(struct sk_buff *skb)
+{
+ return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
+}
+
+static void udplitev6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+}
+
+static const struct inet6_protocol udplitev6_protocol = {
+ .handler = udplitev6_rcv,
+ .err_handler = udplitev6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+struct proto udplitev6_prot = {
+ .name = "UDPLITEv6",
+ .owner = THIS_MODULE,
+ .close = udp_lib_close,
+ .connect = ip6_datagram_connect,
+ .disconnect = udp_disconnect,
+ .ioctl = udp_ioctl,
+ .init = udplite_sk_init,
+ .destroy = udpv6_destroy_sock,
+ .setsockopt = udpv6_setsockopt,
+ .getsockopt = udpv6_getsockopt,
+ .sendmsg = udpv6_sendmsg,
+ .recvmsg = udpv6_recvmsg,
+ .backlog_rcv = udpv6_queue_rcv_skb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+ .get_port = udp_v6_get_port,
+ .obj_size = sizeof(struct udp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .h.udp_table = &udplite_table,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_udpv6_setsockopt,
+ .compat_getsockopt = compat_udpv6_getsockopt,
+#endif
+ .clear_sk = udp_v6_clear_sk,
+};
+
+static struct inet_protosw udplite6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDPLITE,
+ .prot = &udplitev6_prot,
+ .ops = &inet6_dgram_ops,
+ .flags = INET_PROTOSW_PERMANENT,
+};
+
+int __init udplitev6_init(void)
+{
+ int ret;
+
+ ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+ if (ret)
+ goto out;
+
+ ret = inet6_register_protosw(&udplite6_protosw);
+ if (ret)
+ goto out_udplitev6_protocol;
+out:
+ return ret;
+
+out_udplitev6_protocol:
+ inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+ goto out;
+}
+
+void udplitev6_exit(void)
+{
+ inet6_unregister_protosw(&udplite6_protosw);
+ inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+}
+
+#ifdef CONFIG_PROC_FS
+
+static const struct file_operations udplite6_afinfo_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = udp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net
+};
+
+static struct udp_seq_afinfo udplite6_seq_afinfo = {
+ .name = "udplite6",
+ .family = AF_INET6,
+ .udp_table = &udplite_table,
+ .seq_fops = &udplite6_afinfo_seq_fops,
+ .seq_ops = {
+ .show = udp6_seq_show,
+ },
+};
+
+static int __net_init udplite6_proc_init_net(struct net *net)
+{
+ return udp_proc_register(net, &udplite6_seq_afinfo);
+}
+
+static void __net_exit udplite6_proc_exit_net(struct net *net)
+{
+ udp_proc_unregister(net, &udplite6_seq_afinfo);
+}
+
+static struct pernet_operations udplite6_net_ops = {
+ .init = udplite6_proc_init_net,
+ .exit = udplite6_proc_exit_net,
+};
+
+int __init udplite6_proc_init(void)
+{
+ return register_pernet_subsys(&udplite6_net_ops);
+}
+
+void udplite6_proc_exit(void)
+{
+ unregister_pernet_subsys(&udplite6_net_ops);
+}
+#endif
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
new file mode 100644
index 000000000..74bd17882
--- /dev/null
+++ b/net/ipv6/xfrm6_input.c
@@ -0,0 +1,145 @@
+/*
+ * xfrm6_input.c: based on net/ipv4/xfrm4_input.c
+ *
+ * Authors:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * YOSHIFUJI Hideaki @USAGI
+ * IPv6 support
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return xfrm6_extract_header(skb);
+}
+
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ return xfrm_input(skb, nexthdr, spi, 0);
+}
+EXPORT_SYMBOL(xfrm6_rcv_spi);
+
+int xfrm6_transport_finish(struct sk_buff *skb, int async)
+{
+ skb_network_header(skb)[IP6CB(skb)->nhoff] =
+ XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+ if (!async)
+ return 1;
+#endif
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len);
+ __skb_push(skb, skb->data - skb_network_header(skb));
+
+ NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, NULL, skb,
+ skb->dev, NULL,
+ ip6_rcv_finish);
+ return -1;
+}
+
+int xfrm6_rcv(struct sk_buff *skb)
+{
+ return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
+ 0);
+}
+EXPORT_SYMBOL(xfrm6_rcv);
+
+int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
+ xfrm_address_t *saddr, u8 proto)
+{
+ struct net *net = dev_net(skb->dev);
+ struct xfrm_state *x = NULL;
+ int i = 0;
+
+ /* Allocate new secpath or COW existing one. */
+ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+ struct sec_path *sp;
+
+ sp = secpath_dup(skb->sp);
+ if (!sp) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+ goto drop;
+ }
+ if (skb->sp)
+ secpath_put(skb->sp);
+ skb->sp = sp;
+ }
+
+ if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto drop;
+ }
+
+ for (i = 0; i < 3; i++) {
+ xfrm_address_t *dst, *src;
+
+ switch (i) {
+ case 0:
+ dst = daddr;
+ src = saddr;
+ break;
+ case 1:
+ /* lookup state with wild-card source address */
+ dst = daddr;
+ src = (xfrm_address_t *)&in6addr_any;
+ break;
+ default:
+ /* lookup state with wild-card addresses */
+ dst = (xfrm_address_t *)&in6addr_any;
+ src = (xfrm_address_t *)&in6addr_any;
+ break;
+ }
+
+ x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6);
+ if (!x)
+ continue;
+
+ spin_lock(&x->lock);
+
+ if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) &&
+ likely(x->km.state == XFRM_STATE_VALID) &&
+ !xfrm_state_check_expire(x)) {
+ spin_unlock(&x->lock);
+ if (x->type->input(x, skb) > 0) {
+ /* found a valid state */
+ break;
+ }
+ } else
+ spin_unlock(&x->lock);
+
+ xfrm_state_put(x);
+ x = NULL;
+ }
+
+ if (!x) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
+ xfrm_audit_state_notfound_simple(skb, AF_INET6);
+ goto drop;
+ }
+
+ skb->sp->xvec[skb->sp->len++] = x;
+
+ spin_lock(&x->lock);
+
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+
+ spin_unlock(&x->lock);
+
+ return 1;
+
+drop:
+ return -1;
+}
+EXPORT_SYMBOL(xfrm6_input_addr);
diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c
new file mode 100644
index 000000000..1e205c325
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_beet.c
@@ -0,0 +1,131 @@
+/*
+ * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6.
+ *
+ * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
+ * Miika Komu <miika@iki.fi>
+ * Herbert Xu <herbert@gondor.apana.org.au>
+ * Abhinav Pathak <abhinav.pathak@hiit.fi>
+ * Jeff Ahrenholz <ahrenholz@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+static void xfrm6_beet_make_header(struct sk_buff *skb)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ iph->version = 6;
+
+ memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
+ sizeof(iph->flow_lbl));
+ iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol;
+
+ ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos);
+ iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl;
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ */
+static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *top_iph;
+ struct ip_beet_phdr *ph;
+ int optlen, hdr_len;
+
+ hdr_len = 0;
+ optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+ if (unlikely(optlen))
+ hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+ skb_set_network_header(skb, -x->props.header_len - hdr_len);
+ if (x->sel.family != AF_INET6)
+ skb->network_header += IPV4_BEET_PHMAXLEN;
+ skb->mac_header = skb->network_header +
+ offsetof(struct ipv6hdr, nexthdr);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+ ph = (struct ip_beet_phdr *)__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl-hdr_len);
+
+ xfrm6_beet_make_header(skb);
+
+ top_iph = ipv6_hdr(skb);
+ if (unlikely(optlen)) {
+
+ BUG_ON(optlen < 0);
+
+ ph->padlen = 4 - (optlen & 4);
+ ph->hdrlen = optlen / 8;
+ ph->nexthdr = top_iph->nexthdr;
+ if (ph->padlen)
+ memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+ top_iph->nexthdr = IPPROTO_BEETPH;
+ }
+
+ top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+ top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+ return 0;
+}
+
+static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *ip6h;
+ int size = sizeof(struct ipv6hdr);
+ int err;
+
+ err = skb_cow_head(skb, size + skb->mac_len);
+ if (err)
+ goto out;
+
+ __skb_push(skb, size);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ xfrm6_beet_make_header(skb);
+
+ ip6h = ipv6_hdr(skb);
+ ip6h->payload_len = htons(skb->len - size);
+ ip6h->daddr = x->sel.daddr.in6;
+ ip6h->saddr = x->sel.saddr.in6;
+ err = 0;
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm6_beet_mode = {
+ .input2 = xfrm6_beet_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm6_beet_output,
+ .output = xfrm6_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_BEET,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm6_beet_init(void)
+{
+ return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6);
+}
+
+static void __exit xfrm6_beet_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6);
+ BUG_ON(err);
+}
+
+module_init(xfrm6_beet_init);
+module_exit(xfrm6_beet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET);
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
new file mode 100644
index 000000000..0e015906f
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -0,0 +1,83 @@
+/*
+ * xfrm6_mode_ro.c - Route optimization mode for IPv6.
+ *
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Authors:
+ * Noriaki TAKAMIYA @USAGI
+ * Masahide NAKAMURA @USAGI
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stringify.h>
+#include <linux/time.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add route optimization header space.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the route optimization header.
+ */
+static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *iph;
+ u8 *prevhdr;
+ int hdr_len;
+
+ iph = ipv6_hdr(skb);
+
+ hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+ skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->transport_header = skb->network_header + hdr_len;
+ __skb_pull(skb, hdr_len);
+ memmove(ipv6_hdr(skb), iph, hdr_len);
+
+ x->lastused = get_seconds();
+
+ return 0;
+}
+
+static struct xfrm_mode xfrm6_ro_mode = {
+ .output = xfrm6_ro_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_ROUTEOPTIMIZATION,
+};
+
+static int __init xfrm6_ro_init(void)
+{
+ return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6);
+}
+
+static void __exit xfrm6_ro_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6);
+ BUG_ON(err);
+}
+
+module_init(xfrm6_ro_init);
+module_exit(xfrm6_ro_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 000000000..4e344105b
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,85 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ */
+static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipv6hdr *iph;
+ u8 *prevhdr;
+ int hdr_len;
+
+ iph = ipv6_hdr(skb);
+
+ hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+ skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->transport_header = skb->network_header + hdr_len;
+ __skb_pull(skb, hdr_len);
+ memmove(ipv6_hdr(skb), iph, hdr_len);
+ return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->h shall point to where the IP header should be and skb->nh
+ * shall be set to where the IP header currently is. skb->data shall point
+ * to the start of the payload.
+ */
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int ihl = skb->data - skb_transport_header(skb);
+
+ if (skb->transport_header != skb->network_header) {
+ memmove(skb_transport_header(skb),
+ skb_network_header(skb), ihl);
+ skb->network_header = skb->transport_header;
+ }
+ ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
+ sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+ return 0;
+}
+
+static struct xfrm_mode xfrm6_transport_mode = {
+ .input = xfrm6_transport_input,
+ .output = xfrm6_transport_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm6_transport_init(void)
+{
+ return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
+}
+
+static void __exit xfrm6_transport_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+ BUG_ON(err);
+}
+
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 000000000..901ef6f8a
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,126 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
+{
+ const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
+ struct ipv6hdr *inner_iph = ipipv6_hdr(skb);
+
+ if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph)))
+ IP6_ECN_set_ce(inner_iph);
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *top_iph;
+ int dsfield;
+
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct ipv6hdr, nexthdr);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+ top_iph = ipv6_hdr(skb);
+
+ top_iph->version = 6;
+
+ memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
+ sizeof(top_iph->flow_lbl));
+ top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+ if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
+ dsfield = 0;
+ else
+ dsfield = XFRM_MODE_SKB_CB(skb)->tos;
+ dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos);
+ if (x->props.flags & XFRM_STATE_NOECN)
+ dsfield &= ~INET_ECN_MASK;
+ ipv6_change_dsfield(top_iph, 0, dsfield);
+ top_iph->hop_limit = ip6_dst_hoplimit(dst->child);
+ top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+ top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+ return 0;
+}
+
+#define for_each_input_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next))
+
+
+static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = -EINVAL;
+
+ if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6)
+ goto out;
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto out;
+
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
+ goto out;
+
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)),
+ ipipv6_hdr(skb));
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ ipip6_ecn_decapsulate(skb);
+
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_mode xfrm6_tunnel_mode = {
+ .input2 = xfrm6_mode_tunnel_input,
+ .input = xfrm_prepare_input,
+ .output2 = xfrm6_mode_tunnel_output,
+ .output = xfrm6_prepare_output,
+ .owner = THIS_MODULE,
+ .encap = XFRM_MODE_TUNNEL,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm6_mode_tunnel_init(void)
+{
+ return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
+}
+
+static void __exit xfrm6_mode_tunnel_exit(void)
+{
+ int err;
+
+ err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+ BUG_ON(err);
+}
+
+module_init(xfrm6_mode_tunnel_init);
+module_exit(xfrm6_mode_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
new file mode 100644
index 000000000..09c76a7b4
--- /dev/null
+++ b/net/ipv6/xfrm6_output.c
@@ -0,0 +1,174 @@
+/*
+ * xfrm6_output.c - Common IPsec encapsulation code for IPv6.
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/xfrm.h>
+
+int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
+ u8 **prevhdr)
+{
+ return ip6_find_1stfragopt(skb, prevhdr);
+}
+EXPORT_SYMBOL(xfrm6_find_1stfragopt);
+
+static int xfrm6_local_dontfrag(struct sk_buff *skb)
+{
+ int proto;
+ struct sock *sk = skb->sk;
+
+ if (sk) {
+ if (sk->sk_family != AF_INET6)
+ return 0;
+
+ proto = sk->sk_protocol;
+ if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
+ return inet6_sk(sk)->dontfrag;
+ }
+
+ return 0;
+}
+
+static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
+{
+ struct flowi6 fl6;
+ struct sock *sk = skb->sk;
+
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.daddr = ipv6_hdr(skb)->daddr;
+
+ ipv6_local_rxpmtu(sk, &fl6, mtu);
+}
+
+void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
+{
+ struct flowi6 fl6;
+ const struct ipv6hdr *hdr;
+ struct sock *sk = skb->sk;
+
+ hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);
+ fl6.fl6_dport = inet_sk(sk)->inet_dport;
+ fl6.daddr = hdr->daddr;
+
+ ipv6_local_error(sk, EMSGSIZE, &fl6, mtu);
+}
+
+static int xfrm6_tunnel_check_size(struct sk_buff *skb)
+{
+ int mtu, ret = 0;
+ struct dst_entry *dst = skb_dst(skb);
+
+ mtu = dst_mtu(dst);
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ if (!skb->ignore_df && skb->len > mtu) {
+ skb->dev = dst->dev;
+
+ if (xfrm6_local_dontfrag(skb))
+ xfrm6_local_rxpmtu(skb, mtu);
+ else if (skb->sk)
+ xfrm_local_error(skb, mtu);
+ else
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ ret = -EMSGSIZE;
+ }
+
+ return ret;
+}
+
+int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm6_tunnel_check_size(skb);
+ if (err)
+ return err;
+
+ XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr;
+
+ return xfrm6_extract_header(skb);
+}
+
+int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ err = xfrm_inner_extract_output(x, skb);
+ if (err)
+ return err;
+
+ skb->ignore_df = 1;
+ skb->protocol = htons(ETH_P_IPV6);
+
+ return x->outer_mode->output2(x, skb);
+}
+EXPORT_SYMBOL(xfrm6_prepare_output);
+
+int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb)
+{
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+
+#ifdef CONFIG_NETFILTER
+ IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
+#endif
+
+ return xfrm_output(sk, skb);
+}
+
+static int __xfrm6_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+ int mtu;
+
+#ifdef CONFIG_NETFILTER
+ if (!x) {
+ IP6CB(skb)->flags |= IP6SKB_REROUTED;
+ return dst_output_sk(sk, skb);
+ }
+#endif
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ mtu = ip6_skb_dst_mtu(skb);
+ else
+ mtu = dst_mtu(skb_dst(skb));
+
+ if (skb->len > mtu && xfrm6_local_dontfrag(skb)) {
+ xfrm6_local_rxpmtu(skb, mtu);
+ return -EMSGSIZE;
+ } else if (!skb->ignore_df && skb->len > mtu && skb->sk) {
+ xfrm_local_error(skb, mtu);
+ return -EMSGSIZE;
+ }
+
+ if (x->props.mode == XFRM_MODE_TUNNEL &&
+ ((skb->len > mtu && !skb_is_gso(skb)) ||
+ dst_allfrag(skb_dst(skb)))) {
+ return ip6_fragment(sk, skb,
+ x->outer_mode->afinfo->output_finish);
+ }
+ return x->outer_mode->afinfo->output_finish(sk, skb);
+}
+
+int xfrm6_output(struct sock *sk, struct sk_buff *skb)
+{
+ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
+ NULL, skb_dst(skb)->dev, __xfrm6_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
new file mode 100644
index 000000000..f337a908a
--- /dev/null
+++ b/net/ipv6/xfrm6_policy.c
@@ -0,0 +1,426 @@
+/*
+ * xfrm6_policy.c: based on xfrm4_policy.c
+ *
+ * Authors:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * IPv6 support
+ * YOSHIFUJI Hideaki
+ * Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+#include <net/mip6.h>
+#endif
+
+static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
+
+static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int err;
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
+ if (saddr)
+ memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
+
+ dst = ip6_route_output(net, NULL, &fl6);
+
+ err = dst->error;
+ if (dst->error) {
+ dst_release(dst);
+ dst = ERR_PTR(err);
+ }
+
+ return dst;
+}
+
+static int xfrm6_get_saddr(struct net *net,
+ xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+ struct dst_entry *dst;
+ struct net_device *dev;
+
+ dst = xfrm6_dst_lookup(net, 0, NULL, daddr);
+ if (IS_ERR(dst))
+ return -EHOSTUNREACH;
+
+ dev = ip6_dst_idev(dst)->dev;
+ ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6);
+ dst_release(dst);
+ return 0;
+}
+
+static int xfrm6_get_tos(const struct flowi *fl)
+{
+ return 0;
+}
+
+static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst)
+{
+ struct rt6_info *rt = (struct rt6_info *)xdst;
+
+ rt6_init_peer(rt, net->ipv6.peers);
+}
+
+static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
+{
+ if (dst->ops->family == AF_INET6) {
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ if (rt->rt6i_node)
+ path->path_cookie = rt->rt6i_node->fn_sernum;
+ }
+
+ path->u.rt6.rt6i_nfheader_len = nfheader_len;
+
+ return 0;
+}
+
+static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ const struct flowi *fl)
+{
+ struct rt6_info *rt = (struct rt6_info *)xdst->route;
+
+ xdst->u.dst.dev = dev;
+ dev_hold(dev);
+
+ xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
+ if (!xdst->u.rt6.rt6i_idev) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ rt6_transfer_peer(&xdst->u.rt6, rt);
+
+ /* Sheit... I remember I did this right. Apparently,
+ * it was magically lost, so this code needs audit */
+ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
+ RTF_LOCAL);
+ xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
+ xdst->u.rt6.rt6i_node = rt->rt6i_node;
+ if (rt->rt6i_node)
+ xdst->route_cookie = rt->rt6i_node->fn_sernum;
+ xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
+ xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
+ xdst->u.rt6.rt6i_src = rt->rt6i_src;
+
+ return 0;
+}
+
+static inline void
+_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
+{
+ struct flowi6 *fl6 = &fl->u.ip6;
+ int onlyproto = 0;
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ u16 offset = sizeof(*hdr);
+ struct ipv6_opt_hdr *exthdr;
+ const unsigned char *nh = skb_network_header(skb);
+ u16 nhoff = IP6CB(skb)->nhoff;
+ int oif = 0;
+ u8 nexthdr;
+
+ if (!nhoff)
+ nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ nexthdr = nh[nhoff];
+
+ if (skb_dst(skb))
+ oif = skb_dst(skb)->dev->ifindex;
+
+ memset(fl6, 0, sizeof(struct flowi6));
+ fl6->flowi6_mark = skb->mark;
+ fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
+
+ fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
+ fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
+
+ while (nh + offset + 1 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
+ nh = skb_network_header(skb);
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ onlyproto = 1;
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
+ case NEXTHDR_DEST:
+ offset += ipv6_optlen(exthdr);
+ nexthdr = exthdr->nexthdr;
+ exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+ break;
+
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_DCCP:
+ if (!onlyproto && (nh + offset + 4 < skb->data ||
+ pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
+ __be16 *ports;
+
+ nh = skb_network_header(skb);
+ ports = (__be16 *)(nh + offset);
+ fl6->fl6_sport = ports[!!reverse];
+ fl6->fl6_dport = ports[!reverse];
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+
+ case IPPROTO_ICMPV6:
+ if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
+ u8 *icmp;
+
+ nh = skb_network_header(skb);
+ icmp = (u8 *)(nh + offset);
+ fl6->fl6_icmp_type = icmp[0];
+ fl6->fl6_icmp_code = icmp[1];
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPPROTO_MH:
+ offset += ipv6_optlen(exthdr);
+ if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
+ struct ip6_mh *mh;
+
+ nh = skb_network_header(skb);
+ mh = (struct ip6_mh *)(nh + offset);
+ fl6->fl6_mh_type = mh->ip6mh_type;
+ }
+ fl6->flowi6_proto = nexthdr;
+ return;
+#endif
+
+ /* XXX Why are there these headers? */
+ case IPPROTO_AH:
+ case IPPROTO_ESP:
+ case IPPROTO_COMP:
+ default:
+ fl6->fl6_ipsec_spi = 0;
+ fl6->flowi6_proto = nexthdr;
+ return;
+ }
+ }
+}
+
+static inline int xfrm6_garbage_collect(struct dst_ops *ops)
+{
+ struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
+
+ xfrm6_policy_afinfo.garbage_collect(net);
+ return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
+}
+
+static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->update_pmtu(path, sk, skb, mtu);
+}
+
+static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ struct dst_entry *path = xdst->route;
+
+ path->ops->redirect(path, sk, skb);
+}
+
+static void xfrm6_dst_destroy(struct dst_entry *dst)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+ if (likely(xdst->u.rt6.rt6i_idev))
+ in6_dev_put(xdst->u.rt6.rt6i_idev);
+ dst_destroy_metrics_generic(dst);
+ if (rt6_has_peer(&xdst->u.rt6)) {
+ struct inet_peer *peer = rt6_peer_ptr(&xdst->u.rt6);
+ inet_putpeer(peer);
+ }
+ xfrm_dst_destroy(xdst);
+}
+
+static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+ int unregister)
+{
+ struct xfrm_dst *xdst;
+
+ if (!unregister)
+ return;
+
+ xdst = (struct xfrm_dst *)dst;
+ if (xdst->u.rt6.rt6i_idev->dev == dev) {
+ struct inet6_dev *loopback_idev =
+ in6_dev_get(dev_net(dev)->loopback_dev);
+ BUG_ON(!loopback_idev);
+
+ do {
+ in6_dev_put(xdst->u.rt6.rt6i_idev);
+ xdst->u.rt6.rt6i_idev = loopback_idev;
+ in6_dev_hold(loopback_idev);
+ xdst = (struct xfrm_dst *)xdst->u.dst.child;
+ } while (xdst->u.dst.xfrm);
+
+ __in6_dev_put(loopback_idev);
+ }
+
+ xfrm_dst_ifdown(dst, dev);
+}
+
+static struct dst_ops xfrm6_dst_ops = {
+ .family = AF_INET6,
+ .gc = xfrm6_garbage_collect,
+ .update_pmtu = xfrm6_update_pmtu,
+ .redirect = xfrm6_redirect,
+ .cow_metrics = dst_cow_metrics_generic,
+ .destroy = xfrm6_dst_destroy,
+ .ifdown = xfrm6_dst_ifdown,
+ .local_out = __ip6_local_out,
+ .gc_thresh = 32768,
+};
+
+static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
+ .family = AF_INET6,
+ .dst_ops = &xfrm6_dst_ops,
+ .dst_lookup = xfrm6_dst_lookup,
+ .get_saddr = xfrm6_get_saddr,
+ .decode_session = _decode_session6,
+ .get_tos = xfrm6_get_tos,
+ .init_dst = xfrm6_init_dst,
+ .init_path = xfrm6_init_path,
+ .fill_dst = xfrm6_fill_dst,
+ .blackhole_route = ip6_blackhole_route,
+};
+
+static int __init xfrm6_policy_init(void)
+{
+ return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+}
+
+static void xfrm6_policy_fini(void)
+{
+ xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo);
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm6_policy_table[] = {
+ {
+ .procname = "xfrm6_gc_thresh",
+ .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static int __net_init xfrm6_net_init(struct net *net)
+{
+ struct ctl_table *table;
+ struct ctl_table_header *hdr;
+
+ table = xfrm6_policy_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(xfrm6_policy_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh;
+ }
+
+ hdr = register_net_sysctl(net, "net/ipv6", table);
+ if (!hdr)
+ goto err_reg;
+
+ net->ipv6.sysctl.xfrm6_hdr = hdr;
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit xfrm6_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ if (!net->ipv6.sysctl.xfrm6_hdr)
+ return;
+
+ table = net->ipv6.sysctl.xfrm6_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->ipv6.sysctl.xfrm6_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations xfrm6_net_ops = {
+ .init = xfrm6_net_init,
+ .exit = xfrm6_net_exit,
+};
+#endif
+
+int __init xfrm6_init(void)
+{
+ int ret;
+
+ dst_entries_init(&xfrm6_dst_ops);
+
+ ret = xfrm6_policy_init();
+ if (ret) {
+ dst_entries_destroy(&xfrm6_dst_ops);
+ goto out;
+ }
+ ret = xfrm6_state_init();
+ if (ret)
+ goto out_policy;
+
+ ret = xfrm6_protocol_init();
+ if (ret)
+ goto out_state;
+
+#ifdef CONFIG_SYSCTL
+ register_pernet_subsys(&xfrm6_net_ops);
+#endif
+out:
+ return ret;
+out_state:
+ xfrm6_state_fini();
+out_policy:
+ xfrm6_policy_fini();
+ goto out;
+}
+
+void xfrm6_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+ unregister_pernet_subsys(&xfrm6_net_ops);
+#endif
+ xfrm6_protocol_fini();
+ xfrm6_policy_fini();
+ xfrm6_state_fini();
+ dst_entries_destroy(&xfrm6_dst_ops);
+}
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
new file mode 100644
index 000000000..54d13f8db
--- /dev/null
+++ b/net/ipv6/xfrm6_protocol.c
@@ -0,0 +1,279 @@
+/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6.
+ *
+ * Copyright (C) 2013 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Based on:
+ * net/ipv4/xfrm4_protocol.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm6_protocol __rcu *esp6_handlers __read_mostly;
+static struct xfrm6_protocol __rcu *ah6_handlers __read_mostly;
+static struct xfrm6_protocol __rcu *ipcomp6_handlers __read_mostly;
+static DEFINE_MUTEX(xfrm6_protocol_mutex);
+
+static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp6_handlers;
+ case IPPROTO_AH:
+ return &ah6_handlers;
+ case IPPROTO_COMP:
+ return &ipcomp6_handlers;
+ }
+
+ return NULL;
+}
+
+#define for_each_protocol_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
+int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+ struct xfrm6_protocol __rcu **head = proto_handlers(protocol);
+
+ if (!head)
+ return 0;
+
+ for_each_protocol_rcu(*proto_handlers(protocol), handler)
+ if ((ret = handler->cb_handler(skb, err)) <= 0)
+ return ret;
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm6_rcv_cb);
+
+static int xfrm6_esp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+ for_each_protocol_rcu(esp6_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_protocol *handler;
+
+ for_each_protocol_rcu(esp6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ break;
+}
+
+static int xfrm6_ah_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+ for_each_protocol_rcu(ah6_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_protocol *handler;
+
+ for_each_protocol_rcu(ah6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ break;
+}
+
+static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+
+ for_each_protocol_rcu(ipcomp6_handlers, handler)
+ if ((ret = handler->handler(skb)) != -EINVAL)
+ return ret;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+ kfree_skb(skb);
+ return 0;
+}
+
+static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_protocol *handler;
+
+ for_each_protocol_rcu(ipcomp6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ break;
+}
+
+static const struct inet6_protocol esp6_protocol = {
+ .handler = xfrm6_esp_rcv,
+ .err_handler = xfrm6_esp_err,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol ah6_protocol = {
+ .handler = xfrm6_ah_rcv,
+ .err_handler = xfrm6_ah_err,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+static const struct inet6_protocol ipcomp6_protocol = {
+ .handler = xfrm6_ipcomp_rcv,
+ .err_handler = xfrm6_ipcomp_err,
+ .flags = INET6_PROTO_NOPOLICY,
+};
+
+static struct xfrm_input_afinfo xfrm6_input_afinfo = {
+ .family = AF_INET6,
+ .owner = THIS_MODULE,
+ .callback = xfrm6_rcv_cb,
+};
+
+static inline const struct inet6_protocol *netproto(unsigned char protocol)
+{
+ switch (protocol) {
+ case IPPROTO_ESP:
+ return &esp6_protocol;
+ case IPPROTO_AH:
+ return &ah6_protocol;
+ case IPPROTO_COMP:
+ return &ipcomp6_protocol;
+ }
+
+ return NULL;
+}
+
+int xfrm6_protocol_register(struct xfrm6_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm6_protocol __rcu **pprev;
+ struct xfrm6_protocol *t;
+ bool add_netproto = false;
+ int ret = -EEXIST;
+ int priority = handler->priority;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm6_protocol_mutex);
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm6_protocol_mutex)))
+ add_netproto = true;
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority < priority)
+ break;
+ if (t->priority == priority)
+ goto err;
+ }
+
+ handler->next = *pprev;
+ rcu_assign_pointer(*pprev, handler);
+
+ ret = 0;
+
+err:
+ mutex_unlock(&xfrm6_protocol_mutex);
+
+ if (add_netproto) {
+ if (inet6_add_protocol(netproto(protocol), protocol)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm6_protocol_register);
+
+int xfrm6_protocol_deregister(struct xfrm6_protocol *handler,
+ unsigned char protocol)
+{
+ struct xfrm6_protocol __rcu **pprev;
+ struct xfrm6_protocol *t;
+ int ret = -ENOENT;
+
+ if (!proto_handlers(protocol) || !netproto(protocol))
+ return -EINVAL;
+
+ mutex_lock(&xfrm6_protocol_mutex);
+
+ for (pprev = proto_handlers(protocol);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&xfrm6_protocol_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
+ *pprev = handler->next;
+ ret = 0;
+ break;
+ }
+ }
+
+ if (!rcu_dereference_protected(*proto_handlers(protocol),
+ lockdep_is_held(&xfrm6_protocol_mutex))) {
+ if (inet6_del_protocol(netproto(protocol), protocol) < 0) {
+ pr_err("%s: can't remove protocol\n", __func__);
+ ret = -EAGAIN;
+ }
+ }
+
+ mutex_unlock(&xfrm6_protocol_mutex);
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(xfrm6_protocol_deregister);
+
+int __init xfrm6_protocol_init(void)
+{
+ return xfrm_input_register_afinfo(&xfrm6_input_afinfo);
+}
+
+void xfrm6_protocol_fini(void)
+{
+ xfrm_input_unregister_afinfo(&xfrm6_input_afinfo);
+}
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
new file mode 100644
index 000000000..8a1f9c0d2
--- /dev/null
+++ b/net/ipv6/xfrm6_state.c
@@ -0,0 +1,198 @@
+/*
+ * xfrm6_state.c: based on xfrm4_state.c
+ *
+ * Authors:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * IPv6 support
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/export.h>
+#include <net/dsfield.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+
+static void
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi6 *fl6 = &fl->u.ip6;
+
+ /* Initialize temporary selector matching only
+ * to current session. */
+ *(struct in6_addr *)&sel->daddr = fl6->daddr;
+ *(struct in6_addr *)&sel->saddr = fl6->saddr;
+ sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET6;
+ sel->prefixlen_d = 128;
+ sel->prefixlen_s = 128;
+ sel->proto = fl6->flowi6_proto;
+ sel->ifindex = fl6->flowi6_oif;
+}
+
+static void
+xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr)
+{
+ x->id = tmpl->id;
+ if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
+ memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
+ memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
+ if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
+ memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
+ x->props.mode = tmpl->mode;
+ x->props.reqid = tmpl->reqid;
+ x->props.family = AF_INET6;
+}
+
+/* distribution counting sort function for xfrm_state and xfrm_tmpl */
+static int
+__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
+{
+ int i;
+ int class[XFRM_MAX_DEPTH];
+ int count[maxclass];
+
+ memset(count, 0, sizeof(count));
+
+ for (i = 0; i < n; i++) {
+ int c;
+ class[i] = c = cmp(src[i]);
+ count[c]++;
+ }
+
+ for (i = 2; i < maxclass; i++)
+ count[i] += count[i - 1];
+
+ for (i = 0; i < n; i++) {
+ dst[count[class[i] - 1]++] = src[i];
+ src[i] = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * Rule for xfrm_state:
+ *
+ * rule 1: select IPsec transport except AH
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec transport AH
+ * rule 4: select IPsec tunnel
+ * rule 5: others
+ */
+static int __xfrm6_state_sort_cmp(void *p)
+{
+ struct xfrm_state *v = p;
+
+ switch (v->props.mode) {
+ case XFRM_MODE_TRANSPORT:
+ if (v->id.proto != IPPROTO_AH)
+ return 1;
+ else
+ return 3;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_IN_TRIGGER:
+ return 2;
+#endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
+ return 4;
+ }
+ return 5;
+}
+
+static int
+__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
+{
+ return __xfrm6_sort((void **)dst, (void **)src, n,
+ __xfrm6_state_sort_cmp, 6);
+}
+
+/*
+ * Rule for xfrm_tmpl:
+ *
+ * rule 1: select IPsec transport
+ * rule 2: select MIPv6 RO or inbound trigger
+ * rule 3: select IPsec tunnel
+ * rule 4: others
+ */
+static int __xfrm6_tmpl_sort_cmp(void *p)
+{
+ struct xfrm_tmpl *v = p;
+ switch (v->mode) {
+ case XFRM_MODE_TRANSPORT:
+ return 1;
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_IN_TRIGGER:
+ return 2;
+#endif
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
+ return 3;
+ }
+ return 4;
+}
+
+static int
+__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
+{
+ return __xfrm6_sort((void **)dst, (void **)src, n,
+ __xfrm6_tmpl_sort_cmp, 5);
+}
+
+int xfrm6_extract_header(struct sk_buff *skb)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
+ XFRM_MODE_SKB_CB(skb)->id = 0;
+ XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF);
+ XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph);
+ XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit;
+ XFRM_MODE_SKB_CB(skb)->optlen = 0;
+ memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl,
+ sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
+
+ return 0;
+}
+
+static struct xfrm_state_afinfo xfrm6_state_afinfo = {
+ .family = AF_INET6,
+ .proto = IPPROTO_IPV6,
+ .eth_proto = htons(ETH_P_IPV6),
+ .owner = THIS_MODULE,
+ .init_tempsel = __xfrm6_init_tempsel,
+ .init_temprop = xfrm6_init_temprop,
+ .tmpl_sort = __xfrm6_tmpl_sort,
+ .state_sort = __xfrm6_state_sort,
+ .output = xfrm6_output,
+ .output_finish = xfrm6_output_finish,
+ .extract_input = xfrm6_extract_input,
+ .extract_output = xfrm6_extract_output,
+ .transport_finish = xfrm6_transport_finish,
+ .local_error = xfrm6_local_error,
+};
+
+int __init xfrm6_state_init(void)
+{
+ return xfrm_state_register_afinfo(&xfrm6_state_afinfo);
+}
+
+void xfrm6_state_fini(void)
+{
+ xfrm_state_unregister_afinfo(&xfrm6_state_afinfo);
+}
+
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
new file mode 100644
index 000000000..5743044cd
--- /dev/null
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Authors Mitsuru KANDA <mk@linux-ipv6.org>
+ * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ * Based on net/ipv4/xfrm4_tunnel.c
+ *
+ */
+#include <linux/module.h>
+#include <linux/xfrm.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ipv6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+#include <net/netns/generic.h>
+
+#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
+#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
+
+#define XFRM6_TUNNEL_SPI_MIN 1
+#define XFRM6_TUNNEL_SPI_MAX 0xffffffff
+
+struct xfrm6_tunnel_net {
+ struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE];
+ struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE];
+ u32 spi;
+};
+
+static int xfrm6_tunnel_net_id __read_mostly;
+static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
+{
+ return net_generic(net, xfrm6_tunnel_net_id);
+}
+
+/*
+ * xfrm_tunnel_spi things are for allocating unique id ("spi")
+ * per xfrm_address_t.
+ */
+struct xfrm6_tunnel_spi {
+ struct hlist_node list_byaddr;
+ struct hlist_node list_byspi;
+ xfrm_address_t addr;
+ u32 spi;
+ atomic_t refcnt;
+ struct rcu_head rcu_head;
+};
+
+static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock);
+
+static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly;
+
+static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr)
+{
+ unsigned int h;
+
+ h = ipv6_addr_hash((const struct in6_addr *)addr);
+ h ^= h >> 16;
+ h ^= h >> 8;
+ h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1;
+
+ return h;
+}
+
+static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi)
+{
+ return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE;
+}
+
+static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
+{
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ struct xfrm6_tunnel_spi *x6spi;
+
+ hlist_for_each_entry_rcu(x6spi,
+ &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
+ list_byaddr) {
+ if (xfrm6_addr_equal(&x6spi->addr, saddr))
+ return x6spi;
+ }
+
+ return NULL;
+}
+
+__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
+{
+ struct xfrm6_tunnel_spi *x6spi;
+ u32 spi;
+
+ rcu_read_lock_bh();
+ x6spi = __xfrm6_tunnel_spi_lookup(net, saddr);
+ spi = x6spi ? x6spi->spi : 0;
+ rcu_read_unlock_bh();
+ return htonl(spi);
+}
+EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup);
+
+static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi)
+{
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ struct xfrm6_tunnel_spi *x6spi;
+ int index = xfrm6_tunnel_spi_hash_byspi(spi);
+
+ hlist_for_each_entry(x6spi,
+ &xfrm6_tn->spi_byspi[index],
+ list_byspi) {
+ if (x6spi->spi == spi)
+ return -1;
+ }
+ return index;
+}
+
+static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
+{
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ u32 spi;
+ struct xfrm6_tunnel_spi *x6spi;
+ int index;
+
+ if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN ||
+ xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX)
+ xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN;
+ else
+ xfrm6_tn->spi++;
+
+ for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) {
+ index = __xfrm6_tunnel_spi_check(net, spi);
+ if (index >= 0)
+ goto alloc_spi;
+ }
+ for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) {
+ index = __xfrm6_tunnel_spi_check(net, spi);
+ if (index >= 0)
+ goto alloc_spi;
+ }
+ spi = 0;
+ goto out;
+alloc_spi:
+ xfrm6_tn->spi = spi;
+ x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC);
+ if (!x6spi)
+ goto out;
+
+ memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr));
+ x6spi->spi = spi;
+ atomic_set(&x6spi->refcnt, 1);
+
+ hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]);
+
+ index = xfrm6_tunnel_spi_hash_byaddr(saddr);
+ hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]);
+out:
+ return spi;
+}
+
+__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
+{
+ struct xfrm6_tunnel_spi *x6spi;
+ u32 spi;
+
+ spin_lock_bh(&xfrm6_tunnel_spi_lock);
+ x6spi = __xfrm6_tunnel_spi_lookup(net, saddr);
+ if (x6spi) {
+ atomic_inc(&x6spi->refcnt);
+ spi = x6spi->spi;
+ } else
+ spi = __xfrm6_tunnel_alloc_spi(net, saddr);
+ spin_unlock_bh(&xfrm6_tunnel_spi_lock);
+
+ return htonl(spi);
+}
+EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi);
+
+static void x6spi_destroy_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(xfrm6_tunnel_spi_kmem,
+ container_of(head, struct xfrm6_tunnel_spi, rcu_head));
+}
+
+static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
+{
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ struct xfrm6_tunnel_spi *x6spi;
+ struct hlist_node *n;
+
+ spin_lock_bh(&xfrm6_tunnel_spi_lock);
+
+ hlist_for_each_entry_safe(x6spi, n,
+ &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
+ list_byaddr)
+ {
+ if (xfrm6_addr_equal(&x6spi->addr, saddr)) {
+ if (atomic_dec_and_test(&x6spi->refcnt)) {
+ hlist_del_rcu(&x6spi->list_byaddr);
+ hlist_del_rcu(&x6spi->list_byspi);
+ call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu);
+ break;
+ }
+ }
+ }
+ spin_unlock_bh(&xfrm6_tunnel_spi_lock);
+}
+
+static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ skb_push(skb, -skb_network_offset(skb));
+ return 0;
+}
+
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ return skb_network_header(skb)[IP6CB(skb)->nhoff];
+}
+
+static int xfrm6_tunnel_rcv(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ __be32 spi;
+
+ spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr);
+ return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+}
+
+static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ /* xfrm6_tunnel native err handling */
+ switch (type) {
+ case ICMPV6_DEST_UNREACH:
+ switch (code) {
+ case ICMPV6_NOROUTE:
+ case ICMPV6_ADM_PROHIBITED:
+ case ICMPV6_NOT_NEIGHBOUR:
+ case ICMPV6_ADDR_UNREACH:
+ case ICMPV6_PORT_UNREACH:
+ default:
+ break;
+ }
+ break;
+ case ICMPV6_PKT_TOOBIG:
+ break;
+ case ICMPV6_TIME_EXCEED:
+ switch (code) {
+ case ICMPV6_EXC_HOPLIMIT:
+ break;
+ case ICMPV6_EXC_FRAGTIME:
+ default:
+ break;
+ }
+ break;
+ case ICMPV6_PARAMPROB:
+ switch (code) {
+ case ICMPV6_HDR_FIELD: break;
+ case ICMPV6_UNK_NEXTHDR: break;
+ case ICMPV6_UNK_OPTION: break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int xfrm6_tunnel_init_state(struct xfrm_state *x)
+{
+ if (x->props.mode != XFRM_MODE_TUNNEL)
+ return -EINVAL;
+
+ if (x->encap)
+ return -EINVAL;
+
+ x->props.header_len = sizeof(struct ipv6hdr);
+
+ return 0;
+}
+
+static void xfrm6_tunnel_destroy(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+
+ xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr);
+}
+
+static const struct xfrm_type xfrm6_tunnel_type = {
+ .description = "IP6IP6",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_IPV6,
+ .init_state = xfrm6_tunnel_init_state,
+ .destructor = xfrm6_tunnel_destroy,
+ .input = xfrm6_tunnel_input,
+ .output = xfrm6_tunnel_output,
+};
+
+static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
+ .handler = xfrm6_tunnel_rcv,
+ .err_handler = xfrm6_tunnel_err,
+ .priority = 2,
+};
+
+static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
+ .handler = xfrm6_tunnel_rcv,
+ .err_handler = xfrm6_tunnel_err,
+ .priority = 2,
+};
+
+static int __net_init xfrm6_tunnel_net_init(struct net *net)
+{
+ struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+ unsigned int i;
+
+ for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]);
+ for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++)
+ INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]);
+ xfrm6_tn->spi = 0;
+
+ return 0;
+}
+
+static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
+{
+}
+
+static struct pernet_operations xfrm6_tunnel_net_ops = {
+ .init = xfrm6_tunnel_net_init,
+ .exit = xfrm6_tunnel_net_exit,
+ .id = &xfrm6_tunnel_net_id,
+ .size = sizeof(struct xfrm6_tunnel_net),
+};
+
+static int __init xfrm6_tunnel_init(void)
+{
+ int rv;
+
+ xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi",
+ sizeof(struct xfrm6_tunnel_spi),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!xfrm6_tunnel_spi_kmem)
+ return -ENOMEM;
+ rv = register_pernet_subsys(&xfrm6_tunnel_net_ops);
+ if (rv < 0)
+ goto out_pernet;
+ rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6);
+ if (rv < 0)
+ goto out_type;
+ rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6);
+ if (rv < 0)
+ goto out_xfrm6;
+ rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET);
+ if (rv < 0)
+ goto out_xfrm46;
+ return 0;
+
+out_xfrm46:
+ xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
+out_xfrm6:
+ xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
+out_type:
+ unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
+out_pernet:
+ kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
+ return rv;
+}
+
+static void __exit xfrm6_tunnel_fini(void)
+{
+ xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET);
+ xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
+ xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
+ unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
+ kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
+}
+
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
new file mode 100644
index 000000000..e9ad0062f
--- /dev/null
+++ b/net/ipx/Kconfig
@@ -0,0 +1,60 @@
+#
+# IPX configuration
+#
+config IPX
+ tristate "The IPX protocol"
+ select LLC
+ ---help---
+ This is support for the Novell networking protocol, IPX, commonly
+ used for local networks of Windows machines. You need it if you
+ want to access Novell NetWare file or print servers using the Linux
+ Novell client ncpfs (available from
+ <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
+ within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
+ available from <http://www.tldp.org/docs.html#howto>). In order
+ to do the former, you'll also have to say Y to "NCP file system
+ support", below.
+
+ IPX is similar in scope to IP, while SPX, which runs on top of IPX,
+ is similar to TCP.
+
+ To turn your Linux box into a fully featured NetWare file server and
+ IPX router, say Y here and fetch either lwared from
+ <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
+ mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
+ information, read the IPX-HOWTO available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ The IPX driver would enlarge your kernel by about 16 KB. To compile
+ this driver as a module, choose M here: the module will be called ipx.
+ Unless you want to integrate your Linux box with a local Novell
+ network, say N.
+
+config IPX_INTERN
+ bool "IPX: Full internal IPX network"
+ depends on IPX
+ ---help---
+ Every IPX network has an address that identifies it. Sometimes it is
+ useful to give an IPX "network" address to your Linux box as well
+ (for example if your box is acting as a file server for different
+ IPX networks: it will then be accessible from everywhere using the
+ same address). The way this is done is to create a virtual internal
+ "network" inside your box and to assign an IPX address to this
+ network. Say Y here if you want to do this; read the IPX-HOWTO at
+ <http://www.tldp.org/docs.html#howto> for details.
+
+ The full internal IPX network enables you to allocate sockets on
+ different virtual nodes of the internal network. This is done by
+ evaluating the field sipx_node of the socket address given to the
+ bind call. So applications should always initialize the node field
+ to 0 when binding a socket on the primary network. In this case the
+ socket is assigned the default node that has been given to the
+ kernel when the internal network was created. By enabling the full
+ internal IPX network the cross-forwarding of packets targeted at
+ 'special' sockets to sockets listening on the primary network is
+ disabled. This might break existing applications, especially RIP/SAP
+ daemons. A RIP/SAP daemon that works well with the full internal net
+ can be found on <ftp://ftp.gwdg.de/pub/linux/misc/ncpfs/>.
+
+ If you don't know what you are doing, say N.
+
diff --git a/net/ipx/Makefile b/net/ipx/Makefile
new file mode 100644
index 000000000..440fafa9f
--- /dev/null
+++ b/net/ipx/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux IPX layer.
+#
+
+obj-$(CONFIG_IPX) += ipx.o
+
+ipx-y := af_ipx.o ipx_route.o ipx_proc.o pe2.o
+ipx-$(CONFIG_SYSCTL) += sysctl_net_ipx.o
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
new file mode 100644
index 000000000..4ea5d7497
--- /dev/null
+++ b/net/ipx/af_ipx.c
@@ -0,0 +1,2084 @@
+/*
+ * Implements an IPX socket layer.
+ *
+ * This code is derived from work by
+ * Ross Biro : Writing the original IP stack
+ * Fred Van Kempen : Tidying up the TCP/IP
+ *
+ * Many thanks go to Keith Baker, Institute For Industrial Information
+ * Technology Ltd, Swansea University for allowing me to work on this
+ * in my own time even though it was in some ways related to commercial
+ * work I am currently employed to do there.
+ *
+ * All the material in this file is subject to the Gnu license version 2.
+ * Neither Alan Cox nor the Swansea University Computer Society admit
+ * liability nor provide warranty for any of this software. This material
+ * is provided as is and at no charge.
+ *
+ * Portions Copyright (c) 2000-2003 Conectiva, Inc. <acme@conectiva.com.br>
+ * Neither Arnaldo Carvalho de Melo nor Conectiva, Inc. admit liability nor
+ * provide warranty for any of this software. This material is provided
+ * "AS-IS" and at no charge.
+ *
+ * Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com>
+ * Neither Greg Page nor Caldera, Inc. admit liability nor provide
+ * warranty for any of this software. This material is provided
+ * "AS-IS" and at no charge.
+ *
+ * See net/ipx/ChangeLog.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/ipx.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/uio.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/termios.h>
+
+#include <net/ipx.h>
+#include <net/p8022.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/datalink.h>
+#include <net/tcp_states.h>
+#include <net/net_namespace.h>
+
+#include <asm/uaccess.h>
+
+/* Configuration Variables */
+static unsigned char ipxcfg_max_hops = 16;
+static char ipxcfg_auto_select_primary;
+static char ipxcfg_auto_create_interfaces;
+int sysctl_ipx_pprop_broadcasting = 1;
+
+/* Global Variables */
+static struct datalink_proto *p8022_datalink;
+static struct datalink_proto *pEII_datalink;
+static struct datalink_proto *p8023_datalink;
+static struct datalink_proto *pSNAP_datalink;
+
+static const struct proto_ops ipx_dgram_ops;
+
+LIST_HEAD(ipx_interfaces);
+DEFINE_SPINLOCK(ipx_interfaces_lock);
+
+struct ipx_interface *ipx_primary_net;
+struct ipx_interface *ipx_internal_net;
+
+struct ipx_interface *ipx_interfaces_head(void)
+{
+ struct ipx_interface *rc = NULL;
+
+ if (!list_empty(&ipx_interfaces))
+ rc = list_entry(ipx_interfaces.next,
+ struct ipx_interface, node);
+ return rc;
+}
+
+static void ipxcfg_set_auto_select(char val)
+{
+ ipxcfg_auto_select_primary = val;
+ if (val && !ipx_primary_net)
+ ipx_primary_net = ipx_interfaces_head();
+}
+
+static int ipxcfg_get_config_data(struct ipx_config_data __user *arg)
+{
+ struct ipx_config_data vals;
+
+ vals.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces;
+ vals.ipxcfg_auto_select_primary = ipxcfg_auto_select_primary;
+
+ return copy_to_user(arg, &vals, sizeof(vals)) ? -EFAULT : 0;
+}
+
+/*
+ * Note: Sockets may not be removed _during_ an interrupt or inet_bh
+ * handler using this technique. They can be added although we do not
+ * use this facility.
+ */
+
+static void ipx_remove_socket(struct sock *sk)
+{
+ /* Determine interface with which socket is associated */
+ struct ipx_interface *intrfc = ipx_sk(sk)->intrfc;
+
+ if (!intrfc)
+ goto out;
+
+ ipxitf_hold(intrfc);
+ spin_lock_bh(&intrfc->if_sklist_lock);
+ sk_del_node_init(sk);
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ ipxitf_put(intrfc);
+out:
+ return;
+}
+
+static void ipx_destroy_socket(struct sock *sk)
+{
+ ipx_remove_socket(sk);
+ skb_queue_purge(&sk->sk_receive_queue);
+ sk_refcnt_debug_dec(sk);
+}
+
+/*
+ * The following code is used to support IPX Interfaces (IPXITF). An
+ * IPX interface is defined by a physical device and a frame type.
+ */
+
+/* ipxitf_clear_primary_net has to be called with ipx_interfaces_lock held */
+
+static void ipxitf_clear_primary_net(void)
+{
+ ipx_primary_net = NULL;
+ if (ipxcfg_auto_select_primary)
+ ipx_primary_net = ipx_interfaces_head();
+}
+
+static struct ipx_interface *__ipxitf_find_using_phys(struct net_device *dev,
+ __be16 datalink)
+{
+ struct ipx_interface *i;
+
+ list_for_each_entry(i, &ipx_interfaces, node)
+ if (i->if_dev == dev && i->if_dlink_type == datalink)
+ goto out;
+ i = NULL;
+out:
+ return i;
+}
+
+static struct ipx_interface *ipxitf_find_using_phys(struct net_device *dev,
+ __be16 datalink)
+{
+ struct ipx_interface *i;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ i = __ipxitf_find_using_phys(dev, datalink);
+ if (i)
+ ipxitf_hold(i);
+ spin_unlock_bh(&ipx_interfaces_lock);
+ return i;
+}
+
+struct ipx_interface *ipxitf_find_using_net(__be32 net)
+{
+ struct ipx_interface *i;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ if (net) {
+ list_for_each_entry(i, &ipx_interfaces, node)
+ if (i->if_netnum == net)
+ goto hold;
+ i = NULL;
+ goto unlock;
+ }
+
+ i = ipx_primary_net;
+ if (i)
+hold:
+ ipxitf_hold(i);
+unlock:
+ spin_unlock_bh(&ipx_interfaces_lock);
+ return i;
+}
+
+/* Sockets are bound to a particular IPX interface. */
+static void ipxitf_insert_socket(struct ipx_interface *intrfc, struct sock *sk)
+{
+ ipxitf_hold(intrfc);
+ spin_lock_bh(&intrfc->if_sklist_lock);
+ ipx_sk(sk)->intrfc = intrfc;
+ sk_add_node(sk, &intrfc->if_sklist);
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ ipxitf_put(intrfc);
+}
+
+/* caller must hold intrfc->if_sklist_lock */
+static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc,
+ __be16 port)
+{
+ struct sock *s;
+
+ sk_for_each(s, &intrfc->if_sklist)
+ if (ipx_sk(s)->port == port)
+ goto found;
+ s = NULL;
+found:
+ return s;
+}
+
+/* caller must hold a reference to intrfc */
+static struct sock *ipxitf_find_socket(struct ipx_interface *intrfc,
+ __be16 port)
+{
+ struct sock *s;
+
+ spin_lock_bh(&intrfc->if_sklist_lock);
+ s = __ipxitf_find_socket(intrfc, port);
+ if (s)
+ sock_hold(s);
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+
+ return s;
+}
+
+#ifdef CONFIG_IPX_INTERN
+static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc,
+ unsigned char *ipx_node,
+ __be16 port)
+{
+ struct sock *s;
+
+ ipxitf_hold(intrfc);
+ spin_lock_bh(&intrfc->if_sklist_lock);
+
+ sk_for_each(s, &intrfc->if_sklist) {
+ struct ipx_sock *ipxs = ipx_sk(s);
+
+ if (ipxs->port == port &&
+ !memcmp(ipx_node, ipxs->node, IPX_NODE_LEN))
+ goto found;
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ ipxitf_put(intrfc);
+ return s;
+}
+#endif
+
+static void __ipxitf_down(struct ipx_interface *intrfc)
+{
+ struct sock *s;
+ struct hlist_node *t;
+
+ /* Delete all routes associated with this interface */
+ ipxrtr_del_routes(intrfc);
+
+ spin_lock_bh(&intrfc->if_sklist_lock);
+ /* error sockets */
+ sk_for_each_safe(s, t, &intrfc->if_sklist) {
+ struct ipx_sock *ipxs = ipx_sk(s);
+
+ s->sk_err = ENOLINK;
+ s->sk_error_report(s);
+ ipxs->intrfc = NULL;
+ ipxs->port = 0;
+ sock_set_flag(s, SOCK_ZAPPED); /* Indicates it is no longer bound */
+ sk_del_node_init(s);
+ }
+ INIT_HLIST_HEAD(&intrfc->if_sklist);
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+
+ /* remove this interface from list */
+ list_del(&intrfc->node);
+
+ /* remove this interface from *special* networks */
+ if (intrfc == ipx_primary_net)
+ ipxitf_clear_primary_net();
+ if (intrfc == ipx_internal_net)
+ ipx_internal_net = NULL;
+
+ if (intrfc->if_dev)
+ dev_put(intrfc->if_dev);
+ kfree(intrfc);
+}
+
+void ipxitf_down(struct ipx_interface *intrfc)
+{
+ spin_lock_bh(&ipx_interfaces_lock);
+ __ipxitf_down(intrfc);
+ spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+static void __ipxitf_put(struct ipx_interface *intrfc)
+{
+ if (atomic_dec_and_test(&intrfc->refcnt))
+ __ipxitf_down(intrfc);
+}
+
+static int ipxitf_device_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ipx_interface *i, *tmp;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_DOWN && event != NETDEV_UP)
+ goto out;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ list_for_each_entry_safe(i, tmp, &ipx_interfaces, node)
+ if (i->if_dev == dev) {
+ if (event == NETDEV_UP)
+ ipxitf_hold(i);
+ else
+ __ipxitf_put(i);
+ }
+ spin_unlock_bh(&ipx_interfaces_lock);
+out:
+ return NOTIFY_DONE;
+}
+
+
+static __exit void ipxitf_cleanup(void)
+{
+ struct ipx_interface *i, *tmp;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ list_for_each_entry_safe(i, tmp, &ipx_interfaces, node)
+ __ipxitf_put(i);
+ spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+static void ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb)
+{
+ if (sock_queue_rcv_skb(sock, skb) < 0)
+ kfree_skb(skb);
+}
+
+/*
+ * On input skb->sk is NULL. Nobody is charged for the memory.
+ */
+
+/* caller must hold a reference to intrfc */
+
+#ifdef CONFIG_IPX_INTERN
+static int ipxitf_demux_socket(struct ipx_interface *intrfc,
+ struct sk_buff *skb, int copy)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node,
+ IPX_NODE_LEN);
+ struct sock *s;
+ int rc;
+
+ spin_lock_bh(&intrfc->if_sklist_lock);
+
+ sk_for_each(s, &intrfc->if_sklist) {
+ struct ipx_sock *ipxs = ipx_sk(s);
+
+ if (ipxs->port == ipx->ipx_dest.sock &&
+ (is_broadcast || !memcmp(ipx->ipx_dest.node,
+ ipxs->node, IPX_NODE_LEN))) {
+ /* We found a socket to which to send */
+ struct sk_buff *skb1;
+
+ if (copy) {
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+ rc = -ENOMEM;
+ if (!skb1)
+ goto out;
+ } else {
+ skb1 = skb;
+ copy = 1; /* skb may only be used once */
+ }
+ ipxitf_def_skb_handler(s, skb1);
+
+ /* On an external interface, one socket can listen */
+ if (intrfc != ipx_internal_net)
+ break;
+ }
+ }
+
+ /* skb was solely for us, and we did not make a copy, so free it. */
+ if (!copy)
+ kfree_skb(skb);
+
+ rc = 0;
+out:
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ return rc;
+}
+#else
+static struct sock *ncp_connection_hack(struct ipx_interface *intrfc,
+ struct ipxhdr *ipx)
+{
+ /* The packet's target is a NCP connection handler. We want to hand it
+ * to the correct socket directly within the kernel, so that the
+ * mars_nwe packet distribution process does not have to do it. Here we
+ * only care about NCP and BURST packets.
+ *
+ * You might call this a hack, but believe me, you do not want a
+ * complete NCP layer in the kernel, and this is VERY fast as well. */
+ struct sock *sk = NULL;
+ int connection = 0;
+ u8 *ncphdr = (u8 *)(ipx + 1);
+
+ if (*ncphdr == 0x22 && *(ncphdr + 1) == 0x22) /* NCP request */
+ connection = (((int) *(ncphdr + 5)) << 8) | (int) *(ncphdr + 3);
+ else if (*ncphdr == 0x77 && *(ncphdr + 1) == 0x77) /* BURST packet */
+ connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8);
+
+ if (connection) {
+ /* Now we have to look for a special NCP connection handling
+ * socket. Only these sockets have ipx_ncp_conn != 0, set by
+ * SIOCIPXNCPCONN. */
+ spin_lock_bh(&intrfc->if_sklist_lock);
+ sk_for_each(sk, &intrfc->if_sklist)
+ if (ipx_sk(sk)->ipx_ncp_conn == connection) {
+ sock_hold(sk);
+ goto found;
+ }
+ sk = NULL;
+ found:
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ }
+ return sk;
+}
+
+static int ipxitf_demux_socket(struct ipx_interface *intrfc,
+ struct sk_buff *skb, int copy)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ struct sock *sock1 = NULL, *sock2 = NULL;
+ struct sk_buff *skb1 = NULL, *skb2 = NULL;
+ int rc;
+
+ if (intrfc == ipx_primary_net && ntohs(ipx->ipx_dest.sock) == 0x451)
+ sock1 = ncp_connection_hack(intrfc, ipx);
+ if (!sock1)
+ /* No special socket found, forward the packet the normal way */
+ sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock);
+
+ /*
+ * We need to check if there is a primary net and if
+ * this is addressed to one of the *SPECIAL* sockets because
+ * these need to be propagated to the primary net.
+ * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and
+ * 0x456(Diagnostic).
+ */
+
+ if (ipx_primary_net && intrfc != ipx_primary_net) {
+ const int dsock = ntohs(ipx->ipx_dest.sock);
+
+ if (dsock == 0x452 || dsock == 0x453 || dsock == 0x456)
+ /* The appropriate thing to do here is to dup the
+ * packet and route to the primary net interface via
+ * ipxitf_send; however, we'll cheat and just demux it
+ * here. */
+ sock2 = ipxitf_find_socket(ipx_primary_net,
+ ipx->ipx_dest.sock);
+ }
+
+ /*
+ * If there is nothing to do return. The kfree will cancel any charging.
+ */
+ rc = 0;
+ if (!sock1 && !sock2) {
+ if (!copy)
+ kfree_skb(skb);
+ goto out;
+ }
+
+ /*
+ * This next segment of code is a little awkward, but it sets it up
+ * so that the appropriate number of copies of the SKB are made and
+ * that skb1 and skb2 point to it (them) so that it (they) can be
+ * demuxed to sock1 and/or sock2. If we are unable to make enough
+ * copies, we do as much as is possible.
+ */
+
+ if (copy)
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+ else
+ skb1 = skb;
+
+ rc = -ENOMEM;
+ if (!skb1)
+ goto out_put;
+
+ /* Do we need 2 SKBs? */
+ if (sock1 && sock2)
+ skb2 = skb_clone(skb1, GFP_ATOMIC);
+ else
+ skb2 = skb1;
+
+ if (sock1)
+ ipxitf_def_skb_handler(sock1, skb1);
+
+ if (!skb2)
+ goto out_put;
+
+ if (sock2)
+ ipxitf_def_skb_handler(sock2, skb2);
+
+ rc = 0;
+out_put:
+ if (sock1)
+ sock_put(sock1);
+ if (sock2)
+ sock_put(sock2);
+out:
+ return rc;
+}
+#endif /* CONFIG_IPX_INTERN */
+
+static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc,
+ struct sk_buff *skb)
+{
+ struct sk_buff *skb2;
+ int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head;
+ int out_offset = intrfc->if_ipx_offset;
+ int len;
+
+ /* Hopefully, most cases */
+ if (in_offset >= out_offset)
+ return skb;
+
+ /* Need new SKB */
+ len = skb->len + out_offset;
+ skb2 = alloc_skb(len, GFP_ATOMIC);
+ if (skb2) {
+ skb_reserve(skb2, out_offset);
+ skb_reset_network_header(skb2);
+ skb_reset_transport_header(skb2);
+ skb_put(skb2, skb->len);
+ memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len);
+ memcpy(skb2->cb, skb->cb, sizeof(skb->cb));
+ }
+ kfree_skb(skb);
+ return skb2;
+}
+
+/* caller must hold a reference to intrfc and the skb has to be unshared */
+int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ struct net_device *dev = intrfc->if_dev;
+ struct datalink_proto *dl = intrfc->if_dlink;
+ char dest_node[IPX_NODE_LEN];
+ int send_to_wire = 1;
+ int addr_len;
+
+ ipx->ipx_tctrl = IPX_SKB_CB(skb)->ipx_tctrl;
+ ipx->ipx_dest.net = IPX_SKB_CB(skb)->ipx_dest_net;
+ ipx->ipx_source.net = IPX_SKB_CB(skb)->ipx_source_net;
+
+ /* see if we need to include the netnum in the route list */
+ if (IPX_SKB_CB(skb)->last_hop.index >= 0) {
+ __be32 *last_hop = (__be32 *)(((u8 *) skb->data) +
+ sizeof(struct ipxhdr) +
+ IPX_SKB_CB(skb)->last_hop.index *
+ sizeof(__be32));
+ *last_hop = IPX_SKB_CB(skb)->last_hop.netnum;
+ IPX_SKB_CB(skb)->last_hop.index = -1;
+ }
+
+ /*
+ * We need to know how many skbuffs it will take to send out this
+ * packet to avoid unnecessary copies.
+ */
+
+ if (!dl || !dev || dev->flags & IFF_LOOPBACK)
+ send_to_wire = 0; /* No non looped */
+
+ /*
+ * See if this should be demuxed to sockets on this interface
+ *
+ * We want to ensure the original was eaten or that we only use
+ * up clones.
+ */
+
+ if (ipx->ipx_dest.net == intrfc->if_netnum) {
+ /*
+ * To our own node, loop and free the original.
+ * The internal net will receive on all node address.
+ */
+ if (intrfc == ipx_internal_net ||
+ !memcmp(intrfc->if_node, node, IPX_NODE_LEN)) {
+ /* Don't charge sender */
+ skb_orphan(skb);
+
+ /* Will charge receiver */
+ return ipxitf_demux_socket(intrfc, skb, 0);
+ }
+
+ /* Broadcast, loop and possibly keep to send on. */
+ if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) {
+ if (!send_to_wire)
+ skb_orphan(skb);
+ ipxitf_demux_socket(intrfc, skb, send_to_wire);
+ if (!send_to_wire)
+ goto out;
+ }
+ }
+
+ /*
+ * If the originating net is not equal to our net; this is routed
+ * We are still charging the sender. Which is right - the driver
+ * free will handle this fairly.
+ */
+ if (ipx->ipx_source.net != intrfc->if_netnum) {
+ /*
+ * Unshare the buffer before modifying the count in
+ * case it's a flood or tcpdump
+ */
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ if (++ipx->ipx_tctrl > ipxcfg_max_hops)
+ send_to_wire = 0;
+ }
+
+ if (!send_to_wire) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ /* Determine the appropriate hardware address */
+ addr_len = dev->addr_len;
+ if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN))
+ memcpy(dest_node, dev->broadcast, addr_len);
+ else
+ memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len);
+
+ /* Make any compensation for differing physical/data link size */
+ skb = ipxitf_adjust_skbuff(intrfc, skb);
+ if (!skb)
+ goto out;
+
+ /* set up data link and physical headers */
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_IPX);
+
+ /* Send it out */
+ dl->request(dl, skb, dest_node);
+out:
+ return 0;
+}
+
+static int ipxitf_add_local_route(struct ipx_interface *intrfc)
+{
+ return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL);
+}
+
+static void ipxitf_discover_netnum(struct ipx_interface *intrfc,
+ struct sk_buff *skb);
+static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb);
+
+static int ipxitf_rcv(struct ipx_interface *intrfc, struct sk_buff *skb)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ int rc = 0;
+
+ ipxitf_hold(intrfc);
+
+ /* See if we should update our network number */
+ if (!intrfc->if_netnum) /* net number of intrfc not known yet */
+ ipxitf_discover_netnum(intrfc, skb);
+
+ IPX_SKB_CB(skb)->last_hop.index = -1;
+ if (ipx->ipx_type == IPX_TYPE_PPROP) {
+ rc = ipxitf_pprop(intrfc, skb);
+ if (rc)
+ goto out_free_skb;
+ }
+
+ /* local processing follows */
+ if (!IPX_SKB_CB(skb)->ipx_dest_net)
+ IPX_SKB_CB(skb)->ipx_dest_net = intrfc->if_netnum;
+ if (!IPX_SKB_CB(skb)->ipx_source_net)
+ IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum;
+
+ /* it doesn't make sense to route a pprop packet, there's no meaning
+ * in the ipx_dest_net for such packets */
+ if (ipx->ipx_type != IPX_TYPE_PPROP &&
+ intrfc->if_netnum != IPX_SKB_CB(skb)->ipx_dest_net) {
+ /* We only route point-to-point packets. */
+ if (skb->pkt_type == PACKET_HOST) {
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (skb)
+ rc = ipxrtr_route_skb(skb);
+ goto out_intrfc;
+ }
+
+ goto out_free_skb;
+ }
+
+ /* see if we should keep it */
+ if (!memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) ||
+ !memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN)) {
+ rc = ipxitf_demux_socket(intrfc, skb, 0);
+ goto out_intrfc;
+ }
+
+ /* we couldn't pawn it off so unload it */
+out_free_skb:
+ kfree_skb(skb);
+out_intrfc:
+ ipxitf_put(intrfc);
+ return rc;
+}
+
+static void ipxitf_discover_netnum(struct ipx_interface *intrfc,
+ struct sk_buff *skb)
+{
+ const struct ipx_cb *cb = IPX_SKB_CB(skb);
+
+ /* see if this is an intra packet: source_net == dest_net */
+ if (cb->ipx_source_net == cb->ipx_dest_net && cb->ipx_source_net) {
+ struct ipx_interface *i =
+ ipxitf_find_using_net(cb->ipx_source_net);
+ /* NB: NetWare servers lie about their hop count so we
+ * dropped the test based on it. This is the best way
+ * to determine this is a 0 hop count packet. */
+ if (!i) {
+ intrfc->if_netnum = cb->ipx_source_net;
+ ipxitf_add_local_route(intrfc);
+ } else {
+ printk(KERN_WARNING "IPX: Network number collision "
+ "%lx\n %s %s and %s %s\n",
+ (unsigned long) ntohl(cb->ipx_source_net),
+ ipx_device_name(i),
+ ipx_frame_name(i->if_dlink_type),
+ ipx_device_name(intrfc),
+ ipx_frame_name(intrfc->if_dlink_type));
+ ipxitf_put(i);
+ }
+ }
+}
+
+/**
+ * ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for
+ * NetBIOS broadcasts
+ * @intrfc: IPX interface receiving this packet
+ * @skb: Received packet
+ *
+ * Checks if packet is valid: if its more than %IPX_MAX_PPROP_HOPS hops or if it
+ * is smaller than a IPX header + the room for %IPX_MAX_PPROP_HOPS hops we drop
+ * it, not even processing it locally, if it has exact %IPX_MAX_PPROP_HOPS we
+ * don't broadcast it, but process it locally. See chapter 5 of Novell's "IPX
+ * RIP and SAP Router Specification", Part Number 107-000029-001.
+ *
+ * If it is valid, check if we have pprop broadcasting enabled by the user,
+ * if not, just return zero for local processing.
+ *
+ * If it is enabled check the packet and don't broadcast it if we have already
+ * seen this packet.
+ *
+ * Broadcast: send it to the interfaces that aren't on the packet visited nets
+ * array, just after the IPX header.
+ *
+ * Returns -EINVAL for invalid packets, so that the calling function drops
+ * the packet without local processing. 0 if packet is to be locally processed.
+ */
+static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ int i, rc = -EINVAL;
+ struct ipx_interface *ifcs;
+ char *c;
+ __be32 *l;
+
+ /* Illegal packet - too many hops or too short */
+ /* We decide to throw it away: no broadcasting, no local processing.
+ * NetBIOS unaware implementations route them as normal packets -
+ * tctrl <= 15, any data payload... */
+ if (IPX_SKB_CB(skb)->ipx_tctrl > IPX_MAX_PPROP_HOPS ||
+ ntohs(ipx->ipx_pktsize) < sizeof(struct ipxhdr) +
+ IPX_MAX_PPROP_HOPS * sizeof(u32))
+ goto out;
+ /* are we broadcasting this damn thing? */
+ rc = 0;
+ if (!sysctl_ipx_pprop_broadcasting)
+ goto out;
+ /* We do broadcast packet on the IPX_MAX_PPROP_HOPS hop, but we
+ * process it locally. All previous hops broadcasted it, and process it
+ * locally. */
+ if (IPX_SKB_CB(skb)->ipx_tctrl == IPX_MAX_PPROP_HOPS)
+ goto out;
+
+ c = ((u8 *) ipx) + sizeof(struct ipxhdr);
+ l = (__be32 *) c;
+
+ /* Don't broadcast packet if already seen this net */
+ for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++)
+ if (*l++ == intrfc->if_netnum)
+ goto out;
+
+ /* < IPX_MAX_PPROP_HOPS hops && input interface not in list. Save the
+ * position where we will insert recvd netnum into list, later on,
+ * in ipxitf_send */
+ IPX_SKB_CB(skb)->last_hop.index = i;
+ IPX_SKB_CB(skb)->last_hop.netnum = intrfc->if_netnum;
+ /* xmit on all other interfaces... */
+ spin_lock_bh(&ipx_interfaces_lock);
+ list_for_each_entry(ifcs, &ipx_interfaces, node) {
+ /* Except unconfigured interfaces */
+ if (!ifcs->if_netnum)
+ continue;
+
+ /* That aren't in the list */
+ if (ifcs == intrfc)
+ continue;
+ l = (__be32 *) c;
+ /* don't consider the last entry in the packet list,
+ * it is our netnum, and it is not there yet */
+ for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++)
+ if (ifcs->if_netnum == *l++)
+ break;
+ if (i == IPX_SKB_CB(skb)->ipx_tctrl) {
+ struct sk_buff *s = skb_copy(skb, GFP_ATOMIC);
+
+ if (s) {
+ IPX_SKB_CB(s)->ipx_dest_net = ifcs->if_netnum;
+ ipxrtr_route_skb(s);
+ }
+ }
+ }
+ spin_unlock_bh(&ipx_interfaces_lock);
+out:
+ return rc;
+}
+
+static void ipxitf_insert(struct ipx_interface *intrfc)
+{
+ spin_lock_bh(&ipx_interfaces_lock);
+ list_add_tail(&intrfc->node, &ipx_interfaces);
+ spin_unlock_bh(&ipx_interfaces_lock);
+
+ if (ipxcfg_auto_select_primary && !ipx_primary_net)
+ ipx_primary_net = intrfc;
+}
+
+static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __be32 netnum,
+ __be16 dlink_type,
+ struct datalink_proto *dlink,
+ unsigned char internal,
+ int ipx_offset)
+{
+ struct ipx_interface *intrfc = kmalloc(sizeof(*intrfc), GFP_ATOMIC);
+
+ if (intrfc) {
+ intrfc->if_dev = dev;
+ intrfc->if_netnum = netnum;
+ intrfc->if_dlink_type = dlink_type;
+ intrfc->if_dlink = dlink;
+ intrfc->if_internal = internal;
+ intrfc->if_ipx_offset = ipx_offset;
+ intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET;
+ INIT_HLIST_HEAD(&intrfc->if_sklist);
+ atomic_set(&intrfc->refcnt, 1);
+ spin_lock_init(&intrfc->if_sklist_lock);
+ }
+
+ return intrfc;
+}
+
+static int ipxitf_create_internal(struct ipx_interface_definition *idef)
+{
+ struct ipx_interface *intrfc;
+ int rc = -EEXIST;
+
+ /* Only one primary network allowed */
+ if (ipx_primary_net)
+ goto out;
+
+ /* Must have a valid network number */
+ rc = -EADDRNOTAVAIL;
+ if (!idef->ipx_network)
+ goto out;
+ intrfc = ipxitf_find_using_net(idef->ipx_network);
+ rc = -EADDRINUSE;
+ if (intrfc) {
+ ipxitf_put(intrfc);
+ goto out;
+ }
+ intrfc = ipxitf_alloc(NULL, idef->ipx_network, 0, NULL, 1, 0);
+ rc = -EAGAIN;
+ if (!intrfc)
+ goto out;
+ memcpy((char *)&(intrfc->if_node), idef->ipx_node, IPX_NODE_LEN);
+ ipx_internal_net = ipx_primary_net = intrfc;
+ ipxitf_hold(intrfc);
+ ipxitf_insert(intrfc);
+
+ rc = ipxitf_add_local_route(intrfc);
+ ipxitf_put(intrfc);
+out:
+ return rc;
+}
+
+static __be16 ipx_map_frame_type(unsigned char type)
+{
+ __be16 rc = 0;
+
+ switch (type) {
+ case IPX_FRAME_ETHERII: rc = htons(ETH_P_IPX); break;
+ case IPX_FRAME_8022: rc = htons(ETH_P_802_2); break;
+ case IPX_FRAME_SNAP: rc = htons(ETH_P_SNAP); break;
+ case IPX_FRAME_8023: rc = htons(ETH_P_802_3); break;
+ }
+
+ return rc;
+}
+
+static int ipxitf_create(struct ipx_interface_definition *idef)
+{
+ struct net_device *dev;
+ __be16 dlink_type = 0;
+ struct datalink_proto *datalink = NULL;
+ struct ipx_interface *intrfc;
+ int rc;
+
+ if (idef->ipx_special == IPX_INTERNAL) {
+ rc = ipxitf_create_internal(idef);
+ goto out;
+ }
+
+ rc = -EEXIST;
+ if (idef->ipx_special == IPX_PRIMARY && ipx_primary_net)
+ goto out;
+
+ intrfc = ipxitf_find_using_net(idef->ipx_network);
+ rc = -EADDRINUSE;
+ if (idef->ipx_network && intrfc) {
+ ipxitf_put(intrfc);
+ goto out;
+ }
+
+ if (intrfc)
+ ipxitf_put(intrfc);
+
+ dev = dev_get_by_name(&init_net, idef->ipx_device);
+ rc = -ENODEV;
+ if (!dev)
+ goto out;
+
+ switch (idef->ipx_dlink_type) {
+ case IPX_FRAME_8022:
+ dlink_type = htons(ETH_P_802_2);
+ datalink = p8022_datalink;
+ break;
+ case IPX_FRAME_ETHERII:
+ if (dev->type != ARPHRD_IEEE802) {
+ dlink_type = htons(ETH_P_IPX);
+ datalink = pEII_datalink;
+ break;
+ }
+ /* fall through */
+ case IPX_FRAME_SNAP:
+ dlink_type = htons(ETH_P_SNAP);
+ datalink = pSNAP_datalink;
+ break;
+ case IPX_FRAME_8023:
+ dlink_type = htons(ETH_P_802_3);
+ datalink = p8023_datalink;
+ break;
+ case IPX_FRAME_NONE:
+ default:
+ rc = -EPROTONOSUPPORT;
+ goto out_dev;
+ }
+
+ rc = -ENETDOWN;
+ if (!(dev->flags & IFF_UP))
+ goto out_dev;
+
+ /* Check addresses are suitable */
+ rc = -EINVAL;
+ if (dev->addr_len > IPX_NODE_LEN)
+ goto out_dev;
+
+ intrfc = ipxitf_find_using_phys(dev, dlink_type);
+ if (!intrfc) {
+ /* Ok now create */
+ intrfc = ipxitf_alloc(dev, idef->ipx_network, dlink_type,
+ datalink, 0, dev->hard_header_len +
+ datalink->header_length);
+ rc = -EAGAIN;
+ if (!intrfc)
+ goto out_dev;
+ /* Setup primary if necessary */
+ if (idef->ipx_special == IPX_PRIMARY)
+ ipx_primary_net = intrfc;
+ if (!memcmp(idef->ipx_node, "\000\000\000\000\000\000",
+ IPX_NODE_LEN)) {
+ memset(intrfc->if_node, 0, IPX_NODE_LEN);
+ memcpy(intrfc->if_node + IPX_NODE_LEN - dev->addr_len,
+ dev->dev_addr, dev->addr_len);
+ } else
+ memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN);
+ ipxitf_hold(intrfc);
+ ipxitf_insert(intrfc);
+ }
+
+
+ /* If the network number is known, add a route */
+ rc = 0;
+ if (!intrfc->if_netnum)
+ goto out_intrfc;
+
+ rc = ipxitf_add_local_route(intrfc);
+out_intrfc:
+ ipxitf_put(intrfc);
+ goto out;
+out_dev:
+ dev_put(dev);
+out:
+ return rc;
+}
+
+static int ipxitf_delete(struct ipx_interface_definition *idef)
+{
+ struct net_device *dev = NULL;
+ __be16 dlink_type = 0;
+ struct ipx_interface *intrfc;
+ int rc = 0;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ if (idef->ipx_special == IPX_INTERNAL) {
+ if (ipx_internal_net) {
+ __ipxitf_put(ipx_internal_net);
+ goto out;
+ }
+ rc = -ENOENT;
+ goto out;
+ }
+
+ dlink_type = ipx_map_frame_type(idef->ipx_dlink_type);
+ rc = -EPROTONOSUPPORT;
+ if (!dlink_type)
+ goto out;
+
+ dev = __dev_get_by_name(&init_net, idef->ipx_device);
+ rc = -ENODEV;
+ if (!dev)
+ goto out;
+
+ intrfc = __ipxitf_find_using_phys(dev, dlink_type);
+ rc = -EINVAL;
+ if (!intrfc)
+ goto out;
+ __ipxitf_put(intrfc);
+
+ rc = 0;
+out:
+ spin_unlock_bh(&ipx_interfaces_lock);
+ return rc;
+}
+
+static struct ipx_interface *ipxitf_auto_create(struct net_device *dev,
+ __be16 dlink_type)
+{
+ struct ipx_interface *intrfc = NULL;
+ struct datalink_proto *datalink;
+
+ if (!dev)
+ goto out;
+
+ /* Check addresses are suitable */
+ if (dev->addr_len > IPX_NODE_LEN)
+ goto out;
+
+ switch (ntohs(dlink_type)) {
+ case ETH_P_IPX: datalink = pEII_datalink; break;
+ case ETH_P_802_2: datalink = p8022_datalink; break;
+ case ETH_P_SNAP: datalink = pSNAP_datalink; break;
+ case ETH_P_802_3: datalink = p8023_datalink; break;
+ default: goto out;
+ }
+
+ intrfc = ipxitf_alloc(dev, 0, dlink_type, datalink, 0,
+ dev->hard_header_len + datalink->header_length);
+
+ if (intrfc) {
+ memset(intrfc->if_node, 0, IPX_NODE_LEN);
+ memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]),
+ dev->dev_addr, dev->addr_len);
+ spin_lock_init(&intrfc->if_sklist_lock);
+ atomic_set(&intrfc->refcnt, 1);
+ ipxitf_insert(intrfc);
+ dev_hold(dev);
+ }
+
+out:
+ return intrfc;
+}
+
+static int ipxitf_ioctl(unsigned int cmd, void __user *arg)
+{
+ int rc = -EINVAL;
+ struct ifreq ifr;
+ int val;
+
+ switch (cmd) {
+ case SIOCSIFADDR: {
+ struct sockaddr_ipx *sipx;
+ struct ipx_interface_definition f;
+
+ rc = -EFAULT;
+ if (copy_from_user(&ifr, arg, sizeof(ifr)))
+ break;
+ sipx = (struct sockaddr_ipx *)&ifr.ifr_addr;
+ rc = -EINVAL;
+ if (sipx->sipx_family != AF_IPX)
+ break;
+ f.ipx_network = sipx->sipx_network;
+ memcpy(f.ipx_device, ifr.ifr_name,
+ sizeof(f.ipx_device));
+ memcpy(f.ipx_node, sipx->sipx_node, IPX_NODE_LEN);
+ f.ipx_dlink_type = sipx->sipx_type;
+ f.ipx_special = sipx->sipx_special;
+
+ if (sipx->sipx_action == IPX_DLTITF)
+ rc = ipxitf_delete(&f);
+ else
+ rc = ipxitf_create(&f);
+ break;
+ }
+ case SIOCGIFADDR: {
+ struct sockaddr_ipx *sipx;
+ struct ipx_interface *ipxif;
+ struct net_device *dev;
+
+ rc = -EFAULT;
+ if (copy_from_user(&ifr, arg, sizeof(ifr)))
+ break;
+ sipx = (struct sockaddr_ipx *)&ifr.ifr_addr;
+ dev = __dev_get_by_name(&init_net, ifr.ifr_name);
+ rc = -ENODEV;
+ if (!dev)
+ break;
+ ipxif = ipxitf_find_using_phys(dev,
+ ipx_map_frame_type(sipx->sipx_type));
+ rc = -EADDRNOTAVAIL;
+ if (!ipxif)
+ break;
+
+ sipx->sipx_family = AF_IPX;
+ sipx->sipx_network = ipxif->if_netnum;
+ memcpy(sipx->sipx_node, ipxif->if_node,
+ sizeof(sipx->sipx_node));
+ rc = -EFAULT;
+ if (copy_to_user(arg, &ifr, sizeof(ifr)))
+ break;
+ ipxitf_put(ipxif);
+ rc = 0;
+ break;
+ }
+ case SIOCAIPXITFCRT:
+ rc = -EFAULT;
+ if (get_user(val, (unsigned char __user *) arg))
+ break;
+ rc = 0;
+ ipxcfg_auto_create_interfaces = val;
+ break;
+ case SIOCAIPXPRISLT:
+ rc = -EFAULT;
+ if (get_user(val, (unsigned char __user *) arg))
+ break;
+ rc = 0;
+ ipxcfg_set_auto_select(val);
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Checksum routine for IPX
+ */
+
+/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */
+/* This functions should *not* mess with packet contents */
+
+__be16 ipx_cksum(struct ipxhdr *packet, int length)
+{
+ /*
+ * NOTE: sum is a net byte order quantity, which optimizes the
+ * loop. This only works on big and little endian machines. (I
+ * don't know of a machine that isn't.)
+ */
+ /* handle the first 3 words separately; checksum should be skipped
+ * and ipx_tctrl masked out */
+ __u16 *p = (__u16 *)packet;
+ __u32 sum = p[1] + (p[2] & (__force u16)htons(0x00ff));
+ __u32 i = (length >> 1) - 3; /* Number of remaining complete words */
+
+ /* Loop through them */
+ p += 3;
+ while (i--)
+ sum += *p++;
+
+ /* Add on the last part word if it exists */
+ if (packet->ipx_pktsize & htons(1))
+ sum += (__force u16)htons(0xff00) & *p;
+
+ /* Do final fixup */
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ /* It's a pity there's no concept of carry in C */
+ if (sum >= 0x10000)
+ sum++;
+
+ /*
+ * Leave 0 alone; we don't want 0xffff here. Note that we can't get
+ * here with 0x10000, so this check is the same as ((__u16)sum)
+ */
+ if (sum)
+ sum = ~sum;
+
+ return (__force __be16)sum;
+}
+
+const char *ipx_frame_name(__be16 frame)
+{
+ char* rc = "None";
+
+ switch (ntohs(frame)) {
+ case ETH_P_IPX: rc = "EtherII"; break;
+ case ETH_P_802_2: rc = "802.2"; break;
+ case ETH_P_SNAP: rc = "SNAP"; break;
+ case ETH_P_802_3: rc = "802.3"; break;
+ }
+
+ return rc;
+}
+
+const char *ipx_device_name(struct ipx_interface *intrfc)
+{
+ return intrfc->if_internal ? "Internal" :
+ intrfc->if_dev ? intrfc->if_dev->name : "Unknown";
+}
+
+/* Handling for system calls applied via the various interfaces to an IPX
+ * socket object. */
+
+static int ipx_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ int opt;
+ int rc = -EINVAL;
+
+ lock_sock(sk);
+ if (optlen != sizeof(int))
+ goto out;
+
+ rc = -EFAULT;
+ if (get_user(opt, (unsigned int __user *)optval))
+ goto out;
+
+ rc = -ENOPROTOOPT;
+ if (!(level == SOL_IPX && optname == IPX_TYPE))
+ goto out;
+
+ ipx_sk(sk)->type = opt;
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int ipx_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int val = 0;
+ int len;
+ int rc = -ENOPROTOOPT;
+
+ lock_sock(sk);
+ if (!(level == SOL_IPX && optname == IPX_TYPE))
+ goto out;
+
+ val = ipx_sk(sk)->type;
+
+ rc = -EFAULT;
+ if (get_user(len, optlen))
+ goto out;
+
+ len = min_t(unsigned int, len, sizeof(int));
+ rc = -EINVAL;
+ if(len < 0)
+ goto out;
+
+ rc = -EFAULT;
+ if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+ goto out;
+
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static struct proto ipx_proto = {
+ .name = "IPX",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct ipx_sock),
+};
+
+static int ipx_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ int rc = -ESOCKTNOSUPPORT;
+ struct sock *sk;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ /*
+ * SPX support is not anymore in the kernel sources. If you want to
+ * ressurrect it, completing it and making it understand shared skbs,
+ * be fully multithreaded, etc, grab the sources in an early 2.5 kernel
+ * tree.
+ */
+ if (sock->type != SOCK_DGRAM)
+ goto out;
+
+ rc = -ENOMEM;
+ sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto);
+ if (!sk)
+ goto out;
+
+ sk_refcnt_debug_inc(sk);
+ sock_init_data(sock, sk);
+ sk->sk_no_check_tx = 1; /* Checksum off by default */
+ sock->ops = &ipx_dgram_ops;
+ rc = 0;
+out:
+ return rc;
+}
+
+static int ipx_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ goto out;
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+
+ sock_set_flag(sk, SOCK_DEAD);
+ sock->sk = NULL;
+ sk_refcnt_debug_release(sk);
+ ipx_destroy_socket(sk);
+ release_sock(sk);
+ sock_put(sk);
+out:
+ return 0;
+}
+
+/* caller must hold a reference to intrfc */
+
+static __be16 ipx_first_free_socketnum(struct ipx_interface *intrfc)
+{
+ unsigned short socketNum = intrfc->if_sknum;
+
+ spin_lock_bh(&intrfc->if_sklist_lock);
+
+ if (socketNum < IPX_MIN_EPHEMERAL_SOCKET)
+ socketNum = IPX_MIN_EPHEMERAL_SOCKET;
+
+ while (__ipxitf_find_socket(intrfc, htons(socketNum)))
+ if (socketNum > IPX_MAX_EPHEMERAL_SOCKET)
+ socketNum = IPX_MIN_EPHEMERAL_SOCKET;
+ else
+ socketNum++;
+
+ spin_unlock_bh(&intrfc->if_sklist_lock);
+ intrfc->if_sknum = socketNum;
+
+ return htons(socketNum);
+}
+
+static int __ipx_bind(struct socket *sock,
+ struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ struct ipx_interface *intrfc;
+ struct sockaddr_ipx *addr = (struct sockaddr_ipx *)uaddr;
+ int rc = -EINVAL;
+
+ if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_ipx))
+ goto out;
+
+ intrfc = ipxitf_find_using_net(addr->sipx_network);
+ rc = -EADDRNOTAVAIL;
+ if (!intrfc)
+ goto out;
+
+ if (!addr->sipx_port) {
+ addr->sipx_port = ipx_first_free_socketnum(intrfc);
+ rc = -EINVAL;
+ if (!addr->sipx_port)
+ goto out_put;
+ }
+
+ /* protect IPX system stuff like routing/sap */
+ rc = -EACCES;
+ if (ntohs(addr->sipx_port) < IPX_MIN_EPHEMERAL_SOCKET &&
+ !capable(CAP_NET_ADMIN))
+ goto out_put;
+
+ ipxs->port = addr->sipx_port;
+
+#ifdef CONFIG_IPX_INTERN
+ if (intrfc == ipx_internal_net) {
+ /* The source address is to be set explicitly if the
+ * socket is to be bound on the internal network. If a
+ * node number 0 was specified, the default is used.
+ */
+
+ rc = -EINVAL;
+ if (!memcmp(addr->sipx_node, ipx_broadcast_node, IPX_NODE_LEN))
+ goto out_put;
+ if (!memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN))
+ memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN);
+ else
+ memcpy(ipxs->node, addr->sipx_node, IPX_NODE_LEN);
+
+ rc = -EADDRINUSE;
+ if (ipxitf_find_internal_socket(intrfc, ipxs->node,
+ ipxs->port)) {
+ SOCK_DEBUG(sk,
+ "IPX: bind failed because port %X in use.\n",
+ ntohs(addr->sipx_port));
+ goto out_put;
+ }
+ } else {
+ /* Source addresses are easy. It must be our
+ * network:node pair for an interface routed to IPX
+ * with the ipx routing ioctl()
+ */
+
+ memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN);
+
+ rc = -EADDRINUSE;
+ if (ipxitf_find_socket(intrfc, addr->sipx_port)) {
+ SOCK_DEBUG(sk,
+ "IPX: bind failed because port %X in use.\n",
+ ntohs(addr->sipx_port));
+ goto out_put;
+ }
+ }
+
+#else /* !def CONFIG_IPX_INTERN */
+
+ /* Source addresses are easy. It must be our network:node pair for
+ an interface routed to IPX with the ipx routing ioctl() */
+
+ rc = -EADDRINUSE;
+ if (ipxitf_find_socket(intrfc, addr->sipx_port)) {
+ SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n",
+ ntohs((int)addr->sipx_port));
+ goto out_put;
+ }
+
+#endif /* CONFIG_IPX_INTERN */
+
+ ipxitf_insert_socket(intrfc, sk);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ rc = 0;
+out_put:
+ ipxitf_put(intrfc);
+out:
+ return rc;
+}
+
+static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ int rc;
+
+ lock_sock(sk);
+ rc = __ipx_bind(sock, uaddr, addr_len);
+ release_sock(sk);
+
+ return rc;
+}
+
+static int ipx_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ struct sockaddr_ipx *addr;
+ int rc = -EINVAL;
+ struct ipx_route *rt;
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ lock_sock(sk);
+ if (addr_len != sizeof(*addr))
+ goto out;
+ addr = (struct sockaddr_ipx *)uaddr;
+
+ /* put the autobinding in */
+ if (!ipxs->port) {
+ struct sockaddr_ipx uaddr;
+
+ uaddr.sipx_port = 0;
+ uaddr.sipx_network = 0;
+
+#ifdef CONFIG_IPX_INTERN
+ rc = -ENETDOWN;
+ if (!ipxs->intrfc)
+ goto out; /* Someone zonked the iface */
+ memcpy(uaddr.sipx_node, ipxs->intrfc->if_node,
+ IPX_NODE_LEN);
+#endif /* CONFIG_IPX_INTERN */
+
+ rc = __ipx_bind(sock, (struct sockaddr *)&uaddr,
+ sizeof(struct sockaddr_ipx));
+ if (rc)
+ goto out;
+ }
+
+ /* We can either connect to primary network or somewhere
+ * we can route to */
+ rt = ipxrtr_lookup(addr->sipx_network);
+ rc = -ENETUNREACH;
+ if (!rt && !(!addr->sipx_network && ipx_primary_net))
+ goto out;
+
+ ipxs->dest_addr.net = addr->sipx_network;
+ ipxs->dest_addr.sock = addr->sipx_port;
+ memcpy(ipxs->dest_addr.node, addr->sipx_node, IPX_NODE_LEN);
+ ipxs->type = addr->sipx_type;
+
+ if (sock->type == SOCK_DGRAM) {
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ }
+
+ if (rt)
+ ipxrtr_put(rt);
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+
+static int ipx_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct ipx_address *addr;
+ struct sockaddr_ipx sipx;
+ struct sock *sk = sock->sk;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ int rc;
+
+ *uaddr_len = sizeof(struct sockaddr_ipx);
+
+ lock_sock(sk);
+ if (peer) {
+ rc = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ addr = &ipxs->dest_addr;
+ sipx.sipx_network = addr->net;
+ sipx.sipx_port = addr->sock;
+ memcpy(sipx.sipx_node, addr->node, IPX_NODE_LEN);
+ } else {
+ if (ipxs->intrfc) {
+ sipx.sipx_network = ipxs->intrfc->if_netnum;
+#ifdef CONFIG_IPX_INTERN
+ memcpy(sipx.sipx_node, ipxs->node, IPX_NODE_LEN);
+#else
+ memcpy(sipx.sipx_node, ipxs->intrfc->if_node,
+ IPX_NODE_LEN);
+#endif /* CONFIG_IPX_INTERN */
+
+ } else {
+ sipx.sipx_network = 0;
+ memset(sipx.sipx_node, '\0', IPX_NODE_LEN);
+ }
+
+ sipx.sipx_port = ipxs->port;
+ }
+
+ sipx.sipx_family = AF_IPX;
+ sipx.sipx_type = ipxs->type;
+ sipx.sipx_zero = 0;
+ memcpy(uaddr, &sipx, sizeof(sipx));
+
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ /* NULL here for pt means the packet was looped back */
+ struct ipx_interface *intrfc;
+ struct ipxhdr *ipx;
+ u16 ipx_pktsize;
+ int rc = 0;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ /* Not ours */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipxhdr)))
+ goto drop;
+
+ ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize);
+
+ /* Too small or invalid header? */
+ if (ipx_pktsize < sizeof(struct ipxhdr) ||
+ !pskb_may_pull(skb, ipx_pktsize))
+ goto drop;
+
+ ipx = ipx_hdr(skb);
+ if (ipx->ipx_checksum != IPX_NO_CHECKSUM &&
+ ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize))
+ goto drop;
+
+ IPX_SKB_CB(skb)->ipx_tctrl = ipx->ipx_tctrl;
+ IPX_SKB_CB(skb)->ipx_dest_net = ipx->ipx_dest.net;
+ IPX_SKB_CB(skb)->ipx_source_net = ipx->ipx_source.net;
+
+ /* Determine what local ipx endpoint this is */
+ intrfc = ipxitf_find_using_phys(dev, pt->type);
+ if (!intrfc) {
+ if (ipxcfg_auto_create_interfaces &&
+ IPX_SKB_CB(skb)->ipx_dest_net) {
+ intrfc = ipxitf_auto_create(dev, pt->type);
+ if (intrfc)
+ ipxitf_hold(intrfc);
+ }
+
+ if (!intrfc) /* Not one of ours */
+ /* or invalid packet for auto creation */
+ goto drop;
+ }
+
+ rc = ipxitf_rcv(intrfc, skb);
+ ipxitf_put(intrfc);
+ goto out;
+drop:
+ kfree_skb(skb);
+out:
+ return rc;
+}
+
+static int ipx_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_ipx *, usipx, msg->msg_name);
+ struct sockaddr_ipx local_sipx;
+ int rc = -EINVAL;
+ int flags = msg->msg_flags;
+
+ lock_sock(sk);
+ /* Socket gets bound below anyway */
+/* if (sk->sk_zapped)
+ return -EIO; */ /* Socket not bound */
+ if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+ goto out;
+
+ /* Max possible packet size limited by 16 bit pktsize in header */
+ if (len >= 65535 - sizeof(struct ipxhdr))
+ goto out;
+
+ if (usipx) {
+ if (!ipxs->port) {
+ struct sockaddr_ipx uaddr;
+
+ uaddr.sipx_port = 0;
+ uaddr.sipx_network = 0;
+#ifdef CONFIG_IPX_INTERN
+ rc = -ENETDOWN;
+ if (!ipxs->intrfc)
+ goto out; /* Someone zonked the iface */
+ memcpy(uaddr.sipx_node, ipxs->intrfc->if_node,
+ IPX_NODE_LEN);
+#endif
+ rc = __ipx_bind(sock, (struct sockaddr *)&uaddr,
+ sizeof(struct sockaddr_ipx));
+ if (rc)
+ goto out;
+ }
+
+ rc = -EINVAL;
+ if (msg->msg_namelen < sizeof(*usipx) ||
+ usipx->sipx_family != AF_IPX)
+ goto out;
+ } else {
+ rc = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ usipx = &local_sipx;
+ usipx->sipx_family = AF_IPX;
+ usipx->sipx_type = ipxs->type;
+ usipx->sipx_port = ipxs->dest_addr.sock;
+ usipx->sipx_network = ipxs->dest_addr.net;
+ memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN);
+ }
+
+ rc = ipxrtr_route_packet(sk, usipx, msg, len, flags & MSG_DONTWAIT);
+ if (rc >= 0)
+ rc = len;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+
+static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_ipx *, sipx, msg->msg_name);
+ struct ipxhdr *ipx = NULL;
+ struct sk_buff *skb;
+ int copied, rc;
+ bool locked = true;
+
+ lock_sock(sk);
+ /* put the autobinding in */
+ if (!ipxs->port) {
+ struct sockaddr_ipx uaddr;
+
+ uaddr.sipx_port = 0;
+ uaddr.sipx_network = 0;
+
+#ifdef CONFIG_IPX_INTERN
+ rc = -ENETDOWN;
+ if (!ipxs->intrfc)
+ goto out; /* Someone zonked the iface */
+ memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, IPX_NODE_LEN);
+#endif /* CONFIG_IPX_INTERN */
+
+ rc = __ipx_bind(sock, (struct sockaddr *)&uaddr,
+ sizeof(struct sockaddr_ipx));
+ if (rc)
+ goto out;
+ }
+
+ rc = -ENOTCONN;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+
+ release_sock(sk);
+ locked = false;
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &rc);
+ if (!skb) {
+ if (rc == -EAGAIN && (sk->sk_shutdown & RCV_SHUTDOWN))
+ rc = 0;
+ goto out;
+ }
+
+ ipx = ipx_hdr(skb);
+ copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr);
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
+ if (rc)
+ goto out_free;
+ if (skb->tstamp.tv64)
+ sk->sk_stamp = skb->tstamp;
+
+ if (sipx) {
+ sipx->sipx_family = AF_IPX;
+ sipx->sipx_port = ipx->ipx_source.sock;
+ memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN);
+ sipx->sipx_network = IPX_SKB_CB(skb)->ipx_source_net;
+ sipx->sipx_type = ipx->ipx_type;
+ sipx->sipx_zero = 0;
+ msg->msg_namelen = sizeof(*sipx);
+ }
+ rc = copied;
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ if (locked)
+ release_sock(sk);
+ return rc;
+}
+
+
+static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ int rc = 0;
+ long amount = 0;
+ struct sock *sk = sock->sk;
+ void __user *argp = (void __user *)arg;
+
+ lock_sock(sk);
+ switch (cmd) {
+ case TIOCOUTQ:
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ case TIOCINQ: {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ /* These two are safe on a single CPU system as only
+ * user tasks fiddle here */
+ if (skb)
+ amount = skb->len - sizeof(struct ipxhdr);
+ rc = put_user(amount, (int __user *)argp);
+ break;
+ }
+ case SIOCADDRT:
+ case SIOCDELRT:
+ rc = -EPERM;
+ if (capable(CAP_NET_ADMIN))
+ rc = ipxrtr_ioctl(cmd, argp);
+ break;
+ case SIOCSIFADDR:
+ case SIOCAIPXITFCRT:
+ case SIOCAIPXPRISLT:
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ case SIOCGIFADDR:
+ rc = ipxitf_ioctl(cmd, argp);
+ break;
+ case SIOCIPXCFGDATA:
+ rc = ipxcfg_get_config_data(argp);
+ break;
+ case SIOCIPXNCPCONN:
+ /*
+ * This socket wants to take care of the NCP connection
+ * handed to us in arg.
+ */
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ rc = get_user(ipx_sk(sk)->ipx_ncp_conn,
+ (const unsigned short __user *)argp);
+ break;
+ case SIOCGSTAMP:
+ rc = sock_get_timestamp(sk, argp);
+ break;
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ rc = -EINVAL;
+ break;
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
+
+#ifdef CONFIG_COMPAT
+static int ipx_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ /*
+ * These 4 commands use same structure on 32bit and 64bit. Rest of IPX
+ * commands is handled by generic ioctl code. As these commands are
+ * SIOCPROTOPRIVATE..SIOCPROTOPRIVATE+3, they cannot be handled by generic
+ * code.
+ */
+ switch (cmd) {
+ case SIOCAIPXITFCRT:
+ case SIOCAIPXPRISLT:
+ case SIOCIPXCFGDATA:
+ case SIOCIPXNCPCONN:
+ return ipx_ioctl(sock, cmd, arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+static int ipx_shutdown(struct socket *sock, int mode)
+{
+ struct sock *sk = sock->sk;
+
+ if (mode < SHUT_RD || mode > SHUT_RDWR)
+ return -EINVAL;
+ /* This maps:
+ * SHUT_RD (0) -> RCV_SHUTDOWN (1)
+ * SHUT_WR (1) -> SEND_SHUTDOWN (2)
+ * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
+ */
+ ++mode;
+
+ lock_sock(sk);
+ sk->sk_shutdown |= mode;
+ release_sock(sk);
+ sk->sk_state_change(sk);
+
+ return 0;
+}
+
+/*
+ * Socket family declarations
+ */
+
+static const struct net_proto_family ipx_family_ops = {
+ .family = PF_IPX,
+ .create = ipx_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops ipx_dgram_ops = {
+ .family = PF_IPX,
+ .owner = THIS_MODULE,
+ .release = ipx_release,
+ .bind = ipx_bind,
+ .connect = ipx_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = ipx_getname,
+ .poll = datagram_poll,
+ .ioctl = ipx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ipx_compat_ioctl,
+#endif
+ .listen = sock_no_listen,
+ .shutdown = ipx_shutdown,
+ .setsockopt = ipx_setsockopt,
+ .getsockopt = ipx_getsockopt,
+ .sendmsg = ipx_sendmsg,
+ .recvmsg = ipx_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct packet_type ipx_8023_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_802_3),
+ .func = ipx_rcv,
+};
+
+static struct packet_type ipx_dix_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IPX),
+ .func = ipx_rcv,
+};
+
+static struct notifier_block ipx_dev_notifier = {
+ .notifier_call = ipxitf_device_event,
+};
+
+static const unsigned char ipx_8022_type = 0xE0;
+static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
+static const char ipx_EII_err_msg[] __initconst =
+ KERN_CRIT "IPX: Unable to register with Ethernet II\n";
+static const char ipx_8023_err_msg[] __initconst =
+ KERN_CRIT "IPX: Unable to register with 802.3\n";
+static const char ipx_llc_err_msg[] __initconst =
+ KERN_CRIT "IPX: Unable to register with 802.2\n";
+static const char ipx_snap_err_msg[] __initconst =
+ KERN_CRIT "IPX: Unable to register with SNAP\n";
+
+static int __init ipx_init(void)
+{
+ int rc = proto_register(&ipx_proto, 1);
+
+ if (rc != 0)
+ goto out;
+
+ sock_register(&ipx_family_ops);
+
+ pEII_datalink = make_EII_client();
+ if (pEII_datalink)
+ dev_add_pack(&ipx_dix_packet_type);
+ else
+ printk(ipx_EII_err_msg);
+
+ p8023_datalink = make_8023_client();
+ if (p8023_datalink)
+ dev_add_pack(&ipx_8023_packet_type);
+ else
+ printk(ipx_8023_err_msg);
+
+ p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv);
+ if (!p8022_datalink)
+ printk(ipx_llc_err_msg);
+
+ pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv);
+ if (!pSNAP_datalink)
+ printk(ipx_snap_err_msg);
+
+ register_netdevice_notifier(&ipx_dev_notifier);
+ ipx_register_sysctl();
+ ipx_proc_init();
+out:
+ return rc;
+}
+
+static void __exit ipx_proto_finito(void)
+{
+ ipx_proc_exit();
+ ipx_unregister_sysctl();
+
+ unregister_netdevice_notifier(&ipx_dev_notifier);
+
+ ipxitf_cleanup();
+
+ if (pSNAP_datalink) {
+ unregister_snap_client(pSNAP_datalink);
+ pSNAP_datalink = NULL;
+ }
+
+ if (p8022_datalink) {
+ unregister_8022_client(p8022_datalink);
+ p8022_datalink = NULL;
+ }
+
+ dev_remove_pack(&ipx_8023_packet_type);
+ if (p8023_datalink) {
+ destroy_8023_client(p8023_datalink);
+ p8023_datalink = NULL;
+ }
+
+ dev_remove_pack(&ipx_dix_packet_type);
+ if (pEII_datalink) {
+ destroy_EII_client(pEII_datalink);
+ pEII_datalink = NULL;
+ }
+
+ proto_unregister(&ipx_proto);
+ sock_unregister(ipx_family_ops.family);
+}
+
+module_init(ipx_init);
+module_exit(ipx_proto_finito);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IPX);
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
new file mode 100644
index 000000000..c1d247ebe
--- /dev/null
+++ b/net/ipx/ipx_proc.c
@@ -0,0 +1,340 @@
+/*
+ * IPX proc routines
+ *
+ * Copyright(C) Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2002
+ */
+
+#include <linux/init.h>
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/tcp_states.h>
+#include <net/ipx.h>
+
+static void *ipx_seq_interface_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_bh(&ipx_interfaces_lock);
+ return seq_list_start_head(&ipx_interfaces, *pos);
+}
+
+static void *ipx_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &ipx_interfaces, pos);
+}
+
+static void ipx_seq_interface_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+static int ipx_seq_interface_show(struct seq_file *seq, void *v)
+{
+ struct ipx_interface *i;
+
+ if (v == &ipx_interfaces) {
+ seq_puts(seq, "Network Node_Address Primary Device "
+ "Frame_Type");
+#ifdef IPX_REFCNT_DEBUG
+ seq_puts(seq, " refcnt");
+#endif
+ seq_puts(seq, "\n");
+ goto out;
+ }
+
+ i = list_entry(v, struct ipx_interface, node);
+ seq_printf(seq, "%08X ", ntohl(i->if_netnum));
+ seq_printf(seq, "%02X%02X%02X%02X%02X%02X ",
+ i->if_node[0], i->if_node[1], i->if_node[2],
+ i->if_node[3], i->if_node[4], i->if_node[5]);
+ seq_printf(seq, "%-9s", i == ipx_primary_net ? "Yes" : "No");
+ seq_printf(seq, "%-11s", ipx_device_name(i));
+ seq_printf(seq, "%-9s", ipx_frame_name(i->if_dlink_type));
+#ifdef IPX_REFCNT_DEBUG
+ seq_printf(seq, "%6d", atomic_read(&i->refcnt));
+#endif
+ seq_puts(seq, "\n");
+out:
+ return 0;
+}
+
+static void *ipx_seq_route_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock_bh(&ipx_routes_lock);
+ return seq_list_start_head(&ipx_routes, *pos);
+}
+
+static void *ipx_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &ipx_routes, pos);
+}
+
+static void ipx_seq_route_stop(struct seq_file *seq, void *v)
+{
+ read_unlock_bh(&ipx_routes_lock);
+}
+
+static int ipx_seq_route_show(struct seq_file *seq, void *v)
+{
+ struct ipx_route *rt;
+
+ if (v == &ipx_routes) {
+ seq_puts(seq, "Network Router_Net Router_Node\n");
+ goto out;
+ }
+
+ rt = list_entry(v, struct ipx_route, node);
+
+ seq_printf(seq, "%08X ", ntohl(rt->ir_net));
+ if (rt->ir_routed)
+ seq_printf(seq, "%08X %02X%02X%02X%02X%02X%02X\n",
+ ntohl(rt->ir_intrfc->if_netnum),
+ rt->ir_router_node[0], rt->ir_router_node[1],
+ rt->ir_router_node[2], rt->ir_router_node[3],
+ rt->ir_router_node[4], rt->ir_router_node[5]);
+ else
+ seq_puts(seq, "Directly Connected\n");
+out:
+ return 0;
+}
+
+static __inline__ struct sock *ipx_get_socket_idx(loff_t pos)
+{
+ struct sock *s = NULL;
+ struct ipx_interface *i;
+
+ list_for_each_entry(i, &ipx_interfaces, node) {
+ spin_lock_bh(&i->if_sklist_lock);
+ sk_for_each(s, &i->if_sklist) {
+ if (!pos)
+ break;
+ --pos;
+ }
+ spin_unlock_bh(&i->if_sklist_lock);
+ if (!pos) {
+ if (s)
+ goto found;
+ break;
+ }
+ }
+ s = NULL;
+found:
+ return s;
+}
+
+static void *ipx_seq_socket_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t l = *pos;
+
+ spin_lock_bh(&ipx_interfaces_lock);
+ return l ? ipx_get_socket_idx(--l) : SEQ_START_TOKEN;
+}
+
+static void *ipx_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock* sk, *next;
+ struct ipx_interface *i;
+ struct ipx_sock *ipxs;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN) {
+ sk = NULL;
+ i = ipx_interfaces_head();
+ if (!i)
+ goto out;
+ sk = sk_head(&i->if_sklist);
+ if (sk)
+ spin_lock_bh(&i->if_sklist_lock);
+ goto out;
+ }
+ sk = v;
+ next = sk_next(sk);
+ if (next) {
+ sk = next;
+ goto out;
+ }
+ ipxs = ipx_sk(sk);
+ i = ipxs->intrfc;
+ spin_unlock_bh(&i->if_sklist_lock);
+ sk = NULL;
+ for (;;) {
+ if (i->node.next == &ipx_interfaces)
+ break;
+ i = list_entry(i->node.next, struct ipx_interface, node);
+ spin_lock_bh(&i->if_sklist_lock);
+ if (!hlist_empty(&i->if_sklist)) {
+ sk = sk_head(&i->if_sklist);
+ break;
+ }
+ spin_unlock_bh(&i->if_sklist_lock);
+ }
+out:
+ return sk;
+}
+
+static int ipx_seq_socket_show(struct seq_file *seq, void *v)
+{
+ struct sock *s;
+ struct ipx_sock *ipxs;
+
+ if (v == SEQ_START_TOKEN) {
+#ifdef CONFIG_IPX_INTERN
+ seq_puts(seq, "Local_Address "
+ "Remote_Address Tx_Queue "
+ "Rx_Queue State Uid\n");
+#else
+ seq_puts(seq, "Local_Address Remote_Address "
+ "Tx_Queue Rx_Queue State Uid\n");
+#endif
+ goto out;
+ }
+
+ s = v;
+ ipxs = ipx_sk(s);
+#ifdef CONFIG_IPX_INTERN
+ seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ",
+ ntohl(ipxs->intrfc->if_netnum),
+ ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3],
+ ipxs->node[4], ipxs->node[5], ntohs(ipxs->port));
+#else
+ seq_printf(seq, "%08X:%04X ", ntohl(ipxs->intrfc->if_netnum),
+ ntohs(ipxs->port));
+#endif /* CONFIG_IPX_INTERN */
+ if (s->sk_state != TCP_ESTABLISHED)
+ seq_printf(seq, "%-28s", "Not_Connected");
+ else {
+ seq_printf(seq, "%08X:%02X%02X%02X%02X%02X%02X:%04X ",
+ ntohl(ipxs->dest_addr.net),
+ ipxs->dest_addr.node[0], ipxs->dest_addr.node[1],
+ ipxs->dest_addr.node[2], ipxs->dest_addr.node[3],
+ ipxs->dest_addr.node[4], ipxs->dest_addr.node[5],
+ ntohs(ipxs->dest_addr.sock));
+ }
+
+ seq_printf(seq, "%08X %08X %02X %03u\n",
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_state,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)));
+out:
+ return 0;
+}
+
+static const struct seq_operations ipx_seq_interface_ops = {
+ .start = ipx_seq_interface_start,
+ .next = ipx_seq_interface_next,
+ .stop = ipx_seq_interface_stop,
+ .show = ipx_seq_interface_show,
+};
+
+static const struct seq_operations ipx_seq_route_ops = {
+ .start = ipx_seq_route_start,
+ .next = ipx_seq_route_next,
+ .stop = ipx_seq_route_stop,
+ .show = ipx_seq_route_show,
+};
+
+static const struct seq_operations ipx_seq_socket_ops = {
+ .start = ipx_seq_socket_start,
+ .next = ipx_seq_socket_next,
+ .stop = ipx_seq_interface_stop,
+ .show = ipx_seq_socket_show,
+};
+
+static int ipx_seq_route_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ipx_seq_route_ops);
+}
+
+static int ipx_seq_interface_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ipx_seq_interface_ops);
+}
+
+static int ipx_seq_socket_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ipx_seq_socket_ops);
+}
+
+static const struct file_operations ipx_seq_interface_fops = {
+ .owner = THIS_MODULE,
+ .open = ipx_seq_interface_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ipx_seq_route_fops = {
+ .owner = THIS_MODULE,
+ .open = ipx_seq_route_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ipx_seq_socket_fops = {
+ .owner = THIS_MODULE,
+ .open = ipx_seq_socket_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *ipx_proc_dir;
+
+int __init ipx_proc_init(void)
+{
+ struct proc_dir_entry *p;
+ int rc = -ENOMEM;
+
+ ipx_proc_dir = proc_mkdir("ipx", init_net.proc_net);
+
+ if (!ipx_proc_dir)
+ goto out;
+ p = proc_create("interface", S_IRUGO,
+ ipx_proc_dir, &ipx_seq_interface_fops);
+ if (!p)
+ goto out_interface;
+
+ p = proc_create("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_fops);
+ if (!p)
+ goto out_route;
+
+ p = proc_create("socket", S_IRUGO, ipx_proc_dir, &ipx_seq_socket_fops);
+ if (!p)
+ goto out_socket;
+
+ rc = 0;
+out:
+ return rc;
+out_socket:
+ remove_proc_entry("route", ipx_proc_dir);
+out_route:
+ remove_proc_entry("interface", ipx_proc_dir);
+out_interface:
+ remove_proc_entry("ipx", init_net.proc_net);
+ goto out;
+}
+
+void __exit ipx_proc_exit(void)
+{
+ remove_proc_entry("interface", ipx_proc_dir);
+ remove_proc_entry("route", ipx_proc_dir);
+ remove_proc_entry("socket", ipx_proc_dir);
+ remove_proc_entry("ipx", init_net.proc_net);
+}
+
+#else /* CONFIG_PROC_FS */
+
+int __init ipx_proc_init(void)
+{
+ return 0;
+}
+
+void __exit ipx_proc_exit(void)
+{
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c
new file mode 100644
index 000000000..3e2a32a9f
--- /dev/null
+++ b/net/ipx/ipx_route.c
@@ -0,0 +1,292 @@
+/*
+ * Implements the IPX routing routines.
+ * Code moved from af_ipx.c.
+ *
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2003
+ *
+ * See net/ipx/ChangeLog.
+ */
+
+#include <linux/list.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <net/ipx.h>
+#include <net/sock.h>
+
+LIST_HEAD(ipx_routes);
+DEFINE_RWLOCK(ipx_routes_lock);
+
+extern struct ipx_interface *ipx_internal_net;
+
+extern struct ipx_interface *ipxitf_find_using_net(__be32 net);
+extern int ipxitf_demux_socket(struct ipx_interface *intrfc,
+ struct sk_buff *skb, int copy);
+extern int ipxitf_demux_socket(struct ipx_interface *intrfc,
+ struct sk_buff *skb, int copy);
+
+struct ipx_route *ipxrtr_lookup(__be32 net)
+{
+ struct ipx_route *r;
+
+ read_lock_bh(&ipx_routes_lock);
+ list_for_each_entry(r, &ipx_routes, node)
+ if (r->ir_net == net) {
+ ipxrtr_hold(r);
+ goto unlock;
+ }
+ r = NULL;
+unlock:
+ read_unlock_bh(&ipx_routes_lock);
+ return r;
+}
+
+/*
+ * Caller must hold a reference to intrfc
+ */
+int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc,
+ unsigned char *node)
+{
+ struct ipx_route *rt;
+ int rc;
+
+ /* Get a route structure; either existing or create */
+ rt = ipxrtr_lookup(network);
+ if (!rt) {
+ rt = kmalloc(sizeof(*rt), GFP_ATOMIC);
+ rc = -EAGAIN;
+ if (!rt)
+ goto out;
+
+ atomic_set(&rt->refcnt, 1);
+ ipxrtr_hold(rt);
+ write_lock_bh(&ipx_routes_lock);
+ list_add(&rt->node, &ipx_routes);
+ write_unlock_bh(&ipx_routes_lock);
+ } else {
+ rc = -EEXIST;
+ if (intrfc == ipx_internal_net)
+ goto out_put;
+ }
+
+ rt->ir_net = network;
+ rt->ir_intrfc = intrfc;
+ if (!node) {
+ memset(rt->ir_router_node, '\0', IPX_NODE_LEN);
+ rt->ir_routed = 0;
+ } else {
+ memcpy(rt->ir_router_node, node, IPX_NODE_LEN);
+ rt->ir_routed = 1;
+ }
+
+ rc = 0;
+out_put:
+ ipxrtr_put(rt);
+out:
+ return rc;
+}
+
+void ipxrtr_del_routes(struct ipx_interface *intrfc)
+{
+ struct ipx_route *r, *tmp;
+
+ write_lock_bh(&ipx_routes_lock);
+ list_for_each_entry_safe(r, tmp, &ipx_routes, node)
+ if (r->ir_intrfc == intrfc) {
+ list_del(&r->node);
+ ipxrtr_put(r);
+ }
+ write_unlock_bh(&ipx_routes_lock);
+}
+
+static int ipxrtr_create(struct ipx_route_definition *rd)
+{
+ struct ipx_interface *intrfc;
+ int rc = -ENETUNREACH;
+
+ /* Find the appropriate interface */
+ intrfc = ipxitf_find_using_net(rd->ipx_router_network);
+ if (!intrfc)
+ goto out;
+ rc = ipxrtr_add_route(rd->ipx_network, intrfc, rd->ipx_router_node);
+ ipxitf_put(intrfc);
+out:
+ return rc;
+}
+
+static int ipxrtr_delete(__be32 net)
+{
+ struct ipx_route *r, *tmp;
+ int rc;
+
+ write_lock_bh(&ipx_routes_lock);
+ list_for_each_entry_safe(r, tmp, &ipx_routes, node)
+ if (r->ir_net == net) {
+ /* Directly connected; can't lose route */
+ rc = -EPERM;
+ if (!r->ir_routed)
+ goto out;
+ list_del(&r->node);
+ ipxrtr_put(r);
+ rc = 0;
+ goto out;
+ }
+ rc = -ENOENT;
+out:
+ write_unlock_bh(&ipx_routes_lock);
+ return rc;
+}
+
+/*
+ * The skb has to be unshared, we'll end up calling ipxitf_send, that'll
+ * modify the packet
+ */
+int ipxrtr_route_skb(struct sk_buff *skb)
+{
+ struct ipxhdr *ipx = ipx_hdr(skb);
+ struct ipx_route *r = ipxrtr_lookup(IPX_SKB_CB(skb)->ipx_dest_net);
+
+ if (!r) { /* no known route */
+ kfree_skb(skb);
+ return 0;
+ }
+
+ ipxitf_hold(r->ir_intrfc);
+ ipxitf_send(r->ir_intrfc, skb, r->ir_routed ?
+ r->ir_router_node : ipx->ipx_dest.node);
+ ipxitf_put(r->ir_intrfc);
+ ipxrtr_put(r);
+
+ return 0;
+}
+
+/*
+ * Route an outgoing frame from a socket.
+ */
+int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
+ struct msghdr *msg, size_t len, int noblock)
+{
+ struct sk_buff *skb;
+ struct ipx_sock *ipxs = ipx_sk(sk);
+ struct ipx_interface *intrfc;
+ struct ipxhdr *ipx;
+ size_t size;
+ int ipx_offset;
+ struct ipx_route *rt = NULL;
+ int rc;
+
+ /* Find the appropriate interface on which to send packet */
+ if (!usipx->sipx_network && ipx_primary_net) {
+ usipx->sipx_network = ipx_primary_net->if_netnum;
+ intrfc = ipx_primary_net;
+ } else {
+ rt = ipxrtr_lookup(usipx->sipx_network);
+ rc = -ENETUNREACH;
+ if (!rt)
+ goto out;
+ intrfc = rt->ir_intrfc;
+ }
+
+ ipxitf_hold(intrfc);
+ ipx_offset = intrfc->if_ipx_offset;
+ size = sizeof(struct ipxhdr) + len + ipx_offset;
+
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ if (!skb)
+ goto out_put;
+
+ skb_reserve(skb, ipx_offset);
+ skb->sk = sk;
+
+ /* Fill in IPX header */
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_put(skb, sizeof(struct ipxhdr));
+ ipx = ipx_hdr(skb);
+ ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr));
+ IPX_SKB_CB(skb)->ipx_tctrl = 0;
+ ipx->ipx_type = usipx->sipx_type;
+
+ IPX_SKB_CB(skb)->last_hop.index = -1;
+#ifdef CONFIG_IPX_INTERN
+ IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum;
+ memcpy(ipx->ipx_source.node, ipxs->node, IPX_NODE_LEN);
+#else
+ rc = ntohs(ipxs->port);
+ if (rc == 0x453 || rc == 0x452) {
+ /* RIP/SAP special handling for mars_nwe */
+ IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum;
+ memcpy(ipx->ipx_source.node, intrfc->if_node, IPX_NODE_LEN);
+ } else {
+ IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum;
+ memcpy(ipx->ipx_source.node, ipxs->intrfc->if_node,
+ IPX_NODE_LEN);
+ }
+#endif /* CONFIG_IPX_INTERN */
+ ipx->ipx_source.sock = ipxs->port;
+ IPX_SKB_CB(skb)->ipx_dest_net = usipx->sipx_network;
+ memcpy(ipx->ipx_dest.node, usipx->sipx_node, IPX_NODE_LEN);
+ ipx->ipx_dest.sock = usipx->sipx_port;
+
+ rc = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (rc) {
+ kfree_skb(skb);
+ goto out_put;
+ }
+
+ /* Apply checksum. Not allowed on 802.3 links. */
+ if (sk->sk_no_check_tx ||
+ intrfc->if_dlink_type == htons(IPX_FRAME_8023))
+ ipx->ipx_checksum = htons(0xFFFF);
+ else
+ ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr));
+
+ rc = ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ?
+ rt->ir_router_node : ipx->ipx_dest.node);
+out_put:
+ ipxitf_put(intrfc);
+ if (rt)
+ ipxrtr_put(rt);
+out:
+ return rc;
+}
+
+/*
+ * We use a normal struct rtentry for route handling
+ */
+int ipxrtr_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct rtentry rt; /* Use these to behave like 'other' stacks */
+ struct sockaddr_ipx *sg, *st;
+ int rc = -EFAULT;
+
+ if (copy_from_user(&rt, arg, sizeof(rt)))
+ goto out;
+
+ sg = (struct sockaddr_ipx *)&rt.rt_gateway;
+ st = (struct sockaddr_ipx *)&rt.rt_dst;
+
+ rc = -EINVAL;
+ if (!(rt.rt_flags & RTF_GATEWAY) || /* Direct routes are fixed */
+ sg->sipx_family != AF_IPX ||
+ st->sipx_family != AF_IPX)
+ goto out;
+
+ switch (cmd) {
+ case SIOCDELRT:
+ rc = ipxrtr_delete(st->sipx_network);
+ break;
+ case SIOCADDRT: {
+ struct ipx_route_definition f;
+ f.ipx_network = st->sipx_network;
+ f.ipx_router_network = sg->sipx_network;
+ memcpy(f.ipx_router_node, sg->sipx_node, IPX_NODE_LEN);
+ rc = ipxrtr_create(&f);
+ break;
+ }
+ }
+
+out:
+ return rc;
+}
diff --git a/net/ipx/pe2.c b/net/ipx/pe2.c
new file mode 100644
index 000000000..32dcd601a
--- /dev/null
+++ b/net/ipx/pe2.c
@@ -0,0 +1,35 @@
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/datalink.h>
+
+static int pEII_request(struct datalink_proto *dl,
+ struct sk_buff *skb, unsigned char *dest_node)
+{
+ struct net_device *dev = skb->dev;
+
+ skb->protocol = htons(ETH_P_IPX);
+ dev_hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len);
+ return dev_queue_xmit(skb);
+}
+
+struct datalink_proto *make_EII_client(void)
+{
+ struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
+
+ if (proto) {
+ proto->header_length = 0;
+ proto->request = pEII_request;
+ }
+
+ return proto;
+}
+
+void destroy_EII_client(struct datalink_proto *dl)
+{
+ kfree(dl);
+}
diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c
new file mode 100644
index 000000000..0dafcc561
--- /dev/null
+++ b/net/ipx/sysctl_net_ipx.c
@@ -0,0 +1,39 @@
+/* -*- linux-c -*-
+ * sysctl_net_ipx.c: sysctl interface to net IPX subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipx directory entry (empty =) ). [MS]
+ * Added /proc/sys/net/ipx/ipx_pprop_broadcasting - acme March 4, 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+#include <net/ipx.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+static struct ctl_table ipx_table[] = {
+ {
+ .procname = "ipx_pprop_broadcasting",
+ .data = &sysctl_ipx_pprop_broadcasting,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { },
+};
+
+static struct ctl_table_header *ipx_table_header;
+
+void ipx_register_sysctl(void)
+{
+ ipx_table_header = register_net_sysctl(&init_net, "net/ipx", ipx_table);
+}
+
+void ipx_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(ipx_table_header);
+}
diff --git a/net/irda/Kconfig b/net/irda/Kconfig
new file mode 100644
index 000000000..c8671a7ff
--- /dev/null
+++ b/net/irda/Kconfig
@@ -0,0 +1,96 @@
+#
+# IrDA protocol configuration
+#
+
+menuconfig IRDA
+ depends on NET && !S390
+ tristate "IrDA (infrared) subsystem support"
+ select CRC_CCITT
+ ---help---
+ Say Y here if you want to build support for the IrDA (TM) protocols.
+ The Infrared Data Associations (tm) specifies standards for wireless
+ infrared communication and is supported by most laptops and PDA's.
+
+ To use Linux support for the IrDA (tm) protocols, you will also need
+ some user-space utilities like irattach. For more information, see
+ the file <file:Documentation/networking/irda.txt>. You also want to
+ read the IR-HOWTO, available at
+ <http://www.tldp.org/docs.html#howto>.
+
+ If you want to exchange bits of data (vCal, vCard) with a PDA, you
+ will need to install some OBEX application, such as OpenObex :
+ <http://sourceforge.net/projects/openobex/>
+
+ To compile this support as a module, choose M here: the module will
+ be called irda.
+
+comment "IrDA protocols"
+ depends on IRDA
+
+source "net/irda/irlan/Kconfig"
+
+source "net/irda/irnet/Kconfig"
+
+source "net/irda/ircomm/Kconfig"
+
+config IRDA_ULTRA
+ bool "Ultra (connectionless) protocol"
+ depends on IRDA
+ help
+ Say Y here to support the connectionless Ultra IRDA protocol.
+ Ultra allows to exchange data over IrDA with really simple devices
+ (watch, beacon) without the overhead of the IrDA protocol (no handshaking,
+ no management frames, simple fixed header).
+ Ultra is available as a special socket : socket(AF_IRDA, SOCK_DGRAM, 1);
+
+comment "IrDA options"
+ depends on IRDA
+
+config IRDA_CACHE_LAST_LSAP
+ bool "Cache last LSAP"
+ depends on IRDA
+ help
+ Say Y here if you want IrLMP to cache the last LSAP used. This
+ makes sense since most frames will be sent/received on the same
+ connection. Enabling this option will save a hash-lookup per frame.
+
+ If unsure, say Y.
+
+config IRDA_FAST_RR
+ bool "Fast RRs (low latency)"
+ depends on IRDA
+ ---help---
+ Say Y here is you want IrLAP to send fast RR (Receive Ready) frames
+ when acting as a primary station.
+ Disabling this option will make latency over IrDA very bad. Enabling
+ this option will make the IrDA stack send more packet than strictly
+ necessary, thus reduce your battery life (but not that much).
+
+ Fast RR will make IrLAP send out a RR frame immediately when
+ receiving a frame if its own transmit queue is currently empty. This
+ will give a lot of speed improvement when receiving much data since
+ the secondary station will not have to wait the max. turn around
+ time (usually 500ms) before it is allowed to transmit the next time.
+ If the transmit queue of the secondary is also empty, the primary will
+ start backing-off before sending another RR frame, waiting longer
+ each time until the back-off reaches the max. turn around time.
+ This back-off increase in controlled via
+ /proc/sys/net/irda/fast_poll_increase
+
+ If unsure, say Y.
+
+config IRDA_DEBUG
+ bool "Debug information"
+ depends on IRDA
+ help
+ Say Y here if you want the IrDA subsystem to write debug information
+ to your syslog. You can change the debug level in
+ /proc/sys/net/irda/debug .
+ When this option is enabled, the IrDA also perform many extra internal
+ verifications which will usually prevent the kernel to crash in case of
+ bugs.
+
+ If unsure, say Y (since it makes it easier to find the bugs).
+
+source "drivers/net/irda/Kconfig"
+
diff --git a/net/irda/Makefile b/net/irda/Makefile
new file mode 100644
index 000000000..187f6c563
--- /dev/null
+++ b/net/irda/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux IrDA protocol layer.
+#
+
+obj-$(CONFIG_IRDA) += irda.o
+obj-$(CONFIG_IRLAN) += irlan/
+obj-$(CONFIG_IRNET) += irnet/
+obj-$(CONFIG_IRCOMM) += ircomm/
+
+irda-y := iriap.o iriap_event.o irlmp.o irlmp_event.o irlmp_frame.o \
+ irlap.o irlap_event.o irlap_frame.o timer.o qos.o irqueue.o \
+ irttp.o irda_device.o irias_object.o wrapper.o af_irda.o \
+ discovery.o parameters.o irnetlink.o irmod.o
+irda-$(CONFIG_PROC_FS) += irproc.o
+irda-$(CONFIG_SYSCTL) += irsysctl.o
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
new file mode 100644
index 000000000..ee0ea25c8
--- /dev/null
+++ b/net/irda/af_irda.c
@@ -0,0 +1,2716 @@
+/*********************************************************************
+ *
+ * Filename: af_irda.c
+ * Version: 0.9
+ * Description: IrDA sockets implementation
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun May 31 10:12:43 1998
+ * Modified at: Sat Dec 25 21:10:23 1999
+ * Modified by: Dag Brattli <dag@brattli.net>
+ * Sources: af_netroom.c, af_ax25.c, af_rose.c, af_x25.c etc.
+ *
+ * Copyright (c) 1999 Dag Brattli <dagb@cs.uit.no>
+ * Copyright (c) 1999-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Linux-IrDA now supports four different types of IrDA sockets:
+ *
+ * o SOCK_STREAM: TinyTP connections with SAR disabled. The
+ * max SDU size is 0 for conn. of this type
+ * o SOCK_SEQPACKET: TinyTP connections with SAR enabled. TTP may
+ * fragment the messages, but will preserve
+ * the message boundaries
+ * o SOCK_DGRAM: IRDAPROTO_UNITDATA: TinyTP connections with Unitdata
+ * (unreliable) transfers
+ * IRDAPROTO_ULTRA: Connectionless and unreliable data
+ *
+ ********************************************************************/
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/irda.h>
+#include <linux/poll.h>
+
+#include <asm/ioctls.h> /* TIOCOUTQ, TIOCINQ */
+#include <asm/uaccess.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <net/irda/af_irda.h>
+
+static int irda_create(struct net *net, struct socket *sock, int protocol, int kern);
+
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
+
+#ifdef CONFIG_IRDA_ULTRA
+static const struct proto_ops irda_ultra_ops;
+#define ULTRA_MAX_DATA 382
+#endif /* CONFIG_IRDA_ULTRA */
+
+#define IRDA_MAX_HEADER (TTP_MAX_HEADER)
+
+/*
+ * Function irda_data_indication (instance, sap, skb)
+ *
+ * Received some data from TinyTP. Just queue it on the receive queue
+ *
+ */
+static int irda_data_indication(void *instance, void *sap, struct sk_buff *skb)
+{
+ struct irda_sock *self;
+ struct sock *sk;
+ int err;
+
+ self = instance;
+ sk = instance;
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err) {
+ pr_debug("%s(), error: no more mem!\n", __func__);
+ self->rx_flow = FLOW_STOP;
+
+ /* When we return error, TTP will need to requeue the skb */
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Function irda_disconnect_indication (instance, sap, reason, skb)
+ *
+ * Connection has been closed. Check reason to find out why
+ *
+ */
+static void irda_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason, struct sk_buff *skb)
+{
+ struct irda_sock *self;
+ struct sock *sk;
+
+ self = instance;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ /* Don't care about it, but let's not leak it */
+ if(skb)
+ dev_kfree_skb(skb);
+
+ sk = instance;
+ if (sk == NULL) {
+ pr_debug("%s(%p) : BUG : sk is NULL\n",
+ __func__, self);
+ return;
+ }
+
+ /* Prevent race conditions with irda_release() and irda_shutdown() */
+ bh_lock_sock(sk);
+ if (!sock_flag(sk, SOCK_DEAD) && sk->sk_state != TCP_CLOSE) {
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ sk->sk_state_change(sk);
+
+ /* Close our TSAP.
+ * If we leave it open, IrLMP put it back into the list of
+ * unconnected LSAPs. The problem is that any incoming request
+ * can then be matched to this socket (and it will be, because
+ * it is at the head of the list). This would prevent any
+ * listening socket waiting on the same TSAP to get those
+ * requests. Some apps forget to close sockets, or hang to it
+ * a bit too long, so we may stay in this dead state long
+ * enough to be noticed...
+ * Note : all socket function do check sk->sk_state, so we are
+ * safe...
+ * Jean II
+ */
+ if (self->tsap) {
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+ }
+ bh_unlock_sock(sk);
+
+ /* Note : once we are there, there is not much you want to do
+ * with the socket anymore, apart from closing it.
+ * For example, bind() and connect() won't reset sk->sk_err,
+ * sk->sk_shutdown and sk->sk_flags to valid values...
+ * Jean II
+ */
+}
+
+/*
+ * Function irda_connect_confirm (instance, sap, qos, max_sdu_size, skb)
+ *
+ * Connections has been confirmed by the remote device
+ *
+ */
+static void irda_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size, __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct irda_sock *self;
+ struct sock *sk;
+
+ self = instance;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ sk = instance;
+ if (sk == NULL) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ dev_kfree_skb(skb);
+ // Should be ??? skb_queue_tail(&sk->sk_receive_queue, skb);
+
+ /* How much header space do we need to reserve */
+ self->max_header_size = max_header_size;
+
+ /* IrTTP max SDU size in transmit direction */
+ self->max_sdu_size_tx = max_sdu_size;
+
+ /* Find out what the largest chunk of data that we can transmit is */
+ switch (sk->sk_type) {
+ case SOCK_STREAM:
+ if (max_sdu_size != 0) {
+ net_err_ratelimited("%s: max_sdu_size must be 0\n",
+ __func__);
+ return;
+ }
+ self->max_data_size = irttp_get_max_seg_size(self->tsap);
+ break;
+ case SOCK_SEQPACKET:
+ if (max_sdu_size == 0) {
+ net_err_ratelimited("%s: max_sdu_size cannot be 0\n",
+ __func__);
+ return;
+ }
+ self->max_data_size = max_sdu_size;
+ break;
+ default:
+ self->max_data_size = irttp_get_max_seg_size(self->tsap);
+ }
+
+ pr_debug("%s(), max_data_size=%d\n", __func__,
+ self->max_data_size);
+
+ memcpy(&self->qos_tx, qos, sizeof(struct qos_info));
+
+ /* We are now connected! */
+ sk->sk_state = TCP_ESTABLISHED;
+ sk->sk_state_change(sk);
+}
+
+/*
+ * Function irda_connect_indication(instance, sap, qos, max_sdu_size, userdata)
+ *
+ * Incoming connection
+ *
+ */
+static void irda_connect_indication(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_sdu_size,
+ __u8 max_header_size, struct sk_buff *skb)
+{
+ struct irda_sock *self;
+ struct sock *sk;
+
+ self = instance;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ sk = instance;
+ if (sk == NULL) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ /* How much header space do we need to reserve */
+ self->max_header_size = max_header_size;
+
+ /* IrTTP max SDU size in transmit direction */
+ self->max_sdu_size_tx = max_sdu_size;
+
+ /* Find out what the largest chunk of data that we can transmit is */
+ switch (sk->sk_type) {
+ case SOCK_STREAM:
+ if (max_sdu_size != 0) {
+ net_err_ratelimited("%s: max_sdu_size must be 0\n",
+ __func__);
+ kfree_skb(skb);
+ return;
+ }
+ self->max_data_size = irttp_get_max_seg_size(self->tsap);
+ break;
+ case SOCK_SEQPACKET:
+ if (max_sdu_size == 0) {
+ net_err_ratelimited("%s: max_sdu_size cannot be 0\n",
+ __func__);
+ kfree_skb(skb);
+ return;
+ }
+ self->max_data_size = max_sdu_size;
+ break;
+ default:
+ self->max_data_size = irttp_get_max_seg_size(self->tsap);
+ }
+
+ pr_debug("%s(), max_data_size=%d\n", __func__,
+ self->max_data_size);
+
+ memcpy(&self->qos_tx, qos, sizeof(struct qos_info));
+
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_state_change(sk);
+}
+
+/*
+ * Function irda_connect_response (handle)
+ *
+ * Accept incoming connection
+ *
+ */
+static void irda_connect_response(struct irda_sock *self)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, GFP_KERNEL);
+ if (skb == NULL) {
+ pr_debug("%s() Unable to allocate sk_buff!\n",
+ __func__);
+ return;
+ }
+
+ /* Reserve space for MUX_CONTROL and LAP header */
+ skb_reserve(skb, IRDA_MAX_HEADER);
+
+ irttp_connect_response(self->tsap, self->max_sdu_size_rx, skb);
+}
+
+/*
+ * Function irda_flow_indication (instance, sap, flow)
+ *
+ * Used by TinyTP to tell us if it can accept more data or not
+ *
+ */
+static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+ struct irda_sock *self;
+ struct sock *sk;
+
+ self = instance;
+ sk = instance;
+ BUG_ON(sk == NULL);
+
+ switch (flow) {
+ case FLOW_STOP:
+ pr_debug("%s(), IrTTP wants us to slow down\n",
+ __func__);
+ self->tx_flow = flow;
+ break;
+ case FLOW_START:
+ self->tx_flow = flow;
+ pr_debug("%s(), IrTTP wants us to start again\n",
+ __func__);
+ wake_up_interruptible(sk_sleep(sk));
+ break;
+ default:
+ pr_debug("%s(), Unknown flow command!\n", __func__);
+ /* Unknown flow command, better stop */
+ self->tx_flow = flow;
+ break;
+ }
+}
+
+/*
+ * Function irda_getvalue_confirm (obj_id, value, priv)
+ *
+ * Got answer from remote LM-IAS, just pass object to requester...
+ *
+ * Note : duplicate from above, but we need our own version that
+ * doesn't touch the dtsap_sel and save the full value structure...
+ */
+static void irda_getvalue_confirm(int result, __u16 obj_id,
+ struct ias_value *value, void *priv)
+{
+ struct irda_sock *self;
+
+ self = priv;
+ if (!self) {
+ net_warn_ratelimited("%s: lost myself!\n", __func__);
+ return;
+ }
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ /* We probably don't need to make any more queries */
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+
+ /* Check if request succeeded */
+ if (result != IAS_SUCCESS) {
+ pr_debug("%s(), IAS query failed! (%d)\n", __func__,
+ result);
+
+ self->errno = result; /* We really need it later */
+
+ /* Wake up any processes waiting for result */
+ wake_up_interruptible(&self->query_wait);
+
+ return;
+ }
+
+ /* Pass the object to the caller (so the caller must delete it) */
+ self->ias_result = value;
+ self->errno = 0;
+
+ /* Wake up any processes waiting for result */
+ wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_selective_discovery_indication (discovery)
+ *
+ * Got a selective discovery indication from IrLMP.
+ *
+ * IrLMP is telling us that this node is new and matching our hint bit
+ * filter. Wake up any process waiting for answer...
+ */
+static void irda_selective_discovery_indication(discinfo_t *discovery,
+ DISCOVERY_MODE mode,
+ void *priv)
+{
+ struct irda_sock *self;
+
+ self = priv;
+ if (!self) {
+ net_warn_ratelimited("%s: lost myself!\n", __func__);
+ return;
+ }
+
+ /* Pass parameter to the caller */
+ self->cachedaddr = discovery->daddr;
+
+ /* Wake up process if its waiting for device to be discovered */
+ wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_discovery_timeout (priv)
+ *
+ * Timeout in the selective discovery process
+ *
+ * We were waiting for a node to be discovered, but nothing has come up
+ * so far. Wake up the user and tell him that we failed...
+ */
+static void irda_discovery_timeout(u_long priv)
+{
+ struct irda_sock *self;
+
+ self = (struct irda_sock *) priv;
+ BUG_ON(self == NULL);
+
+ /* Nothing for the caller */
+ self->cachelog = NULL;
+ self->cachedaddr = 0;
+ self->errno = -ETIME;
+
+ /* Wake up process if its still waiting... */
+ wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_open_tsap (self)
+ *
+ * Open local Transport Service Access Point (TSAP)
+ *
+ */
+static int irda_open_tsap(struct irda_sock *self, __u8 tsap_sel, char *name)
+{
+ notify_t notify;
+
+ if (self->tsap) {
+ pr_debug("%s: busy!\n", __func__);
+ return -EBUSY;
+ }
+
+ /* Initialize callbacks to be used by the IrDA stack */
+ irda_notify_init(&notify);
+ notify.connect_confirm = irda_connect_confirm;
+ notify.connect_indication = irda_connect_indication;
+ notify.disconnect_indication = irda_disconnect_indication;
+ notify.data_indication = irda_data_indication;
+ notify.udata_indication = irda_data_indication;
+ notify.flow_indication = irda_flow_indication;
+ notify.instance = self;
+ strncpy(notify.name, name, NOTIFY_MAX_NAME);
+
+ self->tsap = irttp_open_tsap(tsap_sel, DEFAULT_INITIAL_CREDIT,
+ &notify);
+ if (self->tsap == NULL) {
+ pr_debug("%s(), Unable to allocate TSAP!\n",
+ __func__);
+ return -ENOMEM;
+ }
+ /* Remember which TSAP selector we actually got */
+ self->stsap_sel = self->tsap->stsap_sel;
+
+ return 0;
+}
+
+/*
+ * Function irda_open_lsap (self)
+ *
+ * Open local Link Service Access Point (LSAP). Used for opening Ultra
+ * sockets
+ */
+#ifdef CONFIG_IRDA_ULTRA
+static int irda_open_lsap(struct irda_sock *self, int pid)
+{
+ notify_t notify;
+
+ if (self->lsap) {
+ net_warn_ratelimited("%s(), busy!\n", __func__);
+ return -EBUSY;
+ }
+
+ /* Initialize callbacks to be used by the IrDA stack */
+ irda_notify_init(&notify);
+ notify.udata_indication = irda_data_indication;
+ notify.instance = self;
+ strncpy(notify.name, "Ultra", NOTIFY_MAX_NAME);
+
+ self->lsap = irlmp_open_lsap(LSAP_CONNLESS, &notify, pid);
+ if (self->lsap == NULL) {
+ pr_debug("%s(), Unable to allocate LSAP!\n", __func__);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irda_find_lsap_sel (self, name)
+ *
+ * Try to lookup LSAP selector in remote LM-IAS
+ *
+ * Basically, we start a IAP query, and then go to sleep. When the query
+ * return, irda_getvalue_confirm will wake us up, and we can examine the
+ * result of the query...
+ * Note that in some case, the query fail even before we go to sleep,
+ * creating some races...
+ */
+static int irda_find_lsap_sel(struct irda_sock *self, char *name)
+{
+ pr_debug("%s(%p, %s)\n", __func__, self, name);
+
+ if (self->iriap) {
+ net_warn_ratelimited("%s(): busy with a previous query\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ irda_getvalue_confirm);
+ if(self->iriap == NULL)
+ return -ENOMEM;
+
+ /* Treat unexpected wakeup as disconnect */
+ self->errno = -EHOSTUNREACH;
+
+ /* Query remote LM-IAS */
+ iriap_getvaluebyclass_request(self->iriap, self->saddr, self->daddr,
+ name, "IrDA:TinyTP:LsapSel");
+
+ /* Wait for answer, if not yet finished (or failed) */
+ if (wait_event_interruptible(self->query_wait, (self->iriap==NULL)))
+ /* Treat signals as disconnect */
+ return -EHOSTUNREACH;
+
+ /* Check what happened */
+ if (self->errno)
+ {
+ /* Requested object/attribute doesn't exist */
+ if((self->errno == IAS_CLASS_UNKNOWN) ||
+ (self->errno == IAS_ATTRIB_UNKNOWN))
+ return -EADDRNOTAVAIL;
+ else
+ return -EHOSTUNREACH;
+ }
+
+ /* Get the remote TSAP selector */
+ switch (self->ias_result->type) {
+ case IAS_INTEGER:
+ pr_debug("%s() int=%d\n",
+ __func__, self->ias_result->t.integer);
+
+ if (self->ias_result->t.integer != -1)
+ self->dtsap_sel = self->ias_result->t.integer;
+ else
+ self->dtsap_sel = 0;
+ break;
+ default:
+ self->dtsap_sel = 0;
+ pr_debug("%s(), bad type!\n", __func__);
+ break;
+ }
+ if (self->ias_result)
+ irias_delete_value(self->ias_result);
+
+ if (self->dtsap_sel)
+ return 0;
+
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Function irda_discover_daddr_and_lsap_sel (self, name)
+ *
+ * This try to find a device with the requested service.
+ *
+ * It basically look into the discovery log. For each address in the list,
+ * it queries the LM-IAS of the device to find if this device offer
+ * the requested service.
+ * If there is more than one node supporting the service, we complain
+ * to the user (it should move devices around).
+ * The, we set both the destination address and the lsap selector to point
+ * on the service on the unique device we have found.
+ *
+ * Note : this function fails if there is more than one device in range,
+ * because IrLMP doesn't disconnect the LAP when the last LSAP is closed.
+ * Moreover, we would need to wait the LAP disconnection...
+ */
+static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
+{
+ discinfo_t *discoveries; /* Copy of the discovery log */
+ int number; /* Number of nodes in the log */
+ int i;
+ int err = -ENETUNREACH;
+ __u32 daddr = DEV_ADDR_ANY; /* Address we found the service on */
+ __u8 dtsap_sel = 0x0; /* TSAP associated with it */
+
+ pr_debug("%s(), name=%s\n", __func__, name);
+
+ /* Ask lmp for the current discovery log
+ * Note : we have to use irlmp_get_discoveries(), as opposed
+ * to play with the cachelog directly, because while we are
+ * making our ias query, le log might change... */
+ discoveries = irlmp_get_discoveries(&number, self->mask.word,
+ self->nslots);
+ /* Check if the we got some results */
+ if (discoveries == NULL)
+ return -ENETUNREACH; /* No nodes discovered */
+
+ /*
+ * Now, check all discovered devices (if any), and connect
+ * client only about the services that the client is
+ * interested in...
+ */
+ for(i = 0; i < number; i++) {
+ /* Try the address in the log */
+ self->daddr = discoveries[i].daddr;
+ self->saddr = 0x0;
+ pr_debug("%s(), trying daddr = %08x\n",
+ __func__, self->daddr);
+
+ /* Query remote LM-IAS for this service */
+ err = irda_find_lsap_sel(self, name);
+ switch (err) {
+ case 0:
+ /* We found the requested service */
+ if(daddr != DEV_ADDR_ANY) {
+ pr_debug("%s(), discovered service ''%s'' in two different devices !!!\n",
+ __func__, name);
+ self->daddr = DEV_ADDR_ANY;
+ kfree(discoveries);
+ return -ENOTUNIQ;
+ }
+ /* First time we found that one, save it ! */
+ daddr = self->daddr;
+ dtsap_sel = self->dtsap_sel;
+ break;
+ case -EADDRNOTAVAIL:
+ /* Requested service simply doesn't exist on this node */
+ break;
+ default:
+ /* Something bad did happen :-( */
+ pr_debug("%s(), unexpected IAS query failure\n",
+ __func__);
+ self->daddr = DEV_ADDR_ANY;
+ kfree(discoveries);
+ return -EHOSTUNREACH;
+ }
+ }
+ /* Cleanup our copy of the discovery log */
+ kfree(discoveries);
+
+ /* Check out what we found */
+ if(daddr == DEV_ADDR_ANY) {
+ pr_debug("%s(), cannot discover service ''%s'' in any device !!!\n",
+ __func__, name);
+ self->daddr = DEV_ADDR_ANY;
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Revert back to discovered device & service */
+ self->daddr = daddr;
+ self->saddr = 0x0;
+ self->dtsap_sel = dtsap_sel;
+
+ pr_debug("%s(), discovered requested service ''%s'' at address %08x\n",
+ __func__, name, self->daddr);
+
+ return 0;
+}
+
+/*
+ * Function irda_getname (sock, uaddr, uaddr_len, peer)
+ *
+ * Return the our own, or peers socket address (sockaddr_irda)
+ *
+ */
+static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_irda saddr;
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+
+ memset(&saddr, 0, sizeof(saddr));
+ if (peer) {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ saddr.sir_family = AF_IRDA;
+ saddr.sir_lsap_sel = self->dtsap_sel;
+ saddr.sir_addr = self->daddr;
+ } else {
+ saddr.sir_family = AF_IRDA;
+ saddr.sir_lsap_sel = self->stsap_sel;
+ saddr.sir_addr = self->saddr;
+ }
+
+ pr_debug("%s(), tsap_sel = %#x\n", __func__, saddr.sir_lsap_sel);
+ pr_debug("%s(), addr = %08x\n", __func__, saddr.sir_addr);
+
+ /* uaddr_len come to us uninitialised */
+ *uaddr_len = sizeof (struct sockaddr_irda);
+ memcpy(uaddr, &saddr, *uaddr_len);
+
+ return 0;
+}
+
+/*
+ * Function irda_listen (sock, backlog)
+ *
+ * Just move to the listen state
+ *
+ */
+static int irda_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
+ (sk->sk_type != SOCK_DGRAM))
+ goto out;
+
+ if (sk->sk_state != TCP_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+
+ err = 0;
+ }
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+/*
+ * Function irda_bind (sock, uaddr, addr_len)
+ *
+ * Used by servers to register their well known TSAP
+ *
+ */
+static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr;
+ struct irda_sock *self = irda_sk(sk);
+ int err;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ if (addr_len != sizeof(struct sockaddr_irda))
+ return -EINVAL;
+
+ lock_sock(sk);
+#ifdef CONFIG_IRDA_ULTRA
+ /* Special care for Ultra sockets */
+ if ((sk->sk_type == SOCK_DGRAM) &&
+ (sk->sk_protocol == IRDAPROTO_ULTRA)) {
+ self->pid = addr->sir_lsap_sel;
+ err = -EOPNOTSUPP;
+ if (self->pid & 0x80) {
+ pr_debug("%s(), extension in PID not supp!\n",
+ __func__);
+ goto out;
+ }
+ err = irda_open_lsap(self, self->pid);
+ if (err < 0)
+ goto out;
+
+ /* Pretend we are connected */
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ err = 0;
+
+ goto out;
+ }
+#endif /* CONFIG_IRDA_ULTRA */
+
+ self->ias_obj = irias_new_object(addr->sir_name, jiffies);
+ err = -ENOMEM;
+ if (self->ias_obj == NULL)
+ goto out;
+
+ err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
+ if (err < 0) {
+ irias_delete_object(self->ias_obj);
+ self->ias_obj = NULL;
+ goto out;
+ }
+
+ /* Register with LM-IAS */
+ irias_add_integer_attrib(self->ias_obj, "IrDA:TinyTP:LsapSel",
+ self->stsap_sel, IAS_KERNEL_ATTR);
+ irias_insert_object(self->ias_obj);
+
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Function irda_accept (sock, newsock, flags)
+ *
+ * Wait for incoming connection
+ *
+ */
+static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *new, *self = irda_sk(sk);
+ struct sock *newsk;
+ struct sk_buff *skb;
+ int err;
+
+ err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
+ if (err)
+ return err;
+
+ err = -EINVAL;
+
+ lock_sock(sk);
+ if (sock->state != SS_UNCONNECTED)
+ goto out;
+
+ if ((sk = sock->sk) == NULL)
+ goto out;
+
+ err = -EOPNOTSUPP;
+ if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
+ (sk->sk_type != SOCK_DGRAM))
+ goto out;
+
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out;
+
+ /*
+ * The read queue this time is holding sockets ready to use
+ * hooked into the SABM we saved
+ */
+
+ /*
+ * We can perform the accept only if there is incoming data
+ * on the listening socket.
+ * So, we will block the caller until we receive any data.
+ * If the caller was waiting on select() or poll() before
+ * calling us, the data is waiting for us ;-)
+ * Jean II
+ */
+ while (1) {
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb)
+ break;
+
+ /* Non blocking operation */
+ err = -EWOULDBLOCK;
+ if (flags & O_NONBLOCK)
+ goto out;
+
+ err = wait_event_interruptible(*(sk_sleep(sk)),
+ skb_peek(&sk->sk_receive_queue));
+ if (err)
+ goto out;
+ }
+
+ newsk = newsock->sk;
+ err = -EIO;
+ if (newsk == NULL)
+ goto out;
+
+ newsk->sk_state = TCP_ESTABLISHED;
+
+ new = irda_sk(newsk);
+
+ /* Now attach up the new socket */
+ new->tsap = irttp_dup(self->tsap, new);
+ err = -EPERM; /* value does not seem to make sense. -arnd */
+ if (!new->tsap) {
+ pr_debug("%s(), dup failed!\n", __func__);
+ kfree_skb(skb);
+ goto out;
+ }
+
+ new->stsap_sel = new->tsap->stsap_sel;
+ new->dtsap_sel = new->tsap->dtsap_sel;
+ new->saddr = irttp_get_saddr(new->tsap);
+ new->daddr = irttp_get_daddr(new->tsap);
+
+ new->max_sdu_size_tx = self->max_sdu_size_tx;
+ new->max_sdu_size_rx = self->max_sdu_size_rx;
+ new->max_data_size = self->max_data_size;
+ new->max_header_size = self->max_header_size;
+
+ memcpy(&new->qos_tx, &self->qos_tx, sizeof(struct qos_info));
+
+ /* Clean up the original one to keep it in listen state */
+ irttp_listen(self->tsap);
+
+ kfree_skb(skb);
+ sk->sk_ack_backlog--;
+
+ newsock->state = SS_CONNECTED;
+
+ irda_connect_response(new);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Function irda_connect (sock, uaddr, addr_len, flags)
+ *
+ * Connect to a IrDA device
+ *
+ * The main difference with a "standard" connect is that with IrDA we need
+ * to resolve the service name into a TSAP selector (in TCP, port number
+ * doesn't have to be resolved).
+ * Because of this service name resolution, we can offer "auto-connect",
+ * where we connect to a service without specifying a destination address.
+ *
+ * Note : by consulting "errno", the user space caller may learn the cause
+ * of the failure. Most of them are visible in the function, others may come
+ * from subroutines called and are listed here :
+ * o EBUSY : already processing a connect
+ * o EHOSTUNREACH : bad addr->sir_addr argument
+ * o EADDRNOTAVAIL : bad addr->sir_name argument
+ * o ENOTUNIQ : more than one node has addr->sir_name (auto-connect)
+ * o ENETUNREACH : no node found on the network (auto-connect)
+ */
+static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr;
+ struct irda_sock *self = irda_sk(sk);
+ int err;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ lock_sock(sk);
+ /* Don't allow connect for Ultra sockets */
+ err = -ESOCKTNOSUPPORT;
+ if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA))
+ goto out;
+
+ if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+ sock->state = SS_CONNECTED;
+ err = 0;
+ goto out; /* Connect completed during a ERESTARTSYS event */
+ }
+
+ if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+ sock->state = SS_UNCONNECTED;
+ err = -ECONNREFUSED;
+ goto out;
+ }
+
+ err = -EISCONN; /* No reconnect on a seqpacket socket */
+ if (sk->sk_state == TCP_ESTABLISHED)
+ goto out;
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ err = -EINVAL;
+ if (addr_len != sizeof(struct sockaddr_irda))
+ goto out;
+
+ /* Check if user supplied any destination device address */
+ if ((!addr->sir_addr) || (addr->sir_addr == DEV_ADDR_ANY)) {
+ /* Try to find one suitable */
+ err = irda_discover_daddr_and_lsap_sel(self, addr->sir_name);
+ if (err) {
+ pr_debug("%s(), auto-connect failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ /* Use the one provided by the user */
+ self->daddr = addr->sir_addr;
+ pr_debug("%s(), daddr = %08x\n", __func__, self->daddr);
+
+ /* If we don't have a valid service name, we assume the
+ * user want to connect on a specific LSAP. Prevent
+ * the use of invalid LSAPs (IrLMP 1.1 p10). Jean II */
+ if((addr->sir_name[0] != '\0') ||
+ (addr->sir_lsap_sel >= 0x70)) {
+ /* Query remote LM-IAS using service name */
+ err = irda_find_lsap_sel(self, addr->sir_name);
+ if (err) {
+ pr_debug("%s(), connect failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ /* Directly connect to the remote LSAP
+ * specified by the sir_lsap field.
+ * Please use with caution, in IrDA LSAPs are
+ * dynamic and there is no "well-known" LSAP. */
+ self->dtsap_sel = addr->sir_lsap_sel;
+ }
+ }
+
+ /* Check if we have opened a local TSAP */
+ if (!self->tsap)
+ irda_open_tsap(self, LSAP_ANY, addr->sir_name);
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+
+ /* Connect to remote device */
+ err = irttp_connect_request(self->tsap, self->dtsap_sel,
+ self->saddr, self->daddr, NULL,
+ self->max_sdu_size_rx, NULL);
+ if (err) {
+ pr_debug("%s(), connect failed!\n", __func__);
+ goto out;
+ }
+
+ /* Now the loop */
+ err = -EINPROGRESS;
+ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
+ goto out;
+
+ err = -ERESTARTSYS;
+ if (wait_event_interruptible(*(sk_sleep(sk)),
+ (sk->sk_state != TCP_SYN_SENT)))
+ goto out;
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk);
+ if (!err)
+ err = -ECONNRESET;
+ goto out;
+ }
+
+ sock->state = SS_CONNECTED;
+
+ /* At this point, IrLMP has assigned our source address */
+ self->saddr = irttp_get_saddr(self->tsap);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+static struct proto irda_proto = {
+ .name = "IRDA",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct irda_sock),
+};
+
+/*
+ * Function irda_create (sock, protocol)
+ *
+ * Create IrDA socket
+ *
+ */
+static int irda_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct irda_sock *self;
+
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
+ /* Check for valid socket type */
+ switch (sock->type) {
+ case SOCK_STREAM: /* For TTP connections with SAR disabled */
+ case SOCK_SEQPACKET: /* For TTP connections with SAR enabled */
+ case SOCK_DGRAM: /* For TTP Unitdata or LMP Ultra transfers */
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ /* Allocate networking socket */
+ sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ self = irda_sk(sk);
+ pr_debug("%s() : self is %p\n", __func__, self);
+
+ init_waitqueue_head(&self->query_wait);
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ sock->ops = &irda_stream_ops;
+ self->max_sdu_size_rx = TTP_SAR_DISABLE;
+ break;
+ case SOCK_SEQPACKET:
+ sock->ops = &irda_seqpacket_ops;
+ self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+ break;
+ case SOCK_DGRAM:
+ switch (protocol) {
+#ifdef CONFIG_IRDA_ULTRA
+ case IRDAPROTO_ULTRA:
+ sock->ops = &irda_ultra_ops;
+ /* Initialise now, because we may send on unbound
+ * sockets. Jean II */
+ self->max_data_size = ULTRA_MAX_DATA - LMP_PID_HEADER;
+ self->max_header_size = IRDA_MAX_HEADER + LMP_PID_HEADER;
+ break;
+#endif /* CONFIG_IRDA_ULTRA */
+ case IRDAPROTO_UNITDATA:
+ sock->ops = &irda_dgram_ops;
+ /* We let Unitdata conn. be like seqpack conn. */
+ self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+ break;
+ default:
+ sk_free(sk);
+ return -ESOCKTNOSUPPORT;
+ }
+ break;
+ default:
+ sk_free(sk);
+ return -ESOCKTNOSUPPORT;
+ }
+
+ /* Initialise networking socket struct */
+ sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */
+ sk->sk_family = PF_IRDA;
+ sk->sk_protocol = protocol;
+
+ /* Register as a client with IrLMP */
+ self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
+ self->mask.word = 0xffff;
+ self->rx_flow = self->tx_flow = FLOW_START;
+ self->nslots = DISCOVERY_DEFAULT_SLOTS;
+ self->daddr = DEV_ADDR_ANY; /* Until we get connected */
+ self->saddr = 0x0; /* so IrLMP assign us any link */
+ return 0;
+}
+
+/*
+ * Function irda_destroy_socket (self)
+ *
+ * Destroy socket
+ *
+ */
+static void irda_destroy_socket(struct irda_sock *self)
+{
+ pr_debug("%s(%p)\n", __func__, self);
+
+ /* Unregister with IrLMP */
+ irlmp_unregister_client(self->ckey);
+ irlmp_unregister_service(self->skey);
+
+ /* Unregister with LM-IAS */
+ if (self->ias_obj) {
+ irias_delete_object(self->ias_obj);
+ self->ias_obj = NULL;
+ }
+
+ if (self->iriap) {
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+ }
+
+ if (self->tsap) {
+ irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+#ifdef CONFIG_IRDA_ULTRA
+ if (self->lsap) {
+ irlmp_close_lsap(self->lsap);
+ self->lsap = NULL;
+ }
+#endif /* CONFIG_IRDA_ULTRA */
+}
+
+/*
+ * Function irda_release (sock)
+ */
+static int irda_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk == NULL)
+ return 0;
+
+ lock_sock(sk);
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+
+ /* Destroy IrDA socket */
+ irda_destroy_socket(irda_sk(sk));
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ release_sock(sk);
+
+ /* Purge queues (see sock_init_data()) */
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Destroy networking socket if we are the last reference on it,
+ * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */
+ sock_put(sk);
+
+ /* Notes on socket locking and deallocation... - Jean II
+ * In theory we should put pairs of sock_hold() / sock_put() to
+ * prevent the socket to be destroyed whenever there is an
+ * outstanding request or outstanding incoming packet or event.
+ *
+ * 1) This may include IAS request, both in connect and getsockopt.
+ * Unfortunately, the situation is a bit more messy than it looks,
+ * because we close iriap and kfree(self) above.
+ *
+ * 2) This may include selective discovery in getsockopt.
+ * Same stuff as above, irlmp registration and self are gone.
+ *
+ * Probably 1 and 2 may not matter, because it's all triggered
+ * by a process and the socket layer already prevent the
+ * socket to go away while a process is holding it, through
+ * sockfd_put() and fput()...
+ *
+ * 3) This may include deferred TSAP closure. In particular,
+ * we may receive a late irda_disconnect_indication()
+ * Fortunately, (tsap_cb *)->close_pend should protect us
+ * from that.
+ *
+ * I did some testing on SMP, and it looks solid. And the socket
+ * memory leak is now gone... - Jean II
+ */
+
+ return 0;
+}
+
+/*
+ * Function irda_sendmsg (sock, msg, len)
+ *
+ * Send message down to TinyTP. This function is used for both STREAM and
+ * SEQPACK services. This is possible since it forces the client to
+ * fragment the message if necessary
+ */
+static int irda_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self;
+ struct sk_buff *skb;
+ int err = -EPIPE;
+
+ pr_debug("%s(), len=%zd\n", __func__, len);
+
+ /* Note : socket.c set MSG_EOR on SEQPACKET sockets */
+ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT |
+ MSG_NOSIGNAL)) {
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto out_err;
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ self = irda_sk(sk);
+
+ /* Check if IrTTP is wants us to slow down */
+
+ if (wait_event_interruptible(*(sk_sleep(sk)),
+ (self->tx_flow != FLOW_STOP || sk->sk_state != TCP_ESTABLISHED))) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+ /* Check if we are still connected */
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ /* Check that we don't send out too big frames */
+ if (len > self->max_data_size) {
+ pr_debug("%s(), Chopping frame from %zd to %d bytes!\n",
+ __func__, len, self->max_data_size);
+ len = self->max_data_size;
+ }
+
+ skb = sock_alloc_send_skb(sk, len + self->max_header_size + 16,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto out_err;
+
+ skb_reserve(skb, self->max_header_size + 16);
+ skb_reset_transport_header(skb);
+ skb_put(skb, len);
+ err = memcpy_from_msg(skb_transport_header(skb), msg, len);
+ if (err) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+
+ /*
+ * Just send the message to TinyTP, and let it deal with possible
+ * errors. No need to duplicate all that here
+ */
+ err = irttp_data_request(self->tsap, skb);
+ if (err) {
+ pr_debug("%s(), err=%d\n", __func__, err);
+ goto out_err;
+ }
+
+ release_sock(sk);
+ /* Tell client how much data we actually sent */
+ return len;
+
+out_err:
+ err = sk_stream_error(sk, msg->msg_flags, err);
+out:
+ release_sock(sk);
+ return err;
+
+}
+
+/*
+ * Function irda_recvmsg_dgram (sock, msg, size, flags)
+ *
+ * Try to receive message and copy it to user. The frame is discarded
+ * after being read, regardless of how much the user actually read
+ */
+static int irda_recvmsg_dgram(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+ struct sk_buff *skb;
+ size_t copied;
+ int err;
+
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ skb_reset_transport_header(skb);
+ copied = skb->len;
+
+ if (copied > size) {
+ pr_debug("%s(), Received truncated frame (%zd < %zd)!\n",
+ __func__, copied, size);
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+ skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ skb_free_datagram(sk, skb);
+
+ /*
+ * Check if we have previously stopped IrTTP and we know
+ * have more free space in our rx_queue. If so tell IrTTP
+ * to start delivering frames again before our rx_queue gets
+ * empty
+ */
+ if (self->rx_flow == FLOW_STOP) {
+ if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) {
+ pr_debug("%s(), Starting IrTTP\n", __func__);
+ self->rx_flow = FLOW_START;
+ irttp_flow_request(self->tsap, FLOW_START);
+ }
+ }
+
+ return copied;
+}
+
+/*
+ * Function irda_recvmsg_stream (sock, msg, size, flags)
+ */
+static int irda_recvmsg_stream(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+ int noblock = flags & MSG_DONTWAIT;
+ size_t copied = 0;
+ int target, err;
+ long timeo;
+
+ if ((err = sock_error(sk)) < 0)
+ return err;
+
+ if (sock->flags & __SO_ACCEPTCON)
+ return -EINVAL;
+
+ err =-EOPNOTSUPP;
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ err = 0;
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, noblock);
+
+ do {
+ int chunk;
+ struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
+
+ if (skb == NULL) {
+ DEFINE_WAIT(wait);
+ err = 0;
+
+ if (copied >= target)
+ break;
+
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+ err = sock_error(sk);
+ if (err)
+ ;
+ else if (sk->sk_shutdown & RCV_SHUTDOWN)
+ ;
+ else if (noblock)
+ err = -EAGAIN;
+ else if (signal_pending(current))
+ err = sock_intr_errno(timeo);
+ else if (sk->sk_state != TCP_ESTABLISHED)
+ err = -ENOTCONN;
+ else if (skb_peek(&sk->sk_receive_queue) == NULL)
+ /* Wait process until data arrives */
+ schedule();
+
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ return err;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ continue;
+ }
+
+ chunk = min_t(unsigned int, skb->len, size);
+ if (memcpy_to_msg(msg, skb->data, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (copied == 0)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+ skb_pull(skb, chunk);
+
+ /* put the skb back if we didn't use it up.. */
+ if (skb->len) {
+ pr_debug("%s(), back on q!\n",
+ __func__);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+
+ kfree_skb(skb);
+ } else {
+ pr_debug("%s() questionable!?\n", __func__);
+
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+
+ /*
+ * Check if we have previously stopped IrTTP and we know
+ * have more free space in our rx_queue. If so tell IrTTP
+ * to start delivering frames again before our rx_queue gets
+ * empty
+ */
+ if (self->rx_flow == FLOW_STOP) {
+ if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) {
+ pr_debug("%s(), Starting IrTTP\n", __func__);
+ self->rx_flow = FLOW_START;
+ irttp_flow_request(self->tsap, FLOW_START);
+ }
+ }
+
+ return copied;
+}
+
+/*
+ * Function irda_sendmsg_dgram (sock, msg, len)
+ *
+ * Send message down to TinyTP for the unreliable sequenced
+ * packet service...
+ *
+ */
+static int irda_sendmsg_dgram(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self;
+ struct sk_buff *skb;
+ int err;
+
+ pr_debug("%s(), len=%zd\n", __func__, len);
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+ goto out;
+ }
+
+ err = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ self = irda_sk(sk);
+
+ /*
+ * Check that we don't send out too big frames. This is an unreliable
+ * service, so we have no fragmentation and no coalescence
+ */
+ if (len > self->max_data_size) {
+ pr_debug("%s(), Warning too much data! Chopping frame from %zd to %d bytes!\n",
+ __func__, len, self->max_data_size);
+ len = self->max_data_size;
+ }
+
+ skb = sock_alloc_send_skb(sk, len + self->max_header_size,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ err = -ENOBUFS;
+ if (!skb)
+ goto out;
+
+ skb_reserve(skb, self->max_header_size);
+ skb_reset_transport_header(skb);
+
+ pr_debug("%s(), appending user data\n", __func__);
+ skb_put(skb, len);
+ err = memcpy_from_msg(skb_transport_header(skb), msg, len);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ /*
+ * Just send the message to TinyTP, and let it deal with possible
+ * errors. No need to duplicate all that here
+ */
+ err = irttp_udata_request(self->tsap, skb);
+ if (err) {
+ pr_debug("%s(), err=%d\n", __func__, err);
+ goto out;
+ }
+
+ release_sock(sk);
+ return len;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * Function irda_sendmsg_ultra (sock, msg, len)
+ *
+ * Send message down to IrLMP for the unreliable Ultra
+ * packet service...
+ */
+#ifdef CONFIG_IRDA_ULTRA
+static int irda_sendmsg_ultra(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self;
+ __u8 pid = 0;
+ int bound = 0;
+ struct sk_buff *skb;
+ int err;
+
+ pr_debug("%s(), len=%zd\n", __func__, len);
+
+ err = -EINVAL;
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ err = -EPIPE;
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ goto out;
+ }
+
+ self = irda_sk(sk);
+
+ /* Check if an address was specified with sendto. Jean II */
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_irda *, addr, msg->msg_name);
+ err = -EINVAL;
+ /* Check address, extract pid. Jean II */
+ if (msg->msg_namelen < sizeof(*addr))
+ goto out;
+ if (addr->sir_family != AF_IRDA)
+ goto out;
+
+ pid = addr->sir_lsap_sel;
+ if (pid & 0x80) {
+ pr_debug("%s(), extension in PID not supp!\n",
+ __func__);
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ } else {
+ /* Check that the socket is properly bound to an Ultra
+ * port. Jean II */
+ if ((self->lsap == NULL) ||
+ (sk->sk_state != TCP_ESTABLISHED)) {
+ pr_debug("%s(), socket not bound to Ultra PID.\n",
+ __func__);
+ err = -ENOTCONN;
+ goto out;
+ }
+ /* Use PID from socket */
+ bound = 1;
+ }
+
+ /*
+ * Check that we don't send out too big frames. This is an unreliable
+ * service, so we have no fragmentation and no coalescence
+ */
+ if (len > self->max_data_size) {
+ pr_debug("%s(), Warning too much data! Chopping frame from %zd to %d bytes!\n",
+ __func__, len, self->max_data_size);
+ len = self->max_data_size;
+ }
+
+ skb = sock_alloc_send_skb(sk, len + self->max_header_size,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ err = -ENOBUFS;
+ if (!skb)
+ goto out;
+
+ skb_reserve(skb, self->max_header_size);
+ skb_reset_transport_header(skb);
+
+ pr_debug("%s(), appending user data\n", __func__);
+ skb_put(skb, len);
+ err = memcpy_from_msg(skb_transport_header(skb), msg, len);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ err = irlmp_connless_data_request((bound ? self->lsap : NULL),
+ skb, pid);
+ if (err)
+ pr_debug("%s(), err=%d\n", __func__, err);
+out:
+ release_sock(sk);
+ return err ? : len;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irda_shutdown (sk, how)
+ */
+static int irda_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ lock_sock(sk);
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+
+ if (self->iriap) {
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+ }
+
+ if (self->tsap) {
+ irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+
+ /* A few cleanup so the socket look as good as new... */
+ self->rx_flow = self->tx_flow = FLOW_START; /* needed ??? */
+ self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */
+ self->saddr = 0x0; /* so IrLMP assign us any link */
+
+ release_sock(sk);
+
+ return 0;
+}
+
+/*
+ * Function irda_poll (file, sock, wait)
+ */
+static unsigned int irda_poll(struct file * file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+ unsigned int mask;
+
+ poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* Exceptional events? */
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ pr_debug("%s(), POLLHUP\n", __func__);
+ mask |= POLLHUP;
+ }
+
+ /* Readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ pr_debug("Socket is readable\n");
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ /* Connection-based need to check for termination and startup */
+ switch (sk->sk_type) {
+ case SOCK_STREAM:
+ if (sk->sk_state == TCP_CLOSE) {
+ pr_debug("%s(), POLLHUP\n", __func__);
+ mask |= POLLHUP;
+ }
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ if ((self->tx_flow == FLOW_START) &&
+ sock_writeable(sk))
+ {
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ }
+ }
+ break;
+ case SOCK_SEQPACKET:
+ if ((self->tx_flow == FLOW_START) &&
+ sock_writeable(sk))
+ {
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ }
+ break;
+ case SOCK_DGRAM:
+ if (sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ break;
+ default:
+ break;
+ }
+
+ return mask;
+}
+
+/*
+ * Function irda_ioctl (sock, cmd, arg)
+ */
+static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ pr_debug("%s(), cmd=%#x\n", __func__, cmd);
+
+ err = -EINVAL;
+ switch (cmd) {
+ case TIOCOUTQ: {
+ long amount;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ err = put_user(amount, (unsigned int __user *)arg);
+ break;
+ }
+
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ long amount = 0L;
+ /* These two are safe on a single CPU system as only user tasks fiddle here */
+ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+ amount = skb->len;
+ err = put_user(amount, (unsigned int __user *)arg);
+ break;
+ }
+
+ case SIOCGSTAMP:
+ if (sk != NULL)
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ break;
+ default:
+ pr_debug("%s(), doing device ioctl!\n", __func__);
+ err = -ENOIOCTLCMD;
+ }
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Function irda_ioctl (sock, cmd, arg)
+ */
+static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ /*
+ * All IRDA's ioctl are standard ones.
+ */
+ return -ENOIOCTLCMD;
+}
+#endif
+
+/*
+ * Function irda_setsockopt (sock, level, optname, optval, optlen)
+ *
+ * Set some options for the socket
+ *
+ */
+static int irda_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+ struct irda_ias_set *ias_opt;
+ struct ias_object *ias_obj;
+ struct ias_attrib * ias_attr; /* Attribute in IAS object */
+ int opt, free_ias = 0, err = 0;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ if (level != SOL_IRLMP)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case IRLMP_IAS_SET:
+ /* The user want to add an attribute to an existing IAS object
+ * (in the IAS database) or to create a new object with this
+ * attribute.
+ * We first query IAS to know if the object exist, and then
+ * create the right attribute...
+ */
+
+ if (optlen != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy query to the driver. */
+ if (copy_from_user(ias_opt, optval, optlen)) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Find the object we target.
+ * If the user gives us an empty string, we use the object
+ * associated with this socket. This will workaround
+ * duplicated class name - Jean II */
+ if(ias_opt->irda_class_name[0] == '\0') {
+ if(self->ias_obj == NULL) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+ ias_obj = self->ias_obj;
+ } else
+ ias_obj = irias_find_object(ias_opt->irda_class_name);
+
+ /* Only ROOT can mess with the global IAS database.
+ * Users can only add attributes to the object associated
+ * with the socket they own - Jean II */
+ if((!capable(CAP_NET_ADMIN)) &&
+ ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
+ kfree(ias_opt);
+ err = -EPERM;
+ goto out;
+ }
+
+ /* If the object doesn't exist, create it */
+ if(ias_obj == (struct ias_object *) NULL) {
+ /* Create a new object */
+ ias_obj = irias_new_object(ias_opt->irda_class_name,
+ jiffies);
+ if (ias_obj == NULL) {
+ kfree(ias_opt);
+ err = -ENOMEM;
+ goto out;
+ }
+ free_ias = 1;
+ }
+
+ /* Do we have the attribute already ? */
+ if(irias_find_attrib(ias_obj, ias_opt->irda_attrib_name)) {
+ kfree(ias_opt);
+ if (free_ias) {
+ kfree(ias_obj->name);
+ kfree(ias_obj);
+ }
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Look at the type */
+ switch(ias_opt->irda_attrib_type) {
+ case IAS_INTEGER:
+ /* Add an integer attribute */
+ irias_add_integer_attrib(
+ ias_obj,
+ ias_opt->irda_attrib_name,
+ ias_opt->attribute.irda_attrib_int,
+ IAS_USER_ATTR);
+ break;
+ case IAS_OCT_SEQ:
+ /* Check length */
+ if(ias_opt->attribute.irda_attrib_octet_seq.len >
+ IAS_MAX_OCTET_STRING) {
+ kfree(ias_opt);
+ if (free_ias) {
+ kfree(ias_obj->name);
+ kfree(ias_obj);
+ }
+
+ err = -EINVAL;
+ goto out;
+ }
+ /* Add an octet sequence attribute */
+ irias_add_octseq_attrib(
+ ias_obj,
+ ias_opt->irda_attrib_name,
+ ias_opt->attribute.irda_attrib_octet_seq.octet_seq,
+ ias_opt->attribute.irda_attrib_octet_seq.len,
+ IAS_USER_ATTR);
+ break;
+ case IAS_STRING:
+ /* Should check charset & co */
+ /* Check length */
+ /* The length is encoded in a __u8, and
+ * IAS_MAX_STRING == 256, so there is no way
+ * userspace can pass us a string too large.
+ * Jean II */
+ /* NULL terminate the string (avoid troubles) */
+ ias_opt->attribute.irda_attrib_string.string[ias_opt->attribute.irda_attrib_string.len] = '\0';
+ /* Add a string attribute */
+ irias_add_string_attrib(
+ ias_obj,
+ ias_opt->irda_attrib_name,
+ ias_opt->attribute.irda_attrib_string.string,
+ IAS_USER_ATTR);
+ break;
+ default :
+ kfree(ias_opt);
+ if (free_ias) {
+ kfree(ias_obj->name);
+ kfree(ias_obj);
+ }
+ err = -EINVAL;
+ goto out;
+ }
+ irias_insert_object(ias_obj);
+ kfree(ias_opt);
+ break;
+ case IRLMP_IAS_DEL:
+ /* The user want to delete an object from our local IAS
+ * database. We just need to query the IAS, check is the
+ * object is not owned by the kernel and delete it.
+ */
+
+ if (optlen != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy query to the driver. */
+ if (copy_from_user(ias_opt, optval, optlen)) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Find the object we target.
+ * If the user gives us an empty string, we use the object
+ * associated with this socket. This will workaround
+ * duplicated class name - Jean II */
+ if(ias_opt->irda_class_name[0] == '\0')
+ ias_obj = self->ias_obj;
+ else
+ ias_obj = irias_find_object(ias_opt->irda_class_name);
+ if(ias_obj == (struct ias_object *) NULL) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Only ROOT can mess with the global IAS database.
+ * Users can only del attributes from the object associated
+ * with the socket they own - Jean II */
+ if((!capable(CAP_NET_ADMIN)) &&
+ ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
+ kfree(ias_opt);
+ err = -EPERM;
+ goto out;
+ }
+
+ /* Find the attribute (in the object) we target */
+ ias_attr = irias_find_attrib(ias_obj,
+ ias_opt->irda_attrib_name);
+ if(ias_attr == (struct ias_attrib *) NULL) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Check is the user space own the object */
+ if(ias_attr->value->owner != IAS_USER_ATTR) {
+ pr_debug("%s(), attempting to delete a kernel attribute\n",
+ __func__);
+ kfree(ias_opt);
+ err = -EPERM;
+ goto out;
+ }
+
+ /* Remove the attribute (and maybe the object) */
+ irias_delete_attrib(ias_obj, ias_attr, 1);
+ kfree(ias_opt);
+ break;
+ case IRLMP_MAX_SDU_SIZE:
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Only possible for a seqpacket service (TTP with SAR) */
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ pr_debug("%s(), setting max_sdu_size = %d\n",
+ __func__, opt);
+ self->max_sdu_size_rx = opt;
+ } else {
+ net_warn_ratelimited("%s: not allowed to set MAXSDUSIZE for this socket type!\n",
+ __func__);
+ err = -ENOPROTOOPT;
+ goto out;
+ }
+ break;
+ case IRLMP_HINTS_SET:
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The input is really a (__u8 hints[2]), easier as an int */
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Unregister any old registration */
+ if (self->skey)
+ irlmp_unregister_service(self->skey);
+
+ self->skey = irlmp_register_service((__u16) opt);
+ break;
+ case IRLMP_HINT_MASK_SET:
+ /* As opposed to the previous case which set the hint bits
+ * that we advertise, this one set the filter we use when
+ * making a discovery (nodes which don't match any hint
+ * bit in the mask are not reported).
+ */
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The input is really a (__u8 hints[2]), easier as an int */
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Set the new hint mask */
+ self->mask.word = (__u16) opt;
+ /* Mask out extension bits */
+ self->mask.word &= 0x7f7f;
+ /* Check if no bits */
+ if(!self->mask.word)
+ self->mask.word = 0xFFFF;
+
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+out:
+ release_sock(sk);
+
+ return err;
+}
+
+/*
+ * Function irda_extract_ias_value(ias_opt, ias_value)
+ *
+ * Translate internal IAS value structure to the user space representation
+ *
+ * The external representation of IAS values, as we exchange them with
+ * user space program is quite different from the internal representation,
+ * as stored in the IAS database (because we need a flat structure for
+ * crossing kernel boundary).
+ * This function transform the former in the latter. We also check
+ * that the value type is valid.
+ */
+static int irda_extract_ias_value(struct irda_ias_set *ias_opt,
+ struct ias_value *ias_value)
+{
+ /* Look at the type */
+ switch (ias_value->type) {
+ case IAS_INTEGER:
+ /* Copy the integer */
+ ias_opt->attribute.irda_attrib_int = ias_value->t.integer;
+ break;
+ case IAS_OCT_SEQ:
+ /* Set length */
+ ias_opt->attribute.irda_attrib_octet_seq.len = ias_value->len;
+ /* Copy over */
+ memcpy(ias_opt->attribute.irda_attrib_octet_seq.octet_seq,
+ ias_value->t.oct_seq, ias_value->len);
+ break;
+ case IAS_STRING:
+ /* Set length */
+ ias_opt->attribute.irda_attrib_string.len = ias_value->len;
+ ias_opt->attribute.irda_attrib_string.charset = ias_value->charset;
+ /* Copy over */
+ memcpy(ias_opt->attribute.irda_attrib_string.string,
+ ias_value->t.string, ias_value->len);
+ /* NULL terminate the string (avoid troubles) */
+ ias_opt->attribute.irda_attrib_string.string[ias_value->len] = '\0';
+ break;
+ case IAS_MISSING:
+ default :
+ return -EINVAL;
+ }
+
+ /* Copy type over */
+ ias_opt->irda_attrib_type = ias_value->type;
+
+ return 0;
+}
+
+/*
+ * Function irda_getsockopt (sock, level, optname, optval, optlen)
+ */
+static int irda_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct irda_sock *self = irda_sk(sk);
+ struct irda_device_list list;
+ struct irda_device_info *discoveries;
+ struct irda_ias_set * ias_opt; /* IAS get/query params */
+ struct ias_object * ias_obj; /* Object in IAS */
+ struct ias_attrib * ias_attr; /* Attribute in IAS object */
+ int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */
+ int val = 0;
+ int len = 0;
+ int err = 0;
+ int offset, total;
+
+ pr_debug("%s(%p)\n", __func__, self);
+
+ if (level != SOL_IRLMP)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if(len < 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case IRLMP_ENUMDEVICES:
+
+ /* Offset to first device entry */
+ offset = sizeof(struct irda_device_list) -
+ sizeof(struct irda_device_info);
+
+ if (len < offset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Ask lmp for the current discovery log */
+ discoveries = irlmp_get_discoveries(&list.len, self->mask.word,
+ self->nslots);
+ /* Check if the we got some results */
+ if (discoveries == NULL) {
+ err = -EAGAIN;
+ goto out; /* Didn't find any devices */
+ }
+
+ /* Write total list length back to client */
+ if (copy_to_user(optval, &list, offset))
+ err = -EFAULT;
+
+ /* Copy the list itself - watch for overflow */
+ if (list.len > 2048) {
+ err = -EINVAL;
+ goto bed;
+ }
+ total = offset + (list.len * sizeof(struct irda_device_info));
+ if (total > len)
+ total = len;
+ if (copy_to_user(optval+offset, discoveries, total - offset))
+ err = -EFAULT;
+
+ /* Write total number of bytes used back to client */
+ if (put_user(total, optlen))
+ err = -EFAULT;
+bed:
+ /* Free up our buffer */
+ kfree(discoveries);
+ break;
+ case IRLMP_MAX_SDU_SIZE:
+ val = self->max_data_size;
+ len = sizeof(int);
+ if (put_user(len, optlen)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, &val, len)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ break;
+ case IRLMP_IAS_GET:
+ /* The user want an object from our local IAS database.
+ * We just need to query the IAS and return the value
+ * that we found */
+
+ /* Check that the user has allocated the right space for us */
+ if (len != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy query to the driver. */
+ if (copy_from_user(ias_opt, optval, len)) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Find the object we target.
+ * If the user gives us an empty string, we use the object
+ * associated with this socket. This will workaround
+ * duplicated class name - Jean II */
+ if(ias_opt->irda_class_name[0] == '\0')
+ ias_obj = self->ias_obj;
+ else
+ ias_obj = irias_find_object(ias_opt->irda_class_name);
+ if(ias_obj == (struct ias_object *) NULL) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Find the attribute (in the object) we target */
+ ias_attr = irias_find_attrib(ias_obj,
+ ias_opt->irda_attrib_name);
+ if(ias_attr == (struct ias_attrib *) NULL) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Translate from internal to user structure */
+ err = irda_extract_ias_value(ias_opt, ias_attr->value);
+ if(err) {
+ kfree(ias_opt);
+ goto out;
+ }
+
+ /* Copy reply to the user */
+ if (copy_to_user(optval, ias_opt,
+ sizeof(struct irda_ias_set))) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+ /* Note : don't need to put optlen, we checked it */
+ kfree(ias_opt);
+ break;
+ case IRLMP_IAS_QUERY:
+ /* The user want an object from a remote IAS database.
+ * We need to use IAP to query the remote database and
+ * then wait for the answer to come back. */
+
+ /* Check that the user has allocated the right space for us */
+ if (len != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy query to the driver. */
+ if (copy_from_user(ias_opt, optval, len)) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* At this point, there are two cases...
+ * 1) the socket is connected - that's the easy case, we
+ * just query the device we are connected to...
+ * 2) the socket is not connected - the user doesn't want
+ * to connect and/or may not have a valid service name
+ * (so can't create a fake connection). In this case,
+ * we assume that the user pass us a valid destination
+ * address in the requesting structure...
+ */
+ if(self->daddr != DEV_ADDR_ANY) {
+ /* We are connected - reuse known daddr */
+ daddr = self->daddr;
+ } else {
+ /* We are not connected, we must specify a valid
+ * destination address */
+ daddr = ias_opt->daddr;
+ if((!daddr) || (daddr == DEV_ADDR_ANY)) {
+ kfree(ias_opt);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Check that we can proceed with IAP */
+ if (self->iriap) {
+ net_warn_ratelimited("%s: busy with a previous query\n",
+ __func__);
+ kfree(ias_opt);
+ err = -EBUSY;
+ goto out;
+ }
+
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ irda_getvalue_confirm);
+
+ if (self->iriap == NULL) {
+ kfree(ias_opt);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Treat unexpected wakeup as disconnect */
+ self->errno = -EHOSTUNREACH;
+
+ /* Query remote LM-IAS */
+ iriap_getvaluebyclass_request(self->iriap,
+ self->saddr, daddr,
+ ias_opt->irda_class_name,
+ ias_opt->irda_attrib_name);
+
+ /* Wait for answer, if not yet finished (or failed) */
+ if (wait_event_interruptible(self->query_wait,
+ (self->iriap == NULL))) {
+ /* pending request uses copy of ias_opt-content
+ * we can free it regardless! */
+ kfree(ias_opt);
+ /* Treat signals as disconnect */
+ err = -EHOSTUNREACH;
+ goto out;
+ }
+
+ /* Check what happened */
+ if (self->errno)
+ {
+ kfree(ias_opt);
+ /* Requested object/attribute doesn't exist */
+ if((self->errno == IAS_CLASS_UNKNOWN) ||
+ (self->errno == IAS_ATTRIB_UNKNOWN))
+ err = -EADDRNOTAVAIL;
+ else
+ err = -EHOSTUNREACH;
+
+ goto out;
+ }
+
+ /* Translate from internal to user structure */
+ err = irda_extract_ias_value(ias_opt, self->ias_result);
+ if (self->ias_result)
+ irias_delete_value(self->ias_result);
+ if (err) {
+ kfree(ias_opt);
+ goto out;
+ }
+
+ /* Copy reply to the user */
+ if (copy_to_user(optval, ias_opt,
+ sizeof(struct irda_ias_set))) {
+ kfree(ias_opt);
+ err = -EFAULT;
+ goto out;
+ }
+ /* Note : don't need to put optlen, we checked it */
+ kfree(ias_opt);
+ break;
+ case IRLMP_WAITDEVICE:
+ /* This function is just another way of seeing life ;-)
+ * IRLMP_ENUMDEVICES assumes that you have a static network,
+ * and that you just want to pick one of the devices present.
+ * On the other hand, in here we assume that no device is
+ * present and that at some point in the future a device will
+ * come into range. When this device arrive, we just wake
+ * up the caller, so that he has time to connect to it before
+ * the device goes away...
+ * Note : once the node has been discovered for more than a
+ * few second, it won't trigger this function, unless it
+ * goes away and come back changes its hint bits (so we
+ * might call it IRLMP_WAITNEWDEVICE).
+ */
+
+ /* Check that the user is passing us an int */
+ if (len != sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
+ /* Get timeout in ms (max time we block the caller) */
+ if (get_user(val, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Tell IrLMP we want to be notified */
+ irlmp_update_client(self->ckey, self->mask.word,
+ irda_selective_discovery_indication,
+ NULL, (void *) self);
+
+ /* Do some discovery (and also return cached results) */
+ irlmp_discovery_request(self->nslots);
+
+ /* Wait until a node is discovered */
+ if (!self->cachedaddr) {
+ pr_debug("%s(), nothing discovered yet, going to sleep...\n",
+ __func__);
+
+ /* Set watchdog timer to expire in <val> ms. */
+ self->errno = 0;
+ setup_timer(&self->watchdog, irda_discovery_timeout,
+ (unsigned long)self);
+ mod_timer(&self->watchdog,
+ jiffies + msecs_to_jiffies(val));
+
+ /* Wait for IR-LMP to call us back */
+ err = __wait_event_interruptible(self->query_wait,
+ (self->cachedaddr != 0 || self->errno == -ETIME));
+
+ /* If watchdog is still activated, kill it! */
+ del_timer(&(self->watchdog));
+
+ pr_debug("%s(), ...waking up !\n", __func__);
+
+ if (err != 0)
+ goto out;
+ }
+ else
+ pr_debug("%s(), found immediately !\n",
+ __func__);
+
+ /* Tell IrLMP that we have been notified */
+ irlmp_update_client(self->ckey, self->mask.word,
+ NULL, NULL, NULL);
+
+ /* Check if the we got some results */
+ if (!self->cachedaddr) {
+ err = -EAGAIN; /* Didn't find any devices */
+ goto out;
+ }
+ daddr = self->cachedaddr;
+ /* Cleanup */
+ self->cachedaddr = 0;
+
+ /* We return the daddr of the device that trigger the
+ * wakeup. As irlmp pass us only the new devices, we
+ * are sure that it's not an old device.
+ * If the user want more details, he should query
+ * the whole discovery log and pick one device...
+ */
+ if (put_user(daddr, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ }
+
+out:
+
+ release_sock(sk);
+
+ return err;
+}
+
+static const struct net_proto_family irda_family_ops = {
+ .family = PF_IRDA,
+ .create = irda_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops irda_stream_ops = {
+ .family = PF_IRDA,
+ .owner = THIS_MODULE,
+ .release = irda_release,
+ .bind = irda_bind,
+ .connect = irda_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = irda_accept,
+ .getname = irda_getname,
+ .poll = irda_poll,
+ .ioctl = irda_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = irda_compat_ioctl,
+#endif
+ .listen = irda_listen,
+ .shutdown = irda_shutdown,
+ .setsockopt = irda_setsockopt,
+ .getsockopt = irda_getsockopt,
+ .sendmsg = irda_sendmsg,
+ .recvmsg = irda_recvmsg_stream,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops irda_seqpacket_ops = {
+ .family = PF_IRDA,
+ .owner = THIS_MODULE,
+ .release = irda_release,
+ .bind = irda_bind,
+ .connect = irda_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = irda_accept,
+ .getname = irda_getname,
+ .poll = datagram_poll,
+ .ioctl = irda_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = irda_compat_ioctl,
+#endif
+ .listen = irda_listen,
+ .shutdown = irda_shutdown,
+ .setsockopt = irda_setsockopt,
+ .getsockopt = irda_getsockopt,
+ .sendmsg = irda_sendmsg,
+ .recvmsg = irda_recvmsg_dgram,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops irda_dgram_ops = {
+ .family = PF_IRDA,
+ .owner = THIS_MODULE,
+ .release = irda_release,
+ .bind = irda_bind,
+ .connect = irda_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = irda_accept,
+ .getname = irda_getname,
+ .poll = datagram_poll,
+ .ioctl = irda_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = irda_compat_ioctl,
+#endif
+ .listen = irda_listen,
+ .shutdown = irda_shutdown,
+ .setsockopt = irda_setsockopt,
+ .getsockopt = irda_getsockopt,
+ .sendmsg = irda_sendmsg_dgram,
+ .recvmsg = irda_recvmsg_dgram,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+#ifdef CONFIG_IRDA_ULTRA
+static const struct proto_ops irda_ultra_ops = {
+ .family = PF_IRDA,
+ .owner = THIS_MODULE,
+ .release = irda_release,
+ .bind = irda_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = irda_getname,
+ .poll = datagram_poll,
+ .ioctl = irda_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = irda_compat_ioctl,
+#endif
+ .listen = sock_no_listen,
+ .shutdown = irda_shutdown,
+ .setsockopt = irda_setsockopt,
+ .getsockopt = irda_getsockopt,
+ .sendmsg = irda_sendmsg_ultra,
+ .recvmsg = irda_recvmsg_dgram,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irsock_init (pro)
+ *
+ * Initialize IrDA protocol
+ *
+ */
+int __init irsock_init(void)
+{
+ int rc = proto_register(&irda_proto, 0);
+
+ if (rc == 0)
+ rc = sock_register(&irda_family_ops);
+
+ return rc;
+}
+
+/*
+ * Function irsock_cleanup (void)
+ *
+ * Remove IrDA protocol
+ *
+ */
+void irsock_cleanup(void)
+{
+ sock_unregister(PF_IRDA);
+ proto_unregister(&irda_proto);
+}
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
new file mode 100644
index 000000000..364d70aed
--- /dev/null
+++ b/net/irda/discovery.c
@@ -0,0 +1,417 @@
+/*********************************************************************
+ *
+ * Filename: discovery.c
+ * Version: 0.1
+ * Description: Routines for handling discoveries at the IrLMP layer
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Apr 6 15:33:50 1999
+ * Modified at: Sat Oct 9 17:11:31 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Modified at: Fri May 28 3:11 CST 1999
+ * Modified by: Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+
+#include <net/irda/discovery.h>
+
+#include <asm/unaligned.h>
+
+/*
+ * Function irlmp_add_discovery (cachelog, discovery)
+ *
+ * Add a new discovery to the cachelog, and remove any old discoveries
+ * from the same device
+ *
+ * Note : we try to preserve the time this device was *first* discovered
+ * (as opposed to the time of last discovery used for cleanup). This is
+ * used by clients waiting for discovery events to tell if the device
+ * discovered is "new" or just the same old one. They can't rely there
+ * on a binary flag (new/old), because not all discovery events are
+ * propagated to them, and they might not always listen, so they would
+ * miss some new devices popping up...
+ * Jean II
+ */
+void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *new)
+{
+ discovery_t *discovery, *node;
+ unsigned long flags;
+
+ /* Set time of first discovery if node is new (see below) */
+ new->firststamp = new->timestamp;
+
+ spin_lock_irqsave(&cachelog->hb_spinlock, flags);
+
+ /*
+ * Remove all discoveries of devices that has previously been
+ * discovered on the same link with the same name (info), or the
+ * same daddr. We do this since some devices (mostly PDAs) change
+ * their device address between every discovery.
+ */
+ discovery = (discovery_t *) hashbin_get_first(cachelog);
+ while (discovery != NULL ) {
+ node = discovery;
+
+ /* Be sure to stay one item ahead */
+ discovery = (discovery_t *) hashbin_get_next(cachelog);
+
+ if ((node->data.saddr == new->data.saddr) &&
+ ((node->data.daddr == new->data.daddr) ||
+ (strcmp(node->data.info, new->data.info) == 0)))
+ {
+ /* This discovery is a previous discovery
+ * from the same device, so just remove it
+ */
+ hashbin_remove_this(cachelog, (irda_queue_t *) node);
+ /* Check if hints bits are unchanged */
+ if (get_unaligned((__u16 *)node->data.hints) == get_unaligned((__u16 *)new->data.hints))
+ /* Set time of first discovery for this node */
+ new->firststamp = node->firststamp;
+ kfree(node);
+ }
+ }
+
+ /* Insert the new and updated version */
+ hashbin_insert(cachelog, (irda_queue_t *) new, new->data.daddr, NULL);
+
+ spin_unlock_irqrestore(&cachelog->hb_spinlock, flags);
+}
+
+/*
+ * Function irlmp_add_discovery_log (cachelog, log)
+ *
+ * Merge a disovery log into the cachelog.
+ *
+ */
+void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log)
+{
+ discovery_t *discovery;
+
+ /*
+ * If log is missing this means that IrLAP was unable to perform the
+ * discovery, so restart discovery again with just the half timeout
+ * of the normal one.
+ */
+ /* Well... It means that there was nobody out there - Jean II */
+ if (log == NULL) {
+ /* irlmp_start_discovery_timer(irlmp, 150); */
+ return;
+ }
+
+ /*
+ * Locking : we are the only owner of this discovery log, so
+ * no need to lock it.
+ * We just need to lock the global log in irlmp_add_discovery().
+ */
+ discovery = (discovery_t *) hashbin_remove_first(log);
+ while (discovery != NULL) {
+ irlmp_add_discovery(cachelog, discovery);
+
+ discovery = (discovery_t *) hashbin_remove_first(log);
+ }
+
+ /* Delete the now empty log */
+ hashbin_delete(log, (FREE_FUNC) kfree);
+}
+
+/*
+ * Function irlmp_expire_discoveries (log, saddr, force)
+ *
+ * Go through all discoveries and expire all that has stayed too long
+ *
+ * Note : this assume that IrLAP won't change its saddr, which
+ * currently is a valid assumption...
+ */
+void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force)
+{
+ discovery_t * discovery;
+ discovery_t * curr;
+ unsigned long flags;
+ discinfo_t * buffer = NULL;
+ int n; /* Size of the full log */
+ int i = 0; /* How many we expired */
+
+ IRDA_ASSERT(log != NULL, return;);
+ spin_lock_irqsave(&log->hb_spinlock, flags);
+
+ discovery = (discovery_t *) hashbin_get_first(log);
+ while (discovery != NULL) {
+ /* Be sure to be one item ahead */
+ curr = discovery;
+ discovery = (discovery_t *) hashbin_get_next(log);
+
+ /* Test if it's time to expire this discovery */
+ if ((curr->data.saddr == saddr) &&
+ (force ||
+ ((jiffies - curr->timestamp) > DISCOVERY_EXPIRE_TIMEOUT)))
+ {
+ /* Create buffer as needed.
+ * As this function get called a lot and most time
+ * we don't have anything to put in the log (we are
+ * quite picky), we can save a lot of overhead
+ * by not calling kmalloc. Jean II */
+ if(buffer == NULL) {
+ /* Create the client specific buffer */
+ n = HASHBIN_GET_SIZE(log);
+ buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC);
+ if (buffer == NULL) {
+ spin_unlock_irqrestore(&log->hb_spinlock, flags);
+ return;
+ }
+
+ }
+
+ /* Copy discovery information */
+ memcpy(&(buffer[i]), &(curr->data),
+ sizeof(discinfo_t));
+ i++;
+
+ /* Remove it from the log */
+ curr = hashbin_remove_this(log, (irda_queue_t *) curr);
+ kfree(curr);
+ }
+ }
+
+ /* Drop the spinlock before calling the higher layers, as
+ * we can't guarantee they won't call us back and create a
+ * deadlock. We will work on our own private data, so we
+ * don't care to be interrupted. - Jean II */
+ spin_unlock_irqrestore(&log->hb_spinlock, flags);
+
+ if(buffer == NULL)
+ return;
+
+ /* Tell IrLMP and registered clients about it */
+ irlmp_discovery_expiry(buffer, i);
+
+ /* Free up our buffer */
+ kfree(buffer);
+}
+
+#if 0
+/*
+ * Function irlmp_dump_discoveries (log)
+ *
+ * Print out all discoveries in log
+ *
+ */
+void irlmp_dump_discoveries(hashbin_t *log)
+{
+ discovery_t *discovery;
+
+ IRDA_ASSERT(log != NULL, return;);
+
+ discovery = (discovery_t *) hashbin_get_first(log);
+ while (discovery != NULL) {
+ pr_debug("Discovery:\n");
+ pr_debug(" daddr=%08x\n", discovery->data.daddr);
+ pr_debug(" saddr=%08x\n", discovery->data.saddr);
+ pr_debug(" nickname=%s\n", discovery->data.info);
+
+ discovery = (discovery_t *) hashbin_get_next(log);
+ }
+}
+#endif
+
+/*
+ * Function irlmp_copy_discoveries (log, pn, mask)
+ *
+ * Copy all discoveries in a buffer
+ *
+ * This function implement a safe way for lmp clients to access the
+ * discovery log. The basic problem is that we don't want the log
+ * to change (add/remove) while the client is reading it. If the
+ * lmp client manipulate directly the hashbin, he is sure to get
+ * into troubles...
+ * The idea is that we copy all the current discovery log in a buffer
+ * which is specific to the client and pass this copy to him. As we
+ * do this operation with the spinlock grabbed, we are safe...
+ * Note : we don't want those clients to grab the spinlock, because
+ * we have no control on how long they will hold it...
+ * Note : we choose to copy the log in "struct irda_device_info" to
+ * save space...
+ * Note : the client must kfree himself() the log...
+ * Jean II
+ */
+struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
+ __u16 mask, int old_entries)
+{
+ discovery_t * discovery;
+ unsigned long flags;
+ discinfo_t * buffer = NULL;
+ int j_timeout = (sysctl_discovery_timeout * HZ);
+ int n; /* Size of the full log */
+ int i = 0; /* How many we picked */
+
+ IRDA_ASSERT(pn != NULL, return NULL;);
+ IRDA_ASSERT(log != NULL, return NULL;);
+
+ /* Save spin lock */
+ spin_lock_irqsave(&log->hb_spinlock, flags);
+
+ discovery = (discovery_t *) hashbin_get_first(log);
+ while (discovery != NULL) {
+ /* Mask out the ones we don't want :
+ * We want to match the discovery mask, and to get only
+ * the most recent one (unless we want old ones) */
+ if ((get_unaligned((__u16 *)discovery->data.hints) & mask) &&
+ ((old_entries) ||
+ ((jiffies - discovery->firststamp) < j_timeout))) {
+ /* Create buffer as needed.
+ * As this function get called a lot and most time
+ * we don't have anything to put in the log (we are
+ * quite picky), we can save a lot of overhead
+ * by not calling kmalloc. Jean II */
+ if(buffer == NULL) {
+ /* Create the client specific buffer */
+ n = HASHBIN_GET_SIZE(log);
+ buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC);
+ if (buffer == NULL) {
+ spin_unlock_irqrestore(&log->hb_spinlock, flags);
+ return NULL;
+ }
+
+ }
+
+ /* Copy discovery information */
+ memcpy(&(buffer[i]), &(discovery->data),
+ sizeof(discinfo_t));
+ i++;
+ }
+ discovery = (discovery_t *) hashbin_get_next(log);
+ }
+
+ spin_unlock_irqrestore(&log->hb_spinlock, flags);
+
+ /* Get the actual number of device in the buffer and return */
+ *pn = i;
+ return buffer;
+}
+
+#ifdef CONFIG_PROC_FS
+static inline discovery_t *discovery_seq_idx(loff_t pos)
+
+{
+ discovery_t *discovery;
+
+ for (discovery = (discovery_t *) hashbin_get_first(irlmp->cachelog);
+ discovery != NULL;
+ discovery = (discovery_t *) hashbin_get_next(irlmp->cachelog)) {
+ if (pos-- == 0)
+ break;
+ }
+
+ return discovery;
+}
+
+static void *discovery_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_irq(&irlmp->cachelog->hb_spinlock);
+ return *pos ? discovery_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *discovery_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return (v == SEQ_START_TOKEN)
+ ? (void *) hashbin_get_first(irlmp->cachelog)
+ : (void *) hashbin_get_next(irlmp->cachelog);
+}
+
+static void discovery_seq_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_irq(&irlmp->cachelog->hb_spinlock);
+}
+
+static int discovery_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "IrLMP: Discovery log:\n\n");
+ else {
+ const discovery_t *discovery = v;
+
+ seq_printf(seq, "nickname: %s, hint: 0x%02x%02x",
+ discovery->data.info,
+ discovery->data.hints[0],
+ discovery->data.hints[1]);
+#if 0
+ if ( discovery->data.hints[0] & HINT_PNP)
+ seq_puts(seq, "PnP Compatible ");
+ if ( discovery->data.hints[0] & HINT_PDA)
+ seq_puts(seq, "PDA/Palmtop ");
+ if ( discovery->data.hints[0] & HINT_COMPUTER)
+ seq_puts(seq, "Computer ");
+ if ( discovery->data.hints[0] & HINT_PRINTER)
+ seq_puts(seq, "Printer ");
+ if ( discovery->data.hints[0] & HINT_MODEM)
+ seq_puts(seq, "Modem ");
+ if ( discovery->data.hints[0] & HINT_FAX)
+ seq_puts(seq, "Fax ");
+ if ( discovery->data.hints[0] & HINT_LAN)
+ seq_puts(seq, "LAN Access ");
+
+ if ( discovery->data.hints[1] & HINT_TELEPHONY)
+ seq_puts(seq, "Telephony ");
+ if ( discovery->data.hints[1] & HINT_FILE_SERVER)
+ seq_puts(seq, "File Server ");
+ if ( discovery->data.hints[1] & HINT_COMM)
+ seq_puts(seq, "IrCOMM ");
+ if ( discovery->data.hints[1] & HINT_OBEX)
+ seq_puts(seq, "IrOBEX ");
+#endif
+ seq_printf(seq,", saddr: 0x%08x, daddr: 0x%08x\n\n",
+ discovery->data.saddr,
+ discovery->data.daddr);
+
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations discovery_seq_ops = {
+ .start = discovery_seq_start,
+ .next = discovery_seq_next,
+ .stop = discovery_seq_stop,
+ .show = discovery_seq_show,
+};
+
+static int discovery_seq_open(struct inode *inode, struct file *file)
+{
+ IRDA_ASSERT(irlmp != NULL, return -EINVAL;);
+
+ return seq_open(file, &discovery_seq_ops);
+}
+
+const struct file_operations discovery_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = discovery_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
diff --git a/net/irda/ircomm/Kconfig b/net/irda/ircomm/Kconfig
new file mode 100644
index 000000000..19492c170
--- /dev/null
+++ b/net/irda/ircomm/Kconfig
@@ -0,0 +1,12 @@
+config IRCOMM
+ tristate "IrCOMM protocol"
+ depends on IRDA && TTY
+ help
+ Say Y here if you want to build support for the IrCOMM protocol.
+ To compile it as modules, choose M here: the modules will be
+ called ircomm and ircomm_tty.
+ IrCOMM implements serial port emulation, and makes it possible to
+ use all existing applications that understands TTY's with an
+ infrared link. Thus you should be able to use application like PPP,
+ minicom and others.
+
diff --git a/net/irda/ircomm/Makefile b/net/irda/ircomm/Makefile
new file mode 100644
index 000000000..ab23b5ba7
--- /dev/null
+++ b/net/irda/ircomm/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux IrDA IrCOMM protocol layer.
+#
+
+obj-$(CONFIG_IRCOMM) += ircomm.o ircomm-tty.o
+
+ircomm-y := ircomm_core.o ircomm_event.o ircomm_lmp.o ircomm_ttp.o
+ircomm-tty-y := ircomm_tty.o ircomm_tty_attach.o ircomm_tty_ioctl.o ircomm_param.o
diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c
new file mode 100644
index 000000000..3af219545
--- /dev/null
+++ b/net/irda/ircomm/ircomm_core.c
@@ -0,0 +1,563 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_core.c
+ * Version: 1.0
+ * Description: IrCOMM service interface
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Jun 6 20:37:34 1999
+ * Modified at: Tue Dec 21 13:26:41 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_lmp.h>
+#include <net/irda/ircomm_ttp.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_core.h>
+
+static int __ircomm_close(struct ircomm_cb *self);
+static void ircomm_control_indication(struct ircomm_cb *self,
+ struct sk_buff *skb, int clen);
+
+#ifdef CONFIG_PROC_FS
+extern struct proc_dir_entry *proc_irda;
+static int ircomm_seq_open(struct inode *, struct file *);
+
+static const struct file_operations ircomm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ircomm_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+hashbin_t *ircomm = NULL;
+
+static int __init ircomm_init(void)
+{
+ ircomm = hashbin_new(HB_LOCK);
+ if (ircomm == NULL) {
+ net_err_ratelimited("%s(), can't allocate hashbin!\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+#ifdef CONFIG_PROC_FS
+ { struct proc_dir_entry *ent;
+ ent = proc_create("ircomm", 0, proc_irda, &ircomm_proc_fops);
+ if (!ent) {
+ printk(KERN_ERR "ircomm_init: can't create /proc entry!\n");
+ return -ENODEV;
+ }
+ }
+#endif /* CONFIG_PROC_FS */
+
+ net_info_ratelimited("IrCOMM protocol (Dag Brattli)\n");
+
+ return 0;
+}
+
+static void __exit ircomm_cleanup(void)
+{
+ hashbin_delete(ircomm, (FREE_FUNC) __ircomm_close);
+
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ircomm", proc_irda);
+#endif /* CONFIG_PROC_FS */
+}
+
+/*
+ * Function ircomm_open (client_notify)
+ *
+ * Start a new IrCOMM instance
+ *
+ */
+struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line)
+{
+ struct ircomm_cb *self = NULL;
+ int ret;
+
+ pr_debug("%s(), service_type=0x%02x\n", __func__ ,
+ service_type);
+
+ IRDA_ASSERT(ircomm != NULL, return NULL;);
+
+ self = kzalloc(sizeof(struct ircomm_cb), GFP_KERNEL);
+ if (self == NULL)
+ return NULL;
+
+ self->notify = *notify;
+ self->magic = IRCOMM_MAGIC;
+
+ /* Check if we should use IrLMP or IrTTP */
+ if (service_type & IRCOMM_3_WIRE_RAW) {
+ self->flow_status = FLOW_START;
+ ret = ircomm_open_lsap(self);
+ } else
+ ret = ircomm_open_tsap(self);
+
+ if (ret < 0) {
+ kfree(self);
+ return NULL;
+ }
+
+ self->service_type = service_type;
+ self->line = line;
+
+ hashbin_insert(ircomm, (irda_queue_t *) self, line, NULL);
+
+ ircomm_next_state(self, IRCOMM_IDLE);
+
+ return self;
+}
+
+EXPORT_SYMBOL(ircomm_open);
+
+/*
+ * Function ircomm_close_instance (self)
+ *
+ * Remove IrCOMM instance
+ *
+ */
+static int __ircomm_close(struct ircomm_cb *self)
+{
+ /* Disconnect link if any */
+ ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, NULL, NULL);
+
+ /* Remove TSAP */
+ if (self->tsap) {
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+
+ /* Remove LSAP */
+ if (self->lsap) {
+ irlmp_close_lsap(self->lsap);
+ self->lsap = NULL;
+ }
+ self->magic = 0;
+
+ kfree(self);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_close (self)
+ *
+ * Closes and removes the specified IrCOMM instance
+ *
+ */
+int ircomm_close(struct ircomm_cb *self)
+{
+ struct ircomm_cb *entry;
+
+ IRDA_ASSERT(self != NULL, return -EIO;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EIO;);
+
+ entry = hashbin_remove(ircomm, self->line, NULL);
+
+ IRDA_ASSERT(entry == self, return -1;);
+
+ return __ircomm_close(self);
+}
+
+EXPORT_SYMBOL(ircomm_close);
+
+/*
+ * Function ircomm_connect_request (self, service_type)
+ *
+ * Impl. of this function is differ from one of the reference. This
+ * function does discovery as well as sending connect request
+ *
+ */
+int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel,
+ __u32 saddr, __u32 daddr, struct sk_buff *skb,
+ __u8 service_type)
+{
+ struct ircomm_info info;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+ self->service_type= service_type;
+
+ info.dlsap_sel = dlsap_sel;
+ info.saddr = saddr;
+ info.daddr = daddr;
+
+ ret = ircomm_do_event(self, IRCOMM_CONNECT_REQUEST, skb, &info);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(ircomm_connect_request);
+
+/*
+ * Function ircomm_connect_indication (self, qos, skb)
+ *
+ * Notify user layer about the incoming connection
+ *
+ */
+void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb,
+ struct ircomm_info *info)
+{
+ /*
+ * If there are any data hiding in the control channel, we must
+ * deliver it first. The side effect is that the control channel
+ * will be removed from the skb
+ */
+ if (self->notify.connect_indication)
+ self->notify.connect_indication(self->notify.instance, self,
+ info->qos, info->max_data_size,
+ info->max_header_size, skb);
+ else {
+ pr_debug("%s(), missing handler\n", __func__);
+ }
+}
+
+/*
+ * Function ircomm_connect_response (self, userdata, max_sdu_size)
+ *
+ * User accepts connection
+ *
+ */
+int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+ ret = ircomm_do_event(self, IRCOMM_CONNECT_RESPONSE, userdata, NULL);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(ircomm_connect_response);
+
+/*
+ * Function connect_confirm (self, skb)
+ *
+ * Notify user layer that the link is now connected
+ *
+ */
+void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb,
+ struct ircomm_info *info)
+{
+ if (self->notify.connect_confirm )
+ self->notify.connect_confirm(self->notify.instance,
+ self, info->qos,
+ info->max_data_size,
+ info->max_header_size, skb);
+ else {
+ pr_debug("%s(), missing handler\n", __func__);
+ }
+}
+
+/*
+ * Function ircomm_data_request (self, userdata)
+ *
+ * Send IrCOMM data to peer device
+ *
+ */
+int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -EFAULT;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;);
+ IRDA_ASSERT(skb != NULL, return -EFAULT;);
+
+ ret = ircomm_do_event(self, IRCOMM_DATA_REQUEST, skb, NULL);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(ircomm_data_request);
+
+/*
+ * Function ircomm_data_indication (self, skb)
+ *
+ * Data arrived, so deliver it to user
+ *
+ */
+void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(skb->len > 0, return;);
+
+ if (self->notify.data_indication)
+ self->notify.data_indication(self->notify.instance, self, skb);
+ else {
+ pr_debug("%s(), missing handler\n", __func__);
+ }
+}
+
+/*
+ * Function ircomm_process_data (self, skb)
+ *
+ * Data arrived which may contain control channel data
+ *
+ */
+void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb)
+{
+ int clen;
+
+ IRDA_ASSERT(skb->len > 0, return;);
+
+ clen = skb->data[0];
+
+ /*
+ * Input validation check: a stir4200/mcp2150 combinations sometimes
+ * results in frames with clen > remaining packet size. These are
+ * illegal; if we throw away just this frame then it seems to carry on
+ * fine
+ */
+ if (unlikely(skb->len < (clen + 1))) {
+ pr_debug("%s() throwing away illegal frame\n",
+ __func__);
+ return;
+ }
+
+ /*
+ * If there are any data hiding in the control channel, we must
+ * deliver it first. The side effect is that the control channel
+ * will be removed from the skb
+ */
+ if (clen > 0)
+ ircomm_control_indication(self, skb, clen);
+
+ /* Remove control channel from data channel */
+ skb_pull(skb, clen+1);
+
+ if (skb->len)
+ ircomm_data_indication(self, skb);
+ else {
+ pr_debug("%s(), data was control info only!\n",
+ __func__);
+ }
+}
+
+/*
+ * Function ircomm_control_request (self, params)
+ *
+ * Send control data to peer device
+ *
+ */
+int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -EFAULT;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;);
+ IRDA_ASSERT(skb != NULL, return -EFAULT;);
+
+ ret = ircomm_do_event(self, IRCOMM_CONTROL_REQUEST, skb, NULL);
+
+ return ret;
+}
+
+EXPORT_SYMBOL(ircomm_control_request);
+
+/*
+ * Function ircomm_control_indication (self, skb)
+ *
+ * Data has arrived on the control channel
+ *
+ */
+static void ircomm_control_indication(struct ircomm_cb *self,
+ struct sk_buff *skb, int clen)
+{
+ /* Use udata for delivering data on the control channel */
+ if (self->notify.udata_indication) {
+ struct sk_buff *ctrl_skb;
+
+ /* We don't own the skb, so clone it */
+ ctrl_skb = skb_clone(skb, GFP_ATOMIC);
+ if (!ctrl_skb)
+ return;
+
+ /* Remove data channel from control channel */
+ skb_trim(ctrl_skb, clen+1);
+
+ self->notify.udata_indication(self->notify.instance, self,
+ ctrl_skb);
+
+ /* Drop reference count -
+ * see ircomm_tty_control_indication(). */
+ dev_kfree_skb(ctrl_skb);
+ } else {
+ pr_debug("%s(), missing handler\n", __func__);
+ }
+}
+
+/*
+ * Function ircomm_disconnect_request (self, userdata, priority)
+ *
+ * User layer wants to disconnect the IrCOMM connection
+ *
+ */
+int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata)
+{
+ struct ircomm_info info;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+ ret = ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, userdata,
+ &info);
+ return ret;
+}
+
+EXPORT_SYMBOL(ircomm_disconnect_request);
+
+/*
+ * Function disconnect_indication (self, skb)
+ *
+ * Tell user that the link has been disconnected
+ *
+ */
+void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb,
+ struct ircomm_info *info)
+{
+ IRDA_ASSERT(info != NULL, return;);
+
+ if (self->notify.disconnect_indication) {
+ self->notify.disconnect_indication(self->notify.instance, self,
+ info->reason, skb);
+ } else {
+ pr_debug("%s(), missing handler\n", __func__);
+ }
+}
+
+/*
+ * Function ircomm_flow_request (self, flow)
+ *
+ *
+ *
+ */
+void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+ if (self->service_type == IRCOMM_3_WIRE_RAW)
+ return;
+
+ irttp_flow_request(self->tsap, flow);
+}
+
+EXPORT_SYMBOL(ircomm_flow_request);
+
+#ifdef CONFIG_PROC_FS
+static void *ircomm_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct ircomm_cb *self;
+ loff_t off = 0;
+
+ spin_lock_irq(&ircomm->hb_spinlock);
+
+ for (self = (struct ircomm_cb *) hashbin_get_first(ircomm);
+ self != NULL;
+ self = (struct ircomm_cb *) hashbin_get_next(ircomm)) {
+ if (off++ == *pos)
+ break;
+
+ }
+ return self;
+}
+
+static void *ircomm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+
+ return (void *) hashbin_get_next(ircomm);
+}
+
+static void ircomm_seq_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_irq(&ircomm->hb_spinlock);
+}
+
+static int ircomm_seq_show(struct seq_file *seq, void *v)
+{
+ const struct ircomm_cb *self = v;
+
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EINVAL; );
+
+ if(self->line < 0x10)
+ seq_printf(seq, "ircomm%d", self->line);
+ else
+ seq_printf(seq, "irlpt%d", self->line - 0x10);
+
+ seq_printf(seq,
+ " state: %s, slsap_sel: %#02x, dlsap_sel: %#02x, mode:",
+ ircomm_state[ self->state],
+ self->slsap_sel, self->dlsap_sel);
+
+ if(self->service_type & IRCOMM_3_WIRE_RAW)
+ seq_printf(seq, " 3-wire-raw");
+ if(self->service_type & IRCOMM_3_WIRE)
+ seq_printf(seq, " 3-wire");
+ if(self->service_type & IRCOMM_9_WIRE)
+ seq_printf(seq, " 9-wire");
+ if(self->service_type & IRCOMM_CENTRONICS)
+ seq_printf(seq, " Centronics");
+ seq_putc(seq, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations ircomm_seq_ops = {
+ .start = ircomm_seq_start,
+ .next = ircomm_seq_next,
+ .stop = ircomm_seq_stop,
+ .show = ircomm_seq_show,
+};
+
+static int ircomm_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ircomm_seq_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_AUTHOR("Dag Brattli <dag@brattli.net>");
+MODULE_DESCRIPTION("IrCOMM protocol");
+MODULE_LICENSE("GPL");
+
+module_init(ircomm_init);
+module_exit(ircomm_cleanup);
diff --git a/net/irda/ircomm/ircomm_event.c b/net/irda/ircomm/ircomm_event.c
new file mode 100644
index 000000000..b0730ac9f
--- /dev/null
+++ b/net/irda/ircomm/ircomm_event.c
@@ -0,0 +1,246 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_event.c
+ * Version: 1.0
+ * Description: IrCOMM layer state machine
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Jun 6 20:33:11 1999
+ * Modified at: Sun Dec 12 13:44:32 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_event.h>
+
+static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info);
+
+const char *const ircomm_state[] = {
+ "IRCOMM_IDLE",
+ "IRCOMM_WAITI",
+ "IRCOMM_WAITR",
+ "IRCOMM_CONN",
+};
+
+static const char *const ircomm_event[] __maybe_unused = {
+ "IRCOMM_CONNECT_REQUEST",
+ "IRCOMM_CONNECT_RESPONSE",
+ "IRCOMM_TTP_CONNECT_INDICATION",
+ "IRCOMM_LMP_CONNECT_INDICATION",
+ "IRCOMM_TTP_CONNECT_CONFIRM",
+ "IRCOMM_LMP_CONNECT_CONFIRM",
+
+ "IRCOMM_LMP_DISCONNECT_INDICATION",
+ "IRCOMM_TTP_DISCONNECT_INDICATION",
+ "IRCOMM_DISCONNECT_REQUEST",
+
+ "IRCOMM_TTP_DATA_INDICATION",
+ "IRCOMM_LMP_DATA_INDICATION",
+ "IRCOMM_DATA_REQUEST",
+ "IRCOMM_CONTROL_REQUEST",
+ "IRCOMM_CONTROL_INDICATION",
+};
+
+static int (*state[])(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info) =
+{
+ ircomm_state_idle,
+ ircomm_state_waiti,
+ ircomm_state_waitr,
+ ircomm_state_conn,
+};
+
+/*
+ * Function ircomm_state_idle (self, event, skb)
+ *
+ * IrCOMM is currently idle
+ *
+ */
+static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case IRCOMM_CONNECT_REQUEST:
+ ircomm_next_state(self, IRCOMM_WAITI);
+ ret = self->issue.connect_request(self, skb, info);
+ break;
+ case IRCOMM_TTP_CONNECT_INDICATION:
+ case IRCOMM_LMP_CONNECT_INDICATION:
+ ircomm_next_state(self, IRCOMM_WAITR);
+ ircomm_connect_indication(self, skb, info);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_state_waiti (self, event, skb)
+ *
+ * The IrCOMM user has requested an IrCOMM connection to the remote
+ * device and is awaiting confirmation
+ */
+static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case IRCOMM_TTP_CONNECT_CONFIRM:
+ case IRCOMM_LMP_CONNECT_CONFIRM:
+ ircomm_next_state(self, IRCOMM_CONN);
+ ircomm_connect_confirm(self, skb, info);
+ break;
+ case IRCOMM_TTP_DISCONNECT_INDICATION:
+ case IRCOMM_LMP_DISCONNECT_INDICATION:
+ ircomm_next_state(self, IRCOMM_IDLE);
+ ircomm_disconnect_indication(self, skb, info);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_state_waitr (self, event, skb)
+ *
+ * IrCOMM has received an incoming connection request and is awaiting
+ * response from the user
+ */
+static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case IRCOMM_CONNECT_RESPONSE:
+ ircomm_next_state(self, IRCOMM_CONN);
+ ret = self->issue.connect_response(self, skb);
+ break;
+ case IRCOMM_DISCONNECT_REQUEST:
+ ircomm_next_state(self, IRCOMM_IDLE);
+ ret = self->issue.disconnect_request(self, skb, info);
+ break;
+ case IRCOMM_TTP_DISCONNECT_INDICATION:
+ case IRCOMM_LMP_DISCONNECT_INDICATION:
+ ircomm_next_state(self, IRCOMM_IDLE);
+ ircomm_disconnect_indication(self, skb, info);
+ break;
+ default:
+ pr_debug("%s(), unknown event = %s\n", __func__ ,
+ ircomm_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_state_conn (self, event, skb)
+ *
+ * IrCOMM is connected to the peer IrCOMM device
+ *
+ */
+static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case IRCOMM_DATA_REQUEST:
+ ret = self->issue.data_request(self, skb, 0);
+ break;
+ case IRCOMM_TTP_DATA_INDICATION:
+ ircomm_process_data(self, skb);
+ break;
+ case IRCOMM_LMP_DATA_INDICATION:
+ ircomm_data_indication(self, skb);
+ break;
+ case IRCOMM_CONTROL_REQUEST:
+ /* Just send a separate frame for now */
+ ret = self->issue.data_request(self, skb, skb->len);
+ break;
+ case IRCOMM_TTP_DISCONNECT_INDICATION:
+ case IRCOMM_LMP_DISCONNECT_INDICATION:
+ ircomm_next_state(self, IRCOMM_IDLE);
+ ircomm_disconnect_indication(self, skb, info);
+ break;
+ case IRCOMM_DISCONNECT_REQUEST:
+ ircomm_next_state(self, IRCOMM_IDLE);
+ ret = self->issue.disconnect_request(self, skb, info);
+ break;
+ default:
+ pr_debug("%s(), unknown event = %s\n", __func__ ,
+ ircomm_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_do_event (self, event, skb)
+ *
+ * Process event
+ *
+ */
+int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event,
+ struct sk_buff *skb, struct ircomm_info *info)
+{
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_state[self->state], ircomm_event[event]);
+
+ return (*state[self->state])(self, event, skb, info);
+}
+
+/*
+ * Function ircomm_next_state (self, state)
+ *
+ * Switch state
+ *
+ */
+void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state)
+{
+ self->state = state;
+
+ pr_debug("%s: next state=%s, service type=%d\n", __func__ ,
+ ircomm_state[self->state], self->service_type);
+}
diff --git a/net/irda/ircomm/ircomm_lmp.c b/net/irda/ircomm/ircomm_lmp.c
new file mode 100644
index 000000000..e4cc847bb
--- /dev/null
+++ b/net/irda/ircomm/ircomm_lmp.c
@@ -0,0 +1,350 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_lmp.c
+ * Version: 1.0
+ * Description: Interface between IrCOMM and IrLMP
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Jun 6 20:48:27 1999
+ * Modified at: Sun Dec 12 13:44:17 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Sources: Previous IrLPT work by Thomas Davis
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/gfp.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irda_device.h> /* struct irda_skb_cb */
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_lmp.h>
+
+
+/*
+ * Function ircomm_lmp_connect_request (self, userdata)
+ *
+ *
+ *
+ */
+static int ircomm_lmp_connect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info)
+{
+ int ret = 0;
+
+ /* Don't forget to refcount it - should be NULL anyway */
+ if(userdata)
+ skb_get(userdata);
+
+ ret = irlmp_connect_request(self->lsap, info->dlsap_sel,
+ info->saddr, info->daddr, NULL, userdata);
+ return ret;
+}
+
+/*
+ * Function ircomm_lmp_connect_response (self, skb)
+ *
+ *
+ *
+ */
+static int ircomm_lmp_connect_response(struct ircomm_cb *self,
+ struct sk_buff *userdata)
+{
+ struct sk_buff *tx_skb;
+
+ /* Any userdata supplied? */
+ if (userdata == NULL) {
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /* Reserve space for MUX and LAP header */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+ } else {
+ /*
+ * Check that the client has reserved enough space for
+ * headers
+ */
+ IRDA_ASSERT(skb_headroom(userdata) >= LMP_MAX_HEADER,
+ return -1;);
+
+ /* Don't forget to refcount it - should be NULL anyway */
+ skb_get(userdata);
+ tx_skb = userdata;
+ }
+
+ return irlmp_connect_response(self->lsap, tx_skb);
+}
+
+static int ircomm_lmp_disconnect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info)
+{
+ struct sk_buff *tx_skb;
+ int ret;
+
+ if (!userdata) {
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /* Reserve space for MUX and LAP header */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+ userdata = tx_skb;
+ } else {
+ /* Don't forget to refcount it - should be NULL anyway */
+ skb_get(userdata);
+ }
+
+ ret = irlmp_disconnect_request(self->lsap, userdata);
+
+ return ret;
+}
+
+/*
+ * Function ircomm_lmp_flow_control (skb)
+ *
+ * This function is called when a data frame we have sent to IrLAP has
+ * been deallocated. We do this to make sure we don't flood IrLAP with
+ * frames, since we are not using the IrTTP flow control mechanism
+ */
+static void ircomm_lmp_flow_control(struct sk_buff *skb)
+{
+ struct irda_skb_cb *cb;
+ struct ircomm_cb *self;
+ int line;
+
+ IRDA_ASSERT(skb != NULL, return;);
+
+ cb = (struct irda_skb_cb *) skb->cb;
+
+ line = cb->line;
+
+ self = (struct ircomm_cb *) hashbin_lock_find(ircomm, line, NULL);
+ if (!self) {
+ pr_debug("%s(), didn't find myself\n", __func__);
+ return;
+ }
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+ self->pkt_count--;
+
+ if ((self->pkt_count < 2) && (self->flow_status == FLOW_STOP)) {
+ pr_debug("%s(), asking TTY to start again!\n", __func__);
+ self->flow_status = FLOW_START;
+ if (self->notify.flow_indication)
+ self->notify.flow_indication(self->notify.instance,
+ self, FLOW_START);
+ }
+}
+
+/*
+ * Function ircomm_lmp_data_request (self, userdata)
+ *
+ * Send data frame to peer device
+ *
+ */
+static int ircomm_lmp_data_request(struct ircomm_cb *self,
+ struct sk_buff *skb,
+ int not_used)
+{
+ struct irda_skb_cb *cb;
+ int ret;
+
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ cb = (struct irda_skb_cb *) skb->cb;
+
+ cb->line = self->line;
+
+ pr_debug("%s(), sending frame\n", __func__);
+
+ /* Don't forget to refcount it - see ircomm_tty_do_softint() */
+ skb_get(skb);
+
+ skb_orphan(skb);
+ skb->destructor = ircomm_lmp_flow_control;
+
+ if ((self->pkt_count++ > 7) && (self->flow_status == FLOW_START)) {
+ pr_debug("%s(), asking TTY to slow down!\n", __func__);
+ self->flow_status = FLOW_STOP;
+ if (self->notify.flow_indication)
+ self->notify.flow_indication(self->notify.instance,
+ self, FLOW_STOP);
+ }
+ ret = irlmp_data_request(self->lsap, skb);
+ if (ret) {
+ net_err_ratelimited("%s(), failed\n", __func__);
+ /* irlmp_data_request already free the packet */
+ }
+
+ return ret;
+}
+
+/*
+ * Function ircomm_lmp_data_indication (instance, sap, skb)
+ *
+ * Incoming data which we must deliver to the state machine, to check
+ * we are still connected.
+ */
+static int ircomm_lmp_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ ircomm_do_event(self, IRCOMM_LMP_DATA_INDICATION, skb, NULL);
+
+ /* Drop reference count - see ircomm_tty_data_indication(). */
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_lmp_connect_confirm (instance, sap, qos, max_sdu_size,
+ * max_header_size, skb)
+ *
+ * Connection has been confirmed by peer device
+ *
+ */
+static void ircomm_lmp_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_seg_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(qos != NULL, return;);
+
+ info.max_data_size = max_seg_size;
+ info.max_header_size = max_header_size;
+ info.qos = qos;
+
+ ircomm_do_event(self, IRCOMM_LMP_CONNECT_CONFIRM, skb, &info);
+
+ /* Drop reference count - see ircomm_tty_connect_confirm(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_lmp_connect_indication (instance, sap, qos, max_sdu_size,
+ * max_header_size, skb)
+ *
+ * Peer device wants to make a connection with us
+ *
+ */
+static void ircomm_lmp_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_seg_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *)instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(qos != NULL, return;);
+
+ info.max_data_size = max_seg_size;
+ info.max_header_size = max_header_size;
+ info.qos = qos;
+
+ ircomm_do_event(self, IRCOMM_LMP_CONNECT_INDICATION, skb, &info);
+
+ /* Drop reference count - see ircomm_tty_connect_indication(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_lmp_disconnect_indication (instance, sap, reason, skb)
+ *
+ * Peer device has closed the connection, or the link went down for some
+ * other reason
+ */
+static void ircomm_lmp_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+ info.reason = reason;
+
+ ircomm_do_event(self, IRCOMM_LMP_DISCONNECT_INDICATION, skb, &info);
+
+ /* Drop reference count - see ircomm_tty_disconnect_indication(). */
+ if(skb)
+ dev_kfree_skb(skb);
+}
+/*
+ * Function ircomm_open_lsap (self)
+ *
+ * Open LSAP. This function will only be used when using "raw" services
+ *
+ */
+int ircomm_open_lsap(struct ircomm_cb *self)
+{
+ notify_t notify;
+
+ /* Register callbacks */
+ irda_notify_init(&notify);
+ notify.data_indication = ircomm_lmp_data_indication;
+ notify.connect_confirm = ircomm_lmp_connect_confirm;
+ notify.connect_indication = ircomm_lmp_connect_indication;
+ notify.disconnect_indication = ircomm_lmp_disconnect_indication;
+ notify.instance = self;
+ strlcpy(notify.name, "IrCOMM", sizeof(notify.name));
+
+ self->lsap = irlmp_open_lsap(LSAP_ANY, &notify, 0);
+ if (!self->lsap) {
+ pr_debug("%sfailed to allocate tsap\n", __func__);
+ return -1;
+ }
+ self->slsap_sel = self->lsap->slsap_sel;
+
+ /*
+ * Initialize the call-table for issuing commands
+ */
+ self->issue.data_request = ircomm_lmp_data_request;
+ self->issue.connect_request = ircomm_lmp_connect_request;
+ self->issue.connect_response = ircomm_lmp_connect_response;
+ self->issue.disconnect_request = ircomm_lmp_disconnect_request;
+
+ return 0;
+}
diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c
new file mode 100644
index 000000000..3c4caa60c
--- /dev/null
+++ b/net/irda/ircomm/ircomm_param.c
@@ -0,0 +1,502 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_param.c
+ * Version: 1.0
+ * Description: Parameter handling for the IrCOMM protocol
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Jun 7 10:25:11 1999
+ * Modified at: Sun Jan 30 14:32:03 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/gfp.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+
+#include <net/irda/ircomm_param.h>
+
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_port_type(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_port_name(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_data_rate(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_data_format(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_flow_control(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get);
+static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get);
+static int ircomm_param_line_status(void *instance, irda_param_t *param,
+ int get);
+static int ircomm_param_dte(void *instance, irda_param_t *param, int get);
+static int ircomm_param_dce(void *instance, irda_param_t *param, int get);
+static int ircomm_param_poll(void *instance, irda_param_t *param, int get);
+
+static const pi_minor_info_t pi_minor_call_table_common[] = {
+ { ircomm_param_service_type, PV_INT_8_BITS },
+ { ircomm_param_port_type, PV_INT_8_BITS },
+ { ircomm_param_port_name, PV_STRING }
+};
+static const pi_minor_info_t pi_minor_call_table_non_raw[] = {
+ { ircomm_param_data_rate, PV_INT_32_BITS | PV_BIG_ENDIAN },
+ { ircomm_param_data_format, PV_INT_8_BITS },
+ { ircomm_param_flow_control, PV_INT_8_BITS },
+ { ircomm_param_xon_xoff, PV_INT_16_BITS },
+ { ircomm_param_enq_ack, PV_INT_16_BITS },
+ { ircomm_param_line_status, PV_INT_8_BITS }
+};
+static const pi_minor_info_t pi_minor_call_table_9_wire[] = {
+ { ircomm_param_dte, PV_INT_8_BITS },
+ { ircomm_param_dce, PV_INT_8_BITS },
+ { ircomm_param_poll, PV_NO_VALUE },
+};
+
+static const pi_major_info_t pi_major_call_table[] = {
+ { pi_minor_call_table_common, 3 },
+ { pi_minor_call_table_non_raw, 6 },
+ { pi_minor_call_table_9_wire, 3 }
+/* { pi_minor_call_table_centronics } */
+};
+
+pi_param_info_t ircomm_param_info = { pi_major_call_table, 3, 0x0f, 4 };
+
+/*
+ * Function ircomm_param_request (self, pi, flush)
+ *
+ * Queue a parameter for the control channel
+ *
+ */
+int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush)
+{
+ unsigned long flags;
+ struct sk_buff *skb;
+ int count;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ /* Make sure we don't send parameters for raw mode */
+ if (self->service_type == IRCOMM_3_WIRE_RAW)
+ return 0;
+
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ skb = self->ctrl_skb;
+ if (!skb) {
+ skb = alloc_skb(256, GFP_ATOMIC);
+ if (!skb) {
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ return -ENOMEM;
+ }
+
+ skb_reserve(skb, self->max_header_size);
+ self->ctrl_skb = skb;
+ }
+ /*
+ * Inserting is a little bit tricky since we don't know how much
+ * room we will need. But this should hopefully work OK
+ */
+ count = irda_param_insert(self, pi, skb_tail_pointer(skb),
+ skb_tailroom(skb), &ircomm_param_info);
+ if (count < 0) {
+ net_warn_ratelimited("%s(), no room for parameter!\n",
+ __func__);
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ return -1;
+ }
+ skb_put(skb, count);
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+
+ pr_debug("%s(), skb->len=%d\n", __func__ , skb->len);
+
+ if (flush) {
+ /* ircomm_tty_do_softint will take care of the rest */
+ schedule_work(&self->tqueue);
+ }
+
+ return count;
+}
+
+/*
+ * Function ircomm_param_service_type (self, buf, len)
+ *
+ * Handle service type, this function will both be called after the LM-IAS
+ * query and then the remote device sends its initial parameters
+ *
+ */
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+ int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ __u8 service_type = (__u8) param->pv.i;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get) {
+ param->pv.i = self->settings.service_type;
+ return 0;
+ }
+
+ /* Find all common service types */
+ service_type &= self->service_type;
+ if (!service_type) {
+ pr_debug("%s(), No common service type to use!\n", __func__);
+ return -1;
+ }
+ pr_debug("%s(), services in common=%02x\n", __func__ ,
+ service_type);
+
+ /*
+ * Now choose a preferred service type of those available
+ */
+ if (service_type & IRCOMM_CENTRONICS)
+ self->settings.service_type = IRCOMM_CENTRONICS;
+ else if (service_type & IRCOMM_9_WIRE)
+ self->settings.service_type = IRCOMM_9_WIRE;
+ else if (service_type & IRCOMM_3_WIRE)
+ self->settings.service_type = IRCOMM_3_WIRE;
+ else if (service_type & IRCOMM_3_WIRE_RAW)
+ self->settings.service_type = IRCOMM_3_WIRE_RAW;
+
+ pr_debug("%s(), resulting service type=0x%02x\n", __func__ ,
+ self->settings.service_type);
+
+ /*
+ * Now the line is ready for some communication. Check if we are a
+ * server, and send over some initial parameters.
+ * Client do it in ircomm_tty_state_setup().
+ * Note : we may get called from ircomm_tty_getvalue_confirm(),
+ * therefore before we even have open any socket. And self->client
+ * is initialised to TRUE only later. So, we check if the link is
+ * really initialised. - Jean II
+ */
+ if ((self->max_header_size != IRCOMM_TTY_HDR_UNINITIALISED) &&
+ (!self->client) &&
+ (self->settings.service_type != IRCOMM_3_WIRE_RAW))
+ {
+ /* Init connection */
+ ircomm_tty_send_initial_parameters(self);
+ ircomm_tty_link_established(self);
+ }
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_port_type (self, param)
+ *
+ * The port type parameter tells if the devices are serial or parallel.
+ * Since we only advertise serial service, this parameter should only
+ * be equal to IRCOMM_SERIAL.
+ */
+static int ircomm_param_port_type(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = IRCOMM_SERIAL;
+ else {
+ self->settings.port_type = (__u8) param->pv.i;
+
+ pr_debug("%s(), port type=%d\n", __func__ ,
+ self->settings.port_type);
+ }
+ return 0;
+}
+
+/*
+ * Function ircomm_param_port_name (self, param)
+ *
+ * Exchange port name
+ *
+ */
+static int ircomm_param_port_name(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get) {
+ pr_debug("%s(), not imp!\n", __func__);
+ } else {
+ pr_debug("%s(), port-name=%s\n", __func__ , param->pv.c);
+ strncpy(self->settings.port_name, param->pv.c, 32);
+ }
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_data_rate (self, param)
+ *
+ * Exchange data rate to be used in this settings
+ *
+ */
+static int ircomm_param_data_rate(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->settings.data_rate;
+ else
+ self->settings.data_rate = param->pv.i;
+
+ pr_debug("%s(), data rate = %d\n", __func__ , param->pv.i);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_data_format (self, param)
+ *
+ * Exchange data format to be used in this settings
+ *
+ */
+static int ircomm_param_data_format(void *instance, irda_param_t *param,
+ int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->settings.data_format;
+ else
+ self->settings.data_format = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_flow_control (self, param)
+ *
+ * Exchange flow control settings to be used in this settings
+ *
+ */
+static int ircomm_param_flow_control(void *instance, irda_param_t *param,
+ int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->settings.flow_control;
+ else
+ self->settings.flow_control = (__u8) param->pv.i;
+
+ pr_debug("%s(), flow control = 0x%02x\n", __func__ , (__u8)param->pv.i);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_xon_xoff (self, param)
+ *
+ * Exchange XON/XOFF characters
+ *
+ */
+static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get) {
+ param->pv.i = self->settings.xonxoff[0];
+ param->pv.i |= self->settings.xonxoff[1] << 8;
+ } else {
+ self->settings.xonxoff[0] = (__u16) param->pv.i & 0xff;
+ self->settings.xonxoff[1] = (__u16) param->pv.i >> 8;
+ }
+
+ pr_debug("%s(), XON/XOFF = 0x%02x,0x%02x\n", __func__ ,
+ param->pv.i & 0xff, param->pv.i >> 8);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_enq_ack (self, param)
+ *
+ * Exchange ENQ/ACK characters
+ *
+ */
+static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get) {
+ param->pv.i = self->settings.enqack[0];
+ param->pv.i |= self->settings.enqack[1] << 8;
+ } else {
+ self->settings.enqack[0] = (__u16) param->pv.i & 0xff;
+ self->settings.enqack[1] = (__u16) param->pv.i >> 8;
+ }
+
+ pr_debug("%s(), ENQ/ACK = 0x%02x,0x%02x\n", __func__ ,
+ param->pv.i & 0xff, param->pv.i >> 8);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_line_status (self, param)
+ *
+ *
+ *
+ */
+static int ircomm_param_line_status(void *instance, irda_param_t *param,
+ int get)
+{
+ pr_debug("%s(), not impl.\n", __func__);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_dte (instance, param)
+ *
+ * If we get here, there must be some sort of null-modem connection, and
+ * we are probably working in server mode as well.
+ */
+static int ircomm_param_dte(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ __u8 dte;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->settings.dte;
+ else {
+ dte = (__u8) param->pv.i;
+
+ self->settings.dce = 0;
+
+ if (dte & IRCOMM_DELTA_DTR)
+ self->settings.dce |= (IRCOMM_DELTA_DSR|
+ IRCOMM_DELTA_RI |
+ IRCOMM_DELTA_CD);
+ if (dte & IRCOMM_DTR)
+ self->settings.dce |= (IRCOMM_DSR|
+ IRCOMM_RI |
+ IRCOMM_CD);
+
+ if (dte & IRCOMM_DELTA_RTS)
+ self->settings.dce |= IRCOMM_DELTA_CTS;
+ if (dte & IRCOMM_RTS)
+ self->settings.dce |= IRCOMM_CTS;
+
+ /* Take appropriate actions */
+ ircomm_tty_check_modem_status(self);
+
+ /* Null modem cable emulator */
+ self->settings.null_modem = TRUE;
+ }
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_dce (instance, param)
+ *
+ *
+ *
+ */
+static int ircomm_param_dce(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ __u8 dce;
+
+ pr_debug("%s(), dce = 0x%02x\n", __func__ , (__u8)param->pv.i);
+
+ dce = (__u8) param->pv.i;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ self->settings.dce = dce;
+
+ /* Check if any of the settings have changed */
+ if (dce & 0x0f) {
+ if (dce & IRCOMM_DELTA_CTS) {
+ pr_debug("%s(), CTS\n", __func__);
+ }
+ }
+
+ ircomm_tty_check_modem_status(self);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_param_poll (instance, param)
+ *
+ * Called when the peer device is polling for the line settings
+ *
+ */
+static int ircomm_param_poll(void *instance, irda_param_t *param, int get)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ /* Poll parameters are always of length 0 (just a signal) */
+ if (!get) {
+ /* Respond with DTE line settings */
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+ }
+ return 0;
+}
+
+
+
+
+
diff --git a/net/irda/ircomm/ircomm_ttp.c b/net/irda/ircomm/ircomm_ttp.c
new file mode 100644
index 000000000..4b81e0934
--- /dev/null
+++ b/net/irda/ircomm/ircomm_ttp.c
@@ -0,0 +1,350 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_ttp.c
+ * Version: 1.0
+ * Description: Interface between IrCOMM and IrTTP
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Jun 6 20:48:27 1999
+ * Modified at: Mon Dec 13 11:35:13 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_ttp.h>
+
+static int ircomm_ttp_data_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static void ircomm_ttp_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb);
+static void ircomm_ttp_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb);
+static void ircomm_ttp_flow_indication(void *instance, void *sap,
+ LOCAL_FLOW cmd);
+static void ircomm_ttp_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *skb);
+static int ircomm_ttp_data_request(struct ircomm_cb *self,
+ struct sk_buff *skb,
+ int clen);
+static int ircomm_ttp_connect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info);
+static int ircomm_ttp_connect_response(struct ircomm_cb *self,
+ struct sk_buff *userdata);
+static int ircomm_ttp_disconnect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info);
+
+/*
+ * Function ircomm_open_tsap (self)
+ *
+ *
+ *
+ */
+int ircomm_open_tsap(struct ircomm_cb *self)
+{
+ notify_t notify;
+
+ /* Register callbacks */
+ irda_notify_init(&notify);
+ notify.data_indication = ircomm_ttp_data_indication;
+ notify.connect_confirm = ircomm_ttp_connect_confirm;
+ notify.connect_indication = ircomm_ttp_connect_indication;
+ notify.flow_indication = ircomm_ttp_flow_indication;
+ notify.disconnect_indication = ircomm_ttp_disconnect_indication;
+ notify.instance = self;
+ strlcpy(notify.name, "IrCOMM", sizeof(notify.name));
+
+ self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT,
+ &notify);
+ if (!self->tsap) {
+ pr_debug("%sfailed to allocate tsap\n", __func__);
+ return -1;
+ }
+ self->slsap_sel = self->tsap->stsap_sel;
+
+ /*
+ * Initialize the call-table for issuing commands
+ */
+ self->issue.data_request = ircomm_ttp_data_request;
+ self->issue.connect_request = ircomm_ttp_connect_request;
+ self->issue.connect_response = ircomm_ttp_connect_response;
+ self->issue.disconnect_request = ircomm_ttp_disconnect_request;
+
+ return 0;
+}
+
+/*
+ * Function ircomm_ttp_connect_request (self, userdata)
+ *
+ *
+ *
+ */
+static int ircomm_ttp_connect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info)
+{
+ int ret = 0;
+
+ /* Don't forget to refcount it - should be NULL anyway */
+ if(userdata)
+ skb_get(userdata);
+
+ ret = irttp_connect_request(self->tsap, info->dlsap_sel,
+ info->saddr, info->daddr, NULL,
+ TTP_SAR_DISABLE, userdata);
+
+ return ret;
+}
+
+/*
+ * Function ircomm_ttp_connect_response (self, skb)
+ *
+ *
+ *
+ */
+static int ircomm_ttp_connect_response(struct ircomm_cb *self,
+ struct sk_buff *userdata)
+{
+ int ret;
+
+ /* Don't forget to refcount it - should be NULL anyway */
+ if(userdata)
+ skb_get(userdata);
+
+ ret = irttp_connect_response(self->tsap, TTP_SAR_DISABLE, userdata);
+
+ return ret;
+}
+
+/*
+ * Function ircomm_ttp_data_request (self, userdata)
+ *
+ * Send IrCOMM data to IrTTP layer. Currently we do not try to combine
+ * control data with pure data, so they will be sent as separate frames.
+ * Should not be a big problem though, since control frames are rare. But
+ * some of them are sent after connection establishment, so this can
+ * increase the latency a bit.
+ */
+static int ircomm_ttp_data_request(struct ircomm_cb *self,
+ struct sk_buff *skb,
+ int clen)
+{
+ int ret;
+
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ pr_debug("%s(), clen=%d\n", __func__ , clen);
+
+ /*
+ * Insert clen field, currently we either send data only, or control
+ * only frames, to make things easier and avoid queueing
+ */
+ IRDA_ASSERT(skb_headroom(skb) >= IRCOMM_HEADER_SIZE, return -1;);
+
+ /* Don't forget to refcount it - see ircomm_tty_do_softint() */
+ skb_get(skb);
+
+ skb_push(skb, IRCOMM_HEADER_SIZE);
+
+ skb->data[0] = clen;
+
+ ret = irttp_data_request(self->tsap, skb);
+ if (ret) {
+ net_err_ratelimited("%s(), failed\n", __func__);
+ /* irttp_data_request already free the packet */
+ }
+
+ return ret;
+}
+
+/*
+ * Function ircomm_ttp_data_indication (instance, sap, skb)
+ *
+ * Incoming data
+ *
+ */
+static int ircomm_ttp_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ ircomm_do_event(self, IRCOMM_TTP_DATA_INDICATION, skb, NULL);
+
+ /* Drop reference count - see ircomm_tty_data_indication(). */
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+static void ircomm_ttp_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(qos != NULL, goto out;);
+
+ if (max_sdu_size != TTP_SAR_DISABLE) {
+ net_err_ratelimited("%s(), SAR not allowed for IrCOMM!\n",
+ __func__);
+ goto out;
+ }
+
+ info.max_data_size = irttp_get_max_seg_size(self->tsap)
+ - IRCOMM_HEADER_SIZE;
+ info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE;
+ info.qos = qos;
+
+ ircomm_do_event(self, IRCOMM_TTP_CONNECT_CONFIRM, skb, &info);
+
+out:
+ /* Drop reference count - see ircomm_tty_connect_confirm(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_connect_indication (instance, sap, qos, max_sdu_size,
+ * max_header_size, skb)
+ *
+ *
+ *
+ */
+static void ircomm_ttp_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *)instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(qos != NULL, goto out;);
+
+ if (max_sdu_size != TTP_SAR_DISABLE) {
+ net_err_ratelimited("%s(), SAR not allowed for IrCOMM!\n",
+ __func__);
+ goto out;
+ }
+
+ info.max_data_size = irttp_get_max_seg_size(self->tsap)
+ - IRCOMM_HEADER_SIZE;
+ info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE;
+ info.qos = qos;
+
+ ircomm_do_event(self, IRCOMM_TTP_CONNECT_INDICATION, skb, &info);
+
+out:
+ /* Drop reference count - see ircomm_tty_connect_indication(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_disconnect_request (self, userdata, info)
+ *
+ *
+ *
+ */
+static int ircomm_ttp_disconnect_request(struct ircomm_cb *self,
+ struct sk_buff *userdata,
+ struct ircomm_info *info)
+{
+ int ret;
+
+ /* Don't forget to refcount it - should be NULL anyway */
+ if(userdata)
+ skb_get(userdata);
+
+ ret = irttp_disconnect_request(self->tsap, userdata, P_NORMAL);
+
+ return ret;
+}
+
+/*
+ * Function ircomm_ttp_disconnect_indication (instance, sap, reason, skb)
+ *
+ *
+ *
+ */
+static void ircomm_ttp_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *skb)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+ struct ircomm_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+ info.reason = reason;
+
+ ircomm_do_event(self, IRCOMM_TTP_DISCONNECT_INDICATION, skb, &info);
+
+ /* Drop reference count - see ircomm_tty_disconnect_indication(). */
+ if(skb)
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_flow_indication (instance, sap, cmd)
+ *
+ * Layer below is telling us to start or stop the flow of data
+ *
+ */
+static void ircomm_ttp_flow_indication(void *instance, void *sap,
+ LOCAL_FLOW cmd)
+{
+ struct ircomm_cb *self = (struct ircomm_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+ if (self->notify.flow_indication)
+ self->notify.flow_indication(self->notify.instance, self, cmd);
+}
+
+
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
new file mode 100644
index 000000000..683346d2d
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -0,0 +1,1362 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_tty.c
+ * Version: 1.0
+ * Description: IrCOMM serial TTY driver
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Jun 6 21:00:56 1999
+ * Modified at: Wed Feb 23 00:09:02 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Sources: serial.c and previous IrCOMM work by Takahide Higuchi
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/termios.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/interrupt.h>
+#include <linux/device.h> /* for MODULE_ALIAS_CHARDEV_MAJOR */
+
+#include <asm/uaccess.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+
+static int ircomm_tty_install(struct tty_driver *driver,
+ struct tty_struct *tty);
+static int ircomm_tty_open(struct tty_struct *tty, struct file *filp);
+static void ircomm_tty_close(struct tty_struct * tty, struct file *filp);
+static int ircomm_tty_write(struct tty_struct * tty,
+ const unsigned char *buf, int count);
+static int ircomm_tty_write_room(struct tty_struct *tty);
+static void ircomm_tty_throttle(struct tty_struct *tty);
+static void ircomm_tty_unthrottle(struct tty_struct *tty);
+static int ircomm_tty_chars_in_buffer(struct tty_struct *tty);
+static void ircomm_tty_flush_buffer(struct tty_struct *tty);
+static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch);
+static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout);
+static void ircomm_tty_hangup(struct tty_struct *tty);
+static void ircomm_tty_do_softint(struct work_struct *work);
+static void ircomm_tty_shutdown(struct ircomm_tty_cb *self);
+static void ircomm_tty_stop(struct tty_struct *tty);
+
+static int ircomm_tty_data_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static int ircomm_tty_control_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static void ircomm_tty_flow_indication(void *instance, void *sap,
+ LOCAL_FLOW cmd);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations ircomm_tty_proc_fops;
+#endif /* CONFIG_PROC_FS */
+static struct tty_driver *driver;
+
+static hashbin_t *ircomm_tty = NULL;
+
+static const struct tty_operations ops = {
+ .install = ircomm_tty_install,
+ .open = ircomm_tty_open,
+ .close = ircomm_tty_close,
+ .write = ircomm_tty_write,
+ .write_room = ircomm_tty_write_room,
+ .chars_in_buffer = ircomm_tty_chars_in_buffer,
+ .flush_buffer = ircomm_tty_flush_buffer,
+ .ioctl = ircomm_tty_ioctl, /* ircomm_tty_ioctl.c */
+ .tiocmget = ircomm_tty_tiocmget, /* ircomm_tty_ioctl.c */
+ .tiocmset = ircomm_tty_tiocmset, /* ircomm_tty_ioctl.c */
+ .throttle = ircomm_tty_throttle,
+ .unthrottle = ircomm_tty_unthrottle,
+ .send_xchar = ircomm_tty_send_xchar,
+ .set_termios = ircomm_tty_set_termios,
+ .stop = ircomm_tty_stop,
+ .start = ircomm_tty_start,
+ .hangup = ircomm_tty_hangup,
+ .wait_until_sent = ircomm_tty_wait_until_sent,
+#ifdef CONFIG_PROC_FS
+ .proc_fops = &ircomm_tty_proc_fops,
+#endif /* CONFIG_PROC_FS */
+};
+
+static void ircomm_port_raise_dtr_rts(struct tty_port *port, int raise)
+{
+ struct ircomm_tty_cb *self = container_of(port, struct ircomm_tty_cb,
+ port);
+ /*
+ * Here, we use to lock those two guys, but as ircomm_param_request()
+ * does it itself, I don't see the point (and I see the deadlock).
+ * Jean II
+ */
+ if (raise)
+ self->settings.dte |= IRCOMM_RTS | IRCOMM_DTR;
+ else
+ self->settings.dte &= ~(IRCOMM_RTS | IRCOMM_DTR);
+
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+}
+
+static int ircomm_port_carrier_raised(struct tty_port *port)
+{
+ struct ircomm_tty_cb *self = container_of(port, struct ircomm_tty_cb,
+ port);
+ return self->settings.dce & IRCOMM_CD;
+}
+
+static const struct tty_port_operations ircomm_port_ops = {
+ .dtr_rts = ircomm_port_raise_dtr_rts,
+ .carrier_raised = ircomm_port_carrier_raised,
+};
+
+/*
+ * Function ircomm_tty_init()
+ *
+ * Init IrCOMM TTY layer/driver
+ *
+ */
+static int __init ircomm_tty_init(void)
+{
+ driver = alloc_tty_driver(IRCOMM_TTY_PORTS);
+ if (!driver)
+ return -ENOMEM;
+ ircomm_tty = hashbin_new(HB_LOCK);
+ if (ircomm_tty == NULL) {
+ net_err_ratelimited("%s(), can't allocate hashbin!\n",
+ __func__);
+ put_tty_driver(driver);
+ return -ENOMEM;
+ }
+
+ driver->driver_name = "ircomm";
+ driver->name = "ircomm";
+ driver->major = IRCOMM_TTY_MAJOR;
+ driver->minor_start = IRCOMM_TTY_MINOR;
+ driver->type = TTY_DRIVER_TYPE_SERIAL;
+ driver->subtype = SERIAL_TYPE_NORMAL;
+ driver->init_termios = tty_std_termios;
+ driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+ driver->flags = TTY_DRIVER_REAL_RAW;
+ tty_set_operations(driver, &ops);
+ if (tty_register_driver(driver)) {
+ net_err_ratelimited("%s(): Couldn't register serial driver\n",
+ __func__);
+ put_tty_driver(driver);
+ return -1;
+ }
+ return 0;
+}
+
+static void __exit __ircomm_tty_cleanup(struct ircomm_tty_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ ircomm_tty_shutdown(self);
+
+ self->magic = 0;
+ tty_port_destroy(&self->port);
+ kfree(self);
+}
+
+/*
+ * Function ircomm_tty_cleanup ()
+ *
+ * Remove IrCOMM TTY layer/driver
+ *
+ */
+static void __exit ircomm_tty_cleanup(void)
+{
+ int ret;
+
+ ret = tty_unregister_driver(driver);
+ if (ret) {
+ net_err_ratelimited("%s(), failed to unregister driver\n",
+ __func__);
+ return;
+ }
+
+ hashbin_delete(ircomm_tty, (FREE_FUNC) __ircomm_tty_cleanup);
+ put_tty_driver(driver);
+}
+
+/*
+ * Function ircomm_startup (self)
+ *
+ *
+ *
+ */
+static int ircomm_tty_startup(struct ircomm_tty_cb *self)
+{
+ notify_t notify;
+ int ret = -ENODEV;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ /* Check if already open */
+ if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) {
+ pr_debug("%s(), already open so break out!\n", __func__);
+ return 0;
+ }
+
+ /* Register with IrCOMM */
+ irda_notify_init(&notify);
+ /* These callbacks we must handle ourselves */
+ notify.data_indication = ircomm_tty_data_indication;
+ notify.udata_indication = ircomm_tty_control_indication;
+ notify.flow_indication = ircomm_tty_flow_indication;
+
+ /* Use the ircomm_tty interface for these ones */
+ notify.disconnect_indication = ircomm_tty_disconnect_indication;
+ notify.connect_confirm = ircomm_tty_connect_confirm;
+ notify.connect_indication = ircomm_tty_connect_indication;
+ strlcpy(notify.name, "ircomm_tty", sizeof(notify.name));
+ notify.instance = self;
+
+ if (!self->ircomm) {
+ self->ircomm = ircomm_open(&notify, self->service_type,
+ self->line);
+ }
+ if (!self->ircomm)
+ goto err;
+
+ self->slsap_sel = self->ircomm->slsap_sel;
+
+ /* Connect IrCOMM link with remote device */
+ ret = ircomm_tty_attach_cable(self);
+ if (ret < 0) {
+ net_err_ratelimited("%s(), error attaching cable!\n", __func__);
+ goto err;
+ }
+
+ return 0;
+err:
+ clear_bit(ASYNCB_INITIALIZED, &self->port.flags);
+ return ret;
+}
+
+/*
+ * Function ircomm_block_til_ready (self, filp)
+ *
+ *
+ *
+ */
+static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
+ struct tty_struct *tty, struct file *filp)
+{
+ struct tty_port *port = &self->port;
+ DECLARE_WAITQUEUE(wait, current);
+ int retval;
+ int do_clocal = 0;
+ unsigned long flags;
+
+ /*
+ * If non-blocking mode is set, or the port is not enabled,
+ * then make the check up front and then exit.
+ */
+ if (test_bit(TTY_IO_ERROR, &tty->flags)) {
+ port->flags |= ASYNC_NORMAL_ACTIVE;
+ return 0;
+ }
+
+ if (filp->f_flags & O_NONBLOCK) {
+ /* nonblock mode is set */
+ if (tty->termios.c_cflag & CBAUD)
+ tty_port_raise_dtr_rts(port);
+ port->flags |= ASYNC_NORMAL_ACTIVE;
+ pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
+ return 0;
+ }
+
+ if (tty->termios.c_cflag & CLOCAL) {
+ pr_debug("%s(), doing CLOCAL!\n", __func__);
+ do_clocal = 1;
+ }
+
+ /* Wait for carrier detect and the line to become
+ * free (i.e., not in use by the callout). While we are in
+ * this loop, port->count is dropped by one, so that
+ * mgsl_close() knows when to free things. We restore it upon
+ * exit, either normal or abnormal.
+ */
+
+ retval = 0;
+ add_wait_queue(&port->open_wait, &wait);
+
+ pr_debug("%s(%d):block_til_ready before block on %s open_count=%d\n",
+ __FILE__, __LINE__, tty->driver->name, port->count);
+
+ spin_lock_irqsave(&port->lock, flags);
+ port->count--;
+ port->blocked_open++;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ while (1) {
+ if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags))
+ tty_port_raise_dtr_rts(port);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (tty_hung_up_p(filp) ||
+ !test_bit(ASYNCB_INITIALIZED, &port->flags)) {
+ retval = (port->flags & ASYNC_HUP_NOTIFY) ?
+ -EAGAIN : -ERESTARTSYS;
+ break;
+ }
+
+ /*
+ * Check if link is ready now. Even if CLOCAL is
+ * specified, we cannot return before the IrCOMM link is
+ * ready
+ */
+ if (!test_bit(ASYNCB_CLOSING, &port->flags) &&
+ (do_clocal || tty_port_carrier_raised(port)) &&
+ self->state == IRCOMM_TTY_READY)
+ {
+ break;
+ }
+
+ if (signal_pending(current)) {
+ retval = -ERESTARTSYS;
+ break;
+ }
+
+ pr_debug("%s(%d):block_til_ready blocking on %s open_count=%d\n",
+ __FILE__, __LINE__, tty->driver->name, port->count);
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&port->open_wait, &wait);
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (!tty_hung_up_p(filp))
+ port->count++;
+ port->blocked_open--;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ pr_debug("%s(%d):block_til_ready after blocking on %s open_count=%d\n",
+ __FILE__, __LINE__, tty->driver->name, port->count);
+
+ if (!retval)
+ port->flags |= ASYNC_NORMAL_ACTIVE;
+
+ return retval;
+}
+
+
+static int ircomm_tty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self;
+ unsigned int line = tty->index;
+
+ /* Check if instance already exists */
+ self = hashbin_lock_find(ircomm_tty, line, NULL);
+ if (!self) {
+ /* No, so make new instance */
+ self = kzalloc(sizeof(struct ircomm_tty_cb), GFP_KERNEL);
+ if (self == NULL)
+ return -ENOMEM;
+
+ tty_port_init(&self->port);
+ self->port.ops = &ircomm_port_ops;
+ self->magic = IRCOMM_TTY_MAGIC;
+ self->flow = FLOW_STOP;
+
+ self->line = line;
+ INIT_WORK(&self->tqueue, ircomm_tty_do_softint);
+ self->max_header_size = IRCOMM_TTY_HDR_UNINITIALISED;
+ self->max_data_size = IRCOMM_TTY_DATA_UNINITIALISED;
+
+ /* Init some important stuff */
+ init_timer(&self->watchdog_timer);
+ spin_lock_init(&self->spinlock);
+
+ /*
+ * Force TTY into raw mode by default which is usually what
+ * we want for IrCOMM and IrLPT. This way applications will
+ * not have to twiddle with printcap etc.
+ *
+ * Note this is completely usafe and doesn't work properly
+ */
+ tty->termios.c_iflag = 0;
+ tty->termios.c_oflag = 0;
+
+ /* Insert into hash */
+ hashbin_insert(ircomm_tty, (irda_queue_t *) self, line, NULL);
+ }
+
+ tty->driver_data = self;
+
+ return tty_port_install(&self->port, driver, tty);
+}
+
+/*
+ * Function ircomm_tty_open (tty, filp)
+ *
+ * This routine is called when a particular tty device is opened. This
+ * routine is mandatory; if this routine is not filled in, the attempted
+ * open will fail with ENODEV.
+ */
+static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+ struct ircomm_tty_cb *self = tty->driver_data;
+ unsigned long flags;
+ int ret;
+
+ /* ++ is not atomic, so this should be protected - Jean II */
+ spin_lock_irqsave(&self->port.lock, flags);
+ self->port.count++;
+ spin_unlock_irqrestore(&self->port.lock, flags);
+ tty_port_tty_set(&self->port, tty);
+
+ pr_debug("%s(), %s%d, count = %d\n", __func__ , tty->driver->name,
+ self->line, self->port.count);
+
+ /* Not really used by us, but lets do it anyway */
+ self->port.low_latency = (self->port.flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+ /*
+ * If the port is the middle of closing, bail out now
+ */
+ if (test_bit(ASYNCB_CLOSING, &self->port.flags)) {
+
+ /* Hm, why are we blocking on ASYNC_CLOSING if we
+ * do return -EAGAIN/-ERESTARTSYS below anyway?
+ * IMHO it's either not needed in the first place
+ * or for some reason we need to make sure the async
+ * closing has been finished - if so, wouldn't we
+ * probably better sleep uninterruptible?
+ */
+
+ if (wait_event_interruptible(self->port.close_wait,
+ !test_bit(ASYNCB_CLOSING, &self->port.flags))) {
+ net_warn_ratelimited("%s - got signal while blocking on ASYNC_CLOSING!\n",
+ __func__);
+ return -ERESTARTSYS;
+ }
+
+#ifdef SERIAL_DO_RESTART
+ return (self->port.flags & ASYNC_HUP_NOTIFY) ?
+ -EAGAIN : -ERESTARTSYS;
+#else
+ return -EAGAIN;
+#endif
+ }
+
+ /* Check if this is a "normal" ircomm device, or an irlpt device */
+ if (self->line < 0x10) {
+ self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE;
+ self->settings.service_type = IRCOMM_9_WIRE; /* 9 wire as default */
+ /* Jan Kiszka -> add DSR/RI -> Conform to IrCOMM spec */
+ self->settings.dce = IRCOMM_CTS | IRCOMM_CD | IRCOMM_DSR | IRCOMM_RI; /* Default line settings */
+ pr_debug("%s(), IrCOMM device\n", __func__);
+ } else {
+ pr_debug("%s(), IrLPT device\n", __func__);
+ self->service_type = IRCOMM_3_WIRE_RAW;
+ self->settings.service_type = IRCOMM_3_WIRE_RAW; /* Default */
+ }
+
+ ret = ircomm_tty_startup(self);
+ if (ret)
+ return ret;
+
+ ret = ircomm_tty_block_til_ready(self, tty, filp);
+ if (ret) {
+ pr_debug("%s(), returning after block_til_ready with %d\n",
+ __func__, ret);
+
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Function ircomm_tty_close (tty, filp)
+ *
+ * This routine is called when a particular tty device is closed.
+ *
+ */
+static void ircomm_tty_close(struct tty_struct *tty, struct file *filp)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ struct tty_port *port = &self->port;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ if (tty_port_close_start(port, tty, filp) == 0)
+ return;
+
+ ircomm_tty_shutdown(self);
+
+ tty_driver_flush_buffer(tty);
+
+ tty_port_close_end(port, tty);
+ tty_port_tty_set(port, NULL);
+}
+
+/*
+ * Function ircomm_tty_flush_buffer (tty)
+ *
+ *
+ *
+ */
+static void ircomm_tty_flush_buffer(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /*
+ * Let do_softint() do this to avoid race condition with
+ * do_softint() ;-)
+ */
+ schedule_work(&self->tqueue);
+}
+
+/*
+ * Function ircomm_tty_do_softint (work)
+ *
+ * We use this routine to give the write wakeup to the user at at a
+ * safe time (as fast as possible after write have completed). This
+ * can be compared to the Tx interrupt.
+ */
+static void ircomm_tty_do_softint(struct work_struct *work)
+{
+ struct ircomm_tty_cb *self =
+ container_of(work, struct ircomm_tty_cb, tqueue);
+ struct tty_struct *tty;
+ unsigned long flags;
+ struct sk_buff *skb, *ctrl_skb;
+
+ if (!self || self->magic != IRCOMM_TTY_MAGIC)
+ return;
+
+ tty = tty_port_tty_get(&self->port);
+ if (!tty)
+ return;
+
+ /* Unlink control buffer */
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ ctrl_skb = self->ctrl_skb;
+ self->ctrl_skb = NULL;
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+
+ /* Flush control buffer if any */
+ if(ctrl_skb) {
+ if(self->flow == FLOW_START)
+ ircomm_control_request(self->ircomm, ctrl_skb);
+ /* Drop reference count - see ircomm_ttp_data_request(). */
+ dev_kfree_skb(ctrl_skb);
+ }
+
+ if (tty->hw_stopped)
+ goto put;
+
+ /* Unlink transmit buffer */
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ skb = self->tx_skb;
+ self->tx_skb = NULL;
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+
+ /* Flush transmit buffer if any */
+ if (skb) {
+ ircomm_tty_do_event(self, IRCOMM_TTY_DATA_REQUEST, skb, NULL);
+ /* Drop reference count - see ircomm_ttp_data_request(). */
+ dev_kfree_skb(skb);
+ }
+
+ /* Check if user (still) wants to be waken up */
+ tty_wakeup(tty);
+put:
+ tty_kref_put(tty);
+}
+
+/*
+ * Function ircomm_tty_write (tty, buf, count)
+ *
+ * This routine is called by the kernel to write a series of characters
+ * to the tty device. The characters may come from user space or kernel
+ * space. This routine will return the number of characters actually
+ * accepted for writing. This routine is mandatory.
+ */
+static int ircomm_tty_write(struct tty_struct *tty,
+ const unsigned char *buf, int count)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned long flags;
+ struct sk_buff *skb;
+ int tailroom = 0;
+ int len = 0;
+ int size;
+
+ pr_debug("%s(), count=%d, hw_stopped=%d\n", __func__ , count,
+ tty->hw_stopped);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ /* We may receive packets from the TTY even before we have finished
+ * our setup. Not cool.
+ * The problem is that we don't know the final header and data size
+ * to create the proper skb, so any skb we would create would have
+ * bogus header and data size, so need care.
+ * We use a bogus header size to safely detect this condition.
+ * Another problem is that hw_stopped was set to 0 way before it
+ * should be, so we would drop this skb. It should now be fixed.
+ * One option is to not accept data until we are properly setup.
+ * But, I suspect that when it happens, the ppp line discipline
+ * just "drops" the data, which might screw up connect scripts.
+ * The second option is to create a "safe skb", with large header
+ * and small size (see ircomm_tty_open() for values).
+ * We just need to make sure that when the real values get filled,
+ * we don't mess up the original "safe skb" (see tx_data_size).
+ * Jean II */
+ if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) {
+ pr_debug("%s() : not initialised\n", __func__);
+#ifdef IRCOMM_NO_TX_BEFORE_INIT
+ /* We didn't consume anything, TTY will retry */
+ return 0;
+#endif
+ }
+
+ if (count < 1)
+ return 0;
+
+ /* Protect our manipulation of self->tx_skb and related */
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ /* Fetch current transmit buffer */
+ skb = self->tx_skb;
+
+ /*
+ * Send out all the data we get, possibly as multiple fragmented
+ * frames, but this will only happen if the data is larger than the
+ * max data size. The normal case however is just the opposite, and
+ * this function may be called multiple times, and will then actually
+ * defragment the data and send it out as one packet as soon as
+ * possible, but at a safer point in time
+ */
+ while (count) {
+ size = count;
+
+ /* Adjust data size to the max data size */
+ if (size > self->max_data_size)
+ size = self->max_data_size;
+
+ /*
+ * Do we already have a buffer ready for transmit, or do
+ * we need to allocate a new frame
+ */
+ if (skb) {
+ /*
+ * Any room for more data at the end of the current
+ * transmit buffer? Cannot use skb_tailroom, since
+ * dev_alloc_skb gives us a larger skb than we
+ * requested
+ * Note : use tx_data_size, because max_data_size
+ * may have changed and we don't want to overwrite
+ * the skb. - Jean II
+ */
+ if ((tailroom = (self->tx_data_size - skb->len)) > 0) {
+ /* Adjust data to tailroom */
+ if (size > tailroom)
+ size = tailroom;
+ } else {
+ /*
+ * Current transmit frame is full, so break
+ * out, so we can send it as soon as possible
+ */
+ break;
+ }
+ } else {
+ /* Prepare a full sized frame */
+ skb = alloc_skb(self->max_data_size+
+ self->max_header_size,
+ GFP_ATOMIC);
+ if (!skb) {
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ return -ENOBUFS;
+ }
+ skb_reserve(skb, self->max_header_size);
+ self->tx_skb = skb;
+ /* Remember skb size because max_data_size may
+ * change later on - Jean II */
+ self->tx_data_size = self->max_data_size;
+ }
+
+ /* Copy data */
+ memcpy(skb_put(skb,size), buf + len, size);
+
+ count -= size;
+ len += size;
+ }
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+
+ /*
+ * Schedule a new thread which will transmit the frame as soon
+ * as possible, but at a safe point in time. We do this so the
+ * "user" can give us data multiple times, as PPP does (because of
+ * its 256 byte tx buffer). We will then defragment and send out
+ * all this data as one single packet.
+ */
+ schedule_work(&self->tqueue);
+
+ return len;
+}
+
+/*
+ * Function ircomm_tty_write_room (tty)
+ *
+ * This routine returns the numbers of characters the tty driver will
+ * accept for queuing to be written. This number is subject to change as
+ * output buffers get emptied, or if the output flow control is acted.
+ */
+static int ircomm_tty_write_room(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned long flags;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+#ifdef IRCOMM_NO_TX_BEFORE_INIT
+ /* max_header_size tells us if the channel is initialised or not. */
+ if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED)
+ /* Don't bother us yet */
+ return 0;
+#endif
+
+ /* Check if we are allowed to transmit any data.
+ * hw_stopped is the regular flow control.
+ * Jean II */
+ if (tty->hw_stopped)
+ ret = 0;
+ else {
+ spin_lock_irqsave(&self->spinlock, flags);
+ if (self->tx_skb)
+ ret = self->tx_data_size - self->tx_skb->len;
+ else
+ ret = self->max_data_size;
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ }
+ pr_debug("%s(), ret=%d\n", __func__ , ret);
+
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_wait_until_sent (tty, timeout)
+ *
+ * This routine waits until the device has written out all of the
+ * characters in its transmitter FIFO.
+ */
+static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned long orig_jiffies, poll_time;
+ unsigned long flags;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ orig_jiffies = jiffies;
+
+ /* Set poll time to 200 ms */
+ poll_time = msecs_to_jiffies(200);
+ if (timeout)
+ poll_time = min_t(unsigned long, timeout, poll_time);
+
+ spin_lock_irqsave(&self->spinlock, flags);
+ while (self->tx_skb && self->tx_skb->len) {
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ schedule_timeout_interruptible(poll_time);
+ spin_lock_irqsave(&self->spinlock, flags);
+ if (signal_pending(current))
+ break;
+ if (timeout && time_after(jiffies, orig_jiffies + timeout))
+ break;
+ }
+ spin_unlock_irqrestore(&self->spinlock, flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Function ircomm_tty_throttle (tty)
+ *
+ * This routine notifies the tty driver that input buffers for the line
+ * discipline are close to full, and it should somehow signal that no
+ * more characters should be sent to the tty.
+ */
+static void ircomm_tty_throttle(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /* Software flow control? */
+ if (I_IXOFF(tty))
+ ircomm_tty_send_xchar(tty, STOP_CHAR(tty));
+
+ /* Hardware flow control? */
+ if (tty->termios.c_cflag & CRTSCTS) {
+ self->settings.dte &= ~IRCOMM_RTS;
+ self->settings.dte |= IRCOMM_DELTA_RTS;
+
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+ }
+
+ ircomm_flow_request(self->ircomm, FLOW_STOP);
+}
+
+/*
+ * Function ircomm_tty_unthrottle (tty)
+ *
+ * This routine notifies the tty drivers that it should signals that
+ * characters can now be sent to the tty without fear of overrunning the
+ * input buffers of the line disciplines.
+ */
+static void ircomm_tty_unthrottle(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /* Using software flow control? */
+ if (I_IXOFF(tty)) {
+ ircomm_tty_send_xchar(tty, START_CHAR(tty));
+ }
+
+ /* Using hardware flow control? */
+ if (tty->termios.c_cflag & CRTSCTS) {
+ self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS);
+
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+ pr_debug("%s(), FLOW_START\n", __func__);
+ }
+ ircomm_flow_request(self->ircomm, FLOW_START);
+}
+
+/*
+ * Function ircomm_tty_chars_in_buffer (tty)
+ *
+ * Indicates if there are any data in the buffer
+ *
+ */
+static int ircomm_tty_chars_in_buffer(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned long flags;
+ int len = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ if (self->tx_skb)
+ len = self->tx_skb->len;
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+
+ return len;
+}
+
+static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
+{
+ unsigned long flags;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags))
+ return;
+
+ ircomm_tty_detach_cable(self);
+
+ spin_lock_irqsave(&self->spinlock, flags);
+
+ del_timer(&self->watchdog_timer);
+
+ /* Free parameter buffer */
+ if (self->ctrl_skb) {
+ dev_kfree_skb(self->ctrl_skb);
+ self->ctrl_skb = NULL;
+ }
+
+ /* Free transmit buffer */
+ if (self->tx_skb) {
+ dev_kfree_skb(self->tx_skb);
+ self->tx_skb = NULL;
+ }
+
+ if (self->ircomm) {
+ ircomm_close(self->ircomm);
+ self->ircomm = NULL;
+ }
+
+ spin_unlock_irqrestore(&self->spinlock, flags);
+}
+
+/*
+ * Function ircomm_tty_hangup (tty)
+ *
+ * This routine notifies the tty driver that it should hangup the tty
+ * device.
+ *
+ */
+static void ircomm_tty_hangup(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ struct tty_port *port = &self->port;
+ unsigned long flags;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /* ircomm_tty_flush_buffer(tty); */
+ ircomm_tty_shutdown(self);
+
+ spin_lock_irqsave(&port->lock, flags);
+ port->flags &= ~ASYNC_NORMAL_ACTIVE;
+ if (port->tty) {
+ set_bit(TTY_IO_ERROR, &port->tty->flags);
+ tty_kref_put(port->tty);
+ }
+ port->tty = NULL;
+ port->count = 0;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ wake_up_interruptible(&port->open_wait);
+}
+
+/*
+ * Function ircomm_tty_send_xchar (tty, ch)
+ *
+ * This routine is used to send a high-priority XON/XOFF character to
+ * the device.
+ */
+static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch)
+{
+ pr_debug("%s(), not impl\n", __func__);
+}
+
+/*
+ * Function ircomm_tty_start (tty)
+ *
+ * This routine notifies the tty driver that it resume sending
+ * characters to the tty device.
+ */
+void ircomm_tty_start(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ ircomm_flow_request(self->ircomm, FLOW_START);
+}
+
+/*
+ * Function ircomm_tty_stop (tty)
+ *
+ * This routine notifies the tty driver that it should stop outputting
+ * characters to the tty device.
+ */
+static void ircomm_tty_stop(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ ircomm_flow_request(self->ircomm, FLOW_STOP);
+}
+
+/*
+ * Function ircomm_check_modem_status (self)
+ *
+ * Check for any changes in the DCE's line settings. This function should
+ * be called whenever the dce parameter settings changes, to update the
+ * flow control settings and other things
+ */
+void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
+{
+ struct tty_struct *tty;
+ int status;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ tty = tty_port_tty_get(&self->port);
+
+ status = self->settings.dce;
+
+ if (status & IRCOMM_DCE_DELTA_ANY) {
+ /*wake_up_interruptible(&self->delta_msr_wait);*/
+ }
+ if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+ pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line,
+ (status & IRCOMM_CD) ? "on" : "off");
+
+ if (status & IRCOMM_CD) {
+ wake_up_interruptible(&self->port.open_wait);
+ } else {
+ pr_debug("%s(), Doing serial hangup..\n", __func__);
+ if (tty)
+ tty_hangup(tty);
+
+ /* Hangup will remote the tty, so better break out */
+ goto put;
+ }
+ }
+ if (tty && tty_port_cts_enabled(&self->port)) {
+ if (tty->hw_stopped) {
+ if (status & IRCOMM_CTS) {
+ pr_debug("%s(), CTS tx start...\n", __func__);
+ tty->hw_stopped = 0;
+
+ /* Wake up processes blocked on open */
+ wake_up_interruptible(&self->port.open_wait);
+
+ schedule_work(&self->tqueue);
+ goto put;
+ }
+ } else {
+ if (!(status & IRCOMM_CTS)) {
+ pr_debug("%s(), CTS tx stop...\n", __func__);
+ tty->hw_stopped = 1;
+ }
+ }
+ }
+put:
+ tty_kref_put(tty);
+}
+
+/*
+ * Function ircomm_tty_data_indication (instance, sap, skb)
+ *
+ * Handle incoming data, and deliver it to the line discipline
+ *
+ */
+static int ircomm_tty_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ struct tty_struct *tty;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ tty = tty_port_tty_get(&self->port);
+ if (!tty) {
+ pr_debug("%s(), no tty!\n", __func__);
+ return 0;
+ }
+
+ /*
+ * If we receive data when hardware is stopped then something is wrong.
+ * We try to poll the peers line settings to check if we are up todate.
+ * Devices like WinCE can do this, and since they don't send any
+ * params, we can just as well declare the hardware for running.
+ */
+ if (tty->hw_stopped && (self->flow == FLOW_START)) {
+ pr_debug("%s(), polling for line settings!\n", __func__);
+ ircomm_param_request(self, IRCOMM_POLL, TRUE);
+
+ /* We can just as well declare the hardware for running */
+ ircomm_tty_send_initial_parameters(self);
+ ircomm_tty_link_established(self);
+ }
+ tty_kref_put(tty);
+
+ /*
+ * Use flip buffer functions since the code may be called from interrupt
+ * context
+ */
+ tty_insert_flip_string(&self->port, skb->data, skb->len);
+ tty_flip_buffer_push(&self->port);
+
+ /* No need to kfree_skb - see ircomm_ttp_data_indication() */
+
+ return 0;
+}
+
+/*
+ * Function ircomm_tty_control_indication (instance, sap, skb)
+ *
+ * Parse all incoming parameters (easy!)
+ *
+ */
+static int ircomm_tty_control_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ int clen;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ clen = skb->data[0];
+
+ irda_param_extract_all(self, skb->data+1, IRDA_MIN(skb->len-1, clen),
+ &ircomm_param_info);
+
+ /* No need to kfree_skb - see ircomm_control_indication() */
+
+ return 0;
+}
+
+/*
+ * Function ircomm_tty_flow_indication (instance, sap, cmd)
+ *
+ * This function is called by IrTTP when it wants us to slow down the
+ * transmission of data. We just mark the hardware as stopped, and wait
+ * for IrTTP to notify us that things are OK again.
+ */
+static void ircomm_tty_flow_indication(void *instance, void *sap,
+ LOCAL_FLOW cmd)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ struct tty_struct *tty;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ tty = tty_port_tty_get(&self->port);
+
+ switch (cmd) {
+ case FLOW_START:
+ pr_debug("%s(), hw start!\n", __func__);
+ if (tty)
+ tty->hw_stopped = 0;
+
+ /* ircomm_tty_do_softint will take care of the rest */
+ schedule_work(&self->tqueue);
+ break;
+ default: /* If we get here, something is very wrong, better stop */
+ case FLOW_STOP:
+ pr_debug("%s(), hw stopped!\n", __func__);
+ if (tty)
+ tty->hw_stopped = 1;
+ break;
+ }
+
+ tty_kref_put(tty);
+ self->flow = cmd;
+}
+
+#ifdef CONFIG_PROC_FS
+static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
+{
+ struct tty_struct *tty;
+ char sep;
+
+ seq_printf(m, "State: %s\n", ircomm_tty_state[self->state]);
+
+ seq_puts(m, "Service type: ");
+ if (self->service_type & IRCOMM_9_WIRE)
+ seq_puts(m, "9_WIRE");
+ else if (self->service_type & IRCOMM_3_WIRE)
+ seq_puts(m, "3_WIRE");
+ else if (self->service_type & IRCOMM_3_WIRE_RAW)
+ seq_puts(m, "3_WIRE_RAW");
+ else
+ seq_puts(m, "No common service type!\n");
+ seq_putc(m, '\n');
+
+ seq_printf(m, "Port name: %s\n", self->settings.port_name);
+
+ seq_printf(m, "DTE status:");
+ sep = ' ';
+ if (self->settings.dte & IRCOMM_RTS) {
+ seq_printf(m, "%cRTS", sep);
+ sep = '|';
+ }
+ if (self->settings.dte & IRCOMM_DTR) {
+ seq_printf(m, "%cDTR", sep);
+ sep = '|';
+ }
+ seq_putc(m, '\n');
+
+ seq_puts(m, "DCE status:");
+ sep = ' ';
+ if (self->settings.dce & IRCOMM_CTS) {
+ seq_printf(m, "%cCTS", sep);
+ sep = '|';
+ }
+ if (self->settings.dce & IRCOMM_DSR) {
+ seq_printf(m, "%cDSR", sep);
+ sep = '|';
+ }
+ if (self->settings.dce & IRCOMM_CD) {
+ seq_printf(m, "%cCD", sep);
+ sep = '|';
+ }
+ if (self->settings.dce & IRCOMM_RI) {
+ seq_printf(m, "%cRI", sep);
+ sep = '|';
+ }
+ seq_putc(m, '\n');
+
+ seq_puts(m, "Configuration: ");
+ if (!self->settings.null_modem)
+ seq_puts(m, "DTE <-> DCE\n");
+ else
+ seq_puts(m, "DTE <-> DTE (null modem emulation)\n");
+
+ seq_printf(m, "Data rate: %d\n", self->settings.data_rate);
+
+ seq_puts(m, "Flow control:");
+ sep = ' ';
+ if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) {
+ seq_printf(m, "%cXON_XOFF_IN", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) {
+ seq_printf(m, "%cXON_XOFF_OUT", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) {
+ seq_printf(m, "%cRTS_CTS_IN", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) {
+ seq_printf(m, "%cRTS_CTS_OUT", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) {
+ seq_printf(m, "%cDSR_DTR_IN", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) {
+ seq_printf(m, "%cDSR_DTR_OUT", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) {
+ seq_printf(m, "%cENQ_ACK_IN", sep);
+ sep = '|';
+ }
+ if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) {
+ seq_printf(m, "%cENQ_ACK_OUT", sep);
+ sep = '|';
+ }
+ seq_putc(m, '\n');
+
+ seq_puts(m, "Flags:");
+ sep = ' ';
+ if (tty_port_cts_enabled(&self->port)) {
+ seq_printf(m, "%cASYNC_CTS_FLOW", sep);
+ sep = '|';
+ }
+ if (self->port.flags & ASYNC_CHECK_CD) {
+ seq_printf(m, "%cASYNC_CHECK_CD", sep);
+ sep = '|';
+ }
+ if (self->port.flags & ASYNC_INITIALIZED) {
+ seq_printf(m, "%cASYNC_INITIALIZED", sep);
+ sep = '|';
+ }
+ if (self->port.flags & ASYNC_LOW_LATENCY) {
+ seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
+ sep = '|';
+ }
+ if (self->port.flags & ASYNC_CLOSING) {
+ seq_printf(m, "%cASYNC_CLOSING", sep);
+ sep = '|';
+ }
+ if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
+ seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
+ sep = '|';
+ }
+ seq_putc(m, '\n');
+
+ seq_printf(m, "Role: %s\n", self->client ? "client" : "server");
+ seq_printf(m, "Open count: %d\n", self->port.count);
+ seq_printf(m, "Max data size: %d\n", self->max_data_size);
+ seq_printf(m, "Max header size: %d\n", self->max_header_size);
+
+ tty = tty_port_tty_get(&self->port);
+ if (tty) {
+ seq_printf(m, "Hardware: %s\n",
+ tty->hw_stopped ? "Stopped" : "Running");
+ tty_kref_put(tty);
+ }
+}
+
+static int ircomm_tty_proc_show(struct seq_file *m, void *v)
+{
+ struct ircomm_tty_cb *self;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ircomm_tty->hb_spinlock, flags);
+
+ self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty);
+ while (self != NULL) {
+ if (self->magic != IRCOMM_TTY_MAGIC)
+ break;
+
+ ircomm_tty_line_info(self, m);
+ self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty);
+ }
+ spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, flags);
+ return 0;
+}
+
+static int ircomm_tty_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ircomm_tty_proc_show, NULL);
+}
+
+static const struct file_operations ircomm_tty_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ircomm_tty_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
+MODULE_DESCRIPTION("IrCOMM serial TTY driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CHARDEV_MAJOR(IRCOMM_TTY_MAJOR);
+
+module_init(ircomm_tty_init);
+module_exit(ircomm_tty_cleanup);
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
new file mode 100644
index 000000000..61137f8b5
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -0,0 +1,987 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_tty_attach.c
+ * Version:
+ * Description: Code for attaching the serial driver to IrCOMM
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sat Jun 5 17:42:00 1999
+ * Modified at: Tue Jan 4 14:20:49 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/sched.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/parameters.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_event.h>
+
+#include <net/irda/ircomm_tty.h>
+#include <net/irda/ircomm_tty_attach.h>
+
+static void ircomm_tty_ias_register(struct ircomm_tty_cb *self);
+static void ircomm_tty_discovery_indication(discinfo_t *discovery,
+ DISCOVERY_MODE mode,
+ void *priv);
+static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id,
+ struct ias_value *value, void *priv);
+static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self,
+ int timeout);
+static void ircomm_tty_watchdog_timer_expired(void *data);
+
+static int ircomm_tty_state_idle(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+static int ircomm_tty_state_search(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+static int ircomm_tty_state_setup(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info);
+
+const char *const ircomm_tty_state[] = {
+ "IRCOMM_TTY_IDLE",
+ "IRCOMM_TTY_SEARCH",
+ "IRCOMM_TTY_QUERY_PARAMETERS",
+ "IRCOMM_TTY_QUERY_LSAP_SEL",
+ "IRCOMM_TTY_SETUP",
+ "IRCOMM_TTY_READY",
+ "*** ERROR *** ",
+};
+
+static const char *const ircomm_tty_event[] __maybe_unused = {
+ "IRCOMM_TTY_ATTACH_CABLE",
+ "IRCOMM_TTY_DETACH_CABLE",
+ "IRCOMM_TTY_DATA_REQUEST",
+ "IRCOMM_TTY_DATA_INDICATION",
+ "IRCOMM_TTY_DISCOVERY_REQUEST",
+ "IRCOMM_TTY_DISCOVERY_INDICATION",
+ "IRCOMM_TTY_CONNECT_CONFIRM",
+ "IRCOMM_TTY_CONNECT_INDICATION",
+ "IRCOMM_TTY_DISCONNECT_REQUEST",
+ "IRCOMM_TTY_DISCONNECT_INDICATION",
+ "IRCOMM_TTY_WD_TIMER_EXPIRED",
+ "IRCOMM_TTY_GOT_PARAMETERS",
+ "IRCOMM_TTY_GOT_LSAPSEL",
+ "*** ERROR ****",
+};
+
+static int (*state[])(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb, struct ircomm_tty_info *info) =
+{
+ ircomm_tty_state_idle,
+ ircomm_tty_state_search,
+ ircomm_tty_state_query_parameters,
+ ircomm_tty_state_query_lsap_sel,
+ ircomm_tty_state_setup,
+ ircomm_tty_state_ready,
+};
+
+/*
+ * Function ircomm_tty_attach_cable (driver)
+ *
+ * Try to attach cable (IrCOMM link). This function will only return
+ * when the link has been connected, or if an error condition occurs.
+ * If success, the return value is the resulting service type.
+ */
+int ircomm_tty_attach_cable(struct ircomm_tty_cb *self)
+{
+ struct tty_struct *tty;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ /* Check if somebody has already connected to us */
+ if (ircomm_is_connected(self->ircomm)) {
+ pr_debug("%s(), already connected!\n", __func__);
+ return 0;
+ }
+
+ /* Make sure nobody tries to write before the link is up */
+ tty = tty_port_tty_get(&self->port);
+ if (tty) {
+ tty->hw_stopped = 1;
+ tty_kref_put(tty);
+ }
+
+ ircomm_tty_ias_register(self);
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_ATTACH_CABLE, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_detach_cable (driver)
+ *
+ * Detach cable, or cable has been detached by peer
+ *
+ */
+void ircomm_tty_detach_cable(struct ircomm_tty_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ del_timer(&self->watchdog_timer);
+
+ /* Remove discovery handler */
+ if (self->ckey) {
+ irlmp_unregister_client(self->ckey);
+ self->ckey = NULL;
+ }
+ /* Remove IrCOMM hint bits */
+ if (self->skey) {
+ irlmp_unregister_service(self->skey);
+ self->skey = NULL;
+ }
+
+ if (self->iriap) {
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+ }
+
+ /* Remove LM-IAS object */
+ if (self->obj) {
+ irias_delete_object(self->obj);
+ self->obj = NULL;
+ }
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_DETACH_CABLE, NULL, NULL);
+
+ /* Reset some values */
+ self->daddr = self->saddr = 0;
+ self->dlsap_sel = self->slsap_sel = 0;
+
+ memset(&self->settings, 0, sizeof(struct ircomm_params));
+}
+
+/*
+ * Function ircomm_tty_ias_register (self)
+ *
+ * Register with LM-IAS depending on which service type we are
+ *
+ */
+static void ircomm_tty_ias_register(struct ircomm_tty_cb *self)
+{
+ __u8 oct_seq[6];
+ __u16 hints;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /* Compute hint bits based on service */
+ hints = irlmp_service_to_hint(S_COMM);
+ if (self->service_type & IRCOMM_3_WIRE_RAW)
+ hints |= irlmp_service_to_hint(S_PRINTER);
+
+ /* Advertise IrCOMM hint bit in discovery */
+ if (!self->skey)
+ self->skey = irlmp_register_service(hints);
+ /* Set up a discovery handler */
+ if (!self->ckey)
+ self->ckey = irlmp_register_client(hints,
+ ircomm_tty_discovery_indication,
+ NULL, (void *) self);
+
+ /* If already done, no need to do it again */
+ if (self->obj)
+ return;
+
+ if (self->service_type & IRCOMM_3_WIRE_RAW) {
+ /* Register IrLPT with LM-IAS */
+ self->obj = irias_new_object("IrLPT", IAS_IRLPT_ID);
+ irias_add_integer_attrib(self->obj, "IrDA:IrLMP:LsapSel",
+ self->slsap_sel, IAS_KERNEL_ATTR);
+ } else {
+ /* Register IrCOMM with LM-IAS */
+ self->obj = irias_new_object("IrDA:IrCOMM", IAS_IRCOMM_ID);
+ irias_add_integer_attrib(self->obj, "IrDA:TinyTP:LsapSel",
+ self->slsap_sel, IAS_KERNEL_ATTR);
+
+ /* Code the parameters into the buffer */
+ irda_param_pack(oct_seq, "bbbbbb",
+ IRCOMM_SERVICE_TYPE, 1, self->service_type,
+ IRCOMM_PORT_TYPE, 1, IRCOMM_SERIAL);
+
+ /* Register parameters with LM-IAS */
+ irias_add_octseq_attrib(self->obj, "Parameters", oct_seq, 6,
+ IAS_KERNEL_ATTR);
+ }
+ irias_insert_object(self->obj);
+}
+
+/*
+ * Function ircomm_tty_ias_unregister (self)
+ *
+ * Remove our IAS object and client hook while connected.
+ *
+ */
+static void ircomm_tty_ias_unregister(struct ircomm_tty_cb *self)
+{
+ /* Remove LM-IAS object now so it is not reused.
+ * IrCOMM deals very poorly with multiple incoming connections.
+ * It should looks a lot more like IrNET, and "dup" a server TSAP
+ * to the application TSAP (based on various rules).
+ * This is a cheap workaround allowing multiple clients to
+ * connect to us. It will not always work.
+ * Each IrCOMM socket has an IAS entry. Incoming connection will
+ * pick the first one found. So, when we are fully connected,
+ * we remove our IAS entries so that the next IAS entry is used.
+ * We do that for *both* client and server, because a server
+ * can also create client instances.
+ * Jean II */
+ if (self->obj) {
+ irias_delete_object(self->obj);
+ self->obj = NULL;
+ }
+
+#if 0
+ /* Remove discovery handler.
+ * While we are connected, we no longer need to receive
+ * discovery events. This would be the case if there is
+ * multiple IrLAP interfaces. Jean II */
+ if (self->ckey) {
+ irlmp_unregister_client(self->ckey);
+ self->ckey = NULL;
+ }
+#endif
+}
+
+/*
+ * Function ircomm_send_initial_parameters (self)
+ *
+ * Send initial parameters to the remote IrCOMM device. These parameters
+ * must be sent before any data.
+ */
+int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (self->service_type & IRCOMM_3_WIRE_RAW)
+ return 0;
+
+ /*
+ * Set default values, but only if the application for some reason
+ * haven't set them already
+ */
+ pr_debug("%s(), data-rate = %d\n", __func__ ,
+ self->settings.data_rate);
+ if (!self->settings.data_rate)
+ self->settings.data_rate = 9600;
+ pr_debug("%s(), data-format = %d\n", __func__ ,
+ self->settings.data_format);
+ if (!self->settings.data_format)
+ self->settings.data_format = IRCOMM_WSIZE_8; /* 8N1 */
+
+ pr_debug("%s(), flow-control = %d\n", __func__ ,
+ self->settings.flow_control);
+ /*self->settings.flow_control = IRCOMM_RTS_CTS_IN|IRCOMM_RTS_CTS_OUT;*/
+
+ /* Do not set delta values for the initial parameters */
+ self->settings.dte = IRCOMM_DTR | IRCOMM_RTS;
+
+ /* Only send service type parameter when we are the client */
+ if (self->client)
+ ircomm_param_request(self, IRCOMM_SERVICE_TYPE, FALSE);
+ ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
+ ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE);
+
+ /* For a 3 wire service, we just flush the last parameter and return */
+ if (self->settings.service_type == IRCOMM_3_WIRE) {
+ ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE);
+ return 0;
+ }
+
+ /* Only 9-wire service types continue here */
+ ircomm_param_request(self, IRCOMM_FLOW_CONTROL, FALSE);
+#if 0
+ ircomm_param_request(self, IRCOMM_XON_XOFF, FALSE);
+ ircomm_param_request(self, IRCOMM_ENQ_ACK, FALSE);
+#endif
+ /* Notify peer that we are ready to receive data */
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+
+ return 0;
+}
+
+/*
+ * Function ircomm_tty_discovery_indication (discovery)
+ *
+ * Remote device is discovered, try query the remote IAS to see which
+ * device it is, and which services it has.
+ *
+ */
+static void ircomm_tty_discovery_indication(discinfo_t *discovery,
+ DISCOVERY_MODE mode,
+ void *priv)
+{
+ struct ircomm_tty_cb *self;
+ struct ircomm_tty_info info;
+
+ /* Important note :
+ * We need to drop all passive discoveries.
+ * The LSAP management of IrComm is deficient and doesn't deal
+ * with the case of two instance connecting to each other
+ * simultaneously (it will deadlock in LMP).
+ * The proper fix would be to use the same technique as in IrNET,
+ * to have one server socket and separate instances for the
+ * connecting/connected socket.
+ * The workaround is to drop passive discovery, which drastically
+ * reduce the probability of this happening.
+ * Jean II */
+ if(mode == DISCOVERY_PASSIVE)
+ return;
+
+ info.daddr = discovery->daddr;
+ info.saddr = discovery->saddr;
+
+ self = priv;
+ ircomm_tty_do_event(self, IRCOMM_TTY_DISCOVERY_INDICATION,
+ NULL, &info);
+}
+
+/*
+ * Function ircomm_tty_disconnect_indication (instance, sap, reason, skb)
+ *
+ * Link disconnected
+ *
+ */
+void ircomm_tty_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *skb)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ struct tty_struct *tty;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ tty = tty_port_tty_get(&self->port);
+ if (!tty)
+ return;
+
+ /* This will stop control data transfers */
+ self->flow = FLOW_STOP;
+
+ /* Stop data transfers */
+ tty->hw_stopped = 1;
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_DISCONNECT_INDICATION, NULL,
+ NULL);
+ tty_kref_put(tty);
+}
+
+/*
+ * Function ircomm_tty_getvalue_confirm (result, obj_id, value, priv)
+ *
+ * Got result from the IAS query we make
+ *
+ */
+static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id,
+ struct ias_value *value,
+ void *priv)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) priv;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ /* We probably don't need to make any more queries */
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+
+ /* Check if request succeeded */
+ if (result != IAS_SUCCESS) {
+ pr_debug("%s(), got NULL value!\n", __func__);
+ return;
+ }
+
+ switch (value->type) {
+ case IAS_OCT_SEQ:
+ pr_debug("%s(), got octet sequence\n", __func__);
+
+ irda_param_extract_all(self, value->t.oct_seq, value->len,
+ &ircomm_param_info);
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_GOT_PARAMETERS, NULL,
+ NULL);
+ break;
+ case IAS_INTEGER:
+ /* Got LSAP selector */
+ pr_debug("%s(), got lsapsel = %d\n", __func__ ,
+ value->t.integer);
+
+ if (value->t.integer == -1) {
+ pr_debug("%s(), invalid value!\n", __func__);
+ } else
+ self->dlsap_sel = value->t.integer;
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_GOT_LSAPSEL, NULL, NULL);
+ break;
+ case IAS_MISSING:
+ pr_debug("%s(), got IAS_MISSING\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), got unknown type!\n", __func__);
+ break;
+ }
+ irias_delete_value(value);
+}
+
+/*
+ * Function ircomm_tty_connect_confirm (instance, sap, qos, max_sdu_size, skb)
+ *
+ * Connection confirmed
+ *
+ */
+void ircomm_tty_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_data_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ self->client = TRUE;
+ self->max_data_size = max_data_size;
+ self->max_header_size = max_header_size;
+ self->flow = FLOW_START;
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_CONFIRM, NULL, NULL);
+
+ /* No need to kfree_skb - see ircomm_ttp_connect_confirm() */
+}
+
+/*
+ * Function ircomm_tty_connect_indication (instance, sap, qos, max_sdu_size,
+ * skb)
+ *
+ * we are discovered and being requested to connect by remote device !
+ *
+ */
+void ircomm_tty_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_data_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+ int clen;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ self->client = FALSE;
+ self->max_data_size = max_data_size;
+ self->max_header_size = max_header_size;
+ self->flow = FLOW_START;
+
+ clen = skb->data[0];
+ if (clen)
+ irda_param_extract_all(self, skb->data+1,
+ IRDA_MIN(skb->len, clen),
+ &ircomm_param_info);
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_INDICATION, NULL, NULL);
+
+ /* No need to kfree_skb - see ircomm_ttp_connect_indication() */
+}
+
+/*
+ * Function ircomm_tty_link_established (self)
+ *
+ * Called when the IrCOMM link is established
+ *
+ */
+void ircomm_tty_link_established(struct ircomm_tty_cb *self)
+{
+ struct tty_struct *tty;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ tty = tty_port_tty_get(&self->port);
+ if (!tty)
+ return;
+
+ del_timer(&self->watchdog_timer);
+
+ /*
+ * IrCOMM link is now up, and if we are not using hardware
+ * flow-control, then declare the hardware as running. Otherwise we
+ * will have to wait for the peer device (DCE) to raise the CTS
+ * line.
+ */
+ if (tty_port_cts_enabled(&self->port) &&
+ ((self->settings.dce & IRCOMM_CTS) == 0)) {
+ pr_debug("%s(), waiting for CTS ...\n", __func__);
+ goto put;
+ } else {
+ pr_debug("%s(), starting hardware!\n", __func__);
+
+ tty->hw_stopped = 0;
+
+ /* Wake up processes blocked on open */
+ wake_up_interruptible(&self->port.open_wait);
+ }
+
+ schedule_work(&self->tqueue);
+put:
+ tty_kref_put(tty);
+}
+
+/*
+ * Function ircomm_tty_start_watchdog_timer (self, timeout)
+ *
+ * Start the watchdog timer. This timer is used to make sure that any
+ * connection attempt is successful, and if not, we will retry after
+ * the timeout
+ */
+static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self,
+ int timeout)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ irda_start_timer(&self->watchdog_timer, timeout, (void *) self,
+ ircomm_tty_watchdog_timer_expired);
+}
+
+/*
+ * Function ircomm_tty_watchdog_timer_expired (data)
+ *
+ * Called when the connect procedure have taken to much time.
+ *
+ */
+static void ircomm_tty_watchdog_timer_expired(void *data)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ ircomm_tty_do_event(self, IRCOMM_TTY_WD_TIMER_EXPIRED, NULL, NULL);
+}
+
+
+/*
+ * Function ircomm_tty_do_event (self, event, skb)
+ *
+ * Process event
+ *
+ */
+int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb, struct ircomm_tty_info *info)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+ return (*state[self->state])(self, event, skb, info);
+}
+
+/*
+ * Function ircomm_tty_next_state (self, state)
+ *
+ * Switch state
+ *
+ */
+static inline void ircomm_tty_next_state(struct ircomm_tty_cb *self, IRCOMM_TTY_STATE state)
+{
+ /*
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+ pr_debug("%s: next state=%s, service type=%d\n", __func__ ,
+ ircomm_tty_state[self->state], self->service_type);
+ */
+ self->state = state;
+}
+
+/*
+ * Function ircomm_tty_state_idle (self, event, skb, info)
+ *
+ * Just hanging around
+ *
+ */
+static int ircomm_tty_state_idle(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+ switch (event) {
+ case IRCOMM_TTY_ATTACH_CABLE:
+ /* Try to discover any remote devices */
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+
+ irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+ break;
+ case IRCOMM_TTY_DISCOVERY_INDICATION:
+ self->daddr = info->daddr;
+ self->saddr = info->saddr;
+
+ if (self->iriap) {
+ net_warn_ratelimited("%s(), busy with a previous query\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ ircomm_tty_getvalue_confirm);
+
+ iriap_getvaluebyclass_request(self->iriap,
+ self->saddr, self->daddr,
+ "IrDA:IrCOMM", "Parameters");
+
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS);
+ break;
+ case IRCOMM_TTY_CONNECT_INDICATION:
+ del_timer(&self->watchdog_timer);
+
+ /* Accept connection */
+ ircomm_connect_response(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_WD_TIMER_EXPIRED:
+ /* Just stay idle */
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_state_search (self, event, skb, info)
+ *
+ * Trying to discover an IrCOMM device
+ *
+ */
+static int ircomm_tty_state_search(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+ switch (event) {
+ case IRCOMM_TTY_DISCOVERY_INDICATION:
+ self->daddr = info->daddr;
+ self->saddr = info->saddr;
+
+ if (self->iriap) {
+ net_warn_ratelimited("%s(), busy with a previous query\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ ircomm_tty_getvalue_confirm);
+
+ if (self->service_type == IRCOMM_3_WIRE_RAW) {
+ iriap_getvaluebyclass_request(self->iriap, self->saddr,
+ self->daddr, "IrLPT",
+ "IrDA:IrLMP:LsapSel");
+ ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL);
+ } else {
+ iriap_getvaluebyclass_request(self->iriap, self->saddr,
+ self->daddr,
+ "IrDA:IrCOMM",
+ "Parameters");
+
+ ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS);
+ }
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ break;
+ case IRCOMM_TTY_CONNECT_INDICATION:
+ del_timer(&self->watchdog_timer);
+ ircomm_tty_ias_unregister(self);
+
+ /* Accept connection */
+ ircomm_connect_response(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_WD_TIMER_EXPIRED:
+#if 1
+ /* Give up */
+#else
+ /* Try to discover any remote devices */
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+#endif
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_state_query (self, event, skb, info)
+ *
+ * Querying the remote LM-IAS for IrCOMM parameters
+ *
+ */
+static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+ switch (event) {
+ case IRCOMM_TTY_GOT_PARAMETERS:
+ if (self->iriap) {
+ net_warn_ratelimited("%s(), busy with a previous query\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ ircomm_tty_getvalue_confirm);
+
+ iriap_getvaluebyclass_request(self->iriap, self->saddr,
+ self->daddr, "IrDA:IrCOMM",
+ "IrDA:TinyTP:LsapSel");
+
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL);
+ break;
+ case IRCOMM_TTY_WD_TIMER_EXPIRED:
+ /* Go back to search mode */
+ ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ break;
+ case IRCOMM_TTY_CONNECT_INDICATION:
+ del_timer(&self->watchdog_timer);
+ ircomm_tty_ias_unregister(self);
+
+ /* Accept connection */
+ ircomm_connect_response(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_state_query_lsap_sel (self, event, skb, info)
+ *
+ * Query remote LM-IAS for the LSAP selector which we can connect to
+ *
+ */
+static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+ switch (event) {
+ case IRCOMM_TTY_GOT_LSAPSEL:
+ /* Connect to remote device */
+ ret = ircomm_connect_request(self->ircomm, self->dlsap_sel,
+ self->saddr, self->daddr,
+ NULL, self->service_type);
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ ircomm_tty_next_state(self, IRCOMM_TTY_SETUP);
+ break;
+ case IRCOMM_TTY_WD_TIMER_EXPIRED:
+ /* Go back to search mode */
+ ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ break;
+ case IRCOMM_TTY_CONNECT_INDICATION:
+ del_timer(&self->watchdog_timer);
+ ircomm_tty_ias_unregister(self);
+
+ /* Accept connection */
+ ircomm_connect_response(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_state_setup (self, event, skb, info)
+ *
+ * Trying to connect
+ *
+ */
+static int ircomm_tty_state_setup(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s: state=%s, event=%s\n", __func__ ,
+ ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+ switch (event) {
+ case IRCOMM_TTY_CONNECT_CONFIRM:
+ del_timer(&self->watchdog_timer);
+ ircomm_tty_ias_unregister(self);
+
+ /*
+ * Send initial parameters. This will also send out queued
+ * parameters waiting for the connection to come up
+ */
+ ircomm_tty_send_initial_parameters(self);
+ ircomm_tty_link_established(self);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_CONNECT_INDICATION:
+ del_timer(&self->watchdog_timer);
+ ircomm_tty_ias_unregister(self);
+
+ /* Accept connection */
+ ircomm_connect_response(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+ break;
+ case IRCOMM_TTY_WD_TIMER_EXPIRED:
+ /* Go back to search mode */
+ ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ /* ircomm_disconnect_request(self->ircomm, NULL); */
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Function ircomm_tty_state_ready (self, event, skb, info)
+ *
+ * IrCOMM is now connected
+ *
+ */
+static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
+ IRCOMM_TTY_EVENT event,
+ struct sk_buff *skb,
+ struct ircomm_tty_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case IRCOMM_TTY_DATA_REQUEST:
+ ret = ircomm_data_request(self->ircomm, skb);
+ break;
+ case IRCOMM_TTY_DETACH_CABLE:
+ ircomm_disconnect_request(self->ircomm, NULL);
+ ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+ break;
+ case IRCOMM_TTY_DISCONNECT_INDICATION:
+ ircomm_tty_ias_register(self);
+ ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+ ircomm_tty_start_watchdog_timer(self, 3*HZ);
+
+ if (self->port.flags & ASYNC_CHECK_CD) {
+ /* Drop carrier */
+ self->settings.dce = IRCOMM_DELTA_CD;
+ ircomm_tty_check_modem_status(self);
+ } else {
+ pr_debug("%s(), hanging up!\n", __func__);
+ tty_port_tty_hangup(&self->port, false);
+ }
+ break;
+ default:
+ pr_debug("%s(), unknown event: %s\n", __func__ ,
+ ircomm_tty_event[event]);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
new file mode 100644
index 000000000..75ccdbd07
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -0,0 +1,413 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_tty_ioctl.c
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Thu Jun 10 14:39:09 1999
+ * Modified at: Wed Jan 5 14:45:43 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/termios.h>
+#include <linux/tty.h>
+#include <linux/serial.h>
+
+#include <asm/uaccess.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+
+#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
+
+/*
+ * Function ircomm_tty_change_speed (driver)
+ *
+ * Change speed of the driver. If the remote device is a DCE, then this
+ * should make it change the speed of its serial port
+ */
+static void ircomm_tty_change_speed(struct ircomm_tty_cb *self,
+ struct tty_struct *tty)
+{
+ unsigned int cflag, cval;
+ int baud;
+
+ if (!self->ircomm)
+ return;
+
+ cflag = tty->termios.c_cflag;
+
+ /* byte size and parity */
+ switch (cflag & CSIZE) {
+ case CS5: cval = IRCOMM_WSIZE_5; break;
+ case CS6: cval = IRCOMM_WSIZE_6; break;
+ case CS7: cval = IRCOMM_WSIZE_7; break;
+ case CS8: cval = IRCOMM_WSIZE_8; break;
+ default: cval = IRCOMM_WSIZE_5; break;
+ }
+ if (cflag & CSTOPB)
+ cval |= IRCOMM_2_STOP_BIT;
+
+ if (cflag & PARENB)
+ cval |= IRCOMM_PARITY_ENABLE;
+ if (!(cflag & PARODD))
+ cval |= IRCOMM_PARITY_EVEN;
+
+ /* Determine divisor based on baud rate */
+ baud = tty_get_baud_rate(tty);
+ if (!baud)
+ baud = 9600; /* B0 transition handled in rs_set_termios */
+
+ self->settings.data_rate = baud;
+ ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
+
+ /* CTS flow control flag and modem status interrupts */
+ if (cflag & CRTSCTS) {
+ self->port.flags |= ASYNC_CTS_FLOW;
+ self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
+ /* This got me. Bummer. Jean II */
+ if (self->service_type == IRCOMM_3_WIRE_RAW)
+ net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n",
+ __func__);
+ } else {
+ self->port.flags &= ~ASYNC_CTS_FLOW;
+ self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
+ }
+ if (cflag & CLOCAL)
+ self->port.flags &= ~ASYNC_CHECK_CD;
+ else
+ self->port.flags |= ASYNC_CHECK_CD;
+#if 0
+ /*
+ * Set up parity check flag
+ */
+
+ if (I_INPCK(self->tty))
+ driver->read_status_mask |= LSR_FE | LSR_PE;
+ if (I_BRKINT(driver->tty) || I_PARMRK(driver->tty))
+ driver->read_status_mask |= LSR_BI;
+
+ /*
+ * Characters to ignore
+ */
+ driver->ignore_status_mask = 0;
+ if (I_IGNPAR(driver->tty))
+ driver->ignore_status_mask |= LSR_PE | LSR_FE;
+
+ if (I_IGNBRK(self->tty)) {
+ self->ignore_status_mask |= LSR_BI;
+ /*
+ * If we're ignore parity and break indicators, ignore
+ * overruns too. (For real raw support).
+ */
+ if (I_IGNPAR(self->tty))
+ self->ignore_status_mask |= LSR_OE;
+ }
+#endif
+ self->settings.data_format = cval;
+
+ ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE);
+ ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE);
+}
+
+/*
+ * Function ircomm_tty_set_termios (tty, old_termios)
+ *
+ * This routine allows the tty driver to be notified when device's
+ * termios settings have changed. Note that a well-designed tty driver
+ * should be prepared to accept the case where old == NULL, and try to
+ * do something rational.
+ */
+void ircomm_tty_set_termios(struct tty_struct *tty,
+ struct ktermios *old_termios)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned int cflag = tty->termios.c_cflag;
+
+ if ((cflag == old_termios->c_cflag) &&
+ (RELEVANT_IFLAG(tty->termios.c_iflag) ==
+ RELEVANT_IFLAG(old_termios->c_iflag)))
+ {
+ return;
+ }
+
+ ircomm_tty_change_speed(self, tty);
+
+ /* Handle transition to B0 status */
+ if ((old_termios->c_cflag & CBAUD) &&
+ !(cflag & CBAUD)) {
+ self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS);
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+ }
+
+ /* Handle transition away from B0 status */
+ if (!(old_termios->c_cflag & CBAUD) &&
+ (cflag & CBAUD)) {
+ self->settings.dte |= IRCOMM_DTR;
+ if (!(tty->termios.c_cflag & CRTSCTS) ||
+ !test_bit(TTY_THROTTLED, &tty->flags)) {
+ self->settings.dte |= IRCOMM_RTS;
+ }
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+ }
+
+ /* Handle turning off CRTSCTS */
+ if ((old_termios->c_cflag & CRTSCTS) &&
+ !(tty->termios.c_cflag & CRTSCTS))
+ {
+ tty->hw_stopped = 0;
+ ircomm_tty_start(tty);
+ }
+}
+
+/*
+ * Function ircomm_tty_tiocmget (tty)
+ *
+ *
+ *
+ */
+int ircomm_tty_tiocmget(struct tty_struct *tty)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ unsigned int result;
+
+ if (tty->flags & (1 << TTY_IO_ERROR))
+ return -EIO;
+
+ result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0)
+ | ((self->settings.dte & IRCOMM_DTR) ? TIOCM_DTR : 0)
+ | ((self->settings.dce & IRCOMM_CD) ? TIOCM_CAR : 0)
+ | ((self->settings.dce & IRCOMM_RI) ? TIOCM_RNG : 0)
+ | ((self->settings.dce & IRCOMM_DSR) ? TIOCM_DSR : 0)
+ | ((self->settings.dce & IRCOMM_CTS) ? TIOCM_CTS : 0);
+ return result;
+}
+
+/*
+ * Function ircomm_tty_tiocmset (tty, set, clear)
+ *
+ *
+ *
+ */
+int ircomm_tty_tiocmset(struct tty_struct *tty,
+ unsigned int set, unsigned int clear)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+ if (tty->flags & (1 << TTY_IO_ERROR))
+ return -EIO;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+ if (set & TIOCM_RTS)
+ self->settings.dte |= IRCOMM_RTS;
+ if (set & TIOCM_DTR)
+ self->settings.dte |= IRCOMM_DTR;
+
+ if (clear & TIOCM_RTS)
+ self->settings.dte &= ~IRCOMM_RTS;
+ if (clear & TIOCM_DTR)
+ self->settings.dte &= ~IRCOMM_DTR;
+
+ if ((set|clear) & TIOCM_RTS)
+ self->settings.dte |= IRCOMM_DELTA_RTS;
+ if ((set|clear) & TIOCM_DTR)
+ self->settings.dte |= IRCOMM_DELTA_DTR;
+
+ ircomm_param_request(self, IRCOMM_DTE, TRUE);
+
+ return 0;
+}
+
+/*
+ * Function get_serial_info (driver, retinfo)
+ *
+ *
+ *
+ */
+static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self,
+ struct serial_struct __user *retinfo)
+{
+ struct serial_struct info;
+
+ if (!retinfo)
+ return -EFAULT;
+
+ memset(&info, 0, sizeof(info));
+ info.line = self->line;
+ info.flags = self->port.flags;
+ info.baud_base = self->settings.data_rate;
+ info.close_delay = self->port.close_delay;
+ info.closing_wait = self->port.closing_wait;
+
+ /* For compatibility */
+ info.type = PORT_16550A;
+ info.port = 0;
+ info.irq = 0;
+ info.xmit_fifo_size = 0;
+ info.hub6 = 0;
+ info.custom_divisor = 0;
+
+ if (copy_to_user(retinfo, &info, sizeof(*retinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Function set_serial_info (driver, new_info)
+ *
+ *
+ *
+ */
+static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
+ struct serial_struct __user *new_info)
+{
+#if 0
+ struct serial_struct new_serial;
+ struct ircomm_tty_cb old_state, *state;
+
+ if (copy_from_user(&new_serial,new_info,sizeof(new_serial)))
+ return -EFAULT;
+
+
+ state = self
+ old_state = *self;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ if ((new_serial.baud_base != state->settings.data_rate) ||
+ (new_serial.close_delay != state->close_delay) ||
+ ((new_serial.flags & ~ASYNC_USR_MASK) !=
+ (self->flags & ~ASYNC_USR_MASK)))
+ return -EPERM;
+ state->flags = ((state->flags & ~ASYNC_USR_MASK) |
+ (new_serial.flags & ASYNC_USR_MASK));
+ self->flags = ((self->flags & ~ASYNC_USR_MASK) |
+ (new_serial.flags & ASYNC_USR_MASK));
+ /* self->custom_divisor = new_serial.custom_divisor; */
+ goto check_and_exit;
+ }
+
+ /*
+ * OK, past this point, all the error checking has been done.
+ * At this point, we start making changes.....
+ */
+
+ if (self->settings.data_rate != new_serial.baud_base) {
+ self->settings.data_rate = new_serial.baud_base;
+ ircomm_param_request(self, IRCOMM_DATA_RATE, TRUE);
+ }
+
+ self->close_delay = new_serial.close_delay * HZ/100;
+ self->closing_wait = new_serial.closing_wait * HZ/100;
+ /* self->custom_divisor = new_serial.custom_divisor; */
+
+ self->flags = ((self->flags & ~ASYNC_FLAGS) |
+ (new_serial.flags & ASYNC_FLAGS));
+ self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+ check_and_exit:
+
+ if (self->flags & ASYNC_INITIALIZED) {
+ if (((old_state.flags & ASYNC_SPD_MASK) !=
+ (self->flags & ASYNC_SPD_MASK)) ||
+ (old_driver.custom_divisor != driver->custom_divisor)) {
+ if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI)
+ driver->tty->alt_speed = 57600;
+ if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI)
+ driver->tty->alt_speed = 115200;
+ if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI)
+ driver->tty->alt_speed = 230400;
+ if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP)
+ driver->tty->alt_speed = 460800;
+ ircomm_tty_change_speed(driver);
+ }
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Function ircomm_tty_ioctl (tty, cmd, arg)
+ *
+ *
+ *
+ */
+int ircomm_tty_ioctl(struct tty_struct *tty,
+ unsigned int cmd, unsigned long arg)
+{
+ struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+ int ret = 0;
+
+ if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
+ (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
+ (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
+ if (tty->flags & (1 << TTY_IO_ERROR))
+ return -EIO;
+ }
+
+ switch (cmd) {
+ case TIOCGSERIAL:
+ ret = ircomm_tty_get_serial_info(self, (struct serial_struct __user *) arg);
+ break;
+ case TIOCSSERIAL:
+ ret = ircomm_tty_set_serial_info(self, (struct serial_struct __user *) arg);
+ break;
+ case TIOCMIWAIT:
+ pr_debug("(), TIOCMIWAIT, not impl!\n");
+ break;
+
+ case TIOCGICOUNT:
+ pr_debug("%s(), TIOCGICOUNT not impl!\n", __func__);
+#if 0
+ save_flags(flags); cli();
+ cnow = driver->icount;
+ restore_flags(flags);
+ p_cuser = (struct serial_icounter_struct __user *) arg;
+ if (put_user(cnow.cts, &p_cuser->cts) ||
+ put_user(cnow.dsr, &p_cuser->dsr) ||
+ put_user(cnow.rng, &p_cuser->rng) ||
+ put_user(cnow.dcd, &p_cuser->dcd) ||
+ put_user(cnow.rx, &p_cuser->rx) ||
+ put_user(cnow.tx, &p_cuser->tx) ||
+ put_user(cnow.frame, &p_cuser->frame) ||
+ put_user(cnow.overrun, &p_cuser->overrun) ||
+ put_user(cnow.parity, &p_cuser->parity) ||
+ put_user(cnow.brk, &p_cuser->brk) ||
+ put_user(cnow.buf_overrun, &p_cuser->buf_overrun))
+ return -EFAULT;
+#endif
+ return 0;
+ default:
+ ret = -ENOIOCTLCMD; /* ioctls which we must ignore */
+ }
+ return ret;
+}
+
+
+
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
new file mode 100644
index 000000000..856736656
--- /dev/null
+++ b/net/irda/irda_device.c
@@ -0,0 +1,316 @@
+/*********************************************************************
+ *
+ * Filename: irda_device.c
+ * Version: 0.9
+ * Description: Utility functions used by the device drivers
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sat Oct 9 09:22:27 1999
+ * Modified at: Sun Jan 23 17:41:24 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/kmod.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <asm/ioctls.h>
+#include <asm/uaccess.h>
+#include <asm/dma.h>
+#include <asm/io.h>
+
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/wrapper.h>
+
+static void __irda_task_delete(struct irda_task *task);
+
+static hashbin_t *dongles = NULL;
+static hashbin_t *tasks = NULL;
+
+static void irda_task_timer_expired(void *data);
+
+int __init irda_device_init( void)
+{
+ dongles = hashbin_new(HB_NOLOCK);
+ if (dongles == NULL) {
+ net_warn_ratelimited("IrDA: Can't allocate dongles hashbin!\n");
+ return -ENOMEM;
+ }
+ spin_lock_init(&dongles->hb_spinlock);
+
+ tasks = hashbin_new(HB_LOCK);
+ if (tasks == NULL) {
+ net_warn_ratelimited("IrDA: Can't allocate tasks hashbin!\n");
+ hashbin_delete(dongles, NULL);
+ return -ENOMEM;
+ }
+
+ /* We no longer initialise the driver ourselves here, we let
+ * the system do it for us... - Jean II */
+
+ return 0;
+}
+
+static void leftover_dongle(void *arg)
+{
+ struct dongle_reg *reg = arg;
+ net_warn_ratelimited("IrDA: Dongle type %x not unregistered\n",
+ reg->type);
+}
+
+void irda_device_cleanup(void)
+{
+ hashbin_delete(tasks, (FREE_FUNC) __irda_task_delete);
+
+ hashbin_delete(dongles, leftover_dongle);
+}
+
+/*
+ * Function irda_device_set_media_busy (self, status)
+ *
+ * Called when we have detected that another station is transmitting
+ * in contention mode.
+ */
+void irda_device_set_media_busy(struct net_device *dev, int status)
+{
+ struct irlap_cb *self;
+
+ pr_debug("%s(%s)\n", __func__, status ? "TRUE" : "FALSE");
+
+ self = (struct irlap_cb *) dev->atalk_ptr;
+
+ /* Some drivers may enable the receive interrupt before calling
+ * irlap_open(), or they may disable the receive interrupt
+ * after calling irlap_close().
+ * The IrDA stack is protected from this in irlap_driver_rcv().
+ * However, the driver calls directly the wrapper, that calls
+ * us directly. Make sure we protect ourselves.
+ * Jean II */
+ if (!self || self->magic != LAP_MAGIC)
+ return;
+
+ if (status) {
+ self->media_busy = TRUE;
+ if (status == SMALL)
+ irlap_start_mbusy_timer(self, SMALLBUSY_TIMEOUT);
+ else
+ irlap_start_mbusy_timer(self, MEDIABUSY_TIMEOUT);
+ pr_debug("Media busy!\n");
+ } else {
+ self->media_busy = FALSE;
+ irlap_stop_mbusy_timer(self);
+ }
+}
+EXPORT_SYMBOL(irda_device_set_media_busy);
+
+
+/*
+ * Function irda_device_is_receiving (dev)
+ *
+ * Check if the device driver is currently receiving data
+ *
+ */
+int irda_device_is_receiving(struct net_device *dev)
+{
+ struct if_irda_req req;
+ int ret;
+
+ if (!dev->netdev_ops->ndo_do_ioctl) {
+ net_err_ratelimited("%s: do_ioctl not impl. by device driver\n",
+ __func__);
+ return -1;
+ }
+
+ ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req,
+ SIOCGRECEIVING);
+ if (ret < 0)
+ return ret;
+
+ return req.ifr_receiving;
+}
+
+static void __irda_task_delete(struct irda_task *task)
+{
+ del_timer(&task->timer);
+
+ kfree(task);
+}
+
+static void irda_task_delete(struct irda_task *task)
+{
+ /* Unregister task */
+ hashbin_remove(tasks, (long) task, NULL);
+
+ __irda_task_delete(task);
+}
+
+/*
+ * Function irda_task_kick (task)
+ *
+ * Tries to execute a task possible multiple times until the task is either
+ * finished, or askes for a timeout. When a task is finished, we do post
+ * processing, and notify the parent task, that is waiting for this task
+ * to complete.
+ */
+static int irda_task_kick(struct irda_task *task)
+{
+ int finished = TRUE;
+ int count = 0;
+ int timeout;
+
+ IRDA_ASSERT(task != NULL, return -1;);
+ IRDA_ASSERT(task->magic == IRDA_TASK_MAGIC, return -1;);
+
+ /* Execute task until it's finished, or askes for a timeout */
+ do {
+ timeout = task->function(task);
+ if (count++ > 100) {
+ net_err_ratelimited("%s: error in task handler!\n",
+ __func__);
+ irda_task_delete(task);
+ return TRUE;
+ }
+ } while ((timeout == 0) && (task->state != IRDA_TASK_DONE));
+
+ if (timeout < 0) {
+ net_err_ratelimited("%s: Error executing task!\n", __func__);
+ irda_task_delete(task);
+ return TRUE;
+ }
+
+ /* Check if we are finished */
+ if (task->state == IRDA_TASK_DONE) {
+ del_timer(&task->timer);
+
+ /* Do post processing */
+ if (task->finished)
+ task->finished(task);
+
+ /* Notify parent */
+ if (task->parent) {
+ /* Check if parent is waiting for us to complete */
+ if (task->parent->state == IRDA_TASK_CHILD_WAIT) {
+ task->parent->state = IRDA_TASK_CHILD_DONE;
+
+ /* Stop timer now that we are here */
+ del_timer(&task->parent->timer);
+
+ /* Kick parent task */
+ irda_task_kick(task->parent);
+ }
+ }
+ irda_task_delete(task);
+ } else if (timeout > 0) {
+ irda_start_timer(&task->timer, timeout, (void *) task,
+ irda_task_timer_expired);
+ finished = FALSE;
+ } else {
+ pr_debug("%s(), not finished, and no timeout!\n",
+ __func__);
+ finished = FALSE;
+ }
+
+ return finished;
+}
+
+/*
+ * Function irda_task_timer_expired (data)
+ *
+ * Task time has expired. We now try to execute task (again), and restart
+ * the timer if the task has not finished yet
+ */
+static void irda_task_timer_expired(void *data)
+{
+ struct irda_task *task;
+
+ task = data;
+
+ irda_task_kick(task);
+}
+
+/*
+ * Function irda_device_setup (dev)
+ *
+ * This function should be used by low level device drivers in a similar way
+ * as ether_setup() is used by normal network device drivers
+ */
+static void irda_device_setup(struct net_device *dev)
+{
+ dev->hard_header_len = 0;
+ dev->addr_len = LAP_ALEN;
+
+ dev->type = ARPHRD_IRDA;
+ dev->tx_queue_len = 8; /* Window size + 1 s-frame */
+
+ memset(dev->broadcast, 0xff, LAP_ALEN);
+
+ dev->mtu = 2048;
+ dev->flags = IFF_NOARP;
+}
+
+/*
+ * Funciton alloc_irdadev
+ * Allocates and sets up an IRDA device in a manner similar to
+ * alloc_etherdev.
+ */
+struct net_device *alloc_irdadev(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "irda%d", NET_NAME_UNKNOWN,
+ irda_device_setup);
+}
+EXPORT_SYMBOL(alloc_irdadev);
+
+#ifdef CONFIG_ISA_DMA_API
+/*
+ * Function setup_dma (idev, buffer, count, mode)
+ *
+ * Setup the DMA channel. Commonly used by LPC FIR drivers
+ *
+ */
+void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode)
+{
+ unsigned long flags;
+
+ flags = claim_dma_lock();
+
+ disable_dma(channel);
+ clear_dma_ff(channel);
+ set_dma_mode(channel, mode);
+ set_dma_addr(channel, buffer);
+ set_dma_count(channel, count);
+ enable_dma(channel);
+
+ release_dma_lock(flags);
+}
+EXPORT_SYMBOL(irda_setup_dma);
+#endif
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
new file mode 100644
index 000000000..4a7ae32af
--- /dev/null
+++ b/net/irda/iriap.c
@@ -0,0 +1,1081 @@
+/*********************************************************************
+ *
+ * Filename: iriap.c
+ * Version: 0.8
+ * Description: Information Access Protocol (IAP)
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Thu Aug 21 00:02:07 1997
+ * Modified at: Sat Dec 25 16:42:42 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap_event.h>
+#include <net/irda/iriap.h>
+
+/* FIXME: This one should go in irlmp.c */
+static const char *const ias_charset_types[] __maybe_unused = {
+ "CS_ASCII",
+ "CS_ISO_8859_1",
+ "CS_ISO_8859_2",
+ "CS_ISO_8859_3",
+ "CS_ISO_8859_4",
+ "CS_ISO_8859_5",
+ "CS_ISO_8859_6",
+ "CS_ISO_8859_7",
+ "CS_ISO_8859_8",
+ "CS_ISO_8859_9",
+ "CS_UNICODE"
+};
+
+static hashbin_t *iriap = NULL;
+static void *service_handle;
+
+static void __iriap_close(struct iriap_cb *self);
+static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode);
+static void iriap_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason, struct sk_buff *skb);
+static void iriap_connect_indication(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb);
+static void iriap_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size, __u8 max_header_size,
+ struct sk_buff *skb);
+static int iriap_data_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+
+static void iriap_watchdog_timer_expired(void *data);
+
+static inline void iriap_start_watchdog_timer(struct iriap_cb *self,
+ int timeout)
+{
+ irda_start_timer(&self->watchdog_timer, timeout, self,
+ iriap_watchdog_timer_expired);
+}
+
+static struct lock_class_key irias_objects_key;
+
+/*
+ * Function iriap_init (void)
+ *
+ * Initializes the IrIAP layer, called by the module initialization code
+ * in irmod.c
+ */
+int __init iriap_init(void)
+{
+ struct ias_object *obj;
+ struct iriap_cb *server;
+ __u8 oct_seq[6];
+ __u16 hints;
+
+ /* Allocate master array */
+ iriap = hashbin_new(HB_LOCK);
+ if (!iriap)
+ return -ENOMEM;
+
+ /* Object repository - defined in irias_object.c */
+ irias_objects = hashbin_new(HB_LOCK);
+ if (!irias_objects) {
+ net_warn_ratelimited("%s: Can't allocate irias_objects hashbin!\n",
+ __func__);
+ hashbin_delete(iriap, NULL);
+ return -ENOMEM;
+ }
+
+ lockdep_set_class_and_name(&irias_objects->hb_spinlock, &irias_objects_key,
+ "irias_objects");
+
+ /*
+ * Register some default services for IrLMP
+ */
+ hints = irlmp_service_to_hint(S_COMPUTER);
+ service_handle = irlmp_register_service(hints);
+
+ /* Register the Device object with LM-IAS */
+ obj = irias_new_object("Device", IAS_DEVICE_ID);
+ irias_add_string_attrib(obj, "DeviceName", "Linux", IAS_KERNEL_ATTR);
+
+ oct_seq[0] = 0x01; /* Version 1 */
+ oct_seq[1] = 0x00; /* IAS support bits */
+ oct_seq[2] = 0x00; /* LM-MUX support bits */
+#ifdef CONFIG_IRDA_ULTRA
+ oct_seq[2] |= 0x04; /* Connectionless Data support */
+#endif
+ irias_add_octseq_attrib(obj, "IrLMPSupport", oct_seq, 3,
+ IAS_KERNEL_ATTR);
+ irias_insert_object(obj);
+
+ /*
+ * Register server support with IrLMP so we can accept incoming
+ * connections
+ */
+ server = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL);
+ if (!server) {
+ pr_debug("%s(), unable to open server\n", __func__);
+ return -1;
+ }
+ iriap_register_lsap(server, LSAP_IAS, IAS_SERVER);
+
+ return 0;
+}
+
+/*
+ * Function iriap_cleanup (void)
+ *
+ * Initializes the IrIAP layer, called by the module cleanup code in
+ * irmod.c
+ */
+void iriap_cleanup(void)
+{
+ irlmp_unregister_service(service_handle);
+
+ hashbin_delete(iriap, (FREE_FUNC) __iriap_close);
+ hashbin_delete(irias_objects, (FREE_FUNC) __irias_delete_object);
+}
+
+/*
+ * Function iriap_open (void)
+ *
+ * Opens an instance of the IrIAP layer, and registers with IrLMP
+ */
+struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv,
+ CONFIRM_CALLBACK callback)
+{
+ struct iriap_cb *self;
+
+ self = kzalloc(sizeof(*self), GFP_ATOMIC);
+ if (!self)
+ return NULL;
+
+ /*
+ * Initialize instance
+ */
+
+ self->magic = IAS_MAGIC;
+ self->mode = mode;
+ if (mode == IAS_CLIENT)
+ iriap_register_lsap(self, slsap_sel, mode);
+
+ self->confirm = callback;
+ self->priv = priv;
+
+ /* iriap_getvaluebyclass_request() will construct packets before
+ * we connect, so this must have a sane value... Jean II */
+ self->max_header_size = LMP_MAX_HEADER;
+
+ init_timer(&self->watchdog_timer);
+
+ hashbin_insert(iriap, (irda_queue_t *) self, (long) self, NULL);
+
+ /* Initialize state machines */
+ iriap_next_client_state(self, S_DISCONNECT);
+ iriap_next_call_state(self, S_MAKE_CALL);
+ iriap_next_server_state(self, R_DISCONNECT);
+ iriap_next_r_connect_state(self, R_WAITING);
+
+ return self;
+}
+EXPORT_SYMBOL(iriap_open);
+
+/*
+ * Function __iriap_close (self)
+ *
+ * Removes (deallocates) the IrIAP instance
+ *
+ */
+static void __iriap_close(struct iriap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ del_timer(&self->watchdog_timer);
+
+ if (self->request_skb)
+ dev_kfree_skb(self->request_skb);
+
+ self->magic = 0;
+
+ kfree(self);
+}
+
+/*
+ * Function iriap_close (void)
+ *
+ * Closes IrIAP and deregisters with IrLMP
+ */
+void iriap_close(struct iriap_cb *self)
+{
+ struct iriap_cb *entry;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ if (self->lsap) {
+ irlmp_close_lsap(self->lsap);
+ self->lsap = NULL;
+ }
+
+ entry = (struct iriap_cb *) hashbin_remove(iriap, (long) self, NULL);
+ IRDA_ASSERT(entry == self, return;);
+
+ __iriap_close(self);
+}
+EXPORT_SYMBOL(iriap_close);
+
+static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode)
+{
+ notify_t notify;
+
+ irda_notify_init(&notify);
+ notify.connect_confirm = iriap_connect_confirm;
+ notify.connect_indication = iriap_connect_indication;
+ notify.disconnect_indication = iriap_disconnect_indication;
+ notify.data_indication = iriap_data_indication;
+ notify.instance = self;
+ if (mode == IAS_CLIENT)
+ strcpy(notify.name, "IrIAS cli");
+ else
+ strcpy(notify.name, "IrIAS srv");
+
+ self->lsap = irlmp_open_lsap(slsap_sel, &notify, 0);
+ if (self->lsap == NULL) {
+ net_err_ratelimited("%s: Unable to allocated LSAP!\n",
+ __func__);
+ return -1;
+ }
+ self->slsap_sel = self->lsap->slsap_sel;
+
+ return 0;
+}
+
+/*
+ * Function iriap_disconnect_indication (handle, reason)
+ *
+ * Got disconnect, so clean up everything associated with this connection
+ *
+ */
+static void iriap_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *skb)
+{
+ struct iriap_cb *self;
+
+ pr_debug("%s(), reason=%s [%d]\n", __func__,
+ irlmp_reason_str(reason), reason);
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ IRDA_ASSERT(iriap != NULL, return;);
+
+ del_timer(&self->watchdog_timer);
+
+ /* Not needed */
+ if (skb)
+ dev_kfree_skb(skb);
+
+ if (self->mode == IAS_CLIENT) {
+ pr_debug("%s(), disconnect as client\n", __func__);
+
+
+ iriap_do_client_event(self, IAP_LM_DISCONNECT_INDICATION,
+ NULL);
+ /*
+ * Inform service user that the request failed by sending
+ * it a NULL value. Warning, the client might close us, so
+ * remember no to use self anymore after calling confirm
+ */
+ if (self->confirm)
+ self->confirm(IAS_DISCONNECT, 0, NULL, self->priv);
+ } else {
+ pr_debug("%s(), disconnect as server\n", __func__);
+ iriap_do_server_event(self, IAP_LM_DISCONNECT_INDICATION,
+ NULL);
+ iriap_close(self);
+ }
+}
+
+/*
+ * Function iriap_disconnect_request (handle)
+ */
+static void iriap_disconnect_request(struct iriap_cb *self)
+{
+ struct sk_buff *tx_skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (tx_skb == NULL) {
+ pr_debug("%s(), Could not allocate an sk_buff of length %d\n",
+ __func__, LMP_MAX_HEADER);
+ return;
+ }
+
+ /*
+ * Reserve space for MUX control and LAP header
+ */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+ irlmp_disconnect_request(self->lsap, tx_skb);
+}
+
+/*
+ * Function iriap_getvaluebyclass (addr, name, attr)
+ *
+ * Retrieve all values from attribute in all objects with given class
+ * name
+ */
+int iriap_getvaluebyclass_request(struct iriap_cb *self,
+ __u32 saddr, __u32 daddr,
+ char *name, char *attr)
+{
+ struct sk_buff *tx_skb;
+ int name_len, attr_len, skb_len;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return -1;);
+
+ /* Client must supply the destination device address */
+ if (!daddr)
+ return -1;
+
+ self->daddr = daddr;
+ self->saddr = saddr;
+
+ /*
+ * Save operation, so we know what the later indication is about
+ */
+ self->operation = GET_VALUE_BY_CLASS;
+
+ /* Give ourselves 10 secs to finish this operation */
+ iriap_start_watchdog_timer(self, 10*HZ);
+
+ name_len = strlen(name); /* Up to IAS_MAX_CLASSNAME = 60 */
+ attr_len = strlen(attr); /* Up to IAS_MAX_ATTRIBNAME = 60 */
+
+ skb_len = self->max_header_size+2+name_len+1+attr_len+4;
+ tx_skb = alloc_skb(skb_len, GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /* Reserve space for MUX and LAP header */
+ skb_reserve(tx_skb, self->max_header_size);
+ skb_put(tx_skb, 3+name_len+attr_len);
+ frame = tx_skb->data;
+
+ /* Build frame */
+ frame[0] = IAP_LST | GET_VALUE_BY_CLASS;
+ frame[1] = name_len; /* Insert length of name */
+ memcpy(frame+2, name, name_len); /* Insert name */
+ frame[2+name_len] = attr_len; /* Insert length of attr */
+ memcpy(frame+3+name_len, attr, attr_len); /* Insert attr */
+
+ iriap_do_client_event(self, IAP_CALL_REQUEST_GVBC, tx_skb);
+
+ /* Drop reference count - see state_s_disconnect(). */
+ dev_kfree_skb(tx_skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(iriap_getvaluebyclass_request);
+
+/*
+ * Function iriap_getvaluebyclass_confirm (self, skb)
+ *
+ * Got result from GetValueByClass command. Parse it and return result
+ * to service user.
+ *
+ */
+static void iriap_getvaluebyclass_confirm(struct iriap_cb *self,
+ struct sk_buff *skb)
+{
+ struct ias_value *value;
+ int charset;
+ __u32 value_len;
+ __u32 tmp_cpu32;
+ __u16 obj_id;
+ __u16 len;
+ __u8 type;
+ __u8 *fp;
+ int n;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Initialize variables */
+ fp = skb->data;
+ n = 2;
+
+ /* Get length, MSB first */
+ len = get_unaligned_be16(fp + n);
+ n += 2;
+
+ pr_debug("%s(), len=%d\n", __func__, len);
+
+ /* Get object ID, MSB first */
+ obj_id = get_unaligned_be16(fp + n);
+ n += 2;
+
+ type = fp[n++];
+ pr_debug("%s(), Value type = %d\n", __func__, type);
+
+ switch (type) {
+ case IAS_INTEGER:
+ memcpy(&tmp_cpu32, fp+n, 4); n += 4;
+ be32_to_cpus(&tmp_cpu32);
+ value = irias_new_integer_value(tmp_cpu32);
+
+ /* Legal values restricted to 0x01-0x6f, page 15 irttp */
+ pr_debug("%s(), lsap=%d\n", __func__, value->t.integer);
+ break;
+ case IAS_STRING:
+ charset = fp[n++];
+
+ switch (charset) {
+ case CS_ASCII:
+ break;
+/* case CS_ISO_8859_1: */
+/* case CS_ISO_8859_2: */
+/* case CS_ISO_8859_3: */
+/* case CS_ISO_8859_4: */
+/* case CS_ISO_8859_5: */
+/* case CS_ISO_8859_6: */
+/* case CS_ISO_8859_7: */
+/* case CS_ISO_8859_8: */
+/* case CS_ISO_8859_9: */
+/* case CS_UNICODE: */
+ default:
+ pr_debug("%s(), charset [%d] %s, not supported\n",
+ __func__, charset,
+ charset < ARRAY_SIZE(ias_charset_types) ?
+ ias_charset_types[charset] :
+ "(unknown)");
+
+ /* Aborting, close connection! */
+ iriap_disconnect_request(self);
+ return;
+ /* break; */
+ }
+ value_len = fp[n++];
+ pr_debug("%s(), strlen=%d\n", __func__, value_len);
+
+ /* Make sure the string is null-terminated */
+ if (n + value_len < skb->len)
+ fp[n + value_len] = 0x00;
+ pr_debug("Got string %s\n", fp+n);
+
+ /* Will truncate to IAS_MAX_STRING bytes */
+ value = irias_new_string_value(fp+n);
+ break;
+ case IAS_OCT_SEQ:
+ value_len = get_unaligned_be16(fp + n);
+ n += 2;
+
+ /* Will truncate to IAS_MAX_OCTET_STRING bytes */
+ value = irias_new_octseq_value(fp+n, value_len);
+ break;
+ default:
+ value = irias_new_missing_value();
+ break;
+ }
+
+ /* Finished, close connection! */
+ iriap_disconnect_request(self);
+
+ /* Warning, the client might close us, so remember no to use self
+ * anymore after calling confirm
+ */
+ if (self->confirm)
+ self->confirm(IAS_SUCCESS, obj_id, value, self->priv);
+ else {
+ pr_debug("%s(), missing handler!\n", __func__);
+ irias_delete_value(value);
+ }
+}
+
+/*
+ * Function iriap_getvaluebyclass_response ()
+ *
+ * Send answer back to remote LM-IAS
+ *
+ */
+static void iriap_getvaluebyclass_response(struct iriap_cb *self,
+ __u16 obj_id,
+ __u8 ret_code,
+ struct ias_value *value)
+{
+ struct sk_buff *tx_skb;
+ int n;
+ __be32 tmp_be32;
+ __be16 tmp_be16;
+ __u8 *fp;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+ IRDA_ASSERT(value != NULL, return;);
+ IRDA_ASSERT(value->len <= 1024, return;);
+
+ /* Initialize variables */
+ n = 0;
+
+ /*
+ * We must adjust the size of the response after the length of the
+ * value. We add 32 bytes because of the 6 bytes for the frame and
+ * max 5 bytes for the value coding.
+ */
+ tx_skb = alloc_skb(value->len + self->max_header_size + 32,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ /* Reserve space for MUX and LAP header */
+ skb_reserve(tx_skb, self->max_header_size);
+ skb_put(tx_skb, 6);
+
+ fp = tx_skb->data;
+
+ /* Build frame */
+ fp[n++] = GET_VALUE_BY_CLASS | IAP_LST;
+ fp[n++] = ret_code;
+
+ /* Insert list length (MSB first) */
+ tmp_be16 = htons(0x0001);
+ memcpy(fp+n, &tmp_be16, 2); n += 2;
+
+ /* Insert object identifier ( MSB first) */
+ tmp_be16 = cpu_to_be16(obj_id);
+ memcpy(fp+n, &tmp_be16, 2); n += 2;
+
+ switch (value->type) {
+ case IAS_STRING:
+ skb_put(tx_skb, 3 + value->len);
+ fp[n++] = value->type;
+ fp[n++] = 0; /* ASCII */
+ fp[n++] = (__u8) value->len;
+ memcpy(fp+n, value->t.string, value->len); n+=value->len;
+ break;
+ case IAS_INTEGER:
+ skb_put(tx_skb, 5);
+ fp[n++] = value->type;
+
+ tmp_be32 = cpu_to_be32(value->t.integer);
+ memcpy(fp+n, &tmp_be32, 4); n += 4;
+ break;
+ case IAS_OCT_SEQ:
+ skb_put(tx_skb, 3 + value->len);
+ fp[n++] = value->type;
+
+ tmp_be16 = cpu_to_be16(value->len);
+ memcpy(fp+n, &tmp_be16, 2); n += 2;
+ memcpy(fp+n, value->t.oct_seq, value->len); n+=value->len;
+ break;
+ case IAS_MISSING:
+ pr_debug("%s: sending IAS_MISSING\n", __func__);
+ skb_put(tx_skb, 1);
+ fp[n++] = value->type;
+ break;
+ default:
+ pr_debug("%s(), type not implemented!\n", __func__);
+ break;
+ }
+ iriap_do_r_connect_event(self, IAP_CALL_RESPONSE, tx_skb);
+
+ /* Drop reference count - see state_r_execute(). */
+ dev_kfree_skb(tx_skb);
+}
+
+/*
+ * Function iriap_getvaluebyclass_indication (self, skb)
+ *
+ * getvaluebyclass is requested from peer LM-IAS
+ *
+ */
+static void iriap_getvaluebyclass_indication(struct iriap_cb *self,
+ struct sk_buff *skb)
+{
+ struct ias_object *obj;
+ struct ias_attrib *attrib;
+ int name_len;
+ int attr_len;
+ char name[IAS_MAX_CLASSNAME + 1]; /* 60 bytes */
+ char attr[IAS_MAX_ATTRIBNAME + 1]; /* 60 bytes */
+ __u8 *fp;
+ int n;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ fp = skb->data;
+ n = 1;
+
+ name_len = fp[n++];
+
+ IRDA_ASSERT(name_len < IAS_MAX_CLASSNAME + 1, return;);
+
+ memcpy(name, fp+n, name_len); n+=name_len;
+ name[name_len] = '\0';
+
+ attr_len = fp[n++];
+
+ IRDA_ASSERT(attr_len < IAS_MAX_ATTRIBNAME + 1, return;);
+
+ memcpy(attr, fp+n, attr_len); n+=attr_len;
+ attr[attr_len] = '\0';
+
+ pr_debug("LM-IAS: Looking up %s: %s\n", name, attr);
+ obj = irias_find_object(name);
+
+ if (obj == NULL) {
+ pr_debug("LM-IAS: Object %s not found\n", name);
+ iriap_getvaluebyclass_response(self, 0x1235, IAS_CLASS_UNKNOWN,
+ &irias_missing);
+ return;
+ }
+ pr_debug("LM-IAS: found %s, id=%d\n", obj->name, obj->id);
+
+ attrib = irias_find_attrib(obj, attr);
+ if (attrib == NULL) {
+ pr_debug("LM-IAS: Attribute %s not found\n", attr);
+ iriap_getvaluebyclass_response(self, obj->id,
+ IAS_ATTRIB_UNKNOWN,
+ &irias_missing);
+ return;
+ }
+
+ /* We have a match; send the value. */
+ iriap_getvaluebyclass_response(self, obj->id, IAS_SUCCESS,
+ attrib->value);
+}
+
+/*
+ * Function iriap_send_ack (void)
+ *
+ * Currently not used
+ *
+ */
+void iriap_send_ack(struct iriap_cb *self)
+{
+ struct sk_buff *tx_skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ tx_skb = alloc_skb(LMP_MAX_HEADER + 1, GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ /* Reserve space for MUX and LAP header */
+ skb_reserve(tx_skb, self->max_header_size);
+ skb_put(tx_skb, 1);
+ frame = tx_skb->data;
+
+ /* Build frame */
+ frame[0] = IAP_LST | IAP_ACK | self->operation;
+
+ irlmp_data_request(self->lsap, tx_skb);
+}
+
+void iriap_connect_request(struct iriap_cb *self)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ ret = irlmp_connect_request(self->lsap, LSAP_IAS,
+ self->saddr, self->daddr,
+ NULL, NULL);
+ if (ret < 0) {
+ pr_debug("%s(), connect failed!\n", __func__);
+ self->confirm(IAS_DISCONNECT, 0, NULL, self->priv);
+ }
+}
+
+/*
+ * Function iriap_connect_confirm (handle, skb)
+ *
+ * LSAP connection confirmed!
+ *
+ */
+static void iriap_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_seg_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct iriap_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ self->max_data_size = max_seg_size;
+ self->max_header_size = max_header_size;
+
+ del_timer(&self->watchdog_timer);
+
+ iriap_do_client_event(self, IAP_LM_CONNECT_CONFIRM, skb);
+
+ /* Drop reference count - see state_s_make_call(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function iriap_connect_indication ( handle, skb)
+ *
+ * Remote LM-IAS is requesting connection
+ *
+ */
+static void iriap_connect_indication(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_seg_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct iriap_cb *self, *new;
+
+ self = instance;
+
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(self != NULL, goto out;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;);
+
+ /* Start new server */
+ new = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL);
+ if (!new) {
+ pr_debug("%s(), open failed\n", __func__);
+ goto out;
+ }
+
+ /* Now attach up the new "socket" */
+ new->lsap = irlmp_dup(self->lsap, new);
+ if (!new->lsap) {
+ pr_debug("%s(), dup failed!\n", __func__);
+ goto out;
+ }
+
+ new->max_data_size = max_seg_size;
+ new->max_header_size = max_header_size;
+
+ /* Clean up the original one to keep it in listen state */
+ irlmp_listen(self->lsap);
+
+ iriap_do_server_event(new, IAP_LM_CONNECT_INDICATION, skb);
+
+out:
+ /* Drop reference count - see state_r_disconnect(). */
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function iriap_data_indication (handle, skb)
+ *
+ * Receives data from connection identified by handle from IrLMP
+ *
+ */
+static int iriap_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct iriap_cb *self;
+ __u8 *frame;
+ __u8 opcode;
+
+ self = instance;
+
+ IRDA_ASSERT(skb != NULL, return 0;);
+ IRDA_ASSERT(self != NULL, goto out;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;);
+
+ frame = skb->data;
+
+ if (self->mode == IAS_SERVER) {
+ /* Call server */
+ pr_debug("%s(), Calling server!\n", __func__);
+ iriap_do_r_connect_event(self, IAP_RECV_F_LST, skb);
+ goto out;
+ }
+ opcode = frame[0];
+ if (~opcode & IAP_LST) {
+ net_warn_ratelimited("%s:, IrIAS multiframe commands or results is not implemented yet!\n",
+ __func__);
+ goto out;
+ }
+
+ /* Check for ack frames since they don't contain any data */
+ if (opcode & IAP_ACK) {
+ pr_debug("%s() Got ack frame!\n", __func__);
+ goto out;
+ }
+
+ opcode &= ~IAP_LST; /* Mask away LST bit */
+
+ switch (opcode) {
+ case GET_INFO_BASE:
+ pr_debug("IrLMP GetInfoBaseDetails not implemented!\n");
+ break;
+ case GET_VALUE_BY_CLASS:
+ iriap_do_call_event(self, IAP_RECV_F_LST, NULL);
+
+ switch (frame[1]) {
+ case IAS_SUCCESS:
+ iriap_getvaluebyclass_confirm(self, skb);
+ break;
+ case IAS_CLASS_UNKNOWN:
+ pr_debug("%s(), No such class!\n", __func__);
+ /* Finished, close connection! */
+ iriap_disconnect_request(self);
+
+ /*
+ * Warning, the client might close us, so remember
+ * no to use self anymore after calling confirm
+ */
+ if (self->confirm)
+ self->confirm(IAS_CLASS_UNKNOWN, 0, NULL,
+ self->priv);
+ break;
+ case IAS_ATTRIB_UNKNOWN:
+ pr_debug("%s(), No such attribute!\n", __func__);
+ /* Finished, close connection! */
+ iriap_disconnect_request(self);
+
+ /*
+ * Warning, the client might close us, so remember
+ * no to use self anymore after calling confirm
+ */
+ if (self->confirm)
+ self->confirm(IAS_ATTRIB_UNKNOWN, 0, NULL,
+ self->priv);
+ break;
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown op-code: %02x\n", __func__,
+ opcode);
+ break;
+ }
+
+out:
+ /* Cleanup - sub-calls will have done skb_get() as needed. */
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+/*
+ * Function iriap_call_indication (self, skb)
+ *
+ * Received call to server from peer LM-IAS
+ *
+ */
+void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb)
+{
+ __u8 *fp;
+ __u8 opcode;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ fp = skb->data;
+
+ opcode = fp[0];
+ if (~opcode & 0x80) {
+ net_warn_ratelimited("%s: IrIAS multiframe commands or results is not implemented yet!\n",
+ __func__);
+ return;
+ }
+ opcode &= 0x7f; /* Mask away LST bit */
+
+ switch (opcode) {
+ case GET_INFO_BASE:
+ net_warn_ratelimited("%s: GetInfoBaseDetails not implemented yet!\n",
+ __func__);
+ break;
+ case GET_VALUE_BY_CLASS:
+ iriap_getvaluebyclass_indication(self, skb);
+ break;
+ }
+ /* skb will be cleaned up in iriap_data_indication */
+}
+
+/*
+ * Function iriap_watchdog_timer_expired (data)
+ *
+ * Query has taken too long time, so abort
+ *
+ */
+static void iriap_watchdog_timer_expired(void *data)
+{
+ struct iriap_cb *self = (struct iriap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ /* iriap_close(self); */
+}
+
+#ifdef CONFIG_PROC_FS
+
+static const char *const ias_value_types[] = {
+ "IAS_MISSING",
+ "IAS_INTEGER",
+ "IAS_OCT_SEQ",
+ "IAS_STRING"
+};
+
+static inline struct ias_object *irias_seq_idx(loff_t pos)
+{
+ struct ias_object *obj;
+
+ for (obj = (struct ias_object *) hashbin_get_first(irias_objects);
+ obj; obj = (struct ias_object *) hashbin_get_next(irias_objects)) {
+ if (pos-- == 0)
+ break;
+ }
+
+ return obj;
+}
+
+static void *irias_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_irq(&irias_objects->hb_spinlock);
+
+ return *pos ? irias_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *irias_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+
+ return (v == SEQ_START_TOKEN)
+ ? (void *) hashbin_get_first(irias_objects)
+ : (void *) hashbin_get_next(irias_objects);
+}
+
+static void irias_seq_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_irq(&irias_objects->hb_spinlock);
+}
+
+static int irias_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "LM-IAS Objects:\n");
+ else {
+ struct ias_object *obj = v;
+ struct ias_attrib *attrib;
+
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -EINVAL;);
+
+ seq_printf(seq, "name: %s, id=%d\n",
+ obj->name, obj->id);
+
+ /* Careful for priority inversions here !
+ * All other uses of attrib spinlock are independent of
+ * the object spinlock, so we are safe. Jean II */
+ spin_lock(&obj->attribs->hb_spinlock);
+
+ /* List all attributes for this object */
+ for (attrib = (struct ias_attrib *) hashbin_get_first(obj->attribs);
+ attrib != NULL;
+ attrib = (struct ias_attrib *) hashbin_get_next(obj->attribs)) {
+
+ IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC,
+ goto outloop; );
+
+ seq_printf(seq, " - Attribute name: \"%s\", ",
+ attrib->name);
+ seq_printf(seq, "value[%s]: ",
+ ias_value_types[attrib->value->type]);
+
+ switch (attrib->value->type) {
+ case IAS_INTEGER:
+ seq_printf(seq, "%d\n",
+ attrib->value->t.integer);
+ break;
+ case IAS_STRING:
+ seq_printf(seq, "\"%s\"\n",
+ attrib->value->t.string);
+ break;
+ case IAS_OCT_SEQ:
+ seq_printf(seq, "octet sequence (%d bytes)\n",
+ attrib->value->len);
+ break;
+ case IAS_MISSING:
+ seq_puts(seq, "missing\n");
+ break;
+ default:
+ seq_printf(seq, "type %d?\n",
+ attrib->value->type);
+ }
+ seq_putc(seq, '\n');
+
+ }
+ IRDA_ASSERT_LABEL(outloop:)
+ spin_unlock(&obj->attribs->hb_spinlock);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations irias_seq_ops = {
+ .start = irias_seq_start,
+ .next = irias_seq_next,
+ .stop = irias_seq_stop,
+ .show = irias_seq_show,
+};
+
+static int irias_seq_open(struct inode *inode, struct file *file)
+{
+ IRDA_ASSERT( irias_objects != NULL, return -EINVAL;);
+
+ return seq_open(file, &irias_seq_ops);
+}
+
+const struct file_operations irias_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = irias_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/iriap_event.c b/net/irda/iriap_event.c
new file mode 100644
index 000000000..e6098b2e0
--- /dev/null
+++ b/net/irda/iriap_event.c
@@ -0,0 +1,496 @@
+/*********************************************************************
+ *
+ * Filename: iriap_event.c
+ * Version: 0.1
+ * Description: IAP Finite State Machine
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Thu Aug 21 00:02:07 1997
+ * Modified at: Wed Mar 1 11:28:34 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1997, 1999-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/iriap_event.h>
+
+static void state_s_disconnect (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_connecting (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_call (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+
+static void state_s_make_call (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_calling (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_outstanding (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_replying (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_s_wait_active (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+
+static void state_r_disconnect (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_call (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_waiting (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_wait_active (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_receiving (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_execute (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+static void state_r_returning (struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb);
+
+static void (*iriap_state[])(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb) = {
+ /* Client FSM */
+ state_s_disconnect,
+ state_s_connecting,
+ state_s_call,
+
+ /* S-Call FSM */
+ state_s_make_call,
+ state_s_calling,
+ state_s_outstanding,
+ state_s_replying,
+ state_s_wait_for_call,
+ state_s_wait_active,
+
+ /* Server FSM */
+ state_r_disconnect,
+ state_r_call,
+
+ /* R-Connect FSM */
+ state_r_waiting,
+ state_r_wait_active,
+ state_r_receiving,
+ state_r_execute,
+ state_r_returning,
+};
+
+void iriap_next_client_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ self->client_state = state;
+}
+
+void iriap_next_call_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ self->call_state = state;
+}
+
+void iriap_next_server_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ self->server_state = state;
+}
+
+void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ self->r_connect_state = state;
+}
+
+void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ (*iriap_state[ self->client_state]) (self, event, skb);
+}
+
+void iriap_do_call_event(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ (*iriap_state[ self->call_state]) (self, event, skb);
+}
+
+void iriap_do_server_event(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ (*iriap_state[ self->server_state]) (self, event, skb);
+}
+
+void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ (*iriap_state[ self->r_connect_state]) (self, event, skb);
+}
+
+
+/*
+ * Function state_s_disconnect (event, skb)
+ *
+ * S-Disconnect, The device has no LSAP connection to a particular
+ * remote device.
+ */
+static void state_s_disconnect(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ switch (event) {
+ case IAP_CALL_REQUEST_GVBC:
+ iriap_next_client_state(self, S_CONNECTING);
+ IRDA_ASSERT(self->request_skb == NULL, return;);
+ /* Don't forget to refcount it -
+ * see iriap_getvaluebyclass_request(). */
+ skb_get(skb);
+ self->request_skb = skb;
+ iriap_connect_request(self);
+ break;
+ case IAP_LM_DISCONNECT_INDICATION:
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__, event);
+ break;
+ }
+}
+
+/*
+ * Function state_s_connecting (self, event, skb)
+ *
+ * S-Connecting
+ *
+ */
+static void state_s_connecting(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ switch (event) {
+ case IAP_LM_CONNECT_CONFIRM:
+ /*
+ * Jump to S-Call FSM
+ */
+ iriap_do_call_event(self, IAP_CALL_REQUEST, skb);
+ /* iriap_call_request(self, 0,0,0); */
+ iriap_next_client_state(self, S_CALL);
+ break;
+ case IAP_LM_DISCONNECT_INDICATION:
+ /* Abort calls */
+ iriap_next_call_state(self, S_MAKE_CALL);
+ iriap_next_client_state(self, S_DISCONNECT);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__, event);
+ break;
+ }
+}
+
+/*
+ * Function state_s_call (self, event, skb)
+ *
+ * S-Call, The device can process calls to a specific remote
+ * device. Whenever the LSAP connection is disconnected, this state
+ * catches that event and clears up
+ */
+static void state_s_call(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+
+ switch (event) {
+ case IAP_LM_DISCONNECT_INDICATION:
+ /* Abort calls */
+ iriap_next_call_state(self, S_MAKE_CALL);
+ iriap_next_client_state(self, S_DISCONNECT);
+ break;
+ default:
+ pr_debug("state_s_call: Unknown event %d\n", event);
+ break;
+ }
+}
+
+/*
+ * Function state_s_make_call (event, skb)
+ *
+ * S-Make-Call
+ *
+ */
+static void state_s_make_call(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+
+ switch (event) {
+ case IAP_CALL_REQUEST:
+ /* Already refcounted - see state_s_disconnect() */
+ tx_skb = self->request_skb;
+ self->request_skb = NULL;
+
+ irlmp_data_request(self->lsap, tx_skb);
+ iriap_next_call_state(self, S_OUTSTANDING);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__, event);
+ break;
+ }
+}
+
+/*
+ * Function state_s_calling (event, skb)
+ *
+ * S-Calling
+ *
+ */
+static void state_s_calling(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_s_outstanding (event, skb)
+ *
+ * S-Outstanding, The device is waiting for a response to a command
+ *
+ */
+static void state_s_outstanding(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+
+ switch (event) {
+ case IAP_RECV_F_LST:
+ /*iriap_send_ack(self);*/
+ /*LM_Idle_request(idle); */
+
+ iriap_next_call_state(self, S_WAIT_FOR_CALL);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__, event);
+ break;
+ }
+}
+
+/*
+ * Function state_s_replying (event, skb)
+ *
+ * S-Replying, The device is collecting a multiple part response
+ */
+static void state_s_replying(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_s_wait_for_call (event, skb)
+ *
+ * S-Wait-for-Call
+ *
+ */
+static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+
+/*
+ * Function state_s_wait_active (event, skb)
+ *
+ * S-Wait-Active
+ *
+ */
+static void state_s_wait_active(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+/**************************************************************************
+ *
+ * Server FSM
+ *
+ **************************************************************************/
+
+/*
+ * Function state_r_disconnect (self, event, skb)
+ *
+ * LM-IAS server is disconnected (not processing any requests!)
+ *
+ */
+static void state_r_disconnect(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+
+ switch (event) {
+ case IAP_LM_CONNECT_INDICATION:
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (tx_skb == NULL)
+ return;
+
+ /* Reserve space for MUX_CONTROL and LAP header */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+ irlmp_connect_response(self->lsap, tx_skb);
+ /*LM_Idle_request(idle); */
+
+ iriap_next_server_state(self, R_CALL);
+
+ /*
+ * Jump to R-Connect FSM, we skip R-Waiting since we do not
+ * care about LM_Idle_request()!
+ */
+ iriap_next_r_connect_state(self, R_RECEIVING);
+ break;
+ default:
+ pr_debug("%s(), unknown event %d\n", __func__, event);
+ break;
+ }
+}
+
+/*
+ * Function state_r_call (self, event, skb)
+ */
+static void state_r_call(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ switch (event) {
+ case IAP_LM_DISCONNECT_INDICATION:
+ /* Abort call */
+ iriap_next_server_state(self, R_DISCONNECT);
+ iriap_next_r_connect_state(self, R_WAITING);
+ break;
+ default:
+ pr_debug("%s(), unknown event!\n", __func__);
+ break;
+ }
+}
+
+/*
+ * R-Connect FSM
+ */
+
+/*
+ * Function state_r_waiting (self, event, skb)
+ */
+static void state_r_waiting(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+static void state_r_wait_active(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_r_receiving (self, event, skb)
+ *
+ * We are receiving a command
+ *
+ */
+static void state_r_receiving(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ switch (event) {
+ case IAP_RECV_F_LST:
+ iriap_next_r_connect_state(self, R_EXECUTE);
+
+ iriap_call_indication(self, skb);
+ break;
+ default:
+ pr_debug("%s(), unknown event!\n", __func__);
+ break;
+ }
+}
+
+/*
+ * Function state_r_execute (self, event, skb)
+ *
+ * The server is processing the request
+ *
+ */
+static void state_r_execute(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+ switch (event) {
+ case IAP_CALL_RESPONSE:
+ /*
+ * Since we don't implement the Waiting state, we return
+ * to state Receiving instead, DB.
+ */
+ iriap_next_r_connect_state(self, R_RECEIVING);
+
+ /* Don't forget to refcount it - see
+ * iriap_getvaluebyclass_response(). */
+ skb_get(skb);
+
+ irlmp_data_request(self->lsap, skb);
+ break;
+ default:
+ pr_debug("%s(), unknown event!\n", __func__);
+ break;
+ }
+}
+
+static void state_r_returning(struct iriap_cb *self, IRIAP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), event=%d\n", __func__, event);
+
+ switch (event) {
+ case IAP_RECV_F_LST:
+ break;
+ default:
+ break;
+ }
+}
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
new file mode 100644
index 000000000..53b86d0e1
--- /dev/null
+++ b/net/irda/irias_object.c
@@ -0,0 +1,555 @@
+/*********************************************************************
+ *
+ * Filename: irias_object.c
+ * Version: 0.3
+ * Description: IAS object database and functions
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Thu Oct 1 22:50:04 1998
+ * Modified at: Wed Dec 15 11:23:16 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irias_object.h>
+
+hashbin_t *irias_objects;
+
+/*
+ * Used when a missing value needs to be returned
+ */
+struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}};
+
+
+/*
+ * Function ias_new_object (name, id)
+ *
+ * Create a new IAS object
+ *
+ */
+struct ias_object *irias_new_object( char *name, int id)
+{
+ struct ias_object *obj;
+
+ obj = kzalloc(sizeof(struct ias_object), GFP_ATOMIC);
+ if (obj == NULL) {
+ net_warn_ratelimited("%s(), Unable to allocate object!\n",
+ __func__);
+ return NULL;
+ }
+
+ obj->magic = IAS_OBJECT_MAGIC;
+ obj->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC);
+ if (!obj->name) {
+ net_warn_ratelimited("%s(), Unable to allocate name!\n",
+ __func__);
+ kfree(obj);
+ return NULL;
+ }
+ obj->id = id;
+
+ /* Locking notes : the attrib spinlock has lower precendence
+ * than the objects spinlock. Never grap the objects spinlock
+ * while holding any attrib spinlock (risk of deadlock). Jean II */
+ obj->attribs = hashbin_new(HB_LOCK);
+
+ if (obj->attribs == NULL) {
+ net_warn_ratelimited("%s(), Unable to allocate attribs!\n",
+ __func__);
+ kfree(obj->name);
+ kfree(obj);
+ return NULL;
+ }
+
+ return obj;
+}
+EXPORT_SYMBOL(irias_new_object);
+
+/*
+ * Function irias_delete_attrib (attrib)
+ *
+ * Delete given attribute and deallocate all its memory
+ *
+ */
+static void __irias_delete_attrib(struct ias_attrib *attrib)
+{
+ IRDA_ASSERT(attrib != NULL, return;);
+ IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;);
+
+ kfree(attrib->name);
+
+ irias_delete_value(attrib->value);
+ attrib->magic = ~IAS_ATTRIB_MAGIC;
+
+ kfree(attrib);
+}
+
+void __irias_delete_object(struct ias_object *obj)
+{
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+ kfree(obj->name);
+
+ hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib);
+
+ obj->magic = ~IAS_OBJECT_MAGIC;
+
+ kfree(obj);
+}
+
+/*
+ * Function irias_delete_object (obj)
+ *
+ * Remove object from hashbin and deallocate all attributes associated with
+ * with this object and the object itself
+ *
+ */
+int irias_delete_object(struct ias_object *obj)
+{
+ struct ias_object *node;
+
+ IRDA_ASSERT(obj != NULL, return -1;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;);
+
+ /* Remove from list */
+ node = hashbin_remove_this(irias_objects, (irda_queue_t *) obj);
+ if (!node)
+ pr_debug("%s(), object already removed!\n",
+ __func__);
+
+ /* Destroy */
+ __irias_delete_object(obj);
+
+ return 0;
+}
+EXPORT_SYMBOL(irias_delete_object);
+
+/*
+ * Function irias_delete_attrib (obj)
+ *
+ * Remove attribute from hashbin and, if it was the last attribute of
+ * the object, remove the object as well.
+ *
+ */
+int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib,
+ int cleanobject)
+{
+ struct ias_attrib *node;
+
+ IRDA_ASSERT(obj != NULL, return -1;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;);
+ IRDA_ASSERT(attrib != NULL, return -1;);
+
+ /* Remove attribute from object */
+ node = hashbin_remove_this(obj->attribs, (irda_queue_t *) attrib);
+ if (!node)
+ return 0; /* Already removed or non-existent */
+
+ /* Deallocate attribute */
+ __irias_delete_attrib(node);
+
+ /* Check if object has still some attributes, destroy it if none.
+ * At first glance, this look dangerous, as the kernel reference
+ * various IAS objects. However, we only use this function on
+ * user attributes, not kernel attributes, so there is no risk
+ * of deleting a kernel object this way. Jean II */
+ node = (struct ias_attrib *) hashbin_get_first(obj->attribs);
+ if (cleanobject && !node)
+ irias_delete_object(obj);
+
+ return 0;
+}
+
+/*
+ * Function irias_insert_object (obj)
+ *
+ * Insert an object into the LM-IAS database
+ *
+ */
+void irias_insert_object(struct ias_object *obj)
+{
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+ hashbin_insert(irias_objects, (irda_queue_t *) obj, 0, obj->name);
+}
+EXPORT_SYMBOL(irias_insert_object);
+
+/*
+ * Function irias_find_object (name)
+ *
+ * Find object with given name
+ *
+ */
+struct ias_object *irias_find_object(char *name)
+{
+ IRDA_ASSERT(name != NULL, return NULL;);
+
+ /* Unsafe (locking), object might change */
+ return hashbin_lock_find(irias_objects, 0, name);
+}
+EXPORT_SYMBOL(irias_find_object);
+
+/*
+ * Function irias_find_attrib (obj, name)
+ *
+ * Find named attribute in object
+ *
+ */
+struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name)
+{
+ struct ias_attrib *attrib;
+
+ IRDA_ASSERT(obj != NULL, return NULL;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return NULL;);
+ IRDA_ASSERT(name != NULL, return NULL;);
+
+ attrib = hashbin_lock_find(obj->attribs, 0, name);
+ if (attrib == NULL)
+ return NULL;
+
+ /* Unsafe (locking), attrib might change */
+ return attrib;
+}
+
+/*
+ * Function irias_add_attribute (obj, attrib)
+ *
+ * Add attribute to object
+ *
+ */
+static void irias_add_attrib(struct ias_object *obj, struct ias_attrib *attrib,
+ int owner)
+{
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+ IRDA_ASSERT(attrib != NULL, return;);
+ IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;);
+
+ /* Set if attrib is owned by kernel or user space */
+ attrib->value->owner = owner;
+
+ hashbin_insert(obj->attribs, (irda_queue_t *) attrib, 0, attrib->name);
+}
+
+/*
+ * Function irias_object_change_attribute (obj_name, attrib_name, new_value)
+ *
+ * Change the value of an objects attribute.
+ *
+ */
+int irias_object_change_attribute(char *obj_name, char *attrib_name,
+ struct ias_value *new_value)
+{
+ struct ias_object *obj;
+ struct ias_attrib *attrib;
+ unsigned long flags;
+
+ /* Find object */
+ obj = hashbin_lock_find(irias_objects, 0, obj_name);
+ if (obj == NULL) {
+ net_warn_ratelimited("%s: Unable to find object: %s\n",
+ __func__, obj_name);
+ return -1;
+ }
+
+ /* Slightly unsafe (obj might get removed under us) */
+ spin_lock_irqsave(&obj->attribs->hb_spinlock, flags);
+
+ /* Find attribute */
+ attrib = hashbin_find(obj->attribs, 0, attrib_name);
+ if (attrib == NULL) {
+ net_warn_ratelimited("%s: Unable to find attribute: %s\n",
+ __func__, attrib_name);
+ spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags);
+ return -1;
+ }
+
+ if ( attrib->value->type != new_value->type) {
+ pr_debug("%s(), changing value type not allowed!\n",
+ __func__);
+ spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags);
+ return -1;
+ }
+
+ /* Delete old value */
+ irias_delete_value(attrib->value);
+
+ /* Insert new value */
+ attrib->value = new_value;
+
+ /* Success */
+ spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(irias_object_change_attribute);
+
+/*
+ * Function irias_object_add_integer_attrib (obj, name, value)
+ *
+ * Add an integer attribute to an LM-IAS object
+ *
+ */
+void irias_add_integer_attrib(struct ias_object *obj, char *name, int value,
+ int owner)
+{
+ struct ias_attrib *attrib;
+
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+ IRDA_ASSERT(name != NULL, return;);
+
+ attrib = kzalloc(sizeof(struct ias_attrib), GFP_ATOMIC);
+ if (attrib == NULL) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ return;
+ }
+
+ attrib->magic = IAS_ATTRIB_MAGIC;
+ attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+
+ /* Insert value */
+ attrib->value = irias_new_integer_value(value);
+ if (!attrib->name || !attrib->value) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ if (attrib->value)
+ irias_delete_value(attrib->value);
+ kfree(attrib->name);
+ kfree(attrib);
+ return;
+ }
+
+ irias_add_attrib(obj, attrib, owner);
+}
+EXPORT_SYMBOL(irias_add_integer_attrib);
+
+ /*
+ * Function irias_add_octseq_attrib (obj, name, octet_seq, len)
+ *
+ * Add a octet sequence attribute to an LM-IAS object
+ *
+ */
+
+void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets,
+ int len, int owner)
+{
+ struct ias_attrib *attrib;
+
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+ IRDA_ASSERT(name != NULL, return;);
+ IRDA_ASSERT(octets != NULL, return;);
+
+ attrib = kzalloc(sizeof(struct ias_attrib), GFP_ATOMIC);
+ if (attrib == NULL) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ return;
+ }
+
+ attrib->magic = IAS_ATTRIB_MAGIC;
+ attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+
+ attrib->value = irias_new_octseq_value( octets, len);
+ if (!attrib->name || !attrib->value) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ if (attrib->value)
+ irias_delete_value(attrib->value);
+ kfree(attrib->name);
+ kfree(attrib);
+ return;
+ }
+
+ irias_add_attrib(obj, attrib, owner);
+}
+EXPORT_SYMBOL(irias_add_octseq_attrib);
+
+/*
+ * Function irias_object_add_string_attrib (obj, string)
+ *
+ * Add a string attribute to an LM-IAS object
+ *
+ */
+void irias_add_string_attrib(struct ias_object *obj, char *name, char *value,
+ int owner)
+{
+ struct ias_attrib *attrib;
+
+ IRDA_ASSERT(obj != NULL, return;);
+ IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+ IRDA_ASSERT(name != NULL, return;);
+ IRDA_ASSERT(value != NULL, return;);
+
+ attrib = kzalloc(sizeof( struct ias_attrib), GFP_ATOMIC);
+ if (attrib == NULL) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ return;
+ }
+
+ attrib->magic = IAS_ATTRIB_MAGIC;
+ attrib->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+
+ attrib->value = irias_new_string_value(value);
+ if (!attrib->name || !attrib->value) {
+ net_warn_ratelimited("%s: Unable to allocate attribute!\n",
+ __func__);
+ if (attrib->value)
+ irias_delete_value(attrib->value);
+ kfree(attrib->name);
+ kfree(attrib);
+ return;
+ }
+
+ irias_add_attrib(obj, attrib, owner);
+}
+EXPORT_SYMBOL(irias_add_string_attrib);
+
+/*
+ * Function irias_new_integer_value (integer)
+ *
+ * Create new IAS integer value
+ *
+ */
+struct ias_value *irias_new_integer_value(int integer)
+{
+ struct ias_value *value;
+
+ value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC);
+ if (value == NULL)
+ return NULL;
+
+ value->type = IAS_INTEGER;
+ value->len = 4;
+ value->t.integer = integer;
+
+ return value;
+}
+EXPORT_SYMBOL(irias_new_integer_value);
+
+/*
+ * Function irias_new_string_value (string)
+ *
+ * Create new IAS string value
+ *
+ * Per IrLMP 1.1, 4.3.3.2, strings are up to 256 chars - Jean II
+ */
+struct ias_value *irias_new_string_value(char *string)
+{
+ struct ias_value *value;
+
+ value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC);
+ if (value == NULL)
+ return NULL;
+
+ value->type = IAS_STRING;
+ value->charset = CS_ASCII;
+ value->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC);
+ if (!value->t.string) {
+ net_warn_ratelimited("%s: Unable to kmalloc!\n", __func__);
+ kfree(value);
+ return NULL;
+ }
+
+ value->len = strlen(value->t.string);
+
+ return value;
+}
+
+/*
+ * Function irias_new_octseq_value (octets, len)
+ *
+ * Create new IAS octet-sequence value
+ *
+ * Per IrLMP 1.1, 4.3.3.2, octet-sequence are up to 1024 bytes - Jean II
+ */
+struct ias_value *irias_new_octseq_value(__u8 *octseq , int len)
+{
+ struct ias_value *value;
+
+ value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC);
+ if (value == NULL)
+ return NULL;
+
+ value->type = IAS_OCT_SEQ;
+ /* Check length */
+ if(len > IAS_MAX_OCTET_STRING)
+ len = IAS_MAX_OCTET_STRING;
+ value->len = len;
+
+ value->t.oct_seq = kmemdup(octseq, len, GFP_ATOMIC);
+ if (value->t.oct_seq == NULL){
+ net_warn_ratelimited("%s: Unable to kmalloc!\n", __func__);
+ kfree(value);
+ return NULL;
+ }
+ return value;
+}
+
+struct ias_value *irias_new_missing_value(void)
+{
+ struct ias_value *value;
+
+ value = kzalloc(sizeof(struct ias_value), GFP_ATOMIC);
+ if (value == NULL)
+ return NULL;
+
+ value->type = IAS_MISSING;
+
+ return value;
+}
+
+/*
+ * Function irias_delete_value (value)
+ *
+ * Delete IAS value
+ *
+ */
+void irias_delete_value(struct ias_value *value)
+{
+ IRDA_ASSERT(value != NULL, return;);
+
+ switch (value->type) {
+ case IAS_INTEGER: /* Fallthrough */
+ case IAS_MISSING:
+ /* No need to deallocate */
+ break;
+ case IAS_STRING:
+ /* Deallocate string */
+ kfree(value->t.string);
+ break;
+ case IAS_OCT_SEQ:
+ /* Deallocate byte stream */
+ kfree(value->t.oct_seq);
+ break;
+ default:
+ pr_debug("%s(), Unknown value type!\n", __func__);
+ break;
+ }
+ kfree(value);
+}
+EXPORT_SYMBOL(irias_delete_value);
diff --git a/net/irda/irlan/Kconfig b/net/irda/irlan/Kconfig
new file mode 100644
index 000000000..951abc2e3
--- /dev/null
+++ b/net/irda/irlan/Kconfig
@@ -0,0 +1,14 @@
+config IRLAN
+ tristate "IrLAN protocol"
+ depends on IRDA
+ help
+ Say Y here if you want to build support for the IrLAN protocol.
+ To compile it as a module, choose M here: the module will be called
+ irlan. IrLAN emulates an Ethernet and makes it possible to put up
+ a wireless LAN using infrared beams.
+
+ The IrLAN protocol can be used to talk with infrared access points
+ like the HP NetbeamIR, or the ESI JetEye NET. You can also connect
+ to another Linux machine running the IrLAN protocol for ad-hoc
+ networking!
+
diff --git a/net/irda/irlan/Makefile b/net/irda/irlan/Makefile
new file mode 100644
index 000000000..94eefbc8e
--- /dev/null
+++ b/net/irda/irlan/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux IrDA IrLAN protocol layer.
+#
+
+obj-$(CONFIG_IRLAN) += irlan.o
+
+irlan-y := irlan_common.o irlan_eth.o irlan_event.o irlan_client.o irlan_provider.o irlan_filter.o irlan_provider_event.o irlan_client_event.o
diff --git a/net/irda/irlan/irlan_client.c b/net/irda/irlan/irlan_client.c
new file mode 100644
index 000000000..c5837a40c
--- /dev/null
+++ b/net/irda/irlan/irlan_client.c
@@ -0,0 +1,559 @@
+/*********************************************************************
+ *
+ * Filename: irlan_client.c
+ * Version: 0.9
+ * Description: IrDA LAN Access Protocol (IrLAN) Client
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Tue Dec 14 15:47:02 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Sources: skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ * slip.c by Laurence Culhane, <loz@holmes.demon.co.uk>
+ * Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/bitops.h>
+#include <net/arp.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_client.h>
+
+#undef CONFIG_IRLAN_GRATUITOUS_ARP
+
+static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *);
+static int irlan_client_ctrl_data_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static void irlan_client_ctrl_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *);
+static void irlan_check_response_param(struct irlan_cb *self, char *param,
+ char *value, int val_len);
+static void irlan_client_open_ctrl_tsap(struct irlan_cb *self);
+
+static void irlan_client_kick_timer_expired(void *data)
+{
+ struct irlan_cb *self = (struct irlan_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /*
+ * If we are in peer mode, the client may not have got the discovery
+ * indication it needs to make progress. If the client is still in
+ * IDLE state, we must kick it to, but only if the provider is not IDLE
+ */
+ if ((self->provider.access_type == ACCESS_PEER) &&
+ (self->client.state == IRLAN_IDLE) &&
+ (self->provider.state != IRLAN_IDLE)) {
+ irlan_client_wakeup(self, self->saddr, self->daddr);
+ }
+}
+
+static void irlan_client_start_kick_timer(struct irlan_cb *self, int timeout)
+{
+ irda_start_timer(&self->client.kick_timer, timeout, (void *) self,
+ irlan_client_kick_timer_expired);
+}
+
+/*
+ * Function irlan_client_wakeup (self, saddr, daddr)
+ *
+ * Wake up client
+ *
+ */
+void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /*
+ * Check if we are already awake, or if we are a provider in direct
+ * mode (in that case we must leave the client idle
+ */
+ if ((self->client.state != IRLAN_IDLE) ||
+ (self->provider.access_type == ACCESS_DIRECT))
+ {
+ pr_debug("%s(), already awake!\n", __func__);
+ return;
+ }
+
+ /* Addresses may have changed! */
+ self->saddr = saddr;
+ self->daddr = daddr;
+
+ if (self->disconnect_reason == LM_USER_REQUEST) {
+ pr_debug("%s(), still stopped by user\n", __func__);
+ return;
+ }
+
+ /* Open TSAPs */
+ irlan_client_open_ctrl_tsap(self);
+ irlan_open_data_tsap(self);
+
+ irlan_do_client_event(self, IRLAN_DISCOVERY_INDICATION, NULL);
+
+ /* Start kick timer */
+ irlan_client_start_kick_timer(self, 2*HZ);
+}
+
+/*
+ * Function irlan_discovery_indication (daddr)
+ *
+ * Remote device with IrLAN server support discovered
+ *
+ */
+void irlan_client_discovery_indication(discinfo_t *discovery,
+ DISCOVERY_MODE mode,
+ void *priv)
+{
+ struct irlan_cb *self;
+ __u32 saddr, daddr;
+
+ IRDA_ASSERT(discovery != NULL, return;);
+
+ /*
+ * I didn't check it, but I bet that IrLAN suffer from the same
+ * deficiency as IrComm and doesn't handle two instances
+ * simultaneously connecting to each other.
+ * Same workaround, drop passive discoveries.
+ * Jean II */
+ if(mode == DISCOVERY_PASSIVE)
+ return;
+
+ saddr = discovery->saddr;
+ daddr = discovery->daddr;
+
+ /* Find instance */
+ rcu_read_lock();
+ self = irlan_get_any();
+ if (self) {
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, goto out;);
+
+ pr_debug("%s(), Found instance (%08x)!\n", __func__ ,
+ daddr);
+
+ irlan_client_wakeup(self, saddr, daddr);
+ }
+IRDA_ASSERT_LABEL(out:)
+ rcu_read_unlock();
+}
+
+/*
+ * Function irlan_client_data_indication (handle, skb)
+ *
+ * This function gets the data that is received on the control channel
+ *
+ */
+static int irlan_client_ctrl_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ irlan_do_client_event(self, IRLAN_DATA_INDICATION, skb);
+
+ /* Ready for a new command */
+ pr_debug("%s(), clearing tx_busy\n", __func__);
+ self->client.tx_busy = FALSE;
+
+ /* Check if we have some queued commands waiting to be sent */
+ irlan_run_ctrl_tx_queue(self);
+
+ return 0;
+}
+
+static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *userdata)
+{
+ struct irlan_cb *self;
+ struct tsap_cb *tsap;
+ struct sk_buff *skb;
+
+ pr_debug("%s(), reason=%d\n", __func__ , reason);
+
+ self = instance;
+ tsap = sap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+ IRDA_ASSERT(tsap != NULL, return;);
+ IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+ IRDA_ASSERT(tsap == self->client.tsap_ctrl, return;);
+
+ /* Remove frames queued on the control channel */
+ while ((skb = skb_dequeue(&self->client.txq)) != NULL) {
+ dev_kfree_skb(skb);
+ }
+ self->client.tx_busy = FALSE;
+
+ irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+}
+
+/*
+ * Function irlan_client_open_tsaps (self)
+ *
+ * Initialize callbacks and open IrTTP TSAPs
+ *
+ */
+static void irlan_client_open_ctrl_tsap(struct irlan_cb *self)
+{
+ struct tsap_cb *tsap;
+ notify_t notify;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Check if already open */
+ if (self->client.tsap_ctrl)
+ return;
+
+ irda_notify_init(&notify);
+
+ /* Set up callbacks */
+ notify.data_indication = irlan_client_ctrl_data_indication;
+ notify.connect_confirm = irlan_client_ctrl_connect_confirm;
+ notify.disconnect_indication = irlan_client_ctrl_disconnect_indication;
+ notify.instance = self;
+ strlcpy(notify.name, "IrLAN ctrl (c)", sizeof(notify.name));
+
+ tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, &notify);
+ if (!tsap) {
+ pr_debug("%s(), Got no tsap!\n", __func__);
+ return;
+ }
+ self->client.tsap_ctrl = tsap;
+}
+
+/*
+ * Function irlan_client_connect_confirm (handle, skb)
+ *
+ * Connection to peer IrLAN laye confirmed
+ *
+ */
+static void irlan_client_ctrl_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ self->client.max_sdu_size = max_sdu_size;
+ self->client.max_header_size = max_header_size;
+
+ /* TODO: we could set the MTU depending on the max_sdu_size */
+
+ irlan_do_client_event(self, IRLAN_CONNECT_COMPLETE, NULL);
+}
+
+/*
+ * Function print_ret_code (code)
+ *
+ * Print return code of request to peer IrLAN layer.
+ *
+ */
+static void print_ret_code(__u8 code)
+{
+ switch(code) {
+ case 0:
+ printk(KERN_INFO "Success\n");
+ break;
+ case 1:
+ net_warn_ratelimited("IrLAN: Insufficient resources\n");
+ break;
+ case 2:
+ net_warn_ratelimited("IrLAN: Invalid command format\n");
+ break;
+ case 3:
+ net_warn_ratelimited("IrLAN: Command not supported\n");
+ break;
+ case 4:
+ net_warn_ratelimited("IrLAN: Parameter not supported\n");
+ break;
+ case 5:
+ net_warn_ratelimited("IrLAN: Value not supported\n");
+ break;
+ case 6:
+ net_warn_ratelimited("IrLAN: Not open\n");
+ break;
+ case 7:
+ net_warn_ratelimited("IrLAN: Authentication required\n");
+ break;
+ case 8:
+ net_warn_ratelimited("IrLAN: Invalid password\n");
+ break;
+ case 9:
+ net_warn_ratelimited("IrLAN: Protocol error\n");
+ break;
+ case 255:
+ net_warn_ratelimited("IrLAN: Asynchronous status\n");
+ break;
+ }
+}
+
+/*
+ * Function irlan_client_parse_response (self, skb)
+ *
+ * Extract all parameters from received buffer, then feed them to
+ * check_params for parsing
+ */
+void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb)
+{
+ __u8 *frame;
+ __u8 *ptr;
+ int count;
+ int ret;
+ __u16 val_len;
+ int i;
+ char *name;
+ char *value;
+
+ IRDA_ASSERT(skb != NULL, return;);
+
+ pr_debug("%s() skb->len=%d\n", __func__ , (int)skb->len);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ if (!skb) {
+ net_err_ratelimited("%s(), Got NULL skb!\n", __func__);
+ return;
+ }
+ frame = skb->data;
+
+ /*
+ * Check return code and print it if not success
+ */
+ if (frame[0]) {
+ print_ret_code(frame[0]);
+ return;
+ }
+
+ name = kmalloc(255, GFP_ATOMIC);
+ if (!name)
+ return;
+ value = kmalloc(1016, GFP_ATOMIC);
+ if (!value) {
+ kfree(name);
+ return;
+ }
+
+ /* How many parameters? */
+ count = frame[1];
+
+ pr_debug("%s(), got %d parameters\n", __func__ , count);
+
+ ptr = frame+2;
+
+ /* For all parameters */
+ for (i=0; i<count;i++) {
+ ret = irlan_extract_param(ptr, name, value, &val_len);
+ if (ret < 0) {
+ pr_debug("%s(), IrLAN, Error!\n", __func__);
+ break;
+ }
+ ptr += ret;
+ irlan_check_response_param(self, name, value, val_len);
+ }
+ /* Cleanup */
+ kfree(name);
+ kfree(value);
+}
+
+/*
+ * Function irlan_check_response_param (self, param, value, val_len)
+ *
+ * Check which parameter is received and update local variables
+ *
+ */
+static void irlan_check_response_param(struct irlan_cb *self, char *param,
+ char *value, int val_len)
+{
+ __u16 tmp_cpu; /* Temporary value in host order */
+ __u8 *bytes;
+ int i;
+
+ pr_debug("%s(), parm=%s\n", __func__ , param);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Media type */
+ if (strcmp(param, "MEDIA") == 0) {
+ if (strcmp(value, "802.3") == 0)
+ self->media = MEDIA_802_3;
+ else
+ self->media = MEDIA_802_5;
+ return;
+ }
+ if (strcmp(param, "FILTER_TYPE") == 0) {
+ if (strcmp(value, "DIRECTED") == 0)
+ self->client.filter_type |= IRLAN_DIRECTED;
+ else if (strcmp(value, "FUNCTIONAL") == 0)
+ self->client.filter_type |= IRLAN_FUNCTIONAL;
+ else if (strcmp(value, "GROUP") == 0)
+ self->client.filter_type |= IRLAN_GROUP;
+ else if (strcmp(value, "MAC_FRAME") == 0)
+ self->client.filter_type |= IRLAN_MAC_FRAME;
+ else if (strcmp(value, "MULTICAST") == 0)
+ self->client.filter_type |= IRLAN_MULTICAST;
+ else if (strcmp(value, "BROADCAST") == 0)
+ self->client.filter_type |= IRLAN_BROADCAST;
+ else if (strcmp(value, "IPX_SOCKET") == 0)
+ self->client.filter_type |= IRLAN_IPX_SOCKET;
+
+ }
+ if (strcmp(param, "ACCESS_TYPE") == 0) {
+ if (strcmp(value, "DIRECT") == 0)
+ self->client.access_type = ACCESS_DIRECT;
+ else if (strcmp(value, "PEER") == 0)
+ self->client.access_type = ACCESS_PEER;
+ else if (strcmp(value, "HOSTED") == 0)
+ self->client.access_type = ACCESS_HOSTED;
+ else {
+ pr_debug("%s(), unknown access type!\n", __func__);
+ }
+ }
+ /* IRLAN version */
+ if (strcmp(param, "IRLAN_VER") == 0) {
+ pr_debug("IrLAN version %d.%d\n", (__u8)value[0],
+ (__u8)value[1]);
+
+ self->version[0] = value[0];
+ self->version[1] = value[1];
+ return;
+ }
+ /* Which remote TSAP to use for data channel */
+ if (strcmp(param, "DATA_CHAN") == 0) {
+ self->dtsap_sel_data = value[0];
+ pr_debug("Data TSAP = %02x\n", self->dtsap_sel_data);
+ return;
+ }
+ if (strcmp(param, "CON_ARB") == 0) {
+ memcpy(&tmp_cpu, value, 2); /* Align value */
+ le16_to_cpus(&tmp_cpu); /* Convert to host order */
+ self->client.recv_arb_val = tmp_cpu;
+ pr_debug("%s(), receive arb val=%d\n", __func__ ,
+ self->client.recv_arb_val);
+ }
+ if (strcmp(param, "MAX_FRAME") == 0) {
+ memcpy(&tmp_cpu, value, 2); /* Align value */
+ le16_to_cpus(&tmp_cpu); /* Convert to host order */
+ self->client.max_frame = tmp_cpu;
+ pr_debug("%s(), max frame=%d\n", __func__ ,
+ self->client.max_frame);
+ }
+
+ /* RECONNECT_KEY, in case the link goes down! */
+ if (strcmp(param, "RECONNECT_KEY") == 0) {
+ pr_debug("Got reconnect key: ");
+ /* for (i = 0; i < val_len; i++) */
+/* printk("%02x", value[i]); */
+ memcpy(self->client.reconnect_key, value, val_len);
+ self->client.key_len = val_len;
+ pr_debug("\n");
+ }
+ /* FILTER_ENTRY, have we got an ethernet address? */
+ if (strcmp(param, "FILTER_ENTRY") == 0) {
+ bytes = value;
+ pr_debug("Ethernet address = %pM\n", bytes);
+ for (i = 0; i < 6; i++)
+ self->dev->dev_addr[i] = bytes[i];
+ }
+}
+
+/*
+ * Function irlan_client_get_value_confirm (obj_id, value)
+ *
+ * Got results from remote LM-IAS
+ *
+ */
+void irlan_client_get_value_confirm(int result, __u16 obj_id,
+ struct ias_value *value, void *priv)
+{
+ struct irlan_cb *self;
+
+ IRDA_ASSERT(priv != NULL, return;);
+
+ self = priv;
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* We probably don't need to make any more queries */
+ iriap_close(self->client.iriap);
+ self->client.iriap = NULL;
+
+ /* Check if request succeeded */
+ if (result != IAS_SUCCESS) {
+ pr_debug("%s(), got NULL value!\n", __func__);
+ irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL,
+ NULL);
+ return;
+ }
+
+ switch (value->type) {
+ case IAS_INTEGER:
+ self->dtsap_sel_ctrl = value->t.integer;
+
+ if (value->t.integer != -1) {
+ irlan_do_client_event(self, IRLAN_IAS_PROVIDER_AVAIL,
+ NULL);
+ return;
+ }
+ irias_delete_value(value);
+ break;
+ default:
+ pr_debug("%s(), unknown type!\n", __func__);
+ break;
+ }
+ irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL, NULL);
+}
diff --git a/net/irda/irlan/irlan_client_event.c b/net/irda/irlan/irlan_client_event.c
new file mode 100644
index 000000000..cc93fabbb
--- /dev/null
+++ b/net/irda/irlan/irlan_client_event.c
@@ -0,0 +1,511 @@
+/*********************************************************************
+ *
+ * Filename: irlan_client_event.c
+ * Version: 0.9
+ * Description: IrLAN client state machine
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Sun Dec 26 21:52:24 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/irmod.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_event.h>
+
+static int irlan_client_state_idle (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_conn (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_info (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_open (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_wait (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_arb (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_data (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_client_state_sync (struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+
+static int (*state[])(struct irlan_cb *, IRLAN_EVENT event, struct sk_buff *) =
+{
+ irlan_client_state_idle,
+ irlan_client_state_query,
+ irlan_client_state_conn,
+ irlan_client_state_info,
+ irlan_client_state_media,
+ irlan_client_state_open,
+ irlan_client_state_wait,
+ irlan_client_state_arb,
+ irlan_client_state_data,
+ irlan_client_state_close,
+ irlan_client_state_sync
+};
+
+void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ (*state[ self->client.state]) (self, event, skb);
+}
+
+/*
+ * Function irlan_client_state_idle (event, skb, info)
+ *
+ * IDLE, We are waiting for an indication that there is a provider
+ * available.
+ */
+static int irlan_client_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ switch (event) {
+ case IRLAN_DISCOVERY_INDICATION:
+ if (self->client.iriap) {
+ net_warn_ratelimited("%s(), busy with a previous query\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ self->client.iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ irlan_client_get_value_confirm);
+ /* Get some values from peer IAS */
+ irlan_next_client_state(self, IRLAN_QUERY);
+ iriap_getvaluebyclass_request(self->client.iriap,
+ self->saddr, self->daddr,
+ "IrLAN", "IrDA:TinyTP:LsapSel");
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_query (event, skb, info)
+ *
+ * QUERY, We have queryed the remote IAS and is ready to connect
+ * to provider, just waiting for the confirm.
+ *
+ */
+static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ switch(event) {
+ case IRLAN_IAS_PROVIDER_AVAIL:
+ IRDA_ASSERT(self->dtsap_sel_ctrl != 0, return -1;);
+
+ self->client.open_retries = 0;
+
+ irttp_connect_request(self->client.tsap_ctrl,
+ self->dtsap_sel_ctrl,
+ self->saddr, self->daddr, NULL,
+ IRLAN_MTU, NULL);
+ irlan_next_client_state(self, IRLAN_CONN);
+ break;
+ case IRLAN_IAS_PROVIDER_NOT_AVAIL:
+ pr_debug("%s(), IAS_PROVIDER_NOT_AVAIL\n", __func__);
+ irlan_next_client_state(self, IRLAN_IDLE);
+
+ /* Give the client a kick! */
+ if ((self->provider.access_type == ACCESS_PEER) &&
+ (self->provider.state != IRLAN_IDLE))
+ irlan_client_wakeup(self, self->saddr, self->daddr);
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_conn (event, skb, info)
+ *
+ * CONN, We have connected to a provider but has not issued any
+ * commands yet.
+ *
+ */
+static int irlan_client_state_conn(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch (event) {
+ case IRLAN_CONNECT_COMPLETE:
+ /* Send getinfo cmd */
+ irlan_get_provider_info(self);
+ irlan_next_client_state(self, IRLAN_INFO);
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_info (self, event, skb, info)
+ *
+ * INFO, We have issued a GetInfo command and is awaiting a reply.
+ */
+static int irlan_client_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch (event) {
+ case IRLAN_DATA_INDICATION:
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ irlan_client_parse_response(self, skb);
+
+ irlan_next_client_state(self, IRLAN_MEDIA);
+
+ irlan_get_media_char(self);
+ break;
+
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_media (self, event, skb, info)
+ *
+ * MEDIA, The irlan_client has issued a GetMedia command and is awaiting a
+ * reply.
+ *
+ */
+static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_DATA_INDICATION:
+ irlan_client_parse_response(self, skb);
+ irlan_open_data_channel(self);
+ irlan_next_client_state(self, IRLAN_OPEN);
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_open (self, event, skb, info)
+ *
+ * OPEN, The irlan_client has issued a OpenData command and is awaiting a
+ * reply
+ *
+ */
+static int irlan_client_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ struct qos_info qos;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_DATA_INDICATION:
+ irlan_client_parse_response(self, skb);
+
+ /*
+ * Check if we have got the remote TSAP for data
+ * communications
+ */
+ IRDA_ASSERT(self->dtsap_sel_data != 0, return -1;);
+
+ /* Check which access type we are dealing with */
+ switch (self->client.access_type) {
+ case ACCESS_PEER:
+ if (self->provider.state == IRLAN_OPEN) {
+
+ irlan_next_client_state(self, IRLAN_ARB);
+ irlan_do_client_event(self, IRLAN_CHECK_CON_ARB,
+ NULL);
+ } else {
+
+ irlan_next_client_state(self, IRLAN_WAIT);
+ }
+ break;
+ case ACCESS_DIRECT:
+ case ACCESS_HOSTED:
+ qos.link_disc_time.bits = 0x01; /* 3 secs */
+
+ irttp_connect_request(self->tsap_data,
+ self->dtsap_sel_data,
+ self->saddr, self->daddr, &qos,
+ IRLAN_MTU, NULL);
+
+ irlan_next_client_state(self, IRLAN_DATA);
+ break;
+ default:
+ pr_debug("%s(), unknown access type!\n", __func__);
+ break;
+ }
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_wait (self, event, skb, info)
+ *
+ * WAIT, The irlan_client is waiting for the local provider to enter the
+ * provider OPEN state.
+ *
+ */
+static int irlan_client_state_wait(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_PROVIDER_SIGNAL:
+ irlan_next_client_state(self, IRLAN_ARB);
+ irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, NULL);
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+static int irlan_client_state_arb(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ struct qos_info qos;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_CHECK_CON_ARB:
+ if (self->client.recv_arb_val == self->provider.send_arb_val) {
+ irlan_next_client_state(self, IRLAN_CLOSE);
+ irlan_close_data_channel(self);
+ } else if (self->client.recv_arb_val <
+ self->provider.send_arb_val)
+ {
+ qos.link_disc_time.bits = 0x01; /* 3 secs */
+
+ irlan_next_client_state(self, IRLAN_DATA);
+ irttp_connect_request(self->tsap_data,
+ self->dtsap_sel_data,
+ self->saddr, self->daddr, &qos,
+ IRLAN_MTU, NULL);
+ } else if (self->client.recv_arb_val >
+ self->provider.send_arb_val)
+ {
+ pr_debug("%s(), lost the battle :-(\n", __func__);
+ }
+ break;
+ case IRLAN_DATA_CONNECT_INDICATION:
+ irlan_next_client_state(self, IRLAN_DATA);
+ break;
+ case IRLAN_LMP_DISCONNECT:
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ case IRLAN_WATCHDOG_TIMEOUT:
+ pr_debug("%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_data (self, event, skb, info)
+ *
+ * DATA, The data channel is connected, allowing data transfers between
+ * the local and remote machines.
+ *
+ */
+static int irlan_client_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ switch(event) {
+ case IRLAN_DATA_INDICATION:
+ irlan_client_parse_response(self, skb);
+ break;
+ case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_client_state(self, IRLAN_IDLE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_close (self, event, skb, info)
+ *
+ *
+ *
+ */
+static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_client_state_sync (self, event, skb, info)
+ *
+ *
+ *
+ */
+static int irlan_client_state_sync(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
new file mode 100644
index 000000000..481bbc2a4
--- /dev/null
+++ b/net/irda/irlan/irlan_common.c
@@ -0,0 +1,1176 @@
+/*********************************************************************
+ *
+ * Filename: irlan_common.c
+ * Version: 0.9
+ * Description: IrDA LAN Access Protocol Implementation
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Sun Dec 26 21:53:10 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1997, 1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_filter.h>
+
+
+/* extern char sysctl_devname[]; */
+
+/*
+ * Master structure
+ */
+static LIST_HEAD(irlans);
+
+static void *ckey;
+static void *skey;
+
+/* Module parameters */
+static bool eth; /* Use "eth" or "irlan" name for devices */
+static int access = ACCESS_PEER; /* PEER, DIRECT or HOSTED */
+
+#ifdef CONFIG_PROC_FS
+static const char *const irlan_access[] = {
+ "UNKNOWN",
+ "DIRECT",
+ "PEER",
+ "HOSTED"
+};
+
+static const char *const irlan_media[] = {
+ "UNKNOWN",
+ "802.3",
+ "802.5"
+};
+
+extern struct proc_dir_entry *proc_irda;
+
+static int irlan_seq_open(struct inode *inode, struct file *file);
+
+static const struct file_operations irlan_fops = {
+ .owner = THIS_MODULE,
+ .open = irlan_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+extern struct proc_dir_entry *proc_irda;
+#endif /* CONFIG_PROC_FS */
+
+static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr);
+static void __irlan_close(struct irlan_cb *self);
+static int __irlan_insert_param(struct sk_buff *skb, char *param, int type,
+ __u8 value_byte, __u16 value_short,
+ __u8 *value_array, __u16 value_len);
+static void irlan_open_unicast_addr(struct irlan_cb *self);
+static void irlan_get_unicast_addr(struct irlan_cb *self);
+void irlan_close_tsaps(struct irlan_cb *self);
+
+/*
+ * Function irlan_init (void)
+ *
+ * Initialize IrLAN layer
+ *
+ */
+static int __init irlan_init(void)
+{
+ struct irlan_cb *new;
+ __u16 hints;
+
+#ifdef CONFIG_PROC_FS
+ { struct proc_dir_entry *proc;
+ proc = proc_create("irlan", 0, proc_irda, &irlan_fops);
+ if (!proc) {
+ printk(KERN_ERR "irlan_init: can't create /proc entry!\n");
+ return -ENODEV;
+ }
+ }
+#endif /* CONFIG_PROC_FS */
+
+ hints = irlmp_service_to_hint(S_LAN);
+
+ /* Register with IrLMP as a client */
+ ckey = irlmp_register_client(hints, &irlan_client_discovery_indication,
+ NULL, NULL);
+ if (!ckey)
+ goto err_ckey;
+
+ /* Register with IrLMP as a service */
+ skey = irlmp_register_service(hints);
+ if (!skey)
+ goto err_skey;
+
+ /* Start the master IrLAN instance (the only one for now) */
+ new = irlan_open(DEV_ADDR_ANY, DEV_ADDR_ANY);
+ if (!new)
+ goto err_open;
+
+ /* The master will only open its (listen) control TSAP */
+ irlan_provider_open_ctrl_tsap(new);
+
+ /* Do some fast discovery! */
+ irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+
+ return 0;
+
+err_open:
+ irlmp_unregister_service(skey);
+err_skey:
+ irlmp_unregister_client(ckey);
+err_ckey:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("irlan", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+ return -ENOMEM;
+}
+
+static void __exit irlan_cleanup(void)
+{
+ struct irlan_cb *self, *next;
+
+ irlmp_unregister_client(ckey);
+ irlmp_unregister_service(skey);
+
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("irlan", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+ /* Cleanup any leftover network devices */
+ rtnl_lock();
+ list_for_each_entry_safe(self, next, &irlans, dev_list) {
+ __irlan_close(self);
+ }
+ rtnl_unlock();
+}
+
+/*
+ * Function irlan_open (void)
+ *
+ * Open new instance of a client/provider, we should only register the
+ * network device if this instance is ment for a particular client/provider
+ */
+static struct irlan_cb __init *irlan_open(__u32 saddr, __u32 daddr)
+{
+ struct net_device *dev;
+ struct irlan_cb *self;
+
+ /* Create network device with irlan */
+ dev = alloc_irlandev(eth ? "eth%d" : "irlan%d");
+ if (!dev)
+ return NULL;
+
+ self = netdev_priv(dev);
+ self->dev = dev;
+
+ /*
+ * Initialize local device structure
+ */
+ self->magic = IRLAN_MAGIC;
+ self->saddr = saddr;
+ self->daddr = daddr;
+
+ /* Provider access can only be PEER, DIRECT, or HOSTED */
+ self->provider.access_type = access;
+ if (access == ACCESS_DIRECT) {
+ /*
+ * Since we are emulating an IrLAN sever we will have to
+ * give ourself an ethernet address!
+ */
+ dev->dev_addr[0] = 0x40;
+ dev->dev_addr[1] = 0x00;
+ dev->dev_addr[2] = 0x00;
+ dev->dev_addr[3] = 0x00;
+ get_random_bytes(dev->dev_addr+4, 1);
+ get_random_bytes(dev->dev_addr+5, 1);
+ }
+
+ self->media = MEDIA_802_3;
+ self->disconnect_reason = LM_USER_REQUEST;
+ init_timer(&self->watchdog_timer);
+ init_timer(&self->client.kick_timer);
+ init_waitqueue_head(&self->open_wait);
+
+ skb_queue_head_init(&self->client.txq);
+
+ irlan_next_client_state(self, IRLAN_IDLE);
+ irlan_next_provider_state(self, IRLAN_IDLE);
+
+ if (register_netdev(dev)) {
+ pr_debug("%s(), register_netdev() failed!\n",
+ __func__);
+ self = NULL;
+ free_netdev(dev);
+ } else {
+ rtnl_lock();
+ list_add_rcu(&self->dev_list, &irlans);
+ rtnl_unlock();
+ }
+
+ return self;
+}
+/*
+ * Function __irlan_close (self)
+ *
+ * This function closes and deallocates the IrLAN client instances. Be
+ * aware that other functions which calls client_close() must
+ * remove self from irlans list first.
+ */
+static void __irlan_close(struct irlan_cb *self)
+{
+ ASSERT_RTNL();
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ del_timer_sync(&self->watchdog_timer);
+ del_timer_sync(&self->client.kick_timer);
+
+ /* Close all open connections and remove TSAPs */
+ irlan_close_tsaps(self);
+
+ if (self->client.iriap)
+ iriap_close(self->client.iriap);
+
+ /* Remove frames queued on the control channel */
+ skb_queue_purge(&self->client.txq);
+
+ /* Unregister and free self via destructor */
+ unregister_netdevice(self->dev);
+}
+
+/* Find any instance of irlan, used for client discovery wakeup */
+struct irlan_cb *irlan_get_any(void)
+{
+ struct irlan_cb *self;
+
+ list_for_each_entry_rcu(self, &irlans, dev_list) {
+ return self;
+ }
+ return NULL;
+}
+
+/*
+ * Function irlan_connect_indication (instance, sap, qos, max_sdu_size, skb)
+ *
+ * Here we receive the connect indication for the data channel
+ *
+ */
+static void irlan_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+ struct tsap_cb *tsap;
+
+ self = instance;
+ tsap = sap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+ IRDA_ASSERT(tsap == self->tsap_data,return;);
+
+ self->max_sdu_size = max_sdu_size;
+ self->max_header_size = max_header_size;
+
+ pr_debug("%s: We are now connected!\n", __func__);
+
+ del_timer(&self->watchdog_timer);
+
+ /* If you want to pass the skb to *both* state machines, you will
+ * need to skb_clone() it, so that you don't free it twice.
+ * As the state machines don't need it, git rid of it here...
+ * Jean II */
+ if (skb)
+ dev_kfree_skb(skb);
+
+ irlan_do_provider_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL);
+ irlan_do_client_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL);
+
+ if (self->provider.access_type == ACCESS_PEER) {
+ /*
+ * Data channel is open, so we are now allowed to
+ * configure the remote filter
+ */
+ irlan_get_unicast_addr(self);
+ irlan_open_unicast_addr(self);
+ }
+ /* Ready to transfer Ethernet frames (at last) */
+ netif_start_queue(self->dev); /* Clear reason */
+}
+
+static void irlan_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ self->max_sdu_size = max_sdu_size;
+ self->max_header_size = max_header_size;
+
+ /* TODO: we could set the MTU depending on the max_sdu_size */
+
+ pr_debug("%s: We are now connected!\n", __func__);
+ del_timer(&self->watchdog_timer);
+
+ /*
+ * Data channel is open, so we are now allowed to configure the remote
+ * filter
+ */
+ irlan_get_unicast_addr(self);
+ irlan_open_unicast_addr(self);
+
+ /* Open broadcast and multicast filter by default */
+ irlan_set_broadcast_filter(self, TRUE);
+ irlan_set_multicast_filter(self, TRUE);
+
+ /* Ready to transfer Ethernet frames */
+ netif_start_queue(self->dev);
+ self->disconnect_reason = 0; /* Clear reason */
+ wake_up_interruptible(&self->open_wait);
+}
+
+/*
+ * Function irlan_client_disconnect_indication (handle)
+ *
+ * Callback function for the IrTTP layer. Indicates a disconnection of
+ * the specified connection (handle)
+ */
+static void irlan_disconnect_indication(void *instance,
+ void *sap, LM_REASON reason,
+ struct sk_buff *userdata)
+{
+ struct irlan_cb *self;
+ struct tsap_cb *tsap;
+
+ pr_debug("%s(), reason=%d\n", __func__ , reason);
+
+ self = instance;
+ tsap = sap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+ IRDA_ASSERT(tsap != NULL, return;);
+ IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+ IRDA_ASSERT(tsap == self->tsap_data, return;);
+
+ pr_debug("IrLAN, data channel disconnected by peer!\n");
+
+ /* Save reason so we know if we should try to reconnect or not */
+ self->disconnect_reason = reason;
+
+ switch (reason) {
+ case LM_USER_REQUEST: /* User request */
+ pr_debug("%s(), User requested\n", __func__);
+ break;
+ case LM_LAP_DISCONNECT: /* Unexpected IrLAP disconnect */
+ pr_debug("%s(), Unexpected IrLAP disconnect\n", __func__);
+ break;
+ case LM_CONNECT_FAILURE: /* Failed to establish IrLAP connection */
+ pr_debug("%s(), IrLAP connect failed\n", __func__);
+ break;
+ case LM_LAP_RESET: /* IrLAP reset */
+ pr_debug("%s(), IrLAP reset\n", __func__);
+ break;
+ case LM_INIT_DISCONNECT:
+ pr_debug("%s(), IrLMP connect failed\n", __func__);
+ break;
+ default:
+ net_err_ratelimited("%s(), Unknown disconnect reason\n",
+ __func__);
+ break;
+ }
+
+ /* If you want to pass the skb to *both* state machines, you will
+ * need to skb_clone() it, so that you don't free it twice.
+ * As the state machines don't need it, git rid of it here...
+ * Jean II */
+ if (userdata)
+ dev_kfree_skb(userdata);
+
+ irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+ irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+
+ wake_up_interruptible(&self->open_wait);
+}
+
+void irlan_open_data_tsap(struct irlan_cb *self)
+{
+ struct tsap_cb *tsap;
+ notify_t notify;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Check if already open */
+ if (self->tsap_data)
+ return;
+
+ irda_notify_init(&notify);
+
+ notify.data_indication = irlan_eth_receive;
+ notify.udata_indication = irlan_eth_receive;
+ notify.connect_indication = irlan_connect_indication;
+ notify.connect_confirm = irlan_connect_confirm;
+ notify.flow_indication = irlan_eth_flow_indication;
+ notify.disconnect_indication = irlan_disconnect_indication;
+ notify.instance = self;
+ strlcpy(notify.name, "IrLAN data", sizeof(notify.name));
+
+ tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, &notify);
+ if (!tsap) {
+ pr_debug("%s(), Got no tsap!\n", __func__);
+ return;
+ }
+ self->tsap_data = tsap;
+
+ /*
+ * This is the data TSAP selector which we will pass to the client
+ * when the client ask for it.
+ */
+ self->stsap_sel_data = self->tsap_data->stsap_sel;
+}
+
+void irlan_close_tsaps(struct irlan_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Disconnect and close all open TSAP connections */
+ if (self->tsap_data) {
+ irttp_disconnect_request(self->tsap_data, NULL, P_NORMAL);
+ irttp_close_tsap(self->tsap_data);
+ self->tsap_data = NULL;
+ }
+ if (self->client.tsap_ctrl) {
+ irttp_disconnect_request(self->client.tsap_ctrl, NULL,
+ P_NORMAL);
+ irttp_close_tsap(self->client.tsap_ctrl);
+ self->client.tsap_ctrl = NULL;
+ }
+ if (self->provider.tsap_ctrl) {
+ irttp_disconnect_request(self->provider.tsap_ctrl, NULL,
+ P_NORMAL);
+ irttp_close_tsap(self->provider.tsap_ctrl);
+ self->provider.tsap_ctrl = NULL;
+ }
+ self->disconnect_reason = LM_USER_REQUEST;
+}
+
+/*
+ * Function irlan_ias_register (self, tsap_sel)
+ *
+ * Register with LM-IAS
+ *
+ */
+void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel)
+{
+ struct ias_object *obj;
+ struct ias_value *new_value;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /*
+ * Check if object has already been registered by a previous provider.
+ * If that is the case, we just change the value of the attribute
+ */
+ if (!irias_find_object("IrLAN")) {
+ obj = irias_new_object("IrLAN", IAS_IRLAN_ID);
+ irias_add_integer_attrib(obj, "IrDA:TinyTP:LsapSel", tsap_sel,
+ IAS_KERNEL_ATTR);
+ irias_insert_object(obj);
+ } else {
+ new_value = irias_new_integer_value(tsap_sel);
+ irias_object_change_attribute("IrLAN", "IrDA:TinyTP:LsapSel",
+ new_value);
+ }
+
+ /* Register PnP object only if not registered before */
+ if (!irias_find_object("PnP")) {
+ obj = irias_new_object("PnP", IAS_PNP_ID);
+#if 0
+ irias_add_string_attrib(obj, "Name", sysctl_devname,
+ IAS_KERNEL_ATTR);
+#else
+ irias_add_string_attrib(obj, "Name", "Linux", IAS_KERNEL_ATTR);
+#endif
+ irias_add_string_attrib(obj, "DeviceID", "HWP19F0",
+ IAS_KERNEL_ATTR);
+ irias_add_integer_attrib(obj, "CompCnt", 1, IAS_KERNEL_ATTR);
+ if (self->provider.access_type == ACCESS_PEER)
+ irias_add_string_attrib(obj, "Comp#01", "PNP8389",
+ IAS_KERNEL_ATTR);
+ else
+ irias_add_string_attrib(obj, "Comp#01", "PNP8294",
+ IAS_KERNEL_ATTR);
+
+ irias_add_string_attrib(obj, "Manufacturer",
+ "Linux-IrDA Project", IAS_KERNEL_ATTR);
+ irias_insert_object(obj);
+ }
+}
+
+/*
+ * Function irlan_run_ctrl_tx_queue (self)
+ *
+ * Try to send the next command in the control transmit queue
+ *
+ */
+int irlan_run_ctrl_tx_queue(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+
+ if (irda_lock(&self->client.tx_busy) == FALSE)
+ return -EBUSY;
+
+ skb = skb_dequeue(&self->client.txq);
+ if (!skb) {
+ self->client.tx_busy = FALSE;
+ return 0;
+ }
+
+ /* Check that it's really possible to send commands */
+ if ((self->client.tsap_ctrl == NULL) ||
+ (self->client.state == IRLAN_IDLE))
+ {
+ self->client.tx_busy = FALSE;
+ dev_kfree_skb(skb);
+ return -1;
+ }
+ pr_debug("%s(), sending ...\n", __func__);
+
+ return irttp_data_request(self->client.tsap_ctrl, skb);
+}
+
+/*
+ * Function irlan_ctrl_data_request (self, skb)
+ *
+ * This function makes sure that commands on the control channel is being
+ * sent in a command/response fashion
+ */
+static void irlan_ctrl_data_request(struct irlan_cb *self, struct sk_buff *skb)
+{
+ /* Queue command */
+ skb_queue_tail(&self->client.txq, skb);
+
+ /* Try to send command */
+ irlan_run_ctrl_tx_queue(self);
+}
+
+/*
+ * Function irlan_get_provider_info (self)
+ *
+ * Send Get Provider Information command to peer IrLAN layer
+ *
+ */
+void irlan_get_provider_info(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER,
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ frame[0] = CMD_GET_PROVIDER_INFO;
+ frame[1] = 0x00; /* Zero parameters */
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_open_data_channel (self)
+ *
+ * Send an Open Data Command to provider
+ *
+ */
+void irlan_open_data_channel(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3") +
+ IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "DIRECT"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ /* Build frame */
+ frame[0] = CMD_OPEN_DATA_CHANNEL;
+ frame[1] = 0x02; /* Two parameters */
+
+ irlan_insert_string_param(skb, "MEDIA", "802.3");
+ irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT");
+ /* irlan_insert_string_param(skb, "MODE", "UNRELIABLE"); */
+
+/* self->use_udata = TRUE; */
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+void irlan_close_data_channel(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Check if the TSAP is still there */
+ if (self->client.tsap_ctrl == NULL)
+ return;
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ /* Build frame */
+ frame[0] = CMD_CLOSE_DATA_CHAN;
+ frame[1] = 0x01; /* One parameter */
+
+ irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_open_unicast_addr (self)
+ *
+ * Make IrLAN provider accept ethernet frames addressed to the unicast
+ * address.
+ *
+ */
+static void irlan_open_unicast_addr(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ frame[0] = CMD_FILTER_OPERATION;
+ frame[1] = 0x03; /* Three parameters */
+ irlan_insert_byte_param(skb, "DATA_CHAN" , self->dtsap_sel_data);
+ irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+ irlan_insert_string_param(skb, "FILTER_MODE", "FILTER");
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_set_broadcast_filter (self, status)
+ *
+ * Make IrLAN provider accept ethernet frames addressed to the broadcast
+ * address. Be careful with the use of this one, since there may be a lot
+ * of broadcast traffic out there. We can still function without this
+ * one but then _we_ have to initiate all communication with other
+ * hosts, since ARP request for this host will not be answered.
+ */
+void irlan_set_broadcast_filter(struct irlan_cb *self, int status)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BROADCAST") +
+ /* We may waste one byte here...*/
+ IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ frame[0] = CMD_FILTER_OPERATION;
+ frame[1] = 0x03; /* Three parameters */
+ irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+ irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST");
+ if (status)
+ irlan_insert_string_param(skb, "FILTER_MODE", "FILTER");
+ else
+ irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_set_multicast_filter (self, status)
+ *
+ * Make IrLAN provider accept ethernet frames addressed to the multicast
+ * address.
+ *
+ */
+void irlan_set_multicast_filter(struct irlan_cb *self, int status)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") +
+ /* We may waste one byte here...*/
+ IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "NONE"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ frame[0] = CMD_FILTER_OPERATION;
+ frame[1] = 0x03; /* Three parameters */
+ irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+ irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST");
+ if (status)
+ irlan_insert_string_param(skb, "FILTER_MODE", "ALL");
+ else
+ irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_get_unicast_addr (self)
+ *
+ * Retrieves the unicast address from the IrLAN provider. This address
+ * will be inserted into the devices structure, so the ethernet layer
+ * can construct its packets.
+ *
+ */
+static void irlan_get_unicast_addr(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_OPERATION",
+ "DYNAMIC"),
+ GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ frame[0] = CMD_FILTER_OPERATION;
+ frame[1] = 0x03; /* Three parameters */
+ irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+ irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+ irlan_insert_string_param(skb, "FILTER_OPERATION", "DYNAMIC");
+
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_get_media_char (self)
+ *
+ *
+ *
+ */
+void irlan_get_media_char(struct irlan_cb *self)
+{
+ struct sk_buff *skb;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3"),
+ GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->client.max_header_size);
+ skb_put(skb, 2);
+
+ frame = skb->data;
+
+ /* Build frame */
+ frame[0] = CMD_GET_MEDIA_CHAR;
+ frame[1] = 0x01; /* One parameter */
+
+ irlan_insert_string_param(skb, "MEDIA", "802.3");
+ irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function insert_byte_param (skb, param, value)
+ *
+ * Insert byte parameter into frame
+ *
+ */
+int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value)
+{
+ return __irlan_insert_param(skb, param, IRLAN_BYTE, value, 0, NULL, 0);
+}
+
+int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value)
+{
+ return __irlan_insert_param(skb, param, IRLAN_SHORT, 0, value, NULL, 0);
+}
+
+/*
+ * Function insert_string (skb, param, value)
+ *
+ * Insert string parameter into frame
+ *
+ */
+int irlan_insert_string_param(struct sk_buff *skb, char *param, char *string)
+{
+ int string_len = strlen(string);
+
+ return __irlan_insert_param(skb, param, IRLAN_ARRAY, 0, 0, string,
+ string_len);
+}
+
+/*
+ * Function insert_array_param(skb, param, value, len_value)
+ *
+ * Insert array parameter into frame
+ *
+ */
+int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *array,
+ __u16 array_len)
+{
+ return __irlan_insert_param(skb, name, IRLAN_ARRAY, 0, 0, array,
+ array_len);
+}
+
+/*
+ * Function insert_param (skb, param, value, byte)
+ *
+ * Insert parameter at end of buffer, structure of a parameter is:
+ *
+ * -----------------------------------------------------------------------
+ * | Name Length[1] | Param Name[1..255] | Val Length[2] | Value[0..1016]|
+ * -----------------------------------------------------------------------
+ */
+static int __irlan_insert_param(struct sk_buff *skb, char *param, int type,
+ __u8 value_byte, __u16 value_short,
+ __u8 *value_array, __u16 value_len)
+{
+ __u8 *frame;
+ __u8 param_len;
+ __le16 tmp_le; /* Temporary value in little endian format */
+ int n=0;
+
+ if (skb == NULL) {
+ pr_debug("%s(), Got NULL skb\n", __func__);
+ return 0;
+ }
+
+ param_len = strlen(param);
+ switch (type) {
+ case IRLAN_BYTE:
+ value_len = 1;
+ break;
+ case IRLAN_SHORT:
+ value_len = 2;
+ break;
+ case IRLAN_ARRAY:
+ IRDA_ASSERT(value_array != NULL, return 0;);
+ IRDA_ASSERT(value_len > 0, return 0;);
+ break;
+ default:
+ pr_debug("%s(), Unknown parameter type!\n", __func__);
+ return 0;
+ }
+
+ /* Insert at end of sk-buffer */
+ frame = skb_tail_pointer(skb);
+
+ /* Make space for data */
+ if (skb_tailroom(skb) < (param_len+value_len+3)) {
+ pr_debug("%s(), No more space at end of skb\n", __func__);
+ return 0;
+ }
+ skb_put(skb, param_len+value_len+3);
+
+ /* Insert parameter length */
+ frame[n++] = param_len;
+
+ /* Insert parameter */
+ memcpy(frame+n, param, param_len); n += param_len;
+
+ /* Insert value length (2 byte little endian format, LSB first) */
+ tmp_le = cpu_to_le16(value_len);
+ memcpy(frame+n, &tmp_le, 2); n += 2; /* To avoid alignment problems */
+
+ /* Insert value */
+ switch (type) {
+ case IRLAN_BYTE:
+ frame[n++] = value_byte;
+ break;
+ case IRLAN_SHORT:
+ tmp_le = cpu_to_le16(value_short);
+ memcpy(frame+n, &tmp_le, 2); n += 2;
+ break;
+ case IRLAN_ARRAY:
+ memcpy(frame+n, value_array, value_len); n+=value_len;
+ break;
+ default:
+ break;
+ }
+ IRDA_ASSERT(n == (param_len+value_len+3), return 0;);
+
+ return param_len+value_len+3;
+}
+
+/*
+ * Function irlan_extract_param (buf, name, value, len)
+ *
+ * Extracts a single parameter name/value pair from buffer and updates
+ * the buffer pointer to point to the next name/value pair.
+ */
+int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
+{
+ __u8 name_len;
+ __u16 val_len;
+ int n=0;
+
+ /* get length of parameter name (1 byte) */
+ name_len = buf[n++];
+
+ if (name_len > 254) {
+ pr_debug("%s(), name_len > 254\n", __func__);
+ return -RSP_INVALID_COMMAND_FORMAT;
+ }
+
+ /* get parameter name */
+ memcpy(name, buf+n, name_len);
+ name[name_len] = '\0';
+ n+=name_len;
+
+ /*
+ * Get length of parameter value (2 bytes in little endian
+ * format)
+ */
+ memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
+ le16_to_cpus(&val_len); n+=2;
+
+ if (val_len >= 1016) {
+ pr_debug("%s(), parameter length to long\n", __func__);
+ return -RSP_INVALID_COMMAND_FORMAT;
+ }
+ *len = val_len;
+
+ /* get parameter value */
+ memcpy(value, buf+n, val_len);
+ value[val_len] = '\0';
+ n+=val_len;
+
+ pr_debug("Parameter: %s ", name);
+ pr_debug("Value: %s\n", value);
+
+ return n;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Start of reading /proc entries.
+ * Return entry at pos,
+ * or start_token to indicate print header line
+ * or NULL if end of file
+ */
+static void *irlan_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&irlans, *pos);
+}
+
+/* Return entry after v, and increment pos */
+static void *irlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &irlans, pos);
+}
+
+/* End of reading /proc file */
+static void irlan_seq_stop(struct seq_file *seq, void *v)
+{
+ rcu_read_unlock();
+}
+
+
+/*
+ * Show one entry in /proc file.
+ */
+static int irlan_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == &irlans)
+ seq_puts(seq, "IrLAN instances:\n");
+ else {
+ struct irlan_cb *self = list_entry(v, struct irlan_cb, dev_list);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ seq_printf(seq,"ifname: %s,\n",
+ self->dev->name);
+ seq_printf(seq,"client state: %s, ",
+ irlan_state[ self->client.state]);
+ seq_printf(seq,"provider state: %s,\n",
+ irlan_state[ self->provider.state]);
+ seq_printf(seq,"saddr: %#08x, ",
+ self->saddr);
+ seq_printf(seq,"daddr: %#08x\n",
+ self->daddr);
+ seq_printf(seq,"version: %d.%d,\n",
+ self->version[1], self->version[0]);
+ seq_printf(seq,"access type: %s\n",
+ irlan_access[self->client.access_type]);
+ seq_printf(seq,"media: %s\n",
+ irlan_media[self->media]);
+
+ seq_printf(seq,"local filter:\n");
+ seq_printf(seq,"remote filter: ");
+ irlan_print_filter(seq, self->client.filter_type);
+ seq_printf(seq,"tx busy: %s\n",
+ netif_queue_stopped(self->dev) ? "TRUE" : "FALSE");
+
+ seq_putc(seq,'\n');
+ }
+ return 0;
+}
+
+static const struct seq_operations irlan_seq_ops = {
+ .start = irlan_seq_start,
+ .next = irlan_seq_next,
+ .stop = irlan_seq_stop,
+ .show = irlan_seq_show,
+};
+
+static int irlan_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &irlan_seq_ops);
+}
+#endif
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
+MODULE_DESCRIPTION("The Linux IrDA LAN protocol");
+MODULE_LICENSE("GPL");
+
+module_param(eth, bool, 0);
+MODULE_PARM_DESC(eth, "Name devices ethX (0) or irlanX (1)");
+module_param(access, int, 0);
+MODULE_PARM_DESC(access, "Access type DIRECT=1, PEER=2, HOSTED=3");
+
+module_init(irlan_init);
+module_exit(irlan_cleanup);
+
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
new file mode 100644
index 000000000..fcfbe5794
--- /dev/null
+++ b/net/irda/irlan/irlan_eth.c
@@ -0,0 +1,340 @@
+/*********************************************************************
+ *
+ * Filename: irlan_eth.c
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Thu Oct 15 08:37:58 1998
+ * Modified at: Tue Mar 21 09:06:41 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Sources: skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ * slip.c by Laurence Culhane, <loz@holmes.demon.co.uk>
+ * Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <net/arp.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_eth.h>
+
+static int irlan_eth_open(struct net_device *dev);
+static int irlan_eth_close(struct net_device *dev);
+static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+static void irlan_eth_set_multicast_list(struct net_device *dev);
+
+static const struct net_device_ops irlan_eth_netdev_ops = {
+ .ndo_open = irlan_eth_open,
+ .ndo_stop = irlan_eth_close,
+ .ndo_start_xmit = irlan_eth_xmit,
+ .ndo_set_rx_mode = irlan_eth_set_multicast_list,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+/*
+ * Function irlan_eth_setup (dev)
+ *
+ * The network device initialization function.
+ *
+ */
+static void irlan_eth_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &irlan_eth_netdev_ops;
+ dev->destructor = free_netdev;
+
+
+ /*
+ * Lets do all queueing in IrTTP instead of this device driver.
+ * Queueing here as well can introduce some strange latency
+ * problems, which we will avoid by setting the queue size to 0.
+ */
+ /*
+ * The bugs in IrTTP and IrLAN that created this latency issue
+ * have now been fixed, and we can propagate flow control properly
+ * to the network layer. However, this requires a minimal queue of
+ * packets for the device.
+ * Without flow control, the Tx Queue is 14 (ttp) + 0 (dev) = 14
+ * With flow control, the Tx Queue is 7 (ttp) + 4 (dev) = 11
+ * See irlan_eth_flow_indication()...
+ * Note : this number was randomly selected and would need to
+ * be adjusted.
+ * Jean II */
+ dev->tx_queue_len = 4;
+}
+
+/*
+ * Function alloc_irlandev
+ *
+ * Allocate network device and control block
+ *
+ */
+struct net_device *alloc_irlandev(const char *name)
+{
+ return alloc_netdev(sizeof(struct irlan_cb), name, NET_NAME_UNKNOWN,
+ irlan_eth_setup);
+}
+
+/*
+ * Function irlan_eth_open (dev)
+ *
+ * Network device has been opened by user
+ *
+ */
+static int irlan_eth_open(struct net_device *dev)
+{
+ struct irlan_cb *self = netdev_priv(dev);
+
+ /* Ready to play! */
+ netif_stop_queue(dev); /* Wait until data link is ready */
+
+ /* We are now open, so time to do some work */
+ self->disconnect_reason = 0;
+ irlan_client_wakeup(self, self->saddr, self->daddr);
+
+ /* Make sure we have a hardware address before we return,
+ so DHCP clients gets happy */
+ return wait_event_interruptible(self->open_wait,
+ !self->tsap_data->connected);
+}
+
+/*
+ * Function irlan_eth_close (dev)
+ *
+ * Stop the ether network device, his function will usually be called by
+ * ifconfig down. We should now disconnect the link, We start the
+ * close timer, so that the instance will be removed if we are unable
+ * to discover the remote device after the disconnect.
+ */
+static int irlan_eth_close(struct net_device *dev)
+{
+ struct irlan_cb *self = netdev_priv(dev);
+
+ /* Stop device */
+ netif_stop_queue(dev);
+
+ irlan_close_data_channel(self);
+ irlan_close_tsaps(self);
+
+ irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+ irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+
+ /* Remove frames queued on the control channel */
+ skb_queue_purge(&self->client.txq);
+
+ self->client.tx_busy = 0;
+
+ return 0;
+}
+
+/*
+ * Function irlan_eth_tx (skb)
+ *
+ * Transmits ethernet frames over IrDA link.
+ *
+ */
+static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct irlan_cb *self = netdev_priv(dev);
+ int ret;
+ unsigned int len;
+
+ /* skb headroom large enough to contain all IrDA-headers? */
+ if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) {
+ struct sk_buff *new_skb =
+ skb_realloc_headroom(skb, self->max_header_size);
+
+ /* We have to free the original skb anyway */
+ dev_kfree_skb(skb);
+
+ /* Did the realloc succeed? */
+ if (new_skb == NULL)
+ return NETDEV_TX_OK;
+
+ /* Use the new skb instead */
+ skb = new_skb;
+ }
+
+ dev->trans_start = jiffies;
+
+ len = skb->len;
+ /* Now queue the packet in the transport layer */
+ if (self->use_udata)
+ ret = irttp_udata_request(self->tsap_data, skb);
+ else
+ ret = irttp_data_request(self->tsap_data, skb);
+
+ if (ret < 0) {
+ /*
+ * IrTTPs tx queue is full, so we just have to
+ * drop the frame! You might think that we should
+ * just return -1 and don't deallocate the frame,
+ * but that is dangerous since it's possible that
+ * we have replaced the original skb with a new
+ * one with larger headroom, and that would really
+ * confuse do_dev_queue_xmit() in dev.c! I have
+ * tried :-) DB
+ */
+ /* irttp_data_request already free the packet */
+ dev->stats.tx_dropped++;
+ } else {
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
+ }
+
+ return NETDEV_TX_OK;
+}
+
+/*
+ * Function irlan_eth_receive (handle, skb)
+ *
+ * This function gets the data that is received on the data channel
+ *
+ */
+int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
+{
+ struct irlan_cb *self = instance;
+ struct net_device *dev = self->dev;
+
+ if (skb == NULL) {
+ dev->stats.rx_dropped++;
+ return 0;
+ }
+ if (skb->len < ETH_HLEN) {
+ pr_debug("%s() : IrLAN frame too short (%d)\n",
+ __func__, skb->len);
+ dev->stats.rx_dropped++;
+ dev_kfree_skb(skb);
+ return 0;
+ }
+
+ /*
+ * Adopt this frame! Important to set all these fields since they
+ * might have been previously set by the low level IrDA network
+ * device driver
+ */
+ skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */
+
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
+
+ netif_rx(skb); /* Eat it! */
+
+ return 0;
+}
+
+/*
+ * Function irlan_eth_flow (status)
+ *
+ * Do flow control between IP/Ethernet and IrLAN/IrTTP. This is done by
+ * controlling the queue stop/start.
+ *
+ * The IrDA link layer has the advantage to have flow control, and
+ * IrTTP now properly handles that. Flow controlling the higher layers
+ * prevent us to drop Tx packets in here (up to 15% for a TCP socket,
+ * more for UDP socket).
+ * Also, this allow us to reduce the overall transmit queue, which means
+ * less latency in case of mixed traffic.
+ * Jean II
+ */
+void irlan_eth_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+ struct irlan_cb *self;
+ struct net_device *dev;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ dev = self->dev;
+
+ IRDA_ASSERT(dev != NULL, return;);
+
+ pr_debug("%s() : flow %s ; running %d\n", __func__,
+ flow == FLOW_STOP ? "FLOW_STOP" : "FLOW_START",
+ netif_running(dev));
+
+ switch (flow) {
+ case FLOW_STOP:
+ /* IrTTP is full, stop higher layers */
+ netif_stop_queue(dev);
+ break;
+ case FLOW_START:
+ default:
+ /* Tell upper layers that its time to transmit frames again */
+ /* Schedule network layer */
+ netif_wake_queue(dev);
+ break;
+ }
+}
+
+/*
+ * Function set_multicast_list (dev)
+ *
+ * Configure the filtering of the device
+ *
+ */
+#define HW_MAX_ADDRS 4 /* Must query to get it! */
+static void irlan_eth_set_multicast_list(struct net_device *dev)
+{
+ struct irlan_cb *self = netdev_priv(dev);
+
+ /* Check if data channel has been connected yet */
+ if (self->client.state != IRLAN_DATA) {
+ pr_debug("%s(), delaying!\n", __func__);
+ return;
+ }
+
+ if (dev->flags & IFF_PROMISC) {
+ /* Enable promiscuous mode */
+ net_warn_ratelimited("Promiscuous mode not implemented by IrLAN!\n");
+ } else if ((dev->flags & IFF_ALLMULTI) ||
+ netdev_mc_count(dev) > HW_MAX_ADDRS) {
+ /* Disable promiscuous mode, use normal mode. */
+ pr_debug("%s(), Setting multicast filter\n", __func__);
+ /* hardware_set_filter(NULL); */
+
+ irlan_set_multicast_filter(self, TRUE);
+ } else if (!netdev_mc_empty(dev)) {
+ pr_debug("%s(), Setting multicast filter\n", __func__);
+ /* Walk the address list, and load the filter */
+ /* hardware_set_filter(dev->mc_list); */
+
+ irlan_set_multicast_filter(self, TRUE);
+ } else {
+ pr_debug("%s(), Clearing multicast filter\n", __func__);
+ irlan_set_multicast_filter(self, FALSE);
+ }
+
+ if (dev->flags & IFF_BROADCAST)
+ irlan_set_broadcast_filter(self, TRUE);
+ else
+ irlan_set_broadcast_filter(self, FALSE);
+}
diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c
new file mode 100644
index 000000000..9a1cc11c1
--- /dev/null
+++ b/net/irda/irlan/irlan_event.c
@@ -0,0 +1,60 @@
+/*********************************************************************
+ *
+ * Filename: irlan_event.c
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Oct 20 09:10:16 1998
+ * Modified at: Sat Oct 30 12:59:01 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <net/irda/irlan_event.h>
+
+const char * const irlan_state[] = {
+ "IRLAN_IDLE",
+ "IRLAN_QUERY",
+ "IRLAN_CONN",
+ "IRLAN_INFO",
+ "IRLAN_MEDIA",
+ "IRLAN_OPEN",
+ "IRLAN_WAIT",
+ "IRLAN_ARB",
+ "IRLAN_DATA",
+ "IRLAN_CLOSE",
+ "IRLAN_SYNC",
+};
+
+void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state)
+{
+ pr_debug("%s(), %s\n", __func__ , irlan_state[state]);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ self->client.state = state;
+}
+
+void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state)
+{
+ pr_debug("%s(), %s\n", __func__ , irlan_state[state]);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ self->provider.state = state;
+}
+
diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c
new file mode 100644
index 000000000..e755e90b2
--- /dev/null
+++ b/net/irda/irlan/irlan_filter.c
@@ -0,0 +1,240 @@
+/*********************************************************************
+ *
+ * Filename: irlan_filter.c
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Fri Jan 29 11:16:38 1999
+ * Modified at: Sat Oct 30 12:58:45 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_filter.h>
+
+/*
+ * Function irlan_filter_request (self, skb)
+ *
+ * Handle filter request from client peer device
+ *
+ */
+void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+ (self->provider.filter_operation == DYNAMIC))
+ {
+ pr_debug("Giving peer a dynamic Ethernet address\n");
+ self->provider.mac_address[0] = 0x40;
+ self->provider.mac_address[1] = 0x00;
+ self->provider.mac_address[2] = 0x00;
+ self->provider.mac_address[3] = 0x00;
+
+ /* Use arbitration value to generate MAC address */
+ if (self->provider.access_type == ACCESS_PEER) {
+ self->provider.mac_address[4] =
+ self->provider.send_arb_val & 0xff;
+ self->provider.mac_address[5] =
+ (self->provider.send_arb_val >> 8) & 0xff;
+ } else {
+ /* Just generate something for now */
+ get_random_bytes(self->provider.mac_address+4, 1);
+ get_random_bytes(self->provider.mac_address+5, 1);
+ }
+
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x03;
+ irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+ irlan_insert_short_param(skb, "MAX_ENTRY", 0x0001);
+ irlan_insert_array_param(skb, "FILTER_ENTRY",
+ self->provider.mac_address, 6);
+ return;
+ }
+
+ if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+ (self->provider.filter_mode == FILTER))
+ {
+ pr_debug("Directed filter on\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+ if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+ (self->provider.filter_mode == NONE))
+ {
+ pr_debug("Directed filter off\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+
+ if ((self->provider.filter_type == IRLAN_BROADCAST) &&
+ (self->provider.filter_mode == FILTER))
+ {
+ pr_debug("Broadcast filter on\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+ if ((self->provider.filter_type == IRLAN_BROADCAST) &&
+ (self->provider.filter_mode == NONE))
+ {
+ pr_debug("Broadcast filter off\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+ if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+ (self->provider.filter_mode == FILTER))
+ {
+ pr_debug("Multicast filter on\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+ if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+ (self->provider.filter_mode == NONE))
+ {
+ pr_debug("Multicast filter off\n");
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x00;
+ return;
+ }
+ if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+ (self->provider.filter_operation == GET))
+ {
+ pr_debug("Multicast filter get\n");
+ skb->data[0] = 0x00; /* Success? */
+ skb->data[1] = 0x02;
+ irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+ irlan_insert_short_param(skb, "MAX_ENTRY", 16);
+ return;
+ }
+ skb->data[0] = 0x00; /* Command not supported */
+ skb->data[1] = 0x00;
+
+ pr_debug("Not implemented!\n");
+}
+
+/*
+ * Function check_request_param (self, param, value)
+ *
+ * Check parameters in request from peer device
+ *
+ */
+void irlan_check_command_param(struct irlan_cb *self, char *param, char *value)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ pr_debug("%s, %s\n", param, value);
+
+ /*
+ * This is experimental!! DB.
+ */
+ if (strcmp(param, "MODE") == 0) {
+ self->use_udata = TRUE;
+ return;
+ }
+
+ /*
+ * FILTER_TYPE
+ */
+ if (strcmp(param, "FILTER_TYPE") == 0) {
+ if (strcmp(value, "DIRECTED") == 0) {
+ self->provider.filter_type = IRLAN_DIRECTED;
+ return;
+ }
+ if (strcmp(value, "MULTICAST") == 0) {
+ self->provider.filter_type = IRLAN_MULTICAST;
+ return;
+ }
+ if (strcmp(value, "BROADCAST") == 0) {
+ self->provider.filter_type = IRLAN_BROADCAST;
+ return;
+ }
+ }
+ /*
+ * FILTER_MODE
+ */
+ if (strcmp(param, "FILTER_MODE") == 0) {
+ if (strcmp(value, "ALL") == 0) {
+ self->provider.filter_mode = ALL;
+ return;
+ }
+ if (strcmp(value, "FILTER") == 0) {
+ self->provider.filter_mode = FILTER;
+ return;
+ }
+ if (strcmp(value, "NONE") == 0) {
+ self->provider.filter_mode = FILTER;
+ return;
+ }
+ }
+ /*
+ * FILTER_OPERATION
+ */
+ if (strcmp(param, "FILTER_OPERATION") == 0) {
+ if (strcmp(value, "DYNAMIC") == 0) {
+ self->provider.filter_operation = DYNAMIC;
+ return;
+ }
+ if (strcmp(value, "GET") == 0) {
+ self->provider.filter_operation = GET;
+ return;
+ }
+ }
+}
+
+/*
+ * Function irlan_print_filter (filter_type, buf)
+ *
+ * Print status of filter. Used by /proc file system
+ *
+ */
+#ifdef CONFIG_PROC_FS
+#define MASK2STR(m,s) { .mask = m, .str = s }
+
+void irlan_print_filter(struct seq_file *seq, int filter_type)
+{
+ static struct {
+ int mask;
+ const char *str;
+ } filter_mask2str[] = {
+ MASK2STR(IRLAN_DIRECTED, "DIRECTED"),
+ MASK2STR(IRLAN_FUNCTIONAL, "FUNCTIONAL"),
+ MASK2STR(IRLAN_GROUP, "GROUP"),
+ MASK2STR(IRLAN_MAC_FRAME, "MAC_FRAME"),
+ MASK2STR(IRLAN_MULTICAST, "MULTICAST"),
+ MASK2STR(IRLAN_BROADCAST, "BROADCAST"),
+ MASK2STR(IRLAN_IPX_SOCKET, "IPX_SOCKET"),
+ MASK2STR(0, NULL)
+ }, *p;
+
+ for (p = filter_mask2str; p->str; p++) {
+ if (filter_type & p->mask)
+ seq_printf(seq, "%s ", p->str);
+ }
+ seq_putc(seq, '\n');
+}
+#undef MASK2STR
+#endif
diff --git a/net/irda/irlan/irlan_provider.c b/net/irda/irlan/irlan_provider.c
new file mode 100644
index 000000000..15c292cf2
--- /dev/null
+++ b/net/irda/irlan/irlan_provider.c
@@ -0,0 +1,408 @@
+/*********************************************************************
+ *
+ * Filename: irlan_provider.c
+ * Version: 0.9
+ * Description: IrDA LAN Access Protocol Implementation
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Sat Oct 30 12:52:10 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Sources: skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ * slip.c by Laurence Culhane, <loz@holmes.demon.co.uk>
+ * Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_filter.h>
+#include <net/irda/irlan_client.h>
+
+static void irlan_provider_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb);
+
+/*
+ * Function irlan_provider_control_data_indication (handle, skb)
+ *
+ * This function gets the data that is received on the control channel
+ *
+ */
+static int irlan_provider_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+ __u8 code;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ code = skb->data[0];
+ switch(code) {
+ case CMD_GET_PROVIDER_INFO:
+ pr_debug("Got GET_PROVIDER_INFO command!\n");
+ irlan_do_provider_event(self, IRLAN_GET_INFO_CMD, skb);
+ break;
+
+ case CMD_GET_MEDIA_CHAR:
+ pr_debug("Got GET_MEDIA_CHAR command!\n");
+ irlan_do_provider_event(self, IRLAN_GET_MEDIA_CMD, skb);
+ break;
+ case CMD_OPEN_DATA_CHANNEL:
+ pr_debug("Got OPEN_DATA_CHANNEL command!\n");
+ irlan_do_provider_event(self, IRLAN_OPEN_DATA_CMD, skb);
+ break;
+ case CMD_FILTER_OPERATION:
+ pr_debug("Got FILTER_OPERATION command!\n");
+ irlan_do_provider_event(self, IRLAN_FILTER_CONFIG_CMD, skb);
+ break;
+ case CMD_RECONNECT_DATA_CHAN:
+ pr_debug("%s(), Got RECONNECT_DATA_CHAN command\n", __func__);
+ pr_debug("%s(), NOT IMPLEMENTED\n", __func__);
+ break;
+ case CMD_CLOSE_DATA_CHAN:
+ pr_debug("Got CLOSE_DATA_CHAN command!\n");
+ pr_debug("%s(), NOT IMPLEMENTED\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown command!\n", __func__);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Function irlan_provider_connect_indication (handle, skb, priv)
+ *
+ * Got connection from peer IrLAN client
+ *
+ */
+static void irlan_provider_connect_indication(void *instance, void *sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct irlan_cb *self;
+ struct tsap_cb *tsap;
+
+ self = instance;
+ tsap = sap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ IRDA_ASSERT(tsap == self->provider.tsap_ctrl,return;);
+ IRDA_ASSERT(self->provider.state == IRLAN_IDLE, return;);
+
+ self->provider.max_sdu_size = max_sdu_size;
+ self->provider.max_header_size = max_header_size;
+
+ irlan_do_provider_event(self, IRLAN_CONNECT_INDICATION, NULL);
+
+ /*
+ * If we are in peer mode, the client may not have got the discovery
+ * indication it needs to make progress. If the client is still in
+ * IDLE state, we must kick it.
+ */
+ if ((self->provider.access_type == ACCESS_PEER) &&
+ (self->client.state == IRLAN_IDLE))
+ {
+ irlan_client_wakeup(self, self->saddr, self->daddr);
+ }
+}
+
+/*
+ * Function irlan_provider_connect_response (handle)
+ *
+ * Accept incoming connection
+ *
+ */
+void irlan_provider_connect_response(struct irlan_cb *self,
+ struct tsap_cb *tsap)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ /* Just accept */
+ irttp_connect_response(tsap, IRLAN_MTU, NULL);
+}
+
+static void irlan_provider_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason,
+ struct sk_buff *userdata)
+{
+ struct irlan_cb *self;
+ struct tsap_cb *tsap;
+
+ pr_debug("%s(), reason=%d\n", __func__ , reason);
+
+ self = instance;
+ tsap = sap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+ IRDA_ASSERT(tsap != NULL, return;);
+ IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+ IRDA_ASSERT(tsap == self->provider.tsap_ctrl, return;);
+
+ irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+}
+
+/*
+ * Function irlan_parse_open_data_cmd (self, skb)
+ *
+ *
+ *
+ */
+int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = irlan_provider_parse_command(self, CMD_OPEN_DATA_CHANNEL, skb);
+
+ /* Open data channel */
+ irlan_open_data_tsap(self);
+
+ return ret;
+}
+
+/*
+ * Function parse_command (skb)
+ *
+ * Extract all parameters from received buffer, then feed them to
+ * check_params for parsing
+ *
+ */
+int irlan_provider_parse_command(struct irlan_cb *self, int cmd,
+ struct sk_buff *skb)
+{
+ __u8 *frame;
+ __u8 *ptr;
+ int count;
+ __u16 val_len;
+ int i;
+ char *name;
+ char *value;
+ int ret = RSP_SUCCESS;
+
+ IRDA_ASSERT(skb != NULL, return -RSP_PROTOCOL_ERROR;);
+
+ pr_debug("%s(), skb->len=%d\n", __func__ , (int)skb->len);
+
+ IRDA_ASSERT(self != NULL, return -RSP_PROTOCOL_ERROR;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -RSP_PROTOCOL_ERROR;);
+
+ if (!skb)
+ return -RSP_PROTOCOL_ERROR;
+
+ frame = skb->data;
+
+ name = kmalloc(255, GFP_ATOMIC);
+ if (!name)
+ return -RSP_INSUFFICIENT_RESOURCES;
+ value = kmalloc(1016, GFP_ATOMIC);
+ if (!value) {
+ kfree(name);
+ return -RSP_INSUFFICIENT_RESOURCES;
+ }
+
+ /* How many parameters? */
+ count = frame[1];
+
+ pr_debug("Got %d parameters\n", count);
+
+ ptr = frame+2;
+
+ /* For all parameters */
+ for (i=0; i<count;i++) {
+ ret = irlan_extract_param(ptr, name, value, &val_len);
+ if (ret < 0) {
+ pr_debug("%s(), IrLAN, Error!\n", __func__);
+ break;
+ }
+ ptr+=ret;
+ ret = RSP_SUCCESS;
+ irlan_check_command_param(self, name, value);
+ }
+ /* Cleanup */
+ kfree(name);
+ kfree(value);
+
+ return ret;
+}
+
+/*
+ * Function irlan_provider_send_reply (self, info)
+ *
+ * Send reply to query to peer IrLAN layer
+ *
+ */
+void irlan_provider_send_reply(struct irlan_cb *self, int command,
+ int ret_code)
+{
+ struct sk_buff *skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+ skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+ /* Bigger param length comes from CMD_GET_MEDIA_CHAR */
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BROADCAST") +
+ IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") +
+ IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "HOSTED"),
+ GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ /* Reserve space for TTP, LMP, and LAP header */
+ skb_reserve(skb, self->provider.max_header_size);
+ skb_put(skb, 2);
+
+ switch (command) {
+ case CMD_GET_PROVIDER_INFO:
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x02; /* 2 parameters */
+ switch (self->media) {
+ case MEDIA_802_3:
+ irlan_insert_string_param(skb, "MEDIA", "802.3");
+ break;
+ case MEDIA_802_5:
+ irlan_insert_string_param(skb, "MEDIA", "802.5");
+ break;
+ default:
+ pr_debug("%s(), unknown media type!\n", __func__);
+ break;
+ }
+ irlan_insert_short_param(skb, "IRLAN_VER", 0x0101);
+ break;
+
+ case CMD_GET_MEDIA_CHAR:
+ skb->data[0] = 0x00; /* Success */
+ skb->data[1] = 0x05; /* 5 parameters */
+ irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+ irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST");
+ irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST");
+
+ switch (self->provider.access_type) {
+ case ACCESS_DIRECT:
+ irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT");
+ break;
+ case ACCESS_PEER:
+ irlan_insert_string_param(skb, "ACCESS_TYPE", "PEER");
+ break;
+ case ACCESS_HOSTED:
+ irlan_insert_string_param(skb, "ACCESS_TYPE", "HOSTED");
+ break;
+ default:
+ pr_debug("%s(), Unknown access type\n", __func__);
+ break;
+ }
+ irlan_insert_short_param(skb, "MAX_FRAME", 0x05ee);
+ break;
+ case CMD_OPEN_DATA_CHANNEL:
+ skb->data[0] = 0x00; /* Success */
+ if (self->provider.send_arb_val) {
+ skb->data[1] = 0x03; /* 3 parameters */
+ irlan_insert_short_param(skb, "CON_ARB",
+ self->provider.send_arb_val);
+ } else
+ skb->data[1] = 0x02; /* 2 parameters */
+ irlan_insert_byte_param(skb, "DATA_CHAN", self->stsap_sel_data);
+ irlan_insert_string_param(skb, "RECONNECT_KEY", "LINUX RULES!");
+ break;
+ case CMD_FILTER_OPERATION:
+ irlan_filter_request(self, skb);
+ break;
+ default:
+ pr_debug("%s(), Unknown command!\n", __func__);
+ break;
+ }
+
+ irttp_data_request(self->provider.tsap_ctrl, skb);
+}
+
+/*
+ * Function irlan_provider_register(void)
+ *
+ * Register provider support so we can accept incoming connections.
+ *
+ */
+int irlan_provider_open_ctrl_tsap(struct irlan_cb *self)
+{
+ struct tsap_cb *tsap;
+ notify_t notify;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ /* Check if already open */
+ if (self->provider.tsap_ctrl)
+ return -1;
+
+ /*
+ * First register well known control TSAP
+ */
+ irda_notify_init(&notify);
+ notify.data_indication = irlan_provider_data_indication;
+ notify.connect_indication = irlan_provider_connect_indication;
+ notify.disconnect_indication = irlan_provider_disconnect_indication;
+ notify.instance = self;
+ strlcpy(notify.name, "IrLAN ctrl (p)", sizeof(notify.name));
+
+ tsap = irttp_open_tsap(LSAP_ANY, 1, &notify);
+ if (!tsap) {
+ pr_debug("%s(), Got no tsap!\n", __func__);
+ return -1;
+ }
+ self->provider.tsap_ctrl = tsap;
+
+ /* Register with LM-IAS */
+ irlan_ias_register(self, tsap->stsap_sel);
+
+ return 0;
+}
+
diff --git a/net/irda/irlan/irlan_provider_event.c b/net/irda/irlan/irlan_provider_event.c
new file mode 100644
index 000000000..9c4f7f51d
--- /dev/null
+++ b/net/irda/irlan/irlan_provider_event.c
@@ -0,0 +1,233 @@
+/*********************************************************************
+ *
+ * Filename: irlan_provider_event.c
+ * Version: 0.9
+ * Description: IrLAN provider state machine)
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Sat Oct 30 12:52:41 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <net/irda/irda.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_event.h>
+
+static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb);
+
+static int (*state[])(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb) =
+{
+ irlan_provider_state_idle,
+ NULL, /* Query */
+ NULL, /* Info */
+ irlan_provider_state_info,
+ NULL, /* Media */
+ irlan_provider_state_open,
+ NULL, /* Wait */
+ NULL, /* Arb */
+ irlan_provider_state_data,
+ NULL, /* Close */
+ NULL, /* Sync */
+};
+
+void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(*state[ self->provider.state] != NULL, return;);
+
+ (*state[self->provider.state]) (self, event, skb);
+}
+
+/*
+ * Function irlan_provider_state_idle (event, skb, info)
+ *
+ * IDLE, We are waiting for an indication that there is a provider
+ * available.
+ */
+static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_CONNECT_INDICATION:
+ irlan_provider_connect_response( self, self->provider.tsap_ctrl);
+ irlan_next_provider_state( self, IRLAN_INFO);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_provider_state_info (self, event, skb, info)
+ *
+ * INFO, We have issued a GetInfo command and is awaiting a reply.
+ */
+static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_GET_INFO_CMD:
+ /* Be sure to use 802.3 in case of peer mode */
+ if (self->provider.access_type == ACCESS_PEER) {
+ self->media = MEDIA_802_3;
+
+ /* Check if client has started yet */
+ if (self->client.state == IRLAN_IDLE) {
+ /* This should get the client going */
+ irlmp_discovery_request(8);
+ }
+ }
+
+ irlan_provider_send_reply(self, CMD_GET_PROVIDER_INFO,
+ RSP_SUCCESS);
+ /* Keep state */
+ break;
+ case IRLAN_GET_MEDIA_CMD:
+ irlan_provider_send_reply(self, CMD_GET_MEDIA_CHAR,
+ RSP_SUCCESS);
+ /* Keep state */
+ break;
+ case IRLAN_OPEN_DATA_CMD:
+ ret = irlan_parse_open_data_cmd(self, skb);
+ if (self->provider.access_type == ACCESS_PEER) {
+ /* FIXME: make use of random functions! */
+ self->provider.send_arb_val = (jiffies & 0xffff);
+ }
+ irlan_provider_send_reply(self, CMD_OPEN_DATA_CHANNEL, ret);
+
+ if (ret == RSP_SUCCESS) {
+ irlan_next_provider_state(self, IRLAN_OPEN);
+
+ /* Signal client that we are now open */
+ irlan_do_client_event(self, IRLAN_PROVIDER_SIGNAL, NULL);
+ }
+ break;
+ case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_provider_state(self, IRLAN_IDLE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_provider_state_open (self, event, skb, info)
+ *
+ * OPEN, The client has issued a OpenData command and is awaiting a
+ * reply
+ *
+ */
+static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+
+ switch(event) {
+ case IRLAN_FILTER_CONFIG_CMD:
+ irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb);
+ irlan_provider_send_reply(self, CMD_FILTER_OPERATION,
+ RSP_SUCCESS);
+ /* Keep state */
+ break;
+ case IRLAN_DATA_CONNECT_INDICATION:
+ irlan_next_provider_state(self, IRLAN_DATA);
+ irlan_provider_connect_response(self, self->tsap_data);
+ break;
+ case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_provider_state(self, IRLAN_IDLE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irlan_provider_state_data (self, event, skb, info)
+ *
+ * DATA, The data channel is connected, allowing data transfers between
+ * the local and remote machines.
+ *
+ */
+static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+ switch(event) {
+ case IRLAN_FILTER_CONFIG_CMD:
+ irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb);
+ irlan_provider_send_reply(self, CMD_FILTER_OPERATION,
+ RSP_SUCCESS);
+ break;
+ case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */
+ case IRLAN_LAP_DISCONNECT:
+ irlan_next_provider_state(self, IRLAN_IDLE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__ , event);
+ break;
+ }
+ if (skb)
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
new file mode 100644
index 000000000..1cde711bc
--- /dev/null
+++ b/net/irda/irlap.c
@@ -0,0 +1,1207 @@
+/*********************************************************************
+ *
+ * Filename: irlap.c
+ * Version: 1.0
+ * Description: IrLAP implementation for Linux
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Aug 4 20:40:53 1997
+ * Modified at: Tue Dec 14 09:26:44 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irqueue.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/qos.h>
+
+static hashbin_t *irlap = NULL;
+int sysctl_slot_timeout = SLOT_TIMEOUT * 1000 / HZ;
+
+/* This is the delay of missed pf period before generating an event
+ * to the application. The spec mandate 3 seconds, but in some cases
+ * it's way too long. - Jean II */
+int sysctl_warn_noreply_time = 3;
+
+extern void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb);
+static void __irlap_close(struct irlap_cb *self);
+static void irlap_init_qos_capabilities(struct irlap_cb *self,
+ struct qos_info *qos_user);
+
+static const char *const lap_reasons[] __maybe_unused = {
+ "ERROR, NOT USED",
+ "LAP_DISC_INDICATION",
+ "LAP_NO_RESPONSE",
+ "LAP_RESET_INDICATION",
+ "LAP_FOUND_NONE",
+ "LAP_MEDIA_BUSY",
+ "LAP_PRIMARY_CONFLICT",
+ "ERROR, NOT USED",
+};
+
+int __init irlap_init(void)
+{
+ /* Check if the compiler did its job properly.
+ * May happen on some ARM configuration, check with Russell King. */
+ IRDA_ASSERT(sizeof(struct xid_frame) == 14, ;);
+ IRDA_ASSERT(sizeof(struct test_frame) == 10, ;);
+ IRDA_ASSERT(sizeof(struct ua_frame) == 10, ;);
+ IRDA_ASSERT(sizeof(struct snrm_frame) == 11, ;);
+
+ /* Allocate master array */
+ irlap = hashbin_new(HB_LOCK);
+ if (irlap == NULL) {
+ net_err_ratelimited("%s: can't allocate irlap hashbin!\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void irlap_cleanup(void)
+{
+ IRDA_ASSERT(irlap != NULL, return;);
+
+ hashbin_delete(irlap, (FREE_FUNC) __irlap_close);
+}
+
+/*
+ * Function irlap_open (driver)
+ *
+ * Initialize IrLAP layer
+ *
+ */
+struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos,
+ const char *hw_name)
+{
+ struct irlap_cb *self;
+
+ /* Initialize the irlap structure. */
+ self = kzalloc(sizeof(struct irlap_cb), GFP_KERNEL);
+ if (self == NULL)
+ return NULL;
+
+ self->magic = LAP_MAGIC;
+
+ /* Make a binding between the layers */
+ self->netdev = dev;
+ self->qos_dev = qos;
+ /* Copy hardware name */
+ if(hw_name != NULL) {
+ strlcpy(self->hw_name, hw_name, sizeof(self->hw_name));
+ } else {
+ self->hw_name[0] = '\0';
+ }
+
+ /* FIXME: should we get our own field? */
+ dev->atalk_ptr = self;
+
+ self->state = LAP_OFFLINE;
+
+ /* Initialize transmit queue */
+ skb_queue_head_init(&self->txq);
+ skb_queue_head_init(&self->txq_ultra);
+ skb_queue_head_init(&self->wx_list);
+
+ /* My unique IrLAP device address! */
+ /* We don't want the broadcast address, neither the NULL address
+ * (most often used to signify "invalid"), and we don't want an
+ * address already in use (otherwise connect won't be able
+ * to select the proper link). - Jean II */
+ do {
+ get_random_bytes(&self->saddr, sizeof(self->saddr));
+ } while ((self->saddr == 0x0) || (self->saddr == BROADCAST) ||
+ (hashbin_lock_find(irlap, self->saddr, NULL)) );
+ /* Copy to the driver */
+ memcpy(dev->dev_addr, &self->saddr, 4);
+
+ init_timer(&self->slot_timer);
+ init_timer(&self->query_timer);
+ init_timer(&self->discovery_timer);
+ init_timer(&self->final_timer);
+ init_timer(&self->poll_timer);
+ init_timer(&self->wd_timer);
+ init_timer(&self->backoff_timer);
+ init_timer(&self->media_busy_timer);
+
+ irlap_apply_default_connection_parameters(self);
+
+ self->N3 = 3; /* # connections attempts to try before giving up */
+
+ self->state = LAP_NDM;
+
+ hashbin_insert(irlap, (irda_queue_t *) self, self->saddr, NULL);
+
+ irlmp_register_link(self, self->saddr, &self->notify);
+
+ return self;
+}
+EXPORT_SYMBOL(irlap_open);
+
+/*
+ * Function __irlap_close (self)
+ *
+ * Remove IrLAP and all allocated memory. Stop any pending timers.
+ *
+ */
+static void __irlap_close(struct irlap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Stop timers */
+ del_timer(&self->slot_timer);
+ del_timer(&self->query_timer);
+ del_timer(&self->discovery_timer);
+ del_timer(&self->final_timer);
+ del_timer(&self->poll_timer);
+ del_timer(&self->wd_timer);
+ del_timer(&self->backoff_timer);
+ del_timer(&self->media_busy_timer);
+
+ irlap_flush_all_queues(self);
+
+ self->magic = 0;
+
+ kfree(self);
+}
+
+/*
+ * Function irlap_close (self)
+ *
+ * Remove IrLAP instance
+ *
+ */
+void irlap_close(struct irlap_cb *self)
+{
+ struct irlap_cb *lap;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* We used to send a LAP_DISC_INDICATION here, but this was
+ * racy. This has been move within irlmp_unregister_link()
+ * itself. Jean II */
+
+ /* Kill the LAP and all LSAPs on top of it */
+ irlmp_unregister_link(self->saddr);
+ self->notify.instance = NULL;
+
+ /* Be sure that we manage to remove ourself from the hash */
+ lap = hashbin_remove(irlap, self->saddr, NULL);
+ if (!lap) {
+ pr_debug("%s(), Didn't find myself!\n", __func__);
+ return;
+ }
+ __irlap_close(lap);
+}
+EXPORT_SYMBOL(irlap_close);
+
+/*
+ * Function irlap_connect_indication (self, skb)
+ *
+ * Another device is attempting to make a connection
+ *
+ */
+void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_init_qos_capabilities(self, NULL); /* No user QoS! */
+
+ irlmp_link_connect_indication(self->notify.instance, self->saddr,
+ self->daddr, &self->qos_tx, skb);
+}
+
+/*
+ * Function irlap_connect_response (self, skb)
+ *
+ * Service user has accepted incoming connection
+ *
+ */
+void irlap_connect_response(struct irlap_cb *self, struct sk_buff *userdata)
+{
+ irlap_do_event(self, CONNECT_RESPONSE, userdata, NULL);
+}
+
+/*
+ * Function irlap_connect_request (self, daddr, qos_user, sniff)
+ *
+ * Request connection with another device, sniffing is not implemented
+ * yet.
+ *
+ */
+void irlap_connect_request(struct irlap_cb *self, __u32 daddr,
+ struct qos_info *qos_user, int sniff)
+{
+ pr_debug("%s(), daddr=0x%08x\n", __func__, daddr);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ self->daddr = daddr;
+
+ /*
+ * If the service user specifies QoS values for this connection,
+ * then use them
+ */
+ irlap_init_qos_capabilities(self, qos_user);
+
+ if ((self->state == LAP_NDM) && !self->media_busy)
+ irlap_do_event(self, CONNECT_REQUEST, NULL, NULL);
+ else
+ self->connect_pending = TRUE;
+}
+
+/*
+ * Function irlap_connect_confirm (self, skb)
+ *
+ * Connection request has been accepted
+ *
+ */
+void irlap_connect_confirm(struct irlap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlmp_link_connect_confirm(self->notify.instance, &self->qos_tx, skb);
+}
+
+/*
+ * Function irlap_data_indication (self, skb)
+ *
+ * Received data frames from IR-port, so we just pass them up to
+ * IrLMP for further processing
+ *
+ */
+void irlap_data_indication(struct irlap_cb *self, struct sk_buff *skb,
+ int unreliable)
+{
+ /* Hide LAP header from IrLMP layer */
+ skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+ irlmp_link_data_indication(self->notify.instance, skb, unreliable);
+}
+
+
+/*
+ * Function irlap_data_request (self, skb)
+ *
+ * Queue data for transmission, must wait until XMIT state
+ *
+ */
+void irlap_data_request(struct irlap_cb *self, struct sk_buff *skb,
+ int unreliable)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER),
+ return;);
+ skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+ /*
+ * Must set frame format now so that the rest of the code knows
+ * if its dealing with an I or an UI frame
+ */
+ if (unreliable)
+ skb->data[1] = UI_FRAME;
+ else
+ skb->data[1] = I_FRAME;
+
+ /* Don't forget to refcount it - see irlmp_connect_request(). */
+ skb_get(skb);
+
+ /* Add at the end of the queue (keep ordering) - Jean II */
+ skb_queue_tail(&self->txq, skb);
+
+ /*
+ * Send event if this frame only if we are in the right state
+ * FIXME: udata should be sent first! (skb_queue_head?)
+ */
+ if ((self->state == LAP_XMIT_P) || (self->state == LAP_XMIT_S)) {
+ /* If we are not already processing the Tx queue, trigger
+ * transmission immediately - Jean II */
+ if((skb_queue_len(&self->txq) <= 1) && (!self->local_busy))
+ irlap_do_event(self, DATA_REQUEST, skb, NULL);
+ /* Otherwise, the packets will be sent normally at the
+ * next pf-poll - Jean II */
+ }
+}
+
+/*
+ * Function irlap_unitdata_request (self, skb)
+ *
+ * Send Ultra data. This is data that must be sent outside any connection
+ *
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlap_unitdata_request(struct irlap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER),
+ return;);
+ skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+ skb->data[0] = CBROADCAST;
+ skb->data[1] = UI_FRAME;
+
+ /* Don't need to refcount, see irlmp_connless_data_request() */
+
+ skb_queue_tail(&self->txq_ultra, skb);
+
+ irlap_do_event(self, SEND_UI_FRAME, NULL, NULL);
+}
+#endif /*CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlap_udata_indication (self, skb)
+ *
+ * Receive Ultra data. This is data that is received outside any connection
+ *
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlap_unitdata_indication(struct irlap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Hide LAP header from IrLMP layer */
+ skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+ irlmp_link_unitdata_indication(self->notify.instance, skb);
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlap_disconnect_request (void)
+ *
+ * Request to disconnect connection by service user
+ */
+void irlap_disconnect_request(struct irlap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Don't disconnect until all data frames are successfully sent */
+ if (!skb_queue_empty(&self->txq)) {
+ self->disconnect_pending = TRUE;
+ return;
+ }
+
+ /* Check if we are in the right state for disconnecting */
+ switch (self->state) {
+ case LAP_XMIT_P: /* FALLTHROUGH */
+ case LAP_XMIT_S: /* FALLTHROUGH */
+ case LAP_CONN: /* FALLTHROUGH */
+ case LAP_RESET_WAIT: /* FALLTHROUGH */
+ case LAP_RESET_CHECK:
+ irlap_do_event(self, DISCONNECT_REQUEST, NULL, NULL);
+ break;
+ default:
+ pr_debug("%s(), disconnect pending!\n", __func__);
+ self->disconnect_pending = TRUE;
+ break;
+ }
+}
+
+/*
+ * Function irlap_disconnect_indication (void)
+ *
+ * Disconnect request from other device
+ *
+ */
+void irlap_disconnect_indication(struct irlap_cb *self, LAP_REASON reason)
+{
+ pr_debug("%s(), reason=%s\n", __func__, lap_reasons[reason]);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Flush queues */
+ irlap_flush_all_queues(self);
+
+ switch (reason) {
+ case LAP_RESET_INDICATION:
+ pr_debug("%s(), Sending reset request!\n", __func__);
+ irlap_do_event(self, RESET_REQUEST, NULL, NULL);
+ break;
+ case LAP_NO_RESPONSE: /* FALLTHROUGH */
+ case LAP_DISC_INDICATION: /* FALLTHROUGH */
+ case LAP_FOUND_NONE: /* FALLTHROUGH */
+ case LAP_MEDIA_BUSY:
+ irlmp_link_disconnect_indication(self->notify.instance, self,
+ reason, NULL);
+ break;
+ default:
+ net_err_ratelimited("%s: Unknown reason %d\n",
+ __func__, reason);
+ }
+}
+
+/*
+ * Function irlap_discovery_request (gen_addr_bit)
+ *
+ * Start one single discovery operation.
+ *
+ */
+void irlap_discovery_request(struct irlap_cb *self, discovery_t *discovery)
+{
+ struct irlap_info info;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(discovery != NULL, return;);
+
+ pr_debug("%s(), nslots = %d\n", __func__, discovery->nslots);
+
+ IRDA_ASSERT((discovery->nslots == 1) || (discovery->nslots == 6) ||
+ (discovery->nslots == 8) || (discovery->nslots == 16),
+ return;);
+
+ /* Discovery is only possible in NDM mode */
+ if (self->state != LAP_NDM) {
+ pr_debug("%s(), discovery only possible in NDM mode\n",
+ __func__);
+ irlap_discovery_confirm(self, NULL);
+ /* Note : in theory, if we are not in NDM, we could postpone
+ * the discovery like we do for connection request.
+ * In practice, it's not worth it. If the media was busy,
+ * it's likely next time around it won't be busy. If we are
+ * in REPLY state, we will get passive discovery info & event.
+ * Jean II */
+ return;
+ }
+
+ /* Check if last discovery request finished in time, or if
+ * it was aborted due to the media busy flag. */
+ if (self->discovery_log != NULL) {
+ hashbin_delete(self->discovery_log, (FREE_FUNC) kfree);
+ self->discovery_log = NULL;
+ }
+
+ /* All operations will occur at predictable time, no need to lock */
+ self->discovery_log = hashbin_new(HB_NOLOCK);
+
+ if (self->discovery_log == NULL) {
+ net_warn_ratelimited("%s(), Unable to allocate discovery log!\n",
+ __func__);
+ return;
+ }
+
+ info.S = discovery->nslots; /* Number of slots */
+ info.s = 0; /* Current slot */
+
+ self->discovery_cmd = discovery;
+ info.discovery = discovery;
+
+ /* sysctl_slot_timeout bounds are checked in irsysctl.c - Jean II */
+ self->slot_timeout = msecs_to_jiffies(sysctl_slot_timeout);
+
+ irlap_do_event(self, DISCOVERY_REQUEST, NULL, &info);
+}
+
+/*
+ * Function irlap_discovery_confirm (log)
+ *
+ * A device has been discovered in front of this station, we
+ * report directly to LMP.
+ */
+void irlap_discovery_confirm(struct irlap_cb *self, hashbin_t *discovery_log)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ IRDA_ASSERT(self->notify.instance != NULL, return;);
+
+ /*
+ * Check for successful discovery, since we are then allowed to clear
+ * the media busy condition (IrLAP 6.13.4 - p.94). This should allow
+ * us to make connection attempts much faster and easier (i.e. no
+ * collisions).
+ * Setting media busy to false will also generate an event allowing
+ * to process pending events in NDM state machine.
+ * Note : the spec doesn't define what's a successful discovery is.
+ * If we want Ultra to work, it's successful even if there is
+ * nobody discovered - Jean II
+ */
+ if (discovery_log)
+ irda_device_set_media_busy(self->netdev, FALSE);
+
+ /* Inform IrLMP */
+ irlmp_link_discovery_confirm(self->notify.instance, discovery_log);
+}
+
+/*
+ * Function irlap_discovery_indication (log)
+ *
+ * Somebody is trying to discover us!
+ *
+ */
+void irlap_discovery_indication(struct irlap_cb *self, discovery_t *discovery)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(discovery != NULL, return;);
+
+ IRDA_ASSERT(self->notify.instance != NULL, return;);
+
+ /* A device is very likely to connect immediately after it performs
+ * a successful discovery. This means that in our case, we are much
+ * more likely to receive a connection request over the medium.
+ * So, we backoff to avoid collisions.
+ * IrLAP spec 6.13.4 suggest 100ms...
+ * Note : this little trick actually make a *BIG* difference. If I set
+ * my Linux box with discovery enabled and one Ultra frame sent every
+ * second, my Palm has no trouble connecting to it every time !
+ * Jean II */
+ irda_device_set_media_busy(self->netdev, SMALL);
+
+ irlmp_link_discovery_indication(self->notify.instance, discovery);
+}
+
+/*
+ * Function irlap_status_indication (quality_of_link)
+ */
+void irlap_status_indication(struct irlap_cb *self, int quality_of_link)
+{
+ switch (quality_of_link) {
+ case STATUS_NO_ACTIVITY:
+ net_info_ratelimited("IrLAP, no activity on link!\n");
+ break;
+ case STATUS_NOISY:
+ net_info_ratelimited("IrLAP, noisy link!\n");
+ break;
+ default:
+ break;
+ }
+ irlmp_status_indication(self->notify.instance,
+ quality_of_link, LOCK_NO_CHANGE);
+}
+
+/*
+ * Function irlap_reset_indication (void)
+ */
+void irlap_reset_indication(struct irlap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ if (self->state == LAP_RESET_WAIT)
+ irlap_do_event(self, RESET_REQUEST, NULL, NULL);
+ else
+ irlap_do_event(self, RESET_RESPONSE, NULL, NULL);
+}
+
+/*
+ * Function irlap_reset_confirm (void)
+ */
+void irlap_reset_confirm(void)
+{
+}
+
+/*
+ * Function irlap_generate_rand_time_slot (S, s)
+ *
+ * Generate a random time slot between s and S-1 where
+ * S = Number of slots (0 -> S-1)
+ * s = Current slot
+ */
+int irlap_generate_rand_time_slot(int S, int s)
+{
+ static int rand;
+ int slot;
+
+ IRDA_ASSERT((S - s) > 0, return 0;);
+
+ rand += jiffies;
+ rand ^= (rand << 12);
+ rand ^= (rand >> 20);
+
+ slot = s + rand % (S-s);
+
+ IRDA_ASSERT((slot >= s) || (slot < S), return 0;);
+
+ return slot;
+}
+
+/*
+ * Function irlap_update_nr_received (nr)
+ *
+ * Remove all acknowledged frames in current window queue. This code is
+ * not intuitive and you should not try to change it. If you think it
+ * contains bugs, please mail a patch to the author instead.
+ */
+void irlap_update_nr_received(struct irlap_cb *self, int nr)
+{
+ struct sk_buff *skb = NULL;
+ int count = 0;
+
+ /*
+ * Remove all the ack-ed frames from the window queue.
+ */
+
+ /*
+ * Optimize for the common case. It is most likely that the receiver
+ * will acknowledge all the frames we have sent! So in that case we
+ * delete all frames stored in window.
+ */
+ if (nr == self->vs) {
+ while ((skb = skb_dequeue(&self->wx_list)) != NULL) {
+ dev_kfree_skb(skb);
+ }
+ /* The last acked frame is the next to send minus one */
+ self->va = nr - 1;
+ } else {
+ /* Remove all acknowledged frames in current window */
+ while ((skb_peek(&self->wx_list) != NULL) &&
+ (((self->va+1) % 8) != nr))
+ {
+ skb = skb_dequeue(&self->wx_list);
+ dev_kfree_skb(skb);
+
+ self->va = (self->va + 1) % 8;
+ count++;
+ }
+ }
+
+ /* Advance window */
+ self->window = self->window_size - skb_queue_len(&self->wx_list);
+}
+
+/*
+ * Function irlap_validate_ns_received (ns)
+ *
+ * Validate the next to send (ns) field from received frame.
+ */
+int irlap_validate_ns_received(struct irlap_cb *self, int ns)
+{
+ /* ns as expected? */
+ if (ns == self->vr)
+ return NS_EXPECTED;
+ /*
+ * Stations are allowed to treat invalid NS as unexpected NS
+ * IrLAP, Recv ... with-invalid-Ns. p. 84
+ */
+ return NS_UNEXPECTED;
+
+ /* return NR_INVALID; */
+}
+/*
+ * Function irlap_validate_nr_received (nr)
+ *
+ * Validate the next to receive (nr) field from received frame.
+ *
+ */
+int irlap_validate_nr_received(struct irlap_cb *self, int nr)
+{
+ /* nr as expected? */
+ if (nr == self->vs) {
+ pr_debug("%s(), expected!\n", __func__);
+ return NR_EXPECTED;
+ }
+
+ /*
+ * unexpected nr? (but within current window), first we check if the
+ * ns numbers of the frames in the current window wrap.
+ */
+ if (self->va < self->vs) {
+ if ((nr >= self->va) && (nr <= self->vs))
+ return NR_UNEXPECTED;
+ } else {
+ if ((nr >= self->va) || (nr <= self->vs))
+ return NR_UNEXPECTED;
+ }
+
+ /* Invalid nr! */
+ return NR_INVALID;
+}
+
+/*
+ * Function irlap_initiate_connection_state ()
+ *
+ * Initialize the connection state parameters
+ *
+ */
+void irlap_initiate_connection_state(struct irlap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Next to send and next to receive */
+ self->vs = self->vr = 0;
+
+ /* Last frame which got acked (0 - 1) % 8 */
+ self->va = 7;
+
+ self->window = 1;
+
+ self->remote_busy = FALSE;
+ self->retry_count = 0;
+}
+
+/*
+ * Function irlap_wait_min_turn_around (self, qos)
+ *
+ * Wait negotiated minimum turn around time, this function actually sets
+ * the number of BOS's that must be sent before the next transmitted
+ * frame in order to delay for the specified amount of time. This is
+ * done to avoid using timers, and the forbidden udelay!
+ */
+void irlap_wait_min_turn_around(struct irlap_cb *self, struct qos_info *qos)
+{
+ __u32 min_turn_time;
+ __u32 speed;
+
+ /* Get QoS values. */
+ speed = qos->baud_rate.value;
+ min_turn_time = qos->min_turn_time.value;
+
+ /* No need to calculate XBOFs for speeds over 115200 bps */
+ if (speed > 115200) {
+ self->mtt_required = min_turn_time;
+ return;
+ }
+
+ /*
+ * Send additional BOF's for the next frame for the requested
+ * min turn time, so now we must calculate how many chars (XBOF's) we
+ * must send for the requested time period (min turn time)
+ */
+ self->xbofs_delay = irlap_min_turn_time_in_bytes(speed, min_turn_time);
+}
+
+/*
+ * Function irlap_flush_all_queues (void)
+ *
+ * Flush all queues
+ *
+ */
+void irlap_flush_all_queues(struct irlap_cb *self)
+{
+ struct sk_buff* skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Free transmission queue */
+ while ((skb = skb_dequeue(&self->txq)) != NULL)
+ dev_kfree_skb(skb);
+
+ while ((skb = skb_dequeue(&self->txq_ultra)) != NULL)
+ dev_kfree_skb(skb);
+
+ /* Free sliding window buffered packets */
+ while ((skb = skb_dequeue(&self->wx_list)) != NULL)
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function irlap_setspeed (self, speed)
+ *
+ * Change the speed of the IrDA port
+ *
+ */
+static void irlap_change_speed(struct irlap_cb *self, __u32 speed, int now)
+{
+ struct sk_buff *skb;
+
+ pr_debug("%s(), setting speed to %d\n", __func__, speed);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ self->speed = speed;
+
+ /* Change speed now, or just piggyback speed on frames */
+ if (now) {
+ /* Send down empty frame to trigger speed change */
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (skb)
+ irlap_queue_xmit(self, skb);
+ }
+}
+
+/*
+ * Function irlap_init_qos_capabilities (self, qos)
+ *
+ * Initialize QoS for this IrLAP session, What we do is to compute the
+ * intersection of the QoS capabilities for the user, driver and for
+ * IrLAP itself. Normally, IrLAP will not specify any values, but it can
+ * be used to restrict certain values.
+ */
+static void irlap_init_qos_capabilities(struct irlap_cb *self,
+ struct qos_info *qos_user)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(self->netdev != NULL, return;);
+
+ /* Start out with the maximum QoS support possible */
+ irda_init_max_qos_capabilies(&self->qos_rx);
+
+ /* Apply drivers QoS capabilities */
+ irda_qos_compute_intersection(&self->qos_rx, self->qos_dev);
+
+ /*
+ * Check for user supplied QoS parameters. The service user is only
+ * allowed to supply these values. We check each parameter since the
+ * user may not have set all of them.
+ */
+ if (qos_user) {
+ pr_debug("%s(), Found user specified QoS!\n", __func__);
+
+ if (qos_user->baud_rate.bits)
+ self->qos_rx.baud_rate.bits &= qos_user->baud_rate.bits;
+
+ if (qos_user->max_turn_time.bits)
+ self->qos_rx.max_turn_time.bits &= qos_user->max_turn_time.bits;
+ if (qos_user->data_size.bits)
+ self->qos_rx.data_size.bits &= qos_user->data_size.bits;
+
+ if (qos_user->link_disc_time.bits)
+ self->qos_rx.link_disc_time.bits &= qos_user->link_disc_time.bits;
+ }
+
+ /* Use 500ms in IrLAP for now */
+ self->qos_rx.max_turn_time.bits &= 0x01;
+
+ /* Set data size */
+ /*self->qos_rx.data_size.bits &= 0x03;*/
+
+ irda_qos_bits_to_value(&self->qos_rx);
+}
+
+/*
+ * Function irlap_apply_default_connection_parameters (void, now)
+ *
+ * Use the default connection and transmission parameters
+ */
+void irlap_apply_default_connection_parameters(struct irlap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* xbofs : Default value in NDM */
+ self->next_bofs = 12;
+ self->bofs_count = 12;
+
+ /* NDM Speed is 9600 */
+ irlap_change_speed(self, 9600, TRUE);
+
+ /* Set mbusy when going to NDM state */
+ irda_device_set_media_busy(self->netdev, TRUE);
+
+ /*
+ * Generate random connection address for this session, which must
+ * be 7 bits wide and different from 0x00 and 0xfe
+ */
+ while ((self->caddr == 0x00) || (self->caddr == 0xfe)) {
+ get_random_bytes(&self->caddr, sizeof(self->caddr));
+ self->caddr &= 0xfe;
+ }
+
+ /* Use default values until connection has been negitiated */
+ self->slot_timeout = sysctl_slot_timeout;
+ self->final_timeout = FINAL_TIMEOUT;
+ self->poll_timeout = POLL_TIMEOUT;
+ self->wd_timeout = WD_TIMEOUT;
+
+ /* Set some default values */
+ self->qos_tx.baud_rate.value = 9600;
+ self->qos_rx.baud_rate.value = 9600;
+ self->qos_tx.max_turn_time.value = 0;
+ self->qos_rx.max_turn_time.value = 0;
+ self->qos_tx.min_turn_time.value = 0;
+ self->qos_rx.min_turn_time.value = 0;
+ self->qos_tx.data_size.value = 64;
+ self->qos_rx.data_size.value = 64;
+ self->qos_tx.window_size.value = 1;
+ self->qos_rx.window_size.value = 1;
+ self->qos_tx.additional_bofs.value = 12;
+ self->qos_rx.additional_bofs.value = 12;
+ self->qos_tx.link_disc_time.value = 0;
+ self->qos_rx.link_disc_time.value = 0;
+
+ irlap_flush_all_queues(self);
+
+ self->disconnect_pending = FALSE;
+ self->connect_pending = FALSE;
+}
+
+/*
+ * Function irlap_apply_connection_parameters (qos, now)
+ *
+ * Initialize IrLAP with the negotiated QoS values
+ *
+ * If 'now' is false, the speed and xbofs will be changed after the next
+ * frame is sent.
+ * If 'now' is true, the speed and xbofs is changed immediately
+ */
+void irlap_apply_connection_parameters(struct irlap_cb *self, int now)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Set the negotiated xbofs value */
+ self->next_bofs = self->qos_tx.additional_bofs.value;
+ if (now)
+ self->bofs_count = self->next_bofs;
+
+ /* Set the negotiated link speed (may need the new xbofs value) */
+ irlap_change_speed(self, self->qos_tx.baud_rate.value, now);
+
+ self->window_size = self->qos_tx.window_size.value;
+ self->window = self->qos_tx.window_size.value;
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ /*
+ * Calculate how many bytes it is possible to transmit before the
+ * link must be turned around
+ */
+ self->line_capacity =
+ irlap_max_line_capacity(self->qos_tx.baud_rate.value,
+ self->qos_tx.max_turn_time.value);
+ self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+
+ /*
+ * Initialize timeout values, some of the rules are listed on
+ * page 92 in IrLAP.
+ */
+ IRDA_ASSERT(self->qos_tx.max_turn_time.value != 0, return;);
+ IRDA_ASSERT(self->qos_rx.max_turn_time.value != 0, return;);
+ /* The poll timeout applies only to the primary station.
+ * It defines the maximum time the primary stay in XMIT mode
+ * before timeout and turning the link around (sending a RR).
+ * Or, this is how much we can keep the pf bit in primary mode.
+ * Therefore, it must be lower or equal than our *OWN* max turn around.
+ * Jean II */
+ self->poll_timeout = msecs_to_jiffies(
+ self->qos_tx.max_turn_time.value);
+ /* The Final timeout applies only to the primary station.
+ * It defines the maximum time the primary wait (mostly in RECV mode)
+ * for an answer from the secondary station before polling it again.
+ * Therefore, it must be greater or equal than our *PARTNER*
+ * max turn around time - Jean II */
+ self->final_timeout = msecs_to_jiffies(
+ self->qos_rx.max_turn_time.value);
+ /* The Watchdog Bit timeout applies only to the secondary station.
+ * It defines the maximum time the secondary wait (mostly in RECV mode)
+ * for poll from the primary station before getting annoyed.
+ * Therefore, it must be greater or equal than our *PARTNER*
+ * max turn around time - Jean II */
+ self->wd_timeout = self->final_timeout * 2;
+
+ /*
+ * N1 and N2 are maximum retry count for *both* the final timer
+ * and the wd timer (with a factor 2) as defined above.
+ * After N1 retry of a timer, we give a warning to the user.
+ * After N2 retry, we consider the link dead and disconnect it.
+ * Jean II
+ */
+
+ /*
+ * Set N1 to 0 if Link Disconnect/Threshold Time = 3 and set it to
+ * 3 seconds otherwise. See page 71 in IrLAP for more details.
+ * Actually, it's not always 3 seconds, as we allow to set
+ * it via sysctl... Max maxtt is 500ms, and N1 need to be multiple
+ * of 2, so 1 second is minimum we can allow. - Jean II
+ */
+ if (self->qos_tx.link_disc_time.value == sysctl_warn_noreply_time)
+ /*
+ * If we set N1 to 0, it will trigger immediately, which is
+ * not what we want. What we really want is to disable it,
+ * Jean II
+ */
+ self->N1 = -2; /* Disable - Need to be multiple of 2*/
+ else
+ self->N1 = sysctl_warn_noreply_time * 1000 /
+ self->qos_rx.max_turn_time.value;
+
+ pr_debug("Setting N1 = %d\n", self->N1);
+
+ /* Set N2 to match our own disconnect time */
+ self->N2 = self->qos_tx.link_disc_time.value * 1000 /
+ self->qos_rx.max_turn_time.value;
+ pr_debug("Setting N2 = %d\n", self->N2);
+}
+
+#ifdef CONFIG_PROC_FS
+struct irlap_iter_state {
+ int id;
+};
+
+static void *irlap_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct irlap_iter_state *iter = seq->private;
+ struct irlap_cb *self;
+
+ /* Protect our access to the tsap list */
+ spin_lock_irq(&irlap->hb_spinlock);
+ iter->id = 0;
+
+ for (self = (struct irlap_cb *) hashbin_get_first(irlap);
+ self; self = (struct irlap_cb *) hashbin_get_next(irlap)) {
+ if (iter->id == *pos)
+ break;
+ ++iter->id;
+ }
+
+ return self;
+}
+
+static void *irlap_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct irlap_iter_state *iter = seq->private;
+
+ ++*pos;
+ ++iter->id;
+ return (void *) hashbin_get_next(irlap);
+}
+
+static void irlap_seq_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_irq(&irlap->hb_spinlock);
+}
+
+static int irlap_seq_show(struct seq_file *seq, void *v)
+{
+ const struct irlap_iter_state *iter = seq->private;
+ const struct irlap_cb *self = v;
+
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -EINVAL;);
+
+ seq_printf(seq, "irlap%d ", iter->id);
+ seq_printf(seq, "state: %s\n",
+ irlap_state[self->state]);
+
+ seq_printf(seq, " device name: %s, ",
+ (self->netdev) ? self->netdev->name : "bug");
+ seq_printf(seq, "hardware name: %s\n", self->hw_name);
+
+ seq_printf(seq, " caddr: %#02x, ", self->caddr);
+ seq_printf(seq, "saddr: %#08x, ", self->saddr);
+ seq_printf(seq, "daddr: %#08x\n", self->daddr);
+
+ seq_printf(seq, " win size: %d, ",
+ self->window_size);
+ seq_printf(seq, "win: %d, ", self->window);
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ seq_printf(seq, "line capacity: %d, ",
+ self->line_capacity);
+ seq_printf(seq, "bytes left: %d\n", self->bytes_left);
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ seq_printf(seq, " tx queue len: %d ",
+ skb_queue_len(&self->txq));
+ seq_printf(seq, "win queue len: %d ",
+ skb_queue_len(&self->wx_list));
+ seq_printf(seq, "rbusy: %s", self->remote_busy ?
+ "TRUE" : "FALSE");
+ seq_printf(seq, " mbusy: %s\n", self->media_busy ?
+ "TRUE" : "FALSE");
+
+ seq_printf(seq, " retrans: %d ", self->retry_count);
+ seq_printf(seq, "vs: %d ", self->vs);
+ seq_printf(seq, "vr: %d ", self->vr);
+ seq_printf(seq, "va: %d\n", self->va);
+
+ seq_printf(seq, " qos\tbps\tmaxtt\tdsize\twinsize\taddbofs\tmintt\tldisc\tcomp\n");
+
+ seq_printf(seq, " tx\t%d\t",
+ self->qos_tx.baud_rate.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.max_turn_time.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.data_size.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.window_size.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.additional_bofs.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.min_turn_time.value);
+ seq_printf(seq, "%d\t",
+ self->qos_tx.link_disc_time.value);
+ seq_printf(seq, "\n");
+
+ seq_printf(seq, " rx\t%d\t",
+ self->qos_rx.baud_rate.value);
+ seq_printf(seq, "%d\t",
+ self->qos_rx.max_turn_time.value);
+ seq_printf(seq, "%d\t",
+ self->qos_rx.data_size.value);
+ seq_printf(seq, "%d\t",
+ self->qos_rx.window_size.value);
+ seq_printf(seq, "%d\t",
+ self->qos_rx.additional_bofs.value);
+ seq_printf(seq, "%d\t",
+ self->qos_rx.min_turn_time.value);
+ seq_printf(seq, "%d\n",
+ self->qos_rx.link_disc_time.value);
+
+ return 0;
+}
+
+static const struct seq_operations irlap_seq_ops = {
+ .start = irlap_seq_start,
+ .next = irlap_seq_next,
+ .stop = irlap_seq_stop,
+ .show = irlap_seq_show,
+};
+
+static int irlap_seq_open(struct inode *inode, struct file *file)
+{
+ if (irlap == NULL)
+ return -EINVAL;
+
+ return seq_open_private(file, &irlap_seq_ops,
+ sizeof(struct irlap_iter_state));
+}
+
+const struct file_operations irlap_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = irlap_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
new file mode 100644
index 000000000..0e1b4d79f
--- /dev/null
+++ b/net/irda/irlap_event.c
@@ -0,0 +1,2316 @@
+/*********************************************************************
+ *
+ * Filename: irlap_event.c
+ * Version: 0.9
+ * Description: IrLAP state machine implementation
+ * Status: Experimental.
+ * Author: Dag Brattli <dag@brattli.net>
+ * Created at: Sat Aug 16 00:59:29 1997
+ * Modified at: Sat Dec 25 21:07:57 1999
+ * Modified by: Dag Brattli <dag@brattli.net>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dag@brattli.net>,
+ * Copyright (c) 1998 Thomas Davis <ratbert@radiks.net>
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap_event.h>
+
+#include <net/irda/timer.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/qos.h>
+#include <net/irda/parameters.h>
+#include <net/irda/irlmp.h> /* irlmp_flow_indication(), ... */
+
+#include <net/irda/irda_device.h>
+
+#ifdef CONFIG_IRDA_FAST_RR
+int sysctl_fast_poll_increase = 50;
+#endif
+
+static int irlap_state_ndm (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_query (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reply (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_conn (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_setup (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_xmit_p (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_pclose (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_nrm_p (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_nrm_s (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_xmit_s (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_sclose (struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset_check(struct irlap_cb *, IRLAP_EVENT event,
+ struct sk_buff *, struct irlap_info *);
+
+static const char *const irlap_event[] __maybe_unused = {
+ "DISCOVERY_REQUEST",
+ "CONNECT_REQUEST",
+ "CONNECT_RESPONSE",
+ "DISCONNECT_REQUEST",
+ "DATA_REQUEST",
+ "RESET_REQUEST",
+ "RESET_RESPONSE",
+ "SEND_I_CMD",
+ "SEND_UI_FRAME",
+ "RECV_DISCOVERY_XID_CMD",
+ "RECV_DISCOVERY_XID_RSP",
+ "RECV_SNRM_CMD",
+ "RECV_TEST_CMD",
+ "RECV_TEST_RSP",
+ "RECV_UA_RSP",
+ "RECV_DM_RSP",
+ "RECV_RD_RSP",
+ "RECV_I_CMD",
+ "RECV_I_RSP",
+ "RECV_UI_FRAME",
+ "RECV_FRMR_RSP",
+ "RECV_RR_CMD",
+ "RECV_RR_RSP",
+ "RECV_RNR_CMD",
+ "RECV_RNR_RSP",
+ "RECV_REJ_CMD",
+ "RECV_REJ_RSP",
+ "RECV_SREJ_CMD",
+ "RECV_SREJ_RSP",
+ "RECV_DISC_CMD",
+ "SLOT_TIMER_EXPIRED",
+ "QUERY_TIMER_EXPIRED",
+ "FINAL_TIMER_EXPIRED",
+ "POLL_TIMER_EXPIRED",
+ "DISCOVERY_TIMER_EXPIRED",
+ "WD_TIMER_EXPIRED",
+ "BACKOFF_TIMER_EXPIRED",
+ "MEDIA_BUSY_TIMER_EXPIRED",
+};
+
+const char *const irlap_state[] = {
+ "LAP_NDM",
+ "LAP_QUERY",
+ "LAP_REPLY",
+ "LAP_CONN",
+ "LAP_SETUP",
+ "LAP_OFFLINE",
+ "LAP_XMIT_P",
+ "LAP_PCLOSE",
+ "LAP_NRM_P",
+ "LAP_RESET_WAIT",
+ "LAP_RESET",
+ "LAP_NRM_S",
+ "LAP_XMIT_S",
+ "LAP_SCLOSE",
+ "LAP_RESET_CHECK",
+};
+
+static int (*state[])(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info) =
+{
+ irlap_state_ndm,
+ irlap_state_query,
+ irlap_state_reply,
+ irlap_state_conn,
+ irlap_state_setup,
+ irlap_state_offline,
+ irlap_state_xmit_p,
+ irlap_state_pclose,
+ irlap_state_nrm_p,
+ irlap_state_reset_wait,
+ irlap_state_reset,
+ irlap_state_nrm_s,
+ irlap_state_xmit_s,
+ irlap_state_sclose,
+ irlap_state_reset_check,
+};
+
+/*
+ * Function irda_poll_timer_expired (data)
+ *
+ * Poll timer has expired. Normally we must now send a RR frame to the
+ * remote device
+ */
+static void irlap_poll_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Calculate and set time before we will have to send back the pf bit
+ * to the peer. Use in primary.
+ * Make sure that state is XMIT_P/XMIT_S when calling this function
+ * (and that nobody messed up with the state). - Jean II
+ */
+static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+#ifdef CONFIG_IRDA_FAST_RR
+ /*
+ * Send out the RR frames faster if our own transmit queue is empty, or
+ * if the peer is busy. The effect is a much faster conversation
+ */
+ if (skb_queue_empty(&self->txq) || self->remote_busy) {
+ if (self->fast_RR == TRUE) {
+ /*
+ * Assert that the fast poll timer has not reached the
+ * normal poll timer yet
+ */
+ if (self->fast_RR_timeout < timeout) {
+ /*
+ * FIXME: this should be a more configurable
+ * function
+ */
+ self->fast_RR_timeout +=
+ (sysctl_fast_poll_increase * HZ/1000);
+
+ /* Use this fast(er) timeout instead */
+ timeout = self->fast_RR_timeout;
+ }
+ } else {
+ self->fast_RR = TRUE;
+
+ /* Start with just 0 ms */
+ self->fast_RR_timeout = 0;
+ timeout = 0;
+ }
+ } else
+ self->fast_RR = FALSE;
+
+ pr_debug("%s(), timeout=%d (%ld)\n", __func__, timeout, jiffies);
+#endif /* CONFIG_IRDA_FAST_RR */
+
+ if (timeout == 0)
+ irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL);
+ else
+ irda_start_timer(&self->poll_timer, timeout, self,
+ irlap_poll_timer_expired);
+}
+
+/*
+ * Function irlap_do_event (event, skb, info)
+ *
+ * Rushes through the state machine without any delay. If state == XMIT
+ * then send queued data frames.
+ */
+void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret;
+
+ if (!self || self->magic != LAP_MAGIC)
+ return;
+
+ pr_debug("%s(), event = %s, state = %s\n", __func__,
+ irlap_event[event], irlap_state[self->state]);
+
+ ret = (*state[self->state])(self, event, skb, info);
+
+ /*
+ * Check if there are any pending events that needs to be executed
+ */
+ switch (self->state) {
+ case LAP_XMIT_P: /* FALLTHROUGH */
+ case LAP_XMIT_S:
+ /*
+ * We just received the pf bit and are at the beginning
+ * of a new LAP transmit window.
+ * Check if there are any queued data frames, and do not
+ * try to disconnect link if we send any data frames, since
+ * that will change the state away form XMIT
+ */
+ pr_debug("%s() : queue len = %d\n", __func__,
+ skb_queue_len(&self->txq));
+
+ if (!skb_queue_empty(&self->txq)) {
+ /* Prevent race conditions with irlap_data_request() */
+ self->local_busy = TRUE;
+
+ /* Theory of operation.
+ * We send frames up to when we fill the window or
+ * reach line capacity. Those frames will queue up
+ * in the device queue, and the driver will slowly
+ * send them.
+ * After each frame that we send, we poll the higher
+ * layer for more data. It's the right time to do
+ * that because the link layer need to perform the mtt
+ * and then send the first frame, so we can afford
+ * to send a bit of time in kernel space.
+ * The explicit flow indication allow to minimise
+ * buffers (== lower latency), to avoid higher layer
+ * polling via timers (== less context switches) and
+ * to implement a crude scheduler - Jean II */
+
+ /* Try to send away all queued data frames */
+ while ((skb = skb_dequeue(&self->txq)) != NULL) {
+ /* Send one frame */
+ ret = (*state[self->state])(self, SEND_I_CMD,
+ skb, NULL);
+ /* Drop reference count.
+ * It will be increase as needed in
+ * irlap_send_data_xxx() */
+ kfree_skb(skb);
+
+ /* Poll the higher layers for one more frame */
+ irlmp_flow_indication(self->notify.instance,
+ FLOW_START);
+
+ if (ret == -EPROTO)
+ break; /* Try again later! */
+ }
+ /* Finished transmitting */
+ self->local_busy = FALSE;
+ } else if (self->disconnect_pending) {
+ self->disconnect_pending = FALSE;
+
+ ret = (*state[self->state])(self, DISCONNECT_REQUEST,
+ NULL, NULL);
+ }
+ break;
+/* case LAP_NDM: */
+/* case LAP_CONN: */
+/* case LAP_RESET_WAIT: */
+/* case LAP_RESET_CHECK: */
+ default:
+ break;
+ }
+}
+
+/*
+ * Function irlap_state_ndm (event, skb, frame)
+ *
+ * NDM (Normal Disconnected Mode) state
+ *
+ */
+static int irlap_state_ndm(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ discovery_t *discovery_rsp;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case CONNECT_REQUEST:
+ IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+ if (self->media_busy) {
+ /* Note : this will never happen, because we test
+ * media busy in irlap_connect_request() and
+ * postpone the event... - Jean II */
+ pr_debug("%s(), CONNECT_REQUEST: media busy!\n",
+ __func__);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_MEDIA_BUSY);
+ } else {
+ irlap_send_snrm_frame(self, &self->qos_rx);
+
+ /* Start Final-bit timer */
+ irlap_start_final_timer(self, self->final_timeout);
+
+ self->retry_count = 0;
+ irlap_next_state(self, LAP_SETUP);
+ }
+ break;
+ case RECV_SNRM_CMD:
+ /* Check if the frame contains and I field */
+ if (info) {
+ self->daddr = info->daddr;
+ self->caddr = info->caddr;
+
+ irlap_next_state(self, LAP_CONN);
+
+ irlap_connect_indication(self, skb);
+ } else {
+ pr_debug("%s(), SNRM frame does not contain an I field!\n",
+ __func__);
+ }
+ break;
+ case DISCOVERY_REQUEST:
+ IRDA_ASSERT(info != NULL, return -1;);
+
+ if (self->media_busy) {
+ pr_debug("%s(), DISCOVERY_REQUEST: media busy!\n",
+ __func__);
+ /* irlap->log.condition = MEDIA_BUSY; */
+
+ /* This will make IrLMP try again */
+ irlap_discovery_confirm(self, NULL);
+ /* Note : the discovery log is not cleaned up here,
+ * it will be done in irlap_discovery_request()
+ * Jean II */
+ return 0;
+ }
+
+ self->S = info->S;
+ self->s = info->s;
+ irlap_send_discovery_xid_frame(self, info->S, info->s, TRUE,
+ info->discovery);
+ self->frame_sent = FALSE;
+ self->s++;
+
+ irlap_start_slot_timer(self, self->slot_timeout);
+ irlap_next_state(self, LAP_QUERY);
+ break;
+ case RECV_DISCOVERY_XID_CMD:
+ IRDA_ASSERT(info != NULL, return -1;);
+
+ /* Assert that this is not the final slot */
+ if (info->s <= info->S) {
+ self->slot = irlap_generate_rand_time_slot(info->S,
+ info->s);
+ if (self->slot == info->s) {
+ discovery_rsp = irlmp_get_discovery_response();
+ discovery_rsp->data.daddr = info->daddr;
+
+ irlap_send_discovery_xid_frame(self, info->S,
+ self->slot,
+ FALSE,
+ discovery_rsp);
+ self->frame_sent = TRUE;
+ } else
+ self->frame_sent = FALSE;
+
+ /*
+ * Go to reply state until end of discovery to
+ * inhibit our own transmissions. Set the timer
+ * to not stay forever there... Jean II
+ */
+ irlap_start_query_timer(self, info->S, info->s);
+ irlap_next_state(self, LAP_REPLY);
+ } else {
+ /* This is the final slot. How is it possible ?
+ * This would happen is both discoveries are just slightly
+ * offset (if they are in sync, all packets are lost).
+ * Most often, all the discovery requests will be received
+ * in QUERY state (see my comment there), except for the
+ * last frame that will come here.
+ * The big trouble when it happen is that active discovery
+ * doesn't happen, because nobody answer the discoveries
+ * frame of the other guy, so the log shows up empty.
+ * What should we do ?
+ * Not much. It's too late to answer those discovery frames,
+ * so we just pass the info to IrLMP who will put it in the
+ * log (and post an event).
+ * Another cause would be devices that do discovery much
+ * slower than us, however the latest fixes should minimise
+ * those cases...
+ * Jean II
+ */
+ pr_debug("%s(), Receiving final discovery request, missed the discovery slots :-(\n",
+ __func__);
+
+ /* Last discovery request -> in the log */
+ irlap_discovery_indication(self, info->discovery);
+ }
+ break;
+ case MEDIA_BUSY_TIMER_EXPIRED:
+ /* A bunch of events may be postponed because the media is
+ * busy (usually immediately after we close a connection),
+ * or while we are doing discovery (state query/reply).
+ * In all those cases, the media busy flag will be cleared
+ * when it's OK for us to process those postponed events.
+ * This event is not mentioned in the state machines in the
+ * IrLAP spec. It's because they didn't consider Ultra and
+ * postponing connection request is optional.
+ * Jean II */
+#ifdef CONFIG_IRDA_ULTRA
+ /* Send any pending Ultra frames if any */
+ if (!skb_queue_empty(&self->txq_ultra)) {
+ /* We don't send the frame, just post an event.
+ * Also, previously this code was in timer.c...
+ * Jean II */
+ ret = (*state[self->state])(self, SEND_UI_FRAME,
+ NULL, NULL);
+ }
+#endif /* CONFIG_IRDA_ULTRA */
+ /* Check if we should try to connect.
+ * This code was previously in irlap_do_event() */
+ if (self->connect_pending) {
+ self->connect_pending = FALSE;
+
+ /* This one *should* not pend in this state, except
+ * if a socket try to connect and immediately
+ * disconnect. - clear - Jean II */
+ if (self->disconnect_pending)
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ else
+ ret = (*state[self->state])(self,
+ CONNECT_REQUEST,
+ NULL, NULL);
+ self->disconnect_pending = FALSE;
+ }
+ /* Note : one way to test if this code works well (including
+ * media busy and small busy) is to create a user space
+ * application generating an Ultra packet every 3.05 sec (or
+ * 2.95 sec) and to see how it interact with discovery.
+ * It's fairly easy to check that no packet is lost, that the
+ * packets are postponed during discovery and that after
+ * discovery indication you have a 100ms "gap".
+ * As connection request and Ultra are now processed the same
+ * way, this avoid the tedious job of trying IrLAP connection
+ * in all those cases...
+ * Jean II */
+ break;
+#ifdef CONFIG_IRDA_ULTRA
+ case SEND_UI_FRAME:
+ {
+ int i;
+ /* Only allowed to repeat an operation twice */
+ for (i=0; ((i<2) && (self->media_busy == FALSE)); i++) {
+ skb = skb_dequeue(&self->txq_ultra);
+ if (skb)
+ irlap_send_ui_frame(self, skb, CBROADCAST,
+ CMD_FRAME);
+ else
+ break;
+ /* irlap_send_ui_frame() won't increase skb reference
+ * count, so no dev_kfree_skb() - Jean II */
+ }
+ if (i == 2) {
+ /* Force us to listen 500 ms again */
+ irda_device_set_media_busy(self->netdev, TRUE);
+ }
+ break;
+ }
+ case RECV_UI_FRAME:
+ /* Only accept broadcast frames in NDM mode */
+ if (info->caddr != CBROADCAST) {
+ pr_debug("%s(), not a broadcast frame!\n",
+ __func__);
+ } else
+ irlap_unitdata_indication(self, skb);
+ break;
+#endif /* CONFIG_IRDA_ULTRA */
+ case RECV_TEST_CMD:
+ /* Remove test frame header */
+ skb_pull(skb, sizeof(struct test_frame));
+
+ /*
+ * Send response. This skb will not be sent out again, and
+ * will only be used to send out the same info as the cmd
+ */
+ irlap_send_test_frame(self, CBROADCAST, info->daddr, skb);
+ break;
+ case RECV_TEST_RSP:
+ pr_debug("%s() not implemented!\n", __func__);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n", __func__,
+ irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_query (event, skb, info)
+ *
+ * QUERY state
+ *
+ */
+static int irlap_state_query(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case RECV_DISCOVERY_XID_RSP:
+ IRDA_ASSERT(info != NULL, return -1;);
+ IRDA_ASSERT(info->discovery != NULL, return -1;);
+
+ pr_debug("%s(), daddr=%08x\n", __func__,
+ info->discovery->data.daddr);
+
+ if (!self->discovery_log) {
+ net_warn_ratelimited("%s: discovery log is gone! maybe the discovery timeout has been set too short?\n",
+ __func__);
+ break;
+ }
+ hashbin_insert(self->discovery_log,
+ (irda_queue_t *) info->discovery,
+ info->discovery->data.daddr, NULL);
+
+ /* Keep state */
+ /* irlap_next_state(self, LAP_QUERY); */
+
+ break;
+ case RECV_DISCOVERY_XID_CMD:
+ /* Yes, it is possible to receive those frames in this mode.
+ * Note that most often the last discovery request won't
+ * occur here but in NDM state (see my comment there).
+ * What should we do ?
+ * Not much. We are currently performing our own discovery,
+ * therefore we can't answer those frames. We don't want
+ * to change state either. We just pass the info to
+ * IrLMP who will put it in the log (and post an event).
+ * Jean II
+ */
+
+ IRDA_ASSERT(info != NULL, return -1;);
+
+ pr_debug("%s(), Receiving discovery request (s = %d) while performing discovery :-(\n",
+ __func__, info->s);
+
+ /* Last discovery request ? */
+ if (info->s == 0xff)
+ irlap_discovery_indication(self, info->discovery);
+ break;
+ case SLOT_TIMER_EXPIRED:
+ /*
+ * Wait a little longer if we detect an incoming frame. This
+ * is not mentioned in the spec, but is a good thing to do,
+ * since we want to work even with devices that violate the
+ * timing requirements.
+ */
+ if (irda_device_is_receiving(self->netdev) && !self->add_wait) {
+ pr_debug("%s(), device is slow to answer, waiting some more!\n",
+ __func__);
+ irlap_start_slot_timer(self, msecs_to_jiffies(10));
+ self->add_wait = TRUE;
+ return ret;
+ }
+ self->add_wait = FALSE;
+
+ if (self->s < self->S) {
+ irlap_send_discovery_xid_frame(self, self->S,
+ self->s, TRUE,
+ self->discovery_cmd);
+ self->s++;
+ irlap_start_slot_timer(self, self->slot_timeout);
+
+ /* Keep state */
+ irlap_next_state(self, LAP_QUERY);
+ } else {
+ /* This is the final slot! */
+ irlap_send_discovery_xid_frame(self, self->S, 0xff,
+ TRUE,
+ self->discovery_cmd);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ /*
+ * We are now finished with the discovery procedure,
+ * so now we must return the results
+ */
+ irlap_discovery_confirm(self, self->discovery_log);
+
+ /* IrLMP should now have taken care of the log */
+ self->discovery_log = NULL;
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n", __func__,
+ irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_reply (self, event, skb, info)
+ *
+ * REPLY, we have received a XID discovery frame from a device and we
+ * are waiting for the right time slot to send a response XID frame
+ *
+ */
+static int irlap_state_reply(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ discovery_t *discovery_rsp;
+ int ret=0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case QUERY_TIMER_EXPIRED:
+ pr_debug("%s(), QUERY_TIMER_EXPIRED <%ld>\n",
+ __func__, jiffies);
+ irlap_next_state(self, LAP_NDM);
+ break;
+ case RECV_DISCOVERY_XID_CMD:
+ IRDA_ASSERT(info != NULL, return -1;);
+ /* Last frame? */
+ if (info->s == 0xff) {
+ del_timer(&self->query_timer);
+
+ /* info->log.condition = REMOTE; */
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_discovery_indication(self, info->discovery);
+ } else {
+ /* If it's our slot, send our reply */
+ if ((info->s >= self->slot) && (!self->frame_sent)) {
+ discovery_rsp = irlmp_get_discovery_response();
+ discovery_rsp->data.daddr = info->daddr;
+
+ irlap_send_discovery_xid_frame(self, info->S,
+ self->slot,
+ FALSE,
+ discovery_rsp);
+
+ self->frame_sent = TRUE;
+ }
+ /* Readjust our timer to accommodate devices
+ * doing faster or slower discovery than us...
+ * Jean II */
+ irlap_start_query_timer(self, info->S, info->s);
+
+ /* Keep state */
+ //irlap_next_state(self, LAP_REPLY);
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d, %s\n", __func__,
+ event, irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_conn (event, skb, info)
+ *
+ * CONN, we have received a SNRM command and is waiting for the upper
+ * layer to accept or refuse connection
+ *
+ */
+static int irlap_state_conn(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s(), event=%s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case CONNECT_RESPONSE:
+ skb_pull(skb, sizeof(struct snrm_frame));
+
+ IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+ irlap_qos_negotiate(self, skb);
+
+ irlap_initiate_connection_state(self);
+
+ /*
+ * Applying the parameters now will make sure we change speed
+ * *after* we have sent the next frame
+ */
+ irlap_apply_connection_parameters(self, FALSE);
+
+ /*
+ * Sending this frame will force a speed change after it has
+ * been sent (i.e. the frame will be sent at 9600).
+ */
+ irlap_send_ua_response_frame(self, &self->qos_rx);
+
+#if 0
+ /*
+ * We are allowed to send two frames, but this may increase
+ * the connect latency, so lets not do it for now.
+ */
+ /* This is full of good intentions, but doesn't work in
+ * practice.
+ * After sending the first UA response, we switch the
+ * dongle to the negotiated speed, which is usually
+ * different than 9600 kb/s.
+ * From there, there is two solutions :
+ * 1) The other end has received the first UA response :
+ * it will set up the connection, move to state LAP_NRM_P,
+ * and will ignore and drop the second UA response.
+ * Actually, it's even worse : the other side will almost
+ * immediately send a RR that will likely collide with the
+ * UA response (depending on negotiated turnaround).
+ * 2) The other end has not received the first UA response,
+ * will stay at 9600 and will never see the second UA response.
+ * Jean II */
+ irlap_send_ua_response_frame(self, &self->qos_rx);
+#endif
+
+ /*
+ * The WD-timer could be set to the duration of the P-timer
+ * for this case, but it is recommended to use twice the
+ * value (note 3 IrLAP p. 60).
+ */
+ irlap_start_wd_timer(self, self->wd_timeout);
+ irlap_next_state(self, LAP_NRM_S);
+
+ break;
+ case RECV_DISCOVERY_XID_CMD:
+ pr_debug("%s(), event RECV_DISCOVER_XID_CMD!\n",
+ __func__);
+ irlap_next_state(self, LAP_NDM);
+
+ break;
+ case DISCONNECT_REQUEST:
+ pr_debug("%s(), Disconnect request!\n", __func__);
+ irlap_send_dm_frame(self);
+ irlap_next_state( self, LAP_NDM);
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d, %s\n", __func__,
+ event, irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Function irlap_state_setup (event, skb, frame)
+ *
+ * SETUP state, The local layer has transmitted a SNRM command frame to
+ * a remote peer layer and is awaiting a reply .
+ *
+ */
+static int irlap_state_setup(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case FINAL_TIMER_EXPIRED:
+ if (self->retry_count < self->N3) {
+/*
+ * Perform random backoff, Wait a random number of time units, minimum
+ * duration half the time taken to transmitt a SNRM frame, maximum duration
+ * 1.5 times the time taken to transmit a SNRM frame. So this time should
+ * between 15 msecs and 45 msecs.
+ */
+ irlap_start_backoff_timer(self, msecs_to_jiffies(20 +
+ (jiffies % 30)));
+ } else {
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_FOUND_NONE);
+ }
+ break;
+ case BACKOFF_TIMER_EXPIRED:
+ irlap_send_snrm_frame(self, &self->qos_rx);
+ irlap_start_final_timer(self, self->final_timeout);
+ self->retry_count++;
+ break;
+ case RECV_SNRM_CMD:
+ pr_debug("%s(), SNRM battle!\n", __func__);
+
+ IRDA_ASSERT(skb != NULL, return 0;);
+ IRDA_ASSERT(info != NULL, return 0;);
+
+ /*
+ * The device with the largest device address wins the battle
+ * (both have sent a SNRM command!)
+ */
+ if (info &&(info->daddr > self->saddr)) {
+ del_timer(&self->final_timer);
+ irlap_initiate_connection_state(self);
+
+ IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+ skb_pull(skb, sizeof(struct snrm_frame));
+
+ irlap_qos_negotiate(self, skb);
+
+ /* Send UA frame and then change link settings */
+ irlap_apply_connection_parameters(self, FALSE);
+ irlap_send_ua_response_frame(self, &self->qos_rx);
+
+ irlap_next_state(self, LAP_NRM_S);
+ irlap_connect_confirm(self, skb);
+
+ /*
+ * The WD-timer could be set to the duration of the
+ * P-timer for this case, but it is recommended
+ * to use twice the value (note 3 IrLAP p. 60).
+ */
+ irlap_start_wd_timer(self, self->wd_timeout);
+ } else {
+ /* We just ignore the other device! */
+ irlap_next_state(self, LAP_SETUP);
+ }
+ break;
+ case RECV_UA_RSP:
+ /* Stop F-timer */
+ del_timer(&self->final_timer);
+
+ /* Initiate connection state */
+ irlap_initiate_connection_state(self);
+
+ /* Negotiate connection parameters */
+ IRDA_ASSERT(skb->len > 10, return -1;);
+
+ skb_pull(skb, sizeof(struct ua_frame));
+
+ IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+ irlap_qos_negotiate(self, skb);
+
+ /* Set the new link setting *now* (before the rr frame) */
+ irlap_apply_connection_parameters(self, TRUE);
+ self->retry_count = 0;
+
+ /* Wait for turnaround time to give a chance to the other
+ * device to be ready to receive us.
+ * Note : the time to switch speed is typically larger
+ * than the turnaround time, but as we don't have the other
+ * side speed switch time, that's our best guess...
+ * Jean II */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ /* This frame will actually be sent at the new speed */
+ irlap_send_rr_frame(self, CMD_FRAME);
+
+ /* The timer is set to half the normal timer to quickly
+ * detect a failure to negotiate the new connection
+ * parameters. IrLAP 6.11.3.2, note 3.
+ * Note that currently we don't process this failure
+ * properly, as we should do a quick disconnect.
+ * Jean II */
+ irlap_start_final_timer(self, self->final_timeout/2);
+ irlap_next_state(self, LAP_NRM_P);
+
+ irlap_connect_confirm(self, skb);
+ break;
+ case RECV_DM_RSP: /* FALLTHROUGH */
+ case RECV_DISC_CMD:
+ del_timer(&self->final_timer);
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d, %s\n", __func__,
+ event, irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_offline (self, event, skb, info)
+ *
+ * OFFLINE state, not used for now!
+ *
+ */
+static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ pr_debug("%s(), Unknown event\n", __func__);
+
+ return -1;
+}
+
+/*
+ * Function irlap_state_xmit_p (self, event, skb, info)
+ *
+ * XMIT, Only the primary station has right to transmit, and we
+ * therefore do not expect to receive any transmissions from other
+ * stations.
+ *
+ */
+static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ switch (event) {
+ case SEND_I_CMD:
+ /*
+ * Only send frame if send-window > 0.
+ */
+ if ((self->window > 0) && (!self->remote_busy)) {
+ int nextfit;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ struct sk_buff *skb_next;
+
+ /* With DYNAMIC_WINDOW, we keep the window size
+ * maximum, and adapt on the packets we are sending.
+ * At 115k, we can send only 2 packets of 2048 bytes
+ * in a 500 ms turnaround. Without this option, we
+ * would always limit the window to 2. With this
+ * option, if we send smaller packets, we can send
+ * up to 7 of them (always depending on QoS).
+ * Jean II */
+
+ /* Look at the next skb. This is safe, as we are
+ * the only consumer of the Tx queue (if we are not,
+ * we have other problems) - Jean II */
+ skb_next = skb_peek(&self->txq);
+
+ /* Check if a subsequent skb exist and would fit in
+ * the current window (with respect to turnaround
+ * time).
+ * This allow us to properly mark the current packet
+ * with the pf bit, to avoid falling back on the
+ * second test below, and avoid waiting the
+ * end of the window and sending a extra RR.
+ * Note : (skb_next != NULL) <=> (skb_queue_len() > 0)
+ * Jean II */
+ nextfit = ((skb_next != NULL) &&
+ ((skb_next->len + skb->len) <=
+ self->bytes_left));
+
+ /*
+ * The current packet may not fit ! Because of test
+ * above, this should not happen any more !!!
+ * Test if we have transmitted more bytes over the
+ * link than its possible to do with the current
+ * speed and turn-around-time.
+ */
+ if((!nextfit) && (skb->len > self->bytes_left)) {
+ pr_debug("%s(), Not allowed to transmit more bytes!\n",
+ __func__);
+ /* Requeue the skb */
+ skb_queue_head(&self->txq, skb_get(skb));
+ /*
+ * We should switch state to LAP_NRM_P, but
+ * that is not possible since we must be sure
+ * that we poll the other side. Since we have
+ * used up our time, the poll timer should
+ * trigger anyway now, so we just wait for it
+ * DB
+ */
+ /*
+ * Sorry, but that's not totally true. If
+ * we send 2000B packets, we may wait another
+ * 1000B until our turnaround expire. That's
+ * why we need to be proactive in avoiding
+ * coming here. - Jean II
+ */
+ return -EPROTO;
+ }
+
+ /* Subtract space used by this skb */
+ self->bytes_left -= skb->len;
+#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ /* Window has been adjusted for the max packet
+ * size, so much simpler... - Jean II */
+ nextfit = !skb_queue_empty(&self->txq);
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ /*
+ * Send data with poll bit cleared only if window > 1
+ * and there is more frames after this one to be sent
+ */
+ if ((self->window > 1) && (nextfit)) {
+ /* More packet to send in current window */
+ irlap_send_data_primary(self, skb);
+ irlap_next_state(self, LAP_XMIT_P);
+ } else {
+ /* Final packet of window */
+ irlap_send_data_primary_poll(self, skb);
+
+ /*
+ * Make sure state machine does not try to send
+ * any more frames
+ */
+ ret = -EPROTO;
+ }
+#ifdef CONFIG_IRDA_FAST_RR
+ /* Peer may want to reply immediately */
+ self->fast_RR = FALSE;
+#endif /* CONFIG_IRDA_FAST_RR */
+ } else {
+ pr_debug("%s(), Unable to send! remote busy?\n",
+ __func__);
+ skb_queue_head(&self->txq, skb_get(skb));
+
+ /*
+ * The next ret is important, because it tells
+ * irlap_next_state _not_ to deliver more frames
+ */
+ ret = -EPROTO;
+ }
+ break;
+ case POLL_TIMER_EXPIRED:
+ pr_debug("%s(), POLL_TIMER_EXPIRED <%ld>\n",
+ __func__, jiffies);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ /* Return to NRM properly - Jean II */
+ self->window = self->window_size;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ /* Allowed to transmit a maximum number of bytes again. */
+ self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ irlap_start_final_timer(self, self->final_timeout);
+ irlap_next_state(self, LAP_NRM_P);
+ break;
+ case DISCONNECT_REQUEST:
+ del_timer(&self->poll_timer);
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_disc_frame(self);
+ irlap_flush_all_queues(self);
+ irlap_start_final_timer(self, self->final_timeout);
+ self->retry_count = 0;
+ irlap_next_state(self, LAP_PCLOSE);
+ break;
+ case DATA_REQUEST:
+ /* Nothing to do, irlap_do_event() will send the packet
+ * when we return... - Jean II */
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlap_event[event]);
+
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_pclose (event, skb, info)
+ *
+ * PCLOSE state
+ */
+static int irlap_state_pclose(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case RECV_UA_RSP: /* FALLTHROUGH */
+ case RECV_DM_RSP:
+ del_timer(&self->final_timer);
+
+ /* Set new link parameters */
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ case FINAL_TIMER_EXPIRED:
+ if (self->retry_count < self->N3) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_disc_frame(self);
+ irlap_start_final_timer(self, self->final_timeout);
+ self->retry_count++;
+ /* Keep state */
+ } else {
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d\n", __func__, event);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_nrm_p (self, event, skb, info)
+ *
+ * NRM_P (Normal Response Mode as Primary), The primary station has given
+ * permissions to a secondary station to transmit IrLAP resonse frames
+ * (by sending a frame with the P bit set). The primary station will not
+ * transmit any frames and is expecting to receive frames only from the
+ * secondary to which transmission permissions has been given.
+ */
+static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+ int ns_status;
+ int nr_status;
+
+ switch (event) {
+ case RECV_I_RSP: /* Optimize for the common case */
+ if (unlikely(skb->len <= LAP_ADDR_HEADER + LAP_CTRL_HEADER)) {
+ /*
+ * Input validation check: a stir4200/mcp2150
+ * combination sometimes results in an empty i:rsp.
+ * This makes no sense; we can just ignore the frame
+ * and send an rr:cmd immediately. This happens before
+ * changing nr or ns so triggers a retransmit
+ */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ /* Keep state */
+ break;
+ }
+ /* FIXME: must check for remote_busy below */
+#ifdef CONFIG_IRDA_FAST_RR
+ /*
+ * Reset the fast_RR so we can use the fast RR code with
+ * full speed the next time since peer may have more frames
+ * to transmitt
+ */
+ self->fast_RR = FALSE;
+#endif /* CONFIG_IRDA_FAST_RR */
+ IRDA_ASSERT( info != NULL, return -1;);
+
+ ns_status = irlap_validate_ns_received(self, info->ns);
+ nr_status = irlap_validate_nr_received(self, info->nr);
+
+ /*
+ * Check for expected I(nformation) frame
+ */
+ if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) {
+
+ /* Update Vr (next frame for us to receive) */
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received, cleanup our retry queue */
+ irlap_update_nr_received(self, info->nr);
+
+ /*
+ * Got expected NR, so reset the
+ * retry_count. This is not done by IrLAP spec,
+ * which is strange!
+ */
+ self->retry_count = 0;
+ self->ack_required = TRUE;
+
+ /* poll bit cleared? */
+ if (!info->pf) {
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_P);
+
+ irlap_data_indication(self, skb, FALSE);
+ } else {
+ /* No longer waiting for pf */
+ del_timer(&self->final_timer);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ /* Call higher layer *before* changing state
+ * to give them a chance to send data in the
+ * next LAP frame.
+ * Jean II */
+ irlap_data_indication(self, skb, FALSE);
+
+ /* XMIT states are the most dangerous state
+ * to be in, because user requests are
+ * processed directly and may change state.
+ * On the other hand, in NDM_P, those
+ * requests are queued and we will process
+ * them when we return to irlap_do_event().
+ * Jean II
+ */
+ irlap_next_state(self, LAP_XMIT_P);
+
+ /* This is the last frame.
+ * Make sure it's always called in XMIT state.
+ * - Jean II */
+ irlap_start_poll_timer(self, self->poll_timeout);
+ }
+ break;
+
+ }
+ /* Unexpected next to send (Ns) */
+ if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED))
+ {
+ if (!info->pf) {
+ irlap_update_nr_received(self, info->nr);
+
+ /*
+ * Wait until the last frame before doing
+ * anything
+ */
+
+ /* Keep state */
+ irlap_next_state(self, LAP_NRM_P);
+ } else {
+ pr_debug("%s(), missing or duplicate frame!\n",
+ __func__);
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, CMD_FRAME);
+
+ self->ack_required = FALSE;
+
+ irlap_start_final_timer(self, self->final_timeout);
+ irlap_next_state(self, LAP_NRM_P);
+ }
+ break;
+ }
+ /*
+ * Unexpected next to receive (Nr)
+ */
+ if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED))
+ {
+ if (info->pf) {
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ /* Resend rejected frames */
+ irlap_resend_rejected_frames(self, CMD_FRAME);
+
+ self->ack_required = FALSE;
+
+ /* Make sure we account for the time
+ * to transmit our frames. See comemnts
+ * in irlap_send_data_primary_poll().
+ * Jean II */
+ irlap_start_final_timer(self, 2 * self->final_timeout);
+
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_P);
+
+ irlap_data_indication(self, skb, FALSE);
+ } else {
+ /*
+ * Do not resend frames until the last
+ * frame has arrived from the other
+ * device. This is not documented in
+ * IrLAP!!
+ */
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ self->ack_required = FALSE;
+
+ /* Keep state, do not move this line!*/
+ irlap_next_state(self, LAP_NRM_P);
+
+ irlap_data_indication(self, skb, FALSE);
+ }
+ break;
+ }
+ /*
+ * Unexpected next to send (Ns) and next to receive (Nr)
+ * Not documented by IrLAP!
+ */
+ if ((ns_status == NS_UNEXPECTED) &&
+ (nr_status == NR_UNEXPECTED))
+ {
+ pr_debug("%s(), unexpected nr and ns!\n",
+ __func__);
+ if (info->pf) {
+ /* Resend rejected frames */
+ irlap_resend_rejected_frames(self, CMD_FRAME);
+
+ /* Give peer some time to retransmit!
+ * But account for our own Tx. */
+ irlap_start_final_timer(self, 2 * self->final_timeout);
+
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_P);
+ } else {
+ /* Update Nr received */
+ /* irlap_update_nr_received( info->nr); */
+
+ self->ack_required = FALSE;
+ }
+ break;
+ }
+
+ /*
+ * Invalid NR or NS
+ */
+ if ((nr_status == NR_INVALID) || (ns_status == NS_INVALID)) {
+ if (info->pf) {
+ del_timer(&self->final_timer);
+
+ irlap_next_state(self, LAP_RESET_WAIT);
+
+ irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+ self->xmitflag = TRUE;
+ } else {
+ del_timer(&self->final_timer);
+
+ irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+
+ self->xmitflag = FALSE;
+ }
+ break;
+ }
+ pr_debug("%s(), Not implemented!\n", __func__);
+ pr_debug("%s(), event=%s, ns_status=%d, nr_status=%d\n",
+ __func__, irlap_event[event], ns_status, nr_status);
+ break;
+ case RECV_UI_FRAME:
+ /* Poll bit cleared? */
+ if (!info->pf) {
+ irlap_data_indication(self, skb, TRUE);
+ irlap_next_state(self, LAP_NRM_P);
+ } else {
+ del_timer(&self->final_timer);
+ irlap_data_indication(self, skb, TRUE);
+ irlap_next_state(self, LAP_XMIT_P);
+ pr_debug("%s: RECV_UI_FRAME: next state %s\n",
+ __func__, irlap_state[self->state]);
+ irlap_start_poll_timer(self, self->poll_timeout);
+ }
+ break;
+ case RECV_RR_RSP:
+ /*
+ * If you get a RR, the remote isn't busy anymore,
+ * no matter what the NR
+ */
+ self->remote_busy = FALSE;
+
+ /* Stop final timer */
+ del_timer(&self->final_timer);
+
+ /*
+ * Nr as expected?
+ */
+ ret = irlap_validate_nr_received(self, info->nr);
+ if (ret == NR_EXPECTED) {
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ /*
+ * Got expected NR, so reset the retry_count. This
+ * is not done by the IrLAP standard , which is
+ * strange! DB.
+ */
+ self->retry_count = 0;
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ irlap_next_state(self, LAP_XMIT_P);
+
+ /* Start poll timer */
+ irlap_start_poll_timer(self, self->poll_timeout);
+ } else if (ret == NR_UNEXPECTED) {
+ IRDA_ASSERT(info != NULL, return -1;);
+ /*
+ * Unexpected nr!
+ */
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ pr_debug("RECV_RR_FRAME: Retrans:%d, nr=%d, va=%d, vs=%d, vr=%d\n",
+ self->retry_count, info->nr, self->va,
+ self->vs, self->vr);
+
+ /* Resend rejected frames */
+ irlap_resend_rejected_frames(self, CMD_FRAME);
+ irlap_start_final_timer(self, self->final_timeout * 2);
+
+ irlap_next_state(self, LAP_NRM_P);
+ } else if (ret == NR_INVALID) {
+ pr_debug("%s(), Received RR with invalid nr !\n",
+ __func__);
+
+ irlap_next_state(self, LAP_RESET_WAIT);
+
+ irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+ self->xmitflag = TRUE;
+ }
+ break;
+ case RECV_RNR_RSP:
+ IRDA_ASSERT(info != NULL, return -1;);
+
+ /* Stop final timer */
+ del_timer(&self->final_timer);
+ self->remote_busy = TRUE;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+ irlap_next_state(self, LAP_XMIT_P);
+
+ /* Start poll timer */
+ irlap_start_poll_timer(self, self->poll_timeout);
+ break;
+ case RECV_FRMR_RSP:
+ del_timer(&self->final_timer);
+ self->xmitflag = TRUE;
+ irlap_next_state(self, LAP_RESET_WAIT);
+ irlap_reset_indication(self);
+ break;
+ case FINAL_TIMER_EXPIRED:
+ /*
+ * We are allowed to wait for additional 300 ms if
+ * final timer expires when we are in the middle
+ * of receiving a frame (page 45, IrLAP). Check that
+ * we only do this once for each frame.
+ */
+ if (irda_device_is_receiving(self->netdev) && !self->add_wait) {
+ pr_debug("FINAL_TIMER_EXPIRED when receiving a frame! Waiting a little bit more!\n");
+ irlap_start_final_timer(self, msecs_to_jiffies(300));
+
+ /*
+ * Don't allow this to happen one more time in a row,
+ * or else we can get a pretty tight loop here if
+ * if we only receive half a frame. DB.
+ */
+ self->add_wait = TRUE;
+ break;
+ }
+ self->add_wait = FALSE;
+
+ /* N2 is the disconnect timer. Until we reach it, we retry */
+ if (self->retry_count < self->N2) {
+ if (skb_peek(&self->wx_list) == NULL) {
+ /* Retry sending the pf bit to the secondary */
+ pr_debug("nrm_p: resending rr");
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ } else {
+ pr_debug("nrm_p: resend frames");
+ irlap_resend_rejected_frames(self, CMD_FRAME);
+ }
+
+ irlap_start_final_timer(self, self->final_timeout);
+ self->retry_count++;
+ pr_debug("irlap_state_nrm_p: FINAL_TIMER_EXPIRED: retry_count=%d\n",
+ self->retry_count);
+
+ /* Early warning event. I'm using a pretty liberal
+ * interpretation of the spec and generate an event
+ * every time the timer is multiple of N1 (and not
+ * only the first time). This allow application
+ * to know precisely if connectivity restart...
+ * Jean II */
+ if((self->retry_count % self->N1) == 0)
+ irlap_status_indication(self,
+ STATUS_NO_ACTIVITY);
+
+ /* Keep state */
+ } else {
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+ irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+ }
+ break;
+ case RECV_REJ_RSP:
+ irlap_update_nr_received(self, info->nr);
+ if (self->remote_busy) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ } else
+ irlap_resend_rejected_frames(self, CMD_FRAME);
+ irlap_start_final_timer(self, 2 * self->final_timeout);
+ break;
+ case RECV_SREJ_RSP:
+ irlap_update_nr_received(self, info->nr);
+ if (self->remote_busy) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ } else
+ irlap_resend_rejected_frame(self, CMD_FRAME);
+ irlap_start_final_timer(self, 2 * self->final_timeout);
+ break;
+ case RECV_RD_RSP:
+ pr_debug("%s(), RECV_RD_RSP\n", __func__);
+
+ irlap_flush_all_queues(self);
+ irlap_next_state(self, LAP_XMIT_P);
+ /* Call back the LAP state machine to do a proper disconnect */
+ irlap_disconnect_request(self);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_reset_wait (event, skb, info)
+ *
+ * We have informed the service user of a reset condition, and is
+ * awaiting reset of disconnect request.
+ *
+ */
+static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s(), event = %s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case RESET_REQUEST:
+ if (self->xmitflag) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_snrm_frame(self, NULL);
+ irlap_start_final_timer(self, self->final_timeout);
+ irlap_next_state(self, LAP_RESET);
+ } else {
+ irlap_start_final_timer(self, self->final_timeout);
+ irlap_next_state(self, LAP_RESET);
+ }
+ break;
+ case DISCONNECT_REQUEST:
+ irlap_wait_min_turn_around( self, &self->qos_tx);
+ irlap_send_disc_frame( self);
+ irlap_flush_all_queues( self);
+ irlap_start_final_timer( self, self->final_timeout);
+ self->retry_count = 0;
+ irlap_next_state( self, LAP_PCLOSE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n", __func__,
+ irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_reset (self, event, skb, info)
+ *
+ * We have sent a SNRM reset command to the peer layer, and is awaiting
+ * reply.
+ *
+ */
+static int irlap_state_reset(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s(), event = %s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case RECV_DISC_CMD:
+ del_timer(&self->final_timer);
+
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+
+ break;
+ case RECV_UA_RSP:
+ del_timer(&self->final_timer);
+
+ /* Initiate connection state */
+ irlap_initiate_connection_state(self);
+
+ irlap_reset_confirm();
+
+ self->remote_busy = FALSE;
+
+ irlap_next_state(self, LAP_XMIT_P);
+
+ irlap_start_poll_timer(self, self->poll_timeout);
+
+ break;
+ case FINAL_TIMER_EXPIRED:
+ if (self->retry_count < 3) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ IRDA_ASSERT(self->netdev != NULL, return -1;);
+ irlap_send_snrm_frame(self, self->qos_dev);
+
+ self->retry_count++; /* Experimental!! */
+
+ irlap_start_final_timer(self, self->final_timeout);
+ irlap_next_state(self, LAP_RESET);
+ } else if (self->retry_count >= self->N3) {
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+ }
+ break;
+ case RECV_SNRM_CMD:
+ /*
+ * SNRM frame is not allowed to contain an I-field in this
+ * state
+ */
+ if (!info) {
+ pr_debug("%s(), RECV_SNRM_CMD\n", __func__);
+ irlap_initiate_connection_state(self);
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_ua_response_frame(self, &self->qos_rx);
+ irlap_reset_confirm();
+ irlap_start_wd_timer(self, self->wd_timeout);
+ irlap_next_state(self, LAP_NDM);
+ } else {
+ pr_debug("%s(), SNRM frame contained an I field!\n",
+ __func__);
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlap_event[event]);
+
+ ret = -1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_xmit_s (event, skb, info)
+ *
+ * XMIT_S, The secondary station has been given the right to transmit,
+ * and we therefore do not expect to receive any transmissions from other
+ * stations.
+ */
+static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s(), event=%s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -ENODEV;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+ switch (event) {
+ case SEND_I_CMD:
+ /*
+ * Send frame only if send window > 0
+ */
+ if ((self->window > 0) && (!self->remote_busy)) {
+ int nextfit;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ struct sk_buff *skb_next;
+
+ /*
+ * Same deal as in irlap_state_xmit_p(), so see
+ * the comments at that point.
+ * We are the secondary, so there are only subtle
+ * differences. - Jean II
+ */
+
+ /* Check if a subsequent skb exist and would fit in
+ * the current window (with respect to turnaround
+ * time). - Jean II */
+ skb_next = skb_peek(&self->txq);
+ nextfit = ((skb_next != NULL) &&
+ ((skb_next->len + skb->len) <=
+ self->bytes_left));
+
+ /*
+ * Test if we have transmitted more bytes over the
+ * link than its possible to do with the current
+ * speed and turn-around-time.
+ */
+ if((!nextfit) && (skb->len > self->bytes_left)) {
+ pr_debug("%s(), Not allowed to transmit more bytes!\n",
+ __func__);
+ /* Requeue the skb */
+ skb_queue_head(&self->txq, skb_get(skb));
+
+ /*
+ * Switch to NRM_S, this is only possible
+ * when we are in secondary mode, since we
+ * must be sure that we don't miss any RR
+ * frames
+ */
+ self->window = self->window_size;
+ self->bytes_left = self->line_capacity;
+ irlap_start_wd_timer(self, self->wd_timeout);
+
+ irlap_next_state(self, LAP_NRM_S);
+ /* Slight difference with primary :
+ * here we would wait for the other side to
+ * expire the turnaround. - Jean II */
+
+ return -EPROTO; /* Try again later */
+ }
+ /* Subtract space used by this skb */
+ self->bytes_left -= skb->len;
+#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ /* Window has been adjusted for the max packet
+ * size, so much simpler... - Jean II */
+ nextfit = !skb_queue_empty(&self->txq);
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ /*
+ * Send data with final bit cleared only if window > 1
+ * and there is more frames to be sent
+ */
+ if ((self->window > 1) && (nextfit)) {
+ irlap_send_data_secondary(self, skb);
+ irlap_next_state(self, LAP_XMIT_S);
+ } else {
+ irlap_send_data_secondary_final(self, skb);
+ irlap_next_state(self, LAP_NRM_S);
+
+ /*
+ * Make sure state machine does not try to send
+ * any more frames
+ */
+ ret = -EPROTO;
+ }
+ } else {
+ pr_debug("%s(), Unable to send!\n", __func__);
+ skb_queue_head(&self->txq, skb_get(skb));
+ ret = -EPROTO;
+ }
+ break;
+ case DISCONNECT_REQUEST:
+ irlap_send_rd_frame(self);
+ irlap_flush_all_queues(self);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ irlap_next_state(self, LAP_SCLOSE);
+ break;
+ case DATA_REQUEST:
+ /* Nothing to do, irlap_do_event() will send the packet
+ * when we return... - Jean II */
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n", __func__,
+ irlap_event[event]);
+
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_nrm_s (event, skb, info)
+ *
+ * NRM_S (Normal Response Mode as Secondary) state, in this state we are
+ * expecting to receive frames from the primary station
+ *
+ */
+static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ int ns_status;
+ int nr_status;
+ int ret = 0;
+
+ pr_debug("%s(), event=%s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ switch (event) {
+ case RECV_I_CMD: /* Optimize for the common case */
+ /* FIXME: must check for remote_busy below */
+ pr_debug("%s(), event=%s nr=%d, vs=%d, ns=%d, vr=%d, pf=%d\n",
+ __func__, irlap_event[event], info->nr,
+ self->vs, info->ns, self->vr, info->pf);
+
+ self->retry_count = 0;
+
+ ns_status = irlap_validate_ns_received(self, info->ns);
+ nr_status = irlap_validate_nr_received(self, info->nr);
+ /*
+ * Check for expected I(nformation) frame
+ */
+ if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) {
+
+ /* Update Vr (next frame for us to receive) */
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ /*
+ * poll bit cleared?
+ */
+ if (!info->pf) {
+
+ self->ack_required = TRUE;
+
+ /*
+ * Starting WD-timer here is optional, but
+ * not recommended. Note 6 IrLAP p. 83
+ */
+#if 0
+ irda_start_timer(WD_TIMER, self->wd_timeout);
+#endif
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_S);
+
+ irlap_data_indication(self, skb, FALSE);
+ break;
+ } else {
+ /*
+ * We should wait before sending RR, and
+ * also before changing to XMIT_S
+ * state. (note 1, IrLAP p. 82)
+ */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ /*
+ * Give higher layers a chance to
+ * immediately reply with some data before
+ * we decide if we should send a RR frame
+ * or not
+ */
+ irlap_data_indication(self, skb, FALSE);
+
+ /* Any pending data requests? */
+ if (!skb_queue_empty(&self->txq) &&
+ (self->window > 0))
+ {
+ self->ack_required = TRUE;
+
+ del_timer(&self->wd_timer);
+
+ irlap_next_state(self, LAP_XMIT_S);
+ } else {
+ irlap_send_rr_frame(self, RSP_FRAME);
+ irlap_start_wd_timer(self,
+ self->wd_timeout);
+
+ /* Keep the state */
+ irlap_next_state(self, LAP_NRM_S);
+ }
+ break;
+ }
+ }
+ /*
+ * Check for Unexpected next to send (Ns)
+ */
+ if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED))
+ {
+ /* Unexpected next to send, with final bit cleared */
+ if (!info->pf) {
+ irlap_update_nr_received(self, info->nr);
+
+ irlap_start_wd_timer(self, self->wd_timeout);
+ } else {
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, RSP_FRAME);
+
+ irlap_start_wd_timer(self, self->wd_timeout);
+ }
+ break;
+ }
+
+ /*
+ * Unexpected Next to Receive(NR) ?
+ */
+ if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED))
+ {
+ if (info->pf) {
+ pr_debug("RECV_I_RSP: frame(s) lost\n");
+
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ /* Resend rejected frames */
+ irlap_resend_rejected_frames(self, RSP_FRAME);
+
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_S);
+
+ irlap_data_indication(self, skb, FALSE);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ break;
+ }
+ /*
+ * This is not documented in IrLAP!! Unexpected NR
+ * with poll bit cleared
+ */
+ if (!info->pf) {
+ self->vr = (self->vr + 1) % 8;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+
+ /* Keep state, do not move this line */
+ irlap_next_state(self, LAP_NRM_S);
+
+ irlap_data_indication(self, skb, FALSE);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ }
+ break;
+ }
+
+ if (ret == NR_INVALID) {
+ pr_debug("NRM_S, NR_INVALID not implemented!\n");
+ }
+ if (ret == NS_INVALID) {
+ pr_debug("NRM_S, NS_INVALID not implemented!\n");
+ }
+ break;
+ case RECV_UI_FRAME:
+ /*
+ * poll bit cleared?
+ */
+ if (!info->pf) {
+ irlap_data_indication(self, skb, TRUE);
+ irlap_next_state(self, LAP_NRM_S); /* Keep state */
+ } else {
+ /*
+ * Any pending data requests?
+ */
+ if (!skb_queue_empty(&self->txq) &&
+ (self->window > 0) && !self->remote_busy)
+ {
+ irlap_data_indication(self, skb, TRUE);
+
+ del_timer(&self->wd_timer);
+
+ irlap_next_state(self, LAP_XMIT_S);
+ } else {
+ irlap_data_indication(self, skb, TRUE);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ irlap_send_rr_frame(self, RSP_FRAME);
+ self->ack_required = FALSE;
+
+ irlap_start_wd_timer(self, self->wd_timeout);
+
+ /* Keep the state */
+ irlap_next_state(self, LAP_NRM_S);
+ }
+ }
+ break;
+ case RECV_RR_CMD:
+ self->retry_count = 0;
+
+ /*
+ * Nr as expected?
+ */
+ nr_status = irlap_validate_nr_received(self, info->nr);
+ if (nr_status == NR_EXPECTED) {
+ if (!skb_queue_empty(&self->txq) &&
+ (self->window > 0)) {
+ self->remote_busy = FALSE;
+
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+ del_timer(&self->wd_timer);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_next_state(self, LAP_XMIT_S);
+ } else {
+ self->remote_busy = FALSE;
+ /* Update Nr received */
+ irlap_update_nr_received(self, info->nr);
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_start_wd_timer(self, self->wd_timeout);
+
+ /* Note : if the link is idle (this case),
+ * we never go in XMIT_S, so we never get a
+ * chance to process any DISCONNECT_REQUEST.
+ * Do it now ! - Jean II */
+ if (self->disconnect_pending) {
+ /* Disconnect */
+ irlap_send_rd_frame(self);
+ irlap_flush_all_queues(self);
+
+ irlap_next_state(self, LAP_SCLOSE);
+ } else {
+ /* Just send back pf bit */
+ irlap_send_rr_frame(self, RSP_FRAME);
+
+ irlap_next_state(self, LAP_NRM_S);
+ }
+ }
+ } else if (nr_status == NR_UNEXPECTED) {
+ self->remote_busy = FALSE;
+ irlap_update_nr_received(self, info->nr);
+ irlap_resend_rejected_frames(self, RSP_FRAME);
+
+ irlap_start_wd_timer(self, self->wd_timeout);
+
+ /* Keep state */
+ irlap_next_state(self, LAP_NRM_S);
+ } else {
+ pr_debug("%s(), invalid nr not implemented!\n",
+ __func__);
+ }
+ break;
+ case RECV_SNRM_CMD:
+ /* SNRM frame is not allowed to contain an I-field */
+ if (!info) {
+ del_timer(&self->wd_timer);
+ pr_debug("%s(), received SNRM cmd\n", __func__);
+ irlap_next_state(self, LAP_RESET_CHECK);
+
+ irlap_reset_indication(self);
+ } else {
+ pr_debug("%s(), SNRM frame contained an I-field!\n",
+ __func__);
+
+ }
+ break;
+ case RECV_REJ_CMD:
+ irlap_update_nr_received(self, info->nr);
+ if (self->remote_busy) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, RSP_FRAME);
+ } else
+ irlap_resend_rejected_frames(self, RSP_FRAME);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ break;
+ case RECV_SREJ_CMD:
+ irlap_update_nr_received(self, info->nr);
+ if (self->remote_busy) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, RSP_FRAME);
+ } else
+ irlap_resend_rejected_frame(self, RSP_FRAME);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ break;
+ case WD_TIMER_EXPIRED:
+ /*
+ * Wait until retry_count * n matches negotiated threshold/
+ * disconnect time (note 2 in IrLAP p. 82)
+ *
+ * Similar to irlap_state_nrm_p() -> FINAL_TIMER_EXPIRED
+ * Note : self->wd_timeout = (self->final_timeout * 2),
+ * which explain why we use (self->N2 / 2) here !!!
+ * Jean II
+ */
+ pr_debug("%s(), retry_count = %d\n", __func__,
+ self->retry_count);
+
+ if (self->retry_count < (self->N2 / 2)) {
+ /* No retry, just wait for primary */
+ irlap_start_wd_timer(self, self->wd_timeout);
+ self->retry_count++;
+
+ if((self->retry_count % (self->N1 / 2)) == 0)
+ irlap_status_indication(self,
+ STATUS_NO_ACTIVITY);
+ } else {
+ irlap_apply_default_connection_parameters(self);
+
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+ irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+ }
+ break;
+ case RECV_DISC_CMD:
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ /* Send disconnect response */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_ua_response_frame(self, NULL);
+
+ del_timer(&self->wd_timer);
+ irlap_flush_all_queues(self);
+ /* Set default link parameters */
+ irlap_apply_default_connection_parameters(self);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ case RECV_DISCOVERY_XID_CMD:
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rr_frame(self, RSP_FRAME);
+ self->ack_required = TRUE;
+ irlap_start_wd_timer(self, self->wd_timeout);
+ irlap_next_state(self, LAP_NRM_S);
+
+ break;
+ case RECV_TEST_CMD:
+ /* Remove test frame header (only LAP header in NRM) */
+ skb_pull(skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER);
+
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_start_wd_timer(self, self->wd_timeout);
+
+ /* Send response (info will be copied) */
+ irlap_send_test_frame(self, self->caddr, info->daddr, skb);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d, (%s)\n", __func__,
+ event, irlap_event[event]);
+
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlap_state_sclose (self, event, skb, info)
+ */
+static int irlap_state_sclose(struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb, struct irlap_info *info)
+{
+ IRDA_ASSERT(self != NULL, return -ENODEV;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+ switch (event) {
+ case RECV_DISC_CMD:
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ /* Send disconnect response */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_ua_response_frame(self, NULL);
+
+ del_timer(&self->wd_timer);
+ /* Set default link parameters */
+ irlap_apply_default_connection_parameters(self);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ case RECV_DM_RSP:
+ /* IrLAP-1.1 p.82: in SCLOSE, S and I type RSP frames
+ * shall take us down into default NDM state, like DM_RSP
+ */
+ case RECV_RR_RSP:
+ case RECV_RNR_RSP:
+ case RECV_REJ_RSP:
+ case RECV_SREJ_RSP:
+ case RECV_I_RSP:
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ del_timer(&self->wd_timer);
+ irlap_apply_default_connection_parameters(self);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ case WD_TIMER_EXPIRED:
+ /* Always switch state before calling upper layers */
+ irlap_next_state(self, LAP_NDM);
+
+ irlap_apply_default_connection_parameters(self);
+
+ irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+ break;
+ default:
+ /* IrLAP-1.1 p.82: in SCLOSE, basically any received frame
+ * with pf=1 shall restart the wd-timer and resend the rd:rsp
+ */
+ if (info != NULL && info->pf) {
+ del_timer(&self->wd_timer);
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rd_frame(self);
+ irlap_start_wd_timer(self, self->wd_timeout);
+ break; /* stay in SCLOSE */
+ }
+
+ pr_debug("%s(), Unknown event %d, (%s)\n", __func__,
+ event, irlap_event[event]);
+
+ break;
+ }
+
+ return -1;
+}
+
+static int irlap_state_reset_check( struct irlap_cb *self, IRLAP_EVENT event,
+ struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ int ret = 0;
+
+ pr_debug("%s(), event=%s\n", __func__, irlap_event[event]);
+
+ IRDA_ASSERT(self != NULL, return -ENODEV;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+ switch (event) {
+ case RESET_RESPONSE:
+ irlap_send_ua_response_frame(self, &self->qos_rx);
+ irlap_initiate_connection_state(self);
+ irlap_start_wd_timer(self, WD_TIMEOUT);
+ irlap_flush_all_queues(self);
+
+ irlap_next_state(self, LAP_NRM_S);
+ break;
+ case DISCONNECT_REQUEST:
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_send_rd_frame(self);
+ irlap_start_wd_timer(self, WD_TIMEOUT);
+ irlap_next_state(self, LAP_SCLOSE);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %d, (%s)\n", __func__,
+ event, irlap_event[event]);
+
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
new file mode 100644
index 000000000..b936b1251
--- /dev/null
+++ b/net/irda/irlap_frame.c
@@ -0,0 +1,1411 @@
+/*********************************************************************
+ *
+ * Filename: irlap_frame.c
+ * Version: 1.0
+ * Description: Build and transmit IrLAP frames
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Aug 19 10:27:26 1997
+ * Modified at: Wed Jan 5 08:59:04 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/irda.h>
+#include <linux/slab.h>
+
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/wrapper.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/qos.h>
+
+static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb,
+ int command);
+
+/*
+ * Function irlap_insert_info (self, skb)
+ *
+ * Insert minimum turnaround time and speed information into the skb. We
+ * need to do this since it's per packet relevant information. Safe to
+ * have this function inlined since it's only called from one place
+ */
+static inline void irlap_insert_info(struct irlap_cb *self,
+ struct sk_buff *skb)
+{
+ struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb;
+
+ /*
+ * Insert MTT (min. turn time) and speed into skb, so that the
+ * device driver knows which settings to use
+ */
+ cb->magic = LAP_MAGIC;
+ cb->mtt = self->mtt_required;
+ cb->next_speed = self->speed;
+
+ /* Reset */
+ self->mtt_required = 0;
+
+ /*
+ * Delay equals negotiated BOFs count, plus the number of BOFs to
+ * force the negotiated minimum turnaround time
+ */
+ cb->xbofs = self->bofs_count;
+ cb->next_xbofs = self->next_bofs;
+ cb->xbofs_delay = self->xbofs_delay;
+
+ /* Reset XBOF's delay (used only for getting min turn time) */
+ self->xbofs_delay = 0;
+ /* Put the correct xbofs value for the next packet */
+ self->bofs_count = self->next_bofs;
+}
+
+/*
+ * Function irlap_queue_xmit (self, skb)
+ *
+ * A little wrapper for dev_queue_xmit, so we can insert some common
+ * code into it.
+ */
+void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb)
+{
+ /* Some common init stuff */
+ skb->dev = self->netdev;
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->protocol = htons(ETH_P_IRDA);
+ skb->priority = TC_PRIO_BESTEFFORT;
+
+ irlap_insert_info(self, skb);
+
+ if (unlikely(self->mode & IRDA_MODE_MONITOR)) {
+ pr_debug("%s(): %s is in monitor mode\n", __func__,
+ self->netdev->name);
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ dev_queue_xmit(skb);
+}
+
+/*
+ * Function irlap_send_snrm_cmd (void)
+ *
+ * Transmits a connect SNRM command frame
+ */
+void irlap_send_snrm_frame(struct irlap_cb *self, struct qos_info *qos)
+{
+ struct sk_buff *tx_skb;
+ struct snrm_frame *frame;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Allocate frame */
+ tx_skb = alloc_skb(sizeof(struct snrm_frame) +
+ IRLAP_NEGOCIATION_PARAMS_LEN,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct snrm_frame *) skb_put(tx_skb, 2);
+
+ /* Insert connection address field */
+ if (qos)
+ frame->caddr = CMD_FRAME | CBROADCAST;
+ else
+ frame->caddr = CMD_FRAME | self->caddr;
+
+ /* Insert control field */
+ frame->control = SNRM_CMD | PF_BIT;
+
+ /*
+ * If we are establishing a connection then insert QoS parameters
+ */
+ if (qos) {
+ skb_put(tx_skb, 9); /* 25 left */
+ frame->saddr = cpu_to_le32(self->saddr);
+ frame->daddr = cpu_to_le32(self->daddr);
+
+ frame->ncaddr = self->caddr;
+
+ ret = irlap_insert_qos_negotiation_params(self, tx_skb);
+ if (ret < 0) {
+ dev_kfree_skb(tx_skb);
+ return;
+ }
+ }
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_snrm_cmd (skb, info)
+ *
+ * Received SNRM (Set Normal Response Mode) command frame
+ *
+ */
+static void irlap_recv_snrm_cmd(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ struct snrm_frame *frame;
+
+ if (pskb_may_pull(skb,sizeof(struct snrm_frame))) {
+ frame = (struct snrm_frame *) skb->data;
+
+ /* Copy the new connection address ignoring the C/R bit */
+ info->caddr = frame->ncaddr & 0xFE;
+
+ /* Check if the new connection address is valid */
+ if ((info->caddr == 0x00) || (info->caddr == 0xfe)) {
+ pr_debug("%s(), invalid connection address!\n",
+ __func__);
+ return;
+ }
+
+ /* Copy peer device address */
+ info->daddr = le32_to_cpu(frame->saddr);
+ info->saddr = le32_to_cpu(frame->daddr);
+
+ /* Only accept if addressed directly to us */
+ if (info->saddr != self->saddr) {
+ pr_debug("%s(), not addressed to us!\n",
+ __func__);
+ return;
+ }
+ irlap_do_event(self, RECV_SNRM_CMD, skb, info);
+ } else {
+ /* Signal that this SNRM frame does not contain and I-field */
+ irlap_do_event(self, RECV_SNRM_CMD, skb, NULL);
+ }
+}
+
+/*
+ * Function irlap_send_ua_response_frame (qos)
+ *
+ * Send UA (Unnumbered Acknowledgement) frame
+ *
+ */
+void irlap_send_ua_response_frame(struct irlap_cb *self, struct qos_info *qos)
+{
+ struct sk_buff *tx_skb;
+ struct ua_frame *frame;
+ int ret;
+
+ pr_debug("%s() <%ld>\n", __func__, jiffies);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Allocate frame */
+ tx_skb = alloc_skb(sizeof(struct ua_frame) +
+ IRLAP_NEGOCIATION_PARAMS_LEN,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct ua_frame *) skb_put(tx_skb, 10);
+
+ /* Build UA response */
+ frame->caddr = self->caddr;
+ frame->control = UA_RSP | PF_BIT;
+
+ frame->saddr = cpu_to_le32(self->saddr);
+ frame->daddr = cpu_to_le32(self->daddr);
+
+ /* Should we send QoS negotiation parameters? */
+ if (qos) {
+ ret = irlap_insert_qos_negotiation_params(self, tx_skb);
+ if (ret < 0) {
+ dev_kfree_skb(tx_skb);
+ return;
+ }
+ }
+
+ irlap_queue_xmit(self, tx_skb);
+}
+
+
+/*
+ * Function irlap_send_dm_frame (void)
+ *
+ * Send disconnected mode (DM) frame
+ *
+ */
+void irlap_send_dm_frame( struct irlap_cb *self)
+{
+ struct sk_buff *tx_skb = NULL;
+ struct dm_frame *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ tx_skb = alloc_skb(sizeof(struct dm_frame), GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct dm_frame *)skb_put(tx_skb, 2);
+
+ if (self->state == LAP_NDM)
+ frame->caddr = CBROADCAST;
+ else
+ frame->caddr = self->caddr;
+
+ frame->control = DM_RSP | PF_BIT;
+
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_disc_frame (void)
+ *
+ * Send disconnect (DISC) frame
+ *
+ */
+void irlap_send_disc_frame(struct irlap_cb *self)
+{
+ struct sk_buff *tx_skb = NULL;
+ struct disc_frame *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ tx_skb = alloc_skb(sizeof(struct disc_frame), GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct disc_frame *)skb_put(tx_skb, 2);
+
+ frame->caddr = self->caddr | CMD_FRAME;
+ frame->control = DISC_CMD | PF_BIT;
+
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_discovery_xid_frame (S, s, command)
+ *
+ * Build and transmit a XID (eXchange station IDentifier) discovery
+ * frame.
+ */
+void irlap_send_discovery_xid_frame(struct irlap_cb *self, int S, __u8 s,
+ __u8 command, discovery_t *discovery)
+{
+ struct sk_buff *tx_skb = NULL;
+ struct xid_frame *frame;
+ __u32 bcast = BROADCAST;
+ __u8 *info;
+
+ pr_debug("%s(), s=%d, S=%d, command=%d\n", __func__,
+ s, S, command);
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(discovery != NULL, return;);
+
+ tx_skb = alloc_skb(sizeof(struct xid_frame) + IRLAP_DISCOVERY_INFO_LEN,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ skb_put(tx_skb, 14);
+ frame = (struct xid_frame *) tx_skb->data;
+
+ if (command) {
+ frame->caddr = CBROADCAST | CMD_FRAME;
+ frame->control = XID_CMD | PF_BIT;
+ } else {
+ frame->caddr = CBROADCAST;
+ frame->control = XID_RSP | PF_BIT;
+ }
+ frame->ident = XID_FORMAT;
+
+ frame->saddr = cpu_to_le32(self->saddr);
+
+ if (command)
+ frame->daddr = cpu_to_le32(bcast);
+ else
+ frame->daddr = cpu_to_le32(discovery->data.daddr);
+
+ switch (S) {
+ case 1:
+ frame->flags = 0x00;
+ break;
+ case 6:
+ frame->flags = 0x01;
+ break;
+ case 8:
+ frame->flags = 0x02;
+ break;
+ case 16:
+ frame->flags = 0x03;
+ break;
+ default:
+ frame->flags = 0x02;
+ break;
+ }
+
+ frame->slotnr = s;
+ frame->version = 0x00;
+
+ /*
+ * Provide info for final slot only in commands, and for all
+ * responses. Send the second byte of the hint only if the
+ * EXTENSION bit is set in the first byte.
+ */
+ if (!command || (frame->slotnr == 0xff)) {
+ int len;
+
+ if (discovery->data.hints[0] & HINT_EXTENSION) {
+ info = skb_put(tx_skb, 2);
+ info[0] = discovery->data.hints[0];
+ info[1] = discovery->data.hints[1];
+ } else {
+ info = skb_put(tx_skb, 1);
+ info[0] = discovery->data.hints[0];
+ }
+ info = skb_put(tx_skb, 1);
+ info[0] = discovery->data.charset;
+
+ len = IRDA_MIN(discovery->name_len, skb_tailroom(tx_skb));
+ info = skb_put(tx_skb, len);
+ memcpy(info, discovery->data.info, len);
+ }
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_discovery_xid_rsp (skb, info)
+ *
+ * Received a XID discovery response
+ *
+ */
+static void irlap_recv_discovery_xid_rsp(struct irlap_cb *self,
+ struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ struct xid_frame *xid;
+ discovery_t *discovery = NULL;
+ __u8 *discovery_info;
+ char *text;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ if (!pskb_may_pull(skb, sizeof(struct xid_frame))) {
+ net_err_ratelimited("%s: frame too short!\n", __func__);
+ return;
+ }
+
+ xid = (struct xid_frame *) skb->data;
+
+ info->daddr = le32_to_cpu(xid->saddr);
+ info->saddr = le32_to_cpu(xid->daddr);
+
+ /* Make sure frame is addressed to us */
+ if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) {
+ pr_debug("%s(), frame is not addressed to us!\n",
+ __func__);
+ return;
+ }
+
+ if ((discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC)) == NULL) {
+ net_warn_ratelimited("%s: kmalloc failed!\n", __func__);
+ return;
+ }
+
+ discovery->data.daddr = info->daddr;
+ discovery->data.saddr = self->saddr;
+ discovery->timestamp = jiffies;
+
+ pr_debug("%s(), daddr=%08x\n", __func__,
+ discovery->data.daddr);
+
+ discovery_info = skb_pull(skb, sizeof(struct xid_frame));
+
+ /* Get info returned from peer */
+ discovery->data.hints[0] = discovery_info[0];
+ if (discovery_info[0] & HINT_EXTENSION) {
+ pr_debug("EXTENSION\n");
+ discovery->data.hints[1] = discovery_info[1];
+ discovery->data.charset = discovery_info[2];
+ text = (char *) &discovery_info[3];
+ } else {
+ discovery->data.hints[1] = 0;
+ discovery->data.charset = discovery_info[1];
+ text = (char *) &discovery_info[2];
+ }
+ /*
+ * Terminate info string, should be safe since this is where the
+ * FCS bytes resides.
+ */
+ skb->data[skb->len] = '\0';
+ strncpy(discovery->data.info, text, NICKNAME_MAX_LEN);
+ discovery->name_len = strlen(discovery->data.info);
+
+ info->discovery = discovery;
+
+ irlap_do_event(self, RECV_DISCOVERY_XID_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_discovery_xid_cmd (skb, info)
+ *
+ * Received a XID discovery command
+ *
+ */
+static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self,
+ struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ struct xid_frame *xid;
+ discovery_t *discovery = NULL;
+ __u8 *discovery_info;
+ char *text;
+
+ if (!pskb_may_pull(skb, sizeof(struct xid_frame))) {
+ net_err_ratelimited("%s: frame too short!\n", __func__);
+ return;
+ }
+
+ xid = (struct xid_frame *) skb->data;
+
+ info->daddr = le32_to_cpu(xid->saddr);
+ info->saddr = le32_to_cpu(xid->daddr);
+
+ /* Make sure frame is addressed to us */
+ if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) {
+ pr_debug("%s(), frame is not addressed to us!\n",
+ __func__);
+ return;
+ }
+
+ switch (xid->flags & 0x03) {
+ case 0x00:
+ info->S = 1;
+ break;
+ case 0x01:
+ info->S = 6;
+ break;
+ case 0x02:
+ info->S = 8;
+ break;
+ case 0x03:
+ info->S = 16;
+ break;
+ default:
+ /* Error!! */
+ return;
+ }
+ info->s = xid->slotnr;
+
+ discovery_info = skb_pull(skb, sizeof(struct xid_frame));
+
+ /*
+ * Check if last frame
+ */
+ if (info->s == 0xff) {
+ /* Check if things are sane at this point... */
+ if((discovery_info == NULL) ||
+ !pskb_may_pull(skb, 3)) {
+ net_err_ratelimited("%s: discovery frame too short!\n",
+ __func__);
+ return;
+ }
+
+ /*
+ * We now have some discovery info to deliver!
+ */
+ discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC);
+ if (!discovery)
+ return;
+
+ discovery->data.daddr = info->daddr;
+ discovery->data.saddr = self->saddr;
+ discovery->timestamp = jiffies;
+
+ discovery->data.hints[0] = discovery_info[0];
+ if (discovery_info[0] & HINT_EXTENSION) {
+ discovery->data.hints[1] = discovery_info[1];
+ discovery->data.charset = discovery_info[2];
+ text = (char *) &discovery_info[3];
+ } else {
+ discovery->data.hints[1] = 0;
+ discovery->data.charset = discovery_info[1];
+ text = (char *) &discovery_info[2];
+ }
+ /*
+ * Terminate string, should be safe since this is where the
+ * FCS bytes resides.
+ */
+ skb->data[skb->len] = '\0';
+ strncpy(discovery->data.info, text, NICKNAME_MAX_LEN);
+ discovery->name_len = strlen(discovery->data.info);
+
+ info->discovery = discovery;
+ } else
+ info->discovery = NULL;
+
+ irlap_do_event(self, RECV_DISCOVERY_XID_CMD, skb, info);
+}
+
+/*
+ * Function irlap_send_rr_frame (self, command)
+ *
+ * Build and transmit RR (Receive Ready) frame. Notice that it is currently
+ * only possible to send RR frames with the poll bit set.
+ */
+void irlap_send_rr_frame(struct irlap_cb *self, int command)
+{
+ struct sk_buff *tx_skb;
+ struct rr_frame *frame;
+
+ tx_skb = alloc_skb(sizeof(struct rr_frame), GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct rr_frame *)skb_put(tx_skb, 2);
+
+ frame->caddr = self->caddr;
+ frame->caddr |= (command) ? CMD_FRAME : 0;
+
+ frame->control = RR | PF_BIT | (self->vr << 5);
+
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_rd_frame (self)
+ *
+ * Request disconnect. Used by a secondary station to request the
+ * disconnection of the link.
+ */
+void irlap_send_rd_frame(struct irlap_cb *self)
+{
+ struct sk_buff *tx_skb;
+ struct rd_frame *frame;
+
+ tx_skb = alloc_skb(sizeof(struct rd_frame), GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ frame = (struct rd_frame *)skb_put(tx_skb, 2);
+
+ frame->caddr = self->caddr;
+ frame->control = RD_RSP | PF_BIT;
+
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_rr_frame (skb, info)
+ *
+ * Received RR (Receive Ready) frame from peer station, no harm in
+ * making it inline since its called only from one single place
+ * (irlap_driver_rcv).
+ */
+static inline void irlap_recv_rr_frame(struct irlap_cb *self,
+ struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ info->nr = skb->data[1] >> 5;
+
+ /* Check if this is a command or a response frame */
+ if (command)
+ irlap_do_event(self, RECV_RR_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_RR_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_rnr_frame (self, skb, info)
+ *
+ * Received RNR (Receive Not Ready) frame from peer station
+ *
+ */
+static void irlap_recv_rnr_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ info->nr = skb->data[1] >> 5;
+
+ pr_debug("%s(), nr=%d, %ld\n", __func__, info->nr, jiffies);
+
+ if (command)
+ irlap_do_event(self, RECV_RNR_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_RNR_RSP, skb, info);
+}
+
+static void irlap_recv_rej_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ info->nr = skb->data[1] >> 5;
+
+ /* Check if this is a command or a response frame */
+ if (command)
+ irlap_do_event(self, RECV_REJ_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_REJ_RSP, skb, info);
+}
+
+static void irlap_recv_srej_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ info->nr = skb->data[1] >> 5;
+
+ /* Check if this is a command or a response frame */
+ if (command)
+ irlap_do_event(self, RECV_SREJ_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_SREJ_RSP, skb, info);
+}
+
+static void irlap_recv_disc_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ /* Check if this is a command or a response frame */
+ if (command)
+ irlap_do_event(self, RECV_DISC_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_RD_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_ua_frame (skb, frame)
+ *
+ * Received UA (Unnumbered Acknowledgement) frame
+ *
+ */
+static inline void irlap_recv_ua_frame(struct irlap_cb *self,
+ struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ irlap_do_event(self, RECV_UA_RSP, skb, info);
+}
+
+/*
+ * Function irlap_send_data_primary(self, skb)
+ *
+ * Send I-frames as the primary station but without the poll bit set
+ *
+ */
+void irlap_send_data_primary(struct irlap_cb *self, struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+
+ if (skb->data[1] == I_FRAME) {
+
+ /*
+ * Insert frame sequence number (Vs) in control field before
+ * inserting into transmit window queue.
+ */
+ skb->data[1] = I_FRAME | (self->vs << 1);
+
+ /*
+ * Insert frame in store, in case of retransmissions
+ * Increase skb reference count, see irlap_do_event()
+ */
+ skb_get(skb);
+ skb_queue_tail(&self->wx_list, skb);
+
+ /* Copy buffer */
+ tx_skb = skb_clone(skb, GFP_ATOMIC);
+ if (tx_skb == NULL) {
+ return;
+ }
+
+ self->vs = (self->vs + 1) % 8;
+ self->ack_required = FALSE;
+ self->window -= 1;
+
+ irlap_send_i_frame( self, tx_skb, CMD_FRAME);
+ } else {
+ pr_debug("%s(), sending unreliable frame\n", __func__);
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+ self->window -= 1;
+ }
+}
+/*
+ * Function irlap_send_data_primary_poll (self, skb)
+ *
+ * Send I(nformation) frame as primary with poll bit set
+ */
+void irlap_send_data_primary_poll(struct irlap_cb *self, struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+ int transmission_time;
+
+ /* Stop P timer */
+ del_timer(&self->poll_timer);
+
+ /* Is this reliable or unreliable data? */
+ if (skb->data[1] == I_FRAME) {
+
+ /*
+ * Insert frame sequence number (Vs) in control field before
+ * inserting into transmit window queue.
+ */
+ skb->data[1] = I_FRAME | (self->vs << 1);
+
+ /*
+ * Insert frame in store, in case of retransmissions
+ * Increase skb reference count, see irlap_do_event()
+ */
+ skb_get(skb);
+ skb_queue_tail(&self->wx_list, skb);
+
+ /* Copy buffer */
+ tx_skb = skb_clone(skb, GFP_ATOMIC);
+ if (tx_skb == NULL) {
+ return;
+ }
+
+ /*
+ * Set poll bit if necessary. We do this to the copied
+ * skb, since retransmitted need to set or clear the poll
+ * bit depending on when they are sent.
+ */
+ tx_skb->data[1] |= PF_BIT;
+
+ self->vs = (self->vs + 1) % 8;
+ self->ack_required = FALSE;
+
+ irlap_next_state(self, LAP_NRM_P);
+ irlap_send_i_frame(self, tx_skb, CMD_FRAME);
+ } else {
+ pr_debug("%s(), sending unreliable frame\n", __func__);
+
+ if (self->ack_required) {
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+ irlap_next_state(self, LAP_NRM_P);
+ irlap_send_rr_frame(self, CMD_FRAME);
+ self->ack_required = FALSE;
+ } else {
+ skb->data[1] |= PF_BIT;
+ irlap_next_state(self, LAP_NRM_P);
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+ }
+ }
+
+ /* How much time we took for transmission of all frames.
+ * We don't know, so let assume we used the full window. Jean II */
+ transmission_time = self->final_timeout;
+
+ /* Reset parameter so that we can fill next window */
+ self->window = self->window_size;
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ /* Remove what we have not used. Just do a prorata of the
+ * bytes left in window to window capacity.
+ * See max_line_capacities[][] in qos.c for details. Jean II */
+ transmission_time -= (self->final_timeout * self->bytes_left
+ / self->line_capacity);
+ pr_debug("%s() adjusting transmission_time : ft=%d, bl=%d, lc=%d -> tt=%d\n",
+ __func__, self->final_timeout, self->bytes_left,
+ self->line_capacity, transmission_time);
+
+ /* We are allowed to transmit a maximum number of bytes again. */
+ self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+ /*
+ * The network layer has a intermediate buffer between IrLAP
+ * and the IrDA driver which can contain 8 frames. So, even
+ * though IrLAP is currently sending the *last* frame of the
+ * tx-window, the driver most likely has only just started
+ * sending the *first* frame of the same tx-window.
+ * I.e. we are always at the very beginning of or Tx window.
+ * Now, we are supposed to set the final timer from the end
+ * of our tx-window to let the other peer reply. So, we need
+ * to add extra time to compensate for the fact that we
+ * are really at the start of tx-window, otherwise the final timer
+ * might expire before he can answer...
+ * Jean II
+ */
+ irlap_start_final_timer(self, self->final_timeout + transmission_time);
+
+ /*
+ * The clever amongst you might ask why we do this adjustement
+ * only here, and not in all the other cases in irlap_event.c.
+ * In all those other case, we only send a very short management
+ * frame (few bytes), so the adjustement would be lost in the
+ * noise...
+ * The exception of course is irlap_resend_rejected_frame().
+ * Jean II */
+}
+
+/*
+ * Function irlap_send_data_secondary_final (self, skb)
+ *
+ * Send I(nformation) frame as secondary with final bit set
+ *
+ */
+void irlap_send_data_secondary_final(struct irlap_cb *self,
+ struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb = NULL;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Is this reliable or unreliable data? */
+ if (skb->data[1] == I_FRAME) {
+
+ /*
+ * Insert frame sequence number (Vs) in control field before
+ * inserting into transmit window queue.
+ */
+ skb->data[1] = I_FRAME | (self->vs << 1);
+
+ /*
+ * Insert frame in store, in case of retransmissions
+ * Increase skb reference count, see irlap_do_event()
+ */
+ skb_get(skb);
+ skb_queue_tail(&self->wx_list, skb);
+
+ tx_skb = skb_clone(skb, GFP_ATOMIC);
+ if (tx_skb == NULL) {
+ return;
+ }
+
+ tx_skb->data[1] |= PF_BIT;
+
+ self->vs = (self->vs + 1) % 8;
+ self->ack_required = FALSE;
+
+ irlap_send_i_frame(self, tx_skb, RSP_FRAME);
+ } else {
+ if (self->ack_required) {
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+ irlap_send_rr_frame(self, RSP_FRAME);
+ self->ack_required = FALSE;
+ } else {
+ skb->data[1] |= PF_BIT;
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+ }
+ }
+
+ self->window = self->window_size;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ /* We are allowed to transmit a maximum number of bytes again. */
+ self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+ irlap_start_wd_timer(self, self->wd_timeout);
+}
+
+/*
+ * Function irlap_send_data_secondary (self, skb)
+ *
+ * Send I(nformation) frame as secondary without final bit set
+ *
+ */
+void irlap_send_data_secondary(struct irlap_cb *self, struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb = NULL;
+
+ /* Is this reliable or unreliable data? */
+ if (skb->data[1] == I_FRAME) {
+
+ /*
+ * Insert frame sequence number (Vs) in control field before
+ * inserting into transmit window queue.
+ */
+ skb->data[1] = I_FRAME | (self->vs << 1);
+
+ /*
+ * Insert frame in store, in case of retransmissions
+ * Increase skb reference count, see irlap_do_event()
+ */
+ skb_get(skb);
+ skb_queue_tail(&self->wx_list, skb);
+
+ tx_skb = skb_clone(skb, GFP_ATOMIC);
+ if (tx_skb == NULL) {
+ return;
+ }
+
+ self->vs = (self->vs + 1) % 8;
+ self->ack_required = FALSE;
+ self->window -= 1;
+
+ irlap_send_i_frame(self, tx_skb, RSP_FRAME);
+ } else {
+ irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+ self->window -= 1;
+ }
+}
+
+/*
+ * Function irlap_resend_rejected_frames (nr)
+ *
+ * Resend frames which has not been acknowledged. Should be safe to
+ * traverse the list without locking it since this function will only be
+ * called from interrupt context (BH)
+ */
+void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
+{
+ struct sk_buff *tx_skb;
+ struct sk_buff *skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Resend unacknowledged frame(s) */
+ skb_queue_walk(&self->wx_list, skb) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ /* We copy the skb to be retransmitted since we will have to
+ * modify it. Cloning will confuse packet sniffers
+ */
+ /* tx_skb = skb_clone( skb, GFP_ATOMIC); */
+ tx_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!tx_skb) {
+ pr_debug("%s(), unable to copy\n", __func__);
+ return;
+ }
+
+ /* Clear old Nr field + poll bit */
+ tx_skb->data[1] &= 0x0f;
+
+ /*
+ * Set poll bit on the last frame retransmitted
+ */
+ if (skb_queue_is_last(&self->wx_list, skb))
+ tx_skb->data[1] |= PF_BIT; /* Set p/f bit */
+ else
+ tx_skb->data[1] &= ~PF_BIT; /* Clear p/f bit */
+
+ irlap_send_i_frame(self, tx_skb, command);
+ }
+#if 0 /* Not yet */
+ /*
+ * We can now fill the window with additional data frames
+ */
+ while (!skb_queue_empty(&self->txq)) {
+
+ pr_debug("%s(), sending additional frames!\n", __func__);
+ if (self->window > 0) {
+ skb = skb_dequeue( &self->txq);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /*
+ * If send window > 1 then send frame with pf
+ * bit cleared
+ */
+ if ((self->window > 1) &&
+ !skb_queue_empty(&self->txq)) {
+ irlap_send_data_primary(self, skb);
+ } else {
+ irlap_send_data_primary_poll(self, skb);
+ }
+ kfree_skb(skb);
+ }
+ }
+#endif
+}
+
+void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
+{
+ struct sk_buff *tx_skb;
+ struct sk_buff *skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ /* Resend unacknowledged frame(s) */
+ skb = skb_peek(&self->wx_list);
+ if (skb != NULL) {
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+
+ /* We copy the skb to be retransmitted since we will have to
+ * modify it. Cloning will confuse packet sniffers
+ */
+ /* tx_skb = skb_clone( skb, GFP_ATOMIC); */
+ tx_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!tx_skb) {
+ pr_debug("%s(), unable to copy\n", __func__);
+ return;
+ }
+
+ /* Clear old Nr field + poll bit */
+ tx_skb->data[1] &= 0x0f;
+
+ /* Set poll/final bit */
+ tx_skb->data[1] |= PF_BIT; /* Set p/f bit */
+
+ irlap_send_i_frame(self, tx_skb, command);
+ }
+}
+
+/*
+ * Function irlap_send_ui_frame (self, skb, command)
+ *
+ * Contruct and transmit an Unnumbered Information (UI) frame
+ *
+ */
+void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb,
+ __u8 caddr, int command)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Insert connection address */
+ skb->data[0] = caddr | ((command) ? CMD_FRAME : 0);
+
+ irlap_queue_xmit(self, skb);
+}
+
+/*
+ * Function irlap_send_i_frame (skb)
+ *
+ * Contruct and transmit Information (I) frame
+ */
+static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb,
+ int command)
+{
+ /* Insert connection address */
+ skb->data[0] = self->caddr;
+ skb->data[0] |= (command) ? CMD_FRAME : 0;
+
+ /* Insert next to receive (Vr) */
+ skb->data[1] |= (self->vr << 5); /* insert nr */
+
+ irlap_queue_xmit(self, skb);
+}
+
+/*
+ * Function irlap_recv_i_frame (skb, frame)
+ *
+ * Receive and parse an I (Information) frame, no harm in making it inline
+ * since it's called only from one single place (irlap_driver_rcv).
+ */
+static inline void irlap_recv_i_frame(struct irlap_cb *self,
+ struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ info->nr = skb->data[1] >> 5; /* Next to receive */
+ info->pf = skb->data[1] & PF_BIT; /* Final bit */
+ info->ns = (skb->data[1] >> 1) & 0x07; /* Next to send */
+
+ /* Check if this is a command or a response frame */
+ if (command)
+ irlap_do_event(self, RECV_I_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_I_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_ui_frame (self, skb, info)
+ *
+ * Receive and parse an Unnumbered Information (UI) frame
+ *
+ */
+static void irlap_recv_ui_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ info->pf = skb->data[1] & PF_BIT; /* Final bit */
+
+ irlap_do_event(self, RECV_UI_FRAME, skb, info);
+}
+
+/*
+ * Function irlap_recv_frmr_frame (skb, frame)
+ *
+ * Received Frame Reject response.
+ *
+ */
+static void irlap_recv_frmr_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info)
+{
+ __u8 *frame;
+ int w, x, y, z;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(info != NULL, return;);
+
+ if (!pskb_may_pull(skb, 4)) {
+ net_err_ratelimited("%s: frame too short!\n", __func__);
+ return;
+ }
+
+ frame = skb->data;
+
+ info->nr = frame[2] >> 5; /* Next to receive */
+ info->pf = frame[2] & PF_BIT; /* Final bit */
+ info->ns = (frame[2] >> 1) & 0x07; /* Next to send */
+
+ w = frame[3] & 0x01;
+ x = frame[3] & 0x02;
+ y = frame[3] & 0x04;
+ z = frame[3] & 0x08;
+
+ if (w) {
+ pr_debug("Rejected control field is undefined or not implemented\n");
+ }
+ if (x) {
+ pr_debug("Rejected control field was invalid because it contained a non permitted I field\n");
+ }
+ if (y) {
+ pr_debug("Received I field exceeded the maximum negotiated for the existing connection or exceeded the maximum this station supports if no connection exists\n");
+ }
+ if (z) {
+ pr_debug("Rejected control field control field contained an invalid Nr count\n");
+ }
+ irlap_do_event(self, RECV_FRMR_RSP, skb, info);
+}
+
+/*
+ * Function irlap_send_test_frame (self, daddr)
+ *
+ * Send a test frame response
+ *
+ */
+void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr,
+ struct sk_buff *cmd)
+{
+ struct sk_buff *tx_skb;
+ struct test_frame *frame;
+ __u8 *info;
+
+ tx_skb = alloc_skb(cmd->len + sizeof(struct test_frame), GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ /* Broadcast frames must include saddr and daddr fields */
+ if (caddr == CBROADCAST) {
+ frame = (struct test_frame *)
+ skb_put(tx_skb, sizeof(struct test_frame));
+
+ /* Insert the swapped addresses */
+ frame->saddr = cpu_to_le32(self->saddr);
+ frame->daddr = cpu_to_le32(daddr);
+ } else
+ frame = (struct test_frame *) skb_put(tx_skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER);
+
+ frame->caddr = caddr;
+ frame->control = TEST_RSP | PF_BIT;
+
+ /* Copy info */
+ info = skb_put(tx_skb, cmd->len);
+ memcpy(info, cmd->data, cmd->len);
+
+ /* Return to sender */
+ irlap_wait_min_turn_around(self, &self->qos_tx);
+ irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_test_frame (self, skb)
+ *
+ * Receive a test frame
+ *
+ */
+static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
+ struct irlap_info *info, int command)
+{
+ struct test_frame *frame;
+
+ if (!pskb_may_pull(skb, sizeof(*frame))) {
+ net_err_ratelimited("%s: frame too short!\n", __func__);
+ return;
+ }
+ frame = (struct test_frame *) skb->data;
+
+ /* Broadcast frames must carry saddr and daddr fields */
+ if (info->caddr == CBROADCAST) {
+ if (skb->len < sizeof(struct test_frame)) {
+ pr_debug("%s() test frame too short!\n",
+ __func__);
+ return;
+ }
+
+ /* Read and swap addresses */
+ info->daddr = le32_to_cpu(frame->saddr);
+ info->saddr = le32_to_cpu(frame->daddr);
+
+ /* Make sure frame is addressed to us */
+ if ((info->saddr != self->saddr) &&
+ (info->saddr != BROADCAST)) {
+ return;
+ }
+ }
+
+ if (command)
+ irlap_do_event(self, RECV_TEST_CMD, skb, info);
+ else
+ irlap_do_event(self, RECV_TEST_RSP, skb, info);
+}
+
+/*
+ * Function irlap_driver_rcv (skb, netdev, ptype)
+ *
+ * Called when a frame is received. Dispatches the right receive function
+ * for processing of the frame.
+ *
+ * Note on skb management :
+ * After calling the higher layers of the IrDA stack, we always
+ * kfree() the skb, which drop the reference count (and potentially
+ * destroy it).
+ * If a higher layer of the stack want to keep the skb around (to put
+ * in a queue or pass it to the higher layer), it will need to use
+ * skb_get() to keep a reference on it. This is usually done at the
+ * LMP level in irlmp.c.
+ * Jean II
+ */
+int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype, struct net_device *orig_dev)
+{
+ struct irlap_info info;
+ struct irlap_cb *self;
+ int command;
+ __u8 control;
+ int ret = -1;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto out;
+
+ /* FIXME: should we get our own field? */
+ self = (struct irlap_cb *) dev->atalk_ptr;
+
+ /* If the net device is down, then IrLAP is gone! */
+ if (!self || self->magic != LAP_MAGIC)
+ goto err;
+
+ /* We are no longer an "old" protocol, so we need to handle
+ * share and non linear skbs. This should never happen, so
+ * we don't need to be clever about it. Jean II */
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+ net_err_ratelimited("%s: can't clone shared skb!\n", __func__);
+ goto err;
+ }
+
+ /* Check if frame is large enough for parsing */
+ if (!pskb_may_pull(skb, 2)) {
+ net_err_ratelimited("%s: frame too short!\n", __func__);
+ goto err;
+ }
+
+ command = skb->data[0] & CMD_FRAME;
+ info.caddr = skb->data[0] & CBROADCAST;
+
+ info.pf = skb->data[1] & PF_BIT;
+ info.control = skb->data[1] & ~PF_BIT; /* Mask away poll/final bit */
+
+ control = info.control;
+
+ /* First we check if this frame has a valid connection address */
+ if ((info.caddr != self->caddr) && (info.caddr != CBROADCAST)) {
+ pr_debug("%s(), wrong connection address!\n",
+ __func__);
+ goto out;
+ }
+ /*
+ * Optimize for the common case and check if the frame is an
+ * I(nformation) frame. Only I-frames have bit 0 set to 0
+ */
+ if (~control & 0x01) {
+ irlap_recv_i_frame(self, skb, &info, command);
+ goto out;
+ }
+ /*
+ * We now check is the frame is an S(upervisory) frame. Only
+ * S-frames have bit 0 set to 1 and bit 1 set to 0
+ */
+ if (~control & 0x02) {
+ /*
+ * Received S(upervisory) frame, check which frame type it is
+ * only the first nibble is of interest
+ */
+ switch (control & 0x0f) {
+ case RR:
+ irlap_recv_rr_frame(self, skb, &info, command);
+ break;
+ case RNR:
+ irlap_recv_rnr_frame(self, skb, &info, command);
+ break;
+ case REJ:
+ irlap_recv_rej_frame(self, skb, &info, command);
+ break;
+ case SREJ:
+ irlap_recv_srej_frame(self, skb, &info, command);
+ break;
+ default:
+ net_warn_ratelimited("%s: Unknown S-frame %02x received!\n",
+ __func__, info.control);
+ break;
+ }
+ goto out;
+ }
+ /*
+ * This must be a C(ontrol) frame
+ */
+ switch (control) {
+ case XID_RSP:
+ irlap_recv_discovery_xid_rsp(self, skb, &info);
+ break;
+ case XID_CMD:
+ irlap_recv_discovery_xid_cmd(self, skb, &info);
+ break;
+ case SNRM_CMD:
+ irlap_recv_snrm_cmd(self, skb, &info);
+ break;
+ case DM_RSP:
+ irlap_do_event(self, RECV_DM_RSP, skb, &info);
+ break;
+ case DISC_CMD: /* And RD_RSP since they have the same value */
+ irlap_recv_disc_frame(self, skb, &info, command);
+ break;
+ case TEST_CMD:
+ irlap_recv_test_frame(self, skb, &info, command);
+ break;
+ case UA_RSP:
+ irlap_recv_ua_frame(self, skb, &info);
+ break;
+ case FRMR_RSP:
+ irlap_recv_frmr_frame(self, skb, &info);
+ break;
+ case UI_FRAME:
+ irlap_recv_ui_frame(self, skb, &info);
+ break;
+ default:
+ net_warn_ratelimited("%s: Unknown frame %02x received!\n",
+ __func__, info.control);
+ break;
+ }
+out:
+ ret = 0;
+err:
+ /* Always drop our reference on the skb */
+ dev_kfree_skb(skb);
+ return ret;
+}
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
new file mode 100644
index 000000000..a26c401ef
--- /dev/null
+++ b/net/irda/irlmp.c
@@ -0,0 +1,1996 @@
+/*********************************************************************
+ *
+ * Filename: irlmp.c
+ * Version: 1.0
+ * Description: IrDA Link Management Protocol (LMP) layer
+ * Status: Stable.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 17 20:54:32 1997
+ * Modified at: Wed Jan 5 11:26:03 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/qos.h>
+#include <net/irda/irlap.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+
+#include <asm/unaligned.h>
+
+static __u8 irlmp_find_free_slsap(void);
+static int irlmp_slsap_inuse(__u8 slsap_sel);
+
+/* Master structure */
+struct irlmp_cb *irlmp = NULL;
+
+/* These can be altered by the sysctl interface */
+int sysctl_discovery = 0;
+int sysctl_discovery_timeout = 3; /* 3 seconds by default */
+int sysctl_discovery_slots = 6; /* 6 slots by default */
+int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
+char sysctl_devname[65];
+
+static const char *irlmp_reasons[] = {
+ "ERROR, NOT USED",
+ "LM_USER_REQUEST",
+ "LM_LAP_DISCONNECT",
+ "LM_CONNECT_FAILURE",
+ "LM_LAP_RESET",
+ "LM_INIT_DISCONNECT",
+ "ERROR, NOT USED",
+ "UNKNOWN",
+};
+
+const char *irlmp_reason_str(LM_REASON reason)
+{
+ reason = min_t(size_t, reason, ARRAY_SIZE(irlmp_reasons) - 1);
+ return irlmp_reasons[reason];
+}
+
+/*
+ * Function irlmp_init (void)
+ *
+ * Create (allocate) the main IrLMP structure
+ *
+ */
+int __init irlmp_init(void)
+{
+ /* Initialize the irlmp structure. */
+ irlmp = kzalloc( sizeof(struct irlmp_cb), GFP_KERNEL);
+ if (irlmp == NULL)
+ return -ENOMEM;
+
+ irlmp->magic = LMP_MAGIC;
+
+ irlmp->clients = hashbin_new(HB_LOCK);
+ irlmp->services = hashbin_new(HB_LOCK);
+ irlmp->links = hashbin_new(HB_LOCK);
+ irlmp->unconnected_lsaps = hashbin_new(HB_LOCK);
+ irlmp->cachelog = hashbin_new(HB_NOLOCK);
+
+ if ((irlmp->clients == NULL) ||
+ (irlmp->services == NULL) ||
+ (irlmp->links == NULL) ||
+ (irlmp->unconnected_lsaps == NULL) ||
+ (irlmp->cachelog == NULL)) {
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&irlmp->cachelog->hb_spinlock);
+
+ irlmp->last_lsap_sel = 0x0f; /* Reserved 0x00-0x0f */
+ strcpy(sysctl_devname, "Linux");
+
+ init_timer(&irlmp->discovery_timer);
+
+ /* Do discovery every 3 seconds, conditionally */
+ if (sysctl_discovery)
+ irlmp_start_discovery_timer(irlmp,
+ sysctl_discovery_timeout*HZ);
+
+ return 0;
+}
+
+/*
+ * Function irlmp_cleanup (void)
+ *
+ * Remove IrLMP layer
+ *
+ */
+void irlmp_cleanup(void)
+{
+ /* Check for main structure */
+ IRDA_ASSERT(irlmp != NULL, return;);
+ IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;);
+
+ del_timer(&irlmp->discovery_timer);
+
+ hashbin_delete(irlmp->links, (FREE_FUNC) kfree);
+ hashbin_delete(irlmp->unconnected_lsaps, (FREE_FUNC) kfree);
+ hashbin_delete(irlmp->clients, (FREE_FUNC) kfree);
+ hashbin_delete(irlmp->services, (FREE_FUNC) kfree);
+ hashbin_delete(irlmp->cachelog, (FREE_FUNC) kfree);
+
+ /* De-allocate main structure */
+ kfree(irlmp);
+ irlmp = NULL;
+}
+
+/*
+ * Function irlmp_open_lsap (slsap, notify)
+ *
+ * Register with IrLMP and create a local LSAP,
+ * returns handle to LSAP.
+ */
+struct lsap_cb *irlmp_open_lsap(__u8 slsap_sel, notify_t *notify, __u8 pid)
+{
+ struct lsap_cb *self;
+
+ IRDA_ASSERT(notify != NULL, return NULL;);
+ IRDA_ASSERT(irlmp != NULL, return NULL;);
+ IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return NULL;);
+ IRDA_ASSERT(notify->instance != NULL, return NULL;);
+
+ /* Does the client care which Source LSAP selector it gets? */
+ if (slsap_sel == LSAP_ANY) {
+ slsap_sel = irlmp_find_free_slsap();
+ if (!slsap_sel)
+ return NULL;
+ } else if (irlmp_slsap_inuse(slsap_sel))
+ return NULL;
+
+ /* Allocate new instance of a LSAP connection */
+ self = kzalloc(sizeof(struct lsap_cb), GFP_ATOMIC);
+ if (self == NULL)
+ return NULL;
+
+ self->magic = LMP_LSAP_MAGIC;
+ self->slsap_sel = slsap_sel;
+
+ /* Fix connectionless LSAP's */
+ if (slsap_sel == LSAP_CONNLESS) {
+#ifdef CONFIG_IRDA_ULTRA
+ self->dlsap_sel = LSAP_CONNLESS;
+ self->pid = pid;
+#endif /* CONFIG_IRDA_ULTRA */
+ } else
+ self->dlsap_sel = LSAP_ANY;
+ /* self->connected = FALSE; -> already NULL via memset() */
+
+ init_timer(&self->watchdog_timer);
+
+ self->notify = *notify;
+
+ self->lsap_state = LSAP_DISCONNECTED;
+
+ /* Insert into queue of unconnected LSAPs */
+ hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self,
+ (long) self, NULL);
+
+ return self;
+}
+EXPORT_SYMBOL(irlmp_open_lsap);
+
+/*
+ * Function __irlmp_close_lsap (self)
+ *
+ * Remove an instance of LSAP
+ */
+static void __irlmp_close_lsap(struct lsap_cb *self)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+ /*
+ * Set some of the variables to preset values
+ */
+ self->magic = 0;
+ del_timer(&self->watchdog_timer); /* Important! */
+
+ if (self->conn_skb)
+ dev_kfree_skb(self->conn_skb);
+
+ kfree(self);
+}
+
+/*
+ * Function irlmp_close_lsap (self)
+ *
+ * Close and remove LSAP
+ *
+ */
+void irlmp_close_lsap(struct lsap_cb *self)
+{
+ struct lap_cb *lap;
+ struct lsap_cb *lsap = NULL;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+ /*
+ * Find out if we should remove this LSAP from a link or from the
+ * list of unconnected lsaps (not associated with a link)
+ */
+ lap = self->lap;
+ if (lap) {
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+ /* We might close a LSAP before it has completed the
+ * connection setup. In those case, higher layers won't
+ * send a proper disconnect request. Harmless, except
+ * that we will forget to close LAP... - Jean II */
+ if(self->lsap_state != LSAP_DISCONNECTED) {
+ self->lsap_state = LSAP_DISCONNECTED;
+ irlmp_do_lap_event(self->lap,
+ LM_LAP_DISCONNECT_REQUEST, NULL);
+ }
+ /* Now, remove from the link */
+ lsap = hashbin_remove(lap->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ lap->cache.valid = FALSE;
+#endif
+ }
+ self->lap = NULL;
+ /* Check if we found the LSAP! If not then try the unconnected lsaps */
+ if (!lsap) {
+ lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self,
+ NULL);
+ }
+ if (!lsap) {
+ pr_debug("%s(), Looks like somebody has removed me already!\n",
+ __func__);
+ return;
+ }
+ __irlmp_close_lsap(self);
+}
+EXPORT_SYMBOL(irlmp_close_lsap);
+
+/*
+ * Function irlmp_register_irlap (saddr, notify)
+ *
+ * Register IrLAP layer with IrLMP. There is possible to have multiple
+ * instances of the IrLAP layer, each connected to different IrDA ports
+ *
+ */
+void irlmp_register_link(struct irlap_cb *irlap, __u32 saddr, notify_t *notify)
+{
+ struct lap_cb *lap;
+
+ IRDA_ASSERT(irlmp != NULL, return;);
+ IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;);
+ IRDA_ASSERT(notify != NULL, return;);
+
+ /*
+ * Allocate new instance of a LSAP connection
+ */
+ lap = kzalloc(sizeof(struct lap_cb), GFP_KERNEL);
+ if (lap == NULL)
+ return;
+
+ lap->irlap = irlap;
+ lap->magic = LMP_LAP_MAGIC;
+ lap->saddr = saddr;
+ lap->daddr = DEV_ADDR_ANY;
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ lap->cache.valid = FALSE;
+#endif
+ lap->lsaps = hashbin_new(HB_LOCK);
+ if (lap->lsaps == NULL) {
+ net_warn_ratelimited("%s(), unable to kmalloc lsaps\n",
+ __func__);
+ kfree(lap);
+ return;
+ }
+
+ lap->lap_state = LAP_STANDBY;
+
+ init_timer(&lap->idle_timer);
+
+ /*
+ * Insert into queue of LMP links
+ */
+ hashbin_insert(irlmp->links, (irda_queue_t *) lap, lap->saddr, NULL);
+
+ /*
+ * We set only this variable so IrLAP can tell us on which link the
+ * different events happened on
+ */
+ irda_notify_init(notify);
+ notify->instance = lap;
+}
+
+/*
+ * Function irlmp_unregister_irlap (saddr)
+ *
+ * IrLAP layer has been removed!
+ *
+ */
+void irlmp_unregister_link(__u32 saddr)
+{
+ struct lap_cb *link;
+
+ /* We must remove ourselves from the hashbin *first*. This ensure
+ * that no more LSAPs will be open on this link and no discovery
+ * will be triggered anymore. Jean II */
+ link = hashbin_remove(irlmp->links, saddr, NULL);
+ if (link) {
+ IRDA_ASSERT(link->magic == LMP_LAP_MAGIC, return;);
+
+ /* Kill all the LSAPs on this link. Jean II */
+ link->reason = LAP_DISC_INDICATION;
+ link->daddr = DEV_ADDR_ANY;
+ irlmp_do_lap_event(link, LM_LAP_DISCONNECT_INDICATION, NULL);
+
+ /* Remove all discoveries discovered at this link */
+ irlmp_expire_discoveries(irlmp->cachelog, link->saddr, TRUE);
+
+ /* Final cleanup */
+ del_timer(&link->idle_timer);
+ link->magic = 0;
+ hashbin_delete(link->lsaps, (FREE_FUNC) __irlmp_close_lsap);
+ kfree(link);
+ }
+}
+
+/*
+ * Function irlmp_connect_request (handle, dlsap, userdata)
+ *
+ * Connect with a peer LSAP
+ *
+ */
+int irlmp_connect_request(struct lsap_cb *self, __u8 dlsap_sel,
+ __u32 saddr, __u32 daddr,
+ struct qos_info *qos, struct sk_buff *userdata)
+{
+ struct sk_buff *tx_skb = userdata;
+ struct lap_cb *lap;
+ struct lsap_cb *lsap;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -EBADR;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EBADR;);
+
+ pr_debug("%s(), slsap_sel=%02x, dlsap_sel=%02x, saddr=%08x, daddr=%08x\n",
+ __func__, self->slsap_sel, dlsap_sel, saddr, daddr);
+
+ if (test_bit(0, &self->connected)) {
+ ret = -EISCONN;
+ goto err;
+ }
+
+ /* Client must supply destination device address */
+ if (!daddr) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Any userdata? */
+ if (tx_skb == NULL) {
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+ }
+
+ /* Make room for MUX control header (3 bytes) */
+ IRDA_ASSERT(skb_headroom(tx_skb) >= LMP_CONTROL_HEADER, return -1;);
+ skb_push(tx_skb, LMP_CONTROL_HEADER);
+
+ self->dlsap_sel = dlsap_sel;
+
+ /*
+ * Find the link to where we should try to connect since there may
+ * be more than one IrDA port on this machine. If the client has
+ * passed us the saddr (and already knows which link to use), then
+ * we use that to find the link, if not then we have to look in the
+ * discovery log and check if any of the links has discovered a
+ * device with the given daddr
+ */
+ if ((!saddr) || (saddr == DEV_ADDR_ANY)) {
+ discovery_t *discovery;
+ unsigned long flags;
+
+ spin_lock_irqsave(&irlmp->cachelog->hb_spinlock, flags);
+ if (daddr != DEV_ADDR_ANY)
+ discovery = hashbin_find(irlmp->cachelog, daddr, NULL);
+ else {
+ pr_debug("%s(), no daddr\n", __func__);
+ discovery = (discovery_t *)
+ hashbin_get_first(irlmp->cachelog);
+ }
+
+ if (discovery) {
+ saddr = discovery->data.saddr;
+ daddr = discovery->data.daddr;
+ }
+ spin_unlock_irqrestore(&irlmp->cachelog->hb_spinlock, flags);
+ }
+ lap = hashbin_lock_find(irlmp->links, saddr, NULL);
+ if (lap == NULL) {
+ pr_debug("%s(), Unable to find a usable link!\n", __func__);
+ ret = -EHOSTUNREACH;
+ goto err;
+ }
+
+ /* Check if LAP is disconnected or already connected */
+ if (lap->daddr == DEV_ADDR_ANY)
+ lap->daddr = daddr;
+ else if (lap->daddr != daddr) {
+ /* Check if some LSAPs are active on this LAP */
+ if (HASHBIN_GET_SIZE(lap->lsaps) == 0) {
+ /* No active connection, but LAP hasn't been
+ * disconnected yet (waiting for timeout in LAP).
+ * Maybe we could give LAP a bit of help in this case.
+ */
+ pr_debug("%s(), sorry, but I'm waiting for LAP to timeout!\n",
+ __func__);
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ /* LAP is already connected to a different node, and LAP
+ * can only talk to one node at a time */
+ pr_debug("%s(), sorry, but link is busy!\n", __func__);
+ ret = -EBUSY;
+ goto err;
+ }
+
+ self->lap = lap;
+
+ /*
+ * Remove LSAP from list of unconnected LSAPs and insert it into the
+ * list of connected LSAPs for the particular link
+ */
+ lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, NULL);
+
+ IRDA_ASSERT(lsap != NULL, return -1;);
+ IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;);
+ IRDA_ASSERT(lsap->lap != NULL, return -1;);
+ IRDA_ASSERT(lsap->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+ hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, (long) self,
+ NULL);
+
+ set_bit(0, &self->connected); /* TRUE */
+
+ /*
+ * User supplied qos specifications?
+ */
+ if (qos)
+ self->qos = *qos;
+
+ irlmp_do_lsap_event(self, LM_CONNECT_REQUEST, tx_skb);
+
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(tx_skb);
+
+ return 0;
+
+err:
+ /* Cleanup */
+ if(tx_skb)
+ dev_kfree_skb(tx_skb);
+ return ret;
+}
+EXPORT_SYMBOL(irlmp_connect_request);
+
+/*
+ * Function irlmp_connect_indication (self)
+ *
+ * Incoming connection
+ *
+ */
+void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+ int max_seg_size;
+ int lap_header_size;
+ int max_header_size;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(self->lap != NULL, return;);
+
+ pr_debug("%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+ __func__, self->slsap_sel, self->dlsap_sel);
+
+ /* Note : self->lap is set in irlmp_link_data_indication(),
+ * (case CONNECT_CMD:) because we have no way to set it here.
+ * Similarly, self->dlsap_sel is usually set in irlmp_find_lsap().
+ * Jean II */
+
+ self->qos = *self->lap->qos;
+
+ max_seg_size = self->lap->qos->data_size.value-LMP_HEADER;
+ lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap);
+ max_header_size = LMP_HEADER + lap_header_size;
+
+ /* Hide LMP_CONTROL_HEADER header from layer above */
+ skb_pull(skb, LMP_CONTROL_HEADER);
+
+ if (self->notify.connect_indication) {
+ /* Don't forget to refcount it - see irlap_driver_rcv(). */
+ skb_get(skb);
+ self->notify.connect_indication(self->notify.instance, self,
+ &self->qos, max_seg_size,
+ max_header_size, skb);
+ }
+}
+
+/*
+ * Function irlmp_connect_response (handle, userdata)
+ *
+ * Service user is accepting connection
+ *
+ */
+int irlmp_connect_response(struct lsap_cb *self, struct sk_buff *userdata)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+ IRDA_ASSERT(userdata != NULL, return -1;);
+
+ /* We set the connected bit and move the lsap to the connected list
+ * in the state machine itself. Jean II */
+
+ pr_debug("%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+ __func__, self->slsap_sel, self->dlsap_sel);
+
+ /* Make room for MUX control header (3 bytes) */
+ IRDA_ASSERT(skb_headroom(userdata) >= LMP_CONTROL_HEADER, return -1;);
+ skb_push(userdata, LMP_CONTROL_HEADER);
+
+ irlmp_do_lsap_event(self, LM_CONNECT_RESPONSE, userdata);
+
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(userdata);
+
+ return 0;
+}
+EXPORT_SYMBOL(irlmp_connect_response);
+
+/*
+ * Function irlmp_connect_confirm (handle, skb)
+ *
+ * LSAP connection confirmed peer device!
+ */
+void irlmp_connect_confirm(struct lsap_cb *self, struct sk_buff *skb)
+{
+ int max_header_size;
+ int lap_header_size;
+ int max_seg_size;
+
+ IRDA_ASSERT(skb != NULL, return;);
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+ IRDA_ASSERT(self->lap != NULL, return;);
+
+ self->qos = *self->lap->qos;
+
+ max_seg_size = self->lap->qos->data_size.value-LMP_HEADER;
+ lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap);
+ max_header_size = LMP_HEADER + lap_header_size;
+
+ pr_debug("%s(), max_header_size=%d\n",
+ __func__, max_header_size);
+
+ /* Hide LMP_CONTROL_HEADER header from layer above */
+ skb_pull(skb, LMP_CONTROL_HEADER);
+
+ if (self->notify.connect_confirm) {
+ /* Don't forget to refcount it - see irlap_driver_rcv() */
+ skb_get(skb);
+ self->notify.connect_confirm(self->notify.instance, self,
+ &self->qos, max_seg_size,
+ max_header_size, skb);
+ }
+}
+
+/*
+ * Function irlmp_dup (orig, instance)
+ *
+ * Duplicate LSAP, can be used by servers to confirm a connection on a
+ * new LSAP so it can keep listening on the old one.
+ *
+ */
+struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
+{
+ struct lsap_cb *new;
+ unsigned long flags;
+
+ spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+ /* Only allowed to duplicate unconnected LSAP's, and only LSAPs
+ * that have received a connect indication. Jean II */
+ if ((!hashbin_find(irlmp->unconnected_lsaps, (long) orig, NULL)) ||
+ (orig->lap == NULL)) {
+ pr_debug("%s(), invalid LSAP (wrong state)\n",
+ __func__);
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock,
+ flags);
+ return NULL;
+ }
+
+ /* Allocate a new instance */
+ new = kmemdup(orig, sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ pr_debug("%s(), unable to kmalloc\n", __func__);
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock,
+ flags);
+ return NULL;
+ }
+ /* new->lap = orig->lap; => done in the memcpy() */
+ /* new->slsap_sel = orig->slsap_sel; => done in the memcpy() */
+ new->conn_skb = NULL;
+
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+ /* Not everything is the same */
+ new->notify.instance = instance;
+
+ init_timer(&new->watchdog_timer);
+
+ hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) new,
+ (long) new, NULL);
+
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ /* Make sure that we invalidate the LSAP cache */
+ new->lap->cache.valid = FALSE;
+#endif /* CONFIG_IRDA_CACHE_LAST_LSAP */
+
+ return new;
+}
+
+/*
+ * Function irlmp_disconnect_request (handle, userdata)
+ *
+ * The service user is requesting disconnection, this will not remove the
+ * LSAP, but only mark it as disconnected
+ */
+int irlmp_disconnect_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+ struct lsap_cb *lsap;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+ IRDA_ASSERT(userdata != NULL, return -1;);
+
+ /* Already disconnected ?
+ * There is a race condition between irlmp_disconnect_indication()
+ * and us that might mess up the hashbins below. This fixes it.
+ * Jean II */
+ if (! test_and_clear_bit(0, &self->connected)) {
+ pr_debug("%s(), already disconnected!\n", __func__);
+ dev_kfree_skb(userdata);
+ return -1;
+ }
+
+ skb_push(userdata, LMP_CONTROL_HEADER);
+
+ /*
+ * Do the event before the other stuff since we must know
+ * which lap layer that the frame should be transmitted on
+ */
+ irlmp_do_lsap_event(self, LM_DISCONNECT_REQUEST, userdata);
+
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(userdata);
+
+ /*
+ * Remove LSAP from list of connected LSAPs for the particular link
+ * and insert it into the list of unconnected LSAPs
+ */
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+ IRDA_ASSERT(self->lap->lsaps != NULL, return -1;);
+
+ lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ self->lap->cache.valid = FALSE;
+#endif
+
+ IRDA_ASSERT(lsap != NULL, return -1;);
+ IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;);
+ IRDA_ASSERT(lsap == self, return -1;);
+
+ hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self,
+ (long) self, NULL);
+
+ /* Reset some values */
+ self->dlsap_sel = LSAP_ANY;
+ self->lap = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL(irlmp_disconnect_request);
+
+/*
+ * Function irlmp_disconnect_indication (reason, userdata)
+ *
+ * LSAP is being closed!
+ */
+void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason,
+ struct sk_buff *skb)
+{
+ struct lsap_cb *lsap;
+
+ pr_debug("%s(), reason=%s [%d]\n", __func__,
+ irlmp_reason_str(reason), reason);
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+ pr_debug("%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+ __func__, self->slsap_sel, self->dlsap_sel);
+
+ /* Already disconnected ?
+ * There is a race condition between irlmp_disconnect_request()
+ * and us that might mess up the hashbins below. This fixes it.
+ * Jean II */
+ if (! test_and_clear_bit(0, &self->connected)) {
+ pr_debug("%s(), already disconnected!\n", __func__);
+ return;
+ }
+
+ /*
+ * Remove association between this LSAP and the link it used
+ */
+ IRDA_ASSERT(self->lap != NULL, return;);
+ IRDA_ASSERT(self->lap->lsaps != NULL, return;);
+
+ lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ self->lap->cache.valid = FALSE;
+#endif
+
+ IRDA_ASSERT(lsap != NULL, return;);
+ IRDA_ASSERT(lsap == self, return;);
+ hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) lsap,
+ (long) lsap, NULL);
+
+ self->dlsap_sel = LSAP_ANY;
+ self->lap = NULL;
+
+ /*
+ * Inform service user
+ */
+ if (self->notify.disconnect_indication) {
+ /* Don't forget to refcount it - see irlap_driver_rcv(). */
+ if(skb)
+ skb_get(skb);
+ self->notify.disconnect_indication(self->notify.instance,
+ self, reason, skb);
+ } else {
+ pr_debug("%s(), no handler\n", __func__);
+ }
+}
+
+/*
+ * Function irlmp_do_expiry (void)
+ *
+ * Do a cleanup of the discovery log (remove old entries)
+ *
+ * Note : separate from irlmp_do_discovery() so that we can handle
+ * passive discovery properly.
+ */
+void irlmp_do_expiry(void)
+{
+ struct lap_cb *lap;
+
+ /*
+ * Expire discovery on all links which are *not* connected.
+ * On links which are connected, we can't do discovery
+ * anymore and can't refresh the log, so we freeze the
+ * discovery log to keep info about the device we are
+ * connected to.
+ * This info is mandatory if we want irlmp_connect_request()
+ * to work properly. - Jean II
+ */
+ lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+ while (lap != NULL) {
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+ if (lap->lap_state == LAP_STANDBY) {
+ /* Expire discoveries discovered on this link */
+ irlmp_expire_discoveries(irlmp->cachelog, lap->saddr,
+ FALSE);
+ }
+ lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+ }
+}
+
+/*
+ * Function irlmp_do_discovery (nslots)
+ *
+ * Do some discovery on all links
+ *
+ * Note : log expiry is done above.
+ */
+void irlmp_do_discovery(int nslots)
+{
+ struct lap_cb *lap;
+ __u16 *data_hintsp;
+
+ /* Make sure the value is sane */
+ if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){
+ net_warn_ratelimited("%s: invalid value for number of slots!\n",
+ __func__);
+ nslots = sysctl_discovery_slots = 8;
+ }
+
+ /* Construct new discovery info to be used by IrLAP, */
+ data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints;
+ put_unaligned(irlmp->hints.word, data_hintsp);
+
+ /*
+ * Set character set for device name (we use ASCII), and
+ * copy device name. Remember to make room for a \0 at the
+ * end
+ */
+ irlmp->discovery_cmd.data.charset = CS_ASCII;
+ strncpy(irlmp->discovery_cmd.data.info, sysctl_devname,
+ NICKNAME_MAX_LEN);
+ irlmp->discovery_cmd.name_len = strlen(irlmp->discovery_cmd.data.info);
+ irlmp->discovery_cmd.nslots = nslots;
+
+ /*
+ * Try to send discovery packets on all links
+ */
+ lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+ while (lap != NULL) {
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+ if (lap->lap_state == LAP_STANDBY) {
+ /* Try to discover */
+ irlmp_do_lap_event(lap, LM_LAP_DISCOVERY_REQUEST,
+ NULL);
+ }
+ lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+ }
+}
+
+/*
+ * Function irlmp_discovery_request (nslots)
+ *
+ * Do a discovery of devices in front of the computer
+ *
+ * If the caller has registered a client discovery callback, this
+ * allow him to receive the full content of the discovery log through
+ * this callback (as normally he will receive only new discoveries).
+ */
+void irlmp_discovery_request(int nslots)
+{
+ /* Return current cached discovery log (in full) */
+ irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_LOG);
+
+ /*
+ * Start a single discovery operation if discovery is not already
+ * running
+ */
+ if (!sysctl_discovery) {
+ /* Check if user wants to override the default */
+ if (nslots == DISCOVERY_DEFAULT_SLOTS)
+ nslots = sysctl_discovery_slots;
+
+ irlmp_do_discovery(nslots);
+ /* Note : we never do expiry here. Expiry will run on the
+ * discovery timer regardless of the state of sysctl_discovery
+ * Jean II */
+ }
+}
+EXPORT_SYMBOL(irlmp_discovery_request);
+
+/*
+ * Function irlmp_get_discoveries (pn, mask, slots)
+ *
+ * Return the current discovery log
+ *
+ * If discovery is not enabled, you should call this function again
+ * after 1 or 2 seconds (i.e. after discovery has been done).
+ */
+struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
+{
+ /* If discovery is not enabled, it's likely that the discovery log
+ * will be empty. So, we trigger a single discovery, so that next
+ * time the user call us there might be some results in the log.
+ * Jean II
+ */
+ if (!sysctl_discovery) {
+ /* Check if user wants to override the default */
+ if (nslots == DISCOVERY_DEFAULT_SLOTS)
+ nslots = sysctl_discovery_slots;
+
+ /* Start discovery - will complete sometime later */
+ irlmp_do_discovery(nslots);
+ /* Note : we never do expiry here. Expiry will run on the
+ * discovery timer regardless of the state of sysctl_discovery
+ * Jean II */
+ }
+
+ /* Return current cached discovery log */
+ return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
+}
+EXPORT_SYMBOL(irlmp_get_discoveries);
+
+/*
+ * Function irlmp_notify_client (log)
+ *
+ * Notify all about discovered devices
+ *
+ * Clients registered with IrLMP are :
+ * o IrComm
+ * o IrLAN
+ * o Any socket (in any state - ouch, that may be a lot !)
+ * The client may have defined a callback to be notified in case of
+ * partial/selective discovery based on the hints that it passed to IrLMP.
+ */
+static inline void
+irlmp_notify_client(irlmp_client_t *client,
+ hashbin_t *log, DISCOVERY_MODE mode)
+{
+ discinfo_t *discoveries; /* Copy of the discovery log */
+ int number; /* Number of nodes in the log */
+ int i;
+
+ /* Check if client wants or not partial/selective log (optimisation) */
+ if (!client->disco_callback)
+ return;
+
+ /*
+ * Locking notes :
+ * the old code was manipulating the log directly, which was
+ * very racy. Now, we use copy_discoveries, that protects
+ * itself while dumping the log for us.
+ * The overhead of the copy is compensated by the fact that
+ * we only pass new discoveries in normal mode and don't
+ * pass the same old entry every 3s to the caller as we used
+ * to do (virtual function calling is expensive).
+ * Jean II
+ */
+
+ /*
+ * Now, check all discovered devices (if any), and notify client
+ * only about the services that the client is interested in
+ * We also notify only about the new devices unless the caller
+ * explicitly request a dump of the log. Jean II
+ */
+ discoveries = irlmp_copy_discoveries(log, &number,
+ client->hint_mask.word,
+ (mode == DISCOVERY_LOG));
+ /* Check if the we got some results */
+ if (discoveries == NULL)
+ return; /* No nodes discovered */
+
+ /* Pass all entries to the listener */
+ for(i = 0; i < number; i++)
+ client->disco_callback(&(discoveries[i]), mode, client->priv);
+
+ /* Free up our buffer */
+ kfree(discoveries);
+}
+
+/*
+ * Function irlmp_discovery_confirm ( self, log)
+ *
+ * Some device(s) answered to our discovery request! Check to see which
+ * device it is, and give indication to the client(s)
+ *
+ */
+void irlmp_discovery_confirm(hashbin_t *log, DISCOVERY_MODE mode)
+{
+ irlmp_client_t *client;
+ irlmp_client_t *client_next;
+
+ IRDA_ASSERT(log != NULL, return;);
+
+ if (!(HASHBIN_GET_SIZE(log)))
+ return;
+
+ /* For each client - notify callback may touch client list */
+ client = (irlmp_client_t *) hashbin_get_first(irlmp->clients);
+ while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL,
+ (void *) &client_next) ) {
+ /* Check if we should notify client */
+ irlmp_notify_client(client, log, mode);
+
+ client = client_next;
+ }
+}
+
+/*
+ * Function irlmp_discovery_expiry (expiry)
+ *
+ * This device is no longer been discovered, and therefore it is being
+ * purged from the discovery log. Inform all clients who have
+ * registered for this event...
+ *
+ * Note : called exclusively from discovery.c
+ * Note : this is no longer called under discovery spinlock, so the
+ * client can do whatever he wants in the callback.
+ */
+void irlmp_discovery_expiry(discinfo_t *expiries, int number)
+{
+ irlmp_client_t *client;
+ irlmp_client_t *client_next;
+ int i;
+
+ IRDA_ASSERT(expiries != NULL, return;);
+
+ /* For each client - notify callback may touch client list */
+ client = (irlmp_client_t *) hashbin_get_first(irlmp->clients);
+ while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL,
+ (void *) &client_next) ) {
+
+ /* Pass all entries to the listener */
+ for(i = 0; i < number; i++) {
+ /* Check if we should notify client */
+ if ((client->expir_callback) &&
+ (client->hint_mask.word &
+ get_unaligned((__u16 *)expiries[i].hints)
+ & 0x7f7f) )
+ client->expir_callback(&(expiries[i]),
+ EXPIRY_TIMEOUT,
+ client->priv);
+ }
+
+ /* Next client */
+ client = client_next;
+ }
+}
+
+/*
+ * Function irlmp_get_discovery_response ()
+ *
+ * Used by IrLAP to get the discovery info it needs when answering
+ * discovery requests by other devices.
+ */
+discovery_t *irlmp_get_discovery_response(void)
+{
+ IRDA_ASSERT(irlmp != NULL, return NULL;);
+
+ put_unaligned(irlmp->hints.word, (__u16 *)irlmp->discovery_rsp.data.hints);
+
+ /*
+ * Set character set for device name (we use ASCII), and
+ * copy device name. Remember to make room for a \0 at the
+ * end
+ */
+ irlmp->discovery_rsp.data.charset = CS_ASCII;
+
+ strncpy(irlmp->discovery_rsp.data.info, sysctl_devname,
+ NICKNAME_MAX_LEN);
+ irlmp->discovery_rsp.name_len = strlen(irlmp->discovery_rsp.data.info);
+
+ return &irlmp->discovery_rsp;
+}
+
+/*
+ * Function irlmp_data_request (self, skb)
+ *
+ * Send some data to peer device
+ *
+ * Note on skb management :
+ * After calling the lower layers of the IrDA stack, we always
+ * kfree() the skb, which drop the reference count (and potentially
+ * destroy it).
+ * IrLMP and IrLAP may queue the packet, and in those cases will need
+ * to use skb_get() to keep it around.
+ * Jean II
+ */
+int irlmp_data_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ /* Make room for MUX header */
+ IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;);
+ skb_push(userdata, LMP_HEADER);
+
+ ret = irlmp_do_lsap_event(self, LM_DATA_REQUEST, userdata);
+
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(userdata);
+
+ return ret;
+}
+EXPORT_SYMBOL(irlmp_data_request);
+
+/*
+ * Function irlmp_data_indication (handle, skb)
+ *
+ * Got data from LAP layer so pass it up to upper layer
+ *
+ */
+void irlmp_data_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+ /* Hide LMP header from layer above */
+ skb_pull(skb, LMP_HEADER);
+
+ if (self->notify.data_indication) {
+ /* Don't forget to refcount it - see irlap_driver_rcv(). */
+ skb_get(skb);
+ self->notify.data_indication(self->notify.instance, self, skb);
+ }
+}
+
+/*
+ * Function irlmp_udata_request (self, skb)
+ */
+int irlmp_udata_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+ int ret;
+
+ IRDA_ASSERT(userdata != NULL, return -1;);
+
+ /* Make room for MUX header */
+ IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;);
+ skb_push(userdata, LMP_HEADER);
+
+ ret = irlmp_do_lsap_event(self, LM_UDATA_REQUEST, userdata);
+
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(userdata);
+
+ return ret;
+}
+
+/*
+ * Function irlmp_udata_indication (self, skb)
+ *
+ * Send unreliable data (but still within the connection)
+ *
+ */
+void irlmp_udata_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Hide LMP header from layer above */
+ skb_pull(skb, LMP_HEADER);
+
+ if (self->notify.udata_indication) {
+ /* Don't forget to refcount it - see irlap_driver_rcv(). */
+ skb_get(skb);
+ self->notify.udata_indication(self->notify.instance, self,
+ skb);
+ }
+}
+
+/*
+ * Function irlmp_connless_data_request (self, skb)
+ */
+#ifdef CONFIG_IRDA_ULTRA
+int irlmp_connless_data_request(struct lsap_cb *self, struct sk_buff *userdata,
+ __u8 pid)
+{
+ struct sk_buff *clone_skb;
+ struct lap_cb *lap;
+
+ IRDA_ASSERT(userdata != NULL, return -1;);
+
+ /* Make room for MUX and PID header */
+ IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER+LMP_PID_HEADER,
+ return -1;);
+
+ /* Insert protocol identifier */
+ skb_push(userdata, LMP_PID_HEADER);
+ if(self != NULL)
+ userdata->data[0] = self->pid;
+ else
+ userdata->data[0] = pid;
+
+ /* Connectionless sockets must use 0x70 */
+ skb_push(userdata, LMP_HEADER);
+ userdata->data[0] = userdata->data[1] = LSAP_CONNLESS;
+
+ /* Try to send Connectionless packets out on all links */
+ lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+ while (lap != NULL) {
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return -1;);
+
+ clone_skb = skb_clone(userdata, GFP_ATOMIC);
+ if (!clone_skb) {
+ dev_kfree_skb(userdata);
+ return -ENOMEM;
+ }
+
+ irlap_unitdata_request(lap->irlap, clone_skb);
+ /* irlap_unitdata_request() don't increase refcount,
+ * so no dev_kfree_skb() - Jean II */
+
+ lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+ }
+ dev_kfree_skb(userdata);
+
+ return 0;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlmp_connless_data_indication (self, skb)
+ *
+ * Receive unreliable data outside any connection. Mostly used by Ultra
+ *
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlmp_connless_data_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /* Hide LMP and PID header from layer above */
+ skb_pull(skb, LMP_HEADER+LMP_PID_HEADER);
+
+ if (self->notify.udata_indication) {
+ /* Don't forget to refcount it - see irlap_driver_rcv(). */
+ skb_get(skb);
+ self->notify.udata_indication(self->notify.instance, self,
+ skb);
+ }
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Propagate status indication from LAP to LSAPs (via LMP)
+ * This don't trigger any change of state in lap_cb, lmp_cb or lsap_cb,
+ * and the event is stateless, therefore we can bypass both state machines
+ * and send the event direct to the LSAP user.
+ * Jean II
+ */
+void irlmp_status_indication(struct lap_cb *self,
+ LINK_STATUS link, LOCK_STATUS lock)
+{
+ struct lsap_cb *next;
+ struct lsap_cb *curr;
+
+ /* Send status_indication to all LSAPs using this link */
+ curr = (struct lsap_cb *) hashbin_get_first( self->lsaps);
+ while (NULL != hashbin_find_next(self->lsaps, (long) curr, NULL,
+ (void *) &next) ) {
+ IRDA_ASSERT(curr->magic == LMP_LSAP_MAGIC, return;);
+ /*
+ * Inform service user if he has requested it
+ */
+ if (curr->notify.status_indication != NULL)
+ curr->notify.status_indication(curr->notify.instance,
+ link, lock);
+ else
+ pr_debug("%s(), no handler\n", __func__);
+
+ curr = next;
+ }
+}
+
+/*
+ * Receive flow control indication from LAP.
+ * LAP want us to send it one more frame. We implement a simple round
+ * robin scheduler between the active sockets so that we get a bit of
+ * fairness. Note that the round robin is far from perfect, but it's
+ * better than nothing.
+ * We then poll the selected socket so that we can do synchronous
+ * refilling of IrLAP (which allow to minimise the number of buffers).
+ * Jean II
+ */
+void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow)
+{
+ struct lsap_cb *next;
+ struct lsap_cb *curr;
+ int lsap_todo;
+
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+ IRDA_ASSERT(flow == FLOW_START, return;);
+
+ /* Get the number of lsap. That's the only safe way to know
+ * that we have looped around... - Jean II */
+ lsap_todo = HASHBIN_GET_SIZE(self->lsaps);
+ pr_debug("%s() : %d lsaps to scan\n", __func__, lsap_todo);
+
+ /* Poll lsap in order until the queue is full or until we
+ * tried them all.
+ * Most often, the current LSAP will have something to send,
+ * so we will go through this loop only once. - Jean II */
+ while((lsap_todo--) &&
+ (IRLAP_GET_TX_QUEUE_LEN(self->irlap) < LAP_HIGH_THRESHOLD)) {
+ /* Try to find the next lsap we should poll. */
+ next = self->flow_next;
+ /* If we have no lsap, restart from first one */
+ if(next == NULL)
+ next = (struct lsap_cb *) hashbin_get_first(self->lsaps);
+ /* Verify current one and find the next one */
+ curr = hashbin_find_next(self->lsaps, (long) next, NULL,
+ (void *) &self->flow_next);
+ /* Uh-oh... Paranoia */
+ if(curr == NULL)
+ break;
+ pr_debug("%s() : curr is %p, next was %p and is now %p, still %d to go - queue len = %d\n",
+ __func__, curr, next, self->flow_next, lsap_todo,
+ IRLAP_GET_TX_QUEUE_LEN(self->irlap));
+
+ /* Inform lsap user that it can send one more packet. */
+ if (curr->notify.flow_indication != NULL)
+ curr->notify.flow_indication(curr->notify.instance,
+ curr, flow);
+ else
+ pr_debug("%s(), no handler\n", __func__);
+ }
+}
+
+#if 0
+/*
+ * Function irlmp_hint_to_service (hint)
+ *
+ * Returns a list of all servics contained in the given hint bits. This
+ * function assumes that the hint bits have the size of two bytes only
+ */
+__u8 *irlmp_hint_to_service(__u8 *hint)
+{
+ __u8 *service;
+ int i = 0;
+
+ /*
+ * Allocate array to store services in. 16 entries should be safe
+ * since we currently only support 2 hint bytes
+ */
+ service = kmalloc(16, GFP_ATOMIC);
+ if (!service)
+ return NULL;
+
+ if (!hint[0]) {
+ pr_debug("<None>\n");
+ kfree(service);
+ return NULL;
+ }
+ if (hint[0] & HINT_PNP)
+ pr_debug("PnP Compatible ");
+ if (hint[0] & HINT_PDA)
+ pr_debug("PDA/Palmtop ");
+ if (hint[0] & HINT_COMPUTER)
+ pr_debug("Computer ");
+ if (hint[0] & HINT_PRINTER) {
+ pr_debug("Printer ");
+ service[i++] = S_PRINTER;
+ }
+ if (hint[0] & HINT_MODEM)
+ pr_debug("Modem ");
+ if (hint[0] & HINT_FAX)
+ pr_debug("Fax ");
+ if (hint[0] & HINT_LAN) {
+ pr_debug("LAN Access ");
+ service[i++] = S_LAN;
+ }
+ /*
+ * Test if extension byte exists. This byte will usually be
+ * there, but this is not really required by the standard.
+ * (IrLMP p. 29)
+ */
+ if (hint[0] & HINT_EXTENSION) {
+ if (hint[1] & HINT_TELEPHONY) {
+ pr_debug("Telephony ");
+ service[i++] = S_TELEPHONY;
+ }
+ if (hint[1] & HINT_FILE_SERVER)
+ pr_debug("File Server ");
+
+ if (hint[1] & HINT_COMM) {
+ pr_debug("IrCOMM ");
+ service[i++] = S_COMM;
+ }
+ if (hint[1] & HINT_OBEX) {
+ pr_debug("IrOBEX ");
+ service[i++] = S_OBEX;
+ }
+ }
+ pr_debug("\n");
+
+ /* So that client can be notified about any discovery */
+ service[i++] = S_ANY;
+
+ service[i] = S_END;
+
+ return service;
+}
+#endif
+
+static const __u16 service_hint_mapping[S_END][2] = {
+ { HINT_PNP, 0 }, /* S_PNP */
+ { HINT_PDA, 0 }, /* S_PDA */
+ { HINT_COMPUTER, 0 }, /* S_COMPUTER */
+ { HINT_PRINTER, 0 }, /* S_PRINTER */
+ { HINT_MODEM, 0 }, /* S_MODEM */
+ { HINT_FAX, 0 }, /* S_FAX */
+ { HINT_LAN, 0 }, /* S_LAN */
+ { HINT_EXTENSION, HINT_TELEPHONY }, /* S_TELEPHONY */
+ { HINT_EXTENSION, HINT_COMM }, /* S_COMM */
+ { HINT_EXTENSION, HINT_OBEX }, /* S_OBEX */
+ { 0xFF, 0xFF }, /* S_ANY */
+};
+
+/*
+ * Function irlmp_service_to_hint (service)
+ *
+ * Converts a service type, to a hint bit
+ *
+ * Returns: a 16 bit hint value, with the service bit set
+ */
+__u16 irlmp_service_to_hint(int service)
+{
+ __u16_host_order hint;
+
+ hint.byte[0] = service_hint_mapping[service][0];
+ hint.byte[1] = service_hint_mapping[service][1];
+
+ return hint.word;
+}
+EXPORT_SYMBOL(irlmp_service_to_hint);
+
+/*
+ * Function irlmp_register_service (service)
+ *
+ * Register local service with IrLMP
+ *
+ */
+void *irlmp_register_service(__u16 hints)
+{
+ irlmp_service_t *service;
+
+ pr_debug("%s(), hints = %04x\n", __func__, hints);
+
+ /* Make a new registration */
+ service = kmalloc(sizeof(irlmp_service_t), GFP_ATOMIC);
+ if (!service)
+ return NULL;
+
+ service->hints.word = hints;
+ hashbin_insert(irlmp->services, (irda_queue_t *) service,
+ (long) service, NULL);
+
+ irlmp->hints.word |= hints;
+
+ return (void *)service;
+}
+EXPORT_SYMBOL(irlmp_register_service);
+
+/*
+ * Function irlmp_unregister_service (handle)
+ *
+ * Unregister service with IrLMP.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int irlmp_unregister_service(void *handle)
+{
+ irlmp_service_t *service;
+ unsigned long flags;
+
+ if (!handle)
+ return -1;
+
+ /* Caller may call with invalid handle (it's legal) - Jean II */
+ service = hashbin_lock_find(irlmp->services, (long) handle, NULL);
+ if (!service) {
+ pr_debug("%s(), Unknown service!\n", __func__);
+ return -1;
+ }
+
+ hashbin_remove_this(irlmp->services, (irda_queue_t *) service);
+ kfree(service);
+
+ /* Remove old hint bits */
+ irlmp->hints.word = 0;
+
+ /* Refresh current hint bits */
+ spin_lock_irqsave(&irlmp->services->hb_spinlock, flags);
+ service = (irlmp_service_t *) hashbin_get_first(irlmp->services);
+ while (service) {
+ irlmp->hints.word |= service->hints.word;
+
+ service = (irlmp_service_t *)hashbin_get_next(irlmp->services);
+ }
+ spin_unlock_irqrestore(&irlmp->services->hb_spinlock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(irlmp_unregister_service);
+
+/*
+ * Function irlmp_register_client (hint_mask, callback1, callback2)
+ *
+ * Register a local client with IrLMP
+ * First callback is selective discovery (based on hints)
+ * Second callback is for selective discovery expiries
+ *
+ * Returns: handle > 0 on success, 0 on error
+ */
+void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb,
+ DISCOVERY_CALLBACK2 expir_clb, void *priv)
+{
+ irlmp_client_t *client;
+
+ IRDA_ASSERT(irlmp != NULL, return NULL;);
+
+ /* Make a new registration */
+ client = kmalloc(sizeof(irlmp_client_t), GFP_ATOMIC);
+ if (!client)
+ return NULL;
+
+ /* Register the details */
+ client->hint_mask.word = hint_mask;
+ client->disco_callback = disco_clb;
+ client->expir_callback = expir_clb;
+ client->priv = priv;
+
+ hashbin_insert(irlmp->clients, (irda_queue_t *) client,
+ (long) client, NULL);
+
+ return (void *) client;
+}
+EXPORT_SYMBOL(irlmp_register_client);
+
+/*
+ * Function irlmp_update_client (handle, hint_mask, callback1, callback2)
+ *
+ * Updates specified client (handle) with possibly new hint_mask and
+ * callback
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int irlmp_update_client(void *handle, __u16 hint_mask,
+ DISCOVERY_CALLBACK1 disco_clb,
+ DISCOVERY_CALLBACK2 expir_clb, void *priv)
+{
+ irlmp_client_t *client;
+
+ if (!handle)
+ return -1;
+
+ client = hashbin_lock_find(irlmp->clients, (long) handle, NULL);
+ if (!client) {
+ pr_debug("%s(), Unknown client!\n", __func__);
+ return -1;
+ }
+
+ client->hint_mask.word = hint_mask;
+ client->disco_callback = disco_clb;
+ client->expir_callback = expir_clb;
+ client->priv = priv;
+
+ return 0;
+}
+EXPORT_SYMBOL(irlmp_update_client);
+
+/*
+ * Function irlmp_unregister_client (handle)
+ *
+ * Returns: 0 on success, -1 on error
+ *
+ */
+int irlmp_unregister_client(void *handle)
+{
+ struct irlmp_client *client;
+
+ if (!handle)
+ return -1;
+
+ /* Caller may call with invalid handle (it's legal) - Jean II */
+ client = hashbin_lock_find(irlmp->clients, (long) handle, NULL);
+ if (!client) {
+ pr_debug("%s(), Unknown client!\n", __func__);
+ return -1;
+ }
+
+ pr_debug("%s(), removing client!\n", __func__);
+ hashbin_remove_this(irlmp->clients, (irda_queue_t *) client);
+ kfree(client);
+
+ return 0;
+}
+EXPORT_SYMBOL(irlmp_unregister_client);
+
+/*
+ * Function irlmp_slsap_inuse (slsap)
+ *
+ * Check if the given source LSAP selector is in use
+ *
+ * This function is clearly not very efficient. On the mitigating side, the
+ * stack make sure that in 99% of the cases, we are called only once
+ * for each socket allocation. We could probably keep a bitmap
+ * of the allocated LSAP, but I'm not sure the complexity is worth it.
+ * Jean II
+ */
+static int irlmp_slsap_inuse(__u8 slsap_sel)
+{
+ struct lsap_cb *self;
+ struct lap_cb *lap;
+ unsigned long flags;
+
+ IRDA_ASSERT(irlmp != NULL, return TRUE;);
+ IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return TRUE;);
+ IRDA_ASSERT(slsap_sel != LSAP_ANY, return TRUE;);
+
+#ifdef CONFIG_IRDA_ULTRA
+ /* Accept all bindings to the connectionless LSAP */
+ if (slsap_sel == LSAP_CONNLESS)
+ return FALSE;
+#endif /* CONFIG_IRDA_ULTRA */
+
+ /* Valid values are between 0 and 127 (0x0-0x6F) */
+ if (slsap_sel > LSAP_MAX)
+ return TRUE;
+
+ /*
+ * Check if slsap is already in use. To do this we have to loop over
+ * every IrLAP connection and check every LSAP associated with each
+ * the connection.
+ */
+ spin_lock_irqsave_nested(&irlmp->links->hb_spinlock, flags,
+ SINGLE_DEPTH_NESTING);
+ lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+ while (lap != NULL) {
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, goto errlap;);
+
+ /* Careful for priority inversions here !
+ * irlmp->links is never taken while another IrDA
+ * spinlock is held, so we are safe. Jean II */
+ spin_lock(&lap->lsaps->hb_spinlock);
+
+ /* For this IrLAP, check all the LSAPs */
+ self = (struct lsap_cb *) hashbin_get_first(lap->lsaps);
+ while (self != NULL) {
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC,
+ goto errlsap;);
+
+ if ((self->slsap_sel == slsap_sel)) {
+ pr_debug("Source LSAP selector=%02x in use\n",
+ self->slsap_sel);
+ goto errlsap;
+ }
+ self = (struct lsap_cb*) hashbin_get_next(lap->lsaps);
+ }
+ spin_unlock(&lap->lsaps->hb_spinlock);
+
+ /* Next LAP */
+ lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+ }
+ spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags);
+
+ /*
+ * Server sockets are typically waiting for connections and
+ * therefore reside in the unconnected list. We don't want
+ * to give out their LSAPs for obvious reasons...
+ * Jean II
+ */
+ spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+ self = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps);
+ while (self != NULL) {
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, goto erruncon;);
+ if ((self->slsap_sel == slsap_sel)) {
+ pr_debug("Source LSAP selector=%02x in use (unconnected)\n",
+ self->slsap_sel);
+ goto erruncon;
+ }
+ self = (struct lsap_cb*) hashbin_get_next(irlmp->unconnected_lsaps);
+ }
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+ return FALSE;
+
+ /* Error exit from within one of the two nested loops.
+ * Make sure we release the right spinlock in the righ order.
+ * Jean II */
+errlsap:
+ spin_unlock(&lap->lsaps->hb_spinlock);
+IRDA_ASSERT_LABEL(errlap:)
+ spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags);
+ return TRUE;
+
+ /* Error exit from within the unconnected loop.
+ * Just one spinlock to release... Jean II */
+erruncon:
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+ return TRUE;
+}
+
+/*
+ * Function irlmp_find_free_slsap ()
+ *
+ * Find a free source LSAP to use. This function is called if the service
+ * user has requested a source LSAP equal to LM_ANY
+ */
+static __u8 irlmp_find_free_slsap(void)
+{
+ __u8 lsap_sel;
+ int wrapped = 0;
+
+ IRDA_ASSERT(irlmp != NULL, return -1;);
+ IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return -1;);
+
+ /* Most users don't really care which LSAPs they are given,
+ * and therefore we automatically give them a free LSAP.
+ * This function try to find a suitable LSAP, i.e. which is
+ * not in use and is within the acceptable range. Jean II */
+
+ do {
+ /* Always increment to LSAP number before using it.
+ * In theory, we could reuse the last LSAP number, as long
+ * as it is no longer in use. Some IrDA stack do that.
+ * However, the previous socket may be half closed, i.e.
+ * we closed it, we think it's no longer in use, but the
+ * other side did not receive our close and think it's
+ * active and still send data on it.
+ * This is similar to what is done with PIDs and TCP ports.
+ * Also, this reduce the number of calls to irlmp_slsap_inuse()
+ * which is an expensive function to call.
+ * Jean II */
+ irlmp->last_lsap_sel++;
+
+ /* Check if we need to wraparound (0x70-0x7f are reserved) */
+ if (irlmp->last_lsap_sel > LSAP_MAX) {
+ /* 0x00-0x10 are also reserved for well know ports */
+ irlmp->last_lsap_sel = 0x10;
+
+ /* Make sure we terminate the loop */
+ if (wrapped++) {
+ net_err_ratelimited("%s: no more free LSAPs !\n",
+ __func__);
+ return 0;
+ }
+ }
+
+ /* If the LSAP is in use, try the next one.
+ * Despite the autoincrement, we need to check if the lsap
+ * is really in use or not, first because LSAP may be
+ * directly allocated in irlmp_open_lsap(), and also because
+ * we may wraparound on old sockets. Jean II */
+ } while (irlmp_slsap_inuse(irlmp->last_lsap_sel));
+
+ /* Got it ! */
+ lsap_sel = irlmp->last_lsap_sel;
+ pr_debug("%s(), found free lsap_sel=%02x\n",
+ __func__, lsap_sel);
+
+ return lsap_sel;
+}
+
+/*
+ * Function irlmp_convert_lap_reason (lap_reason)
+ *
+ * Converts IrLAP disconnect reason codes to IrLMP disconnect reason
+ * codes
+ *
+ */
+LM_REASON irlmp_convert_lap_reason( LAP_REASON lap_reason)
+{
+ int reason = LM_LAP_DISCONNECT;
+
+ switch (lap_reason) {
+ case LAP_DISC_INDICATION: /* Received a disconnect request from peer */
+ pr_debug("%s(), LAP_DISC_INDICATION\n", __func__);
+ reason = LM_USER_REQUEST;
+ break;
+ case LAP_NO_RESPONSE: /* To many retransmits without response */
+ pr_debug("%s(), LAP_NO_RESPONSE\n", __func__);
+ reason = LM_LAP_DISCONNECT;
+ break;
+ case LAP_RESET_INDICATION:
+ pr_debug("%s(), LAP_RESET_INDICATION\n", __func__);
+ reason = LM_LAP_RESET;
+ break;
+ case LAP_FOUND_NONE:
+ case LAP_MEDIA_BUSY:
+ case LAP_PRIMARY_CONFLICT:
+ pr_debug("%s(), LAP_FOUND_NONE, LAP_MEDIA_BUSY or LAP_PRIMARY_CONFLICT\n",
+ __func__);
+ reason = LM_CONNECT_FAILURE;
+ break;
+ default:
+ pr_debug("%s(), Unknown IrLAP disconnect reason %d!\n",
+ __func__, lap_reason);
+ reason = LM_LAP_DISCONNECT;
+ break;
+ }
+
+ return reason;
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct irlmp_iter_state {
+ hashbin_t *hashbin;
+};
+
+#define LSAP_START_TOKEN ((void *)1)
+#define LINK_START_TOKEN ((void *)2)
+
+static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off)
+{
+ void *element;
+
+ spin_lock_irq(&iter->hashbin->hb_spinlock);
+ for (element = hashbin_get_first(iter->hashbin);
+ element != NULL;
+ element = hashbin_get_next(iter->hashbin)) {
+ if (!off || *off-- == 0) {
+ /* NB: hashbin left locked */
+ return element;
+ }
+ }
+ spin_unlock_irq(&iter->hashbin->hb_spinlock);
+ iter->hashbin = NULL;
+ return NULL;
+}
+
+
+static void *irlmp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct irlmp_iter_state *iter = seq->private;
+ void *v;
+ loff_t off = *pos;
+
+ iter->hashbin = NULL;
+ if (off-- == 0)
+ return LSAP_START_TOKEN;
+
+ iter->hashbin = irlmp->unconnected_lsaps;
+ v = irlmp_seq_hb_idx(iter, &off);
+ if (v)
+ return v;
+
+ if (off-- == 0)
+ return LINK_START_TOKEN;
+
+ iter->hashbin = irlmp->links;
+ return irlmp_seq_hb_idx(iter, &off);
+}
+
+static void *irlmp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct irlmp_iter_state *iter = seq->private;
+
+ ++*pos;
+
+ if (v == LSAP_START_TOKEN) { /* start of list of lsaps */
+ iter->hashbin = irlmp->unconnected_lsaps;
+ v = irlmp_seq_hb_idx(iter, NULL);
+ return v ? v : LINK_START_TOKEN;
+ }
+
+ if (v == LINK_START_TOKEN) { /* start of list of links */
+ iter->hashbin = irlmp->links;
+ return irlmp_seq_hb_idx(iter, NULL);
+ }
+
+ v = hashbin_get_next(iter->hashbin);
+
+ if (v == NULL) { /* no more in this hash bin */
+ spin_unlock_irq(&iter->hashbin->hb_spinlock);
+
+ if (iter->hashbin == irlmp->unconnected_lsaps)
+ v = LINK_START_TOKEN;
+
+ iter->hashbin = NULL;
+ }
+ return v;
+}
+
+static void irlmp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct irlmp_iter_state *iter = seq->private;
+
+ if (iter->hashbin)
+ spin_unlock_irq(&iter->hashbin->hb_spinlock);
+}
+
+static int irlmp_seq_show(struct seq_file *seq, void *v)
+{
+ const struct irlmp_iter_state *iter = seq->private;
+ struct lsap_cb *self = v;
+
+ if (v == LSAP_START_TOKEN)
+ seq_puts(seq, "Unconnected LSAPs:\n");
+ else if (v == LINK_START_TOKEN)
+ seq_puts(seq, "\nRegistered Link Layers:\n");
+ else if (iter->hashbin == irlmp->unconnected_lsaps) {
+ self = v;
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EINVAL; );
+ seq_printf(seq, "lsap state: %s, ",
+ irlsap_state[ self->lsap_state]);
+ seq_printf(seq,
+ "slsap_sel: %#02x, dlsap_sel: %#02x, ",
+ self->slsap_sel, self->dlsap_sel);
+ seq_printf(seq, "(%s)", self->notify.name);
+ seq_printf(seq, "\n");
+ } else if (iter->hashbin == irlmp->links) {
+ struct lap_cb *lap = v;
+
+ seq_printf(seq, "lap state: %s, ",
+ irlmp_state[lap->lap_state]);
+
+ seq_printf(seq, "saddr: %#08x, daddr: %#08x, ",
+ lap->saddr, lap->daddr);
+ seq_printf(seq, "num lsaps: %d",
+ HASHBIN_GET_SIZE(lap->lsaps));
+ seq_printf(seq, "\n");
+
+ /* Careful for priority inversions here !
+ * All other uses of attrib spinlock are independent of
+ * the object spinlock, so we are safe. Jean II */
+ spin_lock(&lap->lsaps->hb_spinlock);
+
+ seq_printf(seq, "\n Connected LSAPs:\n");
+ for (self = (struct lsap_cb *) hashbin_get_first(lap->lsaps);
+ self != NULL;
+ self = (struct lsap_cb *)hashbin_get_next(lap->lsaps)) {
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC,
+ goto outloop;);
+ seq_printf(seq, " lsap state: %s, ",
+ irlsap_state[ self->lsap_state]);
+ seq_printf(seq,
+ "slsap_sel: %#02x, dlsap_sel: %#02x, ",
+ self->slsap_sel, self->dlsap_sel);
+ seq_printf(seq, "(%s)", self->notify.name);
+ seq_putc(seq, '\n');
+
+ }
+ IRDA_ASSERT_LABEL(outloop:)
+ spin_unlock(&lap->lsaps->hb_spinlock);
+ seq_putc(seq, '\n');
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static const struct seq_operations irlmp_seq_ops = {
+ .start = irlmp_seq_start,
+ .next = irlmp_seq_next,
+ .stop = irlmp_seq_stop,
+ .show = irlmp_seq_show,
+};
+
+static int irlmp_seq_open(struct inode *inode, struct file *file)
+{
+ IRDA_ASSERT(irlmp != NULL, return -EINVAL;);
+
+ return seq_open_private(file, &irlmp_seq_ops,
+ sizeof(struct irlmp_iter_state));
+}
+
+const struct file_operations irlmp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = irlmp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/irlmp_event.c b/net/irda/irlmp_event.c
new file mode 100644
index 000000000..e306cf2c1
--- /dev/null
+++ b/net/irda/irlmp_event.c
@@ -0,0 +1,886 @@
+/*********************************************************************
+ *
+ * Filename: irlmp_event.c
+ * Version: 0.8
+ * Description: An IrDA LMP event driver for Linux
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Aug 4 20:40:53 1997
+ * Modified at: Tue Dec 14 23:04:16 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/irlmp_event.h>
+
+const char *const irlmp_state[] = {
+ "LAP_STANDBY",
+ "LAP_U_CONNECT",
+ "LAP_ACTIVE",
+};
+
+const char *const irlsap_state[] = {
+ "LSAP_DISCONNECTED",
+ "LSAP_CONNECT",
+ "LSAP_CONNECT_PEND",
+ "LSAP_DATA_TRANSFER_READY",
+ "LSAP_SETUP",
+ "LSAP_SETUP_PEND",
+};
+
+static const char *const irlmp_event[] __maybe_unused = {
+ "LM_CONNECT_REQUEST",
+ "LM_CONNECT_CONFIRM",
+ "LM_CONNECT_RESPONSE",
+ "LM_CONNECT_INDICATION",
+
+ "LM_DISCONNECT_INDICATION",
+ "LM_DISCONNECT_REQUEST",
+
+ "LM_DATA_REQUEST",
+ "LM_UDATA_REQUEST",
+ "LM_DATA_INDICATION",
+ "LM_UDATA_INDICATION",
+
+ "LM_WATCHDOG_TIMEOUT",
+
+ /* IrLAP events */
+ "LM_LAP_CONNECT_REQUEST",
+ "LM_LAP_CONNECT_INDICATION",
+ "LM_LAP_CONNECT_CONFIRM",
+ "LM_LAP_DISCONNECT_INDICATION",
+ "LM_LAP_DISCONNECT_REQUEST",
+ "LM_LAP_DISCOVERY_REQUEST",
+ "LM_LAP_DISCOVERY_CONFIRM",
+ "LM_LAP_IDLE_TIMEOUT",
+};
+
+/* LAP Connection control proto declarations */
+static void irlmp_state_standby (struct lap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static void irlmp_state_u_connect(struct lap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static void irlmp_state_active (struct lap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+
+/* LSAP Connection control proto declarations */
+static int irlmp_state_disconnected(struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static int irlmp_state_connect (struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static int irlmp_state_connect_pend(struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static int irlmp_state_dtr (struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static int irlmp_state_setup (struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+static int irlmp_state_setup_pend (struct lsap_cb *, IRLMP_EVENT,
+ struct sk_buff *);
+
+static void (*lap_state[]) (struct lap_cb *, IRLMP_EVENT, struct sk_buff *) =
+{
+ irlmp_state_standby,
+ irlmp_state_u_connect,
+ irlmp_state_active,
+};
+
+static int (*lsap_state[])( struct lsap_cb *, IRLMP_EVENT, struct sk_buff *) =
+{
+ irlmp_state_disconnected,
+ irlmp_state_connect,
+ irlmp_state_connect_pend,
+ irlmp_state_dtr,
+ irlmp_state_setup,
+ irlmp_state_setup_pend
+};
+
+static inline void irlmp_next_lap_state(struct lap_cb *self,
+ IRLMP_STATE state)
+{
+ /*
+ pr_debug("%s(), LMP LAP = %s\n", __func__, irlmp_state[state]);
+ */
+ self->lap_state = state;
+}
+
+static inline void irlmp_next_lsap_state(struct lsap_cb *self,
+ LSAP_STATE state)
+{
+ /*
+ IRDA_ASSERT(self != NULL, return;);
+ pr_debug("%s(), LMP LSAP = %s\n", __func__, irlsap_state[state]);
+ */
+ self->lsap_state = state;
+}
+
+/* Do connection control events */
+int irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ pr_debug("%s(), EVENT = %s, STATE = %s\n",
+ __func__, irlmp_event[event], irlsap_state[self->lsap_state]);
+
+ return (*lsap_state[self->lsap_state]) (self, event, skb);
+}
+
+/*
+ * Function do_lap_event (event, skb, info)
+ *
+ * Do IrLAP control events
+ *
+ */
+void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+ pr_debug("%s(), EVENT = %s, STATE = %s\n", __func__,
+ irlmp_event[event],
+ irlmp_state[self->lap_state]);
+
+ (*lap_state[self->lap_state]) (self, event, skb);
+}
+
+void irlmp_discovery_timer_expired(void *data)
+{
+ /* We always cleanup the log (active & passive discovery) */
+ irlmp_do_expiry();
+
+ irlmp_do_discovery(sysctl_discovery_slots);
+
+ /* Restart timer */
+ irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout * HZ);
+}
+
+void irlmp_watchdog_timer_expired(void *data)
+{
+ struct lsap_cb *self = (struct lsap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+ irlmp_do_lsap_event(self, LM_WATCHDOG_TIMEOUT, NULL);
+}
+
+void irlmp_idle_timer_expired(void *data)
+{
+ struct lap_cb *self = (struct lap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+ irlmp_do_lap_event(self, LM_LAP_IDLE_TIMEOUT, NULL);
+}
+
+/*
+ * Send an event on all LSAPs attached to this LAP.
+ */
+static inline void
+irlmp_do_all_lsap_event(hashbin_t * lsap_hashbin,
+ IRLMP_EVENT event)
+{
+ struct lsap_cb *lsap;
+ struct lsap_cb *lsap_next;
+
+ /* Note : this function use the new hashbin_find_next()
+ * function, instead of the old hashbin_get_next().
+ * This make sure that we are always pointing one lsap
+ * ahead, so that if the current lsap is removed as the
+ * result of sending the event, we don't care.
+ * Also, as we store the context ourselves, if an enumeration
+ * of the same lsap hashbin happens as the result of sending the
+ * event, we don't care.
+ * The only problem is if the next lsap is removed. In that case,
+ * hashbin_find_next() will return NULL and we will abort the
+ * enumeration. - Jean II */
+
+ /* Also : we don't accept any skb in input. We can *NOT* pass
+ * the same skb to multiple clients safely, we would need to
+ * skb_clone() it. - Jean II */
+
+ lsap = (struct lsap_cb *) hashbin_get_first(lsap_hashbin);
+
+ while (NULL != hashbin_find_next(lsap_hashbin,
+ (long) lsap,
+ NULL,
+ (void *) &lsap_next) ) {
+ irlmp_do_lsap_event(lsap, event, NULL);
+ lsap = lsap_next;
+ }
+}
+
+/*********************************************************************
+ *
+ * LAP connection control states
+ *
+ ********************************************************************/
+
+/*
+ * Function irlmp_state_standby (event, skb, info)
+ *
+ * STANDBY, The IrLAP connection does not exist.
+ *
+ */
+static void irlmp_state_standby(struct lap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self->irlap != NULL, return;);
+
+ switch (event) {
+ case LM_LAP_DISCOVERY_REQUEST:
+ /* irlmp_next_station_state( LMP_DISCOVER); */
+
+ irlap_discovery_request(self->irlap, &irlmp->discovery_cmd);
+ break;
+ case LM_LAP_CONNECT_INDICATION:
+ /* It's important to switch state first, to avoid IrLMP to
+ * think that the link is free since IrLMP may then start
+ * discovery before the connection is properly set up. DB.
+ */
+ irlmp_next_lap_state(self, LAP_ACTIVE);
+
+ /* Just accept connection TODO, this should be fixed */
+ irlap_connect_response(self->irlap, skb);
+ break;
+ case LM_LAP_CONNECT_REQUEST:
+ pr_debug("%s() LS_CONNECT_REQUEST\n", __func__);
+
+ irlmp_next_lap_state(self, LAP_U_CONNECT);
+
+ /* FIXME: need to set users requested QoS */
+ irlap_connect_request(self->irlap, self->daddr, NULL, 0);
+ break;
+ case LM_LAP_DISCONNECT_INDICATION:
+ pr_debug("%s(), Error LM_LAP_DISCONNECT_INDICATION\n",
+ __func__);
+
+ irlmp_next_lap_state(self, LAP_STANDBY);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlmp_event[event]);
+ break;
+ }
+}
+
+/*
+ * Function irlmp_state_u_connect (event, skb, info)
+ *
+ * U_CONNECT, The layer above has tried to open an LSAP connection but
+ * since the IrLAP connection does not exist, we must first start an
+ * IrLAP connection. We are now waiting response from IrLAP.
+ * */
+static void irlmp_state_u_connect(struct lap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ pr_debug("%s(), event=%s\n", __func__, irlmp_event[event]);
+
+ switch (event) {
+ case LM_LAP_CONNECT_INDICATION:
+ /* It's important to switch state first, to avoid IrLMP to
+ * think that the link is free since IrLMP may then start
+ * discovery before the connection is properly set up. DB.
+ */
+ irlmp_next_lap_state(self, LAP_ACTIVE);
+
+ /* Just accept connection TODO, this should be fixed */
+ irlap_connect_response(self->irlap, skb);
+
+ /* Tell LSAPs that they can start sending data */
+ irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+ /* Note : by the time we get there (LAP retries and co),
+ * the lsaps may already have gone. This avoid getting stuck
+ * forever in LAP_ACTIVE state - Jean II */
+ if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+ pr_debug("%s() NO LSAPs !\n", __func__);
+ irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT);
+ }
+ break;
+ case LM_LAP_CONNECT_REQUEST:
+ /* Already trying to connect */
+ break;
+ case LM_LAP_CONNECT_CONFIRM:
+ /* For all lsap_ce E Associated do LS_Connect_confirm */
+ irlmp_next_lap_state(self, LAP_ACTIVE);
+
+ /* Tell LSAPs that they can start sending data */
+ irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+ /* Note : by the time we get there (LAP retries and co),
+ * the lsaps may already have gone. This avoid getting stuck
+ * forever in LAP_ACTIVE state - Jean II */
+ if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+ pr_debug("%s() NO LSAPs !\n", __func__);
+ irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT);
+ }
+ break;
+ case LM_LAP_DISCONNECT_INDICATION:
+ pr_debug("%s(), LM_LAP_DISCONNECT_INDICATION\n", __func__);
+ irlmp_next_lap_state(self, LAP_STANDBY);
+
+ /* Send disconnect event to all LSAPs using this link */
+ irlmp_do_all_lsap_event(self->lsaps,
+ LM_LAP_DISCONNECT_INDICATION);
+ break;
+ case LM_LAP_DISCONNECT_REQUEST:
+ pr_debug("%s(), LM_LAP_DISCONNECT_REQUEST\n", __func__);
+
+ /* One of the LSAP did timeout or was closed, if it was
+ * the last one, try to get out of here - Jean II */
+ if (HASHBIN_GET_SIZE(self->lsaps) <= 1) {
+ irlap_disconnect_request(self->irlap);
+ }
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlmp_event[event]);
+ break;
+ }
+}
+
+/*
+ * Function irlmp_state_active (event, skb, info)
+ *
+ * ACTIVE, IrLAP connection is active
+ *
+ */
+static void irlmp_state_active(struct lap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ switch (event) {
+ case LM_LAP_CONNECT_REQUEST:
+ pr_debug("%s(), LS_CONNECT_REQUEST\n", __func__);
+
+ /*
+ * IrLAP may have a pending disconnect. We tried to close
+ * IrLAP, but it was postponed because the link was
+ * busy or we were still sending packets. As we now
+ * need it, make sure it stays on. Jean II
+ */
+ irlap_clear_disconnect(self->irlap);
+
+ /*
+ * LAP connection already active, just bounce back! Since we
+ * don't know which LSAP that tried to do this, we have to
+ * notify all LSAPs using this LAP, but that should be safe to
+ * do anyway.
+ */
+ irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+ /* Needed by connect indication */
+ irlmp_do_all_lsap_event(irlmp->unconnected_lsaps,
+ LM_LAP_CONNECT_CONFIRM);
+ /* Keep state */
+ break;
+ case LM_LAP_DISCONNECT_REQUEST:
+ /*
+ * Need to find out if we should close IrLAP or not. If there
+ * is only one LSAP connection left on this link, that LSAP
+ * must be the one that tries to close IrLAP. It will be
+ * removed later and moved to the list of unconnected LSAPs
+ */
+ if (HASHBIN_GET_SIZE(self->lsaps) > 0) {
+ /* Timer value is checked in irsysctl - Jean II */
+ irlmp_start_idle_timer(self, sysctl_lap_keepalive_time * HZ / 1000);
+ } else {
+ /* No more connections, so close IrLAP */
+
+ /* We don't want to change state just yet, because
+ * we want to reflect accurately the real state of
+ * the LAP, not the state we wish it was in,
+ * so that we don't lose LM_LAP_CONNECT_REQUEST.
+ * In some cases, IrLAP won't close the LAP
+ * immediately. For example, it might still be
+ * retrying packets or waiting for the pf bit.
+ * As the LAP always send a DISCONNECT_INDICATION
+ * in PCLOSE or SCLOSE, just change state on that.
+ * Jean II */
+ irlap_disconnect_request(self->irlap);
+ }
+ break;
+ case LM_LAP_IDLE_TIMEOUT:
+ if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+ /* Same reasoning as above - keep state */
+ irlap_disconnect_request(self->irlap);
+ }
+ break;
+ case LM_LAP_DISCONNECT_INDICATION:
+ irlmp_next_lap_state(self, LAP_STANDBY);
+
+ /* In some case, at this point our side has already closed
+ * all lsaps, and we are waiting for the idle_timer to
+ * expire. If another device reconnect immediately, the
+ * idle timer will expire in the midle of the connection
+ * initialisation, screwing up things a lot...
+ * Therefore, we must stop the timer... */
+ irlmp_stop_idle_timer(self);
+
+ /*
+ * Inform all connected LSAP's using this link
+ */
+ irlmp_do_all_lsap_event(self->lsaps,
+ LM_LAP_DISCONNECT_INDICATION);
+
+ /* Force an expiry of the discovery log.
+ * Now that the LAP is free, the system may attempt to
+ * connect to another device. Unfortunately, our entries
+ * are stale. There is a small window (<3s) before the
+ * normal discovery will run and where irlmp_connect_request()
+ * can get the wrong info, so make sure things get
+ * cleaned *NOW* ;-) - Jean II */
+ irlmp_do_expiry();
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s\n",
+ __func__, irlmp_event[event]);
+ break;
+ }
+}
+
+/*********************************************************************
+ *
+ * LSAP connection control states
+ *
+ ********************************************************************/
+
+/*
+ * Function irlmp_state_disconnected (event, skb, info)
+ *
+ * DISCONNECTED
+ *
+ */
+static int irlmp_state_disconnected(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ switch (event) {
+#ifdef CONFIG_IRDA_ULTRA
+ case LM_UDATA_INDICATION:
+ /* This is most bizarre. Those packets are aka unreliable
+ * connected, aka IrLPT or SOCK_DGRAM/IRDAPROTO_UNITDATA.
+ * Why do we pass them as Ultra ??? Jean II */
+ irlmp_connless_data_indication(self, skb);
+ break;
+#endif /* CONFIG_IRDA_ULTRA */
+ case LM_CONNECT_REQUEST:
+ pr_debug("%s(), LM_CONNECT_REQUEST\n", __func__);
+
+ if (self->conn_skb) {
+ net_warn_ratelimited("%s: busy with another request!\n",
+ __func__);
+ return -EBUSY;
+ }
+ /* Don't forget to refcount it (see irlmp_connect_request()) */
+ skb_get(skb);
+ self->conn_skb = skb;
+
+ irlmp_next_lsap_state(self, LSAP_SETUP_PEND);
+
+ /* Start watchdog timer (5 secs for now) */
+ irlmp_start_watchdog_timer(self, 5*HZ);
+
+ irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL);
+ break;
+ case LM_CONNECT_INDICATION:
+ if (self->conn_skb) {
+ net_warn_ratelimited("%s: busy with another request!\n",
+ __func__);
+ return -EBUSY;
+ }
+ /* Don't forget to refcount it (see irlap_driver_rcv()) */
+ skb_get(skb);
+ self->conn_skb = skb;
+
+ irlmp_next_lsap_state(self, LSAP_CONNECT_PEND);
+
+ /* Start watchdog timer
+ * This is not mentionned in the spec, but there is a rare
+ * race condition that can get the socket stuck.
+ * If we receive this event while our LAP is closing down,
+ * the LM_LAP_CONNECT_REQUEST get lost and we get stuck in
+ * CONNECT_PEND state forever.
+ * The other cause of getting stuck down there is if the
+ * higher layer never reply to the CONNECT_INDICATION.
+ * Anyway, it make sense to make sure that we always have
+ * a backup plan. 1 second is plenty (should be immediate).
+ * Jean II */
+ irlmp_start_watchdog_timer(self, 1*HZ);
+
+ irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlmp_state_connect (self, event, skb)
+ *
+ * CONNECT
+ *
+ */
+static int irlmp_state_connect(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ struct lsap_cb *lsap;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ switch (event) {
+ case LM_CONNECT_RESPONSE:
+ /*
+ * Bind this LSAP to the IrLAP link where the connect was
+ * received
+ */
+ lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self,
+ NULL);
+
+ IRDA_ASSERT(lsap == self, return -1;);
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ IRDA_ASSERT(self->lap->lsaps != NULL, return -1;);
+
+ hashbin_insert(self->lap->lsaps, (irda_queue_t *) self,
+ (long) self, NULL);
+
+ set_bit(0, &self->connected); /* TRUE */
+
+ irlmp_send_lcf_pdu(self->lap, self->dlsap_sel,
+ self->slsap_sel, CONNECT_CNF, skb);
+
+ del_timer(&self->watchdog_timer);
+
+ irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY);
+ break;
+ case LM_WATCHDOG_TIMEOUT:
+ /* May happen, who knows...
+ * Jean II */
+ pr_debug("%s() WATCHDOG_TIMEOUT!\n", __func__);
+
+ /* Disconnect, get out... - Jean II */
+ self->lap = NULL;
+ self->dlsap_sel = LSAP_ANY;
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+ break;
+ default:
+ /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we
+ * are *not* yet bound to the IrLAP link. Jean II */
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlmp_state_connect_pend (event, skb, info)
+ *
+ * CONNECT_PEND
+ *
+ */
+static int irlmp_state_connect_pend(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ switch (event) {
+ case LM_CONNECT_REQUEST:
+ /* Keep state */
+ break;
+ case LM_CONNECT_RESPONSE:
+ pr_debug("%s(), LM_CONNECT_RESPONSE, no indication issued yet\n",
+ __func__);
+ /* Keep state */
+ break;
+ case LM_DISCONNECT_REQUEST:
+ pr_debug("%s(), LM_DISCONNECT_REQUEST, not yet bound to IrLAP connection\n",
+ __func__);
+ /* Keep state */
+ break;
+ case LM_LAP_CONNECT_CONFIRM:
+ pr_debug("%s(), LS_CONNECT_CONFIRM\n", __func__);
+ irlmp_next_lsap_state(self, LSAP_CONNECT);
+
+ tx_skb = self->conn_skb;
+ self->conn_skb = NULL;
+
+ irlmp_connect_indication(self, tx_skb);
+ /* Drop reference count - see irlmp_connect_indication(). */
+ dev_kfree_skb(tx_skb);
+ break;
+ case LM_WATCHDOG_TIMEOUT:
+ /* Will happen in some rare cases because of a race condition.
+ * Just make sure we don't stay there forever...
+ * Jean II */
+ pr_debug("%s() WATCHDOG_TIMEOUT!\n", __func__);
+
+ /* Go back to disconnected mode, keep the socket waiting */
+ self->lap = NULL;
+ self->dlsap_sel = LSAP_ANY;
+ if(self->conn_skb)
+ dev_kfree_skb(self->conn_skb);
+ self->conn_skb = NULL;
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+ break;
+ default:
+ /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we
+ * are *not* yet bound to the IrLAP link. Jean II */
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlmp_state_dtr (self, event, skb)
+ *
+ * DATA_TRANSFER_READY
+ *
+ */
+static int irlmp_state_dtr(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ LM_REASON reason;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+
+ switch (event) {
+ case LM_DATA_REQUEST: /* Optimize for the common case */
+ irlmp_send_data_pdu(self->lap, self->dlsap_sel,
+ self->slsap_sel, FALSE, skb);
+ break;
+ case LM_DATA_INDICATION: /* Optimize for the common case */
+ irlmp_data_indication(self, skb);
+ break;
+ case LM_UDATA_REQUEST:
+ IRDA_ASSERT(skb != NULL, return -1;);
+ irlmp_send_data_pdu(self->lap, self->dlsap_sel,
+ self->slsap_sel, TRUE, skb);
+ break;
+ case LM_UDATA_INDICATION:
+ irlmp_udata_indication(self, skb);
+ break;
+ case LM_CONNECT_REQUEST:
+ pr_debug("%s(), LM_CONNECT_REQUEST, error, LSAP already connected\n",
+ __func__);
+ /* Keep state */
+ break;
+ case LM_CONNECT_RESPONSE:
+ pr_debug("%s(), LM_CONNECT_RESPONSE, error, LSAP already connected\n",
+ __func__);
+ /* Keep state */
+ break;
+ case LM_DISCONNECT_REQUEST:
+ irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, self->slsap_sel,
+ DISCONNECT, skb);
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+ /* Called only from irlmp_disconnect_request(), will
+ * unbind from LAP over there. Jean II */
+
+ /* Try to close the LAP connection if its still there */
+ if (self->lap) {
+ pr_debug("%s(), trying to close IrLAP\n",
+ __func__);
+ irlmp_do_lap_event(self->lap,
+ LM_LAP_DISCONNECT_REQUEST,
+ NULL);
+ }
+ break;
+ case LM_LAP_DISCONNECT_INDICATION:
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ reason = irlmp_convert_lap_reason(self->lap->reason);
+
+ irlmp_disconnect_indication(self, reason, NULL);
+ break;
+ case LM_DISCONNECT_INDICATION:
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+ IRDA_ASSERT(skb != NULL, return -1;);
+ IRDA_ASSERT(skb->len > 3, return -1;);
+ reason = skb->data[3];
+
+ /* Try to close the LAP connection */
+ pr_debug("%s(), trying to close IrLAP\n", __func__);
+ irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+
+ irlmp_disconnect_indication(self, reason, skb);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlmp_state_setup (event, skb, info)
+ *
+ * SETUP, Station Control has set up the underlying IrLAP connection.
+ * An LSAP connection request has been transmitted to the peer
+ * LSAP-Connection Control FSM and we are awaiting reply.
+ */
+static int irlmp_state_setup(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ LM_REASON reason;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+ switch (event) {
+ case LM_CONNECT_CONFIRM:
+ irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY);
+
+ del_timer(&self->watchdog_timer);
+
+ irlmp_connect_confirm(self, skb);
+ break;
+ case LM_DISCONNECT_INDICATION:
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+ IRDA_ASSERT(skb != NULL, return -1;);
+ IRDA_ASSERT(skb->len > 3, return -1;);
+ reason = skb->data[3];
+
+ /* Try to close the LAP connection */
+ pr_debug("%s(), trying to close IrLAP\n", __func__);
+ irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+
+ irlmp_disconnect_indication(self, reason, skb);
+ break;
+ case LM_LAP_DISCONNECT_INDICATION:
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ del_timer(&self->watchdog_timer);
+
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+ reason = irlmp_convert_lap_reason(self->lap->reason);
+
+ irlmp_disconnect_indication(self, reason, skb);
+ break;
+ case LM_WATCHDOG_TIMEOUT:
+ pr_debug("%s() WATCHDOG_TIMEOUT!\n", __func__);
+
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Function irlmp_state_setup_pend (event, skb, info)
+ *
+ * SETUP_PEND, An LM_CONNECT_REQUEST has been received from the service
+ * user to set up an LSAP connection. A request has been sent to the
+ * LAP FSM to set up the underlying IrLAP connection, and we
+ * are awaiting confirm.
+ */
+static int irlmp_state_setup_pend(struct lsap_cb *self, IRLMP_EVENT event,
+ struct sk_buff *skb)
+{
+ struct sk_buff *tx_skb;
+ LM_REASON reason;
+ int ret = 0;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(irlmp != NULL, return -1;);
+
+ switch (event) {
+ case LM_LAP_CONNECT_CONFIRM:
+ IRDA_ASSERT(self->conn_skb != NULL, return -1;);
+
+ tx_skb = self->conn_skb;
+ self->conn_skb = NULL;
+
+ irlmp_send_lcf_pdu(self->lap, self->dlsap_sel,
+ self->slsap_sel, CONNECT_CMD, tx_skb);
+ /* Drop reference count - see irlap_data_request(). */
+ dev_kfree_skb(tx_skb);
+
+ irlmp_next_lsap_state(self, LSAP_SETUP);
+ break;
+ case LM_WATCHDOG_TIMEOUT:
+ pr_debug("%s() : WATCHDOG_TIMEOUT !\n", __func__);
+
+ IRDA_ASSERT(self->lap != NULL, return -1;);
+ irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL);
+ break;
+ case LM_LAP_DISCONNECT_INDICATION: /* LS_Disconnect.indication */
+ del_timer( &self->watchdog_timer);
+
+ irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+ reason = irlmp_convert_lap_reason(self->lap->reason);
+
+ irlmp_disconnect_indication(self, reason, NULL);
+ break;
+ default:
+ pr_debug("%s(), Unknown event %s on LSAP %#02x\n",
+ __func__, irlmp_event[event], self->slsap_sel);
+ break;
+ }
+ return ret;
+}
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
new file mode 100644
index 000000000..38b0f994b
--- /dev/null
+++ b/net/irda/irlmp_frame.c
@@ -0,0 +1,476 @@
+/*********************************************************************
+ *
+ * Filename: irlmp_frame.c
+ * Version: 0.9
+ * Description: IrLMP frame implementation
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Aug 19 02:09:59 1997
+ * Modified at: Mon Dec 13 13:41:12 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/discovery.h>
+
+static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap,
+ __u8 slsap, int status, hashbin_t *);
+
+inline void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap,
+ int expedited, struct sk_buff *skb)
+{
+ skb->data[0] = dlsap;
+ skb->data[1] = slsap;
+
+ if (expedited) {
+ pr_debug("%s(), sending expedited data\n", __func__);
+ irlap_data_request(self->irlap, skb, TRUE);
+ } else
+ irlap_data_request(self->irlap, skb, FALSE);
+}
+
+/*
+ * Function irlmp_send_lcf_pdu (dlsap, slsap, opcode,skb)
+ *
+ * Send Link Control Frame to IrLAP
+ */
+void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap,
+ __u8 opcode, struct sk_buff *skb)
+{
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ frame = skb->data;
+
+ frame[0] = dlsap | CONTROL_BIT;
+ frame[1] = slsap;
+
+ frame[2] = opcode;
+
+ if (opcode == DISCONNECT)
+ frame[3] = 0x01; /* Service user request */
+ else
+ frame[3] = 0x00; /* rsvd */
+
+ irlap_data_request(self->irlap, skb, FALSE);
+}
+
+/*
+ * Function irlmp_input (skb)
+ *
+ * Used by IrLAP to pass received data frames to IrLMP layer
+ *
+ */
+void irlmp_link_data_indication(struct lap_cb *self, struct sk_buff *skb,
+ int unreliable)
+{
+ struct lsap_cb *lsap;
+ __u8 slsap_sel; /* Source (this) LSAP address */
+ __u8 dlsap_sel; /* Destination LSAP address */
+ __u8 *fp;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+ IRDA_ASSERT(skb->len > 2, return;);
+
+ fp = skb->data;
+
+ /*
+ * The next statements may be confusing, but we do this so that
+ * destination LSAP of received frame is source LSAP in our view
+ */
+ slsap_sel = fp[0] & LSAP_MASK;
+ dlsap_sel = fp[1];
+
+ /*
+ * Check if this is an incoming connection, since we must deal with
+ * it in a different way than other established connections.
+ */
+ if ((fp[0] & CONTROL_BIT) && (fp[2] == CONNECT_CMD)) {
+ pr_debug("%s(), incoming connection, source LSAP=%d, dest LSAP=%d\n",
+ __func__, slsap_sel, dlsap_sel);
+
+ /* Try to find LSAP among the unconnected LSAPs */
+ lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, CONNECT_CMD,
+ irlmp->unconnected_lsaps);
+
+ /* Maybe LSAP was already connected, so try one more time */
+ if (!lsap) {
+ pr_debug("%s(), incoming connection for LSAP already connected\n",
+ __func__);
+ lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0,
+ self->lsaps);
+ }
+ } else
+ lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0,
+ self->lsaps);
+
+ if (lsap == NULL) {
+ pr_debug("IrLMP, Sorry, no LSAP for received frame!\n");
+ pr_debug("%s(), slsap_sel = %02x, dlsap_sel = %02x\n",
+ __func__, slsap_sel, dlsap_sel);
+ if (fp[0] & CONTROL_BIT) {
+ pr_debug("%s(), received control frame %02x\n",
+ __func__, fp[2]);
+ } else {
+ pr_debug("%s(), received data frame\n", __func__);
+ }
+ return;
+ }
+
+ /*
+ * Check if we received a control frame?
+ */
+ if (fp[0] & CONTROL_BIT) {
+ switch (fp[2]) {
+ case CONNECT_CMD:
+ lsap->lap = self;
+ irlmp_do_lsap_event(lsap, LM_CONNECT_INDICATION, skb);
+ break;
+ case CONNECT_CNF:
+ irlmp_do_lsap_event(lsap, LM_CONNECT_CONFIRM, skb);
+ break;
+ case DISCONNECT:
+ pr_debug("%s(), Disconnect indication!\n",
+ __func__);
+ irlmp_do_lsap_event(lsap, LM_DISCONNECT_INDICATION,
+ skb);
+ break;
+ case ACCESSMODE_CMD:
+ pr_debug("Access mode cmd not implemented!\n");
+ break;
+ case ACCESSMODE_CNF:
+ pr_debug("Access mode cnf not implemented!\n");
+ break;
+ default:
+ pr_debug("%s(), Unknown control frame %02x\n",
+ __func__, fp[2]);
+ break;
+ }
+ } else if (unreliable) {
+ /* Optimize and bypass the state machine if possible */
+ if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY)
+ irlmp_udata_indication(lsap, skb);
+ else
+ irlmp_do_lsap_event(lsap, LM_UDATA_INDICATION, skb);
+ } else {
+ /* Optimize and bypass the state machine if possible */
+ if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY)
+ irlmp_data_indication(lsap, skb);
+ else
+ irlmp_do_lsap_event(lsap, LM_DATA_INDICATION, skb);
+ }
+}
+
+/*
+ * Function irlmp_link_unitdata_indication (self, skb)
+ *
+ *
+ *
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlmp_link_unitdata_indication(struct lap_cb *self, struct sk_buff *skb)
+{
+ struct lsap_cb *lsap;
+ __u8 slsap_sel; /* Source (this) LSAP address */
+ __u8 dlsap_sel; /* Destination LSAP address */
+ __u8 pid; /* Protocol identifier */
+ __u8 *fp;
+ unsigned long flags;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+ IRDA_ASSERT(skb->len > 2, return;);
+
+ fp = skb->data;
+
+ /*
+ * The next statements may be confusing, but we do this so that
+ * destination LSAP of received frame is source LSAP in our view
+ */
+ slsap_sel = fp[0] & LSAP_MASK;
+ dlsap_sel = fp[1];
+ pid = fp[2];
+
+ if (pid & 0x80) {
+ pr_debug("%s(), extension in PID not supp!\n",
+ __func__);
+ return;
+ }
+
+ /* Check if frame is addressed to the connectionless LSAP */
+ if ((slsap_sel != LSAP_CONNLESS) || (dlsap_sel != LSAP_CONNLESS)) {
+ pr_debug("%s(), dropping frame!\n", __func__);
+ return;
+ }
+
+ /* Search the connectionless LSAP */
+ spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+ lsap = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps);
+ while (lsap != NULL) {
+ /*
+ * Check if source LSAP and dest LSAP selectors and PID match.
+ */
+ if ((lsap->slsap_sel == slsap_sel) &&
+ (lsap->dlsap_sel == dlsap_sel) &&
+ (lsap->pid == pid))
+ {
+ break;
+ }
+ lsap = (struct lsap_cb *) hashbin_get_next(irlmp->unconnected_lsaps);
+ }
+ spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+ if (lsap)
+ irlmp_connless_data_indication(lsap, skb);
+ else {
+ pr_debug("%s(), found no matching LSAP!\n", __func__);
+ }
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlmp_link_disconnect_indication (reason, userdata)
+ *
+ * IrLAP has disconnected
+ *
+ */
+void irlmp_link_disconnect_indication(struct lap_cb *lap,
+ struct irlap_cb *irlap,
+ LAP_REASON reason,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(lap != NULL, return;);
+ IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+ lap->reason = reason;
+ lap->daddr = DEV_ADDR_ANY;
+
+ /* FIXME: must do something with the skb if any */
+
+ /*
+ * Inform station state machine
+ */
+ irlmp_do_lap_event(lap, LM_LAP_DISCONNECT_INDICATION, NULL);
+}
+
+/*
+ * Function irlmp_link_connect_indication (qos)
+ *
+ * Incoming LAP connection!
+ *
+ */
+void irlmp_link_connect_indication(struct lap_cb *self, __u32 saddr,
+ __u32 daddr, struct qos_info *qos,
+ struct sk_buff *skb)
+{
+ /* Copy QoS settings for this session */
+ self->qos = qos;
+
+ /* Update destination device address */
+ self->daddr = daddr;
+ IRDA_ASSERT(self->saddr == saddr, return;);
+
+ irlmp_do_lap_event(self, LM_LAP_CONNECT_INDICATION, skb);
+}
+
+/*
+ * Function irlmp_link_connect_confirm (qos)
+ *
+ * LAP connection confirmed!
+ *
+ */
+void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos,
+ struct sk_buff *skb)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+ IRDA_ASSERT(qos != NULL, return;);
+
+ /* Don't need use the skb for now */
+
+ /* Copy QoS settings for this session */
+ self->qos = qos;
+
+ irlmp_do_lap_event(self, LM_LAP_CONNECT_CONFIRM, NULL);
+}
+
+/*
+ * Function irlmp_link_discovery_indication (self, log)
+ *
+ * Device is discovering us
+ *
+ * It's not an answer to our own discoveries, just another device trying
+ * to perform discovery, but we don't want to miss the opportunity
+ * to exploit this information, because :
+ * o We may not actively perform discovery (just passive discovery)
+ * o This type of discovery is much more reliable. In some cases, it
+ * seem that less than 50% of our discoveries get an answer, while
+ * we always get ~100% of these.
+ * o Make faster discovery, statistically divide time of discovery
+ * events by 2 (important for the latency aspect and user feel)
+ * o Even is we do active discovery, the other node might not
+ * answer our discoveries (ex: Palm). The Palm will just perform
+ * one active discovery and connect directly to us.
+ *
+ * However, when both devices discover each other, they might attempt to
+ * connect to each other following the discovery event, and it would create
+ * collisions on the medium (SNRM battle).
+ * The "fix" for that is to disable all connection requests in IrLAP
+ * for 100ms after a discovery indication by setting the media_busy flag.
+ * Previously, we used to postpone the event which was quite ugly. Now
+ * that IrLAP takes care of this problem, just pass the event up...
+ *
+ * Jean II
+ */
+void irlmp_link_discovery_indication(struct lap_cb *self,
+ discovery_t *discovery)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+ /* Add to main log, cleanup */
+ irlmp_add_discovery(irlmp->cachelog, discovery);
+
+ /* Just handle it the same way as a discovery confirm,
+ * bypass the LM_LAP state machine (see below) */
+ irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_PASSIVE);
+}
+
+/*
+ * Function irlmp_link_discovery_confirm (self, log)
+ *
+ * Called by IrLAP with a list of discoveries after the discovery
+ * request has been carried out. A NULL log is received if IrLAP
+ * was unable to carry out the discovery request
+ *
+ */
+void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+ /* Add to main log, cleanup */
+ irlmp_add_discovery_log(irlmp->cachelog, log);
+
+ /* Propagate event to various LSAPs registered for it.
+ * We bypass the LM_LAP state machine because
+ * 1) We do it regardless of the LM_LAP state
+ * 2) It doesn't affect the LM_LAP state
+ * 3) Faster, slimer, simpler, ...
+ * Jean II */
+ irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_ACTIVE);
+}
+
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+static inline void irlmp_update_cache(struct lap_cb *lap,
+ struct lsap_cb *lsap)
+{
+ /* Prevent concurrent read to get garbage */
+ lap->cache.valid = FALSE;
+ /* Update cache entry */
+ lap->cache.dlsap_sel = lsap->dlsap_sel;
+ lap->cache.slsap_sel = lsap->slsap_sel;
+ lap->cache.lsap = lsap;
+ lap->cache.valid = TRUE;
+}
+#endif
+
+/*
+ * Function irlmp_find_handle (self, dlsap_sel, slsap_sel, status, queue)
+ *
+ * Find handle associated with destination and source LSAP
+ *
+ * Any IrDA connection (LSAP/TSAP) is uniquely identified by
+ * 3 parameters, the local lsap, the remote lsap and the remote address.
+ * We may initiate multiple connections to the same remote service
+ * (they will have different local lsap), a remote device may initiate
+ * multiple connections to the same local service (they will have
+ * different remote lsap), or multiple devices may connect to the same
+ * service and may use the same remote lsap (and they will have
+ * different remote address).
+ * So, where is the remote address ? Each LAP connection is made with
+ * a single remote device, so imply a specific remote address.
+ * Jean II
+ */
+static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
+ __u8 slsap_sel, int status,
+ hashbin_t *queue)
+{
+ struct lsap_cb *lsap;
+ unsigned long flags;
+
+ /*
+ * Optimize for the common case. We assume that the last frame
+ * received is in the same connection as the last one, so check in
+ * cache first to avoid the linear search
+ */
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ if ((self->cache.valid) &&
+ (self->cache.slsap_sel == slsap_sel) &&
+ (self->cache.dlsap_sel == dlsap_sel))
+ {
+ return self->cache.lsap;
+ }
+#endif
+
+ spin_lock_irqsave(&queue->hb_spinlock, flags);
+
+ lsap = (struct lsap_cb *) hashbin_get_first(queue);
+ while (lsap != NULL) {
+ /*
+ * If this is an incoming connection, then the destination
+ * LSAP selector may have been specified as LM_ANY so that
+ * any client can connect. In that case we only need to check
+ * if the source LSAP (in our view!) match!
+ */
+ if ((status == CONNECT_CMD) &&
+ (lsap->slsap_sel == slsap_sel) &&
+ (lsap->dlsap_sel == LSAP_ANY)) {
+ /* This is where the dest lsap sel is set on incoming
+ * lsaps */
+ lsap->dlsap_sel = dlsap_sel;
+ break;
+ }
+ /*
+ * Check if source LSAP and dest LSAP selectors match.
+ */
+ if ((lsap->slsap_sel == slsap_sel) &&
+ (lsap->dlsap_sel == dlsap_sel))
+ break;
+
+ lsap = (struct lsap_cb *) hashbin_get_next(queue);
+ }
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+ if(lsap)
+ irlmp_update_cache(self, lsap);
+#endif
+ spin_unlock_irqrestore(&queue->hb_spinlock, flags);
+
+ /* Return what we've found or NULL */
+ return lsap;
+}
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
new file mode 100644
index 000000000..c5e35b85c
--- /dev/null
+++ b/net/irda/irmod.c
@@ -0,0 +1,199 @@
+/*********************************************************************
+ *
+ * Filename: irmod.c
+ * Version: 0.9
+ * Description: IrDA stack main entry points
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Dec 15 13:55:39 1997
+ * Modified at: Wed Jan 5 15:12:41 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1997, 1999-2000 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2004 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+/*
+ * This file contains the main entry points of the IrDA stack.
+ * They are in this file and not af_irda.c because some developpers
+ * are using the IrDA stack without the socket API (compiling out
+ * af_irda.c).
+ * Jean II
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h> /* notify_t */
+#include <net/irda/irlap.h> /* irlap_init */
+#include <net/irda/irlmp.h> /* irlmp_init */
+#include <net/irda/iriap.h> /* iriap_init */
+#include <net/irda/irttp.h> /* irttp_init */
+#include <net/irda/irda_device.h> /* irda_device_init */
+
+/* Packet type handler.
+ * Tell the kernel how IrDA packets should be handled.
+ */
+static struct packet_type irda_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IRDA),
+ .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */
+};
+
+/*
+ * Function irda_notify_init (notify)
+ *
+ * Used for initializing the notify structure
+ *
+ */
+void irda_notify_init(notify_t *notify)
+{
+ notify->data_indication = NULL;
+ notify->udata_indication = NULL;
+ notify->connect_confirm = NULL;
+ notify->connect_indication = NULL;
+ notify->disconnect_indication = NULL;
+ notify->flow_indication = NULL;
+ notify->status_indication = NULL;
+ notify->instance = NULL;
+ strlcpy(notify->name, "Unknown", sizeof(notify->name));
+}
+EXPORT_SYMBOL(irda_notify_init);
+
+/*
+ * Function irda_init (void)
+ *
+ * Protocol stack initialisation entry point.
+ * Initialise the various components of the IrDA stack
+ */
+static int __init irda_init(void)
+{
+ int ret = 0;
+
+ /* Lower layer of the stack */
+ irlmp_init();
+ irlap_init();
+
+ /* Driver/dongle support */
+ irda_device_init();
+
+ /* Higher layers of the stack */
+ iriap_init();
+ irttp_init();
+ ret = irsock_init();
+ if (ret < 0)
+ goto out_err_1;
+
+ /* Add IrDA packet type (Start receiving packets) */
+ dev_add_pack(&irda_packet_type);
+
+ /* External APIs */
+#ifdef CONFIG_PROC_FS
+ irda_proc_register();
+#endif
+#ifdef CONFIG_SYSCTL
+ ret = irda_sysctl_register();
+ if (ret < 0)
+ goto out_err_2;
+#endif
+
+ ret = irda_nl_register();
+ if (ret < 0)
+ goto out_err_3;
+
+ return 0;
+
+ out_err_3:
+#ifdef CONFIG_SYSCTL
+ irda_sysctl_unregister();
+ out_err_2:
+#endif
+#ifdef CONFIG_PROC_FS
+ irda_proc_unregister();
+#endif
+
+ /* Remove IrDA packet type (stop receiving packets) */
+ dev_remove_pack(&irda_packet_type);
+
+ /* Remove higher layers */
+ irsock_cleanup();
+ out_err_1:
+ irttp_cleanup();
+ iriap_cleanup();
+
+ /* Remove lower layers */
+ irda_device_cleanup();
+ irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */
+
+ /* Remove middle layer */
+ irlmp_cleanup();
+
+
+ return ret;
+}
+
+/*
+ * Function irda_cleanup (void)
+ *
+ * Protocol stack cleanup/removal entry point.
+ * Cleanup the various components of the IrDA stack
+ */
+static void __exit irda_cleanup(void)
+{
+ /* Remove External APIs */
+ irda_nl_unregister();
+
+#ifdef CONFIG_SYSCTL
+ irda_sysctl_unregister();
+#endif
+#ifdef CONFIG_PROC_FS
+ irda_proc_unregister();
+#endif
+
+ /* Remove IrDA packet type (stop receiving packets) */
+ dev_remove_pack(&irda_packet_type);
+
+ /* Remove higher layers */
+ irsock_cleanup();
+ irttp_cleanup();
+ iriap_cleanup();
+
+ /* Remove lower layers */
+ irda_device_cleanup();
+ irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */
+
+ /* Remove middle layer */
+ irlmp_cleanup();
+}
+
+/*
+ * The IrDA stack must be initialised *before* drivers get initialised,
+ * and *before* higher protocols (IrLAN/IrCOMM/IrNET) get initialised,
+ * otherwise bad things will happen (hashbins will be NULL for example).
+ * Those modules are at module_init()/device_initcall() level.
+ *
+ * On the other hand, it needs to be initialised *after* the basic
+ * networking, the /proc/net filesystem and sysctl module. Those are
+ * currently initialised in .../init/main.c (before initcalls).
+ * Also, IrDA drivers needs to be initialised *after* the random number
+ * generator (main stack and higher layer init don't need it anymore).
+ *
+ * Jean II
+ */
+subsys_initcall(irda_init);
+module_exit(irda_cleanup);
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no> & Jean Tourrilhes <jt@hpl.hp.com>");
+MODULE_DESCRIPTION("The Linux IrDA Protocol Stack");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IRDA);
diff --git a/net/irda/irnet/Kconfig b/net/irda/irnet/Kconfig
new file mode 100644
index 000000000..28c557f0f
--- /dev/null
+++ b/net/irda/irnet/Kconfig
@@ -0,0 +1,13 @@
+config IRNET
+ tristate "IrNET protocol"
+ depends on IRDA && PPP
+ help
+ Say Y here if you want to build support for the IrNET protocol.
+ To compile it as a module, choose M here: the module will be
+ called irnet. IrNET is a PPP driver, so you will also need a
+ working PPP subsystem (driver, daemon and config)...
+
+ IrNET is an alternate way to transfer TCP/IP traffic over IrDA. It
+ uses synchronous PPP over a set of point to point IrDA sockets. You
+ can use it between Linux machine or with W2k.
+
diff --git a/net/irda/irnet/Makefile b/net/irda/irnet/Makefile
new file mode 100644
index 000000000..61c365c8a
--- /dev/null
+++ b/net/irda/irnet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux IrDA IrNET protocol layer.
+#
+
+obj-$(CONFIG_IRNET) += irnet.o
+
+irnet-y := irnet_ppp.o irnet_irda.o
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
new file mode 100644
index 000000000..8d65bb947
--- /dev/null
+++ b/net/irda/irnet/irnet.h
@@ -0,0 +1,523 @@
+/*
+ * IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ * Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains definitions and declarations global to the IrNET module,
+ * all grouped in one place...
+ * This file is a *private* header, so other modules don't want to know
+ * what's in there...
+ *
+ * Note : as most part of the Linux kernel, this module is available
+ * under the GNU General Public License (GPL).
+ */
+
+#ifndef IRNET_H
+#define IRNET_H
+
+/************************** DOCUMENTATION ***************************/
+/*
+ * What is IrNET
+ * -------------
+ * IrNET is a protocol allowing to carry TCP/IP traffic between two
+ * IrDA peers in an efficient fashion. It is a thin layer, passing PPP
+ * packets to IrTTP and vice versa. It uses PPP in synchronous mode,
+ * because IrTTP offer a reliable sequenced packet service (as opposed
+ * to a byte stream). In fact, you could see IrNET as carrying TCP/IP
+ * in a IrDA socket, using PPP to provide the glue.
+ *
+ * The main difference with traditional PPP over IrCOMM is that we
+ * avoid the framing and serial emulation which are a performance
+ * bottleneck. It also allows multipoint communications in a sensible
+ * fashion.
+ *
+ * The main difference with IrLAN is that we use PPP for the link
+ * management, which is more standard, interoperable and flexible than
+ * the IrLAN protocol. For example, PPP adds authentication,
+ * encryption, compression, header compression and automated routing
+ * setup. And, as IrNET let PPP do the hard work, the implementation
+ * is much simpler than IrLAN.
+ *
+ * The Linux implementation
+ * ------------------------
+ * IrNET is written on top of the Linux-IrDA stack, and interface with
+ * the generic Linux PPP driver. Because IrNET depend on recent
+ * changes of the PPP driver interface, IrNET will work only with very
+ * recent kernel (2.3.99-pre6 and up).
+ *
+ * The present implementation offer the following features :
+ * o simple user interface using pppd
+ * o efficient implementation (interface directly to PPP and IrTTP)
+ * o addressing (you can specify the name of the IrNET recipient)
+ * o multipoint operation (limited by IrLAP specification)
+ * o information in /proc/net/irda/irnet
+ * o IrNET events on /dev/irnet (for user space daemon)
+ * o IrNET daemon (irnetd) to automatically handle incoming requests
+ * o Windows 2000 compatibility (tested, but need more work)
+ * Currently missing :
+ * o Lot's of testing (that's your job)
+ * o Connection retries (may be too hard to do)
+ * o Check pppd persist mode
+ * o User space daemon (to automatically handle incoming requests)
+ *
+ * The setup is not currently the most easy, but this should get much
+ * better when everything will get integrated...
+ *
+ * Acknowledgements
+ * ----------------
+ * This module is based on :
+ * o The PPP driver (ppp_synctty/ppp_generic) by Paul Mackerras
+ * o The IrLAN protocol (irlan_common/XXX) by Dag Brattli
+ * o The IrSock interface (af_irda) by Dag Brattli
+ * o Some other bits from the kernel and my drivers...
+ * Infinite thanks to those brave souls for providing the infrastructure
+ * upon which IrNET is built.
+ *
+ * Thanks to all my colleagues in HP for helping me. In particular,
+ * thanks to Salil Pradhan and Bill Serra for W2k testing...
+ * Thanks to Luiz Magalhaes for irnetd and much testing...
+ *
+ * Thanks to Alan Cox for answering lot's of my stupid questions, and
+ * to Paul Mackerras answering my questions on how to best integrate
+ * IrNET and pppd.
+ *
+ * Jean II
+ *
+ * Note on some implementations choices...
+ * ------------------------------------
+ * 1) Direct interface vs tty/socket
+ * I could have used a tty interface to hook to ppp and use the full
+ * socket API to connect to IrDA. The code would have been easier to
+ * maintain, and maybe the code would have been smaller...
+ * Instead, we hook directly to ppp_generic and to IrTTP, which make
+ * things more complicated...
+ *
+ * The first reason is flexibility : this allow us to create IrNET
+ * instances on demand (no /dev/ircommX crap) and to allow linkname
+ * specification on pppd command line...
+ *
+ * Second reason is speed optimisation. If you look closely at the
+ * transmit and receive paths, you will notice that they are "super lean"
+ * (that's why they look ugly), with no function calls and as little data
+ * copy and modification as I could...
+ *
+ * 2) irnetd in user space
+ * irnetd is implemented in user space, which is necessary to call pppd.
+ * This also give maximum benefits in term of flexibility and customability,
+ * and allow to offer the event channel, useful for other stuff like debug.
+ *
+ * On the other hand, this require a loose coordination between the
+ * present module and irnetd. One critical area is how incoming request
+ * are handled.
+ * When irnet receive an incoming request, it send an event to irnetd and
+ * drop the incoming IrNET socket.
+ * irnetd start a pppd instance, which create a new IrNET socket. This new
+ * socket is then connected in the originating node to the pppd instance.
+ * At this point, in the originating node, the first socket is closed.
+ *
+ * I admit, this is a bit messy and waste some resources. The alternative
+ * is caching incoming socket, and that's also quite messy and waste
+ * resources.
+ * We also make connection time slower. For example, on a 115 kb/s link it
+ * adds 60ms to the connection time (770 ms). However, this is slower than
+ * the time it takes to fire up pppd on my P133...
+ *
+ *
+ * History :
+ * -------
+ *
+ * v1 - 15.5.00 - Jean II
+ * o Basic IrNET (hook to ppp_generic & IrTTP - incl. multipoint)
+ * o control channel on /dev/irnet (set name/address)
+ * o event channel on /dev/irnet (for user space daemon)
+ *
+ * v2 - 5.6.00 - Jean II
+ * o Enable DROP_NOT_READY to avoid PPP timeouts & other weirdness...
+ * o Add DISCONNECT_TO event and rename DISCONNECT_FROM.
+ * o Set official device number alloaction on /dev/irnet
+ *
+ * v3 - 30.8.00 - Jean II
+ * o Update to latest Linux-IrDA changes :
+ * - queue_t => irda_queue_t
+ * o Update to ppp-2.4.0 :
+ * - move irda_irnet_connect from PPPIOCATTACH to TIOCSETD
+ * o Add EXPIRE event (depend on new IrDA-Linux patch)
+ * o Switch from `hashbin_remove' to `hashbin_remove_this' to fix
+ * a multilink bug... (depend on new IrDA-Linux patch)
+ * o fix a self->daddr to self->raddr in irda_irnet_connect to fix
+ * another multilink bug (darn !)
+ * o Remove LINKNAME_IOCTL cruft
+ *
+ * v3b - 31.8.00 - Jean II
+ * o Dump discovery log at event channel startup
+ *
+ * v4 - 28.9.00 - Jean II
+ * o Fix interaction between poll/select and dump discovery log
+ * o Add IRNET_BLOCKED_LINK event (depend on new IrDA-Linux patch)
+ * o Add IRNET_NOANSWER_FROM event (mostly to help support)
+ * o Release flow control in disconnect_indication
+ * o Block packets while connecting (speed up connections)
+ *
+ * v5 - 11.01.01 - Jean II
+ * o Init self->max_header_size, just in case...
+ * o Set up ap->chan.hdrlen, to get zero copy on tx side working.
+ * o avoid tx->ttp->flow->ppp->tx->... loop, by checking flow state
+ * Thanks to Christian Gennerat for finding this bug !
+ * ---
+ * o Declare the proper MTU/MRU that we can support
+ * (but PPP doesn't read the MTU value :-()
+ * o Declare hashbin HB_NOLOCK instead of HB_LOCAL to avoid
+ * disabling and enabling irq twice
+ *
+ * v6 - 31.05.01 - Jean II
+ * o Print source address in Found, Discovery, Expiry & Request events
+ * o Print requested source address in /proc/net/irnet
+ * o Change control channel input. Allow multiple commands in one line.
+ * o Add saddr command to change ap->rsaddr (and use that in IrDA)
+ * ---
+ * o Make the IrDA connection procedure totally asynchronous.
+ * Heavy rewrite of the IAS query code and the whole connection
+ * procedure. Now, irnet_connect() no longer need to be called from
+ * a process context...
+ * o Enable IrDA connect retries in ppp_irnet_send(). The good thing
+ * is that IrDA connect retries are directly driven by PPP LCP
+ * retries (we retry for each LCP packet), so that everything
+ * is transparently controlled from pppd lcp-max-configure.
+ * o Add ttp_connect flag to prevent rentry on the connect procedure
+ * o Test and fixups to eliminate side effects of retries
+ *
+ * v7 - 22.08.01 - Jean II
+ * o Cleanup : Change "saddr = 0x0" to "saddr = DEV_ADDR_ANY"
+ * o Fix bug in BLOCK_WHEN_CONNECT introduced in v6 : due to the
+ * asynchronous IAS query, self->tsap is NULL when PPP send the
+ * first packet. This was preventing "connect-delay 0" to work.
+ * Change the test in ppp_irnet_send() to self->ttp_connect.
+ *
+ * v8 - 1.11.01 - Jean II
+ * o Tighten the use of self->ttp_connect and self->ttp_open to
+ * prevent various race conditions.
+ * o Avoid leaking discovery log and skb
+ * o Replace "self" with "server" in irnet_connect_indication() to
+ * better detect cut'n'paste error ;-)
+ *
+ * v9 - 29.11.01 - Jean II
+ * o Fix event generation in disconnect indication that I broke in v8
+ * It was always generation "No-Answer" because I was testing ttp_open
+ * just after clearing it. *blush*.
+ * o Use newly created irttp_listen() to fix potential crash when LAP
+ * destroyed before irnet module removed.
+ *
+ * v10 - 4.3.2 - Jean II
+ * o When receiving a disconnect indication, don't reenable the
+ * PPP Tx queue, this will trigger a reconnect. Instead, close
+ * the channel, which will kill pppd...
+ *
+ * v11 - 20.3.02 - Jean II
+ * o Oops ! v10 fix disabled IrNET retries and passive behaviour.
+ * Better fix in irnet_disconnect_indication() :
+ * - if connected, kill pppd via hangup.
+ * - if not connected, reenable ppp Tx, which trigger IrNET retry.
+ *
+ * v12 - 10.4.02 - Jean II
+ * o Fix race condition in irnet_connect_indication().
+ * If the socket was already trying to connect, drop old connection
+ * and use new one only if acting as primary. See comments.
+ *
+ * v13 - 30.5.02 - Jean II
+ * o Update module init code
+ *
+ * v14 - 20.2.03 - Jean II
+ * o Add discovery hint bits in the control channel.
+ * o Remove obsolete MOD_INC/DEC_USE_COUNT in favor of .owner
+ *
+ * v15 - 7.4.03 - Jean II
+ * o Replace spin_lock_irqsave() with spin_lock_bh() so that we can
+ * use ppp_unit_number(). It's probably also better overall...
+ * o Disable call to ppp_unregister_channel(), because we can't do it.
+ */
+
+/***************************** INCLUDES *****************************/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tty.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/capability.h>
+#include <linux/ctype.h> /* isspace() */
+#include <linux/string.h> /* skip_spaces() */
+#include <asm/uaccess.h>
+#include <linux/init.h>
+
+#include <linux/ppp_defs.h>
+#include <linux/ppp-ioctl.h>
+#include <linux/ppp_channel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+#include <net/irda/discovery.h>
+
+/***************************** OPTIONS *****************************/
+/*
+ * Define or undefine to compile or not some optional part of the
+ * IrNET driver...
+ * Note : the present defaults make sense, play with that at your
+ * own risk...
+ */
+/* IrDA side of the business... */
+#define DISCOVERY_NOMASK /* To enable W2k compatibility... */
+#define ADVERTISE_HINT /* Advertise IrLAN hint bit */
+#define ALLOW_SIMULT_CONNECT /* This seem to work, cross fingers... */
+#define DISCOVERY_EVENTS /* Query the discovery log to post events */
+#define INITIAL_DISCOVERY /* Dump current discovery log as events */
+#undef STREAM_COMPAT /* Not needed - potentially messy */
+#undef CONNECT_INDIC_KICK /* Might mess IrDA, not needed */
+#undef FAIL_SEND_DISCONNECT /* Might mess IrDA, not needed */
+#undef PASS_CONNECT_PACKETS /* Not needed ? Safe */
+#undef MISSING_PPP_API /* Stuff I wish I could do */
+
+/* PPP side of the business */
+#define BLOCK_WHEN_CONNECT /* Block packets when connecting */
+#define CONNECT_IN_SEND /* Retry IrDA connection procedure */
+#undef FLUSH_TO_PPP /* Not sure about this one, let's play safe */
+#undef SECURE_DEVIRNET /* Bah... */
+
+/****************************** DEBUG ******************************/
+
+/*
+ * This set of flags enable and disable all the various warning,
+ * error and debug message of this driver.
+ * Each section can be enabled and disabled independently
+ */
+/* In the PPP part */
+#define DEBUG_CTRL_TRACE 0 /* Control channel */
+#define DEBUG_CTRL_INFO 0 /* various info */
+#define DEBUG_CTRL_ERROR 1 /* problems */
+#define DEBUG_FS_TRACE 0 /* filesystem callbacks */
+#define DEBUG_FS_INFO 0 /* various info */
+#define DEBUG_FS_ERROR 1 /* problems */
+#define DEBUG_PPP_TRACE 0 /* PPP related functions */
+#define DEBUG_PPP_INFO 0 /* various info */
+#define DEBUG_PPP_ERROR 1 /* problems */
+#define DEBUG_MODULE_TRACE 0 /* module insertion/removal */
+#define DEBUG_MODULE_ERROR 1 /* problems */
+
+/* In the IrDA part */
+#define DEBUG_IRDA_SR_TRACE 0 /* IRDA subroutines */
+#define DEBUG_IRDA_SR_INFO 0 /* various info */
+#define DEBUG_IRDA_SR_ERROR 1 /* problems */
+#define DEBUG_IRDA_SOCK_TRACE 0 /* IRDA main socket functions */
+#define DEBUG_IRDA_SOCK_INFO 0 /* various info */
+#define DEBUG_IRDA_SOCK_ERROR 1 /* problems */
+#define DEBUG_IRDA_SERV_TRACE 0 /* The IrNET server */
+#define DEBUG_IRDA_SERV_INFO 0 /* various info */
+#define DEBUG_IRDA_SERV_ERROR 1 /* problems */
+#define DEBUG_IRDA_TCB_TRACE 0 /* IRDA IrTTP callbacks */
+#define DEBUG_IRDA_CB_INFO 0 /* various info */
+#define DEBUG_IRDA_CB_ERROR 1 /* problems */
+#define DEBUG_IRDA_OCB_TRACE 0 /* IRDA other callbacks */
+#define DEBUG_IRDA_OCB_INFO 0 /* various info */
+#define DEBUG_IRDA_OCB_ERROR 1 /* problems */
+
+#define DEBUG_ASSERT 0 /* Verify all assertions */
+
+/*
+ * These are the macros we are using to actually print the debug
+ * statements. Don't look at it, it's ugly...
+ *
+ * One of the trick is that, as the DEBUG_XXX are constant, the
+ * compiler will optimise away the if() in all cases.
+ */
+/* All error messages (will show up in the normal logs) */
+#define DERROR(dbg, format, args...) \
+ {if(DEBUG_##dbg) \
+ printk(KERN_INFO "irnet: %s(): " format, __func__ , ##args);}
+
+/* Normal debug message (will show up in /var/log/debug) */
+#define DEBUG(dbg, format, args...) \
+ {if(DEBUG_##dbg) \
+ printk(KERN_DEBUG "irnet: %s(): " format, __func__ , ##args);}
+
+/* Entering a function (trace) */
+#define DENTER(dbg, format, args...) \
+ {if(DEBUG_##dbg) \
+ printk(KERN_DEBUG "irnet: -> %s" format, __func__ , ##args);}
+
+/* Entering and exiting a function in one go (trace) */
+#define DPASS(dbg, format, args...) \
+ {if(DEBUG_##dbg) \
+ printk(KERN_DEBUG "irnet: <>%s" format, __func__ , ##args);}
+
+/* Exiting a function (trace) */
+#define DEXIT(dbg, format, args...) \
+ {if(DEBUG_##dbg) \
+ printk(KERN_DEBUG "irnet: <-%s()" format, __func__ , ##args);}
+
+/* Exit a function with debug */
+#define DRETURN(ret, dbg, args...) \
+ {DEXIT(dbg, ": " args);\
+ return ret; }
+
+/* Exit a function on failed condition */
+#define DABORT(cond, ret, dbg, args...) \
+ {if(cond) {\
+ DERROR(dbg, args);\
+ return ret; }}
+
+/* Invalid assertion, print out an error and exit... */
+#define DASSERT(cond, ret, dbg, args...) \
+ {if((DEBUG_ASSERT) && !(cond)) {\
+ DERROR(dbg, "Invalid assertion: " args);\
+ return ret; }}
+
+/************************ CONSTANTS & MACROS ************************/
+
+/* Paranoia */
+#define IRNET_MAGIC 0xB00754
+
+/* Number of control events in the control channel buffer... */
+#define IRNET_MAX_EVENTS 8 /* Should be more than enough... */
+
+/****************************** TYPES ******************************/
+
+/*
+ * This is the main structure where we store all the data pertaining to
+ * one instance of irnet.
+ * Note : in irnet functions, a pointer this structure is usually called
+ * "ap" or "self". If the code is borrowed from the IrDA stack, it tend
+ * to be called "self", and if it is borrowed from the PPP driver it is
+ * "ap". Apart from that, it's exactly the same structure ;-)
+ */
+typedef struct irnet_socket
+{
+ /* ------------------- Instance management ------------------- */
+ /* We manage a linked list of IrNET socket instances */
+ irda_queue_t q; /* Must be first - for hasbin */
+ int magic; /* Paranoia */
+
+ /* --------------------- FileSystem part --------------------- */
+ /* "pppd" interact directly with us on a /dev/ file */
+ struct file * file; /* File descriptor of this instance */
+ /* TTY stuff - to keep "pppd" happy */
+ struct ktermios termios; /* Various tty flags */
+ /* Stuff for the control channel */
+ int event_index; /* Last read in the event log */
+
+ /* ------------------------- PPP part ------------------------- */
+ /* We interface directly to the ppp_generic driver in the kernel */
+ int ppp_open; /* registered with ppp_generic */
+ struct ppp_channel chan; /* Interface to generic ppp layer */
+
+ int mru; /* Max size of PPP payload */
+ u32 xaccm[8]; /* Asynchronous character map (just */
+ u32 raccm; /* to please pppd - dummy) */
+ unsigned int flags; /* PPP flags (compression, ...) */
+ unsigned int rbits; /* Unused receive flags ??? */
+ struct work_struct disconnect_work; /* Process context disconnection */
+ /* ------------------------ IrTTP part ------------------------ */
+ /* We create a pseudo "socket" over the IrDA tranport */
+ unsigned long ttp_open; /* Set when IrTTP is ready */
+ unsigned long ttp_connect; /* Set when IrTTP is connecting */
+ struct tsap_cb * tsap; /* IrTTP instance (the connection) */
+
+ char rname[NICKNAME_MAX_LEN + 1];
+ /* IrDA nickname of destination */
+ __u32 rdaddr; /* Requested peer IrDA address */
+ __u32 rsaddr; /* Requested local IrDA address */
+ __u32 daddr; /* actual peer IrDA address */
+ __u32 saddr; /* my local IrDA address */
+ __u8 dtsap_sel; /* Remote TSAP selector */
+ __u8 stsap_sel; /* Local TSAP selector */
+
+ __u32 max_sdu_size_rx;/* Socket parameters used for IrTTP */
+ __u32 max_sdu_size_tx;
+ __u32 max_data_size;
+ __u8 max_header_size;
+ LOCAL_FLOW tx_flow; /* State of the Tx path in IrTTP */
+
+ /* ------------------- IrLMP and IrIAS part ------------------- */
+ /* Used for IrDA Discovery and socket name resolution */
+ void * ckey; /* IrLMP client handle */
+ __u16 mask; /* Hint bits mask (filter discov.)*/
+ int nslots; /* Number of slots for discovery */
+
+ struct iriap_cb * iriap; /* Used to query remote IAS */
+ int errno; /* status of the IAS query */
+
+ /* -------------------- Discovery log part -------------------- */
+ /* Used by initial discovery on the control channel
+ * and by irnet_discover_daddr_and_lsap_sel() */
+ struct irda_device_info *discoveries; /* Copy of the discovery log */
+ int disco_index; /* Last read in the discovery log */
+ int disco_number; /* Size of the discovery log */
+
+ struct mutex lock;
+
+} irnet_socket;
+
+/*
+ * This is the various event that we will generate on the control channel
+ */
+typedef enum irnet_event
+{
+ IRNET_DISCOVER, /* New IrNET node discovered */
+ IRNET_EXPIRE, /* IrNET node expired */
+ IRNET_CONNECT_TO, /* IrNET socket has connected to other node */
+ IRNET_CONNECT_FROM, /* Other node has connected to IrNET socket */
+ IRNET_REQUEST_FROM, /* Non satisfied connection request */
+ IRNET_NOANSWER_FROM, /* Failed connection request */
+ IRNET_BLOCKED_LINK, /* Link (IrLAP) is blocked for > 3s */
+ IRNET_DISCONNECT_FROM, /* IrNET socket has disconnected */
+ IRNET_DISCONNECT_TO /* Closing IrNET socket */
+} irnet_event;
+
+/*
+ * This is the storage for an event and its arguments
+ */
+typedef struct irnet_log
+{
+ irnet_event event;
+ int unit;
+ __u32 saddr;
+ __u32 daddr;
+ char name[NICKNAME_MAX_LEN + 1]; /* 21 + 1 */
+ __u16_host_order hints; /* Discovery hint bits */
+} irnet_log;
+
+/*
+ * This is the storage for all events and related stuff...
+ */
+typedef struct irnet_ctrl_channel
+{
+ irnet_log log[IRNET_MAX_EVENTS]; /* Event log */
+ int index; /* Current index in log */
+ spinlock_t spinlock; /* Serialize access to the event log */
+ wait_queue_head_t rwait; /* processes blocked on read (or poll) */
+} irnet_ctrl_channel;
+
+/**************************** PROTOTYPES ****************************/
+/*
+ * Global functions of the IrNET module
+ * Note : we list here also functions called from one file to the other.
+ */
+
+/* -------------------------- IRDA PART -------------------------- */
+int irda_irnet_create(irnet_socket *); /* Initialise an IrNET socket */
+int irda_irnet_connect(irnet_socket *); /* Try to connect over IrDA */
+void irda_irnet_destroy(irnet_socket *); /* Teardown an IrNET socket */
+int irda_irnet_init(void); /* Initialise IrDA part of IrNET */
+void irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */
+
+/**************************** VARIABLES ****************************/
+
+/* Control channel stuff - allocated in irnet_irda.h */
+extern struct irnet_ctrl_channel irnet_events;
+
+#endif /* IRNET_H */
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
new file mode 100644
index 000000000..7f17a8020
--- /dev/null
+++ b/net/irda/irnet/irnet_irda.c
@@ -0,0 +1,1885 @@
+/*
+ * IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ * Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file implement the IRDA interface of IrNET.
+ * Basically, we sit on top of IrTTP. We set up IrTTP, IrIAS properly,
+ * and exchange frames with IrTTP.
+ */
+
+#include "irnet_irda.h" /* Private header */
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+/*
+ * PPP disconnect work: we need to make sure we're in
+ * process context when calling ppp_unregister_channel().
+ */
+static void irnet_ppp_disconnect(struct work_struct *work)
+{
+ irnet_socket * self =
+ container_of(work, irnet_socket, disconnect_work);
+
+ if (self == NULL)
+ return;
+ /*
+ * If we were connected, cleanup & close the PPP
+ * channel, which will kill pppd (hangup) and the rest.
+ */
+ if (self->ppp_open && !self->ttp_open && !self->ttp_connect) {
+ ppp_unregister_channel(&self->chan);
+ self->ppp_open = 0;
+ }
+}
+
+/************************* CONTROL CHANNEL *************************/
+/*
+ * When ppp is not active, /dev/irnet act as a control channel.
+ * Writing allow to set up the IrDA destination of the IrNET channel,
+ * and any application may be read events happening on IrNET...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Post an event to the control channel...
+ * Put the event in the log, and then wait all process blocked on read
+ * so they can read the log...
+ */
+static void
+irnet_post_event(irnet_socket * ap,
+ irnet_event event,
+ __u32 saddr,
+ __u32 daddr,
+ char * name,
+ __u16 hints)
+{
+ int index; /* In the log */
+
+ DENTER(CTRL_TRACE, "(ap=0x%p, event=%d, daddr=%08x, name=``%s'')\n",
+ ap, event, daddr, name);
+
+ /* Protect this section via spinlock.
+ * Note : as we are the only event producer, we only need to exclude
+ * ourself when touching the log, which is nice and easy.
+ */
+ spin_lock_bh(&irnet_events.spinlock);
+
+ /* Copy the event in the log */
+ index = irnet_events.index;
+ irnet_events.log[index].event = event;
+ irnet_events.log[index].daddr = daddr;
+ irnet_events.log[index].saddr = saddr;
+ /* Try to copy IrDA nickname */
+ if(name)
+ strcpy(irnet_events.log[index].name, name);
+ else
+ irnet_events.log[index].name[0] = '\0';
+ /* Copy hints */
+ irnet_events.log[index].hints.word = hints;
+ /* Try to get ppp unit number */
+ if((ap != (irnet_socket *) NULL) && (ap->ppp_open))
+ irnet_events.log[index].unit = ppp_unit_number(&ap->chan);
+ else
+ irnet_events.log[index].unit = -1;
+
+ /* Increment the index
+ * Note that we increment the index only after the event is written,
+ * to make sure that the readers don't get garbage... */
+ irnet_events.index = (index + 1) % IRNET_MAX_EVENTS;
+
+ DEBUG(CTRL_INFO, "New event index is %d\n", irnet_events.index);
+
+ /* Spin lock end */
+ spin_unlock_bh(&irnet_events.spinlock);
+
+ /* Now : wake up everybody waiting for events... */
+ wake_up_interruptible_all(&irnet_events.rwait);
+
+ DEXIT(CTRL_TRACE, "\n");
+}
+
+/************************* IRDA SUBROUTINES *************************/
+/*
+ * These are a bunch of subroutines called from other functions
+ * down there, mostly common code or to improve readability...
+ *
+ * Note : we duplicate quite heavily some routines of af_irda.c,
+ * because our input structure (self) is quite different
+ * (struct irnet instead of struct irda_sock), which make sharing
+ * the same code impossible (at least, without templates).
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_open_tsap (self)
+ *
+ * Open local Transport Service Access Point (TSAP)
+ *
+ * Create a IrTTP instance for us and set all the IrTTP callbacks.
+ */
+static inline int
+irnet_open_tsap(irnet_socket * self)
+{
+ notify_t notify; /* Callback structure */
+
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ DABORT(self->tsap != NULL, -EBUSY, IRDA_SR_ERROR, "Already busy !\n");
+
+ /* Initialize IrTTP callbacks to be used by the IrDA stack */
+ irda_notify_init(&notify);
+ notify.connect_confirm = irnet_connect_confirm;
+ notify.connect_indication = irnet_connect_indication;
+ notify.disconnect_indication = irnet_disconnect_indication;
+ notify.data_indication = irnet_data_indication;
+ /*notify.udata_indication = NULL;*/
+ notify.flow_indication = irnet_flow_indication;
+ notify.status_indication = irnet_status_indication;
+ notify.instance = self;
+ strlcpy(notify.name, IRNET_NOTIFY_NAME, sizeof(notify.name));
+
+ /* Open an IrTTP instance */
+ self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT,
+ &notify);
+ DABORT(self->tsap == NULL, -ENOMEM,
+ IRDA_SR_ERROR, "Unable to allocate TSAP !\n");
+
+ /* Remember which TSAP selector we actually got */
+ self->stsap_sel = self->tsap->stsap_sel;
+
+ DEXIT(IRDA_SR_TRACE, " - tsap=0x%p, sel=0x%X\n",
+ self->tsap, self->stsap_sel);
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_ias_to_tsap (self, result, value)
+ *
+ * Examine an IAS object and extract TSAP
+ *
+ * We do an IAP query to find the TSAP associated with the IrNET service.
+ * When IrIAP pass us the result of the query, this function look at
+ * the return values to check for failures and extract the TSAP if
+ * possible.
+ * Also deallocate value
+ * The failure is in self->errno
+ * Return TSAP or -1
+ */
+static inline __u8
+irnet_ias_to_tsap(irnet_socket * self,
+ int result,
+ struct ias_value * value)
+{
+ __u8 dtsap_sel = 0; /* TSAP we are looking for */
+
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ /* By default, no error */
+ self->errno = 0;
+
+ /* Check if request succeeded */
+ switch(result)
+ {
+ /* Standard errors : service not available */
+ case IAS_CLASS_UNKNOWN:
+ case IAS_ATTRIB_UNKNOWN:
+ DEBUG(IRDA_SR_INFO, "IAS object doesn't exist ! (%d)\n", result);
+ self->errno = -EADDRNOTAVAIL;
+ break;
+
+ /* Other errors, most likely IrDA stack failure */
+ default :
+ DEBUG(IRDA_SR_INFO, "IAS query failed ! (%d)\n", result);
+ self->errno = -EHOSTUNREACH;
+ break;
+
+ /* Success : we got what we wanted */
+ case IAS_SUCCESS:
+ break;
+ }
+
+ /* Check what was returned to us */
+ if(value != NULL)
+ {
+ /* What type of argument have we got ? */
+ switch(value->type)
+ {
+ case IAS_INTEGER:
+ DEBUG(IRDA_SR_INFO, "result=%d\n", value->t.integer);
+ if(value->t.integer != -1)
+ /* Get the remote TSAP selector */
+ dtsap_sel = value->t.integer;
+ else
+ self->errno = -EADDRNOTAVAIL;
+ break;
+ default:
+ self->errno = -EADDRNOTAVAIL;
+ DERROR(IRDA_SR_ERROR, "bad type ! (0x%X)\n", value->type);
+ break;
+ }
+
+ /* Cleanup */
+ irias_delete_value(value);
+ }
+ else /* value == NULL */
+ {
+ /* Nothing returned to us - usually result != SUCCESS */
+ if(!(self->errno))
+ {
+ DERROR(IRDA_SR_ERROR,
+ "IrDA bug : result == SUCCESS && value == NULL\n");
+ self->errno = -EHOSTUNREACH;
+ }
+ }
+ DEXIT(IRDA_SR_TRACE, "\n");
+
+ /* Return the TSAP */
+ return dtsap_sel;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_find_lsap_sel (self)
+ *
+ * Try to lookup LSAP selector in remote LM-IAS
+ *
+ * Basically, we start a IAP query, and then go to sleep. When the query
+ * return, irnet_getvalue_confirm will wake us up, and we can examine the
+ * result of the query...
+ * Note that in some case, the query fail even before we go to sleep,
+ * creating some races...
+ */
+static inline int
+irnet_find_lsap_sel(irnet_socket * self)
+{
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ /* This should not happen */
+ DABORT(self->iriap, -EBUSY, IRDA_SR_ERROR, "busy with a previous query.\n");
+
+ /* Create an IAP instance, will be closed in irnet_getvalue_confirm() */
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ irnet_getvalue_confirm);
+
+ /* Treat unexpected signals as disconnect */
+ self->errno = -EHOSTUNREACH;
+
+ /* Query remote LM-IAS */
+ iriap_getvaluebyclass_request(self->iriap, self->rsaddr, self->daddr,
+ IRNET_SERVICE_NAME, IRNET_IAS_VALUE);
+
+ /* The above request is non-blocking.
+ * After a while, IrDA will call us back in irnet_getvalue_confirm()
+ * We will then call irnet_ias_to_tsap() and finish the
+ * connection procedure */
+
+ DEXIT(IRDA_SR_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_tsap (self)
+ *
+ * Initialise the TTP socket and initiate TTP connection
+ *
+ */
+static inline int
+irnet_connect_tsap(irnet_socket * self)
+{
+ int err;
+
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ /* Open a local TSAP (an IrTTP instance) */
+ err = irnet_open_tsap(self);
+ if(err != 0)
+ {
+ clear_bit(0, &self->ttp_connect);
+ DERROR(IRDA_SR_ERROR, "connect aborted!\n");
+ return err;
+ }
+
+ /* Connect to remote device */
+ err = irttp_connect_request(self->tsap, self->dtsap_sel,
+ self->rsaddr, self->daddr, NULL,
+ self->max_sdu_size_rx, NULL);
+ if(err != 0)
+ {
+ clear_bit(0, &self->ttp_connect);
+ DERROR(IRDA_SR_ERROR, "connect aborted!\n");
+ return err;
+ }
+
+ /* The above call is non-blocking.
+ * After a while, the IrDA stack will either call us back in
+ * irnet_connect_confirm() or irnet_disconnect_indication()
+ * See you there ;-) */
+
+ DEXIT(IRDA_SR_TRACE, "\n");
+ return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discover_next_daddr (self)
+ *
+ * Query the IrNET TSAP of the next device in the log.
+ *
+ * Used in the TSAP discovery procedure.
+ */
+static inline int
+irnet_discover_next_daddr(irnet_socket * self)
+{
+ /* Close the last instance of IrIAP, and open a new one.
+ * We can't reuse the IrIAP instance in the IrIAP callback */
+ if(self->iriap)
+ {
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+ }
+ /* Create a new IAP instance */
+ self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+ irnet_discovervalue_confirm);
+ if(self->iriap == NULL)
+ return -ENOMEM;
+
+ /* Next discovery - before the call to avoid races */
+ self->disco_index++;
+
+ /* Check if we have one more address to try */
+ if(self->disco_index < self->disco_number)
+ {
+ /* Query remote LM-IAS */
+ iriap_getvaluebyclass_request(self->iriap,
+ self->discoveries[self->disco_index].saddr,
+ self->discoveries[self->disco_index].daddr,
+ IRNET_SERVICE_NAME, IRNET_IAS_VALUE);
+ /* The above request is non-blocking.
+ * After a while, IrDA will call us back in irnet_discovervalue_confirm()
+ * We will then call irnet_ias_to_tsap() and come back here again... */
+ return 0;
+ }
+ else
+ return 1;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discover_daddr_and_lsap_sel (self)
+ *
+ * This try to find a device with the requested service.
+ *
+ * Initiate a TSAP discovery procedure.
+ * It basically look into the discovery log. For each address in the list,
+ * it queries the LM-IAS of the device to find if this device offer
+ * the requested service.
+ * If there is more than one node supporting the service, we complain
+ * to the user (it should move devices around).
+ * If we find one node which have the requested TSAP, we connect to it.
+ *
+ * This function just start the whole procedure. It request the discovery
+ * log and submit the first IAS query.
+ * The bulk of the job is handled in irnet_discovervalue_confirm()
+ *
+ * Note : this procedure fails if there is more than one device in range
+ * on the same dongle, because IrLMP doesn't disconnect the LAP when the
+ * last LSAP is closed. Moreover, we would need to wait the LAP
+ * disconnection...
+ */
+static inline int
+irnet_discover_daddr_and_lsap_sel(irnet_socket * self)
+{
+ int ret;
+
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ /* Ask lmp for the current discovery log */
+ self->discoveries = irlmp_get_discoveries(&self->disco_number, self->mask,
+ DISCOVERY_DEFAULT_SLOTS);
+
+ /* Check if the we got some results */
+ if(self->discoveries == NULL)
+ {
+ self->disco_number = -1;
+ clear_bit(0, &self->ttp_connect);
+ DRETURN(-ENETUNREACH, IRDA_SR_INFO, "No Cachelog...\n");
+ }
+ DEBUG(IRDA_SR_INFO, "Got the log (0x%p), size is %d\n",
+ self->discoveries, self->disco_number);
+
+ /* Start with the first discovery */
+ self->disco_index = -1;
+ self->daddr = DEV_ADDR_ANY;
+
+ /* This will fail if the log is empty - this is non-blocking */
+ ret = irnet_discover_next_daddr(self);
+ if(ret)
+ {
+ /* Close IAP */
+ if(self->iriap)
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+
+ /* Cleanup our copy of the discovery log */
+ kfree(self->discoveries);
+ self->discoveries = NULL;
+
+ clear_bit(0, &self->ttp_connect);
+ DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n");
+ }
+
+ /* Follow me in irnet_discovervalue_confirm() */
+
+ DEXIT(IRDA_SR_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_dname_to_daddr (self)
+ *
+ * Convert an IrDA nickname to a valid IrDA address
+ *
+ * It basically look into the discovery log until there is a match.
+ */
+static inline int
+irnet_dname_to_daddr(irnet_socket * self)
+{
+ struct irda_device_info *discoveries; /* Copy of the discovery log */
+ int number; /* Number of nodes in the log */
+ int i;
+
+ DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+ /* Ask lmp for the current discovery log */
+ discoveries = irlmp_get_discoveries(&number, 0xffff,
+ DISCOVERY_DEFAULT_SLOTS);
+ /* Check if the we got some results */
+ if(discoveries == NULL)
+ DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n");
+
+ /*
+ * Now, check all discovered devices (if any), and connect
+ * client only about the services that the client is
+ * interested in...
+ */
+ for(i = 0; i < number; i++)
+ {
+ /* Does the name match ? */
+ if(!strncmp(discoveries[i].info, self->rname, NICKNAME_MAX_LEN))
+ {
+ /* Yes !!! Get it.. */
+ self->daddr = discoveries[i].daddr;
+ DEBUG(IRDA_SR_INFO, "discovered device ``%s'' at address 0x%08x.\n",
+ self->rname, self->daddr);
+ kfree(discoveries);
+ DEXIT(IRDA_SR_TRACE, "\n");
+ return 0;
+ }
+ }
+ /* No luck ! */
+ DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
+ kfree(discoveries);
+ return -EADDRNOTAVAIL;
+}
+
+
+/************************* SOCKET ROUTINES *************************/
+/*
+ * This are the main operations on IrNET sockets, basically to create
+ * and destroy IrNET sockets. These are called from the PPP part...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Create a IrNET instance : just initialise some parameters...
+ */
+int
+irda_irnet_create(irnet_socket * self)
+{
+ DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+
+ self->magic = IRNET_MAGIC; /* Paranoia */
+
+ self->ttp_open = 0; /* Prevent higher layer from accessing IrTTP */
+ self->ttp_connect = 0; /* Not connecting yet */
+ self->rname[0] = '\0'; /* May be set via control channel */
+ self->rdaddr = DEV_ADDR_ANY; /* May be set via control channel */
+ self->rsaddr = DEV_ADDR_ANY; /* May be set via control channel */
+ self->daddr = DEV_ADDR_ANY; /* Until we get connected */
+ self->saddr = DEV_ADDR_ANY; /* Until we get connected */
+ self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+
+ /* Register as a client with IrLMP */
+ self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
+#ifdef DISCOVERY_NOMASK
+ self->mask = 0xffff; /* For W2k compatibility */
+#else /* DISCOVERY_NOMASK */
+ self->mask = irlmp_service_to_hint(S_LAN);
+#endif /* DISCOVERY_NOMASK */
+ self->tx_flow = FLOW_START; /* Flow control from IrTTP */
+
+ INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
+
+ DEXIT(IRDA_SOCK_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Connect to the other side :
+ * o convert device name to an address
+ * o find the socket number (dlsap)
+ * o Establish the connection
+ *
+ * Note : We no longer mimic af_irda. The IAS query for finding the TSAP
+ * is done asynchronously, like the TTP connection. This allow us to
+ * call this function from any context (not only process).
+ * The downside is that following what's happening in there is tricky
+ * because it involve various functions all over the place...
+ */
+int
+irda_irnet_connect(irnet_socket * self)
+{
+ int err;
+
+ DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+
+ /* Check if we are already trying to connect.
+ * Because irda_irnet_connect() can be called directly by pppd plus
+ * packet retries in ppp_generic and connect may take time, plus we may
+ * race with irnet_connect_indication(), we need to be careful there... */
+ if(test_and_set_bit(0, &self->ttp_connect))
+ DRETURN(-EBUSY, IRDA_SOCK_INFO, "Already connecting...\n");
+ if((self->iriap != NULL) || (self->tsap != NULL))
+ DERROR(IRDA_SOCK_ERROR, "Socket not cleaned up...\n");
+
+ /* Insert ourselves in the hashbin so that the IrNET server can find us.
+ * Notes : 4th arg is string of 32 char max and must be null terminated
+ * When 4th arg is used (string), 3rd arg isn't (int)
+ * Can't re-insert (MUST remove first) so check for that... */
+ if((irnet_server.running) && (self->q.q_next == NULL))
+ {
+ spin_lock_bh(&irnet_server.spinlock);
+ hashbin_insert(irnet_server.list, (irda_queue_t *) self, 0, self->rname);
+ spin_unlock_bh(&irnet_server.spinlock);
+ DEBUG(IRDA_SOCK_INFO, "Inserted ``%s'' in hashbin...\n", self->rname);
+ }
+
+ /* If we don't have anything (no address, no name) */
+ if((self->rdaddr == DEV_ADDR_ANY) && (self->rname[0] == '\0'))
+ {
+ /* Try to find a suitable address */
+ if((err = irnet_discover_daddr_and_lsap_sel(self)) != 0)
+ DRETURN(err, IRDA_SOCK_INFO, "auto-connect failed!\n");
+ /* In most cases, the call above is non-blocking */
+ }
+ else
+ {
+ /* If we have only the name (no address), try to get an address */
+ if(self->rdaddr == DEV_ADDR_ANY)
+ {
+ if((err = irnet_dname_to_daddr(self)) != 0)
+ DRETURN(err, IRDA_SOCK_INFO, "name connect failed!\n");
+ }
+ else
+ /* Use the requested destination address */
+ self->daddr = self->rdaddr;
+
+ /* Query remote LM-IAS to find LSAP selector */
+ irnet_find_lsap_sel(self);
+ /* The above call is non blocking */
+ }
+
+ /* At this point, we are waiting for the IrDA stack to call us back,
+ * or we have already failed.
+ * We will finish the connection procedure in irnet_connect_tsap().
+ */
+ DEXIT(IRDA_SOCK_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_irnet_destroy(self)
+ *
+ * Destroy irnet instance
+ *
+ * Note : this need to be called from a process context.
+ */
+void
+irda_irnet_destroy(irnet_socket * self)
+{
+ DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+ if(self == NULL)
+ return;
+
+ /* Remove ourselves from hashbin (if we are queued in hashbin)
+ * Note : `irnet_server.running' protect us from calls in hashbin_delete() */
+ if((irnet_server.running) && (self->q.q_next != NULL))
+ {
+ struct irnet_socket * entry;
+ DEBUG(IRDA_SOCK_INFO, "Removing from hash..\n");
+ spin_lock_bh(&irnet_server.spinlock);
+ entry = hashbin_remove_this(irnet_server.list, (irda_queue_t *) self);
+ self->q.q_next = NULL;
+ spin_unlock_bh(&irnet_server.spinlock);
+ DASSERT(entry == self, , IRDA_SOCK_ERROR, "Can't remove from hash.\n");
+ }
+
+ /* If we were connected, post a message */
+ if(test_bit(0, &self->ttp_open))
+ {
+ /* Note : as the disconnect comes from ppp_generic, the unit number
+ * doesn't exist anymore when we post the event, so we need to pass
+ * NULL as the first arg... */
+ irnet_post_event(NULL, IRNET_DISCONNECT_TO,
+ self->saddr, self->daddr, self->rname, 0);
+ }
+
+ /* Prevent various IrDA callbacks from messing up things
+ * Need to be first */
+ clear_bit(0, &self->ttp_connect);
+
+ /* Prevent higher layer from accessing IrTTP */
+ clear_bit(0, &self->ttp_open);
+
+ /* Unregister with IrLMP */
+ irlmp_unregister_client(self->ckey);
+
+ /* Unregister with LM-IAS */
+ if(self->iriap)
+ {
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+ }
+
+ /* Cleanup eventual discoveries from connection attempt or control channel */
+ if(self->discoveries != NULL)
+ {
+ /* Cleanup our copy of the discovery log */
+ kfree(self->discoveries);
+ self->discoveries = NULL;
+ }
+
+ /* Close our IrTTP connection */
+ if(self->tsap)
+ {
+ DEBUG(IRDA_SOCK_INFO, "Closing our TTP connection.\n");
+ irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+ self->stsap_sel = 0;
+
+ DEXIT(IRDA_SOCK_TRACE, "\n");
+}
+
+
+/************************** SERVER SOCKET **************************/
+/*
+ * The IrNET service is composed of one server socket and a variable
+ * number of regular IrNET sockets. The server socket is supposed to
+ * handle incoming connections and redirect them to one IrNET sockets.
+ * It's a superset of the regular IrNET socket, but has a very distinct
+ * behaviour...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_daddr_to_dname (self)
+ *
+ * Convert an IrDA address to a IrDA nickname
+ *
+ * It basically look into the discovery log until there is a match.
+ */
+static inline int
+irnet_daddr_to_dname(irnet_socket * self)
+{
+ struct irda_device_info *discoveries; /* Copy of the discovery log */
+ int number; /* Number of nodes in the log */
+ int i;
+
+ DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+ /* Ask lmp for the current discovery log */
+ discoveries = irlmp_get_discoveries(&number, 0xffff,
+ DISCOVERY_DEFAULT_SLOTS);
+ /* Check if the we got some results */
+ if (discoveries == NULL)
+ DRETURN(-ENETUNREACH, IRDA_SERV_INFO, "Cachelog empty...\n");
+
+ /* Now, check all discovered devices (if any) */
+ for(i = 0; i < number; i++)
+ {
+ /* Does the name match ? */
+ if(discoveries[i].daddr == self->daddr)
+ {
+ /* Yes !!! Get it.. */
+ strlcpy(self->rname, discoveries[i].info, sizeof(self->rname));
+ self->rname[sizeof(self->rname) - 1] = '\0';
+ DEBUG(IRDA_SERV_INFO, "Device 0x%08x is in fact ``%s''.\n",
+ self->daddr, self->rname);
+ kfree(discoveries);
+ DEXIT(IRDA_SERV_TRACE, "\n");
+ return 0;
+ }
+ }
+ /* No luck ! */
+ DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
+ kfree(discoveries);
+ return -EADDRNOTAVAIL;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_find_socket (self)
+ *
+ * Find the correct IrNET socket
+ *
+ * Look into the list of IrNET sockets and finds one with the right
+ * properties...
+ */
+static inline irnet_socket *
+irnet_find_socket(irnet_socket * self)
+{
+ irnet_socket * new = (irnet_socket *) NULL;
+ int err;
+
+ DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+ /* Get the addresses of the requester */
+ self->daddr = irttp_get_daddr(self->tsap);
+ self->saddr = irttp_get_saddr(self->tsap);
+
+ /* Try to get the IrDA nickname of the requester */
+ err = irnet_daddr_to_dname(self);
+
+ /* Protect access to the instance list */
+ spin_lock_bh(&irnet_server.spinlock);
+
+ /* So now, try to get an socket having specifically
+ * requested that nickname */
+ if(err == 0)
+ {
+ new = (irnet_socket *) hashbin_find(irnet_server.list,
+ 0, self->rname);
+ if(new)
+ DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches rname ``%s''.\n",
+ new, new->rname);
+ }
+
+ /* If no name matches, try to find an socket by the destination address */
+ /* It can be either the requested destination address (set via the
+ * control channel), or the current destination address if the
+ * socket is in the middle of a connection request */
+ if(new == (irnet_socket *) NULL)
+ {
+ new = (irnet_socket *) hashbin_get_first(irnet_server.list);
+ while(new !=(irnet_socket *) NULL)
+ {
+ /* Does it have the same address ? */
+ if((new->rdaddr == self->daddr) || (new->daddr == self->daddr))
+ {
+ /* Yes !!! Get it.. */
+ DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches daddr %#08x.\n",
+ new, self->daddr);
+ break;
+ }
+ new = (irnet_socket *) hashbin_get_next(irnet_server.list);
+ }
+ }
+
+ /* If we don't have any socket, get the first unconnected socket */
+ if(new == (irnet_socket *) NULL)
+ {
+ new = (irnet_socket *) hashbin_get_first(irnet_server.list);
+ while(new !=(irnet_socket *) NULL)
+ {
+ /* Is it available ? */
+ if(!(test_bit(0, &new->ttp_open)) && (new->rdaddr == DEV_ADDR_ANY) &&
+ (new->rname[0] == '\0') && (new->ppp_open))
+ {
+ /* Yes !!! Get it.. */
+ DEBUG(IRDA_SERV_INFO, "Socket 0x%p is free.\n",
+ new);
+ break;
+ }
+ new = (irnet_socket *) hashbin_get_next(irnet_server.list);
+ }
+ }
+
+ /* Spin lock end */
+ spin_unlock_bh(&irnet_server.spinlock);
+
+ DEXIT(IRDA_SERV_TRACE, " - new = 0x%p\n", new);
+ return new;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_connect_socket (self)
+ *
+ * Connect an incoming connection to the socket
+ *
+ */
+static inline int
+irnet_connect_socket(irnet_socket * server,
+ irnet_socket * new,
+ struct qos_info * qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size)
+{
+ DENTER(IRDA_SERV_TRACE, "(server=0x%p, new=0x%p)\n",
+ server, new);
+
+ /* Now attach up the new socket */
+ new->tsap = irttp_dup(server->tsap, new);
+ DABORT(new->tsap == NULL, -1, IRDA_SERV_ERROR, "dup failed!\n");
+
+ /* Set up all the relevant parameters on the new socket */
+ new->stsap_sel = new->tsap->stsap_sel;
+ new->dtsap_sel = new->tsap->dtsap_sel;
+ new->saddr = irttp_get_saddr(new->tsap);
+ new->daddr = irttp_get_daddr(new->tsap);
+
+ new->max_header_size = max_header_size;
+ new->max_sdu_size_tx = max_sdu_size;
+ new->max_data_size = max_sdu_size;
+#ifdef STREAM_COMPAT
+ /* If we want to receive "stream sockets" */
+ if(max_sdu_size == 0)
+ new->max_data_size = irttp_get_max_seg_size(new->tsap);
+#endif /* STREAM_COMPAT */
+
+ /* Clean up the original one to keep it in listen state */
+ irttp_listen(server->tsap);
+
+ /* Send a connection response on the new socket */
+ irttp_connect_response(new->tsap, new->max_sdu_size_rx, NULL);
+
+ /* Allow PPP to send its junk over the new socket... */
+ set_bit(0, &new->ttp_open);
+
+ /* Not connecting anymore, and clean up last possible remains
+ * of connection attempts on the socket */
+ clear_bit(0, &new->ttp_connect);
+ if(new->iriap)
+ {
+ iriap_close(new->iriap);
+ new->iriap = NULL;
+ }
+ if(new->discoveries != NULL)
+ {
+ kfree(new->discoveries);
+ new->discoveries = NULL;
+ }
+
+#ifdef CONNECT_INDIC_KICK
+ /* As currently we don't block packets in ppp_irnet_send() while passive,
+ * this is not really needed...
+ * Also, not doing it give IrDA a chance to finish the setup properly
+ * before being swamped with packets... */
+ ppp_output_wakeup(&new->chan);
+#endif /* CONNECT_INDIC_KICK */
+
+ /* Notify the control channel */
+ irnet_post_event(new, IRNET_CONNECT_FROM,
+ new->saddr, new->daddr, server->rname, 0);
+
+ DEXIT(IRDA_SERV_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_disconnect_server (self)
+ *
+ * Cleanup the server socket when the incoming connection abort
+ *
+ */
+static inline void
+irnet_disconnect_server(irnet_socket * self,
+ struct sk_buff *skb)
+{
+ DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+ /* Put the received packet in the black hole */
+ kfree_skb(skb);
+
+#ifdef FAIL_SEND_DISCONNECT
+ /* Tell the other party we don't want to be connected */
+ /* Hum... Is it the right thing to do ? And do we need to send
+ * a connect response before ? It looks ok without this... */
+ irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+#endif /* FAIL_SEND_DISCONNECT */
+
+ /* Notify the control channel (see irnet_find_socket()) */
+ irnet_post_event(NULL, IRNET_REQUEST_FROM,
+ self->saddr, self->daddr, self->rname, 0);
+
+ /* Clean up the server to keep it in listen state */
+ irttp_listen(self->tsap);
+
+ DEXIT(IRDA_SERV_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_setup_server (self)
+ *
+ * Create a IrTTP server and set it up...
+ *
+ * Register the IrLAN hint bit, create a IrTTP instance for us,
+ * set all the IrTTP callbacks and create an IrIAS entry...
+ */
+static inline int
+irnet_setup_server(void)
+{
+ __u16 hints;
+
+ DENTER(IRDA_SERV_TRACE, "()\n");
+
+ /* Initialise the regular socket part of the server */
+ irda_irnet_create(&irnet_server.s);
+
+ /* Open a local TSAP (an IrTTP instance) for the server */
+ irnet_open_tsap(&irnet_server.s);
+
+ /* PPP part setup */
+ irnet_server.s.ppp_open = 0;
+ irnet_server.s.chan.private = NULL;
+ irnet_server.s.file = NULL;
+
+ /* Get the hint bit corresponding to IrLAN */
+ /* Note : we overload the IrLAN hint bit. As it is only a "hint", and as
+ * we provide roughly the same functionality as IrLAN, this is ok.
+ * In fact, the situation is similar as JetSend overloading the Obex hint
+ */
+ hints = irlmp_service_to_hint(S_LAN);
+
+#ifdef ADVERTISE_HINT
+ /* Register with IrLMP as a service (advertise our hint bit) */
+ irnet_server.skey = irlmp_register_service(hints);
+#endif /* ADVERTISE_HINT */
+
+ /* Register with LM-IAS (so that people can connect to us) */
+ irnet_server.ias_obj = irias_new_object(IRNET_SERVICE_NAME, jiffies);
+ irias_add_integer_attrib(irnet_server.ias_obj, IRNET_IAS_VALUE,
+ irnet_server.s.stsap_sel, IAS_KERNEL_ATTR);
+ irias_insert_object(irnet_server.ias_obj);
+
+#ifdef DISCOVERY_EVENTS
+ /* Tell IrLMP we want to be notified of newly discovered nodes */
+ irlmp_update_client(irnet_server.s.ckey, hints,
+ irnet_discovery_indication, irnet_expiry_indication,
+ (void *) &irnet_server.s);
+#endif
+
+ DEXIT(IRDA_SERV_TRACE, " - self=0x%p\n", &irnet_server.s);
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_destroy_server (self)
+ *
+ * Destroy the IrTTP server...
+ *
+ * Reverse of the previous function...
+ */
+static inline void
+irnet_destroy_server(void)
+{
+ DENTER(IRDA_SERV_TRACE, "()\n");
+
+#ifdef ADVERTISE_HINT
+ /* Unregister with IrLMP */
+ irlmp_unregister_service(irnet_server.skey);
+#endif /* ADVERTISE_HINT */
+
+ /* Unregister with LM-IAS */
+ if(irnet_server.ias_obj)
+ irias_delete_object(irnet_server.ias_obj);
+
+ /* Cleanup the socket part */
+ irda_irnet_destroy(&irnet_server.s);
+
+ DEXIT(IRDA_SERV_TRACE, "\n");
+}
+
+
+/************************ IRDA-TTP CALLBACKS ************************/
+/*
+ * When we create a IrTTP instance, we pass to it a set of callbacks
+ * that IrTTP will call in case of various events.
+ * We take care of those events here.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_data_indication (instance, sap, skb)
+ *
+ * Received some data from TinyTP. Just queue it on the receive queue
+ *
+ */
+static int
+irnet_data_indication(void * instance,
+ void * sap,
+ struct sk_buff *skb)
+{
+ irnet_socket * ap = (irnet_socket *) instance;
+ unsigned char * p;
+ int code = 0;
+
+ DENTER(IRDA_TCB_TRACE, "(self/ap=0x%p, skb=0x%p)\n",
+ ap, skb);
+ DASSERT(skb != NULL, 0, IRDA_CB_ERROR, "skb is NULL !!!\n");
+
+ /* Check is ppp is ready to receive our packet */
+ if(!ap->ppp_open)
+ {
+ DERROR(IRDA_CB_ERROR, "PPP not ready, dropping packet...\n");
+ /* When we return error, TTP will need to requeue the skb and
+ * will stop the sender. IrTTP will stall until we send it a
+ * flow control request... */
+ return -ENOMEM;
+ }
+
+ /* strip address/control field if present */
+ p = skb->data;
+ if((p[0] == PPP_ALLSTATIONS) && (p[1] == PPP_UI))
+ {
+ /* chop off address/control */
+ if(skb->len < 3)
+ goto err_exit;
+ p = skb_pull(skb, 2);
+ }
+
+ /* decompress protocol field if compressed */
+ if(p[0] & 1)
+ {
+ /* protocol is compressed */
+ skb_push(skb, 1)[0] = 0;
+ }
+ else
+ if(skb->len < 2)
+ goto err_exit;
+
+ /* pass to generic ppp layer */
+ /* Note : how do I know if ppp can accept or not the packet ? This is
+ * essential if I want to manage flow control smoothly... */
+ ppp_input(&ap->chan, skb);
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+ return 0;
+
+ err_exit:
+ DERROR(IRDA_CB_ERROR, "Packet too small, dropping...\n");
+ kfree_skb(skb);
+ ppp_input_error(&ap->chan, code);
+ return 0; /* Don't return an error code, only for flow control... */
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_disconnect_indication (instance, sap, reason, skb)
+ *
+ * Connection has been closed. Chech reason to find out why
+ *
+ * Note : there are many cases where we come here :
+ * o attempted to connect, timeout
+ * o connected, link is broken, LAP has timeout
+ * o connected, other side close the link
+ * o connection request on the server not handled
+ */
+static void
+irnet_disconnect_indication(void * instance,
+ void * sap,
+ LM_REASON reason,
+ struct sk_buff *skb)
+{
+ irnet_socket * self = (irnet_socket *) instance;
+ int test_open;
+ int test_connect;
+
+ DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n");
+
+ /* Don't care about it, but let's not leak it */
+ if(skb)
+ dev_kfree_skb(skb);
+
+ /* Prevent higher layer from accessing IrTTP */
+ test_open = test_and_clear_bit(0, &self->ttp_open);
+ /* Not connecting anymore...
+ * (note : TSAP is open, so IAP callbacks are no longer pending...) */
+ test_connect = test_and_clear_bit(0, &self->ttp_connect);
+
+ /* If both self->ttp_open and self->ttp_connect are NULL, it mean that we
+ * have a race condition with irda_irnet_destroy() or
+ * irnet_connect_indication(), so don't mess up tsap...
+ */
+ if(!(test_open || test_connect))
+ {
+ DERROR(IRDA_CB_ERROR, "Race condition detected...\n");
+ return;
+ }
+
+ /* If we were active, notify the control channel */
+ if(test_open)
+ irnet_post_event(self, IRNET_DISCONNECT_FROM,
+ self->saddr, self->daddr, self->rname, 0);
+ else
+ /* If we were trying to connect, notify the control channel */
+ if((self->tsap) && (self != &irnet_server.s))
+ irnet_post_event(self, IRNET_NOANSWER_FROM,
+ self->saddr, self->daddr, self->rname, 0);
+
+ /* Close our IrTTP connection, cleanup tsap */
+ if((self->tsap) && (self != &irnet_server.s))
+ {
+ DEBUG(IRDA_CB_INFO, "Closing our TTP connection.\n");
+ irttp_close_tsap(self->tsap);
+ self->tsap = NULL;
+ }
+ /* Cleanup the socket in case we want to reconnect in ppp_output_wakeup() */
+ self->stsap_sel = 0;
+ self->daddr = DEV_ADDR_ANY;
+ self->tx_flow = FLOW_START;
+
+ /* Deal with the ppp instance if it's still alive */
+ if(self->ppp_open)
+ {
+ if(test_open)
+ {
+ /* ppp_unregister_channel() wants a user context. */
+ schedule_work(&self->disconnect_work);
+ }
+ else
+ {
+ /* If we were trying to connect, flush (drain) ppp_generic
+ * Tx queue (most often we have blocked it), which will
+ * trigger an other attempt to connect. If we are passive,
+ * this will empty the Tx queue after last try. */
+ ppp_output_wakeup(&self->chan);
+ }
+ }
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_confirm (instance, sap, qos, max_sdu_size, skb)
+ *
+ * Connections has been confirmed by the remote device
+ *
+ */
+static void
+irnet_connect_confirm(void * instance,
+ void * sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ irnet_socket * self = (irnet_socket *) instance;
+
+ DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+
+ /* Check if socket is closing down (via irda_irnet_destroy()) */
+ if(! test_bit(0, &self->ttp_connect))
+ {
+ DERROR(IRDA_CB_ERROR, "Socket no longer connecting. Ouch !\n");
+ return;
+ }
+
+ /* How much header space do we need to reserve */
+ self->max_header_size = max_header_size;
+
+ /* IrTTP max SDU size in transmit direction */
+ self->max_sdu_size_tx = max_sdu_size;
+ self->max_data_size = max_sdu_size;
+#ifdef STREAM_COMPAT
+ if(max_sdu_size == 0)
+ self->max_data_size = irttp_get_max_seg_size(self->tsap);
+#endif /* STREAM_COMPAT */
+
+ /* At this point, IrLMP has assigned our source address */
+ self->saddr = irttp_get_saddr(self->tsap);
+
+ /* Allow higher layer to access IrTTP */
+ set_bit(0, &self->ttp_open);
+ clear_bit(0, &self->ttp_connect); /* Not racy, IrDA traffic is serial */
+ /* Give a kick in the ass of ppp_generic so that he sends us some data */
+ ppp_output_wakeup(&self->chan);
+
+ /* Check size of received packet */
+ if(skb->len > 0)
+ {
+#ifdef PASS_CONNECT_PACKETS
+ DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n");
+ /* Try to pass it to PPP */
+ irnet_data_indication(instance, sap, skb);
+#else /* PASS_CONNECT_PACKETS */
+ DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n");
+ kfree_skb(skb); /* Note : will be optimised with other kfree... */
+#endif /* PASS_CONNECT_PACKETS */
+ }
+ else
+ kfree_skb(skb);
+
+ /* Notify the control channel */
+ irnet_post_event(self, IRNET_CONNECT_TO,
+ self->saddr, self->daddr, self->rname, 0);
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_flow_indication (instance, sap, flow)
+ *
+ * Used by TinyTP to tell us if it can accept more data or not
+ *
+ */
+static void
+irnet_flow_indication(void * instance,
+ void * sap,
+ LOCAL_FLOW flow)
+{
+ irnet_socket * self = (irnet_socket *) instance;
+ LOCAL_FLOW oldflow = self->tx_flow;
+
+ DENTER(IRDA_TCB_TRACE, "(self=0x%p, flow=%d)\n", self, flow);
+
+ /* Update our state */
+ self->tx_flow = flow;
+
+ /* Check what IrTTP want us to do... */
+ switch(flow)
+ {
+ case FLOW_START:
+ DEBUG(IRDA_CB_INFO, "IrTTP wants us to start again\n");
+ /* Check if we really need to wake up PPP */
+ if(oldflow == FLOW_STOP)
+ ppp_output_wakeup(&self->chan);
+ else
+ DEBUG(IRDA_CB_INFO, "But we were already transmitting !!!\n");
+ break;
+ case FLOW_STOP:
+ DEBUG(IRDA_CB_INFO, "IrTTP wants us to slow down\n");
+ break;
+ default:
+ DEBUG(IRDA_CB_INFO, "Unknown flow command!\n");
+ break;
+ }
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_status_indication (instance, sap, reason, skb)
+ *
+ * Link (IrLAP) status report.
+ *
+ */
+static void
+irnet_status_indication(void * instance,
+ LINK_STATUS link,
+ LOCK_STATUS lock)
+{
+ irnet_socket * self = (irnet_socket *) instance;
+
+ DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n");
+
+ /* We can only get this event if we are connected */
+ switch(link)
+ {
+ case STATUS_NO_ACTIVITY:
+ irnet_post_event(self, IRNET_BLOCKED_LINK,
+ self->saddr, self->daddr, self->rname, 0);
+ break;
+ default:
+ DEBUG(IRDA_CB_INFO, "Unknown status...\n");
+ }
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_indication(instance, sap, qos, max_sdu_size, userdata)
+ *
+ * Incoming connection
+ *
+ * In theory, this function is called only on the server socket.
+ * Some other node is attempting to connect to the IrNET service, and has
+ * sent a connection request on our server socket.
+ * We just redirect the connection to the relevant IrNET socket.
+ *
+ * Note : we also make sure that between 2 irnet nodes, there can
+ * exist only one irnet connection.
+ */
+static void
+irnet_connect_indication(void * instance,
+ void * sap,
+ struct qos_info *qos,
+ __u32 max_sdu_size,
+ __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ irnet_socket * server = &irnet_server.s;
+ irnet_socket * new = (irnet_socket *) NULL;
+
+ DENTER(IRDA_TCB_TRACE, "(server=0x%p)\n", server);
+ DASSERT(instance == &irnet_server, , IRDA_CB_ERROR,
+ "Invalid instance (0x%p) !!!\n", instance);
+ DASSERT(sap == irnet_server.s.tsap, , IRDA_CB_ERROR, "Invalid sap !!!\n");
+
+ /* Try to find the most appropriate IrNET socket */
+ new = irnet_find_socket(server);
+
+ /* After all this hard work, do we have an socket ? */
+ if(new == (irnet_socket *) NULL)
+ {
+ DEXIT(IRDA_CB_INFO, ": No socket waiting for this connection.\n");
+ irnet_disconnect_server(server, skb);
+ return;
+ }
+
+ /* Is the socket already busy ? */
+ if(test_bit(0, &new->ttp_open))
+ {
+ DEXIT(IRDA_CB_INFO, ": Socket already connected.\n");
+ irnet_disconnect_server(server, skb);
+ return;
+ }
+
+ /* The following code is a bit tricky, so need comments ;-)
+ */
+ /* If ttp_connect is set, the socket is trying to connect to the other
+ * end and may have sent a IrTTP connection request and is waiting for
+ * a connection response (that may never come).
+ * Now, the pain is that the socket may have opened a tsap and is
+ * waiting on it, while the other end is trying to connect to it on
+ * another tsap.
+ * Because IrNET can be peer to peer, we need to workaround this.
+ * Furthermore, the way the irnetd script is implemented, the
+ * target will create a second IrNET connection back to the
+ * originator and expect the originator to bind this new connection
+ * to the original PPPD instance.
+ * And of course, if we don't use irnetd, we can have a race when
+ * both side try to connect simultaneously, which could leave both
+ * connections half closed (yuck).
+ * Conclusions :
+ * 1) The "originator" must accept the new connection and get rid
+ * of the old one so that irnetd works
+ * 2) One side must deny the new connection to avoid races,
+ * but both side must agree on which side it is...
+ * Most often, the originator is primary at the LAP layer.
+ * Jean II
+ */
+ /* Now, let's look at the way I wrote the test...
+ * We need to clear up the ttp_connect flag atomically to prevent
+ * irnet_disconnect_indication() to mess up the tsap we are going to close.
+ * We want to clear the ttp_connect flag only if we close the tsap,
+ * otherwise we will never close it, so we need to check for primary
+ * *before* doing the test on the flag.
+ * And of course, ALLOW_SIMULT_CONNECT can disable this entirely...
+ * Jean II
+ */
+
+ /* Socket already connecting ? On primary ? */
+ if(0
+#ifdef ALLOW_SIMULT_CONNECT
+ || ((irttp_is_primary(server->tsap) == 1) && /* primary */
+ (test_and_clear_bit(0, &new->ttp_connect)))
+#endif /* ALLOW_SIMULT_CONNECT */
+ )
+ {
+ DERROR(IRDA_CB_ERROR, "Socket already connecting, but going to reuse it !\n");
+
+ /* Cleanup the old TSAP if necessary - IrIAP will be cleaned up later */
+ if(new->tsap != NULL)
+ {
+ /* Close the old connection the new socket was attempting,
+ * so that we can hook it up to the new connection.
+ * It's now safe to do it... */
+ irttp_close_tsap(new->tsap);
+ new->tsap = NULL;
+ }
+ }
+ else
+ {
+ /* Three options :
+ * 1) socket was not connecting or connected : ttp_connect should be 0.
+ * 2) we don't want to connect the socket because we are secondary or
+ * ALLOW_SIMULT_CONNECT is undefined. ttp_connect should be 1.
+ * 3) we are half way in irnet_disconnect_indication(), and it's a
+ * nice race condition... Fortunately, we can detect that by checking
+ * if tsap is still alive. On the other hand, we can't be in
+ * irda_irnet_destroy() otherwise we would not have found this
+ * socket in the hashbin.
+ * Jean II */
+ if((test_bit(0, &new->ttp_connect)) || (new->tsap != NULL))
+ {
+ /* Don't mess this socket, somebody else in in charge... */
+ DERROR(IRDA_CB_ERROR, "Race condition detected, socket in use, abort connect...\n");
+ irnet_disconnect_server(server, skb);
+ return;
+ }
+ }
+
+ /* So : at this point, we have a socket, and it is idle. Good ! */
+ irnet_connect_socket(server, new, qos, max_sdu_size, max_header_size);
+
+ /* Check size of received packet */
+ if(skb->len > 0)
+ {
+#ifdef PASS_CONNECT_PACKETS
+ DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n");
+ /* Try to pass it to PPP */
+ irnet_data_indication(new, new->tsap, skb);
+#else /* PASS_CONNECT_PACKETS */
+ DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n");
+ kfree_skb(skb); /* Note : will be optimised with other kfree... */
+#endif /* PASS_CONNECT_PACKETS */
+ }
+ else
+ kfree_skb(skb);
+
+ DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+
+/********************** IRDA-IAS/LMP CALLBACKS **********************/
+/*
+ * These are the callbacks called by other layers of the IrDA stack,
+ * mainly LMP for discovery and IAS for name queries.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_getvalue_confirm (result, obj_id, value, priv)
+ *
+ * Got answer from remote LM-IAS, just connect
+ *
+ * This is the reply to a IAS query we were doing to find the TSAP of
+ * the device we want to connect to.
+ * If we have found a valid TSAP, just initiate the TTP connection
+ * on this TSAP.
+ */
+static void
+irnet_getvalue_confirm(int result,
+ __u16 obj_id,
+ struct ias_value *value,
+ void * priv)
+{
+ irnet_socket * self = (irnet_socket *) priv;
+
+ DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n");
+
+ /* Check if already connected (via irnet_connect_socket())
+ * or socket is closing down (via irda_irnet_destroy()) */
+ if(! test_bit(0, &self->ttp_connect))
+ {
+ DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n");
+ return;
+ }
+
+ /* We probably don't need to make any more queries */
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+
+ /* Post process the IAS reply */
+ self->dtsap_sel = irnet_ias_to_tsap(self, result, value);
+
+ /* If error, just go out */
+ if(self->errno)
+ {
+ clear_bit(0, &self->ttp_connect);
+ DERROR(IRDA_OCB_ERROR, "IAS connect failed ! (0x%X)\n", self->errno);
+ return;
+ }
+
+ DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n",
+ self->daddr, self->dtsap_sel);
+
+ /* Start up TTP - non blocking */
+ irnet_connect_tsap(self);
+
+ DEXIT(IRDA_OCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discovervalue_confirm (result, obj_id, value, priv)
+ *
+ * Handle the TSAP discovery procedure state machine.
+ * Got answer from remote LM-IAS, try next device
+ *
+ * We are doing a TSAP discovery procedure, and we got an answer to
+ * a IAS query we were doing to find the TSAP on one of the address
+ * in the discovery log.
+ *
+ * If we have found a valid TSAP for the first time, save it. If it's
+ * not the first time we found one, complain.
+ *
+ * If we have more addresses in the log, just initiate a new query.
+ * Note that those query may fail (see irnet_discover_daddr_and_lsap_sel())
+ *
+ * Otherwise, wrap up the procedure (cleanup), check if we have found
+ * any device and connect to it.
+ */
+static void
+irnet_discovervalue_confirm(int result,
+ __u16 obj_id,
+ struct ias_value *value,
+ void * priv)
+{
+ irnet_socket * self = (irnet_socket *) priv;
+ __u8 dtsap_sel; /* TSAP we are looking for */
+
+ DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n");
+
+ /* Check if already connected (via irnet_connect_socket())
+ * or socket is closing down (via irda_irnet_destroy()) */
+ if(! test_bit(0, &self->ttp_connect))
+ {
+ DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n");
+ return;
+ }
+
+ /* Post process the IAS reply */
+ dtsap_sel = irnet_ias_to_tsap(self, result, value);
+
+ /* Have we got something ? */
+ if(self->errno == 0)
+ {
+ /* We found the requested service */
+ if(self->daddr != DEV_ADDR_ANY)
+ {
+ DERROR(IRDA_OCB_ERROR, "More than one device in range supports IrNET...\n");
+ }
+ else
+ {
+ /* First time we found that one, save it ! */
+ self->daddr = self->discoveries[self->disco_index].daddr;
+ self->dtsap_sel = dtsap_sel;
+ }
+ }
+
+ /* If no failure */
+ if((self->errno == -EADDRNOTAVAIL) || (self->errno == 0))
+ {
+ int ret;
+
+ /* Search the next node */
+ ret = irnet_discover_next_daddr(self);
+ if(!ret)
+ {
+ /* In this case, the above request was non-blocking.
+ * We will return here after a while... */
+ return;
+ }
+ /* In this case, we have processed the last discovery item */
+ }
+
+ /* No more queries to be done (failure or last one) */
+
+ /* We probably don't need to make any more queries */
+ iriap_close(self->iriap);
+ self->iriap = NULL;
+
+ /* No more items : remove the log and signal termination */
+ DEBUG(IRDA_OCB_INFO, "Cleaning up log (0x%p)\n",
+ self->discoveries);
+ if(self->discoveries != NULL)
+ {
+ /* Cleanup our copy of the discovery log */
+ kfree(self->discoveries);
+ self->discoveries = NULL;
+ }
+ self->disco_number = -1;
+
+ /* Check out what we found */
+ if(self->daddr == DEV_ADDR_ANY)
+ {
+ self->daddr = DEV_ADDR_ANY;
+ clear_bit(0, &self->ttp_connect);
+ DEXIT(IRDA_OCB_TRACE, ": cannot discover IrNET in any device !!!\n");
+ return;
+ }
+
+ /* We have a valid address - just connect */
+
+ DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n",
+ self->daddr, self->dtsap_sel);
+
+ /* Start up TTP - non blocking */
+ irnet_connect_tsap(self);
+
+ DEXIT(IRDA_OCB_TRACE, "\n");
+}
+
+#ifdef DISCOVERY_EVENTS
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discovery_indication (discovery)
+ *
+ * Got a discovery indication from IrLMP, post an event
+ *
+ * Note : IrLMP take care of matching the hint mask for us, and also
+ * check if it is a "new" node for us...
+ *
+ * As IrLMP filter on the IrLAN hint bit, we get both IrLAN and IrNET
+ * nodes, so it's only at connection time that we will know if the
+ * node support IrNET, IrLAN or both. The other solution is to check
+ * in IAS the PNP ids and service name.
+ * Note : even if a node support IrNET (or IrLAN), it's no guarantee
+ * that we will be able to connect to it, the node might already be
+ * busy...
+ *
+ * One last thing : in some case, this function will trigger duplicate
+ * discovery events. On the other hand, we should catch all
+ * discoveries properly (i.e. not miss one). Filtering duplicate here
+ * is to messy, so we leave that to user space...
+ */
+static void
+irnet_discovery_indication(discinfo_t * discovery,
+ DISCOVERY_MODE mode,
+ void * priv)
+{
+ irnet_socket * self = &irnet_server.s;
+
+ DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR,
+ "Invalid instance (0x%p) !!!\n", priv);
+
+ DEBUG(IRDA_OCB_INFO, "Discovered new IrNET/IrLAN node %s...\n",
+ discovery->info);
+
+ /* Notify the control channel */
+ irnet_post_event(NULL, IRNET_DISCOVER,
+ discovery->saddr, discovery->daddr, discovery->info,
+ get_unaligned((__u16 *)discovery->hints));
+
+ DEXIT(IRDA_OCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_expiry_indication (expiry)
+ *
+ * Got a expiry indication from IrLMP, post an event
+ *
+ * Note : IrLMP take care of matching the hint mask for us, we only
+ * check if it is a "new" node...
+ */
+static void
+irnet_expiry_indication(discinfo_t * expiry,
+ DISCOVERY_MODE mode,
+ void * priv)
+{
+ irnet_socket * self = &irnet_server.s;
+
+ DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self);
+ DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR,
+ "Invalid instance (0x%p) !!!\n", priv);
+
+ DEBUG(IRDA_OCB_INFO, "IrNET/IrLAN node %s expired...\n",
+ expiry->info);
+
+ /* Notify the control channel */
+ irnet_post_event(NULL, IRNET_EXPIRE,
+ expiry->saddr, expiry->daddr, expiry->info,
+ get_unaligned((__u16 *)expiry->hints));
+
+ DEXIT(IRDA_OCB_TRACE, "\n");
+}
+#endif /* DISCOVERY_EVENTS */
+
+
+/*********************** PROC ENTRY CALLBACKS ***********************/
+/*
+ * We create a instance in the /proc filesystem, and here we take care
+ * of that...
+ */
+
+#ifdef CONFIG_PROC_FS
+static int
+irnet_proc_show(struct seq_file *m, void *v)
+{
+ irnet_socket * self;
+ char * state;
+ int i = 0;
+
+ /* Get the IrNET server information... */
+ seq_printf(m, "IrNET server - ");
+ seq_printf(m, "IrDA state: %s, ",
+ (irnet_server.running ? "running" : "dead"));
+ seq_printf(m, "stsap_sel: %02x, ", irnet_server.s.stsap_sel);
+ seq_printf(m, "dtsap_sel: %02x\n", irnet_server.s.dtsap_sel);
+
+ /* Do we need to continue ? */
+ if(!irnet_server.running)
+ return 0;
+
+ /* Protect access to the instance list */
+ spin_lock_bh(&irnet_server.spinlock);
+
+ /* Get the sockets one by one... */
+ self = (irnet_socket *) hashbin_get_first(irnet_server.list);
+ while(self != NULL)
+ {
+ /* Start printing info about the socket. */
+ seq_printf(m, "\nIrNET socket %d - ", i++);
+
+ /* First, get the requested configuration */
+ seq_printf(m, "Requested IrDA name: \"%s\", ", self->rname);
+ seq_printf(m, "daddr: %08x, ", self->rdaddr);
+ seq_printf(m, "saddr: %08x\n", self->rsaddr);
+
+ /* Second, get all the PPP info */
+ seq_printf(m, " PPP state: %s",
+ (self->ppp_open ? "registered" : "unregistered"));
+ if(self->ppp_open)
+ {
+ seq_printf(m, ", unit: ppp%d",
+ ppp_unit_number(&self->chan));
+ seq_printf(m, ", channel: %d",
+ ppp_channel_index(&self->chan));
+ seq_printf(m, ", mru: %d",
+ self->mru);
+ /* Maybe add self->flags ? Later... */
+ }
+
+ /* Then, get all the IrDA specific info... */
+ if(self->ttp_open)
+ state = "connected";
+ else
+ if(self->tsap != NULL)
+ state = "connecting";
+ else
+ if(self->iriap != NULL)
+ state = "searching";
+ else
+ if(self->ttp_connect)
+ state = "weird";
+ else
+ state = "idle";
+ seq_printf(m, "\n IrDA state: %s, ", state);
+ seq_printf(m, "daddr: %08x, ", self->daddr);
+ seq_printf(m, "stsap_sel: %02x, ", self->stsap_sel);
+ seq_printf(m, "dtsap_sel: %02x\n", self->dtsap_sel);
+
+ /* Next socket, please... */
+ self = (irnet_socket *) hashbin_get_next(irnet_server.list);
+ }
+
+ /* Spin lock end */
+ spin_unlock_bh(&irnet_server.spinlock);
+
+ return 0;
+}
+
+static int irnet_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irnet_proc_show, NULL);
+}
+
+static const struct file_operations irnet_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = irnet_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* PROC_FS */
+
+
+/********************** CONFIGURATION/CLEANUP **********************/
+/*
+ * Initialisation and teardown of the IrDA part, called at module
+ * insertion and removal...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Prepare the IrNET layer for operation...
+ */
+int __init
+irda_irnet_init(void)
+{
+ int err = 0;
+
+ DENTER(MODULE_TRACE, "()\n");
+
+ /* Pure paranoia - should be redundant */
+ memset(&irnet_server, 0, sizeof(struct irnet_root));
+
+ /* Setup start of irnet instance list */
+ irnet_server.list = hashbin_new(HB_NOLOCK);
+ DABORT(irnet_server.list == NULL, -ENOMEM,
+ MODULE_ERROR, "Can't allocate hashbin!\n");
+ /* Init spinlock for instance list */
+ spin_lock_init(&irnet_server.spinlock);
+
+ /* Initialise control channel */
+ init_waitqueue_head(&irnet_events.rwait);
+ irnet_events.index = 0;
+ /* Init spinlock for event logging */
+ spin_lock_init(&irnet_events.spinlock);
+
+#ifdef CONFIG_PROC_FS
+ /* Add a /proc file for irnet infos */
+ proc_create("irnet", 0, proc_irda, &irnet_proc_fops);
+#endif /* CONFIG_PROC_FS */
+
+ /* Setup the IrNET server */
+ err = irnet_setup_server();
+
+ if(!err)
+ /* We are no longer functional... */
+ irnet_server.running = 1;
+
+ DEXIT(MODULE_TRACE, "\n");
+ return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Cleanup at exit...
+ */
+void __exit
+irda_irnet_cleanup(void)
+{
+ DENTER(MODULE_TRACE, "()\n");
+
+ /* We are no longer there... */
+ irnet_server.running = 0;
+
+#ifdef CONFIG_PROC_FS
+ /* Remove our /proc file */
+ remove_proc_entry("irnet", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+ /* Remove our IrNET server from existence */
+ irnet_destroy_server();
+
+ /* Remove all instances of IrNET socket still present */
+ hashbin_delete(irnet_server.list, (FREE_FUNC) irda_irnet_destroy);
+
+ DEXIT(MODULE_TRACE, "\n");
+}
diff --git a/net/irda/irnet/irnet_irda.h b/net/irda/irnet/irnet_irda.h
new file mode 100644
index 000000000..3e408952a
--- /dev/null
+++ b/net/irda/irnet/irnet_irda.h
@@ -0,0 +1,178 @@
+/*
+ * IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ * Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains all definitions and declarations necessary for the
+ * IRDA part of the IrNET module (dealing with IrTTP, IrIAS and co).
+ * This file is a private header, so other modules don't want to know
+ * what's in there...
+ */
+
+#ifndef IRNET_IRDA_H
+#define IRNET_IRDA_H
+
+/***************************** INCLUDES *****************************/
+/* Please add other headers in irnet.h */
+
+#include "irnet.h" /* Module global include */
+
+/************************ CONSTANTS & MACROS ************************/
+
+/*
+ * Name of the service (socket name) used by IrNET
+ */
+/* IAS object name (or part of it) */
+#define IRNET_SERVICE_NAME "IrNetv1"
+/* IAS attribute */
+#define IRNET_IAS_VALUE "IrDA:TinyTP:LsapSel"
+/* LMP notify name for client (only for /proc/net/irda/irlmp) */
+#define IRNET_NOTIFY_NAME "IrNET socket"
+/* LMP notify name for server (only for /proc/net/irda/irlmp) */
+#define IRNET_NOTIFY_NAME_SERV "IrNET server"
+
+/****************************** TYPES ******************************/
+
+/*
+ * This is the main structure where we store all the data pertaining to
+ * the IrNET server (listen for connection requests) and the root
+ * of the IrNET socket list
+ */
+typedef struct irnet_root
+{
+ irnet_socket s; /* To pretend we are a client... */
+
+ /* Generic stuff */
+ int magic; /* Paranoia */
+ int running; /* Are we operational ? */
+
+ /* Link list of all IrNET instances opened */
+ hashbin_t * list;
+ spinlock_t spinlock; /* Serialize access to the list */
+ /* Note : the way hashbin has been designed is absolutely not
+ * reentrant, beware... So, we blindly protect all with spinlock */
+
+ /* Handle for the hint bit advertised in IrLMP */
+ void * skey;
+
+ /* Server socket part */
+ struct ias_object * ias_obj; /* Our service name + lsap in IAS */
+
+} irnet_root;
+
+
+/**************************** PROTOTYPES ****************************/
+
+/* ----------------------- CONTROL CHANNEL ----------------------- */
+static void
+ irnet_post_event(irnet_socket *,
+ irnet_event,
+ __u32,
+ __u32,
+ char *,
+ __u16);
+/* ----------------------- IRDA SUBROUTINES ----------------------- */
+static inline int
+ irnet_open_tsap(irnet_socket *);
+static inline __u8
+ irnet_ias_to_tsap(irnet_socket *,
+ int,
+ struct ias_value *);
+static inline int
+ irnet_find_lsap_sel(irnet_socket *);
+static inline int
+ irnet_connect_tsap(irnet_socket *);
+static inline int
+ irnet_discover_next_daddr(irnet_socket *);
+static inline int
+ irnet_discover_daddr_and_lsap_sel(irnet_socket *);
+static inline int
+ irnet_dname_to_daddr(irnet_socket *);
+/* ------------------------ SERVER SOCKET ------------------------ */
+static inline int
+ irnet_daddr_to_dname(irnet_socket *);
+static inline irnet_socket *
+ irnet_find_socket(irnet_socket *);
+static inline int
+ irnet_connect_socket(irnet_socket *,
+ irnet_socket *,
+ struct qos_info *,
+ __u32,
+ __u8);
+static inline void
+ irnet_disconnect_server(irnet_socket *,
+ struct sk_buff *);
+static inline int
+ irnet_setup_server(void);
+static inline void
+ irnet_destroy_server(void);
+/* ---------------------- IRDA-TTP CALLBACKS ---------------------- */
+static int
+ irnet_data_indication(void *, /* instance */
+ void *, /* sap */
+ struct sk_buff *);
+static void
+ irnet_disconnect_indication(void *,
+ void *,
+ LM_REASON,
+ struct sk_buff *);
+static void
+ irnet_connect_confirm(void *,
+ void *,
+ struct qos_info *,
+ __u32,
+ __u8,
+ struct sk_buff *);
+static void
+ irnet_flow_indication(void *,
+ void *,
+ LOCAL_FLOW);
+static void
+ irnet_status_indication(void *,
+ LINK_STATUS,
+ LOCK_STATUS);
+static void
+ irnet_connect_indication(void *,
+ void *,
+ struct qos_info *,
+ __u32,
+ __u8,
+ struct sk_buff *);
+/* -------------------- IRDA-IAS/LMP CALLBACKS -------------------- */
+static void
+ irnet_getvalue_confirm(int,
+ __u16,
+ struct ias_value *,
+ void *);
+static void
+ irnet_discovervalue_confirm(int,
+ __u16,
+ struct ias_value *,
+ void *);
+#ifdef DISCOVERY_EVENTS
+static void
+ irnet_discovery_indication(discinfo_t *,
+ DISCOVERY_MODE,
+ void *);
+static void
+ irnet_expiry_indication(discinfo_t *,
+ DISCOVERY_MODE,
+ void *);
+#endif
+
+/**************************** VARIABLES ****************************/
+
+/*
+ * The IrNET server. Listen to connection requests and co...
+ */
+static struct irnet_root irnet_server;
+
+/* Control channel stuff (note : extern) */
+struct irnet_ctrl_channel irnet_events;
+
+/* The /proc/net/irda directory, defined elsewhere... */
+#ifdef CONFIG_PROC_FS
+extern struct proc_dir_entry *proc_irda;
+#endif /* CONFIG_PROC_FS */
+
+#endif /* IRNET_IRDA_H */
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
new file mode 100644
index 000000000..1215693fd
--- /dev/null
+++ b/net/irda/irnet/irnet_ppp.c
@@ -0,0 +1,1188 @@
+/*
+ * IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ * Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file implement the PPP interface and /dev/irnet character device.
+ * The PPP interface hook to the ppp_generic module, handle all our
+ * relationship to the PPP code in the kernel (and by extension to pppd),
+ * and exchange PPP frames with this module (send/receive).
+ * The /dev/irnet device is used primarily for 2 functions :
+ * 1) as a stub for pppd (the ppp daemon), so that we can appropriately
+ * generate PPP sessions (we pretend we are a tty).
+ * 2) as a control channel (write commands, read events)
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "irnet_ppp.h" /* Private header */
+/* Please put other headers in irnet.h - Thanks */
+
+/* Generic PPP callbacks (to call us) */
+static const struct ppp_channel_ops irnet_ppp_ops = {
+ .start_xmit = ppp_irnet_send,
+ .ioctl = ppp_irnet_ioctl
+};
+
+/************************* CONTROL CHANNEL *************************/
+/*
+ * When a pppd instance is not active on /dev/irnet, it acts as a control
+ * channel.
+ * Writing allow to set up the IrDA destination of the IrNET channel,
+ * and any application may be read events happening in IrNET...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Write is used to send a command to configure a IrNET channel
+ * before it is open by pppd. The syntax is : "command argument"
+ * Currently there is only two defined commands :
+ * o name : set the requested IrDA nickname of the IrNET peer.
+ * o addr : set the requested IrDA address of the IrNET peer.
+ * Note : the code is crude, but effective...
+ */
+static inline ssize_t
+irnet_ctrl_write(irnet_socket * ap,
+ const char __user *buf,
+ size_t count)
+{
+ char command[IRNET_MAX_COMMAND];
+ char * start; /* Current command being processed */
+ char * next; /* Next command to process */
+ int length; /* Length of current command */
+
+ DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+
+ /* Check for overflow... */
+ DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
+ CTRL_ERROR, "Too much data !!!\n");
+
+ /* Get the data in the driver */
+ if(copy_from_user(command, buf, count))
+ {
+ DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+ return -EFAULT;
+ }
+
+ /* Safe terminate the string */
+ command[count] = '\0';
+ DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n",
+ command, count);
+
+ /* Check every commands in the command line */
+ next = command;
+ while(next != NULL)
+ {
+ /* Look at the next command */
+ start = next;
+
+ /* Scrap whitespaces before the command */
+ start = skip_spaces(start);
+
+ /* ',' is our command separator */
+ next = strchr(start, ',');
+ if(next)
+ {
+ *next = '\0'; /* Terminate command */
+ length = next - start; /* Length */
+ next++; /* Skip the '\0' */
+ }
+ else
+ length = strlen(start);
+
+ DEBUG(CTRL_INFO, "Found command ``%s'' (%d).\n", start, length);
+
+ /* Check if we recognised one of the known command
+ * We can't use "switch" with strings, so hack with "continue" */
+
+ /* First command : name -> Requested IrDA nickname */
+ if(!strncmp(start, "name", 4))
+ {
+ /* Copy the name only if is included and not "any" */
+ if((length > 5) && (strcmp(start + 5, "any")))
+ {
+ /* Strip out trailing whitespaces */
+ while(isspace(start[length - 1]))
+ length--;
+
+ DABORT(length < 5 || length > NICKNAME_MAX_LEN + 5,
+ -EINVAL, CTRL_ERROR, "Invalid nickname.\n");
+
+ /* Copy the name for later reuse */
+ memcpy(ap->rname, start + 5, length - 5);
+ ap->rname[length - 5] = '\0';
+ }
+ else
+ ap->rname[0] = '\0';
+ DEBUG(CTRL_INFO, "Got rname = ``%s''\n", ap->rname);
+
+ /* Restart the loop */
+ continue;
+ }
+
+ /* Second command : addr, daddr -> Requested IrDA destination address
+ * Also process : saddr -> Requested IrDA source address */
+ if((!strncmp(start, "addr", 4)) ||
+ (!strncmp(start, "daddr", 5)) ||
+ (!strncmp(start, "saddr", 5)))
+ {
+ __u32 addr = DEV_ADDR_ANY;
+
+ /* Copy the address only if is included and not "any" */
+ if((length > 5) && (strcmp(start + 5, "any")))
+ {
+ char * begp = start + 5;
+ char * endp;
+
+ /* Scrap whitespaces before the command */
+ begp = skip_spaces(begp);
+
+ /* Convert argument to a number (last arg is the base) */
+ addr = simple_strtoul(begp, &endp, 16);
+ /* Has it worked ? (endp should be start + length) */
+ DABORT(endp <= (start + 5), -EINVAL,
+ CTRL_ERROR, "Invalid address.\n");
+ }
+ /* Which type of address ? */
+ if(start[0] == 's')
+ {
+ /* Save it */
+ ap->rsaddr = addr;
+ DEBUG(CTRL_INFO, "Got rsaddr = %08x\n", ap->rsaddr);
+ }
+ else
+ {
+ /* Save it */
+ ap->rdaddr = addr;
+ DEBUG(CTRL_INFO, "Got rdaddr = %08x\n", ap->rdaddr);
+ }
+
+ /* Restart the loop */
+ continue;
+ }
+
+ /* Other possible command : connect N (number of retries) */
+
+ /* No command matched -> Failed... */
+ DABORT(1, -EINVAL, CTRL_ERROR, "Not a recognised IrNET command.\n");
+ }
+
+ /* Success : we have parsed all commands successfully */
+ return count;
+}
+
+#ifdef INITIAL_DISCOVERY
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_get_discovery_log (self)
+ *
+ * Query the content on the discovery log if not done
+ *
+ * This function query the current content of the discovery log
+ * at the startup of the event channel and save it in the internal struct.
+ */
+static void
+irnet_get_discovery_log(irnet_socket * ap)
+{
+ __u16 mask = irlmp_service_to_hint(S_LAN);
+
+ /* Ask IrLMP for the current discovery log */
+ ap->discoveries = irlmp_get_discoveries(&ap->disco_number, mask,
+ DISCOVERY_DEFAULT_SLOTS);
+
+ /* Check if the we got some results */
+ if(ap->discoveries == NULL)
+ ap->disco_number = -1;
+
+ DEBUG(CTRL_INFO, "Got the log (0x%p), size is %d\n",
+ ap->discoveries, ap->disco_number);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_read_discovery_log (self, event)
+ *
+ * Read the content on the discovery log
+ *
+ * This function dump the current content of the discovery log
+ * at the startup of the event channel.
+ * Return 1 if wrote an event on the control channel...
+ *
+ * State of the ap->disco_XXX variables :
+ * Socket creation : discoveries = NULL ; disco_index = 0 ; disco_number = 0
+ * While reading : discoveries = ptr ; disco_index = X ; disco_number = Y
+ * After reading : discoveries = NULL ; disco_index = Y ; disco_number = -1
+ */
+static inline int
+irnet_read_discovery_log(irnet_socket *ap, char *event, int buf_size)
+{
+ int done_event = 0;
+
+ DENTER(CTRL_TRACE, "(ap=0x%p, event=0x%p)\n",
+ ap, event);
+
+ /* Test if we have some work to do or we have already finished */
+ if(ap->disco_number == -1)
+ {
+ DEBUG(CTRL_INFO, "Already done\n");
+ return 0;
+ }
+
+ /* Test if it's the first time and therefore we need to get the log */
+ if(ap->discoveries == NULL)
+ irnet_get_discovery_log(ap);
+
+ /* Check if we have more item to dump */
+ if(ap->disco_index < ap->disco_number)
+ {
+ /* Write an event */
+ snprintf(event, buf_size,
+ "Found %08x (%s) behind %08x {hints %02X-%02X}\n",
+ ap->discoveries[ap->disco_index].daddr,
+ ap->discoveries[ap->disco_index].info,
+ ap->discoveries[ap->disco_index].saddr,
+ ap->discoveries[ap->disco_index].hints[0],
+ ap->discoveries[ap->disco_index].hints[1]);
+ DEBUG(CTRL_INFO, "Writing discovery %d : %s\n",
+ ap->disco_index, ap->discoveries[ap->disco_index].info);
+
+ /* We have an event */
+ done_event = 1;
+ /* Next discovery */
+ ap->disco_index++;
+ }
+
+ /* Check if we have done the last item */
+ if(ap->disco_index >= ap->disco_number)
+ {
+ /* No more items : remove the log and signal termination */
+ DEBUG(CTRL_INFO, "Cleaning up log (0x%p)\n",
+ ap->discoveries);
+ if(ap->discoveries != NULL)
+ {
+ /* Cleanup our copy of the discovery log */
+ kfree(ap->discoveries);
+ ap->discoveries = NULL;
+ }
+ ap->disco_number = -1;
+ }
+
+ return done_event;
+}
+#endif /* INITIAL_DISCOVERY */
+
+/*------------------------------------------------------------------*/
+/*
+ * Read is used to get IrNET events
+ */
+static inline ssize_t
+irnet_ctrl_read(irnet_socket * ap,
+ struct file * file,
+ char __user * buf,
+ size_t count)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ char event[75];
+ ssize_t ret = 0;
+
+ DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+
+#ifdef INITIAL_DISCOVERY
+ /* Check if we have read the log */
+ if (irnet_read_discovery_log(ap, event, sizeof(event)))
+ {
+ count = min(strlen(event), count);
+ if (copy_to_user(buf, event, count))
+ {
+ DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+ return -EFAULT;
+ }
+
+ DEXIT(CTRL_TRACE, "\n");
+ return count;
+ }
+#endif /* INITIAL_DISCOVERY */
+
+ /* Put ourselves on the wait queue to be woken up */
+ add_wait_queue(&irnet_events.rwait, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ for(;;)
+ {
+ /* If there is unread events */
+ ret = 0;
+ if(ap->event_index != irnet_events.index)
+ break;
+ ret = -EAGAIN;
+ if(file->f_flags & O_NONBLOCK)
+ break;
+ ret = -ERESTARTSYS;
+ if(signal_pending(current))
+ break;
+ /* Yield and wait to be woken up */
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&irnet_events.rwait, &wait);
+
+ /* Did we got it ? */
+ if(ret != 0)
+ {
+ /* No, return the error code */
+ DEXIT(CTRL_TRACE, " - ret %Zd\n", ret);
+ return ret;
+ }
+
+ /* Which event is it ? */
+ switch(irnet_events.log[ap->event_index].event)
+ {
+ case IRNET_DISCOVER:
+ snprintf(event, sizeof(event),
+ "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].saddr,
+ irnet_events.log[ap->event_index].hints.byte[0],
+ irnet_events.log[ap->event_index].hints.byte[1]);
+ break;
+ case IRNET_EXPIRE:
+ snprintf(event, sizeof(event),
+ "Expired %08x (%s) behind %08x {hints %02X-%02X}\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].saddr,
+ irnet_events.log[ap->event_index].hints.byte[0],
+ irnet_events.log[ap->event_index].hints.byte[1]);
+ break;
+ case IRNET_CONNECT_TO:
+ snprintf(event, sizeof(event), "Connected to %08x (%s) on ppp%d\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].unit);
+ break;
+ case IRNET_CONNECT_FROM:
+ snprintf(event, sizeof(event), "Connection from %08x (%s) on ppp%d\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].unit);
+ break;
+ case IRNET_REQUEST_FROM:
+ snprintf(event, sizeof(event), "Request from %08x (%s) behind %08x\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].saddr);
+ break;
+ case IRNET_NOANSWER_FROM:
+ snprintf(event, sizeof(event), "No-answer from %08x (%s) on ppp%d\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].unit);
+ break;
+ case IRNET_BLOCKED_LINK:
+ snprintf(event, sizeof(event), "Blocked link with %08x (%s) on ppp%d\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].unit);
+ break;
+ case IRNET_DISCONNECT_FROM:
+ snprintf(event, sizeof(event), "Disconnection from %08x (%s) on ppp%d\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name,
+ irnet_events.log[ap->event_index].unit);
+ break;
+ case IRNET_DISCONNECT_TO:
+ snprintf(event, sizeof(event), "Disconnected to %08x (%s)\n",
+ irnet_events.log[ap->event_index].daddr,
+ irnet_events.log[ap->event_index].name);
+ break;
+ default:
+ snprintf(event, sizeof(event), "Bug\n");
+ }
+ /* Increment our event index */
+ ap->event_index = (ap->event_index + 1) % IRNET_MAX_EVENTS;
+
+ DEBUG(CTRL_INFO, "Event is :%s", event);
+
+ count = min(strlen(event), count);
+ if (copy_to_user(buf, event, count))
+ {
+ DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+ return -EFAULT;
+ }
+
+ DEXIT(CTRL_TRACE, "\n");
+ return count;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Poll : called when someone do a select on /dev/irnet.
+ * Just check if there are new events...
+ */
+static inline unsigned int
+irnet_ctrl_poll(irnet_socket * ap,
+ struct file * file,
+ poll_table * wait)
+{
+ unsigned int mask;
+
+ DENTER(CTRL_TRACE, "(ap=0x%p)\n", ap);
+
+ poll_wait(file, &irnet_events.rwait, wait);
+ mask = POLLOUT | POLLWRNORM;
+ /* If there is unread events */
+ if(ap->event_index != irnet_events.index)
+ mask |= POLLIN | POLLRDNORM;
+#ifdef INITIAL_DISCOVERY
+ if(ap->disco_number != -1)
+ {
+ /* Test if it's the first time and therefore we need to get the log */
+ if(ap->discoveries == NULL)
+ irnet_get_discovery_log(ap);
+ /* Recheck */
+ if(ap->disco_number != -1)
+ mask |= POLLIN | POLLRDNORM;
+ }
+#endif /* INITIAL_DISCOVERY */
+
+ DEXIT(CTRL_TRACE, " - mask=0x%X\n", mask);
+ return mask;
+}
+
+
+/*********************** FILESYSTEM CALLBACKS ***********************/
+/*
+ * Implement the usual open, read, write functions that will be called
+ * by the file system when some action is performed on /dev/irnet.
+ * Most of those actions will in fact be performed by "pppd" or
+ * the control channel, we just act as a redirector...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Open : when somebody open /dev/irnet
+ * We basically create a new instance of irnet and initialise it.
+ */
+static int
+dev_irnet_open(struct inode * inode,
+ struct file * file)
+{
+ struct irnet_socket * ap;
+ int err;
+
+ DENTER(FS_TRACE, "(file=0x%p)\n", file);
+
+#ifdef SECURE_DEVIRNET
+ /* This could (should?) be enforced by the permissions on /dev/irnet. */
+ if(!capable(CAP_NET_ADMIN))
+ return -EPERM;
+#endif /* SECURE_DEVIRNET */
+
+ /* Allocate a private structure for this IrNET instance */
+ ap = kzalloc(sizeof(*ap), GFP_KERNEL);
+ DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n");
+
+ /* initialize the irnet structure */
+ ap->file = file;
+
+ /* PPP channel setup */
+ ap->ppp_open = 0;
+ ap->chan.private = ap;
+ ap->chan.ops = &irnet_ppp_ops;
+ ap->chan.mtu = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN);
+ ap->chan.hdrlen = 2 + TTP_MAX_HEADER; /* for A/C + Max IrDA hdr */
+ /* PPP parameters */
+ ap->mru = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN);
+ ap->xaccm[0] = ~0U;
+ ap->xaccm[3] = 0x60000000U;
+ ap->raccm = ~0U;
+
+ /* Setup the IrDA part... */
+ err = irda_irnet_create(ap);
+ if(err)
+ {
+ DERROR(FS_ERROR, "Can't setup IrDA link...\n");
+ kfree(ap);
+
+ return err;
+ }
+
+ /* For the control channel */
+ ap->event_index = irnet_events.index; /* Cancel all past events */
+
+ mutex_init(&ap->lock);
+
+ /* Put our stuff where we will be able to find it later */
+ file->private_data = ap;
+
+ DEXIT(FS_TRACE, " - ap=0x%p\n", ap);
+
+ return 0;
+}
+
+
+/*------------------------------------------------------------------*/
+/*
+ * Close : when somebody close /dev/irnet
+ * Destroy the instance of /dev/irnet
+ */
+static int
+dev_irnet_close(struct inode * inode,
+ struct file * file)
+{
+ irnet_socket * ap = file->private_data;
+
+ DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
+ file, ap);
+ DABORT(ap == NULL, 0, FS_ERROR, "ap is NULL !!!\n");
+
+ /* Detach ourselves */
+ file->private_data = NULL;
+
+ /* Close IrDA stuff */
+ irda_irnet_destroy(ap);
+
+ /* Disconnect from the generic PPP layer if not already done */
+ if(ap->ppp_open)
+ {
+ DERROR(FS_ERROR, "Channel still registered - deregistering !\n");
+ ap->ppp_open = 0;
+ ppp_unregister_channel(&ap->chan);
+ }
+
+ kfree(ap);
+
+ DEXIT(FS_TRACE, "\n");
+ return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Write does nothing.
+ * (we receive packet from ppp_generic through ppp_irnet_send())
+ */
+static ssize_t
+dev_irnet_write(struct file * file,
+ const char __user *buf,
+ size_t count,
+ loff_t * ppos)
+{
+ irnet_socket * ap = file->private_data;
+
+ DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+ file, ap, count);
+ DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
+
+ /* If we are connected to ppp_generic, let it handle the job */
+ if(ap->ppp_open)
+ return -EAGAIN;
+ else
+ return irnet_ctrl_write(ap, buf, count);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Read doesn't do much either.
+ * (pppd poll us, but ultimately reads through /dev/ppp)
+ */
+static ssize_t
+dev_irnet_read(struct file * file,
+ char __user * buf,
+ size_t count,
+ loff_t * ppos)
+{
+ irnet_socket * ap = file->private_data;
+
+ DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+ file, ap, count);
+ DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
+
+ /* If we are connected to ppp_generic, let it handle the job */
+ if(ap->ppp_open)
+ return -EAGAIN;
+ else
+ return irnet_ctrl_read(ap, file, buf, count);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Poll : called when someone do a select on /dev/irnet
+ */
+static unsigned int
+dev_irnet_poll(struct file * file,
+ poll_table * wait)
+{
+ irnet_socket * ap = file->private_data;
+ unsigned int mask;
+
+ DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
+ file, ap);
+
+ mask = POLLOUT | POLLWRNORM;
+ DABORT(ap == NULL, mask, FS_ERROR, "ap is NULL !!!\n");
+
+ /* If we are connected to ppp_generic, let it handle the job */
+ if(!ap->ppp_open)
+ mask |= irnet_ctrl_poll(ap, file, wait);
+
+ DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
+ return mask;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * IOCtl : Called when someone does some ioctls on /dev/irnet
+ * This is the way pppd configure us and control us while the PPP
+ * instance is active.
+ */
+static long
+dev_irnet_ioctl(
+ struct file * file,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ irnet_socket * ap = file->private_data;
+ int err;
+ int val;
+ void __user *argp = (void __user *)arg;
+
+ DENTER(FS_TRACE, "(file=0x%p, ap=0x%p, cmd=0x%X)\n",
+ file, ap, cmd);
+
+ /* Basic checks... */
+ DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n");
+#ifdef SECURE_DEVIRNET
+ if(!capable(CAP_NET_ADMIN))
+ return -EPERM;
+#endif /* SECURE_DEVIRNET */
+
+ err = -EFAULT;
+ switch(cmd)
+ {
+ /* Set discipline (should be N_SYNC_PPP or N_TTY) */
+ case TIOCSETD:
+ if(get_user(val, (int __user *)argp))
+ break;
+ if((val == N_SYNC_PPP) || (val == N_PPP))
+ {
+ DEBUG(FS_INFO, "Entering PPP discipline.\n");
+ /* PPP channel setup (ap->chan in configured in dev_irnet_open())*/
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+ err = ppp_register_channel(&ap->chan);
+ if(err == 0)
+ {
+ /* Our ppp side is active */
+ ap->ppp_open = 1;
+
+ DEBUG(FS_INFO, "Trying to establish a connection.\n");
+ /* Setup the IrDA link now - may fail... */
+ irda_irnet_connect(ap);
+ }
+ else
+ DERROR(FS_ERROR, "Can't setup PPP channel...\n");
+
+ mutex_unlock(&ap->lock);
+ }
+ else
+ {
+ /* In theory, should be N_TTY */
+ DEBUG(FS_INFO, "Exiting PPP discipline.\n");
+ /* Disconnect from the generic PPP layer */
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+ if(ap->ppp_open)
+ {
+ ap->ppp_open = 0;
+ ppp_unregister_channel(&ap->chan);
+ }
+ else
+ DERROR(FS_ERROR, "Channel not registered !\n");
+ err = 0;
+
+ mutex_unlock(&ap->lock);
+ }
+ break;
+
+ /* Query PPP channel and unit number */
+ case PPPIOCGCHAN:
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+ if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan),
+ (int __user *)argp))
+ err = 0;
+
+ mutex_unlock(&ap->lock);
+ break;
+ case PPPIOCGUNIT:
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+ if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan),
+ (int __user *)argp))
+ err = 0;
+
+ mutex_unlock(&ap->lock);
+ break;
+
+ /* All these ioctls can be passed both directly and from ppp_generic,
+ * so we just deal with them in one place...
+ */
+ case PPPIOCGFLAGS:
+ case PPPIOCSFLAGS:
+ case PPPIOCGASYNCMAP:
+ case PPPIOCSASYNCMAP:
+ case PPPIOCGRASYNCMAP:
+ case PPPIOCSRASYNCMAP:
+ case PPPIOCGXASYNCMAP:
+ case PPPIOCSXASYNCMAP:
+ case PPPIOCGMRU:
+ case PPPIOCSMRU:
+ DEBUG(FS_INFO, "Standard PPP ioctl.\n");
+ if(!capable(CAP_NET_ADMIN))
+ err = -EPERM;
+ else {
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+ err = ppp_irnet_ioctl(&ap->chan, cmd, arg);
+
+ mutex_unlock(&ap->lock);
+ }
+ break;
+
+ /* TTY IOCTLs : Pretend that we are a tty, to keep pppd happy */
+ /* Get termios */
+ case TCGETS:
+ DEBUG(FS_INFO, "Get termios.\n");
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+#ifndef TCGETS2
+ if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios))
+ err = 0;
+#else
+ if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios))
+ err = 0;
+#endif
+
+ mutex_unlock(&ap->lock);
+ break;
+ /* Set termios */
+ case TCSETSF:
+ DEBUG(FS_INFO, "Set termios.\n");
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
+#ifndef TCGETS2
+ if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp))
+ err = 0;
+#else
+ if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp))
+ err = 0;
+#endif
+
+ mutex_unlock(&ap->lock);
+ break;
+
+ /* Set DTR/RTS */
+ case TIOCMBIS:
+ case TIOCMBIC:
+ /* Set exclusive/non-exclusive mode */
+ case TIOCEXCL:
+ case TIOCNXCL:
+ DEBUG(FS_INFO, "TTY compatibility.\n");
+ err = 0;
+ break;
+
+ case TCGETA:
+ DEBUG(FS_INFO, "TCGETA\n");
+ break;
+
+ case TCFLSH:
+ DEBUG(FS_INFO, "TCFLSH\n");
+ /* Note : this will flush buffers in PPP, so it *must* be done
+ * We should also worry that we don't accept junk here and that
+ * we get rid of our own buffers */
+#ifdef FLUSH_TO_PPP
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+ ppp_output_wakeup(&ap->chan);
+ mutex_unlock(&ap->lock);
+#endif /* FLUSH_TO_PPP */
+ err = 0;
+ break;
+
+ case FIONREAD:
+ DEBUG(FS_INFO, "FIONREAD\n");
+ val = 0;
+ if(put_user(val, (int __user *)argp))
+ break;
+ err = 0;
+ break;
+
+ default:
+ DERROR(FS_ERROR, "Unsupported ioctl (0x%X)\n", cmd);
+ err = -ENOTTY;
+ }
+
+ DEXIT(FS_TRACE, " - err = 0x%X\n", err);
+ return err;
+}
+
+/************************** PPP CALLBACKS **************************/
+/*
+ * This are the functions that the generic PPP driver in the kernel
+ * will call to communicate to us.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Prepare the ppp frame for transmission over the IrDA socket.
+ * We make sure that the header space is enough, and we change ppp header
+ * according to flags passed by pppd.
+ * This is not a callback, but just a helper function used in ppp_irnet_send()
+ */
+static inline struct sk_buff *
+irnet_prepare_skb(irnet_socket * ap,
+ struct sk_buff * skb)
+{
+ unsigned char * data;
+ int proto; /* PPP protocol */
+ int islcp; /* Protocol == LCP */
+ int needaddr; /* Need PPP address */
+
+ DENTER(PPP_TRACE, "(ap=0x%p, skb=0x%p)\n",
+ ap, skb);
+
+ /* Extract PPP protocol from the frame */
+ data = skb->data;
+ proto = (data[0] << 8) + data[1];
+
+ /* LCP packets with codes between 1 (configure-request)
+ * and 7 (code-reject) must be sent as though no options
+ * have been negotiated. */
+ islcp = (proto == PPP_LCP) && (1 <= data[2]) && (data[2] <= 7);
+
+ /* compress protocol field if option enabled */
+ if((data[0] == 0) && (ap->flags & SC_COMP_PROT) && (!islcp))
+ skb_pull(skb,1);
+
+ /* Check if we need address/control fields */
+ needaddr = 2*((ap->flags & SC_COMP_AC) == 0 || islcp);
+
+ /* Is the skb headroom large enough to contain all IrDA-headers? */
+ if((skb_headroom(skb) < (ap->max_header_size + needaddr)) ||
+ (skb_shared(skb)))
+ {
+ struct sk_buff * new_skb;
+
+ DEBUG(PPP_INFO, "Reallocating skb\n");
+
+ /* Create a new skb */
+ new_skb = skb_realloc_headroom(skb, ap->max_header_size + needaddr);
+
+ /* We have to free the original skb anyway */
+ dev_kfree_skb(skb);
+
+ /* Did the realloc succeed ? */
+ DABORT(new_skb == NULL, NULL, PPP_ERROR, "Could not realloc skb\n");
+
+ /* Use the new skb instead */
+ skb = new_skb;
+ }
+
+ /* prepend address/control fields if necessary */
+ if(needaddr)
+ {
+ skb_push(skb, 2);
+ skb->data[0] = PPP_ALLSTATIONS;
+ skb->data[1] = PPP_UI;
+ }
+
+ DEXIT(PPP_TRACE, "\n");
+
+ return skb;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Send a packet to the peer over the IrTTP connection.
+ * Returns 1 iff the packet was accepted.
+ * Returns 0 iff packet was not consumed.
+ * If the packet was not accepted, we will call ppp_output_wakeup
+ * at some later time to reactivate flow control in ppp_generic.
+ */
+static int
+ppp_irnet_send(struct ppp_channel * chan,
+ struct sk_buff * skb)
+{
+ irnet_socket * self = (struct irnet_socket *) chan->private;
+ int ret;
+
+ DENTER(PPP_TRACE, "(channel=0x%p, ap/self=0x%p)\n",
+ chan, self);
+
+ /* Check if things are somewhat valid... */
+ DASSERT(self != NULL, 0, PPP_ERROR, "Self is NULL !!!\n");
+
+ /* Check if we are connected */
+ if(!(test_bit(0, &self->ttp_open)))
+ {
+#ifdef CONNECT_IN_SEND
+ /* Let's try to connect one more time... */
+ /* Note : we won't be connected after this call, but we should be
+ * ready for next packet... */
+ /* If we are already connecting, this will fail */
+ irda_irnet_connect(self);
+#endif /* CONNECT_IN_SEND */
+
+ DEBUG(PPP_INFO, "IrTTP not ready ! (%ld-%ld)\n",
+ self->ttp_open, self->ttp_connect);
+
+ /* Note : we can either drop the packet or block the packet.
+ *
+ * Blocking the packet allow us a better connection time,
+ * because by calling ppp_output_wakeup() we can have
+ * ppp_generic resending the LCP request immediately to us,
+ * rather than waiting for one of pppd periodic transmission of
+ * LCP request.
+ *
+ * On the other hand, if we block all packet, all those periodic
+ * transmissions of pppd accumulate in ppp_generic, creating a
+ * backlog of LCP request. When we eventually connect later on,
+ * we have to transmit all this backlog before we can connect
+ * proper (if we don't timeout before).
+ *
+ * The current strategy is as follow :
+ * While we are attempting to connect, we block packets to get
+ * a better connection time.
+ * If we fail to connect, we drain the queue and start dropping packets
+ */
+#ifdef BLOCK_WHEN_CONNECT
+ /* If we are attempting to connect */
+ if(test_bit(0, &self->ttp_connect))
+ {
+ /* Blocking packet, ppp_generic will retry later */
+ return 0;
+ }
+#endif /* BLOCK_WHEN_CONNECT */
+
+ /* Dropping packet, pppd will retry later */
+ dev_kfree_skb(skb);
+ return 1;
+ }
+
+ /* Check if the queue can accept any packet, otherwise block */
+ if(self->tx_flow != FLOW_START)
+ DRETURN(0, PPP_INFO, "IrTTP queue full (%d skbs)...\n",
+ skb_queue_len(&self->tsap->tx_queue));
+
+ /* Prepare ppp frame for transmission */
+ skb = irnet_prepare_skb(self, skb);
+ DABORT(skb == NULL, 1, PPP_ERROR, "Prepare skb for Tx failed.\n");
+
+ /* Send the packet to IrTTP */
+ ret = irttp_data_request(self->tsap, skb);
+ if(ret < 0)
+ {
+ /*
+ * > IrTTPs tx queue is full, so we just have to
+ * > drop the frame! You might think that we should
+ * > just return -1 and don't deallocate the frame,
+ * > but that is dangerous since it's possible that
+ * > we have replaced the original skb with a new
+ * > one with larger headroom, and that would really
+ * > confuse do_dev_queue_xmit() in dev.c! I have
+ * > tried :-) DB
+ * Correction : we verify the flow control above (self->tx_flow),
+ * so we come here only if IrTTP doesn't like the packet (empty,
+ * too large, IrTTP not connected). In those rare cases, it's ok
+ * to drop it, we don't want to see it here again...
+ * Jean II
+ */
+ DERROR(PPP_ERROR, "IrTTP doesn't like this packet !!! (0x%X)\n", ret);
+ /* irttp_data_request already free the packet */
+ }
+
+ DEXIT(PPP_TRACE, "\n");
+ return 1; /* Packet has been consumed */
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Take care of the ioctls that ppp_generic doesn't want to deal with...
+ * Note : we are also called from dev_irnet_ioctl().
+ */
+static int
+ppp_irnet_ioctl(struct ppp_channel * chan,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ irnet_socket * ap = (struct irnet_socket *) chan->private;
+ int err;
+ int val;
+ u32 accm[8];
+ void __user *argp = (void __user *)arg;
+
+ DENTER(PPP_TRACE, "(channel=0x%p, ap=0x%p, cmd=0x%X)\n",
+ chan, ap, cmd);
+
+ /* Basic checks... */
+ DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n");
+
+ err = -EFAULT;
+ switch(cmd)
+ {
+ /* PPP flags */
+ case PPPIOCGFLAGS:
+ val = ap->flags | ap->rbits;
+ if(put_user(val, (int __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCSFLAGS:
+ if(get_user(val, (int __user *) argp))
+ break;
+ ap->flags = val & ~SC_RCV_BITS;
+ ap->rbits = val & SC_RCV_BITS;
+ err = 0;
+ break;
+
+ /* Async map stuff - all dummy to please pppd */
+ case PPPIOCGASYNCMAP:
+ if(put_user(ap->xaccm[0], (u32 __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCSASYNCMAP:
+ if(get_user(ap->xaccm[0], (u32 __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCGRASYNCMAP:
+ if(put_user(ap->raccm, (u32 __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCSRASYNCMAP:
+ if(get_user(ap->raccm, (u32 __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCGXASYNCMAP:
+ if(copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm)))
+ break;
+ err = 0;
+ break;
+ case PPPIOCSXASYNCMAP:
+ if(copy_from_user(accm, argp, sizeof(accm)))
+ break;
+ accm[2] &= ~0x40000000U; /* can't escape 0x5e */
+ accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */
+ memcpy(ap->xaccm, accm, sizeof(ap->xaccm));
+ err = 0;
+ break;
+
+ /* Max PPP frame size */
+ case PPPIOCGMRU:
+ if(put_user(ap->mru, (int __user *) argp))
+ break;
+ err = 0;
+ break;
+ case PPPIOCSMRU:
+ if(get_user(val, (int __user *) argp))
+ break;
+ if(val < PPP_MRU)
+ val = PPP_MRU;
+ ap->mru = val;
+ err = 0;
+ break;
+
+ default:
+ DEBUG(PPP_INFO, "Unsupported ioctl (0x%X)\n", cmd);
+ err = -ENOIOCTLCMD;
+ }
+
+ DEXIT(PPP_TRACE, " - err = 0x%X\n", err);
+ return err;
+}
+
+/************************** INITIALISATION **************************/
+/*
+ * Module initialisation and all that jazz...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Hook our device callbacks in the filesystem, to connect our code
+ * to /dev/irnet
+ */
+static inline int __init
+ppp_irnet_init(void)
+{
+ int err = 0;
+
+ DENTER(MODULE_TRACE, "()\n");
+
+ /* Allocate ourselves as a minor in the misc range */
+ err = misc_register(&irnet_misc_device);
+
+ DEXIT(MODULE_TRACE, "\n");
+ return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Cleanup at exit...
+ */
+static inline void __exit
+ppp_irnet_cleanup(void)
+{
+ DENTER(MODULE_TRACE, "()\n");
+
+ /* De-allocate /dev/irnet minor in misc range */
+ misc_deregister(&irnet_misc_device);
+
+ DEXIT(MODULE_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module main entry point
+ */
+static int __init
+irnet_init(void)
+{
+ int err;
+
+ /* Initialise both parts... */
+ err = irda_irnet_init();
+ if(!err)
+ err = ppp_irnet_init();
+ return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module exit
+ */
+static void __exit
+irnet_cleanup(void)
+{
+ irda_irnet_cleanup();
+ ppp_irnet_cleanup();
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module magic
+ */
+module_init(irnet_init);
+module_exit(irnet_cleanup);
+MODULE_AUTHOR("Jean Tourrilhes <jt@hpl.hp.com>");
+MODULE_DESCRIPTION("IrNET : Synchronous PPP over IrDA");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CHARDEV(10, 187);
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
new file mode 100644
index 000000000..940225866
--- /dev/null
+++ b/net/irda/irnet/irnet_ppp.h
@@ -0,0 +1,119 @@
+/*
+ * IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ * Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains all definitions and declarations necessary for the
+ * PPP part of the IrNET module.
+ * This file is a private header, so other modules don't want to know
+ * what's in there...
+ */
+
+#ifndef IRNET_PPP_H
+#define IRNET_PPP_H
+
+/***************************** INCLUDES *****************************/
+
+#include "irnet.h" /* Module global include */
+
+/************************ CONSTANTS & MACROS ************************/
+
+/* /dev/irnet file constants */
+#define IRNET_MAJOR 10 /* Misc range */
+#define IRNET_MINOR 187 /* Official allocation */
+
+/* IrNET control channel stuff */
+#define IRNET_MAX_COMMAND 256 /* Max length of a command line */
+
+/* PPP hardcore stuff */
+
+/* Bits in rbits (PPP flags in irnet struct) */
+#define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
+
+/* Bit numbers in busy */
+#define XMIT_BUSY 0
+#define RECV_BUSY 1
+#define XMIT_WAKEUP 2
+#define XMIT_FULL 3
+
+/* Queue management */
+#define PPPSYNC_MAX_RQLEN 32 /* arbitrary */
+
+/****************************** TYPES ******************************/
+
+
+/**************************** PROTOTYPES ****************************/
+
+/* ----------------------- CONTROL CHANNEL ----------------------- */
+static inline ssize_t
+ irnet_ctrl_write(irnet_socket *,
+ const char *,
+ size_t);
+static inline ssize_t
+ irnet_ctrl_read(irnet_socket *,
+ struct file *,
+ char *,
+ size_t);
+static inline unsigned int
+ irnet_ctrl_poll(irnet_socket *,
+ struct file *,
+ poll_table *);
+/* ----------------------- CHARACTER DEVICE ----------------------- */
+static int
+ dev_irnet_open(struct inode *, /* fs callback : open */
+ struct file *),
+ dev_irnet_close(struct inode *,
+ struct file *);
+static ssize_t
+ dev_irnet_write(struct file *,
+ const char __user *,
+ size_t,
+ loff_t *),
+ dev_irnet_read(struct file *,
+ char __user *,
+ size_t,
+ loff_t *);
+static unsigned int
+ dev_irnet_poll(struct file *,
+ poll_table *);
+static long
+ dev_irnet_ioctl(struct file *,
+ unsigned int,
+ unsigned long);
+/* ------------------------ PPP INTERFACE ------------------------ */
+static inline struct sk_buff *
+ irnet_prepare_skb(irnet_socket *,
+ struct sk_buff *);
+static int
+ ppp_irnet_send(struct ppp_channel *,
+ struct sk_buff *);
+static int
+ ppp_irnet_ioctl(struct ppp_channel *,
+ unsigned int,
+ unsigned long);
+
+/**************************** VARIABLES ****************************/
+
+/* Filesystem callbacks (to call us) */
+static const struct file_operations irnet_device_fops =
+{
+ .owner = THIS_MODULE,
+ .read = dev_irnet_read,
+ .write = dev_irnet_write,
+ .poll = dev_irnet_poll,
+ .unlocked_ioctl = dev_irnet_ioctl,
+ .open = dev_irnet_open,
+ .release = dev_irnet_close,
+ .llseek = noop_llseek,
+ /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */
+};
+
+/* Structure so that the misc major (drivers/char/misc.c) take care of us... */
+static struct miscdevice irnet_misc_device =
+{
+ IRNET_MINOR,
+ "irnet",
+ &irnet_device_fops
+};
+
+#endif /* IRNET_PPP_H */
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
new file mode 100644
index 000000000..e15c40e86
--- /dev/null
+++ b/net/irda/irnetlink.c
@@ -0,0 +1,158 @@
+/*
+ * IrDA netlink layer, for stack configuration.
+ *
+ * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org>
+ *
+ * Partly based on the 802.11 nelink implementation
+ * (see net/wireless/nl80211.c) which is:
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/socket.h>
+#include <linux/irda.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/genetlink.h>
+
+
+
+static struct genl_family irda_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = IRDA_NL_NAME,
+ .hdrsize = 0,
+ .version = IRDA_NL_VERSION,
+ .maxattr = IRDA_NL_CMD_MAX,
+};
+
+static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
+{
+ char * ifname;
+
+ if (!info->attrs[IRDA_NL_ATTR_IFNAME])
+ return NULL;
+
+ ifname = nla_data(info->attrs[IRDA_NL_ATTR_IFNAME]);
+
+ pr_debug("%s(): Looking for %s\n", __func__, ifname);
+
+ return dev_get_by_name(net, ifname);
+}
+
+static int irda_nl_set_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device * dev;
+ struct irlap_cb * irlap;
+ u32 mode;
+
+ if (!info->attrs[IRDA_NL_ATTR_MODE])
+ return -EINVAL;
+
+ mode = nla_get_u32(info->attrs[IRDA_NL_ATTR_MODE]);
+
+ pr_debug("%s(): Switching to mode: %d\n", __func__, mode);
+
+ dev = ifname_to_netdev(&init_net, info);
+ if (!dev)
+ return -ENODEV;
+
+ irlap = (struct irlap_cb *)dev->atalk_ptr;
+ if (!irlap) {
+ dev_put(dev);
+ return -ENODEV;
+ }
+
+ irlap->mode = mode;
+
+ dev_put(dev);
+
+ return 0;
+}
+
+static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device * dev;
+ struct irlap_cb * irlap;
+ struct sk_buff *msg;
+ void *hdr;
+ int ret = -ENOBUFS;
+
+ dev = ifname_to_netdev(&init_net, info);
+ if (!dev)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ dev_put(dev);
+ return -ENOMEM;
+ }
+
+ irlap = (struct irlap_cb *)dev->atalk_ptr;
+ if (!irlap) {
+ ret = -ENODEV;
+ goto err_out;
+ }
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &irda_nl_family, 0, IRDA_NL_CMD_GET_MODE);
+ if (hdr == NULL) {
+ ret = -EMSGSIZE;
+ goto err_out;
+ }
+
+ if(nla_put_string(msg, IRDA_NL_ATTR_IFNAME,
+ dev->name))
+ goto err_out;
+
+ if(nla_put_u32(msg, IRDA_NL_ATTR_MODE, irlap->mode))
+ goto err_out;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+ err_out:
+ nlmsg_free(msg);
+ dev_put(dev);
+
+ return ret;
+}
+
+static const struct nla_policy irda_nl_policy[IRDA_NL_ATTR_MAX + 1] = {
+ [IRDA_NL_ATTR_IFNAME] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ-1 },
+ [IRDA_NL_ATTR_MODE] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops irda_nl_ops[] = {
+ {
+ .cmd = IRDA_NL_CMD_SET_MODE,
+ .doit = irda_nl_set_mode,
+ .policy = irda_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IRDA_NL_CMD_GET_MODE,
+ .doit = irda_nl_get_mode,
+ .policy = irda_nl_policy,
+ /* can be retrieved by unprivileged users */
+ },
+
+};
+
+int irda_nl_register(void)
+{
+ return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
+}
+
+void irda_nl_unregister(void)
+{
+ genl_unregister_family(&irda_nl_family);
+}
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
new file mode 100644
index 000000000..b9ac598e2
--- /dev/null
+++ b/net/irda/irproc.c
@@ -0,0 +1,97 @@
+/*********************************************************************
+ *
+ * Filename: irproc.c
+ * Version: 1.0
+ * Description: Various entries in the /proc file system
+ * Status: Experimental.
+ * Author: Thomas Davis, <ratbert@radiks.net>
+ * Created at: Sat Feb 21 21:33:24 1998
+ * Modified at: Sun Nov 14 08:54:54 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-1999, Dag Brattli <dagb@cs.uit.no>
+ * Copyright (c) 1998, Thomas Davis, <ratbert@radiks.net>,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * I, Thomas Davis, provide no warranty for any of this software.
+ * This material is provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+
+extern const struct file_operations discovery_seq_fops;
+extern const struct file_operations irlap_seq_fops;
+extern const struct file_operations irlmp_seq_fops;
+extern const struct file_operations irttp_seq_fops;
+extern const struct file_operations irias_seq_fops;
+
+struct irda_entry {
+ const char *name;
+ const struct file_operations *fops;
+};
+
+struct proc_dir_entry *proc_irda;
+EXPORT_SYMBOL(proc_irda);
+
+static const struct irda_entry irda_dirs[] = {
+ {"discovery", &discovery_seq_fops},
+ {"irttp", &irttp_seq_fops},
+ {"irlmp", &irlmp_seq_fops},
+ {"irlap", &irlap_seq_fops},
+ {"irias", &irias_seq_fops},
+};
+
+/*
+ * Function irda_proc_register (void)
+ *
+ * Register irda entry in /proc file system
+ *
+ */
+void __init irda_proc_register(void)
+{
+ int i;
+
+ proc_irda = proc_mkdir("irda", init_net.proc_net);
+ if (proc_irda == NULL)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(irda_dirs); i++)
+ (void) proc_create(irda_dirs[i].name, 0, proc_irda,
+ irda_dirs[i].fops);
+}
+
+/*
+ * Function irda_proc_unregister (void)
+ *
+ * Unregister irda entry in /proc file system
+ *
+ */
+void irda_proc_unregister(void)
+{
+ int i;
+
+ if (proc_irda) {
+ for (i=0; i<ARRAY_SIZE(irda_dirs); i++)
+ remove_proc_entry(irda_dirs[i].name, proc_irda);
+
+ remove_proc_entry("irda", init_net.proc_net);
+ proc_irda = NULL;
+ }
+}
+
+
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
new file mode 100644
index 000000000..acbe61c7e
--- /dev/null
+++ b/net/irda/irqueue.c
@@ -0,0 +1,913 @@
+/*********************************************************************
+ *
+ * Filename: irqueue.c
+ * Version: 0.3
+ * Description: General queue implementation
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Jun 9 13:29:31 1998
+ * Modified at: Sun Dec 12 13:48:22 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Modified at: Thu Jan 4 14:29:10 CET 2001
+ * Modified by: Marc Zyngier <mzyngier@freesurf.fr>
+ *
+ * Copyright (C) 1998-1999, Aage Kvalnes <aage@cs.uit.no>
+ * Copyright (C) 1998, Dag Brattli,
+ * All Rights Reserved.
+ *
+ * This code is taken from the Vortex Operating System written by Aage
+ * Kvalnes. Aage has agreed that this code can use the GPL licence,
+ * although he does not use that licence in his own code.
+ *
+ * This copyright does however _not_ include the ELF hash() function
+ * which I currently don't know which licence or copyright it
+ * has. Please inform me if you know.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+/*
+ * NOTE :
+ * There are various problems with this package :
+ * o the hash function for ints is pathetic (but could be changed)
+ * o locking is sometime suspicious (especially during enumeration)
+ * o most users have only a few elements (== overhead)
+ * o most users never use search, so don't benefit from hashing
+ * Problem already fixed :
+ * o not 64 bit compliant (most users do hashv = (int) self)
+ * o hashbin_remove() is broken => use hashbin_remove_this()
+ * I think most users would be better served by a simple linked list
+ * (like include/linux/list.h) with a global spinlock per list.
+ * Jean II
+ */
+
+/*
+ * Notes on the concurrent access to hashbin and other SMP issues
+ * -------------------------------------------------------------
+ * Hashbins are very often in the IrDA stack a global repository of
+ * information, and therefore used in a very asynchronous manner following
+ * various events (driver calls, timers, user calls...).
+ * Therefore, very often it is highly important to consider the
+ * management of concurrent access to the hashbin and how to guarantee the
+ * consistency of the operations on it.
+ *
+ * First, we need to define the objective of locking :
+ * 1) Protect user data (content pointed by the hashbin)
+ * 2) Protect hashbin structure itself (linked list in each bin)
+ *
+ * OLD LOCKING
+ * -----------
+ *
+ * The previous locking strategy, either HB_LOCAL or HB_GLOBAL were
+ * both inadequate in *both* aspect.
+ * o HB_GLOBAL was using a spinlock for each bin (local locking).
+ * o HB_LOCAL was disabling irq on *all* CPUs, so use a single
+ * global semaphore.
+ * The problems were :
+ * A) Global irq disabling is no longer supported by the kernel
+ * B) No protection for the hashbin struct global data
+ * o hashbin_delete()
+ * o hb_current
+ * C) No protection for user data in some cases
+ *
+ * A) HB_LOCAL use global irq disabling, so doesn't work on kernel
+ * 2.5.X. Even when it is supported (kernel 2.4.X and earlier), its
+ * performance is not satisfactory on SMP setups. Most hashbins were
+ * HB_LOCAL, so (A) definitely need fixing.
+ * B) HB_LOCAL could be modified to fix (B). However, because HB_GLOBAL
+ * lock only the individual bins, it will never be able to lock the
+ * global data, so can't do (B).
+ * C) Some functions return pointer to data that is still in the
+ * hashbin :
+ * o hashbin_find()
+ * o hashbin_get_first()
+ * o hashbin_get_next()
+ * As the data is still in the hashbin, it may be changed or free'd
+ * while the caller is examinimg the data. In those case, locking can't
+ * be done within the hashbin, but must include use of the data within
+ * the caller.
+ * The caller can easily do this with HB_LOCAL (just disable irqs).
+ * However, this is impossible with HB_GLOBAL because the caller has no
+ * way to know the proper bin, so don't know which spinlock to use.
+ *
+ * Quick summary : can no longer use HB_LOCAL, and HB_GLOBAL is
+ * fundamentally broken and will never work.
+ *
+ * NEW LOCKING
+ * -----------
+ *
+ * To fix those problems, I've introduce a few changes in the
+ * hashbin locking :
+ * 1) New HB_LOCK scheme
+ * 2) hashbin->hb_spinlock
+ * 3) New hashbin usage policy
+ *
+ * HB_LOCK :
+ * -------
+ * HB_LOCK is a locking scheme intermediate between the old HB_LOCAL
+ * and HB_GLOBAL. It uses a single spinlock to protect the whole content
+ * of the hashbin. As it is a single spinlock, it can protect the global
+ * data of the hashbin and not only the bins themselves.
+ * HB_LOCK can only protect some of the hashbin calls, so it only lock
+ * call that can be made 100% safe and leave other call unprotected.
+ * HB_LOCK in theory is slower than HB_GLOBAL, but as the hashbin
+ * content is always small contention is not high, so it doesn't matter
+ * much. HB_LOCK is probably faster than HB_LOCAL.
+ *
+ * hashbin->hb_spinlock :
+ * --------------------
+ * The spinlock that HB_LOCK uses is available for caller, so that
+ * the caller can protect unprotected calls (see below).
+ * If the caller want to do entirely its own locking (HB_NOLOCK), he
+ * can do so and may use safely this spinlock.
+ * Locking is done like this :
+ * spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ * Releasing the lock :
+ * spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ *
+ * Safe & Protected calls :
+ * ----------------------
+ * The following calls are safe or protected via HB_LOCK :
+ * o hashbin_new() -> safe
+ * o hashbin_delete()
+ * o hashbin_insert()
+ * o hashbin_remove_first()
+ * o hashbin_remove()
+ * o hashbin_remove_this()
+ * o HASHBIN_GET_SIZE() -> atomic
+ *
+ * The following calls only protect the hashbin itself :
+ * o hashbin_lock_find()
+ * o hashbin_find_next()
+ *
+ * Unprotected calls :
+ * -----------------
+ * The following calls need to be protected by the caller :
+ * o hashbin_find()
+ * o hashbin_get_first()
+ * o hashbin_get_next()
+ *
+ * Locking Policy :
+ * --------------
+ * If the hashbin is used only in a single thread of execution
+ * (explicitly or implicitely), you can use HB_NOLOCK
+ * If the calling module already provide concurrent access protection,
+ * you may use HB_NOLOCK.
+ *
+ * In all other cases, you need to use HB_LOCK and lock the hashbin
+ * every time before calling one of the unprotected calls. You also must
+ * use the pointer returned by the unprotected call within the locked
+ * region.
+ *
+ * Extra care for enumeration :
+ * --------------------------
+ * hashbin_get_first() and hashbin_get_next() use the hashbin to
+ * store the current position, in hb_current.
+ * As long as the hashbin remains locked, this is safe. If you unlock
+ * the hashbin, the current position may change if anybody else modify
+ * or enumerate the hashbin.
+ * Summary : do the full enumeration while locked.
+ *
+ * Alternatively, you may use hashbin_find_next(). But, this will
+ * be slower, is more complex to use and doesn't protect the hashbin
+ * content. So, care is needed here as well.
+ *
+ * Other issues :
+ * ------------
+ * I believe that we are overdoing it by using spin_lock_irqsave()
+ * and we should use only spin_lock_bh() or similar. But, I don't have
+ * the balls to try it out.
+ * Don't believe that because hashbin are now (somewhat) SMP safe
+ * that the rest of the code is. Higher layers tend to be safest,
+ * but LAP and LMP would need some serious dedicated love.
+ *
+ * Jean II
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irqueue.h>
+
+/************************ QUEUE SUBROUTINES ************************/
+
+/*
+ * Hashbin
+ */
+#define GET_HASHBIN(x) ( x & HASHBIN_MASK )
+
+/*
+ * Function hash (name)
+ *
+ * This function hash the input string 'name' using the ELF hash
+ * function for strings.
+ */
+static __u32 hash( const char* name)
+{
+ __u32 h = 0;
+ __u32 g;
+
+ while(*name) {
+ h = (h<<4) + *name++;
+ if ((g = (h & 0xf0000000)))
+ h ^=g>>24;
+ h &=~g;
+ }
+ return h;
+}
+
+/*
+ * Function enqueue_first (queue, proc)
+ *
+ * Insert item first in queue.
+ *
+ */
+static void enqueue_first(irda_queue_t **queue, irda_queue_t* element)
+{
+
+ /*
+ * Check if queue is empty.
+ */
+ if ( *queue == NULL ) {
+ /*
+ * Queue is empty. Insert one element into the queue.
+ */
+ element->q_next = element->q_prev = *queue = element;
+
+ } else {
+ /*
+ * Queue is not empty. Insert element into front of queue.
+ */
+ element->q_next = (*queue);
+ (*queue)->q_prev->q_next = element;
+ element->q_prev = (*queue)->q_prev;
+ (*queue)->q_prev = element;
+ (*queue) = element;
+ }
+}
+
+
+/*
+ * Function dequeue (queue)
+ *
+ * Remove first entry in queue
+ *
+ */
+static irda_queue_t *dequeue_first(irda_queue_t **queue)
+{
+ irda_queue_t *ret;
+
+ pr_debug("dequeue_first()\n");
+
+ /*
+ * Set return value
+ */
+ ret = *queue;
+
+ if ( *queue == NULL ) {
+ /*
+ * Queue was empty.
+ */
+ } else if ( (*queue)->q_next == *queue ) {
+ /*
+ * Queue only contained a single element. It will now be
+ * empty.
+ */
+ *queue = NULL;
+ } else {
+ /*
+ * Queue contained several element. Remove the first one.
+ */
+ (*queue)->q_prev->q_next = (*queue)->q_next;
+ (*queue)->q_next->q_prev = (*queue)->q_prev;
+ *queue = (*queue)->q_next;
+ }
+
+ /*
+ * Return the removed entry (or NULL of queue was empty).
+ */
+ return ret;
+}
+
+/*
+ * Function dequeue_general (queue, element)
+ *
+ *
+ */
+static irda_queue_t *dequeue_general(irda_queue_t **queue, irda_queue_t* element)
+{
+ irda_queue_t *ret;
+
+ pr_debug("dequeue_general()\n");
+
+ /*
+ * Set return value
+ */
+ ret = *queue;
+
+ if ( *queue == NULL ) {
+ /*
+ * Queue was empty.
+ */
+ } else if ( (*queue)->q_next == *queue ) {
+ /*
+ * Queue only contained a single element. It will now be
+ * empty.
+ */
+ *queue = NULL;
+
+ } else {
+ /*
+ * Remove specific element.
+ */
+ element->q_prev->q_next = element->q_next;
+ element->q_next->q_prev = element->q_prev;
+ if ( (*queue) == element)
+ (*queue) = element->q_next;
+ }
+
+ /*
+ * Return the removed entry (or NULL of queue was empty).
+ */
+ return ret;
+}
+
+/************************ HASHBIN MANAGEMENT ************************/
+
+/*
+ * Function hashbin_create ( type, name )
+ *
+ * Create hashbin!
+ *
+ */
+hashbin_t *hashbin_new(int type)
+{
+ hashbin_t* hashbin;
+
+ /*
+ * Allocate new hashbin
+ */
+ hashbin = kzalloc(sizeof(*hashbin), GFP_ATOMIC);
+ if (!hashbin)
+ return NULL;
+
+ /*
+ * Initialize structure
+ */
+ hashbin->hb_type = type;
+ hashbin->magic = HB_MAGIC;
+ //hashbin->hb_current = NULL;
+
+ /* Make sure all spinlock's are unlocked */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_init(&hashbin->hb_spinlock);
+ }
+
+ return hashbin;
+}
+EXPORT_SYMBOL(hashbin_new);
+
+
+/*
+ * Function hashbin_delete (hashbin, free_func)
+ *
+ * Destroy hashbin, the free_func can be a user supplied special routine
+ * for deallocating this structure if it's complex. If not the user can
+ * just supply kfree, which should take care of the job.
+ */
+#ifdef CONFIG_LOCKDEP
+static int hashbin_lock_depth = 0;
+#endif
+int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
+{
+ irda_queue_t* queue;
+ unsigned long flags = 0;
+ int i;
+
+ IRDA_ASSERT(hashbin != NULL, return -1;);
+ IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
+
+ /* Synchronize */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_irqsave_nested(&hashbin->hb_spinlock, flags,
+ hashbin_lock_depth++);
+ }
+
+ /*
+ * Free the entries in the hashbin, TODO: use hashbin_clear when
+ * it has been shown to work
+ */
+ for (i = 0; i < HASHBIN_SIZE; i ++ ) {
+ queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]);
+ while (queue ) {
+ if (free_func)
+ (*free_func)(queue);
+ queue = dequeue_first(
+ (irda_queue_t**) &hashbin->hb_queue[i]);
+ }
+ }
+
+ /* Cleanup local data */
+ hashbin->hb_current = NULL;
+ hashbin->magic = ~HB_MAGIC;
+
+ /* Release lock */
+ if ( hashbin->hb_type & HB_LOCK) {
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+#ifdef CONFIG_LOCKDEP
+ hashbin_lock_depth--;
+#endif
+ }
+
+ /*
+ * Free the hashbin structure
+ */
+ kfree(hashbin);
+
+ return 0;
+}
+EXPORT_SYMBOL(hashbin_delete);
+
+/********************* HASHBIN LIST OPERATIONS *********************/
+
+/*
+ * Function hashbin_insert (hashbin, entry, name)
+ *
+ * Insert an entry into the hashbin
+ *
+ */
+void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv,
+ const char* name)
+{
+ unsigned long flags = 0;
+ int bin;
+
+ IRDA_ASSERT( hashbin != NULL, return;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return;);
+
+ /*
+ * Locate hashbin
+ */
+ if ( name )
+ hashv = hash( name );
+ bin = GET_HASHBIN( hashv );
+
+ /* Synchronize */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ /*
+ * Store name and key
+ */
+ entry->q_hash = hashv;
+ if ( name )
+ strlcpy( entry->q_name, name, sizeof(entry->q_name));
+
+ /*
+ * Insert new entry first
+ */
+ enqueue_first( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+ entry);
+ hashbin->hb_size++;
+
+ /* Release lock */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+}
+EXPORT_SYMBOL(hashbin_insert);
+
+/*
+ * Function hashbin_remove_first (hashbin)
+ *
+ * Remove first entry of the hashbin
+ *
+ * Note : this function no longer use hashbin_remove(), but does things
+ * similar to hashbin_remove_this(), so can be considered safe.
+ * Jean II
+ */
+void *hashbin_remove_first( hashbin_t *hashbin)
+{
+ unsigned long flags = 0;
+ irda_queue_t *entry = NULL;
+
+ /* Synchronize */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ entry = hashbin_get_first( hashbin);
+ if ( entry != NULL) {
+ int bin;
+ long hashv;
+ /*
+ * Locate hashbin
+ */
+ hashv = entry->q_hash;
+ bin = GET_HASHBIN( hashv );
+
+ /*
+ * Dequeue the entry...
+ */
+ dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+ entry);
+ hashbin->hb_size--;
+ entry->q_next = NULL;
+ entry->q_prev = NULL;
+
+ /*
+ * Check if this item is the currently selected item, and in
+ * that case we must reset hb_current
+ */
+ if ( entry == hashbin->hb_current)
+ hashbin->hb_current = NULL;
+ }
+
+ /* Release lock */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ return entry;
+}
+
+
+/*
+ * Function hashbin_remove (hashbin, hashv, name)
+ *
+ * Remove entry with the given name
+ *
+ * The use of this function is highly discouraged, because the whole
+ * concept behind hashbin_remove() is broken. In many cases, it's not
+ * possible to guarantee the unicity of the index (either hashv or name),
+ * leading to removing the WRONG entry.
+ * The only simple safe use is :
+ * hashbin_remove(hasbin, (int) self, NULL);
+ * In other case, you must think hard to guarantee unicity of the index.
+ * Jean II
+ */
+void* hashbin_remove( hashbin_t* hashbin, long hashv, const char* name)
+{
+ int bin, found = FALSE;
+ unsigned long flags = 0;
+ irda_queue_t* entry;
+
+ IRDA_ASSERT( hashbin != NULL, return NULL;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+ /*
+ * Locate hashbin
+ */
+ if ( name )
+ hashv = hash( name );
+ bin = GET_HASHBIN( hashv );
+
+ /* Synchronize */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ /*
+ * Search for entry
+ */
+ entry = hashbin->hb_queue[ bin ];
+ if ( entry ) {
+ do {
+ /*
+ * Check for key
+ */
+ if ( entry->q_hash == hashv ) {
+ /*
+ * Name compare too?
+ */
+ if ( name ) {
+ if ( strcmp( entry->q_name, name) == 0)
+ {
+ found = TRUE;
+ break;
+ }
+ } else {
+ found = TRUE;
+ break;
+ }
+ }
+ entry = entry->q_next;
+ } while ( entry != hashbin->hb_queue[ bin ] );
+ }
+
+ /*
+ * If entry was found, dequeue it
+ */
+ if ( found ) {
+ dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+ entry);
+ hashbin->hb_size--;
+
+ /*
+ * Check if this item is the currently selected item, and in
+ * that case we must reset hb_current
+ */
+ if ( entry == hashbin->hb_current)
+ hashbin->hb_current = NULL;
+ }
+
+ /* Release lock */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+
+ /* Return */
+ if ( found )
+ return entry;
+ else
+ return NULL;
+
+}
+EXPORT_SYMBOL(hashbin_remove);
+
+/*
+ * Function hashbin_remove_this (hashbin, entry)
+ *
+ * Remove entry with the given name
+ *
+ * In some cases, the user of hashbin can't guarantee the unicity
+ * of either the hashv or name.
+ * In those cases, using the above function is guaranteed to cause troubles,
+ * so we use this one instead...
+ * And by the way, it's also faster, because we skip the search phase ;-)
+ */
+void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry)
+{
+ unsigned long flags = 0;
+ int bin;
+ long hashv;
+
+ IRDA_ASSERT( hashbin != NULL, return NULL;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+ IRDA_ASSERT( entry != NULL, return NULL;);
+
+ /* Synchronize */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ /* Check if valid and not already removed... */
+ if((entry->q_next == NULL) || (entry->q_prev == NULL)) {
+ entry = NULL;
+ goto out;
+ }
+
+ /*
+ * Locate hashbin
+ */
+ hashv = entry->q_hash;
+ bin = GET_HASHBIN( hashv );
+
+ /*
+ * Dequeue the entry...
+ */
+ dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+ entry);
+ hashbin->hb_size--;
+ entry->q_next = NULL;
+ entry->q_prev = NULL;
+
+ /*
+ * Check if this item is the currently selected item, and in
+ * that case we must reset hb_current
+ */
+ if ( entry == hashbin->hb_current)
+ hashbin->hb_current = NULL;
+out:
+ /* Release lock */
+ if ( hashbin->hb_type & HB_LOCK ) {
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ } /* Default is no-lock */
+
+ return entry;
+}
+EXPORT_SYMBOL(hashbin_remove_this);
+
+/*********************** HASHBIN ENUMERATION ***********************/
+
+/*
+ * Function hashbin_common_find (hashbin, hashv, name)
+ *
+ * Find item with the given hashv or name
+ *
+ */
+void* hashbin_find( hashbin_t* hashbin, long hashv, const char* name )
+{
+ int bin;
+ irda_queue_t* entry;
+
+ pr_debug("hashbin_find()\n");
+
+ IRDA_ASSERT( hashbin != NULL, return NULL;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+ /*
+ * Locate hashbin
+ */
+ if ( name )
+ hashv = hash( name );
+ bin = GET_HASHBIN( hashv );
+
+ /*
+ * Search for entry
+ */
+ entry = hashbin->hb_queue[ bin];
+ if ( entry ) {
+ do {
+ /*
+ * Check for key
+ */
+ if ( entry->q_hash == hashv ) {
+ /*
+ * Name compare too?
+ */
+ if ( name ) {
+ if ( strcmp( entry->q_name, name ) == 0 ) {
+ return entry;
+ }
+ } else {
+ return entry;
+ }
+ }
+ entry = entry->q_next;
+ } while ( entry != hashbin->hb_queue[ bin ] );
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(hashbin_find);
+
+/*
+ * Function hashbin_lock_find (hashbin, hashv, name)
+ *
+ * Find item with the given hashv or name
+ *
+ * Same, but with spinlock protection...
+ * I call it safe, but it's only safe with respect to the hashbin, not its
+ * content. - Jean II
+ */
+void* hashbin_lock_find( hashbin_t* hashbin, long hashv, const char* name )
+{
+ unsigned long flags = 0;
+ irda_queue_t* entry;
+
+ /* Synchronize */
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+
+ /*
+ * Search for entry
+ */
+ entry = hashbin_find(hashbin, hashv, name);
+
+ /* Release lock */
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+
+ return entry;
+}
+EXPORT_SYMBOL(hashbin_lock_find);
+
+/*
+ * Function hashbin_find (hashbin, hashv, name, pnext)
+ *
+ * Find an item with the given hashv or name, and its successor
+ *
+ * This function allow to do concurrent enumerations without the
+ * need to lock over the whole session, because the caller keep the
+ * context of the search. On the other hand, it might fail and return
+ * NULL if the entry is removed. - Jean II
+ */
+void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
+ void ** pnext)
+{
+ unsigned long flags = 0;
+ irda_queue_t* entry;
+
+ /* Synchronize */
+ spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+
+ /*
+ * Search for current entry
+ * This allow to check if the current item is still in the
+ * hashbin or has been removed.
+ */
+ entry = hashbin_find(hashbin, hashv, name);
+
+ /*
+ * Trick hashbin_get_next() to return what we want
+ */
+ if(entry) {
+ hashbin->hb_current = entry;
+ *pnext = hashbin_get_next( hashbin );
+ } else
+ *pnext = NULL;
+
+ /* Release lock */
+ spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+
+ return entry;
+}
+
+/*
+ * Function hashbin_get_first (hashbin)
+ *
+ * Get a pointer to first element in hashbin, this function must be
+ * called before any calls to hashbin_get_next()!
+ *
+ */
+irda_queue_t *hashbin_get_first( hashbin_t* hashbin)
+{
+ irda_queue_t *entry;
+ int i;
+
+ IRDA_ASSERT( hashbin != NULL, return NULL;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+ if ( hashbin == NULL)
+ return NULL;
+
+ for ( i = 0; i < HASHBIN_SIZE; i ++ ) {
+ entry = hashbin->hb_queue[ i];
+ if ( entry) {
+ hashbin->hb_current = entry;
+ return entry;
+ }
+ }
+ /*
+ * Did not find any item in hashbin
+ */
+ return NULL;
+}
+EXPORT_SYMBOL(hashbin_get_first);
+
+/*
+ * Function hashbin_get_next (hashbin)
+ *
+ * Get next item in hashbin. A series of hashbin_get_next() calls must
+ * be started by a call to hashbin_get_first(). The function returns
+ * NULL when all items have been traversed
+ *
+ * The context of the search is stored within the hashbin, so you must
+ * protect yourself from concurrent enumerations. - Jean II
+ */
+irda_queue_t *hashbin_get_next( hashbin_t *hashbin)
+{
+ irda_queue_t* entry;
+ int bin;
+ int i;
+
+ IRDA_ASSERT( hashbin != NULL, return NULL;);
+ IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+ if ( hashbin->hb_current == NULL) {
+ IRDA_ASSERT( hashbin->hb_current != NULL, return NULL;);
+ return NULL;
+ }
+ entry = hashbin->hb_current->q_next;
+ bin = GET_HASHBIN( entry->q_hash);
+
+ /*
+ * Make sure that we are not back at the beginning of the queue
+ * again
+ */
+ if ( entry != hashbin->hb_queue[ bin ]) {
+ hashbin->hb_current = entry;
+
+ return entry;
+ }
+
+ /*
+ * Check that this is not the last queue in hashbin
+ */
+ if ( bin >= HASHBIN_SIZE)
+ return NULL;
+
+ /*
+ * Move to next queue in hashbin
+ */
+ bin++;
+ for ( i = bin; i < HASHBIN_SIZE; i++ ) {
+ entry = hashbin->hb_queue[ i];
+ if ( entry) {
+ hashbin->hb_current = entry;
+
+ return entry;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(hashbin_get_next);
diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c
new file mode 100644
index 000000000..873da5e7d
--- /dev/null
+++ b/net/irda/irsysctl.c
@@ -0,0 +1,258 @@
+/*********************************************************************
+ *
+ * Filename: irsysctl.c
+ * Version: 1.0
+ * Description: Sysctl interface for IrDA
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun May 24 22:12:06 1998
+ * Modified at: Fri Jun 4 02:50:15 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1997, 1999 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+#include <net/irda/irda.h> /* irda_debug */
+#include <net/irda/irlmp.h>
+#include <net/irda/timer.h>
+#include <net/irda/irias_object.h>
+
+extern int sysctl_discovery;
+extern int sysctl_discovery_slots;
+extern int sysctl_discovery_timeout;
+extern int sysctl_slot_timeout;
+extern int sysctl_fast_poll_increase;
+extern char sysctl_devname[];
+extern int sysctl_max_baud_rate;
+extern unsigned int sysctl_min_tx_turn_time;
+extern unsigned int sysctl_max_tx_data_size;
+extern unsigned int sysctl_max_tx_window;
+extern int sysctl_max_noreply_time;
+extern int sysctl_warn_noreply_time;
+extern int sysctl_lap_keepalive_time;
+
+extern struct irlmp_cb *irlmp;
+
+/* this is needed for the proc_dointvec_minmax - Jean II */
+static int max_discovery_slots = 16; /* ??? */
+static int min_discovery_slots = 1;
+/* IrLAP 6.13.2 says 25ms to 10+70ms - allow higher since some devices
+ * seems to require it. (from Dag's comment) */
+static int max_slot_timeout = 160;
+static int min_slot_timeout = 20;
+static int max_max_baud_rate = 16000000; /* See qos.c - IrLAP spec */
+static int min_max_baud_rate = 2400;
+static int max_min_tx_turn_time = 10000; /* See qos.c - IrLAP spec */
+static int min_min_tx_turn_time;
+static int max_max_tx_data_size = 2048; /* See qos.c - IrLAP spec */
+static int min_max_tx_data_size = 64;
+static int max_max_tx_window = 7; /* See qos.c - IrLAP spec */
+static int min_max_tx_window = 1;
+static int max_max_noreply_time = 40; /* See qos.c - IrLAP spec */
+static int min_max_noreply_time = 3;
+static int max_warn_noreply_time = 3; /* 3s == standard */
+static int min_warn_noreply_time = 1; /* 1s == min WD_TIMER */
+static int max_lap_keepalive_time = 10000; /* 10s */
+static int min_lap_keepalive_time = 100; /* 100us */
+/* For other sysctl, I've no idea of the range. Maybe Dag could help
+ * us on that - Jean II */
+
+static int do_devname(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dostring(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write) {
+ struct ias_value *val;
+
+ val = irias_new_string_value(sysctl_devname);
+ if (val)
+ irias_object_change_attribute("Device", "DeviceName", val);
+ }
+ return ret;
+}
+
+
+static int do_discovery(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (irlmp == NULL)
+ return -ENODEV;
+
+ if (sysctl_discovery)
+ irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ);
+ else
+ del_timer_sync(&irlmp->discovery_timer);
+
+ return ret;
+}
+
+/* One file */
+static struct ctl_table irda_table[] = {
+ {
+ .procname = "discovery",
+ .data = &sysctl_discovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = do_discovery,
+ },
+ {
+ .procname = "devname",
+ .data = sysctl_devname,
+ .maxlen = 65,
+ .mode = 0644,
+ .proc_handler = do_devname,
+ },
+#ifdef CONFIG_IRDA_FAST_RR
+ {
+ .procname = "fast_poll_increase",
+ .data = &sysctl_fast_poll_increase,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "discovery_slots",
+ .data = &sysctl_discovery_slots,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_discovery_slots,
+ .extra2 = &max_discovery_slots
+ },
+ {
+ .procname = "discovery_timeout",
+ .data = &sysctl_discovery_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "slot_timeout",
+ .data = &sysctl_slot_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_slot_timeout,
+ .extra2 = &max_slot_timeout
+ },
+ {
+ .procname = "max_baud_rate",
+ .data = &sysctl_max_baud_rate,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_baud_rate,
+ .extra2 = &max_max_baud_rate
+ },
+ {
+ .procname = "min_tx_turn_time",
+ .data = &sysctl_min_tx_turn_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_min_tx_turn_time,
+ .extra2 = &max_min_tx_turn_time
+ },
+ {
+ .procname = "max_tx_data_size",
+ .data = &sysctl_max_tx_data_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_tx_data_size,
+ .extra2 = &max_max_tx_data_size
+ },
+ {
+ .procname = "max_tx_window",
+ .data = &sysctl_max_tx_window,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_tx_window,
+ .extra2 = &max_max_tx_window
+ },
+ {
+ .procname = "max_noreply_time",
+ .data = &sysctl_max_noreply_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_noreply_time,
+ .extra2 = &max_max_noreply_time
+ },
+ {
+ .procname = "warn_noreply_time",
+ .data = &sysctl_warn_noreply_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_warn_noreply_time,
+ .extra2 = &max_warn_noreply_time
+ },
+ {
+ .procname = "lap_keepalive_time",
+ .data = &sysctl_lap_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_lap_keepalive_time,
+ .extra2 = &max_lap_keepalive_time
+ },
+ { }
+};
+
+static struct ctl_table_header *irda_table_header;
+
+/*
+ * Function irda_sysctl_register (void)
+ *
+ * Register our sysctl interface
+ *
+ */
+int __init irda_sysctl_register(void)
+{
+ irda_table_header = register_net_sysctl(&init_net, "net/irda", irda_table);
+ if (!irda_table_header)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Function irda_sysctl_unregister (void)
+ *
+ * Unregister our sysctl interface
+ *
+ */
+void irda_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(irda_table_header);
+}
+
+
+
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
new file mode 100644
index 000000000..b6ab41d5b
--- /dev/null
+++ b/net/irda/irttp.c
@@ -0,0 +1,1891 @@
+/*********************************************************************
+ *
+ * Filename: irttp.c
+ * Version: 1.2
+ * Description: Tiny Transport Protocol (TTP) implementation
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sun Aug 31 20:14:31 1997
+ * Modified at: Wed Jan 5 11:31:27 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/parameters.h>
+#include <net/irda/irttp.h>
+
+static struct irttp_cb *irttp;
+
+static void __irttp_close_tsap(struct tsap_cb *self);
+
+static int irttp_data_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static int irttp_udata_indication(void *instance, void *sap,
+ struct sk_buff *skb);
+static void irttp_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason, struct sk_buff *);
+static void irttp_connect_indication(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_sdu_size,
+ __u8 header_size, struct sk_buff *skb);
+static void irttp_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_sdu_size,
+ __u8 header_size, struct sk_buff *skb);
+static void irttp_run_tx_queue(struct tsap_cb *self);
+static void irttp_run_rx_queue(struct tsap_cb *self);
+
+static void irttp_flush_queues(struct tsap_cb *self);
+static void irttp_fragment_skb(struct tsap_cb *self, struct sk_buff *skb);
+static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self);
+static void irttp_todo_expired(unsigned long data);
+static int irttp_param_max_sdu_size(void *instance, irda_param_t *param,
+ int get);
+
+static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow);
+static void irttp_status_indication(void *instance,
+ LINK_STATUS link, LOCK_STATUS lock);
+
+/* Information for parsing parameters in IrTTP */
+static const pi_minor_info_t pi_minor_call_table[] = {
+ { NULL, 0 }, /* 0x00 */
+ { irttp_param_max_sdu_size, PV_INTEGER | PV_BIG_ENDIAN } /* 0x01 */
+};
+static const pi_major_info_t pi_major_call_table[] = {
+ { pi_minor_call_table, 2 }
+};
+static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 };
+
+/************************ GLOBAL PROCEDURES ************************/
+
+/*
+ * Function irttp_init (void)
+ *
+ * Initialize the IrTTP layer. Called by module initialization code
+ *
+ */
+int __init irttp_init(void)
+{
+ irttp = kzalloc(sizeof(struct irttp_cb), GFP_KERNEL);
+ if (irttp == NULL)
+ return -ENOMEM;
+
+ irttp->magic = TTP_MAGIC;
+
+ irttp->tsaps = hashbin_new(HB_LOCK);
+ if (!irttp->tsaps) {
+ net_err_ratelimited("%s: can't allocate IrTTP hashbin!\n",
+ __func__);
+ kfree(irttp);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Function irttp_cleanup (void)
+ *
+ * Called by module destruction/cleanup code
+ *
+ */
+void irttp_cleanup(void)
+{
+ /* Check for main structure */
+ IRDA_ASSERT(irttp->magic == TTP_MAGIC, return;);
+
+ /*
+ * Delete hashbin and close all TSAP instances in it
+ */
+ hashbin_delete(irttp->tsaps, (FREE_FUNC) __irttp_close_tsap);
+
+ irttp->magic = 0;
+
+ /* De-allocate main structure */
+ kfree(irttp);
+
+ irttp = NULL;
+}
+
+/*************************** SUBROUTINES ***************************/
+
+/*
+ * Function irttp_start_todo_timer (self, timeout)
+ *
+ * Start todo timer.
+ *
+ * Made it more effient and unsensitive to race conditions - Jean II
+ */
+static inline void irttp_start_todo_timer(struct tsap_cb *self, int timeout)
+{
+ /* Set new value for timer */
+ mod_timer(&self->todo_timer, jiffies + timeout);
+}
+
+/*
+ * Function irttp_todo_expired (data)
+ *
+ * Todo timer has expired!
+ *
+ * One of the restriction of the timer is that it is run only on the timer
+ * interrupt which run every 10ms. This mean that even if you set the timer
+ * with a delay of 0, it may take up to 10ms before it's run.
+ * So, to minimise latency and keep cache fresh, we try to avoid using
+ * it as much as possible.
+ * Note : we can't use tasklets, because they can't be asynchronously
+ * killed (need user context), and we can't guarantee that here...
+ * Jean II
+ */
+static void irttp_todo_expired(unsigned long data)
+{
+ struct tsap_cb *self = (struct tsap_cb *) data;
+
+ /* Check that we still exist */
+ if (!self || self->magic != TTP_TSAP_MAGIC)
+ return;
+
+ pr_debug("%s(instance=%p)\n", __func__, self);
+
+ /* Try to make some progress, especially on Tx side - Jean II */
+ irttp_run_rx_queue(self);
+ irttp_run_tx_queue(self);
+
+ /* Check if time for disconnect */
+ if (test_bit(0, &self->disconnect_pend)) {
+ /* Check if it's possible to disconnect yet */
+ if (skb_queue_empty(&self->tx_queue)) {
+ /* Make sure disconnect is not pending anymore */
+ clear_bit(0, &self->disconnect_pend); /* FALSE */
+
+ /* Note : self->disconnect_skb may be NULL */
+ irttp_disconnect_request(self, self->disconnect_skb,
+ P_NORMAL);
+ self->disconnect_skb = NULL;
+ } else {
+ /* Try again later */
+ irttp_start_todo_timer(self, HZ/10);
+
+ /* No reason to try and close now */
+ return;
+ }
+ }
+
+ /* Check if it's closing time */
+ if (self->close_pend)
+ /* Finish cleanup */
+ irttp_close_tsap(self);
+}
+
+/*
+ * Function irttp_flush_queues (self)
+ *
+ * Flushes (removes all frames) in transitt-buffer (tx_list)
+ */
+static void irttp_flush_queues(struct tsap_cb *self)
+{
+ struct sk_buff *skb;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ /* Deallocate frames waiting to be sent */
+ while ((skb = skb_dequeue(&self->tx_queue)) != NULL)
+ dev_kfree_skb(skb);
+
+ /* Deallocate received frames */
+ while ((skb = skb_dequeue(&self->rx_queue)) != NULL)
+ dev_kfree_skb(skb);
+
+ /* Deallocate received fragments */
+ while ((skb = skb_dequeue(&self->rx_fragments)) != NULL)
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_reassemble (self)
+ *
+ * Makes a new (continuous) skb of all the fragments in the fragment
+ * queue
+ *
+ */
+static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self)
+{
+ struct sk_buff *skb, *frag;
+ int n = 0; /* Fragment index */
+
+ IRDA_ASSERT(self != NULL, return NULL;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return NULL;);
+
+ pr_debug("%s(), self->rx_sdu_size=%d\n", __func__,
+ self->rx_sdu_size);
+
+ skb = dev_alloc_skb(TTP_HEADER + self->rx_sdu_size);
+ if (!skb)
+ return NULL;
+
+ /*
+ * Need to reserve space for TTP header in case this skb needs to
+ * be requeued in case delivery failes
+ */
+ skb_reserve(skb, TTP_HEADER);
+ skb_put(skb, self->rx_sdu_size);
+
+ /*
+ * Copy all fragments to a new buffer
+ */
+ while ((frag = skb_dequeue(&self->rx_fragments)) != NULL) {
+ skb_copy_to_linear_data_offset(skb, n, frag->data, frag->len);
+ n += frag->len;
+
+ dev_kfree_skb(frag);
+ }
+
+ pr_debug("%s(), frame len=%d, rx_sdu_size=%d, rx_max_sdu_size=%d\n",
+ __func__, n, self->rx_sdu_size, self->rx_max_sdu_size);
+ /* Note : irttp_run_rx_queue() calculate self->rx_sdu_size
+ * by summing the size of all fragments, so we should always
+ * have n == self->rx_sdu_size, except in cases where we
+ * droped the last fragment (when self->rx_sdu_size exceed
+ * self->rx_max_sdu_size), where n < self->rx_sdu_size.
+ * Jean II */
+ IRDA_ASSERT(n <= self->rx_sdu_size, n = self->rx_sdu_size;);
+
+ /* Set the new length */
+ skb_trim(skb, n);
+
+ self->rx_sdu_size = 0;
+
+ return skb;
+}
+
+/*
+ * Function irttp_fragment_skb (skb)
+ *
+ * Fragments a frame and queues all the fragments for transmission
+ *
+ */
+static inline void irttp_fragment_skb(struct tsap_cb *self,
+ struct sk_buff *skb)
+{
+ struct sk_buff *frag;
+ __u8 *frame;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ /*
+ * Split frame into a number of segments
+ */
+ while (skb->len > self->max_seg_size) {
+ pr_debug("%s(), fragmenting ...\n", __func__);
+
+ /* Make new segment */
+ frag = alloc_skb(self->max_seg_size+self->max_header_size,
+ GFP_ATOMIC);
+ if (!frag)
+ return;
+
+ skb_reserve(frag, self->max_header_size);
+
+ /* Copy data from the original skb into this fragment. */
+ skb_copy_from_linear_data(skb, skb_put(frag, self->max_seg_size),
+ self->max_seg_size);
+
+ /* Insert TTP header, with the more bit set */
+ frame = skb_push(frag, TTP_HEADER);
+ frame[0] = TTP_MORE;
+
+ /* Hide the copied data from the original skb */
+ skb_pull(skb, self->max_seg_size);
+
+ /* Queue fragment */
+ skb_queue_tail(&self->tx_queue, frag);
+ }
+ /* Queue what is left of the original skb */
+ pr_debug("%s(), queuing last segment\n", __func__);
+
+ frame = skb_push(skb, TTP_HEADER);
+ frame[0] = 0x00; /* Clear more bit */
+
+ /* Queue fragment */
+ skb_queue_tail(&self->tx_queue, skb);
+}
+
+/*
+ * Function irttp_param_max_sdu_size (self, param)
+ *
+ * Handle the MaxSduSize parameter in the connect frames, this function
+ * will be called both when this parameter needs to be inserted into, and
+ * extracted from the connect frames
+ */
+static int irttp_param_max_sdu_size(void *instance, irda_param_t *param,
+ int get)
+{
+ struct tsap_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->tx_max_sdu_size;
+ else
+ self->tx_max_sdu_size = param->pv.i;
+
+ pr_debug("%s(), MaxSduSize=%d\n", __func__, param->pv.i);
+
+ return 0;
+}
+
+/*************************** CLIENT CALLS ***************************/
+/************************** LMP CALLBACKS **************************/
+/* Everything is happily mixed up. Waiting for next clean up - Jean II */
+
+/*
+ * Initialization, that has to be done on new tsap
+ * instance allocation and on duplication
+ */
+static void irttp_init_tsap(struct tsap_cb *tsap)
+{
+ spin_lock_init(&tsap->lock);
+ init_timer(&tsap->todo_timer);
+
+ skb_queue_head_init(&tsap->rx_queue);
+ skb_queue_head_init(&tsap->tx_queue);
+ skb_queue_head_init(&tsap->rx_fragments);
+}
+
+/*
+ * Function irttp_open_tsap (stsap, notify)
+ *
+ * Create TSAP connection endpoint,
+ */
+struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify)
+{
+ struct tsap_cb *self;
+ struct lsap_cb *lsap;
+ notify_t ttp_notify;
+
+ IRDA_ASSERT(irttp->magic == TTP_MAGIC, return NULL;);
+
+ /* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to
+ * use only 0x01-0x6F. Of course, we can use LSAP_ANY as well.
+ * JeanII */
+ if ((stsap_sel != LSAP_ANY) &&
+ ((stsap_sel < 0x01) || (stsap_sel >= 0x70))) {
+ pr_debug("%s(), invalid tsap!\n", __func__);
+ return NULL;
+ }
+
+ self = kzalloc(sizeof(struct tsap_cb), GFP_ATOMIC);
+ if (self == NULL)
+ return NULL;
+
+ /* Initialize internal objects */
+ irttp_init_tsap(self);
+
+ /* Initialise todo timer */
+ self->todo_timer.data = (unsigned long) self;
+ self->todo_timer.function = &irttp_todo_expired;
+
+ /* Initialize callbacks for IrLMP to use */
+ irda_notify_init(&ttp_notify);
+ ttp_notify.connect_confirm = irttp_connect_confirm;
+ ttp_notify.connect_indication = irttp_connect_indication;
+ ttp_notify.disconnect_indication = irttp_disconnect_indication;
+ ttp_notify.data_indication = irttp_data_indication;
+ ttp_notify.udata_indication = irttp_udata_indication;
+ ttp_notify.flow_indication = irttp_flow_indication;
+ if (notify->status_indication != NULL)
+ ttp_notify.status_indication = irttp_status_indication;
+ ttp_notify.instance = self;
+ strncpy(ttp_notify.name, notify->name, NOTIFY_MAX_NAME);
+
+ self->magic = TTP_TSAP_MAGIC;
+ self->connected = FALSE;
+
+ /*
+ * Create LSAP at IrLMP layer
+ */
+ lsap = irlmp_open_lsap(stsap_sel, &ttp_notify, 0);
+ if (lsap == NULL) {
+ pr_debug("%s: unable to allocate LSAP!!\n", __func__);
+ __irttp_close_tsap(self);
+ return NULL;
+ }
+
+ /*
+ * If user specified LSAP_ANY as source TSAP selector, then IrLMP
+ * will replace it with whatever source selector which is free, so
+ * the stsap_sel we have might not be valid anymore
+ */
+ self->stsap_sel = lsap->slsap_sel;
+ pr_debug("%s(), stsap_sel=%02x\n", __func__, self->stsap_sel);
+
+ self->notify = *notify;
+ self->lsap = lsap;
+
+ hashbin_insert(irttp->tsaps, (irda_queue_t *) self, (long) self, NULL);
+
+ if (credit > TTP_RX_MAX_CREDIT)
+ self->initial_credit = TTP_RX_MAX_CREDIT;
+ else
+ self->initial_credit = credit;
+
+ return self;
+}
+EXPORT_SYMBOL(irttp_open_tsap);
+
+/*
+ * Function irttp_close (handle)
+ *
+ * Remove an instance of a TSAP. This function should only deal with the
+ * deallocation of the TSAP, and resetting of the TSAPs values;
+ *
+ */
+static void __irttp_close_tsap(struct tsap_cb *self)
+{
+ /* First make sure we're connected. */
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ irttp_flush_queues(self);
+
+ del_timer(&self->todo_timer);
+
+ /* This one won't be cleaned up if we are disconnect_pend + close_pend
+ * and we receive a disconnect_indication */
+ if (self->disconnect_skb)
+ dev_kfree_skb(self->disconnect_skb);
+
+ self->connected = FALSE;
+ self->magic = ~TTP_TSAP_MAGIC;
+
+ kfree(self);
+}
+
+/*
+ * Function irttp_close (self)
+ *
+ * Remove TSAP from list of all TSAPs and then deallocate all resources
+ * associated with this TSAP
+ *
+ * Note : because we *free* the tsap structure, it is the responsibility
+ * of the caller to make sure we are called only once and to deal with
+ * possible race conditions. - Jean II
+ */
+int irttp_close_tsap(struct tsap_cb *self)
+{
+ struct tsap_cb *tsap;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+ /* Make sure tsap has been disconnected */
+ if (self->connected) {
+ /* Check if disconnect is not pending */
+ if (!test_bit(0, &self->disconnect_pend)) {
+ net_warn_ratelimited("%s: TSAP still connected!\n",
+ __func__);
+ irttp_disconnect_request(self, NULL, P_NORMAL);
+ }
+ self->close_pend = TRUE;
+ irttp_start_todo_timer(self, HZ/10);
+
+ return 0; /* Will be back! */
+ }
+
+ tsap = hashbin_remove(irttp->tsaps, (long) self, NULL);
+
+ IRDA_ASSERT(tsap == self, return -1;);
+
+ /* Close corresponding LSAP */
+ if (self->lsap) {
+ irlmp_close_lsap(self->lsap);
+ self->lsap = NULL;
+ }
+
+ __irttp_close_tsap(self);
+
+ return 0;
+}
+EXPORT_SYMBOL(irttp_close_tsap);
+
+/*
+ * Function irttp_udata_request (self, skb)
+ *
+ * Send unreliable data on this TSAP
+ *
+ */
+int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ /* Take shortcut on zero byte packets */
+ if (skb->len == 0) {
+ ret = 0;
+ goto err;
+ }
+
+ /* Check that nothing bad happens */
+ if (!self->connected) {
+ net_warn_ratelimited("%s(), Not connected\n", __func__);
+ ret = -ENOTCONN;
+ goto err;
+ }
+
+ if (skb->len > self->max_seg_size) {
+ net_err_ratelimited("%s(), UData is too large for IrLAP!\n",
+ __func__);
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ irlmp_udata_request(self->lsap, skb);
+ self->stats.tx_packets++;
+
+ return 0;
+
+err:
+ dev_kfree_skb(skb);
+ return ret;
+}
+EXPORT_SYMBOL(irttp_udata_request);
+
+
+/*
+ * Function irttp_data_request (handle, skb)
+ *
+ * Queue frame for transmission. If SAR is enabled, fragement the frame
+ * and queue the fragments for transmission
+ */
+int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb)
+{
+ __u8 *frame;
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ pr_debug("%s() : queue len = %d\n", __func__,
+ skb_queue_len(&self->tx_queue));
+
+ /* Take shortcut on zero byte packets */
+ if (skb->len == 0) {
+ ret = 0;
+ goto err;
+ }
+
+ /* Check that nothing bad happens */
+ if (!self->connected) {
+ net_warn_ratelimited("%s: Not connected\n", __func__);
+ ret = -ENOTCONN;
+ goto err;
+ }
+
+ /*
+ * Check if SAR is disabled, and the frame is larger than what fits
+ * inside an IrLAP frame
+ */
+ if ((self->tx_max_sdu_size == 0) && (skb->len > self->max_seg_size)) {
+ net_err_ratelimited("%s: SAR disabled, and data is too large for IrLAP!\n",
+ __func__);
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ /*
+ * Check if SAR is enabled, and the frame is larger than the
+ * TxMaxSduSize
+ */
+ if ((self->tx_max_sdu_size != 0) &&
+ (self->tx_max_sdu_size != TTP_SAR_UNBOUND) &&
+ (skb->len > self->tx_max_sdu_size)) {
+ net_err_ratelimited("%s: SAR enabled, but data is larger than TxMaxSduSize!\n",
+ __func__);
+ ret = -EMSGSIZE;
+ goto err;
+ }
+ /*
+ * Check if transmit queue is full
+ */
+ if (skb_queue_len(&self->tx_queue) >= TTP_TX_MAX_QUEUE) {
+ /*
+ * Give it a chance to empty itself
+ */
+ irttp_run_tx_queue(self);
+
+ /* Drop packet. This error code should trigger the caller
+ * to resend the data in the client code - Jean II */
+ ret = -ENOBUFS;
+ goto err;
+ }
+
+ /* Queue frame, or queue frame segments */
+ if ((self->tx_max_sdu_size == 0) || (skb->len < self->max_seg_size)) {
+ /* Queue frame */
+ IRDA_ASSERT(skb_headroom(skb) >= TTP_HEADER, return -1;);
+ frame = skb_push(skb, TTP_HEADER);
+ frame[0] = 0x00; /* Clear more bit */
+
+ skb_queue_tail(&self->tx_queue, skb);
+ } else {
+ /*
+ * Fragment the frame, this function will also queue the
+ * fragments, we don't care about the fact the transmit
+ * queue may be overfilled by all the segments for a little
+ * while
+ */
+ irttp_fragment_skb(self, skb);
+ }
+
+ /* Check if we can accept more data from client */
+ if ((!self->tx_sdu_busy) &&
+ (skb_queue_len(&self->tx_queue) > TTP_TX_HIGH_THRESHOLD)) {
+ /* Tx queue filling up, so stop client. */
+ if (self->notify.flow_indication) {
+ self->notify.flow_indication(self->notify.instance,
+ self, FLOW_STOP);
+ }
+ /* self->tx_sdu_busy is the state of the client.
+ * Update state after notifying client to avoid
+ * race condition with irttp_flow_indication().
+ * If the queue empty itself after our test but before
+ * we set the flag, we will fix ourselves below in
+ * irttp_run_tx_queue().
+ * Jean II */
+ self->tx_sdu_busy = TRUE;
+ }
+
+ /* Try to make some progress */
+ irttp_run_tx_queue(self);
+
+ return 0;
+
+err:
+ dev_kfree_skb(skb);
+ return ret;
+}
+EXPORT_SYMBOL(irttp_data_request);
+
+/*
+ * Function irttp_run_tx_queue (self)
+ *
+ * Transmit packets queued for transmission (if possible)
+ *
+ */
+static void irttp_run_tx_queue(struct tsap_cb *self)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+ int n;
+
+ pr_debug("%s() : send_credit = %d, queue_len = %d\n",
+ __func__,
+ self->send_credit, skb_queue_len(&self->tx_queue));
+
+ /* Get exclusive access to the tx queue, otherwise don't touch it */
+ if (irda_lock(&self->tx_queue_lock) == FALSE)
+ return;
+
+ /* Try to send out frames as long as we have credits
+ * and as long as LAP is not full. If LAP is full, it will
+ * poll us through irttp_flow_indication() - Jean II */
+ while ((self->send_credit > 0) &&
+ (!irlmp_lap_tx_queue_full(self->lsap)) &&
+ (skb = skb_dequeue(&self->tx_queue))) {
+ /*
+ * Since we can transmit and receive frames concurrently,
+ * the code below is a critical region and we must assure that
+ * nobody messes with the credits while we update them.
+ */
+ spin_lock_irqsave(&self->lock, flags);
+
+ n = self->avail_credit;
+ self->avail_credit = 0;
+
+ /* Only room for 127 credits in frame */
+ if (n > 127) {
+ self->avail_credit = n-127;
+ n = 127;
+ }
+ self->remote_credit += n;
+ self->send_credit--;
+
+ spin_unlock_irqrestore(&self->lock, flags);
+
+ /*
+ * More bit must be set by the data_request() or fragment()
+ * functions
+ */
+ skb->data[0] |= (n & 0x7f);
+
+ /* Detach from socket.
+ * The current skb has a reference to the socket that sent
+ * it (skb->sk). When we pass it to IrLMP, the skb will be
+ * stored in in IrLAP (self->wx_list). When we are within
+ * IrLAP, we lose the notion of socket, so we should not
+ * have a reference to a socket. So, we drop it here.
+ *
+ * Why does it matter ?
+ * When the skb is freed (kfree_skb), if it is associated
+ * with a socket, it release buffer space on the socket
+ * (through sock_wfree() and sock_def_write_space()).
+ * If the socket no longer exist, we may crash. Hard.
+ * When we close a socket, we make sure that associated packets
+ * in IrTTP are freed. However, we have no way to cancel
+ * the packet that we have passed to IrLAP. So, if a packet
+ * remains in IrLAP (retry on the link or else) after we
+ * close the socket, we are dead !
+ * Jean II */
+ if (skb->sk != NULL) {
+ /* IrSOCK application, IrOBEX, ... */
+ skb_orphan(skb);
+ }
+ /* IrCOMM over IrTTP, IrLAN, ... */
+
+ /* Pass the skb to IrLMP - done */
+ irlmp_data_request(self->lsap, skb);
+ self->stats.tx_packets++;
+ }
+
+ /* Check if we can accept more frames from client.
+ * We don't want to wait until the todo timer to do that, and we
+ * can't use tasklets (grr...), so we are obliged to give control
+ * to client. That's ok, this test will be true not too often
+ * (max once per LAP window) and we are called from places
+ * where we can spend a bit of time doing stuff. - Jean II */
+ if ((self->tx_sdu_busy) &&
+ (skb_queue_len(&self->tx_queue) < TTP_TX_LOW_THRESHOLD) &&
+ (!self->close_pend)) {
+ if (self->notify.flow_indication)
+ self->notify.flow_indication(self->notify.instance,
+ self, FLOW_START);
+
+ /* self->tx_sdu_busy is the state of the client.
+ * We don't really have a race here, but it's always safer
+ * to update our state after the client - Jean II */
+ self->tx_sdu_busy = FALSE;
+ }
+
+ /* Reset lock */
+ self->tx_queue_lock = 0;
+}
+
+/*
+ * Function irttp_give_credit (self)
+ *
+ * Send a dataless flowdata TTP-PDU and give available credit to peer
+ * TSAP
+ */
+static inline void irttp_give_credit(struct tsap_cb *self)
+{
+ struct sk_buff *tx_skb = NULL;
+ unsigned long flags;
+ int n;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ pr_debug("%s() send=%d,avail=%d,remote=%d\n",
+ __func__,
+ self->send_credit, self->avail_credit, self->remote_credit);
+
+ /* Give credit to peer */
+ tx_skb = alloc_skb(TTP_MAX_HEADER, GFP_ATOMIC);
+ if (!tx_skb)
+ return;
+
+ /* Reserve space for LMP, and LAP header */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+ /*
+ * Since we can transmit and receive frames concurrently,
+ * the code below is a critical region and we must assure that
+ * nobody messes with the credits while we update them.
+ */
+ spin_lock_irqsave(&self->lock, flags);
+
+ n = self->avail_credit;
+ self->avail_credit = 0;
+
+ /* Only space for 127 credits in frame */
+ if (n > 127) {
+ self->avail_credit = n - 127;
+ n = 127;
+ }
+ self->remote_credit += n;
+
+ spin_unlock_irqrestore(&self->lock, flags);
+
+ skb_put(tx_skb, 1);
+ tx_skb->data[0] = (__u8) (n & 0x7f);
+
+ irlmp_data_request(self->lsap, tx_skb);
+ self->stats.tx_packets++;
+}
+
+/*
+ * Function irttp_udata_indication (instance, sap, skb)
+ *
+ * Received some unit-data (unreliable)
+ *
+ */
+static int irttp_udata_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct tsap_cb *self;
+ int err;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+ IRDA_ASSERT(skb != NULL, return -1;);
+
+ self->stats.rx_packets++;
+
+ /* Just pass data to layer above */
+ if (self->notify.udata_indication) {
+ err = self->notify.udata_indication(self->notify.instance,
+ self, skb);
+ /* Same comment as in irttp_do_data_indication() */
+ if (!err)
+ return 0;
+ }
+ /* Either no handler, or handler returns an error */
+ dev_kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Function irttp_data_indication (instance, sap, skb)
+ *
+ * Receive segment from IrLMP.
+ *
+ */
+static int irttp_data_indication(void *instance, void *sap,
+ struct sk_buff *skb)
+{
+ struct tsap_cb *self;
+ unsigned long flags;
+ int n;
+
+ self = instance;
+
+ n = skb->data[0] & 0x7f; /* Extract the credits */
+
+ self->stats.rx_packets++;
+
+ /* Deal with inbound credit
+ * Since we can transmit and receive frames concurrently,
+ * the code below is a critical region and we must assure that
+ * nobody messes with the credits while we update them.
+ */
+ spin_lock_irqsave(&self->lock, flags);
+ self->send_credit += n;
+ if (skb->len > 1)
+ self->remote_credit--;
+ spin_unlock_irqrestore(&self->lock, flags);
+
+ /*
+ * Data or dataless packet? Dataless frames contains only the
+ * TTP_HEADER.
+ */
+ if (skb->len > 1) {
+ /*
+ * We don't remove the TTP header, since we must preserve the
+ * more bit, so the defragment routing knows what to do
+ */
+ skb_queue_tail(&self->rx_queue, skb);
+ } else {
+ /* Dataless flowdata TTP-PDU */
+ dev_kfree_skb(skb);
+ }
+
+
+ /* Push data to the higher layer.
+ * We do it synchronously because running the todo timer for each
+ * receive packet would be too much overhead and latency.
+ * By passing control to the higher layer, we run the risk that
+ * it may take time or grab a lock. Most often, the higher layer
+ * will only put packet in a queue.
+ * Anyway, packets are only dripping through the IrDA, so we can
+ * have time before the next packet.
+ * Further, we are run from NET_BH, so the worse that can happen is
+ * us missing the optimal time to send back the PF bit in LAP.
+ * Jean II */
+ irttp_run_rx_queue(self);
+
+ /* We now give credits to peer in irttp_run_rx_queue().
+ * We need to send credit *NOW*, otherwise we are going
+ * to miss the next Tx window. The todo timer may take
+ * a while before it's run... - Jean II */
+
+ /*
+ * If the peer device has given us some credits and we didn't have
+ * anyone from before, then we need to shedule the tx queue.
+ * We need to do that because our Tx have stopped (so we may not
+ * get any LAP flow indication) and the user may be stopped as
+ * well. - Jean II
+ */
+ if (self->send_credit == n) {
+ /* Restart pushing stuff to LAP */
+ irttp_run_tx_queue(self);
+ /* Note : we don't want to schedule the todo timer
+ * because it has horrible latency. No tasklets
+ * because the tasklet API is broken. - Jean II */
+ }
+
+ return 0;
+}
+
+/*
+ * Function irttp_status_indication (self, reason)
+ *
+ * Status_indication, just pass to the higher layer...
+ *
+ */
+static void irttp_status_indication(void *instance,
+ LINK_STATUS link, LOCK_STATUS lock)
+{
+ struct tsap_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ /* Check if client has already closed the TSAP and gone away */
+ if (self->close_pend)
+ return;
+
+ /*
+ * Inform service user if he has requested it
+ */
+ if (self->notify.status_indication != NULL)
+ self->notify.status_indication(self->notify.instance,
+ link, lock);
+ else
+ pr_debug("%s(), no handler\n", __func__);
+}
+
+/*
+ * Function irttp_flow_indication (self, reason)
+ *
+ * Flow_indication : IrLAP tells us to send more data.
+ *
+ */
+static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+ struct tsap_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ pr_debug("%s(instance=%p)\n", __func__, self);
+
+ /* We are "polled" directly from LAP, and the LAP want to fill
+ * its Tx window. We want to do our best to send it data, so that
+ * we maximise the window. On the other hand, we want to limit the
+ * amount of work here so that LAP doesn't hang forever waiting
+ * for packets. - Jean II */
+
+ /* Try to send some packets. Currently, LAP calls us every time
+ * there is one free slot, so we will send only one packet.
+ * This allow the scheduler to do its round robin - Jean II */
+ irttp_run_tx_queue(self);
+
+ /* Note regarding the interraction with higher layer.
+ * irttp_run_tx_queue() may call the client when its queue
+ * start to empty, via notify.flow_indication(). Initially.
+ * I wanted this to happen in a tasklet, to avoid client
+ * grabbing the CPU, but we can't use tasklets safely. And timer
+ * is definitely too slow.
+ * This will happen only once per LAP window, and usually at
+ * the third packet (unless window is smaller). LAP is still
+ * doing mtt and sending first packet so it's sort of OK
+ * to do that. Jean II */
+
+ /* If we need to send disconnect. try to do it now */
+ if (self->disconnect_pend)
+ irttp_start_todo_timer(self, 0);
+}
+
+/*
+ * Function irttp_flow_request (self, command)
+ *
+ * This function could be used by the upper layers to tell IrTTP to stop
+ * delivering frames if the receive queues are starting to get full, or
+ * to tell IrTTP to start delivering frames again.
+ */
+void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow)
+{
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ switch (flow) {
+ case FLOW_STOP:
+ pr_debug("%s(), flow stop\n", __func__);
+ self->rx_sdu_busy = TRUE;
+ break;
+ case FLOW_START:
+ pr_debug("%s(), flow start\n", __func__);
+ self->rx_sdu_busy = FALSE;
+
+ /* Client say he can accept more data, try to free our
+ * queues ASAP - Jean II */
+ irttp_run_rx_queue(self);
+
+ break;
+ default:
+ pr_debug("%s(), Unknown flow command!\n", __func__);
+ }
+}
+EXPORT_SYMBOL(irttp_flow_request);
+
+/*
+ * Function irttp_connect_request (self, dtsap_sel, daddr, qos)
+ *
+ * Try to connect to remote destination TSAP selector
+ *
+ */
+int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
+ __u32 saddr, __u32 daddr,
+ struct qos_info *qos, __u32 max_sdu_size,
+ struct sk_buff *userdata)
+{
+ struct sk_buff *tx_skb;
+ __u8 *frame;
+ __u8 n;
+
+ pr_debug("%s(), max_sdu_size=%d\n", __func__, max_sdu_size);
+
+ IRDA_ASSERT(self != NULL, return -EBADR;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -EBADR;);
+
+ if (self->connected) {
+ if (userdata)
+ dev_kfree_skb(userdata);
+ return -EISCONN;
+ }
+
+ /* Any userdata supplied? */
+ if (userdata == NULL) {
+ tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /* Reserve space for MUX_CONTROL and LAP header */
+ skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
+ } else {
+ tx_skb = userdata;
+ /*
+ * Check that the client has reserved enough space for
+ * headers
+ */
+ IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
+ { dev_kfree_skb(userdata); return -1; });
+ }
+
+ /* Initialize connection parameters */
+ self->connected = FALSE;
+ self->avail_credit = 0;
+ self->rx_max_sdu_size = max_sdu_size;
+ self->rx_sdu_size = 0;
+ self->rx_sdu_busy = FALSE;
+ self->dtsap_sel = dtsap_sel;
+
+ n = self->initial_credit;
+
+ self->remote_credit = 0;
+ self->send_credit = 0;
+
+ /*
+ * Give away max 127 credits for now
+ */
+ if (n > 127) {
+ self->avail_credit = n - 127;
+ n = 127;
+ }
+
+ self->remote_credit = n;
+
+ /* SAR enabled? */
+ if (max_sdu_size > 0) {
+ IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
+ { dev_kfree_skb(tx_skb); return -1; });
+
+ /* Insert SAR parameters */
+ frame = skb_push(tx_skb, TTP_HEADER + TTP_SAR_HEADER);
+
+ frame[0] = TTP_PARAMETERS | n;
+ frame[1] = 0x04; /* Length */
+ frame[2] = 0x01; /* MaxSduSize */
+ frame[3] = 0x02; /* Value length */
+
+ put_unaligned(cpu_to_be16((__u16) max_sdu_size),
+ (__be16 *)(frame+4));
+ } else {
+ /* Insert plain TTP header */
+ frame = skb_push(tx_skb, TTP_HEADER);
+
+ /* Insert initial credit in frame */
+ frame[0] = n & 0x7f;
+ }
+
+ /* Connect with IrLMP. No QoS parameters for now */
+ return irlmp_connect_request(self->lsap, dtsap_sel, saddr, daddr, qos,
+ tx_skb);
+}
+EXPORT_SYMBOL(irttp_connect_request);
+
+/*
+ * Function irttp_connect_confirm (handle, qos, skb)
+ *
+ * Service user confirms TSAP connection with peer.
+ *
+ */
+static void irttp_connect_confirm(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_seg_size,
+ __u8 max_header_size, struct sk_buff *skb)
+{
+ struct tsap_cb *self;
+ int parameters;
+ int ret;
+ __u8 plen;
+ __u8 n;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ self->max_seg_size = max_seg_size - TTP_HEADER;
+ self->max_header_size = max_header_size + TTP_HEADER;
+
+ /*
+ * Check if we have got some QoS parameters back! This should be the
+ * negotiated QoS for the link.
+ */
+ if (qos) {
+ pr_debug("IrTTP, Negotiated BAUD_RATE: %02x\n",
+ qos->baud_rate.bits);
+ pr_debug("IrTTP, Negotiated BAUD_RATE: %d bps.\n",
+ qos->baud_rate.value);
+ }
+
+ n = skb->data[0] & 0x7f;
+
+ pr_debug("%s(), Initial send_credit=%d\n", __func__, n);
+
+ self->send_credit = n;
+ self->tx_max_sdu_size = 0;
+ self->connected = TRUE;
+
+ parameters = skb->data[0] & 0x80;
+
+ IRDA_ASSERT(skb->len >= TTP_HEADER, return;);
+ skb_pull(skb, TTP_HEADER);
+
+ if (parameters) {
+ plen = skb->data[0];
+
+ ret = irda_param_extract_all(self, skb->data+1,
+ IRDA_MIN(skb->len-1, plen),
+ &param_info);
+
+ /* Any errors in the parameter list? */
+ if (ret < 0) {
+ net_warn_ratelimited("%s: error extracting parameters\n",
+ __func__);
+ dev_kfree_skb(skb);
+
+ /* Do not accept this connection attempt */
+ return;
+ }
+ /* Remove parameters */
+ skb_pull(skb, IRDA_MIN(skb->len, plen+1));
+ }
+
+ pr_debug("%s() send=%d,avail=%d,remote=%d\n", __func__,
+ self->send_credit, self->avail_credit, self->remote_credit);
+
+ pr_debug("%s(), MaxSduSize=%d\n", __func__,
+ self->tx_max_sdu_size);
+
+ if (self->notify.connect_confirm) {
+ self->notify.connect_confirm(self->notify.instance, self, qos,
+ self->tx_max_sdu_size,
+ self->max_header_size, skb);
+ } else
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_connect_indication (handle, skb)
+ *
+ * Some other device is connecting to this TSAP
+ *
+ */
+static void irttp_connect_indication(void *instance, void *sap,
+ struct qos_info *qos, __u32 max_seg_size, __u8 max_header_size,
+ struct sk_buff *skb)
+{
+ struct tsap_cb *self;
+ struct lsap_cb *lsap;
+ int parameters;
+ int ret;
+ __u8 plen;
+ __u8 n;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+ IRDA_ASSERT(skb != NULL, return;);
+
+ lsap = sap;
+
+ self->max_seg_size = max_seg_size - TTP_HEADER;
+ self->max_header_size = max_header_size+TTP_HEADER;
+
+ pr_debug("%s(), TSAP sel=%02x\n", __func__, self->stsap_sel);
+
+ /* Need to update dtsap_sel if its equal to LSAP_ANY */
+ self->dtsap_sel = lsap->dlsap_sel;
+
+ n = skb->data[0] & 0x7f;
+
+ self->send_credit = n;
+ self->tx_max_sdu_size = 0;
+
+ parameters = skb->data[0] & 0x80;
+
+ IRDA_ASSERT(skb->len >= TTP_HEADER, return;);
+ skb_pull(skb, TTP_HEADER);
+
+ if (parameters) {
+ plen = skb->data[0];
+
+ ret = irda_param_extract_all(self, skb->data+1,
+ IRDA_MIN(skb->len-1, plen),
+ &param_info);
+
+ /* Any errors in the parameter list? */
+ if (ret < 0) {
+ net_warn_ratelimited("%s: error extracting parameters\n",
+ __func__);
+ dev_kfree_skb(skb);
+
+ /* Do not accept this connection attempt */
+ return;
+ }
+
+ /* Remove parameters */
+ skb_pull(skb, IRDA_MIN(skb->len, plen+1));
+ }
+
+ if (self->notify.connect_indication) {
+ self->notify.connect_indication(self->notify.instance, self,
+ qos, self->tx_max_sdu_size,
+ self->max_header_size, skb);
+ } else
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_connect_response (handle, userdata)
+ *
+ * Service user is accepting the connection, just pass it down to
+ * IrLMP!
+ *
+ */
+int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size,
+ struct sk_buff *userdata)
+{
+ struct sk_buff *tx_skb;
+ __u8 *frame;
+ int ret;
+ __u8 n;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+ pr_debug("%s(), Source TSAP selector=%02x\n", __func__,
+ self->stsap_sel);
+
+ /* Any userdata supplied? */
+ if (userdata == NULL) {
+ tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER,
+ GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /* Reserve space for MUX_CONTROL and LAP header */
+ skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
+ } else {
+ tx_skb = userdata;
+ /*
+ * Check that the client has reserved enough space for
+ * headers
+ */
+ IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
+ { dev_kfree_skb(userdata); return -1; });
+ }
+
+ self->avail_credit = 0;
+ self->remote_credit = 0;
+ self->rx_max_sdu_size = max_sdu_size;
+ self->rx_sdu_size = 0;
+ self->rx_sdu_busy = FALSE;
+
+ n = self->initial_credit;
+
+ /* Frame has only space for max 127 credits (7 bits) */
+ if (n > 127) {
+ self->avail_credit = n - 127;
+ n = 127;
+ }
+
+ self->remote_credit = n;
+ self->connected = TRUE;
+
+ /* SAR enabled? */
+ if (max_sdu_size > 0) {
+ IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
+ { dev_kfree_skb(tx_skb); return -1; });
+
+ /* Insert TTP header with SAR parameters */
+ frame = skb_push(tx_skb, TTP_HEADER + TTP_SAR_HEADER);
+
+ frame[0] = TTP_PARAMETERS | n;
+ frame[1] = 0x04; /* Length */
+
+ /* irda_param_insert(self, IRTTP_MAX_SDU_SIZE, frame+1, */
+/* TTP_SAR_HEADER, &param_info) */
+
+ frame[2] = 0x01; /* MaxSduSize */
+ frame[3] = 0x02; /* Value length */
+
+ put_unaligned(cpu_to_be16((__u16) max_sdu_size),
+ (__be16 *)(frame+4));
+ } else {
+ /* Insert TTP header */
+ frame = skb_push(tx_skb, TTP_HEADER);
+
+ frame[0] = n & 0x7f;
+ }
+
+ ret = irlmp_connect_response(self->lsap, tx_skb);
+
+ return ret;
+}
+EXPORT_SYMBOL(irttp_connect_response);
+
+/*
+ * Function irttp_dup (self, instance)
+ *
+ * Duplicate TSAP, can be used by servers to confirm a connection on a
+ * new TSAP so it can keep listening on the old one.
+ */
+struct tsap_cb *irttp_dup(struct tsap_cb *orig, void *instance)
+{
+ struct tsap_cb *new;
+ unsigned long flags;
+
+ /* Protect our access to the old tsap instance */
+ spin_lock_irqsave(&irttp->tsaps->hb_spinlock, flags);
+
+ /* Find the old instance */
+ if (!hashbin_find(irttp->tsaps, (long) orig, NULL)) {
+ pr_debug("%s(), unable to find TSAP\n", __func__);
+ spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
+ return NULL;
+ }
+
+ /* Allocate a new instance */
+ new = kmemdup(orig, sizeof(struct tsap_cb), GFP_ATOMIC);
+ if (!new) {
+ pr_debug("%s(), unable to kmalloc\n", __func__);
+ spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
+ return NULL;
+ }
+ spin_lock_init(&new->lock);
+
+ /* We don't need the old instance any more */
+ spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
+
+ /* Try to dup the LSAP (may fail if we were too slow) */
+ new->lsap = irlmp_dup(orig->lsap, new);
+ if (!new->lsap) {
+ pr_debug("%s(), dup failed!\n", __func__);
+ kfree(new);
+ return NULL;
+ }
+
+ /* Not everything should be copied */
+ new->notify.instance = instance;
+
+ /* Initialize internal objects */
+ irttp_init_tsap(new);
+
+ /* This is locked */
+ hashbin_insert(irttp->tsaps, (irda_queue_t *) new, (long) new, NULL);
+
+ return new;
+}
+EXPORT_SYMBOL(irttp_dup);
+
+/*
+ * Function irttp_disconnect_request (self)
+ *
+ * Close this connection please! If priority is high, the queued data
+ * segments, if any, will be deallocated first
+ *
+ */
+int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
+ int priority)
+{
+ int ret;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+ /* Already disconnected? */
+ if (!self->connected) {
+ pr_debug("%s(), already disconnected!\n", __func__);
+ if (userdata)
+ dev_kfree_skb(userdata);
+ return -1;
+ }
+
+ /* Disconnect already pending ?
+ * We need to use an atomic operation to prevent reentry. This
+ * function may be called from various context, like user, timer
+ * for following a disconnect_indication() (i.e. net_bh).
+ * Jean II */
+ if (test_and_set_bit(0, &self->disconnect_pend)) {
+ pr_debug("%s(), disconnect already pending\n",
+ __func__);
+ if (userdata)
+ dev_kfree_skb(userdata);
+
+ /* Try to make some progress */
+ irttp_run_tx_queue(self);
+ return -1;
+ }
+
+ /*
+ * Check if there is still data segments in the transmit queue
+ */
+ if (!skb_queue_empty(&self->tx_queue)) {
+ if (priority == P_HIGH) {
+ /*
+ * No need to send the queued data, if we are
+ * disconnecting right now since the data will
+ * not have any usable connection to be sent on
+ */
+ pr_debug("%s(): High priority!!()\n", __func__);
+ irttp_flush_queues(self);
+ } else if (priority == P_NORMAL) {
+ /*
+ * Must delay disconnect until after all data segments
+ * have been sent and the tx_queue is empty
+ */
+ /* We'll reuse this one later for the disconnect */
+ self->disconnect_skb = userdata; /* May be NULL */
+
+ irttp_run_tx_queue(self);
+
+ irttp_start_todo_timer(self, HZ/10);
+ return -1;
+ }
+ }
+ /* Note : we don't need to check if self->rx_queue is full and the
+ * state of self->rx_sdu_busy because the disconnect response will
+ * be sent at the LMP level (so even if the peer has its Tx queue
+ * full of data). - Jean II */
+
+ pr_debug("%s(), Disconnecting ...\n", __func__);
+ self->connected = FALSE;
+
+ if (!userdata) {
+ struct sk_buff *tx_skb;
+ tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+ if (!tx_skb)
+ return -ENOMEM;
+
+ /*
+ * Reserve space for MUX and LAP header
+ */
+ skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+ userdata = tx_skb;
+ }
+ ret = irlmp_disconnect_request(self->lsap, userdata);
+
+ /* The disconnect is no longer pending */
+ clear_bit(0, &self->disconnect_pend); /* FALSE */
+
+ return ret;
+}
+EXPORT_SYMBOL(irttp_disconnect_request);
+
+/*
+ * Function irttp_disconnect_indication (self, reason)
+ *
+ * Disconnect indication, TSAP disconnected by peer?
+ *
+ */
+static void irttp_disconnect_indication(void *instance, void *sap,
+ LM_REASON reason, struct sk_buff *skb)
+{
+ struct tsap_cb *self;
+
+ self = instance;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+ /* Prevent higher layer to send more data */
+ self->connected = FALSE;
+
+ /* Check if client has already tried to close the TSAP */
+ if (self->close_pend) {
+ /* In this case, the higher layer is probably gone. Don't
+ * bother it and clean up the remains - Jean II */
+ if (skb)
+ dev_kfree_skb(skb);
+ irttp_close_tsap(self);
+ return;
+ }
+
+ /* If we are here, we assume that is the higher layer is still
+ * waiting for the disconnect notification and able to process it,
+ * even if he tried to disconnect. Otherwise, it would have already
+ * attempted to close the tsap and self->close_pend would be TRUE.
+ * Jean II */
+
+ /* No need to notify the client if has already tried to disconnect */
+ if (self->notify.disconnect_indication)
+ self->notify.disconnect_indication(self->notify.instance, self,
+ reason, skb);
+ else
+ if (skb)
+ dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_do_data_indication (self, skb)
+ *
+ * Try to deliver reassembled skb to layer above, and requeue it if that
+ * for some reason should fail. We mark rx sdu as busy to apply back
+ * pressure is necessary.
+ */
+static void irttp_do_data_indication(struct tsap_cb *self, struct sk_buff *skb)
+{
+ int err;
+
+ /* Check if client has already closed the TSAP and gone away */
+ if (self->close_pend) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ err = self->notify.data_indication(self->notify.instance, self, skb);
+
+ /* Usually the layer above will notify that it's input queue is
+ * starting to get filled by using the flow request, but this may
+ * be difficult, so it can instead just refuse to eat it and just
+ * give an error back
+ */
+ if (err) {
+ pr_debug("%s() requeueing skb!\n", __func__);
+
+ /* Make sure we take a break */
+ self->rx_sdu_busy = TRUE;
+
+ /* Need to push the header in again */
+ skb_push(skb, TTP_HEADER);
+ skb->data[0] = 0x00; /* Make sure MORE bit is cleared */
+
+ /* Put skb back on queue */
+ skb_queue_head(&self->rx_queue, skb);
+ }
+}
+
+/*
+ * Function irttp_run_rx_queue (self)
+ *
+ * Check if we have any frames to be transmitted, or if we have any
+ * available credit to give away.
+ */
+static void irttp_run_rx_queue(struct tsap_cb *self)
+{
+ struct sk_buff *skb;
+ int more = 0;
+
+ pr_debug("%s() send=%d,avail=%d,remote=%d\n", __func__,
+ self->send_credit, self->avail_credit, self->remote_credit);
+
+ /* Get exclusive access to the rx queue, otherwise don't touch it */
+ if (irda_lock(&self->rx_queue_lock) == FALSE)
+ return;
+
+ /*
+ * Reassemble all frames in receive queue and deliver them
+ */
+ while (!self->rx_sdu_busy && (skb = skb_dequeue(&self->rx_queue))) {
+ /* This bit will tell us if it's the last fragment or not */
+ more = skb->data[0] & 0x80;
+
+ /* Remove TTP header */
+ skb_pull(skb, TTP_HEADER);
+
+ /* Add the length of the remaining data */
+ self->rx_sdu_size += skb->len;
+
+ /*
+ * If SAR is disabled, or user has requested no reassembly
+ * of received fragments then we just deliver them
+ * immediately. This can be requested by clients that
+ * implements byte streams without any message boundaries
+ */
+ if (self->rx_max_sdu_size == TTP_SAR_DISABLE) {
+ irttp_do_data_indication(self, skb);
+ self->rx_sdu_size = 0;
+
+ continue;
+ }
+
+ /* Check if this is a fragment, and not the last fragment */
+ if (more) {
+ /*
+ * Queue the fragment if we still are within the
+ * limits of the maximum size of the rx_sdu
+ */
+ if (self->rx_sdu_size <= self->rx_max_sdu_size) {
+ pr_debug("%s(), queueing frag\n",
+ __func__);
+ skb_queue_tail(&self->rx_fragments, skb);
+ } else {
+ /* Free the part of the SDU that is too big */
+ dev_kfree_skb(skb);
+ }
+ continue;
+ }
+ /*
+ * This is the last fragment, so time to reassemble!
+ */
+ if ((self->rx_sdu_size <= self->rx_max_sdu_size) ||
+ (self->rx_max_sdu_size == TTP_SAR_UNBOUND)) {
+ /*
+ * A little optimizing. Only queue the fragment if
+ * there are other fragments. Since if this is the
+ * last and only fragment, there is no need to
+ * reassemble :-)
+ */
+ if (!skb_queue_empty(&self->rx_fragments)) {
+ skb_queue_tail(&self->rx_fragments,
+ skb);
+
+ skb = irttp_reassemble_skb(self);
+ }
+
+ /* Now we can deliver the reassembled skb */
+ irttp_do_data_indication(self, skb);
+ } else {
+ pr_debug("%s(), Truncated frame\n", __func__);
+
+ /* Free the part of the SDU that is too big */
+ dev_kfree_skb(skb);
+
+ /* Deliver only the valid but truncated part of SDU */
+ skb = irttp_reassemble_skb(self);
+
+ irttp_do_data_indication(self, skb);
+ }
+ self->rx_sdu_size = 0;
+ }
+
+ /*
+ * It's not trivial to keep track of how many credits are available
+ * by incrementing at each packet, because delivery may fail
+ * (irttp_do_data_indication() may requeue the frame) and because
+ * we need to take care of fragmentation.
+ * We want the other side to send up to initial_credit packets.
+ * We have some frames in our queues, and we have already allowed it
+ * to send remote_credit.
+ * No need to spinlock, write is atomic and self correcting...
+ * Jean II
+ */
+ self->avail_credit = (self->initial_credit -
+ (self->remote_credit +
+ skb_queue_len(&self->rx_queue) +
+ skb_queue_len(&self->rx_fragments)));
+
+ /* Do we have too much credits to send to peer ? */
+ if ((self->remote_credit <= TTP_RX_MIN_CREDIT) &&
+ (self->avail_credit > 0)) {
+ /* Send explicit credit frame */
+ irttp_give_credit(self);
+ /* Note : do *NOT* check if tx_queue is non-empty, that
+ * will produce deadlocks. I repeat : send a credit frame
+ * even if we have something to send in our Tx queue.
+ * If we have credits, it means that our Tx queue is blocked.
+ *
+ * Let's suppose the peer can't keep up with our Tx. He will
+ * flow control us by not sending us any credits, and we
+ * will stop Tx and start accumulating credits here.
+ * Up to the point where the peer will stop its Tx queue,
+ * for lack of credits.
+ * Let's assume the peer application is single threaded.
+ * It will block on Tx and never consume any Rx buffer.
+ * Deadlock. Guaranteed. - Jean II
+ */
+ }
+
+ /* Reset lock */
+ self->rx_queue_lock = 0;
+}
+
+#ifdef CONFIG_PROC_FS
+struct irttp_iter_state {
+ int id;
+};
+
+static void *irttp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct irttp_iter_state *iter = seq->private;
+ struct tsap_cb *self;
+
+ /* Protect our access to the tsap list */
+ spin_lock_irq(&irttp->tsaps->hb_spinlock);
+ iter->id = 0;
+
+ for (self = (struct tsap_cb *) hashbin_get_first(irttp->tsaps);
+ self != NULL;
+ self = (struct tsap_cb *) hashbin_get_next(irttp->tsaps)) {
+ if (iter->id == *pos)
+ break;
+ ++iter->id;
+ }
+
+ return self;
+}
+
+static void *irttp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct irttp_iter_state *iter = seq->private;
+
+ ++*pos;
+ ++iter->id;
+ return (void *) hashbin_get_next(irttp->tsaps);
+}
+
+static void irttp_seq_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_irq(&irttp->tsaps->hb_spinlock);
+}
+
+static int irttp_seq_show(struct seq_file *seq, void *v)
+{
+ const struct irttp_iter_state *iter = seq->private;
+ const struct tsap_cb *self = v;
+
+ seq_printf(seq, "TSAP %d, ", iter->id);
+ seq_printf(seq, "stsap_sel: %02x, ",
+ self->stsap_sel);
+ seq_printf(seq, "dtsap_sel: %02x\n",
+ self->dtsap_sel);
+ seq_printf(seq, " connected: %s, ",
+ self->connected ? "TRUE" : "FALSE");
+ seq_printf(seq, "avail credit: %d, ",
+ self->avail_credit);
+ seq_printf(seq, "remote credit: %d, ",
+ self->remote_credit);
+ seq_printf(seq, "send credit: %d\n",
+ self->send_credit);
+ seq_printf(seq, " tx packets: %lu, ",
+ self->stats.tx_packets);
+ seq_printf(seq, "rx packets: %lu, ",
+ self->stats.rx_packets);
+ seq_printf(seq, "tx_queue len: %u ",
+ skb_queue_len(&self->tx_queue));
+ seq_printf(seq, "rx_queue len: %u\n",
+ skb_queue_len(&self->rx_queue));
+ seq_printf(seq, " tx_sdu_busy: %s, ",
+ self->tx_sdu_busy ? "TRUE" : "FALSE");
+ seq_printf(seq, "rx_sdu_busy: %s\n",
+ self->rx_sdu_busy ? "TRUE" : "FALSE");
+ seq_printf(seq, " max_seg_size: %u, ",
+ self->max_seg_size);
+ seq_printf(seq, "tx_max_sdu_size: %u, ",
+ self->tx_max_sdu_size);
+ seq_printf(seq, "rx_max_sdu_size: %u\n",
+ self->rx_max_sdu_size);
+
+ seq_printf(seq, " Used by (%s)\n\n",
+ self->notify.name);
+ return 0;
+}
+
+static const struct seq_operations irttp_seq_ops = {
+ .start = irttp_seq_start,
+ .next = irttp_seq_next,
+ .stop = irttp_seq_stop,
+ .show = irttp_seq_show,
+};
+
+static int irttp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_private(file, &irttp_seq_ops,
+ sizeof(struct irttp_iter_state));
+}
+
+const struct file_operations irttp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = irttp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/parameters.c b/net/irda/parameters.c
new file mode 100644
index 000000000..16ce32ffe
--- /dev/null
+++ b/net/irda/parameters.c
@@ -0,0 +1,584 @@
+/*********************************************************************
+ *
+ * Filename: parameters.c
+ * Version: 1.0
+ * Description: A more general way to handle (pi,pl,pv) parameters
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Jun 7 10:25:11 1999
+ * Modified at: Sun Jan 30 14:08:39 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+#include <asm/unaligned.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+
+static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+
+static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func);
+
+static int irda_param_unpack(__u8 *buf, char *fmt, ...);
+
+/* Parameter value call table. Must match PV_TYPE */
+static const PV_HANDLER pv_extract_table[] = {
+ irda_extract_integer, /* Handler for any length integers */
+ irda_extract_integer, /* Handler for 8 bits integers */
+ irda_extract_integer, /* Handler for 16 bits integers */
+ irda_extract_string, /* Handler for strings */
+ irda_extract_integer, /* Handler for 32 bits integers */
+ irda_extract_octseq, /* Handler for octet sequences */
+ irda_extract_no_value /* Handler for no value parameters */
+};
+
+static const PV_HANDLER pv_insert_table[] = {
+ irda_insert_integer, /* Handler for any length integers */
+ irda_insert_integer, /* Handler for 8 bits integers */
+ irda_insert_integer, /* Handler for 16 bits integers */
+ NULL, /* Handler for strings */
+ irda_insert_integer, /* Handler for 32 bits integers */
+ NULL, /* Handler for octet sequences */
+ irda_insert_no_value /* Handler for no value parameters */
+};
+
+/*
+ * Function irda_insert_no_value (self, buf, len, pi, type, func)
+ */
+static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ irda_param_t p;
+ int ret;
+
+ p.pi = pi;
+ p.pl = 0;
+
+ /* Call handler for this parameter */
+ ret = (*func)(self, &p, PV_GET);
+
+ /* Extract values anyway, since handler may need them */
+ irda_param_pack(buf, "bb", p.pi, p.pl);
+
+ if (ret < 0)
+ return ret;
+
+ return 2; /* Inserted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract_no_value (self, buf, len, type, func)
+ *
+ * Extracts a parameter without a pv field (pl=0)
+ *
+ */
+static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ irda_param_t p;
+ int ret;
+
+ /* Extract values anyway, since handler may need them */
+ irda_param_unpack(buf, "bb", &p.pi, &p.pl);
+
+ /* Call handler for this parameter */
+ ret = (*func)(self, &p, PV_PUT);
+
+ if (ret < 0)
+ return ret;
+
+ return 2; /* Extracted pl+2 bytes */
+}
+
+/*
+ * Function irda_insert_integer (self, buf, len, pi, type, func)
+ */
+static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ irda_param_t p;
+ int n = 0;
+ int err;
+
+ p.pi = pi; /* In case handler needs to know */
+ p.pl = type & PV_MASK; /* The integer type codes the length as well */
+ p.pv.i = 0; /* Clear value */
+
+ /* Call handler for this parameter */
+ err = (*func)(self, &p, PV_GET);
+ if (err < 0)
+ return err;
+
+ /*
+ * If parameter length is still 0, then (1) this is an any length
+ * integer, and (2) the handler function does not care which length
+ * we choose to use, so we pick the one the gives the fewest bytes.
+ */
+ if (p.pl == 0) {
+ if (p.pv.i < 0xff) {
+ pr_debug("%s(), using 1 byte\n", __func__);
+ p.pl = 1;
+ } else if (p.pv.i < 0xffff) {
+ pr_debug("%s(), using 2 bytes\n", __func__);
+ p.pl = 2;
+ } else {
+ pr_debug("%s(), using 4 bytes\n", __func__);
+ p.pl = 4; /* Default length */
+ }
+ }
+ /* Check if buffer is long enough for insertion */
+ if (len < (2+p.pl)) {
+ net_warn_ratelimited("%s: buffer too short for insertion!\n",
+ __func__);
+ return -1;
+ }
+ pr_debug("%s(), pi=%#x, pl=%d, pi=%d\n", __func__,
+ p.pi, p.pl, p.pv.i);
+ switch (p.pl) {
+ case 1:
+ n += irda_param_pack(buf, "bbb", p.pi, p.pl, (__u8) p.pv.i);
+ break;
+ case 2:
+ if (type & PV_BIG_ENDIAN)
+ p.pv.i = cpu_to_be16((__u16) p.pv.i);
+ else
+ p.pv.i = cpu_to_le16((__u16) p.pv.i);
+ n += irda_param_pack(buf, "bbs", p.pi, p.pl, (__u16) p.pv.i);
+ break;
+ case 4:
+ if (type & PV_BIG_ENDIAN)
+ cpu_to_be32s(&p.pv.i);
+ else
+ cpu_to_le32s(&p.pv.i);
+ n += irda_param_pack(buf, "bbi", p.pi, p.pl, p.pv.i);
+
+ break;
+ default:
+ net_warn_ratelimited("%s: length %d not supported\n",
+ __func__, p.pl);
+ /* Skip parameter */
+ return -1;
+ }
+
+ return p.pl+2; /* Inserted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract integer (self, buf, len, pi, type, func)
+ *
+ * Extract a possibly variable length integer from buffer, and call
+ * handler for processing of the parameter
+ */
+static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ irda_param_t p;
+ int n = 0;
+ int extract_len; /* Real length we extract */
+ int err;
+
+ p.pi = pi; /* In case handler needs to know */
+ p.pl = buf[1]; /* Extract length of value */
+ p.pv.i = 0; /* Clear value */
+ extract_len = p.pl; /* Default : extract all */
+
+ /* Check if buffer is long enough for parsing */
+ if (len < (2+p.pl)) {
+ net_warn_ratelimited("%s: buffer too short for parsing! Need %d bytes, but len is only %d\n",
+ __func__, p.pl, len);
+ return -1;
+ }
+
+ /*
+ * Check that the integer length is what we expect it to be. If the
+ * handler want a 16 bits integer then a 32 bits is not good enough
+ * PV_INTEGER means that the handler is flexible.
+ */
+ if (((type & PV_MASK) != PV_INTEGER) && ((type & PV_MASK) != p.pl)) {
+ net_err_ratelimited("%s: invalid parameter length! Expected %d bytes, but value had %d bytes!\n",
+ __func__, type & PV_MASK, p.pl);
+
+ /* Most parameters are bit/byte fields or little endian,
+ * so it's ok to only extract a subset of it (the subset
+ * that the handler expect). This is necessary, as some
+ * broken implementations seems to add extra undefined bits.
+ * If the parameter is shorter than we expect or is big
+ * endian, we can't play those tricks. Jean II */
+ if((p.pl < (type & PV_MASK)) || (type & PV_BIG_ENDIAN)) {
+ /* Skip parameter */
+ return p.pl+2;
+ } else {
+ /* Extract subset of it, fallthrough */
+ extract_len = type & PV_MASK;
+ }
+ }
+
+
+ switch (extract_len) {
+ case 1:
+ n += irda_param_unpack(buf+2, "b", &p.pv.i);
+ break;
+ case 2:
+ n += irda_param_unpack(buf+2, "s", &p.pv.i);
+ if (type & PV_BIG_ENDIAN)
+ p.pv.i = be16_to_cpu((__u16) p.pv.i);
+ else
+ p.pv.i = le16_to_cpu((__u16) p.pv.i);
+ break;
+ case 4:
+ n += irda_param_unpack(buf+2, "i", &p.pv.i);
+ if (type & PV_BIG_ENDIAN)
+ be32_to_cpus(&p.pv.i);
+ else
+ le32_to_cpus(&p.pv.i);
+ break;
+ default:
+ net_warn_ratelimited("%s: length %d not supported\n",
+ __func__, p.pl);
+
+ /* Skip parameter */
+ return p.pl+2;
+ }
+
+ pr_debug("%s(), pi=%#x, pl=%d, pi=%d\n", __func__,
+ p.pi, p.pl, p.pv.i);
+ /* Call handler for this parameter */
+ err = (*func)(self, &p, PV_PUT);
+ if (err < 0)
+ return err;
+
+ return p.pl+2; /* Extracted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract_string (self, buf, len, type, func)
+ */
+static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ char str[33];
+ irda_param_t p;
+ int err;
+
+ p.pi = pi; /* In case handler needs to know */
+ p.pl = buf[1]; /* Extract length of value */
+ if (p.pl > 32)
+ p.pl = 32;
+
+ pr_debug("%s(), pi=%#x, pl=%d\n", __func__,
+ p.pi, p.pl);
+
+ /* Check if buffer is long enough for parsing */
+ if (len < (2+p.pl)) {
+ net_warn_ratelimited("%s: buffer too short for parsing! Need %d bytes, but len is only %d\n",
+ __func__, p.pl, len);
+ return -1;
+ }
+
+ /* Should be safe to copy string like this since we have already
+ * checked that the buffer is long enough */
+ strncpy(str, buf+2, p.pl);
+
+ pr_debug("%s(), str=0x%02x 0x%02x\n",
+ __func__, (__u8)str[0], (__u8)str[1]);
+
+ /* Null terminate string */
+ str[p.pl] = '\0';
+
+ p.pv.c = str; /* Handler will need to take a copy */
+
+ /* Call handler for this parameter */
+ err = (*func)(self, &p, PV_PUT);
+ if (err < 0)
+ return err;
+
+ return p.pl+2; /* Extracted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract_octseq (self, buf, len, type, func)
+ */
+static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi,
+ PV_TYPE type, PI_HANDLER func)
+{
+ irda_param_t p;
+
+ p.pi = pi; /* In case handler needs to know */
+ p.pl = buf[1]; /* Extract length of value */
+
+ /* Check if buffer is long enough for parsing */
+ if (len < (2+p.pl)) {
+ net_warn_ratelimited("%s: buffer too short for parsing! Need %d bytes, but len is only %d\n",
+ __func__, p.pl, len);
+ return -1;
+ }
+
+ pr_debug("%s(), not impl\n", __func__);
+
+ return p.pl+2; /* Extracted pl+2 bytes */
+}
+
+/*
+ * Function irda_param_pack (skb, fmt, ...)
+ *
+ * Format:
+ * 'i' = 32 bits integer
+ * 's' = string
+ *
+ */
+int irda_param_pack(__u8 *buf, char *fmt, ...)
+{
+ irda_pv_t arg;
+ va_list args;
+ char *p;
+ int n = 0;
+
+ va_start(args, fmt);
+
+ for (p = fmt; *p != '\0'; p++) {
+ switch (*p) {
+ case 'b': /* 8 bits unsigned byte */
+ buf[n++] = (__u8)va_arg(args, int);
+ break;
+ case 's': /* 16 bits unsigned short */
+ arg.i = (__u16)va_arg(args, int);
+ put_unaligned((__u16)arg.i, (__u16 *)(buf+n)); n+=2;
+ break;
+ case 'i': /* 32 bits unsigned integer */
+ arg.i = va_arg(args, __u32);
+ put_unaligned(arg.i, (__u32 *)(buf+n)); n+=4;
+ break;
+#if 0
+ case 'c': /* \0 terminated string */
+ arg.c = va_arg(args, char *);
+ strcpy(buf+n, arg.c);
+ n += strlen(arg.c) + 1;
+ break;
+#endif
+ default:
+ va_end(args);
+ return -1;
+ }
+ }
+ va_end(args);
+
+ return 0;
+}
+EXPORT_SYMBOL(irda_param_pack);
+
+/*
+ * Function irda_param_unpack (skb, fmt, ...)
+ */
+static int irda_param_unpack(__u8 *buf, char *fmt, ...)
+{
+ irda_pv_t arg;
+ va_list args;
+ char *p;
+ int n = 0;
+
+ va_start(args, fmt);
+
+ for (p = fmt; *p != '\0'; p++) {
+ switch (*p) {
+ case 'b': /* 8 bits byte */
+ arg.ip = va_arg(args, __u32 *);
+ *arg.ip = buf[n++];
+ break;
+ case 's': /* 16 bits short */
+ arg.ip = va_arg(args, __u32 *);
+ *arg.ip = get_unaligned((__u16 *)(buf+n)); n+=2;
+ break;
+ case 'i': /* 32 bits unsigned integer */
+ arg.ip = va_arg(args, __u32 *);
+ *arg.ip = get_unaligned((__u32 *)(buf+n)); n+=4;
+ break;
+#if 0
+ case 'c': /* \0 terminated string */
+ arg.c = va_arg(args, char *);
+ strcpy(arg.c, buf+n);
+ n += strlen(arg.c) + 1;
+ break;
+#endif
+ default:
+ va_end(args);
+ return -1;
+ }
+
+ }
+ va_end(args);
+
+ return 0;
+}
+
+/*
+ * Function irda_param_insert (self, pi, buf, len, info)
+ *
+ * Insert the specified parameter (pi) into buffer. Returns number of
+ * bytes inserted
+ */
+int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len,
+ pi_param_info_t *info)
+{
+ const pi_minor_info_t *pi_minor_info;
+ __u8 pi_minor;
+ __u8 pi_major;
+ int type;
+ int ret = -1;
+ int n = 0;
+
+ IRDA_ASSERT(buf != NULL, return ret;);
+ IRDA_ASSERT(info != NULL, return ret;);
+
+ pi_minor = pi & info->pi_mask;
+ pi_major = pi >> info->pi_major_offset;
+
+ /* Check if the identifier value (pi) is valid */
+ if ((pi_major > info->len-1) ||
+ (pi_minor > info->tables[pi_major].len-1))
+ {
+ pr_debug("%s(), no handler for parameter=0x%02x\n",
+ __func__, pi);
+
+ /* Skip this parameter */
+ return -1;
+ }
+
+ /* Lookup the info on how to parse this parameter */
+ pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor];
+
+ /* Find expected data type for this parameter identifier (pi)*/
+ type = pi_minor_info->type;
+
+ /* Check if handler has been implemented */
+ if (!pi_minor_info->func) {
+ net_info_ratelimited("%s: no handler for pi=%#x\n",
+ __func__, pi);
+ /* Skip this parameter */
+ return -1;
+ }
+
+ /* Insert parameter value */
+ ret = (*pv_insert_table[type & PV_MASK])(self, buf+n, len, pi, type,
+ pi_minor_info->func);
+ return ret;
+}
+EXPORT_SYMBOL(irda_param_insert);
+
+/*
+ * Function irda_param_extract (self, buf, len, info)
+ *
+ * Parse all parameters. If len is correct, then everything should be
+ * safe. Returns the number of bytes that was parsed
+ *
+ */
+static int irda_param_extract(void *self, __u8 *buf, int len,
+ pi_param_info_t *info)
+{
+ const pi_minor_info_t *pi_minor_info;
+ __u8 pi_minor;
+ __u8 pi_major;
+ int type;
+ int ret = -1;
+ int n = 0;
+
+ IRDA_ASSERT(buf != NULL, return ret;);
+ IRDA_ASSERT(info != NULL, return ret;);
+
+ pi_minor = buf[n] & info->pi_mask;
+ pi_major = buf[n] >> info->pi_major_offset;
+
+ /* Check if the identifier value (pi) is valid */
+ if ((pi_major > info->len-1) ||
+ (pi_minor > info->tables[pi_major].len-1))
+ {
+ pr_debug("%s(), no handler for parameter=0x%02x\n",
+ __func__, buf[0]);
+
+ /* Skip this parameter */
+ return 2 + buf[n + 1]; /* Continue */
+ }
+
+ /* Lookup the info on how to parse this parameter */
+ pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor];
+
+ /* Find expected data type for this parameter identifier (pi)*/
+ type = pi_minor_info->type;
+
+ pr_debug("%s(), pi=[%d,%d], type=%d\n", __func__,
+ pi_major, pi_minor, type);
+
+ /* Check if handler has been implemented */
+ if (!pi_minor_info->func) {
+ net_info_ratelimited("%s: no handler for pi=%#x\n",
+ __func__, buf[n]);
+ /* Skip this parameter */
+ return 2 + buf[n + 1]; /* Continue */
+ }
+
+ /* Parse parameter value */
+ ret = (*pv_extract_table[type & PV_MASK])(self, buf+n, len, buf[n],
+ type, pi_minor_info->func);
+ return ret;
+}
+
+/*
+ * Function irda_param_extract_all (self, buf, len, info)
+ *
+ * Parse all parameters. If len is correct, then everything should be
+ * safe. Returns the number of bytes that was parsed
+ *
+ */
+int irda_param_extract_all(void *self, __u8 *buf, int len,
+ pi_param_info_t *info)
+{
+ int ret = -1;
+ int n = 0;
+
+ IRDA_ASSERT(buf != NULL, return ret;);
+ IRDA_ASSERT(info != NULL, return ret;);
+
+ /*
+ * Parse all parameters. Each parameter must be at least two bytes
+ * long or else there is no point in trying to parse it
+ */
+ while (len > 2) {
+ ret = irda_param_extract(self, buf+n, len, info);
+ if (ret < 0)
+ return ret;
+
+ n += ret;
+ len -= ret;
+ }
+ return n;
+}
+EXPORT_SYMBOL(irda_param_extract_all);
diff --git a/net/irda/qos.c b/net/irda/qos.c
new file mode 100644
index 000000000..25ba8509a
--- /dev/null
+++ b/net/irda/qos.c
@@ -0,0 +1,771 @@
+/*********************************************************************
+ *
+ * Filename: qos.c
+ * Version: 1.0
+ * Description: IrLAP QoS parameter negotiation
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Tue Sep 9 00:00:26 1997
+ * Modified at: Sun Jan 30 14:29:16 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ ********************************************************************/
+
+#include <linux/export.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+#include <net/irda/qos.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+
+/*
+ * Maximum values of the baud rate we negotiate with the other end.
+ * Most often, you don't have to change that, because Linux-IrDA will
+ * use the maximum offered by the link layer, which usually works fine.
+ * In some very rare cases, you may want to limit it to lower speeds...
+ */
+int sysctl_max_baud_rate = 16000000;
+/*
+ * Maximum value of the lap disconnect timer we negotiate with the other end.
+ * Most often, the value below represent the best compromise, but some user
+ * may want to keep the LAP alive longer or shorter in case of link failure.
+ * Remember that the threshold time (early warning) is fixed to 3s...
+ */
+int sysctl_max_noreply_time = 12;
+/*
+ * Minimum turn time to be applied before transmitting to the peer.
+ * Nonzero values (usec) are used as lower limit to the per-connection
+ * mtt value which was announced by the other end during negotiation.
+ * Might be helpful if the peer device provides too short mtt.
+ * Default is 10us which means using the unmodified value given by the
+ * peer except if it's 0 (0 is likely a bug in the other stack).
+ */
+unsigned int sysctl_min_tx_turn_time = 10;
+/*
+ * Maximum data size to be used in transmission in payload of LAP frame.
+ * There is a bit of confusion in the IrDA spec :
+ * The LAP spec defines the payload of a LAP frame (I field) to be
+ * 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40).
+ * On the other hand, the PHY mention frames of 2048 bytes max (IrPHY
+ * 1.2, chapt 5.3.2.1, p41). But, this number includes the LAP header
+ * (2 bytes), and CRC (32 bits at 4 Mb/s). So, for the I field (LAP
+ * payload), that's only 2042 bytes. Oups !
+ * My nsc-ircc hardware has troubles receiving 2048 bytes frames at 4 Mb/s,
+ * so adjust to 2042... I don't know if this bug applies only for 2048
+ * bytes frames or all negotiated frame sizes, but you can use the sysctl
+ * to play with this value anyway.
+ * Jean II */
+unsigned int sysctl_max_tx_data_size = 2042;
+/*
+ * Maximum transmit window, i.e. number of LAP frames between turn-around.
+ * This allow to override what the peer told us. Some peers are buggy and
+ * don't always support what they tell us.
+ * Jean II */
+unsigned int sysctl_max_tx_window = 7;
+
+static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get);
+static int irlap_param_link_disconnect(void *instance, irda_param_t *parm,
+ int get);
+static int irlap_param_max_turn_time(void *instance, irda_param_t *param,
+ int get);
+static int irlap_param_data_size(void *instance, irda_param_t *param, int get);
+static int irlap_param_window_size(void *instance, irda_param_t *param,
+ int get);
+static int irlap_param_additional_bofs(void *instance, irda_param_t *parm,
+ int get);
+static int irlap_param_min_turn_time(void *instance, irda_param_t *param,
+ int get);
+
+#ifndef CONFIG_IRDA_DYNAMIC_WINDOW
+static __u32 irlap_requested_line_capacity(struct qos_info *qos);
+#endif
+
+static __u32 min_turn_times[] = { 10000, 5000, 1000, 500, 100, 50, 10, 0 }; /* us */
+static __u32 baud_rates[] = { 2400, 9600, 19200, 38400, 57600, 115200, 576000,
+ 1152000, 4000000, 16000000 }; /* bps */
+static __u32 data_sizes[] = { 64, 128, 256, 512, 1024, 2048 }; /* bytes */
+static __u32 add_bofs[] = { 48, 24, 12, 5, 3, 2, 1, 0 }; /* bytes */
+static __u32 max_turn_times[] = { 500, 250, 100, 50 }; /* ms */
+static __u32 link_disc_times[] = { 3, 8, 12, 16, 20, 25, 30, 40 }; /* secs */
+
+static __u32 max_line_capacities[10][4] = {
+ /* 500 ms 250 ms 100 ms 50 ms (max turn time) */
+ { 100, 0, 0, 0 }, /* 2400 bps */
+ { 400, 0, 0, 0 }, /* 9600 bps */
+ { 800, 0, 0, 0 }, /* 19200 bps */
+ { 1600, 0, 0, 0 }, /* 38400 bps */
+ { 2360, 0, 0, 0 }, /* 57600 bps */
+ { 4800, 2400, 960, 480 }, /* 115200 bps */
+ { 28800, 11520, 5760, 2880 }, /* 576000 bps */
+ { 57600, 28800, 11520, 5760 }, /* 1152000 bps */
+ { 200000, 100000, 40000, 20000 }, /* 4000000 bps */
+ { 800000, 400000, 160000, 80000 }, /* 16000000 bps */
+};
+
+static const pi_minor_info_t pi_minor_call_table_type_0[] = {
+ { NULL, 0 },
+/* 01 */{ irlap_param_baud_rate, PV_INTEGER | PV_LITTLE_ENDIAN },
+ { NULL, 0 },
+ { NULL, 0 },
+ { NULL, 0 },
+ { NULL, 0 },
+ { NULL, 0 },
+ { NULL, 0 },
+/* 08 */{ irlap_param_link_disconnect, PV_INT_8_BITS }
+};
+
+static const pi_minor_info_t pi_minor_call_table_type_1[] = {
+ { NULL, 0 },
+ { NULL, 0 },
+/* 82 */{ irlap_param_max_turn_time, PV_INT_8_BITS },
+/* 83 */{ irlap_param_data_size, PV_INT_8_BITS },
+/* 84 */{ irlap_param_window_size, PV_INT_8_BITS },
+/* 85 */{ irlap_param_additional_bofs, PV_INT_8_BITS },
+/* 86 */{ irlap_param_min_turn_time, PV_INT_8_BITS },
+};
+
+static const pi_major_info_t pi_major_call_table[] = {
+ { pi_minor_call_table_type_0, 9 },
+ { pi_minor_call_table_type_1, 7 },
+};
+
+static pi_param_info_t irlap_param_info = { pi_major_call_table, 2, 0x7f, 7 };
+
+/* ---------------------- LOCAL SUBROUTINES ---------------------- */
+/* Note : we start with a bunch of local subroutines.
+ * As the compiler is "one pass", this is the only way to get them to
+ * inline properly...
+ * Jean II
+ */
+/*
+ * Function value_index (value, array, size)
+ *
+ * Returns the index to the value in the specified array
+ */
+static inline int value_index(__u32 value, __u32 *array, int size)
+{
+ int i;
+
+ for (i=0; i < size; i++)
+ if (array[i] == value)
+ break;
+ return i;
+}
+
+/*
+ * Function index_value (index, array)
+ *
+ * Returns value to index in array, easy!
+ *
+ */
+static inline __u32 index_value(int index, __u32 *array)
+{
+ return array[index];
+}
+
+/*
+ * Function msb_index (word)
+ *
+ * Returns index to most significant bit (MSB) in word
+ *
+ */
+static int msb_index (__u16 word)
+{
+ __u16 msb = 0x8000;
+ int index = 15; /* Current MSB */
+
+ /* Check for buggy peers.
+ * Note : there is a small probability that it could be us, but I
+ * would expect driver authors to catch that pretty early and be
+ * able to check precisely what's going on. If a end user sees this,
+ * it's very likely the peer. - Jean II */
+ if (word == 0) {
+ net_warn_ratelimited("%s(), Detected buggy peer, adjust null PV to 0x1!\n",
+ __func__);
+ /* The only safe choice (we don't know the array size) */
+ word = 0x1;
+ }
+
+ while (msb) {
+ if (word & msb)
+ break; /* Found it! */
+ msb >>=1;
+ index--;
+ }
+ return index;
+}
+
+/*
+ * Function value_lower_bits (value, array)
+ *
+ * Returns a bit field marking all possibility lower than value.
+ */
+static inline int value_lower_bits(__u32 value, __u32 *array, int size, __u16 *field)
+{
+ int i;
+ __u16 mask = 0x1;
+ __u16 result = 0x0;
+
+ for (i=0; i < size; i++) {
+ /* Add the current value to the bit field, shift mask */
+ result |= mask;
+ mask <<= 1;
+ /* Finished ? */
+ if (array[i] >= value)
+ break;
+ }
+ /* Send back a valid index */
+ if(i >= size)
+ i = size - 1; /* Last item */
+ *field = result;
+ return i;
+}
+
+/*
+ * Function value_highest_bit (value, array)
+ *
+ * Returns a bit field marking the highest possibility lower than value.
+ */
+static inline int value_highest_bit(__u32 value, __u32 *array, int size, __u16 *field)
+{
+ int i;
+ __u16 mask = 0x1;
+ __u16 result = 0x0;
+
+ for (i=0; i < size; i++) {
+ /* Finished ? */
+ if (array[i] <= value)
+ break;
+ /* Shift mask */
+ mask <<= 1;
+ }
+ /* Set the current value to the bit field */
+ result |= mask;
+ /* Send back a valid index */
+ if(i >= size)
+ i = size - 1; /* Last item */
+ *field = result;
+ return i;
+}
+
+/* -------------------------- MAIN CALLS -------------------------- */
+
+/*
+ * Function irda_qos_compute_intersection (qos, new)
+ *
+ * Compute the intersection of the old QoS capabilities with new ones
+ *
+ */
+void irda_qos_compute_intersection(struct qos_info *qos, struct qos_info *new)
+{
+ IRDA_ASSERT(qos != NULL, return;);
+ IRDA_ASSERT(new != NULL, return;);
+
+ /* Apply */
+ qos->baud_rate.bits &= new->baud_rate.bits;
+ qos->window_size.bits &= new->window_size.bits;
+ qos->min_turn_time.bits &= new->min_turn_time.bits;
+ qos->max_turn_time.bits &= new->max_turn_time.bits;
+ qos->data_size.bits &= new->data_size.bits;
+ qos->link_disc_time.bits &= new->link_disc_time.bits;
+ qos->additional_bofs.bits &= new->additional_bofs.bits;
+
+ irda_qos_bits_to_value(qos);
+}
+
+/*
+ * Function irda_init_max_qos_capabilies (qos)
+ *
+ * The purpose of this function is for layers and drivers to be able to
+ * set the maximum QoS possible and then "and in" their own limitations
+ *
+ */
+void irda_init_max_qos_capabilies(struct qos_info *qos)
+{
+ int i;
+ /*
+ * These are the maximum supported values as specified on pages
+ * 39-43 in IrLAP
+ */
+
+ /* Use sysctl to set some configurable values... */
+ /* Set configured max speed */
+ i = value_lower_bits(sysctl_max_baud_rate, baud_rates, 10,
+ &qos->baud_rate.bits);
+ sysctl_max_baud_rate = index_value(i, baud_rates);
+
+ /* Set configured max disc time */
+ i = value_lower_bits(sysctl_max_noreply_time, link_disc_times, 8,
+ &qos->link_disc_time.bits);
+ sysctl_max_noreply_time = index_value(i, link_disc_times);
+
+ /* LSB is first byte, MSB is second byte */
+ qos->baud_rate.bits &= 0x03ff;
+
+ qos->window_size.bits = 0x7f;
+ qos->min_turn_time.bits = 0xff;
+ qos->max_turn_time.bits = 0x0f;
+ qos->data_size.bits = 0x3f;
+ qos->link_disc_time.bits &= 0xff;
+ qos->additional_bofs.bits = 0xff;
+}
+EXPORT_SYMBOL(irda_init_max_qos_capabilies);
+
+/*
+ * Function irlap_adjust_qos_settings (qos)
+ *
+ * Adjust QoS settings in case some values are not possible to use because
+ * of other settings
+ */
+static void irlap_adjust_qos_settings(struct qos_info *qos)
+{
+ __u32 line_capacity;
+ int index;
+
+ /*
+ * Make sure the mintt is sensible.
+ * Main culprit : Ericsson T39. - Jean II
+ */
+ if (sysctl_min_tx_turn_time > qos->min_turn_time.value) {
+ int i;
+
+ net_warn_ratelimited("%s(), Detected buggy peer, adjust mtt to %dus!\n",
+ __func__, sysctl_min_tx_turn_time);
+
+ /* We don't really need bits, but easier this way */
+ i = value_highest_bit(sysctl_min_tx_turn_time, min_turn_times,
+ 8, &qos->min_turn_time.bits);
+ sysctl_min_tx_turn_time = index_value(i, min_turn_times);
+ qos->min_turn_time.value = sysctl_min_tx_turn_time;
+ }
+
+ /*
+ * Not allowed to use a max turn time less than 500 ms if the baudrate
+ * is less than 115200
+ */
+ if ((qos->baud_rate.value < 115200) &&
+ (qos->max_turn_time.value < 500))
+ {
+ pr_debug("%s(), adjusting max turn time from %d to 500 ms\n",
+ __func__, qos->max_turn_time.value);
+ qos->max_turn_time.value = 500;
+ }
+
+ /*
+ * The data size must be adjusted according to the baud rate and max
+ * turn time
+ */
+ index = value_index(qos->data_size.value, data_sizes, 6);
+ line_capacity = irlap_max_line_capacity(qos->baud_rate.value,
+ qos->max_turn_time.value);
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+ while ((qos->data_size.value > line_capacity) && (index > 0)) {
+ qos->data_size.value = data_sizes[index--];
+ pr_debug("%s(), reducing data size to %d\n",
+ __func__, qos->data_size.value);
+ }
+#else /* Use method described in section 6.6.11 of IrLAP */
+ while (irlap_requested_line_capacity(qos) > line_capacity) {
+ IRDA_ASSERT(index != 0, return;);
+
+ /* Must be able to send at least one frame */
+ if (qos->window_size.value > 1) {
+ qos->window_size.value--;
+ pr_debug("%s(), reducing window size to %d\n",
+ __func__, qos->window_size.value);
+ } else if (index > 1) {
+ qos->data_size.value = data_sizes[index--];
+ pr_debug("%s(), reducing data size to %d\n",
+ __func__, qos->data_size.value);
+ } else {
+ net_warn_ratelimited("%s(), nothing more we can do!\n",
+ __func__);
+ }
+ }
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+ /*
+ * Fix tx data size according to user limits - Jean II
+ */
+ if (qos->data_size.value > sysctl_max_tx_data_size)
+ /* Allow non discrete adjustement to avoid losing capacity */
+ qos->data_size.value = sysctl_max_tx_data_size;
+ /*
+ * Override Tx window if user request it. - Jean II
+ */
+ if (qos->window_size.value > sysctl_max_tx_window)
+ qos->window_size.value = sysctl_max_tx_window;
+}
+
+/*
+ * Function irlap_negotiate (qos_device, qos_session, skb)
+ *
+ * Negotiate QoS values, not really that much negotiation :-)
+ * We just set the QoS capabilities for the peer station
+ *
+ */
+int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = irda_param_extract_all(self, skb->data, skb->len,
+ &irlap_param_info);
+
+ /* Convert the negotiated bits to values */
+ irda_qos_bits_to_value(&self->qos_tx);
+ irda_qos_bits_to_value(&self->qos_rx);
+
+ irlap_adjust_qos_settings(&self->qos_tx);
+
+ pr_debug("Setting BAUD_RATE to %d bps.\n",
+ self->qos_tx.baud_rate.value);
+ pr_debug("Setting DATA_SIZE to %d bytes\n",
+ self->qos_tx.data_size.value);
+ pr_debug("Setting WINDOW_SIZE to %d\n",
+ self->qos_tx.window_size.value);
+ pr_debug("Setting XBOFS to %d\n",
+ self->qos_tx.additional_bofs.value);
+ pr_debug("Setting MAX_TURN_TIME to %d ms.\n",
+ self->qos_tx.max_turn_time.value);
+ pr_debug("Setting MIN_TURN_TIME to %d usecs.\n",
+ self->qos_tx.min_turn_time.value);
+ pr_debug("Setting LINK_DISC to %d secs.\n",
+ self->qos_tx.link_disc_time.value);
+ return ret;
+}
+
+/*
+ * Function irlap_insert_negotiation_params (qos, fp)
+ *
+ * Insert QoS negotiaion pararameters into frame
+ *
+ */
+int irlap_insert_qos_negotiation_params(struct irlap_cb *self,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ /* Insert data rate */
+ ret = irda_param_insert(self, PI_BAUD_RATE, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert max turnaround time */
+ ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert data size */
+ ret = irda_param_insert(self, PI_DATA_SIZE, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert window size */
+ ret = irda_param_insert(self, PI_WINDOW_SIZE, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert additional BOFs */
+ ret = irda_param_insert(self, PI_ADD_BOFS, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert minimum turnaround time */
+ ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ /* Insert link disconnect/threshold time */
+ ret = irda_param_insert(self, PI_LINK_DISC, skb_tail_pointer(skb),
+ skb_tailroom(skb), &irlap_param_info);
+ if (ret < 0)
+ return ret;
+ skb_put(skb, ret);
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_baud_rate (instance, param, get)
+ *
+ * Negotiate data-rate
+ *
+ */
+static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get)
+{
+ __u16 final;
+
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get) {
+ param->pv.i = self->qos_rx.baud_rate.bits;
+ pr_debug("%s(), baud rate = 0x%02x\n",
+ __func__, param->pv.i);
+ } else {
+ /*
+ * Stations must agree on baud rate, so calculate
+ * intersection
+ */
+ pr_debug("Requested BAUD_RATE: 0x%04x\n", (__u16)param->pv.i);
+ final = (__u16) param->pv.i & self->qos_rx.baud_rate.bits;
+
+ pr_debug("Final BAUD_RATE: 0x%04x\n", final);
+ self->qos_tx.baud_rate.bits = final;
+ self->qos_rx.baud_rate.bits = final;
+ }
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_link_disconnect (instance, param, get)
+ *
+ * Negotiate link disconnect/threshold time.
+ *
+ */
+static int irlap_param_link_disconnect(void *instance, irda_param_t *param,
+ int get)
+{
+ __u16 final;
+
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.link_disc_time.bits;
+ else {
+ /*
+ * Stations must agree on link disconnect/threshold
+ * time.
+ */
+ pr_debug("LINK_DISC: %02x\n", (__u8)param->pv.i);
+ final = (__u8) param->pv.i & self->qos_rx.link_disc_time.bits;
+
+ pr_debug("Final LINK_DISC: %02x\n", final);
+ self->qos_tx.link_disc_time.bits = final;
+ self->qos_rx.link_disc_time.bits = final;
+ }
+ return 0;
+}
+
+/*
+ * Function irlap_param_max_turn_time (instance, param, get)
+ *
+ * Negotiate the maximum turnaround time. This is a type 1 parameter and
+ * will be negotiated independently for each station
+ *
+ */
+static int irlap_param_max_turn_time(void *instance, irda_param_t *param,
+ int get)
+{
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.max_turn_time.bits;
+ else
+ self->qos_tx.max_turn_time.bits = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_data_size (instance, param, get)
+ *
+ * Negotiate the data size. This is a type 1 parameter and
+ * will be negotiated independently for each station
+ *
+ */
+static int irlap_param_data_size(void *instance, irda_param_t *param, int get)
+{
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.data_size.bits;
+ else
+ self->qos_tx.data_size.bits = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_window_size (instance, param, get)
+ *
+ * Negotiate the window size. This is a type 1 parameter and
+ * will be negotiated independently for each station
+ *
+ */
+static int irlap_param_window_size(void *instance, irda_param_t *param,
+ int get)
+{
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.window_size.bits;
+ else
+ self->qos_tx.window_size.bits = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_additional_bofs (instance, param, get)
+ *
+ * Negotiate additional BOF characters. This is a type 1 parameter and
+ * will be negotiated independently for each station.
+ */
+static int irlap_param_additional_bofs(void *instance, irda_param_t *param, int get)
+{
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.additional_bofs.bits;
+ else
+ self->qos_tx.additional_bofs.bits = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function irlap_param_min_turn_time (instance, param, get)
+ *
+ * Negotiate the minimum turn around time. This is a type 1 parameter and
+ * will be negotiated independently for each station
+ */
+static int irlap_param_min_turn_time(void *instance, irda_param_t *param,
+ int get)
+{
+ struct irlap_cb *self = (struct irlap_cb *) instance;
+
+ IRDA_ASSERT(self != NULL, return -1;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+ if (get)
+ param->pv.i = self->qos_rx.min_turn_time.bits;
+ else
+ self->qos_tx.min_turn_time.bits = (__u8) param->pv.i;
+
+ return 0;
+}
+
+/*
+ * Function irlap_max_line_capacity (speed, max_turn_time, min_turn_time)
+ *
+ * Calculate the maximum line capacity
+ *
+ */
+__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time)
+{
+ __u32 line_capacity;
+ int i,j;
+
+ pr_debug("%s(), speed=%d, max_turn_time=%d\n",
+ __func__, speed, max_turn_time);
+
+ i = value_index(speed, baud_rates, 10);
+ j = value_index(max_turn_time, max_turn_times, 4);
+
+ IRDA_ASSERT(((i >=0) && (i <10)), return 0;);
+ IRDA_ASSERT(((j >=0) && (j <4)), return 0;);
+
+ line_capacity = max_line_capacities[i][j];
+
+ pr_debug("%s(), line capacity=%d bytes\n",
+ __func__, line_capacity);
+
+ return line_capacity;
+}
+
+#ifndef CONFIG_IRDA_DYNAMIC_WINDOW
+static __u32 irlap_requested_line_capacity(struct qos_info *qos)
+{
+ __u32 line_capacity;
+
+ line_capacity = qos->window_size.value *
+ (qos->data_size.value + 6 + qos->additional_bofs.value) +
+ irlap_min_turn_time_in_bytes(qos->baud_rate.value,
+ qos->min_turn_time.value);
+
+ pr_debug("%s(), requested line capacity=%d\n",
+ __func__, line_capacity);
+
+ return line_capacity;
+}
+#endif
+
+void irda_qos_bits_to_value(struct qos_info *qos)
+{
+ int index;
+
+ IRDA_ASSERT(qos != NULL, return;);
+
+ index = msb_index(qos->baud_rate.bits);
+ qos->baud_rate.value = baud_rates[index];
+
+ index = msb_index(qos->data_size.bits);
+ qos->data_size.value = data_sizes[index];
+
+ index = msb_index(qos->window_size.bits);
+ qos->window_size.value = index+1;
+
+ index = msb_index(qos->min_turn_time.bits);
+ qos->min_turn_time.value = min_turn_times[index];
+
+ index = msb_index(qos->max_turn_time.bits);
+ qos->max_turn_time.value = max_turn_times[index];
+
+ index = msb_index(qos->link_disc_time.bits);
+ qos->link_disc_time.value = link_disc_times[index];
+
+ index = msb_index(qos->additional_bofs.bits);
+ qos->additional_bofs.value = add_bofs[index];
+}
+EXPORT_SYMBOL(irda_qos_bits_to_value);
diff --git a/net/irda/timer.c b/net/irda/timer.c
new file mode 100644
index 000000000..0c4c115a5
--- /dev/null
+++ b/net/irda/timer.c
@@ -0,0 +1,231 @@
+/*********************************************************************
+ *
+ * Filename: timer.c
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Sat Aug 16 00:59:29 1997
+ * Modified at: Wed Dec 8 12:50:34 1999
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ *
+ * Copyright (c) 1997, 1999 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2002 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/delay.h>
+
+#include <net/irda/timer.h>
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+
+extern int sysctl_slot_timeout;
+
+static void irlap_slot_timer_expired(void* data);
+static void irlap_query_timer_expired(void* data);
+static void irlap_final_timer_expired(void* data);
+static void irlap_wd_timer_expired(void* data);
+static void irlap_backoff_timer_expired(void* data);
+static void irlap_media_busy_expired(void* data);
+
+void irlap_start_slot_timer(struct irlap_cb *self, int timeout)
+{
+ irda_start_timer(&self->slot_timer, timeout, (void *) self,
+ irlap_slot_timer_expired);
+}
+
+void irlap_start_query_timer(struct irlap_cb *self, int S, int s)
+{
+ int timeout;
+
+ /* Calculate when the peer discovery should end. Normally, we
+ * get the end-of-discovery frame, so this is just in case
+ * we miss it.
+ * Basically, we multiply the number of remaining slots by our
+ * slot time, plus add some extra time to properly receive the last
+ * discovery packet (which is longer due to extra discovery info),
+ * to avoid messing with for incoming connections requests and
+ * to accommodate devices that perform discovery slower than us.
+ * Jean II */
+ timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s)
+ + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT);
+
+ /* Set or re-set the timer. We reset the timer for each received
+ * discovery query, which allow us to automatically adjust to
+ * the speed of the peer discovery (faster or slower). Jean II */
+ irda_start_timer( &self->query_timer, timeout, (void *) self,
+ irlap_query_timer_expired);
+}
+
+void irlap_start_final_timer(struct irlap_cb *self, int timeout)
+{
+ irda_start_timer(&self->final_timer, timeout, (void *) self,
+ irlap_final_timer_expired);
+}
+
+void irlap_start_wd_timer(struct irlap_cb *self, int timeout)
+{
+ irda_start_timer(&self->wd_timer, timeout, (void *) self,
+ irlap_wd_timer_expired);
+}
+
+void irlap_start_backoff_timer(struct irlap_cb *self, int timeout)
+{
+ irda_start_timer(&self->backoff_timer, timeout, (void *) self,
+ irlap_backoff_timer_expired);
+}
+
+void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout)
+{
+ irda_start_timer(&self->media_busy_timer, timeout,
+ (void *) self, irlap_media_busy_expired);
+}
+
+void irlap_stop_mbusy_timer(struct irlap_cb *self)
+{
+ /* If timer is activated, kill it! */
+ del_timer(&self->media_busy_timer);
+
+ /* If we are in NDM, there is a bunch of events in LAP that
+ * that be pending due to the media_busy condition, such as
+ * CONNECT_REQUEST and SEND_UI_FRAME. If we don't generate
+ * an event, they will wait forever...
+ * Jean II */
+ if (self->state == LAP_NDM)
+ irlap_do_event(self, MEDIA_BUSY_TIMER_EXPIRED, NULL, NULL);
+}
+
+void irlmp_start_watchdog_timer(struct lsap_cb *self, int timeout)
+{
+ irda_start_timer(&self->watchdog_timer, timeout, (void *) self,
+ irlmp_watchdog_timer_expired);
+}
+
+void irlmp_start_discovery_timer(struct irlmp_cb *self, int timeout)
+{
+ irda_start_timer(&self->discovery_timer, timeout, (void *) self,
+ irlmp_discovery_timer_expired);
+}
+
+void irlmp_start_idle_timer(struct lap_cb *self, int timeout)
+{
+ irda_start_timer(&self->idle_timer, timeout, (void *) self,
+ irlmp_idle_timer_expired);
+}
+
+void irlmp_stop_idle_timer(struct lap_cb *self)
+{
+ /* If timer is activated, kill it! */
+ del_timer(&self->idle_timer);
+}
+
+/*
+ * Function irlap_slot_timer_expired (data)
+ *
+ * IrLAP slot timer has expired
+ *
+ */
+static void irlap_slot_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, SLOT_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irlap_query_timer_expired (data)
+ *
+ * IrLAP query timer has expired
+ *
+ */
+static void irlap_query_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, QUERY_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irda_final_timer_expired (data)
+ *
+ *
+ *
+ */
+static void irlap_final_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, FINAL_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irda_wd_timer_expired (data)
+ *
+ *
+ *
+ */
+static void irlap_wd_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, WD_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irda_backoff_timer_expired (data)
+ *
+ *
+ *
+ */
+static void irlap_backoff_timer_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+ IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+ irlap_do_event(self, BACKOFF_TIMER_EXPIRED, NULL, NULL);
+}
+
+
+/*
+ * Function irtty_media_busy_expired (data)
+ *
+ *
+ */
+static void irlap_media_busy_expired(void *data)
+{
+ struct irlap_cb *self = (struct irlap_cb *) data;
+
+ IRDA_ASSERT(self != NULL, return;);
+
+ irda_device_set_media_busy(self->netdev, FALSE);
+ /* Note : the LAP event will be send in irlap_stop_mbusy_timer(),
+ * to catch other cases where the flag is cleared (for example
+ * after a discovery) - Jean II */
+}
diff --git a/net/irda/wrapper.c b/net/irda/wrapper.c
new file mode 100644
index 000000000..40a0f993b
--- /dev/null
+++ b/net/irda/wrapper.c
@@ -0,0 +1,492 @@
+/*********************************************************************
+ *
+ * Filename: wrapper.c
+ * Version: 1.2
+ * Description: IrDA SIR async wrapper layer
+ * Status: Stable
+ * Author: Dag Brattli <dagb@cs.uit.no>
+ * Created at: Mon Aug 4 20:40:53 1997
+ * Modified at: Fri Jan 28 13:21:09 2000
+ * Modified by: Dag Brattli <dagb@cs.uit.no>
+ * Modified at: Fri May 28 3:11 CST 1999
+ * Modified by: Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
+ *
+ * Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ * All Rights Reserved.
+ * Copyright (c) 2000-2002 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/wrapper.h>
+#include <net/irda/crc.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/irda_device.h>
+
+/************************** FRAME WRAPPING **************************/
+/*
+ * Unwrap and unstuff SIR frames
+ *
+ * Note : at FIR and MIR, HDLC framing is used and usually handled
+ * by the controller, so we come here only for SIR... Jean II
+ */
+
+/*
+ * Function stuff_byte (byte, buf)
+ *
+ * Byte stuff one single byte and put the result in buffer pointed to by
+ * buf. The buffer must at all times be able to have two bytes inserted.
+ *
+ * This is in a tight loop, better inline it, so need to be prior to callers.
+ * (2000 bytes on P6 200MHz, non-inlined ~370us, inline ~170us) - Jean II
+ */
+static inline int stuff_byte(__u8 byte, __u8 *buf)
+{
+ switch (byte) {
+ case BOF: /* FALLTHROUGH */
+ case EOF: /* FALLTHROUGH */
+ case CE:
+ /* Insert transparently coded */
+ buf[0] = CE; /* Send link escape */
+ buf[1] = byte^IRDA_TRANS; /* Complement bit 5 */
+ return 2;
+ /* break; */
+ default:
+ /* Non-special value, no transparency required */
+ buf[0] = byte;
+ return 1;
+ /* break; */
+ }
+}
+
+/*
+ * Function async_wrap (skb, *tx_buff, buffsize)
+ *
+ * Makes a new buffer with wrapping and stuffing, should check that
+ * we don't get tx buffer overflow.
+ */
+int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize)
+{
+ struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb;
+ int xbofs;
+ int i;
+ int n;
+ union {
+ __u16 value;
+ __u8 bytes[2];
+ } fcs;
+
+ /* Initialize variables */
+ fcs.value = INIT_FCS;
+ n = 0;
+
+ /*
+ * Send XBOF's for required min. turn time and for the negotiated
+ * additional XBOFS
+ */
+
+ if (cb->magic != LAP_MAGIC) {
+ /*
+ * This will happen for all frames sent from user-space.
+ * Nothing to worry about, but we set the default number of
+ * BOF's
+ */
+ pr_debug("%s(), wrong magic in skb!\n", __func__);
+ xbofs = 10;
+ } else
+ xbofs = cb->xbofs + cb->xbofs_delay;
+
+ pr_debug("%s(), xbofs=%d\n", __func__, xbofs);
+
+ /* Check that we never use more than 115 + 48 xbofs */
+ if (xbofs > 163) {
+ pr_debug("%s(), too many xbofs (%d)\n", __func__,
+ xbofs);
+ xbofs = 163;
+ }
+
+ memset(tx_buff + n, XBOF, xbofs);
+ n += xbofs;
+
+ /* Start of packet character BOF */
+ tx_buff[n++] = BOF;
+
+ /* Insert frame and calc CRC */
+ for (i=0; i < skb->len; i++) {
+ /*
+ * Check for the possibility of tx buffer overflow. We use
+ * bufsize-5 since the maximum number of bytes that can be
+ * transmitted after this point is 5.
+ */
+ if(n >= (buffsize-5)) {
+ net_err_ratelimited("%s(), tx buffer overflow (n=%d)\n",
+ __func__, n);
+ return n;
+ }
+
+ n += stuff_byte(skb->data[i], tx_buff+n);
+ fcs.value = irda_fcs(fcs.value, skb->data[i]);
+ }
+
+ /* Insert CRC in little endian format (LSB first) */
+ fcs.value = ~fcs.value;
+#ifdef __LITTLE_ENDIAN
+ n += stuff_byte(fcs.bytes[0], tx_buff+n);
+ n += stuff_byte(fcs.bytes[1], tx_buff+n);
+#else /* ifdef __BIG_ENDIAN */
+ n += stuff_byte(fcs.bytes[1], tx_buff+n);
+ n += stuff_byte(fcs.bytes[0], tx_buff+n);
+#endif
+ tx_buff[n++] = EOF;
+
+ return n;
+}
+EXPORT_SYMBOL(async_wrap_skb);
+
+/************************* FRAME UNWRAPPING *************************/
+/*
+ * Unwrap and unstuff SIR frames
+ *
+ * Complete rewrite by Jean II :
+ * More inline, faster, more compact, more logical. Jean II
+ * (16 bytes on P6 200MHz, old 5 to 7 us, new 4 to 6 us)
+ * (24 bytes on P6 200MHz, old 9 to 10 us, new 7 to 8 us)
+ * (for reference, 115200 b/s is 1 byte every 69 us)
+ * And reduce wrapper.o by ~900B in the process ;-)
+ *
+ * Then, we have the addition of ZeroCopy, which is optional
+ * (i.e. the driver must initiate it) and improve final processing.
+ * (2005 B frame + EOF on P6 200MHz, without 30 to 50 us, with 10 to 25 us)
+ *
+ * Note : at FIR and MIR, HDLC framing is used and usually handled
+ * by the controller, so we come here only for SIR... Jean II
+ */
+
+/*
+ * We can also choose where we want to do the CRC calculation. We can
+ * do it "inline", as we receive the bytes, or "postponed", when
+ * receiving the End-Of-Frame.
+ * (16 bytes on P6 200MHz, inlined 4 to 6 us, postponed 4 to 5 us)
+ * (24 bytes on P6 200MHz, inlined 7 to 8 us, postponed 5 to 7 us)
+ * With ZeroCopy :
+ * (2005 B frame on P6 200MHz, inlined 10 to 25 us, postponed 140 to 180 us)
+ * Without ZeroCopy :
+ * (2005 B frame on P6 200MHz, inlined 30 to 50 us, postponed 150 to 180 us)
+ * (Note : numbers taken with irq disabled)
+ *
+ * From those numbers, it's not clear which is the best strategy, because
+ * we end up running through a lot of data one way or another (i.e. cache
+ * misses). I personally prefer to avoid the huge latency spike of the
+ * "postponed" solution, because it come just at the time when we have
+ * lot's of protocol processing to do and it will hurt our ability to
+ * reach low link turnaround times... Jean II
+ */
+//#define POSTPONE_RX_CRC
+
+/*
+ * Function async_bump (buf, len, stats)
+ *
+ * Got a frame, make a copy of it, and pass it up the stack! We can try
+ * to inline it since it's only called from state_inside_frame
+ */
+static inline void
+async_bump(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff)
+{
+ struct sk_buff *newskb;
+ struct sk_buff *dataskb;
+ int docopy;
+
+ /* Check if we need to copy the data to a new skb or not.
+ * If the driver doesn't use ZeroCopy Rx, we have to do it.
+ * With ZeroCopy Rx, the rx_buff already point to a valid
+ * skb. But, if the frame is small, it is more efficient to
+ * copy it to save memory (copy will be fast anyway - that's
+ * called Rx-copy-break). Jean II */
+ docopy = ((rx_buff->skb == NULL) ||
+ (rx_buff->len < IRDA_RX_COPY_THRESHOLD));
+
+ /* Allocate a new skb */
+ newskb = dev_alloc_skb(docopy ? rx_buff->len + 1 : rx_buff->truesize);
+ if (!newskb) {
+ stats->rx_dropped++;
+ /* We could deliver the current skb if doing ZeroCopy Rx,
+ * but this would stall the Rx path. Better drop the
+ * packet... Jean II */
+ return;
+ }
+
+ /* Align IP header to 20 bytes (i.e. increase skb->data)
+ * Note this is only useful with IrLAN, as PPP has a variable
+ * header size (2 or 1 bytes) - Jean II */
+ skb_reserve(newskb, 1);
+
+ if(docopy) {
+ /* Copy data without CRC (length already checked) */
+ skb_copy_to_linear_data(newskb, rx_buff->data,
+ rx_buff->len - 2);
+ /* Deliver this skb */
+ dataskb = newskb;
+ } else {
+ /* We are using ZeroCopy. Deliver old skb */
+ dataskb = rx_buff->skb;
+ /* And hook the new skb to the rx_buff */
+ rx_buff->skb = newskb;
+ rx_buff->head = newskb->data; /* NOT newskb->head */
+ //printk(KERN_DEBUG "ZeroCopy : len = %d, dataskb = %p, newskb = %p\n", rx_buff->len, dataskb, newskb);
+ }
+
+ /* Set proper length on skb (without CRC) */
+ skb_put(dataskb, rx_buff->len - 2);
+
+ /* Feed it to IrLAP layer */
+ dataskb->dev = dev;
+ skb_reset_mac_header(dataskb);
+ dataskb->protocol = htons(ETH_P_IRDA);
+
+ netif_rx(dataskb);
+
+ stats->rx_packets++;
+ stats->rx_bytes += rx_buff->len;
+
+ /* Clean up rx_buff (redundant with async_unwrap_bof() ???) */
+ rx_buff->data = rx_buff->head;
+ rx_buff->len = 0;
+}
+
+/*
+ * Function async_unwrap_bof(dev, byte)
+ *
+ * Handle Beginning Of Frame character received within a frame
+ *
+ */
+static inline void
+async_unwrap_bof(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff, __u8 byte)
+{
+ switch(rx_buff->state) {
+ case LINK_ESCAPE:
+ case INSIDE_FRAME:
+ /* Not supposed to happen, the previous frame is not
+ * finished - Jean II */
+ pr_debug("%s(), Discarding incomplete frame\n",
+ __func__);
+ stats->rx_errors++;
+ stats->rx_missed_errors++;
+ irda_device_set_media_busy(dev, TRUE);
+ break;
+
+ case OUTSIDE_FRAME:
+ case BEGIN_FRAME:
+ default:
+ /* We may receive multiple BOF at the start of frame */
+ break;
+ }
+
+ /* Now receiving frame */
+ rx_buff->state = BEGIN_FRAME;
+ rx_buff->in_frame = TRUE;
+
+ /* Time to initialize receive buffer */
+ rx_buff->data = rx_buff->head;
+ rx_buff->len = 0;
+ rx_buff->fcs = INIT_FCS;
+}
+
+/*
+ * Function async_unwrap_eof(dev, byte)
+ *
+ * Handle End Of Frame character received within a frame
+ *
+ */
+static inline void
+async_unwrap_eof(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff, __u8 byte)
+{
+#ifdef POSTPONE_RX_CRC
+ int i;
+#endif
+
+ switch(rx_buff->state) {
+ case OUTSIDE_FRAME:
+ /* Probably missed the BOF */
+ stats->rx_errors++;
+ stats->rx_missed_errors++;
+ irda_device_set_media_busy(dev, TRUE);
+ break;
+
+ case BEGIN_FRAME:
+ case LINK_ESCAPE:
+ case INSIDE_FRAME:
+ default:
+ /* Note : in the case of BEGIN_FRAME and LINK_ESCAPE,
+ * the fcs will most likely not match and generate an
+ * error, as expected - Jean II */
+ rx_buff->state = OUTSIDE_FRAME;
+ rx_buff->in_frame = FALSE;
+
+#ifdef POSTPONE_RX_CRC
+ /* If we haven't done the CRC as we receive bytes, we
+ * must do it now... Jean II */
+ for(i = 0; i < rx_buff->len; i++)
+ rx_buff->fcs = irda_fcs(rx_buff->fcs,
+ rx_buff->data[i]);
+#endif
+
+ /* Test FCS and signal success if the frame is good */
+ if (rx_buff->fcs == GOOD_FCS) {
+ /* Deliver frame */
+ async_bump(dev, stats, rx_buff);
+ break;
+ } else {
+ /* Wrong CRC, discard frame! */
+ irda_device_set_media_busy(dev, TRUE);
+
+ pr_debug("%s(), crc error\n", __func__);
+ stats->rx_errors++;
+ stats->rx_crc_errors++;
+ }
+ break;
+ }
+}
+
+/*
+ * Function async_unwrap_ce(dev, byte)
+ *
+ * Handle Character Escape character received within a frame
+ *
+ */
+static inline void
+async_unwrap_ce(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff, __u8 byte)
+{
+ switch(rx_buff->state) {
+ case OUTSIDE_FRAME:
+ /* Activate carrier sense */
+ irda_device_set_media_busy(dev, TRUE);
+ break;
+
+ case LINK_ESCAPE:
+ net_warn_ratelimited("%s: state not defined\n", __func__);
+ break;
+
+ case BEGIN_FRAME:
+ case INSIDE_FRAME:
+ default:
+ /* Stuffed byte coming */
+ rx_buff->state = LINK_ESCAPE;
+ break;
+ }
+}
+
+/*
+ * Function async_unwrap_other(dev, byte)
+ *
+ * Handle other characters received within a frame
+ *
+ */
+static inline void
+async_unwrap_other(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff, __u8 byte)
+{
+ switch(rx_buff->state) {
+ /* This is on the critical path, case are ordered by
+ * probability (most frequent first) - Jean II */
+ case INSIDE_FRAME:
+ /* Must be the next byte of the frame */
+ if (rx_buff->len < rx_buff->truesize) {
+ rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+ rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+ } else {
+ pr_debug("%s(), Rx buffer overflow, aborting\n",
+ __func__);
+ rx_buff->state = OUTSIDE_FRAME;
+ }
+ break;
+
+ case LINK_ESCAPE:
+ /*
+ * Stuffed char, complement bit 5 of byte
+ * following CE, IrLAP p.114
+ */
+ byte ^= IRDA_TRANS;
+ if (rx_buff->len < rx_buff->truesize) {
+ rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+ rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+ rx_buff->state = INSIDE_FRAME;
+ } else {
+ pr_debug("%s(), Rx buffer overflow, aborting\n",
+ __func__);
+ rx_buff->state = OUTSIDE_FRAME;
+ }
+ break;
+
+ case OUTSIDE_FRAME:
+ /* Activate carrier sense */
+ if(byte != XBOF)
+ irda_device_set_media_busy(dev, TRUE);
+ break;
+
+ case BEGIN_FRAME:
+ default:
+ rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+ rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+ rx_buff->state = INSIDE_FRAME;
+ break;
+ }
+}
+
+/*
+ * Function async_unwrap_char (dev, rx_buff, byte)
+ *
+ * Parse and de-stuff frame received from the IrDA-port
+ *
+ * This is the main entry point for SIR drivers.
+ */
+void async_unwrap_char(struct net_device *dev,
+ struct net_device_stats *stats,
+ iobuff_t *rx_buff, __u8 byte)
+{
+ switch(byte) {
+ case CE:
+ async_unwrap_ce(dev, stats, rx_buff, byte);
+ break;
+ case BOF:
+ async_unwrap_bof(dev, stats, rx_buff, byte);
+ break;
+ case EOF:
+ async_unwrap_eof(dev, stats, rx_buff, byte);
+ break;
+ default:
+ async_unwrap_other(dev, stats, rx_buff, byte);
+ break;
+ }
+}
+EXPORT_SYMBOL(async_unwrap_char);
+
diff --git a/net/iucv/Kconfig b/net/iucv/Kconfig
new file mode 100644
index 000000000..497fbe732
--- /dev/null
+++ b/net/iucv/Kconfig
@@ -0,0 +1,17 @@
+config IUCV
+ depends on S390
+ def_tristate y if S390
+ prompt "IUCV support (S390 - z/VM only)"
+ help
+ Select this option if you want to use inter-user communication
+ under VM or VIF. If you run on z/VM, say "Y" to enable a fast
+ communication link between VM guests.
+
+config AFIUCV
+ depends on S390
+ def_tristate m if QETH_L3 || IUCV
+ prompt "AF_IUCV Socket support (S390 - z/VM and HiperSockets transport)"
+ help
+ Select this option if you want to use AF_IUCV socket applications
+ based on z/VM inter-user communication vehicle or based on
+ HiperSockets.
diff --git a/net/iucv/Makefile b/net/iucv/Makefile
new file mode 100644
index 000000000..7bfdc8532
--- /dev/null
+++ b/net/iucv/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for IUCV
+#
+
+obj-$(CONFIG_IUCV) += iucv.o
+obj-$(CONFIG_AFIUCV) += af_iucv.o
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
new file mode 100644
index 000000000..6daa52a18
--- /dev/null
+++ b/net/iucv/af_iucv.c
@@ -0,0 +1,2463 @@
+/*
+ * IUCV protocol stack for Linux on zSeries
+ *
+ * Copyright IBM Corp. 2006, 2009
+ *
+ * Author(s): Jennifer Hunt <jenhunt@us.ibm.com>
+ * Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ * PM functions:
+ * Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#define KMSG_COMPONENT "af_iucv"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+#include <asm/ebcdic.h>
+#include <asm/cpcmd.h>
+#include <linux/kmod.h>
+
+#include <net/iucv/af_iucv.h>
+
+#define VERSION "1.2"
+
+static char iucv_userid[80];
+
+static const struct proto_ops iucv_sock_ops;
+
+static struct proto iucv_proto = {
+ .name = "AF_IUCV",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct iucv_sock),
+};
+
+static struct iucv_interface *pr_iucv;
+
+/* special AF_IUCV IPRM messages */
+static const u8 iprm_shutdown[8] =
+ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
+
+#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class))
+
+#define __iucv_sock_wait(sk, condition, timeo, ret) \
+do { \
+ DEFINE_WAIT(__wait); \
+ long __timeo = timeo; \
+ ret = 0; \
+ prepare_to_wait(sk_sleep(sk), &__wait, TASK_INTERRUPTIBLE); \
+ while (!(condition)) { \
+ if (!__timeo) { \
+ ret = -EAGAIN; \
+ break; \
+ } \
+ if (signal_pending(current)) { \
+ ret = sock_intr_errno(__timeo); \
+ break; \
+ } \
+ release_sock(sk); \
+ __timeo = schedule_timeout(__timeo); \
+ lock_sock(sk); \
+ ret = sock_error(sk); \
+ if (ret) \
+ break; \
+ } \
+ finish_wait(sk_sleep(sk), &__wait); \
+} while (0)
+
+#define iucv_sock_wait(sk, condition, timeo) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ __iucv_sock_wait(sk, condition, timeo, __ret); \
+ __ret; \
+})
+
+static void iucv_sock_kill(struct sock *sk);
+static void iucv_sock_close(struct sock *sk);
+static void iucv_sever_path(struct sock *, int);
+
+static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
+ struct sk_buff *skb, u8 flags);
+static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify);
+
+/* Call Back functions */
+static void iucv_callback_rx(struct iucv_path *, struct iucv_message *);
+static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *);
+static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]);
+static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8],
+ u8 ipuser[16]);
+static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]);
+static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]);
+
+static struct iucv_sock_list iucv_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock),
+ .autobind_name = ATOMIC_INIT(0)
+};
+
+static struct iucv_handler af_iucv_handler = {
+ .path_pending = iucv_callback_connreq,
+ .path_complete = iucv_callback_connack,
+ .path_severed = iucv_callback_connrej,
+ .message_pending = iucv_callback_rx,
+ .message_complete = iucv_callback_txdone,
+ .path_quiesced = iucv_callback_shutdown,
+};
+
+static inline void high_nmcpy(unsigned char *dst, char *src)
+{
+ memcpy(dst, src, 8);
+}
+
+static inline void low_nmcpy(unsigned char *dst, char *src)
+{
+ memcpy(&dst[8], src, 8);
+}
+
+static int afiucv_pm_prepare(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "afiucv_pm_prepare\n");
+#endif
+ return 0;
+}
+
+static void afiucv_pm_complete(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "afiucv_pm_complete\n");
+#endif
+}
+
+/**
+ * afiucv_pm_freeze() - Freeze PM callback
+ * @dev: AFIUCV dummy device
+ *
+ * Sever all established IUCV communication pathes
+ */
+static int afiucv_pm_freeze(struct device *dev)
+{
+ struct iucv_sock *iucv;
+ struct sock *sk;
+ int err = 0;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "afiucv_pm_freeze\n");
+#endif
+ read_lock(&iucv_sk_list.lock);
+ sk_for_each(sk, &iucv_sk_list.head) {
+ iucv = iucv_sk(sk);
+ switch (sk->sk_state) {
+ case IUCV_DISCONN:
+ case IUCV_CLOSING:
+ case IUCV_CONNECTED:
+ iucv_sever_path(sk, 0);
+ break;
+ case IUCV_OPEN:
+ case IUCV_BOUND:
+ case IUCV_LISTEN:
+ case IUCV_CLOSED:
+ default:
+ break;
+ }
+ skb_queue_purge(&iucv->send_skb_q);
+ skb_queue_purge(&iucv->backlog_skb_q);
+ }
+ read_unlock(&iucv_sk_list.lock);
+ return err;
+}
+
+/**
+ * afiucv_pm_restore_thaw() - Thaw and restore PM callback
+ * @dev: AFIUCV dummy device
+ *
+ * socket clean up after freeze
+ */
+static int afiucv_pm_restore_thaw(struct device *dev)
+{
+ struct sock *sk;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "afiucv_pm_restore_thaw\n");
+#endif
+ read_lock(&iucv_sk_list.lock);
+ sk_for_each(sk, &iucv_sk_list.head) {
+ switch (sk->sk_state) {
+ case IUCV_CONNECTED:
+ sk->sk_err = EPIPE;
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ break;
+ case IUCV_DISCONN:
+ case IUCV_CLOSING:
+ case IUCV_LISTEN:
+ case IUCV_BOUND:
+ case IUCV_OPEN:
+ default:
+ break;
+ }
+ }
+ read_unlock(&iucv_sk_list.lock);
+ return 0;
+}
+
+static const struct dev_pm_ops afiucv_pm_ops = {
+ .prepare = afiucv_pm_prepare,
+ .complete = afiucv_pm_complete,
+ .freeze = afiucv_pm_freeze,
+ .thaw = afiucv_pm_restore_thaw,
+ .restore = afiucv_pm_restore_thaw,
+};
+
+static struct device_driver af_iucv_driver = {
+ .owner = THIS_MODULE,
+ .name = "afiucv",
+ .bus = NULL,
+ .pm = &afiucv_pm_ops,
+};
+
+/* dummy device used as trigger for PM functions */
+static struct device *af_iucv_dev;
+
+/**
+ * iucv_msg_length() - Returns the length of an iucv message.
+ * @msg: Pointer to struct iucv_message, MUST NOT be NULL
+ *
+ * The function returns the length of the specified iucv message @msg of data
+ * stored in a buffer and of data stored in the parameter list (PRMDATA).
+ *
+ * For IUCV_IPRMDATA, AF_IUCV uses the following convention to transport socket
+ * data:
+ * PRMDATA[0..6] socket data (max 7 bytes);
+ * PRMDATA[7] socket data length value (len is 0xff - PRMDATA[7])
+ *
+ * The socket data length is computed by subtracting the socket data length
+ * value from 0xFF.
+ * If the socket data len is greater 7, then PRMDATA can be used for special
+ * notifications (see iucv_sock_shutdown); and further,
+ * if the socket data len is > 7, the function returns 8.
+ *
+ * Use this function to allocate socket buffers to store iucv message data.
+ */
+static inline size_t iucv_msg_length(struct iucv_message *msg)
+{
+ size_t datalen;
+
+ if (msg->flags & IUCV_IPRMDATA) {
+ datalen = 0xff - msg->rmmsg[7];
+ return (datalen < 8) ? datalen : 8;
+ }
+ return msg->length;
+}
+
+/**
+ * iucv_sock_in_state() - check for specific states
+ * @sk: sock structure
+ * @state: first iucv sk state
+ * @state: second iucv sk state
+ *
+ * Returns true if the socket in either in the first or second state.
+ */
+static int iucv_sock_in_state(struct sock *sk, int state, int state2)
+{
+ return (sk->sk_state == state || sk->sk_state == state2);
+}
+
+/**
+ * iucv_below_msglim() - function to check if messages can be sent
+ * @sk: sock structure
+ *
+ * Returns true if the send queue length is lower than the message limit.
+ * Always returns true if the socket is not connected (no iucv path for
+ * checking the message limit).
+ */
+static inline int iucv_below_msglim(struct sock *sk)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+
+ if (sk->sk_state != IUCV_CONNECTED)
+ return 1;
+ if (iucv->transport == AF_IUCV_TRANS_IUCV)
+ return (skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim);
+ else
+ return ((atomic_read(&iucv->msg_sent) < iucv->msglimit_peer) &&
+ (atomic_read(&iucv->pendings) <= 0));
+}
+
+/**
+ * iucv_sock_wake_msglim() - Wake up thread waiting on msg limit
+ */
+static void iucv_sock_wake_msglim(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+}
+
+/**
+ * afiucv_hs_send() - send a message through HiperSockets transport
+ */
+static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
+ struct sk_buff *skb, u8 flags)
+{
+ struct iucv_sock *iucv = iucv_sk(sock);
+ struct af_iucv_trans_hdr *phs_hdr;
+ struct sk_buff *nskb;
+ int err, confirm_recv = 0;
+
+ memset(skb->head, 0, ETH_HLEN);
+ phs_hdr = (struct af_iucv_trans_hdr *)skb_push(skb,
+ sizeof(struct af_iucv_trans_hdr));
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr));
+
+ phs_hdr->magic = ETH_P_AF_IUCV;
+ phs_hdr->version = 1;
+ phs_hdr->flags = flags;
+ if (flags == AF_IUCV_FLAG_SYN)
+ phs_hdr->window = iucv->msglimit;
+ else if ((flags == AF_IUCV_FLAG_WIN) || !flags) {
+ confirm_recv = atomic_read(&iucv->msg_recv);
+ phs_hdr->window = confirm_recv;
+ if (confirm_recv)
+ phs_hdr->flags = phs_hdr->flags | AF_IUCV_FLAG_WIN;
+ }
+ memcpy(phs_hdr->destUserID, iucv->dst_user_id, 8);
+ memcpy(phs_hdr->destAppName, iucv->dst_name, 8);
+ memcpy(phs_hdr->srcUserID, iucv->src_user_id, 8);
+ memcpy(phs_hdr->srcAppName, iucv->src_name, 8);
+ ASCEBC(phs_hdr->destUserID, sizeof(phs_hdr->destUserID));
+ ASCEBC(phs_hdr->destAppName, sizeof(phs_hdr->destAppName));
+ ASCEBC(phs_hdr->srcUserID, sizeof(phs_hdr->srcUserID));
+ ASCEBC(phs_hdr->srcAppName, sizeof(phs_hdr->srcAppName));
+ if (imsg)
+ memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message));
+
+ skb->dev = iucv->hs_dev;
+ if (!skb->dev)
+ return -ENODEV;
+ if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev))
+ return -ENETDOWN;
+ if (skb->len > skb->dev->mtu) {
+ if (sock->sk_type == SOCK_SEQPACKET)
+ return -EMSGSIZE;
+ else
+ skb_trim(skb, skb->dev->mtu);
+ }
+ skb->protocol = ETH_P_AF_IUCV;
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+ skb_queue_tail(&iucv->send_skb_q, nskb);
+ err = dev_queue_xmit(skb);
+ if (net_xmit_eval(err)) {
+ skb_unlink(nskb, &iucv->send_skb_q);
+ kfree_skb(nskb);
+ } else {
+ atomic_sub(confirm_recv, &iucv->msg_recv);
+ WARN_ON(atomic_read(&iucv->msg_recv) < 0);
+ }
+ return net_xmit_eval(err);
+}
+
+static struct sock *__iucv_get_sock_by_name(char *nm)
+{
+ struct sock *sk;
+
+ sk_for_each(sk, &iucv_sk_list.head)
+ if (!memcmp(&iucv_sk(sk)->src_name, nm, 8))
+ return sk;
+
+ return NULL;
+}
+
+static void iucv_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_error_queue);
+
+ sk_mem_reclaim(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive iucv socket %p\n", sk);
+ return;
+ }
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+}
+
+/* Cleanup Listen */
+static void iucv_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = iucv_accept_dequeue(parent, NULL))) {
+ iucv_sock_close(sk);
+ iucv_sock_kill(sk);
+ }
+
+ parent->sk_state = IUCV_CLOSED;
+}
+
+/* Kill socket (only if zapped and orphaned) */
+static void iucv_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ return;
+
+ iucv_sock_unlink(&iucv_sk_list, sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+
+/* Terminate an IUCV path */
+static void iucv_sever_path(struct sock *sk, int with_user_data)
+{
+ unsigned char user_data[16];
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct iucv_path *path = iucv->path;
+
+ if (iucv->path) {
+ iucv->path = NULL;
+ if (with_user_data) {
+ low_nmcpy(user_data, iucv->src_name);
+ high_nmcpy(user_data, iucv->dst_name);
+ ASCEBC(user_data, sizeof(user_data));
+ pr_iucv->path_sever(path, user_data);
+ } else
+ pr_iucv->path_sever(path, NULL);
+ iucv_path_free(path);
+ }
+}
+
+/* Send FIN through an IUCV socket for HIPER transport */
+static int iucv_send_ctrl(struct sock *sk, u8 flags)
+{
+ int err = 0;
+ int blen;
+ struct sk_buff *skb;
+
+ blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+ skb = sock_alloc_send_skb(sk, blen, 1, &err);
+ if (skb) {
+ skb_reserve(skb, blen);
+ err = afiucv_hs_send(NULL, sk, skb, flags);
+ }
+ return err;
+}
+
+/* Close an IUCV socket */
+static void iucv_sock_close(struct sock *sk)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+ unsigned long timeo;
+ int err = 0;
+
+ lock_sock(sk);
+
+ switch (sk->sk_state) {
+ case IUCV_LISTEN:
+ iucv_sock_cleanup_listen(sk);
+ break;
+
+ case IUCV_CONNECTED:
+ if (iucv->transport == AF_IUCV_TRANS_HIPER) {
+ err = iucv_send_ctrl(sk, AF_IUCV_FLAG_FIN);
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ }
+ case IUCV_DISCONN: /* fall through */
+ sk->sk_state = IUCV_CLOSING;
+ sk->sk_state_change(sk);
+
+ if (!err && !skb_queue_empty(&iucv->send_skb_q)) {
+ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+ timeo = sk->sk_lingertime;
+ else
+ timeo = IUCV_DISCONN_TIMEOUT;
+ iucv_sock_wait(sk,
+ iucv_sock_in_state(sk, IUCV_CLOSED, 0),
+ timeo);
+ }
+
+ case IUCV_CLOSING: /* fall through */
+ sk->sk_state = IUCV_CLOSED;
+ sk->sk_state_change(sk);
+
+ sk->sk_err = ECONNRESET;
+ sk->sk_state_change(sk);
+
+ skb_queue_purge(&iucv->send_skb_q);
+ skb_queue_purge(&iucv->backlog_skb_q);
+
+ default: /* fall through */
+ iucv_sever_path(sk, 1);
+ }
+
+ if (iucv->hs_dev) {
+ dev_put(iucv->hs_dev);
+ iucv->hs_dev = NULL;
+ sk->sk_bound_dev_if = 0;
+ }
+
+ /* mark socket for deletion by iucv_sock_kill() */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
+ release_sock(sk);
+}
+
+static void iucv_sock_init(struct sock *sk, struct sock *parent)
+{
+ if (parent)
+ sk->sk_type = parent->sk_type;
+}
+
+static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+{
+ struct sock *sk;
+ struct iucv_sock *iucv;
+
+ sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto);
+ if (!sk)
+ return NULL;
+ iucv = iucv_sk(sk);
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&iucv->accept_q);
+ spin_lock_init(&iucv->accept_q_lock);
+ skb_queue_head_init(&iucv->send_skb_q);
+ INIT_LIST_HEAD(&iucv->message_q.list);
+ spin_lock_init(&iucv->message_q.lock);
+ skb_queue_head_init(&iucv->backlog_skb_q);
+ iucv->send_tag = 0;
+ atomic_set(&iucv->pendings, 0);
+ iucv->flags = 0;
+ iucv->msglimit = 0;
+ atomic_set(&iucv->msg_sent, 0);
+ atomic_set(&iucv->msg_recv, 0);
+ iucv->path = NULL;
+ iucv->sk_txnotify = afiucv_hs_callback_txnotify;
+ memset(&iucv->src_user_id , 0, 32);
+ if (pr_iucv)
+ iucv->transport = AF_IUCV_TRANS_IUCV;
+ else
+ iucv->transport = AF_IUCV_TRANS_HIPER;
+
+ sk->sk_destruct = iucv_sock_destruct;
+ sk->sk_sndtimeo = IUCV_CONN_TIMEOUT;
+ sk->sk_allocation = GFP_DMA;
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = IUCV_OPEN;
+
+ iucv_sock_link(&iucv_sk_list, sk);
+ return sk;
+}
+
+/* Create an IUCV socket */
+static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ if (protocol && protocol != PF_IUCV)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ sock->ops = &iucv_sock_ops;
+ break;
+ case SOCK_SEQPACKET:
+ /* currently, proto ops can handle both sk types */
+ sock->ops = &iucv_sock_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL);
+ if (!sk)
+ return -ENOMEM;
+
+ iucv_sock_init(sk, NULL);
+
+ return 0;
+}
+
+void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk)
+{
+ write_lock_bh(&l->lock);
+ sk_add_node(sk, &l->head);
+ write_unlock_bh(&l->lock);
+}
+
+void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk)
+{
+ write_lock_bh(&l->lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l->lock);
+}
+
+void iucv_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ unsigned long flags;
+ struct iucv_sock *par = iucv_sk(parent);
+
+ sock_hold(sk);
+ spin_lock_irqsave(&par->accept_q_lock, flags);
+ list_add_tail(&iucv_sk(sk)->accept_q, &par->accept_q);
+ spin_unlock_irqrestore(&par->accept_q_lock, flags);
+ iucv_sk(sk)->parent = parent;
+ sk_acceptq_added(parent);
+}
+
+void iucv_accept_unlink(struct sock *sk)
+{
+ unsigned long flags;
+ struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent);
+
+ spin_lock_irqsave(&par->accept_q_lock, flags);
+ list_del_init(&iucv_sk(sk)->accept_q);
+ spin_unlock_irqrestore(&par->accept_q_lock, flags);
+ sk_acceptq_removed(iucv_sk(sk)->parent);
+ iucv_sk(sk)->parent = NULL;
+ sock_put(sk);
+}
+
+struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock)
+{
+ struct iucv_sock *isk, *n;
+ struct sock *sk;
+
+ list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *) isk;
+ lock_sock(sk);
+
+ if (sk->sk_state == IUCV_CLOSED) {
+ iucv_accept_unlink(sk);
+ release_sock(sk);
+ continue;
+ }
+
+ if (sk->sk_state == IUCV_CONNECTED ||
+ sk->sk_state == IUCV_DISCONN ||
+ !newsock) {
+ iucv_accept_unlink(sk);
+ if (newsock)
+ sock_graft(sk, newsock);
+
+ release_sock(sk);
+ return sk;
+ }
+
+ release_sock(sk);
+ }
+ return NULL;
+}
+
+static void __iucv_auto_name(struct iucv_sock *iucv)
+{
+ char name[12];
+
+ sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name));
+ while (__iucv_get_sock_by_name(name)) {
+ sprintf(name, "%08x",
+ atomic_inc_return(&iucv_sk_list.autobind_name));
+ }
+ memcpy(iucv->src_name, name, 8);
+}
+
+/* Bind an unbound socket */
+static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
+ int addr_len)
+{
+ struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv;
+ int err = 0;
+ struct net_device *dev;
+ char uid[9];
+
+ /* Verify the input sockaddr */
+ if (!addr || addr->sa_family != AF_IUCV)
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sk->sk_state != IUCV_OPEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ write_lock_bh(&iucv_sk_list.lock);
+
+ iucv = iucv_sk(sk);
+ if (__iucv_get_sock_by_name(sa->siucv_name)) {
+ err = -EADDRINUSE;
+ goto done_unlock;
+ }
+ if (iucv->path)
+ goto done_unlock;
+
+ /* Bind the socket */
+ if (pr_iucv)
+ if (!memcmp(sa->siucv_user_id, iucv_userid, 8))
+ goto vm_bind; /* VM IUCV transport */
+
+ /* try hiper transport */
+ memcpy(uid, sa->siucv_user_id, sizeof(uid));
+ ASCEBC(uid, 8);
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (!memcmp(dev->perm_addr, uid, 8)) {
+ memcpy(iucv->src_user_id, sa->siucv_user_id, 8);
+ /* Check for unitialized siucv_name */
+ if (strncmp(sa->siucv_name, " ", 8) == 0)
+ __iucv_auto_name(iucv);
+ else
+ memcpy(iucv->src_name, sa->siucv_name, 8);
+ sk->sk_bound_dev_if = dev->ifindex;
+ iucv->hs_dev = dev;
+ dev_hold(dev);
+ sk->sk_state = IUCV_BOUND;
+ iucv->transport = AF_IUCV_TRANS_HIPER;
+ if (!iucv->msglimit)
+ iucv->msglimit = IUCV_HIPER_MSGLIM_DEFAULT;
+ rcu_read_unlock();
+ goto done_unlock;
+ }
+ }
+ rcu_read_unlock();
+vm_bind:
+ if (pr_iucv) {
+ /* use local userid for backward compat */
+ memcpy(iucv->src_name, sa->siucv_name, 8);
+ memcpy(iucv->src_user_id, iucv_userid, 8);
+ sk->sk_state = IUCV_BOUND;
+ iucv->transport = AF_IUCV_TRANS_IUCV;
+ if (!iucv->msglimit)
+ iucv->msglimit = IUCV_QUEUELEN_DEFAULT;
+ goto done_unlock;
+ }
+ /* found no dev to bind */
+ err = -ENODEV;
+done_unlock:
+ /* Release the socket list lock */
+ write_unlock_bh(&iucv_sk_list.lock);
+done:
+ release_sock(sk);
+ return err;
+}
+
+/* Automatically bind an unbound socket */
+static int iucv_sock_autobind(struct sock *sk)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+ int err = 0;
+
+ if (unlikely(!pr_iucv))
+ return -EPROTO;
+
+ memcpy(iucv->src_user_id, iucv_userid, 8);
+
+ write_lock_bh(&iucv_sk_list.lock);
+ __iucv_auto_name(iucv);
+ write_unlock_bh(&iucv_sk_list.lock);
+
+ if (!iucv->msglimit)
+ iucv->msglimit = IUCV_QUEUELEN_DEFAULT;
+
+ return err;
+}
+
+static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr)
+{
+ struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ unsigned char user_data[16];
+ int err;
+
+ high_nmcpy(user_data, sa->siucv_name);
+ low_nmcpy(user_data, iucv->src_name);
+ ASCEBC(user_data, sizeof(user_data));
+
+ /* Create path. */
+ iucv->path = iucv_path_alloc(iucv->msglimit,
+ IUCV_IPRMDATA, GFP_KERNEL);
+ if (!iucv->path) {
+ err = -ENOMEM;
+ goto done;
+ }
+ err = pr_iucv->path_connect(iucv->path, &af_iucv_handler,
+ sa->siucv_user_id, NULL, user_data,
+ sk);
+ if (err) {
+ iucv_path_free(iucv->path);
+ iucv->path = NULL;
+ switch (err) {
+ case 0x0b: /* Target communicator is not logged on */
+ err = -ENETUNREACH;
+ break;
+ case 0x0d: /* Max connections for this guest exceeded */
+ case 0x0e: /* Max connections for target guest exceeded */
+ err = -EAGAIN;
+ break;
+ case 0x0f: /* Missing IUCV authorization */
+ err = -EACCES;
+ break;
+ default:
+ err = -ECONNREFUSED;
+ break;
+ }
+ }
+done:
+ return err;
+}
+
+/* Connect an unconnected socket */
+static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ int err;
+
+ if (addr->sa_family != AF_IUCV || alen < sizeof(struct sockaddr_iucv))
+ return -EINVAL;
+
+ if (sk->sk_state != IUCV_OPEN && sk->sk_state != IUCV_BOUND)
+ return -EBADFD;
+
+ if (sk->sk_state == IUCV_OPEN &&
+ iucv->transport == AF_IUCV_TRANS_HIPER)
+ return -EBADFD; /* explicit bind required */
+
+ if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_SEQPACKET)
+ return -EINVAL;
+
+ if (sk->sk_state == IUCV_OPEN) {
+ err = iucv_sock_autobind(sk);
+ if (unlikely(err))
+ return err;
+ }
+
+ lock_sock(sk);
+
+ /* Set the destination information */
+ memcpy(iucv->dst_user_id, sa->siucv_user_id, 8);
+ memcpy(iucv->dst_name, sa->siucv_name, 8);
+
+ if (iucv->transport == AF_IUCV_TRANS_HIPER)
+ err = iucv_send_ctrl(sock->sk, AF_IUCV_FLAG_SYN);
+ else
+ err = afiucv_path_connect(sock, addr);
+ if (err)
+ goto done;
+
+ if (sk->sk_state != IUCV_CONNECTED)
+ err = iucv_sock_wait(sk, iucv_sock_in_state(sk, IUCV_CONNECTED,
+ IUCV_DISCONN),
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+ if (sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_CLOSED)
+ err = -ECONNREFUSED;
+
+ if (err && iucv->transport == AF_IUCV_TRANS_IUCV)
+ iucv_sever_path(sk, 0);
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+/* Move a socket into listening state. */
+static int iucv_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sk->sk_state != IUCV_BOUND)
+ goto done;
+
+ if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+ goto done;
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = IUCV_LISTEN;
+ err = 0;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+/* Accept a pending connection */
+static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
+ int flags)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sock *sk = sock->sk, *nsk;
+ long timeo;
+ int err = 0;
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ if (sk->sk_state != IUCV_LISTEN) {
+ err = -EBADFD;
+ goto done;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* Wait for an incoming connection */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = iucv_accept_dequeue(sk, newsock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ err = -EAGAIN;
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ if (sk->sk_state != IUCV_LISTEN) {
+ err = -EBADFD;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (err)
+ goto done;
+
+ newsock->state = SS_CONNECTED;
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
+{
+ struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr;
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+
+ addr->sa_family = AF_IUCV;
+ *len = sizeof(struct sockaddr_iucv);
+
+ if (peer) {
+ memcpy(siucv->siucv_user_id, iucv->dst_user_id, 8);
+ memcpy(siucv->siucv_name, iucv->dst_name, 8);
+ } else {
+ memcpy(siucv->siucv_user_id, iucv->src_user_id, 8);
+ memcpy(siucv->siucv_name, iucv->src_name, 8);
+ }
+ memset(&siucv->siucv_port, 0, sizeof(siucv->siucv_port));
+ memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr));
+ memset(&siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid));
+
+ return 0;
+}
+
+/**
+ * iucv_send_iprm() - Send socket data in parameter list of an iucv message.
+ * @path: IUCV path
+ * @msg: Pointer to a struct iucv_message
+ * @skb: The socket data to send, skb->len MUST BE <= 7
+ *
+ * Send the socket data in the parameter list in the iucv message
+ * (IUCV_IPRMDATA). The socket data is stored at index 0 to 6 in the parameter
+ * list and the socket data len at index 7 (last byte).
+ * See also iucv_msg_length().
+ *
+ * Returns the error code from the iucv_message_send() call.
+ */
+static int iucv_send_iprm(struct iucv_path *path, struct iucv_message *msg,
+ struct sk_buff *skb)
+{
+ u8 prmdata[8];
+
+ memcpy(prmdata, (void *) skb->data, skb->len);
+ prmdata[7] = 0xff - (u8) skb->len;
+ return pr_iucv->message_send(path, msg, IUCV_IPRMDATA, 0,
+ (void *) prmdata, 8);
+}
+
+static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct sk_buff *skb;
+ struct iucv_message txmsg;
+ struct cmsghdr *cmsg;
+ int cmsg_done;
+ long timeo;
+ char user_id[9];
+ char appl_id[9];
+ int err;
+ int noblock = msg->msg_flags & MSG_DONTWAIT;
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /* SOCK_SEQPACKET: we do not support segmented records */
+ if (sk->sk_type == SOCK_SEQPACKET && !(msg->msg_flags & MSG_EOR))
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ err = -EPIPE;
+ goto out;
+ }
+
+ /* Return if the socket is not in connected state */
+ if (sk->sk_state != IUCV_CONNECTED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ /* initialize defaults */
+ cmsg_done = 0; /* check for duplicate headers */
+ txmsg.class = 0;
+
+ /* iterate over control messages */
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (cmsg->cmsg_level != SOL_IUCV)
+ continue;
+
+ if (cmsg->cmsg_type & cmsg_done) {
+ err = -EINVAL;
+ goto out;
+ }
+ cmsg_done |= cmsg->cmsg_type;
+
+ switch (cmsg->cmsg_type) {
+ case SCM_IUCV_TRGCLS:
+ if (cmsg->cmsg_len != CMSG_LEN(TRGCLS_SIZE)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* set iucv message target class */
+ memcpy(&txmsg.class,
+ (void *) CMSG_DATA(cmsg), TRGCLS_SIZE);
+
+ break;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* allocate one skb for each iucv message:
+ * this is fine for SOCK_SEQPACKET (unless we want to support
+ * segmented records using the MSG_EOR flag), but
+ * for SOCK_STREAM we might want to improve it in future */
+ if (iucv->transport == AF_IUCV_TRANS_HIPER)
+ skb = sock_alloc_send_skb(sk,
+ len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN,
+ noblock, &err);
+ else
+ skb = sock_alloc_send_skb(sk, len, noblock, &err);
+ if (!skb)
+ goto out;
+ if (iucv->transport == AF_IUCV_TRANS_HIPER)
+ skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN);
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ err = -EFAULT;
+ goto fail;
+ }
+
+ /* wait if outstanding messages for iucv path has reached */
+ timeo = sock_sndtimeo(sk, noblock);
+ err = iucv_sock_wait(sk, iucv_below_msglim(sk), timeo);
+ if (err)
+ goto fail;
+
+ /* return -ECONNRESET if the socket is no longer connected */
+ if (sk->sk_state != IUCV_CONNECTED) {
+ err = -ECONNRESET;
+ goto fail;
+ }
+
+ /* increment and save iucv message tag for msg_completion cbk */
+ txmsg.tag = iucv->send_tag++;
+ IUCV_SKB_CB(skb)->tag = txmsg.tag;
+
+ if (iucv->transport == AF_IUCV_TRANS_HIPER) {
+ atomic_inc(&iucv->msg_sent);
+ err = afiucv_hs_send(&txmsg, sk, skb, 0);
+ if (err) {
+ atomic_dec(&iucv->msg_sent);
+ goto fail;
+ }
+ goto release;
+ }
+ skb_queue_tail(&iucv->send_skb_q, skb);
+
+ if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags)
+ && skb->len <= 7) {
+ err = iucv_send_iprm(iucv->path, &txmsg, skb);
+
+ /* on success: there is no message_complete callback
+ * for an IPRMDATA msg; remove skb from send queue */
+ if (err == 0) {
+ skb_unlink(skb, &iucv->send_skb_q);
+ kfree_skb(skb);
+ }
+
+ /* this error should never happen since the
+ * IUCV_IPRMDATA path flag is set... sever path */
+ if (err == 0x15) {
+ pr_iucv->path_sever(iucv->path, NULL);
+ skb_unlink(skb, &iucv->send_skb_q);
+ err = -EPIPE;
+ goto fail;
+ }
+ } else
+ err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0,
+ (void *) skb->data, skb->len);
+ if (err) {
+ if (err == 3) {
+ user_id[8] = 0;
+ memcpy(user_id, iucv->dst_user_id, 8);
+ appl_id[8] = 0;
+ memcpy(appl_id, iucv->dst_name, 8);
+ pr_err("Application %s on z/VM guest %s"
+ " exceeds message limit\n",
+ appl_id, user_id);
+ err = -EAGAIN;
+ } else
+ err = -EPIPE;
+ skb_unlink(skb, &iucv->send_skb_q);
+ goto fail;
+ }
+
+release:
+ release_sock(sk);
+ return len;
+
+fail:
+ kfree_skb(skb);
+out:
+ release_sock(sk);
+ return err;
+}
+
+/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len)
+{
+ int dataleft, size, copied = 0;
+ struct sk_buff *nskb;
+
+ dataleft = len;
+ while (dataleft) {
+ if (dataleft >= sk->sk_rcvbuf / 4)
+ size = sk->sk_rcvbuf / 4;
+ else
+ size = dataleft;
+
+ nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA);
+ if (!nskb)
+ return -ENOMEM;
+
+ /* copy target class to control buffer of new skb */
+ IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class;
+
+ /* copy data fragment */
+ memcpy(nskb->data, skb->data + copied, size);
+ copied += size;
+ dataleft -= size;
+
+ skb_reset_transport_header(nskb);
+ skb_reset_network_header(nskb);
+ nskb->len = size;
+
+ skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb);
+ }
+
+ return 0;
+}
+
+/* iucv_process_message() - Receive a single outstanding IUCV message
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
+ struct iucv_path *path,
+ struct iucv_message *msg)
+{
+ int rc;
+ unsigned int len;
+
+ len = iucv_msg_length(msg);
+
+ /* store msg target class in the second 4 bytes of skb ctrl buffer */
+ /* Note: the first 4 bytes are reserved for msg tag */
+ IUCV_SKB_CB(skb)->class = msg->class;
+
+ /* check for special IPRM messages (e.g. iucv_sock_shutdown) */
+ if ((msg->flags & IUCV_IPRMDATA) && len > 7) {
+ if (memcmp(msg->rmmsg, iprm_shutdown, 8) == 0) {
+ skb->data = NULL;
+ skb->len = 0;
+ }
+ } else {
+ rc = pr_iucv->message_receive(path, msg,
+ msg->flags & IUCV_IPRMDATA,
+ skb->data, len, NULL);
+ if (rc) {
+ kfree_skb(skb);
+ return;
+ }
+ /* we need to fragment iucv messages for SOCK_STREAM only;
+ * for SOCK_SEQPACKET, it is only relevant if we support
+ * record segmentation using MSG_EOR (see also recvmsg()) */
+ if (sk->sk_type == SOCK_STREAM &&
+ skb->truesize >= sk->sk_rcvbuf / 4) {
+ rc = iucv_fragment_skb(sk, skb, len);
+ kfree_skb(skb);
+ skb = NULL;
+ if (rc) {
+ pr_iucv->path_sever(path, NULL);
+ return;
+ }
+ skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q);
+ } else {
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
+ skb->len = len;
+ }
+ }
+
+ IUCV_SKB_CB(skb)->offset = 0;
+ if (sock_queue_rcv_skb(sk, skb))
+ skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb);
+}
+
+/* iucv_process_message_q() - Process outstanding IUCV messages
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static void iucv_process_message_q(struct sock *sk)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct sk_buff *skb;
+ struct sock_msg_q *p, *n;
+
+ list_for_each_entry_safe(p, n, &iucv->message_q.list, list) {
+ skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA);
+ if (!skb)
+ break;
+ iucv_process_message(sk, skb, p->path, &p->msg);
+ list_del(&p->list);
+ kfree(p);
+ if (!skb_queue_empty(&iucv->backlog_skb_q))
+ break;
+ }
+}
+
+static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ int noblock = flags & MSG_DONTWAIT;
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ unsigned int copied, rlen;
+ struct sk_buff *skb, *rskb, *cskb;
+ int err = 0;
+ u32 offset;
+
+ if ((sk->sk_state == IUCV_DISCONN) &&
+ skb_queue_empty(&iucv->backlog_skb_q) &&
+ skb_queue_empty(&sk->sk_receive_queue) &&
+ list_empty(&iucv->message_q.list))
+ return 0;
+
+ if (flags & (MSG_OOB))
+ return -EOPNOTSUPP;
+
+ /* receive/dequeue next skb:
+ * the function understands MSG_PEEK and, thus, does not dequeue skb */
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb) {
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+ return err;
+ }
+
+ offset = IUCV_SKB_CB(skb)->offset;
+ rlen = skb->len - offset; /* real length of skb */
+ copied = min_t(unsigned int, rlen, len);
+ if (!rlen)
+ sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
+
+ cskb = skb;
+ if (skb_copy_datagram_msg(cskb, offset, msg, copied)) {
+ if (!(flags & MSG_PEEK))
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ return -EFAULT;
+ }
+
+ /* SOCK_SEQPACKET: set MSG_TRUNC if recv buf size is too small */
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ if (copied < rlen)
+ msg->msg_flags |= MSG_TRUNC;
+ /* each iucv message contains a complete record */
+ msg->msg_flags |= MSG_EOR;
+ }
+
+ /* create control message to store iucv msg target class:
+ * get the trgcls from the control buffer of the skb due to
+ * fragmentation of original iucv message. */
+ err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS,
+ sizeof(IUCV_SKB_CB(skb)->class),
+ (void *)&IUCV_SKB_CB(skb)->class);
+ if (err) {
+ if (!(flags & MSG_PEEK))
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ return err;
+ }
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+
+ /* SOCK_STREAM: re-queue skb if it contains unreceived data */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (copied < rlen) {
+ IUCV_SKB_CB(skb)->offset = offset + copied;
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ goto done;
+ }
+ }
+
+ kfree_skb(skb);
+ if (iucv->transport == AF_IUCV_TRANS_HIPER) {
+ atomic_inc(&iucv->msg_recv);
+ if (atomic_read(&iucv->msg_recv) > iucv->msglimit) {
+ WARN_ON(1);
+ iucv_sock_close(sk);
+ return -EFAULT;
+ }
+ }
+
+ /* Queue backlog skbs */
+ spin_lock_bh(&iucv->message_q.lock);
+ rskb = skb_dequeue(&iucv->backlog_skb_q);
+ while (rskb) {
+ IUCV_SKB_CB(rskb)->offset = 0;
+ if (sock_queue_rcv_skb(sk, rskb)) {
+ skb_queue_head(&iucv->backlog_skb_q,
+ rskb);
+ break;
+ } else {
+ rskb = skb_dequeue(&iucv->backlog_skb_q);
+ }
+ }
+ if (skb_queue_empty(&iucv->backlog_skb_q)) {
+ if (!list_empty(&iucv->message_q.list))
+ iucv_process_message_q(sk);
+ if (atomic_read(&iucv->msg_recv) >=
+ iucv->msglimit / 2) {
+ err = iucv_send_ctrl(sk, AF_IUCV_FLAG_WIN);
+ if (err) {
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ }
+ }
+ }
+ spin_unlock_bh(&iucv->message_q.lock);
+ }
+
+done:
+ /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */
+ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC))
+ copied = rlen;
+
+ return copied;
+}
+
+static inline unsigned int iucv_accept_poll(struct sock *parent)
+{
+ struct iucv_sock *isk, *n;
+ struct sock *sk;
+
+ list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *) isk;
+
+ if (sk->sk_state == IUCV_CONNECTED)
+ return POLLIN | POLLRDNORM;
+ }
+
+ return 0;
+}
+
+unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+
+ if (sk->sk_state == IUCV_LISTEN)
+ return iucv_accept_poll(sk);
+
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sk->sk_state == IUCV_CLOSED)
+ mask |= POLLHUP;
+
+ if (sk->sk_state == IUCV_DISCONN)
+ mask |= POLLIN;
+
+ if (sock_writeable(sk) && iucv_below_msglim(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ return mask;
+}
+
+static int iucv_sock_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct iucv_message txmsg;
+ int err = 0;
+
+ how++;
+
+ if ((how & ~SHUTDOWN_MASK) || !how)
+ return -EINVAL;
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+ case IUCV_LISTEN:
+ case IUCV_DISCONN:
+ case IUCV_CLOSING:
+ case IUCV_CLOSED:
+ err = -ENOTCONN;
+ goto fail;
+ default:
+ break;
+ }
+
+ if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) {
+ if (iucv->transport == AF_IUCV_TRANS_IUCV) {
+ txmsg.class = 0;
+ txmsg.tag = 0;
+ err = pr_iucv->message_send(iucv->path, &txmsg,
+ IUCV_IPRMDATA, 0, (void *) iprm_shutdown, 8);
+ if (err) {
+ switch (err) {
+ case 1:
+ err = -ENOTCONN;
+ break;
+ case 2:
+ err = -ECONNRESET;
+ break;
+ default:
+ err = -ENOTCONN;
+ break;
+ }
+ }
+ } else
+ iucv_send_ctrl(sk, AF_IUCV_FLAG_SHT);
+ }
+
+ sk->sk_shutdown |= how;
+ if (how == RCV_SHUTDOWN || how == SHUTDOWN_MASK) {
+ if ((iucv->transport == AF_IUCV_TRANS_IUCV) &&
+ iucv->path) {
+ err = pr_iucv->path_quiesce(iucv->path, NULL);
+ if (err)
+ err = -ENOTCONN;
+/* skb_queue_purge(&sk->sk_receive_queue); */
+ }
+ skb_queue_purge(&sk->sk_receive_queue);
+ }
+
+ /* Wake up anyone sleeping in poll */
+ sk->sk_state_change(sk);
+
+fail:
+ release_sock(sk);
+ return err;
+}
+
+static int iucv_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ if (!sk)
+ return 0;
+
+ iucv_sock_close(sk);
+
+ sock_orphan(sk);
+ iucv_sock_kill(sk);
+ return err;
+}
+
+/* getsockopt and setsockopt */
+static int iucv_sock_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ int val;
+ int rc;
+
+ if (level != SOL_IUCV)
+ return -ENOPROTOOPT;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+
+ rc = 0;
+
+ lock_sock(sk);
+ switch (optname) {
+ case SO_IPRMDATA_MSG:
+ if (val)
+ iucv->flags |= IUCV_IPRMDATA;
+ else
+ iucv->flags &= ~IUCV_IPRMDATA;
+ break;
+ case SO_MSGLIMIT:
+ switch (sk->sk_state) {
+ case IUCV_OPEN:
+ case IUCV_BOUND:
+ if (val < 1 || val > (u16)(~0))
+ rc = -EINVAL;
+ else
+ iucv->msglimit = val;
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+ break;
+ default:
+ rc = -ENOPROTOOPT;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
+static int iucv_sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ unsigned int val;
+ int len;
+
+ if (level != SOL_IUCV)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < 0)
+ return -EINVAL;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ switch (optname) {
+ case SO_IPRMDATA_MSG:
+ val = (iucv->flags & IUCV_IPRMDATA) ? 1 : 0;
+ break;
+ case SO_MSGLIMIT:
+ lock_sock(sk);
+ val = (iucv->path != NULL) ? iucv->path->msglim /* connected */
+ : iucv->msglimit; /* default */
+ release_sock(sk);
+ break;
+ case SO_MSGSIZE:
+ if (sk->sk_state == IUCV_OPEN)
+ return -EBADFD;
+ val = (iucv->hs_dev) ? iucv->hs_dev->mtu -
+ sizeof(struct af_iucv_trans_hdr) - ETH_HLEN :
+ 0x7fffffff;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
+/* Callback wrappers - called from iucv base support */
+static int iucv_callback_connreq(struct iucv_path *path,
+ u8 ipvmid[8], u8 ipuser[16])
+{
+ unsigned char user_data[16];
+ unsigned char nuser_data[16];
+ unsigned char src_name[8];
+ struct sock *sk, *nsk;
+ struct iucv_sock *iucv, *niucv;
+ int err;
+
+ memcpy(src_name, ipuser, 8);
+ EBCASC(src_name, 8);
+ /* Find out if this path belongs to af_iucv. */
+ read_lock(&iucv_sk_list.lock);
+ iucv = NULL;
+ sk = NULL;
+ sk_for_each(sk, &iucv_sk_list.head)
+ if (sk->sk_state == IUCV_LISTEN &&
+ !memcmp(&iucv_sk(sk)->src_name, src_name, 8)) {
+ /*
+ * Found a listening socket with
+ * src_name == ipuser[0-7].
+ */
+ iucv = iucv_sk(sk);
+ break;
+ }
+ read_unlock(&iucv_sk_list.lock);
+ if (!iucv)
+ /* No socket found, not one of our paths. */
+ return -EINVAL;
+
+ bh_lock_sock(sk);
+
+ /* Check if parent socket is listening */
+ low_nmcpy(user_data, iucv->src_name);
+ high_nmcpy(user_data, iucv->dst_name);
+ ASCEBC(user_data, sizeof(user_data));
+ if (sk->sk_state != IUCV_LISTEN) {
+ err = pr_iucv->path_sever(path, user_data);
+ iucv_path_free(path);
+ goto fail;
+ }
+
+ /* Check for backlog size */
+ if (sk_acceptq_is_full(sk)) {
+ err = pr_iucv->path_sever(path, user_data);
+ iucv_path_free(path);
+ goto fail;
+ }
+
+ /* Create the new socket */
+ nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+ if (!nsk) {
+ err = pr_iucv->path_sever(path, user_data);
+ iucv_path_free(path);
+ goto fail;
+ }
+
+ niucv = iucv_sk(nsk);
+ iucv_sock_init(nsk, sk);
+
+ /* Set the new iucv_sock */
+ memcpy(niucv->dst_name, ipuser + 8, 8);
+ EBCASC(niucv->dst_name, 8);
+ memcpy(niucv->dst_user_id, ipvmid, 8);
+ memcpy(niucv->src_name, iucv->src_name, 8);
+ memcpy(niucv->src_user_id, iucv->src_user_id, 8);
+ niucv->path = path;
+
+ /* Call iucv_accept */
+ high_nmcpy(nuser_data, ipuser + 8);
+ memcpy(nuser_data + 8, niucv->src_name, 8);
+ ASCEBC(nuser_data + 8, 8);
+
+ /* set message limit for path based on msglimit of accepting socket */
+ niucv->msglimit = iucv->msglimit;
+ path->msglim = iucv->msglimit;
+ err = pr_iucv->path_accept(path, &af_iucv_handler, nuser_data, nsk);
+ if (err) {
+ iucv_sever_path(nsk, 1);
+ iucv_sock_kill(nsk);
+ goto fail;
+ }
+
+ iucv_accept_enqueue(sk, nsk);
+
+ /* Wake up accept */
+ nsk->sk_state = IUCV_CONNECTED;
+ sk->sk_data_ready(sk);
+ err = 0;
+fail:
+ bh_unlock_sock(sk);
+ return 0;
+}
+
+static void iucv_callback_connack(struct iucv_path *path, u8 ipuser[16])
+{
+ struct sock *sk = path->private;
+
+ sk->sk_state = IUCV_CONNECTED;
+ sk->sk_state_change(sk);
+}
+
+static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg)
+{
+ struct sock *sk = path->private;
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct sk_buff *skb;
+ struct sock_msg_q *save_msg;
+ int len;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ pr_iucv->message_reject(path, msg);
+ return;
+ }
+
+ spin_lock(&iucv->message_q.lock);
+
+ if (!list_empty(&iucv->message_q.list) ||
+ !skb_queue_empty(&iucv->backlog_skb_q))
+ goto save_message;
+
+ len = atomic_read(&sk->sk_rmem_alloc);
+ len += SKB_TRUESIZE(iucv_msg_length(msg));
+ if (len > sk->sk_rcvbuf)
+ goto save_message;
+
+ skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA);
+ if (!skb)
+ goto save_message;
+
+ iucv_process_message(sk, skb, path, msg);
+ goto out_unlock;
+
+save_message:
+ save_msg = kzalloc(sizeof(struct sock_msg_q), GFP_ATOMIC | GFP_DMA);
+ if (!save_msg)
+ goto out_unlock;
+ save_msg->path = path;
+ save_msg->msg = *msg;
+
+ list_add_tail(&save_msg->list, &iucv->message_q.list);
+
+out_unlock:
+ spin_unlock(&iucv->message_q.lock);
+}
+
+static void iucv_callback_txdone(struct iucv_path *path,
+ struct iucv_message *msg)
+{
+ struct sock *sk = path->private;
+ struct sk_buff *this = NULL;
+ struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q;
+ struct sk_buff *list_skb = list->next;
+ unsigned long flags;
+
+ bh_lock_sock(sk);
+ if (!skb_queue_empty(list)) {
+ spin_lock_irqsave(&list->lock, flags);
+
+ while (list_skb != (struct sk_buff *)list) {
+ if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
+ this = list_skb;
+ break;
+ }
+ list_skb = list_skb->next;
+ }
+ if (this)
+ __skb_unlink(this, list);
+
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (this) {
+ kfree_skb(this);
+ /* wake up any process waiting for sending */
+ iucv_sock_wake_msglim(sk);
+ }
+ }
+
+ if (sk->sk_state == IUCV_CLOSING) {
+ if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) {
+ sk->sk_state = IUCV_CLOSED;
+ sk->sk_state_change(sk);
+ }
+ }
+ bh_unlock_sock(sk);
+
+}
+
+static void iucv_callback_connrej(struct iucv_path *path, u8 ipuser[16])
+{
+ struct sock *sk = path->private;
+
+ if (sk->sk_state == IUCV_CLOSED)
+ return;
+
+ bh_lock_sock(sk);
+ iucv_sever_path(sk, 1);
+ sk->sk_state = IUCV_DISCONN;
+
+ sk->sk_state_change(sk);
+ bh_unlock_sock(sk);
+}
+
+/* called if the other communication side shuts down its RECV direction;
+ * in turn, the callback sets SEND_SHUTDOWN to disable sending of data.
+ */
+static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
+{
+ struct sock *sk = path->private;
+
+ bh_lock_sock(sk);
+ if (sk->sk_state != IUCV_CLOSED) {
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ }
+ bh_unlock_sock(sk);
+}
+
+/***************** HiperSockets transport callbacks ********************/
+static void afiucv_swap_src_dest(struct sk_buff *skb)
+{
+ struct af_iucv_trans_hdr *trans_hdr =
+ (struct af_iucv_trans_hdr *)skb->data;
+ char tmpID[8];
+ char tmpName[8];
+
+ ASCEBC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
+ ASCEBC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
+ ASCEBC(trans_hdr->srcUserID, sizeof(trans_hdr->srcUserID));
+ ASCEBC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
+ memcpy(tmpID, trans_hdr->srcUserID, 8);
+ memcpy(tmpName, trans_hdr->srcAppName, 8);
+ memcpy(trans_hdr->srcUserID, trans_hdr->destUserID, 8);
+ memcpy(trans_hdr->srcAppName, trans_hdr->destAppName, 8);
+ memcpy(trans_hdr->destUserID, tmpID, 8);
+ memcpy(trans_hdr->destAppName, tmpName, 8);
+ skb_push(skb, ETH_HLEN);
+ memset(skb->data, 0, ETH_HLEN);
+}
+
+/**
+ * afiucv_hs_callback_syn - react on received SYN
+ **/
+static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
+{
+ struct sock *nsk;
+ struct iucv_sock *iucv, *niucv;
+ struct af_iucv_trans_hdr *trans_hdr;
+ int err;
+
+ iucv = iucv_sk(sk);
+ trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
+ if (!iucv) {
+ /* no sock - connection refused */
+ afiucv_swap_src_dest(skb);
+ trans_hdr->flags = AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_FIN;
+ err = dev_queue_xmit(skb);
+ goto out;
+ }
+
+ nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+ bh_lock_sock(sk);
+ if ((sk->sk_state != IUCV_LISTEN) ||
+ sk_acceptq_is_full(sk) ||
+ !nsk) {
+ /* error on server socket - connection refused */
+ afiucv_swap_src_dest(skb);
+ trans_hdr->flags = AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_FIN;
+ err = dev_queue_xmit(skb);
+ iucv_sock_kill(nsk);
+ bh_unlock_sock(sk);
+ goto out;
+ }
+
+ niucv = iucv_sk(nsk);
+ iucv_sock_init(nsk, sk);
+ niucv->transport = AF_IUCV_TRANS_HIPER;
+ niucv->msglimit = iucv->msglimit;
+ if (!trans_hdr->window)
+ niucv->msglimit_peer = IUCV_HIPER_MSGLIM_DEFAULT;
+ else
+ niucv->msglimit_peer = trans_hdr->window;
+ memcpy(niucv->dst_name, trans_hdr->srcAppName, 8);
+ memcpy(niucv->dst_user_id, trans_hdr->srcUserID, 8);
+ memcpy(niucv->src_name, iucv->src_name, 8);
+ memcpy(niucv->src_user_id, iucv->src_user_id, 8);
+ nsk->sk_bound_dev_if = sk->sk_bound_dev_if;
+ niucv->hs_dev = iucv->hs_dev;
+ dev_hold(niucv->hs_dev);
+ afiucv_swap_src_dest(skb);
+ trans_hdr->flags = AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_ACK;
+ trans_hdr->window = niucv->msglimit;
+ /* if receiver acks the xmit connection is established */
+ err = dev_queue_xmit(skb);
+ if (!err) {
+ iucv_accept_enqueue(sk, nsk);
+ nsk->sk_state = IUCV_CONNECTED;
+ sk->sk_data_ready(sk);
+ } else
+ iucv_sock_kill(nsk);
+ bh_unlock_sock(sk);
+
+out:
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_callback_synack() - react on received SYN-ACK
+ **/
+static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct af_iucv_trans_hdr *trans_hdr =
+ (struct af_iucv_trans_hdr *)skb->data;
+
+ if (!iucv)
+ goto out;
+ if (sk->sk_state != IUCV_BOUND)
+ goto out;
+ bh_lock_sock(sk);
+ iucv->msglimit_peer = trans_hdr->window;
+ sk->sk_state = IUCV_CONNECTED;
+ sk->sk_state_change(sk);
+ bh_unlock_sock(sk);
+out:
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_callback_synfin() - react on received SYN_FIN
+ **/
+static int afiucv_hs_callback_synfin(struct sock *sk, struct sk_buff *skb)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+
+ if (!iucv)
+ goto out;
+ if (sk->sk_state != IUCV_BOUND)
+ goto out;
+ bh_lock_sock(sk);
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ bh_unlock_sock(sk);
+out:
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_callback_fin() - react on received FIN
+ **/
+static int afiucv_hs_callback_fin(struct sock *sk, struct sk_buff *skb)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+
+ /* other end of connection closed */
+ if (!iucv)
+ goto out;
+ bh_lock_sock(sk);
+ if (sk->sk_state == IUCV_CONNECTED) {
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ }
+ bh_unlock_sock(sk);
+out:
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_callback_win() - react on received WIN
+ **/
+static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+ struct af_iucv_trans_hdr *trans_hdr =
+ (struct af_iucv_trans_hdr *)skb->data;
+
+ if (!iucv)
+ return NET_RX_SUCCESS;
+
+ if (sk->sk_state != IUCV_CONNECTED)
+ return NET_RX_SUCCESS;
+
+ atomic_sub(trans_hdr->window, &iucv->msg_sent);
+ iucv_sock_wake_msglim(sk);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_callback_rx() - react on received data
+ **/
+static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
+{
+ struct iucv_sock *iucv = iucv_sk(sk);
+
+ if (!iucv) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ if (sk->sk_state != IUCV_CONNECTED) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
+ /* write stuff from iucv_msg to skb cb */
+ if (skb->len < sizeof(struct af_iucv_trans_hdr)) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+ skb_pull(skb, sizeof(struct af_iucv_trans_hdr));
+ skb_reset_transport_header(skb);
+ skb_reset_network_header(skb);
+ IUCV_SKB_CB(skb)->offset = 0;
+ spin_lock(&iucv->message_q.lock);
+ if (skb_queue_empty(&iucv->backlog_skb_q)) {
+ if (sock_queue_rcv_skb(sk, skb)) {
+ /* handle rcv queue full */
+ skb_queue_tail(&iucv->backlog_skb_q, skb);
+ }
+ } else
+ skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, skb);
+ spin_unlock(&iucv->message_q.lock);
+ return NET_RX_SUCCESS;
+}
+
+/**
+ * afiucv_hs_rcv() - base function for arriving data through HiperSockets
+ * transport
+ * called from netif RX softirq
+ **/
+static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct sock *sk;
+ struct iucv_sock *iucv;
+ struct af_iucv_trans_hdr *trans_hdr;
+ char nullstring[8];
+ int err = 0;
+
+ skb_pull(skb, ETH_HLEN);
+ trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
+ EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
+ EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
+ EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
+ EBCASC(trans_hdr->srcUserID, sizeof(trans_hdr->srcUserID));
+ memset(nullstring, 0, sizeof(nullstring));
+ iucv = NULL;
+ sk = NULL;
+ read_lock(&iucv_sk_list.lock);
+ sk_for_each(sk, &iucv_sk_list.head) {
+ if (trans_hdr->flags == AF_IUCV_FLAG_SYN) {
+ if ((!memcmp(&iucv_sk(sk)->src_name,
+ trans_hdr->destAppName, 8)) &&
+ (!memcmp(&iucv_sk(sk)->src_user_id,
+ trans_hdr->destUserID, 8)) &&
+ (!memcmp(&iucv_sk(sk)->dst_name, nullstring, 8)) &&
+ (!memcmp(&iucv_sk(sk)->dst_user_id,
+ nullstring, 8))) {
+ iucv = iucv_sk(sk);
+ break;
+ }
+ } else {
+ if ((!memcmp(&iucv_sk(sk)->src_name,
+ trans_hdr->destAppName, 8)) &&
+ (!memcmp(&iucv_sk(sk)->src_user_id,
+ trans_hdr->destUserID, 8)) &&
+ (!memcmp(&iucv_sk(sk)->dst_name,
+ trans_hdr->srcAppName, 8)) &&
+ (!memcmp(&iucv_sk(sk)->dst_user_id,
+ trans_hdr->srcUserID, 8))) {
+ iucv = iucv_sk(sk);
+ break;
+ }
+ }
+ }
+ read_unlock(&iucv_sk_list.lock);
+ if (!iucv)
+ sk = NULL;
+
+ /* no sock
+ how should we send with no sock
+ 1) send without sock no send rc checking?
+ 2) introduce default sock to handle this cases
+
+ SYN -> send SYN|ACK in good case, send SYN|FIN in bad case
+ data -> send FIN
+ SYN|ACK, SYN|FIN, FIN -> no action? */
+
+ switch (trans_hdr->flags) {
+ case AF_IUCV_FLAG_SYN:
+ /* connect request */
+ err = afiucv_hs_callback_syn(sk, skb);
+ break;
+ case (AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_ACK):
+ /* connect request confirmed */
+ err = afiucv_hs_callback_synack(sk, skb);
+ break;
+ case (AF_IUCV_FLAG_SYN | AF_IUCV_FLAG_FIN):
+ /* connect request refused */
+ err = afiucv_hs_callback_synfin(sk, skb);
+ break;
+ case (AF_IUCV_FLAG_FIN):
+ /* close request */
+ err = afiucv_hs_callback_fin(sk, skb);
+ break;
+ case (AF_IUCV_FLAG_WIN):
+ err = afiucv_hs_callback_win(sk, skb);
+ if (skb->len == sizeof(struct af_iucv_trans_hdr)) {
+ kfree_skb(skb);
+ break;
+ }
+ /* fall through and receive non-zero length data */
+ case (AF_IUCV_FLAG_SHT):
+ /* shutdown request */
+ /* fall through and receive zero length data */
+ case 0:
+ /* plain data frame */
+ IUCV_SKB_CB(skb)->class = trans_hdr->iucv_hdr.class;
+ err = afiucv_hs_callback_rx(sk, skb);
+ break;
+ default:
+ ;
+ }
+
+ return err;
+}
+
+/**
+ * afiucv_hs_callback_txnotify() - handle send notifcations from HiperSockets
+ * transport
+ **/
+static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
+ enum iucv_tx_notify n)
+{
+ struct sock *isk = skb->sk;
+ struct sock *sk = NULL;
+ struct iucv_sock *iucv = NULL;
+ struct sk_buff_head *list;
+ struct sk_buff *list_skb;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ read_lock_irqsave(&iucv_sk_list.lock, flags);
+ sk_for_each(sk, &iucv_sk_list.head)
+ if (sk == isk) {
+ iucv = iucv_sk(sk);
+ break;
+ }
+ read_unlock_irqrestore(&iucv_sk_list.lock, flags);
+
+ if (!iucv || sock_flag(sk, SOCK_ZAPPED))
+ return;
+
+ list = &iucv->send_skb_q;
+ spin_lock_irqsave(&list->lock, flags);
+ if (skb_queue_empty(list))
+ goto out_unlock;
+ list_skb = list->next;
+ nskb = list_skb->next;
+ while (list_skb != (struct sk_buff *)list) {
+ if (skb_shinfo(list_skb) == skb_shinfo(skb)) {
+ switch (n) {
+ case TX_NOTIFY_OK:
+ __skb_unlink(list_skb, list);
+ kfree_skb(list_skb);
+ iucv_sock_wake_msglim(sk);
+ break;
+ case TX_NOTIFY_PENDING:
+ atomic_inc(&iucv->pendings);
+ break;
+ case TX_NOTIFY_DELAYED_OK:
+ __skb_unlink(list_skb, list);
+ atomic_dec(&iucv->pendings);
+ if (atomic_read(&iucv->pendings) <= 0)
+ iucv_sock_wake_msglim(sk);
+ kfree_skb(list_skb);
+ break;
+ case TX_NOTIFY_UNREACHABLE:
+ case TX_NOTIFY_DELAYED_UNREACHABLE:
+ case TX_NOTIFY_TPQFULL: /* not yet used */
+ case TX_NOTIFY_GENERALERROR:
+ case TX_NOTIFY_DELAYED_GENERALERROR:
+ __skb_unlink(list_skb, list);
+ kfree_skb(list_skb);
+ if (sk->sk_state == IUCV_CONNECTED) {
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ }
+ break;
+ }
+ break;
+ }
+ list_skb = nskb;
+ nskb = nskb->next;
+ }
+out_unlock:
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ if (sk->sk_state == IUCV_CLOSING) {
+ if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) {
+ sk->sk_state = IUCV_CLOSED;
+ sk->sk_state_change(sk);
+ }
+ }
+
+}
+
+/*
+ * afiucv_netdev_event: handle netdev notifier chain events
+ */
+static int afiucv_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct sock *sk;
+ struct iucv_sock *iucv;
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_GOING_DOWN:
+ sk_for_each(sk, &iucv_sk_list.head) {
+ iucv = iucv_sk(sk);
+ if ((iucv->hs_dev == event_dev) &&
+ (sk->sk_state == IUCV_CONNECTED)) {
+ if (event == NETDEV_GOING_DOWN)
+ iucv_send_ctrl(sk, AF_IUCV_FLAG_FIN);
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
+ }
+ }
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block afiucv_netdev_notifier = {
+ .notifier_call = afiucv_netdev_event,
+};
+
+static const struct proto_ops iucv_sock_ops = {
+ .family = PF_IUCV,
+ .owner = THIS_MODULE,
+ .release = iucv_sock_release,
+ .bind = iucv_sock_bind,
+ .connect = iucv_sock_connect,
+ .listen = iucv_sock_listen,
+ .accept = iucv_sock_accept,
+ .getname = iucv_sock_getname,
+ .sendmsg = iucv_sock_sendmsg,
+ .recvmsg = iucv_sock_recvmsg,
+ .poll = iucv_sock_poll,
+ .ioctl = sock_no_ioctl,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = iucv_sock_shutdown,
+ .setsockopt = iucv_sock_setsockopt,
+ .getsockopt = iucv_sock_getsockopt,
+};
+
+static const struct net_proto_family iucv_sock_family_ops = {
+ .family = AF_IUCV,
+ .owner = THIS_MODULE,
+ .create = iucv_sock_create,
+};
+
+static struct packet_type iucv_packet_type = {
+ .type = cpu_to_be16(ETH_P_AF_IUCV),
+ .func = afiucv_hs_rcv,
+};
+
+static int afiucv_iucv_init(void)
+{
+ int err;
+
+ err = pr_iucv->iucv_register(&af_iucv_handler, 0);
+ if (err)
+ goto out;
+ /* establish dummy device */
+ af_iucv_driver.bus = pr_iucv->bus;
+ err = driver_register(&af_iucv_driver);
+ if (err)
+ goto out_iucv;
+ af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!af_iucv_dev) {
+ err = -ENOMEM;
+ goto out_driver;
+ }
+ dev_set_name(af_iucv_dev, "af_iucv");
+ af_iucv_dev->bus = pr_iucv->bus;
+ af_iucv_dev->parent = pr_iucv->root;
+ af_iucv_dev->release = (void (*)(struct device *))kfree;
+ af_iucv_dev->driver = &af_iucv_driver;
+ err = device_register(af_iucv_dev);
+ if (err)
+ goto out_driver;
+ return 0;
+
+out_driver:
+ driver_unregister(&af_iucv_driver);
+out_iucv:
+ pr_iucv->iucv_unregister(&af_iucv_handler, 0);
+out:
+ return err;
+}
+
+static int __init afiucv_init(void)
+{
+ int err;
+
+ if (MACHINE_IS_VM) {
+ cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err);
+ if (unlikely(err)) {
+ WARN_ON(err);
+ err = -EPROTONOSUPPORT;
+ goto out;
+ }
+
+ pr_iucv = try_then_request_module(symbol_get(iucv_if), "iucv");
+ if (!pr_iucv) {
+ printk(KERN_WARNING "iucv_if lookup failed\n");
+ memset(&iucv_userid, 0, sizeof(iucv_userid));
+ }
+ } else {
+ memset(&iucv_userid, 0, sizeof(iucv_userid));
+ pr_iucv = NULL;
+ }
+
+ err = proto_register(&iucv_proto, 0);
+ if (err)
+ goto out;
+ err = sock_register(&iucv_sock_family_ops);
+ if (err)
+ goto out_proto;
+
+ if (pr_iucv) {
+ err = afiucv_iucv_init();
+ if (err)
+ goto out_sock;
+ } else
+ register_netdevice_notifier(&afiucv_netdev_notifier);
+ dev_add_pack(&iucv_packet_type);
+ return 0;
+
+out_sock:
+ sock_unregister(PF_IUCV);
+out_proto:
+ proto_unregister(&iucv_proto);
+out:
+ if (pr_iucv)
+ symbol_put(iucv_if);
+ return err;
+}
+
+static void __exit afiucv_exit(void)
+{
+ if (pr_iucv) {
+ device_unregister(af_iucv_dev);
+ driver_unregister(&af_iucv_driver);
+ pr_iucv->iucv_unregister(&af_iucv_handler, 0);
+ symbol_put(iucv_if);
+ } else
+ unregister_netdevice_notifier(&afiucv_netdev_notifier);
+ dev_remove_pack(&iucv_packet_type);
+ sock_unregister(PF_IUCV);
+ proto_unregister(&iucv_proto);
+}
+
+module_init(afiucv_init);
+module_exit(afiucv_exit);
+
+MODULE_AUTHOR("Jennifer Hunt <jenhunt@us.ibm.com>");
+MODULE_DESCRIPTION("IUCV Sockets ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IUCV);
+
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
new file mode 100644
index 000000000..2a6a1fdd6
--- /dev/null
+++ b/net/iucv/iucv.c
@@ -0,0 +1,2119 @@
+/*
+ * IUCV base infrastructure.
+ *
+ * Copyright IBM Corp. 2001, 2009
+ *
+ * Author(s):
+ * Original source:
+ * Alan Altmark (Alan_Altmark@us.ibm.com) Sept. 2000
+ * Xenia Tkatschow (xenia@us.ibm.com)
+ * 2Gb awareness and general cleanup:
+ * Fritz Elfert (elfert@de.ibm.com, felfert@millenux.com)
+ * Rewritten for af_iucv:
+ * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * PM functions:
+ * Ursula Braun (ursula.braun@de.ibm.com)
+ *
+ * Documentation used:
+ * The original source
+ * CP Programming Service, IBM document # SC24-5760
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define KMSG_COMPONENT "iucv"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/reboot.h>
+#include <net/iucv/iucv.h>
+#include <linux/atomic.h>
+#include <asm/ebcdic.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/smp.h>
+
+/*
+ * FLAGS:
+ * All flags are defined in the field IPFLAGS1 of each function
+ * and can be found in CP Programming Services.
+ * IPSRCCLS - Indicates you have specified a source class.
+ * IPTRGCLS - Indicates you have specified a target class.
+ * IPFGPID - Indicates you have specified a pathid.
+ * IPFGMID - Indicates you have specified a message ID.
+ * IPNORPY - Indicates a one-way message. No reply expected.
+ * IPALL - Indicates that all paths are affected.
+ */
+#define IUCV_IPSRCCLS 0x01
+#define IUCV_IPTRGCLS 0x01
+#define IUCV_IPFGPID 0x02
+#define IUCV_IPFGMID 0x04
+#define IUCV_IPNORPY 0x10
+#define IUCV_IPALL 0x80
+
+static int iucv_bus_match(struct device *dev, struct device_driver *drv)
+{
+ return 0;
+}
+
+enum iucv_pm_states {
+ IUCV_PM_INITIAL = 0,
+ IUCV_PM_FREEZING = 1,
+ IUCV_PM_THAWING = 2,
+ IUCV_PM_RESTORING = 3,
+};
+static enum iucv_pm_states iucv_pm_state;
+
+static int iucv_pm_prepare(struct device *);
+static void iucv_pm_complete(struct device *);
+static int iucv_pm_freeze(struct device *);
+static int iucv_pm_thaw(struct device *);
+static int iucv_pm_restore(struct device *);
+
+static const struct dev_pm_ops iucv_pm_ops = {
+ .prepare = iucv_pm_prepare,
+ .complete = iucv_pm_complete,
+ .freeze = iucv_pm_freeze,
+ .thaw = iucv_pm_thaw,
+ .restore = iucv_pm_restore,
+};
+
+struct bus_type iucv_bus = {
+ .name = "iucv",
+ .match = iucv_bus_match,
+ .pm = &iucv_pm_ops,
+};
+EXPORT_SYMBOL(iucv_bus);
+
+struct device *iucv_root;
+EXPORT_SYMBOL(iucv_root);
+
+static int iucv_available;
+
+/* General IUCV interrupt structure */
+struct iucv_irq_data {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iptype;
+ u32 res2[8];
+};
+
+struct iucv_irq_list {
+ struct list_head list;
+ struct iucv_irq_data data;
+};
+
+static struct iucv_irq_data *iucv_irq_data[NR_CPUS];
+static cpumask_t iucv_buffer_cpumask = { CPU_BITS_NONE };
+static cpumask_t iucv_irq_cpumask = { CPU_BITS_NONE };
+
+/*
+ * Queue of interrupt buffers lock for delivery via the tasklet
+ * (fast but can't call smp_call_function).
+ */
+static LIST_HEAD(iucv_task_queue);
+
+/*
+ * The tasklet for fast delivery of iucv interrupts.
+ */
+static void iucv_tasklet_fn(unsigned long);
+static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0);
+
+/*
+ * Queue of interrupt buffers for delivery via a work queue
+ * (slower but can call smp_call_function).
+ */
+static LIST_HEAD(iucv_work_queue);
+
+/*
+ * The work element to deliver path pending interrupts.
+ */
+static void iucv_work_fn(struct work_struct *work);
+static DECLARE_WORK(iucv_work, iucv_work_fn);
+
+/*
+ * Spinlock protecting task and work queue.
+ */
+static DEFINE_SPINLOCK(iucv_queue_lock);
+
+enum iucv_command_codes {
+ IUCV_QUERY = 0,
+ IUCV_RETRIEVE_BUFFER = 2,
+ IUCV_SEND = 4,
+ IUCV_RECEIVE = 5,
+ IUCV_REPLY = 6,
+ IUCV_REJECT = 8,
+ IUCV_PURGE = 9,
+ IUCV_ACCEPT = 10,
+ IUCV_CONNECT = 11,
+ IUCV_DECLARE_BUFFER = 12,
+ IUCV_QUIESCE = 13,
+ IUCV_RESUME = 14,
+ IUCV_SEVER = 15,
+ IUCV_SETMASK = 16,
+ IUCV_SETCONTROLMASK = 17,
+};
+
+/*
+ * Error messages that are used with the iucv_sever function. They get
+ * converted to EBCDIC.
+ */
+static char iucv_error_no_listener[16] = "NO LISTENER";
+static char iucv_error_no_memory[16] = "NO MEMORY";
+static char iucv_error_pathid[16] = "INVALID PATHID";
+
+/*
+ * iucv_handler_list: List of registered handlers.
+ */
+static LIST_HEAD(iucv_handler_list);
+
+/*
+ * iucv_path_table: an array of iucv_path structures.
+ */
+static struct iucv_path **iucv_path_table;
+static unsigned long iucv_max_pathid;
+
+/*
+ * iucv_lock: spinlock protecting iucv_handler_list and iucv_pathid_table
+ */
+static DEFINE_SPINLOCK(iucv_table_lock);
+
+/*
+ * iucv_active_cpu: contains the number of the cpu executing the tasklet
+ * or the work handler. Needed for iucv_path_sever called from tasklet.
+ */
+static int iucv_active_cpu = -1;
+
+/*
+ * Mutex and wait queue for iucv_register/iucv_unregister.
+ */
+static DEFINE_MUTEX(iucv_register_mutex);
+
+/*
+ * Counter for number of non-smp capable handlers.
+ */
+static int iucv_nonsmp_handler;
+
+/*
+ * IUCV control data structure. Used by iucv_path_accept, iucv_path_connect,
+ * iucv_path_quiesce and iucv_path_sever.
+ */
+struct iucv_cmd_control {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iprcode;
+ u16 ipmsglim;
+ u16 res1;
+ u8 ipvmid[8];
+ u8 ipuser[16];
+ u8 iptarget[8];
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Data in parameter list iucv structure. Used by iucv_message_send,
+ * iucv_message_send2way and iucv_message_reply.
+ */
+struct iucv_cmd_dpl {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iprcode;
+ u32 ipmsgid;
+ u32 iptrgcls;
+ u8 iprmmsg[8];
+ u32 ipsrccls;
+ u32 ipmsgtag;
+ u32 ipbfadr2;
+ u32 ipbfln2f;
+ u32 res;
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Data in buffer iucv structure. Used by iucv_message_receive,
+ * iucv_message_reject, iucv_message_send, iucv_message_send2way
+ * and iucv_declare_cpu.
+ */
+struct iucv_cmd_db {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iprcode;
+ u32 ipmsgid;
+ u32 iptrgcls;
+ u32 ipbfadr1;
+ u32 ipbfln1f;
+ u32 ipsrccls;
+ u32 ipmsgtag;
+ u32 ipbfadr2;
+ u32 ipbfln2f;
+ u32 res;
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Purge message iucv structure. Used by iucv_message_purge.
+ */
+struct iucv_cmd_purge {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iprcode;
+ u32 ipmsgid;
+ u8 ipaudit[3];
+ u8 res1[5];
+ u32 res2;
+ u32 ipsrccls;
+ u32 ipmsgtag;
+ u32 res3[3];
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Set mask iucv structure. Used by iucv_enable_cpu.
+ */
+struct iucv_cmd_set_mask {
+ u8 ipmask;
+ u8 res1[2];
+ u8 iprcode;
+ u32 res2[9];
+} __attribute__ ((packed,aligned(8)));
+
+union iucv_param {
+ struct iucv_cmd_control ctrl;
+ struct iucv_cmd_dpl dpl;
+ struct iucv_cmd_db db;
+ struct iucv_cmd_purge purge;
+ struct iucv_cmd_set_mask set_mask;
+};
+
+/*
+ * Anchor for per-cpu IUCV command parameter block.
+ */
+static union iucv_param *iucv_param[NR_CPUS];
+static union iucv_param *iucv_param_irq[NR_CPUS];
+
+/**
+ * iucv_call_b2f0
+ * @code: identifier of IUCV call to CP.
+ * @parm: pointer to a struct iucv_parm block
+ *
+ * Calls CP to execute IUCV commands.
+ *
+ * Returns the result of the CP IUCV call.
+ */
+static inline int iucv_call_b2f0(int command, union iucv_param *parm)
+{
+ register unsigned long reg0 asm ("0");
+ register unsigned long reg1 asm ("1");
+ int ccode;
+
+ reg0 = command;
+ reg1 = virt_to_phys(parm);
+ asm volatile(
+ " .long 0xb2f01000\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1)
+ : "m" (*parm) : "cc");
+ return (ccode == 1) ? parm->ctrl.iprcode : ccode;
+}
+
+/**
+ * iucv_query_maxconn
+ *
+ * Determines the maximum number of connections that may be established.
+ *
+ * Returns the maximum number of connections or -EPERM is IUCV is not
+ * available.
+ */
+static int iucv_query_maxconn(void)
+{
+ register unsigned long reg0 asm ("0");
+ register unsigned long reg1 asm ("1");
+ void *param;
+ int ccode;
+
+ param = kzalloc(sizeof(union iucv_param), GFP_KERNEL|GFP_DMA);
+ if (!param)
+ return -ENOMEM;
+ reg0 = IUCV_QUERY;
+ reg1 = (unsigned long) param;
+ asm volatile (
+ " .long 0xb2f01000\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc");
+ if (ccode == 0)
+ iucv_max_pathid = reg1;
+ kfree(param);
+ return ccode ? -EPERM : 0;
+}
+
+/**
+ * iucv_allow_cpu
+ * @data: unused
+ *
+ * Allow iucv interrupts on this cpu.
+ */
+static void iucv_allow_cpu(void *data)
+{
+ int cpu = smp_processor_id();
+ union iucv_param *parm;
+
+ /*
+ * Enable all iucv interrupts.
+ * ipmask contains bits for the different interrupts
+ * 0x80 - Flag to allow nonpriority message pending interrupts
+ * 0x40 - Flag to allow priority message pending interrupts
+ * 0x20 - Flag to allow nonpriority message completion interrupts
+ * 0x10 - Flag to allow priority message completion interrupts
+ * 0x08 - Flag to allow IUCV control interrupts
+ */
+ parm = iucv_param_irq[cpu];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->set_mask.ipmask = 0xf8;
+ iucv_call_b2f0(IUCV_SETMASK, parm);
+
+ /*
+ * Enable all iucv control interrupts.
+ * ipmask contains bits for the different interrupts
+ * 0x80 - Flag to allow pending connections interrupts
+ * 0x40 - Flag to allow connection complete interrupts
+ * 0x20 - Flag to allow connection severed interrupts
+ * 0x10 - Flag to allow connection quiesced interrupts
+ * 0x08 - Flag to allow connection resumed interrupts
+ */
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->set_mask.ipmask = 0xf8;
+ iucv_call_b2f0(IUCV_SETCONTROLMASK, parm);
+ /* Set indication that iucv interrupts are allowed for this cpu. */
+ cpumask_set_cpu(cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_block_cpu
+ * @data: unused
+ *
+ * Block iucv interrupts on this cpu.
+ */
+static void iucv_block_cpu(void *data)
+{
+ int cpu = smp_processor_id();
+ union iucv_param *parm;
+
+ /* Disable all iucv interrupts. */
+ parm = iucv_param_irq[cpu];
+ memset(parm, 0, sizeof(union iucv_param));
+ iucv_call_b2f0(IUCV_SETMASK, parm);
+
+ /* Clear indication that iucv interrupts are allowed for this cpu. */
+ cpumask_clear_cpu(cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_block_cpu_almost
+ * @data: unused
+ *
+ * Allow connection-severed interrupts only on this cpu.
+ */
+static void iucv_block_cpu_almost(void *data)
+{
+ int cpu = smp_processor_id();
+ union iucv_param *parm;
+
+ /* Allow iucv control interrupts only */
+ parm = iucv_param_irq[cpu];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->set_mask.ipmask = 0x08;
+ iucv_call_b2f0(IUCV_SETMASK, parm);
+ /* Allow iucv-severed interrupt only */
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->set_mask.ipmask = 0x20;
+ iucv_call_b2f0(IUCV_SETCONTROLMASK, parm);
+
+ /* Clear indication that iucv interrupts are allowed for this cpu. */
+ cpumask_clear_cpu(cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_declare_cpu
+ * @data: unused
+ *
+ * Declare a interrupt buffer on this cpu.
+ */
+static void iucv_declare_cpu(void *data)
+{
+ int cpu = smp_processor_id();
+ union iucv_param *parm;
+ int rc;
+
+ if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask))
+ return;
+
+ /* Declare interrupt buffer. */
+ parm = iucv_param_irq[cpu];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->db.ipbfadr1 = virt_to_phys(iucv_irq_data[cpu]);
+ rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, parm);
+ if (rc) {
+ char *err = "Unknown";
+ switch (rc) {
+ case 0x03:
+ err = "Directory error";
+ break;
+ case 0x0a:
+ err = "Invalid length";
+ break;
+ case 0x13:
+ err = "Buffer already exists";
+ break;
+ case 0x3e:
+ err = "Buffer overlap";
+ break;
+ case 0x5c:
+ err = "Paging or storage error";
+ break;
+ }
+ pr_warn("Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n",
+ cpu, rc, err);
+ return;
+ }
+
+ /* Set indication that an iucv buffer exists for this cpu. */
+ cpumask_set_cpu(cpu, &iucv_buffer_cpumask);
+
+ if (iucv_nonsmp_handler == 0 || cpumask_empty(&iucv_irq_cpumask))
+ /* Enable iucv interrupts on this cpu. */
+ iucv_allow_cpu(NULL);
+ else
+ /* Disable iucv interrupts on this cpu. */
+ iucv_block_cpu(NULL);
+}
+
+/**
+ * iucv_retrieve_cpu
+ * @data: unused
+ *
+ * Retrieve interrupt buffer on this cpu.
+ */
+static void iucv_retrieve_cpu(void *data)
+{
+ int cpu = smp_processor_id();
+ union iucv_param *parm;
+
+ if (!cpumask_test_cpu(cpu, &iucv_buffer_cpumask))
+ return;
+
+ /* Block iucv interrupts. */
+ iucv_block_cpu(NULL);
+
+ /* Retrieve interrupt buffer. */
+ parm = iucv_param_irq[cpu];
+ iucv_call_b2f0(IUCV_RETRIEVE_BUFFER, parm);
+
+ /* Clear indication that an iucv buffer exists for this cpu. */
+ cpumask_clear_cpu(cpu, &iucv_buffer_cpumask);
+}
+
+/**
+ * iucv_setmask_smp
+ *
+ * Allow iucv interrupts on all cpus.
+ */
+static void iucv_setmask_mp(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ /* Enable all cpus with a declared buffer. */
+ if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask) &&
+ !cpumask_test_cpu(cpu, &iucv_irq_cpumask))
+ smp_call_function_single(cpu, iucv_allow_cpu,
+ NULL, 1);
+ put_online_cpus();
+}
+
+/**
+ * iucv_setmask_up
+ *
+ * Allow iucv interrupts on a single cpu.
+ */
+static void iucv_setmask_up(void)
+{
+ cpumask_t cpumask;
+ int cpu;
+
+ /* Disable all cpu but the first in cpu_irq_cpumask. */
+ cpumask_copy(&cpumask, &iucv_irq_cpumask);
+ cpumask_clear_cpu(cpumask_first(&iucv_irq_cpumask), &cpumask);
+ for_each_cpu(cpu, &cpumask)
+ smp_call_function_single(cpu, iucv_block_cpu, NULL, 1);
+}
+
+/**
+ * iucv_enable
+ *
+ * This function makes iucv ready for use. It allocates the pathid
+ * table, declares an iucv interrupt buffer and enables the iucv
+ * interrupts. Called when the first user has registered an iucv
+ * handler.
+ */
+static int iucv_enable(void)
+{
+ size_t alloc_size;
+ int cpu, rc;
+
+ get_online_cpus();
+ rc = -ENOMEM;
+ alloc_size = iucv_max_pathid * sizeof(struct iucv_path);
+ iucv_path_table = kzalloc(alloc_size, GFP_KERNEL);
+ if (!iucv_path_table)
+ goto out;
+ /* Declare per cpu buffers. */
+ rc = -EIO;
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
+ if (cpumask_empty(&iucv_buffer_cpumask))
+ /* No cpu could declare an iucv buffer. */
+ goto out;
+ put_online_cpus();
+ return 0;
+out:
+ kfree(iucv_path_table);
+ iucv_path_table = NULL;
+ put_online_cpus();
+ return rc;
+}
+
+/**
+ * iucv_disable
+ *
+ * This function shuts down iucv. It disables iucv interrupts, retrieves
+ * the iucv interrupt buffer and frees the pathid table. Called after the
+ * last user unregister its iucv handler.
+ */
+static void iucv_disable(void)
+{
+ get_online_cpus();
+ on_each_cpu(iucv_retrieve_cpu, NULL, 1);
+ kfree(iucv_path_table);
+ iucv_path_table = NULL;
+ put_online_cpus();
+}
+
+static void free_iucv_data(int cpu)
+{
+ kfree(iucv_param_irq[cpu]);
+ iucv_param_irq[cpu] = NULL;
+ kfree(iucv_param[cpu]);
+ iucv_param[cpu] = NULL;
+ kfree(iucv_irq_data[cpu]);
+ iucv_irq_data[cpu] = NULL;
+}
+
+static int alloc_iucv_data(int cpu)
+{
+ /* Note: GFP_DMA used to get memory below 2G */
+ iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_irq_data[cpu])
+ goto out_free;
+
+ /* Allocate parameter blocks. */
+ iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_param[cpu])
+ goto out_free;
+
+ iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
+ GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+ if (!iucv_param_irq[cpu])
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_iucv_data(cpu);
+ return -ENOMEM;
+}
+
+static int iucv_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ cpumask_t cpumask;
+ long cpu = (long) hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (alloc_iucv_data(cpu))
+ return notifier_from_errno(-ENOMEM);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ free_iucv_data(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ if (!iucv_path_table)
+ break;
+ smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ if (!iucv_path_table)
+ break;
+ cpumask_copy(&cpumask, &iucv_buffer_cpumask);
+ cpumask_clear_cpu(cpu, &cpumask);
+ if (cpumask_empty(&cpumask))
+ /* Can't offline last IUCV enabled cpu. */
+ return notifier_from_errno(-EINVAL);
+ smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
+ if (cpumask_empty(&iucv_irq_cpumask))
+ smp_call_function_single(
+ cpumask_first(&iucv_buffer_cpumask),
+ iucv_allow_cpu, NULL, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata iucv_cpu_notifier = {
+ .notifier_call = iucv_cpu_notify,
+};
+
+/**
+ * iucv_sever_pathid
+ * @pathid: path identification number.
+ * @userdata: 16-bytes of user data.
+ *
+ * Sever an iucv path to free up the pathid. Used internally.
+ */
+static int iucv_sever_pathid(u16 pathid, u8 userdata[16])
+{
+ union iucv_param *parm;
+
+ parm = iucv_param_irq[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (userdata)
+ memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+ parm->ctrl.ippathid = pathid;
+ return iucv_call_b2f0(IUCV_SEVER, parm);
+}
+
+/**
+ * __iucv_cleanup_queue
+ * @dummy: unused dummy argument
+ *
+ * Nop function called via smp_call_function to force work items from
+ * pending external iucv interrupts to the work queue.
+ */
+static void __iucv_cleanup_queue(void *dummy)
+{
+}
+
+/**
+ * iucv_cleanup_queue
+ *
+ * Function called after a path has been severed to find all remaining
+ * work items for the now stale pathid. The caller needs to hold the
+ * iucv_table_lock.
+ */
+static void iucv_cleanup_queue(void)
+{
+ struct iucv_irq_list *p, *n;
+
+ /*
+ * When a path is severed, the pathid can be reused immediately
+ * on a iucv connect or a connection pending interrupt. Remove
+ * all entries from the task queue that refer to a stale pathid
+ * (iucv_path_table[ix] == NULL). Only then do the iucv connect
+ * or deliver the connection pending interrupt. To get all the
+ * pending interrupts force them to the work queue by calling
+ * an empty function on all cpus.
+ */
+ smp_call_function(__iucv_cleanup_queue, NULL, 1);
+ spin_lock_irq(&iucv_queue_lock);
+ list_for_each_entry_safe(p, n, &iucv_task_queue, list) {
+ /* Remove stale work items from the task queue. */
+ if (iucv_path_table[p->data.ippathid] == NULL) {
+ list_del(&p->list);
+ kfree(p);
+ }
+ }
+ spin_unlock_irq(&iucv_queue_lock);
+}
+
+/**
+ * iucv_register:
+ * @handler: address of iucv handler structure
+ * @smp: != 0 indicates that the handler can deal with out of order messages
+ *
+ * Registers a driver with IUCV.
+ *
+ * Returns 0 on success, -ENOMEM if the memory allocation for the pathid
+ * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus.
+ */
+int iucv_register(struct iucv_handler *handler, int smp)
+{
+ int rc;
+
+ if (!iucv_available)
+ return -ENOSYS;
+ mutex_lock(&iucv_register_mutex);
+ if (!smp)
+ iucv_nonsmp_handler++;
+ if (list_empty(&iucv_handler_list)) {
+ rc = iucv_enable();
+ if (rc)
+ goto out_mutex;
+ } else if (!smp && iucv_nonsmp_handler == 1)
+ iucv_setmask_up();
+ INIT_LIST_HEAD(&handler->paths);
+
+ spin_lock_bh(&iucv_table_lock);
+ list_add_tail(&handler->list, &iucv_handler_list);
+ spin_unlock_bh(&iucv_table_lock);
+ rc = 0;
+out_mutex:
+ mutex_unlock(&iucv_register_mutex);
+ return rc;
+}
+EXPORT_SYMBOL(iucv_register);
+
+/**
+ * iucv_unregister
+ * @handler: address of iucv handler structure
+ * @smp: != 0 indicates that the handler can deal with out of order messages
+ *
+ * Unregister driver from IUCV.
+ */
+void iucv_unregister(struct iucv_handler *handler, int smp)
+{
+ struct iucv_path *p, *n;
+
+ mutex_lock(&iucv_register_mutex);
+ spin_lock_bh(&iucv_table_lock);
+ /* Remove handler from the iucv_handler_list. */
+ list_del_init(&handler->list);
+ /* Sever all pathids still referring to the handler. */
+ list_for_each_entry_safe(p, n, &handler->paths, list) {
+ iucv_sever_pathid(p->pathid, NULL);
+ iucv_path_table[p->pathid] = NULL;
+ list_del(&p->list);
+ iucv_path_free(p);
+ }
+ spin_unlock_bh(&iucv_table_lock);
+ if (!smp)
+ iucv_nonsmp_handler--;
+ if (list_empty(&iucv_handler_list))
+ iucv_disable();
+ else if (!smp && iucv_nonsmp_handler == 0)
+ iucv_setmask_mp();
+ mutex_unlock(&iucv_register_mutex);
+}
+EXPORT_SYMBOL(iucv_unregister);
+
+static int iucv_reboot_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ int i;
+
+ if (cpumask_empty(&iucv_irq_cpumask))
+ return NOTIFY_DONE;
+
+ get_online_cpus();
+ on_each_cpu_mask(&iucv_irq_cpumask, iucv_block_cpu, NULL, 1);
+ preempt_disable();
+ for (i = 0; i < iucv_max_pathid; i++) {
+ if (iucv_path_table[i])
+ iucv_sever_pathid(i, NULL);
+ }
+ preempt_enable();
+ put_online_cpus();
+ iucv_disable();
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block iucv_reboot_notifier = {
+ .notifier_call = iucv_reboot_event,
+};
+
+/**
+ * iucv_path_accept
+ * @path: address of iucv path structure
+ * @handler: address of iucv handler structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ * @private: private data passed to interrupt handlers for this path
+ *
+ * This function is issued after the user received a connection pending
+ * external interrupt and now wishes to complete the IUCV communication path.
+ *
+ * Returns the result of the CP IUCV call.
+ */
+int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler,
+ u8 userdata[16], void *private)
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ /* Prepare parameter block. */
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->ctrl.ippathid = path->pathid;
+ parm->ctrl.ipmsglim = path->msglim;
+ if (userdata)
+ memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+ parm->ctrl.ipflags1 = path->flags;
+
+ rc = iucv_call_b2f0(IUCV_ACCEPT, parm);
+ if (!rc) {
+ path->private = private;
+ path->msglim = parm->ctrl.ipmsglim;
+ path->flags = parm->ctrl.ipflags1;
+ }
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_path_accept);
+
+/**
+ * iucv_path_connect
+ * @path: address of iucv path structure
+ * @handler: address of iucv handler structure
+ * @userid: 8-byte user identification
+ * @system: 8-byte target system identification
+ * @userdata: 16 bytes of data reflected to the communication partner
+ * @private: private data passed to interrupt handlers for this path
+ *
+ * This function establishes an IUCV path. Although the connect may complete
+ * successfully, you are not able to use the path until you receive an IUCV
+ * Connection Complete external interrupt.
+ *
+ * Returns the result of the CP IUCV call.
+ */
+int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
+ u8 userid[8], u8 system[8], u8 userdata[16],
+ void *private)
+{
+ union iucv_param *parm;
+ int rc;
+
+ spin_lock_bh(&iucv_table_lock);
+ iucv_cleanup_queue();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->ctrl.ipmsglim = path->msglim;
+ parm->ctrl.ipflags1 = path->flags;
+ if (userid) {
+ memcpy(parm->ctrl.ipvmid, userid, sizeof(parm->ctrl.ipvmid));
+ ASCEBC(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid));
+ EBC_TOUPPER(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid));
+ }
+ if (system) {
+ memcpy(parm->ctrl.iptarget, system,
+ sizeof(parm->ctrl.iptarget));
+ ASCEBC(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget));
+ EBC_TOUPPER(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget));
+ }
+ if (userdata)
+ memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+
+ rc = iucv_call_b2f0(IUCV_CONNECT, parm);
+ if (!rc) {
+ if (parm->ctrl.ippathid < iucv_max_pathid) {
+ path->pathid = parm->ctrl.ippathid;
+ path->msglim = parm->ctrl.ipmsglim;
+ path->flags = parm->ctrl.ipflags1;
+ path->handler = handler;
+ path->private = private;
+ list_add_tail(&path->list, &handler->paths);
+ iucv_path_table[path->pathid] = path;
+ } else {
+ iucv_sever_pathid(parm->ctrl.ippathid,
+ iucv_error_pathid);
+ rc = -EIO;
+ }
+ }
+out:
+ spin_unlock_bh(&iucv_table_lock);
+ return rc;
+}
+EXPORT_SYMBOL(iucv_path_connect);
+
+/**
+ * iucv_path_quiesce:
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * This function temporarily suspends incoming messages on an IUCV path.
+ * You can later reactivate the path by invoking the iucv_resume function.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16])
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (userdata)
+ memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+ parm->ctrl.ippathid = path->pathid;
+ rc = iucv_call_b2f0(IUCV_QUIESCE, parm);
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_path_quiesce);
+
+/**
+ * iucv_path_resume:
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * This function resumes incoming messages on an IUCV path that has
+ * been stopped with iucv_path_quiesce.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_path_resume(struct iucv_path *path, u8 userdata[16])
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (userdata)
+ memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+ parm->ctrl.ippathid = path->pathid;
+ rc = iucv_call_b2f0(IUCV_RESUME, parm);
+out:
+ local_bh_enable();
+ return rc;
+}
+
+/**
+ * iucv_path_sever
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * This function terminates an IUCV path.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_path_sever(struct iucv_path *path, u8 userdata[16])
+{
+ int rc;
+
+ preempt_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ if (iucv_active_cpu != smp_processor_id())
+ spin_lock_bh(&iucv_table_lock);
+ rc = iucv_sever_pathid(path->pathid, userdata);
+ iucv_path_table[path->pathid] = NULL;
+ list_del_init(&path->list);
+ if (iucv_active_cpu != smp_processor_id())
+ spin_unlock_bh(&iucv_table_lock);
+out:
+ preempt_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_path_sever);
+
+/**
+ * iucv_message_purge
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @srccls: source class of message
+ *
+ * Cancels a message you have sent.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg,
+ u32 srccls)
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->purge.ippathid = path->pathid;
+ parm->purge.ipmsgid = msg->id;
+ parm->purge.ipsrccls = srccls;
+ parm->purge.ipflags1 = IUCV_IPSRCCLS | IUCV_IPFGMID | IUCV_IPFGPID;
+ rc = iucv_call_b2f0(IUCV_PURGE, parm);
+ if (!rc) {
+ msg->audit = (*(u32 *) &parm->purge.ipaudit) >> 8;
+ msg->tag = parm->purge.ipmsgtag;
+ }
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_purge);
+
+/**
+ * iucv_message_receive_iprmdata
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual:
+ *
+ * Internal function used by iucv_message_receive and __iucv_message_receive
+ * to receive RMDATA data stored in struct iucv_message.
+ */
+static int iucv_message_receive_iprmdata(struct iucv_path *path,
+ struct iucv_message *msg,
+ u8 flags, void *buffer,
+ size_t size, size_t *residual)
+{
+ struct iucv_array *array;
+ u8 *rmmsg;
+ size_t copy;
+
+ /*
+ * Message is 8 bytes long and has been stored to the
+ * message descriptor itself.
+ */
+ if (residual)
+ *residual = abs(size - 8);
+ rmmsg = msg->rmmsg;
+ if (flags & IUCV_IPBUFLST) {
+ /* Copy to struct iucv_array. */
+ size = (size < 8) ? size : 8;
+ for (array = buffer; size > 0; array++) {
+ copy = min_t(size_t, size, array->length);
+ memcpy((u8 *)(addr_t) array->address,
+ rmmsg, copy);
+ rmmsg += copy;
+ size -= copy;
+ }
+ } else {
+ /* Copy to direct buffer. */
+ memcpy(buffer, rmmsg, min_t(size_t, size, 8));
+ }
+ return 0;
+}
+
+/**
+ * __iucv_message_receive
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual:
+ *
+ * This function receives messages that are being sent to you over
+ * established paths. This function will deal with RMDATA messages
+ * embedded in struct iucv_message as well.
+ *
+ * Locking: no locking
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, void *buffer, size_t size, size_t *residual)
+{
+ union iucv_param *parm;
+ int rc;
+
+ if (msg->flags & IUCV_IPRMDATA)
+ return iucv_message_receive_iprmdata(path, msg, flags,
+ buffer, size, residual);
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfln1f = (u32) size;
+ parm->db.ipmsgid = msg->id;
+ parm->db.ippathid = path->pathid;
+ parm->db.iptrgcls = msg->class;
+ parm->db.ipflags1 = (flags | IUCV_IPFGPID |
+ IUCV_IPFGMID | IUCV_IPTRGCLS);
+ rc = iucv_call_b2f0(IUCV_RECEIVE, parm);
+ if (!rc || rc == 5) {
+ msg->flags = parm->db.ipflags1;
+ if (residual)
+ *residual = parm->db.ipbfln1f;
+ }
+out:
+ return rc;
+}
+EXPORT_SYMBOL(__iucv_message_receive);
+
+/**
+ * iucv_message_receive
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual:
+ *
+ * This function receives messages that are being sent to you over
+ * established paths. This function will deal with RMDATA messages
+ * embedded in struct iucv_message as well.
+ *
+ * Locking: local_bh_enable/local_bh_disable
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, void *buffer, size_t size, size_t *residual)
+{
+ int rc;
+
+ if (msg->flags & IUCV_IPRMDATA)
+ return iucv_message_receive_iprmdata(path, msg, flags,
+ buffer, size, residual);
+ local_bh_disable();
+ rc = __iucv_message_receive(path, msg, flags, buffer, size, residual);
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_receive);
+
+/**
+ * iucv_message_reject
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ *
+ * The reject function refuses a specified message. Between the time you
+ * are notified of a message and the time that you complete the message,
+ * the message may be rejected.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg)
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ parm->db.ippathid = path->pathid;
+ parm->db.ipmsgid = msg->id;
+ parm->db.iptrgcls = msg->class;
+ parm->db.ipflags1 = (IUCV_IPTRGCLS | IUCV_IPFGMID | IUCV_IPFGPID);
+ rc = iucv_call_b2f0(IUCV_REJECT, parm);
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_reject);
+
+/**
+ * iucv_message_reply
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @reply: address of reply data buffer or address of struct iucv_array
+ * @size: length of reply data buffer
+ *
+ * This function responds to the two-way messages that you receive. You
+ * must identify completely the message to which you wish to reply. ie,
+ * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into
+ * the parameter list.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, void *reply, size_t size)
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (flags & IUCV_IPRMDATA) {
+ parm->dpl.ippathid = path->pathid;
+ parm->dpl.ipflags1 = flags;
+ parm->dpl.ipmsgid = msg->id;
+ parm->dpl.iptrgcls = msg->class;
+ memcpy(parm->dpl.iprmmsg, reply, min_t(size_t, size, 8));
+ } else {
+ parm->db.ipbfadr1 = (u32)(addr_t) reply;
+ parm->db.ipbfln1f = (u32) size;
+ parm->db.ippathid = path->pathid;
+ parm->db.ipflags1 = flags;
+ parm->db.ipmsgid = msg->id;
+ parm->db.iptrgcls = msg->class;
+ }
+ rc = iucv_call_b2f0(IUCV_REPLY, parm);
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_reply);
+
+/**
+ * __iucv_message_send
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ *
+ * This function transmits data to another application. Data to be
+ * transmitted is in a buffer and this is a one-way message and the
+ * receiver will not reply to the message.
+ *
+ * Locking: no locking
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, u32 srccls, void *buffer, size_t size)
+{
+ union iucv_param *parm;
+ int rc;
+
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (flags & IUCV_IPRMDATA) {
+ /* Message of 8 bytes can be placed into the parameter list. */
+ parm->dpl.ippathid = path->pathid;
+ parm->dpl.ipflags1 = flags | IUCV_IPNORPY;
+ parm->dpl.iptrgcls = msg->class;
+ parm->dpl.ipsrccls = srccls;
+ parm->dpl.ipmsgtag = msg->tag;
+ memcpy(parm->dpl.iprmmsg, buffer, 8);
+ } else {
+ parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfln1f = (u32) size;
+ parm->db.ippathid = path->pathid;
+ parm->db.ipflags1 = flags | IUCV_IPNORPY;
+ parm->db.iptrgcls = msg->class;
+ parm->db.ipsrccls = srccls;
+ parm->db.ipmsgtag = msg->tag;
+ }
+ rc = iucv_call_b2f0(IUCV_SEND, parm);
+ if (!rc)
+ msg->id = parm->db.ipmsgid;
+out:
+ return rc;
+}
+EXPORT_SYMBOL(__iucv_message_send);
+
+/**
+ * iucv_message_send
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ *
+ * This function transmits data to another application. Data to be
+ * transmitted is in a buffer and this is a one-way message and the
+ * receiver will not reply to the message.
+ *
+ * Locking: local_bh_enable/local_bh_disable
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, u32 srccls, void *buffer, size_t size)
+{
+ int rc;
+
+ local_bh_disable();
+ rc = __iucv_message_send(path, msg, flags, srccls, buffer, size);
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_send);
+
+/**
+ * iucv_message_send2way
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent and the reply is received
+ * (IUCV_IPRMDATA, IUCV_IPBUFLST, IUCV_IPPRTY, IUCV_ANSLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ * @ansbuf: address of answer buffer or address of struct iucv_array
+ * @asize: size of reply buffer
+ *
+ * This function transmits data to another application. Data to be
+ * transmitted is in a buffer. The receiver of the send is expected to
+ * reply to the message and a buffer is provided into which IUCV moves
+ * the reply to this message.
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
+ u8 flags, u32 srccls, void *buffer, size_t size,
+ void *answer, size_t asize, size_t *residual)
+{
+ union iucv_param *parm;
+ int rc;
+
+ local_bh_disable();
+ if (cpumask_empty(&iucv_buffer_cpumask)) {
+ rc = -EIO;
+ goto out;
+ }
+ parm = iucv_param[smp_processor_id()];
+ memset(parm, 0, sizeof(union iucv_param));
+ if (flags & IUCV_IPRMDATA) {
+ parm->dpl.ippathid = path->pathid;
+ parm->dpl.ipflags1 = path->flags; /* priority message */
+ parm->dpl.iptrgcls = msg->class;
+ parm->dpl.ipsrccls = srccls;
+ parm->dpl.ipmsgtag = msg->tag;
+ parm->dpl.ipbfadr2 = (u32)(addr_t) answer;
+ parm->dpl.ipbfln2f = (u32) asize;
+ memcpy(parm->dpl.iprmmsg, buffer, 8);
+ } else {
+ parm->db.ippathid = path->pathid;
+ parm->db.ipflags1 = path->flags; /* priority message */
+ parm->db.iptrgcls = msg->class;
+ parm->db.ipsrccls = srccls;
+ parm->db.ipmsgtag = msg->tag;
+ parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfln1f = (u32) size;
+ parm->db.ipbfadr2 = (u32)(addr_t) answer;
+ parm->db.ipbfln2f = (u32) asize;
+ }
+ rc = iucv_call_b2f0(IUCV_SEND, parm);
+ if (!rc)
+ msg->id = parm->db.ipmsgid;
+out:
+ local_bh_enable();
+ return rc;
+}
+EXPORT_SYMBOL(iucv_message_send2way);
+
+/**
+ * iucv_path_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection pending work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_path_pending {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iptype;
+ u16 ipmsglim;
+ u16 res1;
+ u8 ipvmid[8];
+ u8 ipuser[16];
+ u32 res3;
+ u8 ippollfg;
+ u8 res4[3];
+} __packed;
+
+static void iucv_path_pending(struct iucv_irq_data *data)
+{
+ struct iucv_path_pending *ipp = (void *) data;
+ struct iucv_handler *handler;
+ struct iucv_path *path;
+ char *error;
+
+ BUG_ON(iucv_path_table[ipp->ippathid]);
+ /* New pathid, handler found. Create a new path struct. */
+ error = iucv_error_no_memory;
+ path = iucv_path_alloc(ipp->ipmsglim, ipp->ipflags1, GFP_ATOMIC);
+ if (!path)
+ goto out_sever;
+ path->pathid = ipp->ippathid;
+ iucv_path_table[path->pathid] = path;
+ EBCASC(ipp->ipvmid, 8);
+
+ /* Call registered handler until one is found that wants the path. */
+ list_for_each_entry(handler, &iucv_handler_list, list) {
+ if (!handler->path_pending)
+ continue;
+ /*
+ * Add path to handler to allow a call to iucv_path_sever
+ * inside the path_pending function. If the handler returns
+ * an error remove the path from the handler again.
+ */
+ list_add(&path->list, &handler->paths);
+ path->handler = handler;
+ if (!handler->path_pending(path, ipp->ipvmid, ipp->ipuser))
+ return;
+ list_del(&path->list);
+ path->handler = NULL;
+ }
+ /* No handler wanted the path. */
+ iucv_path_table[path->pathid] = NULL;
+ iucv_path_free(path);
+ error = iucv_error_no_listener;
+out_sever:
+ iucv_sever_pathid(ipp->ippathid, error);
+}
+
+/**
+ * iucv_path_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_path_complete {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iptype;
+ u16 ipmsglim;
+ u16 res1;
+ u8 res2[8];
+ u8 ipuser[16];
+ u32 res3;
+ u8 ippollfg;
+ u8 res4[3];
+} __packed;
+
+static void iucv_path_complete(struct iucv_irq_data *data)
+{
+ struct iucv_path_complete *ipc = (void *) data;
+ struct iucv_path *path = iucv_path_table[ipc->ippathid];
+
+ if (path)
+ path->flags = ipc->ipflags1;
+ if (path && path->handler && path->handler->path_complete)
+ path->handler->path_complete(path, ipc->ipuser);
+}
+
+/**
+ * iucv_path_severed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection severed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_path_severed {
+ u16 ippathid;
+ u8 res1;
+ u8 iptype;
+ u32 res2;
+ u8 res3[8];
+ u8 ipuser[16];
+ u32 res4;
+ u8 ippollfg;
+ u8 res5[3];
+} __packed;
+
+static void iucv_path_severed(struct iucv_irq_data *data)
+{
+ struct iucv_path_severed *ips = (void *) data;
+ struct iucv_path *path = iucv_path_table[ips->ippathid];
+
+ if (!path || !path->handler) /* Already severed */
+ return;
+ if (path->handler->path_severed)
+ path->handler->path_severed(path, ips->ipuser);
+ else {
+ iucv_sever_pathid(path->pathid, NULL);
+ iucv_path_table[path->pathid] = NULL;
+ list_del(&path->list);
+ iucv_path_free(path);
+ }
+}
+
+/**
+ * iucv_path_quiesced
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection quiesced work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_path_quiesced {
+ u16 ippathid;
+ u8 res1;
+ u8 iptype;
+ u32 res2;
+ u8 res3[8];
+ u8 ipuser[16];
+ u32 res4;
+ u8 ippollfg;
+ u8 res5[3];
+} __packed;
+
+static void iucv_path_quiesced(struct iucv_irq_data *data)
+{
+ struct iucv_path_quiesced *ipq = (void *) data;
+ struct iucv_path *path = iucv_path_table[ipq->ippathid];
+
+ if (path && path->handler && path->handler->path_quiesced)
+ path->handler->path_quiesced(path, ipq->ipuser);
+}
+
+/**
+ * iucv_path_resumed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection resumed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_path_resumed {
+ u16 ippathid;
+ u8 res1;
+ u8 iptype;
+ u32 res2;
+ u8 res3[8];
+ u8 ipuser[16];
+ u32 res4;
+ u8 ippollfg;
+ u8 res5[3];
+} __packed;
+
+static void iucv_path_resumed(struct iucv_irq_data *data)
+{
+ struct iucv_path_resumed *ipr = (void *) data;
+ struct iucv_path *path = iucv_path_table[ipr->ippathid];
+
+ if (path && path->handler && path->handler->path_resumed)
+ path->handler->path_resumed(path, ipr->ipuser);
+}
+
+/**
+ * iucv_message_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_message_complete {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iptype;
+ u32 ipmsgid;
+ u32 ipaudit;
+ u8 iprmmsg[8];
+ u32 ipsrccls;
+ u32 ipmsgtag;
+ u32 res;
+ u32 ipbfln2f;
+ u8 ippollfg;
+ u8 res2[3];
+} __packed;
+
+static void iucv_message_complete(struct iucv_irq_data *data)
+{
+ struct iucv_message_complete *imc = (void *) data;
+ struct iucv_path *path = iucv_path_table[imc->ippathid];
+ struct iucv_message msg;
+
+ if (path && path->handler && path->handler->message_complete) {
+ msg.flags = imc->ipflags1;
+ msg.id = imc->ipmsgid;
+ msg.audit = imc->ipaudit;
+ memcpy(msg.rmmsg, imc->iprmmsg, 8);
+ msg.class = imc->ipsrccls;
+ msg.tag = imc->ipmsgtag;
+ msg.length = imc->ipbfln2f;
+ path->handler->message_complete(path, &msg);
+ }
+}
+
+/**
+ * iucv_message_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message pending work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
+struct iucv_message_pending {
+ u16 ippathid;
+ u8 ipflags1;
+ u8 iptype;
+ u32 ipmsgid;
+ u32 iptrgcls;
+ union {
+ u32 iprmmsg1_u32;
+ u8 iprmmsg1[4];
+ } ln1msg1;
+ union {
+ u32 ipbfln1f;
+ u8 iprmmsg2[4];
+ } ln1msg2;
+ u32 res1[3];
+ u32 ipbfln2f;
+ u8 ippollfg;
+ u8 res2[3];
+} __packed;
+
+static void iucv_message_pending(struct iucv_irq_data *data)
+{
+ struct iucv_message_pending *imp = (void *) data;
+ struct iucv_path *path = iucv_path_table[imp->ippathid];
+ struct iucv_message msg;
+
+ if (path && path->handler && path->handler->message_pending) {
+ msg.flags = imp->ipflags1;
+ msg.id = imp->ipmsgid;
+ msg.class = imp->iptrgcls;
+ if (imp->ipflags1 & IUCV_IPRMDATA) {
+ memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8);
+ msg.length = 8;
+ } else
+ msg.length = imp->ln1msg2.ipbfln1f;
+ msg.reply_size = imp->ipbfln2f;
+ path->handler->message_pending(path, &msg);
+ }
+}
+
+/**
+ * iucv_tasklet_fn:
+ *
+ * This tasklet loops over the queue of irq buffers created by
+ * iucv_external_interrupt, calls the appropriate action handler
+ * and then frees the buffer.
+ */
+static void iucv_tasklet_fn(unsigned long ignored)
+{
+ typedef void iucv_irq_fn(struct iucv_irq_data *);
+ static iucv_irq_fn *irq_fn[] = {
+ [0x02] = iucv_path_complete,
+ [0x03] = iucv_path_severed,
+ [0x04] = iucv_path_quiesced,
+ [0x05] = iucv_path_resumed,
+ [0x06] = iucv_message_complete,
+ [0x07] = iucv_message_complete,
+ [0x08] = iucv_message_pending,
+ [0x09] = iucv_message_pending,
+ };
+ LIST_HEAD(task_queue);
+ struct iucv_irq_list *p, *n;
+
+ /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
+ if (!spin_trylock(&iucv_table_lock)) {
+ tasklet_schedule(&iucv_tasklet);
+ return;
+ }
+ iucv_active_cpu = smp_processor_id();
+
+ spin_lock_irq(&iucv_queue_lock);
+ list_splice_init(&iucv_task_queue, &task_queue);
+ spin_unlock_irq(&iucv_queue_lock);
+
+ list_for_each_entry_safe(p, n, &task_queue, list) {
+ list_del_init(&p->list);
+ irq_fn[p->data.iptype](&p->data);
+ kfree(p);
+ }
+
+ iucv_active_cpu = -1;
+ spin_unlock(&iucv_table_lock);
+}
+
+/**
+ * iucv_work_fn:
+ *
+ * This work function loops over the queue of path pending irq blocks
+ * created by iucv_external_interrupt, calls the appropriate action
+ * handler and then frees the buffer.
+ */
+static void iucv_work_fn(struct work_struct *work)
+{
+ LIST_HEAD(work_queue);
+ struct iucv_irq_list *p, *n;
+
+ /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
+ spin_lock_bh(&iucv_table_lock);
+ iucv_active_cpu = smp_processor_id();
+
+ spin_lock_irq(&iucv_queue_lock);
+ list_splice_init(&iucv_work_queue, &work_queue);
+ spin_unlock_irq(&iucv_queue_lock);
+
+ iucv_cleanup_queue();
+ list_for_each_entry_safe(p, n, &work_queue, list) {
+ list_del_init(&p->list);
+ iucv_path_pending(&p->data);
+ kfree(p);
+ }
+
+ iucv_active_cpu = -1;
+ spin_unlock_bh(&iucv_table_lock);
+}
+
+/**
+ * iucv_external_interrupt
+ * @code: irq code
+ *
+ * Handles external interrupts coming in from CP.
+ * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
+ */
+static void iucv_external_interrupt(struct ext_code ext_code,
+ unsigned int param32, unsigned long param64)
+{
+ struct iucv_irq_data *p;
+ struct iucv_irq_list *work;
+
+ inc_irq_stat(IRQEXT_IUC);
+ p = iucv_irq_data[smp_processor_id()];
+ if (p->ippathid >= iucv_max_pathid) {
+ WARN_ON(p->ippathid >= iucv_max_pathid);
+ iucv_sever_pathid(p->ippathid, iucv_error_no_listener);
+ return;
+ }
+ BUG_ON(p->iptype < 0x01 || p->iptype > 0x09);
+ work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC);
+ if (!work) {
+ pr_warn("iucv_external_interrupt: out of memory\n");
+ return;
+ }
+ memcpy(&work->data, p, sizeof(work->data));
+ spin_lock(&iucv_queue_lock);
+ if (p->iptype == 0x01) {
+ /* Path pending interrupt. */
+ list_add_tail(&work->list, &iucv_work_queue);
+ schedule_work(&iucv_work);
+ } else {
+ /* The other interrupts. */
+ list_add_tail(&work->list, &iucv_task_queue);
+ tasklet_schedule(&iucv_tasklet);
+ }
+ spin_unlock(&iucv_queue_lock);
+}
+
+static int iucv_pm_prepare(struct device *dev)
+{
+ int rc = 0;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_INFO "iucv_pm_prepare\n");
+#endif
+ if (dev->driver && dev->driver->pm && dev->driver->pm->prepare)
+ rc = dev->driver->pm->prepare(dev);
+ return rc;
+}
+
+static void iucv_pm_complete(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_INFO "iucv_pm_complete\n");
+#endif
+ if (dev->driver && dev->driver->pm && dev->driver->pm->complete)
+ dev->driver->pm->complete(dev);
+}
+
+/**
+ * iucv_path_table_empty() - determine if iucv path table is empty
+ *
+ * Returns 0 if there are still iucv pathes defined
+ * 1 if there are no iucv pathes defined
+ */
+int iucv_path_table_empty(void)
+{
+ int i;
+
+ for (i = 0; i < iucv_max_pathid; i++) {
+ if (iucv_path_table[i])
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * iucv_pm_freeze() - Freeze PM callback
+ * @dev: iucv-based device
+ *
+ * disable iucv interrupts
+ * invoke callback function of the iucv-based driver
+ * shut down iucv, if no iucv-pathes are established anymore
+ */
+static int iucv_pm_freeze(struct device *dev)
+{
+ int cpu;
+ struct iucv_irq_list *p, *n;
+ int rc = 0;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "iucv_pm_freeze\n");
+#endif
+ if (iucv_pm_state != IUCV_PM_FREEZING) {
+ for_each_cpu(cpu, &iucv_irq_cpumask)
+ smp_call_function_single(cpu, iucv_block_cpu_almost,
+ NULL, 1);
+ cancel_work_sync(&iucv_work);
+ list_for_each_entry_safe(p, n, &iucv_work_queue, list) {
+ list_del_init(&p->list);
+ iucv_sever_pathid(p->data.ippathid,
+ iucv_error_no_listener);
+ kfree(p);
+ }
+ }
+ iucv_pm_state = IUCV_PM_FREEZING;
+ if (dev->driver && dev->driver->pm && dev->driver->pm->freeze)
+ rc = dev->driver->pm->freeze(dev);
+ if (iucv_path_table_empty())
+ iucv_disable();
+ return rc;
+}
+
+/**
+ * iucv_pm_thaw() - Thaw PM callback
+ * @dev: iucv-based device
+ *
+ * make iucv ready for use again: allocate path table, declare interrupt buffers
+ * and enable iucv interrupts
+ * invoke callback function of the iucv-based driver
+ */
+static int iucv_pm_thaw(struct device *dev)
+{
+ int rc = 0;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "iucv_pm_thaw\n");
+#endif
+ iucv_pm_state = IUCV_PM_THAWING;
+ if (!iucv_path_table) {
+ rc = iucv_enable();
+ if (rc)
+ goto out;
+ }
+ if (cpumask_empty(&iucv_irq_cpumask)) {
+ if (iucv_nonsmp_handler)
+ /* enable interrupts on one cpu */
+ iucv_allow_cpu(NULL);
+ else
+ /* enable interrupts on all cpus */
+ iucv_setmask_mp();
+ }
+ if (dev->driver && dev->driver->pm && dev->driver->pm->thaw)
+ rc = dev->driver->pm->thaw(dev);
+out:
+ return rc;
+}
+
+/**
+ * iucv_pm_restore() - Restore PM callback
+ * @dev: iucv-based device
+ *
+ * make iucv ready for use again: allocate path table, declare interrupt buffers
+ * and enable iucv interrupts
+ * invoke callback function of the iucv-based driver
+ */
+static int iucv_pm_restore(struct device *dev)
+{
+ int rc = 0;
+
+#ifdef CONFIG_PM_DEBUG
+ printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table);
+#endif
+ if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table)
+ pr_warn("Suspending Linux did not completely close all IUCV connections\n");
+ iucv_pm_state = IUCV_PM_RESTORING;
+ if (cpumask_empty(&iucv_irq_cpumask)) {
+ rc = iucv_query_maxconn();
+ rc = iucv_enable();
+ if (rc)
+ goto out;
+ }
+ if (dev->driver && dev->driver->pm && dev->driver->pm->restore)
+ rc = dev->driver->pm->restore(dev);
+out:
+ return rc;
+}
+
+struct iucv_interface iucv_if = {
+ .message_receive = iucv_message_receive,
+ .__message_receive = __iucv_message_receive,
+ .message_reply = iucv_message_reply,
+ .message_reject = iucv_message_reject,
+ .message_send = iucv_message_send,
+ .__message_send = __iucv_message_send,
+ .message_send2way = iucv_message_send2way,
+ .message_purge = iucv_message_purge,
+ .path_accept = iucv_path_accept,
+ .path_connect = iucv_path_connect,
+ .path_quiesce = iucv_path_quiesce,
+ .path_resume = iucv_path_resume,
+ .path_sever = iucv_path_sever,
+ .iucv_register = iucv_register,
+ .iucv_unregister = iucv_unregister,
+ .bus = NULL,
+ .root = NULL,
+};
+EXPORT_SYMBOL(iucv_if);
+
+/**
+ * iucv_init
+ *
+ * Allocates and initializes various data structures.
+ */
+static int __init iucv_init(void)
+{
+ int rc;
+ int cpu;
+
+ if (!MACHINE_IS_VM) {
+ rc = -EPROTONOSUPPORT;
+ goto out;
+ }
+ ctl_set_bit(0, 1);
+ rc = iucv_query_maxconn();
+ if (rc)
+ goto out_ctl;
+ rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
+ if (rc)
+ goto out_ctl;
+ iucv_root = root_device_register("iucv");
+ if (IS_ERR(iucv_root)) {
+ rc = PTR_ERR(iucv_root);
+ goto out_int;
+ }
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ if (alloc_iucv_data(cpu)) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+ }
+ rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
+ if (rc)
+ goto out_free;
+
+ cpu_notifier_register_done();
+
+ rc = register_reboot_notifier(&iucv_reboot_notifier);
+ if (rc)
+ goto out_cpu;
+ ASCEBC(iucv_error_no_listener, 16);
+ ASCEBC(iucv_error_no_memory, 16);
+ ASCEBC(iucv_error_pathid, 16);
+ iucv_available = 1;
+ rc = bus_register(&iucv_bus);
+ if (rc)
+ goto out_reboot;
+ iucv_if.root = iucv_root;
+ iucv_if.bus = &iucv_bus;
+ return 0;
+
+out_reboot:
+ unregister_reboot_notifier(&iucv_reboot_notifier);
+out_cpu:
+ cpu_notifier_register_begin();
+ __unregister_hotcpu_notifier(&iucv_cpu_notifier);
+out_free:
+ for_each_possible_cpu(cpu)
+ free_iucv_data(cpu);
+
+ cpu_notifier_register_done();
+
+ root_device_unregister(iucv_root);
+out_int:
+ unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
+out_ctl:
+ ctl_clear_bit(0, 1);
+out:
+ return rc;
+}
+
+/**
+ * iucv_exit
+ *
+ * Frees everything allocated from iucv_init.
+ */
+static void __exit iucv_exit(void)
+{
+ struct iucv_irq_list *p, *n;
+ int cpu;
+
+ spin_lock_irq(&iucv_queue_lock);
+ list_for_each_entry_safe(p, n, &iucv_task_queue, list)
+ kfree(p);
+ list_for_each_entry_safe(p, n, &iucv_work_queue, list)
+ kfree(p);
+ spin_unlock_irq(&iucv_queue_lock);
+ unregister_reboot_notifier(&iucv_reboot_notifier);
+ cpu_notifier_register_begin();
+ __unregister_hotcpu_notifier(&iucv_cpu_notifier);
+ for_each_possible_cpu(cpu)
+ free_iucv_data(cpu);
+ cpu_notifier_register_done();
+ root_device_unregister(iucv_root);
+ bus_unregister(&iucv_bus);
+ unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
+}
+
+subsys_initcall(iucv_init);
+module_exit(iucv_exit);
+
+MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)");
+MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver");
+MODULE_LICENSE("GPL");
diff --git a/net/key/Makefile b/net/key/Makefile
new file mode 100644
index 000000000..857608042
--- /dev/null
+++ b/net/key/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the key AF.
+#
+
+obj-$(CONFIG_NET_KEY) += af_key.o
diff --git a/net/key/af_key.c b/net/key/af_key.c
new file mode 100644
index 000000000..f0d52d721
--- /dev/null
+++ b/net/key/af_key.c
@@ -0,0 +1,3871 @@
+/*
+ * net/key/af_key.c An implementation of PF_KEYv2 sockets.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Maxim Giryaev <gem@asplinux.ru>
+ * David S. Miller <davem@redhat.com>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org>
+ * Derek Atkins <derek@ihtfp.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/xfrm.h>
+
+#include <net/sock.h>
+
+#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
+#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
+
+static int pfkey_net_id __read_mostly;
+struct netns_pfkey {
+ /* List of all pfkey sockets. */
+ struct hlist_head table;
+ atomic_t socks_nr;
+};
+static DEFINE_MUTEX(pfkey_mutex);
+
+#define DUMMY_MARK 0
+static const struct xfrm_mark dummy_mark = {0, 0};
+struct pfkey_sock {
+ /* struct sock must be the first member of struct pfkey_sock */
+ struct sock sk;
+ int registered;
+ int promisc;
+
+ struct {
+ uint8_t msg_version;
+ uint32_t msg_portid;
+ int (*dump)(struct pfkey_sock *sk);
+ void (*done)(struct pfkey_sock *sk);
+ union {
+ struct xfrm_policy_walk policy;
+ struct xfrm_state_walk state;
+ } u;
+ struct sk_buff *skb;
+ } dump;
+};
+
+static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
+{
+ return (struct pfkey_sock *)sk;
+}
+
+static int pfkey_can_dump(const struct sock *sk)
+{
+ if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf)
+ return 1;
+ return 0;
+}
+
+static void pfkey_terminate_dump(struct pfkey_sock *pfk)
+{
+ if (pfk->dump.dump) {
+ if (pfk->dump.skb) {
+ kfree_skb(pfk->dump.skb);
+ pfk->dump.skb = NULL;
+ }
+ pfk->dump.done(pfk);
+ pfk->dump.dump = NULL;
+ pfk->dump.done = NULL;
+ }
+}
+
+static void pfkey_sock_destruct(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ pfkey_terminate_dump(pfkey_sk(sk));
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive pfkey socket: %p\n", sk);
+ return;
+ }
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+
+ atomic_dec(&net_pfkey->socks_nr);
+}
+
+static const struct proto_ops pfkey_ops;
+
+static void pfkey_insert(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ mutex_lock(&pfkey_mutex);
+ sk_add_node_rcu(sk, &net_pfkey->table);
+ mutex_unlock(&pfkey_mutex);
+}
+
+static void pfkey_remove(struct sock *sk)
+{
+ mutex_lock(&pfkey_mutex);
+ sk_del_node_init_rcu(sk);
+ mutex_unlock(&pfkey_mutex);
+}
+
+static struct proto key_proto = {
+ .name = "KEY",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pfkey_sock),
+};
+
+static int pfkey_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+ struct sock *sk;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+ if (protocol != PF_KEY_V2)
+ return -EPROTONOSUPPORT;
+
+ err = -ENOMEM;
+ sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto);
+ if (sk == NULL)
+ goto out;
+
+ sock->ops = &pfkey_ops;
+ sock_init_data(sock, sk);
+
+ sk->sk_family = PF_KEY;
+ sk->sk_destruct = pfkey_sock_destruct;
+
+ atomic_inc(&net_pfkey->socks_nr);
+
+ pfkey_insert(sk);
+
+ return 0;
+out:
+ return err;
+}
+
+static int pfkey_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ pfkey_remove(sk);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+ skb_queue_purge(&sk->sk_write_queue);
+
+ synchronize_rcu();
+ sock_put(sk);
+
+ return 0;
+}
+
+static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
+ gfp_t allocation, struct sock *sk)
+{
+ int err = -ENOBUFS;
+
+ sock_hold(sk);
+ if (*skb2 == NULL) {
+ if (atomic_read(&skb->users) != 1) {
+ *skb2 = skb_clone(skb, allocation);
+ } else {
+ *skb2 = skb;
+ atomic_inc(&skb->users);
+ }
+ }
+ if (*skb2 != NULL) {
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+ skb_set_owner_r(*skb2, sk);
+ skb_queue_tail(&sk->sk_receive_queue, *skb2);
+ sk->sk_data_ready(sk);
+ *skb2 = NULL;
+ err = 0;
+ }
+ }
+ sock_put(sk);
+ return err;
+}
+
+/* Send SKB to all pfkey sockets matching selected criteria. */
+#define BROADCAST_ALL 0
+#define BROADCAST_ONE 1
+#define BROADCAST_REGISTERED 2
+#define BROADCAST_PROMISC_ONLY 4
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
+ int broadcast_flags, struct sock *one_sk,
+ struct net *net)
+{
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+ struct sock *sk;
+ struct sk_buff *skb2 = NULL;
+ int err = -ESRCH;
+
+ /* XXX Do we need something like netlink_overrun? I think
+ * XXX PF_KEY socket apps will not mind current behavior.
+ */
+ if (!skb)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ sk_for_each_rcu(sk, &net_pfkey->table) {
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+ int err2;
+
+ /* Yes, it means that if you are meant to receive this
+ * pfkey message you receive it twice as promiscuous
+ * socket.
+ */
+ if (pfk->promisc)
+ pfkey_broadcast_one(skb, &skb2, allocation, sk);
+
+ /* the exact target will be processed later */
+ if (sk == one_sk)
+ continue;
+ if (broadcast_flags != BROADCAST_ALL) {
+ if (broadcast_flags & BROADCAST_PROMISC_ONLY)
+ continue;
+ if ((broadcast_flags & BROADCAST_REGISTERED) &&
+ !pfk->registered)
+ continue;
+ if (broadcast_flags & BROADCAST_ONE)
+ continue;
+ }
+
+ err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk);
+
+ /* Error is cleare after succecful sending to at least one
+ * registered KM */
+ if ((broadcast_flags & BROADCAST_REGISTERED) && err)
+ err = err2;
+ }
+ rcu_read_unlock();
+
+ if (one_sk != NULL)
+ err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
+
+ kfree_skb(skb2);
+ kfree_skb(skb);
+ return err;
+}
+
+static int pfkey_do_dump(struct pfkey_sock *pfk)
+{
+ struct sadb_msg *hdr;
+ int rc;
+
+ rc = pfk->dump.dump(pfk);
+ if (rc == -ENOBUFS)
+ return 0;
+
+ if (pfk->dump.skb) {
+ if (!pfkey_can_dump(&pfk->sk))
+ return 0;
+
+ hdr = (struct sadb_msg *) pfk->dump.skb->data;
+ hdr->sadb_msg_seq = 0;
+ hdr->sadb_msg_errno = rc;
+ pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ &pfk->sk, sock_net(&pfk->sk));
+ pfk->dump.skb = NULL;
+ }
+
+ pfkey_terminate_dump(pfk);
+ return rc;
+}
+
+static inline void pfkey_hdr_dup(struct sadb_msg *new,
+ const struct sadb_msg *orig)
+{
+ *new = *orig;
+}
+
+static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
+{
+ struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+ struct sadb_msg *hdr;
+
+ if (!skb)
+ return -ENOBUFS;
+
+ /* Woe be to the platform trying to support PFKEY yet
+ * having normal errnos outside the 1-255 range, inclusive.
+ */
+ err = -err;
+ if (err == ERESTARTSYS ||
+ err == ERESTARTNOHAND ||
+ err == ERESTARTNOINTR)
+ err = EINTR;
+ if (err >= 512)
+ err = EINVAL;
+ BUG_ON(err <= 0 || err >= 256);
+
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ pfkey_hdr_dup(hdr, orig);
+ hdr->sadb_msg_errno = (uint8_t) err;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
+ sizeof(uint64_t));
+
+ pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk));
+
+ return 0;
+}
+
+static const u8 sadb_ext_min_len[] = {
+ [SADB_EXT_RESERVED] = (u8) 0,
+ [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa),
+ [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime),
+ [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime),
+ [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime),
+ [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address),
+ [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address),
+ [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address),
+ [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key),
+ [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key),
+ [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident),
+ [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident),
+ [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens),
+ [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop),
+ [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported),
+ [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported),
+ [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange),
+ [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate),
+ [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy),
+ [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2),
+ [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type),
+ [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
+ [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port),
+ [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address),
+ [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx),
+ [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress),
+ [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter),
+};
+
+/* Verify sadb_address_{len,prefixlen} against sa_family. */
+static int verify_address_len(const void *p)
+{
+ const struct sadb_address *sp = p;
+ const struct sockaddr *addr = (const struct sockaddr *)(sp + 1);
+ const struct sockaddr_in *sin;
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct sockaddr_in6 *sin6;
+#endif
+ int len;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t));
+ if (sp->sadb_address_len != len ||
+ sp->sadb_address_prefixlen > 32)
+ return -EINVAL;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t));
+ if (sp->sadb_address_len != len ||
+ sp->sadb_address_prefixlen > 128)
+ return -EINVAL;
+ break;
+#endif
+ default:
+ /* It is user using kernel to keep track of security
+ * associations for another protocol, such as
+ * OSPF/RSVP/RIPV2/MIP. It is user's job to verify
+ * lengths.
+ *
+ * XXX Actually, association/policy database is not yet
+ * XXX able to cope with arbitrary sockaddr families.
+ * XXX When it can, remove this -EINVAL. -DaveM
+ */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx)
+{
+ return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) +
+ sec_ctx->sadb_x_ctx_len,
+ sizeof(uint64_t));
+}
+
+static inline int verify_sec_ctx_len(const void *p)
+{
+ const struct sadb_x_sec_ctx *sec_ctx = p;
+ int len = sec_ctx->sadb_x_ctx_len;
+
+ if (len > PAGE_SIZE)
+ return -EINVAL;
+
+ len = pfkey_sec_ctx_len(sec_ctx);
+
+ if (sec_ctx->sadb_x_sec_len != len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx,
+ gfp_t gfp)
+{
+ struct xfrm_user_sec_ctx *uctx = NULL;
+ int ctx_size = sec_ctx->sadb_x_ctx_len;
+
+ uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp);
+
+ if (!uctx)
+ return NULL;
+
+ uctx->len = pfkey_sec_ctx_len(sec_ctx);
+ uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+ uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+ uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+ uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
+ memcpy(uctx + 1, sec_ctx + 1,
+ uctx->ctx_len);
+
+ return uctx;
+}
+
+static int present_and_same_family(const struct sadb_address *src,
+ const struct sadb_address *dst)
+{
+ const struct sockaddr *s_addr, *d_addr;
+
+ if (!src || !dst)
+ return 0;
+
+ s_addr = (const struct sockaddr *)(src + 1);
+ d_addr = (const struct sockaddr *)(dst + 1);
+ if (s_addr->sa_family != d_addr->sa_family)
+ return 0;
+ if (s_addr->sa_family != AF_INET
+#if IS_ENABLED(CONFIG_IPV6)
+ && s_addr->sa_family != AF_INET6
+#endif
+ )
+ return 0;
+
+ return 1;
+}
+
+static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs)
+{
+ const char *p = (char *) hdr;
+ int len = skb->len;
+
+ len -= sizeof(*hdr);
+ p += sizeof(*hdr);
+ while (len > 0) {
+ const struct sadb_ext *ehdr = (const struct sadb_ext *) p;
+ uint16_t ext_type;
+ int ext_len;
+
+ ext_len = ehdr->sadb_ext_len;
+ ext_len *= sizeof(uint64_t);
+ ext_type = ehdr->sadb_ext_type;
+ if (ext_len < sizeof(uint64_t) ||
+ ext_len > len ||
+ ext_type == SADB_EXT_RESERVED)
+ return -EINVAL;
+
+ if (ext_type <= SADB_EXT_MAX) {
+ int min = (int) sadb_ext_min_len[ext_type];
+ if (ext_len < min)
+ return -EINVAL;
+ if (ext_hdrs[ext_type-1] != NULL)
+ return -EINVAL;
+ if (ext_type == SADB_EXT_ADDRESS_SRC ||
+ ext_type == SADB_EXT_ADDRESS_DST ||
+ ext_type == SADB_EXT_ADDRESS_PROXY ||
+ ext_type == SADB_X_EXT_NAT_T_OA) {
+ if (verify_address_len(p))
+ return -EINVAL;
+ }
+ if (ext_type == SADB_X_EXT_SEC_CTX) {
+ if (verify_sec_ctx_len(p))
+ return -EINVAL;
+ }
+ ext_hdrs[ext_type-1] = (void *) p;
+ }
+ p += ext_len;
+ len -= ext_len;
+ }
+
+ return 0;
+}
+
+static uint16_t
+pfkey_satype2proto(uint8_t satype)
+{
+ switch (satype) {
+ case SADB_SATYPE_UNSPEC:
+ return IPSEC_PROTO_ANY;
+ case SADB_SATYPE_AH:
+ return IPPROTO_AH;
+ case SADB_SATYPE_ESP:
+ return IPPROTO_ESP;
+ case SADB_X_SATYPE_IPCOMP:
+ return IPPROTO_COMP;
+ default:
+ return 0;
+ }
+ /* NOTREACHED */
+}
+
+static uint8_t
+pfkey_proto2satype(uint16_t proto)
+{
+ switch (proto) {
+ case IPPROTO_AH:
+ return SADB_SATYPE_AH;
+ case IPPROTO_ESP:
+ return SADB_SATYPE_ESP;
+ case IPPROTO_COMP:
+ return SADB_X_SATYPE_IPCOMP;
+ default:
+ return 0;
+ }
+ /* NOTREACHED */
+}
+
+/* BTW, this scheme means that there is no way with PFKEY2 sockets to
+ * say specifically 'just raw sockets' as we encode them as 255.
+ */
+
+static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
+{
+ return proto == IPSEC_PROTO_ANY ? 0 : proto;
+}
+
+static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
+{
+ return proto ? proto : IPSEC_PROTO_ANY;
+}
+
+static inline int pfkey_sockaddr_len(sa_family_t family)
+{
+ switch (family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+#endif
+ }
+ return 0;
+}
+
+static
+int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr)
+{
+ switch (sa->sa_family) {
+ case AF_INET:
+ xaddr->a4 =
+ ((struct sockaddr_in *)sa)->sin_addr.s_addr;
+ return AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ memcpy(xaddr->a6,
+ &((struct sockaddr_in6 *)sa)->sin6_addr,
+ sizeof(struct in6_addr));
+ return AF_INET6;
+#endif
+ }
+ return 0;
+}
+
+static
+int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr)
+{
+ return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1),
+ xaddr);
+}
+
+static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ const struct sadb_sa *sa;
+ const struct sadb_address *addr;
+ uint16_t proto;
+ unsigned short family;
+ xfrm_address_t *xaddr;
+
+ sa = ext_hdrs[SADB_EXT_SA - 1];
+ if (sa == NULL)
+ return NULL;
+
+ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+ if (proto == 0)
+ return NULL;
+
+ /* sadb_address_len should be checked by caller */
+ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1];
+ if (addr == NULL)
+ return NULL;
+
+ family = ((const struct sockaddr *)(addr + 1))->sa_family;
+ switch (family) {
+ case AF_INET:
+ xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr;
+ break;
+#endif
+ default:
+ xaddr = NULL;
+ }
+
+ if (!xaddr)
+ return NULL;
+
+ return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family);
+}
+
+#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1)))
+
+static int
+pfkey_sockaddr_size(sa_family_t family)
+{
+ return PFKEY_ALIGN8(pfkey_sockaddr_len(family));
+}
+
+static inline int pfkey_mode_from_xfrm(int mode)
+{
+ switch(mode) {
+ case XFRM_MODE_TRANSPORT:
+ return IPSEC_MODE_TRANSPORT;
+ case XFRM_MODE_TUNNEL:
+ return IPSEC_MODE_TUNNEL;
+ case XFRM_MODE_BEET:
+ return IPSEC_MODE_BEET;
+ default:
+ return -1;
+ }
+}
+
+static inline int pfkey_mode_to_xfrm(int mode)
+{
+ switch(mode) {
+ case IPSEC_MODE_ANY: /*XXX*/
+ case IPSEC_MODE_TRANSPORT:
+ return XFRM_MODE_TRANSPORT;
+ case IPSEC_MODE_TUNNEL:
+ return XFRM_MODE_TUNNEL;
+ case IPSEC_MODE_BEET:
+ return XFRM_MODE_BEET;
+ default:
+ return -1;
+ }
+}
+
+static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port,
+ struct sockaddr *sa,
+ unsigned short family)
+{
+ switch (family) {
+ case AF_INET:
+ {
+ struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+ sin->sin_family = AF_INET;
+ sin->sin_port = port;
+ sin->sin_addr.s_addr = xaddr->a4;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ return 32;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = port;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_addr = xaddr->in6;
+ sin6->sin6_scope_id = 0;
+ return 128;
+ }
+#endif
+ }
+ return 0;
+}
+
+static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
+ int add_keys, int hsc)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ struct sadb_sa *sa;
+ struct sadb_lifetime *lifetime;
+ struct sadb_address *addr;
+ struct sadb_key *key;
+ struct sadb_x_sa2 *sa2;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
+ int ctx_size = 0;
+ int size;
+ int auth_key_size = 0;
+ int encrypt_key_size = 0;
+ int sockaddr_size;
+ struct xfrm_encap_tmpl *natt = NULL;
+ int mode;
+
+ /* address family check */
+ sockaddr_size = pfkey_sockaddr_size(x->props.family);
+ if (!sockaddr_size)
+ return ERR_PTR(-EINVAL);
+
+ /* base, SA, (lifetime (HSC),) address(SD), (address(P),)
+ key(AE), (identity(SD),) (sensitivity)> */
+ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) +
+ sizeof(struct sadb_lifetime) +
+ ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) +
+ ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) +
+ sizeof(struct sadb_address)*2 +
+ sockaddr_size*2 +
+ sizeof(struct sadb_x_sa2);
+
+ if ((xfrm_ctx = x->security)) {
+ ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+ size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+ }
+
+ /* identity & sensitivity */
+ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family))
+ size += sizeof(struct sadb_address) + sockaddr_size;
+
+ if (add_keys) {
+ if (x->aalg && x->aalg->alg_key_len) {
+ auth_key_size =
+ PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8);
+ size += sizeof(struct sadb_key) + auth_key_size;
+ }
+ if (x->ealg && x->ealg->alg_key_len) {
+ encrypt_key_size =
+ PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8);
+ size += sizeof(struct sadb_key) + encrypt_key_size;
+ }
+ }
+ if (x->encap)
+ natt = x->encap;
+
+ if (natt && natt->encap_type) {
+ size += sizeof(struct sadb_x_nat_t_type);
+ size += sizeof(struct sadb_x_nat_t_port);
+ size += sizeof(struct sadb_x_nat_t_port);
+ }
+
+ skb = alloc_skb(size + 16, GFP_ATOMIC);
+ if (skb == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ /* call should fill header later */
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ memset(hdr, 0, size); /* XXX do we need this ? */
+ hdr->sadb_msg_len = size / sizeof(uint64_t);
+
+ /* sa */
+ sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa));
+ sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
+ sa->sadb_sa_exttype = SADB_EXT_SA;
+ sa->sadb_sa_spi = x->id.spi;
+ sa->sadb_sa_replay = x->props.replay_window;
+ switch (x->km.state) {
+ case XFRM_STATE_VALID:
+ sa->sadb_sa_state = x->km.dying ?
+ SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
+ break;
+ case XFRM_STATE_ACQ:
+ sa->sadb_sa_state = SADB_SASTATE_LARVAL;
+ break;
+ default:
+ sa->sadb_sa_state = SADB_SASTATE_DEAD;
+ break;
+ }
+ sa->sadb_sa_auth = 0;
+ if (x->aalg) {
+ struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ sa->sadb_sa_auth = (a && a->pfkey_supported) ?
+ a->desc.sadb_alg_id : 0;
+ }
+ sa->sadb_sa_encrypt = 0;
+ BUG_ON(x->ealg && x->calg);
+ if (x->ealg) {
+ struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0);
+ sa->sadb_sa_encrypt = (a && a->pfkey_supported) ?
+ a->desc.sadb_alg_id : 0;
+ }
+ /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */
+ if (x->calg) {
+ struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0);
+ sa->sadb_sa_encrypt = (a && a->pfkey_supported) ?
+ a->desc.sadb_alg_id : 0;
+ }
+
+ sa->sadb_sa_flags = 0;
+ if (x->props.flags & XFRM_STATE_NOECN)
+ sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
+ if (x->props.flags & XFRM_STATE_NOPMTUDISC)
+ sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
+
+ /* hard time */
+ if (hsc & 2) {
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+ lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit);
+ lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit);
+ lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds;
+ lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds;
+ }
+ /* soft time */
+ if (hsc & 1) {
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+ lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit);
+ lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit);
+ lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds;
+ lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds;
+ }
+ /* current time */
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+ lifetime->sadb_lifetime_allocations = x->curlft.packets;
+ lifetime->sadb_lifetime_bytes = x->curlft.bytes;
+ lifetime->sadb_lifetime_addtime = x->curlft.add_time;
+ lifetime->sadb_lifetime_usetime = x->curlft.use_time;
+ /* src address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+ /* "if the ports are non-zero, then the sadb_address_proto field,
+ normally zero, MUST be filled in with the transport
+ protocol's number." - RFC2367 */
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(&x->props.saddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ /* dst address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(&x->id.daddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr,
+ x->props.family)) {
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
+ addr->sadb_address_proto =
+ pfkey_proto_from_xfrm(x->sel.proto);
+ addr->sadb_address_prefixlen = x->sel.prefixlen_s;
+ addr->sadb_address_reserved = 0;
+
+ pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ }
+
+ /* auth key */
+ if (add_keys && auth_key_size) {
+ key = (struct sadb_key *) skb_put(skb,
+ sizeof(struct sadb_key)+auth_key_size);
+ key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) /
+ sizeof(uint64_t);
+ key->sadb_key_exttype = SADB_EXT_KEY_AUTH;
+ key->sadb_key_bits = x->aalg->alg_key_len;
+ key->sadb_key_reserved = 0;
+ memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8);
+ }
+ /* encrypt key */
+ if (add_keys && encrypt_key_size) {
+ key = (struct sadb_key *) skb_put(skb,
+ sizeof(struct sadb_key)+encrypt_key_size);
+ key->sadb_key_len = (sizeof(struct sadb_key) +
+ encrypt_key_size) / sizeof(uint64_t);
+ key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
+ key->sadb_key_bits = x->ealg->alg_key_len;
+ key->sadb_key_reserved = 0;
+ memcpy(key + 1, x->ealg->alg_key,
+ (x->ealg->alg_key_len+7)/8);
+ }
+
+ /* sa */
+ sa2 = (struct sadb_x_sa2 *) skb_put(skb, sizeof(struct sadb_x_sa2));
+ sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t);
+ sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
+ if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+ }
+ sa2->sadb_x_sa2_mode = mode;
+ sa2->sadb_x_sa2_reserved1 = 0;
+ sa2->sadb_x_sa2_reserved2 = 0;
+ sa2->sadb_x_sa2_sequence = 0;
+ sa2->sadb_x_sa2_reqid = x->props.reqid;
+
+ if (natt && natt->encap_type) {
+ struct sadb_x_nat_t_type *n_type;
+ struct sadb_x_nat_t_port *n_port;
+
+ /* type */
+ n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type));
+ n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t);
+ n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
+ n_type->sadb_x_nat_t_type_type = natt->encap_type;
+ n_type->sadb_x_nat_t_type_reserved[0] = 0;
+ n_type->sadb_x_nat_t_type_reserved[1] = 0;
+ n_type->sadb_x_nat_t_type_reserved[2] = 0;
+
+ /* source port */
+ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
+ n_port->sadb_x_nat_t_port_port = natt->encap_sport;
+ n_port->sadb_x_nat_t_port_reserved = 0;
+
+ /* dest port */
+ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
+ n_port->sadb_x_nat_t_port_port = natt->encap_dport;
+ n_port->sadb_x_nat_t_port_reserved = 0;
+ }
+
+ /* security context */
+ if (xfrm_ctx) {
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+ sizeof(struct sadb_x_sec_ctx) + ctx_size);
+ sec_ctx->sadb_x_sec_len =
+ (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
+ return skb;
+}
+
+
+static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x)
+{
+ struct sk_buff *skb;
+
+ skb = __pfkey_xfrm_state2msg(x, 1, 3);
+
+ return skb;
+}
+
+static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x,
+ int hsc)
+{
+ return __pfkey_xfrm_state2msg(x, 0, hsc);
+}
+
+static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
+ const struct sadb_msg *hdr,
+ void * const *ext_hdrs)
+{
+ struct xfrm_state *x;
+ const struct sadb_lifetime *lifetime;
+ const struct sadb_sa *sa;
+ const struct sadb_key *key;
+ const struct sadb_x_sec_ctx *sec_ctx;
+ uint16_t proto;
+ int err;
+
+
+ sa = ext_hdrs[SADB_EXT_SA - 1];
+ if (!sa ||
+ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+ return ERR_PTR(-EINVAL);
+ if (hdr->sadb_msg_satype == SADB_SATYPE_ESP &&
+ !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1])
+ return ERR_PTR(-EINVAL);
+ if (hdr->sadb_msg_satype == SADB_SATYPE_AH &&
+ !ext_hdrs[SADB_EXT_KEY_AUTH-1])
+ return ERR_PTR(-EINVAL);
+ if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] !=
+ !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1])
+ return ERR_PTR(-EINVAL);
+
+ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+ if (proto == 0)
+ return ERR_PTR(-EINVAL);
+
+ /* default error is no buffer space */
+ err = -ENOBUFS;
+
+ /* RFC2367:
+
+ Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message.
+ SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not
+ sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state.
+ Therefore, the sadb_sa_state field of all submitted SAs MUST be
+ SADB_SASTATE_MATURE and the kernel MUST return an error if this is
+ not true.
+
+ However, KAME setkey always uses SADB_SASTATE_LARVAL.
+ Hence, we have to _ignore_ sadb_sa_state, which is also reasonable.
+ */
+ if (sa->sadb_sa_auth > SADB_AALG_MAX ||
+ (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP &&
+ sa->sadb_sa_encrypt > SADB_X_CALG_MAX) ||
+ sa->sadb_sa_encrypt > SADB_EALG_MAX)
+ return ERR_PTR(-EINVAL);
+ key = ext_hdrs[SADB_EXT_KEY_AUTH - 1];
+ if (key != NULL &&
+ sa->sadb_sa_auth != SADB_X_AALG_NULL &&
+ ((key->sadb_key_bits+7) / 8 == 0 ||
+ (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t)))
+ return ERR_PTR(-EINVAL);
+ key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
+ if (key != NULL &&
+ sa->sadb_sa_encrypt != SADB_EALG_NULL &&
+ ((key->sadb_key_bits+7) / 8 == 0 ||
+ (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t)))
+ return ERR_PTR(-EINVAL);
+
+ x = xfrm_state_alloc(net);
+ if (x == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ x->id.proto = proto;
+ x->id.spi = sa->sadb_sa_spi;
+ x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay,
+ (sizeof(x->replay.bitmap) * 8));
+ if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN)
+ x->props.flags |= XFRM_STATE_NOECN;
+ if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
+ x->props.flags |= XFRM_STATE_DECAP_DSCP;
+ if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
+
+ lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1];
+ if (lifetime != NULL) {
+ x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+ x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+ x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+ x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+ }
+ lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1];
+ if (lifetime != NULL) {
+ x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+ x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+ x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+ x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+ }
+
+ sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL);
+
+ if (!uctx)
+ goto out;
+
+ err = security_xfrm_state_alloc(x, uctx);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
+ key = ext_hdrs[SADB_EXT_KEY_AUTH - 1];
+ if (sa->sadb_sa_auth) {
+ int keysize = 0;
+ struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth);
+ if (!a || !a->pfkey_supported) {
+ err = -ENOSYS;
+ goto out;
+ }
+ if (key)
+ keysize = (key->sadb_key_bits + 7) / 8;
+ x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL);
+ if (!x->aalg)
+ goto out;
+ strcpy(x->aalg->alg_name, a->name);
+ x->aalg->alg_key_len = 0;
+ if (key) {
+ x->aalg->alg_key_len = key->sadb_key_bits;
+ memcpy(x->aalg->alg_key, key+1, keysize);
+ }
+ x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits;
+ x->props.aalgo = sa->sadb_sa_auth;
+ /* x->algo.flags = sa->sadb_sa_flags; */
+ }
+ if (sa->sadb_sa_encrypt) {
+ if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) {
+ struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt);
+ if (!a || !a->pfkey_supported) {
+ err = -ENOSYS;
+ goto out;
+ }
+ x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL);
+ if (!x->calg)
+ goto out;
+ strcpy(x->calg->alg_name, a->name);
+ x->props.calgo = sa->sadb_sa_encrypt;
+ } else {
+ int keysize = 0;
+ struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt);
+ if (!a || !a->pfkey_supported) {
+ err = -ENOSYS;
+ goto out;
+ }
+ key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
+ if (key)
+ keysize = (key->sadb_key_bits + 7) / 8;
+ x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL);
+ if (!x->ealg)
+ goto out;
+ strcpy(x->ealg->alg_name, a->name);
+ x->ealg->alg_key_len = 0;
+ if (key) {
+ x->ealg->alg_key_len = key->sadb_key_bits;
+ memcpy(x->ealg->alg_key, key+1, keysize);
+ }
+ x->props.ealgo = sa->sadb_sa_encrypt;
+ }
+ }
+ /* x->algo.flags = sa->sadb_sa_flags; */
+
+ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ &x->props.saddr);
+ pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+ &x->id.daddr);
+
+ if (ext_hdrs[SADB_X_EXT_SA2-1]) {
+ const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1];
+ int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode);
+ if (mode < 0) {
+ err = -EINVAL;
+ goto out;
+ }
+ x->props.mode = mode;
+ x->props.reqid = sa2->sadb_x_sa2_reqid;
+ }
+
+ if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) {
+ const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1];
+
+ /* Nobody uses this, but we try. */
+ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr);
+ x->sel.prefixlen_s = addr->sadb_address_prefixlen;
+ }
+
+ if (!x->sel.family)
+ x->sel.family = x->props.family;
+
+ if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
+ const struct sadb_x_nat_t_type* n_type;
+ struct xfrm_encap_tmpl *natt;
+
+ x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
+ if (!x->encap)
+ goto out;
+
+ natt = x->encap;
+ n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1];
+ natt->encap_type = n_type->sadb_x_nat_t_type_type;
+
+ if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) {
+ const struct sadb_x_nat_t_port *n_port =
+ ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1];
+ natt->encap_sport = n_port->sadb_x_nat_t_port_port;
+ }
+ if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) {
+ const struct sadb_x_nat_t_port *n_port =
+ ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1];
+ natt->encap_dport = n_port->sadb_x_nat_t_port_port;
+ }
+ memset(&natt->encap_oa, 0, sizeof(natt->encap_oa));
+ }
+
+ err = xfrm_init_state(x);
+ if (err)
+ goto out;
+
+ x->km.seq = hdr->sadb_msg_seq;
+ return x;
+
+out:
+ x->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(x);
+ return ERR_PTR(err);
+}
+
+static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ return -EOPNOTSUPP;
+}
+
+static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ struct sk_buff *resp_skb;
+ struct sadb_x_sa2 *sa2;
+ struct sadb_address *saddr, *daddr;
+ struct sadb_msg *out_hdr;
+ struct sadb_spirange *range;
+ struct xfrm_state *x = NULL;
+ int mode;
+ int err;
+ u32 min_spi, max_spi;
+ u32 reqid;
+ u8 proto;
+ unsigned short family;
+ xfrm_address_t *xsaddr = NULL, *xdaddr = NULL;
+
+ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+ return -EINVAL;
+
+ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+ if (proto == 0)
+ return -EINVAL;
+
+ if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) {
+ mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode);
+ if (mode < 0)
+ return -EINVAL;
+ reqid = sa2->sadb_x_sa2_reqid;
+ } else {
+ mode = 0;
+ reqid = 0;
+ }
+
+ saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+ daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+
+ family = ((struct sockaddr *)(saddr + 1))->sa_family;
+ switch (family) {
+ case AF_INET:
+ xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr;
+ xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr;
+ xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr;
+ break;
+#endif
+ }
+
+ if (hdr->sadb_msg_seq) {
+ x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+ if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) {
+ xfrm_state_put(x);
+ x = NULL;
+ }
+ }
+
+ if (!x)
+ x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+
+ if (x == NULL)
+ return -ENOENT;
+
+ min_spi = 0x100;
+ max_spi = 0x0fffffff;
+
+ range = ext_hdrs[SADB_EXT_SPIRANGE-1];
+ if (range) {
+ min_spi = range->sadb_spirange_min;
+ max_spi = range->sadb_spirange_max;
+ }
+
+ err = verify_spi_info(x->id.proto, min_spi, max_spi);
+ if (err) {
+ xfrm_state_put(x);
+ return err;
+ }
+
+ err = xfrm_alloc_spi(x, min_spi, max_spi);
+ resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x);
+
+ if (IS_ERR(resp_skb)) {
+ xfrm_state_put(x);
+ return PTR_ERR(resp_skb);
+ }
+
+ out_hdr = (struct sadb_msg *) resp_skb->data;
+ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+ out_hdr->sadb_msg_type = SADB_GETSPI;
+ out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_reserved = 0;
+ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+
+ xfrm_state_put(x);
+
+ pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net);
+
+ return 0;
+}
+
+static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ struct xfrm_state *x;
+
+ if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8)
+ return -EOPNOTSUPP;
+
+ if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0)
+ return 0;
+
+ x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+ if (x == NULL)
+ return 0;
+
+ spin_lock_bh(&x->lock);
+ if (x->km.state == XFRM_STATE_ACQ)
+ x->km.state = XFRM_STATE_ERROR;
+
+ spin_unlock_bh(&x->lock);
+ xfrm_state_put(x);
+ return 0;
+}
+
+static inline int event2poltype(int event)
+{
+ switch (event) {
+ case XFRM_MSG_DELPOLICY:
+ return SADB_X_SPDDELETE;
+ case XFRM_MSG_NEWPOLICY:
+ return SADB_X_SPDADD;
+ case XFRM_MSG_UPDPOLICY:
+ return SADB_X_SPDUPDATE;
+ case XFRM_MSG_POLEXPIRE:
+ // return SADB_X_SPDEXPIRE;
+ default:
+ pr_err("pfkey: Unknown policy event %d\n", event);
+ break;
+ }
+
+ return 0;
+}
+
+static inline int event2keytype(int event)
+{
+ switch (event) {
+ case XFRM_MSG_DELSA:
+ return SADB_DELETE;
+ case XFRM_MSG_NEWSA:
+ return SADB_ADD;
+ case XFRM_MSG_UPDSA:
+ return SADB_UPDATE;
+ case XFRM_MSG_EXPIRE:
+ return SADB_EXPIRE;
+ default:
+ pr_err("pfkey: Unknown SA event %d\n", event);
+ break;
+ }
+
+ return 0;
+}
+
+/* ADD/UPD/DEL */
+static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+
+ skb = pfkey_xfrm_state2msg(x);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ hdr = (struct sadb_msg *) skb->data;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_type = event2keytype(c->event);
+ hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_reserved = 0;
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->portid;
+
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
+
+ return 0;
+}
+
+static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ struct xfrm_state *x;
+ int err;
+ struct km_event c;
+
+ x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs);
+ if (IS_ERR(x))
+ return PTR_ERR(x);
+
+ xfrm_state_hold(x);
+ if (hdr->sadb_msg_type == SADB_ADD)
+ err = xfrm_state_add(x);
+ else
+ err = xfrm_state_update(x);
+
+ xfrm_audit_state_add(x, err ? 0 : 1, true);
+
+ if (err < 0) {
+ x->km.state = XFRM_STATE_DEAD;
+ __xfrm_state_put(x);
+ goto out;
+ }
+
+ if (hdr->sadb_msg_type == SADB_ADD)
+ c.event = XFRM_MSG_NEWSA;
+ else
+ c.event = XFRM_MSG_UPDSA;
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+ km_state_notify(x, &c);
+out:
+ xfrm_state_put(x);
+ return err;
+}
+
+static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ struct xfrm_state *x;
+ struct km_event c;
+ int err;
+
+ if (!ext_hdrs[SADB_EXT_SA-1] ||
+ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+ return -EINVAL;
+
+ x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs);
+ if (x == NULL)
+ return -ESRCH;
+
+ if ((err = security_xfrm_state_delete(x)))
+ goto out;
+
+ if (xfrm_state_kern(x)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ err = xfrm_state_delete(x);
+
+ if (err < 0)
+ goto out;
+
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+ c.event = XFRM_MSG_DELSA;
+ km_state_notify(x, &c);
+out:
+ xfrm_audit_state_delete(x, err ? 0 : 1, true);
+ xfrm_state_put(x);
+
+ return err;
+}
+
+static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ __u8 proto;
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ struct xfrm_state *x;
+
+ if (!ext_hdrs[SADB_EXT_SA-1] ||
+ !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+ return -EINVAL;
+
+ x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs);
+ if (x == NULL)
+ return -ESRCH;
+
+ out_skb = pfkey_xfrm_state2msg(x);
+ proto = x->id.proto;
+ xfrm_state_put(x);
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+ out_hdr->sadb_msg_type = SADB_GET;
+ out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_reserved = 0;
+ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
+
+ return 0;
+}
+
+static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig,
+ gfp_t allocation)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ int len, auth_len, enc_len, i;
+
+ auth_len = xfrm_count_pfkey_auth_supported();
+ if (auth_len) {
+ auth_len *= sizeof(struct sadb_alg);
+ auth_len += sizeof(struct sadb_supported);
+ }
+
+ enc_len = xfrm_count_pfkey_enc_supported();
+ if (enc_len) {
+ enc_len *= sizeof(struct sadb_alg);
+ enc_len += sizeof(struct sadb_supported);
+ }
+
+ len = enc_len + auth_len + sizeof(struct sadb_msg);
+
+ skb = alloc_skb(len + 16, allocation);
+ if (!skb)
+ goto out_put_algs;
+
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr));
+ pfkey_hdr_dup(hdr, orig);
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_len = len / sizeof(uint64_t);
+
+ if (auth_len) {
+ struct sadb_supported *sp;
+ struct sadb_alg *ap;
+
+ sp = (struct sadb_supported *) skb_put(skb, auth_len);
+ ap = (struct sadb_alg *) (sp + 1);
+
+ sp->sadb_supported_len = auth_len / sizeof(uint64_t);
+ sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
+
+ for (i = 0; ; i++) {
+ struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+ if (!aalg)
+ break;
+ if (!aalg->pfkey_supported)
+ continue;
+ if (aalg->available)
+ *ap++ = aalg->desc;
+ }
+ }
+
+ if (enc_len) {
+ struct sadb_supported *sp;
+ struct sadb_alg *ap;
+
+ sp = (struct sadb_supported *) skb_put(skb, enc_len);
+ ap = (struct sadb_alg *) (sp + 1);
+
+ sp->sadb_supported_len = enc_len / sizeof(uint64_t);
+ sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
+
+ for (i = 0; ; i++) {
+ struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+ if (!ealg)
+ break;
+ if (!ealg->pfkey_supported)
+ continue;
+ if (ealg->available)
+ *ap++ = ealg->desc;
+ }
+ }
+
+out_put_algs:
+ return skb;
+}
+
+static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+ struct sk_buff *supp_skb;
+
+ if (hdr->sadb_msg_satype > SADB_SATYPE_MAX)
+ return -EINVAL;
+
+ if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) {
+ if (pfk->registered&(1<<hdr->sadb_msg_satype))
+ return -EEXIST;
+ pfk->registered |= (1<<hdr->sadb_msg_satype);
+ }
+
+ xfrm_probe_algs();
+
+ supp_skb = compose_sadb_supported(hdr, GFP_KERNEL);
+ if (!supp_skb) {
+ if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC)
+ pfk->registered &= ~(1<<hdr->sadb_msg_satype);
+
+ return -ENOBUFS;
+ }
+
+ pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk));
+
+ return 0;
+}
+
+static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+
+ skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+ if (!skb)
+ return -ENOBUFS;
+
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ memcpy(hdr, ihdr, sizeof(struct sadb_msg));
+ hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+
+ return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
+}
+
+static int key_notify_sa_flush(const struct km_event *c)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+
+ skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+ if (!skb)
+ return -ENOBUFS;
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
+ hdr->sadb_msg_type = SADB_FLUSH;
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->portid;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+ hdr->sadb_msg_reserved = 0;
+
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+
+ return 0;
+}
+
+static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ unsigned int proto;
+ struct km_event c;
+ int err, err2;
+
+ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+ if (proto == 0)
+ return -EINVAL;
+
+ err = xfrm_state_flush(net, proto, true);
+ err2 = unicast_flush_resp(sk, hdr);
+ if (err || err2) {
+ if (err == -ESRCH) /* empty table - go quietly */
+ err = 0;
+ return err ? err : err2;
+ }
+
+ c.data.proto = proto;
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+ c.event = XFRM_MSG_FLUSHSA;
+ c.net = net;
+ km_state_notify(NULL, &c);
+
+ return 0;
+}
+
+static int dump_sa(struct xfrm_state *x, int count, void *ptr)
+{
+ struct pfkey_sock *pfk = ptr;
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+
+ if (!pfkey_can_dump(&pfk->sk))
+ return -ENOBUFS;
+
+ out_skb = pfkey_xfrm_state2msg(x);
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = pfk->dump.msg_version;
+ out_hdr->sadb_msg_type = SADB_DUMP;
+ out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_reserved = 0;
+ out_hdr->sadb_msg_seq = count + 1;
+ out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
+
+ if (pfk->dump.skb)
+ pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ &pfk->sk, sock_net(&pfk->sk));
+ pfk->dump.skb = out_skb;
+
+ return 0;
+}
+
+static int pfkey_dump_sa(struct pfkey_sock *pfk)
+{
+ struct net *net = sock_net(&pfk->sk);
+ return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk);
+}
+
+static void pfkey_dump_sa_done(struct pfkey_sock *pfk)
+{
+ struct net *net = sock_net(&pfk->sk);
+
+ xfrm_state_walk_done(&pfk->dump.u.state, net);
+}
+
+static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ u8 proto;
+ struct xfrm_address_filter *filter = NULL;
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+
+ if (pfk->dump.dump != NULL)
+ return -EBUSY;
+
+ proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+ if (proto == 0)
+ return -EINVAL;
+
+ if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
+ struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
+
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (filter == NULL)
+ return -ENOMEM;
+
+ memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr,
+ sizeof(xfrm_address_t));
+ memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr,
+ sizeof(xfrm_address_t));
+ filter->family = xfilter->sadb_x_filter_family;
+ filter->splen = xfilter->sadb_x_filter_splen;
+ filter->dplen = xfilter->sadb_x_filter_dplen;
+ }
+
+ pfk->dump.msg_version = hdr->sadb_msg_version;
+ pfk->dump.msg_portid = hdr->sadb_msg_pid;
+ pfk->dump.dump = pfkey_dump_sa;
+ pfk->dump.done = pfkey_dump_sa_done;
+ xfrm_state_walk_init(&pfk->dump.u.state, proto, filter);
+
+ return pfkey_do_dump(pfk);
+}
+
+static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+ int satype = hdr->sadb_msg_satype;
+ bool reset_errno = false;
+
+ if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) {
+ reset_errno = true;
+ if (satype != 0 && satype != 1)
+ return -EINVAL;
+ pfk->promisc = satype;
+ }
+ if (reset_errno && skb_cloned(skb))
+ skb = skb_copy(skb, GFP_KERNEL);
+ else
+ skb = skb_clone(skb, GFP_KERNEL);
+
+ if (reset_errno && skb) {
+ struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data;
+ new_hdr->sadb_msg_errno = 0;
+ }
+
+ pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
+ return 0;
+}
+
+static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+ int i;
+ u32 reqid = *(u32*)ptr;
+
+ for (i=0; i<xp->xfrm_nr; i++) {
+ if (xp->xfrm_vec[i].reqid == reqid)
+ return -EEXIST;
+ }
+ return 0;
+}
+
+static u32 gen_reqid(struct net *net)
+{
+ struct xfrm_policy_walk walk;
+ u32 start;
+ int rc;
+ static u32 reqid = IPSEC_MANUAL_REQID_MAX;
+
+ start = reqid;
+ do {
+ ++reqid;
+ if (reqid == 0)
+ reqid = IPSEC_MANUAL_REQID_MAX+1;
+ xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN);
+ rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid);
+ xfrm_policy_walk_done(&walk, net);
+ if (rc != -EEXIST)
+ return reqid;
+ } while (reqid != start);
+ return 0;
+}
+
+static int
+parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
+{
+ struct net *net = xp_net(xp);
+ struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr;
+ int mode;
+
+ if (xp->xfrm_nr >= XFRM_MAX_DEPTH)
+ return -ELOOP;
+
+ if (rq->sadb_x_ipsecrequest_mode == 0)
+ return -EINVAL;
+
+ t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */
+ if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0)
+ return -EINVAL;
+ t->mode = mode;
+ if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE)
+ t->optional = 1;
+ else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
+ t->reqid = rq->sadb_x_ipsecrequest_reqid;
+ if (t->reqid > IPSEC_MANUAL_REQID_MAX)
+ t->reqid = 0;
+ if (!t->reqid && !(t->reqid = gen_reqid(net)))
+ return -ENOBUFS;
+ }
+
+ /* addresses present only in tunnel mode */
+ if (t->mode == XFRM_MODE_TUNNEL) {
+ u8 *sa = (u8 *) (rq + 1);
+ int family, socklen;
+
+ family = pfkey_sockaddr_extract((struct sockaddr *)sa,
+ &t->saddr);
+ if (!family)
+ return -EINVAL;
+
+ socklen = pfkey_sockaddr_len(family);
+ if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
+ &t->id.daddr) != family)
+ return -EINVAL;
+ t->encap_family = family;
+ } else
+ t->encap_family = xp->family;
+
+ /* No way to set this via kame pfkey */
+ t->allalgs = 1;
+ xp->xfrm_nr++;
+ return 0;
+}
+
+static int
+parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
+{
+ int err;
+ int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy);
+ struct sadb_x_ipsecrequest *rq = (void*)(pol+1);
+
+ if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
+ return -EINVAL;
+
+ while (len >= sizeof(struct sadb_x_ipsecrequest)) {
+ if ((err = parse_ipsecrequest(xp, rq)) < 0)
+ return err;
+ len -= rq->sadb_x_ipsecrequest_len;
+ rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len);
+ }
+ return 0;
+}
+
+static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
+{
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+
+ if (xfrm_ctx) {
+ int len = sizeof(struct sadb_x_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ return PFKEY_ALIGN8(len);
+ }
+ return 0;
+}
+
+static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp)
+{
+ const struct xfrm_tmpl *t;
+ int sockaddr_size = pfkey_sockaddr_size(xp->family);
+ int socklen = 0;
+ int i;
+
+ for (i=0; i<xp->xfrm_nr; i++) {
+ t = xp->xfrm_vec + i;
+ socklen += pfkey_sockaddr_len(t->encap_family);
+ }
+
+ return sizeof(struct sadb_msg) +
+ (sizeof(struct sadb_lifetime) * 3) +
+ (sizeof(struct sadb_address) * 2) +
+ (sockaddr_size * 2) +
+ sizeof(struct sadb_x_policy) +
+ (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) +
+ (socklen * 2) +
+ pfkey_xfrm_policy2sec_ctx_size(xp);
+}
+
+static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp)
+{
+ struct sk_buff *skb;
+ int size;
+
+ size = pfkey_xfrm_policy2msg_size(xp);
+
+ skb = alloc_skb(size + 16, GFP_ATOMIC);
+ if (skb == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ return skb;
+}
+
+static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir)
+{
+ struct sadb_msg *hdr;
+ struct sadb_address *addr;
+ struct sadb_lifetime *lifetime;
+ struct sadb_x_policy *pol;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
+ int i;
+ int size;
+ int sockaddr_size = pfkey_sockaddr_size(xp->family);
+ int socklen = pfkey_sockaddr_len(xp->family);
+
+ size = pfkey_xfrm_policy2msg_size(xp);
+
+ /* call should fill header later */
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ memset(hdr, 0, size); /* XXX do we need this ? */
+
+ /* src address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+ addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+ addr->sadb_address_prefixlen = xp->selector.prefixlen_s;
+ addr->sadb_address_reserved = 0;
+ if (!pfkey_sockaddr_fill(&xp->selector.saddr,
+ xp->selector.sport,
+ (struct sockaddr *) (addr + 1),
+ xp->family))
+ BUG();
+
+ /* dst address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+ addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+ addr->sadb_address_prefixlen = xp->selector.prefixlen_d;
+ addr->sadb_address_reserved = 0;
+
+ pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport,
+ (struct sockaddr *) (addr + 1),
+ xp->family);
+
+ /* hard time */
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+ lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit);
+ lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit);
+ lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds;
+ lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds;
+ /* soft time */
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+ lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit);
+ lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit);
+ lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds;
+ lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds;
+ /* current time */
+ lifetime = (struct sadb_lifetime *) skb_put(skb,
+ sizeof(struct sadb_lifetime));
+ lifetime->sadb_lifetime_len =
+ sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+ lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+ lifetime->sadb_lifetime_allocations = xp->curlft.packets;
+ lifetime->sadb_lifetime_bytes = xp->curlft.bytes;
+ lifetime->sadb_lifetime_addtime = xp->curlft.add_time;
+ lifetime->sadb_lifetime_usetime = xp->curlft.use_time;
+
+ pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy));
+ pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
+ pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+ pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD;
+ if (xp->action == XFRM_POLICY_ALLOW) {
+ if (xp->xfrm_nr)
+ pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+ else
+ pol->sadb_x_policy_type = IPSEC_POLICY_NONE;
+ }
+ pol->sadb_x_policy_dir = dir+1;
+ pol->sadb_x_policy_reserved = 0;
+ pol->sadb_x_policy_id = xp->index;
+ pol->sadb_x_policy_priority = xp->priority;
+
+ for (i=0; i<xp->xfrm_nr; i++) {
+ const struct xfrm_tmpl *t = xp->xfrm_vec + i;
+ struct sadb_x_ipsecrequest *rq;
+ int req_size;
+ int mode;
+
+ req_size = sizeof(struct sadb_x_ipsecrequest);
+ if (t->mode == XFRM_MODE_TUNNEL) {
+ socklen = pfkey_sockaddr_len(t->encap_family);
+ req_size += socklen * 2;
+ } else {
+ size -= 2*socklen;
+ }
+ rq = (void*)skb_put(skb, req_size);
+ pol->sadb_x_policy_len += req_size/8;
+ memset(rq, 0, sizeof(*rq));
+ rq->sadb_x_ipsecrequest_len = req_size;
+ rq->sadb_x_ipsecrequest_proto = t->id.proto;
+ if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0)
+ return -EINVAL;
+ rq->sadb_x_ipsecrequest_mode = mode;
+ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE;
+ if (t->reqid)
+ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE;
+ if (t->optional)
+ rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE;
+ rq->sadb_x_ipsecrequest_reqid = t->reqid;
+
+ if (t->mode == XFRM_MODE_TUNNEL) {
+ u8 *sa = (void *)(rq + 1);
+ pfkey_sockaddr_fill(&t->saddr, 0,
+ (struct sockaddr *)sa,
+ t->encap_family);
+ pfkey_sockaddr_fill(&t->id.daddr, 0,
+ (struct sockaddr *) (sa + socklen),
+ t->encap_family);
+ }
+ }
+
+ /* security context */
+ if ((xfrm_ctx = xp->security)) {
+ int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+ sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
+ hdr->sadb_msg_len = size / sizeof(uint64_t);
+ hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
+
+ return 0;
+}
+
+static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ int err;
+
+ out_skb = pfkey_xfrm_policy2msg_prep(xp);
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
+ err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+ if (err < 0)
+ return err;
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = PF_KEY_V2;
+
+ if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
+ out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
+ else
+ out_hdr->sadb_msg_type = event2poltype(c->event);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_seq = c->seq;
+ out_hdr->sadb_msg_pid = c->portid;
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
+ return 0;
+
+}
+
+static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ int err = 0;
+ struct sadb_lifetime *lifetime;
+ struct sadb_address *sa;
+ struct sadb_x_policy *pol;
+ struct xfrm_policy *xp;
+ struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
+
+ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+ !ext_hdrs[SADB_X_EXT_POLICY-1])
+ return -EINVAL;
+
+ pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+ if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC)
+ return -EINVAL;
+ if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+ return -EINVAL;
+
+ xp = xfrm_policy_alloc(net, GFP_KERNEL);
+ if (xp == NULL)
+ return -ENOBUFS;
+
+ xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
+ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
+ xp->priority = pol->sadb_x_policy_priority;
+
+ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+ xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
+ xp->selector.family = xp->family;
+ xp->selector.prefixlen_s = sa->sadb_address_prefixlen;
+ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+ xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+ if (xp->selector.sport)
+ xp->selector.sport_mask = htons(0xffff);
+
+ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+ pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr);
+ xp->selector.prefixlen_d = sa->sadb_address_prefixlen;
+
+ /* Amusing, we set this twice. KAME apps appear to set same value
+ * in both addresses.
+ */
+ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+
+ xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+ if (xp->selector.dport)
+ xp->selector.dport_mask = htons(0xffff);
+
+ sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL);
+
+ if (!uctx) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL);
+ kfree(uctx);
+
+ if (err)
+ goto out;
+ }
+
+ xp->lft.soft_byte_limit = XFRM_INF;
+ xp->lft.hard_byte_limit = XFRM_INF;
+ xp->lft.soft_packet_limit = XFRM_INF;
+ xp->lft.hard_packet_limit = XFRM_INF;
+ if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) {
+ xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+ xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+ xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+ xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+ }
+ if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) {
+ xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+ xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+ xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+ xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+ }
+ xp->xfrm_nr = 0;
+ if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
+ (err = parse_ipsecrequests(xp, pol)) < 0)
+ goto out;
+
+ err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
+ hdr->sadb_msg_type != SADB_X_SPDUPDATE);
+
+ xfrm_audit_policy_add(xp, err ? 0 : 1, true);
+
+ if (err)
+ goto out;
+
+ if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
+ c.event = XFRM_MSG_UPDPOLICY;
+ else
+ c.event = XFRM_MSG_NEWPOLICY;
+
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+
+ km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+ xfrm_pol_put(xp);
+ return 0;
+
+out:
+ xp->walk.dead = 1;
+ xfrm_policy_destroy(xp);
+ return err;
+}
+
+static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ int err;
+ struct sadb_address *sa;
+ struct sadb_x_policy *pol;
+ struct xfrm_policy *xp;
+ struct xfrm_selector sel;
+ struct km_event c;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *pol_ctx = NULL;
+
+ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+ !ext_hdrs[SADB_X_EXT_POLICY-1])
+ return -EINVAL;
+
+ pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+ if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+ return -EINVAL;
+
+ memset(&sel, 0, sizeof(sel));
+
+ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+ sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
+ sel.prefixlen_s = sa->sadb_address_prefixlen;
+ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+ sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+ if (sel.sport)
+ sel.sport_mask = htons(0xffff);
+
+ sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+ pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
+ sel.prefixlen_d = sa->sadb_address_prefixlen;
+ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+ sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+ if (sel.dport)
+ sel.dport_mask = htons(0xffff);
+
+ sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1];
+ if (sec_ctx != NULL) {
+ struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL);
+
+ if (!uctx)
+ return -ENOMEM;
+
+ err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL);
+ kfree(uctx);
+ if (err)
+ return err;
+ }
+
+ xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
+ 1, &err);
+ security_xfrm_policy_free(pol_ctx);
+ if (xp == NULL)
+ return -ENOENT;
+
+ xfrm_audit_policy_delete(xp, err ? 0 : 1, true);
+
+ if (err)
+ goto out;
+
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+ c.data.byid = 0;
+ c.event = XFRM_MSG_DELPOLICY;
+ km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+
+out:
+ xfrm_pol_put(xp);
+ if (err == 0)
+ xfrm_garbage_collect(net);
+ return err;
+}
+
+static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir)
+{
+ int err;
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ err = 0;
+
+ out_skb = pfkey_xfrm_policy2msg_prep(xp);
+ if (IS_ERR(out_skb)) {
+ err = PTR_ERR(out_skb);
+ goto out;
+ }
+ err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+ if (err < 0)
+ goto out;
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+ out_hdr->sadb_msg_type = hdr->sadb_msg_type;
+ out_hdr->sadb_msg_satype = 0;
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+ out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp));
+ err = 0;
+
+out:
+ return err;
+}
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+static int pfkey_sockaddr_pair_size(sa_family_t family)
+{
+ return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2);
+}
+
+static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
+ xfrm_address_t *saddr, xfrm_address_t *daddr,
+ u16 *family)
+{
+ int af, socklen;
+
+ if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
+ return -EINVAL;
+
+ af = pfkey_sockaddr_extract(sa, saddr);
+ if (!af)
+ return -EINVAL;
+
+ socklen = pfkey_sockaddr_len(af);
+ if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen),
+ daddr) != af)
+ return -EINVAL;
+
+ *family = af;
+ return 0;
+}
+
+static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
+ struct xfrm_migrate *m)
+{
+ int err;
+ struct sadb_x_ipsecrequest *rq2;
+ int mode;
+
+ if (len <= sizeof(struct sadb_x_ipsecrequest) ||
+ len < rq1->sadb_x_ipsecrequest_len)
+ return -EINVAL;
+
+ /* old endoints */
+ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
+ rq1->sadb_x_ipsecrequest_len,
+ &m->old_saddr, &m->old_daddr,
+ &m->old_family);
+ if (err)
+ return err;
+
+ rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
+ len -= rq1->sadb_x_ipsecrequest_len;
+
+ if (len <= sizeof(struct sadb_x_ipsecrequest) ||
+ len < rq2->sadb_x_ipsecrequest_len)
+ return -EINVAL;
+
+ /* new endpoints */
+ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
+ rq2->sadb_x_ipsecrequest_len,
+ &m->new_saddr, &m->new_daddr,
+ &m->new_family);
+ if (err)
+ return err;
+
+ if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto ||
+ rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode ||
+ rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid)
+ return -EINVAL;
+
+ m->proto = rq1->sadb_x_ipsecrequest_proto;
+ if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0)
+ return -EINVAL;
+ m->mode = mode;
+ m->reqid = rq1->sadb_x_ipsecrequest_reqid;
+
+ return ((int)(rq1->sadb_x_ipsecrequest_len +
+ rq2->sadb_x_ipsecrequest_len));
+}
+
+static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
+ const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ int i, len, ret, err = -EINVAL;
+ u8 dir;
+ struct sadb_address *sa;
+ struct sadb_x_kmaddress *kma;
+ struct sadb_x_policy *pol;
+ struct sadb_x_ipsecrequest *rq;
+ struct xfrm_selector sel;
+ struct xfrm_migrate m[XFRM_MAX_DEPTH];
+ struct xfrm_kmaddress k;
+ struct net *net = sock_net(sk);
+
+ if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1],
+ ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) ||
+ !ext_hdrs[SADB_X_EXT_POLICY - 1]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1];
+ pol = ext_hdrs[SADB_X_EXT_POLICY - 1];
+
+ if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (kma) {
+ /* convert sadb_x_kmaddress to xfrm_kmaddress */
+ k.reserved = kma->sadb_x_kmaddress_reserved;
+ ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1),
+ 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma),
+ &k.local, &k.remote, &k.family);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ }
+
+ dir = pol->sadb_x_policy_dir - 1;
+ memset(&sel, 0, sizeof(sel));
+
+ /* set source address info of selector */
+ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1];
+ sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
+ sel.prefixlen_s = sa->sadb_address_prefixlen;
+ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+ sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port;
+ if (sel.sport)
+ sel.sport_mask = htons(0xffff);
+
+ /* set destination address info of selector */
+ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1];
+ pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
+ sel.prefixlen_d = sa->sadb_address_prefixlen;
+ sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+ sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port;
+ if (sel.dport)
+ sel.dport_mask = htons(0xffff);
+
+ rq = (struct sadb_x_ipsecrequest *)(pol + 1);
+
+ /* extract ipsecrequests */
+ i = 0;
+ len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy);
+
+ while (len > 0 && i < XFRM_MAX_DEPTH) {
+ ret = ipsecrequests_to_migrate(rq, len, &m[i]);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ } else {
+ rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret);
+ len -= ret;
+ i++;
+ }
+ }
+
+ if (!i || len > 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i,
+ kma ? &k : NULL, net);
+
+ out:
+ return err;
+}
+#else
+static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
+ const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ return -ENOPROTOOPT;
+}
+#endif
+
+
+static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ unsigned int dir;
+ int err = 0, delete;
+ struct sadb_x_policy *pol;
+ struct xfrm_policy *xp;
+ struct km_event c;
+
+ if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
+ return -EINVAL;
+
+ dir = xfrm_policy_id2dir(pol->sadb_x_policy_id);
+ if (dir >= XFRM_POLICY_MAX)
+ return -EINVAL;
+
+ delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
+ xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ dir, pol->sadb_x_policy_id, delete, &err);
+ if (xp == NULL)
+ return -ENOENT;
+
+ if (delete) {
+ xfrm_audit_policy_delete(xp, err ? 0 : 1, true);
+
+ if (err)
+ goto out;
+ c.seq = hdr->sadb_msg_seq;
+ c.portid = hdr->sadb_msg_pid;
+ c.data.byid = 1;
+ c.event = XFRM_MSG_DELPOLICY;
+ km_policy_notify(xp, dir, &c);
+ } else {
+ err = key_pol_get_resp(sk, xp, hdr, dir);
+ }
+
+out:
+ xfrm_pol_put(xp);
+ if (delete && err == 0)
+ xfrm_garbage_collect(net);
+ return err;
+}
+
+static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+ struct pfkey_sock *pfk = ptr;
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ int err;
+
+ if (!pfkey_can_dump(&pfk->sk))
+ return -ENOBUFS;
+
+ out_skb = pfkey_xfrm_policy2msg_prep(xp);
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
+ err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+ if (err < 0)
+ return err;
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = pfk->dump.msg_version;
+ out_hdr->sadb_msg_type = SADB_X_SPDDUMP;
+ out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_seq = count + 1;
+ out_hdr->sadb_msg_pid = pfk->dump.msg_portid;
+
+ if (pfk->dump.skb)
+ pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+ &pfk->sk, sock_net(&pfk->sk));
+ pfk->dump.skb = out_skb;
+
+ return 0;
+}
+
+static int pfkey_dump_sp(struct pfkey_sock *pfk)
+{
+ struct net *net = sock_net(&pfk->sk);
+ return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk);
+}
+
+static void pfkey_dump_sp_done(struct pfkey_sock *pfk)
+{
+ struct net *net = sock_net((struct sock *)pfk);
+
+ xfrm_policy_walk_done(&pfk->dump.u.policy, net);
+}
+
+static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+
+ if (pfk->dump.dump != NULL)
+ return -EBUSY;
+
+ pfk->dump.msg_version = hdr->sadb_msg_version;
+ pfk->dump.msg_portid = hdr->sadb_msg_pid;
+ pfk->dump.dump = pfkey_dump_sp;
+ pfk->dump.done = pfkey_dump_sp_done;
+ xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
+
+ return pfkey_do_dump(pfk);
+}
+
+static int key_notify_policy_flush(const struct km_event *c)
+{
+ struct sk_buff *skb_out;
+ struct sadb_msg *hdr;
+
+ skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+ if (!skb_out)
+ return -ENOBUFS;
+ hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+ hdr->sadb_msg_type = SADB_X_SPDFLUSH;
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->portid;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+ hdr->sadb_msg_reserved = 0;
+ pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+ return 0;
+
+}
+
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+ struct net *net = sock_net(sk);
+ struct km_event c;
+ int err, err2;
+
+ err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true);
+ err2 = unicast_flush_resp(sk, hdr);
+ if (err || err2) {
+ if (err == -ESRCH) /* empty table - old silent behavior */
+ return 0;
+ return err;
+ }
+
+ c.data.type = XFRM_POLICY_TYPE_MAIN;
+ c.event = XFRM_MSG_FLUSHPOLICY;
+ c.portid = hdr->sadb_msg_pid;
+ c.seq = hdr->sadb_msg_seq;
+ c.net = net;
+ km_policy_notify(NULL, 0, &c);
+
+ return 0;
+}
+
+typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
+ const struct sadb_msg *hdr, void * const *ext_hdrs);
+static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
+ [SADB_RESERVED] = pfkey_reserved,
+ [SADB_GETSPI] = pfkey_getspi,
+ [SADB_UPDATE] = pfkey_add,
+ [SADB_ADD] = pfkey_add,
+ [SADB_DELETE] = pfkey_delete,
+ [SADB_GET] = pfkey_get,
+ [SADB_ACQUIRE] = pfkey_acquire,
+ [SADB_REGISTER] = pfkey_register,
+ [SADB_EXPIRE] = NULL,
+ [SADB_FLUSH] = pfkey_flush,
+ [SADB_DUMP] = pfkey_dump,
+ [SADB_X_PROMISC] = pfkey_promisc,
+ [SADB_X_PCHANGE] = NULL,
+ [SADB_X_SPDUPDATE] = pfkey_spdadd,
+ [SADB_X_SPDADD] = pfkey_spdadd,
+ [SADB_X_SPDDELETE] = pfkey_spddelete,
+ [SADB_X_SPDGET] = pfkey_spdget,
+ [SADB_X_SPDACQUIRE] = NULL,
+ [SADB_X_SPDDUMP] = pfkey_spddump,
+ [SADB_X_SPDFLUSH] = pfkey_spdflush,
+ [SADB_X_SPDSETIDX] = pfkey_spdadd,
+ [SADB_X_SPDDELETE2] = pfkey_spdget,
+ [SADB_X_MIGRATE] = pfkey_migrate,
+};
+
+static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr)
+{
+ void *ext_hdrs[SADB_EXT_MAX];
+ int err;
+
+ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
+ BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
+
+ memset(ext_hdrs, 0, sizeof(ext_hdrs));
+ err = parse_exthdrs(skb, hdr, ext_hdrs);
+ if (!err) {
+ err = -EOPNOTSUPP;
+ if (pfkey_funcs[hdr->sadb_msg_type])
+ err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs);
+ }
+ return err;
+}
+
+static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
+{
+ struct sadb_msg *hdr = NULL;
+
+ if (skb->len < sizeof(*hdr)) {
+ *errp = -EMSGSIZE;
+ } else {
+ hdr = (struct sadb_msg *) skb->data;
+ if (hdr->sadb_msg_version != PF_KEY_V2 ||
+ hdr->sadb_msg_reserved != 0 ||
+ (hdr->sadb_msg_type <= SADB_RESERVED ||
+ hdr->sadb_msg_type > SADB_MAX)) {
+ hdr = NULL;
+ *errp = -EINVAL;
+ } else if (hdr->sadb_msg_len != (skb->len /
+ sizeof(uint64_t)) ||
+ hdr->sadb_msg_len < (sizeof(struct sadb_msg) /
+ sizeof(uint64_t))) {
+ hdr = NULL;
+ *errp = -EMSGSIZE;
+ } else {
+ *errp = 0;
+ }
+ }
+ return hdr;
+}
+
+static inline int aalg_tmpl_set(const struct xfrm_tmpl *t,
+ const struct xfrm_algo_desc *d)
+{
+ unsigned int id = d->desc.sadb_alg_id;
+
+ if (id >= sizeof(t->aalgos) * 8)
+ return 0;
+
+ return (t->aalgos >> id) & 1;
+}
+
+static inline int ealg_tmpl_set(const struct xfrm_tmpl *t,
+ const struct xfrm_algo_desc *d)
+{
+ unsigned int id = d->desc.sadb_alg_id;
+
+ if (id >= sizeof(t->ealgos) * 8)
+ return 0;
+
+ return (t->ealgos >> id) & 1;
+}
+
+static int count_ah_combs(const struct xfrm_tmpl *t)
+{
+ int i, sz = 0;
+
+ for (i = 0; ; i++) {
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+ if (!aalg)
+ break;
+ if (!aalg->pfkey_supported)
+ continue;
+ if (aalg_tmpl_set(t, aalg) && aalg->available)
+ sz += sizeof(struct sadb_comb);
+ }
+ return sz + sizeof(struct sadb_prop);
+}
+
+static int count_esp_combs(const struct xfrm_tmpl *t)
+{
+ int i, k, sz = 0;
+
+ for (i = 0; ; i++) {
+ const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+ if (!ealg)
+ break;
+
+ if (!ealg->pfkey_supported)
+ continue;
+
+ if (!(ealg_tmpl_set(t, ealg) && ealg->available))
+ continue;
+
+ for (k = 1; ; k++) {
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
+ if (!aalg)
+ break;
+
+ if (!aalg->pfkey_supported)
+ continue;
+
+ if (aalg_tmpl_set(t, aalg) && aalg->available)
+ sz += sizeof(struct sadb_comb);
+ }
+ }
+ return sz + sizeof(struct sadb_prop);
+}
+
+static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+{
+ struct sadb_prop *p;
+ int i;
+
+ p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
+ p->sadb_prop_len = sizeof(struct sadb_prop)/8;
+ p->sadb_prop_exttype = SADB_EXT_PROPOSAL;
+ p->sadb_prop_replay = 32;
+ memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
+
+ for (i = 0; ; i++) {
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+ if (!aalg)
+ break;
+
+ if (!aalg->pfkey_supported)
+ continue;
+
+ if (aalg_tmpl_set(t, aalg) && aalg->available) {
+ struct sadb_comb *c;
+ c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
+ memset(c, 0, sizeof(*c));
+ p->sadb_prop_len += sizeof(struct sadb_comb)/8;
+ c->sadb_comb_auth = aalg->desc.sadb_alg_id;
+ c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
+ c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
+ c->sadb_comb_hard_addtime = 24*60*60;
+ c->sadb_comb_soft_addtime = 20*60*60;
+ c->sadb_comb_hard_usetime = 8*60*60;
+ c->sadb_comb_soft_usetime = 7*60*60;
+ }
+ }
+}
+
+static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+{
+ struct sadb_prop *p;
+ int i, k;
+
+ p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
+ p->sadb_prop_len = sizeof(struct sadb_prop)/8;
+ p->sadb_prop_exttype = SADB_EXT_PROPOSAL;
+ p->sadb_prop_replay = 32;
+ memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved));
+
+ for (i=0; ; i++) {
+ const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+ if (!ealg)
+ break;
+
+ if (!ealg->pfkey_supported)
+ continue;
+
+ if (!(ealg_tmpl_set(t, ealg) && ealg->available))
+ continue;
+
+ for (k = 1; ; k++) {
+ struct sadb_comb *c;
+ const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k);
+ if (!aalg)
+ break;
+ if (!aalg->pfkey_supported)
+ continue;
+ if (!(aalg_tmpl_set(t, aalg) && aalg->available))
+ continue;
+ c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
+ memset(c, 0, sizeof(*c));
+ p->sadb_prop_len += sizeof(struct sadb_comb)/8;
+ c->sadb_comb_auth = aalg->desc.sadb_alg_id;
+ c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
+ c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
+ c->sadb_comb_encrypt = ealg->desc.sadb_alg_id;
+ c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits;
+ c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits;
+ c->sadb_comb_hard_addtime = 24*60*60;
+ c->sadb_comb_soft_addtime = 20*60*60;
+ c->sadb_comb_hard_usetime = 8*60*60;
+ c->sadb_comb_soft_usetime = 7*60*60;
+ }
+ }
+}
+
+static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c)
+{
+ return 0;
+}
+
+static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
+{
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ int hard;
+ int hsc;
+
+ hard = c->data.hard;
+ if (hard)
+ hsc = 2;
+ else
+ hsc = 1;
+
+ out_skb = pfkey_xfrm_state2msg_expire(x, hsc);
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = PF_KEY_V2;
+ out_hdr->sadb_msg_type = SADB_EXPIRE;
+ out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_reserved = 0;
+ out_hdr->sadb_msg_seq = 0;
+ out_hdr->sadb_msg_pid = 0;
+
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+ return 0;
+}
+
+static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c)
+{
+ struct net *net = x ? xs_net(x) : c->net;
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ if (atomic_read(&net_pfkey->socks_nr) == 0)
+ return 0;
+
+ switch (c->event) {
+ case XFRM_MSG_EXPIRE:
+ return key_notify_sa_expire(x, c);
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_NEWSA:
+ case XFRM_MSG_UPDSA:
+ return key_notify_sa(x, c);
+ case XFRM_MSG_FLUSHSA:
+ return key_notify_sa_flush(c);
+ case XFRM_MSG_NEWAE: /* not yet supported */
+ break;
+ default:
+ pr_err("pfkey: Unknown SA event %d\n", c->event);
+ break;
+ }
+
+ return 0;
+}
+
+static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+ if (xp && xp->type != XFRM_POLICY_TYPE_MAIN)
+ return 0;
+
+ switch (c->event) {
+ case XFRM_MSG_POLEXPIRE:
+ return key_notify_policy_expire(xp, c);
+ case XFRM_MSG_DELPOLICY:
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDPOLICY:
+ return key_notify_policy(xp, dir, c);
+ case XFRM_MSG_FLUSHPOLICY:
+ if (c->data.type != XFRM_POLICY_TYPE_MAIN)
+ break;
+ return key_notify_policy_flush(c);
+ default:
+ pr_err("pfkey: Unknown policy event %d\n", c->event);
+ break;
+ }
+
+ return 0;
+}
+
+static u32 get_acqseq(void)
+{
+ u32 res;
+ static atomic_t acqseq;
+
+ do {
+ res = atomic_inc_return(&acqseq);
+ } while (!res);
+ return res;
+}
+
+static bool pfkey_is_alive(const struct km_event *c)
+{
+ struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id);
+ struct sock *sk;
+ bool is_alive = false;
+
+ rcu_read_lock();
+ sk_for_each_rcu(sk, &net_pfkey->table) {
+ if (pfkey_sk(sk)->registered) {
+ is_alive = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_alive;
+}
+
+static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ struct sadb_address *addr;
+ struct sadb_x_policy *pol;
+ int sockaddr_size;
+ int size;
+ struct sadb_x_sec_ctx *sec_ctx;
+ struct xfrm_sec_ctx *xfrm_ctx;
+ int ctx_size = 0;
+
+ sockaddr_size = pfkey_sockaddr_size(x->props.family);
+ if (!sockaddr_size)
+ return -EINVAL;
+
+ size = sizeof(struct sadb_msg) +
+ (sizeof(struct sadb_address) * 2) +
+ (sockaddr_size * 2) +
+ sizeof(struct sadb_x_policy);
+
+ if (x->id.proto == IPPROTO_AH)
+ size += count_ah_combs(t);
+ else if (x->id.proto == IPPROTO_ESP)
+ size += count_esp_combs(t);
+
+ if ((xfrm_ctx = x->security)) {
+ ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+ size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+ }
+
+ skb = alloc_skb(size + 16, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_type = SADB_ACQUIRE;
+ hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+ hdr->sadb_msg_len = size / sizeof(uint64_t);
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_reserved = 0;
+ hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+ hdr->sadb_msg_pid = 0;
+
+ /* src address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(&x->props.saddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ /* dst address */
+ addr = (struct sadb_address*) skb_put(skb,
+ sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(&x->id.daddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy));
+ pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
+ pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+ pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+ pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1;
+ pol->sadb_x_policy_reserved = 0;
+ pol->sadb_x_policy_id = xp->index;
+ pol->sadb_x_policy_priority = xp->priority;
+
+ /* Set sadb_comb's. */
+ if (x->id.proto == IPPROTO_AH)
+ dump_ah_combs(skb, t);
+ else if (x->id.proto == IPPROTO_ESP)
+ dump_esp_combs(skb, t);
+
+ /* security context */
+ if (xfrm_ctx) {
+ sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+ sizeof(struct sadb_x_sec_ctx) + ctx_size);
+ sec_ctx->sadb_x_sec_len =
+ (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+ sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+ sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+ sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+ sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+ memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+ xfrm_ctx->ctx_len);
+ }
+
+ return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+}
+
+static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
+ u8 *data, int len, int *dir)
+{
+ struct net *net = sock_net(sk);
+ struct xfrm_policy *xp;
+ struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+ struct sadb_x_sec_ctx *sec_ctx;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ if (opt != IP_IPSEC_POLICY) {
+ *dir = -EOPNOTSUPP;
+ return NULL;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ if (opt != IPV6_IPSEC_POLICY) {
+ *dir = -EOPNOTSUPP;
+ return NULL;
+ }
+ break;
+#endif
+ default:
+ *dir = -EINVAL;
+ return NULL;
+ }
+
+ *dir = -EINVAL;
+
+ if (len < sizeof(struct sadb_x_policy) ||
+ pol->sadb_x_policy_len*8 > len ||
+ pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS ||
+ (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND))
+ return NULL;
+
+ xp = xfrm_policy_alloc(net, GFP_ATOMIC);
+ if (xp == NULL) {
+ *dir = -ENOBUFS;
+ return NULL;
+ }
+
+ xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
+ XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
+
+ xp->lft.soft_byte_limit = XFRM_INF;
+ xp->lft.hard_byte_limit = XFRM_INF;
+ xp->lft.soft_packet_limit = XFRM_INF;
+ xp->lft.hard_packet_limit = XFRM_INF;
+ xp->family = sk->sk_family;
+
+ xp->xfrm_nr = 0;
+ if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
+ (*dir = parse_ipsecrequests(xp, pol)) < 0)
+ goto out;
+
+ /* security context too */
+ if (len >= (pol->sadb_x_policy_len*8 +
+ sizeof(struct sadb_x_sec_ctx))) {
+ char *p = (char *)pol;
+ struct xfrm_user_sec_ctx *uctx;
+
+ p += pol->sadb_x_policy_len*8;
+ sec_ctx = (struct sadb_x_sec_ctx *)p;
+ if (len < pol->sadb_x_policy_len*8 +
+ sec_ctx->sadb_x_sec_len) {
+ *dir = -EINVAL;
+ goto out;
+ }
+ if ((*dir = verify_sec_ctx_len(p)))
+ goto out;
+ uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC);
+ *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC);
+ kfree(uctx);
+
+ if (*dir)
+ goto out;
+ }
+
+ *dir = pol->sadb_x_policy_dir-1;
+ return xp;
+
+out:
+ xp->walk.dead = 1;
+ xfrm_policy_destroy(xp);
+ return NULL;
+}
+
+static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ struct sadb_sa *sa;
+ struct sadb_address *addr;
+ struct sadb_x_nat_t_port *n_port;
+ int sockaddr_size;
+ int size;
+ __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0);
+ struct xfrm_encap_tmpl *natt = NULL;
+
+ sockaddr_size = pfkey_sockaddr_size(x->props.family);
+ if (!sockaddr_size)
+ return -EINVAL;
+
+ if (!satype)
+ return -EINVAL;
+
+ if (!x->encap)
+ return -EINVAL;
+
+ natt = x->encap;
+
+ /* Build an SADB_X_NAT_T_NEW_MAPPING message:
+ *
+ * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) |
+ * ADDRESS_DST (new addr) | NAT_T_DPORT (new port)
+ */
+
+ size = sizeof(struct sadb_msg) +
+ sizeof(struct sadb_sa) +
+ (sizeof(struct sadb_address) * 2) +
+ (sockaddr_size * 2) +
+ (sizeof(struct sadb_x_nat_t_port) * 2);
+
+ skb = alloc_skb(size + 16, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING;
+ hdr->sadb_msg_satype = satype;
+ hdr->sadb_msg_len = size / sizeof(uint64_t);
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_reserved = 0;
+ hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+ hdr->sadb_msg_pid = 0;
+
+ /* SA */
+ sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa));
+ sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
+ sa->sadb_sa_exttype = SADB_EXT_SA;
+ sa->sadb_sa_spi = x->id.spi;
+ sa->sadb_sa_replay = 0;
+ sa->sadb_sa_state = 0;
+ sa->sadb_sa_auth = 0;
+ sa->sadb_sa_encrypt = 0;
+ sa->sadb_sa_flags = 0;
+
+ /* ADDRESS_SRC (old addr) */
+ addr = (struct sadb_address*)
+ skb_put(skb, sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(&x->props.saddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ /* NAT_T_SPORT (old port) */
+ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
+ n_port->sadb_x_nat_t_port_port = natt->encap_sport;
+ n_port->sadb_x_nat_t_port_reserved = 0;
+
+ /* ADDRESS_DST (new addr) */
+ addr = (struct sadb_address*)
+ skb_put(skb, sizeof(struct sadb_address)+sockaddr_size);
+ addr->sadb_address_len =
+ (sizeof(struct sadb_address)+sockaddr_size)/
+ sizeof(uint64_t);
+ addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+ addr->sadb_address_proto = 0;
+ addr->sadb_address_reserved = 0;
+ addr->sadb_address_prefixlen =
+ pfkey_sockaddr_fill(ipaddr, 0,
+ (struct sockaddr *) (addr + 1),
+ x->props.family);
+ if (!addr->sadb_address_prefixlen)
+ BUG();
+
+ /* NAT_T_DPORT (new port) */
+ n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+ n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+ n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
+ n_port->sadb_x_nat_t_port_port = sport;
+ n_port->sadb_x_nat_t_port_reserved = 0;
+
+ return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+}
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+static int set_sadb_address(struct sk_buff *skb, int sasize, int type,
+ const struct xfrm_selector *sel)
+{
+ struct sadb_address *addr;
+ addr = (struct sadb_address *)skb_put(skb, sizeof(struct sadb_address) + sasize);
+ addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8;
+ addr->sadb_address_exttype = type;
+ addr->sadb_address_proto = sel->proto;
+ addr->sadb_address_reserved = 0;
+
+ switch (type) {
+ case SADB_EXT_ADDRESS_SRC:
+ addr->sadb_address_prefixlen = sel->prefixlen_s;
+ pfkey_sockaddr_fill(&sel->saddr, 0,
+ (struct sockaddr *)(addr + 1),
+ sel->family);
+ break;
+ case SADB_EXT_ADDRESS_DST:
+ addr->sadb_address_prefixlen = sel->prefixlen_d;
+ pfkey_sockaddr_fill(&sel->daddr, 0,
+ (struct sockaddr *)(addr + 1),
+ sel->family);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k)
+{
+ struct sadb_x_kmaddress *kma;
+ u8 *sa;
+ int family = k->family;
+ int socklen = pfkey_sockaddr_len(family);
+ int size_req;
+
+ size_req = (sizeof(struct sadb_x_kmaddress) +
+ pfkey_sockaddr_pair_size(family));
+
+ kma = (struct sadb_x_kmaddress *)skb_put(skb, size_req);
+ memset(kma, 0, size_req);
+ kma->sadb_x_kmaddress_len = size_req / 8;
+ kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS;
+ kma->sadb_x_kmaddress_reserved = k->reserved;
+
+ sa = (u8 *)(kma + 1);
+ if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) ||
+ !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int set_ipsecrequest(struct sk_buff *skb,
+ uint8_t proto, uint8_t mode, int level,
+ uint32_t reqid, uint8_t family,
+ const xfrm_address_t *src, const xfrm_address_t *dst)
+{
+ struct sadb_x_ipsecrequest *rq;
+ u8 *sa;
+ int socklen = pfkey_sockaddr_len(family);
+ int size_req;
+
+ size_req = sizeof(struct sadb_x_ipsecrequest) +
+ pfkey_sockaddr_pair_size(family);
+
+ rq = (struct sadb_x_ipsecrequest *)skb_put(skb, size_req);
+ memset(rq, 0, size_req);
+ rq->sadb_x_ipsecrequest_len = size_req;
+ rq->sadb_x_ipsecrequest_proto = proto;
+ rq->sadb_x_ipsecrequest_mode = mode;
+ rq->sadb_x_ipsecrequest_level = level;
+ rq->sadb_x_ipsecrequest_reqid = reqid;
+
+ sa = (u8 *) (rq + 1);
+ if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) ||
+ !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family))
+ return -EINVAL;
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_bundles,
+ const struct xfrm_kmaddress *k)
+{
+ int i;
+ int sasize_sel;
+ int size = 0;
+ int size_pol = 0;
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ struct sadb_x_policy *pol;
+ const struct xfrm_migrate *mp;
+
+ if (type != XFRM_POLICY_TYPE_MAIN)
+ return 0;
+
+ if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH)
+ return -EINVAL;
+
+ if (k != NULL) {
+ /* addresses for KM */
+ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) +
+ pfkey_sockaddr_pair_size(k->family));
+ }
+
+ /* selector */
+ sasize_sel = pfkey_sockaddr_size(sel->family);
+ if (!sasize_sel)
+ return -EINVAL;
+ size += (sizeof(struct sadb_address) + sasize_sel) * 2;
+
+ /* policy info */
+ size_pol += sizeof(struct sadb_x_policy);
+
+ /* ipsecrequests */
+ for (i = 0, mp = m; i < num_bundles; i++, mp++) {
+ /* old locator pair */
+ size_pol += sizeof(struct sadb_x_ipsecrequest) +
+ pfkey_sockaddr_pair_size(mp->old_family);
+ /* new locator pair */
+ size_pol += sizeof(struct sadb_x_ipsecrequest) +
+ pfkey_sockaddr_pair_size(mp->new_family);
+ }
+
+ size += sizeof(struct sadb_msg) + size_pol;
+
+ /* alloc buffer */
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ hdr = (struct sadb_msg *)skb_put(skb, sizeof(struct sadb_msg));
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_type = SADB_X_MIGRATE;
+ hdr->sadb_msg_satype = pfkey_proto2satype(m->proto);
+ hdr->sadb_msg_len = size / 8;
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_reserved = 0;
+ hdr->sadb_msg_seq = 0;
+ hdr->sadb_msg_pid = 0;
+
+ /* Addresses to be used by KM for negotiation, if ext is available */
+ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0))
+ goto err;
+
+ /* selector src */
+ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel);
+
+ /* selector dst */
+ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel);
+
+ /* policy information */
+ pol = (struct sadb_x_policy *)skb_put(skb, sizeof(struct sadb_x_policy));
+ pol->sadb_x_policy_len = size_pol / 8;
+ pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+ pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+ pol->sadb_x_policy_dir = dir + 1;
+ pol->sadb_x_policy_reserved = 0;
+ pol->sadb_x_policy_id = 0;
+ pol->sadb_x_policy_priority = 0;
+
+ for (i = 0, mp = m; i < num_bundles; i++, mp++) {
+ /* old ipsecrequest */
+ int mode = pfkey_mode_from_xfrm(mp->mode);
+ if (mode < 0)
+ goto err;
+ if (set_ipsecrequest(skb, mp->proto, mode,
+ (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE),
+ mp->reqid, mp->old_family,
+ &mp->old_saddr, &mp->old_daddr) < 0)
+ goto err;
+
+ /* new ipsecrequest */
+ if (set_ipsecrequest(skb, mp->proto, mode,
+ (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE),
+ mp->reqid, mp->new_family,
+ &mp->new_saddr, &mp->new_daddr) < 0)
+ goto err;
+ }
+
+ /* broadcast migrate message to sockets */
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
+
+ return 0;
+
+err:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+#else
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_bundles,
+ const struct xfrm_kmaddress *k)
+{
+ return -ENOPROTOOPT;
+}
+#endif
+
+static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb = NULL;
+ struct sadb_msg *hdr = NULL;
+ int err;
+ struct net *net = sock_net(sk);
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags & MSG_OOB)
+ goto out;
+
+ err = -EMSGSIZE;
+ if ((unsigned int)len > sk->sk_sndbuf - 32)
+ goto out;
+
+ err = -ENOBUFS;
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (skb == NULL)
+ goto out;
+
+ err = -EFAULT;
+ if (memcpy_from_msg(skb_put(skb,len), msg, len))
+ goto out;
+
+ hdr = pfkey_get_base_msg(skb, &err);
+ if (!hdr)
+ goto out;
+
+ mutex_lock(&net->xfrm.xfrm_cfg_mutex);
+ err = pfkey_process(sk, skb, hdr);
+ mutex_unlock(&net->xfrm.xfrm_cfg_mutex);
+
+out:
+ if (err && hdr && pfkey_error(hdr, err, sk) == 0)
+ err = 0;
+ kfree_skb(skb);
+
+ return err ? : len;
+}
+
+static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pfkey_sock *pfk = pfkey_sk(sk);
+ struct sk_buff *skb;
+ int copied, err;
+
+ err = -EINVAL;
+ if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
+ goto out;
+
+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto out;
+
+ copied = skb->len;
+ if (copied > len) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ skb_reset_transport_header(skb);
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ err = (flags & MSG_TRUNC) ? skb->len : copied;
+
+ if (pfk->dump.dump != NULL &&
+ 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ pfkey_do_dump(pfk);
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ return err;
+}
+
+static const struct proto_ops pfkey_ops = {
+ .family = PF_KEY,
+ .owner = THIS_MODULE,
+ /* Operations that make no sense on pfkey sockets. */
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+
+ /* Now the operations that really occur. */
+ .release = pfkey_release,
+ .poll = datagram_poll,
+ .sendmsg = pfkey_sendmsg,
+ .recvmsg = pfkey_recvmsg,
+};
+
+static const struct net_proto_family pfkey_family_ops = {
+ .family = PF_KEY,
+ .create = pfkey_create,
+ .owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_PROC_FS
+static int pfkey_seq_show(struct seq_file *f, void *v)
+{
+ struct sock *s = sk_entry(v);
+
+ if (v == SEQ_START_TOKEN)
+ seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n");
+ else
+ seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n",
+ s,
+ atomic_read(&s->sk_refcnt),
+ sk_rmem_alloc_get(s),
+ sk_wmem_alloc_get(s),
+ from_kuid_munged(seq_user_ns(f), sock_i_uid(s)),
+ sock_i_ino(s)
+ );
+ return 0;
+}
+
+static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos)
+ __acquires(rcu)
+{
+ struct net *net = seq_file_net(f);
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ rcu_read_lock();
+ return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos);
+}
+
+static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+ struct net *net = seq_file_net(f);
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ return seq_hlist_next_rcu(v, &net_pfkey->table, ppos);
+}
+
+static void pfkey_seq_stop(struct seq_file *f, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations pfkey_seq_ops = {
+ .start = pfkey_seq_start,
+ .next = pfkey_seq_next,
+ .stop = pfkey_seq_stop,
+ .show = pfkey_seq_show,
+};
+
+static int pfkey_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &pfkey_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations pfkey_proc_ops = {
+ .open = pfkey_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init pfkey_init_proc(struct net *net)
+{
+ struct proc_dir_entry *e;
+
+ e = proc_create("pfkey", 0, net->proc_net, &pfkey_proc_ops);
+ if (e == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit pfkey_exit_proc(struct net *net)
+{
+ remove_proc_entry("pfkey", net->proc_net);
+}
+#else
+static inline int pfkey_init_proc(struct net *net)
+{
+ return 0;
+}
+
+static inline void pfkey_exit_proc(struct net *net)
+{
+}
+#endif
+
+static struct xfrm_mgr pfkeyv2_mgr =
+{
+ .id = "pfkeyv2",
+ .notify = pfkey_send_notify,
+ .acquire = pfkey_send_acquire,
+ .compile_policy = pfkey_compile_policy,
+ .new_mapping = pfkey_send_new_mapping,
+ .notify_policy = pfkey_send_policy_notify,
+ .migrate = pfkey_send_migrate,
+ .is_alive = pfkey_is_alive,
+};
+
+static int __net_init pfkey_net_init(struct net *net)
+{
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+ int rv;
+
+ INIT_HLIST_HEAD(&net_pfkey->table);
+ atomic_set(&net_pfkey->socks_nr, 0);
+
+ rv = pfkey_init_proc(net);
+
+ return rv;
+}
+
+static void __net_exit pfkey_net_exit(struct net *net)
+{
+ struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+ pfkey_exit_proc(net);
+ BUG_ON(!hlist_empty(&net_pfkey->table));
+}
+
+static struct pernet_operations pfkey_net_ops = {
+ .init = pfkey_net_init,
+ .exit = pfkey_net_exit,
+ .id = &pfkey_net_id,
+ .size = sizeof(struct netns_pfkey),
+};
+
+static void __exit ipsec_pfkey_exit(void)
+{
+ xfrm_unregister_km(&pfkeyv2_mgr);
+ sock_unregister(PF_KEY);
+ unregister_pernet_subsys(&pfkey_net_ops);
+ proto_unregister(&key_proto);
+}
+
+static int __init ipsec_pfkey_init(void)
+{
+ int err = proto_register(&key_proto, 0);
+
+ if (err != 0)
+ goto out;
+
+ err = register_pernet_subsys(&pfkey_net_ops);
+ if (err != 0)
+ goto out_unregister_key_proto;
+ err = sock_register(&pfkey_family_ops);
+ if (err != 0)
+ goto out_unregister_pernet;
+ err = xfrm_register_km(&pfkeyv2_mgr);
+ if (err != 0)
+ goto out_sock_unregister;
+out:
+ return err;
+
+out_sock_unregister:
+ sock_unregister(PF_KEY);
+out_unregister_pernet:
+ unregister_pernet_subsys(&pfkey_net_ops);
+out_unregister_key_proto:
+ proto_unregister(&key_proto);
+ goto out;
+}
+
+module_init(ipsec_pfkey_init);
+module_exit(ipsec_pfkey_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KEY);
diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig
new file mode 100644
index 000000000..378c73b26
--- /dev/null
+++ b/net/l2tp/Kconfig
@@ -0,0 +1,109 @@
+#
+# Layer Two Tunneling Protocol (L2TP)
+#
+
+menuconfig L2TP
+ tristate "Layer Two Tunneling Protocol (L2TP)"
+ depends on (IPV6 || IPV6=n)
+ depends on INET
+ select NET_UDP_TUNNEL
+ ---help---
+ Layer Two Tunneling Protocol
+
+ From RFC 2661 <http://www.ietf.org/rfc/rfc2661.txt>.
+
+ L2TP facilitates the tunneling of packets across an
+ intervening network in a way that is as transparent as
+ possible to both end-users and applications.
+
+ L2TP is often used to tunnel PPP traffic over IP
+ tunnels. One IP tunnel may carry thousands of individual PPP
+ connections. L2TP is also used as a VPN protocol, popular
+ with home workers to connect to their offices.
+
+ L2TPv3 allows other protocols as well as PPP to be carried
+ over L2TP tunnels. L2TPv3 is defined in RFC 3931
+ <http://www.ietf.org/rfc/rfc3931.txt>.
+
+ The kernel component handles only L2TP data packets: a
+ userland daemon handles L2TP the control protocol (tunnel
+ and session setup). One such daemon is OpenL2TP
+ (http://openl2tp.org/).
+
+ If you don't need L2TP, say N. To compile all L2TP code as
+ modules, choose M here.
+
+config L2TP_DEBUGFS
+ tristate "L2TP debugfs support"
+ depends on L2TP && DEBUG_FS
+ help
+ Support for l2tp directory in debugfs filesystem. This may be
+ used to dump internal state of the l2tp drivers for problem
+ analysis.
+
+ If unsure, say 'Y'.
+
+ To compile this driver as a module, choose M here. The module
+ will be called l2tp_debugfs.
+
+config L2TP_V3
+ bool "L2TPv3 support"
+ depends on L2TP
+ help
+ Layer Two Tunneling Protocol Version 3
+
+ From RFC 3931 <http://www.ietf.org/rfc/rfc3931.txt>.
+
+ The Layer Two Tunneling Protocol (L2TP) provides a dynamic
+ mechanism for tunneling Layer 2 (L2) "circuits" across a
+ packet-oriented data network (e.g., over IP). L2TP, as
+ originally defined in RFC 2661, is a standard method for
+ tunneling Point-to-Point Protocol (PPP) [RFC1661] sessions.
+ L2TP has since been adopted for tunneling a number of other
+ L2 protocols, including ATM, Frame Relay, HDLC and even raw
+ ethernet frames.
+
+ If you are connecting to L2TPv3 equipment, or you want to
+ tunnel raw ethernet frames using L2TP, say Y here. If
+ unsure, say N.
+
+config L2TP_IP
+ tristate "L2TP IP encapsulation for L2TPv3"
+ depends on L2TP_V3
+ help
+ Support for L2TP-over-IP socket family.
+
+ The L2TPv3 protocol defines two possible encapsulations for
+ L2TP frames, namely UDP and plain IP (without UDP). This
+ driver provides a new L2TPIP socket family with which
+ userspace L2TPv3 daemons may create L2TP/IP tunnel sockets
+ when UDP encapsulation is not required. When L2TP is carried
+ in IP packets, it used IP protocol number 115, so this port
+ must be enabled in firewalls.
+
+ To compile this driver as a module, choose M here. The module
+ will be called l2tp_ip.
+
+config L2TP_ETH
+ tristate "L2TP ethernet pseudowire support for L2TPv3"
+ depends on L2TP_V3
+ help
+ Support for carrying raw ethernet frames over L2TPv3.
+
+ From RFC 4719 <http://www.ietf.org/rfc/rfc4719.txt>.
+
+ The Layer 2 Tunneling Protocol, Version 3 (L2TPv3) can be
+ used as a control protocol and for data encapsulation to set
+ up Pseudowires for transporting layer 2 Packet Data Units
+ across an IP network [RFC3931].
+
+ This driver provides an ethernet virtual interface for each
+ L2TP ethernet pseudowire instance. Standard Linux tools may
+ be used to assign an IP address to the local virtual
+ interface, or add the interface to a bridge.
+
+ If you are using L2TPv3, you will almost certainly want to
+ enable this option.
+
+ To compile this driver as a module, choose M here. The module
+ will be called l2tp_eth.
diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile
new file mode 100644
index 000000000..2870f41ea
--- /dev/null
+++ b/net/l2tp/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the L2TP.
+#
+
+obj-$(CONFIG_L2TP) += l2tp_core.o
+
+# Build l2tp as modules if L2TP is M
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_V3)) += l2tp_netlink.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_ETH)) += l2tp_eth.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_DEBUGFS)) += l2tp_debugfs.o
+ifneq ($(CONFIG_IPV6),)
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip6.o
+endif
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
new file mode 100644
index 000000000..a29a50449
--- /dev/null
+++ b/net/l2tp/l2tp_core.c
@@ -0,0 +1,1901 @@
+/*
+ * L2TP core.
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * This file contains some code of the original L2TPv2 pppol2tp
+ * driver, which has the following copyright:
+ *
+ * Authors: Martijn van Oosterhout <kleptog@svana.org>
+ * James Chapman (jchapman@katalix.com)
+ * Contributors:
+ * Michal Ostrowski <mostrows@speakeasy.net>
+ * Arnaldo Carvalho de Melo <acme@xconectiva.com.br>
+ * David S. Miller (davem@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/l2tp.h>
+#include <linux/hash.h>
+#include <linux/sort.h>
+#include <linux/file.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#include <net/protocol.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+
+#include "l2tp_core.h"
+
+#define L2TP_DRV_VERSION "V2.0"
+
+/* L2TP header constants */
+#define L2TP_HDRFLAG_T 0x8000
+#define L2TP_HDRFLAG_L 0x4000
+#define L2TP_HDRFLAG_S 0x0800
+#define L2TP_HDRFLAG_O 0x0200
+#define L2TP_HDRFLAG_P 0x0100
+
+#define L2TP_HDR_VER_MASK 0x000F
+#define L2TP_HDR_VER_2 0x0002
+#define L2TP_HDR_VER_3 0x0003
+
+/* L2TPv3 default L2-specific sublayer */
+#define L2TP_SLFLAG_S 0x40000000
+#define L2TP_SL_SEQ_MASK 0x00ffffff
+
+#define L2TP_HDR_SIZE_SEQ 10
+#define L2TP_HDR_SIZE_NOSEQ 6
+
+/* Default trace flags */
+#define L2TP_DEFAULT_DEBUG_FLAGS 0
+
+/* Private data stored for received packets in the skb.
+ */
+struct l2tp_skb_cb {
+ u32 ns;
+ u16 has_seq;
+ u16 length;
+ unsigned long expires;
+};
+
+#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)])
+
+static atomic_t l2tp_tunnel_count;
+static atomic_t l2tp_session_count;
+static struct workqueue_struct *l2tp_wq;
+
+/* per-net private data for this module */
+static unsigned int l2tp_net_id;
+struct l2tp_net {
+ struct list_head l2tp_tunnel_list;
+ spinlock_t l2tp_tunnel_list_lock;
+ struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2];
+ spinlock_t l2tp_session_hlist_lock;
+};
+
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
+
+static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
+{
+ return sk->sk_user_data;
+}
+
+static inline struct l2tp_net *l2tp_pernet(struct net *net)
+{
+ BUG_ON(!net);
+
+ return net_generic(net, l2tp_net_id);
+}
+
+/* Tunnel reference counts. Incremented per session that is added to
+ * the tunnel.
+ */
+static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
+{
+ atomic_inc(&tunnel->ref_count);
+}
+
+static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
+{
+ if (atomic_dec_and_test(&tunnel->ref_count))
+ l2tp_tunnel_free(tunnel);
+}
+#ifdef L2TP_REFCNT_DEBUG
+#define l2tp_tunnel_inc_refcount(_t) \
+do { \
+ pr_debug("l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", \
+ __func__, __LINE__, (_t)->name, \
+ atomic_read(&_t->ref_count)); \
+ l2tp_tunnel_inc_refcount_1(_t); \
+} while (0)
+#define l2tp_tunnel_dec_refcount(_t) \
+do { \
+ pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \
+ __func__, __LINE__, (_t)->name, \
+ atomic_read(&_t->ref_count)); \
+ l2tp_tunnel_dec_refcount_1(_t); \
+} while (0)
+#else
+#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
+#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
+#endif
+
+/* Session hash global list for L2TPv3.
+ * The session_id SHOULD be random according to RFC3931, but several
+ * L2TP implementations use incrementing session_ids. So we do a real
+ * hash on the session_id, rather than a simple bitmask.
+ */
+static inline struct hlist_head *
+l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id)
+{
+ return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)];
+
+}
+
+/* Lookup the tunnel socket, possibly involving the fs code if the socket is
+ * owned by userspace. A struct sock returned from this function must be
+ * released using l2tp_tunnel_sock_put once you're done with it.
+ */
+static struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel)
+{
+ int err = 0;
+ struct socket *sock = NULL;
+ struct sock *sk = NULL;
+
+ if (!tunnel)
+ goto out;
+
+ if (tunnel->fd >= 0) {
+ /* Socket is owned by userspace, who might be in the process
+ * of closing it. Look the socket up using the fd to ensure
+ * consistency.
+ */
+ sock = sockfd_lookup(tunnel->fd, &err);
+ if (sock)
+ sk = sock->sk;
+ } else {
+ /* Socket is owned by kernelspace */
+ sk = tunnel->sock;
+ sock_hold(sk);
+ }
+
+out:
+ return sk;
+}
+
+/* Drop a reference to a tunnel socket obtained via. l2tp_tunnel_sock_put */
+static void l2tp_tunnel_sock_put(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+ if (tunnel) {
+ if (tunnel->fd >= 0) {
+ /* Socket is owned by userspace */
+ sockfd_put(sk->sk_socket);
+ }
+ sock_put(sk);
+ }
+ sock_put(sk);
+}
+
+/* Lookup a session by id in the global session list
+ */
+static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct hlist_head *session_list =
+ l2tp_session_id_hash_2(pn, session_id);
+ struct l2tp_session *session;
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu(session, session_list, global_hlist) {
+ if (session->session_id == session_id) {
+ rcu_read_unlock_bh();
+ return session;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+
+/* Session hash list.
+ * The session_id SHOULD be random according to RFC2661, but several
+ * L2TP implementations (Cisco and Microsoft) use incrementing
+ * session_ids. So we do a real hash on the session_id, rather than a
+ * simple bitmask.
+ */
+static inline struct hlist_head *
+l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id)
+{
+ return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)];
+}
+
+/* Lookup a session by id
+ */
+struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id)
+{
+ struct hlist_head *session_list;
+ struct l2tp_session *session;
+
+ /* In L2TPv3, session_ids are unique over all tunnels and we
+ * sometimes need to look them up before we know the
+ * tunnel.
+ */
+ if (tunnel == NULL)
+ return l2tp_session_find_2(net, session_id);
+
+ session_list = l2tp_session_id_hash(tunnel, session_id);
+ read_lock_bh(&tunnel->hlist_lock);
+ hlist_for_each_entry(session, session_list, hlist) {
+ if (session->session_id == session_id) {
+ read_unlock_bh(&tunnel->hlist_lock);
+ return session;
+ }
+ }
+ read_unlock_bh(&tunnel->hlist_lock);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find);
+
+struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
+{
+ int hash;
+ struct l2tp_session *session;
+ int count = 0;
+
+ read_lock_bh(&tunnel->hlist_lock);
+ for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
+ hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
+ if (++count > nth) {
+ read_unlock_bh(&tunnel->hlist_lock);
+ return session;
+ }
+ }
+ }
+
+ read_unlock_bh(&tunnel->hlist_lock);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find_nth);
+
+/* Lookup a session by interface name.
+ * This is very inefficient but is only used by management interfaces.
+ */
+struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+ int hash;
+ struct l2tp_session *session;
+
+ rcu_read_lock_bh();
+ for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
+ hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
+ if (!strcmp(session->ifname, ifname)) {
+ rcu_read_unlock_bh();
+ return session;
+ }
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname);
+
+/* Lookup a tunnel by id
+ */
+struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id)
+{
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_net *pn = l2tp_pernet(net);
+
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+ if (tunnel->tunnel_id == tunnel_id) {
+ rcu_read_unlock_bh();
+ return tunnel;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_find);
+
+struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_tunnel *tunnel;
+ int count = 0;
+
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+ if (++count > nth) {
+ rcu_read_unlock_bh();
+ return tunnel;
+ }
+ }
+
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
+
+/*****************************************************************************
+ * Receive data handling
+ *****************************************************************************/
+
+/* Queue a skb in order. We come here only if the skb has an L2TP sequence
+ * number.
+ */
+static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+ struct sk_buff *skbp;
+ struct sk_buff *tmp;
+ u32 ns = L2TP_SKB_CB(skb)->ns;
+
+ spin_lock_bh(&session->reorder_q.lock);
+ skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
+ if (L2TP_SKB_CB(skbp)->ns > ns) {
+ __skb_queue_before(&session->reorder_q, skbp, skb);
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n",
+ session->name, ns, L2TP_SKB_CB(skbp)->ns,
+ skb_queue_len(&session->reorder_q));
+ atomic_long_inc(&session->stats.rx_oos_packets);
+ goto out;
+ }
+ }
+
+ __skb_queue_tail(&session->reorder_q, skb);
+
+out:
+ spin_unlock_bh(&session->reorder_q.lock);
+}
+
+/* Dequeue a single skb.
+ */
+static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ int length = L2TP_SKB_CB(skb)->length;
+
+ /* We're about to requeue the skb, so return resources
+ * to its current owner (a socket receive buffer).
+ */
+ skb_orphan(skb);
+
+ atomic_long_inc(&tunnel->stats.rx_packets);
+ atomic_long_add(length, &tunnel->stats.rx_bytes);
+ atomic_long_inc(&session->stats.rx_packets);
+ atomic_long_add(length, &session->stats.rx_bytes);
+
+ if (L2TP_SKB_CB(skb)->has_seq) {
+ /* Bump our Nr */
+ session->nr++;
+ session->nr &= session->nr_max;
+
+ l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated nr to %hu\n",
+ session->name, session->nr);
+ }
+
+ /* call private receive handler */
+ if (session->recv_skb != NULL)
+ (*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length);
+ else
+ kfree_skb(skb);
+
+ if (session->deref)
+ (*session->deref)(session);
+}
+
+/* Dequeue skbs from the session's reorder_q, subject to packet order.
+ * Skbs that have been in the queue for too long are simply discarded.
+ */
+static void l2tp_recv_dequeue(struct l2tp_session *session)
+{
+ struct sk_buff *skb;
+ struct sk_buff *tmp;
+
+ /* If the pkt at the head of the queue has the nr that we
+ * expect to send up next, dequeue it and any other
+ * in-sequence packets behind it.
+ */
+start:
+ spin_lock_bh(&session->reorder_q.lock);
+ skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
+ if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
+ atomic_long_inc(&session->stats.rx_seq_discards);
+ atomic_long_inc(&session->stats.rx_errors);
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n",
+ session->name, L2TP_SKB_CB(skb)->ns,
+ L2TP_SKB_CB(skb)->length, session->nr,
+ skb_queue_len(&session->reorder_q));
+ session->reorder_skip = 1;
+ __skb_unlink(skb, &session->reorder_q);
+ kfree_skb(skb);
+ if (session->deref)
+ (*session->deref)(session);
+ continue;
+ }
+
+ if (L2TP_SKB_CB(skb)->has_seq) {
+ if (session->reorder_skip) {
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: advancing nr to next pkt: %u -> %u",
+ session->name, session->nr,
+ L2TP_SKB_CB(skb)->ns);
+ session->reorder_skip = 0;
+ session->nr = L2TP_SKB_CB(skb)->ns;
+ }
+ if (L2TP_SKB_CB(skb)->ns != session->nr) {
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: holding oos pkt %u len %d, waiting for %u, reorder_q_len=%d\n",
+ session->name, L2TP_SKB_CB(skb)->ns,
+ L2TP_SKB_CB(skb)->length, session->nr,
+ skb_queue_len(&session->reorder_q));
+ goto out;
+ }
+ }
+ __skb_unlink(skb, &session->reorder_q);
+
+ /* Process the skb. We release the queue lock while we
+ * do so to let other contexts process the queue.
+ */
+ spin_unlock_bh(&session->reorder_q.lock);
+ l2tp_recv_dequeue_skb(session, skb);
+ goto start;
+ }
+
+out:
+ spin_unlock_bh(&session->reorder_q.lock);
+}
+
+static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr)
+{
+ u32 nws;
+
+ if (nr >= session->nr)
+ nws = nr - session->nr;
+ else
+ nws = (session->nr_max + 1) - (session->nr - nr);
+
+ return nws < session->nr_window_size;
+}
+
+/* If packet has sequence numbers, queue it if acceptable. Returns 0 if
+ * acceptable, else non-zero.
+ */
+static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
+{
+ if (!l2tp_seq_check_rx_window(session, L2TP_SKB_CB(skb)->ns)) {
+ /* Packet sequence number is outside allowed window.
+ * Discard it.
+ */
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: pkt %u len %d discarded, outside window, nr=%u\n",
+ session->name, L2TP_SKB_CB(skb)->ns,
+ L2TP_SKB_CB(skb)->length, session->nr);
+ goto discard;
+ }
+
+ if (session->reorder_timeout != 0) {
+ /* Packet reordering enabled. Add skb to session's
+ * reorder queue, in order of ns.
+ */
+ l2tp_recv_queue_skb(session, skb);
+ goto out;
+ }
+
+ /* Packet reordering disabled. Discard out-of-sequence packets, while
+ * tracking the number if in-sequence packets after the first OOS packet
+ * is seen. After nr_oos_count_max in-sequence packets, reset the
+ * sequence number to re-enable packet reception.
+ */
+ if (L2TP_SKB_CB(skb)->ns == session->nr) {
+ skb_queue_tail(&session->reorder_q, skb);
+ } else {
+ u32 nr_oos = L2TP_SKB_CB(skb)->ns;
+ u32 nr_next = (session->nr_oos + 1) & session->nr_max;
+
+ if (nr_oos == nr_next)
+ session->nr_oos_count++;
+ else
+ session->nr_oos_count = 0;
+
+ session->nr_oos = nr_oos;
+ if (session->nr_oos_count > session->nr_oos_count_max) {
+ session->reorder_skip = 1;
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: %d oos packets received. Resetting sequence numbers\n",
+ session->name, session->nr_oos_count);
+ }
+ if (!session->reorder_skip) {
+ atomic_long_inc(&session->stats.rx_seq_discards);
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n",
+ session->name, L2TP_SKB_CB(skb)->ns,
+ L2TP_SKB_CB(skb)->length, session->nr,
+ skb_queue_len(&session->reorder_q));
+ goto discard;
+ }
+ skb_queue_tail(&session->reorder_q, skb);
+ }
+
+out:
+ return 0;
+
+discard:
+ return 1;
+}
+
+/* Do receive processing of L2TP data frames. We handle both L2TPv2
+ * and L2TPv3 data frames here.
+ *
+ * L2TPv2 Data Message Header
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |T|L|x|x|S|x|O|P|x|x|x|x| Ver | Length (opt) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Tunnel ID | Session ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Ns (opt) | Nr (opt) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Offset Size (opt) | Offset pad... (opt)
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Data frames are marked by T=0. All other fields are the same as
+ * those in L2TP control frames.
+ *
+ * L2TPv3 Data Message Header
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | L2TP Session Header |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | L2-Specific Sublayer |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Tunnel Payload ...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 Session Header Over IP
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Session ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Cookie (optional, maximum 64 bits)...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 L2-Specific Sublayer Format
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x|S|x|x|x|x|x|x| Sequence Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Cookie value, sublayer format and offset (pad) are negotiated with
+ * the peer when the session is set up. Unlike L2TPv2, we do not need
+ * to parse the packet header to determine if optional fields are
+ * present.
+ *
+ * Caller must already have parsed the frame and determined that it is
+ * a data (not control) frame before coming here. Fields up to the
+ * session-id have already been parsed and ptr points to the data
+ * after the session-id.
+ */
+void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
+ unsigned char *ptr, unsigned char *optr, u16 hdrflags,
+ int length, int (*payload_hook)(struct sk_buff *skb))
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ int offset;
+ u32 ns, nr;
+
+ /* The ref count is increased since we now hold a pointer to
+ * the session. Take care to decrement the refcnt when exiting
+ * this function from now on...
+ */
+ l2tp_session_inc_refcount(session);
+ if (session->ref)
+ (*session->ref)(session);
+
+ /* Parse and check optional cookie */
+ if (session->peer_cookie_len > 0) {
+ if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
+ l2tp_info(tunnel, L2TP_MSG_DATA,
+ "%s: cookie mismatch (%u/%u). Discarding.\n",
+ tunnel->name, tunnel->tunnel_id,
+ session->session_id);
+ atomic_long_inc(&session->stats.rx_cookie_discards);
+ goto discard;
+ }
+ ptr += session->peer_cookie_len;
+ }
+
+ /* Handle the optional sequence numbers. Sequence numbers are
+ * in different places for L2TPv2 and L2TPv3.
+ *
+ * If we are the LAC, enable/disable sequence numbers under
+ * the control of the LNS. If no sequence numbers present but
+ * we were expecting them, discard frame.
+ */
+ ns = nr = 0;
+ L2TP_SKB_CB(skb)->has_seq = 0;
+ if (tunnel->version == L2TP_HDR_VER_2) {
+ if (hdrflags & L2TP_HDRFLAG_S) {
+ ns = ntohs(*(__be16 *) ptr);
+ ptr += 2;
+ nr = ntohs(*(__be16 *) ptr);
+ ptr += 2;
+
+ /* Store L2TP info in the skb */
+ L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->has_seq = 1;
+
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: recv data ns=%u, nr=%u, session nr=%u\n",
+ session->name, ns, nr, session->nr);
+ }
+ } else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
+ u32 l2h = ntohl(*(__be32 *) ptr);
+
+ if (l2h & 0x40000000) {
+ ns = l2h & 0x00ffffff;
+
+ /* Store L2TP info in the skb */
+ L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->has_seq = 1;
+
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: recv data ns=%u, session nr=%u\n",
+ session->name, ns, session->nr);
+ }
+ }
+
+ /* Advance past L2-specific header, if present */
+ ptr += session->l2specific_len;
+
+ if (L2TP_SKB_CB(skb)->has_seq) {
+ /* Received a packet with sequence numbers. If we're the LNS,
+ * check if we sre sending sequence numbers and if not,
+ * configure it so.
+ */
+ if ((!session->lns_mode) && (!session->send_seq)) {
+ l2tp_info(session, L2TP_MSG_SEQ,
+ "%s: requested to enable seq numbers by LNS\n",
+ session->name);
+ session->send_seq = -1;
+ l2tp_session_set_header_len(session, tunnel->version);
+ }
+ } else {
+ /* No sequence numbers.
+ * If user has configured mandatory sequence numbers, discard.
+ */
+ if (session->recv_seq) {
+ l2tp_warn(session, L2TP_MSG_SEQ,
+ "%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
+ atomic_long_inc(&session->stats.rx_seq_discards);
+ goto discard;
+ }
+
+ /* If we're the LAC and we're sending sequence numbers, the
+ * LNS has requested that we no longer send sequence numbers.
+ * If we're the LNS and we're sending sequence numbers, the
+ * LAC is broken. Discard the frame.
+ */
+ if ((!session->lns_mode) && (session->send_seq)) {
+ l2tp_info(session, L2TP_MSG_SEQ,
+ "%s: requested to disable seq numbers by LNS\n",
+ session->name);
+ session->send_seq = 0;
+ l2tp_session_set_header_len(session, tunnel->version);
+ } else if (session->send_seq) {
+ l2tp_warn(session, L2TP_MSG_SEQ,
+ "%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
+ atomic_long_inc(&session->stats.rx_seq_discards);
+ goto discard;
+ }
+ }
+
+ /* Session data offset is handled differently for L2TPv2 and
+ * L2TPv3. For L2TPv2, there is an optional 16-bit value in
+ * the header. For L2TPv3, the offset is negotiated using AVPs
+ * in the session setup control protocol.
+ */
+ if (tunnel->version == L2TP_HDR_VER_2) {
+ /* If offset bit set, skip it. */
+ if (hdrflags & L2TP_HDRFLAG_O) {
+ offset = ntohs(*(__be16 *)ptr);
+ ptr += 2 + offset;
+ }
+ } else
+ ptr += session->offset;
+
+ offset = ptr - optr;
+ if (!pskb_may_pull(skb, offset))
+ goto discard;
+
+ __skb_pull(skb, offset);
+
+ /* If caller wants to process the payload before we queue the
+ * packet, do so now.
+ */
+ if (payload_hook)
+ if ((*payload_hook)(skb))
+ goto discard;
+
+ /* Prepare skb for adding to the session's reorder_q. Hold
+ * packets for max reorder_timeout or 1 second if not
+ * reordering.
+ */
+ L2TP_SKB_CB(skb)->length = length;
+ L2TP_SKB_CB(skb)->expires = jiffies +
+ (session->reorder_timeout ? session->reorder_timeout : HZ);
+
+ /* Add packet to the session's receive queue. Reordering is done here, if
+ * enabled. Saved L2TP protocol info is stored in skb->sb[].
+ */
+ if (L2TP_SKB_CB(skb)->has_seq) {
+ if (l2tp_recv_data_seq(session, skb))
+ goto discard;
+ } else {
+ /* No sequence numbers. Add the skb to the tail of the
+ * reorder queue. This ensures that it will be
+ * delivered after all previous sequenced skbs.
+ */
+ skb_queue_tail(&session->reorder_q, skb);
+ }
+
+ /* Try to dequeue as many skbs from reorder_q as we can. */
+ l2tp_recv_dequeue(session);
+
+ l2tp_session_dec_refcount(session);
+
+ return;
+
+discard:
+ atomic_long_inc(&session->stats.rx_errors);
+ kfree_skb(skb);
+
+ if (session->deref)
+ (*session->deref)(session);
+
+ l2tp_session_dec_refcount(session);
+}
+EXPORT_SYMBOL(l2tp_recv_common);
+
+/* Drop skbs from the session's reorder_q
+ */
+int l2tp_session_queue_purge(struct l2tp_session *session)
+{
+ struct sk_buff *skb = NULL;
+ BUG_ON(!session);
+ BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+ while ((skb = skb_dequeue(&session->reorder_q))) {
+ atomic_long_inc(&session->stats.rx_errors);
+ kfree_skb(skb);
+ if (session->deref)
+ (*session->deref)(session);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
+
+/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
+ * here. The skb is not on a list when we get here.
+ * Returns 0 if the packet was a data packet and was successfully passed on.
+ * Returns 1 if the packet was not a good data packet and could not be
+ * forwarded. All such packets are passed up to userspace to deal with.
+ */
+static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
+ int (*payload_hook)(struct sk_buff *skb))
+{
+ struct l2tp_session *session = NULL;
+ unsigned char *ptr, *optr;
+ u16 hdrflags;
+ u32 tunnel_id, session_id;
+ u16 version;
+ int length;
+
+ /* UDP has verifed checksum */
+
+ /* UDP always verifies the packet length. */
+ __skb_pull(skb, sizeof(struct udphdr));
+
+ /* Short packet? */
+ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) {
+ l2tp_info(tunnel, L2TP_MSG_DATA,
+ "%s: recv short packet (len=%d)\n",
+ tunnel->name, skb->len);
+ goto error;
+ }
+
+ /* Trace packet contents, if enabled */
+ if (tunnel->debug & L2TP_MSG_DATA) {
+ length = min(32u, skb->len);
+ if (!pskb_may_pull(skb, length))
+ goto error;
+
+ pr_debug("%s: recv\n", tunnel->name);
+ print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
+ }
+
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
+
+ /* Get L2TP header flags */
+ hdrflags = ntohs(*(__be16 *) ptr);
+
+ /* Check protocol version */
+ version = hdrflags & L2TP_HDR_VER_MASK;
+ if (version != tunnel->version) {
+ l2tp_info(tunnel, L2TP_MSG_DATA,
+ "%s: recv protocol version mismatch: got %d expected %d\n",
+ tunnel->name, version, tunnel->version);
+ goto error;
+ }
+
+ /* Get length of L2TP packet */
+ length = skb->len;
+
+ /* If type is control packet, it is handled by userspace. */
+ if (hdrflags & L2TP_HDRFLAG_T) {
+ l2tp_dbg(tunnel, L2TP_MSG_DATA,
+ "%s: recv control packet, len=%d\n",
+ tunnel->name, length);
+ goto error;
+ }
+
+ /* Skip flags */
+ ptr += 2;
+
+ if (tunnel->version == L2TP_HDR_VER_2) {
+ /* If length is present, skip it */
+ if (hdrflags & L2TP_HDRFLAG_L)
+ ptr += 2;
+
+ /* Extract tunnel and session ID */
+ tunnel_id = ntohs(*(__be16 *) ptr);
+ ptr += 2;
+ session_id = ntohs(*(__be16 *) ptr);
+ ptr += 2;
+ } else {
+ ptr += 2; /* skip reserved bits */
+ tunnel_id = tunnel->tunnel_id;
+ session_id = ntohl(*(__be32 *) ptr);
+ ptr += 4;
+ }
+
+ /* Find the session context */
+ session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id);
+ if (!session || !session->recv_skb) {
+ /* Not found? Pass to userspace to deal with */
+ l2tp_info(tunnel, L2TP_MSG_DATA,
+ "%s: no session found (%u/%u). Passing up.\n",
+ tunnel->name, tunnel_id, session_id);
+ goto error;
+ }
+
+ l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+
+ return 0;
+
+error:
+ /* Put UDP header back */
+ __skb_push(skb, sizeof(struct udphdr));
+
+ return 1;
+}
+
+/* UDP encapsulation receive handler. See net/ipv4/udp.c.
+ * Return codes:
+ * 0 : success.
+ * <0: error
+ * >0: skb should be passed up to userspace as UDP.
+ */
+int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct l2tp_tunnel *tunnel;
+
+ tunnel = l2tp_sock_to_tunnel(sk);
+ if (tunnel == NULL)
+ goto pass_up;
+
+ l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
+ tunnel->name, skb->len);
+
+ if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook))
+ goto pass_up_put;
+
+ sock_put(sk);
+ return 0;
+
+pass_up_put:
+ sock_put(sk);
+pass_up:
+ return 1;
+}
+EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv);
+
+/************************************************************************
+ * Transmit handling
+ ***********************************************************************/
+
+/* Build an L2TP header for the session into the buffer provided.
+ */
+static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ __be16 *bufp = buf;
+ __be16 *optr = buf;
+ u16 flags = L2TP_HDR_VER_2;
+ u32 tunnel_id = tunnel->peer_tunnel_id;
+ u32 session_id = session->peer_session_id;
+
+ if (session->send_seq)
+ flags |= L2TP_HDRFLAG_S;
+
+ /* Setup L2TP header. */
+ *bufp++ = htons(flags);
+ *bufp++ = htons(tunnel_id);
+ *bufp++ = htons(session_id);
+ if (session->send_seq) {
+ *bufp++ = htons(session->ns);
+ *bufp++ = 0;
+ session->ns++;
+ session->ns &= 0xffff;
+ l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated ns to %u\n",
+ session->name, session->ns);
+ }
+
+ return bufp - optr;
+}
+
+static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ char *bufp = buf;
+ char *optr = bufp;
+
+ /* Setup L2TP header. The header differs slightly for UDP and
+ * IP encapsulations. For UDP, there is 4 bytes of flags.
+ */
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
+ u16 flags = L2TP_HDR_VER_3;
+ *((__be16 *) bufp) = htons(flags);
+ bufp += 2;
+ *((__be16 *) bufp) = 0;
+ bufp += 2;
+ }
+
+ *((__be32 *) bufp) = htonl(session->peer_session_id);
+ bufp += 4;
+ if (session->cookie_len) {
+ memcpy(bufp, &session->cookie[0], session->cookie_len);
+ bufp += session->cookie_len;
+ }
+ if (session->l2specific_len) {
+ if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
+ u32 l2h = 0;
+ if (session->send_seq) {
+ l2h = 0x40000000 | session->ns;
+ session->ns++;
+ session->ns &= 0xffffff;
+ l2tp_dbg(session, L2TP_MSG_SEQ,
+ "%s: updated ns to %u\n",
+ session->name, session->ns);
+ }
+
+ *((__be32 *) bufp) = htonl(l2h);
+ }
+ bufp += session->l2specific_len;
+ }
+ if (session->offset)
+ bufp += session->offset;
+
+ return bufp - optr;
+}
+
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
+ struct flowi *fl, size_t data_len)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ unsigned int len = skb->len;
+ int error;
+
+ /* Debug */
+ if (session->send_seq)
+ l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes, ns=%u\n",
+ session->name, data_len, session->ns - 1);
+ else
+ l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %Zd bytes\n",
+ session->name, data_len);
+
+ if (session->debug & L2TP_MSG_DATA) {
+ int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+ unsigned char *datap = skb->data + uhlen;
+
+ pr_debug("%s: xmit\n", session->name);
+ print_hex_dump_bytes("", DUMP_PREFIX_OFFSET,
+ datap, min_t(size_t, 32, len - uhlen));
+ }
+
+ /* Queue the packet to IP for output */
+ skb->ignore_df = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped)
+ error = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ else
+#endif
+ error = ip_queue_xmit(tunnel->sock, skb, fl);
+
+ /* Update stats */
+ if (error >= 0) {
+ atomic_long_inc(&tunnel->stats.tx_packets);
+ atomic_long_add(len, &tunnel->stats.tx_bytes);
+ atomic_long_inc(&session->stats.tx_packets);
+ atomic_long_add(len, &session->stats.tx_bytes);
+ } else {
+ atomic_long_inc(&tunnel->stats.tx_errors);
+ atomic_long_inc(&session->stats.tx_errors);
+ }
+
+ return 0;
+}
+
+/* If caller requires the skb to have a ppp header, the header must be
+ * inserted in the skb data before calling this function.
+ */
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len)
+{
+ int data_len = skb->len;
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ struct sock *sk = tunnel->sock;
+ struct flowi *fl;
+ struct udphdr *uh;
+ struct inet_sock *inet;
+ int headroom;
+ int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+ int udp_len;
+ int ret = NET_XMIT_SUCCESS;
+
+ /* Check that there's enough headroom in the skb to insert IP,
+ * UDP and L2TP headers. If not enough, expand it to
+ * make room. Adjust truesize.
+ */
+ headroom = NET_SKB_PAD + sizeof(struct iphdr) +
+ uhlen + hdr_len;
+ if (skb_cow_head(skb, headroom)) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ /* Setup L2TP header */
+ session->build_header(session, __skb_push(skb, hdr_len));
+
+ /* Reset skb netfilter state */
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+ IPSKB_REROUTED);
+ nf_reset(skb);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ kfree_skb(skb);
+ ret = NET_XMIT_DROP;
+ goto out_unlock;
+ }
+
+ /* Get routing info from the tunnel socket */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0)));
+
+ inet = inet_sk(sk);
+ fl = &inet->cork.fl;
+ switch (tunnel->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ /* Setup UDP header */
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+ uh->source = inet->inet_sport;
+ uh->dest = inet->inet_dport;
+ udp_len = uhlen + hdr_len + data_len;
+ uh->len = htons(udp_len);
+
+ /* Calculate UDP checksum if configured to do so */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == PF_INET6 && !tunnel->v4mapped)
+ udp6_set_csum(udp_get_no_check6_tx(sk),
+ skb, &inet6_sk(sk)->saddr,
+ &sk->sk_v6_daddr, udp_len);
+ else
+#endif
+ udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr,
+ inet->inet_daddr, udp_len);
+ break;
+
+ case L2TP_ENCAPTYPE_IP:
+ break;
+ }
+
+ l2tp_xmit_core(session, skb, fl, data_len);
+out_unlock:
+ bh_unlock_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
+
+/*****************************************************************************
+ * Tinnel and session create/destroy.
+ *****************************************************************************/
+
+/* Tunnel socket destruct hook.
+ * The tunnel context is deleted only when all session sockets have been
+ * closed.
+ */
+static void l2tp_tunnel_destruct(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_net *pn;
+
+ if (tunnel == NULL)
+ goto end;
+
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
+
+
+ /* Disable udp encapsulation */
+ switch (tunnel->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ /* No longer an encapsulation socket. See net/ipv4/udp.c */
+ (udp_sk(sk))->encap_type = 0;
+ (udp_sk(sk))->encap_rcv = NULL;
+ (udp_sk(sk))->encap_destroy = NULL;
+ break;
+ case L2TP_ENCAPTYPE_IP:
+ break;
+ }
+
+ /* Remove hooks into tunnel socket */
+ sk->sk_destruct = tunnel->old_sk_destruct;
+ sk->sk_user_data = NULL;
+ tunnel->sock = NULL;
+
+ /* Remove the tunnel struct from the tunnel list */
+ pn = l2tp_pernet(tunnel->l2tp_net);
+ spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+ list_del_rcu(&tunnel->list);
+ spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+ atomic_dec(&l2tp_tunnel_count);
+
+ l2tp_tunnel_closeall(tunnel);
+ l2tp_tunnel_dec_refcount(tunnel);
+
+ /* Call the original destructor */
+ if (sk->sk_destruct)
+ (*sk->sk_destruct)(sk);
+end:
+ return;
+}
+
+/* When the tunnel is closed, all the attached sessions need to go too.
+ */
+void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+{
+ int hash;
+ struct hlist_node *walk;
+ struct hlist_node *tmp;
+ struct l2tp_session *session;
+
+ BUG_ON(tunnel == NULL);
+
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n",
+ tunnel->name);
+
+ write_lock_bh(&tunnel->hlist_lock);
+ for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
+again:
+ hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
+ session = hlist_entry(walk, struct l2tp_session, hlist);
+
+ l2tp_info(session, L2TP_MSG_CONTROL,
+ "%s: closing session\n", session->name);
+
+ hlist_del_init(&session->hlist);
+
+ if (session->ref != NULL)
+ (*session->ref)(session);
+
+ write_unlock_bh(&tunnel->hlist_lock);
+
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
+
+ if (session->session_close != NULL)
+ (*session->session_close)(session);
+
+ if (session->deref != NULL)
+ (*session->deref)(session);
+
+ l2tp_session_dec_refcount(session);
+
+ write_lock_bh(&tunnel->hlist_lock);
+
+ /* Now restart from the beginning of this hash
+ * chain. We always remove a session from the
+ * list so we are guaranteed to make forward
+ * progress.
+ */
+ goto again;
+ }
+ }
+ write_unlock_bh(&tunnel->hlist_lock);
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
+
+/* Tunnel socket destroy hook for UDP encapsulation */
+static void l2tp_udp_encap_destroy(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+}
+
+/* Really kill the tunnel.
+ * Come here only when all sessions have been cleared from the tunnel.
+ */
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
+{
+ BUG_ON(atomic_read(&tunnel->ref_count) != 0);
+ BUG_ON(tunnel->sock != NULL);
+ l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: free...\n", tunnel->name);
+ kfree_rcu(tunnel, rcu);
+}
+
+/* Workqueue tunnel deletion function */
+static void l2tp_tunnel_del_work(struct work_struct *work)
+{
+ struct l2tp_tunnel *tunnel = NULL;
+ struct socket *sock = NULL;
+ struct sock *sk = NULL;
+
+ tunnel = container_of(work, struct l2tp_tunnel, del_work);
+ sk = l2tp_tunnel_sock_lookup(tunnel);
+ if (!sk)
+ return;
+
+ sock = sk->sk_socket;
+
+ /* If the tunnel socket was created by userspace, then go through the
+ * inet layer to shut the socket down, and let userspace close it.
+ * Otherwise, if we created the socket directly within the kernel, use
+ * the sk API to release it here.
+ * In either case the tunnel resources are freed in the socket
+ * destructor when the tunnel socket goes away.
+ */
+ if (tunnel->fd >= 0) {
+ if (sock)
+ inet_shutdown(sock, 2);
+ } else {
+ if (sock)
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sk);
+ }
+
+ l2tp_tunnel_sock_put(sk);
+}
+
+/* Create a socket for the tunnel, if one isn't set up by
+ * userspace. This is used for static tunnels where there is no
+ * managing L2TP daemon.
+ *
+ * Since we don't want these sockets to keep a namespace alive by
+ * themselves, we drop the socket's namespace refcount after creation.
+ * These sockets are freed when the namespace exits using the pernet
+ * exit hook.
+ */
+static int l2tp_tunnel_sock_create(struct net *net,
+ u32 tunnel_id,
+ u32 peer_tunnel_id,
+ struct l2tp_tunnel_cfg *cfg,
+ struct socket **sockp)
+{
+ int err = -EINVAL;
+ struct socket *sock = NULL;
+ struct udp_port_cfg udp_conf;
+
+ switch (cfg->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (cfg->local_ip6 && cfg->peer_ip6) {
+ udp_conf.family = AF_INET6;
+ memcpy(&udp_conf.local_ip6, cfg->local_ip6,
+ sizeof(udp_conf.local_ip6));
+ memcpy(&udp_conf.peer_ip6, cfg->peer_ip6,
+ sizeof(udp_conf.peer_ip6));
+ udp_conf.use_udp6_tx_checksums =
+ cfg->udp6_zero_tx_checksums;
+ udp_conf.use_udp6_rx_checksums =
+ cfg->udp6_zero_rx_checksums;
+ } else
+#endif
+ {
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip = cfg->local_ip;
+ udp_conf.peer_ip = cfg->peer_ip;
+ udp_conf.use_udp_checksums = cfg->use_udp_checksums;
+ }
+
+ udp_conf.local_udp_port = htons(cfg->local_udp_port);
+ udp_conf.peer_udp_port = htons(cfg->peer_udp_port);
+
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err < 0)
+ goto out;
+
+ break;
+
+ case L2TP_ENCAPTYPE_IP:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (cfg->local_ip6 && cfg->peer_ip6) {
+ struct sockaddr_l2tpip6 ip6_addr = {0};
+
+ err = sock_create_kern(AF_INET6, SOCK_DGRAM,
+ IPPROTO_L2TP, &sock);
+ if (err < 0)
+ goto out;
+
+ sk_change_net(sock->sk, net);
+
+ ip6_addr.l2tp_family = AF_INET6;
+ memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
+ sizeof(ip6_addr.l2tp_addr));
+ ip6_addr.l2tp_conn_id = tunnel_id;
+ err = kernel_bind(sock, (struct sockaddr *) &ip6_addr,
+ sizeof(ip6_addr));
+ if (err < 0)
+ goto out;
+
+ ip6_addr.l2tp_family = AF_INET6;
+ memcpy(&ip6_addr.l2tp_addr, cfg->peer_ip6,
+ sizeof(ip6_addr.l2tp_addr));
+ ip6_addr.l2tp_conn_id = peer_tunnel_id;
+ err = kernel_connect(sock,
+ (struct sockaddr *) &ip6_addr,
+ sizeof(ip6_addr), 0);
+ if (err < 0)
+ goto out;
+ } else
+#endif
+ {
+ struct sockaddr_l2tpip ip_addr = {0};
+
+ err = sock_create_kern(AF_INET, SOCK_DGRAM,
+ IPPROTO_L2TP, &sock);
+ if (err < 0)
+ goto out;
+
+ sk_change_net(sock->sk, net);
+
+ ip_addr.l2tp_family = AF_INET;
+ ip_addr.l2tp_addr = cfg->local_ip;
+ ip_addr.l2tp_conn_id = tunnel_id;
+ err = kernel_bind(sock, (struct sockaddr *) &ip_addr,
+ sizeof(ip_addr));
+ if (err < 0)
+ goto out;
+
+ ip_addr.l2tp_family = AF_INET;
+ ip_addr.l2tp_addr = cfg->peer_ip;
+ ip_addr.l2tp_conn_id = peer_tunnel_id;
+ err = kernel_connect(sock, (struct sockaddr *) &ip_addr,
+ sizeof(ip_addr), 0);
+ if (err < 0)
+ goto out;
+ }
+ break;
+
+ default:
+ goto out;
+ }
+
+out:
+ *sockp = sock;
+ if ((err < 0) && sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ *sockp = NULL;
+ }
+
+ return err;
+}
+
+static struct lock_class_key l2tp_socket_class;
+
+int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
+{
+ struct l2tp_tunnel *tunnel = NULL;
+ int err;
+ struct socket *sock = NULL;
+ struct sock *sk = NULL;
+ struct l2tp_net *pn;
+ enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP;
+
+ /* Get the tunnel socket from the fd, which was opened by
+ * the userspace L2TP daemon. If not specified, create a
+ * kernel socket.
+ */
+ if (fd < 0) {
+ err = l2tp_tunnel_sock_create(net, tunnel_id, peer_tunnel_id,
+ cfg, &sock);
+ if (err < 0)
+ goto err;
+ } else {
+ sock = sockfd_lookup(fd, &err);
+ if (!sock) {
+ pr_err("tunl %u: sockfd_lookup(fd=%d) returned %d\n",
+ tunnel_id, fd, err);
+ err = -EBADF;
+ goto err;
+ }
+
+ /* Reject namespace mismatches */
+ if (!net_eq(sock_net(sock->sk), net)) {
+ pr_err("tunl %u: netns mismatch\n", tunnel_id);
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ sk = sock->sk;
+
+ if (cfg != NULL)
+ encap = cfg->encap;
+
+ /* Quick sanity checks */
+ switch (encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ err = -EPROTONOSUPPORT;
+ if (sk->sk_protocol != IPPROTO_UDP) {
+ pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n",
+ tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP);
+ goto err;
+ }
+ break;
+ case L2TP_ENCAPTYPE_IP:
+ err = -EPROTONOSUPPORT;
+ if (sk->sk_protocol != IPPROTO_L2TP) {
+ pr_err("tunl %hu: fd %d wrong protocol, got %d, expected %d\n",
+ tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP);
+ goto err;
+ }
+ break;
+ }
+
+ /* Check if this socket has already been prepped */
+ tunnel = l2tp_tunnel(sk);
+ if (tunnel != NULL) {
+ /* This socket has already been prepped */
+ err = -EBUSY;
+ goto err;
+ }
+
+ tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL);
+ if (tunnel == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ tunnel->version = version;
+ tunnel->tunnel_id = tunnel_id;
+ tunnel->peer_tunnel_id = peer_tunnel_id;
+ tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS;
+
+ tunnel->magic = L2TP_TUNNEL_MAGIC;
+ sprintf(&tunnel->name[0], "tunl %u", tunnel_id);
+ rwlock_init(&tunnel->hlist_lock);
+
+ /* The net we belong to */
+ tunnel->l2tp_net = net;
+ pn = l2tp_pernet(net);
+
+ if (cfg != NULL)
+ tunnel->debug = cfg->debug;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == PF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (ipv6_addr_v4mapped(&np->saddr) &&
+ ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ tunnel->v4mapped = true;
+ inet->inet_saddr = np->saddr.s6_addr32[3];
+ inet->inet_rcv_saddr = sk->sk_v6_rcv_saddr.s6_addr32[3];
+ inet->inet_daddr = sk->sk_v6_daddr.s6_addr32[3];
+ } else {
+ tunnel->v4mapped = false;
+ }
+ }
+#endif
+
+ /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
+ tunnel->encap = encap;
+ if (encap == L2TP_ENCAPTYPE_UDP) {
+ struct udp_tunnel_sock_cfg udp_cfg;
+
+ udp_cfg.sk_user_data = tunnel;
+ udp_cfg.encap_type = UDP_ENCAP_L2TPINUDP;
+ udp_cfg.encap_rcv = l2tp_udp_encap_recv;
+ udp_cfg.encap_destroy = l2tp_udp_encap_destroy;
+
+ setup_udp_tunnel_sock(net, sock, &udp_cfg);
+ } else {
+ sk->sk_user_data = tunnel;
+ }
+
+ /* Hook on the tunnel socket destructor so that we can cleanup
+ * if the tunnel socket goes away.
+ */
+ tunnel->old_sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = &l2tp_tunnel_destruct;
+ tunnel->sock = sk;
+ tunnel->fd = fd;
+ lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock");
+
+ sk->sk_allocation = GFP_ATOMIC;
+
+ /* Init delete workqueue struct */
+ INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work);
+
+ /* Add tunnel to our list */
+ INIT_LIST_HEAD(&tunnel->list);
+ atomic_inc(&l2tp_tunnel_count);
+
+ /* Bump the reference count. The tunnel context is deleted
+ * only when this drops to zero. Must be done before list insertion
+ */
+ l2tp_tunnel_inc_refcount(tunnel);
+ spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+ list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
+ spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+
+ err = 0;
+err:
+ if (tunnelp)
+ *tunnelp = tunnel;
+
+ /* If tunnel's socket was created by the kernel, it doesn't
+ * have a file.
+ */
+ if (sock && sock->file)
+ sockfd_put(sock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
+
+/* This function is used by the netlink TUNNEL_DELETE command.
+ */
+int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
+{
+ l2tp_tunnel_closeall(tunnel);
+ return (false == queue_work(l2tp_wq, &tunnel->del_work));
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
+
+/* Really kill the session.
+ */
+void l2tp_session_free(struct l2tp_session *session)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+
+ BUG_ON(atomic_read(&session->ref_count) != 0);
+
+ if (tunnel) {
+ BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+ if (session->session_id != 0)
+ atomic_dec(&l2tp_session_count);
+ sock_put(tunnel->sock);
+ session->tunnel = NULL;
+ l2tp_tunnel_dec_refcount(tunnel);
+ }
+
+ kfree(session);
+}
+EXPORT_SYMBOL_GPL(l2tp_session_free);
+
+/* Remove an l2tp session from l2tp_core's hash lists.
+ * Provides a tidyup interface for pseudowire code which can't just route all
+ * shutdown via. l2tp_session_delete and a pseudowire-specific session_close
+ * callback.
+ */
+void __l2tp_session_unhash(struct l2tp_session *session)
+{
+ struct l2tp_tunnel *tunnel = session->tunnel;
+
+ /* Remove the session from core hashes */
+ if (tunnel) {
+ /* Remove from the per-tunnel hash */
+ write_lock_bh(&tunnel->hlist_lock);
+ hlist_del_init(&session->hlist);
+ write_unlock_bh(&tunnel->hlist_lock);
+
+ /* For L2TPv3 we have a per-net hash: remove from there, too */
+ if (tunnel->version != L2TP_HDR_VER_2) {
+ struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+ spin_lock_bh(&pn->l2tp_session_hlist_lock);
+ hlist_del_init_rcu(&session->global_hlist);
+ spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+ synchronize_rcu();
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(__l2tp_session_unhash);
+
+/* This function is used by the netlink SESSION_DELETE command and by
+ pseudowire modules.
+ */
+int l2tp_session_delete(struct l2tp_session *session)
+{
+ if (session->ref)
+ (*session->ref)(session);
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
+ if (session->session_close != NULL)
+ (*session->session_close)(session);
+ if (session->deref)
+ (*session->deref)(session);
+ l2tp_session_dec_refcount(session);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_delete);
+
+/* We come here whenever a session's send_seq, cookie_len or
+ * l2specific_len parameters are set.
+ */
+void l2tp_session_set_header_len(struct l2tp_session *session, int version)
+{
+ if (version == L2TP_HDR_VER_2) {
+ session->hdr_len = 6;
+ if (session->send_seq)
+ session->hdr_len += 4;
+ } else {
+ session->hdr_len = 4 + session->cookie_len + session->l2specific_len + session->offset;
+ if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ session->hdr_len += 4;
+ }
+
+}
+EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
+
+struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+ struct l2tp_session *session;
+
+ session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
+ if (session != NULL) {
+ session->magic = L2TP_SESSION_MAGIC;
+ session->tunnel = tunnel;
+
+ session->session_id = session_id;
+ session->peer_session_id = peer_session_id;
+ session->nr = 0;
+ if (tunnel->version == L2TP_HDR_VER_2)
+ session->nr_max = 0xffff;
+ else
+ session->nr_max = 0xffffff;
+ session->nr_window_size = session->nr_max / 2;
+ session->nr_oos_count_max = 4;
+
+ /* Use NR of first received packet */
+ session->reorder_skip = 1;
+
+ sprintf(&session->name[0], "sess %u/%u",
+ tunnel->tunnel_id, session->session_id);
+
+ skb_queue_head_init(&session->reorder_q);
+
+ INIT_HLIST_NODE(&session->hlist);
+ INIT_HLIST_NODE(&session->global_hlist);
+
+ /* Inherit debug options from tunnel */
+ session->debug = tunnel->debug;
+
+ if (cfg) {
+ session->pwtype = cfg->pw_type;
+ session->debug = cfg->debug;
+ session->mtu = cfg->mtu;
+ session->mru = cfg->mru;
+ session->send_seq = cfg->send_seq;
+ session->recv_seq = cfg->recv_seq;
+ session->lns_mode = cfg->lns_mode;
+ session->reorder_timeout = cfg->reorder_timeout;
+ session->offset = cfg->offset;
+ session->l2specific_type = cfg->l2specific_type;
+ session->l2specific_len = cfg->l2specific_len;
+ session->cookie_len = cfg->cookie_len;
+ memcpy(&session->cookie[0], &cfg->cookie[0], cfg->cookie_len);
+ session->peer_cookie_len = cfg->peer_cookie_len;
+ memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len);
+ }
+
+ if (tunnel->version == L2TP_HDR_VER_2)
+ session->build_header = l2tp_build_l2tpv2_header;
+ else
+ session->build_header = l2tp_build_l2tpv3_header;
+
+ l2tp_session_set_header_len(session, tunnel->version);
+
+ /* Bump the reference count. The session context is deleted
+ * only when this drops to zero.
+ */
+ l2tp_session_inc_refcount(session);
+ l2tp_tunnel_inc_refcount(tunnel);
+
+ /* Ensure tunnel socket isn't deleted */
+ sock_hold(tunnel->sock);
+
+ /* Add session to the tunnel's hash list */
+ write_lock_bh(&tunnel->hlist_lock);
+ hlist_add_head(&session->hlist,
+ l2tp_session_id_hash(tunnel, session_id));
+ write_unlock_bh(&tunnel->hlist_lock);
+
+ /* And to the global session list if L2TPv3 */
+ if (tunnel->version != L2TP_HDR_VER_2) {
+ struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+
+ spin_lock_bh(&pn->l2tp_session_hlist_lock);
+ hlist_add_head_rcu(&session->global_hlist,
+ l2tp_session_id_hash_2(pn, session_id));
+ spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+ }
+
+ /* Ignore management session in session count value */
+ if (session->session_id != 0)
+ atomic_inc(&l2tp_session_count);
+ }
+
+ return session;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_create);
+
+/*****************************************************************************
+ * Init and cleanup
+ *****************************************************************************/
+
+static __net_init int l2tp_init_net(struct net *net)
+{
+ struct l2tp_net *pn = net_generic(net, l2tp_net_id);
+ int hash;
+
+ INIT_LIST_HEAD(&pn->l2tp_tunnel_list);
+ spin_lock_init(&pn->l2tp_tunnel_list_lock);
+
+ for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++)
+ INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]);
+
+ spin_lock_init(&pn->l2tp_session_hlist_lock);
+
+ return 0;
+}
+
+static __net_exit void l2tp_exit_net(struct net *net)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_tunnel *tunnel = NULL;
+
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
+ (void)l2tp_tunnel_delete(tunnel);
+ }
+ rcu_read_unlock_bh();
+}
+
+static struct pernet_operations l2tp_net_ops = {
+ .init = l2tp_init_net,
+ .exit = l2tp_exit_net,
+ .id = &l2tp_net_id,
+ .size = sizeof(struct l2tp_net),
+};
+
+static int __init l2tp_init(void)
+{
+ int rc = 0;
+
+ rc = register_pernet_device(&l2tp_net_ops);
+ if (rc)
+ goto out;
+
+ l2tp_wq = alloc_workqueue("l2tp", WQ_UNBOUND, 0);
+ if (!l2tp_wq) {
+ pr_err("alloc_workqueue failed\n");
+ unregister_pernet_device(&l2tp_net_ops);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ pr_info("L2TP core driver, %s\n", L2TP_DRV_VERSION);
+
+out:
+ return rc;
+}
+
+static void __exit l2tp_exit(void)
+{
+ unregister_pernet_device(&l2tp_net_ops);
+ if (l2tp_wq) {
+ destroy_workqueue(l2tp_wq);
+ l2tp_wq = NULL;
+ }
+}
+
+module_init(l2tp_init);
+module_exit(l2tp_exit);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP core");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(L2TP_DRV_VERSION);
+
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
new file mode 100644
index 000000000..68aa9ffd4
--- /dev/null
+++ b/net/l2tp/l2tp_core.h
@@ -0,0 +1,324 @@
+/*
+ * L2TP internal definitions.
+ *
+ * Copyright (c) 2008,2009 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _L2TP_CORE_H_
+#define _L2TP_CORE_H_
+
+/* Just some random numbers */
+#define L2TP_TUNNEL_MAGIC 0x42114DDA
+#define L2TP_SESSION_MAGIC 0x0C04EB7D
+
+/* Per tunnel, session hash table size */
+#define L2TP_HASH_BITS 4
+#define L2TP_HASH_SIZE (1 << L2TP_HASH_BITS)
+
+/* System-wide, session hash table size */
+#define L2TP_HASH_BITS_2 8
+#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2)
+
+/* Debug message categories for the DEBUG socket option */
+enum {
+ L2TP_MSG_DEBUG = (1 << 0), /* verbose debug (if
+ * compiled in) */
+ L2TP_MSG_CONTROL = (1 << 1), /* userspace - kernel
+ * interface */
+ L2TP_MSG_SEQ = (1 << 2), /* sequence numbers */
+ L2TP_MSG_DATA = (1 << 3), /* data packets */
+};
+
+struct sk_buff;
+
+struct l2tp_stats {
+ atomic_long_t tx_packets;
+ atomic_long_t tx_bytes;
+ atomic_long_t tx_errors;
+ atomic_long_t rx_packets;
+ atomic_long_t rx_bytes;
+ atomic_long_t rx_seq_discards;
+ atomic_long_t rx_oos_packets;
+ atomic_long_t rx_errors;
+ atomic_long_t rx_cookie_discards;
+};
+
+struct l2tp_tunnel;
+
+/* Describes a session. Contains information to determine incoming
+ * packets and transmit outgoing ones.
+ */
+struct l2tp_session_cfg {
+ enum l2tp_pwtype pw_type;
+ unsigned int data_seq:2; /* data sequencing level
+ * 0 => none, 1 => IP only,
+ * 2 => all
+ */
+ unsigned int recv_seq:1; /* expect receive packets with
+ * sequence numbers? */
+ unsigned int send_seq:1; /* send packets with sequence
+ * numbers? */
+ unsigned int lns_mode:1; /* behave as LNS? LAC enables
+ * sequence numbers under
+ * control of LNS. */
+ int debug; /* bitmask of debug message
+ * categories */
+ u16 vlan_id; /* VLAN pseudowire only */
+ u16 offset; /* offset to payload */
+ u16 l2specific_len; /* Layer 2 specific length */
+ u16 l2specific_type; /* Layer 2 specific type */
+ u8 cookie[8]; /* optional cookie */
+ int cookie_len; /* 0, 4 or 8 bytes */
+ u8 peer_cookie[8]; /* peer's cookie */
+ int peer_cookie_len; /* 0, 4 or 8 bytes */
+ int reorder_timeout; /* configured reorder timeout
+ * (in jiffies) */
+ int mtu;
+ int mru;
+ char *ifname;
+};
+
+struct l2tp_session {
+ int magic; /* should be
+ * L2TP_SESSION_MAGIC */
+
+ struct l2tp_tunnel *tunnel; /* back pointer to tunnel
+ * context */
+ u32 session_id;
+ u32 peer_session_id;
+ u8 cookie[8];
+ int cookie_len;
+ u8 peer_cookie[8];
+ int peer_cookie_len;
+ u16 offset; /* offset from end of L2TP header
+ to beginning of data */
+ u16 l2specific_len;
+ u16 l2specific_type;
+ u16 hdr_len;
+ u32 nr; /* session NR state (receive) */
+ u32 ns; /* session NR state (send) */
+ struct sk_buff_head reorder_q; /* receive reorder queue */
+ u32 nr_max; /* max NR. Depends on tunnel */
+ u32 nr_window_size; /* NR window size */
+ u32 nr_oos; /* NR of last OOS packet */
+ int nr_oos_count; /* For OOS recovery */
+ int nr_oos_count_max;
+ struct hlist_node hlist; /* Hash list node */
+ atomic_t ref_count;
+
+ char name[32]; /* for logging */
+ char ifname[IFNAMSIZ];
+ unsigned int data_seq:2; /* data sequencing level
+ * 0 => none, 1 => IP only,
+ * 2 => all
+ */
+ unsigned int recv_seq:1; /* expect receive packets with
+ * sequence numbers? */
+ unsigned int send_seq:1; /* send packets with sequence
+ * numbers? */
+ unsigned int lns_mode:1; /* behave as LNS? LAC enables
+ * sequence numbers under
+ * control of LNS. */
+ int debug; /* bitmask of debug message
+ * categories */
+ int reorder_timeout; /* configured reorder timeout
+ * (in jiffies) */
+ int reorder_skip; /* set if skip to next nr */
+ int mtu;
+ int mru;
+ enum l2tp_pwtype pwtype;
+ struct l2tp_stats stats;
+ struct hlist_node global_hlist; /* Global hash list node */
+
+ int (*build_header)(struct l2tp_session *session, void *buf);
+ void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
+ void (*session_close)(struct l2tp_session *session);
+ void (*ref)(struct l2tp_session *session);
+ void (*deref)(struct l2tp_session *session);
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+ void (*show)(struct seq_file *m, void *priv);
+#endif
+ uint8_t priv[0]; /* private data */
+};
+
+/* Describes the tunnel. It contains info to track all the associated
+ * sessions so incoming packets can be sorted out
+ */
+struct l2tp_tunnel_cfg {
+ int debug; /* bitmask of debug message
+ * categories */
+ enum l2tp_encap_type encap;
+
+ /* Used only for kernel-created sockets */
+ struct in_addr local_ip;
+ struct in_addr peer_ip;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr *local_ip6;
+ struct in6_addr *peer_ip6;
+#endif
+ u16 local_udp_port;
+ u16 peer_udp_port;
+ unsigned int use_udp_checksums:1,
+ udp6_zero_tx_checksums:1,
+ udp6_zero_rx_checksums:1;
+};
+
+struct l2tp_tunnel {
+ int magic; /* Should be L2TP_TUNNEL_MAGIC */
+ struct rcu_head rcu;
+ rwlock_t hlist_lock; /* protect session_hlist */
+ struct hlist_head session_hlist[L2TP_HASH_SIZE];
+ /* hashed list of sessions,
+ * hashed by id */
+ u32 tunnel_id;
+ u32 peer_tunnel_id;
+ int version; /* 2=>L2TPv2, 3=>L2TPv3 */
+
+ char name[20]; /* for logging */
+ int debug; /* bitmask of debug message
+ * categories */
+ enum l2tp_encap_type encap;
+ struct l2tp_stats stats;
+
+ struct list_head list; /* Keep a list of all tunnels */
+ struct net *l2tp_net; /* the net we belong to */
+
+ atomic_t ref_count;
+#ifdef CONFIG_DEBUG_FS
+ void (*show)(struct seq_file *m, void *arg);
+#endif
+ int (*recv_payload_hook)(struct sk_buff *skb);
+ void (*old_sk_destruct)(struct sock *);
+ struct sock *sock; /* Parent socket */
+ int fd; /* Parent fd, if tunnel socket
+ * was created by userspace */
+#if IS_ENABLED(CONFIG_IPV6)
+ bool v4mapped;
+#endif
+
+ struct work_struct del_work;
+
+ uint8_t priv[0]; /* private data */
+};
+
+struct l2tp_nl_cmd_ops {
+ int (*session_create)(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
+ int (*session_delete)(struct l2tp_session *session);
+};
+
+static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel)
+{
+ return &tunnel->priv[0];
+}
+
+static inline void *l2tp_session_priv(struct l2tp_session *session)
+{
+ return &session->priv[0];
+}
+
+static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel;
+
+ if (sk == NULL)
+ return NULL;
+
+ sock_hold(sk);
+ tunnel = (struct l2tp_tunnel *)(sk->sk_user_data);
+ if (tunnel == NULL) {
+ sock_put(sk);
+ goto out;
+ }
+
+ BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+
+out:
+ return tunnel;
+}
+
+struct l2tp_session *l2tp_session_find(struct net *net,
+ struct l2tp_tunnel *tunnel,
+ u32 session_id);
+struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth);
+struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname);
+struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
+struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
+
+int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
+ u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
+ struct l2tp_tunnel **tunnelp);
+void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
+int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
+struct l2tp_session *l2tp_session_create(int priv_size,
+ struct l2tp_tunnel *tunnel,
+ u32 session_id, u32 peer_session_id,
+ struct l2tp_session_cfg *cfg);
+void __l2tp_session_unhash(struct l2tp_session *session);
+int l2tp_session_delete(struct l2tp_session *session);
+void l2tp_session_free(struct l2tp_session *session);
+void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
+ unsigned char *ptr, unsigned char *optr, u16 hdrflags,
+ int length, int (*payload_hook)(struct sk_buff *skb));
+int l2tp_session_queue_purge(struct l2tp_session *session);
+int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
+void l2tp_session_set_header_len(struct l2tp_session *session, int version);
+
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
+ int hdr_len);
+
+int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
+ const struct l2tp_nl_cmd_ops *ops);
+void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
+
+/* Session reference counts. Incremented when code obtains a reference
+ * to a session.
+ */
+static inline void l2tp_session_inc_refcount_1(struct l2tp_session *session)
+{
+ atomic_inc(&session->ref_count);
+}
+
+static inline void l2tp_session_dec_refcount_1(struct l2tp_session *session)
+{
+ if (atomic_dec_and_test(&session->ref_count))
+ l2tp_session_free(session);
+}
+
+#ifdef L2TP_REFCNT_DEBUG
+#define l2tp_session_inc_refcount(_s) \
+do { \
+ pr_debug("l2tp_session_inc_refcount: %s:%d %s: cnt=%d\n", \
+ __func__, __LINE__, (_s)->name, \
+ atomic_read(&_s->ref_count)); \
+ l2tp_session_inc_refcount_1(_s); \
+} while (0)
+#define l2tp_session_dec_refcount(_s) \
+do { \
+ pr_debug("l2tp_session_dec_refcount: %s:%d %s: cnt=%d\n", \
+ __func__, __LINE__, (_s)->name, \
+ atomic_read(&_s->ref_count)); \
+ l2tp_session_dec_refcount_1(_s); \
+} while (0)
+#else
+#define l2tp_session_inc_refcount(s) l2tp_session_inc_refcount_1(s)
+#define l2tp_session_dec_refcount(s) l2tp_session_dec_refcount_1(s)
+#endif
+
+#define l2tp_printk(ptr, type, func, fmt, ...) \
+do { \
+ if (((ptr)->debug) & (type)) \
+ func(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define l2tp_warn(ptr, type, fmt, ...) \
+ l2tp_printk(ptr, type, pr_warn, fmt, ##__VA_ARGS__)
+#define l2tp_info(ptr, type, fmt, ...) \
+ l2tp_printk(ptr, type, pr_info, fmt, ##__VA_ARGS__)
+#define l2tp_dbg(ptr, type, fmt, ...) \
+ l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__)
+
+#endif /* _L2TP_CORE_H_ */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
new file mode 100644
index 000000000..2d6760a2a
--- /dev/null
+++ b/net/l2tp/l2tp_debugfs.c
@@ -0,0 +1,352 @@
+/*
+ * L2TP subsystem debugfs
+ *
+ * Copyright (c) 2010 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/hash.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "l2tp_core.h"
+
+static struct dentry *rootdir;
+static struct dentry *tunnels;
+
+struct l2tp_dfs_seq_data {
+ struct net *net;
+ int tunnel_idx; /* current tunnel */
+ int session_idx; /* index of session within current tunnel */
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session; /* NULL means get next tunnel */
+};
+
+static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
+{
+ pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx);
+ pd->tunnel_idx++;
+}
+
+static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
+{
+ pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+ pd->session_idx++;
+
+ if (pd->session == NULL) {
+ pd->session_idx = 0;
+ l2tp_dfs_next_tunnel(pd);
+ }
+
+}
+
+static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs)
+{
+ struct l2tp_dfs_seq_data *pd = SEQ_START_TOKEN;
+ loff_t pos = *offs;
+
+ if (!pos)
+ goto out;
+
+ BUG_ON(m->private == NULL);
+ pd = m->private;
+
+ if (pd->tunnel == NULL)
+ l2tp_dfs_next_tunnel(pd);
+ else
+ l2tp_dfs_next_session(pd);
+
+ /* NULL tunnel and session indicates end of list */
+ if ((pd->tunnel == NULL) && (pd->session == NULL))
+ pd = NULL;
+
+out:
+ return pd;
+}
+
+
+static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return NULL;
+}
+
+static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
+{
+ /* nothing to do */
+}
+
+static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
+{
+ struct l2tp_tunnel *tunnel = v;
+ int session_count = 0;
+ int hash;
+ struct hlist_node *walk;
+ struct hlist_node *tmp;
+
+ read_lock_bh(&tunnel->hlist_lock);
+ for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
+ hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
+ struct l2tp_session *session;
+
+ session = hlist_entry(walk, struct l2tp_session, hlist);
+ if (session->session_id == 0)
+ continue;
+
+ session_count++;
+ }
+ }
+ read_unlock_bh(&tunnel->hlist_lock);
+
+ seq_printf(m, "\nTUNNEL %u peer %u", tunnel->tunnel_id, tunnel->peer_tunnel_id);
+ if (tunnel->sock) {
+ struct inet_sock *inet = inet_sk(tunnel->sock);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tunnel->sock->sk_family == AF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
+
+ seq_printf(m, " from %pI6c to %pI6c\n",
+ &np->saddr, &tunnel->sock->sk_v6_daddr);
+ } else
+#endif
+ seq_printf(m, " from %pI4 to %pI4\n",
+ &inet->inet_saddr, &inet->inet_daddr);
+ if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ seq_printf(m, " source port %hu, dest port %hu\n",
+ ntohs(inet->inet_sport), ntohs(inet->inet_dport));
+ }
+ seq_printf(m, " L2TPv%d, %s\n", tunnel->version,
+ tunnel->encap == L2TP_ENCAPTYPE_UDP ? "UDP" :
+ tunnel->encap == L2TP_ENCAPTYPE_IP ? "IP" :
+ "");
+ seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count,
+ tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0,
+ atomic_read(&tunnel->ref_count));
+
+ seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n",
+ tunnel->debug,
+ atomic_long_read(&tunnel->stats.tx_packets),
+ atomic_long_read(&tunnel->stats.tx_bytes),
+ atomic_long_read(&tunnel->stats.tx_errors),
+ atomic_long_read(&tunnel->stats.rx_packets),
+ atomic_long_read(&tunnel->stats.rx_bytes),
+ atomic_long_read(&tunnel->stats.rx_errors));
+
+ if (tunnel->show != NULL)
+ tunnel->show(m, tunnel);
+}
+
+static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
+{
+ struct l2tp_session *session = v;
+
+ seq_printf(m, " SESSION %u, peer %u, %s\n", session->session_id,
+ session->peer_session_id,
+ session->pwtype == L2TP_PWTYPE_ETH ? "ETH" :
+ session->pwtype == L2TP_PWTYPE_PPP ? "PPP" :
+ "");
+ if (session->send_seq || session->recv_seq)
+ seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
+ seq_printf(m, " refcnt %d\n", atomic_read(&session->ref_count));
+ seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n",
+ session->mtu, session->mru,
+ session->recv_seq ? 'R' : '-',
+ session->send_seq ? 'S' : '-',
+ session->data_seq == 1 ? "IPSEQ" :
+ session->data_seq == 2 ? "DATASEQ" : "-",
+ session->lns_mode ? "LNS" : "LAC",
+ session->debug,
+ jiffies_to_msecs(session->reorder_timeout));
+ seq_printf(m, " offset %hu l2specific %hu/%hu\n",
+ session->offset, session->l2specific_type, session->l2specific_len);
+ if (session->cookie_len) {
+ seq_printf(m, " cookie %02x%02x%02x%02x",
+ session->cookie[0], session->cookie[1],
+ session->cookie[2], session->cookie[3]);
+ if (session->cookie_len == 8)
+ seq_printf(m, "%02x%02x%02x%02x",
+ session->cookie[4], session->cookie[5],
+ session->cookie[6], session->cookie[7]);
+ seq_printf(m, "\n");
+ }
+ if (session->peer_cookie_len) {
+ seq_printf(m, " peer cookie %02x%02x%02x%02x",
+ session->peer_cookie[0], session->peer_cookie[1],
+ session->peer_cookie[2], session->peer_cookie[3]);
+ if (session->peer_cookie_len == 8)
+ seq_printf(m, "%02x%02x%02x%02x",
+ session->peer_cookie[4], session->peer_cookie[5],
+ session->peer_cookie[6], session->peer_cookie[7]);
+ seq_printf(m, "\n");
+ }
+
+ seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
+ session->nr, session->ns,
+ atomic_long_read(&session->stats.tx_packets),
+ atomic_long_read(&session->stats.tx_bytes),
+ atomic_long_read(&session->stats.tx_errors),
+ atomic_long_read(&session->stats.rx_packets),
+ atomic_long_read(&session->stats.rx_bytes),
+ atomic_long_read(&session->stats.rx_errors));
+
+ if (session->show != NULL)
+ session->show(m, session);
+}
+
+static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
+{
+ struct l2tp_dfs_seq_data *pd = v;
+
+ /* display header on line 1 */
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(m, "TUNNEL ID, peer ID from IP to IP\n");
+ seq_puts(m, " L2TPv2/L2TPv3, UDP/IP\n");
+ seq_puts(m, " sessions session-count, refcnt refcnt/sk->refcnt\n");
+ seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+ seq_puts(m, " SESSION ID, peer ID, PWTYPE\n");
+ seq_puts(m, " refcnt cnt\n");
+ seq_puts(m, " offset OFFSET l2specific TYPE/LEN\n");
+ seq_puts(m, " [ cookie ]\n");
+ seq_puts(m, " [ peer cookie ]\n");
+ seq_puts(m, " config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
+ seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+ goto out;
+ }
+
+ /* Show the tunnel or session context */
+ if (pd->session == NULL)
+ l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
+ else
+ l2tp_dfs_seq_session_show(m, pd->session);
+
+out:
+ return 0;
+}
+
+static const struct seq_operations l2tp_dfs_seq_ops = {
+ .start = l2tp_dfs_seq_start,
+ .next = l2tp_dfs_seq_next,
+ .stop = l2tp_dfs_seq_stop,
+ .show = l2tp_dfs_seq_show,
+};
+
+static int l2tp_dfs_seq_open(struct inode *inode, struct file *file)
+{
+ struct l2tp_dfs_seq_data *pd;
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (pd == NULL)
+ goto out;
+
+ /* Derive the network namespace from the pid opening the
+ * file.
+ */
+ pd->net = get_net_ns_by_pid(current->pid);
+ if (IS_ERR(pd->net)) {
+ rc = PTR_ERR(pd->net);
+ goto err_free_pd;
+ }
+
+ rc = seq_open(file, &l2tp_dfs_seq_ops);
+ if (rc)
+ goto err_free_net;
+
+ seq = file->private_data;
+ seq->private = pd;
+
+out:
+ return rc;
+
+err_free_net:
+ put_net(pd->net);
+err_free_pd:
+ kfree(pd);
+ goto out;
+}
+
+static int l2tp_dfs_seq_release(struct inode *inode, struct file *file)
+{
+ struct l2tp_dfs_seq_data *pd;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ pd = seq->private;
+ if (pd->net)
+ put_net(pd->net);
+ kfree(pd);
+ seq_release(inode, file);
+
+ return 0;
+}
+
+static const struct file_operations l2tp_dfs_fops = {
+ .owner = THIS_MODULE,
+ .open = l2tp_dfs_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = l2tp_dfs_seq_release,
+};
+
+static int __init l2tp_debugfs_init(void)
+{
+ int rc = 0;
+
+ rootdir = debugfs_create_dir("l2tp", NULL);
+ if (IS_ERR(rootdir)) {
+ rc = PTR_ERR(rootdir);
+ rootdir = NULL;
+ goto out;
+ }
+
+ tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
+ if (tunnels == NULL)
+ rc = -EIO;
+
+ pr_info("L2TP debugfs support\n");
+
+out:
+ if (rc)
+ pr_warn("unable to init\n");
+
+ return rc;
+}
+
+static void __exit l2tp_debugfs_exit(void)
+{
+ debugfs_remove(tunnels);
+ debugfs_remove(rootdir);
+}
+
+module_init(l2tp_debugfs_init);
+module_exit(l2tp_debugfs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP debugfs driver");
+MODULE_VERSION("1.0");
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
new file mode 100644
index 000000000..4b552873b
--- /dev/null
+++ b/net/l2tp/l2tp_eth.c
@@ -0,0 +1,360 @@
+/*
+ * L2TPv3 ethernet pseudowire driver
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/hash.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "l2tp_core.h"
+
+/* Default device name. May be overridden by name specified by user */
+#define L2TP_ETH_DEV_NAME "l2tpeth%d"
+
+/* via netdev_priv() */
+struct l2tp_eth {
+ struct net_device *dev;
+ struct sock *tunnel_sock;
+ struct l2tp_session *session;
+ struct list_head list;
+ atomic_long_t tx_bytes;
+ atomic_long_t tx_packets;
+ atomic_long_t tx_dropped;
+ atomic_long_t rx_bytes;
+ atomic_long_t rx_packets;
+ atomic_long_t rx_errors;
+};
+
+/* via l2tp_session_priv() */
+struct l2tp_eth_sess {
+ struct net_device *dev;
+};
+
+/* per-net private data for this module */
+static unsigned int l2tp_eth_net_id;
+struct l2tp_eth_net {
+ struct list_head l2tp_eth_dev_list;
+ spinlock_t l2tp_eth_lock;
+};
+
+static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
+{
+ return net_generic(net, l2tp_eth_net_id);
+}
+
+static struct lock_class_key l2tp_eth_tx_busylock;
+static int l2tp_eth_dev_init(struct net_device *dev)
+{
+ struct l2tp_eth *priv = netdev_priv(dev);
+
+ priv->dev = dev;
+ eth_hw_addr_random(dev);
+ eth_broadcast_addr(dev->broadcast);
+ dev->qdisc_tx_busylock = &l2tp_eth_tx_busylock;
+ return 0;
+}
+
+static void l2tp_eth_dev_uninit(struct net_device *dev)
+{
+ struct l2tp_eth *priv = netdev_priv(dev);
+ struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev));
+
+ spin_lock(&pn->l2tp_eth_lock);
+ list_del_init(&priv->list);
+ spin_unlock(&pn->l2tp_eth_lock);
+ dev_put(dev);
+}
+
+static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct l2tp_eth *priv = netdev_priv(dev);
+ struct l2tp_session *session = priv->session;
+ unsigned int len = skb->len;
+ int ret = l2tp_xmit_skb(session, skb, session->hdr_len);
+
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ atomic_long_add(len, &priv->tx_bytes);
+ atomic_long_inc(&priv->tx_packets);
+ } else {
+ atomic_long_inc(&priv->tx_dropped);
+ }
+ return NETDEV_TX_OK;
+}
+
+static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct l2tp_eth *priv = netdev_priv(dev);
+
+ stats->tx_bytes = atomic_long_read(&priv->tx_bytes);
+ stats->tx_packets = atomic_long_read(&priv->tx_packets);
+ stats->tx_dropped = atomic_long_read(&priv->tx_dropped);
+ stats->rx_bytes = atomic_long_read(&priv->rx_bytes);
+ stats->rx_packets = atomic_long_read(&priv->rx_packets);
+ stats->rx_errors = atomic_long_read(&priv->rx_errors);
+ return stats;
+}
+
+
+static struct net_device_ops l2tp_eth_netdev_ops = {
+ .ndo_init = l2tp_eth_dev_init,
+ .ndo_uninit = l2tp_eth_dev_uninit,
+ .ndo_start_xmit = l2tp_eth_dev_xmit,
+ .ndo_get_stats64 = l2tp_eth_get_stats64,
+ .ndo_set_mac_address = eth_mac_addr,
+};
+
+static void l2tp_eth_dev_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->features |= NETIF_F_LLTX;
+ dev->netdev_ops = &l2tp_eth_netdev_ops;
+ dev->destructor = free_netdev;
+}
+
+static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
+{
+ struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
+ struct net_device *dev = spriv->dev;
+ struct l2tp_eth *priv = netdev_priv(dev);
+
+ if (session->debug & L2TP_MSG_DATA) {
+ unsigned int length;
+
+ length = min(32u, skb->len);
+ if (!pskb_may_pull(skb, length))
+ goto error;
+
+ pr_debug("%s: eth recv\n", session->name);
+ print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
+ }
+
+ if (!pskb_may_pull(skb, ETH_HLEN))
+ goto error;
+
+ secpath_reset(skb);
+
+ /* checksums verified by L2TP */
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_dst_drop(skb);
+ nf_reset(skb);
+
+ if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
+ atomic_long_inc(&priv->rx_packets);
+ atomic_long_add(data_len, &priv->rx_bytes);
+ } else {
+ atomic_long_inc(&priv->rx_errors);
+ }
+ return;
+
+error:
+ atomic_long_inc(&priv->rx_errors);
+ kfree_skb(skb);
+}
+
+static void l2tp_eth_delete(struct l2tp_session *session)
+{
+ struct l2tp_eth_sess *spriv;
+ struct net_device *dev;
+
+ if (session) {
+ spriv = l2tp_session_priv(session);
+ dev = spriv->dev;
+ if (dev) {
+ unregister_netdev(dev);
+ spriv->dev = NULL;
+ module_put(THIS_MODULE);
+ }
+ }
+}
+
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+static void l2tp_eth_show(struct seq_file *m, void *arg)
+{
+ struct l2tp_session *session = arg;
+ struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
+ struct net_device *dev = spriv->dev;
+
+ seq_printf(m, " interface %s\n", dev->name);
+}
+#endif
+
+static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session;
+ struct l2tp_eth *priv;
+ struct l2tp_eth_sess *spriv;
+ int rc;
+ struct l2tp_eth_net *pn;
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (!tunnel) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ session = l2tp_session_find(net, tunnel, session_id);
+ if (session) {
+ rc = -EEXIST;
+ goto out;
+ }
+
+ if (cfg->ifname) {
+ dev = dev_get_by_name(net, cfg->ifname);
+ if (dev) {
+ dev_put(dev);
+ rc = -EEXIST;
+ goto out;
+ }
+ strlcpy(name, cfg->ifname, IFNAMSIZ);
+ } else
+ strcpy(name, L2TP_ETH_DEV_NAME);
+
+ session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
+ peer_session_id, cfg);
+ if (!session) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN,
+ l2tp_eth_dev_setup);
+ if (!dev) {
+ rc = -ENOMEM;
+ goto out_del_session;
+ }
+
+ dev_net_set(dev, net);
+ if (session->mtu == 0)
+ session->mtu = dev->mtu - session->hdr_len;
+ dev->mtu = session->mtu;
+ dev->needed_headroom += session->hdr_len;
+
+ priv = netdev_priv(dev);
+ priv->dev = dev;
+ priv->session = session;
+ INIT_LIST_HEAD(&priv->list);
+
+ priv->tunnel_sock = tunnel->sock;
+ session->recv_skb = l2tp_eth_dev_recv;
+ session->session_close = l2tp_eth_delete;
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+ session->show = l2tp_eth_show;
+#endif
+
+ spriv = l2tp_session_priv(session);
+ spriv->dev = dev;
+
+ rc = register_netdev(dev);
+ if (rc < 0)
+ goto out_del_dev;
+
+ __module_get(THIS_MODULE);
+ /* Must be done after register_netdev() */
+ strlcpy(session->ifname, dev->name, IFNAMSIZ);
+
+ dev_hold(dev);
+ pn = l2tp_eth_pernet(dev_net(dev));
+ spin_lock(&pn->l2tp_eth_lock);
+ list_add(&priv->list, &pn->l2tp_eth_dev_list);
+ spin_unlock(&pn->l2tp_eth_lock);
+
+ return 0;
+
+out_del_dev:
+ free_netdev(dev);
+ spriv->dev = NULL;
+out_del_session:
+ l2tp_session_delete(session);
+out:
+ return rc;
+}
+
+static __net_init int l2tp_eth_init_net(struct net *net)
+{
+ struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id);
+
+ INIT_LIST_HEAD(&pn->l2tp_eth_dev_list);
+ spin_lock_init(&pn->l2tp_eth_lock);
+
+ return 0;
+}
+
+static struct pernet_operations l2tp_eth_net_ops = {
+ .init = l2tp_eth_init_net,
+ .id = &l2tp_eth_net_id,
+ .size = sizeof(struct l2tp_eth_net),
+};
+
+
+static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = {
+ .session_create = l2tp_eth_create,
+ .session_delete = l2tp_session_delete,
+};
+
+
+static int __init l2tp_eth_init(void)
+{
+ int err = 0;
+
+ err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops);
+ if (err)
+ goto out;
+
+ err = register_pernet_device(&l2tp_eth_net_ops);
+ if (err)
+ goto out_unreg;
+
+ pr_info("L2TP ethernet pseudowire support (L2TPv3)\n");
+
+ return 0;
+
+out_unreg:
+ l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
+out:
+ return err;
+}
+
+static void __exit l2tp_eth_exit(void)
+{
+ unregister_pernet_device(&l2tp_eth_net_ops);
+ l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
+}
+
+module_init(l2tp_eth_init);
+module_exit(l2tp_eth_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP ethernet pseudowire driver");
+MODULE_VERSION("1.0");
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
new file mode 100644
index 000000000..79649937e
--- /dev/null
+++ b/net/l2tp/l2tp_ip.c
@@ -0,0 +1,657 @@
+/*
+ * L2TPv3 IP encapsulation support
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/socket.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+#include "l2tp_core.h"
+
+struct l2tp_ip_sock {
+ /* inet_sock has to be the first member of l2tp_ip_sock */
+ struct inet_sock inet;
+
+ u32 conn_id;
+ u32 peer_conn_id;
+};
+
+static DEFINE_RWLOCK(l2tp_ip_lock);
+static struct hlist_head l2tp_ip_table;
+static struct hlist_head l2tp_ip_bind_table;
+
+static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
+{
+ return (struct l2tp_ip_sock *)sk;
+}
+
+static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+{
+ struct sock *sk;
+
+ sk_for_each_bound(sk, &l2tp_ip_bind_table) {
+ struct inet_sock *inet = inet_sk(sk);
+ struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+
+ if (l2tp == NULL)
+ continue;
+
+ if ((l2tp->conn_id == tunnel_id) &&
+ net_eq(sock_net(sk), net) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ goto found;
+ }
+
+ sk = NULL;
+found:
+ return sk;
+}
+
+static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+{
+ struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id);
+ if (sk)
+ sock_hold(sk);
+
+ return sk;
+}
+
+/* When processing receive frames, there are two cases to
+ * consider. Data frames consist of a non-zero session-id and an
+ * optional cookie. Control frames consist of a regular L2TP header
+ * preceded by 32-bits of zeros.
+ *
+ * L2TPv3 Session Header Over IP
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Session ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Cookie (optional, maximum 64 bits)...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 Control Message Header Over IP
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | (32 bits of zeros) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Control Connection ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Ns | Nr |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * All control frames are passed to userspace.
+ */
+static int l2tp_ip_recv(struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb->dev);
+ struct sock *sk;
+ u32 session_id;
+ u32 tunnel_id;
+ unsigned char *ptr, *optr;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel = NULL;
+ int length;
+
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
+
+ if (!pskb_may_pull(skb, 4))
+ goto discard;
+
+ session_id = ntohl(*((__be32 *) ptr));
+ ptr += 4;
+
+ /* RFC3931: L2TP/IP packets have the first 4 bytes containing
+ * the session_id. If it is 0, the packet is a L2TP control
+ * frame and the session_id value can be discarded.
+ */
+ if (session_id == 0) {
+ __skb_pull(skb, 4);
+ goto pass_up;
+ }
+
+ /* Ok, this is a data packet. Lookup the session. */
+ session = l2tp_session_find(net, NULL, session_id);
+ if (session == NULL)
+ goto discard;
+
+ tunnel = session->tunnel;
+ if (tunnel == NULL)
+ goto discard;
+
+ /* Trace packet contents, if enabled */
+ if (tunnel->debug & L2TP_MSG_DATA) {
+ length = min(32u, skb->len);
+ if (!pskb_may_pull(skb, length))
+ goto discard;
+
+ pr_debug("%s: ip recv\n", tunnel->name);
+ print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
+ }
+
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+
+ return 0;
+
+pass_up:
+ /* Get the tunnel_id from the L2TP header */
+ if (!pskb_may_pull(skb, 12))
+ goto discard;
+
+ if ((skb->data[0] & 0xc0) != 0xc0)
+ goto discard;
+
+ tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel != NULL)
+ sk = tunnel->sock;
+ else {
+ struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
+
+ read_lock_bh(&l2tp_ip_lock);
+ sk = __l2tp_ip_bind_lookup(net, iph->daddr, 0, tunnel_id);
+ read_unlock_bh(&l2tp_ip_lock);
+ }
+
+ if (sk == NULL)
+ goto discard;
+
+ sock_hold(sk);
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_put;
+
+ nf_reset(skb);
+
+ return sk_receive_skb(sk, skb, 1);
+
+discard_put:
+ sock_put(sk);
+
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int l2tp_ip_open(struct sock *sk)
+{
+ /* Prevent autobind. We don't have ports. */
+ inet_sk(sk)->inet_num = IPPROTO_L2TP;
+
+ write_lock_bh(&l2tp_ip_lock);
+ sk_add_node(sk, &l2tp_ip_table);
+ write_unlock_bh(&l2tp_ip_lock);
+
+ return 0;
+}
+
+static void l2tp_ip_close(struct sock *sk, long timeout)
+{
+ write_lock_bh(&l2tp_ip_lock);
+ hlist_del_init(&sk->sk_bind_node);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l2tp_ip_lock);
+ sk_common_release(sk);
+}
+
+static void l2tp_ip_destroy_sock(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+
+ while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ kfree_skb(skb);
+
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr;
+ struct net *net = sock_net(sk);
+ int ret;
+ int chk_addr_ret;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_l2tpip))
+ return -EINVAL;
+ if (addr->l2tp_family != AF_INET)
+ return -EINVAL;
+
+ ret = -EADDRINUSE;
+ read_lock_bh(&l2tp_ip_lock);
+ if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr,
+ sk->sk_bound_dev_if, addr->l2tp_conn_id))
+ goto out_in_use;
+
+ read_unlock_bh(&l2tp_ip_lock);
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+ goto out;
+
+ chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
+ ret = -EADDRNOTAVAIL;
+ if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ if (addr->l2tp_addr.s_addr)
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+ sk_dst_reset(sk);
+
+ l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
+
+ write_lock_bh(&l2tp_ip_lock);
+ sk_add_bind_node(sk, &l2tp_ip_bind_table);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l2tp_ip_lock);
+ ret = 0;
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+out:
+ release_sock(sk);
+
+ return ret;
+
+out_in_use:
+ read_unlock_bh(&l2tp_ip_lock);
+
+ return ret;
+}
+
+static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
+ int rc;
+
+ if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+ return -EINVAL;
+
+ if (addr_len < sizeof(*lsa))
+ return -EINVAL;
+
+ if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
+ return -EINVAL;
+
+ rc = ip4_datagram_connect(sk, uaddr, addr_len);
+ if (rc < 0)
+ return rc;
+
+ lock_sock(sk);
+
+ l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
+
+ write_lock_bh(&l2tp_ip_lock);
+ hlist_del_init(&sk->sk_bind_node);
+ sk_add_bind_node(sk, &l2tp_ip_bind_table);
+ write_unlock_bh(&l2tp_ip_lock);
+
+ release_sock(sk);
+ return rc;
+}
+
+static int l2tp_ip_disconnect(struct sock *sk, int flags)
+{
+ if (sock_flag(sk, SOCK_ZAPPED))
+ return 0;
+
+ return udp_disconnect(sk, flags);
+}
+
+static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk);
+ struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr;
+
+ memset(lsa, 0, sizeof(*lsa));
+ lsa->l2tp_family = AF_INET;
+ if (peer) {
+ if (!inet->inet_dport)
+ return -ENOTCONN;
+ lsa->l2tp_conn_id = lsk->peer_conn_id;
+ lsa->l2tp_addr.s_addr = inet->inet_daddr;
+ } else {
+ __be32 addr = inet->inet_rcv_saddr;
+ if (!addr)
+ addr = inet->inet_saddr;
+ lsa->l2tp_conn_id = lsk->conn_id;
+ lsa->l2tp_addr.s_addr = addr;
+ }
+ *uaddr_len = sizeof(*lsa);
+ return 0;
+}
+
+static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ /* Charge it to the socket, dropping if the queue is full. */
+ rc = sock_queue_rcv_skb(sk, skb);
+ if (rc < 0)
+ goto drop;
+
+ return 0;
+
+drop:
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
+ kfree_skb(skb);
+ return -1;
+}
+
+/* Userspace will call sendmsg() on the tunnel socket to send L2TP
+ * control frames.
+ */
+static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct sk_buff *skb;
+ int rc;
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = NULL;
+ struct flowi4 *fl4;
+ int connected = 0;
+ __be32 daddr;
+
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+ if (sock_flag(sk, SOCK_DEAD))
+ goto out;
+
+ /* Get and verify the address. */
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name);
+ rc = -EINVAL;
+ if (msg->msg_namelen < sizeof(*lip))
+ goto out;
+
+ if (lip->l2tp_family != AF_INET) {
+ rc = -EAFNOSUPPORT;
+ if (lip->l2tp_family != AF_UNSPEC)
+ goto out;
+ }
+
+ daddr = lip->l2tp_addr.s_addr;
+ } else {
+ rc = -EDESTADDRREQ;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ daddr = inet->inet_daddr;
+ connected = 1;
+ }
+
+ /* Allocate a socket buffer */
+ rc = -ENOMEM;
+ skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) +
+ 4 + len, 0, GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ /* Reserve space for headers, putting IP header on 4-byte boundary. */
+ skb_reserve(skb, 2 + NET_SKB_PAD);
+ skb_reset_network_header(skb);
+ skb_reserve(skb, sizeof(struct iphdr));
+ skb_reset_transport_header(skb);
+
+ /* Insert 0 session_id */
+ *((__be32 *) skb_put(skb, 4)) = 0;
+
+ /* Copy user data into skb */
+ rc = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (rc < 0) {
+ kfree_skb(skb);
+ goto error;
+ }
+
+ fl4 = &inet->cork.fl.u.ip4;
+ if (connected)
+ rt = (struct rtable *) __sk_dst_check(sk, 0);
+
+ rcu_read_lock();
+ if (rt == NULL) {
+ const struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference(inet->inet_opt);
+
+ /* Use correct destination address if we have options. */
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times
+ * itself out.
+ */
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (IS_ERR(rt))
+ goto no_route;
+ if (connected) {
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ skb_dst_set(skb, &rt->dst);
+ goto xmit;
+ }
+ }
+
+ /* We dont need to clone dst here, it is guaranteed to not disappear.
+ * __dev_xmit_skb() might force a refcount if needed.
+ */
+ skb_dst_set_noref(skb, &rt->dst);
+
+xmit:
+ /* Queue the packet to IP for output */
+ rc = ip_queue_xmit(sk, skb, &inet->cork.fl);
+ rcu_read_unlock();
+
+error:
+ if (rc >= 0)
+ rc = len;
+
+out:
+ release_sock(sk);
+ return rc;
+
+no_route:
+ rcu_read_unlock();
+ IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb(skb);
+ rc = -EHOSTUNREACH;
+ goto out;
+}
+
+static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int noblock, int flags, int *addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sk_buff *skb;
+
+ if (flags & MSG_OOB)
+ goto out;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address. */
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *addr_len = sizeof(*sin);
+ }
+ if (inet->cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ return err ? err : copied;
+}
+
+static struct proto l2tp_ip_prot = {
+ .name = "L2TP/IP",
+ .owner = THIS_MODULE,
+ .init = l2tp_ip_open,
+ .close = l2tp_ip_close,
+ .bind = l2tp_ip_bind,
+ .connect = l2tp_ip_connect,
+ .disconnect = l2tp_ip_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = l2tp_ip_destroy_sock,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .sendmsg = l2tp_ip_sendmsg,
+ .recvmsg = l2tp_ip_recvmsg,
+ .backlog_rcv = l2tp_ip_backlog_recv,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .obj_size = sizeof(struct l2tp_ip_sock),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+static const struct proto_ops l2tp_ip_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = l2tp_ip_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw l2tp_ip_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_L2TP,
+ .prot = &l2tp_ip_prot,
+ .ops = &l2tp_ip_ops,
+};
+
+static struct net_protocol l2tp_ip_protocol __read_mostly = {
+ .handler = l2tp_ip_recv,
+ .netns_ok = 1,
+};
+
+static int __init l2tp_ip_init(void)
+{
+ int err;
+
+ pr_info("L2TP IP encapsulation support (L2TPv3)\n");
+
+ err = proto_register(&l2tp_ip_prot, 1);
+ if (err != 0)
+ goto out;
+
+ err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
+ if (err)
+ goto out1;
+
+ inet_register_protosw(&l2tp_ip_protosw);
+ return 0;
+
+out1:
+ proto_unregister(&l2tp_ip_prot);
+out:
+ return err;
+}
+
+static void __exit l2tp_ip_exit(void)
+{
+ inet_unregister_protosw(&l2tp_ip_protosw);
+ inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
+ proto_unregister(&l2tp_ip_prot);
+}
+
+module_init(l2tp_ip_init);
+module_exit(l2tp_ip_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP over IP");
+MODULE_VERSION("1.0");
+
+/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
+ * enums
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
new file mode 100644
index 000000000..d1ded3777
--- /dev/null
+++ b/net/l2tp/l2tp_ip6.c
@@ -0,0 +1,803 @@
+/*
+ * L2TPv3 IP encapsulation support for IPv6
+ *
+ * Copyright (c) 2012 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/socket.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+#include <net/transp_v6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+
+#include "l2tp_core.h"
+
+struct l2tp_ip6_sock {
+ /* inet_sock has to be the first member of l2tp_ip6_sock */
+ struct inet_sock inet;
+
+ u32 conn_id;
+ u32 peer_conn_id;
+
+ /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see
+ inet6_sk_generic */
+ struct ipv6_pinfo inet6;
+};
+
+static DEFINE_RWLOCK(l2tp_ip6_lock);
+static struct hlist_head l2tp_ip6_table;
+static struct hlist_head l2tp_ip6_bind_table;
+
+static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
+{
+ return (struct l2tp_ip6_sock *)sk;
+}
+
+static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
+ struct in6_addr *laddr,
+ int dif, u32 tunnel_id)
+{
+ struct sock *sk;
+
+ sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
+ const struct in6_addr *addr = inet6_rcv_saddr(sk);
+ struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+
+ if (l2tp == NULL)
+ continue;
+
+ if ((l2tp->conn_id == tunnel_id) &&
+ net_eq(sock_net(sk), net) &&
+ !(addr && ipv6_addr_equal(addr, laddr)) &&
+ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ goto found;
+ }
+
+ sk = NULL;
+found:
+ return sk;
+}
+
+static inline struct sock *l2tp_ip6_bind_lookup(struct net *net,
+ struct in6_addr *laddr,
+ int dif, u32 tunnel_id)
+{
+ struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id);
+ if (sk)
+ sock_hold(sk);
+
+ return sk;
+}
+
+/* When processing receive frames, there are two cases to
+ * consider. Data frames consist of a non-zero session-id and an
+ * optional cookie. Control frames consist of a regular L2TP header
+ * preceded by 32-bits of zeros.
+ *
+ * L2TPv3 Session Header Over IP
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Session ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Cookie (optional, maximum 64 bits)...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 Control Message Header Over IP
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | (32 bits of zeros) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Control Connection ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Ns | Nr |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * All control frames are passed to userspace.
+ */
+static int l2tp_ip6_recv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ u32 session_id;
+ u32 tunnel_id;
+ unsigned char *ptr, *optr;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel = NULL;
+ int length;
+
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
+
+ if (!pskb_may_pull(skb, 4))
+ goto discard;
+
+ session_id = ntohl(*((__be32 *) ptr));
+ ptr += 4;
+
+ /* RFC3931: L2TP/IP packets have the first 4 bytes containing
+ * the session_id. If it is 0, the packet is a L2TP control
+ * frame and the session_id value can be discarded.
+ */
+ if (session_id == 0) {
+ __skb_pull(skb, 4);
+ goto pass_up;
+ }
+
+ /* Ok, this is a data packet. Lookup the session. */
+ session = l2tp_session_find(&init_net, NULL, session_id);
+ if (session == NULL)
+ goto discard;
+
+ tunnel = session->tunnel;
+ if (tunnel == NULL)
+ goto discard;
+
+ /* Trace packet contents, if enabled */
+ if (tunnel->debug & L2TP_MSG_DATA) {
+ length = min(32u, skb->len);
+ if (!pskb_may_pull(skb, length))
+ goto discard;
+
+ pr_debug("%s: ip recv\n", tunnel->name);
+ print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
+ }
+
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
+ tunnel->recv_payload_hook);
+ return 0;
+
+pass_up:
+ /* Get the tunnel_id from the L2TP header */
+ if (!pskb_may_pull(skb, 12))
+ goto discard;
+
+ if ((skb->data[0] & 0xc0) != 0xc0)
+ goto discard;
+
+ tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
+ tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+ if (tunnel != NULL)
+ sk = tunnel->sock;
+ else {
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ read_lock_bh(&l2tp_ip6_lock);
+ sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr,
+ 0, tunnel_id);
+ read_unlock_bh(&l2tp_ip6_lock);
+ }
+
+ if (sk == NULL)
+ goto discard;
+
+ sock_hold(sk);
+
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ goto discard_put;
+
+ nf_reset(skb);
+
+ return sk_receive_skb(sk, skb, 1);
+
+discard_put:
+ sock_put(sk);
+
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int l2tp_ip6_open(struct sock *sk)
+{
+ /* Prevent autobind. We don't have ports. */
+ inet_sk(sk)->inet_num = IPPROTO_L2TP;
+
+ write_lock_bh(&l2tp_ip6_lock);
+ sk_add_node(sk, &l2tp_ip6_table);
+ write_unlock_bh(&l2tp_ip6_lock);
+
+ return 0;
+}
+
+static void l2tp_ip6_close(struct sock *sk, long timeout)
+{
+ write_lock_bh(&l2tp_ip6_lock);
+ hlist_del_init(&sk->sk_bind_node);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l2tp_ip6_lock);
+
+ sk_common_release(sk);
+}
+
+static void l2tp_ip6_destroy_sock(struct sock *sk)
+{
+ struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+
+ lock_sock(sk);
+ ip6_flush_pending_frames(sk);
+ release_sock(sk);
+
+ if (tunnel) {
+ l2tp_tunnel_closeall(tunnel);
+ sock_put(sk);
+ }
+
+ inet6_destroy_sock(sk);
+}
+
+static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+ __be32 v4addr = 0;
+ int addr_type;
+ int err;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ return -EINVAL;
+ if (addr->l2tp_family != AF_INET6)
+ return -EINVAL;
+ if (addr_len < sizeof(*addr))
+ return -EINVAL;
+
+ addr_type = ipv6_addr_type(&addr->l2tp_addr);
+
+ /* l2tp_ip6 sockets are IPv6 only */
+ if (addr_type == IPV6_ADDR_MAPPED)
+ return -EADDRNOTAVAIL;
+
+ /* L2TP is point-point, not multicast */
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ return -EADDRNOTAVAIL;
+
+ err = -EADDRINUSE;
+ read_lock_bh(&l2tp_ip6_lock);
+ if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr,
+ sk->sk_bound_dev_if, addr->l2tp_conn_id))
+ goto out_in_use;
+ read_unlock_bh(&l2tp_ip6_lock);
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out_unlock;
+
+ /* Check if the address belongs to the host. */
+ rcu_read_lock();
+ if (addr_type != IPV6_ADDR_ANY) {
+ struct net_device *dev = NULL;
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ addr->l2tp_scope_id) {
+ /* Override any existing binding, if another
+ * one is supplied by user.
+ */
+ sk->sk_bound_dev_if = addr->l2tp_scope_id;
+ }
+
+ /* Binding to link-local address requires an
+ interface */
+ if (!sk->sk_bound_dev_if)
+ goto out_unlock_rcu;
+
+ err = -ENODEV;
+ dev = dev_get_by_index_rcu(sock_net(sk),
+ sk->sk_bound_dev_if);
+ if (!dev)
+ goto out_unlock_rcu;
+ }
+
+ /* ipv4 addr of the socket is invalid. Only the
+ * unspecified and mapped address have a v4 equivalent.
+ */
+ v4addr = LOOPBACK4_IPV6;
+ err = -EADDRNOTAVAIL;
+ if (!ipv6_chk_addr(sock_net(sk), &addr->l2tp_addr, dev, 0))
+ goto out_unlock_rcu;
+ }
+ rcu_read_unlock();
+
+ inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+ sk->sk_v6_rcv_saddr = addr->l2tp_addr;
+ np->saddr = addr->l2tp_addr;
+
+ l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
+
+ write_lock_bh(&l2tp_ip6_lock);
+ sk_add_bind_node(sk, &l2tp_ip6_bind_table);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l2tp_ip6_lock);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ release_sock(sk);
+ return 0;
+
+out_unlock_rcu:
+ rcu_read_unlock();
+out_unlock:
+ release_sock(sk);
+ return err;
+
+out_in_use:
+ read_unlock_bh(&l2tp_ip6_lock);
+ return err;
+}
+
+static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *) uaddr;
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct in6_addr *daddr;
+ int addr_type;
+ int rc;
+
+ if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+ return -EINVAL;
+
+ if (addr_len < sizeof(*lsa))
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EINVAL;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ return -EINVAL;
+
+ if (addr_type & IPV6_ADDR_MAPPED) {
+ daddr = &usin->sin6_addr;
+ if (ipv4_is_multicast(daddr->s6_addr32[3]))
+ return -EINVAL;
+ }
+
+ rc = ip6_datagram_connect(sk, uaddr, addr_len);
+
+ lock_sock(sk);
+
+ l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
+
+ write_lock_bh(&l2tp_ip6_lock);
+ hlist_del_init(&sk->sk_bind_node);
+ sk_add_bind_node(sk, &l2tp_ip6_bind_table);
+ write_unlock_bh(&l2tp_ip6_lock);
+
+ release_sock(sk);
+
+ return rc;
+}
+
+static int l2tp_ip6_disconnect(struct sock *sk, int flags)
+{
+ if (sock_flag(sk, SOCK_ZAPPED))
+ return 0;
+
+ return udp_disconnect(sk, flags);
+}
+
+static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr;
+ struct sock *sk = sock->sk;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct l2tp_ip6_sock *lsk = l2tp_ip6_sk(sk);
+
+ lsa->l2tp_family = AF_INET6;
+ lsa->l2tp_flowinfo = 0;
+ lsa->l2tp_scope_id = 0;
+ lsa->l2tp_unused = 0;
+ if (peer) {
+ if (!lsk->peer_conn_id)
+ return -ENOTCONN;
+ lsa->l2tp_conn_id = lsk->peer_conn_id;
+ lsa->l2tp_addr = sk->sk_v6_daddr;
+ if (np->sndflow)
+ lsa->l2tp_flowinfo = np->flow_label;
+ } else {
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ lsa->l2tp_addr = np->saddr;
+ else
+ lsa->l2tp_addr = sk->sk_v6_rcv_saddr;
+
+ lsa->l2tp_conn_id = lsk->conn_id;
+ }
+ if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
+ lsa->l2tp_scope_id = sk->sk_bound_dev_if;
+ *uaddr_len = sizeof(*lsa);
+ return 0;
+}
+
+static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ /* Charge it to the socket, dropping if the queue is full. */
+ rc = sock_queue_rcv_skb(sk, skb);
+ if (rc < 0)
+ goto drop;
+
+ return 0;
+
+drop:
+ IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+ kfree_skb(skb);
+ return -1;
+}
+
+static int l2tp_ip6_push_pending_frames(struct sock *sk)
+{
+ struct sk_buff *skb;
+ __be32 *transhdr = NULL;
+ int err = 0;
+
+ skb = skb_peek(&sk->sk_write_queue);
+ if (skb == NULL)
+ goto out;
+
+ transhdr = (__be32 *)skb_transport_header(skb);
+ *transhdr = 0;
+
+ err = ip6_push_pending_frames(sk);
+
+out:
+ return err;
+}
+
+/* Userspace will call sendmsg() on the tunnel socket to send L2TP
+ * control frames.
+ */
+static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct ipv6_txoptions opt_space;
+ DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
+ struct in6_addr *daddr, *final_p, final;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt = NULL;
+ struct ip6_flowlabel *flowlabel = NULL;
+ struct dst_entry *dst = NULL;
+ struct flowi6 fl6;
+ int addr_len = msg->msg_namelen;
+ int hlimit = -1;
+ int tclass = -1;
+ int dontfrag = -1;
+ int transhdrlen = 4; /* zero session-id */
+ int ulen = len + transhdrlen;
+ int err;
+
+ /* Rough check on arithmetic overflow,
+ better check is made in ip6_append_data().
+ */
+ if (len > INT_MAX)
+ return -EMSGSIZE;
+
+ /* Mirror BSD error message compatibility */
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /*
+ * Get and verify the address.
+ */
+ memset(&fl6, 0, sizeof(fl6));
+
+ fl6.flowi6_mark = sk->sk_mark;
+
+ if (lsa) {
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (lsa->l2tp_family && lsa->l2tp_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ daddr = &lsa->l2tp_addr;
+ if (np->sndflow) {
+ fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
+ if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Otherwise it will be difficult to maintain
+ * sk->sk_dst_cache.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
+ daddr = &sk->sk_v6_daddr;
+
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ lsa->l2tp_scope_id &&
+ ipv6_addr_type(daddr) & IPV6_ADDR_LINKLOCAL)
+ fl6.flowi6_oif = lsa->l2tp_scope_id;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EDESTADDRREQ;
+
+ daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
+ }
+
+ if (fl6.flowi6_oif == 0)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+
+ if (msg->msg_controllen) {
+ opt = &opt_space;
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
+ opt->tot_len = sizeof(struct ipv6_txoptions);
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt,
+ &hlimit, &tclass, &dontfrag);
+ if (err < 0) {
+ fl6_sock_release(flowlabel);
+ return err;
+ }
+ if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ }
+ if (!(opt->opt_nflen|opt->opt_flen))
+ opt = NULL;
+ }
+
+ if (opt == NULL)
+ opt = np->opt;
+ if (flowlabel)
+ opt = fl6_merge_options(&opt_space, flowlabel, opt);
+ opt = ipv6_fixup_options(&opt_space, opt);
+
+ fl6.flowi6_proto = sk->sk_protocol;
+ if (!ipv6_addr_any(daddr))
+ fl6.daddr = *daddr;
+ else
+ fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+ fl6.saddr = np->saddr;
+
+ final_p = fl6_update_dst(&fl6, opt, &final);
+
+ if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+ fl6.flowi6_oif = np->mcast_oif;
+ else if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->ucast_oif;
+
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto out;
+ }
+
+ if (hlimit < 0)
+ hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+
+ if (tclass < 0)
+ tclass = np->tclass;
+
+ if (dontfrag < 0)
+ dontfrag = np->dontfrag;
+
+ if (msg->msg_flags & MSG_CONFIRM)
+ goto do_confirm;
+
+back_from_confirm:
+ lock_sock(sk);
+ err = ip6_append_data(sk, ip_generic_getfrag, msg,
+ ulen, transhdrlen, hlimit, tclass, opt,
+ &fl6, (struct rt6_info *)dst,
+ msg->msg_flags, dontfrag);
+ if (err)
+ ip6_flush_pending_frames(sk);
+ else if (!(msg->msg_flags & MSG_MORE))
+ err = l2tp_ip6_push_pending_frames(sk);
+ release_sock(sk);
+done:
+ dst_release(dst);
+out:
+ fl6_sock_release(flowlabel);
+
+ return err < 0 ? err : len;
+
+do_confirm:
+ dst_confirm(dst);
+ if (!(msg->msg_flags & MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
+}
+
+static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
+ size_t copied = 0;
+ int err = -EOPNOTSUPP;
+ struct sk_buff *skb;
+
+ if (flags & MSG_OOB)
+ goto out;
+
+ if (addr_len)
+ *addr_len = sizeof(*lsa);
+
+ if (flags & MSG_ERRQUEUE)
+ return ipv6_recv_error(sk, msg, len, addr_len);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto done;
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ /* Copy the address. */
+ if (lsa) {
+ lsa->l2tp_family = AF_INET6;
+ lsa->l2tp_unused = 0;
+ lsa->l2tp_addr = ipv6_hdr(skb)->saddr;
+ lsa->l2tp_flowinfo = 0;
+ lsa->l2tp_scope_id = 0;
+ lsa->l2tp_conn_id = 0;
+ if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
+ lsa->l2tp_scope_id = inet6_iif(skb);
+ }
+
+ if (np->rxopt.all)
+ ip6_datagram_recv_ctl(sk, msg, skb);
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+done:
+ skb_free_datagram(sk, skb);
+out:
+ return err ? err : copied;
+}
+
+static struct proto l2tp_ip6_prot = {
+ .name = "L2TP/IPv6",
+ .owner = THIS_MODULE,
+ .init = l2tp_ip6_open,
+ .close = l2tp_ip6_close,
+ .bind = l2tp_ip6_bind,
+ .connect = l2tp_ip6_connect,
+ .disconnect = l2tp_ip6_disconnect,
+ .ioctl = udp_ioctl,
+ .destroy = l2tp_ip6_destroy_sock,
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .sendmsg = l2tp_ip6_sendmsg,
+ .recvmsg = l2tp_ip6_recvmsg,
+ .backlog_rcv = l2tp_ip6_backlog_recv,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .obj_size = sizeof(struct l2tp_ip6_sock),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+static const struct proto_ops l2tp_ip6_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = l2tp_ip6_getname,
+ .poll = datagram_poll,
+ .ioctl = inet6_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw l2tp_ip6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_L2TP,
+ .prot = &l2tp_ip6_prot,
+ .ops = &l2tp_ip6_ops,
+};
+
+static struct inet6_protocol l2tp_ip6_protocol __read_mostly = {
+ .handler = l2tp_ip6_recv,
+};
+
+static int __init l2tp_ip6_init(void)
+{
+ int err;
+
+ pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n");
+
+ err = proto_register(&l2tp_ip6_prot, 1);
+ if (err != 0)
+ goto out;
+
+ err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP);
+ if (err)
+ goto out1;
+
+ inet6_register_protosw(&l2tp_ip6_protosw);
+ return 0;
+
+out1:
+ proto_unregister(&l2tp_ip6_prot);
+out:
+ return err;
+}
+
+static void __exit l2tp_ip6_exit(void)
+{
+ inet6_unregister_protosw(&l2tp_ip6_protosw);
+ inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP);
+ proto_unregister(&l2tp_ip6_prot);
+}
+
+module_init(l2tp_ip6_init);
+module_exit(l2tp_ip6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Chris Elston <celston@katalix.com>");
+MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6");
+MODULE_VERSION("1.0");
+
+/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
+ * enums
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP);
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
new file mode 100644
index 000000000..9e13c2ff8
--- /dev/null
+++ b/net/l2tp/l2tp_netlink.c
@@ -0,0 +1,999 @@
+/*
+ * L2TP netlink layer, for management
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * Partly based on the IrDA nelink implementation
+ * (see net/irda/irnetlink.c) which is:
+ * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org>
+ * which is in turn partly based on the wireless netlink code:
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/udp.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <net/net_namespace.h>
+
+#include <linux/l2tp.h>
+
+#include "l2tp_core.h"
+
+
+static struct genl_family l2tp_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = L2TP_GENL_NAME,
+ .version = L2TP_GENL_VERSION,
+ .hdrsize = 0,
+ .maxattr = L2TP_ATTR_MAX,
+ .netnsok = true,
+};
+
+static const struct genl_multicast_group l2tp_multicast_group[] = {
+ {
+ .name = L2TP_GENL_MCGROUP,
+ },
+};
+
+static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, struct l2tp_tunnel *tunnel, u8 cmd);
+static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, struct l2tp_session *session,
+ u8 cmd);
+
+/* Accessed under genl lock */
+static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
+
+static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
+{
+ u32 tunnel_id;
+ u32 session_id;
+ char *ifname;
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session = NULL;
+ struct net *net = genl_info_net(info);
+
+ if (info->attrs[L2TP_ATTR_IFNAME]) {
+ ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
+ session = l2tp_session_find_by_ifname(net, ifname);
+ } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) &&
+ (info->attrs[L2TP_ATTR_CONN_ID])) {
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+ session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel)
+ session = l2tp_session_find(net, tunnel, session_id);
+ }
+
+ return session;
+}
+
+static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int ret = -ENOBUFS;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &l2tp_nl_family, 0, L2TP_CMD_NOOP);
+ if (!hdr) {
+ ret = -EMSGSIZE;
+ goto err_out;
+ }
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
+
+err_out:
+ nlmsg_free(msg);
+
+out:
+ return ret;
+}
+
+static int l2tp_tunnel_notify(struct genl_family *family,
+ struct genl_info *info,
+ struct l2tp_tunnel *tunnel,
+ u8 cmd)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq,
+ NLM_F_ACK, tunnel, cmd);
+
+ if (ret >= 0)
+ return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static int l2tp_session_notify(struct genl_family *family,
+ struct genl_info *info,
+ struct l2tp_session *session,
+ u8 cmd)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
+ NLM_F_ACK, session, cmd);
+
+ if (ret >= 0)
+ return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 tunnel_id;
+ u32 peer_tunnel_id;
+ int proto_version;
+ int fd;
+ int ret = 0;
+ struct l2tp_tunnel_cfg cfg = { 0, };
+ struct l2tp_tunnel *tunnel;
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+
+ if (!info->attrs[L2TP_ATTR_PEER_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ peer_tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_CONN_ID]);
+
+ if (!info->attrs[L2TP_ATTR_PROTO_VERSION]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ proto_version = nla_get_u8(info->attrs[L2TP_ATTR_PROTO_VERSION]);
+
+ if (!info->attrs[L2TP_ATTR_ENCAP_TYPE]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ cfg.encap = nla_get_u16(info->attrs[L2TP_ATTR_ENCAP_TYPE]);
+
+ fd = -1;
+ if (info->attrs[L2TP_ATTR_FD]) {
+ fd = nla_get_u32(info->attrs[L2TP_ATTR_FD]);
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ if (info->attrs[L2TP_ATTR_IP6_SADDR] &&
+ info->attrs[L2TP_ATTR_IP6_DADDR]) {
+ cfg.local_ip6 = nla_data(
+ info->attrs[L2TP_ATTR_IP6_SADDR]);
+ cfg.peer_ip6 = nla_data(
+ info->attrs[L2TP_ATTR_IP6_DADDR]);
+ } else
+#endif
+ if (info->attrs[L2TP_ATTR_IP_SADDR] &&
+ info->attrs[L2TP_ATTR_IP_DADDR]) {
+ cfg.local_ip.s_addr = nla_get_in_addr(
+ info->attrs[L2TP_ATTR_IP_SADDR]);
+ cfg.peer_ip.s_addr = nla_get_in_addr(
+ info->attrs[L2TP_ATTR_IP_DADDR]);
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (info->attrs[L2TP_ATTR_UDP_SPORT])
+ cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]);
+ if (info->attrs[L2TP_ATTR_UDP_DPORT])
+ cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]);
+ if (info->attrs[L2TP_ATTR_UDP_CSUM])
+ cfg.use_udp_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_CSUM]);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX])
+ cfg.udp6_zero_tx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
+ if (info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX])
+ cfg.udp6_zero_rx_checksums = nla_get_flag(info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
+#endif
+ }
+
+ if (info->attrs[L2TP_ATTR_DEBUG])
+ cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel != NULL) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ switch (cfg.encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ case L2TP_ENCAPTYPE_IP:
+ ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id,
+ peer_tunnel_id, &cfg, &tunnel);
+ break;
+ }
+
+ if (ret >= 0)
+ ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
+ tunnel, L2TP_CMD_TUNNEL_CREATE);
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info)
+{
+ struct l2tp_tunnel *tunnel;
+ u32 tunnel_id;
+ int ret = 0;
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ l2tp_tunnel_notify(&l2tp_nl_family, info,
+ tunnel, L2TP_CMD_TUNNEL_DELETE);
+
+ (void) l2tp_tunnel_delete(tunnel);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info)
+{
+ struct l2tp_tunnel *tunnel;
+ u32 tunnel_id;
+ int ret = 0;
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (info->attrs[L2TP_ATTR_DEBUG])
+ tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
+
+ ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
+ tunnel, L2TP_CMD_TUNNEL_MODIFY);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags,
+ struct l2tp_tunnel *tunnel, u8 cmd)
+{
+ void *hdr;
+ struct nlattr *nest;
+ struct sock *sk = NULL;
+ struct inet_sock *inet;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6_pinfo *np = NULL;
+#endif
+
+ hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
+ nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
+ nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) ||
+ nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, L2TP_ATTR_STATS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&tunnel->stats.tx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&tunnel->stats.tx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&tunnel->stats.tx_errors)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&tunnel->stats.rx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&tunnel->stats.rx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
+ atomic_long_read(&tunnel->stats.rx_seq_discards)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
+ atomic_long_read(&tunnel->stats.rx_oos_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&tunnel->stats.rx_errors)))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ sk = tunnel->sock;
+ if (!sk)
+ goto out;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ np = inet6_sk(sk);
+#endif
+
+ inet = inet_sk(sk);
+
+ switch (tunnel->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
+ nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)) ||
+ nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
+ goto nla_put_failure;
+ /* NOBREAK */
+ case L2TP_ENCAPTYPE_IP:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (np) {
+ if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR,
+ &np->saddr) ||
+ nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR,
+ &sk->sk_v6_daddr))
+ goto nla_put_failure;
+ } else
+#endif
+ if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR,
+ inet->inet_saddr) ||
+ nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR,
+ inet->inet_daddr))
+ goto nla_put_failure;
+ break;
+ }
+
+out:
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -1;
+}
+
+static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct l2tp_tunnel *tunnel;
+ struct sk_buff *msg;
+ u32 tunnel_id;
+ int ret = -ENOBUFS;
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (tunnel == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq,
+ NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET);
+ if (ret < 0)
+ goto err_out;
+
+ return genlmsg_unicast(net, msg, info->snd_portid);
+
+err_out:
+ nlmsg_free(msg);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int ti = cb->args[0];
+ struct l2tp_tunnel *tunnel;
+ struct net *net = sock_net(skb->sk);
+
+ for (;;) {
+ tunnel = l2tp_tunnel_find_nth(net, ti);
+ if (tunnel == NULL)
+ goto out;
+
+ if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ tunnel, L2TP_CMD_TUNNEL_GET) < 0)
+ goto out;
+
+ ti++;
+ }
+
+out:
+ cb->args[0] = ti;
+
+ return skb->len;
+}
+
+static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 tunnel_id = 0;
+ u32 session_id;
+ u32 peer_session_id;
+ int ret = 0;
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session;
+ struct l2tp_session_cfg cfg = { 0, };
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+ if (!tunnel) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!info->attrs[L2TP_ATTR_SESSION_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
+ session = l2tp_session_find(net, tunnel, session_id);
+ if (session) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]);
+
+ if (!info->attrs[L2TP_ATTR_PW_TYPE]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]);
+ if (cfg.pw_type >= __L2TP_PWTYPE_MAX) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (tunnel->version > 2) {
+ if (info->attrs[L2TP_ATTR_OFFSET])
+ cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
+
+ if (info->attrs[L2TP_ATTR_DATA_SEQ])
+ cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
+
+ cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT;
+ if (info->attrs[L2TP_ATTR_L2SPEC_TYPE])
+ cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
+
+ cfg.l2specific_len = 4;
+ if (info->attrs[L2TP_ATTR_L2SPEC_LEN])
+ cfg.l2specific_len = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_LEN]);
+
+ if (info->attrs[L2TP_ATTR_COOKIE]) {
+ u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]);
+ if (len > 8) {
+ ret = -EINVAL;
+ goto out;
+ }
+ cfg.cookie_len = len;
+ memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len);
+ }
+ if (info->attrs[L2TP_ATTR_PEER_COOKIE]) {
+ u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]);
+ if (len > 8) {
+ ret = -EINVAL;
+ goto out;
+ }
+ cfg.peer_cookie_len = len;
+ memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len);
+ }
+ if (info->attrs[L2TP_ATTR_IFNAME])
+ cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
+
+ if (info->attrs[L2TP_ATTR_VLAN_ID])
+ cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
+ }
+
+ if (info->attrs[L2TP_ATTR_DEBUG])
+ cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
+
+ if (info->attrs[L2TP_ATTR_RECV_SEQ])
+ cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
+
+ if (info->attrs[L2TP_ATTR_SEND_SEQ])
+ cfg.send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]);
+
+ if (info->attrs[L2TP_ATTR_LNS_MODE])
+ cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]);
+
+ if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
+ cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
+
+ if (info->attrs[L2TP_ATTR_MTU])
+ cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
+
+ if (info->attrs[L2TP_ATTR_MRU])
+ cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
+
+ if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) ||
+ (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) {
+ ret = -EPROTONOSUPPORT;
+ goto out;
+ }
+
+ /* Check that pseudowire-specific params are present */
+ switch (cfg.pw_type) {
+ case L2TP_PWTYPE_NONE:
+ break;
+ case L2TP_PWTYPE_ETH_VLAN:
+ if (!info->attrs[L2TP_ATTR_VLAN_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case L2TP_PWTYPE_ETH:
+ break;
+ case L2TP_PWTYPE_PPP:
+ case L2TP_PWTYPE_PPP_AC:
+ break;
+ case L2TP_PWTYPE_IP:
+ default:
+ ret = -EPROTONOSUPPORT;
+ break;
+ }
+
+ ret = -EPROTONOSUPPORT;
+ if (l2tp_nl_cmd_ops[cfg.pw_type]->session_create)
+ ret = (*l2tp_nl_cmd_ops[cfg.pw_type]->session_create)(net, tunnel_id,
+ session_id, peer_session_id, &cfg);
+
+ if (ret >= 0) {
+ session = l2tp_session_find(net, tunnel, session_id);
+ if (session)
+ ret = l2tp_session_notify(&l2tp_nl_family, info, session,
+ L2TP_CMD_SESSION_CREATE);
+ }
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret = 0;
+ struct l2tp_session *session;
+ u16 pw_type;
+
+ session = l2tp_nl_session_find(info);
+ if (session == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ l2tp_session_notify(&l2tp_nl_family, info,
+ session, L2TP_CMD_SESSION_DELETE);
+
+ pw_type = session->pwtype;
+ if (pw_type < __L2TP_PWTYPE_MAX)
+ if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
+ ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret = 0;
+ struct l2tp_session *session;
+
+ session = l2tp_nl_session_find(info);
+ if (session == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (info->attrs[L2TP_ATTR_DEBUG])
+ session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
+
+ if (info->attrs[L2TP_ATTR_DATA_SEQ])
+ session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
+
+ if (info->attrs[L2TP_ATTR_RECV_SEQ])
+ session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
+
+ if (info->attrs[L2TP_ATTR_SEND_SEQ]) {
+ session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]);
+ l2tp_session_set_header_len(session, session->tunnel->version);
+ }
+
+ if (info->attrs[L2TP_ATTR_LNS_MODE])
+ session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]);
+
+ if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
+ session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
+
+ if (info->attrs[L2TP_ATTR_MTU])
+ session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
+
+ if (info->attrs[L2TP_ATTR_MRU])
+ session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
+
+ ret = l2tp_session_notify(&l2tp_nl_family, info,
+ session, L2TP_CMD_SESSION_MODIFY);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags,
+ struct l2tp_session *session, u8 cmd)
+{
+ void *hdr;
+ struct nlattr *nest;
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ struct sock *sk = NULL;
+
+ sk = tunnel->sock;
+
+ hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
+ nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||
+ nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
+ nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
+ session->peer_session_id) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
+ nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
+ nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
+ (session->mru &&
+ nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
+ goto nla_put_failure;
+
+ if ((session->ifname[0] &&
+ nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
+ (session->cookie_len &&
+ nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
+ &session->cookie[0])) ||
+ (session->peer_cookie_len &&
+ nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len,
+ &session->peer_cookie[0])) ||
+ nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) ||
+ nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) ||
+ nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) ||
+#ifdef CONFIG_XFRM
+ (((sk) && (sk->sk_policy[0] || sk->sk_policy[1])) &&
+ nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) ||
+#endif
+ (session->reorder_timeout &&
+ nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout)))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, L2TP_ATTR_STATS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS,
+ atomic_long_read(&session->stats.tx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_BYTES,
+ atomic_long_read(&session->stats.tx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_TX_ERRORS,
+ atomic_long_read(&session->stats.tx_errors)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_PACKETS,
+ atomic_long_read(&session->stats.rx_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_BYTES,
+ atomic_long_read(&session->stats.rx_bytes)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
+ atomic_long_read(&session->stats.rx_seq_discards)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS,
+ atomic_long_read(&session->stats.rx_oos_packets)) ||
+ nla_put_u64(skb, L2TP_ATTR_RX_ERRORS,
+ atomic_long_read(&session->stats.rx_errors)))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -1;
+}
+
+static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct l2tp_session *session;
+ struct sk_buff *msg;
+ int ret;
+
+ session = l2tp_nl_session_find(info);
+ if (session == NULL) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
+ 0, session, L2TP_CMD_SESSION_GET);
+ if (ret < 0)
+ goto err_out;
+
+ return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
+
+err_out:
+ nlmsg_free(msg);
+
+out:
+ return ret;
+}
+
+static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel = NULL;
+ int ti = cb->args[0];
+ int si = cb->args[1];
+
+ for (;;) {
+ if (tunnel == NULL) {
+ tunnel = l2tp_tunnel_find_nth(net, ti);
+ if (tunnel == NULL)
+ goto out;
+ }
+
+ session = l2tp_session_find_nth(tunnel, si);
+ if (session == NULL) {
+ ti++;
+ tunnel = NULL;
+ si = 0;
+ continue;
+ }
+
+ if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ session, L2TP_CMD_SESSION_GET) < 0)
+ break;
+
+ si++;
+ }
+
+out:
+ cb->args[0] = ti;
+ cb->args[1] = si;
+
+ return skb->len;
+}
+
+static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
+ [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, },
+ [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, },
+ [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, },
+ [L2TP_ATTR_OFFSET] = { .type = NLA_U16, },
+ [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, },
+ [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, },
+ [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, },
+ [L2TP_ATTR_PROTO_VERSION] = { .type = NLA_U8, },
+ [L2TP_ATTR_CONN_ID] = { .type = NLA_U32, },
+ [L2TP_ATTR_PEER_CONN_ID] = { .type = NLA_U32, },
+ [L2TP_ATTR_SESSION_ID] = { .type = NLA_U32, },
+ [L2TP_ATTR_PEER_SESSION_ID] = { .type = NLA_U32, },
+ [L2TP_ATTR_UDP_CSUM] = { .type = NLA_U8, },
+ [L2TP_ATTR_VLAN_ID] = { .type = NLA_U16, },
+ [L2TP_ATTR_DEBUG] = { .type = NLA_U32, },
+ [L2TP_ATTR_RECV_SEQ] = { .type = NLA_U8, },
+ [L2TP_ATTR_SEND_SEQ] = { .type = NLA_U8, },
+ [L2TP_ATTR_LNS_MODE] = { .type = NLA_U8, },
+ [L2TP_ATTR_USING_IPSEC] = { .type = NLA_U8, },
+ [L2TP_ATTR_RECV_TIMEOUT] = { .type = NLA_MSECS, },
+ [L2TP_ATTR_FD] = { .type = NLA_U32, },
+ [L2TP_ATTR_IP_SADDR] = { .type = NLA_U32, },
+ [L2TP_ATTR_IP_DADDR] = { .type = NLA_U32, },
+ [L2TP_ATTR_UDP_SPORT] = { .type = NLA_U16, },
+ [L2TP_ATTR_UDP_DPORT] = { .type = NLA_U16, },
+ [L2TP_ATTR_MTU] = { .type = NLA_U16, },
+ [L2TP_ATTR_MRU] = { .type = NLA_U16, },
+ [L2TP_ATTR_STATS] = { .type = NLA_NESTED, },
+ [L2TP_ATTR_IP6_SADDR] = {
+ .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr),
+ },
+ [L2TP_ATTR_IP6_DADDR] = {
+ .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr),
+ },
+ [L2TP_ATTR_IFNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1,
+ },
+ [L2TP_ATTR_COOKIE] = {
+ .type = NLA_BINARY,
+ .len = 8,
+ },
+ [L2TP_ATTR_PEER_COOKIE] = {
+ .type = NLA_BINARY,
+ .len = 8,
+ },
+};
+
+static const struct genl_ops l2tp_nl_ops[] = {
+ {
+ .cmd = L2TP_CMD_NOOP,
+ .doit = l2tp_nl_cmd_noop,
+ .policy = l2tp_nl_policy,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = L2TP_CMD_TUNNEL_CREATE,
+ .doit = l2tp_nl_cmd_tunnel_create,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_TUNNEL_DELETE,
+ .doit = l2tp_nl_cmd_tunnel_delete,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_TUNNEL_MODIFY,
+ .doit = l2tp_nl_cmd_tunnel_modify,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_TUNNEL_GET,
+ .doit = l2tp_nl_cmd_tunnel_get,
+ .dumpit = l2tp_nl_cmd_tunnel_dump,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_SESSION_CREATE,
+ .doit = l2tp_nl_cmd_session_create,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_SESSION_DELETE,
+ .doit = l2tp_nl_cmd_session_delete,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_SESSION_MODIFY,
+ .doit = l2tp_nl_cmd_session_modify,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = L2TP_CMD_SESSION_GET,
+ .doit = l2tp_nl_cmd_session_get,
+ .dumpit = l2tp_nl_cmd_session_dump,
+ .policy = l2tp_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
+{
+ int ret;
+
+ ret = -EINVAL;
+ if (pw_type >= __L2TP_PWTYPE_MAX)
+ goto err;
+
+ genl_lock();
+ ret = -EBUSY;
+ if (l2tp_nl_cmd_ops[pw_type])
+ goto out;
+
+ l2tp_nl_cmd_ops[pw_type] = ops;
+ ret = 0;
+
+out:
+ genl_unlock();
+err:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(l2tp_nl_register_ops);
+
+void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
+{
+ if (pw_type < __L2TP_PWTYPE_MAX) {
+ genl_lock();
+ l2tp_nl_cmd_ops[pw_type] = NULL;
+ genl_unlock();
+ }
+}
+EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
+
+static int l2tp_nl_init(void)
+{
+ pr_info("L2TP netlink interface\n");
+ return genl_register_family_with_ops_groups(&l2tp_nl_family,
+ l2tp_nl_ops,
+ l2tp_multicast_group);
+}
+
+static void l2tp_nl_cleanup(void)
+{
+ genl_unregister_family(&l2tp_nl_family);
+}
+
+module_init(l2tp_nl_init);
+module_exit(l2tp_nl_cleanup);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP netlink");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+MODULE_ALIAS_GENL_FAMILY("l2tp");
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
new file mode 100644
index 000000000..e9b0dec56
--- /dev/null
+++ b/net/l2tp/l2tp_ppp.c
@@ -0,0 +1,1865 @@
+/*****************************************************************************
+ * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets
+ *
+ * PPPoX --- Generic PPP encapsulation socket family
+ * PPPoL2TP --- PPP over L2TP (RFC 2661)
+ *
+ * Version: 2.0.0
+ *
+ * Authors: James Chapman (jchapman@katalix.com)
+ *
+ * Based on original work by Martijn van Oosterhout <kleptog@svana.org>
+ *
+ * License:
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* This driver handles only L2TP data frames; control frames are handled by a
+ * userspace application.
+ *
+ * To send data in an L2TP session, userspace opens a PPPoL2TP socket and
+ * attaches it to a bound UDP socket with local tunnel_id / session_id and
+ * peer tunnel_id / session_id set. Data can then be sent or received using
+ * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket
+ * can be read or modified using ioctl() or [gs]etsockopt() calls.
+ *
+ * When a PPPoL2TP socket is connected with local and peer session_id values
+ * zero, the socket is treated as a special tunnel management socket.
+ *
+ * Here's example userspace code to create a socket for sending/receiving data
+ * over an L2TP session:-
+ *
+ * struct sockaddr_pppol2tp sax;
+ * int fd;
+ * int session_fd;
+ *
+ * fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP);
+ *
+ * sax.sa_family = AF_PPPOX;
+ * sax.sa_protocol = PX_PROTO_OL2TP;
+ * sax.pppol2tp.fd = tunnel_fd; // bound UDP socket
+ * sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr;
+ * sax.pppol2tp.addr.sin_port = addr->sin_port;
+ * sax.pppol2tp.addr.sin_family = AF_INET;
+ * sax.pppol2tp.s_tunnel = tunnel_id;
+ * sax.pppol2tp.s_session = session_id;
+ * sax.pppol2tp.d_tunnel = peer_tunnel_id;
+ * sax.pppol2tp.d_session = peer_session_id;
+ *
+ * session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax));
+ *
+ * A pppd plugin that allows PPP traffic to be carried over L2TP using
+ * this driver is available from the OpenL2TP project at
+ * http://openl2tp.sourceforge.net.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if_pppox.h>
+#include <linux/if_pppol2tp.h>
+#include <net/sock.h>
+#include <linux/ppp_channel.h>
+#include <linux/ppp_defs.h>
+#include <linux/ppp-ioctl.h>
+#include <linux/file.h>
+#include <linux/hash.h>
+#include <linux/sort.h>
+#include <linux/proc_fs.h>
+#include <linux/l2tp.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+
+#include "l2tp_core.h"
+
+#define PPPOL2TP_DRV_VERSION "V2.0"
+
+/* Space for UDP, L2TP and PPP headers */
+#define PPPOL2TP_HEADER_OVERHEAD 40
+
+/* Number of bytes to build transmit L2TP headers.
+ * Unfortunately the size is different depending on whether sequence numbers
+ * are enabled.
+ */
+#define PPPOL2TP_L2TP_HDR_SIZE_SEQ 10
+#define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ 6
+
+/* Private data of each session. This data lives at the end of struct
+ * l2tp_session, referenced via session->priv[].
+ */
+struct pppol2tp_session {
+ int owner; /* pid that opened the socket */
+
+ struct sock *sock; /* Pointer to the session
+ * PPPoX socket */
+ struct sock *tunnel_sock; /* Pointer to the tunnel UDP
+ * socket */
+ int flags; /* accessed by PPPIOCGFLAGS.
+ * Unused. */
+};
+
+static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
+
+static const struct ppp_channel_ops pppol2tp_chan_ops = {
+ .start_xmit = pppol2tp_xmit,
+};
+
+static const struct proto_ops pppol2tp_ops;
+
+/* Helpers to obtain tunnel/session contexts from sockets.
+ */
+static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk)
+{
+ struct l2tp_session *session;
+
+ if (sk == NULL)
+ return NULL;
+
+ sock_hold(sk);
+ session = (struct l2tp_session *)(sk->sk_user_data);
+ if (session == NULL) {
+ sock_put(sk);
+ goto out;
+ }
+
+ BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+
+out:
+ return session;
+}
+
+/*****************************************************************************
+ * Receive data handling
+ *****************************************************************************/
+
+static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
+{
+ /* Skip PPP header, if present. In testing, Microsoft L2TP clients
+ * don't send the PPP header (PPP header compression enabled), but
+ * other clients can include the header. So we cope with both cases
+ * here. The PPP header is always FF03 when using L2TP.
+ *
+ * Note that skb->data[] isn't dereferenced from a u16 ptr here since
+ * the field may be unaligned.
+ */
+ if (!pskb_may_pull(skb, 2))
+ return 1;
+
+ if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03))
+ skb_pull(skb, 2);
+
+ return 0;
+}
+
+/* Receive message. This is the recvmsg for the PPPoL2TP socket.
+ */
+static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ int err;
+ struct sk_buff *skb;
+ struct sock *sk = sock->sk;
+
+ err = -EIO;
+ if (sk->sk_state & PPPOX_BOUND)
+ goto end;
+
+ err = 0;
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ goto end;
+
+ if (len > skb->len)
+ len = skb->len;
+ else if (len < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+
+ err = skb_copy_datagram_msg(skb, 0, msg, len);
+ if (likely(err == 0))
+ err = len;
+
+ kfree_skb(skb);
+end:
+ return err;
+}
+
+static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
+{
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+ struct sock *sk = NULL;
+
+ /* If the socket is bound, send it in to PPP's input queue. Otherwise
+ * queue it on the session socket.
+ */
+ sk = ps->sock;
+ if (sk == NULL)
+ goto no_sock;
+
+ if (sk->sk_state & PPPOX_BOUND) {
+ struct pppox_sock *po;
+ l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ "%s: recv %d byte data frame, passing to ppp\n",
+ session->name, data_len);
+
+ /* We need to forget all info related to the L2TP packet
+ * gathered in the skb as we are going to reuse the same
+ * skb for the inner packet.
+ * Namely we need to:
+ * - reset xfrm (IPSec) information as it applies to
+ * the outer L2TP packet and not to the inner one
+ * - release the dst to force a route lookup on the inner
+ * IP packet since skb->dst currently points to the dst
+ * of the UDP tunnel
+ * - reset netfilter information as it doesn't apply
+ * to the inner packet either
+ */
+ secpath_reset(skb);
+ skb_dst_drop(skb);
+ nf_reset(skb);
+
+ po = pppox_sk(sk);
+ ppp_input(&po->chan, skb);
+ } else {
+ l2tp_dbg(session, PPPOL2TP_MSG_DATA,
+ "%s: recv %d byte data frame, passing to L2TP socket\n",
+ session->name, data_len);
+
+ if (sock_queue_rcv_skb(sk, skb) < 0) {
+ atomic_long_inc(&session->stats.rx_errors);
+ kfree_skb(skb);
+ }
+ }
+
+ return;
+
+no_sock:
+ l2tp_info(session, PPPOL2TP_MSG_DATA, "%s: no socket\n", session->name);
+ kfree_skb(skb);
+}
+
+static void pppol2tp_session_sock_hold(struct l2tp_session *session)
+{
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+ if (ps->sock)
+ sock_hold(ps->sock);
+}
+
+static void pppol2tp_session_sock_put(struct l2tp_session *session)
+{
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+ if (ps->sock)
+ sock_put(ps->sock);
+}
+
+/************************************************************************
+ * Transmit handling
+ ***********************************************************************/
+
+/* This is the sendmsg for the PPPoL2TP pppol2tp_session socket. We come here
+ * when a user application does a sendmsg() on the session socket. L2TP and
+ * PPP headers must be inserted into the user's data.
+ */
+static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
+ size_t total_len)
+{
+ static const unsigned char ppph[2] = { 0xff, 0x03 };
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int error;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ int uhlen;
+
+ error = -ENOTCONN;
+ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
+ goto error;
+
+ /* Get session and tunnel contexts */
+ error = -EBADF;
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto error;
+
+ ps = l2tp_session_priv(session);
+ tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+ if (tunnel == NULL)
+ goto error_put_sess;
+
+ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+
+ /* Allocate a socket buffer */
+ error = -ENOMEM;
+ skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) +
+ uhlen + session->hdr_len +
+ sizeof(ppph) + total_len,
+ 0, GFP_KERNEL);
+ if (!skb)
+ goto error_put_sess_tun;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, NET_SKB_PAD);
+ skb_reset_network_header(skb);
+ skb_reserve(skb, sizeof(struct iphdr));
+ skb_reset_transport_header(skb);
+ skb_reserve(skb, uhlen);
+
+ /* Add PPP header */
+ skb->data[0] = ppph[0];
+ skb->data[1] = ppph[1];
+ skb_put(skb, 2);
+
+ /* Copy user data into skb */
+ error = memcpy_from_msg(skb_put(skb, total_len), m, total_len);
+ if (error < 0) {
+ kfree_skb(skb);
+ goto error_put_sess_tun;
+ }
+
+ local_bh_disable();
+ l2tp_xmit_skb(session, skb, session->hdr_len);
+ local_bh_enable();
+
+ sock_put(ps->tunnel_sock);
+ sock_put(sk);
+
+ return total_len;
+
+error_put_sess_tun:
+ sock_put(ps->tunnel_sock);
+error_put_sess:
+ sock_put(sk);
+error:
+ return error;
+}
+
+/* Transmit function called by generic PPP driver. Sends PPP frame
+ * over PPPoL2TP socket.
+ *
+ * This is almost the same as pppol2tp_sendmsg(), but rather than
+ * being called with a msghdr from userspace, it is called with a skb
+ * from the kernel.
+ *
+ * The supplied skb from ppp doesn't have enough headroom for the
+ * insertion of L2TP, UDP and IP headers so we need to allocate more
+ * headroom in the skb. This will create a cloned skb. But we must be
+ * careful in the error case because the caller will expect to free
+ * the skb it supplied, not our cloned skb. So we take care to always
+ * leave the original skb unfreed if we return an error.
+ */
+static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+ static const u8 ppph[2] = { 0xff, 0x03 };
+ struct sock *sk = (struct sock *) chan->private;
+ struct sock *sk_tun;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ int uhlen, headroom;
+
+ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
+ goto abort;
+
+ /* Get session and tunnel contexts from the socket */
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto abort;
+
+ ps = l2tp_session_priv(session);
+ sk_tun = ps->tunnel_sock;
+ if (sk_tun == NULL)
+ goto abort_put_sess;
+ tunnel = l2tp_sock_to_tunnel(sk_tun);
+ if (tunnel == NULL)
+ goto abort_put_sess;
+
+ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+ headroom = NET_SKB_PAD +
+ sizeof(struct iphdr) + /* IP header */
+ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */
+ session->hdr_len + /* L2TP header */
+ sizeof(ppph); /* PPP header */
+ if (skb_cow_head(skb, headroom))
+ goto abort_put_sess_tun;
+
+ /* Setup PPP header */
+ __skb_push(skb, sizeof(ppph));
+ skb->data[0] = ppph[0];
+ skb->data[1] = ppph[1];
+
+ local_bh_disable();
+ l2tp_xmit_skb(session, skb, session->hdr_len);
+ local_bh_enable();
+
+ sock_put(sk_tun);
+ sock_put(sk);
+ return 1;
+
+abort_put_sess_tun:
+ sock_put(sk_tun);
+abort_put_sess:
+ sock_put(sk);
+abort:
+ /* Free the original skb */
+ kfree_skb(skb);
+ return 1;
+}
+
+/*****************************************************************************
+ * Session (and tunnel control) socket create/destroy.
+ *****************************************************************************/
+
+/* Called by l2tp_core when a session socket is being closed.
+ */
+static void pppol2tp_session_close(struct l2tp_session *session)
+{
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+ struct sock *sk = ps->sock;
+ struct socket *sock = sk->sk_socket;
+
+ BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+
+ if (sock) {
+ inet_shutdown(sock, 2);
+ /* Don't let the session go away before our socket does */
+ l2tp_session_inc_refcount(session);
+ }
+}
+
+/* Really kill the session socket. (Called from sock_put() if
+ * refcnt == 0.)
+ */
+static void pppol2tp_session_destruct(struct sock *sk)
+{
+ struct l2tp_session *session = sk->sk_user_data;
+ if (session) {
+ sk->sk_user_data = NULL;
+ BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+ l2tp_session_dec_refcount(session);
+ }
+}
+
+/* Called when the PPPoX socket (session) is closed.
+ */
+static int pppol2tp_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct l2tp_session *session;
+ int error;
+
+ if (!sk)
+ return 0;
+
+ error = -EBADF;
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_DEAD) != 0)
+ goto error;
+
+ pppox_unbind_sock(sk);
+
+ /* Signal the death of the socket. */
+ sk->sk_state = PPPOX_DEAD;
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ session = pppol2tp_sock_to_session(sk);
+
+ /* Purge any queued data */
+ if (session != NULL) {
+ __l2tp_session_unhash(session);
+ l2tp_session_queue_purge(session);
+ sock_put(sk);
+ }
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+
+ release_sock(sk);
+
+ /* This will delete the session context via
+ * pppol2tp_session_destruct() if the socket's refcnt drops to
+ * zero.
+ */
+ sock_put(sk);
+
+ return 0;
+
+error:
+ release_sock(sk);
+ return error;
+}
+
+static struct proto pppol2tp_sk_proto = {
+ .name = "PPPOL2TP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct pppox_sock),
+};
+
+static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+
+ rc = l2tp_udp_encap_recv(sk, skb);
+ if (rc)
+ kfree_skb(skb);
+
+ return NET_RX_SUCCESS;
+}
+
+/* socket() handler. Initialize a new struct sock.
+ */
+static int pppol2tp_create(struct net *net, struct socket *sock)
+{
+ int error = -ENOMEM;
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
+ if (!sk)
+ goto out;
+
+ sock_init_data(sock, sk);
+
+ sock->state = SS_UNCONNECTED;
+ sock->ops = &pppol2tp_ops;
+
+ sk->sk_backlog_rcv = pppol2tp_backlog_recv;
+ sk->sk_protocol = PX_PROTO_OL2TP;
+ sk->sk_family = PF_PPPOX;
+ sk->sk_state = PPPOX_NONE;
+ sk->sk_type = SOCK_STREAM;
+ sk->sk_destruct = pppol2tp_session_destruct;
+
+ error = 0;
+
+out:
+ return error;
+}
+
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+static void pppol2tp_show(struct seq_file *m, void *arg)
+{
+ struct l2tp_session *session = arg;
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+ if (ps) {
+ struct pppox_sock *po = pppox_sk(ps->sock);
+ if (po)
+ seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
+ }
+}
+#endif
+
+/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
+ */
+static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
+ int sockaddr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct l2tp_session *session = NULL;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ struct dst_entry *dst;
+ struct l2tp_session_cfg cfg = { 0, };
+ int error = 0;
+ u32 tunnel_id, peer_tunnel_id;
+ u32 session_id, peer_session_id;
+ int ver = 2;
+ int fd;
+
+ lock_sock(sk);
+
+ error = -EINVAL;
+ if (sp->sa_protocol != PX_PROTO_OL2TP)
+ goto end;
+
+ /* Check for already bound sockets */
+ error = -EBUSY;
+ if (sk->sk_state & PPPOX_CONNECTED)
+ goto end;
+
+ /* We don't supporting rebinding anyway */
+ error = -EALREADY;
+ if (sk->sk_user_data)
+ goto end; /* socket is already attached */
+
+ /* Get params from socket address. Handle L2TPv2 and L2TPv3.
+ * This is nasty because there are different sockaddr_pppol2tp
+ * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use
+ * the sockaddr size to determine which structure the caller
+ * is using.
+ */
+ peer_tunnel_id = 0;
+ if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) {
+ fd = sp->pppol2tp.fd;
+ tunnel_id = sp->pppol2tp.s_tunnel;
+ peer_tunnel_id = sp->pppol2tp.d_tunnel;
+ session_id = sp->pppol2tp.s_session;
+ peer_session_id = sp->pppol2tp.d_session;
+ } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) {
+ struct sockaddr_pppol2tpv3 *sp3 =
+ (struct sockaddr_pppol2tpv3 *) sp;
+ ver = 3;
+ fd = sp3->pppol2tp.fd;
+ tunnel_id = sp3->pppol2tp.s_tunnel;
+ peer_tunnel_id = sp3->pppol2tp.d_tunnel;
+ session_id = sp3->pppol2tp.s_session;
+ peer_session_id = sp3->pppol2tp.d_session;
+ } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) {
+ struct sockaddr_pppol2tpin6 *sp6 =
+ (struct sockaddr_pppol2tpin6 *) sp;
+ fd = sp6->pppol2tp.fd;
+ tunnel_id = sp6->pppol2tp.s_tunnel;
+ peer_tunnel_id = sp6->pppol2tp.d_tunnel;
+ session_id = sp6->pppol2tp.s_session;
+ peer_session_id = sp6->pppol2tp.d_session;
+ } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) {
+ struct sockaddr_pppol2tpv3in6 *sp6 =
+ (struct sockaddr_pppol2tpv3in6 *) sp;
+ ver = 3;
+ fd = sp6->pppol2tp.fd;
+ tunnel_id = sp6->pppol2tp.s_tunnel;
+ peer_tunnel_id = sp6->pppol2tp.d_tunnel;
+ session_id = sp6->pppol2tp.s_session;
+ peer_session_id = sp6->pppol2tp.d_session;
+ } else {
+ error = -EINVAL;
+ goto end; /* bad socket address */
+ }
+
+ /* Don't bind if tunnel_id is 0 */
+ error = -EINVAL;
+ if (tunnel_id == 0)
+ goto end;
+
+ tunnel = l2tp_tunnel_find(sock_net(sk), tunnel_id);
+
+ /* Special case: create tunnel context if session_id and
+ * peer_session_id is 0. Otherwise look up tunnel using supplied
+ * tunnel id.
+ */
+ if ((session_id == 0) && (peer_session_id == 0)) {
+ if (tunnel == NULL) {
+ struct l2tp_tunnel_cfg tcfg = {
+ .encap = L2TP_ENCAPTYPE_UDP,
+ .debug = 0,
+ };
+ error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
+ if (error < 0)
+ goto end;
+ }
+ } else {
+ /* Error if we can't find the tunnel */
+ error = -ENOENT;
+ if (tunnel == NULL)
+ goto end;
+
+ /* Error if socket is not prepped */
+ if (tunnel->sock == NULL)
+ goto end;
+ }
+
+ if (tunnel->recv_payload_hook == NULL)
+ tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
+
+ if (tunnel->peer_tunnel_id == 0)
+ tunnel->peer_tunnel_id = peer_tunnel_id;
+
+ /* Create session if it doesn't already exist. We handle the
+ * case where a session was previously created by the netlink
+ * interface by checking that the session doesn't already have
+ * a socket and its tunnel socket are what we expect. If any
+ * of those checks fail, return EEXIST to the caller.
+ */
+ session = l2tp_session_find(sock_net(sk), tunnel, session_id);
+ if (session == NULL) {
+ /* Default MTU must allow space for UDP/L2TP/PPP
+ * headers.
+ */
+ cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+
+ /* Allocate and initialize a new session context. */
+ session = l2tp_session_create(sizeof(struct pppol2tp_session),
+ tunnel, session_id,
+ peer_session_id, &cfg);
+ if (session == NULL) {
+ error = -ENOMEM;
+ goto end;
+ }
+ } else {
+ ps = l2tp_session_priv(session);
+ error = -EEXIST;
+ if (ps->sock != NULL)
+ goto end;
+
+ /* consistency checks */
+ if (ps->tunnel_sock != tunnel->sock)
+ goto end;
+ }
+
+ /* Associate session with its PPPoL2TP socket */
+ ps = l2tp_session_priv(session);
+ ps->owner = current->pid;
+ ps->sock = sk;
+ ps->tunnel_sock = tunnel->sock;
+
+ session->recv_skb = pppol2tp_recv;
+ session->session_close = pppol2tp_session_close;
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+ session->show = pppol2tp_show;
+#endif
+
+ /* We need to know each time a skb is dropped from the reorder
+ * queue.
+ */
+ session->ref = pppol2tp_session_sock_hold;
+ session->deref = pppol2tp_session_sock_put;
+
+ /* If PMTU discovery was enabled, use the MTU that was discovered */
+ dst = sk_dst_get(tunnel->sock);
+ if (dst != NULL) {
+ u32 pmtu = dst_mtu(dst);
+
+ if (pmtu != 0)
+ session->mtu = session->mru = pmtu -
+ PPPOL2TP_HEADER_OVERHEAD;
+ dst_release(dst);
+ }
+
+ /* Special case: if source & dest session_id == 0x0000, this
+ * socket is being created to manage the tunnel. Just set up
+ * the internal context for use by ioctl() and sockopt()
+ * handlers.
+ */
+ if ((session->session_id == 0) &&
+ (session->peer_session_id == 0)) {
+ error = 0;
+ goto out_no_ppp;
+ }
+
+ /* The only header we need to worry about is the L2TP
+ * header. This size is different depending on whether
+ * sequence numbers are enabled for the data channel.
+ */
+ po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
+
+ po->chan.private = sk;
+ po->chan.ops = &pppol2tp_chan_ops;
+ po->chan.mtu = session->mtu;
+
+ error = ppp_register_net_channel(sock_net(sk), &po->chan);
+ if (error)
+ goto end;
+
+out_no_ppp:
+ /* This is how we get the session context from the socket. */
+ sk->sk_user_data = session;
+ sk->sk_state = PPPOX_CONNECTED;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ session->name);
+
+end:
+ release_sock(sk);
+
+ return error;
+}
+
+#ifdef CONFIG_L2TP_V3
+
+/* Called when creating sessions via the netlink interface.
+ */
+static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+ int error;
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session;
+ struct pppol2tp_session *ps;
+
+ tunnel = l2tp_tunnel_find(net, tunnel_id);
+
+ /* Error if we can't find the tunnel */
+ error = -ENOENT;
+ if (tunnel == NULL)
+ goto out;
+
+ /* Error if tunnel socket is not prepped */
+ if (tunnel->sock == NULL)
+ goto out;
+
+ /* Check that this session doesn't already exist */
+ error = -EEXIST;
+ session = l2tp_session_find(net, tunnel, session_id);
+ if (session != NULL)
+ goto out;
+
+ /* Default MTU values. */
+ if (cfg->mtu == 0)
+ cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+ if (cfg->mru == 0)
+ cfg->mru = cfg->mtu;
+
+ /* Allocate and initialize a new session context. */
+ error = -ENOMEM;
+ session = l2tp_session_create(sizeof(struct pppol2tp_session),
+ tunnel, session_id,
+ peer_session_id, cfg);
+ if (session == NULL)
+ goto out;
+
+ ps = l2tp_session_priv(session);
+ ps->tunnel_sock = tunnel->sock;
+
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: created\n",
+ session->name);
+
+ error = 0;
+
+out:
+ return error;
+}
+
+#endif /* CONFIG_L2TP_V3 */
+
+/* getname() support.
+ */
+static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *usockaddr_len, int peer)
+{
+ int len = 0;
+ int error = 0;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet;
+ struct pppol2tp_session *pls;
+
+ error = -ENOTCONN;
+ if (sk == NULL)
+ goto end;
+ if (sk->sk_state != PPPOX_CONNECTED)
+ goto end;
+
+ error = -EBADF;
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto end;
+
+ pls = l2tp_session_priv(session);
+ tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock);
+ if (tunnel == NULL) {
+ error = -EBADF;
+ goto end_put_sess;
+ }
+
+ inet = inet_sk(tunnel->sock);
+ if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) {
+ struct sockaddr_pppol2tp sp;
+ len = sizeof(sp);
+ memset(&sp, 0, len);
+ sp.sa_family = AF_PPPOX;
+ sp.sa_protocol = PX_PROTO_OL2TP;
+ sp.pppol2tp.fd = tunnel->fd;
+ sp.pppol2tp.pid = pls->owner;
+ sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+ sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+ sp.pppol2tp.s_session = session->session_id;
+ sp.pppol2tp.d_session = session->peer_session_id;
+ sp.pppol2tp.addr.sin_family = AF_INET;
+ sp.pppol2tp.addr.sin_port = inet->inet_dport;
+ sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr;
+ memcpy(uaddr, &sp, len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if ((tunnel->version == 2) &&
+ (tunnel->sock->sk_family == AF_INET6)) {
+ struct sockaddr_pppol2tpin6 sp;
+
+ len = sizeof(sp);
+ memset(&sp, 0, len);
+ sp.sa_family = AF_PPPOX;
+ sp.sa_protocol = PX_PROTO_OL2TP;
+ sp.pppol2tp.fd = tunnel->fd;
+ sp.pppol2tp.pid = pls->owner;
+ sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+ sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+ sp.pppol2tp.s_session = session->session_id;
+ sp.pppol2tp.d_session = session->peer_session_id;
+ sp.pppol2tp.addr.sin6_family = AF_INET6;
+ sp.pppol2tp.addr.sin6_port = inet->inet_dport;
+ memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr,
+ sizeof(tunnel->sock->sk_v6_daddr));
+ memcpy(uaddr, &sp, len);
+ } else if ((tunnel->version == 3) &&
+ (tunnel->sock->sk_family == AF_INET6)) {
+ struct sockaddr_pppol2tpv3in6 sp;
+
+ len = sizeof(sp);
+ memset(&sp, 0, len);
+ sp.sa_family = AF_PPPOX;
+ sp.sa_protocol = PX_PROTO_OL2TP;
+ sp.pppol2tp.fd = tunnel->fd;
+ sp.pppol2tp.pid = pls->owner;
+ sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+ sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+ sp.pppol2tp.s_session = session->session_id;
+ sp.pppol2tp.d_session = session->peer_session_id;
+ sp.pppol2tp.addr.sin6_family = AF_INET6;
+ sp.pppol2tp.addr.sin6_port = inet->inet_dport;
+ memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr,
+ sizeof(tunnel->sock->sk_v6_daddr));
+ memcpy(uaddr, &sp, len);
+#endif
+ } else if (tunnel->version == 3) {
+ struct sockaddr_pppol2tpv3 sp;
+ len = sizeof(sp);
+ memset(&sp, 0, len);
+ sp.sa_family = AF_PPPOX;
+ sp.sa_protocol = PX_PROTO_OL2TP;
+ sp.pppol2tp.fd = tunnel->fd;
+ sp.pppol2tp.pid = pls->owner;
+ sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+ sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+ sp.pppol2tp.s_session = session->session_id;
+ sp.pppol2tp.d_session = session->peer_session_id;
+ sp.pppol2tp.addr.sin_family = AF_INET;
+ sp.pppol2tp.addr.sin_port = inet->inet_dport;
+ sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr;
+ memcpy(uaddr, &sp, len);
+ }
+
+ *usockaddr_len = len;
+
+ sock_put(pls->tunnel_sock);
+end_put_sess:
+ sock_put(sk);
+ error = 0;
+
+end:
+ return error;
+}
+
+/****************************************************************************
+ * ioctl() handlers.
+ *
+ * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP
+ * sockets. However, in order to control kernel tunnel features, we allow
+ * userspace to create a special "tunnel" PPPoX socket which is used for
+ * control only. Tunnel PPPoX sockets have session_id == 0 and simply allow
+ * the user application to issue L2TP setsockopt(), getsockopt() and ioctl()
+ * calls.
+ ****************************************************************************/
+
+static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
+ struct l2tp_stats *stats)
+{
+ dest->tx_packets = atomic_long_read(&stats->tx_packets);
+ dest->tx_bytes = atomic_long_read(&stats->tx_bytes);
+ dest->tx_errors = atomic_long_read(&stats->tx_errors);
+ dest->rx_packets = atomic_long_read(&stats->rx_packets);
+ dest->rx_bytes = atomic_long_read(&stats->rx_bytes);
+ dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards);
+ dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets);
+ dest->rx_errors = atomic_long_read(&stats->rx_errors);
+}
+
+/* Session ioctl helper.
+ */
+static int pppol2tp_session_ioctl(struct l2tp_session *session,
+ unsigned int cmd, unsigned long arg)
+{
+ struct ifreq ifr;
+ int err = 0;
+ struct sock *sk;
+ int val = (int) arg;
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ struct pppol2tp_ioc_stats stats;
+
+ l2tp_dbg(session, PPPOL2TP_MSG_CONTROL,
+ "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
+ session->name, cmd, arg);
+
+ sk = ps->sock;
+ sock_hold(sk);
+
+ switch (cmd) {
+ case SIOCGIFMTU:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ err = -EFAULT;
+ if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
+ break;
+ ifr.ifr_mtu = session->mtu;
+ if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
+ break;
+
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mtu=%d\n",
+ session->name, session->mtu);
+ err = 0;
+ break;
+
+ case SIOCSIFMTU:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ err = -EFAULT;
+ if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
+ break;
+
+ session->mtu = ifr.ifr_mtu;
+
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mtu=%d\n",
+ session->name, session->mtu);
+ err = 0;
+ break;
+
+ case PPPIOCGMRU:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ err = -EFAULT;
+ if (put_user(session->mru, (int __user *) arg))
+ break;
+
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get mru=%d\n",
+ session->name, session->mru);
+ err = 0;
+ break;
+
+ case PPPIOCSMRU:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ err = -EFAULT;
+ if (get_user(val, (int __user *) arg))
+ break;
+
+ session->mru = val;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set mru=%d\n",
+ session->name, session->mru);
+ err = 0;
+ break;
+
+ case PPPIOCGFLAGS:
+ err = -EFAULT;
+ if (put_user(ps->flags, (int __user *) arg))
+ break;
+
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get flags=%d\n",
+ session->name, ps->flags);
+ err = 0;
+ break;
+
+ case PPPIOCSFLAGS:
+ err = -EFAULT;
+ if (get_user(val, (int __user *) arg))
+ break;
+ ps->flags = val;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set flags=%d\n",
+ session->name, ps->flags);
+ err = 0;
+ break;
+
+ case PPPIOCGL2TPSTATS:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ memset(&stats, 0, sizeof(stats));
+ stats.tunnel_id = tunnel->tunnel_id;
+ stats.session_id = session->session_id;
+ pppol2tp_copy_stats(&stats, &session->stats);
+ if (copy_to_user((void __user *) arg, &stats,
+ sizeof(stats)))
+ break;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ session->name);
+ err = 0;
+ break;
+
+ default:
+ err = -ENOSYS;
+ break;
+ }
+
+ sock_put(sk);
+
+ return err;
+}
+
+/* Tunnel ioctl helper.
+ *
+ * Note the special handling for PPPIOCGL2TPSTATS below. If the ioctl data
+ * specifies a session_id, the session ioctl handler is called. This allows an
+ * application to retrieve session stats via a tunnel socket.
+ */
+static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
+ unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ struct sock *sk;
+ struct pppol2tp_ioc_stats stats;
+
+ l2tp_dbg(tunnel, PPPOL2TP_MSG_CONTROL,
+ "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
+ tunnel->name, cmd, arg);
+
+ sk = tunnel->sock;
+ sock_hold(sk);
+
+ switch (cmd) {
+ case PPPIOCGL2TPSTATS:
+ err = -ENXIO;
+ if (!(sk->sk_state & PPPOX_CONNECTED))
+ break;
+
+ if (copy_from_user(&stats, (void __user *) arg,
+ sizeof(stats))) {
+ err = -EFAULT;
+ break;
+ }
+ if (stats.session_id != 0) {
+ /* resend to session ioctl handler */
+ struct l2tp_session *session =
+ l2tp_session_find(sock_net(sk), tunnel, stats.session_id);
+ if (session != NULL)
+ err = pppol2tp_session_ioctl(session, cmd, arg);
+ else
+ err = -EBADR;
+ break;
+ }
+#ifdef CONFIG_XFRM
+ stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0;
+#endif
+ pppol2tp_copy_stats(&stats, &tunnel->stats);
+ if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) {
+ err = -EFAULT;
+ break;
+ }
+ l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get L2TP stats\n",
+ tunnel->name);
+ err = 0;
+ break;
+
+ default:
+ err = -ENOSYS;
+ break;
+ }
+
+ sock_put(sk);
+
+ return err;
+}
+
+/* Main ioctl() handler.
+ * Dispatch to tunnel or session helpers depending on the socket.
+ */
+static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ int err;
+
+ if (!sk)
+ return 0;
+
+ err = -EBADF;
+ if (sock_flag(sk, SOCK_DEAD) != 0)
+ goto end;
+
+ err = -ENOTCONN;
+ if ((sk->sk_user_data == NULL) ||
+ (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND))))
+ goto end;
+
+ /* Get session context from the socket */
+ err = -EBADF;
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto end;
+
+ /* Special case: if session's session_id is zero, treat ioctl as a
+ * tunnel ioctl
+ */
+ ps = l2tp_session_priv(session);
+ if ((session->session_id == 0) &&
+ (session->peer_session_id == 0)) {
+ err = -EBADF;
+ tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+ if (tunnel == NULL)
+ goto end_put_sess;
+
+ err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg);
+ sock_put(ps->tunnel_sock);
+ goto end_put_sess;
+ }
+
+ err = pppol2tp_session_ioctl(session, cmd, arg);
+
+end_put_sess:
+ sock_put(sk);
+end:
+ return err;
+}
+
+/*****************************************************************************
+ * setsockopt() / getsockopt() support.
+ *
+ * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP
+ * sockets. In order to control kernel tunnel features, we allow userspace to
+ * create a special "tunnel" PPPoX socket which is used for control only.
+ * Tunnel PPPoX sockets have session_id == 0 and simply allow the user
+ * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls.
+ *****************************************************************************/
+
+/* Tunnel setsockopt() helper.
+ */
+static int pppol2tp_tunnel_setsockopt(struct sock *sk,
+ struct l2tp_tunnel *tunnel,
+ int optname, int val)
+{
+ int err = 0;
+
+ switch (optname) {
+ case PPPOL2TP_SO_DEBUG:
+ tunnel->debug = val;
+ l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ tunnel->name, tunnel->debug);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ return err;
+}
+
+/* Session setsockopt helper.
+ */
+static int pppol2tp_session_setsockopt(struct sock *sk,
+ struct l2tp_session *session,
+ int optname, int val)
+{
+ int err = 0;
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+ switch (optname) {
+ case PPPOL2TP_SO_RECVSEQ:
+ if ((val != 0) && (val != 1)) {
+ err = -EINVAL;
+ break;
+ }
+ session->recv_seq = val ? -1 : 0;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: set recv_seq=%d\n",
+ session->name, session->recv_seq);
+ break;
+
+ case PPPOL2TP_SO_SENDSEQ:
+ if ((val != 0) && (val != 1)) {
+ err = -EINVAL;
+ break;
+ }
+ session->send_seq = val ? -1 : 0;
+ {
+ struct sock *ssk = ps->sock;
+ struct pppox_sock *po = pppox_sk(ssk);
+ po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ :
+ PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
+ }
+ l2tp_session_set_header_len(session, session->tunnel->version);
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: set send_seq=%d\n",
+ session->name, session->send_seq);
+ break;
+
+ case PPPOL2TP_SO_LNSMODE:
+ if ((val != 0) && (val != 1)) {
+ err = -EINVAL;
+ break;
+ }
+ session->lns_mode = val ? -1 : 0;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: set lns_mode=%d\n",
+ session->name, session->lns_mode);
+ break;
+
+ case PPPOL2TP_SO_DEBUG:
+ session->debug = val;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: set debug=%x\n",
+ session->name, session->debug);
+ break;
+
+ case PPPOL2TP_SO_REORDERTO:
+ session->reorder_timeout = msecs_to_jiffies(val);
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: set reorder_timeout=%d\n",
+ session->name, session->reorder_timeout);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ return err;
+}
+
+/* Main setsockopt() entry point.
+ * Does API checks, then calls either the tunnel or session setsockopt
+ * handler, according to whether the PPPoL2TP socket is a for a regular
+ * session or the special tunnel type.
+ */
+static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ int val;
+ int err;
+
+ if (level != SOL_PPPOL2TP)
+ return -EINVAL;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ err = -ENOTCONN;
+ if (sk->sk_user_data == NULL)
+ goto end;
+
+ /* Get session context from the socket */
+ err = -EBADF;
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto end;
+
+ /* Special case: if session_id == 0x0000, treat as operation on tunnel
+ */
+ ps = l2tp_session_priv(session);
+ if ((session->session_id == 0) &&
+ (session->peer_session_id == 0)) {
+ err = -EBADF;
+ tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+ if (tunnel == NULL)
+ goto end_put_sess;
+
+ err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val);
+ sock_put(ps->tunnel_sock);
+ } else
+ err = pppol2tp_session_setsockopt(sk, session, optname, val);
+
+ err = 0;
+
+end_put_sess:
+ sock_put(sk);
+end:
+ return err;
+}
+
+/* Tunnel getsockopt helper. Called with sock locked.
+ */
+static int pppol2tp_tunnel_getsockopt(struct sock *sk,
+ struct l2tp_tunnel *tunnel,
+ int optname, int *val)
+{
+ int err = 0;
+
+ switch (optname) {
+ case PPPOL2TP_SO_DEBUG:
+ *val = tunnel->debug;
+ l2tp_info(tunnel, PPPOL2TP_MSG_CONTROL, "%s: get debug=%x\n",
+ tunnel->name, tunnel->debug);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ return err;
+}
+
+/* Session getsockopt helper. Called with sock locked.
+ */
+static int pppol2tp_session_getsockopt(struct sock *sk,
+ struct l2tp_session *session,
+ int optname, int *val)
+{
+ int err = 0;
+
+ switch (optname) {
+ case PPPOL2TP_SO_RECVSEQ:
+ *val = session->recv_seq;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: get recv_seq=%d\n", session->name, *val);
+ break;
+
+ case PPPOL2TP_SO_SENDSEQ:
+ *val = session->send_seq;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: get send_seq=%d\n", session->name, *val);
+ break;
+
+ case PPPOL2TP_SO_LNSMODE:
+ *val = session->lns_mode;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: get lns_mode=%d\n", session->name, *val);
+ break;
+
+ case PPPOL2TP_SO_DEBUG:
+ *val = session->debug;
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL, "%s: get debug=%d\n",
+ session->name, *val);
+ break;
+
+ case PPPOL2TP_SO_REORDERTO:
+ *val = (int) jiffies_to_msecs(session->reorder_timeout);
+ l2tp_info(session, PPPOL2TP_MSG_CONTROL,
+ "%s: get reorder_timeout=%d\n", session->name, *val);
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ }
+
+ return err;
+}
+
+/* Main getsockopt() entry point.
+ * Does API checks, then calls either the tunnel or session getsockopt
+ * handler, according to whether the PPPoX socket is a for a regular session
+ * or the special tunnel type.
+ */
+static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ int val, len;
+ int err;
+ struct pppol2tp_session *ps;
+
+ if (level != SOL_PPPOL2TP)
+ return -EINVAL;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ err = -ENOTCONN;
+ if (sk->sk_user_data == NULL)
+ goto end;
+
+ /* Get the session context */
+ err = -EBADF;
+ session = pppol2tp_sock_to_session(sk);
+ if (session == NULL)
+ goto end;
+
+ /* Special case: if session_id == 0x0000, treat as operation on tunnel */
+ ps = l2tp_session_priv(session);
+ if ((session->session_id == 0) &&
+ (session->peer_session_id == 0)) {
+ err = -EBADF;
+ tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+ if (tunnel == NULL)
+ goto end_put_sess;
+
+ err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
+ sock_put(ps->tunnel_sock);
+ } else
+ err = pppol2tp_session_getsockopt(sk, session, optname, &val);
+
+ err = -EFAULT;
+ if (put_user(len, optlen))
+ goto end_put_sess;
+
+ if (copy_to_user((void __user *) optval, &val, len))
+ goto end_put_sess;
+
+ err = 0;
+
+end_put_sess:
+ sock_put(sk);
+end:
+ return err;
+}
+
+/*****************************************************************************
+ * /proc filesystem for debug
+ * Since the original pppol2tp driver provided /proc/net/pppol2tp for
+ * L2TPv2, we dump only L2TPv2 tunnels and sessions here.
+ *****************************************************************************/
+
+static unsigned int pppol2tp_net_id;
+
+#ifdef CONFIG_PROC_FS
+
+struct pppol2tp_seq_data {
+ struct seq_net_private p;
+ int tunnel_idx; /* current tunnel */
+ int session_idx; /* index of session within current tunnel */
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_session *session; /* NULL means get next tunnel */
+};
+
+static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
+{
+ for (;;) {
+ pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx);
+ pd->tunnel_idx++;
+
+ if (pd->tunnel == NULL)
+ break;
+
+ /* Ignore L2TPv3 tunnels */
+ if (pd->tunnel->version < 3)
+ break;
+ }
+}
+
+static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
+{
+ pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+ pd->session_idx++;
+
+ if (pd->session == NULL) {
+ pd->session_idx = 0;
+ pppol2tp_next_tunnel(net, pd);
+ }
+}
+
+static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs)
+{
+ struct pppol2tp_seq_data *pd = SEQ_START_TOKEN;
+ loff_t pos = *offs;
+ struct net *net;
+
+ if (!pos)
+ goto out;
+
+ BUG_ON(m->private == NULL);
+ pd = m->private;
+ net = seq_file_net(m);
+
+ if (pd->tunnel == NULL)
+ pppol2tp_next_tunnel(net, pd);
+ else
+ pppol2tp_next_session(net, pd);
+
+ /* NULL tunnel and session indicates end of list */
+ if ((pd->tunnel == NULL) && (pd->session == NULL))
+ pd = NULL;
+
+out:
+ return pd;
+}
+
+static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return NULL;
+}
+
+static void pppol2tp_seq_stop(struct seq_file *p, void *v)
+{
+ /* nothing to do */
+}
+
+static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
+{
+ struct l2tp_tunnel *tunnel = v;
+
+ seq_printf(m, "\nTUNNEL '%s', %c %d\n",
+ tunnel->name,
+ (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
+ atomic_read(&tunnel->ref_count) - 1);
+ seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n",
+ tunnel->debug,
+ atomic_long_read(&tunnel->stats.tx_packets),
+ atomic_long_read(&tunnel->stats.tx_bytes),
+ atomic_long_read(&tunnel->stats.tx_errors),
+ atomic_long_read(&tunnel->stats.rx_packets),
+ atomic_long_read(&tunnel->stats.rx_bytes),
+ atomic_long_read(&tunnel->stats.rx_errors));
+}
+
+static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
+{
+ struct l2tp_session *session = v;
+ struct l2tp_tunnel *tunnel = session->tunnel;
+ struct pppol2tp_session *ps = l2tp_session_priv(session);
+ struct pppox_sock *po = pppox_sk(ps->sock);
+ u32 ip = 0;
+ u16 port = 0;
+
+ if (tunnel->sock) {
+ struct inet_sock *inet = inet_sk(tunnel->sock);
+ ip = ntohl(inet->inet_saddr);
+ port = ntohs(inet->inet_sport);
+ }
+
+ seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> "
+ "%04X/%04X %d %c\n",
+ session->name, ip, port,
+ tunnel->tunnel_id,
+ session->session_id,
+ tunnel->peer_tunnel_id,
+ session->peer_session_id,
+ ps->sock->sk_state,
+ (session == ps->sock->sk_user_data) ?
+ 'Y' : 'N');
+ seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n",
+ session->mtu, session->mru,
+ session->recv_seq ? 'R' : '-',
+ session->send_seq ? 'S' : '-',
+ session->lns_mode ? "LNS" : "LAC",
+ session->debug,
+ jiffies_to_msecs(session->reorder_timeout));
+ seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
+ session->nr, session->ns,
+ atomic_long_read(&session->stats.tx_packets),
+ atomic_long_read(&session->stats.tx_bytes),
+ atomic_long_read(&session->stats.tx_errors),
+ atomic_long_read(&session->stats.rx_packets),
+ atomic_long_read(&session->stats.rx_bytes),
+ atomic_long_read(&session->stats.rx_errors));
+
+ if (po)
+ seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
+}
+
+static int pppol2tp_seq_show(struct seq_file *m, void *v)
+{
+ struct pppol2tp_seq_data *pd = v;
+
+ /* display header on line 1 */
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n");
+ seq_puts(m, "TUNNEL name, user-data-ok session-count\n");
+ seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+ seq_puts(m, " SESSION name, addr/port src-tid/sid "
+ "dest-tid/sid state user-data-ok\n");
+ seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n");
+ seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+ goto out;
+ }
+
+ /* Show the tunnel or session context.
+ */
+ if (pd->session == NULL)
+ pppol2tp_seq_tunnel_show(m, pd->tunnel);
+ else
+ pppol2tp_seq_session_show(m, pd->session);
+
+out:
+ return 0;
+}
+
+static const struct seq_operations pppol2tp_seq_ops = {
+ .start = pppol2tp_seq_start,
+ .next = pppol2tp_seq_next,
+ .stop = pppol2tp_seq_stop,
+ .show = pppol2tp_seq_show,
+};
+
+/* Called when our /proc file is opened. We allocate data for use when
+ * iterating our tunnel / session contexts and store it in the private
+ * data of the seq_file.
+ */
+static int pppol2tp_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &pppol2tp_seq_ops,
+ sizeof(struct pppol2tp_seq_data));
+}
+
+static const struct file_operations pppol2tp_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = pppol2tp_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*****************************************************************************
+ * Network namespace
+ *****************************************************************************/
+
+static __net_init int pppol2tp_init_net(struct net *net)
+{
+ struct proc_dir_entry *pde;
+ int err = 0;
+
+ pde = proc_create("pppol2tp", S_IRUGO, net->proc_net,
+ &pppol2tp_proc_fops);
+ if (!pde) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+out:
+ return err;
+}
+
+static __net_exit void pppol2tp_exit_net(struct net *net)
+{
+ remove_proc_entry("pppol2tp", net->proc_net);
+}
+
+static struct pernet_operations pppol2tp_net_ops = {
+ .init = pppol2tp_init_net,
+ .exit = pppol2tp_exit_net,
+ .id = &pppol2tp_net_id,
+};
+
+/*****************************************************************************
+ * Init and cleanup
+ *****************************************************************************/
+
+static const struct proto_ops pppol2tp_ops = {
+ .family = AF_PPPOX,
+ .owner = THIS_MODULE,
+ .release = pppol2tp_release,
+ .bind = sock_no_bind,
+ .connect = pppol2tp_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = pppol2tp_getname,
+ .poll = datagram_poll,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = pppol2tp_setsockopt,
+ .getsockopt = pppol2tp_getsockopt,
+ .sendmsg = pppol2tp_sendmsg,
+ .recvmsg = pppol2tp_recvmsg,
+ .mmap = sock_no_mmap,
+ .ioctl = pppox_ioctl,
+};
+
+static const struct pppox_proto pppol2tp_proto = {
+ .create = pppol2tp_create,
+ .ioctl = pppol2tp_ioctl,
+ .owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_L2TP_V3
+
+static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = {
+ .session_create = pppol2tp_session_create,
+ .session_delete = l2tp_session_delete,
+};
+
+#endif /* CONFIG_L2TP_V3 */
+
+static int __init pppol2tp_init(void)
+{
+ int err;
+
+ err = register_pernet_device(&pppol2tp_net_ops);
+ if (err)
+ goto out;
+
+ err = proto_register(&pppol2tp_sk_proto, 0);
+ if (err)
+ goto out_unregister_pppol2tp_pernet;
+
+ err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto);
+ if (err)
+ goto out_unregister_pppol2tp_proto;
+
+#ifdef CONFIG_L2TP_V3
+ err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops);
+ if (err)
+ goto out_unregister_pppox;
+#endif
+
+ pr_info("PPPoL2TP kernel driver, %s\n", PPPOL2TP_DRV_VERSION);
+
+out:
+ return err;
+
+#ifdef CONFIG_L2TP_V3
+out_unregister_pppox:
+ unregister_pppox_proto(PX_PROTO_OL2TP);
+#endif
+out_unregister_pppol2tp_proto:
+ proto_unregister(&pppol2tp_sk_proto);
+out_unregister_pppol2tp_pernet:
+ unregister_pernet_device(&pppol2tp_net_ops);
+ goto out;
+}
+
+static void __exit pppol2tp_exit(void)
+{
+#ifdef CONFIG_L2TP_V3
+ l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP);
+#endif
+ unregister_pppox_proto(PX_PROTO_OL2TP);
+ proto_unregister(&pppol2tp_sk_proto);
+ unregister_pernet_device(&pppol2tp_net_ops);
+}
+
+module_init(pppol2tp_init);
+module_exit(pppol2tp_exit);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("PPP over L2TP over UDP");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(PPPOL2TP_DRV_VERSION);
+MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP));
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 000000000..6481839b7
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,21 @@
+#
+# LAPB Data Link Drive
+#
+
+config LAPB
+ tristate "LAPB Data Link Driver"
+ ---help---
+ Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
+ the lower) part of the X.25 protocol. It offers a reliable
+ connection service to exchange data frames with one other host, and
+ it is used to transport higher level protocols (mostly X.25 Packet
+ Layer, the higher part of X.25, but others are possible as well).
+ Usually, LAPB is used with specialized X.21 network cards, but Linux
+ currently supports LAPB only over Ethernet connections. If you want
+ to use LAPB connections over Ethernet, say Y here and to "LAPB over
+ Ethernet driver" below. Read
+ <file:Documentation/networking/lapb-module.txt> for technical
+ details.
+
+ To compile this driver as a module, choose M here: the
+ module will be called lapb. If unsure, say N.
diff --git a/net/lapb/Makefile b/net/lapb/Makefile
new file mode 100644
index 000000000..fff797dfc
--- /dev/null
+++ b/net/lapb/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux LAPB layer.
+#
+
+obj-$(CONFIG_LAPB) += lapb.o
+
+lapb-y := lapb_in.o lapb_out.o lapb_subr.o lapb_timer.o lapb_iface.o
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
new file mode 100644
index 000000000..fc60d9d73
--- /dev/null
+++ b/net/lapb/lapb_iface.c
@@ -0,0 +1,441 @@
+/*
+ * LAPB release 002
+ *
+ * This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * LAPB 001 Jonathan Naylor Started Coding
+ * LAPB 002 Jonathan Naylor New timer architecture.
+ * 2000-10-29 Henner Eisen lapb_data_indication() return status.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <net/lapb.h>
+
+static LIST_HEAD(lapb_list);
+static DEFINE_RWLOCK(lapb_list_lock);
+
+/*
+ * Free an allocated lapb control block.
+ */
+static void lapb_free_cb(struct lapb_cb *lapb)
+{
+ kfree(lapb);
+}
+
+static __inline__ void lapb_hold(struct lapb_cb *lapb)
+{
+ atomic_inc(&lapb->refcnt);
+}
+
+static __inline__ void lapb_put(struct lapb_cb *lapb)
+{
+ if (atomic_dec_and_test(&lapb->refcnt))
+ lapb_free_cb(lapb);
+}
+
+/*
+ * Socket removal during an interrupt is now safe.
+ */
+static void __lapb_remove_cb(struct lapb_cb *lapb)
+{
+ if (lapb->node.next) {
+ list_del(&lapb->node);
+ lapb_put(lapb);
+ }
+}
+EXPORT_SYMBOL(lapb_register);
+
+/*
+ * Add a socket to the bound sockets list.
+ */
+static void __lapb_insert_cb(struct lapb_cb *lapb)
+{
+ list_add(&lapb->node, &lapb_list);
+ lapb_hold(lapb);
+}
+
+static struct lapb_cb *__lapb_devtostruct(struct net_device *dev)
+{
+ struct list_head *entry;
+ struct lapb_cb *lapb, *use = NULL;
+
+ list_for_each(entry, &lapb_list) {
+ lapb = list_entry(entry, struct lapb_cb, node);
+ if (lapb->dev == dev) {
+ use = lapb;
+ break;
+ }
+ }
+
+ if (use)
+ lapb_hold(use);
+
+ return use;
+}
+
+static struct lapb_cb *lapb_devtostruct(struct net_device *dev)
+{
+ struct lapb_cb *rc;
+
+ read_lock_bh(&lapb_list_lock);
+ rc = __lapb_devtostruct(dev);
+ read_unlock_bh(&lapb_list_lock);
+
+ return rc;
+}
+/*
+ * Create an empty LAPB control block.
+ */
+static struct lapb_cb *lapb_create_cb(void)
+{
+ struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);
+
+
+ if (!lapb)
+ goto out;
+
+ skb_queue_head_init(&lapb->write_queue);
+ skb_queue_head_init(&lapb->ack_queue);
+
+ init_timer(&lapb->t1timer);
+ init_timer(&lapb->t2timer);
+
+ lapb->t1 = LAPB_DEFAULT_T1;
+ lapb->t2 = LAPB_DEFAULT_T2;
+ lapb->n2 = LAPB_DEFAULT_N2;
+ lapb->mode = LAPB_DEFAULT_MODE;
+ lapb->window = LAPB_DEFAULT_WINDOW;
+ lapb->state = LAPB_STATE_0;
+ atomic_set(&lapb->refcnt, 1);
+out:
+ return lapb;
+}
+
+int lapb_register(struct net_device *dev,
+ const struct lapb_register_struct *callbacks)
+{
+ struct lapb_cb *lapb;
+ int rc = LAPB_BADTOKEN;
+
+ write_lock_bh(&lapb_list_lock);
+
+ lapb = __lapb_devtostruct(dev);
+ if (lapb) {
+ lapb_put(lapb);
+ goto out;
+ }
+
+ lapb = lapb_create_cb();
+ rc = LAPB_NOMEM;
+ if (!lapb)
+ goto out;
+
+ lapb->dev = dev;
+ lapb->callbacks = callbacks;
+
+ __lapb_insert_cb(lapb);
+
+ lapb_start_t1timer(lapb);
+
+ rc = LAPB_OK;
+out:
+ write_unlock_bh(&lapb_list_lock);
+ return rc;
+}
+
+int lapb_unregister(struct net_device *dev)
+{
+ struct lapb_cb *lapb;
+ int rc = LAPB_BADTOKEN;
+
+ write_lock_bh(&lapb_list_lock);
+ lapb = __lapb_devtostruct(dev);
+ if (!lapb)
+ goto out;
+
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+
+ lapb_clear_queues(lapb);
+
+ __lapb_remove_cb(lapb);
+
+ lapb_put(lapb);
+ rc = LAPB_OK;
+out:
+ write_unlock_bh(&lapb_list_lock);
+ return rc;
+}
+EXPORT_SYMBOL(lapb_unregister);
+
+int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
+{
+ int rc = LAPB_BADTOKEN;
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+
+ if (!lapb)
+ goto out;
+
+ parms->t1 = lapb->t1 / HZ;
+ parms->t2 = lapb->t2 / HZ;
+ parms->n2 = lapb->n2;
+ parms->n2count = lapb->n2count;
+ parms->state = lapb->state;
+ parms->window = lapb->window;
+ parms->mode = lapb->mode;
+
+ if (!timer_pending(&lapb->t1timer))
+ parms->t1timer = 0;
+ else
+ parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ;
+
+ if (!timer_pending(&lapb->t2timer))
+ parms->t2timer = 0;
+ else
+ parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ;
+
+ lapb_put(lapb);
+ rc = LAPB_OK;
+out:
+ return rc;
+}
+EXPORT_SYMBOL(lapb_getparms);
+
+int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
+{
+ int rc = LAPB_BADTOKEN;
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+
+ if (!lapb)
+ goto out;
+
+ rc = LAPB_INVALUE;
+ if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1)
+ goto out_put;
+
+ if (lapb->state == LAPB_STATE_0) {
+ if (parms->mode & LAPB_EXTENDED) {
+ if (parms->window < 1 || parms->window > 127)
+ goto out_put;
+ } else {
+ if (parms->window < 1 || parms->window > 7)
+ goto out_put;
+ }
+ lapb->mode = parms->mode;
+ lapb->window = parms->window;
+ }
+
+ lapb->t1 = parms->t1 * HZ;
+ lapb->t2 = parms->t2 * HZ;
+ lapb->n2 = parms->n2;
+
+ rc = LAPB_OK;
+out_put:
+ lapb_put(lapb);
+out:
+ return rc;
+}
+EXPORT_SYMBOL(lapb_setparms);
+
+int lapb_connect_request(struct net_device *dev)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (!lapb)
+ goto out;
+
+ rc = LAPB_OK;
+ if (lapb->state == LAPB_STATE_1)
+ goto out_put;
+
+ rc = LAPB_CONNECTED;
+ if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4)
+ goto out_put;
+
+ lapb_establish_data_link(lapb);
+
+ lapb_dbg(0, "(%p) S0 -> S1\n", lapb->dev);
+ lapb->state = LAPB_STATE_1;
+
+ rc = LAPB_OK;
+out_put:
+ lapb_put(lapb);
+out:
+ return rc;
+}
+EXPORT_SYMBOL(lapb_connect_request);
+
+int lapb_disconnect_request(struct net_device *dev)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (!lapb)
+ goto out;
+
+ switch (lapb->state) {
+ case LAPB_STATE_0:
+ rc = LAPB_NOTCONNECTED;
+ goto out_put;
+
+ case LAPB_STATE_1:
+ lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev);
+ lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+ lapb->state = LAPB_STATE_0;
+ lapb_start_t1timer(lapb);
+ rc = LAPB_NOTCONNECTED;
+ goto out_put;
+
+ case LAPB_STATE_2:
+ rc = LAPB_OK;
+ goto out_put;
+ }
+
+ lapb_clear_queues(lapb);
+ lapb->n2count = 0;
+ lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_2;
+
+ lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev);
+ lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev);
+
+ rc = LAPB_OK;
+out_put:
+ lapb_put(lapb);
+out:
+ return rc;
+}
+EXPORT_SYMBOL(lapb_disconnect_request);
+
+int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (!lapb)
+ goto out;
+
+ rc = LAPB_NOTCONNECTED;
+ if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4)
+ goto out_put;
+
+ skb_queue_tail(&lapb->write_queue, skb);
+ lapb_kick(lapb);
+ rc = LAPB_OK;
+out_put:
+ lapb_put(lapb);
+out:
+ return rc;
+}
+EXPORT_SYMBOL(lapb_data_request);
+
+int lapb_data_received(struct net_device *dev, struct sk_buff *skb)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (lapb) {
+ lapb_data_input(lapb, skb);
+ lapb_put(lapb);
+ rc = LAPB_OK;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL(lapb_data_received);
+
+void lapb_connect_confirmation(struct lapb_cb *lapb, int reason)
+{
+ if (lapb->callbacks->connect_confirmation)
+ lapb->callbacks->connect_confirmation(lapb->dev, reason);
+}
+
+void lapb_connect_indication(struct lapb_cb *lapb, int reason)
+{
+ if (lapb->callbacks->connect_indication)
+ lapb->callbacks->connect_indication(lapb->dev, reason);
+}
+
+void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason)
+{
+ if (lapb->callbacks->disconnect_confirmation)
+ lapb->callbacks->disconnect_confirmation(lapb->dev, reason);
+}
+
+void lapb_disconnect_indication(struct lapb_cb *lapb, int reason)
+{
+ if (lapb->callbacks->disconnect_indication)
+ lapb->callbacks->disconnect_indication(lapb->dev, reason);
+}
+
+int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+ if (lapb->callbacks->data_indication)
+ return lapb->callbacks->data_indication(lapb->dev, skb);
+
+ kfree_skb(skb);
+ return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */
+}
+
+int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+ int used = 0;
+
+ if (lapb->callbacks->data_transmit) {
+ lapb->callbacks->data_transmit(lapb->dev, skb);
+ used = 1;
+ }
+
+ return used;
+}
+
+static int __init lapb_init(void)
+{
+ return 0;
+}
+
+static void __exit lapb_exit(void)
+{
+ WARN_ON(!list_empty(&lapb_list));
+}
+
+MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol");
+MODULE_LICENSE("GPL");
+
+module_init(lapb_init);
+module_exit(lapb_exit);
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
new file mode 100644
index 000000000..5dba89913
--- /dev/null
+++ b/net/lapb/lapb_in.c
@@ -0,0 +1,562 @@
+/*
+ * LAPB release 002
+ *
+ * This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * LAPB 001 Jonathan Naulor Started Coding
+ * LAPB 002 Jonathan Naylor New timer architecture.
+ * 2000-10-29 Henner Eisen lapb_data_indication() return status.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ * State machine for state 0, Disconnected State.
+ * The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state0_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ switch (frame->type) {
+ case LAPB_SABM:
+ lapb_dbg(1, "(%p) S0 RX SABM(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S0 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ } else {
+ lapb_dbg(1, "(%p) S0 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S0 -> S3\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_3;
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_connect_indication(lapb, LAPB_OK);
+ }
+ break;
+
+ case LAPB_SABME:
+ lapb_dbg(1, "(%p) S0 RX SABME(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S0 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S0 -> S3\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_3;
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_connect_indication(lapb, LAPB_OK);
+ } else {
+ lapb_dbg(1, "(%p) S0 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ }
+ break;
+
+ case LAPB_DISC:
+ lapb_dbg(1, "(%p) S0 RX DISC(%d)\n", lapb->dev, frame->pf);
+ lapb_dbg(1, "(%p) S0 TX UA(%d)\n", lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf, LAPB_RESPONSE);
+ break;
+
+ default:
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+/*
+ * State machine for state 1, Awaiting Connection State.
+ * The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state1_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ switch (frame->type) {
+ case LAPB_SABM:
+ lapb_dbg(1, "(%p) S1 RX SABM(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S1 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ } else {
+ lapb_dbg(1, "(%p) S1 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ }
+ break;
+
+ case LAPB_SABME:
+ lapb_dbg(1, "(%p) S1 RX SABME(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S1 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ } else {
+ lapb_dbg(1, "(%p) S1 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ }
+ break;
+
+ case LAPB_DISC:
+ lapb_dbg(1, "(%p) S1 RX DISC(%d)\n", lapb->dev, frame->pf);
+ lapb_dbg(1, "(%p) S1 TX DM(%d)\n", lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf, LAPB_RESPONSE);
+ break;
+
+ case LAPB_UA:
+ lapb_dbg(1, "(%p) S1 RX UA(%d)\n", lapb->dev, frame->pf);
+ if (frame->pf) {
+ lapb_dbg(0, "(%p) S1 -> S3\n", lapb->dev);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_3;
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_connect_confirmation(lapb, LAPB_OK);
+ }
+ break;
+
+ case LAPB_DM:
+ lapb_dbg(1, "(%p) S1 RX DM(%d)\n", lapb->dev, frame->pf);
+ if (frame->pf) {
+ lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb_disconnect_indication(lapb, LAPB_REFUSED);
+ }
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+/*
+ * State machine for state 2, Awaiting Release State.
+ * The handling of the timer(s) is in file lapb_timer.c
+ */
+static void lapb_state2_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ switch (frame->type) {
+ case LAPB_SABM:
+ case LAPB_SABME:
+ lapb_dbg(1, "(%p) S2 RX {SABM,SABME}(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(1, "(%p) S2 TX DM(%d)\n", lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf, LAPB_RESPONSE);
+ break;
+
+ case LAPB_DISC:
+ lapb_dbg(1, "(%p) S2 RX DISC(%d)\n", lapb->dev, frame->pf);
+ lapb_dbg(1, "(%p) S2 TX UA(%d)\n", lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf, LAPB_RESPONSE);
+ break;
+
+ case LAPB_UA:
+ lapb_dbg(1, "(%p) S2 RX UA(%d)\n", lapb->dev, frame->pf);
+ if (frame->pf) {
+ lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
+ lapb->state = LAPB_STATE_0;
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb_disconnect_confirmation(lapb, LAPB_OK);
+ }
+ break;
+
+ case LAPB_DM:
+ lapb_dbg(1, "(%p) S2 RX DM(%d)\n", lapb->dev, frame->pf);
+ if (frame->pf) {
+ lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
+ lapb->state = LAPB_STATE_0;
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb_disconnect_confirmation(lapb, LAPB_NOTCONNECTED);
+ }
+ break;
+
+ case LAPB_I:
+ case LAPB_REJ:
+ case LAPB_RNR:
+ case LAPB_RR:
+ lapb_dbg(1, "(%p) S2 RX {I,REJ,RNR,RR}(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(1, "(%p) S2 RX DM(%d)\n", lapb->dev, frame->pf);
+ if (frame->pf)
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file lapb_timer.c
+ */
+static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ int queued = 0;
+ int modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS :
+ LAPB_SMODULUS;
+
+ switch (frame->type) {
+ case LAPB_SABM:
+ lapb_dbg(1, "(%p) S3 RX SABM(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S3 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ } else {
+ lapb_dbg(1, "(%p) S3 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_requeue_frames(lapb);
+ }
+ break;
+
+ case LAPB_SABME:
+ lapb_dbg(1, "(%p) S3 RX SABME(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S3 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_requeue_frames(lapb);
+ } else {
+ lapb_dbg(1, "(%p) S3 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ }
+ break;
+
+ case LAPB_DISC:
+ lapb_dbg(1, "(%p) S3 RX DISC(%d)\n", lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
+ lapb_clear_queues(lapb);
+ lapb_send_control(lapb, LAPB_UA, frame->pf, LAPB_RESPONSE);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_disconnect_indication(lapb, LAPB_OK);
+ break;
+
+ case LAPB_DM:
+ lapb_dbg(1, "(%p) S3 RX DM(%d)\n", lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb_disconnect_indication(lapb, LAPB_NOTCONNECTED);
+ break;
+
+ case LAPB_RNR:
+ lapb_dbg(1, "(%p) S3 RX RNR(%d) R%d\n",
+ lapb->dev, frame->pf, frame->nr);
+ lapb->condition |= LAPB_PEER_RX_BUSY_CONDITION;
+ lapb_check_need_response(lapb, frame->cr, frame->pf);
+ if (lapb_validate_nr(lapb, frame->nr)) {
+ lapb_check_iframes_acked(lapb, frame->nr);
+ } else {
+ lapb->frmr_data = *frame;
+ lapb->frmr_type = LAPB_FRMR_Z;
+ lapb_transmit_frmr(lapb);
+ lapb_dbg(0, "(%p) S3 -> S4\n", lapb->dev);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_4;
+ lapb->n2count = 0;
+ }
+ break;
+
+ case LAPB_RR:
+ lapb_dbg(1, "(%p) S3 RX RR(%d) R%d\n",
+ lapb->dev, frame->pf, frame->nr);
+ lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION;
+ lapb_check_need_response(lapb, frame->cr, frame->pf);
+ if (lapb_validate_nr(lapb, frame->nr)) {
+ lapb_check_iframes_acked(lapb, frame->nr);
+ } else {
+ lapb->frmr_data = *frame;
+ lapb->frmr_type = LAPB_FRMR_Z;
+ lapb_transmit_frmr(lapb);
+ lapb_dbg(0, "(%p) S3 -> S4\n", lapb->dev);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_4;
+ lapb->n2count = 0;
+ }
+ break;
+
+ case LAPB_REJ:
+ lapb_dbg(1, "(%p) S3 RX REJ(%d) R%d\n",
+ lapb->dev, frame->pf, frame->nr);
+ lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION;
+ lapb_check_need_response(lapb, frame->cr, frame->pf);
+ if (lapb_validate_nr(lapb, frame->nr)) {
+ lapb_frames_acked(lapb, frame->nr);
+ lapb_stop_t1timer(lapb);
+ lapb->n2count = 0;
+ lapb_requeue_frames(lapb);
+ } else {
+ lapb->frmr_data = *frame;
+ lapb->frmr_type = LAPB_FRMR_Z;
+ lapb_transmit_frmr(lapb);
+ lapb_dbg(0, "(%p) S3 -> S4\n", lapb->dev);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_4;
+ lapb->n2count = 0;
+ }
+ break;
+
+ case LAPB_I:
+ lapb_dbg(1, "(%p) S3 RX I(%d) S%d R%d\n",
+ lapb->dev, frame->pf, frame->ns, frame->nr);
+ if (!lapb_validate_nr(lapb, frame->nr)) {
+ lapb->frmr_data = *frame;
+ lapb->frmr_type = LAPB_FRMR_Z;
+ lapb_transmit_frmr(lapb);
+ lapb_dbg(0, "(%p) S3 -> S4\n", lapb->dev);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_4;
+ lapb->n2count = 0;
+ break;
+ }
+ if (lapb->condition & LAPB_PEER_RX_BUSY_CONDITION)
+ lapb_frames_acked(lapb, frame->nr);
+ else
+ lapb_check_iframes_acked(lapb, frame->nr);
+
+ if (frame->ns == lapb->vr) {
+ int cn;
+ cn = lapb_data_indication(lapb, skb);
+ queued = 1;
+ /*
+ * If upper layer has dropped the frame, we
+ * basically ignore any further protocol
+ * processing. This will cause the peer
+ * to re-transmit the frame later like
+ * a frame lost on the wire.
+ */
+ if (cn == NET_RX_DROP) {
+ pr_debug("rx congestion\n");
+ break;
+ }
+ lapb->vr = (lapb->vr + 1) % modulus;
+ lapb->condition &= ~LAPB_REJECT_CONDITION;
+ if (frame->pf)
+ lapb_enquiry_response(lapb);
+ else {
+ if (!(lapb->condition &
+ LAPB_ACK_PENDING_CONDITION)) {
+ lapb->condition |= LAPB_ACK_PENDING_CONDITION;
+ lapb_start_t2timer(lapb);
+ }
+ }
+ } else {
+ if (lapb->condition & LAPB_REJECT_CONDITION) {
+ if (frame->pf)
+ lapb_enquiry_response(lapb);
+ } else {
+ lapb_dbg(1, "(%p) S3 TX REJ(%d) R%d\n",
+ lapb->dev, frame->pf, lapb->vr);
+ lapb->condition |= LAPB_REJECT_CONDITION;
+ lapb_send_control(lapb, LAPB_REJ, frame->pf,
+ LAPB_RESPONSE);
+ lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+ }
+ }
+ break;
+
+ case LAPB_FRMR:
+ lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n",
+ lapb->dev, frame->pf,
+ skb->data[0], skb->data[1], skb->data[2],
+ skb->data[3], skb->data[4]);
+ lapb_establish_data_link(lapb);
+ lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev);
+ lapb_requeue_frames(lapb);
+ lapb->state = LAPB_STATE_1;
+ break;
+
+ case LAPB_ILLEGAL:
+ lapb_dbg(1, "(%p) S3 RX ILLEGAL(%d)\n", lapb->dev, frame->pf);
+ lapb->frmr_data = *frame;
+ lapb->frmr_type = LAPB_FRMR_W;
+ lapb_transmit_frmr(lapb);
+ lapb_dbg(0, "(%p) S3 -> S4\n", lapb->dev);
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_4;
+ lapb->n2count = 0;
+ break;
+ }
+
+ if (!queued)
+ kfree_skb(skb);
+}
+
+/*
+ * State machine for state 4, Frame Reject State.
+ * The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state4_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ switch (frame->type) {
+ case LAPB_SABM:
+ lapb_dbg(1, "(%p) S4 RX SABM(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S4 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ } else {
+ lapb_dbg(1, "(%p) S4 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S4 -> S3\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_3;
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_connect_indication(lapb, LAPB_OK);
+ }
+ break;
+
+ case LAPB_SABME:
+ lapb_dbg(1, "(%p) S4 RX SABME(%d)\n", lapb->dev, frame->pf);
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S4 TX UA(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_dbg(0, "(%p) S4 -> S3\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_UA, frame->pf,
+ LAPB_RESPONSE);
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ lapb->state = LAPB_STATE_3;
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+ lapb->vs = 0;
+ lapb->vr = 0;
+ lapb->va = 0;
+ lapb_connect_indication(lapb, LAPB_OK);
+ } else {
+ lapb_dbg(1, "(%p) S4 TX DM(%d)\n",
+ lapb->dev, frame->pf);
+ lapb_send_control(lapb, LAPB_DM, frame->pf,
+ LAPB_RESPONSE);
+ }
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+/*
+ * Process an incoming LAPB frame
+ */
+void lapb_data_input(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+ struct lapb_frame frame;
+
+ if (lapb_decode(lapb, skb, &frame) < 0) {
+ kfree_skb(skb);
+ return;
+ }
+
+ switch (lapb->state) {
+ case LAPB_STATE_0:
+ lapb_state0_machine(lapb, skb, &frame); break;
+ case LAPB_STATE_1:
+ lapb_state1_machine(lapb, skb, &frame); break;
+ case LAPB_STATE_2:
+ lapb_state2_machine(lapb, skb, &frame); break;
+ case LAPB_STATE_3:
+ lapb_state3_machine(lapb, skb, &frame); break;
+ case LAPB_STATE_4:
+ lapb_state4_machine(lapb, skb, &frame); break;
+ }
+
+ lapb_kick(lapb);
+}
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
new file mode 100644
index 000000000..ba4d015bd
--- /dev/null
+++ b/net/lapb/lapb_out.c
@@ -0,0 +1,211 @@
+/*
+ * LAPB release 002
+ *
+ * This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * LAPB 001 Jonathan Naylor Started Coding
+ * LAPB 002 Jonathan Naylor New timer architecture.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ * This procedure is passed a buffer descriptor for an iframe. It builds
+ * the rest of the control part of the frame and then writes it out.
+ */
+static void lapb_send_iframe(struct lapb_cb *lapb, struct sk_buff *skb, int poll_bit)
+{
+ unsigned char *frame;
+
+ if (!skb)
+ return;
+
+ if (lapb->mode & LAPB_EXTENDED) {
+ frame = skb_push(skb, 2);
+
+ frame[0] = LAPB_I;
+ frame[0] |= lapb->vs << 1;
+ frame[1] = poll_bit ? LAPB_EPF : 0;
+ frame[1] |= lapb->vr << 1;
+ } else {
+ frame = skb_push(skb, 1);
+
+ *frame = LAPB_I;
+ *frame |= poll_bit ? LAPB_SPF : 0;
+ *frame |= lapb->vr << 5;
+ *frame |= lapb->vs << 1;
+ }
+
+ lapb_dbg(1, "(%p) S%d TX I(%d) S%d R%d\n",
+ lapb->dev, lapb->state, poll_bit, lapb->vs, lapb->vr);
+
+ lapb_transmit_buffer(lapb, skb, LAPB_COMMAND);
+}
+
+void lapb_kick(struct lapb_cb *lapb)
+{
+ struct sk_buff *skb, *skbn;
+ unsigned short modulus, start, end;
+
+ modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS;
+ start = !skb_peek(&lapb->ack_queue) ? lapb->va : lapb->vs;
+ end = (lapb->va + lapb->window) % modulus;
+
+ if (!(lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) &&
+ start != end && skb_peek(&lapb->write_queue)) {
+ lapb->vs = start;
+
+ /*
+ * Dequeue the frame and copy it.
+ */
+ skb = skb_dequeue(&lapb->write_queue);
+
+ do {
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skb_queue_head(&lapb->write_queue, skb);
+ break;
+ }
+
+ if (skb->sk)
+ skb_set_owner_w(skbn, skb->sk);
+
+ /*
+ * Transmit the frame copy.
+ */
+ lapb_send_iframe(lapb, skbn, LAPB_POLLOFF);
+
+ lapb->vs = (lapb->vs + 1) % modulus;
+
+ /*
+ * Requeue the original data frame.
+ */
+ skb_queue_tail(&lapb->ack_queue, skb);
+
+ } while (lapb->vs != end && (skb = skb_dequeue(&lapb->write_queue)) != NULL);
+
+ lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+
+ if (!lapb_t1timer_running(lapb))
+ lapb_start_t1timer(lapb);
+ }
+}
+
+void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
+{
+ unsigned char *ptr;
+
+ ptr = skb_push(skb, 1);
+
+ if (lapb->mode & LAPB_MLP) {
+ if (lapb->mode & LAPB_DCE) {
+ if (type == LAPB_COMMAND)
+ *ptr = LAPB_ADDR_C;
+ if (type == LAPB_RESPONSE)
+ *ptr = LAPB_ADDR_D;
+ } else {
+ if (type == LAPB_COMMAND)
+ *ptr = LAPB_ADDR_D;
+ if (type == LAPB_RESPONSE)
+ *ptr = LAPB_ADDR_C;
+ }
+ } else {
+ if (lapb->mode & LAPB_DCE) {
+ if (type == LAPB_COMMAND)
+ *ptr = LAPB_ADDR_A;
+ if (type == LAPB_RESPONSE)
+ *ptr = LAPB_ADDR_B;
+ } else {
+ if (type == LAPB_COMMAND)
+ *ptr = LAPB_ADDR_B;
+ if (type == LAPB_RESPONSE)
+ *ptr = LAPB_ADDR_A;
+ }
+ }
+
+ lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n",
+ lapb->dev, lapb->state,
+ skb->data[0], skb->data[1], skb->data[2]);
+
+ if (!lapb_data_transmit(lapb, skb))
+ kfree_skb(skb);
+}
+
+void lapb_establish_data_link(struct lapb_cb *lapb)
+{
+ lapb->condition = 0x00;
+ lapb->n2count = 0;
+
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S%d TX SABME(1)\n", lapb->dev, lapb->state);
+ lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND);
+ } else {
+ lapb_dbg(1, "(%p) S%d TX SABM(1)\n", lapb->dev, lapb->state);
+ lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND);
+ }
+
+ lapb_start_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+}
+
+void lapb_enquiry_response(struct lapb_cb *lapb)
+{
+ lapb_dbg(1, "(%p) S%d TX RR(1) R%d\n",
+ lapb->dev, lapb->state, lapb->vr);
+
+ lapb_send_control(lapb, LAPB_RR, LAPB_POLLON, LAPB_RESPONSE);
+
+ lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+}
+
+void lapb_timeout_response(struct lapb_cb *lapb)
+{
+ lapb_dbg(1, "(%p) S%d TX RR(0) R%d\n",
+ lapb->dev, lapb->state, lapb->vr);
+ lapb_send_control(lapb, LAPB_RR, LAPB_POLLOFF, LAPB_RESPONSE);
+
+ lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+}
+
+void lapb_check_iframes_acked(struct lapb_cb *lapb, unsigned short nr)
+{
+ if (lapb->vs == nr) {
+ lapb_frames_acked(lapb, nr);
+ lapb_stop_t1timer(lapb);
+ lapb->n2count = 0;
+ } else if (lapb->va != nr) {
+ lapb_frames_acked(lapb, nr);
+ lapb_start_t1timer(lapb);
+ }
+}
+
+void lapb_check_need_response(struct lapb_cb *lapb, int type, int pf)
+{
+ if (type == LAPB_COMMAND && pf)
+ lapb_enquiry_response(lapb);
+}
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
new file mode 100644
index 000000000..9d0a426ec
--- /dev/null
+++ b/net/lapb/lapb_subr.c
@@ -0,0 +1,308 @@
+/*
+ * LAPB release 002
+ *
+ * This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * LAPB 001 Jonathan Naylor Started Coding
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ * This routine purges all the queues of frames.
+ */
+void lapb_clear_queues(struct lapb_cb *lapb)
+{
+ skb_queue_purge(&lapb->write_queue);
+ skb_queue_purge(&lapb->ack_queue);
+}
+
+/*
+ * This routine purges the input queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+ */
+void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr)
+{
+ struct sk_buff *skb;
+ int modulus;
+
+ modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS;
+
+ /*
+ * Remove all the ack-ed frames from the ack queue.
+ */
+ if (lapb->va != nr)
+ while (skb_peek(&lapb->ack_queue) && lapb->va != nr) {
+ skb = skb_dequeue(&lapb->ack_queue);
+ kfree_skb(skb);
+ lapb->va = (lapb->va + 1) % modulus;
+ }
+}
+
+void lapb_requeue_frames(struct lapb_cb *lapb)
+{
+ struct sk_buff *skb, *skb_prev = NULL;
+
+ /*
+ * Requeue all the un-ack-ed frames on the output queue to be picked
+ * up by lapb_kick called from the timer. This arrangement handles the
+ * possibility of an empty output queue.
+ */
+ while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) {
+ if (!skb_prev)
+ skb_queue_head(&lapb->write_queue, skb);
+ else
+ skb_append(skb_prev, skb, &lapb->write_queue);
+ skb_prev = skb;
+ }
+}
+
+/*
+ * Validate that the value of nr is between va and vs. Return true or
+ * false for testing.
+ */
+int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr)
+{
+ unsigned short vc = lapb->va;
+ int modulus;
+
+ modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS;
+
+ while (vc != lapb->vs) {
+ if (nr == vc)
+ return 1;
+ vc = (vc + 1) % modulus;
+ }
+
+ return nr == lapb->vs;
+}
+
+/*
+ * This routine is the centralised routine for parsing the control
+ * information for the different frame formats.
+ */
+int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
+ struct lapb_frame *frame)
+{
+ frame->type = LAPB_ILLEGAL;
+
+ lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n",
+ lapb->dev, lapb->state,
+ skb->data[0], skb->data[1], skb->data[2]);
+
+ /* We always need to look at 2 bytes, sometimes we need
+ * to look at 3 and those cases are handled below.
+ */
+ if (!pskb_may_pull(skb, 2))
+ return -1;
+
+ if (lapb->mode & LAPB_MLP) {
+ if (lapb->mode & LAPB_DCE) {
+ if (skb->data[0] == LAPB_ADDR_D)
+ frame->cr = LAPB_COMMAND;
+ if (skb->data[0] == LAPB_ADDR_C)
+ frame->cr = LAPB_RESPONSE;
+ } else {
+ if (skb->data[0] == LAPB_ADDR_C)
+ frame->cr = LAPB_COMMAND;
+ if (skb->data[0] == LAPB_ADDR_D)
+ frame->cr = LAPB_RESPONSE;
+ }
+ } else {
+ if (lapb->mode & LAPB_DCE) {
+ if (skb->data[0] == LAPB_ADDR_B)
+ frame->cr = LAPB_COMMAND;
+ if (skb->data[0] == LAPB_ADDR_A)
+ frame->cr = LAPB_RESPONSE;
+ } else {
+ if (skb->data[0] == LAPB_ADDR_A)
+ frame->cr = LAPB_COMMAND;
+ if (skb->data[0] == LAPB_ADDR_B)
+ frame->cr = LAPB_RESPONSE;
+ }
+ }
+
+ skb_pull(skb, 1);
+
+ if (lapb->mode & LAPB_EXTENDED) {
+ if (!(skb->data[0] & LAPB_S)) {
+ if (!pskb_may_pull(skb, 2))
+ return -1;
+ /*
+ * I frame - carries NR/NS/PF
+ */
+ frame->type = LAPB_I;
+ frame->ns = (skb->data[0] >> 1) & 0x7F;
+ frame->nr = (skb->data[1] >> 1) & 0x7F;
+ frame->pf = skb->data[1] & LAPB_EPF;
+ frame->control[0] = skb->data[0];
+ frame->control[1] = skb->data[1];
+ skb_pull(skb, 2);
+ } else if ((skb->data[0] & LAPB_U) == 1) {
+ if (!pskb_may_pull(skb, 2))
+ return -1;
+ /*
+ * S frame - take out PF/NR
+ */
+ frame->type = skb->data[0] & 0x0F;
+ frame->nr = (skb->data[1] >> 1) & 0x7F;
+ frame->pf = skb->data[1] & LAPB_EPF;
+ frame->control[0] = skb->data[0];
+ frame->control[1] = skb->data[1];
+ skb_pull(skb, 2);
+ } else if ((skb->data[0] & LAPB_U) == 3) {
+ /*
+ * U frame - take out PF
+ */
+ frame->type = skb->data[0] & ~LAPB_SPF;
+ frame->pf = skb->data[0] & LAPB_SPF;
+ frame->control[0] = skb->data[0];
+ frame->control[1] = 0x00;
+ skb_pull(skb, 1);
+ }
+ } else {
+ if (!(skb->data[0] & LAPB_S)) {
+ /*
+ * I frame - carries NR/NS/PF
+ */
+ frame->type = LAPB_I;
+ frame->ns = (skb->data[0] >> 1) & 0x07;
+ frame->nr = (skb->data[0] >> 5) & 0x07;
+ frame->pf = skb->data[0] & LAPB_SPF;
+ } else if ((skb->data[0] & LAPB_U) == 1) {
+ /*
+ * S frame - take out PF/NR
+ */
+ frame->type = skb->data[0] & 0x0F;
+ frame->nr = (skb->data[0] >> 5) & 0x07;
+ frame->pf = skb->data[0] & LAPB_SPF;
+ } else if ((skb->data[0] & LAPB_U) == 3) {
+ /*
+ * U frame - take out PF
+ */
+ frame->type = skb->data[0] & ~LAPB_SPF;
+ frame->pf = skb->data[0] & LAPB_SPF;
+ }
+
+ frame->control[0] = skb->data[0];
+
+ skb_pull(skb, 1);
+ }
+
+ return 0;
+}
+
+/*
+ * This routine is called when the HDLC layer internally generates a
+ * command or response for the remote machine ( eg. RR, UA etc. ).
+ * Only supervisory or unnumbered frames are processed, FRMRs are handled
+ * by lapb_transmit_frmr below.
+ */
+void lapb_send_control(struct lapb_cb *lapb, int frametype,
+ int poll_bit, int type)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+
+ if ((skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, LAPB_HEADER_LEN + 1);
+
+ if (lapb->mode & LAPB_EXTENDED) {
+ if ((frametype & LAPB_U) == LAPB_U) {
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ *dptr |= poll_bit ? LAPB_SPF : 0;
+ } else {
+ dptr = skb_put(skb, 2);
+ dptr[0] = frametype;
+ dptr[1] = (lapb->vr << 1);
+ dptr[1] |= poll_bit ? LAPB_EPF : 0;
+ }
+ } else {
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ *dptr |= poll_bit ? LAPB_SPF : 0;
+ if ((frametype & LAPB_U) == LAPB_S) /* S frames carry NR */
+ *dptr |= (lapb->vr << 5);
+ }
+
+ lapb_transmit_buffer(lapb, skb, type);
+}
+
+/*
+ * This routine generates FRMRs based on information previously stored in
+ * the LAPB control block.
+ */
+void lapb_transmit_frmr(struct lapb_cb *lapb)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+
+ if ((skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, LAPB_HEADER_LEN + 1);
+
+ if (lapb->mode & LAPB_EXTENDED) {
+ dptr = skb_put(skb, 6);
+ *dptr++ = LAPB_FRMR;
+ *dptr++ = lapb->frmr_data.control[0];
+ *dptr++ = lapb->frmr_data.control[1];
+ *dptr++ = (lapb->vs << 1) & 0xFE;
+ *dptr = (lapb->vr << 1) & 0xFE;
+ if (lapb->frmr_data.cr == LAPB_RESPONSE)
+ *dptr |= 0x01;
+ dptr++;
+ *dptr++ = lapb->frmr_type;
+
+ lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+ lapb->dev, lapb->state,
+ skb->data[1], skb->data[2], skb->data[3],
+ skb->data[4], skb->data[5]);
+ } else {
+ dptr = skb_put(skb, 4);
+ *dptr++ = LAPB_FRMR;
+ *dptr++ = lapb->frmr_data.control[0];
+ *dptr = (lapb->vs << 1) & 0x0E;
+ *dptr |= (lapb->vr << 5) & 0xE0;
+ if (lapb->frmr_data.cr == LAPB_RESPONSE)
+ *dptr |= 0x10;
+ dptr++;
+ *dptr++ = lapb->frmr_type;
+
+ lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n",
+ lapb->dev, lapb->state, skb->data[1],
+ skb->data[2], skb->data[3]);
+ }
+
+ lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
+}
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
new file mode 100644
index 000000000..355cc3b6f
--- /dev/null
+++ b/net/lapb/lapb_timer.c
@@ -0,0 +1,179 @@
+/*
+ * LAPB release 002
+ *
+ * This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * LAPB 001 Jonathan Naylor Started Coding
+ * LAPB 002 Jonathan Naylor New timer architecture.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+static void lapb_t1timer_expiry(unsigned long);
+static void lapb_t2timer_expiry(unsigned long);
+
+void lapb_start_t1timer(struct lapb_cb *lapb)
+{
+ del_timer(&lapb->t1timer);
+
+ lapb->t1timer.data = (unsigned long)lapb;
+ lapb->t1timer.function = &lapb_t1timer_expiry;
+ lapb->t1timer.expires = jiffies + lapb->t1;
+
+ add_timer(&lapb->t1timer);
+}
+
+void lapb_start_t2timer(struct lapb_cb *lapb)
+{
+ del_timer(&lapb->t2timer);
+
+ lapb->t2timer.data = (unsigned long)lapb;
+ lapb->t2timer.function = &lapb_t2timer_expiry;
+ lapb->t2timer.expires = jiffies + lapb->t2;
+
+ add_timer(&lapb->t2timer);
+}
+
+void lapb_stop_t1timer(struct lapb_cb *lapb)
+{
+ del_timer(&lapb->t1timer);
+}
+
+void lapb_stop_t2timer(struct lapb_cb *lapb)
+{
+ del_timer(&lapb->t2timer);
+}
+
+int lapb_t1timer_running(struct lapb_cb *lapb)
+{
+ return timer_pending(&lapb->t1timer);
+}
+
+static void lapb_t2timer_expiry(unsigned long param)
+{
+ struct lapb_cb *lapb = (struct lapb_cb *)param;
+
+ if (lapb->condition & LAPB_ACK_PENDING_CONDITION) {
+ lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+ lapb_timeout_response(lapb);
+ }
+}
+
+static void lapb_t1timer_expiry(unsigned long param)
+{
+ struct lapb_cb *lapb = (struct lapb_cb *)param;
+
+ switch (lapb->state) {
+
+ /*
+ * If we are a DCE, keep going DM .. DM .. DM
+ */
+ case LAPB_STATE_0:
+ if (lapb->mode & LAPB_DCE)
+ lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE);
+ break;
+
+ /*
+ * Awaiting connection state, send SABM(E), up to N2 times.
+ */
+ case LAPB_STATE_1:
+ if (lapb->n2count == lapb->n2) {
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+ lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
+ return;
+ } else {
+ lapb->n2count++;
+ if (lapb->mode & LAPB_EXTENDED) {
+ lapb_dbg(1, "(%p) S1 TX SABME(1)\n",
+ lapb->dev);
+ lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND);
+ } else {
+ lapb_dbg(1, "(%p) S1 TX SABM(1)\n",
+ lapb->dev);
+ lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND);
+ }
+ }
+ break;
+
+ /*
+ * Awaiting disconnection state, send DISC, up to N2 times.
+ */
+ case LAPB_STATE_2:
+ if (lapb->n2count == lapb->n2) {
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT);
+ lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
+ return;
+ } else {
+ lapb->n2count++;
+ lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev);
+ lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+ }
+ break;
+
+ /*
+ * Data transfer state, restransmit I frames, up to N2 times.
+ */
+ case LAPB_STATE_3:
+ if (lapb->n2count == lapb->n2) {
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_stop_t2timer(lapb);
+ lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+ lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
+ return;
+ } else {
+ lapb->n2count++;
+ lapb_requeue_frames(lapb);
+ lapb_kick(lapb);
+ }
+ break;
+
+ /*
+ * Frame reject state, restransmit FRMR frames, up to N2 times.
+ */
+ case LAPB_STATE_4:
+ if (lapb->n2count == lapb->n2) {
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+ lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev);
+ return;
+ } else {
+ lapb->n2count++;
+ lapb_transmit_frmr(lapb);
+ }
+ break;
+ }
+
+ lapb_start_t1timer(lapb);
+}
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
new file mode 100644
index 000000000..b91c65108
--- /dev/null
+++ b/net/llc/Kconfig
@@ -0,0 +1,10 @@
+config LLC
+ tristate
+ depends on NET
+
+config LLC2
+ tristate "ANSI/IEEE 802.2 LLC type 2 Support"
+ select LLC
+ help
+ This is a Logical Link Layer type 2, connection oriented support.
+ Select this if you want to have support for PF_LLC sockets.
diff --git a/net/llc/Makefile b/net/llc/Makefile
new file mode 100644
index 000000000..4e260cff3
--- /dev/null
+++ b/net/llc/Makefile
@@ -0,0 +1,25 @@
+###########################################################################
+# Makefile for the Linux 802.2 LLC (fully-functional) layer.
+#
+# Copyright (c) 1997 by Procom Technology,Inc.
+# 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+#
+# This program can be redistributed or modified under the terms of the
+# GNU General Public License as published by the Free Software Foundation.
+# This program is distributed without any warranty or implied warranty
+# of merchantability or fitness for a particular purpose.
+#
+# See the GNU General Public License for more details.
+###########################################################################
+
+obj-$(CONFIG_LLC) += llc.o
+
+llc-y := llc_core.o llc_input.o llc_output.o
+
+obj-$(CONFIG_LLC2) += llc2.o
+
+llc2-y := llc_if.o llc_c_ev.o llc_c_ac.o llc_conn.o llc_c_st.o llc_pdu.o \
+ llc_sap.o llc_s_ac.o llc_s_ev.o llc_s_st.o af_llc.o llc_station.o
+
+llc2-$(CONFIG_PROC_FS) += llc_proc.o
+llc2-$(CONFIG_SYSCTL) += sysctl_net_llc.o
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
new file mode 100644
index 000000000..17a8dff06
--- /dev/null
+++ b/net/llc/af_llc.c
@@ -0,0 +1,1250 @@
+/*
+ * af_llc.c - LLC User Interface SAPs
+ * Description:
+ * Functions in this module are implementation of socket based llc
+ * communications for the Linux operating system. Support of llc class
+ * one and class two is provided via SOCK_DGRAM and SOCK_STREAM
+ * respectively.
+ *
+ * An llc2 connection is (mac + sap), only one llc2 sap connection
+ * is allowed per mac. Though one sap may have multiple mac + sap
+ * connections.
+ *
+ * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org>
+ * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_sap.h>
+#include <net/llc_pdu.h>
+#include <net/llc_conn.h>
+#include <net/tcp_states.h>
+
+/* remember: uninitialized global data is zeroed because its in .bss */
+static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+static u16 llc_ui_sap_link_no_max[256];
+static struct sockaddr_llc llc_ui_addrnull;
+static const struct proto_ops llc_ui_ops;
+
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/* Maybe we'll add some more in the future. */
+#define LLC_CMSG_PKTINFO 1
+
+
+/**
+ * llc_ui_next_link_no - return the next unused link number for a sap
+ * @sap: Address of sap to get link number from.
+ *
+ * Return the next unused link number for a given sap.
+ */
+static inline u16 llc_ui_next_link_no(int sap)
+{
+ return llc_ui_sap_link_no_max[sap]++;
+}
+
+/**
+ * llc_proto_type - return eth protocol for ARP header type
+ * @arphrd: ARP header type.
+ *
+ * Given an ARP header type return the corresponding ethernet protocol.
+ */
+static inline __be16 llc_proto_type(u16 arphrd)
+{
+ return htons(ETH_P_802_2);
+}
+
+/**
+ * llc_ui_addr_null - determines if a address structure is null
+ * @addr: Address to test if null.
+ */
+static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr)
+{
+ return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr));
+}
+
+/**
+ * llc_ui_header_len - return length of llc header based on operation
+ * @sk: Socket which contains a valid llc socket type.
+ * @addr: Complete sockaddr_llc structure received from the user.
+ *
+ * Provide the length of the llc header depending on what kind of
+ * operation the user would like to perform and the type of socket.
+ * Returns the correct llc header length.
+ */
+static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
+{
+ u8 rc = LLC_PDU_LEN_U;
+
+ if (addr->sllc_test || addr->sllc_xid)
+ rc = LLC_PDU_LEN_U;
+ else if (sk->sk_type == SOCK_STREAM)
+ rc = LLC_PDU_LEN_I;
+ return rc;
+}
+
+/**
+ * llc_ui_send_data - send data via reliable llc2 connection
+ * @sk: Connection the socket is using.
+ * @skb: Data the user wishes to send.
+ * @noblock: can we block waiting for data?
+ *
+ * Send data via reliable llc2 connection.
+ * Returns 0 upon success, non-zero if action did not succeed.
+ */
+static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
+{
+ struct llc_sock* llc = llc_sk(sk);
+ int rc = 0;
+
+ if (unlikely(llc_data_accept_state(llc->state) ||
+ llc->remote_busy_flag ||
+ llc->p_flag)) {
+ long timeout = sock_sndtimeo(sk, noblock);
+
+ rc = llc_ui_wait_for_busy_core(sk, timeout);
+ }
+ if (unlikely(!rc))
+ rc = llc_build_and_send_pkt(sk, skb);
+ return rc;
+}
+
+static void llc_ui_sk_init(struct socket *sock, struct sock *sk)
+{
+ sock_graft(sk, sock);
+ sk->sk_type = sock->type;
+ sock->ops = &llc_ui_ops;
+}
+
+static struct proto llc_proto = {
+ .name = "LLC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct llc_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+};
+
+/**
+ * llc_ui_create - alloc and init a new llc_ui socket
+ * @net: network namespace (must be default network)
+ * @sock: Socket to initialize and attach allocated sk to.
+ * @protocol: Unused.
+ * @kern: on behalf of kernel or userspace
+ *
+ * Allocate and initialize a new llc_ui socket, validate the user wants a
+ * socket type we have available.
+ * Returns 0 upon success, negative upon failure.
+ */
+static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ int rc = -ESOCKTNOSUPPORT;
+
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) {
+ rc = -ENOMEM;
+ sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto);
+ if (sk) {
+ rc = 0;
+ llc_ui_sk_init(sock, sk);
+ }
+ }
+ return rc;
+}
+
+/**
+ * llc_ui_release - shutdown socket
+ * @sock: Socket to release.
+ *
+ * Shutdown and deallocate an existing socket.
+ */
+static int llc_ui_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc;
+
+ if (unlikely(sk == NULL))
+ goto out;
+ sock_hold(sk);
+ lock_sock(sk);
+ llc = llc_sk(sk);
+ dprintk("%s: closing local(%02X) remote(%02X)\n", __func__,
+ llc->laddr.lsap, llc->daddr.lsap);
+ if (!llc_send_disc(sk))
+ llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ llc_sap_remove_socket(llc->sap, sk);
+ release_sock(sk);
+ if (llc->dev)
+ dev_put(llc->dev);
+ sock_put(sk);
+ llc_sk_free(sk);
+out:
+ return 0;
+}
+
+/**
+ * llc_ui_autoport - provide dynamically allocate SAP number
+ *
+ * Provide the caller with a dynamically allocated SAP number according
+ * to the rules that are set in this function. Returns: 0, upon failure,
+ * SAP number otherwise.
+ */
+static int llc_ui_autoport(void)
+{
+ struct llc_sap *sap;
+ int i, tries = 0;
+
+ while (tries < LLC_SAP_DYN_TRIES) {
+ for (i = llc_ui_sap_last_autoport;
+ i < LLC_SAP_DYN_STOP; i += 2) {
+ sap = llc_sap_find(i);
+ if (!sap) {
+ llc_ui_sap_last_autoport = i + 2;
+ goto out;
+ }
+ llc_sap_put(sap);
+ }
+ llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+ tries++;
+ }
+ i = 0;
+out:
+ return i;
+}
+
+/**
+ * llc_ui_autobind - automatically bind a socket to a sap
+ * @sock: socket to bind
+ * @addr: address to connect to
+ *
+ * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't
+ * specifically used llc_ui_bind to bind to an specific address/sap
+ *
+ * Returns: 0 upon success, negative otherwise.
+ */
+static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap;
+ int rc = -EINVAL;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+ rc = -ENODEV;
+ if (sk->sk_bound_dev_if) {
+ llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
+ if (llc->dev && addr->sllc_arphrd != llc->dev->type) {
+ dev_put(llc->dev);
+ llc->dev = NULL;
+ }
+ } else
+ llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
+ if (!llc->dev)
+ goto out;
+ rc = -EUSERS;
+ llc->laddr.lsap = llc_ui_autoport();
+ if (!llc->laddr.lsap)
+ goto out;
+ rc = -EBUSY; /* some other network layer is using the sap */
+ sap = llc_sap_open(llc->laddr.lsap, NULL);
+ if (!sap)
+ goto out;
+ memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN);
+ memcpy(&llc->addr, addr, sizeof(llc->addr));
+ /* assign new connection to its SAP */
+ llc_sap_add_socket(sap, sk);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ rc = 0;
+out:
+ return rc;
+}
+
+/**
+ * llc_ui_bind - bind a socket to a specific address.
+ * @sock: Socket to bind an address to.
+ * @uaddr: Address the user wants the socket bound to.
+ * @addrlen: Length of the uaddr structure.
+ *
+ * Bind a socket to a specific address. For llc a user is able to bind to
+ * a specific sap only or mac + sap.
+ * If the user desires to bind to a specific mac + sap, it is possible to
+ * have multiple sap connections via multiple macs.
+ * Bind and autobind for that matter must enforce the correct sap usage
+ * otherwise all hell will break loose.
+ * Returns: 0 upon success, negative otherwise.
+ */
+static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
+{
+ struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap;
+ int rc = -EINVAL;
+
+ dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
+ if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
+ goto out;
+ rc = -EAFNOSUPPORT;
+ if (unlikely(addr->sllc_family != AF_LLC))
+ goto out;
+ rc = -ENODEV;
+ rcu_read_lock();
+ if (sk->sk_bound_dev_if) {
+ llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
+ if (llc->dev) {
+ if (!addr->sllc_arphrd)
+ addr->sllc_arphrd = llc->dev->type;
+ if (is_zero_ether_addr(addr->sllc_mac))
+ memcpy(addr->sllc_mac, llc->dev->dev_addr,
+ IFHWADDRLEN);
+ if (addr->sllc_arphrd != llc->dev->type ||
+ !ether_addr_equal(addr->sllc_mac,
+ llc->dev->dev_addr)) {
+ rc = -EINVAL;
+ llc->dev = NULL;
+ }
+ }
+ } else
+ llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
+ addr->sllc_mac);
+ if (llc->dev)
+ dev_hold(llc->dev);
+ rcu_read_unlock();
+ if (!llc->dev)
+ goto out;
+ if (!addr->sllc_sap) {
+ rc = -EUSERS;
+ addr->sllc_sap = llc_ui_autoport();
+ if (!addr->sllc_sap)
+ goto out;
+ }
+ sap = llc_sap_find(addr->sllc_sap);
+ if (!sap) {
+ sap = llc_sap_open(addr->sllc_sap, NULL);
+ rc = -EBUSY; /* some other network layer is using the sap */
+ if (!sap)
+ goto out;
+ } else {
+ struct llc_addr laddr, daddr;
+ struct sock *ask;
+
+ memset(&laddr, 0, sizeof(laddr));
+ memset(&daddr, 0, sizeof(daddr));
+ /*
+ * FIXME: check if the address is multicast,
+ * only SOCK_DGRAM can do this.
+ */
+ memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN);
+ laddr.lsap = addr->sllc_sap;
+ rc = -EADDRINUSE; /* mac + sap clash. */
+ ask = llc_lookup_established(sap, &daddr, &laddr);
+ if (ask) {
+ sock_put(ask);
+ goto out_put;
+ }
+ }
+ llc->laddr.lsap = addr->sllc_sap;
+ memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN);
+ memcpy(&llc->addr, addr, sizeof(llc->addr));
+ /* assign new connection to its SAP */
+ llc_sap_add_socket(sap, sk);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ rc = 0;
+out_put:
+ llc_sap_put(sap);
+out:
+ return rc;
+}
+
+/**
+ * llc_ui_shutdown - shutdown a connect llc2 socket.
+ * @sock: Socket to shutdown.
+ * @how: What part of the socket to shutdown.
+ *
+ * Shutdown a connected llc2 socket. Currently this function only supports
+ * shutting down both sends and receives (2), we could probably make this
+ * function such that a user can shutdown only half the connection but not
+ * right now.
+ * Returns: 0 upon success, negative otherwise.
+ */
+static int llc_ui_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ int rc = -ENOTCONN;
+
+ lock_sock(sk);
+ if (unlikely(sk->sk_state != TCP_ESTABLISHED))
+ goto out;
+ rc = -EINVAL;
+ if (how != 2)
+ goto out;
+ rc = llc_send_disc(sk);
+ if (!rc)
+ rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+ /* Wake up anyone sleeping in poll */
+ sk->sk_state_change(sk);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/**
+ * llc_ui_connect - Connect to a remote llc2 mac + sap.
+ * @sock: Socket which will be connected to the remote destination.
+ * @uaddr: Remote and possibly the local address of the new connection.
+ * @addrlen: Size of uaddr structure.
+ * @flags: Operational flags specified by the user.
+ *
+ * Connect to a remote llc2 mac + sap. The caller must specify the
+ * destination mac and address to connect to. If the user hasn't previously
+ * called bind(2) with a smac the address of the first interface of the
+ * specified arp type will be used.
+ * This function will autobind if user did not previously call bind.
+ * Returns: 0 upon success, negative otherwise.
+ */
+static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addrlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
+ int rc = -EINVAL;
+
+ lock_sock(sk);
+ if (unlikely(addrlen != sizeof(*addr)))
+ goto out;
+ rc = -EAFNOSUPPORT;
+ if (unlikely(addr->sllc_family != AF_LLC))
+ goto out;
+ if (unlikely(sk->sk_type != SOCK_STREAM))
+ goto out;
+ rc = -EALREADY;
+ if (unlikely(sock->state == SS_CONNECTING))
+ goto out;
+ /* bind connection to sap if user hasn't done it. */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ /* bind to sap with null dev, exclusive */
+ rc = llc_ui_autobind(sock, addr);
+ if (rc)
+ goto out;
+ }
+ llc->daddr.lsap = addr->sllc_sap;
+ memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN);
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+ llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap);
+ rc = llc_establish_connection(sk, llc->dev->dev_addr,
+ addr->sllc_mac, addr->sllc_sap);
+ if (rc) {
+ dprintk("%s: llc_ui_send_conn failed :-(\n", __func__);
+ sock->state = SS_UNCONNECTED;
+ sk->sk_state = TCP_CLOSE;
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_SYN_SENT) {
+ const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if (!timeo || !llc_ui_wait_for_conn(sk, timeo))
+ goto out;
+
+ rc = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ sock->state = SS_CONNECTED;
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+sock_error:
+ rc = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ goto out;
+}
+
+/**
+ * llc_ui_listen - allow a normal socket to accept incoming connections
+ * @sock: Socket to allow incoming connections on.
+ * @backlog: Number of connections to queue.
+ *
+ * Allow a normal socket to accept incoming connections.
+ * Returns 0 upon success, negative otherwise.
+ */
+static int llc_ui_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int rc = -EINVAL;
+
+ lock_sock(sk);
+ if (unlikely(sock->state != SS_UNCONNECTED))
+ goto out;
+ rc = -EOPNOTSUPP;
+ if (unlikely(sk->sk_type != SOCK_STREAM))
+ goto out;
+ rc = -EAGAIN;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+ rc = 0;
+ if (!(unsigned int)backlog) /* BSDism */
+ backlog = 1;
+ sk->sk_max_ack_backlog = backlog;
+ if (sk->sk_state != TCP_LISTEN) {
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = TCP_LISTEN;
+ }
+ sk->sk_socket->flags |= __SO_ACCEPTCON;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
+{
+ DEFINE_WAIT(wait);
+ int rc = 0;
+
+ while (1) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
+ break;
+ rc = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ rc = -EAGAIN;
+ if (!timeout)
+ break;
+ rc = 0;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return rc;
+}
+
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ while (1) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
+ break;
+ if (signal_pending(current) || !timeout)
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeout;
+}
+
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
+{
+ DEFINE_WAIT(wait);
+ struct llc_sock *llc = llc_sk(sk);
+ int rc;
+
+ while (1) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ rc = 0;
+ if (sk_wait_event(sk, &timeout,
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ (!llc_data_accept_state(llc->state) &&
+ !llc->remote_busy_flag &&
+ !llc->p_flag)))
+ break;
+ rc = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ rc = -EAGAIN;
+ if (!timeout)
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return rc;
+}
+
+static int llc_wait_data(struct sock *sk, long timeo)
+{
+ int rc;
+
+ while (1) {
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+ rc = sock_error(sk);
+ if (rc)
+ break;
+ rc = 0;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ rc = -EAGAIN;
+ if (!timeo)
+ break;
+ rc = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ rc = 0;
+ if (sk_wait_data(sk, &timeo))
+ break;
+ }
+ return rc;
+}
+
+static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(skb->sk);
+
+ if (llc->cmsg_flags & LLC_CMSG_PKTINFO) {
+ struct llc_pktinfo info;
+
+ info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex;
+ llc_pdu_decode_dsap(skb, &info.lpi_sap);
+ llc_pdu_decode_da(skb, info.lpi_mac);
+ put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info);
+ }
+}
+
+/**
+ * llc_ui_accept - accept a new incoming connection.
+ * @sock: Socket which connections arrive on.
+ * @newsock: Socket to move incoming connection to.
+ * @flags: User specified operational flags.
+ *
+ * Accept a new incoming connection.
+ * Returns 0 upon success, negative otherwise.
+ */
+static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk, *newsk;
+ struct llc_sock *llc, *newllc;
+ struct sk_buff *skb;
+ int rc = -EOPNOTSUPP;
+
+ dprintk("%s: accepting on %02X\n", __func__,
+ llc_sk(sk)->laddr.lsap);
+ lock_sock(sk);
+ if (unlikely(sk->sk_type != SOCK_STREAM))
+ goto out;
+ rc = -EINVAL;
+ if (unlikely(sock->state != SS_UNCONNECTED ||
+ sk->sk_state != TCP_LISTEN))
+ goto out;
+ /* wait for a connection to arrive. */
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ rc = llc_wait_data(sk, sk->sk_rcvtimeo);
+ if (rc)
+ goto out;
+ }
+ dprintk("%s: got a new connection on %02X\n", __func__,
+ llc_sk(sk)->laddr.lsap);
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ rc = -EINVAL;
+ if (!skb->sk)
+ goto frees;
+ rc = 0;
+ newsk = skb->sk;
+ /* attach connection to a new socket. */
+ llc_ui_sk_init(newsock, newsk);
+ sock_reset_flag(newsk, SOCK_ZAPPED);
+ newsk->sk_state = TCP_ESTABLISHED;
+ newsock->state = SS_CONNECTED;
+ llc = llc_sk(sk);
+ newllc = llc_sk(newsk);
+ memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr));
+ newllc->link = llc_ui_next_link_no(newllc->laddr.lsap);
+
+ /* put original socket back into a clean listen state. */
+ sk->sk_state = TCP_LISTEN;
+ sk->sk_ack_backlog--;
+ dprintk("%s: ok success on %02X, client on %02X\n", __func__,
+ llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap);
+frees:
+ kfree_skb(skb);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/**
+ * llc_ui_recvmsg - copy received data to the socket user.
+ * @sock: Socket to copy data from.
+ * @msg: Various user space related information.
+ * @len: Size of user buffer.
+ * @flags: User specified flags.
+ *
+ * Copy received data to the socket user.
+ * Returns non-negative upon success, negative otherwise.
+ */
+static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name);
+ const int nonblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb = NULL;
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ unsigned long cpu_flags;
+ size_t copied = 0;
+ u32 peek_seq = 0;
+ u32 *seq, skb_len;
+ unsigned long used;
+ int target; /* Read at least this many bytes */
+ long timeo;
+
+ lock_sock(sk);
+ copied = -ENOTCONN;
+ if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN))
+ goto out;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ seq = &llc->copied_seq;
+ if (flags & MSG_PEEK) {
+ peek_seq = llc->copied_seq;
+ seq = &peek_seq;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ copied = 0;
+
+ do {
+ u32 offset;
+
+ /*
+ * We need to check signals first, to get correct SIGURG
+ * handling. FIXME: Need to check this doesn't impact 1003.1g
+ * and move it down to the bottom of the loop
+ */
+ if (signal_pending(current)) {
+ if (copied)
+ break;
+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+ break;
+ }
+
+ /* Next get a buffer. */
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb) {
+ offset = *seq;
+ goto found_ok_skb;
+ }
+ /* Well, if we have backlog, try to process it now yet. */
+
+ if (copied >= target && !sk->sk_backlog.tail)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ (flags & MSG_PEEK))
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /*
+ * This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+ }
+
+ if (copied >= target) { /* Do not sleep, just process backlog. */
+ release_sock(sk);
+ lock_sock(sk);
+ } else
+ sk_wait_data(sk, &timeo);
+
+ if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
+ net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n",
+ current->comm,
+ task_pid_nr(current));
+ peek_seq = llc->copied_seq;
+ }
+ continue;
+ found_ok_skb:
+ skb_len = skb->len;
+ /* Ok so how much can we use? */
+ used = skb->len - offset;
+ if (len < used)
+ used = len;
+
+ if (!(flags & MSG_TRUNC)) {
+ int rc = skb_copy_datagram_msg(skb, offset, msg, used);
+ if (rc) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ }
+
+ *seq += used;
+ copied += used;
+ len -= used;
+
+ /* For non stream protcols we get one packet per recvmsg call */
+ if (sk->sk_type != SOCK_STREAM)
+ goto copy_uaddr;
+
+ if (!(flags & MSG_PEEK)) {
+ spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
+ sk_eat_skb(sk, skb);
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ *seq = 0;
+ }
+
+ /* Partial read */
+ if (used + offset < skb_len)
+ continue;
+ } while (len > 0);
+
+out:
+ release_sock(sk);
+ return copied;
+copy_uaddr:
+ if (uaddr != NULL && skb != NULL) {
+ memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
+ msg->msg_namelen = sizeof(*uaddr);
+ }
+ if (llc_sk(sk)->cmsg_flags)
+ llc_cmsg_rcv(msg, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
+ sk_eat_skb(sk, skb);
+ spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ *seq = 0;
+ }
+
+ goto out;
+}
+
+/**
+ * llc_ui_sendmsg - Transmit data provided by the socket user.
+ * @sock: Socket to transmit data from.
+ * @msg: Various user related information.
+ * @len: Length of data to transmit.
+ *
+ * Transmit data provided by the socket user.
+ * Returns non-negative upon success, negative otherwise.
+ */
+static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+ int flags = msg->msg_flags;
+ int noblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb;
+ size_t size = 0;
+ int rc = -EINVAL, copied = 0, hdrlen;
+
+ dprintk("%s: sending from %02X to %02X\n", __func__,
+ llc->laddr.lsap, llc->daddr.lsap);
+ lock_sock(sk);
+ if (addr) {
+ if (msg->msg_namelen < sizeof(*addr))
+ goto release;
+ } else {
+ if (llc_ui_addr_null(&llc->addr))
+ goto release;
+ addr = &llc->addr;
+ }
+ /* must bind connection to sap if user hasn't done it. */
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ /* bind to sap with null dev, exclusive. */
+ rc = llc_ui_autobind(sock, addr);
+ if (rc)
+ goto release;
+ }
+ hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
+ size = hdrlen + len;
+ if (size > llc->dev->mtu)
+ size = llc->dev->mtu;
+ copied = size - hdrlen;
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ lock_sock(sk);
+ if (!skb)
+ goto release;
+ skb->dev = llc->dev;
+ skb->protocol = llc_proto_type(addr->sllc_arphrd);
+ skb_reserve(skb, hdrlen);
+ rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
+ if (rc)
+ goto out;
+ if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) {
+ llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac,
+ addr->sllc_sap);
+ goto out;
+ }
+ if (addr->sllc_test) {
+ llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac,
+ addr->sllc_sap);
+ goto out;
+ }
+ if (addr->sllc_xid) {
+ llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac,
+ addr->sllc_sap);
+ goto out;
+ }
+ rc = -ENOPROTOOPT;
+ if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua))
+ goto out;
+ rc = llc_ui_send_data(sk, skb, noblock);
+out:
+ if (rc) {
+ kfree_skb(skb);
+release:
+ dprintk("%s: failed sending from %02X to %02X: %d\n",
+ __func__, llc->laddr.lsap, llc->daddr.lsap, rc);
+ }
+ release_sock(sk);
+ return rc ? : copied;
+}
+
+/**
+ * llc_ui_getname - return the address info of a socket
+ * @sock: Socket to get address of.
+ * @uaddr: Address structure to return information.
+ * @uaddrlen: Length of address structure.
+ * @peer: Does user want local or remote address information.
+ *
+ * Return the address information of a socket.
+ */
+static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddrlen, int peer)
+{
+ struct sockaddr_llc sllc;
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ int rc = -EBADF;
+
+ memset(&sllc, 0, sizeof(sllc));
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+ *uaddrlen = sizeof(sllc);
+ if (peer) {
+ rc = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ if(llc->dev)
+ sllc.sllc_arphrd = llc->dev->type;
+ sllc.sllc_sap = llc->daddr.lsap;
+ memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN);
+ } else {
+ rc = -EINVAL;
+ if (!llc->sap)
+ goto out;
+ sllc.sllc_sap = llc->sap->laddr.lsap;
+
+ if (llc->dev) {
+ sllc.sllc_arphrd = llc->dev->type;
+ memcpy(&sllc.sllc_mac, llc->dev->dev_addr,
+ IFHWADDRLEN);
+ }
+ }
+ rc = 0;
+ sllc.sllc_family = AF_LLC;
+ memcpy(uaddr, &sllc, sizeof(sllc));
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/**
+ * llc_ui_ioctl - io controls for PF_LLC
+ * @sock: Socket to get/set info
+ * @cmd: command
+ * @arg: optional argument for cmd
+ *
+ * get/set info on llc sockets
+ */
+static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
+/**
+ * llc_ui_setsockopt - set various connection specific parameters.
+ * @sock: Socket to set options on.
+ * @level: Socket level user is requesting operations on.
+ * @optname: Operation name.
+ * @optval: User provided operation data.
+ * @optlen: Length of optval.
+ *
+ * Set various connection specific parameters.
+ */
+static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ unsigned int opt;
+ int rc = -EINVAL;
+
+ lock_sock(sk);
+ if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
+ goto out;
+ rc = get_user(opt, (int __user *)optval);
+ if (rc)
+ goto out;
+ rc = -EINVAL;
+ switch (optname) {
+ case LLC_OPT_RETRY:
+ if (opt > LLC_OPT_MAX_RETRY)
+ goto out;
+ llc->n2 = opt;
+ break;
+ case LLC_OPT_SIZE:
+ if (opt > LLC_OPT_MAX_SIZE)
+ goto out;
+ llc->n1 = opt;
+ break;
+ case LLC_OPT_ACK_TMR_EXP:
+ if (opt > LLC_OPT_MAX_ACK_TMR_EXP)
+ goto out;
+ llc->ack_timer.expire = opt * HZ;
+ break;
+ case LLC_OPT_P_TMR_EXP:
+ if (opt > LLC_OPT_MAX_P_TMR_EXP)
+ goto out;
+ llc->pf_cycle_timer.expire = opt * HZ;
+ break;
+ case LLC_OPT_REJ_TMR_EXP:
+ if (opt > LLC_OPT_MAX_REJ_TMR_EXP)
+ goto out;
+ llc->rej_sent_timer.expire = opt * HZ;
+ break;
+ case LLC_OPT_BUSY_TMR_EXP:
+ if (opt > LLC_OPT_MAX_BUSY_TMR_EXP)
+ goto out;
+ llc->busy_state_timer.expire = opt * HZ;
+ break;
+ case LLC_OPT_TX_WIN:
+ if (opt > LLC_OPT_MAX_WIN)
+ goto out;
+ llc->k = opt;
+ break;
+ case LLC_OPT_RX_WIN:
+ if (opt > LLC_OPT_MAX_WIN)
+ goto out;
+ llc->rw = opt;
+ break;
+ case LLC_OPT_PKTINFO:
+ if (opt)
+ llc->cmsg_flags |= LLC_CMSG_PKTINFO;
+ else
+ llc->cmsg_flags &= ~LLC_CMSG_PKTINFO;
+ break;
+ default:
+ rc = -ENOPROTOOPT;
+ goto out;
+ }
+ rc = 0;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/**
+ * llc_ui_getsockopt - get connection specific socket info
+ * @sock: Socket to get information from.
+ * @level: Socket level user is requesting operations on.
+ * @optname: Operation name.
+ * @optval: Variable to return operation data in.
+ * @optlen: Length of optval.
+ *
+ * Get connection specific socket information.
+ */
+static int llc_ui_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+ int val = 0, len = 0, rc = -EINVAL;
+
+ lock_sock(sk);
+ if (unlikely(level != SOL_LLC))
+ goto out;
+ rc = get_user(len, optlen);
+ if (rc)
+ goto out;
+ rc = -EINVAL;
+ if (len != sizeof(int))
+ goto out;
+ switch (optname) {
+ case LLC_OPT_RETRY:
+ val = llc->n2; break;
+ case LLC_OPT_SIZE:
+ val = llc->n1; break;
+ case LLC_OPT_ACK_TMR_EXP:
+ val = llc->ack_timer.expire / HZ; break;
+ case LLC_OPT_P_TMR_EXP:
+ val = llc->pf_cycle_timer.expire / HZ; break;
+ case LLC_OPT_REJ_TMR_EXP:
+ val = llc->rej_sent_timer.expire / HZ; break;
+ case LLC_OPT_BUSY_TMR_EXP:
+ val = llc->busy_state_timer.expire / HZ; break;
+ case LLC_OPT_TX_WIN:
+ val = llc->k; break;
+ case LLC_OPT_RX_WIN:
+ val = llc->rw; break;
+ case LLC_OPT_PKTINFO:
+ val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0;
+ break;
+ default:
+ rc = -ENOPROTOOPT;
+ goto out;
+ }
+ rc = 0;
+ if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+ rc = -EFAULT;
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static const struct net_proto_family llc_ui_family_ops = {
+ .family = PF_LLC,
+ .create = llc_ui_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops llc_ui_ops = {
+ .family = PF_LLC,
+ .owner = THIS_MODULE,
+ .release = llc_ui_release,
+ .bind = llc_ui_bind,
+ .connect = llc_ui_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = llc_ui_accept,
+ .getname = llc_ui_getname,
+ .poll = datagram_poll,
+ .ioctl = llc_ui_ioctl,
+ .listen = llc_ui_listen,
+ .shutdown = llc_ui_shutdown,
+ .setsockopt = llc_ui_setsockopt,
+ .getsockopt = llc_ui_getsockopt,
+ .sendmsg = llc_ui_sendmsg,
+ .recvmsg = llc_ui_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const char llc_proc_err_msg[] __initconst =
+ KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
+static const char llc_sysctl_err_msg[] __initconst =
+ KERN_CRIT "LLC: Unable to register the sysctl entries\n";
+static const char llc_sock_err_msg[] __initconst =
+ KERN_CRIT "LLC: Unable to register the network family\n";
+
+static int __init llc2_init(void)
+{
+ int rc = proto_register(&llc_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ llc_build_offset_table();
+ llc_station_init();
+ llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+ rc = llc_proc_init();
+ if (rc != 0) {
+ printk(llc_proc_err_msg);
+ goto out_station;
+ }
+ rc = llc_sysctl_init();
+ if (rc) {
+ printk(llc_sysctl_err_msg);
+ goto out_proc;
+ }
+ rc = sock_register(&llc_ui_family_ops);
+ if (rc) {
+ printk(llc_sock_err_msg);
+ goto out_sysctl;
+ }
+ llc_add_pack(LLC_DEST_SAP, llc_sap_handler);
+ llc_add_pack(LLC_DEST_CONN, llc_conn_handler);
+out:
+ return rc;
+out_sysctl:
+ llc_sysctl_exit();
+out_proc:
+ llc_proc_exit();
+out_station:
+ llc_station_exit();
+ proto_unregister(&llc_proto);
+ goto out;
+}
+
+static void __exit llc2_exit(void)
+{
+ llc_station_exit();
+ llc_remove_pack(LLC_DEST_SAP);
+ llc_remove_pack(LLC_DEST_CONN);
+ sock_unregister(PF_LLC);
+ llc_proc_exit();
+ llc_sysctl_exit();
+ proto_unregister(&llc_proto);
+}
+
+module_init(llc2_init);
+module_exit(llc2_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
+MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support");
+MODULE_ALIAS_NETPROTO(PF_LLC);
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
new file mode 100644
index 000000000..ea225bd26
--- /dev/null
+++ b/net/llc/llc_c_ac.c
@@ -0,0 +1,1444 @@
+/*
+ * llc_c_ac.c - actions performed during connection state transition.
+ *
+ * Description:
+ * Functions in this module are implementation of connection component actions
+ * Details of actions can be found in IEEE-802.2 standard document.
+ * All functions have one connection and one event as input argument. All of
+ * them return 0 On success and 1 otherwise.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/llc_conn.h>
+#include <net/llc_sap.h>
+#include <net/sock.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/llc_pdu.h>
+#include <net/llc.h>
+
+
+static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb);
+static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb);
+static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev);
+
+static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb);
+
+static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
+ struct sk_buff *skb);
+
+static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb);
+
+#define INCORRECT 0
+
+int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (llc->remote_busy_flag) {
+ u8 nr;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ llc->remote_busy_flag = 0;
+ del_timer(&llc->busy_state_timer.timer);
+ nr = LLC_I_GET_NR(pdu);
+ llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
+ }
+ return 0;
+}
+
+int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->ind_prim = LLC_CONN_PRIM;
+ return 0;
+}
+
+int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->cfm_prim = LLC_CONN_PRIM;
+ return 0;
+}
+
+static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->cfm_prim = LLC_DATA_PRIM;
+ return 0;
+}
+
+int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb)
+{
+ llc_conn_rtn_pdu(sk, skb);
+ return 0;
+}
+
+int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ u8 reason = 0;
+ int rc = 0;
+
+ if (ev->type == LLC_CONN_EV_TYPE_PDU) {
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ if (LLC_PDU_IS_RSP(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM)
+ reason = LLC_DISC_REASON_RX_DM_RSP_PDU;
+ else if (LLC_PDU_IS_CMD(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC)
+ reason = LLC_DISC_REASON_RX_DISC_CMD_PDU;
+ } else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR)
+ reason = LLC_DISC_REASON_ACK_TMR_EXP;
+ else
+ rc = -EINVAL;
+ if (!rc) {
+ ev->reason = reason;
+ ev->ind_prim = LLC_DISC_PRIM;
+ }
+ return rc;
+}
+
+int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->reason = ev->status;
+ ev->cfm_prim = LLC_DISC_PRIM;
+ return 0;
+}
+
+int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb)
+{
+ u8 reason = 0;
+ int rc = 1;
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+ struct llc_sock *llc = llc_sk(sk);
+
+ switch (ev->type) {
+ case LLC_CONN_EV_TYPE_PDU:
+ if (LLC_PDU_IS_RSP(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR) {
+ reason = LLC_RESET_REASON_LOCAL;
+ rc = 0;
+ } else if (LLC_PDU_IS_CMD(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) {
+ reason = LLC_RESET_REASON_REMOTE;
+ rc = 0;
+ }
+ break;
+ case LLC_CONN_EV_TYPE_ACK_TMR:
+ case LLC_CONN_EV_TYPE_P_TMR:
+ case LLC_CONN_EV_TYPE_REJ_TMR:
+ case LLC_CONN_EV_TYPE_BUSY_TMR:
+ if (llc->retry_count > llc->n2) {
+ reason = LLC_RESET_REASON_LOCAL;
+ rc = 0;
+ }
+ break;
+ }
+ if (!rc) {
+ ev->reason = reason;
+ ev->ind_prim = LLC_RESET_PRIM;
+ }
+ return rc;
+}
+
+int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->reason = 0;
+ ev->cfm_prim = LLC_RESET_PRIM;
+ return 0;
+}
+
+int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ if (LLC_PDU_IS_RSP(pdu) &&
+ LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf)
+ llc_conn_ac_clear_remote_busy(sk, skb);
+ return 0;
+}
+
+int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (llc->data_flag == 2)
+ del_timer(&llc->rej_sent_timer.timer);
+ return 0;
+}
+
+int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_disc_cmd(nskb, 1);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ llc_conn_ac_set_p_flag_1(sk, skb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+ u8 f_bit;
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_dm_rsp(nskb, f_bit);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_dm_rsp(nskb, 1);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ u8 f_bit;
+ int rc = -ENOBUFS;
+ struct sk_buff *nskb;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc->rx_pdu_hdr = *((u32 *)pdu);
+ if (LLC_PDU_IS_CMD(pdu))
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ else
+ f_bit = 0;
+ nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+ sizeof(struct llc_frmr_info));
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
+ llc->vR, INCORRECT);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+ sizeof(struct llc_frmr_info));
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+ struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_frmr_rsp(nskb, pdu, 0, llc->vS,
+ llc->vR, INCORRECT);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+ u8 f_bit;
+ int rc = -ENOBUFS;
+ struct sk_buff *nskb;
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+ sizeof(struct llc_frmr_info));
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
+ llc->vR, INCORRECT);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR);
+ rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+ if (likely(!rc)) {
+ llc_conn_send_pdu(sk, skb);
+ llc_conn_ac_inc_vs_by_1(sk, skb);
+ }
+ return rc;
+}
+
+static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
+ rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+ if (likely(!rc)) {
+ llc_conn_send_pdu(sk, skb);
+ llc_conn_ac_inc_vs_by_1(sk, skb);
+ }
+ return rc;
+}
+
+int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
+ rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+ if (likely(!rc)) {
+ llc_conn_send_pdu(sk, skb);
+ llc_conn_ac_inc_vs_by_1(sk, skb);
+ }
+ return 0;
+}
+
+int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ u8 nr = LLC_I_GET_NR(pdu);
+
+ llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
+ return 0;
+}
+
+int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk,
+ struct sk_buff *skb)
+{
+ u8 nr;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (likely(!rc))
+ llc_conn_send_pdu(sk, nskb);
+ else
+ kfree_skb(skb);
+ }
+ if (rc) {
+ nr = LLC_I_GET_NR(pdu);
+ rc = 0;
+ llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
+ }
+ return rc;
+}
+
+int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ u8 nr = LLC_I_GET_NR(pdu);
+
+ llc_conn_resend_i_pdu_as_rsp(sk, nr, 1);
+ return 0;
+}
+
+int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (!llc->remote_busy_flag) {
+ llc->remote_busy_flag = 1;
+ mod_timer(&llc->busy_state_timer.timer,
+ jiffies + llc->busy_state_timer.expire);
+ }
+ return 0;
+}
+
+int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+ u8 f_bit = 1;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+void llc_conn_set_p_flag(struct sock *sk, u8 value)
+{
+ int state_changed = llc_sk(sk)->p_flag && !value;
+
+ llc_sk(sk)->p_flag = value;
+
+ if (state_changed)
+ sk->sk_state_change(sk);
+}
+
+int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+ u8 *dmac = llc->daddr.mac;
+
+ if (llc->dev->flags & IFF_LOOPBACK)
+ dmac = llc->dev->dev_addr;
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_sabme_cmd(nskb, 1);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ llc_conn_set_p_flag(sk, 1);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+ u8 f_bit;
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ nskb->dev = llc->dev;
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_ua_rsp(nskb, f_bit);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->s_flag = 0;
+ return 0;
+}
+
+int llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->s_flag = 1;
+ return 0;
+}
+
+int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc_conn_set_p_flag(sk, 1);
+ mod_timer(&llc->pf_cycle_timer.timer,
+ jiffies + llc->pf_cycle_timer.expire);
+ return 0;
+}
+
+/**
+ * llc_conn_ac_send_ack_if_needed - check if ack is needed
+ * @sk: current connection structure
+ * @skb: current event
+ *
+ * Checks number of received PDUs which have not been acknowledged, yet,
+ * If number of them reaches to "npta"(Number of PDUs To Acknowledge) then
+ * sends an RR response as acknowledgement for them. Returns 0 for
+ * success, 1 otherwise.
+ */
+int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb)
+{
+ u8 pf_bit;
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc_pdu_decode_pf_bit(skb, &pf_bit);
+ llc->ack_pf |= pf_bit & 1;
+ if (!llc->ack_must_be_send) {
+ llc->first_pdu_Ns = llc->vR;
+ llc->ack_must_be_send = 1;
+ llc->ack_pf = pf_bit & 1;
+ }
+ if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO)
+ % LLC_2_SEQ_NBR_MODULO) >= llc->npta) {
+ llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb);
+ llc->ack_must_be_send = 0;
+ llc->ack_pf = 0;
+ llc_conn_ac_inc_npta_value(sk, skb);
+ }
+ return 0;
+}
+
+/**
+ * llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag
+ * @sk: current connection structure
+ * @skb: current event
+ *
+ * This action resets ack_must_be_send flag of given connection, this flag
+ * indicates if there is any PDU which has not been acknowledged yet.
+ * Returns 0 for success, 1 otherwise.
+ */
+int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->ack_must_be_send = llc_sk(sk)->ack_pf = 0;
+ return 0;
+}
+
+/**
+ * llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs
+ * @sk: current connection structure
+ * @skb: current event
+ *
+ * Sends an I response PDU with f-bit set to ack_pf flag as acknowledge to
+ * all received PDUs which have not been acknowledged, yet. ack_pf flag is
+ * set to one if one PDU with p-bit set to one is received. Returns 0 for
+ * success, 1 otherwise.
+ */
+static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
+ struct sk_buff *skb)
+{
+ int rc;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
+ rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+ if (likely(!rc)) {
+ llc_conn_send_pdu(sk, skb);
+ llc_conn_ac_inc_vs_by_1(sk, skb);
+ }
+ return rc;
+}
+
+/**
+ * llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This action sends an I-format PDU as acknowledge to received PDUs which
+ * have not been acknowledged, yet, if there is any. By using of this
+ * action number of acknowledgements decreases, this technic is called
+ * piggy backing. Returns 0 for success, 1 otherwise.
+ */
+int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (llc->ack_must_be_send) {
+ llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb);
+ llc->ack_must_be_send = 0 ;
+ llc->ack_pf = 0;
+ } else
+ llc_conn_ac_send_i_cmd_p_set_0(sk, skb);
+ return 0;
+}
+
+/**
+ * llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This action sends an RR response with f-bit set to ack_pf flag as
+ * acknowledge to all received PDUs which have not been acknowledged, yet,
+ * if there is any. ack_pf flag indicates if a PDU has been received with
+ * p-bit set to one. Returns 0 for success, 1 otherwise.
+ */
+static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
+ struct sk_buff *skb)
+{
+ int rc = -ENOBUFS;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+
+ if (nskb) {
+ struct llc_sap *sap = llc->sap;
+
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
+ llc->daddr.lsap, LLC_PDU_RSP);
+ llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR);
+ rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+ if (unlikely(rc))
+ goto free;
+ llc_conn_send_pdu(sk, nskb);
+ }
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+/**
+ * llc_conn_ac_inc_npta_value - tries to make value of npta greater
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * After "inc_cntr" times calling of this action, "npta" increase by one.
+ * this action tries to make vale of "npta" greater as possible; number of
+ * acknowledgements decreases by increasing of "npta". Returns 0 for
+ * success, 1 otherwise.
+ */
+static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (!llc->inc_cntr) {
+ llc->dec_step = 0;
+ llc->dec_cntr = llc->inc_cntr = 2;
+ ++llc->npta;
+ if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO;
+ } else
+ --llc->inc_cntr;
+ return 0;
+}
+
+/**
+ * llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * After receiving "dec_cntr" times RR command, this action decreases
+ * "npta" by one. Returns 0 for success, 1 otherwise.
+ */
+int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (!llc->connect_step && !llc->remote_busy_flag) {
+ if (!llc->dec_step) {
+ if (!llc->dec_cntr) {
+ llc->inc_cntr = llc->dec_cntr = 2;
+ if (llc->npta > 0)
+ llc->npta = llc->npta - 1;
+ } else
+ llc->dec_cntr -=1;
+ }
+ } else
+ llc->connect_step = 0 ;
+ return 0;
+}
+
+/**
+ * llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * After receiving "dec_cntr" times RNR command, this action decreases
+ * "npta" by one. Returns 0 for success, 1 otherwise.
+ */
+int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (llc->remote_busy_flag)
+ if (!llc->dec_step) {
+ if (!llc->dec_cntr) {
+ llc->inc_cntr = llc->dec_cntr = 2;
+ if (llc->npta > 0)
+ --llc->npta;
+ } else
+ --llc->dec_cntr;
+ }
+ return 0;
+}
+
+/**
+ * llc_conn_ac_dec_tx_win_size - decreases tx window size
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * After receiving of a REJ command or response, transmit window size is
+ * decreased by number of PDUs which are outstanding yet. Returns 0 for
+ * success, 1 otherwise.
+ */
+int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+ u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q);
+
+ if (llc->k - unacked_pdu < 1)
+ llc->k = 1;
+ else
+ llc->k -= unacked_pdu;
+ return 0;
+}
+
+/**
+ * llc_conn_ac_inc_tx_win_size - tx window size is inc by 1
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * After receiving an RR response with f-bit set to one, transmit window
+ * size is increased by one. Returns 0 for success, 1 otherwise.
+ */
+int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc->k += 1;
+ if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO;
+ return 0;
+}
+
+int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ del_timer(&llc->pf_cycle_timer.timer);
+ del_timer(&llc->ack_timer.timer);
+ del_timer(&llc->rej_sent_timer.timer);
+ del_timer(&llc->busy_state_timer.timer);
+ llc->ack_must_be_send = 0;
+ llc->ack_pf = 0;
+ return 0;
+}
+
+int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ del_timer(&llc->rej_sent_timer.timer);
+ del_timer(&llc->pf_cycle_timer.timer);
+ del_timer(&llc->busy_state_timer.timer);
+ llc->ack_must_be_send = 0;
+ llc->ack_pf = 0;
+ return 0;
+}
+
+int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire);
+ return 0;
+}
+
+int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ mod_timer(&llc->rej_sent_timer.timer,
+ jiffies + llc->rej_sent_timer.expire);
+ return 0;
+}
+
+int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (!timer_pending(&llc->ack_timer.timer))
+ mod_timer(&llc->ack_timer.timer,
+ jiffies + llc->ack_timer.expire);
+ return 0;
+}
+
+int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb)
+{
+ del_timer(&llc_sk(sk)->ack_timer.timer);
+ return 0;
+}
+
+int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ del_timer(&llc->pf_cycle_timer.timer);
+ llc_conn_set_p_flag(sk, 0);
+ return 0;
+}
+
+int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb)
+{
+ del_timer(&llc_sk(sk)->rej_sent_timer.timer);
+ return 0;
+}
+
+int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb)
+{
+ int acked;
+ u16 unacked = 0;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc->last_nr = PDU_SUPV_GET_Nr(pdu);
+ acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked);
+ /* On loopback we don't queue I frames in unack_pdu_q queue. */
+ if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) {
+ llc->retry_count = 0;
+ del_timer(&llc->ack_timer.timer);
+ if (llc->failed_data_req) {
+ /* already, we did not accept data from upper layer
+ * (tx_window full or unacceptable state). Now, we
+ * can send data and must inform to upper layer.
+ */
+ llc->failed_data_req = 0;
+ llc_conn_ac_data_confirm(sk, skb);
+ }
+ if (unacked)
+ mod_timer(&llc->ack_timer.timer,
+ jiffies + llc->ack_timer.expire);
+ } else if (llc->failed_data_req) {
+ u8 f_bit;
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ if (f_bit == 1) {
+ llc->failed_data_req = 0;
+ llc_conn_ac_data_confirm(sk, skb);
+ }
+ }
+ return 0;
+}
+
+int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ if (LLC_PDU_IS_RSP(pdu)) {
+ u8 f_bit;
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ if (f_bit) {
+ llc_conn_set_p_flag(sk, 0);
+ llc_conn_ac_stop_p_timer(sk, skb);
+ }
+ }
+ return 0;
+}
+
+int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->data_flag = 2;
+ return 0;
+}
+
+int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->data_flag = 0;
+ return 0;
+}
+
+int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->data_flag = 1;
+ return 0;
+}
+
+int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (!llc_sk(sk)->data_flag)
+ llc_sk(sk)->data_flag = 1;
+ return 0;
+}
+
+int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_conn_set_p_flag(sk, 0);
+ return 0;
+}
+
+static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_conn_set_p_flag(sk, 1);
+ return 0;
+}
+
+int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->remote_busy_flag = 0;
+ return 0;
+}
+
+int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->cause_flag = 0;
+ return 0;
+}
+
+int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->cause_flag = 1;
+ return 0;
+}
+
+int llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->retry_count = 0;
+ return 0;
+}
+
+int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->retry_count++;
+ return 0;
+}
+
+int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->vR = 0;
+ return 0;
+}
+
+int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->vR = PDU_GET_NEXT_Vr(llc_sk(sk)->vR);
+ return 0;
+}
+
+int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->vS = 0;
+ return 0;
+}
+
+int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->vS = llc_sk(sk)->last_nr;
+ return 0;
+}
+
+static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+ return 0;
+}
+
+static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type)
+{
+ struct sock *sk = (struct sock *)timeout_data;
+ struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
+
+ bh_lock_sock(sk);
+ if (skb) {
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ skb_set_owner_r(skb, sk);
+ ev->type = type;
+ llc_process_tmr_ev(sk, skb);
+ }
+ bh_unlock_sock(sk);
+}
+
+void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data)
+{
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_P_TMR);
+}
+
+void llc_conn_busy_tmr_cb(unsigned long timeout_data)
+{
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_BUSY_TMR);
+}
+
+void llc_conn_ack_tmr_cb(unsigned long timeout_data)
+{
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_ACK_TMR);
+}
+
+void llc_conn_rej_tmr_cb(unsigned long timeout_data)
+{
+ llc_conn_tmr_common_cb(timeout_data, LLC_CONN_EV_TYPE_REJ_TMR);
+}
+
+int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk(sk)->X = llc_sk(sk)->vS;
+ llc_conn_ac_set_vs_nr(sk, skb);
+ return 0;
+}
+
+int llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ u8 nr = PDU_SUPV_GET_Nr(pdu);
+
+ if (llc_circular_between(llc_sk(sk)->vS, nr, llc_sk(sk)->X))
+ llc_conn_ac_set_vs_nr(sk, skb);
+ return 0;
+}
+
+/*
+ * Non-standard actions; these not contained in IEEE specification; for
+ * our own usage
+ */
+/**
+ * llc_conn_disc - removes connection from SAP list and frees it
+ * @sk: closed connection
+ * @skb: occurred event
+ */
+int llc_conn_disc(struct sock *sk, struct sk_buff *skb)
+{
+ /* FIXME: this thing seems to want to die */
+ return 0;
+}
+
+/**
+ * llc_conn_reset - resets connection
+ * @sk : reseting connection.
+ * @skb: occurred event.
+ *
+ * Stop all timers, empty all queues and reset all flags.
+ */
+int llc_conn_reset(struct sock *sk, struct sk_buff *skb)
+{
+ llc_sk_reset(sk);
+ return 0;
+}
+
+/**
+ * llc_circular_between - designates that b is between a and c or not
+ * @a: lower bound
+ * @b: element to see if is between a and b
+ * @c: upper bound
+ *
+ * This function designates that b is between a and c or not (for example,
+ * 0 is between 127 and 1). Returns 1 if b is between a and c, 0
+ * otherwise.
+ */
+u8 llc_circular_between(u8 a, u8 b, u8 c)
+{
+ b = b - a;
+ c = c - a;
+ return b <= c;
+}
+
+/**
+ * llc_process_tmr_ev - timer backend
+ * @sk: active connection
+ * @skb: occurred event
+ *
+ * This function is called from timer callback functions. When connection
+ * is busy (during sending a data frame) timer expiration event must be
+ * queued. Otherwise this event can be sent to connection state machine.
+ * Queued events will process by llc_backlog_rcv function after sending
+ * data frame.
+ */
+static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb)
+{
+ if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) {
+ printk(KERN_WARNING "%s: timer called on closed connection\n",
+ __func__);
+ kfree_skb(skb);
+ } else {
+ if (!sock_owned_by_user(sk))
+ llc_conn_state_process(sk, skb);
+ else {
+ llc_set_backlog_type(skb, LLC_EVENT);
+ __sk_add_backlog(sk, skb);
+ }
+ }
+}
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
new file mode 100644
index 000000000..523fdd1cf
--- /dev/null
+++ b/net/llc/llc_c_ev.c
@@ -0,0 +1,748 @@
+/*
+ * llc_c_ev.c - Connection component state transition event qualifiers
+ *
+ * A 'state' consists of a number of possible event matching functions,
+ * the actions associated with each being executed when that event is
+ * matched; a 'state machine' accepts events in a serial fashion from an
+ * event queue. Each event is passed to each successive event matching
+ * function until a match is made (the event matching function returns
+ * success, or '0') or the list of event matching functions is exhausted.
+ * If a match is made, the actions associated with the event are executed
+ * and the state is changed to that event's transition state. Before some
+ * events are recognized, even after a match has been made, a certain
+ * number of 'event qualifier' functions must also be executed. If these
+ * all execute successfully, then the event is finally executed.
+ *
+ * These event functions must return 0 for success, to show a matched
+ * event, of 1 if the event does not match. Event qualifier functions
+ * must return a 0 for success or a non-zero for failure. Each function
+ * is simply responsible for verifying one single thing and returning
+ * either a success or failure.
+ *
+ * All of followed event functions are described in 802.2 LLC Protocol
+ * standard document except two functions that we added that will explain
+ * in their comments, at below.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <net/llc_conn.h>
+#include <net/llc_sap.h>
+#include <net/sock.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_pdu.h>
+
+#if 1
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/**
+ * llc_util_ns_inside_rx_window - check if sequence number is in rx window
+ * @ns: sequence number of received pdu.
+ * @vr: sequence number which receiver expects to receive.
+ * @rw: receive window size of receiver.
+ *
+ * Checks if sequence number of received PDU is in range of receive
+ * window. Returns 0 for success, 1 otherwise
+ */
+static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw)
+{
+ return !llc_circular_between(vr, ns,
+ (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO);
+}
+
+/**
+ * llc_util_nr_inside_tx_window - check if sequence number is in tx window
+ * @sk: current connection.
+ * @nr: N(R) of received PDU.
+ *
+ * This routine checks if N(R) of received PDU is in range of transmit
+ * window; on the other hand checks if received PDU acknowledges some
+ * outstanding PDUs that are in transmit window. Returns 0 for success, 1
+ * otherwise.
+ */
+static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
+{
+ u8 nr1, nr2;
+ struct sk_buff *skb;
+ struct llc_pdu_sn *pdu;
+ struct llc_sock *llc = llc_sk(sk);
+ int rc = 0;
+
+ if (llc->dev->flags & IFF_LOOPBACK)
+ goto out;
+ rc = 1;
+ if (skb_queue_empty(&llc->pdu_unack_q))
+ goto out;
+ skb = skb_peek(&llc->pdu_unack_q);
+ pdu = llc_pdu_sn_hdr(skb);
+ nr1 = LLC_I_GET_NS(pdu);
+ skb = skb_peek_tail(&llc->pdu_unack_q);
+ pdu = llc_pdu_sn_hdr(skb);
+ nr2 = LLC_I_GET_NS(pdu);
+ rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO);
+out:
+ return rc;
+}
+
+int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->prim == LLC_CONN_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->prim == LLC_DATA_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->prim == LLC_DISC_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->prim == LLC_RESET_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+ ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1;
+}
+
+int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+ ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1;
+}
+
+int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb)
+{
+ return 1;
+}
+
+int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1;
+}
+
+int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1;
+}
+
+int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_0(pdu) &&
+ LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_1(pdu) &&
+ LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_0(pdu) && ns != vr &&
+ !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_1(pdu) && ns != vr &&
+ !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+ const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ ns != vr &&
+ llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+ if (!rc)
+ dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
+ __func__, llc_sk(sk)->state, ns, vr);
+ return rc;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_0(pdu) &&
+ LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_1(pdu) &&
+ LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_0(pdu) && ns != vr &&
+ !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ LLC_I_PF_IS_1(pdu) && ns != vr &&
+ !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr &&
+ !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+}
+
+int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk,
+ struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vr = llc_sk(sk)->vR;
+ const u8 ns = LLC_I_GET_NS(pdu);
+ const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+ ns != vr &&
+ llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1;
+ if (!rc)
+ dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
+ __func__, llc_sk(sk)->state, ns, vr);
+ return rc;
+}
+
+int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_0(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ return llc_conn_space(sk, skb) &&
+ LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+ LLC_S_PF_IS_1(pdu) &&
+ LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1;
+}
+
+int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1;
+}
+
+int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 0 : 1;
+}
+
+int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+ u16 rc = 1;
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ if (LLC_PDU_IS_CMD(pdu)) {
+ if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) {
+ if (LLC_I_PF_IS_1(pdu))
+ rc = 0;
+ } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu))
+ rc = 0;
+ }
+ return rc;
+}
+
+int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ u16 rc = 1;
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ if (LLC_PDU_IS_CMD(pdu)) {
+ if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
+ rc = 0;
+ else if (LLC_PDU_TYPE_IS_U(pdu))
+ switch (LLC_U_PDU_CMD(pdu)) {
+ case LLC_2_PDU_CMD_SABME:
+ case LLC_2_PDU_CMD_DISC:
+ rc = 0;
+ break;
+ }
+ }
+ return rc;
+}
+
+int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+ u16 rc = 1;
+ const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ if (LLC_PDU_IS_RSP(pdu)) {
+ if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
+ rc = 0;
+ else if (LLC_PDU_TYPE_IS_U(pdu))
+ switch (LLC_U_PDU_RSP(pdu)) {
+ case LLC_2_PDU_RSP_UA:
+ case LLC_2_PDU_RSP_DM:
+ case LLC_2_PDU_RSP_FRMR:
+ rc = 0;
+ break;
+ }
+ }
+
+ return rc;
+}
+
+int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk,
+ struct sk_buff *skb)
+{
+ u16 rc = 1;
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vs = llc_sk(sk)->vS;
+ const u8 nr = LLC_I_GET_NR(pdu);
+
+ if (LLC_PDU_IS_CMD(pdu) &&
+ (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
+ nr != vs && llc_util_nr_inside_tx_window(sk, nr)) {
+ dprintk("%s: matched, state=%d, vs=%d, nr=%d\n",
+ __func__, llc_sk(sk)->state, vs, nr);
+ rc = 0;
+ }
+ return rc;
+}
+
+int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk,
+ struct sk_buff *skb)
+{
+ u16 rc = 1;
+ const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ const u8 vs = llc_sk(sk)->vS;
+ const u8 nr = LLC_I_GET_NR(pdu);
+
+ if (LLC_PDU_IS_RSP(pdu) &&
+ (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
+ nr != vs && llc_util_nr_inside_tx_window(sk, nr)) {
+ rc = 0;
+ dprintk("%s: matched, state=%d, vs=%d, nr=%d\n",
+ __func__, llc_sk(sk)->state, vs, nr);
+ }
+ return rc;
+}
+
+int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb)
+{
+ return 0;
+}
+
+int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type != LLC_CONN_EV_TYPE_P_TMR;
+}
+
+int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type != LLC_CONN_EV_TYPE_ACK_TMR;
+}
+
+int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type != LLC_CONN_EV_TYPE_REJ_TMR;
+}
+
+int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR;
+}
+
+int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb)
+{
+ return 1;
+}
+
+int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb)
+{
+ const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+ ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1;
+}
+
+/* Event qualifier functions
+ *
+ * these functions simply verify the value of a state flag associated with
+ * the connection and return either a 0 for success or a non-zero value
+ * for not-success; verify the event is the type we expect
+ */
+int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->data_flag != 1;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->data_flag;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->data_flag != 2;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->p_flag != 1;
+}
+
+/**
+ * conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This function determines when frame which is sent, is last frame of
+ * transmit window, if it is then this function return zero else return
+ * one. This function is used for sending last frame of transmit window
+ * as I-format command with p-bit set to one. Returns 0 if frame is last
+ * frame, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k);
+}
+
+/**
+ * conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This function determines when frame which is sent, isn't last frame of
+ * transmit window, if it isn't then this function return zero else return
+ * one. Returns 0 if frame isn't last frame, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->p_flag;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb)
+{
+ u8 f_bit;
+
+ llc_pdu_decode_pf_bit(skb, &f_bit);
+ return llc_sk(sk)->p_flag == f_bit ? 0 : 1;
+}
+
+int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->remote_busy_flag;
+}
+
+int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return !llc_sk(sk)->remote_busy_flag;
+}
+
+int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb)
+{
+ return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2);
+}
+
+int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb)
+{
+ return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2);
+}
+
+int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return !llc_sk(sk)->s_flag;
+}
+
+int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->s_flag;
+}
+
+int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+ return !llc_sk(sk)->cause_flag;
+}
+
+int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+ return llc_sk(sk)->cause_flag;
+}
+
+int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_CONN;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_DISC;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_FAILED;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_REMOTE_BUSY;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_REFUSE;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_CONFLICT;
+ return 0;
+}
+
+int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->status = LLC_STATUS_RESET_DONE;
+ return 0;
+}
diff --git a/net/llc/llc_c_st.c b/net/llc/llc_c_st.c
new file mode 100644
index 000000000..2467573b5
--- /dev/null
+++ b/net/llc/llc_c_st.c
@@ -0,0 +1,4946 @@
+/*
+ * llc_c_st.c - This module contains state transition of connection component.
+ *
+ * Description of event functions and actions there is in 802.2 LLC standard,
+ * or in "llc_c_ac.c" and "llc_c_ev.c" modules.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/types.h>
+#include <net/llc_if.h>
+#include <net/llc_sap.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+
+#define NONE NULL
+
+/* COMMON CONNECTION STATE transitions
+ * Common transitions for
+ * LLC_CONN_STATE_NORMAL,
+ * LLC_CONN_STATE_BUSY,
+ * LLC_CONN_STATE_REJ,
+ * LLC_CONN_STATE_AWAIT,
+ * LLC_CONN_STATE_AWAIT_BUSY and
+ * LLC_CONN_STATE_AWAIT_REJ states
+ */
+/* State transitions for LLC_CONN_EV_DISC_REQ event */
+static const llc_conn_action_t llc_common_actions_1[] = {
+ [0] = llc_conn_ac_send_disc_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_1 = {
+ .ev = llc_conn_ev_disc_req,
+ .next_state = LLC_CONN_STATE_D_CONN,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RESET_REQ event */
+static const llc_conn_action_t llc_common_actions_2[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_2 = {
+ .ev = llc_conn_ev_rst_req,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_common_actions_3[] = {
+ [0] = llc_conn_ac_stop_all_timers,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [4] = llc_conn_ac_rst_ind,
+ [5] = llc_conn_ac_set_p_flag_0,
+ [6] = llc_conn_ac_set_remote_busy_0,
+ [7] = llc_conn_reset,
+ [8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_3 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_common_actions_4[] = {
+ [0] = llc_conn_ac_stop_all_timers,
+ [1] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [2] = llc_conn_ac_disc_ind,
+ [3] = llc_conn_disc,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_4 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */
+static const llc_conn_action_t llc_common_actions_5[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_rst_ind,
+ [5] = llc_conn_ac_set_cause_flag_0,
+ [6] = llc_conn_reset,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_5 = {
+ .ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */
+static const llc_conn_action_t llc_common_actions_6[] = {
+ [0] = llc_conn_ac_disc_ind,
+ [1] = llc_conn_ac_stop_all_timers,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_6 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_RX_ZZZ_CMD_Pbit_SET_X_INVAL_Nr event */
+static const llc_conn_action_t llc_common_actions_7a[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_7a = {
+ .ev = llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_7a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_X_INVAL_Ns event */
+static const llc_conn_action_t llc_common_actions_7b[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_7b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_7b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_ZZZ_RSP_Fbit_SET_X_INVAL_Nr event */
+static const llc_conn_action_t llc_common_actions_8a[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8a = {
+ .ev = llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_8a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_INVAL_Ns event */
+static const llc_conn_action_t llc_common_actions_8b[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_8b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_BAD_PDU event */
+static const llc_conn_action_t llc_common_actions_8c[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8c = {
+ .ev = llc_conn_ev_rx_bad_pdu,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_8c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */
+static const llc_conn_action_t llc_common_actions_9[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_9 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_common_actions_9,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_1 event */
+#if 0
+static const llc_conn_ev_qfyr_t llc_common_ev_qfyrs_10[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_common_actions_10[] = {
+ [0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_10 = {
+ .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = llc_common_ev_qfyrs_10,
+ .ev_actions = llc_common_actions_10,
+};
+#endif
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11a[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_common_actions_11a[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_0,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11a = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_common_ev_qfyrs_11a,
+ .ev_actions = llc_common_actions_11a,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11b[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_common_actions_11b[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_0,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11b = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_common_ev_qfyrs_11b,
+ .ev_actions = llc_common_actions_11b,
+};
+
+/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11c[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_common_actions_11c[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_0,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11c = {
+ .ev = llc_conn_ev_rej_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_common_ev_qfyrs_11c,
+ .ev_actions = llc_common_actions_11c,
+};
+
+/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11d[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_common_actions_11d[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_stop_other_timers,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_0,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11d = {
+ .ev = llc_conn_ev_busy_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_common_ev_qfyrs_11d,
+ .ev_actions = llc_common_actions_11d,
+};
+
+/*
+ * Common dummy state transition; must be last entry for all state
+ * transition groups - it'll be on .bss, so will be zeroed.
+ */
+static struct llc_conn_state_trans llc_common_state_trans_end;
+
+/* LLC_CONN_STATE_ADM transitions */
+/* State transitions for LLC_CONN_EV_CONN_REQ event */
+static const llc_conn_action_t llc_adm_actions_1[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_set_retry_cnt_0,
+ [3] = llc_conn_ac_set_s_flag_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_1 = {
+ .ev = llc_conn_ev_conn_req,
+ .next_state = LLC_CONN_STATE_SETUP,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_adm_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_adm_actions_2[] = {
+ [0] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_p_flag_0,
+ [5] = llc_conn_ac_set_remote_busy_0,
+ [6] = llc_conn_ac_conn_ind,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_2 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_adm_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_adm_actions_3[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_3 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_adm_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_adm_actions_4[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_1,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_4 = {
+ .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_adm_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_YYY event */
+static const llc_conn_action_t llc_adm_actions_5[] = {
+ [0] = llc_conn_disc,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_5 = {
+ .ev = llc_conn_ev_rx_any_frame,
+ .next_state = LLC_CONN_OUT_OF_SVC,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_adm_actions_5,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_adm_state_transitions[] = {
+ [0] = &llc_adm_state_trans_1, /* Request */
+ [1] = &llc_common_state_trans_end,
+ [2] = &llc_common_state_trans_end, /* local_busy */
+ [3] = &llc_common_state_trans_end, /* init_pf_cycle */
+ [4] = &llc_common_state_trans_end, /* timer */
+ [5] = &llc_adm_state_trans_2, /* Receive frame */
+ [6] = &llc_adm_state_trans_3,
+ [7] = &llc_adm_state_trans_4,
+ [8] = &llc_adm_state_trans_5,
+ [9] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_SETUP transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_setup_actions_1[] = {
+ [0] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_set_s_flag_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_1 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_SETUP,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_setup_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = llc_conn_ev_qlfy_set_status_conn,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_2[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_set_remote_busy_0,
+ [5] = llc_conn_ac_conn_confirm,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_2 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_setup_ev_qfyrs_2,
+ .ev_actions = llc_setup_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_s_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_conn,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_3[] = {
+ [0] = llc_conn_ac_set_p_flag_0,
+ [1] = llc_conn_ac_set_remote_busy_0,
+ [2] = llc_conn_ac_conn_confirm,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_3 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_setup_ev_qfyrs_3,
+ .ev_actions = llc_setup_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_set_status_disc,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_4[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_ac_conn_confirm,
+ [3] = llc_conn_disc,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_4 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_setup_ev_qfyrs_4,
+ .ev_actions = llc_setup_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_5[] = {
+ [0] = llc_conn_ev_qlfy_set_status_disc,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_5[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_conn_confirm,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_5 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_setup_ev_qfyrs_5,
+ .ev_actions = llc_setup_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_7[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = llc_conn_ev_qlfy_s_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_7[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_7 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_SETUP,
+ .ev_qualifiers = llc_setup_ev_qfyrs_7,
+ .ev_actions = llc_setup_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_8[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = llc_conn_ev_qlfy_s_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_set_status_failed,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_setup_actions_8[] = {
+ [0] = llc_conn_ac_conn_confirm,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_8 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_setup_ev_qfyrs_8,
+ .ev_actions = llc_setup_actions_8,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_setup_state_transitions[] = {
+ [0] = &llc_common_state_trans_end, /* Request */
+ [1] = &llc_common_state_trans_end, /* local busy */
+ [2] = &llc_common_state_trans_end, /* init_pf_cycle */
+ [3] = &llc_setup_state_trans_3, /* Timer */
+ [4] = &llc_setup_state_trans_7,
+ [5] = &llc_setup_state_trans_8,
+ [6] = &llc_common_state_trans_end,
+ [7] = &llc_setup_state_trans_1, /* Receive frame */
+ [8] = &llc_setup_state_trans_2,
+ [9] = &llc_setup_state_trans_4,
+ [10] = &llc_setup_state_trans_5,
+ [11] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_NORMAL transitions */
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_last_frame_eq_0,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_1[] = {
+ [0] = llc_conn_ac_send_i_as_ack,
+ [1] = llc_conn_ac_start_ack_tmr_if_not_running,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_1,
+ .ev_actions = llc_normal_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_last_frame_eq_1,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_2[] = {
+ [0] = llc_conn_ac_send_i_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_2 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_2,
+ .ev_actions = llc_normal_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_remote_busy,
+ [2] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_normal_actions_2_1[1];
+
+static struct llc_conn_state_trans llc_normal_state_trans_2_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_2_1,
+ .ev_actions = llc_normal_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_3[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [2] = llc_conn_ac_set_data_flag_0,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_3 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_normal_ev_qfyrs_3,
+ .ev_actions = llc_normal_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_4[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [2] = llc_conn_ac_set_data_flag_0,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_4 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_normal_ev_qfyrs_4,
+ .ev_actions = llc_normal_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_5a[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_start_rej_timer,
+ [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_normal_ev_qfyrs_5a,
+ .ev_actions = llc_normal_actions_5a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_5b[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_start_rej_timer,
+ [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_normal_ev_qfyrs_5b,
+ .ev_actions = llc_normal_actions_5b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_5c[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_start_rej_timer,
+ [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5c = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_normal_ev_qfyrs_5c,
+ .ev_actions = llc_normal_actions_5c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_6a[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_6a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_normal_ev_qfyrs_6a,
+ .ev_actions = llc_normal_actions_6a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_6b[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_6b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_normal_ev_qfyrs_6b,
+ .ev_actions = llc_normal_actions_6b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_normal_actions_7[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rej_rsp_f_set_1,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_7 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_8[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [5] = llc_conn_ac_send_ack_if_needed,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_8a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_8a,
+ .ev_actions = llc_normal_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_8b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_8b,
+ .ev_actions = llc_normal_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_9a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_send_ack_if_needed,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_9a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_9a,
+ .ev_actions = llc_normal_actions_9a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_9b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_send_ack_if_needed,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_9b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_9b,
+ .ev_actions = llc_normal_actions_9b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_normal_actions_10[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_send_ack_rsp_f_set_1,
+ [2] = llc_conn_ac_rst_sendack_flag,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_data_ind,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_10 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_10,
+};
+
+/* State transitions for * LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_normal_actions_11a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_11a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_normal_actions_11b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_11b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_11c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_11c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_inc_tx_win_size,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11c = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_11c,
+ .ev_actions = llc_normal_actions_11c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_normal_actions_12[] = {
+ [0] = llc_conn_ac_send_ack_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_adjust_npta_by_rr,
+ [3] = llc_conn_ac_rst_sendack_flag,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_12 = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_12,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_normal_actions_13a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_13a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_normal_actions_13b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_13b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_13c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_13c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13c = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_13c,
+ .ev_actions = llc_normal_actions_13c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_normal_actions_14[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_adjust_npta_by_rnr,
+ [3] = llc_conn_ac_rst_sendack_flag,
+ [4] = llc_conn_ac_set_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_14 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_14,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_15a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_dec_tx_win_size,
+ [4] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [5] = llc_conn_ac_clear_remote_busy,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_15a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_15a,
+ .ev_actions = llc_normal_actions_15a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_15b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_dec_tx_win_size,
+ [4] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [5] = llc_conn_ac_clear_remote_busy,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_15b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_15b,
+ .ev_actions = llc_normal_actions_15b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_16a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_dec_tx_win_size,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_16a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_16a,
+ .ev_actions = llc_normal_actions_16a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_16b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_dec_tx_win_size,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_16b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_16b,
+ .ev_actions = llc_normal_actions_16b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_normal_actions_17[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_dec_tx_win_size,
+ [3] = llc_conn_ac_resend_i_rsp_f_set_1,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_17 = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_normal_actions_17,
+};
+
+/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_18[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_18[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_18 = {
+ .ev = llc_conn_ev_init_p_f_cycle,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_18,
+ .ev_actions = llc_normal_actions_18,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_19[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_19[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [2] = llc_conn_ac_rst_vs,
+ [3] = llc_conn_ac_start_p_timer,
+ [4] = llc_conn_ac_inc_retry_cnt_by_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_19 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_normal_ev_qfyrs_19,
+ .ev_actions = llc_normal_actions_19,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_20a[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [2] = llc_conn_ac_rst_vs,
+ [3] = llc_conn_ac_start_p_timer,
+ [4] = llc_conn_ac_inc_retry_cnt_by_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_20a = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_normal_ev_qfyrs_20a,
+ .ev_actions = llc_normal_actions_20a,
+};
+
+/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_20b[] = {
+ [0] = llc_conn_ac_rst_sendack_flag,
+ [1] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [2] = llc_conn_ac_rst_vs,
+ [3] = llc_conn_ac_start_p_timer,
+ [4] = llc_conn_ac_inc_retry_cnt_by_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_20b = {
+ .ev = llc_conn_ev_busy_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_normal_ev_qfyrs_20b,
+ .ev_actions = llc_normal_actions_20b,
+};
+
+/* State transitions for LLC_CONN_EV_TX_BUFF_FULL event */
+static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_21[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_normal_actions_21[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_21 = {
+ .ev = llc_conn_ev_tx_buffer_full,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_normal_ev_qfyrs_21,
+ .ev_actions = llc_normal_actions_21,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_normal_state_transitions[] = {
+ [0] = &llc_normal_state_trans_1, /* Requests */
+ [1] = &llc_normal_state_trans_2,
+ [2] = &llc_normal_state_trans_2_1,
+ [3] = &llc_common_state_trans_1,
+ [4] = &llc_common_state_trans_2,
+ [5] = &llc_common_state_trans_end,
+ [6] = &llc_normal_state_trans_21,
+ [7] = &llc_normal_state_trans_3, /* Local busy */
+ [8] = &llc_normal_state_trans_4,
+ [9] = &llc_common_state_trans_end,
+ [10] = &llc_normal_state_trans_18, /* Init pf cycle */
+ [11] = &llc_common_state_trans_end,
+ [12] = &llc_common_state_trans_11a, /* Timers */
+ [13] = &llc_common_state_trans_11b,
+ [14] = &llc_common_state_trans_11c,
+ [15] = &llc_common_state_trans_11d,
+ [16] = &llc_normal_state_trans_19,
+ [17] = &llc_normal_state_trans_20a,
+ [18] = &llc_normal_state_trans_20b,
+ [19] = &llc_common_state_trans_end,
+ [20] = &llc_normal_state_trans_8b, /* Receive frames */
+ [21] = &llc_normal_state_trans_9b,
+ [22] = &llc_normal_state_trans_10,
+ [23] = &llc_normal_state_trans_11b,
+ [24] = &llc_normal_state_trans_11c,
+ [25] = &llc_normal_state_trans_5a,
+ [26] = &llc_normal_state_trans_5b,
+ [27] = &llc_normal_state_trans_5c,
+ [28] = &llc_normal_state_trans_6a,
+ [29] = &llc_normal_state_trans_6b,
+ [30] = &llc_normal_state_trans_7,
+ [31] = &llc_normal_state_trans_8a,
+ [32] = &llc_normal_state_trans_9a,
+ [33] = &llc_normal_state_trans_11a,
+ [34] = &llc_normal_state_trans_12,
+ [35] = &llc_normal_state_trans_13a,
+ [36] = &llc_normal_state_trans_13b,
+ [37] = &llc_normal_state_trans_13c,
+ [38] = &llc_normal_state_trans_14,
+ [39] = &llc_normal_state_trans_15a,
+ [40] = &llc_normal_state_trans_15b,
+ [41] = &llc_normal_state_trans_16a,
+ [42] = &llc_normal_state_trans_16b,
+ [43] = &llc_normal_state_trans_17,
+ [44] = &llc_common_state_trans_3,
+ [45] = &llc_common_state_trans_4,
+ [46] = &llc_common_state_trans_5,
+ [47] = &llc_common_state_trans_6,
+ [48] = &llc_common_state_trans_7a,
+ [49] = &llc_common_state_trans_7b,
+ [50] = &llc_common_state_trans_8a,
+ [51] = &llc_common_state_trans_8b,
+ [52] = &llc_common_state_trans_8c,
+ [53] = &llc_common_state_trans_9,
+ /* [54] = &llc_common_state_trans_10, */
+ [54] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_BUSY transitions */
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_1[] = {
+ [0] = llc_conn_ac_send_i_xxx_x_set_0,
+ [1] = llc_conn_ac_start_ack_tmr_if_not_running,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_1,
+ .ev_actions = llc_busy_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_2[] = {
+ [0] = llc_conn_ac_send_i_xxx_x_set_0,
+ [1] = llc_conn_ac_start_ack_tmr_if_not_running,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_2 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_2,
+ .ev_actions = llc_busy_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_remote_busy,
+ [2] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_busy_actions_2_1[1];
+
+static struct llc_conn_state_trans llc_busy_state_trans_2_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_2_1,
+ .ev_actions = llc_busy_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_3[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_start_rej_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_3 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_busy_ev_qfyrs_3,
+ .ev_actions = llc_busy_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_4[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_start_rej_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_4 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_busy_ev_qfyrs_4,
+ .ev_actions = llc_busy_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_5[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_5[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_5 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_busy_ev_qfyrs_5,
+ .ev_actions = llc_busy_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_6[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_6[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_6 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_busy_ev_qfyrs_6,
+ .ev_actions = llc_busy_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_7[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_2,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_7[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_7 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_busy_ev_qfyrs_7,
+ .ev_actions = llc_busy_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_8[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_2,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_8[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_8 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_busy_ev_qfyrs_8,
+ .ev_actions = llc_busy_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_9a[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_p_flag,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+ [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_9a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_9a,
+ .ev_actions = llc_busy_actions_9a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_9b[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_p_flag,
+ [2] = llc_conn_ac_upd_nr_received,
+ [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+ [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_9b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_9b,
+ .ev_actions = llc_busy_actions_9b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_10a[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_10a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_10a,
+ .ev_actions = llc_busy_actions_10a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_10b[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_10b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_10b,
+ .ev_actions = llc_busy_actions_10b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_busy_actions_11[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_11 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_11,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_busy_actions_12[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_12 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_12,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_13a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+ [6] = llc_conn_ac_set_data_flag_0,
+ [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_13a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_13a,
+ .ev_actions = llc_busy_actions_13a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_13b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+ [6] = llc_conn_ac_set_data_flag_0,
+ [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_13b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_13b,
+ .ev_actions = llc_busy_actions_13b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_14a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_14a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_14a,
+ .ev_actions = llc_busy_actions_14a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_14b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_14b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_14b,
+ .ev_actions = llc_busy_actions_14b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_busy_actions_15a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_15a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_busy_actions_15b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_15b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_15c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_15c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15c = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_15c,
+ .ev_actions = llc_busy_actions_15c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_busy_actions_16[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_16 = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_16,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_busy_actions_17a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_17a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_busy_actions_17b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_17b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_17c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_17c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17c = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_17c,
+ .ev_actions = llc_busy_actions_17c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_busy_actions_18[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_18 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_18,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_19a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_19a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_19a,
+ .ev_actions = llc_busy_actions_19a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_19b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_19b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_19b,
+ .ev_actions = llc_busy_actions_19b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_20a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_20a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_20a,
+ .ev_actions = llc_busy_actions_20a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_20b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_20b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_20b,
+ .ev_actions = llc_busy_actions_20b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_busy_actions_21[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_21 = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_busy_actions_21,
+};
+
+/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_22[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_22[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_22 = {
+ .ev = llc_conn_ev_init_p_f_cycle,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_22,
+ .ev_actions = llc_busy_actions_22,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_23[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_23[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_rst_vs,
+ [2] = llc_conn_ac_start_p_timer,
+ [3] = llc_conn_ac_inc_retry_cnt_by_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_23 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_23,
+ .ev_actions = llc_busy_actions_23,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_24a[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = llc_conn_ac_rst_vs,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_24a = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_24a,
+ .ev_actions = llc_busy_actions_24a,
+};
+
+/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_24b[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = llc_conn_ac_rst_vs,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_24b = {
+ .ev = llc_conn_ev_busy_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_24b,
+ .ev_actions = llc_busy_actions_24b,
+};
+
+/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_25[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_25[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = llc_conn_ac_rst_vs,
+ [4] = llc_conn_ac_set_data_flag_1,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_25 = {
+ .ev = llc_conn_ev_rej_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_25,
+ .ev_actions = llc_busy_actions_25,
+};
+
+/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_26[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_busy_actions_26[] = {
+ [0] = llc_conn_ac_set_data_flag_1,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_26 = {
+ .ev = llc_conn_ev_rej_tmr_exp,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_busy_ev_qfyrs_26,
+ .ev_actions = llc_busy_actions_26,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_busy_state_transitions[] = {
+ [0] = &llc_common_state_trans_1, /* Request */
+ [1] = &llc_common_state_trans_2,
+ [2] = &llc_busy_state_trans_1,
+ [3] = &llc_busy_state_trans_2,
+ [4] = &llc_busy_state_trans_2_1,
+ [5] = &llc_common_state_trans_end,
+ [6] = &llc_busy_state_trans_3, /* Local busy */
+ [7] = &llc_busy_state_trans_4,
+ [8] = &llc_busy_state_trans_5,
+ [9] = &llc_busy_state_trans_6,
+ [10] = &llc_busy_state_trans_7,
+ [11] = &llc_busy_state_trans_8,
+ [12] = &llc_common_state_trans_end,
+ [13] = &llc_busy_state_trans_22, /* Initiate PF cycle */
+ [14] = &llc_common_state_trans_end,
+ [15] = &llc_common_state_trans_11a, /* Timer */
+ [16] = &llc_common_state_trans_11b,
+ [17] = &llc_common_state_trans_11c,
+ [18] = &llc_common_state_trans_11d,
+ [19] = &llc_busy_state_trans_23,
+ [20] = &llc_busy_state_trans_24a,
+ [21] = &llc_busy_state_trans_24b,
+ [22] = &llc_busy_state_trans_25,
+ [23] = &llc_busy_state_trans_26,
+ [24] = &llc_common_state_trans_end,
+ [25] = &llc_busy_state_trans_9a, /* Receive frame */
+ [26] = &llc_busy_state_trans_9b,
+ [27] = &llc_busy_state_trans_10a,
+ [28] = &llc_busy_state_trans_10b,
+ [29] = &llc_busy_state_trans_11,
+ [30] = &llc_busy_state_trans_12,
+ [31] = &llc_busy_state_trans_13a,
+ [32] = &llc_busy_state_trans_13b,
+ [33] = &llc_busy_state_trans_14a,
+ [34] = &llc_busy_state_trans_14b,
+ [35] = &llc_busy_state_trans_15a,
+ [36] = &llc_busy_state_trans_15b,
+ [37] = &llc_busy_state_trans_15c,
+ [38] = &llc_busy_state_trans_16,
+ [39] = &llc_busy_state_trans_17a,
+ [40] = &llc_busy_state_trans_17b,
+ [41] = &llc_busy_state_trans_17c,
+ [42] = &llc_busy_state_trans_18,
+ [43] = &llc_busy_state_trans_19a,
+ [44] = &llc_busy_state_trans_19b,
+ [45] = &llc_busy_state_trans_20a,
+ [46] = &llc_busy_state_trans_20b,
+ [47] = &llc_busy_state_trans_21,
+ [48] = &llc_common_state_trans_3,
+ [49] = &llc_common_state_trans_4,
+ [50] = &llc_common_state_trans_5,
+ [51] = &llc_common_state_trans_6,
+ [52] = &llc_common_state_trans_7a,
+ [53] = &llc_common_state_trans_7b,
+ [54] = &llc_common_state_trans_8a,
+ [55] = &llc_common_state_trans_8b,
+ [56] = &llc_common_state_trans_8c,
+ [57] = &llc_common_state_trans_9,
+ /* [58] = &llc_common_state_trans_10, */
+ [58] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_REJ transitions */
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_1[] = {
+ [0] = llc_conn_ac_send_i_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_1,
+ .ev_actions = llc_reject_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+ [1] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_2[] = {
+ [0] = llc_conn_ac_send_i_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_2 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_2,
+ .ev_actions = llc_reject_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2_1[] = {
+ [0] = llc_conn_ev_qlfy_remote_busy_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_remote_busy,
+ [2] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_reject_actions_2_1[1];
+
+static struct llc_conn_state_trans llc_reject_state_trans_2_1 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_2_1,
+ .ev_actions = llc_reject_actions_2_1,
+};
+
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_3[] = {
+ [0] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_set_data_flag_2,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_3 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_reject_ev_qfyrs_3,
+ .ev_actions = llc_reject_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_4[] = {
+ [0] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_set_data_flag_2,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_4 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = llc_reject_ev_qfyrs_4,
+ .ev_actions = llc_reject_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_reject_actions_5a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_p_flag,
+ [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_5a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_reject_actions_5b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_p_flag,
+ [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_5b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_5c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_5c[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_p_flag,
+ [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5c = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_5c,
+ .ev_actions = llc_reject_actions_5c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_reject_actions_6[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_6 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_7a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_send_ack_xxx_x_set_0,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [6] = llc_conn_ac_stop_rej_timer,
+ [7] = NULL,
+
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_7a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_reject_ev_qfyrs_7a,
+ .ev_actions = llc_reject_actions_7a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_7b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_send_ack_xxx_x_set_0,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+ [6] = llc_conn_ac_stop_rej_timer,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_7b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_reject_ev_qfyrs_7b,
+ .ev_actions = llc_reject_actions_7b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_8a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_ack_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_timer,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_8a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_reject_ev_qfyrs_8a,
+ .ev_actions = llc_reject_actions_8a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_8b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_ack_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_timer,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_8b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_reject_ev_qfyrs_8b,
+ .ev_actions = llc_reject_actions_8b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_reject_actions_9[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_ack_rsp_f_set_1,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_stop_rej_timer,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_9 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_9,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_reject_actions_10a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_10a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_reject_actions_10b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_10b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_10c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_10c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10c = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_10c,
+ .ev_actions = llc_reject_actions_10c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_reject_actions_11[] = {
+ [0] = llc_conn_ac_send_ack_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_11 = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_11,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_reject_actions_12a[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_12a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_reject_actions_12b[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_12b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_12c[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_12c[] = {
+ [0] = llc_conn_ac_upd_p_flag,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12c = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_12c,
+ .ev_actions = llc_reject_actions_12c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_reject_actions_13[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_13 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_13,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_14a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_14a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_14a,
+ .ev_actions = llc_reject_actions_14a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_14b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_p_flag,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_14b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_14b,
+ .ev_actions = llc_reject_actions_14b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_15a[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_15a = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_15a,
+ .ev_actions = llc_reject_actions_15a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_15b[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_15b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_15b,
+ .ev_actions = llc_reject_actions_15b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_reject_actions_16[] = {
+ [0] = llc_conn_ac_set_vs_nr,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_resend_i_rsp_f_set_1,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_16 = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_reject_actions_16,
+};
+
+/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_17[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_17[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_17 = {
+ .ev = llc_conn_ev_init_p_f_cycle,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_17,
+ .ev_actions = llc_reject_actions_17,
+};
+
+/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_18[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_18[] = {
+ [0] = llc_conn_ac_send_rej_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_start_rej_timer,
+ [3] = llc_conn_ac_inc_retry_cnt_by_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_18 = {
+ .ev = llc_conn_ev_rej_tmr_exp,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_18,
+ .ev_actions = llc_reject_actions_18,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_19[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_19[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_start_rej_timer,
+ [3] = llc_conn_ac_inc_retry_cnt_by_1,
+ [4] = llc_conn_ac_rst_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_19 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_19,
+ .ev_actions = llc_reject_actions_19,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20a[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_20a[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_start_rej_timer,
+ [3] = llc_conn_ac_inc_retry_cnt_by_1,
+ [4] = llc_conn_ac_rst_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_20a = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_20a,
+ .ev_actions = llc_reject_actions_20a,
+};
+
+/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20b[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_reject_actions_20b[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_start_rej_timer,
+ [3] = llc_conn_ac_inc_retry_cnt_by_1,
+ [4] = llc_conn_ac_rst_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_20b = {
+ .ev = llc_conn_ev_busy_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_reject_ev_qfyrs_20b,
+ .ev_actions = llc_reject_actions_20b,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_reject_state_transitions[] = {
+ [0] = &llc_common_state_trans_1, /* Request */
+ [1] = &llc_common_state_trans_2,
+ [2] = &llc_common_state_trans_end,
+ [3] = &llc_reject_state_trans_1,
+ [4] = &llc_reject_state_trans_2,
+ [5] = &llc_reject_state_trans_2_1,
+ [6] = &llc_reject_state_trans_3, /* Local busy */
+ [7] = &llc_reject_state_trans_4,
+ [8] = &llc_common_state_trans_end,
+ [9] = &llc_reject_state_trans_17, /* Initiate PF cycle */
+ [10] = &llc_common_state_trans_end,
+ [11] = &llc_common_state_trans_11a, /* Timer */
+ [12] = &llc_common_state_trans_11b,
+ [13] = &llc_common_state_trans_11c,
+ [14] = &llc_common_state_trans_11d,
+ [15] = &llc_reject_state_trans_18,
+ [16] = &llc_reject_state_trans_19,
+ [17] = &llc_reject_state_trans_20a,
+ [18] = &llc_reject_state_trans_20b,
+ [19] = &llc_common_state_trans_end,
+ [20] = &llc_common_state_trans_3, /* Receive frame */
+ [21] = &llc_common_state_trans_4,
+ [22] = &llc_common_state_trans_5,
+ [23] = &llc_common_state_trans_6,
+ [24] = &llc_common_state_trans_7a,
+ [25] = &llc_common_state_trans_7b,
+ [26] = &llc_common_state_trans_8a,
+ [27] = &llc_common_state_trans_8b,
+ [28] = &llc_common_state_trans_8c,
+ [29] = &llc_common_state_trans_9,
+ /* [30] = &llc_common_state_trans_10, */
+ [30] = &llc_reject_state_trans_5a,
+ [31] = &llc_reject_state_trans_5b,
+ [32] = &llc_reject_state_trans_5c,
+ [33] = &llc_reject_state_trans_6,
+ [34] = &llc_reject_state_trans_7a,
+ [35] = &llc_reject_state_trans_7b,
+ [36] = &llc_reject_state_trans_8a,
+ [37] = &llc_reject_state_trans_8b,
+ [38] = &llc_reject_state_trans_9,
+ [39] = &llc_reject_state_trans_10a,
+ [40] = &llc_reject_state_trans_10b,
+ [41] = &llc_reject_state_trans_10c,
+ [42] = &llc_reject_state_trans_11,
+ [43] = &llc_reject_state_trans_12a,
+ [44] = &llc_reject_state_trans_12b,
+ [45] = &llc_reject_state_trans_12c,
+ [46] = &llc_reject_state_trans_13,
+ [47] = &llc_reject_state_trans_14a,
+ [48] = &llc_reject_state_trans_14b,
+ [49] = &llc_reject_state_trans_15a,
+ [50] = &llc_reject_state_trans_15b,
+ [51] = &llc_reject_state_trans_16,
+ [52] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_AWAIT transitions */
+/* State transitions for LLC_CONN_EV_DATA_REQ event */
+static const llc_conn_ev_qfyr_t llc_await_ev_qfyrs_1_0[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_await_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_state_trans_1_0 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_await_ev_qfyrs_1_0,
+ .ev_actions = llc_await_actions_1_0,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_action_t llc_await_actions_1[] = {
+ [0] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_set_data_flag_0,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_1 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_actions_2[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_stop_p_timer,
+ [4] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [5] = llc_conn_ac_start_rej_timer,
+ [6] = llc_conn_ac_clear_remote_busy,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_2 = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_actions_3a[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_3a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_3a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_actions_3b[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_3b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_3b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_actions_4[] = {
+ [0] = llc_conn_ac_send_rej_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_start_rej_timer,
+ [4] = llc_conn_ac_start_p_timer,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_4 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_5[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr,
+ [6] = llc_conn_ac_clear_remote_busy,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_5 = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_6a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_6a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_6a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_6b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_6b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_6b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_7[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_7 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_8a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_8a = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_8a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_8b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_8b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_8b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_9a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_9a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_9b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_9b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_9c[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9c = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_9c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_9d[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9d = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_9d,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_10a[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_10a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_10a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_10b[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_10b = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_10b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_11[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_11 = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_11,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_12a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_12a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_12a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_actions_12b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_12b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_12b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_actions_13[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_13 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_actions_13,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_await_ev_qfyrs_14[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_actions_14[] = {
+ [0] = llc_conn_ac_send_rr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_14 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_await_ev_qfyrs_14,
+ .ev_actions = llc_await_actions_14,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_await_state_transitions[] = {
+ [0] = &llc_common_state_trans_1, /* Request */
+ [1] = &llc_common_state_trans_2,
+ [2] = &llc_await_state_trans_1_0,
+ [3] = &llc_common_state_trans_end,
+ [4] = &llc_await_state_trans_1, /* Local busy */
+ [5] = &llc_common_state_trans_end,
+ [6] = &llc_common_state_trans_end, /* Initiate PF Cycle */
+ [7] = &llc_common_state_trans_11a, /* Timer */
+ [8] = &llc_common_state_trans_11b,
+ [9] = &llc_common_state_trans_11c,
+ [10] = &llc_common_state_trans_11d,
+ [11] = &llc_await_state_trans_14,
+ [12] = &llc_common_state_trans_end,
+ [13] = &llc_common_state_trans_3, /* Receive frame */
+ [14] = &llc_common_state_trans_4,
+ [15] = &llc_common_state_trans_5,
+ [16] = &llc_common_state_trans_6,
+ [17] = &llc_common_state_trans_7a,
+ [18] = &llc_common_state_trans_7b,
+ [19] = &llc_common_state_trans_8a,
+ [20] = &llc_common_state_trans_8b,
+ [21] = &llc_common_state_trans_8c,
+ [22] = &llc_common_state_trans_9,
+ /* [23] = &llc_common_state_trans_10, */
+ [23] = &llc_await_state_trans_2,
+ [24] = &llc_await_state_trans_3a,
+ [25] = &llc_await_state_trans_3b,
+ [26] = &llc_await_state_trans_4,
+ [27] = &llc_await_state_trans_5,
+ [28] = &llc_await_state_trans_6a,
+ [29] = &llc_await_state_trans_6b,
+ [30] = &llc_await_state_trans_7,
+ [31] = &llc_await_state_trans_8a,
+ [32] = &llc_await_state_trans_8b,
+ [33] = &llc_await_state_trans_9a,
+ [34] = &llc_await_state_trans_9b,
+ [35] = &llc_await_state_trans_9c,
+ [36] = &llc_await_state_trans_9d,
+ [37] = &llc_await_state_trans_10a,
+ [38] = &llc_await_state_trans_10b,
+ [39] = &llc_await_state_trans_11,
+ [40] = &llc_await_state_trans_12a,
+ [41] = &llc_await_state_trans_12b,
+ [42] = &llc_await_state_trans_13,
+ [43] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_AWAIT_BUSY transitions */
+/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */
+static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1_0[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_await_busy_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_await_busy_ev_qfyrs_1_0,
+ .ev_actions = llc_await_busy_actions_1_0,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_1,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_busy_actions_1[] = {
+ [0] = llc_conn_ac_send_rej_xxx_x_set_0,
+ [1] = llc_conn_ac_start_rej_timer,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_1 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_await_busy_ev_qfyrs_1,
+ .ev_actions = llc_await_busy_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_0,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_busy_actions_2[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_2 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = llc_await_busy_ev_qfyrs_2,
+ .ev_actions = llc_await_busy_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */
+static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_data_flag_eq_2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_busy_actions_3[] = {
+ [0] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_3 = {
+ .ev = llc_conn_ev_local_busy_cleared,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_await_busy_ev_qfyrs_3,
+ .ev_actions = llc_await_busy_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_busy_actions_4[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_stop_p_timer,
+ [4] = llc_conn_ac_set_data_flag_1,
+ [5] = llc_conn_ac_clear_remote_busy,
+ [6] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_4 = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_busy_actions_5a[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_data_flag_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_5a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_5a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_busy_actions_5b[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_data_flag_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_5b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_5b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_busy_actions_6[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_data_flag_1,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_6 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_7[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_inc_vr_by_1,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_stop_p_timer,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_upd_vs,
+ [6] = llc_conn_ac_set_data_flag_0,
+ [7] = llc_conn_ac_clear_remote_busy,
+ [8] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [9] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_7 = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_8a[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_inc_vr_by_1,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_8a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_8a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_8b[] = {
+ [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_inc_vr_by_1,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_8b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_8b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_9[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_inc_vr_by_1,
+ [2] = llc_conn_ac_data_ind,
+ [3] = llc_conn_ac_upd_nr_received,
+ [4] = llc_conn_ac_upd_vs,
+ [5] = llc_conn_ac_set_data_flag_0,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_9 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_9,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_10a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_10a = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_10a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_10b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_10b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_10b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_11a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_11a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_11b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_11b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_11c[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11c = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_11c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_11d[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11d = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_11d,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_12a[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_12a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_12a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_12b[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_12b = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_12b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_13[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_13 = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_13,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_14a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_14a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_14a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_busy_actions_14b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_14b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_14b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_busy_actions_15[] = {
+ [0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_15 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_busy_actions_15,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_16[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_busy_actions_16[] = {
+ [0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+ [1] = llc_conn_ac_start_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_16 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = llc_await_busy_ev_qfyrs_16,
+ .ev_actions = llc_await_busy_actions_16,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_await_busy_state_transitions[] = {
+ [0] = &llc_common_state_trans_1, /* Request */
+ [1] = &llc_common_state_trans_2,
+ [2] = &llc_await_busy_state_trans_1_0,
+ [3] = &llc_common_state_trans_end,
+ [4] = &llc_await_busy_state_trans_1, /* Local busy */
+ [5] = &llc_await_busy_state_trans_2,
+ [6] = &llc_await_busy_state_trans_3,
+ [7] = &llc_common_state_trans_end,
+ [8] = &llc_common_state_trans_end, /* Initiate PF cycle */
+ [9] = &llc_common_state_trans_11a, /* Timer */
+ [10] = &llc_common_state_trans_11b,
+ [11] = &llc_common_state_trans_11c,
+ [12] = &llc_common_state_trans_11d,
+ [13] = &llc_await_busy_state_trans_16,
+ [14] = &llc_common_state_trans_end,
+ [15] = &llc_await_busy_state_trans_4, /* Receive frame */
+ [16] = &llc_await_busy_state_trans_5a,
+ [17] = &llc_await_busy_state_trans_5b,
+ [18] = &llc_await_busy_state_trans_6,
+ [19] = &llc_await_busy_state_trans_7,
+ [20] = &llc_await_busy_state_trans_8a,
+ [21] = &llc_await_busy_state_trans_8b,
+ [22] = &llc_await_busy_state_trans_9,
+ [23] = &llc_await_busy_state_trans_10a,
+ [24] = &llc_await_busy_state_trans_10b,
+ [25] = &llc_await_busy_state_trans_11a,
+ [26] = &llc_await_busy_state_trans_11b,
+ [27] = &llc_await_busy_state_trans_11c,
+ [28] = &llc_await_busy_state_trans_11d,
+ [29] = &llc_await_busy_state_trans_12a,
+ [30] = &llc_await_busy_state_trans_12b,
+ [31] = &llc_await_busy_state_trans_13,
+ [32] = &llc_await_busy_state_trans_14a,
+ [33] = &llc_await_busy_state_trans_14b,
+ [34] = &llc_await_busy_state_trans_15,
+ [35] = &llc_common_state_trans_3,
+ [36] = &llc_common_state_trans_4,
+ [37] = &llc_common_state_trans_5,
+ [38] = &llc_common_state_trans_6,
+ [39] = &llc_common_state_trans_7a,
+ [40] = &llc_common_state_trans_7b,
+ [41] = &llc_common_state_trans_8a,
+ [42] = &llc_common_state_trans_8b,
+ [43] = &llc_common_state_trans_8c,
+ [44] = &llc_common_state_trans_9,
+ /* [45] = &llc_common_state_trans_10, */
+ [45] = &llc_common_state_trans_end,
+};
+
+/* ----------------- LLC_CONN_STATE_AWAIT_REJ transitions --------------- */
+/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */
+static const llc_conn_ev_qfyr_t llc_await_reject_ev_qfyrs_1_0[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_await_reject_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_await_reject_ev_qfyrs_1_0,
+ .ev_actions = llc_await_reject_actions_1_0,
+};
+
+/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */
+static const llc_conn_action_t llc_await_rejct_actions_1[] = {
+ [0] = llc_conn_ac_send_rnr_xxx_x_set_0,
+ [1] = llc_conn_ac_set_data_flag_2,
+ [2] = NULL
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_1 = {
+ .ev = llc_conn_ev_local_busy_detected,
+ .next_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_rejct_actions_2a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = NULL
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_2a = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_2a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_rejct_actions_2b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = NULL
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_2b = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_2b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_rejct_actions_3[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = NULL
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_3 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_4[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_stop_rej_timer,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_upd_vs,
+ [6] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr,
+ [7] = llc_conn_ac_clear_remote_busy,
+ [8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_4 = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_5a[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [3] = llc_conn_ac_stop_rej_timer,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_upd_vs,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_5a = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_5a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_5b[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_xxx_x_set_0,
+ [3] = llc_conn_ac_stop_rej_timer,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_upd_vs,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_5b = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_5b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_6[] = {
+ [0] = llc_conn_ac_inc_vr_by_1,
+ [1] = llc_conn_ac_data_ind,
+ [2] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [3] = llc_conn_ac_stop_rej_timer,
+ [4] = llc_conn_ac_upd_nr_received,
+ [5] = llc_conn_ac_upd_vs,
+ [6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_6 = {
+ .ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_7a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7a = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_7a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_7b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7b = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_7b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */
+static const llc_conn_action_t llc_await_rejct_actions_7c[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_resend_i_xxx_x_set_0,
+ [4] = llc_conn_ac_clear_remote_busy,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7c = {
+ .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_7c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_8a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_8a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_8b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8b = {
+ .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_8b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_8c[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8c = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_8c,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_8d[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_clear_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8d = {
+ .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_8d,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_9a[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_9a = {
+ .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_9a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_9b[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_clear_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_9b = {
+ .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_9b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_10[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_stop_p_timer,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_10 = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+ .next_state = LLC_CONN_STATE_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_10,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_11a[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_11a = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_11a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static const llc_conn_action_t llc_await_rejct_actions_11b[] = {
+ [0] = llc_conn_ac_upd_nr_received,
+ [1] = llc_conn_ac_upd_vs,
+ [2] = llc_conn_ac_set_remote_busy,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_11b = {
+ .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_11b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static const llc_conn_action_t llc_await_rejct_actions_12[] = {
+ [0] = llc_conn_ac_send_rr_rsp_f_set_1,
+ [1] = llc_conn_ac_upd_nr_received,
+ [2] = llc_conn_ac_upd_vs,
+ [3] = llc_conn_ac_set_remote_busy,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_12 = {
+ .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_await_rejct_actions_12,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_await_rejct_ev_qfyrs_13[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_await_rejct_actions_13[] = {
+ [0] = llc_conn_ac_send_rej_cmd_p_set_1,
+ [1] = llc_conn_ac_stop_p_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = {
+ .ev = llc_conn_ev_p_tmr_exp,
+ .next_state = LLC_CONN_STATE_AWAIT_REJ,
+ .ev_qualifiers = llc_await_rejct_ev_qfyrs_13,
+ .ev_actions = llc_await_rejct_actions_13,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = {
+ [0] = &llc_await_reject_state_trans_1_0,
+ [1] = &llc_common_state_trans_1, /* requests */
+ [2] = &llc_common_state_trans_2,
+ [3] = &llc_common_state_trans_end,
+ [4] = &llc_await_rejct_state_trans_1, /* local busy */
+ [5] = &llc_common_state_trans_end,
+ [6] = &llc_common_state_trans_end, /* Initiate PF cycle */
+ [7] = &llc_await_rejct_state_trans_13, /* timers */
+ [8] = &llc_common_state_trans_11a,
+ [9] = &llc_common_state_trans_11b,
+ [10] = &llc_common_state_trans_11c,
+ [11] = &llc_common_state_trans_11d,
+ [12] = &llc_common_state_trans_end,
+ [13] = &llc_await_rejct_state_trans_2a, /* receive frames */
+ [14] = &llc_await_rejct_state_trans_2b,
+ [15] = &llc_await_rejct_state_trans_3,
+ [16] = &llc_await_rejct_state_trans_4,
+ [17] = &llc_await_rejct_state_trans_5a,
+ [18] = &llc_await_rejct_state_trans_5b,
+ [19] = &llc_await_rejct_state_trans_6,
+ [20] = &llc_await_rejct_state_trans_7a,
+ [21] = &llc_await_rejct_state_trans_7b,
+ [22] = &llc_await_rejct_state_trans_7c,
+ [23] = &llc_await_rejct_state_trans_8a,
+ [24] = &llc_await_rejct_state_trans_8b,
+ [25] = &llc_await_rejct_state_trans_8c,
+ [26] = &llc_await_rejct_state_trans_8d,
+ [27] = &llc_await_rejct_state_trans_9a,
+ [28] = &llc_await_rejct_state_trans_9b,
+ [29] = &llc_await_rejct_state_trans_10,
+ [30] = &llc_await_rejct_state_trans_11a,
+ [31] = &llc_await_rejct_state_trans_11b,
+ [32] = &llc_await_rejct_state_trans_12,
+ [33] = &llc_common_state_trans_3,
+ [34] = &llc_common_state_trans_4,
+ [35] = &llc_common_state_trans_5,
+ [36] = &llc_common_state_trans_6,
+ [37] = &llc_common_state_trans_7a,
+ [38] = &llc_common_state_trans_7b,
+ [39] = &llc_common_state_trans_8a,
+ [40] = &llc_common_state_trans_8b,
+ [41] = &llc_common_state_trans_8c,
+ [42] = &llc_common_state_trans_9,
+ /* [43] = &llc_common_state_trans_10, */
+ [43] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_D_CONN transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event,
+ * cause_flag = 1 */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_conflict,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_1[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_ac_disc_confirm,
+ [3] = llc_conn_disc,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_1 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_1,
+ .ev_actions = llc_d_conn_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1_1[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_set_status_conflict,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_1_1[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_1_1,
+ .ev_actions = llc_d_conn_actions_1_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [2] = llc_conn_ev_qlfy_set_status_disc,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_2[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_disc_confirm,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_2 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_2,
+ .ev_actions = llc_d_conn_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2_1[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_set_status_disc,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_2_1[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_2_1,
+ .ev_actions = llc_d_conn_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_d_conn_actions_3[] = {
+ [0] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_3 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_D_CONN,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_d_conn_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_disc,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_4[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_disc_confirm,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_4 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_4,
+ .ev_actions = llc_d_conn_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4_1[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_set_status_disc,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_4_1[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_4_1,
+ .ev_actions = llc_d_conn_actions_4_1,
+};
+
+/*
+ * State transition for
+ * LLC_CONN_EV_DATA_CONN_REQ event
+ */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_5[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_d_conn_actions_5[1];
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_5 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_D_CONN,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_5,
+ .ev_actions = llc_d_conn_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_6[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_6[] = {
+ [0] = llc_conn_ac_send_disc_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_6 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_D_CONN,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_6,
+ .ev_actions = llc_d_conn_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 1 */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_7[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [2] = llc_conn_ev_qlfy_set_status_failed,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_7[] = {
+ [0] = llc_conn_ac_disc_confirm,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_7 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_7,
+ .ev_actions = llc_d_conn_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 0 */
+static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_8[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_set_status_failed,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_d_conn_actions_8[] = {
+ [0] = llc_conn_disc,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_8 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_d_conn_ev_qfyrs_8,
+ .ev_actions = llc_d_conn_actions_8,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_d_conn_state_transitions[] = {
+ [0] = &llc_d_conn_state_trans_5, /* Request */
+ [1] = &llc_common_state_trans_end,
+ [2] = &llc_common_state_trans_end, /* Local busy */
+ [3] = &llc_common_state_trans_end, /* Initiate PF cycle */
+ [4] = &llc_d_conn_state_trans_6, /* Timer */
+ [5] = &llc_d_conn_state_trans_7,
+ [6] = &llc_d_conn_state_trans_8,
+ [7] = &llc_common_state_trans_end,
+ [8] = &llc_d_conn_state_trans_1, /* Receive frame */
+ [9] = &llc_d_conn_state_trans_1_1,
+ [10] = &llc_d_conn_state_trans_2,
+ [11] = &llc_d_conn_state_trans_2_1,
+ [12] = &llc_d_conn_state_trans_3,
+ [13] = &llc_d_conn_state_trans_4,
+ [14] = &llc_d_conn_state_trans_4_1,
+ [15] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_RESET transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_rst_actions_1[] = {
+ [0] = llc_conn_ac_set_vs_0,
+ [1] = llc_conn_ac_set_vr_0,
+ [2] = llc_conn_ac_set_s_flag_1,
+ [3] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_1 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_rst_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [2] = llc_conn_ev_qlfy_set_status_conn,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_2[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_rst_confirm,
+ [5] = llc_conn_ac_set_remote_busy_0,
+ [6] = llc_conn_reset,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_2 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_rst_ev_qfyrs_2,
+ .ev_actions = llc_rst_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2_1[] = {
+ [0] = llc_conn_ev_qlfy_p_flag_eq_f,
+ [1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_set_status_rst_done,
+ [3] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_2_1[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_ac_set_vs_0,
+ [2] = llc_conn_ac_set_vr_0,
+ [3] = llc_conn_ac_upd_p_flag,
+ [4] = llc_conn_ac_rst_confirm,
+ [5] = llc_conn_ac_set_remote_busy_0,
+ [6] = llc_conn_reset,
+ [7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_2_1 = {
+ .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_rst_ev_qfyrs_2_1,
+ .ev_actions = llc_rst_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_3[] = {
+ [0] = llc_conn_ev_qlfy_s_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_rst_done,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_3[] = {
+ [0] = llc_conn_ac_set_p_flag_0,
+ [1] = llc_conn_ac_set_remote_busy_0,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_3 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = llc_rst_ev_qfyrs_3,
+ .ev_actions = llc_rst_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event,
+ * cause_flag = 1
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_disc,
+ [2] = NULL,
+};
+static const llc_conn_action_t llc_rst_actions_4[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_ac_disc_ind,
+ [2] = llc_conn_ac_stop_ack_timer,
+ [3] = llc_conn_disc,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_4 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_4,
+ .ev_actions = llc_rst_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4_1[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_set_status_refuse,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_4_1[] = {
+ [0] = llc_conn_ac_send_dm_rsp_f_set_p,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_4_1 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_4_1,
+ .ev_actions = llc_rst_actions_4_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [1] = llc_conn_ev_qlfy_set_status_disc,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_5[] = {
+ [0] = llc_conn_ac_disc_ind,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_5 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_5,
+ .ev_actions = llc_rst_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5_1[] = {
+ [0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [1] = llc_conn_ev_qlfy_set_status_refuse,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_5_1[] = {
+ [0] = llc_conn_ac_stop_ack_timer,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_5_1 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_5_1,
+ .ev_actions = llc_rst_actions_5_1,
+};
+
+/* State transitions for DATA_CONN_REQ event */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_6[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_rst_actions_6[1];
+
+static struct llc_conn_state_trans llc_rst_state_trans_6 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_rst_ev_qfyrs_6,
+ .ev_actions = llc_rst_actions_6,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_7[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = llc_conn_ev_qlfy_s_flag_eq_0,
+ [2] = NULL,
+};
+
+static const llc_conn_action_t llc_rst_actions_7[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_7 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_rst_ev_qfyrs_7,
+ .ev_actions = llc_rst_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = llc_conn_ev_qlfy_s_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_cause_flag_eq_1,
+ [3] = llc_conn_ev_qlfy_set_status_failed,
+ [4] = NULL,
+};
+static const llc_conn_action_t llc_rst_actions_8[] = {
+ [0] = llc_conn_ac_disc_ind,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_8 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_8,
+ .ev_actions = llc_rst_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8_1[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = llc_conn_ev_qlfy_s_flag_eq_0,
+ [2] = llc_conn_ev_qlfy_cause_flag_eq_0,
+ [3] = llc_conn_ev_qlfy_set_status_failed,
+ [4] = NULL,
+};
+static const llc_conn_action_t llc_rst_actions_8_1[] = {
+ [0] = llc_conn_ac_disc_ind,
+ [1] = llc_conn_disc,
+ [2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_8_1 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = llc_rst_ev_qfyrs_8_1,
+ .ev_actions = llc_rst_actions_8_1,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_rst_state_transitions[] = {
+ [0] = &llc_rst_state_trans_6, /* Request */
+ [1] = &llc_common_state_trans_end,
+ [2] = &llc_common_state_trans_end, /* Local busy */
+ [3] = &llc_common_state_trans_end, /* Initiate PF cycle */
+ [4] = &llc_rst_state_trans_3, /* Timer */
+ [5] = &llc_rst_state_trans_7,
+ [6] = &llc_rst_state_trans_8,
+ [7] = &llc_rst_state_trans_8_1,
+ [8] = &llc_common_state_trans_end,
+ [9] = &llc_rst_state_trans_1, /* Receive frame */
+ [10] = &llc_rst_state_trans_2,
+ [11] = &llc_rst_state_trans_2_1,
+ [12] = &llc_rst_state_trans_4,
+ [13] = &llc_rst_state_trans_4_1,
+ [14] = &llc_rst_state_trans_5,
+ [15] = &llc_rst_state_trans_5_1,
+ [16] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_ERROR transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_error_actions_1[] = {
+ [0] = llc_conn_ac_set_vs_0,
+ [1] = llc_conn_ac_set_vr_0,
+ [2] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [3] = llc_conn_ac_rst_ind,
+ [4] = llc_conn_ac_set_p_flag_0,
+ [5] = llc_conn_ac_set_remote_busy_0,
+ [6] = llc_conn_ac_stop_ack_timer,
+ [7] = llc_conn_reset,
+ [8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_1 = {
+ .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_NORMAL,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_error_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_error_actions_2[] = {
+ [0] = llc_conn_ac_send_ua_rsp_f_set_p,
+ [1] = llc_conn_ac_disc_ind,
+ [2] = llc_conn_ac_stop_ack_timer,
+ [3] = llc_conn_disc,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_2 = {
+ .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_error_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */
+static const llc_conn_action_t llc_error_actions_3[] = {
+ [0] = llc_conn_ac_disc_ind,
+ [1] = llc_conn_ac_stop_ack_timer,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_3 = {
+ .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_error_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */
+static const llc_conn_action_t llc_error_actions_4[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_set_retry_cnt_0,
+ [3] = llc_conn_ac_set_cause_flag_0,
+ [4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_4 = {
+ .ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_error_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_X event */
+static const llc_conn_action_t llc_error_actions_5[] = {
+ [0] = llc_conn_ac_resend_frmr_rsp_f_set_p,
+ [1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_5 = {
+ .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_x,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_error_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_X event */
+static struct llc_conn_state_trans llc_error_state_trans_6 = {
+ .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_x,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = NONE,
+ .ev_actions = NONE,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_error_ev_qfyrs_7[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_error_actions_7[] = {
+ [0] = llc_conn_ac_resend_frmr_rsp_f_set_0,
+ [1] = llc_conn_ac_start_ack_timer,
+ [2] = llc_conn_ac_inc_retry_cnt_by_1,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_7 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = llc_error_ev_qfyrs_7,
+ .ev_actions = llc_error_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */
+static const llc_conn_ev_qfyr_t llc_error_ev_qfyrs_8[] = {
+ [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+ [1] = NULL,
+};
+
+static const llc_conn_action_t llc_error_actions_8[] = {
+ [0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+ [1] = llc_conn_ac_set_s_flag_0,
+ [2] = llc_conn_ac_start_ack_timer,
+ [3] = llc_conn_ac_set_retry_cnt_0,
+ [4] = llc_conn_ac_set_cause_flag_0,
+ [5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_8 = {
+ .ev = llc_conn_ev_ack_tmr_exp,
+ .next_state = LLC_CONN_STATE_RESET,
+ .ev_qualifiers = llc_error_ev_qfyrs_8,
+ .ev_actions = llc_error_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */
+static const llc_conn_ev_qfyr_t llc_error_ev_qfyrs_9[] = {
+ [0] = llc_conn_ev_qlfy_set_status_refuse,
+ [1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static const llc_conn_action_t llc_error_actions_9[1];
+
+static struct llc_conn_state_trans llc_error_state_trans_9 = {
+ .ev = llc_conn_ev_data_req,
+ .next_state = LLC_CONN_STATE_ERROR,
+ .ev_qualifiers = llc_error_ev_qfyrs_9,
+ .ev_actions = llc_error_actions_9,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_error_state_transitions[] = {
+ [0] = &llc_error_state_trans_9, /* Request */
+ [1] = &llc_common_state_trans_end,
+ [2] = &llc_common_state_trans_end, /* Local busy */
+ [3] = &llc_common_state_trans_end, /* Initiate PF cycle */
+ [4] = &llc_error_state_trans_7, /* Timer */
+ [5] = &llc_error_state_trans_8,
+ [6] = &llc_common_state_trans_end,
+ [7] = &llc_error_state_trans_1, /* Receive frame */
+ [8] = &llc_error_state_trans_2,
+ [9] = &llc_error_state_trans_3,
+ [10] = &llc_error_state_trans_4,
+ [11] = &llc_error_state_trans_5,
+ [12] = &llc_error_state_trans_6,
+ [13] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_TEMP transitions */
+/* State transitions for LLC_CONN_EV_DISC_REQ event */
+static const llc_conn_action_t llc_temp_actions_1[] = {
+ [0] = llc_conn_ac_stop_all_timers,
+ [1] = llc_conn_ac_send_disc_cmd_p_set_x,
+ [2] = llc_conn_disc,
+ [3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_temp_state_trans_1 = {
+ .ev = llc_conn_ev_disc_req,
+ .next_state = LLC_CONN_STATE_ADM,
+ .ev_qualifiers = NONE,
+ .ev_actions = llc_temp_actions_1,
+};
+
+/*
+ * Array of pointers;
+ * one to each transition
+ */
+static struct llc_conn_state_trans *llc_temp_state_transitions[] = {
+ [0] = &llc_temp_state_trans_1, /* requests */
+ [1] = &llc_common_state_trans_end,
+ [2] = &llc_common_state_trans_end, /* local busy */
+ [3] = &llc_common_state_trans_end, /* init_pf_cycle */
+ [4] = &llc_common_state_trans_end, /* timer */
+ [5] = &llc_common_state_trans_end, /* receive */
+};
+
+/* Connection State Transition Table */
+struct llc_conn_state llc_conn_state_table[NBR_CONN_STATES] = {
+ [LLC_CONN_STATE_ADM - 1] = {
+ .current_state = LLC_CONN_STATE_ADM,
+ .transitions = llc_adm_state_transitions,
+ },
+ [LLC_CONN_STATE_SETUP - 1] = {
+ .current_state = LLC_CONN_STATE_SETUP,
+ .transitions = llc_setup_state_transitions,
+ },
+ [LLC_CONN_STATE_NORMAL - 1] = {
+ .current_state = LLC_CONN_STATE_NORMAL,
+ .transitions = llc_normal_state_transitions,
+ },
+ [LLC_CONN_STATE_BUSY - 1] = {
+ .current_state = LLC_CONN_STATE_BUSY,
+ .transitions = llc_busy_state_transitions,
+ },
+ [LLC_CONN_STATE_REJ - 1] = {
+ .current_state = LLC_CONN_STATE_REJ,
+ .transitions = llc_reject_state_transitions,
+ },
+ [LLC_CONN_STATE_AWAIT - 1] = {
+ .current_state = LLC_CONN_STATE_AWAIT,
+ .transitions = llc_await_state_transitions,
+ },
+ [LLC_CONN_STATE_AWAIT_BUSY - 1] = {
+ .current_state = LLC_CONN_STATE_AWAIT_BUSY,
+ .transitions = llc_await_busy_state_transitions,
+ },
+ [LLC_CONN_STATE_AWAIT_REJ - 1] = {
+ .current_state = LLC_CONN_STATE_AWAIT_REJ,
+ .transitions = llc_await_rejct_state_transitions,
+ },
+ [LLC_CONN_STATE_D_CONN - 1] = {
+ .current_state = LLC_CONN_STATE_D_CONN,
+ .transitions = llc_d_conn_state_transitions,
+ },
+ [LLC_CONN_STATE_RESET - 1] = {
+ .current_state = LLC_CONN_STATE_RESET,
+ .transitions = llc_rst_state_transitions,
+ },
+ [LLC_CONN_STATE_ERROR - 1] = {
+ .current_state = LLC_CONN_STATE_ERROR,
+ .transitions = llc_error_state_transitions,
+ },
+ [LLC_CONN_STATE_TEMP - 1] = {
+ .current_state = LLC_CONN_STATE_TEMP,
+ .transitions = llc_temp_state_transitions,
+ },
+};
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
new file mode 100644
index 000000000..81a61fce3
--- /dev/null
+++ b/net/llc/llc_conn.c
@@ -0,0 +1,1016 @@
+/*
+ * llc_conn.c - Driver routines for connection component.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/llc_sap.h>
+#include <net/llc_conn.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/llc_pdu.h>
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+static int llc_find_offset(int state, int ev_type);
+static void llc_conn_send_pdus(struct sock *sk);
+static int llc_conn_service(struct sock *sk, struct sk_buff *skb);
+static int llc_exec_conn_trans_actions(struct sock *sk,
+ struct llc_conn_state_trans *trans,
+ struct sk_buff *ev);
+static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+ struct sk_buff *skb);
+
+/* Offset table on connection states transition diagram */
+static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV];
+
+int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ;
+int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ;
+int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ;
+int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ;
+
+/**
+ * llc_conn_state_process - sends event to connection state machine
+ * @sk: connection
+ * @skb: occurred event
+ *
+ * Sends an event to connection state machine. After processing event
+ * (executing it's actions and changing state), upper layer will be
+ * indicated or confirmed, if needed. Returns 0 for success, 1 for
+ * failure. The socket lock has to be held before calling this function.
+ */
+int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
+{
+ int rc;
+ struct llc_sock *llc = llc_sk(skb->sk);
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ /*
+ * We have to hold the skb, because llc_conn_service will kfree it in
+ * the sending path and we need to look at the skb->cb, where we encode
+ * llc_conn_state_ev.
+ */
+ skb_get(skb);
+ ev->ind_prim = ev->cfm_prim = 0;
+ /*
+ * Send event to state machine
+ */
+ rc = llc_conn_service(skb->sk, skb);
+ if (unlikely(rc != 0)) {
+ printk(KERN_ERR "%s: llc_conn_service failed\n", __func__);
+ goto out_kfree_skb;
+ }
+
+ if (unlikely(!ev->ind_prim && !ev->cfm_prim)) {
+ /* indicate or confirm not required */
+ if (!skb->next)
+ goto out_kfree_skb;
+ goto out_skb_put;
+ }
+
+ if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */
+ skb_get(skb);
+
+ switch (ev->ind_prim) {
+ case LLC_DATA_PRIM:
+ llc_save_primitive(sk, skb, LLC_DATA_PRIM);
+ if (unlikely(sock_queue_rcv_skb(sk, skb))) {
+ /*
+ * shouldn't happen
+ */
+ printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n",
+ __func__);
+ kfree_skb(skb);
+ }
+ break;
+ case LLC_CONN_PRIM:
+ /*
+ * Can't be sock_queue_rcv_skb, because we have to leave the
+ * skb->sk pointing to the newly created struct sock in
+ * llc_conn_handler. -acme
+ */
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_state_change(sk);
+ break;
+ case LLC_DISC_PRIM:
+ sock_hold(sk);
+ if (sk->sk_type == SOCK_STREAM &&
+ sk->sk_state == TCP_ESTABLISHED) {
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sk->sk_socket->state = SS_UNCONNECTED;
+ sk->sk_state = TCP_CLOSE;
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_state_change(sk);
+ }
+ }
+ kfree_skb(skb);
+ sock_put(sk);
+ break;
+ case LLC_RESET_PRIM:
+ /*
+ * FIXME:
+ * RESET is not being notified to upper layers for now
+ */
+ printk(KERN_INFO "%s: received a reset ind!\n", __func__);
+ kfree_skb(skb);
+ break;
+ default:
+ if (ev->ind_prim) {
+ printk(KERN_INFO "%s: received unknown %d prim!\n",
+ __func__, ev->ind_prim);
+ kfree_skb(skb);
+ }
+ /* No indication */
+ break;
+ }
+
+ switch (ev->cfm_prim) {
+ case LLC_DATA_PRIM:
+ if (!llc_data_accept_state(llc->state))
+ sk->sk_write_space(sk);
+ else
+ rc = llc->failed_data_req = 1;
+ break;
+ case LLC_CONN_PRIM:
+ if (sk->sk_type == SOCK_STREAM &&
+ sk->sk_state == TCP_SYN_SENT) {
+ if (ev->status) {
+ sk->sk_socket->state = SS_UNCONNECTED;
+ sk->sk_state = TCP_CLOSE;
+ } else {
+ sk->sk_socket->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ }
+ sk->sk_state_change(sk);
+ }
+ break;
+ case LLC_DISC_PRIM:
+ sock_hold(sk);
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSING) {
+ sk->sk_socket->state = SS_UNCONNECTED;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_state_change(sk);
+ }
+ sock_put(sk);
+ break;
+ case LLC_RESET_PRIM:
+ /*
+ * FIXME:
+ * RESET is not being notified to upper layers for now
+ */
+ printk(KERN_INFO "%s: received a reset conf!\n", __func__);
+ break;
+ default:
+ if (ev->cfm_prim) {
+ printk(KERN_INFO "%s: received unknown %d prim!\n",
+ __func__, ev->cfm_prim);
+ break;
+ }
+ goto out_skb_put; /* No confirmation */
+ }
+out_kfree_skb:
+ kfree_skb(skb);
+out_skb_put:
+ kfree_skb(skb);
+ return rc;
+}
+
+void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
+{
+ /* queue PDU to send to MAC layer */
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ llc_conn_send_pdus(sk);
+}
+
+/**
+ * llc_conn_rtn_pdu - sends received data pdu to upper layer
+ * @sk: Active connection
+ * @skb: Received data frame
+ *
+ * Sends received data pdu to upper layer (by using indicate function).
+ * Prepares service parameters (prim and prim_data). calling indication
+ * function will be done in llc_conn_state_process.
+ */
+void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->ind_prim = LLC_DATA_PRIM;
+}
+
+/**
+ * llc_conn_resend_i_pdu_as_cmd - resend all all unacknowledged I PDUs
+ * @sk: active connection
+ * @nr: NR
+ * @first_p_bit: p_bit value of first pdu
+ *
+ * Resend all unacknowledged I PDUs, starting with the NR; send first as
+ * command PDU with P bit equal first_p_bit; if more than one send
+ * subsequent as command PDUs with P bit equal zero (0).
+ */
+void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit)
+{
+ struct sk_buff *skb;
+ struct llc_pdu_sn *pdu;
+ u16 nbr_unack_pdus;
+ struct llc_sock *llc;
+ u8 howmany_resend = 0;
+
+ llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus);
+ if (!nbr_unack_pdus)
+ goto out;
+ /*
+ * Process unack PDUs only if unack queue is not empty; remove
+ * appropriate PDUs, fix them up, and put them on mac_pdu_q.
+ */
+ llc = llc_sk(sk);
+
+ while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) {
+ pdu = llc_pdu_sn_hdr(skb);
+ llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD);
+ llc_pdu_set_pf_bit(skb, first_p_bit);
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ first_p_bit = 0;
+ llc->vS = LLC_I_GET_NS(pdu);
+ howmany_resend++;
+ }
+ if (howmany_resend > 0)
+ llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+ /* any PDUs to re-send are queued up; start sending to MAC */
+ llc_conn_send_pdus(sk);
+out:;
+}
+
+/**
+ * llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs
+ * @sk: active connection.
+ * @nr: NR
+ * @first_f_bit: f_bit value of first pdu.
+ *
+ * Resend all unacknowledged I PDUs, starting with the NR; send first as
+ * response PDU with F bit equal first_f_bit; if more than one send
+ * subsequent as response PDUs with F bit equal zero (0).
+ */
+void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit)
+{
+ struct sk_buff *skb;
+ u16 nbr_unack_pdus;
+ struct llc_sock *llc = llc_sk(sk);
+ u8 howmany_resend = 0;
+
+ llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus);
+ if (!nbr_unack_pdus)
+ goto out;
+ /*
+ * Process unack PDUs only if unack queue is not empty; remove
+ * appropriate PDUs, fix them up, and put them on mac_pdu_q
+ */
+ while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) {
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP);
+ llc_pdu_set_pf_bit(skb, first_f_bit);
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ first_f_bit = 0;
+ llc->vS = LLC_I_GET_NS(pdu);
+ howmany_resend++;
+ }
+ if (howmany_resend > 0)
+ llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+ /* any PDUs to re-send are queued up; start sending to MAC */
+ llc_conn_send_pdus(sk);
+out:;
+}
+
+/**
+ * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue
+ * @sk: active connection
+ * nr: NR
+ * how_many_unacked: size of pdu_unack_q after removing acked pdus
+ *
+ * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns
+ * the number of pdus that removed from queue.
+ */
+int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked)
+{
+ int pdu_pos, i;
+ struct sk_buff *skb;
+ struct llc_pdu_sn *pdu;
+ int nbr_acked = 0;
+ struct llc_sock *llc = llc_sk(sk);
+ int q_len = skb_queue_len(&llc->pdu_unack_q);
+
+ if (!q_len)
+ goto out;
+ skb = skb_peek(&llc->pdu_unack_q);
+ pdu = llc_pdu_sn_hdr(skb);
+
+ /* finding position of last acked pdu in queue */
+ pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr -
+ (int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO;
+
+ for (i = 0; i < pdu_pos && i < q_len; i++) {
+ skb = skb_dequeue(&llc->pdu_unack_q);
+ kfree_skb(skb);
+ nbr_acked++;
+ }
+out:
+ *how_many_unacked = skb_queue_len(&llc->pdu_unack_q);
+ return nbr_acked;
+}
+
+/**
+ * llc_conn_send_pdus - Sends queued PDUs
+ * @sk: active connection
+ *
+ * Sends queued pdus to MAC layer for transmission.
+ */
+static void llc_conn_send_pdus(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) {
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ if (LLC_PDU_TYPE_IS_I(pdu) &&
+ !(skb->dev->flags & IFF_LOOPBACK)) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb);
+ if (!skb2)
+ break;
+ skb = skb2;
+ }
+ dev_queue_xmit(skb);
+ }
+}
+
+/**
+ * llc_conn_service - finds transition and changes state of connection
+ * @sk: connection
+ * @skb: happened event
+ *
+ * This function finds transition that matches with happened event, then
+ * executes related actions and finally changes state of connection.
+ * Returns 0 for success, 1 for failure.
+ */
+static int llc_conn_service(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = 1;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_conn_state_trans *trans;
+
+ if (llc->state > NBR_CONN_STATES)
+ goto out;
+ rc = 0;
+ trans = llc_qualify_conn_ev(sk, skb);
+ if (trans) {
+ rc = llc_exec_conn_trans_actions(sk, trans, skb);
+ if (!rc && trans->next_state != NO_STATE_CHANGE) {
+ llc->state = trans->next_state;
+ if (!llc_data_accept_state(llc->state))
+ sk->sk_state_change(sk);
+ }
+ }
+out:
+ return rc;
+}
+
+/**
+ * llc_qualify_conn_ev - finds transition for event
+ * @sk: connection
+ * @skb: happened event
+ *
+ * This function finds transition that matches with happened event.
+ * Returns pointer to found transition on success, %NULL otherwise.
+ */
+static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct llc_conn_state_trans **next_trans;
+ const llc_conn_ev_qfyr_t *next_qualifier;
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_conn_state *curr_state =
+ &llc_conn_state_table[llc->state - 1];
+
+ /* search thru events for this state until
+ * list exhausted or until no more
+ */
+ for (next_trans = curr_state->transitions +
+ llc_find_offset(llc->state - 1, ev->type);
+ (*next_trans)->ev; next_trans++) {
+ if (!((*next_trans)->ev)(sk, skb)) {
+ /* got POSSIBLE event match; the event may require
+ * qualification based on the values of a number of
+ * state flags; if all qualifications are met (i.e.,
+ * if all qualifying functions return success, or 0,
+ * then this is THE event we're looking for
+ */
+ for (next_qualifier = (*next_trans)->ev_qualifiers;
+ next_qualifier && *next_qualifier &&
+ !(*next_qualifier)(sk, skb); next_qualifier++)
+ /* nothing */;
+ if (!next_qualifier || !*next_qualifier)
+ /* all qualifiers executed successfully; this is
+ * our transition; return it so we can perform
+ * the associated actions & change the state
+ */
+ return *next_trans;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * llc_exec_conn_trans_actions - executes related actions
+ * @sk: connection
+ * @trans: transition that it's actions must be performed
+ * @skb: event
+ *
+ * Executes actions that is related to happened event. Returns 0 for
+ * success, 1 to indicate failure of at least one action.
+ */
+static int llc_exec_conn_trans_actions(struct sock *sk,
+ struct llc_conn_state_trans *trans,
+ struct sk_buff *skb)
+{
+ int rc = 0;
+ const llc_conn_action_t *next_action;
+
+ for (next_action = trans->ev_actions;
+ next_action && *next_action; next_action++) {
+ int rc2 = (*next_action)(sk, skb);
+
+ if (rc2 == 2) {
+ rc = rc2;
+ break;
+ } else if (rc2)
+ rc = 1;
+ }
+ return rc;
+}
+
+static inline bool llc_estab_match(const struct llc_sap *sap,
+ const struct llc_addr *daddr,
+ const struct llc_addr *laddr,
+ const struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ return llc->laddr.lsap == laddr->lsap &&
+ llc->daddr.lsap == daddr->lsap &&
+ ether_addr_equal(llc->laddr.mac, laddr->mac) &&
+ ether_addr_equal(llc->daddr.mac, daddr->mac);
+}
+
+/**
+ * __llc_lookup_established - Finds connection for the remote/local sap/mac
+ * @sap: SAP
+ * @daddr: address of remote LLC (MAC + SAP)
+ * @laddr: address of local LLC (MAC + SAP)
+ *
+ * Search connection list of the SAP and finds connection using the remote
+ * mac, remote sap, local mac, and local sap. Returns pointer for
+ * connection found, %NULL otherwise.
+ * Caller has to make sure local_bh is disabled.
+ */
+static struct sock *__llc_lookup_established(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
+{
+ struct sock *rc;
+ struct hlist_nulls_node *node;
+ int slot = llc_sk_laddr_hashfn(sap, laddr);
+ struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+ rcu_read_lock();
+again:
+ sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+ if (llc_estab_match(sap, daddr, laddr, rc)) {
+ /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+ goto again;
+ if (unlikely(llc_sk(rc)->sap != sap ||
+ !llc_estab_match(sap, daddr, laddr, rc))) {
+ sock_put(rc);
+ continue;
+ }
+ goto found;
+ }
+ }
+ rc = NULL;
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (unlikely(get_nulls_value(node) != slot))
+ goto again;
+found:
+ rcu_read_unlock();
+ return rc;
+}
+
+struct sock *llc_lookup_established(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __llc_lookup_established(sap, daddr, laddr);
+ local_bh_enable();
+ return sk;
+}
+
+static inline bool llc_listener_match(const struct llc_sap *sap,
+ const struct llc_addr *laddr,
+ const struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN &&
+ llc->laddr.lsap == laddr->lsap &&
+ ether_addr_equal(llc->laddr.mac, laddr->mac);
+}
+
+static struct sock *__llc_lookup_listener(struct llc_sap *sap,
+ struct llc_addr *laddr)
+{
+ struct sock *rc;
+ struct hlist_nulls_node *node;
+ int slot = llc_sk_laddr_hashfn(sap, laddr);
+ struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+ rcu_read_lock();
+again:
+ sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+ if (llc_listener_match(sap, laddr, rc)) {
+ /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+ goto again;
+ if (unlikely(llc_sk(rc)->sap != sap ||
+ !llc_listener_match(sap, laddr, rc))) {
+ sock_put(rc);
+ continue;
+ }
+ goto found;
+ }
+ }
+ rc = NULL;
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (unlikely(get_nulls_value(node) != slot))
+ goto again;
+found:
+ rcu_read_unlock();
+ return rc;
+}
+
+/**
+ * llc_lookup_listener - Finds listener for local MAC + SAP
+ * @sap: SAP
+ * @laddr: address of local LLC (MAC + SAP)
+ *
+ * Search connection list of the SAP and finds connection listening on
+ * local mac, and local sap. Returns pointer for parent socket found,
+ * %NULL otherwise.
+ * Caller has to make sure local_bh is disabled.
+ */
+static struct sock *llc_lookup_listener(struct llc_sap *sap,
+ struct llc_addr *laddr)
+{
+ static struct llc_addr null_addr;
+ struct sock *rc = __llc_lookup_listener(sap, laddr);
+
+ if (!rc)
+ rc = __llc_lookup_listener(sap, &null_addr);
+
+ return rc;
+}
+
+static struct sock *__llc_lookup(struct llc_sap *sap,
+ struct llc_addr *daddr,
+ struct llc_addr *laddr)
+{
+ struct sock *sk = __llc_lookup_established(sap, daddr, laddr);
+
+ return sk ? : llc_lookup_listener(sap, laddr);
+}
+
+/**
+ * llc_data_accept_state - designates if in this state data can be sent.
+ * @state: state of connection.
+ *
+ * Returns 0 if data can be sent, 1 otherwise.
+ */
+u8 llc_data_accept_state(u8 state)
+{
+ return state != LLC_CONN_STATE_NORMAL && state != LLC_CONN_STATE_BUSY &&
+ state != LLC_CONN_STATE_REJ;
+}
+
+/**
+ * llc_find_next_offset - finds offset for next category of transitions
+ * @state: state table.
+ * @offset: start offset.
+ *
+ * Finds offset of next category of transitions in transition table.
+ * Returns the start index of next category.
+ */
+static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset)
+{
+ u16 cnt = 0;
+ struct llc_conn_state_trans **next_trans;
+
+ for (next_trans = state->transitions + offset;
+ (*next_trans)->ev; next_trans++)
+ ++cnt;
+ return cnt;
+}
+
+/**
+ * llc_build_offset_table - builds offset table of connection
+ *
+ * Fills offset table of connection state transition table
+ * (llc_offset_table).
+ */
+void __init llc_build_offset_table(void)
+{
+ struct llc_conn_state *curr_state;
+ int state, ev_type, next_offset;
+
+ for (state = 0; state < NBR_CONN_STATES; state++) {
+ curr_state = &llc_conn_state_table[state];
+ next_offset = 0;
+ for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) {
+ llc_offset_table[state][ev_type] = next_offset;
+ next_offset += llc_find_next_offset(curr_state,
+ next_offset) + 1;
+ }
+ }
+}
+
+/**
+ * llc_find_offset - finds start offset of category of transitions
+ * @state: state of connection
+ * @ev_type: type of happened event
+ *
+ * Finds start offset of desired category of transitions. Returns the
+ * desired start offset.
+ */
+static int llc_find_offset(int state, int ev_type)
+{
+ int rc = 0;
+ /* at this stage, llc_offset_table[..][2] is not important. it is for
+ * init_pf_cycle and I don't know what is it.
+ */
+ switch (ev_type) {
+ case LLC_CONN_EV_TYPE_PRIM:
+ rc = llc_offset_table[state][0]; break;
+ case LLC_CONN_EV_TYPE_PDU:
+ rc = llc_offset_table[state][4]; break;
+ case LLC_CONN_EV_TYPE_SIMPLE:
+ rc = llc_offset_table[state][1]; break;
+ case LLC_CONN_EV_TYPE_P_TMR:
+ case LLC_CONN_EV_TYPE_ACK_TMR:
+ case LLC_CONN_EV_TYPE_REJ_TMR:
+ case LLC_CONN_EV_TYPE_BUSY_TMR:
+ rc = llc_offset_table[state][3]; break;
+ }
+ return rc;
+}
+
+/**
+ * llc_sap_add_socket - adds a socket to a SAP
+ * @sap: SAP
+ * @sk: socket
+ *
+ * This function adds a socket to the hash tables of a SAP.
+ */
+void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+ struct hlist_head *dev_hb = llc_sk_dev_hash(sap, llc->dev->ifindex);
+ struct hlist_nulls_head *laddr_hb = llc_sk_laddr_hash(sap, &llc->laddr);
+
+ llc_sap_hold(sap);
+ llc_sk(sk)->sap = sap;
+
+ spin_lock_bh(&sap->sk_lock);
+ sap->sk_count++;
+ sk_nulls_add_node_rcu(sk, laddr_hb);
+ hlist_add_head(&llc->dev_hash_node, dev_hb);
+ spin_unlock_bh(&sap->sk_lock);
+}
+
+/**
+ * llc_sap_remove_socket - removes a socket from SAP
+ * @sap: SAP
+ * @sk: socket
+ *
+ * This function removes a connection from the hash tables of a SAP if
+ * the connection was in this list.
+ */
+void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ spin_lock_bh(&sap->sk_lock);
+ sk_nulls_del_node_init_rcu(sk);
+ hlist_del(&llc->dev_hash_node);
+ sap->sk_count--;
+ spin_unlock_bh(&sap->sk_lock);
+ llc_sap_put(sap);
+}
+
+/**
+ * llc_conn_rcv - sends received pdus to the connection state machine
+ * @sk: current connection structure.
+ * @skb: received frame.
+ *
+ * Sends received pdus to the connection state machine.
+ */
+static int llc_conn_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->type = LLC_CONN_EV_TYPE_PDU;
+ ev->reason = 0;
+ return llc_conn_state_process(sk, skb);
+}
+
+static struct sock *llc_create_incoming_sock(struct sock *sk,
+ struct net_device *dev,
+ struct llc_addr *saddr,
+ struct llc_addr *daddr)
+{
+ struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC,
+ sk->sk_prot);
+ struct llc_sock *newllc, *llc = llc_sk(sk);
+
+ if (!newsk)
+ goto out;
+ newllc = llc_sk(newsk);
+ memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr));
+ memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr));
+ newllc->dev = dev;
+ dev_hold(dev);
+ llc_sap_add_socket(llc->sap, newsk);
+ llc_sap_hold(llc->sap);
+out:
+ return newsk;
+}
+
+void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_addr saddr, daddr;
+ struct sock *sk;
+
+ llc_pdu_decode_sa(skb, saddr.mac);
+ llc_pdu_decode_ssap(skb, &saddr.lsap);
+ llc_pdu_decode_da(skb, daddr.mac);
+ llc_pdu_decode_dsap(skb, &daddr.lsap);
+
+ sk = __llc_lookup(sap, &saddr, &daddr);
+ if (!sk)
+ goto drop;
+
+ bh_lock_sock(sk);
+ /*
+ * This has to be done here and not at the upper layer ->accept
+ * method because of the way the PROCOM state machine works:
+ * it needs to set several state variables (see, for instance,
+ * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to
+ * the originator of the new connection, and this state has to be
+ * in the newly created struct sock private area. -acme
+ */
+ if (unlikely(sk->sk_state == TCP_LISTEN)) {
+ struct sock *newsk = llc_create_incoming_sock(sk, skb->dev,
+ &saddr, &daddr);
+ if (!newsk)
+ goto drop_unlock;
+ skb_set_owner_r(skb, newsk);
+ } else {
+ /*
+ * Can't be skb_set_owner_r, this will be done at the
+ * llc_conn_state_process function, later on, when we will use
+ * skb_queue_rcv_skb to send it to upper layers, this is
+ * another trick required to cope with how the PROCOM state
+ * machine works. -acme
+ */
+ skb->sk = sk;
+ }
+ if (!sock_owned_by_user(sk))
+ llc_conn_rcv(sk, skb);
+ else {
+ dprintk("%s: adding to backlog...\n", __func__);
+ llc_set_backlog_type(skb, LLC_PACKET);
+ if (sk_add_backlog(sk, skb, sk->sk_rcvbuf))
+ goto drop_unlock;
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ return;
+drop:
+ kfree_skb(skb);
+ return;
+drop_unlock:
+ kfree_skb(skb);
+ goto out;
+}
+
+#undef LLC_REFCNT_DEBUG
+#ifdef LLC_REFCNT_DEBUG
+static atomic_t llc_sock_nr;
+#endif
+
+/**
+ * llc_backlog_rcv - Processes rx frames and expired timers.
+ * @sk: LLC sock (p8022 connection)
+ * @skb: queued rx frame or event
+ *
+ * This function processes frames that has received and timers that has
+ * expired during sending an I pdu (refer to data_req_handler). frames
+ * queue by llc_rcv function (llc_mac.c) and timers queue by timer
+ * callback functions(llc_c_ac.c).
+ */
+static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int rc = 0;
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (likely(llc_backlog_type(skb) == LLC_PACKET)) {
+ if (likely(llc->state > 1)) /* not closed */
+ rc = llc_conn_rcv(sk, skb);
+ else
+ goto out_kfree_skb;
+ } else if (llc_backlog_type(skb) == LLC_EVENT) {
+ /* timer expiration event */
+ if (likely(llc->state > 1)) /* not closed */
+ rc = llc_conn_state_process(sk, skb);
+ else
+ goto out_kfree_skb;
+ } else {
+ printk(KERN_ERR "%s: invalid skb in backlog\n", __func__);
+ goto out_kfree_skb;
+ }
+out:
+ return rc;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+
+/**
+ * llc_sk_init - Initializes a socket with default llc values.
+ * @sk: socket to initialize.
+ *
+ * Initializes a socket with default llc values.
+ */
+static void llc_sk_init(struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc->state = LLC_CONN_STATE_ADM;
+ llc->inc_cntr = llc->dec_cntr = 2;
+ llc->dec_step = llc->connect_step = 1;
+
+ setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb,
+ (unsigned long)sk);
+ llc->ack_timer.expire = sysctl_llc2_ack_timeout;
+
+ setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb,
+ (unsigned long)sk);
+ llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout;
+
+ setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb,
+ (unsigned long)sk);
+ llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout;
+
+ setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb,
+ (unsigned long)sk);
+ llc->busy_state_timer.expire = sysctl_llc2_busy_timeout;
+
+ llc->n2 = 2; /* max retransmit */
+ llc->k = 2; /* tx win size, will adjust dynam */
+ llc->rw = 128; /* rx win size (opt and equal to
+ * tx_win of remote LLC) */
+ skb_queue_head_init(&llc->pdu_unack_q);
+ sk->sk_backlog_rcv = llc_backlog_rcv;
+}
+
+/**
+ * llc_sk_alloc - Allocates LLC sock
+ * @family: upper layer protocol family
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Allocates a LLC sock and initializes it. Returns the new LLC sock
+ * or %NULL if there's no memory available for one
+ */
+struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot)
+{
+ struct sock *sk = sk_alloc(net, family, priority, prot);
+
+ if (!sk)
+ goto out;
+ llc_sk_init(sk);
+ sock_init_data(NULL, sk);
+#ifdef LLC_REFCNT_DEBUG
+ atomic_inc(&llc_sock_nr);
+ printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk,
+ __func__, atomic_read(&llc_sock_nr));
+#endif
+out:
+ return sk;
+}
+
+/**
+ * llc_sk_free - Frees a LLC socket
+ * @sk - socket to free
+ *
+ * Frees a LLC socket
+ */
+void llc_sk_free(struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc->state = LLC_CONN_OUT_OF_SVC;
+ /* Stop all (possibly) running timers */
+ llc_conn_ac_stop_all_timers(sk, NULL);
+#ifdef DEBUG_LLC_CONN_ALLOC
+ printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__,
+ skb_queue_len(&llc->pdu_unack_q),
+ skb_queue_len(&sk->sk_write_queue));
+#endif
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+ skb_queue_purge(&llc->pdu_unack_q);
+#ifdef LLC_REFCNT_DEBUG
+ if (atomic_read(&sk->sk_refcnt) != 1) {
+ printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n",
+ sk, __func__, atomic_read(&sk->sk_refcnt));
+ printk(KERN_DEBUG "%d LLC sockets are still alive\n",
+ atomic_read(&llc_sock_nr));
+ } else {
+ atomic_dec(&llc_sock_nr);
+ printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk,
+ __func__, atomic_read(&llc_sock_nr));
+ }
+#endif
+ sock_put(sk);
+}
+
+/**
+ * llc_sk_reset - resets a connection
+ * @sk: LLC socket to reset
+ *
+ * Resets a connection to the out of service state. Stops its timers
+ * and frees any frames in the queues of the connection.
+ */
+void llc_sk_reset(struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ llc_conn_ac_stop_all_timers(sk, NULL);
+ skb_queue_purge(&sk->sk_write_queue);
+ skb_queue_purge(&llc->pdu_unack_q);
+ llc->remote_busy_flag = 0;
+ llc->cause_flag = 0;
+ llc->retry_count = 0;
+ llc_conn_set_p_flag(sk, 0);
+ llc->f_flag = 0;
+ llc->s_flag = 0;
+ llc->ack_pf = 0;
+ llc->first_pdu_Ns = 0;
+ llc->ack_must_be_send = 0;
+ llc->dec_step = 1;
+ llc->inc_cntr = 2;
+ llc->dec_cntr = 2;
+ llc->X = 0;
+ llc->failed_data_req = 0 ;
+ llc->last_nr = 0;
+}
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
new file mode 100644
index 000000000..842851cef
--- /dev/null
+++ b/net/llc/llc_core.c
@@ -0,0 +1,168 @@
+/*
+ * llc_core.c - Minimum needed routines for sap handling and module init/exit
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+
+LIST_HEAD(llc_sap_list);
+static DEFINE_SPINLOCK(llc_sap_list_lock);
+
+/**
+ * llc_sap_alloc - allocates and initializes sap.
+ *
+ * Allocates and initializes sap.
+ */
+static struct llc_sap *llc_sap_alloc(void)
+{
+ struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC);
+ int i;
+
+ if (sap) {
+ /* sap->laddr.mac - leave as a null, it's filled by bind */
+ sap->state = LLC_SAP_STATE_ACTIVE;
+ spin_lock_init(&sap->sk_lock);
+ for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++)
+ INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i);
+ atomic_set(&sap->refcnt, 1);
+ }
+ return sap;
+}
+
+static struct llc_sap *__llc_sap_find(unsigned char sap_value)
+{
+ struct llc_sap *sap;
+
+ list_for_each_entry(sap, &llc_sap_list, node)
+ if (sap->laddr.lsap == sap_value)
+ goto out;
+ sap = NULL;
+out:
+ return sap;
+}
+
+/**
+ * llc_sap_find - searchs a SAP in station
+ * @sap_value: sap to be found
+ *
+ * Searchs for a sap in the sap list of the LLC's station upon the sap ID.
+ * If the sap is found it will be refcounted and the user will have to do
+ * a llc_sap_put after use.
+ * Returns the sap or %NULL if not found.
+ */
+struct llc_sap *llc_sap_find(unsigned char sap_value)
+{
+ struct llc_sap *sap;
+
+ rcu_read_lock_bh();
+ sap = __llc_sap_find(sap_value);
+ if (sap)
+ llc_sap_hold(sap);
+ rcu_read_unlock_bh();
+ return sap;
+}
+
+/**
+ * llc_sap_open - open interface to the upper layers.
+ * @lsap: SAP number.
+ * @func: rcv func for datalink protos
+ *
+ * Interface function to upper layer. Each one who wants to get a SAP
+ * (for example NetBEUI) should call this function. Returns the opened
+ * SAP for success, NULL for failure.
+ */
+struct llc_sap *llc_sap_open(unsigned char lsap,
+ int (*func)(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *pt,
+ struct net_device *orig_dev))
+{
+ struct llc_sap *sap = NULL;
+
+ spin_lock_bh(&llc_sap_list_lock);
+ if (__llc_sap_find(lsap)) /* SAP already exists */
+ goto out;
+ sap = llc_sap_alloc();
+ if (!sap)
+ goto out;
+ sap->laddr.lsap = lsap;
+ sap->rcv_func = func;
+ list_add_tail_rcu(&sap->node, &llc_sap_list);
+out:
+ spin_unlock_bh(&llc_sap_list_lock);
+ return sap;
+}
+
+/**
+ * llc_sap_close - close interface for upper layers.
+ * @sap: SAP to be closed.
+ *
+ * Close interface function to upper layer. Each one who wants to
+ * close an open SAP (for example NetBEUI) should call this function.
+ * Removes this sap from the list of saps in the station and then
+ * frees the memory for this sap.
+ */
+void llc_sap_close(struct llc_sap *sap)
+{
+ WARN_ON(sap->sk_count);
+
+ spin_lock_bh(&llc_sap_list_lock);
+ list_del_rcu(&sap->node);
+ spin_unlock_bh(&llc_sap_list_lock);
+
+ synchronize_rcu();
+
+ kfree(sap);
+}
+
+static struct packet_type llc_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_802_2),
+ .func = llc_rcv,
+};
+
+static struct packet_type llc_tr_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_TR_802_2),
+ .func = llc_rcv,
+};
+
+static int __init llc_init(void)
+{
+ dev_add_pack(&llc_packet_type);
+ dev_add_pack(&llc_tr_packet_type);
+ return 0;
+}
+
+static void __exit llc_exit(void)
+{
+ dev_remove_pack(&llc_packet_type);
+ dev_remove_pack(&llc_tr_packet_type);
+}
+
+module_init(llc_init);
+module_exit(llc_exit);
+
+EXPORT_SYMBOL(llc_sap_list);
+EXPORT_SYMBOL(llc_sap_find);
+EXPORT_SYMBOL(llc_sap_open);
+EXPORT_SYMBOL(llc_sap_close);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
+MODULE_DESCRIPTION("LLC IEEE 802.2 core support");
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
new file mode 100644
index 000000000..6daf391b3
--- /dev/null
+++ b/net/llc/llc_if.c
@@ -0,0 +1,154 @@
+/*
+ * llc_if.c - Defines LLC interface to upper layer
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/errno.h>
+#include <net/llc_if.h>
+#include <net/llc_sap.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_conn.h>
+#include <net/sock.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/tcp_states.h>
+
+/**
+ * llc_build_and_send_pkt - Connection data sending for upper layers.
+ * @sk: connection
+ * @skb: packet to send
+ *
+ * This function is called when upper layer wants to send data using
+ * connection oriented communication mode. During sending data, connection
+ * will be locked and received frames and expired timers will be queued.
+ * Returns 0 for success, -ECONNABORTED when the connection already
+ * closed and -EBUSY when sending data is not permitted in this state or
+ * LLC has send an I pdu with p bit set to 1 and is waiting for it's
+ * response.
+ */
+int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
+{
+ struct llc_conn_state_ev *ev;
+ int rc = -ECONNABORTED;
+ struct llc_sock *llc = llc_sk(sk);
+
+ if (unlikely(llc->state == LLC_CONN_STATE_ADM))
+ goto out;
+ rc = -EBUSY;
+ if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */
+ llc->p_flag)) {
+ llc->failed_data_req = 1;
+ goto out;
+ }
+ ev = llc_conn_ev(skb);
+ ev->type = LLC_CONN_EV_TYPE_PRIM;
+ ev->prim = LLC_DATA_PRIM;
+ ev->prim_type = LLC_PRIM_TYPE_REQ;
+ skb->dev = llc->dev;
+ rc = llc_conn_state_process(sk, skb);
+out:
+ return rc;
+}
+
+/**
+ * llc_establish_connection - Called by upper layer to establish a conn
+ * @sk: connection
+ * @lmac: local mac address
+ * @dmac: destination mac address
+ * @dsap: destination sap
+ *
+ * Upper layer calls this to establish an LLC connection with a remote
+ * machine. This function packages a proper event and sends it connection
+ * component state machine. Success or failure of connection
+ * establishment will inform to upper layer via calling it's confirm
+ * function and passing proper information.
+ */
+int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap)
+{
+ int rc = -EISCONN;
+ struct llc_addr laddr, daddr;
+ struct sk_buff *skb;
+ struct llc_sock *llc = llc_sk(sk);
+ struct sock *existing;
+
+ laddr.lsap = llc->sap->laddr.lsap;
+ daddr.lsap = dsap;
+ memcpy(daddr.mac, dmac, sizeof(daddr.mac));
+ memcpy(laddr.mac, lmac, sizeof(laddr.mac));
+ existing = llc_lookup_established(llc->sap, &daddr, &laddr);
+ if (existing) {
+ if (existing->sk_state == TCP_ESTABLISHED) {
+ sk = existing;
+ goto out_put;
+ } else
+ sock_put(existing);
+ }
+ sock_hold(sk);
+ rc = -ENOMEM;
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (skb) {
+ struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+ ev->type = LLC_CONN_EV_TYPE_PRIM;
+ ev->prim = LLC_CONN_PRIM;
+ ev->prim_type = LLC_PRIM_TYPE_REQ;
+ skb_set_owner_w(skb, sk);
+ rc = llc_conn_state_process(sk, skb);
+ }
+out_put:
+ sock_put(sk);
+ return rc;
+}
+
+/**
+ * llc_send_disc - Called by upper layer to close a connection
+ * @sk: connection to be closed
+ *
+ * Upper layer calls this when it wants to close an established LLC
+ * connection with a remote machine. This function packages a proper event
+ * and sends it to connection component state machine. Returns 0 for
+ * success, 1 otherwise.
+ */
+int llc_send_disc(struct sock *sk)
+{
+ u16 rc = 1;
+ struct llc_conn_state_ev *ev;
+ struct sk_buff *skb;
+
+ sock_hold(sk);
+ if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED ||
+ llc_sk(sk)->state == LLC_CONN_STATE_ADM ||
+ llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC)
+ goto out;
+ /*
+ * Postpone unassigning the connection from its SAP and returning the
+ * connection until all ACTIONs have been completely executed
+ */
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+ skb_set_owner_w(skb, sk);
+ sk->sk_state = TCP_CLOSING;
+ ev = llc_conn_ev(skb);
+ ev->type = LLC_CONN_EV_TYPE_PRIM;
+ ev->prim = LLC_DISC_PRIM;
+ ev->prim_type = LLC_PRIM_TYPE_REQ;
+ rc = llc_conn_state_process(sk, skb);
+out:
+ sock_put(sk);
+ return rc;
+}
+
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
new file mode 100644
index 000000000..dd3e83328
--- /dev/null
+++ b/net/llc/llc_input.c
@@ -0,0 +1,226 @@
+/*
+ * llc_input.c - Minimal input path for LLC
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/llc_sap.h>
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/*
+ * Packet handler for the station, registerable because in the minimal
+ * LLC core that is taking shape only the very minimal subset of LLC that
+ * is needed for things like IPX, Appletalk, etc will stay, with all the
+ * rest in the llc1 and llc2 modules.
+ */
+static void (*llc_station_handler)(struct sk_buff *skb);
+
+/*
+ * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN.
+ */
+static void (*llc_type_handlers[2])(struct llc_sap *sap,
+ struct sk_buff *skb);
+
+void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
+ struct sk_buff *skb))
+{
+ smp_wmb(); /* ensure initialisation is complete before it's called */
+ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
+ llc_type_handlers[type - 1] = handler;
+}
+
+void llc_remove_pack(int type)
+{
+ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
+ llc_type_handlers[type - 1] = NULL;
+ synchronize_net();
+}
+
+void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
+{
+ /* Ensure initialisation is complete before it's called */
+ if (handler)
+ smp_wmb();
+
+ llc_station_handler = handler;
+
+ if (!handler)
+ synchronize_net();
+}
+
+/**
+ * llc_pdu_type - returns which LLC component must handle for PDU
+ * @skb: input skb
+ *
+ * This function returns which LLC component must handle this PDU.
+ */
+static __inline__ int llc_pdu_type(struct sk_buff *skb)
+{
+ int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U)
+ goto out;
+ switch (LLC_U_PDU_CMD(pdu)) {
+ case LLC_1_PDU_CMD_XID:
+ case LLC_1_PDU_CMD_UI:
+ case LLC_1_PDU_CMD_TEST:
+ type = LLC_DEST_SAP;
+ break;
+ case LLC_2_PDU_CMD_SABME:
+ case LLC_2_PDU_CMD_DISC:
+ case LLC_2_PDU_RSP_UA:
+ case LLC_2_PDU_RSP_DM:
+ case LLC_2_PDU_RSP_FRMR:
+ break;
+ default:
+ type = LLC_DEST_INVALID;
+ break;
+ }
+out:
+ return type;
+}
+
+/**
+ * llc_fixup_skb - initializes skb pointers
+ * @skb: This argument points to incoming skb
+ *
+ * Initializes internal skb pointer to start of network layer by deriving
+ * length of LLC header; finds length of LLC control field in LLC header
+ * by looking at the two lowest-order bits of the first control field
+ * byte; field is either 3 or 4 bytes long.
+ */
+static inline int llc_fixup_skb(struct sk_buff *skb)
+{
+ u8 llc_len = 2;
+ struct llc_pdu_un *pdu;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*pdu))))
+ return 0;
+
+ pdu = (struct llc_pdu_un *)skb->data;
+ if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U)
+ llc_len = 1;
+ llc_len += 2;
+
+ if (unlikely(!pskb_may_pull(skb, llc_len)))
+ return 0;
+
+ skb->transport_header += llc_len;
+ skb_pull(skb, llc_len);
+ if (skb->protocol == htons(ETH_P_802_2)) {
+ __be16 pdulen = eth_hdr(skb)->h_proto;
+ s32 data_size = ntohs(pdulen) - llc_len;
+
+ if (data_size < 0 ||
+ !pskb_may_pull(skb, data_size))
+ return 0;
+ if (unlikely(pskb_trim_rcsum(skb, data_size)))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * llc_rcv - 802.2 entry point from net lower layers
+ * @skb: received pdu
+ * @dev: device that receive pdu
+ * @pt: packet type
+ *
+ * When the system receives a 802.2 frame this function is called. It
+ * checks SAP and connection of received pdu and passes frame to
+ * llc_{station,sap,conn}_rcv for sending to proper state machine. If
+ * the frame is related to a busy connection (a connection is sending
+ * data now), it queues this frame in the connection's backlog.
+ */
+int llc_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct llc_sap *sap;
+ struct llc_pdu_sn *pdu;
+ int dest;
+ int (*rcv)(struct sk_buff *, struct net_device *,
+ struct packet_type *, struct net_device *);
+ void (*sta_handler)(struct sk_buff *skb);
+ void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ /*
+ * When the interface is in promisc. mode, drop all the crap that it
+ * receives, do not try to analyse it.
+ */
+ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
+ dprintk("%s: PACKET_OTHERHOST\n", __func__);
+ goto drop;
+ }
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto out;
+ if (unlikely(!llc_fixup_skb(skb)))
+ goto drop;
+ pdu = llc_pdu_sn_hdr(skb);
+ if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */
+ goto handle_station;
+ sap = llc_sap_find(pdu->dsap);
+ if (unlikely(!sap)) {/* unknown SAP */
+ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__,
+ pdu->dsap);
+ goto drop;
+ }
+ /*
+ * First the upper layer protocols that don't need the full
+ * LLC functionality
+ */
+ rcv = rcu_dereference(sap->rcv_func);
+ dest = llc_pdu_type(skb);
+ sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL;
+ if (unlikely(!sap_handler)) {
+ if (rcv)
+ rcv(skb, dev, pt, orig_dev);
+ else
+ kfree_skb(skb);
+ } else {
+ if (rcv) {
+ struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
+ if (cskb)
+ rcv(cskb, dev, pt, orig_dev);
+ }
+ sap_handler(sap, skb);
+ }
+ llc_sap_put(sap);
+out:
+ return 0;
+drop:
+ kfree_skb(skb);
+ goto out;
+handle_station:
+ sta_handler = ACCESS_ONCE(llc_station_handler);
+ if (!sta_handler)
+ goto drop;
+ sta_handler(skb);
+ goto out;
+}
+
+EXPORT_SYMBOL(llc_add_pack);
+EXPORT_SYMBOL(llc_remove_pack);
+EXPORT_SYMBOL(llc_set_station_handler);
diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
new file mode 100644
index 000000000..94425e421
--- /dev/null
+++ b/net/llc/llc_output.c
@@ -0,0 +1,79 @@
+/*
+ * llc_output.c - LLC minimal output path
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License version 2 for more details.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+
+/**
+ * llc_mac_hdr_init - fills MAC header fields
+ * @skb: Address of the frame to initialize its MAC header
+ * @sa: The MAC source address
+ * @da: The MAC destination address
+ *
+ * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type
+ * is a valid type and initialization completes correctly 1, otherwise.
+ */
+int llc_mac_hdr_init(struct sk_buff *skb,
+ const unsigned char *sa, const unsigned char *da)
+{
+ int rc = -EINVAL;
+
+ switch (skb->dev->type) {
+ case ARPHRD_ETHER:
+ case ARPHRD_LOOPBACK:
+ rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa,
+ skb->len);
+ if (rc > 0)
+ rc = 0;
+ break;
+ default:
+ break;
+ }
+ return rc;
+}
+
+/**
+ * llc_build_and_send_ui_pkt - unitdata request interface for upper layers
+ * @sap: sap to use
+ * @skb: packet to send
+ * @dmac: destination mac address
+ * @dsap: destination sap
+ *
+ * Upper layers calls this function when upper layer wants to send data
+ * using connection-less mode communication (UI pdu).
+ *
+ * Accept data frame from network layer to be sent using connection-
+ * less mode communication; timeout/retries handled by network layer;
+ * package primitive as an event and send to SAP event handler
+ */
+int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
+ unsigned char *dmac, unsigned char dsap)
+{
+ int rc;
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+ dsap, LLC_PDU_CMD);
+ llc_pdu_init_as_ui_cmd(skb);
+ rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
+ if (likely(!rc))
+ rc = dev_queue_xmit(skb);
+ return rc;
+}
+
+EXPORT_SYMBOL(llc_mac_hdr_init);
+EXPORT_SYMBOL(llc_build_and_send_ui_pkt);
diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c
new file mode 100644
index 000000000..2e6cb7919
--- /dev/null
+++ b/net/llc/llc_pdu.c
@@ -0,0 +1,372 @@
+/*
+ * llc_pdu.c - access to PDU internals
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/netdevice.h>
+#include <net/llc_pdu.h>
+
+static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type);
+static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu);
+
+void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type)
+{
+ llc_pdu_un_hdr(skb)->ssap |= pdu_type;
+}
+
+/**
+ * pdu_set_pf_bit - sets poll/final bit in LLC header
+ * @pdu_frame: input frame that p/f bit must be set into it.
+ * @bit_value: poll/final bit (0 or 1).
+ *
+ * This function sets poll/final bit in LLC header (based on type of PDU).
+ * in I or S pdus, p/f bit is right bit of fourth byte in header. in U
+ * pdus p/f bit is fifth bit of third byte.
+ */
+void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value)
+{
+ u8 pdu_type;
+ struct llc_pdu_sn *pdu;
+
+ llc_pdu_decode_pdu_type(skb, &pdu_type);
+ pdu = llc_pdu_sn_hdr(skb);
+
+ switch (pdu_type) {
+ case LLC_PDU_TYPE_I:
+ case LLC_PDU_TYPE_S:
+ pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value;
+ break;
+ case LLC_PDU_TYPE_U:
+ pdu->ctrl_1 |= (pdu->ctrl_1 & 0xEF) | (bit_value << 4);
+ break;
+ }
+}
+
+/**
+ * llc_pdu_decode_pf_bit - extracs poll/final bit from LLC header
+ * @skb: input skb that p/f bit must be extracted from it
+ * @pf_bit: poll/final bit (0 or 1)
+ *
+ * This function extracts poll/final bit from LLC header (based on type of
+ * PDU). In I or S pdus, p/f bit is right bit of fourth byte in header. In
+ * U pdus p/f bit is fifth bit of third byte.
+ */
+void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit)
+{
+ u8 pdu_type;
+ struct llc_pdu_sn *pdu;
+
+ llc_pdu_decode_pdu_type(skb, &pdu_type);
+ pdu = llc_pdu_sn_hdr(skb);
+
+ switch (pdu_type) {
+ case LLC_PDU_TYPE_I:
+ case LLC_PDU_TYPE_S:
+ *pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK;
+ break;
+ case LLC_PDU_TYPE_U:
+ *pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4;
+ break;
+ }
+}
+
+/**
+ * llc_pdu_init_as_disc_cmd - Builds DISC PDU
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ *
+ * Builds a pdu frame as a DISC command.
+ */
+void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_U;
+ pdu->ctrl_1 |= LLC_2_PDU_CMD_DISC;
+ pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+}
+
+/**
+ * llc_pdu_init_as_i_cmd - builds I pdu
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ * @ns: The sequence number of the data PDU
+ * @nr: The seq. number of the expected I PDU from the remote
+ *
+ * Builds a pdu frame as an I command.
+ */
+void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_I;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= (p_bit & LLC_I_PF_BIT_MASK); /* p/f bit */
+ pdu->ctrl_1 |= (ns << 1) & 0xFE; /* set N(S) in bits 2..8 */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_rej_cmd - builds REJ PDU
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ * @nr: The seq. number of the expected I PDU from the remote
+ *
+ * Builds a pdu frame as a REJ command.
+ */
+void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_CMD_REJ;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_rnr_cmd - builds RNR pdu
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ * @nr: The seq. number of the expected I PDU from the remote
+ *
+ * Builds a pdu frame as an RNR command.
+ */
+void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_CMD_RNR;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_rr_cmd - Builds RR pdu
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ * @nr: The seq. number of the expected I PDU from the remote
+ *
+ * Builds a pdu frame as an RR command.
+ */
+void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_CMD_RR;
+ pdu->ctrl_2 = p_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_sabme_cmd - builds SABME pdu
+ * @skb: Address of the skb to build
+ * @p_bit: The P bit to set in the PDU
+ *
+ * Builds a pdu frame as an SABME command.
+ */
+void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_U;
+ pdu->ctrl_1 |= LLC_2_PDU_CMD_SABME;
+ pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+}
+
+/**
+ * llc_pdu_init_as_dm_rsp - builds DM response pdu
+ * @skb: Address of the skb to build
+ * @f_bit: The F bit to set in the PDU
+ *
+ * Builds a pdu frame as a DM response.
+ */
+void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_U;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_DM;
+ pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+}
+
+/**
+ * llc_pdu_init_as_frmr_rsp - builds FRMR response PDU
+ * @skb: Address of the frame to build
+ * @prev_pdu: The rejected PDU frame
+ * @f_bit: The F bit to set in the PDU
+ * @vs: tx state vari value for the data link conn at the rejecting LLC
+ * @vr: rx state var value for the data link conn at the rejecting LLC
+ * @vzyxw: completely described in the IEEE Std 802.2 document (Pg 55)
+ *
+ * Builds a pdu frame as a FRMR response.
+ */
+void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu,
+ u8 f_bit, u8 vs, u8 vr, u8 vzyxw)
+{
+ struct llc_frmr_info *frmr_info;
+ u8 prev_pf = 0;
+ u8 *ctrl;
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_U;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR;
+ pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+
+ frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2;
+ ctrl = (u8 *)&prev_pdu->ctrl_1;
+ FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl);
+ FRMR_INFO_SET_Vs(frmr_info, vs);
+ FRMR_INFO_SET_Vr(frmr_info, vr);
+ prev_pf = llc_pdu_get_pf_bit(prev_pdu);
+ FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf);
+ FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw);
+ FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw);
+ FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw);
+ FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw);
+ FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw);
+ skb_put(skb, sizeof(struct llc_frmr_info));
+}
+
+/**
+ * llc_pdu_init_as_rr_rsp - builds RR response pdu
+ * @skb: Address of the skb to build
+ * @f_bit: The F bit to set in the PDU
+ * @nr: The seq. number of the expected data PDU from the remote
+ *
+ * Builds a pdu frame as an RR response.
+ */
+void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_RR;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_rej_rsp - builds REJ response pdu
+ * @skb: Address of the skb to build
+ * @f_bit: The F bit to set in the PDU
+ * @nr: The seq. number of the expected data PDU from the remote
+ *
+ * Builds a pdu frame as a REJ response.
+ */
+void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_REJ;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_rnr_rsp - builds RNR response pdu
+ * @skb: Address of the frame to build
+ * @f_bit: The F bit to set in the PDU
+ * @nr: The seq. number of the expected data PDU from the remote
+ *
+ * Builds a pdu frame as an RNR response.
+ */
+void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_S;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_RNR;
+ pdu->ctrl_2 = 0;
+ pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK;
+ pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */
+ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */
+}
+
+/**
+ * llc_pdu_init_as_ua_rsp - builds UA response pdu
+ * @skb: Address of the frame to build
+ * @f_bit: The F bit to set in the PDU
+ *
+ * Builds a pdu frame as a UA response.
+ */
+void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ pdu->ctrl_1 = LLC_PDU_TYPE_U;
+ pdu->ctrl_1 |= LLC_2_PDU_RSP_UA;
+ pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+}
+
+/**
+ * llc_pdu_decode_pdu_type - designates PDU type
+ * @skb: input skb that type of it must be designated.
+ * @type: type of PDU (output argument).
+ *
+ * This function designates type of PDU (I, S or U).
+ */
+static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ if (pdu->ctrl_1 & 1) {
+ if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U)
+ *type = LLC_PDU_TYPE_U;
+ else
+ *type = LLC_PDU_TYPE_S;
+ } else
+ *type = LLC_PDU_TYPE_I;
+}
+
+/**
+ * llc_pdu_get_pf_bit - extracts p/f bit of input PDU
+ * @pdu: pointer to LLC header.
+ *
+ * This function extracts p/f bit of input PDU. at first examines type of
+ * PDU and then extracts p/f bit. Returns the p/f bit.
+ */
+static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu)
+{
+ u8 pdu_type;
+ u8 pf_bit = 0;
+
+ if (pdu->ctrl_1 & 1) {
+ if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U)
+ pdu_type = LLC_PDU_TYPE_U;
+ else
+ pdu_type = LLC_PDU_TYPE_S;
+ } else
+ pdu_type = LLC_PDU_TYPE_I;
+ switch (pdu_type) {
+ case LLC_PDU_TYPE_I:
+ case LLC_PDU_TYPE_S:
+ pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK;
+ break;
+ case LLC_PDU_TYPE_U:
+ pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4;
+ break;
+ }
+ return pf_bit;
+}
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
new file mode 100644
index 000000000..1a3c7e0f5
--- /dev/null
+++ b/net/llc/llc_proc.c
@@ -0,0 +1,277 @@
+/*
+ * proc_llc.c - proc interface for LLC
+ *
+ * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org>
+ * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/errno.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/llc.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_st.h>
+#include <net/llc_conn.h>
+
+static void llc_ui_format_mac(struct seq_file *seq, u8 *addr)
+{
+ seq_printf(seq, "%pM", addr);
+}
+
+static struct sock *llc_get_sk_idx(loff_t pos)
+{
+ struct llc_sap *sap;
+ struct sock *sk = NULL;
+ int i;
+
+ list_for_each_entry_rcu(sap, &llc_sap_list, node) {
+ spin_lock_bh(&sap->sk_lock);
+ for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) {
+ struct hlist_nulls_head *head = &sap->sk_laddr_hash[i];
+ struct hlist_nulls_node *node;
+
+ sk_nulls_for_each(sk, node, head) {
+ if (!pos)
+ goto found; /* keep the lock */
+ --pos;
+ }
+ }
+ spin_unlock_bh(&sap->sk_lock);
+ }
+ sk = NULL;
+found:
+ return sk;
+}
+
+static void *llc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t l = *pos;
+
+ rcu_read_lock_bh();
+ return l ? llc_get_sk_idx(--l) : SEQ_START_TOKEN;
+}
+
+static struct sock *laddr_hash_next(struct llc_sap *sap, int bucket)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk = NULL;
+
+ while (++bucket < LLC_SK_LADDR_HASH_ENTRIES)
+ sk_nulls_for_each(sk, node, &sap->sk_laddr_hash[bucket])
+ goto out;
+
+out:
+ return sk;
+}
+
+static void *llc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock* sk, *next;
+ struct llc_sock *llc;
+ struct llc_sap *sap;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN) {
+ sk = llc_get_sk_idx(0);
+ goto out;
+ }
+ sk = v;
+ next = sk_nulls_next(sk);
+ if (next) {
+ sk = next;
+ goto out;
+ }
+ llc = llc_sk(sk);
+ sap = llc->sap;
+ sk = laddr_hash_next(sap, llc_sk_laddr_hashfn(sap, &llc->laddr));
+ if (sk)
+ goto out;
+ spin_unlock_bh(&sap->sk_lock);
+ list_for_each_entry_continue_rcu(sap, &llc_sap_list, node) {
+ spin_lock_bh(&sap->sk_lock);
+ sk = laddr_hash_next(sap, -1);
+ if (sk)
+ break; /* keep the lock */
+ spin_unlock_bh(&sap->sk_lock);
+ }
+out:
+ return sk;
+}
+
+static void llc_seq_stop(struct seq_file *seq, void *v)
+{
+ if (v && v != SEQ_START_TOKEN) {
+ struct sock *sk = v;
+ struct llc_sock *llc = llc_sk(sk);
+ struct llc_sap *sap = llc->sap;
+
+ spin_unlock_bh(&sap->sk_lock);
+ }
+ rcu_read_unlock_bh();
+}
+
+static int llc_seq_socket_show(struct seq_file *seq, void *v)
+{
+ struct sock* sk;
+ struct llc_sock *llc;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "SKt Mc local_mac_sap remote_mac_sap "
+ " tx_queue rx_queue st uid link\n");
+ goto out;
+ }
+ sk = v;
+ llc = llc_sk(sk);
+
+ /* FIXME: check if the address is multicast */
+ seq_printf(seq, "%2X %2X ", sk->sk_type, 0);
+
+ if (llc->dev)
+ llc_ui_format_mac(seq, llc->dev->dev_addr);
+ else {
+ u8 addr[6] = {0,0,0,0,0,0};
+ llc_ui_format_mac(seq, addr);
+ }
+ seq_printf(seq, "@%02X ", llc->sap->laddr.lsap);
+ llc_ui_format_mac(seq, llc->daddr.mac);
+ seq_printf(seq, "@%02X %8d %8d %2d %3u %4d\n", llc->daddr.lsap,
+ sk_wmem_alloc_get(sk),
+ sk_rmem_alloc_get(sk) - llc->copied_seq,
+ sk->sk_state,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ llc->link);
+out:
+ return 0;
+}
+
+static const char *const llc_conn_state_names[] = {
+ [LLC_CONN_STATE_ADM] = "adm",
+ [LLC_CONN_STATE_SETUP] = "setup",
+ [LLC_CONN_STATE_NORMAL] = "normal",
+ [LLC_CONN_STATE_BUSY] = "busy",
+ [LLC_CONN_STATE_REJ] = "rej",
+ [LLC_CONN_STATE_AWAIT] = "await",
+ [LLC_CONN_STATE_AWAIT_BUSY] = "await_busy",
+ [LLC_CONN_STATE_AWAIT_REJ] = "await_rej",
+ [LLC_CONN_STATE_D_CONN] = "d_conn",
+ [LLC_CONN_STATE_RESET] = "reset",
+ [LLC_CONN_STATE_ERROR] = "error",
+ [LLC_CONN_STATE_TEMP] = "temp",
+};
+
+static int llc_seq_core_show(struct seq_file *seq, void *v)
+{
+ struct sock* sk;
+ struct llc_sock *llc;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Connection list:\n"
+ "dsap state retr txw rxw pf ff sf df rs cs "
+ "tack tpfc trs tbs blog busr\n");
+ goto out;
+ }
+ sk = v;
+ llc = llc_sk(sk);
+
+ seq_printf(seq, " %02X %-10s %3d %3d %3d %2d %2d %2d %2d %2d %2d "
+ "%4d %4d %3d %3d %4d %4d\n",
+ llc->daddr.lsap, llc_conn_state_names[llc->state],
+ llc->retry_count, llc->k, llc->rw, llc->p_flag, llc->f_flag,
+ llc->s_flag, llc->data_flag, llc->remote_busy_flag,
+ llc->cause_flag, timer_pending(&llc->ack_timer.timer),
+ timer_pending(&llc->pf_cycle_timer.timer),
+ timer_pending(&llc->rej_sent_timer.timer),
+ timer_pending(&llc->busy_state_timer.timer),
+ !!sk->sk_backlog.tail, !!sock_owned_by_user(sk));
+out:
+ return 0;
+}
+
+static const struct seq_operations llc_seq_socket_ops = {
+ .start = llc_seq_start,
+ .next = llc_seq_next,
+ .stop = llc_seq_stop,
+ .show = llc_seq_socket_show,
+};
+
+static const struct seq_operations llc_seq_core_ops = {
+ .start = llc_seq_start,
+ .next = llc_seq_next,
+ .stop = llc_seq_stop,
+ .show = llc_seq_core_show,
+};
+
+static int llc_seq_socket_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &llc_seq_socket_ops);
+}
+
+static int llc_seq_core_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &llc_seq_core_ops);
+}
+
+static const struct file_operations llc_seq_socket_fops = {
+ .owner = THIS_MODULE,
+ .open = llc_seq_socket_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations llc_seq_core_fops = {
+ .owner = THIS_MODULE,
+ .open = llc_seq_core_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *llc_proc_dir;
+
+int __init llc_proc_init(void)
+{
+ int rc = -ENOMEM;
+ struct proc_dir_entry *p;
+
+ llc_proc_dir = proc_mkdir("llc", init_net.proc_net);
+ if (!llc_proc_dir)
+ goto out;
+
+ p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops);
+ if (!p)
+ goto out_socket;
+
+ p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops);
+ if (!p)
+ goto out_core;
+
+ rc = 0;
+out:
+ return rc;
+out_core:
+ remove_proc_entry("socket", llc_proc_dir);
+out_socket:
+ remove_proc_entry("llc", init_net.proc_net);
+ goto out;
+}
+
+void llc_proc_exit(void)
+{
+ remove_proc_entry("socket", llc_proc_dir);
+ remove_proc_entry("core", llc_proc_dir);
+ remove_proc_entry("llc", init_net.proc_net);
+}
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
new file mode 100644
index 000000000..a94bd56bc
--- /dev/null
+++ b/net/llc/llc_s_ac.c
@@ -0,0 +1,208 @@
+/*
+ * llc_s_ac.c - actions performed during sap state transition.
+ *
+ * Description :
+ * Functions in this module are implementation of sap component actions.
+ * Details of actions can be found in IEEE-802.2 standard document.
+ * All functions have one sap and one event as input argument. All of
+ * them return 0 On success and 1 otherwise.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/netdevice.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_sap.h>
+
+
+/**
+ * llc_sap_action_unit_data_ind - forward UI PDU to network layer
+ * @sap: SAP
+ * @skb: the event to forward
+ *
+ * Received a UI PDU from MAC layer; forward to network layer as a
+ * UNITDATA INDICATION; verify our event is the kind we expect
+ */
+int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+ llc_sap_rtn_pdu(sap, skb);
+ return 0;
+}
+
+/**
+ * llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Sends a UI PDU to the MAC layer in response to a UNITDATA REQUEST
+ * primitive from the network layer. Verifies event is a primitive type of
+ * event. Verify the primitive is a UNITDATA REQUEST.
+ */
+int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ int rc;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+ ev->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_ui_cmd(skb);
+ rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+ if (likely(!rc))
+ rc = dev_queue_xmit(skb);
+ return rc;
+}
+
+/**
+ * llc_sap_action_send_xid_c - send XID PDU as response to XID REQ
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Send a XID command PDU to MAC layer in response to a XID REQUEST
+ * primitive from the network layer. Verify event is a primitive type
+ * event. Verify the primitive is a XID REQUEST.
+ */
+int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ int rc;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+ ev->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
+ rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+ if (likely(!rc))
+ rc = dev_queue_xmit(skb);
+ return rc;
+}
+
+/**
+ * llc_sap_action_send_xid_r - send XID PDU resp to MAC for received XID
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Send XID response PDU to MAC in response to an earlier received XID
+ * command PDU. Verify event is a PDU type event
+ */
+int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+ u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap;
+ int rc = 1;
+ struct sk_buff *nskb;
+
+ llc_pdu_decode_sa(skb, mac_da);
+ llc_pdu_decode_da(skb, mac_sa);
+ llc_pdu_decode_ssap(skb, &dsap);
+ nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
+ sizeof(struct llc_xid_info));
+ if (!nskb)
+ goto out;
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
+ LLC_PDU_RSP);
+ llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 0);
+ rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
+ if (likely(!rc))
+ rc = dev_queue_xmit(nskb);
+out:
+ return rc;
+}
+
+/**
+ * llc_sap_action_send_test_c - send TEST PDU to MAC in resp to TEST REQ
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Send a TEST command PDU to the MAC layer in response to a TEST REQUEST
+ * primitive from the network layer. Verify event is a primitive type
+ * event; verify the primitive is a TEST REQUEST.
+ */
+int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ int rc;
+
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+ ev->daddr.lsap, LLC_PDU_CMD);
+ llc_pdu_init_as_test_cmd(skb);
+ rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+ if (likely(!rc))
+ rc = dev_queue_xmit(skb);
+ return rc;
+}
+
+int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+ u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap;
+ struct sk_buff *nskb;
+ int rc = 1;
+ u32 data_size;
+
+ llc_pdu_decode_sa(skb, mac_da);
+ llc_pdu_decode_da(skb, mac_sa);
+ llc_pdu_decode_ssap(skb, &dsap);
+
+ /* The test request command is type U (llc_len = 3) */
+ data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
+ nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
+ if (!nskb)
+ goto out;
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
+ LLC_PDU_RSP);
+ llc_pdu_init_as_test_rsp(nskb, skb);
+ rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
+ if (likely(!rc))
+ rc = dev_queue_xmit(nskb);
+out:
+ return rc;
+}
+
+/**
+ * llc_sap_action_report_status - report data link status to layer mgmt
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Report data link status to layer management. Verify our event is the
+ * kind we expect.
+ */
+int llc_sap_action_report_status(struct llc_sap *sap, struct sk_buff *skb)
+{
+ return 0;
+}
+
+/**
+ * llc_sap_action_xid_ind - send XID PDU resp to net layer via XID IND
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Send a XID response PDU to the network layer via a XID INDICATION
+ * primitive.
+ */
+int llc_sap_action_xid_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+ llc_sap_rtn_pdu(sap, skb);
+ return 0;
+}
+
+/**
+ * llc_sap_action_test_ind - send TEST PDU to net layer via TEST IND
+ * @sap: SAP
+ * @skb: the event to send
+ *
+ * Send a TEST response PDU to the network layer via a TEST INDICATION
+ * primitive. Verify our event is a PDU type event.
+ */
+int llc_sap_action_test_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+ llc_sap_rtn_pdu(sap, skb);
+ return 0;
+}
diff --git a/net/llc/llc_s_ev.c b/net/llc/llc_s_ev.c
new file mode 100644
index 000000000..a74d2a1d6
--- /dev/null
+++ b/net/llc/llc_s_ev.c
@@ -0,0 +1,115 @@
+/*
+ * llc_s_ev.c - Defines SAP component events
+ *
+ * The followed event functions are SAP component events which are described
+ * in 802.2 LLC protocol standard document.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/llc_if.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_pdu.h>
+
+int llc_sap_ev_activation_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_SIMPLE &&
+ ev->prim_type == LLC_SAP_EV_ACTIVATION_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_ui(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_UI ? 0 : 1;
+}
+
+int llc_sap_ev_unitdata_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+ ev->prim == LLC_DATAUNIT_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+
+}
+
+int llc_sap_ev_xid_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+ ev->prim == LLC_XID_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_xid_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1;
+}
+
+int llc_sap_ev_rx_xid_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1;
+}
+
+int llc_sap_ev_test_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+ ev->prim == LLC_TEST_PRIM &&
+ ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_test_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1;
+}
+
+int llc_sap_ev_rx_test_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) &&
+ LLC_PDU_TYPE_IS_U(pdu) &&
+ LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1;
+}
+
+int llc_sap_ev_deactivation_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ return ev->type == LLC_SAP_EV_TYPE_SIMPLE &&
+ ev->prim_type == LLC_SAP_EV_DEACTIVATION_REQ ? 0 : 1;
+}
diff --git a/net/llc/llc_s_st.c b/net/llc/llc_s_st.c
new file mode 100644
index 000000000..308c61688
--- /dev/null
+++ b/net/llc/llc_s_st.c
@@ -0,0 +1,183 @@
+/*
+ * llc_s_st.c - Defines SAP component state machine transitions.
+ *
+ * The followed transitions are SAP component state machine transitions
+ * which are described in 802.2 LLC protocol standard document.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/types.h>
+#include <net/llc_if.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_st.h>
+
+/* dummy last-transition indicator; common to all state transition groups
+ * last entry for this state
+ * all members are zeros, .bss zeroes it
+ */
+static struct llc_sap_state_trans llc_sap_state_trans_end;
+
+/* state LLC_SAP_STATE_INACTIVE transition for
+ * LLC_SAP_EV_ACTIVATION_REQ event
+ */
+static const llc_sap_action_t llc_sap_inactive_state_actions_1[] = {
+ [0] = llc_sap_action_report_status,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = {
+ .ev = llc_sap_ev_activation_req,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_inactive_state_actions_1,
+};
+
+/* array of pointers; one to each transition */
+static struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = {
+ [0] = &llc_sap_inactive_state_trans_1,
+ [1] = &llc_sap_state_trans_end,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_UI event */
+static const llc_sap_action_t llc_sap_active_state_actions_1[] = {
+ [0] = llc_sap_action_unitdata_ind,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_1 = {
+ .ev = llc_sap_ev_rx_ui,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_1,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_UNITDATA_REQ event */
+static const llc_sap_action_t llc_sap_active_state_actions_2[] = {
+ [0] = llc_sap_action_send_ui,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_2 = {
+ .ev = llc_sap_ev_unitdata_req,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_2,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_XID_REQ event */
+static const llc_sap_action_t llc_sap_active_state_actions_3[] = {
+ [0] = llc_sap_action_send_xid_c,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_3 = {
+ .ev = llc_sap_ev_xid_req,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_3,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_C event */
+static const llc_sap_action_t llc_sap_active_state_actions_4[] = {
+ [0] = llc_sap_action_send_xid_r,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_4 = {
+ .ev = llc_sap_ev_rx_xid_c,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_4,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_R event */
+static const llc_sap_action_t llc_sap_active_state_actions_5[] = {
+ [0] = llc_sap_action_xid_ind,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_5 = {
+ .ev = llc_sap_ev_rx_xid_r,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_5,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_TEST_REQ event */
+static const llc_sap_action_t llc_sap_active_state_actions_6[] = {
+ [0] = llc_sap_action_send_test_c,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_6 = {
+ .ev = llc_sap_ev_test_req,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_6,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_C event */
+static const llc_sap_action_t llc_sap_active_state_actions_7[] = {
+ [0] = llc_sap_action_send_test_r,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_7 = {
+ .ev = llc_sap_ev_rx_test_c,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_7
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_R event */
+static const llc_sap_action_t llc_sap_active_state_actions_8[] = {
+ [0] = llc_sap_action_test_ind,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_8 = {
+ .ev = llc_sap_ev_rx_test_r,
+ .next_state = LLC_SAP_STATE_ACTIVE,
+ .ev_actions = llc_sap_active_state_actions_8,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for
+ * LLC_SAP_EV_DEACTIVATION_REQ event
+ */
+static const llc_sap_action_t llc_sap_active_state_actions_9[] = {
+ [0] = llc_sap_action_report_status,
+ [1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_9 = {
+ .ev = llc_sap_ev_deactivation_req,
+ .next_state = LLC_SAP_STATE_INACTIVE,
+ .ev_actions = llc_sap_active_state_actions_9
+};
+
+/* array of pointers; one to each transition */
+static struct llc_sap_state_trans *llc_sap_active_state_transitions[] = {
+ [0] = &llc_sap_active_state_trans_2,
+ [1] = &llc_sap_active_state_trans_1,
+ [2] = &llc_sap_active_state_trans_3,
+ [3] = &llc_sap_active_state_trans_4,
+ [4] = &llc_sap_active_state_trans_5,
+ [5] = &llc_sap_active_state_trans_6,
+ [6] = &llc_sap_active_state_trans_7,
+ [7] = &llc_sap_active_state_trans_8,
+ [8] = &llc_sap_active_state_trans_9,
+ [9] = &llc_sap_state_trans_end,
+};
+
+/* SAP state transition table */
+struct llc_sap_state llc_sap_state_table[LLC_NR_SAP_STATES] = {
+ [LLC_SAP_STATE_INACTIVE - 1] = {
+ .curr_state = LLC_SAP_STATE_INACTIVE,
+ .transitions = llc_sap_inactive_state_transitions,
+ },
+ [LLC_SAP_STATE_ACTIVE - 1] = {
+ .curr_state = LLC_SAP_STATE_ACTIVE,
+ .transitions = llc_sap_active_state_transitions,
+ },
+};
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
new file mode 100644
index 000000000..d0e1e804e
--- /dev/null
+++ b/net/llc/llc_sap.c
@@ -0,0 +1,439 @@
+/*
+ * llc_sap.c - driver routines for SAP component.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <net/llc.h>
+#include <net/llc_if.h>
+#include <net/llc_conn.h>
+#include <net/llc_pdu.h>
+#include <net/llc_sap.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_st.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+
+static int llc_mac_header_len(unsigned short devtype)
+{
+ switch (devtype) {
+ case ARPHRD_ETHER:
+ case ARPHRD_LOOPBACK:
+ return sizeof(struct ethhdr);
+ }
+ return 0;
+}
+
+/**
+ * llc_alloc_frame - allocates sk_buff for frame
+ * @dev: network device this skb will be sent over
+ * @type: pdu type to allocate
+ * @data_size: data size to allocate
+ *
+ * Allocates an sk_buff for frame and initializes sk_buff fields.
+ * Returns allocated skb or %NULL when out of memory.
+ */
+struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev,
+ u8 type, u32 data_size)
+{
+ int hlen = type == LLC_PDU_TYPE_U ? 3 : 4;
+ struct sk_buff *skb;
+
+ hlen += llc_mac_header_len(dev->type);
+ skb = alloc_skb(hlen + data_size, GFP_ATOMIC);
+
+ if (skb) {
+ skb_reset_mac_header(skb);
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->protocol = htons(ETH_P_802_2);
+ skb->dev = dev;
+ if (sk != NULL)
+ skb_set_owner_w(skb, sk);
+ }
+ return skb;
+}
+
+void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim)
+{
+ struct sockaddr_llc *addr;
+
+ /* save primitive for use by the user. */
+ addr = llc_ui_skb_cb(skb);
+
+ memset(addr, 0, sizeof(*addr));
+ addr->sllc_family = sk->sk_family;
+ addr->sllc_arphrd = skb->dev->type;
+ addr->sllc_test = prim == LLC_TEST_PRIM;
+ addr->sllc_xid = prim == LLC_XID_PRIM;
+ addr->sllc_ua = prim == LLC_DATAUNIT_PRIM;
+ llc_pdu_decode_sa(skb, addr->sllc_mac);
+ llc_pdu_decode_ssap(skb, &addr->sllc_sap);
+}
+
+/**
+ * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu.
+ * @sap: pointer to SAP
+ * @skb: received pdu
+ */
+void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ switch (LLC_U_PDU_RSP(pdu)) {
+ case LLC_1_PDU_CMD_TEST:
+ ev->prim = LLC_TEST_PRIM; break;
+ case LLC_1_PDU_CMD_XID:
+ ev->prim = LLC_XID_PRIM; break;
+ case LLC_1_PDU_CMD_UI:
+ ev->prim = LLC_DATAUNIT_PRIM; break;
+ }
+ ev->ind_cfm_flag = LLC_IND;
+}
+
+/**
+ * llc_find_sap_trans - finds transition for event
+ * @sap: pointer to SAP
+ * @skb: happened event
+ *
+ * This function finds transition that matches with happened event.
+ * Returns the pointer to found transition on success or %NULL for
+ * failure.
+ */
+static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap,
+ struct sk_buff *skb)
+{
+ int i = 0;
+ struct llc_sap_state_trans *rc = NULL;
+ struct llc_sap_state_trans **next_trans;
+ struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1];
+ /*
+ * Search thru events for this state until list exhausted or until
+ * its obvious the event is not valid for the current state
+ */
+ for (next_trans = curr_state->transitions; next_trans[i]->ev; i++)
+ if (!next_trans[i]->ev(sap, skb)) {
+ rc = next_trans[i]; /* got event match; return it */
+ break;
+ }
+ return rc;
+}
+
+/**
+ * llc_exec_sap_trans_actions - execute actions related to event
+ * @sap: pointer to SAP
+ * @trans: pointer to transition that it's actions must be performed
+ * @skb: happened event.
+ *
+ * This function executes actions that is related to happened event.
+ * Returns 0 for success and 1 for failure of at least one action.
+ */
+static int llc_exec_sap_trans_actions(struct llc_sap *sap,
+ struct llc_sap_state_trans *trans,
+ struct sk_buff *skb)
+{
+ int rc = 0;
+ const llc_sap_action_t *next_action = trans->ev_actions;
+
+ for (; next_action && *next_action; next_action++)
+ if ((*next_action)(sap, skb))
+ rc = 1;
+ return rc;
+}
+
+/**
+ * llc_sap_next_state - finds transition, execs actions & change SAP state
+ * @sap: pointer to SAP
+ * @skb: happened event
+ *
+ * This function finds transition that matches with happened event, then
+ * executes related actions and finally changes state of SAP. It returns
+ * 0 on success and 1 for failure.
+ */
+static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb)
+{
+ int rc = 1;
+ struct llc_sap_state_trans *trans;
+
+ if (sap->state > LLC_NR_SAP_STATES)
+ goto out;
+ trans = llc_find_sap_trans(sap, skb);
+ if (!trans)
+ goto out;
+ /*
+ * Got the state to which we next transition; perform the actions
+ * associated with this transition before actually transitioning to the
+ * next state
+ */
+ rc = llc_exec_sap_trans_actions(sap, trans, skb);
+ if (rc)
+ goto out;
+ /*
+ * Transition SAP to next state if all actions execute successfully
+ */
+ sap->state = trans->next_state;
+out:
+ return rc;
+}
+
+/**
+ * llc_sap_state_process - sends event to SAP state machine
+ * @sap: sap to use
+ * @skb: pointer to occurred event
+ *
+ * After executing actions of the event, upper layer will be indicated
+ * if needed(on receiving an UI frame). sk can be null for the
+ * datalink_proto case.
+ */
+static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ /*
+ * We have to hold the skb, because llc_sap_next_state
+ * will kfree it in the sending path and we need to
+ * look at the skb->cb, where we encode llc_sap_state_ev.
+ */
+ skb_get(skb);
+ ev->ind_cfm_flag = 0;
+ llc_sap_next_state(sap, skb);
+ if (ev->ind_cfm_flag == LLC_IND) {
+ if (skb->sk->sk_state == TCP_LISTEN)
+ kfree_skb(skb);
+ else {
+ llc_save_primitive(skb->sk, skb, ev->prim);
+
+ /* queue skb to the user. */
+ if (sock_queue_rcv_skb(skb->sk, skb))
+ kfree_skb(skb);
+ }
+ }
+ kfree_skb(skb);
+}
+
+/**
+ * llc_build_and_send_test_pkt - TEST interface for upper layers.
+ * @sap: sap to use
+ * @skb: packet to send
+ * @dmac: destination mac address
+ * @dsap: destination sap
+ *
+ * This function is called when upper layer wants to send a TEST pdu.
+ * Returns 0 for success, 1 otherwise.
+ */
+void llc_build_and_send_test_pkt(struct llc_sap *sap,
+ struct sk_buff *skb, u8 *dmac, u8 dsap)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ ev->saddr.lsap = sap->laddr.lsap;
+ ev->daddr.lsap = dsap;
+ memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN);
+ memcpy(ev->daddr.mac, dmac, IFHWADDRLEN);
+
+ ev->type = LLC_SAP_EV_TYPE_PRIM;
+ ev->prim = LLC_TEST_PRIM;
+ ev->prim_type = LLC_PRIM_TYPE_REQ;
+ llc_sap_state_process(sap, skb);
+}
+
+/**
+ * llc_build_and_send_xid_pkt - XID interface for upper layers
+ * @sap: sap to use
+ * @skb: packet to send
+ * @dmac: destination mac address
+ * @dsap: destination sap
+ *
+ * This function is called when upper layer wants to send a XID pdu.
+ * Returns 0 for success, 1 otherwise.
+ */
+void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb,
+ u8 *dmac, u8 dsap)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ ev->saddr.lsap = sap->laddr.lsap;
+ ev->daddr.lsap = dsap;
+ memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN);
+ memcpy(ev->daddr.mac, dmac, IFHWADDRLEN);
+
+ ev->type = LLC_SAP_EV_TYPE_PRIM;
+ ev->prim = LLC_XID_PRIM;
+ ev->prim_type = LLC_PRIM_TYPE_REQ;
+ llc_sap_state_process(sap, skb);
+}
+
+/**
+ * llc_sap_rcv - sends received pdus to the sap state machine
+ * @sap: current sap component structure.
+ * @skb: received frame.
+ *
+ * Sends received pdus to the sap state machine.
+ */
+static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
+ struct sock *sk)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+ ev->type = LLC_SAP_EV_TYPE_PDU;
+ ev->reason = 0;
+ skb->sk = sk;
+ llc_sap_state_process(sap, skb);
+}
+
+static inline bool llc_dgram_match(const struct llc_sap *sap,
+ const struct llc_addr *laddr,
+ const struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ return sk->sk_type == SOCK_DGRAM &&
+ llc->laddr.lsap == laddr->lsap &&
+ ether_addr_equal(llc->laddr.mac, laddr->mac);
+}
+
+/**
+ * llc_lookup_dgram - Finds dgram socket for the local sap/mac
+ * @sap: SAP
+ * @laddr: address of local LLC (MAC + SAP)
+ *
+ * Search socket list of the SAP and finds connection using the local
+ * mac, and local sap. Returns pointer for socket found, %NULL otherwise.
+ */
+static struct sock *llc_lookup_dgram(struct llc_sap *sap,
+ const struct llc_addr *laddr)
+{
+ struct sock *rc;
+ struct hlist_nulls_node *node;
+ int slot = llc_sk_laddr_hashfn(sap, laddr);
+ struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+ rcu_read_lock_bh();
+again:
+ sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+ if (llc_dgram_match(sap, laddr, rc)) {
+ /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+ goto again;
+ if (unlikely(llc_sk(rc)->sap != sap ||
+ !llc_dgram_match(sap, laddr, rc))) {
+ sock_put(rc);
+ continue;
+ }
+ goto found;
+ }
+ }
+ rc = NULL;
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (unlikely(get_nulls_value(node) != slot))
+ goto again;
+found:
+ rcu_read_unlock_bh();
+ return rc;
+}
+
+static inline bool llc_mcast_match(const struct llc_sap *sap,
+ const struct llc_addr *laddr,
+ const struct sk_buff *skb,
+ const struct sock *sk)
+{
+ struct llc_sock *llc = llc_sk(sk);
+
+ return sk->sk_type == SOCK_DGRAM &&
+ llc->laddr.lsap == laddr->lsap &&
+ llc->dev == skb->dev;
+}
+
+static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb,
+ struct sock **stack, int count)
+{
+ struct sk_buff *skb1;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ skb1 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb1) {
+ sock_put(stack[i]);
+ continue;
+ }
+
+ llc_sap_rcv(sap, skb1, stack[i]);
+ sock_put(stack[i]);
+ }
+}
+
+/**
+ * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
+ * @sap: SAP
+ * @laddr: address of local LLC (MAC + SAP)
+ *
+ * Search socket list of the SAP and finds connections with same sap.
+ * Deliver clone to each.
+ */
+static void llc_sap_mcast(struct llc_sap *sap,
+ const struct llc_addr *laddr,
+ struct sk_buff *skb)
+{
+ int i = 0, count = 256 / sizeof(struct sock *);
+ struct sock *sk, *stack[count];
+ struct llc_sock *llc;
+ struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex);
+
+ spin_lock_bh(&sap->sk_lock);
+ hlist_for_each_entry(llc, dev_hb, dev_hash_node) {
+
+ sk = &llc->sk;
+
+ if (!llc_mcast_match(sap, laddr, skb, sk))
+ continue;
+
+ sock_hold(sk);
+ if (i < count)
+ stack[i++] = sk;
+ else {
+ llc_do_mcast(sap, skb, stack, i);
+ i = 0;
+ }
+ }
+ spin_unlock_bh(&sap->sk_lock);
+
+ llc_do_mcast(sap, skb, stack, i);
+}
+
+
+void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
+{
+ struct llc_addr laddr;
+
+ llc_pdu_decode_da(skb, laddr.mac);
+ llc_pdu_decode_dsap(skb, &laddr.lsap);
+
+ if (is_multicast_ether_addr(laddr.mac)) {
+ llc_sap_mcast(sap, &laddr, skb);
+ kfree_skb(skb);
+ } else {
+ struct sock *sk = llc_lookup_dgram(sap, &laddr);
+ if (sk) {
+ llc_sap_rcv(sap, skb, sk);
+ sock_put(sk);
+ } else
+ kfree_skb(skb);
+ }
+}
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
new file mode 100644
index 000000000..204a8351e
--- /dev/null
+++ b/net/llc/llc_station.c
@@ -0,0 +1,125 @@
+/*
+ * llc_station.c - station component of LLC
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_sap.h>
+#include <net/llc_conn.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_st.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_st.h>
+#include <net/llc_pdu.h>
+
+static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && /* command PDU */
+ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
+ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID &&
+ !pdu->dsap ? 0 : 1; /* NULL DSAP value */
+}
+
+static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
+{
+ struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+ return LLC_PDU_IS_CMD(pdu) && /* command PDU */
+ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
+ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST &&
+ !pdu->dsap ? 0 : 1; /* NULL DSAP */
+}
+
+static int llc_station_ac_send_xid_r(struct sk_buff *skb)
+{
+ u8 mac_da[ETH_ALEN], dsap;
+ int rc = 1;
+ struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
+ sizeof(struct llc_xid_info));
+
+ if (!nskb)
+ goto out;
+ rc = 0;
+ llc_pdu_decode_sa(skb, mac_da);
+ llc_pdu_decode_ssap(skb, &dsap);
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
+ llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127);
+ rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
+ if (unlikely(rc))
+ goto free;
+ dev_queue_xmit(nskb);
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+static int llc_station_ac_send_test_r(struct sk_buff *skb)
+{
+ u8 mac_da[ETH_ALEN], dsap;
+ int rc = 1;
+ u32 data_size;
+ struct sk_buff *nskb;
+
+ /* The test request command is type U (llc_len = 3) */
+ data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
+ nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
+
+ if (!nskb)
+ goto out;
+ rc = 0;
+ llc_pdu_decode_sa(skb, mac_da);
+ llc_pdu_decode_ssap(skb, &dsap);
+ llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
+ llc_pdu_init_as_test_rsp(nskb, skb);
+ rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
+ if (unlikely(rc))
+ goto free;
+ dev_queue_xmit(nskb);
+out:
+ return rc;
+free:
+ kfree_skb(nskb);
+ goto out;
+}
+
+/**
+ * llc_station_rcv - send received pdu to the station state machine
+ * @skb: received frame.
+ *
+ * Sends data unit to station state machine.
+ */
+static void llc_station_rcv(struct sk_buff *skb)
+{
+ if (llc_stat_ev_rx_null_dsap_xid_c(skb))
+ llc_station_ac_send_xid_r(skb);
+ else if (llc_stat_ev_rx_null_dsap_test_c(skb))
+ llc_station_ac_send_test_r(skb);
+ kfree_skb(skb);
+}
+
+void __init llc_station_init(void)
+{
+ llc_set_station_handler(llc_station_rcv);
+}
+
+void llc_station_exit(void)
+{
+ llc_set_station_handler(NULL);
+}
diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c
new file mode 100644
index 000000000..799bafc2a
--- /dev/null
+++ b/net/llc/sysctl_net_llc.c
@@ -0,0 +1,78 @@
+/*
+ * sysctl_net_llc.c: sysctl interface to LLC net subsystem.
+ *
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+static struct ctl_table llc2_timeout_table[] = {
+ {
+ .procname = "ack",
+ .data = &sysctl_llc2_ack_timeout,
+ .maxlen = sizeof(sysctl_llc2_ack_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "busy",
+ .data = &sysctl_llc2_busy_timeout,
+ .maxlen = sizeof(sysctl_llc2_busy_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "p",
+ .data = &sysctl_llc2_p_timeout,
+ .maxlen = sizeof(sysctl_llc2_p_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "rej",
+ .data = &sysctl_llc2_rej_timeout,
+ .maxlen = sizeof(sysctl_llc2_rej_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { },
+};
+
+static struct ctl_table llc_station_table[] = {
+ { },
+};
+
+static struct ctl_table_header *llc2_timeout_header;
+static struct ctl_table_header *llc_station_header;
+
+int __init llc_sysctl_init(void)
+{
+ llc2_timeout_header = register_net_sysctl(&init_net, "net/llc/llc2/timeout", llc2_timeout_table);
+ llc_station_header = register_net_sysctl(&init_net, "net/llc/station", llc_station_table);
+
+ if (!llc2_timeout_header || !llc_station_header) {
+ llc_sysctl_exit();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void llc_sysctl_exit(void)
+{
+ if (llc2_timeout_header) {
+ unregister_net_sysctl_table(llc2_timeout_header);
+ llc2_timeout_header = NULL;
+ }
+ if (llc_station_header) {
+ unregister_net_sysctl_table(llc_station_header);
+ llc_station_header = NULL;
+ }
+}
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
new file mode 100644
index 000000000..64a012a0c
--- /dev/null
+++ b/net/mac80211/Kconfig
@@ -0,0 +1,307 @@
+config MAC80211
+ tristate "Generic IEEE 802.11 Networking Stack (mac80211)"
+ depends on CFG80211
+ select CRYPTO
+ select CRYPTO_ARC4
+ select CRYPTO_AES
+ select CRYPTO_CCM
+ select CRYPTO_GCM
+ select CRC32
+ select AVERAGE
+ ---help---
+ This option enables the hardware independent IEEE 802.11
+ networking stack.
+
+comment "CFG80211 needs to be enabled for MAC80211"
+ depends on CFG80211=n
+
+if MAC80211 != n
+
+config MAC80211_HAS_RC
+ bool
+
+config MAC80211_RC_MINSTREL
+ bool "Minstrel" if EXPERT
+ select MAC80211_HAS_RC
+ default y
+ ---help---
+ This option enables the 'minstrel' TX rate control algorithm
+
+config MAC80211_RC_MINSTREL_HT
+ bool "Minstrel 802.11n support" if EXPERT
+ depends on MAC80211_RC_MINSTREL
+ default y
+ ---help---
+ This option enables the 'minstrel_ht' TX rate control algorithm
+
+config MAC80211_RC_MINSTREL_VHT
+ bool "Minstrel 802.11ac support" if EXPERT
+ depends on MAC80211_RC_MINSTREL_HT
+ default n
+ ---help---
+ This option enables VHT in the 'minstrel_ht' TX rate control algorithm
+
+choice
+ prompt "Default rate control algorithm"
+ depends on MAC80211_HAS_RC
+ default MAC80211_RC_DEFAULT_MINSTREL
+ ---help---
+ This option selects the default rate control algorithm
+ mac80211 will use. Note that this default can still be
+ overridden through the ieee80211_default_rc_algo module
+ parameter if different algorithms are available.
+
+config MAC80211_RC_DEFAULT_MINSTREL
+ bool "Minstrel"
+ depends on MAC80211_RC_MINSTREL
+ ---help---
+ Select Minstrel as the default rate control algorithm.
+
+
+endchoice
+
+config MAC80211_RC_DEFAULT
+ string
+ default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT
+ default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
+ default ""
+
+endif
+
+comment "Some wireless drivers require a rate control algorithm"
+ depends on MAC80211 && MAC80211_HAS_RC=n
+
+config MAC80211_MESH
+ bool "Enable mac80211 mesh networking (pre-802.11s) support"
+ depends on MAC80211
+ ---help---
+ This options enables support of Draft 802.11s mesh networking.
+ The implementation is based on Draft 2.08 of the Mesh Networking
+ amendment. However, no compliance with that draft is claimed or even
+ possible, as drafts leave a number of identifiers to be defined after
+ ratification. For more information visit http://o11s.org/.
+
+config MAC80211_LEDS
+ bool "Enable LED triggers"
+ depends on MAC80211
+ depends on LEDS_CLASS
+ select LEDS_TRIGGERS
+ ---help---
+ This option enables a few LED triggers for different
+ packet receive/transmit events.
+
+config MAC80211_DEBUGFS
+ bool "Export mac80211 internals in DebugFS"
+ depends on MAC80211 && DEBUG_FS
+ ---help---
+ Select this to see extensive information about
+ the internal state of mac80211 in debugfs.
+
+ Say N unless you know you need this.
+
+config MAC80211_MESSAGE_TRACING
+ bool "Trace all mac80211 debug messages"
+ depends on MAC80211
+ ---help---
+ Select this option to have mac80211 register the
+ mac80211_msg trace subsystem with tracepoints to
+ collect all debugging messages, independent of
+ printing them into the kernel log.
+
+ The overhead in this option is that all the messages
+ need to be present in the binary and formatted at
+ runtime for tracing.
+
+menuconfig MAC80211_DEBUG_MENU
+ bool "Select mac80211 debugging features"
+ depends on MAC80211
+ ---help---
+ This option collects various mac80211 debug settings.
+
+config MAC80211_NOINLINE
+ bool "Do not inline TX/RX handlers"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ This option affects code generation in mac80211, when
+ selected some functions are marked "noinline" to allow
+ easier debugging of problems in the transmit and receive
+ paths.
+
+ This option increases code size a bit and inserts a lot
+ of function calls in the code, but is otherwise safe to
+ enable.
+
+ If unsure, say N unless you expect to be finding problems
+ in mac80211.
+
+config MAC80211_VERBOSE_DEBUG
+ bool "Verbose debugging output"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out
+ many debugging messages. It should not be selected
+ on production systems as some of the messages are
+ remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_MLME_DEBUG
+ bool "Verbose managed MLME output"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out
+ debugging messages for the managed-mode MLME. It
+ should not be selected on production systems as some
+ of the messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_STA_DEBUG
+ bool "Verbose station debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out
+ debugging messages for station addition/removal.
+
+ Do not select this option.
+
+config MAC80211_HT_DEBUG
+ bool "Verbose HT debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ This option enables 802.11n High Throughput features
+ debug tracing output.
+
+ It should not be selected on production systems as some
+ of the messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_OCB_DEBUG
+ bool "Verbose OCB debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out
+ very verbose OCB debugging messages. It should not
+ be selected on production systems as those messages
+ are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_IBSS_DEBUG
+ bool "Verbose IBSS debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out
+ very verbose IBSS debugging messages. It should not
+ be selected on production systems as those messages
+ are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_PS_DEBUG
+ bool "Verbose powersave mode debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out very
+ verbose power save mode debugging messages (when mac80211
+ is an AP and has power saving stations.)
+ It should not be selected on production systems as those
+ messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_MPL_DEBUG
+ bool "Verbose mesh peer link debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very
+ verbose mesh peer link debugging messages (when mac80211
+ is taking part in a mesh network).
+ It should not be selected on production systems as those
+ messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_MPATH_DEBUG
+ bool "Verbose mesh path debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very
+ verbose mesh path selection debugging messages (when mac80211
+ is taking part in a mesh network).
+ It should not be selected on production systems as those
+ messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_MHWMP_DEBUG
+ bool "Verbose mesh HWMP routing debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very
+ verbose mesh routing (HWMP) debugging messages (when mac80211
+ is taking part in a mesh network).
+ It should not be selected on production systems as those
+ messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_MESH_SYNC_DEBUG
+ bool "Verbose mesh synchronization debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very verbose mesh
+ synchronization debugging messages (when mac80211 is taking part in a
+ mesh network).
+
+ Do not select this option.
+
+config MAC80211_MESH_CSA_DEBUG
+ bool "Verbose mesh channel switch debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very verbose mesh
+ channel switch debugging messages (when mac80211 is taking part in a
+ mesh network).
+
+ Do not select this option.
+
+config MAC80211_MESH_PS_DEBUG
+ bool "Verbose mesh powersave debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_MESH
+ ---help---
+ Selecting this option causes mac80211 to print out very verbose mesh
+ powersave debugging messages (when mac80211 is taking part in a
+ mesh network).
+
+ Do not select this option.
+
+config MAC80211_TDLS_DEBUG
+ bool "Verbose TDLS debugging"
+ depends on MAC80211_DEBUG_MENU
+ ---help---
+ Selecting this option causes mac80211 to print out very
+ verbose TDLS selection debugging messages (when mac80211
+ is a TDLS STA).
+ It should not be selected on production systems as those
+ messages are remotely triggerable.
+
+ Do not select this option.
+
+config MAC80211_DEBUG_COUNTERS
+ bool "Extra statistics for TX/RX debugging"
+ depends on MAC80211_DEBUG_MENU
+ depends on MAC80211_DEBUGFS
+ ---help---
+ Selecting this option causes mac80211 to keep additional
+ and very verbose statistics about TX and RX handler use
+ and show them in debugfs.
+
+ If unsure, say N.
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
new file mode 100644
index 000000000..3275f0188
--- /dev/null
+++ b/net/mac80211/Makefile
@@ -0,0 +1,63 @@
+obj-$(CONFIG_MAC80211) += mac80211.o
+
+# mac80211 objects
+mac80211-y := \
+ main.o status.o \
+ sta_info.o \
+ wep.o \
+ wpa.o \
+ scan.o offchannel.o \
+ ht.o agg-tx.o agg-rx.o \
+ vht.o \
+ ibss.o \
+ iface.o \
+ rate.o \
+ michael.o \
+ tkip.o \
+ aes_ccm.o \
+ aes_gcm.o \
+ aes_cmac.o \
+ aes_gmac.o \
+ cfg.o \
+ ethtool.o \
+ rx.o \
+ spectmgmt.o \
+ tx.o \
+ key.o \
+ util.o \
+ wme.o \
+ event.o \
+ chan.o \
+ trace.o mlme.o \
+ tdls.o \
+ ocb.o
+
+mac80211-$(CONFIG_MAC80211_LEDS) += led.o
+mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
+ debugfs.o \
+ debugfs_sta.o \
+ debugfs_netdev.o \
+ debugfs_key.o
+
+mac80211-$(CONFIG_MAC80211_MESH) += \
+ mesh.o \
+ mesh_pathtbl.o \
+ mesh_plink.o \
+ mesh_hwmp.o \
+ mesh_sync.o \
+ mesh_ps.o
+
+mac80211-$(CONFIG_PM) += pm.o
+
+CFLAGS_trace.o := -I$(src)
+
+rc80211_minstrel-y := rc80211_minstrel.o
+rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
+
+rc80211_minstrel_ht-y := rc80211_minstrel_ht.o
+rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
+
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
+
+ccflags-y += -D__CHECK_ENDIAN__ -DDEBUG
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
new file mode 100644
index 000000000..208df7c0b
--- /dev/null
+++ b/net/mac80211/aes_ccm.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2003-2004, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ *
+ * Rewrite: Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/aes.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_ccm.h"
+
+void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len)
+{
+ struct scatterlist assoc, pt, ct[2];
+
+ char aead_req_data[sizeof(struct aead_request) +
+ crypto_aead_reqsize(tfm)]
+ __aligned(__alignof__(struct aead_request));
+ struct aead_request *aead_req = (void *) aead_req_data;
+
+ memset(aead_req, 0, sizeof(aead_req_data));
+
+ sg_init_one(&pt, data, data_len);
+ sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_init_table(ct, 2);
+ sg_set_buf(&ct[0], data, data_len);
+ sg_set_buf(&ct[1], mic, mic_len);
+
+ aead_request_set_tfm(aead_req, tfm);
+ aead_request_set_assoc(aead_req, &assoc, assoc.length);
+ aead_request_set_crypt(aead_req, &pt, ct, data_len, b_0);
+
+ crypto_aead_encrypt(aead_req);
+}
+
+int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len)
+{
+ struct scatterlist assoc, pt, ct[2];
+ char aead_req_data[sizeof(struct aead_request) +
+ crypto_aead_reqsize(tfm)]
+ __aligned(__alignof__(struct aead_request));
+ struct aead_request *aead_req = (void *) aead_req_data;
+
+ if (data_len == 0)
+ return -EINVAL;
+
+ memset(aead_req, 0, sizeof(aead_req_data));
+
+ sg_init_one(&pt, data, data_len);
+ sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_init_table(ct, 2);
+ sg_set_buf(&ct[0], data, data_len);
+ sg_set_buf(&ct[1], mic, mic_len);
+
+ aead_request_set_tfm(aead_req, tfm);
+ aead_request_set_assoc(aead_req, &assoc, assoc.length);
+ aead_request_set_crypt(aead_req, ct, &pt, data_len + mic_len, b_0);
+
+ return crypto_aead_decrypt(aead_req);
+}
+
+struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[],
+ size_t key_len,
+ size_t mic_len)
+{
+ struct crypto_aead *tfm;
+ int err;
+
+ tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return tfm;
+
+ err = crypto_aead_setkey(tfm, key, key_len);
+ if (err)
+ goto free_aead;
+ err = crypto_aead_setauthsize(tfm, mic_len);
+ if (err)
+ goto free_aead;
+
+ return tfm;
+
+free_aead:
+ crypto_free_aead(tfm);
+ return ERR_PTR(err);
+}
+
+void ieee80211_aes_key_free(struct crypto_aead *tfm)
+{
+ crypto_free_aead(tfm);
+}
diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h
new file mode 100644
index 000000000..6a73d1e4d
--- /dev/null
+++ b/net/mac80211/aes_ccm.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2003-2004, Instant802 Networks, Inc.
+ * Copyright 2006, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_CCM_H
+#define AES_CCM_H
+
+#include <linux/crypto.h>
+
+struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[],
+ size_t key_len,
+ size_t mic_len);
+void ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len);
+int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic,
+ size_t mic_len);
+void ieee80211_aes_key_free(struct crypto_aead *tfm);
+
+#endif /* AES_CCM_H */
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
new file mode 100644
index 000000000..4192806be
--- /dev/null
+++ b/net/mac80211/aes_cmac.c
@@ -0,0 +1,164 @@
+/*
+ * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <crypto/aes.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_cmac.h"
+
+#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */
+#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
+#define AAD_LEN 20
+
+
+static void gf_mulx(u8 *pad)
+{
+ int i, carry;
+
+ carry = pad[0] & 0x80;
+ for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
+ pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
+ pad[AES_BLOCK_SIZE - 1] <<= 1;
+ if (carry)
+ pad[AES_BLOCK_SIZE - 1] ^= 0x87;
+}
+
+static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
+ const u8 *addr[], const size_t *len, u8 *mac,
+ size_t mac_len)
+{
+ u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE];
+ const u8 *pos, *end;
+ size_t i, e, left, total_len;
+
+ memset(cbc, 0, AES_BLOCK_SIZE);
+
+ total_len = 0;
+ for (e = 0; e < num_elem; e++)
+ total_len += len[e];
+ left = total_len;
+
+ e = 0;
+ pos = addr[0];
+ end = pos + len[0];
+
+ while (left >= AES_BLOCK_SIZE) {
+ for (i = 0; i < AES_BLOCK_SIZE; i++) {
+ cbc[i] ^= *pos++;
+ if (pos >= end) {
+ e++;
+ pos = addr[e];
+ end = pos + len[e];
+ }
+ }
+ if (left > AES_BLOCK_SIZE)
+ crypto_cipher_encrypt_one(tfm, cbc, cbc);
+ left -= AES_BLOCK_SIZE;
+ }
+
+ memset(pad, 0, AES_BLOCK_SIZE);
+ crypto_cipher_encrypt_one(tfm, pad, pad);
+ gf_mulx(pad);
+
+ if (left || total_len == 0) {
+ for (i = 0; i < left; i++) {
+ cbc[i] ^= *pos++;
+ if (pos >= end) {
+ e++;
+ pos = addr[e];
+ end = pos + len[e];
+ }
+ }
+ cbc[left] ^= 0x80;
+ gf_mulx(pad);
+ }
+
+ for (i = 0; i < AES_BLOCK_SIZE; i++)
+ pad[i] ^= cbc[i];
+ crypto_cipher_encrypt_one(tfm, pad, pad);
+ memcpy(mac, pad, mac_len);
+}
+
+
+void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic)
+{
+ const u8 *addr[3];
+ size_t len[3];
+ u8 zero[CMAC_TLEN];
+
+ memset(zero, 0, CMAC_TLEN);
+ addr[0] = aad;
+ len[0] = AAD_LEN;
+ addr[1] = data;
+ len[1] = data_len - CMAC_TLEN;
+ addr[2] = zero;
+ len[2] = CMAC_TLEN;
+
+ aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN);
+}
+
+void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic)
+{
+ const u8 *addr[3];
+ size_t len[3];
+ u8 zero[CMAC_TLEN_256];
+
+ memset(zero, 0, CMAC_TLEN_256);
+ addr[0] = aad;
+ len[0] = AAD_LEN;
+ addr[1] = data;
+ len[1] = data_len - CMAC_TLEN_256;
+ addr[2] = zero;
+ len[2] = CMAC_TLEN_256;
+
+ aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256);
+}
+
+struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
+ size_t key_len)
+{
+ struct crypto_cipher *tfm;
+
+ tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ if (!IS_ERR(tfm))
+ crypto_cipher_setkey(tfm, key, key_len);
+
+ return tfm;
+}
+
+
+void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
+{
+ crypto_free_cipher(tfm);
+}
+
+void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf,
+ u8 *k1, u8 *k2)
+{
+ u8 l[AES_BLOCK_SIZE] = {};
+ struct ieee80211_key *key =
+ container_of(keyconf, struct ieee80211_key, conf);
+
+ crypto_cipher_encrypt_one(key->u.aes_cmac.tfm, l, l);
+
+ memcpy(k1, l, AES_BLOCK_SIZE);
+ gf_mulx(k1);
+
+ memcpy(k2, k1, AES_BLOCK_SIZE);
+ gf_mulx(k2);
+}
+EXPORT_SYMBOL(ieee80211_aes_cmac_calculate_k1_k2);
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
new file mode 100644
index 000000000..3702041f4
--- /dev/null
+++ b/net/mac80211/aes_cmac.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_CMAC_H
+#define AES_CMAC_H
+
+#include <linux/crypto.h>
+
+struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[],
+ size_t key_len);
+void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic);
+void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic);
+void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm);
+
+#endif /* AES_CMAC_H */
diff --git a/net/mac80211/aes_gcm.c b/net/mac80211/aes_gcm.c
new file mode 100644
index 000000000..fd278bbe1
--- /dev/null
+++ b/net/mac80211/aes_gcm.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2014-2015, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/aes.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_gcm.h"
+
+void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic)
+{
+ struct scatterlist assoc, pt, ct[2];
+
+ char aead_req_data[sizeof(struct aead_request) +
+ crypto_aead_reqsize(tfm)]
+ __aligned(__alignof__(struct aead_request));
+ struct aead_request *aead_req = (void *)aead_req_data;
+
+ memset(aead_req, 0, sizeof(aead_req_data));
+
+ sg_init_one(&pt, data, data_len);
+ sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_init_table(ct, 2);
+ sg_set_buf(&ct[0], data, data_len);
+ sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN);
+
+ aead_request_set_tfm(aead_req, tfm);
+ aead_request_set_assoc(aead_req, &assoc, assoc.length);
+ aead_request_set_crypt(aead_req, &pt, ct, data_len, j_0);
+
+ crypto_aead_encrypt(aead_req);
+}
+
+int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic)
+{
+ struct scatterlist assoc, pt, ct[2];
+ char aead_req_data[sizeof(struct aead_request) +
+ crypto_aead_reqsize(tfm)]
+ __aligned(__alignof__(struct aead_request));
+ struct aead_request *aead_req = (void *)aead_req_data;
+
+ if (data_len == 0)
+ return -EINVAL;
+
+ memset(aead_req, 0, sizeof(aead_req_data));
+
+ sg_init_one(&pt, data, data_len);
+ sg_init_one(&assoc, &aad[2], be16_to_cpup((__be16 *)aad));
+ sg_init_table(ct, 2);
+ sg_set_buf(&ct[0], data, data_len);
+ sg_set_buf(&ct[1], mic, IEEE80211_GCMP_MIC_LEN);
+
+ aead_request_set_tfm(aead_req, tfm);
+ aead_request_set_assoc(aead_req, &assoc, assoc.length);
+ aead_request_set_crypt(aead_req, ct, &pt,
+ data_len + IEEE80211_GCMP_MIC_LEN, j_0);
+
+ return crypto_aead_decrypt(aead_req);
+}
+
+struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[],
+ size_t key_len)
+{
+ struct crypto_aead *tfm;
+ int err;
+
+ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return tfm;
+
+ err = crypto_aead_setkey(tfm, key, key_len);
+ if (err)
+ goto free_aead;
+ err = crypto_aead_setauthsize(tfm, IEEE80211_GCMP_MIC_LEN);
+ if (err)
+ goto free_aead;
+
+ return tfm;
+
+free_aead:
+ crypto_free_aead(tfm);
+ return ERR_PTR(err);
+}
+
+void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm)
+{
+ crypto_free_aead(tfm);
+}
diff --git a/net/mac80211/aes_gcm.h b/net/mac80211/aes_gcm.h
new file mode 100644
index 000000000..1347fda6b
--- /dev/null
+++ b/net/mac80211/aes_gcm.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2014-2015, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_GCM_H
+#define AES_GCM_H
+
+#include <linux/crypto.h>
+
+void ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic);
+int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad,
+ u8 *data, size_t data_len, u8 *mic);
+struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[],
+ size_t key_len);
+void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm);
+
+#endif /* AES_GCM_H */
diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c
new file mode 100644
index 000000000..f1321b7d6
--- /dev/null
+++ b/net/mac80211/aes_gmac.c
@@ -0,0 +1,84 @@
+/*
+ * AES-GMAC for IEEE 802.11 BIP-GMAC-128 and BIP-GMAC-256
+ * Copyright 2015, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/aes.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_gmac.h"
+
+#define GMAC_MIC_LEN 16
+#define GMAC_NONCE_LEN 12
+#define AAD_LEN 20
+
+int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
+ const u8 *data, size_t data_len, u8 *mic)
+{
+ struct scatterlist sg[3], ct[1];
+ char aead_req_data[sizeof(struct aead_request) +
+ crypto_aead_reqsize(tfm)]
+ __aligned(__alignof__(struct aead_request));
+ struct aead_request *aead_req = (void *)aead_req_data;
+ u8 zero[GMAC_MIC_LEN], iv[AES_BLOCK_SIZE];
+
+ if (data_len < GMAC_MIC_LEN)
+ return -EINVAL;
+
+ memset(aead_req, 0, sizeof(aead_req_data));
+
+ memset(zero, 0, GMAC_MIC_LEN);
+ sg_init_table(sg, 3);
+ sg_set_buf(&sg[0], aad, AAD_LEN);
+ sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
+ sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
+
+ memcpy(iv, nonce, GMAC_NONCE_LEN);
+ memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN);
+ iv[AES_BLOCK_SIZE - 1] = 0x01;
+
+ sg_init_table(ct, 1);
+ sg_set_buf(&ct[0], mic, GMAC_MIC_LEN);
+
+ aead_request_set_tfm(aead_req, tfm);
+ aead_request_set_assoc(aead_req, sg, AAD_LEN + data_len);
+ aead_request_set_crypt(aead_req, NULL, ct, 0, iv);
+
+ crypto_aead_encrypt(aead_req);
+
+ return 0;
+}
+
+struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
+ size_t key_len)
+{
+ struct crypto_aead *tfm;
+ int err;
+
+ tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return tfm;
+
+ err = crypto_aead_setkey(tfm, key, key_len);
+ if (!err)
+ err = crypto_aead_setauthsize(tfm, GMAC_MIC_LEN);
+ if (!err)
+ return tfm;
+
+ crypto_free_aead(tfm);
+ return ERR_PTR(err);
+}
+
+void ieee80211_aes_gmac_key_free(struct crypto_aead *tfm)
+{
+ crypto_free_aead(tfm);
+}
diff --git a/net/mac80211/aes_gmac.h b/net/mac80211/aes_gmac.h
new file mode 100644
index 000000000..d328204d7
--- /dev/null
+++ b/net/mac80211/aes_gmac.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2015, Qualcomm Atheros, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_GMAC_H
+#define AES_GMAC_H
+
+#include <linux/crypto.h>
+
+struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
+ size_t key_len);
+int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
+ const u8 *data, size_t data_len, u8 *mic);
+void ieee80211_aes_gmac_key_free(struct crypto_aead *tfm);
+
+#endif /* AES_GMAC_H */
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
new file mode 100644
index 000000000..5c564a68f
--- /dev/null
+++ b/net/mac80211/agg-rx.c
@@ -0,0 +1,426 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/**
+ * DOC: RX A-MPDU aggregation
+ *
+ * Aggregation on the RX side requires only implementing the
+ * @ampdu_action callback that is invoked to start/stop any
+ * block-ack sessions for RX aggregation.
+ *
+ * When RX aggregation is started by the peer, the driver is
+ * notified via @ampdu_action function, with the
+ * %IEEE80211_AMPDU_RX_START action, and may reject the request
+ * in which case a negative response is sent to the peer, if it
+ * accepts it a positive response is sent.
+ *
+ * While the session is active, the device/driver are required
+ * to de-aggregate frames and pass them up one by one to mac80211,
+ * which will handle the reorder buffer.
+ *
+ * When the aggregation session is stopped again by the peer or
+ * ourselves, the driver's @ampdu_action function will be called
+ * with the action %IEEE80211_AMPDU_RX_STOP. In this case, the
+ * call must not fail.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+
+static void ieee80211_free_tid_rx(struct rcu_head *h)
+{
+ struct tid_ampdu_rx *tid_rx =
+ container_of(h, struct tid_ampdu_rx, rcu_head);
+ int i;
+
+ for (i = 0; i < tid_rx->buf_size; i++)
+ __skb_queue_purge(&tid_rx->reorder_buf[i]);
+ kfree(tid_rx->reorder_buf);
+ kfree(tid_rx->reorder_time);
+ kfree(tid_rx);
+}
+
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+ u16 initiator, u16 reason, bool tx)
+{
+ struct ieee80211_local *local = sta->local;
+ struct tid_ampdu_rx *tid_rx;
+
+ lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+ tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
+ lockdep_is_held(&sta->ampdu_mlme.mtx));
+
+ if (!tid_rx)
+ return;
+
+ RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL);
+
+ ht_dbg(sta->sdata,
+ "Rx BA session stop requested for %pM tid %u %s reason: %d\n",
+ sta->sta.addr, tid,
+ initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator",
+ (int)reason);
+
+ if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
+ &sta->sta, tid, NULL, 0))
+ sdata_info(sta->sdata,
+ "HW problem - can not stop rx aggregation for %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ /* check if this is a self generated aggregation halt */
+ if (initiator == WLAN_BACK_RECIPIENT && tx)
+ ieee80211_send_delba(sta->sdata, sta->sta.addr,
+ tid, WLAN_BACK_RECIPIENT, reason);
+
+ del_timer_sync(&tid_rx->session_timer);
+
+ /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
+ spin_lock_bh(&tid_rx->reorder_lock);
+ tid_rx->removed = true;
+ spin_unlock_bh(&tid_rx->reorder_lock);
+ del_timer_sync(&tid_rx->reorder_timer);
+
+ call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
+}
+
+void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+ u16 initiator, u16 reason, bool tx)
+{
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+}
+
+void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
+ const u8 *addr)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct sta_info *sta;
+ int i;
+
+ rcu_read_lock();
+ sta = sta_info_get_bss(sdata, addr);
+ if (!sta) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ if (ba_rx_bitmap & BIT(i))
+ set_bit(i, sta->ampdu_mlme.tid_rx_stop_requested);
+
+ ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_stop_rx_ba_session);
+
+/*
+ * After accepting the AddBA Request we activated a timer,
+ * resetting it after each frame that arrives from the originator.
+ */
+static void sta_rx_agg_session_timer_expired(unsigned long data)
+{
+ /* not an elegant detour, but there is no choice as the timer passes
+ * only one argument, and various sta_info are needed here, so init
+ * flow in sta_info_create gives the TID as data, while the timer_to_id
+ * array gives the sta through container_of */
+ u8 *ptid = (u8 *)data;
+ u8 *timer_to_id = ptid - *ptid;
+ struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+ timer_to_tid[0]);
+ struct tid_ampdu_rx *tid_rx;
+ unsigned long timeout;
+
+ rcu_read_lock();
+ tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[*ptid]);
+ if (!tid_rx) {
+ rcu_read_unlock();
+ return;
+ }
+
+ timeout = tid_rx->last_rx + TU_TO_JIFFIES(tid_rx->timeout);
+ if (time_is_after_jiffies(timeout)) {
+ mod_timer(&tid_rx->session_timer, timeout);
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ ht_dbg(sta->sdata, "RX session timer expired on %pM tid %d\n",
+ sta->sta.addr, (u16)*ptid);
+
+ set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired);
+ ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+}
+
+static void sta_rx_agg_reorder_timer_expired(unsigned long data)
+{
+ u8 *ptid = (u8 *)data;
+ u8 *timer_to_id = ptid - *ptid;
+ struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+ timer_to_tid[0]);
+
+ rcu_read_lock();
+ ieee80211_release_reorder_timeout(sta, *ptid);
+ rcu_read_unlock();
+}
+
+static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+ u8 dialog_token, u16 status, u16 policy,
+ u16 buf_size, u16 timeout)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u16 capab;
+
+ skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
+
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
+ mgmt->u.action.category = WLAN_CATEGORY_BACK;
+ mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
+ mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
+
+ capab = (u16)(policy << 1); /* bit 1 aggregation policy */
+ capab |= (u16)(tid << 2); /* bit 5:2 TID number */
+ capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */
+
+ mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
+ mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
+ mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+ u8 dialog_token, u16 timeout,
+ u16 start_seq_num, u16 ba_policy, u16 tid,
+ u16 buf_size, bool tx, bool auto_seq)
+{
+ struct ieee80211_local *local = sta->sdata->local;
+ struct tid_ampdu_rx *tid_agg_rx;
+ int i, ret = -EOPNOTSUPP;
+ u16 status = WLAN_STATUS_REQUEST_DECLINED;
+
+ if (!sta->sta.ht_cap.ht_supported) {
+ ht_dbg(sta->sdata,
+ "STA %pM erroneously requests BA session on tid %d w/o QoS\n",
+ sta->sta.addr, tid);
+ /* send a response anyway, it's an error case if we get here */
+ goto end_no_lock;
+ }
+
+ if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) {
+ ht_dbg(sta->sdata,
+ "Suspend in progress - Denying ADDBA request (%pM tid %d)\n",
+ sta->sta.addr, tid);
+ goto end_no_lock;
+ }
+
+ /* sanity check for incoming parameters:
+ * check if configuration can support the BA policy
+ * and if buffer size does not exceeds max value */
+ /* XXX: check own ht delayed BA capability?? */
+ if (((ba_policy != 1) &&
+ (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
+ (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+ status = WLAN_STATUS_INVALID_QOS_PARAM;
+ ht_dbg_ratelimited(sta->sdata,
+ "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n",
+ sta->sta.addr, tid, ba_policy, buf_size);
+ goto end_no_lock;
+ }
+ /* determine default buffer size */
+ if (buf_size == 0)
+ buf_size = IEEE80211_MAX_AMPDU_BUF;
+
+ /* make sure the size doesn't exceed the maximum supported by the hw */
+ if (buf_size > local->hw.max_rx_aggregation_subframes)
+ buf_size = local->hw.max_rx_aggregation_subframes;
+
+ /* examine state machine */
+ mutex_lock(&sta->ampdu_mlme.mtx);
+
+ if (sta->ampdu_mlme.tid_rx[tid]) {
+ ht_dbg_ratelimited(sta->sdata,
+ "unexpected AddBA Req from %pM on tid %u\n",
+ sta->sta.addr, tid);
+
+ /* delete existing Rx BA session on the same tid */
+ ___ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+ WLAN_STATUS_UNSPECIFIED_QOS,
+ false);
+ }
+
+ /* prepare A-MPDU MLME for Rx aggregation */
+ tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+ if (!tid_agg_rx)
+ goto end;
+
+ spin_lock_init(&tid_agg_rx->reorder_lock);
+
+ /* rx timer */
+ tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired;
+ tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+ init_timer_deferrable(&tid_agg_rx->session_timer);
+
+ /* rx reorder timer */
+ tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired;
+ tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+ init_timer(&tid_agg_rx->reorder_timer);
+
+ /* prepare reordering buffer */
+ tid_agg_rx->reorder_buf =
+ kcalloc(buf_size, sizeof(struct sk_buff_head), GFP_KERNEL);
+ tid_agg_rx->reorder_time =
+ kcalloc(buf_size, sizeof(unsigned long), GFP_KERNEL);
+ if (!tid_agg_rx->reorder_buf || !tid_agg_rx->reorder_time) {
+ kfree(tid_agg_rx->reorder_buf);
+ kfree(tid_agg_rx->reorder_time);
+ kfree(tid_agg_rx);
+ goto end;
+ }
+
+ for (i = 0; i < buf_size; i++)
+ __skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
+
+ ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
+ &sta->sta, tid, &start_seq_num, 0);
+ ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
+ sta->sta.addr, tid, ret);
+ if (ret) {
+ kfree(tid_agg_rx->reorder_buf);
+ kfree(tid_agg_rx->reorder_time);
+ kfree(tid_agg_rx);
+ goto end;
+ }
+
+ /* update data */
+ tid_agg_rx->dialog_token = dialog_token;
+ tid_agg_rx->ssn = start_seq_num;
+ tid_agg_rx->head_seq_num = start_seq_num;
+ tid_agg_rx->buf_size = buf_size;
+ tid_agg_rx->timeout = timeout;
+ tid_agg_rx->stored_mpdu_num = 0;
+ tid_agg_rx->auto_seq = auto_seq;
+ status = WLAN_STATUS_SUCCESS;
+
+ /* activate it for RX */
+ rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], tid_agg_rx);
+
+ if (timeout) {
+ mod_timer(&tid_agg_rx->session_timer, TU_TO_EXP_TIME(timeout));
+ tid_agg_rx->last_rx = jiffies;
+ }
+
+end:
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+
+end_no_lock:
+ if (tx)
+ ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
+ dialog_token, status, 1, buf_size,
+ timeout);
+}
+
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num;
+ u8 dialog_token;
+
+ /* extract session parameters from addba request frame */
+ dialog_token = mgmt->u.action.u.addba_req.dialog_token;
+ timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout);
+ start_seq_num =
+ le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4;
+
+ capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab);
+ ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1;
+ tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+ buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+
+ __ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
+ start_seq_num, ba_policy, tid,
+ buf_size, true, false);
+}
+
+void ieee80211_start_rx_ba_session_offl(struct ieee80211_vif *vif,
+ const u8 *addr, u16 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_rx_agg *rx_agg;
+ struct sk_buff *skb = dev_alloc_skb(0);
+
+ if (unlikely(!skb))
+ return;
+
+ rx_agg = (struct ieee80211_rx_agg *) &skb->cb;
+ memcpy(&rx_agg->addr, addr, ETH_ALEN);
+ rx_agg->tid = tid;
+
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_RX_AGG_START;
+ skb_queue_tail(&sdata->skb_queue, skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_start_rx_ba_session_offl);
+
+void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif,
+ const u8 *addr, u16 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_rx_agg *rx_agg;
+ struct sk_buff *skb = dev_alloc_skb(0);
+
+ if (unlikely(!skb))
+ return;
+
+ rx_agg = (struct ieee80211_rx_agg *) &skb->cb;
+ memcpy(&rx_agg->addr, addr, ETH_ALEN);
+ rx_agg->tid = tid;
+
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_RX_AGG_STOP;
+ skb_queue_tail(&sdata->skb_queue, skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_stop_rx_ba_session_offl);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
new file mode 100644
index 000000000..cce9d425c
--- /dev/null
+++ b/net/mac80211/agg-tx.c
@@ -0,0 +1,989 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "wme.h"
+
+/**
+ * DOC: TX A-MPDU aggregation
+ *
+ * Aggregation on the TX side requires setting the hardware flag
+ * %IEEE80211_HW_AMPDU_AGGREGATION. The driver will then be handed
+ * packets with a flag indicating A-MPDU aggregation. The driver
+ * or device is responsible for actually aggregating the frames,
+ * as well as deciding how many and which to aggregate.
+ *
+ * When TX aggregation is started by some subsystem (usually the rate
+ * control algorithm would be appropriate) by calling the
+ * ieee80211_start_tx_ba_session() function, the driver will be
+ * notified via its @ampdu_action function, with the
+ * %IEEE80211_AMPDU_TX_START action.
+ *
+ * In response to that, the driver is later required to call the
+ * ieee80211_start_tx_ba_cb_irqsafe() function, which will really
+ * start the aggregation session after the peer has also responded.
+ * If the peer responds negatively, the session will be stopped
+ * again right away. Note that it is possible for the aggregation
+ * session to be stopped before the driver has indicated that it
+ * is done setting it up, in which case it must not indicate the
+ * setup completion.
+ *
+ * Also note that, since we also need to wait for a response from
+ * the peer, the driver is notified of the completion of the
+ * handshake by the %IEEE80211_AMPDU_TX_OPERATIONAL action to the
+ * @ampdu_action callback.
+ *
+ * Similarly, when the aggregation session is stopped by the peer
+ * or something calling ieee80211_stop_tx_ba_session(), the driver's
+ * @ampdu_action function will be called with the action
+ * %IEEE80211_AMPDU_TX_STOP. In this case, the call must not fail,
+ * and the driver must later call ieee80211_stop_tx_ba_cb_irqsafe().
+ * Note that the sta can get destroyed before the BA tear down is
+ * complete.
+ */
+
+static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
+ const u8 *da, u16 tid,
+ u8 dialog_token, u16 start_seq_num,
+ u16 agg_size, u16 timeout)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u16 capab;
+
+ skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
+
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
+
+ mgmt->u.action.category = WLAN_CATEGORY_BACK;
+ mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
+
+ mgmt->u.action.u.addba_req.dialog_token = dialog_token;
+ capab = (u16)(1 << 1); /* bit 1 aggregation policy */
+ capab |= (u16)(tid << 2); /* bit 5:2 TID number */
+ capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */
+
+ mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
+
+ mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout);
+ mgmt->u.action.u.addba_req.start_seq_num =
+ cpu_to_le16(start_seq_num << 4);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_bar *bar;
+ u16 bar_control = 0;
+
+ skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar));
+ memset(bar, 0, sizeof(*bar));
+ bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+ IEEE80211_STYPE_BACK_REQ);
+ memcpy(bar->ra, ra, ETH_ALEN);
+ memcpy(bar->ta, sdata->vif.addr, ETH_ALEN);
+ bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
+ bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA;
+ bar_control |= (u16)(tid << IEEE80211_BAR_CTRL_TID_INFO_SHIFT);
+ bar->control = cpu_to_le16(bar_control);
+ bar->start_seq_num = cpu_to_le16(ssn);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+ ieee80211_tx_skb_tid(sdata, skb, tid);
+}
+EXPORT_SYMBOL(ieee80211_send_bar);
+
+void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx)
+{
+ lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_held(&sta->lock);
+ rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx);
+}
+
+/*
+ * When multiple aggregation sessions on multiple stations
+ * are being created/destroyed simultaneously, we need to
+ * refcount the global queue stop caused by that in order
+ * to not get into a situation where one of the aggregation
+ * setup or teardown re-enables queues before the other is
+ * ready to handle that.
+ *
+ * These two functions take care of this issue by keeping
+ * a global "agg_queue_stop" refcount.
+ */
+static void __acquires(agg_queue)
+ieee80211_stop_queue_agg(struct ieee80211_sub_if_data *sdata, int tid)
+{
+ int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)];
+
+ /* we do refcounting here, so don't use the queue reason refcounting */
+
+ if (atomic_inc_return(&sdata->local->agg_queue_stop[queue]) == 1)
+ ieee80211_stop_queue_by_reason(
+ &sdata->local->hw, queue,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
+ false);
+ __acquire(agg_queue);
+}
+
+static void __releases(agg_queue)
+ieee80211_wake_queue_agg(struct ieee80211_sub_if_data *sdata, int tid)
+{
+ int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)];
+
+ if (atomic_dec_return(&sdata->local->agg_queue_stop[queue]) == 0)
+ ieee80211_wake_queue_by_reason(
+ &sdata->local->hw, queue,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
+ false);
+ __release(agg_queue);
+}
+
+static void
+ieee80211_agg_stop_txq(struct sta_info *sta, int tid)
+{
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi;
+
+ if (!txq)
+ return;
+
+ txqi = to_txq_info(txq);
+
+ /* Lock here to protect against further seqno updates on dequeue */
+ spin_lock_bh(&txqi->queue.lock);
+ set_bit(IEEE80211_TXQ_STOP, &txqi->flags);
+ spin_unlock_bh(&txqi->queue.lock);
+}
+
+static void
+ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable)
+{
+ struct ieee80211_txq *txq = sta->sta.txq[tid];
+ struct txq_info *txqi;
+
+ if (!txq)
+ return;
+
+ txqi = to_txq_info(txq);
+
+ if (enable)
+ set_bit(IEEE80211_TXQ_AMPDU, &txqi->flags);
+ else
+ clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags);
+
+ clear_bit(IEEE80211_TXQ_STOP, &txqi->flags);
+ drv_wake_tx_queue(sta->sdata->local, txqi);
+}
+
+/*
+ * splice packets from the STA's pending to the local pending,
+ * requires a call to ieee80211_agg_splice_finish later
+ */
+static void __acquires(agg_queue)
+ieee80211_agg_splice_packets(struct ieee80211_sub_if_data *sdata,
+ struct tid_ampdu_tx *tid_tx, u16 tid)
+{
+ struct ieee80211_local *local = sdata->local;
+ int queue = sdata->vif.hw_queue[ieee80211_ac_from_tid(tid)];
+ unsigned long flags;
+
+ ieee80211_stop_queue_agg(sdata, tid);
+
+ if (WARN(!tid_tx,
+ "TID %d gone but expected when splicing aggregates from the pending queue\n",
+ tid))
+ return;
+
+ if (!skb_queue_empty(&tid_tx->pending)) {
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ /* copy over remaining packets */
+ skb_queue_splice_tail_init(&tid_tx->pending,
+ &local->pending[queue]);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ }
+}
+
+static void __releases(agg_queue)
+ieee80211_agg_splice_finish(struct ieee80211_sub_if_data *sdata, u16 tid)
+{
+ ieee80211_wake_queue_agg(sdata, tid);
+}
+
+static void ieee80211_remove_tid_tx(struct sta_info *sta, int tid)
+{
+ struct tid_ampdu_tx *tid_tx;
+
+ lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_held(&sta->lock);
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ /*
+ * When we get here, the TX path will not be lockless any more wrt.
+ * aggregation, since the OPERATIONAL bit has long been cleared.
+ * Thus it will block on getting the lock, if it occurs. So if we
+ * stop the queue now, we will not get any more packets, and any
+ * that might be being processed will wait for us here, thereby
+ * guaranteeing that no packets go to the tid_tx pending queue any
+ * more.
+ */
+
+ ieee80211_agg_splice_packets(sta->sdata, tid_tx, tid);
+
+ /* future packets must not find the tid_tx struct any more */
+ ieee80211_assign_tid_tx(sta, tid, NULL);
+
+ ieee80211_agg_splice_finish(sta->sdata, tid);
+ ieee80211_agg_start_txq(sta, tid, false);
+
+ kfree_rcu(tid_tx, rcu_head);
+}
+
+int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+ enum ieee80211_agg_stop_reason reason)
+{
+ struct ieee80211_local *local = sta->local;
+ struct tid_ampdu_tx *tid_tx;
+ enum ieee80211_ampdu_mlme_action action;
+ int ret;
+
+ lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+ switch (reason) {
+ case AGG_STOP_DECLINED:
+ case AGG_STOP_LOCAL_REQUEST:
+ case AGG_STOP_PEER_REQUEST:
+ action = IEEE80211_AMPDU_TX_STOP_CONT;
+ break;
+ case AGG_STOP_DESTROY_STA:
+ action = IEEE80211_AMPDU_TX_STOP_FLUSH;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&sta->lock);
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ if (!tid_tx) {
+ spin_unlock_bh(&sta->lock);
+ return -ENOENT;
+ }
+
+ /*
+ * if we're already stopping ignore any new requests to stop
+ * unless we're destroying it in which case notify the driver
+ */
+ if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ spin_unlock_bh(&sta->lock);
+ if (reason != AGG_STOP_DESTROY_STA)
+ return -EALREADY;
+ ret = drv_ampdu_action(local, sta->sdata,
+ IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
+ &sta->sta, tid, NULL, 0);
+ WARN_ON_ONCE(ret);
+ return 0;
+ }
+
+ if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+ /* not even started yet! */
+ ieee80211_assign_tid_tx(sta, tid, NULL);
+ spin_unlock_bh(&sta->lock);
+ kfree_rcu(tid_tx, rcu_head);
+ return 0;
+ }
+
+ set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
+
+ spin_unlock_bh(&sta->lock);
+
+ ht_dbg(sta->sdata, "Tx BA session stop requested for %pM tid %u\n",
+ sta->sta.addr, tid);
+
+ del_timer_sync(&tid_tx->addba_resp_timer);
+ del_timer_sync(&tid_tx->session_timer);
+
+ /*
+ * After this packets are no longer handed right through
+ * to the driver but are put onto tid_tx->pending instead,
+ * with locking to ensure proper access.
+ */
+ clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
+
+ /*
+ * There might be a few packets being processed right now (on
+ * another CPU) that have already gotten past the aggregation
+ * check when it was still OPERATIONAL and consequently have
+ * IEEE80211_TX_CTL_AMPDU set. In that case, this code might
+ * call into the driver at the same time or even before the
+ * TX paths calls into it, which could confuse the driver.
+ *
+ * Wait for all currently running TX paths to finish before
+ * telling the driver. New packets will not go through since
+ * the aggregation session is no longer OPERATIONAL.
+ */
+ synchronize_net();
+
+ tid_tx->stop_initiator = reason == AGG_STOP_PEER_REQUEST ?
+ WLAN_BACK_RECIPIENT :
+ WLAN_BACK_INITIATOR;
+ tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST;
+
+ ret = drv_ampdu_action(local, sta->sdata, action,
+ &sta->sta, tid, NULL, 0);
+
+ /* HW shall not deny going back to legacy */
+ if (WARN_ON(ret)) {
+ /*
+ * We may have pending packets get stuck in this case...
+ * Not bothering with a workaround for now.
+ */
+ }
+
+ /*
+ * In the case of AGG_STOP_DESTROY_STA, the driver won't
+ * necessarily call ieee80211_stop_tx_ba_cb(), so this may
+ * seem like we can leave the tid_tx data pending forever.
+ * This is true, in a way, but "forever" is only until the
+ * station struct is actually destroyed. In the meantime,
+ * leaving it around ensures that we don't transmit packets
+ * to the driver on this TID which might confuse it.
+ */
+
+ return 0;
+}
+
+/*
+ * After sending add Block Ack request we activated a timer until
+ * add Block Ack response will arrive from the recipient.
+ * If this timer expires sta_addba_resp_timer_expired will be executed.
+ */
+static void sta_addba_resp_timer_expired(unsigned long data)
+{
+ /* not an elegant detour, but there is no choice as the timer passes
+ * only one argument, and both sta_info and TID are needed, so init
+ * flow in sta_info_create gives the TID as data, while the timer_to_id
+ * array gives the sta through container_of */
+ u16 tid = *(u8 *)data;
+ struct sta_info *sta = container_of((void *)data,
+ struct sta_info, timer_to_tid[tid]);
+ struct tid_ampdu_tx *tid_tx;
+
+ /* check if the TID waits for addBA response */
+ rcu_read_lock();
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (!tid_tx ||
+ test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) {
+ rcu_read_unlock();
+ ht_dbg(sta->sdata,
+ "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n",
+ sta->sta.addr, tid);
+ return;
+ }
+
+ ht_dbg(sta->sdata, "addBA response timer expired on %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ ieee80211_stop_tx_ba_session(&sta->sta, tid);
+ rcu_read_unlock();
+}
+
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
+{
+ struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u16 start_seq_num;
+ int ret;
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ /*
+ * Start queuing up packets for this aggregation session.
+ * We're going to release them once the driver is OK with
+ * that.
+ */
+ clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
+
+ ieee80211_agg_stop_txq(sta, tid);
+
+ /*
+ * Make sure no packets are being processed. This ensures that
+ * we have a valid starting sequence number and that in-flight
+ * packets have been flushed out and no packets for this TID
+ * will go into the driver during the ampdu_action call.
+ */
+ synchronize_net();
+
+ start_seq_num = sta->tid_seq[tid] >> 4;
+
+ ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
+ &sta->sta, tid, &start_seq_num, 0);
+ if (ret) {
+ ht_dbg(sdata,
+ "BA request denied - HW unavailable for %pM tid %d\n",
+ sta->sta.addr, tid);
+ spin_lock_bh(&sta->lock);
+ ieee80211_agg_splice_packets(sdata, tid_tx, tid);
+ ieee80211_assign_tid_tx(sta, tid, NULL);
+ ieee80211_agg_splice_finish(sdata, tid);
+ spin_unlock_bh(&sta->lock);
+
+ ieee80211_agg_start_txq(sta, tid, false);
+
+ kfree_rcu(tid_tx, rcu_head);
+ return;
+ }
+
+ /* activate the timer for the recipient's addBA response */
+ mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
+ ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ spin_lock_bh(&sta->lock);
+ sta->ampdu_mlme.last_addba_req_time[tid] = jiffies;
+ sta->ampdu_mlme.addba_req_num[tid]++;
+ spin_unlock_bh(&sta->lock);
+
+ /* send AddBA request */
+ ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
+ tid_tx->dialog_token, start_seq_num,
+ local->hw.max_tx_aggregation_subframes,
+ tid_tx->timeout);
+}
+
+/*
+ * After accepting the AddBA Response we activated a timer,
+ * resetting it after each frame that we send.
+ */
+static void sta_tx_agg_session_timer_expired(unsigned long data)
+{
+ /* not an elegant detour, but there is no choice as the timer passes
+ * only one argument, and various sta_info are needed here, so init
+ * flow in sta_info_create gives the TID as data, while the timer_to_id
+ * array gives the sta through container_of */
+ u8 *ptid = (u8 *)data;
+ u8 *timer_to_id = ptid - *ptid;
+ struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+ timer_to_tid[0]);
+ struct tid_ampdu_tx *tid_tx;
+ unsigned long timeout;
+
+ rcu_read_lock();
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[*ptid]);
+ if (!tid_tx || test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ timeout = tid_tx->last_tx + TU_TO_JIFFIES(tid_tx->timeout);
+ if (time_is_after_jiffies(timeout)) {
+ mod_timer(&tid_tx->session_timer, timeout);
+ rcu_read_unlock();
+ return;
+ }
+
+ rcu_read_unlock();
+
+ ht_dbg(sta->sdata, "tx session timer expired on %pM tid %d\n",
+ sta->sta.addr, (u16)*ptid);
+
+ ieee80211_stop_tx_ba_session(&sta->sta, *ptid);
+}
+
+int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
+ u16 timeout)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct tid_ampdu_tx *tid_tx;
+ int ret = 0;
+
+ trace_api_start_tx_ba_session(pubsta, tid);
+
+ if (WARN(sta->reserved_tid == tid,
+ "Requested to start BA session on reserved tid=%d", tid))
+ return -EINVAL;
+
+ if (!pubsta->ht_cap.ht_supported)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!local->ops->ampdu_action))
+ return -EINVAL;
+
+ if ((tid >= IEEE80211_NUM_TIDS) ||
+ !(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) ||
+ (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW))
+ return -EINVAL;
+
+ ht_dbg(sdata, "Open BA session requested for %pM tid %u\n",
+ pubsta->addr, tid);
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ return -EINVAL;
+
+ if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) {
+ ht_dbg(sdata,
+ "BA sessions blocked - Denying BA session request %pM tid %d\n",
+ sta->sta.addr, tid);
+ return -EINVAL;
+ }
+
+ /*
+ * 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a
+ * member of an IBSS, and has no other existing Block Ack agreement
+ * with the recipient STA, then the initiating STA shall transmit a
+ * Probe Request frame to the recipient STA and shall not transmit an
+ * ADDBA Request frame unless it receives a Probe Response frame
+ * from the recipient within dot11ADDBAFailureTimeout.
+ *
+ * The probe request mechanism for ADDBA is currently not implemented,
+ * but we only build up Block Ack session with HT STAs. This information
+ * is set when we receive a bss info from a probe response or a beacon.
+ */
+ if (sta->sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+ !sta->sta.ht_cap.ht_supported) {
+ ht_dbg(sdata,
+ "BA request denied - IBSS STA %pM does not advertise HT support\n",
+ pubsta->addr);
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&sta->lock);
+
+ /* we have tried too many times, receiver does not want A-MPDU */
+ if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
+ ret = -EBUSY;
+ goto err_unlock_sta;
+ }
+
+ /*
+ * if we have tried more than HT_AGG_BURST_RETRIES times we
+ * will spread our requests in time to avoid stalling connection
+ * for too long
+ */
+ if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_BURST_RETRIES &&
+ time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] +
+ HT_AGG_RETRIES_PERIOD)) {
+ ht_dbg(sdata,
+ "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n",
+ sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid);
+ ret = -EBUSY;
+ goto err_unlock_sta;
+ }
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ /* check if the TID is not in aggregation flow already */
+ if (tid_tx || sta->ampdu_mlme.tid_start_tx[tid]) {
+ ht_dbg(sdata,
+ "BA request denied - session is not idle on %pM tid %u\n",
+ sta->sta.addr, tid);
+ ret = -EAGAIN;
+ goto err_unlock_sta;
+ }
+
+ /* prepare A-MPDU MLME for Tx aggregation */
+ tid_tx = kzalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
+ if (!tid_tx) {
+ ret = -ENOMEM;
+ goto err_unlock_sta;
+ }
+
+ skb_queue_head_init(&tid_tx->pending);
+ __set_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
+
+ tid_tx->timeout = timeout;
+
+ /* response timer */
+ tid_tx->addba_resp_timer.function = sta_addba_resp_timer_expired;
+ tid_tx->addba_resp_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+ init_timer(&tid_tx->addba_resp_timer);
+
+ /* tx timer */
+ tid_tx->session_timer.function = sta_tx_agg_session_timer_expired;
+ tid_tx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+ init_timer_deferrable(&tid_tx->session_timer);
+
+ /* assign a dialog token */
+ sta->ampdu_mlme.dialog_token_allocator++;
+ tid_tx->dialog_token = sta->ampdu_mlme.dialog_token_allocator;
+
+ /*
+ * Finally, assign it to the start array; the work item will
+ * collect it and move it to the normal array.
+ */
+ sta->ampdu_mlme.tid_start_tx[tid] = tid_tx;
+
+ ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+
+ /* this flow continues off the work */
+ err_unlock_sta:
+ spin_unlock_bh(&sta->lock);
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
+
+static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
+ struct sta_info *sta, u16 tid)
+{
+ struct tid_ampdu_tx *tid_tx;
+
+ lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ drv_ampdu_action(local, sta->sdata,
+ IEEE80211_AMPDU_TX_OPERATIONAL,
+ &sta->sta, tid, NULL, tid_tx->buf_size);
+
+ /*
+ * synchronize with TX path, while splicing the TX path
+ * should block so it won't put more packets onto pending.
+ */
+ spin_lock_bh(&sta->lock);
+
+ ieee80211_agg_splice_packets(sta->sdata, tid_tx, tid);
+ /*
+ * Now mark as operational. This will be visible
+ * in the TX path, and lets it go lock-free in
+ * the common case.
+ */
+ set_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
+ ieee80211_agg_splice_finish(sta->sdata, tid);
+
+ spin_unlock_bh(&sta->lock);
+
+ ieee80211_agg_start_txq(sta, tid, true);
+}
+
+void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct tid_ampdu_tx *tid_tx;
+
+ trace_api_start_tx_ba_cb(sdata, ra, tid);
+
+ if (tid >= IEEE80211_NUM_TIDS) {
+ ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n",
+ tid, IEEE80211_NUM_TIDS);
+ return;
+ }
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, ra);
+ if (!sta) {
+ mutex_unlock(&local->sta_mtx);
+ ht_dbg(sdata, "Could not find station: %pM\n", ra);
+ return;
+ }
+
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ if (WARN_ON(!tid_tx)) {
+ ht_dbg(sdata, "addBA was not requested!\n");
+ goto unlock;
+ }
+
+ if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)))
+ goto unlock;
+
+ if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state))
+ ieee80211_agg_tx_operational(local, sta, tid);
+
+ unlock:
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+ mutex_unlock(&local->sta_mtx);
+}
+
+void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
+ const u8 *ra, u16 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_ra_tid *ra_tid;
+ struct sk_buff *skb = dev_alloc_skb(0);
+
+ if (unlikely(!skb))
+ return;
+
+ ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+ memcpy(&ra_tid->ra, ra, ETH_ALEN);
+ ra_tid->tid = tid;
+
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START;
+ skb_queue_tail(&sdata->skb_queue, skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
+
+int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+ enum ieee80211_agg_stop_reason reason)
+{
+ int ret;
+
+ mutex_lock(&sta->ampdu_mlme.mtx);
+
+ ret = ___ieee80211_stop_tx_ba_session(sta, tid, reason);
+
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+
+ return ret;
+}
+
+int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct tid_ampdu_tx *tid_tx;
+ int ret = 0;
+
+ trace_api_stop_tx_ba_session(pubsta, tid);
+
+ if (!local->ops->ampdu_action)
+ return -EINVAL;
+
+ if (tid >= IEEE80211_NUM_TIDS)
+ return -EINVAL;
+
+ spin_lock_bh(&sta->lock);
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ if (!tid_tx) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ WARN(sta->reserved_tid == tid,
+ "Requested to stop BA session on reserved tid=%d", tid);
+
+ if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ /* already in progress stopping it */
+ ret = 0;
+ goto unlock;
+ }
+
+ set_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state);
+ ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+
+ unlock:
+ spin_unlock_bh(&sta->lock);
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
+
+void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct tid_ampdu_tx *tid_tx;
+ bool send_delba = false;
+
+ trace_api_stop_tx_ba_cb(sdata, ra, tid);
+
+ if (tid >= IEEE80211_NUM_TIDS) {
+ ht_dbg(sdata, "Bad TID value: tid = %d (>= %d)\n",
+ tid, IEEE80211_NUM_TIDS);
+ return;
+ }
+
+ ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n", ra, tid);
+
+ mutex_lock(&local->sta_mtx);
+
+ sta = sta_info_get_bss(sdata, ra);
+ if (!sta) {
+ ht_dbg(sdata, "Could not find station: %pM\n", ra);
+ goto unlock;
+ }
+
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ spin_lock_bh(&sta->lock);
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+ if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ ht_dbg(sdata,
+ "unexpected callback to A-MPDU stop for %pM tid %d\n",
+ sta->sta.addr, tid);
+ goto unlock_sta;
+ }
+
+ if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop)
+ send_delba = true;
+
+ ieee80211_remove_tid_tx(sta, tid);
+
+ unlock_sta:
+ spin_unlock_bh(&sta->lock);
+
+ if (send_delba)
+ ieee80211_send_delba(sdata, ra, tid,
+ WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
+
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+ unlock:
+ mutex_unlock(&local->sta_mtx);
+}
+
+void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
+ const u8 *ra, u16 tid)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_ra_tid *ra_tid;
+ struct sk_buff *skb = dev_alloc_skb(0);
+
+ if (unlikely(!skb))
+ return;
+
+ ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+ memcpy(&ra_tid->ra, ra, ETH_ALEN);
+ ra_tid->tid = tid;
+
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP;
+ skb_queue_tail(&sdata->skb_queue, skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
+
+
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ struct tid_ampdu_tx *tid_tx;
+ u16 capab, tid;
+ u8 buf_size;
+
+ capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
+ tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+ buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+
+ mutex_lock(&sta->ampdu_mlme.mtx);
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ if (!tid_tx)
+ goto out;
+
+ if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) {
+ ht_dbg(sta->sdata, "wrong addBA response token, %pM tid %d\n",
+ sta->sta.addr, tid);
+ goto out;
+ }
+
+ del_timer_sync(&tid_tx->addba_resp_timer);
+
+ ht_dbg(sta->sdata, "switched off addBA timer for %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ /*
+ * addba_resp_timer may have fired before we got here, and
+ * caused WANT_STOP to be set. If the stop then was already
+ * processed further, STOPPING might be set.
+ */
+ if (test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state) ||
+ test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+ ht_dbg(sta->sdata,
+ "got addBA resp for %pM tid %d but we already gave up\n",
+ sta->sta.addr, tid);
+ goto out;
+ }
+
+ /*
+ * IEEE 802.11-2007 7.3.1.14:
+ * In an ADDBA Response frame, when the Status Code field
+ * is set to 0, the Buffer Size subfield is set to a value
+ * of at least 1.
+ */
+ if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
+ == WLAN_STATUS_SUCCESS && buf_size) {
+ if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED,
+ &tid_tx->state)) {
+ /* ignore duplicate response */
+ goto out;
+ }
+
+ tid_tx->buf_size = buf_size;
+
+ if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))
+ ieee80211_agg_tx_operational(local, sta, tid);
+
+ sta->ampdu_mlme.addba_req_num[tid] = 0;
+
+ if (tid_tx->timeout) {
+ mod_timer(&tid_tx->session_timer,
+ TU_TO_EXP_TIME(tid_tx->timeout));
+ tid_tx->last_tx = jiffies;
+ }
+
+ } else {
+ ___ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_DECLINED);
+ }
+
+ out:
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+}
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
new file mode 100644
index 000000000..f06d42267
--- /dev/null
+++ b/net/mac80211/cfg.c
@@ -0,0 +1,3782 @@
+/*
+ * mac80211 configuration hooks for cfg80211
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "cfg.h"
+#include "rate.h"
+#include "mesh.h"
+#include "wme.h"
+
+static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
+ const char *name,
+ unsigned char name_assign_type,
+ enum nl80211_iftype type,
+ u32 *flags,
+ struct vif_params *params)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct wireless_dev *wdev;
+ struct ieee80211_sub_if_data *sdata;
+ int err;
+
+ err = ieee80211_if_add(local, name, name_assign_type, &wdev, type, params);
+ if (err)
+ return ERR_PTR(err);
+
+ if (type == NL80211_IFTYPE_MONITOR && flags) {
+ sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ sdata->u.mntr_flags = *flags;
+ }
+
+ return wdev;
+}
+
+static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev)
+{
+ ieee80211_if_remove(IEEE80211_WDEV_TO_SUB_IF(wdev));
+
+ return 0;
+}
+
+static int ieee80211_change_iface(struct wiphy *wiphy,
+ struct net_device *dev,
+ enum nl80211_iftype type, u32 *flags,
+ struct vif_params *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int ret;
+
+ ret = ieee80211_if_change_type(sdata, type);
+ if (ret)
+ return ret;
+
+ if (type == NL80211_IFTYPE_AP_VLAN &&
+ params && params->use_4addr == 0)
+ RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
+ else if (type == NL80211_IFTYPE_STATION &&
+ params && params->use_4addr >= 0)
+ sdata->u.mgd.use_4addr = params->use_4addr;
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
+ struct ieee80211_local *local = sdata->local;
+
+ if (ieee80211_sdata_running(sdata)) {
+ u32 mask = MONITOR_FLAG_COOK_FRAMES |
+ MONITOR_FLAG_ACTIVE;
+
+ /*
+ * Prohibit MONITOR_FLAG_COOK_FRAMES and
+ * MONITOR_FLAG_ACTIVE to be changed while the
+ * interface is up.
+ * Else we would need to add a lot of cruft
+ * to update everything:
+ * cooked_mntrs, monitor and all fif_* counters
+ * reconfigure hardware
+ */
+ if ((*flags & mask) != (sdata->u.mntr_flags & mask))
+ return -EBUSY;
+
+ ieee80211_adjust_monitor_flags(sdata, -1);
+ sdata->u.mntr_flags = *flags;
+ ieee80211_adjust_monitor_flags(sdata, 1);
+
+ ieee80211_configure_filter(local);
+ } else {
+ /*
+ * Because the interface is down, ieee80211_do_stop
+ * and ieee80211_do_open take care of "everything"
+ * mentioned in the comment above.
+ */
+ sdata->u.mntr_flags = *flags;
+ }
+ }
+
+ return 0;
+}
+
+static int ieee80211_start_p2p_device(struct wiphy *wiphy,
+ struct wireless_dev *wdev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ int ret;
+
+ mutex_lock(&sdata->local->chanctx_mtx);
+ ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
+ mutex_unlock(&sdata->local->chanctx_mtx);
+ if (ret < 0)
+ return ret;
+
+ return ieee80211_do_open(wdev, true);
+}
+
+static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
+ struct wireless_dev *wdev)
+{
+ ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
+}
+
+static int ieee80211_set_noack_map(struct wiphy *wiphy,
+ struct net_device *dev,
+ u16 noack_map)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ sdata->noack_map = noack_map;
+ return 0;
+}
+
+static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
+ u8 key_idx, bool pairwise, const u8 *mac_addr,
+ struct key_params *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta = NULL;
+ const struct ieee80211_cipher_scheme *cs = NULL;
+ struct ieee80211_key *key;
+ int err;
+
+ if (!ieee80211_sdata_running(sdata))
+ return -ENETDOWN;
+
+ /* reject WEP and TKIP keys if WEP failed to initialize */
+ switch (params->cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_TKIP:
+ case WLAN_CIPHER_SUITE_WEP104:
+ if (IS_ERR(local->wep_tx_tfm))
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ break;
+ default:
+ cs = ieee80211_cs_get(local, params->cipher, sdata->vif.type);
+ break;
+ }
+
+ key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
+ params->key, params->seq_len, params->seq,
+ cs);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ if (pairwise)
+ key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
+
+ mutex_lock(&local->sta_mtx);
+
+ if (mac_addr) {
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ sta = sta_info_get(sdata, mac_addr);
+ else
+ sta = sta_info_get_bss(sdata, mac_addr);
+ /*
+ * The ASSOC test makes sure the driver is ready to
+ * receive the key. When wpa_supplicant has roamed
+ * using FT, it attempts to set the key before
+ * association has completed, this rejects that attempt
+ * so it will set the key again after association.
+ *
+ * TODO: accept the key if we have a station entry and
+ * add it to the device after the station.
+ */
+ if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) {
+ ieee80211_key_free_unused(key);
+ err = -ENOENT;
+ goto out_unlock;
+ }
+ }
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
+ key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ /* Keys without a station are used for TX only */
+ if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP))
+ key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ /* no MFP (yet) */
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+#ifdef CONFIG_MAC80211_MESH
+ if (sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)
+ key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
+ break;
+#endif
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_OCB:
+ /* shouldn't happen */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (sta)
+ sta->cipher_scheme = cs;
+
+ err = ieee80211_key_link(key, sdata, sta);
+
+ out_unlock:
+ mutex_unlock(&local->sta_mtx);
+
+ return err;
+}
+
+static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
+ u8 key_idx, bool pairwise, const u8 *mac_addr)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct ieee80211_key *key = NULL;
+ int ret;
+
+ mutex_lock(&local->sta_mtx);
+ mutex_lock(&local->key_mtx);
+
+ if (mac_addr) {
+ ret = -ENOENT;
+
+ sta = sta_info_get_bss(sdata, mac_addr);
+ if (!sta)
+ goto out_unlock;
+
+ if (pairwise)
+ key = key_mtx_dereference(local, sta->ptk[key_idx]);
+ else
+ key = key_mtx_dereference(local, sta->gtk[key_idx]);
+ } else
+ key = key_mtx_dereference(local, sdata->keys[key_idx]);
+
+ if (!key) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ ieee80211_key_free(key, true);
+
+ ret = 0;
+ out_unlock:
+ mutex_unlock(&local->key_mtx);
+ mutex_unlock(&local->sta_mtx);
+
+ return ret;
+}
+
+static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
+ u8 key_idx, bool pairwise, const u8 *mac_addr,
+ void *cookie,
+ void (*callback)(void *cookie,
+ struct key_params *params))
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta = NULL;
+ u8 seq[6] = {0};
+ struct key_params params;
+ struct ieee80211_key *key = NULL;
+ u64 pn64;
+ u32 iv32;
+ u16 iv16;
+ int err = -ENOENT;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+
+ if (mac_addr) {
+ sta = sta_info_get_bss(sdata, mac_addr);
+ if (!sta)
+ goto out;
+
+ if (pairwise && key_idx < NUM_DEFAULT_KEYS)
+ key = rcu_dereference(sta->ptk[key_idx]);
+ else if (!pairwise &&
+ key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+ key = rcu_dereference(sta->gtk[key_idx]);
+ } else
+ key = rcu_dereference(sdata->keys[key_idx]);
+
+ if (!key)
+ goto out;
+
+ memset(&params, 0, sizeof(params));
+
+ params.cipher = key->conf.cipher;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ iv32 = key->u.tkip.tx.iv32;
+ iv16 = key->u.tkip.tx.iv16;
+
+ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
+ drv_get_tkip_seq(sdata->local,
+ key->conf.hw_key_idx,
+ &iv32, &iv16);
+
+ seq[0] = iv16 & 0xff;
+ seq[1] = (iv16 >> 8) & 0xff;
+ seq[2] = iv32 & 0xff;
+ seq[3] = (iv32 >> 8) & 0xff;
+ seq[4] = (iv32 >> 16) & 0xff;
+ seq[5] = (iv32 >> 24) & 0xff;
+ params.seq = seq;
+ params.seq_len = 6;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ pn64 = atomic64_read(&key->u.ccmp.tx_pn);
+ seq[0] = pn64;
+ seq[1] = pn64 >> 8;
+ seq[2] = pn64 >> 16;
+ seq[3] = pn64 >> 24;
+ seq[4] = pn64 >> 32;
+ seq[5] = pn64 >> 40;
+ params.seq = seq;
+ params.seq_len = 6;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ pn64 = atomic64_read(&key->u.aes_cmac.tx_pn);
+ seq[0] = pn64;
+ seq[1] = pn64 >> 8;
+ seq[2] = pn64 >> 16;
+ seq[3] = pn64 >> 24;
+ seq[4] = pn64 >> 32;
+ seq[5] = pn64 >> 40;
+ params.seq = seq;
+ params.seq_len = 6;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ pn64 = atomic64_read(&key->u.aes_gmac.tx_pn);
+ seq[0] = pn64;
+ seq[1] = pn64 >> 8;
+ seq[2] = pn64 >> 16;
+ seq[3] = pn64 >> 24;
+ seq[4] = pn64 >> 32;
+ seq[5] = pn64 >> 40;
+ params.seq = seq;
+ params.seq_len = 6;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn64 = atomic64_read(&key->u.gcmp.tx_pn);
+ seq[0] = pn64;
+ seq[1] = pn64 >> 8;
+ seq[2] = pn64 >> 16;
+ seq[3] = pn64 >> 24;
+ seq[4] = pn64 >> 32;
+ seq[5] = pn64 >> 40;
+ params.seq = seq;
+ params.seq_len = 6;
+ break;
+ }
+
+ params.key = key->conf.key;
+ params.key_len = key->conf.keylen;
+
+ callback(cookie, &params);
+ err = 0;
+
+ out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int ieee80211_config_default_key(struct wiphy *wiphy,
+ struct net_device *dev,
+ u8 key_idx, bool uni,
+ bool multi)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ ieee80211_set_default_key(sdata, key_idx, uni, multi);
+
+ return 0;
+}
+
+static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
+ struct net_device *dev,
+ u8 key_idx)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ ieee80211_set_default_mgmt_key(sdata, key_idx);
+
+ return 0;
+}
+
+void sta_set_rate_info_tx(struct sta_info *sta,
+ const struct ieee80211_tx_rate *rate,
+ struct rate_info *rinfo)
+{
+ rinfo->flags = 0;
+ if (rate->flags & IEEE80211_TX_RC_MCS) {
+ rinfo->flags |= RATE_INFO_FLAGS_MCS;
+ rinfo->mcs = rate->idx;
+ } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+ rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
+ rinfo->mcs = ieee80211_rate_get_vht_mcs(rate);
+ rinfo->nss = ieee80211_rate_get_vht_nss(rate);
+ } else {
+ struct ieee80211_supported_band *sband;
+ int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+ u16 brate;
+
+ sband = sta->local->hw.wiphy->bands[
+ ieee80211_get_sdata_band(sta->sdata)];
+ brate = sband->bitrates[rate->idx].bitrate;
+ rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
+ }
+ if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ rinfo->bw = RATE_INFO_BW_40;
+ else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ rinfo->bw = RATE_INFO_BW_80;
+ else if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
+ rinfo->bw = RATE_INFO_BW_160;
+ else
+ rinfo->bw = RATE_INFO_BW_20;
+ if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
+ rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
+}
+
+void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+{
+ rinfo->flags = 0;
+
+ if (sta->last_rx_rate_flag & RX_FLAG_HT) {
+ rinfo->flags |= RATE_INFO_FLAGS_MCS;
+ rinfo->mcs = sta->last_rx_rate_idx;
+ } else if (sta->last_rx_rate_flag & RX_FLAG_VHT) {
+ rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS;
+ rinfo->nss = sta->last_rx_rate_vht_nss;
+ rinfo->mcs = sta->last_rx_rate_idx;
+ } else {
+ struct ieee80211_supported_band *sband;
+ int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+ u16 brate;
+
+ sband = sta->local->hw.wiphy->bands[
+ ieee80211_get_sdata_band(sta->sdata)];
+ brate = sband->bitrates[sta->last_rx_rate_idx].bitrate;
+ rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
+ }
+
+ if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI)
+ rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+ if (sta->last_rx_rate_flag & RX_FLAG_5MHZ)
+ rinfo->bw = RATE_INFO_BW_5;
+ else if (sta->last_rx_rate_flag & RX_FLAG_10MHZ)
+ rinfo->bw = RATE_INFO_BW_10;
+ else if (sta->last_rx_rate_flag & RX_FLAG_40MHZ)
+ rinfo->bw = RATE_INFO_BW_40;
+ else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_80MHZ)
+ rinfo->bw = RATE_INFO_BW_80;
+ else if (sta->last_rx_rate_vht_flag & RX_VHT_FLAG_160MHZ)
+ rinfo->bw = RATE_INFO_BW_160;
+ else
+ rinfo->bw = RATE_INFO_BW_20;
+}
+
+static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
+ int idx, u8 *mac, struct station_info *sinfo)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int ret = -ENOENT;
+
+ mutex_lock(&local->sta_mtx);
+
+ sta = sta_info_get_by_idx(sdata, idx);
+ if (sta) {
+ ret = 0;
+ memcpy(mac, sta->sta.addr, ETH_ALEN);
+ sta_set_sinfo(sta, sinfo);
+ }
+
+ mutex_unlock(&local->sta_mtx);
+
+ return ret;
+}
+
+static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev,
+ int idx, struct survey_info *survey)
+{
+ struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+
+ return drv_get_survey(local, idx, survey);
+}
+
+static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *mac, struct station_info *sinfo)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int ret = -ENOENT;
+
+ mutex_lock(&local->sta_mtx);
+
+ sta = sta_info_get_bss(sdata, mac);
+ if (sta) {
+ ret = 0;
+ sta_set_sinfo(sta, sinfo);
+ }
+
+ mutex_unlock(&local->sta_mtx);
+
+ return ret;
+}
+
+static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata;
+ int ret = 0;
+
+ if (cfg80211_chandef_identical(&local->monitor_chandef, chandef))
+ return 0;
+
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->iflist_mtx);
+ if (local->use_chanctx) {
+ sdata = rcu_dereference_protected(
+ local->monitor_sdata,
+ lockdep_is_held(&local->iflist_mtx));
+ if (sdata) {
+ ieee80211_vif_release_channel(sdata);
+ ret = ieee80211_vif_use_channel(sdata, chandef,
+ IEEE80211_CHANCTX_EXCLUSIVE);
+ }
+ } else if (local->open_count == local->monitors) {
+ local->_oper_chandef = *chandef;
+ ieee80211_hw_config(local, 0);
+ }
+
+ if (ret == 0)
+ local->monitor_chandef = *chandef;
+ mutex_unlock(&local->iflist_mtx);
+ mutex_unlock(&local->mtx);
+
+ return ret;
+}
+
+static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
+ const u8 *resp, size_t resp_len,
+ const struct ieee80211_csa_settings *csa)
+{
+ struct probe_resp *new, *old;
+
+ if (!resp || !resp_len)
+ return 1;
+
+ old = sdata_dereference(sdata->u.ap.probe_resp, sdata);
+
+ new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->len = resp_len;
+ memcpy(new->data, resp, resp_len);
+
+ if (csa)
+ memcpy(new->csa_counter_offsets, csa->counter_offsets_presp,
+ csa->n_counter_offsets_presp *
+ sizeof(new->csa_counter_offsets[0]));
+
+ rcu_assign_pointer(sdata->u.ap.probe_resp, new);
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return 0;
+}
+
+static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_beacon_data *params,
+ const struct ieee80211_csa_settings *csa)
+{
+ struct beacon_data *new, *old;
+ int new_head_len, new_tail_len;
+ int size, err;
+ u32 changed = BSS_CHANGED_BEACON;
+
+ old = sdata_dereference(sdata->u.ap.beacon, sdata);
+
+
+ /* Need to have a beacon head if we don't have one yet */
+ if (!params->head && !old)
+ return -EINVAL;
+
+ /* new or old head? */
+ if (params->head)
+ new_head_len = params->head_len;
+ else
+ new_head_len = old->head_len;
+
+ /* new or old tail? */
+ if (params->tail || !old)
+ /* params->tail_len will be zero for !params->tail */
+ new_tail_len = params->tail_len;
+ else
+ new_tail_len = old->tail_len;
+
+ size = sizeof(*new) + new_head_len + new_tail_len;
+
+ new = kzalloc(size, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* start filling the new info now */
+
+ /*
+ * pointers go into the block we allocated,
+ * memory is | beacon_data | head | tail |
+ */
+ new->head = ((u8 *) new) + sizeof(*new);
+ new->tail = new->head + new_head_len;
+ new->head_len = new_head_len;
+ new->tail_len = new_tail_len;
+
+ if (csa) {
+ new->csa_current_counter = csa->count;
+ memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon,
+ csa->n_counter_offsets_beacon *
+ sizeof(new->csa_counter_offsets[0]));
+ }
+
+ /* copy in head */
+ if (params->head)
+ memcpy(new->head, params->head, new_head_len);
+ else
+ memcpy(new->head, old->head, new_head_len);
+
+ /* copy in optional tail */
+ if (params->tail)
+ memcpy(new->tail, params->tail, new_tail_len);
+ else
+ if (old)
+ memcpy(new->tail, old->tail, new_tail_len);
+
+ err = ieee80211_set_probe_resp(sdata, params->probe_resp,
+ params->probe_resp_len, csa);
+ if (err < 0)
+ return err;
+ if (err == 0)
+ changed |= BSS_CHANGED_AP_PROBE_RESP;
+
+ rcu_assign_pointer(sdata->u.ap.beacon, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return changed;
+}
+
+static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_ap_settings *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct beacon_data *old;
+ struct ieee80211_sub_if_data *vlan;
+ u32 changed = BSS_CHANGED_BEACON_INT |
+ BSS_CHANGED_BEACON_ENABLED |
+ BSS_CHANGED_BEACON |
+ BSS_CHANGED_SSID |
+ BSS_CHANGED_P2P_PS |
+ BSS_CHANGED_TXPOWER;
+ int err;
+
+ old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ if (old)
+ return -EALREADY;
+
+ switch (params->smps_mode) {
+ case NL80211_SMPS_OFF:
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ case NL80211_SMPS_STATIC:
+ sdata->smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case NL80211_SMPS_DYNAMIC:
+ sdata->smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ default:
+ return -EINVAL;
+ }
+ sdata->needed_rx_chains = sdata->local->rx_chains;
+
+ mutex_lock(&local->mtx);
+ err = ieee80211_vif_use_channel(sdata, &params->chandef,
+ IEEE80211_CHANCTX_SHARED);
+ if (!err)
+ ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+ mutex_unlock(&local->mtx);
+ if (err)
+ return err;
+
+ /*
+ * Apply control port protocol, this allows us to
+ * not encrypt dynamic WEP control frames.
+ */
+ sdata->control_port_protocol = params->crypto.control_port_ethertype;
+ sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt;
+ sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local,
+ &params->crypto,
+ sdata->vif.type);
+
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+ vlan->control_port_protocol =
+ params->crypto.control_port_ethertype;
+ vlan->control_port_no_encrypt =
+ params->crypto.control_port_no_encrypt;
+ vlan->encrypt_headroom =
+ ieee80211_cs_headroom(sdata->local,
+ &params->crypto,
+ vlan->vif.type);
+ }
+
+ sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+ sdata->vif.bss_conf.dtim_period = params->dtim_period;
+ sdata->vif.bss_conf.enable_beacon = true;
+
+ sdata->vif.bss_conf.ssid_len = params->ssid_len;
+ if (params->ssid_len)
+ memcpy(sdata->vif.bss_conf.ssid, params->ssid,
+ params->ssid_len);
+ sdata->vif.bss_conf.hidden_ssid =
+ (params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE);
+
+ memset(&sdata->vif.bss_conf.p2p_noa_attr, 0,
+ sizeof(sdata->vif.bss_conf.p2p_noa_attr));
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow =
+ params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
+ if (params->p2p_opp_ps)
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ IEEE80211_P2P_OPPPS_ENABLE_BIT;
+
+ err = ieee80211_assign_beacon(sdata, &params->beacon, NULL);
+ if (err < 0) {
+ ieee80211_vif_release_channel(sdata);
+ return err;
+ }
+ changed |= err;
+
+ err = drv_start_ap(sdata->local, sdata);
+ if (err) {
+ old = sdata_dereference(sdata->u.ap.beacon, sdata);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+ RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
+ ieee80211_vif_release_channel(sdata);
+ return err;
+ }
+
+ ieee80211_recalc_dtim(local, sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ netif_carrier_on(dev);
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ netif_carrier_on(vlan->dev);
+
+ return 0;
+}
+
+static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_beacon_data *params)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct beacon_data *old;
+ int err;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ sdata_assert_lock(sdata);
+
+ /* don't allow changing the beacon while CSA is in place - offset
+ * of channel switch counter may change
+ */
+ if (sdata->vif.csa_active)
+ return -EBUSY;
+
+ old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ if (!old)
+ return -ENOENT;
+
+ err = ieee80211_assign_beacon(sdata, params, NULL);
+ if (err < 0)
+ return err;
+ ieee80211_bss_info_change_notify(sdata, err);
+ return 0;
+}
+
+static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_sub_if_data *vlan;
+ struct ieee80211_local *local = sdata->local;
+ struct beacon_data *old_beacon;
+ struct probe_resp *old_probe_resp;
+ struct cfg80211_chan_def chandef;
+
+ sdata_assert_lock(sdata);
+
+ old_beacon = sdata_dereference(sdata->u.ap.beacon, sdata);
+ if (!old_beacon)
+ return -ENOENT;
+ old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata);
+
+ /* abort any running channel switch */
+ mutex_lock(&local->mtx);
+ sdata->vif.csa_active = false;
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+
+ mutex_unlock(&local->mtx);
+
+ kfree(sdata->u.ap.next_beacon);
+ sdata->u.ap.next_beacon = NULL;
+
+ /* turn off carrier for this interface and dependent VLANs */
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ netif_carrier_off(vlan->dev);
+ netif_carrier_off(dev);
+
+ /* remove beacon and probe response */
+ RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
+ RCU_INIT_POINTER(sdata->u.ap.probe_resp, NULL);
+ kfree_rcu(old_beacon, rcu_head);
+ if (old_probe_resp)
+ kfree_rcu(old_probe_resp, rcu_head);
+ sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
+
+ __sta_info_flush(sdata, true);
+ ieee80211_free_keys(sdata, true);
+
+ sdata->vif.bss_conf.enable_beacon = false;
+ sdata->vif.bss_conf.ssid_len = 0;
+ clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+
+ if (sdata->wdev.cac_started) {
+ chandef = sdata->vif.bss_conf.chandef;
+ cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
+ cfg80211_cac_event(sdata->dev, &chandef,
+ NL80211_RADAR_CAC_ABORTED,
+ GFP_KERNEL);
+ }
+
+ drv_stop_ap(sdata->local, sdata);
+
+ /* free all potentially still buffered bcast frames */
+ local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf);
+ skb_queue_purge(&sdata->u.ap.ps.bc_buf);
+
+ mutex_lock(&local->mtx);
+ ieee80211_vif_copy_chanctx_to_vlans(sdata, true);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+
+ return 0;
+}
+
+/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
+struct iapp_layer2_update {
+ u8 da[ETH_ALEN]; /* broadcast */
+ u8 sa[ETH_ALEN]; /* STA addr */
+ __be16 len; /* 6 */
+ u8 dsap; /* 0 */
+ u8 ssap; /* 0 */
+ u8 control;
+ u8 xid_info[3];
+} __packed;
+
+static void ieee80211_send_layer2_update(struct sta_info *sta)
+{
+ struct iapp_layer2_update *msg;
+ struct sk_buff *skb;
+
+ /* Send Level 2 Update Frame to update forwarding tables in layer 2
+ * bridge devices */
+
+ skb = dev_alloc_skb(sizeof(*msg));
+ if (!skb)
+ return;
+ msg = (struct iapp_layer2_update *)skb_put(skb, sizeof(*msg));
+
+ /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
+ * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
+
+ eth_broadcast_addr(msg->da);
+ memcpy(msg->sa, sta->sta.addr, ETH_ALEN);
+ msg->len = htons(6);
+ msg->dsap = 0;
+ msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
+ msg->control = 0xaf; /* XID response lsb.1111F101.
+ * F=0 (no poll command; unsolicited frame) */
+ msg->xid_info[0] = 0x81; /* XID format identifier */
+ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
+ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
+
+ skb->dev = sta->sdata->dev;
+ skb->protocol = eth_type_trans(skb, sta->sdata->dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ netif_rx_ni(skb);
+}
+
+static int sta_apply_auth_flags(struct ieee80211_local *local,
+ struct sta_info *sta,
+ u32 mask, u32 set)
+{
+ int ret;
+
+ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
+ set & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
+ !test_sta_flag(sta, WLAN_STA_AUTH)) {
+ ret = sta_info_move_state(sta, IEEE80211_STA_AUTH);
+ if (ret)
+ return ret;
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
+ set & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
+ !test_sta_flag(sta, WLAN_STA_ASSOC)) {
+ /*
+ * When peer becomes associated, init rate control as
+ * well. Some drivers require rate control initialized
+ * before drv_sta_state() is called.
+ */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ rate_control_rate_init(sta);
+
+ ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ if (ret)
+ return ret;
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
+ if (set & BIT(NL80211_STA_FLAG_AUTHORIZED))
+ ret = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED);
+ else if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ else
+ ret = 0;
+ if (ret)
+ return ret;
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_ASSOCIATED) &&
+ !(set & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
+ test_sta_flag(sta, WLAN_STA_ASSOC)) {
+ ret = sta_info_move_state(sta, IEEE80211_STA_AUTH);
+ if (ret)
+ return ret;
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED) &&
+ !(set & BIT(NL80211_STA_FLAG_AUTHENTICATED)) &&
+ test_sta_flag(sta, WLAN_STA_AUTH)) {
+ ret = sta_info_move_state(sta, IEEE80211_STA_NONE);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sta_apply_parameters(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct station_parameters *params)
+{
+ int ret = 0;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ u32 mask, set;
+
+ sband = local->hw.wiphy->bands[band];
+
+ mask = params->sta_flags_mask;
+ set = params->sta_flags_set;
+
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ /*
+ * In mesh mode, ASSOCIATED isn't part of the nl80211
+ * API but must follow AUTHENTICATED for driver state.
+ */
+ if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED))
+ mask |= BIT(NL80211_STA_FLAG_ASSOCIATED);
+ if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED))
+ set |= BIT(NL80211_STA_FLAG_ASSOCIATED);
+ } else if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ /*
+ * TDLS -- everything follows authorized, but
+ * only becoming authorized is possible, not
+ * going back
+ */
+ if (set & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
+ set |= BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED);
+ mask |= BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED);
+ }
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_WME) &&
+ local->hw.queues >= IEEE80211_NUM_ACS)
+ sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME);
+
+ /* auth flags will be set later for TDLS stations */
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ ret = sta_apply_auth_flags(local, sta, mask, set);
+ if (ret)
+ return ret;
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) {
+ if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE))
+ set_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE);
+ else
+ clear_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE);
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_MFP)) {
+ sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP));
+ if (set & BIT(NL80211_STA_FLAG_MFP))
+ set_sta_flag(sta, WLAN_STA_MFP);
+ else
+ clear_sta_flag(sta, WLAN_STA_MFP);
+ }
+
+ if (mask & BIT(NL80211_STA_FLAG_TDLS_PEER)) {
+ if (set & BIT(NL80211_STA_FLAG_TDLS_PEER))
+ set_sta_flag(sta, WLAN_STA_TDLS_PEER);
+ else
+ clear_sta_flag(sta, WLAN_STA_TDLS_PEER);
+ }
+
+ /* mark TDLS channel switch support, if the AP allows it */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
+ !sdata->u.mgd.tdls_chan_switch_prohibited &&
+ params->ext_capab_len >= 4 &&
+ params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH)
+ set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH);
+
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) {
+ sta->sta.uapsd_queues = params->uapsd_queues;
+ sta->sta.max_sp = params->max_sp;
+ }
+
+ /*
+ * cfg80211 validates this (1-2007) and allows setting the AID
+ * only when creating a new station entry
+ */
+ if (params->aid)
+ sta->sta.aid = params->aid;
+
+ /*
+ * Some of the following updates would be racy if called on an
+ * existing station, via ieee80211_change_station(). However,
+ * all such changes are rejected by cfg80211 except for updates
+ * changing the supported rates on an existing but not yet used
+ * TDLS peer.
+ */
+
+ if (params->listen_interval >= 0)
+ sta->listen_interval = params->listen_interval;
+
+ if (params->supported_rates) {
+ ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
+ sband, params->supported_rates,
+ params->supported_rates_len,
+ &sta->sta.supp_rates[band]);
+ }
+
+ if (params->ht_capa)
+ ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+ params->ht_capa, sta);
+
+ if (params->vht_capa)
+ ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+ params->vht_capa, sta);
+
+ if (params->opmode_notif_used) {
+ /* returned value is only needed for rc update, but the
+ * rc isn't initialized here yet, so ignore it
+ */
+ __ieee80211_vht_handle_opmode(sdata, sta,
+ params->opmode_notif,
+ band, false);
+ }
+
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+#ifdef CONFIG_MAC80211_MESH
+ u32 changed = 0;
+
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
+ switch (params->plink_state) {
+ case NL80211_PLINK_ESTAB:
+ if (sta->plink_state != NL80211_PLINK_ESTAB)
+ changed = mesh_plink_inc_estab_count(
+ sdata);
+ sta->plink_state = params->plink_state;
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ sdata->u.mesh.mshcfg.power_mode);
+ break;
+ case NL80211_PLINK_LISTEN:
+ case NL80211_PLINK_BLOCKED:
+ case NL80211_PLINK_OPN_SNT:
+ case NL80211_PLINK_OPN_RCVD:
+ case NL80211_PLINK_CNF_RCVD:
+ case NL80211_PLINK_HOLDING:
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ changed = mesh_plink_dec_estab_count(
+ sdata);
+ sta->plink_state = params->plink_state;
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ NL80211_MESH_POWER_UNKNOWN);
+ break;
+ default:
+ /* nothing */
+ break;
+ }
+ }
+
+ switch (params->plink_action) {
+ case NL80211_PLINK_ACTION_NO_ACTION:
+ /* nothing */
+ break;
+ case NL80211_PLINK_ACTION_OPEN:
+ changed |= mesh_plink_open(sta);
+ break;
+ case NL80211_PLINK_ACTION_BLOCK:
+ changed |= mesh_plink_block(sta);
+ break;
+ }
+
+ if (params->local_pm)
+ changed |=
+ ieee80211_mps_set_sta_local_pm(sta,
+ params->local_pm);
+ ieee80211_mbss_info_change_notify(sdata, changed);
+#endif
+ }
+
+ /* set the STA state after all sta info from usermode has been set */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ ret = sta_apply_auth_flags(local, sta, mask, set);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *mac,
+ struct station_parameters *params)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct sta_info *sta;
+ struct ieee80211_sub_if_data *sdata;
+ int err;
+ int layer2_update;
+
+ if (params->vlan) {
+ sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_AP)
+ return -EINVAL;
+ } else
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (ether_addr_equal(mac, sdata->vif.addr))
+ return -EINVAL;
+
+ if (is_multicast_ether_addr(mac))
+ return -EINVAL;
+
+ sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
+ if (!sta)
+ return -ENOMEM;
+
+ /*
+ * defaults -- if userspace wants something else we'll
+ * change it accordingly in sta_apply_parameters()
+ */
+ if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))) {
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
+ } else {
+ sta->sta.tdls = true;
+ }
+
+ err = sta_apply_parameters(local, sta, params);
+ if (err) {
+ sta_info_free(local, sta);
+ return err;
+ }
+
+ /*
+ * for TDLS, rate control should be initialized only when
+ * rates are known and station is marked authorized
+ */
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ rate_control_rate_init(sta);
+
+ layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sdata->vif.type == NL80211_IFTYPE_AP;
+
+ err = sta_info_insert_rcu(sta);
+ if (err) {
+ rcu_read_unlock();
+ return err;
+ }
+
+ if (layer2_update)
+ ieee80211_send_layer2_update(sta);
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
+ struct station_del_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (params->mac)
+ return sta_info_destroy_addr_bss(sdata, params->mac);
+
+ sta_info_flush(sdata);
+ return 0;
+}
+
+static int ieee80211_change_station(struct wiphy *wiphy,
+ struct net_device *dev, const u8 *mac,
+ struct station_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct sta_info *sta;
+ struct ieee80211_sub_if_data *vlansdata;
+ enum cfg80211_station_type statype;
+ int err;
+
+ mutex_lock(&local->sta_mtx);
+
+ sta = sta_info_get_bss(sdata, mac);
+ if (!sta) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_MESH_POINT:
+ if (sdata->u.mesh.user_mpm)
+ statype = CFG80211_STA_MESH_PEER_USER;
+ else
+ statype = CFG80211_STA_MESH_PEER_KERNEL;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ statype = CFG80211_STA_IBSS;
+ break;
+ case NL80211_IFTYPE_STATION:
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
+ statype = CFG80211_STA_AP_STA;
+ break;
+ }
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ statype = CFG80211_STA_TDLS_PEER_ACTIVE;
+ else
+ statype = CFG80211_STA_TDLS_PEER_SETUP;
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ statype = CFG80211_STA_AP_CLIENT;
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ err = cfg80211_check_station_change(wiphy, params, statype);
+ if (err)
+ goto out_err;
+
+ if (params->vlan && params->vlan != sta->sdata->dev) {
+ bool prev_4addr = false;
+ bool new_4addr = false;
+
+ vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
+
+ if (params->vlan->ieee80211_ptr->use_4addr) {
+ if (vlansdata->u.vlan.sta) {
+ err = -EBUSY;
+ goto out_err;
+ }
+
+ rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
+ new_4addr = true;
+ }
+
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ sta->sdata->u.vlan.sta) {
+ RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL);
+ prev_4addr = true;
+ }
+
+ sta->sdata = vlansdata;
+
+ if (sta->sta_state == IEEE80211_STA_AUTHORIZED &&
+ prev_4addr != new_4addr) {
+ if (new_4addr)
+ atomic_dec(&sta->sdata->bss->num_mcast_sta);
+ else
+ atomic_inc(&sta->sdata->bss->num_mcast_sta);
+ }
+
+ ieee80211_send_layer2_update(sta);
+ }
+
+ err = sta_apply_parameters(local, sta, params);
+ if (err)
+ goto out_err;
+
+ mutex_unlock(&local->sta_mtx);
+
+ if ((sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
+ sta->known_smps_mode != sta->sdata->bss->req_smps &&
+ test_sta_flag(sta, WLAN_STA_AUTHORIZED) &&
+ sta_info_tx_streams(sta) != 1) {
+ ht_dbg(sta->sdata,
+ "%pM just authorized and MIMO capable - update SMPS\n",
+ sta->sta.addr);
+ ieee80211_send_smps_action(sta->sdata,
+ sta->sdata->bss->req_smps,
+ sta->sta.addr,
+ sta->sdata->vif.bss_conf.bssid);
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
+ ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps_vif(sdata);
+ }
+
+ return 0;
+out_err:
+ mutex_unlock(&local->sta_mtx);
+ return err;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *dst, const u8 *next_hop)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+ struct sta_info *sta;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, next_hop);
+ if (!sta) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ mpath = mesh_path_add(sdata, dst);
+ if (IS_ERR(mpath)) {
+ rcu_read_unlock();
+ return PTR_ERR(mpath);
+ }
+
+ mesh_path_fix_nexthop(mpath, sta);
+
+ rcu_read_unlock();
+ return 0;
+}
+
+static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *dst)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (dst)
+ return mesh_path_del(sdata, dst);
+
+ mesh_path_flush_by_iface(sdata);
+ return 0;
+}
+
+static int ieee80211_change_mpath(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *dst, const u8 *next_hop)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+ struct sta_info *sta;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+
+ sta = sta_info_get(sdata, next_hop);
+ if (!sta) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ mpath = mesh_path_lookup(sdata, dst);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ mesh_path_fix_nexthop(mpath, sta);
+
+ rcu_read_unlock();
+ return 0;
+}
+
+static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
+ struct mpath_info *pinfo)
+{
+ struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop);
+
+ if (next_hop_sta)
+ memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN);
+ else
+ eth_zero_addr(next_hop);
+
+ memset(pinfo, 0, sizeof(*pinfo));
+
+ pinfo->generation = mesh_paths_generation;
+
+ pinfo->filled = MPATH_INFO_FRAME_QLEN |
+ MPATH_INFO_SN |
+ MPATH_INFO_METRIC |
+ MPATH_INFO_EXPTIME |
+ MPATH_INFO_DISCOVERY_TIMEOUT |
+ MPATH_INFO_DISCOVERY_RETRIES |
+ MPATH_INFO_FLAGS;
+
+ pinfo->frame_qlen = mpath->frame_queue.qlen;
+ pinfo->sn = mpath->sn;
+ pinfo->metric = mpath->metric;
+ if (time_before(jiffies, mpath->exp_time))
+ pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies);
+ pinfo->discovery_timeout =
+ jiffies_to_msecs(mpath->discovery_timeout);
+ pinfo->discovery_retries = mpath->discovery_retries;
+ if (mpath->flags & MESH_PATH_ACTIVE)
+ pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE;
+ if (mpath->flags & MESH_PATH_RESOLVING)
+ pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING;
+ if (mpath->flags & MESH_PATH_SN_VALID)
+ pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID;
+ if (mpath->flags & MESH_PATH_FIXED)
+ pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
+ if (mpath->flags & MESH_PATH_RESOLVED)
+ pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
+}
+
+static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
+ u8 *dst, u8 *next_hop, struct mpath_info *pinfo)
+
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, dst);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ memcpy(dst, mpath->dst, ETH_ALEN);
+ mpath_set_pinfo(mpath, next_hop, pinfo);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
+ int idx, u8 *dst, u8 *next_hop,
+ struct mpath_info *pinfo)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup_by_idx(sdata, idx);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ memcpy(dst, mpath->dst, ETH_ALEN);
+ mpath_set_pinfo(mpath, next_hop, pinfo);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp,
+ struct mpath_info *pinfo)
+{
+ memset(pinfo, 0, sizeof(*pinfo));
+ memcpy(mpp, mpath->mpp, ETH_ALEN);
+
+ pinfo->generation = mpp_paths_generation;
+}
+
+static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev,
+ u8 *dst, u8 *mpp, struct mpath_info *pinfo)
+
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+ mpath = mpp_path_lookup(sdata, dst);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ memcpy(dst, mpath->dst, ETH_ALEN);
+ mpp_set_pinfo(mpath, mpp, pinfo);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int ieee80211_dump_mpp(struct wiphy *wiphy, struct net_device *dev,
+ int idx, u8 *dst, u8 *mpp,
+ struct mpath_info *pinfo)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_path *mpath;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ rcu_read_lock();
+ mpath = mpp_path_lookup_by_idx(sdata, idx);
+ if (!mpath) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ memcpy(dst, mpath->dst, ETH_ALEN);
+ mpp_set_pinfo(mpath, mpp, pinfo);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int ieee80211_get_mesh_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct mesh_config *conf)
+{
+ struct ieee80211_sub_if_data *sdata;
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config));
+ return 0;
+}
+
+static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask)
+{
+ return (mask >> (parm-1)) & 0x1;
+}
+
+static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
+ const struct mesh_setup *setup)
+{
+ u8 *new_ie;
+ const u8 *old_ie;
+ struct ieee80211_sub_if_data *sdata = container_of(ifmsh,
+ struct ieee80211_sub_if_data, u.mesh);
+
+ /* allocate information elements */
+ new_ie = NULL;
+ old_ie = ifmsh->ie;
+
+ if (setup->ie_len) {
+ new_ie = kmemdup(setup->ie, setup->ie_len,
+ GFP_KERNEL);
+ if (!new_ie)
+ return -ENOMEM;
+ }
+ ifmsh->ie_len = setup->ie_len;
+ ifmsh->ie = new_ie;
+ kfree(old_ie);
+
+ /* now copy the rest of the setup parameters */
+ ifmsh->mesh_id_len = setup->mesh_id_len;
+ memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len);
+ ifmsh->mesh_sp_id = setup->sync_method;
+ ifmsh->mesh_pp_id = setup->path_sel_proto;
+ ifmsh->mesh_pm_id = setup->path_metric;
+ ifmsh->user_mpm = setup->user_mpm;
+ ifmsh->mesh_auth_id = setup->auth_id;
+ ifmsh->security = IEEE80211_MESH_SEC_NONE;
+ if (setup->is_authenticated)
+ ifmsh->security |= IEEE80211_MESH_SEC_AUTHED;
+ if (setup->is_secure)
+ ifmsh->security |= IEEE80211_MESH_SEC_SECURED;
+
+ /* mcast rate setting in Mesh Node */
+ memcpy(sdata->vif.bss_conf.mcast_rate, setup->mcast_rate,
+ sizeof(setup->mcast_rate));
+ sdata->vif.bss_conf.basic_rates = setup->basic_rates;
+
+ sdata->vif.bss_conf.beacon_int = setup->beacon_interval;
+ sdata->vif.bss_conf.dtim_period = setup->dtim_period;
+
+ return 0;
+}
+
+static int ieee80211_update_mesh_config(struct wiphy *wiphy,
+ struct net_device *dev, u32 mask,
+ const struct mesh_config *nconf)
+{
+ struct mesh_config *conf;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_if_mesh *ifmsh;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ ifmsh = &sdata->u.mesh;
+
+ /* Set the config options which we are interested in setting */
+ conf = &(sdata->u.mesh.mshcfg);
+ if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask))
+ conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask))
+ conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask))
+ conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask))
+ conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks;
+ if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask))
+ conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries;
+ if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask))
+ conf->dot11MeshTTL = nconf->dot11MeshTTL;
+ if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask))
+ conf->element_ttl = nconf->element_ttl;
+ if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask)) {
+ if (ifmsh->user_mpm)
+ return -EBUSY;
+ conf->auto_open_plinks = nconf->auto_open_plinks;
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR, mask))
+ conf->dot11MeshNbrOffsetMaxNeighbor =
+ nconf->dot11MeshNbrOffsetMaxNeighbor;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask))
+ conf->dot11MeshHWMPmaxPREQretries =
+ nconf->dot11MeshHWMPmaxPREQretries;
+ if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask))
+ conf->path_refresh_time = nconf->path_refresh_time;
+ if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask))
+ conf->min_discovery_timeout = nconf->min_discovery_timeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask))
+ conf->dot11MeshHWMPactivePathTimeout =
+ nconf->dot11MeshHWMPactivePathTimeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask))
+ conf->dot11MeshHWMPpreqMinInterval =
+ nconf->dot11MeshHWMPpreqMinInterval;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL, mask))
+ conf->dot11MeshHWMPperrMinInterval =
+ nconf->dot11MeshHWMPperrMinInterval;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+ mask))
+ conf->dot11MeshHWMPnetDiameterTraversalTime =
+ nconf->dot11MeshHWMPnetDiameterTraversalTime;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) {
+ conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode;
+ ieee80211_mesh_root_setup(ifmsh);
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_GATE_ANNOUNCEMENTS, mask)) {
+ /* our current gate announcement implementation rides on root
+ * announcements, so require this ifmsh to also be a root node
+ * */
+ if (nconf->dot11MeshGateAnnouncementProtocol &&
+ !(conf->dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)) {
+ conf->dot11MeshHWMPRootMode = IEEE80211_PROACTIVE_RANN;
+ ieee80211_mesh_root_setup(ifmsh);
+ }
+ conf->dot11MeshGateAnnouncementProtocol =
+ nconf->dot11MeshGateAnnouncementProtocol;
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_RANN_INTERVAL, mask))
+ conf->dot11MeshHWMPRannInterval =
+ nconf->dot11MeshHWMPRannInterval;
+ if (_chg_mesh_attr(NL80211_MESHCONF_FORWARDING, mask))
+ conf->dot11MeshForwarding = nconf->dot11MeshForwarding;
+ if (_chg_mesh_attr(NL80211_MESHCONF_RSSI_THRESHOLD, mask)) {
+ /* our RSSI threshold implementation is supported only for
+ * devices that report signal in dBm.
+ */
+ if (!(sdata->local->hw.flags & IEEE80211_HW_SIGNAL_DBM))
+ return -ENOTSUPP;
+ conf->rssi_threshold = nconf->rssi_threshold;
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) {
+ conf->ht_opmode = nconf->ht_opmode;
+ sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode;
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask))
+ conf->dot11MeshHWMPactivePathToRootTimeout =
+ nconf->dot11MeshHWMPactivePathToRootTimeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOT_INTERVAL, mask))
+ conf->dot11MeshHWMProotInterval =
+ nconf->dot11MeshHWMProotInterval;
+ if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL, mask))
+ conf->dot11MeshHWMPconfirmationInterval =
+ nconf->dot11MeshHWMPconfirmationInterval;
+ if (_chg_mesh_attr(NL80211_MESHCONF_POWER_MODE, mask)) {
+ conf->power_mode = nconf->power_mode;
+ ieee80211_mps_local_status_update(sdata);
+ }
+ if (_chg_mesh_attr(NL80211_MESHCONF_AWAKE_WINDOW, mask))
+ conf->dot11MeshAwakeWindowDuration =
+ nconf->dot11MeshAwakeWindowDuration;
+ if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask))
+ conf->plink_timeout = nconf->plink_timeout;
+ ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+ return 0;
+}
+
+static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
+ const struct mesh_config *conf,
+ const struct mesh_setup *setup)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ int err;
+
+ memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config));
+ err = copy_mesh_setup(ifmsh, setup);
+ if (err)
+ return err;
+
+ /* can mesh use other SMPS modes? */
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ sdata->needed_rx_chains = sdata->local->rx_chains;
+
+ mutex_lock(&sdata->local->mtx);
+ err = ieee80211_vif_use_channel(sdata, &setup->chandef,
+ IEEE80211_CHANCTX_SHARED);
+ mutex_unlock(&sdata->local->mtx);
+ if (err)
+ return err;
+
+ return ieee80211_start_mesh(sdata);
+}
+
+static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ ieee80211_stop_mesh(sdata);
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
+
+ return 0;
+}
+#endif
+
+static int ieee80211_change_bss(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct bss_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ enum ieee80211_band band;
+ u32 changed = 0;
+
+ if (!sdata_dereference(sdata->u.ap.beacon, sdata))
+ return -ENOENT;
+
+ band = ieee80211_get_sdata_band(sdata);
+
+ if (params->use_cts_prot >= 0) {
+ sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot;
+ changed |= BSS_CHANGED_ERP_CTS_PROT;
+ }
+ if (params->use_short_preamble >= 0) {
+ sdata->vif.bss_conf.use_short_preamble =
+ params->use_short_preamble;
+ changed |= BSS_CHANGED_ERP_PREAMBLE;
+ }
+
+ if (!sdata->vif.bss_conf.use_short_slot &&
+ band == IEEE80211_BAND_5GHZ) {
+ sdata->vif.bss_conf.use_short_slot = true;
+ changed |= BSS_CHANGED_ERP_SLOT;
+ }
+
+ if (params->use_short_slot_time >= 0) {
+ sdata->vif.bss_conf.use_short_slot =
+ params->use_short_slot_time;
+ changed |= BSS_CHANGED_ERP_SLOT;
+ }
+
+ if (params->basic_rates) {
+ ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
+ wiphy->bands[band],
+ params->basic_rates,
+ params->basic_rates_len,
+ &sdata->vif.bss_conf.basic_rates);
+ changed |= BSS_CHANGED_BASIC_RATES;
+ }
+
+ if (params->ap_isolate >= 0) {
+ if (params->ap_isolate)
+ sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+ else
+ sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+ }
+
+ if (params->ht_opmode >= 0) {
+ sdata->vif.bss_conf.ht_operation_mode =
+ (u16) params->ht_opmode;
+ changed |= BSS_CHANGED_HT;
+ }
+
+ if (params->p2p_ctwindow >= 0) {
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
+ changed |= BSS_CHANGED_P2P_PS;
+ }
+
+ if (params->p2p_opp_ps > 0) {
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ IEEE80211_P2P_OPPPS_ENABLE_BIT;
+ changed |= BSS_CHANGED_P2P_PS;
+ } else if (params->p2p_opp_ps == 0) {
+ sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ ~IEEE80211_P2P_OPPPS_ENABLE_BIT;
+ changed |= BSS_CHANGED_P2P_PS;
+ }
+
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ return 0;
+}
+
+static int ieee80211_set_txq_params(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct ieee80211_txq_params *params)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_tx_queue_params p;
+
+ if (!local->ops->conf_tx)
+ return -EOPNOTSUPP;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return -EOPNOTSUPP;
+
+ memset(&p, 0, sizeof(p));
+ p.aifs = params->aifs;
+ p.cw_max = params->cwmax;
+ p.cw_min = params->cwmin;
+ p.txop = params->txop;
+
+ /*
+ * Setting tx queue params disables u-apsd because it's only
+ * called in master mode.
+ */
+ p.uapsd = false;
+
+ sdata->tx_conf[params->ac] = p;
+ if (drv_conf_tx(local, sdata, params->ac, &p)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to set TX queue parameters for AC %d\n",
+ params->ac);
+ return -EINVAL;
+ }
+
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM
+static int ieee80211_suspend(struct wiphy *wiphy,
+ struct cfg80211_wowlan *wowlan)
+{
+ return __ieee80211_suspend(wiphy_priv(wiphy), wowlan);
+}
+
+static int ieee80211_resume(struct wiphy *wiphy)
+{
+ return __ieee80211_resume(wiphy_priv(wiphy));
+}
+#else
+#define ieee80211_suspend NULL
+#define ieee80211_resume NULL
+#endif
+
+static int ieee80211_scan(struct wiphy *wiphy,
+ struct cfg80211_scan_request *req)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev);
+
+ switch (ieee80211_vif_type_p2p(&sdata->vif)) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ case NL80211_IFTYPE_P2P_GO:
+ if (sdata->local->ops->hw_scan)
+ break;
+ /*
+ * FIXME: implement NoA while scanning in software,
+ * for now fall through to allow scanning only when
+ * beaconing hasn't been configured yet
+ */
+ case NL80211_IFTYPE_AP:
+ /*
+ * If the scan has been forced (and the driver supports
+ * forcing), don't care about being beaconing already.
+ * This will create problems to the attached stations (e.g. all
+ * the frames sent while scanning on other channel will be
+ * lost)
+ */
+ if (sdata->u.ap.beacon &&
+ (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
+ !(req->flags & NL80211_SCAN_FLAG_AP)))
+ return -EOPNOTSUPP;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return ieee80211_request_scan(sdata, req);
+}
+
+static int
+ieee80211_sched_scan_start(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_sched_scan_request *req)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (!sdata->local->ops->sched_scan_start)
+ return -EOPNOTSUPP;
+
+ return ieee80211_request_sched_scan_start(sdata, req);
+}
+
+static int
+ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (!sdata->local->ops->sched_scan_stop)
+ return -EOPNOTSUPP;
+
+ return ieee80211_request_sched_scan_stop(sdata);
+}
+
+static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_auth_request *req)
+{
+ return ieee80211_mgd_auth(IEEE80211_DEV_TO_SUB_IF(dev), req);
+}
+
+static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_assoc_request *req)
+{
+ return ieee80211_mgd_assoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
+}
+
+static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_deauth_request *req)
+{
+ return ieee80211_mgd_deauth(IEEE80211_DEV_TO_SUB_IF(dev), req);
+}
+
+static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_disassoc_request *req)
+{
+ return ieee80211_mgd_disassoc(IEEE80211_DEV_TO_SUB_IF(dev), req);
+}
+
+static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_ibss_params *params)
+{
+ return ieee80211_ibss_join(IEEE80211_DEV_TO_SUB_IF(dev), params);
+}
+
+static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev)
+{
+ return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev));
+}
+
+static int ieee80211_join_ocb(struct wiphy *wiphy, struct net_device *dev,
+ struct ocb_setup *setup)
+{
+ return ieee80211_ocb_join(IEEE80211_DEV_TO_SUB_IF(dev), setup);
+}
+
+static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev)
+{
+ return ieee80211_ocb_leave(IEEE80211_DEV_TO_SUB_IF(dev));
+}
+
+static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
+ int rate[IEEE80211_NUM_BANDS])
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ memcpy(sdata->vif.bss_conf.mcast_rate, rate,
+ sizeof(int) * IEEE80211_NUM_BANDS);
+
+ return 0;
+}
+
+static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ int err;
+
+ if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
+ err = drv_set_frag_threshold(local, wiphy->frag_threshold);
+
+ if (err)
+ return err;
+ }
+
+ if ((changed & WIPHY_PARAM_COVERAGE_CLASS) ||
+ (changed & WIPHY_PARAM_DYN_ACK)) {
+ s16 coverage_class;
+
+ coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ?
+ wiphy->coverage_class : -1;
+ err = drv_set_coverage_class(local, coverage_class);
+
+ if (err)
+ return err;
+ }
+
+ if (changed & WIPHY_PARAM_RTS_THRESHOLD) {
+ err = drv_set_rts_threshold(local, wiphy->rts_threshold);
+
+ if (err)
+ return err;
+ }
+
+ if (changed & WIPHY_PARAM_RETRY_SHORT) {
+ if (wiphy->retry_short > IEEE80211_MAX_TX_RETRY)
+ return -EINVAL;
+ local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
+ }
+ if (changed & WIPHY_PARAM_RETRY_LONG) {
+ if (wiphy->retry_long > IEEE80211_MAX_TX_RETRY)
+ return -EINVAL;
+ local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
+ }
+ if (changed &
+ (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
+
+ return 0;
+}
+
+static int ieee80211_set_tx_power(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ enum nl80211_tx_power_setting type, int mbm)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata;
+ enum nl80211_tx_power_setting txp_type = type;
+ bool update_txp_type = false;
+
+ if (wdev) {
+ sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ switch (type) {
+ case NL80211_TX_POWER_AUTOMATIC:
+ sdata->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
+ txp_type = NL80211_TX_POWER_LIMITED;
+ break;
+ case NL80211_TX_POWER_LIMITED:
+ case NL80211_TX_POWER_FIXED:
+ if (mbm < 0 || (mbm % 100))
+ return -EOPNOTSUPP;
+ sdata->user_power_level = MBM_TO_DBM(mbm);
+ break;
+ }
+
+ if (txp_type != sdata->vif.bss_conf.txpower_type) {
+ update_txp_type = true;
+ sdata->vif.bss_conf.txpower_type = txp_type;
+ }
+
+ ieee80211_recalc_txpower(sdata, update_txp_type);
+
+ return 0;
+ }
+
+ switch (type) {
+ case NL80211_TX_POWER_AUTOMATIC:
+ local->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
+ txp_type = NL80211_TX_POWER_LIMITED;
+ break;
+ case NL80211_TX_POWER_LIMITED:
+ case NL80211_TX_POWER_FIXED:
+ if (mbm < 0 || (mbm % 100))
+ return -EOPNOTSUPP;
+ local->user_power_level = MBM_TO_DBM(mbm);
+ break;
+ }
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ sdata->user_power_level = local->user_power_level;
+ if (txp_type != sdata->vif.bss_conf.txpower_type)
+ update_txp_type = true;
+ sdata->vif.bss_conf.txpower_type = txp_type;
+ }
+ list_for_each_entry(sdata, &local->interfaces, list)
+ ieee80211_recalc_txpower(sdata, update_txp_type);
+ mutex_unlock(&local->iflist_mtx);
+
+ return 0;
+}
+
+static int ieee80211_get_tx_power(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ int *dbm)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ if (local->ops->get_txpower)
+ return drv_get_txpower(local, sdata, dbm);
+
+ if (!local->use_chanctx)
+ *dbm = local->hw.conf.power_level;
+ else
+ *dbm = sdata->vif.bss_conf.txpower;
+
+ return 0;
+}
+
+static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *addr)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN);
+
+ return 0;
+}
+
+static void ieee80211_rfkill_poll(struct wiphy *wiphy)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ drv_rfkill_poll(local);
+}
+
+#ifdef CONFIG_NL80211_TESTMODE
+static int ieee80211_testmode_cmd(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ void *data, int len)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_vif *vif = NULL;
+
+ if (!local->ops->testmode_cmd)
+ return -EOPNOTSUPP;
+
+ if (wdev) {
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ if (sdata->flags & IEEE80211_SDATA_IN_DRIVER)
+ vif = &sdata->vif;
+ }
+
+ return local->ops->testmode_cmd(&local->hw, vif, data, len);
+}
+
+static int ieee80211_testmode_dump(struct wiphy *wiphy,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ void *data, int len)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (!local->ops->testmode_dump)
+ return -EOPNOTSUPP;
+
+ return local->ops->testmode_dump(&local->hw, skb, cb, data, len);
+}
+#endif
+
+int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps_mode)
+{
+ struct sta_info *sta;
+ enum ieee80211_smps_mode old_req;
+
+ if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP))
+ return -EINVAL;
+
+ if (sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+ return 0;
+
+ old_req = sdata->u.ap.req_smps;
+ sdata->u.ap.req_smps = smps_mode;
+
+ /* AUTOMATIC doesn't mean much for AP - don't allow it */
+ if (old_req == smps_mode ||
+ smps_mode == IEEE80211_SMPS_AUTOMATIC)
+ return 0;
+
+ /* If no associated stations, there's no need to do anything */
+ if (!atomic_read(&sdata->u.ap.num_mcast_sta)) {
+ sdata->smps_mode = smps_mode;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
+ return 0;
+ }
+
+ ht_dbg(sdata,
+ "SMPS %d requested in AP mode, sending Action frame to %d stations\n",
+ smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
+
+ mutex_lock(&sdata->local->sta_mtx);
+ list_for_each_entry(sta, &sdata->local->sta_list, list) {
+ /*
+ * Only stations associated to our AP and
+ * associated VLANs
+ */
+ if (sta->sdata->bss != &sdata->u.ap)
+ continue;
+
+ /* This station doesn't support MIMO - skip it */
+ if (sta_info_tx_streams(sta) == 1)
+ continue;
+
+ /*
+ * Don't wake up a STA just to send the action frame
+ * unless we are getting more restrictive.
+ */
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ !ieee80211_smps_is_restrictive(sta->known_smps_mode,
+ smps_mode)) {
+ ht_dbg(sdata, "Won't send SMPS to sleeping STA %pM\n",
+ sta->sta.addr);
+ continue;
+ }
+
+ /*
+ * If the STA is not authorized, wait until it gets
+ * authorized and the action frame will be sent then.
+ */
+ if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ continue;
+
+ ht_dbg(sdata, "Sending SMPS to %pM\n", sta->sta.addr);
+ ieee80211_send_smps_action(sdata, smps_mode, sta->sta.addr,
+ sdata->vif.bss_conf.bssid);
+ }
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ sdata->smps_mode = smps_mode;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
+
+ return 0;
+}
+
+int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps_mode)
+{
+ const u8 *ap;
+ enum ieee80211_smps_mode old_req;
+ int err;
+
+ lockdep_assert_held(&sdata->wdev.mtx);
+
+ if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ old_req = sdata->u.mgd.req_smps;
+ sdata->u.mgd.req_smps = smps_mode;
+
+ if (old_req == smps_mode &&
+ smps_mode != IEEE80211_SMPS_AUTOMATIC)
+ return 0;
+
+ /*
+ * If not associated, or current association is not an HT
+ * association, there's no need to do anything, just store
+ * the new value until we associate.
+ */
+ if (!sdata->u.mgd.associated ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+ return 0;
+
+ ap = sdata->u.mgd.associated->bssid;
+
+ if (smps_mode == IEEE80211_SMPS_AUTOMATIC) {
+ if (sdata->u.mgd.powersave)
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
+ else
+ smps_mode = IEEE80211_SMPS_OFF;
+ }
+
+ /* send SM PS frame to AP */
+ err = ieee80211_send_smps_action(sdata, smps_mode,
+ ap, ap);
+ if (err)
+ sdata->u.mgd.req_smps = old_req;
+
+ return err;
+}
+
+static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
+ bool enabled, int timeout)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+ return -EOPNOTSUPP;
+
+ if (enabled == sdata->u.mgd.powersave &&
+ timeout == local->dynamic_ps_forced_timeout)
+ return 0;
+
+ sdata->u.mgd.powersave = enabled;
+ local->dynamic_ps_forced_timeout = timeout;
+
+ /* no change, but if automatic follow powersave */
+ sdata_lock(sdata);
+ __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps);
+ sdata_unlock(sdata);
+
+ if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+
+ ieee80211_recalc_ps(local, -1);
+ ieee80211_recalc_ps_vif(sdata);
+
+ return 0;
+}
+
+static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ s32 rssi_thold, u32 rssi_hyst)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_vif *vif = &sdata->vif;
+ struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+
+ if (rssi_thold == bss_conf->cqm_rssi_thold &&
+ rssi_hyst == bss_conf->cqm_rssi_hyst)
+ return 0;
+
+ bss_conf->cqm_rssi_thold = rssi_thold;
+ bss_conf->cqm_rssi_hyst = rssi_hyst;
+
+ /* tell the driver upon association, unless already associated */
+ if (sdata->u.mgd.associated &&
+ sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM);
+
+ return 0;
+}
+
+static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
+ struct net_device *dev,
+ const u8 *addr,
+ const struct cfg80211_bitrate_mask *mask)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+ int i, ret;
+
+ if (!ieee80211_sdata_running(sdata))
+ return -ENETDOWN;
+
+ if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+ ret = drv_set_bitrate_mask(local, sdata, mask);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ struct ieee80211_supported_band *sband = wiphy->bands[i];
+ int j;
+
+ sdata->rc_rateidx_mask[i] = mask->control[i].legacy;
+ memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs,
+ sizeof(mask->control[i].ht_mcs));
+
+ sdata->rc_has_mcs_mask[i] = false;
+ if (!sband)
+ continue;
+
+ for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++)
+ if (~sdata->rc_rateidx_mcs_mask[i][j]) {
+ sdata->rc_has_mcs_mask[i] = true;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local,
+ struct ieee80211_roc_work *new_roc,
+ struct ieee80211_roc_work *cur_roc)
+{
+ unsigned long now = jiffies;
+ unsigned long remaining = cur_roc->hw_start_time +
+ msecs_to_jiffies(cur_roc->duration) -
+ now;
+
+ if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun))
+ return false;
+
+ /* if it doesn't fit entirely, schedule a new one */
+ if (new_roc->duration > jiffies_to_msecs(remaining))
+ return false;
+
+ ieee80211_handle_roc_started(new_roc);
+
+ /* add to dependents so we send the expired event properly */
+ list_add_tail(&new_roc->list, &cur_roc->dependents);
+ return true;
+}
+
+static int ieee80211_start_roc_work(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *channel,
+ unsigned int duration, u64 *cookie,
+ struct sk_buff *txskb,
+ enum ieee80211_roc_type type)
+{
+ struct ieee80211_roc_work *roc, *tmp;
+ bool queued = false;
+ int ret;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (local->use_chanctx && !local->ops->remain_on_channel)
+ return -EOPNOTSUPP;
+
+ roc = kzalloc(sizeof(*roc), GFP_KERNEL);
+ if (!roc)
+ return -ENOMEM;
+
+ /*
+ * If the duration is zero, then the driver
+ * wouldn't actually do anything. Set it to
+ * 10 for now.
+ *
+ * TODO: cancel the off-channel operation
+ * when we get the SKB's TX status and
+ * the wait time was zero before.
+ */
+ if (!duration)
+ duration = 10;
+
+ roc->chan = channel;
+ roc->duration = duration;
+ roc->req_duration = duration;
+ roc->frame = txskb;
+ roc->type = type;
+ roc->mgmt_tx_cookie = (unsigned long)txskb;
+ roc->sdata = sdata;
+ INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work);
+ INIT_LIST_HEAD(&roc->dependents);
+
+ /*
+ * cookie is either the roc cookie (for normal roc)
+ * or the SKB (for mgmt TX)
+ */
+ if (!txskb) {
+ /* local->mtx protects this */
+ local->roc_cookie_counter++;
+ roc->cookie = local->roc_cookie_counter;
+ /* wow, you wrapped 64 bits ... more likely a bug */
+ if (WARN_ON(roc->cookie == 0)) {
+ roc->cookie = 1;
+ local->roc_cookie_counter++;
+ }
+ *cookie = roc->cookie;
+ } else {
+ *cookie = (unsigned long)txskb;
+ }
+
+ /* if there's one pending or we're scanning, queue this one */
+ if (!list_empty(&local->roc_list) ||
+ local->scanning || ieee80211_is_radar_required(local))
+ goto out_check_combine;
+
+ /* if not HW assist, just queue & schedule work */
+ if (!local->ops->remain_on_channel) {
+ ieee80211_queue_delayed_work(&local->hw, &roc->work, 0);
+ goto out_queue;
+ }
+
+ /* otherwise actually kick it off here (for error handling) */
+
+ ret = drv_remain_on_channel(local, sdata, channel, duration, type);
+ if (ret) {
+ kfree(roc);
+ return ret;
+ }
+
+ roc->started = true;
+ goto out_queue;
+
+ out_check_combine:
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp->chan != channel || tmp->sdata != sdata)
+ continue;
+
+ /*
+ * Extend this ROC if possible:
+ *
+ * If it hasn't started yet, just increase the duration
+ * and add the new one to the list of dependents.
+ * If the type of the new ROC has higher priority, modify the
+ * type of the previous one to match that of the new one.
+ */
+ if (!tmp->started) {
+ list_add_tail(&roc->list, &tmp->dependents);
+ tmp->duration = max(tmp->duration, roc->duration);
+ tmp->type = max(tmp->type, roc->type);
+ queued = true;
+ break;
+ }
+
+ /* If it has already started, it's more difficult ... */
+ if (local->ops->remain_on_channel) {
+ /*
+ * In the offloaded ROC case, if it hasn't begun, add
+ * this new one to the dependent list to be handled
+ * when the master one begins. If it has begun,
+ * check if it fits entirely within the existing one,
+ * in which case it will just be dependent as well.
+ * Otherwise, schedule it by itself.
+ */
+ if (!tmp->hw_begun) {
+ list_add_tail(&roc->list, &tmp->dependents);
+ queued = true;
+ break;
+ }
+
+ if (ieee80211_coalesce_started_roc(local, roc, tmp))
+ queued = true;
+ } else if (del_timer_sync(&tmp->work.timer)) {
+ unsigned long new_end;
+
+ /*
+ * In the software ROC case, cancel the timer, if
+ * that fails then the finish work is already
+ * queued/pending and thus we queue the new ROC
+ * normally, if that succeeds then we can extend
+ * the timer duration and TX the frame (if any.)
+ */
+
+ list_add_tail(&roc->list, &tmp->dependents);
+ queued = true;
+
+ new_end = jiffies + msecs_to_jiffies(roc->duration);
+
+ /* ok, it was started & we canceled timer */
+ if (time_after(new_end, tmp->work.timer.expires))
+ mod_timer(&tmp->work.timer, new_end);
+ else
+ add_timer(&tmp->work.timer);
+
+ ieee80211_handle_roc_started(roc);
+ }
+ break;
+ }
+
+ out_queue:
+ if (!queued)
+ list_add_tail(&roc->list, &local->roc_list);
+
+ return 0;
+}
+
+static int ieee80211_remain_on_channel(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct ieee80211_channel *chan,
+ unsigned int duration,
+ u64 *cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+
+ mutex_lock(&local->mtx);
+ ret = ieee80211_start_roc_work(local, sdata, chan,
+ duration, cookie, NULL,
+ IEEE80211_ROC_TYPE_NORMAL);
+ mutex_unlock(&local->mtx);
+
+ return ret;
+}
+
+static int ieee80211_cancel_roc(struct ieee80211_local *local,
+ u64 cookie, bool mgmt_tx)
+{
+ struct ieee80211_roc_work *roc, *tmp, *found = NULL;
+ int ret;
+
+ mutex_lock(&local->mtx);
+ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
+ struct ieee80211_roc_work *dep, *tmp2;
+
+ list_for_each_entry_safe(dep, tmp2, &roc->dependents, list) {
+ if (!mgmt_tx && dep->cookie != cookie)
+ continue;
+ else if (mgmt_tx && dep->mgmt_tx_cookie != cookie)
+ continue;
+ /* found dependent item -- just remove it */
+ list_del(&dep->list);
+ mutex_unlock(&local->mtx);
+
+ ieee80211_roc_notify_destroy(dep, true);
+ return 0;
+ }
+
+ if (!mgmt_tx && roc->cookie != cookie)
+ continue;
+ else if (mgmt_tx && roc->mgmt_tx_cookie != cookie)
+ continue;
+
+ found = roc;
+ break;
+ }
+
+ if (!found) {
+ mutex_unlock(&local->mtx);
+ return -ENOENT;
+ }
+
+ /*
+ * We found the item to cancel, so do that. Note that it
+ * may have dependents, which we also cancel (and send
+ * the expired signal for.) Not doing so would be quite
+ * tricky here, but we may need to fix it later.
+ */
+
+ if (local->ops->remain_on_channel) {
+ if (found->started) {
+ ret = drv_cancel_remain_on_channel(local);
+ if (WARN_ON_ONCE(ret)) {
+ mutex_unlock(&local->mtx);
+ return ret;
+ }
+ }
+
+ list_del(&found->list);
+
+ if (found->started)
+ ieee80211_start_next_roc(local);
+ mutex_unlock(&local->mtx);
+
+ ieee80211_roc_notify_destroy(found, true);
+ } else {
+ /* work may be pending so use it all the time */
+ found->abort = true;
+ ieee80211_queue_delayed_work(&local->hw, &found->work, 0);
+
+ mutex_unlock(&local->mtx);
+
+ /* work will clean up etc */
+ flush_delayed_work(&found->work);
+ WARN_ON(!found->to_be_freed);
+ kfree(found);
+ }
+
+ return 0;
+}
+
+static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ u64 cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+
+ return ieee80211_cancel_roc(local, cookie, false);
+}
+
+static int ieee80211_start_radar_detection(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_chan_def *chandef,
+ u32 cac_time_ms)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ int err;
+
+ mutex_lock(&local->mtx);
+ if (!list_empty(&local->roc_list) || local->scanning) {
+ err = -EBUSY;
+ goto out_unlock;
+ }
+
+ /* whatever, but channel contexts should not complain about that one */
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ sdata->needed_rx_chains = local->rx_chains;
+
+ err = ieee80211_vif_use_channel(sdata, chandef,
+ IEEE80211_CHANCTX_SHARED);
+ if (err)
+ goto out_unlock;
+
+ ieee80211_queue_delayed_work(&sdata->local->hw,
+ &sdata->dfs_cac_timer_work,
+ msecs_to_jiffies(cac_time_ms));
+
+ out_unlock:
+ mutex_unlock(&local->mtx);
+ return err;
+}
+
+static struct cfg80211_beacon_data *
+cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
+{
+ struct cfg80211_beacon_data *new_beacon;
+ u8 *pos;
+ int len;
+
+ len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
+ beacon->proberesp_ies_len + beacon->assocresp_ies_len +
+ beacon->probe_resp_len;
+
+ new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
+ if (!new_beacon)
+ return NULL;
+
+ pos = (u8 *)(new_beacon + 1);
+ if (beacon->head_len) {
+ new_beacon->head_len = beacon->head_len;
+ new_beacon->head = pos;
+ memcpy(pos, beacon->head, beacon->head_len);
+ pos += beacon->head_len;
+ }
+ if (beacon->tail_len) {
+ new_beacon->tail_len = beacon->tail_len;
+ new_beacon->tail = pos;
+ memcpy(pos, beacon->tail, beacon->tail_len);
+ pos += beacon->tail_len;
+ }
+ if (beacon->beacon_ies_len) {
+ new_beacon->beacon_ies_len = beacon->beacon_ies_len;
+ new_beacon->beacon_ies = pos;
+ memcpy(pos, beacon->beacon_ies, beacon->beacon_ies_len);
+ pos += beacon->beacon_ies_len;
+ }
+ if (beacon->proberesp_ies_len) {
+ new_beacon->proberesp_ies_len = beacon->proberesp_ies_len;
+ new_beacon->proberesp_ies = pos;
+ memcpy(pos, beacon->proberesp_ies, beacon->proberesp_ies_len);
+ pos += beacon->proberesp_ies_len;
+ }
+ if (beacon->assocresp_ies_len) {
+ new_beacon->assocresp_ies_len = beacon->assocresp_ies_len;
+ new_beacon->assocresp_ies = pos;
+ memcpy(pos, beacon->assocresp_ies, beacon->assocresp_ies_len);
+ pos += beacon->assocresp_ies_len;
+ }
+ if (beacon->probe_resp_len) {
+ new_beacon->probe_resp_len = beacon->probe_resp_len;
+ beacon->probe_resp = pos;
+ memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
+ pos += beacon->probe_resp_len;
+ }
+
+ return new_beacon;
+}
+
+void ieee80211_csa_finish(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->csa_finalize_work);
+}
+EXPORT_SYMBOL(ieee80211_csa_finish);
+
+static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ u32 *changed)
+{
+ int err;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon,
+ NULL);
+ kfree(sdata->u.ap.next_beacon);
+ sdata->u.ap.next_beacon = NULL;
+
+ if (err < 0)
+ return err;
+ *changed |= err;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ err = ieee80211_ibss_finish_csa(sdata);
+ if (err < 0)
+ return err;
+ *changed |= err;
+ break;
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ err = ieee80211_mesh_finish_csa(sdata);
+ if (err < 0)
+ return err;
+ *changed |= err;
+ break;
+#endif
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ u32 changed = 0;
+ int err;
+
+ sdata_assert_lock(sdata);
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ /*
+ * using reservation isn't immediate as it may be deferred until later
+ * with multi-vif. once reservation is complete it will re-schedule the
+ * work with no reserved_chanctx so verify chandef to check if it
+ * completed successfully
+ */
+
+ if (sdata->reserved_chanctx) {
+ /*
+ * with multi-vif csa driver may call ieee80211_csa_finish()
+ * many times while waiting for other interfaces to use their
+ * reservations
+ */
+ if (sdata->reserved_ready)
+ return 0;
+
+ return ieee80211_vif_use_reserved_context(sdata);
+ }
+
+ if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
+ &sdata->csa_chandef))
+ return -EINVAL;
+
+ sdata->vif.csa_active = false;
+
+ err = ieee80211_set_after_csa_beacon(sdata, &changed);
+ if (err)
+ return err;
+
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+
+ err = drv_post_channel_switch(sdata);
+ if (err)
+ return err;
+
+ cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef);
+
+ return 0;
+}
+
+static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+{
+ if (__ieee80211_csa_finalize(sdata)) {
+ sdata_info(sdata, "failed to finalize CSA, disconnecting\n");
+ cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev,
+ GFP_KERNEL);
+ }
+}
+
+void ieee80211_csa_finalize_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ csa_finalize_work);
+ struct ieee80211_local *local = sdata->local;
+
+ sdata_lock(sdata);
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->chanctx_mtx);
+
+ /* AP might have been stopped while waiting for the lock. */
+ if (!sdata->vif.csa_active)
+ goto unlock;
+
+ if (!ieee80211_sdata_running(sdata))
+ goto unlock;
+
+ ieee80211_csa_finalize(sdata);
+
+unlock:
+ mutex_unlock(&local->chanctx_mtx);
+ mutex_unlock(&local->mtx);
+ sdata_unlock(sdata);
+}
+
+static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *params,
+ u32 *changed)
+{
+ struct ieee80211_csa_settings csa = {};
+ int err;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ sdata->u.ap.next_beacon =
+ cfg80211_beacon_dup(&params->beacon_after);
+ if (!sdata->u.ap.next_beacon)
+ return -ENOMEM;
+
+ /*
+ * With a count of 0, we don't have to wait for any
+ * TBTT before switching, so complete the CSA
+ * immediately. In theory, with a count == 1 we
+ * should delay the switch until just before the next
+ * TBTT, but that would complicate things so we switch
+ * immediately too. If we would delay the switch
+ * until the next TBTT, we would have to set the probe
+ * response here.
+ *
+ * TODO: A channel switch with count <= 1 without
+ * sending a CSA action frame is kind of useless,
+ * because the clients won't know we're changing
+ * channels. The action frame must be implemented
+ * either here or in the userspace.
+ */
+ if (params->count <= 1)
+ break;
+
+ if ((params->n_counter_offsets_beacon >
+ IEEE80211_MAX_CSA_COUNTERS_NUM) ||
+ (params->n_counter_offsets_presp >
+ IEEE80211_MAX_CSA_COUNTERS_NUM))
+ return -EINVAL;
+
+ csa.counter_offsets_beacon = params->counter_offsets_beacon;
+ csa.counter_offsets_presp = params->counter_offsets_presp;
+ csa.n_counter_offsets_beacon = params->n_counter_offsets_beacon;
+ csa.n_counter_offsets_presp = params->n_counter_offsets_presp;
+ csa.count = params->count;
+
+ err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa);
+ if (err < 0) {
+ kfree(sdata->u.ap.next_beacon);
+ return err;
+ }
+ *changed |= err;
+
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ if (!sdata->vif.bss_conf.ibss_joined)
+ return -EINVAL;
+
+ if (params->chandef.width != sdata->u.ibss.chandef.width)
+ return -EINVAL;
+
+ switch (params->chandef.width) {
+ case NL80211_CHAN_WIDTH_40:
+ if (cfg80211_get_chandef_type(&params->chandef) !=
+ cfg80211_get_chandef_type(&sdata->u.ibss.chandef))
+ return -EINVAL;
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* changes into another band are not supported */
+ if (sdata->u.ibss.chandef.chan->band !=
+ params->chandef.chan->band)
+ return -EINVAL;
+
+ /* see comments in the NL80211_IFTYPE_AP block */
+ if (params->count > 1) {
+ err = ieee80211_ibss_csa_beacon(sdata, params);
+ if (err < 0)
+ return err;
+ *changed |= err;
+ }
+
+ ieee80211_send_action_csa(sdata, params);
+
+ break;
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT: {
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ if (params->chandef.width != sdata->vif.bss_conf.chandef.width)
+ return -EINVAL;
+
+ /* changes into another band are not supported */
+ if (sdata->vif.bss_conf.chandef.chan->band !=
+ params->chandef.chan->band)
+ return -EINVAL;
+
+ if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_NONE) {
+ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_INIT;
+ if (!ifmsh->pre_value)
+ ifmsh->pre_value = 1;
+ else
+ ifmsh->pre_value++;
+ }
+
+ /* see comments in the NL80211_IFTYPE_AP block */
+ if (params->count > 1) {
+ err = ieee80211_mesh_csa_beacon(sdata, params);
+ if (err < 0) {
+ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE;
+ return err;
+ }
+ *changed |= err;
+ }
+
+ if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT)
+ ieee80211_send_action_csa(sdata, params);
+
+ break;
+ }
+#endif
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int
+__ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_csa_settings *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_channel_switch ch_switch;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *chanctx;
+ u32 changed = 0;
+ int err;
+
+ sdata_assert_lock(sdata);
+ lockdep_assert_held(&local->mtx);
+
+ if (!list_empty(&local->roc_list) || local->scanning)
+ return -EBUSY;
+
+ if (sdata->wdev.cac_started)
+ return -EBUSY;
+
+ if (cfg80211_chandef_identical(&params->chandef,
+ &sdata->vif.bss_conf.chandef))
+ return -EINVAL;
+
+ /* don't allow another channel switch if one is already active. */
+ if (sdata->vif.csa_active)
+ return -EBUSY;
+
+ mutex_lock(&local->chanctx_mtx);
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (!conf) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ chanctx = container_of(conf, struct ieee80211_chanctx, conf);
+ if (!chanctx) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ ch_switch.timestamp = 0;
+ ch_switch.device_timestamp = 0;
+ ch_switch.block_tx = params->block_tx;
+ ch_switch.chandef = params->chandef;
+ ch_switch.count = params->count;
+
+ err = drv_pre_channel_switch(sdata, &ch_switch);
+ if (err)
+ goto out;
+
+ err = ieee80211_vif_reserve_chanctx(sdata, &params->chandef,
+ chanctx->mode,
+ params->radar_required);
+ if (err)
+ goto out;
+
+ /* if reservation is invalid then this will fail */
+ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0);
+ if (err) {
+ ieee80211_vif_unreserve_chanctx(sdata);
+ goto out;
+ }
+
+ err = ieee80211_set_csa_beacon(sdata, params, &changed);
+ if (err) {
+ ieee80211_vif_unreserve_chanctx(sdata);
+ goto out;
+ }
+
+ sdata->csa_chandef = params->chandef;
+ sdata->csa_block_tx = params->block_tx;
+ sdata->vif.csa_active = true;
+
+ if (sdata->csa_block_tx)
+ ieee80211_stop_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+
+ cfg80211_ch_switch_started_notify(sdata->dev, &sdata->csa_chandef,
+ params->count);
+
+ if (changed) {
+ ieee80211_bss_info_change_notify(sdata, changed);
+ drv_channel_switch_beacon(sdata, &params->chandef);
+ } else {
+ /* if the beacon didn't change, we can finalize immediately */
+ ieee80211_csa_finalize(sdata);
+ }
+
+out:
+ mutex_unlock(&local->chanctx_mtx);
+ return err;
+}
+
+int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_csa_settings *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ int err;
+
+ mutex_lock(&local->mtx);
+ err = __ieee80211_channel_switch(wiphy, dev, params);
+ mutex_unlock(&local->mtx);
+
+ return err;
+}
+
+static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params,
+ u64 *cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct sta_info *sta;
+ const struct ieee80211_mgmt *mgmt = (void *)params->buf;
+ bool need_offchan = false;
+ u32 flags;
+ int ret;
+ u8 *data;
+
+ if (params->dont_wait_for_ack)
+ flags = IEEE80211_TX_CTL_NO_ACK;
+ else
+ flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ if (params->no_cck)
+ flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_ADHOC:
+ if (!sdata->vif.bss_conf.ibss_joined)
+ need_offchan = true;
+ /* fall through */
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ !sdata->u.mesh.mesh_id_len)
+ need_offchan = true;
+ /* fall through */
+#endif
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ !ieee80211_vif_is_mesh(&sdata->vif) &&
+ !rcu_access_pointer(sdata->bss->beacon))
+ need_offchan = true;
+ if (!ieee80211_is_action(mgmt->frame_control) ||
+ mgmt->u.action.category == WLAN_CATEGORY_PUBLIC ||
+ mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED ||
+ mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT)
+ break;
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->da);
+ rcu_read_unlock();
+ if (!sta)
+ return -ENOLINK;
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (!sdata->u.mgd.associated)
+ need_offchan = true;
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ need_offchan = true;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* configurations requiring offchan cannot work if no channel has been
+ * specified
+ */
+ if (need_offchan && !params->chan)
+ return -EINVAL;
+
+ mutex_lock(&local->mtx);
+
+ /* Check if the operating channel is the requested channel */
+ if (!need_offchan) {
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+
+ if (chanctx_conf) {
+ need_offchan = params->chan &&
+ (params->chan !=
+ chanctx_conf->def.chan);
+ } else if (!params->chan) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock;
+ } else {
+ need_offchan = true;
+ }
+ rcu_read_unlock();
+ }
+
+ if (need_offchan && !params->offchan) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ data = skb_put(skb, params->len);
+ memcpy(data, params->buf, params->len);
+
+ /* Update CSA counters */
+ if (sdata->vif.csa_active &&
+ (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
+ sdata->vif.type == NL80211_IFTYPE_ADHOC) &&
+ params->n_csa_offsets) {
+ int i;
+ struct beacon_data *beacon = NULL;
+
+ rcu_read_lock();
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ beacon = rcu_dereference(sdata->u.ap.beacon);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ beacon = rcu_dereference(sdata->u.ibss.presp);
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ beacon = rcu_dereference(sdata->u.mesh.beacon);
+
+ if (beacon)
+ for (i = 0; i < params->n_csa_offsets; i++)
+ data[params->csa_offsets[i]] =
+ beacon->csa_current_counter;
+
+ rcu_read_unlock();
+ }
+
+ IEEE80211_SKB_CB(skb)->flags = flags;
+
+ skb->dev = sdata->dev;
+
+ if (!need_offchan) {
+ *cookie = (unsigned long) skb;
+ ieee80211_tx_skb(sdata, skb);
+ ret = 0;
+ goto out_unlock;
+ }
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN |
+ IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
+ if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ IEEE80211_SKB_CB(skb)->hw_queue =
+ local->hw.offchannel_tx_hw_queue;
+
+ /* This will handle all kinds of coalescing and immediate TX */
+ ret = ieee80211_start_roc_work(local, sdata, params->chan,
+ params->wait, cookie, skb,
+ IEEE80211_ROC_TYPE_MGMT_TX);
+ if (ret)
+ kfree_skb(skb);
+ out_unlock:
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+static int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ u64 cookie)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ return ieee80211_cancel_roc(local, cookie, true);
+}
+
+static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ u16 frame_type, bool reg)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ switch (frame_type) {
+ case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ:
+ if (reg)
+ local->probe_req_reg++;
+ else
+ local->probe_req_reg--;
+
+ if (!local->open_count)
+ break;
+
+ ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+ break;
+ default:
+ break;
+ }
+}
+
+static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (local->started)
+ return -EOPNOTSUPP;
+
+ return drv_set_antenna(local, tx_ant, rx_ant);
+}
+
+static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ return drv_get_antenna(local, tx_ant, rx_ant);
+}
+
+static int ieee80211_set_rekey_data(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_gtk_rekey_data *data)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ if (!local->ops->set_rekey_data)
+ return -EOPNOTSUPP;
+
+ drv_set_rekey_data(local, sdata, data);
+
+ return 0;
+}
+
+static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u64 *cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_qos_hdr *nullfunc;
+ struct sk_buff *skb;
+ int size = sizeof(*nullfunc);
+ __le16 fc;
+ bool qos;
+ struct ieee80211_tx_info *info;
+ struct sta_info *sta;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ enum ieee80211_band band;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ band = chanctx_conf->def.chan->band;
+ sta = sta_info_get_bss(sdata, peer);
+ if (sta) {
+ qos = sta->sta.wme;
+ } else {
+ rcu_read_unlock();
+ return -ENOLINK;
+ }
+
+ if (qos) {
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
+ IEEE80211_STYPE_QOS_NULLFUNC |
+ IEEE80211_FCTL_FROMDS);
+ } else {
+ size -= 2;
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
+ IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_FROMDS);
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + size);
+ if (!skb) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ skb->dev = dev;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (void *) skb_put(skb, size);
+ nullfunc->frame_control = fc;
+ nullfunc->duration_id = 0;
+ memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN);
+ memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN);
+ nullfunc->seq_ctrl = 0;
+
+ info = IEEE80211_SKB_CB(skb);
+
+ info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_NL80211_FRAME_TX;
+ info->band = band;
+
+ skb_set_queue_mapping(skb, IEEE80211_AC_VO);
+ skb->priority = 7;
+ if (qos)
+ nullfunc->qos_ctrl = cpu_to_le16(7);
+
+ local_bh_disable();
+ ieee80211_xmit(sdata, sta, skb);
+ local_bh_enable();
+ rcu_read_unlock();
+
+ *cookie = (unsigned long) skb;
+ return 0;
+}
+
+static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ int ret = -ENODATA;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (chanctx_conf) {
+ *chandef = sdata->vif.bss_conf.chandef;
+ ret = 0;
+ } else if (local->open_count > 0 &&
+ local->open_count == local->monitors &&
+ sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+ if (local->use_chanctx)
+ *chandef = local->monitor_chandef;
+ else
+ *chandef = local->_oper_chandef;
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+#ifdef CONFIG_PM
+static void ieee80211_set_wakeup(struct wiphy *wiphy, bool enabled)
+{
+ drv_set_wakeup(wiphy_priv(wiphy), enabled);
+}
+#endif
+
+static int ieee80211_set_qos_map(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_qos_map *qos_map)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct mac80211_qos_map *new_qos_map, *old_qos_map;
+
+ if (qos_map) {
+ new_qos_map = kzalloc(sizeof(*new_qos_map), GFP_KERNEL);
+ if (!new_qos_map)
+ return -ENOMEM;
+ memcpy(&new_qos_map->qos_map, qos_map, sizeof(*qos_map));
+ } else {
+ /* A NULL qos_map was passed to disable QoS mapping */
+ new_qos_map = NULL;
+ }
+
+ old_qos_map = sdata_dereference(sdata->qos_map, sdata);
+ rcu_assign_pointer(sdata->qos_map, new_qos_map);
+ if (old_qos_map)
+ kfree_rcu(old_qos_map, rcu_head);
+
+ return 0;
+}
+
+static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int ret;
+ u32 changed = 0;
+
+ ret = ieee80211_vif_change_bandwidth(sdata, chandef, &changed);
+ if (ret == 0)
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ return ret;
+}
+
+static int ieee80211_add_tx_ts(struct wiphy *wiphy, struct net_device *dev,
+ u8 tsid, const u8 *peer, u8 up,
+ u16 admitted_time)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ int ac = ieee802_1d_to_ac[up];
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ if (!(sdata->wmm_acm & BIT(up)))
+ return -EINVAL;
+
+ if (ifmgd->tx_tspec[ac].admitted_time)
+ return -EBUSY;
+
+ if (admitted_time) {
+ ifmgd->tx_tspec[ac].admitted_time = 32 * admitted_time;
+ ifmgd->tx_tspec[ac].tsid = tsid;
+ ifmgd->tx_tspec[ac].up = up;
+ }
+
+ return 0;
+}
+
+static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev,
+ u8 tsid, const u8 *peer)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ int ac;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac];
+
+ /* skip unused entries */
+ if (!tx_tspec->admitted_time)
+ continue;
+
+ if (tx_tspec->tsid != tsid)
+ continue;
+
+ /* due to this new packets will be reassigned to non-ACM ACs */
+ tx_tspec->up = -1;
+
+ /* Make sure that all packets have been sent to avoid to
+ * restore the QoS params on packets that are still on the
+ * queues.
+ */
+ synchronize_net();
+ ieee80211_flush_queues(local, sdata, false);
+
+ /* restore the normal QoS parameters
+ * (unconditionally to avoid races)
+ */
+ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE;
+ tx_tspec->downgraded = false;
+ ieee80211_sta_handle_tspec_ac_params(sdata);
+
+ /* finally clear all the data */
+ memset(tx_tspec, 0, sizeof(*tx_tspec));
+
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+const struct cfg80211_ops mac80211_config_ops = {
+ .add_virtual_intf = ieee80211_add_iface,
+ .del_virtual_intf = ieee80211_del_iface,
+ .change_virtual_intf = ieee80211_change_iface,
+ .start_p2p_device = ieee80211_start_p2p_device,
+ .stop_p2p_device = ieee80211_stop_p2p_device,
+ .add_key = ieee80211_add_key,
+ .del_key = ieee80211_del_key,
+ .get_key = ieee80211_get_key,
+ .set_default_key = ieee80211_config_default_key,
+ .set_default_mgmt_key = ieee80211_config_default_mgmt_key,
+ .start_ap = ieee80211_start_ap,
+ .change_beacon = ieee80211_change_beacon,
+ .stop_ap = ieee80211_stop_ap,
+ .add_station = ieee80211_add_station,
+ .del_station = ieee80211_del_station,
+ .change_station = ieee80211_change_station,
+ .get_station = ieee80211_get_station,
+ .dump_station = ieee80211_dump_station,
+ .dump_survey = ieee80211_dump_survey,
+#ifdef CONFIG_MAC80211_MESH
+ .add_mpath = ieee80211_add_mpath,
+ .del_mpath = ieee80211_del_mpath,
+ .change_mpath = ieee80211_change_mpath,
+ .get_mpath = ieee80211_get_mpath,
+ .dump_mpath = ieee80211_dump_mpath,
+ .get_mpp = ieee80211_get_mpp,
+ .dump_mpp = ieee80211_dump_mpp,
+ .update_mesh_config = ieee80211_update_mesh_config,
+ .get_mesh_config = ieee80211_get_mesh_config,
+ .join_mesh = ieee80211_join_mesh,
+ .leave_mesh = ieee80211_leave_mesh,
+#endif
+ .join_ocb = ieee80211_join_ocb,
+ .leave_ocb = ieee80211_leave_ocb,
+ .change_bss = ieee80211_change_bss,
+ .set_txq_params = ieee80211_set_txq_params,
+ .set_monitor_channel = ieee80211_set_monitor_channel,
+ .suspend = ieee80211_suspend,
+ .resume = ieee80211_resume,
+ .scan = ieee80211_scan,
+ .sched_scan_start = ieee80211_sched_scan_start,
+ .sched_scan_stop = ieee80211_sched_scan_stop,
+ .auth = ieee80211_auth,
+ .assoc = ieee80211_assoc,
+ .deauth = ieee80211_deauth,
+ .disassoc = ieee80211_disassoc,
+ .join_ibss = ieee80211_join_ibss,
+ .leave_ibss = ieee80211_leave_ibss,
+ .set_mcast_rate = ieee80211_set_mcast_rate,
+ .set_wiphy_params = ieee80211_set_wiphy_params,
+ .set_tx_power = ieee80211_set_tx_power,
+ .get_tx_power = ieee80211_get_tx_power,
+ .set_wds_peer = ieee80211_set_wds_peer,
+ .rfkill_poll = ieee80211_rfkill_poll,
+ CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd)
+ CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump)
+ .set_power_mgmt = ieee80211_set_power_mgmt,
+ .set_bitrate_mask = ieee80211_set_bitrate_mask,
+ .remain_on_channel = ieee80211_remain_on_channel,
+ .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel,
+ .mgmt_tx = ieee80211_mgmt_tx,
+ .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait,
+ .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
+ .mgmt_frame_register = ieee80211_mgmt_frame_register,
+ .set_antenna = ieee80211_set_antenna,
+ .get_antenna = ieee80211_get_antenna,
+ .set_rekey_data = ieee80211_set_rekey_data,
+ .tdls_oper = ieee80211_tdls_oper,
+ .tdls_mgmt = ieee80211_tdls_mgmt,
+ .tdls_channel_switch = ieee80211_tdls_channel_switch,
+ .tdls_cancel_channel_switch = ieee80211_tdls_cancel_channel_switch,
+ .probe_client = ieee80211_probe_client,
+ .set_noack_map = ieee80211_set_noack_map,
+#ifdef CONFIG_PM
+ .set_wakeup = ieee80211_set_wakeup,
+#endif
+ .get_channel = ieee80211_cfg_get_channel,
+ .start_radar_detection = ieee80211_start_radar_detection,
+ .channel_switch = ieee80211_channel_switch,
+ .set_qos_map = ieee80211_set_qos_map,
+ .set_ap_chanwidth = ieee80211_set_ap_chanwidth,
+ .add_tx_ts = ieee80211_add_tx_ts,
+ .del_tx_ts = ieee80211_del_tx_ts,
+};
diff --git a/net/mac80211/cfg.h b/net/mac80211/cfg.h
new file mode 100644
index 000000000..2d51f62dc
--- /dev/null
+++ b/net/mac80211/cfg.h
@@ -0,0 +1,9 @@
+/*
+ * mac80211 configuration hooks for cfg80211
+ */
+#ifndef __CFG_H
+#define __CFG_H
+
+extern const struct cfg80211_ops mac80211_config_ops;
+
+#endif /* __CFG_H */
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
new file mode 100644
index 000000000..5bcd4e558
--- /dev/null
+++ b/net/mac80211/chan.c
@@ -0,0 +1,1760 @@
+/*
+ * mac80211 - channel management
+ */
+
+#include <linux/nl80211.h>
+#include <linux/export.h>
+#include <linux/rtnetlink.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+
+static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int num = 0;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list)
+ num++;
+
+ return num;
+}
+
+static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int num = 0;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list)
+ num++;
+
+ return num;
+}
+
+int ieee80211_chanctx_refcount(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ return ieee80211_chanctx_num_assigned(local, ctx) +
+ ieee80211_chanctx_num_reserved(local, ctx);
+}
+
+static int ieee80211_num_chanctx(struct ieee80211_local *local)
+{
+ struct ieee80211_chanctx *ctx;
+ int num = 0;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(ctx, &local->chanctx_list, list)
+ num++;
+
+ return num;
+}
+
+static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local)
+{
+ lockdep_assert_held(&local->chanctx_mtx);
+ return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local);
+}
+
+static struct ieee80211_chanctx *
+ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local __maybe_unused = sdata->local;
+ struct ieee80211_chanctx_conf *conf;
+
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (!conf)
+ return NULL;
+
+ return container_of(conf, struct ieee80211_chanctx, conf);
+}
+
+static const struct cfg80211_chan_def *
+ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct cfg80211_chan_def *compat)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(sdata, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ if (!compat)
+ compat = &sdata->reserved_chandef;
+
+ compat = cfg80211_chandef_compatible(&sdata->reserved_chandef,
+ compat);
+ if (!compat)
+ break;
+ }
+
+ return compat;
+}
+
+static const struct cfg80211_chan_def *
+ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct cfg80211_chan_def *compat)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(sdata, &ctx->assigned_vifs,
+ assigned_chanctx_list) {
+ if (sdata->reserved_chanctx != NULL)
+ continue;
+
+ if (!compat)
+ compat = &sdata->vif.bss_conf.chandef;
+
+ compat = cfg80211_chandef_compatible(
+ &sdata->vif.bss_conf.chandef, compat);
+ if (!compat)
+ break;
+ }
+
+ return compat;
+}
+
+static const struct cfg80211_chan_def *
+ieee80211_chanctx_combined_chandef(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct cfg80211_chan_def *compat)
+{
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat);
+ if (!compat)
+ return NULL;
+
+ compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat);
+ if (!compat)
+ return NULL;
+
+ return compat;
+}
+
+static bool
+ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct cfg80211_chan_def *def)
+{
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (ieee80211_chanctx_combined_chandef(local, ctx, def))
+ return true;
+
+ if (!list_empty(&ctx->reserved_vifs) &&
+ ieee80211_chanctx_reserved_chandef(local, ctx, def))
+ return true;
+
+ return false;
+}
+
+static struct ieee80211_chanctx *
+ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode)
+{
+ struct ieee80211_chanctx *ctx;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (mode == IEEE80211_CHANCTX_EXCLUSIVE)
+ return NULL;
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ continue;
+
+ if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
+ continue;
+
+ if (!ieee80211_chanctx_can_reserve_chandef(local, ctx,
+ chandef))
+ continue;
+
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
+{
+ switch (sta->bandwidth) {
+ case IEEE80211_STA_RX_BW_20:
+ if (sta->ht_cap.ht_supported)
+ return NL80211_CHAN_WIDTH_20;
+ else
+ return NL80211_CHAN_WIDTH_20_NOHT;
+ case IEEE80211_STA_RX_BW_40:
+ return NL80211_CHAN_WIDTH_40;
+ case IEEE80211_STA_RX_BW_80:
+ return NL80211_CHAN_WIDTH_80;
+ case IEEE80211_STA_RX_BW_160:
+ /*
+ * This applied for both 160 and 80+80. since we use
+ * the returned value to consider degradation of
+ * ctx->conf.min_def, we have to make sure to take
+ * the bigger one (NL80211_CHAN_WIDTH_160).
+ * Otherwise we might try degrading even when not
+ * needed, as the max required sta_bw returned (80+80)
+ * might be smaller than the configured bw (160).
+ */
+ return NL80211_CHAN_WIDTH_160;
+ default:
+ WARN_ON(1);
+ return NL80211_CHAN_WIDTH_20;
+ }
+}
+
+static enum nl80211_chan_width
+ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
+{
+ enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ if (sdata != sta->sdata &&
+ !(sta->sdata->bss && sta->sdata->bss == sdata->bss))
+ continue;
+
+ if (!sta->uploaded)
+ continue;
+
+ max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
+ }
+ rcu_read_unlock();
+
+ return max_bw;
+}
+
+static enum nl80211_chan_width
+ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
+ struct ieee80211_chanctx_conf *conf)
+{
+ struct ieee80211_sub_if_data *sdata;
+ enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ struct ieee80211_vif *vif = &sdata->vif;
+ enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT;
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+ continue;
+
+ switch (vif->type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ width = ieee80211_get_max_required_bw(sdata);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ continue;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ width = vif->bss_conf.chandef.width;
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ WARN_ON_ONCE(1);
+ }
+ max_bw = max(max_bw, width);
+ }
+
+ /* use the configured bandwidth in case of monitor interface */
+ sdata = rcu_dereference(local->monitor_sdata);
+ if (sdata && rcu_access_pointer(sdata->vif.chanctx_conf) == conf)
+ max_bw = max(max_bw, conf->def.width);
+
+ rcu_read_unlock();
+
+ return max_bw;
+}
+
+/*
+ * recalc the min required chan width of the channel context, which is
+ * the max of min required widths of all the interfaces bound to this
+ * channel context.
+ */
+void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ enum nl80211_chan_width max_bw;
+ struct cfg80211_chan_def min_def;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ /* don't optimize 5MHz, 10MHz, and radar_enabled confs */
+ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_10 ||
+ ctx->conf.radar_enabled) {
+ ctx->conf.min_def = ctx->conf.def;
+ return;
+ }
+
+ max_bw = ieee80211_get_chanctx_max_required_bw(local, &ctx->conf);
+
+ /* downgrade chandef up to max_bw */
+ min_def = ctx->conf.def;
+ while (min_def.width > max_bw)
+ ieee80211_chandef_downgrade(&min_def);
+
+ if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def))
+ return;
+
+ ctx->conf.min_def = min_def;
+ if (!ctx->driver_present)
+ return;
+
+ drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH);
+}
+
+static void ieee80211_change_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct cfg80211_chan_def *chandef)
+{
+ if (cfg80211_chandef_identical(&ctx->conf.def, chandef))
+ return;
+
+ WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef));
+
+ ctx->conf.def = *chandef;
+ drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH);
+ ieee80211_recalc_chanctx_min_def(local, ctx);
+
+ if (!local->use_chanctx) {
+ local->_oper_chandef = *chandef;
+ ieee80211_hw_config(local, 0);
+ }
+}
+
+static struct ieee80211_chanctx *
+ieee80211_find_chanctx(struct ieee80211_local *local,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode)
+{
+ struct ieee80211_chanctx *ctx;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (mode == IEEE80211_CHANCTX_EXCLUSIVE)
+ return NULL;
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ const struct cfg80211_chan_def *compat;
+
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE)
+ continue;
+
+ if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
+ continue;
+
+ compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef);
+ if (!compat)
+ continue;
+
+ compat = ieee80211_chanctx_reserved_chandef(local, ctx,
+ compat);
+ if (!compat)
+ continue;
+
+ ieee80211_change_chanctx(local, ctx, compat);
+
+ return ctx;
+ }
+
+ return NULL;
+}
+
+bool ieee80211_is_radar_required(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ lockdep_assert_held(&local->mtx);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->radar_required) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+static bool
+ieee80211_chanctx_radar_required(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct ieee80211_chanctx_conf *conf = &ctx->conf;
+ struct ieee80211_sub_if_data *sdata;
+ bool required = false;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_held(&local->mtx);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+ continue;
+ if (!sdata->radar_required)
+ continue;
+
+ required = true;
+ break;
+ }
+ rcu_read_unlock();
+
+ return required;
+}
+
+static struct ieee80211_chanctx *
+ieee80211_alloc_chanctx(struct ieee80211_local *local,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode)
+{
+ struct ieee80211_chanctx *ctx;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ ctx = kzalloc(sizeof(*ctx) + local->hw.chanctx_data_size, GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ INIT_LIST_HEAD(&ctx->assigned_vifs);
+ INIT_LIST_HEAD(&ctx->reserved_vifs);
+ ctx->conf.def = *chandef;
+ ctx->conf.rx_chains_static = 1;
+ ctx->conf.rx_chains_dynamic = 1;
+ ctx->mode = mode;
+ ctx->conf.radar_enabled = false;
+ ieee80211_recalc_chanctx_min_def(local, ctx);
+
+ return ctx;
+}
+
+static int ieee80211_add_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ u32 changed;
+ int err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (!local->use_chanctx)
+ local->hw.conf.radar_enabled = ctx->conf.radar_enabled;
+
+ /* turn idle off *before* setting channel -- some drivers need that */
+ changed = ieee80211_idle_off(local);
+ if (changed)
+ ieee80211_hw_config(local, changed);
+
+ if (!local->use_chanctx) {
+ local->_oper_chandef = ctx->conf.def;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+ } else {
+ err = drv_add_chanctx(local, ctx);
+ if (err) {
+ ieee80211_recalc_idle(local);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static struct ieee80211_chanctx *
+ieee80211_new_chanctx(struct ieee80211_local *local,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode)
+{
+ struct ieee80211_chanctx *ctx;
+ int err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ ctx = ieee80211_alloc_chanctx(local, chandef, mode);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ err = ieee80211_add_chanctx(local, ctx);
+ if (err) {
+ kfree(ctx);
+ return ERR_PTR(err);
+ }
+
+ list_add_rcu(&ctx->list, &local->chanctx_list);
+ return ctx;
+}
+
+static void ieee80211_del_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (!local->use_chanctx) {
+ struct cfg80211_chan_def *chandef = &local->_oper_chandef;
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ chandef->center_freq1 = chandef->chan->center_freq;
+ chandef->center_freq2 = 0;
+
+ /* NOTE: Disabling radar is only valid here for
+ * single channel context. To be sure, check it ...
+ */
+ WARN_ON(local->hw.conf.radar_enabled &&
+ !list_empty(&local->chanctx_list));
+
+ local->hw.conf.radar_enabled = false;
+
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+ } else {
+ drv_remove_chanctx(local, ctx);
+ }
+
+ ieee80211_recalc_idle(local);
+}
+
+static void ieee80211_free_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0);
+
+ list_del_rcu(&ctx->list);
+ ieee80211_del_chanctx(local, ctx);
+ kfree_rcu(ctx, rcu_head);
+}
+
+static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct ieee80211_chanctx_conf *conf = &ctx->conf;
+ struct ieee80211_sub_if_data *sdata;
+ const struct cfg80211_chan_def *compat = NULL;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+ continue;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
+
+ if (!compat)
+ compat = &sdata->vif.bss_conf.chandef;
+
+ compat = cfg80211_chandef_compatible(
+ &sdata->vif.bss_conf.chandef, compat);
+ if (WARN_ON_ONCE(!compat))
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!compat)
+ return;
+
+ ieee80211_change_chanctx(local, ctx, compat);
+}
+
+static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *chanctx)
+{
+ bool radar_enabled;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+ /* for ieee80211_is_radar_required */
+ lockdep_assert_held(&local->mtx);
+
+ radar_enabled = ieee80211_chanctx_radar_required(local, chanctx);
+
+ if (radar_enabled == chanctx->conf.radar_enabled)
+ return;
+
+ chanctx->conf.radar_enabled = radar_enabled;
+
+ if (!local->use_chanctx) {
+ local->hw.conf.radar_enabled = chanctx->conf.radar_enabled;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+ }
+
+ drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR);
+}
+
+static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *new_ctx)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *curr_ctx = NULL;
+ int ret = 0;
+
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+
+ if (conf) {
+ curr_ctx = container_of(conf, struct ieee80211_chanctx, conf);
+
+ drv_unassign_vif_chanctx(local, sdata, curr_ctx);
+ conf = NULL;
+ list_del(&sdata->assigned_chanctx_list);
+ }
+
+ if (new_ctx) {
+ ret = drv_assign_vif_chanctx(local, sdata, new_ctx);
+ if (ret)
+ goto out;
+
+ conf = &new_ctx->conf;
+ list_add(&sdata->assigned_chanctx_list,
+ &new_ctx->assigned_vifs);
+ }
+
+out:
+ rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
+
+ sdata->vif.bss_conf.idle = !conf;
+
+ if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) {
+ ieee80211_recalc_chanctx_chantype(local, curr_ctx);
+ ieee80211_recalc_smps_chanctx(local, curr_ctx);
+ ieee80211_recalc_radar_chanctx(local, curr_ctx);
+ ieee80211_recalc_chanctx_min_def(local, curr_ctx);
+ }
+
+ if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) {
+ ieee80211_recalc_txpower(sdata, false);
+ ieee80211_recalc_chanctx_min_def(local, new_ctx);
+ }
+
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_IDLE);
+
+ return ret;
+}
+
+void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *chanctx)
+{
+ struct ieee80211_sub_if_data *sdata;
+ u8 rx_chains_static, rx_chains_dynamic;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ rx_chains_static = 1;
+ rx_chains_dynamic = 1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ u8 needed_static, needed_dynamic;
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (rcu_access_pointer(sdata->vif.chanctx_conf) !=
+ &chanctx->conf)
+ continue;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ continue;
+ case NL80211_IFTYPE_STATION:
+ if (!sdata->u.mgd.associated)
+ continue;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ continue;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ switch (sdata->smps_mode) {
+ default:
+ WARN_ONCE(1, "Invalid SMPS mode %d\n",
+ sdata->smps_mode);
+ /* fall through */
+ case IEEE80211_SMPS_OFF:
+ needed_static = sdata->needed_rx_chains;
+ needed_dynamic = sdata->needed_rx_chains;
+ break;
+ case IEEE80211_SMPS_DYNAMIC:
+ needed_static = 1;
+ needed_dynamic = sdata->needed_rx_chains;
+ break;
+ case IEEE80211_SMPS_STATIC:
+ needed_static = 1;
+ needed_dynamic = 1;
+ break;
+ }
+
+ rx_chains_static = max(rx_chains_static, needed_static);
+ rx_chains_dynamic = max(rx_chains_dynamic, needed_dynamic);
+ }
+
+ /* Disable SMPS for the monitor interface */
+ sdata = rcu_dereference(local->monitor_sdata);
+ if (sdata &&
+ rcu_access_pointer(sdata->vif.chanctx_conf) == &chanctx->conf)
+ rx_chains_dynamic = rx_chains_static = local->rx_chains;
+
+ rcu_read_unlock();
+
+ if (!local->use_chanctx) {
+ if (rx_chains_static > 1)
+ local->smps_mode = IEEE80211_SMPS_OFF;
+ else if (rx_chains_dynamic > 1)
+ local->smps_mode = IEEE80211_SMPS_DYNAMIC;
+ else
+ local->smps_mode = IEEE80211_SMPS_STATIC;
+ ieee80211_hw_config(local, 0);
+ }
+
+ if (rx_chains_static == chanctx->conf.rx_chains_static &&
+ rx_chains_dynamic == chanctx->conf.rx_chains_dynamic)
+ return;
+
+ chanctx->conf.rx_chains_static = rx_chains_static;
+ chanctx->conf.rx_chains_dynamic = rx_chains_dynamic;
+ drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RX_CHAINS);
+}
+
+static void
+__ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
+ bool clear)
+{
+ struct ieee80211_local *local __maybe_unused = sdata->local;
+ struct ieee80211_sub_if_data *vlan;
+ struct ieee80211_chanctx_conf *conf;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP))
+ return;
+
+ lockdep_assert_held(&local->mtx);
+
+ /* Check that conf exists, even when clearing this function
+ * must be called with the AP's channel context still there
+ * as it would otherwise cause VLANs to have an invalid
+ * channel context pointer for a while, possibly pointing
+ * to a channel context that has already been freed.
+ */
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ WARN_ON(!conf);
+
+ if (clear)
+ conf = NULL;
+
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ rcu_assign_pointer(vlan->vif.chanctx_conf, conf);
+}
+
+void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
+ bool clear)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ mutex_lock(&local->chanctx_mtx);
+
+ __ieee80211_vif_copy_chanctx_to_vlans(sdata, clear);
+
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_chanctx *ctx = sdata->reserved_chanctx;
+
+ lockdep_assert_held(&sdata->local->chanctx_mtx);
+
+ if (WARN_ON(!ctx))
+ return -EINVAL;
+
+ list_del(&sdata->reserved_chanctx_list);
+ sdata->reserved_chanctx = NULL;
+
+ if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) {
+ if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) {
+ if (WARN_ON(!ctx->replace_ctx))
+ return -EINVAL;
+
+ WARN_ON(ctx->replace_ctx->replace_state !=
+ IEEE80211_CHANCTX_WILL_BE_REPLACED);
+ WARN_ON(ctx->replace_ctx->replace_ctx != ctx);
+
+ ctx->replace_ctx->replace_ctx = NULL;
+ ctx->replace_ctx->replace_state =
+ IEEE80211_CHANCTX_REPLACE_NONE;
+
+ list_del_rcu(&ctx->list);
+ kfree_rcu(ctx, rcu_head);
+ } else {
+ ieee80211_free_chanctx(sdata->local, ctx);
+ }
+ }
+
+ return 0;
+}
+
+int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode,
+ bool radar_required)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ curr_ctx = ieee80211_vif_get_chanctx(sdata);
+ if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx)
+ return -ENOTSUPP;
+
+ new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode);
+ if (!new_ctx) {
+ if (ieee80211_can_create_new_chanctx(local)) {
+ new_ctx = ieee80211_new_chanctx(local, chandef, mode);
+ if (IS_ERR(new_ctx))
+ return PTR_ERR(new_ctx);
+ } else {
+ if (!curr_ctx ||
+ (curr_ctx->replace_state ==
+ IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
+ !list_empty(&curr_ctx->reserved_vifs)) {
+ /*
+ * Another vif already requested this context
+ * for a reservation. Find another one hoping
+ * all vifs assigned to it will also switch
+ * soon enough.
+ *
+ * TODO: This needs a little more work as some
+ * cases (more than 2 chanctx capable devices)
+ * may fail which could otherwise succeed
+ * provided some channel context juggling was
+ * performed.
+ *
+ * Consider ctx1..3, vif1..6, each ctx has 2
+ * vifs. vif1 and vif2 from ctx1 request new
+ * different chandefs starting 2 in-place
+ * reserations with ctx4 and ctx5 replacing
+ * ctx1 and ctx2 respectively. Next vif5 and
+ * vif6 from ctx3 reserve ctx4. If vif3 and
+ * vif4 remain on ctx2 as they are then this
+ * fails unless `replace_ctx` from ctx5 is
+ * replaced with ctx3.
+ */
+ list_for_each_entry(ctx, &local->chanctx_list,
+ list) {
+ if (ctx->replace_state !=
+ IEEE80211_CHANCTX_REPLACE_NONE)
+ continue;
+
+ if (!list_empty(&ctx->reserved_vifs))
+ continue;
+
+ curr_ctx = ctx;
+ break;
+ }
+ }
+
+ /*
+ * If that's true then all available contexts already
+ * have reservations and cannot be used.
+ */
+ if (!curr_ctx ||
+ (curr_ctx->replace_state ==
+ IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
+ !list_empty(&curr_ctx->reserved_vifs))
+ return -EBUSY;
+
+ new_ctx = ieee80211_alloc_chanctx(local, chandef, mode);
+ if (!new_ctx)
+ return -ENOMEM;
+
+ new_ctx->replace_ctx = curr_ctx;
+ new_ctx->replace_state =
+ IEEE80211_CHANCTX_REPLACES_OTHER;
+
+ curr_ctx->replace_ctx = new_ctx;
+ curr_ctx->replace_state =
+ IEEE80211_CHANCTX_WILL_BE_REPLACED;
+
+ list_add_rcu(&new_ctx->list, &local->chanctx_list);
+ }
+ }
+
+ list_add(&sdata->reserved_chanctx_list, &new_ctx->reserved_vifs);
+ sdata->reserved_chanctx = new_ctx;
+ sdata->reserved_chandef = *chandef;
+ sdata->reserved_radar_required = radar_required;
+ sdata->reserved_ready = false;
+
+ return 0;
+}
+
+static void
+ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
+{
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->csa_finalize_work);
+ break;
+ case NL80211_IFTYPE_STATION:
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.chswitch_work);
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NUM_NL80211_IFTYPES:
+ WARN_ON(1);
+ break;
+ }
+}
+
+static void
+ieee80211_vif_update_chandef(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_sub_if_data *vlan;
+
+ sdata->vif.bss_conf.chandef = *chandef;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return;
+
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ vlan->vif.bss_conf.chandef = *chandef;
+}
+
+static int
+ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
+ struct ieee80211_chanctx *old_ctx, *new_ctx;
+ const struct cfg80211_chan_def *chandef;
+ u32 changed = 0;
+ int err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ new_ctx = sdata->reserved_chanctx;
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+
+ if (WARN_ON(!sdata->reserved_ready))
+ return -EBUSY;
+
+ if (WARN_ON(!new_ctx))
+ return -EINVAL;
+
+ if (WARN_ON(!old_ctx))
+ return -EINVAL;
+
+ if (WARN_ON(new_ctx->replace_state ==
+ IEEE80211_CHANCTX_REPLACES_OTHER))
+ return -EINVAL;
+
+ chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+ &sdata->reserved_chandef);
+ if (WARN_ON(!chandef))
+ return -EINVAL;
+
+ vif_chsw[0].vif = &sdata->vif;
+ vif_chsw[0].old_ctx = &old_ctx->conf;
+ vif_chsw[0].new_ctx = &new_ctx->conf;
+
+ list_del(&sdata->reserved_chanctx_list);
+ sdata->reserved_chanctx = NULL;
+
+ err = drv_switch_vif_chanctx(local, vif_chsw, 1,
+ CHANCTX_SWMODE_REASSIGN_VIF);
+ if (err) {
+ if (ieee80211_chanctx_refcount(local, new_ctx) == 0)
+ ieee80211_free_chanctx(local, new_ctx);
+
+ goto out;
+ }
+
+ list_move(&sdata->assigned_chanctx_list, &new_ctx->assigned_vifs);
+ rcu_assign_pointer(sdata->vif.chanctx_conf, &new_ctx->conf);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ __ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+
+ if (ieee80211_chanctx_refcount(local, old_ctx) == 0)
+ ieee80211_free_chanctx(local, old_ctx);
+
+ if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width)
+ changed = BSS_CHANGED_BANDWIDTH;
+
+ ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef);
+
+ ieee80211_recalc_smps_chanctx(local, new_ctx);
+ ieee80211_recalc_radar_chanctx(local, new_ctx);
+ ieee80211_recalc_chanctx_min_def(local, new_ctx);
+
+ if (changed)
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+out:
+ ieee80211_vif_chanctx_reservation_complete(sdata);
+ return err;
+}
+
+static int
+ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx *old_ctx, *new_ctx;
+ const struct cfg80211_chan_def *chandef;
+ int err;
+
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+ new_ctx = sdata->reserved_chanctx;
+
+ if (WARN_ON(!sdata->reserved_ready))
+ return -EINVAL;
+
+ if (WARN_ON(old_ctx))
+ return -EINVAL;
+
+ if (WARN_ON(!new_ctx))
+ return -EINVAL;
+
+ if (WARN_ON(new_ctx->replace_state ==
+ IEEE80211_CHANCTX_REPLACES_OTHER))
+ return -EINVAL;
+
+ chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+ &sdata->reserved_chandef);
+ if (WARN_ON(!chandef))
+ return -EINVAL;
+
+ list_del(&sdata->reserved_chanctx_list);
+ sdata->reserved_chanctx = NULL;
+
+ err = ieee80211_assign_vif_chanctx(sdata, new_ctx);
+ if (err) {
+ if (ieee80211_chanctx_refcount(local, new_ctx) == 0)
+ ieee80211_free_chanctx(local, new_ctx);
+
+ goto out;
+ }
+
+out:
+ ieee80211_vif_chanctx_reservation_complete(sdata);
+ return err;
+}
+
+static bool
+ieee80211_vif_has_in_place_reservation(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_chanctx *old_ctx, *new_ctx;
+
+ lockdep_assert_held(&sdata->local->chanctx_mtx);
+
+ new_ctx = sdata->reserved_chanctx;
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+
+ if (!old_ctx)
+ return false;
+
+ if (WARN_ON(!new_ctx))
+ return false;
+
+ if (old_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ return false;
+
+ if (new_ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ return false;
+
+ return true;
+}
+
+static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local,
+ struct ieee80211_chanctx *new_ctx)
+{
+ const struct cfg80211_chan_def *chandef;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL);
+ if (WARN_ON(!chandef))
+ return -EINVAL;
+
+ local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled;
+ local->_oper_chandef = *chandef;
+ ieee80211_hw_config(local, 0);
+
+ return 0;
+}
+
+static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
+ int n_vifs)
+{
+ struct ieee80211_vif_chanctx_switch *vif_chsw;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_chanctx *ctx, *old_ctx;
+ int i, err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ vif_chsw = kzalloc(sizeof(vif_chsw[0]) * n_vifs, GFP_KERNEL);
+ if (!vif_chsw)
+ return -ENOMEM;
+
+ i = 0;
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (WARN_ON(!ctx->replace_ctx)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ list_for_each_entry(sdata, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ if (!ieee80211_vif_has_in_place_reservation(
+ sdata))
+ continue;
+
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+ vif_chsw[i].vif = &sdata->vif;
+ vif_chsw[i].old_ctx = &old_ctx->conf;
+ vif_chsw[i].new_ctx = &ctx->conf;
+
+ i++;
+ }
+ }
+
+ err = drv_switch_vif_chanctx(local, vif_chsw, n_vifs,
+ CHANCTX_SWMODE_SWAP_CONTEXTS);
+
+out:
+ kfree(vif_chsw);
+ return err;
+}
+
+static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local)
+{
+ struct ieee80211_chanctx *ctx;
+ int err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+ continue;
+
+ ieee80211_del_chanctx(local, ctx->replace_ctx);
+ err = ieee80211_add_chanctx(local, ctx);
+ if (err)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ WARN_ON(ieee80211_add_chanctx(local, ctx));
+ list_for_each_entry_continue_reverse(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+ continue;
+
+ ieee80211_del_chanctx(local, ctx);
+ WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx));
+ }
+
+ return err;
+}
+
+static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata, *sdata_tmp;
+ struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
+ struct ieee80211_chanctx *new_ctx = NULL;
+ int i, err, n_assigned, n_reserved, n_ready;
+ int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ /*
+ * If there are 2 independent pairs of channel contexts performing
+ * cross-switch of their vifs this code will still wait until both are
+ * ready even though it could be possible to switch one before the
+ * other is ready.
+ *
+ * For practical reasons and code simplicity just do a single huge
+ * switch.
+ */
+
+ /*
+ * Verify if the reservation is still feasible.
+ * - if it's not then disconnect
+ * - if it is but not all vifs necessary are ready then defer
+ */
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (WARN_ON(!ctx->replace_ctx)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (!local->use_chanctx)
+ new_ctx = ctx;
+
+ n_ctx++;
+
+ n_assigned = 0;
+ n_reserved = 0;
+ n_ready = 0;
+
+ list_for_each_entry(sdata, &ctx->replace_ctx->assigned_vifs,
+ assigned_chanctx_list) {
+ n_assigned++;
+ if (sdata->reserved_chanctx) {
+ n_reserved++;
+ if (sdata->reserved_ready)
+ n_ready++;
+ }
+ }
+
+ if (n_assigned != n_reserved) {
+ if (n_ready == n_reserved) {
+ wiphy_info(local->hw.wiphy,
+ "channel context reservation cannot be finalized because some interfaces aren't switching\n");
+ err = -EBUSY;
+ goto err;
+ }
+
+ return -EAGAIN;
+ }
+
+ ctx->conf.radar_enabled = false;
+ list_for_each_entry(sdata, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ if (ieee80211_vif_has_in_place_reservation(sdata) &&
+ !sdata->reserved_ready)
+ return -EAGAIN;
+
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+ if (old_ctx) {
+ if (old_ctx->replace_state ==
+ IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ n_vifs_switch++;
+ else
+ n_vifs_assign++;
+ } else {
+ n_vifs_ctxless++;
+ }
+
+ if (sdata->reserved_radar_required)
+ ctx->conf.radar_enabled = true;
+ }
+ }
+
+ if (WARN_ON(n_ctx == 0) ||
+ WARN_ON(n_vifs_switch == 0 &&
+ n_vifs_assign == 0 &&
+ n_vifs_ctxless == 0) ||
+ WARN_ON(n_ctx > 1 && !local->use_chanctx) ||
+ WARN_ON(!new_ctx && !local->use_chanctx)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * All necessary vifs are ready. Perform the switch now depending on
+ * reservations and driver capabilities.
+ */
+
+ if (local->use_chanctx) {
+ if (n_vifs_switch > 0) {
+ err = ieee80211_chsw_switch_vifs(local, n_vifs_switch);
+ if (err)
+ goto err;
+ }
+
+ if (n_vifs_assign > 0 || n_vifs_ctxless > 0) {
+ err = ieee80211_chsw_switch_ctxs(local);
+ if (err)
+ goto err;
+ }
+ } else {
+ err = ieee80211_chsw_switch_hwconf(local, new_ctx);
+ if (err)
+ goto err;
+ }
+
+ /*
+ * Update all structures, values and pointers to point to new channel
+ * context(s).
+ */
+
+ i = 0;
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (WARN_ON(!ctx->replace_ctx)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ list_for_each_entry(sdata, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ u32 changed = 0;
+
+ if (!ieee80211_vif_has_in_place_reservation(sdata))
+ continue;
+
+ rcu_assign_pointer(sdata->vif.chanctx_conf, &ctx->conf);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ __ieee80211_vif_copy_chanctx_to_vlans(sdata,
+ false);
+
+ sdata->radar_required = sdata->reserved_radar_required;
+
+ if (sdata->vif.bss_conf.chandef.width !=
+ sdata->reserved_chandef.width)
+ changed = BSS_CHANGED_BANDWIDTH;
+
+ ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef);
+ if (changed)
+ ieee80211_bss_info_change_notify(sdata,
+ changed);
+
+ ieee80211_recalc_txpower(sdata, false);
+ }
+
+ ieee80211_recalc_chanctx_chantype(local, ctx);
+ ieee80211_recalc_smps_chanctx(local, ctx);
+ ieee80211_recalc_radar_chanctx(local, ctx);
+ ieee80211_recalc_chanctx_min_def(local, ctx);
+
+ list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ if (ieee80211_vif_get_chanctx(sdata) != ctx)
+ continue;
+
+ list_del(&sdata->reserved_chanctx_list);
+ list_move(&sdata->assigned_chanctx_list,
+ &ctx->assigned_vifs);
+ sdata->reserved_chanctx = NULL;
+
+ ieee80211_vif_chanctx_reservation_complete(sdata);
+ }
+
+ /*
+ * This context might have been a dependency for an already
+ * ready re-assign reservation interface that was deferred. Do
+ * not propagate error to the caller though. The in-place
+ * reservation for originally requested interface has already
+ * succeeded at this point.
+ */
+ list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ if (WARN_ON(ieee80211_vif_has_in_place_reservation(
+ sdata)))
+ continue;
+
+ if (WARN_ON(sdata->reserved_chanctx != ctx))
+ continue;
+
+ if (!sdata->reserved_ready)
+ continue;
+
+ if (ieee80211_vif_get_chanctx(sdata))
+ err = ieee80211_vif_use_reserved_reassign(
+ sdata);
+ else
+ err = ieee80211_vif_use_reserved_assign(sdata);
+
+ if (err) {
+ sdata_info(sdata,
+ "failed to finalize (re-)assign reservation (err=%d)\n",
+ err);
+ ieee80211_vif_unreserve_chanctx(sdata);
+ cfg80211_stop_iface(local->hw.wiphy,
+ &sdata->wdev,
+ GFP_KERNEL);
+ }
+ }
+ }
+
+ /*
+ * Finally free old contexts
+ */
+
+ list_for_each_entry_safe(ctx, ctx_tmp, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ continue;
+
+ ctx->replace_ctx->replace_ctx = NULL;
+ ctx->replace_ctx->replace_state =
+ IEEE80211_CHANCTX_REPLACE_NONE;
+
+ list_del_rcu(&ctx->list);
+ kfree_rcu(ctx, rcu_head);
+ }
+
+ return 0;
+
+err:
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
+ reserved_chanctx_list) {
+ ieee80211_vif_unreserve_chanctx(sdata);
+ ieee80211_vif_chanctx_reservation_complete(sdata);
+ }
+ }
+
+ return err;
+}
+
+static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *ctx;
+ bool use_reserved_switch = false;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (!conf)
+ return;
+
+ ctx = container_of(conf, struct ieee80211_chanctx, conf);
+
+ if (sdata->reserved_chanctx) {
+ if (sdata->reserved_chanctx->replace_state ==
+ IEEE80211_CHANCTX_REPLACES_OTHER &&
+ ieee80211_chanctx_num_reserved(local,
+ sdata->reserved_chanctx) > 1)
+ use_reserved_switch = true;
+
+ ieee80211_vif_unreserve_chanctx(sdata);
+ }
+
+ ieee80211_assign_vif_chanctx(sdata, NULL);
+ if (ieee80211_chanctx_refcount(local, ctx) == 0)
+ ieee80211_free_chanctx(local, ctx);
+
+ sdata->radar_required = false;
+
+ /* Unreserving may ready an in-place reservation. */
+ if (use_reserved_switch)
+ ieee80211_vif_use_reserved_switch(local);
+}
+
+int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx *ctx;
+ u8 radar_detect_width = 0;
+ int ret;
+
+ lockdep_assert_held(&local->mtx);
+
+ WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev));
+
+ mutex_lock(&local->chanctx_mtx);
+
+ ret = cfg80211_chandef_dfs_required(local->hw.wiphy,
+ chandef,
+ sdata->wdev.iftype);
+ if (ret < 0)
+ goto out;
+ if (ret > 0)
+ radar_detect_width = BIT(chandef->width);
+
+ sdata->radar_required = ret;
+
+ ret = ieee80211_check_combinations(sdata, chandef, mode,
+ radar_detect_width);
+ if (ret < 0)
+ goto out;
+
+ __ieee80211_vif_release_channel(sdata);
+
+ ctx = ieee80211_find_chanctx(local, chandef, mode);
+ if (!ctx)
+ ctx = ieee80211_new_chanctx(local, chandef, mode);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto out;
+ }
+
+ ieee80211_vif_update_chandef(sdata, chandef);
+
+ ret = ieee80211_assign_vif_chanctx(sdata, ctx);
+ if (ret) {
+ /* if assign fails refcount stays the same */
+ if (ieee80211_chanctx_refcount(local, ctx) == 0)
+ ieee80211_free_chanctx(local, ctx);
+ goto out;
+ }
+
+ ieee80211_recalc_smps_chanctx(local, ctx);
+ ieee80211_recalc_radar_chanctx(local, ctx);
+ out:
+ if (ret)
+ sdata->radar_required = false;
+
+ mutex_unlock(&local->chanctx_mtx);
+ return ret;
+}
+
+int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx *new_ctx;
+ struct ieee80211_chanctx *old_ctx;
+ int err;
+
+ lockdep_assert_held(&local->mtx);
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ new_ctx = sdata->reserved_chanctx;
+ old_ctx = ieee80211_vif_get_chanctx(sdata);
+
+ if (WARN_ON(!new_ctx))
+ return -EINVAL;
+
+ if (WARN_ON(new_ctx->replace_state ==
+ IEEE80211_CHANCTX_WILL_BE_REPLACED))
+ return -EINVAL;
+
+ if (WARN_ON(sdata->reserved_ready))
+ return -EINVAL;
+
+ sdata->reserved_ready = true;
+
+ if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) {
+ if (old_ctx)
+ err = ieee80211_vif_use_reserved_reassign(sdata);
+ else
+ err = ieee80211_vif_use_reserved_assign(sdata);
+
+ if (err)
+ return err;
+ }
+
+ /*
+ * In-place reservation may need to be finalized now either if:
+ * a) sdata is taking part in the swapping itself and is the last one
+ * b) sdata has switched with a re-assign reservation to an existing
+ * context readying in-place switching of old_ctx
+ *
+ * In case of (b) do not propagate the error up because the requested
+ * sdata already switched successfully. Just spill an extra warning.
+ * The ieee80211_vif_use_reserved_switch() already stops all necessary
+ * interfaces upon failure.
+ */
+ if ((old_ctx &&
+ old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
+ new_ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) {
+ err = ieee80211_vif_use_reserved_switch(local);
+ if (err && err != -EAGAIN) {
+ if (new_ctx->replace_state ==
+ IEEE80211_CHANCTX_REPLACES_OTHER)
+ return err;
+
+ wiphy_info(local->hw.wiphy,
+ "depending in-place reservation failed (err=%d)\n",
+ err);
+ }
+ }
+
+ return 0;
+}
+
+int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ u32 *changed)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *ctx;
+ const struct cfg80211_chan_def *compat;
+ int ret;
+
+ if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+ IEEE80211_CHAN_DISABLED))
+ return -EINVAL;
+
+ mutex_lock(&local->chanctx_mtx);
+ if (cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (!conf) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ctx = container_of(conf, struct ieee80211_chanctx, conf);
+
+ compat = cfg80211_chandef_compatible(&conf->def, chandef);
+ if (!compat) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ switch (ctx->replace_state) {
+ case IEEE80211_CHANCTX_REPLACE_NONE:
+ if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ break;
+ case IEEE80211_CHANCTX_WILL_BE_REPLACED:
+ /* TODO: Perhaps the bandwidth change could be treated as a
+ * reservation itself? */
+ ret = -EBUSY;
+ goto out;
+ case IEEE80211_CHANCTX_REPLACES_OTHER:
+ /* channel context that is going to replace another channel
+ * context doesn't really exist and shouldn't be assigned
+ * anywhere yet */
+ WARN_ON(1);
+ break;
+ }
+
+ ieee80211_vif_update_chandef(sdata, chandef);
+
+ ieee80211_recalc_chanctx_chantype(local, ctx);
+
+ *changed |= BSS_CHANGED_BANDWIDTH;
+ ret = 0;
+ out:
+ mutex_unlock(&local->chanctx_mtx);
+ return ret;
+}
+
+void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+{
+ WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev));
+
+ lockdep_assert_held(&sdata->local->mtx);
+
+ mutex_lock(&sdata->local->chanctx_mtx);
+ __ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->chanctx_mtx);
+}
+
+void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *ap;
+ struct ieee80211_chanctx_conf *conf;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->bss))
+ return;
+
+ ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
+
+ mutex_lock(&local->chanctx_mtx);
+
+ conf = rcu_dereference_protected(ap->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+void ieee80211_iter_chan_contexts_atomic(
+ struct ieee80211_hw *hw,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_chanctx_conf *chanctx_conf,
+ void *data),
+ void *iter_data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_chanctx *ctx;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ctx, &local->chanctx_list, list)
+ if (ctx->driver_present)
+ iter(hw, &ctx->conf, iter_data);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic);
diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h
new file mode 100644
index 000000000..1956b3115
--- /dev/null
+++ b/net/mac80211/debug.h
@@ -0,0 +1,200 @@
+#ifndef __MAC80211_DEBUG_H
+#define __MAC80211_DEBUG_H
+#include <net/cfg80211.h>
+
+#ifdef CONFIG_MAC80211_OCB_DEBUG
+#define MAC80211_OCB_DEBUG 1
+#else
+#define MAC80211_OCB_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+#define MAC80211_IBSS_DEBUG 1
+#else
+#define MAC80211_IBSS_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_PS_DEBUG
+#define MAC80211_PS_DEBUG 1
+#else
+#define MAC80211_PS_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+#define MAC80211_HT_DEBUG 1
+#else
+#define MAC80211_HT_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MPL_DEBUG
+#define MAC80211_MPL_DEBUG 1
+#else
+#define MAC80211_MPL_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MPATH_DEBUG
+#define MAC80211_MPATH_DEBUG 1
+#else
+#define MAC80211_MPATH_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MHWMP_DEBUG
+#define MAC80211_MHWMP_DEBUG 1
+#else
+#define MAC80211_MHWMP_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MESH_SYNC_DEBUG
+#define MAC80211_MESH_SYNC_DEBUG 1
+#else
+#define MAC80211_MESH_SYNC_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MESH_CSA_DEBUG
+#define MAC80211_MESH_CSA_DEBUG 1
+#else
+#define MAC80211_MESH_CSA_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MESH_PS_DEBUG
+#define MAC80211_MESH_PS_DEBUG 1
+#else
+#define MAC80211_MESH_PS_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_TDLS_DEBUG
+#define MAC80211_TDLS_DEBUG 1
+#else
+#define MAC80211_TDLS_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_STA_DEBUG
+#define MAC80211_STA_DEBUG 1
+#else
+#define MAC80211_STA_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MLME_DEBUG
+#define MAC80211_MLME_DEBUG 1
+#else
+#define MAC80211_MLME_DEBUG 0
+#endif
+
+#ifdef CONFIG_MAC80211_MESSAGE_TRACING
+void __sdata_info(const char *fmt, ...) __printf(1, 2);
+void __sdata_dbg(bool print, const char *fmt, ...) __printf(2, 3);
+void __sdata_err(const char *fmt, ...) __printf(1, 2);
+void __wiphy_dbg(struct wiphy *wiphy, bool print, const char *fmt, ...)
+ __printf(3, 4);
+
+#define _sdata_info(sdata, fmt, ...) \
+ __sdata_info("%s: " fmt, (sdata)->name, ##__VA_ARGS__)
+#define _sdata_dbg(print, sdata, fmt, ...) \
+ __sdata_dbg(print, "%s: " fmt, (sdata)->name, ##__VA_ARGS__)
+#define _sdata_err(sdata, fmt, ...) \
+ __sdata_err("%s: " fmt, (sdata)->name, ##__VA_ARGS__)
+#define _wiphy_dbg(print, wiphy, fmt, ...) \
+ __wiphy_dbg(wiphy, print, fmt, ##__VA_ARGS__)
+#else
+#define _sdata_info(sdata, fmt, ...) \
+do { \
+ pr_info("%s: " fmt, \
+ (sdata)->name, ##__VA_ARGS__); \
+} while (0)
+
+#define _sdata_dbg(print, sdata, fmt, ...) \
+do { \
+ if (print) \
+ pr_debug("%s: " fmt, \
+ (sdata)->name, ##__VA_ARGS__); \
+} while (0)
+
+#define _sdata_err(sdata, fmt, ...) \
+do { \
+ pr_err("%s: " fmt, \
+ (sdata)->name, ##__VA_ARGS__); \
+} while (0)
+
+#define _wiphy_dbg(print, wiphy, fmt, ...) \
+do { \
+ if (print) \
+ wiphy_dbg((wiphy), fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#define sdata_info(sdata, fmt, ...) \
+ _sdata_info(sdata, fmt, ##__VA_ARGS__)
+#define sdata_err(sdata, fmt, ...) \
+ _sdata_err(sdata, fmt, ##__VA_ARGS__)
+#define sdata_dbg(sdata, fmt, ...) \
+ _sdata_dbg(1, sdata, fmt, ##__VA_ARGS__)
+
+#define ht_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_HT_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define ht_dbg_ratelimited(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_HT_DEBUG && net_ratelimit(), \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define ocb_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_OCB_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define ibss_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_IBSS_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define ps_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_PS_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define ps_dbg_hw(hw, fmt, ...) \
+ _wiphy_dbg(MAC80211_PS_DEBUG, \
+ (hw)->wiphy, fmt, ##__VA_ARGS__)
+
+#define ps_dbg_ratelimited(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_PS_DEBUG && net_ratelimit(), \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mpl_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MPL_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mpath_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MPATH_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mhwmp_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MHWMP_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define msync_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MESH_SYNC_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mcsa_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MESH_CSA_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mps_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MESH_PS_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define tdls_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_TDLS_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define sta_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_STA_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mlme_dbg(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MLME_DEBUG, \
+ sdata, fmt, ##__VA_ARGS__)
+
+#define mlme_dbg_ratelimited(sdata, fmt, ...) \
+ _sdata_dbg(MAC80211_MLME_DEBUG && net_ratelimit(), \
+ sdata, fmt, ##__VA_ARGS__)
+
+#endif /* __MAC80211_DEBUG_H */
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
new file mode 100644
index 000000000..23813ebb3
--- /dev/null
+++ b/net/mac80211/debugfs.c
@@ -0,0 +1,310 @@
+
+/*
+ * mac80211 debugfs for wireless PHYs
+ *
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * GPLv2
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/rtnetlink.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "debugfs.h"
+
+#define DEBUGFS_FORMAT_BUFFER_SIZE 100
+
+int mac80211_format_buffer(char __user *userbuf, size_t count,
+ loff_t *ppos, char *fmt, ...)
+{
+ va_list args;
+ char buf[DEBUGFS_FORMAT_BUFFER_SIZE];
+ int res;
+
+ va_start(args, fmt);
+ res = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+
+#define DEBUGFS_READONLY_FILE_FN(name, fmt, value...) \
+static ssize_t name## _read(struct file *file, char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct ieee80211_local *local = file->private_data; \
+ \
+ return mac80211_format_buffer(userbuf, count, ppos, \
+ fmt "\n", ##value); \
+}
+
+#define DEBUGFS_READONLY_FILE_OPS(name) \
+static const struct file_operations name## _ops = { \
+ .read = name## _read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+};
+
+#define DEBUGFS_READONLY_FILE(name, fmt, value...) \
+ DEBUGFS_READONLY_FILE_FN(name, fmt, value) \
+ DEBUGFS_READONLY_FILE_OPS(name)
+
+#define DEBUGFS_ADD(name) \
+ debugfs_create_file(#name, 0400, phyd, local, &name## _ops);
+
+#define DEBUGFS_ADD_MODE(name, mode) \
+ debugfs_create_file(#name, mode, phyd, local, &name## _ops);
+
+
+DEBUGFS_READONLY_FILE(user_power, "%d",
+ local->user_power_level);
+DEBUGFS_READONLY_FILE(power, "%d",
+ local->hw.conf.power_level);
+DEBUGFS_READONLY_FILE(total_ps_buffered, "%d",
+ local->total_ps_buffered);
+DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
+ local->wep_iv & 0xffffff);
+DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
+ local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
+
+#ifdef CONFIG_PM
+static ssize_t reset_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+
+ rtnl_lock();
+ __ieee80211_suspend(&local->hw, NULL);
+ __ieee80211_resume(&local->hw);
+ rtnl_unlock();
+
+ return count;
+}
+
+static const struct file_operations reset_ops = {
+ .write = reset_write,
+ .open = simple_open,
+ .llseek = noop_llseek,
+};
+#endif
+
+static ssize_t hwflags_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ int mxln = 500;
+ ssize_t rv;
+ char *buf = kzalloc(mxln, GFP_KERNEL);
+ int sf = 0; /* how many written so far */
+
+ if (!buf)
+ return 0;
+
+ sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags);
+ if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+ sf += scnprintf(buf + sf, mxln - sf, "HAS_RATE_CONTROL\n");
+ if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
+ sf += scnprintf(buf + sf, mxln - sf, "RX_INCLUDES_FCS\n");
+ if (local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING)
+ sf += scnprintf(buf + sf, mxln - sf,
+ "HOST_BCAST_PS_BUFFERING\n");
+ if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE)
+ sf += scnprintf(buf + sf, mxln - sf,
+ "2GHZ_SHORT_SLOT_INCAPABLE\n");
+ if (local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE)
+ sf += scnprintf(buf + sf, mxln - sf,
+ "2GHZ_SHORT_PREAMBLE_INCAPABLE\n");
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+ sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_UNSPEC\n");
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ sf += scnprintf(buf + sf, mxln - sf, "SIGNAL_DBM\n");
+ if (local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC)
+ sf += scnprintf(buf + sf, mxln - sf,
+ "NEED_DTIM_BEFORE_ASSOC\n");
+ if (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT)
+ sf += scnprintf(buf + sf, mxln - sf, "SPECTRUM_MGMT\n");
+ if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)
+ sf += scnprintf(buf + sf, mxln - sf, "AMPDU_AGGREGATION\n");
+ if (local->hw.flags & IEEE80211_HW_SUPPORTS_PS)
+ sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PS\n");
+ if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)
+ sf += scnprintf(buf + sf, mxln - sf, "PS_NULLFUNC_STACK\n");
+ if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+ sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_DYNAMIC_PS\n");
+ if (local->hw.flags & IEEE80211_HW_MFP_CAPABLE)
+ sf += scnprintf(buf + sf, mxln - sf, "MFP_CAPABLE\n");
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ sf += scnprintf(buf + sf, mxln - sf,
+ "REPORTS_TX_ACK_STATUS\n");
+ if (local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ sf += scnprintf(buf + sf, mxln - sf, "CONNECTION_MONITOR\n");
+ if (local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)
+ sf += scnprintf(buf + sf, mxln - sf, "SUPPORTS_PER_STA_GTK\n");
+ if (local->hw.flags & IEEE80211_HW_AP_LINK_PS)
+ sf += scnprintf(buf + sf, mxln - sf, "AP_LINK_PS\n");
+ if (local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)
+ sf += scnprintf(buf + sf, mxln - sf, "TX_AMPDU_SETUP_IN_HW\n");
+
+ rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+ kfree(buf);
+ return rv;
+}
+
+static ssize_t queues_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ unsigned long flags;
+ char buf[IEEE80211_MAX_QUEUES * 20];
+ int q, res = 0;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ for (q = 0; q < local->hw.queues; q++)
+ res += sprintf(buf + res, "%02d: %#.8lx/%d\n", q,
+ local->queue_stop_reasons[q],
+ skb_queue_len(&local->pending[q]));
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, res);
+}
+
+DEBUGFS_READONLY_FILE_OPS(hwflags);
+DEBUGFS_READONLY_FILE_OPS(queues);
+
+/* statistics stuff */
+
+static ssize_t format_devstat_counter(struct ieee80211_local *local,
+ char __user *userbuf,
+ size_t count, loff_t *ppos,
+ int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf,
+ int buflen))
+{
+ struct ieee80211_low_level_stats stats;
+ char buf[20];
+ int res;
+
+ rtnl_lock();
+ res = drv_get_stats(local, &stats);
+ rtnl_unlock();
+ if (res)
+ return res;
+ res = printvalue(&stats, buf, sizeof(buf));
+ return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+
+#define DEBUGFS_DEVSTATS_FILE(name) \
+static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\
+ char *buf, int buflen) \
+{ \
+ return scnprintf(buf, buflen, "%u\n", stats->name); \
+} \
+static ssize_t stats_ ##name## _read(struct file *file, \
+ char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ return format_devstat_counter(file->private_data, \
+ userbuf, \
+ count, \
+ ppos, \
+ print_devstats_##name); \
+} \
+ \
+static const struct file_operations stats_ ##name## _ops = { \
+ .read = stats_ ##name## _read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+};
+
+#define DEBUGFS_STATS_ADD(name, field) \
+ debugfs_create_u32(#name, 0400, statsd, (u32 *) &field);
+#define DEBUGFS_DEVSTATS_ADD(name) \
+ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
+
+DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount);
+DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount);
+DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount);
+DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount);
+
+void debugfs_hw_add(struct ieee80211_local *local)
+{
+ struct dentry *phyd = local->hw.wiphy->debugfsdir;
+ struct dentry *statsd;
+
+ if (!phyd)
+ return;
+
+ local->debugfs.keys = debugfs_create_dir("keys", phyd);
+
+ DEBUGFS_ADD(total_ps_buffered);
+ DEBUGFS_ADD(wep_iv);
+ DEBUGFS_ADD(queues);
+#ifdef CONFIG_PM
+ DEBUGFS_ADD_MODE(reset, 0200);
+#endif
+ DEBUGFS_ADD(hwflags);
+ DEBUGFS_ADD(user_power);
+ DEBUGFS_ADD(power);
+
+ statsd = debugfs_create_dir("statistics", phyd);
+
+ /* if the dir failed, don't put all the other things into the root! */
+ if (!statsd)
+ return;
+
+ DEBUGFS_STATS_ADD(transmitted_fragment_count,
+ local->dot11TransmittedFragmentCount);
+ DEBUGFS_STATS_ADD(multicast_transmitted_frame_count,
+ local->dot11MulticastTransmittedFrameCount);
+ DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount);
+ DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount);
+ DEBUGFS_STATS_ADD(multiple_retry_count,
+ local->dot11MultipleRetryCount);
+ DEBUGFS_STATS_ADD(frame_duplicate_count,
+ local->dot11FrameDuplicateCount);
+ DEBUGFS_STATS_ADD(received_fragment_count,
+ local->dot11ReceivedFragmentCount);
+ DEBUGFS_STATS_ADD(multicast_received_frame_count,
+ local->dot11MulticastReceivedFrameCount);
+ DEBUGFS_STATS_ADD(transmitted_frame_count,
+ local->dot11TransmittedFrameCount);
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
+ DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop);
+ DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_fragment,
+ local->tx_handlers_drop_fragment);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_wep,
+ local->tx_handlers_drop_wep);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc,
+ local->tx_handlers_drop_not_assoc);
+ DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port,
+ local->tx_handlers_drop_unauth_port);
+ DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop);
+ DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued);
+ DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc,
+ local->rx_handlers_drop_nullfunc);
+ DEBUGFS_STATS_ADD(rx_handlers_drop_defrag,
+ local->rx_handlers_drop_defrag);
+ DEBUGFS_STATS_ADD(rx_handlers_drop_short,
+ local->rx_handlers_drop_short);
+ DEBUGFS_STATS_ADD(tx_expand_skb_head,
+ local->tx_expand_skb_head);
+ DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned,
+ local->tx_expand_skb_head_cloned);
+ DEBUGFS_STATS_ADD(rx_expand_skb_head,
+ local->rx_expand_skb_head);
+ DEBUGFS_STATS_ADD(rx_expand_skb_head2,
+ local->rx_expand_skb_head2);
+ DEBUGFS_STATS_ADD(rx_handlers_fragments,
+ local->rx_handlers_fragments);
+ DEBUGFS_STATS_ADD(tx_status_drop,
+ local->tx_status_drop);
+#endif
+ DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount);
+ DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount);
+ DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount);
+ DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount);
+}
diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h
new file mode 100644
index 000000000..60c35afee
--- /dev/null
+++ b/net/mac80211/debugfs.h
@@ -0,0 +1,16 @@
+#ifndef __MAC80211_DEBUGFS_H
+#define __MAC80211_DEBUGFS_H
+
+#include "ieee80211_i.h"
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void debugfs_hw_add(struct ieee80211_local *local);
+int __printf(4, 5) mac80211_format_buffer(char __user *userbuf, size_t count,
+ loff_t *ppos, char *fmt, ...);
+#else
+static inline void debugfs_hw_add(struct ieee80211_local *local)
+{
+}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_H */
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
new file mode 100644
index 000000000..71ac1b5f4
--- /dev/null
+++ b/net/mac80211/debugfs_key.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright 2003-2005 Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kobject.h>
+#include <linux/slab.h>
+#include "ieee80211_i.h"
+#include "key.h"
+#include "debugfs.h"
+#include "debugfs_key.h"
+
+#define KEY_READ(name, prop, format_string) \
+static ssize_t key_##name##_read(struct file *file, \
+ char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct ieee80211_key *key = file->private_data; \
+ return mac80211_format_buffer(userbuf, count, ppos, \
+ format_string, key->prop); \
+}
+#define KEY_READ_D(name) KEY_READ(name, name, "%d\n")
+#define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n")
+
+#define KEY_OPS(name) \
+static const struct file_operations key_ ##name## _ops = { \
+ .read = key_##name##_read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+#define KEY_FILE(name, format) \
+ KEY_READ_##format(name) \
+ KEY_OPS(name)
+
+#define KEY_CONF_READ(name, format_string) \
+ KEY_READ(conf_##name, conf.name, format_string)
+#define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n")
+
+#define KEY_CONF_OPS(name) \
+static const struct file_operations key_ ##name## _ops = { \
+ .read = key_conf_##name##_read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+#define KEY_CONF_FILE(name, format) \
+ KEY_CONF_READ_##format(name) \
+ KEY_CONF_OPS(name)
+
+KEY_CONF_FILE(keylen, D);
+KEY_CONF_FILE(keyidx, D);
+KEY_CONF_FILE(hw_key_idx, D);
+KEY_FILE(flags, X);
+KEY_FILE(tx_rx_count, D);
+KEY_READ(ifindex, sdata->name, "%s\n");
+KEY_OPS(ifindex);
+
+static ssize_t key_algorithm_read(struct file *file,
+ char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[15];
+ struct ieee80211_key *key = file->private_data;
+ u32 c = key->conf.cipher;
+
+ sprintf(buf, "%.2x-%.2x-%.2x:%d\n",
+ c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
+ return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
+}
+KEY_OPS(algorithm);
+
+static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u64 pn;
+ char buf[20];
+ int len;
+ struct ieee80211_key *key = file->private_data;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ len = scnprintf(buf, sizeof(buf), "\n");
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
+ key->u.tkip.tx.iv32,
+ key->u.tkip.tx.iv16);
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ pn = atomic64_read(&key->u.ccmp.tx_pn);
+ len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
+ (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
+ (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ pn = atomic64_read(&key->u.aes_cmac.tx_pn);
+ len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
+ (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
+ (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ pn = atomic64_read(&key->u.aes_gmac.tx_pn);
+ len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
+ (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
+ (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn = atomic64_read(&key->u.gcmp.tx_pn);
+ len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
+ (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24),
+ (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn);
+ break;
+ default:
+ return 0;
+ }
+ return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(tx_spec);
+
+static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ char buf[14*IEEE80211_NUM_TIDS+1], *p = buf;
+ int i, len;
+ const u8 *rpn;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ len = scnprintf(buf, sizeof(buf), "\n");
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "%08x %04x\n",
+ key->u.tkip.rx[i].iv32,
+ key->u.tkip.rx[i].iv16);
+ len = p - buf;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
+ rpn = key->u.ccmp.rx_pn[i];
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "%02x%02x%02x%02x%02x%02x\n",
+ rpn[0], rpn[1], rpn[2],
+ rpn[3], rpn[4], rpn[5]);
+ }
+ len = p - buf;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ rpn = key->u.aes_cmac.rx_pn;
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "%02x%02x%02x%02x%02x%02x\n",
+ rpn[0], rpn[1], rpn[2],
+ rpn[3], rpn[4], rpn[5]);
+ len = p - buf;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ rpn = key->u.aes_gmac.rx_pn;
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "%02x%02x%02x%02x%02x%02x\n",
+ rpn[0], rpn[1], rpn[2],
+ rpn[3], rpn[4], rpn[5]);
+ len = p - buf;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
+ rpn = key->u.gcmp.rx_pn[i];
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "%02x%02x%02x%02x%02x%02x\n",
+ rpn[0], rpn[1], rpn[2],
+ rpn[3], rpn[4], rpn[5]);
+ }
+ len = p - buf;
+ break;
+ default:
+ return 0;
+ }
+ return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(rx_spec);
+
+static ssize_t key_replays_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ char buf[20];
+ int len;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n",
+ key->u.aes_cmac.replays);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n",
+ key->u.aes_gmac.replays);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n", key->u.gcmp.replays);
+ break;
+ default:
+ return 0;
+ }
+ return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(replays);
+
+static ssize_t key_icverrors_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ char buf[20];
+ int len;
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n",
+ key->u.aes_cmac.icverrors);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ len = scnprintf(buf, sizeof(buf), "%u\n",
+ key->u.aes_gmac.icverrors);
+ break;
+ default:
+ return 0;
+ }
+ return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(icverrors);
+
+static ssize_t key_mic_failures_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ char buf[20];
+ int len;
+
+ if (key->conf.cipher != WLAN_CIPHER_SUITE_TKIP)
+ return -EINVAL;
+
+ len = scnprintf(buf, sizeof(buf), "%u\n", key->u.tkip.mic_failures);
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(mic_failures);
+
+static ssize_t key_key_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_key *key = file->private_data;
+ int i, bufsize = 2 * key->conf.keylen + 2;
+ char *buf = kmalloc(bufsize, GFP_KERNEL);
+ char *p = buf;
+ ssize_t res;
+
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = 0; i < key->conf.keylen; i++)
+ p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]);
+ p += scnprintf(p, bufsize+buf-p, "\n");
+ res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return res;
+}
+KEY_OPS(key);
+
+#define DEBUGFS_ADD(name) \
+ debugfs_create_file(#name, 0400, key->debugfs.dir, \
+ key, &key_##name##_ops);
+
+void ieee80211_debugfs_key_add(struct ieee80211_key *key)
+{
+ static int keycount;
+ char buf[100];
+ struct sta_info *sta;
+
+ if (!key->local->debugfs.keys)
+ return;
+
+ sprintf(buf, "%d", keycount);
+ key->debugfs.cnt = keycount;
+ keycount++;
+ key->debugfs.dir = debugfs_create_dir(buf,
+ key->local->debugfs.keys);
+
+ if (!key->debugfs.dir)
+ return;
+
+ sta = key->sta;
+ if (sta) {
+ sprintf(buf, "../../netdev:%s/stations/%pM",
+ sta->sdata->name, sta->sta.addr);
+ key->debugfs.stalink =
+ debugfs_create_symlink("station", key->debugfs.dir, buf);
+ }
+
+ DEBUGFS_ADD(keylen);
+ DEBUGFS_ADD(flags);
+ DEBUGFS_ADD(keyidx);
+ DEBUGFS_ADD(hw_key_idx);
+ DEBUGFS_ADD(tx_rx_count);
+ DEBUGFS_ADD(algorithm);
+ DEBUGFS_ADD(tx_spec);
+ DEBUGFS_ADD(rx_spec);
+ DEBUGFS_ADD(replays);
+ DEBUGFS_ADD(icverrors);
+ DEBUGFS_ADD(mic_failures);
+ DEBUGFS_ADD(key);
+ DEBUGFS_ADD(ifindex);
+};
+
+void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
+{
+ if (!key)
+ return;
+
+ debugfs_remove_recursive(key->debugfs.dir);
+ key->debugfs.dir = NULL;
+}
+
+void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
+{
+ char buf[50];
+ struct ieee80211_key *key;
+
+ if (!sdata->vif.debugfs_dir)
+ return;
+
+ lockdep_assert_held(&sdata->local->key_mtx);
+
+ debugfs_remove(sdata->debugfs.default_unicast_key);
+ sdata->debugfs.default_unicast_key = NULL;
+
+ if (sdata->default_unicast_key) {
+ key = key_mtx_dereference(sdata->local,
+ sdata->default_unicast_key);
+ sprintf(buf, "../keys/%d", key->debugfs.cnt);
+ sdata->debugfs.default_unicast_key =
+ debugfs_create_symlink("default_unicast_key",
+ sdata->vif.debugfs_dir, buf);
+ }
+
+ debugfs_remove(sdata->debugfs.default_multicast_key);
+ sdata->debugfs.default_multicast_key = NULL;
+
+ if (sdata->default_multicast_key) {
+ key = key_mtx_dereference(sdata->local,
+ sdata->default_multicast_key);
+ sprintf(buf, "../keys/%d", key->debugfs.cnt);
+ sdata->debugfs.default_multicast_key =
+ debugfs_create_symlink("default_multicast_key",
+ sdata->vif.debugfs_dir, buf);
+ }
+}
+
+void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata)
+{
+ char buf[50];
+ struct ieee80211_key *key;
+
+ if (!sdata->vif.debugfs_dir)
+ return;
+
+ key = key_mtx_dereference(sdata->local,
+ sdata->default_mgmt_key);
+ if (key) {
+ sprintf(buf, "../keys/%d", key->debugfs.cnt);
+ sdata->debugfs.default_mgmt_key =
+ debugfs_create_symlink("default_mgmt_key",
+ sdata->vif.debugfs_dir, buf);
+ } else
+ ieee80211_debugfs_key_remove_mgmt_default(sdata);
+}
+
+void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata)
+{
+ if (!sdata)
+ return;
+
+ debugfs_remove(sdata->debugfs.default_mgmt_key);
+ sdata->debugfs.default_mgmt_key = NULL;
+}
+
+void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+ struct sta_info *sta)
+{
+ debugfs_remove(key->debugfs.stalink);
+ key->debugfs.stalink = NULL;
+}
diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h
new file mode 100644
index 000000000..32adc77e9
--- /dev/null
+++ b/net/mac80211/debugfs_key.h
@@ -0,0 +1,33 @@
+#ifndef __MAC80211_DEBUGFS_KEY_H
+#define __MAC80211_DEBUGFS_KEY_H
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_debugfs_key_add(struct ieee80211_key *key);
+void ieee80211_debugfs_key_remove(struct ieee80211_key *key);
+void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_add_mgmt_default(
+ struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_remove_mgmt_default(
+ struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+ struct sta_info *sta);
+#else
+static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key)
+{}
+static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
+{}
+static inline void ieee80211_debugfs_key_update_default(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_add_mgmt_default(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_remove_mgmt_default(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+ struct sta_info *sta)
+{}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_KEY_H */
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
new file mode 100644
index 000000000..29236e832
--- /dev/null
+++ b/net/mac80211/debugfs_netdev.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <net/mac80211.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "debugfs.h"
+#include "debugfs_netdev.h"
+#include "driver-ops.h"
+
+static ssize_t ieee80211_if_read(
+ struct ieee80211_sub_if_data *sdata,
+ char __user *userbuf,
+ size_t count, loff_t *ppos,
+ ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
+{
+ char buf[70];
+ ssize_t ret = -EINVAL;
+
+ read_lock(&dev_base_lock);
+ ret = (*format)(sdata, buf, sizeof(buf));
+ read_unlock(&dev_base_lock);
+
+ if (ret >= 0)
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+
+ return ret;
+}
+
+static ssize_t ieee80211_if_write(
+ struct ieee80211_sub_if_data *sdata,
+ const char __user *userbuf,
+ size_t count, loff_t *ppos,
+ ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int))
+{
+ char buf[64];
+ ssize_t ret;
+
+ if (count >= sizeof(buf))
+ return -E2BIG;
+
+ if (copy_from_user(buf, userbuf, count))
+ return -EFAULT;
+ buf[count] = '\0';
+
+ ret = -ENODEV;
+ rtnl_lock();
+ ret = (*write)(sdata, buf, count);
+ rtnl_unlock();
+
+ return ret;
+}
+
+#define IEEE80211_IF_FMT(name, field, format_string) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, char *buf, \
+ int buflen) \
+{ \
+ return scnprintf(buf, buflen, format_string, sdata->field); \
+}
+#define IEEE80211_IF_FMT_DEC(name, field) \
+ IEEE80211_IF_FMT(name, field, "%d\n")
+#define IEEE80211_IF_FMT_HEX(name, field) \
+ IEEE80211_IF_FMT(name, field, "%#x\n")
+#define IEEE80211_IF_FMT_LHEX(name, field) \
+ IEEE80211_IF_FMT(name, field, "%#lx\n")
+#define IEEE80211_IF_FMT_SIZE(name, field) \
+ IEEE80211_IF_FMT(name, field, "%zd\n")
+
+#define IEEE80211_IF_FMT_HEXARRAY(name, field) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, \
+ char *buf, int buflen) \
+{ \
+ char *p = buf; \
+ int i; \
+ for (i = 0; i < sizeof(sdata->field); i++) { \
+ p += scnprintf(p, buflen + buf - p, "%.2x ", \
+ sdata->field[i]); \
+ } \
+ p += scnprintf(p, buflen + buf - p, "\n"); \
+ return p - buf; \
+}
+
+#define IEEE80211_IF_FMT_ATOMIC(name, field) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, \
+ char *buf, int buflen) \
+{ \
+ return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\
+}
+
+#define IEEE80211_IF_FMT_MAC(name, field) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, char *buf, \
+ int buflen) \
+{ \
+ return scnprintf(buf, buflen, "%pM\n", sdata->field); \
+}
+
+#define IEEE80211_IF_FMT_DEC_DIV_16(name, field) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, \
+ char *buf, int buflen) \
+{ \
+ return scnprintf(buf, buflen, "%d\n", sdata->field / 16); \
+}
+
+#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \
+static ssize_t ieee80211_if_fmt_##name( \
+ const struct ieee80211_sub_if_data *sdata, \
+ char *buf, int buflen) \
+{ \
+ return scnprintf(buf, buflen, "%d\n", \
+ jiffies_to_msecs(sdata->field)); \
+}
+
+#define _IEEE80211_IF_FILE_OPS(name, _read, _write) \
+static const struct file_operations name##_ops = { \
+ .read = (_read), \
+ .write = (_write), \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+#define _IEEE80211_IF_FILE_R_FN(name) \
+static ssize_t ieee80211_if_read_##name(struct file *file, \
+ char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ return ieee80211_if_read(file->private_data, \
+ userbuf, count, ppos, \
+ ieee80211_if_fmt_##name); \
+}
+
+#define _IEEE80211_IF_FILE_W_FN(name) \
+static ssize_t ieee80211_if_write_##name(struct file *file, \
+ const char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ return ieee80211_if_write(file->private_data, userbuf, count, \
+ ppos, ieee80211_if_parse_##name); \
+}
+
+#define IEEE80211_IF_FILE_R(name) \
+ _IEEE80211_IF_FILE_R_FN(name) \
+ _IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, NULL)
+
+#define IEEE80211_IF_FILE_W(name) \
+ _IEEE80211_IF_FILE_W_FN(name) \
+ _IEEE80211_IF_FILE_OPS(name, NULL, ieee80211_if_write_##name)
+
+#define IEEE80211_IF_FILE_RW(name) \
+ _IEEE80211_IF_FILE_R_FN(name) \
+ _IEEE80211_IF_FILE_W_FN(name) \
+ _IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, \
+ ieee80211_if_write_##name)
+
+#define IEEE80211_IF_FILE(name, field, format) \
+ IEEE80211_IF_FMT_##format(name, field) \
+ IEEE80211_IF_FILE_R(name)
+
+/* common attributes */
+IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ],
+ HEX);
+IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ],
+ HEX);
+IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz,
+ rc_rateidx_mcs_mask[IEEE80211_BAND_2GHZ], HEXARRAY);
+IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz,
+ rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY);
+
+IEEE80211_IF_FILE(flags, flags, HEX);
+IEEE80211_IF_FILE(state, state, LHEX);
+IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC);
+IEEE80211_IF_FILE(ap_power_level, ap_power_level, DEC);
+IEEE80211_IF_FILE(user_power_level, user_power_level, DEC);
+
+static ssize_t
+ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ int len;
+
+ len = scnprintf(buf, buflen, "AC queues: VO:%d VI:%d BE:%d BK:%d\n",
+ sdata->vif.hw_queue[IEEE80211_AC_VO],
+ sdata->vif.hw_queue[IEEE80211_AC_VI],
+ sdata->vif.hw_queue[IEEE80211_AC_BE],
+ sdata->vif.hw_queue[IEEE80211_AC_BK]);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ len += scnprintf(buf + len, buflen - len, "cab queue: %d\n",
+ sdata->vif.cab_queue);
+
+ return len;
+}
+IEEE80211_IF_FILE_R(hw_queues);
+
+/* STA attributes */
+IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
+IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
+IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC);
+IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16);
+IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
+
+static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps_mode)
+{
+ struct ieee80211_local *local = sdata->local;
+ int err;
+
+ if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) &&
+ smps_mode == IEEE80211_SMPS_STATIC)
+ return -EINVAL;
+
+ /* auto should be dynamic if in PS mode */
+ if (!(local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) &&
+ (smps_mode == IEEE80211_SMPS_DYNAMIC ||
+ smps_mode == IEEE80211_SMPS_AUTOMATIC))
+ return -EINVAL;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_AP)
+ return -EOPNOTSUPP;
+
+ sdata_lock(sdata);
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ err = __ieee80211_request_smps_mgd(sdata, smps_mode);
+ else
+ err = __ieee80211_request_smps_ap(sdata, smps_mode);
+ sdata_unlock(sdata);
+
+ return err;
+}
+
+static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
+ [IEEE80211_SMPS_AUTOMATIC] = "auto",
+ [IEEE80211_SMPS_OFF] = "off",
+ [IEEE80211_SMPS_STATIC] = "static",
+ [IEEE80211_SMPS_DYNAMIC] = "dynamic",
+};
+
+static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ return snprintf(buf, buflen, "request: %s\nused: %s\n",
+ smps_modes[sdata->u.mgd.req_smps],
+ smps_modes[sdata->smps_mode]);
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ return snprintf(buf, buflen, "request: %s\nused: %s\n",
+ smps_modes[sdata->u.ap.req_smps],
+ smps_modes[sdata->smps_mode]);
+ return -EINVAL;
+}
+
+static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
+ const char *buf, int buflen)
+{
+ enum ieee80211_smps_mode mode;
+
+ for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) {
+ if (strncmp(buf, smps_modes[mode], buflen) == 0) {
+ int err = ieee80211_set_smps(sdata, mode);
+ if (!err)
+ return buflen;
+ return err;
+ }
+ }
+
+ return -EINVAL;
+}
+IEEE80211_IF_FILE_RW(smps);
+
+static ssize_t ieee80211_if_parse_tkip_mic_test(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ struct ieee80211_local *local = sdata->local;
+ u8 addr[ETH_ALEN];
+ struct sk_buff *skb;
+ struct ieee80211_hdr *hdr;
+ __le16 fc;
+
+ if (!mac_pton(buf, addr))
+ return -EINVAL;
+
+ if (!ieee80211_sdata_running(sdata))
+ return -ENOTCONN;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 100);
+ if (!skb)
+ return -ENOMEM;
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ hdr = (struct ieee80211_hdr *) skb_put(skb, 24);
+ memset(hdr, 0, 24);
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ /* DA BSSID SA */
+ memcpy(hdr->addr1, addr, ETH_ALEN);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr->addr3, sdata->vif.addr, ETH_ALEN);
+ break;
+ case NL80211_IFTYPE_STATION:
+ fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+ /* BSSID SA DA */
+ sdata_lock(sdata);
+ if (!sdata->u.mgd.associated) {
+ sdata_unlock(sdata);
+ dev_kfree_skb(skb);
+ return -ENOTCONN;
+ }
+ memcpy(hdr->addr1, sdata->u.mgd.associated->bssid, ETH_ALEN);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr->addr3, addr, ETH_ALEN);
+ sdata_unlock(sdata);
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return -EOPNOTSUPP;
+ }
+ hdr->frame_control = fc;
+
+ /*
+ * Add some length to the test frame to make it look bit more valid.
+ * The exact contents does not matter since the recipient is required
+ * to drop this because of the Michael MIC failure.
+ */
+ memset(skb_put(skb, 50), 0, 50);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_TKIP_MIC_FAILURE;
+
+ ieee80211_tx_skb(sdata, skb);
+
+ return buflen;
+}
+IEEE80211_IF_FILE_W(tkip_mic_test);
+
+static ssize_t ieee80211_if_parse_beacon_loss(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ if (!ieee80211_sdata_running(sdata) || !sdata->vif.bss_conf.assoc)
+ return -ENOTCONN;
+
+ ieee80211_beacon_loss(&sdata->vif);
+
+ return buflen;
+}
+IEEE80211_IF_FILE_W(beacon_loss);
+
+static ssize_t ieee80211_if_fmt_uapsd_queues(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ const struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ return snprintf(buf, buflen, "0x%x\n", ifmgd->uapsd_queues);
+}
+
+static ssize_t ieee80211_if_parse_uapsd_queues(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 val;
+ int ret;
+
+ ret = kstrtou8(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK)
+ return -ERANGE;
+
+ ifmgd->uapsd_queues = val;
+
+ return buflen;
+}
+IEEE80211_IF_FILE_RW(uapsd_queues);
+
+static ssize_t ieee80211_if_fmt_uapsd_max_sp_len(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ const struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ return snprintf(buf, buflen, "0x%x\n", ifmgd->uapsd_max_sp_len);
+}
+
+static ssize_t ieee80211_if_parse_uapsd_max_sp_len(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return -EINVAL;
+
+ if (val & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK)
+ return -ERANGE;
+
+ ifmgd->uapsd_max_sp_len = val;
+
+ return buflen;
+}
+IEEE80211_IF_FILE_RW(uapsd_max_sp_len);
+
+/* AP attributes */
+IEEE80211_IF_FILE(num_mcast_sta, u.ap.num_mcast_sta, ATOMIC);
+IEEE80211_IF_FILE(num_sta_ps, u.ap.ps.num_sta_ps, ATOMIC);
+IEEE80211_IF_FILE(dtim_count, u.ap.ps.dtim_count, DEC);
+
+static ssize_t ieee80211_if_fmt_num_buffered_multicast(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ return scnprintf(buf, buflen, "%u\n",
+ skb_queue_len(&sdata->u.ap.ps.bc_buf));
+}
+IEEE80211_IF_FILE_R(num_buffered_multicast);
+
+/* IBSS attributes */
+static ssize_t ieee80211_if_fmt_tsf(
+ const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+ struct ieee80211_local *local = sdata->local;
+ u64 tsf;
+
+ tsf = drv_get_tsf(local, (struct ieee80211_sub_if_data *)sdata);
+
+ return scnprintf(buf, buflen, "0x%016llx\n", (unsigned long long) tsf);
+}
+
+static ssize_t ieee80211_if_parse_tsf(
+ struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+ struct ieee80211_local *local = sdata->local;
+ unsigned long long tsf;
+ int ret;
+ int tsf_is_delta = 0;
+
+ if (strncmp(buf, "reset", 5) == 0) {
+ if (local->ops->reset_tsf) {
+ drv_reset_tsf(local, sdata);
+ wiphy_info(local->hw.wiphy, "debugfs reset TSF\n");
+ }
+ } else {
+ if (buflen > 10 && buf[1] == '=') {
+ if (buf[0] == '+')
+ tsf_is_delta = 1;
+ else if (buf[0] == '-')
+ tsf_is_delta = -1;
+ else
+ return -EINVAL;
+ buf += 2;
+ }
+ ret = kstrtoull(buf, 10, &tsf);
+ if (ret < 0)
+ return ret;
+ if (tsf_is_delta)
+ tsf = drv_get_tsf(local, sdata) + tsf_is_delta * tsf;
+ if (local->ops->set_tsf) {
+ drv_set_tsf(local, sdata, tsf);
+ wiphy_info(local->hw.wiphy,
+ "debugfs set TSF to %#018llx\n", tsf);
+ }
+ }
+
+ ieee80211_recalc_dtim(local, sdata);
+ return buflen;
+}
+IEEE80211_IF_FILE_RW(tsf);
+
+
+/* WDS attributes */
+IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC);
+
+#ifdef CONFIG_MAC80211_MESH
+IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC);
+
+/* Mesh stats attributes */
+IEEE80211_IF_FILE(fwded_mcast, u.mesh.mshstats.fwded_mcast, DEC);
+IEEE80211_IF_FILE(fwded_unicast, u.mesh.mshstats.fwded_unicast, DEC);
+IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC);
+IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC);
+IEEE80211_IF_FILE(dropped_frames_congestion,
+ u.mesh.mshstats.dropped_frames_congestion, DEC);
+IEEE80211_IF_FILE(dropped_frames_no_route,
+ u.mesh.mshstats.dropped_frames_no_route, DEC);
+
+/* Mesh parameters */
+IEEE80211_IF_FILE(dot11MeshMaxRetries,
+ u.mesh.mshcfg.dot11MeshMaxRetries, DEC);
+IEEE80211_IF_FILE(dot11MeshRetryTimeout,
+ u.mesh.mshcfg.dot11MeshRetryTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshConfirmTimeout,
+ u.mesh.mshcfg.dot11MeshConfirmTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHoldingTimeout,
+ u.mesh.mshcfg.dot11MeshHoldingTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshTTL, u.mesh.mshcfg.dot11MeshTTL, DEC);
+IEEE80211_IF_FILE(element_ttl, u.mesh.mshcfg.element_ttl, DEC);
+IEEE80211_IF_FILE(auto_open_plinks, u.mesh.mshcfg.auto_open_plinks, DEC);
+IEEE80211_IF_FILE(dot11MeshMaxPeerLinks,
+ u.mesh.mshcfg.dot11MeshMaxPeerLinks, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPactivePathTimeout,
+ u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPpreqMinInterval,
+ u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPperrMinInterval,
+ u.mesh.mshcfg.dot11MeshHWMPperrMinInterval, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPnetDiameterTraversalTime,
+ u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPmaxPREQretries,
+ u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries, DEC);
+IEEE80211_IF_FILE(path_refresh_time,
+ u.mesh.mshcfg.path_refresh_time, DEC);
+IEEE80211_IF_FILE(min_discovery_timeout,
+ u.mesh.mshcfg.min_discovery_timeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPRootMode,
+ u.mesh.mshcfg.dot11MeshHWMPRootMode, DEC);
+IEEE80211_IF_FILE(dot11MeshGateAnnouncementProtocol,
+ u.mesh.mshcfg.dot11MeshGateAnnouncementProtocol, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPRannInterval,
+ u.mesh.mshcfg.dot11MeshHWMPRannInterval, DEC);
+IEEE80211_IF_FILE(dot11MeshForwarding, u.mesh.mshcfg.dot11MeshForwarding, DEC);
+IEEE80211_IF_FILE(rssi_threshold, u.mesh.mshcfg.rssi_threshold, DEC);
+IEEE80211_IF_FILE(ht_opmode, u.mesh.mshcfg.ht_opmode, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPactivePathToRootTimeout,
+ u.mesh.mshcfg.dot11MeshHWMPactivePathToRootTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMProotInterval,
+ u.mesh.mshcfg.dot11MeshHWMProotInterval, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval,
+ u.mesh.mshcfg.dot11MeshHWMPconfirmationInterval, DEC);
+IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC);
+IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration,
+ u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC);
+#endif
+
+#define DEBUGFS_ADD_MODE(name, mode) \
+ debugfs_create_file(#name, mode, sdata->vif.debugfs_dir, \
+ sdata, &name##_ops);
+
+#define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400)
+
+static void add_common_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD(rc_rateidx_mask_2ghz);
+ DEBUGFS_ADD(rc_rateidx_mask_5ghz);
+ DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz);
+ DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz);
+ DEBUGFS_ADD(hw_queues);
+}
+
+static void add_sta_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD(bssid);
+ DEBUGFS_ADD(aid);
+ DEBUGFS_ADD(last_beacon);
+ DEBUGFS_ADD(ave_beacon);
+ DEBUGFS_ADD(beacon_timeout);
+ DEBUGFS_ADD_MODE(smps, 0600);
+ DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+ DEBUGFS_ADD_MODE(beacon_loss, 0200);
+ DEBUGFS_ADD_MODE(uapsd_queues, 0600);
+ DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600);
+}
+
+static void add_ap_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD(num_mcast_sta);
+ DEBUGFS_ADD_MODE(smps, 0600);
+ DEBUGFS_ADD(num_sta_ps);
+ DEBUGFS_ADD(dtim_count);
+ DEBUGFS_ADD(num_buffered_multicast);
+ DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+}
+
+static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD_MODE(tsf, 0600);
+}
+
+static void add_wds_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD(peer);
+}
+
+#ifdef CONFIG_MAC80211_MESH
+
+static void add_mesh_files(struct ieee80211_sub_if_data *sdata)
+{
+ DEBUGFS_ADD_MODE(tsf, 0600);
+ DEBUGFS_ADD_MODE(estab_plinks, 0400);
+}
+
+static void add_mesh_stats(struct ieee80211_sub_if_data *sdata)
+{
+ struct dentry *dir = debugfs_create_dir("mesh_stats",
+ sdata->vif.debugfs_dir);
+#define MESHSTATS_ADD(name)\
+ debugfs_create_file(#name, 0400, dir, sdata, &name##_ops);
+
+ MESHSTATS_ADD(fwded_mcast);
+ MESHSTATS_ADD(fwded_unicast);
+ MESHSTATS_ADD(fwded_frames);
+ MESHSTATS_ADD(dropped_frames_ttl);
+ MESHSTATS_ADD(dropped_frames_no_route);
+ MESHSTATS_ADD(dropped_frames_congestion);
+#undef MESHSTATS_ADD
+}
+
+static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
+{
+ struct dentry *dir = debugfs_create_dir("mesh_config",
+ sdata->vif.debugfs_dir);
+
+#define MESHPARAMS_ADD(name) \
+ debugfs_create_file(#name, 0600, dir, sdata, &name##_ops);
+
+ MESHPARAMS_ADD(dot11MeshMaxRetries);
+ MESHPARAMS_ADD(dot11MeshRetryTimeout);
+ MESHPARAMS_ADD(dot11MeshConfirmTimeout);
+ MESHPARAMS_ADD(dot11MeshHoldingTimeout);
+ MESHPARAMS_ADD(dot11MeshTTL);
+ MESHPARAMS_ADD(element_ttl);
+ MESHPARAMS_ADD(auto_open_plinks);
+ MESHPARAMS_ADD(dot11MeshMaxPeerLinks);
+ MESHPARAMS_ADD(dot11MeshHWMPactivePathTimeout);
+ MESHPARAMS_ADD(dot11MeshHWMPpreqMinInterval);
+ MESHPARAMS_ADD(dot11MeshHWMPperrMinInterval);
+ MESHPARAMS_ADD(dot11MeshHWMPnetDiameterTraversalTime);
+ MESHPARAMS_ADD(dot11MeshHWMPmaxPREQretries);
+ MESHPARAMS_ADD(path_refresh_time);
+ MESHPARAMS_ADD(min_discovery_timeout);
+ MESHPARAMS_ADD(dot11MeshHWMPRootMode);
+ MESHPARAMS_ADD(dot11MeshHWMPRannInterval);
+ MESHPARAMS_ADD(dot11MeshForwarding);
+ MESHPARAMS_ADD(dot11MeshGateAnnouncementProtocol);
+ MESHPARAMS_ADD(rssi_threshold);
+ MESHPARAMS_ADD(ht_opmode);
+ MESHPARAMS_ADD(dot11MeshHWMPactivePathToRootTimeout);
+ MESHPARAMS_ADD(dot11MeshHWMProotInterval);
+ MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval);
+ MESHPARAMS_ADD(power_mode);
+ MESHPARAMS_ADD(dot11MeshAwakeWindowDuration);
+#undef MESHPARAMS_ADD
+}
+#endif
+
+static void add_files(struct ieee80211_sub_if_data *sdata)
+{
+ if (!sdata->vif.debugfs_dir)
+ return;
+
+ DEBUGFS_ADD(flags);
+ DEBUGFS_ADD(state);
+ DEBUGFS_ADD(txpower);
+ DEBUGFS_ADD(user_power_level);
+ DEBUGFS_ADD(ap_power_level);
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ add_common_files(sdata);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_MESH_POINT:
+#ifdef CONFIG_MAC80211_MESH
+ add_mesh_files(sdata);
+ add_mesh_stats(sdata);
+ add_mesh_config(sdata);
+#endif
+ break;
+ case NL80211_IFTYPE_STATION:
+ add_sta_files(sdata);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ add_ibss_files(sdata);
+ break;
+ case NL80211_IFTYPE_AP:
+ add_ap_files(sdata);
+ break;
+ case NL80211_IFTYPE_WDS:
+ add_wds_files(sdata);
+ break;
+ default:
+ break;
+ }
+}
+
+void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
+{
+ char buf[10+IFNAMSIZ];
+
+ sprintf(buf, "netdev:%s", sdata->name);
+ sdata->vif.debugfs_dir = debugfs_create_dir(buf,
+ sdata->local->hw.wiphy->debugfsdir);
+ if (sdata->vif.debugfs_dir)
+ sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+ sdata->vif.debugfs_dir);
+ add_files(sdata);
+}
+
+void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
+{
+ if (!sdata->vif.debugfs_dir)
+ return;
+
+ debugfs_remove_recursive(sdata->vif.debugfs_dir);
+ sdata->vif.debugfs_dir = NULL;
+}
+
+void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
+{
+ struct dentry *dir;
+ char buf[10 + IFNAMSIZ];
+
+ dir = sdata->vif.debugfs_dir;
+
+ if (!dir)
+ return;
+
+ sprintf(buf, "netdev:%s", sdata->name);
+ if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
+ sdata_err(sdata,
+ "debugfs: failed to rename debugfs dir to %s\n",
+ buf);
+}
diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h
new file mode 100644
index 000000000..9f5501a9a
--- /dev/null
+++ b/net/mac80211/debugfs_netdev.h
@@ -0,0 +1,24 @@
+/* routines exported for debugfs handling */
+
+#ifndef __IEEE80211_DEBUGFS_NETDEV_H
+#define __IEEE80211_DEBUGFS_NETDEV_H
+
+#include "ieee80211_i.h"
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata);
+#else
+static inline void ieee80211_debugfs_add_netdev(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_remove_netdev(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_rename_netdev(
+ struct ieee80211_sub_if_data *sdata)
+{}
+#endif
+
+#endif /* __IEEE80211_DEBUGFS_NETDEV_H */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
new file mode 100644
index 000000000..252859e90
--- /dev/null
+++ b/net/mac80211/debugfs_sta.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright 2003-2005 Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include "ieee80211_i.h"
+#include "debugfs.h"
+#include "debugfs_sta.h"
+#include "sta_info.h"
+#include "driver-ops.h"
+
+/* sta attributtes */
+
+#define STA_READ(name, field, format_string) \
+static ssize_t sta_ ##name## _read(struct file *file, \
+ char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct sta_info *sta = file->private_data; \
+ return mac80211_format_buffer(userbuf, count, ppos, \
+ format_string, sta->field); \
+}
+#define STA_READ_D(name, field) STA_READ(name, field, "%d\n")
+#define STA_READ_U(name, field) STA_READ(name, field, "%u\n")
+#define STA_READ_S(name, field) STA_READ(name, field, "%s\n")
+
+#define STA_OPS(name) \
+static const struct file_operations sta_ ##name## _ops = { \
+ .read = sta_##name##_read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+#define STA_OPS_RW(name) \
+static const struct file_operations sta_ ##name## _ops = { \
+ .read = sta_##name##_read, \
+ .write = sta_##name##_write, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+}
+
+#define STA_FILE(name, field, format) \
+ STA_READ_##format(name, field) \
+ STA_OPS(name)
+
+STA_FILE(aid, sta.aid, D);
+STA_FILE(dev, sdata->name, S);
+STA_FILE(last_signal, last_signal, D);
+STA_FILE(last_ack_signal, last_ack_signal, D);
+STA_FILE(beacon_loss_count, beacon_loss_count, D);
+
+static ssize_t sta_flags_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[121];
+ struct sta_info *sta = file->private_data;
+
+#define TEST(flg) \
+ test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : ""
+
+ int res = scnprintf(buf, sizeof(buf),
+ "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ TEST(AUTH), TEST(ASSOC), TEST(PS_STA),
+ TEST(PS_DRIVER), TEST(AUTHORIZED),
+ TEST(SHORT_PREAMBLE),
+ sta->sta.wme ? "WME\n" : "",
+ TEST(WDS), TEST(CLEAR_PS_FILT),
+ TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL),
+ TEST(UAPSD), TEST(SP), TEST(TDLS_PEER),
+ TEST(TDLS_PEER_AUTH), TEST(TDLS_INITIATOR),
+ TEST(TDLS_CHAN_SWITCH), TEST(TDLS_OFF_CHANNEL),
+ TEST(4ADDR_EVENT), TEST(INSERTED),
+ TEST(RATE_CONTROL), TEST(TOFFSET_KNOWN),
+ TEST(MPSP_OWNER), TEST(MPSP_RECIPIENT));
+#undef TEST
+ return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+STA_OPS(flags);
+
+static ssize_t sta_num_ps_buf_frames_read(struct file *file,
+ char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ char buf[17*IEEE80211_NUM_ACS], *p = buf;
+ int ac;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ p += scnprintf(p, sizeof(buf)+buf-p, "AC%d: %d\n", ac,
+ skb_queue_len(&sta->ps_tx_buf[ac]) +
+ skb_queue_len(&sta->tx_filtered[ac]));
+ return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(num_ps_buf_frames);
+
+static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ return mac80211_format_buffer(userbuf, count, ppos, "%d\n",
+ jiffies_to_msecs(jiffies - sta->last_rx));
+}
+STA_OPS(inactive_ms);
+
+
+static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct timespec uptime;
+ struct tm result;
+ long connected_time_secs;
+ char buf[100];
+ int res;
+ ktime_get_ts(&uptime);
+ connected_time_secs = uptime.tv_sec - sta->last_connected;
+ time_to_tm(connected_time_secs, 0, &result);
+ result.tm_year -= 70;
+ result.tm_mday -= 1;
+ res = scnprintf(buf, sizeof(buf),
+ "years - %ld\nmonths - %d\ndays - %d\nclock - %d:%d:%d\n\n",
+ result.tm_year, result.tm_mon, result.tm_mday,
+ result.tm_hour, result.tm_min, result.tm_sec);
+ return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+STA_OPS(connected_time);
+
+
+
+static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[15*IEEE80211_NUM_TIDS], *p = buf;
+ int i;
+ struct sta_info *sta = file->private_data;
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ p += scnprintf(p, sizeof(buf)+buf-p, "%x ",
+ le16_to_cpu(sta->last_seq_ctrl[i]));
+ p += scnprintf(p, sizeof(buf)+buf-p, "\n");
+ return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(last_seq_ctrl);
+
+static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[71 + IEEE80211_NUM_TIDS * 40], *p = buf;
+ int i;
+ struct sta_info *sta = file->private_data;
+ struct tid_ampdu_rx *tid_rx;
+ struct tid_ampdu_tx *tid_tx;
+
+ rcu_read_lock();
+
+ p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
+ sta->ampdu_mlme.dialog_token_allocator + 1);
+ p += scnprintf(p, sizeof(buf) + buf - p,
+ "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
+
+ p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+ tid_rx ? tid_rx->dialog_token : 0);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
+ tid_rx ? tid_rx->ssn : 0);
+
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_tx);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+ tid_tx ? tid_tx->dialog_token : 0);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d",
+ tid_tx ? skb_queue_len(&tid_tx->pending) : 0);
+ p += scnprintf(p, sizeof(buf) + buf - p, "\n");
+ }
+ rcu_read_unlock();
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+
+static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char _buf[12] = {}, *buf = _buf;
+ struct sta_info *sta = file->private_data;
+ bool start, tx;
+ unsigned long tid;
+ int ret;
+
+ if (count > sizeof(_buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, userbuf, count))
+ return -EFAULT;
+
+ buf[sizeof(_buf) - 1] = '\0';
+
+ if (strncmp(buf, "tx ", 3) == 0) {
+ buf += 3;
+ tx = true;
+ } else if (strncmp(buf, "rx ", 3) == 0) {
+ buf += 3;
+ tx = false;
+ } else
+ return -EINVAL;
+
+ if (strncmp(buf, "start ", 6) == 0) {
+ buf += 6;
+ start = true;
+ if (!tx)
+ return -EINVAL;
+ } else if (strncmp(buf, "stop ", 5) == 0) {
+ buf += 5;
+ start = false;
+ } else
+ return -EINVAL;
+
+ ret = kstrtoul(buf, 0, &tid);
+ if (ret)
+ return ret;
+
+ if (tid >= IEEE80211_NUM_TIDS)
+ return -EINVAL;
+
+ if (tx) {
+ if (start)
+ ret = ieee80211_start_tx_ba_session(&sta->sta, tid, 5000);
+ else
+ ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
+ } else {
+ __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+ 3, true);
+ ret = 0;
+ }
+
+ return ret ?: count;
+}
+STA_OPS_RW(agg_status);
+
+static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+#define PRINT_HT_CAP(_cond, _str) \
+ do { \
+ if (_cond) \
+ p += scnprintf(p, sizeof(buf)+buf-p, "\t" _str "\n"); \
+ } while (0)
+ char buf[512], *p = buf;
+ int i;
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap;
+
+ p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n",
+ htc->ht_supported ? "" : "not ");
+ if (htc->ht_supported) {
+ p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.4x\n", htc->cap);
+
+ PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC");
+ PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40");
+ PRINT_HT_CAP(!(htc->cap & BIT(1)), "HT20");
+
+ PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 0, "Static SM Power Save");
+ PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 1, "Dynamic SM Power Save");
+ PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 3, "SM Power Save disabled");
+
+ PRINT_HT_CAP((htc->cap & BIT(4)), "RX Greenfield");
+ PRINT_HT_CAP((htc->cap & BIT(5)), "RX HT20 SGI");
+ PRINT_HT_CAP((htc->cap & BIT(6)), "RX HT40 SGI");
+ PRINT_HT_CAP((htc->cap & BIT(7)), "TX STBC");
+
+ PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 0, "No RX STBC");
+ PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 1, "RX STBC 1-stream");
+ PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 2, "RX STBC 2-streams");
+ PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 3, "RX STBC 3-streams");
+
+ PRINT_HT_CAP((htc->cap & BIT(10)), "HT Delayed Block Ack");
+
+ PRINT_HT_CAP(!(htc->cap & BIT(11)), "Max AMSDU length: "
+ "3839 bytes");
+ PRINT_HT_CAP((htc->cap & BIT(11)), "Max AMSDU length: "
+ "7935 bytes");
+
+ /*
+ * For beacons and probe response this would mean the BSS
+ * does or does not allow the usage of DSSS/CCK HT40.
+ * Otherwise it means the STA does or does not use
+ * DSSS/CCK HT40.
+ */
+ PRINT_HT_CAP((htc->cap & BIT(12)), "DSSS/CCK HT40");
+ PRINT_HT_CAP(!(htc->cap & BIT(12)), "No DSSS/CCK HT40");
+
+ /* BIT(13) is reserved */
+
+ PRINT_HT_CAP((htc->cap & BIT(14)), "40 MHz Intolerant");
+
+ PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection");
+
+ p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n",
+ htc->ampdu_factor, htc->ampdu_density);
+ p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:");
+
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
+ p += scnprintf(p, sizeof(buf)+buf-p, " %.2x",
+ htc->mcs.rx_mask[i]);
+ p += scnprintf(p, sizeof(buf)+buf-p, "\n");
+
+ /* If not set this is meaningless */
+ if (le16_to_cpu(htc->mcs.rx_highest)) {
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "MCS rx highest: %d Mbps\n",
+ le16_to_cpu(htc->mcs.rx_highest));
+ }
+
+ p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n",
+ htc->mcs.tx_params);
+ }
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(ht_capa);
+
+static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[128], *p = buf;
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap;
+
+ p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n",
+ vhtc->vht_supported ? "" : "not ");
+ if (vhtc->vht_supported) {
+ p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.8x\n", vhtc->cap);
+
+ p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n",
+ le16_to_cpu(vhtc->vht_mcs.rx_mcs_map));
+ if (vhtc->vht_mcs.rx_highest)
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "MCS RX highest: %d Mbps\n",
+ le16_to_cpu(vhtc->vht_mcs.rx_highest));
+ p += scnprintf(p, sizeof(buf)+buf-p, "TX MCS: %.4x\n",
+ le16_to_cpu(vhtc->vht_mcs.tx_mcs_map));
+ if (vhtc->vht_mcs.tx_highest)
+ p += scnprintf(p, sizeof(buf)+buf-p,
+ "MCS TX highest: %d Mbps\n",
+ le16_to_cpu(vhtc->vht_mcs.tx_highest));
+ }
+
+ return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(vht_capa);
+
+static ssize_t sta_current_tx_rate_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct rate_info rinfo;
+ u16 rate;
+ sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo);
+ rate = cfg80211_calculate_bitrate(&rinfo);
+
+ return mac80211_format_buffer(userbuf, count, ppos,
+ "%d.%d MBit/s\n",
+ rate/10, rate%10);
+}
+STA_OPS(current_tx_rate);
+
+static ssize_t sta_last_rx_rate_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct rate_info rinfo;
+ u16 rate;
+
+ sta_set_rate_info_rx(sta, &rinfo);
+
+ rate = cfg80211_calculate_bitrate(&rinfo);
+
+ return mac80211_format_buffer(userbuf, count, ppos,
+ "%d.%d MBit/s\n",
+ rate/10, rate%10);
+}
+STA_OPS(last_rx_rate);
+
+#define DEBUGFS_ADD(name) \
+ debugfs_create_file(#name, 0400, \
+ sta->debugfs.dir, sta, &sta_ ##name## _ops);
+
+#define DEBUGFS_ADD_COUNTER(name, field) \
+ if (sizeof(sta->field) == sizeof(u32)) \
+ debugfs_create_u32(#name, 0400, sta->debugfs.dir, \
+ (u32 *) &sta->field); \
+ else \
+ debugfs_create_u64(#name, 0400, sta->debugfs.dir, \
+ (u64 *) &sta->field);
+
+void ieee80211_sta_debugfs_add(struct sta_info *sta)
+{
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
+ u8 mac[3*ETH_ALEN];
+
+ sta->debugfs.add_has_run = true;
+
+ if (!stations_dir)
+ return;
+
+ snprintf(mac, sizeof(mac), "%pM", sta->sta.addr);
+
+ /*
+ * This might fail due to a race condition:
+ * When mac80211 unlinks a station, the debugfs entries
+ * remain, but it is already possible to link a new
+ * station with the same address which triggers adding
+ * it to debugfs; therefore, if the old station isn't
+ * destroyed quickly enough the old station's debugfs
+ * dir might still be around.
+ */
+ sta->debugfs.dir = debugfs_create_dir(mac, stations_dir);
+ if (!sta->debugfs.dir)
+ return;
+
+ DEBUGFS_ADD(flags);
+ DEBUGFS_ADD(num_ps_buf_frames);
+ DEBUGFS_ADD(inactive_ms);
+ DEBUGFS_ADD(connected_time);
+ DEBUGFS_ADD(last_seq_ctrl);
+ DEBUGFS_ADD(agg_status);
+ DEBUGFS_ADD(dev);
+ DEBUGFS_ADD(last_signal);
+ DEBUGFS_ADD(beacon_loss_count);
+ DEBUGFS_ADD(ht_capa);
+ DEBUGFS_ADD(vht_capa);
+ DEBUGFS_ADD(last_ack_signal);
+ DEBUGFS_ADD(current_tx_rate);
+ DEBUGFS_ADD(last_rx_rate);
+
+ DEBUGFS_ADD_COUNTER(rx_packets, rx_packets);
+ DEBUGFS_ADD_COUNTER(tx_packets, tx_packets);
+ DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes);
+ DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes);
+ DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates);
+ DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments);
+ DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped);
+ DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments);
+ DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count);
+ DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed);
+ DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count);
+
+ if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
+ debugfs_create_x32("driver_buffered_tids", 0400,
+ sta->debugfs.dir,
+ (u32 *)&sta->driver_buffered_tids);
+ else
+ debugfs_create_x64("driver_buffered_tids", 0400,
+ sta->debugfs.dir,
+ (u64 *)&sta->driver_buffered_tids);
+
+ drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs.dir);
+}
+
+void ieee80211_sta_debugfs_remove(struct sta_info *sta)
+{
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+ drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs.dir);
+ debugfs_remove_recursive(sta->debugfs.dir);
+ sta->debugfs.dir = NULL;
+}
diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h
new file mode 100644
index 000000000..8b6089032
--- /dev/null
+++ b/net/mac80211/debugfs_sta.h
@@ -0,0 +1,14 @@
+#ifndef __MAC80211_DEBUGFS_STA_H
+#define __MAC80211_DEBUGFS_STA_H
+
+#include "sta_info.h"
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_sta_debugfs_add(struct sta_info *sta);
+void ieee80211_sta_debugfs_remove(struct sta_info *sta);
+#else
+static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {}
+static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_STA_H */
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
new file mode 100644
index 000000000..26e1ca8a4
--- /dev/null
+++ b/net/mac80211/driver-ops.h
@@ -0,0 +1,1382 @@
+#ifndef __MAC80211_DRIVER_OPS
+#define __MAC80211_DRIVER_OPS
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "trace.h"
+
+static inline bool check_sdata_in_driver(struct ieee80211_sub_if_data *sdata)
+{
+ return !WARN(!(sdata->flags & IEEE80211_SDATA_IN_DRIVER),
+ "%s: Failed check-sdata-in-driver check, flags: 0x%x\n",
+ sdata->dev ? sdata->dev->name : sdata->name, sdata->flags);
+}
+
+static inline struct ieee80211_sub_if_data *
+get_bss_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+ u.ap);
+
+ return sdata;
+}
+
+static inline void drv_tx(struct ieee80211_local *local,
+ struct ieee80211_tx_control *control,
+ struct sk_buff *skb)
+{
+ local->ops->tx(&local->hw, control, skb);
+}
+
+static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata,
+ u32 sset, u8 *data)
+{
+ struct ieee80211_local *local = sdata->local;
+ if (local->ops->get_et_strings) {
+ trace_drv_get_et_strings(local, sset);
+ local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data);
+ trace_drv_return_void(local);
+ }
+}
+
+static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct ieee80211_local *local = sdata->local;
+ if (local->ops->get_et_stats) {
+ trace_drv_get_et_stats(local);
+ local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data);
+ trace_drv_return_void(local);
+ }
+}
+
+static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata,
+ int sset)
+{
+ struct ieee80211_local *local = sdata->local;
+ int rv = 0;
+ if (local->ops->get_et_sset_count) {
+ trace_drv_get_et_sset_count(local, sset);
+ rv = local->ops->get_et_sset_count(&local->hw, &sdata->vif,
+ sset);
+ trace_drv_return_int(local, rv);
+ }
+ return rv;
+}
+
+static inline int drv_start(struct ieee80211_local *local)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_start(local);
+ local->started = true;
+ smp_mb();
+ ret = local->ops->start(&local->hw);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_stop(struct ieee80211_local *local)
+{
+ might_sleep();
+
+ trace_drv_stop(local);
+ local->ops->stop(&local->hw);
+ trace_drv_return_void(local);
+
+ /* sync away all work on the tasklet before clearing started */
+ tasklet_disable(&local->tasklet);
+ tasklet_enable(&local->tasklet);
+
+ barrier();
+
+ local->started = false;
+}
+
+#ifdef CONFIG_PM
+static inline int drv_suspend(struct ieee80211_local *local,
+ struct cfg80211_wowlan *wowlan)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_suspend(local);
+ ret = local->ops->suspend(&local->hw, wowlan);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_resume(struct ieee80211_local *local)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_resume(local);
+ ret = local->ops->resume(&local->hw);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_set_wakeup(struct ieee80211_local *local,
+ bool enabled)
+{
+ might_sleep();
+
+ if (!local->ops->set_wakeup)
+ return;
+
+ trace_drv_set_wakeup(local, enabled);
+ local->ops->set_wakeup(&local->hw, enabled);
+ trace_drv_return_void(local);
+}
+#endif
+
+static inline int drv_add_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ int ret;
+
+ might_sleep();
+
+ if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF) &&
+ !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))))
+ return -EINVAL;
+
+ trace_drv_add_interface(local, sdata);
+ ret = local->ops->add_interface(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+
+ if (ret == 0)
+ sdata->flags |= IEEE80211_SDATA_IN_DRIVER;
+
+ return ret;
+}
+
+static inline int drv_change_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p)
+{
+ int ret;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_change_interface(local, sdata, type, p2p);
+ ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_remove_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_remove_interface(local, sdata);
+ local->ops->remove_interface(&local->hw, &sdata->vif);
+ sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
+ trace_drv_return_void(local);
+}
+
+static inline int drv_config(struct ieee80211_local *local, u32 changed)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_config(local, changed);
+ ret = local->ops->config(&local->hw, changed);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_bss_info_changed(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss_conf *info,
+ u32 changed)
+{
+ might_sleep();
+
+ if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON |
+ BSS_CHANGED_BEACON_ENABLED) &&
+ sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+ sdata->vif.type != NL80211_IFTYPE_OCB))
+ return;
+
+ if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE ||
+ sdata->vif.type == NL80211_IFTYPE_MONITOR))
+ return;
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_bss_info_changed(local, sdata, info, changed);
+ if (local->ops->bss_info_changed)
+ local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed);
+ trace_drv_return_void(local);
+}
+
+static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
+ struct netdev_hw_addr_list *mc_list)
+{
+ u64 ret = 0;
+
+ trace_drv_prepare_multicast(local, mc_list->count);
+
+ if (local->ops->prepare_multicast)
+ ret = local->ops->prepare_multicast(&local->hw, mc_list);
+
+ trace_drv_return_u64(local, ret);
+
+ return ret;
+}
+
+static inline void drv_configure_filter(struct ieee80211_local *local,
+ unsigned int changed_flags,
+ unsigned int *total_flags,
+ u64 multicast)
+{
+ might_sleep();
+
+ trace_drv_configure_filter(local, changed_flags, total_flags,
+ multicast);
+ local->ops->configure_filter(&local->hw, changed_flags, total_flags,
+ multicast);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_set_tim(struct ieee80211_local *local,
+ struct ieee80211_sta *sta, bool set)
+{
+ int ret = 0;
+ trace_drv_set_tim(local, sta, set);
+ if (local->ops->set_tim)
+ ret = local->ops->set_tim(&local->hw, sta, set);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_set_key(struct ieee80211_local *local,
+ enum set_key_cmd cmd,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct ieee80211_key_conf *key)
+{
+ int ret;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_set_key(local, cmd, sdata, sta, key);
+ ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_update_tkip_key(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_key_conf *conf,
+ struct sta_info *sta, u32 iv32,
+ u16 *phase1key)
+{
+ struct ieee80211_sta *ista = NULL;
+
+ if (sta)
+ ista = &sta->sta;
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_update_tkip_key(local, sdata, conf, ista, iv32);
+ if (local->ops->update_tkip_key)
+ local->ops->update_tkip_key(&local->hw, &sdata->vif, conf,
+ ista, iv32, phase1key);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_hw_scan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_scan_request *req)
+{
+ int ret;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_hw_scan(local, sdata);
+ ret = local->ops->hw_scan(&local->hw, &sdata->vif, req);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_cancel_hw_scan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_cancel_hw_scan(local, sdata);
+ local->ops->cancel_hw_scan(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline int
+drv_sched_scan_start(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_sched_scan_request *req,
+ struct ieee80211_scan_ies *ies)
+{
+ int ret;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_sched_scan_start(local, sdata);
+ ret = local->ops->sched_scan_start(&local->hw, &sdata->vif,
+ req, ies);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_sched_scan_stop(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ int ret;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_sched_scan_stop(local, sdata);
+ ret = local->ops->sched_scan_stop(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_sw_scan_start(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *mac_addr)
+{
+ might_sleep();
+
+ trace_drv_sw_scan_start(local, sdata, mac_addr);
+ if (local->ops->sw_scan_start)
+ local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_sw_scan_complete(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ trace_drv_sw_scan_complete(local, sdata);
+ if (local->ops->sw_scan_complete)
+ local->ops->sw_scan_complete(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_get_stats(struct ieee80211_local *local,
+ struct ieee80211_low_level_stats *stats)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ if (local->ops->get_stats)
+ ret = local->ops->get_stats(&local->hw, stats);
+ trace_drv_get_stats(local, stats, ret);
+
+ return ret;
+}
+
+static inline void drv_get_tkip_seq(struct ieee80211_local *local,
+ u8 hw_key_idx, u32 *iv32, u16 *iv16)
+{
+ if (local->ops->get_tkip_seq)
+ local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16);
+ trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16);
+}
+
+static inline int drv_set_frag_threshold(struct ieee80211_local *local,
+ u32 value)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ trace_drv_set_frag_threshold(local, value);
+ if (local->ops->set_frag_threshold)
+ ret = local->ops->set_frag_threshold(&local->hw, value);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_set_rts_threshold(struct ieee80211_local *local,
+ u32 value)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ trace_drv_set_rts_threshold(local, value);
+ if (local->ops->set_rts_threshold)
+ ret = local->ops->set_rts_threshold(&local->hw, value);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_set_coverage_class(struct ieee80211_local *local,
+ s16 value)
+{
+ int ret = 0;
+ might_sleep();
+
+ trace_drv_set_coverage_class(local, value);
+ if (local->ops->set_coverage_class)
+ local->ops->set_coverage_class(&local->hw, value);
+ else
+ ret = -EOPNOTSUPP;
+
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_sta_notify(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum sta_notify_cmd cmd,
+ struct ieee80211_sta *sta)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_notify(local, sdata, cmd, sta);
+ if (local->ops->sta_notify)
+ local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_sta_add(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_sta_add(local, sdata, sta);
+ if (local->ops->sta_add)
+ ret = local->ops->sta_add(&local->hw, &sdata->vif, sta);
+
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_sta_remove(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta)
+{
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_remove(local, sdata, sta);
+ if (local->ops->sta_remove)
+ local->ops->sta_remove(&local->hw, &sdata->vif, sta);
+
+ trace_drv_return_void(local);
+}
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct dentry *dir)
+{
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ if (local->ops->sta_add_debugfs)
+ local->ops->sta_add_debugfs(&local->hw, &sdata->vif,
+ sta, dir);
+}
+
+static inline void drv_sta_remove_debugfs(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct dentry *dir)
+{
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ check_sdata_in_driver(sdata);
+
+ if (local->ops->sta_remove_debugfs)
+ local->ops->sta_remove_debugfs(&local->hw, &sdata->vif,
+ sta, dir);
+}
+#endif
+
+static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta);
+ if (local->ops->sta_pre_rcu_remove)
+ local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif,
+ &sta->sta);
+ trace_drv_return_void(local);
+}
+
+static inline __must_check
+int drv_sta_state(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ enum ieee80211_sta_state old_state,
+ enum ieee80211_sta_state new_state)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state);
+ if (local->ops->sta_state) {
+ ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta,
+ old_state, new_state);
+ } else if (old_state == IEEE80211_STA_AUTH &&
+ new_state == IEEE80211_STA_ASSOC) {
+ ret = drv_sta_add(local, sdata, &sta->sta);
+ if (ret == 0)
+ sta->uploaded = true;
+ } else if (old_state == IEEE80211_STA_ASSOC &&
+ new_state == IEEE80211_STA_AUTH) {
+ drv_sta_remove(local, sdata, &sta->sta);
+ }
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_sta_rc_update(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u32 changed)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED &&
+ (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT));
+
+ trace_drv_sta_rc_update(local, sdata, sta, changed);
+ if (local->ops->sta_rc_update)
+ local->ops->sta_rc_update(&local->hw, &sdata->vif,
+ sta, changed);
+
+ trace_drv_return_void(local);
+}
+
+static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_rate_tbl_update(local, sdata, sta);
+ if (local->ops->sta_rate_tbl_update)
+ local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta);
+
+ trace_drv_return_void(local);
+}
+
+static inline void drv_sta_statistics(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct station_info *sinfo)
+{
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_sta_statistics(local, sdata, sta);
+ if (local->ops->sta_statistics)
+ local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_conf_tx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, u16 ac,
+ const struct ieee80211_tx_queue_params *params)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ if (WARN_ONCE(params->cw_min == 0 ||
+ params->cw_min > params->cw_max,
+ "%s: invalid CW_min/CW_max: %d/%d\n",
+ sdata->name, params->cw_min, params->cw_max))
+ return -EINVAL;
+
+ trace_drv_conf_tx(local, sdata, ac, params);
+ if (local->ops->conf_tx)
+ ret = local->ops->conf_tx(&local->hw, &sdata->vif,
+ ac, params);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline u64 drv_get_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ u64 ret = -1ULL;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return ret;
+
+ trace_drv_get_tsf(local, sdata);
+ if (local->ops->get_tsf)
+ ret = local->ops->get_tsf(&local->hw, &sdata->vif);
+ trace_drv_return_u64(local, ret);
+ return ret;
+}
+
+static inline void drv_set_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u64 tsf)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_set_tsf(local, sdata, tsf);
+ if (local->ops->set_tsf)
+ local->ops->set_tsf(&local->hw, &sdata->vif, tsf);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_reset_tsf(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_reset_tsf(local, sdata);
+ if (local->ops->reset_tsf)
+ local->ops->reset_tsf(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_tx_last_beacon(struct ieee80211_local *local)
+{
+ int ret = 0; /* default unsupported op for less congestion */
+
+ might_sleep();
+
+ trace_drv_tx_last_beacon(local);
+ if (local->ops->tx_last_beacon)
+ ret = local->ops->tx_last_beacon(&local->hw);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_ampdu_action(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_ampdu_mlme_action action,
+ struct ieee80211_sta *sta, u16 tid,
+ u16 *ssn, u8 buf_size)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ sdata = get_bss_sdata(sdata);
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size);
+
+ if (local->ops->ampdu_action)
+ ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
+ sta, tid, ssn, buf_size);
+
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline int drv_get_survey(struct ieee80211_local *local, int idx,
+ struct survey_info *survey)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_drv_get_survey(local, idx, survey);
+
+ if (local->ops->get_survey)
+ ret = local->ops->get_survey(&local->hw, idx, survey);
+
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_rfkill_poll(struct ieee80211_local *local)
+{
+ might_sleep();
+
+ if (local->ops->rfkill_poll)
+ local->ops->rfkill_poll(&local->hw);
+}
+
+static inline void drv_flush(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u32 queues, bool drop)
+{
+ struct ieee80211_vif *vif = sdata ? &sdata->vif : NULL;
+
+ might_sleep();
+
+ if (sdata && !check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_flush(local, queues, drop);
+ if (local->ops->flush)
+ local->ops->flush(&local->hw, vif, queues, drop);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_channel_switch(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel_switch *ch_switch)
+{
+ might_sleep();
+
+ trace_drv_channel_switch(local, sdata, ch_switch);
+ local->ops->channel_switch(&local->hw, &sdata->vif, ch_switch);
+ trace_drv_return_void(local);
+}
+
+
+static inline int drv_set_antenna(struct ieee80211_local *local,
+ u32 tx_ant, u32 rx_ant)
+{
+ int ret = -EOPNOTSUPP;
+ might_sleep();
+ if (local->ops->set_antenna)
+ ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant);
+ trace_drv_set_antenna(local, tx_ant, rx_ant, ret);
+ return ret;
+}
+
+static inline int drv_get_antenna(struct ieee80211_local *local,
+ u32 *tx_ant, u32 *rx_ant)
+{
+ int ret = -EOPNOTSUPP;
+ might_sleep();
+ if (local->ops->get_antenna)
+ ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant);
+ trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret);
+ return ret;
+}
+
+static inline int drv_remain_on_channel(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *chan,
+ unsigned int duration,
+ enum ieee80211_roc_type type)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_remain_on_channel(local, sdata, chan, duration, type);
+ ret = local->ops->remain_on_channel(&local->hw, &sdata->vif,
+ chan, duration, type);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_cancel_remain_on_channel(local);
+ ret = local->ops->cancel_remain_on_channel(&local->hw);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline int drv_set_ringparam(struct ieee80211_local *local,
+ u32 tx, u32 rx)
+{
+ int ret = -ENOTSUPP;
+
+ might_sleep();
+
+ trace_drv_set_ringparam(local, tx, rx);
+ if (local->ops->set_ringparam)
+ ret = local->ops->set_ringparam(&local->hw, tx, rx);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_get_ringparam(struct ieee80211_local *local,
+ u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max)
+{
+ might_sleep();
+
+ trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max);
+ if (local->ops->get_ringparam)
+ local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max);
+ trace_drv_return_void(local);
+}
+
+static inline bool drv_tx_frames_pending(struct ieee80211_local *local)
+{
+ bool ret = false;
+
+ might_sleep();
+
+ trace_drv_tx_frames_pending(local);
+ if (local->ops->tx_frames_pending)
+ ret = local->ops->tx_frames_pending(&local->hw);
+ trace_drv_return_bool(local, ret);
+
+ return ret;
+}
+
+static inline int drv_set_bitrate_mask(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_bitrate_mask *mask)
+{
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_set_bitrate_mask(local, sdata, mask);
+ if (local->ops->set_bitrate_mask)
+ ret = local->ops->set_bitrate_mask(&local->hw,
+ &sdata->vif, mask);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_set_rekey_data(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_gtk_rekey_data *data)
+{
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_set_rekey_data(local, sdata, data);
+ if (local->ops->set_rekey_data)
+ local->ops->set_rekey_data(&local->hw, &sdata->vif, data);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_event_callback(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct ieee80211_event *event)
+{
+ trace_drv_event_callback(local, sdata, event);
+ if (local->ops->event_callback)
+ local->ops->event_callback(&local->hw, &sdata->vif, event);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_release_buffered_frames(struct ieee80211_local *local,
+ struct sta_info *sta, u16 tids, int num_frames,
+ enum ieee80211_frame_release_type reason,
+ bool more_data)
+{
+ trace_drv_release_buffered_frames(local, &sta->sta, tids, num_frames,
+ reason, more_data);
+ if (local->ops->release_buffered_frames)
+ local->ops->release_buffered_frames(&local->hw, &sta->sta, tids,
+ num_frames, reason,
+ more_data);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_allow_buffered_frames(struct ieee80211_local *local,
+ struct sta_info *sta, u16 tids, int num_frames,
+ enum ieee80211_frame_release_type reason,
+ bool more_data)
+{
+ trace_drv_allow_buffered_frames(local, &sta->sta, tids, num_frames,
+ reason, more_data);
+ if (local->ops->allow_buffered_frames)
+ local->ops->allow_buffered_frames(&local->hw, &sta->sta,
+ tids, num_frames, reason,
+ more_data);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+ WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
+
+ trace_drv_mgd_prepare_tx(local, sdata);
+ if (local->ops->mgd_prepare_tx)
+ local->ops->mgd_prepare_tx(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_mgd_protect_tdls_discover(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+ WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
+
+ trace_drv_mgd_protect_tdls_discover(local, sdata);
+ if (local->ops->mgd_protect_tdls_discover)
+ local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline int drv_add_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_drv_add_chanctx(local, ctx);
+ if (local->ops->add_chanctx)
+ ret = local->ops->add_chanctx(&local->hw, &ctx->conf);
+ trace_drv_return_int(local, ret);
+ if (!ret)
+ ctx->driver_present = true;
+
+ return ret;
+}
+
+static inline void drv_remove_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ if (WARN_ON(!ctx->driver_present))
+ return;
+
+ trace_drv_remove_chanctx(local, ctx);
+ if (local->ops->remove_chanctx)
+ local->ops->remove_chanctx(&local->hw, &ctx->conf);
+ trace_drv_return_void(local);
+ ctx->driver_present = false;
+}
+
+static inline void drv_change_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ u32 changed)
+{
+ trace_drv_change_chanctx(local, ctx, changed);
+ if (local->ops->change_chanctx) {
+ WARN_ON_ONCE(!ctx->driver_present);
+ local->ops->change_chanctx(&local->hw, &ctx->conf, changed);
+ }
+ trace_drv_return_void(local);
+}
+
+static inline int drv_assign_vif_chanctx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *ctx)
+{
+ int ret = 0;
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_assign_vif_chanctx(local, sdata, ctx);
+ if (local->ops->assign_vif_chanctx) {
+ WARN_ON_ONCE(!ctx->driver_present);
+ ret = local->ops->assign_vif_chanctx(&local->hw,
+ &sdata->vif,
+ &ctx->conf);
+ }
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline void drv_unassign_vif_chanctx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *ctx)
+{
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_unassign_vif_chanctx(local, sdata, ctx);
+ if (local->ops->unassign_vif_chanctx) {
+ WARN_ON_ONCE(!ctx->driver_present);
+ local->ops->unassign_vif_chanctx(&local->hw,
+ &sdata->vif,
+ &ctx->conf);
+ }
+ trace_drv_return_void(local);
+}
+
+static inline int
+drv_switch_vif_chanctx(struct ieee80211_local *local,
+ struct ieee80211_vif_chanctx_switch *vifs,
+ int n_vifs,
+ enum ieee80211_chanctx_switch_mode mode)
+{
+ int ret = 0;
+ int i;
+
+ if (!local->ops->switch_vif_chanctx)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < n_vifs; i++) {
+ struct ieee80211_chanctx *new_ctx =
+ container_of(vifs[i].new_ctx,
+ struct ieee80211_chanctx,
+ conf);
+ struct ieee80211_chanctx *old_ctx =
+ container_of(vifs[i].old_ctx,
+ struct ieee80211_chanctx,
+ conf);
+
+ WARN_ON_ONCE(!old_ctx->driver_present);
+ WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS &&
+ new_ctx->driver_present) ||
+ (mode == CHANCTX_SWMODE_REASSIGN_VIF &&
+ !new_ctx->driver_present));
+ }
+
+ trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode);
+ ret = local->ops->switch_vif_chanctx(&local->hw,
+ vifs, n_vifs, mode);
+ trace_drv_return_int(local, ret);
+
+ if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) {
+ for (i = 0; i < n_vifs; i++) {
+ struct ieee80211_chanctx *new_ctx =
+ container_of(vifs[i].new_ctx,
+ struct ieee80211_chanctx,
+ conf);
+ struct ieee80211_chanctx *old_ctx =
+ container_of(vifs[i].old_ctx,
+ struct ieee80211_chanctx,
+ conf);
+
+ new_ctx->driver_present = true;
+ old_ctx->driver_present = false;
+ }
+ }
+
+ return ret;
+}
+
+static inline int drv_start_ap(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ int ret = 0;
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_start_ap(local, sdata, &sdata->vif.bss_conf);
+ if (local->ops->start_ap)
+ ret = local->ops->start_ap(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_stop_ap(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_stop_ap(local, sdata);
+ if (local->ops->stop_ap)
+ local->ops->stop_ap(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_reconfig_complete(struct ieee80211_local *local,
+ enum ieee80211_reconfig_type reconfig_type)
+{
+ might_sleep();
+
+ trace_drv_reconfig_complete(local, reconfig_type);
+ if (local->ops->reconfig_complete)
+ local->ops->reconfig_complete(&local->hw, reconfig_type);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_set_default_unicast_key(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int key_idx)
+{
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ WARN_ON_ONCE(key_idx < -1 || key_idx > 3);
+
+ trace_drv_set_default_unicast_key(local, sdata, key_idx);
+ if (local->ops->set_default_unicast_key)
+ local->ops->set_default_unicast_key(&local->hw, &sdata->vif,
+ key_idx);
+ trace_drv_return_void(local);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline void drv_ipv6_addr_change(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct inet6_dev *idev)
+{
+ trace_drv_ipv6_addr_change(local, sdata);
+ if (local->ops->ipv6_addr_change)
+ local->ops->ipv6_addr_change(&local->hw, &sdata->vif, idev);
+ trace_drv_return_void(local);
+}
+#endif
+
+static inline void
+drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ if (local->ops->channel_switch_beacon) {
+ trace_drv_channel_switch_beacon(local, sdata, chandef);
+ local->ops->channel_switch_beacon(&local->hw, &sdata->vif,
+ chandef);
+ }
+}
+
+static inline int
+drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel_switch *ch_switch)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret = 0;
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_pre_channel_switch(local, sdata, ch_switch);
+ if (local->ops->pre_channel_switch)
+ ret = local->ops->pre_channel_switch(&local->hw, &sdata->vif,
+ ch_switch);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int
+drv_post_channel_switch(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret = 0;
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_post_channel_switch(local, sdata);
+ if (local->ops->post_channel_switch)
+ ret = local->ops->post_channel_switch(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline int drv_join_ibss(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ int ret = 0;
+
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf);
+ if (local->ops->join_ibss)
+ ret = local->ops->join_ibss(&local->hw, &sdata->vif);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void drv_leave_ibss(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_leave_ibss(local, sdata);
+ if (local->ops->leave_ibss)
+ local->ops->leave_ibss(&local->hw, &sdata->vif);
+ trace_drv_return_void(local);
+}
+
+static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
+ struct ieee80211_sta *sta)
+{
+ u32 ret = 0;
+
+ trace_drv_get_expected_throughput(sta);
+ if (local->ops->get_expected_throughput)
+ ret = local->ops->get_expected_throughput(sta);
+ trace_drv_return_u32(local, ret);
+
+ return ret;
+}
+
+static inline int drv_get_txpower(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, int *dbm)
+{
+ int ret;
+
+ if (!local->ops->get_txpower)
+ return -EOPNOTSUPP;
+
+ ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm);
+ trace_drv_get_txpower(local, sdata, *dbm, ret);
+
+ return ret;
+}
+
+static inline int
+drv_tdls_channel_switch(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u8 oper_class,
+ struct cfg80211_chan_def *chandef,
+ struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie)
+{
+ int ret;
+
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ if (!local->ops->tdls_channel_switch)
+ return -EOPNOTSUPP;
+
+ trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef);
+ ret = local->ops->tdls_channel_switch(&local->hw, &sdata->vif, sta,
+ oper_class, chandef, tmpl_skb,
+ ch_sw_tm_ie);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
+static inline void
+drv_tdls_cancel_channel_switch(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta)
+{
+ might_sleep();
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ if (!local->ops->tdls_cancel_channel_switch)
+ return;
+
+ trace_drv_tdls_cancel_channel_switch(local, sdata, sta);
+ local->ops->tdls_cancel_channel_switch(&local->hw, &sdata->vif, sta);
+ trace_drv_return_void(local);
+}
+
+static inline void
+drv_tdls_recv_channel_switch(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_tdls_ch_sw_params *params)
+{
+ trace_drv_tdls_recv_channel_switch(local, sdata, params);
+ if (local->ops->tdls_recv_channel_switch)
+ local->ops->tdls_recv_channel_switch(&local->hw, &sdata->vif,
+ params);
+ trace_drv_return_void(local);
+}
+
+static inline void drv_wake_tx_queue(struct ieee80211_local *local,
+ struct txq_info *txq)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif);
+
+ if (!check_sdata_in_driver(sdata))
+ return;
+
+ trace_drv_wake_tx_queue(local, sdata, txq);
+ local->ops->wake_tx_queue(&local->hw, &txq->txq);
+}
+
+#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
new file mode 100644
index 000000000..52bcea6ad
--- /dev/null
+++ b/net/mac80211/ethtool.c
@@ -0,0 +1,244 @@
+/*
+ * mac80211 ethtool hooks for cfg80211
+ *
+ * Copied from cfg.c - originally
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2014 Intel Corporation (Author: Johannes Berg)
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+#include <linux/types.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "driver-ops.h"
+
+static int ieee80211_set_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *rp)
+{
+ struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
+
+ if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0)
+ return -EINVAL;
+
+ return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending);
+}
+
+static void ieee80211_get_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *rp)
+{
+ struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
+
+ memset(rp, 0, sizeof(*rp));
+
+ drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending,
+ &rp->rx_pending, &rp->rx_max_pending);
+}
+
+static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = {
+ "rx_packets", "rx_bytes",
+ "rx_duplicates", "rx_fragments", "rx_dropped",
+ "tx_packets", "tx_bytes", "tx_fragments",
+ "tx_filtered", "tx_retry_failed", "tx_retries",
+ "beacon_loss", "sta_state", "txrate", "rxrate", "signal",
+ "channel", "noise", "ch_time", "ch_time_busy",
+ "ch_time_ext_busy", "ch_time_rx", "ch_time_tx"
+};
+#define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats)
+
+static int ieee80211_get_sset_count(struct net_device *dev, int sset)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int rv = 0;
+
+ if (sset == ETH_SS_STATS)
+ rv += STA_STATS_LEN;
+
+ rv += drv_get_et_sset_count(sdata, sset);
+
+ if (rv == 0)
+ return -EOPNOTSUPP;
+ return rv;
+}
+
+static void ieee80211_get_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *channel;
+ struct sta_info *sta;
+ struct ieee80211_local *local = sdata->local;
+ struct station_info sinfo;
+ struct survey_info survey;
+ int i, q;
+#define STA_STATS_SURVEY_LEN 7
+
+ memset(data, 0, sizeof(u64) * STA_STATS_LEN);
+
+#define ADD_STA_STATS(sta) \
+ do { \
+ data[i++] += sta->rx_packets; \
+ data[i++] += sta->rx_bytes; \
+ data[i++] += sta->num_duplicates; \
+ data[i++] += sta->rx_fragments; \
+ data[i++] += sta->rx_dropped; \
+ \
+ data[i++] += sinfo.tx_packets; \
+ data[i++] += sinfo.tx_bytes; \
+ data[i++] += sta->tx_fragments; \
+ data[i++] += sta->tx_filtered_count; \
+ data[i++] += sta->tx_retry_failed; \
+ data[i++] += sta->tx_retry_count; \
+ data[i++] += sta->beacon_loss_count; \
+ } while (0)
+
+ /* For Managed stations, find the single station based on BSSID
+ * and use that. For interface types, iterate through all available
+ * stations and add stats for any station that is assigned to this
+ * network device.
+ */
+
+ mutex_lock(&local->sta_mtx);
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ sta = sta_info_get_bss(sdata, sdata->u.mgd.bssid);
+
+ if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
+ goto do_survey;
+
+ sinfo.filled = 0;
+ sta_set_sinfo(sta, &sinfo);
+
+ i = 0;
+ ADD_STA_STATS(sta);
+
+ data[i++] = sta->sta_state;
+
+
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+ data[i] = 100000 *
+ cfg80211_calculate_bitrate(&sinfo.txrate);
+ i++;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+ data[i] = 100000 *
+ cfg80211_calculate_bitrate(&sinfo.rxrate);
+ i++;
+
+ if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+ data[i] = (u8)sinfo.signal_avg;
+ i++;
+ } else {
+ list_for_each_entry(sta, &local->sta_list, list) {
+ /* Make sure this station belongs to the proper dev */
+ if (sta->sdata->dev != dev)
+ continue;
+
+ sinfo.filled = 0;
+ sta_set_sinfo(sta, &sinfo);
+ i = 0;
+ ADD_STA_STATS(sta);
+ }
+ }
+
+do_survey:
+ i = STA_STATS_LEN - STA_STATS_SURVEY_LEN;
+ /* Get survey stats for current channel */
+ survey.filled = 0;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (chanctx_conf)
+ channel = chanctx_conf->def.chan;
+ else
+ channel = NULL;
+ rcu_read_unlock();
+
+ if (channel) {
+ q = 0;
+ do {
+ survey.filled = 0;
+ if (drv_get_survey(local, q, &survey) != 0) {
+ survey.filled = 0;
+ break;
+ }
+ q++;
+ } while (channel != survey.channel);
+ }
+
+ if (survey.filled)
+ data[i++] = survey.channel->center_freq;
+ else
+ data[i++] = 0;
+ if (survey.filled & SURVEY_INFO_NOISE_DBM)
+ data[i++] = (u8)survey.noise;
+ else
+ data[i++] = -1LL;
+ if (survey.filled & SURVEY_INFO_TIME)
+ data[i++] = survey.time;
+ else
+ data[i++] = -1LL;
+ if (survey.filled & SURVEY_INFO_TIME_BUSY)
+ data[i++] = survey.time_busy;
+ else
+ data[i++] = -1LL;
+ if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY)
+ data[i++] = survey.time_ext_busy;
+ else
+ data[i++] = -1LL;
+ if (survey.filled & SURVEY_INFO_TIME_RX)
+ data[i++] = survey.time_rx;
+ else
+ data[i++] = -1LL;
+ if (survey.filled & SURVEY_INFO_TIME_TX)
+ data[i++] = survey.time_tx;
+ else
+ data[i++] = -1LL;
+
+ mutex_unlock(&local->sta_mtx);
+
+ if (WARN_ON(i != STA_STATS_LEN))
+ return;
+
+ drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN]));
+}
+
+static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int sz_sta_stats = 0;
+
+ if (sset == ETH_SS_STATS) {
+ sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats);
+ memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats);
+ }
+ drv_get_et_strings(sdata, sset, &(data[sz_sta_stats]));
+}
+
+static int ieee80211_get_regs_len(struct net_device *dev)
+{
+ return 0;
+}
+
+static void ieee80211_get_regs(struct net_device *dev,
+ struct ethtool_regs *regs,
+ void *data)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ regs->version = wdev->wiphy->hw_version;
+ regs->len = 0;
+}
+
+const struct ethtool_ops ieee80211_ethtool_ops = {
+ .get_drvinfo = cfg80211_get_drvinfo,
+ .get_regs_len = ieee80211_get_regs_len,
+ .get_regs = ieee80211_get_regs,
+ .get_link = ethtool_op_get_link,
+ .get_ringparam = ieee80211_get_ringparam,
+ .set_ringparam = ieee80211_set_ringparam,
+ .get_strings = ieee80211_get_strings,
+ .get_ethtool_stats = ieee80211_get_stats,
+ .get_sset_count = ieee80211_get_sset_count,
+};
diff --git a/net/mac80211/event.c b/net/mac80211/event.c
new file mode 100644
index 000000000..01ae75951
--- /dev/null
+++ b/net/mac80211/event.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * mac80211 - events
+ */
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+
+/*
+ * Indicate a failed Michael MIC to userspace. If the caller knows the TSC of
+ * the frame that generated the MIC failure (i.e., if it was provided by the
+ * driver or is still in the frame), it should provide that information.
+ */
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
+ struct ieee80211_hdr *hdr, const u8 *tsc,
+ gfp_t gfp)
+{
+ cfg80211_michael_mic_failure(sdata->dev, hdr->addr2,
+ (hdr->addr1[0] & 0x01) ?
+ NL80211_KEYTYPE_GROUP :
+ NL80211_KEYTYPE_PAIRWISE,
+ keyidx, tsc, gfp);
+}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
new file mode 100644
index 000000000..7a76ce639
--- /dev/null
+++ b/net/mac80211/ht.c
@@ -0,0 +1,522 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+
+static void __check_htcap_disable(struct ieee80211_ht_cap *ht_capa,
+ struct ieee80211_ht_cap *ht_capa_mask,
+ struct ieee80211_sta_ht_cap *ht_cap,
+ u16 flag)
+{
+ __le16 le_flag = cpu_to_le16(flag);
+ if (ht_capa_mask->cap_info & le_flag) {
+ if (!(ht_capa->cap_info & le_flag))
+ ht_cap->cap &= ~flag;
+ }
+}
+
+static void __check_htcap_enable(struct ieee80211_ht_cap *ht_capa,
+ struct ieee80211_ht_cap *ht_capa_mask,
+ struct ieee80211_sta_ht_cap *ht_cap,
+ u16 flag)
+{
+ __le16 le_flag = cpu_to_le16(flag);
+
+ if ((ht_capa_mask->cap_info & le_flag) &&
+ (ht_capa->cap_info & le_flag))
+ ht_cap->cap |= flag;
+}
+
+void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_ht_cap *ht_cap)
+{
+ struct ieee80211_ht_cap *ht_capa, *ht_capa_mask;
+ u8 *scaps, *smask;
+ int i;
+
+ if (!ht_cap->ht_supported)
+ return;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ ht_capa = &sdata->u.mgd.ht_capa;
+ ht_capa_mask = &sdata->u.mgd.ht_capa_mask;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ ht_capa = &sdata->u.ibss.ht_capa;
+ ht_capa_mask = &sdata->u.ibss.ht_capa_mask;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ scaps = (u8 *)(&ht_capa->mcs.rx_mask);
+ smask = (u8 *)(&ht_capa_mask->mcs.rx_mask);
+
+ /* NOTE: If you add more over-rides here, update register_hw
+ * ht_capa_mod_mask logic in main.c as well.
+ * And, if this method can ever change ht_cap.ht_supported, fix
+ * the check in ieee80211_add_ht_ie.
+ */
+
+ /* check for HT over-rides, MCS rates first. */
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
+ u8 m = smask[i];
+ ht_cap->mcs.rx_mask[i] &= ~m; /* turn off all masked bits */
+ /* Add back rates that are supported */
+ ht_cap->mcs.rx_mask[i] |= (m & scaps[i]);
+ }
+
+ /* Force removal of HT-40 capabilities? */
+ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_SUP_WIDTH_20_40);
+ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_SGI_40);
+
+ /* Allow user to disable SGI-20 (SGI-40 is handled above) */
+ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_SGI_20);
+
+ /* Allow user to disable the max-AMSDU bit. */
+ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_MAX_AMSDU);
+
+ /* Allow user to disable LDPC */
+ __check_htcap_disable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_LDPC_CODING);
+
+ /* Allow user to enable 40 MHz intolerant bit. */
+ __check_htcap_enable(ht_capa, ht_capa_mask, ht_cap,
+ IEEE80211_HT_CAP_40MHZ_INTOLERANT);
+
+ /* Allow user to decrease AMPDU factor */
+ if (ht_capa_mask->ampdu_params_info &
+ IEEE80211_HT_AMPDU_PARM_FACTOR) {
+ u8 n = ht_capa->ampdu_params_info &
+ IEEE80211_HT_AMPDU_PARM_FACTOR;
+ if (n < ht_cap->ampdu_factor)
+ ht_cap->ampdu_factor = n;
+ }
+
+ /* Allow the user to increase AMPDU density. */
+ if (ht_capa_mask->ampdu_params_info &
+ IEEE80211_HT_AMPDU_PARM_DENSITY) {
+ u8 n = (ht_capa->ampdu_params_info &
+ IEEE80211_HT_AMPDU_PARM_DENSITY)
+ >> IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT;
+ if (n > ht_cap->ampdu_density)
+ ht_cap->ampdu_density = n;
+ }
+}
+
+
+bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_ht_cap *ht_cap_ie,
+ struct sta_info *sta)
+{
+ struct ieee80211_sta_ht_cap ht_cap, own_cap;
+ u8 ampdu_info, tx_mcs_set_cap;
+ int i, max_tx_streams;
+ bool changed;
+ enum ieee80211_sta_rx_bandwidth bw;
+ enum ieee80211_smps_mode smps_mode;
+
+ memset(&ht_cap, 0, sizeof(ht_cap));
+
+ if (!ht_cap_ie || !sband->ht_cap.ht_supported)
+ goto apply;
+
+ ht_cap.ht_supported = true;
+
+ own_cap = sband->ht_cap;
+
+ /*
+ * If user has specified capability over-rides, take care
+ * of that if the station we're setting up is the AP or TDLS peer that
+ * we advertised a restricted capability set to. Override
+ * our own capabilities and then use those below.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION ||
+ sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ ieee80211_apply_htcap_overrides(sdata, &own_cap);
+
+ /*
+ * The bits listed in this expression should be
+ * the same for the peer and us, if the station
+ * advertises more then we can't use those thus
+ * we mask them out.
+ */
+ ht_cap.cap = le16_to_cpu(ht_cap_ie->cap_info) &
+ (own_cap.cap | ~(IEEE80211_HT_CAP_LDPC_CODING |
+ IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+ IEEE80211_HT_CAP_GRN_FLD |
+ IEEE80211_HT_CAP_SGI_20 |
+ IEEE80211_HT_CAP_SGI_40 |
+ IEEE80211_HT_CAP_DSSSCCK40));
+
+ /*
+ * The STBC bits are asymmetric -- if we don't have
+ * TX then mask out the peer's RX and vice versa.
+ */
+ if (!(own_cap.cap & IEEE80211_HT_CAP_TX_STBC))
+ ht_cap.cap &= ~IEEE80211_HT_CAP_RX_STBC;
+ if (!(own_cap.cap & IEEE80211_HT_CAP_RX_STBC))
+ ht_cap.cap &= ~IEEE80211_HT_CAP_TX_STBC;
+
+ ampdu_info = ht_cap_ie->ampdu_params_info;
+ ht_cap.ampdu_factor =
+ ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR;
+ ht_cap.ampdu_density =
+ (ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2;
+
+ /* own MCS TX capabilities */
+ tx_mcs_set_cap = own_cap.mcs.tx_params;
+
+ /* Copy peer MCS TX capabilities, the driver might need them. */
+ ht_cap.mcs.tx_params = ht_cap_ie->mcs.tx_params;
+
+ /* can we TX with MCS rates? */
+ if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED))
+ goto apply;
+
+ /* Counting from 0, therefore +1 */
+ if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF)
+ max_tx_streams =
+ ((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK)
+ >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
+ else
+ max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS;
+
+ /*
+ * 802.11n-2009 20.3.5 / 20.6 says:
+ * - indices 0 to 7 and 32 are single spatial stream
+ * - 8 to 31 are multiple spatial streams using equal modulation
+ * [8..15 for two streams, 16..23 for three and 24..31 for four]
+ * - remainder are multiple spatial streams using unequal modulation
+ */
+ for (i = 0; i < max_tx_streams; i++)
+ ht_cap.mcs.rx_mask[i] =
+ own_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i];
+
+ if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION)
+ for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE;
+ i < IEEE80211_HT_MCS_MASK_LEN; i++)
+ ht_cap.mcs.rx_mask[i] =
+ own_cap.mcs.rx_mask[i] &
+ ht_cap_ie->mcs.rx_mask[i];
+
+ /* handle MCS rate 32 too */
+ if (own_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1)
+ ht_cap.mcs.rx_mask[32/8] |= 1;
+
+ /* set Rx highest rate */
+ ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest;
+
+ apply:
+ changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap));
+
+ memcpy(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap));
+
+ switch (sdata->vif.bss_conf.chandef.width) {
+ default:
+ WARN_ON_ONCE(1);
+ /* fall through */
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ bw = IEEE80211_STA_RX_BW_20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ bw = ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+ IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20;
+ break;
+ }
+
+ sta->sta.bandwidth = bw;
+
+ sta->cur_max_bandwidth =
+ ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+ IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20;
+
+ switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS)
+ >> IEEE80211_HT_CAP_SM_PS_SHIFT) {
+ case WLAN_HT_CAP_SM_PS_INVALID:
+ case WLAN_HT_CAP_SM_PS_STATIC:
+ smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case WLAN_HT_CAP_SM_PS_DYNAMIC:
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ case WLAN_HT_CAP_SM_PS_DISABLED:
+ smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ }
+
+ if (smps_mode != sta->sta.smps_mode)
+ changed = true;
+ sta->sta.smps_mode = smps_mode;
+
+ return changed;
+}
+
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
+ enum ieee80211_agg_stop_reason reason)
+{
+ int i;
+
+ cancel_work_sync(&sta->ampdu_mlme.work);
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ __ieee80211_stop_tx_ba_session(sta, i, reason);
+ __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_LEAVE_QBSS,
+ reason != AGG_STOP_DESTROY_STA &&
+ reason != AGG_STOP_PEER_REQUEST);
+ }
+}
+
+void ieee80211_ba_session_work(struct work_struct *work)
+{
+ struct sta_info *sta =
+ container_of(work, struct sta_info, ampdu_mlme.work);
+ struct tid_ampdu_tx *tid_tx;
+ int tid;
+
+ /*
+ * When this flag is set, new sessions should be
+ * blocked, and existing sessions will be torn
+ * down by the code that set the flag, so this
+ * need not run.
+ */
+ if (test_sta_flag(sta, WLAN_STA_BLOCK_BA))
+ return;
+
+ mutex_lock(&sta->ampdu_mlme.mtx);
+ for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
+ if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired))
+ ___ieee80211_stop_rx_ba_session(
+ sta, tid, WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_TIMEOUT, true);
+
+ if (test_and_clear_bit(tid,
+ sta->ampdu_mlme.tid_rx_stop_requested))
+ ___ieee80211_stop_rx_ba_session(
+ sta, tid, WLAN_BACK_RECIPIENT,
+ WLAN_REASON_UNSPECIFIED, true);
+
+ spin_lock_bh(&sta->lock);
+
+ tid_tx = sta->ampdu_mlme.tid_start_tx[tid];
+ if (tid_tx) {
+ /*
+ * Assign it over to the normal tid_tx array
+ * where it "goes live".
+ */
+
+ sta->ampdu_mlme.tid_start_tx[tid] = NULL;
+ /* could there be a race? */
+ if (sta->ampdu_mlme.tid_tx[tid])
+ kfree(tid_tx);
+ else
+ ieee80211_assign_tid_tx(sta, tid, tid_tx);
+ spin_unlock_bh(&sta->lock);
+
+ ieee80211_tx_ba_session_handle_start(sta, tid);
+ continue;
+ }
+ spin_unlock_bh(&sta->lock);
+
+ tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
+ &tid_tx->state))
+ ___ieee80211_stop_tx_ba_session(sta, tid,
+ AGG_STOP_LOCAL_REQUEST);
+ }
+ mutex_unlock(&sta->ampdu_mlme.mtx);
+}
+
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
+ const u8 *da, u16 tid,
+ u16 initiator, u16 reason_code)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u16 params;
+
+ skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
+
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba));
+
+ mgmt->u.action.category = WLAN_CATEGORY_BACK;
+ mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA;
+ params = (u16)(initiator << 11); /* bit 11 initiator */
+ params |= (u16)(tid << 12); /* bit 15:12 TID number */
+
+ mgmt->u.action.u.delba.params = cpu_to_le16(params);
+ mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ u16 tid, params;
+ u16 initiator;
+
+ params = le16_to_cpu(mgmt->u.action.u.delba.params);
+ tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12;
+ initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11;
+
+ ht_dbg_ratelimited(sdata, "delba from %pM (%s) tid %d reason code %d\n",
+ mgmt->sa, initiator ? "initiator" : "recipient",
+ tid,
+ le16_to_cpu(mgmt->u.action.u.delba.reason_code));
+
+ if (initiator == WLAN_BACK_INITIATOR)
+ __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0,
+ true);
+ else
+ __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST);
+}
+
+int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps, const u8 *da,
+ const u8 *bssid)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *action_frame;
+
+ /* 27 = header + category + action + smps mode */
+ skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ action_frame = (void *)skb_put(skb, 27);
+ memcpy(action_frame->da, da, ETH_ALEN);
+ memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(action_frame->bssid, bssid, ETH_ALEN);
+ action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+ action_frame->u.action.category = WLAN_CATEGORY_HT;
+ action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS;
+ switch (smps) {
+ case IEEE80211_SMPS_AUTOMATIC:
+ case IEEE80211_SMPS_NUM_MODES:
+ WARN_ON(1);
+ case IEEE80211_SMPS_OFF:
+ action_frame->u.action.u.ht_smps.smps_control =
+ WLAN_HT_SMPS_CONTROL_DISABLED;
+ break;
+ case IEEE80211_SMPS_STATIC:
+ action_frame->u.action.u.ht_smps.smps_control =
+ WLAN_HT_SMPS_CONTROL_STATIC;
+ break;
+ case IEEE80211_SMPS_DYNAMIC:
+ action_frame->u.action.u.ht_smps.smps_control =
+ WLAN_HT_SMPS_CONTROL_DYNAMIC;
+ break;
+ }
+
+ /* we'll do more on status of this frame */
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+ ieee80211_tx_skb(sdata, skb);
+
+ return 0;
+}
+
+void ieee80211_request_smps_mgd_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.request_smps_work);
+
+ sdata_lock(sdata);
+ __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.driver_smps_mode);
+ sdata_unlock(sdata);
+}
+
+void ieee80211_request_smps_ap_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.ap.request_smps_work);
+
+ sdata_lock(sdata);
+ if (sdata_dereference(sdata->u.ap.beacon, sdata))
+ __ieee80211_request_smps_ap(sdata,
+ sdata->u.ap.driver_smps_mode);
+ sdata_unlock(sdata);
+}
+
+void ieee80211_request_smps(struct ieee80211_vif *vif,
+ enum ieee80211_smps_mode smps_mode)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (WARN_ON_ONCE(vif->type != NL80211_IFTYPE_STATION &&
+ vif->type != NL80211_IFTYPE_AP))
+ return;
+
+ if (vif->type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.driver_smps_mode == smps_mode)
+ return;
+ sdata->u.mgd.driver_smps_mode = smps_mode;
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.request_smps_work);
+ } else {
+ /* AUTOMATIC is meaningless in AP mode */
+ if (WARN_ON_ONCE(smps_mode == IEEE80211_SMPS_AUTOMATIC))
+ return;
+ if (sdata->u.ap.driver_smps_mode == smps_mode)
+ return;
+ sdata->u.ap.driver_smps_mode = smps_mode;
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.ap.request_smps_work);
+ }
+}
+/* this might change ... don't want non-open drivers using it */
+EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
new file mode 100644
index 000000000..a9c9d961f
--- /dev/null
+++ b/net/mac80211/ibss.c
@@ -0,0 +1,1861 @@
+/*
+ * IBSS mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+
+#define IEEE80211_SCAN_INTERVAL (2 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
+
+#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
+#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
+#define IEEE80211_IBSS_RSN_INACTIVITY_LIMIT (10 * HZ)
+
+#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
+
+static struct beacon_data *
+ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
+ const int beacon_int, const u32 basic_rates,
+ const u16 capability, u64 tsf,
+ struct cfg80211_chan_def *chandef,
+ bool *have_higher_than_11mbit,
+ struct cfg80211_csa_settings *csa_settings)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int rates_n = 0, i, ri;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos;
+ struct ieee80211_supported_band *sband;
+ u32 rate_flags, rates = 0, rates_added = 0;
+ struct beacon_data *presp;
+ int frame_len;
+ int shift;
+
+ /* Build IBSS probe response */
+ frame_len = sizeof(struct ieee80211_hdr_3addr) +
+ 12 /* struct ieee80211_mgmt.u.beacon */ +
+ 2 + IEEE80211_MAX_SSID_LEN /* max SSID */ +
+ 2 + 8 /* max Supported Rates */ +
+ 3 /* max DS params */ +
+ 4 /* IBSS params */ +
+ 5 /* Channel Switch Announcement */ +
+ 2 + (IEEE80211_MAX_SUPP_RATES - 8) +
+ 2 + sizeof(struct ieee80211_ht_cap) +
+ 2 + sizeof(struct ieee80211_ht_operation) +
+ ifibss->ie_len;
+ presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL);
+ if (!presp)
+ return NULL;
+
+ presp->head = (void *)(presp + 1);
+
+ mgmt = (void *) presp->head;
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_RESP);
+ eth_broadcast_addr(mgmt->da);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
+ mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int);
+ mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
+ mgmt->u.beacon.capab_info = cpu_to_le16(capability);
+
+ pos = (u8 *)mgmt + offsetof(struct ieee80211_mgmt, u.beacon.variable);
+
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ifibss->ssid_len;
+ memcpy(pos, ifibss->ssid, ifibss->ssid_len);
+ pos += ifibss->ssid_len;
+
+ sband = local->hw.wiphy->bands[chandef->chan->band];
+ rate_flags = ieee80211_chandef_rate_flags(chandef);
+ shift = ieee80211_chandef_get_shift(chandef);
+ rates_n = 0;
+ if (have_higher_than_11mbit)
+ *have_higher_than_11mbit = false;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+ if (sband->bitrates[i].bitrate > 110 &&
+ have_higher_than_11mbit)
+ *have_higher_than_11mbit = true;
+
+ rates |= BIT(i);
+ rates_n++;
+ }
+
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = min_t(int, 8, rates_n);
+ for (ri = 0; ri < sband->n_bitrates; ri++) {
+ int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate,
+ 5 * (1 << shift));
+ u8 basic = 0;
+ if (!(rates & BIT(ri)))
+ continue;
+
+ if (basic_rates & BIT(ri))
+ basic = 0x80;
+ *pos++ = basic | (u8) rate;
+ if (++rates_added == 8) {
+ ri++; /* continue at next rate for EXT_SUPP_RATES */
+ break;
+ }
+ }
+
+ if (sband->band == IEEE80211_BAND_2GHZ) {
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = ieee80211_frequency_to_channel(
+ chandef->chan->center_freq);
+ }
+
+ *pos++ = WLAN_EID_IBSS_PARAMS;
+ *pos++ = 2;
+ /* FIX: set ATIM window based on scan results */
+ *pos++ = 0;
+ *pos++ = 0;
+
+ if (csa_settings) {
+ *pos++ = WLAN_EID_CHANNEL_SWITCH;
+ *pos++ = 3;
+ *pos++ = csa_settings->block_tx ? 1 : 0;
+ *pos++ = ieee80211_frequency_to_channel(
+ csa_settings->chandef.chan->center_freq);
+ presp->csa_counter_offsets[0] = (pos - presp->head);
+ *pos++ = csa_settings->count;
+ presp->csa_current_counter = csa_settings->count;
+ }
+
+ /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */
+ if (rates_n > 8) {
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = rates_n - 8;
+ for (; ri < sband->n_bitrates; ri++) {
+ int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate,
+ 5 * (1 << shift));
+ u8 basic = 0;
+ if (!(rates & BIT(ri)))
+ continue;
+
+ if (basic_rates & BIT(ri))
+ basic = 0x80;
+ *pos++ = basic | (u8) rate;
+ }
+ }
+
+ if (ifibss->ie_len) {
+ memcpy(pos, ifibss->ie, ifibss->ie_len);
+ pos += ifibss->ie_len;
+ }
+
+ /* add HT capability and information IEs */
+ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT &&
+ chandef->width != NL80211_CHAN_WIDTH_5 &&
+ chandef->width != NL80211_CHAN_WIDTH_10 &&
+ sband->ht_cap.ht_supported) {
+ struct ieee80211_sta_ht_cap ht_cap;
+
+ memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap));
+ ieee80211_apply_htcap_overrides(sdata, &ht_cap);
+
+ pos = ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
+ /*
+ * Note: According to 802.11n-2009 9.13.3.1, HT Protection
+ * field and RIFS Mode are reserved in IBSS mode, therefore
+ * keep them at 0
+ */
+ pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap,
+ chandef, 0);
+
+ /* add VHT capability and information IEs */
+ if (chandef->width != NL80211_CHAN_WIDTH_20 &&
+ chandef->width != NL80211_CHAN_WIDTH_40 &&
+ sband->vht_cap.vht_supported) {
+ pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap,
+ sband->vht_cap.cap);
+ pos = ieee80211_ie_build_vht_oper(pos, &sband->vht_cap,
+ chandef);
+ }
+ }
+
+ if (local->hw.queues >= IEEE80211_NUM_ACS)
+ pos = ieee80211_add_wmm_info_ie(pos, 0); /* U-APSD not in use */
+
+ presp->head_len = pos - presp->head;
+ if (WARN_ON(presp->head_len > frame_len))
+ goto error;
+
+ return presp;
+error:
+ kfree(presp);
+ return NULL;
+}
+
+static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const int beacon_int,
+ struct cfg80211_chan_def *req_chandef,
+ const u32 basic_rates,
+ const u16 capability, u64 tsf,
+ bool creator)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgmt *mgmt;
+ struct cfg80211_bss *bss;
+ u32 bss_change;
+ struct cfg80211_chan_def chandef;
+ struct ieee80211_channel *chan;
+ struct beacon_data *presp;
+ enum nl80211_bss_scan_width scan_width;
+ bool have_higher_than_11mbit;
+ bool radar_required;
+ int err;
+
+ sdata_assert_lock(sdata);
+
+ /* Reset own TSF to allow time synchronization work. */
+ drv_reset_tsf(local, sdata);
+
+ if (!ether_addr_equal(ifibss->bssid, bssid))
+ sta_info_flush(sdata);
+
+ /* if merging, indicate to driver that we leave the old IBSS */
+ if (sdata->vif.bss_conf.ibss_joined) {
+ sdata->vif.bss_conf.ibss_joined = false;
+ sdata->vif.bss_conf.ibss_creator = false;
+ sdata->vif.bss_conf.enable_beacon = false;
+ netif_carrier_off(sdata->dev);
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_IBSS |
+ BSS_CHANGED_BEACON_ENABLED);
+ drv_leave_ibss(local, sdata);
+ }
+
+ presp = rcu_dereference_protected(ifibss->presp,
+ lockdep_is_held(&sdata->wdev.mtx));
+ RCU_INIT_POINTER(ifibss->presp, NULL);
+ if (presp)
+ kfree_rcu(presp, rcu_head);
+
+ /* make a copy of the chandef, it could be modified below. */
+ chandef = *req_chandef;
+ chan = chandef.chan;
+ if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef,
+ NL80211_IFTYPE_ADHOC)) {
+ if (chandef.width == NL80211_CHAN_WIDTH_5 ||
+ chandef.width == NL80211_CHAN_WIDTH_10 ||
+ chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ chandef.width == NL80211_CHAN_WIDTH_20) {
+ sdata_info(sdata,
+ "Failed to join IBSS, beacons forbidden\n");
+ return;
+ }
+ chandef.width = NL80211_CHAN_WIDTH_20;
+ chandef.center_freq1 = chan->center_freq;
+ /* check again for downgraded chandef */
+ if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chandef,
+ NL80211_IFTYPE_ADHOC)) {
+ sdata_info(sdata,
+ "Failed to join IBSS, beacons forbidden\n");
+ return;
+ }
+ }
+
+ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
+ &chandef, NL80211_IFTYPE_ADHOC);
+ if (err < 0) {
+ sdata_info(sdata,
+ "Failed to join IBSS, invalid chandef\n");
+ return;
+ }
+ if (err > 0 && !ifibss->userspace_handles_dfs) {
+ sdata_info(sdata,
+ "Failed to join IBSS, DFS channel without control program\n");
+ return;
+ }
+
+ radar_required = err;
+
+ mutex_lock(&local->mtx);
+ if (ieee80211_vif_use_channel(sdata, &chandef,
+ ifibss->fixed_channel ?
+ IEEE80211_CHANCTX_SHARED :
+ IEEE80211_CHANCTX_EXCLUSIVE)) {
+ sdata_info(sdata, "Failed to join IBSS, no channel context\n");
+ mutex_unlock(&local->mtx);
+ return;
+ }
+ sdata->radar_required = radar_required;
+ mutex_unlock(&local->mtx);
+
+ memcpy(ifibss->bssid, bssid, ETH_ALEN);
+
+ presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates,
+ capability, tsf, &chandef,
+ &have_higher_than_11mbit, NULL);
+ if (!presp)
+ return;
+
+ rcu_assign_pointer(ifibss->presp, presp);
+ mgmt = (void *)presp->head;
+
+ sdata->vif.bss_conf.enable_beacon = true;
+ sdata->vif.bss_conf.beacon_int = beacon_int;
+ sdata->vif.bss_conf.basic_rates = basic_rates;
+ sdata->vif.bss_conf.ssid_len = ifibss->ssid_len;
+ memcpy(sdata->vif.bss_conf.ssid, ifibss->ssid, ifibss->ssid_len);
+ bss_change = BSS_CHANGED_BEACON_INT;
+ bss_change |= ieee80211_reset_erp_info(sdata);
+ bss_change |= BSS_CHANGED_BSSID;
+ bss_change |= BSS_CHANGED_BEACON;
+ bss_change |= BSS_CHANGED_BEACON_ENABLED;
+ bss_change |= BSS_CHANGED_BASIC_RATES;
+ bss_change |= BSS_CHANGED_HT;
+ bss_change |= BSS_CHANGED_IBSS;
+ bss_change |= BSS_CHANGED_SSID;
+
+ /*
+ * In 5 GHz/802.11a, we can always use short slot time.
+ * (IEEE 802.11-2012 18.3.8.7)
+ *
+ * In 2.4GHz, we must always use long slots in IBSS for compatibility
+ * reasons.
+ * (IEEE 802.11-2012 19.4.5)
+ *
+ * HT follows these specifications (IEEE 802.11-2012 20.3.18)
+ */
+ sdata->vif.bss_conf.use_short_slot = chan->band == IEEE80211_BAND_5GHZ;
+ bss_change |= BSS_CHANGED_ERP_SLOT;
+
+ /* cf. IEEE 802.11 9.2.12 */
+ if (chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit)
+ sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+ else
+ sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+ ieee80211_set_wmm_default(sdata, true);
+
+ sdata->vif.bss_conf.ibss_joined = true;
+ sdata->vif.bss_conf.ibss_creator = creator;
+
+ err = drv_join_ibss(local, sdata);
+ if (err) {
+ sdata->vif.bss_conf.ibss_joined = false;
+ sdata->vif.bss_conf.ibss_creator = false;
+ sdata->vif.bss_conf.enable_beacon = false;
+ sdata->vif.bss_conf.ssid_len = 0;
+ RCU_INIT_POINTER(ifibss->presp, NULL);
+ kfree_rcu(presp, rcu_head);
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+ sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n",
+ err);
+ return;
+ }
+
+ ieee80211_bss_info_change_notify(sdata, bss_change);
+
+ ifibss->state = IEEE80211_IBSS_MLME_JOINED;
+ mod_timer(&ifibss->timer,
+ round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
+
+ scan_width = cfg80211_chandef_to_scan_width(&chandef);
+ bss = cfg80211_inform_bss_width_frame(local->hw.wiphy, chan,
+ scan_width, mgmt,
+ presp->head_len, 0, GFP_KERNEL);
+ cfg80211_put_bss(local->hw.wiphy, bss);
+ netif_carrier_on(sdata->dev);
+ cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL);
+}
+
+static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss *bss)
+{
+ struct cfg80211_bss *cbss =
+ container_of((void *)bss, struct cfg80211_bss, priv);
+ struct ieee80211_supported_band *sband;
+ struct cfg80211_chan_def chandef;
+ u32 basic_rates;
+ int i, j;
+ u16 beacon_int = cbss->beacon_interval;
+ const struct cfg80211_bss_ies *ies;
+ enum nl80211_channel_type chan_type;
+ u64 tsf;
+ u32 rate_flags;
+ int shift;
+
+ sdata_assert_lock(sdata);
+
+ if (beacon_int < 10)
+ beacon_int = 10;
+
+ switch (sdata->u.ibss.chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef);
+ cfg80211_chandef_create(&chandef, cbss->channel, chan_type);
+ break;
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ cfg80211_chandef_create(&chandef, cbss->channel,
+ NL80211_CHAN_WIDTH_20_NOHT);
+ chandef.width = sdata->u.ibss.chandef.width;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_160:
+ chandef = sdata->u.ibss.chandef;
+ chandef.chan = cbss->channel;
+ break;
+ default:
+ /* fall back to 20 MHz for unsupported modes */
+ cfg80211_chandef_create(&chandef, cbss->channel,
+ NL80211_CHAN_WIDTH_20_NOHT);
+ break;
+ }
+
+ sband = sdata->local->hw.wiphy->bands[cbss->channel->band];
+ rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef);
+ shift = ieee80211_vif_get_shift(&sdata->vif);
+
+ basic_rates = 0;
+
+ for (i = 0; i < bss->supp_rates_len; i++) {
+ int rate = bss->supp_rates[i] & 0x7f;
+ bool is_basic = !!(bss->supp_rates[i] & 0x80);
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ int brate;
+ if ((rate_flags & sband->bitrates[j].flags)
+ != rate_flags)
+ continue;
+
+ brate = DIV_ROUND_UP(sband->bitrates[j].bitrate,
+ 5 * (1 << shift));
+ if (brate == rate) {
+ if (is_basic)
+ basic_rates |= BIT(j);
+ break;
+ }
+ }
+ }
+
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->ies);
+ tsf = ies->tsf;
+ rcu_read_unlock();
+
+ __ieee80211_sta_join_ibss(sdata, cbss->bssid,
+ beacon_int,
+ &chandef,
+ basic_rates,
+ cbss->capability,
+ tsf, false);
+}
+
+int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct beacon_data *presp, *old_presp;
+ struct cfg80211_bss *cbss;
+ const struct cfg80211_bss_ies *ies;
+ u16 capability = 0;
+ u64 tsf;
+ int ret = 0;
+
+ sdata_assert_lock(sdata);
+
+ if (ifibss->privacy)
+ capability = WLAN_CAPABILITY_PRIVACY;
+
+ cbss = cfg80211_get_bss(sdata->local->hw.wiphy, ifibss->chandef.chan,
+ ifibss->bssid, ifibss->ssid,
+ ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS,
+ IEEE80211_PRIVACY(ifibss->privacy));
+
+ if (WARN_ON(!cbss)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->ies);
+ tsf = ies->tsf;
+ rcu_read_unlock();
+ cfg80211_put_bss(sdata->local->hw.wiphy, cbss);
+
+ old_presp = rcu_dereference_protected(ifibss->presp,
+ lockdep_is_held(&sdata->wdev.mtx));
+
+ presp = ieee80211_ibss_build_presp(sdata,
+ sdata->vif.bss_conf.beacon_int,
+ sdata->vif.bss_conf.basic_rates,
+ capability, tsf, &ifibss->chandef,
+ NULL, csa_settings);
+ if (!presp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rcu_assign_pointer(ifibss->presp, presp);
+ if (old_presp)
+ kfree_rcu(old_presp, rcu_head);
+
+ return BSS_CHANGED_BEACON;
+ out:
+ return ret;
+}
+
+int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct cfg80211_bss *cbss;
+ int err, changed = 0;
+
+ sdata_assert_lock(sdata);
+
+ /* update cfg80211 bss information with the new channel */
+ if (!is_zero_ether_addr(ifibss->bssid)) {
+ cbss = cfg80211_get_bss(sdata->local->hw.wiphy,
+ ifibss->chandef.chan,
+ ifibss->bssid, ifibss->ssid,
+ ifibss->ssid_len,
+ IEEE80211_BSS_TYPE_IBSS,
+ IEEE80211_PRIVACY(ifibss->privacy));
+ /* XXX: should not really modify cfg80211 data */
+ if (cbss) {
+ cbss->channel = sdata->csa_chandef.chan;
+ cfg80211_put_bss(sdata->local->hw.wiphy, cbss);
+ }
+ }
+
+ ifibss->chandef = sdata->csa_chandef;
+
+ /* generate the beacon */
+ err = ieee80211_ibss_csa_beacon(sdata, NULL);
+ if (err < 0)
+ return err;
+
+ changed |= err;
+
+ return changed;
+}
+
+void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ cancel_work_sync(&ifibss->csa_connection_drop_work);
+}
+
+static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta)
+ __acquires(RCU)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u8 addr[ETH_ALEN];
+
+ memcpy(addr, sta->sta.addr, ETH_ALEN);
+
+ ibss_dbg(sdata, "Adding new IBSS station %pM\n", addr);
+
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
+ /* authorize the station only if the network is not RSN protected. If
+ * not wait for the userspace to authorize it */
+ if (!sta->sdata->u.ibss.control_port)
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
+
+ rate_control_rate_init(sta);
+
+ /* If it fails, maybe we raced another insertion? */
+ if (sta_info_insert_rcu(sta))
+ return sta_info_get(sdata, addr);
+ return sta;
+}
+
+static struct sta_info *
+ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
+ const u8 *addr, u32 supp_rates)
+ __acquires(RCU)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_supported_band *sband;
+ enum nl80211_bss_scan_width scan_width;
+ int band;
+
+ /*
+ * XXX: Consider removing the least recently used entry and
+ * allow new one to be added.
+ */
+ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
+ net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n",
+ sdata->name, addr);
+ rcu_read_lock();
+ return NULL;
+ }
+
+ if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH) {
+ rcu_read_lock();
+ return NULL;
+ }
+
+ if (!ether_addr_equal(bssid, sdata->u.ibss.bssid)) {
+ rcu_read_lock();
+ return NULL;
+ }
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON_ONCE(!chanctx_conf))
+ return NULL;
+ band = chanctx_conf->def.chan->band;
+ scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
+ if (!sta) {
+ rcu_read_lock();
+ return NULL;
+ }
+
+ sta->last_rx = jiffies;
+
+ /* make sure mandatory rates are always added */
+ sband = local->hw.wiphy->bands[band];
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(sband, scan_width);
+
+ return ieee80211_ibss_finish_sta(sta);
+}
+
+static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int active = 0;
+ struct sta_info *sta;
+
+ sdata_assert_lock(sdata);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sta->sdata == sdata &&
+ time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
+ jiffies)) {
+ active++;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return active;
+}
+
+static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct cfg80211_bss *cbss;
+ struct beacon_data *presp;
+ struct sta_info *sta;
+
+ if (!is_zero_ether_addr(ifibss->bssid)) {
+ cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->chandef.chan,
+ ifibss->bssid, ifibss->ssid,
+ ifibss->ssid_len,
+ IEEE80211_BSS_TYPE_IBSS,
+ IEEE80211_PRIVACY(ifibss->privacy));
+
+ if (cbss) {
+ cfg80211_unlink_bss(local->hw.wiphy, cbss);
+ cfg80211_put_bss(sdata->local->hw.wiphy, cbss);
+ }
+ }
+
+ ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+
+ sta_info_flush(sdata);
+
+ spin_lock_bh(&ifibss->incomplete_lock);
+ while (!list_empty(&ifibss->incomplete_stations)) {
+ sta = list_first_entry(&ifibss->incomplete_stations,
+ struct sta_info, list);
+ list_del(&sta->list);
+ spin_unlock_bh(&ifibss->incomplete_lock);
+
+ sta_info_free(local, sta);
+ spin_lock_bh(&ifibss->incomplete_lock);
+ }
+ spin_unlock_bh(&ifibss->incomplete_lock);
+
+ netif_carrier_off(sdata->dev);
+
+ sdata->vif.bss_conf.ibss_joined = false;
+ sdata->vif.bss_conf.ibss_creator = false;
+ sdata->vif.bss_conf.enable_beacon = false;
+ sdata->vif.bss_conf.ssid_len = 0;
+
+ /* remove beacon */
+ presp = rcu_dereference_protected(ifibss->presp,
+ lockdep_is_held(&sdata->wdev.mtx));
+ RCU_INIT_POINTER(sdata->u.ibss.presp, NULL);
+ if (presp)
+ kfree_rcu(presp, rcu_head);
+
+ clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED |
+ BSS_CHANGED_IBSS);
+ drv_leave_ibss(local, sdata);
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+}
+
+static void ieee80211_csa_connection_drop_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.ibss.csa_connection_drop_work);
+
+ sdata_lock(sdata);
+
+ ieee80211_ibss_disconnect(sdata);
+ synchronize_rcu();
+ skb_queue_purge(&sdata->skb_queue);
+
+ /* trigger a scan to find another IBSS network to join */
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ int err;
+
+ /* if the current channel is a DFS channel, mark the channel as
+ * unavailable.
+ */
+ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
+ &ifibss->chandef,
+ NL80211_IFTYPE_ADHOC);
+ if (err > 0)
+ cfg80211_radar_event(sdata->local->hw.wiphy, &ifibss->chandef,
+ GFP_ATOMIC);
+}
+
+static bool
+ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems,
+ bool beacon)
+{
+ struct cfg80211_csa_settings params;
+ struct ieee80211_csa_ie csa_ie;
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ enum nl80211_channel_type ch_type;
+ int err;
+ u32 sta_flags;
+
+ sdata_assert_lock(sdata);
+
+ sta_flags = IEEE80211_STA_DISABLE_VHT;
+ switch (ifibss->chandef.width) {
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ sta_flags |= IEEE80211_STA_DISABLE_HT;
+ /* fall through */
+ case NL80211_CHAN_WIDTH_20:
+ sta_flags |= IEEE80211_STA_DISABLE_40MHZ;
+ break;
+ default:
+ break;
+ }
+
+ memset(&params, 0, sizeof(params));
+ memset(&csa_ie, 0, sizeof(csa_ie));
+ err = ieee80211_parse_ch_switch_ie(sdata, elems,
+ ifibss->chandef.chan->band,
+ sta_flags, ifibss->bssid, &csa_ie);
+ /* can't switch to destination channel, fail */
+ if (err < 0)
+ goto disconnect;
+
+ /* did not contain a CSA */
+ if (err)
+ return false;
+
+ /* channel switch is not supported, disconnect */
+ if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
+ goto disconnect;
+
+ params.count = csa_ie.count;
+ params.chandef = csa_ie.chandef;
+
+ switch (ifibss->chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ /* keep our current HT mode (HT20/HT40+/HT40-), even if
+ * another mode has been announced. The mode is not adopted
+ * within the beacon while doing CSA and we should therefore
+ * keep the mode which we announce.
+ */
+ ch_type = cfg80211_get_chandef_type(&ifibss->chandef);
+ cfg80211_chandef_create(&params.chandef, params.chandef.chan,
+ ch_type);
+ break;
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ if (params.chandef.width != ifibss->chandef.width) {
+ sdata_info(sdata,
+ "IBSS %pM received channel switch from incompatible channel width (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
+ ifibss->bssid,
+ params.chandef.chan->center_freq,
+ params.chandef.width,
+ params.chandef.center_freq1,
+ params.chandef.center_freq2);
+ goto disconnect;
+ }
+ break;
+ default:
+ /* should not happen, sta_flags should prevent VHT modes. */
+ WARN_ON(1);
+ goto disconnect;
+ }
+
+ if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef,
+ NL80211_IFTYPE_ADHOC)) {
+ sdata_info(sdata,
+ "IBSS %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
+ ifibss->bssid,
+ params.chandef.chan->center_freq,
+ params.chandef.width,
+ params.chandef.center_freq1,
+ params.chandef.center_freq2);
+ goto disconnect;
+ }
+
+ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
+ &params.chandef,
+ NL80211_IFTYPE_ADHOC);
+ if (err < 0)
+ goto disconnect;
+ if (err > 0 && !ifibss->userspace_handles_dfs) {
+ /* IBSS-DFS only allowed with a control program */
+ goto disconnect;
+ }
+
+ params.radar_required = err;
+
+ if (cfg80211_chandef_identical(&params.chandef,
+ &sdata->vif.bss_conf.chandef)) {
+ ibss_dbg(sdata,
+ "received csa with an identical chandef, ignoring\n");
+ return true;
+ }
+
+ /* all checks done, now perform the channel switch. */
+ ibss_dbg(sdata,
+ "received channel switch announcement to go to channel %d MHz\n",
+ params.chandef.chan->center_freq);
+
+ params.block_tx = !!csa_ie.mode;
+
+ if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev,
+ &params))
+ goto disconnect;
+
+ ieee80211_ibss_csa_mark_radar(sdata);
+
+ return true;
+disconnect:
+ ibss_dbg(sdata, "Can't handle channel switch, disconnect\n");
+ ieee80211_queue_work(&sdata->local->hw,
+ &ifibss->csa_connection_drop_work);
+
+ ieee80211_ibss_csa_mark_radar(sdata);
+
+ return true;
+}
+
+static void
+ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems)
+{
+ int required_len;
+
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ return;
+
+ /* CSA is the only action we handle for now */
+ if (mgmt->u.action.u.measurement.action_code !=
+ WLAN_ACTION_SPCT_CHL_SWITCH)
+ return;
+
+ required_len = IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.chan_switch);
+ if (len < required_len)
+ return;
+
+ if (!sdata->vif.csa_active)
+ ieee80211_ibss_process_chanswitch(sdata, elems, false);
+}
+
+static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ u16 reason = le16_to_cpu(mgmt->u.deauth.reason_code);
+
+ if (len < IEEE80211_DEAUTH_FRAME_LEN)
+ return;
+
+ ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM BSSID=%pM (reason: %d)\n",
+ mgmt->sa, mgmt->da, mgmt->bssid, reason);
+ sta_info_destroy_addr(sdata, mgmt->sa);
+}
+
+static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ u16 auth_alg, auth_transaction;
+
+ sdata_assert_lock(sdata);
+
+ if (len < 24 + 6)
+ return;
+
+ auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+ auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+
+ ibss_dbg(sdata,
+ "RX Auth SA=%pM DA=%pM BSSID=%pM (auth_transaction=%d)\n",
+ mgmt->sa, mgmt->da, mgmt->bssid, auth_transaction);
+
+ if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1)
+ return;
+
+ /*
+ * IEEE 802.11 standard does not require authentication in IBSS
+ * networks and most implementations do not seem to use it.
+ * However, try to reply to authentication attempts if someone
+ * has actually implemented this.
+ */
+ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, 0, NULL, 0,
+ mgmt->sa, sdata->u.ibss.bssid, NULL, 0, 0, 0);
+}
+
+static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems,
+ struct ieee80211_channel *channel)
+{
+ struct sta_info *sta;
+ enum ieee80211_band band = rx_status->band;
+ enum nl80211_bss_scan_width scan_width;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+ bool rates_updated = false;
+ u32 supp_rates = 0;
+
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ return;
+
+ if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid))
+ return;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->sa);
+
+ if (elems->supp_rates) {
+ supp_rates = ieee80211_sta_get_rates(sdata, elems,
+ band, NULL);
+ if (sta) {
+ u32 prev_rates;
+
+ prev_rates = sta->sta.supp_rates[band];
+ /* make sure mandatory rates are always added */
+ scan_width = NL80211_BSS_CHAN_WIDTH_20;
+ if (rx_status->flag & RX_FLAG_5MHZ)
+ scan_width = NL80211_BSS_CHAN_WIDTH_5;
+ if (rx_status->flag & RX_FLAG_10MHZ)
+ scan_width = NL80211_BSS_CHAN_WIDTH_10;
+
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(sband, scan_width);
+ if (sta->sta.supp_rates[band] != prev_rates) {
+ ibss_dbg(sdata,
+ "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n",
+ sta->sta.addr, prev_rates,
+ sta->sta.supp_rates[band]);
+ rates_updated = true;
+ }
+ } else {
+ rcu_read_unlock();
+ sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
+ mgmt->sa, supp_rates);
+ }
+ }
+
+ if (sta && elems->wmm_info && local->hw.queues >= IEEE80211_NUM_ACS)
+ sta->sta.wme = true;
+
+ if (sta && elems->ht_operation && elems->ht_cap_elem &&
+ sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
+ sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 &&
+ sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) {
+ /* we both use HT */
+ struct ieee80211_ht_cap htcap_ie;
+ struct cfg80211_chan_def chandef;
+ enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth;
+
+ ieee80211_ht_oper_to_chandef(channel,
+ elems->ht_operation,
+ &chandef);
+
+ memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie));
+ rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+ &htcap_ie,
+ sta);
+
+ if (elems->vht_operation && elems->vht_cap_elem &&
+ sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 &&
+ sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) {
+ /* we both use VHT */
+ struct ieee80211_vht_cap cap_ie;
+ struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap;
+
+ ieee80211_vht_oper_to_chandef(channel,
+ elems->vht_operation,
+ &chandef);
+ memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
+ ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+ &cap_ie, sta);
+ if (memcmp(&cap, &sta->sta.vht_cap, sizeof(cap)))
+ rates_updated |= true;
+ }
+
+ if (bw != sta->sta.bandwidth)
+ rates_updated |= true;
+
+ if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef,
+ &chandef))
+ WARN_ON_ONCE(1);
+ }
+
+ if (sta && rates_updated) {
+ u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED;
+ u8 rx_nss = sta->sta.rx_nss;
+
+ /* Force rx_nss recalculation */
+ sta->sta.rx_nss = 0;
+ rate_control_rate_init(sta);
+ if (sta->sta.rx_nss != rx_nss)
+ changed |= IEEE80211_RC_NSS_CHANGED;
+
+ drv_sta_rc_update(local, sdata, &sta->sta, changed);
+ }
+
+ rcu_read_unlock();
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct cfg80211_bss *cbss;
+ struct ieee80211_bss *bss;
+ struct ieee80211_channel *channel;
+ u64 beacon_timestamp, rx_timestamp;
+ u32 supp_rates = 0;
+ enum ieee80211_band band = rx_status->band;
+
+ channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq);
+ if (!channel)
+ return;
+
+ ieee80211_update_sta_info(sdata, mgmt, len, rx_status, elems, channel);
+
+ bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+ channel);
+ if (!bss)
+ return;
+
+ cbss = container_of((void *)bss, struct cfg80211_bss, priv);
+
+ /* same for beacon and probe response */
+ beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);
+
+ /* check if we need to merge IBSS */
+
+ /* not an IBSS */
+ if (!(cbss->capability & WLAN_CAPABILITY_IBSS))
+ goto put_bss;
+
+ /* different channel */
+ if (sdata->u.ibss.fixed_channel &&
+ sdata->u.ibss.chandef.chan != cbss->channel)
+ goto put_bss;
+
+ /* different SSID */
+ if (elems->ssid_len != sdata->u.ibss.ssid_len ||
+ memcmp(elems->ssid, sdata->u.ibss.ssid,
+ sdata->u.ibss.ssid_len))
+ goto put_bss;
+
+ /* process channel switch */
+ if (sdata->vif.csa_active ||
+ ieee80211_ibss_process_chanswitch(sdata, elems, true))
+ goto put_bss;
+
+ /* same BSSID */
+ if (ether_addr_equal(cbss->bssid, sdata->u.ibss.bssid))
+ goto put_bss;
+
+ /* we use a fixed BSSID */
+ if (sdata->u.ibss.fixed_bssid)
+ goto put_bss;
+
+ if (ieee80211_have_rx_timestamp(rx_status)) {
+ /* time when timestamp field was received */
+ rx_timestamp =
+ ieee80211_calculate_rx_timestamp(local, rx_status,
+ len + FCS_LEN, 24);
+ } else {
+ /*
+ * second best option: get current TSF
+ * (will return -1 if not supported)
+ */
+ rx_timestamp = drv_get_tsf(local, sdata);
+ }
+
+ ibss_dbg(sdata,
+ "RX beacon SA=%pM BSSID=%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
+ mgmt->sa, mgmt->bssid,
+ (unsigned long long)rx_timestamp,
+ (unsigned long long)beacon_timestamp,
+ (unsigned long long)(rx_timestamp - beacon_timestamp),
+ jiffies);
+
+ if (beacon_timestamp > rx_timestamp) {
+ ibss_dbg(sdata,
+ "beacon TSF higher than local TSF - IBSS merge with BSSID %pM\n",
+ mgmt->bssid);
+ ieee80211_sta_join_ibss(sdata, bss);
+ supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL);
+ ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
+ supp_rates);
+ rcu_read_unlock();
+ }
+
+ put_bss:
+ ieee80211_rx_bss_put(local, bss);
+}
+
+void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const u8 *addr,
+ u32 supp_rates)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_supported_band *sband;
+ enum nl80211_bss_scan_width scan_width;
+ int band;
+
+ /*
+ * XXX: Consider removing the least recently used entry and
+ * allow new one to be added.
+ */
+ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
+ net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n",
+ sdata->name, addr);
+ return;
+ }
+
+ if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH)
+ return;
+
+ if (!ether_addr_equal(bssid, sdata->u.ibss.bssid))
+ return;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON_ONCE(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+ band = chanctx_conf->def.chan->band;
+ scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+ if (!sta)
+ return;
+
+ sta->last_rx = jiffies;
+
+ /* make sure mandatory rates are always added */
+ sband = local->hw.wiphy->bands[band];
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(sband, scan_width);
+
+ spin_lock(&ifibss->incomplete_lock);
+ list_add(&sta->list, &ifibss->incomplete_stations);
+ spin_unlock(&ifibss->incomplete_lock);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta, *tmp;
+ unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT;
+ unsigned long exp_rsn_time = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT;
+
+ mutex_lock(&local->sta_mtx);
+
+ list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+ if (sdata != sta->sdata)
+ continue;
+
+ if (time_after(jiffies, sta->last_rx + exp_time) ||
+ (time_after(jiffies, sta->last_rx + exp_rsn_time) &&
+ sta->sta_state != IEEE80211_STA_AUTHORIZED)) {
+ sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n",
+ sta->sta_state != IEEE80211_STA_AUTHORIZED ?
+ "not authorized " : "", sta->sta.addr);
+
+ WARN_ON(__sta_info_destroy(sta));
+ }
+ }
+
+ mutex_unlock(&local->sta_mtx);
+}
+
+/*
+ * This function is called with state == IEEE80211_IBSS_MLME_JOINED
+ */
+
+static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ enum nl80211_bss_scan_width scan_width;
+
+ sdata_assert_lock(sdata);
+
+ mod_timer(&ifibss->timer,
+ round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
+
+ ieee80211_ibss_sta_expire(sdata);
+
+ if (time_before(jiffies, ifibss->last_scan_completed +
+ IEEE80211_IBSS_MERGE_INTERVAL))
+ return;
+
+ if (ieee80211_sta_active_ibss(sdata))
+ return;
+
+ if (ifibss->fixed_channel)
+ return;
+
+ sdata_info(sdata,
+ "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n");
+
+ scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
+ ieee80211_request_ibss_scan(sdata, ifibss->ssid, ifibss->ssid_len,
+ NULL, 0, scan_width);
+}
+
+static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ u8 bssid[ETH_ALEN];
+ u16 capability;
+ int i;
+
+ sdata_assert_lock(sdata);
+
+ if (ifibss->fixed_bssid) {
+ memcpy(bssid, ifibss->bssid, ETH_ALEN);
+ } else {
+ /* Generate random, not broadcast, locally administered BSSID. Mix in
+ * own MAC address to make sure that devices that do not have proper
+ * random number generator get different BSSID. */
+ get_random_bytes(bssid, ETH_ALEN);
+ for (i = 0; i < ETH_ALEN; i++)
+ bssid[i] ^= sdata->vif.addr[i];
+ bssid[0] &= ~0x01;
+ bssid[0] |= 0x02;
+ }
+
+ sdata_info(sdata, "Creating new IBSS network, BSSID %pM\n", bssid);
+
+ capability = WLAN_CAPABILITY_IBSS;
+
+ if (ifibss->privacy)
+ capability |= WLAN_CAPABILITY_PRIVACY;
+
+ __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int,
+ &ifibss->chandef, ifibss->basic_rates,
+ capability, 0, true);
+}
+
+static unsigned ibss_setup_channels(struct wiphy *wiphy,
+ struct ieee80211_channel **channels,
+ unsigned int channels_max,
+ u32 center_freq, u32 width)
+{
+ struct ieee80211_channel *chan = NULL;
+ unsigned int n_chan = 0;
+ u32 start_freq, end_freq, freq;
+
+ if (width <= 20) {
+ start_freq = center_freq;
+ end_freq = center_freq;
+ } else {
+ start_freq = center_freq - width / 2 + 10;
+ end_freq = center_freq + width / 2 - 10;
+ }
+
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ chan = ieee80211_get_channel(wiphy, freq);
+ if (!chan)
+ continue;
+ if (n_chan >= channels_max)
+ return n_chan;
+
+ channels[n_chan] = chan;
+ n_chan++;
+ }
+
+ return n_chan;
+}
+
+static unsigned int
+ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ struct ieee80211_channel **channels,
+ unsigned int channels_max)
+{
+ unsigned int n_chan = 0;
+ u32 width, cf1, cf2 = 0;
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_40:
+ width = 40;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ cf2 = chandef->center_freq2;
+ /* fall through */
+ case NL80211_CHAN_WIDTH_80:
+ width = 80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ width = 160;
+ break;
+ default:
+ width = 20;
+ break;
+ }
+
+ cf1 = chandef->center_freq1;
+
+ n_chan = ibss_setup_channels(wiphy, channels, channels_max, cf1, width);
+
+ if (cf2)
+ n_chan += ibss_setup_channels(wiphy, &channels[n_chan],
+ channels_max - n_chan, cf2,
+ width);
+
+ return n_chan;
+}
+
+/*
+ * This function is called with state == IEEE80211_IBSS_MLME_SEARCH
+ */
+
+static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct cfg80211_bss *cbss;
+ struct ieee80211_channel *chan = NULL;
+ const u8 *bssid = NULL;
+ enum nl80211_bss_scan_width scan_width;
+ int active_ibss;
+
+ sdata_assert_lock(sdata);
+
+ active_ibss = ieee80211_sta_active_ibss(sdata);
+ ibss_dbg(sdata, "sta_find_ibss (active_ibss=%d)\n", active_ibss);
+
+ if (active_ibss)
+ return;
+
+ if (ifibss->fixed_bssid)
+ bssid = ifibss->bssid;
+ if (ifibss->fixed_channel)
+ chan = ifibss->chandef.chan;
+ if (!is_zero_ether_addr(ifibss->bssid))
+ bssid = ifibss->bssid;
+ cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid,
+ ifibss->ssid, ifibss->ssid_len,
+ IEEE80211_BSS_TYPE_IBSS,
+ IEEE80211_PRIVACY(ifibss->privacy));
+
+ if (cbss) {
+ struct ieee80211_bss *bss;
+
+ bss = (void *)cbss->priv;
+ ibss_dbg(sdata,
+ "sta_find_ibss: selected %pM current %pM\n",
+ cbss->bssid, ifibss->bssid);
+ sdata_info(sdata,
+ "Selected IBSS BSSID %pM based on configured SSID\n",
+ cbss->bssid);
+
+ ieee80211_sta_join_ibss(sdata, bss);
+ ieee80211_rx_bss_put(local, bss);
+ return;
+ }
+
+ /* if a fixed bssid and a fixed freq have been provided create the IBSS
+ * directly and do not waste time scanning
+ */
+ if (ifibss->fixed_bssid && ifibss->fixed_channel) {
+ sdata_info(sdata, "Created IBSS using preconfigured BSSID %pM\n",
+ bssid);
+ ieee80211_sta_create_ibss(sdata);
+ return;
+ }
+
+
+ ibss_dbg(sdata, "sta_find_ibss: did not try to join ibss\n");
+
+ /* Selected IBSS not found in current scan results - try to scan */
+ if (time_after(jiffies, ifibss->last_scan_completed +
+ IEEE80211_SCAN_INTERVAL)) {
+ struct ieee80211_channel *channels[8];
+ unsigned int num;
+
+ sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");
+
+ num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
+ &ifibss->chandef,
+ channels,
+ ARRAY_SIZE(channels));
+ scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
+ ieee80211_request_ibss_scan(sdata, ifibss->ssid,
+ ifibss->ssid_len, channels, num,
+ scan_width);
+ } else {
+ int interval = IEEE80211_SCAN_INTERVAL;
+
+ if (time_after(jiffies, ifibss->ibss_join_req +
+ IEEE80211_IBSS_JOIN_TIMEOUT))
+ ieee80211_sta_create_ibss(sdata);
+
+ mod_timer(&ifibss->timer,
+ round_jiffies(jiffies + interval));
+ }
+}
+
+static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *req)
+{
+ struct ieee80211_mgmt *mgmt = (void *)req->data;
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int tx_last_beacon, len = req->len;
+ struct sk_buff *skb;
+ struct beacon_data *presp;
+ u8 *pos, *end;
+
+ sdata_assert_lock(sdata);
+
+ presp = rcu_dereference_protected(ifibss->presp,
+ lockdep_is_held(&sdata->wdev.mtx));
+
+ if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
+ len < 24 + 2 || !presp)
+ return;
+
+ tx_last_beacon = drv_tx_last_beacon(local);
+
+ ibss_dbg(sdata,
+ "RX ProbeReq SA=%pM DA=%pM BSSID=%pM (tx_last_beacon=%d)\n",
+ mgmt->sa, mgmt->da, mgmt->bssid, tx_last_beacon);
+
+ if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da))
+ return;
+
+ if (!ether_addr_equal(mgmt->bssid, ifibss->bssid) &&
+ !is_broadcast_ether_addr(mgmt->bssid))
+ return;
+
+ end = ((u8 *) mgmt) + len;
+ pos = mgmt->u.probe_req.variable;
+ if (pos[0] != WLAN_EID_SSID ||
+ pos + 2 + pos[1] > end) {
+ ibss_dbg(sdata, "Invalid SSID IE in ProbeReq from %pM\n",
+ mgmt->sa);
+ return;
+ }
+ if (pos[1] != 0 &&
+ (pos[1] != ifibss->ssid_len ||
+ memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) {
+ /* Ignore ProbeReq for foreign SSID */
+ return;
+ }
+
+ /* Reply with ProbeResp */
+ skb = dev_alloc_skb(local->tx_headroom + presp->head_len);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->tx_headroom);
+ memcpy(skb_put(skb, presp->head_len), presp->head, presp->head_len);
+
+ memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN);
+ ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa);
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+ /* avoid excessive retries for probe request to wildcard SSIDs */
+ if (pos[1] == 0)
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_NO_ACK;
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+static
+void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) !=
+ offsetof(typeof(mgmt->u.beacon), variable));
+
+ /*
+ * either beacon or probe_resp but the variable field is at the
+ * same offset
+ */
+ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+ false, &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+}
+
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+ struct ieee802_11_elems elems;
+ int ies_len;
+
+ rx_status = IEEE80211_SKB_RXCB(skb);
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ sdata_lock(sdata);
+
+ if (!sdata->u.ibss.ssid_len)
+ goto mgmt_out; /* not ready to merge yet */
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_REQ:
+ ieee80211_rx_mgmt_probe_req(sdata, skb);
+ break;
+ case IEEE80211_STYPE_PROBE_RESP:
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_probe_beacon(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_DEAUTH:
+ ieee80211_rx_mgmt_deauth_ibss(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ACTION:
+ switch (mgmt->u.action.category) {
+ case WLAN_CATEGORY_SPECTRUM_MGMT:
+ ies_len = skb->len -
+ offsetof(struct ieee80211_mgmt,
+ u.action.u.chan_switch.variable);
+
+ if (ies_len < 0)
+ break;
+
+ ieee802_11_parse_elems(
+ mgmt->u.action.u.chan_switch.variable,
+ ies_len, true, &elems);
+
+ if (elems.parse_error)
+ break;
+
+ ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, skb->len,
+ rx_status, &elems);
+ break;
+ }
+ }
+
+ mgmt_out:
+ sdata_unlock(sdata);
+}
+
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct sta_info *sta;
+
+ sdata_lock(sdata);
+
+ /*
+ * Work could be scheduled after scan or similar
+ * when we aren't even joined (or trying) with a
+ * network.
+ */
+ if (!ifibss->ssid_len)
+ goto out;
+
+ spin_lock_bh(&ifibss->incomplete_lock);
+ while (!list_empty(&ifibss->incomplete_stations)) {
+ sta = list_first_entry(&ifibss->incomplete_stations,
+ struct sta_info, list);
+ list_del(&sta->list);
+ spin_unlock_bh(&ifibss->incomplete_lock);
+
+ ieee80211_ibss_finish_sta(sta);
+ rcu_read_unlock();
+ spin_lock_bh(&ifibss->incomplete_lock);
+ }
+ spin_unlock_bh(&ifibss->incomplete_lock);
+
+ switch (ifibss->state) {
+ case IEEE80211_IBSS_MLME_SEARCH:
+ ieee80211_sta_find_ibss(sdata);
+ break;
+ case IEEE80211_IBSS_MLME_JOINED:
+ ieee80211_sta_merge_ibss(sdata);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+
+ out:
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_ibss_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ setup_timer(&ifibss->timer, ieee80211_ibss_timer,
+ (unsigned long) sdata);
+ INIT_LIST_HEAD(&ifibss->incomplete_stations);
+ spin_lock_init(&ifibss->incomplete_lock);
+ INIT_WORK(&ifibss->csa_connection_drop_work,
+ ieee80211_csa_connection_drop_work);
+}
+
+/* scan finished notification */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ continue;
+ sdata->u.ibss.last_scan_completed = jiffies;
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ }
+ mutex_unlock(&local->iflist_mtx);
+}
+
+int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_ibss_params *params)
+{
+ u32 changed = 0;
+ u32 rate_flags;
+ struct ieee80211_supported_band *sband;
+ enum ieee80211_chanctx_mode chanmode;
+ struct ieee80211_local *local = sdata->local;
+ int radar_detect_width = 0;
+ int i;
+ int ret;
+
+ ret = cfg80211_chandef_dfs_required(local->hw.wiphy,
+ &params->chandef,
+ sdata->wdev.iftype);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0) {
+ if (!params->userspace_handles_dfs)
+ return -EINVAL;
+ radar_detect_width = BIT(params->chandef.width);
+ }
+
+ chanmode = (params->channel_fixed && !ret) ?
+ IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE;
+
+ mutex_lock(&local->chanctx_mtx);
+ ret = ieee80211_check_combinations(sdata, &params->chandef, chanmode,
+ radar_detect_width);
+ mutex_unlock(&local->chanctx_mtx);
+ if (ret < 0)
+ return ret;
+
+ if (params->bssid) {
+ memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN);
+ sdata->u.ibss.fixed_bssid = true;
+ } else
+ sdata->u.ibss.fixed_bssid = false;
+
+ sdata->u.ibss.privacy = params->privacy;
+ sdata->u.ibss.control_port = params->control_port;
+ sdata->u.ibss.userspace_handles_dfs = params->userspace_handles_dfs;
+ sdata->u.ibss.basic_rates = params->basic_rates;
+ sdata->u.ibss.last_scan_completed = jiffies;
+
+ /* fix basic_rates if channel does not support these rates */
+ rate_flags = ieee80211_chandef_rate_flags(&params->chandef);
+ sband = local->hw.wiphy->bands[params->chandef.chan->band];
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ sdata->u.ibss.basic_rates &= ~BIT(i);
+ }
+ memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate,
+ sizeof(params->mcast_rate));
+
+ sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+
+ sdata->u.ibss.chandef = params->chandef;
+ sdata->u.ibss.fixed_channel = params->channel_fixed;
+
+ if (params->ie) {
+ sdata->u.ibss.ie = kmemdup(params->ie, params->ie_len,
+ GFP_KERNEL);
+ if (sdata->u.ibss.ie)
+ sdata->u.ibss.ie_len = params->ie_len;
+ }
+
+ sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH;
+ sdata->u.ibss.ibss_join_req = jiffies;
+
+ memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len);
+ sdata->u.ibss.ssid_len = params->ssid_len;
+
+ memcpy(&sdata->u.ibss.ht_capa, &params->ht_capa,
+ sizeof(sdata->u.ibss.ht_capa));
+ memcpy(&sdata->u.ibss.ht_capa_mask, &params->ht_capa_mask,
+ sizeof(sdata->u.ibss.ht_capa_mask));
+
+ /*
+ * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is
+ * reserved, but an HT STA shall protect HT transmissions as though
+ * the HT Protection field were set to non-HT mixed mode.
+ *
+ * In an IBSS, the RIFS Mode field of the HT Operation element is
+ * also reserved, but an HT STA shall operate as though this field
+ * were set to 1.
+ */
+
+ sdata->vif.bss_conf.ht_operation_mode |=
+ IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED
+ | IEEE80211_HT_PARAM_RIFS_MODE;
+
+ changed |= BSS_CHANGED_HT;
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ sdata->needed_rx_chains = local->rx_chains;
+
+ ieee80211_queue_work(&local->hw, &sdata->work);
+
+ return 0;
+}
+
+int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ ieee80211_ibss_disconnect(sdata);
+ ifibss->ssid_len = 0;
+ eth_zero_addr(ifibss->bssid);
+
+ /* remove beacon */
+ kfree(sdata->u.ibss.ie);
+
+ /* on the next join, re-program HT parameters */
+ memset(&ifibss->ht_capa, 0, sizeof(ifibss->ht_capa));
+ memset(&ifibss->ht_capa_mask, 0, sizeof(ifibss->ht_capa_mask));
+
+ synchronize_rcu();
+
+ skb_queue_purge(&sdata->skb_queue);
+
+ del_timer_sync(&sdata->u.ibss.timer);
+
+ return 0;
+}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
new file mode 100644
index 000000000..c0a9187bc
--- /dev/null
+++ b/net/mac80211/ieee80211_i.h
@@ -0,0 +1,2074 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_I_H
+#define IEEE80211_I_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/etherdevice.h>
+#include <linux/leds.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/ieee80211_radiotap.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include "key.h"
+#include "sta_info.h"
+#include "debug.h"
+
+struct ieee80211_local;
+
+/* Maximum number of broadcast/multicast frames to buffer when some of the
+ * associated stations are using power saving. */
+#define AP_MAX_BC_BUFFER 128
+
+/* Maximum number of frames buffered to all STAs, including multicast frames.
+ * Note: increasing this limit increases the potential memory requirement. Each
+ * frame can be up to about 2 kB long. */
+#define TOTAL_MAX_TX_BUFFER 512
+
+/* Required encryption head and tailroom */
+#define IEEE80211_ENCRYPT_HEADROOM 8
+#define IEEE80211_ENCRYPT_TAILROOM 18
+
+/* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent
+ * reception of at least three fragmented frames. This limit can be increased
+ * by changing this define, at the cost of slower frame reassembly and
+ * increased memory use (about 2 kB of RAM per entry). */
+#define IEEE80211_FRAGMENT_MAX 4
+
+/* power level hasn't been configured (or set to automatic) */
+#define IEEE80211_UNSET_POWER_LEVEL INT_MIN
+
+/*
+ * Some APs experience problems when working with U-APSD. Decreasing the
+ * probability of that happening by using legacy mode for all ACs but VO isn't
+ * enough.
+ *
+ * Cisco 4410N originally forced us to enable VO by default only because it
+ * treated non-VO ACs as legacy.
+ *
+ * However some APs (notably Netgear R7000) silently reclassify packets to
+ * different ACs. Since u-APSD ACs require trigger frames for frame retrieval
+ * clients would never see some frames (e.g. ARP responses) or would fetch them
+ * accidentally after a long time.
+ *
+ * It makes little sense to enable u-APSD queues by default because it needs
+ * userspace applications to be aware of it to actually take advantage of the
+ * possible additional powersavings. Implicitly depending on driver autotrigger
+ * frame support doesn't make much sense.
+ */
+#define IEEE80211_DEFAULT_UAPSD_QUEUES 0
+
+#define IEEE80211_DEFAULT_MAX_SP_LEN \
+ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
+
+#define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */)
+
+struct ieee80211_fragment_entry {
+ unsigned long first_frag_time;
+ unsigned int seq;
+ unsigned int rx_queue;
+ unsigned int last_frag;
+ unsigned int extra_len;
+ struct sk_buff_head skb_list;
+ int ccmp; /* Whether fragments were encrypted with CCMP */
+ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
+};
+
+
+struct ieee80211_bss {
+ u32 device_ts_beacon, device_ts_presp;
+
+ bool wmm_used;
+ bool uapsd_supported;
+
+#define IEEE80211_MAX_SUPP_RATES 32
+ u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
+ size_t supp_rates_len;
+ struct ieee80211_rate *beacon_rate;
+
+ /*
+ * During association, we save an ERP value from a probe response so
+ * that we can feed ERP info to the driver when handling the
+ * association completes. these fields probably won't be up-to-date
+ * otherwise, you probably don't want to use them.
+ */
+ bool has_erp_value;
+ u8 erp_value;
+
+ /* Keep track of the corruption of the last beacon/probe response. */
+ u8 corrupt_data;
+
+ /* Keep track of what bits of information we have valid info for. */
+ u8 valid_data;
+};
+
+/**
+ * enum ieee80211_corrupt_data_flags - BSS data corruption flags
+ * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted
+ * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted
+ *
+ * These are bss flags that are attached to a bss in the
+ * @corrupt_data field of &struct ieee80211_bss.
+ */
+enum ieee80211_bss_corrupt_data_flags {
+ IEEE80211_BSS_CORRUPT_BEACON = BIT(0),
+ IEEE80211_BSS_CORRUPT_PROBE_RESP = BIT(1)
+};
+
+/**
+ * enum ieee80211_valid_data_flags - BSS valid data flags
+ * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE
+ * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE
+ * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE
+ *
+ * These are bss flags that are attached to a bss in the
+ * @valid_data field of &struct ieee80211_bss. They show which parts
+ * of the data structure were received as a result of an un-corrupted
+ * beacon/probe response.
+ */
+enum ieee80211_bss_valid_data_flags {
+ IEEE80211_BSS_VALID_WMM = BIT(1),
+ IEEE80211_BSS_VALID_RATES = BIT(2),
+ IEEE80211_BSS_VALID_ERP = BIT(3)
+};
+
+typedef unsigned __bitwise__ ieee80211_tx_result;
+#define TX_CONTINUE ((__force ieee80211_tx_result) 0u)
+#define TX_DROP ((__force ieee80211_tx_result) 1u)
+#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
+
+#define IEEE80211_TX_UNICAST BIT(1)
+#define IEEE80211_TX_PS_BUFFERED BIT(2)
+
+struct ieee80211_tx_data {
+ struct sk_buff *skb;
+ struct sk_buff_head skbs;
+ struct ieee80211_local *local;
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ struct ieee80211_key *key;
+ struct ieee80211_tx_rate rate;
+
+ unsigned int flags;
+};
+
+
+typedef unsigned __bitwise__ ieee80211_rx_result;
+#define RX_CONTINUE ((__force ieee80211_rx_result) 0u)
+#define RX_DROP_UNUSABLE ((__force ieee80211_rx_result) 1u)
+#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
+#define RX_QUEUED ((__force ieee80211_rx_result) 3u)
+
+/**
+ * enum ieee80211_packet_rx_flags - packet RX flags
+ * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
+ * (incl. multicast frames)
+ * @IEEE80211_RX_FRAGMENTED: fragmented frame
+ * @IEEE80211_RX_AMSDU: a-MSDU packet
+ * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
+ * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering
+ *
+ * These are per-frame flags that are attached to a frame in the
+ * @rx_flags field of &struct ieee80211_rx_status.
+ */
+enum ieee80211_packet_rx_flags {
+ IEEE80211_RX_RA_MATCH = BIT(1),
+ IEEE80211_RX_FRAGMENTED = BIT(2),
+ IEEE80211_RX_AMSDU = BIT(3),
+ IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4),
+ IEEE80211_RX_DEFERRED_RELEASE = BIT(5),
+};
+
+/**
+ * enum ieee80211_rx_flags - RX data flags
+ *
+ * @IEEE80211_RX_CMNTR: received on cooked monitor already
+ * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
+ * to cfg80211_report_obss_beacon().
+ * @IEEE80211_RX_REORDER_TIMER: this frame is released by the
+ * reorder buffer timeout timer, not the normal RX path
+ *
+ * These flags are used across handling multiple interfaces
+ * for a single frame.
+ */
+enum ieee80211_rx_flags {
+ IEEE80211_RX_CMNTR = BIT(0),
+ IEEE80211_RX_BEACON_REPORTED = BIT(1),
+ IEEE80211_RX_REORDER_TIMER = BIT(2),
+};
+
+struct ieee80211_rx_data {
+ struct sk_buff *skb;
+ struct ieee80211_local *local;
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ struct ieee80211_key *key;
+
+ unsigned int flags;
+
+ /*
+ * Index into sequence numbers array, 0..16
+ * since the last (16) is used for non-QoS,
+ * will be 16 on non-QoS frames.
+ */
+ int seqno_idx;
+
+ /*
+ * Index into the security IV/PN arrays, 0..16
+ * since the last (16) is used for CCMP-encrypted
+ * management frames, will be set to 16 on mgmt
+ * frames and 0 on non-QoS frames.
+ */
+ int security_idx;
+
+ u32 tkip_iv32;
+ u16 tkip_iv16;
+};
+
+struct ieee80211_csa_settings {
+ const u16 *counter_offsets_beacon;
+ const u16 *counter_offsets_presp;
+
+ int n_counter_offsets_beacon;
+ int n_counter_offsets_presp;
+
+ u8 count;
+};
+
+struct beacon_data {
+ u8 *head, *tail;
+ int head_len, tail_len;
+ struct ieee80211_meshconf_ie *meshconf;
+ u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
+ u8 csa_current_counter;
+ struct rcu_head rcu_head;
+};
+
+struct probe_resp {
+ struct rcu_head rcu_head;
+ int len;
+ u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM];
+ u8 data[0];
+};
+
+struct ps_data {
+ /* yes, this looks ugly, but guarantees that we can later use
+ * bitmap_empty :)
+ * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */
+ u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]
+ __aligned(__alignof__(unsigned long));
+ struct sk_buff_head bc_buf;
+ atomic_t num_sta_ps; /* number of stations in PS mode */
+ int dtim_count;
+ bool dtim_bc_mc;
+};
+
+struct ieee80211_if_ap {
+ struct beacon_data __rcu *beacon;
+ struct probe_resp __rcu *probe_resp;
+
+ /* to be used after channel switch. */
+ struct cfg80211_beacon_data *next_beacon;
+ struct list_head vlans; /* write-protected with RTNL and local->mtx */
+
+ struct ps_data ps;
+ atomic_t num_mcast_sta; /* number of stations receiving multicast */
+ enum ieee80211_smps_mode req_smps, /* requested smps mode */
+ driver_smps_mode; /* smps mode request */
+
+ struct work_struct request_smps_work;
+};
+
+struct ieee80211_if_wds {
+ struct sta_info *sta;
+ u8 remote_addr[ETH_ALEN];
+};
+
+struct ieee80211_if_vlan {
+ struct list_head list; /* write-protected with RTNL and local->mtx */
+
+ /* used for all tx if the VLAN is configured to 4-addr mode */
+ struct sta_info __rcu *sta;
+};
+
+struct mesh_stats {
+ __u32 fwded_mcast; /* Mesh forwarded multicast frames */
+ __u32 fwded_unicast; /* Mesh forwarded unicast frames */
+ __u32 fwded_frames; /* Mesh total forwarded frames */
+ __u32 dropped_frames_ttl; /* Not transmitted since mesh_ttl == 0*/
+ __u32 dropped_frames_no_route; /* Not transmitted, no route found */
+ __u32 dropped_frames_congestion;/* Not forwarded due to congestion */
+};
+
+#define PREQ_Q_F_START 0x1
+#define PREQ_Q_F_REFRESH 0x2
+struct mesh_preq_queue {
+ struct list_head list;
+ u8 dst[ETH_ALEN];
+ u8 flags;
+};
+
+struct ieee80211_roc_work {
+ struct list_head list;
+ struct list_head dependents;
+
+ struct delayed_work work;
+
+ struct ieee80211_sub_if_data *sdata;
+
+ struct ieee80211_channel *chan;
+
+ bool started, abort, hw_begun, notified;
+ bool to_be_freed;
+ bool on_channel;
+
+ unsigned long hw_start_time;
+
+ u32 duration, req_duration;
+ struct sk_buff *frame;
+ u64 cookie, mgmt_tx_cookie;
+ enum ieee80211_roc_type type;
+};
+
+/* flags used in struct ieee80211_if_managed.flags */
+enum ieee80211_sta_flags {
+ IEEE80211_STA_CONNECTION_POLL = BIT(1),
+ IEEE80211_STA_CONTROL_PORT = BIT(2),
+ IEEE80211_STA_DISABLE_HT = BIT(4),
+ IEEE80211_STA_MFP_ENABLED = BIT(6),
+ IEEE80211_STA_UAPSD_ENABLED = BIT(7),
+ IEEE80211_STA_NULLFUNC_ACKED = BIT(8),
+ IEEE80211_STA_RESET_SIGNAL_AVE = BIT(9),
+ IEEE80211_STA_DISABLE_40MHZ = BIT(10),
+ IEEE80211_STA_DISABLE_VHT = BIT(11),
+ IEEE80211_STA_DISABLE_80P80MHZ = BIT(12),
+ IEEE80211_STA_DISABLE_160MHZ = BIT(13),
+ IEEE80211_STA_DISABLE_WMM = BIT(14),
+ IEEE80211_STA_ENABLE_RRM = BIT(15),
+};
+
+struct ieee80211_mgd_auth_data {
+ struct cfg80211_bss *bss;
+ unsigned long timeout;
+ int tries;
+ u16 algorithm, expected_transaction;
+
+ u8 key[WLAN_KEY_LEN_WEP104];
+ u8 key_len, key_idx;
+ bool done;
+ bool timeout_started;
+
+ u16 sae_trans, sae_status;
+ size_t data_len;
+ u8 data[];
+};
+
+struct ieee80211_mgd_assoc_data {
+ struct cfg80211_bss *bss;
+ const u8 *supp_rates;
+
+ unsigned long timeout;
+ int tries;
+
+ u16 capability;
+ u8 prev_bssid[ETH_ALEN];
+ u8 ssid[IEEE80211_MAX_SSID_LEN];
+ u8 ssid_len;
+ u8 supp_rates_len;
+ bool wmm, uapsd;
+ bool need_beacon;
+ bool synced;
+ bool timeout_started;
+
+ u8 ap_ht_param;
+
+ struct ieee80211_vht_cap ap_vht_cap;
+
+ size_t ie_len;
+ u8 ie[];
+};
+
+struct ieee80211_sta_tx_tspec {
+ /* timestamp of the first packet in the time slice */
+ unsigned long time_slice_start;
+
+ u32 admitted_time; /* in usecs, unlike over the air */
+ u8 tsid;
+ s8 up; /* signed to be able to invalidate with -1 during teardown */
+
+ /* consumed TX time in microseconds in the time slice */
+ u32 consumed_tx_time;
+ enum {
+ TX_TSPEC_ACTION_NONE = 0,
+ TX_TSPEC_ACTION_DOWNGRADE,
+ TX_TSPEC_ACTION_STOP_DOWNGRADE,
+ } action;
+ bool downgraded;
+};
+
+struct ieee80211_if_managed {
+ struct timer_list timer;
+ struct timer_list conn_mon_timer;
+ struct timer_list bcn_mon_timer;
+ struct timer_list chswitch_timer;
+ struct work_struct monitor_work;
+ struct work_struct chswitch_work;
+ struct work_struct beacon_connection_loss_work;
+ struct work_struct csa_connection_drop_work;
+
+ unsigned long beacon_timeout;
+ unsigned long probe_timeout;
+ int probe_send_count;
+ bool nullfunc_failed;
+ bool connection_loss;
+
+ struct cfg80211_bss *associated;
+ struct ieee80211_mgd_auth_data *auth_data;
+ struct ieee80211_mgd_assoc_data *assoc_data;
+
+ u8 bssid[ETH_ALEN];
+
+ u16 aid;
+
+ bool powersave; /* powersave requested for this iface */
+ bool broken_ap; /* AP is broken -- turn off powersave */
+ bool have_beacon;
+ u8 dtim_period;
+ enum ieee80211_smps_mode req_smps, /* requested smps mode */
+ driver_smps_mode; /* smps mode request */
+
+ struct work_struct request_smps_work;
+
+ unsigned int flags;
+
+ bool csa_waiting_bcn;
+ bool csa_ignored_same_chan;
+
+ bool beacon_crc_valid;
+ u32 beacon_crc;
+
+ bool status_acked;
+ bool status_received;
+ __le16 status_fc;
+
+ enum {
+ IEEE80211_MFP_DISABLED,
+ IEEE80211_MFP_OPTIONAL,
+ IEEE80211_MFP_REQUIRED
+ } mfp; /* management frame protection */
+
+ /*
+ * Bitmask of enabled u-apsd queues,
+ * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association
+ * to take effect.
+ */
+ unsigned int uapsd_queues;
+
+ /*
+ * Maximum number of buffered frames AP can deliver during a
+ * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar.
+ * Needs a new association to take effect.
+ */
+ unsigned int uapsd_max_sp_len;
+
+ int wmm_last_param_set;
+
+ u8 use_4addr;
+
+ s16 p2p_noa_index;
+
+ /* Signal strength from the last Beacon frame in the current BSS. */
+ int last_beacon_signal;
+
+ /*
+ * Weighted average of the signal strength from Beacon frames in the
+ * current BSS. This is in units of 1/16 of the signal unit to maintain
+ * accuracy and to speed up calculations, i.e., the value need to be
+ * divided by 16 to get the actual value.
+ */
+ int ave_beacon_signal;
+
+ /*
+ * Number of Beacon frames used in ave_beacon_signal. This can be used
+ * to avoid generating less reliable cqm events that would be based
+ * only on couple of received frames.
+ */
+ unsigned int count_beacon_signal;
+
+ /*
+ * Last Beacon frame signal strength average (ave_beacon_signal / 16)
+ * that triggered a cqm event. 0 indicates that no event has been
+ * generated for the current association.
+ */
+ int last_cqm_event_signal;
+
+ /*
+ * State variables for keeping track of RSSI of the AP currently
+ * connected to and informing driver when RSSI has gone
+ * below/above a certain threshold.
+ */
+ int rssi_min_thold, rssi_max_thold;
+ int last_ave_beacon_signal;
+
+ struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
+ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
+ struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */
+ struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */
+
+ /* TDLS support */
+ u8 tdls_peer[ETH_ALEN] __aligned(2);
+ struct delayed_work tdls_peer_del_work;
+ struct sk_buff *orig_teardown_skb; /* The original teardown skb */
+ struct sk_buff *teardown_skb; /* A copy to send through the AP */
+ spinlock_t teardown_lock; /* To lock changing teardown_skb */
+ bool tdls_chan_switch_prohibited;
+
+ /* WMM-AC TSPEC support */
+ struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS];
+ /* Use a separate work struct so that we can do something here
+ * while the sdata->work is flushing the queues, for example.
+ * otherwise, in scenarios where we hardly get any traffic out
+ * on the BE queue, but there's a lot of VO traffic, we might
+ * get stuck in a downgraded situation and flush takes forever.
+ */
+ struct delayed_work tx_tspec_wk;
+};
+
+struct ieee80211_if_ibss {
+ struct timer_list timer;
+ struct work_struct csa_connection_drop_work;
+
+ unsigned long last_scan_completed;
+
+ u32 basic_rates;
+
+ bool fixed_bssid;
+ bool fixed_channel;
+ bool privacy;
+
+ bool control_port;
+ bool userspace_handles_dfs;
+
+ u8 bssid[ETH_ALEN] __aligned(2);
+ u8 ssid[IEEE80211_MAX_SSID_LEN];
+ u8 ssid_len, ie_len;
+ u8 *ie;
+ struct cfg80211_chan_def chandef;
+
+ unsigned long ibss_join_req;
+ /* probe response/beacon for IBSS */
+ struct beacon_data __rcu *presp;
+
+ struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
+ struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
+
+ spinlock_t incomplete_lock;
+ struct list_head incomplete_stations;
+
+ enum {
+ IEEE80211_IBSS_MLME_SEARCH,
+ IEEE80211_IBSS_MLME_JOINED,
+ } state;
+};
+
+/**
+ * struct ieee80211_if_ocb - OCB mode state
+ *
+ * @housekeeping_timer: timer for periodic invocation of a housekeeping task
+ * @wrkq_flags: OCB deferred task action
+ * @incomplete_lock: delayed STA insertion lock
+ * @incomplete_stations: list of STAs waiting for delayed insertion
+ * @joined: indication if the interface is connected to an OCB network
+ */
+struct ieee80211_if_ocb {
+ struct timer_list housekeeping_timer;
+ unsigned long wrkq_flags;
+
+ spinlock_t incomplete_lock;
+ struct list_head incomplete_stations;
+
+ bool joined;
+};
+
+/**
+ * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface
+ *
+ * these declarations define the interface, which enables
+ * vendor-specific mesh synchronization
+ *
+ */
+struct ieee802_11_elems;
+struct ieee80211_mesh_sync_ops {
+ void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata,
+ u16 stype,
+ struct ieee80211_mgmt *mgmt,
+ struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status);
+
+ /* should be called with beacon_data under RCU read lock */
+ void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon);
+ /* add other framework functions here */
+};
+
+struct mesh_csa_settings {
+ struct rcu_head rcu_head;
+ struct cfg80211_csa_settings settings;
+};
+
+struct ieee80211_if_mesh {
+ struct timer_list housekeeping_timer;
+ struct timer_list mesh_path_timer;
+ struct timer_list mesh_path_root_timer;
+
+ unsigned long wrkq_flags;
+ unsigned long mbss_changed;
+
+ u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
+ size_t mesh_id_len;
+ /* Active Path Selection Protocol Identifier */
+ u8 mesh_pp_id;
+ /* Active Path Selection Metric Identifier */
+ u8 mesh_pm_id;
+ /* Congestion Control Mode Identifier */
+ u8 mesh_cc_id;
+ /* Synchronization Protocol Identifier */
+ u8 mesh_sp_id;
+ /* Authentication Protocol Identifier */
+ u8 mesh_auth_id;
+ /* Local mesh Sequence Number */
+ u32 sn;
+ /* Last used PREQ ID */
+ u32 preq_id;
+ atomic_t mpaths;
+ /* Timestamp of last SN update */
+ unsigned long last_sn_update;
+ /* Time when it's ok to send next PERR */
+ unsigned long next_perr;
+ /* Timestamp of last PREQ sent */
+ unsigned long last_preq;
+ struct mesh_rmc *rmc;
+ spinlock_t mesh_preq_queue_lock;
+ struct mesh_preq_queue preq_queue;
+ int preq_queue_len;
+ struct mesh_stats mshstats;
+ struct mesh_config mshcfg;
+ atomic_t estab_plinks;
+ u32 mesh_seqnum;
+ bool accepting_plinks;
+ int num_gates;
+ struct beacon_data __rcu *beacon;
+ const u8 *ie;
+ u8 ie_len;
+ enum {
+ IEEE80211_MESH_SEC_NONE = 0x0,
+ IEEE80211_MESH_SEC_AUTHED = 0x1,
+ IEEE80211_MESH_SEC_SECURED = 0x2,
+ } security;
+ bool user_mpm;
+ /* Extensible Synchronization Framework */
+ const struct ieee80211_mesh_sync_ops *sync_ops;
+ s64 sync_offset_clockdrift_max;
+ spinlock_t sync_offset_lock;
+ bool adjusting_tbtt;
+ /* mesh power save */
+ enum nl80211_mesh_power_mode nonpeer_pm;
+ int ps_peers_light_sleep;
+ int ps_peers_deep_sleep;
+ struct ps_data ps;
+ /* Channel Switching Support */
+ struct mesh_csa_settings __rcu *csa;
+ enum {
+ IEEE80211_MESH_CSA_ROLE_NONE,
+ IEEE80211_MESH_CSA_ROLE_INIT,
+ IEEE80211_MESH_CSA_ROLE_REPEATER,
+ } csa_role;
+ u8 chsw_ttl;
+ u16 pre_value;
+
+ /* offset from skb->data while building IE */
+ int meshconf_offset;
+};
+
+#ifdef CONFIG_MAC80211_MESH
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \
+ do { (msh)->mshstats.name++; } while (0)
+#else
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \
+ do { } while (0)
+#endif
+
+/**
+ * enum ieee80211_sub_if_data_flags - virtual interface flags
+ *
+ * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
+ * @IEEE80211_SDATA_PROMISC: interface is promisc
+ * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode
+ * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
+ * associated stations and deliver multicast frames both
+ * back to wireless media and to the local net stack.
+ * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
+ * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
+ */
+enum ieee80211_sub_if_data_flags {
+ IEEE80211_SDATA_ALLMULTI = BIT(0),
+ IEEE80211_SDATA_PROMISC = BIT(1),
+ IEEE80211_SDATA_OPERATING_GMODE = BIT(2),
+ IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3),
+ IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4),
+ IEEE80211_SDATA_IN_DRIVER = BIT(5),
+};
+
+/**
+ * enum ieee80211_sdata_state_bits - virtual interface state bits
+ * @SDATA_STATE_RUNNING: virtual interface is up & running; this
+ * mirrors netif_running() but is separate for interface type
+ * change handling while the interface is up
+ * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
+ * mode, so queues are stopped
+ * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: Beaconing was stopped due
+ * to offchannel, reset when offchannel returns
+ */
+enum ieee80211_sdata_state_bits {
+ SDATA_STATE_RUNNING,
+ SDATA_STATE_OFFCHANNEL,
+ SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
+};
+
+/**
+ * enum ieee80211_chanctx_mode - channel context configuration mode
+ *
+ * @IEEE80211_CHANCTX_SHARED: channel context may be used by
+ * multiple interfaces
+ * @IEEE80211_CHANCTX_EXCLUSIVE: channel context can be used
+ * only by a single interface. This can be used for example for
+ * non-fixed channel IBSS.
+ */
+enum ieee80211_chanctx_mode {
+ IEEE80211_CHANCTX_SHARED,
+ IEEE80211_CHANCTX_EXCLUSIVE
+};
+
+/**
+ * enum ieee80211_chanctx_replace_state - channel context replacement state
+ *
+ * This is used for channel context in-place reservations that require channel
+ * context switch/swap.
+ *
+ * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place
+ * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context will be replaced
+ * by a (not yet registered) channel context pointed by %replace_ctx.
+ * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel context
+ * replaces an existing channel context pointed to by %replace_ctx.
+ */
+enum ieee80211_chanctx_replace_state {
+ IEEE80211_CHANCTX_REPLACE_NONE,
+ IEEE80211_CHANCTX_WILL_BE_REPLACED,
+ IEEE80211_CHANCTX_REPLACES_OTHER,
+};
+
+struct ieee80211_chanctx {
+ struct list_head list;
+ struct rcu_head rcu_head;
+
+ struct list_head assigned_vifs;
+ struct list_head reserved_vifs;
+
+ enum ieee80211_chanctx_replace_state replace_state;
+ struct ieee80211_chanctx *replace_ctx;
+
+ enum ieee80211_chanctx_mode mode;
+ bool driver_present;
+
+ struct ieee80211_chanctx_conf conf;
+};
+
+struct mac80211_qos_map {
+ struct cfg80211_qos_map qos_map;
+ struct rcu_head rcu_head;
+};
+
+enum txq_info_flags {
+ IEEE80211_TXQ_STOP,
+ IEEE80211_TXQ_AMPDU,
+};
+
+struct txq_info {
+ struct sk_buff_head queue;
+ unsigned long flags;
+
+ /* keep last! */
+ struct ieee80211_txq txq;
+};
+
+struct ieee80211_sub_if_data {
+ struct list_head list;
+
+ struct wireless_dev wdev;
+
+ /* keys */
+ struct list_head key_list;
+
+ /* count for keys needing tailroom space allocation */
+ int crypto_tx_tailroom_needed_cnt;
+ int crypto_tx_tailroom_pending_dec;
+ struct delayed_work dec_tailroom_needed_wk;
+
+ struct net_device *dev;
+ struct ieee80211_local *local;
+
+ unsigned int flags;
+
+ unsigned long state;
+
+ char name[IFNAMSIZ];
+
+ /* Fragment table for host-based reassembly */
+ struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX];
+ unsigned int fragment_next;
+
+ /* TID bitmap for NoAck policy */
+ u16 noack_map;
+
+ /* bit field of ACM bits (BIT(802.1D tag)) */
+ u8 wmm_acm;
+
+ struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+ struct ieee80211_key __rcu *default_unicast_key;
+ struct ieee80211_key __rcu *default_multicast_key;
+ struct ieee80211_key __rcu *default_mgmt_key;
+
+ u16 sequence_number;
+ __be16 control_port_protocol;
+ bool control_port_no_encrypt;
+ int encrypt_headroom;
+
+ atomic_t txqs_len[IEEE80211_NUM_ACS];
+ struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
+ struct mac80211_qos_map __rcu *qos_map;
+
+ struct work_struct csa_finalize_work;
+ bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */
+ struct cfg80211_chan_def csa_chandef;
+
+ struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */
+ struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */
+
+ /* context reservation -- protected with chanctx_mtx */
+ struct ieee80211_chanctx *reserved_chanctx;
+ struct cfg80211_chan_def reserved_chandef;
+ bool reserved_radar_required;
+ bool reserved_ready;
+
+ /* used to reconfigure hardware SM PS */
+ struct work_struct recalc_smps;
+
+ struct work_struct work;
+ struct sk_buff_head skb_queue;
+
+ u8 needed_rx_chains;
+ enum ieee80211_smps_mode smps_mode;
+
+ int user_power_level; /* in dBm */
+ int ap_power_level; /* in dBm */
+
+ bool radar_required;
+ struct delayed_work dfs_cac_timer_work;
+
+ /*
+ * AP this belongs to: self in AP mode and
+ * corresponding AP in VLAN mode, NULL for
+ * all others (might be needed later in IBSS)
+ */
+ struct ieee80211_if_ap *bss;
+
+ /* bitmap of allowed (non-MCS) rate indexes for rate control */
+ u32 rc_rateidx_mask[IEEE80211_NUM_BANDS];
+
+ bool rc_has_mcs_mask[IEEE80211_NUM_BANDS];
+ u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN];
+
+ union {
+ struct ieee80211_if_ap ap;
+ struct ieee80211_if_wds wds;
+ struct ieee80211_if_vlan vlan;
+ struct ieee80211_if_managed mgd;
+ struct ieee80211_if_ibss ibss;
+ struct ieee80211_if_mesh mesh;
+ struct ieee80211_if_ocb ocb;
+ u32 mntr_flags;
+ } u;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct {
+ struct dentry *subdir_stations;
+ struct dentry *default_unicast_key;
+ struct dentry *default_multicast_key;
+ struct dentry *default_mgmt_key;
+ } debugfs;
+#endif
+
+ /* must be last, dynamically sized area in this! */
+ struct ieee80211_vif vif;
+};
+
+static inline
+struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
+{
+ return container_of(p, struct ieee80211_sub_if_data, vif);
+}
+
+static inline void sdata_lock(struct ieee80211_sub_if_data *sdata)
+ __acquires(&sdata->wdev.mtx)
+{
+ mutex_lock(&sdata->wdev.mtx);
+ __acquire(&sdata->wdev.mtx);
+}
+
+static inline void sdata_unlock(struct ieee80211_sub_if_data *sdata)
+ __releases(&sdata->wdev.mtx)
+{
+ mutex_unlock(&sdata->wdev.mtx);
+ __release(&sdata->wdev.mtx);
+}
+
+#define sdata_dereference(p, sdata) \
+ rcu_dereference_protected(p, lockdep_is_held(&sdata->wdev.mtx))
+
+static inline void
+sdata_assert_lock(struct ieee80211_sub_if_data *sdata)
+{
+ lockdep_assert_held(&sdata->wdev.mtx);
+}
+
+static inline enum ieee80211_band
+ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata)
+{
+ enum ieee80211_band band = IEEE80211_BAND_2GHZ;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!WARN_ON(!chanctx_conf))
+ band = chanctx_conf->def.chan->band;
+ rcu_read_unlock();
+
+ return band;
+}
+
+static inline int
+ieee80211_chandef_get_shift(struct cfg80211_chan_def *chandef)
+{
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_5:
+ return 2;
+ case NL80211_CHAN_WIDTH_10:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static inline int
+ieee80211_vif_get_shift(struct ieee80211_vif *vif)
+{
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ int shift = 0;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(vif->chanctx_conf);
+ if (chanctx_conf)
+ shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ return shift;
+}
+
+struct ieee80211_rx_agg {
+ u8 addr[ETH_ALEN];
+ u16 tid;
+};
+
+enum sdata_queue_type {
+ IEEE80211_SDATA_QUEUE_TYPE_FRAME = 0,
+ IEEE80211_SDATA_QUEUE_AGG_START = 1,
+ IEEE80211_SDATA_QUEUE_AGG_STOP = 2,
+ IEEE80211_SDATA_QUEUE_RX_AGG_START = 3,
+ IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4,
+ IEEE80211_SDATA_QUEUE_TDLS_CHSW = 5,
+};
+
+enum {
+ IEEE80211_RX_MSG = 1,
+ IEEE80211_TX_STATUS_MSG = 2,
+};
+
+enum queue_stop_reason {
+ IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ IEEE80211_QUEUE_STOP_REASON_PS,
+ IEEE80211_QUEUE_STOP_REASON_CSA,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+ IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL,
+ IEEE80211_QUEUE_STOP_REASON_FLUSH,
+ IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN,
+ IEEE80211_QUEUE_STOP_REASON_RESERVE_TID,
+
+ IEEE80211_QUEUE_STOP_REASONS,
+};
+
+#ifdef CONFIG_MAC80211_LEDS
+struct tpt_led_trigger {
+ struct led_trigger trig;
+ char name[32];
+ const struct ieee80211_tpt_blink *blink_table;
+ unsigned int blink_table_len;
+ struct timer_list timer;
+ unsigned long prev_traffic;
+ unsigned long tx_bytes, rx_bytes;
+ unsigned int active, want;
+ bool running;
+};
+#endif
+
+/**
+ * mac80211 scan flags - currently active scan mode
+ *
+ * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as
+ * well be on the operating channel
+ * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to
+ * determine if we are on the operating channel or not
+ * @SCAN_ONCHANNEL_SCANNING: Do a software scan on only the current operating
+ * channel. This should not interrupt normal traffic.
+ * @SCAN_COMPLETED: Set for our scan work function when the driver reported
+ * that the scan completed.
+ * @SCAN_ABORTED: Set for our scan work function when the driver reported
+ * a scan complete for an aborted scan.
+ * @SCAN_HW_CANCELLED: Set for our scan work function when the scan is being
+ * cancelled.
+ */
+enum {
+ SCAN_SW_SCANNING,
+ SCAN_HW_SCANNING,
+ SCAN_ONCHANNEL_SCANNING,
+ SCAN_COMPLETED,
+ SCAN_ABORTED,
+ SCAN_HW_CANCELLED,
+};
+
+/**
+ * enum mac80211_scan_state - scan state machine states
+ *
+ * @SCAN_DECISION: Main entry point to the scan state machine, this state
+ * determines if we should keep on scanning or switch back to the
+ * operating channel
+ * @SCAN_SET_CHANNEL: Set the next channel to be scanned
+ * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses
+ * @SCAN_SUSPEND: Suspend the scan and go back to operating channel to
+ * send out data
+ * @SCAN_RESUME: Resume the scan and scan the next channel
+ * @SCAN_ABORT: Abort the scan and go back to operating channel
+ */
+enum mac80211_scan_state {
+ SCAN_DECISION,
+ SCAN_SET_CHANNEL,
+ SCAN_SEND_PROBE,
+ SCAN_SUSPEND,
+ SCAN_RESUME,
+ SCAN_ABORT,
+};
+
+struct ieee80211_local {
+ /* embed the driver visible part.
+ * don't cast (use the static inlines below), but we keep
+ * it first anyway so they become a no-op */
+ struct ieee80211_hw hw;
+
+ const struct ieee80211_ops *ops;
+
+ /*
+ * private workqueue to mac80211. mac80211 makes this accessible
+ * via ieee80211_queue_work()
+ */
+ struct workqueue_struct *workqueue;
+
+ unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
+ int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS];
+ /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */
+ spinlock_t queue_stop_reason_lock;
+
+ int open_count;
+ int monitors, cooked_mntrs;
+ /* number of interfaces with corresponding FIF_ flags */
+ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
+ fif_probe_req;
+ int probe_req_reg;
+ unsigned int filter_flags; /* FIF_* */
+
+ bool wiphy_ciphers_allocated;
+
+ bool use_chanctx;
+
+ /* protects the aggregated multicast list and filter calls */
+ spinlock_t filter_lock;
+
+ /* used for uploading changed mc list */
+ struct work_struct reconfig_filter;
+
+ /* aggregated multicast list */
+ struct netdev_hw_addr_list mc_list;
+
+ bool tim_in_locked_section; /* see ieee80211_beacon_get() */
+
+ /*
+ * suspended is true if we finished all the suspend _and_ we have
+ * not yet come up from resume. This is to be used by mac80211
+ * to ensure driver sanity during suspend and mac80211's own
+ * sanity. It can eventually be used for WoW as well.
+ */
+ bool suspended;
+
+ /*
+ * Resuming is true while suspended, but when we're reprogramming the
+ * hardware -- at that time it's allowed to use ieee80211_queue_work()
+ * again even though some other parts of the stack are still suspended
+ * and we still drop received frames to avoid waking the stack.
+ */
+ bool resuming;
+
+ /*
+ * quiescing is true during the suspend process _only_ to
+ * ease timer cancelling etc.
+ */
+ bool quiescing;
+
+ /* device is started */
+ bool started;
+
+ /* device is during a HW reconfig */
+ bool in_reconfig;
+
+ /* wowlan is enabled -- don't reconfig on resume */
+ bool wowlan;
+
+ struct work_struct radar_detected_work;
+
+ /* number of RX chains the hardware has */
+ u8 rx_chains;
+
+ int tx_headroom; /* required headroom for hardware/radiotap */
+
+ /* Tasklet and skb queue to process calls from IRQ mode. All frames
+ * added to skb_queue will be processed, but frames in
+ * skb_queue_unreliable may be dropped if the total length of these
+ * queues increases over the limit. */
+#define IEEE80211_IRQSAFE_QUEUE_LIMIT 128
+ struct tasklet_struct tasklet;
+ struct sk_buff_head skb_queue;
+ struct sk_buff_head skb_queue_unreliable;
+
+ spinlock_t rx_path_lock;
+
+ /* Station data */
+ /*
+ * The mutex only protects the list, hash table and
+ * counter, reads are done with RCU.
+ */
+ struct mutex sta_mtx;
+ spinlock_t tim_lock;
+ unsigned long num_sta;
+ struct list_head sta_list;
+ struct rhashtable sta_hash;
+ struct timer_list sta_cleanup;
+ int sta_generation;
+
+ struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
+ struct tasklet_struct tx_pending_tasklet;
+
+ atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
+
+ /* number of interfaces with corresponding IFF_ flags */
+ atomic_t iff_allmultis, iff_promiscs;
+
+ struct rate_control_ref *rate_ctrl;
+
+ struct crypto_cipher *wep_tx_tfm;
+ struct crypto_cipher *wep_rx_tfm;
+ u32 wep_iv;
+
+ /* see iface.c */
+ struct list_head interfaces;
+ struct mutex iflist_mtx;
+
+ /*
+ * Key mutex, protects sdata's key_list and sta_info's
+ * key pointers (write access, they're RCU.)
+ */
+ struct mutex key_mtx;
+
+ /* mutex for scan and work locking */
+ struct mutex mtx;
+
+ /* Scanning and BSS list */
+ unsigned long scanning;
+ struct cfg80211_ssid scan_ssid;
+ struct cfg80211_scan_request *int_scan_req;
+ struct cfg80211_scan_request __rcu *scan_req;
+ struct ieee80211_scan_request *hw_scan_req;
+ struct cfg80211_chan_def scan_chandef;
+ enum ieee80211_band hw_scan_band;
+ int scan_channel_idx;
+ int scan_ies_len;
+ int hw_scan_ies_bufsize;
+
+ struct work_struct sched_scan_stopped_work;
+ struct ieee80211_sub_if_data __rcu *sched_scan_sdata;
+ struct cfg80211_sched_scan_request __rcu *sched_scan_req;
+ u8 scan_addr[ETH_ALEN];
+
+ unsigned long leave_oper_channel_time;
+ enum mac80211_scan_state next_scan_state;
+ struct delayed_work scan_work;
+ struct ieee80211_sub_if_data __rcu *scan_sdata;
+ /* For backward compatibility only -- do not use */
+ struct cfg80211_chan_def _oper_chandef;
+
+ /* Temporary remain-on-channel for off-channel operations */
+ struct ieee80211_channel *tmp_channel;
+
+ /* channel contexts */
+ struct list_head chanctx_list;
+ struct mutex chanctx_mtx;
+
+ /* SNMP counters */
+ /* dot11CountersTable */
+ u32 dot11TransmittedFragmentCount;
+ u32 dot11MulticastTransmittedFrameCount;
+ u32 dot11FailedCount;
+ u32 dot11RetryCount;
+ u32 dot11MultipleRetryCount;
+ u32 dot11FrameDuplicateCount;
+ u32 dot11ReceivedFragmentCount;
+ u32 dot11MulticastReceivedFrameCount;
+ u32 dot11TransmittedFrameCount;
+
+#ifdef CONFIG_MAC80211_LEDS
+ struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led;
+ struct tpt_led_trigger *tpt_led_trigger;
+ char tx_led_name[32], rx_led_name[32],
+ assoc_led_name[32], radio_led_name[32];
+#endif
+
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
+ /* TX/RX handler statistics */
+ unsigned int tx_handlers_drop;
+ unsigned int tx_handlers_queued;
+ unsigned int tx_handlers_drop_fragment;
+ unsigned int tx_handlers_drop_wep;
+ unsigned int tx_handlers_drop_not_assoc;
+ unsigned int tx_handlers_drop_unauth_port;
+ unsigned int rx_handlers_drop;
+ unsigned int rx_handlers_queued;
+ unsigned int rx_handlers_drop_nullfunc;
+ unsigned int rx_handlers_drop_defrag;
+ unsigned int rx_handlers_drop_short;
+ unsigned int tx_expand_skb_head;
+ unsigned int tx_expand_skb_head_cloned;
+ unsigned int rx_expand_skb_head;
+ unsigned int rx_expand_skb_head2;
+ unsigned int rx_handlers_fragments;
+ unsigned int tx_status_drop;
+#define I802_DEBUG_INC(c) (c)++
+#else /* CONFIG_MAC80211_DEBUG_COUNTERS */
+#define I802_DEBUG_INC(c) do { } while (0)
+#endif /* CONFIG_MAC80211_DEBUG_COUNTERS */
+
+
+ int total_ps_buffered; /* total number of all buffered unicast and
+ * multicast packets for power saving stations
+ */
+
+ bool pspolling;
+ bool offchannel_ps_enabled;
+ /*
+ * PS can only be enabled when we have exactly one managed
+ * interface (and monitors) in PS, this then points there.
+ */
+ struct ieee80211_sub_if_data *ps_sdata;
+ struct work_struct dynamic_ps_enable_work;
+ struct work_struct dynamic_ps_disable_work;
+ struct timer_list dynamic_ps_timer;
+ struct notifier_block network_latency_notifier;
+ struct notifier_block ifa_notifier;
+ struct notifier_block ifa6_notifier;
+
+ /*
+ * The dynamic ps timeout configured from user space via WEXT -
+ * this will override whatever chosen by mac80211 internally.
+ */
+ int dynamic_ps_forced_timeout;
+
+ int user_power_level; /* in dBm, for all interfaces */
+
+ enum ieee80211_smps_mode smps_mode;
+
+ struct work_struct restart_work;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct local_debugfsdentries {
+ struct dentry *rcdir;
+ struct dentry *keys;
+ } debugfs;
+#endif
+
+ /*
+ * Remain-on-channel support
+ */
+ struct list_head roc_list;
+ struct work_struct hw_roc_start, hw_roc_done;
+ unsigned long hw_roc_start_time;
+ u64 roc_cookie_counter;
+
+ struct idr ack_status_frames;
+ spinlock_t ack_status_lock;
+
+ struct ieee80211_sub_if_data __rcu *p2p_sdata;
+
+ struct napi_struct *napi;
+
+ /* virtual monitor interface */
+ struct ieee80211_sub_if_data __rcu *monitor_sdata;
+ struct cfg80211_chan_def monitor_chandef;
+
+ /* extended capabilities provided by mac80211 */
+ u8 ext_capa[8];
+};
+
+static inline struct ieee80211_sub_if_data *
+IEEE80211_DEV_TO_SUB_IF(struct net_device *dev)
+{
+ return netdev_priv(dev);
+}
+
+static inline struct ieee80211_sub_if_data *
+IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev)
+{
+ return container_of(wdev, struct ieee80211_sub_if_data, wdev);
+}
+
+/* this struct represents 802.11n's RA/TID combination */
+struct ieee80211_ra_tid {
+ u8 ra[ETH_ALEN];
+ u16 tid;
+};
+
+/* this struct holds the value parsing from channel switch IE */
+struct ieee80211_csa_ie {
+ struct cfg80211_chan_def chandef;
+ u8 mode;
+ u8 count;
+ u8 ttl;
+ u16 pre_value;
+};
+
+/* Parsed Information Elements */
+struct ieee802_11_elems {
+ const u8 *ie_start;
+ size_t total_len;
+
+ /* pointers to IEs */
+ const struct ieee80211_tdls_lnkie *lnk_id;
+ const struct ieee80211_ch_switch_timing *ch_sw_timing;
+ const u8 *ext_capab;
+ const u8 *ssid;
+ const u8 *supp_rates;
+ const u8 *ds_params;
+ const struct ieee80211_tim_ie *tim;
+ const u8 *challenge;
+ const u8 *rsn;
+ const u8 *erp_info;
+ const u8 *ext_supp_rates;
+ const u8 *wmm_info;
+ const u8 *wmm_param;
+ const struct ieee80211_ht_cap *ht_cap_elem;
+ const struct ieee80211_ht_operation *ht_operation;
+ const struct ieee80211_vht_cap *vht_cap_elem;
+ const struct ieee80211_vht_operation *vht_operation;
+ const struct ieee80211_meshconf_ie *mesh_config;
+ const u8 *mesh_id;
+ const u8 *peering;
+ const __le16 *awake_window;
+ const u8 *preq;
+ const u8 *prep;
+ const u8 *perr;
+ const struct ieee80211_rann_ie *rann;
+ const struct ieee80211_channel_sw_ie *ch_switch_ie;
+ const struct ieee80211_ext_chansw_ie *ext_chansw_ie;
+ const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
+ const u8 *country_elem;
+ const u8 *pwr_constr_elem;
+ const u8 *cisco_dtpc_elem;
+ const struct ieee80211_timeout_interval_ie *timeout_int;
+ const u8 *opmode_notif;
+ const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
+ const struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie;
+
+ /* length of them, respectively */
+ u8 ext_capab_len;
+ u8 ssid_len;
+ u8 supp_rates_len;
+ u8 tim_len;
+ u8 challenge_len;
+ u8 rsn_len;
+ u8 ext_supp_rates_len;
+ u8 wmm_info_len;
+ u8 wmm_param_len;
+ u8 mesh_id_len;
+ u8 peering_len;
+ u8 preq_len;
+ u8 prep_len;
+ u8 perr_len;
+ u8 country_elem_len;
+
+ /* whether a parse error occurred while retrieving these elements */
+ bool parse_error;
+};
+
+static inline struct ieee80211_local *hw_to_local(
+ struct ieee80211_hw *hw)
+{
+ return container_of(hw, struct ieee80211_local, hw);
+}
+
+static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
+{
+ return container_of(txq, struct txq_info, txq);
+}
+
+static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
+{
+ return ether_addr_equal(raddr, addr) ||
+ is_broadcast_ether_addr(raddr);
+}
+
+static inline bool
+ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
+{
+ WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START &&
+ status->flag & RX_FLAG_MACTIME_END);
+ return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END);
+}
+
+u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
+ struct ieee80211_rx_status *status,
+ unsigned int mpdu_len,
+ unsigned int mpdu_offset);
+int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
+void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+ u32 changed);
+void ieee80211_configure_filter(struct ieee80211_local *local);
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
+
+/* STA code */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
+int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_auth_request *req);
+int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_assoc_request *req);
+int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_deauth_request *req);
+int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_disassoc_request *req);
+void ieee80211_send_pspoll(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
+void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency);
+void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata);
+int ieee80211_max_network_latency(struct notifier_block *nb,
+ unsigned long data, void *dummy);
+int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata,
+ __le16 fc, bool acked);
+void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata);
+
+/* IBSS code */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const u8 *addr, u32 supp_rates);
+int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_ibss_params *params);
+int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings);
+int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata);
+
+/* OCB code */
+void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const u8 *addr, u32 supp_rates);
+void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata);
+int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
+ struct ocb_setup *setup);
+int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata);
+
+/* mesh code */
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings);
+int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata);
+
+/* scan/BSS handling */
+void ieee80211_scan_work(struct work_struct *work);
+int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
+ const u8 *ssid, u8 ssid_len,
+ struct ieee80211_channel **channels,
+ unsigned int n_channels,
+ enum nl80211_bss_scan_width scan_width);
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_scan_request *req);
+void ieee80211_scan_cancel(struct ieee80211_local *local);
+void ieee80211_run_deferred_scan(struct ieee80211_local *local);
+void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb);
+
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
+struct ieee80211_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee802_11_elems *elems,
+ struct ieee80211_channel *channel);
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+ struct ieee80211_bss *bss);
+
+/* scheduled scan handling */
+int
+__ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_sched_scan_request *req);
+int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_sched_scan_request *req);
+int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sched_scan_end(struct ieee80211_local *local);
+void ieee80211_sched_scan_stopped_work(struct work_struct *work);
+
+/* off-channel helpers */
+void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local);
+void ieee80211_offchannel_return(struct ieee80211_local *local);
+void ieee80211_roc_setup(struct ieee80211_local *local);
+void ieee80211_start_next_roc(struct ieee80211_local *local);
+void ieee80211_roc_purge(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
+void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free);
+void ieee80211_sw_roc_work(struct work_struct *work);
+void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc);
+
+/* channel switch handling */
+void ieee80211_csa_finalize_work(struct work_struct *work);
+int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_csa_settings *params);
+
+/* interface handling */
+int ieee80211_iface_init(void);
+void ieee80211_iface_exit(void);
+int ieee80211_if_add(struct ieee80211_local *local, const char *name,
+ unsigned char name_assign_type,
+ struct wireless_dev **new_wdev, enum nl80211_iftype type,
+ struct vif_params *params);
+int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type);
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
+void ieee80211_remove_interfaces(struct ieee80211_local *local);
+u32 ieee80211_idle_off(struct ieee80211_local *local);
+void ieee80211_recalc_idle(struct ieee80211_local *local);
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset);
+int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up);
+void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata);
+int ieee80211_add_virtual_monitor(struct ieee80211_local *local);
+void ieee80211_del_virtual_monitor(struct ieee80211_local *local);
+
+bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
+void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
+ bool update_bss);
+
+static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
+{
+ return test_bit(SDATA_STATE_RUNNING, &sdata->state);
+}
+
+/* tx handling */
+void ieee80211_clear_tx_pending(struct ieee80211_local *local);
+void ieee80211_tx_pending(unsigned long data);
+netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+void __ieee80211_subif_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 info_flags);
+void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
+ struct sk_buff_head *skbs);
+struct sk_buff *
+ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u32 info_flags);
+
+/* HT */
+void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_ht_cap *ht_cap);
+bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_ht_cap *ht_cap_ie,
+ struct sta_info *sta);
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
+ const u8 *da, u16 tid,
+ u16 initiator, u16 reason_code);
+int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps, const u8 *da,
+ const u8 *bssid);
+void ieee80211_request_smps_ap_work(struct work_struct *work);
+void ieee80211_request_smps_mgd_work(struct work_struct *work);
+bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
+ enum ieee80211_smps_mode smps_mode_new);
+
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+ u16 initiator, u16 reason, bool stop);
+void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+ u16 initiator, u16 reason, bool stop);
+void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+ u8 dialog_token, u16 timeout,
+ u16 start_seq_num, u16 ba_policy, u16 tid,
+ u16 buf_size, bool tx, bool auto_seq);
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
+ enum ieee80211_agg_stop_reason reason);
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt, size_t len);
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt,
+ size_t len);
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct ieee80211_mgmt *mgmt,
+ size_t len);
+
+int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+ enum ieee80211_agg_stop_reason reason);
+int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+ enum ieee80211_agg_stop_reason reason);
+void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
+void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
+void ieee80211_ba_session_work(struct work_struct *work);
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
+
+u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs);
+
+/* VHT */
+void
+ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_vht_cap *vht_cap_ie,
+ struct sta_info *sta);
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta);
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
+void ieee80211_sta_set_rx_nss(struct sta_info *sta);
+u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, u8 opmode,
+ enum ieee80211_band band, bool nss_only);
+void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, u8 opmode,
+ enum ieee80211_band band, bool nss_only);
+void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_vht_cap *vht_cap);
+
+/* Spectrum management */
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len);
+/**
+ * ieee80211_parse_ch_switch_ie - parses channel switch IEs
+ * @sdata: the sdata of the interface which has received the frame
+ * @elems: parsed 802.11 elements received with the frame
+ * @current_band: indicates the current band
+ * @sta_flags: contains information about own capabilities and restrictions
+ * to decide which channel switch announcements can be accepted. Only the
+ * following subset of &enum ieee80211_sta_flags are evaluated:
+ * %IEEE80211_STA_DISABLE_HT, %IEEE80211_STA_DISABLE_VHT,
+ * %IEEE80211_STA_DISABLE_40MHZ, %IEEE80211_STA_DISABLE_80P80MHZ,
+ * %IEEE80211_STA_DISABLE_160MHZ.
+ * @bssid: the currently connected bssid (for reporting)
+ * @csa_ie: parsed 802.11 csa elements on count, mode, chandef and mesh ttl.
+ All of them will be filled with if success only.
+ * Return: 0 on success, <0 on error and >0 if there is nothing to parse.
+ */
+int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band current_band,
+ u32 sta_flags, u8 *bssid,
+ struct ieee80211_csa_ie *csa_ie);
+
+/* Suspend/resume and hw reconfiguration */
+int ieee80211_reconfig(struct ieee80211_local *local);
+void ieee80211_stop_device(struct ieee80211_local *local);
+
+int __ieee80211_suspend(struct ieee80211_hw *hw,
+ struct cfg80211_wowlan *wowlan);
+
+static inline int __ieee80211_resume(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) &&
+ !test_bit(SCAN_COMPLETED, &local->scanning),
+ "%s: resume with hardware scan still in progress\n",
+ wiphy_name(hw->wiphy));
+
+ return ieee80211_reconfig(hw_to_local(hw));
+}
+
+/* utility functions/constants */
+extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+ enum nl80211_iftype type);
+int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
+ int rate, int erp, int short_preamble,
+ int shift);
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
+ struct ieee80211_hdr *hdr, const u8 *tsc,
+ gfp_t gfp);
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
+ bool bss_notify);
+void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, struct sk_buff *skb);
+
+void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, int tid,
+ enum ieee80211_band band);
+
+static inline void
+ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, int tid,
+ enum ieee80211_band band)
+{
+ rcu_read_lock();
+ __ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+ rcu_read_unlock();
+}
+
+static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, int tid)
+{
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return;
+ }
+
+ __ieee80211_tx_skb_tid_band(sdata, skb, tid,
+ chanctx_conf->def.chan->band);
+ rcu_read_unlock();
+}
+
+static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
+ ieee80211_tx_skb_tid(sdata, skb, 7);
+}
+
+u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
+ struct ieee802_11_elems *elems,
+ u64 filter, u32 crc);
+static inline void ieee802_11_parse_elems(const u8 *start, size_t len,
+ bool action,
+ struct ieee802_11_elems *elems)
+{
+ ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0);
+}
+
+static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames)
+{
+ struct sk_buff *tail = skb_peek_tail(frames);
+ struct ieee80211_rx_status *status;
+
+ if (!tail)
+ return false;
+
+ status = IEEE80211_SKB_RXCB(tail);
+ if (status->flag & RX_FLAG_AMSDU_MORE)
+ return false;
+
+ return true;
+}
+
+extern const int ieee802_1d_to_ac[8];
+
+static inline int ieee80211_ac_from_tid(int tid)
+{
+ return ieee802_1d_to_ac[tid & 7];
+}
+
+void ieee80211_dynamic_ps_enable_work(struct work_struct *work);
+void ieee80211_dynamic_ps_disable_work(struct work_struct *work);
+void ieee80211_dynamic_ps_timer(unsigned long data);
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int powersave);
+void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_hdr *hdr);
+void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_hdr *hdr, bool ack, u16 tx_time);
+
+void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
+ unsigned long queues,
+ enum queue_stop_reason reason,
+ bool refcounted);
+void ieee80211_stop_vif_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum queue_stop_reason reason);
+void ieee80211_wake_vif_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum queue_stop_reason reason);
+void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
+ unsigned long queues,
+ enum queue_stop_reason reason,
+ bool refcounted);
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted);
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted);
+void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue);
+void ieee80211_add_pending_skb(struct ieee80211_local *local,
+ struct sk_buff *skb);
+void ieee80211_add_pending_skbs(struct ieee80211_local *local,
+ struct sk_buff_head *skbs);
+void ieee80211_flush_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, bool drop);
+void __ieee80211_flush_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ unsigned int queues, bool drop);
+
+static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
+{
+ /*
+ * If quiescing is set, we are racing with __ieee80211_suspend.
+ * __ieee80211_suspend flushes the workers after setting quiescing,
+ * and we check quiescing / suspended before enqueing new workers.
+ * We should abort the worker to avoid the races below.
+ */
+ if (local->quiescing)
+ return false;
+
+ /*
+ * We might already be suspended if the following scenario occurs:
+ * __ieee80211_suspend Control path
+ *
+ * if (local->quiescing)
+ * return;
+ * local->quiescing = true;
+ * flush_workqueue();
+ * queue_work(...);
+ * local->suspended = true;
+ * local->quiescing = false;
+ * worker starts running...
+ */
+ if (local->suspended)
+ return false;
+
+ return true;
+}
+
+void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct txq_info *txq, int tid);
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg, u16 status,
+ const u8 *extra, size_t extra_len, const u8 *bssid,
+ const u8 *da, const u8 *key, u8 key_len, u8 key_idx,
+ u32 tx_flags);
+void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, u16 stype, u16 reason,
+ bool send_frame, u8 *frame_buf);
+int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
+ size_t buffer_len,
+ struct ieee80211_scan_ies *ie_desc,
+ const u8 *ie, size_t ie_len,
+ u8 bands_used, u32 *rate_masks,
+ struct cfg80211_chan_def *chandef);
+struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ u32 ratemask,
+ struct ieee80211_channel *chan,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ bool directed);
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ u32 ratemask, bool directed, u32 tx_flags,
+ struct ieee80211_channel *channel, bool scan);
+
+u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band, u32 *basic_rates);
+int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps_mode);
+int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_smps_mode smps_mode);
+void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata);
+void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata);
+
+size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
+u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
+ u16 cap);
+u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
+ const struct cfg80211_chan_def *chandef,
+ u16 prot_mode);
+u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
+ u32 cap);
+u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
+ const struct cfg80211_chan_def *chandef);
+int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
+ const struct ieee80211_supported_band *sband,
+ const u8 *srates, int srates_len, u32 *rates);
+int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, bool need_basic,
+ enum ieee80211_band band);
+int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, bool need_basic,
+ enum ieee80211_band band);
+u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
+
+/* channel management */
+void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
+ const struct ieee80211_ht_operation *ht_oper,
+ struct cfg80211_chan_def *chandef);
+void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
+ const struct ieee80211_vht_operation *oper,
+ struct cfg80211_chan_def *chandef);
+u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
+
+int __must_check
+ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode);
+int __must_check
+ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode mode,
+ bool radar_required);
+int __must_check
+ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata);
+int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata);
+
+int __must_check
+ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ u32 *changed);
+void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata);
+void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata);
+void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
+ bool clear);
+int ieee80211_chanctx_refcount(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx);
+
+void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *chanctx);
+void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx);
+bool ieee80211_is_radar_required(struct ieee80211_local *local);
+
+void ieee80211_dfs_cac_timer(unsigned long data);
+void ieee80211_dfs_cac_timer_work(struct work_struct *work);
+void ieee80211_dfs_cac_cancel(struct ieee80211_local *local);
+void ieee80211_dfs_radar_detected_work(struct work_struct *work);
+int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings);
+
+bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs);
+bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n);
+const struct ieee80211_cipher_scheme *
+ieee80211_cs_get(struct ieee80211_local *local, u32 cipher,
+ enum nl80211_iftype iftype);
+int ieee80211_cs_headroom(struct ieee80211_local *local,
+ struct cfg80211_crypto_settings *crypto,
+ enum nl80211_iftype iftype);
+void ieee80211_recalc_dtim(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata);
+int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode chanmode,
+ u8 radar_detect);
+int ieee80211_max_num_channels(struct ieee80211_local *local);
+
+/* TDLS */
+int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len);
+int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, enum nl80211_tdls_operation oper);
+void ieee80211_tdls_peer_del_work(struct work_struct *wk);
+int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *addr, u8 oper_class,
+ struct cfg80211_chan_def *chandef);
+void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy,
+ struct net_device *dev,
+ const u8 *addr);
+void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+
+extern const struct ethtool_ops ieee80211_ethtool_ops;
+
+#ifdef CONFIG_MAC80211_NOINLINE
+#define debug_noinline noinline
+#else
+#define debug_noinline
+#endif
+
+#endif /* IEEE80211_I_H */
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
new file mode 100644
index 000000000..84cef600c
--- /dev/null
+++ b/net/mac80211/iface.c
@@ -0,0 +1,1912 @@
+/*
+ * Interface handling
+ *
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <net/ieee80211_radiotap.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "debugfs_netdev.h"
+#include "mesh.h"
+#include "led.h"
+#include "driver-ops.h"
+#include "wme.h"
+#include "rate.h"
+
+/**
+ * DOC: Interface list locking
+ *
+ * The interface list in each struct ieee80211_local is protected
+ * three-fold:
+ *
+ * (1) modifications may only be done under the RTNL
+ * (2) modifications and readers are protected against each other by
+ * the iflist_mtx.
+ * (3) modifications are done in an RCU manner so atomic readers
+ * can traverse the list in RCU-safe blocks.
+ *
+ * As a consequence, reads (traversals) of the list can be protected
+ * by either the RTNL, the iflist_mtx or RCU.
+ */
+
+bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ int power;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ power = ieee80211_chandef_max_power(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ if (sdata->user_power_level != IEEE80211_UNSET_POWER_LEVEL)
+ power = min(power, sdata->user_power_level);
+
+ if (sdata->ap_power_level != IEEE80211_UNSET_POWER_LEVEL)
+ power = min(power, sdata->ap_power_level);
+
+ if (power != sdata->vif.bss_conf.txpower) {
+ sdata->vif.bss_conf.txpower = power;
+ ieee80211_hw_config(sdata->local, 0);
+ return true;
+ }
+
+ return false;
+}
+
+void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
+ bool update_bss)
+{
+ if (__ieee80211_recalc_txpower(sdata) || update_bss)
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER);
+}
+
+static u32 __ieee80211_idle_off(struct ieee80211_local *local)
+{
+ if (!(local->hw.conf.flags & IEEE80211_CONF_IDLE))
+ return 0;
+
+ local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
+ return IEEE80211_CONF_CHANGE_IDLE;
+}
+
+static u32 __ieee80211_idle_on(struct ieee80211_local *local)
+{
+ if (local->hw.conf.flags & IEEE80211_CONF_IDLE)
+ return 0;
+
+ ieee80211_flush_queues(local, NULL, false);
+
+ local->hw.conf.flags |= IEEE80211_CONF_IDLE;
+ return IEEE80211_CONF_CHANGE_IDLE;
+}
+
+static u32 __ieee80211_recalc_idle(struct ieee80211_local *local,
+ bool force_active)
+{
+ bool working, scanning, active;
+ unsigned int led_trig_start = 0, led_trig_stop = 0;
+
+ lockdep_assert_held(&local->mtx);
+
+ active = force_active ||
+ !list_empty(&local->chanctx_list) ||
+ local->monitors;
+
+ working = !local->ops->remain_on_channel &&
+ !list_empty(&local->roc_list);
+
+ scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
+ test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning);
+
+ if (working || scanning)
+ led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_WORK;
+ else
+ led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_WORK;
+
+ if (active)
+ led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;
+ else
+ led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;
+
+ ieee80211_mod_tpt_led_trig(local, led_trig_start, led_trig_stop);
+
+ if (working || scanning || active)
+ return __ieee80211_idle_off(local);
+ return __ieee80211_idle_on(local);
+}
+
+u32 ieee80211_idle_off(struct ieee80211_local *local)
+{
+ return __ieee80211_recalc_idle(local, true);
+}
+
+void ieee80211_recalc_idle(struct ieee80211_local *local)
+{
+ u32 change = __ieee80211_recalc_idle(local, false);
+ if (change)
+ ieee80211_hw_config(local, change);
+}
+
+static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
+{
+ if (new_mtu < 256 || new_mtu > IEEE80211_MAX_DATA_LEN)
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
+ bool check_dup)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *iter;
+ u64 new, mask, tmp;
+ u8 *m;
+ int ret = 0;
+
+ if (is_zero_ether_addr(local->hw.wiphy->addr_mask))
+ return 0;
+
+ m = addr;
+ new = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+ ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+ ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+ m = local->hw.wiphy->addr_mask;
+ mask = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+ ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+ ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+ if (!check_dup)
+ return ret;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(iter, &local->interfaces, list) {
+ if (iter == sdata)
+ continue;
+
+ if (iter->vif.type == NL80211_IFTYPE_MONITOR &&
+ !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ continue;
+
+ m = iter->vif.addr;
+ tmp = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+ ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+ ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+ if ((new & ~mask) != (tmp & ~mask)) {
+ ret = -EINVAL;
+ break;
+ }
+ }
+ mutex_unlock(&local->iflist_mtx);
+
+ return ret;
+}
+
+static int ieee80211_change_mac(struct net_device *dev, void *addr)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sockaddr *sa = addr;
+ bool check_dup = true;
+ int ret;
+
+ if (ieee80211_sdata_running(sdata))
+ return -EBUSY;
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ check_dup = false;
+
+ ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup);
+ if (ret)
+ return ret;
+
+ ret = eth_mac_addr(dev, sa);
+
+ if (ret == 0)
+ memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN);
+
+ return ret;
+}
+
+static inline int identical_mac_addr_allowed(int type1, int type2)
+{
+ return type1 == NL80211_IFTYPE_MONITOR ||
+ type2 == NL80211_IFTYPE_MONITOR ||
+ type1 == NL80211_IFTYPE_P2P_DEVICE ||
+ type2 == NL80211_IFTYPE_P2P_DEVICE ||
+ (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_WDS) ||
+ (type1 == NL80211_IFTYPE_WDS &&
+ (type2 == NL80211_IFTYPE_WDS ||
+ type2 == NL80211_IFTYPE_AP)) ||
+ (type1 == NL80211_IFTYPE_AP && type2 == NL80211_IFTYPE_AP_VLAN) ||
+ (type1 == NL80211_IFTYPE_AP_VLAN &&
+ (type2 == NL80211_IFTYPE_AP ||
+ type2 == NL80211_IFTYPE_AP_VLAN));
+}
+
+static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype iftype)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *nsdata;
+ int ret;
+
+ ASSERT_RTNL();
+
+ /* we hold the RTNL here so can safely walk the list */
+ list_for_each_entry(nsdata, &local->interfaces, list) {
+ if (nsdata != sdata && ieee80211_sdata_running(nsdata)) {
+ /*
+ * Only OCB and monitor mode may coexist
+ */
+ if ((sdata->vif.type == NL80211_IFTYPE_OCB &&
+ nsdata->vif.type != NL80211_IFTYPE_MONITOR) ||
+ (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
+ nsdata->vif.type == NL80211_IFTYPE_OCB))
+ return -EBUSY;
+
+ /*
+ * Allow only a single IBSS interface to be up at any
+ * time. This is restricted because beacon distribution
+ * cannot work properly if both are in the same IBSS.
+ *
+ * To remove this restriction we'd have to disallow them
+ * from setting the same SSID on different IBSS interfaces
+ * belonging to the same hardware. Then, however, we're
+ * faced with having to adopt two different TSF timers...
+ */
+ if (iftype == NL80211_IFTYPE_ADHOC &&
+ nsdata->vif.type == NL80211_IFTYPE_ADHOC)
+ return -EBUSY;
+ /*
+ * will not add another interface while any channel
+ * switch is active.
+ */
+ if (nsdata->vif.csa_active)
+ return -EBUSY;
+
+ /*
+ * The remaining checks are only performed for interfaces
+ * with the same MAC address.
+ */
+ if (!ether_addr_equal(sdata->vif.addr,
+ nsdata->vif.addr))
+ continue;
+
+ /*
+ * check whether it may have the same address
+ */
+ if (!identical_mac_addr_allowed(iftype,
+ nsdata->vif.type))
+ return -ENOTUNIQ;
+
+ /*
+ * can only add VLANs to enabled APs
+ */
+ if (iftype == NL80211_IFTYPE_AP_VLAN &&
+ nsdata->vif.type == NL80211_IFTYPE_AP)
+ sdata->bss = &nsdata->u.ap;
+ }
+ }
+
+ mutex_lock(&local->chanctx_mtx);
+ ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
+ mutex_unlock(&local->chanctx_mtx);
+ return ret;
+}
+
+static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype iftype)
+{
+ int n_queues = sdata->local->hw.queues;
+ int i;
+
+ if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ if (WARN_ON_ONCE(sdata->vif.hw_queue[i] ==
+ IEEE80211_INVAL_HW_QUEUE))
+ return -EINVAL;
+ if (WARN_ON_ONCE(sdata->vif.hw_queue[i] >=
+ n_queues))
+ return -EINVAL;
+ }
+ }
+
+ if ((iftype != NL80211_IFTYPE_AP &&
+ iftype != NL80211_IFTYPE_P2P_GO &&
+ iftype != NL80211_IFTYPE_MESH_POINT) ||
+ !(sdata->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)) {
+ sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(sdata->vif.cab_queue >= n_queues))
+ return -EINVAL;
+
+ return 0;
+}
+
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset)
+{
+ struct ieee80211_local *local = sdata->local;
+ u32 flags = sdata->u.mntr_flags;
+
+#define ADJUST(_f, _s) do { \
+ if (flags & MONITOR_FLAG_##_f) \
+ local->fif_##_s += offset; \
+ } while (0)
+
+ ADJUST(FCSFAIL, fcsfail);
+ ADJUST(PLCPFAIL, plcpfail);
+ ADJUST(CONTROL, control);
+ ADJUST(CONTROL, pspoll);
+ ADJUST(OTHER_BSS, other_bss);
+
+#undef ADJUST
+}
+
+static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int i;
+
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ sdata->vif.hw_queue[i] = IEEE80211_INVAL_HW_QUEUE;
+ else if (local->hw.queues >= IEEE80211_NUM_ACS)
+ sdata->vif.hw_queue[i] = i;
+ else
+ sdata->vif.hw_queue[i] = 0;
+ }
+ sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
+}
+
+int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int ret;
+
+ if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF))
+ return 0;
+
+ ASSERT_RTNL();
+
+ if (local->monitor_sdata)
+ return 0;
+
+ sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+
+ /* set up data */
+ sdata->local = local;
+ sdata->vif.type = NL80211_IFTYPE_MONITOR;
+ snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
+ wiphy_name(local->hw.wiphy));
+ sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
+
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+
+ ieee80211_set_default_queues(sdata);
+
+ ret = drv_add_interface(local, sdata);
+ if (WARN_ON(ret)) {
+ /* ok .. stupid driver, it asked for this! */
+ kfree(sdata);
+ return ret;
+ }
+
+ ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
+ if (ret) {
+ kfree(sdata);
+ return ret;
+ }
+
+ mutex_lock(&local->iflist_mtx);
+ rcu_assign_pointer(local->monitor_sdata, sdata);
+ mutex_unlock(&local->iflist_mtx);
+
+ mutex_lock(&local->mtx);
+ ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
+ IEEE80211_CHANCTX_EXCLUSIVE);
+ mutex_unlock(&local->mtx);
+ if (ret) {
+ mutex_lock(&local->iflist_mtx);
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+ synchronize_net();
+ drv_remove_interface(local, sdata);
+ kfree(sdata);
+ return ret;
+ }
+
+ return 0;
+}
+
+void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (!(local->hw.flags & IEEE80211_HW_WANT_MONITOR_VIF))
+ return;
+
+ ASSERT_RTNL();
+
+ mutex_lock(&local->iflist_mtx);
+
+ sdata = rcu_dereference_protected(local->monitor_sdata,
+ lockdep_is_held(&local->iflist_mtx));
+ if (!sdata) {
+ mutex_unlock(&local->iflist_mtx);
+ return;
+ }
+
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+
+ synchronize_net();
+
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+
+ drv_remove_interface(local, sdata);
+
+ kfree(sdata);
+}
+
+/*
+ * NOTE: Be very careful when changing this function, it must NOT return
+ * an error on interface type changes that have been pre-checked, so most
+ * checks should be in ieee80211_check_concurrent_iface.
+ */
+int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct net_device *dev = wdev->netdev;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ u32 changed = 0;
+ int res;
+ u32 hw_reconf_flags = 0;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_WDS:
+ if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
+ return -ENOLINK;
+ break;
+ case NL80211_IFTYPE_AP_VLAN: {
+ struct ieee80211_sub_if_data *master;
+
+ if (!sdata->bss)
+ return -ENOLINK;
+
+ mutex_lock(&local->mtx);
+ list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
+ mutex_unlock(&local->mtx);
+
+ master = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ sdata->control_port_protocol =
+ master->control_port_protocol;
+ sdata->control_port_no_encrypt =
+ master->control_port_no_encrypt;
+ sdata->vif.cab_queue = master->vif.cab_queue;
+ memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
+ sizeof(sdata->vif.hw_queue));
+ sdata->vif.bss_conf.chandef = master->vif.bss_conf.chandef;
+
+ mutex_lock(&local->key_mtx);
+ sdata->crypto_tx_tailroom_needed_cnt +=
+ master->crypto_tx_tailroom_needed_cnt;
+ mutex_unlock(&local->key_mtx);
+
+ break;
+ }
+ case NL80211_IFTYPE_AP:
+ sdata->bss = &sdata->u.ap;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_OCB:
+ /* no special treatment */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ /* cannot happen */
+ WARN_ON(1);
+ break;
+ }
+
+ if (local->open_count == 0) {
+ res = drv_start(local);
+ if (res)
+ goto err_del_bss;
+ /* we're brought up, everything changes */
+ hw_reconf_flags = ~0;
+ ieee80211_led_radio(local, true);
+ ieee80211_mod_tpt_led_trig(local,
+ IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+ }
+
+ /*
+ * Copy the hopefully now-present MAC address to
+ * this interface, if it has the special null one.
+ */
+ if (dev && is_zero_ether_addr(dev->dev_addr)) {
+ memcpy(dev->dev_addr,
+ local->hw.wiphy->perm_addr,
+ ETH_ALEN);
+ memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
+
+ if (!is_valid_ether_addr(dev->dev_addr)) {
+ res = -EADDRNOTAVAIL;
+ goto err_stop;
+ }
+ }
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ /* no need to tell driver, but set carrier and chanctx */
+ if (rtnl_dereference(sdata->bss->beacon)) {
+ ieee80211_vif_vlan_copy_chanctx(sdata);
+ netif_carrier_on(dev);
+ } else {
+ netif_carrier_off(dev);
+ }
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ local->cooked_mntrs++;
+ break;
+ }
+
+ if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+ } else if (local->monitors == 0 && local->open_count == 0) {
+ res = ieee80211_add_virtual_monitor(local);
+ if (res)
+ goto err_stop;
+ }
+
+ /* must be before the call to ieee80211_configure_filter */
+ local->monitors++;
+ if (local->monitors == 1) {
+ local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
+ hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+ }
+
+ ieee80211_adjust_monitor_flags(sdata, 1);
+ ieee80211_configure_filter(local);
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
+ netif_carrier_on(dev);
+ break;
+ default:
+ if (coming_up) {
+ ieee80211_del_virtual_monitor(local);
+
+ res = drv_add_interface(local, sdata);
+ if (res)
+ goto err_stop;
+ res = ieee80211_check_queues(sdata,
+ ieee80211_vif_type_p2p(&sdata->vif));
+ if (res)
+ goto err_del_interface;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ local->fif_pspoll++;
+ local->fif_probe_req++;
+
+ ieee80211_configure_filter(local);
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req++;
+ }
+
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE)
+ changed |= ieee80211_reset_erp_info(sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ netif_carrier_off(dev);
+ break;
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ default:
+ /* not reached */
+ WARN_ON(1);
+ }
+
+ /*
+ * set default queue parameters so drivers don't
+ * need to initialise the hardware if the hardware
+ * doesn't start up with sane defaults
+ */
+ ieee80211_set_wmm_default(sdata, true);
+ }
+
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ if (sdata->vif.type == NL80211_IFTYPE_WDS) {
+ /* Create STA entry for the WDS peer */
+ sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
+ GFP_KERNEL);
+ if (!sta) {
+ res = -ENOMEM;
+ goto err_del_interface;
+ }
+
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
+
+ res = sta_info_insert(sta);
+ if (res) {
+ /* STA has been freed */
+ goto err_del_interface;
+ }
+
+ rate_control_rate_init(sta);
+ netif_carrier_on(dev);
+ } else if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) {
+ rcu_assign_pointer(local->p2p_sdata, sdata);
+ }
+
+ /*
+ * set_multicast_list will be invoked by the networking core
+ * which will check whether any increments here were done in
+ * error and sync them down to the hardware as filter flags.
+ */
+ if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+ atomic_inc(&local->iff_allmultis);
+
+ if (sdata->flags & IEEE80211_SDATA_PROMISC)
+ atomic_inc(&local->iff_promiscs);
+
+ if (coming_up)
+ local->open_count++;
+
+ if (hw_reconf_flags)
+ ieee80211_hw_config(local, hw_reconf_flags);
+
+ ieee80211_recalc_ps(local, -1);
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ /* XXX: for AP_VLAN, actually track AP queues */
+ netif_tx_start_all_queues(dev);
+ } else if (dev) {
+ unsigned long flags;
+ int n_acs = IEEE80211_NUM_ACS;
+ int ac;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE ||
+ (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 &&
+ skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) {
+ for (ac = 0; ac < n_acs; ac++) {
+ int ac_queue = sdata->vif.hw_queue[ac];
+
+ if (local->queue_stop_reasons[ac_queue] == 0 &&
+ skb_queue_empty(&local->pending[ac_queue]))
+ netif_start_subqueue(dev, ac);
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ }
+
+ return 0;
+ err_del_interface:
+ drv_remove_interface(local, sdata);
+ err_stop:
+ if (!local->open_count)
+ drv_stop(local);
+ err_del_bss:
+ sdata->bss = NULL;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ mutex_lock(&local->mtx);
+ list_del(&sdata->u.vlan.list);
+ mutex_unlock(&local->mtx);
+ }
+ /* might already be clear but that doesn't matter */
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+ return res;
+}
+
+static int ieee80211_open(struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int err;
+
+ /* fail early if user set an invalid address */
+ if (!is_valid_ether_addr(dev->dev_addr))
+ return -EADDRNOTAVAIL;
+
+ err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
+ if (err)
+ return err;
+
+ return ieee80211_do_open(&sdata->wdev, true);
+}
+
+static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
+ bool going_down)
+{
+ struct ieee80211_local *local = sdata->local;
+ unsigned long flags;
+ struct sk_buff *skb, *tmp;
+ u32 hw_reconf_flags = 0;
+ int i, flushed;
+ struct ps_data *ps;
+ struct cfg80211_chan_def chandef;
+ bool cancel_scan;
+
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata;
+ if (cancel_scan)
+ ieee80211_scan_cancel(local);
+
+ /*
+ * Stop TX on this interface first.
+ */
+ if (sdata->dev)
+ netif_tx_stop_all_queues(sdata->dev);
+
+ ieee80211_roc_purge(local, sdata);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ ieee80211_mgd_stop(sdata);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ ieee80211_ibss_stop(sdata);
+ break;
+ case NL80211_IFTYPE_AP:
+ cancel_work_sync(&sdata->u.ap.request_smps_work);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Remove all stations associated with this interface.
+ *
+ * This must be done before calling ops->remove_interface()
+ * because otherwise we can later invoke ops->sta_notify()
+ * whenever the STAs are removed, and that invalidates driver
+ * assumptions about always getting a vif pointer that is valid
+ * (because if we remove a STA after ops->remove_interface()
+ * the driver will have removed the vif info already!)
+ *
+ * In WDS mode a station must exist here and be flushed, for
+ * AP_VLANs stations may exist since there's nothing else that
+ * would have removed them, but in other modes there shouldn't
+ * be any stations.
+ */
+ flushed = sta_info_flush(sdata);
+ WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ ((sdata->vif.type != NL80211_IFTYPE_WDS && flushed > 0) ||
+ (sdata->vif.type == NL80211_IFTYPE_WDS && flushed != 1)));
+
+ /* don't count this interface for promisc/allmulti while it is down */
+ if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+ atomic_dec(&local->iff_allmultis);
+
+ if (sdata->flags & IEEE80211_SDATA_PROMISC)
+ atomic_dec(&local->iff_promiscs);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ local->fif_pspoll--;
+ local->fif_probe_req--;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req--;
+ }
+
+ if (sdata->dev) {
+ netif_addr_lock_bh(sdata->dev);
+ spin_lock_bh(&local->filter_lock);
+ __hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
+ sdata->dev->addr_len);
+ spin_unlock_bh(&local->filter_lock);
+ netif_addr_unlock_bh(sdata->dev);
+ }
+
+ del_timer_sync(&local->dynamic_ps_timer);
+ cancel_work_sync(&local->dynamic_ps_enable_work);
+
+ cancel_work_sync(&sdata->recalc_smps);
+ sdata_lock(sdata);
+ mutex_lock(&local->mtx);
+ sdata->vif.csa_active = false;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.csa_waiting_bcn = false;
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+ mutex_unlock(&local->mtx);
+ sdata_unlock(sdata);
+
+ cancel_work_sync(&sdata->csa_finalize_work);
+
+ cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
+
+ if (sdata->wdev.cac_started) {
+ chandef = sdata->vif.bss_conf.chandef;
+ WARN_ON(local->suspended);
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&local->mtx);
+ cfg80211_cac_event(sdata->dev, &chandef,
+ NL80211_RADAR_CAC_ABORTED,
+ GFP_KERNEL);
+ }
+
+ /* APs need special treatment */
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ struct ieee80211_sub_if_data *vlan, *tmpsdata;
+
+ /* down all dependent devices, that is VLANs */
+ list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
+ u.vlan.list)
+ dev_close(vlan->dev);
+ WARN_ON(!list_empty(&sdata->u.ap.vlans));
+ } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ /* remove all packets in parent bc_buf pointing to this dev */
+ ps = &sdata->bss->ps;
+
+ spin_lock_irqsave(&ps->bc_buf.lock, flags);
+ skb_queue_walk_safe(&ps->bc_buf, skb, tmp) {
+ if (skb->dev == sdata->dev) {
+ __skb_unlink(skb, &ps->bc_buf);
+ local->total_ps_buffered--;
+ ieee80211_free_txskb(&local->hw, skb);
+ }
+ }
+ spin_unlock_irqrestore(&ps->bc_buf.lock, flags);
+ }
+
+ if (going_down)
+ local->open_count--;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ mutex_lock(&local->mtx);
+ list_del(&sdata->u.vlan.list);
+ mutex_unlock(&local->mtx);
+ RCU_INIT_POINTER(sdata->vif.chanctx_conf, NULL);
+ /* see comment in the default case below */
+ ieee80211_free_keys(sdata, true);
+ /* no need to tell driver */
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+ local->cooked_mntrs--;
+ break;
+ }
+
+ local->monitors--;
+ if (local->monitors == 0) {
+ local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR;
+ hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+ }
+
+ ieee80211_adjust_monitor_flags(sdata, -1);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /* relies on synchronize_rcu() below */
+ RCU_INIT_POINTER(local->p2p_sdata, NULL);
+ /* fall through */
+ default:
+ cancel_work_sync(&sdata->work);
+ /*
+ * When we get here, the interface is marked down.
+ * Free the remaining keys, if there are any
+ * (which can happen in AP mode if userspace sets
+ * keys before the interface is operating, and maybe
+ * also in WDS mode)
+ *
+ * Force the key freeing to always synchronize_net()
+ * to wait for the RX path in case it is using this
+ * interface enqueuing frames at this very time on
+ * another CPU.
+ */
+ ieee80211_free_keys(sdata, true);
+ skb_queue_purge(&sdata->skb_queue);
+ }
+
+ sdata->bss = NULL;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+ skb_queue_walk_safe(&local->pending[i], skb, tmp) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ if (info->control.vif == &sdata->vif) {
+ __skb_unlink(skb, &local->pending[i]);
+ ieee80211_free_txskb(&local->hw, skb);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+ if (sdata->vif.txq) {
+ struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+
+ ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+ atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
+ }
+
+ if (local->open_count == 0)
+ ieee80211_clear_tx_pending(local);
+
+ /*
+ * If the interface goes down while suspended, presumably because
+ * the device was unplugged and that happens before our resume,
+ * then the driver is already unconfigured and the remainder of
+ * this function isn't needed.
+ * XXX: what about WoWLAN? If the device has software state, e.g.
+ * memory allocated, it might expect teardown commands from
+ * mac80211 here?
+ */
+ if (local->suspended) {
+ WARN_ON(local->wowlan);
+ WARN_ON(rtnl_dereference(local->monitor_sdata));
+ return;
+ }
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (local->monitors == 0)
+ ieee80211_del_virtual_monitor(local);
+
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
+ if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ break;
+
+ /* fall through */
+ default:
+ if (going_down)
+ drv_remove_interface(local, sdata);
+ }
+
+ ieee80211_recalc_ps(local, -1);
+
+ if (cancel_scan)
+ flush_delayed_work(&local->scan_work);
+
+ if (local->open_count == 0) {
+ ieee80211_stop_device(local);
+
+ /* no reconfiguring after stop! */
+ return;
+ }
+
+ /* do after stop to avoid reconfiguring when we stop anyway */
+ ieee80211_configure_filter(local);
+ ieee80211_hw_config(local, hw_reconf_flags);
+
+ if (local->monitors == local->open_count)
+ ieee80211_add_virtual_monitor(local);
+}
+
+static int ieee80211_stop(struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ ieee80211_do_stop(sdata, true);
+
+ return 0;
+}
+
+static void ieee80211_set_multicast_list(struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ int allmulti, promisc, sdata_allmulti, sdata_promisc;
+
+ allmulti = !!(dev->flags & IFF_ALLMULTI);
+ promisc = !!(dev->flags & IFF_PROMISC);
+ sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI);
+ sdata_promisc = !!(sdata->flags & IEEE80211_SDATA_PROMISC);
+
+ if (allmulti != sdata_allmulti) {
+ if (dev->flags & IFF_ALLMULTI)
+ atomic_inc(&local->iff_allmultis);
+ else
+ atomic_dec(&local->iff_allmultis);
+ sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
+ }
+
+ if (promisc != sdata_promisc) {
+ if (dev->flags & IFF_PROMISC)
+ atomic_inc(&local->iff_promiscs);
+ else
+ atomic_dec(&local->iff_promiscs);
+ sdata->flags ^= IEEE80211_SDATA_PROMISC;
+ }
+ spin_lock_bh(&local->filter_lock);
+ __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len);
+ spin_unlock_bh(&local->filter_lock);
+ ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+}
+
+/*
+ * Called when the netdev is removed or, by the code below, before
+ * the interface type changes.
+ */
+static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ int i;
+
+ /* free extra data */
+ ieee80211_free_keys(sdata, false);
+
+ ieee80211_debugfs_remove_netdev(sdata);
+
+ for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++)
+ __skb_queue_purge(&sdata->fragments[i].skb_list);
+ sdata->fragment_next = 0;
+
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ mesh_rmc_free(sdata);
+}
+
+static void ieee80211_uninit(struct net_device *dev)
+{
+ ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev));
+}
+
+static u16 ieee80211_netdev_select_queue(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv,
+ select_queue_fallback_t fallback)
+{
+ return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
+}
+
+static const struct net_device_ops ieee80211_dataif_ops = {
+ .ndo_open = ieee80211_open,
+ .ndo_stop = ieee80211_stop,
+ .ndo_uninit = ieee80211_uninit,
+ .ndo_start_xmit = ieee80211_subif_start_xmit,
+ .ndo_set_rx_mode = ieee80211_set_multicast_list,
+ .ndo_change_mtu = ieee80211_change_mtu,
+ .ndo_set_mac_address = ieee80211_change_mac,
+ .ndo_select_queue = ieee80211_netdev_select_queue,
+};
+
+static u16 ieee80211_monitor_select_queue(struct net_device *dev,
+ struct sk_buff *skb,
+ void *accel_priv,
+ select_queue_fallback_t fallback)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_radiotap_header *rtap = (void *)skb->data;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return 0;
+
+ if (skb->len < 4 ||
+ skb->len < le16_to_cpu(rtap->it_len) + 2 /* frame control */)
+ return 0; /* doesn't matter, frame will be dropped */
+
+ hdr = (void *)((u8 *)skb->data + le16_to_cpu(rtap->it_len));
+
+ return ieee80211_select_queue_80211(sdata, skb, hdr);
+}
+
+static const struct net_device_ops ieee80211_monitorif_ops = {
+ .ndo_open = ieee80211_open,
+ .ndo_stop = ieee80211_stop,
+ .ndo_uninit = ieee80211_uninit,
+ .ndo_start_xmit = ieee80211_monitor_start_xmit,
+ .ndo_set_rx_mode = ieee80211_set_multicast_list,
+ .ndo_change_mtu = ieee80211_change_mtu,
+ .ndo_set_mac_address = ieee80211_change_mac,
+ .ndo_select_queue = ieee80211_monitor_select_queue,
+};
+
+static void ieee80211_if_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->netdev_ops = &ieee80211_dataif_ops;
+ dev->destructor = free_netdev;
+}
+
+static void ieee80211_iface_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data, work);
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct sta_info *sta;
+ struct ieee80211_ra_tid *ra_tid;
+ struct ieee80211_rx_agg *rx_agg;
+
+ if (!ieee80211_sdata_running(sdata))
+ return;
+
+ if (local->scanning)
+ return;
+
+ if (!ieee80211_can_run_worker(local))
+ return;
+
+ /* first process frames */
+ while ((skb = skb_dequeue(&sdata->skb_queue))) {
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+ if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) {
+ ra_tid = (void *)&skb->cb;
+ ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra,
+ ra_tid->tid);
+ } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) {
+ ra_tid = (void *)&skb->cb;
+ ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra,
+ ra_tid->tid);
+ } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_START) {
+ rx_agg = (void *)&skb->cb;
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, rx_agg->addr);
+ if (sta)
+ __ieee80211_start_rx_ba_session(sta,
+ 0, 0, 0, 1, rx_agg->tid,
+ IEEE80211_MAX_AMPDU_BUF,
+ false, true);
+ mutex_unlock(&local->sta_mtx);
+ } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_RX_AGG_STOP) {
+ rx_agg = (void *)&skb->cb;
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, rx_agg->addr);
+ if (sta)
+ __ieee80211_stop_rx_ba_session(sta,
+ rx_agg->tid,
+ WLAN_BACK_RECIPIENT, 0,
+ false);
+ mutex_unlock(&local->sta_mtx);
+ } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_TDLS_CHSW) {
+ ieee80211_process_tdls_channel_switch(sdata, skb);
+ } else if (ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category == WLAN_CATEGORY_BACK) {
+ int len = skb->len;
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, mgmt->sa);
+ if (sta) {
+ switch (mgmt->u.action.u.addba_req.action_code) {
+ case WLAN_ACTION_ADDBA_REQ:
+ ieee80211_process_addba_request(
+ local, sta, mgmt, len);
+ break;
+ case WLAN_ACTION_ADDBA_RESP:
+ ieee80211_process_addba_resp(local, sta,
+ mgmt, len);
+ break;
+ case WLAN_ACTION_DELBA:
+ ieee80211_process_delba(sdata, sta,
+ mgmt, len);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+ }
+ mutex_unlock(&local->sta_mtx);
+ } else if (ieee80211_is_data_qos(mgmt->frame_control)) {
+ struct ieee80211_hdr *hdr = (void *)mgmt;
+ /*
+ * So the frame isn't mgmt, but frame_control
+ * is at the right place anyway, of course, so
+ * the if statement is correct.
+ *
+ * Warn if we have other data frame types here,
+ * they must not get here.
+ */
+ WARN_ON(hdr->frame_control &
+ cpu_to_le16(IEEE80211_STYPE_NULLFUNC));
+ WARN_ON(!(hdr->seq_ctrl &
+ cpu_to_le16(IEEE80211_SCTL_FRAG)));
+ /*
+ * This was a fragment of a frame, received while
+ * a block-ack session was active. That cannot be
+ * right, so terminate the session.
+ */
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get_bss(sdata, mgmt->sa);
+ if (sta) {
+ u16 tid = *ieee80211_get_qos_ctl(hdr) &
+ IEEE80211_QOS_CTL_TID_MASK;
+
+ __ieee80211_stop_rx_ba_session(
+ sta, tid, WLAN_BACK_RECIPIENT,
+ WLAN_REASON_QSTA_REQUIRE_SETUP,
+ true);
+ }
+ mutex_unlock(&local->sta_mtx);
+ } else switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ ieee80211_sta_rx_queued_mgmt(sdata, skb);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!ieee80211_vif_is_mesh(&sdata->vif))
+ break;
+ ieee80211_mesh_rx_queued_mgmt(sdata, skb);
+ break;
+ default:
+ WARN(1, "frame for unexpected interface type");
+ break;
+ }
+
+ kfree_skb(skb);
+ }
+
+ /* then other type-dependent work */
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ ieee80211_sta_work(sdata);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ ieee80211_ibss_work(sdata);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!ieee80211_vif_is_mesh(&sdata->vif))
+ break;
+ ieee80211_mesh_work(sdata);
+ break;
+ case NL80211_IFTYPE_OCB:
+ ieee80211_ocb_work(sdata);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ieee80211_recalc_smps_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data, recalc_smps);
+
+ ieee80211_recalc_smps(sdata);
+}
+
+/*
+ * Helper function to initialise an interface to a specific type.
+ */
+static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type)
+{
+ static const u8 bssid_wildcard[ETH_ALEN] = {0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff};
+
+ /* clear type-dependent union */
+ memset(&sdata->u, 0, sizeof(sdata->u));
+
+ /* and set some type-dependent values */
+ sdata->vif.type = type;
+ sdata->vif.p2p = false;
+ sdata->wdev.iftype = type;
+
+ sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
+ sdata->control_port_no_encrypt = false;
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+ sdata->vif.bss_conf.idle = true;
+
+ sdata->noack_map = 0;
+
+ /* only monitor/p2p-device differ */
+ if (sdata->dev) {
+ sdata->dev->netdev_ops = &ieee80211_dataif_ops;
+ sdata->dev->type = ARPHRD_ETHER;
+ }
+
+ skb_queue_head_init(&sdata->skb_queue);
+ INIT_WORK(&sdata->work, ieee80211_iface_work);
+ INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work);
+ INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work);
+ INIT_LIST_HEAD(&sdata->assigned_chanctx_list);
+ INIT_LIST_HEAD(&sdata->reserved_chanctx_list);
+
+ switch (type) {
+ case NL80211_IFTYPE_P2P_GO:
+ type = NL80211_IFTYPE_AP;
+ sdata->vif.type = type;
+ sdata->vif.p2p = true;
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ skb_queue_head_init(&sdata->u.ap.ps.bc_buf);
+ INIT_LIST_HEAD(&sdata->u.ap.vlans);
+ INIT_WORK(&sdata->u.ap.request_smps_work,
+ ieee80211_request_smps_ap_work);
+ sdata->vif.bss_conf.bssid = sdata->vif.addr;
+ sdata->u.ap.req_smps = IEEE80211_SMPS_OFF;
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ type = NL80211_IFTYPE_STATION;
+ sdata->vif.type = type;
+ sdata->vif.p2p = true;
+ /* fall through */
+ case NL80211_IFTYPE_STATION:
+ sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid;
+ ieee80211_sta_setup_sdata(sdata);
+ break;
+ case NL80211_IFTYPE_OCB:
+ sdata->vif.bss_conf.bssid = bssid_wildcard;
+ ieee80211_ocb_setup_sdata(sdata);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
+ ieee80211_ibss_setup_sdata(sdata);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ ieee80211_mesh_init_sdata(sdata);
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
+ sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
+ sdata->u.mntr_flags = MONITOR_FLAG_CONTROL |
+ MONITOR_FLAG_OTHER_BSS;
+ break;
+ case NL80211_IFTYPE_WDS:
+ sdata->vif.bss_conf.bssid = NULL;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ sdata->vif.bss_conf.bssid = sdata->vif.addr;
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ BUG();
+ break;
+ }
+
+ ieee80211_debugfs_add_netdev(sdata);
+}
+
+static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret, err;
+ enum nl80211_iftype internal_type = type;
+ bool p2p = false;
+
+ ASSERT_RTNL();
+
+ if (!local->ops->change_interface)
+ return -EBUSY;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_OCB:
+ /*
+ * Could maybe also all others here?
+ * Just not sure how that interacts
+ * with the RX/config path e.g. for
+ * mesh.
+ */
+ break;
+ default:
+ return -EBUSY;
+ }
+
+ switch (type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_OCB:
+ /*
+ * Could probably support everything
+ * but WDS here (WDS do_open can fail
+ * under memory pressure, which this
+ * code isn't prepared to handle).
+ */
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ p2p = true;
+ internal_type = NL80211_IFTYPE_STATION;
+ break;
+ case NL80211_IFTYPE_P2P_GO:
+ p2p = true;
+ internal_type = NL80211_IFTYPE_AP;
+ break;
+ default:
+ return -EBUSY;
+ }
+
+ ret = ieee80211_check_concurrent_iface(sdata, internal_type);
+ if (ret)
+ return ret;
+
+ ieee80211_do_stop(sdata, false);
+
+ ieee80211_teardown_sdata(sdata);
+
+ ret = drv_change_interface(local, sdata, internal_type, p2p);
+ if (ret)
+ type = ieee80211_vif_type_p2p(&sdata->vif);
+
+ /*
+ * Ignore return value here, there's not much we can do since
+ * the driver changed the interface type internally already.
+ * The warnings will hopefully make driver authors fix it :-)
+ */
+ ieee80211_check_queues(sdata, type);
+
+ ieee80211_setup_sdata(sdata, type);
+
+ err = ieee80211_do_open(&sdata->wdev, false);
+ WARN(err, "type change: do_open returned %d", err);
+
+ return ret;
+}
+
+int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type)
+{
+ int ret;
+
+ ASSERT_RTNL();
+
+ if (type == ieee80211_vif_type_p2p(&sdata->vif))
+ return 0;
+
+ if (ieee80211_sdata_running(sdata)) {
+ ret = ieee80211_runtime_change_iftype(sdata, type);
+ if (ret)
+ return ret;
+ } else {
+ /* Purge and reset type-dependent state. */
+ ieee80211_teardown_sdata(sdata);
+ ieee80211_setup_sdata(sdata, type);
+ }
+
+ /* reset some values that shouldn't be kept across type changes */
+ if (type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.use_4addr = false;
+
+ return 0;
+}
+
+static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
+ u8 *perm_addr, enum nl80211_iftype type)
+{
+ struct ieee80211_sub_if_data *sdata;
+ u64 mask, start, addr, val, inc;
+ u8 *m;
+ u8 tmp_addr[ETH_ALEN];
+ int i;
+
+ /* default ... something at least */
+ memcpy(perm_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
+
+ if (is_zero_ether_addr(local->hw.wiphy->addr_mask) &&
+ local->hw.wiphy->n_addresses <= 1)
+ return;
+
+ mutex_lock(&local->iflist_mtx);
+
+ switch (type) {
+ case NL80211_IFTYPE_MONITOR:
+ /* doesn't matter */
+ break;
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_AP_VLAN:
+ /* match up with an AP interface */
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ continue;
+ memcpy(perm_addr, sdata->vif.addr, ETH_ALEN);
+ break;
+ }
+ /* keep default if no AP interface present */
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ if (local->hw.flags & IEEE80211_HW_P2P_DEV_ADDR_FOR_INTF) {
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE)
+ continue;
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ memcpy(perm_addr, sdata->vif.addr, ETH_ALEN);
+ goto out_unlock;
+ }
+ }
+ /* otherwise fall through */
+ default:
+ /* assign a new address if possible -- try n_addresses first */
+ for (i = 0; i < local->hw.wiphy->n_addresses; i++) {
+ bool used = false;
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (ether_addr_equal(local->hw.wiphy->addresses[i].addr,
+ sdata->vif.addr)) {
+ used = true;
+ break;
+ }
+ }
+
+ if (!used) {
+ memcpy(perm_addr,
+ local->hw.wiphy->addresses[i].addr,
+ ETH_ALEN);
+ break;
+ }
+ }
+
+ /* try mask if available */
+ if (is_zero_ether_addr(local->hw.wiphy->addr_mask))
+ break;
+
+ m = local->hw.wiphy->addr_mask;
+ mask = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+ ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+ ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+ if (__ffs64(mask) + hweight64(mask) != fls64(mask)) {
+ /* not a contiguous mask ... not handled now! */
+ pr_info("not contiguous\n");
+ break;
+ }
+
+ /*
+ * Pick address of existing interface in case user changed
+ * MAC address manually, default to perm_addr.
+ */
+ m = local->hw.wiphy->perm_addr;
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
+ continue;
+ m = sdata->vif.addr;
+ break;
+ }
+ start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+ ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+ ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+ inc = 1ULL<<__ffs64(mask);
+ val = (start & mask);
+ addr = (start & ~mask) | (val & mask);
+ do {
+ bool used = false;
+
+ tmp_addr[5] = addr >> 0*8;
+ tmp_addr[4] = addr >> 1*8;
+ tmp_addr[3] = addr >> 2*8;
+ tmp_addr[2] = addr >> 3*8;
+ tmp_addr[1] = addr >> 4*8;
+ tmp_addr[0] = addr >> 5*8;
+
+ val += inc;
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (ether_addr_equal(tmp_addr, sdata->vif.addr)) {
+ used = true;
+ break;
+ }
+ }
+
+ if (!used) {
+ memcpy(perm_addr, tmp_addr, ETH_ALEN);
+ break;
+ }
+ addr = (start & ~mask) | (val & mask);
+ } while (addr != start);
+
+ break;
+ }
+
+ out_unlock:
+ mutex_unlock(&local->iflist_mtx);
+}
+
+int ieee80211_if_add(struct ieee80211_local *local, const char *name,
+ unsigned char name_assign_type,
+ struct wireless_dev **new_wdev, enum nl80211_iftype type,
+ struct vif_params *params)
+{
+ struct net_device *ndev = NULL;
+ struct ieee80211_sub_if_data *sdata = NULL;
+ struct txq_info *txqi;
+ int ret, i;
+ int txqs = 1;
+
+ ASSERT_RTNL();
+
+ if (type == NL80211_IFTYPE_P2P_DEVICE) {
+ struct wireless_dev *wdev;
+
+ sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size,
+ GFP_KERNEL);
+ if (!sdata)
+ return -ENOMEM;
+ wdev = &sdata->wdev;
+
+ sdata->dev = NULL;
+ strlcpy(sdata->name, name, IFNAMSIZ);
+ ieee80211_assign_perm_addr(local, wdev->address, type);
+ memcpy(sdata->vif.addr, wdev->address, ETH_ALEN);
+ } else {
+ int size = ALIGN(sizeof(*sdata) + local->hw.vif_data_size,
+ sizeof(void *));
+ int txq_size = 0;
+
+ if (local->ops->wake_tx_queue)
+ txq_size += sizeof(struct txq_info) +
+ local->hw.txq_data_size;
+
+ if (local->hw.queues >= IEEE80211_NUM_ACS)
+ txqs = IEEE80211_NUM_ACS;
+
+ ndev = alloc_netdev_mqs(size + txq_size,
+ name, name_assign_type,
+ ieee80211_if_setup, txqs, 1);
+ if (!ndev)
+ return -ENOMEM;
+ dev_net_set(ndev, wiphy_net(local->hw.wiphy));
+
+ ndev->needed_headroom = local->tx_headroom +
+ 4*6 /* four MAC addresses */
+ + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */
+ + 6 /* mesh */
+ + 8 /* rfc1042/bridge tunnel */
+ - ETH_HLEN /* ethernet hard_header_len */
+ + IEEE80211_ENCRYPT_HEADROOM;
+ ndev->needed_tailroom = IEEE80211_ENCRYPT_TAILROOM;
+
+ ret = dev_alloc_name(ndev, ndev->name);
+ if (ret < 0) {
+ free_netdev(ndev);
+ return ret;
+ }
+
+ ieee80211_assign_perm_addr(local, ndev->perm_addr, type);
+ if (params && is_valid_ether_addr(params->macaddr))
+ memcpy(ndev->dev_addr, params->macaddr, ETH_ALEN);
+ else
+ memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN);
+ SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));
+
+ /* don't use IEEE80211_DEV_TO_SUB_IF -- it checks too much */
+ sdata = netdev_priv(ndev);
+ ndev->ieee80211_ptr = &sdata->wdev;
+ memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN);
+ memcpy(sdata->name, ndev->name, IFNAMSIZ);
+
+ if (txq_size) {
+ txqi = netdev_priv(ndev) + size;
+ ieee80211_init_tx_queue(sdata, NULL, txqi, 0);
+ }
+
+ sdata->dev = ndev;
+ }
+
+ /* initialise type-independent data */
+ sdata->wdev.wiphy = local->hw.wiphy;
+ sdata->local = local;
+
+ for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++)
+ skb_queue_head_init(&sdata->fragments[i].skb_list);
+
+ INIT_LIST_HEAD(&sdata->key_list);
+
+ INIT_DELAYED_WORK(&sdata->dfs_cac_timer_work,
+ ieee80211_dfs_cac_timer_work);
+ INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk,
+ ieee80211_delayed_tailroom_dec);
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ struct ieee80211_supported_band *sband;
+ sband = local->hw.wiphy->bands[i];
+ sdata->rc_rateidx_mask[i] =
+ sband ? (1 << sband->n_bitrates) - 1 : 0;
+ if (sband)
+ memcpy(sdata->rc_rateidx_mcs_mask[i],
+ sband->ht_cap.mcs.rx_mask,
+ sizeof(sdata->rc_rateidx_mcs_mask[i]));
+ else
+ memset(sdata->rc_rateidx_mcs_mask[i], 0,
+ sizeof(sdata->rc_rateidx_mcs_mask[i]));
+ }
+
+ ieee80211_set_default_queues(sdata);
+
+ sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
+ sdata->user_power_level = local->user_power_level;
+
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+
+ /* setup type-dependent data */
+ ieee80211_setup_sdata(sdata, type);
+
+ if (ndev) {
+ if (params) {
+ ndev->ieee80211_ptr->use_4addr = params->use_4addr;
+ if (type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.use_4addr = params->use_4addr;
+ }
+
+ ndev->features |= local->hw.netdev_features;
+
+ netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);
+
+ ret = register_netdevice(ndev);
+ if (ret) {
+ free_netdev(ndev);
+ return ret;
+ }
+ }
+
+ mutex_lock(&local->iflist_mtx);
+ list_add_tail_rcu(&sdata->list, &local->interfaces);
+ mutex_unlock(&local->iflist_mtx);
+
+ if (new_wdev)
+ *new_wdev = &sdata->wdev;
+
+ return 0;
+}
+
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&sdata->local->iflist_mtx);
+ list_del_rcu(&sdata->list);
+ mutex_unlock(&sdata->local->iflist_mtx);
+
+ synchronize_rcu();
+
+ if (sdata->dev) {
+ unregister_netdevice(sdata->dev);
+ } else {
+ cfg80211_unregister_wdev(&sdata->wdev);
+ kfree(sdata);
+ }
+}
+
+void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata)
+{
+ if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state)))
+ return;
+ ieee80211_do_stop(sdata, true);
+ ieee80211_teardown_sdata(sdata);
+}
+
+/*
+ * Remove all interfaces, may only be called at hardware unregistration
+ * time because it doesn't do RCU-safe list removals.
+ */
+void ieee80211_remove_interfaces(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata, *tmp;
+ LIST_HEAD(unreg_list);
+ LIST_HEAD(wdev_list);
+
+ ASSERT_RTNL();
+
+ /*
+ * Close all AP_VLAN interfaces first, as otherwise they
+ * might be closed while the AP interface they belong to
+ * is closed, causing unregister_netdevice_many() to crash.
+ */
+ list_for_each_entry(sdata, &local->interfaces, list)
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ dev_close(sdata->dev);
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
+ list_del(&sdata->list);
+
+ if (sdata->dev)
+ unregister_netdevice_queue(sdata->dev, &unreg_list);
+ else
+ list_add(&sdata->list, &wdev_list);
+ }
+ mutex_unlock(&local->iflist_mtx);
+ unregister_netdevice_many(&unreg_list);
+
+ list_for_each_entry_safe(sdata, tmp, &wdev_list, list) {
+ list_del(&sdata->list);
+ cfg80211_unregister_wdev(&sdata->wdev);
+ kfree(sdata);
+ }
+}
+
+static int netdev_notify(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ieee80211_sub_if_data *sdata;
+
+ if (state != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
+
+ if (!dev->ieee80211_ptr || !dev->ieee80211_ptr->wiphy)
+ return NOTIFY_DONE;
+
+ if (dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid)
+ return NOTIFY_DONE;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ memcpy(sdata->name, dev->name, IFNAMSIZ);
+ ieee80211_debugfs_rename_netdev(sdata);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mac80211_netdev_notifier = {
+ .notifier_call = netdev_notify,
+};
+
+int ieee80211_iface_init(void)
+{
+ return register_netdevice_notifier(&mac80211_netdev_notifier);
+}
+
+void ieee80211_iface_exit(void)
+{
+ unregister_netdevice_notifier(&mac80211_netdev_notifier);
+}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
new file mode 100644
index 000000000..81e9785f3
--- /dev/null
+++ b/net/mac80211/key.c
@@ -0,0 +1,1169 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "debugfs_key.h"
+#include "aes_ccm.h"
+#include "aes_cmac.h"
+#include "aes_gmac.h"
+#include "aes_gcm.h"
+
+
+/**
+ * DOC: Key handling basics
+ *
+ * Key handling in mac80211 is done based on per-interface (sub_if_data)
+ * keys and per-station keys. Since each station belongs to an interface,
+ * each station key also belongs to that interface.
+ *
+ * Hardware acceleration is done on a best-effort basis for algorithms
+ * that are implemented in software, for each key the hardware is asked
+ * to enable that key for offloading but if it cannot do that the key is
+ * simply kept for software encryption (unless it is for an algorithm
+ * that isn't implemented in software).
+ * There is currently no way of knowing whether a key is handled in SW
+ * or HW except by looking into debugfs.
+ *
+ * All key management is internally protected by a mutex. Within all
+ * other parts of mac80211, key references are, just as STA structure
+ * references, protected by RCU. Note, however, that some things are
+ * unprotected, namely the key->sta dereferences within the hardware
+ * acceleration functions. This means that sta_info_destroy() must
+ * remove the key which waits for an RCU grace period.
+ */
+
+static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+
+static void assert_key_lock(struct ieee80211_local *local)
+{
+ lockdep_assert_held(&local->key_mtx);
+}
+
+static void
+update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta)
+{
+ struct ieee80211_sub_if_data *vlan;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return;
+
+ /* crypto_tx_tailroom_needed_cnt is protected by this */
+ assert_key_lock(sdata->local);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ vlan->crypto_tx_tailroom_needed_cnt += delta;
+
+ rcu_read_unlock();
+}
+
+static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata)
+{
+ /*
+ * When this count is zero, SKB resizing for allocating tailroom
+ * for IV or MMIC is skipped. But, this check has created two race
+ * cases in xmit path while transiting from zero count to one:
+ *
+ * 1. SKB resize was skipped because no key was added but just before
+ * the xmit key is added and SW encryption kicks off.
+ *
+ * 2. SKB resize was skipped because all the keys were hw planted but
+ * just before xmit one of the key is deleted and SW encryption kicks
+ * off.
+ *
+ * In both the above case SW encryption will find not enough space for
+ * tailroom and exits with WARN_ON. (See WARN_ONs at wpa.c)
+ *
+ * Solution has been explained at
+ * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net
+ */
+
+ assert_key_lock(sdata->local);
+
+ update_vlan_tailroom_need_count(sdata, 1);
+
+ if (!sdata->crypto_tx_tailroom_needed_cnt++) {
+ /*
+ * Flush all XMIT packets currently using HW encryption or no
+ * encryption at all if the count transition is from 0 -> 1.
+ */
+ synchronize_net();
+ }
+}
+
+static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata,
+ int delta)
+{
+ assert_key_lock(sdata->local);
+
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta);
+
+ update_vlan_tailroom_need_count(sdata, -delta);
+ sdata->crypto_tx_tailroom_needed_cnt -= delta;
+}
+
+static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+
+ if (key->flags & KEY_FLAG_TAINTED) {
+ /* If we get here, it's during resume and the key is
+ * tainted so shouldn't be used/programmed any more.
+ * However, its flags may still indicate that it was
+ * programmed into the device (since we're in resume)
+ * so clear that flag now to avoid trying to remove
+ * it again later.
+ */
+ key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+ return -EINVAL;
+ }
+
+ if (!key->local->ops->set_key)
+ goto out_unsupported;
+
+ assert_key_lock(key->local);
+
+ sta = key->sta;
+
+ /*
+ * If this is a per-STA GTK, check if it
+ * is supported; if not, return.
+ */
+ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
+ !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
+ goto out_unsupported;
+
+ if (sta && !sta->uploaded)
+ goto out_unsupported;
+
+ sdata = key->sdata;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ /*
+ * The driver doesn't know anything about VLAN interfaces.
+ * Hence, don't send GTKs for VLAN interfaces to the driver.
+ */
+ if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE))
+ goto out_unsupported;
+ }
+
+ ret = drv_set_key(key->local, SET_KEY, sdata,
+ sta ? &sta->sta : NULL, &key->conf);
+
+ if (!ret) {
+ key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+ if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+ (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
+ decrease_tailroom_need_count(sdata, 1);
+
+ WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
+ (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV));
+
+ return 0;
+ }
+
+ if (ret != -ENOSPC && ret != -EOPNOTSUPP && ret != 1)
+ sdata_err(sdata,
+ "failed to set key (%d, %pM) to hardware (%d)\n",
+ key->conf.keyidx,
+ sta ? sta->sta.addr : bcast_addr, ret);
+
+ out_unsupported:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ case WLAN_CIPHER_SUITE_TKIP:
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ /* all of these we can do in software - if driver can */
+ if (ret == 1)
+ return 0;
+ if (key->local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL)
+ return -EINVAL;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ int ret;
+
+ might_sleep();
+
+ if (!key || !key->local->ops->set_key)
+ return;
+
+ assert_key_lock(key->local);
+
+ if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ return;
+
+ sta = key->sta;
+ sdata = key->sdata;
+
+ if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+ (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
+ increment_tailroom_need_count(sdata);
+
+ ret = drv_set_key(key->local, DISABLE_KEY, sdata,
+ sta ? &sta->sta : NULL, &key->conf);
+
+ if (ret)
+ sdata_err(sdata,
+ "failed to remove key (%d, %pM) from hardware (%d)\n",
+ key->conf.keyidx,
+ sta ? sta->sta.addr : bcast_addr, ret);
+
+ key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+}
+
+static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
+ int idx, bool uni, bool multi)
+{
+ struct ieee80211_key *key = NULL;
+
+ assert_key_lock(sdata->local);
+
+ if (idx >= 0 && idx < NUM_DEFAULT_KEYS)
+ key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+
+ if (uni) {
+ rcu_assign_pointer(sdata->default_unicast_key, key);
+ drv_set_default_unicast_key(sdata->local, sdata, idx);
+ }
+
+ if (multi)
+ rcu_assign_pointer(sdata->default_multicast_key, key);
+
+ ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx,
+ bool uni, bool multi)
+{
+ mutex_lock(&sdata->local->key_mtx);
+ __ieee80211_set_default_key(sdata, idx, uni, multi);
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
+static void
+__ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx)
+{
+ struct ieee80211_key *key = NULL;
+
+ assert_key_lock(sdata->local);
+
+ if (idx >= NUM_DEFAULT_KEYS &&
+ idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+ key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+
+ rcu_assign_pointer(sdata->default_mgmt_key, key);
+
+ ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
+ int idx)
+{
+ mutex_lock(&sdata->local->key_mtx);
+ __ieee80211_set_default_mgmt_key(sdata, idx);
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
+
+static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ bool pairwise,
+ struct ieee80211_key *old,
+ struct ieee80211_key *new)
+{
+ int idx;
+ bool defunikey, defmultikey, defmgmtkey;
+
+ /* caller must provide at least one old/new */
+ if (WARN_ON(!new && !old))
+ return;
+
+ if (new)
+ list_add_tail(&new->list, &sdata->key_list);
+
+ WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
+
+ if (old)
+ idx = old->conf.keyidx;
+ else
+ idx = new->conf.keyidx;
+
+ if (sta) {
+ if (pairwise) {
+ rcu_assign_pointer(sta->ptk[idx], new);
+ sta->ptk_idx = idx;
+ } else {
+ rcu_assign_pointer(sta->gtk[idx], new);
+ sta->gtk_idx = idx;
+ }
+ } else {
+ defunikey = old &&
+ old == key_mtx_dereference(sdata->local,
+ sdata->default_unicast_key);
+ defmultikey = old &&
+ old == key_mtx_dereference(sdata->local,
+ sdata->default_multicast_key);
+ defmgmtkey = old &&
+ old == key_mtx_dereference(sdata->local,
+ sdata->default_mgmt_key);
+
+ if (defunikey && !new)
+ __ieee80211_set_default_key(sdata, -1, true, false);
+ if (defmultikey && !new)
+ __ieee80211_set_default_key(sdata, -1, false, true);
+ if (defmgmtkey && !new)
+ __ieee80211_set_default_mgmt_key(sdata, -1);
+
+ rcu_assign_pointer(sdata->keys[idx], new);
+ if (defunikey && new)
+ __ieee80211_set_default_key(sdata, new->conf.keyidx,
+ true, false);
+ if (defmultikey && new)
+ __ieee80211_set_default_key(sdata, new->conf.keyidx,
+ false, true);
+ if (defmgmtkey && new)
+ __ieee80211_set_default_mgmt_key(sdata,
+ new->conf.keyidx);
+ }
+
+ if (old)
+ list_del(&old->list);
+}
+
+struct ieee80211_key *
+ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
+ const u8 *key_data,
+ size_t seq_len, const u8 *seq,
+ const struct ieee80211_cipher_scheme *cs)
+{
+ struct ieee80211_key *key;
+ int i, j, err;
+
+ if (WARN_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS))
+ return ERR_PTR(-EINVAL);
+
+ key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
+ if (!key)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Default to software encryption; we'll later upload the
+ * key to the hardware if possible.
+ */
+ key->conf.flags = 0;
+ key->flags = 0;
+
+ key->conf.cipher = cipher;
+ key->conf.keyidx = idx;
+ key->conf.keylen = key_len;
+ switch (cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ key->conf.iv_len = IEEE80211_WEP_IV_LEN;
+ key->conf.icv_len = IEEE80211_WEP_ICV_LEN;
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ key->conf.iv_len = IEEE80211_TKIP_IV_LEN;
+ key->conf.icv_len = IEEE80211_TKIP_ICV_LEN;
+ if (seq) {
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ key->u.tkip.rx[i].iv32 =
+ get_unaligned_le32(&seq[2]);
+ key->u.tkip.rx[i].iv16 =
+ get_unaligned_le16(seq);
+ }
+ }
+ spin_lock_init(&key->u.tkip.txlock);
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ key->conf.iv_len = IEEE80211_CCMP_HDR_LEN;
+ key->conf.icv_len = IEEE80211_CCMP_MIC_LEN;
+ if (seq) {
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
+ for (j = 0; j < IEEE80211_CCMP_PN_LEN; j++)
+ key->u.ccmp.rx_pn[i][j] =
+ seq[IEEE80211_CCMP_PN_LEN - j - 1];
+ }
+ /*
+ * Initialize AES key state here as an optimization so that
+ * it does not need to be initialized for every packet.
+ */
+ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(
+ key_data, key_len, IEEE80211_CCMP_MIC_LEN);
+ if (IS_ERR(key->u.ccmp.tfm)) {
+ err = PTR_ERR(key->u.ccmp.tfm);
+ kfree(key);
+ return ERR_PTR(err);
+ }
+ break;
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ key->conf.iv_len = IEEE80211_CCMP_256_HDR_LEN;
+ key->conf.icv_len = IEEE80211_CCMP_256_MIC_LEN;
+ for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++)
+ for (j = 0; j < IEEE80211_CCMP_256_PN_LEN; j++)
+ key->u.ccmp.rx_pn[i][j] =
+ seq[IEEE80211_CCMP_256_PN_LEN - j - 1];
+ /* Initialize AES key state here as an optimization so that
+ * it does not need to be initialized for every packet.
+ */
+ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(
+ key_data, key_len, IEEE80211_CCMP_256_MIC_LEN);
+ if (IS_ERR(key->u.ccmp.tfm)) {
+ err = PTR_ERR(key->u.ccmp.tfm);
+ kfree(key);
+ return ERR_PTR(err);
+ }
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ key->conf.iv_len = 0;
+ if (cipher == WLAN_CIPHER_SUITE_AES_CMAC)
+ key->conf.icv_len = sizeof(struct ieee80211_mmie);
+ else
+ key->conf.icv_len = sizeof(struct ieee80211_mmie_16);
+ if (seq)
+ for (j = 0; j < IEEE80211_CMAC_PN_LEN; j++)
+ key->u.aes_cmac.rx_pn[j] =
+ seq[IEEE80211_CMAC_PN_LEN - j - 1];
+ /*
+ * Initialize AES key state here as an optimization so that
+ * it does not need to be initialized for every packet.
+ */
+ key->u.aes_cmac.tfm =
+ ieee80211_aes_cmac_key_setup(key_data, key_len);
+ if (IS_ERR(key->u.aes_cmac.tfm)) {
+ err = PTR_ERR(key->u.aes_cmac.tfm);
+ kfree(key);
+ return ERR_PTR(err);
+ }
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ key->conf.iv_len = 0;
+ key->conf.icv_len = sizeof(struct ieee80211_mmie_16);
+ if (seq)
+ for (j = 0; j < IEEE80211_GMAC_PN_LEN; j++)
+ key->u.aes_gmac.rx_pn[j] =
+ seq[IEEE80211_GMAC_PN_LEN - j - 1];
+ /* Initialize AES key state here as an optimization so that
+ * it does not need to be initialized for every packet.
+ */
+ key->u.aes_gmac.tfm =
+ ieee80211_aes_gmac_key_setup(key_data, key_len);
+ if (IS_ERR(key->u.aes_gmac.tfm)) {
+ err = PTR_ERR(key->u.aes_gmac.tfm);
+ kfree(key);
+ return ERR_PTR(err);
+ }
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ key->conf.iv_len = IEEE80211_GCMP_HDR_LEN;
+ key->conf.icv_len = IEEE80211_GCMP_MIC_LEN;
+ for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++)
+ for (j = 0; j < IEEE80211_GCMP_PN_LEN; j++)
+ key->u.gcmp.rx_pn[i][j] =
+ seq[IEEE80211_GCMP_PN_LEN - j - 1];
+ /* Initialize AES key state here as an optimization so that
+ * it does not need to be initialized for every packet.
+ */
+ key->u.gcmp.tfm = ieee80211_aes_gcm_key_setup_encrypt(key_data,
+ key_len);
+ if (IS_ERR(key->u.gcmp.tfm)) {
+ err = PTR_ERR(key->u.gcmp.tfm);
+ kfree(key);
+ return ERR_PTR(err);
+ }
+ break;
+ default:
+ if (cs) {
+ size_t len = (seq_len > MAX_PN_LEN) ?
+ MAX_PN_LEN : seq_len;
+
+ key->conf.iv_len = cs->hdr_len;
+ key->conf.icv_len = cs->mic_len;
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
+ for (j = 0; j < len; j++)
+ key->u.gen.rx_pn[i][j] =
+ seq[len - j - 1];
+ key->flags |= KEY_FLAG_CIPHER_SCHEME;
+ }
+ }
+ memcpy(key->conf.key, key_data, key_len);
+ INIT_LIST_HEAD(&key->list);
+
+ return key;
+}
+
+static void ieee80211_key_free_common(struct ieee80211_key *key)
+{
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ ieee80211_aes_key_free(key->u.ccmp.tfm);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ ieee80211_aes_gmac_key_free(key->u.aes_gmac.tfm);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ ieee80211_aes_gcm_key_free(key->u.gcmp.tfm);
+ break;
+ }
+ kzfree(key);
+}
+
+static void __ieee80211_key_destroy(struct ieee80211_key *key,
+ bool delay_tailroom)
+{
+ if (key->local)
+ ieee80211_key_disable_hw_accel(key);
+
+ if (key->local) {
+ struct ieee80211_sub_if_data *sdata = key->sdata;
+
+ ieee80211_debugfs_key_remove(key);
+
+ if (delay_tailroom) {
+ /* see ieee80211_delayed_tailroom_dec */
+ sdata->crypto_tx_tailroom_pending_dec++;
+ schedule_delayed_work(&sdata->dec_tailroom_needed_wk,
+ HZ/2);
+ } else {
+ decrease_tailroom_need_count(sdata, 1);
+ }
+ }
+
+ ieee80211_key_free_common(key);
+}
+
+static void ieee80211_key_destroy(struct ieee80211_key *key,
+ bool delay_tailroom)
+{
+ if (!key)
+ return;
+
+ /*
+ * Synchronize so the TX path can no longer be using
+ * this key before we free/remove it.
+ */
+ synchronize_net();
+
+ __ieee80211_key_destroy(key, delay_tailroom);
+}
+
+void ieee80211_key_free_unused(struct ieee80211_key *key)
+{
+ WARN_ON(key->sdata || key->local);
+ ieee80211_key_free_common(key);
+}
+
+int ieee80211_key_link(struct ieee80211_key *key,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_key *old_key;
+ int idx, ret;
+ bool pairwise;
+
+ pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
+ idx = key->conf.keyidx;
+ key->local = sdata->local;
+ key->sdata = sdata;
+ key->sta = sta;
+
+ mutex_lock(&sdata->local->key_mtx);
+
+ if (sta && pairwise)
+ old_key = key_mtx_dereference(sdata->local, sta->ptk[idx]);
+ else if (sta)
+ old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]);
+ else
+ old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+
+ increment_tailroom_need_count(sdata);
+
+ ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
+ ieee80211_key_destroy(old_key, true);
+
+ ieee80211_debugfs_key_add(key);
+
+ if (!local->wowlan) {
+ ret = ieee80211_key_enable_hw_accel(key);
+ if (ret)
+ ieee80211_key_free(key, true);
+ } else {
+ ret = 0;
+ }
+
+ mutex_unlock(&sdata->local->key_mtx);
+
+ return ret;
+}
+
+void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom)
+{
+ if (!key)
+ return;
+
+ /*
+ * Replace key with nothingness if it was ever used.
+ */
+ if (key->sdata)
+ ieee80211_key_replace(key->sdata, key->sta,
+ key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+ key, NULL);
+ ieee80211_key_destroy(key, delay_tailroom);
+}
+
+void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_key *key;
+ struct ieee80211_sub_if_data *vlan;
+
+ ASSERT_RTNL();
+
+ if (WARN_ON(!ieee80211_sdata_running(sdata)))
+ return;
+
+ mutex_lock(&sdata->local->key_mtx);
+
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
+ sdata->crypto_tx_tailroom_pending_dec);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt ||
+ vlan->crypto_tx_tailroom_pending_dec);
+ }
+
+ list_for_each_entry(key, &sdata->key_list, list) {
+ increment_tailroom_need_count(sdata);
+ ieee80211_key_enable_hw_accel(key);
+ }
+
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
+void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_sub_if_data *vlan;
+
+ mutex_lock(&sdata->local->key_mtx);
+
+ sdata->crypto_tx_tailroom_needed_cnt = 0;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ vlan->crypto_tx_tailroom_needed_cnt = 0;
+ }
+
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
+void ieee80211_iter_keys(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct ieee80211_key_conf *key,
+ void *data),
+ void *iter_data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_key *key, *tmp;
+ struct ieee80211_sub_if_data *sdata;
+
+ ASSERT_RTNL();
+
+ mutex_lock(&local->key_mtx);
+ if (vif) {
+ sdata = vif_to_sdata(vif);
+ list_for_each_entry_safe(key, tmp, &sdata->key_list, list)
+ iter(hw, &sdata->vif,
+ key->sta ? &key->sta->sta : NULL,
+ &key->conf, iter_data);
+ } else {
+ list_for_each_entry(sdata, &local->interfaces, list)
+ list_for_each_entry_safe(key, tmp,
+ &sdata->key_list, list)
+ iter(hw, &sdata->vif,
+ key->sta ? &key->sta->sta : NULL,
+ &key->conf, iter_data);
+ }
+ mutex_unlock(&local->key_mtx);
+}
+EXPORT_SYMBOL(ieee80211_iter_keys);
+
+static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata,
+ struct list_head *keys)
+{
+ struct ieee80211_key *key, *tmp;
+
+ decrease_tailroom_need_count(sdata,
+ sdata->crypto_tx_tailroom_pending_dec);
+ sdata->crypto_tx_tailroom_pending_dec = 0;
+
+ ieee80211_debugfs_key_remove_mgmt_default(sdata);
+
+ list_for_each_entry_safe(key, tmp, &sdata->key_list, list) {
+ ieee80211_key_replace(key->sdata, key->sta,
+ key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+ key, NULL);
+ list_add_tail(&key->list, keys);
+ }
+
+ ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
+ bool force_synchronize)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *vlan;
+ struct ieee80211_sub_if_data *master;
+ struct ieee80211_key *key, *tmp;
+ LIST_HEAD(keys);
+
+ cancel_delayed_work_sync(&sdata->dec_tailroom_needed_wk);
+
+ mutex_lock(&local->key_mtx);
+
+ ieee80211_free_keys_iface(sdata, &keys);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ ieee80211_free_keys_iface(vlan, &keys);
+ }
+
+ if (!list_empty(&keys) || force_synchronize)
+ synchronize_net();
+ list_for_each_entry_safe(key, tmp, &keys, list)
+ __ieee80211_key_destroy(key, false);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (sdata->bss) {
+ master = container_of(sdata->bss,
+ struct ieee80211_sub_if_data,
+ u.ap);
+
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt !=
+ master->crypto_tx_tailroom_needed_cnt);
+ }
+ } else {
+ WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
+ sdata->crypto_tx_tailroom_pending_dec);
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt ||
+ vlan->crypto_tx_tailroom_pending_dec);
+ }
+
+ mutex_unlock(&local->key_mtx);
+}
+
+void ieee80211_free_sta_keys(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ struct ieee80211_key *key;
+ int i;
+
+ mutex_lock(&local->key_mtx);
+ for (i = 0; i < ARRAY_SIZE(sta->gtk); i++) {
+ key = key_mtx_dereference(local, sta->gtk[i]);
+ if (!key)
+ continue;
+ ieee80211_key_replace(key->sdata, key->sta,
+ key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+ key, NULL);
+ __ieee80211_key_destroy(key, true);
+ }
+
+ for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+ key = key_mtx_dereference(local, sta->ptk[i]);
+ if (!key)
+ continue;
+ ieee80211_key_replace(key->sdata, key->sta,
+ key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+ key, NULL);
+ __ieee80211_key_destroy(key, true);
+ }
+
+ mutex_unlock(&local->key_mtx);
+}
+
+void ieee80211_delayed_tailroom_dec(struct work_struct *wk)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = container_of(wk, struct ieee80211_sub_if_data,
+ dec_tailroom_needed_wk.work);
+
+ /*
+ * The reason for the delayed tailroom needed decrementing is to
+ * make roaming faster: during roaming, all keys are first deleted
+ * and then new keys are installed. The first new key causes the
+ * crypto_tx_tailroom_needed_cnt to go from 0 to 1, which invokes
+ * the cost of synchronize_net() (which can be slow). Avoid this
+ * by deferring the crypto_tx_tailroom_needed_cnt decrementing on
+ * key removal for a while, so if we roam the value is larger than
+ * zero and no 0->1 transition happens.
+ *
+ * The cost is that if the AP switching was from an AP with keys
+ * to one without, we still allocate tailroom while it would no
+ * longer be needed. However, in the typical (fast) roaming case
+ * within an ESS this usually won't happen.
+ */
+
+ mutex_lock(&sdata->local->key_mtx);
+ decrease_tailroom_need_count(sdata,
+ sdata->crypto_tx_tailroom_pending_dec);
+ sdata->crypto_tx_tailroom_pending_dec = 0;
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
+void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
+ const u8 *replay_ctr, gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ trace_api_gtk_rekey_notify(sdata, bssid, replay_ctr);
+
+ cfg80211_gtk_rekey_notify(sdata->dev, bssid, replay_ctr, gfp);
+}
+EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify);
+
+void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
+ struct ieee80211_key_seq *seq)
+{
+ struct ieee80211_key *key;
+ u64 pn64;
+
+ if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV)))
+ return;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ seq->tkip.iv32 = key->u.tkip.tx.iv32;
+ seq->tkip.iv16 = key->u.tkip.tx.iv16;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ pn64 = atomic64_read(&key->u.ccmp.tx_pn);
+ seq->ccmp.pn[5] = pn64;
+ seq->ccmp.pn[4] = pn64 >> 8;
+ seq->ccmp.pn[3] = pn64 >> 16;
+ seq->ccmp.pn[2] = pn64 >> 24;
+ seq->ccmp.pn[1] = pn64 >> 32;
+ seq->ccmp.pn[0] = pn64 >> 40;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ pn64 = atomic64_read(&key->u.aes_cmac.tx_pn);
+ seq->ccmp.pn[5] = pn64;
+ seq->ccmp.pn[4] = pn64 >> 8;
+ seq->ccmp.pn[3] = pn64 >> 16;
+ seq->ccmp.pn[2] = pn64 >> 24;
+ seq->ccmp.pn[1] = pn64 >> 32;
+ seq->ccmp.pn[0] = pn64 >> 40;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ pn64 = atomic64_read(&key->u.aes_gmac.tx_pn);
+ seq->ccmp.pn[5] = pn64;
+ seq->ccmp.pn[4] = pn64 >> 8;
+ seq->ccmp.pn[3] = pn64 >> 16;
+ seq->ccmp.pn[2] = pn64 >> 24;
+ seq->ccmp.pn[1] = pn64 >> 32;
+ seq->ccmp.pn[0] = pn64 >> 40;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn64 = atomic64_read(&key->u.gcmp.tx_pn);
+ seq->gcmp.pn[5] = pn64;
+ seq->gcmp.pn[4] = pn64 >> 8;
+ seq->gcmp.pn[3] = pn64 >> 16;
+ seq->gcmp.pn[2] = pn64 >> 24;
+ seq->gcmp.pn[1] = pn64 >> 32;
+ seq->gcmp.pn[0] = pn64 >> 40;
+ break;
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL(ieee80211_get_key_tx_seq);
+
+void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
+ int tid, struct ieee80211_key_seq *seq)
+{
+ struct ieee80211_key *key;
+ const u8 *pn;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ seq->tkip.iv32 = key->u.tkip.rx[tid].iv32;
+ seq->tkip.iv16 = key->u.tkip.rx[tid].iv16;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ if (tid < 0)
+ pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS];
+ else
+ pn = key->u.ccmp.rx_pn[tid];
+ memcpy(seq->ccmp.pn, pn, IEEE80211_CCMP_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ if (WARN_ON(tid != 0))
+ return;
+ pn = key->u.aes_cmac.rx_pn;
+ memcpy(seq->aes_cmac.pn, pn, IEEE80211_CMAC_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ if (WARN_ON(tid != 0))
+ return;
+ pn = key->u.aes_gmac.rx_pn;
+ memcpy(seq->aes_gmac.pn, pn, IEEE80211_GMAC_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ if (tid < 0)
+ pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS];
+ else
+ pn = key->u.gcmp.rx_pn[tid];
+ memcpy(seq->gcmp.pn, pn, IEEE80211_GCMP_PN_LEN);
+ break;
+ }
+}
+EXPORT_SYMBOL(ieee80211_get_key_rx_seq);
+
+void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
+ struct ieee80211_key_seq *seq)
+{
+ struct ieee80211_key *key;
+ u64 pn64;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ key->u.tkip.tx.iv32 = seq->tkip.iv32;
+ key->u.tkip.tx.iv16 = seq->tkip.iv16;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ pn64 = (u64)seq->ccmp.pn[5] |
+ ((u64)seq->ccmp.pn[4] << 8) |
+ ((u64)seq->ccmp.pn[3] << 16) |
+ ((u64)seq->ccmp.pn[2] << 24) |
+ ((u64)seq->ccmp.pn[1] << 32) |
+ ((u64)seq->ccmp.pn[0] << 40);
+ atomic64_set(&key->u.ccmp.tx_pn, pn64);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ pn64 = (u64)seq->aes_cmac.pn[5] |
+ ((u64)seq->aes_cmac.pn[4] << 8) |
+ ((u64)seq->aes_cmac.pn[3] << 16) |
+ ((u64)seq->aes_cmac.pn[2] << 24) |
+ ((u64)seq->aes_cmac.pn[1] << 32) |
+ ((u64)seq->aes_cmac.pn[0] << 40);
+ atomic64_set(&key->u.aes_cmac.tx_pn, pn64);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ pn64 = (u64)seq->aes_gmac.pn[5] |
+ ((u64)seq->aes_gmac.pn[4] << 8) |
+ ((u64)seq->aes_gmac.pn[3] << 16) |
+ ((u64)seq->aes_gmac.pn[2] << 24) |
+ ((u64)seq->aes_gmac.pn[1] << 32) |
+ ((u64)seq->aes_gmac.pn[0] << 40);
+ atomic64_set(&key->u.aes_gmac.tx_pn, pn64);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ pn64 = (u64)seq->gcmp.pn[5] |
+ ((u64)seq->gcmp.pn[4] << 8) |
+ ((u64)seq->gcmp.pn[3] << 16) |
+ ((u64)seq->gcmp.pn[2] << 24) |
+ ((u64)seq->gcmp.pn[1] << 32) |
+ ((u64)seq->gcmp.pn[0] << 40);
+ atomic64_set(&key->u.gcmp.tx_pn, pn64);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq);
+
+void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
+ int tid, struct ieee80211_key_seq *seq)
+{
+ struct ieee80211_key *key;
+ u8 *pn;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ key->u.tkip.rx[tid].iv32 = seq->tkip.iv32;
+ key->u.tkip.rx[tid].iv16 = seq->tkip.iv16;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ if (tid < 0)
+ pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS];
+ else
+ pn = key->u.ccmp.rx_pn[tid];
+ memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ if (WARN_ON(tid != 0))
+ return;
+ pn = key->u.aes_cmac.rx_pn;
+ memcpy(pn, seq->aes_cmac.pn, IEEE80211_CMAC_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ if (WARN_ON(tid != 0))
+ return;
+ pn = key->u.aes_gmac.rx_pn;
+ memcpy(pn, seq->aes_gmac.pn, IEEE80211_GMAC_PN_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS))
+ return;
+ if (tid < 0)
+ pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS];
+ else
+ pn = key->u.gcmp.rx_pn[tid];
+ memcpy(pn, seq->gcmp.pn, IEEE80211_GCMP_PN_LEN);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq);
+
+void ieee80211_remove_key(struct ieee80211_key_conf *keyconf)
+{
+ struct ieee80211_key *key;
+
+ key = container_of(keyconf, struct ieee80211_key, conf);
+
+ assert_key_lock(key->local);
+
+ /*
+ * if key was uploaded, we assume the driver will/has remove(d)
+ * it, so adjust bookkeeping accordingly
+ */
+ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
+ key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+ if (!((key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC) ||
+ (key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
+ increment_tailroom_need_count(key->sdata);
+ }
+
+ ieee80211_key_free(key, false);
+}
+EXPORT_SYMBOL_GPL(ieee80211_remove_key);
+
+struct ieee80211_key_conf *
+ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
+ struct ieee80211_key_conf *keyconf)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_key *key;
+ int err;
+
+ if (WARN_ON(!local->wowlan))
+ return ERR_PTR(-EINVAL);
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+ return ERR_PTR(-EINVAL);
+
+ key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx,
+ keyconf->keylen, keyconf->key,
+ 0, NULL, NULL);
+ if (IS_ERR(key))
+ return ERR_CAST(key);
+
+ if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
+ key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
+
+ err = ieee80211_key_link(key, sdata, NULL);
+ if (err)
+ return ERR_PTR(err);
+
+ return &key->conf;
+}
+EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add);
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
new file mode 100644
index 000000000..96557dd1e
--- /dev/null
+++ b/net/mac80211/key.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_KEY_H
+#define IEEE80211_KEY_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/crypto.h>
+#include <linux/rcupdate.h>
+#include <net/mac80211.h>
+
+#define NUM_DEFAULT_KEYS 4
+#define NUM_DEFAULT_MGMT_KEYS 2
+#define MAX_PN_LEN 16
+
+struct ieee80211_local;
+struct ieee80211_sub_if_data;
+struct sta_info;
+
+/**
+ * enum ieee80211_internal_key_flags - internal key flags
+ *
+ * @KEY_FLAG_UPLOADED_TO_HARDWARE: Indicates that this key is present
+ * in the hardware for TX crypto hardware acceleration.
+ * @KEY_FLAG_TAINTED: Key is tainted and packets should be dropped.
+ * @KEY_FLAG_CIPHER_SCHEME: This key is for a hardware cipher scheme
+ */
+enum ieee80211_internal_key_flags {
+ KEY_FLAG_UPLOADED_TO_HARDWARE = BIT(0),
+ KEY_FLAG_TAINTED = BIT(1),
+ KEY_FLAG_CIPHER_SCHEME = BIT(2),
+};
+
+enum ieee80211_internal_tkip_state {
+ TKIP_STATE_NOT_INIT,
+ TKIP_STATE_PHASE1_DONE,
+ TKIP_STATE_PHASE1_HW_UPLOADED,
+};
+
+struct tkip_ctx {
+ u32 iv32; /* current iv32 */
+ u16 iv16; /* current iv16 */
+ u16 p1k[5]; /* p1k cache */
+ u32 p1k_iv32; /* iv32 for which p1k computed */
+ enum ieee80211_internal_tkip_state state;
+};
+
+struct ieee80211_key {
+ struct ieee80211_local *local;
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+
+ /* for sdata list */
+ struct list_head list;
+
+ /* protected by key mutex */
+ unsigned int flags;
+
+ union {
+ struct {
+ /* protects tx context */
+ spinlock_t txlock;
+
+ /* last used TSC */
+ struct tkip_ctx tx;
+
+ /* last received RSC */
+ struct tkip_ctx rx[IEEE80211_NUM_TIDS];
+
+ /* number of mic failures */
+ u32 mic_failures;
+ } tkip;
+ struct {
+ atomic64_t tx_pn;
+ /*
+ * Last received packet number. The first
+ * IEEE80211_NUM_TIDS counters are used with Data
+ * frames and the last counter is used with Robust
+ * Management frames.
+ */
+ u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_CCMP_PN_LEN];
+ struct crypto_aead *tfm;
+ u32 replays; /* dot11RSNAStatsCCMPReplays */
+ } ccmp;
+ struct {
+ atomic64_t tx_pn;
+ u8 rx_pn[IEEE80211_CMAC_PN_LEN];
+ struct crypto_cipher *tfm;
+ u32 replays; /* dot11RSNAStatsCMACReplays */
+ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
+ } aes_cmac;
+ struct {
+ atomic64_t tx_pn;
+ u8 rx_pn[IEEE80211_GMAC_PN_LEN];
+ struct crypto_aead *tfm;
+ u32 replays; /* dot11RSNAStatsCMACReplays */
+ u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
+ } aes_gmac;
+ struct {
+ atomic64_t tx_pn;
+ /* Last received packet number. The first
+ * IEEE80211_NUM_TIDS counters are used with Data
+ * frames and the last counter is used with Robust
+ * Management frames.
+ */
+ u8 rx_pn[IEEE80211_NUM_TIDS + 1][IEEE80211_GCMP_PN_LEN];
+ struct crypto_aead *tfm;
+ u32 replays; /* dot11RSNAStatsGCMPReplays */
+ } gcmp;
+ struct {
+ /* generic cipher scheme */
+ u8 rx_pn[IEEE80211_NUM_TIDS + 1][MAX_PN_LEN];
+ } gen;
+ } u;
+
+ /* number of times this key has been used */
+ int tx_rx_count;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct {
+ struct dentry *stalink;
+ struct dentry *dir;
+ int cnt;
+ } debugfs;
+#endif
+
+ /*
+ * key config, must be last because it contains key
+ * material as variable length member
+ */
+ struct ieee80211_key_conf conf;
+};
+
+struct ieee80211_key *
+ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
+ const u8 *key_data,
+ size_t seq_len, const u8 *seq,
+ const struct ieee80211_cipher_scheme *cs);
+/*
+ * Insert a key into data structures (sdata, sta if necessary)
+ * to make it used, free old key. On failure, also free the new key.
+ */
+int ieee80211_key_link(struct ieee80211_key *key,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta);
+void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom);
+void ieee80211_key_free_unused(struct ieee80211_key *key);
+void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx,
+ bool uni, bool multi);
+void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
+ int idx);
+void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
+ bool force_synchronize);
+void ieee80211_free_sta_keys(struct ieee80211_local *local,
+ struct sta_info *sta);
+void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata);
+void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata);
+
+#define key_mtx_dereference(local, ref) \
+ rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx)))
+
+void ieee80211_delayed_tailroom_dec(struct work_struct *wk);
+
+#endif /* IEEE80211_KEY_H */
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
new file mode 100644
index 000000000..e2b836446
--- /dev/null
+++ b/net/mac80211/led.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* just for IFNAMSIZ */
+#include <linux/if.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include "led.h"
+
+#define MAC80211_BLINK_DELAY 50 /* ms */
+
+void ieee80211_led_rx(struct ieee80211_local *local)
+{
+ unsigned long led_delay = MAC80211_BLINK_DELAY;
+ if (unlikely(!local->rx_led))
+ return;
+ led_trigger_blink_oneshot(local->rx_led, &led_delay, &led_delay, 0);
+}
+
+void ieee80211_led_tx(struct ieee80211_local *local)
+{
+ unsigned long led_delay = MAC80211_BLINK_DELAY;
+ if (unlikely(!local->tx_led))
+ return;
+ led_trigger_blink_oneshot(local->tx_led, &led_delay, &led_delay, 0);
+}
+
+void ieee80211_led_assoc(struct ieee80211_local *local, bool associated)
+{
+ if (unlikely(!local->assoc_led))
+ return;
+ if (associated)
+ led_trigger_event(local->assoc_led, LED_FULL);
+ else
+ led_trigger_event(local->assoc_led, LED_OFF);
+}
+
+void ieee80211_led_radio(struct ieee80211_local *local, bool enabled)
+{
+ if (unlikely(!local->radio_led))
+ return;
+ if (enabled)
+ led_trigger_event(local->radio_led, LED_FULL);
+ else
+ led_trigger_event(local->radio_led, LED_OFF);
+}
+
+void ieee80211_led_names(struct ieee80211_local *local)
+{
+ snprintf(local->rx_led_name, sizeof(local->rx_led_name),
+ "%srx", wiphy_name(local->hw.wiphy));
+ snprintf(local->tx_led_name, sizeof(local->tx_led_name),
+ "%stx", wiphy_name(local->hw.wiphy));
+ snprintf(local->assoc_led_name, sizeof(local->assoc_led_name),
+ "%sassoc", wiphy_name(local->hw.wiphy));
+ snprintf(local->radio_led_name, sizeof(local->radio_led_name),
+ "%sradio", wiphy_name(local->hw.wiphy));
+}
+
+void ieee80211_led_init(struct ieee80211_local *local)
+{
+ local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+ if (local->rx_led) {
+ local->rx_led->name = local->rx_led_name;
+ if (led_trigger_register(local->rx_led)) {
+ kfree(local->rx_led);
+ local->rx_led = NULL;
+ }
+ }
+
+ local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+ if (local->tx_led) {
+ local->tx_led->name = local->tx_led_name;
+ if (led_trigger_register(local->tx_led)) {
+ kfree(local->tx_led);
+ local->tx_led = NULL;
+ }
+ }
+
+ local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+ if (local->assoc_led) {
+ local->assoc_led->name = local->assoc_led_name;
+ if (led_trigger_register(local->assoc_led)) {
+ kfree(local->assoc_led);
+ local->assoc_led = NULL;
+ }
+ }
+
+ local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+ if (local->radio_led) {
+ local->radio_led->name = local->radio_led_name;
+ if (led_trigger_register(local->radio_led)) {
+ kfree(local->radio_led);
+ local->radio_led = NULL;
+ }
+ }
+
+ if (local->tpt_led_trigger) {
+ if (led_trigger_register(&local->tpt_led_trigger->trig)) {
+ kfree(local->tpt_led_trigger);
+ local->tpt_led_trigger = NULL;
+ }
+ }
+}
+
+void ieee80211_led_exit(struct ieee80211_local *local)
+{
+ if (local->radio_led) {
+ led_trigger_unregister(local->radio_led);
+ kfree(local->radio_led);
+ }
+ if (local->assoc_led) {
+ led_trigger_unregister(local->assoc_led);
+ kfree(local->assoc_led);
+ }
+ if (local->tx_led) {
+ led_trigger_unregister(local->tx_led);
+ kfree(local->tx_led);
+ }
+ if (local->rx_led) {
+ led_trigger_unregister(local->rx_led);
+ kfree(local->rx_led);
+ }
+
+ if (local->tpt_led_trigger) {
+ led_trigger_unregister(&local->tpt_led_trigger->trig);
+ kfree(local->tpt_led_trigger);
+ }
+}
+
+char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ return local->radio_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_radio_led_name);
+
+char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ return local->assoc_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_assoc_led_name);
+
+char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ return local->tx_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_tx_led_name);
+
+char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ return local->rx_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_rx_led_name);
+
+static unsigned long tpt_trig_traffic(struct ieee80211_local *local,
+ struct tpt_led_trigger *tpt_trig)
+{
+ unsigned long traffic, delta;
+
+ traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes;
+
+ delta = traffic - tpt_trig->prev_traffic;
+ tpt_trig->prev_traffic = traffic;
+ return DIV_ROUND_UP(delta, 1024 / 8);
+}
+
+static void tpt_trig_timer(unsigned long data)
+{
+ struct ieee80211_local *local = (void *)data;
+ struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+ struct led_classdev *led_cdev;
+ unsigned long on, off, tpt;
+ int i;
+
+ if (!tpt_trig->running)
+ return;
+
+ mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ));
+
+ tpt = tpt_trig_traffic(local, tpt_trig);
+
+ /* default to just solid on */
+ on = 1;
+ off = 0;
+
+ for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) {
+ if (tpt_trig->blink_table[i].throughput < 0 ||
+ tpt > tpt_trig->blink_table[i].throughput) {
+ off = tpt_trig->blink_table[i].blink_time / 2;
+ on = tpt_trig->blink_table[i].blink_time - off;
+ break;
+ }
+ }
+
+ read_lock(&tpt_trig->trig.leddev_list_lock);
+ list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+ led_blink_set(led_cdev, &on, &off);
+ read_unlock(&tpt_trig->trig.leddev_list_lock);
+}
+
+char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
+ unsigned int flags,
+ const struct ieee80211_tpt_blink *blink_table,
+ unsigned int blink_table_len)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct tpt_led_trigger *tpt_trig;
+
+ if (WARN_ON(local->tpt_led_trigger))
+ return NULL;
+
+ tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL);
+ if (!tpt_trig)
+ return NULL;
+
+ snprintf(tpt_trig->name, sizeof(tpt_trig->name),
+ "%stpt", wiphy_name(local->hw.wiphy));
+
+ tpt_trig->trig.name = tpt_trig->name;
+
+ tpt_trig->blink_table = blink_table;
+ tpt_trig->blink_table_len = blink_table_len;
+ tpt_trig->want = flags;
+
+ setup_timer(&tpt_trig->timer, tpt_trig_timer, (unsigned long)local);
+
+ local->tpt_led_trigger = tpt_trig;
+
+ return tpt_trig->name;
+}
+EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger);
+
+static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local)
+{
+ struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+
+ if (tpt_trig->running)
+ return;
+
+ /* reset traffic */
+ tpt_trig_traffic(local, tpt_trig);
+ tpt_trig->running = true;
+
+ tpt_trig_timer((unsigned long)local);
+ mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ));
+}
+
+static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local)
+{
+ struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+ struct led_classdev *led_cdev;
+
+ if (!tpt_trig->running)
+ return;
+
+ tpt_trig->running = false;
+ del_timer_sync(&tpt_trig->timer);
+
+ read_lock(&tpt_trig->trig.leddev_list_lock);
+ list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+ led_set_brightness(led_cdev, LED_OFF);
+ read_unlock(&tpt_trig->trig.leddev_list_lock);
+}
+
+void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+ unsigned int types_on, unsigned int types_off)
+{
+ struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+ bool allowed;
+
+ WARN_ON(types_on & types_off);
+
+ if (!tpt_trig)
+ return;
+
+ tpt_trig->active &= ~types_off;
+ tpt_trig->active |= types_on;
+
+ /*
+ * Regardless of wanted state, we shouldn't blink when
+ * the radio is disabled -- this can happen due to some
+ * code ordering issues with __ieee80211_recalc_idle()
+ * being called before the radio is started.
+ */
+ allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO;
+
+ if (!allowed || !(tpt_trig->active & tpt_trig->want))
+ ieee80211_stop_tpt_led_trig(local);
+ else
+ ieee80211_start_tpt_led_trig(local);
+}
diff --git a/net/mac80211/led.h b/net/mac80211/led.h
new file mode 100644
index 000000000..89f4344f1
--- /dev/null
+++ b/net/mac80211/led.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/leds.h>
+#include "ieee80211_i.h"
+
+#ifdef CONFIG_MAC80211_LEDS
+void ieee80211_led_rx(struct ieee80211_local *local);
+void ieee80211_led_tx(struct ieee80211_local *local);
+void ieee80211_led_assoc(struct ieee80211_local *local,
+ bool associated);
+void ieee80211_led_radio(struct ieee80211_local *local,
+ bool enabled);
+void ieee80211_led_names(struct ieee80211_local *local);
+void ieee80211_led_init(struct ieee80211_local *local);
+void ieee80211_led_exit(struct ieee80211_local *local);
+void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+ unsigned int types_on, unsigned int types_off);
+#else
+static inline void ieee80211_led_rx(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_tx(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_assoc(struct ieee80211_local *local,
+ bool associated)
+{
+}
+static inline void ieee80211_led_radio(struct ieee80211_local *local,
+ bool enabled)
+{
+}
+static inline void ieee80211_led_names(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_init(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_exit(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+ unsigned int types_on,
+ unsigned int types_off)
+{
+}
+#endif
+
+static inline void
+ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes)
+{
+#ifdef CONFIG_MAC80211_LEDS
+ if (local->tpt_led_trigger && ieee80211_is_data(fc))
+ local->tpt_led_trigger->tx_bytes += bytes;
+#endif
+}
+
+static inline void
+ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes)
+{
+#ifdef CONFIG_MAC80211_LEDS
+ if (local->tpt_led_trigger && ieee80211_is_data(fc))
+ local->tpt_led_trigger->rx_bytes += bytes;
+#endif
+}
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
new file mode 100644
index 000000000..e86daed83
--- /dev/null
+++ b/net/mac80211/main.c
@@ -0,0 +1,1266 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/mac80211.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitmap.h>
+#include <linux/pm_qos.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <net/cfg80211.h>
+#include <net/addrconf.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "mesh.h"
+#include "wep.h"
+#include "led.h"
+#include "cfg.h"
+#include "debugfs.h"
+
+void ieee80211_configure_filter(struct ieee80211_local *local)
+{
+ u64 mc;
+ unsigned int changed_flags;
+ unsigned int new_flags = 0;
+
+ if (atomic_read(&local->iff_promiscs))
+ new_flags |= FIF_PROMISC_IN_BSS;
+
+ if (atomic_read(&local->iff_allmultis))
+ new_flags |= FIF_ALLMULTI;
+
+ if (local->monitors || test_bit(SCAN_SW_SCANNING, &local->scanning) ||
+ test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning))
+ new_flags |= FIF_BCN_PRBRESP_PROMISC;
+
+ if (local->fif_probe_req || local->probe_req_reg)
+ new_flags |= FIF_PROBE_REQ;
+
+ if (local->fif_fcsfail)
+ new_flags |= FIF_FCSFAIL;
+
+ if (local->fif_plcpfail)
+ new_flags |= FIF_PLCPFAIL;
+
+ if (local->fif_control)
+ new_flags |= FIF_CONTROL;
+
+ if (local->fif_other_bss)
+ new_flags |= FIF_OTHER_BSS;
+
+ if (local->fif_pspoll)
+ new_flags |= FIF_PSPOLL;
+
+ spin_lock_bh(&local->filter_lock);
+ changed_flags = local->filter_flags ^ new_flags;
+
+ mc = drv_prepare_multicast(local, &local->mc_list);
+ spin_unlock_bh(&local->filter_lock);
+
+ /* be a bit nasty */
+ new_flags |= (1<<31);
+
+ drv_configure_filter(local, changed_flags, &new_flags, mc);
+
+ WARN_ON(new_flags & (1<<31));
+
+ local->filter_flags = new_flags & ~(1<<31);
+}
+
+static void ieee80211_reconfig_filter(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, reconfig_filter);
+
+ ieee80211_configure_filter(local);
+}
+
+static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct cfg80211_chan_def chandef = {};
+ u32 changed = 0;
+ int power;
+ u32 offchannel_flag;
+
+ offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
+
+ if (local->scan_chandef.chan) {
+ chandef = local->scan_chandef;
+ } else if (local->tmp_channel) {
+ chandef.chan = local->tmp_channel;
+ chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
+ chandef.center_freq1 = chandef.chan->center_freq;
+ } else
+ chandef = local->_oper_chandef;
+
+ WARN(!cfg80211_chandef_valid(&chandef),
+ "control:%d MHz width:%d center: %d/%d MHz",
+ chandef.chan->center_freq, chandef.width,
+ chandef.center_freq1, chandef.center_freq2);
+
+ if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef))
+ local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
+ else
+ local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
+
+ offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
+
+ if (offchannel_flag ||
+ !cfg80211_chandef_identical(&local->hw.conf.chandef,
+ &local->_oper_chandef)) {
+ local->hw.conf.chandef = chandef;
+ changed |= IEEE80211_CONF_CHANGE_CHANNEL;
+ }
+
+ if (!conf_is_ht(&local->hw.conf)) {
+ /*
+ * mac80211.h documents that this is only valid
+ * when the channel is set to an HT type, and
+ * that otherwise STATIC is used.
+ */
+ local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC;
+ } else if (local->hw.conf.smps_mode != local->smps_mode) {
+ local->hw.conf.smps_mode = local->smps_mode;
+ changed |= IEEE80211_CONF_CHANGE_SMPS;
+ }
+
+ power = ieee80211_chandef_max_power(&chandef);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!rcu_access_pointer(sdata->vif.chanctx_conf))
+ continue;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
+ power = min(power, sdata->vif.bss_conf.txpower);
+ }
+ rcu_read_unlock();
+
+ if (local->hw.conf.power_level != power) {
+ changed |= IEEE80211_CONF_CHANGE_POWER;
+ local->hw.conf.power_level = power;
+ }
+
+ return changed;
+}
+
+int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
+{
+ int ret = 0;
+
+ might_sleep();
+
+ if (!local->use_chanctx)
+ changed |= ieee80211_hw_conf_chan(local);
+ else
+ changed &= ~(IEEE80211_CONF_CHANGE_CHANNEL |
+ IEEE80211_CONF_CHANGE_POWER);
+
+ if (changed && local->open_count) {
+ ret = drv_config(local, changed);
+ /*
+ * Goal:
+ * HW reconfiguration should never fail, the driver has told
+ * us what it can support so it should live up to that promise.
+ *
+ * Current status:
+ * rfkill is not integrated with mac80211 and a
+ * configuration command can thus fail if hardware rfkill
+ * is enabled
+ *
+ * FIXME: integrate rfkill with mac80211 and then add this
+ * WARN_ON() back
+ *
+ */
+ /* WARN_ON(ret); */
+ }
+
+ return ret;
+}
+
+void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+ u32 changed)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ return;
+
+ drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed);
+}
+
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
+{
+ sdata->vif.bss_conf.use_cts_prot = false;
+ sdata->vif.bss_conf.use_short_preamble = false;
+ sdata->vif.bss_conf.use_short_slot = false;
+ return BSS_CHANGED_ERP_CTS_PROT |
+ BSS_CHANGED_ERP_PREAMBLE |
+ BSS_CHANGED_ERP_SLOT;
+}
+
+static void ieee80211_tasklet_handler(unsigned long data)
+{
+ struct ieee80211_local *local = (struct ieee80211_local *) data;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&local->skb_queue)) ||
+ (skb = skb_dequeue(&local->skb_queue_unreliable))) {
+ switch (skb->pkt_type) {
+ case IEEE80211_RX_MSG:
+ /* Clear skb->pkt_type in order to not confuse kernel
+ * netstack. */
+ skb->pkt_type = 0;
+ ieee80211_rx(&local->hw, skb);
+ break;
+ case IEEE80211_TX_STATUS_MSG:
+ skb->pkt_type = 0;
+ ieee80211_tx_status(&local->hw, skb);
+ break;
+ default:
+ WARN(1, "mac80211: Packet is of unknown type %d\n",
+ skb->pkt_type);
+ dev_kfree_skb(skb);
+ break;
+ }
+ }
+}
+
+static void ieee80211_restart_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, restart_work);
+ struct ieee80211_sub_if_data *sdata;
+
+ /* wait for scan work complete */
+ flush_workqueue(local->workqueue);
+
+ WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+ "%s called with hardware scan in progress\n", __func__);
+
+ rtnl_lock();
+ list_for_each_entry(sdata, &local->interfaces, list)
+ flush_delayed_work(&sdata->dec_tailroom_needed_wk);
+ ieee80211_scan_cancel(local);
+ ieee80211_reconfig(local);
+ rtnl_unlock();
+}
+
+void ieee80211_restart_hw(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_restart_hw(local);
+
+ wiphy_info(hw->wiphy,
+ "Hardware restart was requested\n");
+
+ /* use this reason, ieee80211_reconfig will unblock it */
+ ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ false);
+
+ /*
+ * Stop all Rx during the reconfig. We don't want state changes
+ * or driver callbacks while this is in progress.
+ */
+ local->in_reconfig = true;
+ barrier();
+
+ schedule_work(&local->restart_work);
+}
+EXPORT_SYMBOL(ieee80211_restart_hw);
+
+#ifdef CONFIG_INET
+static int ieee80211_ifa_changed(struct notifier_block *nb,
+ unsigned long data, void *arg)
+{
+ struct in_ifaddr *ifa = arg;
+ struct ieee80211_local *local =
+ container_of(nb, struct ieee80211_local,
+ ifa_notifier);
+ struct net_device *ndev = ifa->ifa_dev->dev;
+ struct wireless_dev *wdev = ndev->ieee80211_ptr;
+ struct in_device *idev;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_bss_conf *bss_conf;
+ struct ieee80211_if_managed *ifmgd;
+ int c = 0;
+
+ /* Make sure it's our interface that got changed */
+ if (!wdev)
+ return NOTIFY_DONE;
+
+ if (wdev->wiphy != local->hw.wiphy)
+ return NOTIFY_DONE;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
+ bss_conf = &sdata->vif.bss_conf;
+
+ /* ARP filtering is only supported in managed mode */
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return NOTIFY_DONE;
+
+ idev = __in_dev_get_rtnl(sdata->dev);
+ if (!idev)
+ return NOTIFY_DONE;
+
+ ifmgd = &sdata->u.mgd;
+ sdata_lock(sdata);
+
+ /* Copy the addresses to the bss_conf list */
+ ifa = idev->ifa_list;
+ while (ifa) {
+ if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN)
+ bss_conf->arp_addr_list[c] = ifa->ifa_address;
+ ifa = ifa->ifa_next;
+ c++;
+ }
+
+ bss_conf->arp_addr_cnt = c;
+
+ /* Configure driver only if associated (which also implies it is up) */
+ if (ifmgd->associated)
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_ARP_FILTER);
+
+ sdata_unlock(sdata);
+
+ return NOTIFY_OK;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ieee80211_ifa6_changed(struct notifier_block *nb,
+ unsigned long data, void *arg)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)arg;
+ struct inet6_dev *idev = ifa->idev;
+ struct net_device *ndev = ifa->idev->dev;
+ struct ieee80211_local *local =
+ container_of(nb, struct ieee80211_local, ifa6_notifier);
+ struct wireless_dev *wdev = ndev->ieee80211_ptr;
+ struct ieee80211_sub_if_data *sdata;
+
+ /* Make sure it's our interface that got changed */
+ if (!wdev || wdev->wiphy != local->hw.wiphy)
+ return NOTIFY_DONE;
+
+ sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
+
+ /*
+ * For now only support station mode. This is mostly because
+ * doing AP would have to handle AP_VLAN in some way ...
+ */
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return NOTIFY_DONE;
+
+ drv_ipv6_addr_change(local, sdata, idev);
+
+ return NOTIFY_OK;
+}
+#endif
+
+/* There isn't a lot of sense in it, but you can transmit anything you like */
+static const struct ieee80211_txrx_stypes
+ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
+ [NL80211_IFTYPE_ADHOC] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+ [NL80211_IFTYPE_STATION] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+ [NL80211_IFTYPE_AP] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_AP_VLAN] = {
+ /* copy AP */
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_P2P_CLIENT] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+ [NL80211_IFTYPE_P2P_GO] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_MESH_POINT] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4),
+ },
+ [NL80211_IFTYPE_P2P_DEVICE] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+};
+
+static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = {
+ .ampdu_params_info = IEEE80211_HT_AMPDU_PARM_FACTOR |
+ IEEE80211_HT_AMPDU_PARM_DENSITY,
+
+ .cap_info = cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+ IEEE80211_HT_CAP_MAX_AMSDU |
+ IEEE80211_HT_CAP_SGI_20 |
+ IEEE80211_HT_CAP_SGI_40 |
+ IEEE80211_HT_CAP_LDPC_CODING |
+ IEEE80211_HT_CAP_40MHZ_INTOLERANT),
+ .mcs = {
+ .rx_mask = { 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, },
+ },
+};
+
+static const struct ieee80211_vht_cap mac80211_vht_capa_mod_mask = {
+ .vht_cap_info =
+ cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC |
+ IEEE80211_VHT_CAP_SHORT_GI_80 |
+ IEEE80211_VHT_CAP_SHORT_GI_160 |
+ IEEE80211_VHT_CAP_RXSTBC_1 |
+ IEEE80211_VHT_CAP_RXSTBC_2 |
+ IEEE80211_VHT_CAP_RXSTBC_3 |
+ IEEE80211_VHT_CAP_RXSTBC_4 |
+ IEEE80211_VHT_CAP_TXSTBC |
+ IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE |
+ IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
+ IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN |
+ IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN |
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK),
+ .supp_mcs = {
+ .rx_mcs_map = cpu_to_le16(~0),
+ .tx_mcs_map = cpu_to_le16(~0),
+ },
+};
+
+struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
+ const struct ieee80211_ops *ops,
+ const char *requested_name)
+{
+ struct ieee80211_local *local;
+ int priv_size, i;
+ struct wiphy *wiphy;
+ bool use_chanctx;
+
+ if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config ||
+ !ops->add_interface || !ops->remove_interface ||
+ !ops->configure_filter))
+ return NULL;
+
+ if (WARN_ON(ops->sta_state && (ops->sta_add || ops->sta_remove)))
+ return NULL;
+
+ /* check all or no channel context operations exist */
+ i = !!ops->add_chanctx + !!ops->remove_chanctx +
+ !!ops->change_chanctx + !!ops->assign_vif_chanctx +
+ !!ops->unassign_vif_chanctx;
+ if (WARN_ON(i != 0 && i != 5))
+ return NULL;
+ use_chanctx = i == 5;
+
+ /* Ensure 32-byte alignment of our private data and hw private data.
+ * We use the wiphy priv data for both our ieee80211_local and for
+ * the driver's private data
+ *
+ * In memory it'll be like this:
+ *
+ * +-------------------------+
+ * | struct wiphy |
+ * +-------------------------+
+ * | struct ieee80211_local |
+ * +-------------------------+
+ * | driver's private data |
+ * +-------------------------+
+ *
+ */
+ priv_size = ALIGN(sizeof(*local), NETDEV_ALIGN) + priv_data_len;
+
+ wiphy = wiphy_new_nm(&mac80211_config_ops, priv_size, requested_name);
+
+ if (!wiphy)
+ return NULL;
+
+ wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes;
+
+ wiphy->privid = mac80211_wiphy_privid;
+
+ wiphy->flags |= WIPHY_FLAG_NETNS_OK |
+ WIPHY_FLAG_4ADDR_AP |
+ WIPHY_FLAG_4ADDR_STATION |
+ WIPHY_FLAG_REPORTS_OBSS |
+ WIPHY_FLAG_OFFCHAN_TX;
+
+ if (ops->remain_on_channel)
+ wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
+
+ wiphy->features |= NL80211_FEATURE_SK_TX_STATUS |
+ NL80211_FEATURE_SAE |
+ NL80211_FEATURE_HT_IBSS |
+ NL80211_FEATURE_VIF_TXPOWER |
+ NL80211_FEATURE_MAC_ON_CREATE |
+ NL80211_FEATURE_USERSPACE_MPM;
+
+ if (!ops->hw_scan)
+ wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
+ NL80211_FEATURE_AP_SCAN;
+
+
+ if (!ops->set_key)
+ wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
+
+ wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
+
+ local = wiphy_priv(wiphy);
+
+ if (sta_info_init(local))
+ goto err_free;
+
+ local->hw.wiphy = wiphy;
+
+ local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN);
+
+ local->ops = ops;
+ local->use_chanctx = use_chanctx;
+
+ /* set up some defaults */
+ local->hw.queues = 1;
+ local->hw.max_rates = 1;
+ local->hw.max_report_rates = 0;
+ local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+ local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+ local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE;
+ local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
+ local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
+ local->hw.radiotap_mcs_details = IEEE80211_RADIOTAP_MCS_HAVE_MCS |
+ IEEE80211_RADIOTAP_MCS_HAVE_GI |
+ IEEE80211_RADIOTAP_MCS_HAVE_BW;
+ local->hw.radiotap_vht_details = IEEE80211_RADIOTAP_VHT_KNOWN_GI |
+ IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH;
+ local->hw.uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES;
+ local->hw.uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN;
+ local->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
+ wiphy->ht_capa_mod_mask = &mac80211_ht_capa_mod_mask;
+ wiphy->vht_capa_mod_mask = &mac80211_vht_capa_mod_mask;
+
+ local->ext_capa[7] = WLAN_EXT_CAPA8_OPMODE_NOTIF;
+
+ wiphy->extended_capabilities = local->ext_capa;
+ wiphy->extended_capabilities_mask = local->ext_capa;
+ wiphy->extended_capabilities_len =
+ ARRAY_SIZE(local->ext_capa);
+
+ INIT_LIST_HEAD(&local->interfaces);
+
+ __hw_addr_init(&local->mc_list);
+
+ mutex_init(&local->iflist_mtx);
+ mutex_init(&local->mtx);
+
+ mutex_init(&local->key_mtx);
+ spin_lock_init(&local->filter_lock);
+ spin_lock_init(&local->rx_path_lock);
+ spin_lock_init(&local->queue_stop_reason_lock);
+
+ INIT_LIST_HEAD(&local->chanctx_list);
+ mutex_init(&local->chanctx_mtx);
+
+ INIT_DELAYED_WORK(&local->scan_work, ieee80211_scan_work);
+
+ INIT_WORK(&local->restart_work, ieee80211_restart_work);
+
+ INIT_WORK(&local->radar_detected_work,
+ ieee80211_dfs_radar_detected_work);
+
+ INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter);
+ local->smps_mode = IEEE80211_SMPS_OFF;
+
+ INIT_WORK(&local->dynamic_ps_enable_work,
+ ieee80211_dynamic_ps_enable_work);
+ INIT_WORK(&local->dynamic_ps_disable_work,
+ ieee80211_dynamic_ps_disable_work);
+ setup_timer(&local->dynamic_ps_timer,
+ ieee80211_dynamic_ps_timer, (unsigned long) local);
+
+ INIT_WORK(&local->sched_scan_stopped_work,
+ ieee80211_sched_scan_stopped_work);
+
+ spin_lock_init(&local->ack_status_lock);
+ idr_init(&local->ack_status_frames);
+
+ for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+ skb_queue_head_init(&local->pending[i]);
+ atomic_set(&local->agg_queue_stop[i], 0);
+ }
+ tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
+ (unsigned long)local);
+
+ tasklet_init(&local->tasklet,
+ ieee80211_tasklet_handler,
+ (unsigned long) local);
+
+ skb_queue_head_init(&local->skb_queue);
+ skb_queue_head_init(&local->skb_queue_unreliable);
+
+ ieee80211_led_names(local);
+
+ ieee80211_roc_setup(local);
+
+ return &local->hw;
+ err_free:
+ wiphy_free(wiphy);
+ return NULL;
+}
+EXPORT_SYMBOL(ieee80211_alloc_hw_nm);
+
+static int ieee80211_init_cipher_suites(struct ieee80211_local *local)
+{
+ bool have_wep = !(IS_ERR(local->wep_tx_tfm) ||
+ IS_ERR(local->wep_rx_tfm));
+ bool have_mfp = local->hw.flags & IEEE80211_HW_MFP_CAPABLE;
+ int n_suites = 0, r = 0, w = 0;
+ u32 *suites;
+ static const u32 cipher_suites[] = {
+ /* keep WEP first, it may be removed below */
+ WLAN_CIPHER_SUITE_WEP40,
+ WLAN_CIPHER_SUITE_WEP104,
+ WLAN_CIPHER_SUITE_TKIP,
+ WLAN_CIPHER_SUITE_CCMP,
+ WLAN_CIPHER_SUITE_CCMP_256,
+ WLAN_CIPHER_SUITE_GCMP,
+ WLAN_CIPHER_SUITE_GCMP_256,
+
+ /* keep last -- depends on hw flags! */
+ WLAN_CIPHER_SUITE_AES_CMAC,
+ WLAN_CIPHER_SUITE_BIP_CMAC_256,
+ WLAN_CIPHER_SUITE_BIP_GMAC_128,
+ WLAN_CIPHER_SUITE_BIP_GMAC_256,
+ };
+
+ if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL ||
+ local->hw.wiphy->cipher_suites) {
+ /* If the driver advertises, or doesn't support SW crypto,
+ * we only need to remove WEP if necessary.
+ */
+ if (have_wep)
+ return 0;
+
+ /* well if it has _no_ ciphers ... fine */
+ if (!local->hw.wiphy->n_cipher_suites)
+ return 0;
+
+ /* Driver provides cipher suites, but we need to exclude WEP */
+ suites = kmemdup(local->hw.wiphy->cipher_suites,
+ sizeof(u32) * local->hw.wiphy->n_cipher_suites,
+ GFP_KERNEL);
+ if (!suites)
+ return -ENOMEM;
+
+ for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) {
+ u32 suite = local->hw.wiphy->cipher_suites[r];
+
+ if (suite == WLAN_CIPHER_SUITE_WEP40 ||
+ suite == WLAN_CIPHER_SUITE_WEP104)
+ continue;
+ suites[w++] = suite;
+ }
+ } else if (!local->hw.cipher_schemes) {
+ /* If the driver doesn't have cipher schemes, there's nothing
+ * else to do other than assign the (software supported and
+ * perhaps offloaded) cipher suites.
+ */
+ local->hw.wiphy->cipher_suites = cipher_suites;
+ local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
+
+ if (!have_mfp)
+ local->hw.wiphy->n_cipher_suites -= 4;
+
+ if (!have_wep) {
+ local->hw.wiphy->cipher_suites += 2;
+ local->hw.wiphy->n_cipher_suites -= 2;
+ }
+
+ /* not dynamically allocated, so just return */
+ return 0;
+ } else {
+ const struct ieee80211_cipher_scheme *cs;
+
+ cs = local->hw.cipher_schemes;
+
+ /* Driver specifies cipher schemes only (but not cipher suites
+ * including the schemes)
+ *
+ * We start counting ciphers defined by schemes, TKIP, CCMP,
+ * CCMP-256, GCMP, and GCMP-256
+ */
+ n_suites = local->hw.n_cipher_schemes + 5;
+
+ /* check if we have WEP40 and WEP104 */
+ if (have_wep)
+ n_suites += 2;
+
+ /* check if we have AES_CMAC, BIP-CMAC-256, BIP-GMAC-128,
+ * BIP-GMAC-256
+ */
+ if (have_mfp)
+ n_suites += 4;
+
+ suites = kmalloc(sizeof(u32) * n_suites, GFP_KERNEL);
+ if (!suites)
+ return -ENOMEM;
+
+ suites[w++] = WLAN_CIPHER_SUITE_CCMP;
+ suites[w++] = WLAN_CIPHER_SUITE_CCMP_256;
+ suites[w++] = WLAN_CIPHER_SUITE_TKIP;
+ suites[w++] = WLAN_CIPHER_SUITE_GCMP;
+ suites[w++] = WLAN_CIPHER_SUITE_GCMP_256;
+
+ if (have_wep) {
+ suites[w++] = WLAN_CIPHER_SUITE_WEP40;
+ suites[w++] = WLAN_CIPHER_SUITE_WEP104;
+ }
+
+ if (have_mfp) {
+ suites[w++] = WLAN_CIPHER_SUITE_AES_CMAC;
+ suites[w++] = WLAN_CIPHER_SUITE_BIP_CMAC_256;
+ suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_128;
+ suites[w++] = WLAN_CIPHER_SUITE_BIP_GMAC_256;
+ }
+
+ for (r = 0; r < local->hw.n_cipher_schemes; r++)
+ suites[w++] = cs[r].cipher;
+ }
+
+ local->hw.wiphy->cipher_suites = suites;
+ local->hw.wiphy->n_cipher_suites = w;
+ local->wiphy_ciphers_allocated = true;
+
+ return 0;
+}
+
+int ieee80211_register_hw(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ int result, i;
+ enum ieee80211_band band;
+ int channels, max_bitrates;
+ bool supp_ht, supp_vht;
+ netdev_features_t feature_whitelist;
+ struct cfg80211_chan_def dflt_chandef = {};
+
+ if (hw->flags & IEEE80211_HW_QUEUE_CONTROL &&
+ (local->hw.offchannel_tx_hw_queue == IEEE80211_INVAL_HW_QUEUE ||
+ local->hw.offchannel_tx_hw_queue >= local->hw.queues))
+ return -EINVAL;
+
+ if ((hw->wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) &&
+ (!local->ops->tdls_channel_switch ||
+ !local->ops->tdls_cancel_channel_switch ||
+ !local->ops->tdls_recv_channel_switch))
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_PM
+ if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume))
+ return -EINVAL;
+#endif
+
+ if (!local->use_chanctx) {
+ for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) {
+ const struct ieee80211_iface_combination *comb;
+
+ comb = &local->hw.wiphy->iface_combinations[i];
+
+ if (comb->num_different_channels > 1)
+ return -EINVAL;
+ }
+ } else {
+ /*
+ * WDS is currently prohibited when channel contexts are used
+ * because there's no clear definition of which channel WDS
+ * type interfaces use
+ */
+ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS))
+ return -EINVAL;
+
+ /* DFS is not supported with multi-channel combinations yet */
+ for (i = 0; i < local->hw.wiphy->n_iface_combinations; i++) {
+ const struct ieee80211_iface_combination *comb;
+
+ comb = &local->hw.wiphy->iface_combinations[i];
+
+ if (comb->radar_detect_widths &&
+ comb->num_different_channels > 1)
+ return -EINVAL;
+ }
+ }
+
+ /* Only HW csum features are currently compatible with mac80211 */
+ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+ NETIF_F_HW_CSUM;
+ if (WARN_ON(hw->netdev_features & ~feature_whitelist))
+ return -EINVAL;
+
+ if (hw->max_report_rates == 0)
+ hw->max_report_rates = hw->max_rates;
+
+ local->rx_chains = 1;
+
+ /*
+ * generic code guarantees at least one band,
+ * set this very early because much code assumes
+ * that hw.conf.channel is assigned
+ */
+ channels = 0;
+ max_bitrates = 0;
+ supp_ht = false;
+ supp_vht = false;
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = local->hw.wiphy->bands[band];
+ if (!sband)
+ continue;
+
+ if (!dflt_chandef.chan) {
+ cfg80211_chandef_create(&dflt_chandef,
+ &sband->channels[0],
+ NL80211_CHAN_NO_HT);
+ /* init channel we're on */
+ if (!local->use_chanctx && !local->_oper_chandef.chan) {
+ local->hw.conf.chandef = dflt_chandef;
+ local->_oper_chandef = dflt_chandef;
+ }
+ local->monitor_chandef = dflt_chandef;
+ }
+
+ channels += sband->n_channels;
+
+ if (max_bitrates < sband->n_bitrates)
+ max_bitrates = sband->n_bitrates;
+ supp_ht = supp_ht || sband->ht_cap.ht_supported;
+ supp_vht = supp_vht || sband->vht_cap.vht_supported;
+
+ if (sband->ht_cap.ht_supported)
+ local->rx_chains =
+ max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs),
+ local->rx_chains);
+
+ /* TODO: consider VHT for RX chains, hopefully it's the same */
+ }
+
+ /* if low-level driver supports AP, we also support VLAN */
+ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) {
+ hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN);
+ hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN);
+ }
+
+ /* mac80211 always supports monitor */
+ hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+ hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR);
+
+ /* mac80211 doesn't support more than one IBSS interface right now */
+ for (i = 0; i < hw->wiphy->n_iface_combinations; i++) {
+ const struct ieee80211_iface_combination *c;
+ int j;
+
+ c = &hw->wiphy->iface_combinations[i];
+
+ for (j = 0; j < c->n_limits; j++)
+ if ((c->limits[j].types & BIT(NL80211_IFTYPE_ADHOC)) &&
+ c->limits[j].max > 1)
+ return -EINVAL;
+ }
+
+ local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) +
+ sizeof(void *) * channels, GFP_KERNEL);
+ if (!local->int_scan_req)
+ return -ENOMEM;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ if (!local->hw.wiphy->bands[band])
+ continue;
+ local->int_scan_req->rates[band] = (u32) -1;
+ }
+
+#ifndef CONFIG_MAC80211_MESH
+ /* mesh depends on Kconfig, but drivers should set it if they want */
+ local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT);
+#endif
+
+ /* if the underlying driver supports mesh, mac80211 will (at least)
+ * provide routing of mesh authentication frames to userspace */
+ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_MESH_POINT))
+ local->hw.wiphy->flags |= WIPHY_FLAG_MESH_AUTH;
+
+ /* mac80211 supports control port protocol changing */
+ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
+
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
+ } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
+ if (hw->max_signal <= 0) {
+ result = -EINVAL;
+ goto fail_wiphy_register;
+ }
+ }
+
+ /*
+ * Calculate scan IE length -- we need this to alloc
+ * memory and to subtract from the driver limit. It
+ * includes the DS Params, (extended) supported rates, and HT
+ * information -- SSID is the driver's responsibility.
+ */
+ local->scan_ies_len = 4 + max_bitrates /* (ext) supp rates */ +
+ 3 /* DS Params */;
+ if (supp_ht)
+ local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap);
+
+ if (supp_vht)
+ local->scan_ies_len +=
+ 2 + sizeof(struct ieee80211_vht_cap);
+
+ if (!local->ops->hw_scan) {
+ /* For hw_scan, driver needs to set these up. */
+ local->hw.wiphy->max_scan_ssids = 4;
+ local->hw.wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN;
+ }
+
+ /*
+ * If the driver supports any scan IEs, then assume the
+ * limit includes the IEs mac80211 will add, otherwise
+ * leave it at zero and let the driver sort it out; we
+ * still pass our IEs to the driver but userspace will
+ * not be allowed to in that case.
+ */
+ if (local->hw.wiphy->max_scan_ie_len)
+ local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
+
+ WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes,
+ local->hw.n_cipher_schemes));
+
+ result = ieee80211_init_cipher_suites(local);
+ if (result < 0)
+ goto fail_wiphy_register;
+
+ if (!local->ops->remain_on_channel)
+ local->hw.wiphy->max_remain_on_channel_duration = 5000;
+
+ /* mac80211 based drivers don't support internal TDLS setup */
+ if (local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)
+ local->hw.wiphy->flags |= WIPHY_FLAG_TDLS_EXTERNAL_SETUP;
+
+ /* mac80211 supports eCSA, if the driver supports STA CSA at all */
+ if (local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)
+ local->ext_capa[0] |= WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING;
+
+ local->hw.wiphy->max_num_csa_counters = IEEE80211_MAX_CSA_COUNTERS_NUM;
+
+ result = wiphy_register(local->hw.wiphy);
+ if (result < 0)
+ goto fail_wiphy_register;
+
+ /*
+ * We use the number of queues for feature tests (QoS, HT) internally
+ * so restrict them appropriately.
+ */
+ if (hw->queues > IEEE80211_MAX_QUEUES)
+ hw->queues = IEEE80211_MAX_QUEUES;
+
+ local->workqueue =
+ alloc_ordered_workqueue("%s", 0, wiphy_name(local->hw.wiphy));
+ if (!local->workqueue) {
+ result = -ENOMEM;
+ goto fail_workqueue;
+ }
+
+ /*
+ * The hardware needs headroom for sending the frame,
+ * and we need some headroom for passing the frame to monitor
+ * interfaces, but never both at the same time.
+ */
+ local->tx_headroom = max_t(unsigned int , local->hw.extra_tx_headroom,
+ IEEE80211_TX_STATUS_HEADROOM);
+
+ debugfs_hw_add(local);
+
+ /*
+ * if the driver doesn't specify a max listen interval we
+ * use 5 which should be a safe default
+ */
+ if (local->hw.max_listen_interval == 0)
+ local->hw.max_listen_interval = 5;
+
+ local->hw.conf.listen_interval = local->hw.max_listen_interval;
+
+ local->dynamic_ps_forced_timeout = -1;
+
+ if (!local->hw.txq_ac_max_pending)
+ local->hw.txq_ac_max_pending = 64;
+
+ result = ieee80211_wep_init(local);
+ if (result < 0)
+ wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
+ result);
+
+ local->hw.conf.flags = IEEE80211_CONF_IDLE;
+
+ ieee80211_led_init(local);
+
+ rtnl_lock();
+
+ result = ieee80211_init_rate_ctrl_alg(local,
+ hw->rate_control_algorithm);
+ if (result < 0) {
+ wiphy_debug(local->hw.wiphy,
+ "Failed to initialize rate control algorithm\n");
+ goto fail_rate;
+ }
+
+ /* add one default STA interface if supported */
+ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
+ !(hw->flags & IEEE80211_HW_NO_AUTO_VIF)) {
+ result = ieee80211_if_add(local, "wlan%d", NET_NAME_ENUM, NULL,
+ NL80211_IFTYPE_STATION, NULL);
+ if (result)
+ wiphy_warn(local->hw.wiphy,
+ "Failed to add default virtual iface\n");
+ }
+
+ rtnl_unlock();
+
+ local->network_latency_notifier.notifier_call =
+ ieee80211_max_network_latency;
+ result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY,
+ &local->network_latency_notifier);
+ if (result)
+ goto fail_pm_qos;
+
+#ifdef CONFIG_INET
+ local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
+ result = register_inetaddr_notifier(&local->ifa_notifier);
+ if (result)
+ goto fail_ifa;
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+ local->ifa6_notifier.notifier_call = ieee80211_ifa6_changed;
+ result = register_inet6addr_notifier(&local->ifa6_notifier);
+ if (result)
+ goto fail_ifa6;
+#endif
+
+ return 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ fail_ifa6:
+#ifdef CONFIG_INET
+ unregister_inetaddr_notifier(&local->ifa_notifier);
+#endif
+#endif
+#if defined(CONFIG_INET) || defined(CONFIG_IPV6)
+ fail_ifa:
+ pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
+ &local->network_latency_notifier);
+#endif
+ fail_pm_qos:
+ rtnl_lock();
+ rate_control_deinitialize(local);
+ ieee80211_remove_interfaces(local);
+ fail_rate:
+ rtnl_unlock();
+ ieee80211_led_exit(local);
+ ieee80211_wep_free(local);
+ destroy_workqueue(local->workqueue);
+ fail_workqueue:
+ wiphy_unregister(local->hw.wiphy);
+ fail_wiphy_register:
+ if (local->wiphy_ciphers_allocated)
+ kfree(local->hw.wiphy->cipher_suites);
+ kfree(local->int_scan_req);
+ return result;
+}
+EXPORT_SYMBOL(ieee80211_register_hw);
+
+void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi,
+ struct net_device *napi_dev,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ netif_napi_add(napi_dev, napi, poll, weight);
+ local->napi = napi;
+}
+EXPORT_SYMBOL_GPL(ieee80211_napi_add);
+
+void ieee80211_unregister_hw(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ tasklet_kill(&local->tx_pending_tasklet);
+ tasklet_kill(&local->tasklet);
+
+ pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
+ &local->network_latency_notifier);
+#ifdef CONFIG_INET
+ unregister_inetaddr_notifier(&local->ifa_notifier);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&local->ifa6_notifier);
+#endif
+
+ rtnl_lock();
+
+ /*
+ * At this point, interface list manipulations are fine
+ * because the driver cannot be handing us frames any
+ * more and the tasklet is killed.
+ */
+ ieee80211_remove_interfaces(local);
+
+ rtnl_unlock();
+
+ cancel_work_sync(&local->restart_work);
+ cancel_work_sync(&local->reconfig_filter);
+ flush_work(&local->sched_scan_stopped_work);
+
+ ieee80211_clear_tx_pending(local);
+ rate_control_deinitialize(local);
+
+ if (skb_queue_len(&local->skb_queue) ||
+ skb_queue_len(&local->skb_queue_unreliable))
+ wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
+ skb_queue_purge(&local->skb_queue);
+ skb_queue_purge(&local->skb_queue_unreliable);
+
+ destroy_workqueue(local->workqueue);
+ wiphy_unregister(local->hw.wiphy);
+ ieee80211_wep_free(local);
+ ieee80211_led_exit(local);
+ kfree(local->int_scan_req);
+}
+EXPORT_SYMBOL(ieee80211_unregister_hw);
+
+static int ieee80211_free_ack_frame(int id, void *p, void *data)
+{
+ WARN_ONCE(1, "Have pending ack frames!\n");
+ kfree_skb(p);
+ return 0;
+}
+
+void ieee80211_free_hw(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ mutex_destroy(&local->iflist_mtx);
+ mutex_destroy(&local->mtx);
+
+ if (local->wiphy_ciphers_allocated)
+ kfree(local->hw.wiphy->cipher_suites);
+
+ idr_for_each(&local->ack_status_frames,
+ ieee80211_free_ack_frame, NULL);
+ idr_destroy(&local->ack_status_frames);
+
+ sta_info_stop(local);
+
+ wiphy_free(local->hw.wiphy);
+}
+EXPORT_SYMBOL(ieee80211_free_hw);
+
+static int __init ieee80211_init(void)
+{
+ struct sk_buff *skb;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct ieee80211_tx_info) > sizeof(skb->cb));
+ BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, driver_data) +
+ IEEE80211_TX_INFO_DRIVER_DATA_SIZE > sizeof(skb->cb));
+
+ ret = rc80211_minstrel_init();
+ if (ret)
+ return ret;
+
+ ret = rc80211_minstrel_ht_init();
+ if (ret)
+ goto err_minstrel;
+
+ ret = ieee80211_iface_init();
+ if (ret)
+ goto err_netdev;
+
+ return 0;
+ err_netdev:
+ rc80211_minstrel_ht_exit();
+ err_minstrel:
+ rc80211_minstrel_exit();
+
+ return ret;
+}
+
+static void __exit ieee80211_exit(void)
+{
+ rc80211_minstrel_ht_exit();
+ rc80211_minstrel_exit();
+
+ ieee80211s_stop();
+
+ ieee80211_iface_exit();
+
+ rcu_barrier();
+}
+
+
+subsys_initcall(ieee80211_init);
+module_exit(ieee80211_exit);
+
+MODULE_DESCRIPTION("IEEE 802.11 subsystem");
+MODULE_LICENSE("GPL");
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
new file mode 100644
index 000000000..817098add
--- /dev/null
+++ b/net/mac80211/mesh.c
@@ -0,0 +1,1345 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Authors: Luis Carlos Cobo <luisca@cozybit.com>
+ * Javier Cardona <javier@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+#include "mesh.h"
+#include "driver-ops.h"
+
+static int mesh_allocated;
+static struct kmem_cache *rm_cache;
+
+bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt)
+{
+ return (mgmt->u.action.u.mesh_action.action_code ==
+ WLAN_MESH_ACTION_HWMP_PATH_SELECTION);
+}
+
+void ieee80211s_init(void)
+{
+ mesh_pathtbl_init();
+ mesh_allocated = 1;
+ rm_cache = kmem_cache_create("mesh_rmc", sizeof(struct rmc_entry),
+ 0, 0, NULL);
+}
+
+void ieee80211s_stop(void)
+{
+ if (!mesh_allocated)
+ return;
+ mesh_pathtbl_unregister();
+ kmem_cache_destroy(rm_cache);
+}
+
+static void ieee80211_mesh_housekeeping_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata = (void *) data;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+/**
+ * mesh_matches_local - check if the config of a mesh point matches ours
+ *
+ * @sdata: local mesh subif
+ * @ie: information elements of a management frame from the mesh peer
+ *
+ * This function checks if the mesh configuration of a mesh point matches the
+ * local mesh configuration, i.e. if both nodes belong to the same mesh network.
+ */
+bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *ie)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u32 basic_rates = 0;
+ struct cfg80211_chan_def sta_chan_def;
+
+ /*
+ * As support for each feature is added, check for matching
+ * - On mesh config capabilities
+ * - Power Save Support En
+ * - Sync support enabled
+ * - Sync support active
+ * - Sync support required from peer
+ * - MDA enabled
+ * - Power management control on fc
+ */
+ if (!(ifmsh->mesh_id_len == ie->mesh_id_len &&
+ memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
+ (ifmsh->mesh_pp_id == ie->mesh_config->meshconf_psel) &&
+ (ifmsh->mesh_pm_id == ie->mesh_config->meshconf_pmetric) &&
+ (ifmsh->mesh_cc_id == ie->mesh_config->meshconf_congest) &&
+ (ifmsh->mesh_sp_id == ie->mesh_config->meshconf_synch) &&
+ (ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth)))
+ return false;
+
+ ieee80211_sta_get_rates(sdata, ie, ieee80211_get_sdata_band(sdata),
+ &basic_rates);
+
+ if (sdata->vif.bss_conf.basic_rates != basic_rates)
+ return false;
+
+ ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
+ ie->ht_operation, &sta_chan_def);
+
+ if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
+ &sta_chan_def))
+ return false;
+
+ return true;
+}
+
+/**
+ * mesh_peer_accepts_plinks - check if an mp is willing to establish peer links
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ */
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
+{
+ return (ie->mesh_config->meshconf_cap &
+ IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS) != 0;
+}
+
+/**
+ * mesh_accept_plinks_update - update accepting_plink in local mesh beacons
+ *
+ * @sdata: mesh interface in which mesh beacons are going to be updated
+ *
+ * Returns: beacon changed flag if the beacon content changed.
+ */
+u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
+{
+ bool free_plinks;
+ u32 changed = 0;
+
+ /* In case mesh_plink_free_count > 0 and mesh_plinktbl_capacity == 0,
+ * the mesh interface might be able to establish plinks with peers that
+ * are already on the table but are not on PLINK_ESTAB state. However,
+ * in general the mesh interface is not accepting peer link requests
+ * from new peers, and that must be reflected in the beacon
+ */
+ free_plinks = mesh_plink_availables(sdata);
+
+ if (free_plinks != sdata->u.mesh.accepting_plinks) {
+ sdata->u.mesh.accepting_plinks = free_plinks;
+ changed = BSS_CHANGED_BEACON;
+ }
+
+ return changed;
+}
+
+/*
+ * mesh_sta_cleanup - clean up any mesh sta state
+ *
+ * @sta: mesh sta to clean up.
+ */
+void mesh_sta_cleanup(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed;
+
+ /*
+ * maybe userspace handles peer allocation and peering, but in either
+ * case the beacon is still generated by the kernel and we might need
+ * an update.
+ */
+ changed = mesh_accept_plinks_update(sdata);
+ if (!sdata->u.mesh.user_mpm) {
+ changed |= mesh_plink_deactivate(sta);
+ del_timer_sync(&sta->plink_timer);
+ }
+
+ if (changed)
+ ieee80211_mbss_info_change_notify(sdata, changed);
+}
+
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
+{
+ int i;
+
+ sdata->u.mesh.rmc = kmalloc(sizeof(struct mesh_rmc), GFP_KERNEL);
+ if (!sdata->u.mesh.rmc)
+ return -ENOMEM;
+ sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1;
+ for (i = 0; i < RMC_BUCKETS; i++)
+ INIT_LIST_HEAD(&sdata->u.mesh.rmc->bucket[i]);
+ return 0;
+}
+
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
+{
+ struct mesh_rmc *rmc = sdata->u.mesh.rmc;
+ struct rmc_entry *p, *n;
+ int i;
+
+ if (!sdata->u.mesh.rmc)
+ return;
+
+ for (i = 0; i < RMC_BUCKETS; i++) {
+ list_for_each_entry_safe(p, n, &rmc->bucket[i], list) {
+ list_del(&p->list);
+ kmem_cache_free(rm_cache, p);
+ }
+ }
+
+ kfree(rmc);
+ sdata->u.mesh.rmc = NULL;
+}
+
+/**
+ * mesh_rmc_check - Check frame in recent multicast cache and add if absent.
+ *
+ * @sdata: interface
+ * @sa: source address
+ * @mesh_hdr: mesh_header
+ *
+ * Returns: 0 if the frame is not in the cache, nonzero otherwise.
+ *
+ * Checks using the source address and the mesh sequence number if we have
+ * received this frame lately. If the frame is not in the cache, it is added to
+ * it.
+ */
+int mesh_rmc_check(struct ieee80211_sub_if_data *sdata,
+ const u8 *sa, struct ieee80211s_hdr *mesh_hdr)
+{
+ struct mesh_rmc *rmc = sdata->u.mesh.rmc;
+ u32 seqnum = 0;
+ int entries = 0;
+ u8 idx;
+ struct rmc_entry *p, *n;
+
+ /* Don't care about endianness since only match matters */
+ memcpy(&seqnum, &mesh_hdr->seqnum, sizeof(mesh_hdr->seqnum));
+ idx = le32_to_cpu(mesh_hdr->seqnum) & rmc->idx_mask;
+ list_for_each_entry_safe(p, n, &rmc->bucket[idx], list) {
+ ++entries;
+ if (time_after(jiffies, p->exp_time) ||
+ entries == RMC_QUEUE_MAX_LEN) {
+ list_del(&p->list);
+ kmem_cache_free(rm_cache, p);
+ --entries;
+ } else if ((seqnum == p->seqnum) && ether_addr_equal(sa, p->sa))
+ return -1;
+ }
+
+ p = kmem_cache_alloc(rm_cache, GFP_ATOMIC);
+ if (!p)
+ return 0;
+
+ p->seqnum = seqnum;
+ p->exp_time = jiffies + RMC_TIMEOUT;
+ memcpy(p->sa, sa, ETH_ALEN);
+ list_add(&p->list, &rmc->bucket[idx]);
+ return 0;
+}
+
+int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 *pos, neighbors;
+ u8 meshconf_len = sizeof(struct ieee80211_meshconf_ie);
+
+ if (skb_tailroom(skb) < 2 + meshconf_len)
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + meshconf_len);
+ *pos++ = WLAN_EID_MESH_CONFIG;
+ *pos++ = meshconf_len;
+
+ /* save a pointer for quick updates in pre-tbtt */
+ ifmsh->meshconf_offset = pos - skb->data;
+
+ /* Active path selection protocol ID */
+ *pos++ = ifmsh->mesh_pp_id;
+ /* Active path selection metric ID */
+ *pos++ = ifmsh->mesh_pm_id;
+ /* Congestion control mode identifier */
+ *pos++ = ifmsh->mesh_cc_id;
+ /* Synchronization protocol identifier */
+ *pos++ = ifmsh->mesh_sp_id;
+ /* Authentication Protocol identifier */
+ *pos++ = ifmsh->mesh_auth_id;
+ /* Mesh Formation Info - number of neighbors */
+ neighbors = atomic_read(&ifmsh->estab_plinks);
+ neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS);
+ *pos++ = neighbors << 1;
+ /* Mesh capability */
+ *pos = 0x00;
+ *pos |= ifmsh->mshcfg.dot11MeshForwarding ?
+ IEEE80211_MESHCONF_CAPAB_FORWARDING : 0x00;
+ *pos |= ifmsh->accepting_plinks ?
+ IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00;
+ /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */
+ *pos |= ifmsh->ps_peers_deep_sleep ?
+ IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00;
+ *pos++ |= ifmsh->adjusting_tbtt ?
+ IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00;
+ *pos++ = 0x00;
+
+ return 0;
+}
+
+int mesh_add_meshid_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 *pos;
+
+ if (skb_tailroom(skb) < 2 + ifmsh->mesh_id_len)
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + ifmsh->mesh_id_len);
+ *pos++ = WLAN_EID_MESH_ID;
+ *pos++ = ifmsh->mesh_id_len;
+ if (ifmsh->mesh_id_len)
+ memcpy(pos, ifmsh->mesh_id, ifmsh->mesh_id_len);
+
+ return 0;
+}
+
+static int mesh_add_awake_window_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 *pos;
+
+ /* see IEEE802.11-2012 13.14.6 */
+ if (ifmsh->ps_peers_light_sleep == 0 &&
+ ifmsh->ps_peers_deep_sleep == 0 &&
+ ifmsh->nonpeer_pm == NL80211_MESH_POWER_ACTIVE)
+ return 0;
+
+ if (skb_tailroom(skb) < 4)
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + 2);
+ *pos++ = WLAN_EID_MESH_AWAKE_WINDOW;
+ *pos++ = 2;
+ put_unaligned_le16(ifmsh->mshcfg.dot11MeshAwakeWindowDuration, pos);
+
+ return 0;
+}
+
+int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 offset, len;
+ const u8 *data;
+
+ if (!ifmsh->ie || !ifmsh->ie_len)
+ return 0;
+
+ /* fast-forward to vendor IEs */
+ offset = ieee80211_ie_split_vendor(ifmsh->ie, ifmsh->ie_len, 0);
+
+ if (offset) {
+ len = ifmsh->ie_len - offset;
+ data = ifmsh->ie + offset;
+ if (skb_tailroom(skb) < len)
+ return -ENOMEM;
+ memcpy(skb_put(skb, len), data, len);
+ }
+
+ return 0;
+}
+
+int mesh_add_rsn_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 len = 0;
+ const u8 *data;
+
+ if (!ifmsh->ie || !ifmsh->ie_len)
+ return 0;
+
+ /* find RSN IE */
+ data = cfg80211_find_ie(WLAN_EID_RSN, ifmsh->ie, ifmsh->ie_len);
+ if (!data)
+ return 0;
+
+ len = data[1] + 2;
+
+ if (skb_tailroom(skb) < len)
+ return -ENOMEM;
+ memcpy(skb_put(skb, len), data, len);
+
+ return 0;
+}
+
+static int mesh_add_ds_params_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *chan;
+ u8 *pos;
+
+ if (skb_tailroom(skb) < 3)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ chan = chanctx_conf->def.chan;
+ rcu_read_unlock();
+
+ pos = skb_put(skb, 2 + 1);
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = ieee80211_frequency_to_channel(chan->center_freq);
+
+ return 0;
+}
+
+int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+
+ sband = local->hw.wiphy->bands[band];
+ if (!sband->ht_cap.ht_supported ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return 0;
+
+ if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap))
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap));
+ ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap);
+
+ return 0;
+}
+
+int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *channel;
+ enum nl80211_channel_type channel_type =
+ cfg80211_get_chandef_type(&sdata->vif.bss_conf.chandef);
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sta_ht_cap *ht_cap;
+ u8 *pos;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ channel = chanctx_conf->def.chan;
+ rcu_read_unlock();
+
+ sband = local->hw.wiphy->bands[channel->band];
+ ht_cap = &sband->ht_cap;
+
+ if (!ht_cap->ht_supported || channel_type == NL80211_CHAN_NO_HT)
+ return 0;
+
+ if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_operation))
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_operation));
+ ieee80211_ie_build_ht_oper(pos, ht_cap, &sdata->vif.bss_conf.chandef,
+ sdata->vif.bss_conf.ht_operation_mode);
+
+ return 0;
+}
+
+static void ieee80211_mesh_path_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+static void ieee80211_mesh_path_root_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
+{
+ if (ifmsh->mshcfg.dot11MeshHWMPRootMode > IEEE80211_ROOTMODE_ROOT)
+ set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+ else {
+ clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+ /* stop running timer */
+ del_timer_sync(&ifmsh->mesh_path_root_timer);
+ }
+}
+
+/**
+ * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame
+ * @hdr: 802.11 frame header
+ * @fc: frame control field
+ * @meshda: destination address in the mesh
+ * @meshsa: source address address in the mesh. Same as TA, as frame is
+ * locally originated.
+ *
+ * Return the length of the 802.11 (does not include a mesh control header)
+ */
+int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
+ const u8 *meshda, const u8 *meshsa)
+{
+ if (is_multicast_ether_addr(meshda)) {
+ *fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ /* DA TA SA */
+ memcpy(hdr->addr1, meshda, ETH_ALEN);
+ memcpy(hdr->addr2, meshsa, ETH_ALEN);
+ memcpy(hdr->addr3, meshsa, ETH_ALEN);
+ return 24;
+ } else {
+ *fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ eth_zero_addr(hdr->addr1); /* RA is resolved later */
+ memcpy(hdr->addr2, meshsa, ETH_ALEN);
+ memcpy(hdr->addr3, meshda, ETH_ALEN);
+ memcpy(hdr->addr4, meshsa, ETH_ALEN);
+ return 30;
+ }
+}
+
+/**
+ * ieee80211_new_mesh_header - create a new mesh header
+ * @sdata: mesh interface to be used
+ * @meshhdr: uninitialized mesh header
+ * @addr4or5: 1st address in the ae header, which may correspond to address 4
+ * (if addr6 is NULL) or address 5 (if addr6 is present). It may
+ * be NULL.
+ * @addr6: 2nd address in the ae header, which corresponds to addr6 of the
+ * mesh frame
+ *
+ * Return the header length.
+ */
+int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211s_hdr *meshhdr,
+ const char *addr4or5, const char *addr6)
+{
+ if (WARN_ON(!addr4or5 && addr6))
+ return 0;
+
+ memset(meshhdr, 0, sizeof(*meshhdr));
+
+ meshhdr->ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
+
+ /* FIXME: racy -- TX on multiple queues can be concurrent */
+ put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &meshhdr->seqnum);
+ sdata->u.mesh.mesh_seqnum++;
+
+ if (addr4or5 && !addr6) {
+ meshhdr->flags |= MESH_FLAGS_AE_A4;
+ memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN);
+ return 2 * ETH_ALEN;
+ } else if (addr4or5 && addr6) {
+ meshhdr->flags |= MESH_FLAGS_AE_A5_A6;
+ memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN);
+ memcpy(meshhdr->eaddr2, addr6, ETH_ALEN);
+ return 3 * ETH_ALEN;
+ }
+
+ return ETH_ALEN;
+}
+
+static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u32 changed;
+
+ if (ifmsh->mshcfg.plink_timeout > 0)
+ ieee80211_sta_expire(sdata, ifmsh->mshcfg.plink_timeout * HZ);
+ mesh_path_expire(sdata);
+
+ changed = mesh_accept_plinks_update(sdata);
+ ieee80211_mbss_info_change_notify(sdata, changed);
+
+ mod_timer(&ifmsh->housekeeping_timer,
+ round_jiffies(jiffies +
+ IEEE80211_MESH_HOUSEKEEPING_INTERVAL));
+}
+
+static void ieee80211_mesh_rootpath(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u32 interval;
+
+ mesh_path_tx_root_frame(sdata);
+
+ if (ifmsh->mshcfg.dot11MeshHWMPRootMode == IEEE80211_PROACTIVE_RANN)
+ interval = ifmsh->mshcfg.dot11MeshHWMPRannInterval;
+ else
+ interval = ifmsh->mshcfg.dot11MeshHWMProotInterval;
+
+ mod_timer(&ifmsh->mesh_path_root_timer,
+ round_jiffies(TU_TO_EXP_TIME(interval)));
+}
+
+static int
+ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
+{
+ struct beacon_data *bcn;
+ int head_len, tail_len;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct mesh_csa_settings *csa;
+ enum ieee80211_band band;
+ u8 *pos;
+ struct ieee80211_sub_if_data *sdata;
+ int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) +
+ sizeof(mgmt->u.beacon);
+
+ sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh);
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ band = chanctx_conf->def.chan->band;
+ rcu_read_unlock();
+
+ head_len = hdr_len +
+ 2 + /* NULL SSID */
+ /* Channel Switch Announcement */
+ 2 + sizeof(struct ieee80211_channel_sw_ie) +
+ /* Mesh Channel Swith Parameters */
+ 2 + sizeof(struct ieee80211_mesh_chansw_params_ie) +
+ 2 + 8 + /* supported rates */
+ 2 + 3; /* DS params */
+ tail_len = 2 + (IEEE80211_MAX_SUPP_RATES - 8) +
+ 2 + sizeof(struct ieee80211_ht_cap) +
+ 2 + sizeof(struct ieee80211_ht_operation) +
+ 2 + ifmsh->mesh_id_len +
+ 2 + sizeof(struct ieee80211_meshconf_ie) +
+ 2 + sizeof(__le16) + /* awake window */
+ ifmsh->ie_len;
+
+ bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL);
+ /* need an skb for IE builders to operate on */
+ skb = dev_alloc_skb(max(head_len, tail_len));
+
+ if (!bcn || !skb)
+ goto out_free;
+
+ /*
+ * pointers go into the block we allocated,
+ * memory is | beacon_data | head | tail |
+ */
+ bcn->head = ((u8 *) bcn) + sizeof(*bcn);
+
+ /* fill in the head */
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len);
+ memset(mgmt, 0, hdr_len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_BEACON);
+ eth_broadcast_addr(mgmt->da);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ ieee80211_mps_set_frame_flags(sdata, NULL, (void *) mgmt);
+ mgmt->u.beacon.beacon_int =
+ cpu_to_le16(sdata->vif.bss_conf.beacon_int);
+ mgmt->u.beacon.capab_info |= cpu_to_le16(
+ sdata->u.mesh.security ? WLAN_CAPABILITY_PRIVACY : 0);
+
+ pos = skb_put(skb, 2);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = 0x0;
+
+ rcu_read_lock();
+ csa = rcu_dereference(ifmsh->csa);
+ if (csa) {
+ pos = skb_put(skb, 13);
+ memset(pos, 0, 13);
+ *pos++ = WLAN_EID_CHANNEL_SWITCH;
+ *pos++ = 3;
+ *pos++ = 0x0;
+ *pos++ = ieee80211_frequency_to_channel(
+ csa->settings.chandef.chan->center_freq);
+ bcn->csa_current_counter = csa->settings.count;
+ bcn->csa_counter_offsets[0] = hdr_len + 6;
+ *pos++ = csa->settings.count;
+ *pos++ = WLAN_EID_CHAN_SWITCH_PARAM;
+ *pos++ = 6;
+ if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT) {
+ *pos++ = ifmsh->mshcfg.dot11MeshTTL;
+ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR;
+ } else {
+ *pos++ = ifmsh->chsw_ttl;
+ }
+ *pos++ |= csa->settings.block_tx ?
+ WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00;
+ put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos);
+ pos += 2;
+ put_unaligned_le16(ifmsh->pre_value, pos);
+ pos += 2;
+ }
+ rcu_read_unlock();
+
+ if (ieee80211_add_srates_ie(sdata, skb, true, band) ||
+ mesh_add_ds_params_ie(sdata, skb))
+ goto out_free;
+
+ bcn->head_len = skb->len;
+ memcpy(bcn->head, skb->data, bcn->head_len);
+
+ /* now the tail */
+ skb_trim(skb, 0);
+ bcn->tail = bcn->head + bcn->head_len;
+
+ if (ieee80211_add_ext_srates_ie(sdata, skb, true, band) ||
+ mesh_add_rsn_ie(sdata, skb) ||
+ mesh_add_ht_cap_ie(sdata, skb) ||
+ mesh_add_ht_oper_ie(sdata, skb) ||
+ mesh_add_meshid_ie(sdata, skb) ||
+ mesh_add_meshconf_ie(sdata, skb) ||
+ mesh_add_awake_window_ie(sdata, skb) ||
+ mesh_add_vendor_ies(sdata, skb))
+ goto out_free;
+
+ bcn->tail_len = skb->len;
+ memcpy(bcn->tail, skb->data, bcn->tail_len);
+ bcn->meshconf = (struct ieee80211_meshconf_ie *)
+ (bcn->tail + ifmsh->meshconf_offset);
+
+ dev_kfree_skb(skb);
+ rcu_assign_pointer(ifmsh->beacon, bcn);
+ return 0;
+out_free:
+ kfree(bcn);
+ dev_kfree_skb(skb);
+ return -ENOMEM;
+}
+
+static int
+ieee80211_mesh_rebuild_beacon(struct ieee80211_sub_if_data *sdata)
+{
+ struct beacon_data *old_bcn;
+ int ret;
+
+ old_bcn = rcu_dereference_protected(sdata->u.mesh.beacon,
+ lockdep_is_held(&sdata->wdev.mtx));
+ ret = ieee80211_mesh_build_beacon(&sdata->u.mesh);
+ if (ret)
+ /* just reuse old beacon */
+ return ret;
+
+ if (old_bcn)
+ kfree_rcu(old_bcn, rcu_head);
+ return 0;
+}
+
+void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+ u32 changed)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ unsigned long bits = changed;
+ u32 bit;
+
+ if (!bits)
+ return;
+
+ /* if we race with running work, worst case this work becomes a noop */
+ for_each_set_bit(bit, &bits, sizeof(changed) * BITS_PER_BYTE)
+ set_bit(bit, &ifmsh->mbss_changed);
+ set_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags);
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_local *local = sdata->local;
+ u32 changed = BSS_CHANGED_BEACON |
+ BSS_CHANGED_BEACON_ENABLED |
+ BSS_CHANGED_HT |
+ BSS_CHANGED_BASIC_RATES |
+ BSS_CHANGED_BEACON_INT;
+
+ local->fif_other_bss++;
+ /* mesh ifaces must set allmulti to forward mcast traffic */
+ atomic_inc(&local->iff_allmultis);
+ ieee80211_configure_filter(local);
+
+ ifmsh->mesh_cc_id = 0; /* Disabled */
+ /* register sync ops from extensible synchronization framework */
+ ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id);
+ ifmsh->adjusting_tbtt = false;
+ ifmsh->sync_offset_clockdrift_max = 0;
+ set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+ ieee80211_mesh_root_setup(ifmsh);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ sdata->vif.bss_conf.ht_operation_mode =
+ ifmsh->mshcfg.ht_opmode;
+ sdata->vif.bss_conf.enable_beacon = true;
+
+ changed |= ieee80211_mps_local_status_update(sdata);
+
+ if (ieee80211_mesh_build_beacon(ifmsh)) {
+ ieee80211_stop_mesh(sdata);
+ return -ENOMEM;
+ }
+
+ ieee80211_recalc_dtim(local, sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ netif_carrier_on(sdata->dev);
+ return 0;
+}
+
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct beacon_data *bcn;
+
+ netif_carrier_off(sdata->dev);
+
+ /* stop the beacon */
+ ifmsh->mesh_id_len = 0;
+ sdata->vif.bss_conf.enable_beacon = false;
+ clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+ bcn = rcu_dereference_protected(ifmsh->beacon,
+ lockdep_is_held(&sdata->wdev.mtx));
+ RCU_INIT_POINTER(ifmsh->beacon, NULL);
+ kfree_rcu(bcn, rcu_head);
+
+ /* flush STAs and mpaths on this iface */
+ sta_info_flush(sdata);
+ mesh_path_flush_by_iface(sdata);
+
+ /* free all potentially still buffered group-addressed frames */
+ local->total_ps_buffered -= skb_queue_len(&ifmsh->ps.bc_buf);
+ skb_queue_purge(&ifmsh->ps.bc_buf);
+
+ del_timer_sync(&sdata->u.mesh.housekeeping_timer);
+ del_timer_sync(&sdata->u.mesh.mesh_path_root_timer);
+ del_timer_sync(&sdata->u.mesh.mesh_path_timer);
+
+ /* clear any mesh work (for next join) we may have accrued */
+ ifmsh->wrkq_flags = 0;
+ ifmsh->mbss_changed = 0;
+
+ local->fif_other_bss--;
+ atomic_dec(&local->iff_allmultis);
+ ieee80211_configure_filter(local);
+}
+
+static bool
+ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems, bool beacon)
+{
+ struct cfg80211_csa_settings params;
+ struct ieee80211_csa_ie csa_ie;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ int err;
+ u32 sta_flags;
+
+ sdata_assert_lock(sdata);
+
+ sta_flags = IEEE80211_STA_DISABLE_VHT;
+ switch (sdata->vif.bss_conf.chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ sta_flags |= IEEE80211_STA_DISABLE_HT;
+ case NL80211_CHAN_WIDTH_20:
+ sta_flags |= IEEE80211_STA_DISABLE_40MHZ;
+ break;
+ default:
+ break;
+ }
+
+ memset(&params, 0, sizeof(params));
+ memset(&csa_ie, 0, sizeof(csa_ie));
+ err = ieee80211_parse_ch_switch_ie(sdata, elems, band,
+ sta_flags, sdata->vif.addr,
+ &csa_ie);
+ if (err < 0)
+ return false;
+ if (err)
+ return false;
+
+ params.chandef = csa_ie.chandef;
+ params.count = csa_ie.count;
+
+ if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, &params.chandef,
+ IEEE80211_CHAN_DISABLED)) {
+ sdata_info(sdata,
+ "mesh STA %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), aborting\n",
+ sdata->vif.addr,
+ params.chandef.chan->center_freq,
+ params.chandef.width,
+ params.chandef.center_freq1,
+ params.chandef.center_freq2);
+ return false;
+ }
+
+ err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
+ &params.chandef,
+ NL80211_IFTYPE_MESH_POINT);
+ if (err < 0)
+ return false;
+ if (err > 0)
+ /* TODO: DFS not (yet) supported */
+ return false;
+
+ params.radar_required = err;
+
+ if (cfg80211_chandef_identical(&params.chandef,
+ &sdata->vif.bss_conf.chandef)) {
+ mcsa_dbg(sdata,
+ "received csa with an identical chandef, ignoring\n");
+ return true;
+ }
+
+ mcsa_dbg(sdata,
+ "received channel switch announcement to go to channel %d MHz\n",
+ params.chandef.chan->center_freq);
+
+ params.block_tx = csa_ie.mode & WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT;
+ if (beacon) {
+ ifmsh->chsw_ttl = csa_ie.ttl - 1;
+ if (ifmsh->pre_value >= csa_ie.pre_value)
+ return false;
+ ifmsh->pre_value = csa_ie.pre_value;
+ }
+
+ if (ifmsh->chsw_ttl >= ifmsh->mshcfg.dot11MeshTTL)
+ return false;
+
+ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_REPEATER;
+
+ if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev,
+ &params) < 0)
+ return false;
+
+ return true;
+}
+
+static void
+ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct sk_buff *presp;
+ struct beacon_data *bcn;
+ struct ieee80211_mgmt *hdr;
+ struct ieee802_11_elems elems;
+ size_t baselen;
+ u8 *pos;
+
+ pos = mgmt->u.probe_req.variable;
+ baselen = (u8 *) pos - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(pos, len - baselen, false, &elems);
+
+ if (!elems.mesh_id)
+ return;
+
+ /* 802.11-2012 10.1.4.3.2 */
+ if ((!ether_addr_equal(mgmt->da, sdata->vif.addr) &&
+ !is_broadcast_ether_addr(mgmt->da)) ||
+ elems.ssid_len != 0)
+ return;
+
+ if (elems.mesh_id_len != 0 &&
+ (elems.mesh_id_len != ifmsh->mesh_id_len ||
+ memcmp(elems.mesh_id, ifmsh->mesh_id, ifmsh->mesh_id_len)))
+ return;
+
+ rcu_read_lock();
+ bcn = rcu_dereference(ifmsh->beacon);
+
+ if (!bcn)
+ goto out;
+
+ presp = dev_alloc_skb(local->tx_headroom +
+ bcn->head_len + bcn->tail_len);
+ if (!presp)
+ goto out;
+
+ skb_reserve(presp, local->tx_headroom);
+ memcpy(skb_put(presp, bcn->head_len), bcn->head, bcn->head_len);
+ memcpy(skb_put(presp, bcn->tail_len), bcn->tail, bcn->tail_len);
+ hdr = (struct ieee80211_mgmt *) presp->data;
+ hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_RESP);
+ memcpy(hdr->da, mgmt->sa, ETH_ALEN);
+ IEEE80211_SKB_CB(presp)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ ieee80211_tx_skb(sdata, presp);
+out:
+ rcu_read_unlock();
+}
+
+static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
+ u16 stype,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee802_11_elems elems;
+ struct ieee80211_channel *channel;
+ size_t baselen;
+ int freq;
+ enum ieee80211_band band = rx_status->band;
+
+ /* ignore ProbeResp to foreign address */
+ if (stype == IEEE80211_STYPE_PROBE_RESP &&
+ !ether_addr_equal(mgmt->da, sdata->vif.addr))
+ return;
+
+ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+ false, &elems);
+
+ /* ignore non-mesh or secure / unsecure mismatch */
+ if ((!elems.mesh_id || !elems.mesh_config) ||
+ (elems.rsn && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) ||
+ (!elems.rsn && sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE))
+ return;
+
+ if (elems.ds_params)
+ freq = ieee80211_channel_to_frequency(elems.ds_params[0], band);
+ else
+ freq = rx_status->freq;
+
+ channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+ if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+ return;
+
+ if (mesh_matches_local(sdata, &elems))
+ mesh_neighbour_update(sdata, mgmt->sa, &elems);
+
+ if (ifmsh->sync_ops)
+ ifmsh->sync_ops->rx_bcn_presp(sdata,
+ stype, mgmt, &elems, rx_status);
+
+ if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT &&
+ !sdata->vif.csa_active)
+ ieee80211_mesh_process_chnswitch(sdata, &elems, true);
+}
+
+int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_csa_settings *tmp_csa_settings;
+ int ret = 0;
+ int changed = 0;
+
+ /* Reset the TTL value and Initiator flag */
+ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE;
+ ifmsh->chsw_ttl = 0;
+
+ /* Remove the CSA and MCSP elements from the beacon */
+ tmp_csa_settings = rcu_dereference(ifmsh->csa);
+ RCU_INIT_POINTER(ifmsh->csa, NULL);
+ if (tmp_csa_settings)
+ kfree_rcu(tmp_csa_settings, rcu_head);
+ ret = ieee80211_mesh_rebuild_beacon(sdata);
+ if (ret)
+ return -EINVAL;
+
+ changed |= BSS_CHANGED_BEACON;
+
+ mcsa_dbg(sdata, "complete switching to center freq %d MHz",
+ sdata->vif.bss_conf.chandef.chan->center_freq);
+ return changed;
+}
+
+int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_csa_settings *tmp_csa_settings;
+ int ret = 0;
+
+ tmp_csa_settings = kmalloc(sizeof(*tmp_csa_settings),
+ GFP_ATOMIC);
+ if (!tmp_csa_settings)
+ return -ENOMEM;
+
+ memcpy(&tmp_csa_settings->settings, csa_settings,
+ sizeof(struct cfg80211_csa_settings));
+
+ rcu_assign_pointer(ifmsh->csa, tmp_csa_settings);
+
+ ret = ieee80211_mesh_rebuild_beacon(sdata);
+ if (ret) {
+ tmp_csa_settings = rcu_dereference(ifmsh->csa);
+ RCU_INIT_POINTER(ifmsh->csa, NULL);
+ kfree_rcu(tmp_csa_settings, rcu_head);
+ return ret;
+ }
+
+ return BSS_CHANGED_BEACON;
+}
+
+static int mesh_fwd_csa_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_mgmt *mgmt_fwd;
+ struct sk_buff *skb;
+ struct ieee80211_local *local = sdata->local;
+ u8 *pos = mgmt->u.action.u.chan_switch.variable;
+ size_t offset_ttl;
+
+ skb = dev_alloc_skb(local->tx_headroom + len);
+ if (!skb)
+ return -ENOMEM;
+ skb_reserve(skb, local->tx_headroom);
+ mgmt_fwd = (struct ieee80211_mgmt *) skb_put(skb, len);
+
+ /* offset_ttl is based on whether the secondary channel
+ * offset is available or not. Subtract 1 from the mesh TTL
+ * and disable the initiator flag before forwarding.
+ */
+ offset_ttl = (len < 42) ? 7 : 10;
+ *(pos + offset_ttl) -= 1;
+ *(pos + offset_ttl + 1) &= ~WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR;
+
+ memcpy(mgmt_fwd, mgmt, len);
+ eth_broadcast_addr(mgmt_fwd->da);
+ memcpy(mgmt_fwd->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt_fwd->bssid, sdata->vif.addr, ETH_ALEN);
+
+ ieee80211_tx_skb(sdata, skb);
+ return 0;
+}
+
+static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee802_11_elems elems;
+ u16 pre_value;
+ bool fwd_csa = true;
+ size_t baselen;
+ u8 *pos;
+
+ if (mgmt->u.action.u.measurement.action_code !=
+ WLAN_ACTION_SPCT_CHL_SWITCH)
+ return;
+
+ pos = mgmt->u.action.u.chan_switch.variable;
+ baselen = offsetof(struct ieee80211_mgmt,
+ u.action.u.chan_switch.variable);
+ ieee802_11_parse_elems(pos, len - baselen, false, &elems);
+
+ ifmsh->chsw_ttl = elems.mesh_chansw_params_ie->mesh_ttl;
+ if (!--ifmsh->chsw_ttl)
+ fwd_csa = false;
+
+ pre_value = le16_to_cpu(elems.mesh_chansw_params_ie->mesh_pre_value);
+ if (ifmsh->pre_value >= pre_value)
+ return;
+
+ ifmsh->pre_value = pre_value;
+
+ if (!sdata->vif.csa_active &&
+ !ieee80211_mesh_process_chnswitch(sdata, &elems, false)) {
+ mcsa_dbg(sdata, "Failed to process CSA action frame");
+ return;
+ }
+
+ /* forward or re-broadcast the CSA frame */
+ if (fwd_csa) {
+ if (mesh_fwd_csa_frame(sdata, mgmt, len) < 0)
+ mcsa_dbg(sdata, "Failed to forward the CSA frame");
+ }
+}
+
+static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ switch (mgmt->u.action.category) {
+ case WLAN_CATEGORY_SELF_PROTECTED:
+ switch (mgmt->u.action.u.self_prot.action_code) {
+ case WLAN_SP_MESH_PEERING_OPEN:
+ case WLAN_SP_MESH_PEERING_CLOSE:
+ case WLAN_SP_MESH_PEERING_CONFIRM:
+ mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
+ break;
+ }
+ break;
+ case WLAN_CATEGORY_MESH_ACTION:
+ if (mesh_action_is_path_sel(mgmt))
+ mesh_rx_path_sel_frame(sdata, mgmt, len);
+ break;
+ case WLAN_CATEGORY_SPECTRUM_MGMT:
+ mesh_rx_csa_frame(sdata, mgmt, len);
+ break;
+ }
+}
+
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_mgmt *mgmt;
+ u16 stype;
+
+ sdata_lock(sdata);
+
+ /* mesh already went down */
+ if (!sdata->u.mesh.mesh_id_len)
+ goto out;
+
+ rx_status = IEEE80211_SKB_RXCB(skb);
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+
+ switch (stype) {
+ case IEEE80211_STYPE_PROBE_RESP:
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_PROBE_REQ:
+ ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ACTION:
+ ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status);
+ break;
+ }
+out:
+ sdata_unlock(sdata);
+}
+
+static void mesh_bss_info_changed(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u32 bit, changed = 0;
+
+ for_each_set_bit(bit, &ifmsh->mbss_changed,
+ sizeof(changed) * BITS_PER_BYTE) {
+ clear_bit(bit, &ifmsh->mbss_changed);
+ changed |= BIT(bit);
+ }
+
+ if (sdata->vif.bss_conf.enable_beacon &&
+ (changed & (BSS_CHANGED_BEACON |
+ BSS_CHANGED_HT |
+ BSS_CHANGED_BASIC_RATES |
+ BSS_CHANGED_BEACON_INT)))
+ if (ieee80211_mesh_rebuild_beacon(sdata))
+ return;
+
+ ieee80211_bss_info_change_notify(sdata, changed);
+}
+
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ sdata_lock(sdata);
+
+ /* mesh already went down */
+ if (!sdata->u.mesh.mesh_id_len)
+ goto out;
+
+ if (ifmsh->preq_queue_len &&
+ time_after(jiffies,
+ ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval)))
+ mesh_path_start_discovery(sdata);
+
+ if (test_and_clear_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags))
+ mesh_mpath_table_grow();
+
+ if (test_and_clear_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags))
+ mesh_mpp_table_grow();
+
+ if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags))
+ ieee80211_mesh_housekeeping(sdata);
+
+ if (test_and_clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags))
+ ieee80211_mesh_rootpath(sdata);
+
+ if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags))
+ mesh_sync_adjust_tbtt(sdata);
+
+ if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags))
+ mesh_bss_info_changed(sdata);
+out:
+ sdata_unlock(sdata);
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list)
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ ieee80211_sdata_running(sdata))
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ rcu_read_unlock();
+}
+
+void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ static u8 zero_addr[ETH_ALEN] = {};
+
+ setup_timer(&ifmsh->housekeeping_timer,
+ ieee80211_mesh_housekeeping_timer,
+ (unsigned long) sdata);
+
+ ifmsh->accepting_plinks = true;
+ atomic_set(&ifmsh->mpaths, 0);
+ mesh_rmc_init(sdata);
+ ifmsh->last_preq = jiffies;
+ ifmsh->next_perr = jiffies;
+ ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE;
+ /* Allocate all mesh structures when creating the first mesh interface. */
+ if (!mesh_allocated)
+ ieee80211s_init();
+ setup_timer(&ifmsh->mesh_path_timer,
+ ieee80211_mesh_path_timer,
+ (unsigned long) sdata);
+ setup_timer(&ifmsh->mesh_path_root_timer,
+ ieee80211_mesh_path_root_timer,
+ (unsigned long) sdata);
+ INIT_LIST_HEAD(&ifmsh->preq_queue.list);
+ skb_queue_head_init(&ifmsh->ps.bc_buf);
+ spin_lock_init(&ifmsh->mesh_preq_queue_lock);
+ spin_lock_init(&ifmsh->sync_offset_lock);
+ RCU_INIT_POINTER(ifmsh->beacon, NULL);
+
+ sdata->vif.bss_conf.bssid = zero_addr;
+}
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
new file mode 100644
index 000000000..50c8473cf
--- /dev/null
+++ b/net/mac80211/mesh.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Authors: Luis Carlos Cobo <luisca@cozybit.com>
+ * Javier Cardona <javier@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211S_H
+#define IEEE80211S_H
+
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include "ieee80211_i.h"
+
+
+/* Data structures */
+
+/**
+ * enum mesh_path_flags - mac80211 mesh path flags
+ *
+ *
+ *
+ * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding
+ * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path
+ * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence
+ * number
+ * @MESH_PATH_FIXED: the mesh path has been manually set and should not be
+ * modified
+ * @MESH_PATH_RESOLVED: the mesh path can has been resolved
+ * @MESH_PATH_REQ_QUEUED: there is an unsent path request for this destination
+ * already queued up, waiting for the discovery process to start.
+ *
+ * MESH_PATH_RESOLVED is used by the mesh path timer to
+ * decide when to stop or cancel the mesh path discovery.
+ */
+enum mesh_path_flags {
+ MESH_PATH_ACTIVE = BIT(0),
+ MESH_PATH_RESOLVING = BIT(1),
+ MESH_PATH_SN_VALID = BIT(2),
+ MESH_PATH_FIXED = BIT(3),
+ MESH_PATH_RESOLVED = BIT(4),
+ MESH_PATH_REQ_QUEUED = BIT(5),
+};
+
+/**
+ * enum mesh_deferred_task_flags - mac80211 mesh deferred tasks
+ *
+ *
+ *
+ * @MESH_WORK_HOUSEKEEPING: run the periodic mesh housekeeping tasks
+ * @MESH_WORK_GROW_MPATH_TABLE: the mesh path table is full and needs
+ * to grow.
+ * @MESH_WORK_GROW_MPP_TABLE: the mesh portals table is full and needs to
+ * grow
+ * @MESH_WORK_ROOT: the mesh root station needs to send a frame
+ * @MESH_WORK_DRIFT_ADJUST: time to compensate for clock drift relative to other
+ * mesh nodes
+ * @MESH_WORK_MBSS_CHANGED: rebuild beacon and notify driver of BSS changes
+ */
+enum mesh_deferred_task_flags {
+ MESH_WORK_HOUSEKEEPING,
+ MESH_WORK_GROW_MPATH_TABLE,
+ MESH_WORK_GROW_MPP_TABLE,
+ MESH_WORK_ROOT,
+ MESH_WORK_DRIFT_ADJUST,
+ MESH_WORK_MBSS_CHANGED,
+};
+
+/**
+ * struct mesh_path - mac80211 mesh path structure
+ *
+ * @dst: mesh path destination mac address
+ * @sdata: mesh subif
+ * @next_hop: mesh neighbor to which frames for this destination will be
+ * forwarded
+ * @timer: mesh path discovery timer
+ * @frame_queue: pending queue for frames sent to this destination while the
+ * path is unresolved
+ * @sn: target sequence number
+ * @metric: current metric to this destination
+ * @hop_count: hops to destination
+ * @exp_time: in jiffies, when the path will expire or when it expired
+ * @discovery_timeout: timeout (lapse in jiffies) used for the last discovery
+ * retry
+ * @discovery_retries: number of discovery retries
+ * @flags: mesh path flags, as specified on &enum mesh_path_flags
+ * @state_lock: mesh path state lock used to protect changes to the
+ * mpath itself. No need to take this lock when adding or removing
+ * an mpath to a hash bucket on a path table.
+ * @rann_snd_addr: the RANN sender address
+ * @rann_metric: the aggregated path metric towards the root node
+ * @last_preq_to_root: Timestamp of last PREQ sent to root
+ * @is_root: the destination station of this path is a root node
+ * @is_gate: the destination station of this path is a mesh gate
+ *
+ *
+ * The combination of dst and sdata is unique in the mesh path table. Since the
+ * next_hop STA is only protected by RCU as well, deleting the STA must also
+ * remove/substitute the mesh_path structure and wait until that is no longer
+ * reachable before destroying the STA completely.
+ */
+struct mesh_path {
+ u8 dst[ETH_ALEN];
+ u8 mpp[ETH_ALEN]; /* used for MPP or MAP */
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info __rcu *next_hop;
+ struct timer_list timer;
+ struct sk_buff_head frame_queue;
+ struct rcu_head rcu;
+ u32 sn;
+ u32 metric;
+ u8 hop_count;
+ unsigned long exp_time;
+ u32 discovery_timeout;
+ u8 discovery_retries;
+ enum mesh_path_flags flags;
+ spinlock_t state_lock;
+ u8 rann_snd_addr[ETH_ALEN];
+ u32 rann_metric;
+ unsigned long last_preq_to_root;
+ bool is_root;
+ bool is_gate;
+};
+
+/**
+ * struct mesh_table
+ *
+ * @hash_buckets: array of hash buckets of the table
+ * @hashwlock: array of locks to protect write operations, one per bucket
+ * @hash_mask: 2^size_order - 1, used to compute hash idx
+ * @hash_rnd: random value used for hash computations
+ * @entries: number of entries in the table
+ * @free_node: function to free nodes of the table
+ * @copy_node: function to copy nodes of the table
+ * @size_order: determines size of the table, there will be 2^size_order hash
+ * buckets
+ * @mean_chain_len: maximum average length for the hash buckets' list, if it is
+ * reached, the table will grow
+ * @known_gates: list of known mesh gates and their mpaths by the station. The
+ * gate's mpath may or may not be resolved and active.
+ *
+ * rcu_head: RCU head to free the table
+ */
+struct mesh_table {
+ /* Number of buckets will be 2^N */
+ struct hlist_head *hash_buckets;
+ spinlock_t *hashwlock; /* One per bucket, for add/del */
+ unsigned int hash_mask; /* (2^size_order) - 1 */
+ __u32 hash_rnd; /* Used for hash generation */
+ atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */
+ void (*free_node) (struct hlist_node *p, bool free_leafs);
+ int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl);
+ int size_order;
+ int mean_chain_len;
+ struct hlist_head *known_gates;
+ spinlock_t gates_lock;
+
+ struct rcu_head rcu_head;
+};
+
+/* Recent multicast cache */
+/* RMC_BUCKETS must be a power of 2, maximum 256 */
+#define RMC_BUCKETS 256
+#define RMC_QUEUE_MAX_LEN 4
+#define RMC_TIMEOUT (3 * HZ)
+
+/**
+ * struct rmc_entry - entry in the Recent Multicast Cache
+ *
+ * @seqnum: mesh sequence number of the frame
+ * @exp_time: expiration time of the entry, in jiffies
+ * @sa: source address of the frame
+ *
+ * The Recent Multicast Cache keeps track of the latest multicast frames that
+ * have been received by a mesh interface and discards received multicast frames
+ * that are found in the cache.
+ */
+struct rmc_entry {
+ struct list_head list;
+ u32 seqnum;
+ unsigned long exp_time;
+ u8 sa[ETH_ALEN];
+};
+
+struct mesh_rmc {
+ struct list_head bucket[RMC_BUCKETS];
+ u32 idx_mask;
+};
+
+#define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
+
+#define MESH_PATH_EXPIRE (600 * HZ)
+
+/* Default maximum number of plinks per interface */
+#define MESH_MAX_PLINKS 256
+
+/* Maximum number of paths per interface */
+#define MESH_MAX_MPATHS 1024
+
+/* Number of frames buffered per destination for unresolved destinations */
+#define MESH_FRAME_QUEUE_LEN 10
+
+/* Public interfaces */
+/* Various */
+int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
+ const u8 *da, const u8 *sa);
+int ieee80211_new_mesh_header(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211s_hdr *meshhdr,
+ const char *addr4or5, const char *addr6);
+int mesh_rmc_check(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr, struct ieee80211s_hdr *mesh_hdr);
+bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *ie);
+void mesh_ids_set_default(struct ieee80211_if_mesh *mesh);
+int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_meshid_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_rsn_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_vendor_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
+void ieee80211s_init(void);
+void ieee80211s_update_metric(struct ieee80211_local *local,
+ struct sta_info *sta, struct sk_buff *skb);
+void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
+int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh);
+const struct ieee80211_mesh_sync_ops *ieee80211_mesh_sync_ops_get(u8 method);
+/* wrapper for ieee80211_bss_info_change_notify() */
+void ieee80211_mbss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+ u32 changed);
+
+/* mesh power save */
+u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata);
+u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta,
+ enum nl80211_mesh_power_mode pm);
+void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_hdr *hdr);
+void ieee80211_mps_sta_status_update(struct sta_info *sta);
+void ieee80211_mps_rx_h_sta_process(struct sta_info *sta,
+ struct ieee80211_hdr *hdr);
+void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta,
+ bool tx, bool acked);
+void ieee80211_mps_frame_release(struct sta_info *sta,
+ struct ieee802_11_elems *elems);
+
+/* Mesh paths */
+int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mesh_path_lookup(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst);
+struct mesh_path *mpp_path_lookup(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst);
+int mpp_path_add(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst, const u8 *mpp);
+struct mesh_path *
+mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx);
+struct mesh_path *
+mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx);
+void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop);
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata);
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len);
+struct mesh_path *
+mesh_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst);
+
+int mesh_path_add_gate(struct mesh_path *mpath);
+int mesh_path_send_to_gates(struct mesh_path *mpath);
+int mesh_gate_num(struct ieee80211_sub_if_data *sdata);
+
+/* Mesh plinks */
+void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
+ u8 *hw_addr, struct ieee802_11_elems *ie);
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie);
+u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata);
+void mesh_plink_broken(struct sta_info *sta);
+u32 mesh_plink_deactivate(struct sta_info *sta);
+u32 mesh_plink_open(struct sta_info *sta);
+u32 mesh_plink_block(struct sta_info *sta);
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status);
+void mesh_sta_cleanup(struct sta_info *sta);
+
+/* Private interfaces */
+/* Mesh tables */
+void mesh_mpath_table_grow(void);
+void mesh_mpp_table_grow(void);
+/* Mesh paths */
+int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
+ u8 ttl, const u8 *target, u32 target_sn,
+ u16 target_rcode, const u8 *ra);
+void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
+void mesh_path_flush_pending(struct mesh_path *mpath);
+void mesh_path_tx_pending(struct mesh_path *mpath);
+int mesh_pathtbl_init(void);
+void mesh_pathtbl_unregister(void);
+int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr);
+void mesh_path_timer(unsigned long data);
+void mesh_path_flush_by_nexthop(struct sta_info *sta);
+void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata);
+
+bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt);
+extern int mesh_paths_generation;
+extern int mpp_paths_generation;
+
+#ifdef CONFIG_MAC80211_MESH
+static inline
+u32 mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata)
+{
+ atomic_inc(&sdata->u.mesh.estab_plinks);
+ return mesh_accept_plinks_update(sdata) | BSS_CHANGED_BEACON;
+}
+
+static inline
+u32 mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata)
+{
+ atomic_dec(&sdata->u.mesh.estab_plinks);
+ return mesh_accept_plinks_update(sdata) | BSS_CHANGED_BEACON;
+}
+
+static inline int mesh_plink_free_count(struct ieee80211_sub_if_data *sdata)
+{
+ return sdata->u.mesh.mshcfg.dot11MeshMaxPeerLinks -
+ atomic_read(&sdata->u.mesh.estab_plinks);
+}
+
+static inline bool mesh_plink_availables(struct ieee80211_sub_if_data *sdata)
+{
+ return (min_t(long, mesh_plink_free_count(sdata),
+ MESH_MAX_PLINKS - sdata->local->num_sta)) > 0;
+}
+
+static inline void mesh_path_activate(struct mesh_path *mpath)
+{
+ mpath->flags |= MESH_PATH_ACTIVE | MESH_PATH_RESOLVED;
+}
+
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{
+ return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
+
+void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
+void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
+void ieee80211s_stop(void);
+#else
+static inline void
+ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{ return false; }
+static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211s_stop(void) {}
+#endif
+
+#endif /* IEEE80211S_H */
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
new file mode 100644
index 000000000..214e63b84
--- /dev/null
+++ b/net/mac80211/mesh_hwmp.c
@@ -0,0 +1,1224 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author: Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <asm/unaligned.h>
+#include "wme.h"
+#include "mesh.h"
+
+#define TEST_FRAME_LEN 8192
+#define MAX_METRIC 0xffffffff
+#define ARITH_SHIFT 8
+
+#define MAX_PREQ_QUEUE_LEN 64
+
+/* Destination only */
+#define MP_F_DO 0x1
+/* Reply and forward */
+#define MP_F_RF 0x2
+/* Unknown Sequence Number */
+#define MP_F_USN 0x01
+/* Reason code Present */
+#define MP_F_RCODE 0x02
+
+static void mesh_queue_preq(struct mesh_path *, u8);
+
+static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae)
+{
+ if (ae)
+ offset += 6;
+ return get_unaligned_le32(preq_elem + offset);
+}
+
+static inline u16 u16_field_get(const u8 *preq_elem, int offset, bool ae)
+{
+ if (ae)
+ offset += 6;
+ return get_unaligned_le16(preq_elem + offset);
+}
+
+/* HWMP IE processing macros */
+#define AE_F (1<<6)
+#define AE_F_SET(x) (*x & AE_F)
+#define PREQ_IE_FLAGS(x) (*(x))
+#define PREQ_IE_HOPCOUNT(x) (*(x + 1))
+#define PREQ_IE_TTL(x) (*(x + 2))
+#define PREQ_IE_PREQ_ID(x) u32_field_get(x, 3, 0)
+#define PREQ_IE_ORIG_ADDR(x) (x + 7)
+#define PREQ_IE_ORIG_SN(x) u32_field_get(x, 13, 0)
+#define PREQ_IE_LIFETIME(x) u32_field_get(x, 17, AE_F_SET(x))
+#define PREQ_IE_METRIC(x) u32_field_get(x, 21, AE_F_SET(x))
+#define PREQ_IE_TARGET_F(x) (*(AE_F_SET(x) ? x + 32 : x + 26))
+#define PREQ_IE_TARGET_ADDR(x) (AE_F_SET(x) ? x + 33 : x + 27)
+#define PREQ_IE_TARGET_SN(x) u32_field_get(x, 33, AE_F_SET(x))
+
+
+#define PREP_IE_FLAGS(x) PREQ_IE_FLAGS(x)
+#define PREP_IE_HOPCOUNT(x) PREQ_IE_HOPCOUNT(x)
+#define PREP_IE_TTL(x) PREQ_IE_TTL(x)
+#define PREP_IE_ORIG_ADDR(x) (AE_F_SET(x) ? x + 27 : x + 21)
+#define PREP_IE_ORIG_SN(x) u32_field_get(x, 27, AE_F_SET(x))
+#define PREP_IE_LIFETIME(x) u32_field_get(x, 13, AE_F_SET(x))
+#define PREP_IE_METRIC(x) u32_field_get(x, 17, AE_F_SET(x))
+#define PREP_IE_TARGET_ADDR(x) (x + 3)
+#define PREP_IE_TARGET_SN(x) u32_field_get(x, 9, 0)
+
+#define PERR_IE_TTL(x) (*(x))
+#define PERR_IE_TARGET_FLAGS(x) (*(x + 2))
+#define PERR_IE_TARGET_ADDR(x) (x + 3)
+#define PERR_IE_TARGET_SN(x) u32_field_get(x, 9, 0)
+#define PERR_IE_TARGET_RCODE(x) u16_field_get(x, 13, 0)
+
+#define MSEC_TO_TU(x) (x*1000/1024)
+#define SN_GT(x, y) ((s32)(y - x) < 0)
+#define SN_LT(x, y) ((s32)(x - y) < 0)
+
+#define net_traversal_jiffies(s) \
+ msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
+#define default_lifetime(s) \
+ MSEC_TO_TU(s->u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout)
+#define min_preq_int_jiff(s) \
+ (msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval))
+#define max_preq_retries(s) (s->u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries)
+#define disc_timeout_jiff(s) \
+ msecs_to_jiffies(sdata->u.mesh.mshcfg.min_discovery_timeout)
+#define root_path_confirmation_jiffies(s) \
+ msecs_to_jiffies(sdata->u.mesh.mshcfg.dot11MeshHWMPconfirmationInterval)
+
+enum mpath_frame_type {
+ MPATH_PREQ = 0,
+ MPATH_PREP,
+ MPATH_PERR,
+ MPATH_RANN
+};
+
+static const u8 broadcast_addr[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
+ const u8 *orig_addr, u32 orig_sn,
+ u8 target_flags, const u8 *target,
+ u32 target_sn, const u8 *da,
+ u8 hop_count, u8 ttl,
+ u32 lifetime, u32 metric, u32 preq_id,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos, ie_len;
+ int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) +
+ sizeof(mgmt->u.action.u.mesh_action);
+
+ skb = dev_alloc_skb(local->tx_headroom +
+ hdr_len +
+ 2 + 37); /* max HWMP IE */
+ if (!skb)
+ return -1;
+ skb_reserve(skb, local->tx_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len);
+ memset(mgmt, 0, hdr_len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ /* BSSID == SA */
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION;
+ mgmt->u.action.u.mesh_action.action_code =
+ WLAN_MESH_ACTION_HWMP_PATH_SELECTION;
+
+ switch (action) {
+ case MPATH_PREQ:
+ mhwmp_dbg(sdata, "sending PREQ to %pM\n", target);
+ ie_len = 37;
+ pos = skb_put(skb, 2 + ie_len);
+ *pos++ = WLAN_EID_PREQ;
+ break;
+ case MPATH_PREP:
+ mhwmp_dbg(sdata, "sending PREP to %pM\n", orig_addr);
+ ie_len = 31;
+ pos = skb_put(skb, 2 + ie_len);
+ *pos++ = WLAN_EID_PREP;
+ break;
+ case MPATH_RANN:
+ mhwmp_dbg(sdata, "sending RANN from %pM\n", orig_addr);
+ ie_len = sizeof(struct ieee80211_rann_ie);
+ pos = skb_put(skb, 2 + ie_len);
+ *pos++ = WLAN_EID_RANN;
+ break;
+ default:
+ kfree_skb(skb);
+ return -ENOTSUPP;
+ }
+ *pos++ = ie_len;
+ *pos++ = flags;
+ *pos++ = hop_count;
+ *pos++ = ttl;
+ if (action == MPATH_PREP) {
+ memcpy(pos, target, ETH_ALEN);
+ pos += ETH_ALEN;
+ put_unaligned_le32(target_sn, pos);
+ pos += 4;
+ } else {
+ if (action == MPATH_PREQ) {
+ put_unaligned_le32(preq_id, pos);
+ pos += 4;
+ }
+ memcpy(pos, orig_addr, ETH_ALEN);
+ pos += ETH_ALEN;
+ put_unaligned_le32(orig_sn, pos);
+ pos += 4;
+ }
+ put_unaligned_le32(lifetime, pos); /* interval for RANN */
+ pos += 4;
+ put_unaligned_le32(metric, pos);
+ pos += 4;
+ if (action == MPATH_PREQ) {
+ *pos++ = 1; /* destination count */
+ *pos++ = target_flags;
+ memcpy(pos, target, ETH_ALEN);
+ pos += ETH_ALEN;
+ put_unaligned_le32(target_sn, pos);
+ pos += 4;
+ } else if (action == MPATH_PREP) {
+ memcpy(pos, orig_addr, ETH_ALEN);
+ pos += ETH_ALEN;
+ put_unaligned_le32(orig_sn, pos);
+ pos += 4;
+ }
+
+ ieee80211_tx_skb(sdata, skb);
+ return 0;
+}
+
+
+/* Headroom is not adjusted. Caller should ensure that skb has sufficient
+ * headroom in case the frame is encrypted. */
+static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, 0);
+ skb_set_transport_header(skb, 0);
+
+ /* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
+ skb_set_queue_mapping(skb, IEEE80211_AC_VO);
+ skb->priority = 7;
+
+ info->control.vif = &sdata->vif;
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ ieee80211_set_qos_hdr(sdata, skb);
+ ieee80211_mps_set_frame_flags(sdata, NULL, hdr);
+}
+
+/**
+ * mesh_path_error_tx - Sends a PERR mesh management frame
+ *
+ * @ttl: allowed remaining hops
+ * @target: broken destination
+ * @target_sn: SN of the broken destination
+ * @target_rcode: reason code for this PERR
+ * @ra: node this frame is addressed to
+ * @sdata: local mesh subif
+ *
+ * Note: This function may be called with driver locks taken that the driver
+ * also acquires in the TX path. To avoid a deadlock we don't transmit the
+ * frame directly but add it to the pending queue instead.
+ */
+int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
+ u8 ttl, const u8 *target, u32 target_sn,
+ u16 target_rcode, const u8 *ra)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos, ie_len;
+ int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) +
+ sizeof(mgmt->u.action.u.mesh_action);
+
+ if (time_before(jiffies, ifmsh->next_perr))
+ return -EAGAIN;
+
+ skb = dev_alloc_skb(local->tx_headroom +
+ sdata->encrypt_headroom +
+ IEEE80211_ENCRYPT_TAILROOM +
+ hdr_len +
+ 2 + 15 /* PERR IE */);
+ if (!skb)
+ return -1;
+ skb_reserve(skb, local->tx_headroom + sdata->encrypt_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len);
+ memset(mgmt, 0, hdr_len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ memcpy(mgmt->da, ra, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ /* BSSID == SA */
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION;
+ mgmt->u.action.u.mesh_action.action_code =
+ WLAN_MESH_ACTION_HWMP_PATH_SELECTION;
+ ie_len = 15;
+ pos = skb_put(skb, 2 + ie_len);
+ *pos++ = WLAN_EID_PERR;
+ *pos++ = ie_len;
+ /* ttl */
+ *pos++ = ttl;
+ /* number of destinations */
+ *pos++ = 1;
+ /*
+ * flags bit, bit 1 is unset if we know the sequence number and
+ * bit 2 is set if we have a reason code
+ */
+ *pos = 0;
+ if (!target_sn)
+ *pos |= MP_F_USN;
+ if (target_rcode)
+ *pos |= MP_F_RCODE;
+ pos++;
+ memcpy(pos, target, ETH_ALEN);
+ pos += ETH_ALEN;
+ put_unaligned_le32(target_sn, pos);
+ pos += 4;
+ put_unaligned_le16(target_rcode, pos);
+
+ /* see note in function header */
+ prepare_frame_for_deferred_tx(sdata, skb);
+ ifmsh->next_perr = TU_TO_EXP_TIME(
+ ifmsh->mshcfg.dot11MeshHWMPperrMinInterval);
+ ieee80211_add_pending_skb(local, skb);
+ return 0;
+}
+
+void ieee80211s_update_metric(struct ieee80211_local *local,
+ struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ int failed;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return;
+
+ failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK);
+
+ /* moving average, scaled to 100 */
+ sta->fail_avg = ((80 * sta->fail_avg + 5) / 100 + 20 * failed);
+ if (sta->fail_avg > 95)
+ mesh_plink_broken(sta);
+}
+
+static u32 airtime_link_metric_get(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ struct rate_info rinfo;
+ /* This should be adjusted for each device */
+ int device_constant = 1 << ARITH_SHIFT;
+ int test_frame_len = TEST_FRAME_LEN << ARITH_SHIFT;
+ int s_unit = 1 << ARITH_SHIFT;
+ int rate, err;
+ u32 tx_time, estimated_retx;
+ u64 result;
+
+ if (sta->fail_avg >= 100)
+ return MAX_METRIC;
+
+ sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo);
+ rate = cfg80211_calculate_bitrate(&rinfo);
+ if (WARN_ON(!rate))
+ return MAX_METRIC;
+
+ err = (sta->fail_avg << ARITH_SHIFT) / 100;
+
+ /* bitrate is in units of 100 Kbps, while we need rate in units of
+ * 1Mbps. This will be corrected on tx_time computation.
+ */
+ tx_time = (device_constant + 10 * test_frame_len / rate);
+ estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err));
+ result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ;
+ return (u32)result;
+}
+
+/**
+ * hwmp_route_info_get - Update routing info to originator and transmitter
+ *
+ * @sdata: local mesh subif
+ * @mgmt: mesh management frame
+ * @hwmp_ie: hwmp information element (PREP or PREQ)
+ * @action: type of hwmp ie
+ *
+ * This function updates the path routing information to the originator and the
+ * transmitter of a HWMP PREQ or PREP frame.
+ *
+ * Returns: metric to frame originator or 0 if the frame should not be further
+ * processed
+ *
+ * Notes: this function is the only place (besides user-provided info) where
+ * path routing information is updated.
+ */
+static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ const u8 *hwmp_ie, enum mpath_frame_type action)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct mesh_path *mpath;
+ struct sta_info *sta;
+ bool fresh_info;
+ const u8 *orig_addr, *ta;
+ u32 orig_sn, orig_metric;
+ unsigned long orig_lifetime, exp_time;
+ u32 last_hop_metric, new_metric;
+ bool process = true;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->sa);
+ if (!sta) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ last_hop_metric = airtime_link_metric_get(local, sta);
+ /* Update and check originator routing info */
+ fresh_info = true;
+
+ switch (action) {
+ case MPATH_PREQ:
+ orig_addr = PREQ_IE_ORIG_ADDR(hwmp_ie);
+ orig_sn = PREQ_IE_ORIG_SN(hwmp_ie);
+ orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie);
+ orig_metric = PREQ_IE_METRIC(hwmp_ie);
+ break;
+ case MPATH_PREP:
+ /* Originator here refers to the MP that was the target in the
+ * Path Request. We divert from the nomenclature in the draft
+ * so that we can easily use a single function to gather path
+ * information from both PREQ and PREP frames.
+ */
+ orig_addr = PREP_IE_TARGET_ADDR(hwmp_ie);
+ orig_sn = PREP_IE_TARGET_SN(hwmp_ie);
+ orig_lifetime = PREP_IE_LIFETIME(hwmp_ie);
+ orig_metric = PREP_IE_METRIC(hwmp_ie);
+ break;
+ default:
+ rcu_read_unlock();
+ return 0;
+ }
+ new_metric = orig_metric + last_hop_metric;
+ if (new_metric < orig_metric)
+ new_metric = MAX_METRIC;
+ exp_time = TU_TO_EXP_TIME(orig_lifetime);
+
+ if (ether_addr_equal(orig_addr, sdata->vif.addr)) {
+ /* This MP is the originator, we are not interested in this
+ * frame, except for updating transmitter's path info.
+ */
+ process = false;
+ fresh_info = false;
+ } else {
+ mpath = mesh_path_lookup(sdata, orig_addr);
+ if (mpath) {
+ spin_lock_bh(&mpath->state_lock);
+ if (mpath->flags & MESH_PATH_FIXED)
+ fresh_info = false;
+ else if ((mpath->flags & MESH_PATH_ACTIVE) &&
+ (mpath->flags & MESH_PATH_SN_VALID)) {
+ if (SN_GT(mpath->sn, orig_sn) ||
+ (mpath->sn == orig_sn &&
+ new_metric >= mpath->metric)) {
+ process = false;
+ fresh_info = false;
+ }
+ }
+ } else {
+ mpath = mesh_path_add(sdata, orig_addr);
+ if (IS_ERR(mpath)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ spin_lock_bh(&mpath->state_lock);
+ }
+
+ if (fresh_info) {
+ mesh_path_assign_nexthop(mpath, sta);
+ mpath->flags |= MESH_PATH_SN_VALID;
+ mpath->metric = new_metric;
+ mpath->sn = orig_sn;
+ mpath->exp_time = time_after(mpath->exp_time, exp_time)
+ ? mpath->exp_time : exp_time;
+ mesh_path_activate(mpath);
+ spin_unlock_bh(&mpath->state_lock);
+ mesh_path_tx_pending(mpath);
+ /* draft says preq_id should be saved to, but there does
+ * not seem to be any use for it, skipping by now
+ */
+ } else
+ spin_unlock_bh(&mpath->state_lock);
+ }
+
+ /* Update and check transmitter routing info */
+ ta = mgmt->sa;
+ if (ether_addr_equal(orig_addr, ta))
+ fresh_info = false;
+ else {
+ fresh_info = true;
+
+ mpath = mesh_path_lookup(sdata, ta);
+ if (mpath) {
+ spin_lock_bh(&mpath->state_lock);
+ if ((mpath->flags & MESH_PATH_FIXED) ||
+ ((mpath->flags & MESH_PATH_ACTIVE) &&
+ (last_hop_metric > mpath->metric)))
+ fresh_info = false;
+ } else {
+ mpath = mesh_path_add(sdata, ta);
+ if (IS_ERR(mpath)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ spin_lock_bh(&mpath->state_lock);
+ }
+
+ if (fresh_info) {
+ mesh_path_assign_nexthop(mpath, sta);
+ mpath->metric = last_hop_metric;
+ mpath->exp_time = time_after(mpath->exp_time, exp_time)
+ ? mpath->exp_time : exp_time;
+ mesh_path_activate(mpath);
+ spin_unlock_bh(&mpath->state_lock);
+ mesh_path_tx_pending(mpath);
+ } else
+ spin_unlock_bh(&mpath->state_lock);
+ }
+
+ rcu_read_unlock();
+
+ return process ? new_metric : 0;
+}
+
+static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ const u8 *preq_elem, u32 metric)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_path *mpath = NULL;
+ const u8 *target_addr, *orig_addr;
+ const u8 *da;
+ u8 target_flags, ttl, flags;
+ u32 orig_sn, target_sn, lifetime, orig_metric;
+ bool reply = false;
+ bool forward = true;
+ bool root_is_gate;
+
+ /* Update target SN, if present */
+ target_addr = PREQ_IE_TARGET_ADDR(preq_elem);
+ orig_addr = PREQ_IE_ORIG_ADDR(preq_elem);
+ target_sn = PREQ_IE_TARGET_SN(preq_elem);
+ orig_sn = PREQ_IE_ORIG_SN(preq_elem);
+ target_flags = PREQ_IE_TARGET_F(preq_elem);
+ orig_metric = metric;
+ /* Proactive PREQ gate announcements */
+ flags = PREQ_IE_FLAGS(preq_elem);
+ root_is_gate = !!(flags & RANN_FLAG_IS_GATE);
+
+ mhwmp_dbg(sdata, "received PREQ from %pM\n", orig_addr);
+
+ if (ether_addr_equal(target_addr, sdata->vif.addr)) {
+ mhwmp_dbg(sdata, "PREQ is for us\n");
+ forward = false;
+ reply = true;
+ metric = 0;
+ if (time_after(jiffies, ifmsh->last_sn_update +
+ net_traversal_jiffies(sdata)) ||
+ time_before(jiffies, ifmsh->last_sn_update)) {
+ ++ifmsh->sn;
+ ifmsh->last_sn_update = jiffies;
+ }
+ target_sn = ifmsh->sn;
+ } else if (is_broadcast_ether_addr(target_addr) &&
+ (target_flags & IEEE80211_PREQ_TO_FLAG)) {
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, orig_addr);
+ if (mpath) {
+ if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) {
+ reply = true;
+ target_addr = sdata->vif.addr;
+ target_sn = ++ifmsh->sn;
+ metric = 0;
+ ifmsh->last_sn_update = jiffies;
+ }
+ if (root_is_gate)
+ mesh_path_add_gate(mpath);
+ }
+ rcu_read_unlock();
+ } else {
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, target_addr);
+ if (mpath) {
+ if ((!(mpath->flags & MESH_PATH_SN_VALID)) ||
+ SN_LT(mpath->sn, target_sn)) {
+ mpath->sn = target_sn;
+ mpath->flags |= MESH_PATH_SN_VALID;
+ } else if ((!(target_flags & MP_F_DO)) &&
+ (mpath->flags & MESH_PATH_ACTIVE)) {
+ reply = true;
+ metric = mpath->metric;
+ target_sn = mpath->sn;
+ if (target_flags & MP_F_RF)
+ target_flags |= MP_F_DO;
+ else
+ forward = false;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (reply) {
+ lifetime = PREQ_IE_LIFETIME(preq_elem);
+ ttl = ifmsh->mshcfg.element_ttl;
+ if (ttl != 0) {
+ mhwmp_dbg(sdata, "replying to the PREQ\n");
+ mesh_path_sel_frame_tx(MPATH_PREP, 0, orig_addr,
+ orig_sn, 0, target_addr,
+ target_sn, mgmt->sa, 0, ttl,
+ lifetime, metric, 0, sdata);
+ } else {
+ ifmsh->mshstats.dropped_frames_ttl++;
+ }
+ }
+
+ if (forward && ifmsh->mshcfg.dot11MeshForwarding) {
+ u32 preq_id;
+ u8 hopcount;
+
+ ttl = PREQ_IE_TTL(preq_elem);
+ lifetime = PREQ_IE_LIFETIME(preq_elem);
+ if (ttl <= 1) {
+ ifmsh->mshstats.dropped_frames_ttl++;
+ return;
+ }
+ mhwmp_dbg(sdata, "forwarding the PREQ from %pM\n", orig_addr);
+ --ttl;
+ preq_id = PREQ_IE_PREQ_ID(preq_elem);
+ hopcount = PREQ_IE_HOPCOUNT(preq_elem) + 1;
+ da = (mpath && mpath->is_root) ?
+ mpath->rann_snd_addr : broadcast_addr;
+
+ if (flags & IEEE80211_PREQ_PROACTIVE_PREP_FLAG) {
+ target_addr = PREQ_IE_TARGET_ADDR(preq_elem);
+ target_sn = PREQ_IE_TARGET_SN(preq_elem);
+ metric = orig_metric;
+ }
+
+ mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr,
+ orig_sn, target_flags, target_addr,
+ target_sn, da, hopcount, ttl, lifetime,
+ metric, preq_id, sdata);
+ if (!is_multicast_ether_addr(da))
+ ifmsh->mshstats.fwded_unicast++;
+ else
+ ifmsh->mshstats.fwded_mcast++;
+ ifmsh->mshstats.fwded_frames++;
+ }
+}
+
+
+static inline struct sta_info *
+next_hop_deref_protected(struct mesh_path *mpath)
+{
+ return rcu_dereference_protected(mpath->next_hop,
+ lockdep_is_held(&mpath->state_lock));
+}
+
+
+static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ const u8 *prep_elem, u32 metric)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_path *mpath;
+ const u8 *target_addr, *orig_addr;
+ u8 ttl, hopcount, flags;
+ u8 next_hop[ETH_ALEN];
+ u32 target_sn, orig_sn, lifetime;
+
+ mhwmp_dbg(sdata, "received PREP from %pM\n",
+ PREP_IE_TARGET_ADDR(prep_elem));
+
+ orig_addr = PREP_IE_ORIG_ADDR(prep_elem);
+ if (ether_addr_equal(orig_addr, sdata->vif.addr))
+ /* destination, no forwarding required */
+ return;
+
+ if (!ifmsh->mshcfg.dot11MeshForwarding)
+ return;
+
+ ttl = PREP_IE_TTL(prep_elem);
+ if (ttl <= 1) {
+ sdata->u.mesh.mshstats.dropped_frames_ttl++;
+ return;
+ }
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, orig_addr);
+ if (mpath)
+ spin_lock_bh(&mpath->state_lock);
+ else
+ goto fail;
+ if (!(mpath->flags & MESH_PATH_ACTIVE)) {
+ spin_unlock_bh(&mpath->state_lock);
+ goto fail;
+ }
+ memcpy(next_hop, next_hop_deref_protected(mpath)->sta.addr, ETH_ALEN);
+ spin_unlock_bh(&mpath->state_lock);
+ --ttl;
+ flags = PREP_IE_FLAGS(prep_elem);
+ lifetime = PREP_IE_LIFETIME(prep_elem);
+ hopcount = PREP_IE_HOPCOUNT(prep_elem) + 1;
+ target_addr = PREP_IE_TARGET_ADDR(prep_elem);
+ target_sn = PREP_IE_TARGET_SN(prep_elem);
+ orig_sn = PREP_IE_ORIG_SN(prep_elem);
+
+ mesh_path_sel_frame_tx(MPATH_PREP, flags, orig_addr, orig_sn, 0,
+ target_addr, target_sn, next_hop, hopcount,
+ ttl, lifetime, metric, 0, sdata);
+ rcu_read_unlock();
+
+ sdata->u.mesh.mshstats.fwded_unicast++;
+ sdata->u.mesh.mshstats.fwded_frames++;
+ return;
+
+fail:
+ rcu_read_unlock();
+ sdata->u.mesh.mshstats.dropped_frames_no_route++;
+}
+
+static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ const u8 *perr_elem)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_path *mpath;
+ u8 ttl;
+ const u8 *ta, *target_addr;
+ u32 target_sn;
+ u16 target_rcode;
+
+ ta = mgmt->sa;
+ ttl = PERR_IE_TTL(perr_elem);
+ if (ttl <= 1) {
+ ifmsh->mshstats.dropped_frames_ttl++;
+ return;
+ }
+ ttl--;
+ target_addr = PERR_IE_TARGET_ADDR(perr_elem);
+ target_sn = PERR_IE_TARGET_SN(perr_elem);
+ target_rcode = PERR_IE_TARGET_RCODE(perr_elem);
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, target_addr);
+ if (mpath) {
+ struct sta_info *sta;
+
+ spin_lock_bh(&mpath->state_lock);
+ sta = next_hop_deref_protected(mpath);
+ if (mpath->flags & MESH_PATH_ACTIVE &&
+ ether_addr_equal(ta, sta->sta.addr) &&
+ (!(mpath->flags & MESH_PATH_SN_VALID) ||
+ SN_GT(target_sn, mpath->sn))) {
+ mpath->flags &= ~MESH_PATH_ACTIVE;
+ mpath->sn = target_sn;
+ spin_unlock_bh(&mpath->state_lock);
+ if (!ifmsh->mshcfg.dot11MeshForwarding)
+ goto endperr;
+ mesh_path_error_tx(sdata, ttl, target_addr,
+ target_sn, target_rcode,
+ broadcast_addr);
+ } else
+ spin_unlock_bh(&mpath->state_lock);
+ }
+endperr:
+ rcu_read_unlock();
+}
+
+static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ const struct ieee80211_rann_ie *rann)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct mesh_path *mpath;
+ u8 ttl, flags, hopcount;
+ const u8 *orig_addr;
+ u32 orig_sn, metric, metric_txsta, interval;
+ bool root_is_gate;
+
+ ttl = rann->rann_ttl;
+ flags = rann->rann_flags;
+ root_is_gate = !!(flags & RANN_FLAG_IS_GATE);
+ orig_addr = rann->rann_addr;
+ orig_sn = le32_to_cpu(rann->rann_seq);
+ interval = le32_to_cpu(rann->rann_interval);
+ hopcount = rann->rann_hopcount;
+ hopcount++;
+ metric = le32_to_cpu(rann->rann_metric);
+
+ /* Ignore our own RANNs */
+ if (ether_addr_equal(orig_addr, sdata->vif.addr))
+ return;
+
+ mhwmp_dbg(sdata,
+ "received RANN from %pM via neighbour %pM (is_gate=%d)\n",
+ orig_addr, mgmt->sa, root_is_gate);
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->sa);
+ if (!sta) {
+ rcu_read_unlock();
+ return;
+ }
+
+ metric_txsta = airtime_link_metric_get(local, sta);
+
+ mpath = mesh_path_lookup(sdata, orig_addr);
+ if (!mpath) {
+ mpath = mesh_path_add(sdata, orig_addr);
+ if (IS_ERR(mpath)) {
+ rcu_read_unlock();
+ sdata->u.mesh.mshstats.dropped_frames_no_route++;
+ return;
+ }
+ }
+
+ if (!(SN_LT(mpath->sn, orig_sn)) &&
+ !(mpath->sn == orig_sn && metric < mpath->rann_metric)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if ((!(mpath->flags & (MESH_PATH_ACTIVE | MESH_PATH_RESOLVING)) ||
+ (time_after(jiffies, mpath->last_preq_to_root +
+ root_path_confirmation_jiffies(sdata)) ||
+ time_before(jiffies, mpath->last_preq_to_root))) &&
+ !(mpath->flags & MESH_PATH_FIXED) && (ttl != 0)) {
+ mhwmp_dbg(sdata,
+ "time to refresh root mpath %pM\n",
+ orig_addr);
+ mesh_queue_preq(mpath, PREQ_Q_F_START | PREQ_Q_F_REFRESH);
+ mpath->last_preq_to_root = jiffies;
+ }
+
+ mpath->sn = orig_sn;
+ mpath->rann_metric = metric + metric_txsta;
+ mpath->is_root = true;
+ /* Recording RANNs sender address to send individually
+ * addressed PREQs destined for root mesh STA */
+ memcpy(mpath->rann_snd_addr, mgmt->sa, ETH_ALEN);
+
+ if (root_is_gate)
+ mesh_path_add_gate(mpath);
+
+ if (ttl <= 1) {
+ ifmsh->mshstats.dropped_frames_ttl++;
+ rcu_read_unlock();
+ return;
+ }
+ ttl--;
+
+ if (ifmsh->mshcfg.dot11MeshForwarding) {
+ mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr,
+ orig_sn, 0, NULL, 0, broadcast_addr,
+ hopcount, ttl, interval,
+ metric + metric_txsta, 0, sdata);
+ }
+
+ rcu_read_unlock();
+}
+
+
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee802_11_elems elems;
+ size_t baselen;
+ u32 last_hop_metric;
+ struct sta_info *sta;
+
+ /* need action_code */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ return;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->sa);
+ if (!sta || sta->plink_state != NL80211_PLINK_ESTAB) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt;
+ ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable,
+ len - baselen, false, &elems);
+
+ if (elems.preq) {
+ if (elems.preq_len != 37)
+ /* Right now we support just 1 destination and no AE */
+ return;
+ last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.preq,
+ MPATH_PREQ);
+ if (last_hop_metric)
+ hwmp_preq_frame_process(sdata, mgmt, elems.preq,
+ last_hop_metric);
+ }
+ if (elems.prep) {
+ if (elems.prep_len != 31)
+ /* Right now we support no AE */
+ return;
+ last_hop_metric = hwmp_route_info_get(sdata, mgmt, elems.prep,
+ MPATH_PREP);
+ if (last_hop_metric)
+ hwmp_prep_frame_process(sdata, mgmt, elems.prep,
+ last_hop_metric);
+ }
+ if (elems.perr) {
+ if (elems.perr_len != 15)
+ /* Right now we support only one destination per PERR */
+ return;
+ hwmp_perr_frame_process(sdata, mgmt, elems.perr);
+ }
+ if (elems.rann)
+ hwmp_rann_frame_process(sdata, mgmt, elems.rann);
+}
+
+/**
+ * mesh_queue_preq - queue a PREQ to a given destination
+ *
+ * @mpath: mesh path to discover
+ * @flags: special attributes of the PREQ to be sent
+ *
+ * Locking: the function must be called from within a rcu read lock block.
+ *
+ */
+static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
+{
+ struct ieee80211_sub_if_data *sdata = mpath->sdata;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_preq_queue *preq_node;
+
+ preq_node = kmalloc(sizeof(struct mesh_preq_queue), GFP_ATOMIC);
+ if (!preq_node) {
+ mhwmp_dbg(sdata, "could not allocate PREQ node\n");
+ return;
+ }
+
+ spin_lock_bh(&ifmsh->mesh_preq_queue_lock);
+ if (ifmsh->preq_queue_len == MAX_PREQ_QUEUE_LEN) {
+ spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+ kfree(preq_node);
+ if (printk_ratelimit())
+ mhwmp_dbg(sdata, "PREQ node queue full\n");
+ return;
+ }
+
+ spin_lock(&mpath->state_lock);
+ if (mpath->flags & MESH_PATH_REQ_QUEUED) {
+ spin_unlock(&mpath->state_lock);
+ spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+ kfree(preq_node);
+ return;
+ }
+
+ memcpy(preq_node->dst, mpath->dst, ETH_ALEN);
+ preq_node->flags = flags;
+
+ mpath->flags |= MESH_PATH_REQ_QUEUED;
+ spin_unlock(&mpath->state_lock);
+
+ list_add_tail(&preq_node->list, &ifmsh->preq_queue.list);
+ ++ifmsh->preq_queue_len;
+ spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+
+ if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata)))
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+
+ else if (time_before(jiffies, ifmsh->last_preq)) {
+ /* avoid long wait if did not send preqs for a long time
+ * and jiffies wrapped around
+ */
+ ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ } else
+ mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq +
+ min_preq_int_jiff(sdata));
+}
+
+/**
+ * mesh_path_start_discovery - launch a path discovery from the PREQ queue
+ *
+ * @sdata: local mesh subif
+ */
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct mesh_preq_queue *preq_node;
+ struct mesh_path *mpath;
+ u8 ttl, target_flags;
+ const u8 *da;
+ u32 lifetime;
+
+ spin_lock_bh(&ifmsh->mesh_preq_queue_lock);
+ if (!ifmsh->preq_queue_len ||
+ time_before(jiffies, ifmsh->last_preq +
+ min_preq_int_jiff(sdata))) {
+ spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+ return;
+ }
+
+ preq_node = list_first_entry(&ifmsh->preq_queue.list,
+ struct mesh_preq_queue, list);
+ list_del(&preq_node->list);
+ --ifmsh->preq_queue_len;
+ spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, preq_node->dst);
+ if (!mpath)
+ goto enddiscovery;
+
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags &= ~MESH_PATH_REQ_QUEUED;
+ if (preq_node->flags & PREQ_Q_F_START) {
+ if (mpath->flags & MESH_PATH_RESOLVING) {
+ spin_unlock_bh(&mpath->state_lock);
+ goto enddiscovery;
+ } else {
+ mpath->flags &= ~MESH_PATH_RESOLVED;
+ mpath->flags |= MESH_PATH_RESOLVING;
+ mpath->discovery_retries = 0;
+ mpath->discovery_timeout = disc_timeout_jiff(sdata);
+ }
+ } else if (!(mpath->flags & MESH_PATH_RESOLVING) ||
+ mpath->flags & MESH_PATH_RESOLVED) {
+ mpath->flags &= ~MESH_PATH_RESOLVING;
+ spin_unlock_bh(&mpath->state_lock);
+ goto enddiscovery;
+ }
+
+ ifmsh->last_preq = jiffies;
+
+ if (time_after(jiffies, ifmsh->last_sn_update +
+ net_traversal_jiffies(sdata)) ||
+ time_before(jiffies, ifmsh->last_sn_update)) {
+ ++ifmsh->sn;
+ sdata->u.mesh.last_sn_update = jiffies;
+ }
+ lifetime = default_lifetime(sdata);
+ ttl = sdata->u.mesh.mshcfg.element_ttl;
+ if (ttl == 0) {
+ sdata->u.mesh.mshstats.dropped_frames_ttl++;
+ spin_unlock_bh(&mpath->state_lock);
+ goto enddiscovery;
+ }
+
+ if (preq_node->flags & PREQ_Q_F_REFRESH)
+ target_flags = MP_F_DO;
+ else
+ target_flags = MP_F_RF;
+
+ spin_unlock_bh(&mpath->state_lock);
+ da = (mpath->is_root) ? mpath->rann_snd_addr : broadcast_addr;
+ mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->vif.addr, ifmsh->sn,
+ target_flags, mpath->dst, mpath->sn, da, 0,
+ ttl, lifetime, 0, ifmsh->preq_id++, sdata);
+ mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout);
+
+enddiscovery:
+ rcu_read_unlock();
+ kfree(preq_node);
+}
+
+/**
+ * mesh_nexthop_resolve - lookup next hop; conditionally start path discovery
+ *
+ * @skb: 802.11 frame to be sent
+ * @sdata: network subif the frame will be sent through
+ *
+ * Lookup next hop for given skb and start path discovery if no
+ * forwarding information is found.
+ *
+ * Returns: 0 if the next hop was found and -ENOENT if the frame was queued.
+ * skb is freeed here if no mpath could be allocated.
+ */
+int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct mesh_path *mpath;
+ struct sk_buff *skb_to_free = NULL;
+ u8 *target_addr = hdr->addr3;
+ int err = 0;
+
+ /* Nulls are only sent to peers for PS and should be pre-addressed */
+ if (ieee80211_is_qos_nullfunc(hdr->frame_control))
+ return 0;
+
+ rcu_read_lock();
+ err = mesh_nexthop_lookup(sdata, skb);
+ if (!err)
+ goto endlookup;
+
+ /* no nexthop found, start resolving */
+ mpath = mesh_path_lookup(sdata, target_addr);
+ if (!mpath) {
+ mpath = mesh_path_add(sdata, target_addr);
+ if (IS_ERR(mpath)) {
+ mesh_path_discard_frame(sdata, skb);
+ err = PTR_ERR(mpath);
+ goto endlookup;
+ }
+ }
+
+ if (!(mpath->flags & MESH_PATH_RESOLVING))
+ mesh_queue_preq(mpath, PREQ_Q_F_START);
+
+ if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN)
+ skb_to_free = skb_dequeue(&mpath->frame_queue);
+
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ ieee80211_set_qos_hdr(sdata, skb);
+ skb_queue_tail(&mpath->frame_queue, skb);
+ err = -ENOENT;
+ if (skb_to_free)
+ mesh_path_discard_frame(sdata, skb_to_free);
+
+endlookup:
+ rcu_read_unlock();
+ return err;
+}
+
+/**
+ * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. Calling
+ * this function is considered "using" the associated mpath, so preempt a path
+ * refresh if this mpath expires soon.
+ *
+ * @skb: 802.11 frame to be sent
+ * @sdata: network subif the frame will be sent through
+ *
+ * Returns: 0 if the next hop was found. Nonzero otherwise.
+ */
+int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct mesh_path *mpath;
+ struct sta_info *next_hop;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ u8 *target_addr = hdr->addr3;
+ int err = -ENOENT;
+
+ rcu_read_lock();
+ mpath = mesh_path_lookup(sdata, target_addr);
+
+ if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE))
+ goto endlookup;
+
+ if (time_after(jiffies,
+ mpath->exp_time -
+ msecs_to_jiffies(sdata->u.mesh.mshcfg.path_refresh_time)) &&
+ ether_addr_equal(sdata->vif.addr, hdr->addr4) &&
+ !(mpath->flags & MESH_PATH_RESOLVING) &&
+ !(mpath->flags & MESH_PATH_FIXED))
+ mesh_queue_preq(mpath, PREQ_Q_F_START | PREQ_Q_F_REFRESH);
+
+ next_hop = rcu_dereference(mpath->next_hop);
+ if (next_hop) {
+ memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN);
+ memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ ieee80211_mps_set_frame_flags(sdata, next_hop, hdr);
+ err = 0;
+ }
+
+endlookup:
+ rcu_read_unlock();
+ return err;
+}
+
+void mesh_path_timer(unsigned long data)
+{
+ struct mesh_path *mpath = (void *) data;
+ struct ieee80211_sub_if_data *sdata = mpath->sdata;
+ int ret;
+
+ if (sdata->local->quiescing)
+ return;
+
+ spin_lock_bh(&mpath->state_lock);
+ if (mpath->flags & MESH_PATH_RESOLVED ||
+ (!(mpath->flags & MESH_PATH_RESOLVING))) {
+ mpath->flags &= ~(MESH_PATH_RESOLVING | MESH_PATH_RESOLVED);
+ spin_unlock_bh(&mpath->state_lock);
+ } else if (mpath->discovery_retries < max_preq_retries(sdata)) {
+ ++mpath->discovery_retries;
+ mpath->discovery_timeout *= 2;
+ mpath->flags &= ~MESH_PATH_REQ_QUEUED;
+ spin_unlock_bh(&mpath->state_lock);
+ mesh_queue_preq(mpath, 0);
+ } else {
+ mpath->flags = 0;
+ mpath->exp_time = jiffies;
+ spin_unlock_bh(&mpath->state_lock);
+ if (!mpath->is_gate && mesh_gate_num(sdata) > 0) {
+ ret = mesh_path_send_to_gates(mpath);
+ if (ret)
+ mhwmp_dbg(sdata, "no gate was reachable\n");
+ } else
+ mesh_path_flush_pending(mpath);
+ }
+}
+
+void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u32 interval = ifmsh->mshcfg.dot11MeshHWMPRannInterval;
+ u8 flags, target_flags = 0;
+
+ flags = (ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol)
+ ? RANN_FLAG_IS_GATE : 0;
+
+ switch (ifmsh->mshcfg.dot11MeshHWMPRootMode) {
+ case IEEE80211_PROACTIVE_RANN:
+ mesh_path_sel_frame_tx(MPATH_RANN, flags, sdata->vif.addr,
+ ++ifmsh->sn, 0, NULL, 0, broadcast_addr,
+ 0, ifmsh->mshcfg.element_ttl,
+ interval, 0, 0, sdata);
+ break;
+ case IEEE80211_PROACTIVE_PREQ_WITH_PREP:
+ flags |= IEEE80211_PREQ_PROACTIVE_PREP_FLAG;
+ case IEEE80211_PROACTIVE_PREQ_NO_PREP:
+ interval = ifmsh->mshcfg.dot11MeshHWMPactivePathToRootTimeout;
+ target_flags |= IEEE80211_PREQ_TO_FLAG |
+ IEEE80211_PREQ_USN_FLAG;
+ mesh_path_sel_frame_tx(MPATH_PREQ, flags, sdata->vif.addr,
+ ++ifmsh->sn, target_flags,
+ (u8 *) broadcast_addr, 0, broadcast_addr,
+ 0, ifmsh->mshcfg.element_ttl, interval,
+ 0, ifmsh->preq_id++, sdata);
+ break;
+ default:
+ mhwmp_dbg(sdata, "Proactive mechanism not supported\n");
+ return;
+ }
+}
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
new file mode 100644
index 000000000..b890e225a
--- /dev/null
+++ b/net/mac80211/mesh_pathtbl.c
@@ -0,0 +1,1142 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author: Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <net/mac80211.h>
+#include "wme.h"
+#include "ieee80211_i.h"
+#include "mesh.h"
+
+/* There will be initially 2^INIT_PATHS_SIZE_ORDER buckets */
+#define INIT_PATHS_SIZE_ORDER 2
+
+/* Keep the mean chain length below this constant */
+#define MEAN_CHAIN_LEN 2
+
+static inline bool mpath_expired(struct mesh_path *mpath)
+{
+ return (mpath->flags & MESH_PATH_ACTIVE) &&
+ time_after(jiffies, mpath->exp_time) &&
+ !(mpath->flags & MESH_PATH_FIXED);
+}
+
+struct mpath_node {
+ struct hlist_node list;
+ struct rcu_head rcu;
+ /* This indirection allows two different tables to point to the same
+ * mesh_path structure, useful when resizing
+ */
+ struct mesh_path *mpath;
+};
+
+static struct mesh_table __rcu *mesh_paths;
+static struct mesh_table __rcu *mpp_paths; /* Store paths for MPP&MAP */
+
+int mesh_paths_generation;
+int mpp_paths_generation;
+
+/* This lock will have the grow table function as writer and add / delete nodes
+ * as readers. RCU provides sufficient protection only when reading the table
+ * (i.e. doing lookups). Adding or adding or removing nodes requires we take
+ * the read lock or we risk operating on an old table. The write lock is only
+ * needed when modifying the number of buckets a table.
+ */
+static DEFINE_RWLOCK(pathtbl_resize_lock);
+
+
+static inline struct mesh_table *resize_dereference_mesh_paths(void)
+{
+ return rcu_dereference_protected(mesh_paths,
+ lockdep_is_held(&pathtbl_resize_lock));
+}
+
+static inline struct mesh_table *resize_dereference_mpp_paths(void)
+{
+ return rcu_dereference_protected(mpp_paths,
+ lockdep_is_held(&pathtbl_resize_lock));
+}
+
+/*
+ * CAREFUL -- "tbl" must not be an expression,
+ * in particular not an rcu_dereference(), since
+ * it's used twice. So it is illegal to do
+ * for_each_mesh_entry(rcu_dereference(...), ...)
+ */
+#define for_each_mesh_entry(tbl, node, i) \
+ for (i = 0; i <= tbl->hash_mask; i++) \
+ hlist_for_each_entry_rcu(node, &tbl->hash_buckets[i], list)
+
+
+static struct mesh_table *mesh_table_alloc(int size_order)
+{
+ int i;
+ struct mesh_table *newtbl;
+
+ newtbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC);
+ if (!newtbl)
+ return NULL;
+
+ newtbl->hash_buckets = kzalloc(sizeof(struct hlist_head) *
+ (1 << size_order), GFP_ATOMIC);
+
+ if (!newtbl->hash_buckets) {
+ kfree(newtbl);
+ return NULL;
+ }
+
+ newtbl->hashwlock = kmalloc(sizeof(spinlock_t) *
+ (1 << size_order), GFP_ATOMIC);
+ if (!newtbl->hashwlock) {
+ kfree(newtbl->hash_buckets);
+ kfree(newtbl);
+ return NULL;
+ }
+
+ newtbl->size_order = size_order;
+ newtbl->hash_mask = (1 << size_order) - 1;
+ atomic_set(&newtbl->entries, 0);
+ get_random_bytes(&newtbl->hash_rnd,
+ sizeof(newtbl->hash_rnd));
+ for (i = 0; i <= newtbl->hash_mask; i++)
+ spin_lock_init(&newtbl->hashwlock[i]);
+ spin_lock_init(&newtbl->gates_lock);
+
+ return newtbl;
+}
+
+static void __mesh_table_free(struct mesh_table *tbl)
+{
+ kfree(tbl->hash_buckets);
+ kfree(tbl->hashwlock);
+ kfree(tbl);
+}
+
+static void mesh_table_free(struct mesh_table *tbl, bool free_leafs)
+{
+ struct hlist_head *mesh_hash;
+ struct hlist_node *p, *q;
+ struct mpath_node *gate;
+ int i;
+
+ mesh_hash = tbl->hash_buckets;
+ for (i = 0; i <= tbl->hash_mask; i++) {
+ spin_lock_bh(&tbl->hashwlock[i]);
+ hlist_for_each_safe(p, q, &mesh_hash[i]) {
+ tbl->free_node(p, free_leafs);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock_bh(&tbl->hashwlock[i]);
+ }
+ if (free_leafs) {
+ spin_lock_bh(&tbl->gates_lock);
+ hlist_for_each_entry_safe(gate, q,
+ tbl->known_gates, list) {
+ hlist_del(&gate->list);
+ kfree(gate);
+ }
+ kfree(tbl->known_gates);
+ spin_unlock_bh(&tbl->gates_lock);
+ }
+
+ __mesh_table_free(tbl);
+}
+
+static int mesh_table_grow(struct mesh_table *oldtbl,
+ struct mesh_table *newtbl)
+{
+ struct hlist_head *oldhash;
+ struct hlist_node *p, *q;
+ int i;
+
+ if (atomic_read(&oldtbl->entries)
+ < oldtbl->mean_chain_len * (oldtbl->hash_mask + 1))
+ return -EAGAIN;
+
+ newtbl->free_node = oldtbl->free_node;
+ newtbl->mean_chain_len = oldtbl->mean_chain_len;
+ newtbl->copy_node = oldtbl->copy_node;
+ newtbl->known_gates = oldtbl->known_gates;
+ atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries));
+
+ oldhash = oldtbl->hash_buckets;
+ for (i = 0; i <= oldtbl->hash_mask; i++)
+ hlist_for_each(p, &oldhash[i])
+ if (oldtbl->copy_node(p, newtbl) < 0)
+ goto errcopy;
+
+ return 0;
+
+errcopy:
+ for (i = 0; i <= newtbl->hash_mask; i++) {
+ hlist_for_each_safe(p, q, &newtbl->hash_buckets[i])
+ oldtbl->free_node(p, 0);
+ }
+ return -ENOMEM;
+}
+
+static u32 mesh_table_hash(const u8 *addr, struct ieee80211_sub_if_data *sdata,
+ struct mesh_table *tbl)
+{
+ /* Use last four bytes of hw addr and interface index as hash index */
+ return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex,
+ tbl->hash_rnd) & tbl->hash_mask;
+}
+
+
+/**
+ *
+ * mesh_path_assign_nexthop - update mesh path next hop
+ *
+ * @mpath: mesh path to update
+ * @sta: next hop to assign
+ *
+ * Locking: mpath->state_lock must be held when calling this function
+ */
+void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *hdr;
+ unsigned long flags;
+
+ rcu_assign_pointer(mpath->next_hop, sta);
+
+ spin_lock_irqsave(&mpath->frame_queue.lock, flags);
+ skb_queue_walk(&mpath->frame_queue, skb) {
+ hdr = (struct ieee80211_hdr *) skb->data;
+ memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN);
+ memcpy(hdr->addr2, mpath->sdata->vif.addr, ETH_ALEN);
+ ieee80211_mps_set_frame_flags(sta->sdata, sta, hdr);
+ }
+
+ spin_unlock_irqrestore(&mpath->frame_queue.lock, flags);
+}
+
+static void prepare_for_gate(struct sk_buff *skb, char *dst_addr,
+ struct mesh_path *gate_mpath)
+{
+ struct ieee80211_hdr *hdr;
+ struct ieee80211s_hdr *mshdr;
+ int mesh_hdrlen, hdrlen;
+ char *next_hop;
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ mshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+
+ if (!(mshdr->flags & MESH_FLAGS_AE)) {
+ /* size of the fixed part of the mesh header */
+ mesh_hdrlen = 6;
+
+ /* make room for the two extended addresses */
+ skb_push(skb, 2 * ETH_ALEN);
+ memmove(skb->data, hdr, hdrlen + mesh_hdrlen);
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+
+ /* we preserve the previous mesh header and only add
+ * the new addreses */
+ mshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+ mshdr->flags = MESH_FLAGS_AE_A5_A6;
+ memcpy(mshdr->eaddr1, hdr->addr3, ETH_ALEN);
+ memcpy(mshdr->eaddr2, hdr->addr4, ETH_ALEN);
+ }
+
+ /* update next hop */
+ hdr = (struct ieee80211_hdr *) skb->data;
+ rcu_read_lock();
+ next_hop = rcu_dereference(gate_mpath->next_hop)->sta.addr;
+ memcpy(hdr->addr1, next_hop, ETH_ALEN);
+ rcu_read_unlock();
+ memcpy(hdr->addr2, gate_mpath->sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr->addr3, dst_addr, ETH_ALEN);
+}
+
+/**
+ *
+ * mesh_path_move_to_queue - Move or copy frames from one mpath queue to another
+ *
+ * This function is used to transfer or copy frames from an unresolved mpath to
+ * a gate mpath. The function also adds the Address Extension field and
+ * updates the next hop.
+ *
+ * If a frame already has an Address Extension field, only the next hop and
+ * destination addresses are updated.
+ *
+ * The gate mpath must be an active mpath with a valid mpath->next_hop.
+ *
+ * @mpath: An active mpath the frames will be sent to (i.e. the gate)
+ * @from_mpath: The failed mpath
+ * @copy: When true, copy all the frames to the new mpath queue. When false,
+ * move them.
+ */
+static void mesh_path_move_to_queue(struct mesh_path *gate_mpath,
+ struct mesh_path *from_mpath,
+ bool copy)
+{
+ struct sk_buff *skb, *fskb, *tmp;
+ struct sk_buff_head failq;
+ unsigned long flags;
+
+ if (WARN_ON(gate_mpath == from_mpath))
+ return;
+ if (WARN_ON(!gate_mpath->next_hop))
+ return;
+
+ __skb_queue_head_init(&failq);
+
+ spin_lock_irqsave(&from_mpath->frame_queue.lock, flags);
+ skb_queue_splice_init(&from_mpath->frame_queue, &failq);
+ spin_unlock_irqrestore(&from_mpath->frame_queue.lock, flags);
+
+ skb_queue_walk_safe(&failq, fskb, tmp) {
+ if (skb_queue_len(&gate_mpath->frame_queue) >=
+ MESH_FRAME_QUEUE_LEN) {
+ mpath_dbg(gate_mpath->sdata, "mpath queue full!\n");
+ break;
+ }
+
+ skb = skb_copy(fskb, GFP_ATOMIC);
+ if (WARN_ON(!skb))
+ break;
+
+ prepare_for_gate(skb, gate_mpath->dst, gate_mpath);
+ skb_queue_tail(&gate_mpath->frame_queue, skb);
+
+ if (copy)
+ continue;
+
+ __skb_unlink(fskb, &failq);
+ kfree_skb(fskb);
+ }
+
+ mpath_dbg(gate_mpath->sdata, "Mpath queue for gate %pM has %d frames\n",
+ gate_mpath->dst, skb_queue_len(&gate_mpath->frame_queue));
+
+ if (!copy)
+ return;
+
+ spin_lock_irqsave(&from_mpath->frame_queue.lock, flags);
+ skb_queue_splice(&failq, &from_mpath->frame_queue);
+ spin_unlock_irqrestore(&from_mpath->frame_queue.lock, flags);
+}
+
+
+static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct mesh_path *mpath;
+ struct hlist_head *bucket;
+ struct mpath_node *node;
+
+ bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)];
+ hlist_for_each_entry_rcu(node, bucket, list) {
+ mpath = node->mpath;
+ if (mpath->sdata == sdata &&
+ ether_addr_equal(dst, mpath->dst)) {
+ if (mpath_expired(mpath)) {
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags &= ~MESH_PATH_ACTIVE;
+ spin_unlock_bh(&mpath->state_lock);
+ }
+ return mpath;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * mesh_path_lookup - look up a path in the mesh path table
+ * @sdata: local subif
+ * @dst: hardware address (ETH_ALEN length) of destination
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *
+mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
+{
+ return mpath_lookup(rcu_dereference(mesh_paths), dst, sdata);
+}
+
+struct mesh_path *
+mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
+{
+ return mpath_lookup(rcu_dereference(mpp_paths), dst, sdata);
+}
+
+
+/**
+ * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index
+ * @idx: index
+ * @sdata: local subif, or NULL for all entries
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found.
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *
+mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
+{
+ struct mesh_table *tbl = rcu_dereference(mesh_paths);
+ struct mpath_node *node;
+ int i;
+ int j = 0;
+
+ for_each_mesh_entry(tbl, node, i) {
+ if (sdata && node->mpath->sdata != sdata)
+ continue;
+ if (j++ == idx) {
+ if (mpath_expired(node->mpath)) {
+ spin_lock_bh(&node->mpath->state_lock);
+ node->mpath->flags &= ~MESH_PATH_ACTIVE;
+ spin_unlock_bh(&node->mpath->state_lock);
+ }
+ return node->mpath;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * mpp_path_lookup_by_idx - look up a path in the proxy path table by its index
+ * @idx: index
+ * @sdata: local subif, or NULL for all entries
+ *
+ * Returns: pointer to the proxy path structure, or NULL if not found.
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *
+mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx)
+{
+ struct mesh_table *tbl = rcu_dereference(mpp_paths);
+ struct mpath_node *node;
+ int i;
+ int j = 0;
+
+ for_each_mesh_entry(tbl, node, i) {
+ if (sdata && node->mpath->sdata != sdata)
+ continue;
+ if (j++ == idx)
+ return node->mpath;
+ }
+
+ return NULL;
+}
+
+/**
+ * mesh_path_add_gate - add the given mpath to a mesh gate to our path table
+ * @mpath: gate path to add to table
+ */
+int mesh_path_add_gate(struct mesh_path *mpath)
+{
+ struct mesh_table *tbl;
+ struct mpath_node *gate, *new_gate;
+ int err;
+
+ rcu_read_lock();
+ tbl = rcu_dereference(mesh_paths);
+
+ hlist_for_each_entry_rcu(gate, tbl->known_gates, list)
+ if (gate->mpath == mpath) {
+ err = -EEXIST;
+ goto err_rcu;
+ }
+
+ new_gate = kzalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+ if (!new_gate) {
+ err = -ENOMEM;
+ goto err_rcu;
+ }
+
+ mpath->is_gate = true;
+ mpath->sdata->u.mesh.num_gates++;
+ new_gate->mpath = mpath;
+ spin_lock_bh(&tbl->gates_lock);
+ hlist_add_head_rcu(&new_gate->list, tbl->known_gates);
+ spin_unlock_bh(&tbl->gates_lock);
+ mpath_dbg(mpath->sdata,
+ "Mesh path: Recorded new gate: %pM. %d known gates\n",
+ mpath->dst, mpath->sdata->u.mesh.num_gates);
+ err = 0;
+err_rcu:
+ rcu_read_unlock();
+ return err;
+}
+
+/**
+ * mesh_gate_del - remove a mesh gate from the list of known gates
+ * @tbl: table which holds our list of known gates
+ * @mpath: gate mpath
+ *
+ * Locking: must be called inside rcu_read_lock() section
+ */
+static void mesh_gate_del(struct mesh_table *tbl, struct mesh_path *mpath)
+{
+ struct mpath_node *gate;
+ struct hlist_node *q;
+
+ hlist_for_each_entry_safe(gate, q, tbl->known_gates, list) {
+ if (gate->mpath != mpath)
+ continue;
+ spin_lock_bh(&tbl->gates_lock);
+ hlist_del_rcu(&gate->list);
+ kfree_rcu(gate, rcu);
+ spin_unlock_bh(&tbl->gates_lock);
+ mpath->sdata->u.mesh.num_gates--;
+ mpath->is_gate = false;
+ mpath_dbg(mpath->sdata,
+ "Mesh path: Deleted gate: %pM. %d known gates\n",
+ mpath->dst, mpath->sdata->u.mesh.num_gates);
+ break;
+ }
+}
+
+/**
+ * mesh_gate_num - number of gates known to this interface
+ * @sdata: subif data
+ */
+int mesh_gate_num(struct ieee80211_sub_if_data *sdata)
+{
+ return sdata->u.mesh.num_gates;
+}
+
+/**
+ * mesh_path_add - allocate and add a new path to the mesh path table
+ * @dst: destination address of the path (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 on success
+ *
+ * State: the initial state of the new path is set to 0
+ */
+struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_local *local = sdata->local;
+ struct mesh_table *tbl;
+ struct mesh_path *mpath, *new_mpath;
+ struct mpath_node *node, *new_node;
+ struct hlist_head *bucket;
+ int grow = 0;
+ int err;
+ u32 hash_idx;
+
+ if (ether_addr_equal(dst, sdata->vif.addr))
+ /* never add ourselves as neighbours */
+ return ERR_PTR(-ENOTSUPP);
+
+ if (is_multicast_ether_addr(dst))
+ return ERR_PTR(-ENOTSUPP);
+
+ if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0)
+ return ERR_PTR(-ENOSPC);
+
+ read_lock_bh(&pathtbl_resize_lock);
+ tbl = resize_dereference_mesh_paths();
+
+ hash_idx = mesh_table_hash(dst, sdata, tbl);
+ bucket = &tbl->hash_buckets[hash_idx];
+
+ spin_lock(&tbl->hashwlock[hash_idx]);
+
+ hlist_for_each_entry(node, bucket, list) {
+ mpath = node->mpath;
+ if (mpath->sdata == sdata &&
+ ether_addr_equal(dst, mpath->dst))
+ goto found;
+ }
+
+ err = -ENOMEM;
+ new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
+ if (!new_mpath)
+ goto err_path_alloc;
+
+ new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+ if (!new_node)
+ goto err_node_alloc;
+
+ memcpy(new_mpath->dst, dst, ETH_ALEN);
+ eth_broadcast_addr(new_mpath->rann_snd_addr);
+ new_mpath->is_root = false;
+ new_mpath->sdata = sdata;
+ new_mpath->flags = 0;
+ skb_queue_head_init(&new_mpath->frame_queue);
+ new_node->mpath = new_mpath;
+ new_mpath->timer.data = (unsigned long) new_mpath;
+ new_mpath->timer.function = mesh_path_timer;
+ new_mpath->exp_time = jiffies;
+ spin_lock_init(&new_mpath->state_lock);
+ init_timer(&new_mpath->timer);
+
+ hlist_add_head_rcu(&new_node->list, bucket);
+ if (atomic_inc_return(&tbl->entries) >=
+ tbl->mean_chain_len * (tbl->hash_mask + 1))
+ grow = 1;
+
+ mesh_paths_generation++;
+
+ if (grow) {
+ set_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ }
+ mpath = new_mpath;
+found:
+ spin_unlock(&tbl->hashwlock[hash_idx]);
+ read_unlock_bh(&pathtbl_resize_lock);
+ return mpath;
+
+err_node_alloc:
+ kfree(new_mpath);
+err_path_alloc:
+ atomic_dec(&sdata->u.mesh.mpaths);
+ spin_unlock(&tbl->hashwlock[hash_idx]);
+ read_unlock_bh(&pathtbl_resize_lock);
+ return ERR_PTR(err);
+}
+
+static void mesh_table_free_rcu(struct rcu_head *rcu)
+{
+ struct mesh_table *tbl = container_of(rcu, struct mesh_table, rcu_head);
+
+ mesh_table_free(tbl, false);
+}
+
+void mesh_mpath_table_grow(void)
+{
+ struct mesh_table *oldtbl, *newtbl;
+
+ write_lock_bh(&pathtbl_resize_lock);
+ oldtbl = resize_dereference_mesh_paths();
+ newtbl = mesh_table_alloc(oldtbl->size_order + 1);
+ if (!newtbl)
+ goto out;
+ if (mesh_table_grow(oldtbl, newtbl) < 0) {
+ __mesh_table_free(newtbl);
+ goto out;
+ }
+ rcu_assign_pointer(mesh_paths, newtbl);
+
+ call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu);
+
+ out:
+ write_unlock_bh(&pathtbl_resize_lock);
+}
+
+void mesh_mpp_table_grow(void)
+{
+ struct mesh_table *oldtbl, *newtbl;
+
+ write_lock_bh(&pathtbl_resize_lock);
+ oldtbl = resize_dereference_mpp_paths();
+ newtbl = mesh_table_alloc(oldtbl->size_order + 1);
+ if (!newtbl)
+ goto out;
+ if (mesh_table_grow(oldtbl, newtbl) < 0) {
+ __mesh_table_free(newtbl);
+ goto out;
+ }
+ rcu_assign_pointer(mpp_paths, newtbl);
+ call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu);
+
+ out:
+ write_unlock_bh(&pathtbl_resize_lock);
+}
+
+int mpp_path_add(struct ieee80211_sub_if_data *sdata,
+ const u8 *dst, const u8 *mpp)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_local *local = sdata->local;
+ struct mesh_table *tbl;
+ struct mesh_path *mpath, *new_mpath;
+ struct mpath_node *node, *new_node;
+ struct hlist_head *bucket;
+ int grow = 0;
+ int err = 0;
+ u32 hash_idx;
+
+ if (ether_addr_equal(dst, sdata->vif.addr))
+ /* never add ourselves as neighbours */
+ return -ENOTSUPP;
+
+ if (is_multicast_ether_addr(dst))
+ return -ENOTSUPP;
+
+ err = -ENOMEM;
+ new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
+ if (!new_mpath)
+ goto err_path_alloc;
+
+ new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+ if (!new_node)
+ goto err_node_alloc;
+
+ read_lock_bh(&pathtbl_resize_lock);
+ memcpy(new_mpath->dst, dst, ETH_ALEN);
+ memcpy(new_mpath->mpp, mpp, ETH_ALEN);
+ new_mpath->sdata = sdata;
+ new_mpath->flags = 0;
+ skb_queue_head_init(&new_mpath->frame_queue);
+ new_node->mpath = new_mpath;
+ init_timer(&new_mpath->timer);
+ new_mpath->exp_time = jiffies;
+ spin_lock_init(&new_mpath->state_lock);
+
+ tbl = resize_dereference_mpp_paths();
+
+ hash_idx = mesh_table_hash(dst, sdata, tbl);
+ bucket = &tbl->hash_buckets[hash_idx];
+
+ spin_lock(&tbl->hashwlock[hash_idx]);
+
+ err = -EEXIST;
+ hlist_for_each_entry(node, bucket, list) {
+ mpath = node->mpath;
+ if (mpath->sdata == sdata &&
+ ether_addr_equal(dst, mpath->dst))
+ goto err_exists;
+ }
+
+ hlist_add_head_rcu(&new_node->list, bucket);
+ if (atomic_inc_return(&tbl->entries) >=
+ tbl->mean_chain_len * (tbl->hash_mask + 1))
+ grow = 1;
+
+ spin_unlock(&tbl->hashwlock[hash_idx]);
+ read_unlock_bh(&pathtbl_resize_lock);
+
+ mpp_paths_generation++;
+
+ if (grow) {
+ set_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ }
+ return 0;
+
+err_exists:
+ spin_unlock(&tbl->hashwlock[hash_idx]);
+ read_unlock_bh(&pathtbl_resize_lock);
+ kfree(new_node);
+err_node_alloc:
+ kfree(new_mpath);
+err_path_alloc:
+ return err;
+}
+
+
+/**
+ * mesh_plink_broken - deactivates paths and sends perr when a link breaks
+ *
+ * @sta: broken peer link
+ *
+ * This function must be called from the rate control algorithm if enough
+ * delivery errors suggest that a peer link is no longer usable.
+ */
+void mesh_plink_broken(struct sta_info *sta)
+{
+ struct mesh_table *tbl;
+ static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ struct mesh_path *mpath;
+ struct mpath_node *node;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ int i;
+
+ rcu_read_lock();
+ tbl = rcu_dereference(mesh_paths);
+ for_each_mesh_entry(tbl, node, i) {
+ mpath = node->mpath;
+ if (rcu_access_pointer(mpath->next_hop) == sta &&
+ mpath->flags & MESH_PATH_ACTIVE &&
+ !(mpath->flags & MESH_PATH_FIXED)) {
+ spin_lock_bh(&mpath->state_lock);
+ mpath->flags &= ~MESH_PATH_ACTIVE;
+ ++mpath->sn;
+ spin_unlock_bh(&mpath->state_lock);
+ mesh_path_error_tx(sdata,
+ sdata->u.mesh.mshcfg.element_ttl,
+ mpath->dst, mpath->sn,
+ WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void mesh_path_node_reclaim(struct rcu_head *rp)
+{
+ struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
+ struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
+
+ del_timer_sync(&node->mpath->timer);
+ atomic_dec(&sdata->u.mesh.mpaths);
+ kfree(node->mpath);
+ kfree(node);
+}
+
+/* needs to be called with the corresponding hashwlock taken */
+static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
+{
+ struct mesh_path *mpath;
+ mpath = node->mpath;
+ spin_lock(&mpath->state_lock);
+ mpath->flags |= MESH_PATH_RESOLVING;
+ if (mpath->is_gate)
+ mesh_gate_del(tbl, mpath);
+ hlist_del_rcu(&node->list);
+ call_rcu(&node->rcu, mesh_path_node_reclaim);
+ spin_unlock(&mpath->state_lock);
+ atomic_dec(&tbl->entries);
+}
+
+/**
+ * mesh_path_flush_by_nexthop - Deletes mesh paths if their next hop matches
+ *
+ * @sta: mesh peer to match
+ *
+ * RCU notes: this function is called when a mesh plink transitions from
+ * PLINK_ESTAB to any other state, since PLINK_ESTAB state is the only one that
+ * allows path creation. This will happen before the sta can be freed (because
+ * sta_info_destroy() calls this) so any reader in a rcu read block will be
+ * protected against the plink disappearing.
+ */
+void mesh_path_flush_by_nexthop(struct sta_info *sta)
+{
+ struct mesh_table *tbl;
+ struct mesh_path *mpath;
+ struct mpath_node *node;
+ int i;
+
+ rcu_read_lock();
+ read_lock_bh(&pathtbl_resize_lock);
+ tbl = resize_dereference_mesh_paths();
+ for_each_mesh_entry(tbl, node, i) {
+ mpath = node->mpath;
+ if (rcu_access_pointer(mpath->next_hop) == sta) {
+ spin_lock(&tbl->hashwlock[i]);
+ __mesh_path_del(tbl, node);
+ spin_unlock(&tbl->hashwlock[i]);
+ }
+ }
+ read_unlock_bh(&pathtbl_resize_lock);
+ rcu_read_unlock();
+}
+
+static void table_flush_by_iface(struct mesh_table *tbl,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct mesh_path *mpath;
+ struct mpath_node *node;
+ int i;
+
+ WARN_ON(!rcu_read_lock_held());
+ for_each_mesh_entry(tbl, node, i) {
+ mpath = node->mpath;
+ if (mpath->sdata != sdata)
+ continue;
+ spin_lock_bh(&tbl->hashwlock[i]);
+ __mesh_path_del(tbl, node);
+ spin_unlock_bh(&tbl->hashwlock[i]);
+ }
+}
+
+/**
+ * mesh_path_flush_by_iface - Deletes all mesh paths associated with a given iface
+ *
+ * This function deletes both mesh paths as well as mesh portal paths.
+ *
+ * @sdata: interface data to match
+ *
+ */
+void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
+{
+ struct mesh_table *tbl;
+
+ rcu_read_lock();
+ read_lock_bh(&pathtbl_resize_lock);
+ tbl = resize_dereference_mesh_paths();
+ table_flush_by_iface(tbl, sdata);
+ tbl = resize_dereference_mpp_paths();
+ table_flush_by_iface(tbl, sdata);
+ read_unlock_bh(&pathtbl_resize_lock);
+ rcu_read_unlock();
+}
+
+/**
+ * mesh_path_del - delete a mesh path from the table
+ *
+ * @addr: dst address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful
+ */
+int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+ struct mesh_table *tbl;
+ struct mesh_path *mpath;
+ struct mpath_node *node;
+ struct hlist_head *bucket;
+ int hash_idx;
+ int err = 0;
+
+ read_lock_bh(&pathtbl_resize_lock);
+ tbl = resize_dereference_mesh_paths();
+ hash_idx = mesh_table_hash(addr, sdata, tbl);
+ bucket = &tbl->hash_buckets[hash_idx];
+
+ spin_lock(&tbl->hashwlock[hash_idx]);
+ hlist_for_each_entry(node, bucket, list) {
+ mpath = node->mpath;
+ if (mpath->sdata == sdata &&
+ ether_addr_equal(addr, mpath->dst)) {
+ __mesh_path_del(tbl, node);
+ goto enddel;
+ }
+ }
+
+ err = -ENXIO;
+enddel:
+ mesh_paths_generation++;
+ spin_unlock(&tbl->hashwlock[hash_idx]);
+ read_unlock_bh(&pathtbl_resize_lock);
+ return err;
+}
+
+/**
+ * mesh_path_tx_pending - sends pending frames in a mesh path queue
+ *
+ * @mpath: mesh path to activate
+ *
+ * Locking: the state_lock of the mpath structure must NOT be held when calling
+ * this function.
+ */
+void mesh_path_tx_pending(struct mesh_path *mpath)
+{
+ if (mpath->flags & MESH_PATH_ACTIVE)
+ ieee80211_add_pending_skbs(mpath->sdata->local,
+ &mpath->frame_queue);
+}
+
+/**
+ * mesh_path_send_to_gates - sends pending frames to all known mesh gates
+ *
+ * @mpath: mesh path whose queue will be emptied
+ *
+ * If there is only one gate, the frames are transferred from the failed mpath
+ * queue to that gate's queue. If there are more than one gates, the frames
+ * are copied from each gate to the next. After frames are copied, the
+ * mpath queues are emptied onto the transmission queue.
+ */
+int mesh_path_send_to_gates(struct mesh_path *mpath)
+{
+ struct ieee80211_sub_if_data *sdata = mpath->sdata;
+ struct mesh_table *tbl;
+ struct mesh_path *from_mpath = mpath;
+ struct mpath_node *gate = NULL;
+ bool copy = false;
+ struct hlist_head *known_gates;
+
+ rcu_read_lock();
+ tbl = rcu_dereference(mesh_paths);
+ known_gates = tbl->known_gates;
+ rcu_read_unlock();
+
+ if (!known_gates)
+ return -EHOSTUNREACH;
+
+ hlist_for_each_entry_rcu(gate, known_gates, list) {
+ if (gate->mpath->sdata != sdata)
+ continue;
+
+ if (gate->mpath->flags & MESH_PATH_ACTIVE) {
+ mpath_dbg(sdata, "Forwarding to %pM\n", gate->mpath->dst);
+ mesh_path_move_to_queue(gate->mpath, from_mpath, copy);
+ from_mpath = gate->mpath;
+ copy = true;
+ } else {
+ mpath_dbg(sdata,
+ "Not forwarding %p (flags %#x)\n",
+ gate->mpath, gate->mpath->flags);
+ }
+ }
+
+ hlist_for_each_entry_rcu(gate, known_gates, list)
+ if (gate->mpath->sdata == sdata) {
+ mpath_dbg(sdata, "Sending to %pM\n", gate->mpath->dst);
+ mesh_path_tx_pending(gate->mpath);
+ }
+
+ return (from_mpath == mpath) ? -EHOSTUNREACH : 0;
+}
+
+/**
+ * mesh_path_discard_frame - discard a frame whose path could not be resolved
+ *
+ * @skb: frame to discard
+ * @sdata: network subif the frame was to be sent through
+ *
+ * Locking: the function must me called within a rcu_read_lock region
+ */
+void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ sdata->u.mesh.mshstats.dropped_frames_no_route++;
+}
+
+/**
+ * mesh_path_flush_pending - free the pending queue of a mesh path
+ *
+ * @mpath: mesh path whose queue has to be freed
+ *
+ * Locking: the function must me called within a rcu_read_lock region
+ */
+void mesh_path_flush_pending(struct mesh_path *mpath)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&mpath->frame_queue)) != NULL)
+ mesh_path_discard_frame(mpath->sdata, skb);
+}
+
+/**
+ * mesh_path_fix_nexthop - force a specific next hop for a mesh path
+ *
+ * @mpath: the mesh path to modify
+ * @next_hop: the next hop to force
+ *
+ * Locking: this function must be called holding mpath->state_lock
+ */
+void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
+{
+ spin_lock_bh(&mpath->state_lock);
+ mesh_path_assign_nexthop(mpath, next_hop);
+ mpath->sn = 0xffff;
+ mpath->metric = 0;
+ mpath->hop_count = 0;
+ mpath->exp_time = 0;
+ mpath->flags |= MESH_PATH_FIXED;
+ mesh_path_activate(mpath);
+ spin_unlock_bh(&mpath->state_lock);
+ mesh_path_tx_pending(mpath);
+}
+
+static void mesh_path_node_free(struct hlist_node *p, bool free_leafs)
+{
+ struct mesh_path *mpath;
+ struct mpath_node *node = hlist_entry(p, struct mpath_node, list);
+ mpath = node->mpath;
+ hlist_del_rcu(p);
+ if (free_leafs) {
+ del_timer_sync(&mpath->timer);
+ kfree(mpath);
+ }
+ kfree(node);
+}
+
+static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl)
+{
+ struct mesh_path *mpath;
+ struct mpath_node *node, *new_node;
+ u32 hash_idx;
+
+ new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+ if (new_node == NULL)
+ return -ENOMEM;
+
+ node = hlist_entry(p, struct mpath_node, list);
+ mpath = node->mpath;
+ new_node->mpath = mpath;
+ hash_idx = mesh_table_hash(mpath->dst, mpath->sdata, newtbl);
+ hlist_add_head(&new_node->list,
+ &newtbl->hash_buckets[hash_idx]);
+ return 0;
+}
+
+int mesh_pathtbl_init(void)
+{
+ struct mesh_table *tbl_path, *tbl_mpp;
+ int ret;
+
+ tbl_path = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+ if (!tbl_path)
+ return -ENOMEM;
+ tbl_path->free_node = &mesh_path_node_free;
+ tbl_path->copy_node = &mesh_path_node_copy;
+ tbl_path->mean_chain_len = MEAN_CHAIN_LEN;
+ tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
+ if (!tbl_path->known_gates) {
+ ret = -ENOMEM;
+ goto free_path;
+ }
+ INIT_HLIST_HEAD(tbl_path->known_gates);
+
+
+ tbl_mpp = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+ if (!tbl_mpp) {
+ ret = -ENOMEM;
+ goto free_path;
+ }
+ tbl_mpp->free_node = &mesh_path_node_free;
+ tbl_mpp->copy_node = &mesh_path_node_copy;
+ tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN;
+ tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
+ if (!tbl_mpp->known_gates) {
+ ret = -ENOMEM;
+ goto free_mpp;
+ }
+ INIT_HLIST_HEAD(tbl_mpp->known_gates);
+
+ /* Need no locking since this is during init */
+ RCU_INIT_POINTER(mesh_paths, tbl_path);
+ RCU_INIT_POINTER(mpp_paths, tbl_mpp);
+
+ return 0;
+
+free_mpp:
+ mesh_table_free(tbl_mpp, true);
+free_path:
+ mesh_table_free(tbl_path, true);
+ return ret;
+}
+
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
+{
+ struct mesh_table *tbl;
+ struct mesh_path *mpath;
+ struct mpath_node *node;
+ int i;
+
+ rcu_read_lock();
+ tbl = rcu_dereference(mesh_paths);
+ for_each_mesh_entry(tbl, node, i) {
+ if (node->mpath->sdata != sdata)
+ continue;
+ mpath = node->mpath;
+ if ((!(mpath->flags & MESH_PATH_RESOLVING)) &&
+ (!(mpath->flags & MESH_PATH_FIXED)) &&
+ time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
+ mesh_path_del(mpath->sdata, mpath->dst);
+ }
+ rcu_read_unlock();
+}
+
+void mesh_pathtbl_unregister(void)
+{
+ /* no need for locking during exit path */
+ mesh_table_free(rcu_dereference_protected(mesh_paths, 1), true);
+ mesh_table_free(rcu_dereference_protected(mpp_paths, 1), true);
+}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
new file mode 100644
index 000000000..60d737f14
--- /dev/null
+++ b/net/mac80211/mesh_plink.c
@@ -0,0 +1,1126 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author: Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "mesh.h"
+
+#define PLINK_GET_LLID(p) (p + 2)
+#define PLINK_GET_PLID(p) (p + 4)
+
+#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \
+ jiffies + msecs_to_jiffies(t)))
+
+enum plink_event {
+ PLINK_UNDEFINED,
+ OPN_ACPT,
+ OPN_RJCT,
+ OPN_IGNR,
+ CNF_ACPT,
+ CNF_RJCT,
+ CNF_IGNR,
+ CLS_ACPT,
+ CLS_IGNR
+};
+
+static const char * const mplstates[] = {
+ [NL80211_PLINK_LISTEN] = "LISTEN",
+ [NL80211_PLINK_OPN_SNT] = "OPN-SNT",
+ [NL80211_PLINK_OPN_RCVD] = "OPN-RCVD",
+ [NL80211_PLINK_CNF_RCVD] = "CNF_RCVD",
+ [NL80211_PLINK_ESTAB] = "ESTAB",
+ [NL80211_PLINK_HOLDING] = "HOLDING",
+ [NL80211_PLINK_BLOCKED] = "BLOCKED"
+};
+
+static const char * const mplevents[] = {
+ [PLINK_UNDEFINED] = "NONE",
+ [OPN_ACPT] = "OPN_ACPT",
+ [OPN_RJCT] = "OPN_RJCT",
+ [OPN_IGNR] = "OPN_IGNR",
+ [CNF_ACPT] = "CNF_ACPT",
+ [CNF_RJCT] = "CNF_RJCT",
+ [CNF_IGNR] = "CNF_IGNR",
+ [CLS_ACPT] = "CLS_ACPT",
+ [CLS_IGNR] = "CLS_IGNR"
+};
+
+static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_self_protected_actioncode action,
+ u8 *da, u16 llid, u16 plid, u16 reason);
+
+
+/* We only need a valid sta if user configured a minimum rssi_threshold. */
+static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold;
+ return rssi_threshold == 0 ||
+ (sta && (s8) -ewma_read(&sta->avg_signal) > rssi_threshold);
+}
+
+/**
+ * mesh_plink_fsm_restart - restart a mesh peer link finite state machine
+ *
+ * @sta: mesh peer link to restart
+ *
+ * Locking: this function must be called holding sta->lock
+ */
+static inline void mesh_plink_fsm_restart(struct sta_info *sta)
+{
+ sta->plink_state = NL80211_PLINK_LISTEN;
+ sta->llid = sta->plid = sta->reason = 0;
+ sta->plink_retries = 0;
+}
+
+/*
+ * mesh_set_short_slot_time - enable / disable ERP short slot time.
+ *
+ * The standard indirectly mandates mesh STAs to turn off short slot time by
+ * disallowing advertising this (802.11-2012 8.4.1.4), but that doesn't mean we
+ * can't be sneaky about it. Enable short slot time if all mesh STAs in the
+ * MBSS support ERP rates.
+ *
+ * Returns BSS_CHANGED_ERP_SLOT or 0 for no change.
+ */
+static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+ struct sta_info *sta;
+ u32 erp_rates = 0, changed = 0;
+ int i;
+ bool short_slot = false;
+
+ if (band == IEEE80211_BAND_5GHZ) {
+ /* (IEEE 802.11-2012 19.4.5) */
+ short_slot = true;
+ goto out;
+ } else if (band != IEEE80211_BAND_2GHZ ||
+ (band == IEEE80211_BAND_2GHZ &&
+ local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+ goto out;
+
+ for (i = 0; i < sband->n_bitrates; i++)
+ if (sband->bitrates[i].flags & IEEE80211_RATE_ERP_G)
+ erp_rates |= BIT(i);
+
+ if (!erp_rates)
+ goto out;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata ||
+ sta->plink_state != NL80211_PLINK_ESTAB)
+ continue;
+
+ short_slot = false;
+ if (erp_rates & sta->sta.supp_rates[band])
+ short_slot = true;
+ else
+ break;
+ }
+ rcu_read_unlock();
+
+out:
+ if (sdata->vif.bss_conf.use_short_slot != short_slot) {
+ sdata->vif.bss_conf.use_short_slot = short_slot;
+ changed = BSS_CHANGED_ERP_SLOT;
+ mpl_dbg(sdata, "mesh_plink %pM: ERP short slot time %d\n",
+ sdata->vif.addr, short_slot);
+ }
+ return changed;
+}
+
+/**
+ * mesh_set_ht_prot_mode - set correct HT protection mode
+ *
+ * Section 9.23.3.5 of IEEE 80211-2012 describes the protection rules for HT
+ * mesh STA in a MBSS. Three HT protection modes are supported for now, non-HT
+ * mixed mode, 20MHz-protection and no-protection mode. non-HT mixed mode is
+ * selected if any non-HT peers are present in our MBSS. 20MHz-protection mode
+ * is selected if all peers in our 20/40MHz MBSS support HT and atleast one
+ * HT20 peer is present. Otherwise no-protection mode is selected.
+ */
+static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ u16 ht_opmode;
+ bool non_ht_sta = false, ht20_sta = false;
+
+ switch (sdata->vif.bss_conf.chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ return 0;
+ default:
+ break;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata ||
+ sta->plink_state != NL80211_PLINK_ESTAB)
+ continue;
+
+ if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20)
+ continue;
+
+ if (!sta->sta.ht_cap.ht_supported) {
+ mpl_dbg(sdata, "nonHT sta (%pM) is present\n",
+ sta->sta.addr);
+ non_ht_sta = true;
+ break;
+ }
+
+ mpl_dbg(sdata, "HT20 sta (%pM) is present\n", sta->sta.addr);
+ ht20_sta = true;
+ }
+ rcu_read_unlock();
+
+ if (non_ht_sta)
+ ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED;
+ else if (ht20_sta &&
+ sdata->vif.bss_conf.chandef.width > NL80211_CHAN_WIDTH_20)
+ ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_20MHZ;
+ else
+ ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONE;
+
+ if (sdata->vif.bss_conf.ht_operation_mode == ht_opmode)
+ return 0;
+
+ sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
+ sdata->u.mesh.mshcfg.ht_opmode = ht_opmode;
+ mpl_dbg(sdata, "selected new HT protection mode %d\n", ht_opmode);
+ return BSS_CHANGED_HT;
+}
+
+/**
+ * __mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed
+ * Returns beacon changed flag if the beacon content changed.
+ *
+ * Locking: the caller must hold sta->lock
+ */
+static u32 __mesh_plink_deactivate(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed = 0;
+
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ changed = mesh_plink_dec_estab_count(sdata);
+ sta->plink_state = NL80211_PLINK_BLOCKED;
+ mesh_path_flush_by_nexthop(sta);
+
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta,
+ NL80211_MESH_POWER_UNKNOWN);
+
+ return changed;
+}
+
+/**
+ * mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed
+ */
+u32 mesh_plink_deactivate(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed;
+
+ spin_lock_bh(&sta->lock);
+ changed = __mesh_plink_deactivate(sta);
+ sta->reason = WLAN_REASON_MESH_PEER_CANCELED;
+ mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE,
+ sta->sta.addr, sta->llid, sta->plid,
+ sta->reason);
+ spin_unlock_bh(&sta->lock);
+
+ return changed;
+}
+
+static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_self_protected_actioncode action,
+ u8 *da, u16 llid, u16 plid, u16 reason)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_mgmt *mgmt;
+ bool include_plid = false;
+ u16 peering_proto = 0;
+ u8 *pos, ie_len = 4;
+ int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.self_prot) +
+ sizeof(mgmt->u.action.u.self_prot);
+ int err = -ENOMEM;
+
+ skb = dev_alloc_skb(local->tx_headroom +
+ hdr_len +
+ 2 + /* capability info */
+ 2 + /* AID */
+ 2 + 8 + /* supported rates */
+ 2 + (IEEE80211_MAX_SUPP_RATES - 8) +
+ 2 + sdata->u.mesh.mesh_id_len +
+ 2 + sizeof(struct ieee80211_meshconf_ie) +
+ 2 + sizeof(struct ieee80211_ht_cap) +
+ 2 + sizeof(struct ieee80211_ht_operation) +
+ 2 + 8 + /* peering IE */
+ sdata->u.mesh.ie_len);
+ if (!skb)
+ return err;
+ info = IEEE80211_SKB_CB(skb);
+ skb_reserve(skb, local->tx_headroom);
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len);
+ memset(mgmt, 0, hdr_len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ mgmt->u.action.category = WLAN_CATEGORY_SELF_PROTECTED;
+ mgmt->u.action.u.self_prot.action_code = action;
+
+ if (action != WLAN_SP_MESH_PEERING_CLOSE) {
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+
+ /* capability info */
+ pos = skb_put(skb, 2);
+ memset(pos, 0, 2);
+ if (action == WLAN_SP_MESH_PEERING_CONFIRM) {
+ /* AID */
+ pos = skb_put(skb, 2);
+ put_unaligned_le16(plid, pos + 2);
+ }
+ if (ieee80211_add_srates_ie(sdata, skb, true, band) ||
+ ieee80211_add_ext_srates_ie(sdata, skb, true, band) ||
+ mesh_add_rsn_ie(sdata, skb) ||
+ mesh_add_meshid_ie(sdata, skb) ||
+ mesh_add_meshconf_ie(sdata, skb))
+ goto free;
+ } else { /* WLAN_SP_MESH_PEERING_CLOSE */
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ if (mesh_add_meshid_ie(sdata, skb))
+ goto free;
+ }
+
+ /* Add Mesh Peering Management element */
+ switch (action) {
+ case WLAN_SP_MESH_PEERING_OPEN:
+ break;
+ case WLAN_SP_MESH_PEERING_CONFIRM:
+ ie_len += 2;
+ include_plid = true;
+ break;
+ case WLAN_SP_MESH_PEERING_CLOSE:
+ if (plid) {
+ ie_len += 2;
+ include_plid = true;
+ }
+ ie_len += 2; /* reason code */
+ break;
+ default:
+ err = -EINVAL;
+ goto free;
+ }
+
+ if (WARN_ON(skb_tailroom(skb) < 2 + ie_len))
+ goto free;
+
+ pos = skb_put(skb, 2 + ie_len);
+ *pos++ = WLAN_EID_PEER_MGMT;
+ *pos++ = ie_len;
+ memcpy(pos, &peering_proto, 2);
+ pos += 2;
+ put_unaligned_le16(llid, pos);
+ pos += 2;
+ if (include_plid) {
+ put_unaligned_le16(plid, pos);
+ pos += 2;
+ }
+ if (action == WLAN_SP_MESH_PEERING_CLOSE) {
+ put_unaligned_le16(reason, pos);
+ pos += 2;
+ }
+
+ if (action != WLAN_SP_MESH_PEERING_CLOSE) {
+ if (mesh_add_ht_cap_ie(sdata, skb) ||
+ mesh_add_ht_oper_ie(sdata, skb))
+ goto free;
+ }
+
+ if (mesh_add_vendor_ies(sdata, skb))
+ goto free;
+
+ ieee80211_tx_skb(sdata, skb);
+ return 0;
+free:
+ kfree_skb(skb);
+ return err;
+}
+
+static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee802_11_elems *elems, bool insert)
+{
+ struct ieee80211_local *local = sdata->local;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_supported_band *sband;
+ u32 rates, basic_rates = 0, changed = 0;
+ enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth;
+
+ sband = local->hw.wiphy->bands[band];
+ rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates);
+
+ spin_lock_bh(&sta->lock);
+ sta->last_rx = jiffies;
+
+ /* rates and capabilities don't change during peering */
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ goto out;
+
+ if (sta->sta.supp_rates[band] != rates)
+ changed |= IEEE80211_RC_SUPP_RATES_CHANGED;
+ sta->sta.supp_rates[band] = rates;
+
+ if (ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+ elems->ht_cap_elem, sta))
+ changed |= IEEE80211_RC_BW_CHANGED;
+
+ if (bw != sta->sta.bandwidth)
+ changed |= IEEE80211_RC_BW_CHANGED;
+
+ /* HT peer is operating 20MHz-only */
+ if (elems->ht_operation &&
+ !(elems->ht_operation->ht_param &
+ IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) {
+ if (sta->sta.bandwidth != IEEE80211_STA_RX_BW_20)
+ changed |= IEEE80211_RC_BW_CHANGED;
+ sta->sta.bandwidth = IEEE80211_STA_RX_BW_20;
+ }
+
+ if (insert)
+ rate_control_rate_init(sta);
+ else
+ rate_control_rate_update(local, sband, sta, changed);
+out:
+ spin_unlock_bh(&sta->lock);
+}
+
+static struct sta_info *
+__mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr)
+{
+ struct sta_info *sta;
+
+ if (sdata->local->num_sta >= MESH_MAX_PLINKS)
+ return NULL;
+
+ sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL);
+ if (!sta)
+ return NULL;
+
+ sta->plink_state = NL80211_PLINK_LISTEN;
+ sta->sta.wme = true;
+
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
+ sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);
+
+ return sta;
+}
+
+static struct sta_info *
+mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr,
+ struct ieee802_11_elems *elems)
+{
+ struct sta_info *sta = NULL;
+
+ /* Userspace handles station allocation */
+ if (sdata->u.mesh.user_mpm ||
+ sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED)
+ cfg80211_notify_new_peer_candidate(sdata->dev, addr,
+ elems->ie_start,
+ elems->total_len,
+ GFP_KERNEL);
+ else
+ sta = __mesh_sta_info_alloc(sdata, addr);
+
+ return sta;
+}
+
+/*
+ * mesh_sta_info_get - return mesh sta info entry for @addr.
+ *
+ * @sdata: local meshif
+ * @addr: peer's address
+ * @elems: IEs from beacon or mesh peering frame.
+ *
+ * Return existing or newly allocated sta_info under RCU read lock.
+ * (re)initialize with given IEs.
+ */
+static struct sta_info *
+mesh_sta_info_get(struct ieee80211_sub_if_data *sdata,
+ u8 *addr, struct ieee802_11_elems *elems) __acquires(RCU)
+{
+ struct sta_info *sta = NULL;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, addr);
+ if (sta) {
+ mesh_sta_info_init(sdata, sta, elems, false);
+ } else {
+ rcu_read_unlock();
+ /* can't run atomic */
+ sta = mesh_sta_info_alloc(sdata, addr, elems);
+ if (!sta) {
+ rcu_read_lock();
+ return NULL;
+ }
+
+ mesh_sta_info_init(sdata, sta, elems, true);
+
+ if (sta_info_insert_rcu(sta))
+ return NULL;
+ }
+
+ return sta;
+}
+
+/*
+ * mesh_neighbour_update - update or initialize new mesh neighbor.
+ *
+ * @sdata: local meshif
+ * @addr: peer's address
+ * @elems: IEs from beacon or mesh peering frame
+ *
+ * Initiates peering if appropriate.
+ */
+void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata,
+ u8 *hw_addr,
+ struct ieee802_11_elems *elems)
+{
+ struct sta_info *sta;
+ u32 changed = 0;
+
+ sta = mesh_sta_info_get(sdata, hw_addr, elems);
+ if (!sta)
+ goto out;
+
+ if (mesh_peer_accepts_plinks(elems) &&
+ sta->plink_state == NL80211_PLINK_LISTEN &&
+ sdata->u.mesh.accepting_plinks &&
+ sdata->u.mesh.mshcfg.auto_open_plinks &&
+ rssi_threshold_check(sdata, sta))
+ changed = mesh_plink_open(sta);
+
+ ieee80211_mps_frame_release(sta, elems);
+out:
+ rcu_read_unlock();
+ ieee80211_mbss_info_change_notify(sdata, changed);
+}
+
+static void mesh_plink_timer(unsigned long data)
+{
+ struct sta_info *sta;
+ u16 reason = 0;
+ struct ieee80211_sub_if_data *sdata;
+ struct mesh_config *mshcfg;
+ enum ieee80211_self_protected_actioncode action = 0;
+
+ /*
+ * This STA is valid because sta_info_destroy() will
+ * del_timer_sync() this timer after having made sure
+ * it cannot be readded (by deleting the plink.)
+ */
+ sta = (struct sta_info *) data;
+
+ if (sta->sdata->local->quiescing)
+ return;
+
+ spin_lock_bh(&sta->lock);
+
+ /* If a timer fires just before a state transition on another CPU,
+ * we may have already extended the timeout and changed state by the
+ * time we've acquired the lock and arrived here. In that case,
+ * skip this timer and wait for the new one.
+ */
+ if (time_before(jiffies, sta->plink_timer.expires)) {
+ mpl_dbg(sta->sdata,
+ "Ignoring timer for %pM in state %s (timer adjusted)",
+ sta->sta.addr, mplstates[sta->plink_state]);
+ spin_unlock_bh(&sta->lock);
+ return;
+ }
+
+ /* del_timer() and handler may race when entering these states */
+ if (sta->plink_state == NL80211_PLINK_LISTEN ||
+ sta->plink_state == NL80211_PLINK_ESTAB) {
+ mpl_dbg(sta->sdata,
+ "Ignoring timer for %pM in state %s (timer deleted)",
+ sta->sta.addr, mplstates[sta->plink_state]);
+ spin_unlock_bh(&sta->lock);
+ return;
+ }
+
+ mpl_dbg(sta->sdata,
+ "Mesh plink timer for %pM fired on state %s\n",
+ sta->sta.addr, mplstates[sta->plink_state]);
+ sdata = sta->sdata;
+ mshcfg = &sdata->u.mesh.mshcfg;
+
+ switch (sta->plink_state) {
+ case NL80211_PLINK_OPN_RCVD:
+ case NL80211_PLINK_OPN_SNT:
+ /* retry timer */
+ if (sta->plink_retries < mshcfg->dot11MeshMaxRetries) {
+ u32 rand;
+ mpl_dbg(sta->sdata,
+ "Mesh plink for %pM (retry, timeout): %d %d\n",
+ sta->sta.addr, sta->plink_retries,
+ sta->plink_timeout);
+ get_random_bytes(&rand, sizeof(u32));
+ sta->plink_timeout = sta->plink_timeout +
+ rand % sta->plink_timeout;
+ ++sta->plink_retries;
+ mod_plink_timer(sta, sta->plink_timeout);
+ action = WLAN_SP_MESH_PEERING_OPEN;
+ break;
+ }
+ reason = WLAN_REASON_MESH_MAX_RETRIES;
+ /* fall through on else */
+ case NL80211_PLINK_CNF_RCVD:
+ /* confirm timer */
+ if (!reason)
+ reason = WLAN_REASON_MESH_CONFIRM_TIMEOUT;
+ sta->plink_state = NL80211_PLINK_HOLDING;
+ mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout);
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ case NL80211_PLINK_HOLDING:
+ /* holding timer */
+ del_timer(&sta->plink_timer);
+ mesh_plink_fsm_restart(sta);
+ break;
+ default:
+ break;
+ }
+ spin_unlock_bh(&sta->lock);
+ if (action)
+ mesh_plink_frame_tx(sdata, action, sta->sta.addr,
+ sta->llid, sta->plid, reason);
+}
+
+static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout)
+{
+ sta->plink_timer.expires = jiffies + msecs_to_jiffies(timeout);
+ sta->plink_timer.data = (unsigned long) sta;
+ sta->plink_timer.function = mesh_plink_timer;
+ sta->plink_timeout = timeout;
+ add_timer(&sta->plink_timer);
+}
+
+static bool llid_in_use(struct ieee80211_sub_if_data *sdata,
+ u16 llid)
+{
+ struct ieee80211_local *local = sdata->local;
+ bool in_use = false;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (!memcmp(&sta->llid, &llid, sizeof(llid))) {
+ in_use = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return in_use;
+}
+
+static u16 mesh_get_new_llid(struct ieee80211_sub_if_data *sdata)
+{
+ u16 llid;
+
+ do {
+ get_random_bytes(&llid, sizeof(llid));
+ /* for mesh PS we still only have the AID range for TIM bits */
+ llid = (llid % IEEE80211_MAX_AID) + 1;
+ } while (llid_in_use(sdata, llid));
+
+ return llid;
+}
+
+u32 mesh_plink_open(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 changed;
+
+ if (!test_sta_flag(sta, WLAN_STA_AUTH))
+ return 0;
+
+ spin_lock_bh(&sta->lock);
+ sta->llid = mesh_get_new_llid(sdata);
+ if (sta->plink_state != NL80211_PLINK_LISTEN &&
+ sta->plink_state != NL80211_PLINK_BLOCKED) {
+ spin_unlock_bh(&sta->lock);
+ return 0;
+ }
+ sta->plink_state = NL80211_PLINK_OPN_SNT;
+ mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout);
+ spin_unlock_bh(&sta->lock);
+ mpl_dbg(sdata,
+ "Mesh plink: starting establishment with %pM\n",
+ sta->sta.addr);
+
+ /* set the non-peer mode to active during peering */
+ changed = ieee80211_mps_local_status_update(sdata);
+
+ mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN,
+ sta->sta.addr, sta->llid, 0, 0);
+ return changed;
+}
+
+u32 mesh_plink_block(struct sta_info *sta)
+{
+ u32 changed;
+
+ spin_lock_bh(&sta->lock);
+ changed = __mesh_plink_deactivate(sta);
+ sta->plink_state = NL80211_PLINK_BLOCKED;
+ spin_unlock_bh(&sta->lock);
+
+ return changed;
+}
+
+static void mesh_plink_close(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ enum plink_event event)
+{
+ struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
+
+ u16 reason = (event == CLS_ACPT) ?
+ WLAN_REASON_MESH_CLOSE : WLAN_REASON_MESH_CONFIG;
+
+ sta->reason = reason;
+ sta->plink_state = NL80211_PLINK_HOLDING;
+ mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout);
+}
+
+static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
+ u32 changed = 0;
+
+ del_timer(&sta->plink_timer);
+ sta->plink_state = NL80211_PLINK_ESTAB;
+ changed |= mesh_plink_inc_estab_count(sdata);
+ changed |= mesh_set_ht_prot_mode(sdata);
+ changed |= mesh_set_short_slot_time(sdata);
+ mpl_dbg(sdata, "Mesh plink with %pM ESTABLISHED\n", sta->sta.addr);
+ ieee80211_mps_sta_status_update(sta);
+ changed |= ieee80211_mps_set_sta_local_pm(sta, mshcfg->power_mode);
+ return changed;
+}
+
+/**
+ * mesh_plink_fsm - step @sta MPM based on @event
+ *
+ * @sdata: interface
+ * @sta: mesh neighbor
+ * @event: peering event
+ *
+ * Return: changed MBSS flags
+ */
+static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, enum plink_event event)
+{
+ struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg;
+ enum ieee80211_self_protected_actioncode action = 0;
+ u32 changed = 0;
+
+ mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr,
+ mplstates[sta->plink_state], mplevents[event]);
+
+ spin_lock_bh(&sta->lock);
+ switch (sta->plink_state) {
+ case NL80211_PLINK_LISTEN:
+ switch (event) {
+ case CLS_ACPT:
+ mesh_plink_fsm_restart(sta);
+ break;
+ case OPN_ACPT:
+ sta->plink_state = NL80211_PLINK_OPN_RCVD;
+ sta->llid = mesh_get_new_llid(sdata);
+ mesh_plink_timer_set(sta,
+ mshcfg->dot11MeshRetryTimeout);
+
+ /* set the non-peer mode to active during peering */
+ changed |= ieee80211_mps_local_status_update(sdata);
+ action = WLAN_SP_MESH_PEERING_OPEN;
+ break;
+ default:
+ break;
+ }
+ break;
+ case NL80211_PLINK_OPN_SNT:
+ switch (event) {
+ case OPN_RJCT:
+ case CNF_RJCT:
+ case CLS_ACPT:
+ mesh_plink_close(sdata, sta, event);
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ case OPN_ACPT:
+ /* retry timer is left untouched */
+ sta->plink_state = NL80211_PLINK_OPN_RCVD;
+ action = WLAN_SP_MESH_PEERING_CONFIRM;
+ break;
+ case CNF_ACPT:
+ sta->plink_state = NL80211_PLINK_CNF_RCVD;
+ mod_plink_timer(sta, mshcfg->dot11MeshConfirmTimeout);
+ break;
+ default:
+ break;
+ }
+ break;
+ case NL80211_PLINK_OPN_RCVD:
+ switch (event) {
+ case OPN_RJCT:
+ case CNF_RJCT:
+ case CLS_ACPT:
+ mesh_plink_close(sdata, sta, event);
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ case OPN_ACPT:
+ action = WLAN_SP_MESH_PEERING_CONFIRM;
+ break;
+ case CNF_ACPT:
+ changed |= mesh_plink_establish(sdata, sta);
+ break;
+ default:
+ break;
+ }
+ break;
+ case NL80211_PLINK_CNF_RCVD:
+ switch (event) {
+ case OPN_RJCT:
+ case CNF_RJCT:
+ case CLS_ACPT:
+ mesh_plink_close(sdata, sta, event);
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ case OPN_ACPT:
+ changed |= mesh_plink_establish(sdata, sta);
+ action = WLAN_SP_MESH_PEERING_CONFIRM;
+ break;
+ default:
+ break;
+ }
+ break;
+ case NL80211_PLINK_ESTAB:
+ switch (event) {
+ case CLS_ACPT:
+ changed |= __mesh_plink_deactivate(sta);
+ changed |= mesh_set_ht_prot_mode(sdata);
+ changed |= mesh_set_short_slot_time(sdata);
+ mesh_plink_close(sdata, sta, event);
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ case OPN_ACPT:
+ action = WLAN_SP_MESH_PEERING_CONFIRM;
+ break;
+ default:
+ break;
+ }
+ break;
+ case NL80211_PLINK_HOLDING:
+ switch (event) {
+ case CLS_ACPT:
+ del_timer(&sta->plink_timer);
+ mesh_plink_fsm_restart(sta);
+ break;
+ case OPN_ACPT:
+ case CNF_ACPT:
+ case OPN_RJCT:
+ case CNF_RJCT:
+ action = WLAN_SP_MESH_PEERING_CLOSE;
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ /* should not get here, PLINK_BLOCKED is dealt with at the
+ * beginning of the function
+ */
+ break;
+ }
+ spin_unlock_bh(&sta->lock);
+ if (action) {
+ mesh_plink_frame_tx(sdata, action, sta->sta.addr,
+ sta->llid, sta->plid, sta->reason);
+
+ /* also send confirm in open case */
+ if (action == WLAN_SP_MESH_PEERING_OPEN) {
+ mesh_plink_frame_tx(sdata,
+ WLAN_SP_MESH_PEERING_CONFIRM,
+ sta->sta.addr, sta->llid,
+ sta->plid, 0);
+ }
+ }
+
+ return changed;
+}
+
+/*
+ * mesh_plink_get_event - get correct MPM event
+ *
+ * @sdata: interface
+ * @sta: peer, leave NULL if processing a frame from a new suitable peer
+ * @elems: peering management IEs
+ * @ftype: frame type
+ * @llid: peer's peer link ID
+ * @plid: peer's local link ID
+ *
+ * Return: new peering event for @sta, but PLINK_UNDEFINED should be treated as
+ * an error.
+ */
+static enum plink_event
+mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_self_protected_actioncode ftype,
+ u16 llid, u16 plid)
+{
+ enum plink_event event = PLINK_UNDEFINED;
+ u8 ie_len = elems->peering_len;
+ bool matches_local;
+
+ matches_local = (ftype == WLAN_SP_MESH_PEERING_CLOSE ||
+ mesh_matches_local(sdata, elems));
+
+ /* deny open request from non-matching peer */
+ if (!matches_local && !sta) {
+ event = OPN_RJCT;
+ goto out;
+ }
+
+ if (!sta) {
+ if (ftype != WLAN_SP_MESH_PEERING_OPEN) {
+ mpl_dbg(sdata, "Mesh plink: cls or cnf from unknown peer\n");
+ goto out;
+ }
+ /* ftype == WLAN_SP_MESH_PEERING_OPEN */
+ if (!mesh_plink_free_count(sdata)) {
+ mpl_dbg(sdata, "Mesh plink error: no more free plinks\n");
+ goto out;
+ }
+ } else {
+ if (!test_sta_flag(sta, WLAN_STA_AUTH)) {
+ mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n");
+ goto out;
+ }
+ if (sta->plink_state == NL80211_PLINK_BLOCKED)
+ goto out;
+ }
+
+ /* new matching peer */
+ if (!sta) {
+ event = OPN_ACPT;
+ goto out;
+ }
+
+ switch (ftype) {
+ case WLAN_SP_MESH_PEERING_OPEN:
+ if (!matches_local)
+ event = OPN_RJCT;
+ if (!mesh_plink_free_count(sdata) ||
+ (sta->plid && sta->plid != plid))
+ event = OPN_IGNR;
+ else
+ event = OPN_ACPT;
+ break;
+ case WLAN_SP_MESH_PEERING_CONFIRM:
+ if (!matches_local)
+ event = CNF_RJCT;
+ if (!mesh_plink_free_count(sdata) ||
+ sta->llid != llid ||
+ (sta->plid && sta->plid != plid))
+ event = CNF_IGNR;
+ else
+ event = CNF_ACPT;
+ break;
+ case WLAN_SP_MESH_PEERING_CLOSE:
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ /* Do not check for llid or plid. This does not
+ * follow the standard but since multiple plinks
+ * per sta are not supported, it is necessary in
+ * order to avoid a livelock when MP A sees an
+ * establish peer link to MP B but MP B does not
+ * see it. This can be caused by a timeout in
+ * B's peer link establishment or B beign
+ * restarted.
+ */
+ event = CLS_ACPT;
+ else if (sta->plid != plid)
+ event = CLS_IGNR;
+ else if (ie_len == 8 && sta->llid != llid)
+ event = CLS_IGNR;
+ else
+ event = CLS_ACPT;
+ break;
+ default:
+ mpl_dbg(sdata, "Mesh plink: unknown frame subtype\n");
+ break;
+ }
+
+out:
+ return event;
+}
+
+static void
+mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ struct ieee802_11_elems *elems)
+{
+
+ struct sta_info *sta;
+ enum plink_event event;
+ enum ieee80211_self_protected_actioncode ftype;
+ u32 changed = 0;
+ u8 ie_len = elems->peering_len;
+ u16 plid, llid = 0;
+
+ if (!elems->peering) {
+ mpl_dbg(sdata,
+ "Mesh plink: missing necessary peer link ie\n");
+ return;
+ }
+
+ if (elems->rsn_len &&
+ sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) {
+ mpl_dbg(sdata,
+ "Mesh plink: can't establish link with secure peer\n");
+ return;
+ }
+
+ ftype = mgmt->u.action.u.self_prot.action_code;
+ if ((ftype == WLAN_SP_MESH_PEERING_OPEN && ie_len != 4) ||
+ (ftype == WLAN_SP_MESH_PEERING_CONFIRM && ie_len != 6) ||
+ (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len != 6
+ && ie_len != 8)) {
+ mpl_dbg(sdata,
+ "Mesh plink: incorrect plink ie length %d %d\n",
+ ftype, ie_len);
+ return;
+ }
+
+ if (ftype != WLAN_SP_MESH_PEERING_CLOSE &&
+ (!elems->mesh_id || !elems->mesh_config)) {
+ mpl_dbg(sdata, "Mesh plink: missing necessary ie\n");
+ return;
+ }
+ /* Note the lines below are correct, the llid in the frame is the plid
+ * from the point of view of this host.
+ */
+ plid = get_unaligned_le16(PLINK_GET_LLID(elems->peering));
+ if (ftype == WLAN_SP_MESH_PEERING_CONFIRM ||
+ (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len == 8))
+ llid = get_unaligned_le16(PLINK_GET_PLID(elems->peering));
+
+ /* WARNING: Only for sta pointer, is dropped & re-acquired */
+ rcu_read_lock();
+
+ sta = sta_info_get(sdata, mgmt->sa);
+
+ if (ftype == WLAN_SP_MESH_PEERING_OPEN &&
+ !rssi_threshold_check(sdata, sta)) {
+ mpl_dbg(sdata, "Mesh plink: %pM does not meet rssi threshold\n",
+ mgmt->sa);
+ goto unlock_rcu;
+ }
+
+ /* Now we will figure out the appropriate event... */
+ event = mesh_plink_get_event(sdata, sta, elems, ftype, llid, plid);
+
+ if (event == OPN_ACPT) {
+ rcu_read_unlock();
+ /* allocate sta entry if necessary and update info */
+ sta = mesh_sta_info_get(sdata, mgmt->sa, elems);
+ if (!sta) {
+ mpl_dbg(sdata, "Mesh plink: failed to init peer!\n");
+ goto unlock_rcu;
+ }
+ sta->plid = plid;
+ } else if (!sta && event == OPN_RJCT) {
+ mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE,
+ mgmt->sa, 0, plid,
+ WLAN_REASON_MESH_CONFIG);
+ goto unlock_rcu;
+ } else if (!sta || event == PLINK_UNDEFINED) {
+ /* something went wrong */
+ goto unlock_rcu;
+ }
+
+ /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */
+ if (!sta->plid && event == CNF_ACPT)
+ sta->plid = plid;
+
+ changed |= mesh_plink_fsm(sdata, sta, event);
+
+unlock_rcu:
+ rcu_read_unlock();
+
+ if (changed)
+ ieee80211_mbss_info_change_notify(sdata, changed);
+}
+
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee802_11_elems elems;
+ size_t baselen;
+ u8 *baseaddr;
+
+ /* need action_code, aux */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 3)
+ return;
+
+ if (sdata->u.mesh.user_mpm)
+ /* userspace must register for these */
+ return;
+
+ if (is_multicast_ether_addr(mgmt->da)) {
+ mpl_dbg(sdata,
+ "Mesh plink: ignore frame from multicast address\n");
+ return;
+ }
+
+ baseaddr = mgmt->u.action.u.self_prot.variable;
+ baselen = (u8 *) mgmt->u.action.u.self_prot.variable - (u8 *) mgmt;
+ if (mgmt->u.action.u.self_prot.action_code ==
+ WLAN_SP_MESH_PEERING_CONFIRM) {
+ baseaddr += 4;
+ baselen += 4;
+ }
+ ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems);
+ mesh_process_plink_frame(sdata, mgmt, &elems);
+}
diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c
new file mode 100644
index 000000000..ad8b377b4
--- /dev/null
+++ b/net/mac80211/mesh_ps.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright 2012-2013, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de>
+ * Copyright 2012-2013, cozybit Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "mesh.h"
+#include "wme.h"
+
+
+/* mesh PS management */
+
+/**
+ * mps_qos_null_get - create pre-addressed QoS Null frame for mesh powersave
+ */
+static struct sk_buff *mps_qos_null_get(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_hdr *nullfunc; /* use 4addr header */
+ struct sk_buff *skb;
+ int size = sizeof(*nullfunc);
+ __le16 fc;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + size + 2);
+ if (!skb)
+ return NULL;
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (struct ieee80211_hdr *) skb_put(skb, size);
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC);
+ ieee80211_fill_mesh_addresses(nullfunc, &fc, sta->sta.addr,
+ sdata->vif.addr);
+ nullfunc->frame_control = fc;
+ nullfunc->duration_id = 0;
+ nullfunc->seq_ctrl = 0;
+ /* no address resolution for this frame -> set addr 1 immediately */
+ memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN);
+ memset(skb_put(skb, 2), 0, 2); /* append QoS control field */
+ ieee80211_mps_set_frame_flags(sdata, sta, nullfunc);
+
+ return skb;
+}
+
+/**
+ * mps_qos_null_tx - send a QoS Null to indicate link-specific power mode
+ */
+static void mps_qos_null_tx(struct sta_info *sta)
+{
+ struct sk_buff *skb;
+
+ skb = mps_qos_null_get(sta);
+ if (!skb)
+ return;
+
+ mps_dbg(sta->sdata, "announcing peer-specific power mode to %pM\n",
+ sta->sta.addr);
+
+ /* don't unintentionally start a MPSP */
+ if (!test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ u8 *qc = ieee80211_get_qos_ctl((void *) skb->data);
+
+ qc[0] |= IEEE80211_QOS_CTL_EOSP;
+ }
+
+ ieee80211_tx_skb(sta->sdata, skb);
+}
+
+/**
+ * ieee80211_mps_local_status_update - track status of local link-specific PMs
+ *
+ * @sdata: local mesh subif
+ *
+ * sets the non-peer power mode and triggers the driver PS (re-)configuration
+ * Return BSS_CHANGED_BEACON if a beacon update is necessary.
+ */
+u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct sta_info *sta;
+ bool peering = false;
+ int light_sleep_cnt = 0;
+ int deep_sleep_cnt = 0;
+ u32 changed = 0;
+ enum nl80211_mesh_power_mode nonpeer_pm;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ if (sdata != sta->sdata)
+ continue;
+
+ switch (sta->plink_state) {
+ case NL80211_PLINK_OPN_SNT:
+ case NL80211_PLINK_OPN_RCVD:
+ case NL80211_PLINK_CNF_RCVD:
+ peering = true;
+ break;
+ case NL80211_PLINK_ESTAB:
+ if (sta->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP)
+ light_sleep_cnt++;
+ else if (sta->local_pm == NL80211_MESH_POWER_DEEP_SLEEP)
+ deep_sleep_cnt++;
+ break;
+ default:
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Set non-peer mode to active during peering/scanning/authentication
+ * (see IEEE802.11-2012 13.14.8.3). The non-peer mesh power mode is
+ * deep sleep if the local STA is in light or deep sleep towards at
+ * least one mesh peer (see 13.14.3.1). Otherwise, set it to the
+ * user-configured default value.
+ */
+ if (peering) {
+ mps_dbg(sdata, "setting non-peer PM to active for peering\n");
+ nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
+ } else if (light_sleep_cnt || deep_sleep_cnt) {
+ mps_dbg(sdata, "setting non-peer PM to deep sleep\n");
+ nonpeer_pm = NL80211_MESH_POWER_DEEP_SLEEP;
+ } else {
+ mps_dbg(sdata, "setting non-peer PM to user value\n");
+ nonpeer_pm = ifmsh->mshcfg.power_mode;
+ }
+
+ /* need update if sleep counts move between 0 and non-zero */
+ if (ifmsh->nonpeer_pm != nonpeer_pm ||
+ !ifmsh->ps_peers_light_sleep != !light_sleep_cnt ||
+ !ifmsh->ps_peers_deep_sleep != !deep_sleep_cnt)
+ changed = BSS_CHANGED_BEACON;
+
+ ifmsh->nonpeer_pm = nonpeer_pm;
+ ifmsh->ps_peers_light_sleep = light_sleep_cnt;
+ ifmsh->ps_peers_deep_sleep = deep_sleep_cnt;
+
+ return changed;
+}
+
+/**
+ * ieee80211_mps_set_sta_local_pm - set local PM towards a mesh STA
+ *
+ * @sta: mesh STA
+ * @pm: the power mode to set
+ * Return BSS_CHANGED_BEACON if a beacon update is in order.
+ */
+u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta,
+ enum nl80211_mesh_power_mode pm)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+ if (sta->local_pm == pm)
+ return 0;
+
+ mps_dbg(sdata, "local STA operates in mode %d with %pM\n",
+ pm, sta->sta.addr);
+
+ sta->local_pm = pm;
+
+ /*
+ * announce peer-specific power mode transition
+ * (see IEEE802.11-2012 13.14.3.2 and 13.14.3.3)
+ */
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ mps_qos_null_tx(sta);
+
+ return ieee80211_mps_local_status_update(sdata);
+}
+
+/**
+ * ieee80211_mps_set_frame_flags - set mesh PS flags in FC (and QoS Control)
+ *
+ * @sdata: local mesh subif
+ * @sta: mesh STA
+ * @hdr: 802.11 frame header
+ *
+ * see IEEE802.11-2012 8.2.4.1.7 and 8.2.4.5.11
+ *
+ * NOTE: sta must be given when an individually-addressed QoS frame header
+ * is handled, for group-addressed and management frames it is not used
+ */
+void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_hdr *hdr)
+{
+ enum nl80211_mesh_power_mode pm;
+ u8 *qc;
+
+ if (WARN_ON(is_unicast_ether_addr(hdr->addr1) &&
+ ieee80211_is_data_qos(hdr->frame_control) &&
+ !sta))
+ return;
+
+ if (is_unicast_ether_addr(hdr->addr1) &&
+ ieee80211_is_data_qos(hdr->frame_control) &&
+ sta->plink_state == NL80211_PLINK_ESTAB)
+ pm = sta->local_pm;
+ else
+ pm = sdata->u.mesh.nonpeer_pm;
+
+ if (pm == NL80211_MESH_POWER_ACTIVE)
+ hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_PM);
+ else
+ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+
+ if (!ieee80211_is_data_qos(hdr->frame_control))
+ return;
+
+ qc = ieee80211_get_qos_ctl(hdr);
+
+ if ((is_unicast_ether_addr(hdr->addr1) &&
+ pm == NL80211_MESH_POWER_DEEP_SLEEP) ||
+ (is_multicast_ether_addr(hdr->addr1) &&
+ sdata->u.mesh.ps_peers_deep_sleep > 0))
+ qc[1] |= (IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8);
+ else
+ qc[1] &= ~(IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8);
+}
+
+/**
+ * ieee80211_mps_sta_status_update - update buffering status of neighbor STA
+ *
+ * @sta: mesh STA
+ *
+ * called after change of peering status or non-peer/peer-specific power mode
+ */
+void ieee80211_mps_sta_status_update(struct sta_info *sta)
+{
+ enum nl80211_mesh_power_mode pm;
+ bool do_buffer;
+
+ /* For non-assoc STA, prevent buffering or frame transmission */
+ if (sta->sta_state < IEEE80211_STA_ASSOC)
+ return;
+
+ /*
+ * use peer-specific power mode if peering is established and the
+ * peer's power mode is known
+ */
+ if (sta->plink_state == NL80211_PLINK_ESTAB &&
+ sta->peer_pm != NL80211_MESH_POWER_UNKNOWN)
+ pm = sta->peer_pm;
+ else
+ pm = sta->nonpeer_pm;
+
+ do_buffer = (pm != NL80211_MESH_POWER_ACTIVE);
+
+ /* clear the MPSP flags for non-peers or active STA */
+ if (sta->plink_state != NL80211_PLINK_ESTAB) {
+ clear_sta_flag(sta, WLAN_STA_MPSP_OWNER);
+ clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
+ } else if (!do_buffer) {
+ clear_sta_flag(sta, WLAN_STA_MPSP_OWNER);
+ }
+
+ /* Don't let the same PS state be set twice */
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) == do_buffer)
+ return;
+
+ if (do_buffer) {
+ set_sta_flag(sta, WLAN_STA_PS_STA);
+ atomic_inc(&sta->sdata->u.mesh.ps.num_sta_ps);
+ mps_dbg(sta->sdata, "start PS buffering frames towards %pM\n",
+ sta->sta.addr);
+ } else {
+ ieee80211_sta_ps_deliver_wakeup(sta);
+ }
+}
+
+static void mps_set_sta_peer_pm(struct sta_info *sta,
+ struct ieee80211_hdr *hdr)
+{
+ enum nl80211_mesh_power_mode pm;
+ u8 *qc = ieee80211_get_qos_ctl(hdr);
+
+ /*
+ * Test Power Management field of frame control (PW) and
+ * mesh power save level subfield of QoS control field (PSL)
+ *
+ * | PM | PSL| Mesh PM |
+ * +----+----+---------+
+ * | 0 |Rsrv| Active |
+ * | 1 | 0 | Light |
+ * | 1 | 1 | Deep |
+ */
+ if (ieee80211_has_pm(hdr->frame_control)) {
+ if (qc[1] & (IEEE80211_QOS_CTL_MESH_PS_LEVEL >> 8))
+ pm = NL80211_MESH_POWER_DEEP_SLEEP;
+ else
+ pm = NL80211_MESH_POWER_LIGHT_SLEEP;
+ } else {
+ pm = NL80211_MESH_POWER_ACTIVE;
+ }
+
+ if (sta->peer_pm == pm)
+ return;
+
+ mps_dbg(sta->sdata, "STA %pM enters mode %d\n",
+ sta->sta.addr, pm);
+
+ sta->peer_pm = pm;
+
+ ieee80211_mps_sta_status_update(sta);
+}
+
+static void mps_set_sta_nonpeer_pm(struct sta_info *sta,
+ struct ieee80211_hdr *hdr)
+{
+ enum nl80211_mesh_power_mode pm;
+
+ if (ieee80211_has_pm(hdr->frame_control))
+ pm = NL80211_MESH_POWER_DEEP_SLEEP;
+ else
+ pm = NL80211_MESH_POWER_ACTIVE;
+
+ if (sta->nonpeer_pm == pm)
+ return;
+
+ mps_dbg(sta->sdata, "STA %pM sets non-peer mode to %d\n",
+ sta->sta.addr, pm);
+
+ sta->nonpeer_pm = pm;
+
+ ieee80211_mps_sta_status_update(sta);
+}
+
+/**
+ * ieee80211_mps_rx_h_sta_process - frame receive handler for mesh powersave
+ *
+ * @sta: STA info that transmitted the frame
+ * @hdr: IEEE 802.11 (QoS) Header
+ */
+void ieee80211_mps_rx_h_sta_process(struct sta_info *sta,
+ struct ieee80211_hdr *hdr)
+{
+ if (is_unicast_ether_addr(hdr->addr1) &&
+ ieee80211_is_data_qos(hdr->frame_control)) {
+ /*
+ * individually addressed QoS Data/Null frames contain
+ * peer link-specific PS mode towards the local STA
+ */
+ mps_set_sta_peer_pm(sta, hdr);
+
+ /* check for mesh Peer Service Period trigger frames */
+ ieee80211_mpsp_trigger_process(ieee80211_get_qos_ctl(hdr),
+ sta, false, false);
+ } else {
+ /*
+ * can only determine non-peer PS mode
+ * (see IEEE802.11-2012 8.2.4.1.7)
+ */
+ mps_set_sta_nonpeer_pm(sta, hdr);
+ }
+}
+
+
+/* mesh PS frame release */
+
+static void mpsp_trigger_send(struct sta_info *sta, bool rspi, bool eosp)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct sk_buff *skb;
+ struct ieee80211_hdr *nullfunc;
+ struct ieee80211_tx_info *info;
+ u8 *qc;
+
+ skb = mps_qos_null_get(sta);
+ if (!skb)
+ return;
+
+ nullfunc = (struct ieee80211_hdr *) skb->data;
+ if (!eosp)
+ nullfunc->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ /*
+ * | RSPI | EOSP | MPSP triggering |
+ * +------+------+--------------------+
+ * | 0 | 0 | local STA is owner |
+ * | 0 | 1 | no MPSP (MPSP end) |
+ * | 1 | 0 | both STA are owner |
+ * | 1 | 1 | peer STA is owner | see IEEE802.11-2012 13.14.9.2
+ */
+ qc = ieee80211_get_qos_ctl(nullfunc);
+ if (rspi)
+ qc[1] |= (IEEE80211_QOS_CTL_RSPI >> 8);
+ if (eosp)
+ qc[0] |= IEEE80211_QOS_CTL_EOSP;
+
+ info = IEEE80211_SKB_CB(skb);
+
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ mps_dbg(sdata, "sending MPSP trigger%s%s to %pM\n",
+ rspi ? " RSPI" : "", eosp ? " EOSP" : "", sta->sta.addr);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+/**
+ * mpsp_qos_null_append - append QoS Null frame to MPSP skb queue if needed
+ *
+ * To properly end a mesh MPSP the last transmitted frame has to set the EOSP
+ * flag in the QoS Control field. In case the current tailing frame is not a
+ * QoS Data frame, append a QoS Null to carry the flag.
+ */
+static void mpsp_qos_null_append(struct sta_info *sta,
+ struct sk_buff_head *frames)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct sk_buff *new_skb, *skb = skb_peek_tail(frames);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info;
+
+ if (ieee80211_is_data_qos(hdr->frame_control))
+ return;
+
+ new_skb = mps_qos_null_get(sta);
+ if (!new_skb)
+ return;
+
+ mps_dbg(sdata, "appending QoS Null in MPSP towards %pM\n",
+ sta->sta.addr);
+ /*
+ * This frame has to be transmitted last. Assign lowest priority to
+ * make sure it cannot pass other frames when releasing multiple ACs.
+ */
+ new_skb->priority = 1;
+ skb_set_queue_mapping(new_skb, IEEE80211_AC_BK);
+ ieee80211_set_qos_hdr(sdata, new_skb);
+
+ info = IEEE80211_SKB_CB(new_skb);
+ info->control.vif = &sdata->vif;
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+
+ __skb_queue_tail(frames, new_skb);
+}
+
+/**
+ * mps_frame_deliver - transmit frames during mesh powersave
+ *
+ * @sta: STA info to transmit to
+ * @n_frames: number of frames to transmit. -1 for all
+ */
+static void mps_frame_deliver(struct sta_info *sta, int n_frames)
+{
+ struct ieee80211_local *local = sta->sdata->local;
+ int ac;
+ struct sk_buff_head frames;
+ struct sk_buff *skb;
+ bool more_data = false;
+
+ skb_queue_head_init(&frames);
+
+ /* collect frame(s) from buffers */
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ while (n_frames != 0) {
+ skb = skb_dequeue(&sta->tx_filtered[ac]);
+ if (!skb) {
+ skb = skb_dequeue(
+ &sta->ps_tx_buf[ac]);
+ if (skb)
+ local->total_ps_buffered--;
+ }
+ if (!skb)
+ break;
+ n_frames--;
+ __skb_queue_tail(&frames, skb);
+ }
+
+ if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
+ !skb_queue_empty(&sta->ps_tx_buf[ac]))
+ more_data = true;
+ }
+
+ /* nothing to send? -> EOSP */
+ if (skb_queue_empty(&frames)) {
+ mpsp_trigger_send(sta, false, true);
+ return;
+ }
+
+ /* in a MPSP make sure the last skb is a QoS Data frame */
+ if (test_sta_flag(sta, WLAN_STA_MPSP_OWNER))
+ mpsp_qos_null_append(sta, &frames);
+
+ mps_dbg(sta->sdata, "sending %d frames to PS STA %pM\n",
+ skb_queue_len(&frames), sta->sta.addr);
+
+ /* prepare collected frames for transmission */
+ skb_queue_walk(&frames, skb) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *) skb->data;
+
+ /*
+ * Tell TX path to send this frame even though the
+ * STA may still remain is PS mode after this frame
+ * exchange.
+ */
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
+
+ if (more_data || !skb_queue_is_last(&frames, skb))
+ hdr->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ else
+ hdr->frame_control &=
+ cpu_to_le16(~IEEE80211_FCTL_MOREDATA);
+
+ if (skb_queue_is_last(&frames, skb) &&
+ ieee80211_is_data_qos(hdr->frame_control)) {
+ u8 *qoshdr = ieee80211_get_qos_ctl(hdr);
+
+ /* MPSP trigger frame ends service period */
+ *qoshdr |= IEEE80211_QOS_CTL_EOSP;
+ info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+ }
+ }
+
+ ieee80211_add_pending_skbs(local, &frames);
+ sta_info_recalc_tim(sta);
+}
+
+/**
+ * ieee80211_mpsp_trigger_process - track status of mesh Peer Service Periods
+ *
+ * @qc: QoS Control field
+ * @sta: peer to start a MPSP with
+ * @tx: frame was transmitted by the local STA
+ * @acked: frame has been transmitted successfully
+ *
+ * NOTE: active mode STA may only serve as MPSP owner
+ */
+void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta,
+ bool tx, bool acked)
+{
+ u8 rspi = qc[1] & (IEEE80211_QOS_CTL_RSPI >> 8);
+ u8 eosp = qc[0] & IEEE80211_QOS_CTL_EOSP;
+
+ if (tx) {
+ if (rspi && acked)
+ set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
+
+ if (eosp)
+ clear_sta_flag(sta, WLAN_STA_MPSP_OWNER);
+ else if (acked &&
+ test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER))
+ mps_frame_deliver(sta, -1);
+ } else {
+ if (eosp)
+ clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
+ else if (sta->local_pm != NL80211_MESH_POWER_ACTIVE)
+ set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT);
+
+ if (rspi && !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER))
+ mps_frame_deliver(sta, -1);
+ }
+}
+
+/**
+ * ieee80211_mps_frame_release - release frames buffered due to mesh power save
+ *
+ * @sta: mesh STA
+ * @elems: IEs of beacon or probe response
+ *
+ * For peers if we have individually-addressed frames buffered or the peer
+ * indicates buffered frames, send a corresponding MPSP trigger frame. Since
+ * we do not evaluate the awake window duration, QoS Nulls are used as MPSP
+ * trigger frames. If the neighbour STA is not a peer, only send single frames.
+ */
+void ieee80211_mps_frame_release(struct sta_info *sta,
+ struct ieee802_11_elems *elems)
+{
+ int ac, buffer_local = 0;
+ bool has_buffered = false;
+
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len,
+ sta->llid);
+
+ if (has_buffered)
+ mps_dbg(sta->sdata, "%pM indicates buffered frames\n",
+ sta->sta.addr);
+
+ /* only transmit to PS STA with announced, non-zero awake window */
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ (!elems->awake_window || !le16_to_cpu(*elems->awake_window)))
+ return;
+
+ if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER))
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ buffer_local += skb_queue_len(&sta->ps_tx_buf[ac]) +
+ skb_queue_len(&sta->tx_filtered[ac]);
+
+ if (!has_buffered && !buffer_local)
+ return;
+
+ if (sta->plink_state == NL80211_PLINK_ESTAB)
+ mpsp_trigger_send(sta, has_buffered, !buffer_local);
+ else
+ mps_frame_deliver(sta, 1);
+}
diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
new file mode 100644
index 000000000..09625d620
--- /dev/null
+++ b/net/mac80211/mesh_sync.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright 2011-2012, Pavel Zubarev <pavel.zubarev@gmail.com>
+ * Copyright 2011-2012, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de>
+ * Copyright 2011-2012, cozybit Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "ieee80211_i.h"
+#include "mesh.h"
+#include "driver-ops.h"
+
+/* This is not in the standard. It represents a tolerable tbtt drift below
+ * which we do no TSF adjustment.
+ */
+#define TOFFSET_MINIMUM_ADJUSTMENT 10
+
+/* This is not in the standard. It is a margin added to the
+ * Toffset setpoint to mitigate TSF overcorrection
+ * introduced by TSF adjustment latency.
+ */
+#define TOFFSET_SET_MARGIN 20
+
+/* This is not in the standard. It represents the maximum Toffset jump above
+ * which we'll invalidate the Toffset setpoint and choose a new setpoint. This
+ * could be, for instance, in case a neighbor is restarted and its TSF counter
+ * reset.
+ */
+#define TOFFSET_MAXIMUM_ADJUSTMENT 30000 /* 30 ms */
+
+struct sync_method {
+ u8 method;
+ struct ieee80211_mesh_sync_ops ops;
+};
+
+/**
+ * mesh_peer_tbtt_adjusting - check if an mp is currently adjusting its TBTT
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ */
+static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie)
+{
+ return (ie->mesh_config->meshconf_cap &
+ IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0;
+}
+
+void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ /* sdata->vif.bss_conf.beacon_int in 1024us units, 0.04% */
+ u64 beacon_int_fraction = sdata->vif.bss_conf.beacon_int * 1024 / 2500;
+ u64 tsf;
+ u64 tsfdelta;
+
+ spin_lock_bh(&ifmsh->sync_offset_lock);
+ if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
+ msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n",
+ (long long) ifmsh->sync_offset_clockdrift_max);
+ tsfdelta = -ifmsh->sync_offset_clockdrift_max;
+ ifmsh->sync_offset_clockdrift_max = 0;
+ } else {
+ msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n",
+ (long long) ifmsh->sync_offset_clockdrift_max,
+ (unsigned long long) beacon_int_fraction);
+ tsfdelta = -beacon_int_fraction;
+ ifmsh->sync_offset_clockdrift_max -= beacon_int_fraction;
+ }
+ spin_unlock_bh(&ifmsh->sync_offset_lock);
+
+ tsf = drv_get_tsf(local, sdata);
+ if (tsf != -1ULL)
+ drv_set_tsf(local, sdata, tsf + tsfdelta);
+}
+
+static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
+ u16 stype,
+ struct ieee80211_mgmt *mgmt,
+ struct ieee802_11_elems *elems,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ u64 t_t, t_r;
+
+ WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
+
+ /* standard mentions only beacons */
+ if (stype != IEEE80211_STYPE_BEACON)
+ return;
+
+ /*
+ * Get time when timestamp field was received. If we don't
+ * have rx timestamps, then use current tsf as an approximation.
+ * drv_get_tsf() must be called before entering the rcu-read
+ * section.
+ */
+ if (ieee80211_have_rx_timestamp(rx_status))
+ t_r = ieee80211_calculate_rx_timestamp(local, rx_status,
+ 24 + 12 +
+ elems->total_len +
+ FCS_LEN,
+ 24);
+ else
+ t_r = drv_get_tsf(local, sdata);
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->sa);
+ if (!sta)
+ goto no_sync;
+
+ /* check offset sync conditions (13.13.2.2.1)
+ *
+ * TODO also sync to
+ * dot11MeshNbrOffsetMaxNeighbor non-peer non-MBSS neighbors
+ */
+
+ if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) {
+ clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
+ msync_dbg(sdata, "STA %pM : is adjusting TBTT\n",
+ sta->sta.addr);
+ goto no_sync;
+ }
+
+ /* Timing offset calculation (see 13.13.2.2.2) */
+ t_t = le64_to_cpu(mgmt->u.beacon.timestamp);
+ sta->t_offset = t_t - t_r;
+
+ if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
+ s64 t_clockdrift = sta->t_offset_setpoint - sta->t_offset;
+ msync_dbg(sdata,
+ "STA %pM : sta->t_offset=%lld, sta->t_offset_setpoint=%lld, t_clockdrift=%lld\n",
+ sta->sta.addr, (long long) sta->t_offset,
+ (long long) sta->t_offset_setpoint,
+ (long long) t_clockdrift);
+
+ if (t_clockdrift > TOFFSET_MAXIMUM_ADJUSTMENT ||
+ t_clockdrift < -TOFFSET_MAXIMUM_ADJUSTMENT) {
+ msync_dbg(sdata,
+ "STA %pM : t_clockdrift=%lld too large, setpoint reset\n",
+ sta->sta.addr,
+ (long long) t_clockdrift);
+ clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
+ goto no_sync;
+ }
+
+ spin_lock_bh(&ifmsh->sync_offset_lock);
+ if (t_clockdrift > ifmsh->sync_offset_clockdrift_max)
+ ifmsh->sync_offset_clockdrift_max = t_clockdrift;
+ spin_unlock_bh(&ifmsh->sync_offset_lock);
+ } else {
+ sta->t_offset_setpoint = sta->t_offset - TOFFSET_SET_MARGIN;
+ set_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN);
+ msync_dbg(sdata,
+ "STA %pM : offset was invalid, sta->t_offset=%lld\n",
+ sta->sta.addr,
+ (long long) sta->t_offset);
+ }
+
+no_sync:
+ rcu_read_unlock();
+}
+
+static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon)
+{
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u8 cap;
+
+ WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET);
+ WARN_ON(!rcu_read_lock_held());
+ cap = beacon->meshconf->meshconf_cap;
+
+ spin_lock_bh(&ifmsh->sync_offset_lock);
+
+ if (ifmsh->sync_offset_clockdrift_max > TOFFSET_MINIMUM_ADJUSTMENT) {
+ /* Since ajusting the tsf here would
+ * require a possibly blocking call
+ * to the driver tsf setter, we punt
+ * the tsf adjustment to the mesh tasklet
+ */
+ msync_dbg(sdata,
+ "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n",
+ ifmsh->sync_offset_clockdrift_max);
+ set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags);
+
+ ifmsh->adjusting_tbtt = true;
+ } else {
+ msync_dbg(sdata,
+ "TBTT : max clockdrift=%lld; too small to adjust\n",
+ (long long)ifmsh->sync_offset_clockdrift_max);
+ ifmsh->sync_offset_clockdrift_max = 0;
+
+ ifmsh->adjusting_tbtt = false;
+ }
+ spin_unlock_bh(&ifmsh->sync_offset_lock);
+
+ beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ?
+ IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap :
+ ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap;
+}
+
+static const struct sync_method sync_methods[] = {
+ {
+ .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
+ .ops = {
+ .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp,
+ .adjust_tbtt = &mesh_sync_offset_adjust_tbtt,
+ }
+ },
+};
+
+const struct ieee80211_mesh_sync_ops *ieee80211_mesh_sync_ops_get(u8 method)
+{
+ int i;
+
+ for (i = 0 ; i < ARRAY_SIZE(sync_methods); ++i) {
+ if (sync_methods[i].method == method)
+ return &sync_methods[i].ops;
+ }
+ return NULL;
+}
diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c
new file mode 100644
index 000000000..408649bd4
--- /dev/null
+++ b/net/mac80211/michael.c
@@ -0,0 +1,86 @@
+/*
+ * Michael MIC implementation - optimized for TKIP MIC operations
+ * Copyright 2002-2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/ieee80211.h>
+#include <asm/unaligned.h>
+
+#include "michael.h"
+
+static void michael_block(struct michael_mic_ctx *mctx, u32 val)
+{
+ mctx->l ^= val;
+ mctx->r ^= rol32(mctx->l, 17);
+ mctx->l += mctx->r;
+ mctx->r ^= ((mctx->l & 0xff00ff00) >> 8) |
+ ((mctx->l & 0x00ff00ff) << 8);
+ mctx->l += mctx->r;
+ mctx->r ^= rol32(mctx->l, 3);
+ mctx->l += mctx->r;
+ mctx->r ^= ror32(mctx->l, 2);
+ mctx->l += mctx->r;
+}
+
+static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key,
+ struct ieee80211_hdr *hdr)
+{
+ u8 *da, *sa, tid;
+
+ da = ieee80211_get_DA(hdr);
+ sa = ieee80211_get_SA(hdr);
+ if (ieee80211_is_data_qos(hdr->frame_control))
+ tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+ else
+ tid = 0;
+
+ mctx->l = get_unaligned_le32(key);
+ mctx->r = get_unaligned_le32(key + 4);
+
+ /*
+ * A pseudo header (DA, SA, Priority, 0, 0, 0) is used in Michael MIC
+ * calculation, but it is _not_ transmitted
+ */
+ michael_block(mctx, get_unaligned_le32(da));
+ michael_block(mctx, get_unaligned_le16(&da[4]) |
+ (get_unaligned_le16(sa) << 16));
+ michael_block(mctx, get_unaligned_le32(&sa[2]));
+ michael_block(mctx, tid);
+}
+
+void michael_mic(const u8 *key, struct ieee80211_hdr *hdr,
+ const u8 *data, size_t data_len, u8 *mic)
+{
+ u32 val;
+ size_t block, blocks, left;
+ struct michael_mic_ctx mctx;
+
+ michael_mic_hdr(&mctx, key, hdr);
+
+ /* Real data */
+ blocks = data_len / 4;
+ left = data_len % 4;
+
+ for (block = 0; block < blocks; block++)
+ michael_block(&mctx, get_unaligned_le32(&data[block * 4]));
+
+ /* Partial block of 0..3 bytes and padding: 0x5a + 4..7 zeros to make
+ * total length a multiple of 4. */
+ val = 0x5a;
+ while (left > 0) {
+ val <<= 8;
+ left--;
+ val |= data[blocks * 4 + left];
+ }
+
+ michael_block(&mctx, val);
+ michael_block(&mctx, 0);
+
+ put_unaligned_le32(mctx.l, mic);
+ put_unaligned_le32(mctx.r, mic + 4);
+}
diff --git a/net/mac80211/michael.h b/net/mac80211/michael.h
new file mode 100644
index 000000000..0e4886f88
--- /dev/null
+++ b/net/mac80211/michael.h
@@ -0,0 +1,25 @@
+/*
+ * Michael MIC implementation - optimized for TKIP MIC operations
+ * Copyright 2002-2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MICHAEL_H
+#define MICHAEL_H
+
+#include <linux/types.h>
+#include <linux/ieee80211.h>
+
+#define MICHAEL_MIC_LEN 8
+
+struct michael_mic_ctx {
+ u32 l, r;
+};
+
+void michael_mic(const u8 *key, struct ieee80211_hdr *hdr,
+ const u8 *data, size_t data_len, u8 *mic);
+
+#endif /* MICHAEL_H */
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
new file mode 100644
index 000000000..26053bf2f
--- /dev/null
+++ b/net/mac80211/mlme.c
@@ -0,0 +1,5053 @@
+/*
+ * BSS client mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/rtnetlink.h>
+#include <linux/pm_qos.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "led.h"
+
+#define IEEE80211_AUTH_TIMEOUT (HZ / 5)
+#define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2)
+#define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10)
+#define IEEE80211_AUTH_MAX_TRIES 3
+#define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5)
+#define IEEE80211_ASSOC_TIMEOUT (HZ / 5)
+#define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2)
+#define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10)
+#define IEEE80211_ASSOC_MAX_TRIES 3
+
+static int max_nullfunc_tries = 2;
+module_param(max_nullfunc_tries, int, 0644);
+MODULE_PARM_DESC(max_nullfunc_tries,
+ "Maximum nullfunc tx tries before disconnecting (reason 4).");
+
+static int max_probe_tries = 5;
+module_param(max_probe_tries, int, 0644);
+MODULE_PARM_DESC(max_probe_tries,
+ "Maximum probe tries before disconnecting (reason 4).");
+
+/*
+ * Beacon loss timeout is calculated as N frames times the
+ * advertised beacon interval. This may need to be somewhat
+ * higher than what hardware might detect to account for
+ * delays in the host processing frames. But since we also
+ * probe on beacon miss before declaring the connection lost
+ * default to what we want.
+ */
+static int beacon_loss_count = 7;
+module_param(beacon_loss_count, int, 0644);
+MODULE_PARM_DESC(beacon_loss_count,
+ "Number of beacon intervals before we decide beacon was lost.");
+
+/*
+ * Time the connection can be idle before we probe
+ * it to see if we can still talk to the AP.
+ */
+#define IEEE80211_CONNECTION_IDLE_TIME (30 * HZ)
+/*
+ * Time we wait for a probe response after sending
+ * a probe request because of beacon loss or for
+ * checking the connection still works.
+ */
+static int probe_wait_ms = 500;
+module_param(probe_wait_ms, int, 0644);
+MODULE_PARM_DESC(probe_wait_ms,
+ "Maximum time(ms) to wait for probe response"
+ " before disconnecting (reason 4).");
+
+/*
+ * Weight given to the latest Beacon frame when calculating average signal
+ * strength for Beacon frames received in the current BSS. This must be
+ * between 1 and 15.
+ */
+#define IEEE80211_SIGNAL_AVE_WEIGHT 3
+
+/*
+ * How many Beacon frames need to have been used in average signal strength
+ * before starting to indicate signal change events.
+ */
+#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4
+
+/*
+ * We can have multiple work items (and connection probing)
+ * scheduling this timer, but we need to take care to only
+ * reschedule it when it should fire _earlier_ than it was
+ * asked for before, or if it's not pending right now. This
+ * function ensures that. Note that it then is required to
+ * run this function for all timeouts after the first one
+ * has happened -- the work that runs from this timer will
+ * do that.
+ */
+static void run_again(struct ieee80211_sub_if_data *sdata,
+ unsigned long timeout)
+{
+ sdata_assert_lock(sdata);
+
+ if (!timer_pending(&sdata->u.mgd.timer) ||
+ time_before(timeout, sdata->u.mgd.timer.expires))
+ mod_timer(&sdata->u.mgd.timer, timeout);
+}
+
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
+{
+ if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)
+ return;
+
+ if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ return;
+
+ mod_timer(&sdata->u.mgd.bcn_mon_timer,
+ round_jiffies_up(jiffies + sdata->u.mgd.beacon_timeout));
+}
+
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ if (unlikely(!sdata->u.mgd.associated))
+ return;
+
+ ifmgd->probe_send_count = 0;
+
+ if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ return;
+
+ mod_timer(&sdata->u.mgd.conn_mon_timer,
+ round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
+}
+
+static int ecw2cw(int ecw)
+{
+ return (1 << ecw) - 1;
+}
+
+static u32
+ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_channel *channel,
+ const struct ieee80211_ht_cap *ht_cap,
+ const struct ieee80211_ht_operation *ht_oper,
+ const struct ieee80211_vht_operation *vht_oper,
+ struct cfg80211_chan_def *chandef, bool tracking)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct cfg80211_chan_def vht_chandef;
+ struct ieee80211_sta_ht_cap sta_ht_cap;
+ u32 ht_cfreq, ret;
+
+ memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
+ ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
+
+ chandef->chan = channel;
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ chandef->center_freq1 = channel->center_freq;
+ chandef->center_freq2 = 0;
+
+ if (!ht_cap || !ht_oper || !sta_ht_cap.ht_supported) {
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ chandef->width = NL80211_CHAN_WIDTH_20;
+
+ if (!(ht_cap->cap_info &
+ cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) {
+ ret = IEEE80211_STA_DISABLE_40MHZ;
+ vht_chandef = *chandef;
+ goto out;
+ }
+
+ ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan,
+ channel->band);
+ /* check that channel matches the right operating channel */
+ if (!tracking && channel->center_freq != ht_cfreq) {
+ /*
+ * It's possible that some APs are confused here;
+ * Netgear WNDR3700 sometimes reports 4 higher than
+ * the actual channel in association responses, but
+ * since we look at probe response/beacon data here
+ * it should be OK.
+ */
+ sdata_info(sdata,
+ "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n",
+ channel->center_freq, ht_cfreq,
+ ht_oper->primary_chan, channel->band);
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ /* check 40 MHz support, if we have it */
+ if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) {
+ switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ chandef->width = NL80211_CHAN_WIDTH_40;
+ chandef->center_freq1 += 10;
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ chandef->width = NL80211_CHAN_WIDTH_40;
+ chandef->center_freq1 -= 10;
+ break;
+ }
+ } else {
+ /* 40 MHz (and 80 MHz) must be supported for VHT */
+ ret = IEEE80211_STA_DISABLE_VHT;
+ /* also mark 40 MHz disabled */
+ ret |= IEEE80211_STA_DISABLE_40MHZ;
+ goto out;
+ }
+
+ if (!vht_oper || !sband->vht_cap.vht_supported) {
+ ret = IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ vht_chandef.chan = channel;
+ vht_chandef.center_freq1 =
+ ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx,
+ channel->band);
+ vht_chandef.center_freq2 = 0;
+
+ switch (vht_oper->chan_width) {
+ case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ vht_chandef.width = chandef->width;
+ vht_chandef.center_freq1 = chandef->center_freq1;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80MHZ:
+ vht_chandef.width = NL80211_CHAN_WIDTH_80;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_160MHZ:
+ vht_chandef.width = NL80211_CHAN_WIDTH_160;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+ vht_chandef.width = NL80211_CHAN_WIDTH_80P80;
+ vht_chandef.center_freq2 =
+ ieee80211_channel_to_frequency(
+ vht_oper->center_freq_seg2_idx,
+ channel->band);
+ break;
+ default:
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ sdata_info(sdata,
+ "AP VHT operation IE has invalid channel width (%d), disable VHT\n",
+ vht_oper->chan_width);
+ ret = IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ if (!cfg80211_chandef_valid(&vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ sdata_info(sdata,
+ "AP VHT information is invalid, disable VHT\n");
+ ret = IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ if (cfg80211_chandef_identical(chandef, &vht_chandef)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ sdata_info(sdata,
+ "AP VHT information doesn't match HT, disable VHT\n");
+ ret = IEEE80211_STA_DISABLE_VHT;
+ goto out;
+ }
+
+ *chandef = vht_chandef;
+
+ ret = 0;
+
+out:
+ /*
+ * When tracking the current AP, don't do any further checks if the
+ * new chandef is identical to the one we're currently using for the
+ * connection. This keeps us from playing ping-pong with regulatory,
+ * without it the following can happen (for example):
+ * - connect to an AP with 80 MHz, world regdom allows 80 MHz
+ * - AP advertises regdom US
+ * - CRDA loads regdom US with 80 MHz prohibited (old database)
+ * - the code below detects an unsupported channel, downgrades, and
+ * we disconnect from the AP in the caller
+ * - disconnect causes CRDA to reload world regdomain and the game
+ * starts anew.
+ * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881)
+ *
+ * It seems possible that there are still scenarios with CSA or real
+ * bandwidth changes where a this could happen, but those cases are
+ * less common and wouldn't completely prevent using the AP.
+ */
+ if (tracking &&
+ cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef))
+ return ret;
+
+ /* don't print the message below for VHT mismatch if VHT is disabled */
+ if (ret & IEEE80211_STA_DISABLE_VHT)
+ vht_chandef = *chandef;
+
+ /*
+ * Ignore the DISABLED flag when we're already connected and only
+ * tracking the APs beacon for bandwidth changes - otherwise we
+ * might get disconnected here if we connect to an AP, update our
+ * regulatory information based on the AP's country IE and the
+ * information we have is wrong/outdated and disables the channel
+ * that we're actually using for the connection to the AP.
+ */
+ while (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+ tracking ? 0 :
+ IEEE80211_CHAN_DISABLED)) {
+ if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
+ ret = IEEE80211_STA_DISABLE_HT |
+ IEEE80211_STA_DISABLE_VHT;
+ break;
+ }
+
+ ret |= ieee80211_chandef_downgrade(chandef);
+ }
+
+ if (chandef->width != vht_chandef.width && !tracking)
+ sdata_info(sdata,
+ "capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n");
+
+ WARN_ON_ONCE(!cfg80211_chandef_valid(chandef));
+ return ret;
+}
+
+static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ const struct ieee80211_ht_cap *ht_cap,
+ const struct ieee80211_ht_operation *ht_oper,
+ const struct ieee80211_vht_operation *vht_oper,
+ const u8 *bssid, u32 *changed)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+ struct cfg80211_chan_def chandef;
+ u16 ht_opmode;
+ u32 flags;
+ enum ieee80211_sta_rx_bandwidth new_sta_bw;
+ int ret;
+
+ /* if HT was/is disabled, don't track any bandwidth changes */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || !ht_oper)
+ return 0;
+
+ /* don't check VHT if we associated as non-VHT station */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
+ vht_oper = NULL;
+
+ if (WARN_ON_ONCE(!sta))
+ return -EINVAL;
+
+ /*
+ * if bss configuration changed store the new one -
+ * this may be applicable even if channel is identical
+ */
+ ht_opmode = le16_to_cpu(ht_oper->operation_mode);
+ if (sdata->vif.bss_conf.ht_operation_mode != ht_opmode) {
+ *changed |= BSS_CHANGED_HT;
+ sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
+ }
+
+ chan = sdata->vif.bss_conf.chandef.chan;
+ sband = local->hw.wiphy->bands[chan->band];
+
+ /* calculate new channel (type) based on HT/VHT operation IEs */
+ flags = ieee80211_determine_chantype(sdata, sband, chan,
+ ht_cap, ht_oper, vht_oper,
+ &chandef, true);
+
+ /*
+ * Downgrade the new channel if we associated with restricted
+ * capabilities. For example, if we associated as a 20 MHz STA
+ * to a 40 MHz AP (due to regulatory, capabilities or config
+ * reasons) then switching to a 40 MHz channel now won't do us
+ * any good -- we couldn't use it with the AP.
+ */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_80P80MHZ &&
+ chandef.width == NL80211_CHAN_WIDTH_80P80)
+ flags |= ieee80211_chandef_downgrade(&chandef);
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_160MHZ &&
+ chandef.width == NL80211_CHAN_WIDTH_160)
+ flags |= ieee80211_chandef_downgrade(&chandef);
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_40MHZ &&
+ chandef.width > NL80211_CHAN_WIDTH_20)
+ flags |= ieee80211_chandef_downgrade(&chandef);
+
+ if (cfg80211_chandef_identical(&chandef, &sdata->vif.bss_conf.chandef))
+ return 0;
+
+ sdata_info(sdata,
+ "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d MHz)\n",
+ ifmgd->bssid, chandef.chan->center_freq, chandef.width,
+ chandef.center_freq1, chandef.center_freq2);
+
+ if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT |
+ IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_40MHZ |
+ IEEE80211_STA_DISABLE_80P80MHZ |
+ IEEE80211_STA_DISABLE_160MHZ)) ||
+ !cfg80211_chandef_valid(&chandef)) {
+ sdata_info(sdata,
+ "AP %pM changed bandwidth in a way we can't support - disconnect\n",
+ ifmgd->bssid);
+ return -EINVAL;
+ }
+
+ switch (chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ new_sta_bw = IEEE80211_STA_RX_BW_20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ new_sta_bw = IEEE80211_STA_RX_BW_40;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ new_sta_bw = IEEE80211_STA_RX_BW_80;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ new_sta_bw = IEEE80211_STA_RX_BW_160;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (new_sta_bw > sta->cur_max_bandwidth)
+ new_sta_bw = sta->cur_max_bandwidth;
+
+ if (new_sta_bw < sta->sta.bandwidth) {
+ sta->sta.bandwidth = new_sta_bw;
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+
+ ret = ieee80211_vif_change_bandwidth(sdata, &chandef, changed);
+ if (ret) {
+ sdata_info(sdata,
+ "AP %pM changed bandwidth to incompatible one - disconnect\n",
+ ifmgd->bssid);
+ return ret;
+ }
+
+ if (new_sta_bw > sta->sta.bandwidth) {
+ sta->sta.bandwidth = new_sta_bw;
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+
+ return 0;
+}
+
+/* frame sending functions */
+
+static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u8 ap_ht_param,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_channel *channel,
+ enum ieee80211_smps_mode smps)
+{
+ u8 *pos;
+ u32 flags = channel->flags;
+ u16 cap;
+ struct ieee80211_sta_ht_cap ht_cap;
+
+ BUILD_BUG_ON(sizeof(ht_cap) != sizeof(sband->ht_cap));
+
+ memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap));
+ ieee80211_apply_htcap_overrides(sdata, &ht_cap);
+
+ /* determine capability flags */
+ cap = ht_cap.cap;
+
+ switch (ap_ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ if (flags & IEEE80211_CHAN_NO_HT40PLUS) {
+ cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+ cap &= ~IEEE80211_HT_CAP_SGI_40;
+ }
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ if (flags & IEEE80211_CHAN_NO_HT40MINUS) {
+ cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+ cap &= ~IEEE80211_HT_CAP_SGI_40;
+ }
+ break;
+ }
+
+ /*
+ * If 40 MHz was disabled associate as though we weren't
+ * capable of 40 MHz -- some broken APs will never fall
+ * back to trying to transmit in 20 MHz.
+ */
+ if (sdata->u.mgd.flags & IEEE80211_STA_DISABLE_40MHZ) {
+ cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+ cap &= ~IEEE80211_HT_CAP_SGI_40;
+ }
+
+ /* set SM PS mode properly */
+ cap &= ~IEEE80211_HT_CAP_SM_PS;
+ switch (smps) {
+ case IEEE80211_SMPS_AUTOMATIC:
+ case IEEE80211_SMPS_NUM_MODES:
+ WARN_ON(1);
+ case IEEE80211_SMPS_OFF:
+ cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
+ break;
+ case IEEE80211_SMPS_STATIC:
+ cap |= WLAN_HT_CAP_SM_PS_STATIC <<
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
+ break;
+ case IEEE80211_SMPS_DYNAMIC:
+ cap |= WLAN_HT_CAP_SM_PS_DYNAMIC <<
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
+ break;
+ }
+
+ /* reserve and fill IE */
+ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2);
+ ieee80211_ie_build_ht_cap(pos, &ht_cap, cap);
+}
+
+static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_vht_cap *ap_vht_cap)
+{
+ u8 *pos;
+ u32 cap;
+ struct ieee80211_sta_vht_cap vht_cap;
+ u32 mask, ap_bf_sts, our_bf_sts;
+
+ BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap));
+
+ memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap));
+ ieee80211_apply_vhtcap_overrides(sdata, &vht_cap);
+
+ /* determine capability flags */
+ cap = vht_cap.cap;
+
+ if (sdata->u.mgd.flags & IEEE80211_STA_DISABLE_80P80MHZ) {
+ u32 bw = cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+
+ cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+ if (bw == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ ||
+ bw == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+ cap |= IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ;
+ }
+
+ if (sdata->u.mgd.flags & IEEE80211_STA_DISABLE_160MHZ) {
+ cap &= ~IEEE80211_VHT_CAP_SHORT_GI_160;
+ cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+ }
+
+ /*
+ * Some APs apparently get confused if our capabilities are better
+ * than theirs, so restrict what we advertise in the assoc request.
+ */
+ if (!(ap_vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)))
+ cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE;
+
+ mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
+
+ ap_bf_sts = le32_to_cpu(ap_vht_cap->vht_cap_info) & mask;
+ our_bf_sts = cap & mask;
+
+ if (ap_bf_sts < our_bf_sts) {
+ cap &= ~mask;
+ cap |= ap_bf_sts;
+ }
+
+ /* reserve and fill IE */
+ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2);
+ ieee80211_ie_build_vht_cap(pos, &vht_cap, cap);
+}
+
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos, qos_info;
+ size_t offset = 0, noffset;
+ int i, count, rates_len, supp_rates_len, shift;
+ u16 capab;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *chan;
+ u32 rate_flags, rates = 0;
+
+ sdata_assert_lock(sdata);
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+ chan = chanctx_conf->def.chan;
+ rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
+ rcu_read_unlock();
+ sband = local->hw.wiphy->bands[chan->band];
+ shift = ieee80211_vif_get_shift(&sdata->vif);
+
+ if (assoc_data->supp_rates_len) {
+ /*
+ * Get all rates supported by the device and the AP as
+ * some APs don't like getting a superset of their rates
+ * in the association request (e.g. D-Link DAP 1353 in
+ * b-only mode)...
+ */
+ rates_len = ieee80211_parse_bitrates(&chanctx_conf->def, sband,
+ assoc_data->supp_rates,
+ assoc_data->supp_rates_len,
+ &rates);
+ } else {
+ /*
+ * In case AP not provide any supported rates information
+ * before association, we send information element(s) with
+ * all rates that we support.
+ */
+ rates_len = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((rate_flags & sband->bitrates[i].flags)
+ != rate_flags)
+ continue;
+ rates |= BIT(i);
+ rates_len++;
+ }
+ }
+
+ skb = alloc_skb(local->hw.extra_tx_headroom +
+ sizeof(*mgmt) + /* bit too much but doesn't matter */
+ 2 + assoc_data->ssid_len + /* SSID */
+ 4 + rates_len + /* (extended) rates */
+ 4 + /* power capability */
+ 2 + 2 * sband->n_channels + /* supported channels */
+ 2 + sizeof(struct ieee80211_ht_cap) + /* HT */
+ 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
+ assoc_data->ie_len + /* extra IEs */
+ 9, /* WMM */
+ GFP_KERNEL);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ capab = WLAN_CAPABILITY_ESS;
+
+ if (sband->band == IEEE80211_BAND_2GHZ) {
+ if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+ capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
+ if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
+ capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+ }
+
+ if (assoc_data->capability & WLAN_CAPABILITY_PRIVACY)
+ capab |= WLAN_CAPABILITY_PRIVACY;
+
+ if ((assoc_data->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) &&
+ (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT))
+ capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
+
+ if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM)
+ capab |= WLAN_CAPABILITY_RADIO_MEASURE;
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ memcpy(mgmt->da, assoc_data->bss->bssid, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, assoc_data->bss->bssid, ETH_ALEN);
+
+ if (!is_zero_ether_addr(assoc_data->prev_bssid)) {
+ skb_put(skb, 10);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_REASSOC_REQ);
+ mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
+ mgmt->u.reassoc_req.listen_interval =
+ cpu_to_le16(local->hw.conf.listen_interval);
+ memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_bssid,
+ ETH_ALEN);
+ } else {
+ skb_put(skb, 4);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ASSOC_REQ);
+ mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
+ mgmt->u.assoc_req.listen_interval =
+ cpu_to_le16(local->hw.conf.listen_interval);
+ }
+
+ /* SSID */
+ pos = skb_put(skb, 2 + assoc_data->ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = assoc_data->ssid_len;
+ memcpy(pos, assoc_data->ssid, assoc_data->ssid_len);
+
+ /* add all rates which were marked to be used above */
+ supp_rates_len = rates_len;
+ if (supp_rates_len > 8)
+ supp_rates_len = 8;
+
+ pos = skb_put(skb, supp_rates_len + 2);
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = supp_rates_len;
+
+ count = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (BIT(i) & rates) {
+ int rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ 5 * (1 << shift));
+ *pos++ = (u8) rate;
+ if (++count == 8)
+ break;
+ }
+ }
+
+ if (rates_len > count) {
+ pos = skb_put(skb, rates_len - count + 2);
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = rates_len - count;
+
+ for (i++; i < sband->n_bitrates; i++) {
+ if (BIT(i) & rates) {
+ int rate;
+ rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ 5 * (1 << shift));
+ *pos++ = (u8) rate;
+ }
+ }
+ }
+
+ if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT ||
+ capab & WLAN_CAPABILITY_RADIO_MEASURE) {
+ pos = skb_put(skb, 4);
+ *pos++ = WLAN_EID_PWR_CAPABILITY;
+ *pos++ = 2;
+ *pos++ = 0; /* min tx power */
+ /* max tx power */
+ *pos++ = ieee80211_chandef_max_power(&chanctx_conf->def);
+ }
+
+ if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) {
+ /* TODO: get this in reg domain format */
+ pos = skb_put(skb, 2 * sband->n_channels + 2);
+ *pos++ = WLAN_EID_SUPPORTED_CHANNELS;
+ *pos++ = 2 * sband->n_channels;
+ for (i = 0; i < sband->n_channels; i++) {
+ *pos++ = ieee80211_frequency_to_channel(
+ sband->channels[i].center_freq);
+ *pos++ = 1; /* one channel in the subband*/
+ }
+ }
+
+ /* if present, add any custom IEs that go before HT */
+ if (assoc_data->ie_len) {
+ static const u8 before_ht[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_PWR_CAPABILITY,
+ WLAN_EID_SUPPORTED_CHANNELS,
+ WLAN_EID_RSN,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_RRM_ENABLED_CAPABILITIES,
+ WLAN_EID_MOBILITY_DOMAIN,
+ WLAN_EID_FAST_BSS_TRANSITION, /* reassoc only */
+ WLAN_EID_RIC_DATA, /* reassoc only */
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ };
+ static const u8 after_ric[] = {
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ WLAN_EID_HT_CAPABILITY,
+ WLAN_EID_BSS_COEX_2040,
+ WLAN_EID_EXT_CAPABILITY,
+ WLAN_EID_QOS_TRAFFIC_CAPA,
+ WLAN_EID_TIM_BCAST_REQ,
+ WLAN_EID_INTERWORKING,
+ /* 60GHz doesn't happen right now */
+ WLAN_EID_VHT_CAPABILITY,
+ WLAN_EID_OPMODE_NOTIF,
+ };
+
+ noffset = ieee80211_ie_split_ric(assoc_data->ie,
+ assoc_data->ie_len,
+ before_ht,
+ ARRAY_SIZE(before_ht),
+ after_ric,
+ ARRAY_SIZE(after_ric),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ if (WARN_ON_ONCE((ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+ ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param,
+ sband, chan, sdata->smps_mode);
+
+ /* if present, add any custom IEs that go before VHT */
+ if (assoc_data->ie_len) {
+ static const u8 before_vht[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_PWR_CAPABILITY,
+ WLAN_EID_SUPPORTED_CHANNELS,
+ WLAN_EID_RSN,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_RRM_ENABLED_CAPABILITIES,
+ WLAN_EID_MOBILITY_DOMAIN,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ WLAN_EID_HT_CAPABILITY,
+ WLAN_EID_BSS_COEX_2040,
+ WLAN_EID_EXT_CAPABILITY,
+ WLAN_EID_QOS_TRAFFIC_CAPA,
+ WLAN_EID_TIM_BCAST_REQ,
+ WLAN_EID_INTERWORKING,
+ };
+
+ /* RIC already taken above, so no need to handle here anymore */
+ noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len,
+ before_vht, ARRAY_SIZE(before_vht),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ ieee80211_add_vht_ie(sdata, skb, sband,
+ &assoc_data->ap_vht_cap);
+
+ /* if present, add any custom non-vendor IEs that go after HT */
+ if (assoc_data->ie_len) {
+ noffset = ieee80211_ie_split_vendor(assoc_data->ie,
+ assoc_data->ie_len,
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ if (assoc_data->wmm) {
+ if (assoc_data->uapsd) {
+ qos_info = ifmgd->uapsd_queues;
+ qos_info |= (ifmgd->uapsd_max_sp_len <<
+ IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT);
+ } else {
+ qos_info = 0;
+ }
+
+ pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info);
+ }
+
+ /* add any remaining custom (i.e. vendor specific here) IEs */
+ if (assoc_data->ie_len) {
+ noffset = assoc_data->ie_len;
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ }
+
+ drv_mgd_prepare_tx(local, sdata);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_MLME_CONN_TX;
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_send_pspoll(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_pspoll *pspoll;
+ struct sk_buff *skb;
+
+ skb = ieee80211_pspoll_get(&local->hw, &sdata->vif);
+ if (!skb)
+ return;
+
+ pspoll = (struct ieee80211_pspoll *) skb->data;
+ pspoll->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int powersave)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr_3addr *nullfunc;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif);
+ if (!skb)
+ return;
+
+ nullfunc = (struct ieee80211_hdr_3addr *) skb->data;
+ if (powersave)
+ nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+ IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
+
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE;
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *nullfunc;
+ __le16 fc;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (struct ieee80211_hdr *) skb_put(skb, 30);
+ memset(nullfunc, 0, 30);
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+ nullfunc->frame_control = fc;
+ memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(nullfunc->addr4, sdata->vif.addr, ETH_ALEN);
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ ieee80211_tx_skb(sdata, skb);
+}
+
+/* spectrum management related things */
+static void ieee80211_chswitch_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ int ret;
+
+ if (!ieee80211_sdata_running(sdata))
+ return;
+
+ sdata_lock(sdata);
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->chanctx_mtx);
+
+ if (!ifmgd->associated)
+ goto out;
+
+ if (!sdata->vif.csa_active)
+ goto out;
+
+ /*
+ * using reservation isn't immediate as it may be deferred until later
+ * with multi-vif. once reservation is complete it will re-schedule the
+ * work with no reserved_chanctx so verify chandef to check if it
+ * completed successfully
+ */
+
+ if (sdata->reserved_chanctx) {
+ /*
+ * with multi-vif csa driver may call ieee80211_csa_finish()
+ * many times while waiting for other interfaces to use their
+ * reservations
+ */
+ if (sdata->reserved_ready)
+ goto out;
+
+ ret = ieee80211_vif_use_reserved_context(sdata);
+ if (ret) {
+ sdata_info(sdata,
+ "failed to use reserved channel context, disconnecting (err=%d)\n",
+ ret);
+ ieee80211_queue_work(&sdata->local->hw,
+ &ifmgd->csa_connection_drop_work);
+ goto out;
+ }
+
+ goto out;
+ }
+
+ if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
+ &sdata->csa_chandef)) {
+ sdata_info(sdata,
+ "failed to finalize channel switch, disconnecting\n");
+ ieee80211_queue_work(&sdata->local->hw,
+ &ifmgd->csa_connection_drop_work);
+ goto out;
+ }
+
+ /* XXX: shouldn't really modify cfg80211-owned data! */
+ ifmgd->associated->channel = sdata->csa_chandef.chan;
+
+ ifmgd->csa_waiting_bcn = true;
+
+ ieee80211_sta_reset_beacon_monitor(sdata);
+ ieee80211_sta_reset_conn_monitor(sdata);
+
+out:
+ mutex_unlock(&local->chanctx_mtx);
+ mutex_unlock(&local->mtx);
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ int ret;
+
+ sdata_assert_lock(sdata);
+
+ WARN_ON(!sdata->vif.csa_active);
+
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+
+ sdata->vif.csa_active = false;
+ ifmgd->csa_waiting_bcn = false;
+
+ ret = drv_post_channel_switch(sdata);
+ if (ret) {
+ sdata_info(sdata,
+ "driver post channel switch failed, disconnecting\n");
+ ieee80211_queue_work(&local->hw,
+ &ifmgd->csa_connection_drop_work);
+ return;
+ }
+
+ cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef);
+}
+
+void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ trace_api_chswitch_done(sdata, success);
+ if (!success) {
+ sdata_info(sdata,
+ "driver channel switch failed, disconnecting\n");
+ ieee80211_queue_work(&sdata->local->hw,
+ &ifmgd->csa_connection_drop_work);
+ } else {
+ ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work);
+ }
+}
+EXPORT_SYMBOL(ieee80211_chswitch_done);
+
+static void ieee80211_chswitch_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+
+ ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work);
+}
+
+static void
+ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
+ u64 timestamp, u32 device_timestamp,
+ struct ieee802_11_elems *elems,
+ bool beacon)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct cfg80211_bss *cbss = ifmgd->associated;
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *chanctx;
+ enum ieee80211_band current_band;
+ struct ieee80211_csa_ie csa_ie;
+ struct ieee80211_channel_switch ch_switch;
+ int res;
+
+ sdata_assert_lock(sdata);
+
+ if (!cbss)
+ return;
+
+ if (local->scanning)
+ return;
+
+ /* disregard subsequent announcements if we are already processing */
+ if (sdata->vif.csa_active)
+ return;
+
+ current_band = cbss->channel->band;
+ memset(&csa_ie, 0, sizeof(csa_ie));
+ res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band,
+ ifmgd->flags,
+ ifmgd->associated->bssid, &csa_ie);
+ if (res < 0)
+ ieee80211_queue_work(&local->hw,
+ &ifmgd->csa_connection_drop_work);
+ if (res)
+ return;
+
+ if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef,
+ IEEE80211_CHAN_DISABLED)) {
+ sdata_info(sdata,
+ "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
+ ifmgd->associated->bssid,
+ csa_ie.chandef.chan->center_freq,
+ csa_ie.chandef.width, csa_ie.chandef.center_freq1,
+ csa_ie.chandef.center_freq2);
+ ieee80211_queue_work(&local->hw,
+ &ifmgd->csa_connection_drop_work);
+ return;
+ }
+
+ if (cfg80211_chandef_identical(&csa_ie.chandef,
+ &sdata->vif.bss_conf.chandef)) {
+ if (ifmgd->csa_ignored_same_chan)
+ return;
+ sdata_info(sdata,
+ "AP %pM tries to chanswitch to same channel, ignore\n",
+ ifmgd->associated->bssid);
+ ifmgd->csa_ignored_same_chan = true;
+ return;
+ }
+
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->chanctx_mtx);
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (!conf) {
+ sdata_info(sdata,
+ "no channel context assigned to vif?, disconnecting\n");
+ goto drop_connection;
+ }
+
+ chanctx = container_of(conf, struct ieee80211_chanctx, conf);
+
+ if (local->use_chanctx &&
+ !(local->hw.flags & IEEE80211_HW_CHANCTX_STA_CSA)) {
+ sdata_info(sdata,
+ "driver doesn't support chan-switch with channel contexts\n");
+ goto drop_connection;
+ }
+
+ ch_switch.timestamp = timestamp;
+ ch_switch.device_timestamp = device_timestamp;
+ ch_switch.block_tx = csa_ie.mode;
+ ch_switch.chandef = csa_ie.chandef;
+ ch_switch.count = csa_ie.count;
+
+ if (drv_pre_channel_switch(sdata, &ch_switch)) {
+ sdata_info(sdata,
+ "preparing for channel switch failed, disconnecting\n");
+ goto drop_connection;
+ }
+
+ res = ieee80211_vif_reserve_chanctx(sdata, &csa_ie.chandef,
+ chanctx->mode, false);
+ if (res) {
+ sdata_info(sdata,
+ "failed to reserve channel context for channel switch, disconnecting (err=%d)\n",
+ res);
+ goto drop_connection;
+ }
+ mutex_unlock(&local->chanctx_mtx);
+
+ sdata->vif.csa_active = true;
+ sdata->csa_chandef = csa_ie.chandef;
+ sdata->csa_block_tx = csa_ie.mode;
+ ifmgd->csa_ignored_same_chan = false;
+
+ if (sdata->csa_block_tx)
+ ieee80211_stop_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ mutex_unlock(&local->mtx);
+
+ cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chandef,
+ csa_ie.count);
+
+ if (local->ops->channel_switch) {
+ /* use driver's channel switch callback */
+ drv_channel_switch(local, sdata, &ch_switch);
+ return;
+ }
+
+ /* channel switch handled in software */
+ if (csa_ie.count <= 1)
+ ieee80211_queue_work(&local->hw, &ifmgd->chswitch_work);
+ else
+ mod_timer(&ifmgd->chswitch_timer,
+ TU_TO_EXP_TIME((csa_ie.count - 1) *
+ cbss->beacon_interval));
+ return;
+ drop_connection:
+ ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work);
+ mutex_unlock(&local->chanctx_mtx);
+ mutex_unlock(&local->mtx);
+}
+
+static bool
+ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *channel,
+ const u8 *country_ie, u8 country_ie_len,
+ const u8 *pwr_constr_elem,
+ int *chan_pwr, int *pwr_reduction)
+{
+ struct ieee80211_country_ie_triplet *triplet;
+ int chan = ieee80211_frequency_to_channel(channel->center_freq);
+ int i, chan_increment;
+ bool have_chan_pwr = false;
+
+ /* Invalid IE */
+ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
+ return false;
+
+ triplet = (void *)(country_ie + 3);
+ country_ie_len -= 3;
+
+ switch (channel->band) {
+ default:
+ WARN_ON_ONCE(1);
+ /* fall through */
+ case IEEE80211_BAND_2GHZ:
+ case IEEE80211_BAND_60GHZ:
+ chan_increment = 1;
+ break;
+ case IEEE80211_BAND_5GHZ:
+ chan_increment = 4;
+ break;
+ }
+
+ /* find channel */
+ while (country_ie_len >= 3) {
+ u8 first_channel = triplet->chans.first_channel;
+
+ if (first_channel >= IEEE80211_COUNTRY_EXTENSION_ID)
+ goto next;
+
+ for (i = 0; i < triplet->chans.num_channels; i++) {
+ if (first_channel + i * chan_increment == chan) {
+ have_chan_pwr = true;
+ *chan_pwr = triplet->chans.max_power;
+ break;
+ }
+ }
+ if (have_chan_pwr)
+ break;
+
+ next:
+ triplet++;
+ country_ie_len -= 3;
+ }
+
+ if (have_chan_pwr && pwr_constr_elem)
+ *pwr_reduction = *pwr_constr_elem;
+ else
+ *pwr_reduction = 0;
+
+ return have_chan_pwr;
+}
+
+static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *channel,
+ const u8 *cisco_dtpc_ie,
+ int *pwr_level)
+{
+ /* From practical testing, the first data byte of the DTPC element
+ * seems to contain the requested dBm level, and the CLI on Cisco
+ * APs clearly state the range is -127 to 127 dBm, which indicates
+ * a signed byte, although it seemingly never actually goes negative.
+ * The other byte seems to always be zero.
+ */
+ *pwr_level = (__s8)cisco_dtpc_ie[4];
+}
+
+static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *channel,
+ struct ieee80211_mgmt *mgmt,
+ const u8 *country_ie, u8 country_ie_len,
+ const u8 *pwr_constr_ie,
+ const u8 *cisco_dtpc_ie)
+{
+ bool has_80211h_pwr = false, has_cisco_pwr = false;
+ int chan_pwr = 0, pwr_reduction_80211h = 0;
+ int pwr_level_cisco, pwr_level_80211h;
+ int new_ap_level;
+ __le16 capab = mgmt->u.probe_resp.capab_info;
+
+ if (country_ie &&
+ (capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) ||
+ capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) {
+ has_80211h_pwr = ieee80211_find_80211h_pwr_constr(
+ sdata, channel, country_ie, country_ie_len,
+ pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h);
+ pwr_level_80211h =
+ max_t(int, 0, chan_pwr - pwr_reduction_80211h);
+ }
+
+ if (cisco_dtpc_ie) {
+ ieee80211_find_cisco_dtpc(
+ sdata, channel, cisco_dtpc_ie, &pwr_level_cisco);
+ has_cisco_pwr = true;
+ }
+
+ if (!has_80211h_pwr && !has_cisco_pwr)
+ return 0;
+
+ /* If we have both 802.11h and Cisco DTPC, apply both limits
+ * by picking the smallest of the two power levels advertised.
+ */
+ if (has_80211h_pwr &&
+ (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) {
+ sdata_dbg(sdata,
+ "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n",
+ pwr_level_80211h, chan_pwr, pwr_reduction_80211h,
+ sdata->u.mgd.bssid);
+ new_ap_level = pwr_level_80211h;
+ } else { /* has_cisco_pwr is always true here. */
+ sdata_dbg(sdata,
+ "Limiting TX power to %d dBm as advertised by %pM\n",
+ pwr_level_cisco, sdata->u.mgd.bssid);
+ new_ap_level = pwr_level_cisco;
+ }
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
+ sdata->ap_power_level = new_ap_level;
+ if (__ieee80211_recalc_txpower(sdata))
+ return BSS_CHANGED_TXPOWER;
+ return 0;
+}
+
+/* powersave */
+static void ieee80211_enable_ps(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_conf *conf = &local->hw.conf;
+
+ /*
+ * If we are scanning right now then the parameters will
+ * take effect when scan finishes.
+ */
+ if (local->scanning)
+ return;
+
+ if (conf->dynamic_ps_timeout > 0 &&
+ !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) {
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(conf->dynamic_ps_timeout));
+ } else {
+ if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)
+ ieee80211_send_nullfunc(local, sdata, 1);
+
+ if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) &&
+ (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS))
+ return;
+
+ conf->flags |= IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ }
+}
+
+static void ieee80211_change_ps(struct ieee80211_local *local)
+{
+ struct ieee80211_conf *conf = &local->hw.conf;
+
+ if (local->ps_sdata) {
+ ieee80211_enable_ps(local, local->ps_sdata);
+ } else if (conf->flags & IEEE80211_CONF_PS) {
+ conf->flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ del_timer_sync(&local->dynamic_ps_timer);
+ cancel_work_sync(&local->dynamic_ps_enable_work);
+ }
+}
+
+static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+ struct sta_info *sta = NULL;
+ bool authorized = false;
+
+ if (!mgd->powersave)
+ return false;
+
+ if (mgd->broken_ap)
+ return false;
+
+ if (!mgd->associated)
+ return false;
+
+ if (mgd->flags & IEEE80211_STA_CONNECTION_POLL)
+ return false;
+
+ if (!mgd->have_beacon)
+ return false;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgd->bssid);
+ if (sta)
+ authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
+ rcu_read_unlock();
+
+ return authorized;
+}
+
+/* need to hold RTNL or interface lock */
+void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
+{
+ struct ieee80211_sub_if_data *sdata, *found = NULL;
+ int count = 0;
+ int timeout;
+
+ if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) {
+ local->ps_sdata = NULL;
+ return;
+ }
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ /* If an AP vif is found, then disable PS
+ * by setting the count to zero thereby setting
+ * ps_sdata to NULL.
+ */
+ count = 0;
+ break;
+ }
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ continue;
+ found = sdata;
+ count++;
+ }
+
+ if (count == 1 && ieee80211_powersave_allowed(found)) {
+ s32 beaconint_us;
+
+ if (latency < 0)
+ latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);
+
+ beaconint_us = ieee80211_tu_to_usec(
+ found->vif.bss_conf.beacon_int);
+
+ timeout = local->dynamic_ps_forced_timeout;
+ if (timeout < 0) {
+ /*
+ * Go to full PSM if the user configures a very low
+ * latency requirement.
+ * The 2000 second value is there for compatibility
+ * until the PM_QOS_NETWORK_LATENCY is configured
+ * with real values.
+ */
+ if (latency > (1900 * USEC_PER_MSEC) &&
+ latency != (2000 * USEC_PER_SEC))
+ timeout = 0;
+ else
+ timeout = 100;
+ }
+ local->hw.conf.dynamic_ps_timeout = timeout;
+
+ if (beaconint_us > latency) {
+ local->ps_sdata = NULL;
+ } else {
+ int maxslp = 1;
+ u8 dtimper = found->u.mgd.dtim_period;
+
+ /* If the TIM IE is invalid, pretend the value is 1 */
+ if (!dtimper)
+ dtimper = 1;
+ else if (dtimper > 1)
+ maxslp = min_t(int, dtimper,
+ latency / beaconint_us);
+
+ local->hw.conf.max_sleep_period = maxslp;
+ local->hw.conf.ps_dtim_period = dtimper;
+ local->ps_sdata = found;
+ }
+ } else {
+ local->ps_sdata = NULL;
+ }
+
+ ieee80211_change_ps(local);
+}
+
+void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata)
+{
+ bool ps_allowed = ieee80211_powersave_allowed(sdata);
+
+ if (sdata->vif.bss_conf.ps != ps_allowed) {
+ sdata->vif.bss_conf.ps = ps_allowed;
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_PS);
+ }
+}
+
+void ieee80211_dynamic_ps_disable_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local,
+ dynamic_ps_disable_work);
+
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ }
+
+ ieee80211_wake_queues_by_reason(&local->hw,
+ IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_PS,
+ false);
+}
+
+void ieee80211_dynamic_ps_enable_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local,
+ dynamic_ps_enable_work);
+ struct ieee80211_sub_if_data *sdata = local->ps_sdata;
+ struct ieee80211_if_managed *ifmgd;
+ unsigned long flags;
+ int q;
+
+ /* can only happen when PS was just disabled anyway */
+ if (!sdata)
+ return;
+
+ ifmgd = &sdata->u.mgd;
+
+ if (local->hw.conf.flags & IEEE80211_CONF_PS)
+ return;
+
+ if (local->hw.conf.dynamic_ps_timeout > 0) {
+ /* don't enter PS if TX frames are pending */
+ if (drv_tx_frames_pending(local)) {
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(
+ local->hw.conf.dynamic_ps_timeout));
+ return;
+ }
+
+ /*
+ * transmission can be stopped by others which leads to
+ * dynamic_ps_timer expiry. Postpone the ps timer if it
+ * is not the actual idle state.
+ */
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ for (q = 0; q < local->hw.queues; q++) {
+ if (local->queue_stop_reasons[q]) {
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+ flags);
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(
+ local->hw.conf.dynamic_ps_timeout));
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ }
+
+ if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) &&
+ !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
+ if (drv_tx_frames_pending(local)) {
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(
+ local->hw.conf.dynamic_ps_timeout));
+ } else {
+ ieee80211_send_nullfunc(local, sdata, 1);
+ /* Flush to get the tx status of nullfunc frame */
+ ieee80211_flush_queues(local, sdata, false);
+ }
+ }
+
+ if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
+ (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) ||
+ (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
+ ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+ local->hw.conf.flags |= IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ }
+}
+
+void ieee80211_dynamic_ps_timer(unsigned long data)
+{
+ struct ieee80211_local *local = (void *) data;
+
+ ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work);
+}
+
+void ieee80211_dfs_cac_timer_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work =
+ container_of(work, struct delayed_work, work);
+ struct ieee80211_sub_if_data *sdata =
+ container_of(delayed_work, struct ieee80211_sub_if_data,
+ dfs_cac_timer_work);
+ struct cfg80211_chan_def chandef = sdata->vif.bss_conf.chandef;
+
+ mutex_lock(&sdata->local->mtx);
+ if (sdata->wdev.cac_started) {
+ ieee80211_vif_release_channel(sdata);
+ cfg80211_cac_event(sdata->dev, &chandef,
+ NL80211_RADAR_CAC_FINISHED,
+ GFP_KERNEL);
+ }
+ mutex_unlock(&sdata->local->mtx);
+}
+
+static bool
+__ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ bool ret = false;
+ int ac;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return false;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac];
+ int non_acm_ac;
+ unsigned long now = jiffies;
+
+ if (tx_tspec->action == TX_TSPEC_ACTION_NONE &&
+ tx_tspec->admitted_time &&
+ time_after(now, tx_tspec->time_slice_start + HZ)) {
+ tx_tspec->consumed_tx_time = 0;
+ tx_tspec->time_slice_start = now;
+
+ if (tx_tspec->downgraded)
+ tx_tspec->action =
+ TX_TSPEC_ACTION_STOP_DOWNGRADE;
+ }
+
+ switch (tx_tspec->action) {
+ case TX_TSPEC_ACTION_STOP_DOWNGRADE:
+ /* take the original parameters */
+ if (drv_conf_tx(local, sdata, ac, &sdata->tx_conf[ac]))
+ sdata_err(sdata,
+ "failed to set TX queue parameters for queue %d\n",
+ ac);
+ tx_tspec->action = TX_TSPEC_ACTION_NONE;
+ tx_tspec->downgraded = false;
+ ret = true;
+ break;
+ case TX_TSPEC_ACTION_DOWNGRADE:
+ if (time_after(now, tx_tspec->time_slice_start + HZ)) {
+ tx_tspec->action = TX_TSPEC_ACTION_NONE;
+ ret = true;
+ break;
+ }
+ /* downgrade next lower non-ACM AC */
+ for (non_acm_ac = ac + 1;
+ non_acm_ac < IEEE80211_NUM_ACS;
+ non_acm_ac++)
+ if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac)))
+ break;
+ /* The loop will result in using BK even if it requires
+ * admission control, such configuration makes no sense
+ * and we have to transmit somehow - the AC selection
+ * does the same thing.
+ */
+ if (drv_conf_tx(local, sdata, ac,
+ &sdata->tx_conf[non_acm_ac]))
+ sdata_err(sdata,
+ "failed to set TX queue parameters for queue %d\n",
+ ac);
+ tx_tspec->action = TX_TSPEC_ACTION_NONE;
+ ret = true;
+ schedule_delayed_work(&ifmgd->tx_tspec_wk,
+ tx_tspec->time_slice_start + HZ - now + 1);
+ break;
+ case TX_TSPEC_ACTION_NONE:
+ /* nothing now */
+ break;
+ }
+ }
+
+ return ret;
+}
+
+void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata)
+{
+ if (__ieee80211_sta_handle_tspec_ac_params(sdata))
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+}
+
+static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.tx_tspec_wk.work);
+ ieee80211_sta_handle_tspec_ac_params(sdata);
+}
+
+/* MLME */
+static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *wmm_param, size_t wmm_param_len)
+{
+ struct ieee80211_tx_queue_params params;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ size_t left;
+ int count;
+ const u8 *pos;
+ u8 uapsd_queues = 0;
+
+ if (!local->ops->conf_tx)
+ return false;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return false;
+
+ if (!wmm_param)
+ return false;
+
+ if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1)
+ return false;
+
+ if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED)
+ uapsd_queues = ifmgd->uapsd_queues;
+
+ count = wmm_param[6] & 0x0f;
+ if (count == ifmgd->wmm_last_param_set)
+ return false;
+ ifmgd->wmm_last_param_set = count;
+
+ pos = wmm_param + 8;
+ left = wmm_param_len - 8;
+
+ memset(&params, 0, sizeof(params));
+
+ sdata->wmm_acm = 0;
+ for (; left >= 4; left -= 4, pos += 4) {
+ int aci = (pos[0] >> 5) & 0x03;
+ int acm = (pos[0] >> 4) & 0x01;
+ bool uapsd = false;
+ int queue;
+
+ switch (aci) {
+ case 1: /* AC_BK */
+ queue = 3;
+ if (acm)
+ sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
+ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
+ uapsd = true;
+ break;
+ case 2: /* AC_VI */
+ queue = 1;
+ if (acm)
+ sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
+ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
+ uapsd = true;
+ break;
+ case 3: /* AC_VO */
+ queue = 0;
+ if (acm)
+ sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
+ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
+ uapsd = true;
+ break;
+ case 0: /* AC_BE */
+ default:
+ queue = 2;
+ if (acm)
+ sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
+ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
+ uapsd = true;
+ break;
+ }
+
+ params.aifs = pos[0] & 0x0f;
+ params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4);
+ params.cw_min = ecw2cw(pos[1] & 0x0f);
+ params.txop = get_unaligned_le16(pos + 2);
+ params.acm = acm;
+ params.uapsd = uapsd;
+
+ mlme_dbg(sdata,
+ "WMM queue=%d aci=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n",
+ queue, aci, acm,
+ params.aifs, params.cw_min, params.cw_max,
+ params.txop, params.uapsd,
+ ifmgd->tx_tspec[queue].downgraded);
+ sdata->tx_conf[queue] = params;
+ if (!ifmgd->tx_tspec[queue].downgraded &&
+ drv_conf_tx(local, sdata, queue, &params))
+ sdata_err(sdata,
+ "failed to set TX queue parameters for queue %d\n",
+ queue);
+ }
+
+ /* enable WMM or activate new settings */
+ sdata->vif.bss_conf.qos = true;
+ return true;
+}
+
+static void __ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata)
+{
+ lockdep_assert_held(&sdata->local->mtx);
+
+ sdata->u.mgd.flags &= ~IEEE80211_STA_CONNECTION_POLL;
+ ieee80211_run_deferred_scan(sdata->local);
+}
+
+static void ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata)
+{
+ mutex_lock(&sdata->local->mtx);
+ __ieee80211_stop_poll(sdata);
+ mutex_unlock(&sdata->local->mtx);
+}
+
+static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
+ u16 capab, bool erp_valid, u8 erp)
+{
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+ u32 changed = 0;
+ bool use_protection;
+ bool use_short_preamble;
+ bool use_short_slot;
+
+ if (erp_valid) {
+ use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0;
+ use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0;
+ } else {
+ use_protection = false;
+ use_short_preamble = !!(capab & WLAN_CAPABILITY_SHORT_PREAMBLE);
+ }
+
+ use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME);
+ if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_5GHZ)
+ use_short_slot = true;
+
+ if (use_protection != bss_conf->use_cts_prot) {
+ bss_conf->use_cts_prot = use_protection;
+ changed |= BSS_CHANGED_ERP_CTS_PROT;
+ }
+
+ if (use_short_preamble != bss_conf->use_short_preamble) {
+ bss_conf->use_short_preamble = use_short_preamble;
+ changed |= BSS_CHANGED_ERP_PREAMBLE;
+ }
+
+ if (use_short_slot != bss_conf->use_short_slot) {
+ bss_conf->use_short_slot = use_short_slot;
+ changed |= BSS_CHANGED_ERP_SLOT;
+ }
+
+ return changed;
+}
+
+static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss,
+ u32 bss_info_changed)
+{
+ struct ieee80211_bss *bss = (void *)cbss->priv;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+ bss_info_changed |= BSS_CHANGED_ASSOC;
+ bss_info_changed |= ieee80211_handle_bss_capability(sdata,
+ bss_conf->assoc_capability, bss->has_erp_value, bss->erp_value);
+
+ sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec(
+ beacon_loss_count * bss_conf->beacon_int));
+
+ sdata->u.mgd.associated = cbss;
+ memcpy(sdata->u.mgd.bssid, cbss->bssid, ETH_ALEN);
+
+ sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE;
+
+ if (sdata->vif.p2p) {
+ const struct cfg80211_bss_ies *ies;
+
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->ies);
+ if (ies) {
+ int ret;
+
+ ret = cfg80211_get_p2p_attr(
+ ies->data, ies->len,
+ IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
+ (u8 *) &bss_conf->p2p_noa_attr,
+ sizeof(bss_conf->p2p_noa_attr));
+ if (ret >= 2) {
+ sdata->u.mgd.p2p_noa_index =
+ bss_conf->p2p_noa_attr.index;
+ bss_info_changed |= BSS_CHANGED_P2P_PS;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ /* just to be sure */
+ ieee80211_stop_poll(sdata);
+
+ ieee80211_led_assoc(local, 1);
+
+ if (sdata->u.mgd.have_beacon) {
+ /*
+ * If the AP is buggy we may get here with no DTIM period
+ * known, so assume it's 1 which is the only safe assumption
+ * in that case, although if the TIM IE is broken powersave
+ * probably just won't work at all.
+ */
+ bss_conf->dtim_period = sdata->u.mgd.dtim_period ?: 1;
+ bss_conf->beacon_rate = bss->beacon_rate;
+ bss_info_changed |= BSS_CHANGED_BEACON_INFO;
+ } else {
+ bss_conf->beacon_rate = NULL;
+ bss_conf->dtim_period = 0;
+ }
+
+ bss_conf->assoc = 1;
+
+ /* Tell the driver to monitor connection quality (if supported) */
+ if (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI &&
+ bss_conf->cqm_rssi_thold)
+ bss_info_changed |= BSS_CHANGED_CQM;
+
+ /* Enable ARP filtering */
+ if (bss_conf->arp_addr_cnt)
+ bss_info_changed |= BSS_CHANGED_ARP_FILTER;
+
+ ieee80211_bss_info_change_notify(sdata, bss_info_changed);
+
+ mutex_lock(&local->iflist_mtx);
+ ieee80211_recalc_ps(local, -1);
+ mutex_unlock(&local->iflist_mtx);
+
+ ieee80211_recalc_smps(sdata);
+ ieee80211_recalc_ps_vif(sdata);
+
+ netif_carrier_on(sdata->dev);
+}
+
+static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
+ u16 stype, u16 reason, bool tx,
+ u8 *frame_buf)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = sdata->local;
+ u32 changed = 0;
+
+ sdata_assert_lock(sdata);
+
+ if (WARN_ON_ONCE(tx && !frame_buf))
+ return;
+
+ if (WARN_ON(!ifmgd->associated))
+ return;
+
+ ieee80211_stop_poll(sdata);
+
+ ifmgd->associated = NULL;
+ netif_carrier_off(sdata->dev);
+
+ /*
+ * if we want to get out of ps before disassoc (why?) we have
+ * to do it before sending disassoc, as otherwise the null-packet
+ * won't be valid.
+ */
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ }
+ local->ps_sdata = NULL;
+
+ /* disable per-vif ps */
+ ieee80211_recalc_ps_vif(sdata);
+
+ /* make sure ongoing transmission finishes */
+ synchronize_net();
+
+ /*
+ * drop any frame before deauth/disassoc, this can be data or
+ * management frame. Since we are disconnecting, we should not
+ * insist sending these frames which can take time and delay
+ * the disconnection and possible the roaming.
+ */
+ if (tx)
+ ieee80211_flush_queues(local, sdata, true);
+
+ /* deauthenticate/disassociate now */
+ if (tx || frame_buf)
+ ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
+ reason, tx, frame_buf);
+
+ /* flush out frame - make sure the deauth was actually sent */
+ if (tx)
+ ieee80211_flush_queues(local, sdata, false);
+
+ /* clear bssid only after building the needed mgmt frames */
+ eth_zero_addr(ifmgd->bssid);
+
+ /* remove AP and TDLS peers */
+ sta_info_flush(sdata);
+
+ /* finally reset all BSS / config parameters */
+ changed |= ieee80211_reset_erp_info(sdata);
+
+ ieee80211_led_assoc(local, 0);
+ changed |= BSS_CHANGED_ASSOC;
+ sdata->vif.bss_conf.assoc = false;
+
+ ifmgd->p2p_noa_index = -1;
+ memset(&sdata->vif.bss_conf.p2p_noa_attr, 0,
+ sizeof(sdata->vif.bss_conf.p2p_noa_attr));
+
+ /* on the next assoc, re-program HT/VHT parameters */
+ memset(&ifmgd->ht_capa, 0, sizeof(ifmgd->ht_capa));
+ memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask));
+ memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa));
+ memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask));
+
+ sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
+
+ del_timer_sync(&local->dynamic_ps_timer);
+ cancel_work_sync(&local->dynamic_ps_enable_work);
+
+ /* Disable ARP filtering */
+ if (sdata->vif.bss_conf.arp_addr_cnt)
+ changed |= BSS_CHANGED_ARP_FILTER;
+
+ sdata->vif.bss_conf.qos = false;
+ changed |= BSS_CHANGED_QOS;
+
+ /* The BSSID (not really interesting) and HT changed */
+ changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT;
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ /* disassociated - set to defaults now */
+ ieee80211_set_wmm_default(sdata, false);
+
+ del_timer_sync(&sdata->u.mgd.conn_mon_timer);
+ del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
+ del_timer_sync(&sdata->u.mgd.timer);
+ del_timer_sync(&sdata->u.mgd.chswitch_timer);
+
+ sdata->vif.bss_conf.dtim_period = 0;
+ sdata->vif.bss_conf.beacon_rate = NULL;
+
+ ifmgd->have_beacon = false;
+
+ ifmgd->flags = 0;
+ mutex_lock(&local->mtx);
+ ieee80211_vif_release_channel(sdata);
+
+ sdata->vif.csa_active = false;
+ ifmgd->csa_waiting_bcn = false;
+ ifmgd->csa_ignored_same_chan = false;
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+ mutex_unlock(&local->mtx);
+
+ /* existing TX TSPEC sessions no longer exist */
+ memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec));
+ cancel_delayed_work_sync(&ifmgd->tx_tspec_wk);
+
+ sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
+}
+
+void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_hdr *hdr)
+{
+ /*
+ * We can postpone the mgd.timer whenever receiving unicast frames
+ * from AP because we know that the connection is working both ways
+ * at that time. But multicast frames (and hence also beacons) must
+ * be ignored here, because we need to trigger the timer during
+ * data idle periods for sending the periodic probe request to the
+ * AP we're connected to.
+ */
+ if (is_multicast_ether_addr(hdr->addr1))
+ return;
+
+ ieee80211_sta_reset_conn_monitor(sdata);
+}
+
+static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = sdata->local;
+
+ mutex_lock(&local->mtx);
+ if (!(ifmgd->flags & IEEE80211_STA_CONNECTION_POLL))
+ goto out;
+
+ __ieee80211_stop_poll(sdata);
+
+ mutex_lock(&local->iflist_mtx);
+ ieee80211_recalc_ps(local, -1);
+ mutex_unlock(&local->iflist_mtx);
+
+ if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ goto out;
+
+ /*
+ * We've received a probe response, but are not sure whether
+ * we have or will be receiving any beacons or data, so let's
+ * schedule the timers again, just in case.
+ */
+ ieee80211_sta_reset_beacon_monitor(sdata);
+
+ mod_timer(&ifmgd->conn_mon_timer,
+ round_jiffies_up(jiffies +
+ IEEE80211_CONNECTION_IDLE_TIME));
+out:
+ mutex_unlock(&local->mtx);
+}
+
+static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_hdr *hdr,
+ u16 tx_time)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u16 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+ int ac = ieee80211_ac_from_tid(tid);
+ struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac];
+ unsigned long now = jiffies;
+
+ if (likely(!tx_tspec->admitted_time))
+ return;
+
+ if (time_after(now, tx_tspec->time_slice_start + HZ)) {
+ tx_tspec->consumed_tx_time = 0;
+ tx_tspec->time_slice_start = now;
+
+ if (tx_tspec->downgraded) {
+ tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE;
+ schedule_delayed_work(&ifmgd->tx_tspec_wk, 0);
+ }
+ }
+
+ if (tx_tspec->downgraded)
+ return;
+
+ tx_tspec->consumed_tx_time += tx_time;
+
+ if (tx_tspec->consumed_tx_time >= tx_tspec->admitted_time) {
+ tx_tspec->downgraded = true;
+ tx_tspec->action = TX_TSPEC_ACTION_DOWNGRADE;
+ schedule_delayed_work(&ifmgd->tx_tspec_wk, 0);
+ }
+}
+
+void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_hdr *hdr, bool ack, u16 tx_time)
+{
+ ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time);
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return;
+
+ if (ieee80211_is_nullfunc(hdr->frame_control) &&
+ sdata->u.mgd.probe_send_count > 0) {
+ if (ack)
+ ieee80211_sta_reset_conn_monitor(sdata);
+ else
+ sdata->u.mgd.nullfunc_failed = true;
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ return;
+ }
+
+ if (ack)
+ ieee80211_sta_reset_conn_monitor(sdata);
+}
+
+static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ const u8 *ssid;
+ u8 *dst = ifmgd->associated->bssid;
+ u8 unicast_limit = max(1, max_probe_tries - 3);
+
+ /*
+ * Try sending broadcast probe requests for the last three
+ * probe requests after the first ones failed since some
+ * buggy APs only support broadcast probe requests.
+ */
+ if (ifmgd->probe_send_count >= unicast_limit)
+ dst = NULL;
+
+ /*
+ * When the hardware reports an accurate Tx ACK status, it's
+ * better to send a nullfunc frame instead of a probe request,
+ * as it will kick us off the AP quickly if we aren't associated
+ * anymore. The timeout will be reset if the frame is ACKed by
+ * the AP.
+ */
+ ifmgd->probe_send_count++;
+
+ if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ ifmgd->nullfunc_failed = false;
+ ieee80211_send_nullfunc(sdata->local, sdata, 0);
+ } else {
+ int ssid_len;
+
+ rcu_read_lock();
+ ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
+ if (WARN_ON_ONCE(ssid == NULL))
+ ssid_len = 0;
+ else
+ ssid_len = ssid[1];
+
+ ieee80211_send_probe_req(sdata, sdata->vif.addr, dst,
+ ssid + 2, ssid_len, NULL,
+ 0, (u32) -1, true, 0,
+ ifmgd->associated->channel, false);
+ rcu_read_unlock();
+ }
+
+ ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms);
+ run_again(sdata, ifmgd->probe_timeout);
+}
+
+static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata,
+ bool beacon)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ bool already = false;
+
+ if (!ieee80211_sdata_running(sdata))
+ return;
+
+ sdata_lock(sdata);
+
+ if (!ifmgd->associated)
+ goto out;
+
+ mutex_lock(&sdata->local->mtx);
+
+ if (sdata->local->tmp_channel || sdata->local->scanning) {
+ mutex_unlock(&sdata->local->mtx);
+ goto out;
+ }
+
+ if (beacon) {
+ mlme_dbg_ratelimited(sdata,
+ "detected beacon loss from AP (missed %d beacons) - probing\n",
+ beacon_loss_count);
+
+ ieee80211_cqm_beacon_loss_notify(&sdata->vif, GFP_KERNEL);
+ }
+
+ /*
+ * The driver/our work has already reported this event or the
+ * connection monitoring has kicked in and we have already sent
+ * a probe request. Or maybe the AP died and the driver keeps
+ * reporting until we disassociate...
+ *
+ * In either case we have to ignore the current call to this
+ * function (except for setting the correct probe reason bit)
+ * because otherwise we would reset the timer every time and
+ * never check whether we received a probe response!
+ */
+ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)
+ already = true;
+
+ ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL;
+
+ mutex_unlock(&sdata->local->mtx);
+
+ if (already)
+ goto out;
+
+ mutex_lock(&sdata->local->iflist_mtx);
+ ieee80211_recalc_ps(sdata->local, -1);
+ mutex_unlock(&sdata->local->iflist_mtx);
+
+ ifmgd->probe_send_count = 0;
+ ieee80211_mgd_probe_ap_send(sdata);
+ out:
+ sdata_unlock(sdata);
+}
+
+struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct cfg80211_bss *cbss;
+ struct sk_buff *skb;
+ const u8 *ssid;
+ int ssid_len;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return NULL;
+
+ sdata_assert_lock(sdata);
+
+ if (ifmgd->associated)
+ cbss = ifmgd->associated;
+ else if (ifmgd->auth_data)
+ cbss = ifmgd->auth_data->bss;
+ else if (ifmgd->assoc_data)
+ cbss = ifmgd->assoc_data->bss;
+ else
+ return NULL;
+
+ rcu_read_lock();
+ ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID);
+ if (WARN_ON_ONCE(ssid == NULL))
+ ssid_len = 0;
+ else
+ ssid_len = ssid[1];
+
+ skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid,
+ (u32) -1, cbss->channel,
+ ssid + 2, ssid_len,
+ NULL, 0, true);
+ rcu_read_unlock();
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_ap_probereq_get);
+
+static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata,
+ const u8 *buf, size_t len, bool tx,
+ u16 reason)
+{
+ struct ieee80211_event event = {
+ .type = MLME_EVENT,
+ .u.mlme.data = tx ? DEAUTH_TX_EVENT : DEAUTH_RX_EVENT,
+ .u.mlme.reason = reason,
+ };
+
+ if (tx)
+ cfg80211_tx_mlme_mgmt(sdata->dev, buf, len);
+ else
+ cfg80211_rx_mlme_mgmt(sdata->dev, buf, len);
+
+ drv_event_callback(sdata->local, sdata, &event);
+}
+
+static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ sdata_lock(sdata);
+ if (!ifmgd->associated) {
+ sdata_unlock(sdata);
+ return;
+ }
+
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+ true, frame_buf);
+ mutex_lock(&local->mtx);
+ sdata->vif.csa_active = false;
+ ifmgd->csa_waiting_bcn = false;
+ if (sdata->csa_block_tx) {
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_CSA);
+ sdata->csa_block_tx = false;
+ }
+ mutex_unlock(&local->mtx);
+
+ ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY);
+
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_beacon_connection_loss_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.beacon_connection_loss_work);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct sta_info *sta;
+
+ if (ifmgd->associated) {
+ rcu_read_lock();
+ sta = sta_info_get(sdata, ifmgd->bssid);
+ if (sta)
+ sta->beacon_loss_count++;
+ rcu_read_unlock();
+ }
+
+ if (ifmgd->connection_loss) {
+ sdata_info(sdata, "Connection to AP %pM lost\n",
+ ifmgd->bssid);
+ __ieee80211_disconnect(sdata);
+ } else {
+ ieee80211_mgd_probe_ap(sdata, true);
+ }
+}
+
+static void ieee80211_csa_connection_drop_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.csa_connection_drop_work);
+
+ __ieee80211_disconnect(sdata);
+}
+
+void ieee80211_beacon_loss(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_hw *hw = &sdata->local->hw;
+
+ trace_api_beacon_loss(sdata);
+
+ sdata->u.mgd.connection_loss = false;
+ ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work);
+}
+EXPORT_SYMBOL(ieee80211_beacon_loss);
+
+void ieee80211_connection_loss(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_hw *hw = &sdata->local->hw;
+
+ trace_api_connection_loss(sdata);
+
+ sdata->u.mgd.connection_loss = true;
+ ieee80211_queue_work(hw, &sdata->u.mgd.beacon_connection_loss_work);
+}
+EXPORT_SYMBOL(ieee80211_connection_loss);
+
+
+static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
+ bool assoc)
+{
+ struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data;
+
+ sdata_assert_lock(sdata);
+
+ if (!assoc) {
+ /*
+ * we are not authenticated yet, the only timer that could be
+ * running is the timeout for the authentication response which
+ * which is not relevant anymore.
+ */
+ del_timer_sync(&sdata->u.mgd.timer);
+ sta_info_destroy_addr(sdata, auth_data->bss->bssid);
+
+ eth_zero_addr(sdata->u.mgd.bssid);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+ sdata->u.mgd.flags = 0;
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
+ }
+
+ cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss);
+ kfree(auth_data);
+ sdata->u.mgd.auth_data = NULL;
+}
+
+static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data;
+ u8 *pos;
+ struct ieee802_11_elems elems;
+ u32 tx_flags = 0;
+
+ pos = mgmt->u.auth.variable;
+ ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
+ if (!elems.challenge)
+ return;
+ auth_data->expected_transaction = 4;
+ drv_mgd_prepare_tx(sdata->local, sdata);
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_MLME_CONN_TX;
+ ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0,
+ elems.challenge - 2, elems.challenge_len + 2,
+ auth_data->bss->bssid, auth_data->bss->bssid,
+ auth_data->key, auth_data->key_len,
+ auth_data->key_idx, tx_flags);
+}
+
+static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 bssid[ETH_ALEN];
+ u16 auth_alg, auth_transaction, status_code;
+ struct sta_info *sta;
+ struct ieee80211_event event = {
+ .type = MLME_EVENT,
+ .u.mlme.data = AUTH_EVENT,
+ };
+
+ sdata_assert_lock(sdata);
+
+ if (len < 24 + 6)
+ return;
+
+ if (!ifmgd->auth_data || ifmgd->auth_data->done)
+ return;
+
+ memcpy(bssid, ifmgd->auth_data->bss->bssid, ETH_ALEN);
+
+ if (!ether_addr_equal(bssid, mgmt->bssid))
+ return;
+
+ auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+ auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+ status_code = le16_to_cpu(mgmt->u.auth.status_code);
+
+ if (auth_alg != ifmgd->auth_data->algorithm ||
+ auth_transaction != ifmgd->auth_data->expected_transaction) {
+ sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n",
+ mgmt->sa, auth_alg, ifmgd->auth_data->algorithm,
+ auth_transaction,
+ ifmgd->auth_data->expected_transaction);
+ return;
+ }
+
+ if (status_code != WLAN_STATUS_SUCCESS) {
+ sdata_info(sdata, "%pM denied authentication (status %d)\n",
+ mgmt->sa, status_code);
+ ieee80211_destroy_auth_data(sdata, false);
+ cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
+ event.u.mlme.status = MLME_DENIED;
+ event.u.mlme.reason = status_code;
+ drv_event_callback(sdata->local, sdata, &event);
+ return;
+ }
+
+ switch (ifmgd->auth_data->algorithm) {
+ case WLAN_AUTH_OPEN:
+ case WLAN_AUTH_LEAP:
+ case WLAN_AUTH_FT:
+ case WLAN_AUTH_SAE:
+ break;
+ case WLAN_AUTH_SHARED_KEY:
+ if (ifmgd->auth_data->expected_transaction != 4) {
+ ieee80211_auth_challenge(sdata, mgmt, len);
+ /* need another frame */
+ return;
+ }
+ break;
+ default:
+ WARN_ONCE(1, "invalid auth alg %d",
+ ifmgd->auth_data->algorithm);
+ return;
+ }
+
+ event.u.mlme.status = MLME_SUCCESS;
+ drv_event_callback(sdata->local, sdata, &event);
+ sdata_info(sdata, "authenticated\n");
+ ifmgd->auth_data->done = true;
+ ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC;
+ ifmgd->auth_data->timeout_started = true;
+ run_again(sdata, ifmgd->auth_data->timeout);
+
+ if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
+ ifmgd->auth_data->expected_transaction != 2) {
+ /*
+ * Report auth frame to user space for processing since another
+ * round of Authentication frames is still needed.
+ */
+ cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
+ return;
+ }
+
+ /* move station state to auth */
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get(sdata, bssid);
+ if (!sta) {
+ WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
+ goto out_err;
+ }
+ if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
+ sdata_info(sdata, "failed moving %pM to auth\n", bssid);
+ goto out_err;
+ }
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
+ return;
+ out_err:
+ mutex_unlock(&sdata->local->sta_mtx);
+ /* ignore frame -- wait for timeout */
+}
+
+#define case_WLAN(type) \
+ case WLAN_REASON_##type: return #type
+
+static const char *ieee80211_get_reason_code_string(u16 reason_code)
+{
+ switch (reason_code) {
+ case_WLAN(UNSPECIFIED);
+ case_WLAN(PREV_AUTH_NOT_VALID);
+ case_WLAN(DEAUTH_LEAVING);
+ case_WLAN(DISASSOC_DUE_TO_INACTIVITY);
+ case_WLAN(DISASSOC_AP_BUSY);
+ case_WLAN(CLASS2_FRAME_FROM_NONAUTH_STA);
+ case_WLAN(CLASS3_FRAME_FROM_NONASSOC_STA);
+ case_WLAN(DISASSOC_STA_HAS_LEFT);
+ case_WLAN(STA_REQ_ASSOC_WITHOUT_AUTH);
+ case_WLAN(DISASSOC_BAD_POWER);
+ case_WLAN(DISASSOC_BAD_SUPP_CHAN);
+ case_WLAN(INVALID_IE);
+ case_WLAN(MIC_FAILURE);
+ case_WLAN(4WAY_HANDSHAKE_TIMEOUT);
+ case_WLAN(GROUP_KEY_HANDSHAKE_TIMEOUT);
+ case_WLAN(IE_DIFFERENT);
+ case_WLAN(INVALID_GROUP_CIPHER);
+ case_WLAN(INVALID_PAIRWISE_CIPHER);
+ case_WLAN(INVALID_AKMP);
+ case_WLAN(UNSUPP_RSN_VERSION);
+ case_WLAN(INVALID_RSN_IE_CAP);
+ case_WLAN(IEEE8021X_FAILED);
+ case_WLAN(CIPHER_SUITE_REJECTED);
+ case_WLAN(DISASSOC_UNSPECIFIED_QOS);
+ case_WLAN(DISASSOC_QAP_NO_BANDWIDTH);
+ case_WLAN(DISASSOC_LOW_ACK);
+ case_WLAN(DISASSOC_QAP_EXCEED_TXOP);
+ case_WLAN(QSTA_LEAVE_QBSS);
+ case_WLAN(QSTA_NOT_USE);
+ case_WLAN(QSTA_REQUIRE_SETUP);
+ case_WLAN(QSTA_TIMEOUT);
+ case_WLAN(QSTA_CIPHER_NOT_SUPP);
+ case_WLAN(MESH_PEER_CANCELED);
+ case_WLAN(MESH_MAX_PEERS);
+ case_WLAN(MESH_CONFIG);
+ case_WLAN(MESH_CLOSE);
+ case_WLAN(MESH_MAX_RETRIES);
+ case_WLAN(MESH_CONFIRM_TIMEOUT);
+ case_WLAN(MESH_INVALID_GTK);
+ case_WLAN(MESH_INCONSISTENT_PARAM);
+ case_WLAN(MESH_INVALID_SECURITY);
+ case_WLAN(MESH_PATH_ERROR);
+ case_WLAN(MESH_PATH_NOFORWARD);
+ case_WLAN(MESH_PATH_DEST_UNREACHABLE);
+ case_WLAN(MAC_EXISTS_IN_MBSS);
+ case_WLAN(MESH_CHAN_REGULATORY);
+ case_WLAN(MESH_CHAN);
+ default: return "<unknown>";
+ }
+}
+
+static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ const u8 *bssid = NULL;
+ u16 reason_code;
+
+ sdata_assert_lock(sdata);
+
+ if (len < 24 + 2)
+ return;
+
+ if (!ifmgd->associated ||
+ !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
+ return;
+
+ bssid = ifmgd->associated->bssid;
+
+ reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
+
+ sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n",
+ bssid, reason_code, ieee80211_get_reason_code_string(reason_code));
+
+ ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
+
+ ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code);
+}
+
+
+static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u16 reason_code;
+
+ sdata_assert_lock(sdata);
+
+ if (len < 24 + 2)
+ return;
+
+ if (!ifmgd->associated ||
+ !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
+ return;
+
+ reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
+
+ sdata_info(sdata, "disassociated from %pM (Reason: %u)\n",
+ mgmt->sa, reason_code);
+
+ ieee80211_set_disassoc(sdata, 0, 0, false, NULL);
+
+ ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code);
+}
+
+static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
+ u8 *supp_rates, unsigned int supp_rates_len,
+ u32 *rates, u32 *basic_rates,
+ bool *have_higher_than_11mbit,
+ int *min_rate, int *min_rate_index,
+ int shift, u32 rate_flags)
+{
+ int i, j;
+
+ for (i = 0; i < supp_rates_len; i++) {
+ int rate = supp_rates[i] & 0x7f;
+ bool is_basic = !!(supp_rates[i] & 0x80);
+
+ if ((rate * 5 * (1 << shift)) > 110)
+ *have_higher_than_11mbit = true;
+
+ /*
+ * BSS_MEMBERSHIP_SELECTOR_HT_PHY is defined in 802.11n-2009
+ * 7.3.2.2 as a magic value instead of a rate. Hence, skip it.
+ *
+ * Note: Even through the membership selector and the basic
+ * rate flag share the same bit, they are not exactly
+ * the same.
+ */
+ if (!!(supp_rates[i] & 0x80) &&
+ (supp_rates[i] & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+ continue;
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ struct ieee80211_rate *br;
+ int brate;
+
+ br = &sband->bitrates[j];
+ if ((rate_flags & br->flags) != rate_flags)
+ continue;
+
+ brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
+ if (brate == rate) {
+ *rates |= BIT(j);
+ if (is_basic)
+ *basic_rates |= BIT(j);
+ if ((rate * 5) < *min_rate) {
+ *min_rate = rate * 5;
+ *min_rate_index = j;
+ }
+ break;
+ }
+ }
+ }
+}
+
+static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
+ bool assoc)
+{
+ struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
+
+ sdata_assert_lock(sdata);
+
+ if (!assoc) {
+ /*
+ * we are not associated yet, the only timer that could be
+ * running is the timeout for the association response which
+ * which is not relevant anymore.
+ */
+ del_timer_sync(&sdata->u.mgd.timer);
+ sta_info_destroy_addr(sdata, assoc_data->bss->bssid);
+
+ eth_zero_addr(sdata->u.mgd.bssid);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+ sdata->u.mgd.flags = 0;
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
+ }
+
+ kfree(assoc_data);
+ sdata->u.mgd.assoc_data = NULL;
+}
+
+static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ struct sta_info *sta;
+ u8 *pos;
+ u16 capab_info, aid;
+ struct ieee802_11_elems elems;
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+ const struct cfg80211_bss_ies *bss_ies = NULL;
+ struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
+ u32 changed = 0;
+ int err;
+ bool ret;
+
+ /* AssocResp and ReassocResp have identical structure */
+
+ aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+ capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+
+ if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14)))
+ sdata_info(sdata, "invalid AID value 0x%x; bits 15:14 not set\n",
+ aid);
+ aid &= ~(BIT(15) | BIT(14));
+
+ ifmgd->broken_ap = false;
+
+ if (aid == 0 || aid > IEEE80211_MAX_AID) {
+ sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n",
+ aid);
+ aid = 0;
+ ifmgd->broken_ap = true;
+ }
+
+ pos = mgmt->u.assoc_resp.variable;
+ ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
+
+ if (!elems.supp_rates) {
+ sdata_info(sdata, "no SuppRates element in AssocResp\n");
+ return false;
+ }
+
+ ifmgd->aid = aid;
+ ifmgd->tdls_chan_switch_prohibited =
+ elems.ext_capab && elems.ext_capab_len >= 5 &&
+ (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED);
+
+ /*
+ * Some APs are erroneously not including some information in their
+ * (re)association response frames. Try to recover by using the data
+ * from the beacon or probe response. This seems to afflict mobile
+ * 2G/3G/4G wifi routers, reported models include the "Onda PN51T",
+ * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device.
+ */
+ if ((assoc_data->wmm && !elems.wmm_param) ||
+ (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
+ (!elems.ht_cap_elem || !elems.ht_operation)) ||
+ (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
+ (!elems.vht_cap_elem || !elems.vht_operation))) {
+ const struct cfg80211_bss_ies *ies;
+ struct ieee802_11_elems bss_elems;
+
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->ies);
+ if (ies)
+ bss_ies = kmemdup(ies, sizeof(*ies) + ies->len,
+ GFP_ATOMIC);
+ rcu_read_unlock();
+ if (!bss_ies)
+ return false;
+
+ ieee802_11_parse_elems(bss_ies->data, bss_ies->len,
+ false, &bss_elems);
+ if (assoc_data->wmm &&
+ !elems.wmm_param && bss_elems.wmm_param) {
+ elems.wmm_param = bss_elems.wmm_param;
+ sdata_info(sdata,
+ "AP bug: WMM param missing from AssocResp\n");
+ }
+
+ /*
+ * Also check if we requested HT/VHT, otherwise the AP doesn't
+ * have to include the IEs in the (re)association response.
+ */
+ if (!elems.ht_cap_elem && bss_elems.ht_cap_elem &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
+ elems.ht_cap_elem = bss_elems.ht_cap_elem;
+ sdata_info(sdata,
+ "AP bug: HT capability missing from AssocResp\n");
+ }
+ if (!elems.ht_operation && bss_elems.ht_operation &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
+ elems.ht_operation = bss_elems.ht_operation;
+ sdata_info(sdata,
+ "AP bug: HT operation missing from AssocResp\n");
+ }
+ if (!elems.vht_cap_elem && bss_elems.vht_cap_elem &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
+ elems.vht_cap_elem = bss_elems.vht_cap_elem;
+ sdata_info(sdata,
+ "AP bug: VHT capa missing from AssocResp\n");
+ }
+ if (!elems.vht_operation && bss_elems.vht_operation &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
+ elems.vht_operation = bss_elems.vht_operation;
+ sdata_info(sdata,
+ "AP bug: VHT operation missing from AssocResp\n");
+ }
+ }
+
+ /*
+ * We previously checked these in the beacon/probe response, so
+ * they should be present here. This is just a safety net.
+ */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
+ (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) {
+ sdata_info(sdata,
+ "HT AP is missing WMM params or HT capability/operation\n");
+ ret = false;
+ goto out;
+ }
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
+ (!elems.vht_cap_elem || !elems.vht_operation)) {
+ sdata_info(sdata,
+ "VHT AP is missing VHT capability/operation\n");
+ ret = false;
+ goto out;
+ }
+
+ mutex_lock(&sdata->local->sta_mtx);
+ /*
+ * station info was already allocated and inserted before
+ * the association and should be available to us
+ */
+ sta = sta_info_get(sdata, cbss->bssid);
+ if (WARN_ON(!sta)) {
+ mutex_unlock(&sdata->local->sta_mtx);
+ ret = false;
+ goto out;
+ }
+
+ sband = local->hw.wiphy->bands[ieee80211_get_sdata_band(sdata)];
+
+ /* Set up internal HT/VHT capabilities */
+ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+ ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+ elems.ht_cap_elem, sta);
+
+ if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+ elems.vht_cap_elem, sta);
+
+ /*
+ * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data
+ * in their association response, so ignore that data for our own
+ * configuration. If it changed since the last beacon, we'll get the
+ * next beacon and update then.
+ */
+
+ /*
+ * If an operating mode notification IE is present, override the
+ * NSS calculation (that would be done in rate_control_rate_init())
+ * and use the # of streams from that element.
+ */
+ if (elems.opmode_notif &&
+ !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) {
+ u8 nss;
+
+ nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK;
+ nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT;
+ nss += 1;
+ sta->sta.rx_nss = nss;
+ }
+
+ rate_control_rate_init(sta);
+
+ if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) {
+ set_sta_flag(sta, WLAN_STA_MFP);
+ sta->sta.mfp = true;
+ } else {
+ sta->sta.mfp = false;
+ }
+
+ sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS;
+
+ err = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT))
+ err = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED);
+ if (err) {
+ sdata_info(sdata,
+ "failed to move station %pM to desired state\n",
+ sta->sta.addr);
+ WARN_ON(__sta_info_destroy(sta));
+ mutex_unlock(&sdata->local->sta_mtx);
+ ret = false;
+ goto out;
+ }
+
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ /*
+ * Always handle WMM once after association regardless
+ * of the first value the AP uses. Setting -1 here has
+ * that effect because the AP values is an unsigned
+ * 4-bit value.
+ */
+ ifmgd->wmm_last_param_set = -1;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && elems.wmm_param)
+ ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
+ elems.wmm_param_len);
+ else
+ ieee80211_set_wmm_default(sdata, false);
+ changed |= BSS_CHANGED_QOS;
+
+ /* set AID and assoc capability,
+ * ieee80211_set_associated() will tell the driver */
+ bss_conf->aid = aid;
+ bss_conf->assoc_capability = capab_info;
+ ieee80211_set_associated(sdata, cbss, changed);
+
+ /*
+ * If we're using 4-addr mode, let the AP know that we're
+ * doing so, so that it can create the STA VLAN on its side
+ */
+ if (ifmgd->use_4addr)
+ ieee80211_send_4addr_nullfunc(local, sdata);
+
+ /*
+ * Start timer to probe the connection to the AP now.
+ * Also start the timer that will detect beacon loss.
+ */
+ ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
+ ieee80211_sta_reset_beacon_monitor(sdata);
+
+ ret = true;
+ out:
+ kfree(bss_ies);
+ return ret;
+}
+
+static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
+ u16 capab_info, status_code, aid;
+ struct ieee802_11_elems elems;
+ int ac, uapsd_queues = -1;
+ u8 *pos;
+ bool reassoc;
+ struct cfg80211_bss *bss;
+ struct ieee80211_event event = {
+ .type = MLME_EVENT,
+ .u.mlme.data = ASSOC_EVENT,
+ };
+
+ sdata_assert_lock(sdata);
+
+ if (!assoc_data)
+ return;
+ if (!ether_addr_equal(assoc_data->bss->bssid, mgmt->bssid))
+ return;
+
+ /*
+ * AssocResp and ReassocResp have identical structure, so process both
+ * of them in this function.
+ */
+
+ if (len < 24 + 6)
+ return;
+
+ reassoc = ieee80211_is_reassoc_req(mgmt->frame_control);
+ capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+ status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
+ aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+
+ sdata_info(sdata,
+ "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n",
+ reassoc ? "Rea" : "A", mgmt->sa,
+ capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
+
+ pos = mgmt->u.assoc_resp.variable;
+ ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), false, &elems);
+
+ if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY &&
+ elems.timeout_int &&
+ elems.timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) {
+ u32 tu, ms;
+ tu = le32_to_cpu(elems.timeout_int->value);
+ ms = tu * 1024 / 1000;
+ sdata_info(sdata,
+ "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n",
+ mgmt->sa, tu, ms);
+ assoc_data->timeout = jiffies + msecs_to_jiffies(ms);
+ assoc_data->timeout_started = true;
+ if (ms > IEEE80211_ASSOC_TIMEOUT)
+ run_again(sdata, assoc_data->timeout);
+ return;
+ }
+
+ bss = assoc_data->bss;
+
+ if (status_code != WLAN_STATUS_SUCCESS) {
+ sdata_info(sdata, "%pM denied association (code=%d)\n",
+ mgmt->sa, status_code);
+ ieee80211_destroy_assoc_data(sdata, false);
+ event.u.mlme.status = MLME_DENIED;
+ event.u.mlme.reason = status_code;
+ drv_event_callback(sdata->local, sdata, &event);
+ } else {
+ if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) {
+ /* oops -- internal error -- send timeout for now */
+ ieee80211_destroy_assoc_data(sdata, false);
+ cfg80211_assoc_timeout(sdata->dev, bss);
+ return;
+ }
+ event.u.mlme.status = MLME_SUCCESS;
+ drv_event_callback(sdata->local, sdata, &event);
+ sdata_info(sdata, "associated\n");
+
+ /*
+ * destroy assoc_data afterwards, as otherwise an idle
+ * recalc after assoc_data is NULL but before associated
+ * is set can cause the interface to go idle
+ */
+ ieee80211_destroy_assoc_data(sdata, true);
+
+ /* get uapsd queues configuration */
+ uapsd_queues = 0;
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ if (sdata->tx_conf[ac].uapsd)
+ uapsd_queues |= BIT(ac);
+ }
+
+ cfg80211_rx_assoc_resp(sdata->dev, bss, (u8 *)mgmt, len, uapsd_queues);
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss *bss;
+ struct ieee80211_channel *channel;
+
+ sdata_assert_lock(sdata);
+
+ channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq);
+ if (!channel)
+ return;
+
+ bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+ channel);
+ if (bss) {
+ sdata->vif.bss_conf.beacon_rate = bss->beacon_rate;
+ ieee80211_rx_bss_put(local, bss);
+ }
+}
+
+
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ struct ieee80211_if_managed *ifmgd;
+ struct ieee80211_rx_status *rx_status = (void *) skb->cb;
+ size_t baselen, len = skb->len;
+ struct ieee802_11_elems elems;
+
+ ifmgd = &sdata->u.mgd;
+
+ sdata_assert_lock(sdata);
+
+ if (!ether_addr_equal(mgmt->da, sdata->vif.addr))
+ return; /* ignore ProbeResp to foreign address */
+
+ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+ false, &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+
+ if (ifmgd->associated &&
+ ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
+ ieee80211_reset_ap_probe(sdata);
+
+ if (ifmgd->auth_data && !ifmgd->auth_data->bss->proberesp_ies &&
+ ether_addr_equal(mgmt->bssid, ifmgd->auth_data->bss->bssid)) {
+ /* got probe response, continue with auth */
+ sdata_info(sdata, "direct probe responded\n");
+ ifmgd->auth_data->tries = 0;
+ ifmgd->auth_data->timeout = jiffies;
+ ifmgd->auth_data->timeout_started = true;
+ run_again(sdata, ifmgd->auth_data->timeout);
+ }
+}
+
+/*
+ * This is the canonical list of information elements we care about,
+ * the filter code also gives us all changes to the Microsoft OUI
+ * (00:50:F2) vendor IE which is used for WMM which we need to track,
+ * as well as the DTPC IE (part of the Cisco OUI) used for signaling
+ * changes to requested client power.
+ *
+ * We implement beacon filtering in software since that means we can
+ * avoid processing the frame here and in cfg80211, and userspace
+ * will not be able to tell whether the hardware supports it or not.
+ *
+ * XXX: This list needs to be dynamic -- userspace needs to be able to
+ * add items it requires. It also needs to be able to tell us to
+ * look out for other vendor IEs.
+ */
+static const u64 care_about_ies =
+ (1ULL << WLAN_EID_COUNTRY) |
+ (1ULL << WLAN_EID_ERP_INFO) |
+ (1ULL << WLAN_EID_CHANNEL_SWITCH) |
+ (1ULL << WLAN_EID_PWR_CONSTRAINT) |
+ (1ULL << WLAN_EID_HT_CAPABILITY) |
+ (1ULL << WLAN_EID_HT_OPERATION) |
+ (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN);
+
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+ size_t baselen;
+ struct ieee802_11_elems elems;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_channel *chan;
+ struct sta_info *sta;
+ u32 changed = 0;
+ bool erp_valid;
+ u8 erp_value = 0;
+ u32 ncrc;
+ u8 *bssid;
+ u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ sdata_assert_lock(sdata);
+
+ /* Process beacon from the current BSS */
+ baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (rx_status->freq != chanctx_conf->def.chan->center_freq) {
+ rcu_read_unlock();
+ return;
+ }
+ chan = chanctx_conf->def.chan;
+ rcu_read_unlock();
+
+ if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon &&
+ ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) {
+ ieee802_11_parse_elems(mgmt->u.beacon.variable,
+ len - baselen, false, &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+ if (elems.tim && !elems.parse_error) {
+ const struct ieee80211_tim_ie *tim_ie = elems.tim;
+ ifmgd->dtim_period = tim_ie->dtim_period;
+ }
+ ifmgd->have_beacon = true;
+ ifmgd->assoc_data->need_beacon = false;
+ if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ sdata->vif.bss_conf.sync_tsf =
+ le64_to_cpu(mgmt->u.beacon.timestamp);
+ sdata->vif.bss_conf.sync_device_ts =
+ rx_status->device_timestamp;
+ if (elems.tim)
+ sdata->vif.bss_conf.sync_dtim_count =
+ elems.tim->dtim_count;
+ else
+ sdata->vif.bss_conf.sync_dtim_count = 0;
+ }
+ /* continue assoc process */
+ ifmgd->assoc_data->timeout = jiffies;
+ ifmgd->assoc_data->timeout_started = true;
+ run_again(sdata, ifmgd->assoc_data->timeout);
+ return;
+ }
+
+ if (!ifmgd->associated ||
+ !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
+ return;
+ bssid = ifmgd->associated->bssid;
+
+ /* Track average RSSI from the Beacon frames of the current AP */
+ ifmgd->last_beacon_signal = rx_status->signal;
+ if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
+ ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
+ ifmgd->ave_beacon_signal = rx_status->signal * 16;
+ ifmgd->last_cqm_event_signal = 0;
+ ifmgd->count_beacon_signal = 1;
+ ifmgd->last_ave_beacon_signal = 0;
+ } else {
+ ifmgd->ave_beacon_signal =
+ (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
+ (16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
+ ifmgd->ave_beacon_signal) / 16;
+ ifmgd->count_beacon_signal++;
+ }
+
+ if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold &&
+ ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) {
+ int sig = ifmgd->ave_beacon_signal;
+ int last_sig = ifmgd->last_ave_beacon_signal;
+ struct ieee80211_event event = {
+ .type = RSSI_EVENT,
+ };
+
+ /*
+ * if signal crosses either of the boundaries, invoke callback
+ * with appropriate parameters
+ */
+ if (sig > ifmgd->rssi_max_thold &&
+ (last_sig <= ifmgd->rssi_min_thold || last_sig == 0)) {
+ ifmgd->last_ave_beacon_signal = sig;
+ event.u.rssi.data = RSSI_EVENT_HIGH;
+ drv_event_callback(local, sdata, &event);
+ } else if (sig < ifmgd->rssi_min_thold &&
+ (last_sig >= ifmgd->rssi_max_thold ||
+ last_sig == 0)) {
+ ifmgd->last_ave_beacon_signal = sig;
+ event.u.rssi.data = RSSI_EVENT_LOW;
+ drv_event_callback(local, sdata, &event);
+ }
+ }
+
+ if (bss_conf->cqm_rssi_thold &&
+ ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
+ !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) {
+ int sig = ifmgd->ave_beacon_signal / 16;
+ int last_event = ifmgd->last_cqm_event_signal;
+ int thold = bss_conf->cqm_rssi_thold;
+ int hyst = bss_conf->cqm_rssi_hyst;
+ if (sig < thold &&
+ (last_event == 0 || sig < last_event - hyst)) {
+ ifmgd->last_cqm_event_signal = sig;
+ ieee80211_cqm_rssi_notify(
+ &sdata->vif,
+ NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
+ GFP_KERNEL);
+ } else if (sig > thold &&
+ (last_event == 0 || sig > last_event + hyst)) {
+ ifmgd->last_cqm_event_signal = sig;
+ ieee80211_cqm_rssi_notify(
+ &sdata->vif,
+ NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
+ GFP_KERNEL);
+ }
+ }
+
+ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) {
+ mlme_dbg_ratelimited(sdata,
+ "cancelling AP probe due to a received beacon\n");
+ ieee80211_reset_ap_probe(sdata);
+ }
+
+ /*
+ * Push the beacon loss detection into the future since
+ * we are processing a beacon from the AP just now.
+ */
+ ieee80211_sta_reset_beacon_monitor(sdata);
+
+ ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
+ ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
+ len - baselen, false, &elems,
+ care_about_ies, ncrc);
+
+ if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
+ bool directed_tim = ieee80211_check_tim(elems.tim,
+ elems.tim_len,
+ ifmgd->aid);
+ if (directed_tim) {
+ if (local->hw.conf.dynamic_ps_timeout > 0) {
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local,
+ IEEE80211_CONF_CHANGE_PS);
+ }
+ ieee80211_send_nullfunc(local, sdata, 0);
+ } else if (!local->pspolling && sdata->u.mgd.powersave) {
+ local->pspolling = true;
+
+ /*
+ * Here is assumed that the driver will be
+ * able to send ps-poll frame and receive a
+ * response even though power save mode is
+ * enabled, but some drivers might require
+ * to disable power save here. This needs
+ * to be investigated.
+ */
+ ieee80211_send_pspoll(local, sdata);
+ }
+ }
+ }
+
+ if (sdata->vif.p2p) {
+ struct ieee80211_p2p_noa_attr noa = {};
+ int ret;
+
+ ret = cfg80211_get_p2p_attr(mgmt->u.beacon.variable,
+ len - baselen,
+ IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
+ (u8 *) &noa, sizeof(noa));
+ if (ret >= 2) {
+ if (sdata->u.mgd.p2p_noa_index != noa.index) {
+ /* valid noa_attr and index changed */
+ sdata->u.mgd.p2p_noa_index = noa.index;
+ memcpy(&bss_conf->p2p_noa_attr, &noa, sizeof(noa));
+ changed |= BSS_CHANGED_P2P_PS;
+ /*
+ * make sure we update all information, the CRC
+ * mechanism doesn't look at P2P attributes.
+ */
+ ifmgd->beacon_crc_valid = false;
+ }
+ } else if (sdata->u.mgd.p2p_noa_index != -1) {
+ /* noa_attr not found and we had valid noa_attr before */
+ sdata->u.mgd.p2p_noa_index = -1;
+ memset(&bss_conf->p2p_noa_attr, 0, sizeof(bss_conf->p2p_noa_attr));
+ changed |= BSS_CHANGED_P2P_PS;
+ ifmgd->beacon_crc_valid = false;
+ }
+ }
+
+ if (ifmgd->csa_waiting_bcn)
+ ieee80211_chswitch_post_beacon(sdata);
+
+ /*
+ * Update beacon timing and dtim count on every beacon appearance. This
+ * will allow the driver to use the most updated values. Do it before
+ * comparing this one with last received beacon.
+ * IMPORTANT: These parameters would possibly be out of sync by the time
+ * the driver will use them. The synchronized view is currently
+ * guaranteed only in certain callbacks.
+ */
+ if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ sdata->vif.bss_conf.sync_tsf =
+ le64_to_cpu(mgmt->u.beacon.timestamp);
+ sdata->vif.bss_conf.sync_device_ts =
+ rx_status->device_timestamp;
+ if (elems.tim)
+ sdata->vif.bss_conf.sync_dtim_count =
+ elems.tim->dtim_count;
+ else
+ sdata->vif.bss_conf.sync_dtim_count = 0;
+ }
+
+ if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
+ return;
+ ifmgd->beacon_crc = ncrc;
+ ifmgd->beacon_crc_valid = true;
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+
+ ieee80211_sta_process_chanswitch(sdata, rx_status->mactime,
+ rx_status->device_timestamp,
+ &elems, true);
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) &&
+ ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
+ elems.wmm_param_len))
+ changed |= BSS_CHANGED_QOS;
+
+ /*
+ * If we haven't had a beacon before, tell the driver about the
+ * DTIM period (and beacon timing if desired) now.
+ */
+ if (!ifmgd->have_beacon) {
+ /* a few bogus AP send dtim_period = 0 or no TIM IE */
+ if (elems.tim)
+ bss_conf->dtim_period = elems.tim->dtim_period ?: 1;
+ else
+ bss_conf->dtim_period = 1;
+
+ changed |= BSS_CHANGED_BEACON_INFO;
+ ifmgd->have_beacon = true;
+
+ mutex_lock(&local->iflist_mtx);
+ ieee80211_recalc_ps(local, -1);
+ mutex_unlock(&local->iflist_mtx);
+
+ ieee80211_recalc_ps_vif(sdata);
+ }
+
+ if (elems.erp_info) {
+ erp_valid = true;
+ erp_value = elems.erp_info[0];
+ } else {
+ erp_valid = false;
+ }
+ changed |= ieee80211_handle_bss_capability(sdata,
+ le16_to_cpu(mgmt->u.beacon.capab_info),
+ erp_valid, erp_value);
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get(sdata, bssid);
+
+ if (ieee80211_config_bw(sdata, sta,
+ elems.ht_cap_elem, elems.ht_operation,
+ elems.vht_operation, bssid, &changed)) {
+ mutex_unlock(&local->sta_mtx);
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_DEAUTH_LEAVING,
+ true, deauth_buf);
+ ieee80211_report_disconnect(sdata, deauth_buf,
+ sizeof(deauth_buf), true,
+ WLAN_REASON_DEAUTH_LEAVING);
+ return;
+ }
+
+ if (sta && elems.opmode_notif)
+ ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif,
+ rx_status->band, true);
+ mutex_unlock(&local->sta_mtx);
+
+ changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt,
+ elems.country_elem,
+ elems.country_elem_len,
+ elems.pwr_constr_elem,
+ elems.cisco_dtpc_elem);
+
+ ieee80211_bss_info_change_notify(sdata, changed);
+}
+
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+ struct ieee802_11_elems elems;
+ int ies_len;
+
+ rx_status = (struct ieee80211_rx_status *) skb->cb;
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ sdata_lock(sdata);
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, rx_status);
+ break;
+ case IEEE80211_STYPE_PROBE_RESP:
+ ieee80211_rx_mgmt_probe_resp(sdata, skb);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_DEAUTH:
+ ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_DISASSOC:
+ ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ASSOC_RESP:
+ case IEEE80211_STYPE_REASSOC_RESP:
+ ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ACTION:
+ if (mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) {
+ ies_len = skb->len -
+ offsetof(struct ieee80211_mgmt,
+ u.action.u.chan_switch.variable);
+
+ if (ies_len < 0)
+ break;
+
+ ieee802_11_parse_elems(
+ mgmt->u.action.u.chan_switch.variable,
+ ies_len, true, &elems);
+
+ if (elems.parse_error)
+ break;
+
+ ieee80211_sta_process_chanswitch(sdata,
+ rx_status->mactime,
+ rx_status->device_timestamp,
+ &elems, false);
+ } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) {
+ ies_len = skb->len -
+ offsetof(struct ieee80211_mgmt,
+ u.action.u.ext_chan_switch.variable);
+
+ if (ies_len < 0)
+ break;
+
+ ieee802_11_parse_elems(
+ mgmt->u.action.u.ext_chan_switch.variable,
+ ies_len, true, &elems);
+
+ if (elems.parse_error)
+ break;
+
+ /* for the handling code pretend this was also an IE */
+ elems.ext_chansw_ie =
+ &mgmt->u.action.u.ext_chan_switch.data;
+
+ ieee80211_sta_process_chanswitch(sdata,
+ rx_status->mactime,
+ rx_status->device_timestamp,
+ &elems, false);
+ }
+ break;
+ }
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_sta_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata,
+ u8 *bssid, u8 reason, bool tx)
+{
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason,
+ tx, frame_buf);
+
+ ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
+ reason);
+}
+
+static int ieee80211_probe_auth(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data;
+ u32 tx_flags = 0;
+
+ sdata_assert_lock(sdata);
+
+ if (WARN_ON_ONCE(!auth_data))
+ return -EINVAL;
+
+ auth_data->tries++;
+
+ if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) {
+ sdata_info(sdata, "authentication with %pM timed out\n",
+ auth_data->bss->bssid);
+
+ /*
+ * Most likely AP is not in the range so remove the
+ * bss struct for that AP.
+ */
+ cfg80211_unlink_bss(local->hw.wiphy, auth_data->bss);
+
+ return -ETIMEDOUT;
+ }
+
+ drv_mgd_prepare_tx(local, sdata);
+
+ if (auth_data->bss->proberesp_ies) {
+ u16 trans = 1;
+ u16 status = 0;
+
+ sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
+ auth_data->bss->bssid, auth_data->tries,
+ IEEE80211_AUTH_MAX_TRIES);
+
+ auth_data->expected_transaction = 2;
+
+ if (auth_data->algorithm == WLAN_AUTH_SAE) {
+ trans = auth_data->sae_trans;
+ status = auth_data->sae_status;
+ auth_data->expected_transaction = trans;
+ }
+
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_MLME_CONN_TX;
+
+ ieee80211_send_auth(sdata, trans, auth_data->algorithm, status,
+ auth_data->data, auth_data->data_len,
+ auth_data->bss->bssid,
+ auth_data->bss->bssid, NULL, 0, 0,
+ tx_flags);
+ } else {
+ const u8 *ssidie;
+
+ sdata_info(sdata, "direct probe to %pM (try %d/%i)\n",
+ auth_data->bss->bssid, auth_data->tries,
+ IEEE80211_AUTH_MAX_TRIES);
+
+ rcu_read_lock();
+ ssidie = ieee80211_bss_get_ie(auth_data->bss, WLAN_EID_SSID);
+ if (!ssidie) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ /*
+ * Direct probe is sent to broadcast address as some APs
+ * will not answer to direct packet in unassociated state.
+ */
+ ieee80211_send_probe_req(sdata, sdata->vif.addr, NULL,
+ ssidie + 2, ssidie[1],
+ NULL, 0, (u32) -1, true, 0,
+ auth_data->bss->channel, false);
+ rcu_read_unlock();
+ }
+
+ if (tx_flags == 0) {
+ auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT;
+ auth_data->timeout_started = true;
+ run_again(sdata, auth_data->timeout);
+ } else {
+ auth_data->timeout =
+ round_jiffies_up(jiffies + IEEE80211_AUTH_TIMEOUT_LONG);
+ auth_data->timeout_started = true;
+ run_again(sdata, auth_data->timeout);
+ }
+
+ return 0;
+}
+
+static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
+ struct ieee80211_local *local = sdata->local;
+
+ sdata_assert_lock(sdata);
+
+ assoc_data->tries++;
+ if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) {
+ sdata_info(sdata, "association with %pM timed out\n",
+ assoc_data->bss->bssid);
+
+ /*
+ * Most likely AP is not in the range so remove the
+ * bss struct for that AP.
+ */
+ cfg80211_unlink_bss(local->hw.wiphy, assoc_data->bss);
+
+ return -ETIMEDOUT;
+ }
+
+ sdata_info(sdata, "associate with %pM (try %d/%d)\n",
+ assoc_data->bss->bssid, assoc_data->tries,
+ IEEE80211_ASSOC_MAX_TRIES);
+ ieee80211_send_assoc(sdata);
+
+ if (!(local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) {
+ assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT;
+ assoc_data->timeout_started = true;
+ run_again(sdata, assoc_data->timeout);
+ } else {
+ assoc_data->timeout =
+ round_jiffies_up(jiffies +
+ IEEE80211_ASSOC_TIMEOUT_LONG);
+ assoc_data->timeout_started = true;
+ run_again(sdata, assoc_data->timeout);
+ }
+
+ return 0;
+}
+
+void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata,
+ __le16 fc, bool acked)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ sdata->u.mgd.status_fc = fc;
+ sdata->u.mgd.status_acked = acked;
+ sdata->u.mgd.status_received = true;
+
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ sdata_lock(sdata);
+
+ if (ifmgd->status_received) {
+ __le16 fc = ifmgd->status_fc;
+ bool status_acked = ifmgd->status_acked;
+
+ ifmgd->status_received = false;
+ if (ifmgd->auth_data &&
+ (ieee80211_is_probe_req(fc) || ieee80211_is_auth(fc))) {
+ if (status_acked) {
+ ifmgd->auth_data->timeout =
+ jiffies + IEEE80211_AUTH_TIMEOUT_SHORT;
+ run_again(sdata, ifmgd->auth_data->timeout);
+ } else {
+ ifmgd->auth_data->timeout = jiffies - 1;
+ }
+ ifmgd->auth_data->timeout_started = true;
+ } else if (ifmgd->assoc_data &&
+ (ieee80211_is_assoc_req(fc) ||
+ ieee80211_is_reassoc_req(fc))) {
+ if (status_acked) {
+ ifmgd->assoc_data->timeout =
+ jiffies + IEEE80211_ASSOC_TIMEOUT_SHORT;
+ run_again(sdata, ifmgd->assoc_data->timeout);
+ } else {
+ ifmgd->assoc_data->timeout = jiffies - 1;
+ }
+ ifmgd->assoc_data->timeout_started = true;
+ }
+ }
+
+ if (ifmgd->auth_data && ifmgd->auth_data->timeout_started &&
+ time_after(jiffies, ifmgd->auth_data->timeout)) {
+ if (ifmgd->auth_data->done) {
+ /*
+ * ok ... we waited for assoc but userspace didn't,
+ * so let's just kill the auth data
+ */
+ ieee80211_destroy_auth_data(sdata, false);
+ } else if (ieee80211_probe_auth(sdata)) {
+ u8 bssid[ETH_ALEN];
+ struct ieee80211_event event = {
+ .type = MLME_EVENT,
+ .u.mlme.data = AUTH_EVENT,
+ .u.mlme.status = MLME_TIMEOUT,
+ };
+
+ memcpy(bssid, ifmgd->auth_data->bss->bssid, ETH_ALEN);
+
+ ieee80211_destroy_auth_data(sdata, false);
+
+ cfg80211_auth_timeout(sdata->dev, bssid);
+ drv_event_callback(sdata->local, sdata, &event);
+ }
+ } else if (ifmgd->auth_data && ifmgd->auth_data->timeout_started)
+ run_again(sdata, ifmgd->auth_data->timeout);
+
+ if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started &&
+ time_after(jiffies, ifmgd->assoc_data->timeout)) {
+ if ((ifmgd->assoc_data->need_beacon && !ifmgd->have_beacon) ||
+ ieee80211_do_assoc(sdata)) {
+ struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
+ struct ieee80211_event event = {
+ .type = MLME_EVENT,
+ .u.mlme.data = ASSOC_EVENT,
+ .u.mlme.status = MLME_TIMEOUT,
+ };
+
+ ieee80211_destroy_assoc_data(sdata, false);
+ cfg80211_assoc_timeout(sdata->dev, bss);
+ drv_event_callback(sdata->local, sdata, &event);
+ }
+ } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started)
+ run_again(sdata, ifmgd->assoc_data->timeout);
+
+ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL &&
+ ifmgd->associated) {
+ u8 bssid[ETH_ALEN];
+ int max_tries;
+
+ memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN);
+
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ max_tries = max_nullfunc_tries;
+ else
+ max_tries = max_probe_tries;
+
+ /* ACK received for nullfunc probing frame */
+ if (!ifmgd->probe_send_count)
+ ieee80211_reset_ap_probe(sdata);
+ else if (ifmgd->nullfunc_failed) {
+ if (ifmgd->probe_send_count < max_tries) {
+ mlme_dbg(sdata,
+ "No ack for nullfunc frame to AP %pM, try %d/%i\n",
+ bssid, ifmgd->probe_send_count,
+ max_tries);
+ ieee80211_mgd_probe_ap_send(sdata);
+ } else {
+ mlme_dbg(sdata,
+ "No ack for nullfunc frame to AP %pM, disconnecting.\n",
+ bssid);
+ ieee80211_sta_connection_lost(sdata, bssid,
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+ false);
+ }
+ } else if (time_is_after_jiffies(ifmgd->probe_timeout))
+ run_again(sdata, ifmgd->probe_timeout);
+ else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ mlme_dbg(sdata,
+ "Failed to send nullfunc to AP %pM after %dms, disconnecting\n",
+ bssid, probe_wait_ms);
+ ieee80211_sta_connection_lost(sdata, bssid,
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false);
+ } else if (ifmgd->probe_send_count < max_tries) {
+ mlme_dbg(sdata,
+ "No probe response from AP %pM after %dms, try %d/%i\n",
+ bssid, probe_wait_ms,
+ ifmgd->probe_send_count, max_tries);
+ ieee80211_mgd_probe_ap_send(sdata);
+ } else {
+ /*
+ * We actually lost the connection ... or did we?
+ * Let's make sure!
+ */
+ wiphy_debug(local->hw.wiphy,
+ "%s: No probe response from AP %pM"
+ " after %dms, disconnecting.\n",
+ sdata->name,
+ bssid, probe_wait_ms);
+
+ ieee80211_sta_connection_lost(sdata, bssid,
+ WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false);
+ }
+ }
+
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_sta_bcn_mon_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
+ return;
+
+ sdata->u.mgd.connection_loss = false;
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.beacon_connection_loss_work);
+}
+
+static void ieee80211_sta_conn_mon_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = sdata->local;
+
+ if (sdata->vif.csa_active && !ifmgd->csa_waiting_bcn)
+ return;
+
+ ieee80211_queue_work(&local->hw, &ifmgd->monitor_work);
+}
+
+static void ieee80211_sta_monitor_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.monitor_work);
+
+ ieee80211_mgd_probe_ap(sdata, false);
+}
+
+static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
+{
+ u32 flags;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ __ieee80211_stop_poll(sdata);
+
+ /* let's probe the connection once */
+ flags = sdata->local->hw.flags;
+ if (!(flags & IEEE80211_HW_CONNECTION_MONITOR))
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.monitor_work);
+ /* and do all the other regular work too */
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ }
+}
+
+#ifdef CONFIG_PM
+void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ sdata_lock(sdata);
+
+ if (ifmgd->auth_data || ifmgd->assoc_data) {
+ const u8 *bssid = ifmgd->auth_data ?
+ ifmgd->auth_data->bss->bssid :
+ ifmgd->assoc_data->bss->bssid;
+
+ /*
+ * If we are trying to authenticate / associate while suspending,
+ * cfg80211 won't know and won't actually abort those attempts,
+ * thus we need to do that ourselves.
+ */
+ ieee80211_send_deauth_disassoc(sdata, bssid,
+ IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_DEAUTH_LEAVING,
+ false, frame_buf);
+ if (ifmgd->assoc_data)
+ ieee80211_destroy_assoc_data(sdata, false);
+ if (ifmgd->auth_data)
+ ieee80211_destroy_auth_data(sdata, false);
+ cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf,
+ IEEE80211_DEAUTH_FRAME_LEN);
+ }
+
+ /* This is a bit of a hack - we should find a better and more generic
+ * solution to this. Normally when suspending, cfg80211 will in fact
+ * deauthenticate. However, it doesn't (and cannot) stop an ongoing
+ * auth (not so important) or assoc (this is the problem) process.
+ *
+ * As a consequence, it can happen that we are in the process of both
+ * associating and suspending, and receive an association response
+ * after cfg80211 has checked if it needs to disconnect, but before
+ * we actually set the flag to drop incoming frames. This will then
+ * cause the workqueue flush to process the association response in
+ * the suspend, resulting in a successful association just before it
+ * tries to remove the interface from the driver, which now though
+ * has a channel context assigned ... this results in issues.
+ *
+ * To work around this (for now) simply deauth here again if we're
+ * now connected.
+ */
+ if (ifmgd->associated && !sdata->local->wowlan) {
+ u8 bssid[ETH_ALEN];
+ struct cfg80211_deauth_request req = {
+ .reason_code = WLAN_REASON_DEAUTH_LEAVING,
+ .bssid = bssid,
+ };
+
+ memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN);
+ ieee80211_mgd_deauth(sdata, &req);
+ }
+
+ sdata_unlock(sdata);
+}
+
+void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ sdata_lock(sdata);
+ if (!ifmgd->associated) {
+ sdata_unlock(sdata);
+ return;
+ }
+
+ if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) {
+ sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME;
+ mlme_dbg(sdata, "driver requested disconnect after resume\n");
+ ieee80211_sta_connection_lost(sdata,
+ ifmgd->associated->bssid,
+ WLAN_REASON_UNSPECIFIED,
+ true);
+ sdata_unlock(sdata);
+ return;
+ }
+ sdata_unlock(sdata);
+}
+#endif
+
+/* interface setup */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd;
+
+ ifmgd = &sdata->u.mgd;
+ INIT_WORK(&ifmgd->monitor_work, ieee80211_sta_monitor_work);
+ INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
+ INIT_WORK(&ifmgd->beacon_connection_loss_work,
+ ieee80211_beacon_connection_loss_work);
+ INIT_WORK(&ifmgd->csa_connection_drop_work,
+ ieee80211_csa_connection_drop_work);
+ INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_mgd_work);
+ INIT_DELAYED_WORK(&ifmgd->tdls_peer_del_work,
+ ieee80211_tdls_peer_del_work);
+ setup_timer(&ifmgd->timer, ieee80211_sta_timer,
+ (unsigned long) sdata);
+ setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer,
+ (unsigned long) sdata);
+ setup_timer(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer,
+ (unsigned long) sdata);
+ setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer,
+ (unsigned long) sdata);
+ INIT_DELAYED_WORK(&ifmgd->tx_tspec_wk,
+ ieee80211_sta_handle_tspec_ac_params_wk);
+
+ ifmgd->flags = 0;
+ ifmgd->powersave = sdata->wdev.ps;
+ ifmgd->uapsd_queues = sdata->local->hw.uapsd_queues;
+ ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len;
+ ifmgd->p2p_noa_index = -1;
+
+ if (sdata->local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS)
+ ifmgd->req_smps = IEEE80211_SMPS_AUTOMATIC;
+ else
+ ifmgd->req_smps = IEEE80211_SMPS_OFF;
+
+ /* Setup TDLS data */
+ spin_lock_init(&ifmgd->teardown_lock);
+ ifmgd->teardown_skb = NULL;
+ ifmgd->orig_teardown_skb = NULL;
+}
+
+/* scan finished notification */
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ /* Restart STA timers */
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (ieee80211_sdata_running(sdata))
+ ieee80211_restart_sta_timer(sdata);
+ }
+ rcu_read_unlock();
+}
+
+int ieee80211_max_network_latency(struct notifier_block *nb,
+ unsigned long data, void *dummy)
+{
+ s32 latency_usec = (s32) data;
+ struct ieee80211_local *local =
+ container_of(nb, struct ieee80211_local,
+ network_latency_notifier);
+
+ mutex_lock(&local->iflist_mtx);
+ ieee80211_recalc_ps(local, latency_usec);
+ mutex_unlock(&local->iflist_mtx);
+
+ return NOTIFY_OK;
+}
+
+static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ const u8 *ht_cap_ie, *vht_cap_ie;
+ const struct ieee80211_ht_cap *ht_cap;
+ const struct ieee80211_vht_cap *vht_cap;
+ u8 chains = 1;
+
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HT)
+ return chains;
+
+ ht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY);
+ if (ht_cap_ie && ht_cap_ie[1] >= sizeof(*ht_cap)) {
+ ht_cap = (void *)(ht_cap_ie + 2);
+ chains = ieee80211_mcs_to_chains(&ht_cap->mcs);
+ /*
+ * TODO: use "Tx Maximum Number Spatial Streams Supported" and
+ * "Tx Unequal Modulation Supported" fields.
+ */
+ }
+
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
+ return chains;
+
+ vht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_CAPABILITY);
+ if (vht_cap_ie && vht_cap_ie[1] >= sizeof(*vht_cap)) {
+ u8 nss;
+ u16 tx_mcs_map;
+
+ vht_cap = (void *)(vht_cap_ie + 2);
+ tx_mcs_map = le16_to_cpu(vht_cap->supp_mcs.tx_mcs_map);
+ for (nss = 8; nss > 0; nss--) {
+ if (((tx_mcs_map >> (2 * (nss - 1))) & 3) !=
+ IEEE80211_VHT_MCS_NOT_SUPPORTED)
+ break;
+ }
+ /* TODO: use "Tx Highest Supported Long GI Data Rate" field? */
+ chains = max(chains, nss);
+ }
+
+ return chains;
+}
+
+static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ const struct ieee80211_ht_cap *ht_cap = NULL;
+ const struct ieee80211_ht_operation *ht_oper = NULL;
+ const struct ieee80211_vht_operation *vht_oper = NULL;
+ struct ieee80211_supported_band *sband;
+ struct cfg80211_chan_def chandef;
+ int ret;
+
+ sband = local->hw.wiphy->bands[cbss->channel->band];
+
+ ifmgd->flags &= ~(IEEE80211_STA_DISABLE_40MHZ |
+ IEEE80211_STA_DISABLE_80P80MHZ |
+ IEEE80211_STA_DISABLE_160MHZ);
+
+ rcu_read_lock();
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
+ sband->ht_cap.ht_supported) {
+ const u8 *ht_oper_ie, *ht_cap_ie;
+
+ ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION);
+ if (ht_oper_ie && ht_oper_ie[1] >= sizeof(*ht_oper))
+ ht_oper = (void *)(ht_oper_ie + 2);
+
+ ht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY);
+ if (ht_cap_ie && ht_cap_ie[1] >= sizeof(*ht_cap))
+ ht_cap = (void *)(ht_cap_ie + 2);
+
+ if (!ht_cap) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ht_oper = NULL;
+ }
+ }
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
+ sband->vht_cap.vht_supported) {
+ const u8 *vht_oper_ie, *vht_cap;
+
+ vht_oper_ie = ieee80211_bss_get_ie(cbss,
+ WLAN_EID_VHT_OPERATION);
+ if (vht_oper_ie && vht_oper_ie[1] >= sizeof(*vht_oper))
+ vht_oper = (void *)(vht_oper_ie + 2);
+ if (vht_oper && !ht_oper) {
+ vht_oper = NULL;
+ sdata_info(sdata,
+ "AP advertised VHT without HT, disabling both\n");
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ }
+
+ vht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_CAPABILITY);
+ if (!vht_cap || vht_cap[1] < sizeof(struct ieee80211_vht_cap)) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ vht_oper = NULL;
+ }
+ }
+
+ ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
+ cbss->channel,
+ ht_cap, ht_oper, vht_oper,
+ &chandef, false);
+
+ sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
+ local->rx_chains);
+
+ rcu_read_unlock();
+
+ /* will change later if needed */
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+
+ mutex_lock(&local->mtx);
+ /*
+ * If this fails (possibly due to channel context sharing
+ * on incompatible channels, e.g. 80+80 and 160 sharing the
+ * same control channel) try to use a smaller bandwidth.
+ */
+ ret = ieee80211_vif_use_channel(sdata, &chandef,
+ IEEE80211_CHANCTX_SHARED);
+
+ /* don't downgrade for 5 and 10 MHz channels, though. */
+ if (chandef.width == NL80211_CHAN_WIDTH_5 ||
+ chandef.width == NL80211_CHAN_WIDTH_10)
+ goto out;
+
+ while (ret && chandef.width != NL80211_CHAN_WIDTH_20_NOHT) {
+ ifmgd->flags |= ieee80211_chandef_downgrade(&chandef);
+ ret = ieee80211_vif_use_channel(sdata, &chandef,
+ IEEE80211_CHANCTX_SHARED);
+ }
+ out:
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_bss *cbss, bool assoc)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_bss *bss = (void *)cbss->priv;
+ struct sta_info *new_sta = NULL;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sta_ht_cap sta_ht_cap;
+ bool have_sta = false, is_override = false;
+ int err;
+
+ sband = local->hw.wiphy->bands[cbss->channel->band];
+
+ if (WARN_ON(!ifmgd->auth_data && !ifmgd->assoc_data))
+ return -EINVAL;
+
+ if (assoc) {
+ rcu_read_lock();
+ have_sta = sta_info_get(sdata, cbss->bssid);
+ rcu_read_unlock();
+ }
+
+ if (!have_sta) {
+ new_sta = sta_info_alloc(sdata, cbss->bssid, GFP_KERNEL);
+ if (!new_sta)
+ return -ENOMEM;
+ }
+
+ memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
+ ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
+
+ is_override = (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) !=
+ (sband->ht_cap.cap &
+ IEEE80211_HT_CAP_SUP_WIDTH_20_40);
+
+ if (new_sta || is_override) {
+ err = ieee80211_prep_channel(sdata, cbss);
+ if (err) {
+ if (new_sta)
+ sta_info_free(local, new_sta);
+ return -EINVAL;
+ }
+ }
+
+ if (new_sta) {
+ u32 rates = 0, basic_rates = 0;
+ bool have_higher_than_11mbit;
+ int min_rate = INT_MAX, min_rate_index = -1;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ const struct cfg80211_bss_ies *ies;
+ int shift = ieee80211_vif_get_shift(&sdata->vif);
+ u32 rate_flags;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ sta_info_free(local, new_sta);
+ return -EINVAL;
+ }
+ rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ ieee80211_get_rates(sband, bss->supp_rates,
+ bss->supp_rates_len,
+ &rates, &basic_rates,
+ &have_higher_than_11mbit,
+ &min_rate, &min_rate_index,
+ shift, rate_flags);
+
+ /*
+ * This used to be a workaround for basic rates missing
+ * in the association response frame. Now that we no
+ * longer use the basic rates from there, it probably
+ * doesn't happen any more, but keep the workaround so
+ * in case some *other* APs are buggy in different ways
+ * we can connect -- with a warning.
+ */
+ if (!basic_rates && min_rate_index >= 0) {
+ sdata_info(sdata,
+ "No basic rates, using min rate instead\n");
+ basic_rates = BIT(min_rate_index);
+ }
+
+ new_sta->sta.supp_rates[cbss->channel->band] = rates;
+ sdata->vif.bss_conf.basic_rates = basic_rates;
+
+ /* cf. IEEE 802.11 9.2.12 */
+ if (cbss->channel->band == IEEE80211_BAND_2GHZ &&
+ have_higher_than_11mbit)
+ sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+ else
+ sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+ memcpy(ifmgd->bssid, cbss->bssid, ETH_ALEN);
+
+ /* set timing information */
+ sdata->vif.bss_conf.beacon_int = cbss->beacon_interval;
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->beacon_ies);
+ if (ies) {
+ const u8 *tim_ie;
+
+ sdata->vif.bss_conf.sync_tsf = ies->tsf;
+ sdata->vif.bss_conf.sync_device_ts =
+ bss->device_ts_beacon;
+ tim_ie = cfg80211_find_ie(WLAN_EID_TIM,
+ ies->data, ies->len);
+ if (tim_ie && tim_ie[1] >= 2)
+ sdata->vif.bss_conf.sync_dtim_count = tim_ie[2];
+ else
+ sdata->vif.bss_conf.sync_dtim_count = 0;
+ } else if (!(local->hw.flags &
+ IEEE80211_HW_TIMING_BEACON_ONLY)) {
+ ies = rcu_dereference(cbss->proberesp_ies);
+ /* must be non-NULL since beacon IEs were NULL */
+ sdata->vif.bss_conf.sync_tsf = ies->tsf;
+ sdata->vif.bss_conf.sync_device_ts =
+ bss->device_ts_presp;
+ sdata->vif.bss_conf.sync_dtim_count = 0;
+ } else {
+ sdata->vif.bss_conf.sync_tsf = 0;
+ sdata->vif.bss_conf.sync_device_ts = 0;
+ sdata->vif.bss_conf.sync_dtim_count = 0;
+ }
+ rcu_read_unlock();
+
+ /* tell driver about BSSID, basic rates and timing */
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES |
+ BSS_CHANGED_BEACON_INT);
+
+ if (assoc)
+ sta_info_pre_move_state(new_sta, IEEE80211_STA_AUTH);
+
+ err = sta_info_insert(new_sta);
+ new_sta = NULL;
+ if (err) {
+ sdata_info(sdata,
+ "failed to insert STA entry for the AP (error %d)\n",
+ err);
+ return err;
+ }
+ } else
+ WARN_ON_ONCE(!ether_addr_equal(ifmgd->bssid, cbss->bssid));
+
+ /* Cancel scan to ensure that nothing interferes with connection */
+ if (local->scanning)
+ ieee80211_scan_cancel(local);
+
+ return 0;
+}
+
+/* config hooks */
+int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_auth_request *req)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_mgd_auth_data *auth_data;
+ u16 auth_alg;
+ int err;
+
+ /* prepare auth data structure */
+
+ switch (req->auth_type) {
+ case NL80211_AUTHTYPE_OPEN_SYSTEM:
+ auth_alg = WLAN_AUTH_OPEN;
+ break;
+ case NL80211_AUTHTYPE_SHARED_KEY:
+ if (IS_ERR(local->wep_tx_tfm))
+ return -EOPNOTSUPP;
+ auth_alg = WLAN_AUTH_SHARED_KEY;
+ break;
+ case NL80211_AUTHTYPE_FT:
+ auth_alg = WLAN_AUTH_FT;
+ break;
+ case NL80211_AUTHTYPE_NETWORK_EAP:
+ auth_alg = WLAN_AUTH_LEAP;
+ break;
+ case NL80211_AUTHTYPE_SAE:
+ auth_alg = WLAN_AUTH_SAE;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ auth_data = kzalloc(sizeof(*auth_data) + req->sae_data_len +
+ req->ie_len, GFP_KERNEL);
+ if (!auth_data)
+ return -ENOMEM;
+
+ auth_data->bss = req->bss;
+
+ if (req->sae_data_len >= 4) {
+ __le16 *pos = (__le16 *) req->sae_data;
+ auth_data->sae_trans = le16_to_cpu(pos[0]);
+ auth_data->sae_status = le16_to_cpu(pos[1]);
+ memcpy(auth_data->data, req->sae_data + 4,
+ req->sae_data_len - 4);
+ auth_data->data_len += req->sae_data_len - 4;
+ }
+
+ if (req->ie && req->ie_len) {
+ memcpy(&auth_data->data[auth_data->data_len],
+ req->ie, req->ie_len);
+ auth_data->data_len += req->ie_len;
+ }
+
+ if (req->key && req->key_len) {
+ auth_data->key_len = req->key_len;
+ auth_data->key_idx = req->key_idx;
+ memcpy(auth_data->key, req->key, req->key_len);
+ }
+
+ auth_data->algorithm = auth_alg;
+
+ /* try to authenticate/probe */
+
+ if ((ifmgd->auth_data && !ifmgd->auth_data->done) ||
+ ifmgd->assoc_data) {
+ err = -EBUSY;
+ goto err_free;
+ }
+
+ if (ifmgd->auth_data)
+ ieee80211_destroy_auth_data(sdata, false);
+
+ /* prep auth_data so we don't go into idle on disassoc */
+ ifmgd->auth_data = auth_data;
+
+ if (ifmgd->associated) {
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_UNSPECIFIED,
+ false, frame_buf);
+
+ ieee80211_report_disconnect(sdata, frame_buf,
+ sizeof(frame_buf), true,
+ WLAN_REASON_UNSPECIFIED);
+ }
+
+ sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid);
+
+ err = ieee80211_prep_connection(sdata, req->bss, false);
+ if (err)
+ goto err_clear;
+
+ err = ieee80211_probe_auth(sdata);
+ if (err) {
+ sta_info_destroy_addr(sdata, req->bss->bssid);
+ goto err_clear;
+ }
+
+ /* hold our own reference */
+ cfg80211_ref_bss(local->hw.wiphy, auth_data->bss);
+ return 0;
+
+ err_clear:
+ eth_zero_addr(ifmgd->bssid);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+ ifmgd->auth_data = NULL;
+ err_free:
+ kfree(auth_data);
+ return err;
+}
+
+static bool ieee80211_usable_wmm_params(struct ieee80211_sub_if_data *sdata,
+ const u8 *wmm_param, int len)
+{
+ const u8 *pos;
+ size_t left;
+
+ if (len < 8)
+ return false;
+
+ if (wmm_param[5] != 1 /* version */)
+ return false;
+
+ pos = wmm_param + 8;
+ left = len - 8;
+
+ for (; left >= 4; left -= 4, pos += 4) {
+ u8 aifsn = pos[0] & 0x0f;
+ u8 ecwmin = pos[1] & 0x0f;
+ u8 ecwmax = (pos[1] & 0xf0) >> 4;
+ int aci = (pos[0] >> 5) & 0x03;
+
+ if (aifsn < 2) {
+ sdata_info(sdata,
+ "AP has invalid WMM params (AIFSN=%d for ACI %d), disabling WMM\n",
+ aifsn, aci);
+ return false;
+ }
+ if (ecwmin > ecwmax) {
+ sdata_info(sdata,
+ "AP has invalid WMM params (ECWmin/max=%d/%d for ACI %d), disabling WMM\n",
+ ecwmin, ecwmax, aci);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_assoc_request *req)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_bss *bss = (void *)req->bss->priv;
+ struct ieee80211_mgd_assoc_data *assoc_data;
+ const struct cfg80211_bss_ies *beacon_ies;
+ struct ieee80211_supported_band *sband;
+ const u8 *ssidie, *ht_ie, *vht_ie;
+ int i, err;
+
+ assoc_data = kzalloc(sizeof(*assoc_data) + req->ie_len, GFP_KERNEL);
+ if (!assoc_data)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID);
+ if (!ssidie) {
+ rcu_read_unlock();
+ kfree(assoc_data);
+ return -EINVAL;
+ }
+ memcpy(assoc_data->ssid, ssidie + 2, ssidie[1]);
+ assoc_data->ssid_len = ssidie[1];
+ rcu_read_unlock();
+
+ if (ifmgd->associated) {
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_UNSPECIFIED,
+ false, frame_buf);
+
+ ieee80211_report_disconnect(sdata, frame_buf,
+ sizeof(frame_buf), true,
+ WLAN_REASON_UNSPECIFIED);
+ }
+
+ if (ifmgd->auth_data && !ifmgd->auth_data->done) {
+ err = -EBUSY;
+ goto err_free;
+ }
+
+ if (ifmgd->assoc_data) {
+ err = -EBUSY;
+ goto err_free;
+ }
+
+ if (ifmgd->auth_data) {
+ bool match;
+
+ /* keep sta info, bssid if matching */
+ match = ether_addr_equal(ifmgd->bssid, req->bss->bssid);
+ ieee80211_destroy_auth_data(sdata, match);
+ }
+
+ /* prepare assoc data */
+
+ ifmgd->beacon_crc_valid = false;
+
+ assoc_data->wmm = bss->wmm_used &&
+ (local->hw.queues >= IEEE80211_NUM_ACS);
+ if (assoc_data->wmm) {
+ /* try to check validity of WMM params IE */
+ const struct cfg80211_bss_ies *ies;
+ const u8 *wp, *start, *end;
+
+ rcu_read_lock();
+ ies = rcu_dereference(req->bss->ies);
+ start = ies->data;
+ end = start + ies->len;
+
+ while (true) {
+ wp = cfg80211_find_vendor_ie(
+ WLAN_OUI_MICROSOFT,
+ WLAN_OUI_TYPE_MICROSOFT_WMM,
+ start, end - start);
+ if (!wp)
+ break;
+ start = wp + wp[1] + 2;
+ /* if this IE is too short, try the next */
+ if (wp[1] <= 4)
+ continue;
+ /* if this IE is WMM params, we found what we wanted */
+ if (wp[6] == 1)
+ break;
+ }
+
+ if (!wp || !ieee80211_usable_wmm_params(sdata, wp + 2,
+ wp[1] - 2)) {
+ assoc_data->wmm = false;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_WMM;
+ }
+ rcu_read_unlock();
+ }
+
+ /*
+ * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode.
+ * We still associate in non-HT mode (11a/b/g) if any one of these
+ * ciphers is configured as pairwise.
+ * We can set this to true for non-11n hardware, that'll be checked
+ * separately along with the peer capabilities.
+ */
+ for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) {
+ if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 ||
+ req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP ||
+ req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ netdev_info(sdata->dev,
+ "disabling HT/VHT due to WEP/TKIP use\n");
+ }
+ }
+
+ if (req->flags & ASSOC_REQ_DISABLE_HT) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ }
+
+ if (req->flags & ASSOC_REQ_DISABLE_VHT)
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
+ /* Also disable HT if we don't support it or the AP doesn't use WMM */
+ sband = local->hw.wiphy->bands[req->bss->channel->band];
+ if (!sband->ht_cap.ht_supported ||
+ local->hw.queues < IEEE80211_NUM_ACS || !bss->wmm_used ||
+ ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ if (!bss->wmm_used &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_WMM))
+ netdev_info(sdata->dev,
+ "disabling HT as WMM/QoS is not supported by the AP\n");
+ }
+
+ /* disable VHT if we don't support it or the AP doesn't use WMM */
+ if (!sband->vht_cap.vht_supported ||
+ local->hw.queues < IEEE80211_NUM_ACS || !bss->wmm_used ||
+ ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ if (!bss->wmm_used &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_WMM))
+ netdev_info(sdata->dev,
+ "disabling VHT as WMM/QoS is not supported by the AP\n");
+ }
+
+ memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa));
+ memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask,
+ sizeof(ifmgd->ht_capa_mask));
+
+ memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa));
+ memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask,
+ sizeof(ifmgd->vht_capa_mask));
+
+ if (req->ie && req->ie_len) {
+ memcpy(assoc_data->ie, req->ie, req->ie_len);
+ assoc_data->ie_len = req->ie_len;
+ }
+
+ assoc_data->bss = req->bss;
+
+ if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) {
+ if (ifmgd->powersave)
+ sdata->smps_mode = IEEE80211_SMPS_DYNAMIC;
+ else
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ } else
+ sdata->smps_mode = ifmgd->req_smps;
+
+ assoc_data->capability = req->bss->capability;
+ assoc_data->supp_rates = bss->supp_rates;
+ assoc_data->supp_rates_len = bss->supp_rates_len;
+
+ rcu_read_lock();
+ ht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_OPERATION);
+ if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation))
+ assoc_data->ap_ht_param =
+ ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param;
+ else
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY);
+ if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap))
+ memcpy(&assoc_data->ap_vht_cap, vht_ie + 2,
+ sizeof(struct ieee80211_vht_cap));
+ else
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ rcu_read_unlock();
+
+ if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) &&
+ (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK),
+ "U-APSD not supported with HW_PS_NULLFUNC_STACK\n"))
+ sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD;
+
+ if (bss->wmm_used && bss->uapsd_supported &&
+ (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD)) {
+ assoc_data->uapsd = true;
+ ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED;
+ } else {
+ assoc_data->uapsd = false;
+ ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED;
+ }
+
+ if (req->prev_bssid)
+ memcpy(assoc_data->prev_bssid, req->prev_bssid, ETH_ALEN);
+
+ if (req->use_mfp) {
+ ifmgd->mfp = IEEE80211_MFP_REQUIRED;
+ ifmgd->flags |= IEEE80211_STA_MFP_ENABLED;
+ } else {
+ ifmgd->mfp = IEEE80211_MFP_DISABLED;
+ ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED;
+ }
+
+ if (req->flags & ASSOC_REQ_USE_RRM)
+ ifmgd->flags |= IEEE80211_STA_ENABLE_RRM;
+ else
+ ifmgd->flags &= ~IEEE80211_STA_ENABLE_RRM;
+
+ if (req->crypto.control_port)
+ ifmgd->flags |= IEEE80211_STA_CONTROL_PORT;
+ else
+ ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT;
+
+ sdata->control_port_protocol = req->crypto.control_port_ethertype;
+ sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
+ sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto,
+ sdata->vif.type);
+
+ /* kick off associate process */
+
+ ifmgd->assoc_data = assoc_data;
+ ifmgd->dtim_period = 0;
+ ifmgd->have_beacon = false;
+
+ err = ieee80211_prep_connection(sdata, req->bss, true);
+ if (err)
+ goto err_clear;
+
+ rcu_read_lock();
+ beacon_ies = rcu_dereference(req->bss->beacon_ies);
+
+ if (sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_BEFORE_ASSOC &&
+ !beacon_ies) {
+ /*
+ * Wait up to one beacon interval ...
+ * should this be more if we miss one?
+ */
+ sdata_info(sdata, "waiting for beacon from %pM\n",
+ ifmgd->bssid);
+ assoc_data->timeout = TU_TO_EXP_TIME(req->bss->beacon_interval);
+ assoc_data->timeout_started = true;
+ assoc_data->need_beacon = true;
+ } else if (beacon_ies) {
+ const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM,
+ beacon_ies->data,
+ beacon_ies->len);
+ u8 dtim_count = 0;
+
+ if (tim_ie && tim_ie[1] >= sizeof(struct ieee80211_tim_ie)) {
+ const struct ieee80211_tim_ie *tim;
+ tim = (void *)(tim_ie + 2);
+ ifmgd->dtim_period = tim->dtim_period;
+ dtim_count = tim->dtim_count;
+ }
+ ifmgd->have_beacon = true;
+ assoc_data->timeout = jiffies;
+ assoc_data->timeout_started = true;
+
+ if (local->hw.flags & IEEE80211_HW_TIMING_BEACON_ONLY) {
+ sdata->vif.bss_conf.sync_tsf = beacon_ies->tsf;
+ sdata->vif.bss_conf.sync_device_ts =
+ bss->device_ts_beacon;
+ sdata->vif.bss_conf.sync_dtim_count = dtim_count;
+ }
+ } else {
+ assoc_data->timeout = jiffies;
+ assoc_data->timeout_started = true;
+ }
+ rcu_read_unlock();
+
+ run_again(sdata, assoc_data->timeout);
+
+ if (bss->corrupt_data) {
+ char *corrupt_type = "data";
+ if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_BEACON) {
+ if (bss->corrupt_data &
+ IEEE80211_BSS_CORRUPT_PROBE_RESP)
+ corrupt_type = "beacon and probe response";
+ else
+ corrupt_type = "beacon";
+ } else if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_PROBE_RESP)
+ corrupt_type = "probe response";
+ sdata_info(sdata, "associating with AP with corrupt %s\n",
+ corrupt_type);
+ }
+
+ return 0;
+ err_clear:
+ eth_zero_addr(ifmgd->bssid);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
+ ifmgd->assoc_data = NULL;
+ err_free:
+ kfree(assoc_data);
+ return err;
+}
+
+int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_deauth_request *req)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+ bool tx = !req->local_state_change;
+
+ if (ifmgd->auth_data &&
+ ether_addr_equal(ifmgd->auth_data->bss->bssid, req->bssid)) {
+ sdata_info(sdata,
+ "aborting authentication with %pM by local choice (Reason: %u=%s)\n",
+ req->bssid, req->reason_code,
+ ieee80211_get_reason_code_string(req->reason_code));
+
+ drv_mgd_prepare_tx(sdata->local, sdata);
+ ieee80211_send_deauth_disassoc(sdata, req->bssid,
+ IEEE80211_STYPE_DEAUTH,
+ req->reason_code, tx,
+ frame_buf);
+ ieee80211_destroy_auth_data(sdata, false);
+ ieee80211_report_disconnect(sdata, frame_buf,
+ sizeof(frame_buf), true,
+ req->reason_code);
+
+ return 0;
+ }
+
+ if (ifmgd->associated &&
+ ether_addr_equal(ifmgd->associated->bssid, req->bssid)) {
+ sdata_info(sdata,
+ "deauthenticating from %pM by local choice (Reason: %u=%s)\n",
+ req->bssid, req->reason_code,
+ ieee80211_get_reason_code_string(req->reason_code));
+
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
+ req->reason_code, tx, frame_buf);
+ ieee80211_report_disconnect(sdata, frame_buf,
+ sizeof(frame_buf), true,
+ req->reason_code);
+ return 0;
+ }
+
+ return -ENOTCONN;
+}
+
+int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_disassoc_request *req)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 bssid[ETH_ALEN];
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+ /*
+ * cfg80211 should catch this ... but it's racy since
+ * we can receive a disassoc frame, process it, hand it
+ * to cfg80211 while that's in a locked section already
+ * trying to tell us that the user wants to disconnect.
+ */
+ if (ifmgd->associated != req->bss)
+ return -ENOLINK;
+
+ sdata_info(sdata,
+ "disassociating from %pM by local choice (Reason: %u=%s)\n",
+ req->bss->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code));
+
+ memcpy(bssid, req->bss->bssid, ETH_ALEN);
+ ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC,
+ req->reason_code, !req->local_state_change,
+ frame_buf);
+
+ ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
+ req->reason_code);
+
+ return 0;
+}
+
+void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ /*
+ * Make sure some work items will not run after this,
+ * they will not do anything but might not have been
+ * cancelled when disconnecting.
+ */
+ cancel_work_sync(&ifmgd->monitor_work);
+ cancel_work_sync(&ifmgd->beacon_connection_loss_work);
+ cancel_work_sync(&ifmgd->request_smps_work);
+ cancel_work_sync(&ifmgd->csa_connection_drop_work);
+ cancel_work_sync(&ifmgd->chswitch_work);
+ cancel_delayed_work_sync(&ifmgd->tdls_peer_del_work);
+
+ sdata_lock(sdata);
+ if (ifmgd->assoc_data) {
+ struct cfg80211_bss *bss = ifmgd->assoc_data->bss;
+ ieee80211_destroy_assoc_data(sdata, false);
+ cfg80211_assoc_timeout(sdata->dev, bss);
+ }
+ if (ifmgd->auth_data)
+ ieee80211_destroy_auth_data(sdata, false);
+ spin_lock_bh(&ifmgd->teardown_lock);
+ if (ifmgd->teardown_skb) {
+ kfree_skb(ifmgd->teardown_skb);
+ ifmgd->teardown_skb = NULL;
+ ifmgd->orig_teardown_skb = NULL;
+ }
+ spin_unlock_bh(&ifmgd->teardown_lock);
+ del_timer_sync(&ifmgd->timer);
+ sdata_unlock(sdata);
+}
+
+void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
+ enum nl80211_cqm_rssi_threshold_event rssi_event,
+ gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ trace_api_cqm_rssi_notify(sdata, rssi_event);
+
+ cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
+}
+EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
+
+void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ trace_api_cqm_beacon_loss_notify(sdata->local, sdata);
+
+ cfg80211_cqm_beacon_loss_notify(sdata->dev, gfp);
+}
+EXPORT_SYMBOL(ieee80211_cqm_beacon_loss_notify);
diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c
new file mode 100644
index 000000000..358d5f9d8
--- /dev/null
+++ b/net/mac80211/ocb.c
@@ -0,0 +1,250 @@
+/*
+ * OCB mode implementation
+ *
+ * Copyright: (c) 2014 Czech Technical University in Prague
+ * (c) 2014 Volkswagen Group Research
+ * Author: Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
+ * Funded by: Volkswagen Group Research
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+
+#define IEEE80211_OCB_HOUSEKEEPING_INTERVAL (60 * HZ)
+#define IEEE80211_OCB_PEER_INACTIVITY_LIMIT (240 * HZ)
+#define IEEE80211_OCB_MAX_STA_ENTRIES 128
+
+/**
+ * enum ocb_deferred_task_flags - mac80211 OCB deferred tasks
+ * @OCB_WORK_HOUSEKEEPING: run the periodic OCB housekeeping tasks
+ *
+ * These flags are used in @wrkq_flags field of &struct ieee80211_if_ocb
+ */
+enum ocb_deferred_task_flags {
+ OCB_WORK_HOUSEKEEPING,
+};
+
+void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const u8 *addr,
+ u32 supp_rates)
+{
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_supported_band *sband;
+ enum nl80211_bss_scan_width scan_width;
+ struct sta_info *sta;
+ int band;
+
+ /* XXX: Consider removing the least recently used entry and
+ * allow new one to be added.
+ */
+ if (local->num_sta >= IEEE80211_OCB_MAX_STA_ENTRIES) {
+ net_info_ratelimited("%s: No room for a new OCB STA entry %pM\n",
+ sdata->name, addr);
+ return;
+ }
+
+ ocb_dbg(sdata, "Adding new OCB station %pM\n", addr);
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON_ONCE(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+ band = chanctx_conf->def.chan->band;
+ scan_width = cfg80211_chandef_to_scan_width(&chanctx_conf->def);
+ rcu_read_unlock();
+
+ sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+ if (!sta)
+ return;
+
+ sta->last_rx = jiffies;
+
+ /* Add only mandatory rates for now */
+ sband = local->hw.wiphy->bands[band];
+ sta->sta.supp_rates[band] =
+ ieee80211_mandatory_rates(sband, scan_width);
+
+ spin_lock(&ifocb->incomplete_lock);
+ list_add(&sta->list, &ifocb->incomplete_stations);
+ spin_unlock(&ifocb->incomplete_lock);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+static struct sta_info *ieee80211_ocb_finish_sta(struct sta_info *sta)
+ __acquires(RCU)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u8 addr[ETH_ALEN];
+
+ memcpy(addr, sta->sta.addr, ETH_ALEN);
+
+ ocb_dbg(sdata, "Adding new IBSS station %pM (dev=%s)\n",
+ addr, sdata->name);
+
+ sta_info_move_state(sta, IEEE80211_STA_AUTH);
+ sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED);
+
+ rate_control_rate_init(sta);
+
+ /* If it fails, maybe we raced another insertion? */
+ if (sta_info_insert_rcu(sta))
+ return sta_info_get(sdata, addr);
+ return sta;
+}
+
+static void ieee80211_ocb_housekeeping(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+
+ ocb_dbg(sdata, "Running ocb housekeeping\n");
+
+ ieee80211_sta_expire(sdata, IEEE80211_OCB_PEER_INACTIVITY_LIMIT);
+
+ mod_timer(&ifocb->housekeeping_timer,
+ round_jiffies(jiffies + IEEE80211_OCB_HOUSEKEEPING_INTERVAL));
+}
+
+void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+ struct sta_info *sta;
+
+ if (ifocb->joined != true)
+ return;
+
+ sdata_lock(sdata);
+
+ spin_lock_bh(&ifocb->incomplete_lock);
+ while (!list_empty(&ifocb->incomplete_stations)) {
+ sta = list_first_entry(&ifocb->incomplete_stations,
+ struct sta_info, list);
+ list_del(&sta->list);
+ spin_unlock_bh(&ifocb->incomplete_lock);
+
+ ieee80211_ocb_finish_sta(sta);
+ rcu_read_unlock();
+ spin_lock_bh(&ifocb->incomplete_lock);
+ }
+ spin_unlock_bh(&ifocb->incomplete_lock);
+
+ if (test_and_clear_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags))
+ ieee80211_ocb_housekeeping(sdata);
+
+ sdata_unlock(sdata);
+}
+
+static void ieee80211_ocb_housekeeping_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata = (void *)data;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+
+ set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags);
+
+ ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+
+ setup_timer(&ifocb->housekeeping_timer,
+ ieee80211_ocb_housekeeping_timer,
+ (unsigned long)sdata);
+ INIT_LIST_HEAD(&ifocb->incomplete_stations);
+ spin_lock_init(&ifocb->incomplete_lock);
+}
+
+int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
+ struct ocb_setup *setup)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+ u32 changed = BSS_CHANGED_OCB;
+ int err;
+
+ if (ifocb->joined == true)
+ return -EINVAL;
+
+ sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
+ sdata->needed_rx_chains = sdata->local->rx_chains;
+
+ mutex_lock(&sdata->local->mtx);
+ err = ieee80211_vif_use_channel(sdata, &setup->chandef,
+ IEEE80211_CHANCTX_SHARED);
+ mutex_unlock(&sdata->local->mtx);
+ if (err)
+ return err;
+
+ ieee80211_bss_info_change_notify(sdata, changed);
+
+ ifocb->joined = true;
+
+ set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+
+ netif_carrier_on(sdata->dev);
+ return 0;
+}
+
+int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ ifocb->joined = false;
+ sta_info_flush(sdata);
+
+ spin_lock_bh(&ifocb->incomplete_lock);
+ while (!list_empty(&ifocb->incomplete_stations)) {
+ sta = list_first_entry(&ifocb->incomplete_stations,
+ struct sta_info, list);
+ list_del(&sta->list);
+ spin_unlock_bh(&ifocb->incomplete_lock);
+
+ sta_info_free(local, sta);
+ spin_lock_bh(&ifocb->incomplete_lock);
+ }
+ spin_unlock_bh(&ifocb->incomplete_lock);
+
+ netif_carrier_off(sdata->dev);
+ clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_OCB);
+
+ mutex_lock(&sdata->local->mtx);
+ ieee80211_vif_release_channel(sdata);
+ mutex_unlock(&sdata->local->mtx);
+
+ skb_queue_purge(&sdata->skb_queue);
+
+ del_timer_sync(&sdata->u.ocb.housekeeping_timer);
+ /* If the timer fired while we waited for it, it will have
+ * requeued the work. Now the work will be running again
+ * but will not rearm the timer again because it checks
+ * whether we are connected to the network or not -- at this
+ * point we shouldn't be anymore.
+ */
+
+ return 0;
+}
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
new file mode 100644
index 000000000..683f0e3cb
--- /dev/null
+++ b/net/mac80211/offchannel.c
@@ -0,0 +1,502 @@
+/*
+ * Off-channel operation helpers
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+
+/*
+ * Tell our hardware to disable PS.
+ * Optionally inform AP that we will go to sleep so that it will buffer
+ * the frames while we are doing off-channel work. This is optional
+ * because we *may* be doing work on-operating channel, and want our
+ * hardware unconditionally awake, but still let the AP send us normal frames.
+ */
+static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ local->offchannel_ps_enabled = false;
+
+ /* FIXME: what to do when local->pspolling is true? */
+
+ del_timer_sync(&local->dynamic_ps_timer);
+ del_timer_sync(&ifmgd->bcn_mon_timer);
+ del_timer_sync(&ifmgd->conn_mon_timer);
+
+ cancel_work_sync(&local->dynamic_ps_enable_work);
+
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ local->offchannel_ps_enabled = true;
+ local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ }
+
+ if (!local->offchannel_ps_enabled ||
+ !(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK))
+ /*
+ * If power save was enabled, no need to send a nullfunc
+ * frame because AP knows that we are sleeping. But if the
+ * hardware is creating the nullfunc frame for power save
+ * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not
+ * enabled) and power save was enabled, the firmware just
+ * sent a null frame with power save disabled. So we need
+ * to send a new nullfunc frame to inform the AP that we
+ * are again sleeping.
+ */
+ ieee80211_send_nullfunc(local, sdata, 1);
+}
+
+/* inform AP that we are awake again, unless power save is enabled */
+static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ if (!local->ps_sdata)
+ ieee80211_send_nullfunc(local, sdata, 0);
+ else if (local->offchannel_ps_enabled) {
+ /*
+ * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware
+ * will send a nullfunc frame with the powersave bit set
+ * even though the AP already knows that we are sleeping.
+ * This could be avoided by sending a null frame with power
+ * save bit disabled before enabling the power save, but
+ * this doesn't gain anything.
+ *
+ * When IEEE80211_HW_PS_NULLFUNC_STACK is enabled, no need
+ * to send a nullfunc frame because AP already knows that
+ * we are sleeping, let's just enable power save mode in
+ * hardware.
+ */
+ /* TODO: Only set hardware if CONF_PS changed?
+ * TODO: Should we set offchannel_ps_enabled to false?
+ */
+ local->hw.conf.flags |= IEEE80211_CONF_PS;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ } else if (local->hw.conf.dynamic_ps_timeout > 0) {
+ /*
+ * If IEEE80211_CONF_PS was not set and the dynamic_ps_timer
+ * had been running before leaving the operating channel,
+ * restart the timer now and send a nullfunc frame to inform
+ * the AP that we are awake.
+ */
+ ieee80211_send_nullfunc(local, sdata, 0);
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+ }
+
+ ieee80211_sta_reset_beacon_monitor(sdata);
+ ieee80211_sta_reset_conn_monitor(sdata);
+}
+
+void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (WARN_ON(local->use_chanctx))
+ return;
+
+ /*
+ * notify the AP about us leaving the channel and stop all
+ * STA interfaces.
+ */
+
+ /*
+ * Stop queues and transmit all frames queued by the driver
+ * before sending nullfunc to enable powersave at the AP.
+ */
+ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL,
+ false);
+ ieee80211_flush_queues(local, NULL, false);
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE)
+ continue;
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+
+ /* Check to see if we should disable beaconing. */
+ if (sdata->vif.bss_conf.enable_beacon) {
+ set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
+ &sdata->state);
+ sdata->vif.bss_conf.enable_beacon = false;
+ ieee80211_bss_info_change_notify(
+ sdata, BSS_CHANGED_BEACON_ENABLED);
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ sdata->u.mgd.associated)
+ ieee80211_offchannel_ps_enable(sdata);
+ }
+ mutex_unlock(&local->iflist_mtx);
+}
+
+void ieee80211_offchannel_return(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+
+ if (WARN_ON(local->use_chanctx))
+ return;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE)
+ continue;
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ /* Tell AP we're back */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ sdata->u.mgd.associated)
+ ieee80211_offchannel_ps_disable(sdata);
+
+ if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED,
+ &sdata->state)) {
+ sdata->vif.bss_conf.enable_beacon = true;
+ ieee80211_bss_info_change_notify(
+ sdata, BSS_CHANGED_BEACON_ENABLED);
+ }
+ }
+ mutex_unlock(&local->iflist_mtx);
+
+ ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL,
+ false);
+}
+
+void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc)
+{
+ if (roc->notified)
+ return;
+
+ if (roc->mgmt_tx_cookie) {
+ if (!WARN_ON(!roc->frame)) {
+ ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7,
+ roc->chan->band);
+ roc->frame = NULL;
+ }
+ } else {
+ cfg80211_ready_on_channel(&roc->sdata->wdev, roc->cookie,
+ roc->chan, roc->req_duration,
+ GFP_KERNEL);
+ }
+
+ roc->notified = true;
+}
+
+static void ieee80211_hw_roc_start(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, hw_roc_start);
+ struct ieee80211_roc_work *roc, *dep, *tmp;
+
+ mutex_lock(&local->mtx);
+
+ if (list_empty(&local->roc_list))
+ goto out_unlock;
+
+ roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
+ list);
+
+ if (!roc->started)
+ goto out_unlock;
+
+ roc->hw_begun = true;
+ roc->hw_start_time = local->hw_roc_start_time;
+
+ ieee80211_handle_roc_started(roc);
+ list_for_each_entry_safe(dep, tmp, &roc->dependents, list) {
+ ieee80211_handle_roc_started(dep);
+
+ if (dep->duration > roc->duration) {
+ u32 dur = dep->duration;
+ dep->duration = dur - roc->duration;
+ roc->duration = dur;
+ list_move(&dep->list, &roc->list);
+ }
+ }
+ out_unlock:
+ mutex_unlock(&local->mtx);
+}
+
+void ieee80211_ready_on_channel(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ local->hw_roc_start_time = jiffies;
+
+ trace_api_ready_on_channel(local);
+
+ ieee80211_queue_work(hw, &local->hw_roc_start);
+}
+EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel);
+
+void ieee80211_start_next_roc(struct ieee80211_local *local)
+{
+ struct ieee80211_roc_work *roc;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (list_empty(&local->roc_list)) {
+ ieee80211_run_deferred_scan(local);
+ return;
+ }
+
+ roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
+ list);
+
+ if (WARN_ON_ONCE(roc->started))
+ return;
+
+ if (local->ops->remain_on_channel) {
+ int ret, duration = roc->duration;
+
+ /* XXX: duplicated, see ieee80211_start_roc_work() */
+ if (!duration)
+ duration = 10;
+
+ ret = drv_remain_on_channel(local, roc->sdata, roc->chan,
+ duration, roc->type);
+
+ roc->started = true;
+
+ if (ret) {
+ wiphy_warn(local->hw.wiphy,
+ "failed to start next HW ROC (%d)\n", ret);
+ /*
+ * queue the work struct again to avoid recursion
+ * when multiple failures occur
+ */
+ ieee80211_remain_on_channel_expired(&local->hw);
+ }
+ } else {
+ /* delay it a bit */
+ ieee80211_queue_delayed_work(&local->hw, &roc->work,
+ round_jiffies_relative(HZ/2));
+ }
+}
+
+void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free)
+{
+ struct ieee80211_roc_work *dep, *tmp;
+
+ if (WARN_ON(roc->to_be_freed))
+ return;
+
+ /* was never transmitted */
+ if (roc->frame) {
+ cfg80211_mgmt_tx_status(&roc->sdata->wdev,
+ (unsigned long)roc->frame,
+ roc->frame->data, roc->frame->len,
+ false, GFP_KERNEL);
+ kfree_skb(roc->frame);
+ }
+
+ if (!roc->mgmt_tx_cookie)
+ cfg80211_remain_on_channel_expired(&roc->sdata->wdev,
+ roc->cookie, roc->chan,
+ GFP_KERNEL);
+
+ list_for_each_entry_safe(dep, tmp, &roc->dependents, list)
+ ieee80211_roc_notify_destroy(dep, true);
+
+ if (free)
+ kfree(roc);
+ else
+ roc->to_be_freed = true;
+}
+
+void ieee80211_sw_roc_work(struct work_struct *work)
+{
+ struct ieee80211_roc_work *roc =
+ container_of(work, struct ieee80211_roc_work, work.work);
+ struct ieee80211_sub_if_data *sdata = roc->sdata;
+ struct ieee80211_local *local = sdata->local;
+ bool started, on_channel;
+
+ mutex_lock(&local->mtx);
+
+ if (roc->to_be_freed)
+ goto out_unlock;
+
+ if (roc->abort)
+ goto finish;
+
+ if (WARN_ON(list_empty(&local->roc_list)))
+ goto out_unlock;
+
+ if (WARN_ON(roc != list_first_entry(&local->roc_list,
+ struct ieee80211_roc_work,
+ list)))
+ goto out_unlock;
+
+ if (!roc->started) {
+ struct ieee80211_roc_work *dep;
+
+ WARN_ON(local->use_chanctx);
+
+ /* If actually operating on the desired channel (with at least
+ * 20 MHz channel width) don't stop all the operations but still
+ * treat it as though the ROC operation started properly, so
+ * other ROC operations won't interfere with this one.
+ */
+ roc->on_channel = roc->chan == local->_oper_chandef.chan &&
+ local->_oper_chandef.width != NL80211_CHAN_WIDTH_5 &&
+ local->_oper_chandef.width != NL80211_CHAN_WIDTH_10;
+
+ /* start this ROC */
+ ieee80211_recalc_idle(local);
+
+ if (!roc->on_channel) {
+ ieee80211_offchannel_stop_vifs(local);
+
+ local->tmp_channel = roc->chan;
+ ieee80211_hw_config(local, 0);
+ }
+
+ /* tell userspace or send frame */
+ ieee80211_handle_roc_started(roc);
+ list_for_each_entry(dep, &roc->dependents, list)
+ ieee80211_handle_roc_started(dep);
+
+ /* if it was pure TX, just finish right away */
+ if (!roc->duration)
+ goto finish;
+
+ roc->started = true;
+ ieee80211_queue_delayed_work(&local->hw, &roc->work,
+ msecs_to_jiffies(roc->duration));
+ } else {
+ /* finish this ROC */
+ finish:
+ list_del(&roc->list);
+ started = roc->started;
+ on_channel = roc->on_channel;
+ ieee80211_roc_notify_destroy(roc, !roc->abort);
+
+ if (started && !on_channel) {
+ ieee80211_flush_queues(local, NULL, false);
+
+ local->tmp_channel = NULL;
+ ieee80211_hw_config(local, 0);
+
+ ieee80211_offchannel_return(local);
+ }
+
+ ieee80211_recalc_idle(local);
+
+ if (started)
+ ieee80211_start_next_roc(local);
+ else if (list_empty(&local->roc_list))
+ ieee80211_run_deferred_scan(local);
+ }
+
+ out_unlock:
+ mutex_unlock(&local->mtx);
+}
+
+static void ieee80211_hw_roc_done(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, hw_roc_done);
+ struct ieee80211_roc_work *roc;
+
+ mutex_lock(&local->mtx);
+
+ if (list_empty(&local->roc_list))
+ goto out_unlock;
+
+ roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
+ list);
+
+ if (!roc->started)
+ goto out_unlock;
+
+ list_del(&roc->list);
+
+ ieee80211_roc_notify_destroy(roc, true);
+
+ /* if there's another roc, start it now */
+ ieee80211_start_next_roc(local);
+
+ out_unlock:
+ mutex_unlock(&local->mtx);
+}
+
+void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_remain_on_channel_expired(local);
+
+ ieee80211_queue_work(hw, &local->hw_roc_done);
+}
+EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired);
+
+void ieee80211_roc_setup(struct ieee80211_local *local)
+{
+ INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start);
+ INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done);
+ INIT_LIST_HEAD(&local->roc_list);
+}
+
+void ieee80211_roc_purge(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_roc_work *roc, *tmp;
+ LIST_HEAD(tmp_list);
+
+ mutex_lock(&local->mtx);
+ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
+ if (sdata && roc->sdata != sdata)
+ continue;
+
+ if (roc->started && local->ops->remain_on_channel) {
+ /* can race, so ignore return value */
+ drv_cancel_remain_on_channel(local);
+ }
+
+ list_move_tail(&roc->list, &tmp_list);
+ roc->abort = true;
+ }
+ mutex_unlock(&local->mtx);
+
+ list_for_each_entry_safe(roc, tmp, &tmp_list, list) {
+ if (local->ops->remain_on_channel) {
+ list_del(&roc->list);
+ ieee80211_roc_notify_destroy(roc, true);
+ } else {
+ ieee80211_queue_delayed_work(&local->hw, &roc->work, 0);
+
+ /* work will clean up etc */
+ flush_delayed_work(&roc->work);
+ WARN_ON(!roc->to_be_freed);
+ kfree(roc);
+ }
+ }
+
+ WARN_ON_ONCE(!list_empty(&tmp_list));
+}
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
new file mode 100644
index 000000000..ac6ad6238
--- /dev/null
+++ b/net/mac80211/pm.c
@@ -0,0 +1,179 @@
+#include <net/mac80211.h>
+#include <net/rtnetlink.h>
+
+#include "ieee80211_i.h"
+#include "mesh.h"
+#include "driver-ops.h"
+#include "led.h"
+
+int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+
+ if (!local->open_count)
+ goto suspend;
+
+ ieee80211_scan_cancel(local);
+
+ ieee80211_dfs_cac_cancel(local);
+
+ ieee80211_roc_purge(local, NULL);
+
+ ieee80211_del_virtual_monitor(local);
+
+ if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ ieee80211_sta_tear_down_BA_sessions(
+ sta, AGG_STOP_LOCAL_REQUEST);
+ }
+ mutex_unlock(&local->sta_mtx);
+ }
+
+ ieee80211_stop_queues_by_reason(hw,
+ IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ false);
+
+ /* flush out all packets */
+ synchronize_net();
+
+ ieee80211_flush_queues(local, NULL, true);
+
+ local->quiescing = true;
+ /* make quiescing visible to timers everywhere */
+ mb();
+
+ flush_workqueue(local->workqueue);
+
+ /* Don't try to run timers while suspended. */
+ del_timer_sync(&local->sta_cleanup);
+
+ /*
+ * Note that this particular timer doesn't need to be
+ * restarted at resume.
+ */
+ cancel_work_sync(&local->dynamic_ps_enable_work);
+ del_timer_sync(&local->dynamic_ps_timer);
+
+ local->wowlan = wowlan;
+ if (local->wowlan) {
+ int err;
+
+ /* Drivers don't expect to suspend while some operations like
+ * authenticating or associating are in progress. It doesn't
+ * make sense anyway to accept that, since the authentication
+ * or association would never finish since the driver can't do
+ * that on its own.
+ * Thus, clean up in-progress auth/assoc first.
+ */
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ continue;
+ ieee80211_mgd_quiesce(sdata);
+ }
+
+ err = drv_suspend(local, wowlan);
+ if (err < 0) {
+ local->quiescing = false;
+ local->wowlan = false;
+ if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta,
+ &local->sta_list, list) {
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ }
+ mutex_unlock(&local->sta_mtx);
+ }
+ ieee80211_wake_queues_by_reason(hw,
+ IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ false);
+ return err;
+ } else if (err > 0) {
+ WARN_ON(err != 1);
+ /* cfg80211 will call back into mac80211 to disconnect
+ * all interfaces, allow that to proceed properly
+ */
+ ieee80211_wake_queues_by_reason(hw,
+ IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ false);
+ return err;
+ } else {
+ goto suspend;
+ }
+ }
+
+ /* remove all interfaces that were created in the driver */
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ continue;
+ case NL80211_IFTYPE_STATION:
+ ieee80211_mgd_quiesce(sdata);
+ break;
+ case NL80211_IFTYPE_WDS:
+ /* tear down aggregation sessions and remove STAs */
+ mutex_lock(&local->sta_mtx);
+ sta = sdata->u.wds.sta;
+ if (sta && sta->uploaded) {
+ enum ieee80211_sta_state state;
+
+ state = sta->sta_state;
+ for (; state > IEEE80211_STA_NOTEXIST; state--)
+ WARN_ON(drv_sta_state(local, sta->sdata,
+ sta, state,
+ state - 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+ break;
+ default:
+ break;
+ }
+
+ drv_remove_interface(local, sdata);
+ }
+
+ /*
+ * We disconnected on all interfaces before suspend, all channel
+ * contexts should be released.
+ */
+ WARN_ON(!list_empty(&local->chanctx_list));
+
+ /* stop hardware - this must stop RX */
+ if (local->open_count)
+ ieee80211_stop_device(local);
+
+ suspend:
+ local->suspended = true;
+ /* need suspended to be visible before quiescing is false */
+ barrier();
+ local->quiescing = false;
+
+ return 0;
+}
+
+/*
+ * __ieee80211_resume() is a static inline which just calls
+ * ieee80211_reconfig(), which is also needed for hardware
+ * hang/firmware failure/etc. recovery.
+ */
+
+void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif,
+ struct cfg80211_wowlan_wakeup *wakeup,
+ gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ cfg80211_report_wowlan_wakeup(&sdata->wdev, wakeup, gfp);
+}
+EXPORT_SYMBOL(ieee80211_report_wowlan_wakeup);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
new file mode 100644
index 000000000..d53355b01
--- /dev/null
+++ b/net/mac80211/rate.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "rate.h"
+#include "ieee80211_i.h"
+#include "debugfs.h"
+
+struct rate_control_alg {
+ struct list_head list;
+ const struct rate_control_ops *ops;
+};
+
+static LIST_HEAD(rate_ctrl_algs);
+static DEFINE_MUTEX(rate_ctrl_mutex);
+
+static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT;
+module_param(ieee80211_default_rc_algo, charp, 0644);
+MODULE_PARM_DESC(ieee80211_default_rc_algo,
+ "Default rate control algorithm for mac80211 to use");
+
+int ieee80211_rate_control_register(const struct rate_control_ops *ops)
+{
+ struct rate_control_alg *alg;
+
+ if (!ops->name)
+ return -EINVAL;
+
+ mutex_lock(&rate_ctrl_mutex);
+ list_for_each_entry(alg, &rate_ctrl_algs, list) {
+ if (!strcmp(alg->ops->name, ops->name)) {
+ /* don't register an algorithm twice */
+ WARN_ON(1);
+ mutex_unlock(&rate_ctrl_mutex);
+ return -EALREADY;
+ }
+ }
+
+ alg = kzalloc(sizeof(*alg), GFP_KERNEL);
+ if (alg == NULL) {
+ mutex_unlock(&rate_ctrl_mutex);
+ return -ENOMEM;
+ }
+ alg->ops = ops;
+
+ list_add_tail(&alg->list, &rate_ctrl_algs);
+ mutex_unlock(&rate_ctrl_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_rate_control_register);
+
+void ieee80211_rate_control_unregister(const struct rate_control_ops *ops)
+{
+ struct rate_control_alg *alg;
+
+ mutex_lock(&rate_ctrl_mutex);
+ list_for_each_entry(alg, &rate_ctrl_algs, list) {
+ if (alg->ops == ops) {
+ list_del(&alg->list);
+ kfree(alg);
+ break;
+ }
+ }
+ mutex_unlock(&rate_ctrl_mutex);
+}
+EXPORT_SYMBOL(ieee80211_rate_control_unregister);
+
+static const struct rate_control_ops *
+ieee80211_try_rate_control_ops_get(const char *name)
+{
+ struct rate_control_alg *alg;
+ const struct rate_control_ops *ops = NULL;
+
+ if (!name)
+ return NULL;
+
+ mutex_lock(&rate_ctrl_mutex);
+ list_for_each_entry(alg, &rate_ctrl_algs, list) {
+ if (!strcmp(alg->ops->name, name)) {
+ ops = alg->ops;
+ break;
+ }
+ }
+ mutex_unlock(&rate_ctrl_mutex);
+ return ops;
+}
+
+/* Get the rate control algorithm. */
+static const struct rate_control_ops *
+ieee80211_rate_control_ops_get(const char *name)
+{
+ const struct rate_control_ops *ops;
+ const char *alg_name;
+
+ kparam_block_sysfs_write(ieee80211_default_rc_algo);
+ if (!name)
+ alg_name = ieee80211_default_rc_algo;
+ else
+ alg_name = name;
+
+ ops = ieee80211_try_rate_control_ops_get(alg_name);
+ if (!ops && name)
+ /* try default if specific alg requested but not found */
+ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo);
+
+ /* try built-in one if specific alg requested but not found */
+ if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
+ ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
+ kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
+
+ return ops;
+}
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+static ssize_t rcname_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct rate_control_ref *ref = file->private_data;
+ int len = strlen(ref->ops->name);
+
+ return simple_read_from_buffer(userbuf, count, ppos,
+ ref->ops->name, len);
+}
+
+static const struct file_operations rcname_ops = {
+ .read = rcname_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+#endif
+
+static struct rate_control_ref *rate_control_alloc(const char *name,
+ struct ieee80211_local *local)
+{
+ struct dentry *debugfsdir = NULL;
+ struct rate_control_ref *ref;
+
+ ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL);
+ if (!ref)
+ return NULL;
+ ref->local = local;
+ ref->ops = ieee80211_rate_control_ops_get(name);
+ if (!ref->ops)
+ goto free;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir);
+ local->debugfs.rcdir = debugfsdir;
+ debugfs_create_file("name", 0400, debugfsdir, ref, &rcname_ops);
+#endif
+
+ ref->priv = ref->ops->alloc(&local->hw, debugfsdir);
+ if (!ref->priv)
+ goto free;
+ return ref;
+
+free:
+ kfree(ref);
+ return NULL;
+}
+
+static void rate_control_free(struct rate_control_ref *ctrl_ref)
+{
+ ctrl_ref->ops->free(ctrl_ref->priv);
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ debugfs_remove_recursive(ctrl_ref->local->debugfs.rcdir);
+ ctrl_ref->local->debugfs.rcdir = NULL;
+#endif
+
+ kfree(ctrl_ref);
+}
+
+static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc)
+{
+ struct sk_buff *skb = txrc->skb;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ __le16 fc;
+
+ fc = hdr->frame_control;
+
+ return (info->flags & (IEEE80211_TX_CTL_NO_ACK |
+ IEEE80211_TX_CTL_USE_MINRATE)) ||
+ !ieee80211_is_data(fc);
+}
+
+static void rc_send_low_basicrate(s8 *idx, u32 basic_rates,
+ struct ieee80211_supported_band *sband)
+{
+ u8 i;
+
+ if (basic_rates == 0)
+ return; /* assume basic rates unknown and accept rate */
+ if (*idx < 0)
+ return;
+ if (basic_rates & (1 << *idx))
+ return; /* selected rate is a basic rate */
+
+ for (i = *idx + 1; i <= sband->n_bitrates; i++) {
+ if (basic_rates & (1 << i)) {
+ *idx = i;
+ return;
+ }
+ }
+
+ /* could not find a basic rate; use original selection */
+}
+
+static void __rate_control_send_low(struct ieee80211_hw *hw,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_sta *sta,
+ struct ieee80211_tx_info *info,
+ u32 rate_mask)
+{
+ int i;
+ u32 rate_flags =
+ ieee80211_chandef_rate_flags(&hw->conf.chandef);
+
+ if ((sband->band == IEEE80211_BAND_2GHZ) &&
+ (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE))
+ rate_flags |= IEEE80211_RATE_ERP_G;
+
+ info->control.rates[0].idx = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (!(rate_mask & BIT(i)))
+ continue;
+
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ if (!rate_supported(sta, sband->band, i))
+ continue;
+
+ info->control.rates[0].idx = i;
+ break;
+ }
+ WARN_ON_ONCE(i == sband->n_bitrates);
+
+ info->control.rates[0].count =
+ (info->flags & IEEE80211_TX_CTL_NO_ACK) ?
+ 1 : hw->max_rate_tries;
+
+ info->control.skip_table = 1;
+}
+
+
+bool rate_control_send_low(struct ieee80211_sta *pubsta,
+ void *priv_sta,
+ struct ieee80211_tx_rate_control *txrc)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+ struct ieee80211_supported_band *sband = txrc->sband;
+ struct sta_info *sta;
+ int mcast_rate;
+ bool use_basicrate = false;
+
+ if (!pubsta || !priv_sta || rc_no_data_or_no_ack_use_min(txrc)) {
+ __rate_control_send_low(txrc->hw, sband, pubsta, info,
+ txrc->rate_idx_mask);
+
+ if (!pubsta && txrc->bss) {
+ mcast_rate = txrc->bss_conf->mcast_rate[sband->band];
+ if (mcast_rate > 0) {
+ info->control.rates[0].idx = mcast_rate - 1;
+ return true;
+ }
+ use_basicrate = true;
+ } else if (pubsta) {
+ sta = container_of(pubsta, struct sta_info, sta);
+ if (ieee80211_vif_is_mesh(&sta->sdata->vif))
+ use_basicrate = true;
+ }
+
+ if (use_basicrate)
+ rc_send_low_basicrate(&info->control.rates[0].idx,
+ txrc->bss_conf->basic_rates,
+ sband);
+
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(rate_control_send_low);
+
+static bool rate_idx_match_legacy_mask(struct ieee80211_tx_rate *rate,
+ int n_bitrates, u32 mask)
+{
+ int j;
+
+ /* See whether the selected rate or anything below it is allowed. */
+ for (j = rate->idx; j >= 0; j--) {
+ if (mask & (1 << j)) {
+ /* Okay, found a suitable rate. Use it. */
+ rate->idx = j;
+ return true;
+ }
+ }
+
+ /* Try to find a higher rate that would be allowed */
+ for (j = rate->idx + 1; j < n_bitrates; j++) {
+ if (mask & (1 << j)) {
+ /* Okay, found a suitable rate. Use it. */
+ rate->idx = j;
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate,
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN])
+{
+ int i, j;
+ int ridx, rbit;
+
+ ridx = rate->idx / 8;
+ rbit = rate->idx % 8;
+
+ /* sanity check */
+ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN)
+ return false;
+
+ /* See whether the selected rate or anything below it is allowed. */
+ for (i = ridx; i >= 0; i--) {
+ for (j = rbit; j >= 0; j--)
+ if (mcs_mask[i] & BIT(j)) {
+ rate->idx = i * 8 + j;
+ return true;
+ }
+ rbit = 7;
+ }
+
+ /* Try to find a higher rate that would be allowed */
+ ridx = (rate->idx + 1) / 8;
+ rbit = (rate->idx + 1) % 8;
+
+ for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
+ for (j = rbit; j < 8; j++)
+ if (mcs_mask[i] & BIT(j)) {
+ rate->idx = i * 8 + j;
+ return true;
+ }
+ rbit = 0;
+ }
+ return false;
+}
+
+
+
+static void rate_idx_match_mask(struct ieee80211_tx_rate *rate,
+ struct ieee80211_supported_band *sband,
+ enum nl80211_chan_width chan_width,
+ u32 mask,
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN])
+{
+ struct ieee80211_tx_rate alt_rate;
+
+ /* handle HT rates */
+ if (rate->flags & IEEE80211_TX_RC_MCS) {
+ if (rate_idx_match_mcs_mask(rate, mcs_mask))
+ return;
+
+ /* also try the legacy rates. */
+ alt_rate.idx = 0;
+ /* keep protection flags */
+ alt_rate.flags = rate->flags &
+ (IEEE80211_TX_RC_USE_RTS_CTS |
+ IEEE80211_TX_RC_USE_CTS_PROTECT |
+ IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
+ alt_rate.count = rate->count;
+ if (rate_idx_match_legacy_mask(&alt_rate,
+ sband->n_bitrates, mask)) {
+ *rate = alt_rate;
+ return;
+ }
+ } else if (!(rate->flags & IEEE80211_TX_RC_VHT_MCS)) {
+ /* handle legacy rates */
+ if (rate_idx_match_legacy_mask(rate, sband->n_bitrates, mask))
+ return;
+
+ /* if HT BSS, and we handle a data frame, also try HT rates */
+ switch (chan_width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ return;
+ default:
+ break;
+ }
+
+ alt_rate.idx = 0;
+ /* keep protection flags */
+ alt_rate.flags = rate->flags &
+ (IEEE80211_TX_RC_USE_RTS_CTS |
+ IEEE80211_TX_RC_USE_CTS_PROTECT |
+ IEEE80211_TX_RC_USE_SHORT_PREAMBLE);
+ alt_rate.count = rate->count;
+
+ alt_rate.flags |= IEEE80211_TX_RC_MCS;
+
+ if (chan_width == NL80211_CHAN_WIDTH_40)
+ alt_rate.flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+
+ if (rate_idx_match_mcs_mask(&alt_rate, mcs_mask)) {
+ *rate = alt_rate;
+ return;
+ }
+ }
+
+ /*
+ * Uh.. No suitable rate exists. This should not really happen with
+ * sane TX rate mask configurations. However, should someone manage to
+ * configure supported rates and TX rate mask in incompatible way,
+ * allow the frame to be transmitted with whatever the rate control
+ * selected.
+ */
+}
+
+static void rate_fixup_ratelist(struct ieee80211_vif *vif,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_tx_info *info,
+ struct ieee80211_tx_rate *rates,
+ int max_rates)
+{
+ struct ieee80211_rate *rate;
+ bool inval = false;
+ int i;
+
+ /*
+ * Set up the RTS/CTS rate as the fastest basic rate
+ * that is not faster than the data rate unless there
+ * is no basic rate slower than the data rate, in which
+ * case we pick the slowest basic rate
+ *
+ * XXX: Should this check all retry rates?
+ */
+ if (!(rates[0].flags &
+ (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) {
+ u32 basic_rates = vif->bss_conf.basic_rates;
+ s8 baserate = basic_rates ? ffs(basic_rates) - 1 : 0;
+
+ rate = &sband->bitrates[rates[0].idx];
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ /* must be a basic rate */
+ if (!(basic_rates & BIT(i)))
+ continue;
+ /* must not be faster than the data rate */
+ if (sband->bitrates[i].bitrate > rate->bitrate)
+ continue;
+ /* maximum */
+ if (sband->bitrates[baserate].bitrate <
+ sband->bitrates[i].bitrate)
+ baserate = i;
+ }
+
+ info->control.rts_cts_rate_idx = baserate;
+ }
+
+ for (i = 0; i < max_rates; i++) {
+ /*
+ * make sure there's no valid rate following
+ * an invalid one, just in case drivers don't
+ * take the API seriously to stop at -1.
+ */
+ if (inval) {
+ rates[i].idx = -1;
+ continue;
+ }
+ if (rates[i].idx < 0) {
+ inval = true;
+ continue;
+ }
+
+ /*
+ * For now assume MCS is already set up correctly, this
+ * needs to be fixed.
+ */
+ if (rates[i].flags & IEEE80211_TX_RC_MCS) {
+ WARN_ON(rates[i].idx > 76);
+
+ if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) &&
+ info->control.use_cts_prot)
+ rates[i].flags |=
+ IEEE80211_TX_RC_USE_CTS_PROTECT;
+ continue;
+ }
+
+ if (rates[i].flags & IEEE80211_TX_RC_VHT_MCS) {
+ WARN_ON(ieee80211_rate_get_vht_mcs(&rates[i]) > 9);
+ continue;
+ }
+
+ /* set up RTS protection if desired */
+ if (info->control.use_rts) {
+ rates[i].flags |= IEEE80211_TX_RC_USE_RTS_CTS;
+ info->control.use_cts_prot = false;
+ }
+
+ /* RC is busted */
+ if (WARN_ON_ONCE(rates[i].idx >= sband->n_bitrates)) {
+ rates[i].idx = -1;
+ continue;
+ }
+
+ rate = &sband->bitrates[rates[i].idx];
+
+ /* set up short preamble */
+ if (info->control.short_preamble &&
+ rate->flags & IEEE80211_RATE_SHORT_PREAMBLE)
+ rates[i].flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE;
+
+ /* set up G protection */
+ if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) &&
+ info->control.use_cts_prot &&
+ rate->flags & IEEE80211_RATE_ERP_G)
+ rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT;
+ }
+}
+
+
+static void rate_control_fill_sta_table(struct ieee80211_sta *sta,
+ struct ieee80211_tx_info *info,
+ struct ieee80211_tx_rate *rates,
+ int max_rates)
+{
+ struct ieee80211_sta_rates *ratetbl = NULL;
+ int i;
+
+ if (sta && !info->control.skip_table)
+ ratetbl = rcu_dereference(sta->rates);
+
+ /* Fill remaining rate slots with data from the sta rate table. */
+ max_rates = min_t(int, max_rates, IEEE80211_TX_RATE_TABLE_SIZE);
+ for (i = 0; i < max_rates; i++) {
+ if (i < ARRAY_SIZE(info->control.rates) &&
+ info->control.rates[i].idx >= 0 &&
+ info->control.rates[i].count) {
+ if (rates != info->control.rates)
+ rates[i] = info->control.rates[i];
+ } else if (ratetbl) {
+ rates[i].idx = ratetbl->rate[i].idx;
+ rates[i].flags = ratetbl->rate[i].flags;
+ if (info->control.use_rts)
+ rates[i].count = ratetbl->rate[i].count_rts;
+ else if (info->control.use_cts_prot)
+ rates[i].count = ratetbl->rate[i].count_cts;
+ else
+ rates[i].count = ratetbl->rate[i].count;
+ } else {
+ rates[i].idx = -1;
+ rates[i].count = 0;
+ }
+
+ if (rates[i].idx < 0 || !rates[i].count)
+ break;
+ }
+}
+
+static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_tx_info *info,
+ struct ieee80211_tx_rate *rates,
+ int max_rates)
+{
+ enum nl80211_chan_width chan_width;
+ u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN];
+ bool has_mcs_mask;
+ u32 mask;
+ u32 rate_flags;
+ int i;
+
+ /*
+ * Try to enforce the rateidx mask the user wanted. skip this if the
+ * default mask (allow all rates) is used to save some processing for
+ * the common case.
+ */
+ mask = sdata->rc_rateidx_mask[info->band];
+ has_mcs_mask = sdata->rc_has_mcs_mask[info->band];
+ rate_flags =
+ ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+ for (i = 0; i < sband->n_bitrates; i++)
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ mask &= ~BIT(i);
+
+ if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask)
+ return;
+
+ if (has_mcs_mask)
+ memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band],
+ sizeof(mcs_mask));
+ else
+ memset(mcs_mask, 0xff, sizeof(mcs_mask));
+
+ if (sta) {
+ /* Filter out rates that the STA does not support */
+ mask &= sta->supp_rates[info->band];
+ for (i = 0; i < sizeof(mcs_mask); i++)
+ mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i];
+ }
+
+ /*
+ * Make sure the rate index selected for each TX rate is
+ * included in the configured mask and change the rate indexes
+ * if needed.
+ */
+ chan_width = sdata->vif.bss_conf.chandef.width;
+ for (i = 0; i < max_rates; i++) {
+ /* Skip invalid rates */
+ if (rates[i].idx < 0)
+ break;
+
+ rate_idx_match_mask(&rates[i], sband, chan_width, mask,
+ mcs_mask);
+ }
+}
+
+void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct sk_buff *skb,
+ struct ieee80211_tx_rate *dest,
+ int max_rates)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_supported_band *sband;
+
+ rate_control_fill_sta_table(sta, info, dest, max_rates);
+
+ if (!vif)
+ return;
+
+ sdata = vif_to_sdata(vif);
+ sband = sdata->local->hw.wiphy->bands[info->band];
+
+ if (ieee80211_is_data(hdr->frame_control))
+ rate_control_apply_mask(sdata, sta, sband, info, dest, max_rates);
+
+ if (dest[0].idx < 0)
+ __rate_control_send_low(&sdata->local->hw, sband, sta, info,
+ sdata->rc_rateidx_mask[info->band]);
+
+ if (sta)
+ rate_fixup_ratelist(vif, sband, info, dest, max_rates);
+}
+EXPORT_SYMBOL(ieee80211_get_tx_rates);
+
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_tx_rate_control *txrc)
+{
+ struct rate_control_ref *ref = sdata->local->rate_ctrl;
+ void *priv_sta = NULL;
+ struct ieee80211_sta *ista = NULL;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+ int i;
+
+ if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) {
+ ista = &sta->sta;
+ priv_sta = sta->rate_ctrl_priv;
+ }
+
+ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+ info->control.rates[i].idx = -1;
+ info->control.rates[i].flags = 0;
+ info->control.rates[i].count = 0;
+ }
+
+ if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+ return;
+
+ ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
+
+ if (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_RC_TABLE)
+ return;
+
+ ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb,
+ info->control.rates,
+ ARRAY_SIZE(info->control.rates));
+}
+
+int rate_control_set_rates(struct ieee80211_hw *hw,
+ struct ieee80211_sta *pubsta,
+ struct ieee80211_sta_rates *rates)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_sta_rates *old;
+
+ /*
+ * mac80211 guarantees that this function will not be called
+ * concurrently, so the following RCU access is safe, even without
+ * extra locking. This can not be checked easily, so we just set
+ * the condition to true.
+ */
+ old = rcu_dereference_protected(pubsta->rates, true);
+ rcu_assign_pointer(pubsta->rates, rates);
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta);
+
+ return 0;
+}
+EXPORT_SYMBOL(rate_control_set_rates);
+
+int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
+ const char *name)
+{
+ struct rate_control_ref *ref;
+
+ ASSERT_RTNL();
+
+ if (local->open_count)
+ return -EBUSY;
+
+ if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+ if (WARN_ON(!local->ops->set_rts_threshold))
+ return -EINVAL;
+ return 0;
+ }
+
+ ref = rate_control_alloc(name, local);
+ if (!ref) {
+ wiphy_warn(local->hw.wiphy,
+ "Failed to select rate control algorithm\n");
+ return -ENOENT;
+ }
+
+ WARN_ON(local->rate_ctrl);
+ local->rate_ctrl = ref;
+
+ wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n",
+ ref->ops->name);
+
+ return 0;
+}
+
+void rate_control_deinitialize(struct ieee80211_local *local)
+{
+ struct rate_control_ref *ref;
+
+ ref = local->rate_ctrl;
+
+ if (!ref)
+ return;
+
+ local->rate_ctrl = NULL;
+ rate_control_free(ref);
+}
+
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
new file mode 100644
index 000000000..38652f09f
--- /dev/null
+++ b/net/mac80211/rate.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_RATE_H
+#define IEEE80211_RATE_H
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "driver-ops.h"
+
+struct rate_control_ref {
+ struct ieee80211_local *local;
+ const struct rate_control_ops *ops;
+ void *priv;
+};
+
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct ieee80211_tx_rate_control *txrc);
+
+static inline void rate_control_tx_status(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta,
+ struct sk_buff *skb)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+ if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
+ return;
+
+ if (ref->ops->tx_status)
+ ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb);
+ else
+ ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info);
+}
+
+static inline void
+rate_control_tx_status_noskb(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta,
+ struct ieee80211_tx_info *info)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+
+ if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
+ return;
+
+ if (WARN_ON_ONCE(!ref->ops->tx_status_noskb))
+ return;
+
+ ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info);
+}
+
+static inline void rate_control_rate_init(struct sta_info *sta)
+{
+ struct ieee80211_local *local = sta->sdata->local;
+ struct rate_control_ref *ref = sta->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ ieee80211_sta_set_rx_nss(sta);
+
+ if (!ref)
+ return;
+
+ rcu_read_lock();
+
+ chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band];
+
+ ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista,
+ priv_sta);
+ rcu_read_unlock();
+ set_sta_flag(sta, WLAN_STA_RATE_CONTROL);
+}
+
+static inline void rate_control_rate_update(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta, u32 changed)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ if (ref && ref->ops->rate_update) {
+ rcu_read_lock();
+
+ chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def,
+ ista, priv_sta, changed);
+ rcu_read_unlock();
+ }
+ drv_sta_rc_update(local, sta->sdata, &sta->sta, changed);
+}
+
+static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
+ struct ieee80211_sta *sta,
+ gfp_t gfp)
+{
+ return ref->ops->alloc_sta(ref->priv, sta, gfp);
+}
+
+static inline void rate_control_free_sta(struct sta_info *sta)
+{
+ struct rate_control_ref *ref = sta->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+
+ ref->ops->free_sta(ref->priv, ista, priv_sta);
+}
+
+static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct rate_control_ref *ref = sta->rate_ctrl;
+ if (ref && sta->debugfs.dir && ref->ops->add_sta_debugfs)
+ ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv,
+ sta->debugfs.dir);
+#endif
+}
+
+static inline void rate_control_remove_sta_debugfs(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct rate_control_ref *ref = sta->rate_ctrl;
+ if (ref && ref->ops->remove_sta_debugfs)
+ ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv);
+#endif
+}
+
+/* Get a reference to the rate control algorithm. If `name' is NULL, get the
+ * first available algorithm. */
+int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
+ const char *name);
+void rate_control_deinitialize(struct ieee80211_local *local);
+
+
+/* Rate control algorithms */
+#ifdef CONFIG_MAC80211_RC_MINSTREL
+int rc80211_minstrel_init(void);
+void rc80211_minstrel_exit(void);
+#else
+static inline int rc80211_minstrel_init(void)
+{
+ return 0;
+}
+static inline void rc80211_minstrel_exit(void)
+{
+}
+#endif
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_HT
+int rc80211_minstrel_ht_init(void);
+void rc80211_minstrel_ht_exit(void);
+#else
+static inline int rc80211_minstrel_ht_init(void)
+{
+ return 0;
+}
+static inline void rc80211_minstrel_ht_exit(void)
+{
+}
+#endif
+
+
+#endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
new file mode 100644
index 000000000..247552a7f
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.c
@@ -0,0 +1,746 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ * Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ * Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ * Copyright (c) 2005 John Bicket
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ * of any contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+
+#define SAMPLE_TBL(_mi, _idx, _col) \
+ _mi->sample_table[(_idx * SAMPLE_COLUMNS) + _col]
+
+/* convert mac80211 rate index to local array index */
+static inline int
+rix_to_ndx(struct minstrel_sta_info *mi, int rix)
+{
+ int i = rix;
+ for (i = rix; i >= 0; i--)
+ if (mi->r[i].rix == rix)
+ break;
+ return i;
+}
+
+/* return current EMWA throughput */
+int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma)
+{
+ int usecs;
+
+ usecs = mr->perfect_tx_time;
+ if (!usecs)
+ usecs = 1000000;
+
+ /* reset thr. below 10% success */
+ if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100))
+ return 0;
+
+ if (prob_ewma > MINSTREL_FRAC(90, 100))
+ return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs));
+ else
+ return MINSTREL_TRUNC(100000 * (prob_ewma / usecs));
+}
+
+/* find & sort topmost throughput rates */
+static inline void
+minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list)
+{
+ int j = MAX_THR_RATES;
+ struct minstrel_rate_stats *tmp_mrs = &mi->r[j - 1].stats;
+ struct minstrel_rate_stats *cur_mrs = &mi->r[i].stats;
+
+ while (j > 0 && (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) >
+ minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma))) {
+ j--;
+ tmp_mrs = &mi->r[tp_list[j - 1]].stats;
+ }
+
+ if (j < MAX_THR_RATES - 1)
+ memmove(&tp_list[j + 1], &tp_list[j], MAX_THR_RATES - (j + 1));
+ if (j < MAX_THR_RATES)
+ tp_list[j] = i;
+}
+
+static void
+minstrel_set_rate(struct minstrel_sta_info *mi, struct ieee80211_sta_rates *ratetbl,
+ int offset, int idx)
+{
+ struct minstrel_rate *r = &mi->r[idx];
+
+ ratetbl->rate[offset].idx = r->rix;
+ ratetbl->rate[offset].count = r->adjusted_retry_count;
+ ratetbl->rate[offset].count_cts = r->retry_count_cts;
+ ratetbl->rate[offset].count_rts = r->stats.retry_count_rtscts;
+}
+
+static void
+minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
+{
+ struct ieee80211_sta_rates *ratetbl;
+ int i = 0;
+
+ ratetbl = kzalloc(sizeof(*ratetbl), GFP_ATOMIC);
+ if (!ratetbl)
+ return;
+
+ /* Start with max_tp_rate */
+ minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[0]);
+
+ if (mp->hw->max_rates >= 3) {
+ /* At least 3 tx rates supported, use max_tp_rate2 next */
+ minstrel_set_rate(mi, ratetbl, i++, mi->max_tp_rate[1]);
+ }
+
+ if (mp->hw->max_rates >= 2) {
+ /* At least 2 tx rates supported, use max_prob_rate next */
+ minstrel_set_rate(mi, ratetbl, i++, mi->max_prob_rate);
+ }
+
+ /* Use lowest rate last */
+ ratetbl->rate[i].idx = mi->lowest_rix;
+ ratetbl->rate[i].count = mp->max_retry;
+ ratetbl->rate[i].count_cts = mp->max_retry;
+ ratetbl->rate[i].count_rts = mp->max_retry;
+
+ rate_control_set_rates(mp->hw, mi->sta, ratetbl);
+}
+
+/*
+* Recalculate statistics and counters of a given rate
+*/
+void
+minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
+{
+ if (unlikely(mrs->attempts > 0)) {
+ mrs->sample_skipped = 0;
+ mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
+ if (unlikely(!mrs->att_hist)) {
+ mrs->prob_ewma = mrs->cur_prob;
+ } else {
+ /* update exponential weighted moving variance */
+ mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd,
+ mrs->cur_prob,
+ mrs->prob_ewma,
+ EWMA_LEVEL);
+
+ /*update exponential weighted moving avarage */
+ mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
+ mrs->cur_prob,
+ EWMA_LEVEL);
+ }
+ mrs->att_hist += mrs->attempts;
+ mrs->succ_hist += mrs->success;
+ } else {
+ mrs->sample_skipped++;
+ }
+
+ mrs->last_success = mrs->success;
+ mrs->last_attempts = mrs->attempts;
+ mrs->success = 0;
+ mrs->attempts = 0;
+}
+
+static void
+minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
+{
+ u8 tmp_tp_rate[MAX_THR_RATES];
+ u8 tmp_prob_rate = 0;
+ int i, tmp_cur_tp, tmp_prob_tp;
+
+ for (i = 0; i < MAX_THR_RATES; i++)
+ tmp_tp_rate[i] = 0;
+
+ for (i = 0; i < mi->n_rates; i++) {
+ struct minstrel_rate *mr = &mi->r[i];
+ struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+ struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats;
+
+ /* Update statistics of success probability per rate */
+ minstrel_calc_rate_stats(mrs);
+
+ /* Sample less often below the 10% chance of success.
+ * Sample less often above the 95% chance of success. */
+ if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) ||
+ mrs->prob_ewma < MINSTREL_FRAC(10, 100)) {
+ mr->adjusted_retry_count = mrs->retry_count >> 1;
+ if (mr->adjusted_retry_count > 2)
+ mr->adjusted_retry_count = 2;
+ mr->sample_limit = 4;
+ } else {
+ mr->sample_limit = -1;
+ mr->adjusted_retry_count = mrs->retry_count;
+ }
+ if (!mr->adjusted_retry_count)
+ mr->adjusted_retry_count = 2;
+
+ minstrel_sort_best_tp_rates(mi, i, tmp_tp_rate);
+
+ /* To determine the most robust rate (max_prob_rate) used at
+ * 3rd mmr stage we distinct between two cases:
+ * (1) if any success probabilitiy >= 95%, out of those rates
+ * choose the maximum throughput rate as max_prob_rate
+ * (2) if all success probabilities < 95%, the rate with
+ * highest success probability is chosen as max_prob_rate */
+ if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) {
+ tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma);
+ tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate],
+ tmp_mrs->prob_ewma);
+ if (tmp_cur_tp >= tmp_prob_tp)
+ tmp_prob_rate = i;
+ } else {
+ if (mrs->prob_ewma >= tmp_mrs->prob_ewma)
+ tmp_prob_rate = i;
+ }
+ }
+
+ /* Assign the new rate set */
+ memcpy(mi->max_tp_rate, tmp_tp_rate, sizeof(mi->max_tp_rate));
+ mi->max_prob_rate = tmp_prob_rate;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ /* use fixed index if set */
+ if (mp->fixed_rate_idx != -1) {
+ mi->max_tp_rate[0] = mp->fixed_rate_idx;
+ mi->max_tp_rate[1] = mp->fixed_rate_idx;
+ mi->max_prob_rate = mp->fixed_rate_idx;
+ }
+#endif
+
+ /* Reset update timer */
+ mi->last_stats_update = jiffies;
+
+ minstrel_update_rates(mp, mi);
+}
+
+static void
+minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
+ struct ieee80211_sta *sta, void *priv_sta,
+ struct ieee80211_tx_info *info)
+{
+ struct minstrel_priv *mp = priv;
+ struct minstrel_sta_info *mi = priv_sta;
+ struct ieee80211_tx_rate *ar = info->status.rates;
+ int i, ndx;
+ int success;
+
+ success = !!(info->flags & IEEE80211_TX_STAT_ACK);
+
+ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+ if (ar[i].idx < 0)
+ break;
+
+ ndx = rix_to_ndx(mi, ar[i].idx);
+ if (ndx < 0)
+ continue;
+
+ mi->r[ndx].stats.attempts += ar[i].count;
+
+ if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0))
+ mi->r[ndx].stats.success += success;
+ }
+
+ if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0))
+ mi->sample_packets++;
+
+ if (mi->sample_deferred > 0)
+ mi->sample_deferred--;
+
+ if (time_after(jiffies, mi->last_stats_update +
+ (mp->update_interval * HZ) / 1000))
+ minstrel_update_stats(mp, mi);
+}
+
+
+static inline unsigned int
+minstrel_get_retry_count(struct minstrel_rate *mr,
+ struct ieee80211_tx_info *info)
+{
+ u8 retry = mr->adjusted_retry_count;
+
+ if (info->control.use_rts)
+ retry = max_t(u8, 2, min(mr->stats.retry_count_rtscts, retry));
+ else if (info->control.use_cts_prot)
+ retry = max_t(u8, 2, min(mr->retry_count_cts, retry));
+ return retry;
+}
+
+
+static int
+minstrel_get_next_sample(struct minstrel_sta_info *mi)
+{
+ unsigned int sample_ndx;
+ sample_ndx = SAMPLE_TBL(mi, mi->sample_row, mi->sample_column);
+ mi->sample_row++;
+ if ((int) mi->sample_row >= mi->n_rates) {
+ mi->sample_row = 0;
+ mi->sample_column++;
+ if (mi->sample_column >= SAMPLE_COLUMNS)
+ mi->sample_column = 0;
+ }
+ return sample_ndx;
+}
+
+static void
+minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
+ void *priv_sta, struct ieee80211_tx_rate_control *txrc)
+{
+ struct sk_buff *skb = txrc->skb;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct minstrel_sta_info *mi = priv_sta;
+ struct minstrel_priv *mp = priv;
+ struct ieee80211_tx_rate *rate = &info->control.rates[0];
+ struct minstrel_rate *msr, *mr;
+ unsigned int ndx;
+ bool mrr_capable;
+ bool prev_sample;
+ int delta;
+ int sampling_ratio;
+
+ /* management/no-ack frames do not use rate control */
+ if (rate_control_send_low(sta, priv_sta, txrc))
+ return;
+
+ /* check multi-rate-retry capabilities & adjust lookaround_rate */
+ mrr_capable = mp->has_mrr &&
+ !txrc->rts &&
+ !txrc->bss_conf->use_cts_prot;
+ if (mrr_capable)
+ sampling_ratio = mp->lookaround_rate_mrr;
+ else
+ sampling_ratio = mp->lookaround_rate;
+
+ /* increase sum packet counter */
+ mi->total_packets++;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ if (mp->fixed_rate_idx != -1)
+ return;
+#endif
+
+ delta = (mi->total_packets * sampling_ratio / 100) -
+ (mi->sample_packets + mi->sample_deferred / 2);
+
+ /* delta < 0: no sampling required */
+ prev_sample = mi->prev_sample;
+ mi->prev_sample = false;
+ if (delta < 0 || (!mrr_capable && prev_sample))
+ return;
+
+ if (mi->total_packets >= 10000) {
+ mi->sample_deferred = 0;
+ mi->sample_packets = 0;
+ mi->total_packets = 0;
+ } else if (delta > mi->n_rates * 2) {
+ /* With multi-rate retry, not every planned sample
+ * attempt actually gets used, due to the way the retry
+ * chain is set up - [max_tp,sample,prob,lowest] for
+ * sample_rate < max_tp.
+ *
+ * If there's too much sampling backlog and the link
+ * starts getting worse, minstrel would start bursting
+ * out lots of sampling frames, which would result
+ * in a large throughput loss. */
+ mi->sample_packets += (delta - mi->n_rates * 2);
+ }
+
+ /* get next random rate sample */
+ ndx = minstrel_get_next_sample(mi);
+ msr = &mi->r[ndx];
+ mr = &mi->r[mi->max_tp_rate[0]];
+
+ /* Decide if direct ( 1st mrr stage) or indirect (2nd mrr stage)
+ * rate sampling method should be used.
+ * Respect such rates that are not sampled for 20 interations.
+ */
+ if (mrr_capable &&
+ msr->perfect_tx_time > mr->perfect_tx_time &&
+ msr->stats.sample_skipped < 20) {
+ /* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark
+ * packets that have the sampling rate deferred to the
+ * second MRR stage. Increase the sample counter only
+ * if the deferred sample rate was actually used.
+ * Use the sample_deferred counter to make sure that
+ * the sampling is not done in large bursts */
+ info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+ rate++;
+ mi->sample_deferred++;
+ } else {
+ if (!msr->sample_limit)
+ return;
+
+ mi->sample_packets++;
+ if (msr->sample_limit > 0)
+ msr->sample_limit--;
+ }
+
+ /* If we're not using MRR and the sampling rate already
+ * has a probability of >95%, we shouldn't be attempting
+ * to use it, as this only wastes precious airtime */
+ if (!mrr_capable &&
+ (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100)))
+ return;
+
+ mi->prev_sample = true;
+
+ rate->idx = mi->r[ndx].rix;
+ rate->count = minstrel_get_retry_count(&mi->r[ndx], info);
+}
+
+
+static void
+calc_rate_durations(enum ieee80211_band band,
+ struct minstrel_rate *d,
+ struct ieee80211_rate *rate,
+ struct cfg80211_chan_def *chandef)
+{
+ int erp = !!(rate->flags & IEEE80211_RATE_ERP_G);
+ int shift = ieee80211_chandef_get_shift(chandef);
+
+ d->perfect_tx_time = ieee80211_frame_duration(band, 1200,
+ DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1,
+ shift);
+ d->ack_time = ieee80211_frame_duration(band, 10,
+ DIV_ROUND_UP(rate->bitrate, 1 << shift), erp, 1,
+ shift);
+}
+
+static void
+init_sample_table(struct minstrel_sta_info *mi)
+{
+ unsigned int i, col, new_idx;
+ u8 rnd[8];
+
+ mi->sample_column = 0;
+ mi->sample_row = 0;
+ memset(mi->sample_table, 0xff, SAMPLE_COLUMNS * mi->n_rates);
+
+ for (col = 0; col < SAMPLE_COLUMNS; col++) {
+ prandom_bytes(rnd, sizeof(rnd));
+ for (i = 0; i < mi->n_rates; i++) {
+ new_idx = (i + rnd[i & 7]) % mi->n_rates;
+ while (SAMPLE_TBL(mi, new_idx, col) != 0xff)
+ new_idx = (new_idx + 1) % mi->n_rates;
+
+ SAMPLE_TBL(mi, new_idx, col) = i;
+ }
+ }
+}
+
+static void
+minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
+ struct cfg80211_chan_def *chandef,
+ struct ieee80211_sta *sta, void *priv_sta)
+{
+ struct minstrel_sta_info *mi = priv_sta;
+ struct minstrel_priv *mp = priv;
+ struct ieee80211_rate *ctl_rate;
+ unsigned int i, n = 0;
+ unsigned int t_slot = 9; /* FIXME: get real slot time */
+ u32 rate_flags;
+
+ mi->sta = sta;
+ mi->lowest_rix = rate_lowest_index(sband, sta);
+ ctl_rate = &sband->bitrates[mi->lowest_rix];
+ mi->sp_ack_dur = ieee80211_frame_duration(sband->band, 10,
+ ctl_rate->bitrate,
+ !!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1,
+ ieee80211_chandef_get_shift(chandef));
+
+ rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
+ memset(mi->max_tp_rate, 0, sizeof(mi->max_tp_rate));
+ mi->max_prob_rate = 0;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ struct minstrel_rate *mr = &mi->r[n];
+ struct minstrel_rate_stats *mrs = &mi->r[n].stats;
+ unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0;
+ unsigned int tx_time_single;
+ unsigned int cw = mp->cw_min;
+ int shift;
+
+ if (!rate_supported(sta, sband->band, i))
+ continue;
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ n++;
+ memset(mr, 0, sizeof(*mr));
+ memset(mrs, 0, sizeof(*mrs));
+
+ mr->rix = i;
+ shift = ieee80211_chandef_get_shift(chandef);
+ mr->bitrate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ (1 << shift) * 5);
+ calc_rate_durations(sband->band, mr, &sband->bitrates[i],
+ chandef);
+
+ /* calculate maximum number of retransmissions before
+ * fallback (based on maximum segment size) */
+ mr->sample_limit = -1;
+ mrs->retry_count = 1;
+ mr->retry_count_cts = 1;
+ mrs->retry_count_rtscts = 1;
+ tx_time = mr->perfect_tx_time + mi->sp_ack_dur;
+ do {
+ /* add one retransmission */
+ tx_time_single = mr->ack_time + mr->perfect_tx_time;
+
+ /* contention window */
+ tx_time_single += (t_slot * cw) >> 1;
+ cw = min((cw << 1) | 1, mp->cw_max);
+
+ tx_time += tx_time_single;
+ tx_time_cts += tx_time_single + mi->sp_ack_dur;
+ tx_time_rtscts += tx_time_single + 2 * mi->sp_ack_dur;
+ if ((tx_time_cts < mp->segment_size) &&
+ (mr->retry_count_cts < mp->max_retry))
+ mr->retry_count_cts++;
+ if ((tx_time_rtscts < mp->segment_size) &&
+ (mrs->retry_count_rtscts < mp->max_retry))
+ mrs->retry_count_rtscts++;
+ } while ((tx_time < mp->segment_size) &&
+ (++mr->stats.retry_count < mp->max_retry));
+ mr->adjusted_retry_count = mrs->retry_count;
+ if (!(sband->bitrates[i].flags & IEEE80211_RATE_ERP_G))
+ mr->retry_count_cts = mrs->retry_count;
+ }
+
+ for (i = n; i < sband->n_bitrates; i++) {
+ struct minstrel_rate *mr = &mi->r[i];
+ mr->rix = -1;
+ }
+
+ mi->n_rates = n;
+ mi->last_stats_update = jiffies;
+
+ init_sample_table(mi);
+ minstrel_update_rates(mp, mi);
+}
+
+static void *
+minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+ struct ieee80211_supported_band *sband;
+ struct minstrel_sta_info *mi;
+ struct minstrel_priv *mp = priv;
+ struct ieee80211_hw *hw = mp->hw;
+ int max_rates = 0;
+ int i;
+
+ mi = kzalloc(sizeof(struct minstrel_sta_info), gfp);
+ if (!mi)
+ return NULL;
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ sband = hw->wiphy->bands[i];
+ if (sband && sband->n_bitrates > max_rates)
+ max_rates = sband->n_bitrates;
+ }
+
+ mi->r = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+ if (!mi->r)
+ goto error;
+
+ mi->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+ if (!mi->sample_table)
+ goto error1;
+
+ mi->last_stats_update = jiffies;
+ return mi;
+
+error1:
+ kfree(mi->r);
+error:
+ kfree(mi);
+ return NULL;
+}
+
+static void
+minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+ struct minstrel_sta_info *mi = priv_sta;
+
+ kfree(mi->sample_table);
+ kfree(mi->r);
+ kfree(mi);
+}
+
+static void
+minstrel_init_cck_rates(struct minstrel_priv *mp)
+{
+ static const int bitrates[4] = { 10, 20, 55, 110 };
+ struct ieee80211_supported_band *sband;
+ u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
+ int i, j;
+
+ sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ];
+ if (!sband)
+ return;
+
+ for (i = 0, j = 0; i < sband->n_bitrates; i++) {
+ struct ieee80211_rate *rate = &sband->bitrates[i];
+
+ if (rate->flags & IEEE80211_RATE_ERP_G)
+ continue;
+
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
+ if (rate->bitrate != bitrates[j])
+ continue;
+
+ mp->cck_rates[j] = i;
+ break;
+ }
+ }
+}
+
+static void *
+minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+ struct minstrel_priv *mp;
+
+ mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
+ if (!mp)
+ return NULL;
+
+ /* contention window settings
+ * Just an approximation. Using the per-queue values would complicate
+ * the calculations and is probably unnecessary */
+ mp->cw_min = 15;
+ mp->cw_max = 1023;
+
+ /* number of packets (in %) to use for sampling other rates
+ * sample less often for non-mrr packets, because the overhead
+ * is much higher than with mrr */
+ mp->lookaround_rate = 5;
+ mp->lookaround_rate_mrr = 10;
+
+ /* maximum time that the hw is allowed to stay in one MRR segment */
+ mp->segment_size = 6000;
+
+ if (hw->max_rate_tries > 0)
+ mp->max_retry = hw->max_rate_tries;
+ else
+ /* safe default, does not necessarily have to match hw properties */
+ mp->max_retry = 7;
+
+ if (hw->max_rates >= 4)
+ mp->has_mrr = true;
+
+ mp->hw = hw;
+ mp->update_interval = 100;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ mp->fixed_rate_idx = (u32) -1;
+ mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx",
+ S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx);
+#endif
+
+ minstrel_init_cck_rates(mp);
+
+ return mp;
+}
+
+static void
+minstrel_free(void *priv)
+{
+#ifdef CONFIG_MAC80211_DEBUGFS
+ debugfs_remove(((struct minstrel_priv *)priv)->dbg_fixed_rate);
+#endif
+ kfree(priv);
+}
+
+static u32 minstrel_get_expected_throughput(void *priv_sta)
+{
+ struct minstrel_sta_info *mi = priv_sta;
+ struct minstrel_rate_stats *tmp_mrs;
+ int idx = mi->max_tp_rate[0];
+ int tmp_cur_tp;
+
+ /* convert pkt per sec in kbps (1200 is the average pkt size used for
+ * computing cur_tp
+ */
+ tmp_mrs = &mi->r[idx].stats;
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+ tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
+
+ return tmp_cur_tp;
+}
+
+const struct rate_control_ops mac80211_minstrel = {
+ .name = "minstrel",
+ .tx_status_noskb = minstrel_tx_status,
+ .get_rate = minstrel_get_rate,
+ .rate_init = minstrel_rate_init,
+ .alloc = minstrel_alloc,
+ .free = minstrel_free,
+ .alloc_sta = minstrel_alloc_sta,
+ .free_sta = minstrel_free_sta,
+#ifdef CONFIG_MAC80211_DEBUGFS
+ .add_sta_debugfs = minstrel_add_sta_debugfs,
+ .remove_sta_debugfs = minstrel_remove_sta_debugfs,
+#endif
+ .get_expected_throughput = minstrel_get_expected_throughput,
+};
+
+int __init
+rc80211_minstrel_init(void)
+{
+ return ieee80211_rate_control_register(&mac80211_minstrel);
+}
+
+void
+rc80211_minstrel_exit(void)
+{
+ ieee80211_rate_control_unregister(&mac80211_minstrel);
+}
+
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
new file mode 100644
index 000000000..c230bbe93
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_H
+#define __RC_MINSTREL_H
+
+#define EWMA_LEVEL 96 /* ewma weighting factor [/EWMA_DIV] */
+#define EWMA_DIV 128
+#define SAMPLE_COLUMNS 10 /* number of columns in sample table */
+
+/* scaled fraction values */
+#define MINSTREL_SCALE 16
+#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
+#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
+
+/* number of highest throughput rates to consider*/
+#define MAX_THR_RATES 4
+
+/*
+ * Perform EWMA (Exponentially Weighted Moving Average) calculation
+ */
+static inline int
+minstrel_ewma(int old, int new, int weight)
+{
+ int diff, incr;
+
+ diff = new - old;
+ incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
+
+ return old + incr;
+}
+
+/*
+ * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation
+ */
+static inline int
+minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight)
+{
+ int diff, incr, tmp_var;
+
+ /* calculate exponential weighted moving variance */
+ diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000);
+ incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
+ tmp_var = old_ewmsd * old_ewmsd;
+ tmp_var = weight * (tmp_var + diff * incr / 1000000) / EWMA_DIV;
+
+ /* return standard deviation */
+ return (u16) int_sqrt(tmp_var);
+}
+
+struct minstrel_rate_stats {
+ /* current / last sampling period attempts/success counters */
+ u16 attempts, last_attempts;
+ u16 success, last_success;
+
+ /* total attempts/success counters */
+ u64 att_hist, succ_hist;
+
+ /* statistis of packet delivery probability
+ * cur_prob - current prob within last update intervall
+ * prob_ewma - exponential weighted moving average of prob
+ * prob_ewmsd - exp. weighted moving standard deviation of prob */
+ unsigned int cur_prob;
+ unsigned int prob_ewma;
+ u16 prob_ewmsd;
+
+ /* maximum retry counts */
+ u8 retry_count;
+ u8 retry_count_rtscts;
+
+ u8 sample_skipped;
+ bool retry_updated;
+};
+
+struct minstrel_rate {
+ int bitrate;
+
+ s8 rix;
+ u8 retry_count_cts;
+ u8 adjusted_retry_count;
+
+ unsigned int perfect_tx_time;
+ unsigned int ack_time;
+
+ int sample_limit;
+
+ struct minstrel_rate_stats stats;
+};
+
+struct minstrel_sta_info {
+ struct ieee80211_sta *sta;
+
+ unsigned long last_stats_update;
+ unsigned int sp_ack_dur;
+ unsigned int rate_avg;
+
+ unsigned int lowest_rix;
+
+ u8 max_tp_rate[MAX_THR_RATES];
+ u8 max_prob_rate;
+ unsigned int total_packets;
+ unsigned int sample_packets;
+ int sample_deferred;
+
+ unsigned int sample_row;
+ unsigned int sample_column;
+
+ int n_rates;
+ struct minstrel_rate *r;
+ bool prev_sample;
+
+ /* sampling table */
+ u8 *sample_table;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct dentry *dbg_stats;
+ struct dentry *dbg_stats_csv;
+#endif
+};
+
+struct minstrel_priv {
+ struct ieee80211_hw *hw;
+ bool has_mrr;
+ unsigned int cw_min;
+ unsigned int cw_max;
+ unsigned int max_retry;
+ unsigned int segment_size;
+ unsigned int update_interval;
+ unsigned int lookaround_rate;
+ unsigned int lookaround_rate_mrr;
+
+ u8 cck_rates[4];
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ /*
+ * enable fixed rate processing per RC
+ * - write static index to debugfs:ieee80211/phyX/rc/fixed_rate_idx
+ * - write -1 to enable RC processing again
+ * - setting will be applied on next update
+ */
+ u32 fixed_rate_idx;
+ struct dentry *dbg_fixed_rate;
+#endif
+};
+
+struct minstrel_debugfs_info {
+ size_t len;
+ char buf[];
+};
+
+extern const struct rate_control_ops mac80211_minstrel;
+void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
+
+/* Recalculate success probabilities and counters for a given rate using EWMA */
+void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs);
+int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma);
+
+/* debugfs */
+int minstrel_stats_open(struct inode *inode, struct file *file);
+int minstrel_stats_csv_open(struct inode *inode, struct file *file);
+ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos);
+int minstrel_stats_release(struct inode *inode, struct file *file);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
new file mode 100644
index 000000000..1db5f7c33
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ * Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ * Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ * Copyright (c) 2005 John Bicket
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ * redistribution must be conditioned upon including a substantially
+ * similar Disclaimer requirement for further binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ * of any contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+
+ssize_t
+minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
+{
+ struct minstrel_debugfs_info *ms;
+
+ ms = file->private_data;
+ return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len);
+}
+
+int
+minstrel_stats_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+int
+minstrel_stats_open(struct inode *inode, struct file *file)
+{
+ struct minstrel_sta_info *mi = inode->i_private;
+ struct minstrel_debugfs_info *ms;
+ unsigned int i, tp_max, tp_avg, prob, eprob;
+ char *p;
+
+ ms = kmalloc(2048, GFP_KERNEL);
+ if (!ms)
+ return -ENOMEM;
+
+ file->private_data = ms;
+ p = ms->buf;
+ p += sprintf(p, "\n");
+ p += sprintf(p, "best __________rate_________ ______"
+ "statistics______ ________last_______ "
+ "______sum-of________\n");
+ p += sprintf(p, "rate [name idx airtime max_tp] [ ø(tp) ø(prob) "
+ "sd(prob)] [prob.|retry|suc|att] "
+ "[#success | #attempts]\n");
+
+ for (i = 0; i < mi->n_rates; i++) {
+ struct minstrel_rate *mr = &mi->r[i];
+ struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+
+ *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
+ *(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
+ *(p++) = (i == mi->max_tp_rate[2]) ? 'C' : ' ';
+ *(p++) = (i == mi->max_tp_rate[3]) ? 'D' : ' ';
+ *(p++) = (i == mi->max_prob_rate) ? 'P' : ' ';
+
+ p += sprintf(p, " %3u%s ", mr->bitrate / 2,
+ (mr->bitrate & 1 ? ".5" : " "));
+ p += sprintf(p, "%3u ", i);
+ p += sprintf(p, "%6u ", mr->perfect_tx_time);
+
+ tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
+ tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
+ prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
+ eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ " %3u.%1u %3u %3u %-3u "
+ "%9llu %-9llu\n",
+ tp_max / 10, tp_max % 10,
+ tp_avg / 10, tp_avg % 10,
+ eprob / 10, eprob % 10,
+ mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+ prob / 10, prob % 10,
+ mrs->retry_count,
+ mrs->last_success,
+ mrs->last_attempts,
+ (unsigned long long)mrs->succ_hist,
+ (unsigned long long)mrs->att_hist);
+ }
+ p += sprintf(p, "\nTotal packet count:: ideal %d "
+ "lookaround %d\n\n",
+ mi->total_packets - mi->sample_packets,
+ mi->sample_packets);
+ ms->len = p - ms->buf;
+
+ WARN_ON(ms->len + sizeof(*ms) > 2048);
+
+ return 0;
+}
+
+static const struct file_operations minstrel_stat_fops = {
+ .owner = THIS_MODULE,
+ .open = minstrel_stats_open,
+ .read = minstrel_stats_read,
+ .release = minstrel_stats_release,
+ .llseek = default_llseek,
+};
+
+int
+minstrel_stats_csv_open(struct inode *inode, struct file *file)
+{
+ struct minstrel_sta_info *mi = inode->i_private;
+ struct minstrel_debugfs_info *ms;
+ unsigned int i, tp_max, tp_avg, prob, eprob;
+ char *p;
+
+ ms = kmalloc(2048, GFP_KERNEL);
+ if (!ms)
+ return -ENOMEM;
+
+ file->private_data = ms;
+ p = ms->buf;
+
+ for (i = 0; i < mi->n_rates; i++) {
+ struct minstrel_rate *mr = &mi->r[i];
+ struct minstrel_rate_stats *mrs = &mi->r[i].stats;
+
+ p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
+ p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
+ p += sprintf(p, "%s" ,((i == mi->max_tp_rate[2]) ? "C" : ""));
+ p += sprintf(p, "%s" ,((i == mi->max_tp_rate[3]) ? "D" : ""));
+ p += sprintf(p, "%s" ,((i == mi->max_prob_rate) ? "P" : ""));
+
+ p += sprintf(p, ",%u%s", mr->bitrate / 2,
+ (mr->bitrate & 1 ? ".5," : ","));
+ p += sprintf(p, "%u,", i);
+ p += sprintf(p, "%u,",mr->perfect_tx_time);
+
+ tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
+ tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
+ prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
+ eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
+ "%llu,%llu,%d,%d\n",
+ tp_max / 10, tp_max % 10,
+ tp_avg / 10, tp_avg % 10,
+ eprob / 10, eprob % 10,
+ mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+ prob / 10, prob % 10,
+ mrs->retry_count,
+ mrs->last_success,
+ mrs->last_attempts,
+ (unsigned long long)mrs->succ_hist,
+ (unsigned long long)mrs->att_hist,
+ mi->total_packets - mi->sample_packets,
+ mi->sample_packets);
+
+ }
+ ms->len = p - ms->buf;
+
+ WARN_ON(ms->len + sizeof(*ms) > 2048);
+
+ return 0;
+}
+
+static const struct file_operations minstrel_stat_csv_fops = {
+ .owner = THIS_MODULE,
+ .open = minstrel_stats_csv_open,
+ .read = minstrel_stats_read,
+ .release = minstrel_stats_release,
+ .llseek = default_llseek,
+};
+
+void
+minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+ struct minstrel_sta_info *mi = priv_sta;
+
+ mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi,
+ &minstrel_stat_fops);
+
+ mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, dir,
+ mi, &minstrel_stat_csv_fops);
+}
+
+void
+minstrel_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+ struct minstrel_sta_info *mi = priv_sta;
+
+ debugfs_remove(mi->dbg_stats);
+
+ debugfs_remove(mi->dbg_stats_csv);
+}
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
new file mode 100644
index 000000000..7430a1df2
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -0,0 +1,1383 @@
+/*
+ * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+#define AVG_AMPDU_SIZE 16
+#define AVG_PKT_SIZE 1200
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3)
+
+/* Number of symbols for a packet with (bps) bits per symbol */
+#define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps))
+
+/* Transmission time (nanoseconds) for a packet containing (syms) symbols */
+#define MCS_SYMBOL_TIME(sgi, syms) \
+ (sgi ? \
+ ((syms) * 18000 + 4000) / 5 : /* syms * 3.6 us */ \
+ ((syms) * 1000) << 2 /* syms * 4 us */ \
+ )
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) \
+ (MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE)
+
+#define BW_20 0
+#define BW_40 1
+#define BW_80 2
+
+/*
+ * Define group sort order: HT40 -> SGI -> #streams
+ */
+#define GROUP_IDX(_streams, _sgi, _ht40) \
+ MINSTREL_HT_GROUP_0 + \
+ MINSTREL_MAX_STREAMS * 2 * _ht40 + \
+ MINSTREL_MAX_STREAMS * _sgi + \
+ _streams - 1
+
+/* MCS rate information for an MCS group */
+#define MCS_GROUP(_streams, _sgi, _ht40) \
+ [GROUP_IDX(_streams, _sgi, _ht40)] = { \
+ .streams = _streams, \
+ .flags = \
+ IEEE80211_TX_RC_MCS | \
+ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
+ (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
+ .duration = { \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234), \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) \
+ } \
+}
+
+#define VHT_GROUP_IDX(_streams, _sgi, _bw) \
+ (MINSTREL_VHT_GROUP_0 + \
+ MINSTREL_MAX_STREAMS * 2 * (_bw) + \
+ MINSTREL_MAX_STREAMS * (_sgi) + \
+ (_streams) - 1)
+
+#define BW2VBPS(_bw, r3, r2, r1) \
+ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
+
+#define VHT_GROUP(_streams, _sgi, _bw) \
+ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
+ .streams = _streams, \
+ .flags = \
+ IEEE80211_TX_RC_VHT_MCS | \
+ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
+ (_bw == BW_80 ? IEEE80211_TX_RC_80_MHZ_WIDTH : \
+ _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
+ .duration = { \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 117, 54, 26)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 234, 108, 52)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 351, 162, 78)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 468, 216, 104)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 702, 324, 156)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 936, 432, 208)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 1053, 486, 234)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 1170, 540, 260)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 1404, 648, 312)), \
+ MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 1560, 720, 346)) \
+ } \
+}
+
+#define CCK_DURATION(_bitrate, _short, _len) \
+ (1000 * (10 /* SIFS */ + \
+ (_short ? 72 + 24 : 144 + 48) + \
+ (8 * (_len + 4) * 10) / (_bitrate)))
+
+#define CCK_ACK_DURATION(_bitrate, _short) \
+ (CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \
+ CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE))
+
+#define CCK_DURATION_LIST(_short) \
+ CCK_ACK_DURATION(10, _short), \
+ CCK_ACK_DURATION(20, _short), \
+ CCK_ACK_DURATION(55, _short), \
+ CCK_ACK_DURATION(110, _short)
+
+#define CCK_GROUP \
+ [MINSTREL_CCK_GROUP] = { \
+ .streams = 0, \
+ .flags = 0, \
+ .duration = { \
+ CCK_DURATION_LIST(false), \
+ CCK_DURATION_LIST(true) \
+ } \
+ }
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+static bool minstrel_vht_only = true;
+module_param(minstrel_vht_only, bool, 0644);
+MODULE_PARM_DESC(minstrel_vht_only,
+ "Use only VHT rates when VHT is supported by sta.");
+#endif
+
+/*
+ * To enable sufficiently targeted rate sampling, MCS rates are divided into
+ * groups, based on the number of streams and flags (HT40, SGI) that they
+ * use.
+ *
+ * Sortorder has to be fixed for GROUP_IDX macro to be applicable:
+ * BW -> SGI -> #streams
+ */
+const struct mcs_group minstrel_mcs_groups[] = {
+ MCS_GROUP(1, 0, BW_20),
+ MCS_GROUP(2, 0, BW_20),
+#if MINSTREL_MAX_STREAMS >= 3
+ MCS_GROUP(3, 0, BW_20),
+#endif
+
+ MCS_GROUP(1, 1, BW_20),
+ MCS_GROUP(2, 1, BW_20),
+#if MINSTREL_MAX_STREAMS >= 3
+ MCS_GROUP(3, 1, BW_20),
+#endif
+
+ MCS_GROUP(1, 0, BW_40),
+ MCS_GROUP(2, 0, BW_40),
+#if MINSTREL_MAX_STREAMS >= 3
+ MCS_GROUP(3, 0, BW_40),
+#endif
+
+ MCS_GROUP(1, 1, BW_40),
+ MCS_GROUP(2, 1, BW_40),
+#if MINSTREL_MAX_STREAMS >= 3
+ MCS_GROUP(3, 1, BW_40),
+#endif
+
+ CCK_GROUP,
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+ VHT_GROUP(1, 0, BW_20),
+ VHT_GROUP(2, 0, BW_20),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 0, BW_20),
+#endif
+
+ VHT_GROUP(1, 1, BW_20),
+ VHT_GROUP(2, 1, BW_20),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 1, BW_20),
+#endif
+
+ VHT_GROUP(1, 0, BW_40),
+ VHT_GROUP(2, 0, BW_40),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 0, BW_40),
+#endif
+
+ VHT_GROUP(1, 1, BW_40),
+ VHT_GROUP(2, 1, BW_40),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 1, BW_40),
+#endif
+
+ VHT_GROUP(1, 0, BW_80),
+ VHT_GROUP(2, 0, BW_80),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 0, BW_80),
+#endif
+
+ VHT_GROUP(1, 1, BW_80),
+ VHT_GROUP(2, 1, BW_80),
+#if MINSTREL_MAX_STREAMS >= 3
+ VHT_GROUP(3, 1, BW_80),
+#endif
+#endif
+};
+
+static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
+
+static void
+minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi);
+
+/*
+ * Some VHT MCSes are invalid (when Ndbps / Nes is not an integer)
+ * e.g for MCS9@20MHzx1Nss: Ndbps=8x52*(5/6) Nes=1
+ *
+ * Returns the valid mcs map for struct minstrel_mcs_group_data.supported
+ */
+static u16
+minstrel_get_valid_vht_rates(int bw, int nss, __le16 mcs_map)
+{
+ u16 mask = 0;
+
+ if (bw == BW_20) {
+ if (nss != 3 && nss != 6)
+ mask = BIT(9);
+ } else if (bw == BW_80) {
+ if (nss == 3 || nss == 7)
+ mask = BIT(6);
+ else if (nss == 6)
+ mask = BIT(9);
+ } else {
+ WARN_ON(bw != BW_40);
+ }
+
+ switch ((le16_to_cpu(mcs_map) >> (2 * (nss - 1))) & 3) {
+ case IEEE80211_VHT_MCS_SUPPORT_0_7:
+ mask |= 0x300;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_8:
+ mask |= 0x200;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_9:
+ break;
+ default:
+ mask = 0x3ff;
+ }
+
+ return 0x3ff & ~mask;
+}
+
+/*
+ * Look up an MCS group index based on mac80211 rate information
+ */
+static int
+minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate)
+{
+ return GROUP_IDX((rate->idx / 8) + 1,
+ !!(rate->flags & IEEE80211_TX_RC_SHORT_GI),
+ !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH));
+}
+
+static int
+minstrel_vht_get_group_idx(struct ieee80211_tx_rate *rate)
+{
+ return VHT_GROUP_IDX(ieee80211_rate_get_vht_nss(rate),
+ !!(rate->flags & IEEE80211_TX_RC_SHORT_GI),
+ !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) +
+ 2*!!(rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH));
+}
+
+static struct minstrel_rate_stats *
+minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+ struct ieee80211_tx_rate *rate)
+{
+ int group, idx;
+
+ if (rate->flags & IEEE80211_TX_RC_MCS) {
+ group = minstrel_ht_get_group_idx(rate);
+ idx = rate->idx % 8;
+ } else if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+ group = minstrel_vht_get_group_idx(rate);
+ idx = ieee80211_rate_get_vht_mcs(rate);
+ } else {
+ group = MINSTREL_CCK_GROUP;
+
+ for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++)
+ if (rate->idx == mp->cck_rates[idx])
+ break;
+
+ /* short preamble */
+ if (!(mi->groups[group].supported & BIT(idx)))
+ idx += 4;
+ }
+ return &mi->groups[group].rates[idx];
+}
+
+static inline struct minstrel_rate_stats *
+minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index)
+{
+ return &mi->groups[index / MCS_GROUP_RATES].rates[index % MCS_GROUP_RATES];
+}
+
+/*
+ * Return current throughput based on the average A-MPDU length, taking into
+ * account the expected number of retransmissions and their expected length
+ */
+int
+minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
+ int prob_ewma)
+{
+ unsigned int nsecs = 0;
+
+ /* do not account throughput if sucess prob is below 10% */
+ if (prob_ewma < MINSTREL_FRAC(10, 100))
+ return 0;
+
+ if (group != MINSTREL_CCK_GROUP)
+ nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len);
+
+ nsecs += minstrel_mcs_groups[group].duration[rate];
+
+ /*
+ * For the throughput calculation, limit the probability value to 90% to
+ * account for collision related packet error rate fluctuation
+ * (prob is scaled - see MINSTREL_FRAC above)
+ */
+ if (prob_ewma > MINSTREL_FRAC(90, 100))
+ return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000)
+ / nsecs));
+ else
+ return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs));
+}
+
+/*
+ * Find & sort topmost throughput rates
+ *
+ * If multiple rates provide equal throughput the sorting is based on their
+ * current success probability. Higher success probability is preferred among
+ * MCS groups, CCK rates do not provide aggregation and are therefore at last.
+ */
+static void
+minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index,
+ u16 *tp_list)
+{
+ int cur_group, cur_idx, cur_tp_avg, cur_prob;
+ int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob;
+ int j = MAX_THR_RATES;
+
+ cur_group = index / MCS_GROUP_RATES;
+ cur_idx = index % MCS_GROUP_RATES;
+ cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma;
+ cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob);
+
+ do {
+ tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
+ tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx,
+ tmp_prob);
+ if (cur_tp_avg < tmp_tp_avg ||
+ (cur_tp_avg == tmp_tp_avg && cur_prob <= tmp_prob))
+ break;
+ j--;
+ } while (j > 0);
+
+ if (j < MAX_THR_RATES - 1) {
+ memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) *
+ (MAX_THR_RATES - (j + 1))));
+ }
+ if (j < MAX_THR_RATES)
+ tp_list[j] = index;
+}
+
+/*
+ * Find and set the topmost probability rate per sta and per group
+ */
+static void
+minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
+{
+ struct minstrel_mcs_group_data *mg;
+ struct minstrel_rate_stats *mrs;
+ int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob;
+ int max_tp_group, cur_tp_avg, cur_group, cur_idx;
+ int max_gpr_group, max_gpr_idx;
+ int max_gpr_tp_avg, max_gpr_prob;
+
+ cur_group = index / MCS_GROUP_RATES;
+ cur_idx = index % MCS_GROUP_RATES;
+ mg = &mi->groups[index / MCS_GROUP_RATES];
+ mrs = &mg->rates[index % MCS_GROUP_RATES];
+
+ tmp_group = mi->max_prob_rate / MCS_GROUP_RATES;
+ tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
+
+ /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from
+ * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */
+ max_tp_group = mi->max_tp_rate[0] / MCS_GROUP_RATES;
+ if((index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) &&
+ (max_tp_group != MINSTREL_CCK_GROUP))
+ return;
+
+ if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
+ cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
+ mrs->prob_ewma);
+ if (cur_tp_avg > tmp_tp_avg)
+ mi->max_prob_rate = index;
+
+ max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
+ max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+ max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+ max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group,
+ max_gpr_idx,
+ max_gpr_prob);
+ if (cur_tp_avg > max_gpr_tp_avg)
+ mg->max_group_prob_rate = index;
+ } else {
+ if (mrs->prob_ewma > tmp_prob)
+ mi->max_prob_rate = index;
+ if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma)
+ mg->max_group_prob_rate = index;
+ }
+}
+
+
+/*
+ * Assign new rate set per sta and use CCK rates only if the fastest
+ * rate (max_tp_rate[0]) is from CCK group. This prohibits such sorted
+ * rate sets where MCS and CCK rates are mixed, because CCK rates can
+ * not use aggregation.
+ */
+static void
+minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi,
+ u16 tmp_mcs_tp_rate[MAX_THR_RATES],
+ u16 tmp_cck_tp_rate[MAX_THR_RATES])
+{
+ unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob;
+ int i;
+
+ tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES;
+ tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
+
+ tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES;
+ tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
+
+ if (tmp_cck_tp > tmp_mcs_tp) {
+ for(i = 0; i < MAX_THR_RATES; i++) {
+ minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i],
+ tmp_mcs_tp_rate);
+ }
+ }
+
+}
+
+/*
+ * Try to increase robustness of max_prob rate by decrease number of
+ * streams if possible.
+ */
+static inline void
+minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
+{
+ struct minstrel_mcs_group_data *mg;
+ int tmp_max_streams, group, tmp_idx, tmp_prob;
+ int tmp_tp = 0;
+
+ tmp_max_streams = minstrel_mcs_groups[mi->max_tp_rate[0] /
+ MCS_GROUP_RATES].streams;
+ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+ mg = &mi->groups[group];
+ if (!mg->supported || group == MINSTREL_CCK_GROUP)
+ continue;
+
+ tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+ tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma;
+
+ if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) &&
+ (minstrel_mcs_groups[group].streams < tmp_max_streams)) {
+ mi->max_prob_rate = mg->max_group_prob_rate;
+ tmp_tp = minstrel_ht_get_tp_avg(mi, group,
+ tmp_idx,
+ tmp_prob);
+ }
+ }
+}
+
+/*
+ * Update rate statistics and select new primary rates
+ *
+ * Rules for rate selection:
+ * - max_prob_rate must use only one stream, as a tradeoff between delivery
+ * probability and throughput during strong fluctuations
+ * - as long as the max prob rate has a probability of more than 75%, pick
+ * higher throughput rates, even if the probablity is a bit lower
+ */
+static void
+minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+ struct minstrel_mcs_group_data *mg;
+ struct minstrel_rate_stats *mrs;
+ int group, i, j, cur_prob;
+ u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES];
+ u16 tmp_cck_tp_rate[MAX_THR_RATES], index;
+
+ if (mi->ampdu_packets > 0) {
+ mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len,
+ MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL);
+ mi->ampdu_len = 0;
+ mi->ampdu_packets = 0;
+ }
+
+ mi->sample_slow = 0;
+ mi->sample_count = 0;
+
+ /* Initialize global rate indexes */
+ for(j = 0; j < MAX_THR_RATES; j++){
+ tmp_mcs_tp_rate[j] = 0;
+ tmp_cck_tp_rate[j] = 0;
+ }
+
+ /* Find best rate sets within all MCS groups*/
+ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+
+ mg = &mi->groups[group];
+ if (!mg->supported)
+ continue;
+
+ mi->sample_count++;
+
+ /* (re)Initialize group rate indexes */
+ for(j = 0; j < MAX_THR_RATES; j++)
+ tmp_group_tp_rate[j] = group;
+
+ for (i = 0; i < MCS_GROUP_RATES; i++) {
+ if (!(mg->supported & BIT(i)))
+ continue;
+
+ index = MCS_GROUP_RATES * group + i;
+
+ mrs = &mg->rates[i];
+ mrs->retry_updated = false;
+ minstrel_calc_rate_stats(mrs);
+ cur_prob = mrs->prob_ewma;
+
+ if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0)
+ continue;
+
+ /* Find max throughput rate set */
+ if (group != MINSTREL_CCK_GROUP) {
+ minstrel_ht_sort_best_tp_rates(mi, index,
+ tmp_mcs_tp_rate);
+ } else if (group == MINSTREL_CCK_GROUP) {
+ minstrel_ht_sort_best_tp_rates(mi, index,
+ tmp_cck_tp_rate);
+ }
+
+ /* Find max throughput rate set within a group */
+ minstrel_ht_sort_best_tp_rates(mi, index,
+ tmp_group_tp_rate);
+
+ /* Find max probability rate per group and global */
+ minstrel_ht_set_best_prob_rate(mi, index);
+ }
+
+ memcpy(mg->max_group_tp_rate, tmp_group_tp_rate,
+ sizeof(mg->max_group_tp_rate));
+ }
+
+ /* Assign new rate set per sta */
+ minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_cck_tp_rate);
+ memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate));
+
+ /* Try to increase robustness of max_prob_rate*/
+ minstrel_ht_prob_rate_reduce_streams(mi);
+
+ /* try to sample all available rates during each interval */
+ mi->sample_count *= 8;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ /* use fixed index if set */
+ if (mp->fixed_rate_idx != -1) {
+ for (i = 0; i < 4; i++)
+ mi->max_tp_rate[i] = mp->fixed_rate_idx;
+ mi->max_prob_rate = mp->fixed_rate_idx;
+ }
+#endif
+
+ /* Reset update timer */
+ mi->last_stats_update = jiffies;
+}
+
+static bool
+minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct ieee80211_tx_rate *rate)
+{
+ if (rate->idx < 0)
+ return false;
+
+ if (!rate->count)
+ return false;
+
+ if (rate->flags & IEEE80211_TX_RC_MCS ||
+ rate->flags & IEEE80211_TX_RC_VHT_MCS)
+ return true;
+
+ return rate->idx == mp->cck_rates[0] ||
+ rate->idx == mp->cck_rates[1] ||
+ rate->idx == mp->cck_rates[2] ||
+ rate->idx == mp->cck_rates[3];
+}
+
+static void
+minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi)
+{
+ struct minstrel_mcs_group_data *mg;
+
+ for (;;) {
+ mi->sample_group++;
+ mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
+ mg = &mi->groups[mi->sample_group];
+
+ if (!mg->supported)
+ continue;
+
+ if (++mg->index >= MCS_GROUP_RATES) {
+ mg->index = 0;
+ if (++mg->column >= ARRAY_SIZE(sample_table))
+ mg->column = 0;
+ }
+ break;
+ }
+}
+
+static void
+minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary)
+{
+ int group, orig_group;
+
+ orig_group = group = *idx / MCS_GROUP_RATES;
+ while (group > 0) {
+ group--;
+
+ if (!mi->groups[group].supported)
+ continue;
+
+ if (minstrel_mcs_groups[group].streams >
+ minstrel_mcs_groups[orig_group].streams)
+ continue;
+
+ if (primary)
+ *idx = mi->groups[group].max_group_tp_rate[0];
+ else
+ *idx = mi->groups[group].max_group_tp_rate[1];
+ break;
+ }
+}
+
+static void
+minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ u16 tid;
+
+ if (skb_get_queue_mapping(skb) == IEEE80211_AC_VO)
+ return;
+
+ if (unlikely(!ieee80211_is_data_qos(hdr->frame_control)))
+ return;
+
+ if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
+ return;
+
+ tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+ if (likely(sta->ampdu_mlme.tid_tx[tid]))
+ return;
+
+ ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+}
+
+static void
+minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
+ struct ieee80211_sta *sta, void *priv_sta,
+ struct ieee80211_tx_info *info)
+{
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ struct ieee80211_tx_rate *ar = info->status.rates;
+ struct minstrel_rate_stats *rate, *rate2;
+ struct minstrel_priv *mp = priv;
+ bool last, update = false;
+ int i;
+
+ if (!msp->is_ht)
+ return mac80211_minstrel.tx_status_noskb(priv, sband, sta,
+ &msp->legacy, info);
+
+ /* This packet was aggregated but doesn't carry status info */
+ if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
+ !(info->flags & IEEE80211_TX_STAT_AMPDU))
+ return;
+
+ if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) {
+ info->status.ampdu_ack_len =
+ (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0);
+ info->status.ampdu_len = 1;
+ }
+
+ mi->ampdu_packets++;
+ mi->ampdu_len += info->status.ampdu_len;
+
+ if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) {
+ mi->sample_wait = 16 + 2 * MINSTREL_TRUNC(mi->avg_ampdu_len);
+ mi->sample_tries = 1;
+ mi->sample_count--;
+ }
+
+ if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+ mi->sample_packets += info->status.ampdu_len;
+
+ last = !minstrel_ht_txstat_valid(mp, &ar[0]);
+ for (i = 0; !last; i++) {
+ last = (i == IEEE80211_TX_MAX_RATES - 1) ||
+ !minstrel_ht_txstat_valid(mp, &ar[i + 1]);
+
+ rate = minstrel_ht_get_stats(mp, mi, &ar[i]);
+
+ if (last)
+ rate->success += info->status.ampdu_ack_len;
+
+ rate->attempts += ar[i].count * info->status.ampdu_len;
+ }
+
+ /*
+ * check for sudden death of spatial multiplexing,
+ * downgrade to a lower number of streams if necessary.
+ */
+ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]);
+ if (rate->attempts > 30 &&
+ MINSTREL_FRAC(rate->success, rate->attempts) <
+ MINSTREL_FRAC(20, 100)) {
+ minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true);
+ update = true;
+ }
+
+ rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]);
+ if (rate2->attempts > 30 &&
+ MINSTREL_FRAC(rate2->success, rate2->attempts) <
+ MINSTREL_FRAC(20, 100)) {
+ minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false);
+ update = true;
+ }
+
+ if (time_after(jiffies, mi->last_stats_update +
+ (mp->update_interval / 2 * HZ) / 1000)) {
+ update = true;
+ minstrel_ht_update_stats(mp, mi);
+ }
+
+ if (update)
+ minstrel_ht_update_rates(mp, mi);
+}
+
+static void
+minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+ int index)
+{
+ struct minstrel_rate_stats *mrs;
+ const struct mcs_group *group;
+ unsigned int tx_time, tx_time_rtscts, tx_time_data;
+ unsigned int cw = mp->cw_min;
+ unsigned int ctime = 0;
+ unsigned int t_slot = 9; /* FIXME */
+ unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len);
+ unsigned int overhead = 0, overhead_rtscts = 0;
+
+ mrs = minstrel_get_ratestats(mi, index);
+ if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) {
+ mrs->retry_count = 1;
+ mrs->retry_count_rtscts = 1;
+ return;
+ }
+
+ mrs->retry_count = 2;
+ mrs->retry_count_rtscts = 2;
+ mrs->retry_updated = true;
+
+ group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+ tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000;
+
+ /* Contention time for first 2 tries */
+ ctime = (t_slot * cw) >> 1;
+ cw = min((cw << 1) | 1, mp->cw_max);
+ ctime += (t_slot * cw) >> 1;
+ cw = min((cw << 1) | 1, mp->cw_max);
+
+ if (index / MCS_GROUP_RATES != MINSTREL_CCK_GROUP) {
+ overhead = mi->overhead;
+ overhead_rtscts = mi->overhead_rtscts;
+ }
+
+ /* Total TX time for data and Contention after first 2 tries */
+ tx_time = ctime + 2 * (overhead + tx_time_data);
+ tx_time_rtscts = ctime + 2 * (overhead_rtscts + tx_time_data);
+
+ /* See how many more tries we can fit inside segment size */
+ do {
+ /* Contention time for this try */
+ ctime = (t_slot * cw) >> 1;
+ cw = min((cw << 1) | 1, mp->cw_max);
+
+ /* Total TX time after this try */
+ tx_time += ctime + overhead + tx_time_data;
+ tx_time_rtscts += ctime + overhead_rtscts + tx_time_data;
+
+ if (tx_time_rtscts < mp->segment_size)
+ mrs->retry_count_rtscts++;
+ } while ((tx_time < mp->segment_size) &&
+ (++mrs->retry_count < mp->max_retry));
+}
+
+
+static void
+minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+ struct ieee80211_sta_rates *ratetbl, int offset, int index)
+{
+ const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+ struct minstrel_rate_stats *mrs;
+ u8 idx;
+ u16 flags = group->flags;
+
+ mrs = minstrel_get_ratestats(mi, index);
+ if (!mrs->retry_updated)
+ minstrel_calc_retransmit(mp, mi, index);
+
+ if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) {
+ ratetbl->rate[offset].count = 2;
+ ratetbl->rate[offset].count_rts = 2;
+ ratetbl->rate[offset].count_cts = 2;
+ } else {
+ ratetbl->rate[offset].count = mrs->retry_count;
+ ratetbl->rate[offset].count_cts = mrs->retry_count;
+ ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts;
+ }
+
+ if (index / MCS_GROUP_RATES == MINSTREL_CCK_GROUP)
+ idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)];
+ else if (flags & IEEE80211_TX_RC_VHT_MCS)
+ idx = ((group->streams - 1) << 4) |
+ ((index % MCS_GROUP_RATES) & 0xF);
+ else
+ idx = index % MCS_GROUP_RATES + (group->streams - 1) * 8;
+
+ if (offset > 0) {
+ ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
+ flags |= IEEE80211_TX_RC_USE_RTS_CTS;
+ }
+
+ ratetbl->rate[offset].idx = idx;
+ ratetbl->rate[offset].flags = flags;
+}
+
+static void
+minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+ struct ieee80211_sta_rates *rates;
+ int i = 0;
+
+ rates = kzalloc(sizeof(*rates), GFP_ATOMIC);
+ if (!rates)
+ return;
+
+ /* Start with max_tp_rate[0] */
+ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]);
+
+ if (mp->hw->max_rates >= 3) {
+ /* At least 3 tx rates supported, use max_tp_rate[1] next */
+ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[1]);
+ }
+
+ if (mp->hw->max_rates >= 2) {
+ /*
+ * At least 2 tx rates supported, use max_prob_rate next */
+ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate);
+ }
+
+ rates->rate[i].idx = -1;
+ rate_control_set_rates(mp->hw, mi->sta, rates);
+}
+
+static inline int
+minstrel_get_duration(int index)
+{
+ const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+ return group->duration[index % MCS_GROUP_RATES];
+}
+
+static int
+minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+ struct minstrel_rate_stats *mrs;
+ struct minstrel_mcs_group_data *mg;
+ unsigned int sample_dur, sample_group, cur_max_tp_streams;
+ int sample_idx = 0;
+
+ if (mi->sample_wait > 0) {
+ mi->sample_wait--;
+ return -1;
+ }
+
+ if (!mi->sample_tries)
+ return -1;
+
+ sample_group = mi->sample_group;
+ mg = &mi->groups[sample_group];
+ sample_idx = sample_table[mg->column][mg->index];
+ minstrel_set_next_sample_idx(mi);
+
+ if (!(mg->supported & BIT(sample_idx)))
+ return -1;
+
+ mrs = &mg->rates[sample_idx];
+ sample_idx += sample_group * MCS_GROUP_RATES;
+
+ /*
+ * Sampling might add some overhead (RTS, no aggregation)
+ * to the frame. Hence, don't use sampling for the currently
+ * used rates.
+ */
+ if (sample_idx == mi->max_tp_rate[0] ||
+ sample_idx == mi->max_tp_rate[1] ||
+ sample_idx == mi->max_prob_rate)
+ return -1;
+
+ /*
+ * Do not sample if the probability is already higher than 95%
+ * to avoid wasting airtime.
+ */
+ if (mrs->prob_ewma > MINSTREL_FRAC(95, 100))
+ return -1;
+
+ /*
+ * Make sure that lower rates get sampled only occasionally,
+ * if the link is working perfectly.
+ */
+
+ cur_max_tp_streams = minstrel_mcs_groups[mi->max_tp_rate[0] /
+ MCS_GROUP_RATES].streams;
+ sample_dur = minstrel_get_duration(sample_idx);
+ if (sample_dur >= minstrel_get_duration(mi->max_tp_rate[1]) &&
+ (cur_max_tp_streams - 1 <
+ minstrel_mcs_groups[sample_group].streams ||
+ sample_dur >= minstrel_get_duration(mi->max_prob_rate))) {
+ if (mrs->sample_skipped < 20)
+ return -1;
+
+ if (mi->sample_slow++ > 2)
+ return -1;
+ }
+ mi->sample_tries--;
+
+ return sample_idx;
+}
+
+static void
+minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp,
+ struct minstrel_ht_sta *mi, bool val)
+{
+ u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported;
+
+ if (!supported || !mi->cck_supported_short)
+ return;
+
+ if (supported & (mi->cck_supported_short << (val * 4)))
+ return;
+
+ supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4);
+ mi->groups[MINSTREL_CCK_GROUP].supported = supported;
+}
+
+static void
+minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
+ struct ieee80211_tx_rate_control *txrc)
+{
+ const struct mcs_group *sample_group;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+ struct ieee80211_tx_rate *rate = &info->status.rates[0];
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ struct minstrel_priv *mp = priv;
+ int sample_idx;
+
+ if (rate_control_send_low(sta, priv_sta, txrc))
+ return;
+
+ if (!msp->is_ht)
+ return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
+
+ if (!(info->flags & IEEE80211_TX_CTL_AMPDU) &&
+ mi->max_prob_rate / MCS_GROUP_RATES != MINSTREL_CCK_GROUP)
+ minstrel_aggr_check(sta, txrc->skb);
+
+ info->flags |= mi->tx_flags;
+ minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble);
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ if (mp->fixed_rate_idx != -1)
+ return;
+#endif
+
+ /* Don't use EAPOL frames for sampling on non-mrr hw */
+ if (mp->hw->max_rates == 1 &&
+ (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO))
+ sample_idx = -1;
+ else
+ sample_idx = minstrel_get_sample_rate(mp, mi);
+
+ mi->total_packets++;
+
+ /* wraparound */
+ if (mi->total_packets == ~0) {
+ mi->total_packets = 0;
+ mi->sample_packets = 0;
+ }
+
+ if (sample_idx < 0)
+ return;
+
+ sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES];
+ info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+ rate->count = 1;
+
+ if (sample_idx / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) {
+ int idx = sample_idx % ARRAY_SIZE(mp->cck_rates);
+ rate->idx = mp->cck_rates[idx];
+ } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) {
+ ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES,
+ sample_group->streams);
+ } else {
+ rate->idx = sample_idx % MCS_GROUP_RATES +
+ (sample_group->streams - 1) * 8;
+ }
+
+ rate->flags = sample_group->flags;
+}
+
+static void
+minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+ struct ieee80211_supported_band *sband,
+ struct ieee80211_sta *sta)
+{
+ int i;
+
+ if (sband->band != IEEE80211_BAND_2GHZ)
+ return;
+
+ if (!(mp->hw->flags & IEEE80211_HW_SUPPORTS_HT_CCK_RATES))
+ return;
+
+ mi->cck_supported = 0;
+ mi->cck_supported_short = 0;
+ for (i = 0; i < 4; i++) {
+ if (!rate_supported(sta, sband->band, mp->cck_rates[i]))
+ continue;
+
+ mi->cck_supported |= BIT(i);
+ if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE)
+ mi->cck_supported_short |= BIT(i);
+ }
+
+ mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported;
+}
+
+static void
+minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
+ struct cfg80211_chan_def *chandef,
+ struct ieee80211_sta *sta, void *priv_sta)
+{
+ struct minstrel_priv *mp = priv;
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
+ u16 sta_cap = sta->ht_cap.cap;
+ struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
+ int use_vht;
+ int n_supported = 0;
+ int ack_dur;
+ int stbc;
+ int i;
+
+ /* fall back to the old minstrel for legacy stations */
+ if (!sta->ht_cap.ht_supported)
+ goto use_legacy;
+
+ BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB);
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+ if (vht_cap->vht_supported)
+ use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0);
+ else
+#endif
+ use_vht = 0;
+
+ msp->is_ht = true;
+ memset(mi, 0, sizeof(*mi));
+
+ mi->sta = sta;
+ mi->last_stats_update = jiffies;
+
+ ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1, 0);
+ mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1, 0);
+ mi->overhead += ack_dur;
+ mi->overhead_rtscts = mi->overhead + 2 * ack_dur;
+
+ mi->avg_ampdu_len = MINSTREL_FRAC(1, 1);
+
+ /* When using MRR, sample more on the first attempt, without delay */
+ if (mp->has_mrr) {
+ mi->sample_count = 16;
+ mi->sample_wait = 0;
+ } else {
+ mi->sample_count = 8;
+ mi->sample_wait = 8;
+ }
+ mi->sample_tries = 4;
+
+ /* TODO tx_flags for vht - ATM the RC API is not fine-grained enough */
+ if (!use_vht) {
+ stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >>
+ IEEE80211_HT_CAP_RX_STBC_SHIFT;
+ mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
+
+ if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING)
+ mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
+ u32 gflags = minstrel_mcs_groups[i].flags;
+ int bw, nss;
+
+ mi->groups[i].supported = 0;
+ if (i == MINSTREL_CCK_GROUP) {
+ minstrel_ht_update_cck(mp, mi, sband, sta);
+ continue;
+ }
+
+ if (gflags & IEEE80211_TX_RC_SHORT_GI) {
+ if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) {
+ if (!(sta_cap & IEEE80211_HT_CAP_SGI_40))
+ continue;
+ } else {
+ if (!(sta_cap & IEEE80211_HT_CAP_SGI_20))
+ continue;
+ }
+ }
+
+ if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH &&
+ sta->bandwidth < IEEE80211_STA_RX_BW_40)
+ continue;
+
+ nss = minstrel_mcs_groups[i].streams;
+
+ /* Mark MCS > 7 as unsupported if STA is in static SMPS mode */
+ if (sta->smps_mode == IEEE80211_SMPS_STATIC && nss > 1)
+ continue;
+
+ /* HT rate */
+ if (gflags & IEEE80211_TX_RC_MCS) {
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+ if (use_vht && minstrel_vht_only)
+ continue;
+#endif
+ mi->groups[i].supported = mcs->rx_mask[nss - 1];
+ if (mi->groups[i].supported)
+ n_supported++;
+ continue;
+ }
+
+ /* VHT rate */
+ if (!vht_cap->vht_supported ||
+ WARN_ON(!(gflags & IEEE80211_TX_RC_VHT_MCS)) ||
+ WARN_ON(gflags & IEEE80211_TX_RC_160_MHZ_WIDTH))
+ continue;
+
+ if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) {
+ if (sta->bandwidth < IEEE80211_STA_RX_BW_80 ||
+ ((gflags & IEEE80211_TX_RC_SHORT_GI) &&
+ !(vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80))) {
+ continue;
+ }
+ }
+
+ if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ bw = BW_40;
+ else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ bw = BW_80;
+ else
+ bw = BW_20;
+
+ mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss,
+ vht_cap->vht_mcs.tx_mcs_map);
+
+ if (mi->groups[i].supported)
+ n_supported++;
+ }
+
+ if (!n_supported)
+ goto use_legacy;
+
+ /* create an initial rate table with the lowest supported rates */
+ minstrel_ht_update_stats(mp, mi);
+ minstrel_ht_update_rates(mp, mi);
+
+ return;
+
+use_legacy:
+ msp->is_ht = false;
+ memset(&msp->legacy, 0, sizeof(msp->legacy));
+ msp->legacy.r = msp->ratelist;
+ msp->legacy.sample_table = msp->sample_table;
+ return mac80211_minstrel.rate_init(priv, sband, chandef, sta,
+ &msp->legacy);
+}
+
+static void
+minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband,
+ struct cfg80211_chan_def *chandef,
+ struct ieee80211_sta *sta, void *priv_sta)
+{
+ minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta);
+}
+
+static void
+minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband,
+ struct cfg80211_chan_def *chandef,
+ struct ieee80211_sta *sta, void *priv_sta,
+ u32 changed)
+{
+ minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta);
+}
+
+static void *
+minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+ struct ieee80211_supported_band *sband;
+ struct minstrel_ht_sta_priv *msp;
+ struct minstrel_priv *mp = priv;
+ struct ieee80211_hw *hw = mp->hw;
+ int max_rates = 0;
+ int i;
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ sband = hw->wiphy->bands[i];
+ if (sband && sband->n_bitrates > max_rates)
+ max_rates = sband->n_bitrates;
+ }
+
+ msp = kzalloc(sizeof(*msp), gfp);
+ if (!msp)
+ return NULL;
+
+ msp->ratelist = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+ if (!msp->ratelist)
+ goto error;
+
+ msp->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+ if (!msp->sample_table)
+ goto error1;
+
+ return msp;
+
+error1:
+ kfree(msp->ratelist);
+error:
+ kfree(msp);
+ return NULL;
+}
+
+static void
+minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+
+ kfree(msp->sample_table);
+ kfree(msp->ratelist);
+ kfree(msp);
+}
+
+static void *
+minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+ return mac80211_minstrel.alloc(hw, debugfsdir);
+}
+
+static void
+minstrel_ht_free(void *priv)
+{
+ mac80211_minstrel.free(priv);
+}
+
+static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
+{
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ int i, j, prob, tp_avg;
+
+ if (!msp->is_ht)
+ return mac80211_minstrel.get_expected_throughput(priv_sta);
+
+ i = mi->max_tp_rate[0] / MCS_GROUP_RATES;
+ j = mi->max_tp_rate[0] % MCS_GROUP_RATES;
+ prob = mi->groups[i].rates[j].prob_ewma;
+
+ /* convert tp_avg from pkt per second in kbps */
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+
+ return tp_avg;
+}
+
+static const struct rate_control_ops mac80211_minstrel_ht = {
+ .name = "minstrel_ht",
+ .tx_status_noskb = minstrel_ht_tx_status,
+ .get_rate = minstrel_ht_get_rate,
+ .rate_init = minstrel_ht_rate_init,
+ .rate_update = minstrel_ht_rate_update,
+ .alloc_sta = minstrel_ht_alloc_sta,
+ .free_sta = minstrel_ht_free_sta,
+ .alloc = minstrel_ht_alloc,
+ .free = minstrel_ht_free,
+#ifdef CONFIG_MAC80211_DEBUGFS
+ .add_sta_debugfs = minstrel_ht_add_sta_debugfs,
+ .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs,
+#endif
+ .get_expected_throughput = minstrel_ht_get_expected_throughput,
+};
+
+
+static void __init init_sample_table(void)
+{
+ int col, i, new_idx;
+ u8 rnd[MCS_GROUP_RATES];
+
+ memset(sample_table, 0xff, sizeof(sample_table));
+ for (col = 0; col < SAMPLE_COLUMNS; col++) {
+ prandom_bytes(rnd, sizeof(rnd));
+ for (i = 0; i < MCS_GROUP_RATES; i++) {
+ new_idx = (i + rnd[i]) % MCS_GROUP_RATES;
+ while (sample_table[col][new_idx] != 0xff)
+ new_idx = (new_idx + 1) % MCS_GROUP_RATES;
+
+ sample_table[col][new_idx] = i;
+ }
+ }
+}
+
+int __init
+rc80211_minstrel_ht_init(void)
+{
+ init_sample_table();
+ return ieee80211_rate_control_register(&mac80211_minstrel_ht);
+}
+
+void
+rc80211_minstrel_ht_exit(void)
+{
+ ieee80211_rate_control_unregister(&mac80211_minstrel_ht);
+}
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
new file mode 100644
index 000000000..e8b52a94d
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_HT_H
+#define __RC_MINSTREL_HT_H
+
+/*
+ * The number of streams can be changed to 2 to reduce code
+ * size and memory footprint.
+ */
+#define MINSTREL_MAX_STREAMS 3
+#define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+#define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */
+#else
+#define MINSTREL_VHT_STREAM_GROUPS 0
+#endif
+
+#define MINSTREL_HT_GROUPS_NB (MINSTREL_MAX_STREAMS * \
+ MINSTREL_HT_STREAM_GROUPS)
+#define MINSTREL_VHT_GROUPS_NB (MINSTREL_MAX_STREAMS * \
+ MINSTREL_VHT_STREAM_GROUPS)
+#define MINSTREL_CCK_GROUPS_NB 1
+#define MINSTREL_GROUPS_NB (MINSTREL_HT_GROUPS_NB + \
+ MINSTREL_VHT_GROUPS_NB + \
+ MINSTREL_CCK_GROUPS_NB)
+
+#define MINSTREL_HT_GROUP_0 0
+#define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB)
+#define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1)
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
+#define MCS_GROUP_RATES 10
+#else
+#define MCS_GROUP_RATES 8
+#endif
+
+struct mcs_group {
+ u32 flags;
+ unsigned int streams;
+ unsigned int duration[MCS_GROUP_RATES];
+};
+
+extern const struct mcs_group minstrel_mcs_groups[];
+
+struct minstrel_mcs_group_data {
+ u8 index;
+ u8 column;
+
+ /* bitfield of supported MCS rates of this group */
+ u16 supported;
+
+ /* sorted rate set within a MCS group*/
+ u16 max_group_tp_rate[MAX_THR_RATES];
+ u16 max_group_prob_rate;
+
+ /* MCS rate statistics */
+ struct minstrel_rate_stats rates[MCS_GROUP_RATES];
+};
+
+struct minstrel_ht_sta {
+ struct ieee80211_sta *sta;
+
+ /* ampdu length (average, per sampling interval) */
+ unsigned int ampdu_len;
+ unsigned int ampdu_packets;
+
+ /* ampdu length (EWMA) */
+ unsigned int avg_ampdu_len;
+
+ /* overall sorted rate set */
+ u16 max_tp_rate[MAX_THR_RATES];
+ u16 max_prob_rate;
+
+ /* time of last status update */
+ unsigned long last_stats_update;
+
+ /* overhead time in usec for each frame */
+ unsigned int overhead;
+ unsigned int overhead_rtscts;
+
+ unsigned int total_packets;
+ unsigned int sample_packets;
+
+ /* tx flags to add for frames for this sta */
+ u32 tx_flags;
+
+ u8 sample_wait;
+ u8 sample_tries;
+ u8 sample_count;
+ u8 sample_slow;
+
+ /* current MCS group to be sampled */
+ u8 sample_group;
+
+ u8 cck_supported;
+ u8 cck_supported_short;
+
+ /* MCS rate group info and statistics */
+ struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB];
+};
+
+struct minstrel_ht_sta_priv {
+ union {
+ struct minstrel_ht_sta ht;
+ struct minstrel_sta_info legacy;
+ };
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct dentry *dbg_stats;
+ struct dentry *dbg_stats_csv;
+#endif
+ void *ratelist;
+ void *sample_table;
+ bool is_ht;
+};
+
+void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta);
+int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
+ int prob_ewma);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
new file mode 100644
index 000000000..6822ce0f9
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+static char *
+minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
+{
+ const struct mcs_group *mg;
+ unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+ char htmode = '2';
+ char gimode = 'L';
+ u32 gflags;
+
+ if (!mi->groups[i].supported)
+ return p;
+
+ mg = &minstrel_mcs_groups[i];
+ gflags = mg->flags;
+
+ if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ htmode = '4';
+ else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ htmode = '8';
+ if (gflags & IEEE80211_TX_RC_SHORT_GI)
+ gimode = 'S';
+
+ for (j = 0; j < MCS_GROUP_RATES; j++) {
+ struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
+ static const int bitrates[4] = { 10, 20, 55, 110 };
+ int idx = i * MCS_GROUP_RATES + j;
+
+ if (!(mi->groups[i].supported & BIT(j)))
+ continue;
+
+ if (gflags & IEEE80211_TX_RC_MCS) {
+ p += sprintf(p, "HT%c0 ", htmode);
+ p += sprintf(p, "%cGI ", gimode);
+ p += sprintf(p, "%d ", mg->streams);
+ } else if (gflags & IEEE80211_TX_RC_VHT_MCS) {
+ p += sprintf(p, "VHT%c0 ", htmode);
+ p += sprintf(p, "%cGI ", gimode);
+ p += sprintf(p, "%d ", mg->streams);
+ } else {
+ p += sprintf(p, "CCK ");
+ p += sprintf(p, "%cP ", j < 4 ? 'L' : 'S');
+ p += sprintf(p, "1 ");
+ }
+
+ *(p++) = (idx == mi->max_tp_rate[0]) ? 'A' : ' ';
+ *(p++) = (idx == mi->max_tp_rate[1]) ? 'B' : ' ';
+ *(p++) = (idx == mi->max_tp_rate[2]) ? 'C' : ' ';
+ *(p++) = (idx == mi->max_tp_rate[3]) ? 'D' : ' ';
+ *(p++) = (idx == mi->max_prob_rate) ? 'P' : ' ';
+
+ if (gflags & IEEE80211_TX_RC_MCS) {
+ p += sprintf(p, " MCS%-2u", (mg->streams - 1) * 8 + j);
+ } else if (gflags & IEEE80211_TX_RC_VHT_MCS) {
+ p += sprintf(p, " MCS%-1u/%1u", j, mg->streams);
+ } else {
+ int r = bitrates[j % 4];
+
+ p += sprintf(p, " %2u.%1uM", r / 10, r % 10);
+ }
+
+ p += sprintf(p, " %3u ", idx);
+
+ /* tx_time[rate(i)] in usec */
+ tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000);
+ p += sprintf(p, "%6u ", tx_time);
+
+ tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
+ prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
+ eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ " %3u.%1u %3u %3u %-3u "
+ "%9llu %-9llu\n",
+ tp_max / 10, tp_max % 10,
+ tp_avg / 10, tp_avg % 10,
+ eprob / 10, eprob % 10,
+ mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+ prob / 10, prob % 10,
+ mrs->retry_count,
+ mrs->last_success,
+ mrs->last_attempts,
+ (unsigned long long)mrs->succ_hist,
+ (unsigned long long)mrs->att_hist);
+ }
+
+ return p;
+}
+
+static int
+minstrel_ht_stats_open(struct inode *inode, struct file *file)
+{
+ struct minstrel_ht_sta_priv *msp = inode->i_private;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ struct minstrel_debugfs_info *ms;
+ unsigned int i;
+ int ret;
+ char *p;
+
+ if (!msp->is_ht) {
+ inode->i_private = &msp->legacy;
+ ret = minstrel_stats_open(inode, file);
+ inode->i_private = msp;
+ return ret;
+ }
+
+ ms = kmalloc(32768, GFP_KERNEL);
+ if (!ms)
+ return -ENOMEM;
+
+ file->private_data = ms;
+ p = ms->buf;
+
+ p += sprintf(p, "\n");
+ p += sprintf(p, " best ____________rate__________ "
+ "______statistics______ ________last_______ "
+ "______sum-of________\n");
+ p += sprintf(p, "mode guard # rate [name idx airtime max_tp] "
+ "[ ø(tp) ø(prob) sd(prob)] [prob.|retry|suc|att] [#success | "
+ "#attempts]\n");
+
+ p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
+ for (i = 0; i < MINSTREL_CCK_GROUP; i++)
+ p = minstrel_ht_stats_dump(mi, i, p);
+ for (i++; i < ARRAY_SIZE(mi->groups); i++)
+ p = minstrel_ht_stats_dump(mi, i, p);
+
+ p += sprintf(p, "\nTotal packet count:: ideal %d "
+ "lookaround %d\n",
+ max(0, (int) mi->total_packets - (int) mi->sample_packets),
+ mi->sample_packets);
+ p += sprintf(p, "Average # of aggregated frames per A-MPDU: %d.%d\n",
+ MINSTREL_TRUNC(mi->avg_ampdu_len),
+ MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
+ ms->len = p - ms->buf;
+ WARN_ON(ms->len + sizeof(*ms) > 32768);
+
+ return nonseekable_open(inode, file);
+}
+
+static const struct file_operations minstrel_ht_stat_fops = {
+ .owner = THIS_MODULE,
+ .open = minstrel_ht_stats_open,
+ .read = minstrel_stats_read,
+ .release = minstrel_stats_release,
+ .llseek = no_llseek,
+};
+
+static char *
+minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
+{
+ const struct mcs_group *mg;
+ unsigned int j, tp_max, tp_avg, prob, eprob, tx_time;
+ char htmode = '2';
+ char gimode = 'L';
+ u32 gflags;
+
+ if (!mi->groups[i].supported)
+ return p;
+
+ mg = &minstrel_mcs_groups[i];
+ gflags = mg->flags;
+
+ if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ htmode = '4';
+ else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ htmode = '8';
+ if (gflags & IEEE80211_TX_RC_SHORT_GI)
+ gimode = 'S';
+
+ for (j = 0; j < MCS_GROUP_RATES; j++) {
+ struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
+ static const int bitrates[4] = { 10, 20, 55, 110 };
+ int idx = i * MCS_GROUP_RATES + j;
+
+ if (!(mi->groups[i].supported & BIT(j)))
+ continue;
+
+ if (gflags & IEEE80211_TX_RC_MCS) {
+ p += sprintf(p, "HT%c0,", htmode);
+ p += sprintf(p, "%cGI,", gimode);
+ p += sprintf(p, "%d,", mg->streams);
+ } else if (gflags & IEEE80211_TX_RC_VHT_MCS) {
+ p += sprintf(p, "VHT%c0,", htmode);
+ p += sprintf(p, "%cGI,", gimode);
+ p += sprintf(p, "%d,", mg->streams);
+ } else {
+ p += sprintf(p, "CCK,");
+ p += sprintf(p, "%cP,", j < 4 ? 'L' : 'S');
+ p += sprintf(p, "1,");
+ }
+
+ p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[0]) ? "A" : ""));
+ p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[1]) ? "B" : ""));
+ p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[2]) ? "C" : ""));
+ p += sprintf(p, "%s" ,((idx == mi->max_tp_rate[3]) ? "D" : ""));
+ p += sprintf(p, "%s" ,((idx == mi->max_prob_rate) ? "P" : ""));
+
+ if (gflags & IEEE80211_TX_RC_MCS) {
+ p += sprintf(p, ",MCS%-2u,", (mg->streams - 1) * 8 + j);
+ } else if (gflags & IEEE80211_TX_RC_VHT_MCS) {
+ p += sprintf(p, ",MCS%-1u/%1u,", j, mg->streams);
+ } else {
+ int r = bitrates[j % 4];
+ p += sprintf(p, ",%2u.%1uM,", r / 10, r % 10);
+ }
+
+ p += sprintf(p, "%u,", idx);
+ tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000);
+ p += sprintf(p, "%u,", tx_time);
+
+ tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
+ prob = MINSTREL_TRUNC(mrs->cur_prob * 1000);
+ eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
+ "%u,%llu,%llu,",
+ tp_max / 10, tp_max % 10,
+ tp_avg / 10, tp_avg % 10,
+ eprob / 10, eprob % 10,
+ mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10,
+ prob / 10, prob % 10,
+ mrs->retry_count,
+ mrs->last_success,
+ mrs->last_attempts,
+ (unsigned long long)mrs->succ_hist,
+ (unsigned long long)mrs->att_hist);
+ p += sprintf(p, "%d,%d,%d.%d\n",
+ max(0, (int) mi->total_packets -
+ (int) mi->sample_packets),
+ mi->sample_packets,
+ MINSTREL_TRUNC(mi->avg_ampdu_len),
+ MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
+ }
+
+ return p;
+}
+
+static int
+minstrel_ht_stats_csv_open(struct inode *inode, struct file *file)
+{
+ struct minstrel_ht_sta_priv *msp = inode->i_private;
+ struct minstrel_ht_sta *mi = &msp->ht;
+ struct minstrel_debugfs_info *ms;
+ unsigned int i;
+ int ret;
+ char *p;
+
+ if (!msp->is_ht) {
+ inode->i_private = &msp->legacy;
+ ret = minstrel_stats_csv_open(inode, file);
+ inode->i_private = msp;
+ return ret;
+ }
+
+ ms = kmalloc(32768, GFP_KERNEL);
+
+ if (!ms)
+ return -ENOMEM;
+
+ file->private_data = ms;
+
+ p = ms->buf;
+
+ p = minstrel_ht_stats_csv_dump(mi, MINSTREL_CCK_GROUP, p);
+ for (i = 0; i < MINSTREL_CCK_GROUP; i++)
+ p = minstrel_ht_stats_csv_dump(mi, i, p);
+ for (i++; i < ARRAY_SIZE(mi->groups); i++)
+ p = minstrel_ht_stats_csv_dump(mi, i, p);
+
+ ms->len = p - ms->buf;
+ WARN_ON(ms->len + sizeof(*ms) > 32768);
+
+ return nonseekable_open(inode, file);
+}
+
+static const struct file_operations minstrel_ht_stat_csv_fops = {
+ .owner = THIS_MODULE,
+ .open = minstrel_ht_stats_csv_open,
+ .read = minstrel_stats_read,
+ .release = minstrel_stats_release,
+ .llseek = no_llseek,
+};
+
+void
+minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+
+ msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp,
+ &minstrel_ht_stat_fops);
+ msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO,
+ dir, msp, &minstrel_ht_stat_csv_fops);
+}
+
+void
+minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+ struct minstrel_ht_sta_priv *msp = priv_sta;
+
+ debugfs_remove(msp->dbg_stats);
+ debugfs_remove(msp->dbg_stats_csv);
+}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
new file mode 100644
index 000000000..5793f75c5
--- /dev/null
+++ b/net/mac80211/rx.c
@@ -0,0 +1,3671 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include <net/ieee80211_radiotap.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "led.h"
+#include "mesh.h"
+#include "wep.h"
+#include "wpa.h"
+#include "tkip.h"
+#include "wme.h"
+#include "rate.h"
+
+/*
+ * monitor mode reception
+ *
+ * This function cleans up the SKB, i.e. it removes all the stuff
+ * only useful for monitoring.
+ */
+static struct sk_buff *remove_monitor_info(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ unsigned int rtap_vendor_space)
+{
+ if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) {
+ if (likely(skb->len > FCS_LEN))
+ __pskb_trim(skb, skb->len - FCS_LEN);
+ else {
+ /* driver bug */
+ WARN_ON(1);
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+ }
+
+ __pskb_pull(skb, rtap_vendor_space);
+
+ return skb;
+}
+
+static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
+ unsigned int rtap_vendor_space)
+{
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr;
+
+ hdr = (void *)(skb->data + rtap_vendor_space);
+
+ if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
+ RX_FLAG_FAILED_PLCP_CRC |
+ RX_FLAG_AMPDU_IS_ZEROLEN))
+ return true;
+
+ if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
+ return true;
+
+ if (ieee80211_is_ctl(hdr->frame_control) &&
+ !ieee80211_is_pspoll(hdr->frame_control) &&
+ !ieee80211_is_back_req(hdr->frame_control))
+ return true;
+
+ return false;
+}
+
+static int
+ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
+ struct ieee80211_rx_status *status,
+ struct sk_buff *skb)
+{
+ int len;
+
+ /* always present fields */
+ len = sizeof(struct ieee80211_radiotap_header) + 8;
+
+ /* allocate extra bitmaps */
+ if (status->chains)
+ len += 4 * hweight8(status->chains);
+
+ if (ieee80211_have_rx_timestamp(status)) {
+ len = ALIGN(len, 8);
+ len += 8;
+ }
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ len += 1;
+
+ /* antenna field, if we don't have per-chain info */
+ if (!status->chains)
+ len += 1;
+
+ /* padding for RX_FLAGS if necessary */
+ len = ALIGN(len, 2);
+
+ if (status->flag & RX_FLAG_HT) /* HT info */
+ len += 3;
+
+ if (status->flag & RX_FLAG_AMPDU_DETAILS) {
+ len = ALIGN(len, 4);
+ len += 8;
+ }
+
+ if (status->flag & RX_FLAG_VHT) {
+ len = ALIGN(len, 2);
+ len += 12;
+ }
+
+ if (status->chains) {
+ /* antenna and antenna signal fields */
+ len += 2 * hweight8(status->chains);
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
+ struct ieee80211_vendor_radiotap *rtap = (void *)skb->data;
+
+ /* vendor presence bitmap */
+ len += 4;
+ /* alignment for fixed 6-byte vendor data header */
+ len = ALIGN(len, 2);
+ /* vendor data header */
+ len += 6;
+ if (WARN_ON(rtap->align == 0))
+ rtap->align = 1;
+ len = ALIGN(len, rtap->align);
+ len += rtap->len + rtap->pad;
+ }
+
+ return len;
+}
+
+/*
+ * ieee80211_add_rx_radiotap_header - add radiotap header
+ *
+ * add a radiotap header containing all the fields which the hardware provided.
+ */
+static void
+ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ struct ieee80211_rate *rate,
+ int rtap_len, bool has_fcs)
+{
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_radiotap_header *rthdr;
+ unsigned char *pos;
+ __le32 *it_present;
+ u32 it_present_val;
+ u16 rx_flags = 0;
+ u16 channel_flags = 0;
+ int mpdulen, chain;
+ unsigned long chains = status->chains;
+ struct ieee80211_vendor_radiotap rtap = {};
+
+ if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
+ rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
+ /* rtap.len and rtap.pad are undone immediately */
+ skb_pull(skb, sizeof(rtap) + rtap.len + rtap.pad);
+ }
+
+ mpdulen = skb->len;
+ if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)))
+ mpdulen += FCS_LEN;
+
+ rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len);
+ memset(rthdr, 0, rtap_len - rtap.len - rtap.pad);
+ it_present = &rthdr->it_present;
+
+ /* radiotap header, set always present flags */
+ rthdr->it_len = cpu_to_le16(rtap_len);
+ it_present_val = BIT(IEEE80211_RADIOTAP_FLAGS) |
+ BIT(IEEE80211_RADIOTAP_CHANNEL) |
+ BIT(IEEE80211_RADIOTAP_RX_FLAGS);
+
+ if (!status->chains)
+ it_present_val |= BIT(IEEE80211_RADIOTAP_ANTENNA);
+
+ for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
+ it_present_val |=
+ BIT(IEEE80211_RADIOTAP_EXT) |
+ BIT(IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE);
+ put_unaligned_le32(it_present_val, it_present);
+ it_present++;
+ it_present_val = BIT(IEEE80211_RADIOTAP_ANTENNA) |
+ BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
+ it_present_val |= BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE) |
+ BIT(IEEE80211_RADIOTAP_EXT);
+ put_unaligned_le32(it_present_val, it_present);
+ it_present++;
+ it_present_val = rtap.present;
+ }
+
+ put_unaligned_le32(it_present_val, it_present);
+
+ pos = (void *)(it_present + 1);
+
+ /* the order of the following fields is important */
+
+ /* IEEE80211_RADIOTAP_TSFT */
+ if (ieee80211_have_rx_timestamp(status)) {
+ /* padding */
+ while ((pos - (u8 *)rthdr) & 7)
+ *pos++ = 0;
+ put_unaligned_le64(
+ ieee80211_calculate_rx_timestamp(local, status,
+ mpdulen, 0),
+ pos);
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT);
+ pos += 8;
+ }
+
+ /* IEEE80211_RADIOTAP_FLAGS */
+ if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))
+ *pos |= IEEE80211_RADIOTAP_F_FCS;
+ if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
+ *pos |= IEEE80211_RADIOTAP_F_BADFCS;
+ if (status->flag & RX_FLAG_SHORTPRE)
+ *pos |= IEEE80211_RADIOTAP_F_SHORTPRE;
+ pos++;
+
+ /* IEEE80211_RADIOTAP_RATE */
+ if (!rate || status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) {
+ /*
+ * Without rate information don't add it. If we have,
+ * MCS information is a separate field in radiotap,
+ * added below. The byte here is needed as padding
+ * for the channel though, so initialise it to 0.
+ */
+ *pos = 0;
+ } else {
+ int shift = 0;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
+ if (status->flag & RX_FLAG_10MHZ)
+ shift = 1;
+ else if (status->flag & RX_FLAG_5MHZ)
+ shift = 2;
+ *pos = DIV_ROUND_UP(rate->bitrate, 5 * (1 << shift));
+ }
+ pos++;
+
+ /* IEEE80211_RADIOTAP_CHANNEL */
+ put_unaligned_le16(status->freq, pos);
+ pos += 2;
+ if (status->flag & RX_FLAG_10MHZ)
+ channel_flags |= IEEE80211_CHAN_HALF;
+ else if (status->flag & RX_FLAG_5MHZ)
+ channel_flags |= IEEE80211_CHAN_QUARTER;
+
+ if (status->band == IEEE80211_BAND_5GHZ)
+ channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ;
+ else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
+ channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ;
+ else if (rate && rate->flags & IEEE80211_RATE_ERP_G)
+ channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ;
+ else if (rate)
+ channel_flags |= IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ;
+ else
+ channel_flags |= IEEE80211_CHAN_2GHZ;
+ put_unaligned_le16(channel_flags, pos);
+ pos += 2;
+
+ /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM &&
+ !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
+ *pos = status->signal;
+ rthdr->it_present |=
+ cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
+ pos++;
+ }
+
+ /* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */
+
+ if (!status->chains) {
+ /* IEEE80211_RADIOTAP_ANTENNA */
+ *pos = status->antenna;
+ pos++;
+ }
+
+ /* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */
+
+ /* IEEE80211_RADIOTAP_RX_FLAGS */
+ /* ensure 2 byte alignment for the 2 byte field as required */
+ if ((pos - (u8 *)rthdr) & 1)
+ *pos++ = 0;
+ if (status->flag & RX_FLAG_FAILED_PLCP_CRC)
+ rx_flags |= IEEE80211_RADIOTAP_F_RX_BADPLCP;
+ put_unaligned_le16(rx_flags, pos);
+ pos += 2;
+
+ if (status->flag & RX_FLAG_HT) {
+ unsigned int stbc;
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
+ *pos++ = local->hw.radiotap_mcs_details;
+ *pos = 0;
+ if (status->flag & RX_FLAG_SHORT_GI)
+ *pos |= IEEE80211_RADIOTAP_MCS_SGI;
+ if (status->flag & RX_FLAG_40MHZ)
+ *pos |= IEEE80211_RADIOTAP_MCS_BW_40;
+ if (status->flag & RX_FLAG_HT_GF)
+ *pos |= IEEE80211_RADIOTAP_MCS_FMT_GF;
+ if (status->flag & RX_FLAG_LDPC)
+ *pos |= IEEE80211_RADIOTAP_MCS_FEC_LDPC;
+ stbc = (status->flag & RX_FLAG_STBC_MASK) >> RX_FLAG_STBC_SHIFT;
+ *pos |= stbc << IEEE80211_RADIOTAP_MCS_STBC_SHIFT;
+ pos++;
+ *pos++ = status->rate_idx;
+ }
+
+ if (status->flag & RX_FLAG_AMPDU_DETAILS) {
+ u16 flags = 0;
+
+ /* ensure 4 byte alignment */
+ while ((pos - (u8 *)rthdr) & 3)
+ pos++;
+ rthdr->it_present |=
+ cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS);
+ put_unaligned_le32(status->ampdu_reference, pos);
+ pos += 4;
+ if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN)
+ flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN;
+ if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN)
+ flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN;
+ if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN)
+ flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN;
+ if (status->flag & RX_FLAG_AMPDU_IS_LAST)
+ flags |= IEEE80211_RADIOTAP_AMPDU_IS_LAST;
+ if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_ERROR)
+ flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR;
+ if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
+ flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN;
+ put_unaligned_le16(flags, pos);
+ pos += 2;
+ if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
+ *pos++ = status->ampdu_delimiter_crc;
+ else
+ *pos++ = 0;
+ *pos++ = 0;
+ }
+
+ if (status->flag & RX_FLAG_VHT) {
+ u16 known = local->hw.radiotap_vht_details;
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT);
+ put_unaligned_le16(known, pos);
+ pos += 2;
+ /* flags */
+ if (status->flag & RX_FLAG_SHORT_GI)
+ *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
+ /* in VHT, STBC is binary */
+ if (status->flag & RX_FLAG_STBC_MASK)
+ *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC;
+ if (status->vht_flag & RX_VHT_FLAG_BF)
+ *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED;
+ pos++;
+ /* bandwidth */
+ if (status->vht_flag & RX_VHT_FLAG_80MHZ)
+ *pos++ = 4;
+ else if (status->vht_flag & RX_VHT_FLAG_160MHZ)
+ *pos++ = 11;
+ else if (status->flag & RX_FLAG_40MHZ)
+ *pos++ = 1;
+ else /* 20 MHz */
+ *pos++ = 0;
+ /* MCS/NSS */
+ *pos = (status->rate_idx << 4) | status->vht_nss;
+ pos += 4;
+ /* coding field */
+ if (status->flag & RX_FLAG_LDPC)
+ *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0;
+ pos++;
+ /* group ID */
+ pos++;
+ /* partial_aid */
+ pos += 2;
+ }
+
+ for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
+ *pos++ = status->chain_signal[chain];
+ *pos++ = chain;
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
+ /* ensure 2 byte alignment for the vendor field as required */
+ if ((pos - (u8 *)rthdr) & 1)
+ *pos++ = 0;
+ *pos++ = rtap.oui[0];
+ *pos++ = rtap.oui[1];
+ *pos++ = rtap.oui[2];
+ *pos++ = rtap.subns;
+ put_unaligned_le16(rtap.len, pos);
+ pos += 2;
+ /* align the actual payload as requested */
+ while ((pos - (u8 *)rthdr) & (rtap.align - 1))
+ *pos++ = 0;
+ /* data (and possible padding) already follows */
+ }
+}
+
+/*
+ * This function copies a received frame to all monitor interfaces and
+ * returns a cleaned-up SKB that no longer includes the FCS nor the
+ * radiotap header the driver might have added.
+ */
+static struct sk_buff *
+ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
+ struct ieee80211_rate *rate)
+{
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb);
+ struct ieee80211_sub_if_data *sdata;
+ int rt_hdrlen, needed_headroom;
+ struct sk_buff *skb, *skb2;
+ struct net_device *prev_dev = NULL;
+ int present_fcs_len = 0;
+ unsigned int rtap_vendor_space = 0;
+
+ if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
+ struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
+
+ rtap_vendor_space = sizeof(*rtap) + rtap->len + rtap->pad;
+ }
+
+ /*
+ * First, we may need to make a copy of the skb because
+ * (1) we need to modify it for radiotap (if not present), and
+ * (2) the other RX handlers will modify the skb we got.
+ *
+ * We don't need to, of course, if we aren't going to return
+ * the SKB because it has a bad FCS/PLCP checksum.
+ */
+
+ if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
+ present_fcs_len = FCS_LEN;
+
+ /* ensure hdr->frame_control and vendor radiotap data are in skb head */
+ if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) {
+ dev_kfree_skb(origskb);
+ return NULL;
+ }
+
+ if (!local->monitors) {
+ if (should_drop_frame(origskb, present_fcs_len,
+ rtap_vendor_space)) {
+ dev_kfree_skb(origskb);
+ return NULL;
+ }
+
+ return remove_monitor_info(local, origskb, rtap_vendor_space);
+ }
+
+ /* room for the radiotap header based on driver features */
+ rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb);
+ needed_headroom = rt_hdrlen - rtap_vendor_space;
+
+ if (should_drop_frame(origskb, present_fcs_len, rtap_vendor_space)) {
+ /* only need to expand headroom if necessary */
+ skb = origskb;
+ origskb = NULL;
+
+ /*
+ * This shouldn't trigger often because most devices have an
+ * RX header they pull before we get here, and that should
+ * be big enough for our radiotap information. We should
+ * probably export the length to drivers so that we can have
+ * them allocate enough headroom to start with.
+ */
+ if (skb_headroom(skb) < needed_headroom &&
+ pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) {
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+ } else {
+ /*
+ * Need to make a copy and possibly remove radiotap header
+ * and FCS from the original.
+ */
+ skb = skb_copy_expand(origskb, needed_headroom, 0, GFP_ATOMIC);
+
+ origskb = remove_monitor_info(local, origskb,
+ rtap_vendor_space);
+
+ if (!skb)
+ return origskb;
+ }
+
+ /* prepend radiotap information */
+ ieee80211_add_rx_radiotap_header(local, skb, rate, rt_hdrlen, true);
+
+ skb_reset_mac_header(skb);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = htons(ETH_P_802_2);
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ continue;
+
+ if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
+ continue;
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (prev_dev) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ skb2->dev = prev_dev;
+ netif_receive_skb(skb2);
+ }
+ }
+
+ prev_dev = sdata->dev;
+ sdata->dev->stats.rx_packets++;
+ sdata->dev->stats.rx_bytes += skb->len;
+ }
+
+ if (prev_dev) {
+ skb->dev = prev_dev;
+ netif_receive_skb(skb);
+ } else
+ dev_kfree_skb(skb);
+
+ return origskb;
+}
+
+static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ int tid, seqno_idx, security_idx;
+
+ /* does the frame have a qos control field? */
+ if (ieee80211_is_data_qos(hdr->frame_control)) {
+ u8 *qc = ieee80211_get_qos_ctl(hdr);
+ /* frame has qos control */
+ tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+ if (*qc & IEEE80211_QOS_CTL_A_MSDU_PRESENT)
+ status->rx_flags |= IEEE80211_RX_AMSDU;
+
+ seqno_idx = tid;
+ security_idx = tid;
+ } else {
+ /*
+ * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
+ *
+ * Sequence numbers for management frames, QoS data
+ * frames with a broadcast/multicast address in the
+ * Address 1 field, and all non-QoS data frames sent
+ * by QoS STAs are assigned using an additional single
+ * modulo-4096 counter, [...]
+ *
+ * We also use that counter for non-QoS STAs.
+ */
+ seqno_idx = IEEE80211_NUM_TIDS;
+ security_idx = 0;
+ if (ieee80211_is_mgmt(hdr->frame_control))
+ security_idx = IEEE80211_NUM_TIDS;
+ tid = 0;
+ }
+
+ rx->seqno_idx = seqno_idx;
+ rx->security_idx = security_idx;
+ /* Set skb->priority to 1d tag if highest order bit of TID is not set.
+ * For now, set skb->priority to 0 for other cases. */
+ rx->skb->priority = (tid > 7) ? 0 : tid;
+}
+
+/**
+ * DOC: Packet alignment
+ *
+ * Drivers always need to pass packets that are aligned to two-byte boundaries
+ * to the stack.
+ *
+ * Additionally, should, if possible, align the payload data in a way that
+ * guarantees that the contained IP header is aligned to a four-byte
+ * boundary. In the case of regular frames, this simply means aligning the
+ * payload to a four-byte boundary (because either the IP header is directly
+ * contained, or IV/RFC1042 headers that have a length divisible by four are
+ * in front of it). If the payload data is not properly aligned and the
+ * architecture doesn't support efficient unaligned operations, mac80211
+ * will align the data.
+ *
+ * With A-MSDU frames, however, the payload data address must yield two modulo
+ * four because there are 14-byte 802.3 headers within the A-MSDU frames that
+ * push the IP header further back to a multiple of four again. Thankfully, the
+ * specs were sane enough this time around to require padding each A-MSDU
+ * subframe to a length that is a multiple of four.
+ *
+ * Padding like Atheros hardware adds which is between the 802.11 header and
+ * the payload is not supported, the driver is required to move the 802.11
+ * header to be directly in front of the payload in that case.
+ */
+static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx)
+{
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ WARN_ONCE((unsigned long)rx->skb->data & 1,
+ "unaligned packet at 0x%p\n", rx->skb->data);
+#endif
+}
+
+
+/* rx handlers */
+
+static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+ if (is_multicast_ether_addr(hdr->addr1))
+ return 0;
+
+ return ieee80211_is_robust_mgmt_frame(skb);
+}
+
+
+static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+ if (!is_multicast_ether_addr(hdr->addr1))
+ return 0;
+
+ return ieee80211_is_robust_mgmt_frame(skb);
+}
+
+
+/* Get the BIP key index from MMIE; return -1 if this is not a BIP frame */
+static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
+{
+ struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data;
+ struct ieee80211_mmie *mmie;
+ struct ieee80211_mmie_16 *mmie16;
+
+ if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da))
+ return -1;
+
+ if (!ieee80211_is_robust_mgmt_frame(skb))
+ return -1; /* not a robust management frame */
+
+ mmie = (struct ieee80211_mmie *)
+ (skb->data + skb->len - sizeof(*mmie));
+ if (mmie->element_id == WLAN_EID_MMIE &&
+ mmie->length == sizeof(*mmie) - 2)
+ return le16_to_cpu(mmie->key_id);
+
+ mmie16 = (struct ieee80211_mmie_16 *)
+ (skb->data + skb->len - sizeof(*mmie16));
+ if (skb->len >= 24 + sizeof(*mmie16) &&
+ mmie16->element_id == WLAN_EID_MMIE &&
+ mmie16->length == sizeof(*mmie16) - 2)
+ return le16_to_cpu(mmie16->key_id);
+
+ return -1;
+}
+
+static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ __le16 fc;
+ int hdrlen;
+ u8 keyid;
+
+ fc = hdr->frame_control;
+ hdrlen = ieee80211_hdrlen(fc);
+
+ if (skb->len < hdrlen + cs->hdr_len)
+ return -EINVAL;
+
+ skb_copy_bits(skb, hdrlen + cs->key_idx_off, &keyid, 1);
+ keyid &= cs->key_idx_mask;
+ keyid >>= cs->key_idx_shift;
+
+ return keyid;
+}
+
+static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ char *dev_addr = rx->sdata->vif.addr;
+
+ if (ieee80211_is_data(hdr->frame_control)) {
+ if (is_multicast_ether_addr(hdr->addr1)) {
+ if (ieee80211_has_tods(hdr->frame_control) ||
+ !ieee80211_has_fromds(hdr->frame_control))
+ return RX_DROP_MONITOR;
+ if (ether_addr_equal(hdr->addr3, dev_addr))
+ return RX_DROP_MONITOR;
+ } else {
+ if (!ieee80211_has_a4(hdr->frame_control))
+ return RX_DROP_MONITOR;
+ if (ether_addr_equal(hdr->addr4, dev_addr))
+ return RX_DROP_MONITOR;
+ }
+ }
+
+ /* If there is not an established peer link and this is not a peer link
+ * establisment frame, beacon or probe, drop the frame.
+ */
+
+ if (!rx->sta || sta_plink_state(rx->sta) != NL80211_PLINK_ESTAB) {
+ struct ieee80211_mgmt *mgmt;
+
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ return RX_DROP_MONITOR;
+
+ if (ieee80211_is_action(hdr->frame_control)) {
+ u8 category;
+
+ /* make sure category field is present */
+ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE)
+ return RX_DROP_MONITOR;
+
+ mgmt = (struct ieee80211_mgmt *)hdr;
+ category = mgmt->u.action.category;
+ if (category != WLAN_CATEGORY_MESH_ACTION &&
+ category != WLAN_CATEGORY_SELF_PROTECTED)
+ return RX_DROP_MONITOR;
+ return RX_CONTINUE;
+ }
+
+ if (ieee80211_is_probe_req(hdr->frame_control) ||
+ ieee80211_is_probe_resp(hdr->frame_control) ||
+ ieee80211_is_beacon(hdr->frame_control) ||
+ ieee80211_is_auth(hdr->frame_control))
+ return RX_CONTINUE;
+
+ return RX_DROP_MONITOR;
+ }
+
+ return RX_CONTINUE;
+}
+
+static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
+ struct tid_ampdu_rx *tid_agg_rx,
+ int index,
+ struct sk_buff_head *frames)
+{
+ struct sk_buff_head *skb_list = &tid_agg_rx->reorder_buf[index];
+ struct sk_buff *skb;
+ struct ieee80211_rx_status *status;
+
+ lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+ if (skb_queue_empty(skb_list))
+ goto no_frame;
+
+ if (!ieee80211_rx_reorder_ready(skb_list)) {
+ __skb_queue_purge(skb_list);
+ goto no_frame;
+ }
+
+ /* release frames from the reorder ring buffer */
+ tid_agg_rx->stored_mpdu_num--;
+ while ((skb = __skb_dequeue(skb_list))) {
+ status = IEEE80211_SKB_RXCB(skb);
+ status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE;
+ __skb_queue_tail(frames, skb);
+ }
+
+no_frame:
+ tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
+}
+
+static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata,
+ struct tid_ampdu_rx *tid_agg_rx,
+ u16 head_seq_num,
+ struct sk_buff_head *frames)
+{
+ int index;
+
+ lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+ while (ieee80211_sn_less(tid_agg_rx->head_seq_num, head_seq_num)) {
+ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
+ ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
+ frames);
+ }
+}
+
+/*
+ * Timeout (in jiffies) for skb's that are waiting in the RX reorder buffer. If
+ * the skb was added to the buffer longer than this time ago, the earlier
+ * frames that have not yet been received are assumed to be lost and the skb
+ * can be released for processing. This may also release other skb's from the
+ * reorder buffer if there are no additional gaps between the frames.
+ *
+ * Callers must hold tid_agg_rx->reorder_lock.
+ */
+#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
+
+static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
+ struct tid_ampdu_rx *tid_agg_rx,
+ struct sk_buff_head *frames)
+{
+ int index, i, j;
+
+ lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+ /* release the buffer until next missing frame */
+ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
+ if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) &&
+ tid_agg_rx->stored_mpdu_num) {
+ /*
+ * No buffers ready to be released, but check whether any
+ * frames in the reorder buffer have timed out.
+ */
+ int skipped = 1;
+ for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
+ j = (j + 1) % tid_agg_rx->buf_size) {
+ if (!ieee80211_rx_reorder_ready(
+ &tid_agg_rx->reorder_buf[j])) {
+ skipped++;
+ continue;
+ }
+ if (skipped &&
+ !time_after(jiffies, tid_agg_rx->reorder_time[j] +
+ HT_RX_REORDER_BUF_TIMEOUT))
+ goto set_release_timer;
+
+ /* don't leave incomplete A-MSDUs around */
+ for (i = (index + 1) % tid_agg_rx->buf_size; i != j;
+ i = (i + 1) % tid_agg_rx->buf_size)
+ __skb_queue_purge(&tid_agg_rx->reorder_buf[i]);
+
+ ht_dbg_ratelimited(sdata,
+ "release an RX reorder frame due to timeout on earlier frames\n");
+ ieee80211_release_reorder_frame(sdata, tid_agg_rx, j,
+ frames);
+
+ /*
+ * Increment the head seq# also for the skipped slots.
+ */
+ tid_agg_rx->head_seq_num =
+ (tid_agg_rx->head_seq_num +
+ skipped) & IEEE80211_SN_MASK;
+ skipped = 0;
+ }
+ } else while (ieee80211_rx_reorder_ready(
+ &tid_agg_rx->reorder_buf[index])) {
+ ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
+ frames);
+ index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
+ }
+
+ if (tid_agg_rx->stored_mpdu_num) {
+ j = index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
+
+ for (; j != (index - 1) % tid_agg_rx->buf_size;
+ j = (j + 1) % tid_agg_rx->buf_size) {
+ if (ieee80211_rx_reorder_ready(
+ &tid_agg_rx->reorder_buf[j]))
+ break;
+ }
+
+ set_release_timer:
+
+ if (!tid_agg_rx->removed)
+ mod_timer(&tid_agg_rx->reorder_timer,
+ tid_agg_rx->reorder_time[j] + 1 +
+ HT_RX_REORDER_BUF_TIMEOUT);
+ } else {
+ del_timer(&tid_agg_rx->reorder_timer);
+ }
+}
+
+/*
+ * As this function belongs to the RX path it must be under
+ * rcu_read_lock protection. It returns false if the frame
+ * can be processed immediately, true if it was consumed.
+ */
+static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata,
+ struct tid_ampdu_rx *tid_agg_rx,
+ struct sk_buff *skb,
+ struct sk_buff_head *frames)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ u16 sc = le16_to_cpu(hdr->seq_ctrl);
+ u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
+ u16 head_seq_num, buf_size;
+ int index;
+ bool ret = true;
+
+ spin_lock(&tid_agg_rx->reorder_lock);
+
+ /*
+ * Offloaded BA sessions have no known starting sequence number so pick
+ * one from first Rxed frame for this tid after BA was started.
+ */
+ if (unlikely(tid_agg_rx->auto_seq)) {
+ tid_agg_rx->auto_seq = false;
+ tid_agg_rx->ssn = mpdu_seq_num;
+ tid_agg_rx->head_seq_num = mpdu_seq_num;
+ }
+
+ buf_size = tid_agg_rx->buf_size;
+ head_seq_num = tid_agg_rx->head_seq_num;
+
+ /* frame with out of date sequence number */
+ if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
+ dev_kfree_skb(skb);
+ goto out;
+ }
+
+ /*
+ * If frame the sequence number exceeds our buffering window
+ * size release some previous frames to make room for this one.
+ */
+ if (!ieee80211_sn_less(mpdu_seq_num, head_seq_num + buf_size)) {
+ head_seq_num = ieee80211_sn_inc(
+ ieee80211_sn_sub(mpdu_seq_num, buf_size));
+ /* release stored frames up to new head to stack */
+ ieee80211_release_reorder_frames(sdata, tid_agg_rx,
+ head_seq_num, frames);
+ }
+
+ /* Now the new frame is always in the range of the reordering buffer */
+
+ index = mpdu_seq_num % tid_agg_rx->buf_size;
+
+ /* check if we already stored this frame */
+ if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) {
+ dev_kfree_skb(skb);
+ goto out;
+ }
+
+ /*
+ * If the current MPDU is in the right order and nothing else
+ * is stored we can process it directly, no need to buffer it.
+ * If it is first but there's something stored, we may be able
+ * to release frames after this one.
+ */
+ if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
+ tid_agg_rx->stored_mpdu_num == 0) {
+ if (!(status->flag & RX_FLAG_AMSDU_MORE))
+ tid_agg_rx->head_seq_num =
+ ieee80211_sn_inc(tid_agg_rx->head_seq_num);
+ ret = false;
+ goto out;
+ }
+
+ /* put the frame in the reordering buffer */
+ __skb_queue_tail(&tid_agg_rx->reorder_buf[index], skb);
+ if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
+ tid_agg_rx->reorder_time[index] = jiffies;
+ tid_agg_rx->stored_mpdu_num++;
+ ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames);
+ }
+
+ out:
+ spin_unlock(&tid_agg_rx->reorder_lock);
+ return ret;
+}
+
+/*
+ * Reorder MPDUs from A-MPDUs, keeping them on a buffer. Returns
+ * true if the MPDU was buffered, false if it should be processed.
+ */
+static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
+ struct sk_buff_head *frames)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct sta_info *sta = rx->sta;
+ struct tid_ampdu_rx *tid_agg_rx;
+ u16 sc;
+ u8 tid, ack_policy;
+
+ if (!ieee80211_is_data_qos(hdr->frame_control) ||
+ is_multicast_ether_addr(hdr->addr1))
+ goto dont_reorder;
+
+ /*
+ * filter the QoS data rx stream according to
+ * STA/TID and check if this STA/TID is on aggregation
+ */
+
+ if (!sta)
+ goto dont_reorder;
+
+ ack_policy = *ieee80211_get_qos_ctl(hdr) &
+ IEEE80211_QOS_CTL_ACK_POLICY_MASK;
+ tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+
+ tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+ if (!tid_agg_rx)
+ goto dont_reorder;
+
+ /* qos null data frames are excluded */
+ if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
+ goto dont_reorder;
+
+ /* not part of a BA session */
+ if (ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
+ ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL)
+ goto dont_reorder;
+
+ /* not actually part of this BA session */
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ goto dont_reorder;
+
+ /* new, potentially un-ordered, ampdu frame - process it */
+
+ /* reset session timer */
+ if (tid_agg_rx->timeout)
+ tid_agg_rx->last_rx = jiffies;
+
+ /* if this mpdu is fragmented - terminate rx aggregation session */
+ sc = le16_to_cpu(hdr->seq_ctrl);
+ if (sc & IEEE80211_SCTL_FRAG) {
+ skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&rx->sdata->skb_queue, skb);
+ ieee80211_queue_work(&local->hw, &rx->sdata->work);
+ return;
+ }
+
+ /*
+ * No locking needed -- we will only ever process one
+ * RX packet at a time, and thus own tid_agg_rx. All
+ * other code manipulating it needs to (and does) make
+ * sure that we cannot get to it any more before doing
+ * anything with it.
+ */
+ if (ieee80211_sta_manage_reorder_buf(rx->sdata, tid_agg_rx, skb,
+ frames))
+ return;
+
+ dont_reorder:
+ __skb_queue_tail(frames, skb);
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ /*
+ * Drop duplicate 802.11 retransmissions
+ * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery")
+ */
+
+ if (rx->skb->len < 24)
+ return RX_CONTINUE;
+
+ if (ieee80211_is_ctl(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control) ||
+ is_multicast_ether_addr(hdr->addr1))
+ return RX_CONTINUE;
+
+ if (rx->sta) {
+ if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
+ rx->sta->last_seq_ctrl[rx->seqno_idx] ==
+ hdr->seq_ctrl)) {
+ if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
+ rx->local->dot11FrameDuplicateCount++;
+ rx->sta->num_duplicates++;
+ }
+ return RX_DROP_UNUSABLE;
+ } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
+ rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl;
+ }
+ }
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+
+ if (unlikely(rx->skb->len < 16)) {
+ I802_DEBUG_INC(rx->local->rx_handlers_drop_short);
+ return RX_DROP_MONITOR;
+ }
+
+ /* Drop disallowed frame classes based on STA auth/assoc state;
+ * IEEE 802.11, Chap 5.5.
+ *
+ * mac80211 filters only based on association state, i.e. it drops
+ * Class 3 frames from not associated stations. hostapd sends
+ * deauth/disassoc frames when needed. In addition, hostapd is
+ * responsible for filtering on both auth and assoc states.
+ */
+
+ if (ieee80211_vif_is_mesh(&rx->sdata->vif))
+ return ieee80211_rx_mesh_check(rx);
+
+ if (unlikely((ieee80211_is_data(hdr->frame_control) ||
+ ieee80211_is_pspoll(hdr->frame_control)) &&
+ rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
+ rx->sdata->vif.type != NL80211_IFTYPE_OCB &&
+ (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) {
+ /*
+ * accept port control frames from the AP even when it's not
+ * yet marked ASSOC to prevent a race where we don't set the
+ * assoc bit quickly enough before it sends the first frame
+ */
+ if (rx->sta && rx->sdata->vif.type == NL80211_IFTYPE_STATION &&
+ ieee80211_is_data_present(hdr->frame_control)) {
+ unsigned int hdrlen;
+ __be16 ethertype;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ if (rx->skb->len < hdrlen + 8)
+ return RX_DROP_MONITOR;
+
+ skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2);
+ if (ethertype == rx->sdata->control_port_protocol)
+ return RX_CONTINUE;
+ }
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
+ cfg80211_rx_spurious_frame(rx->sdata->dev,
+ hdr->addr2,
+ GFP_ATOMIC))
+ return RX_DROP_UNUSABLE;
+
+ return RX_DROP_MONITOR;
+ }
+
+ return RX_CONTINUE;
+}
+
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_local *local;
+ struct ieee80211_hdr *hdr;
+ struct sk_buff *skb;
+
+ local = rx->local;
+ skb = rx->skb;
+ hdr = (struct ieee80211_hdr *) skb->data;
+
+ if (!local->pspolling)
+ return RX_CONTINUE;
+
+ if (!ieee80211_has_fromds(hdr->frame_control))
+ /* this is not from AP */
+ return RX_CONTINUE;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return RX_CONTINUE;
+
+ if (!ieee80211_has_moredata(hdr->frame_control)) {
+ /* AP has no more frames buffered for us */
+ local->pspolling = false;
+ return RX_CONTINUE;
+ }
+
+ /* more data bit is set, let's request a new frame from the AP */
+ ieee80211_send_pspoll(local, rx->sdata);
+
+ return RX_CONTINUE;
+}
+
+static void sta_ps_start(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct ps_data *ps;
+ int tid;
+
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ ps = &sdata->bss->ps;
+ else
+ return;
+
+ atomic_inc(&ps->num_sta_ps);
+ set_sta_flag(sta, WLAN_STA_PS_STA);
+ if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
+ drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta);
+ ps_dbg(sdata, "STA %pM aid %d enters power save mode\n",
+ sta->sta.addr, sta->sta.aid);
+
+ if (!sta->sta.txq[0])
+ return;
+
+ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
+ struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
+
+ if (!skb_queue_len(&txqi->queue))
+ set_bit(tid, &sta->txq_buffered_tids);
+ else
+ clear_bit(tid, &sta->txq_buffered_tids);
+ }
+}
+
+static void sta_ps_end(struct sta_info *sta)
+{
+ ps_dbg(sta->sdata, "STA %pM aid %d exits power save mode\n",
+ sta->sta.addr, sta->sta.aid);
+
+ if (test_sta_flag(sta, WLAN_STA_PS_DRIVER)) {
+ /*
+ * Clear the flag only if the other one is still set
+ * so that the TX path won't start TX'ing new frames
+ * directly ... In the case that the driver flag isn't
+ * set ieee80211_sta_ps_deliver_wakeup() will clear it.
+ */
+ clear_sta_flag(sta, WLAN_STA_PS_STA);
+ ps_dbg(sta->sdata, "STA %pM aid %d driver-ps-blocked\n",
+ sta->sta.addr, sta->sta.aid);
+ return;
+ }
+
+ set_sta_flag(sta, WLAN_STA_PS_DELIVER);
+ clear_sta_flag(sta, WLAN_STA_PS_STA);
+ ieee80211_sta_ps_deliver_wakeup(sta);
+}
+
+int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start)
+{
+ struct sta_info *sta_inf = container_of(sta, struct sta_info, sta);
+ bool in_ps;
+
+ WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS));
+
+ /* Don't let the same PS state be set twice */
+ in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA);
+ if ((start && in_ps) || (!start && !in_ps))
+ return -EINVAL;
+
+ if (start)
+ sta_ps_start(sta_inf);
+ else
+ sta_ps_end(sta_inf);
+
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_sta_ps_transition);
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_hdr *hdr = (void *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ int tid, ac;
+
+ if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_CONTINUE;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
+ return RX_CONTINUE;
+
+ /*
+ * The device handles station powersave, so don't do anything about
+ * uAPSD and PS-Poll frames (the latter shouldn't even come up from
+ * it to mac80211 since they're handled.)
+ */
+ if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS)
+ return RX_CONTINUE;
+
+ /*
+ * Don't do anything if the station isn't already asleep. In
+ * the uAPSD case, the station will probably be marked asleep,
+ * in the PS-Poll case the station must be confused ...
+ */
+ if (!test_sta_flag(rx->sta, WLAN_STA_PS_STA))
+ return RX_CONTINUE;
+
+ if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) {
+ if (!test_sta_flag(rx->sta, WLAN_STA_SP)) {
+ if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
+ ieee80211_sta_ps_deliver_poll_response(rx->sta);
+ else
+ set_sta_flag(rx->sta, WLAN_STA_PSPOLL);
+ }
+
+ /* Free PS Poll skb here instead of returning RX_DROP that would
+ * count as an dropped frame. */
+ dev_kfree_skb(rx->skb);
+
+ return RX_QUEUED;
+ } else if (!ieee80211_has_morefrags(hdr->frame_control) &&
+ !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
+ ieee80211_has_pm(hdr->frame_control) &&
+ (ieee80211_is_data_qos(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control))) {
+ tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+ ac = ieee802_1d_to_ac[tid & 7];
+
+ /*
+ * If this AC is not trigger-enabled do nothing.
+ *
+ * NB: This could/should check a separate bitmap of trigger-
+ * enabled queues, but for now we only implement uAPSD w/o
+ * TSPEC changes to the ACs, so they're always the same.
+ */
+ if (!(rx->sta->sta.uapsd_queues & BIT(ac)))
+ return RX_CONTINUE;
+
+ /* if we are in a service period, do nothing */
+ if (test_sta_flag(rx->sta, WLAN_STA_SP))
+ return RX_CONTINUE;
+
+ if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
+ ieee80211_sta_ps_deliver_uapsd(rx->sta);
+ else
+ set_sta_flag(rx->sta, WLAN_STA_UAPSD);
+ }
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
+{
+ struct sta_info *sta = rx->sta;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ int i;
+
+ if (!sta)
+ return RX_CONTINUE;
+
+ /*
+ * Update last_rx only for IBSS packets which are for the current
+ * BSSID and for station already AUTHORIZED to avoid keeping the
+ * current IBSS network alive in cases where other STAs start
+ * using different BSSID. This will also give the station another
+ * chance to restart the authentication/authorization in case
+ * something went wrong the first time.
+ */
+ if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
+ NL80211_IFTYPE_ADHOC);
+ if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) &&
+ test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
+ sta->last_rx = jiffies;
+ if (ieee80211_is_data(hdr->frame_control) &&
+ !is_multicast_ether_addr(hdr->addr1)) {
+ sta->last_rx_rate_idx = status->rate_idx;
+ sta->last_rx_rate_flag = status->flag;
+ sta->last_rx_rate_vht_flag = status->vht_flag;
+ sta->last_rx_rate_vht_nss = status->vht_nss;
+ }
+ }
+ } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) {
+ u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
+ NL80211_IFTYPE_OCB);
+ /* OCB uses wild-card BSSID */
+ if (is_broadcast_ether_addr(bssid))
+ sta->last_rx = jiffies;
+ } else if (!is_multicast_ether_addr(hdr->addr1)) {
+ /*
+ * Mesh beacons will update last_rx when if they are found to
+ * match the current local configuration when processed.
+ */
+ sta->last_rx = jiffies;
+ if (ieee80211_is_data(hdr->frame_control)) {
+ sta->last_rx_rate_idx = status->rate_idx;
+ sta->last_rx_rate_flag = status->flag;
+ sta->last_rx_rate_vht_flag = status->vht_flag;
+ sta->last_rx_rate_vht_nss = status->vht_nss;
+ }
+ }
+
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_CONTINUE;
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
+ ieee80211_sta_rx_notify(rx->sdata, hdr);
+
+ sta->rx_fragments++;
+ sta->rx_bytes += rx->skb->len;
+ if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
+ sta->last_signal = status->signal;
+ ewma_add(&sta->avg_signal, -status->signal);
+ }
+
+ if (status->chains) {
+ sta->chains = status->chains;
+ for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
+ int signal = status->chain_signal[i];
+
+ if (!(status->chains & BIT(i)))
+ continue;
+
+ sta->chain_signal_last[i] = signal;
+ ewma_add(&sta->chain_signal_avg[i], -signal);
+ }
+ }
+
+ /*
+ * Change STA power saving mode only at the end of a frame
+ * exchange sequence.
+ */
+ if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) &&
+ !ieee80211_has_morefrags(hdr->frame_control) &&
+ !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
+ (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
+ rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
+ /* PM bit is only checked in frames where it isn't reserved,
+ * in AP mode it's reserved in non-bufferable management frames
+ * (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field)
+ */
+ (!ieee80211_is_mgmt(hdr->frame_control) ||
+ ieee80211_is_bufferable_mmpdu(hdr->frame_control))) {
+ if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ if (!ieee80211_has_pm(hdr->frame_control))
+ sta_ps_end(sta);
+ } else {
+ if (ieee80211_has_pm(hdr->frame_control))
+ sta_ps_start(sta);
+ }
+ }
+
+ /* mesh power save support */
+ if (ieee80211_vif_is_mesh(&rx->sdata->vif))
+ ieee80211_mps_rx_h_sta_process(sta, hdr);
+
+ /*
+ * Drop (qos-)data::nullfunc frames silently, since they
+ * are used only to control station power saving mode.
+ */
+ if (ieee80211_is_nullfunc(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control)) {
+ I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);
+
+ /*
+ * If we receive a 4-addr nullfunc frame from a STA
+ * that was not moved to a 4-addr STA vlan yet send
+ * the event to userspace and for older hostapd drop
+ * the frame to the monitor interface.
+ */
+ if (ieee80211_has_a4(hdr->frame_control) &&
+ (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
+ (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ !rx->sdata->u.vlan.sta))) {
+ if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT))
+ cfg80211_rx_unexpected_4addr_frame(
+ rx->sdata->dev, sta->sta.addr,
+ GFP_ATOMIC);
+ return RX_DROP_MONITOR;
+ }
+ /*
+ * Update counter and free packet here to avoid
+ * counting this as a dropped packed.
+ */
+ sta->rx_packets++;
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+ }
+
+ return RX_CONTINUE;
+} /* ieee80211_rx_h_sta_process */
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ int keyidx;
+ int hdrlen;
+ ieee80211_rx_result result = RX_DROP_UNUSABLE;
+ struct ieee80211_key *sta_ptk = NULL;
+ int mmie_keyidx = -1;
+ __le16 fc;
+ const struct ieee80211_cipher_scheme *cs = NULL;
+
+ /*
+ * Key selection 101
+ *
+ * There are four types of keys:
+ * - GTK (group keys)
+ * - IGTK (group keys for management frames)
+ * - PTK (pairwise keys)
+ * - STK (station-to-station pairwise keys)
+ *
+ * When selecting a key, we have to distinguish between multicast
+ * (including broadcast) and unicast frames, the latter can only
+ * use PTKs and STKs while the former always use GTKs and IGTKs.
+ * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
+ * unicast frames can also use key indices like GTKs. Hence, if we
+ * don't have a PTK/STK we check the key index for a WEP key.
+ *
+ * Note that in a regular BSS, multicast frames are sent by the
+ * AP only, associated stations unicast the frame to the AP first
+ * which then multicasts it on their behalf.
+ *
+ * There is also a slight problem in IBSS mode: GTKs are negotiated
+ * with each station, that is something we don't currently handle.
+ * The spec seems to expect that one negotiates the same key with
+ * every station but there's no such requirement; VLANs could be
+ * possible.
+ */
+
+ /*
+ * No point in finding a key and decrypting if the frame is neither
+ * addressed to us nor a multicast frame.
+ */
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_CONTINUE;
+
+ /* start without a key */
+ rx->key = NULL;
+ fc = hdr->frame_control;
+
+ if (rx->sta) {
+ int keyid = rx->sta->ptk_idx;
+
+ if (ieee80211_has_protected(fc) && rx->sta->cipher_scheme) {
+ cs = rx->sta->cipher_scheme;
+ keyid = iwl80211_get_cs_keyid(cs, rx->skb);
+ if (unlikely(keyid < 0))
+ return RX_DROP_UNUSABLE;
+ }
+ sta_ptk = rcu_dereference(rx->sta->ptk[keyid]);
+ }
+
+ if (!ieee80211_has_protected(fc))
+ mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
+
+ if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
+ rx->key = sta_ptk;
+ if ((status->flag & RX_FLAG_DECRYPTED) &&
+ (status->flag & RX_FLAG_IV_STRIPPED))
+ return RX_CONTINUE;
+ /* Skip decryption if the frame is not protected. */
+ if (!ieee80211_has_protected(fc))
+ return RX_CONTINUE;
+ } else if (mmie_keyidx >= 0) {
+ /* Broadcast/multicast robust management frame / BIP */
+ if ((status->flag & RX_FLAG_DECRYPTED) &&
+ (status->flag & RX_FLAG_IV_STRIPPED))
+ return RX_CONTINUE;
+
+ if (mmie_keyidx < NUM_DEFAULT_KEYS ||
+ mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+ return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+ if (rx->sta)
+ rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+ if (!rx->key)
+ rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
+ } else if (!ieee80211_has_protected(fc)) {
+ /*
+ * The frame was not protected, so skip decryption. However, we
+ * need to set rx->key if there is a key that could have been
+ * used so that the frame may be dropped if encryption would
+ * have been expected.
+ */
+ struct ieee80211_key *key = NULL;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ int i;
+
+ if (ieee80211_is_mgmt(fc) &&
+ is_multicast_ether_addr(hdr->addr1) &&
+ (key = rcu_dereference(rx->sdata->default_mgmt_key)))
+ rx->key = key;
+ else {
+ if (rx->sta) {
+ for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+ key = rcu_dereference(rx->sta->gtk[i]);
+ if (key)
+ break;
+ }
+ }
+ if (!key) {
+ for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+ key = rcu_dereference(sdata->keys[i]);
+ if (key)
+ break;
+ }
+ }
+ if (key)
+ rx->key = key;
+ }
+ return RX_CONTINUE;
+ } else {
+ u8 keyid;
+
+ /*
+ * The device doesn't give us the IV so we won't be
+ * able to look up the key. That's ok though, we
+ * don't need to decrypt the frame, we just won't
+ * be able to keep statistics accurate.
+ * Except for key threshold notifications, should
+ * we somehow allow the driver to tell us which key
+ * the hardware used if this flag is set?
+ */
+ if ((status->flag & RX_FLAG_DECRYPTED) &&
+ (status->flag & RX_FLAG_IV_STRIPPED))
+ return RX_CONTINUE;
+
+ hdrlen = ieee80211_hdrlen(fc);
+
+ if (cs) {
+ keyidx = iwl80211_get_cs_keyid(cs, rx->skb);
+
+ if (unlikely(keyidx < 0))
+ return RX_DROP_UNUSABLE;
+ } else {
+ if (rx->skb->len < 8 + hdrlen)
+ return RX_DROP_UNUSABLE; /* TODO: count this? */
+ /*
+ * no need to call ieee80211_wep_get_keyidx,
+ * it verifies a bunch of things we've done already
+ */
+ skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
+ keyidx = keyid >> 6;
+ }
+
+ /* check per-station GTK first, if multicast packet */
+ if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
+ rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
+
+ /* if not found, try default key */
+ if (!rx->key) {
+ rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+
+ /*
+ * RSNA-protected unicast frames should always be
+ * sent with pairwise or station-to-station keys,
+ * but for WEP we allow using a key index as well.
+ */
+ if (rx->key &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
+ !is_multicast_ether_addr(hdr->addr1))
+ rx->key = NULL;
+ }
+ }
+
+ if (rx->key) {
+ if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
+ return RX_DROP_MONITOR;
+
+ rx->key->tx_rx_count++;
+ /* TODO: add threshold stuff again */
+ } else {
+ return RX_DROP_MONITOR;
+ }
+
+ switch (rx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ result = ieee80211_crypto_wep_decrypt(rx);
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ result = ieee80211_crypto_tkip_decrypt(rx);
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ result = ieee80211_crypto_ccmp_decrypt(
+ rx, IEEE80211_CCMP_MIC_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ result = ieee80211_crypto_ccmp_decrypt(
+ rx, IEEE80211_CCMP_256_MIC_LEN);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ result = ieee80211_crypto_aes_cmac_decrypt(rx);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ result = ieee80211_crypto_aes_cmac_256_decrypt(rx);
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ result = ieee80211_crypto_aes_gmac_decrypt(rx);
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ result = ieee80211_crypto_gcmp_decrypt(rx);
+ break;
+ default:
+ result = ieee80211_crypto_hw_decrypt(rx);
+ }
+
+ /* the hdr variable is invalid after the decrypt handlers */
+
+ /* either the frame has been decrypted or will be dropped */
+ status->flag |= RX_FLAG_DECRYPTED;
+
+ return result;
+}
+
+static inline struct ieee80211_fragment_entry *
+ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
+ unsigned int frag, unsigned int seq, int rx_queue,
+ struct sk_buff **skb)
+{
+ struct ieee80211_fragment_entry *entry;
+
+ entry = &sdata->fragments[sdata->fragment_next++];
+ if (sdata->fragment_next >= IEEE80211_FRAGMENT_MAX)
+ sdata->fragment_next = 0;
+
+ if (!skb_queue_empty(&entry->skb_list))
+ __skb_queue_purge(&entry->skb_list);
+
+ __skb_queue_tail(&entry->skb_list, *skb); /* no need for locking */
+ *skb = NULL;
+ entry->first_frag_time = jiffies;
+ entry->seq = seq;
+ entry->rx_queue = rx_queue;
+ entry->last_frag = frag;
+ entry->ccmp = 0;
+ entry->extra_len = 0;
+
+ return entry;
+}
+
+static inline struct ieee80211_fragment_entry *
+ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
+ unsigned int frag, unsigned int seq,
+ int rx_queue, struct ieee80211_hdr *hdr)
+{
+ struct ieee80211_fragment_entry *entry;
+ int i, idx;
+
+ idx = sdata->fragment_next;
+ for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) {
+ struct ieee80211_hdr *f_hdr;
+
+ idx--;
+ if (idx < 0)
+ idx = IEEE80211_FRAGMENT_MAX - 1;
+
+ entry = &sdata->fragments[idx];
+ if (skb_queue_empty(&entry->skb_list) || entry->seq != seq ||
+ entry->rx_queue != rx_queue ||
+ entry->last_frag + 1 != frag)
+ continue;
+
+ f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data;
+
+ /*
+ * Check ftype and addresses are equal, else check next fragment
+ */
+ if (((hdr->frame_control ^ f_hdr->frame_control) &
+ cpu_to_le16(IEEE80211_FCTL_FTYPE)) ||
+ !ether_addr_equal(hdr->addr1, f_hdr->addr1) ||
+ !ether_addr_equal(hdr->addr2, f_hdr->addr2))
+ continue;
+
+ if (time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
+ __skb_queue_purge(&entry->skb_list);
+ continue;
+ }
+ return entry;
+ }
+
+ return NULL;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr;
+ u16 sc;
+ __le16 fc;
+ unsigned int frag, seq;
+ struct ieee80211_fragment_entry *entry;
+ struct sk_buff *skb;
+ struct ieee80211_rx_status *status;
+
+ hdr = (struct ieee80211_hdr *)rx->skb->data;
+ fc = hdr->frame_control;
+
+ if (ieee80211_is_ctl(fc))
+ return RX_CONTINUE;
+
+ sc = le16_to_cpu(hdr->seq_ctrl);
+ frag = sc & IEEE80211_SCTL_FRAG;
+
+ if (is_multicast_ether_addr(hdr->addr1)) {
+ rx->local->dot11MulticastReceivedFrameCount++;
+ goto out_no_led;
+ }
+
+ if (likely(!ieee80211_has_morefrags(fc) && frag == 0))
+ goto out;
+
+ I802_DEBUG_INC(rx->local->rx_handlers_fragments);
+
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+
+ /*
+ * skb_linearize() might change the skb->data and
+ * previously cached variables (in this case, hdr) need to
+ * be refreshed with the new data.
+ */
+ hdr = (struct ieee80211_hdr *)rx->skb->data;
+ seq = (sc & IEEE80211_SCTL_SEQ) >> 4;
+
+ if (frag == 0) {
+ /* This is the first fragment of a new frame. */
+ entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
+ rx->seqno_idx, &(rx->skb));
+ if (rx->key &&
+ (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+ ieee80211_has_protected(fc)) {
+ int queue = rx->security_idx;
+ /* Store CCMP PN so that we can verify that the next
+ * fragment has a sequential PN value. */
+ entry->ccmp = 1;
+ memcpy(entry->last_pn,
+ rx->key->u.ccmp.rx_pn[queue],
+ IEEE80211_CCMP_PN_LEN);
+ }
+ return RX_QUEUED;
+ }
+
+ /* This is a fragment for a frame that should already be pending in
+ * fragment cache. Add this fragment to the end of the pending entry.
+ */
+ entry = ieee80211_reassemble_find(rx->sdata, frag, seq,
+ rx->seqno_idx, hdr);
+ if (!entry) {
+ I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
+ return RX_DROP_MONITOR;
+ }
+
+ /* Verify that MPDUs within one MSDU have sequential PN values.
+ * (IEEE 802.11i, 8.3.3.4.5) */
+ if (entry->ccmp) {
+ int i;
+ u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
+ int queue;
+ if (!rx->key ||
+ (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+ return RX_DROP_UNUSABLE;
+ memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
+ for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
+ pn[i]++;
+ if (pn[i])
+ break;
+ }
+ queue = rx->security_idx;
+ rpn = rx->key->u.ccmp.rx_pn[queue];
+ if (memcmp(pn, rpn, IEEE80211_CCMP_PN_LEN))
+ return RX_DROP_UNUSABLE;
+ memcpy(entry->last_pn, pn, IEEE80211_CCMP_PN_LEN);
+ }
+
+ skb_pull(rx->skb, ieee80211_hdrlen(fc));
+ __skb_queue_tail(&entry->skb_list, rx->skb);
+ entry->last_frag = frag;
+ entry->extra_len += rx->skb->len;
+ if (ieee80211_has_morefrags(fc)) {
+ rx->skb = NULL;
+ return RX_QUEUED;
+ }
+
+ rx->skb = __skb_dequeue(&entry->skb_list);
+ if (skb_tailroom(rx->skb) < entry->extra_len) {
+ I802_DEBUG_INC(rx->local->rx_expand_skb_head2);
+ if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len,
+ GFP_ATOMIC))) {
+ I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
+ __skb_queue_purge(&entry->skb_list);
+ return RX_DROP_UNUSABLE;
+ }
+ }
+ while ((skb = __skb_dequeue(&entry->skb_list))) {
+ memcpy(skb_put(rx->skb, skb->len), skb->data, skb->len);
+ dev_kfree_skb(skb);
+ }
+
+ /* Complete frame has been reassembled - process it now */
+ status = IEEE80211_SKB_RXCB(rx->skb);
+ status->rx_flags |= IEEE80211_RX_FRAGMENTED;
+
+ out:
+ ieee80211_led_rx(rx->local);
+ out_no_led:
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ return RX_CONTINUE;
+}
+
+static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx)
+{
+ if (unlikely(!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_AUTHORIZED)))
+ return -EACCES;
+
+ return 0;
+}
+
+static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+ /*
+ * Pass through unencrypted frames if the hardware has
+ * decrypted them already.
+ */
+ if (status->flag & RX_FLAG_DECRYPTED)
+ return 0;
+
+ /* Drop unencrypted frames if key is set. */
+ if (unlikely(!ieee80211_has_protected(fc) &&
+ !ieee80211_is_nullfunc(fc) &&
+ ieee80211_is_data(fc) && rx->key))
+ return -EACCES;
+
+ return 0;
+}
+
+static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ __le16 fc = hdr->frame_control;
+
+ /*
+ * Pass through unencrypted frames if the hardware has
+ * decrypted them already.
+ */
+ if (status->flag & RX_FLAG_DECRYPTED)
+ return 0;
+
+ if (rx->sta && test_sta_flag(rx->sta, WLAN_STA_MFP)) {
+ if (unlikely(!ieee80211_has_protected(fc) &&
+ ieee80211_is_unicast_robust_mgmt_frame(rx->skb) &&
+ rx->key)) {
+ if (ieee80211_is_deauth(fc) ||
+ ieee80211_is_disassoc(fc))
+ cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
+ rx->skb->data,
+ rx->skb->len);
+ return -EACCES;
+ }
+ /* BIP does not use Protected field, so need to check MMIE */
+ if (unlikely(ieee80211_is_multicast_robust_mgmt_frame(rx->skb) &&
+ ieee80211_get_mmie_keyidx(rx->skb) < 0)) {
+ if (ieee80211_is_deauth(fc) ||
+ ieee80211_is_disassoc(fc))
+ cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
+ rx->skb->data,
+ rx->skb->len);
+ return -EACCES;
+ }
+ /*
+ * When using MFP, Action frames are not allowed prior to
+ * having configured keys.
+ */
+ if (unlikely(ieee80211_is_action(fc) && !rx->key &&
+ ieee80211_is_robust_mgmt_frame(rx->skb)))
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int
+__ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ bool check_port_control = false;
+ struct ethhdr *ehdr;
+ int ret;
+
+ *port_control = false;
+ if (ieee80211_has_a4(hdr->frame_control) &&
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta)
+ return -1;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !!sdata->u.mgd.use_4addr != !!ieee80211_has_a4(hdr->frame_control)) {
+
+ if (!sdata->u.mgd.use_4addr)
+ return -1;
+ else
+ check_port_control = true;
+ }
+
+ if (is_multicast_ether_addr(hdr->addr1) &&
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sdata->u.vlan.sta)
+ return -1;
+
+ ret = ieee80211_data_to_8023(rx->skb, sdata->vif.addr, sdata->vif.type);
+ if (ret < 0)
+ return ret;
+
+ ehdr = (struct ethhdr *) rx->skb->data;
+ if (ehdr->h_proto == rx->sdata->control_port_protocol)
+ *port_control = true;
+ else if (check_port_control)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * requires that rx->skb is a frame with ethernet header
+ */
+static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
+{
+ static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
+ = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
+ struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
+
+ /*
+ * Allow EAPOL frames to us/the PAE group address regardless
+ * of whether the frame was encrypted or not.
+ */
+ if (ehdr->h_proto == rx->sdata->control_port_protocol &&
+ (ether_addr_equal(ehdr->h_dest, rx->sdata->vif.addr) ||
+ ether_addr_equal(ehdr->h_dest, pae_group_addr)))
+ return true;
+
+ if (ieee80211_802_1x_port_control(rx) ||
+ ieee80211_drop_unencrypted(rx, fc))
+ return false;
+
+ return true;
+}
+
+/*
+ * requires that rx->skb is a frame with ethernet header
+ */
+static void
+ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct net_device *dev = sdata->dev;
+ struct sk_buff *skb, *xmit_skb;
+ struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
+ struct sta_info *dsta;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += rx->skb->len;
+
+ skb = rx->skb;
+ xmit_skb = NULL;
+
+ if ((sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
+ !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
+ (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
+ (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
+ if (is_multicast_ether_addr(ehdr->h_dest)) {
+ /*
+ * send multicast frames both to higher layers in
+ * local net stack and back to the wireless medium
+ */
+ xmit_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!xmit_skb)
+ net_info_ratelimited("%s: failed to clone multicast frame\n",
+ dev->name);
+ } else {
+ dsta = sta_info_get(sdata, skb->data);
+ if (dsta) {
+ /*
+ * The destination station is associated to
+ * this AP (in this VLAN), so send the frame
+ * directly to it and do not pass it to local
+ * net stack.
+ */
+ xmit_skb = skb;
+ skb = NULL;
+ }
+ }
+ }
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (skb) {
+ /* 'align' will only take the values 0 or 2 here since all
+ * frames are required to be aligned to 2-byte boundaries
+ * when being passed to mac80211; the code here works just
+ * as well if that isn't true, but mac80211 assumes it can
+ * access fields as 2-byte aligned (e.g. for ether_addr_equal)
+ */
+ int align;
+
+ align = (unsigned long)(skb->data + sizeof(struct ethhdr)) & 3;
+ if (align) {
+ if (WARN_ON(skb_headroom(skb) < 3)) {
+ dev_kfree_skb(skb);
+ skb = NULL;
+ } else {
+ u8 *data = skb->data;
+ size_t len = skb_headlen(skb);
+ skb->data -= align;
+ memmove(skb->data, data, len);
+ skb_set_tail_pointer(skb, len);
+ }
+ }
+ }
+#endif
+
+ if (skb) {
+ /* deliver to local stack */
+ skb->protocol = eth_type_trans(skb, dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) &&
+ rx->local->napi)
+ napi_gro_receive(rx->local->napi, skb);
+ else
+ netif_receive_skb(skb);
+ }
+
+ if (xmit_skb) {
+ /*
+ * Send to wireless media and increase priority by 256 to
+ * keep the received priority instead of reclassifying
+ * the frame (see cfg80211_classify8021d).
+ */
+ xmit_skb->priority += 256;
+ xmit_skb->protocol = htons(ETH_P_802_3);
+ skb_reset_network_header(xmit_skb);
+ skb_reset_mac_header(xmit_skb);
+ dev_queue_xmit(xmit_skb);
+ }
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
+{
+ struct net_device *dev = rx->sdata->dev;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ __le16 fc = hdr->frame_control;
+ struct sk_buff_head frame_list;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ if (unlikely(!ieee80211_is_data(fc)))
+ return RX_CONTINUE;
+
+ if (unlikely(!ieee80211_is_data_present(fc)))
+ return RX_DROP_MONITOR;
+
+ if (!(status->rx_flags & IEEE80211_RX_AMSDU))
+ return RX_CONTINUE;
+
+ if (ieee80211_has_a4(hdr->frame_control) &&
+ rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ !rx->sdata->u.vlan.sta)
+ return RX_DROP_UNUSABLE;
+
+ if (is_multicast_ether_addr(hdr->addr1) &&
+ ((rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ rx->sdata->u.vlan.sta) ||
+ (rx->sdata->vif.type == NL80211_IFTYPE_STATION &&
+ rx->sdata->u.mgd.use_4addr)))
+ return RX_DROP_UNUSABLE;
+
+ skb->dev = dev;
+ __skb_queue_head_init(&frame_list);
+
+ if (skb_linearize(skb))
+ return RX_DROP_UNUSABLE;
+
+ ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
+ rx->sdata->vif.type,
+ rx->local->hw.extra_tx_headroom, true);
+
+ while (!skb_queue_empty(&frame_list)) {
+ rx->skb = __skb_dequeue(&frame_list);
+
+ if (!ieee80211_frame_allowed(rx, fc)) {
+ dev_kfree_skb(rx->skb);
+ continue;
+ }
+
+ ieee80211_deliver_skb(rx);
+ }
+
+ return RX_QUEUED;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+static ieee80211_rx_result
+ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *fwd_hdr, *hdr;
+ struct ieee80211_tx_info *info;
+ struct ieee80211s_hdr *mesh_hdr;
+ struct sk_buff *skb = rx->skb, *fwd_skb;
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+ u16 q, hdrlen;
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ /* make sure fixed part of mesh header is there, also checks skb len */
+ if (!pskb_may_pull(rx->skb, hdrlen + 6))
+ return RX_DROP_MONITOR;
+
+ mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+
+ /* make sure full mesh header is there, also checks skb len */
+ if (!pskb_may_pull(rx->skb,
+ hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr)))
+ return RX_DROP_MONITOR;
+
+ /* reload pointers */
+ hdr = (struct ieee80211_hdr *) skb->data;
+ mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+
+ if (ieee80211_drop_unencrypted(rx, hdr->frame_control))
+ return RX_DROP_MONITOR;
+
+ /* frame is in RMC, don't forward */
+ if (ieee80211_is_data(hdr->frame_control) &&
+ is_multicast_ether_addr(hdr->addr1) &&
+ mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr))
+ return RX_DROP_MONITOR;
+
+ if (!ieee80211_is_data(hdr->frame_control) ||
+ !(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_CONTINUE;
+
+ if (!mesh_hdr->ttl)
+ return RX_DROP_MONITOR;
+
+ if (mesh_hdr->flags & MESH_FLAGS_AE) {
+ struct mesh_path *mppath;
+ char *proxied_addr;
+ char *mpp_addr;
+
+ if (is_multicast_ether_addr(hdr->addr1)) {
+ mpp_addr = hdr->addr3;
+ proxied_addr = mesh_hdr->eaddr1;
+ } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) {
+ /* has_a4 already checked in ieee80211_rx_mesh_check */
+ mpp_addr = hdr->addr4;
+ proxied_addr = mesh_hdr->eaddr2;
+ } else {
+ return RX_DROP_MONITOR;
+ }
+
+ rcu_read_lock();
+ mppath = mpp_path_lookup(sdata, proxied_addr);
+ if (!mppath) {
+ mpp_path_add(sdata, proxied_addr, mpp_addr);
+ } else {
+ spin_lock_bh(&mppath->state_lock);
+ if (!ether_addr_equal(mppath->mpp, mpp_addr))
+ memcpy(mppath->mpp, mpp_addr, ETH_ALEN);
+ spin_unlock_bh(&mppath->state_lock);
+ }
+ rcu_read_unlock();
+ }
+
+ /* Frame has reached destination. Don't forward */
+ if (!is_multicast_ether_addr(hdr->addr1) &&
+ ether_addr_equal(sdata->vif.addr, hdr->addr3))
+ return RX_CONTINUE;
+
+ q = ieee80211_select_queue_80211(sdata, skb, hdr);
+ if (ieee80211_queue_stopped(&local->hw, q)) {
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion);
+ return RX_DROP_MONITOR;
+ }
+ skb_set_queue_mapping(skb, q);
+
+ if (!--mesh_hdr->ttl) {
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
+ goto out;
+ }
+
+ if (!ifmsh->mshcfg.dot11MeshForwarding)
+ goto out;
+
+ fwd_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!fwd_skb) {
+ net_info_ratelimited("%s: failed to clone mesh frame\n",
+ sdata->name);
+ goto out;
+ }
+
+ fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data;
+ fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY);
+ info = IEEE80211_SKB_CB(fwd_skb);
+ memset(info, 0, sizeof(*info));
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->control.vif = &rx->sdata->vif;
+ info->control.jiffies = jiffies;
+ if (is_multicast_ether_addr(fwd_hdr->addr1)) {
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast);
+ memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN);
+ /* update power mode indication when forwarding */
+ ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr);
+ } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) {
+ /* mesh power mode flags updated in mesh_nexthop_lookup */
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast);
+ } else {
+ /* unable to resolve next hop */
+ mesh_path_error_tx(sdata, ifmsh->mshcfg.element_ttl,
+ fwd_hdr->addr3, 0,
+ WLAN_REASON_MESH_PATH_NOFORWARD,
+ fwd_hdr->addr2);
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_no_route);
+ kfree_skb(fwd_skb);
+ return RX_DROP_MONITOR;
+ }
+
+ IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames);
+ ieee80211_add_pending_skb(local, fwd_skb);
+ out:
+ if (is_multicast_ether_addr(hdr->addr1) ||
+ sdata->dev->flags & IFF_PROMISC)
+ return RX_CONTINUE;
+ else
+ return RX_DROP_MONITOR;
+}
+#endif
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_local *local = rx->local;
+ struct net_device *dev = sdata->dev;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ __le16 fc = hdr->frame_control;
+ bool port_control;
+ int err;
+
+ if (unlikely(!ieee80211_is_data(hdr->frame_control)))
+ return RX_CONTINUE;
+
+ if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+ return RX_DROP_MONITOR;
+
+ if (rx->sta) {
+ /* The seqno index has the same property as needed
+ * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
+ * for non-QoS-data frames. Here we know it's a data
+ * frame, so count MSDUs.
+ */
+ rx->sta->rx_msdu[rx->seqno_idx]++;
+ }
+
+ /*
+ * Send unexpected-4addr-frame event to hostapd. For older versions,
+ * also drop the frame to cooked monitor interfaces.
+ */
+ if (ieee80211_has_a4(hdr->frame_control) &&
+ sdata->vif.type == NL80211_IFTYPE_AP) {
+ if (rx->sta &&
+ !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT))
+ cfg80211_rx_unexpected_4addr_frame(
+ rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC);
+ return RX_DROP_MONITOR;
+ }
+
+ err = __ieee80211_data_to_8023(rx, &port_control);
+ if (unlikely(err))
+ return RX_DROP_UNUSABLE;
+
+ if (!ieee80211_frame_allowed(rx, fc))
+ return RX_DROP_MONITOR;
+
+ /* directly handle TDLS channel switch requests/responses */
+ if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto ==
+ cpu_to_be16(ETH_P_TDLS))) {
+ struct ieee80211_tdls_data *tf = (void *)rx->skb->data;
+
+ if (pskb_may_pull(rx->skb,
+ offsetof(struct ieee80211_tdls_data, u)) &&
+ tf->payload_type == WLAN_TDLS_SNAP_RFTYPE &&
+ tf->category == WLAN_CATEGORY_TDLS &&
+ (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST ||
+ tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) {
+ rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TDLS_CHSW;
+ skb_queue_tail(&sdata->skb_queue, rx->skb);
+ ieee80211_queue_work(&rx->local->hw, &sdata->work);
+ if (rx->sta)
+ rx->sta->rx_packets++;
+
+ return RX_QUEUED;
+ }
+ }
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ unlikely(port_control) && sdata->bss) {
+ sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+ u.ap);
+ dev = sdata->dev;
+ rx->sdata = sdata;
+ }
+
+ rx->skb->dev = dev;
+
+ if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
+ !is_multicast_ether_addr(
+ ((struct ethhdr *)rx->skb->data)->h_dest) &&
+ (!local->scanning &&
+ !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) {
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+ }
+
+ ieee80211_deliver_skb(rx);
+
+ return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data;
+ struct tid_ampdu_rx *tid_agg_rx;
+ u16 start_seq_num;
+ u16 tid;
+
+ if (likely(!ieee80211_is_ctl(bar->frame_control)))
+ return RX_CONTINUE;
+
+ if (ieee80211_is_back_req(bar->frame_control)) {
+ struct {
+ __le16 control, start_seq_num;
+ } __packed bar_data;
+
+ if (!rx->sta)
+ return RX_DROP_MONITOR;
+
+ if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
+ &bar_data, sizeof(bar_data)))
+ return RX_DROP_MONITOR;
+
+ tid = le16_to_cpu(bar_data.control) >> 12;
+
+ tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
+ if (!tid_agg_rx)
+ return RX_DROP_MONITOR;
+
+ start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
+
+ /* reset session timer */
+ if (tid_agg_rx->timeout)
+ mod_timer(&tid_agg_rx->session_timer,
+ TU_TO_EXP_TIME(tid_agg_rx->timeout));
+
+ spin_lock(&tid_agg_rx->reorder_lock);
+ /* release stored frames up to start of BAR */
+ ieee80211_release_reorder_frames(rx->sdata, tid_agg_rx,
+ start_seq_num, frames);
+ spin_unlock(&tid_agg_rx->reorder_lock);
+
+ kfree_skb(skb);
+ return RX_QUEUED;
+ }
+
+ /*
+ * After this point, we only want management frames,
+ * so we can drop all remaining control frames to
+ * cooked monitor interfaces.
+ */
+ return RX_DROP_MONITOR;
+}
+
+static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *resp;
+
+ if (!ether_addr_equal(mgmt->da, sdata->vif.addr)) {
+ /* Not to own unicast address */
+ return;
+ }
+
+ if (!ether_addr_equal(mgmt->sa, sdata->u.mgd.bssid) ||
+ !ether_addr_equal(mgmt->bssid, sdata->u.mgd.bssid)) {
+ /* Not from the current AP or not associated yet. */
+ return;
+ }
+
+ if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) {
+ /* Too short SA Query request frame */
+ return;
+ }
+
+ skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom);
+ if (skb == NULL)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ resp = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(resp, 0, 24);
+ memcpy(resp->da, mgmt->sa, ETH_ALEN);
+ memcpy(resp->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+ skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
+ resp->u.action.category = WLAN_CATEGORY_SA_QUERY;
+ resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE;
+ memcpy(resp->u.action.u.sa_query.trans_id,
+ mgmt->u.action.u.sa_query.trans_id,
+ WLAN_SA_QUERY_TR_ID_LEN);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ /*
+ * From here on, look only at management frames.
+ * Data and control frames are already handled,
+ * and unknown (reserved) frames are useless.
+ */
+ if (rx->skb->len < 24)
+ return RX_DROP_MONITOR;
+
+ if (!ieee80211_is_mgmt(mgmt->frame_control))
+ return RX_DROP_MONITOR;
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
+ ieee80211_is_beacon(mgmt->frame_control) &&
+ !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) {
+ int sig = 0;
+
+ if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ sig = status->signal;
+
+ cfg80211_report_obss_beacon(rx->local->hw.wiphy,
+ rx->skb->data, rx->skb->len,
+ status->freq, sig);
+ rx->flags |= IEEE80211_RX_BEACON_REPORTED;
+ }
+
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_DROP_MONITOR;
+
+ if (ieee80211_drop_unencrypted_mgmt(rx))
+ return RX_DROP_UNUSABLE;
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ int len = rx->skb->len;
+
+ if (!ieee80211_is_action(mgmt->frame_control))
+ return RX_CONTINUE;
+
+ /* drop too small frames */
+ if (len < IEEE80211_MIN_ACTION_SIZE)
+ return RX_DROP_UNUSABLE;
+
+ if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC &&
+ mgmt->u.action.category != WLAN_CATEGORY_SELF_PROTECTED &&
+ mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT)
+ return RX_DROP_UNUSABLE;
+
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_DROP_UNUSABLE;
+
+ switch (mgmt->u.action.category) {
+ case WLAN_CATEGORY_HT:
+ /* reject HT action frames from stations not supporting HT */
+ if (!rx->sta->sta.ht_cap.ht_supported)
+ goto invalid;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ break;
+
+ /* verify action & smps_control/chanwidth are present */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 2)
+ goto invalid;
+
+ switch (mgmt->u.action.u.ht_smps.action) {
+ case WLAN_HT_ACTION_SMPS: {
+ struct ieee80211_supported_band *sband;
+ enum ieee80211_smps_mode smps_mode;
+
+ /* convert to HT capability */
+ switch (mgmt->u.action.u.ht_smps.smps_control) {
+ case WLAN_HT_SMPS_CONTROL_DISABLED:
+ smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ case WLAN_HT_SMPS_CONTROL_STATIC:
+ smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case WLAN_HT_SMPS_CONTROL_DYNAMIC:
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ default:
+ goto invalid;
+ }
+
+ /* if no change do nothing */
+ if (rx->sta->sta.smps_mode == smps_mode)
+ goto handled;
+ rx->sta->sta.smps_mode = smps_mode;
+
+ sband = rx->local->hw.wiphy->bands[status->band];
+
+ rate_control_rate_update(local, sband, rx->sta,
+ IEEE80211_RC_SMPS_CHANGED);
+ goto handled;
+ }
+ case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: {
+ struct ieee80211_supported_band *sband;
+ u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth;
+ enum ieee80211_sta_rx_bandwidth max_bw, new_bw;
+
+ /* If it doesn't support 40 MHz it can't change ... */
+ if (!(rx->sta->sta.ht_cap.cap &
+ IEEE80211_HT_CAP_SUP_WIDTH_20_40))
+ goto handled;
+
+ if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ)
+ max_bw = IEEE80211_STA_RX_BW_20;
+ else
+ max_bw = ieee80211_sta_cap_rx_bw(rx->sta);
+
+ /* set cur_max_bandwidth and recalc sta bw */
+ rx->sta->cur_max_bandwidth = max_bw;
+ new_bw = ieee80211_sta_cur_vht_bw(rx->sta);
+
+ if (rx->sta->sta.bandwidth == new_bw)
+ goto handled;
+
+ rx->sta->sta.bandwidth = new_bw;
+ sband = rx->local->hw.wiphy->bands[status->band];
+
+ rate_control_rate_update(local, sband, rx->sta,
+ IEEE80211_RC_BW_CHANGED);
+ goto handled;
+ }
+ default:
+ goto invalid;
+ }
+
+ break;
+ case WLAN_CATEGORY_PUBLIC:
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ goto invalid;
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ break;
+ if (!rx->sta)
+ break;
+ if (!ether_addr_equal(mgmt->bssid, sdata->u.mgd.bssid))
+ break;
+ if (mgmt->u.action.u.ext_chan_switch.action_code !=
+ WLAN_PUB_ACTION_EXT_CHANSW_ANN)
+ break;
+ if (len < offsetof(struct ieee80211_mgmt,
+ u.action.u.ext_chan_switch.variable))
+ goto invalid;
+ goto queue;
+ case WLAN_CATEGORY_VHT:
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ break;
+
+ /* verify action code is present */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ goto invalid;
+
+ switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
+ case WLAN_VHT_ACTION_OPMODE_NOTIF: {
+ u8 opmode;
+
+ /* verify opmode is present */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 2)
+ goto invalid;
+
+ opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
+
+ ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
+ opmode, status->band,
+ false);
+ goto handled;
+ }
+ default:
+ break;
+ }
+ break;
+ case WLAN_CATEGORY_BACK:
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ break;
+
+ /* verify action_code is present */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ break;
+
+ switch (mgmt->u.action.u.addba_req.action_code) {
+ case WLAN_ACTION_ADDBA_REQ:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.addba_req)))
+ goto invalid;
+ break;
+ case WLAN_ACTION_ADDBA_RESP:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.addba_resp)))
+ goto invalid;
+ break;
+ case WLAN_ACTION_DELBA:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.delba)))
+ goto invalid;
+ break;
+ default:
+ goto invalid;
+ }
+
+ goto queue;
+ case WLAN_CATEGORY_SPECTRUM_MGMT:
+ /* verify action_code is present */
+ if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+ break;
+
+ switch (mgmt->u.action.u.measurement.action_code) {
+ case WLAN_ACTION_SPCT_MSR_REQ:
+ if (status->band != IEEE80211_BAND_5GHZ)
+ break;
+
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.measurement)))
+ break;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ break;
+
+ ieee80211_process_measurement_req(sdata, mgmt, len);
+ goto handled;
+ case WLAN_ACTION_SPCT_CHL_SWITCH: {
+ u8 *bssid;
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.chan_switch)))
+ break;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
+ break;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ bssid = sdata->u.mgd.bssid;
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ bssid = sdata->u.ibss.bssid;
+ else if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+ bssid = mgmt->sa;
+ else
+ break;
+
+ if (!ether_addr_equal(mgmt->bssid, bssid))
+ break;
+
+ goto queue;
+ }
+ }
+ break;
+ case WLAN_CATEGORY_SA_QUERY:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.sa_query)))
+ break;
+
+ switch (mgmt->u.action.u.sa_query.action) {
+ case WLAN_ACTION_SA_QUERY_REQUEST:
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ break;
+ ieee80211_process_sa_query_req(sdata, mgmt, len);
+ goto handled;
+ }
+ break;
+ case WLAN_CATEGORY_SELF_PROTECTED:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.self_prot.action_code)))
+ break;
+
+ switch (mgmt->u.action.u.self_prot.action_code) {
+ case WLAN_SP_MESH_PEERING_OPEN:
+ case WLAN_SP_MESH_PEERING_CLOSE:
+ case WLAN_SP_MESH_PEERING_CONFIRM:
+ if (!ieee80211_vif_is_mesh(&sdata->vif))
+ goto invalid;
+ if (sdata->u.mesh.user_mpm)
+ /* userspace handles this frame */
+ break;
+ goto queue;
+ case WLAN_SP_MGK_INFORM:
+ case WLAN_SP_MGK_ACK:
+ if (!ieee80211_vif_is_mesh(&sdata->vif))
+ goto invalid;
+ break;
+ }
+ break;
+ case WLAN_CATEGORY_MESH_ACTION:
+ if (len < (IEEE80211_MIN_ACTION_SIZE +
+ sizeof(mgmt->u.action.u.mesh_action.action_code)))
+ break;
+
+ if (!ieee80211_vif_is_mesh(&sdata->vif))
+ break;
+ if (mesh_action_is_path_sel(mgmt) &&
+ !mesh_path_sel_is_hwmp(sdata))
+ break;
+ goto queue;
+ }
+
+ return RX_CONTINUE;
+
+ invalid:
+ status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
+ /* will return in the next handlers */
+ return RX_CONTINUE;
+
+ handled:
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+
+ queue:
+ rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&sdata->skb_queue, rx->skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ int sig = 0;
+
+ /* skip known-bad action frames and return them in the next handler */
+ if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
+ return RX_CONTINUE;
+
+ /*
+ * Getting here means the kernel doesn't know how to handle
+ * it, but maybe userspace does ... include returned frames
+ * so userspace can register for those to know whether ones
+ * it transmitted were processed or returned.
+ */
+
+ if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ sig = status->signal;
+
+ if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
+ rx->skb->data, rx->skb->len, 0)) {
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+ }
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+ struct sk_buff *nskb;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ if (!ieee80211_is_action(mgmt->frame_control))
+ return RX_CONTINUE;
+
+ /*
+ * For AP mode, hostapd is responsible for handling any action
+ * frames that we didn't handle, including returning unknown
+ * ones. For all other modes we will return them to the sender,
+ * setting the 0x80 bit in the action category, as required by
+ * 802.11-2012 9.24.4.
+ * Newer versions of hostapd shall also use the management frame
+ * registration mechanisms, but older ones still use cooked
+ * monitor interfaces so push all frames there.
+ */
+ if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
+ (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+ return RX_DROP_MONITOR;
+
+ if (is_multicast_ether_addr(mgmt->da))
+ return RX_DROP_MONITOR;
+
+ /* do not return rejected action frames */
+ if (mgmt->u.action.category & 0x80)
+ return RX_DROP_UNUSABLE;
+
+ nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0,
+ GFP_ATOMIC);
+ if (nskb) {
+ struct ieee80211_mgmt *nmgmt = (void *)nskb->data;
+
+ nmgmt->u.action.category |= 0x80;
+ memcpy(nmgmt->da, nmgmt->sa, ETH_ALEN);
+ memcpy(nmgmt->sa, rx->sdata->vif.addr, ETH_ALEN);
+
+ memset(nskb->cb, 0, sizeof(nskb->cb));
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(nskb);
+
+ info->flags = IEEE80211_TX_CTL_TX_OFFCHAN |
+ IEEE80211_TX_INTFL_OFFCHAN_TX_OK |
+ IEEE80211_TX_CTL_NO_CCK_RATE;
+ if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ info->hw_queue =
+ local->hw.offchannel_tx_hw_queue;
+ }
+
+ __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
+ status->band);
+ }
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
+ __le16 stype;
+
+ stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE);
+
+ if (!ieee80211_vif_is_mesh(&sdata->vif) &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_OCB &&
+ sdata->vif.type != NL80211_IFTYPE_STATION)
+ return RX_DROP_MONITOR;
+
+ switch (stype) {
+ case cpu_to_le16(IEEE80211_STYPE_AUTH):
+ case cpu_to_le16(IEEE80211_STYPE_BEACON):
+ case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
+ /* process for all: mesh, mlme, ibss */
+ break;
+ case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP):
+ case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP):
+ case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
+ case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
+ if (is_multicast_ether_addr(mgmt->da) &&
+ !is_broadcast_ether_addr(mgmt->da))
+ return RX_DROP_MONITOR;
+
+ /* process only for station */
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return RX_DROP_MONITOR;
+ break;
+ case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
+ /* process only for ibss and mesh */
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
+ return RX_DROP_MONITOR;
+ break;
+ default:
+ return RX_DROP_MONITOR;
+ }
+
+ /* queue up frame and kick off work to process it */
+ rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&sdata->skb_queue, rx->skb);
+ ieee80211_queue_work(&rx->local->hw, &sdata->work);
+ if (rx->sta)
+ rx->sta->rx_packets++;
+
+ return RX_QUEUED;
+}
+
+/* TODO: use IEEE80211_RX_FRAGMENTED */
+static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
+ struct ieee80211_rate *rate)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_local *local = rx->local;
+ struct sk_buff *skb = rx->skb, *skb2;
+ struct net_device *prev_dev = NULL;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ int needed_headroom;
+
+ /*
+ * If cooked monitor has been processed already, then
+ * don't do it again. If not, set the flag.
+ */
+ if (rx->flags & IEEE80211_RX_CMNTR)
+ goto out_free_skb;
+ rx->flags |= IEEE80211_RX_CMNTR;
+
+ /* If there are no cooked monitor interfaces, just free the SKB */
+ if (!local->cooked_mntrs)
+ goto out_free_skb;
+
+ /* vendor data is long removed here */
+ status->flag &= ~RX_FLAG_RADIOTAP_VENDOR_DATA;
+ /* room for the radiotap header based on driver features */
+ needed_headroom = ieee80211_rx_radiotap_hdrlen(local, status, skb);
+
+ if (skb_headroom(skb) < needed_headroom &&
+ pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC))
+ goto out_free_skb;
+
+ /* prepend radiotap information */
+ ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
+ false);
+
+ skb_set_mac_header(skb, 0);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = htons(ETH_P_802_2);
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
+ !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+ continue;
+
+ if (prev_dev) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ skb2->dev = prev_dev;
+ netif_receive_skb(skb2);
+ }
+ }
+
+ prev_dev = sdata->dev;
+ sdata->dev->stats.rx_packets++;
+ sdata->dev->stats.rx_bytes += skb->len;
+ }
+
+ if (prev_dev) {
+ skb->dev = prev_dev;
+ netif_receive_skb(skb);
+ return;
+ }
+
+ out_free_skb:
+ dev_kfree_skb(skb);
+}
+
+static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
+ ieee80211_rx_result res)
+{
+ switch (res) {
+ case RX_DROP_MONITOR:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+ if (rx->sta)
+ rx->sta->rx_dropped++;
+ /* fall through */
+ case RX_CONTINUE: {
+ struct ieee80211_rate *rate = NULL;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rx_status *status;
+
+ status = IEEE80211_SKB_RXCB((rx->skb));
+
+ sband = rx->local->hw.wiphy->bands[status->band];
+ if (!(status->flag & RX_FLAG_HT) &&
+ !(status->flag & RX_FLAG_VHT))
+ rate = &sband->bitrates[status->rate_idx];
+
+ ieee80211_rx_cooked_monitor(rx, rate);
+ break;
+ }
+ case RX_DROP_UNUSABLE:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+ if (rx->sta)
+ rx->sta->rx_dropped++;
+ dev_kfree_skb(rx->skb);
+ break;
+ case RX_QUEUED:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+ break;
+ }
+}
+
+static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
+ struct sk_buff_head *frames)
+{
+ ieee80211_rx_result res = RX_DROP_MONITOR;
+ struct sk_buff *skb;
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(rx); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0);
+
+ /* Lock here to avoid hitting all of the data used in the RX
+ * path (e.g. key data, station data, ...) concurrently when
+ * a frame is released from the reorder buffer due to timeout
+ * from the timer, potentially concurrently with RX from the
+ * driver.
+ */
+ spin_lock_bh(&rx->local->rx_path_lock);
+
+ while ((skb = __skb_dequeue(frames))) {
+ /*
+ * all the other fields are valid across frames
+ * that belong to an aMPDU since they are on the
+ * same TID from the same station
+ */
+ rx->skb = skb;
+
+ CALL_RXH(ieee80211_rx_h_check_more_data)
+ CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll)
+ CALL_RXH(ieee80211_rx_h_sta_process)
+ CALL_RXH(ieee80211_rx_h_decrypt)
+ CALL_RXH(ieee80211_rx_h_defragment)
+ CALL_RXH(ieee80211_rx_h_michael_mic_verify)
+ /* must be after MMIC verify so header is counted in MPDU mic */
+#ifdef CONFIG_MAC80211_MESH
+ if (ieee80211_vif_is_mesh(&rx->sdata->vif))
+ CALL_RXH(ieee80211_rx_h_mesh_fwding);
+#endif
+ CALL_RXH(ieee80211_rx_h_amsdu)
+ CALL_RXH(ieee80211_rx_h_data)
+
+ /* special treatment -- needs the queue */
+ res = ieee80211_rx_h_ctrl(rx, frames);
+ if (res != RX_CONTINUE)
+ goto rxh_next;
+
+ CALL_RXH(ieee80211_rx_h_mgmt_check)
+ CALL_RXH(ieee80211_rx_h_action)
+ CALL_RXH(ieee80211_rx_h_userspace_mgmt)
+ CALL_RXH(ieee80211_rx_h_action_return)
+ CALL_RXH(ieee80211_rx_h_mgmt)
+
+ rxh_next:
+ ieee80211_rx_handlers_result(rx, res);
+
+#undef CALL_RXH
+ }
+
+ spin_unlock_bh(&rx->local->rx_path_lock);
+}
+
+static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff_head reorder_release;
+ ieee80211_rx_result res = RX_DROP_MONITOR;
+
+ __skb_queue_head_init(&reorder_release);
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(rx); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0);
+
+ CALL_RXH(ieee80211_rx_h_check_dup)
+ CALL_RXH(ieee80211_rx_h_check)
+
+ ieee80211_rx_reorder_ampdu(rx, &reorder_release);
+
+ ieee80211_rx_handlers(rx, &reorder_release);
+ return;
+
+ rxh_next:
+ ieee80211_rx_handlers_result(rx, res);
+
+#undef CALL_RXH
+}
+
+/*
+ * This function makes calls into the RX path, therefore
+ * it has to be invoked under RCU read lock.
+ */
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
+{
+ struct sk_buff_head frames;
+ struct ieee80211_rx_data rx = {
+ .sta = sta,
+ .sdata = sta->sdata,
+ .local = sta->local,
+ /* This is OK -- must be QoS data frame */
+ .security_idx = tid,
+ .seqno_idx = tid,
+ .flags = IEEE80211_RX_REORDER_TIMER,
+ };
+ struct tid_ampdu_rx *tid_agg_rx;
+
+ tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+ if (!tid_agg_rx)
+ return;
+
+ __skb_queue_head_init(&frames);
+
+ spin_lock(&tid_agg_rx->reorder_lock);
+ ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
+ spin_unlock(&tid_agg_rx->reorder_lock);
+
+ ieee80211_rx_handlers(&rx, &frames);
+}
+
+/* main receive path */
+
+static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
+ struct ieee80211_hdr *hdr)
+{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
+ int multicast = is_multicast_ether_addr(hdr->addr1);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ if (!bssid && !sdata->u.mgd.use_4addr)
+ return false;
+ if (!multicast &&
+ !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
+ if (!(sdata->dev->flags & IFF_PROMISC) ||
+ sdata->u.mgd.use_4addr)
+ return false;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+ }
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ if (!bssid)
+ return false;
+ if (ether_addr_equal(sdata->vif.addr, hdr->addr2) ||
+ ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2))
+ return false;
+ if (ieee80211_is_beacon(hdr->frame_control)) {
+ return true;
+ } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
+ return false;
+ } else if (!multicast &&
+ !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
+ if (!(sdata->dev->flags & IFF_PROMISC))
+ return false;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+ } else if (!rx->sta) {
+ int rate_idx;
+ if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
+ rate_idx = 0; /* TODO: HT/VHT rates */
+ else
+ rate_idx = status->rate_idx;
+ ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2,
+ BIT(rate_idx));
+ }
+ break;
+ case NL80211_IFTYPE_OCB:
+ if (!bssid)
+ return false;
+ if (ieee80211_is_beacon(hdr->frame_control)) {
+ return false;
+ } else if (!is_broadcast_ether_addr(bssid)) {
+ ocb_dbg(sdata, "BSSID mismatch in OCB mode!\n");
+ return false;
+ } else if (!multicast &&
+ !ether_addr_equal(sdata->dev->dev_addr,
+ hdr->addr1)) {
+ /* if we are in promisc mode we also accept
+ * packets not destined for us
+ */
+ if (!(sdata->dev->flags & IFF_PROMISC))
+ return false;
+ rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ } else if (!rx->sta) {
+ int rate_idx;
+ if (status->flag & RX_FLAG_HT)
+ rate_idx = 0; /* TODO: HT rates */
+ else
+ rate_idx = status->rate_idx;
+ ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2,
+ BIT(rate_idx));
+ }
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!multicast &&
+ !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
+ if (!(sdata->dev->flags & IFF_PROMISC))
+ return false;
+
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+ }
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_AP:
+ if (!bssid) {
+ if (!ether_addr_equal(sdata->vif.addr, hdr->addr1))
+ return false;
+ } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) {
+ /*
+ * Accept public action frames even when the
+ * BSSID doesn't match, this is used for P2P
+ * and location updates. Note that mac80211
+ * itself never looks at these frames.
+ */
+ if (!multicast &&
+ !ether_addr_equal(sdata->vif.addr, hdr->addr1))
+ return false;
+ if (ieee80211_is_public_action(hdr, skb->len))
+ return true;
+ if (!ieee80211_is_beacon(hdr->frame_control))
+ return false;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+ } else if (!ieee80211_has_tods(hdr->frame_control)) {
+ /* ignore data frames to TDLS-peers */
+ if (ieee80211_is_data(hdr->frame_control))
+ return false;
+ /* ignore action frames to TDLS-peers */
+ if (ieee80211_is_action(hdr->frame_control) &&
+ !ether_addr_equal(bssid, hdr->addr1))
+ return false;
+ }
+ break;
+ case NL80211_IFTYPE_WDS:
+ if (bssid || !ieee80211_is_data(hdr->frame_control))
+ return false;
+ if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2))
+ return false;
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ if (!ieee80211_is_public_action(hdr, skb->len) &&
+ !ieee80211_is_probe_req(hdr->frame_control) &&
+ !ieee80211_is_probe_resp(hdr->frame_control) &&
+ !ieee80211_is_beacon(hdr->frame_control))
+ return false;
+ if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) &&
+ !multicast)
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+ break;
+ default:
+ /* should never get here */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * This function returns whether or not the SKB
+ * was destined for RX processing or not, which,
+ * if consume is true, is equivalent to whether
+ * or not the skb was consumed.
+ */
+static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
+ struct sk_buff *skb, bool consume)
+{
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+
+ rx->skb = skb;
+ status->rx_flags |= IEEE80211_RX_RA_MATCH;
+
+ if (!prepare_for_handlers(rx, hdr))
+ return false;
+
+ if (!consume) {
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb) {
+ if (net_ratelimit())
+ wiphy_debug(local->hw.wiphy,
+ "failed to copy skb for %s\n",
+ sdata->name);
+ return true;
+ }
+
+ rx->skb = skb;
+ }
+
+ ieee80211_invoke_rx_handlers(rx);
+ return true;
+}
+
+/*
+ * This is the actual Rx frames handler. as it belongs to Rx path it must
+ * be called with rcu_read_lock protection.
+ */
+static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_hdr *hdr;
+ __le16 fc;
+ struct ieee80211_rx_data rx;
+ struct ieee80211_sub_if_data *prev;
+ struct sta_info *sta, *prev_sta;
+ struct rhash_head *tmp;
+ int err = 0;
+
+ fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
+ memset(&rx, 0, sizeof(rx));
+ rx.skb = skb;
+ rx.local = local;
+
+ if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
+ local->dot11ReceivedFragmentCount++;
+
+ if (ieee80211_is_mgmt(fc)) {
+ /* drop frame if too short for header */
+ if (skb->len < ieee80211_hdrlen(fc))
+ err = -ENOBUFS;
+ else
+ err = skb_linearize(skb);
+ } else {
+ err = !pskb_may_pull(skb, ieee80211_hdrlen(fc));
+ }
+
+ if (err) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ ieee80211_parse_qos(&rx);
+ ieee80211_verify_alignment(&rx);
+
+ if (unlikely(ieee80211_is_probe_resp(hdr->frame_control) ||
+ ieee80211_is_beacon(hdr->frame_control)))
+ ieee80211_scan_rx(local, skb);
+
+ if (ieee80211_is_data(fc)) {
+ const struct bucket_table *tbl;
+
+ prev_sta = NULL;
+
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
+
+ for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) {
+ if (!prev_sta) {
+ prev_sta = sta;
+ continue;
+ }
+
+ rx.sta = prev_sta;
+ rx.sdata = prev_sta->sdata;
+ ieee80211_prepare_and_rx_handle(&rx, skb, false);
+
+ prev_sta = sta;
+ }
+
+ if (prev_sta) {
+ rx.sta = prev_sta;
+ rx.sdata = prev_sta->sdata;
+
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
+ goto out;
+ }
+ }
+
+ prev = NULL;
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
+
+ /*
+ * frame is destined for this interface, but if it's
+ * not also for the previous one we handle that after
+ * the loop to avoid copying the SKB once too much
+ */
+
+ if (!prev) {
+ prev = sdata;
+ continue;
+ }
+
+ rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ rx.sdata = prev;
+ ieee80211_prepare_and_rx_handle(&rx, skb, false);
+
+ prev = sdata;
+ }
+
+ if (prev) {
+ rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ rx.sdata = prev;
+
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
+ }
+
+ out:
+ dev_kfree_skb(skb);
+}
+
+/*
+ * This is the receive path handler. It is called by a low level driver when an
+ * 802.11 MPDU is received from the hardware.
+ */
+void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_rate *rate = NULL;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+ WARN_ON_ONCE(softirq_count() == 0);
+
+ if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
+ goto drop;
+
+ sband = local->hw.wiphy->bands[status->band];
+ if (WARN_ON(!sband))
+ goto drop;
+
+ /*
+ * If we're suspending, it is possible although not too likely
+ * that we'd be receiving frames after having already partially
+ * quiesced the stack. We can't process such frames then since
+ * that might, for example, cause stations to be added or other
+ * driver callbacks be invoked.
+ */
+ if (unlikely(local->quiescing || local->suspended))
+ goto drop;
+
+ /* We might be during a HW reconfig, prevent Rx for the same reason */
+ if (unlikely(local->in_reconfig))
+ goto drop;
+
+ /*
+ * The same happens when we're not even started,
+ * but that's worth a warning.
+ */
+ if (WARN_ON(!local->started))
+ goto drop;
+
+ if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
+ /*
+ * Validate the rate, unless a PLCP error means that
+ * we probably can't have a valid rate here anyway.
+ */
+
+ if (status->flag & RX_FLAG_HT) {
+ /*
+ * rate_idx is MCS index, which can be [0-76]
+ * as documented on:
+ *
+ * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
+ *
+ * Anything else would be some sort of driver or
+ * hardware error. The driver should catch hardware
+ * errors.
+ */
+ if (WARN(status->rate_idx > 76,
+ "Rate marked as an HT rate but passed "
+ "status->rate_idx is not "
+ "an MCS index [0-76]: %d (0x%02x)\n",
+ status->rate_idx,
+ status->rate_idx))
+ goto drop;
+ } else if (status->flag & RX_FLAG_VHT) {
+ if (WARN_ONCE(status->rate_idx > 9 ||
+ !status->vht_nss ||
+ status->vht_nss > 8,
+ "Rate marked as a VHT rate but data is invalid: MCS: %d, NSS: %d\n",
+ status->rate_idx, status->vht_nss))
+ goto drop;
+ } else {
+ if (WARN_ON(status->rate_idx >= sband->n_bitrates))
+ goto drop;
+ rate = &sband->bitrates[status->rate_idx];
+ }
+ }
+
+ status->rx_flags = 0;
+
+ /*
+ * key references and virtual interfaces are protected using RCU
+ * and this requires that we are in a read-side RCU section during
+ * receive processing
+ */
+ rcu_read_lock();
+
+ /*
+ * Frames with failed FCS/PLCP checksum are not returned,
+ * all other frames are returned without radiotap header
+ * if it was previously present.
+ * Also, frames with less than 16 bytes are dropped.
+ */
+ skb = ieee80211_rx_monitor(local, skb, rate);
+ if (!skb) {
+ rcu_read_unlock();
+ return;
+ }
+
+ ieee80211_tpt_led_trig_rx(local,
+ ((struct ieee80211_hdr *)skb->data)->frame_control,
+ skb->len);
+ __ieee80211_rx_handle_packet(hw, skb);
+
+ rcu_read_unlock();
+
+ return;
+ drop:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_rx);
+
+/* This is a version of the rx handler that can be called from hard irq
+ * context. Post the skb on the queue and schedule the tasklet */
+void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ BUILD_BUG_ON(sizeof(struct ieee80211_rx_status) > sizeof(skb->cb));
+
+ skb->pkt_type = IEEE80211_RX_MSG;
+ skb_queue_tail(&local->skb_queue, skb);
+ tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_rx_irqsafe);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
new file mode 100644
index 000000000..7bb6a9383
--- /dev/null
+++ b/net/mac80211/scan.c
@@ -0,0 +1,1211 @@
+/*
+ * Scanning implementation
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/pm_qos.h>
+#include <net/sch_generic.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "mesh.h"
+
+#define IEEE80211_PROBE_DELAY (HZ / 33)
+#define IEEE80211_CHANNEL_TIME (HZ / 33)
+#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9)
+
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+ struct ieee80211_bss *bss)
+{
+ if (!bss)
+ return;
+ cfg80211_put_bss(local->hw.wiphy,
+ container_of((void *)bss, struct cfg80211_bss, priv));
+}
+
+static bool is_uapsd_supported(struct ieee802_11_elems *elems)
+{
+ u8 qos_info;
+
+ if (elems->wmm_info && elems->wmm_info_len == 7
+ && elems->wmm_info[5] == 1)
+ qos_info = elems->wmm_info[6];
+ else if (elems->wmm_param && elems->wmm_param_len == 24
+ && elems->wmm_param[5] == 1)
+ qos_info = elems->wmm_param[6];
+ else
+ /* no valid wmm information or parameter element found */
+ return false;
+
+ return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD;
+}
+
+struct ieee80211_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee802_11_elems *elems,
+ struct ieee80211_channel *channel)
+{
+ bool beacon = ieee80211_is_beacon(mgmt->frame_control);
+ struct cfg80211_bss *cbss;
+ struct ieee80211_bss *bss;
+ int clen, srlen;
+ enum nl80211_bss_scan_width scan_width;
+ s32 signal = 0;
+
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ signal = rx_status->signal * 100;
+ else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+ signal = (rx_status->signal * 100) / local->hw.max_signal;
+
+ scan_width = NL80211_BSS_CHAN_WIDTH_20;
+ if (rx_status->flag & RX_FLAG_5MHZ)
+ scan_width = NL80211_BSS_CHAN_WIDTH_5;
+ if (rx_status->flag & RX_FLAG_10MHZ)
+ scan_width = NL80211_BSS_CHAN_WIDTH_10;
+
+ cbss = cfg80211_inform_bss_width_frame(local->hw.wiphy, channel,
+ scan_width, mgmt, len, signal,
+ GFP_ATOMIC);
+ if (!cbss)
+ return NULL;
+
+ bss = (void *)cbss->priv;
+
+ if (beacon)
+ bss->device_ts_beacon = rx_status->device_timestamp;
+ else
+ bss->device_ts_presp = rx_status->device_timestamp;
+
+ if (elems->parse_error) {
+ if (beacon)
+ bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON;
+ else
+ bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP;
+ } else {
+ if (beacon)
+ bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON;
+ else
+ bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP;
+ }
+
+ /* save the ERP value so that it is available at association time */
+ if (elems->erp_info && (!elems->parse_error ||
+ !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) {
+ bss->erp_value = elems->erp_info[0];
+ bss->has_erp_value = true;
+ if (!elems->parse_error)
+ bss->valid_data |= IEEE80211_BSS_VALID_ERP;
+ }
+
+ /* replace old supported rates if we get new values */
+ if (!elems->parse_error ||
+ !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) {
+ srlen = 0;
+ if (elems->supp_rates) {
+ clen = IEEE80211_MAX_SUPP_RATES;
+ if (clen > elems->supp_rates_len)
+ clen = elems->supp_rates_len;
+ memcpy(bss->supp_rates, elems->supp_rates, clen);
+ srlen += clen;
+ }
+ if (elems->ext_supp_rates) {
+ clen = IEEE80211_MAX_SUPP_RATES - srlen;
+ if (clen > elems->ext_supp_rates_len)
+ clen = elems->ext_supp_rates_len;
+ memcpy(bss->supp_rates + srlen, elems->ext_supp_rates,
+ clen);
+ srlen += clen;
+ }
+ if (srlen) {
+ bss->supp_rates_len = srlen;
+ if (!elems->parse_error)
+ bss->valid_data |= IEEE80211_BSS_VALID_RATES;
+ }
+ }
+
+ if (!elems->parse_error ||
+ !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) {
+ bss->wmm_used = elems->wmm_param || elems->wmm_info;
+ bss->uapsd_supported = is_uapsd_supported(elems);
+ if (!elems->parse_error)
+ bss->valid_data |= IEEE80211_BSS_VALID_WMM;
+ }
+
+ if (beacon) {
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[rx_status->band];
+ if (!(rx_status->flag & RX_FLAG_HT) &&
+ !(rx_status->flag & RX_FLAG_VHT))
+ bss->beacon_rate =
+ &sband->bitrates[rx_status->rate_idx];
+ }
+
+ return bss;
+}
+
+void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_sub_if_data *sdata1, *sdata2;
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ struct ieee80211_bss *bss;
+ u8 *elements;
+ struct ieee80211_channel *channel;
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ if (skb->len < 24 ||
+ (!ieee80211_is_probe_resp(mgmt->frame_control) &&
+ !ieee80211_is_beacon(mgmt->frame_control)))
+ return;
+
+ sdata1 = rcu_dereference(local->scan_sdata);
+ sdata2 = rcu_dereference(local->sched_scan_sdata);
+
+ if (likely(!sdata1 && !sdata2))
+ return;
+
+ if (ieee80211_is_probe_resp(mgmt->frame_control)) {
+ struct cfg80211_scan_request *scan_req;
+ struct cfg80211_sched_scan_request *sched_scan_req;
+
+ scan_req = rcu_dereference(local->scan_req);
+ sched_scan_req = rcu_dereference(local->sched_scan_req);
+
+ /* ignore ProbeResp to foreign address unless scanning
+ * with randomised address
+ */
+ if (!(sdata1 &&
+ (ether_addr_equal(mgmt->da, sdata1->vif.addr) ||
+ scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)) &&
+ !(sdata2 &&
+ (ether_addr_equal(mgmt->da, sdata2->vif.addr) ||
+ sched_scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)))
+ return;
+
+ elements = mgmt->u.probe_resp.variable;
+ baselen = offsetof(struct ieee80211_mgmt, u.probe_resp.variable);
+ } else {
+ baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable);
+ elements = mgmt->u.beacon.variable;
+ }
+
+ if (baselen > skb->len)
+ return;
+
+ ieee802_11_parse_elems(elements, skb->len - baselen, false, &elems);
+
+ channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq);
+
+ if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+ return;
+
+ bss = ieee80211_bss_info_update(local, rx_status,
+ mgmt, skb->len, &elems,
+ channel);
+ if (bss)
+ ieee80211_rx_bss_put(local, bss);
+}
+
+static void
+ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef,
+ enum nl80211_bss_scan_width scan_width)
+{
+ memset(chandef, 0, sizeof(*chandef));
+ switch (scan_width) {
+ case NL80211_BSS_CHAN_WIDTH_5:
+ chandef->width = NL80211_CHAN_WIDTH_5;
+ break;
+ case NL80211_BSS_CHAN_WIDTH_10:
+ chandef->width = NL80211_CHAN_WIDTH_10;
+ break;
+ default:
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ break;
+ }
+}
+
+/* return false if no more work */
+static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
+{
+ struct cfg80211_scan_request *req;
+ struct cfg80211_chan_def chandef;
+ u8 bands_used = 0;
+ int i, ielen, n_chans;
+
+ req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ if (test_bit(SCAN_HW_CANCELLED, &local->scanning))
+ return false;
+
+ if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) {
+ for (i = 0; i < req->n_channels; i++) {
+ local->hw_scan_req->req.channels[i] = req->channels[i];
+ bands_used |= BIT(req->channels[i]->band);
+ }
+
+ n_chans = req->n_channels;
+ } else {
+ do {
+ if (local->hw_scan_band == IEEE80211_NUM_BANDS)
+ return false;
+
+ n_chans = 0;
+
+ for (i = 0; i < req->n_channels; i++) {
+ if (req->channels[i]->band !=
+ local->hw_scan_band)
+ continue;
+ local->hw_scan_req->req.channels[n_chans] =
+ req->channels[i];
+ n_chans++;
+ bands_used |= BIT(req->channels[i]->band);
+ }
+
+ local->hw_scan_band++;
+ } while (!n_chans);
+ }
+
+ local->hw_scan_req->req.n_channels = n_chans;
+ ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+
+ ielen = ieee80211_build_preq_ies(local,
+ (u8 *)local->hw_scan_req->req.ie,
+ local->hw_scan_ies_bufsize,
+ &local->hw_scan_req->ies,
+ req->ie, req->ie_len,
+ bands_used, req->rates, &chandef);
+ local->hw_scan_req->req.ie_len = ielen;
+ local->hw_scan_req->req.no_cck = req->no_cck;
+ ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr);
+ ether_addr_copy(local->hw_scan_req->req.mac_addr_mask,
+ req->mac_addr_mask);
+
+ return true;
+}
+
+static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ bool hw_scan = local->ops->hw_scan;
+ bool was_scanning = local->scanning;
+ struct cfg80211_scan_request *scan_req;
+ struct ieee80211_sub_if_data *scan_sdata;
+
+ lockdep_assert_held(&local->mtx);
+
+ /*
+ * It's ok to abort a not-yet-running scan (that
+ * we have one at all will be verified by checking
+ * local->scan_req next), but not to complete it
+ * successfully.
+ */
+ if (WARN_ON(!local->scanning && !aborted))
+ aborted = true;
+
+ if (WARN_ON(!local->scan_req))
+ return;
+
+ if (hw_scan && !aborted &&
+ !(local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) &&
+ ieee80211_prep_hw_scan(local)) {
+ int rc;
+
+ rc = drv_hw_scan(local,
+ rcu_dereference_protected(local->scan_sdata,
+ lockdep_is_held(&local->mtx)),
+ local->hw_scan_req);
+
+ if (rc == 0)
+ return;
+ }
+
+ kfree(local->hw_scan_req);
+ local->hw_scan_req = NULL;
+
+ scan_req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ if (scan_req != local->int_scan_req)
+ cfg80211_scan_done(scan_req, aborted);
+ RCU_INIT_POINTER(local->scan_req, NULL);
+
+ scan_sdata = rcu_dereference_protected(local->scan_sdata,
+ lockdep_is_held(&local->mtx));
+ RCU_INIT_POINTER(local->scan_sdata, NULL);
+
+ local->scanning = 0;
+ local->scan_chandef.chan = NULL;
+
+ /* Set power back to normal operating levels. */
+ ieee80211_hw_config(local, 0);
+
+ if (!hw_scan) {
+ ieee80211_configure_filter(local);
+ drv_sw_scan_complete(local, scan_sdata);
+ ieee80211_offchannel_return(local);
+ }
+
+ ieee80211_recalc_idle(local);
+
+ ieee80211_mlme_notify_scan_completed(local);
+ ieee80211_ibss_notify_scan_completed(local);
+ ieee80211_mesh_notify_scan_completed(local);
+ if (was_scanning)
+ ieee80211_start_next_roc(local);
+}
+
+void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_scan_completed(local, aborted);
+
+ set_bit(SCAN_COMPLETED, &local->scanning);
+ if (aborted)
+ set_bit(SCAN_ABORTED, &local->scanning);
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+}
+EXPORT_SYMBOL(ieee80211_scan_completed);
+
+static int ieee80211_start_sw_scan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ /* Software scan is not supported in multi-channel cases */
+ if (local->use_chanctx)
+ return -EOPNOTSUPP;
+
+ /*
+ * Hardware/driver doesn't support hw_scan, so use software
+ * scanning instead. First send a nullfunc frame with power save
+ * bit on so that AP will buffer the frames for us while we are not
+ * listening, then send probe requests to each channel and wait for
+ * the responses. After all channels are scanned, tune back to the
+ * original channel and send a nullfunc frame with power save bit
+ * off to trigger the AP to send us all the buffered frames.
+ *
+ * Note that while local->sw_scanning is true everything else but
+ * nullfunc frames and probe requests will be dropped in
+ * ieee80211_tx_h_check_assoc().
+ */
+ drv_sw_scan_start(local, sdata, local->scan_addr);
+
+ local->leave_oper_channel_time = jiffies;
+ local->next_scan_state = SCAN_DECISION;
+ local->scan_channel_idx = 0;
+
+ ieee80211_offchannel_stop_vifs(local);
+
+ /* ensure nullfunc is transmitted before leaving operating channel */
+ ieee80211_flush_queues(local, NULL, false);
+
+ ieee80211_configure_filter(local);
+
+ /* We need to set power level at maximum rate for scanning. */
+ ieee80211_hw_config(local, 0);
+
+ ieee80211_queue_delayed_work(&local->hw,
+ &local->scan_work, 0);
+
+ return 0;
+}
+
+static bool ieee80211_can_scan(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ if (ieee80211_is_radar_required(local))
+ return false;
+
+ if (!list_empty(&local->roc_list))
+ return false;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL)
+ return false;
+
+ return true;
+}
+
+void ieee80211_run_deferred_scan(struct ieee80211_local *local)
+{
+ lockdep_assert_held(&local->mtx);
+
+ if (!local->scan_req || local->scanning)
+ return;
+
+ if (!ieee80211_can_scan(local,
+ rcu_dereference_protected(
+ local->scan_sdata,
+ lockdep_is_held(&local->mtx))))
+ return;
+
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work,
+ round_jiffies_relative(0));
+}
+
+static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
+ unsigned long *next_delay)
+{
+ int i;
+ struct ieee80211_sub_if_data *sdata;
+ struct cfg80211_scan_request *scan_req;
+ enum ieee80211_band band = local->hw.conf.chandef.chan->band;
+ u32 tx_flags;
+
+ scan_req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
+ if (scan_req->no_cck)
+ tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
+
+ sdata = rcu_dereference_protected(local->scan_sdata,
+ lockdep_is_held(&local->mtx));
+
+ for (i = 0; i < scan_req->n_ssids; i++)
+ ieee80211_send_probe_req(
+ sdata, local->scan_addr, NULL,
+ scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len,
+ scan_req->ie, scan_req->ie_len,
+ scan_req->rates[band], false,
+ tx_flags, local->hw.conf.chandef.chan, true);
+
+ /*
+ * After sending probe requests, wait for probe responses
+ * on the channel.
+ */
+ *next_delay = IEEE80211_CHANNEL_TIME;
+ local->next_scan_state = SCAN_DECISION;
+}
+
+static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_scan_request *req)
+{
+ struct ieee80211_local *local = sdata->local;
+ int rc;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (local->scan_req || ieee80211_is_radar_required(local))
+ return -EBUSY;
+
+ if (!ieee80211_can_scan(local, sdata)) {
+ /* wait for the work to finish/time out */
+ rcu_assign_pointer(local->scan_req, req);
+ rcu_assign_pointer(local->scan_sdata, sdata);
+ return 0;
+ }
+
+ if (local->ops->hw_scan) {
+ u8 *ies;
+
+ local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len;
+
+ if (local->hw.flags & IEEE80211_SINGLE_HW_SCAN_ON_ALL_BANDS) {
+ int i, n_bands = 0;
+ u8 bands_counted = 0;
+
+ for (i = 0; i < req->n_channels; i++) {
+ if (bands_counted & BIT(req->channels[i]->band))
+ continue;
+ bands_counted |= BIT(req->channels[i]->band);
+ n_bands++;
+ }
+
+ local->hw_scan_ies_bufsize *= n_bands;
+ }
+
+ local->hw_scan_req = kmalloc(
+ sizeof(*local->hw_scan_req) +
+ req->n_channels * sizeof(req->channels[0]) +
+ local->hw_scan_ies_bufsize, GFP_KERNEL);
+ if (!local->hw_scan_req)
+ return -ENOMEM;
+
+ local->hw_scan_req->req.ssids = req->ssids;
+ local->hw_scan_req->req.n_ssids = req->n_ssids;
+ ies = (u8 *)local->hw_scan_req +
+ sizeof(*local->hw_scan_req) +
+ req->n_channels * sizeof(req->channels[0]);
+ local->hw_scan_req->req.ie = ies;
+ local->hw_scan_req->req.flags = req->flags;
+
+ local->hw_scan_band = 0;
+
+ /*
+ * After allocating local->hw_scan_req, we must
+ * go through until ieee80211_prep_hw_scan(), so
+ * anything that might be changed here and leave
+ * this function early must not go after this
+ * allocation.
+ */
+ }
+
+ rcu_assign_pointer(local->scan_req, req);
+ rcu_assign_pointer(local->scan_sdata, sdata);
+
+ if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)
+ get_random_mask_addr(local->scan_addr,
+ req->mac_addr,
+ req->mac_addr_mask);
+ else
+ memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN);
+
+ if (local->ops->hw_scan) {
+ __set_bit(SCAN_HW_SCANNING, &local->scanning);
+ } else if ((req->n_channels == 1) &&
+ (req->channels[0] == local->_oper_chandef.chan)) {
+ /*
+ * If we are scanning only on the operating channel
+ * then we do not need to stop normal activities
+ */
+ unsigned long next_delay;
+
+ __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning);
+
+ ieee80211_recalc_idle(local);
+
+ /* Notify driver scan is starting, keep order of operations
+ * same as normal software scan, in case that matters. */
+ drv_sw_scan_start(local, sdata, local->scan_addr);
+
+ ieee80211_configure_filter(local); /* accept probe-responses */
+
+ /* We need to ensure power level is at max for scanning. */
+ ieee80211_hw_config(local, 0);
+
+ if ((req->channels[0]->flags &
+ IEEE80211_CHAN_NO_IR) ||
+ !req->n_ssids) {
+ next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
+ } else {
+ ieee80211_scan_state_send_probe(local, &next_delay);
+ next_delay = IEEE80211_CHANNEL_TIME;
+ }
+
+ /* Now, just wait a bit and we are all done! */
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work,
+ next_delay);
+ return 0;
+ } else {
+ /* Do normal software scan */
+ __set_bit(SCAN_SW_SCANNING, &local->scanning);
+ }
+
+ ieee80211_recalc_idle(local);
+
+ if (local->ops->hw_scan) {
+ WARN_ON(!ieee80211_prep_hw_scan(local));
+ rc = drv_hw_scan(local, sdata, local->hw_scan_req);
+ } else {
+ rc = ieee80211_start_sw_scan(local, sdata);
+ }
+
+ if (rc) {
+ kfree(local->hw_scan_req);
+ local->hw_scan_req = NULL;
+ local->scanning = 0;
+
+ ieee80211_recalc_idle(local);
+
+ local->scan_req = NULL;
+ RCU_INIT_POINTER(local->scan_sdata, NULL);
+ }
+
+ return rc;
+}
+
+static unsigned long
+ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
+{
+ /*
+ * TODO: channel switching also consumes quite some time,
+ * add that delay as well to get a better estimation
+ */
+ if (chan->flags & IEEE80211_CHAN_NO_IR)
+ return IEEE80211_PASSIVE_CHANNEL_TIME;
+ return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
+}
+
+static void ieee80211_scan_state_decision(struct ieee80211_local *local,
+ unsigned long *next_delay)
+{
+ bool associated = false;
+ bool tx_empty = true;
+ bool bad_latency;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_channel *next_chan;
+ enum mac80211_scan_state next_scan_state;
+ struct cfg80211_scan_request *scan_req;
+
+ /*
+ * check if at least one STA interface is associated,
+ * check if at least one STA interface has pending tx frames
+ * and grab the lowest used beacon interval
+ */
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.associated) {
+ associated = true;
+
+ if (!qdisc_all_tx_empty(sdata->dev)) {
+ tx_empty = false;
+ break;
+ }
+ }
+ }
+ }
+ mutex_unlock(&local->iflist_mtx);
+
+ scan_req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ next_chan = scan_req->channels[local->scan_channel_idx];
+
+ /*
+ * we're currently scanning a different channel, let's
+ * see if we can scan another channel without interfering
+ * with the current traffic situation.
+ *
+ * Keep good latency, do not stay off-channel more than 125 ms.
+ */
+
+ bad_latency = time_after(jiffies +
+ ieee80211_scan_get_channel_time(next_chan),
+ local->leave_oper_channel_time + HZ / 8);
+
+ if (associated && !tx_empty) {
+ if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY)
+ next_scan_state = SCAN_ABORT;
+ else
+ next_scan_state = SCAN_SUSPEND;
+ } else if (associated && bad_latency) {
+ next_scan_state = SCAN_SUSPEND;
+ } else {
+ next_scan_state = SCAN_SET_CHANNEL;
+ }
+
+ local->next_scan_state = next_scan_state;
+
+ *next_delay = 0;
+}
+
+static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
+ unsigned long *next_delay)
+{
+ int skip;
+ struct ieee80211_channel *chan;
+ enum nl80211_bss_scan_width oper_scan_width;
+ struct cfg80211_scan_request *scan_req;
+
+ scan_req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ skip = 0;
+ chan = scan_req->channels[local->scan_channel_idx];
+
+ local->scan_chandef.chan = chan;
+ local->scan_chandef.center_freq1 = chan->center_freq;
+ local->scan_chandef.center_freq2 = 0;
+ switch (scan_req->scan_width) {
+ case NL80211_BSS_CHAN_WIDTH_5:
+ local->scan_chandef.width = NL80211_CHAN_WIDTH_5;
+ break;
+ case NL80211_BSS_CHAN_WIDTH_10:
+ local->scan_chandef.width = NL80211_CHAN_WIDTH_10;
+ break;
+ case NL80211_BSS_CHAN_WIDTH_20:
+ /* If scanning on oper channel, use whatever channel-type
+ * is currently in use.
+ */
+ oper_scan_width = cfg80211_chandef_to_scan_width(
+ &local->_oper_chandef);
+ if (chan == local->_oper_chandef.chan &&
+ oper_scan_width == scan_req->scan_width)
+ local->scan_chandef = local->_oper_chandef;
+ else
+ local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
+ break;
+ }
+
+ if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
+ skip = 1;
+
+ /* advance state machine to next channel/band */
+ local->scan_channel_idx++;
+
+ if (skip) {
+ /* if we skip this channel return to the decision state */
+ local->next_scan_state = SCAN_DECISION;
+ return;
+ }
+
+ /*
+ * Probe delay is used to update the NAV, cf. 11.1.3.2.2
+ * (which unfortunately doesn't say _why_ step a) is done,
+ * but it waits for the probe delay or until a frame is
+ * received - and the received frame would update the NAV).
+ * For now, we do not support waiting until a frame is
+ * received.
+ *
+ * In any case, it is not necessary for a passive scan.
+ */
+ if (chan->flags & IEEE80211_CHAN_NO_IR || !scan_req->n_ssids) {
+ *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
+ local->next_scan_state = SCAN_DECISION;
+ return;
+ }
+
+ /* active scan, send probes */
+ *next_delay = IEEE80211_PROBE_DELAY;
+ local->next_scan_state = SCAN_SEND_PROBE;
+}
+
+static void ieee80211_scan_state_suspend(struct ieee80211_local *local,
+ unsigned long *next_delay)
+{
+ /* switch back to the operating channel */
+ local->scan_chandef.chan = NULL;
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+
+ /* disable PS */
+ ieee80211_offchannel_return(local);
+
+ *next_delay = HZ / 5;
+ /* afterwards, resume scan & go to next channel */
+ local->next_scan_state = SCAN_RESUME;
+}
+
+static void ieee80211_scan_state_resume(struct ieee80211_local *local,
+ unsigned long *next_delay)
+{
+ ieee80211_offchannel_stop_vifs(local);
+
+ if (local->ops->flush) {
+ ieee80211_flush_queues(local, NULL, false);
+ *next_delay = 0;
+ } else
+ *next_delay = HZ / 10;
+
+ /* remember when we left the operating channel */
+ local->leave_oper_channel_time = jiffies;
+
+ /* advance to the next channel to be scanned */
+ local->next_scan_state = SCAN_SET_CHANNEL;
+}
+
+void ieee80211_scan_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, scan_work.work);
+ struct ieee80211_sub_if_data *sdata;
+ struct cfg80211_scan_request *scan_req;
+ unsigned long next_delay = 0;
+ bool aborted;
+
+ mutex_lock(&local->mtx);
+
+ if (!ieee80211_can_run_worker(local)) {
+ aborted = true;
+ goto out_complete;
+ }
+
+ sdata = rcu_dereference_protected(local->scan_sdata,
+ lockdep_is_held(&local->mtx));
+ scan_req = rcu_dereference_protected(local->scan_req,
+ lockdep_is_held(&local->mtx));
+
+ /* When scanning on-channel, the first-callback means completed. */
+ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) {
+ aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
+ goto out_complete;
+ }
+
+ if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) {
+ aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
+ goto out_complete;
+ }
+
+ if (!sdata || !scan_req)
+ goto out;
+
+ if (!local->scanning) {
+ int rc;
+
+ RCU_INIT_POINTER(local->scan_req, NULL);
+ RCU_INIT_POINTER(local->scan_sdata, NULL);
+
+ rc = __ieee80211_start_scan(sdata, scan_req);
+ if (rc) {
+ /* need to complete scan in cfg80211 */
+ rcu_assign_pointer(local->scan_req, scan_req);
+ aborted = true;
+ goto out_complete;
+ } else
+ goto out;
+ }
+
+ /*
+ * as long as no delay is required advance immediately
+ * without scheduling a new work
+ */
+ do {
+ if (!ieee80211_sdata_running(sdata)) {
+ aborted = true;
+ goto out_complete;
+ }
+
+ switch (local->next_scan_state) {
+ case SCAN_DECISION:
+ /* if no more bands/channels left, complete scan */
+ if (local->scan_channel_idx >= scan_req->n_channels) {
+ aborted = false;
+ goto out_complete;
+ }
+ ieee80211_scan_state_decision(local, &next_delay);
+ break;
+ case SCAN_SET_CHANNEL:
+ ieee80211_scan_state_set_channel(local, &next_delay);
+ break;
+ case SCAN_SEND_PROBE:
+ ieee80211_scan_state_send_probe(local, &next_delay);
+ break;
+ case SCAN_SUSPEND:
+ ieee80211_scan_state_suspend(local, &next_delay);
+ break;
+ case SCAN_RESUME:
+ ieee80211_scan_state_resume(local, &next_delay);
+ break;
+ case SCAN_ABORT:
+ aborted = true;
+ goto out_complete;
+ }
+ } while (next_delay == 0);
+
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay);
+ goto out;
+
+out_complete:
+ __ieee80211_scan_completed(&local->hw, aborted);
+out:
+ mutex_unlock(&local->mtx);
+}
+
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_scan_request *req)
+{
+ int res;
+
+ mutex_lock(&sdata->local->mtx);
+ res = __ieee80211_start_scan(sdata, req);
+ mutex_unlock(&sdata->local->mtx);
+
+ return res;
+}
+
+int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
+ const u8 *ssid, u8 ssid_len,
+ struct ieee80211_channel **channels,
+ unsigned int n_channels,
+ enum nl80211_bss_scan_width scan_width)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret = -EBUSY, i, n_ch = 0;
+ enum ieee80211_band band;
+
+ mutex_lock(&local->mtx);
+
+ /* busy scanning */
+ if (local->scan_req)
+ goto unlock;
+
+ /* fill internal scan request */
+ if (!channels) {
+ int max_n;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ if (!local->hw.wiphy->bands[band])
+ continue;
+
+ max_n = local->hw.wiphy->bands[band]->n_channels;
+ for (i = 0; i < max_n; i++) {
+ struct ieee80211_channel *tmp_ch =
+ &local->hw.wiphy->bands[band]->channels[i];
+
+ if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR |
+ IEEE80211_CHAN_DISABLED))
+ continue;
+
+ local->int_scan_req->channels[n_ch] = tmp_ch;
+ n_ch++;
+ }
+ }
+
+ if (WARN_ON_ONCE(n_ch == 0))
+ goto unlock;
+
+ local->int_scan_req->n_channels = n_ch;
+ } else {
+ for (i = 0; i < n_channels; i++) {
+ if (channels[i]->flags & (IEEE80211_CHAN_NO_IR |
+ IEEE80211_CHAN_DISABLED))
+ continue;
+
+ local->int_scan_req->channels[n_ch] = channels[i];
+ n_ch++;
+ }
+
+ if (WARN_ON_ONCE(n_ch == 0))
+ goto unlock;
+
+ local->int_scan_req->n_channels = n_ch;
+ }
+
+ local->int_scan_req->ssids = &local->scan_ssid;
+ local->int_scan_req->n_ssids = 1;
+ local->int_scan_req->scan_width = scan_width;
+ memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN);
+ local->int_scan_req->ssids[0].ssid_len = ssid_len;
+
+ ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req);
+ unlock:
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+/*
+ * Only call this function when a scan can't be queued -- under RTNL.
+ */
+void ieee80211_scan_cancel(struct ieee80211_local *local)
+{
+ /*
+ * We are canceling software scan, or deferred scan that was not
+ * yet really started (see __ieee80211_start_scan ).
+ *
+ * Regarding hardware scan:
+ * - we can not call __ieee80211_scan_completed() as when
+ * SCAN_HW_SCANNING bit is set this function change
+ * local->hw_scan_req to operate on 5G band, what race with
+ * driver which can use local->hw_scan_req
+ *
+ * - we can not cancel scan_work since driver can schedule it
+ * by ieee80211_scan_completed(..., true) to finish scan
+ *
+ * Hence we only call the cancel_hw_scan() callback, but the low-level
+ * driver is still responsible for calling ieee80211_scan_completed()
+ * after the scan was completed/aborted.
+ */
+
+ mutex_lock(&local->mtx);
+ if (!local->scan_req)
+ goto out;
+
+ /*
+ * We have a scan running and the driver already reported completion,
+ * but the worker hasn't run yet or is stuck on the mutex - mark it as
+ * cancelled.
+ */
+ if (test_bit(SCAN_HW_SCANNING, &local->scanning) &&
+ test_bit(SCAN_COMPLETED, &local->scanning)) {
+ set_bit(SCAN_HW_CANCELLED, &local->scanning);
+ goto out;
+ }
+
+ if (test_bit(SCAN_HW_SCANNING, &local->scanning)) {
+ /*
+ * Make sure that __ieee80211_scan_completed doesn't trigger a
+ * scan on another band.
+ */
+ set_bit(SCAN_HW_CANCELLED, &local->scanning);
+ if (local->ops->cancel_hw_scan)
+ drv_cancel_hw_scan(local,
+ rcu_dereference_protected(local->scan_sdata,
+ lockdep_is_held(&local->mtx)));
+ goto out;
+ }
+
+ /*
+ * If the work is currently running, it must be blocked on
+ * the mutex, but we'll set scan_sdata = NULL and it'll
+ * simply exit once it acquires the mutex.
+ */
+ cancel_delayed_work(&local->scan_work);
+ /* and clean up */
+ __ieee80211_scan_completed(&local->hw, true);
+out:
+ mutex_unlock(&local->mtx);
+}
+
+int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_sched_scan_request *req)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_scan_ies sched_scan_ies = {};
+ struct cfg80211_chan_def chandef;
+ int ret, i, iebufsz, num_bands = 0;
+ u32 rate_masks[IEEE80211_NUM_BANDS] = {};
+ u8 bands_used = 0;
+ u8 *ie;
+ size_t len;
+
+ iebufsz = local->scan_ies_len + req->ie_len;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (!local->ops->sched_scan_start)
+ return -ENOTSUPP;
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ if (local->hw.wiphy->bands[i]) {
+ bands_used |= BIT(i);
+ rate_masks[i] = (u32) -1;
+ num_bands++;
+ }
+ }
+
+ ie = kzalloc(num_bands * iebufsz, GFP_KERNEL);
+ if (!ie) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+
+ len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
+ &sched_scan_ies, req->ie,
+ req->ie_len, bands_used,
+ rate_masks, &chandef);
+
+ ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
+ if (ret == 0) {
+ rcu_assign_pointer(local->sched_scan_sdata, sdata);
+ rcu_assign_pointer(local->sched_scan_req, req);
+ }
+
+ kfree(ie);
+
+out:
+ if (ret) {
+ /* Clean in case of failure after HW restart or upon resume. */
+ RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
+ RCU_INIT_POINTER(local->sched_scan_req, NULL);
+ }
+
+ return ret;
+}
+
+int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_sched_scan_request *req)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+
+ mutex_lock(&local->mtx);
+
+ if (rcu_access_pointer(local->sched_scan_sdata)) {
+ mutex_unlock(&local->mtx);
+ return -EBUSY;
+ }
+
+ ret = __ieee80211_request_sched_scan_start(sdata, req);
+
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret = 0;
+
+ mutex_lock(&local->mtx);
+
+ if (!local->ops->sched_scan_stop) {
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ /* We don't want to restart sched scan anymore. */
+ RCU_INIT_POINTER(local->sched_scan_req, NULL);
+
+ if (rcu_access_pointer(local->sched_scan_sdata)) {
+ ret = drv_sched_scan_stop(local, sdata);
+ if (!ret)
+ RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
+ }
+out:
+ mutex_unlock(&local->mtx);
+
+ return ret;
+}
+
+void ieee80211_sched_scan_results(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_sched_scan_results(local);
+
+ cfg80211_sched_scan_results(hw->wiphy);
+}
+EXPORT_SYMBOL(ieee80211_sched_scan_results);
+
+void ieee80211_sched_scan_end(struct ieee80211_local *local)
+{
+ mutex_lock(&local->mtx);
+
+ if (!rcu_access_pointer(local->sched_scan_sdata)) {
+ mutex_unlock(&local->mtx);
+ return;
+ }
+
+ RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
+
+ /* If sched scan was aborted by the driver. */
+ RCU_INIT_POINTER(local->sched_scan_req, NULL);
+
+ mutex_unlock(&local->mtx);
+
+ cfg80211_sched_scan_stopped(local->hw.wiphy);
+}
+
+void ieee80211_sched_scan_stopped_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local,
+ sched_scan_stopped_work);
+
+ ieee80211_sched_scan_end(local);
+}
+
+void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_sched_scan_stopped(local);
+
+ schedule_work(&local->sched_scan_stopped_work);
+}
+EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
new file mode 100644
index 000000000..06e6ac8cc
--- /dev/null
+++ b/net/mac80211/spectmgmt.c
@@ -0,0 +1,243 @@
+/*
+ * spectrum management
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2008, Intel Corporation
+ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "wme.h"
+
+int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band current_band,
+ u32 sta_flags, u8 *bssid,
+ struct ieee80211_csa_ie *csa_ie)
+{
+ enum ieee80211_band new_band;
+ int new_freq;
+ u8 new_chan_no;
+ struct ieee80211_channel *new_chan;
+ struct cfg80211_chan_def new_vht_chandef = {};
+ const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
+ const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
+ int secondary_channel_offset = -1;
+
+ sec_chan_offs = elems->sec_chan_offs;
+ wide_bw_chansw_ie = elems->wide_bw_chansw_ie;
+
+ if (sta_flags & (IEEE80211_STA_DISABLE_HT |
+ IEEE80211_STA_DISABLE_40MHZ)) {
+ sec_chan_offs = NULL;
+ wide_bw_chansw_ie = NULL;
+ }
+
+ if (sta_flags & IEEE80211_STA_DISABLE_VHT)
+ wide_bw_chansw_ie = NULL;
+
+ if (elems->ext_chansw_ie) {
+ if (!ieee80211_operating_class_to_band(
+ elems->ext_chansw_ie->new_operating_class,
+ &new_band)) {
+ sdata_info(sdata,
+ "cannot understand ECSA IE operating class %d, disconnecting\n",
+ elems->ext_chansw_ie->new_operating_class);
+ return -EINVAL;
+ }
+ new_chan_no = elems->ext_chansw_ie->new_ch_num;
+ csa_ie->count = elems->ext_chansw_ie->count;
+ csa_ie->mode = elems->ext_chansw_ie->mode;
+ } else if (elems->ch_switch_ie) {
+ new_band = current_band;
+ new_chan_no = elems->ch_switch_ie->new_ch_num;
+ csa_ie->count = elems->ch_switch_ie->count;
+ csa_ie->mode = elems->ch_switch_ie->mode;
+ } else {
+ /* nothing here we understand */
+ return 1;
+ }
+
+ /* Mesh Channel Switch Parameters Element */
+ if (elems->mesh_chansw_params_ie) {
+ csa_ie->ttl = elems->mesh_chansw_params_ie->mesh_ttl;
+ csa_ie->mode = elems->mesh_chansw_params_ie->mesh_flags;
+ csa_ie->pre_value = le16_to_cpu(
+ elems->mesh_chansw_params_ie->mesh_pre_value);
+ }
+
+ new_freq = ieee80211_channel_to_frequency(new_chan_no, new_band);
+ new_chan = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
+ if (!new_chan || new_chan->flags & IEEE80211_CHAN_DISABLED) {
+ sdata_info(sdata,
+ "BSS %pM switches to unsupported channel (%d MHz), disconnecting\n",
+ bssid, new_freq);
+ return -EINVAL;
+ }
+
+ if (sec_chan_offs) {
+ secondary_channel_offset = sec_chan_offs->sec_chan_offs;
+ } else if (!(sta_flags & IEEE80211_STA_DISABLE_HT)) {
+ /* If the secondary channel offset IE is not present,
+ * we can't know what's the post-CSA offset, so the
+ * best we can do is use 20MHz.
+ */
+ secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
+ }
+
+ switch (secondary_channel_offset) {
+ default:
+ /* secondary_channel_offset was present but is invalid */
+ case IEEE80211_HT_PARAM_CHA_SEC_NONE:
+ cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+ NL80211_CHAN_HT20);
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+ NL80211_CHAN_HT40PLUS);
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+ NL80211_CHAN_HT40MINUS);
+ break;
+ case -1:
+ cfg80211_chandef_create(&csa_ie->chandef, new_chan,
+ NL80211_CHAN_NO_HT);
+ /* keep width for 5/10 MHz channels */
+ switch (sdata->vif.bss_conf.chandef.width) {
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ csa_ie->chandef.width =
+ sdata->vif.bss_conf.chandef.width;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+
+ if (wide_bw_chansw_ie) {
+ new_vht_chandef.chan = new_chan;
+ new_vht_chandef.center_freq1 =
+ ieee80211_channel_to_frequency(
+ wide_bw_chansw_ie->new_center_freq_seg0,
+ new_band);
+
+ switch (wide_bw_chansw_ie->new_channel_width) {
+ default:
+ /* hmmm, ignore VHT and use HT if present */
+ case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ new_vht_chandef.chan = NULL;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80MHZ:
+ new_vht_chandef.width = NL80211_CHAN_WIDTH_80;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_160MHZ:
+ new_vht_chandef.width = NL80211_CHAN_WIDTH_160;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+ /* field is otherwise reserved */
+ new_vht_chandef.center_freq2 =
+ ieee80211_channel_to_frequency(
+ wide_bw_chansw_ie->new_center_freq_seg1,
+ new_band);
+ new_vht_chandef.width = NL80211_CHAN_WIDTH_80P80;
+ break;
+ }
+ if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ &&
+ new_vht_chandef.width == NL80211_CHAN_WIDTH_80P80)
+ ieee80211_chandef_downgrade(&new_vht_chandef);
+ if (sta_flags & IEEE80211_STA_DISABLE_160MHZ &&
+ new_vht_chandef.width == NL80211_CHAN_WIDTH_160)
+ ieee80211_chandef_downgrade(&new_vht_chandef);
+ if (sta_flags & IEEE80211_STA_DISABLE_40MHZ &&
+ new_vht_chandef.width > NL80211_CHAN_WIDTH_20)
+ ieee80211_chandef_downgrade(&new_vht_chandef);
+ }
+
+ /* if VHT data is there validate & use it */
+ if (new_vht_chandef.chan) {
+ if (!cfg80211_chandef_compatible(&new_vht_chandef,
+ &csa_ie->chandef)) {
+ sdata_info(sdata,
+ "BSS %pM: CSA has inconsistent channel data, disconnecting\n",
+ bssid);
+ return -EINVAL;
+ }
+ csa_ie->chandef = new_vht_chandef;
+ }
+
+ return 0;
+}
+
+static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_msrment_ie *request_ie,
+ const u8 *da, const u8 *bssid,
+ u8 dialog_token)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *msr_report;
+
+ skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
+ sizeof(struct ieee80211_msrment_ie));
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
+ memset(msr_report, 0, 24);
+ memcpy(msr_report->da, da, ETH_ALEN);
+ memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(msr_report->bssid, bssid, ETH_ALEN);
+ msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
+ msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
+ msr_report->u.action.u.measurement.action_code =
+ WLAN_ACTION_SPCT_MSR_RPRT;
+ msr_report->u.action.u.measurement.dialog_token = dialog_token;
+
+ msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
+ msr_report->u.action.u.measurement.length =
+ sizeof(struct ieee80211_msrment_ie);
+
+ memset(&msr_report->u.action.u.measurement.msr_elem, 0,
+ sizeof(struct ieee80211_msrment_ie));
+ msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
+ msr_report->u.action.u.measurement.msr_elem.mode |=
+ IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
+ msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ /*
+ * Ignoring measurement request is spec violation.
+ * Mandatory measurements must be reported optional
+ * measurements might be refused or reported incapable
+ * For now just refuse
+ * TODO: Answer basic measurement as unmeasured
+ */
+ ieee80211_send_refuse_measurement_request(sdata,
+ &mgmt->u.action.u.measurement.msr_elem,
+ mgmt->sa, mgmt->bssid,
+ mgmt->u.action.u.measurement.dialog_token);
+}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
new file mode 100644
index 000000000..2880f2ae9
--- /dev/null
+++ b/net/mac80211/sta_info.c
@@ -0,0 +1,2007 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/timer.h>
+#include <linux/rtnetlink.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "sta_info.h"
+#include "debugfs_sta.h"
+#include "mesh.h"
+#include "wme.h"
+
+/**
+ * DOC: STA information lifetime rules
+ *
+ * STA info structures (&struct sta_info) are managed in a hash table
+ * for faster lookup and a list for iteration. They are managed using
+ * RCU, i.e. access to the list and hash table is protected by RCU.
+ *
+ * Upon allocating a STA info structure with sta_info_alloc(), the caller
+ * owns that structure. It must then insert it into the hash table using
+ * either sta_info_insert() or sta_info_insert_rcu(); only in the latter
+ * case (which acquires an rcu read section but must not be called from
+ * within one) will the pointer still be valid after the call. Note that
+ * the caller may not do much with the STA info before inserting it, in
+ * particular, it may not start any mesh peer link management or add
+ * encryption keys.
+ *
+ * When the insertion fails (sta_info_insert()) returns non-zero), the
+ * structure will have been freed by sta_info_insert()!
+ *
+ * Station entries are added by mac80211 when you establish a link with a
+ * peer. This means different things for the different type of interfaces
+ * we support. For a regular station this mean we add the AP sta when we
+ * receive an association response from the AP. For IBSS this occurs when
+ * get to know about a peer on the same IBSS. For WDS we add the sta for
+ * the peer immediately upon device open. When using AP mode we add stations
+ * for each respective station upon request from userspace through nl80211.
+ *
+ * In order to remove a STA info structure, various sta_info_destroy_*()
+ * calls are available.
+ *
+ * There is no concept of ownership on a STA entry, each structure is
+ * owned by the global hash table/list until it is removed. All users of
+ * the structure need to be RCU protected so that the structure won't be
+ * freed before they are done using it.
+ */
+
+static const struct rhashtable_params sta_rht_params = {
+ .nelem_hint = 3, /* start small */
+ .automatic_shrinking = true,
+ .head_offset = offsetof(struct sta_info, hash_node),
+ .key_offset = offsetof(struct sta_info, sta.addr),
+ .key_len = ETH_ALEN,
+ .hashfn = sta_addr_hash,
+};
+
+/* Caller must hold local->sta_mtx */
+static int sta_info_hash_del(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
+}
+
+static void __cleanup_single_sta(struct sta_info *sta)
+{
+ int ac, i;
+ struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct ps_data *ps;
+
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
+ test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
+ test_sta_flag(sta, WLAN_STA_PS_DELIVER)) {
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ ps = &sdata->bss->ps;
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ ps = &sdata->u.mesh.ps;
+ else
+ return;
+
+ clear_sta_flag(sta, WLAN_STA_PS_STA);
+ clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ clear_sta_flag(sta, WLAN_STA_PS_DELIVER);
+
+ atomic_dec(&ps->num_sta_ps);
+ }
+
+ if (sta->sta.txq[0]) {
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
+ int n = skb_queue_len(&txqi->queue);
+
+ ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+ atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
+ }
+ }
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]);
+ ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]);
+ ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]);
+ }
+
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ mesh_sta_cleanup(sta);
+
+ cancel_work_sync(&sta->drv_deliver_wk);
+
+ /*
+ * Destroy aggregation state here. It would be nice to wait for the
+ * driver to finish aggregation stop and then clean up, but for now
+ * drivers have to handle aggregation stop being requested, followed
+ * directly by station destruction.
+ */
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ kfree(sta->ampdu_mlme.tid_start_tx[i]);
+ tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]);
+ if (!tid_tx)
+ continue;
+ ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending);
+ kfree(tid_tx);
+ }
+}
+
+static void cleanup_single_sta(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+
+ __cleanup_single_sta(sta);
+ sta_info_free(local, sta);
+}
+
+/* protected by RCU */
+struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct rhash_head *tmp;
+ const struct bucket_table *tbl;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
+
+ for_each_sta_info(local, tbl, addr, sta, tmp) {
+ if (sta->sdata == sdata) {
+ rcu_read_unlock();
+ /* this is safe as the caller must already hold
+ * another rcu read section or the mutex
+ */
+ return sta;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * Get sta info either from the specified interface
+ * or from one of its vlans
+ */
+struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct rhash_head *tmp;
+ const struct bucket_table *tbl;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
+
+ for_each_sta_info(local, tbl, addr, sta, tmp) {
+ if (sta->sdata == sdata ||
+ (sta->sdata->bss && sta->sdata->bss == sdata->bss)) {
+ rcu_read_unlock();
+ /* this is safe as the caller must already hold
+ * another rcu read section or the mutex
+ */
+ return sta;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
+ int idx)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int i = 0;
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata)
+ continue;
+ if (i < idx) {
+ ++i;
+ continue;
+ }
+ return sta;
+ }
+
+ return NULL;
+}
+
+/**
+ * sta_info_free - free STA
+ *
+ * @local: pointer to the global information
+ * @sta: STA info to free
+ *
+ * This function must undo everything done by sta_info_alloc()
+ * that may happen before sta_info_insert(). It may only be
+ * called when sta_info_insert() has not been attempted (and
+ * if that fails, the station is freed anyway.)
+ */
+void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
+{
+ if (sta->rate_ctrl)
+ rate_control_free_sta(sta);
+
+ sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr);
+
+ if (sta->sta.txq[0])
+ kfree(to_txq_info(sta->sta.txq[0]));
+ kfree(rcu_dereference_raw(sta->sta.rates));
+ kfree(sta);
+}
+
+/* Caller must hold local->sta_mtx */
+static void sta_info_hash_add(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
+}
+
+static void sta_deliver_ps_frames(struct work_struct *wk)
+{
+ struct sta_info *sta;
+
+ sta = container_of(wk, struct sta_info, drv_deliver_wk);
+
+ if (sta->dead)
+ return;
+
+ local_bh_disable();
+ if (!test_sta_flag(sta, WLAN_STA_PS_STA))
+ ieee80211_sta_ps_deliver_wakeup(sta);
+ else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL))
+ ieee80211_sta_ps_deliver_poll_response(sta);
+ else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD))
+ ieee80211_sta_ps_deliver_uapsd(sta);
+ local_bh_enable();
+}
+
+static int sta_prepare_rate_control(struct ieee80211_local *local,
+ struct sta_info *sta, gfp_t gfp)
+{
+ if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+ return 0;
+
+ sta->rate_ctrl = local->rate_ctrl;
+ sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
+ &sta->sta, gfp);
+ if (!sta->rate_ctrl_priv)
+ return -ENOMEM;
+
+ return 0;
+}
+
+struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr, gfp_t gfp)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_hw *hw = &local->hw;
+ struct sta_info *sta;
+ struct timespec uptime;
+ int i;
+
+ sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
+ if (!sta)
+ return NULL;
+
+ spin_lock_init(&sta->lock);
+ spin_lock_init(&sta->ps_lock);
+ INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
+ INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
+ mutex_init(&sta->ampdu_mlme.mtx);
+#ifdef CONFIG_MAC80211_MESH
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ !sdata->u.mesh.user_mpm)
+ init_timer(&sta->plink_timer);
+ sta->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
+#endif
+
+ memcpy(sta->sta.addr, addr, ETH_ALEN);
+ sta->local = local;
+ sta->sdata = sdata;
+ sta->last_rx = jiffies;
+
+ sta->sta_state = IEEE80211_STA_NONE;
+
+ /* Mark TID as unreserved */
+ sta->reserved_tid = IEEE80211_TID_UNRESERVED;
+
+ ktime_get_ts(&uptime);
+ sta->last_connected = uptime.tv_sec;
+ ewma_init(&sta->avg_signal, 1024, 8);
+ for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++)
+ ewma_init(&sta->chain_signal_avg[i], 1024, 8);
+
+ if (local->ops->wake_tx_queue) {
+ void *txq_data;
+ int size = sizeof(struct txq_info) +
+ ALIGN(hw->txq_data_size, sizeof(void *));
+
+ txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp);
+ if (!txq_data)
+ goto free;
+
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct txq_info *txq = txq_data + i * size;
+
+ ieee80211_init_tx_queue(sdata, sta, txq, i);
+ }
+ }
+
+ if (sta_prepare_rate_control(local, sta, gfp))
+ goto free_txq;
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ /*
+ * timer_to_tid must be initialized with identity mapping
+ * to enable session_timer's data differentiation. See
+ * sta_rx_agg_session_timer_expired for usage.
+ */
+ sta->timer_to_tid[i] = i;
+ }
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ skb_queue_head_init(&sta->ps_tx_buf[i]);
+ skb_queue_head_init(&sta->tx_filtered[i]);
+ }
+
+ for (i = 0; i < IEEE80211_NUM_TIDS; i++)
+ sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
+
+ sta->sta.smps_mode = IEEE80211_SMPS_OFF;
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ struct ieee80211_supported_band *sband =
+ hw->wiphy->bands[ieee80211_get_sdata_band(sdata)];
+ u8 smps = (sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >>
+ IEEE80211_HT_CAP_SM_PS_SHIFT;
+ /*
+ * Assume that hostapd advertises our caps in the beacon and
+ * this is the known_smps_mode for a station that just assciated
+ */
+ switch (smps) {
+ case WLAN_HT_SMPS_CONTROL_DISABLED:
+ sta->known_smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ case WLAN_HT_SMPS_CONTROL_STATIC:
+ sta->known_smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case WLAN_HT_SMPS_CONTROL_DYNAMIC:
+ sta->known_smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ default:
+ WARN_ON(1);
+ }
+ }
+
+ sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);
+
+ return sta;
+
+free_txq:
+ if (sta->sta.txq[0])
+ kfree(to_txq_info(sta->sta.txq[0]));
+free:
+ kfree(sta);
+ return NULL;
+}
+
+static int sta_info_insert_check(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+ /*
+ * Can't be a WARN_ON because it can be triggered through a race:
+ * something inserts a STA (on one CPU) without holding the RTNL
+ * and another CPU turns off the net device.
+ */
+ if (unlikely(!ieee80211_sdata_running(sdata)))
+ return -ENETDOWN;
+
+ if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) ||
+ is_multicast_ether_addr(sta->sta.addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int sta_info_insert_drv_state(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
+{
+ enum ieee80211_sta_state state;
+ int err = 0;
+
+ for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) {
+ err = drv_sta_state(local, sdata, sta, state, state + 1);
+ if (err)
+ break;
+ }
+
+ if (!err) {
+ /*
+ * Drivers using legacy sta_add/sta_remove callbacks only
+ * get uploaded set to true after sta_add is called.
+ */
+ if (!local->ops->sta_add)
+ sta->uploaded = true;
+ return 0;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ sdata_info(sdata,
+ "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n",
+ sta->sta.addr, state + 1, err);
+ err = 0;
+ }
+
+ /* unwind on error */
+ for (; state > IEEE80211_STA_NOTEXIST; state--)
+ WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1));
+
+ return err;
+}
+
+/*
+ * should be called with sta_mtx locked
+ * this function replaces the mutex lock
+ * with a RCU lock
+ */
+static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
+{
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct station_info sinfo;
+ int err = 0;
+
+ lockdep_assert_held(&local->sta_mtx);
+
+ /* check if STA exists already */
+ if (sta_info_get_bss(sdata, sta->sta.addr)) {
+ err = -EEXIST;
+ goto out_err;
+ }
+
+ local->num_sta++;
+ local->sta_generation++;
+ smp_mb();
+
+ /* simplify things and don't accept BA sessions yet */
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+
+ /* make the station visible */
+ sta_info_hash_add(local, sta);
+
+ list_add_tail_rcu(&sta->list, &local->sta_list);
+
+ /* notify driver */
+ err = sta_info_insert_drv_state(local, sdata, sta);
+ if (err)
+ goto out_remove;
+
+ set_sta_flag(sta, WLAN_STA_INSERTED);
+ /* accept BA sessions now */
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+
+ ieee80211_recalc_min_chandef(sdata);
+ ieee80211_sta_debugfs_add(sta);
+ rate_control_add_sta_debugfs(sta);
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ sinfo.filled = 0;
+ sinfo.generation = local->sta_generation;
+ cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+
+ sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr);
+
+ /* move reference to rcu-protected */
+ rcu_read_lock();
+ mutex_unlock(&local->sta_mtx);
+
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ mesh_accept_plinks_update(sdata);
+
+ return 0;
+ out_remove:
+ sta_info_hash_del(local, sta);
+ list_del_rcu(&sta->list);
+ local->num_sta--;
+ synchronize_net();
+ __cleanup_single_sta(sta);
+ out_err:
+ mutex_unlock(&local->sta_mtx);
+ rcu_read_lock();
+ return err;
+}
+
+int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
+{
+ struct ieee80211_local *local = sta->local;
+ int err;
+
+ might_sleep();
+
+ err = sta_info_insert_check(sta);
+ if (err) {
+ rcu_read_lock();
+ goto out_free;
+ }
+
+ mutex_lock(&local->sta_mtx);
+
+ err = sta_info_insert_finish(sta);
+ if (err)
+ goto out_free;
+
+ return 0;
+ out_free:
+ sta_info_free(local, sta);
+ return err;
+}
+
+int sta_info_insert(struct sta_info *sta)
+{
+ int err = sta_info_insert_rcu(sta);
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+static inline void __bss_tim_set(u8 *tim, u16 id)
+{
+ /*
+ * This format has been mandated by the IEEE specifications,
+ * so this line may not be changed to use the __set_bit() format.
+ */
+ tim[id / 8] |= (1 << (id % 8));
+}
+
+static inline void __bss_tim_clear(u8 *tim, u16 id)
+{
+ /*
+ * This format has been mandated by the IEEE specifications,
+ * so this line may not be changed to use the __clear_bit() format.
+ */
+ tim[id / 8] &= ~(1 << (id % 8));
+}
+
+static inline bool __bss_tim_get(u8 *tim, u16 id)
+{
+ /*
+ * This format has been mandated by the IEEE specifications,
+ * so this line may not be changed to use the test_bit() format.
+ */
+ return tim[id / 8] & (1 << (id % 8));
+}
+
+static unsigned long ieee80211_tids_for_ac(int ac)
+{
+ /* If we ever support TIDs > 7, this obviously needs to be adjusted */
+ switch (ac) {
+ case IEEE80211_AC_VO:
+ return BIT(6) | BIT(7);
+ case IEEE80211_AC_VI:
+ return BIT(4) | BIT(5);
+ case IEEE80211_AC_BE:
+ return BIT(0) | BIT(3);
+ case IEEE80211_AC_BK:
+ return BIT(1) | BIT(2);
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+}
+
+static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
+{
+ struct ieee80211_local *local = sta->local;
+ struct ps_data *ps;
+ bool indicate_tim = false;
+ u8 ignore_for_tim = sta->sta.uapsd_queues;
+ int ac;
+ u16 id;
+
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (WARN_ON_ONCE(!sta->sdata->bss))
+ return;
+
+ ps = &sta->sdata->bss->ps;
+ id = sta->sta.aid;
+#ifdef CONFIG_MAC80211_MESH
+ } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
+ ps = &sta->sdata->u.mesh.ps;
+ /* TIM map only for 1 <= PLID <= IEEE80211_MAX_AID */
+ id = sta->plid % (IEEE80211_MAX_AID + 1);
+#endif
+ } else {
+ return;
+ }
+
+ /* No need to do anything if the driver does all */
+ if (local->hw.flags & IEEE80211_HW_AP_LINK_PS)
+ return;
+
+ if (sta->dead)
+ goto done;
+
+ /*
+ * If all ACs are delivery-enabled then we should build
+ * the TIM bit for all ACs anyway; if only some are then
+ * we ignore those and build the TIM bit using only the
+ * non-enabled ones.
+ */
+ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1)
+ ignore_for_tim = 0;
+
+ if (ignore_pending)
+ ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ unsigned long tids;
+
+ if (ignore_for_tim & BIT(ac))
+ continue;
+
+ indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
+ !skb_queue_empty(&sta->ps_tx_buf[ac]);
+ if (indicate_tim)
+ break;
+
+ tids = ieee80211_tids_for_ac(ac);
+
+ indicate_tim |=
+ sta->driver_buffered_tids & tids;
+ indicate_tim |=
+ sta->txq_buffered_tids & tids;
+ }
+
+ done:
+ spin_lock_bh(&local->tim_lock);
+
+ if (indicate_tim == __bss_tim_get(ps->tim, id))
+ goto out_unlock;
+
+ if (indicate_tim)
+ __bss_tim_set(ps->tim, id);
+ else
+ __bss_tim_clear(ps->tim, id);
+
+ if (local->ops->set_tim && !WARN_ON(sta->dead)) {
+ local->tim_in_locked_section = true;
+ drv_set_tim(local, &sta->sta, indicate_tim);
+ local->tim_in_locked_section = false;
+ }
+
+out_unlock:
+ spin_unlock_bh(&local->tim_lock);
+}
+
+void sta_info_recalc_tim(struct sta_info *sta)
+{
+ __sta_info_recalc_tim(sta, false);
+}
+
+static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info;
+ int timeout;
+
+ if (!skb)
+ return false;
+
+ info = IEEE80211_SKB_CB(skb);
+
+ /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */
+ timeout = (sta->listen_interval *
+ sta->sdata->vif.bss_conf.beacon_int *
+ 32 / 15625) * HZ;
+ if (timeout < STA_TX_BUFFER_EXPIRE)
+ timeout = STA_TX_BUFFER_EXPIRE;
+ return time_after(jiffies, info->control.jiffies + timeout);
+}
+
+
+static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local,
+ struct sta_info *sta, int ac)
+{
+ unsigned long flags;
+ struct sk_buff *skb;
+
+ /*
+ * First check for frames that should expire on the filtered
+ * queue. Frames here were rejected by the driver and are on
+ * a separate queue to avoid reordering with normal PS-buffered
+ * frames. They also aren't accounted for right now in the
+ * total_ps_buffered counter.
+ */
+ for (;;) {
+ spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
+ skb = skb_peek(&sta->tx_filtered[ac]);
+ if (sta_info_buffer_expired(sta, skb))
+ skb = __skb_dequeue(&sta->tx_filtered[ac]);
+ else
+ skb = NULL;
+ spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);
+
+ /*
+ * Frames are queued in order, so if this one
+ * hasn't expired yet we can stop testing. If
+ * we actually reached the end of the queue we
+ * also need to stop, of course.
+ */
+ if (!skb)
+ break;
+ ieee80211_free_txskb(&local->hw, skb);
+ }
+
+ /*
+ * Now also check the normal PS-buffered queue, this will
+ * only find something if the filtered queue was emptied
+ * since the filtered frames are all before the normal PS
+ * buffered frames.
+ */
+ for (;;) {
+ spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
+ skb = skb_peek(&sta->ps_tx_buf[ac]);
+ if (sta_info_buffer_expired(sta, skb))
+ skb = __skb_dequeue(&sta->ps_tx_buf[ac]);
+ else
+ skb = NULL;
+ spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);
+
+ /*
+ * frames are queued in order, so if this one
+ * hasn't expired yet (or we reached the end of
+ * the queue) we can stop testing
+ */
+ if (!skb)
+ break;
+
+ local->total_ps_buffered--;
+ ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n",
+ sta->sta.addr);
+ ieee80211_free_txskb(&local->hw, skb);
+ }
+
+ /*
+ * Finally, recalculate the TIM bit for this station -- it might
+ * now be clear because the station was too slow to retrieve its
+ * frames.
+ */
+ sta_info_recalc_tim(sta);
+
+ /*
+ * Return whether there are any frames still buffered, this is
+ * used to check whether the cleanup timer still needs to run,
+ * if there are no frames we don't need to rearm the timer.
+ */
+ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) &&
+ skb_queue_empty(&sta->tx_filtered[ac]));
+}
+
+static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
+ struct sta_info *sta)
+{
+ bool have_buffered = false;
+ int ac;
+
+ /* This is only necessary for stations on BSS/MBSS interfaces */
+ if (!sta->sdata->bss &&
+ !ieee80211_vif_is_mesh(&sta->sdata->vif))
+ return false;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ have_buffered |=
+ sta_info_cleanup_expire_buffered_ac(local, sta, ac);
+
+ return have_buffered;
+}
+
+static int __must_check __sta_info_destroy_part1(struct sta_info *sta)
+{
+ struct ieee80211_local *local;
+ struct ieee80211_sub_if_data *sdata;
+ int ret;
+
+ might_sleep();
+
+ if (!sta)
+ return -ENOENT;
+
+ local = sta->local;
+ sdata = sta->sdata;
+
+ lockdep_assert_held(&local->sta_mtx);
+
+ /*
+ * Before removing the station from the driver and
+ * rate control, it might still start new aggregation
+ * sessions -- block that to make sure the tear-down
+ * will be sufficient.
+ */
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA);
+
+ ret = sta_info_hash_del(local, sta);
+ if (WARN_ON(ret))
+ return ret;
+
+ /*
+ * for TDLS peers, make sure to return to the base channel before
+ * removal.
+ */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) {
+ drv_tdls_cancel_channel_switch(local, sdata, &sta->sta);
+ clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL);
+ }
+
+ list_del_rcu(&sta->list);
+
+ drv_sta_pre_rcu_remove(local, sta->sdata, sta);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ rcu_access_pointer(sdata->u.vlan.sta) == sta)
+ RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
+
+ return 0;
+}
+
+static void __sta_info_destroy_part2(struct sta_info *sta)
+{
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct station_info sinfo = {};
+ int ret;
+
+ /*
+ * NOTE: This assumes at least synchronize_net() was done
+ * after _part1 and before _part2!
+ */
+
+ might_sleep();
+ lockdep_assert_held(&local->sta_mtx);
+
+ /* now keys can no longer be reached */
+ ieee80211_free_sta_keys(local, sta);
+
+ /* disable TIM bit - last chance to tell driver */
+ __sta_info_recalc_tim(sta, true);
+
+ sta->dead = true;
+
+ local->num_sta--;
+ local->sta_generation++;
+
+ while (sta->sta_state > IEEE80211_STA_NONE) {
+ ret = sta_info_move_state(sta, sta->sta_state - 1);
+ if (ret) {
+ WARN_ON_ONCE(1);
+ break;
+ }
+ }
+
+ if (sta->uploaded) {
+ ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE,
+ IEEE80211_STA_NOTEXIST);
+ WARN_ON_ONCE(ret != 0);
+ }
+
+ sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr);
+
+ sta_set_sinfo(sta, &sinfo);
+ cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+
+ rate_control_remove_sta_debugfs(sta);
+ ieee80211_sta_debugfs_remove(sta);
+ ieee80211_recalc_min_chandef(sdata);
+
+ cleanup_single_sta(sta);
+}
+
+int __must_check __sta_info_destroy(struct sta_info *sta)
+{
+ int err = __sta_info_destroy_part1(sta);
+
+ if (err)
+ return err;
+
+ synchronize_net();
+
+ __sta_info_destroy_part2(sta);
+
+ return 0;
+}
+
+int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+ struct sta_info *sta;
+ int ret;
+
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get(sdata, addr);
+ ret = __sta_info_destroy(sta);
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ return ret;
+}
+
+int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr)
+{
+ struct sta_info *sta;
+ int ret;
+
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get_bss(sdata, addr);
+ ret = __sta_info_destroy(sta);
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ return ret;
+}
+
+static void sta_info_cleanup(unsigned long data)
+{
+ struct ieee80211_local *local = (struct ieee80211_local *) data;
+ struct sta_info *sta;
+ bool timer_needed = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list, list)
+ if (sta_info_cleanup_expire_buffered(local, sta))
+ timer_needed = true;
+ rcu_read_unlock();
+
+ if (local->quiescing)
+ return;
+
+ if (!timer_needed)
+ return;
+
+ mod_timer(&local->sta_cleanup,
+ round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
+}
+
+u32 sta_addr_hash(const void *key, u32 length, u32 seed)
+{
+ return jhash(key, ETH_ALEN, seed);
+}
+
+int sta_info_init(struct ieee80211_local *local)
+{
+ int err;
+
+ err = rhashtable_init(&local->sta_hash, &sta_rht_params);
+ if (err)
+ return err;
+
+ spin_lock_init(&local->tim_lock);
+ mutex_init(&local->sta_mtx);
+ INIT_LIST_HEAD(&local->sta_list);
+
+ setup_timer(&local->sta_cleanup, sta_info_cleanup,
+ (unsigned long)local);
+ return 0;
+}
+
+void sta_info_stop(struct ieee80211_local *local)
+{
+ del_timer_sync(&local->sta_cleanup);
+ rhashtable_destroy(&local->sta_hash);
+}
+
+
+int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta, *tmp;
+ LIST_HEAD(free_list);
+ int ret = 0;
+
+ might_sleep();
+
+ WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP);
+ WARN_ON(vlans && !sdata->bss);
+
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+ if (sdata == sta->sdata ||
+ (vlans && sdata->bss == sta->sdata->bss)) {
+ if (!WARN_ON(__sta_info_destroy_part1(sta)))
+ list_add(&sta->free_list, &free_list);
+ ret++;
+ }
+ }
+
+ if (!list_empty(&free_list)) {
+ synchronize_net();
+ list_for_each_entry_safe(sta, tmp, &free_list, free_list)
+ __sta_info_destroy_part2(sta);
+ }
+ mutex_unlock(&local->sta_mtx);
+
+ return ret;
+}
+
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+ unsigned long exp_time)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta, *tmp;
+
+ mutex_lock(&local->sta_mtx);
+
+ list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+ if (sdata != sta->sdata)
+ continue;
+
+ if (time_after(jiffies, sta->last_rx + exp_time)) {
+ sta_dbg(sta->sdata, "expiring inactive STA %pM\n",
+ sta->sta.addr);
+
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ test_sta_flag(sta, WLAN_STA_PS_STA))
+ atomic_dec(&sdata->u.mesh.ps.num_sta_ps);
+
+ WARN_ON(__sta_info_destroy(sta));
+ }
+ }
+
+ mutex_unlock(&local->sta_mtx);
+}
+
+struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
+ const u8 *addr,
+ const u8 *localaddr)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct sta_info *sta;
+ struct rhash_head *tmp;
+ const struct bucket_table *tbl;
+
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
+
+ /*
+ * Just return a random station if localaddr is NULL
+ * ... first in list.
+ */
+ for_each_sta_info(local, tbl, addr, sta, tmp) {
+ if (localaddr &&
+ !ether_addr_equal(sta->sdata->vif.addr, localaddr))
+ continue;
+ if (!sta->uploaded)
+ return NULL;
+ return &sta->sta;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
+
+struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
+ const u8 *addr)
+{
+ struct sta_info *sta;
+
+ if (!vif)
+ return NULL;
+
+ sta = sta_info_get_bss(vif_to_sdata(vif), addr);
+ if (!sta)
+ return NULL;
+
+ if (!sta->uploaded)
+ return NULL;
+
+ return &sta->sta;
+}
+EXPORT_SYMBOL(ieee80211_find_sta);
+
+/* powersave support code */
+void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff_head pending;
+ int filtered = 0, buffered = 0, ac, i;
+ unsigned long flags;
+ struct ps_data *ps;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+ u.ap);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ ps = &sdata->bss->ps;
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ ps = &sdata->u.mesh.ps;
+ else
+ return;
+
+ clear_sta_flag(sta, WLAN_STA_SP);
+
+ BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1);
+ sta->driver_buffered_tids = 0;
+ sta->txq_buffered_tids = 0;
+
+ if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
+ drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
+
+ if (sta->sta.txq[0]) {
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
+
+ if (!skb_queue_len(&txqi->queue))
+ continue;
+
+ drv_wake_tx_queue(local, txqi);
+ }
+ }
+
+ skb_queue_head_init(&pending);
+
+ /* sync with ieee80211_tx_h_unicast_ps_buf */
+ spin_lock(&sta->ps_lock);
+ /* Send all buffered frames to the station */
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ int count = skb_queue_len(&pending), tmp;
+
+ spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
+ skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending);
+ spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);
+ tmp = skb_queue_len(&pending);
+ filtered += tmp - count;
+ count = tmp;
+
+ spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
+ skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending);
+ spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);
+ tmp = skb_queue_len(&pending);
+ buffered += tmp - count;
+ }
+
+ ieee80211_add_pending_skbs(local, &pending);
+
+ /* now we're no longer in the deliver code */
+ clear_sta_flag(sta, WLAN_STA_PS_DELIVER);
+
+ /* The station might have polled and then woken up before we responded,
+ * so clear these flags now to avoid them sticking around.
+ */
+ clear_sta_flag(sta, WLAN_STA_PSPOLL);
+ clear_sta_flag(sta, WLAN_STA_UAPSD);
+ spin_unlock(&sta->ps_lock);
+
+ atomic_dec(&ps->num_sta_ps);
+
+ /* This station just woke up and isn't aware of our SMPS state */
+ if (!ieee80211_vif_is_mesh(&sdata->vif) &&
+ !ieee80211_smps_is_restrictive(sta->known_smps_mode,
+ sdata->smps_mode) &&
+ sta->known_smps_mode != sdata->bss->req_smps &&
+ sta_info_tx_streams(sta) != 1) {
+ ht_dbg(sdata,
+ "%pM just woke up and MIMO capable - update SMPS\n",
+ sta->sta.addr);
+ ieee80211_send_smps_action(sdata, sdata->bss->req_smps,
+ sta->sta.addr,
+ sdata->vif.bss_conf.bssid);
+ }
+
+ local->total_ps_buffered -= buffered;
+
+ sta_info_recalc_tim(sta);
+
+ ps_dbg(sdata,
+ "STA %pM aid %d sending %d filtered/%d PS frames since STA not sleeping anymore\n",
+ sta->sta.addr, sta->sta.aid, filtered, buffered);
+}
+
+static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, int tid,
+ enum ieee80211_frame_release_type reason,
+ bool call_driver)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_qos_hdr *nullfunc;
+ struct sk_buff *skb;
+ int size = sizeof(*nullfunc);
+ __le16 fc;
+ bool qos = sta->sta.wme;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ if (qos) {
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
+ IEEE80211_STYPE_QOS_NULLFUNC |
+ IEEE80211_FCTL_FROMDS);
+ } else {
+ size -= 2;
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
+ IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_FROMDS);
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + size);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (void *) skb_put(skb, size);
+ nullfunc->frame_control = fc;
+ nullfunc->duration_id = 0;
+ memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN);
+ memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN);
+ nullfunc->seq_ctrl = 0;
+
+ skb->priority = tid;
+ skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]);
+ if (qos) {
+ nullfunc->qos_ctrl = cpu_to_le16(tid);
+
+ if (reason == IEEE80211_FRAME_RELEASE_UAPSD)
+ nullfunc->qos_ctrl |=
+ cpu_to_le16(IEEE80211_QOS_CTL_EOSP);
+ }
+
+ info = IEEE80211_SKB_CB(skb);
+
+ /*
+ * Tell TX path to send this frame even though the
+ * STA may still remain is PS mode after this frame
+ * exchange. Also set EOSP to indicate this packet
+ * ends the poll/service period.
+ */
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER |
+ IEEE80211_TX_STATUS_EOSP |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE;
+
+ if (call_driver)
+ drv_allow_buffered_frames(local, sta, BIT(tid), 1,
+ reason, false);
+
+ skb->dev = sdata->dev;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (WARN_ON(!chanctx_conf)) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return;
+ }
+
+ info->band = chanctx_conf->def.chan->band;
+ ieee80211_xmit(sdata, sta, skb);
+ rcu_read_unlock();
+}
+
+static int find_highest_prio_tid(unsigned long tids)
+{
+ /* lower 3 TIDs aren't ordered perfectly */
+ if (tids & 0xF8)
+ return fls(tids) - 1;
+ /* TID 0 is BE just like TID 3 */
+ if (tids & BIT(0))
+ return 0;
+ return fls(tids) - 1;
+}
+
+static void
+ieee80211_sta_ps_deliver_response(struct sta_info *sta,
+ int n_frames, u8 ignored_acs,
+ enum ieee80211_frame_release_type reason)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ bool more_data = false;
+ int ac;
+ unsigned long driver_release_tids = 0;
+ struct sk_buff_head frames;
+
+ /* Service or PS-Poll period starts */
+ set_sta_flag(sta, WLAN_STA_SP);
+
+ __skb_queue_head_init(&frames);
+
+ /* Get response frame(s) and more data bit for the last one. */
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ unsigned long tids;
+
+ if (ignored_acs & BIT(ac))
+ continue;
+
+ tids = ieee80211_tids_for_ac(ac);
+
+ /* if we already have frames from software, then we can't also
+ * release from hardware queues
+ */
+ if (skb_queue_empty(&frames)) {
+ driver_release_tids |= sta->driver_buffered_tids & tids;
+ driver_release_tids |= sta->txq_buffered_tids & tids;
+ }
+
+ if (driver_release_tids) {
+ /* If the driver has data on more than one TID then
+ * certainly there's more data if we release just a
+ * single frame now (from a single TID). This will
+ * only happen for PS-Poll.
+ */
+ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL &&
+ hweight16(driver_release_tids) > 1) {
+ more_data = true;
+ driver_release_tids =
+ BIT(find_highest_prio_tid(
+ driver_release_tids));
+ break;
+ }
+ } else {
+ struct sk_buff *skb;
+
+ while (n_frames > 0) {
+ skb = skb_dequeue(&sta->tx_filtered[ac]);
+ if (!skb) {
+ skb = skb_dequeue(
+ &sta->ps_tx_buf[ac]);
+ if (skb)
+ local->total_ps_buffered--;
+ }
+ if (!skb)
+ break;
+ n_frames--;
+ __skb_queue_tail(&frames, skb);
+ }
+ }
+
+ /* If we have more frames buffered on this AC, then set the
+ * more-data bit and abort the loop since we can't send more
+ * data from other ACs before the buffered frames from this.
+ */
+ if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
+ !skb_queue_empty(&sta->ps_tx_buf[ac])) {
+ more_data = true;
+ break;
+ }
+ }
+
+ if (skb_queue_empty(&frames) && !driver_release_tids) {
+ int tid;
+
+ /*
+ * For PS-Poll, this can only happen due to a race condition
+ * when we set the TIM bit and the station notices it, but
+ * before it can poll for the frame we expire it.
+ *
+ * For uAPSD, this is said in the standard (11.2.1.5 h):
+ * At each unscheduled SP for a non-AP STA, the AP shall
+ * attempt to transmit at least one MSDU or MMPDU, but no
+ * more than the value specified in the Max SP Length field
+ * in the QoS Capability element from delivery-enabled ACs,
+ * that are destined for the non-AP STA.
+ *
+ * Since we have no other MSDU/MMPDU, transmit a QoS null frame.
+ */
+
+ /* This will evaluate to 1, 3, 5 or 7. */
+ tid = 7 - ((ffs(~ignored_acs) - 1) << 1);
+
+ ieee80211_send_null_response(sdata, sta, tid, reason, true);
+ } else if (!driver_release_tids) {
+ struct sk_buff_head pending;
+ struct sk_buff *skb;
+ int num = 0;
+ u16 tids = 0;
+ bool need_null = false;
+
+ skb_queue_head_init(&pending);
+
+ while ((skb = __skb_dequeue(&frames))) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *) skb->data;
+ u8 *qoshdr = NULL;
+
+ num++;
+
+ /*
+ * Tell TX path to send this frame even though the
+ * STA may still remain is PS mode after this frame
+ * exchange.
+ */
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
+ info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE;
+
+ /*
+ * Use MoreData flag to indicate whether there are
+ * more buffered frames for this STA
+ */
+ if (more_data || !skb_queue_empty(&frames))
+ hdr->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ else
+ hdr->frame_control &=
+ cpu_to_le16(~IEEE80211_FCTL_MOREDATA);
+
+ if (ieee80211_is_data_qos(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control))
+ qoshdr = ieee80211_get_qos_ctl(hdr);
+
+ tids |= BIT(skb->priority);
+
+ __skb_queue_tail(&pending, skb);
+
+ /* end service period after last frame or add one */
+ if (!skb_queue_empty(&frames))
+ continue;
+
+ if (reason != IEEE80211_FRAME_RELEASE_UAPSD) {
+ /* for PS-Poll, there's only one frame */
+ info->flags |= IEEE80211_TX_STATUS_EOSP |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+ break;
+ }
+
+ /* For uAPSD, things are a bit more complicated. If the
+ * last frame has a QoS header (i.e. is a QoS-data or
+ * QoS-nulldata frame) then just set the EOSP bit there
+ * and be done.
+ * If the frame doesn't have a QoS header (which means
+ * it should be a bufferable MMPDU) then we can't set
+ * the EOSP bit in the QoS header; add a QoS-nulldata
+ * frame to the list to send it after the MMPDU.
+ *
+ * Note that this code is only in the mac80211-release
+ * code path, we assume that the driver will not buffer
+ * anything but QoS-data frames, or if it does, will
+ * create the QoS-nulldata frame by itself if needed.
+ *
+ * Cf. 802.11-2012 10.2.1.10 (c).
+ */
+ if (qoshdr) {
+ *qoshdr |= IEEE80211_QOS_CTL_EOSP;
+
+ info->flags |= IEEE80211_TX_STATUS_EOSP |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+ } else {
+ /* The standard isn't completely clear on this
+ * as it says the more-data bit should be set
+ * if there are more BUs. The QoS-Null frame
+ * we're about to send isn't buffered yet, we
+ * only create it below, but let's pretend it
+ * was buffered just in case some clients only
+ * expect more-data=0 when eosp=1.
+ */
+ hdr->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ need_null = true;
+ num++;
+ }
+ break;
+ }
+
+ drv_allow_buffered_frames(local, sta, tids, num,
+ reason, more_data);
+
+ ieee80211_add_pending_skbs(local, &pending);
+
+ if (need_null)
+ ieee80211_send_null_response(
+ sdata, sta, find_highest_prio_tid(tids),
+ reason, false);
+
+ sta_info_recalc_tim(sta);
+ } else {
+ unsigned long tids = sta->txq_buffered_tids & driver_release_tids;
+ int tid;
+
+ /*
+ * We need to release a frame that is buffered somewhere in the
+ * driver ... it'll have to handle that.
+ * Note that the driver also has to check the number of frames
+ * on the TIDs we're releasing from - if there are more than
+ * n_frames it has to set the more-data bit (if we didn't ask
+ * it to set it anyway due to other buffered frames); if there
+ * are fewer than n_frames it has to make sure to adjust that
+ * to allow the service period to end properly.
+ */
+ drv_release_buffered_frames(local, sta, driver_release_tids,
+ n_frames, reason, more_data);
+
+ /*
+ * Note that we don't recalculate the TIM bit here as it would
+ * most likely have no effect at all unless the driver told us
+ * that the TID(s) became empty before returning here from the
+ * release function.
+ * Either way, however, when the driver tells us that the TID(s)
+ * became empty or we find that a txq became empty, we'll do the
+ * TIM recalculation.
+ */
+
+ if (!sta->sta.txq[0])
+ return;
+
+ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
+ struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]);
+
+ if (!(tids & BIT(tid)) || skb_queue_len(&txqi->queue))
+ continue;
+
+ sta_info_recalc_tim(sta);
+ break;
+ }
+ }
+}
+
+void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
+{
+ u8 ignore_for_response = sta->sta.uapsd_queues;
+
+ /*
+ * If all ACs are delivery-enabled then we should reply
+ * from any of them, if only some are enabled we reply
+ * only from the non-enabled ones.
+ */
+ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1)
+ ignore_for_response = 0;
+
+ ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response,
+ IEEE80211_FRAME_RELEASE_PSPOLL);
+}
+
+void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta)
+{
+ int n_frames = sta->sta.max_sp;
+ u8 delivery_enabled = sta->sta.uapsd_queues;
+
+ /*
+ * If we ever grow support for TSPEC this might happen if
+ * the TSPEC update from hostapd comes in between a trigger
+ * frame setting WLAN_STA_UAPSD in the RX path and this
+ * actually getting called.
+ */
+ if (!delivery_enabled)
+ return;
+
+ switch (sta->sta.max_sp) {
+ case 1:
+ n_frames = 2;
+ break;
+ case 2:
+ n_frames = 4;
+ break;
+ case 3:
+ n_frames = 6;
+ break;
+ case 0:
+ /* XXX: what is a good value? */
+ n_frames = 128;
+ break;
+ }
+
+ ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled,
+ IEEE80211_FRAME_RELEASE_UAPSD);
+}
+
+void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
+ struct ieee80211_sta *pubsta, bool block)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+ trace_api_sta_block_awake(sta->local, pubsta, block);
+
+ if (block) {
+ set_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ return;
+ }
+
+ if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER))
+ return;
+
+ if (!test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ set_sta_flag(sta, WLAN_STA_PS_DELIVER);
+ clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ ieee80211_queue_work(hw, &sta->drv_deliver_wk);
+ } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) ||
+ test_sta_flag(sta, WLAN_STA_UAPSD)) {
+ /* must be asleep in this case */
+ clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ ieee80211_queue_work(hw, &sta->drv_deliver_wk);
+ } else {
+ clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
+ }
+}
+EXPORT_SYMBOL(ieee80211_sta_block_awake);
+
+void ieee80211_sta_eosp(struct ieee80211_sta *pubsta)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_local *local = sta->local;
+
+ trace_api_eosp(local, pubsta);
+
+ clear_sta_flag(sta, WLAN_STA_SP);
+}
+EXPORT_SYMBOL(ieee80211_sta_eosp);
+
+void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
+ u8 tid, bool buffered)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+ if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
+ return;
+
+ trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered);
+
+ if (buffered)
+ set_bit(tid, &sta->driver_buffered_tids);
+ else
+ clear_bit(tid, &sta->driver_buffered_tids);
+
+ sta_info_recalc_tim(sta);
+}
+EXPORT_SYMBOL(ieee80211_sta_set_buffered);
+
+int sta_info_move_state(struct sta_info *sta,
+ enum ieee80211_sta_state new_state)
+{
+ might_sleep();
+
+ if (sta->sta_state == new_state)
+ return 0;
+
+ /* check allowed transitions first */
+
+ switch (new_state) {
+ case IEEE80211_STA_NONE:
+ if (sta->sta_state != IEEE80211_STA_AUTH)
+ return -EINVAL;
+ break;
+ case IEEE80211_STA_AUTH:
+ if (sta->sta_state != IEEE80211_STA_NONE &&
+ sta->sta_state != IEEE80211_STA_ASSOC)
+ return -EINVAL;
+ break;
+ case IEEE80211_STA_ASSOC:
+ if (sta->sta_state != IEEE80211_STA_AUTH &&
+ sta->sta_state != IEEE80211_STA_AUTHORIZED)
+ return -EINVAL;
+ break;
+ case IEEE80211_STA_AUTHORIZED:
+ if (sta->sta_state != IEEE80211_STA_ASSOC)
+ return -EINVAL;
+ break;
+ default:
+ WARN(1, "invalid state %d", new_state);
+ return -EINVAL;
+ }
+
+ sta_dbg(sta->sdata, "moving STA %pM to state %d\n",
+ sta->sta.addr, new_state);
+
+ /*
+ * notify the driver before the actual changes so it can
+ * fail the transition
+ */
+ if (test_sta_flag(sta, WLAN_STA_INSERTED)) {
+ int err = drv_sta_state(sta->local, sta->sdata, sta,
+ sta->sta_state, new_state);
+ if (err)
+ return err;
+ }
+
+ /* reflect the change in all state variables */
+
+ switch (new_state) {
+ case IEEE80211_STA_NONE:
+ if (sta->sta_state == IEEE80211_STA_AUTH)
+ clear_bit(WLAN_STA_AUTH, &sta->_flags);
+ break;
+ case IEEE80211_STA_AUTH:
+ if (sta->sta_state == IEEE80211_STA_NONE)
+ set_bit(WLAN_STA_AUTH, &sta->_flags);
+ else if (sta->sta_state == IEEE80211_STA_ASSOC)
+ clear_bit(WLAN_STA_ASSOC, &sta->_flags);
+ break;
+ case IEEE80211_STA_ASSOC:
+ if (sta->sta_state == IEEE80211_STA_AUTH) {
+ set_bit(WLAN_STA_ASSOC, &sta->_flags);
+ } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ !sta->sdata->u.vlan.sta))
+ atomic_dec(&sta->sdata->bss->num_mcast_sta);
+ clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
+ }
+ break;
+ case IEEE80211_STA_AUTHORIZED:
+ if (sta->sta_state == IEEE80211_STA_ASSOC) {
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ !sta->sdata->u.vlan.sta))
+ atomic_inc(&sta->sdata->bss->num_mcast_sta);
+ set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
+ }
+ break;
+ default:
+ break;
+ }
+
+ sta->sta_state = new_state;
+
+ return 0;
+}
+
+u8 sta_info_tx_streams(struct sta_info *sta)
+{
+ struct ieee80211_sta_ht_cap *ht_cap = &sta->sta.ht_cap;
+ u8 rx_streams;
+
+ if (!sta->sta.ht_cap.ht_supported)
+ return 1;
+
+ if (sta->sta.vht_cap.vht_supported) {
+ int i;
+ u16 tx_mcs_map =
+ le16_to_cpu(sta->sta.vht_cap.vht_mcs.tx_mcs_map);
+
+ for (i = 7; i >= 0; i--)
+ if ((tx_mcs_map & (0x3 << (i * 2))) !=
+ IEEE80211_VHT_MCS_NOT_SUPPORTED)
+ return i + 1;
+ }
+
+ if (ht_cap->mcs.rx_mask[3])
+ rx_streams = 4;
+ else if (ht_cap->mcs.rx_mask[2])
+ rx_streams = 3;
+ else if (ht_cap->mcs.rx_mask[1])
+ rx_streams = 2;
+ else
+ rx_streams = 1;
+
+ if (!(ht_cap->mcs.tx_params & IEEE80211_HT_MCS_TX_RX_DIFF))
+ return rx_streams;
+
+ return ((ht_cap->mcs.tx_params & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK)
+ >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
+}
+
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct rate_control_ref *ref = NULL;
+ struct timespec uptime;
+ u32 thr = 0;
+ int i, ac;
+
+ if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
+ ref = local->rate_ctrl;
+
+ sinfo->generation = sdata->local->sta_generation;
+
+ /* do before driver, so beacon filtering drivers have a
+ * chance to e.g. just add the number of filtered beacons
+ * (or just modify the value entirely, of course)
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sinfo->rx_beacon = sdata->u.mgd.count_beacon_signal;
+
+ drv_sta_statistics(local, sdata, &sta->sta, sinfo);
+
+ sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) |
+ BIT(NL80211_STA_INFO_STA_FLAGS) |
+ BIT(NL80211_STA_INFO_BSS_PARAM) |
+ BIT(NL80211_STA_INFO_CONNECTED_TIME) |
+ BIT(NL80211_STA_INFO_RX_DROP_MISC) |
+ BIT(NL80211_STA_INFO_BEACON_LOSS);
+
+ ktime_get_ts(&uptime);
+ sinfo->connected_time = uptime.tv_sec - sta->last_connected;
+ sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
+
+ if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) |
+ BIT(NL80211_STA_INFO_TX_BYTES)))) {
+ sinfo->tx_bytes = 0;
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ sinfo->tx_bytes += sta->tx_bytes[ac];
+ sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) {
+ sinfo->tx_packets = 0;
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ sinfo->tx_packets += sta->tx_packets[ac];
+ sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS);
+ }
+
+ if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) |
+ BIT(NL80211_STA_INFO_RX_BYTES)))) {
+ sinfo->rx_bytes = sta->rx_bytes;
+ sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) {
+ sinfo->rx_packets = sta->rx_packets;
+ sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) {
+ sinfo->tx_retries = sta->tx_retry_count;
+ sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) {
+ sinfo->tx_failed = sta->tx_retry_failed;
+ sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED);
+ }
+
+ sinfo->rx_dropped_misc = sta->rx_dropped;
+ sinfo->beacon_loss_count = sta->beacon_loss_count;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
+ sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) |
+ BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
+ sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
+ }
+
+ if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
+ (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) {
+ sinfo->signal = (s8)sta->last_signal;
+ sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
+ sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal);
+ sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG);
+ }
+ }
+
+ if (sta->chains &&
+ !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
+ BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
+ sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
+ BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+
+ sinfo->chains = sta->chains;
+ for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) {
+ sinfo->chain_signal[i] = sta->chain_signal_last[i];
+ sinfo->chain_signal_avg[i] =
+ (s8) -ewma_read(&sta->chain_signal_avg[i]);
+ }
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
+ sta_set_rate_info_tx(sta, &sta->last_tx_rate, &sinfo->txrate);
+ sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE);
+ }
+
+ if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
+ sta_set_rate_info_rx(sta, &sinfo->rxrate);
+ sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
+ }
+
+ sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
+ for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
+ struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
+ tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
+ tidstats->rx_msdu = sta->rx_msdu[i];
+ }
+
+ if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
+ tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
+ tidstats->tx_msdu = sta->tx_msdu[i];
+ }
+
+ if (!(tidstats->filled &
+ BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
+ local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ tidstats->filled |=
+ BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
+ tidstats->tx_msdu_retries = sta->tx_msdu_retries[i];
+ }
+
+ if (!(tidstats->filled &
+ BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
+ local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ tidstats->filled |=
+ BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
+ tidstats->tx_msdu_failed = sta->tx_msdu_failed[i];
+ }
+ }
+
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+#ifdef CONFIG_MAC80211_MESH
+ sinfo->filled |= BIT(NL80211_STA_INFO_LLID) |
+ BIT(NL80211_STA_INFO_PLID) |
+ BIT(NL80211_STA_INFO_PLINK_STATE) |
+ BIT(NL80211_STA_INFO_LOCAL_PM) |
+ BIT(NL80211_STA_INFO_PEER_PM) |
+ BIT(NL80211_STA_INFO_NONPEER_PM);
+
+ sinfo->llid = sta->llid;
+ sinfo->plid = sta->plid;
+ sinfo->plink_state = sta->plink_state;
+ if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
+ sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET);
+ sinfo->t_offset = sta->t_offset;
+ }
+ sinfo->local_pm = sta->local_pm;
+ sinfo->peer_pm = sta->peer_pm;
+ sinfo->nonpeer_pm = sta->nonpeer_pm;
+#endif
+ }
+
+ sinfo->bss_param.flags = 0;
+ if (sdata->vif.bss_conf.use_cts_prot)
+ sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT;
+ if (sdata->vif.bss_conf.use_short_preamble)
+ sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE;
+ if (sdata->vif.bss_conf.use_short_slot)
+ sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME;
+ sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period;
+ sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int;
+
+ sinfo->sta_flags.set = 0;
+ sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
+ BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
+ BIT(NL80211_STA_FLAG_WME) |
+ BIT(NL80211_STA_FLAG_MFP) |
+ BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED) |
+ BIT(NL80211_STA_FLAG_TDLS_PEER);
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED);
+ if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE);
+ if (sta->sta.wme)
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME);
+ if (test_sta_flag(sta, WLAN_STA_MFP))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP);
+ if (test_sta_flag(sta, WLAN_STA_AUTH))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED);
+ if (test_sta_flag(sta, WLAN_STA_ASSOC))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED);
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER);
+
+ /* check if the driver has a SW RC implementation */
+ if (ref && ref->ops->get_expected_throughput)
+ thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv);
+ else
+ thr = drv_get_expected_throughput(local, &sta->sta);
+
+ if (thr != 0) {
+ sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
+ sinfo->expected_throughput = thr;
+ }
+}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
new file mode 100644
index 000000000..5c164fb3f
--- /dev/null
+++ b/net/mac80211/sta_info.h
@@ -0,0 +1,641 @@
+/*
+ * Copyright 2002-2005, Devicescape Software, Inc.
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef STA_INFO_H
+#define STA_INFO_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <linux/workqueue.h>
+#include <linux/average.h>
+#include <linux/etherdevice.h>
+#include <linux/rhashtable.h>
+#include "key.h"
+
+/**
+ * enum ieee80211_sta_info_flags - Stations flags
+ *
+ * These flags are used with &struct sta_info's @flags member, but
+ * only indirectly with set_sta_flag() and friends.
+ *
+ * @WLAN_STA_AUTH: Station is authenticated.
+ * @WLAN_STA_ASSOC: Station is associated.
+ * @WLAN_STA_PS_STA: Station is in power-save mode
+ * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic.
+ * This bit is always checked so needs to be enabled for all stations
+ * when virtual port control is not in use.
+ * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble
+ * frames.
+ * @WLAN_STA_WDS: Station is one of our WDS peers.
+ * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the
+ * IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next
+ * frame to this station is transmitted.
+ * @WLAN_STA_MFP: Management frame protection is used with this STA.
+ * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX)
+ * during suspend/resume and station removal.
+ * @WLAN_STA_PS_DRIVER: driver requires keeping this station in
+ * power-save mode logically to flush frames that might still
+ * be in the queues
+ * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping
+ * station in power-save mode, reply when the driver unblocks.
+ * @WLAN_STA_TDLS_PEER: Station is a TDLS peer.
+ * @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct
+ * packets. This means the link is enabled.
+ * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this
+ * station.
+ * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching
+ * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this
+ * TDLS peer
+ * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was
+ * keeping station in power-save mode, reply when the driver
+ * unblocks the station.
+ * @WLAN_STA_SP: Station is in a service period, so don't try to
+ * reply to other uAPSD trigger frames or PS-Poll.
+ * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame.
+ * @WLAN_STA_INSERTED: This station is inserted into the hash table.
+ * @WLAN_STA_RATE_CONTROL: rate control was initialized for this station.
+ * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid.
+ * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period.
+ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP.
+ * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX
+ * until pending frames are delivered
+ */
+enum ieee80211_sta_info_flags {
+ WLAN_STA_AUTH,
+ WLAN_STA_ASSOC,
+ WLAN_STA_PS_STA,
+ WLAN_STA_AUTHORIZED,
+ WLAN_STA_SHORT_PREAMBLE,
+ WLAN_STA_WDS,
+ WLAN_STA_CLEAR_PS_FILT,
+ WLAN_STA_MFP,
+ WLAN_STA_BLOCK_BA,
+ WLAN_STA_PS_DRIVER,
+ WLAN_STA_PSPOLL,
+ WLAN_STA_TDLS_PEER,
+ WLAN_STA_TDLS_PEER_AUTH,
+ WLAN_STA_TDLS_INITIATOR,
+ WLAN_STA_TDLS_CHAN_SWITCH,
+ WLAN_STA_TDLS_OFF_CHANNEL,
+ WLAN_STA_UAPSD,
+ WLAN_STA_SP,
+ WLAN_STA_4ADDR_EVENT,
+ WLAN_STA_INSERTED,
+ WLAN_STA_RATE_CONTROL,
+ WLAN_STA_TOFFSET_KNOWN,
+ WLAN_STA_MPSP_OWNER,
+ WLAN_STA_MPSP_RECIPIENT,
+ WLAN_STA_PS_DELIVER,
+};
+
+#define ADDBA_RESP_INTERVAL HZ
+#define HT_AGG_MAX_RETRIES 15
+#define HT_AGG_BURST_RETRIES 3
+#define HT_AGG_RETRIES_PERIOD (15 * HZ)
+
+#define HT_AGG_STATE_DRV_READY 0
+#define HT_AGG_STATE_RESPONSE_RECEIVED 1
+#define HT_AGG_STATE_OPERATIONAL 2
+#define HT_AGG_STATE_STOPPING 3
+#define HT_AGG_STATE_WANT_START 4
+#define HT_AGG_STATE_WANT_STOP 5
+
+enum ieee80211_agg_stop_reason {
+ AGG_STOP_DECLINED,
+ AGG_STOP_LOCAL_REQUEST,
+ AGG_STOP_PEER_REQUEST,
+ AGG_STOP_DESTROY_STA,
+};
+
+/**
+ * struct tid_ampdu_tx - TID aggregation information (Tx).
+ *
+ * @rcu_head: rcu head for freeing structure
+ * @session_timer: check if we keep Tx-ing on the TID (by timeout value)
+ * @addba_resp_timer: timer for peer's response to addba request
+ * @pending: pending frames queue -- use sta's spinlock to protect
+ * @dialog_token: dialog token for aggregation session
+ * @timeout: session timeout value to be filled in ADDBA requests
+ * @state: session state (see above)
+ * @last_tx: jiffies of last tx activity
+ * @stop_initiator: initiator of a session stop
+ * @tx_stop: TX DelBA frame when stopping
+ * @buf_size: reorder buffer size at receiver
+ * @failed_bar_ssn: ssn of the last failed BAR tx attempt
+ * @bar_pending: BAR needs to be re-sent
+ *
+ * This structure's lifetime is managed by RCU, assignments to
+ * the array holding it must hold the aggregation mutex.
+ *
+ * The TX path can access it under RCU lock-free if, and
+ * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL
+ * set. Otherwise, the TX path must also acquire the spinlock
+ * and re-check the state, see comments in the tx code
+ * touching it.
+ */
+struct tid_ampdu_tx {
+ struct rcu_head rcu_head;
+ struct timer_list session_timer;
+ struct timer_list addba_resp_timer;
+ struct sk_buff_head pending;
+ unsigned long state;
+ unsigned long last_tx;
+ u16 timeout;
+ u8 dialog_token;
+ u8 stop_initiator;
+ bool tx_stop;
+ u8 buf_size;
+
+ u16 failed_bar_ssn;
+ bool bar_pending;
+};
+
+/**
+ * struct tid_ampdu_rx - TID aggregation information (Rx).
+ *
+ * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
+ * A-MSDU with individually reported subframes.
+ * @reorder_time: jiffies when skb was added
+ * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
+ * @reorder_timer: releases expired frames from the reorder buffer.
+ * @last_rx: jiffies of last rx activity
+ * @head_seq_num: head sequence number in reordering buffer.
+ * @stored_mpdu_num: number of MPDUs in reordering buffer
+ * @ssn: Starting Sequence Number expected to be aggregated.
+ * @buf_size: buffer size for incoming A-MPDUs
+ * @timeout: reset timer value (in TUs).
+ * @dialog_token: dialog token for aggregation session
+ * @rcu_head: RCU head used for freeing this struct
+ * @reorder_lock: serializes access to reorder buffer, see below.
+ * @auto_seq: used for offloaded BA sessions to automatically pick head_seq_and
+ * and ssn.
+ * @removed: this session is removed (but might have been found due to RCU)
+ *
+ * This structure's lifetime is managed by RCU, assignments to
+ * the array holding it must hold the aggregation mutex.
+ *
+ * The @reorder_lock is used to protect the members of this
+ * struct, except for @timeout, @buf_size and @dialog_token,
+ * which are constant across the lifetime of the struct (the
+ * dialog token being used only for debugging).
+ */
+struct tid_ampdu_rx {
+ struct rcu_head rcu_head;
+ spinlock_t reorder_lock;
+ struct sk_buff_head *reorder_buf;
+ unsigned long *reorder_time;
+ struct timer_list session_timer;
+ struct timer_list reorder_timer;
+ unsigned long last_rx;
+ u16 head_seq_num;
+ u16 stored_mpdu_num;
+ u16 ssn;
+ u16 buf_size;
+ u16 timeout;
+ u8 dialog_token;
+ bool auto_seq;
+ bool removed;
+};
+
+/**
+ * struct sta_ampdu_mlme - STA aggregation information.
+ *
+ * @tid_rx: aggregation info for Rx per TID -- RCU protected
+ * @tid_tx: aggregation info for Tx per TID
+ * @tid_start_tx: sessions where start was requested
+ * @addba_req_num: number of times addBA request has been sent.
+ * @last_addba_req_time: timestamp of the last addBA request.
+ * @dialog_token_allocator: dialog token enumerator for each new session;
+ * @work: work struct for starting/stopping aggregation
+ * @tid_rx_timer_expired: bitmap indicating on which TIDs the
+ * RX timer expired until the work for it runs
+ * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
+ * driver requested to close until the work for it runs
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ * to tid_tx[idx], which are protected by the sta spinlock)
+ * tid_start_tx is also protected by sta->lock.
+ */
+struct sta_ampdu_mlme {
+ struct mutex mtx;
+ /* rx */
+ struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
+ unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+ unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+ /* tx */
+ struct work_struct work;
+ struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
+ struct tid_ampdu_tx *tid_start_tx[IEEE80211_NUM_TIDS];
+ unsigned long last_addba_req_time[IEEE80211_NUM_TIDS];
+ u8 addba_req_num[IEEE80211_NUM_TIDS];
+ u8 dialog_token_allocator;
+};
+
+
+/* Value to indicate no TID reservation */
+#define IEEE80211_TID_UNRESERVED 0xff
+
+/**
+ * struct sta_info - STA information
+ *
+ * This structure collects information about a station that
+ * mac80211 is communicating with.
+ *
+ * @list: global linked list entry
+ * @free_list: list entry for keeping track of stations to free
+ * @hash_node: hash node for rhashtable
+ * @local: pointer to the global information
+ * @sdata: virtual interface this station belongs to
+ * @ptk: peer keys negotiated with this station, if any
+ * @ptk_idx: last installed peer key index
+ * @gtk: group keys negotiated with this station, if any
+ * @gtk_idx: last installed group key index
+ * @rate_ctrl: rate control algorithm reference
+ * @rate_ctrl_priv: rate control private per-STA pointer
+ * @last_tx_rate: rate used for last transmit, to report to userspace as
+ * "the" transmit rate
+ * @last_rx_rate_idx: rx status rate index of the last data packet
+ * @last_rx_rate_flag: rx status flag of the last data packet
+ * @last_rx_rate_vht_flag: rx status vht flag of the last data packet
+ * @last_rx_rate_vht_nss: rx status nss of last data packet
+ * @lock: used for locking all fields that require locking, see comments
+ * in the header file.
+ * @drv_deliver_wk: used for delivering frames after driver PS unblocking
+ * @listen_interval: listen interval of this station, when we're acting as AP
+ * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly
+ * @ps_lock: used for powersave (when mac80211 is the AP) related locking
+ * @ps_tx_buf: buffers (per AC) of frames to transmit to this station
+ * when it leaves power saving state or polls
+ * @tx_filtered: buffers (per AC) of frames we already tried to
+ * transmit but were filtered by hardware due to STA having
+ * entered power saving state, these are also delivered to
+ * the station when it leaves powersave or polls for frames
+ * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on
+ * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on
+ * @rx_packets: Number of MSDUs received from this STA
+ * @rx_bytes: Number of bytes received from this STA
+ * @last_rx: time (in jiffies) when last frame was received from this STA
+ * @last_connected: time (in seconds) when a station got connected
+ * @num_duplicates: number of duplicate frames received from this STA
+ * @rx_fragments: number of received MPDUs
+ * @rx_dropped: number of dropped MPDUs from this STA
+ * @last_signal: signal of last received frame from this STA
+ * @avg_signal: moving average of signal of received frames from this STA
+ * @last_ack_signal: signal of last received Ack frame from this STA
+ * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue)
+ * @tx_filtered_count: number of frames the hardware filtered for this STA
+ * @tx_retry_failed: number of frames that failed retry
+ * @tx_retry_count: total number of retries for frames to this STA
+ * @fail_avg: moving percentage of failed MSDUs
+ * @tx_packets: number of RX/TX MSDUs
+ * @tx_bytes: number of bytes transmitted to this STA
+ * @tx_fragments: number of transmitted MPDUs
+ * @tid_seq: per-TID sequence numbers for sending to this STA
+ * @ampdu_mlme: A-MPDU state machine state
+ * @timer_to_tid: identity mapping to ID timers
+ * @llid: Local link ID
+ * @plid: Peer link ID
+ * @reason: Cancel reason on PLINK_HOLDING state
+ * @plink_retries: Retries in establishment
+ * @plink_state: peer link state
+ * @plink_timeout: timeout of peer link
+ * @plink_timer: peer link watch timer
+ * @t_offset: timing offset relative to this host
+ * @t_offset_setpoint: reference timing offset of this sta to be used when
+ * calculating clockdrift
+ * @local_pm: local link-specific power save mode
+ * @peer_pm: peer-specific power save mode towards local STA
+ * @nonpeer_pm: STA power save mode towards non-peer neighbors
+ * @debugfs: debug filesystem info
+ * @dead: set to true when sta is unlinked
+ * @uploaded: set to true when sta is uploaded to the driver
+ * @lost_packets: number of consecutive lost packets
+ * @sta: station information we share with the driver
+ * @sta_state: duplicates information about station state (for debug)
+ * @beacon_loss_count: number of times beacon loss has triggered
+ * @rcu_head: RCU head used for freeing this station struct
+ * @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
+ * taken from HT/VHT capabilities or VHT operating mode notification
+ * @chains: chains ever used for RX from this station
+ * @chain_signal_last: last signal (per chain)
+ * @chain_signal_avg: signal average (per chain)
+ * @known_smps_mode: the smps_mode the client thinks we are in. Relevant for
+ * AP only.
+ * @cipher_scheme: optional cipher scheme for this station
+ * @last_tdls_pkt_time: holds the time in jiffies of last TDLS pkt ACKed
+ * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
+ * @tx_msdu: MSDUs transmitted to this station, using IEEE80211_NUM_TID
+ * entry for non-QoS frames
+ * @tx_msdu_retries: MSDU retries for transmissions to to this station,
+ * using IEEE80211_NUM_TID entry for non-QoS frames
+ * @tx_msdu_failed: MSDU failures for transmissions to to this station,
+ * using IEEE80211_NUM_TID entry for non-QoS frames
+ * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID
+ * entry for non-QoS frames
+ */
+struct sta_info {
+ /* General information, mostly static */
+ struct list_head list, free_list;
+ struct rcu_head rcu_head;
+ struct rhash_head hash_node;
+ struct ieee80211_local *local;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+ struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS];
+ u8 gtk_idx;
+ u8 ptk_idx;
+ struct rate_control_ref *rate_ctrl;
+ void *rate_ctrl_priv;
+ spinlock_t lock;
+
+ struct work_struct drv_deliver_wk;
+
+ u16 listen_interval;
+
+ bool dead;
+
+ bool uploaded;
+
+ enum ieee80211_sta_state sta_state;
+
+ /* use the accessors defined below */
+ unsigned long _flags;
+
+ /* STA powersave lock and frame queues */
+ spinlock_t ps_lock;
+ struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS];
+ struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS];
+ unsigned long driver_buffered_tids;
+ unsigned long txq_buffered_tids;
+
+ /* Updated from RX path only, no locking requirements */
+ unsigned long rx_packets;
+ u64 rx_bytes;
+ unsigned long last_rx;
+ long last_connected;
+ unsigned long num_duplicates;
+ unsigned long rx_fragments;
+ unsigned long rx_dropped;
+ int last_signal;
+ struct ewma avg_signal;
+ int last_ack_signal;
+
+ u8 chains;
+ s8 chain_signal_last[IEEE80211_MAX_CHAINS];
+ struct ewma chain_signal_avg[IEEE80211_MAX_CHAINS];
+
+ /* Plus 1 for non-QoS frames */
+ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1];
+
+ /* Updated from TX status path only, no locking requirements */
+ unsigned long tx_filtered_count;
+ unsigned long tx_retry_failed, tx_retry_count;
+ /* moving percentage of failed MSDUs */
+ unsigned int fail_avg;
+
+ /* Updated from TX path only, no locking requirements */
+ u32 tx_fragments;
+ u64 tx_packets[IEEE80211_NUM_ACS];
+ u64 tx_bytes[IEEE80211_NUM_ACS];
+ struct ieee80211_tx_rate last_tx_rate;
+ int last_rx_rate_idx;
+ u32 last_rx_rate_flag;
+ u32 last_rx_rate_vht_flag;
+ u8 last_rx_rate_vht_nss;
+ u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
+ u64 tx_msdu[IEEE80211_NUM_TIDS + 1];
+ u64 tx_msdu_retries[IEEE80211_NUM_TIDS + 1];
+ u64 tx_msdu_failed[IEEE80211_NUM_TIDS + 1];
+ u64 rx_msdu[IEEE80211_NUM_TIDS + 1];
+
+ /*
+ * Aggregation information, locked with lock.
+ */
+ struct sta_ampdu_mlme ampdu_mlme;
+ u8 timer_to_tid[IEEE80211_NUM_TIDS];
+
+#ifdef CONFIG_MAC80211_MESH
+ /*
+ * Mesh peer link attributes
+ * TODO: move to a sub-structure that is referenced with pointer?
+ */
+ u16 llid;
+ u16 plid;
+ u16 reason;
+ u8 plink_retries;
+ enum nl80211_plink_state plink_state;
+ u32 plink_timeout;
+ struct timer_list plink_timer;
+ s64 t_offset;
+ s64 t_offset_setpoint;
+ /* mesh power save */
+ enum nl80211_mesh_power_mode local_pm;
+ enum nl80211_mesh_power_mode peer_pm;
+ enum nl80211_mesh_power_mode nonpeer_pm;
+#endif
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ struct sta_info_debugfsdentries {
+ struct dentry *dir;
+ bool add_has_run;
+ } debugfs;
+#endif
+
+ enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
+
+ unsigned int lost_packets;
+ unsigned int beacon_loss_count;
+
+ enum ieee80211_smps_mode known_smps_mode;
+ const struct ieee80211_cipher_scheme *cipher_scheme;
+
+ /* TDLS timeout data */
+ unsigned long last_tdls_pkt_time;
+
+ u8 reserved_tid;
+
+ /* keep last! */
+ struct ieee80211_sta sta;
+};
+
+static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_MESH
+ return sta->plink_state;
+#endif
+ return NL80211_PLINK_LISTEN;
+}
+
+static inline void set_sta_flag(struct sta_info *sta,
+ enum ieee80211_sta_info_flags flag)
+{
+ WARN_ON(flag == WLAN_STA_AUTH ||
+ flag == WLAN_STA_ASSOC ||
+ flag == WLAN_STA_AUTHORIZED);
+ set_bit(flag, &sta->_flags);
+}
+
+static inline void clear_sta_flag(struct sta_info *sta,
+ enum ieee80211_sta_info_flags flag)
+{
+ WARN_ON(flag == WLAN_STA_AUTH ||
+ flag == WLAN_STA_ASSOC ||
+ flag == WLAN_STA_AUTHORIZED);
+ clear_bit(flag, &sta->_flags);
+}
+
+static inline int test_sta_flag(struct sta_info *sta,
+ enum ieee80211_sta_info_flags flag)
+{
+ return test_bit(flag, &sta->_flags);
+}
+
+static inline int test_and_clear_sta_flag(struct sta_info *sta,
+ enum ieee80211_sta_info_flags flag)
+{
+ WARN_ON(flag == WLAN_STA_AUTH ||
+ flag == WLAN_STA_ASSOC ||
+ flag == WLAN_STA_AUTHORIZED);
+ return test_and_clear_bit(flag, &sta->_flags);
+}
+
+static inline int test_and_set_sta_flag(struct sta_info *sta,
+ enum ieee80211_sta_info_flags flag)
+{
+ WARN_ON(flag == WLAN_STA_AUTH ||
+ flag == WLAN_STA_ASSOC ||
+ flag == WLAN_STA_AUTHORIZED);
+ return test_and_set_bit(flag, &sta->_flags);
+}
+
+int sta_info_move_state(struct sta_info *sta,
+ enum ieee80211_sta_state new_state);
+
+static inline void sta_info_pre_move_state(struct sta_info *sta,
+ enum ieee80211_sta_state new_state)
+{
+ int ret;
+
+ WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED));
+
+ ret = sta_info_move_state(sta, new_state);
+ WARN_ON_ONCE(ret);
+}
+
+
+void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
+ struct tid_ampdu_tx *tid_tx);
+
+static inline struct tid_ampdu_tx *
+rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
+{
+ return rcu_dereference_protected(sta->ampdu_mlme.tid_tx[tid],
+ lockdep_is_held(&sta->lock) ||
+ lockdep_is_held(&sta->ampdu_mlme.mtx));
+}
+
+/* Maximum number of frames to buffer per power saving station per AC */
+#define STA_MAX_TX_BUFFER 64
+
+/* Minimum buffered frame expiry time. If STA uses listen interval that is
+ * smaller than this value, the minimum value here is used instead. */
+#define STA_TX_BUFFER_EXPIRE (10 * HZ)
+
+/* How often station data is cleaned up (e.g., expiration of buffered frames)
+ */
+#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
+
+/*
+ * Get a STA info, must be under RCU read lock.
+ */
+struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr);
+
+struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr);
+
+u32 sta_addr_hash(const void *key, u32 length, u32 seed);
+
+#define _sta_bucket_idx(_tbl, _a) \
+ rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd))
+
+#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \
+ rht_for_each_entry_rcu(_sta, _tmp, tbl, \
+ _sta_bucket_idx(tbl, _addr), \
+ hash_node) \
+ /* compare address and run code only if it matches */ \
+ if (ether_addr_equal(_sta->sta.addr, (_addr)))
+
+/*
+ * Get STA info by index, BROKEN!
+ */
+struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
+ int idx);
+/*
+ * Create a new STA info, caller owns returned structure
+ * until sta_info_insert().
+ */
+struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr, gfp_t gfp);
+
+void sta_info_free(struct ieee80211_local *local, struct sta_info *sta);
+
+/*
+ * Insert STA info into hash table/list, returns zero or a
+ * -EEXIST if (if the same MAC address is already present).
+ *
+ * Calling the non-rcu version makes the caller relinquish,
+ * the _rcu version calls read_lock_rcu() and must be called
+ * without it held.
+ */
+int sta_info_insert(struct sta_info *sta);
+int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU);
+
+int __must_check __sta_info_destroy(struct sta_info *sta);
+int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr);
+int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
+ const u8 *addr);
+
+void sta_info_recalc_tim(struct sta_info *sta);
+
+int sta_info_init(struct ieee80211_local *local);
+void sta_info_stop(struct ieee80211_local *local);
+
+/**
+ * sta_info_flush - flush matching STA entries from the STA table
+ *
+ * Returns the number of removed STA entries.
+ *
+ * @sdata: sdata to remove all stations from
+ * @vlans: if the given interface is an AP interface, also flush VLANs
+ */
+int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans);
+
+static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
+{
+ return __sta_info_flush(sdata, false);
+}
+
+void sta_set_rate_info_tx(struct sta_info *sta,
+ const struct ieee80211_tx_rate *rate,
+ struct rate_info *rinfo);
+void sta_set_rate_info_rx(struct sta_info *sta,
+ struct rate_info *rinfo);
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
+
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+ unsigned long exp_time);
+u8 sta_info_tx_streams(struct sta_info *sta);
+
+void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
+void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
+void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);
+
+#endif /* STA_INFO_H */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
new file mode 100644
index 000000000..005fdbe39
--- /dev/null
+++ b/net/mac80211/status.c
@@ -0,0 +1,929 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "mesh.h"
+#include "led.h"
+#include "wme.h"
+
+
+void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int tmp;
+
+ skb->pkt_type = IEEE80211_TX_STATUS_MSG;
+ skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ?
+ &local->skb_queue : &local->skb_queue_unreliable, skb);
+ tmp = skb_queue_len(&local->skb_queue) +
+ skb_queue_len(&local->skb_queue_unreliable);
+ while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT &&
+ (skb = skb_dequeue(&local->skb_queue_unreliable))) {
+ ieee80211_free_txskb(hw, skb);
+ tmp--;
+ I802_DEBUG_INC(local->tx_status_drop);
+ }
+ tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_tx_status_irqsafe);
+
+static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ int ac;
+
+ /*
+ * This skb 'survived' a round-trip through the driver, and
+ * hopefully the driver didn't mangle it too badly. However,
+ * we can definitely not rely on the control information
+ * being correct. Clear it so we don't get junk there, and
+ * indicate that it needs new processing, but must not be
+ * modified/encrypted again.
+ */
+ memset(&info->control, 0, sizeof(info->control));
+
+ info->control.jiffies = jiffies;
+ info->control.vif = &sta->sdata->vif;
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
+ IEEE80211_TX_INTFL_RETRANSMISSION;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+
+ sta->tx_filtered_count++;
+
+ /*
+ * Clear more-data bit on filtered frames, it might be set
+ * but later frames might time out so it might have to be
+ * clear again ... It's all rather unlikely (this frame
+ * should time out first, right?) but let's not confuse
+ * peers unnecessarily.
+ */
+ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA))
+ hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+
+ if (ieee80211_is_data_qos(hdr->frame_control)) {
+ u8 *p = ieee80211_get_qos_ctl(hdr);
+ int tid = *p & IEEE80211_QOS_CTL_TID_MASK;
+
+ /*
+ * Clear EOSP if set, this could happen e.g.
+ * if an absence period (us being a P2P GO)
+ * shortens the SP.
+ */
+ if (*p & IEEE80211_QOS_CTL_EOSP)
+ *p &= ~IEEE80211_QOS_CTL_EOSP;
+ ac = ieee802_1d_to_ac[tid & 7];
+ } else {
+ ac = IEEE80211_AC_BE;
+ }
+
+ /*
+ * Clear the TX filter mask for this STA when sending the next
+ * packet. If the STA went to power save mode, this will happen
+ * when it wakes up for the next time.
+ */
+ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT);
+
+ /*
+ * This code races in the following way:
+ *
+ * (1) STA sends frame indicating it will go to sleep and does so
+ * (2) hardware/firmware adds STA to filter list, passes frame up
+ * (3) hardware/firmware processes TX fifo and suppresses a frame
+ * (4) we get TX status before having processed the frame and
+ * knowing that the STA has gone to sleep.
+ *
+ * This is actually quite unlikely even when both those events are
+ * processed from interrupts coming in quickly after one another or
+ * even at the same time because we queue both TX status events and
+ * RX frames to be processed by a tasklet and process them in the
+ * same order that they were received or TX status last. Hence, there
+ * is no race as long as the frame RX is processed before the next TX
+ * status, which drivers can ensure, see below.
+ *
+ * Note that this can only happen if the hardware or firmware can
+ * actually add STAs to the filter list, if this is done by the
+ * driver in response to set_tim() (which will only reduce the race
+ * this whole filtering tries to solve, not completely solve it)
+ * this situation cannot happen.
+ *
+ * To completely solve this race drivers need to make sure that they
+ * (a) don't mix the irq-safe/not irq-safe TX status/RX processing
+ * functions and
+ * (b) always process RX events before TX status events if ordering
+ * can be unknown, for example with different interrupt status
+ * bits.
+ * (c) if PS mode transitions are manual (i.e. the flag
+ * %IEEE80211_HW_AP_LINK_PS is set), always process PS state
+ * changes before calling TX status events if ordering can be
+ * unknown.
+ */
+ if (test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) {
+ skb_queue_tail(&sta->tx_filtered[ac], skb);
+ sta_info_recalc_tim(sta);
+
+ if (!timer_pending(&local->sta_cleanup))
+ mod_timer(&local->sta_cleanup,
+ round_jiffies(jiffies +
+ STA_INFO_CLEANUP_INTERVAL));
+ return;
+ }
+
+ if (!test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ !(info->flags & IEEE80211_TX_INTFL_RETRIED)) {
+ /* Software retry the packet once */
+ info->flags |= IEEE80211_TX_INTFL_RETRIED;
+ ieee80211_add_pending_skb(local, skb);
+ return;
+ }
+
+ ps_dbg_ratelimited(sta->sdata,
+ "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
+ skb_queue_len(&sta->tx_filtered[ac]),
+ !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies);
+ ieee80211_free_txskb(&local->hw, skb);
+}
+
+static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid)
+{
+ struct tid_ampdu_tx *tid_tx;
+
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (!tid_tx || !tid_tx->bar_pending)
+ return;
+
+ tid_tx->bar_pending = false;
+ ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn);
+}
+
+static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_mgmt *mgmt = (void *) skb->data;
+ struct ieee80211_local *local = sta->local;
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+ sta->last_rx = jiffies;
+
+ if (ieee80211_is_data_qos(mgmt->frame_control)) {
+ struct ieee80211_hdr *hdr = (void *) skb->data;
+ u8 *qc = ieee80211_get_qos_ctl(hdr);
+ u16 tid = qc[0] & 0xf;
+
+ ieee80211_check_pending_bar(sta, hdr->addr1, tid);
+ }
+
+ if (ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category == WLAN_CATEGORY_HT &&
+ mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS &&
+ ieee80211_sdata_running(sdata)) {
+ enum ieee80211_smps_mode smps_mode;
+
+ switch (mgmt->u.action.u.ht_smps.smps_control) {
+ case WLAN_HT_SMPS_CONTROL_DYNAMIC:
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ case WLAN_HT_SMPS_CONTROL_STATIC:
+ smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case WLAN_HT_SMPS_CONTROL_DISABLED:
+ default: /* shouldn't happen since we don't send that */
+ smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ /*
+ * This update looks racy, but isn't -- if we come
+ * here we've definitely got a station that we're
+ * talking to, and on a managed interface that can
+ * only be the AP. And the only other place updating
+ * this variable in managed mode is before association.
+ */
+ sdata->smps_mode = smps_mode;
+ ieee80211_queue_work(&local->hw, &sdata->recalc_smps);
+ } else if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ sta->known_smps_mode = smps_mode;
+ }
+ }
+}
+
+static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn)
+{
+ struct tid_ampdu_tx *tid_tx;
+
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (!tid_tx)
+ return;
+
+ tid_tx->failed_bar_ssn = ssn;
+ tid_tx->bar_pending = true;
+}
+
+static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
+{
+ int len = sizeof(struct ieee80211_radiotap_header);
+
+ /* IEEE80211_RADIOTAP_RATE rate */
+ if (info->status.rates[0].idx >= 0 &&
+ !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+ IEEE80211_TX_RC_VHT_MCS)))
+ len += 2;
+
+ /* IEEE80211_RADIOTAP_TX_FLAGS */
+ len += 2;
+
+ /* IEEE80211_RADIOTAP_DATA_RETRIES */
+ len += 1;
+
+ /* IEEE80211_RADIOTAP_MCS
+ * IEEE80211_RADIOTAP_VHT */
+ if (info->status.rates[0].idx >= 0) {
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS)
+ len += 3;
+ else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS)
+ len = ALIGN(len, 2) + 12;
+ }
+
+ return len;
+}
+
+static void
+ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sk_buff *skb, int retry_count,
+ int rtap_len, int shift)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_radiotap_header *rthdr;
+ unsigned char *pos;
+ u16 txflags;
+
+ rthdr = (struct ieee80211_radiotap_header *) skb_push(skb, rtap_len);
+
+ memset(rthdr, 0, rtap_len);
+ rthdr->it_len = cpu_to_le16(rtap_len);
+ rthdr->it_present =
+ cpu_to_le32((1 << IEEE80211_RADIOTAP_TX_FLAGS) |
+ (1 << IEEE80211_RADIOTAP_DATA_RETRIES));
+ pos = (unsigned char *)(rthdr + 1);
+
+ /*
+ * XXX: Once radiotap gets the bitmap reset thing the vendor
+ * extensions proposal contains, we can actually report
+ * the whole set of tries we did.
+ */
+
+ /* IEEE80211_RADIOTAP_RATE */
+ if (info->status.rates[0].idx >= 0 &&
+ !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+ IEEE80211_TX_RC_VHT_MCS))) {
+ u16 rate;
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
+ rate = sband->bitrates[info->status.rates[0].idx].bitrate;
+ *pos = DIV_ROUND_UP(rate, 5 * (1 << shift));
+ /* padding for tx flags */
+ pos += 2;
+ }
+
+ /* IEEE80211_RADIOTAP_TX_FLAGS */
+ txflags = 0;
+ if (!(info->flags & IEEE80211_TX_STAT_ACK) &&
+ !is_multicast_ether_addr(hdr->addr1))
+ txflags |= IEEE80211_RADIOTAP_F_TX_FAIL;
+
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)
+ txflags |= IEEE80211_RADIOTAP_F_TX_CTS;
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS)
+ txflags |= IEEE80211_RADIOTAP_F_TX_RTS;
+
+ put_unaligned_le16(txflags, pos);
+ pos += 2;
+
+ /* IEEE80211_RADIOTAP_DATA_RETRIES */
+ /* for now report the total retry_count */
+ *pos = retry_count;
+ pos++;
+
+ if (info->status.rates[0].idx < 0)
+ return;
+
+ /* IEEE80211_RADIOTAP_MCS
+ * IEEE80211_RADIOTAP_VHT */
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) {
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
+ pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS |
+ IEEE80211_RADIOTAP_MCS_HAVE_GI |
+ IEEE80211_RADIOTAP_MCS_HAVE_BW;
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI)
+ pos[1] |= IEEE80211_RADIOTAP_MCS_SGI;
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40;
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD)
+ pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF;
+ pos[2] = info->status.rates[0].idx;
+ pos += 3;
+ } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) {
+ u16 known = local->hw.radiotap_vht_details &
+ (IEEE80211_RADIOTAP_VHT_KNOWN_GI |
+ IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH);
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT);
+
+ /* required alignment from rthdr */
+ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2);
+
+ /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */
+ put_unaligned_le16(known, pos);
+ pos += 2;
+
+ /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI)
+ *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
+ pos++;
+
+ /* u8 bandwidth */
+ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ *pos = 1;
+ else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ *pos = 4;
+ else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
+ *pos = 11;
+ else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */
+ *pos = 0;
+ pos++;
+
+ /* u8 mcs_nss[4] */
+ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) |
+ ieee80211_rate_get_vht_nss(&info->status.rates[0]);
+ pos += 4;
+
+ /* u8 coding */
+ pos++;
+ /* u8 group_id */
+ pos++;
+ /* u16 partial_aid */
+ pos += 2;
+ }
+}
+
+/*
+ * Handles the tx for TDLS teardown frames.
+ * If the frame wasn't ACKed by the peer - it will be re-sent through the AP
+ */
+static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u32 flags)
+{
+ struct sk_buff *teardown_skb;
+ struct sk_buff *orig_teardown_skb;
+ bool is_teardown = false;
+
+ /* Get the teardown data we need and free the lock */
+ spin_lock(&sdata->u.mgd.teardown_lock);
+ teardown_skb = sdata->u.mgd.teardown_skb;
+ orig_teardown_skb = sdata->u.mgd.orig_teardown_skb;
+ if ((skb == orig_teardown_skb) && teardown_skb) {
+ sdata->u.mgd.teardown_skb = NULL;
+ sdata->u.mgd.orig_teardown_skb = NULL;
+ is_teardown = true;
+ }
+ spin_unlock(&sdata->u.mgd.teardown_lock);
+
+ if (is_teardown) {
+ /* This mechanism relies on being able to get ACKs */
+ WARN_ON(!(local->hw.flags &
+ IEEE80211_HW_REPORTS_TX_ACK_STATUS));
+
+ /* Check if peer has ACKed */
+ if (flags & IEEE80211_TX_STAT_ACK) {
+ dev_kfree_skb_any(teardown_skb);
+ } else {
+ tdls_dbg(sdata,
+ "TDLS Resending teardown through AP\n");
+
+ ieee80211_subif_start_xmit(teardown_skb, skb->dev);
+ }
+ }
+}
+
+static void ieee80211_report_used_skb(struct ieee80211_local *local,
+ struct sk_buff *skb, bool dropped)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ bool acked = info->flags & IEEE80211_TX_STAT_ACK;
+
+ if (dropped)
+ acked = false;
+
+ if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+ IEEE80211_TX_INTFL_MLME_CONN_TX)) {
+ struct ieee80211_sub_if_data *sdata = NULL;
+ struct ieee80211_sub_if_data *iter_sdata;
+ u64 cookie = (unsigned long)skb;
+
+ rcu_read_lock();
+
+ if (skb->dev) {
+ list_for_each_entry_rcu(iter_sdata, &local->interfaces,
+ list) {
+ if (!iter_sdata->dev)
+ continue;
+
+ if (skb->dev == iter_sdata->dev) {
+ sdata = iter_sdata;
+ break;
+ }
+ }
+ } else {
+ sdata = rcu_dereference(local->p2p_sdata);
+ }
+
+ if (!sdata) {
+ skb->dev = NULL;
+ } else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
+ unsigned int hdr_size =
+ ieee80211_hdrlen(hdr->frame_control);
+
+ /* Check to see if packet is a TDLS teardown packet */
+ if (ieee80211_is_data(hdr->frame_control) &&
+ (ieee80211_get_tdls_action(skb, hdr_size) ==
+ WLAN_TDLS_TEARDOWN))
+ ieee80211_tdls_td_tx_handle(local, sdata, skb,
+ info->flags);
+ else
+ ieee80211_mgd_conn_tx_status(sdata,
+ hdr->frame_control,
+ acked);
+ } else if (ieee80211_is_nullfunc(hdr->frame_control) ||
+ ieee80211_is_qos_nullfunc(hdr->frame_control)) {
+ cfg80211_probe_status(sdata->dev, hdr->addr1,
+ cookie, acked, GFP_ATOMIC);
+ } else {
+ cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data,
+ skb->len, acked, GFP_ATOMIC);
+ }
+
+ rcu_read_unlock();
+ }
+
+ if (unlikely(info->ack_frame_id)) {
+ struct sk_buff *ack_skb;
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->ack_status_lock, flags);
+ ack_skb = idr_find(&local->ack_status_frames,
+ info->ack_frame_id);
+ if (ack_skb)
+ idr_remove(&local->ack_status_frames,
+ info->ack_frame_id);
+ spin_unlock_irqrestore(&local->ack_status_lock, flags);
+
+ if (ack_skb) {
+ if (!dropped) {
+ /* consumes ack_skb */
+ skb_complete_wifi_ack(ack_skb, acked);
+ } else {
+ dev_kfree_skb_any(ack_skb);
+ }
+ }
+ }
+}
+
+/*
+ * Use a static threshold for now, best value to be determined
+ * by testing ...
+ * Should it depend on:
+ * - on # of retransmissions
+ * - current throughput (higher value for higher tpt)?
+ */
+#define STA_LOST_PKT_THRESHOLD 50
+#define STA_LOST_TDLS_PKT_THRESHOLD 10
+#define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */
+
+static void ieee80211_lost_packet(struct sta_info *sta,
+ struct ieee80211_tx_info *info)
+{
+ /* This packet was aggregated but doesn't carry status info */
+ if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
+ !(info->flags & IEEE80211_TX_STAT_AMPDU))
+ return;
+
+ sta->lost_packets++;
+ if (!sta->sta.tdls && sta->lost_packets < STA_LOST_PKT_THRESHOLD)
+ return;
+
+ /*
+ * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD
+ * of the last packets were lost, and that no ACK was received in the
+ * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss
+ * mechanism.
+ */
+ if (sta->sta.tdls &&
+ (sta->lost_packets < STA_LOST_TDLS_PKT_THRESHOLD ||
+ time_before(jiffies,
+ sta->last_tdls_pkt_time + STA_LOST_TDLS_PKT_TIME)))
+ return;
+
+ cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr,
+ sta->lost_packets, GFP_ATOMIC);
+ sta->lost_packets = 0;
+}
+
+static int ieee80211_tx_get_rates(struct ieee80211_hw *hw,
+ struct ieee80211_tx_info *info,
+ int *retry_count)
+{
+ int rates_idx = -1;
+ int count = -1;
+ int i;
+
+ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+ if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
+ !(info->flags & IEEE80211_TX_STAT_AMPDU)) {
+ /* just the first aggr frame carry status info */
+ info->status.rates[i].idx = -1;
+ info->status.rates[i].count = 0;
+ break;
+ } else if (info->status.rates[i].idx < 0) {
+ break;
+ } else if (i >= hw->max_report_rates) {
+ /* the HW cannot have attempted that rate */
+ info->status.rates[i].idx = -1;
+ info->status.rates[i].count = 0;
+ break;
+ }
+
+ count += info->status.rates[i].count;
+ }
+ rates_idx = i - 1;
+
+ if (count < 0)
+ count = 0;
+
+ *retry_count = count;
+ return rates_idx;
+}
+
+void ieee80211_tx_status_noskb(struct ieee80211_hw *hw,
+ struct ieee80211_sta *pubsta,
+ struct ieee80211_tx_info *info)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_supported_band *sband;
+ int retry_count;
+ int rates_idx;
+ bool acked, noack_success;
+
+ rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+
+ sband = hw->wiphy->bands[info->band];
+
+ acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+ noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED);
+
+ if (pubsta) {
+ struct sta_info *sta;
+
+ sta = container_of(pubsta, struct sta_info, sta);
+
+ if (!acked)
+ sta->tx_retry_failed++;
+ sta->tx_retry_count += retry_count;
+
+ if (acked) {
+ sta->last_rx = jiffies;
+
+ if (sta->lost_packets)
+ sta->lost_packets = 0;
+
+ /* Track when last TDLS packet was ACKed */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
+ sta->last_tdls_pkt_time = jiffies;
+ } else {
+ ieee80211_lost_packet(sta, info);
+ }
+
+ rate_control_tx_status_noskb(local, sband, sta, info);
+ }
+
+ if (acked || noack_success) {
+ local->dot11TransmittedFrameCount++;
+ if (!pubsta)
+ local->dot11MulticastTransmittedFrameCount++;
+ if (retry_count > 0)
+ local->dot11RetryCount++;
+ if (retry_count > 1)
+ local->dot11MultipleRetryCount++;
+ } else {
+ local->dot11FailedCount++;
+ }
+}
+EXPORT_SYMBOL(ieee80211_tx_status_noskb);
+
+void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+ struct sk_buff *skb2;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ __le16 fc;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sub_if_data *sdata;
+ struct net_device *prev_dev = NULL;
+ struct sta_info *sta;
+ struct rhash_head *tmp;
+ int retry_count;
+ int rates_idx;
+ bool send_to_cooked;
+ bool acked;
+ struct ieee80211_bar *bar;
+ int rtap_len;
+ int shift = 0;
+ int tid = IEEE80211_NUM_TIDS;
+ const struct bucket_table *tbl;
+
+ rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+
+ rcu_read_lock();
+
+ sband = local->hw.wiphy->bands[info->band];
+ fc = hdr->frame_control;
+
+ tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash);
+
+ for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) {
+ /* skip wrong virtual interface */
+ if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
+ continue;
+
+ shift = ieee80211_vif_get_shift(&sta->sdata->vif);
+
+ if (info->flags & IEEE80211_TX_STATUS_EOSP)
+ clear_sta_flag(sta, WLAN_STA_SP);
+
+ acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+ if (!acked && test_sta_flag(sta, WLAN_STA_PS_STA)) {
+ /*
+ * The STA is in power save mode, so assume
+ * that this TX packet failed because of that.
+ */
+ ieee80211_handle_filtered_frame(local, sta, skb);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* mesh Peer Service Period support */
+ if (ieee80211_vif_is_mesh(&sta->sdata->vif) &&
+ ieee80211_is_data_qos(fc))
+ ieee80211_mpsp_trigger_process(
+ ieee80211_get_qos_ctl(hdr),
+ sta, true, acked);
+
+ if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) &&
+ (ieee80211_is_data(hdr->frame_control)) &&
+ (rates_idx != -1))
+ sta->last_tx_rate = info->status.rates[rates_idx];
+
+ if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) &&
+ (ieee80211_is_data_qos(fc))) {
+ u16 ssn;
+ u8 *qc;
+
+ qc = ieee80211_get_qos_ctl(hdr);
+ tid = qc[0] & 0xf;
+ ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10)
+ & IEEE80211_SCTL_SEQ);
+ ieee80211_send_bar(&sta->sdata->vif, hdr->addr1,
+ tid, ssn);
+ } else if (ieee80211_is_data_qos(fc)) {
+ u8 *qc = ieee80211_get_qos_ctl(hdr);
+
+ tid = qc[0] & 0xf;
+ }
+
+ if (!acked && ieee80211_is_back_req(fc)) {
+ u16 control;
+
+ /*
+ * BAR failed, store the last SSN and retry sending
+ * the BAR when the next unicast transmission on the
+ * same TID succeeds.
+ */
+ bar = (struct ieee80211_bar *) skb->data;
+ control = le16_to_cpu(bar->control);
+ if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) {
+ u16 ssn = le16_to_cpu(bar->start_seq_num);
+
+ tid = (control &
+ IEEE80211_BAR_CTRL_TID_INFO_MASK) >>
+ IEEE80211_BAR_CTRL_TID_INFO_SHIFT;
+
+ ieee80211_set_bar_pending(sta, tid, ssn);
+ }
+ }
+
+ if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
+ ieee80211_handle_filtered_frame(local, sta, skb);
+ rcu_read_unlock();
+ return;
+ } else {
+ if (!acked)
+ sta->tx_retry_failed++;
+ sta->tx_retry_count += retry_count;
+
+ if (ieee80211_is_data_present(fc)) {
+ if (!acked)
+ sta->tx_msdu_failed[tid]++;
+ sta->tx_msdu_retries[tid] += retry_count;
+ }
+ }
+
+ rate_control_tx_status(local, sband, sta, skb);
+ if (ieee80211_vif_is_mesh(&sta->sdata->vif))
+ ieee80211s_update_metric(local, sta, skb);
+
+ if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
+ ieee80211_frame_acked(sta, skb);
+
+ if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
+ (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS))
+ ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data,
+ acked, info->status.tx_time);
+
+ if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+ if (info->flags & IEEE80211_TX_STAT_ACK) {
+ if (sta->lost_packets)
+ sta->lost_packets = 0;
+
+ /* Track when last TDLS packet was ACKed */
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
+ sta->last_tdls_pkt_time = jiffies;
+ } else {
+ ieee80211_lost_packet(sta, info);
+ }
+ }
+
+ if (acked)
+ sta->last_ack_signal = info->status.ack_signal;
+ }
+
+ rcu_read_unlock();
+
+ ieee80211_led_tx(local);
+
+ /* SNMP counters
+ * Fragments are passed to low-level drivers as separate skbs, so these
+ * are actually fragments, not frames. Update frame counters only for
+ * the first fragment of the frame. */
+ if ((info->flags & IEEE80211_TX_STAT_ACK) ||
+ (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) {
+ if (ieee80211_is_first_frag(hdr->seq_ctrl)) {
+ local->dot11TransmittedFrameCount++;
+ if (is_multicast_ether_addr(ieee80211_get_DA(hdr)))
+ local->dot11MulticastTransmittedFrameCount++;
+ if (retry_count > 0)
+ local->dot11RetryCount++;
+ if (retry_count > 1)
+ local->dot11MultipleRetryCount++;
+ }
+
+ /* This counter shall be incremented for an acknowledged MPDU
+ * with an individual address in the address 1 field or an MPDU
+ * with a multicast address in the address 1 field of type Data
+ * or Management. */
+ if (!is_multicast_ether_addr(hdr->addr1) ||
+ ieee80211_is_data(fc) ||
+ ieee80211_is_mgmt(fc))
+ local->dot11TransmittedFragmentCount++;
+ } else {
+ if (ieee80211_is_first_frag(hdr->seq_ctrl))
+ local->dot11FailedCount++;
+ }
+
+ if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) &&
+ (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
+ !(info->flags & IEEE80211_TX_CTL_INJECTED) &&
+ local->ps_sdata && !(local->scanning)) {
+ if (info->flags & IEEE80211_TX_STAT_ACK) {
+ local->ps_sdata->u.mgd.flags |=
+ IEEE80211_STA_NULLFUNC_ACKED;
+ } else
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(10));
+ }
+
+ ieee80211_report_used_skb(local, skb, false);
+
+ /* this was a transmitted frame, but now we want to reuse it */
+ skb_orphan(skb);
+
+ /* Need to make a copy before skb->cb gets cleared */
+ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) ||
+ !(ieee80211_is_data(fc));
+
+ /*
+ * This is a bit racy but we can avoid a lot of work
+ * with this test...
+ */
+ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) {
+ dev_kfree_skb(skb);
+ return;
+ }
+
+ /* send frame to monitor interfaces now */
+ rtap_len = ieee80211_tx_radiotap_len(info);
+ if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) {
+ pr_err("ieee80211_tx_status: headroom too small\n");
+ dev_kfree_skb(skb);
+ return;
+ }
+ ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count,
+ rtap_len, shift);
+
+ /* XXX: is this sufficient for BPF? */
+ skb_set_mac_header(skb, 0);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = htons(ETH_P_802_2);
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
+ !send_to_cooked)
+ continue;
+
+ if (prev_dev) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ skb2->dev = prev_dev;
+ netif_rx(skb2);
+ }
+ }
+
+ prev_dev = sdata->dev;
+ }
+ }
+ if (prev_dev) {
+ skb->dev = prev_dev;
+ netif_rx(skb);
+ skb = NULL;
+ }
+ rcu_read_unlock();
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_tx_status);
+
+void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr,
+ num_packets, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(ieee80211_report_low_ack);
+
+void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ ieee80211_report_used_skb(local, skb, true);
+ dev_kfree_skb_any(skb);
+}
+EXPORT_SYMBOL(ieee80211_free_txskb);
+
+void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
+ struct sk_buff_head *skbs)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(skbs)))
+ ieee80211_free_txskb(hw, skb);
+}
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
new file mode 100644
index 000000000..fff0d864a
--- /dev/null
+++ b/net/mac80211/tdls.c
@@ -0,0 +1,1734 @@
+/*
+ * mac80211 TDLS handling code
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2014, Intel Corporation
+ * Copyright 2014 Intel Mobile Communications GmbH
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/log2.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+
+/* give usermode some time for retries in setting up the TDLS session */
+#define TDLS_PEER_SETUP_TIMEOUT (15 * HZ)
+
+void ieee80211_tdls_peer_del_work(struct work_struct *wk)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_local *local;
+
+ sdata = container_of(wk, struct ieee80211_sub_if_data,
+ u.mgd.tdls_peer_del_work.work);
+ local = sdata->local;
+
+ mutex_lock(&local->mtx);
+ if (!is_zero_ether_addr(sdata->u.mgd.tdls_peer)) {
+ tdls_dbg(sdata, "TDLS del peer %pM\n", sdata->u.mgd.tdls_peer);
+ sta_info_destroy_addr(sdata, sdata->u.mgd.tdls_peer);
+ eth_zero_addr(sdata->u.mgd.tdls_peer);
+ }
+ mutex_unlock(&local->mtx);
+}
+
+static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local,
+ struct sk_buff *skb)
+{
+ u8 *pos = (void *)skb_put(skb, 7);
+ bool chan_switch = local->hw.wiphy->features &
+ NL80211_FEATURE_TDLS_CHANNEL_SWITCH;
+
+ *pos++ = WLAN_EID_EXT_CAPABILITY;
+ *pos++ = 5; /* len */
+ *pos++ = 0x0;
+ *pos++ = 0x0;
+ *pos++ = 0x0;
+ *pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0;
+ *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED;
+}
+
+static u8
+ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u16 start, u16 end,
+ u16 spacing)
+{
+ u8 subband_cnt = 0, ch_cnt = 0;
+ struct ieee80211_channel *ch;
+ struct cfg80211_chan_def chandef;
+ int i, subband_start;
+
+ for (i = start; i <= end; i += spacing) {
+ if (!ch_cnt)
+ subband_start = i;
+
+ ch = ieee80211_get_channel(sdata->local->hw.wiphy, i);
+ if (ch) {
+ /* we will be active on the channel */
+ cfg80211_chandef_create(&chandef, ch,
+ NL80211_CHAN_NO_HT);
+ if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy,
+ &chandef,
+ sdata->wdev.iftype)) {
+ ch_cnt++;
+ /*
+ * check if the next channel is also part of
+ * this allowed range
+ */
+ continue;
+ }
+ }
+
+ /*
+ * we've reached the end of a range, with allowed channels
+ * found
+ */
+ if (ch_cnt) {
+ u8 *pos = skb_put(skb, 2);
+ *pos++ = ieee80211_frequency_to_channel(subband_start);
+ *pos++ = ch_cnt;
+
+ subband_cnt++;
+ ch_cnt = 0;
+ }
+ }
+
+ /* all channels in the requested range are allowed - add them here */
+ if (ch_cnt) {
+ u8 *pos = skb_put(skb, 2);
+ *pos++ = ieee80211_frequency_to_channel(subband_start);
+ *pos++ = ch_cnt;
+
+ subband_cnt++;
+ }
+
+ return subband_cnt;
+}
+
+static void
+ieee80211_tdls_add_supp_channels(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ /*
+ * Add possible channels for TDLS. These are channels that are allowed
+ * to be active.
+ */
+ u8 subband_cnt;
+ u8 *pos = skb_put(skb, 2);
+
+ *pos++ = WLAN_EID_SUPPORTED_CHANNELS;
+
+ /*
+ * 5GHz and 2GHz channels numbers can overlap. Ignore this for now, as
+ * this doesn't happen in real world scenarios.
+ */
+
+ /* 2GHz, with 5MHz spacing */
+ subband_cnt = ieee80211_tdls_add_subband(sdata, skb, 2412, 2472, 5);
+
+ /* 5GHz, with 20MHz spacing */
+ subband_cnt += ieee80211_tdls_add_subband(sdata, skb, 5000, 5825, 20);
+
+ /* length */
+ *pos = 2 * subband_cnt;
+}
+
+static void ieee80211_tdls_add_oper_classes(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ u8 *pos;
+ u8 op_class;
+
+ if (!ieee80211_chandef_to_operating_class(&sdata->vif.bss_conf.chandef,
+ &op_class))
+ return;
+
+ pos = skb_put(skb, 4);
+ *pos++ = WLAN_EID_SUPPORTED_REGULATORY_CLASSES;
+ *pos++ = 2; /* len */
+
+ *pos++ = op_class;
+ *pos++ = op_class; /* give current operating class as alternate too */
+}
+
+static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb)
+{
+ u8 *pos = (void *)skb_put(skb, 3);
+
+ *pos++ = WLAN_EID_BSS_COEX_2040;
+ *pos++ = 1; /* len */
+
+ *pos++ = WLAN_BSS_COEX_INFORMATION_REQUEST;
+}
+
+static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata,
+ u16 status_code)
+{
+ struct ieee80211_local *local = sdata->local;
+ u16 capab;
+
+ /* The capability will be 0 when sending a failure code */
+ if (status_code != 0)
+ return 0;
+
+ capab = 0;
+ if (ieee80211_get_sdata_band(sdata) != IEEE80211_BAND_2GHZ)
+ return capab;
+
+ if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+ capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
+ if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
+ capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+
+ return capab;
+}
+
+static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ bool initiator)
+{
+ struct ieee80211_tdls_lnkie *lnkid;
+ const u8 *init_addr, *rsp_addr;
+
+ if (initiator) {
+ init_addr = sdata->vif.addr;
+ rsp_addr = peer;
+ } else {
+ init_addr = peer;
+ rsp_addr = sdata->vif.addr;
+ }
+
+ lnkid = (void *)skb_put(skb, sizeof(struct ieee80211_tdls_lnkie));
+
+ lnkid->ie_type = WLAN_EID_LINK_ID;
+ lnkid->ie_len = sizeof(struct ieee80211_tdls_lnkie) - 2;
+
+ memcpy(lnkid->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(lnkid->init_sta, init_addr, ETH_ALEN);
+ memcpy(lnkid->resp_sta, rsp_addr, ETH_ALEN);
+}
+
+static void
+ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ u8 *pos = (void *)skb_put(skb, 4);
+
+ *pos++ = WLAN_EID_AID;
+ *pos++ = 2; /* len */
+ put_unaligned_le16(ifmgd->aid, pos);
+}
+
+/* translate numbering in the WMM parameter IE to the mac80211 notation */
+static enum ieee80211_ac_numbers ieee80211_ac_from_wmm(int ac)
+{
+ switch (ac) {
+ default:
+ WARN_ON_ONCE(1);
+ case 0:
+ return IEEE80211_AC_BE;
+ case 1:
+ return IEEE80211_AC_BK;
+ case 2:
+ return IEEE80211_AC_VI;
+ case 3:
+ return IEEE80211_AC_VO;
+ }
+}
+
+static u8 ieee80211_wmm_aci_aifsn(int aifsn, bool acm, int aci)
+{
+ u8 ret;
+
+ ret = aifsn & 0x0f;
+ if (acm)
+ ret |= 0x10;
+ ret |= (aci << 5) & 0x60;
+ return ret;
+}
+
+static u8 ieee80211_wmm_ecw(u16 cw_min, u16 cw_max)
+{
+ return ((ilog2(cw_min + 1) << 0x0) & 0x0f) |
+ ((ilog2(cw_max + 1) << 0x4) & 0xf0);
+}
+
+static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_wmm_param_ie *wmm;
+ struct ieee80211_tx_queue_params *txq;
+ int i;
+
+ wmm = (void *)skb_put(skb, sizeof(*wmm));
+ memset(wmm, 0, sizeof(*wmm));
+
+ wmm->element_id = WLAN_EID_VENDOR_SPECIFIC;
+ wmm->len = sizeof(*wmm) - 2;
+
+ wmm->oui[0] = 0x00; /* Microsoft OUI 00:50:F2 */
+ wmm->oui[1] = 0x50;
+ wmm->oui[2] = 0xf2;
+ wmm->oui_type = 2; /* WME */
+ wmm->oui_subtype = 1; /* WME param */
+ wmm->version = 1; /* WME ver */
+ wmm->qos_info = 0; /* U-APSD not in use */
+
+ /*
+ * Use the EDCA parameters defined for the BSS, or default if the AP
+ * doesn't support it, as mandated by 802.11-2012 section 10.22.4
+ */
+ for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+ txq = &sdata->tx_conf[ieee80211_ac_from_wmm(i)];
+ wmm->ac[i].aci_aifsn = ieee80211_wmm_aci_aifsn(txq->aifs,
+ txq->acm, i);
+ wmm->ac[i].cw = ieee80211_wmm_ecw(txq->cw_min, txq->cw_max);
+ wmm->ac[i].txop_limit = cpu_to_le16(txq->txop);
+ }
+}
+
+static void
+ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ u8 action_code, bool initiator,
+ const u8 *extra_ies, size_t extra_ies_len)
+{
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_sta_ht_cap ht_cap;
+ struct ieee80211_sta_vht_cap vht_cap;
+ struct sta_info *sta = NULL;
+ size_t offset = 0, noffset;
+ u8 *pos;
+
+ ieee80211_add_srates_ie(sdata, skb, false, band);
+ ieee80211_add_ext_srates_ie(sdata, skb, false, band);
+ ieee80211_tdls_add_supp_channels(sdata, skb);
+
+ /* add any custom IEs that go before Extended Capabilities */
+ if (extra_ies_len) {
+ static const u8 before_ext_cap[] = {
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_COUNTRY,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_SUPPORTED_CHANNELS,
+ WLAN_EID_RSN,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_ext_cap,
+ ARRAY_SIZE(before_ext_cap),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ ieee80211_tdls_add_ext_capab(local, skb);
+
+ /* add the QoS element if we support it */
+ if (local->hw.queues >= IEEE80211_NUM_ACS &&
+ action_code != WLAN_PUB_ACTION_TDLS_DISCOVER_RES)
+ ieee80211_add_wmm_info_ie(skb_put(skb, 9), 0); /* no U-APSD */
+
+ /* add any custom IEs that go before HT capabilities */
+ if (extra_ies_len) {
+ static const u8 before_ht_cap[] = {
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_COUNTRY,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_SUPPORTED_CHANNELS,
+ WLAN_EID_RSN,
+ WLAN_EID_EXT_CAPABILITY,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_FAST_BSS_TRANSITION,
+ WLAN_EID_TIMEOUT_INTERVAL,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_ht_cap,
+ ARRAY_SIZE(before_ht_cap),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ rcu_read_lock();
+
+ /* we should have the peer STA if we're already responding */
+ if (action_code == WLAN_TDLS_SETUP_RESPONSE) {
+ sta = sta_info_get(sdata, peer);
+ if (WARN_ON_ONCE(!sta)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+
+ ieee80211_tdls_add_oper_classes(sdata, skb);
+
+ /*
+ * with TDLS we can switch channels, and HT-caps are not necessarily
+ * the same on all bands. The specification limits the setup to a
+ * single HT-cap, so use the current band for now.
+ */
+ sband = local->hw.wiphy->bands[band];
+ memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap));
+
+ if ((action_code == WLAN_TDLS_SETUP_REQUEST ||
+ action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) &&
+ ht_cap.ht_supported) {
+ ieee80211_apply_htcap_overrides(sdata, &ht_cap);
+
+ /* disable SMPS in TDLS initiator */
+ ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED
+ << IEEE80211_HT_CAP_SM_PS_SHIFT;
+
+ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2);
+ ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
+ } else if (action_code == WLAN_TDLS_SETUP_RESPONSE &&
+ ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) {
+ /* disable SMPS in TDLS responder */
+ sta->sta.ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED
+ << IEEE80211_HT_CAP_SM_PS_SHIFT;
+
+ /* the peer caps are already intersected with our own */
+ memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap));
+
+ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2);
+ ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
+ }
+
+ if (ht_cap.ht_supported &&
+ (ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40))
+ ieee80211_tdls_add_bss_coex_ie(skb);
+
+ ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
+
+ /* add any custom IEs that go before VHT capabilities */
+ if (extra_ies_len) {
+ static const u8 before_vht_cap[] = {
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_COUNTRY,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_SUPPORTED_CHANNELS,
+ WLAN_EID_RSN,
+ WLAN_EID_EXT_CAPABILITY,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_FAST_BSS_TRANSITION,
+ WLAN_EID_TIMEOUT_INTERVAL,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ WLAN_EID_MULTI_BAND,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_vht_cap,
+ ARRAY_SIZE(before_vht_cap),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ /* build the VHT-cap similarly to the HT-cap */
+ memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap));
+ if ((action_code == WLAN_TDLS_SETUP_REQUEST ||
+ action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) &&
+ vht_cap.vht_supported) {
+ ieee80211_apply_vhtcap_overrides(sdata, &vht_cap);
+
+ /* the AID is present only when VHT is implemented */
+ if (action_code == WLAN_TDLS_SETUP_REQUEST)
+ ieee80211_tdls_add_aid(sdata, skb);
+
+ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2);
+ ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap);
+ } else if (action_code == WLAN_TDLS_SETUP_RESPONSE &&
+ vht_cap.vht_supported && sta->sta.vht_cap.vht_supported) {
+ /* the peer caps are already intersected with our own */
+ memcpy(&vht_cap, &sta->sta.vht_cap, sizeof(vht_cap));
+
+ /* the AID is present only when VHT is implemented */
+ ieee80211_tdls_add_aid(sdata, skb);
+
+ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2);
+ ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap);
+ }
+
+ rcu_read_unlock();
+
+ /* add any remaining IEs */
+ if (extra_ies_len) {
+ noffset = extra_ies_len;
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ }
+
+}
+
+static void
+ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ size_t offset = 0, noffset;
+ struct sta_info *sta, *ap_sta;
+ enum ieee80211_band band = ieee80211_get_sdata_band(sdata);
+ u8 *pos;
+
+ rcu_read_lock();
+
+ sta = sta_info_get(sdata, peer);
+ ap_sta = sta_info_get(sdata, ifmgd->bssid);
+ if (WARN_ON_ONCE(!sta || !ap_sta)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* add any custom IEs that go before the QoS IE */
+ if (extra_ies_len) {
+ static const u8 before_qos[] = {
+ WLAN_EID_RSN,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_qos,
+ ARRAY_SIZE(before_qos),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ /* add the QoS param IE if both the peer and we support it */
+ if (local->hw.queues >= IEEE80211_NUM_ACS && sta->sta.wme)
+ ieee80211_tdls_add_wmm_param_ie(sdata, skb);
+
+ /* add any custom IEs that go before HT operation */
+ if (extra_ies_len) {
+ static const u8 before_ht_op[] = {
+ WLAN_EID_RSN,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_FAST_BSS_TRANSITION,
+ WLAN_EID_TIMEOUT_INTERVAL,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_ht_op,
+ ARRAY_SIZE(before_ht_op),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ /* if HT support is only added in TDLS, we need an HT-operation IE */
+ if (!ap_sta->sta.ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) {
+ struct ieee80211_chanctx_conf *chanctx_conf =
+ rcu_dereference(sdata->vif.chanctx_conf);
+ if (!WARN_ON(!chanctx_conf)) {
+ pos = skb_put(skb, 2 +
+ sizeof(struct ieee80211_ht_operation));
+ /* send an empty HT operation IE */
+ ieee80211_ie_build_ht_oper(pos, &sta->sta.ht_cap,
+ &chanctx_conf->def, 0);
+ }
+ }
+
+ ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
+
+ /* only include VHT-operation if not on the 2.4GHz band */
+ if (band != IEEE80211_BAND_2GHZ && !ap_sta->sta.vht_cap.vht_supported &&
+ sta->sta.vht_cap.vht_supported) {
+ struct ieee80211_chanctx_conf *chanctx_conf =
+ rcu_dereference(sdata->vif.chanctx_conf);
+ if (!WARN_ON(!chanctx_conf)) {
+ pos = skb_put(skb, 2 +
+ sizeof(struct ieee80211_vht_operation));
+ ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap,
+ &chanctx_conf->def);
+ }
+ }
+
+ rcu_read_unlock();
+
+ /* add any remaining IEs */
+ if (extra_ies_len) {
+ noffset = extra_ies_len;
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ }
+}
+
+static void
+ieee80211_tdls_add_chan_switch_req_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len, u8 oper_class,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_tdls_data *tf;
+ size_t offset = 0, noffset;
+ u8 *pos;
+
+ if (WARN_ON_ONCE(!chandef))
+ return;
+
+ tf = (void *)skb->data;
+ tf->u.chan_switch_req.target_channel =
+ ieee80211_frequency_to_channel(chandef->chan->center_freq);
+ tf->u.chan_switch_req.oper_class = oper_class;
+
+ if (extra_ies_len) {
+ static const u8 before_lnkie[] = {
+ WLAN_EID_SECONDARY_CHANNEL_OFFSET,
+ };
+ noffset = ieee80211_ie_split(extra_ies, extra_ies_len,
+ before_lnkie,
+ ARRAY_SIZE(before_lnkie),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ offset = noffset;
+ }
+
+ ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
+
+ /* add any remaining IEs */
+ if (extra_ies_len) {
+ noffset = extra_ies_len;
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, extra_ies + offset, noffset - offset);
+ }
+}
+
+static void
+ieee80211_tdls_add_chan_switch_resp_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ u16 status_code, bool initiator,
+ const u8 *extra_ies,
+ size_t extra_ies_len)
+{
+ if (status_code == 0)
+ ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
+
+ if (extra_ies_len)
+ memcpy(skb_put(skb, extra_ies_len), extra_ies, extra_ies_len);
+}
+
+static void ieee80211_tdls_add_ies(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, const u8 *peer,
+ u8 action_code, u16 status_code,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len, u8 oper_class,
+ struct cfg80211_chan_def *chandef)
+{
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ case WLAN_TDLS_SETUP_RESPONSE:
+ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
+ if (status_code == 0)
+ ieee80211_tdls_add_setup_start_ies(sdata, skb, peer,
+ action_code,
+ initiator,
+ extra_ies,
+ extra_ies_len);
+ break;
+ case WLAN_TDLS_SETUP_CONFIRM:
+ if (status_code == 0)
+ ieee80211_tdls_add_setup_cfm_ies(sdata, skb, peer,
+ initiator, extra_ies,
+ extra_ies_len);
+ break;
+ case WLAN_TDLS_TEARDOWN:
+ case WLAN_TDLS_DISCOVERY_REQUEST:
+ if (extra_ies_len)
+ memcpy(skb_put(skb, extra_ies_len), extra_ies,
+ extra_ies_len);
+ if (status_code == 0 || action_code == WLAN_TDLS_TEARDOWN)
+ ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator);
+ break;
+ case WLAN_TDLS_CHANNEL_SWITCH_REQUEST:
+ ieee80211_tdls_add_chan_switch_req_ies(sdata, skb, peer,
+ initiator, extra_ies,
+ extra_ies_len,
+ oper_class, chandef);
+ break;
+ case WLAN_TDLS_CHANNEL_SWITCH_RESPONSE:
+ ieee80211_tdls_add_chan_switch_resp_ies(sdata, skb, peer,
+ status_code,
+ initiator, extra_ies,
+ extra_ies_len);
+ break;
+ }
+
+}
+
+static int
+ieee80211_prep_tdls_encap_data(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, struct sk_buff *skb)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_tdls_data *tf;
+
+ tf = (void *)skb_put(skb, offsetof(struct ieee80211_tdls_data, u));
+
+ memcpy(tf->da, peer, ETH_ALEN);
+ memcpy(tf->sa, sdata->vif.addr, ETH_ALEN);
+ tf->ether_type = cpu_to_be16(ETH_P_TDLS);
+ tf->payload_type = WLAN_TDLS_SNAP_RFTYPE;
+
+ /* network header is after the ethernet header */
+ skb_set_network_header(skb, ETH_HLEN);
+
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_SETUP_REQUEST;
+
+ skb_put(skb, sizeof(tf->u.setup_req));
+ tf->u.setup_req.dialog_token = dialog_token;
+ tf->u.setup_req.capability =
+ cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata,
+ status_code));
+ break;
+ case WLAN_TDLS_SETUP_RESPONSE:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_SETUP_RESPONSE;
+
+ skb_put(skb, sizeof(tf->u.setup_resp));
+ tf->u.setup_resp.status_code = cpu_to_le16(status_code);
+ tf->u.setup_resp.dialog_token = dialog_token;
+ tf->u.setup_resp.capability =
+ cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata,
+ status_code));
+ break;
+ case WLAN_TDLS_SETUP_CONFIRM:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_SETUP_CONFIRM;
+
+ skb_put(skb, sizeof(tf->u.setup_cfm));
+ tf->u.setup_cfm.status_code = cpu_to_le16(status_code);
+ tf->u.setup_cfm.dialog_token = dialog_token;
+ break;
+ case WLAN_TDLS_TEARDOWN:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_TEARDOWN;
+
+ skb_put(skb, sizeof(tf->u.teardown));
+ tf->u.teardown.reason_code = cpu_to_le16(status_code);
+ break;
+ case WLAN_TDLS_DISCOVERY_REQUEST:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_DISCOVERY_REQUEST;
+
+ skb_put(skb, sizeof(tf->u.discover_req));
+ tf->u.discover_req.dialog_token = dialog_token;
+ break;
+ case WLAN_TDLS_CHANNEL_SWITCH_REQUEST:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_CHANNEL_SWITCH_REQUEST;
+
+ skb_put(skb, sizeof(tf->u.chan_switch_req));
+ break;
+ case WLAN_TDLS_CHANNEL_SWITCH_RESPONSE:
+ tf->category = WLAN_CATEGORY_TDLS;
+ tf->action_code = WLAN_TDLS_CHANNEL_SWITCH_RESPONSE;
+
+ skb_put(skb, sizeof(tf->u.chan_switch_resp));
+ tf->u.chan_switch_resp.status_code = cpu_to_le16(status_code);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, struct sk_buff *skb)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_mgmt *mgmt;
+
+ mgmt = (void *)skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ memcpy(mgmt->da, peer, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ switch (action_code) {
+ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
+ skb_put(skb, 1 + sizeof(mgmt->u.action.u.tdls_discover_resp));
+ mgmt->u.action.category = WLAN_CATEGORY_PUBLIC;
+ mgmt->u.action.u.tdls_discover_resp.action_code =
+ WLAN_PUB_ACTION_TDLS_DISCOVER_RES;
+ mgmt->u.action.u.tdls_discover_resp.dialog_token =
+ dialog_token;
+ mgmt->u.action.u.tdls_discover_resp.capability =
+ cpu_to_le16(ieee80211_get_tdls_sta_capab(sdata,
+ status_code));
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct sk_buff *
+ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata,
+ const u8 *peer, u8 action_code,
+ u8 dialog_token, u16 status_code,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len, u8 oper_class,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ int ret;
+
+ skb = netdev_alloc_skb(sdata->dev,
+ local->hw.extra_tx_headroom +
+ max(sizeof(struct ieee80211_mgmt),
+ sizeof(struct ieee80211_tdls_data)) +
+ 50 + /* supported rates */
+ 7 + /* ext capab */
+ 26 + /* max(WMM-info, WMM-param) */
+ 2 + max(sizeof(struct ieee80211_ht_cap),
+ sizeof(struct ieee80211_ht_operation)) +
+ 2 + max(sizeof(struct ieee80211_vht_cap),
+ sizeof(struct ieee80211_vht_operation)) +
+ 50 + /* supported channels */
+ 3 + /* 40/20 BSS coex */
+ 4 + /* AID */
+ 4 + /* oper classes */
+ extra_ies_len +
+ sizeof(struct ieee80211_tdls_lnkie));
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ case WLAN_TDLS_SETUP_RESPONSE:
+ case WLAN_TDLS_SETUP_CONFIRM:
+ case WLAN_TDLS_TEARDOWN:
+ case WLAN_TDLS_DISCOVERY_REQUEST:
+ case WLAN_TDLS_CHANNEL_SWITCH_REQUEST:
+ case WLAN_TDLS_CHANNEL_SWITCH_RESPONSE:
+ ret = ieee80211_prep_tdls_encap_data(local->hw.wiphy,
+ sdata->dev, peer,
+ action_code, dialog_token,
+ status_code, skb);
+ break;
+ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
+ ret = ieee80211_prep_tdls_direct(local->hw.wiphy, sdata->dev,
+ peer, action_code,
+ dialog_token, status_code,
+ skb);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+
+ if (ret < 0)
+ goto fail;
+
+ ieee80211_tdls_add_ies(sdata, skb, peer, action_code, status_code,
+ initiator, extra_ies, extra_ies_len, oper_class,
+ chandef);
+ return skb;
+
+fail:
+ dev_kfree_skb(skb);
+ return NULL;
+}
+
+static int
+ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len, u8 oper_class,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sk_buff *skb = NULL;
+ struct sta_info *sta;
+ u32 flags = 0;
+ int ret = 0;
+
+ rcu_read_lock();
+ sta = sta_info_get(sdata, peer);
+
+ /* infer the initiator if we can, to support old userspace */
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ if (sta) {
+ set_sta_flag(sta, WLAN_STA_TDLS_INITIATOR);
+ sta->sta.tdls_initiator = false;
+ }
+ /* fall-through */
+ case WLAN_TDLS_SETUP_CONFIRM:
+ case WLAN_TDLS_DISCOVERY_REQUEST:
+ initiator = true;
+ break;
+ case WLAN_TDLS_SETUP_RESPONSE:
+ /*
+ * In some testing scenarios, we send a request and response.
+ * Make the last packet sent take effect for the initiator
+ * value.
+ */
+ if (sta) {
+ clear_sta_flag(sta, WLAN_STA_TDLS_INITIATOR);
+ sta->sta.tdls_initiator = true;
+ }
+ /* fall-through */
+ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
+ initiator = false;
+ break;
+ case WLAN_TDLS_TEARDOWN:
+ case WLAN_TDLS_CHANNEL_SWITCH_REQUEST:
+ case WLAN_TDLS_CHANNEL_SWITCH_RESPONSE:
+ /* any value is ok */
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+
+ if (sta && test_sta_flag(sta, WLAN_STA_TDLS_INITIATOR))
+ initiator = true;
+
+ rcu_read_unlock();
+ if (ret < 0)
+ goto fail;
+
+ skb = ieee80211_tdls_build_mgmt_packet_data(sdata, peer, action_code,
+ dialog_token, status_code,
+ initiator, extra_ies,
+ extra_ies_len, oper_class,
+ chandef);
+ if (!skb) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ if (action_code == WLAN_PUB_ACTION_TDLS_DISCOVER_RES) {
+ ieee80211_tx_skb(sdata, skb);
+ return 0;
+ }
+
+ /*
+ * According to 802.11z: Setup req/resp are sent in AC_BK, otherwise
+ * we should default to AC_VI.
+ */
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ case WLAN_TDLS_SETUP_RESPONSE:
+ skb_set_queue_mapping(skb, IEEE80211_AC_BK);
+ skb->priority = 2;
+ break;
+ default:
+ skb_set_queue_mapping(skb, IEEE80211_AC_VI);
+ skb->priority = 5;
+ break;
+ }
+
+ /*
+ * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress.
+ * Later, if no ACK is returned from peer, we will re-send the teardown
+ * packet through the AP.
+ */
+ if ((action_code == WLAN_TDLS_TEARDOWN) &&
+ (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)) {
+ bool try_resend; /* Should we keep skb for possible resend */
+
+ /* If not sending directly to peer - no point in keeping skb */
+ rcu_read_lock();
+ sta = sta_info_get(sdata, peer);
+ try_resend = sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
+ rcu_read_unlock();
+
+ spin_lock_bh(&sdata->u.mgd.teardown_lock);
+ if (try_resend && !sdata->u.mgd.teardown_skb) {
+ /* Mark it as requiring TX status callback */
+ flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_INTFL_MLME_CONN_TX;
+
+ /*
+ * skb is copied since mac80211 will later set
+ * properties that might not be the same as the AP,
+ * such as encryption, QoS, addresses, etc.
+ *
+ * No problem if skb_copy() fails, so no need to check.
+ */
+ sdata->u.mgd.teardown_skb = skb_copy(skb, GFP_ATOMIC);
+ sdata->u.mgd.orig_teardown_skb = skb;
+ }
+ spin_unlock_bh(&sdata->u.mgd.teardown_lock);
+ }
+
+ /* disable bottom halves when entering the Tx path */
+ local_bh_disable();
+ __ieee80211_subif_start_xmit(skb, dev, flags);
+ local_bh_enable();
+
+ return ret;
+
+fail:
+ dev_kfree_skb(skb);
+ return ret;
+}
+
+static int
+ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability, bool initiator,
+ const u8 *extra_ies, size_t extra_ies_len)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+
+ mutex_lock(&local->mtx);
+
+ /* we don't support concurrent TDLS peer setups */
+ if (!is_zero_ether_addr(sdata->u.mgd.tdls_peer) &&
+ !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * make sure we have a STA representing the peer so we drop or buffer
+ * non-TDLS-setup frames to the peer. We can't send other packets
+ * during setup through the AP path.
+ * Allow error packets to be sent - sometimes we don't even add a STA
+ * before failing the setup.
+ */
+ if (status_code == 0) {
+ rcu_read_lock();
+ if (!sta_info_get(sdata, peer)) {
+ rcu_read_unlock();
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
+
+ ieee80211_flush_queues(local, sdata, false);
+ memcpy(sdata->u.mgd.tdls_peer, peer, ETH_ALEN);
+ mutex_unlock(&local->mtx);
+
+ /* we cannot take the mutex while preparing the setup packet */
+ ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, action_code,
+ dialog_token, status_code,
+ peer_capability, initiator,
+ extra_ies, extra_ies_len, 0,
+ NULL);
+ if (ret < 0) {
+ mutex_lock(&local->mtx);
+ eth_zero_addr(sdata->u.mgd.tdls_peer);
+ mutex_unlock(&local->mtx);
+ return ret;
+ }
+
+ ieee80211_queue_delayed_work(&sdata->local->hw,
+ &sdata->u.mgd.tdls_peer_del_work,
+ TDLS_PEER_SETUP_TIMEOUT);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+static int
+ieee80211_tdls_mgmt_teardown(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int ret;
+
+ /*
+ * No packets can be transmitted to the peer via the AP during setup -
+ * the STA is set as a TDLS peer, but is not authorized.
+ * During teardown, we prevent direct transmissions by stopping the
+ * queues and flushing all direct packets.
+ */
+ ieee80211_stop_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN);
+ ieee80211_flush_queues(local, sdata, false);
+
+ ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer, action_code,
+ dialog_token, status_code,
+ peer_capability, initiator,
+ extra_ies, extra_ies_len, 0,
+ NULL);
+ if (ret < 0)
+ sdata_err(sdata, "Failed sending TDLS teardown packet %d\n",
+ ret);
+
+ /*
+ * Remove the STA AUTH flag to force further traffic through the AP. If
+ * the STA was unreachable, it was already removed.
+ */
+ rcu_read_lock();
+ sta = sta_info_get(sdata, peer);
+ if (sta)
+ clear_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
+ rcu_read_unlock();
+
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN);
+
+ return 0;
+}
+
+int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *extra_ies,
+ size_t extra_ies_len)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int ret;
+
+ if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS))
+ return -ENOTSUPP;
+
+ /* make sure we are in managed mode, and associated */
+ if (sdata->vif.type != NL80211_IFTYPE_STATION ||
+ !sdata->u.mgd.associated)
+ return -EINVAL;
+
+ switch (action_code) {
+ case WLAN_TDLS_SETUP_REQUEST:
+ case WLAN_TDLS_SETUP_RESPONSE:
+ ret = ieee80211_tdls_mgmt_setup(wiphy, dev, peer, action_code,
+ dialog_token, status_code,
+ peer_capability, initiator,
+ extra_ies, extra_ies_len);
+ break;
+ case WLAN_TDLS_TEARDOWN:
+ ret = ieee80211_tdls_mgmt_teardown(wiphy, dev, peer,
+ action_code, dialog_token,
+ status_code,
+ peer_capability, initiator,
+ extra_ies, extra_ies_len);
+ break;
+ case WLAN_TDLS_DISCOVERY_REQUEST:
+ /*
+ * Protect the discovery so we can hear the TDLS discovery
+ * response frame. It is transmitted directly and not buffered
+ * by the AP.
+ */
+ drv_mgd_protect_tdls_discover(sdata->local, sdata);
+ /* fall-through */
+ case WLAN_TDLS_SETUP_CONFIRM:
+ case WLAN_PUB_ACTION_TDLS_DISCOVER_RES:
+ /* no special handling */
+ ret = ieee80211_tdls_prep_mgmt_packet(wiphy, dev, peer,
+ action_code,
+ dialog_token,
+ status_code,
+ peer_capability,
+ initiator, extra_ies,
+ extra_ies_len, 0, NULL);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ tdls_dbg(sdata, "TDLS mgmt action %d peer %pM status %d\n",
+ action_code, peer, ret);
+ return ret;
+}
+
+int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *peer, enum nl80211_tdls_operation oper)
+{
+ struct sta_info *sta;
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+
+ if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS))
+ return -ENOTSUPP;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return -EINVAL;
+
+ switch (oper) {
+ case NL80211_TDLS_ENABLE_LINK:
+ case NL80211_TDLS_DISABLE_LINK:
+ break;
+ case NL80211_TDLS_TEARDOWN:
+ case NL80211_TDLS_SETUP:
+ case NL80211_TDLS_DISCOVERY_REQ:
+ /* We don't support in-driver setup/teardown/discovery */
+ return -ENOTSUPP;
+ }
+
+ mutex_lock(&local->mtx);
+ tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
+
+ switch (oper) {
+ case NL80211_TDLS_ENABLE_LINK:
+ rcu_read_lock();
+ sta = sta_info_get(sdata, peer);
+ if (!sta) {
+ rcu_read_unlock();
+ ret = -ENOLINK;
+ break;
+ }
+
+ set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
+ rcu_read_unlock();
+
+ WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
+ !ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
+ ret = 0;
+ break;
+ case NL80211_TDLS_DISABLE_LINK:
+ /*
+ * The teardown message in ieee80211_tdls_mgmt_teardown() was
+ * created while the queues were stopped, so it might still be
+ * pending. Before flushing the queues we need to be sure the
+ * message is handled by the tasklet handling pending messages,
+ * otherwise we might start destroying the station before
+ * sending the teardown packet.
+ * Note that this only forces the tasklet to flush pendings -
+ * not to stop the tasklet from rescheduling itself.
+ */
+ tasklet_kill(&local->tx_pending_tasklet);
+ /* flush a potentially queued teardown packet */
+ ieee80211_flush_queues(local, sdata, false);
+
+ ret = sta_info_destroy_addr(sdata, peer);
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+
+ if (ret == 0 && ether_addr_equal(sdata->u.mgd.tdls_peer, peer)) {
+ cancel_delayed_work(&sdata->u.mgd.tdls_peer_del_work);
+ eth_zero_addr(sdata->u.mgd.tdls_peer);
+ }
+
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+void ieee80211_tdls_oper_request(struct ieee80211_vif *vif, const u8 *peer,
+ enum nl80211_tdls_operation oper,
+ u16 reason_code, gfp_t gfp)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (vif->type != NL80211_IFTYPE_STATION || !vif->bss_conf.assoc) {
+ sdata_err(sdata, "Discarding TDLS oper %d - not STA or disconnected\n",
+ oper);
+ return;
+ }
+
+ cfg80211_tdls_oper_request(sdata->dev, peer, oper, reason_code, gfp);
+}
+EXPORT_SYMBOL(ieee80211_tdls_oper_request);
+
+static void
+iee80211_tdls_add_ch_switch_timing(u8 *buf, u16 switch_time, u16 switch_timeout)
+{
+ struct ieee80211_ch_switch_timing *ch_sw;
+
+ *buf++ = WLAN_EID_CHAN_SWITCH_TIMING;
+ *buf++ = sizeof(struct ieee80211_ch_switch_timing);
+
+ ch_sw = (void *)buf;
+ ch_sw->switch_time = cpu_to_le16(switch_time);
+ ch_sw->switch_timeout = cpu_to_le16(switch_timeout);
+}
+
+/* find switch timing IE in SKB ready for Tx */
+static const u8 *ieee80211_tdls_find_sw_timing_ie(struct sk_buff *skb)
+{
+ struct ieee80211_tdls_data *tf;
+ const u8 *ie_start;
+
+ /*
+ * Get the offset for the new location of the switch timing IE.
+ * The SKB network header will now point to the "payload_type"
+ * element of the TDLS data frame struct.
+ */
+ tf = container_of(skb->data + skb_network_offset(skb),
+ struct ieee80211_tdls_data, payload_type);
+ ie_start = tf->u.chan_switch_req.variable;
+ return cfg80211_find_ie(WLAN_EID_CHAN_SWITCH_TIMING, ie_start,
+ skb->len - (ie_start - skb->data));
+}
+
+static struct sk_buff *
+ieee80211_tdls_ch_sw_tmpl_get(struct sta_info *sta, u8 oper_class,
+ struct cfg80211_chan_def *chandef,
+ u32 *ch_sw_tm_ie_offset)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u8 extra_ies[2 + sizeof(struct ieee80211_sec_chan_offs_ie) +
+ 2 + sizeof(struct ieee80211_ch_switch_timing)];
+ int extra_ies_len = 2 + sizeof(struct ieee80211_ch_switch_timing);
+ u8 *pos = extra_ies;
+ struct sk_buff *skb;
+
+ /*
+ * if chandef points to a wide channel add a Secondary-Channel
+ * Offset information element
+ */
+ if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ struct ieee80211_sec_chan_offs_ie *sec_chan_ie;
+ bool ht40plus;
+
+ *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET;
+ *pos++ = sizeof(*sec_chan_ie);
+ sec_chan_ie = (void *)pos;
+
+ ht40plus = cfg80211_get_chandef_type(chandef) ==
+ NL80211_CHAN_HT40PLUS;
+ sec_chan_ie->sec_chan_offs = ht40plus ?
+ IEEE80211_HT_PARAM_CHA_SEC_ABOVE :
+ IEEE80211_HT_PARAM_CHA_SEC_BELOW;
+ pos += sizeof(*sec_chan_ie);
+
+ extra_ies_len += 2 + sizeof(struct ieee80211_sec_chan_offs_ie);
+ }
+
+ /* just set the values to 0, this is a template */
+ iee80211_tdls_add_ch_switch_timing(pos, 0, 0);
+
+ skb = ieee80211_tdls_build_mgmt_packet_data(sdata, sta->sta.addr,
+ WLAN_TDLS_CHANNEL_SWITCH_REQUEST,
+ 0, 0, !sta->sta.tdls_initiator,
+ extra_ies, extra_ies_len,
+ oper_class, chandef);
+ if (!skb)
+ return NULL;
+
+ skb = ieee80211_build_data_template(sdata, skb, 0);
+ if (IS_ERR(skb)) {
+ tdls_dbg(sdata, "Failed building TDLS channel switch frame\n");
+ return NULL;
+ }
+
+ if (ch_sw_tm_ie_offset) {
+ const u8 *tm_ie = ieee80211_tdls_find_sw_timing_ie(skb);
+
+ if (!tm_ie) {
+ tdls_dbg(sdata, "No switch timing IE in TDLS switch\n");
+ dev_kfree_skb_any(skb);
+ return NULL;
+ }
+
+ *ch_sw_tm_ie_offset = tm_ie - skb->data;
+ }
+
+ tdls_dbg(sdata,
+ "TDLS channel switch request template for %pM ch %d width %d\n",
+ sta->sta.addr, chandef->chan->center_freq, chandef->width);
+ return skb;
+}
+
+int
+ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev,
+ const u8 *addr, u8 oper_class,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ struct sk_buff *skb = NULL;
+ u32 ch_sw_tm_ie;
+ int ret;
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get(sdata, addr);
+ if (!sta) {
+ tdls_dbg(sdata,
+ "Invalid TDLS peer %pM for channel switch request\n",
+ addr);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH)) {
+ tdls_dbg(sdata, "TDLS channel switch unsupported by %pM\n",
+ addr);
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ skb = ieee80211_tdls_ch_sw_tmpl_get(sta, oper_class, chandef,
+ &ch_sw_tm_ie);
+ if (!skb) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = drv_tdls_channel_switch(local, sdata, &sta->sta, oper_class,
+ chandef, skb, ch_sw_tm_ie);
+ if (!ret)
+ set_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL);
+
+out:
+ mutex_unlock(&local->sta_mtx);
+ dev_kfree_skb_any(skb);
+ return ret;
+}
+
+void
+ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy,
+ struct net_device *dev,
+ const u8 *addr)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get(sdata, addr);
+ if (!sta) {
+ tdls_dbg(sdata,
+ "Invalid TDLS peer %pM for channel switch cancel\n",
+ addr);
+ goto out;
+ }
+
+ if (!test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) {
+ tdls_dbg(sdata, "TDLS channel switch not initiated by %pM\n",
+ addr);
+ goto out;
+ }
+
+ drv_tdls_cancel_channel_switch(local, sdata, &sta->sta);
+ clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL);
+
+out:
+ mutex_unlock(&local->sta_mtx);
+}
+
+static struct sk_buff *
+ieee80211_tdls_ch_sw_resp_tmpl_get(struct sta_info *sta,
+ u32 *ch_sw_tm_ie_offset)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct sk_buff *skb;
+ u8 extra_ies[2 + sizeof(struct ieee80211_ch_switch_timing)];
+
+ /* initial timing are always zero in the template */
+ iee80211_tdls_add_ch_switch_timing(extra_ies, 0, 0);
+
+ skb = ieee80211_tdls_build_mgmt_packet_data(sdata, sta->sta.addr,
+ WLAN_TDLS_CHANNEL_SWITCH_RESPONSE,
+ 0, 0, !sta->sta.tdls_initiator,
+ extra_ies, sizeof(extra_ies), 0, NULL);
+ if (!skb)
+ return NULL;
+
+ skb = ieee80211_build_data_template(sdata, skb, 0);
+ if (IS_ERR(skb)) {
+ tdls_dbg(sdata,
+ "Failed building TDLS channel switch resp frame\n");
+ return NULL;
+ }
+
+ if (ch_sw_tm_ie_offset) {
+ const u8 *tm_ie = ieee80211_tdls_find_sw_timing_ie(skb);
+
+ if (!tm_ie) {
+ tdls_dbg(sdata,
+ "No switch timing IE in TDLS switch resp\n");
+ dev_kfree_skb_any(skb);
+ return NULL;
+ }
+
+ *ch_sw_tm_ie_offset = tm_ie - skb->data;
+ }
+
+ tdls_dbg(sdata, "TDLS get channel switch response template for %pM\n",
+ sta->sta.addr);
+ return skb;
+}
+
+static int
+ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee802_11_elems elems;
+ struct sta_info *sta;
+ struct ieee80211_tdls_data *tf = (void *)skb->data;
+ bool local_initiator;
+ struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+ int baselen = offsetof(typeof(*tf), u.chan_switch_resp.variable);
+ struct ieee80211_tdls_ch_sw_params params = {};
+ int ret;
+
+ params.action_code = WLAN_TDLS_CHANNEL_SWITCH_RESPONSE;
+ params.timestamp = rx_status->device_timestamp;
+
+ if (skb->len < baselen) {
+ tdls_dbg(sdata, "TDLS channel switch resp too short: %d\n",
+ skb->len);
+ return -EINVAL;
+ }
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get(sdata, tf->sa);
+ if (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) {
+ tdls_dbg(sdata, "TDLS chan switch from non-peer sta %pM\n",
+ tf->sa);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ params.sta = &sta->sta;
+ params.status = le16_to_cpu(tf->u.chan_switch_resp.status_code);
+ if (params.status != 0) {
+ ret = 0;
+ goto call_drv;
+ }
+
+ ieee802_11_parse_elems(tf->u.chan_switch_resp.variable,
+ skb->len - baselen, false, &elems);
+ if (elems.parse_error) {
+ tdls_dbg(sdata, "Invalid IEs in TDLS channel switch resp\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!elems.ch_sw_timing || !elems.lnk_id) {
+ tdls_dbg(sdata, "TDLS channel switch resp - missing IEs\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* validate the initiator is set correctly */
+ local_initiator =
+ !memcmp(elems.lnk_id->init_sta, sdata->vif.addr, ETH_ALEN);
+ if (local_initiator == sta->sta.tdls_initiator) {
+ tdls_dbg(sdata, "TDLS chan switch invalid lnk-id initiator\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time);
+ params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout);
+
+ params.tmpl_skb =
+ ieee80211_tdls_ch_sw_resp_tmpl_get(sta, &params.ch_sw_tm_ie);
+ if (!params.tmpl_skb) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+call_drv:
+ drv_tdls_recv_channel_switch(sdata->local, sdata, &params);
+
+ tdls_dbg(sdata,
+ "TDLS channel switch response received from %pM status %d\n",
+ tf->sa, params.status);
+
+out:
+ mutex_unlock(&local->sta_mtx);
+ dev_kfree_skb_any(params.tmpl_skb);
+ return ret;
+}
+
+static int
+ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee802_11_elems elems;
+ struct cfg80211_chan_def chandef;
+ struct ieee80211_channel *chan;
+ enum nl80211_channel_type chan_type;
+ int freq;
+ u8 target_channel, oper_class;
+ bool local_initiator;
+ struct sta_info *sta;
+ enum ieee80211_band band;
+ struct ieee80211_tdls_data *tf = (void *)skb->data;
+ struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+ int baselen = offsetof(typeof(*tf), u.chan_switch_req.variable);
+ struct ieee80211_tdls_ch_sw_params params = {};
+ int ret = 0;
+
+ params.action_code = WLAN_TDLS_CHANNEL_SWITCH_REQUEST;
+ params.timestamp = rx_status->device_timestamp;
+
+ if (skb->len < baselen) {
+ tdls_dbg(sdata, "TDLS channel switch req too short: %d\n",
+ skb->len);
+ return -EINVAL;
+ }
+
+ target_channel = tf->u.chan_switch_req.target_channel;
+ oper_class = tf->u.chan_switch_req.oper_class;
+
+ /*
+ * We can't easily infer the channel band. The operating class is
+ * ambiguous - there are multiple tables (US/Europe/JP/Global). The
+ * solution here is to treat channels with number >14 as 5GHz ones,
+ * and specifically check for the (oper_class, channel) combinations
+ * where this doesn't hold. These are thankfully unique according to
+ * IEEE802.11-2012.
+ * We consider only the 2GHz and 5GHz bands and 20MHz+ channels as
+ * valid here.
+ */
+ if ((oper_class == 112 || oper_class == 2 || oper_class == 3 ||
+ oper_class == 4 || oper_class == 5 || oper_class == 6) &&
+ target_channel < 14)
+ band = IEEE80211_BAND_5GHZ;
+ else
+ band = target_channel < 14 ? IEEE80211_BAND_2GHZ :
+ IEEE80211_BAND_5GHZ;
+
+ freq = ieee80211_channel_to_frequency(target_channel, band);
+ if (freq == 0) {
+ tdls_dbg(sdata, "Invalid channel in TDLS chan switch: %d\n",
+ target_channel);
+ return -EINVAL;
+ }
+
+ chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq);
+ if (!chan) {
+ tdls_dbg(sdata,
+ "Unsupported channel for TDLS chan switch: %d\n",
+ target_channel);
+ return -EINVAL;
+ }
+
+ ieee802_11_parse_elems(tf->u.chan_switch_req.variable,
+ skb->len - baselen, false, &elems);
+ if (elems.parse_error) {
+ tdls_dbg(sdata, "Invalid IEs in TDLS channel switch req\n");
+ return -EINVAL;
+ }
+
+ if (!elems.ch_sw_timing || !elems.lnk_id) {
+ tdls_dbg(sdata, "TDLS channel switch req - missing IEs\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&local->sta_mtx);
+ sta = sta_info_get(sdata, tf->sa);
+ if (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) {
+ tdls_dbg(sdata, "TDLS chan switch from non-peer sta %pM\n",
+ tf->sa);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ params.sta = &sta->sta;
+
+ /* validate the initiator is set correctly */
+ local_initiator =
+ !memcmp(elems.lnk_id->init_sta, sdata->vif.addr, ETH_ALEN);
+ if (local_initiator == sta->sta.tdls_initiator) {
+ tdls_dbg(sdata, "TDLS chan switch invalid lnk-id initiator\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!sta->sta.ht_cap.ht_supported) {
+ chan_type = NL80211_CHAN_NO_HT;
+ } else if (!elems.sec_chan_offs) {
+ chan_type = NL80211_CHAN_HT20;
+ } else {
+ switch (elems.sec_chan_offs->sec_chan_offs) {
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ chan_type = NL80211_CHAN_HT40PLUS;
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ chan_type = NL80211_CHAN_HT40MINUS;
+ break;
+ default:
+ chan_type = NL80211_CHAN_HT20;
+ break;
+ }
+ }
+
+ cfg80211_chandef_create(&chandef, chan, chan_type);
+ params.chandef = &chandef;
+
+ params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time);
+ params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout);
+
+ params.tmpl_skb =
+ ieee80211_tdls_ch_sw_resp_tmpl_get(sta,
+ &params.ch_sw_tm_ie);
+ if (!params.tmpl_skb) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ drv_tdls_recv_channel_switch(sdata->local, sdata, &params);
+
+ tdls_dbg(sdata,
+ "TDLS ch switch request received from %pM ch %d width %d\n",
+ tf->sa, params.chandef->chan->center_freq,
+ params.chandef->width);
+out:
+ mutex_unlock(&local->sta_mtx);
+ dev_kfree_skb_any(params.tmpl_skb);
+ return ret;
+}
+
+void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_tdls_data *tf = (void *)skb->data;
+ struct wiphy *wiphy = sdata->local->hw.wiphy;
+
+ /* make sure the driver supports it */
+ if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
+ return;
+
+ /* we want to access the entire packet */
+ if (skb_linearize(skb))
+ return;
+ /*
+ * The packet/size was already validated by mac80211 Rx path, only look
+ * at the action type.
+ */
+ switch (tf->action_code) {
+ case WLAN_TDLS_CHANNEL_SWITCH_REQUEST:
+ ieee80211_process_tdls_channel_switch_req(sdata, skb);
+ break;
+ case WLAN_TDLS_CHANNEL_SWITCH_RESPONSE:
+ ieee80211_process_tdls_channel_switch_resp(sdata, skb);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+}
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
new file mode 100644
index 000000000..0ae207771
--- /dev/null
+++ b/net/mac80211/tkip.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/export.h>
+#include <asm/unaligned.h>
+
+#include <net/mac80211.h>
+#include "driver-ops.h"
+#include "key.h"
+#include "tkip.h"
+#include "wep.h"
+
+#define PHASE1_LOOP_COUNT 8
+
+/*
+ * 2-byte by 2-byte subset of the full AES S-box table; second part of this
+ * table is identical to first part but byte-swapped
+ */
+static const u16 tkip_sbox[256] =
+{
+ 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+ 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+ 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+ 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+ 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+ 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+ 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+ 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+ 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+ 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+ 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+ 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+ 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+ 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+ 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+ 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+ 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+ 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+ 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+ 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+ 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+ 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+ 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+ 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+ 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+ 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+ 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+ 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+ 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+ 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+ 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+ 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+static u16 tkipS(u16 val)
+{
+ return tkip_sbox[val & 0xff] ^ swab16(tkip_sbox[val >> 8]);
+}
+
+static u8 *write_tkip_iv(u8 *pos, u16 iv16)
+{
+ *pos++ = iv16 >> 8;
+ *pos++ = ((iv16 >> 8) | 0x20) & 0x7f;
+ *pos++ = iv16 & 0xFF;
+ return pos;
+}
+
+/*
+ * P1K := Phase1(TA, TK, TSC)
+ * TA = transmitter address (48 bits)
+ * TK = dot11DefaultKeyValue or dot11KeyMappingValue (128 bits)
+ * TSC = TKIP sequence counter (48 bits, only 32 msb bits used)
+ * P1K: 80 bits
+ */
+static void tkip_mixing_phase1(const u8 *tk, struct tkip_ctx *ctx,
+ const u8 *ta, u32 tsc_IV32)
+{
+ int i, j;
+ u16 *p1k = ctx->p1k;
+
+ p1k[0] = tsc_IV32 & 0xFFFF;
+ p1k[1] = tsc_IV32 >> 16;
+ p1k[2] = get_unaligned_le16(ta + 0);
+ p1k[3] = get_unaligned_le16(ta + 2);
+ p1k[4] = get_unaligned_le16(ta + 4);
+
+ for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+ j = 2 * (i & 1);
+ p1k[0] += tkipS(p1k[4] ^ get_unaligned_le16(tk + 0 + j));
+ p1k[1] += tkipS(p1k[0] ^ get_unaligned_le16(tk + 4 + j));
+ p1k[2] += tkipS(p1k[1] ^ get_unaligned_le16(tk + 8 + j));
+ p1k[3] += tkipS(p1k[2] ^ get_unaligned_le16(tk + 12 + j));
+ p1k[4] += tkipS(p1k[3] ^ get_unaligned_le16(tk + 0 + j)) + i;
+ }
+ ctx->state = TKIP_STATE_PHASE1_DONE;
+ ctx->p1k_iv32 = tsc_IV32;
+}
+
+static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx,
+ u16 tsc_IV16, u8 *rc4key)
+{
+ u16 ppk[6];
+ const u16 *p1k = ctx->p1k;
+ int i;
+
+ ppk[0] = p1k[0];
+ ppk[1] = p1k[1];
+ ppk[2] = p1k[2];
+ ppk[3] = p1k[3];
+ ppk[4] = p1k[4];
+ ppk[5] = p1k[4] + tsc_IV16;
+
+ ppk[0] += tkipS(ppk[5] ^ get_unaligned_le16(tk + 0));
+ ppk[1] += tkipS(ppk[0] ^ get_unaligned_le16(tk + 2));
+ ppk[2] += tkipS(ppk[1] ^ get_unaligned_le16(tk + 4));
+ ppk[3] += tkipS(ppk[2] ^ get_unaligned_le16(tk + 6));
+ ppk[4] += tkipS(ppk[3] ^ get_unaligned_le16(tk + 8));
+ ppk[5] += tkipS(ppk[4] ^ get_unaligned_le16(tk + 10));
+ ppk[0] += ror16(ppk[5] ^ get_unaligned_le16(tk + 12), 1);
+ ppk[1] += ror16(ppk[0] ^ get_unaligned_le16(tk + 14), 1);
+ ppk[2] += ror16(ppk[1], 1);
+ ppk[3] += ror16(ppk[2], 1);
+ ppk[4] += ror16(ppk[3], 1);
+ ppk[5] += ror16(ppk[4], 1);
+
+ rc4key = write_tkip_iv(rc4key, tsc_IV16);
+ *rc4key++ = ((ppk[5] ^ get_unaligned_le16(tk)) >> 1) & 0xFF;
+
+ for (i = 0; i < 6; i++)
+ put_unaligned_le16(ppk[i], rc4key + 2 * i);
+}
+
+/* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets
+ * of the IV. Returns pointer to the octet following IVs (i.e., beginning of
+ * the packet payload). */
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key)
+{
+ lockdep_assert_held(&key->u.tkip.txlock);
+
+ pos = write_tkip_iv(pos, key->u.tkip.tx.iv16);
+ *pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */;
+ put_unaligned_le32(key->u.tkip.tx.iv32, pos);
+ return pos + 4;
+}
+
+static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32)
+{
+ struct ieee80211_sub_if_data *sdata = key->sdata;
+ struct tkip_ctx *ctx = &key->u.tkip.tx;
+ const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+
+ lockdep_assert_held(&key->u.tkip.txlock);
+
+ /*
+ * Update the P1K when the IV32 is different from the value it
+ * had when we last computed it (or when not initialised yet).
+ * This might flip-flop back and forth if packets are processed
+ * out-of-order due to the different ACs, but then we have to
+ * just compute the P1K more often.
+ */
+ if (ctx->p1k_iv32 != iv32 || ctx->state == TKIP_STATE_NOT_INIT)
+ tkip_mixing_phase1(tk, ctx, sdata->vif.addr, iv32);
+}
+
+void ieee80211_get_tkip_p1k_iv(struct ieee80211_key_conf *keyconf,
+ u32 iv32, u16 *p1k)
+{
+ struct ieee80211_key *key = (struct ieee80211_key *)
+ container_of(keyconf, struct ieee80211_key, conf);
+ struct tkip_ctx *ctx = &key->u.tkip.tx;
+
+ spin_lock_bh(&key->u.tkip.txlock);
+ ieee80211_compute_tkip_p1k(key, iv32);
+ memcpy(p1k, ctx->p1k, sizeof(ctx->p1k));
+ spin_unlock_bh(&key->u.tkip.txlock);
+}
+EXPORT_SYMBOL(ieee80211_get_tkip_p1k_iv);
+
+void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf,
+ const u8 *ta, u32 iv32, u16 *p1k)
+{
+ const u8 *tk = &keyconf->key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+ struct tkip_ctx ctx;
+
+ tkip_mixing_phase1(tk, &ctx, ta, iv32);
+ memcpy(p1k, ctx.p1k, sizeof(ctx.p1k));
+}
+EXPORT_SYMBOL(ieee80211_get_tkip_rx_p1k);
+
+void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
+ struct sk_buff *skb, u8 *p2k)
+{
+ struct ieee80211_key *key = (struct ieee80211_key *)
+ container_of(keyconf, struct ieee80211_key, conf);
+ const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+ struct tkip_ctx *ctx = &key->u.tkip.tx;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ const u8 *data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control);
+ u32 iv32 = get_unaligned_le32(&data[4]);
+ u16 iv16 = data[2] | (data[0] << 8);
+
+ spin_lock(&key->u.tkip.txlock);
+ ieee80211_compute_tkip_p1k(key, iv32);
+ tkip_mixing_phase2(tk, ctx, iv16, p2k);
+ spin_unlock(&key->u.tkip.txlock);
+}
+EXPORT_SYMBOL(ieee80211_get_tkip_p2k);
+
+/*
+ * Encrypt packet payload with TKIP using @key. @pos is a pointer to the
+ * beginning of the buffer containing payload. This payload must include
+ * the IV/Ext.IV and space for (taildroom) four octets for ICV.
+ * @payload_len is the length of payload (_not_ including IV/ICV length).
+ * @ta is the transmitter addresses.
+ */
+int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
+ struct ieee80211_key *key,
+ struct sk_buff *skb,
+ u8 *payload, size_t payload_len)
+{
+ u8 rc4key[16];
+
+ ieee80211_get_tkip_p2k(&key->conf, skb, rc4key);
+
+ return ieee80211_wep_encrypt_data(tfm, rc4key, 16,
+ payload, payload_len);
+}
+
+/* Decrypt packet payload with TKIP using @key. @pos is a pointer to the
+ * beginning of the buffer containing IEEE 802.11 header payload, i.e.,
+ * including IV, Ext. IV, real data, Michael MIC, ICV. @payload_len is the
+ * length of payload, including IV, Ext. IV, MIC, ICV. */
+int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
+ struct ieee80211_key *key,
+ u8 *payload, size_t payload_len, u8 *ta,
+ u8 *ra, int only_iv, int queue,
+ u32 *out_iv32, u16 *out_iv16)
+{
+ u32 iv32;
+ u32 iv16;
+ u8 rc4key[16], keyid, *pos = payload;
+ int res;
+ const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+
+ if (payload_len < 12)
+ return -1;
+
+ iv16 = (pos[0] << 8) | pos[2];
+ keyid = pos[3];
+ iv32 = get_unaligned_le32(pos + 4);
+ pos += 8;
+
+ if (!(keyid & (1 << 5)))
+ return TKIP_DECRYPT_NO_EXT_IV;
+
+ if ((keyid >> 6) != key->conf.keyidx)
+ return TKIP_DECRYPT_INVALID_KEYIDX;
+
+ if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT &&
+ (iv32 < key->u.tkip.rx[queue].iv32 ||
+ (iv32 == key->u.tkip.rx[queue].iv32 &&
+ iv16 <= key->u.tkip.rx[queue].iv16)))
+ return TKIP_DECRYPT_REPLAY;
+
+ if (only_iv) {
+ res = TKIP_DECRYPT_OK;
+ key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+ goto done;
+ }
+
+ if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT ||
+ key->u.tkip.rx[queue].iv32 != iv32) {
+ /* IV16 wrapped around - perform TKIP phase 1 */
+ tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32);
+ }
+ if (key->local->ops->update_tkip_key &&
+ key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
+ key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) {
+ struct ieee80211_sub_if_data *sdata = key->sdata;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(key->sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ drv_update_tkip_key(key->local, sdata, &key->conf, key->sta,
+ iv32, key->u.tkip.rx[queue].p1k);
+ key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+ }
+
+ tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key);
+
+ res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12);
+ done:
+ if (res == TKIP_DECRYPT_OK) {
+ /*
+ * Record previously received IV, will be copied into the
+ * key information after MIC verification. It is possible
+ * that we don't catch replays of fragments but that's ok
+ * because the Michael MIC verication will then fail.
+ */
+ *out_iv32 = iv32;
+ *out_iv16 = iv16;
+ }
+
+ return res;
+}
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
new file mode 100644
index 000000000..e3ecb659b
--- /dev/null
+++ b/net/mac80211/tkip.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef TKIP_H
+#define TKIP_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include "key.h"
+
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key);
+
+int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
+ struct ieee80211_key *key,
+ struct sk_buff *skb,
+ u8 *payload, size_t payload_len);
+
+enum {
+ TKIP_DECRYPT_OK = 0,
+ TKIP_DECRYPT_NO_EXT_IV = -1,
+ TKIP_DECRYPT_INVALID_KEYIDX = -2,
+ TKIP_DECRYPT_REPLAY = -3,
+};
+int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
+ struct ieee80211_key *key,
+ u8 *payload, size_t payload_len, u8 *ta,
+ u8 *ra, int only_iv, int queue,
+ u32 *out_iv32, u16 *out_iv16);
+
+#endif /* TKIP_H */
diff --git a/net/mac80211/trace.c b/net/mac80211/trace.c
new file mode 100644
index 000000000..edfe0c170
--- /dev/null
+++ b/net/mac80211/trace.c
@@ -0,0 +1,76 @@
+/* bug in tracepoint.h, it should include this */
+#include <linux/module.h>
+
+/* sparse isn't too happy with all macros... */
+#ifndef __CHECKER__
+#include <net/cfg80211.h>
+#include "driver-ops.h"
+#include "debug.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include "trace_msg.h"
+
+#ifdef CONFIG_MAC80211_MESSAGE_TRACING
+void __sdata_info(const char *fmt, ...)
+{
+ struct va_format vaf = {
+ .fmt = fmt,
+ };
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ pr_info("%pV", &vaf);
+ trace_mac80211_info(&vaf);
+ va_end(args);
+}
+
+void __sdata_dbg(bool print, const char *fmt, ...)
+{
+ struct va_format vaf = {
+ .fmt = fmt,
+ };
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ if (print)
+ pr_debug("%pV", &vaf);
+ trace_mac80211_dbg(&vaf);
+ va_end(args);
+}
+
+void __sdata_err(const char *fmt, ...)
+{
+ struct va_format vaf = {
+ .fmt = fmt,
+ };
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ pr_err("%pV", &vaf);
+ trace_mac80211_err(&vaf);
+ va_end(args);
+}
+
+void __wiphy_dbg(struct wiphy *wiphy, bool print, const char *fmt, ...)
+{
+ struct va_format vaf = {
+ .fmt = fmt,
+ };
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.va = &args;
+
+ if (print)
+ wiphy_dbg(wiphy, "%pV", &vaf);
+ trace_mac80211_dbg(&vaf);
+ va_end(args);
+}
+#endif
+#endif
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
new file mode 100644
index 000000000..4c2e76902
--- /dev/null
+++ b/net/mac80211/trace.h
@@ -0,0 +1,2352 @@
+#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __MAC80211_DRIVER_TRACE
+
+#include <linux/tracepoint.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mac80211
+
+#define MAXNAME 32
+#define LOCAL_ENTRY __array(char, wiphy_name, 32)
+#define LOCAL_ASSIGN strlcpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME)
+#define LOCAL_PR_FMT "%s"
+#define LOCAL_PR_ARG __entry->wiphy_name
+
+#define STA_ENTRY __array(char, sta_addr, ETH_ALEN)
+#define STA_ASSIGN (sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : memset(__entry->sta_addr, 0, ETH_ALEN))
+#define STA_NAMED_ASSIGN(s) memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN)
+#define STA_PR_FMT " sta:%pM"
+#define STA_PR_ARG __entry->sta_addr
+
+#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \
+ __field(bool, p2p) \
+ __string(vif_name, sdata->name)
+#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \
+ __entry->p2p = sdata->vif.p2p; \
+ __assign_str(vif_name, sdata->name)
+#define VIF_PR_FMT " vif:%s(%d%s)"
+#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
+
+#define CHANDEF_ENTRY __field(u32, control_freq) \
+ __field(u32, chan_width) \
+ __field(u32, center_freq1) \
+ __field(u32, center_freq2)
+#define CHANDEF_ASSIGN(c) \
+ __entry->control_freq = (c)->chan ? (c)->chan->center_freq : 0; \
+ __entry->chan_width = (c)->width; \
+ __entry->center_freq1 = (c)->center_freq1; \
+ __entry->center_freq2 = (c)->center_freq2;
+#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz"
+#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \
+ __entry->center_freq1, __entry->center_freq2
+
+#define MIN_CHANDEF_ENTRY \
+ __field(u32, min_control_freq) \
+ __field(u32, min_chan_width) \
+ __field(u32, min_center_freq1) \
+ __field(u32, min_center_freq2)
+
+#define MIN_CHANDEF_ASSIGN(c) \
+ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \
+ __entry->min_chan_width = (c)->width; \
+ __entry->min_center_freq1 = (c)->center_freq1; \
+ __entry->min_center_freq2 = (c)->center_freq2;
+#define MIN_CHANDEF_PR_FMT " min_control:%d MHz min_width:%d min_center: %d/%d MHz"
+#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_chan_width, \
+ __entry->min_center_freq1, __entry->min_center_freq2
+
+#define CHANCTX_ENTRY CHANDEF_ENTRY \
+ MIN_CHANDEF_ENTRY \
+ __field(u8, rx_chains_static) \
+ __field(u8, rx_chains_dynamic)
+#define CHANCTX_ASSIGN CHANDEF_ASSIGN(&ctx->conf.def) \
+ MIN_CHANDEF_ASSIGN(&ctx->conf.min_def) \
+ __entry->rx_chains_static = ctx->conf.rx_chains_static; \
+ __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic
+#define CHANCTX_PR_FMT CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT " chains:%d/%d"
+#define CHANCTX_PR_ARG CHANDEF_PR_ARG, MIN_CHANDEF_PR_ARG, \
+ __entry->rx_chains_static, __entry->rx_chains_dynamic
+
+
+
+/*
+ * Tracing for driver callbacks.
+ */
+
+DECLARE_EVENT_CLASS(local_only_evt,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+ TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(local_sdata_addr_evt,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __array(char, addr, ETH_ALEN)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ memcpy(__entry->addr, sdata->vif.addr, ETH_ALEN);
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " addr:%pM",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr
+ )
+);
+
+DECLARE_EVENT_CLASS(local_u32_evt,
+ TP_PROTO(struct ieee80211_local *local, u32 value),
+ TP_ARGS(local, value),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, value)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->value = value;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " value:%d",
+ LOCAL_PR_ARG, __entry->value
+ )
+);
+
+DECLARE_EVENT_CLASS(local_sdata_evt,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_only_evt, drv_return_void,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_return_int,
+ TP_PROTO(struct ieee80211_local *local, int ret),
+ TP_ARGS(local, ret),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(drv_return_bool,
+ TP_PROTO(struct ieee80211_local *local, bool ret),
+ TP_ARGS(local, ret),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, ret)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ?
+ "true" : "false")
+);
+
+TRACE_EVENT(drv_return_u32,
+ TP_PROTO(struct ieee80211_local *local, u32 ret),
+ TP_ARGS(local, ret),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, ret)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(drv_return_u64,
+ TP_PROTO(struct ieee80211_local *local, u64 ret),
+ TP_ARGS(local, ret),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u64, ret)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret)
+);
+
+DEFINE_EVENT(local_only_evt, drv_start,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_u32_evt, drv_get_et_strings,
+ TP_PROTO(struct ieee80211_local *local, u32 sset),
+ TP_ARGS(local, sset)
+);
+
+DEFINE_EVENT(local_u32_evt, drv_get_et_sset_count,
+ TP_PROTO(struct ieee80211_local *local, u32 sset),
+ TP_ARGS(local, sset)
+);
+
+DEFINE_EVENT(local_only_evt, drv_get_et_stats,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_suspend,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_resume,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_set_wakeup,
+ TP_PROTO(struct ieee80211_local *local, bool enabled),
+ TP_ARGS(local, enabled),
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, enabled)
+ ),
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(LOCAL_PR_FMT " enabled:%d", LOCAL_PR_ARG, __entry->enabled)
+);
+
+DEFINE_EVENT(local_only_evt, drv_stop,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_change_interface,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p),
+
+ TP_ARGS(local, sdata, type, p2p),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, new_type)
+ __field(bool, new_p2p)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->new_type = type;
+ __entry->new_p2p = p2p;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
+ __entry->new_p2p ? "/p2p" : ""
+ )
+);
+
+DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_config,
+ TP_PROTO(struct ieee80211_local *local,
+ u32 changed),
+
+ TP_ARGS(local, changed),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, changed)
+ __field(u32, flags)
+ __field(int, power_level)
+ __field(int, dynamic_ps_timeout)
+ __field(int, max_sleep_period)
+ __field(u16, listen_interval)
+ __field(u8, long_frame_max_tx_count)
+ __field(u8, short_frame_max_tx_count)
+ CHANDEF_ENTRY
+ __field(int, smps)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->changed = changed;
+ __entry->flags = local->hw.conf.flags;
+ __entry->power_level = local->hw.conf.power_level;
+ __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
+ __entry->max_sleep_period = local->hw.conf.max_sleep_period;
+ __entry->listen_interval = local->hw.conf.listen_interval;
+ __entry->long_frame_max_tx_count =
+ local->hw.conf.long_frame_max_tx_count;
+ __entry->short_frame_max_tx_count =
+ local->hw.conf.short_frame_max_tx_count;
+ CHANDEF_ASSIGN(&local->hw.conf.chandef)
+ __entry->smps = local->hw.conf.smps_mode;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT,
+ LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_bss_info_changed,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss_conf *info,
+ u32 changed),
+
+ TP_ARGS(local, sdata, info, changed),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, changed)
+ __field(bool, assoc)
+ __field(bool, ibss_joined)
+ __field(bool, ibss_creator)
+ __field(u16, aid)
+ __field(bool, cts)
+ __field(bool, shortpre)
+ __field(bool, shortslot)
+ __field(bool, enable_beacon)
+ __field(u8, dtimper)
+ __field(u16, bcnint)
+ __field(u16, assoc_cap)
+ __field(u64, sync_tsf)
+ __field(u32, sync_device_ts)
+ __field(u8, sync_dtim_count)
+ __field(u32, basic_rates)
+ __array(int, mcast_rate, IEEE80211_NUM_BANDS)
+ __field(u16, ht_operation_mode)
+ __field(s32, cqm_rssi_thold);
+ __field(s32, cqm_rssi_hyst);
+ __field(u32, channel_width);
+ __field(u32, channel_cfreq1);
+ __dynamic_array(u32, arp_addr_list,
+ info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+ IEEE80211_BSS_ARP_ADDR_LIST_LEN :
+ info->arp_addr_cnt);
+ __field(int, arp_addr_cnt);
+ __field(bool, qos);
+ __field(bool, idle);
+ __field(bool, ps);
+ __dynamic_array(u8, ssid, info->ssid_len);
+ __field(bool, hidden_ssid);
+ __field(int, txpower)
+ __field(u8, p2p_oppps_ctwindow)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->changed = changed;
+ __entry->aid = info->aid;
+ __entry->assoc = info->assoc;
+ __entry->ibss_joined = info->ibss_joined;
+ __entry->ibss_creator = info->ibss_creator;
+ __entry->shortpre = info->use_short_preamble;
+ __entry->cts = info->use_cts_prot;
+ __entry->shortslot = info->use_short_slot;
+ __entry->enable_beacon = info->enable_beacon;
+ __entry->dtimper = info->dtim_period;
+ __entry->bcnint = info->beacon_int;
+ __entry->assoc_cap = info->assoc_capability;
+ __entry->sync_tsf = info->sync_tsf;
+ __entry->sync_device_ts = info->sync_device_ts;
+ __entry->sync_dtim_count = info->sync_dtim_count;
+ __entry->basic_rates = info->basic_rates;
+ memcpy(__entry->mcast_rate, info->mcast_rate,
+ sizeof(__entry->mcast_rate));
+ __entry->ht_operation_mode = info->ht_operation_mode;
+ __entry->cqm_rssi_thold = info->cqm_rssi_thold;
+ __entry->cqm_rssi_hyst = info->cqm_rssi_hyst;
+ __entry->channel_width = info->chandef.width;
+ __entry->channel_cfreq1 = info->chandef.center_freq1;
+ __entry->arp_addr_cnt = info->arp_addr_cnt;
+ memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list,
+ sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
+ IEEE80211_BSS_ARP_ADDR_LIST_LEN :
+ info->arp_addr_cnt));
+ __entry->qos = info->qos;
+ __entry->idle = info->idle;
+ __entry->ps = info->ps;
+ memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+ __entry->hidden_ssid = info->hidden_ssid;
+ __entry->txpower = info->txpower;
+ __entry->p2p_oppps_ctwindow = info->p2p_noa_attr.oppps_ctwindow;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " changed:%#x",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
+ )
+);
+
+TRACE_EVENT(drv_prepare_multicast,
+ TP_PROTO(struct ieee80211_local *local, int mc_count),
+
+ TP_ARGS(local, mc_count),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, mc_count)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->mc_count = mc_count;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " prepare mc (%d)",
+ LOCAL_PR_ARG, __entry->mc_count
+ )
+);
+
+TRACE_EVENT(drv_configure_filter,
+ TP_PROTO(struct ieee80211_local *local,
+ unsigned int changed_flags,
+ unsigned int *total_flags,
+ u64 multicast),
+
+ TP_ARGS(local, changed_flags, total_flags, multicast),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(unsigned int, changed)
+ __field(unsigned int, total)
+ __field(u64, multicast)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->changed = changed_flags;
+ __entry->total = *total_flags;
+ __entry->multicast = multicast;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " changed:%#x total:%#x",
+ LOCAL_PR_ARG, __entry->changed, __entry->total
+ )
+);
+
+TRACE_EVENT(drv_set_tim,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta, bool set),
+
+ TP_ARGS(local, sta, set),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(bool, set)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ __entry->set = set;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT " set:%d",
+ LOCAL_PR_ARG, STA_PR_ARG, __entry->set
+ )
+);
+
+TRACE_EVENT(drv_set_key,
+ TP_PROTO(struct ieee80211_local *local,
+ enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct ieee80211_key_conf *key),
+
+ TP_ARGS(local, cmd, sdata, sta, key),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, cipher)
+ __field(u8, hw_key_idx)
+ __field(u8, flags)
+ __field(s8, keyidx)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->cipher = key->cipher;
+ __entry->flags = key->flags;
+ __entry->keyidx = key->keyidx;
+ __entry->hw_key_idx = key->hw_key_idx;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_update_tkip_key,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_key_conf *conf,
+ struct ieee80211_sta *sta, u32 iv32),
+
+ TP_ARGS(local, sdata, conf, sta, iv32),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, iv32)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->iv32 = iv32;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->iv32
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_hw_scan,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_cancel_hw_scan,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_sw_scan_start,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *mac_addr),
+
+ TP_ARGS(local, sdata, mac_addr),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __array(char, mac_addr, ETH_ALEN)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ memcpy(__entry->mac_addr, mac_addr, ETH_ALEN);
+ ),
+
+ TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT ", addr:%pM",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->mac_addr)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_sw_scan_complete,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_get_stats,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_low_level_stats *stats,
+ int ret),
+
+ TP_ARGS(local, stats, ret),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, ret)
+ __field(unsigned int, ackfail)
+ __field(unsigned int, rtsfail)
+ __field(unsigned int, fcserr)
+ __field(unsigned int, rtssucc)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->ret = ret;
+ __entry->ackfail = stats->dot11ACKFailureCount;
+ __entry->rtsfail = stats->dot11RTSFailureCount;
+ __entry->fcserr = stats->dot11FCSErrorCount;
+ __entry->rtssucc = stats->dot11RTSSuccessCount;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " ret:%d",
+ LOCAL_PR_ARG, __entry->ret
+ )
+);
+
+TRACE_EVENT(drv_get_tkip_seq,
+ TP_PROTO(struct ieee80211_local *local,
+ u8 hw_key_idx, u32 *iv32, u16 *iv16),
+
+ TP_ARGS(local, hw_key_idx, iv32, iv16),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u8, hw_key_idx)
+ __field(u32, iv32)
+ __field(u16, iv16)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->hw_key_idx = hw_key_idx;
+ __entry->iv32 = *iv32;
+ __entry->iv16 = *iv16;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT, LOCAL_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold,
+ TP_PROTO(struct ieee80211_local *local, u32 value),
+ TP_ARGS(local, value)
+);
+
+DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold,
+ TP_PROTO(struct ieee80211_local *local, u32 value),
+ TP_ARGS(local, value)
+);
+
+TRACE_EVENT(drv_set_coverage_class,
+ TP_PROTO(struct ieee80211_local *local, s16 value),
+
+ TP_ARGS(local, value),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(s16, value)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->value = value;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " value:%d",
+ LOCAL_PR_ARG, __entry->value
+ )
+);
+
+TRACE_EVENT(drv_sta_notify,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum sta_notify_cmd cmd,
+ struct ieee80211_sta *sta),
+
+ TP_ARGS(local, sdata, cmd, sta),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, cmd)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->cmd = cmd;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " cmd:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd
+ )
+);
+
+TRACE_EVENT(drv_sta_state,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ enum ieee80211_sta_state old_state,
+ enum ieee80211_sta_state new_state),
+
+ TP_ARGS(local, sdata, sta, old_state, new_state),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, old_state)
+ __field(u32, new_state)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->old_state = old_state;
+ __entry->new_state = new_state;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " state: %d->%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
+ __entry->old_state, __entry->new_state
+ )
+);
+
+TRACE_EVENT(drv_sta_rc_update,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ u32 changed),
+
+ TP_ARGS(local, sdata, sta, changed),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, changed)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->changed = changed;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " changed: 0x%x",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed
+ )
+);
+
+DECLARE_EVENT_CLASS(sta_event,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+
+ TP_ARGS(local, sdata, sta),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+ )
+);
+
+DEFINE_EVENT(sta_event, drv_sta_statistics,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
+DEFINE_EVENT(sta_event, drv_sta_add,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
+DEFINE_EVENT(sta_event, drv_sta_remove,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
+DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
+DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+ TP_ARGS(local, sdata, sta)
+);
+
+TRACE_EVENT(drv_conf_tx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u16 ac, const struct ieee80211_tx_queue_params *params),
+
+ TP_ARGS(local, sdata, ac, params),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u16, ac)
+ __field(u16, txop)
+ __field(u16, cw_min)
+ __field(u16, cw_max)
+ __field(u8, aifs)
+ __field(bool, uapsd)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->ac = ac;
+ __entry->txop = params->txop;
+ __entry->cw_max = params->cw_max;
+ __entry->cw_min = params->cw_min;
+ __entry->aifs = params->aifs;
+ __entry->uapsd = params->uapsd;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " AC:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->ac
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_get_tsf,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_set_tsf,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ u64 tsf),
+
+ TP_ARGS(local, sdata, tsf),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u64, tsf)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->tsf = tsf;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " tsf:%llu",
+ LOCAL_PR_ARG, VIF_PR_ARG, (unsigned long long)__entry->tsf
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_reset_tsf,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_ampdu_action,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum ieee80211_ampdu_mlme_action action,
+ struct ieee80211_sta *sta, u16 tid,
+ u16 *ssn, u8 buf_size),
+
+ TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(u32, action)
+ __field(u16, tid)
+ __field(u16, ssn)
+ __field(u8, buf_size)
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->action = action;
+ __entry->tid = tid;
+ __entry->ssn = ssn ? *ssn : 0;
+ __entry->buf_size = buf_size;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
+ __entry->tid, __entry->buf_size
+ )
+);
+
+TRACE_EVENT(drv_get_survey,
+ TP_PROTO(struct ieee80211_local *local, int idx,
+ struct survey_info *survey),
+
+ TP_ARGS(local, idx, survey),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, idx)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->idx = idx;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " idx:%d",
+ LOCAL_PR_ARG, __entry->idx
+ )
+);
+
+TRACE_EVENT(drv_flush,
+ TP_PROTO(struct ieee80211_local *local,
+ u32 queues, bool drop),
+
+ TP_ARGS(local, queues, drop),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, drop)
+ __field(u32, queues)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->drop = drop;
+ __entry->queues = queues;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " queues:0x%x drop:%d",
+ LOCAL_PR_ARG, __entry->queues, __entry->drop
+ )
+);
+
+TRACE_EVENT(drv_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel_switch *ch_switch),
+
+ TP_ARGS(local, sdata, ch_switch),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ CHANDEF_ENTRY
+ __field(u64, timestamp)
+ __field(u32, device_timestamp)
+ __field(bool, block_tx)
+ __field(u8, count)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ CHANDEF_ASSIGN(&ch_switch->chandef)
+ __entry->timestamp = ch_switch->timestamp;
+ __entry->device_timestamp = ch_switch->device_timestamp;
+ __entry->block_tx = ch_switch->block_tx;
+ __entry->count = ch_switch->count;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " new " CHANDEF_PR_FMT " count:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count
+ )
+);
+
+TRACE_EVENT(drv_set_antenna,
+ TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
+
+ TP_ARGS(local, tx_ant, rx_ant, ret),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, tx_ant)
+ __field(u32, rx_ant)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->tx_ant = tx_ant;
+ __entry->rx_ant = rx_ant;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
+ LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
+ )
+);
+
+TRACE_EVENT(drv_get_antenna,
+ TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
+
+ TP_ARGS(local, tx_ant, rx_ant, ret),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, tx_ant)
+ __field(u32, rx_ant)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->tx_ant = tx_ant;
+ __entry->rx_ant = rx_ant;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
+ LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
+ )
+);
+
+TRACE_EVENT(drv_remain_on_channel,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *chan,
+ unsigned int duration,
+ enum ieee80211_roc_type type),
+
+ TP_ARGS(local, sdata, chan, duration, type),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(int, center_freq)
+ __field(unsigned int, duration)
+ __field(u32, type)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->center_freq = chan->center_freq;
+ __entry->duration = duration;
+ __entry->type = type;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d",
+ LOCAL_PR_ARG, VIF_PR_ARG,
+ __entry->center_freq, __entry->duration, __entry->type
+ )
+);
+
+DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_set_ringparam,
+ TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx),
+
+ TP_ARGS(local, tx, rx),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, tx)
+ __field(u32, rx)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->tx = tx;
+ __entry->rx = rx;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " tx:%d rx %d",
+ LOCAL_PR_ARG, __entry->tx, __entry->rx
+ )
+);
+
+TRACE_EVENT(drv_get_ringparam,
+ TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max,
+ u32 *rx, u32 *rx_max),
+
+ TP_ARGS(local, tx, tx_max, rx, rx_max),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u32, tx)
+ __field(u32, tx_max)
+ __field(u32, rx)
+ __field(u32, rx_max)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->tx = *tx;
+ __entry->tx_max = *tx_max;
+ __entry->rx = *rx;
+ __entry->rx_max = *rx_max;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " tx:%d tx_max %d rx %d rx_max %d",
+ LOCAL_PR_ARG,
+ __entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max
+ )
+);
+
+DEFINE_EVENT(local_only_evt, drv_tx_frames_pending,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_set_bitrate_mask,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_bitrate_mask *mask),
+
+ TP_ARGS(local, sdata, mask),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, legacy_2g)
+ __field(u32, legacy_5g)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy;
+ __entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g
+ )
+);
+
+TRACE_EVENT(drv_set_rekey_data,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_gtk_rekey_data *data),
+
+ TP_ARGS(local, sdata, data),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __array(u8, kek, NL80211_KEK_LEN)
+ __array(u8, kck, NL80211_KCK_LEN)
+ __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ memcpy(__entry->kek, data->kek, NL80211_KEK_LEN);
+ memcpy(__entry->kck, data->kck, NL80211_KCK_LEN);
+ memcpy(__entry->replay_ctr, data->replay_ctr,
+ NL80211_REPLAY_CTR_LEN);
+ ),
+
+ TP_printk(LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG)
+);
+
+TRACE_EVENT(drv_event_callback,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const struct ieee80211_event *_event),
+
+ TP_ARGS(local, sdata, _event),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, type)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->type = _event->type;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " event:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
+ )
+);
+
+DECLARE_EVENT_CLASS(release_evt,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta,
+ u16 tids, int num_frames,
+ enum ieee80211_frame_release_type reason,
+ bool more_data),
+
+ TP_ARGS(local, sta, tids, num_frames, reason, more_data),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(u16, tids)
+ __field(int, num_frames)
+ __field(int, reason)
+ __field(bool, more_data)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ __entry->tids = tids;
+ __entry->num_frames = num_frames;
+ __entry->reason = reason;
+ __entry->more_data = more_data;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT
+ " TIDs:0x%.4x frames:%d reason:%d more:%d",
+ LOCAL_PR_ARG, STA_PR_ARG, __entry->tids, __entry->num_frames,
+ __entry->reason, __entry->more_data
+ )
+);
+
+DEFINE_EVENT(release_evt, drv_release_buffered_frames,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta,
+ u16 tids, int num_frames,
+ enum ieee80211_frame_release_type reason,
+ bool more_data),
+
+ TP_ARGS(local, sta, tids, num_frames, reason, more_data)
+);
+
+DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta,
+ u16 tids, int num_frames,
+ enum ieee80211_frame_release_type reason,
+ bool more_data),
+
+ TP_ARGS(local, sta, tids, num_frames, reason, more_data)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+
+ TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+
+ TP_ARGS(local, sdata)
+);
+
+DECLARE_EVENT_CLASS(local_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx),
+
+ TP_ARGS(local, ctx),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ CHANCTX_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ CHANCTX_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT CHANCTX_PR_FMT,
+ LOCAL_PR_ARG, CHANCTX_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_chanctx, drv_add_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx),
+ TP_ARGS(local, ctx)
+);
+
+DEFINE_EVENT(local_chanctx, drv_remove_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx),
+ TP_ARGS(local, ctx)
+);
+
+TRACE_EVENT(drv_change_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ u32 changed),
+
+ TP_ARGS(local, ctx, changed),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ CHANCTX_ENTRY
+ __field(u32, changed)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ CHANCTX_ASSIGN;
+ __entry->changed = changed;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT CHANCTX_PR_FMT " changed:%#x",
+ LOCAL_PR_ARG, CHANCTX_PR_ARG, __entry->changed
+ )
+);
+
+#if !defined(__TRACE_VIF_ENTRY)
+#define __TRACE_VIF_ENTRY
+struct trace_vif_entry {
+ enum nl80211_iftype vif_type;
+ bool p2p;
+ char vif_name[IFNAMSIZ];
+} __packed;
+
+struct trace_chandef_entry {
+ u32 control_freq;
+ u32 chan_width;
+ u32 center_freq1;
+ u32 center_freq2;
+} __packed;
+
+struct trace_switch_entry {
+ struct trace_vif_entry vif;
+ struct trace_chandef_entry old_chandef;
+ struct trace_chandef_entry new_chandef;
+} __packed;
+
+#define SWITCH_ENTRY_ASSIGN(to, from) local_vifs[i].to = vifs[i].from
+#endif
+
+TRACE_EVENT(drv_switch_vif_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_vif_chanctx_switch *vifs,
+ int n_vifs, enum ieee80211_chanctx_switch_mode mode),
+ TP_ARGS(local, vifs, n_vifs, mode),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(int, n_vifs)
+ __field(u32, mode)
+ __dynamic_array(u8, vifs,
+ sizeof(struct trace_switch_entry) * n_vifs)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->n_vifs = n_vifs;
+ __entry->mode = mode;
+ {
+ struct trace_switch_entry *local_vifs =
+ __get_dynamic_array(vifs);
+ int i;
+
+ for (i = 0; i < n_vifs; i++) {
+ struct ieee80211_sub_if_data *sdata;
+
+ sdata = container_of(vifs[i].vif,
+ struct ieee80211_sub_if_data,
+ vif);
+
+ SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type);
+ SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p);
+ strncpy(local_vifs[i].vif.vif_name,
+ sdata->name,
+ sizeof(local_vifs[i].vif.vif_name));
+ SWITCH_ENTRY_ASSIGN(old_chandef.control_freq,
+ old_ctx->def.chan->center_freq);
+ SWITCH_ENTRY_ASSIGN(old_chandef.chan_width,
+ old_ctx->def.width);
+ SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1,
+ old_ctx->def.center_freq1);
+ SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2,
+ old_ctx->def.center_freq2);
+ SWITCH_ENTRY_ASSIGN(new_chandef.control_freq,
+ new_ctx->def.chan->center_freq);
+ SWITCH_ENTRY_ASSIGN(new_chandef.chan_width,
+ new_ctx->def.width);
+ SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1,
+ new_ctx->def.center_freq1);
+ SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2,
+ new_ctx->def.center_freq2);
+ }
+ }
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " n_vifs:%d mode:%d",
+ LOCAL_PR_ARG, __entry->n_vifs, __entry->mode
+ )
+);
+
+DECLARE_EVENT_CLASS(local_sdata_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *ctx),
+
+ TP_ARGS(local, sdata, ctx),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ CHANCTX_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ CHANCTX_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT CHANCTX_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, CHANCTX_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *ctx),
+ TP_ARGS(local, sdata, ctx)
+);
+
+DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_chanctx *ctx),
+ TP_ARGS(local, sdata, ctx)
+);
+
+TRACE_EVENT(drv_start_ap,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss_conf *info),
+
+ TP_ARGS(local, sdata, info),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, dtimper)
+ __field(u16, bcnint)
+ __dynamic_array(u8, ssid, info->ssid_len);
+ __field(bool, hidden_ssid);
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->dtimper = info->dtim_period;
+ __entry->bcnint = info->beacon_int;
+ memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+ __entry->hidden_ssid = info->hidden_ssid;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_stop_ap,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_reconfig_complete,
+ TP_PROTO(struct ieee80211_local *local,
+ enum ieee80211_reconfig_type reconfig_type),
+ TP_ARGS(local, reconfig_type),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u8, reconfig_type)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->reconfig_type = reconfig_type;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " reconfig_type:%d",
+ LOCAL_PR_ARG, __entry->reconfig_type
+ )
+
+);
+
+#if IS_ENABLED(CONFIG_IPV6)
+DEFINE_EVENT(local_sdata_evt, drv_ipv6_addr_change,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+#endif
+
+TRACE_EVENT(drv_join_ibss,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss_conf *info),
+
+ TP_ARGS(local, sdata, info),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, dtimper)
+ __field(u16, bcnint)
+ __dynamic_array(u8, ssid, info->ssid_len);
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->dtimper = info->dtim_period;
+ __entry->bcnint = info->beacon_int;
+ memcpy(__get_dynamic_array(ssid), info->ssid, info->ssid_len);
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_leave_ibss,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_get_expected_throughput,
+ TP_PROTO(struct ieee80211_sta *sta),
+
+ TP_ARGS(sta),
+
+ TP_STRUCT__entry(
+ STA_ENTRY
+ ),
+
+ TP_fast_assign(
+ STA_ASSIGN;
+ ),
+
+ TP_printk(
+ STA_PR_FMT, STA_PR_ARG
+ )
+);
+
+/*
+ * Tracing for API calls that drivers call.
+ */
+
+TRACE_EVENT(api_start_tx_ba_session,
+ TP_PROTO(struct ieee80211_sta *sta, u16 tid),
+
+ TP_ARGS(sta, tid),
+
+ TP_STRUCT__entry(
+ STA_ENTRY
+ __field(u16, tid)
+ ),
+
+ TP_fast_assign(
+ STA_ASSIGN;
+ __entry->tid = tid;
+ ),
+
+ TP_printk(
+ STA_PR_FMT " tid:%d",
+ STA_PR_ARG, __entry->tid
+ )
+);
+
+TRACE_EVENT(api_start_tx_ba_cb,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),
+
+ TP_ARGS(sdata, ra, tid),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __array(u8, ra, ETH_ALEN)
+ __field(u16, tid)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ memcpy(__entry->ra, ra, ETH_ALEN);
+ __entry->tid = tid;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " ra:%pM tid:%d",
+ VIF_PR_ARG, __entry->ra, __entry->tid
+ )
+);
+
+TRACE_EVENT(api_stop_tx_ba_session,
+ TP_PROTO(struct ieee80211_sta *sta, u16 tid),
+
+ TP_ARGS(sta, tid),
+
+ TP_STRUCT__entry(
+ STA_ENTRY
+ __field(u16, tid)
+ ),
+
+ TP_fast_assign(
+ STA_ASSIGN;
+ __entry->tid = tid;
+ ),
+
+ TP_printk(
+ STA_PR_FMT " tid:%d",
+ STA_PR_ARG, __entry->tid
+ )
+);
+
+TRACE_EVENT(api_stop_tx_ba_cb,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),
+
+ TP_ARGS(sdata, ra, tid),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __array(u8, ra, ETH_ALEN)
+ __field(u16, tid)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ memcpy(__entry->ra, ra, ETH_ALEN);
+ __entry->tid = tid;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " ra:%pM tid:%d",
+ VIF_PR_ARG, __entry->ra, __entry->tid
+ )
+);
+
+DEFINE_EVENT(local_only_evt, api_restart_hw,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(api_beacon_loss,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata),
+
+ TP_ARGS(sdata),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT,
+ VIF_PR_ARG
+ )
+);
+
+TRACE_EVENT(api_connection_loss,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata),
+
+ TP_ARGS(sdata),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT,
+ VIF_PR_ARG
+ )
+);
+
+TRACE_EVENT(api_cqm_rssi_notify,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_cqm_rssi_threshold_event rssi_event),
+
+ TP_ARGS(sdata, rssi_event),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __field(u32, rssi_event)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ __entry->rssi_event = rssi_event;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " event:%d",
+ VIF_PR_ARG, __entry->rssi_event
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, api_cqm_beacon_loss_notify,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(api_scan_completed,
+ TP_PROTO(struct ieee80211_local *local, bool aborted),
+
+ TP_ARGS(local, aborted),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, aborted)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->aborted = aborted;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " aborted:%d",
+ LOCAL_PR_ARG, __entry->aborted
+ )
+);
+
+TRACE_EVENT(api_sched_scan_results,
+ TP_PROTO(struct ieee80211_local *local),
+
+ TP_ARGS(local),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT, LOCAL_PR_ARG
+ )
+);
+
+TRACE_EVENT(api_sched_scan_stopped,
+ TP_PROTO(struct ieee80211_local *local),
+
+ TP_ARGS(local),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT, LOCAL_PR_ARG
+ )
+);
+
+TRACE_EVENT(api_sta_block_awake,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta, bool block),
+
+ TP_ARGS(local, sta, block),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(bool, block)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ __entry->block = block;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT " block:%d",
+ LOCAL_PR_ARG, STA_PR_ARG, __entry->block
+ )
+);
+
+TRACE_EVENT(api_chswitch_done,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success),
+
+ TP_ARGS(sdata, success),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __field(bool, success)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ __entry->success = success;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " success=%d",
+ VIF_PR_ARG, __entry->success
+ )
+);
+
+DEFINE_EVENT(local_only_evt, api_ready_on_channel,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired,
+ TP_PROTO(struct ieee80211_local *local),
+ TP_ARGS(local)
+);
+
+TRACE_EVENT(api_gtk_rekey_notify,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const u8 *replay_ctr),
+
+ TP_ARGS(sdata, bssid, replay_ctr),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __array(u8, bssid, ETH_ALEN)
+ __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ memcpy(__entry->bssid, bssid, ETH_ALEN);
+ memcpy(__entry->replay_ctr, replay_ctr, NL80211_REPLAY_CTR_LEN);
+ ),
+
+ TP_printk(VIF_PR_FMT, VIF_PR_ARG)
+);
+
+TRACE_EVENT(api_enable_rssi_reports,
+ TP_PROTO(struct ieee80211_sub_if_data *sdata,
+ int rssi_min_thold, int rssi_max_thold),
+
+ TP_ARGS(sdata, rssi_min_thold, rssi_max_thold),
+
+ TP_STRUCT__entry(
+ VIF_ENTRY
+ __field(int, rssi_min_thold)
+ __field(int, rssi_max_thold)
+ ),
+
+ TP_fast_assign(
+ VIF_ASSIGN;
+ __entry->rssi_min_thold = rssi_min_thold;
+ __entry->rssi_max_thold = rssi_max_thold;
+ ),
+
+ TP_printk(
+ VIF_PR_FMT " rssi_min_thold =%d, rssi_max_thold = %d",
+ VIF_PR_ARG, __entry->rssi_min_thold, __entry->rssi_max_thold
+ )
+);
+
+TRACE_EVENT(api_eosp,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta),
+
+ TP_ARGS(local, sta),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT,
+ LOCAL_PR_ARG, STA_PR_ARG
+ )
+);
+
+TRACE_EVENT(api_sta_set_buffered,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta,
+ u8 tid, bool buffered),
+
+ TP_ARGS(local, sta, tid, buffered),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(u8, tid)
+ __field(bool, buffered)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ __entry->tid = tid;
+ __entry->buffered = buffered;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT " tid:%d buffered:%d",
+ LOCAL_PR_ARG, STA_PR_ARG, __entry->tid, __entry->buffered
+ )
+);
+
+/*
+ * Tracing for internal functions
+ * (which may also be called in response to driver calls)
+ */
+
+TRACE_EVENT(wake_queue,
+ TP_PROTO(struct ieee80211_local *local, u16 queue,
+ enum queue_stop_reason reason),
+
+ TP_ARGS(local, queue, reason),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u16, queue)
+ __field(u32, reason)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->queue = queue;
+ __entry->reason = reason;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " queue:%d, reason:%d",
+ LOCAL_PR_ARG, __entry->queue, __entry->reason
+ )
+);
+
+TRACE_EVENT(stop_queue,
+ TP_PROTO(struct ieee80211_local *local, u16 queue,
+ enum queue_stop_reason reason),
+
+ TP_ARGS(local, queue, reason),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(u16, queue)
+ __field(u32, reason)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->queue = queue;
+ __entry->reason = reason;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " queue:%d, reason:%d",
+ LOCAL_PR_ARG, __entry->queue, __entry->reason
+ )
+);
+
+TRACE_EVENT(drv_set_default_unicast_key,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int key_idx),
+
+ TP_ARGS(local, sdata, key_idx),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(int, key_idx)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->key_idx = key_idx;
+ ),
+
+ TP_printk(LOCAL_PR_FMT VIF_PR_FMT " key_idx:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->key_idx)
+);
+
+TRACE_EVENT(api_radar_detected,
+ TP_PROTO(struct ieee80211_local *local),
+
+ TP_ARGS(local),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " radar detected",
+ LOCAL_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_channel_switch_beacon,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_chan_def *chandef),
+
+ TP_ARGS(local, sdata, chandef),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ CHANDEF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ CHANDEF_ASSIGN(chandef);
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_pre_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel_switch *ch_switch),
+
+ TP_ARGS(local, sdata, ch_switch),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ CHANDEF_ENTRY
+ __field(u64, timestamp)
+ __field(u32, device_timestamp)
+ __field(bool, block_tx)
+ __field(u8, count)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ CHANDEF_ASSIGN(&ch_switch->chandef)
+ __entry->timestamp = ch_switch->timestamp;
+ __entry->device_timestamp = ch_switch->device_timestamp;
+ __entry->block_tx = ch_switch->block_tx;
+ __entry->count = ch_switch->count;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " prepare channel switch to "
+ CHANDEF_PR_FMT " count:%d block_tx:%d timestamp:%llu",
+ LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
+ __entry->block_tx, __entry->timestamp
+ )
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_get_txpower,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int dbm, int ret),
+
+ TP_ARGS(local, sdata, dbm, ret),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(int, dbm)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->dbm = dbm;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret
+ )
+);
+
+TRACE_EVENT(drv_tdls_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u8 oper_class,
+ struct cfg80211_chan_def *chandef),
+
+ TP_ARGS(local, sdata, sta, oper_class, chandef),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u8, oper_class)
+ CHANDEF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->oper_class = oper_class;
+ CHANDEF_ASSIGN(chandef)
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " tdls channel switch to"
+ CHANDEF_PR_FMT " oper_class:%d " STA_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->oper_class,
+ STA_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_tdls_cancel_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta),
+
+ TP_ARGS(local, sdata, sta),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT
+ " tdls cancel channel switch with " STA_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_tdls_recv_channel_switch,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_tdls_ch_sw_params *params),
+
+ TP_ARGS(local, sdata, params),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u8, action_code)
+ STA_ENTRY
+ CHANDEF_ENTRY
+ __field(u32, status)
+ __field(bool, peer_initiator)
+ __field(u32, timestamp)
+ __field(u16, switch_time)
+ __field(u16, switch_timeout)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_NAMED_ASSIGN(params->sta);
+ CHANDEF_ASSIGN(params->chandef)
+ __entry->peer_initiator = params->sta->tdls_initiator;
+ __entry->action_code = params->action_code;
+ __entry->status = params->status;
+ __entry->timestamp = params->timestamp;
+ __entry->switch_time = params->switch_time;
+ __entry->switch_timeout = params->switch_timeout;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " received tdls channel switch packet"
+ " action:%d status:%d time:%d switch time:%d switch"
+ " timeout:%d initiator: %d chan:" CHANDEF_PR_FMT STA_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->action_code, __entry->status,
+ __entry->timestamp, __entry->switch_time,
+ __entry->switch_timeout, __entry->peer_initiator,
+ CHANDEF_PR_ARG, STA_PR_ARG
+ )
+);
+
+TRACE_EVENT(drv_wake_tx_queue,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct txq_info *txq),
+
+ TP_ARGS(local, sdata, txq),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u8, ac)
+ __field(u8, tid)
+ ),
+
+ TP_fast_assign(
+ struct ieee80211_sta *sta = txq->txq.sta;
+
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_ASSIGN;
+ __entry->ac = txq->txq.ac;
+ __entry->tid = txq->txq.tid;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " ac:%d tid:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid
+ )
+);
+
+#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h
new file mode 100644
index 000000000..768f7c22a
--- /dev/null
+++ b/net/mac80211/trace_msg.h
@@ -0,0 +1,53 @@
+#ifdef CONFIG_MAC80211_MESSAGE_TRACING
+
+#if !defined(__MAC80211_MSG_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __MAC80211_MSG_DRIVER_TRACE
+
+#include <linux/tracepoint.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mac80211_msg
+
+#define MAX_MSG_LEN 100
+
+DECLARE_EVENT_CLASS(mac80211_msg_event,
+ TP_PROTO(struct va_format *vaf),
+
+ TP_ARGS(vaf),
+
+ TP_STRUCT__entry(
+ __dynamic_array(char, msg, MAX_MSG_LEN)
+ ),
+
+ TP_fast_assign(
+ WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+ MAX_MSG_LEN, vaf->fmt,
+ *vaf->va) >= MAX_MSG_LEN);
+ ),
+
+ TP_printk("%s", __get_str(msg))
+);
+
+DEFINE_EVENT(mac80211_msg_event, mac80211_info,
+ TP_PROTO(struct va_format *vaf),
+ TP_ARGS(vaf)
+);
+DEFINE_EVENT(mac80211_msg_event, mac80211_dbg,
+ TP_PROTO(struct va_format *vaf),
+ TP_ARGS(vaf)
+);
+DEFINE_EVENT(mac80211_msg_event, mac80211_err,
+ TP_PROTO(struct va_format *vaf),
+ TP_ARGS(vaf)
+);
+#endif /* !__MAC80211_MSG_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_msg
+#include <trace/define_trace.h>
+
+#endif
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
new file mode 100644
index 000000000..667111ee6
--- /dev/null
+++ b/net/mac80211/tx.c
@@ -0,0 +1,3385 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *
+ * Transmit and frame generation functions.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/bitmap.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ieee80211_radiotap.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "led.h"
+#include "mesh.h"
+#include "wep.h"
+#include "wpa.h"
+#include "wme.h"
+#include "rate.h"
+
+/* misc utils */
+
+static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
+ struct sk_buff *skb, int group_addr,
+ int next_frag_len)
+{
+ int rate, mrate, erp, dur, i, shift = 0;
+ struct ieee80211_rate *txrate;
+ struct ieee80211_local *local = tx->local;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ u32 rate_flags = 0;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(tx->sdata->vif.chanctx_conf);
+ if (chanctx_conf) {
+ shift = ieee80211_chandef_get_shift(&chanctx_conf->def);
+ rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def);
+ }
+ rcu_read_unlock();
+
+ /* assume HW handles this */
+ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))
+ return 0;
+
+ /* uh huh? */
+ if (WARN_ON_ONCE(tx->rate.idx < 0))
+ return 0;
+
+ sband = local->hw.wiphy->bands[info->band];
+ txrate = &sband->bitrates[tx->rate.idx];
+
+ erp = txrate->flags & IEEE80211_RATE_ERP_G;
+
+ /*
+ * data and mgmt (except PS Poll):
+ * - during CFP: 32768
+ * - during contention period:
+ * if addr1 is group address: 0
+ * if more fragments = 0 and addr1 is individual address: time to
+ * transmit one ACK plus SIFS
+ * if more fragments = 1 and addr1 is individual address: time to
+ * transmit next fragment plus 2 x ACK plus 3 x SIFS
+ *
+ * IEEE 802.11, 9.6:
+ * - control response frame (CTS or ACK) shall be transmitted using the
+ * same rate as the immediately previous frame in the frame exchange
+ * sequence, if this rate belongs to the PHY mandatory rates, or else
+ * at the highest possible rate belonging to the PHY rates in the
+ * BSSBasicRateSet
+ */
+ hdr = (struct ieee80211_hdr *)skb->data;
+ if (ieee80211_is_ctl(hdr->frame_control)) {
+ /* TODO: These control frames are not currently sent by
+ * mac80211, but should they be implemented, this function
+ * needs to be updated to support duration field calculation.
+ *
+ * RTS: time needed to transmit pending data/mgmt frame plus
+ * one CTS frame plus one ACK frame plus 3 x SIFS
+ * CTS: duration of immediately previous RTS minus time
+ * required to transmit CTS and its SIFS
+ * ACK: 0 if immediately previous directed data/mgmt had
+ * more=0, with more=1 duration in ACK frame is duration
+ * from previous frame minus time needed to transmit ACK
+ * and its SIFS
+ * PS Poll: BIT(15) | BIT(14) | aid
+ */
+ return 0;
+ }
+
+ /* data/mgmt */
+ if (0 /* FIX: data/mgmt during CFP */)
+ return cpu_to_le16(32768);
+
+ if (group_addr) /* Group address as the destination - no ACK */
+ return 0;
+
+ /* Individual destination address:
+ * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes)
+ * CTS and ACK frames shall be transmitted using the highest rate in
+ * basic rate set that is less than or equal to the rate of the
+ * immediately previous frame and that is using the same modulation
+ * (CCK or OFDM). If no basic rate set matches with these requirements,
+ * the highest mandatory rate of the PHY that is less than or equal to
+ * the rate of the previous frame is used.
+ * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps
+ */
+ rate = -1;
+ /* use lowest available if everything fails */
+ mrate = sband->bitrates[0].bitrate;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ struct ieee80211_rate *r = &sband->bitrates[i];
+
+ if (r->bitrate > txrate->bitrate)
+ break;
+
+ if ((rate_flags & r->flags) != rate_flags)
+ continue;
+
+ if (tx->sdata->vif.bss_conf.basic_rates & BIT(i))
+ rate = DIV_ROUND_UP(r->bitrate, 1 << shift);
+
+ switch (sband->band) {
+ case IEEE80211_BAND_2GHZ: {
+ u32 flag;
+ if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+ flag = IEEE80211_RATE_MANDATORY_G;
+ else
+ flag = IEEE80211_RATE_MANDATORY_B;
+ if (r->flags & flag)
+ mrate = r->bitrate;
+ break;
+ }
+ case IEEE80211_BAND_5GHZ:
+ if (r->flags & IEEE80211_RATE_MANDATORY_A)
+ mrate = r->bitrate;
+ break;
+ case IEEE80211_BAND_60GHZ:
+ /* TODO, for now fall through */
+ case IEEE80211_NUM_BANDS:
+ WARN_ON(1);
+ break;
+ }
+ }
+ if (rate == -1) {
+ /* No matching basic rate found; use highest suitable mandatory
+ * PHY rate */
+ rate = DIV_ROUND_UP(mrate, 1 << shift);
+ }
+
+ /* Don't calculate ACKs for QoS Frames with NoAck Policy set */
+ if (ieee80211_is_data_qos(hdr->frame_control) &&
+ *(ieee80211_get_qos_ctl(hdr)) & IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
+ dur = 0;
+ else
+ /* Time needed to transmit ACK
+ * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up
+ * to closest integer */
+ dur = ieee80211_frame_duration(sband->band, 10, rate, erp,
+ tx->sdata->vif.bss_conf.use_short_preamble,
+ shift);
+
+ if (next_frag_len) {
+ /* Frame is fragmented: duration increases with time needed to
+ * transmit next fragment plus ACK and 2 x SIFS. */
+ dur *= 2; /* ACK + SIFS */
+ /* next fragment */
+ dur += ieee80211_frame_duration(sband->band, next_frag_len,
+ txrate->bitrate, erp,
+ tx->sdata->vif.bss_conf.use_short_preamble,
+ shift);
+ }
+
+ return cpu_to_le16(dur);
+}
+
+/* tx handlers */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_local *local = tx->local;
+ struct ieee80211_if_managed *ifmgd;
+
+ /* driver doesn't support power save */
+ if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+ return TX_CONTINUE;
+
+ /* hardware does dynamic power save */
+ if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+ return TX_CONTINUE;
+
+ /* dynamic power save disabled */
+ if (local->hw.conf.dynamic_ps_timeout <= 0)
+ return TX_CONTINUE;
+
+ /* we are scanning, don't enable power save */
+ if (local->scanning)
+ return TX_CONTINUE;
+
+ if (!local->ps_sdata)
+ return TX_CONTINUE;
+
+ /* No point if we're going to suspend */
+ if (local->quiescing)
+ return TX_CONTINUE;
+
+ /* dynamic ps is supported only in managed mode */
+ if (tx->sdata->vif.type != NL80211_IFTYPE_STATION)
+ return TX_CONTINUE;
+
+ ifmgd = &tx->sdata->u.mgd;
+
+ /*
+ * Don't wakeup from power save if u-apsd is enabled, voip ac has
+ * u-apsd enabled and the frame is in voip class. This effectively
+ * means that even if all access categories have u-apsd enabled, in
+ * practise u-apsd is only used with the voip ac. This is a
+ * workaround for the case when received voip class packets do not
+ * have correct qos tag for some reason, due the network or the
+ * peer application.
+ *
+ * Note: ifmgd->uapsd_queues access is racy here. If the value is
+ * changed via debugfs, user needs to reassociate manually to have
+ * everything in sync.
+ */
+ if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) &&
+ (ifmgd->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) &&
+ skb_get_queue_mapping(tx->skb) == IEEE80211_AC_VO)
+ return TX_CONTINUE;
+
+ if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+ ieee80211_stop_queues_by_reason(&local->hw,
+ IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_PS,
+ false);
+ ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+ ieee80211_queue_work(&local->hw,
+ &local->dynamic_ps_disable_work);
+ }
+
+ /* Don't restart the timer if we're not disassociated */
+ if (!ifmgd->associated)
+ return TX_CONTINUE;
+
+ mod_timer(&local->dynamic_ps_timer, jiffies +
+ msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
+{
+
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ bool assoc = false;
+
+ if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED))
+ return TX_CONTINUE;
+
+ if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) &&
+ test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) &&
+ !ieee80211_is_probe_req(hdr->frame_control) &&
+ !ieee80211_is_nullfunc(hdr->frame_control))
+ /*
+ * When software scanning only nullfunc frames (to notify
+ * the sleep state to the AP) and probe requests (for the
+ * active scan) are allowed, all other frames should not be
+ * sent and we should not get here, but if we do
+ * nonetheless, drop them to avoid sending them
+ * off-channel. See the link below and
+ * ieee80211_start_scan() for more.
+ *
+ * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089
+ */
+ return TX_DROP;
+
+ if (tx->sdata->vif.type == NL80211_IFTYPE_OCB)
+ return TX_CONTINUE;
+
+ if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
+ return TX_CONTINUE;
+
+ if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+ return TX_CONTINUE;
+
+ if (tx->flags & IEEE80211_TX_PS_BUFFERED)
+ return TX_CONTINUE;
+
+ if (tx->sta)
+ assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);
+
+ if (likely(tx->flags & IEEE80211_TX_UNICAST)) {
+ if (unlikely(!assoc &&
+ ieee80211_is_data(hdr->frame_control))) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ sdata_info(tx->sdata,
+ "dropped data frame to not associated station %pM\n",
+ hdr->addr1);
+#endif
+ I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
+ return TX_DROP;
+ }
+ } else if (unlikely(tx->sdata->vif.type == NL80211_IFTYPE_AP &&
+ ieee80211_is_data(hdr->frame_control) &&
+ !atomic_read(&tx->sdata->u.ap.num_mcast_sta))) {
+ /*
+ * No associated STAs - no need to send multicast
+ * frames.
+ */
+ return TX_DROP;
+ }
+
+ return TX_CONTINUE;
+}
+
+/* This function is called whenever the AP is about to exceed the maximum limit
+ * of buffered frames for power saving STAs. This situation should not really
+ * happen often during normal operation, so dropping the oldest buffered packet
+ * from each queue should be OK to make some room for new frames. */
+static void purge_old_ps_buffers(struct ieee80211_local *local)
+{
+ int total = 0, purged = 0;
+ struct sk_buff *skb;
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ struct ps_data *ps;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ ps = &sdata->u.ap.ps;
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ ps = &sdata->u.mesh.ps;
+ else
+ continue;
+
+ skb = skb_dequeue(&ps->bc_buf);
+ if (skb) {
+ purged++;
+ dev_kfree_skb(skb);
+ }
+ total += skb_queue_len(&ps->bc_buf);
+ }
+
+ /*
+ * Drop one frame from each station from the lowest-priority
+ * AC that has frames at all.
+ */
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ int ac;
+
+ for (ac = IEEE80211_AC_BK; ac >= IEEE80211_AC_VO; ac--) {
+ skb = skb_dequeue(&sta->ps_tx_buf[ac]);
+ total += skb_queue_len(&sta->ps_tx_buf[ac]);
+ if (skb) {
+ purged++;
+ ieee80211_free_txskb(&local->hw, skb);
+ break;
+ }
+ }
+ }
+
+ local->total_ps_buffered = total;
+ ps_dbg_hw(&local->hw, "PS buffers full - purged %d frames\n", purged);
+}
+
+static ieee80211_tx_result
+ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+ struct ps_data *ps;
+
+ /*
+ * broadcast/multicast frame
+ *
+ * If any of the associated/peer stations is in power save mode,
+ * the frame is buffered to be sent after DTIM beacon frame.
+ * This is done either by the hardware or us.
+ */
+
+ /* powersaving STAs currently only in AP/VLAN/mesh mode */
+ if (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
+ tx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (!tx->sdata->bss)
+ return TX_CONTINUE;
+
+ ps = &tx->sdata->bss->ps;
+ } else if (ieee80211_vif_is_mesh(&tx->sdata->vif)) {
+ ps = &tx->sdata->u.mesh.ps;
+ } else {
+ return TX_CONTINUE;
+ }
+
+
+ /* no buffering for ordered frames */
+ if (ieee80211_has_order(hdr->frame_control))
+ return TX_CONTINUE;
+
+ if (ieee80211_is_probe_req(hdr->frame_control))
+ return TX_CONTINUE;
+
+ if (tx->local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
+ info->hw_queue = tx->sdata->vif.cab_queue;
+
+ /* no stations in PS mode */
+ if (!atomic_read(&ps->num_sta_ps))
+ return TX_CONTINUE;
+
+ info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM;
+
+ /* device releases frame after DTIM beacon */
+ if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING))
+ return TX_CONTINUE;
+
+ /* buffered in mac80211 */
+ if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
+ purge_old_ps_buffers(tx->local);
+
+ if (skb_queue_len(&ps->bc_buf) >= AP_MAX_BC_BUFFER) {
+ ps_dbg(tx->sdata,
+ "BC TX buffer full - dropping the oldest frame\n");
+ dev_kfree_skb(skb_dequeue(&ps->bc_buf));
+ } else
+ tx->local->total_ps_buffered++;
+
+ skb_queue_tail(&ps->bc_buf, tx->skb);
+
+ return TX_QUEUED;
+}
+
+static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta,
+ struct sk_buff *skb)
+{
+ if (!ieee80211_is_mgmt(fc))
+ return 0;
+
+ if (sta == NULL || !test_sta_flag(sta, WLAN_STA_MFP))
+ return 0;
+
+ if (!ieee80211_is_robust_mgmt_frame(skb))
+ return 0;
+
+ return 1;
+}
+
+static ieee80211_tx_result
+ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
+{
+ struct sta_info *sta = tx->sta;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+ struct ieee80211_local *local = tx->local;
+
+ if (unlikely(!sta))
+ return TX_CONTINUE;
+
+ if (unlikely((test_sta_flag(sta, WLAN_STA_PS_STA) ||
+ test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
+ test_sta_flag(sta, WLAN_STA_PS_DELIVER)) &&
+ !(info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER))) {
+ int ac = skb_get_queue_mapping(tx->skb);
+
+ if (ieee80211_is_mgmt(hdr->frame_control) &&
+ !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) {
+ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
+ return TX_CONTINUE;
+ }
+
+ ps_dbg(sta->sdata, "STA %pM aid %d: PS buffer for AC %d\n",
+ sta->sta.addr, sta->sta.aid, ac);
+ if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
+ purge_old_ps_buffers(tx->local);
+
+ /* sync with ieee80211_sta_ps_deliver_wakeup */
+ spin_lock(&sta->ps_lock);
+ /*
+ * STA woke up the meantime and all the frames on ps_tx_buf have
+ * been queued to pending queue. No reordering can happen, go
+ * ahead and Tx the packet.
+ */
+ if (!test_sta_flag(sta, WLAN_STA_PS_STA) &&
+ !test_sta_flag(sta, WLAN_STA_PS_DRIVER) &&
+ !test_sta_flag(sta, WLAN_STA_PS_DELIVER)) {
+ spin_unlock(&sta->ps_lock);
+ return TX_CONTINUE;
+ }
+
+ if (skb_queue_len(&sta->ps_tx_buf[ac]) >= STA_MAX_TX_BUFFER) {
+ struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf[ac]);
+ ps_dbg(tx->sdata,
+ "STA %pM TX buffer for AC %d full - dropping oldest frame\n",
+ sta->sta.addr, ac);
+ ieee80211_free_txskb(&local->hw, old);
+ } else
+ tx->local->total_ps_buffered++;
+
+ info->control.jiffies = jiffies;
+ info->control.vif = &tx->sdata->vif;
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+ skb_queue_tail(&sta->ps_tx_buf[ac], tx->skb);
+ spin_unlock(&sta->ps_lock);
+
+ if (!timer_pending(&local->sta_cleanup))
+ mod_timer(&local->sta_cleanup,
+ round_jiffies(jiffies +
+ STA_INFO_CLEANUP_INTERVAL));
+
+ /*
+ * We queued up some frames, so the TIM bit might
+ * need to be set, recalculate it.
+ */
+ sta_info_recalc_tim(sta);
+
+ return TX_QUEUED;
+ } else if (unlikely(test_sta_flag(sta, WLAN_STA_PS_STA))) {
+ ps_dbg(tx->sdata,
+ "STA %pM in PS mode, but polling/in SP -> send frame\n",
+ sta->sta.addr);
+ }
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
+{
+ if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED))
+ return TX_CONTINUE;
+
+ if (tx->flags & IEEE80211_TX_UNICAST)
+ return ieee80211_tx_h_unicast_ps_buf(tx);
+ else
+ return ieee80211_tx_h_multicast_ps_buf(tx);
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
+ if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol)) {
+ if (tx->sdata->control_port_no_encrypt)
+ info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+ info->flags |= IEEE80211_TX_CTL_USE_MINRATE;
+ }
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_key *key;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
+ tx->key = NULL;
+ else if (tx->sta &&
+ (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
+ tx->key = key;
+ else if (ieee80211_is_mgmt(hdr->frame_control) &&
+ is_multicast_ether_addr(hdr->addr1) &&
+ ieee80211_is_robust_mgmt_frame(tx->skb) &&
+ (key = rcu_dereference(tx->sdata->default_mgmt_key)))
+ tx->key = key;
+ else if (is_multicast_ether_addr(hdr->addr1) &&
+ (key = rcu_dereference(tx->sdata->default_multicast_key)))
+ tx->key = key;
+ else if (!is_multicast_ether_addr(hdr->addr1) &&
+ (key = rcu_dereference(tx->sdata->default_unicast_key)))
+ tx->key = key;
+ else
+ tx->key = NULL;
+
+ if (tx->key) {
+ bool skip_hw = false;
+
+ tx->key->tx_rx_count++;
+ /* TODO: add threshold stuff again */
+
+ switch (tx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ case WLAN_CIPHER_SUITE_TKIP:
+ if (!ieee80211_is_data_present(hdr->frame_control))
+ tx->key = NULL;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ if (!ieee80211_is_data_present(hdr->frame_control) &&
+ !ieee80211_use_mfp(hdr->frame_control, tx->sta,
+ tx->skb))
+ tx->key = NULL;
+ else
+ skip_hw = (tx->key->conf.flags &
+ IEEE80211_KEY_FLAG_SW_MGMT_TX) &&
+ ieee80211_is_mgmt(hdr->frame_control);
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ tx->key = NULL;
+ break;
+ }
+
+ if (unlikely(tx->key && tx->key->flags & KEY_FLAG_TAINTED &&
+ !ieee80211_is_deauth(hdr->frame_control)))
+ return TX_DROP;
+
+ if (!skip_hw && tx->key &&
+ tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
+ info->control.hw_key = &tx->key->conf;
+ }
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (void *)tx->skb->data;
+ struct ieee80211_supported_band *sband;
+ u32 len;
+ struct ieee80211_tx_rate_control txrc;
+ struct ieee80211_sta_rates *ratetbl = NULL;
+ bool assoc = false;
+
+ memset(&txrc, 0, sizeof(txrc));
+
+ sband = tx->local->hw.wiphy->bands[info->band];
+
+ len = min_t(u32, tx->skb->len + FCS_LEN,
+ tx->local->hw.wiphy->frag_threshold);
+
+ /* set up the tx rate control struct we give the RC algo */
+ txrc.hw = &tx->local->hw;
+ txrc.sband = sband;
+ txrc.bss_conf = &tx->sdata->vif.bss_conf;
+ txrc.skb = tx->skb;
+ txrc.reported_rate.idx = -1;
+ txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band];
+ if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1)
+ txrc.max_rate_idx = -1;
+ else
+ txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1;
+
+ if (tx->sdata->rc_has_mcs_mask[info->band])
+ txrc.rate_idx_mcs_mask =
+ tx->sdata->rc_rateidx_mcs_mask[info->band];
+
+ txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
+ tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
+ tx->sdata->vif.type == NL80211_IFTYPE_ADHOC);
+
+ /* set up RTS protection if desired */
+ if (len > tx->local->hw.wiphy->rts_threshold) {
+ txrc.rts = true;
+ }
+
+ info->control.use_rts = txrc.rts;
+ info->control.use_cts_prot = tx->sdata->vif.bss_conf.use_cts_prot;
+
+ /*
+ * Use short preamble if the BSS can handle it, but not for
+ * management frames unless we know the receiver can handle
+ * that -- the management frame might be to a station that
+ * just wants a probe response.
+ */
+ if (tx->sdata->vif.bss_conf.use_short_preamble &&
+ (ieee80211_is_data(hdr->frame_control) ||
+ (tx->sta && test_sta_flag(tx->sta, WLAN_STA_SHORT_PREAMBLE))))
+ txrc.short_preamble = true;
+
+ info->control.short_preamble = txrc.short_preamble;
+
+ if (tx->sta)
+ assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);
+
+ /*
+ * Lets not bother rate control if we're associated and cannot
+ * talk to the sta. This should not happen.
+ */
+ if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) && assoc &&
+ !rate_usable_index_exists(sband, &tx->sta->sta),
+ "%s: Dropped data frame as no usable bitrate found while "
+ "scanning and associated. Target station: "
+ "%pM on %d GHz band\n",
+ tx->sdata->name, hdr->addr1,
+ info->band ? 5 : 2))
+ return TX_DROP;
+
+ /*
+ * If we're associated with the sta at this point we know we can at
+ * least send the frame at the lowest bit rate.
+ */
+ rate_control_get_rate(tx->sdata, tx->sta, &txrc);
+
+ if (tx->sta && !info->control.skip_table)
+ ratetbl = rcu_dereference(tx->sta->sta.rates);
+
+ if (unlikely(info->control.rates[0].idx < 0)) {
+ if (ratetbl) {
+ struct ieee80211_tx_rate rate = {
+ .idx = ratetbl->rate[0].idx,
+ .flags = ratetbl->rate[0].flags,
+ .count = ratetbl->rate[0].count
+ };
+
+ if (ratetbl->rate[0].idx < 0)
+ return TX_DROP;
+
+ tx->rate = rate;
+ } else {
+ return TX_DROP;
+ }
+ } else {
+ tx->rate = info->control.rates[0];
+ }
+
+ if (txrc.reported_rate.idx < 0) {
+ txrc.reported_rate = tx->rate;
+ if (tx->sta && ieee80211_is_data(hdr->frame_control))
+ tx->sta->last_tx_rate = txrc.reported_rate;
+ } else if (tx->sta)
+ tx->sta->last_tx_rate = txrc.reported_rate;
+
+ if (ratetbl)
+ return TX_CONTINUE;
+
+ if (unlikely(!info->control.rates[0].count))
+ info->control.rates[0].count = 1;
+
+ if (WARN_ON_ONCE((info->control.rates[0].count > 1) &&
+ (info->flags & IEEE80211_TX_CTL_NO_ACK)))
+ info->control.rates[0].count = 1;
+
+ return TX_CONTINUE;
+}
+
+static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid)
+{
+ u16 *seq = &sta->tid_seq[tid];
+ __le16 ret = cpu_to_le16(*seq);
+
+ /* Increase the sequence number. */
+ *seq = (*seq + 0x10) & IEEE80211_SCTL_SEQ;
+
+ return ret;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+ u8 *qc;
+ int tid;
+
+ /*
+ * Packet injection may want to control the sequence
+ * number, if we have no matching interface then we
+ * neither assign one ourselves nor ask the driver to.
+ */
+ if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR))
+ return TX_CONTINUE;
+
+ if (unlikely(ieee80211_is_ctl(hdr->frame_control)))
+ return TX_CONTINUE;
+
+ if (ieee80211_hdrlen(hdr->frame_control) < 24)
+ return TX_CONTINUE;
+
+ if (ieee80211_is_qos_nullfunc(hdr->frame_control))
+ return TX_CONTINUE;
+
+ /*
+ * Anything but QoS data that has a sequence number field
+ * (is long enough) gets a sequence number from the global
+ * counter. QoS data frames with a multicast destination
+ * also use the global counter (802.11-2012 9.3.2.10).
+ */
+ if (!ieee80211_is_data_qos(hdr->frame_control) ||
+ is_multicast_ether_addr(hdr->addr1)) {
+ /* driver should assign sequence number */
+ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+ /* for pure STA mode without beacons, we can do it */
+ hdr->seq_ctrl = cpu_to_le16(tx->sdata->sequence_number);
+ tx->sdata->sequence_number += 0x10;
+ if (tx->sta)
+ tx->sta->tx_msdu[IEEE80211_NUM_TIDS]++;
+ return TX_CONTINUE;
+ }
+
+ /*
+ * This should be true for injected/management frames only, for
+ * management frames we have set the IEEE80211_TX_CTL_ASSIGN_SEQ
+ * above since they are not QoS-data frames.
+ */
+ if (!tx->sta)
+ return TX_CONTINUE;
+
+ /* include per-STA, per-TID sequence counter */
+
+ qc = ieee80211_get_qos_ctl(hdr);
+ tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+ tx->sta->tx_msdu[tid]++;
+
+ if (!tx->sta->sta.txq[0])
+ hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
+
+ return TX_CONTINUE;
+}
+
+static int ieee80211_fragment(struct ieee80211_tx_data *tx,
+ struct sk_buff *skb, int hdrlen,
+ int frag_threshold)
+{
+ struct ieee80211_local *local = tx->local;
+ struct ieee80211_tx_info *info;
+ struct sk_buff *tmp;
+ int per_fragm = frag_threshold - hdrlen - FCS_LEN;
+ int pos = hdrlen + per_fragm;
+ int rem = skb->len - hdrlen - per_fragm;
+
+ if (WARN_ON(rem < 0))
+ return -EINVAL;
+
+ /* first fragment was already added to queue by caller */
+
+ while (rem) {
+ int fraglen = per_fragm;
+
+ if (fraglen > rem)
+ fraglen = rem;
+ rem -= fraglen;
+ tmp = dev_alloc_skb(local->tx_headroom +
+ frag_threshold +
+ tx->sdata->encrypt_headroom +
+ IEEE80211_ENCRYPT_TAILROOM);
+ if (!tmp)
+ return -ENOMEM;
+
+ __skb_queue_tail(&tx->skbs, tmp);
+
+ skb_reserve(tmp,
+ local->tx_headroom + tx->sdata->encrypt_headroom);
+
+ /* copy control information */
+ memcpy(tmp->cb, skb->cb, sizeof(tmp->cb));
+
+ info = IEEE80211_SKB_CB(tmp);
+ info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
+ IEEE80211_TX_CTL_FIRST_FRAGMENT);
+
+ if (rem)
+ info->flags |= IEEE80211_TX_CTL_MORE_FRAMES;
+
+ skb_copy_queue_mapping(tmp, skb);
+ tmp->priority = skb->priority;
+ tmp->dev = skb->dev;
+
+ /* copy header and data */
+ memcpy(skb_put(tmp, hdrlen), skb->data, hdrlen);
+ memcpy(skb_put(tmp, fraglen), skb->data + pos, fraglen);
+
+ pos += fraglen;
+ }
+
+ /* adjust first fragment's length */
+ skb_trim(skb, hdrlen + per_fragm);
+ return 0;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb = tx->skb;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ int frag_threshold = tx->local->hw.wiphy->frag_threshold;
+ int hdrlen;
+ int fragnum;
+
+ /* no matter what happens, tx->skb moves to tx->skbs */
+ __skb_queue_tail(&tx->skbs, skb);
+ tx->skb = NULL;
+
+ if (info->flags & IEEE80211_TX_CTL_DONTFRAG)
+ return TX_CONTINUE;
+
+ if (tx->local->ops->set_frag_threshold)
+ return TX_CONTINUE;
+
+ /*
+ * Warn when submitting a fragmented A-MPDU frame and drop it.
+ * This scenario is handled in ieee80211_tx_prepare but extra
+ * caution taken here as fragmented ampdu may cause Tx stop.
+ */
+ if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU))
+ return TX_DROP;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ /* internal error, why isn't DONTFRAG set? */
+ if (WARN_ON(skb->len + FCS_LEN <= frag_threshold))
+ return TX_DROP;
+
+ /*
+ * Now fragment the frame. This will allocate all the fragments and
+ * chain them (using skb as the first fragment) to skb->next.
+ * During transmission, we will remove the successfully transmitted
+ * fragments from this list. When the low-level driver rejects one
+ * of the fragments then we will simply pretend to accept the skb
+ * but store it away as pending.
+ */
+ if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
+ return TX_DROP;
+
+ /* update duration/seq/flags of fragments */
+ fragnum = 0;
+
+ skb_queue_walk(&tx->skbs, skb) {
+ const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
+
+ hdr = (void *)skb->data;
+ info = IEEE80211_SKB_CB(skb);
+
+ if (!skb_queue_is_last(&tx->skbs, skb)) {
+ hdr->frame_control |= morefrags;
+ /*
+ * No multi-rate retries for fragmented frames, that
+ * would completely throw off the NAV at other STAs.
+ */
+ info->control.rates[1].idx = -1;
+ info->control.rates[2].idx = -1;
+ info->control.rates[3].idx = -1;
+ BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 4);
+ info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+ } else {
+ hdr->frame_control &= ~morefrags;
+ }
+ hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
+ fragnum++;
+ }
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ int ac = -1;
+
+ if (!tx->sta)
+ return TX_CONTINUE;
+
+ skb_queue_walk(&tx->skbs, skb) {
+ ac = skb_get_queue_mapping(skb);
+ tx->sta->tx_fragments++;
+ tx->sta->tx_bytes[ac] += skb->len;
+ }
+ if (ac >= 0)
+ tx->sta->tx_packets[ac]++;
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
+{
+ if (!tx->key)
+ return TX_CONTINUE;
+
+ switch (tx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ return ieee80211_crypto_wep_encrypt(tx);
+ case WLAN_CIPHER_SUITE_TKIP:
+ return ieee80211_crypto_tkip_encrypt(tx);
+ case WLAN_CIPHER_SUITE_CCMP:
+ return ieee80211_crypto_ccmp_encrypt(
+ tx, IEEE80211_CCMP_MIC_LEN);
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ return ieee80211_crypto_ccmp_encrypt(
+ tx, IEEE80211_CCMP_256_MIC_LEN);
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ return ieee80211_crypto_aes_cmac_encrypt(tx);
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ return ieee80211_crypto_aes_cmac_256_encrypt(tx);
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ return ieee80211_crypto_aes_gmac_encrypt(tx);
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ return ieee80211_crypto_gcmp_encrypt(tx);
+ default:
+ return ieee80211_crypto_hw_encrypt(tx);
+ }
+
+ return TX_DROP;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *hdr;
+ int next_len;
+ bool group_addr;
+
+ skb_queue_walk(&tx->skbs, skb) {
+ hdr = (void *) skb->data;
+ if (unlikely(ieee80211_is_pspoll(hdr->frame_control)))
+ break; /* must not overwrite AID */
+ if (!skb_queue_is_last(&tx->skbs, skb)) {
+ struct sk_buff *next = skb_queue_next(&tx->skbs, skb);
+ next_len = next->len;
+ } else
+ next_len = 0;
+ group_addr = is_multicast_ether_addr(hdr->addr1);
+
+ hdr->duration_id =
+ ieee80211_duration(tx, skb, group_addr, next_len);
+ }
+
+ return TX_CONTINUE;
+}
+
+/* actual transmit path */
+
+static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
+ struct sk_buff *skb,
+ struct ieee80211_tx_info *info,
+ struct tid_ampdu_tx *tid_tx,
+ int tid)
+{
+ bool queued = false;
+ bool reset_agg_timer = false;
+ struct sk_buff *purge_skb = NULL;
+
+ if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ reset_agg_timer = true;
+ } else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+ /*
+ * nothing -- this aggregation session is being started
+ * but that might still fail with the driver
+ */
+ } else if (!tx->sta->sta.txq[tid]) {
+ spin_lock(&tx->sta->lock);
+ /*
+ * Need to re-check now, because we may get here
+ *
+ * 1) in the window during which the setup is actually
+ * already done, but not marked yet because not all
+ * packets are spliced over to the driver pending
+ * queue yet -- if this happened we acquire the lock
+ * either before or after the splice happens, but
+ * need to recheck which of these cases happened.
+ *
+ * 2) during session teardown, if the OPERATIONAL bit
+ * was cleared due to the teardown but the pointer
+ * hasn't been assigned NULL yet (or we loaded it
+ * before it was assigned) -- in this case it may
+ * now be NULL which means we should just let the
+ * packet pass through because splicing the frames
+ * back is already done.
+ */
+ tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid);
+
+ if (!tid_tx) {
+ /* do nothing, let packet pass through */
+ } else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ reset_agg_timer = true;
+ } else {
+ queued = true;
+ info->control.vif = &tx->sdata->vif;
+ info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+ __skb_queue_tail(&tid_tx->pending, skb);
+ if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
+ purge_skb = __skb_dequeue(&tid_tx->pending);
+ }
+ spin_unlock(&tx->sta->lock);
+
+ if (purge_skb)
+ ieee80211_free_txskb(&tx->local->hw, purge_skb);
+ }
+
+ /* reset session timer */
+ if (reset_agg_timer && tid_tx->timeout)
+ tid_tx->last_tx = jiffies;
+
+ return queued;
+}
+
+/*
+ * initialises @tx
+ * pass %NULL for the station if unknown, a valid pointer if known
+ * or an ERR_PTR() if the station is known not to exist
+ */
+static ieee80211_tx_result
+ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_tx_data *tx,
+ struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int tid;
+ u8 *qc;
+
+ memset(tx, 0, sizeof(*tx));
+ tx->skb = skb;
+ tx->local = local;
+ tx->sdata = sdata;
+ __skb_queue_head_init(&tx->skbs);
+
+ /*
+ * If this flag is set to true anywhere, and we get here,
+ * we are doing the needed processing, so remove the flag
+ * now.
+ */
+ info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+
+ if (likely(sta)) {
+ if (!IS_ERR(sta))
+ tx->sta = sta;
+ } else {
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ tx->sta = rcu_dereference(sdata->u.vlan.sta);
+ if (!tx->sta && sdata->wdev.use_4addr)
+ return TX_DROP;
+ } else if (info->flags & (IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+ IEEE80211_TX_CTL_INJECTED) ||
+ tx->sdata->control_port_protocol == tx->skb->protocol) {
+ tx->sta = sta_info_get_bss(sdata, hdr->addr1);
+ }
+ if (!tx->sta && !is_multicast_ether_addr(hdr->addr1))
+ tx->sta = sta_info_get(sdata, hdr->addr1);
+ }
+
+ if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) &&
+ !ieee80211_is_qos_nullfunc(hdr->frame_control) &&
+ (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) &&
+ !(local->hw.flags & IEEE80211_HW_TX_AMPDU_SETUP_IN_HW)) {
+ struct tid_ampdu_tx *tid_tx;
+
+ qc = ieee80211_get_qos_ctl(hdr);
+ tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+
+ tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]);
+ if (tid_tx) {
+ bool queued;
+
+ queued = ieee80211_tx_prep_agg(tx, skb, info,
+ tid_tx, tid);
+
+ if (unlikely(queued))
+ return TX_QUEUED;
+ }
+ }
+
+ if (is_multicast_ether_addr(hdr->addr1)) {
+ tx->flags &= ~IEEE80211_TX_UNICAST;
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ } else
+ tx->flags |= IEEE80211_TX_UNICAST;
+
+ if (!(info->flags & IEEE80211_TX_CTL_DONTFRAG)) {
+ if (!(tx->flags & IEEE80211_TX_UNICAST) ||
+ skb->len + FCS_LEN <= local->hw.wiphy->frag_threshold ||
+ info->flags & IEEE80211_TX_CTL_AMPDU)
+ info->flags |= IEEE80211_TX_CTL_DONTFRAG;
+ }
+
+ if (!tx->sta)
+ info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
+ else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT))
+ info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
+
+ info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT;
+
+ return TX_CONTINUE;
+}
+
+static void ieee80211_drv_tx(struct ieee80211_local *local,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_control control = {
+ .sta = pubsta,
+ };
+ struct ieee80211_txq *txq = NULL;
+ struct txq_info *txqi;
+ u8 ac;
+
+ if (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)
+ goto tx_normal;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ goto tx_normal;
+
+ if (pubsta) {
+ u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+
+ txq = pubsta->txq[tid];
+ } else if (vif) {
+ txq = vif->txq;
+ }
+
+ if (!txq)
+ goto tx_normal;
+
+ ac = txq->ac;
+ txqi = to_txq_info(txq);
+ atomic_inc(&sdata->txqs_len[ac]);
+ if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
+ netif_stop_subqueue(sdata->dev, ac);
+
+ skb_queue_tail(&txqi->queue, skb);
+ drv_wake_tx_queue(local, txqi);
+
+ return;
+
+tx_normal:
+ drv_tx(local, &control, skb);
+}
+
+struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
+ struct ieee80211_txq *txq)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
+ struct txq_info *txqi = container_of(txq, struct txq_info, txq);
+ struct ieee80211_hdr *hdr;
+ struct sk_buff *skb = NULL;
+ u8 ac = txq->ac;
+
+ spin_lock_bh(&txqi->queue.lock);
+
+ if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+ goto out;
+
+ skb = __skb_dequeue(&txqi->queue);
+ if (!skb)
+ goto out;
+
+ atomic_dec(&sdata->txqs_len[ac]);
+ if (__netif_subqueue_stopped(sdata->dev, ac))
+ ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+ struct sta_info *sta = container_of(txq->sta, struct sta_info,
+ sta);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+ hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
+ if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
+ info->flags |= IEEE80211_TX_CTL_AMPDU;
+ else
+ info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+ }
+
+out:
+ spin_unlock_bh(&txqi->queue.lock);
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_tx_dequeue);
+
+static bool ieee80211_tx_frags(struct ieee80211_local *local,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct sk_buff_head *skbs,
+ bool txpending)
+{
+ struct sk_buff *skb, *tmp;
+ unsigned long flags;
+
+ skb_queue_walk_safe(skbs, skb, tmp) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int q = info->hw_queue;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ if (WARN_ON_ONCE(q >= local->hw.queues)) {
+ __skb_unlink(skb, skbs);
+ ieee80211_free_txskb(&local->hw, skb);
+ continue;
+ }
+#endif
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ if (local->queue_stop_reasons[q] ||
+ (!txpending && !skb_queue_empty(&local->pending[q]))) {
+ if (unlikely(info->flags &
+ IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) {
+ if (local->queue_stop_reasons[q] &
+ ~BIT(IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL)) {
+ /*
+ * Drop off-channel frames if queues
+ * are stopped for any reason other
+ * than off-channel operation. Never
+ * queue them.
+ */
+ spin_unlock_irqrestore(
+ &local->queue_stop_reason_lock,
+ flags);
+ ieee80211_purge_tx_queue(&local->hw,
+ skbs);
+ return true;
+ }
+ } else {
+
+ /*
+ * Since queue is stopped, queue up frames for
+ * later transmission from the tx-pending
+ * tasklet when the queue is woken again.
+ */
+ if (txpending)
+ skb_queue_splice_init(skbs,
+ &local->pending[q]);
+ else
+ skb_queue_splice_tail_init(skbs,
+ &local->pending[q]);
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+ flags);
+ return false;
+ }
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+ info->control.vif = vif;
+
+ __skb_unlink(skb, skbs);
+ ieee80211_drv_tx(local, vif, sta, skb);
+ }
+
+ return true;
+}
+
+/*
+ * Returns false if the frame couldn't be transmitted but was queued instead.
+ */
+static bool __ieee80211_tx(struct ieee80211_local *local,
+ struct sk_buff_head *skbs, int led_len,
+ struct sta_info *sta, bool txpending)
+{
+ struct ieee80211_tx_info *info;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_vif *vif;
+ struct ieee80211_sta *pubsta;
+ struct sk_buff *skb;
+ bool result = true;
+ __le16 fc;
+
+ if (WARN_ON(skb_queue_empty(skbs)))
+ return true;
+
+ skb = skb_peek(skbs);
+ fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
+ info = IEEE80211_SKB_CB(skb);
+ sdata = vif_to_sdata(info->control.vif);
+ if (sta && !sta->uploaded)
+ sta = NULL;
+
+ if (sta)
+ pubsta = &sta->sta;
+ else
+ pubsta = NULL;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_MONITOR:
+ if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) {
+ vif = &sdata->vif;
+ break;
+ }
+ sdata = rcu_dereference(local->monitor_sdata);
+ if (sdata) {
+ vif = &sdata->vif;
+ info->hw_queue =
+ vif->hw_queue[skb_get_queue_mapping(skb)];
+ } else if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) {
+ dev_kfree_skb(skb);
+ return true;
+ } else
+ vif = NULL;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ sdata = container_of(sdata->bss,
+ struct ieee80211_sub_if_data, u.ap);
+ /* fall through */
+ default:
+ vif = &sdata->vif;
+ break;
+ }
+
+ result = ieee80211_tx_frags(local, vif, pubsta, skbs,
+ txpending);
+
+ ieee80211_tpt_led_trig_tx(local, fc, led_len);
+
+ WARN_ON_ONCE(!skb_queue_empty(skbs));
+
+ return result;
+}
+
+/*
+ * Invoke TX handlers, return 0 on success and non-zero if the
+ * frame was dropped or queued.
+ */
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+ ieee80211_tx_result res = TX_DROP;
+
+#define CALL_TXH(txh) \
+ do { \
+ res = txh(tx); \
+ if (res != TX_CONTINUE) \
+ goto txh_done; \
+ } while (0)
+
+ CALL_TXH(ieee80211_tx_h_dynamic_ps);
+ CALL_TXH(ieee80211_tx_h_check_assoc);
+ CALL_TXH(ieee80211_tx_h_ps_buf);
+ CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
+ CALL_TXH(ieee80211_tx_h_select_key);
+ if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+ CALL_TXH(ieee80211_tx_h_rate_ctrl);
+
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
+ __skb_queue_tail(&tx->skbs, tx->skb);
+ tx->skb = NULL;
+ goto txh_done;
+ }
+
+ CALL_TXH(ieee80211_tx_h_michael_mic_add);
+ CALL_TXH(ieee80211_tx_h_sequence);
+ CALL_TXH(ieee80211_tx_h_fragment);
+ /* handlers after fragment must be aware of tx info fragmentation! */
+ CALL_TXH(ieee80211_tx_h_stats);
+ CALL_TXH(ieee80211_tx_h_encrypt);
+ if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+ CALL_TXH(ieee80211_tx_h_calculate_duration);
+#undef CALL_TXH
+
+ txh_done:
+ if (unlikely(res == TX_DROP)) {
+ I802_DEBUG_INC(tx->local->tx_handlers_drop);
+ if (tx->skb)
+ ieee80211_free_txskb(&tx->local->hw, tx->skb);
+ else
+ ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+ return -1;
+ } else if (unlikely(res == TX_QUEUED)) {
+ I802_DEBUG_INC(tx->local->tx_handlers_queued);
+ return -1;
+ }
+
+ return 0;
+}
+
+bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif, struct sk_buff *skb,
+ int band, struct ieee80211_sta **sta)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_data tx;
+ struct sk_buff *skb2;
+
+ if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP)
+ return false;
+
+ info->band = band;
+ info->control.vif = vif;
+ info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)];
+
+ if (invoke_tx_handlers(&tx))
+ return false;
+
+ if (sta) {
+ if (tx.sta)
+ *sta = &tx.sta->sta;
+ else
+ *sta = NULL;
+ }
+
+ /* this function isn't suitable for fragmented data frames */
+ skb2 = __skb_dequeue(&tx.skbs);
+ if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) {
+ ieee80211_free_txskb(hw, skb2);
+ ieee80211_purge_tx_queue(hw, &tx.skbs);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(ieee80211_tx_prepare_skb);
+
+/*
+ * Returns false if the frame couldn't be transmitted but was queued instead.
+ */
+static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, struct sk_buff *skb,
+ bool txpending)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_data tx;
+ ieee80211_tx_result res_prepare;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ bool result = true;
+ int led_len;
+
+ if (unlikely(skb->len < 10)) {
+ dev_kfree_skb(skb);
+ return true;
+ }
+
+ /* initialises tx */
+ led_len = skb->len;
+ res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb);
+
+ if (unlikely(res_prepare == TX_DROP)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return true;
+ } else if (unlikely(res_prepare == TX_QUEUED)) {
+ return true;
+ }
+
+ /* set up hw_queue value early */
+ if (!(info->flags & IEEE80211_TX_CTL_TX_OFFCHAN) ||
+ !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL))
+ info->hw_queue =
+ sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+ if (!invoke_tx_handlers(&tx))
+ result = __ieee80211_tx(local, &tx.skbs, led_len,
+ tx.sta, txpending);
+
+ return result;
+}
+
+/* device xmit handlers */
+
+static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ int head_need, bool may_encrypt)
+{
+ struct ieee80211_local *local = sdata->local;
+ int tail_need = 0;
+
+ if (may_encrypt && sdata->crypto_tx_tailroom_needed_cnt) {
+ tail_need = IEEE80211_ENCRYPT_TAILROOM;
+ tail_need -= skb_tailroom(skb);
+ tail_need = max_t(int, tail_need, 0);
+ }
+
+ if (skb_cloned(skb) &&
+ (!(local->hw.flags & IEEE80211_HW_SUPPORTS_CLONED_SKBS) ||
+ !skb_clone_writable(skb, ETH_HLEN) ||
+ sdata->crypto_tx_tailroom_needed_cnt))
+ I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
+ else if (head_need || tail_need)
+ I802_DEBUG_INC(local->tx_expand_skb_head);
+ else
+ return 0;
+
+ if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) {
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ int headroom;
+ bool may_encrypt;
+
+ may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT);
+
+ headroom = local->tx_headroom;
+ if (may_encrypt)
+ headroom += sdata->encrypt_headroom;
+ headroom -= skb_headroom(skb);
+ headroom = max_t(int, 0, headroom);
+
+ if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return;
+ }
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+ info->control.vif = &sdata->vif;
+
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ if (ieee80211_is_data(hdr->frame_control) &&
+ is_unicast_ether_addr(hdr->addr1)) {
+ if (mesh_nexthop_resolve(sdata, skb))
+ return; /* skb queued: don't free */
+ } else {
+ ieee80211_mps_set_frame_flags(sdata, NULL, hdr);
+ }
+ }
+
+ ieee80211_set_qos_hdr(sdata, skb);
+ ieee80211_tx(sdata, sta, skb, false);
+}
+
+static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
+{
+ struct ieee80211_radiotap_iterator iterator;
+ struct ieee80211_radiotap_header *rthdr =
+ (struct ieee80211_radiotap_header *) skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len,
+ NULL);
+ u16 txflags;
+
+ info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+ IEEE80211_TX_CTL_DONTFRAG;
+
+ /*
+ * for every radiotap entry that is present
+ * (ieee80211_radiotap_iterator_next returns -ENOENT when no more
+ * entries present, or -EINVAL on error)
+ */
+
+ while (!ret) {
+ ret = ieee80211_radiotap_iterator_next(&iterator);
+
+ if (ret)
+ continue;
+
+ /* see if this argument is something we can use */
+ switch (iterator.this_arg_index) {
+ /*
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned. Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ */
+ case IEEE80211_RADIOTAP_FLAGS:
+ if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FCS) {
+ /*
+ * this indicates that the skb we have been
+ * handed has the 32-bit FCS CRC at the end...
+ * we should react to that by snipping it off
+ * because it will be recomputed and added
+ * on transmission
+ */
+ if (skb->len < (iterator._max_length + FCS_LEN))
+ return false;
+
+ skb_trim(skb, skb->len - FCS_LEN);
+ }
+ if (*iterator.this_arg & IEEE80211_RADIOTAP_F_WEP)
+ info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ if (*iterator.this_arg & IEEE80211_RADIOTAP_F_FRAG)
+ info->flags &= ~IEEE80211_TX_CTL_DONTFRAG;
+ break;
+
+ case IEEE80211_RADIOTAP_TX_FLAGS:
+ txflags = get_unaligned_le16(iterator.this_arg);
+ if (txflags & IEEE80211_RADIOTAP_F_TX_NOACK)
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ break;
+
+ /*
+ * Please update the file
+ * Documentation/networking/mac80211-injection.txt
+ * when parsing new fields here.
+ */
+
+ default:
+ break;
+ }
+ }
+
+ if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */
+ return false;
+
+ /*
+ * remove the radiotap header
+ * iterator->_max_length was sanity-checked against
+ * skb->len by iterator init
+ */
+ skb_pull(skb, iterator._max_length);
+
+ return true;
+}
+
+netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_radiotap_header *prthdr =
+ (struct ieee80211_radiotap_header *)skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_sub_if_data *tmp_sdata, *sdata;
+ struct cfg80211_chan_def *chandef;
+ u16 len_rthdr;
+ int hdrlen;
+
+ /* check for not even having the fixed radiotap header part */
+ if (unlikely(skb->len < sizeof(struct ieee80211_radiotap_header)))
+ goto fail; /* too short to be possibly valid */
+
+ /* is it a header version we can trust to find length from? */
+ if (unlikely(prthdr->it_version))
+ goto fail; /* only version 0 is supported */
+
+ /* then there must be a radiotap header with a length we can use */
+ len_rthdr = ieee80211_get_radiotap_len(skb->data);
+
+ /* does the skb contain enough to deliver on the alleged length? */
+ if (unlikely(skb->len < len_rthdr))
+ goto fail; /* skb too short for claimed rt header extent */
+
+ /*
+ * fix up the pointers accounting for the radiotap
+ * header still being in there. We are being given
+ * a precooked IEEE80211 header so no need for
+ * normal processing
+ */
+ skb_set_mac_header(skb, len_rthdr);
+ /*
+ * these are just fixed to the end of the rt area since we
+ * don't have any better information and at this point, nobody cares
+ */
+ skb_set_network_header(skb, len_rthdr);
+ skb_set_transport_header(skb, len_rthdr);
+
+ if (skb->len < len_rthdr + 2)
+ goto fail;
+
+ hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr);
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ if (skb->len < len_rthdr + hdrlen)
+ goto fail;
+
+ /*
+ * Initialize skb->protocol if the injected frame is a data frame
+ * carrying a rfc1042 header
+ */
+ if (ieee80211_is_data(hdr->frame_control) &&
+ skb->len >= len_rthdr + hdrlen + sizeof(rfc1042_header) + 2) {
+ u8 *payload = (u8 *)hdr + hdrlen;
+
+ if (ether_addr_equal(payload, rfc1042_header))
+ skb->protocol = cpu_to_be16((payload[6] << 8) |
+ payload[7]);
+ }
+
+ memset(info, 0, sizeof(*info));
+
+ info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
+ IEEE80211_TX_CTL_INJECTED;
+
+ /* process and remove the injection radiotap header */
+ if (!ieee80211_parse_tx_radiotap(skb))
+ goto fail;
+
+ rcu_read_lock();
+
+ /*
+ * We process outgoing injected frames that have a local address
+ * we handle as though they are non-injected frames.
+ * This code here isn't entirely correct, the local MAC address
+ * isn't always enough to find the interface to use; for proper
+ * VLAN/WDS support we will need a different mechanism (which
+ * likely isn't going to be monitor interfaces).
+ */
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(tmp_sdata))
+ continue;
+ if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ tmp_sdata->vif.type == NL80211_IFTYPE_WDS)
+ continue;
+ if (ether_addr_equal(tmp_sdata->vif.addr, hdr->addr2)) {
+ sdata = tmp_sdata;
+ break;
+ }
+ }
+
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ tmp_sdata = rcu_dereference(local->monitor_sdata);
+ if (tmp_sdata)
+ chanctx_conf =
+ rcu_dereference(tmp_sdata->vif.chanctx_conf);
+ }
+
+ if (chanctx_conf)
+ chandef = &chanctx_conf->def;
+ else if (!local->use_chanctx)
+ chandef = &local->_oper_chandef;
+ else
+ goto fail_rcu;
+
+ /*
+ * Frame injection is not allowed if beaconing is not allowed
+ * or if we need radar detection. Beaconing is usually not allowed when
+ * the mode or operation (Adhoc, AP, Mesh) does not support DFS.
+ * Passive scan is also used in world regulatory domains where
+ * your country is not known and as such it should be treated as
+ * NO TX unless the channel is explicitly allowed in which case
+ * your current regulatory domain would not have the passive scan
+ * flag.
+ *
+ * Since AP mode uses monitor interfaces to inject/TX management
+ * frames we can make AP mode the exception to this rule once it
+ * supports radar detection as its implementation can deal with
+ * radar detection by itself. We can do that later by adding a
+ * monitor flag interfaces used for AP support.
+ */
+ if (!cfg80211_reg_can_beacon(local->hw.wiphy, chandef,
+ sdata->vif.type))
+ goto fail_rcu;
+
+ info->band = chandef->chan->band;
+ ieee80211_xmit(sdata, NULL, skb);
+ rcu_read_unlock();
+
+ return NETDEV_TX_OK;
+
+fail_rcu:
+ rcu_read_unlock();
+fail:
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK; /* meaning, we dealt with the skb */
+}
+
+static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb)
+{
+ u16 ethertype = (skb->data[12] << 8) | skb->data[13];
+
+ return ethertype == ETH_P_TDLS &&
+ skb->len > 14 &&
+ skb->data[14] == WLAN_TDLS_SNAP_RFTYPE;
+}
+
+static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct sta_info **sta_out)
+{
+ struct sta_info *sta;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ sta = rcu_dereference(sdata->u.vlan.sta);
+ if (sta) {
+ *sta_out = sta;
+ return 0;
+ } else if (sdata->wdev.use_4addr) {
+ return -ENOLINK;
+ }
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_ADHOC:
+ if (is_multicast_ether_addr(skb->data)) {
+ *sta_out = ERR_PTR(-ENOENT);
+ return 0;
+ }
+ sta = sta_info_get_bss(sdata, skb->data);
+ break;
+ case NL80211_IFTYPE_WDS:
+ sta = sta_info_get(sdata, sdata->u.wds.remote_addr);
+ break;
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ /* determined much later */
+ *sta_out = NULL;
+ return 0;
+#endif
+ case NL80211_IFTYPE_STATION:
+ if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) {
+ sta = sta_info_get(sdata, skb->data);
+ if (sta) {
+ bool tdls_peer, tdls_auth;
+
+ tdls_peer = test_sta_flag(sta,
+ WLAN_STA_TDLS_PEER);
+ tdls_auth = test_sta_flag(sta,
+ WLAN_STA_TDLS_PEER_AUTH);
+
+ if (tdls_peer && tdls_auth) {
+ *sta_out = sta;
+ return 0;
+ }
+
+ /*
+ * TDLS link during setup - throw out frames to
+ * peer. Allow TDLS-setup frames to unauthorized
+ * peers for the special case of a link teardown
+ * after a TDLS sta is removed due to being
+ * unreachable.
+ */
+ if (tdls_peer && !tdls_auth &&
+ !ieee80211_is_tdls_setup(skb))
+ return -EINVAL;
+ }
+
+ }
+
+ sta = sta_info_get(sdata, sdata->u.mgd.bssid);
+ if (!sta)
+ return -ENOLINK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *sta_out = sta ?: ERR_PTR(-ENOENT);
+ return 0;
+}
+
+/**
+ * ieee80211_build_hdr - build 802.11 header in the given frame
+ * @sdata: virtual interface to build the header for
+ * @skb: the skb to build the header in
+ * @info_flags: skb flags to set
+ *
+ * This function takes the skb with 802.3 header and reformats the header to
+ * the appropriate IEEE 802.11 header based on which interface the packet is
+ * being transmitted on.
+ *
+ * Note that this function also takes care of the TX status request and
+ * potential unsharing of the SKB - this needs to be interleaved with the
+ * header building.
+ *
+ * The function requires the read-side RCU lock held
+ *
+ * Returns: the (possibly reallocated) skb or an ERR_PTR() code
+ */
+static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u32 info_flags,
+ struct sta_info *sta)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_info *info;
+ int head_need;
+ u16 ethertype, hdrlen, meshhdrlen = 0;
+ __le16 fc;
+ struct ieee80211_hdr hdr;
+ struct ieee80211s_hdr mesh_hdr __maybe_unused;
+ struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL;
+ const u8 *encaps_data;
+ int encaps_len, skip_header_bytes;
+ int nh_pos, h_pos;
+ bool wme_sta = false, authorized = false;
+ bool tdls_peer;
+ bool multicast;
+ u16 info_id = 0;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_sub_if_data *ap_sdata;
+ enum ieee80211_band band;
+ int ret;
+
+ if (IS_ERR(sta))
+ sta = NULL;
+
+ /* convert Ethernet header to proper 802.11 header (based on
+ * operation mode) */
+ ethertype = (skb->data[12] << 8) | skb->data[13];
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ if (sdata->wdev.use_4addr) {
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN);
+ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data, ETH_ALEN);
+ memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+ hdrlen = 30;
+ authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
+ wme_sta = sta->sta.wme;
+ }
+ ap_sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+ u.ap);
+ chanctx_conf = rcu_dereference(ap_sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ band = chanctx_conf->def.chan->band;
+ if (sdata->wdev.use_4addr)
+ break;
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ /* DA BSSID SA */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
+ hdrlen = 24;
+ band = chanctx_conf->def.chan->band;
+ break;
+ case NL80211_IFTYPE_WDS:
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN);
+ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data, ETH_ALEN);
+ memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+ hdrlen = 30;
+ /*
+ * This is the exception! WDS style interfaces are prohibited
+ * when channel contexts are in used so this must be valid
+ */
+ band = local->hw.conf.chandef.chan->band;
+ break;
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!is_multicast_ether_addr(skb->data)) {
+ struct sta_info *next_hop;
+ bool mpp_lookup = true;
+
+ mpath = mesh_path_lookup(sdata, skb->data);
+ if (mpath) {
+ mpp_lookup = false;
+ next_hop = rcu_dereference(mpath->next_hop);
+ if (!next_hop ||
+ !(mpath->flags & (MESH_PATH_ACTIVE |
+ MESH_PATH_RESOLVING)))
+ mpp_lookup = true;
+ }
+
+ if (mpp_lookup)
+ mppath = mpp_path_lookup(sdata, skb->data);
+
+ if (mppath && mpath)
+ mesh_path_del(mpath->sdata, mpath->dst);
+ }
+
+ /*
+ * Use address extension if it is a packet from
+ * another interface or if we know the destination
+ * is being proxied by a portal (i.e. portal address
+ * differs from proxied address)
+ */
+ if (ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN) &&
+ !(mppath && !ether_addr_equal(mppath->mpp, skb->data))) {
+ hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
+ skb->data, skb->data + ETH_ALEN);
+ meshhdrlen = ieee80211_new_mesh_header(sdata, &mesh_hdr,
+ NULL, NULL);
+ } else {
+ /* DS -> MBSS (802.11-2012 13.11.3.3).
+ * For unicast with unknown forwarding information,
+ * destination might be in the MBSS or if that fails
+ * forwarded to another mesh gate. In either case
+ * resolution will be handled in ieee80211_xmit(), so
+ * leave the original DA. This also works for mcast */
+ const u8 *mesh_da = skb->data;
+
+ if (mppath)
+ mesh_da = mppath->mpp;
+ else if (mpath)
+ mesh_da = mpath->dst;
+
+ hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
+ mesh_da, sdata->vif.addr);
+ if (is_multicast_ether_addr(mesh_da))
+ /* DA TA mSA AE:SA */
+ meshhdrlen = ieee80211_new_mesh_header(
+ sdata, &mesh_hdr,
+ skb->data + ETH_ALEN, NULL);
+ else
+ /* RA TA mDA mSA AE:DA SA */
+ meshhdrlen = ieee80211_new_mesh_header(
+ sdata, &mesh_hdr, skb->data,
+ skb->data + ETH_ALEN);
+
+ }
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ band = chanctx_conf->def.chan->band;
+ break;
+#endif
+ case NL80211_IFTYPE_STATION:
+ /* we already did checks when looking up the RA STA */
+ tdls_peer = test_sta_flag(sta, WLAN_STA_TDLS_PEER);
+
+ if (tdls_peer) {
+ /* DA SA BSSID */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ memcpy(hdr.addr3, sdata->u.mgd.bssid, ETH_ALEN);
+ hdrlen = 24;
+ } else if (sdata->u.mgd.use_4addr &&
+ cpu_to_be16(ethertype) != sdata->control_port_protocol) {
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+ IEEE80211_FCTL_TODS);
+ /* RA TA DA SA */
+ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data, ETH_ALEN);
+ memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+ hdrlen = 30;
+ } else {
+ fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+ /* BSSID SA DA */
+ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data, ETH_ALEN);
+ hdrlen = 24;
+ }
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ band = chanctx_conf->def.chan->band;
+ break;
+ case NL80211_IFTYPE_OCB:
+ /* DA SA BSSID */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ eth_broadcast_addr(hdr.addr3);
+ hdrlen = 24;
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ band = chanctx_conf->def.chan->band;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ /* DA SA BSSID */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
+ hdrlen = 24;
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!chanctx_conf) {
+ ret = -ENOTCONN;
+ goto free;
+ }
+ band = chanctx_conf->def.chan->band;
+ break;
+ default:
+ ret = -EINVAL;
+ goto free;
+ }
+
+ multicast = is_multicast_ether_addr(hdr.addr1);
+
+ /* sta is always NULL for mesh */
+ if (sta) {
+ authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
+ wme_sta = sta->sta.wme;
+ } else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ /* For mesh, the use of the QoS header is mandatory */
+ wme_sta = true;
+ }
+
+ /* receiver does QoS (which also means we do) use it */
+ if (wme_sta) {
+ fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+ hdrlen += 2;
+ }
+
+ /*
+ * Drop unicast frames to unauthorised stations unless they are
+ * EAPOL frames from the local station.
+ */
+ if (unlikely(!ieee80211_vif_is_mesh(&sdata->vif) &&
+ (sdata->vif.type != NL80211_IFTYPE_OCB) &&
+ !multicast && !authorized &&
+ (cpu_to_be16(ethertype) != sdata->control_port_protocol ||
+ !ether_addr_equal(sdata->vif.addr, skb->data + ETH_ALEN)))) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ net_info_ratelimited("%s: dropped frame to %pM (unauthorized port)\n",
+ sdata->name, hdr.addr1);
+#endif
+
+ I802_DEBUG_INC(local->tx_handlers_drop_unauth_port);
+
+ ret = -EPERM;
+ goto free;
+ }
+
+ if (unlikely(!multicast && skb->sk &&
+ skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) {
+ struct sk_buff *ack_skb = skb_clone_sk(skb);
+
+ if (ack_skb) {
+ unsigned long flags;
+ int id;
+
+ spin_lock_irqsave(&local->ack_status_lock, flags);
+ id = idr_alloc(&local->ack_status_frames, ack_skb,
+ 1, 0x10000, GFP_ATOMIC);
+ spin_unlock_irqrestore(&local->ack_status_lock, flags);
+
+ if (id >= 0) {
+ info_id = id;
+ info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+ } else {
+ kfree_skb(ack_skb);
+ }
+ }
+ }
+
+ /*
+ * If the skb is shared we need to obtain our own copy.
+ */
+ if (skb_shared(skb)) {
+ struct sk_buff *tmp_skb = skb;
+
+ /* can't happen -- skb is a clone if info_id != 0 */
+ WARN_ON(info_id);
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ kfree_skb(tmp_skb);
+
+ if (!skb) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
+ hdr.frame_control = fc;
+ hdr.duration_id = 0;
+ hdr.seq_ctrl = 0;
+
+ skip_header_bytes = ETH_HLEN;
+ if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) {
+ encaps_data = bridge_tunnel_header;
+ encaps_len = sizeof(bridge_tunnel_header);
+ skip_header_bytes -= 2;
+ } else if (ethertype >= ETH_P_802_3_MIN) {
+ encaps_data = rfc1042_header;
+ encaps_len = sizeof(rfc1042_header);
+ skip_header_bytes -= 2;
+ } else {
+ encaps_data = NULL;
+ encaps_len = 0;
+ }
+
+ nh_pos = skb_network_header(skb) - skb->data;
+ h_pos = skb_transport_header(skb) - skb->data;
+
+ skb_pull(skb, skip_header_bytes);
+ nh_pos -= skip_header_bytes;
+ h_pos -= skip_header_bytes;
+
+ head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb);
+
+ /*
+ * So we need to modify the skb header and hence need a copy of
+ * that. The head_need variable above doesn't, so far, include
+ * the needed header space that we don't need right away. If we
+ * can, then we don't reallocate right now but only after the
+ * frame arrives at the master device (if it does...)
+ *
+ * If we cannot, however, then we will reallocate to include all
+ * the ever needed space. Also, if we need to reallocate it anyway,
+ * make it big enough for everything we may ever need.
+ */
+
+ if (head_need > 0 || skb_cloned(skb)) {
+ head_need += sdata->encrypt_headroom;
+ head_need += local->tx_headroom;
+ head_need = max_t(int, 0, head_need);
+ if (ieee80211_skb_resize(sdata, skb, head_need, true)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ skb = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (encaps_data) {
+ memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
+ nh_pos += encaps_len;
+ h_pos += encaps_len;
+ }
+
+#ifdef CONFIG_MAC80211_MESH
+ if (meshhdrlen > 0) {
+ memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen);
+ nh_pos += meshhdrlen;
+ h_pos += meshhdrlen;
+ }
+#endif
+
+ if (ieee80211_is_data_qos(fc)) {
+ __le16 *qos_control;
+
+ qos_control = (__le16 *) skb_push(skb, 2);
+ memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2);
+ /*
+ * Maybe we could actually set some fields here, for now just
+ * initialise to zero to indicate no special operation.
+ */
+ *qos_control = 0;
+ } else
+ memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
+
+ nh_pos += hdrlen;
+ h_pos += hdrlen;
+
+ /* Update skb pointers to various headers since this modified frame
+ * is going to go through Linux networking code that may potentially
+ * need things like pointer to IP header. */
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, nh_pos);
+ skb_set_transport_header(skb, h_pos);
+
+ info = IEEE80211_SKB_CB(skb);
+ memset(info, 0, sizeof(*info));
+
+ info->flags = info_flags;
+ info->ack_frame_id = info_id;
+ info->band = band;
+
+ return skb;
+ free:
+ kfree_skb(skb);
+ return ERR_PTR(ret);
+}
+
+void __ieee80211_subif_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ u32 info_flags)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+
+ if (unlikely(skb->len < ETH_HLEN)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ rcu_read_lock();
+
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ skb = ieee80211_build_hdr(sdata, skb, info_flags, sta);
+ if (IS_ERR(skb))
+ goto out;
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+ dev->trans_start = jiffies;
+
+ ieee80211_xmit(sdata, sta, skb);
+ out:
+ rcu_read_unlock();
+}
+
+/**
+ * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs
+ * @skb: packet to be sent
+ * @dev: incoming interface
+ *
+ * On failure skb will be freed.
+ */
+netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ __ieee80211_subif_start_xmit(skb, dev, 0);
+ return NETDEV_TX_OK;
+}
+
+struct sk_buff *
+ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u32 info_flags)
+{
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_tx_data tx = {
+ .local = sdata->local,
+ .sdata = sdata,
+ };
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
+ kfree_skb(skb);
+ skb = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ skb = ieee80211_build_hdr(sdata, skb, info_flags, sta);
+ if (IS_ERR(skb))
+ goto out;
+
+ hdr = (void *)skb->data;
+ tx.sta = sta_info_get(sdata, hdr->addr1);
+ tx.skb = skb;
+
+ if (ieee80211_tx_h_select_key(&tx) != TX_CONTINUE) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+ }
+
+out:
+ rcu_read_unlock();
+ return skb;
+}
+
+/*
+ * ieee80211_clear_tx_pending may not be called in a context where
+ * it is possible that it packets could come in again.
+ */
+void ieee80211_clear_tx_pending(struct ieee80211_local *local)
+{
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < local->hw.queues; i++) {
+ while ((skb = skb_dequeue(&local->pending[i])) != NULL)
+ ieee80211_free_txskb(&local->hw, skb);
+ }
+}
+
+/*
+ * Returns false if the frame couldn't be transmitted but was queued instead,
+ * which in this case means re-queued -- take as an indication to stop sending
+ * more pending frames.
+ */
+static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
+ struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_sub_if_data *sdata;
+ struct sta_info *sta;
+ struct ieee80211_hdr *hdr;
+ bool result;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ sdata = vif_to_sdata(info->control.vif);
+
+ if (info->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING) {
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (unlikely(!chanctx_conf)) {
+ dev_kfree_skb(skb);
+ return true;
+ }
+ info->band = chanctx_conf->def.chan->band;
+ result = ieee80211_tx(sdata, NULL, skb, true);
+ } else {
+ struct sk_buff_head skbs;
+
+ __skb_queue_head_init(&skbs);
+ __skb_queue_tail(&skbs, skb);
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ sta = sta_info_get(sdata, hdr->addr1);
+
+ result = __ieee80211_tx(local, &skbs, skb->len, sta, true);
+ }
+
+ return result;
+}
+
+/*
+ * Transmit all pending packets. Called from tasklet.
+ */
+void ieee80211_tx_pending(unsigned long data)
+{
+ struct ieee80211_local *local = (struct ieee80211_local *)data;
+ unsigned long flags;
+ int i;
+ bool txok;
+
+ rcu_read_lock();
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ for (i = 0; i < local->hw.queues; i++) {
+ /*
+ * If queue is stopped by something other than due to pending
+ * frames, or we have no pending frames, proceed to next queue.
+ */
+ if (local->queue_stop_reasons[i] ||
+ skb_queue_empty(&local->pending[i]))
+ continue;
+
+ while (!skb_queue_empty(&local->pending[i])) {
+ struct sk_buff *skb = __skb_dequeue(&local->pending[i]);
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+ if (WARN_ON(!info->control.vif)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ continue;
+ }
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+ flags);
+
+ txok = ieee80211_tx_pending_skb(local, skb);
+ spin_lock_irqsave(&local->queue_stop_reason_lock,
+ flags);
+ if (!txok)
+ break;
+ }
+
+ if (skb_queue_empty(&local->pending[i]))
+ ieee80211_propagate_queue_wake(local, i);
+ }
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+ rcu_read_unlock();
+}
+
+/* functions for drivers to get certain frames */
+
+static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
+ struct ps_data *ps, struct sk_buff *skb,
+ bool is_template)
+{
+ u8 *pos, *tim;
+ int aid0 = 0;
+ int i, have_bits = 0, n1, n2;
+
+ /* Generate bitmap for TIM only if there are any STAs in power save
+ * mode. */
+ if (atomic_read(&ps->num_sta_ps) > 0)
+ /* in the hope that this is faster than
+ * checking byte-for-byte */
+ have_bits = !bitmap_empty((unsigned long *)ps->tim,
+ IEEE80211_MAX_AID+1);
+ if (!is_template) {
+ if (ps->dtim_count == 0)
+ ps->dtim_count = sdata->vif.bss_conf.dtim_period - 1;
+ else
+ ps->dtim_count--;
+ }
+
+ tim = pos = (u8 *) skb_put(skb, 6);
+ *pos++ = WLAN_EID_TIM;
+ *pos++ = 4;
+ *pos++ = ps->dtim_count;
+ *pos++ = sdata->vif.bss_conf.dtim_period;
+
+ if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf))
+ aid0 = 1;
+
+ ps->dtim_bc_mc = aid0 == 1;
+
+ if (have_bits) {
+ /* Find largest even number N1 so that bits numbered 1 through
+ * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits
+ * (N2 + 1) x 8 through 2007 are 0. */
+ n1 = 0;
+ for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) {
+ if (ps->tim[i]) {
+ n1 = i & 0xfe;
+ break;
+ }
+ }
+ n2 = n1;
+ for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) {
+ if (ps->tim[i]) {
+ n2 = i;
+ break;
+ }
+ }
+
+ /* Bitmap control */
+ *pos++ = n1 | aid0;
+ /* Part Virt Bitmap */
+ skb_put(skb, n2 - n1);
+ memcpy(pos, ps->tim + n1, n2 - n1 + 1);
+
+ tim[1] = n2 - n1 + 4;
+ } else {
+ *pos++ = aid0; /* Bitmap control */
+ *pos++ = 0; /* Part Virt Bitmap */
+ }
+}
+
+static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
+ struct ps_data *ps, struct sk_buff *skb,
+ bool is_template)
+{
+ struct ieee80211_local *local = sdata->local;
+
+ /*
+ * Not very nice, but we want to allow the driver to call
+ * ieee80211_beacon_get() as a response to the set_tim()
+ * callback. That, however, is already invoked under the
+ * sta_lock to guarantee consistent and race-free update
+ * of the tim bitmap in mac80211 and the driver.
+ */
+ if (local->tim_in_locked_section) {
+ __ieee80211_beacon_add_tim(sdata, ps, skb, is_template);
+ } else {
+ spin_lock_bh(&local->tim_lock);
+ __ieee80211_beacon_add_tim(sdata, ps, skb, is_template);
+ spin_unlock_bh(&local->tim_lock);
+ }
+
+ return 0;
+}
+
+static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata,
+ struct beacon_data *beacon)
+{
+ struct probe_resp *resp;
+ u8 *beacon_data;
+ size_t beacon_data_len;
+ int i;
+ u8 count = beacon->csa_current_counter;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ beacon_data = beacon->tail;
+ beacon_data_len = beacon->tail_len;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ beacon_data = beacon->head;
+ beacon_data_len = beacon->head_len;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ beacon_data = beacon->head;
+ beacon_data_len = beacon->head_len;
+ break;
+ default:
+ return;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; ++i) {
+ resp = rcu_dereference(sdata->u.ap.probe_resp);
+
+ if (beacon->csa_counter_offsets[i]) {
+ if (WARN_ON_ONCE(beacon->csa_counter_offsets[i] >=
+ beacon_data_len)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ beacon_data[beacon->csa_counter_offsets[i]] = count;
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP && resp)
+ resp->data[resp->csa_counter_offsets[i]] = count;
+ }
+ rcu_read_unlock();
+}
+
+u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct beacon_data *beacon = NULL;
+ u8 count = 0;
+
+ rcu_read_lock();
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ beacon = rcu_dereference(sdata->u.ap.beacon);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ beacon = rcu_dereference(sdata->u.ibss.presp);
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ beacon = rcu_dereference(sdata->u.mesh.beacon);
+
+ if (!beacon)
+ goto unlock;
+
+ beacon->csa_current_counter--;
+
+ /* the counter should never reach 0 */
+ WARN_ON_ONCE(!beacon->csa_current_counter);
+ count = beacon->csa_current_counter;
+
+unlock:
+ rcu_read_unlock();
+ return count;
+}
+EXPORT_SYMBOL(ieee80211_csa_update_counter);
+
+bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct beacon_data *beacon = NULL;
+ u8 *beacon_data;
+ size_t beacon_data_len;
+ int ret = false;
+
+ if (!ieee80211_sdata_running(sdata))
+ return false;
+
+ rcu_read_lock();
+ if (vif->type == NL80211_IFTYPE_AP) {
+ struct ieee80211_if_ap *ap = &sdata->u.ap;
+
+ beacon = rcu_dereference(ap->beacon);
+ if (WARN_ON(!beacon || !beacon->tail))
+ goto out;
+ beacon_data = beacon->tail;
+ beacon_data_len = beacon->tail_len;
+ } else if (vif->type == NL80211_IFTYPE_ADHOC) {
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ beacon = rcu_dereference(ifibss->presp);
+ if (!beacon)
+ goto out;
+
+ beacon_data = beacon->head;
+ beacon_data_len = beacon->head_len;
+ } else if (vif->type == NL80211_IFTYPE_MESH_POINT) {
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ beacon = rcu_dereference(ifmsh->beacon);
+ if (!beacon)
+ goto out;
+
+ beacon_data = beacon->head;
+ beacon_data_len = beacon->head_len;
+ } else {
+ WARN_ON(1);
+ goto out;
+ }
+
+ if (!beacon->csa_counter_offsets[0])
+ goto out;
+
+ if (WARN_ON_ONCE(beacon->csa_counter_offsets[0] > beacon_data_len))
+ goto out;
+
+ if (beacon_data[beacon->csa_counter_offsets[0]] == 1)
+ ret = true;
+ out:
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_csa_is_complete);
+
+static struct sk_buff *
+__ieee80211_beacon_get(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_mutable_offsets *offs,
+ bool is_template)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct beacon_data *beacon = NULL;
+ struct sk_buff *skb = NULL;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_sub_if_data *sdata = NULL;
+ enum ieee80211_band band;
+ struct ieee80211_tx_rate_control txrc;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ int csa_off_base = 0;
+
+ rcu_read_lock();
+
+ sdata = vif_to_sdata(vif);
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+
+ if (!ieee80211_sdata_running(sdata) || !chanctx_conf)
+ goto out;
+
+ if (offs)
+ memset(offs, 0, sizeof(*offs));
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ struct ieee80211_if_ap *ap = &sdata->u.ap;
+
+ beacon = rcu_dereference(ap->beacon);
+ if (beacon) {
+ if (beacon->csa_counter_offsets[0]) {
+ if (!is_template)
+ ieee80211_csa_update_counter(vif);
+
+ ieee80211_set_csa(sdata, beacon);
+ }
+
+ /*
+ * headroom, head length,
+ * tail length and maximum TIM length
+ */
+ skb = dev_alloc_skb(local->tx_headroom +
+ beacon->head_len +
+ beacon->tail_len + 256 +
+ local->hw.extra_beacon_tailroom);
+ if (!skb)
+ goto out;
+
+ skb_reserve(skb, local->tx_headroom);
+ memcpy(skb_put(skb, beacon->head_len), beacon->head,
+ beacon->head_len);
+
+ ieee80211_beacon_add_tim(sdata, &ap->ps, skb,
+ is_template);
+
+ if (offs) {
+ offs->tim_offset = beacon->head_len;
+ offs->tim_length = skb->len - beacon->head_len;
+
+ /* for AP the csa offsets are from tail */
+ csa_off_base = skb->len;
+ }
+
+ if (beacon->tail)
+ memcpy(skb_put(skb, beacon->tail_len),
+ beacon->tail, beacon->tail_len);
+ } else
+ goto out;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_hdr *hdr;
+
+ beacon = rcu_dereference(ifibss->presp);
+ if (!beacon)
+ goto out;
+
+ if (beacon->csa_counter_offsets[0]) {
+ if (!is_template)
+ ieee80211_csa_update_counter(vif);
+
+ ieee80211_set_csa(sdata, beacon);
+ }
+
+ skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
+ local->hw.extra_beacon_tailroom);
+ if (!skb)
+ goto out;
+ skb_reserve(skb, local->tx_headroom);
+ memcpy(skb_put(skb, beacon->head_len), beacon->head,
+ beacon->head_len);
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+ hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_BEACON);
+ } else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ beacon = rcu_dereference(ifmsh->beacon);
+ if (!beacon)
+ goto out;
+
+ if (beacon->csa_counter_offsets[0]) {
+ if (!is_template)
+ /* TODO: For mesh csa_counter is in TU, so
+ * decrementing it by one isn't correct, but
+ * for now we leave it consistent with overall
+ * mac80211's behavior.
+ */
+ ieee80211_csa_update_counter(vif);
+
+ ieee80211_set_csa(sdata, beacon);
+ }
+
+ if (ifmsh->sync_ops)
+ ifmsh->sync_ops->adjust_tbtt(sdata, beacon);
+
+ skb = dev_alloc_skb(local->tx_headroom +
+ beacon->head_len +
+ 256 + /* TIM IE */
+ beacon->tail_len +
+ local->hw.extra_beacon_tailroom);
+ if (!skb)
+ goto out;
+ skb_reserve(skb, local->tx_headroom);
+ memcpy(skb_put(skb, beacon->head_len), beacon->head,
+ beacon->head_len);
+ ieee80211_beacon_add_tim(sdata, &ifmsh->ps, skb, is_template);
+
+ if (offs) {
+ offs->tim_offset = beacon->head_len;
+ offs->tim_length = skb->len - beacon->head_len;
+ }
+
+ memcpy(skb_put(skb, beacon->tail_len), beacon->tail,
+ beacon->tail_len);
+ } else {
+ WARN_ON(1);
+ goto out;
+ }
+
+ /* CSA offsets */
+ if (offs && beacon) {
+ int i;
+
+ for (i = 0; i < IEEE80211_MAX_CSA_COUNTERS_NUM; i++) {
+ u16 csa_off = beacon->csa_counter_offsets[i];
+
+ if (!csa_off)
+ continue;
+
+ offs->csa_counter_offs[i] = csa_off_base + csa_off;
+ }
+ }
+
+ band = chanctx_conf->def.chan->band;
+
+ info = IEEE80211_SKB_CB(skb);
+
+ info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ info->band = band;
+
+ memset(&txrc, 0, sizeof(txrc));
+ txrc.hw = hw;
+ txrc.sband = local->hw.wiphy->bands[band];
+ txrc.bss_conf = &sdata->vif.bss_conf;
+ txrc.skb = skb;
+ txrc.reported_rate.idx = -1;
+ txrc.rate_idx_mask = sdata->rc_rateidx_mask[band];
+ if (txrc.rate_idx_mask == (1 << txrc.sband->n_bitrates) - 1)
+ txrc.max_rate_idx = -1;
+ else
+ txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1;
+ txrc.bss = true;
+ rate_control_get_rate(sdata, NULL, &txrc);
+
+ info->control.vif = vif;
+
+ info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT |
+ IEEE80211_TX_CTL_ASSIGN_SEQ |
+ IEEE80211_TX_CTL_FIRST_FRAGMENT;
+ out:
+ rcu_read_unlock();
+ return skb;
+
+}
+
+struct sk_buff *
+ieee80211_beacon_get_template(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_mutable_offsets *offs)
+{
+ return __ieee80211_beacon_get(hw, vif, offs, true);
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_template);
+
+struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ u16 *tim_offset, u16 *tim_length)
+{
+ struct ieee80211_mutable_offsets offs = {};
+ struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false);
+
+ if (tim_offset)
+ *tim_offset = offs.tim_offset;
+
+ if (tim_length)
+ *tim_length = offs.tim_length;
+
+ return bcn;
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_tim);
+
+struct sk_buff *ieee80211_proberesp_get(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct ieee80211_if_ap *ap = NULL;
+ struct sk_buff *skb = NULL;
+ struct probe_resp *presp = NULL;
+ struct ieee80211_hdr *hdr;
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ return NULL;
+
+ rcu_read_lock();
+
+ ap = &sdata->u.ap;
+ presp = rcu_dereference(ap->probe_resp);
+ if (!presp)
+ goto out;
+
+ skb = dev_alloc_skb(presp->len);
+ if (!skb)
+ goto out;
+
+ memcpy(skb_put(skb, presp->len), presp->data, presp->len);
+
+ hdr = (struct ieee80211_hdr *) skb->data;
+ memset(hdr->addr1, 0, sizeof(hdr->addr1));
+
+out:
+ rcu_read_unlock();
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_proberesp_get);
+
+struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_if_managed *ifmgd;
+ struct ieee80211_pspoll *pspoll;
+ struct ieee80211_local *local;
+ struct sk_buff *skb;
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+ return NULL;
+
+ sdata = vif_to_sdata(vif);
+ ifmgd = &sdata->u.mgd;
+ local = sdata->local;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*pspoll));
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ pspoll = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*pspoll));
+ memset(pspoll, 0, sizeof(*pspoll));
+ pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+ IEEE80211_STYPE_PSPOLL);
+ pspoll->aid = cpu_to_le16(ifmgd->aid);
+
+ /* aid in PS-Poll has its two MSBs each set to 1 */
+ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14);
+
+ memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN);
+ memcpy(pspoll->ta, vif->addr, ETH_ALEN);
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_pspoll_get);
+
+struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct ieee80211_hdr_3addr *nullfunc;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_if_managed *ifmgd;
+ struct ieee80211_local *local;
+ struct sk_buff *skb;
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+ return NULL;
+
+ sdata = vif_to_sdata(vif);
+ ifmgd = &sdata->u.mgd;
+ local = sdata->local;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc));
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (struct ieee80211_hdr_3addr *) skb_put(skb,
+ sizeof(*nullfunc));
+ memset(nullfunc, 0, sizeof(*nullfunc));
+ nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
+ IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_TODS);
+ memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN);
+ memcpy(nullfunc->addr2, vif->addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN);
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_nullfunc_get);
+
+struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw,
+ const u8 *src_addr,
+ const u8 *ssid, size_t ssid_len,
+ size_t tailroom)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_hdr_3addr *hdr;
+ struct sk_buff *skb;
+ size_t ie_ssid_len;
+ u8 *pos;
+
+ ie_ssid_len = 2 + ssid_len;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) +
+ ie_ssid_len + tailroom);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ hdr = (struct ieee80211_hdr_3addr *) skb_put(skb, sizeof(*hdr));
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_REQ);
+ eth_broadcast_addr(hdr->addr1);
+ memcpy(hdr->addr2, src_addr, ETH_ALEN);
+ eth_broadcast_addr(hdr->addr3);
+
+ pos = skb_put(skb, ie_ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ssid_len;
+ if (ssid_len)
+ memcpy(pos, ssid, ssid_len);
+ pos += ssid_len;
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_probereq_get);
+
+void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+ const void *frame, size_t frame_len,
+ const struct ieee80211_tx_info *frame_txctl,
+ struct ieee80211_rts *rts)
+{
+ const struct ieee80211_hdr *hdr = frame;
+
+ rts->frame_control =
+ cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS);
+ rts->duration = ieee80211_rts_duration(hw, vif, frame_len,
+ frame_txctl);
+ memcpy(rts->ra, hdr->addr1, sizeof(rts->ra));
+ memcpy(rts->ta, hdr->addr2, sizeof(rts->ta));
+}
+EXPORT_SYMBOL(ieee80211_rts_get);
+
+void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+ const void *frame, size_t frame_len,
+ const struct ieee80211_tx_info *frame_txctl,
+ struct ieee80211_cts *cts)
+{
+ const struct ieee80211_hdr *hdr = frame;
+
+ cts->frame_control =
+ cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS);
+ cts->duration = ieee80211_ctstoself_duration(hw, vif,
+ frame_len, frame_txctl);
+ memcpy(cts->ra, hdr->addr1, sizeof(cts->ra));
+}
+EXPORT_SYMBOL(ieee80211_ctstoself_get);
+
+struct sk_buff *
+ieee80211_get_buffered_bc(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct sk_buff *skb = NULL;
+ struct ieee80211_tx_data tx;
+ struct ieee80211_sub_if_data *sdata;
+ struct ps_data *ps;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ sdata = vif_to_sdata(vif);
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+
+ if (!chanctx_conf)
+ goto out;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ struct beacon_data *beacon =
+ rcu_dereference(sdata->u.ap.beacon);
+
+ if (!beacon || !beacon->head)
+ goto out;
+
+ ps = &sdata->u.ap.ps;
+ } else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ ps = &sdata->u.mesh.ps;
+ } else {
+ goto out;
+ }
+
+ if (ps->dtim_count != 0 || !ps->dtim_bc_mc)
+ goto out; /* send buffered bc/mc only after DTIM beacon */
+
+ while (1) {
+ skb = skb_dequeue(&ps->bc_buf);
+ if (!skb)
+ goto out;
+ local->total_ps_buffered--;
+
+ if (!skb_queue_empty(&ps->bc_buf) && skb->len >= 2) {
+ struct ieee80211_hdr *hdr =
+ (struct ieee80211_hdr *) skb->data;
+ /* more buffered multicast/broadcast frames ==> set
+ * MoreData flag in IEEE 802.11 header to inform PS
+ * STAs */
+ hdr->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ }
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ sdata = IEEE80211_DEV_TO_SUB_IF(skb->dev);
+ if (!ieee80211_tx_prepare(sdata, &tx, NULL, skb))
+ break;
+ dev_kfree_skb_any(skb);
+ }
+
+ info = IEEE80211_SKB_CB(skb);
+
+ tx.flags |= IEEE80211_TX_PS_BUFFERED;
+ info->band = chanctx_conf->def.chan->band;
+
+ if (invoke_tx_handlers(&tx))
+ skb = NULL;
+ out:
+ rcu_read_unlock();
+
+ return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_buffered_bc);
+
+int ieee80211_reserve_tid(struct ieee80211_sta *pubsta, u8 tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+ u32 queues;
+
+ lockdep_assert_held(&local->sta_mtx);
+
+ /* only some cases are supported right now */
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ break;
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (WARN_ON(tid >= IEEE80211_NUM_UPS))
+ return -EINVAL;
+
+ if (sta->reserved_tid == tid) {
+ ret = 0;
+ goto out;
+ }
+
+ if (sta->reserved_tid != IEEE80211_TID_UNRESERVED) {
+ sdata_err(sdata, "TID reservation already active\n");
+ ret = -EALREADY;
+ goto out;
+ }
+
+ ieee80211_stop_vif_queues(sdata->local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_RESERVE_TID);
+
+ synchronize_net();
+
+ /* Tear down BA sessions so we stop aggregating on this TID */
+ if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ __ieee80211_stop_tx_ba_session(sta, tid,
+ AGG_STOP_LOCAL_REQUEST);
+ }
+
+ queues = BIT(sdata->vif.hw_queue[ieee802_1d_to_ac[tid]]);
+ __ieee80211_flush_queues(local, sdata, queues, false);
+
+ sta->reserved_tid = tid;
+
+ ieee80211_wake_vif_queues(local, sdata,
+ IEEE80211_QUEUE_STOP_REASON_RESERVE_TID);
+
+ if (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+
+ ret = 0;
+ out:
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_reserve_tid);
+
+void ieee80211_unreserve_tid(struct ieee80211_sta *pubsta, u8 tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+ lockdep_assert_held(&sdata->local->sta_mtx);
+
+ /* only some cases are supported right now */
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+
+ if (tid != sta->reserved_tid) {
+ sdata_err(sdata, "TID to unreserve (%d) isn't reserved\n", tid);
+ return;
+ }
+
+ sta->reserved_tid = IEEE80211_TID_UNRESERVED;
+}
+EXPORT_SYMBOL(ieee80211_unreserve_tid);
+
+void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, int tid,
+ enum ieee80211_band band)
+{
+ int ac = ieee802_1d_to_ac[tid & 7];
+
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, 0);
+ skb_set_transport_header(skb, 0);
+
+ skb_set_queue_mapping(skb, ac);
+ skb->priority = tid;
+
+ skb->dev = sdata->dev;
+
+ /*
+ * The other path calling ieee80211_xmit is from the tasklet,
+ * and while we can handle concurrent transmissions locking
+ * requirements are that we do not come into tx with bhs on.
+ */
+ local_bh_disable();
+ IEEE80211_SKB_CB(skb)->band = band;
+ ieee80211_xmit(sdata, NULL, skb);
+ local_bh_enable();
+}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
new file mode 100644
index 000000000..b864ebc6a
--- /dev/null
+++ b/net/mac80211/util.c
@@ -0,0 +1,3339 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * utilities for mac80211
+ */
+
+#include <net/mac80211.h>
+#include <linux/netdevice.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/bitmap.h>
+#include <linux/crc32.h>
+#include <net/net_namespace.h>
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "mesh.h"
+#include "wme.h"
+#include "led.h"
+#include "wep.h"
+
+/* privid for wiphys to determine whether they belong to us or not */
+const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid;
+
+struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
+{
+ struct ieee80211_local *local;
+ BUG_ON(!wiphy);
+
+ local = wiphy_priv(wiphy);
+ return &local->hw;
+}
+EXPORT_SYMBOL(wiphy_to_ieee80211_hw);
+
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+ enum nl80211_iftype type)
+{
+ __le16 fc = hdr->frame_control;
+
+ /* drop ACK/CTS frames and incorrect hdr len (ctrl) */
+ if (len < 16)
+ return NULL;
+
+ if (ieee80211_is_data(fc)) {
+ if (len < 24) /* drop incorrect hdr len (data) */
+ return NULL;
+
+ if (ieee80211_has_a4(fc))
+ return NULL;
+ if (ieee80211_has_tods(fc))
+ return hdr->addr1;
+ if (ieee80211_has_fromds(fc))
+ return hdr->addr2;
+
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_mgmt(fc)) {
+ if (len < 24) /* drop incorrect hdr len (mgmt) */
+ return NULL;
+ return hdr->addr3;
+ }
+
+ if (ieee80211_is_ctl(fc)) {
+ if (ieee80211_is_pspoll(fc))
+ return hdr->addr1;
+
+ if (ieee80211_is_back_req(fc)) {
+ switch (type) {
+ case NL80211_IFTYPE_STATION:
+ return hdr->addr2;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ return hdr->addr1;
+ default:
+ break; /* fall through to the return */
+ }
+ }
+ }
+
+ return NULL;
+}
+
+void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *hdr;
+
+ skb_queue_walk(&tx->skbs, skb) {
+ hdr = (struct ieee80211_hdr *) skb->data;
+ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+ }
+}
+
+int ieee80211_frame_duration(enum ieee80211_band band, size_t len,
+ int rate, int erp, int short_preamble,
+ int shift)
+{
+ int dur;
+
+ /* calculate duration (in microseconds, rounded up to next higher
+ * integer if it includes a fractional microsecond) to send frame of
+ * len bytes (does not include FCS) at the given rate. Duration will
+ * also include SIFS.
+ *
+ * rate is in 100 kbps, so divident is multiplied by 10 in the
+ * DIV_ROUND_UP() operations.
+ *
+ * shift may be 2 for 5 MHz channels or 1 for 10 MHz channels, and
+ * is assumed to be 0 otherwise.
+ */
+
+ if (band == IEEE80211_BAND_5GHZ || erp) {
+ /*
+ * OFDM:
+ *
+ * N_DBPS = DATARATE x 4
+ * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS)
+ * (16 = SIGNAL time, 6 = tail bits)
+ * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext
+ *
+ * T_SYM = 4 usec
+ * 802.11a - 18.5.2: aSIFSTime = 16 usec
+ * 802.11g - 19.8.4: aSIFSTime = 10 usec +
+ * signal ext = 6 usec
+ */
+ dur = 16; /* SIFS + signal ext */
+ dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */
+ dur += 4; /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */
+
+ /* IEEE 802.11-2012 18.3.2.4: all values above are:
+ * * times 4 for 5 MHz
+ * * times 2 for 10 MHz
+ */
+ dur *= 1 << shift;
+
+ /* rates should already consider the channel bandwidth,
+ * don't apply divisor again.
+ */
+ dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
+ 4 * rate); /* T_SYM x N_SYM */
+ } else {
+ /*
+ * 802.11b or 802.11g with 802.11b compatibility:
+ * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime +
+ * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0.
+ *
+ * 802.11 (DS): 15.3.3, 802.11b: 18.3.4
+ * aSIFSTime = 10 usec
+ * aPreambleLength = 144 usec or 72 usec with short preamble
+ * aPLCPHeaderLength = 48 usec or 24 usec with short preamble
+ */
+ dur = 10; /* aSIFSTime = 10 usec */
+ dur += short_preamble ? (72 + 24) : (144 + 48);
+
+ dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate);
+ }
+
+ return dur;
+}
+
+/* Exported duration function for driver use */
+__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ enum ieee80211_band band,
+ size_t frame_len,
+ struct ieee80211_rate *rate)
+{
+ struct ieee80211_sub_if_data *sdata;
+ u16 dur;
+ int erp, shift = 0;
+ bool short_preamble = false;
+
+ erp = 0;
+ if (vif) {
+ sdata = vif_to_sdata(vif);
+ short_preamble = sdata->vif.bss_conf.use_short_preamble;
+ if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+ erp = rate->flags & IEEE80211_RATE_ERP_G;
+ shift = ieee80211_vif_get_shift(vif);
+ }
+
+ dur = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp,
+ short_preamble, shift);
+
+ return cpu_to_le16(dur);
+}
+EXPORT_SYMBOL(ieee80211_generic_frame_duration);
+
+__le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif, size_t frame_len,
+ const struct ieee80211_tx_info *frame_txctl)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_rate *rate;
+ struct ieee80211_sub_if_data *sdata;
+ bool short_preamble;
+ int erp, shift = 0, bitrate;
+ u16 dur;
+ struct ieee80211_supported_band *sband;
+
+ sband = local->hw.wiphy->bands[frame_txctl->band];
+
+ short_preamble = false;
+
+ rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
+
+ erp = 0;
+ if (vif) {
+ sdata = vif_to_sdata(vif);
+ short_preamble = sdata->vif.bss_conf.use_short_preamble;
+ if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+ erp = rate->flags & IEEE80211_RATE_ERP_G;
+ shift = ieee80211_vif_get_shift(vif);
+ }
+
+ bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift);
+
+ /* CTS duration */
+ dur = ieee80211_frame_duration(sband->band, 10, bitrate,
+ erp, short_preamble, shift);
+ /* Data frame duration */
+ dur += ieee80211_frame_duration(sband->band, frame_len, bitrate,
+ erp, short_preamble, shift);
+ /* ACK duration */
+ dur += ieee80211_frame_duration(sband->band, 10, bitrate,
+ erp, short_preamble, shift);
+
+ return cpu_to_le16(dur);
+}
+EXPORT_SYMBOL(ieee80211_rts_duration);
+
+__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ size_t frame_len,
+ const struct ieee80211_tx_info *frame_txctl)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_rate *rate;
+ struct ieee80211_sub_if_data *sdata;
+ bool short_preamble;
+ int erp, shift = 0, bitrate;
+ u16 dur;
+ struct ieee80211_supported_band *sband;
+
+ sband = local->hw.wiphy->bands[frame_txctl->band];
+
+ short_preamble = false;
+
+ rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
+ erp = 0;
+ if (vif) {
+ sdata = vif_to_sdata(vif);
+ short_preamble = sdata->vif.bss_conf.use_short_preamble;
+ if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+ erp = rate->flags & IEEE80211_RATE_ERP_G;
+ shift = ieee80211_vif_get_shift(vif);
+ }
+
+ bitrate = DIV_ROUND_UP(rate->bitrate, 1 << shift);
+
+ /* Data frame duration */
+ dur = ieee80211_frame_duration(sband->band, frame_len, bitrate,
+ erp, short_preamble, shift);
+ if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) {
+ /* ACK duration */
+ dur += ieee80211_frame_duration(sband->band, 10, bitrate,
+ erp, short_preamble, shift);
+ }
+
+ return cpu_to_le16(dur);
+}
+EXPORT_SYMBOL(ieee80211_ctstoself_duration);
+
+void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int n_acs = IEEE80211_NUM_ACS;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ int ac;
+
+ if (!sdata->dev)
+ continue;
+
+ if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE &&
+ local->queue_stop_reasons[sdata->vif.cab_queue] != 0)
+ continue;
+
+ for (ac = 0; ac < n_acs; ac++) {
+ int ac_queue = sdata->vif.hw_queue[ac];
+
+ if (local->ops->wake_tx_queue &&
+ (atomic_read(&sdata->txqs_len[ac]) >
+ local->hw.txq_ac_max_pending))
+ continue;
+
+ if (ac_queue == queue ||
+ (sdata->vif.cab_queue == queue &&
+ local->queue_stop_reasons[ac_queue] == 0 &&
+ skb_queue_empty(&local->pending[ac_queue])))
+ netif_wake_subqueue(sdata->dev, ac);
+ }
+ }
+}
+
+static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_wake_queue(local, queue, reason);
+
+ if (WARN_ON(queue >= hw->queues))
+ return;
+
+ if (!test_bit(reason, &local->queue_stop_reasons[queue]))
+ return;
+
+ if (!refcounted)
+ local->q_stop_reasons[queue][reason] = 0;
+ else
+ local->q_stop_reasons[queue][reason]--;
+
+ if (local->q_stop_reasons[queue][reason] == 0)
+ __clear_bit(reason, &local->queue_stop_reasons[queue]);
+
+ if (local->queue_stop_reasons[queue] != 0)
+ /* someone still has this queue stopped */
+ return;
+
+ if (skb_queue_empty(&local->pending[queue])) {
+ rcu_read_lock();
+ ieee80211_propagate_queue_wake(local, queue);
+ rcu_read_unlock();
+ } else
+ tasklet_schedule(&local->tx_pending_tasklet);
+}
+
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ __ieee80211_wake_queue(hw, queue, reason, refcounted);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue)
+{
+ ieee80211_wake_queue_by_reason(hw, queue,
+ IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ false);
+}
+EXPORT_SYMBOL(ieee80211_wake_queue);
+
+static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+ int n_acs = IEEE80211_NUM_ACS;
+
+ trace_stop_queue(local, queue, reason);
+
+ if (WARN_ON(queue >= hw->queues))
+ return;
+
+ if (!refcounted)
+ local->q_stop_reasons[queue][reason] = 1;
+ else
+ local->q_stop_reasons[queue][reason]++;
+
+ if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
+ return;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ int ac;
+
+ if (!sdata->dev)
+ continue;
+
+ for (ac = 0; ac < n_acs; ac++) {
+ if (sdata->vif.hw_queue[ac] == queue ||
+ sdata->vif.cab_queue == queue)
+ netif_stop_subqueue(sdata->dev, ac);
+ }
+ }
+ rcu_read_unlock();
+}
+
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ __ieee80211_stop_queue(hw, queue, reason, refcounted);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue)
+{
+ ieee80211_stop_queue_by_reason(hw, queue,
+ IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ false);
+}
+EXPORT_SYMBOL(ieee80211_stop_queue);
+
+void ieee80211_add_pending_skb(struct ieee80211_local *local,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hw *hw = &local->hw;
+ unsigned long flags;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int queue = info->hw_queue;
+
+ if (WARN_ON(!info->control.vif)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return;
+ }
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ __ieee80211_stop_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+ false);
+ __skb_queue_tail(&local->pending[queue], skb);
+ __ieee80211_wake_queue(hw, queue, IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+ false);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_add_pending_skbs(struct ieee80211_local *local,
+ struct sk_buff_head *skbs)
+{
+ struct ieee80211_hw *hw = &local->hw;
+ struct sk_buff *skb;
+ unsigned long flags;
+ int queue, i;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ while ((skb = skb_dequeue(skbs))) {
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+ if (WARN_ON(!info->control.vif)) {
+ ieee80211_free_txskb(&local->hw, skb);
+ continue;
+ }
+
+ queue = info->hw_queue;
+
+ __ieee80211_stop_queue(hw, queue,
+ IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+ false);
+
+ __skb_queue_tail(&local->pending[queue], skb);
+ }
+
+ for (i = 0; i < hw->queues; i++)
+ __ieee80211_wake_queue(hw, i,
+ IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+ false);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
+ unsigned long queues,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+
+ for_each_set_bit(i, &queues, hw->queues)
+ __ieee80211_stop_queue(hw, i, reason, refcounted);
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_stop_queues(struct ieee80211_hw *hw)
+{
+ ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ false);
+}
+EXPORT_SYMBOL(ieee80211_stop_queues);
+
+int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+ int ret;
+
+ if (WARN_ON(queue >= hw->queues))
+ return true;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ ret = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ &local->queue_stop_reasons[queue]);
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_queue_stopped);
+
+void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
+ unsigned long queues,
+ enum queue_stop_reason reason,
+ bool refcounted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+
+ for_each_set_bit(i, &queues, hw->queues)
+ __ieee80211_wake_queue(hw, i, reason, refcounted);
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+void ieee80211_wake_queues(struct ieee80211_hw *hw)
+{
+ ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_DRIVER,
+ false);
+}
+EXPORT_SYMBOL(ieee80211_wake_queues);
+
+static unsigned int
+ieee80211_get_vif_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ unsigned int queues;
+
+ if (sdata && local->hw.flags & IEEE80211_HW_QUEUE_CONTROL) {
+ int ac;
+
+ queues = 0;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+ queues |= BIT(sdata->vif.hw_queue[ac]);
+ if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE)
+ queues |= BIT(sdata->vif.cab_queue);
+ } else {
+ /* all queues */
+ queues = BIT(local->hw.queues) - 1;
+ }
+
+ return queues;
+}
+
+void __ieee80211_flush_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ unsigned int queues, bool drop)
+{
+ if (!local->ops->flush)
+ return;
+
+ /*
+ * If no queue was set, or if the HW doesn't support
+ * IEEE80211_HW_QUEUE_CONTROL - flush all queues
+ */
+ if (!queues || !(local->hw.flags & IEEE80211_HW_QUEUE_CONTROL))
+ queues = ieee80211_get_vif_queues(local, sdata);
+
+ ieee80211_stop_queues_by_reason(&local->hw, queues,
+ IEEE80211_QUEUE_STOP_REASON_FLUSH,
+ false);
+
+ drv_flush(local, sdata, queues, drop);
+
+ ieee80211_wake_queues_by_reason(&local->hw, queues,
+ IEEE80211_QUEUE_STOP_REASON_FLUSH,
+ false);
+}
+
+void ieee80211_flush_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata, bool drop)
+{
+ __ieee80211_flush_queues(local, sdata, 0, drop);
+}
+
+void ieee80211_stop_vif_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum queue_stop_reason reason)
+{
+ ieee80211_stop_queues_by_reason(&local->hw,
+ ieee80211_get_vif_queues(local, sdata),
+ reason, true);
+}
+
+void ieee80211_wake_vif_queues(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum queue_stop_reason reason)
+{
+ ieee80211_wake_queues_by_reason(&local->hw,
+ ieee80211_get_vif_queues(local, sdata),
+ reason, true);
+}
+
+static void __iterate_interfaces(struct ieee80211_local *local,
+ u32 iter_flags,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_sub_if_data *sdata;
+ bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE;
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_MONITOR:
+ if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE))
+ continue;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ continue;
+ default:
+ break;
+ }
+ if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) &&
+ active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ continue;
+ if (ieee80211_sdata_running(sdata) || !active_only)
+ iterator(data, sdata->vif.addr,
+ &sdata->vif);
+ }
+
+ sdata = rcu_dereference_check(local->monitor_sdata,
+ lockdep_is_held(&local->iflist_mtx) ||
+ lockdep_rtnl_is_held());
+ if (sdata &&
+ (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only ||
+ sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ iterator(data, sdata->vif.addr, &sdata->vif);
+}
+
+void ieee80211_iterate_interfaces(
+ struct ieee80211_hw *hw, u32 iter_flags,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ mutex_lock(&local->iflist_mtx);
+ __iterate_interfaces(local, iter_flags, iterator, data);
+ mutex_unlock(&local->iflist_mtx);
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces);
+
+void ieee80211_iterate_active_interfaces_atomic(
+ struct ieee80211_hw *hw, u32 iter_flags,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ rcu_read_lock();
+ __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE,
+ iterator, data);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
+
+void ieee80211_iterate_active_interfaces_rtnl(
+ struct ieee80211_hw *hw, u32 iter_flags,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ ASSERT_RTNL();
+
+ __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE,
+ iterator, data);
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_rtnl);
+
+static void __iterate_stations(struct ieee80211_local *local,
+ void (*iterator)(void *data,
+ struct ieee80211_sta *sta),
+ void *data)
+{
+ struct sta_info *sta;
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (!sta->uploaded)
+ continue;
+
+ iterator(data, &sta->sta);
+ }
+}
+
+void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
+ void (*iterator)(void *data,
+ struct ieee80211_sta *sta),
+ void *data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ rcu_read_lock();
+ __iterate_stations(local, iterator, data);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic);
+
+struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ if (!ieee80211_sdata_running(sdata) ||
+ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ return NULL;
+ return &sdata->vif;
+}
+EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif);
+
+struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (!ieee80211_sdata_running(sdata) ||
+ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ return NULL;
+
+ return &sdata->wdev;
+}
+EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev);
+
+/*
+ * Nothing should have been stuffed into the workqueue during
+ * the suspend->resume cycle. Since we can't check each caller
+ * of this function if we are already quiescing / suspended,
+ * check here and don't WARN since this can actually happen when
+ * the rx path (for example) is racing against __ieee80211_suspend
+ * and suspending / quiescing was set after the rx path checked
+ * them.
+ */
+static bool ieee80211_can_queue_work(struct ieee80211_local *local)
+{
+ if (local->quiescing || (local->suspended && !local->resuming)) {
+ pr_warn("queueing ieee80211 work while going to suspend\n");
+ return false;
+ }
+
+ return true;
+}
+
+void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ if (!ieee80211_can_queue_work(local))
+ return;
+
+ queue_work(local->workqueue, work);
+}
+EXPORT_SYMBOL(ieee80211_queue_work);
+
+void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ if (!ieee80211_can_queue_work(local))
+ return;
+
+ queue_delayed_work(local->workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(ieee80211_queue_delayed_work);
+
+u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
+ struct ieee802_11_elems *elems,
+ u64 filter, u32 crc)
+{
+ size_t left = len;
+ const u8 *pos = start;
+ bool calc_crc = filter != 0;
+ DECLARE_BITMAP(seen_elems, 256);
+ const u8 *ie;
+
+ bitmap_zero(seen_elems, 256);
+ memset(elems, 0, sizeof(*elems));
+ elems->ie_start = start;
+ elems->total_len = len;
+
+ while (left >= 2) {
+ u8 id, elen;
+ bool elem_parse_failed;
+
+ id = *pos++;
+ elen = *pos++;
+ left -= 2;
+
+ if (elen > left) {
+ elems->parse_error = true;
+ break;
+ }
+
+ switch (id) {
+ case WLAN_EID_SSID:
+ case WLAN_EID_SUPP_RATES:
+ case WLAN_EID_FH_PARAMS:
+ case WLAN_EID_DS_PARAMS:
+ case WLAN_EID_CF_PARAMS:
+ case WLAN_EID_TIM:
+ case WLAN_EID_IBSS_PARAMS:
+ case WLAN_EID_CHALLENGE:
+ case WLAN_EID_RSN:
+ case WLAN_EID_ERP_INFO:
+ case WLAN_EID_EXT_SUPP_RATES:
+ case WLAN_EID_HT_CAPABILITY:
+ case WLAN_EID_HT_OPERATION:
+ case WLAN_EID_VHT_CAPABILITY:
+ case WLAN_EID_VHT_OPERATION:
+ case WLAN_EID_MESH_ID:
+ case WLAN_EID_MESH_CONFIG:
+ case WLAN_EID_PEER_MGMT:
+ case WLAN_EID_PREQ:
+ case WLAN_EID_PREP:
+ case WLAN_EID_PERR:
+ case WLAN_EID_RANN:
+ case WLAN_EID_CHANNEL_SWITCH:
+ case WLAN_EID_EXT_CHANSWITCH_ANN:
+ case WLAN_EID_COUNTRY:
+ case WLAN_EID_PWR_CONSTRAINT:
+ case WLAN_EID_TIMEOUT_INTERVAL:
+ case WLAN_EID_SECONDARY_CHANNEL_OFFSET:
+ case WLAN_EID_WIDE_BW_CHANNEL_SWITCH:
+ case WLAN_EID_CHAN_SWITCH_PARAM:
+ case WLAN_EID_EXT_CAPABILITY:
+ case WLAN_EID_CHAN_SWITCH_TIMING:
+ case WLAN_EID_LINK_ID:
+ /*
+ * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible
+ * that if the content gets bigger it might be needed more than once
+ */
+ if (test_bit(id, seen_elems)) {
+ elems->parse_error = true;
+ left -= elen;
+ pos += elen;
+ continue;
+ }
+ break;
+ }
+
+ if (calc_crc && id < 64 && (filter & (1ULL << id)))
+ crc = crc32_be(crc, pos - 2, elen + 2);
+
+ elem_parse_failed = false;
+
+ switch (id) {
+ case WLAN_EID_LINK_ID:
+ if (elen + 2 != sizeof(struct ieee80211_tdls_lnkie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->lnk_id = (void *)(pos - 2);
+ break;
+ case WLAN_EID_CHAN_SWITCH_TIMING:
+ if (elen != sizeof(struct ieee80211_ch_switch_timing)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->ch_sw_timing = (void *)pos;
+ break;
+ case WLAN_EID_EXT_CAPABILITY:
+ elems->ext_capab = pos;
+ elems->ext_capab_len = elen;
+ break;
+ case WLAN_EID_SSID:
+ elems->ssid = pos;
+ elems->ssid_len = elen;
+ break;
+ case WLAN_EID_SUPP_RATES:
+ elems->supp_rates = pos;
+ elems->supp_rates_len = elen;
+ break;
+ case WLAN_EID_DS_PARAMS:
+ if (elen >= 1)
+ elems->ds_params = pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_TIM:
+ if (elen >= sizeof(struct ieee80211_tim_ie)) {
+ elems->tim = (void *)pos;
+ elems->tim_len = elen;
+ } else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_CHALLENGE:
+ elems->challenge = pos;
+ elems->challenge_len = elen;
+ break;
+ case WLAN_EID_VENDOR_SPECIFIC:
+ if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 &&
+ pos[2] == 0xf2) {
+ /* Microsoft OUI (00:50:F2) */
+
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+
+ if (elen >= 5 && pos[3] == 2) {
+ /* OUI Type 2 - WMM IE */
+ if (pos[4] == 0) {
+ elems->wmm_info = pos;
+ elems->wmm_info_len = elen;
+ } else if (pos[4] == 1) {
+ elems->wmm_param = pos;
+ elems->wmm_param_len = elen;
+ }
+ }
+ }
+ break;
+ case WLAN_EID_RSN:
+ elems->rsn = pos;
+ elems->rsn_len = elen;
+ break;
+ case WLAN_EID_ERP_INFO:
+ if (elen >= 1)
+ elems->erp_info = pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_EXT_SUPP_RATES:
+ elems->ext_supp_rates = pos;
+ elems->ext_supp_rates_len = elen;
+ break;
+ case WLAN_EID_HT_CAPABILITY:
+ if (elen >= sizeof(struct ieee80211_ht_cap))
+ elems->ht_cap_elem = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_HT_OPERATION:
+ if (elen >= sizeof(struct ieee80211_ht_operation))
+ elems->ht_operation = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_VHT_CAPABILITY:
+ if (elen >= sizeof(struct ieee80211_vht_cap))
+ elems->vht_cap_elem = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_VHT_OPERATION:
+ if (elen >= sizeof(struct ieee80211_vht_operation))
+ elems->vht_operation = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_OPMODE_NOTIF:
+ if (elen > 0)
+ elems->opmode_notif = pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_MESH_ID:
+ elems->mesh_id = pos;
+ elems->mesh_id_len = elen;
+ break;
+ case WLAN_EID_MESH_CONFIG:
+ if (elen >= sizeof(struct ieee80211_meshconf_ie))
+ elems->mesh_config = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_PEER_MGMT:
+ elems->peering = pos;
+ elems->peering_len = elen;
+ break;
+ case WLAN_EID_MESH_AWAKE_WINDOW:
+ if (elen >= 2)
+ elems->awake_window = (void *)pos;
+ break;
+ case WLAN_EID_PREQ:
+ elems->preq = pos;
+ elems->preq_len = elen;
+ break;
+ case WLAN_EID_PREP:
+ elems->prep = pos;
+ elems->prep_len = elen;
+ break;
+ case WLAN_EID_PERR:
+ elems->perr = pos;
+ elems->perr_len = elen;
+ break;
+ case WLAN_EID_RANN:
+ if (elen >= sizeof(struct ieee80211_rann_ie))
+ elems->rann = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ case WLAN_EID_CHANNEL_SWITCH:
+ if (elen != sizeof(struct ieee80211_channel_sw_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->ch_switch_ie = (void *)pos;
+ break;
+ case WLAN_EID_EXT_CHANSWITCH_ANN:
+ if (elen != sizeof(struct ieee80211_ext_chansw_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->ext_chansw_ie = (void *)pos;
+ break;
+ case WLAN_EID_SECONDARY_CHANNEL_OFFSET:
+ if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->sec_chan_offs = (void *)pos;
+ break;
+ case WLAN_EID_CHAN_SWITCH_PARAM:
+ if (elen !=
+ sizeof(*elems->mesh_chansw_params_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->mesh_chansw_params_ie = (void *)pos;
+ break;
+ case WLAN_EID_WIDE_BW_CHANNEL_SWITCH:
+ if (!action ||
+ elen != sizeof(*elems->wide_bw_chansw_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->wide_bw_chansw_ie = (void *)pos;
+ break;
+ case WLAN_EID_CHANNEL_SWITCH_WRAPPER:
+ if (action) {
+ elem_parse_failed = true;
+ break;
+ }
+ /*
+ * This is a bit tricky, but as we only care about
+ * the wide bandwidth channel switch element, so
+ * just parse it out manually.
+ */
+ ie = cfg80211_find_ie(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
+ pos, elen);
+ if (ie) {
+ if (ie[1] == sizeof(*elems->wide_bw_chansw_ie))
+ elems->wide_bw_chansw_ie =
+ (void *)(ie + 2);
+ else
+ elem_parse_failed = true;
+ }
+ break;
+ case WLAN_EID_COUNTRY:
+ elems->country_elem = pos;
+ elems->country_elem_len = elen;
+ break;
+ case WLAN_EID_PWR_CONSTRAINT:
+ if (elen != 1) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->pwr_constr_elem = pos;
+ break;
+ case WLAN_EID_CISCO_VENDOR_SPECIFIC:
+ /* Lots of different options exist, but we only care
+ * about the Dynamic Transmit Power Control element.
+ * First check for the Cisco OUI, then for the DTPC
+ * tag (0x00).
+ */
+ if (elen < 4) {
+ elem_parse_failed = true;
+ break;
+ }
+
+ if (pos[0] != 0x00 || pos[1] != 0x40 ||
+ pos[2] != 0x96 || pos[3] != 0x00)
+ break;
+
+ if (elen != 6) {
+ elem_parse_failed = true;
+ break;
+ }
+
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+
+ elems->cisco_dtpc_elem = pos;
+ break;
+ case WLAN_EID_TIMEOUT_INTERVAL:
+ if (elen >= sizeof(struct ieee80211_timeout_interval_ie))
+ elems->timeout_int = (void *)pos;
+ else
+ elem_parse_failed = true;
+ break;
+ default:
+ break;
+ }
+
+ if (elem_parse_failed)
+ elems->parse_error = true;
+ else
+ __set_bit(id, seen_elems);
+
+ left -= elen;
+ pos += elen;
+ }
+
+ if (left != 0)
+ elems->parse_error = true;
+
+ return crc;
+}
+
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
+ bool bss_notify)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_tx_queue_params qparam;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ int ac;
+ bool use_11b, enable_qos;
+ bool is_ocb; /* Use another EDCA parameters if dot11OCBActivated=true */
+ int aCWmin, aCWmax;
+
+ if (!local->ops->conf_tx)
+ return;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return;
+
+ memset(&qparam, 0, sizeof(qparam));
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ use_11b = (chanctx_conf &&
+ chanctx_conf->def.chan->band == IEEE80211_BAND_2GHZ) &&
+ !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
+ rcu_read_unlock();
+
+ /*
+ * By default disable QoS in STA mode for old access points, which do
+ * not support 802.11e. New APs will provide proper queue parameters,
+ * that we will configure later.
+ */
+ enable_qos = (sdata->vif.type != NL80211_IFTYPE_STATION);
+
+ is_ocb = (sdata->vif.type == NL80211_IFTYPE_OCB);
+
+ /* Set defaults according to 802.11-2007 Table 7-37 */
+ aCWmax = 1023;
+ if (use_11b)
+ aCWmin = 31;
+ else
+ aCWmin = 15;
+
+ /* Confiure old 802.11b/g medium access rules. */
+ qparam.cw_max = aCWmax;
+ qparam.cw_min = aCWmin;
+ qparam.txop = 0;
+ qparam.aifs = 2;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ /* Update if QoS is enabled. */
+ if (enable_qos) {
+ switch (ac) {
+ case IEEE80211_AC_BK:
+ qparam.cw_max = aCWmax;
+ qparam.cw_min = aCWmin;
+ qparam.txop = 0;
+ if (is_ocb)
+ qparam.aifs = 9;
+ else
+ qparam.aifs = 7;
+ break;
+ /* never happens but let's not leave undefined */
+ default:
+ case IEEE80211_AC_BE:
+ qparam.cw_max = aCWmax;
+ qparam.cw_min = aCWmin;
+ qparam.txop = 0;
+ if (is_ocb)
+ qparam.aifs = 6;
+ else
+ qparam.aifs = 3;
+ break;
+ case IEEE80211_AC_VI:
+ qparam.cw_max = aCWmin;
+ qparam.cw_min = (aCWmin + 1) / 2 - 1;
+ if (is_ocb)
+ qparam.txop = 0;
+ else if (use_11b)
+ qparam.txop = 6016/32;
+ else
+ qparam.txop = 3008/32;
+
+ if (is_ocb)
+ qparam.aifs = 3;
+ else
+ qparam.aifs = 2;
+ break;
+ case IEEE80211_AC_VO:
+ qparam.cw_max = (aCWmin + 1) / 2 - 1;
+ qparam.cw_min = (aCWmin + 1) / 4 - 1;
+ if (is_ocb)
+ qparam.txop = 0;
+ else if (use_11b)
+ qparam.txop = 3264/32;
+ else
+ qparam.txop = 1504/32;
+ qparam.aifs = 2;
+ break;
+ }
+ }
+
+ qparam.uapsd = false;
+
+ sdata->tx_conf[ac] = qparam;
+ drv_conf_tx(local, sdata, ac, &qparam);
+ }
+
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
+ sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) {
+ sdata->vif.bss_conf.qos = enable_qos;
+ if (bss_notify)
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_QOS);
+ }
+}
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg, u16 status,
+ const u8 *extra, size_t extra_len, const u8 *da,
+ const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx,
+ u32 tx_flags)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ int err;
+
+ /* 24 + 6 = header + auth_algo + auth_transaction + status_code */
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN +
+ 24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN);
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+ memset(mgmt, 0, 24 + 6);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_AUTH);
+ memcpy(mgmt->da, da, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, bssid, ETH_ALEN);
+ mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
+ mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+ mgmt->u.auth.status_code = cpu_to_le16(status);
+ if (extra)
+ memcpy(skb_put(skb, extra_len), extra, extra_len);
+
+ if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) {
+ mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+ err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx);
+ WARN_ON(err);
+ }
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
+ tx_flags;
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, u16 stype, u16 reason,
+ bool send_frame, u8 *frame_buf)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt = (void *)frame_buf;
+
+ /* build frame */
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
+ mgmt->duration = 0; /* initialize only */
+ mgmt->seq_ctrl = 0; /* initialize only */
+ memcpy(mgmt->da, bssid, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, bssid, ETH_ALEN);
+ /* u.deauth.reason_code == u.disassoc.reason_code */
+ mgmt->u.deauth.reason_code = cpu_to_le16(reason);
+
+ if (send_frame) {
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+ IEEE80211_DEAUTH_FRAME_LEN);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ /* copy in frame */
+ memcpy(skb_put(skb, IEEE80211_DEAUTH_FRAME_LEN),
+ mgmt, IEEE80211_DEAUTH_FRAME_LEN);
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION ||
+ !(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED))
+ IEEE80211_SKB_CB(skb)->flags |=
+ IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+ ieee80211_tx_skb(sdata, skb);
+ }
+}
+
+static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
+ u8 *buffer, size_t buffer_len,
+ const u8 *ie, size_t ie_len,
+ enum ieee80211_band band,
+ u32 rate_mask,
+ struct cfg80211_chan_def *chandef,
+ size_t *offset)
+{
+ struct ieee80211_supported_band *sband;
+ u8 *pos = buffer, *end = buffer + buffer_len;
+ size_t noffset;
+ int supp_rates_len, i;
+ u8 rates[32];
+ int num_rates;
+ int ext_rates_len;
+ int shift;
+ u32 rate_flags;
+ bool have_80mhz = false;
+
+ *offset = 0;
+
+ sband = local->hw.wiphy->bands[band];
+ if (WARN_ON_ONCE(!sband))
+ return 0;
+
+ rate_flags = ieee80211_chandef_rate_flags(chandef);
+ shift = ieee80211_chandef_get_shift(chandef);
+
+ num_rates = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((BIT(i) & rate_mask) == 0)
+ continue; /* skip rate */
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ rates[num_rates++] =
+ (u8) DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ (1 << shift) * 5);
+ }
+
+ supp_rates_len = min_t(int, num_rates, 8);
+
+ if (end - pos < 2 + supp_rates_len)
+ goto out_err;
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = supp_rates_len;
+ memcpy(pos, rates, supp_rates_len);
+ pos += supp_rates_len;
+
+ /* insert "request information" if in custom IEs */
+ if (ie && ie_len) {
+ static const u8 before_extrates[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_REQUEST,
+ };
+ noffset = ieee80211_ie_split(ie, ie_len,
+ before_extrates,
+ ARRAY_SIZE(before_extrates),
+ *offset);
+ if (end - pos < noffset - *offset)
+ goto out_err;
+ memcpy(pos, ie + *offset, noffset - *offset);
+ pos += noffset - *offset;
+ *offset = noffset;
+ }
+
+ ext_rates_len = num_rates - supp_rates_len;
+ if (ext_rates_len > 0) {
+ if (end - pos < 2 + ext_rates_len)
+ goto out_err;
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = ext_rates_len;
+ memcpy(pos, rates + supp_rates_len, ext_rates_len);
+ pos += ext_rates_len;
+ }
+
+ if (chandef->chan && sband->band == IEEE80211_BAND_2GHZ) {
+ if (end - pos < 3)
+ goto out_err;
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = ieee80211_frequency_to_channel(
+ chandef->chan->center_freq);
+ }
+
+ /* insert custom IEs that go before HT */
+ if (ie && ie_len) {
+ static const u8 before_ht[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_REQUEST,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_DS_PARAMS,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ };
+ noffset = ieee80211_ie_split(ie, ie_len,
+ before_ht, ARRAY_SIZE(before_ht),
+ *offset);
+ if (end - pos < noffset - *offset)
+ goto out_err;
+ memcpy(pos, ie + *offset, noffset - *offset);
+ pos += noffset - *offset;
+ *offset = noffset;
+ }
+
+ if (sband->ht_cap.ht_supported) {
+ if (end - pos < 2 + sizeof(struct ieee80211_ht_cap))
+ goto out_err;
+ pos = ieee80211_ie_build_ht_cap(pos, &sband->ht_cap,
+ sband->ht_cap.cap);
+ }
+
+ /*
+ * If adding more here, adjust code in main.c
+ * that calculates local->scan_ies_len.
+ */
+
+ /* insert custom IEs that go before VHT */
+ if (ie && ie_len) {
+ static const u8 before_vht[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ WLAN_EID_REQUEST,
+ WLAN_EID_EXT_SUPP_RATES,
+ WLAN_EID_DS_PARAMS,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ WLAN_EID_HT_CAPABILITY,
+ WLAN_EID_BSS_COEX_2040,
+ WLAN_EID_EXT_CAPABILITY,
+ WLAN_EID_SSID_LIST,
+ WLAN_EID_CHANNEL_USAGE,
+ WLAN_EID_INTERWORKING,
+ /* mesh ID can't happen here */
+ /* 60 GHz can't happen here right now */
+ };
+ noffset = ieee80211_ie_split(ie, ie_len,
+ before_vht, ARRAY_SIZE(before_vht),
+ *offset);
+ if (end - pos < noffset - *offset)
+ goto out_err;
+ memcpy(pos, ie + *offset, noffset - *offset);
+ pos += noffset - *offset;
+ *offset = noffset;
+ }
+
+ /* Check if any channel in this sband supports at least 80 MHz */
+ for (i = 0; i < sband->n_channels; i++) {
+ if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_NO_80MHZ))
+ continue;
+
+ have_80mhz = true;
+ break;
+ }
+
+ if (sband->vht_cap.vht_supported && have_80mhz) {
+ if (end - pos < 2 + sizeof(struct ieee80211_vht_cap))
+ goto out_err;
+ pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap,
+ sband->vht_cap.cap);
+ }
+
+ return pos - buffer;
+ out_err:
+ WARN_ONCE(1, "not enough space for preq IEs\n");
+ return pos - buffer;
+}
+
+int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
+ size_t buffer_len,
+ struct ieee80211_scan_ies *ie_desc,
+ const u8 *ie, size_t ie_len,
+ u8 bands_used, u32 *rate_masks,
+ struct cfg80211_chan_def *chandef)
+{
+ size_t pos = 0, old_pos = 0, custom_ie_offset = 0;
+ int i;
+
+ memset(ie_desc, 0, sizeof(*ie_desc));
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ if (bands_used & BIT(i)) {
+ pos += ieee80211_build_preq_ies_band(local,
+ buffer + pos,
+ buffer_len - pos,
+ ie, ie_len, i,
+ rate_masks[i],
+ chandef,
+ &custom_ie_offset);
+ ie_desc->ies[i] = buffer + old_pos;
+ ie_desc->len[i] = pos - old_pos;
+ old_pos = pos;
+ }
+ }
+
+ /* add any remaining custom IEs */
+ if (ie && ie_len) {
+ if (WARN_ONCE(buffer_len - pos < ie_len - custom_ie_offset,
+ "not enough space for preq custom IEs\n"))
+ return pos;
+ memcpy(buffer + pos, ie + custom_ie_offset,
+ ie_len - custom_ie_offset);
+ ie_desc->common_ies = buffer + pos;
+ ie_desc->common_ie_len = ie_len - custom_ie_offset;
+ pos += ie_len - custom_ie_offset;
+ }
+
+ return pos;
+};
+
+struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ u32 ratemask,
+ struct ieee80211_channel *chan,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ bool directed)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct cfg80211_chan_def chandef;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ int ies_len;
+ u32 rate_masks[IEEE80211_NUM_BANDS] = {};
+ struct ieee80211_scan_ies dummy_ie_desc;
+
+ /*
+ * Do not send DS Channel parameter for directed probe requests
+ * in order to maximize the chance that we get a response. Some
+ * badly-behaved APs don't respond when this parameter is included.
+ */
+ chandef.width = sdata->vif.bss_conf.chandef.width;
+ if (directed)
+ chandef.chan = NULL;
+ else
+ chandef.chan = chan;
+
+ skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len,
+ 100 + ie_len);
+ if (!skb)
+ return NULL;
+
+ rate_masks[chan->band] = ratemask;
+ ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb),
+ skb_tailroom(skb), &dummy_ie_desc,
+ ie, ie_len, BIT(chan->band),
+ rate_masks, &chandef);
+ skb_put(skb, ies_len);
+
+ if (dst) {
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ memcpy(mgmt->da, dst, ETH_ALEN);
+ memcpy(mgmt->bssid, dst, ETH_ALEN);
+ }
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+ return skb;
+}
+
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ u32 ratemask, bool directed, u32 tx_flags,
+ struct ieee80211_channel *channel, bool scan)
+{
+ struct sk_buff *skb;
+
+ skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
+ ssid, ssid_len,
+ ie, ie_len, directed);
+ if (skb) {
+ IEEE80211_SKB_CB(skb)->flags |= tx_flags;
+ if (scan)
+ ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band);
+ else
+ ieee80211_tx_skb(sdata, skb);
+ }
+}
+
+u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band, u32 *basic_rates)
+{
+ struct ieee80211_supported_band *sband;
+ size_t num_rates;
+ u32 supp_rates, rate_flags;
+ int i, j, shift;
+ sband = sdata->local->hw.wiphy->bands[band];
+
+ rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+ shift = ieee80211_vif_get_shift(&sdata->vif);
+
+ if (WARN_ON(!sband))
+ return 1;
+
+ num_rates = sband->n_bitrates;
+ supp_rates = 0;
+ for (i = 0; i < elems->supp_rates_len +
+ elems->ext_supp_rates_len; i++) {
+ u8 rate = 0;
+ int own_rate;
+ bool is_basic;
+ if (i < elems->supp_rates_len)
+ rate = elems->supp_rates[i];
+ else if (elems->ext_supp_rates)
+ rate = elems->ext_supp_rates
+ [i - elems->supp_rates_len];
+ own_rate = 5 * (rate & 0x7f);
+ is_basic = !!(rate & 0x80);
+
+ if (is_basic && (rate & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
+ continue;
+
+ for (j = 0; j < num_rates; j++) {
+ int brate;
+ if ((rate_flags & sband->bitrates[j].flags)
+ != rate_flags)
+ continue;
+
+ brate = DIV_ROUND_UP(sband->bitrates[j].bitrate,
+ 1 << shift);
+
+ if (brate == own_rate) {
+ supp_rates |= BIT(j);
+ if (basic_rates && is_basic)
+ *basic_rates |= BIT(j);
+ }
+ }
+ }
+ return supp_rates;
+}
+
+void ieee80211_stop_device(struct ieee80211_local *local)
+{
+ ieee80211_led_radio(local, false);
+ ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO);
+
+ cancel_work_sync(&local->reconfig_filter);
+
+ flush_workqueue(local->workqueue);
+ drv_stop(local);
+}
+
+static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_chanctx *ctx;
+
+ /*
+ * We get here if during resume the device can't be restarted properly.
+ * We might also get here if this happens during HW reset, which is a
+ * slightly different situation and we need to drop all connections in
+ * the latter case.
+ *
+ * Ask cfg80211 to turn off all interfaces, this will result in more
+ * warnings but at least we'll then get into a clean stopped state.
+ */
+
+ local->resuming = false;
+ local->suspended = false;
+ local->started = false;
+
+ /* scheduled scan clearly can't be running any more, but tell
+ * cfg80211 and clear local state
+ */
+ ieee80211_sched_scan_end(local);
+
+ list_for_each_entry(sdata, &local->interfaces, list)
+ sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
+
+ /* Mark channel contexts as not being in the driver any more to avoid
+ * removing them from the driver during the shutdown process...
+ */
+ mutex_lock(&local->chanctx_mtx);
+ list_for_each_entry(ctx, &local->chanctx_list, list)
+ ctx->driver_present = false;
+ mutex_unlock(&local->chanctx_mtx);
+
+ cfg80211_shutdown_all_interfaces(local->hw.wiphy);
+}
+
+static void ieee80211_assign_chanctx(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *ctx;
+
+ if (!local->use_chanctx)
+ return;
+
+ mutex_lock(&local->chanctx_mtx);
+ conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+ if (conf) {
+ ctx = container_of(conf, struct ieee80211_chanctx, conf);
+ drv_assign_vif_chanctx(local, sdata, ctx);
+ }
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+int ieee80211_reconfig(struct ieee80211_local *local)
+{
+ struct ieee80211_hw *hw = &local->hw;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_chanctx *ctx;
+ struct sta_info *sta;
+ int res, i;
+ bool reconfig_due_to_wowlan = false;
+ struct ieee80211_sub_if_data *sched_scan_sdata;
+ struct cfg80211_sched_scan_request *sched_scan_req;
+ bool sched_scan_stopped = false;
+
+ /* nothing to do if HW shouldn't run */
+ if (!local->open_count)
+ goto wake_up;
+
+#ifdef CONFIG_PM
+ if (local->suspended)
+ local->resuming = true;
+
+ if (local->wowlan) {
+ res = drv_resume(local);
+ local->wowlan = false;
+ if (res < 0) {
+ local->resuming = false;
+ return res;
+ }
+ if (res == 0)
+ goto wake_up;
+ WARN_ON(res > 1);
+ /*
+ * res is 1, which means the driver requested
+ * to go through a regular reset on wakeup.
+ */
+ reconfig_due_to_wowlan = true;
+ }
+#endif
+
+ /*
+ * Upon resume hardware can sometimes be goofy due to
+ * various platform / driver / bus issues, so restarting
+ * the device may at times not work immediately. Propagate
+ * the error.
+ */
+ res = drv_start(local);
+ if (res) {
+ if (local->suspended)
+ WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n");
+ else
+ WARN(1, "Hardware became unavailable during restart.\n");
+ ieee80211_handle_reconfig_failure(local);
+ return res;
+ }
+
+ /* setup fragmentation threshold */
+ drv_set_frag_threshold(local, hw->wiphy->frag_threshold);
+
+ /* setup RTS threshold */
+ drv_set_rts_threshold(local, hw->wiphy->rts_threshold);
+
+ /* reset coverage class */
+ drv_set_coverage_class(local, hw->wiphy->coverage_class);
+
+ ieee80211_led_radio(local, true);
+ ieee80211_mod_tpt_led_trig(local,
+ IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+
+ /* add interfaces */
+ sdata = rtnl_dereference(local->monitor_sdata);
+ if (sdata) {
+ /* in HW restart it exists already */
+ WARN_ON(local->resuming);
+ res = drv_add_interface(local, sdata);
+ if (WARN_ON(res)) {
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ synchronize_net();
+ kfree(sdata);
+ }
+ }
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_MONITOR &&
+ ieee80211_sdata_running(sdata)) {
+ res = drv_add_interface(local, sdata);
+ if (WARN_ON(res))
+ break;
+ }
+ }
+
+ /* If adding any of the interfaces failed above, roll back and
+ * report failure.
+ */
+ if (res) {
+ list_for_each_entry_continue_reverse(sdata, &local->interfaces,
+ list)
+ if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+ sdata->vif.type != NL80211_IFTYPE_MONITOR &&
+ ieee80211_sdata_running(sdata))
+ drv_remove_interface(local, sdata);
+ ieee80211_handle_reconfig_failure(local);
+ return res;
+ }
+
+ /* add channel contexts */
+ if (local->use_chanctx) {
+ mutex_lock(&local->chanctx_mtx);
+ list_for_each_entry(ctx, &local->chanctx_list, list)
+ if (ctx->replace_state !=
+ IEEE80211_CHANCTX_REPLACES_OTHER)
+ WARN_ON(drv_add_chanctx(local, ctx));
+ mutex_unlock(&local->chanctx_mtx);
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ ieee80211_assign_chanctx(local, sdata);
+ }
+
+ sdata = rtnl_dereference(local->monitor_sdata);
+ if (sdata && ieee80211_sdata_running(sdata))
+ ieee80211_assign_chanctx(local, sdata);
+ }
+
+ /* add STAs back */
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ enum ieee80211_sta_state state;
+
+ if (!sta->uploaded)
+ continue;
+
+ /* AP-mode stations will be added later */
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP)
+ continue;
+
+ for (state = IEEE80211_STA_NOTEXIST;
+ state < sta->sta_state; state++)
+ WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
+ state + 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+
+ /* reconfigure tx conf */
+ if (hw->queues >= IEEE80211_NUM_ACS) {
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ !ieee80211_sdata_running(sdata))
+ continue;
+
+ for (i = 0; i < IEEE80211_NUM_ACS; i++)
+ drv_conf_tx(local, sdata, i,
+ &sdata->tx_conf[i]);
+ }
+ }
+
+ /* reconfigure hardware */
+ ieee80211_hw_config(local, ~0);
+
+ ieee80211_configure_filter(local);
+
+ /* Finally also reconfigure all the BSS information */
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ u32 changed;
+
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+
+ /* common change flags for all interface types */
+ changed = BSS_CHANGED_ERP_CTS_PROT |
+ BSS_CHANGED_ERP_PREAMBLE |
+ BSS_CHANGED_ERP_SLOT |
+ BSS_CHANGED_HT |
+ BSS_CHANGED_BASIC_RATES |
+ BSS_CHANGED_BEACON_INT |
+ BSS_CHANGED_BSSID |
+ BSS_CHANGED_CQM |
+ BSS_CHANGED_QOS |
+ BSS_CHANGED_IDLE |
+ BSS_CHANGED_TXPOWER;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ changed |= BSS_CHANGED_ASSOC |
+ BSS_CHANGED_ARP_FILTER |
+ BSS_CHANGED_PS;
+
+ /* Re-send beacon info report to the driver */
+ if (sdata->u.mgd.have_beacon)
+ changed |= BSS_CHANGED_BEACON_INFO;
+
+ sdata_lock(sdata);
+ ieee80211_bss_info_change_notify(sdata, changed);
+ sdata_unlock(sdata);
+ break;
+ case NL80211_IFTYPE_OCB:
+ changed |= BSS_CHANGED_OCB;
+ ieee80211_bss_info_change_notify(sdata, changed);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ changed |= BSS_CHANGED_IBSS;
+ /* fall through */
+ case NL80211_IFTYPE_AP:
+ changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ changed |= BSS_CHANGED_AP_PROBE_RESP;
+
+ if (rcu_access_pointer(sdata->u.ap.beacon))
+ drv_start_ap(local, sdata);
+ }
+
+ /* fall through */
+ case NL80211_IFTYPE_MESH_POINT:
+ if (sdata->vif.bss_conf.enable_beacon) {
+ changed |= BSS_CHANGED_BEACON |
+ BSS_CHANGED_BEACON_ENABLED;
+ ieee80211_bss_info_change_notify(sdata, changed);
+ }
+ break;
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /* nothing to do */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ ieee80211_recalc_ps(local, -1);
+
+ /*
+ * The sta might be in psm against the ap (e.g. because
+ * this was the state before a hw restart), so we
+ * explicitly send a null packet in order to make sure
+ * it'll sync against the ap (and get out of psm).
+ */
+ if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) {
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ continue;
+ if (!sdata->u.mgd.associated)
+ continue;
+
+ ieee80211_send_nullfunc(local, sdata, 0);
+ }
+ }
+
+ /* APs are now beaconing, add back stations */
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ enum ieee80211_sta_state state;
+
+ if (!sta->uploaded)
+ continue;
+
+ if (sta->sdata->vif.type != NL80211_IFTYPE_AP)
+ continue;
+
+ for (state = IEEE80211_STA_NOTEXIST;
+ state < sta->sta_state; state++)
+ WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
+ state + 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+
+ /* add back keys */
+ list_for_each_entry(sdata, &local->interfaces, list)
+ ieee80211_reset_crypto_tx_tailroom(sdata);
+
+ list_for_each_entry(sdata, &local->interfaces, list)
+ if (ieee80211_sdata_running(sdata))
+ ieee80211_enable_keys(sdata);
+
+ wake_up:
+ local->in_reconfig = false;
+ barrier();
+
+ if (local->monitors == local->open_count && local->monitors > 0)
+ ieee80211_add_virtual_monitor(local);
+
+ /*
+ * Clear the WLAN_STA_BLOCK_BA flag so new aggregation
+ * sessions can be established after a resume.
+ *
+ * Also tear down aggregation sessions since reconfiguring
+ * them in a hardware restart scenario is not easily done
+ * right now, and the hardware will have lost information
+ * about the sessions, but we and the AP still think they
+ * are active. This is really a workaround though.
+ */
+ if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+ mutex_lock(&local->sta_mtx);
+
+ list_for_each_entry(sta, &local->sta_list, list) {
+ ieee80211_sta_tear_down_BA_sessions(
+ sta, AGG_STOP_LOCAL_REQUEST);
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ }
+
+ mutex_unlock(&local->sta_mtx);
+ }
+
+ ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
+ IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+ false);
+
+ /*
+ * Reconfigure sched scan if it was interrupted by FW restart or
+ * suspend.
+ */
+ mutex_lock(&local->mtx);
+ sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata,
+ lockdep_is_held(&local->mtx));
+ sched_scan_req = rcu_dereference_protected(local->sched_scan_req,
+ lockdep_is_held(&local->mtx));
+ if (sched_scan_sdata && sched_scan_req)
+ /*
+ * Sched scan stopped, but we don't want to report it. Instead,
+ * we're trying to reschedule.
+ */
+ if (__ieee80211_request_sched_scan_start(sched_scan_sdata,
+ sched_scan_req))
+ sched_scan_stopped = true;
+ mutex_unlock(&local->mtx);
+
+ if (sched_scan_stopped)
+ cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
+
+ /*
+ * If this is for hw restart things are still running.
+ * We may want to change that later, however.
+ */
+ if (local->open_count && (!local->suspended || reconfig_due_to_wowlan))
+ drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART);
+
+ if (!local->suspended)
+ return 0;
+
+#ifdef CONFIG_PM
+ /* first set suspended false, then resuming */
+ local->suspended = false;
+ mb();
+ local->resuming = false;
+
+ /* It's possible that we don't handle the scan completion in
+ * time during suspend, so if it's still marked as completed
+ * here, queue the work and flush it to clean things up.
+ * Instead of calling the worker function directly here, we
+ * really queue it to avoid potential races with other flows
+ * scheduling the same work.
+ */
+ if (test_bit(SCAN_COMPLETED, &local->scanning)) {
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+ flush_delayed_work(&local->scan_work);
+ }
+
+ if (local->open_count && !reconfig_due_to_wowlan)
+ drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND);
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ ieee80211_sta_restart(sdata);
+ }
+
+ mod_timer(&local->sta_cleanup, jiffies + 1);
+#else
+ WARN_ON(1);
+#endif
+
+ return 0;
+}
+
+void ieee80211_resume_disconnect(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_local *local;
+ struct ieee80211_key *key;
+
+ if (WARN_ON(!vif))
+ return;
+
+ sdata = vif_to_sdata(vif);
+ local = sdata->local;
+
+ if (WARN_ON(!local->resuming))
+ return;
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+ return;
+
+ sdata->flags |= IEEE80211_SDATA_DISCONNECT_RESUME;
+
+ mutex_lock(&local->key_mtx);
+ list_for_each_entry(key, &sdata->key_list, list)
+ key->flags |= KEY_FLAG_TAINTED;
+ mutex_unlock(&local->key_mtx);
+}
+EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect);
+
+void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_chanctx *chanctx;
+
+ mutex_lock(&local->chanctx_mtx);
+
+ chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+
+ if (WARN_ON_ONCE(!chanctx_conf))
+ goto unlock;
+
+ chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf);
+ ieee80211_recalc_smps_chanctx(local, chanctx);
+ unlock:
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_chanctx *chanctx;
+
+ mutex_lock(&local->chanctx_mtx);
+
+ chanctx_conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
+ lockdep_is_held(&local->chanctx_mtx));
+
+ if (WARN_ON_ONCE(!chanctx_conf))
+ goto unlock;
+
+ chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf);
+ ieee80211_recalc_chanctx_min_def(local, chanctx);
+ unlock:
+ mutex_unlock(&local->chanctx_mtx);
+}
+
+size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset)
+{
+ size_t pos = offset;
+
+ while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC)
+ pos += 2 + ies[pos + 1];
+
+ return pos;
+}
+
+static void _ieee80211_enable_rssi_reports(struct ieee80211_sub_if_data *sdata,
+ int rssi_min_thold,
+ int rssi_max_thold)
+{
+ trace_api_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold);
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ /*
+ * Scale up threshold values before storing it, as the RSSI averaging
+ * algorithm uses a scaled up value as well. Change this scaling
+ * factor if the RSSI averaging algorithm changes.
+ */
+ sdata->u.mgd.rssi_min_thold = rssi_min_thold*16;
+ sdata->u.mgd.rssi_max_thold = rssi_max_thold*16;
+}
+
+void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif,
+ int rssi_min_thold,
+ int rssi_max_thold)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ WARN_ON(rssi_min_thold == rssi_max_thold ||
+ rssi_min_thold > rssi_max_thold);
+
+ _ieee80211_enable_rssi_reports(sdata, rssi_min_thold,
+ rssi_max_thold);
+}
+EXPORT_SYMBOL(ieee80211_enable_rssi_reports);
+
+void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ _ieee80211_enable_rssi_reports(sdata, 0, 0);
+}
+EXPORT_SYMBOL(ieee80211_disable_rssi_reports);
+
+u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
+ u16 cap)
+{
+ __le16 tmp;
+
+ *pos++ = WLAN_EID_HT_CAPABILITY;
+ *pos++ = sizeof(struct ieee80211_ht_cap);
+ memset(pos, 0, sizeof(struct ieee80211_ht_cap));
+
+ /* capability flags */
+ tmp = cpu_to_le16(cap);
+ memcpy(pos, &tmp, sizeof(u16));
+ pos += sizeof(u16);
+
+ /* AMPDU parameters */
+ *pos++ = ht_cap->ampdu_factor |
+ (ht_cap->ampdu_density <<
+ IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT);
+
+ /* MCS set */
+ memcpy(pos, &ht_cap->mcs, sizeof(ht_cap->mcs));
+ pos += sizeof(ht_cap->mcs);
+
+ /* extended capabilities */
+ pos += sizeof(__le16);
+
+ /* BF capabilities */
+ pos += sizeof(__le32);
+
+ /* antenna selection */
+ pos += sizeof(u8);
+
+ return pos;
+}
+
+u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
+ u32 cap)
+{
+ __le32 tmp;
+
+ *pos++ = WLAN_EID_VHT_CAPABILITY;
+ *pos++ = sizeof(struct ieee80211_vht_cap);
+ memset(pos, 0, sizeof(struct ieee80211_vht_cap));
+
+ /* capability flags */
+ tmp = cpu_to_le32(cap);
+ memcpy(pos, &tmp, sizeof(u32));
+ pos += sizeof(u32);
+
+ /* VHT MCS set */
+ memcpy(pos, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs));
+ pos += sizeof(vht_cap->vht_mcs);
+
+ return pos;
+}
+
+u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
+ const struct cfg80211_chan_def *chandef,
+ u16 prot_mode)
+{
+ struct ieee80211_ht_operation *ht_oper;
+ /* Build HT Information */
+ *pos++ = WLAN_EID_HT_OPERATION;
+ *pos++ = sizeof(struct ieee80211_ht_operation);
+ ht_oper = (struct ieee80211_ht_operation *)pos;
+ ht_oper->primary_chan = ieee80211_frequency_to_channel(
+ chandef->chan->center_freq);
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_160:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_40:
+ if (chandef->center_freq1 > chandef->chan->center_freq)
+ ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_ABOVE;
+ else
+ ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_BELOW;
+ break;
+ default:
+ ht_oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE;
+ break;
+ }
+ if (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 &&
+ chandef->width != NL80211_CHAN_WIDTH_20_NOHT &&
+ chandef->width != NL80211_CHAN_WIDTH_20)
+ ht_oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY;
+
+ ht_oper->operation_mode = cpu_to_le16(prot_mode);
+ ht_oper->stbc_param = 0x0000;
+
+ /* It seems that Basic MCS set and Supported MCS set
+ are identical for the first 10 bytes */
+ memset(&ht_oper->basic_set, 0, 16);
+ memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10);
+
+ return pos + sizeof(struct ieee80211_ht_operation);
+}
+
+u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
+ const struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_vht_operation *vht_oper;
+
+ *pos++ = WLAN_EID_VHT_OPERATION;
+ *pos++ = sizeof(struct ieee80211_vht_operation);
+ vht_oper = (struct ieee80211_vht_operation *)pos;
+ vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(
+ chandef->center_freq1);
+ if (chandef->center_freq2)
+ vht_oper->center_freq_seg2_idx =
+ ieee80211_frequency_to_channel(chandef->center_freq2);
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_160:
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
+ break;
+ default:
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT;
+ break;
+ }
+
+ /* don't require special VHT peer rates */
+ vht_oper->basic_mcs_set = cpu_to_le16(0xffff);
+
+ return pos + sizeof(struct ieee80211_vht_operation);
+}
+
+void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
+ const struct ieee80211_ht_operation *ht_oper,
+ struct cfg80211_chan_def *chandef)
+{
+ enum nl80211_channel_type channel_type;
+
+ if (!ht_oper) {
+ cfg80211_chandef_create(chandef, control_chan,
+ NL80211_CHAN_NO_HT);
+ return;
+ }
+
+ switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+ case IEEE80211_HT_PARAM_CHA_SEC_NONE:
+ channel_type = NL80211_CHAN_HT20;
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+ channel_type = NL80211_CHAN_HT40PLUS;
+ break;
+ case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+ channel_type = NL80211_CHAN_HT40MINUS;
+ break;
+ default:
+ channel_type = NL80211_CHAN_NO_HT;
+ }
+
+ cfg80211_chandef_create(chandef, control_chan, channel_type);
+}
+
+void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
+ const struct ieee80211_vht_operation *oper,
+ struct cfg80211_chan_def *chandef)
+{
+ if (!oper)
+ return;
+
+ chandef->chan = control_chan;
+
+ switch (oper->chan_width) {
+ case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_80;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_160MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_160;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+ chandef->width = NL80211_CHAN_WIDTH_80P80;
+ break;
+ default:
+ break;
+ }
+
+ chandef->center_freq1 =
+ ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
+ control_chan->band);
+ chandef->center_freq2 =
+ ieee80211_channel_to_frequency(oper->center_freq_seg2_idx,
+ control_chan->band);
+}
+
+int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
+ const struct ieee80211_supported_band *sband,
+ const u8 *srates, int srates_len, u32 *rates)
+{
+ u32 rate_flags = ieee80211_chandef_rate_flags(chandef);
+ int shift = ieee80211_chandef_get_shift(chandef);
+ struct ieee80211_rate *br;
+ int brate, rate, i, j, count = 0;
+
+ *rates = 0;
+
+ for (i = 0; i < srates_len; i++) {
+ rate = srates[i] & 0x7f;
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ br = &sband->bitrates[j];
+ if ((rate_flags & br->flags) != rate_flags)
+ continue;
+
+ brate = DIV_ROUND_UP(br->bitrate, (1 << shift) * 5);
+ if (brate == rate) {
+ *rates |= BIT(j);
+ count++;
+ break;
+ }
+ }
+ }
+ return count;
+}
+
+int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, bool need_basic,
+ enum ieee80211_band band)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ int rate, shift;
+ u8 i, rates, *pos;
+ u32 basic_rates = sdata->vif.bss_conf.basic_rates;
+ u32 rate_flags;
+
+ shift = ieee80211_vif_get_shift(&sdata->vif);
+ rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+ sband = local->hw.wiphy->bands[band];
+ rates = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+ rates++;
+ }
+ if (rates > 8)
+ rates = 8;
+
+ if (skb_tailroom(skb) < rates + 2)
+ return -ENOMEM;
+
+ pos = skb_put(skb, rates + 2);
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = rates;
+ for (i = 0; i < rates; i++) {
+ u8 basic = 0;
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ if (need_basic && basic_rates & BIT(i))
+ basic = 0x80;
+ rate = sband->bitrates[i].bitrate;
+ rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ 5 * (1 << shift));
+ *pos++ = basic | (u8) rate;
+ }
+
+ return 0;
+}
+
+int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, bool need_basic,
+ enum ieee80211_band band)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ int rate, shift;
+ u8 i, exrates, *pos;
+ u32 basic_rates = sdata->vif.bss_conf.basic_rates;
+ u32 rate_flags;
+
+ rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef);
+ shift = ieee80211_vif_get_shift(&sdata->vif);
+
+ sband = local->hw.wiphy->bands[band];
+ exrates = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+ exrates++;
+ }
+
+ if (exrates > 8)
+ exrates -= 8;
+ else
+ exrates = 0;
+
+ if (skb_tailroom(skb) < exrates + 2)
+ return -ENOMEM;
+
+ if (exrates) {
+ pos = skb_put(skb, exrates + 2);
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = exrates;
+ for (i = 8; i < sband->n_bitrates; i++) {
+ u8 basic = 0;
+ if ((rate_flags & sband->bitrates[i].flags)
+ != rate_flags)
+ continue;
+ if (need_basic && basic_rates & BIT(i))
+ basic = 0x80;
+ rate = DIV_ROUND_UP(sband->bitrates[i].bitrate,
+ 5 * (1 << shift));
+ *pos++ = basic | (u8) rate;
+ }
+ }
+ return 0;
+}
+
+int ieee80211_ave_rssi(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) {
+ /* non-managed type inferfaces */
+ return 0;
+ }
+ return ifmgd->ave_beacon_signal / 16;
+}
+EXPORT_SYMBOL_GPL(ieee80211_ave_rssi);
+
+u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs)
+{
+ if (!mcs)
+ return 1;
+
+ /* TODO: consider rx_highest */
+
+ if (mcs->rx_mask[3])
+ return 4;
+ if (mcs->rx_mask[2])
+ return 3;
+ if (mcs->rx_mask[1])
+ return 2;
+ return 1;
+}
+
+/**
+ * ieee80211_calculate_rx_timestamp - calculate timestamp in frame
+ * @local: mac80211 hw info struct
+ * @status: RX status
+ * @mpdu_len: total MPDU length (including FCS)
+ * @mpdu_offset: offset into MPDU to calculate timestamp at
+ *
+ * This function calculates the RX timestamp at the given MPDU offset, taking
+ * into account what the RX timestamp was. An offset of 0 will just normalize
+ * the timestamp to TSF at beginning of MPDU reception.
+ */
+u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
+ struct ieee80211_rx_status *status,
+ unsigned int mpdu_len,
+ unsigned int mpdu_offset)
+{
+ u64 ts = status->mactime;
+ struct rate_info ri;
+ u16 rate;
+
+ if (WARN_ON(!ieee80211_have_rx_timestamp(status)))
+ return 0;
+
+ memset(&ri, 0, sizeof(ri));
+
+ /* Fill cfg80211 rate info */
+ if (status->flag & RX_FLAG_HT) {
+ ri.mcs = status->rate_idx;
+ ri.flags |= RATE_INFO_FLAGS_MCS;
+ if (status->flag & RX_FLAG_40MHZ)
+ ri.bw = RATE_INFO_BW_40;
+ else
+ ri.bw = RATE_INFO_BW_20;
+ if (status->flag & RX_FLAG_SHORT_GI)
+ ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
+ } else if (status->flag & RX_FLAG_VHT) {
+ ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
+ ri.mcs = status->rate_idx;
+ ri.nss = status->vht_nss;
+ if (status->flag & RX_FLAG_40MHZ)
+ ri.bw = RATE_INFO_BW_40;
+ else if (status->vht_flag & RX_VHT_FLAG_80MHZ)
+ ri.bw = RATE_INFO_BW_80;
+ else if (status->vht_flag & RX_VHT_FLAG_160MHZ)
+ ri.bw = RATE_INFO_BW_160;
+ else
+ ri.bw = RATE_INFO_BW_20;
+ if (status->flag & RX_FLAG_SHORT_GI)
+ ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
+ } else {
+ struct ieee80211_supported_band *sband;
+ int shift = 0;
+ int bitrate;
+
+ if (status->flag & RX_FLAG_10MHZ) {
+ shift = 1;
+ ri.bw = RATE_INFO_BW_10;
+ } else if (status->flag & RX_FLAG_5MHZ) {
+ shift = 2;
+ ri.bw = RATE_INFO_BW_5;
+ } else {
+ ri.bw = RATE_INFO_BW_20;
+ }
+
+ sband = local->hw.wiphy->bands[status->band];
+ bitrate = sband->bitrates[status->rate_idx].bitrate;
+ ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
+ }
+
+ rate = cfg80211_calculate_bitrate(&ri);
+ if (WARN_ONCE(!rate,
+ "Invalid bitrate: flags=0x%x, idx=%d, vht_nss=%d\n",
+ status->flag, status->rate_idx, status->vht_nss))
+ return 0;
+
+ /* rewind from end of MPDU */
+ if (status->flag & RX_FLAG_MACTIME_END)
+ ts -= mpdu_len * 8 * 10 / rate;
+
+ ts += mpdu_offset * 8 * 10 / rate;
+
+ return ts;
+}
+
+void ieee80211_dfs_cac_cancel(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct cfg80211_chan_def chandef;
+
+ mutex_lock(&local->mtx);
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ /* it might be waiting for the local->mtx, but then
+ * by the time it gets it, sdata->wdev.cac_started
+ * will no longer be true
+ */
+ cancel_delayed_work(&sdata->dfs_cac_timer_work);
+
+ if (sdata->wdev.cac_started) {
+ chandef = sdata->vif.bss_conf.chandef;
+ ieee80211_vif_release_channel(sdata);
+ cfg80211_cac_event(sdata->dev,
+ &chandef,
+ NL80211_RADAR_CAC_ABORTED,
+ GFP_KERNEL);
+ }
+ }
+ mutex_unlock(&local->iflist_mtx);
+ mutex_unlock(&local->mtx);
+}
+
+void ieee80211_dfs_radar_detected_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, radar_detected_work);
+ struct cfg80211_chan_def chandef = local->hw.conf.chandef;
+ struct ieee80211_chanctx *ctx;
+ int num_chanctx = 0;
+
+ mutex_lock(&local->chanctx_mtx);
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ num_chanctx++;
+ chandef = ctx->conf.def;
+ }
+ mutex_unlock(&local->chanctx_mtx);
+
+ ieee80211_dfs_cac_cancel(local);
+
+ if (num_chanctx > 1)
+ /* XXX: multi-channel is not supported yet */
+ WARN_ON(1);
+ else
+ cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL);
+}
+
+void ieee80211_radar_detected(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_radar_detected(local);
+
+ ieee80211_queue_work(hw, &local->radar_detected_work);
+}
+EXPORT_SYMBOL(ieee80211_radar_detected);
+
+u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c)
+{
+ u32 ret;
+ int tmp;
+
+ switch (c->width) {
+ case NL80211_CHAN_WIDTH_20:
+ c->width = NL80211_CHAN_WIDTH_20_NOHT;
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ c->width = NL80211_CHAN_WIDTH_20;
+ c->center_freq1 = c->chan->center_freq;
+ ret = IEEE80211_STA_DISABLE_40MHZ |
+ IEEE80211_STA_DISABLE_VHT;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ tmp = (30 + c->chan->center_freq - c->center_freq1)/20;
+ /* n_P40 */
+ tmp /= 2;
+ /* freq_P40 */
+ c->center_freq1 = c->center_freq1 - 20 + 40 * tmp;
+ c->width = NL80211_CHAN_WIDTH_40;
+ ret = IEEE80211_STA_DISABLE_VHT;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ c->center_freq2 = 0;
+ c->width = NL80211_CHAN_WIDTH_80;
+ ret = IEEE80211_STA_DISABLE_80P80MHZ |
+ IEEE80211_STA_DISABLE_160MHZ;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ /* n_P20 */
+ tmp = (70 + c->chan->center_freq - c->center_freq1)/20;
+ /* n_P80 */
+ tmp /= 4;
+ c->center_freq1 = c->center_freq1 - 40 + 80 * tmp;
+ c->width = NL80211_CHAN_WIDTH_80;
+ ret = IEEE80211_STA_DISABLE_80P80MHZ |
+ IEEE80211_STA_DISABLE_160MHZ;
+ break;
+ default:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ WARN_ON_ONCE(1);
+ c->width = NL80211_CHAN_WIDTH_20_NOHT;
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ break;
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ WARN_ON_ONCE(1);
+ /* keep c->width */
+ ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ break;
+ }
+
+ WARN_ON_ONCE(!cfg80211_chandef_valid(c));
+
+ return ret;
+}
+
+/*
+ * Returns true if smps_mode_new is strictly more restrictive than
+ * smps_mode_old.
+ */
+bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
+ enum ieee80211_smps_mode smps_mode_new)
+{
+ if (WARN_ON_ONCE(smps_mode_old == IEEE80211_SMPS_AUTOMATIC ||
+ smps_mode_new == IEEE80211_SMPS_AUTOMATIC))
+ return false;
+
+ switch (smps_mode_old) {
+ case IEEE80211_SMPS_STATIC:
+ return false;
+ case IEEE80211_SMPS_DYNAMIC:
+ return smps_mode_new == IEEE80211_SMPS_STATIC;
+ case IEEE80211_SMPS_OFF:
+ return smps_mode_new != IEEE80211_SMPS_OFF;
+ default:
+ WARN_ON(1);
+ }
+
+ return false;
+}
+
+int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_csa_settings *csa_settings)
+{
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ struct ieee80211_local *local = sdata->local;
+ int freq;
+ int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch) +
+ sizeof(mgmt->u.action.u.chan_switch);
+ u8 *pos;
+
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ skb = dev_alloc_skb(local->tx_headroom + hdr_len +
+ 5 + /* channel switch announcement element */
+ 3 + /* secondary channel offset element */
+ 8); /* mesh channel switch parameters element */
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, local->tx_headroom);
+ mgmt = (struct ieee80211_mgmt *)skb_put(skb, hdr_len);
+ memset(mgmt, 0, hdr_len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+
+ eth_broadcast_addr(mgmt->da);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+ } else {
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
+ }
+ mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
+ mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH;
+ pos = skb_put(skb, 5);
+ *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */
+ *pos++ = 3; /* IE length */
+ *pos++ = csa_settings->block_tx ? 1 : 0; /* CSA mode */
+ freq = csa_settings->chandef.chan->center_freq;
+ *pos++ = ieee80211_frequency_to_channel(freq); /* channel */
+ *pos++ = csa_settings->count; /* count */
+
+ if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) {
+ enum nl80211_channel_type ch_type;
+
+ skb_put(skb, 3);
+ *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET; /* EID */
+ *pos++ = 1; /* IE length */
+ ch_type = cfg80211_get_chandef_type(&csa_settings->chandef);
+ if (ch_type == NL80211_CHAN_HT40PLUS)
+ *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE;
+ else
+ *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW;
+ }
+
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+ skb_put(skb, 8);
+ *pos++ = WLAN_EID_CHAN_SWITCH_PARAM; /* EID */
+ *pos++ = 6; /* IE length */
+ *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL; /* Mesh TTL */
+ *pos = 0x00; /* Mesh Flag: Tx Restrict, Initiator, Reason */
+ *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR;
+ *pos++ |= csa_settings->block_tx ?
+ WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00;
+ put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */
+ pos += 2;
+ put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */
+ pos += 2;
+ }
+
+ ieee80211_tx_skb(sdata, skb);
+ return 0;
+}
+
+bool ieee80211_cs_valid(const struct ieee80211_cipher_scheme *cs)
+{
+ return !(cs == NULL || cs->cipher == 0 ||
+ cs->hdr_len < cs->pn_len + cs->pn_off ||
+ cs->hdr_len <= cs->key_idx_off ||
+ cs->key_idx_shift > 7 ||
+ cs->key_idx_mask == 0);
+}
+
+bool ieee80211_cs_list_valid(const struct ieee80211_cipher_scheme *cs, int n)
+{
+ int i;
+
+ /* Ensure we have enough iftype bitmap space for all iftype values */
+ WARN_ON((NUM_NL80211_IFTYPES / 8 + 1) > sizeof(cs[0].iftype));
+
+ for (i = 0; i < n; i++)
+ if (!ieee80211_cs_valid(&cs[i]))
+ return false;
+
+ return true;
+}
+
+const struct ieee80211_cipher_scheme *
+ieee80211_cs_get(struct ieee80211_local *local, u32 cipher,
+ enum nl80211_iftype iftype)
+{
+ const struct ieee80211_cipher_scheme *l = local->hw.cipher_schemes;
+ int n = local->hw.n_cipher_schemes;
+ int i;
+ const struct ieee80211_cipher_scheme *cs = NULL;
+
+ for (i = 0; i < n; i++) {
+ if (l[i].cipher == cipher) {
+ cs = &l[i];
+ break;
+ }
+ }
+
+ if (!cs || !(cs->iftype & BIT(iftype)))
+ return NULL;
+
+ return cs;
+}
+
+int ieee80211_cs_headroom(struct ieee80211_local *local,
+ struct cfg80211_crypto_settings *crypto,
+ enum nl80211_iftype iftype)
+{
+ const struct ieee80211_cipher_scheme *cs;
+ int headroom = IEEE80211_ENCRYPT_HEADROOM;
+ int i;
+
+ for (i = 0; i < crypto->n_ciphers_pairwise; i++) {
+ cs = ieee80211_cs_get(local, crypto->ciphers_pairwise[i],
+ iftype);
+
+ if (cs && headroom < cs->hdr_len)
+ headroom = cs->hdr_len;
+ }
+
+ cs = ieee80211_cs_get(local, crypto->cipher_group, iftype);
+ if (cs && headroom < cs->hdr_len)
+ headroom = cs->hdr_len;
+
+ return headroom;
+}
+
+static bool
+ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i)
+{
+ s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1);
+ int skip;
+
+ if (end > 0)
+ return false;
+
+ /* End time is in the past, check for repetitions */
+ skip = DIV_ROUND_UP(-end, data->desc[i].interval);
+ if (data->count[i] < 255) {
+ if (data->count[i] <= skip) {
+ data->count[i] = 0;
+ return false;
+ }
+
+ data->count[i] -= skip;
+ }
+
+ data->desc[i].start += skip * data->desc[i].interval;
+
+ return true;
+}
+
+static bool
+ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf,
+ s32 *offset)
+{
+ bool ret = false;
+ int i;
+
+ for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) {
+ s32 cur;
+
+ if (!data->count[i])
+ continue;
+
+ if (ieee80211_extend_noa_desc(data, tsf + *offset, i))
+ ret = true;
+
+ cur = data->desc[i].start - tsf;
+ if (cur > *offset)
+ continue;
+
+ cur = data->desc[i].start + data->desc[i].duration - tsf;
+ if (cur > *offset)
+ *offset = cur;
+ }
+
+ return ret;
+}
+
+static u32
+ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf)
+{
+ s32 offset = 0;
+ int tries = 0;
+ /*
+ * arbitrary limit, used to avoid infinite loops when combined NoA
+ * descriptors cover the full time period.
+ */
+ int max_tries = 5;
+
+ ieee80211_extend_absent_time(data, tsf, &offset);
+ do {
+ if (!ieee80211_extend_absent_time(data, tsf, &offset))
+ break;
+
+ tries++;
+ } while (tries < max_tries);
+
+ return offset;
+}
+
+void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf)
+{
+ u32 next_offset = BIT(31) - 1;
+ int i;
+
+ data->absent = 0;
+ data->has_next_tsf = false;
+ for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) {
+ s32 start;
+
+ if (!data->count[i])
+ continue;
+
+ ieee80211_extend_noa_desc(data, tsf, i);
+ start = data->desc[i].start - tsf;
+ if (start <= 0)
+ data->absent |= BIT(i);
+
+ if (next_offset > start)
+ next_offset = start;
+
+ data->has_next_tsf = true;
+ }
+
+ if (data->absent)
+ next_offset = ieee80211_get_noa_absent_time(data, tsf);
+
+ data->next_tsf = tsf + next_offset;
+}
+EXPORT_SYMBOL(ieee80211_update_p2p_noa);
+
+int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr,
+ struct ieee80211_noa_data *data, u32 tsf)
+{
+ int ret = 0;
+ int i;
+
+ memset(data, 0, sizeof(*data));
+
+ for (i = 0; i < IEEE80211_P2P_NOA_DESC_MAX; i++) {
+ const struct ieee80211_p2p_noa_desc *desc = &attr->desc[i];
+
+ if (!desc->count || !desc->duration)
+ continue;
+
+ data->count[i] = desc->count;
+ data->desc[i].start = le32_to_cpu(desc->start_time);
+ data->desc[i].duration = le32_to_cpu(desc->duration);
+ data->desc[i].interval = le32_to_cpu(desc->interval);
+
+ if (data->count[i] > 1 &&
+ data->desc[i].interval < data->desc[i].duration)
+ continue;
+
+ ieee80211_extend_noa_desc(data, tsf, i);
+ ret++;
+ }
+
+ if (ret)
+ ieee80211_update_p2p_noa(data, tsf);
+
+ return ret;
+}
+EXPORT_SYMBOL(ieee80211_parse_p2p_noa);
+
+void ieee80211_recalc_dtim(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ u64 tsf = drv_get_tsf(local, sdata);
+ u64 dtim_count = 0;
+ u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024;
+ u8 dtim_period = sdata->vif.bss_conf.dtim_period;
+ struct ps_data *ps;
+ u8 bcns_from_dtim;
+
+ if (tsf == -1ULL || !beacon_int || !dtim_period)
+ return;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ if (!sdata->bss)
+ return;
+
+ ps = &sdata->bss->ps;
+ } else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ ps = &sdata->u.mesh.ps;
+ } else {
+ return;
+ }
+
+ /*
+ * actually finds last dtim_count, mac80211 will update in
+ * __beacon_add_tim().
+ * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period
+ */
+ do_div(tsf, beacon_int);
+ bcns_from_dtim = do_div(tsf, dtim_period);
+ /* just had a DTIM */
+ if (!bcns_from_dtim)
+ dtim_count = 0;
+ else
+ dtim_count = dtim_period - bcns_from_dtim;
+
+ ps->dtim_count = dtim_count;
+}
+
+static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ struct ieee80211_sub_if_data *sdata;
+ u8 radar_detect = 0;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED))
+ return 0;
+
+ list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list)
+ if (sdata->reserved_radar_required)
+ radar_detect |= BIT(sdata->reserved_chandef.width);
+
+ /*
+ * An in-place reservation context should not have any assigned vifs
+ * until it replaces the other context.
+ */
+ WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER &&
+ !list_empty(&ctx->assigned_vifs));
+
+ list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list)
+ if (sdata->radar_required)
+ radar_detect |= BIT(sdata->vif.bss_conf.chandef.width);
+
+ return radar_detect;
+}
+
+int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
+ const struct cfg80211_chan_def *chandef,
+ enum ieee80211_chanctx_mode chanmode,
+ u8 radar_detect)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *sdata_iter;
+ enum nl80211_iftype iftype = sdata->wdev.iftype;
+ int num[NUM_NL80211_IFTYPES];
+ struct ieee80211_chanctx *ctx;
+ int num_different_channels = 0;
+ int total = 1;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ if (WARN_ON(hweight32(radar_detect) > 1))
+ return -EINVAL;
+
+ if (WARN_ON(chandef && chanmode == IEEE80211_CHANCTX_SHARED &&
+ !chandef->chan))
+ return -EINVAL;
+
+ if (chandef)
+ num_different_channels = 1;
+
+ if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
+ return -EINVAL;
+
+ /* Always allow software iftypes */
+ if (local->hw.wiphy->software_iftypes & BIT(iftype)) {
+ if (radar_detect)
+ return -EINVAL;
+ return 0;
+ }
+
+ memset(num, 0, sizeof(num));
+
+ if (iftype != NL80211_IFTYPE_UNSPECIFIED)
+ num[iftype] = 1;
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ continue;
+ radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE) {
+ num_different_channels++;
+ continue;
+ }
+ if (chandef && chanmode == IEEE80211_CHANCTX_SHARED &&
+ cfg80211_chandef_compatible(chandef,
+ &ctx->conf.def))
+ continue;
+ num_different_channels++;
+ }
+
+ list_for_each_entry_rcu(sdata_iter, &local->interfaces, list) {
+ struct wireless_dev *wdev_iter;
+
+ wdev_iter = &sdata_iter->wdev;
+
+ if (sdata_iter == sdata ||
+ !ieee80211_sdata_running(sdata_iter) ||
+ local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype))
+ continue;
+
+ num[wdev_iter->iftype]++;
+ total++;
+ }
+
+ if (total == 1 && !radar_detect)
+ return 0;
+
+ return cfg80211_check_combinations(local->hw.wiphy,
+ num_different_channels,
+ radar_detect, num);
+}
+
+static void
+ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c,
+ void *data)
+{
+ u32 *max_num_different_channels = data;
+
+ *max_num_different_channels = max(*max_num_different_channels,
+ c->num_different_channels);
+}
+
+int ieee80211_max_num_channels(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata;
+ int num[NUM_NL80211_IFTYPES] = {};
+ struct ieee80211_chanctx *ctx;
+ int num_different_channels = 0;
+ u8 radar_detect = 0;
+ u32 max_num_different_channels = 1;
+ int err;
+
+ lockdep_assert_held(&local->chanctx_mtx);
+
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
+ continue;
+
+ num_different_channels++;
+
+ radar_detect |= ieee80211_chanctx_radar_detect(local, ctx);
+ }
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list)
+ num[sdata->wdev.iftype]++;
+
+ err = cfg80211_iter_combinations(local->hw.wiphy,
+ num_different_channels, radar_detect,
+ num, ieee80211_iter_max_chans,
+ &max_num_different_channels);
+ if (err < 0)
+ return err;
+
+ return max_num_different_channels;
+}
+
+u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo)
+{
+ *buf++ = WLAN_EID_VENDOR_SPECIFIC;
+ *buf++ = 7; /* len */
+ *buf++ = 0x00; /* Microsoft OUI 00:50:F2 */
+ *buf++ = 0x50;
+ *buf++ = 0xf2;
+ *buf++ = 2; /* WME */
+ *buf++ = 0; /* WME info */
+ *buf++ = 1; /* WME ver */
+ *buf++ = qosinfo; /* U-APSD no in use */
+
+ return buf;
+}
+
+void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta,
+ struct txq_info *txqi, int tid)
+{
+ skb_queue_head_init(&txqi->queue);
+ txqi->txq.vif = &sdata->vif;
+
+ if (sta) {
+ txqi->txq.sta = &sta->sta;
+ sta->sta.txq[tid] = &txqi->txq;
+ txqi->txq.ac = ieee802_1d_to_ac[tid & 7];
+ } else {
+ sdata->vif.txq = &txqi->txq;
+ txqi->txq.ac = IEEE80211_AC_BE;
+ }
+}
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
new file mode 100644
index 000000000..80694d55d
--- /dev/null
+++ b/net/mac80211/vht.c
@@ -0,0 +1,424 @@
+/*
+ * VHT handling
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/export.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+
+
+static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_vht_cap *vht_cap,
+ u32 flag)
+{
+ __le32 le_flag = cpu_to_le32(flag);
+
+ if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag &&
+ !(sdata->u.mgd.vht_capa.vht_cap_info & le_flag))
+ vht_cap->cap &= ~flag;
+}
+
+void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta_vht_cap *vht_cap)
+{
+ int i;
+ u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n;
+
+ if (!vht_cap->vht_supported)
+ return;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return;
+
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_RXLDPC);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_SHORT_GI_80);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_SHORT_GI_160);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_TXSTBC);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN);
+ __check_vhtcap_disable(sdata, vht_cap,
+ IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN);
+
+ /* Allow user to decrease AMPDU length exponent */
+ if (sdata->u.mgd.vht_capa_mask.vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) {
+ u32 cap, n;
+
+ n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) &
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK;
+ n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT;
+ cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK;
+ cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT;
+
+ if (n < cap) {
+ vht_cap->cap &=
+ ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK;
+ vht_cap->cap |=
+ n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT;
+ }
+ }
+
+ /* Allow the user to decrease MCSes */
+ rxmcs_mask =
+ le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map);
+ rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map);
+ rxmcs_n &= rxmcs_mask;
+ rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map);
+
+ txmcs_mask =
+ le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map);
+ txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map);
+ txmcs_n &= txmcs_mask;
+ txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map);
+ for (i = 0; i < 8; i++) {
+ u8 m, n, c;
+
+ m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) ||
+ n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) {
+ rxmcs_cap &= ~(3 << 2*i);
+ rxmcs_cap |= (rxmcs_n & (3 << 2*i));
+ }
+
+ m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) ||
+ n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) {
+ txmcs_cap &= ~(3 << 2*i);
+ txmcs_cap |= (txmcs_n & (3 << 2*i));
+ }
+ }
+ vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap);
+ vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap);
+}
+
+void
+ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_vht_cap *vht_cap_ie,
+ struct sta_info *sta)
+{
+ struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
+ struct ieee80211_sta_vht_cap own_cap;
+ u32 cap_info, i;
+
+ memset(vht_cap, 0, sizeof(*vht_cap));
+
+ if (!sta->sta.ht_cap.ht_supported)
+ return;
+
+ if (!vht_cap_ie || !sband->vht_cap.vht_supported)
+ return;
+
+ /*
+ * A VHT STA must support 40 MHz, but if we verify that here
+ * then we break a few things - some APs (e.g. Netgear R6300v2
+ * and others based on the BCM4360 chipset) will unset this
+ * capability bit when operating in 20 MHz.
+ */
+
+ vht_cap->vht_supported = true;
+
+ own_cap = sband->vht_cap;
+ /*
+ * If user has specified capability overrides, take care
+ * of that if the station we're setting up is the AP that
+ * we advertised a restricted capability set to. Override
+ * our own capabilities and then use those below.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !test_sta_flag(sta, WLAN_STA_TDLS_PEER))
+ ieee80211_apply_vhtcap_overrides(sdata, &own_cap);
+
+ /* take some capabilities as-is */
+ cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info);
+ vht_cap->cap = cap_info;
+ vht_cap->cap &= IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 |
+ IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 |
+ IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 |
+ IEEE80211_VHT_CAP_RXLDPC |
+ IEEE80211_VHT_CAP_VHT_TXOP_PS |
+ IEEE80211_VHT_CAP_HTC_VHT |
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK |
+ IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB |
+ IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB |
+ IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN |
+ IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN;
+
+ /* and some based on our own capabilities */
+ switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
+ vht_cap->cap |= cap_info &
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ;
+ break;
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
+ vht_cap->cap |= cap_info &
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+ break;
+ default:
+ /* nothing */
+ break;
+ }
+
+ /* symmetric capabilities */
+ vht_cap->cap |= cap_info & own_cap.cap &
+ (IEEE80211_VHT_CAP_SHORT_GI_80 |
+ IEEE80211_VHT_CAP_SHORT_GI_160);
+
+ /* remaining ones */
+ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE)
+ vht_cap->cap |= cap_info &
+ (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE |
+ IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK);
+
+ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)
+ vht_cap->cap |= cap_info &
+ (IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
+ IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK);
+
+ if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE)
+ vht_cap->cap |= cap_info &
+ IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
+
+ if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE)
+ vht_cap->cap |= cap_info &
+ IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE;
+
+ if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC)
+ vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK;
+
+ if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK)
+ vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC;
+
+ /* Copy peer MCS info, the driver might need them. */
+ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs,
+ sizeof(struct ieee80211_vht_mcs_info));
+
+ /* but also restrict MCSes */
+ for (i = 0; i < 8; i++) {
+ u16 own_rx, own_tx, peer_rx, peer_tx;
+
+ own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map);
+ own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map);
+ own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map);
+ peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map);
+ peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED;
+
+ if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED)
+ peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ else if (own_rx < peer_tx)
+ peer_tx = own_rx;
+ }
+
+ if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED)
+ peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED;
+ else if (own_tx < peer_rx)
+ peer_rx = own_tx;
+ }
+
+ vht_cap->vht_mcs.rx_mcs_map &=
+ ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2);
+ vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2);
+
+ vht_cap->vht_mcs.tx_mcs_map &=
+ ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2);
+ vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2);
+ }
+
+ /* finally set up the bandwidth */
+ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
+ case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+ break;
+ default:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+ }
+
+ sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+}
+
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
+{
+ struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
+ u32 cap_width;
+
+ if (!vht_cap->vht_supported)
+ return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+ IEEE80211_STA_RX_BW_40 :
+ IEEE80211_STA_RX_BW_20;
+
+ cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+
+ if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ ||
+ cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+ return IEEE80211_STA_RX_BW_160;
+
+ return IEEE80211_STA_RX_BW_80;
+}
+
+static enum ieee80211_sta_rx_bandwidth
+ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
+{
+ switch (width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ return IEEE80211_STA_RX_BW_20;
+ case NL80211_CHAN_WIDTH_40:
+ return IEEE80211_STA_RX_BW_40;
+ case NL80211_CHAN_WIDTH_80:
+ return IEEE80211_STA_RX_BW_80;
+ case NL80211_CHAN_WIDTH_160:
+ case NL80211_CHAN_WIDTH_80P80:
+ return IEEE80211_STA_RX_BW_160;
+ default:
+ WARN_ON_ONCE(1);
+ return IEEE80211_STA_RX_BW_20;
+ }
+}
+
+enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ enum ieee80211_sta_rx_bandwidth bw;
+
+ bw = ieee80211_chan_width_to_rx_bw(sdata->vif.bss_conf.chandef.width);
+ bw = min(bw, ieee80211_sta_cap_rx_bw(sta));
+ bw = min(bw, sta->cur_max_bandwidth);
+
+ return bw;
+}
+
+void ieee80211_sta_set_rx_nss(struct sta_info *sta)
+{
+ u8 ht_rx_nss = 0, vht_rx_nss = 0;
+
+ /* if we received a notification already don't overwrite it */
+ if (sta->sta.rx_nss)
+ return;
+
+ if (sta->sta.ht_cap.ht_supported) {
+ if (sta->sta.ht_cap.mcs.rx_mask[0])
+ ht_rx_nss++;
+ if (sta->sta.ht_cap.mcs.rx_mask[1])
+ ht_rx_nss++;
+ if (sta->sta.ht_cap.mcs.rx_mask[2])
+ ht_rx_nss++;
+ if (sta->sta.ht_cap.mcs.rx_mask[3])
+ ht_rx_nss++;
+ /* FIXME: consider rx_highest? */
+ }
+
+ if (sta->sta.vht_cap.vht_supported) {
+ int i;
+ u16 rx_mcs_map;
+
+ rx_mcs_map = le16_to_cpu(sta->sta.vht_cap.vht_mcs.rx_mcs_map);
+
+ for (i = 7; i >= 0; i--) {
+ u8 mcs = (rx_mcs_map >> (2 * i)) & 3;
+
+ if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ vht_rx_nss = i + 1;
+ break;
+ }
+ }
+ /* FIXME: consider rx_highest? */
+ }
+
+ ht_rx_nss = max(ht_rx_nss, vht_rx_nss);
+ sta->sta.rx_nss = max_t(u8, 1, ht_rx_nss);
+}
+
+u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, u8 opmode,
+ enum ieee80211_band band, bool nss_only)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ enum ieee80211_sta_rx_bandwidth new_bw;
+ u32 changed = 0;
+ u8 nss;
+
+ sband = local->hw.wiphy->bands[band];
+
+ /* ignore - no support for BF yet */
+ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)
+ return 0;
+
+ nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK;
+ nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT;
+ nss += 1;
+
+ if (sta->sta.rx_nss != nss) {
+ sta->sta.rx_nss = nss;
+ changed |= IEEE80211_RC_NSS_CHANGED;
+ }
+
+ if (nss_only)
+ return changed;
+
+ switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
+ case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
+ break;
+ case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40;
+ break;
+ case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+ break;
+ case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ:
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
+ break;
+ }
+
+ new_bw = ieee80211_sta_cur_vht_bw(sta);
+ if (new_bw != sta->sta.bandwidth) {
+ sta->sta.bandwidth = new_bw;
+ changed |= IEEE80211_RC_BW_CHANGED;
+ }
+
+ return changed;
+}
+
+void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, u8 opmode,
+ enum ieee80211_band band, bool nss_only)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+
+ u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode,
+ band, nss_only);
+
+ if (changed > 0)
+ rate_control_rate_update(local, sband, sta, changed);
+}
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
new file mode 100644
index 000000000..efa3f48f1
--- /dev/null
+++ b/net/mac80211/wep.c
@@ -0,0 +1,339 @@
+/*
+ * Software WEP encryption implementation
+ * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/compiler.h>
+#include <linux/crc32.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "wep.h"
+
+
+int ieee80211_wep_init(struct ieee80211_local *local)
+{
+ /* start WEP IV from a random value */
+ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
+
+ local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(local->wep_tx_tfm)) {
+ local->wep_rx_tfm = ERR_PTR(-EINVAL);
+ return PTR_ERR(local->wep_tx_tfm);
+ }
+
+ local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(local->wep_rx_tfm)) {
+ crypto_free_cipher(local->wep_tx_tfm);
+ local->wep_tx_tfm = ERR_PTR(-EINVAL);
+ return PTR_ERR(local->wep_rx_tfm);
+ }
+
+ return 0;
+}
+
+void ieee80211_wep_free(struct ieee80211_local *local)
+{
+ if (!IS_ERR(local->wep_tx_tfm))
+ crypto_free_cipher(local->wep_tx_tfm);
+ if (!IS_ERR(local->wep_rx_tfm))
+ crypto_free_cipher(local->wep_rx_tfm);
+}
+
+static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen)
+{
+ /*
+ * Fluhrer, Mantin, and Shamir have reported weaknesses in the
+ * key scheduling algorithm of RC4. At least IVs (KeyByte + 3,
+ * 0xff, N) can be used to speedup attacks, so avoid using them.
+ */
+ if ((iv & 0xff00) == 0xff00) {
+ u8 B = (iv >> 16) & 0xff;
+ if (B >= 3 && B < 3 + keylen)
+ return true;
+ }
+ return false;
+}
+
+
+static void ieee80211_wep_get_iv(struct ieee80211_local *local,
+ int keylen, int keyidx, u8 *iv)
+{
+ local->wep_iv++;
+ if (ieee80211_wep_weak_iv(local->wep_iv, keylen))
+ local->wep_iv += 0x0100;
+
+ if (!iv)
+ return;
+
+ *iv++ = (local->wep_iv >> 16) & 0xff;
+ *iv++ = (local->wep_iv >> 8) & 0xff;
+ *iv++ = local->wep_iv & 0xff;
+ *iv++ = keyidx << 6;
+}
+
+
+static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ int keylen, int keyidx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ unsigned int hdrlen;
+ u8 *newhdr;
+
+ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+
+ if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
+ return NULL;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN);
+ memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen);
+
+ /* the HW only needs room for the IV, but not the actual IV */
+ if (info->control.hw_key &&
+ (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
+ return newhdr + hdrlen;
+
+ ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen);
+ return newhdr + hdrlen;
+}
+
+
+static void ieee80211_wep_remove_iv(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ struct ieee80211_key *key)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ unsigned int hdrlen;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen);
+ skb_pull(skb, IEEE80211_WEP_IV_LEN);
+}
+
+
+/* Perform WEP encryption using given key. data buffer must have tailroom
+ * for 4-byte ICV. data_len must not include this ICV. Note: this function
+ * does _not_ add IV. data = RC4(data | CRC32(data)) */
+int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+ size_t klen, u8 *data, size_t data_len)
+{
+ __le32 icv;
+ int i;
+
+ if (IS_ERR(tfm))
+ return -1;
+
+ icv = cpu_to_le32(~crc32_le(~0, data, data_len));
+ put_unaligned(icv, (__le32 *)(data + data_len));
+
+ crypto_cipher_setkey(tfm, rc4key, klen);
+ for (i = 0; i < data_len + IEEE80211_WEP_ICV_LEN; i++)
+ crypto_cipher_encrypt_one(tfm, data + i, data + i);
+
+ return 0;
+}
+
+
+/* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the
+ * beginning of the buffer 4 bytes of extra space (ICV) in the end of the
+ * buffer will be added. Both IV and ICV will be transmitted, so the
+ * payload length increases with 8 bytes.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ */
+int ieee80211_wep_encrypt(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ const u8 *key, int keylen, int keyidx)
+{
+ u8 *iv;
+ size_t len;
+ u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+
+ if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN))
+ return -1;
+
+ iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
+ if (!iv)
+ return -1;
+
+ len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data);
+
+ /* Prepend 24-bit IV to RC4 key */
+ memcpy(rc4key, iv, 3);
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(rc4key + 3, key, keylen);
+
+ /* Add room for ICV */
+ skb_put(skb, IEEE80211_WEP_ICV_LEN);
+
+ return ieee80211_wep_encrypt_data(local->wep_tx_tfm, rc4key, keylen + 3,
+ iv + IEEE80211_WEP_IV_LEN, len);
+}
+
+
+/* Perform WEP decryption using given key. data buffer includes encrypted
+ * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV.
+ * Return 0 on success and -1 on ICV mismatch. */
+int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+ size_t klen, u8 *data, size_t data_len)
+{
+ __le32 crc;
+ int i;
+
+ if (IS_ERR(tfm))
+ return -1;
+
+ crypto_cipher_setkey(tfm, rc4key, klen);
+ for (i = 0; i < data_len + IEEE80211_WEP_ICV_LEN; i++)
+ crypto_cipher_decrypt_one(tfm, data + i, data + i);
+
+ crc = cpu_to_le32(~crc32_le(~0, data, data_len));
+ if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0)
+ /* ICV mismatch */
+ return -1;
+
+ return 0;
+}
+
+
+/* Perform WEP decryption on given skb. Buffer includes whole WEP part of
+ * the frame: IV (4 bytes), encrypted payload (including SNAP header),
+ * ICV (4 bytes). skb->len includes both IV and ICV.
+ *
+ * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
+ * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload
+ * is moved to the beginning of the skb and skb length will be reduced.
+ */
+static int ieee80211_wep_decrypt(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ struct ieee80211_key *key)
+{
+ u32 klen;
+ u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+ u8 keyidx;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ unsigned int hdrlen;
+ size_t len;
+ int ret = 0;
+
+ if (!ieee80211_has_protected(hdr->frame_control))
+ return -1;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN)
+ return -1;
+
+ len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN;
+
+ keyidx = skb->data[hdrlen + 3] >> 6;
+
+ if (!key || keyidx != key->conf.keyidx)
+ return -1;
+
+ klen = 3 + key->conf.keylen;
+
+ /* Prepend 24-bit IV to RC4 key */
+ memcpy(rc4key, skb->data + hdrlen, 3);
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(rc4key + 3, key->conf.key, key->conf.keylen);
+
+ if (ieee80211_wep_decrypt_data(local->wep_rx_tfm, rc4key, klen,
+ skb->data + hdrlen +
+ IEEE80211_WEP_IV_LEN, len))
+ ret = -1;
+
+ /* Trim ICV */
+ skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN);
+
+ /* Remove IV */
+ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen);
+ skb_pull(skb, IEEE80211_WEP_IV_LEN);
+
+ return ret;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ __le16 fc = hdr->frame_control;
+
+ if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc))
+ return RX_CONTINUE;
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+ if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key))
+ return RX_DROP_UNUSABLE;
+ } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) {
+ if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) +
+ IEEE80211_WEP_IV_LEN))
+ return RX_DROP_UNUSABLE;
+ ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
+ /* remove ICV */
+ if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN))
+ return RX_DROP_UNUSABLE;
+ }
+
+ return RX_CONTINUE;
+}
+
+static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_key_conf *hw_key = info->control.hw_key;
+
+ if (!hw_key) {
+ if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key,
+ tx->key->conf.keylen,
+ tx->key->conf.keyidx))
+ return -1;
+ } else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) ||
+ (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) {
+ if (!ieee80211_wep_add_iv(tx->local, skb,
+ tx->key->conf.keylen,
+ tx->key->conf.keyidx))
+ return -1;
+ }
+
+ return 0;
+}
+
+ieee80211_tx_result
+ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+
+ ieee80211_tx_set_protected(tx);
+
+ skb_queue_walk(&tx->skbs, skb) {
+ if (wep_encrypt_skb(tx, skb) < 0) {
+ I802_DEBUG_INC(tx->local->tx_handlers_drop_wep);
+ return TX_DROP;
+ }
+ }
+
+ return TX_CONTINUE;
+}
diff --git a/net/mac80211/wep.h b/net/mac80211/wep.h
new file mode 100644
index 000000000..9615749d1
--- /dev/null
+++ b/net/mac80211/wep.h
@@ -0,0 +1,34 @@
+/*
+ * Software WEP encryption implementation
+ * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef WEP_H
+#define WEP_H
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include "ieee80211_i.h"
+#include "key.h"
+
+int ieee80211_wep_init(struct ieee80211_local *local);
+void ieee80211_wep_free(struct ieee80211_local *local);
+int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+ size_t klen, u8 *data, size_t data_len);
+int ieee80211_wep_encrypt(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ const u8 *key, int keylen, int keyidx);
+int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+ size_t klen, u8 *data, size_t data_len);
+
+ieee80211_rx_result
+ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx);
+ieee80211_tx_result
+ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx);
+
+#endif /* WEP_H */
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
new file mode 100644
index 000000000..9eb0aee91
--- /dev/null
+++ b/net/mac80211/wme.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "wme.h"
+
+/* Default mapping in classifier to work with default
+ * queue setup.
+ */
+const int ieee802_1d_to_ac[8] = {
+ IEEE80211_AC_BE,
+ IEEE80211_AC_BK,
+ IEEE80211_AC_BK,
+ IEEE80211_AC_BE,
+ IEEE80211_AC_VI,
+ IEEE80211_AC_VI,
+ IEEE80211_AC_VO,
+ IEEE80211_AC_VO
+};
+
+static int wme_downgrade_ac(struct sk_buff *skb)
+{
+ switch (skb->priority) {
+ case 6:
+ case 7:
+ skb->priority = 5; /* VO -> VI */
+ return 0;
+ case 4:
+ case 5:
+ skb->priority = 3; /* VI -> BE */
+ return 0;
+ case 0:
+ case 3:
+ skb->priority = 2; /* BE -> BK */
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+/**
+ * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved
+ * @tid: the assumed-reserved TID
+ *
+ * Returns: the alternative TID to use, or 0 on error
+ */
+static inline u8 ieee80211_fix_reserved_tid(u8 tid)
+{
+ switch (tid) {
+ case 0:
+ return 3;
+ case 1:
+ return 2;
+ case 2:
+ return 1;
+ case 3:
+ return 0;
+ case 4:
+ return 5;
+ case 5:
+ return 4;
+ case 6:
+ return 7;
+ case 7:
+ return 6;
+ }
+
+ return 0;
+}
+
+static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta, struct sk_buff *skb)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ /* in case we are a client verify acm is not set for this ac */
+ while (sdata->wmm_acm & BIT(skb->priority)) {
+ int ac = ieee802_1d_to_ac[skb->priority];
+
+ if (ifmgd->tx_tspec[ac].admitted_time &&
+ skb->priority == ifmgd->tx_tspec[ac].up)
+ return ac;
+
+ if (wme_downgrade_ac(skb)) {
+ /*
+ * This should not really happen. The AP has marked all
+ * lower ACs to require admission control which is not
+ * a reasonable configuration. Allow the frame to be
+ * transmitted using AC_BK as a workaround.
+ */
+ break;
+ }
+ }
+
+ /* Check to see if this is a reserved TID */
+ if (sta && sta->reserved_tid == skb->priority)
+ skb->priority = ieee80211_fix_reserved_tid(skb->priority);
+
+ /* look up which queue to use for frames with this 1d tag */
+ return ieee802_1d_to_ac[skb->priority];
+}
+
+/* Indicate which queue to use for this fully formed 802.11 frame */
+u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_hdr *hdr)
+{
+ struct ieee80211_local *local = sdata->local;
+ u8 *p;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ return 0;
+
+ if (!ieee80211_is_data(hdr->frame_control)) {
+ skb->priority = 7;
+ return ieee802_1d_to_ac[skb->priority];
+ }
+ if (!ieee80211_is_data_qos(hdr->frame_control)) {
+ skb->priority = 0;
+ return ieee802_1d_to_ac[skb->priority];
+ }
+
+ p = ieee80211_get_qos_ctl(hdr);
+ skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK;
+
+ return ieee80211_downgrade_queue(sdata, NULL, skb);
+}
+
+/* Indicate which queue to use. */
+u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta = NULL;
+ const u8 *ra = NULL;
+ bool qos = false;
+ struct mac80211_qos_map *qos_map;
+ u16 ret;
+
+ if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) {
+ skb->priority = 0; /* required for correct WPA/11i MIC */
+ return 0;
+ }
+
+ rcu_read_lock();
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ sta = rcu_dereference(sdata->u.vlan.sta);
+ if (sta) {
+ qos = sta->sta.wme;
+ break;
+ }
+ case NL80211_IFTYPE_AP:
+ ra = skb->data;
+ break;
+ case NL80211_IFTYPE_WDS:
+ ra = sdata->u.wds.remote_addr;
+ break;
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ qos = true;
+ break;
+#endif
+ case NL80211_IFTYPE_STATION:
+ /* might be a TDLS station */
+ sta = sta_info_get(sdata, skb->data);
+ if (sta)
+ qos = sta->sta.wme;
+
+ ra = sdata->u.mgd.bssid;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ ra = skb->data;
+ break;
+ case NL80211_IFTYPE_OCB:
+ /* all stations are required to support WME */
+ qos = true;
+ break;
+ default:
+ break;
+ }
+
+ if (!sta && ra && !is_multicast_ether_addr(ra)) {
+ sta = sta_info_get(sdata, ra);
+ if (sta)
+ qos = sta->sta.wme;
+ }
+
+ if (!qos) {
+ skb->priority = 0; /* required for correct WPA/11i MIC */
+ ret = IEEE80211_AC_BE;
+ goto out;
+ }
+
+ if (skb->protocol == sdata->control_port_protocol) {
+ skb->priority = 7;
+ goto downgrade;
+ }
+
+ /* use the data classifier to determine what 802.1d tag the
+ * data frame has */
+ qos_map = rcu_dereference(sdata->qos_map);
+ skb->priority = cfg80211_classify8021d(skb, qos_map ?
+ &qos_map->qos_map : NULL);
+
+ downgrade:
+ ret = ieee80211_downgrade_queue(sdata, sta, skb);
+ out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * ieee80211_set_qos_hdr - Fill in the QoS header if there is one.
+ *
+ * @sdata: local subif
+ * @skb: packet to be updated
+ */
+void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ u8 *p;
+ u8 ack_policy, tid;
+
+ if (!ieee80211_is_data_qos(hdr->frame_control))
+ return;
+
+ p = ieee80211_get_qos_ctl(hdr);
+ tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+
+ /* preserve EOSP bit */
+ ack_policy = *p & IEEE80211_QOS_CTL_EOSP;
+
+ if (is_multicast_ether_addr(hdr->addr1) ||
+ sdata->noack_map & BIT(tid)) {
+ ack_policy |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK;
+ info->flags |= IEEE80211_TX_CTL_NO_ACK;
+ }
+
+ /* qos header is 2 bytes */
+ *p++ = ack_policy | tid;
+ if (ieee80211_vif_is_mesh(&sdata->vif)) {
+ /* preserve RSPI and Mesh PS Level bit */
+ *p &= ((IEEE80211_QOS_CTL_RSPI |
+ IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8);
+
+ /* Nulls don't have a mesh header (frame body) */
+ if (!ieee80211_is_qos_nullfunc(hdr->frame_control))
+ *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8);
+ } else {
+ *p = 0;
+ }
+}
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
new file mode 100644
index 000000000..80151edc5
--- /dev/null
+++ b/net/mac80211/wme.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _WME_H
+#define _WME_H
+
+#include <linux/netdevice.h>
+#include "ieee80211_i.h"
+
+u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_hdr *hdr);
+u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
+
+#endif /* _WME_H */
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
new file mode 100644
index 000000000..9d63d93c8
--- /dev/null
+++ b/net/mac80211/wpa.c
@@ -0,0 +1,1238 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/compiler.h>
+#include <linux/ieee80211.h>
+#include <linux/gfp.h>
+#include <asm/unaligned.h>
+#include <net/mac80211.h>
+#include <crypto/aes.h>
+
+#include "ieee80211_i.h"
+#include "michael.h"
+#include "tkip.h"
+#include "aes_ccm.h"
+#include "aes_cmac.h"
+#include "aes_gmac.h"
+#include "aes_gcm.h"
+#include "wpa.h"
+
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+ u8 *data, *key, *mic;
+ size_t data_len;
+ unsigned int hdrlen;
+ struct ieee80211_hdr *hdr;
+ struct sk_buff *skb = tx->skb;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int tail;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+ skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
+ return TX_CONTINUE;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ if (skb->len < hdrlen)
+ return TX_DROP;
+
+ data = skb->data + hdrlen;
+ data_len = skb->len - hdrlen;
+
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) {
+ /* Need to use software crypto for the test */
+ info->control.hw_key = NULL;
+ }
+
+ if (info->control.hw_key &&
+ (info->flags & IEEE80211_TX_CTL_DONTFRAG ||
+ tx->local->ops->set_frag_threshold) &&
+ !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)) {
+ /* hwaccel - with no need for SW-generated MMIC */
+ return TX_CONTINUE;
+ }
+
+ tail = MICHAEL_MIC_LEN;
+ if (!info->control.hw_key)
+ tail += IEEE80211_TKIP_ICV_LEN;
+
+ if (WARN(skb_tailroom(skb) < tail ||
+ skb_headroom(skb) < IEEE80211_TKIP_IV_LEN,
+ "mmic: not enough head/tail (%d/%d,%d/%d)\n",
+ skb_headroom(skb), IEEE80211_TKIP_IV_LEN,
+ skb_tailroom(skb), tail))
+ return TX_DROP;
+
+ key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY];
+ mic = skb_put(skb, MICHAEL_MIC_LEN);
+ michael_mic(key, hdr, data, data_len, mic);
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE))
+ mic[0]++;
+
+ return TX_CONTINUE;
+}
+
+
+ieee80211_rx_result
+ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
+{
+ u8 *data, *key = NULL;
+ size_t data_len;
+ unsigned int hdrlen;
+ u8 mic[MICHAEL_MIC_LEN];
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+ /*
+ * it makes no sense to check for MIC errors on anything other
+ * than data frames.
+ */
+ if (!ieee80211_is_data_present(hdr->frame_control))
+ return RX_CONTINUE;
+
+ /*
+ * No way to verify the MIC if the hardware stripped it or
+ * the IV with the key index. In this case we have solely rely
+ * on the driver to set RX_FLAG_MMIC_ERROR in the event of a
+ * MIC failure report.
+ */
+ if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) {
+ if (status->flag & RX_FLAG_MMIC_ERROR)
+ goto mic_fail_no_key;
+
+ if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key &&
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)
+ goto update_iv;
+
+ return RX_CONTINUE;
+ }
+
+ /*
+ * Some hardware seems to generate Michael MIC failure reports; even
+ * though, the frame was not encrypted with TKIP and therefore has no
+ * MIC. Ignore the flag them to avoid triggering countermeasures.
+ */
+ if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+ !(status->flag & RX_FLAG_DECRYPTED))
+ return RX_CONTINUE;
+
+ if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) {
+ /*
+ * APs with pairwise keys should never receive Michael MIC
+ * errors for non-zero keyidx because these are reserved for
+ * group keys and only the AP is sending real multicast
+ * frames in the BSS.
+ */
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (status->flag & RX_FLAG_MMIC_ERROR)
+ goto mic_fail;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ if (skb->len < hdrlen + MICHAEL_MIC_LEN)
+ return RX_DROP_UNUSABLE;
+
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+ hdr = (void *)skb->data;
+
+ data = skb->data + hdrlen;
+ data_len = skb->len - hdrlen - MICHAEL_MIC_LEN;
+ key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY];
+ michael_mic(key, hdr, data, data_len, mic);
+ if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0)
+ goto mic_fail;
+
+ /* remove Michael MIC from payload */
+ skb_trim(skb, skb->len - MICHAEL_MIC_LEN);
+
+update_iv:
+ /* update IV in key information to be able to detect replays */
+ rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip_iv32;
+ rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip_iv16;
+
+ return RX_CONTINUE;
+
+mic_fail:
+ rx->key->u.tkip.mic_failures++;
+
+mic_fail_no_key:
+ /*
+ * In some cases the key can be unset - e.g. a multicast packet, in
+ * a driver that supports HW encryption. Send up the key idx only if
+ * the key is set.
+ */
+ mac80211_ev_michael_mic_failure(rx->sdata,
+ rx->key ? rx->key->conf.keyidx : -1,
+ (void *) skb->data, NULL, GFP_ATOMIC);
+ return RX_DROP_UNUSABLE;
+}
+
+
+static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ unsigned int hdrlen;
+ int len, tail;
+ u8 *pos;
+
+ if (info->control.hw_key &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) {
+ /* hwaccel - with no need for software-generated IV */
+ return 0;
+ }
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ len = skb->len - hdrlen;
+
+ if (info->control.hw_key)
+ tail = 0;
+ else
+ tail = IEEE80211_TKIP_ICV_LEN;
+
+ if (WARN_ON(skb_tailroom(skb) < tail ||
+ skb_headroom(skb) < IEEE80211_TKIP_IV_LEN))
+ return -1;
+
+ pos = skb_push(skb, IEEE80211_TKIP_IV_LEN);
+ memmove(pos, pos + IEEE80211_TKIP_IV_LEN, hdrlen);
+ pos += hdrlen;
+
+ /* the HW only needs room for the IV, but not the actual IV */
+ if (info->control.hw_key &&
+ (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
+ return 0;
+
+ /* Increase IV for the frame */
+ spin_lock(&key->u.tkip.txlock);
+ key->u.tkip.tx.iv16++;
+ if (key->u.tkip.tx.iv16 == 0)
+ key->u.tkip.tx.iv32++;
+ pos = ieee80211_tkip_add_iv(pos, key);
+ spin_unlock(&key->u.tkip.txlock);
+
+ /* hwaccel - with software IV */
+ if (info->control.hw_key)
+ return 0;
+
+ /* Add room for ICV */
+ skb_put(skb, IEEE80211_TKIP_ICV_LEN);
+
+ return ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm,
+ key, skb, pos, len);
+}
+
+
+ieee80211_tx_result
+ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+
+ ieee80211_tx_set_protected(tx);
+
+ skb_queue_walk(&tx->skbs, skb) {
+ if (tkip_encrypt_skb(tx, skb) < 0)
+ return TX_DROP;
+ }
+
+ return TX_CONTINUE;
+}
+
+
+ieee80211_rx_result
+ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;
+ int hdrlen, res, hwaccel = 0;
+ struct ieee80211_key *key = rx->key;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return RX_CONTINUE;
+
+ if (!rx->sta || skb->len - hdrlen < 12)
+ return RX_DROP_UNUSABLE;
+
+ /* it may be possible to optimize this a bit more */
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+ hdr = (void *)skb->data;
+
+ /*
+ * Let TKIP code verify IV, but skip decryption.
+ * In the case where hardware checks the IV as well,
+ * we don't even get here, see ieee80211_rx_h_decrypt()
+ */
+ if (status->flag & RX_FLAG_DECRYPTED)
+ hwaccel = 1;
+
+ res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
+ key, skb->data + hdrlen,
+ skb->len - hdrlen, rx->sta->sta.addr,
+ hdr->addr1, hwaccel, rx->security_idx,
+ &rx->tkip_iv32,
+ &rx->tkip_iv16);
+ if (res != TKIP_DECRYPT_OK)
+ return RX_DROP_UNUSABLE;
+
+ /* Trim ICV */
+ skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);
+
+ /* Remove IV */
+ memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
+ skb_pull(skb, IEEE80211_TKIP_IV_LEN);
+
+ return RX_CONTINUE;
+}
+
+
+static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad)
+{
+ __le16 mask_fc;
+ int a4_included, mgmt;
+ u8 qos_tid;
+ u16 len_a;
+ unsigned int hdrlen;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+ /*
+ * Mask FC: zero subtype b4 b5 b6 (if not mgmt)
+ * Retry, PwrMgt, MoreData; set Protected
+ */
+ mgmt = ieee80211_is_mgmt(hdr->frame_control);
+ mask_fc = hdr->frame_control;
+ mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY |
+ IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA);
+ if (!mgmt)
+ mask_fc &= ~cpu_to_le16(0x0070);
+ mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ len_a = hdrlen - 2;
+ a4_included = ieee80211_has_a4(hdr->frame_control);
+
+ if (ieee80211_is_data_qos(hdr->frame_control))
+ qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+ else
+ qos_tid = 0;
+
+ /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC
+ * mode authentication are not allowed to collide, yet both are derived
+ * from this vector b_0. We only set L := 1 here to indicate that the
+ * data size can be represented in (L+1) bytes. The CCM layer will take
+ * care of storing the data length in the top (L+1) bytes and setting
+ * and clearing the other bits as is required to derive the two IVs.
+ */
+ b_0[0] = 0x1;
+
+ /* Nonce: Nonce Flags | A2 | PN
+ * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7)
+ */
+ b_0[1] = qos_tid | (mgmt << 4);
+ memcpy(&b_0[2], hdr->addr2, ETH_ALEN);
+ memcpy(&b_0[8], pn, IEEE80211_CCMP_PN_LEN);
+
+ /* AAD (extra authenticate-only data) / masked 802.11 header
+ * FC | A1 | A2 | A3 | SC | [A4] | [QC] */
+ put_unaligned_be16(len_a, &aad[0]);
+ put_unaligned(mask_fc, (__le16 *)&aad[2]);
+ memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN);
+
+ /* Mask Seq#, leave Frag# */
+ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f;
+ aad[23] = 0;
+
+ if (a4_included) {
+ memcpy(&aad[24], hdr->addr4, ETH_ALEN);
+ aad[30] = qos_tid;
+ aad[31] = 0;
+ } else {
+ memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN);
+ aad[24] = qos_tid;
+ }
+}
+
+
+static inline void ccmp_pn2hdr(u8 *hdr, u8 *pn, int key_id)
+{
+ hdr[0] = pn[5];
+ hdr[1] = pn[4];
+ hdr[2] = 0;
+ hdr[3] = 0x20 | (key_id << 6);
+ hdr[4] = pn[3];
+ hdr[5] = pn[2];
+ hdr[6] = pn[1];
+ hdr[7] = pn[0];
+}
+
+
+static inline void ccmp_hdr2pn(u8 *pn, u8 *hdr)
+{
+ pn[0] = hdr[7];
+ pn[1] = hdr[6];
+ pn[2] = hdr[5];
+ pn[3] = hdr[4];
+ pn[4] = hdr[1];
+ pn[5] = hdr[0];
+}
+
+
+static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
+ unsigned int mic_len)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int hdrlen, len, tail;
+ u8 *pos;
+ u8 pn[6];
+ u64 pn64;
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 b_0[AES_BLOCK_SIZE];
+
+ if (info->control.hw_key &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
+ !((info->control.hw_key->flags &
+ IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) &&
+ ieee80211_is_mgmt(hdr->frame_control))) {
+ /*
+ * hwaccel has no need for preallocated room for CCMP
+ * header or MIC fields
+ */
+ return 0;
+ }
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ len = skb->len - hdrlen;
+
+ if (info->control.hw_key)
+ tail = 0;
+ else
+ tail = mic_len;
+
+ if (WARN_ON(skb_tailroom(skb) < tail ||
+ skb_headroom(skb) < IEEE80211_CCMP_HDR_LEN))
+ return -1;
+
+ pos = skb_push(skb, IEEE80211_CCMP_HDR_LEN);
+ memmove(pos, pos + IEEE80211_CCMP_HDR_LEN, hdrlen);
+
+ /* the HW only needs room for the IV, but not the actual IV */
+ if (info->control.hw_key &&
+ (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
+ return 0;
+
+ hdr = (struct ieee80211_hdr *) pos;
+ pos += hdrlen;
+
+ pn64 = atomic64_inc_return(&key->u.ccmp.tx_pn);
+
+ pn[5] = pn64;
+ pn[4] = pn64 >> 8;
+ pn[3] = pn64 >> 16;
+ pn[2] = pn64 >> 24;
+ pn[1] = pn64 >> 32;
+ pn[0] = pn64 >> 40;
+
+ ccmp_pn2hdr(pos, pn, key->conf.keyidx);
+
+ /* hwaccel - with software CCMP header */
+ if (info->control.hw_key)
+ return 0;
+
+ pos += IEEE80211_CCMP_HDR_LEN;
+ ccmp_special_blocks(skb, pn, b_0, aad);
+ ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len,
+ skb_put(skb, mic_len), mic_len);
+
+ return 0;
+}
+
+
+ieee80211_tx_result
+ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx,
+ unsigned int mic_len)
+{
+ struct sk_buff *skb;
+
+ ieee80211_tx_set_protected(tx);
+
+ skb_queue_walk(&tx->skbs, skb) {
+ if (ccmp_encrypt_skb(tx, skb, mic_len) < 0)
+ return TX_DROP;
+ }
+
+ return TX_CONTINUE;
+}
+
+
+ieee80211_rx_result
+ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
+ unsigned int mic_len)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ int hdrlen;
+ struct ieee80211_key *key = rx->key;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ u8 pn[IEEE80211_CCMP_PN_LEN];
+ int data_len;
+ int queue;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ if (!ieee80211_is_data(hdr->frame_control) &&
+ !ieee80211_is_robust_mgmt_frame(skb))
+ return RX_CONTINUE;
+
+ data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len;
+ if (!rx->sta || data_len < 0)
+ return RX_DROP_UNUSABLE;
+
+ if (status->flag & RX_FLAG_DECRYPTED) {
+ if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN))
+ return RX_DROP_UNUSABLE;
+ } else {
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+ }
+
+ ccmp_hdr2pn(pn, skb->data + hdrlen);
+
+ queue = rx->security_idx;
+
+ if (memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN) <= 0) {
+ key->u.ccmp.replays++;
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 b_0[AES_BLOCK_SIZE];
+ /* hardware didn't decrypt/verify MIC */
+ ccmp_special_blocks(skb, pn, b_0, aad);
+
+ if (ieee80211_aes_ccm_decrypt(
+ key->u.ccmp.tfm, b_0, aad,
+ skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
+ data_len,
+ skb->data + skb->len - mic_len, mic_len))
+ return RX_DROP_UNUSABLE;
+ }
+
+ memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
+
+ /* Remove CCMP header and MIC */
+ if (pskb_trim(skb, skb->len - mic_len))
+ return RX_DROP_UNUSABLE;
+ memmove(skb->data + IEEE80211_CCMP_HDR_LEN, skb->data, hdrlen);
+ skb_pull(skb, IEEE80211_CCMP_HDR_LEN);
+
+ return RX_CONTINUE;
+}
+
+static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad)
+{
+ __le16 mask_fc;
+ u8 qos_tid;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+ memcpy(j_0, hdr->addr2, ETH_ALEN);
+ memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN);
+ j_0[13] = 0;
+ j_0[14] = 0;
+ j_0[AES_BLOCK_SIZE - 1] = 0x01;
+
+ /* AAD (extra authenticate-only data) / masked 802.11 header
+ * FC | A1 | A2 | A3 | SC | [A4] | [QC]
+ */
+ put_unaligned_be16(ieee80211_hdrlen(hdr->frame_control) - 2, &aad[0]);
+ /* Mask FC: zero subtype b4 b5 b6 (if not mgmt)
+ * Retry, PwrMgt, MoreData; set Protected
+ */
+ mask_fc = hdr->frame_control;
+ mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY |
+ IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA);
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ mask_fc &= ~cpu_to_le16(0x0070);
+ mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+
+ put_unaligned(mask_fc, (__le16 *)&aad[2]);
+ memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN);
+
+ /* Mask Seq#, leave Frag# */
+ aad[22] = *((u8 *)&hdr->seq_ctrl) & 0x0f;
+ aad[23] = 0;
+
+ if (ieee80211_is_data_qos(hdr->frame_control))
+ qos_tid = *ieee80211_get_qos_ctl(hdr) &
+ IEEE80211_QOS_CTL_TID_MASK;
+ else
+ qos_tid = 0;
+
+ if (ieee80211_has_a4(hdr->frame_control)) {
+ memcpy(&aad[24], hdr->addr4, ETH_ALEN);
+ aad[30] = qos_tid;
+ aad[31] = 0;
+ } else {
+ memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN);
+ aad[24] = qos_tid;
+ }
+}
+
+static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id)
+{
+ hdr[0] = pn[5];
+ hdr[1] = pn[4];
+ hdr[2] = 0;
+ hdr[3] = 0x20 | (key_id << 6);
+ hdr[4] = pn[3];
+ hdr[5] = pn[2];
+ hdr[6] = pn[1];
+ hdr[7] = pn[0];
+}
+
+static inline void gcmp_hdr2pn(u8 *pn, const u8 *hdr)
+{
+ pn[0] = hdr[7];
+ pn[1] = hdr[6];
+ pn[2] = hdr[5];
+ pn[3] = hdr[4];
+ pn[4] = hdr[1];
+ pn[5] = hdr[0];
+}
+
+static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int hdrlen, len, tail;
+ u8 *pos;
+ u8 pn[6];
+ u64 pn64;
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 j_0[AES_BLOCK_SIZE];
+
+ if (info->control.hw_key &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
+ !((info->control.hw_key->flags &
+ IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) &&
+ ieee80211_is_mgmt(hdr->frame_control))) {
+ /* hwaccel has no need for preallocated room for GCMP
+ * header or MIC fields
+ */
+ return 0;
+ }
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ len = skb->len - hdrlen;
+
+ if (info->control.hw_key)
+ tail = 0;
+ else
+ tail = IEEE80211_GCMP_MIC_LEN;
+
+ if (WARN_ON(skb_tailroom(skb) < tail ||
+ skb_headroom(skb) < IEEE80211_GCMP_HDR_LEN))
+ return -1;
+
+ pos = skb_push(skb, IEEE80211_GCMP_HDR_LEN);
+ memmove(pos, pos + IEEE80211_GCMP_HDR_LEN, hdrlen);
+ skb_set_network_header(skb, skb_network_offset(skb) +
+ IEEE80211_GCMP_HDR_LEN);
+
+ /* the HW only needs room for the IV, but not the actual IV */
+ if (info->control.hw_key &&
+ (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
+ return 0;
+
+ hdr = (struct ieee80211_hdr *)pos;
+ pos += hdrlen;
+
+ pn64 = atomic64_inc_return(&key->u.gcmp.tx_pn);
+
+ pn[5] = pn64;
+ pn[4] = pn64 >> 8;
+ pn[3] = pn64 >> 16;
+ pn[2] = pn64 >> 24;
+ pn[1] = pn64 >> 32;
+ pn[0] = pn64 >> 40;
+
+ gcmp_pn2hdr(pos, pn, key->conf.keyidx);
+
+ /* hwaccel - with software GCMP header */
+ if (info->control.hw_key)
+ return 0;
+
+ pos += IEEE80211_GCMP_HDR_LEN;
+ gcmp_special_blocks(skb, pn, j_0, aad);
+ ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len,
+ skb_put(skb, IEEE80211_GCMP_MIC_LEN));
+
+ return 0;
+}
+
+ieee80211_tx_result
+ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+
+ ieee80211_tx_set_protected(tx);
+
+ skb_queue_walk(&tx->skbs, skb) {
+ if (gcmp_encrypt_skb(tx, skb) < 0)
+ return TX_DROP;
+ }
+
+ return TX_CONTINUE;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ int hdrlen;
+ struct ieee80211_key *key = rx->key;
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ u8 pn[IEEE80211_GCMP_PN_LEN];
+ int data_len;
+ int queue;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ if (!ieee80211_is_data(hdr->frame_control) &&
+ !ieee80211_is_robust_mgmt_frame(skb))
+ return RX_CONTINUE;
+
+ data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN -
+ IEEE80211_GCMP_MIC_LEN;
+ if (!rx->sta || data_len < 0)
+ return RX_DROP_UNUSABLE;
+
+ if (status->flag & RX_FLAG_DECRYPTED) {
+ if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN))
+ return RX_DROP_UNUSABLE;
+ } else {
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+ }
+
+ gcmp_hdr2pn(pn, skb->data + hdrlen);
+
+ queue = rx->security_idx;
+
+ if (memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN) <= 0) {
+ key->u.gcmp.replays++;
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ u8 aad[2 * AES_BLOCK_SIZE];
+ u8 j_0[AES_BLOCK_SIZE];
+ /* hardware didn't decrypt/verify MIC */
+ gcmp_special_blocks(skb, pn, j_0, aad);
+
+ if (ieee80211_aes_gcm_decrypt(
+ key->u.gcmp.tfm, j_0, aad,
+ skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
+ data_len,
+ skb->data + skb->len - IEEE80211_GCMP_MIC_LEN))
+ return RX_DROP_UNUSABLE;
+ }
+
+ memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
+
+ /* Remove GCMP header and MIC */
+ if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN))
+ return RX_DROP_UNUSABLE;
+ memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen);
+ skb_pull(skb, IEEE80211_GCMP_HDR_LEN);
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_tx_result
+ieee80211_crypto_cs_encrypt(struct ieee80211_tx_data *tx,
+ struct sk_buff *skb)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ int hdrlen;
+ u8 *pos, iv_len = key->conf.iv_len;
+
+ if (info->control.hw_key &&
+ !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) {
+ /* hwaccel has no need for preallocated head room */
+ return TX_CONTINUE;
+ }
+
+ if (unlikely(skb_headroom(skb) < iv_len &&
+ pskb_expand_head(skb, iv_len, 0, GFP_ATOMIC)))
+ return TX_DROP;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ pos = skb_push(skb, iv_len);
+ memmove(pos, pos + iv_len, hdrlen);
+
+ return TX_CONTINUE;
+}
+
+static inline int ieee80211_crypto_cs_pn_compare(u8 *pn1, u8 *pn2, int len)
+{
+ int i;
+
+ /* pn is little endian */
+ for (i = len - 1; i >= 0; i--) {
+ if (pn1[i] < pn2[i])
+ return -1;
+ else if (pn1[i] > pn2[i])
+ return 1;
+ }
+
+ return 0;
+}
+
+static ieee80211_rx_result
+ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_key *key = rx->key;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ const struct ieee80211_cipher_scheme *cs = NULL;
+ int hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ int data_len;
+ u8 *rx_pn;
+ u8 *skb_pn;
+ u8 qos_tid;
+
+ if (!rx->sta || !rx->sta->cipher_scheme ||
+ !(status->flag & RX_FLAG_DECRYPTED))
+ return RX_DROP_UNUSABLE;
+
+ if (!ieee80211_is_data(hdr->frame_control))
+ return RX_CONTINUE;
+
+ cs = rx->sta->cipher_scheme;
+
+ data_len = rx->skb->len - hdrlen - cs->hdr_len;
+
+ if (data_len < 0)
+ return RX_DROP_UNUSABLE;
+
+ if (ieee80211_is_data_qos(hdr->frame_control))
+ qos_tid = *ieee80211_get_qos_ctl(hdr) &
+ IEEE80211_QOS_CTL_TID_MASK;
+ else
+ qos_tid = 0;
+
+ if (skb_linearize(rx->skb))
+ return RX_DROP_UNUSABLE;
+
+ hdr = (struct ieee80211_hdr *)rx->skb->data;
+
+ rx_pn = key->u.gen.rx_pn[qos_tid];
+ skb_pn = rx->skb->data + hdrlen + cs->pn_off;
+
+ if (ieee80211_crypto_cs_pn_compare(skb_pn, rx_pn, cs->pn_len) <= 0)
+ return RX_DROP_UNUSABLE;
+
+ memcpy(rx_pn, skb_pn, cs->pn_len);
+
+ /* remove security header and MIC */
+ if (pskb_trim(rx->skb, rx->skb->len - cs->mic_len))
+ return RX_DROP_UNUSABLE;
+
+ memmove(rx->skb->data + cs->hdr_len, rx->skb->data, hdrlen);
+ skb_pull(rx->skb, cs->hdr_len);
+
+ return RX_CONTINUE;
+}
+
+static void bip_aad(struct sk_buff *skb, u8 *aad)
+{
+ __le16 mask_fc;
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+ /* BIP AAD: FC(masked) || A1 || A2 || A3 */
+
+ /* FC type/subtype */
+ /* Mask FC Retry, PwrMgt, MoreData flags to zero */
+ mask_fc = hdr->frame_control;
+ mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM |
+ IEEE80211_FCTL_MOREDATA);
+ put_unaligned(mask_fc, (__le16 *) &aad[0]);
+ /* A1 || A2 || A3 */
+ memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN);
+}
+
+
+static inline void bip_ipn_set64(u8 *d, u64 pn)
+{
+ *d++ = pn;
+ *d++ = pn >> 8;
+ *d++ = pn >> 16;
+ *d++ = pn >> 24;
+ *d++ = pn >> 32;
+ *d = pn >> 40;
+}
+
+static inline void bip_ipn_swap(u8 *d, const u8 *s)
+{
+ *d++ = s[5];
+ *d++ = s[4];
+ *d++ = s[3];
+ *d++ = s[2];
+ *d++ = s[1];
+ *d = s[0];
+}
+
+
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_mmie *mmie;
+ u8 aad[20];
+ u64 pn64;
+
+ if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
+ return TX_DROP;
+
+ skb = skb_peek(&tx->skbs);
+
+ info = IEEE80211_SKB_CB(skb);
+
+ if (info->control.hw_key)
+ return TX_CONTINUE;
+
+ if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
+ return TX_DROP;
+
+ mmie = (struct ieee80211_mmie *) skb_put(skb, sizeof(*mmie));
+ mmie->element_id = WLAN_EID_MMIE;
+ mmie->length = sizeof(*mmie) - 2;
+ mmie->key_id = cpu_to_le16(key->conf.keyidx);
+
+ /* PN = PN + 1 */
+ pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn);
+
+ bip_ipn_set64(mmie->sequence_number, pn64);
+
+ bip_aad(skb, aad);
+
+ /*
+ * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64)
+ */
+ ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
+ skb->data + 24, skb->len - 24, mmie->mic);
+
+ return TX_CONTINUE;
+}
+
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_mmie_16 *mmie;
+ u8 aad[20];
+ u64 pn64;
+
+ if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
+ return TX_DROP;
+
+ skb = skb_peek(&tx->skbs);
+
+ info = IEEE80211_SKB_CB(skb);
+
+ if (info->control.hw_key)
+ return TX_CONTINUE;
+
+ if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
+ return TX_DROP;
+
+ mmie = (struct ieee80211_mmie_16 *)skb_put(skb, sizeof(*mmie));
+ mmie->element_id = WLAN_EID_MMIE;
+ mmie->length = sizeof(*mmie) - 2;
+ mmie->key_id = cpu_to_le16(key->conf.keyidx);
+
+ /* PN = PN + 1 */
+ pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn);
+
+ bip_ipn_set64(mmie->sequence_number, pn64);
+
+ bip_aad(skb, aad);
+
+ /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128)
+ */
+ ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
+ skb->data + 24, skb->len - 24, mmie->mic);
+
+ return TX_CONTINUE;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_key *key = rx->key;
+ struct ieee80211_mmie *mmie;
+ u8 aad[20], mic[8], ipn[6];
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ return RX_CONTINUE;
+
+ /* management frames are already linear */
+
+ if (skb->len < 24 + sizeof(*mmie))
+ return RX_DROP_UNUSABLE;
+
+ mmie = (struct ieee80211_mmie *)
+ (skb->data + skb->len - sizeof(*mmie));
+ if (mmie->element_id != WLAN_EID_MMIE ||
+ mmie->length != sizeof(*mmie) - 2)
+ return RX_DROP_UNUSABLE; /* Invalid MMIE */
+
+ bip_ipn_swap(ipn, mmie->sequence_number);
+
+ if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
+ key->u.aes_cmac.replays++;
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ /* hardware didn't decrypt/verify MIC */
+ bip_aad(skb, aad);
+ ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
+ skb->data + 24, skb->len - 24, mic);
+ if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ key->u.aes_cmac.icverrors++;
+ return RX_DROP_UNUSABLE;
+ }
+ }
+
+ memcpy(key->u.aes_cmac.rx_pn, ipn, 6);
+
+ /* Remove MMIE */
+ skb_trim(skb, skb->len - sizeof(*mmie));
+
+ return RX_CONTINUE;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_key *key = rx->key;
+ struct ieee80211_mmie_16 *mmie;
+ u8 aad[20], mic[16], ipn[6];
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ return RX_CONTINUE;
+
+ /* management frames are already linear */
+
+ if (skb->len < 24 + sizeof(*mmie))
+ return RX_DROP_UNUSABLE;
+
+ mmie = (struct ieee80211_mmie_16 *)
+ (skb->data + skb->len - sizeof(*mmie));
+ if (mmie->element_id != WLAN_EID_MMIE ||
+ mmie->length != sizeof(*mmie) - 2)
+ return RX_DROP_UNUSABLE; /* Invalid MMIE */
+
+ bip_ipn_swap(ipn, mmie->sequence_number);
+
+ if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
+ key->u.aes_cmac.replays++;
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ /* hardware didn't decrypt/verify MIC */
+ bip_aad(skb, aad);
+ ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
+ skb->data + 24, skb->len - 24, mic);
+ if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ key->u.aes_cmac.icverrors++;
+ return RX_DROP_UNUSABLE;
+ }
+ }
+
+ memcpy(key->u.aes_cmac.rx_pn, ipn, 6);
+
+ /* Remove MMIE */
+ skb_trim(skb, skb->len - sizeof(*mmie));
+
+ return RX_CONTINUE;
+}
+
+ieee80211_tx_result
+ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_tx_info *info;
+ struct ieee80211_key *key = tx->key;
+ struct ieee80211_mmie_16 *mmie;
+ struct ieee80211_hdr *hdr;
+ u8 aad[20];
+ u64 pn64;
+ u8 nonce[12];
+
+ if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
+ return TX_DROP;
+
+ skb = skb_peek(&tx->skbs);
+
+ info = IEEE80211_SKB_CB(skb);
+
+ if (info->control.hw_key)
+ return TX_CONTINUE;
+
+ if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
+ return TX_DROP;
+
+ mmie = (struct ieee80211_mmie_16 *)skb_put(skb, sizeof(*mmie));
+ mmie->element_id = WLAN_EID_MMIE;
+ mmie->length = sizeof(*mmie) - 2;
+ mmie->key_id = cpu_to_le16(key->conf.keyidx);
+
+ /* PN = PN + 1 */
+ pn64 = atomic64_inc_return(&key->u.aes_gmac.tx_pn);
+
+ bip_ipn_set64(mmie->sequence_number, pn64);
+
+ bip_aad(skb, aad);
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ memcpy(nonce, hdr->addr2, ETH_ALEN);
+ bip_ipn_swap(nonce + ETH_ALEN, mmie->sequence_number);
+
+ /* MIC = AES-GMAC(IGTK, AAD || Management Frame Body || MMIE, 128) */
+ if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
+ skb->data + 24, skb->len - 24, mmie->mic) < 0)
+ return TX_DROP;
+
+ return TX_CONTINUE;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff *skb = rx->skb;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_key *key = rx->key;
+ struct ieee80211_mmie_16 *mmie;
+ u8 aad[20], mic[16], ipn[6], nonce[12];
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+ if (!ieee80211_is_mgmt(hdr->frame_control))
+ return RX_CONTINUE;
+
+ /* management frames are already linear */
+
+ if (skb->len < 24 + sizeof(*mmie))
+ return RX_DROP_UNUSABLE;
+
+ mmie = (struct ieee80211_mmie_16 *)
+ (skb->data + skb->len - sizeof(*mmie));
+ if (mmie->element_id != WLAN_EID_MMIE ||
+ mmie->length != sizeof(*mmie) - 2)
+ return RX_DROP_UNUSABLE; /* Invalid MMIE */
+
+ bip_ipn_swap(ipn, mmie->sequence_number);
+
+ if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) {
+ key->u.aes_gmac.replays++;
+ return RX_DROP_UNUSABLE;
+ }
+
+ if (!(status->flag & RX_FLAG_DECRYPTED)) {
+ /* hardware didn't decrypt/verify MIC */
+ bip_aad(skb, aad);
+
+ memcpy(nonce, hdr->addr2, ETH_ALEN);
+ memcpy(nonce + ETH_ALEN, ipn, 6);
+
+ if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
+ skb->data + 24, skb->len - 24,
+ mic) < 0 ||
+ memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+ key->u.aes_gmac.icverrors++;
+ return RX_DROP_UNUSABLE;
+ }
+ }
+
+ memcpy(key->u.aes_gmac.rx_pn, ipn, 6);
+
+ /* Remove MMIE */
+ skb_trim(skb, skb->len - sizeof(*mmie));
+
+ return RX_CONTINUE;
+}
+
+ieee80211_tx_result
+ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx)
+{
+ struct sk_buff *skb;
+ struct ieee80211_tx_info *info = NULL;
+ ieee80211_tx_result res;
+
+ skb_queue_walk(&tx->skbs, skb) {
+ info = IEEE80211_SKB_CB(skb);
+
+ /* handle hw-only algorithm */
+ if (!info->control.hw_key)
+ return TX_DROP;
+
+ if (tx->key->flags & KEY_FLAG_CIPHER_SCHEME) {
+ res = ieee80211_crypto_cs_encrypt(tx, skb);
+ if (res != TX_CONTINUE)
+ return res;
+ }
+ }
+
+ ieee80211_tx_set_protected(tx);
+
+ return TX_CONTINUE;
+}
+
+ieee80211_rx_result
+ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx)
+{
+ if (rx->sta && rx->sta->cipher_scheme)
+ return ieee80211_crypto_cs_decrypt(rx);
+
+ return RX_DROP_UNUSABLE;
+}
diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h
new file mode 100644
index 000000000..d98011ee8
--- /dev/null
+++ b/net/mac80211/wpa.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef WPA_H
+#define WPA_H
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include "ieee80211_i.h"
+
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx);
+
+ieee80211_tx_result
+ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx);
+
+ieee80211_tx_result
+ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx,
+ unsigned int mic_len);
+ieee80211_rx_result
+ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
+ unsigned int mic_len);
+
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx);
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx);
+ieee80211_tx_result
+ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx);
+ieee80211_tx_result
+ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx);
+
+ieee80211_tx_result
+ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx);
+
+#endif /* WPA_H */
diff --git a/net/mac802154/Kconfig b/net/mac802154/Kconfig
new file mode 100644
index 000000000..aa462b480
--- /dev/null
+++ b/net/mac802154/Kconfig
@@ -0,0 +1,20 @@
+config MAC802154
+ tristate "Generic IEEE 802.15.4 Soft Networking Stack (mac802154)"
+ depends on IEEE802154
+ select CRC_CCITT
+ select CRYPTO_AUTHENC
+ select CRYPTO_CCM
+ select CRYPTO_CTR
+ select CRYPTO_AES
+ ---help---
+ This option enables the hardware independent IEEE 802.15.4
+ networking stack for SoftMAC devices (the ones implementing
+ only PHY level of IEEE 802.15.4 standard).
+
+ Note: this implementation is neither certified, nor feature
+ complete! Compatibility with other implementations hasn't
+ been tested yet!
+
+ If you plan to use HardMAC IEEE 802.15.4 devices, you can
+ say N here. Alternatively you can say M to compile it as
+ module.
diff --git a/net/mac802154/Makefile b/net/mac802154/Makefile
new file mode 100644
index 000000000..702d8b466
--- /dev/null
+++ b/net/mac802154/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_MAC802154) += mac802154.o
+mac802154-objs := main.o rx.o tx.o mac_cmd.o mib.o \
+ iface.o llsec.o util.o cfg.o
+
+ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
new file mode 100644
index 000000000..70be9c799
--- /dev/null
+++ b/net/mac802154/cfg.c
@@ -0,0 +1,231 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/mac80211/cfg.c
+ */
+
+#include <net/rtnetlink.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154_i.h"
+#include "driver-ops.h"
+#include "cfg.h"
+
+static struct net_device *
+ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy,
+ const char *name,
+ unsigned char name_assign_type, int type)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ struct net_device *dev;
+
+ rtnl_lock();
+ dev = ieee802154_if_add(local, name, name_assign_type, type,
+ cpu_to_le64(0x0000000000000000ULL));
+ rtnl_unlock();
+
+ return dev;
+}
+
+static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy,
+ struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ ieee802154_if_remove(sdata);
+}
+
+static int
+ieee802154_add_iface(struct wpan_phy *phy, const char *name,
+ unsigned char name_assign_type,
+ enum nl802154_iftype type, __le64 extended_addr)
+{
+ struct ieee802154_local *local = wpan_phy_priv(phy);
+ struct net_device *err;
+
+ err = ieee802154_if_add(local, name, name_assign_type, type,
+ extended_addr);
+ return PTR_ERR_OR_ZERO(err);
+}
+
+static int
+ieee802154_del_iface(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev)
+{
+ ieee802154_if_remove(IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev));
+
+ return 0;
+}
+
+static int
+ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ ASSERT_RTNL();
+
+ /* check if phy support this setting */
+ if (!(wpan_phy->channels_supported[page] & BIT(channel)))
+ return -EINVAL;
+
+ ret = drv_set_channel(local, page, channel);
+ if (!ret) {
+ wpan_phy->current_page = page;
+ wpan_phy->current_channel = channel;
+ }
+
+ return ret;
+}
+
+static int
+ieee802154_set_cca_mode(struct wpan_phy *wpan_phy,
+ const struct wpan_phy_cca *cca)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+ int ret;
+
+ ASSERT_RTNL();
+
+ /* check if phy support this setting */
+ if (!(local->hw.flags & IEEE802154_HW_CCA_MODE))
+ return -EOPNOTSUPP;
+
+ ret = drv_set_cca_mode(local, cca);
+ if (!ret)
+ wpan_phy->cca = *cca;
+
+ return ret;
+}
+
+static int
+ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 pan_id)
+{
+ ASSERT_RTNL();
+
+ /* TODO
+ * I am not sure about to check here on broadcast pan_id.
+ * Broadcast is a valid setting, comment from 802.15.4:
+ * If this value is 0xffff, the device is not associated.
+ *
+ * This could useful to simple deassociate an device.
+ */
+ if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ return -EINVAL;
+
+ wpan_dev->pan_id = pan_id;
+ return 0;
+}
+
+static int
+ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ u8 min_be, u8 max_be)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ ASSERT_RTNL();
+
+ if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
+ return -EOPNOTSUPP;
+
+ wpan_dev->min_be = min_be;
+ wpan_dev->max_be = max_be;
+ return 0;
+}
+
+static int
+ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ __le16 short_addr)
+{
+ ASSERT_RTNL();
+
+ /* TODO
+ * I am not sure about to check here on broadcast short_addr.
+ * Broadcast is a valid setting, comment from 802.15.4:
+ * A value of 0xfffe indicates that the device has
+ * associated but has not been allocated an address. A
+ * value of 0xffff indicates that the device does not
+ * have a short address.
+ *
+ * I think we should allow to set these settings but
+ * don't allow to allow socket communication with it.
+ */
+ if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) ||
+ short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST))
+ return -EINVAL;
+
+ wpan_dev->short_addr = short_addr;
+ return 0;
+}
+
+static int
+ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ u8 max_csma_backoffs)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ ASSERT_RTNL();
+
+ if (!(local->hw.flags & IEEE802154_HW_CSMA_PARAMS))
+ return -EOPNOTSUPP;
+
+ wpan_dev->csma_retries = max_csma_backoffs;
+ return 0;
+}
+
+static int
+ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ s8 max_frame_retries)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ ASSERT_RTNL();
+
+ if (!(local->hw.flags & IEEE802154_HW_FRAME_RETRIES))
+ return -EOPNOTSUPP;
+
+ wpan_dev->frame_retries = max_frame_retries;
+ return 0;
+}
+
+static int
+ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ bool mode)
+{
+ struct ieee802154_local *local = wpan_phy_priv(wpan_phy);
+
+ ASSERT_RTNL();
+
+ if (!(local->hw.flags & IEEE802154_HW_LBT))
+ return -EOPNOTSUPP;
+
+ wpan_dev->lbt = mode;
+ return 0;
+}
+
+const struct cfg802154_ops mac802154_config_ops = {
+ .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated,
+ .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated,
+ .add_virtual_intf = ieee802154_add_iface,
+ .del_virtual_intf = ieee802154_del_iface,
+ .set_channel = ieee802154_set_channel,
+ .set_cca_mode = ieee802154_set_cca_mode,
+ .set_pan_id = ieee802154_set_pan_id,
+ .set_short_addr = ieee802154_set_short_addr,
+ .set_backoff_exponent = ieee802154_set_backoff_exponent,
+ .set_max_csma_backoffs = ieee802154_set_max_csma_backoffs,
+ .set_max_frame_retries = ieee802154_set_max_frame_retries,
+ .set_lbt_mode = ieee802154_set_lbt_mode,
+};
diff --git a/net/mac802154/cfg.h b/net/mac802154/cfg.h
new file mode 100644
index 000000000..e2718f981
--- /dev/null
+++ b/net/mac802154/cfg.h
@@ -0,0 +1,9 @@
+/* mac802154 configuration hooks for cfg802154
+ */
+
+#ifndef __CFG_H
+#define __CFG_H
+
+extern const struct cfg802154_ops mac802154_config_ops;
+
+#endif /* __CFG_H */
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
new file mode 100644
index 000000000..a0533357b
--- /dev/null
+++ b/net/mac802154/driver-ops.h
@@ -0,0 +1,223 @@
+#ifndef __MAC802154_DRIVER_OPS
+#define __MAC802154_DRIVER_OPS
+
+#include <linux/types.h>
+#include <linux/rtnetlink.h>
+
+#include <net/mac802154.h>
+
+#include "ieee802154_i.h"
+
+static inline int
+drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb)
+{
+ return local->ops->xmit_async(&local->hw, skb);
+}
+
+static inline int
+drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb)
+{
+ /* don't allow other operations while sync xmit */
+ ASSERT_RTNL();
+
+ might_sleep();
+
+ return local->ops->xmit_sync(&local->hw, skb);
+}
+
+static inline int drv_start(struct ieee802154_local *local)
+{
+ might_sleep();
+
+ local->started = true;
+ smp_mb();
+
+ return local->ops->start(&local->hw);
+}
+
+static inline void drv_stop(struct ieee802154_local *local)
+{
+ might_sleep();
+
+ local->ops->stop(&local->hw);
+
+ /* sync away all work on the tasklet before clearing started */
+ tasklet_disable(&local->tasklet);
+ tasklet_enable(&local->tasklet);
+
+ barrier();
+
+ local->started = false;
+}
+
+static inline int
+drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel)
+{
+ might_sleep();
+
+ return local->ops->set_channel(&local->hw, page, channel);
+}
+
+static inline int drv_set_tx_power(struct ieee802154_local *local, s8 dbm)
+{
+ might_sleep();
+
+ if (!local->ops->set_txpower) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_txpower(&local->hw, dbm);
+}
+
+static inline int drv_set_cca_mode(struct ieee802154_local *local,
+ const struct wpan_phy_cca *cca)
+{
+ might_sleep();
+
+ if (!local->ops->set_cca_mode) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_cca_mode(&local->hw, cca);
+}
+
+static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode)
+{
+ might_sleep();
+
+ if (!local->ops->set_lbt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_lbt(&local->hw, mode);
+}
+
+static inline int
+drv_set_cca_ed_level(struct ieee802154_local *local, s32 ed_level)
+{
+ might_sleep();
+
+ if (!local->ops->set_cca_ed_level) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_cca_ed_level(&local->hw, ed_level);
+}
+
+static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id)
+{
+ struct ieee802154_hw_addr_filt filt;
+
+ might_sleep();
+
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.pan_id = pan_id;
+
+ return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_PANID_CHANGED);
+}
+
+static inline int
+drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr)
+{
+ struct ieee802154_hw_addr_filt filt;
+
+ might_sleep();
+
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.ieee_addr = extended_addr;
+
+ return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_IEEEADDR_CHANGED);
+}
+
+static inline int
+drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr)
+{
+ struct ieee802154_hw_addr_filt filt;
+
+ might_sleep();
+
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.short_addr = short_addr;
+
+ return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_SADDR_CHANGED);
+}
+
+static inline int
+drv_set_pan_coord(struct ieee802154_local *local, bool is_coord)
+{
+ struct ieee802154_hw_addr_filt filt;
+
+ might_sleep();
+
+ if (!local->ops->set_hw_addr_filt) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ filt.pan_coord = is_coord;
+
+ return local->ops->set_hw_addr_filt(&local->hw, &filt,
+ IEEE802154_AFILT_PANC_CHANGED);
+}
+
+static inline int
+drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be,
+ u8 max_csma_backoffs)
+{
+ might_sleep();
+
+ if (!local->ops->set_csma_params) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_csma_params(&local->hw, min_be, max_be,
+ max_csma_backoffs);
+}
+
+static inline int
+drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries)
+{
+ might_sleep();
+
+ if (!local->ops->set_frame_retries) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_frame_retries(&local->hw, max_frame_retries);
+}
+
+static inline int
+drv_set_promiscuous_mode(struct ieee802154_local *local, bool on)
+{
+ might_sleep();
+
+ if (!local->ops->set_promiscuous_mode) {
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+ }
+
+ return local->ops->set_promiscuous_mode(&local->hw, on);
+}
+
+#endif /* __MAC802154_DRIVER_OPS */
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
new file mode 100644
index 000000000..127ba1838
--- /dev/null
+++ b/net/mac802154/ieee802154_i.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+#ifndef __IEEE802154_I_H
+#define __IEEE802154_I_H
+
+#include <linux/mutex.h>
+#include <linux/hrtimer.h>
+#include <net/cfg802154.h>
+#include <net/mac802154.h>
+#include <net/nl802154.h>
+#include <net/ieee802154_netdev.h>
+
+#include "llsec.h"
+
+/* mac802154 device private data */
+struct ieee802154_local {
+ struct ieee802154_hw hw;
+ const struct ieee802154_ops *ops;
+
+ /* ieee802154 phy */
+ struct wpan_phy *phy;
+
+ int open_count;
+
+ /* As in mac80211 slaves list is modified:
+ * 1) under the RTNL
+ * 2) protected by slaves_mtx;
+ * 3) in an RCU manner
+ *
+ * So atomic readers can use any of this protection methods.
+ */
+ struct list_head interfaces;
+ struct mutex iflist_mtx;
+
+ /* This one is used for scanning and other jobs not to be interfered
+ * with serial driver.
+ */
+ struct workqueue_struct *workqueue;
+
+ struct hrtimer ifs_timer;
+
+ bool started;
+
+ struct tasklet_struct tasklet;
+ struct sk_buff_head skb_queue;
+};
+
+enum {
+ IEEE802154_RX_MSG = 1,
+};
+
+enum ieee802154_sdata_state_bits {
+ SDATA_STATE_RUNNING,
+};
+
+/* Slave interface definition.
+ *
+ * Slaves represent typical network interfaces available from userspace.
+ * Each ieee802154 device/transceiver may have several slaves and able
+ * to be associated with several networks at the same time.
+ */
+struct ieee802154_sub_if_data {
+ struct list_head list; /* the ieee802154_priv->slaves list */
+
+ struct wpan_dev wpan_dev;
+
+ struct ieee802154_local *local;
+ struct net_device *dev;
+
+ unsigned long state;
+ char name[IFNAMSIZ];
+
+ spinlock_t mib_lock;
+
+ /* protects sec from concurrent access by netlink. access by
+ * encrypt/decrypt/header_create safe without additional protection.
+ */
+ struct mutex sec_mtx;
+
+ struct mac802154_llsec sec;
+ /* must be last, dynamically sized area in this! */
+ struct ieee802154_vif vif;
+};
+
+#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */
+
+/* utility functions/constants */
+extern const void *const mac802154_wpan_phy_privid; /* for wpan_phy privid */
+
+static inline struct ieee802154_local *
+hw_to_local(struct ieee802154_hw *hw)
+{
+ return container_of(hw, struct ieee802154_local, hw);
+}
+
+static inline struct ieee802154_sub_if_data *
+IEEE802154_DEV_TO_SUB_IF(const struct net_device *dev)
+{
+ return netdev_priv(dev);
+}
+
+static inline struct ieee802154_sub_if_data *
+IEEE802154_WPAN_DEV_TO_SUB_IF(struct wpan_dev *wpan_dev)
+{
+ return container_of(wpan_dev, struct ieee802154_sub_if_data, wpan_dev);
+}
+
+static inline bool
+ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata)
+{
+ return test_bit(SDATA_STATE_RUNNING, &sdata->state);
+}
+
+extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
+
+netdev_tx_t
+ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t
+ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
+enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);
+
+/* MIB callbacks */
+void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val);
+__le16 mac802154_dev_get_short_addr(const struct net_device *dev);
+__le16 mac802154_dev_get_pan_id(const struct net_device *dev);
+void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val);
+void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);
+u8 mac802154_dev_get_dsn(const struct net_device *dev);
+
+int mac802154_get_params(struct net_device *dev,
+ struct ieee802154_llsec_params *params);
+int mac802154_set_params(struct net_device *dev,
+ const struct ieee802154_llsec_params *params,
+ int changed);
+
+int mac802154_add_key(struct net_device *dev,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key);
+int mac802154_del_key(struct net_device *dev,
+ const struct ieee802154_llsec_key_id *id);
+
+int mac802154_add_dev(struct net_device *dev,
+ const struct ieee802154_llsec_device *llsec_dev);
+int mac802154_del_dev(struct net_device *dev, __le64 dev_addr);
+
+int mac802154_add_devkey(struct net_device *dev,
+ __le64 device_addr,
+ const struct ieee802154_llsec_device_key *key);
+int mac802154_del_devkey(struct net_device *dev,
+ __le64 device_addr,
+ const struct ieee802154_llsec_device_key *key);
+
+int mac802154_add_seclevel(struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl);
+int mac802154_del_seclevel(struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl);
+
+void mac802154_lock_table(struct net_device *dev);
+void mac802154_get_table(struct net_device *dev,
+ struct ieee802154_llsec_table **t);
+void mac802154_unlock_table(struct net_device *dev);
+
+/* interface handling */
+int ieee802154_iface_init(void);
+void ieee802154_iface_exit(void);
+void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata);
+struct net_device *
+ieee802154_if_add(struct ieee802154_local *local, const char *name,
+ unsigned char name_assign_type, enum nl802154_iftype type,
+ __le64 extended_addr);
+void ieee802154_remove_interfaces(struct ieee802154_local *local);
+
+#endif /* __IEEE802154_I_H */
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
new file mode 100644
index 000000000..91b75abbd
--- /dev/null
+++ b/net/mac802154/iface.c
@@ -0,0 +1,651 @@
+/*
+ * Copyright 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/ieee802154.h>
+
+#include <net/nl802154.h>
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154_i.h"
+#include "driver-ops.h"
+
+static int mac802154_wpan_update_llsec(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ int rc = 0;
+
+ if (ops->llsec) {
+ struct ieee802154_llsec_params params;
+ int changed = 0;
+
+ params.pan_id = wpan_dev->pan_id;
+ changed |= IEEE802154_LLSEC_PARAM_PAN_ID;
+
+ params.hwaddr = wpan_dev->extended_addr;
+ changed |= IEEE802154_LLSEC_PARAM_HWADDR;
+
+ rc = ops->llsec->set_params(dev, &params, changed);
+ }
+
+ return rc;
+}
+
+static int
+mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct sockaddr_ieee802154 *sa =
+ (struct sockaddr_ieee802154 *)&ifr->ifr_addr;
+ int err = -ENOIOCTLCMD;
+
+ ASSERT_RTNL();
+
+ spin_lock_bh(&sdata->mib_lock);
+
+ switch (cmd) {
+ case SIOCGIFADDR:
+ {
+ u16 pan_id, short_addr;
+
+ pan_id = le16_to_cpu(wpan_dev->pan_id);
+ short_addr = le16_to_cpu(wpan_dev->short_addr);
+ if (pan_id == IEEE802154_PANID_BROADCAST ||
+ short_addr == IEEE802154_ADDR_BROADCAST) {
+ err = -EADDRNOTAVAIL;
+ break;
+ }
+
+ sa->family = AF_IEEE802154;
+ sa->addr.addr_type = IEEE802154_ADDR_SHORT;
+ sa->addr.pan_id = pan_id;
+ sa->addr.short_addr = short_addr;
+
+ err = 0;
+ break;
+ }
+ case SIOCSIFADDR:
+ if (netif_running(dev)) {
+ spin_unlock_bh(&sdata->mib_lock);
+ return -EBUSY;
+ }
+
+ dev_warn(&dev->dev,
+ "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n");
+ if (sa->family != AF_IEEE802154 ||
+ sa->addr.addr_type != IEEE802154_ADDR_SHORT ||
+ sa->addr.pan_id == IEEE802154_PANID_BROADCAST ||
+ sa->addr.short_addr == IEEE802154_ADDR_BROADCAST ||
+ sa->addr.short_addr == IEEE802154_ADDR_UNDEF) {
+ err = -EINVAL;
+ break;
+ }
+
+ wpan_dev->pan_id = cpu_to_le16(sa->addr.pan_id);
+ wpan_dev->short_addr = cpu_to_le16(sa->addr.short_addr);
+
+ err = mac802154_wpan_update_llsec(dev);
+ break;
+ }
+
+ spin_unlock_bh(&sdata->mib_lock);
+ return err;
+}
+
+static int mac802154_wpan_mac_addr(struct net_device *dev, void *p)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct sockaddr *addr = p;
+ __le64 extended_addr;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ ieee802154_be64_to_le64(&extended_addr, addr->sa_data);
+ if (!ieee802154_is_valid_extended_addr(extended_addr))
+ return -EINVAL;
+
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ sdata->wpan_dev.extended_addr = extended_addr;
+
+ return mac802154_wpan_update_llsec(dev);
+}
+
+static int mac802154_slave_open(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_local *local = sdata->local;
+ int res = 0;
+
+ ASSERT_RTNL();
+
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ if (!local->open_count) {
+ res = drv_start(local);
+ WARN_ON(res);
+ if (res)
+ goto err;
+ }
+
+ local->open_count++;
+ netif_start_queue(dev);
+ return 0;
+err:
+ /* might already be clear but that doesn't matter */
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ return res;
+}
+
+static int
+ieee802154_check_mac_settings(struct ieee802154_local *local,
+ struct wpan_dev *wpan_dev,
+ struct wpan_dev *nwpan_dev)
+{
+ ASSERT_RTNL();
+
+ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
+ if (wpan_dev->promiscuous_mode != nwpan_dev->promiscuous_mode)
+ return -EBUSY;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_AFILT) {
+ if (wpan_dev->pan_id != nwpan_dev->pan_id ||
+ wpan_dev->short_addr != nwpan_dev->short_addr ||
+ wpan_dev->extended_addr != nwpan_dev->extended_addr)
+ return -EBUSY;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) {
+ if (wpan_dev->min_be != nwpan_dev->min_be ||
+ wpan_dev->max_be != nwpan_dev->max_be ||
+ wpan_dev->csma_retries != nwpan_dev->csma_retries)
+ return -EBUSY;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) {
+ if (wpan_dev->frame_retries != nwpan_dev->frame_retries)
+ return -EBUSY;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_LBT) {
+ if (wpan_dev->lbt != nwpan_dev->lbt)
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int
+ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata,
+ enum nl802154_iftype iftype)
+{
+ struct ieee802154_local *local = sdata->local;
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct ieee802154_sub_if_data *nsdata;
+
+ /* we hold the RTNL here so can safely walk the list */
+ list_for_each_entry(nsdata, &local->interfaces, list) {
+ if (nsdata != sdata && ieee802154_sdata_running(nsdata)) {
+ int ret;
+
+ /* TODO currently we don't support multiple node types
+ * we need to run skb_clone at rx path. Check if there
+ * exist really an use case if we need to support
+ * multiple node types at the same time.
+ */
+ if (sdata->vif.type == NL802154_IFTYPE_NODE &&
+ nsdata->vif.type == NL802154_IFTYPE_NODE)
+ return -EBUSY;
+
+ /* check all phy mac sublayer settings are the same.
+ * We have only one phy, different values makes trouble.
+ */
+ ret = ieee802154_check_mac_settings(local, wpan_dev,
+ &nsdata->wpan_dev);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int mac802154_wpan_open(struct net_device *dev)
+{
+ int rc;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_local *local = sdata->local;
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct wpan_phy *phy = sdata->local->phy;
+
+ rc = ieee802154_check_concurrent_iface(sdata, sdata->vif.type);
+ if (rc < 0)
+ return rc;
+
+ rc = mac802154_slave_open(dev);
+ if (rc < 0)
+ return rc;
+
+ mutex_lock(&phy->pib_lock);
+
+ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) {
+ rc = drv_set_promiscuous_mode(local,
+ wpan_dev->promiscuous_mode);
+ if (rc < 0)
+ goto out;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_AFILT) {
+ rc = drv_set_pan_id(local, wpan_dev->pan_id);
+ if (rc < 0)
+ goto out;
+
+ rc = drv_set_extended_addr(local, wpan_dev->extended_addr);
+ if (rc < 0)
+ goto out;
+
+ rc = drv_set_short_addr(local, wpan_dev->short_addr);
+ if (rc < 0)
+ goto out;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_LBT) {
+ rc = drv_set_lbt_mode(local, wpan_dev->lbt);
+ if (rc < 0)
+ goto out;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) {
+ rc = drv_set_csma_params(local, wpan_dev->min_be,
+ wpan_dev->max_be,
+ wpan_dev->csma_retries);
+ if (rc < 0)
+ goto out;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) {
+ rc = drv_set_max_frame_retries(local, wpan_dev->frame_retries);
+ if (rc < 0)
+ goto out;
+ }
+
+ mutex_unlock(&phy->pib_lock);
+ return 0;
+
+out:
+ mutex_unlock(&phy->pib_lock);
+ return rc;
+}
+
+static int mac802154_slave_close(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_local *local = sdata->local;
+
+ ASSERT_RTNL();
+
+ hrtimer_cancel(&local->ifs_timer);
+
+ netif_stop_queue(dev);
+ local->open_count--;
+
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+ if (!local->open_count)
+ drv_stop(local);
+
+ return 0;
+}
+
+static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata,
+ struct ieee802154_hdr *hdr,
+ const struct ieee802154_mac_cb *cb)
+{
+ struct ieee802154_llsec_params params;
+ u8 level;
+
+ mac802154_llsec_get_params(&sdata->sec, &params);
+
+ if (!params.enabled && cb->secen_override && cb->secen)
+ return -EINVAL;
+ if (!params.enabled ||
+ (cb->secen_override && !cb->secen) ||
+ !params.out_level)
+ return 0;
+ if (cb->seclevel_override && !cb->seclevel)
+ return -EINVAL;
+
+ level = cb->seclevel_override ? cb->seclevel : params.out_level;
+
+ hdr->fc.security_enabled = 1;
+ hdr->sec.level = level;
+ hdr->sec.key_id_mode = params.out_key.mode;
+ if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX)
+ hdr->sec.short_src = params.out_key.short_source;
+ else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX)
+ hdr->sec.extended_src = params.out_key.extended_source;
+ hdr->sec.key_id = params.out_key.id;
+
+ return 0;
+}
+
+static int mac802154_header_create(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned short type,
+ const void *daddr,
+ const void *saddr,
+ unsigned len)
+{
+ struct ieee802154_hdr hdr;
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ struct ieee802154_mac_cb *cb = mac_cb(skb);
+ int hlen;
+
+ if (!daddr)
+ return -EINVAL;
+
+ memset(&hdr.fc, 0, sizeof(hdr.fc));
+ hdr.fc.type = cb->type;
+ hdr.fc.security_enabled = cb->secen;
+ hdr.fc.ack_request = cb->ackreq;
+ hdr.seq = ieee802154_mlme_ops(dev)->get_dsn(dev);
+
+ if (mac802154_set_header_security(sdata, &hdr, cb) < 0)
+ return -EINVAL;
+
+ if (!saddr) {
+ spin_lock_bh(&sdata->mib_lock);
+
+ if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) ||
+ wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) ||
+ wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) {
+ hdr.source.mode = IEEE802154_ADDR_LONG;
+ hdr.source.extended_addr = wpan_dev->extended_addr;
+ } else {
+ hdr.source.mode = IEEE802154_ADDR_SHORT;
+ hdr.source.short_addr = wpan_dev->short_addr;
+ }
+
+ hdr.source.pan_id = wpan_dev->pan_id;
+
+ spin_unlock_bh(&sdata->mib_lock);
+ } else {
+ hdr.source = *(const struct ieee802154_addr *)saddr;
+ }
+
+ hdr.dest = *(const struct ieee802154_addr *)daddr;
+
+ hlen = ieee802154_hdr_push(skb, &hdr);
+ if (hlen < 0)
+ return -EINVAL;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = hlen;
+
+ if (len > ieee802154_max_payload(&hdr))
+ return -EMSGSIZE;
+
+ return hlen;
+}
+
+static int
+mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ struct ieee802154_hdr hdr;
+ struct ieee802154_addr *addr = (struct ieee802154_addr *)haddr;
+
+ if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) {
+ pr_debug("malformed packet\n");
+ return 0;
+ }
+
+ *addr = hdr.source;
+ return sizeof(*addr);
+}
+
+static struct header_ops mac802154_header_ops = {
+ .create = mac802154_header_create,
+ .parse = mac802154_header_parse,
+};
+
+static const struct net_device_ops mac802154_wpan_ops = {
+ .ndo_open = mac802154_wpan_open,
+ .ndo_stop = mac802154_slave_close,
+ .ndo_start_xmit = ieee802154_subif_start_xmit,
+ .ndo_do_ioctl = mac802154_wpan_ioctl,
+ .ndo_set_mac_address = mac802154_wpan_mac_addr,
+};
+
+static const struct net_device_ops mac802154_monitor_ops = {
+ .ndo_open = mac802154_wpan_open,
+ .ndo_stop = mac802154_slave_close,
+ .ndo_start_xmit = ieee802154_monitor_start_xmit,
+};
+
+static void mac802154_wpan_free(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ mac802154_llsec_destroy(&sdata->sec);
+
+ free_netdev(dev);
+}
+
+static void ieee802154_if_setup(struct net_device *dev)
+{
+ dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN;
+ memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN);
+
+ dev->hard_header_len = MAC802154_FRAME_HARD_HEADER_LEN;
+ dev->needed_tailroom = 2 + 16; /* FCS + MIC */
+ dev->mtu = IEEE802154_MTU;
+ dev->tx_queue_len = 300;
+ dev->flags = IFF_NOARP | IFF_BROADCAST;
+}
+
+static int
+ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata,
+ enum nl802154_iftype type)
+{
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+
+ /* set some type-dependent values */
+ sdata->vif.type = type;
+ sdata->wpan_dev.iftype = type;
+
+ get_random_bytes(&wpan_dev->bsn, 1);
+ get_random_bytes(&wpan_dev->dsn, 1);
+
+ /* defaults per 802.15.4-2011 */
+ wpan_dev->min_be = 3;
+ wpan_dev->max_be = 5;
+ wpan_dev->csma_retries = 4;
+ /* for compatibility, actual default is 3 */
+ wpan_dev->frame_retries = -1;
+
+ wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST);
+ wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST);
+
+ switch (type) {
+ case NL802154_IFTYPE_NODE:
+ ieee802154_be64_to_le64(&wpan_dev->extended_addr,
+ sdata->dev->dev_addr);
+
+ sdata->dev->header_ops = &mac802154_header_ops;
+ sdata->dev->destructor = mac802154_wpan_free;
+ sdata->dev->netdev_ops = &mac802154_wpan_ops;
+ sdata->dev->ml_priv = &mac802154_mlme_wpan;
+ wpan_dev->promiscuous_mode = false;
+
+ spin_lock_init(&sdata->mib_lock);
+ mutex_init(&sdata->sec_mtx);
+
+ mac802154_llsec_init(&sdata->sec);
+ break;
+ case NL802154_IFTYPE_MONITOR:
+ sdata->dev->destructor = free_netdev;
+ sdata->dev->netdev_ops = &mac802154_monitor_ops;
+ wpan_dev->promiscuous_mode = true;
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
+}
+
+struct net_device *
+ieee802154_if_add(struct ieee802154_local *local, const char *name,
+ unsigned char name_assign_type, enum nl802154_iftype type,
+ __le64 extended_addr)
+{
+ struct net_device *ndev = NULL;
+ struct ieee802154_sub_if_data *sdata = NULL;
+ int ret = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ ndev = alloc_netdev(sizeof(*sdata) + local->hw.vif_data_size, name,
+ name_assign_type, ieee802154_if_setup);
+ if (!ndev)
+ return ERR_PTR(-ENOMEM);
+
+ ndev->needed_headroom = local->hw.extra_tx_headroom;
+
+ ret = dev_alloc_name(ndev, ndev->name);
+ if (ret < 0)
+ goto err;
+
+ ieee802154_le64_to_be64(ndev->perm_addr,
+ &local->hw.phy->perm_extended_addr);
+ switch (type) {
+ case NL802154_IFTYPE_NODE:
+ ndev->type = ARPHRD_IEEE802154;
+ if (ieee802154_is_valid_extended_addr(extended_addr))
+ ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr);
+ else
+ memcpy(ndev->dev_addr, ndev->perm_addr,
+ IEEE802154_EXTENDED_ADDR_LEN);
+ break;
+ case NL802154_IFTYPE_MONITOR:
+ ndev->type = ARPHRD_IEEE802154_MONITOR;
+ break;
+ default:
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* TODO check this */
+ SET_NETDEV_DEV(ndev, &local->phy->dev);
+ sdata = netdev_priv(ndev);
+ ndev->ieee802154_ptr = &sdata->wpan_dev;
+ memcpy(sdata->name, ndev->name, IFNAMSIZ);
+ sdata->dev = ndev;
+ sdata->wpan_dev.wpan_phy = local->hw.phy;
+ sdata->local = local;
+
+ /* setup type-dependent data */
+ ret = ieee802154_setup_sdata(sdata, type);
+ if (ret)
+ goto err;
+
+ ret = register_netdevice(ndev);
+ if (ret < 0)
+ goto err;
+
+ mutex_lock(&local->iflist_mtx);
+ list_add_tail_rcu(&sdata->list, &local->interfaces);
+ mutex_unlock(&local->iflist_mtx);
+
+ return ndev;
+
+err:
+ free_netdev(ndev);
+ return ERR_PTR(ret);
+}
+
+void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&sdata->local->iflist_mtx);
+ list_del_rcu(&sdata->list);
+ mutex_unlock(&sdata->local->iflist_mtx);
+
+ synchronize_rcu();
+ unregister_netdevice(sdata->dev);
+}
+
+void ieee802154_remove_interfaces(struct ieee802154_local *local)
+{
+ struct ieee802154_sub_if_data *sdata, *tmp;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
+ list_del(&sdata->list);
+
+ unregister_netdevice(sdata->dev);
+ }
+ mutex_unlock(&local->iflist_mtx);
+}
+
+static int netdev_notify(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct ieee802154_sub_if_data *sdata;
+
+ if (state != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
+
+ if (!dev->ieee802154_ptr || !dev->ieee802154_ptr->wpan_phy)
+ return NOTIFY_DONE;
+
+ if (dev->ieee802154_ptr->wpan_phy->privid != mac802154_wpan_phy_privid)
+ return NOTIFY_DONE;
+
+ sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ memcpy(sdata->name, dev->name, IFNAMSIZ);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mac802154_netdev_notifier = {
+ .notifier_call = netdev_notify,
+};
+
+int ieee802154_iface_init(void)
+{
+ return register_netdevice_notifier(&mac802154_netdev_notifier);
+}
+
+void ieee802154_iface_exit(void)
+{
+ unregister_netdevice_notifier(&mac802154_netdev_notifier);
+}
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
new file mode 100644
index 000000000..5b2be1283
--- /dev/null
+++ b/net/mac802154/llsec.c
@@ -0,0 +1,1056 @@
+/*
+ * Copyright (C) 2014 Fraunhofer ITWM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
+ */
+
+#include <linux/err.h>
+#include <linux/bug.h>
+#include <linux/completion.h>
+#include <linux/ieee802154.h>
+#include <crypto/algapi.h>
+
+#include "ieee802154_i.h"
+#include "llsec.h"
+
+static void llsec_key_put(struct mac802154_llsec_key *key);
+static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a,
+ const struct ieee802154_llsec_key_id *b);
+
+static void llsec_dev_free(struct mac802154_llsec_device *dev);
+
+void mac802154_llsec_init(struct mac802154_llsec *sec)
+{
+ memset(sec, 0, sizeof(*sec));
+
+ memset(&sec->params.default_key_source, 0xFF, IEEE802154_ADDR_LEN);
+
+ INIT_LIST_HEAD(&sec->table.security_levels);
+ INIT_LIST_HEAD(&sec->table.devices);
+ INIT_LIST_HEAD(&sec->table.keys);
+ hash_init(sec->devices_short);
+ hash_init(sec->devices_hw);
+ rwlock_init(&sec->lock);
+}
+
+void mac802154_llsec_destroy(struct mac802154_llsec *sec)
+{
+ struct ieee802154_llsec_seclevel *sl, *sn;
+ struct ieee802154_llsec_device *dev, *dn;
+ struct ieee802154_llsec_key_entry *key, *kn;
+
+ list_for_each_entry_safe(sl, sn, &sec->table.security_levels, list) {
+ struct mac802154_llsec_seclevel *msl;
+
+ msl = container_of(sl, struct mac802154_llsec_seclevel, level);
+ list_del(&sl->list);
+ kfree(msl);
+ }
+
+ list_for_each_entry_safe(dev, dn, &sec->table.devices, list) {
+ struct mac802154_llsec_device *mdev;
+
+ mdev = container_of(dev, struct mac802154_llsec_device, dev);
+ list_del(&dev->list);
+ llsec_dev_free(mdev);
+ }
+
+ list_for_each_entry_safe(key, kn, &sec->table.keys, list) {
+ struct mac802154_llsec_key *mkey;
+
+ mkey = container_of(key->key, struct mac802154_llsec_key, key);
+ list_del(&key->list);
+ llsec_key_put(mkey);
+ kfree(key);
+ }
+}
+
+int mac802154_llsec_get_params(struct mac802154_llsec *sec,
+ struct ieee802154_llsec_params *params)
+{
+ read_lock_bh(&sec->lock);
+ *params = sec->params;
+ read_unlock_bh(&sec->lock);
+
+ return 0;
+}
+
+int mac802154_llsec_set_params(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_params *params,
+ int changed)
+{
+ write_lock_bh(&sec->lock);
+
+ if (changed & IEEE802154_LLSEC_PARAM_ENABLED)
+ sec->params.enabled = params->enabled;
+ if (changed & IEEE802154_LLSEC_PARAM_FRAME_COUNTER)
+ sec->params.frame_counter = params->frame_counter;
+ if (changed & IEEE802154_LLSEC_PARAM_OUT_LEVEL)
+ sec->params.out_level = params->out_level;
+ if (changed & IEEE802154_LLSEC_PARAM_OUT_KEY)
+ sec->params.out_key = params->out_key;
+ if (changed & IEEE802154_LLSEC_PARAM_KEY_SOURCE)
+ sec->params.default_key_source = params->default_key_source;
+ if (changed & IEEE802154_LLSEC_PARAM_PAN_ID)
+ sec->params.pan_id = params->pan_id;
+ if (changed & IEEE802154_LLSEC_PARAM_HWADDR)
+ sec->params.hwaddr = params->hwaddr;
+ if (changed & IEEE802154_LLSEC_PARAM_COORD_HWADDR)
+ sec->params.coord_hwaddr = params->coord_hwaddr;
+ if (changed & IEEE802154_LLSEC_PARAM_COORD_SHORTADDR)
+ sec->params.coord_shortaddr = params->coord_shortaddr;
+
+ write_unlock_bh(&sec->lock);
+
+ return 0;
+}
+
+static struct mac802154_llsec_key*
+llsec_key_alloc(const struct ieee802154_llsec_key *template)
+{
+ const int authsizes[3] = { 4, 8, 16 };
+ struct mac802154_llsec_key *key;
+ int i;
+
+ key = kzalloc(sizeof(*key), GFP_KERNEL);
+ if (!key)
+ return NULL;
+
+ kref_init(&key->ref);
+ key->key = *template;
+
+ BUILD_BUG_ON(ARRAY_SIZE(authsizes) != ARRAY_SIZE(key->tfm));
+
+ for (i = 0; i < ARRAY_SIZE(key->tfm); i++) {
+ key->tfm[i] = crypto_alloc_aead("ccm(aes)", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(key->tfm[i]))
+ goto err_tfm;
+ if (crypto_aead_setkey(key->tfm[i], template->key,
+ IEEE802154_LLSEC_KEY_SIZE))
+ goto err_tfm;
+ if (crypto_aead_setauthsize(key->tfm[i], authsizes[i]))
+ goto err_tfm;
+ }
+
+ key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(key->tfm0))
+ goto err_tfm;
+
+ if (crypto_blkcipher_setkey(key->tfm0, template->key,
+ IEEE802154_LLSEC_KEY_SIZE))
+ goto err_tfm0;
+
+ return key;
+
+err_tfm0:
+ crypto_free_blkcipher(key->tfm0);
+err_tfm:
+ for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
+ if (key->tfm[i])
+ crypto_free_aead(key->tfm[i]);
+
+ kfree(key);
+ return NULL;
+}
+
+static void llsec_key_release(struct kref *ref)
+{
+ struct mac802154_llsec_key *key;
+ int i;
+
+ key = container_of(ref, struct mac802154_llsec_key, ref);
+
+ for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
+ crypto_free_aead(key->tfm[i]);
+
+ crypto_free_blkcipher(key->tfm0);
+ kfree(key);
+}
+
+static struct mac802154_llsec_key*
+llsec_key_get(struct mac802154_llsec_key *key)
+{
+ kref_get(&key->ref);
+ return key;
+}
+
+static void llsec_key_put(struct mac802154_llsec_key *key)
+{
+ kref_put(&key->ref, llsec_key_release);
+}
+
+static bool llsec_key_id_equal(const struct ieee802154_llsec_key_id *a,
+ const struct ieee802154_llsec_key_id *b)
+{
+ if (a->mode != b->mode)
+ return false;
+
+ if (a->mode == IEEE802154_SCF_KEY_IMPLICIT)
+ return ieee802154_addr_equal(&a->device_addr, &b->device_addr);
+
+ if (a->id != b->id)
+ return false;
+
+ switch (a->mode) {
+ case IEEE802154_SCF_KEY_INDEX:
+ return true;
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ return a->short_source == b->short_source;
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ return a->extended_source == b->extended_source;
+ }
+
+ return false;
+}
+
+int mac802154_llsec_key_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key)
+{
+ struct mac802154_llsec_key *mkey = NULL;
+ struct ieee802154_llsec_key_entry *pos, *new;
+
+ if (!(key->frame_types & (1 << IEEE802154_FC_TYPE_MAC_CMD)) &&
+ key->cmd_frame_ids)
+ return -EINVAL;
+
+ list_for_each_entry(pos, &sec->table.keys, list) {
+ if (llsec_key_id_equal(&pos->id, id))
+ return -EEXIST;
+
+ if (memcmp(pos->key->key, key->key,
+ IEEE802154_LLSEC_KEY_SIZE))
+ continue;
+
+ mkey = container_of(pos->key, struct mac802154_llsec_key, key);
+
+ /* Don't allow multiple instances of the same AES key to have
+ * different allowed frame types/command frame ids, as this is
+ * not possible in the 802.15.4 PIB.
+ */
+ if (pos->key->frame_types != key->frame_types ||
+ pos->key->cmd_frame_ids != key->cmd_frame_ids)
+ return -EEXIST;
+
+ break;
+ }
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ if (!mkey)
+ mkey = llsec_key_alloc(key);
+ else
+ mkey = llsec_key_get(mkey);
+
+ if (!mkey)
+ goto fail;
+
+ new->id = *id;
+ new->key = &mkey->key;
+
+ list_add_rcu(&new->list, &sec->table.keys);
+
+ return 0;
+
+fail:
+ kfree(new);
+ return -ENOMEM;
+}
+
+int mac802154_llsec_key_del(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_key_id *key)
+{
+ struct ieee802154_llsec_key_entry *pos;
+
+ list_for_each_entry(pos, &sec->table.keys, list) {
+ struct mac802154_llsec_key *mkey;
+
+ mkey = container_of(pos->key, struct mac802154_llsec_key, key);
+
+ if (llsec_key_id_equal(&pos->id, key)) {
+ list_del_rcu(&pos->list);
+ llsec_key_put(mkey);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static bool llsec_dev_use_shortaddr(__le16 short_addr)
+{
+ return short_addr != cpu_to_le16(IEEE802154_ADDR_UNDEF) &&
+ short_addr != cpu_to_le16(0xffff);
+}
+
+static u32 llsec_dev_hash_short(__le16 short_addr, __le16 pan_id)
+{
+ return ((__force u16)short_addr) << 16 | (__force u16)pan_id;
+}
+
+static u64 llsec_dev_hash_long(__le64 hwaddr)
+{
+ return (__force u64)hwaddr;
+}
+
+static struct mac802154_llsec_device*
+llsec_dev_find_short(struct mac802154_llsec *sec, __le16 short_addr,
+ __le16 pan_id)
+{
+ struct mac802154_llsec_device *dev;
+ u32 key = llsec_dev_hash_short(short_addr, pan_id);
+
+ hash_for_each_possible_rcu(sec->devices_short, dev, bucket_s, key) {
+ if (dev->dev.short_addr == short_addr &&
+ dev->dev.pan_id == pan_id)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static struct mac802154_llsec_device*
+llsec_dev_find_long(struct mac802154_llsec *sec, __le64 hwaddr)
+{
+ struct mac802154_llsec_device *dev;
+ u64 key = llsec_dev_hash_long(hwaddr);
+
+ hash_for_each_possible_rcu(sec->devices_hw, dev, bucket_hw, key) {
+ if (dev->dev.hwaddr == hwaddr)
+ return dev;
+ }
+
+ return NULL;
+}
+
+static void llsec_dev_free(struct mac802154_llsec_device *dev)
+{
+ struct ieee802154_llsec_device_key *pos, *pn;
+ struct mac802154_llsec_device_key *devkey;
+
+ list_for_each_entry_safe(pos, pn, &dev->dev.keys, list) {
+ devkey = container_of(pos, struct mac802154_llsec_device_key,
+ devkey);
+
+ list_del(&pos->list);
+ kfree(devkey);
+ }
+
+ kfree(dev);
+}
+
+int mac802154_llsec_dev_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_device *dev)
+{
+ struct mac802154_llsec_device *entry;
+ u32 skey = llsec_dev_hash_short(dev->short_addr, dev->pan_id);
+ u64 hwkey = llsec_dev_hash_long(dev->hwaddr);
+
+ BUILD_BUG_ON(sizeof(hwkey) != IEEE802154_ADDR_LEN);
+
+ if ((llsec_dev_use_shortaddr(dev->short_addr) &&
+ llsec_dev_find_short(sec, dev->short_addr, dev->pan_id)) ||
+ llsec_dev_find_long(sec, dev->hwaddr))
+ return -EEXIST;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->dev = *dev;
+ spin_lock_init(&entry->lock);
+ INIT_LIST_HEAD(&entry->dev.keys);
+
+ if (llsec_dev_use_shortaddr(dev->short_addr))
+ hash_add_rcu(sec->devices_short, &entry->bucket_s, skey);
+ else
+ INIT_HLIST_NODE(&entry->bucket_s);
+
+ hash_add_rcu(sec->devices_hw, &entry->bucket_hw, hwkey);
+ list_add_tail_rcu(&entry->dev.list, &sec->table.devices);
+
+ return 0;
+}
+
+static void llsec_dev_free_rcu(struct rcu_head *rcu)
+{
+ llsec_dev_free(container_of(rcu, struct mac802154_llsec_device, rcu));
+}
+
+int mac802154_llsec_dev_del(struct mac802154_llsec *sec, __le64 device_addr)
+{
+ struct mac802154_llsec_device *pos;
+
+ pos = llsec_dev_find_long(sec, device_addr);
+ if (!pos)
+ return -ENOENT;
+
+ hash_del_rcu(&pos->bucket_s);
+ hash_del_rcu(&pos->bucket_hw);
+ call_rcu(&pos->rcu, llsec_dev_free_rcu);
+
+ return 0;
+}
+
+static struct mac802154_llsec_device_key*
+llsec_devkey_find(struct mac802154_llsec_device *dev,
+ const struct ieee802154_llsec_key_id *key)
+{
+ struct ieee802154_llsec_device_key *devkey;
+
+ list_for_each_entry_rcu(devkey, &dev->dev.keys, list) {
+ if (!llsec_key_id_equal(key, &devkey->key_id))
+ continue;
+
+ return container_of(devkey, struct mac802154_llsec_device_key,
+ devkey);
+ }
+
+ return NULL;
+}
+
+int mac802154_llsec_devkey_add(struct mac802154_llsec *sec,
+ __le64 dev_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct mac802154_llsec_device *dev;
+ struct mac802154_llsec_device_key *devkey;
+
+ dev = llsec_dev_find_long(sec, dev_addr);
+
+ if (!dev)
+ return -ENOENT;
+
+ if (llsec_devkey_find(dev, &key->key_id))
+ return -EEXIST;
+
+ devkey = kmalloc(sizeof(*devkey), GFP_KERNEL);
+ if (!devkey)
+ return -ENOMEM;
+
+ devkey->devkey = *key;
+ list_add_tail_rcu(&devkey->devkey.list, &dev->dev.keys);
+ return 0;
+}
+
+int mac802154_llsec_devkey_del(struct mac802154_llsec *sec,
+ __le64 dev_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct mac802154_llsec_device *dev;
+ struct mac802154_llsec_device_key *devkey;
+
+ dev = llsec_dev_find_long(sec, dev_addr);
+
+ if (!dev)
+ return -ENOENT;
+
+ devkey = llsec_devkey_find(dev, &key->key_id);
+ if (!devkey)
+ return -ENOENT;
+
+ list_del_rcu(&devkey->devkey.list);
+ kfree_rcu(devkey, rcu);
+ return 0;
+}
+
+static struct mac802154_llsec_seclevel*
+llsec_find_seclevel(const struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct ieee802154_llsec_seclevel *pos;
+
+ list_for_each_entry(pos, &sec->table.security_levels, list) {
+ if (pos->frame_type != sl->frame_type ||
+ (pos->frame_type == IEEE802154_FC_TYPE_MAC_CMD &&
+ pos->cmd_frame_id != sl->cmd_frame_id) ||
+ pos->device_override != sl->device_override ||
+ pos->sec_levels != sl->sec_levels)
+ continue;
+
+ return container_of(pos, struct mac802154_llsec_seclevel,
+ level);
+ }
+
+ return NULL;
+}
+
+int mac802154_llsec_seclevel_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct mac802154_llsec_seclevel *entry;
+
+ if (llsec_find_seclevel(sec, sl))
+ return -EEXIST;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->level = *sl;
+
+ list_add_tail_rcu(&entry->level.list, &sec->table.security_levels);
+
+ return 0;
+}
+
+int mac802154_llsec_seclevel_del(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct mac802154_llsec_seclevel *pos;
+
+ pos = llsec_find_seclevel(sec, sl);
+ if (!pos)
+ return -ENOENT;
+
+ list_del_rcu(&pos->level.list);
+ kfree_rcu(pos, rcu);
+
+ return 0;
+}
+
+static int llsec_recover_addr(struct mac802154_llsec *sec,
+ struct ieee802154_addr *addr)
+{
+ __le16 caddr = sec->params.coord_shortaddr;
+
+ addr->pan_id = sec->params.pan_id;
+
+ if (caddr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) {
+ return -EINVAL;
+ } else if (caddr == cpu_to_le16(IEEE802154_ADDR_UNDEF)) {
+ addr->extended_addr = sec->params.coord_hwaddr;
+ addr->mode = IEEE802154_ADDR_LONG;
+ } else {
+ addr->short_addr = sec->params.coord_shortaddr;
+ addr->mode = IEEE802154_ADDR_SHORT;
+ }
+
+ return 0;
+}
+
+static struct mac802154_llsec_key*
+llsec_lookup_key(struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ const struct ieee802154_addr *addr,
+ struct ieee802154_llsec_key_id *key_id)
+{
+ struct ieee802154_addr devaddr = *addr;
+ u8 key_id_mode = hdr->sec.key_id_mode;
+ struct ieee802154_llsec_key_entry *key_entry;
+ struct mac802154_llsec_key *key;
+
+ if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT &&
+ devaddr.mode == IEEE802154_ADDR_NONE) {
+ if (hdr->fc.type == IEEE802154_FC_TYPE_BEACON) {
+ devaddr.extended_addr = sec->params.coord_hwaddr;
+ devaddr.mode = IEEE802154_ADDR_LONG;
+ } else if (llsec_recover_addr(sec, &devaddr) < 0) {
+ return NULL;
+ }
+ }
+
+ list_for_each_entry_rcu(key_entry, &sec->table.keys, list) {
+ const struct ieee802154_llsec_key_id *id = &key_entry->id;
+
+ if (!(key_entry->key->frame_types & BIT(hdr->fc.type)))
+ continue;
+
+ if (id->mode != key_id_mode)
+ continue;
+
+ if (key_id_mode == IEEE802154_SCF_KEY_IMPLICIT) {
+ if (ieee802154_addr_equal(&devaddr, &id->device_addr))
+ goto found;
+ } else {
+ if (id->id != hdr->sec.key_id)
+ continue;
+
+ if ((key_id_mode == IEEE802154_SCF_KEY_INDEX) ||
+ (key_id_mode == IEEE802154_SCF_KEY_SHORT_INDEX &&
+ id->short_source == hdr->sec.short_src) ||
+ (key_id_mode == IEEE802154_SCF_KEY_HW_INDEX &&
+ id->extended_source == hdr->sec.extended_src))
+ goto found;
+ }
+ }
+
+ return NULL;
+
+found:
+ key = container_of(key_entry->key, struct mac802154_llsec_key, key);
+ if (key_id)
+ *key_id = key_entry->id;
+ return llsec_key_get(key);
+}
+
+static void llsec_geniv(u8 iv[16], __le64 addr,
+ const struct ieee802154_sechdr *sec)
+{
+ __be64 addr_bytes = (__force __be64) swab64((__force u64) addr);
+ __be32 frame_counter = (__force __be32) swab32((__force u32) sec->frame_counter);
+
+ iv[0] = 1; /* L' = L - 1 = 1 */
+ memcpy(iv + 1, &addr_bytes, sizeof(addr_bytes));
+ memcpy(iv + 9, &frame_counter, sizeof(frame_counter));
+ iv[13] = sec->level;
+ iv[14] = 0;
+ iv[15] = 1;
+}
+
+static int
+llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key)
+{
+ u8 iv[16];
+ struct scatterlist src;
+ struct blkcipher_desc req = {
+ .tfm = key->tfm0,
+ .info = iv,
+ .flags = 0,
+ };
+
+ llsec_geniv(iv, sec->params.hwaddr, &hdr->sec);
+ sg_init_one(&src, skb->data, skb->len);
+ return crypto_blkcipher_encrypt_iv(&req, &src, &src, skb->len);
+}
+
+static struct crypto_aead*
+llsec_tfm_by_len(struct mac802154_llsec_key *key, int authlen)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
+ if (crypto_aead_authsize(key->tfm[i]) == authlen)
+ return key->tfm[i];
+
+ BUG();
+}
+
+static int
+llsec_do_encrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key)
+{
+ u8 iv[16];
+ unsigned char *data;
+ int authlen, assoclen, datalen, rc;
+ struct scatterlist src, assoc[2], dst[2];
+ struct aead_request *req;
+
+ authlen = ieee802154_sechdr_authtag_len(&hdr->sec);
+ llsec_geniv(iv, sec->params.hwaddr, &hdr->sec);
+
+ req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
+
+ sg_init_table(assoc, 2);
+ sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len);
+ assoclen = skb->mac_len;
+
+ data = skb_mac_header(skb) + skb->mac_len;
+ datalen = skb_tail_pointer(skb) - data;
+
+ if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) {
+ sg_set_buf(&assoc[1], data, 0);
+ } else {
+ sg_set_buf(&assoc[1], data, datalen);
+ assoclen += datalen;
+ datalen = 0;
+ }
+
+ sg_init_one(&src, data, datalen);
+
+ sg_init_table(dst, 2);
+ sg_set_buf(&dst[0], data, datalen);
+ sg_set_buf(&dst[1], skb_put(skb, authlen), authlen);
+
+ aead_request_set_callback(req, 0, NULL, NULL);
+ aead_request_set_assoc(req, assoc, assoclen);
+ aead_request_set_crypt(req, &src, dst, datalen, iv);
+
+ rc = crypto_aead_encrypt(req);
+
+ kfree(req);
+
+ return rc;
+}
+
+static int llsec_do_encrypt(struct sk_buff *skb,
+ const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key)
+{
+ if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC)
+ return llsec_do_encrypt_unauth(skb, sec, hdr, key);
+ else
+ return llsec_do_encrypt_auth(skb, sec, hdr, key);
+}
+
+int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb)
+{
+ struct ieee802154_hdr hdr;
+ int rc, authlen, hlen;
+ struct mac802154_llsec_key *key;
+ u32 frame_ctr;
+
+ hlen = ieee802154_hdr_pull(skb, &hdr);
+
+ if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA)
+ return -EINVAL;
+
+ if (!hdr.fc.security_enabled || hdr.sec.level == 0) {
+ skb_push(skb, hlen);
+ return 0;
+ }
+
+ authlen = ieee802154_sechdr_authtag_len(&hdr.sec);
+
+ if (skb->len + hlen + authlen + IEEE802154_MFR_SIZE > IEEE802154_MTU)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+
+ read_lock_bh(&sec->lock);
+
+ if (!sec->params.enabled) {
+ rc = -EINVAL;
+ goto fail_read;
+ }
+
+ key = llsec_lookup_key(sec, &hdr, &hdr.dest, NULL);
+ if (!key) {
+ rc = -ENOKEY;
+ goto fail_read;
+ }
+
+ read_unlock_bh(&sec->lock);
+
+ write_lock_bh(&sec->lock);
+
+ frame_ctr = be32_to_cpu(sec->params.frame_counter);
+ hdr.sec.frame_counter = cpu_to_le32(frame_ctr);
+ if (frame_ctr == 0xFFFFFFFF) {
+ write_unlock_bh(&sec->lock);
+ llsec_key_put(key);
+ rc = -EOVERFLOW;
+ goto fail;
+ }
+
+ sec->params.frame_counter = cpu_to_be32(frame_ctr + 1);
+
+ write_unlock_bh(&sec->lock);
+
+ rcu_read_unlock();
+
+ skb->mac_len = ieee802154_hdr_push(skb, &hdr);
+ skb_reset_mac_header(skb);
+
+ rc = llsec_do_encrypt(skb, sec, &hdr, key);
+ llsec_key_put(key);
+
+ return rc;
+
+fail_read:
+ read_unlock_bh(&sec->lock);
+fail:
+ rcu_read_unlock();
+ return rc;
+}
+
+static struct mac802154_llsec_device*
+llsec_lookup_dev(struct mac802154_llsec *sec,
+ const struct ieee802154_addr *addr)
+{
+ struct ieee802154_addr devaddr = *addr;
+ struct mac802154_llsec_device *dev = NULL;
+
+ if (devaddr.mode == IEEE802154_ADDR_NONE &&
+ llsec_recover_addr(sec, &devaddr) < 0)
+ return NULL;
+
+ if (devaddr.mode == IEEE802154_ADDR_SHORT) {
+ u32 key = llsec_dev_hash_short(devaddr.short_addr,
+ devaddr.pan_id);
+
+ hash_for_each_possible_rcu(sec->devices_short, dev,
+ bucket_s, key) {
+ if (dev->dev.pan_id == devaddr.pan_id &&
+ dev->dev.short_addr == devaddr.short_addr)
+ return dev;
+ }
+ } else {
+ u64 key = llsec_dev_hash_long(devaddr.extended_addr);
+
+ hash_for_each_possible_rcu(sec->devices_hw, dev,
+ bucket_hw, key) {
+ if (dev->dev.hwaddr == devaddr.extended_addr)
+ return dev;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+llsec_lookup_seclevel(const struct mac802154_llsec *sec,
+ u8 frame_type, u8 cmd_frame_id,
+ struct ieee802154_llsec_seclevel *rlevel)
+{
+ struct ieee802154_llsec_seclevel *level;
+
+ list_for_each_entry_rcu(level, &sec->table.security_levels, list) {
+ if (level->frame_type == frame_type &&
+ (frame_type != IEEE802154_FC_TYPE_MAC_CMD ||
+ level->cmd_frame_id == cmd_frame_id)) {
+ *rlevel = *level;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int
+llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key, __le64 dev_addr)
+{
+ u8 iv[16];
+ unsigned char *data;
+ int datalen;
+ struct scatterlist src;
+ struct blkcipher_desc req = {
+ .tfm = key->tfm0,
+ .info = iv,
+ .flags = 0,
+ };
+
+ llsec_geniv(iv, dev_addr, &hdr->sec);
+ data = skb_mac_header(skb) + skb->mac_len;
+ datalen = skb_tail_pointer(skb) - data;
+
+ sg_init_one(&src, data, datalen);
+
+ return crypto_blkcipher_decrypt_iv(&req, &src, &src, datalen);
+}
+
+static int
+llsec_do_decrypt_auth(struct sk_buff *skb, const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key, __le64 dev_addr)
+{
+ u8 iv[16];
+ unsigned char *data;
+ int authlen, datalen, assoclen, rc;
+ struct scatterlist src, assoc[2];
+ struct aead_request *req;
+
+ authlen = ieee802154_sechdr_authtag_len(&hdr->sec);
+ llsec_geniv(iv, dev_addr, &hdr->sec);
+
+ req = aead_request_alloc(llsec_tfm_by_len(key, authlen), GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
+
+ sg_init_table(assoc, 2);
+ sg_set_buf(&assoc[0], skb_mac_header(skb), skb->mac_len);
+ assoclen = skb->mac_len;
+
+ data = skb_mac_header(skb) + skb->mac_len;
+ datalen = skb_tail_pointer(skb) - data;
+
+ if (hdr->sec.level & IEEE802154_SCF_SECLEVEL_ENC) {
+ sg_set_buf(&assoc[1], data, 0);
+ } else {
+ sg_set_buf(&assoc[1], data, datalen - authlen);
+ assoclen += datalen - authlen;
+ data += datalen - authlen;
+ datalen = authlen;
+ }
+
+ sg_init_one(&src, data, datalen);
+
+ aead_request_set_callback(req, 0, NULL, NULL);
+ aead_request_set_assoc(req, assoc, assoclen);
+ aead_request_set_crypt(req, &src, &src, datalen, iv);
+
+ rc = crypto_aead_decrypt(req);
+
+ kfree(req);
+ skb_trim(skb, skb->len - authlen);
+
+ return rc;
+}
+
+static int
+llsec_do_decrypt(struct sk_buff *skb, const struct mac802154_llsec *sec,
+ const struct ieee802154_hdr *hdr,
+ struct mac802154_llsec_key *key, __le64 dev_addr)
+{
+ if (hdr->sec.level == IEEE802154_SCF_SECLEVEL_ENC)
+ return llsec_do_decrypt_unauth(skb, sec, hdr, key, dev_addr);
+ else
+ return llsec_do_decrypt_auth(skb, sec, hdr, key, dev_addr);
+}
+
+static int
+llsec_update_devkey_record(struct mac802154_llsec_device *dev,
+ const struct ieee802154_llsec_key_id *in_key)
+{
+ struct mac802154_llsec_device_key *devkey;
+
+ devkey = llsec_devkey_find(dev, in_key);
+
+ if (!devkey) {
+ struct mac802154_llsec_device_key *next;
+
+ next = kzalloc(sizeof(*devkey), GFP_ATOMIC);
+ if (!next)
+ return -ENOMEM;
+
+ next->devkey.key_id = *in_key;
+
+ spin_lock_bh(&dev->lock);
+
+ devkey = llsec_devkey_find(dev, in_key);
+ if (!devkey)
+ list_add_rcu(&next->devkey.list, &dev->dev.keys);
+ else
+ kfree(next);
+
+ spin_unlock_bh(&dev->lock);
+ }
+
+ return 0;
+}
+
+static int
+llsec_update_devkey_info(struct mac802154_llsec_device *dev,
+ const struct ieee802154_llsec_key_id *in_key,
+ u32 frame_counter)
+{
+ struct mac802154_llsec_device_key *devkey = NULL;
+
+ if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RESTRICT) {
+ devkey = llsec_devkey_find(dev, in_key);
+ if (!devkey)
+ return -ENOENT;
+ }
+
+ if (dev->dev.key_mode == IEEE802154_LLSEC_DEVKEY_RECORD) {
+ int rc = llsec_update_devkey_record(dev, in_key);
+
+ if (rc < 0)
+ return rc;
+ }
+
+ spin_lock_bh(&dev->lock);
+
+ if ((!devkey && frame_counter < dev->dev.frame_counter) ||
+ (devkey && frame_counter < devkey->devkey.frame_counter)) {
+ spin_unlock_bh(&dev->lock);
+ return -EINVAL;
+ }
+
+ if (devkey)
+ devkey->devkey.frame_counter = frame_counter + 1;
+ else
+ dev->dev.frame_counter = frame_counter + 1;
+
+ spin_unlock_bh(&dev->lock);
+
+ return 0;
+}
+
+int mac802154_llsec_decrypt(struct mac802154_llsec *sec, struct sk_buff *skb)
+{
+ struct ieee802154_hdr hdr;
+ struct mac802154_llsec_key *key;
+ struct ieee802154_llsec_key_id key_id;
+ struct mac802154_llsec_device *dev;
+ struct ieee802154_llsec_seclevel seclevel;
+ int err;
+ __le64 dev_addr;
+ u32 frame_ctr;
+
+ if (ieee802154_hdr_peek(skb, &hdr) < 0)
+ return -EINVAL;
+ if (!hdr.fc.security_enabled)
+ return 0;
+ if (hdr.fc.version == 0)
+ return -EINVAL;
+
+ read_lock_bh(&sec->lock);
+ if (!sec->params.enabled) {
+ read_unlock_bh(&sec->lock);
+ return -EINVAL;
+ }
+ read_unlock_bh(&sec->lock);
+
+ rcu_read_lock();
+
+ key = llsec_lookup_key(sec, &hdr, &hdr.source, &key_id);
+ if (!key) {
+ err = -ENOKEY;
+ goto fail;
+ }
+
+ dev = llsec_lookup_dev(sec, &hdr.source);
+ if (!dev) {
+ err = -EINVAL;
+ goto fail_dev;
+ }
+
+ if (llsec_lookup_seclevel(sec, hdr.fc.type, 0, &seclevel) < 0) {
+ err = -EINVAL;
+ goto fail_dev;
+ }
+
+ if (!(seclevel.sec_levels & BIT(hdr.sec.level)) &&
+ (hdr.sec.level == 0 && seclevel.device_override &&
+ !dev->dev.seclevel_exempt)) {
+ err = -EINVAL;
+ goto fail_dev;
+ }
+
+ frame_ctr = le32_to_cpu(hdr.sec.frame_counter);
+
+ if (frame_ctr == 0xffffffff) {
+ err = -EOVERFLOW;
+ goto fail_dev;
+ }
+
+ err = llsec_update_devkey_info(dev, &key_id, frame_ctr);
+ if (err)
+ goto fail_dev;
+
+ dev_addr = dev->dev.hwaddr;
+
+ rcu_read_unlock();
+
+ err = llsec_do_decrypt(skb, sec, &hdr, key, dev_addr);
+ llsec_key_put(key);
+ return err;
+
+fail_dev:
+ llsec_key_put(key);
+fail:
+ rcu_read_unlock();
+ return err;
+}
diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h
new file mode 100644
index 000000000..950578e1d
--- /dev/null
+++ b/net/mac802154/llsec.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2014 Fraunhofer ITWM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
+ */
+
+#ifndef MAC802154_LLSEC_H
+#define MAC802154_LLSEC_H
+
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <linux/crypto.h>
+#include <linux/kref.h>
+#include <linux/spinlock.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+
+struct mac802154_llsec_key {
+ struct ieee802154_llsec_key key;
+
+ /* one tfm for each authsize (4/8/16) */
+ struct crypto_aead *tfm[3];
+ struct crypto_blkcipher *tfm0;
+
+ struct kref ref;
+};
+
+struct mac802154_llsec_device_key {
+ struct ieee802154_llsec_device_key devkey;
+
+ struct rcu_head rcu;
+};
+
+struct mac802154_llsec_device {
+ struct ieee802154_llsec_device dev;
+
+ struct hlist_node bucket_s;
+ struct hlist_node bucket_hw;
+
+ /* protects dev.frame_counter and the elements of dev.keys */
+ spinlock_t lock;
+
+ struct rcu_head rcu;
+};
+
+struct mac802154_llsec_seclevel {
+ struct ieee802154_llsec_seclevel level;
+
+ struct rcu_head rcu;
+};
+
+struct mac802154_llsec {
+ struct ieee802154_llsec_params params;
+ struct ieee802154_llsec_table table;
+
+ DECLARE_HASHTABLE(devices_short, 6);
+ DECLARE_HASHTABLE(devices_hw, 6);
+
+ /* protects params, all other fields are fine with RCU */
+ rwlock_t lock;
+};
+
+void mac802154_llsec_init(struct mac802154_llsec *sec);
+void mac802154_llsec_destroy(struct mac802154_llsec *sec);
+
+int mac802154_llsec_get_params(struct mac802154_llsec *sec,
+ struct ieee802154_llsec_params *params);
+int mac802154_llsec_set_params(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_params *params,
+ int changed);
+
+int mac802154_llsec_key_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key);
+int mac802154_llsec_key_del(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_key_id *key);
+
+int mac802154_llsec_dev_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_device *dev);
+int mac802154_llsec_dev_del(struct mac802154_llsec *sec,
+ __le64 device_addr);
+
+int mac802154_llsec_devkey_add(struct mac802154_llsec *sec,
+ __le64 dev_addr,
+ const struct ieee802154_llsec_device_key *key);
+int mac802154_llsec_devkey_del(struct mac802154_llsec *sec,
+ __le64 dev_addr,
+ const struct ieee802154_llsec_device_key *key);
+
+int mac802154_llsec_seclevel_add(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_seclevel *sl);
+int mac802154_llsec_seclevel_del(struct mac802154_llsec *sec,
+ const struct ieee802154_llsec_seclevel *sl);
+
+int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb);
+int mac802154_llsec_decrypt(struct mac802154_llsec *sec, struct sk_buff *skb);
+
+#endif /* MAC802154_LLSEC_H */
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
new file mode 100644
index 000000000..bdccb4ecd
--- /dev/null
+++ b/net/mac802154/mac_cmd.c
@@ -0,0 +1,162 @@
+/*
+ * MAC commands interface
+ *
+ * Copyright 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ieee802154.h>
+
+#include <net/ieee802154_netdev.h>
+#include <net/cfg802154.h>
+#include <net/mac802154.h>
+
+#include "ieee802154_i.h"
+#include "driver-ops.h"
+
+static int mac802154_mlme_start_req(struct net_device *dev,
+ struct ieee802154_addr *addr,
+ u8 channel, u8 page,
+ u8 bcn_ord, u8 sf_ord,
+ u8 pan_coord, u8 blx,
+ u8 coord_realign)
+{
+ struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev);
+ int rc = 0;
+
+ ASSERT_RTNL();
+
+ BUG_ON(addr->mode != IEEE802154_ADDR_SHORT);
+
+ mac802154_dev_set_pan_id(dev, addr->pan_id);
+ mac802154_dev_set_short_addr(dev, addr->short_addr);
+ mac802154_dev_set_page_channel(dev, page, channel);
+
+ if (ops->llsec) {
+ struct ieee802154_llsec_params params;
+ int changed = 0;
+
+ params.coord_shortaddr = addr->short_addr;
+ changed |= IEEE802154_LLSEC_PARAM_COORD_SHORTADDR;
+
+ params.pan_id = addr->pan_id;
+ changed |= IEEE802154_LLSEC_PARAM_PAN_ID;
+
+ params.hwaddr = ieee802154_devaddr_from_raw(dev->dev_addr);
+ changed |= IEEE802154_LLSEC_PARAM_HWADDR;
+
+ params.coord_hwaddr = params.hwaddr;
+ changed |= IEEE802154_LLSEC_PARAM_COORD_HWADDR;
+
+ rc = ops->llsec->set_params(dev, &params, changed);
+ }
+
+ return rc;
+}
+
+static int mac802154_set_mac_params(struct net_device *dev,
+ const struct ieee802154_mac_params *params)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_local *local = sdata->local;
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ /* PHY */
+ wpan_dev->wpan_phy->transmit_power = params->transmit_power;
+ wpan_dev->wpan_phy->cca = params->cca;
+ wpan_dev->wpan_phy->cca_ed_level = params->cca_ed_level;
+
+ /* MAC */
+ wpan_dev->min_be = params->min_be;
+ wpan_dev->max_be = params->max_be;
+ wpan_dev->csma_retries = params->csma_retries;
+ wpan_dev->frame_retries = params->frame_retries;
+ wpan_dev->lbt = params->lbt;
+
+ if (local->hw.flags & IEEE802154_HW_TXPOWER) {
+ ret = drv_set_tx_power(local, params->transmit_power);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_CCA_MODE) {
+ ret = drv_set_cca_mode(local, &params->cca);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (local->hw.flags & IEEE802154_HW_CCA_ED_LEVEL) {
+ ret = drv_set_cca_ed_level(local, params->cca_ed_level);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mac802154_get_mac_params(struct net_device *dev,
+ struct ieee802154_mac_params *params)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+
+ ASSERT_RTNL();
+
+ /* PHY */
+ params->transmit_power = wpan_dev->wpan_phy->transmit_power;
+ params->cca = wpan_dev->wpan_phy->cca;
+ params->cca_ed_level = wpan_dev->wpan_phy->cca_ed_level;
+
+ /* MAC */
+ params->min_be = wpan_dev->min_be;
+ params->max_be = wpan_dev->max_be;
+ params->csma_retries = wpan_dev->csma_retries;
+ params->frame_retries = wpan_dev->frame_retries;
+ params->lbt = wpan_dev->lbt;
+}
+
+static struct ieee802154_llsec_ops mac802154_llsec_ops = {
+ .get_params = mac802154_get_params,
+ .set_params = mac802154_set_params,
+ .add_key = mac802154_add_key,
+ .del_key = mac802154_del_key,
+ .add_dev = mac802154_add_dev,
+ .del_dev = mac802154_del_dev,
+ .add_devkey = mac802154_add_devkey,
+ .del_devkey = mac802154_del_devkey,
+ .add_seclevel = mac802154_add_seclevel,
+ .del_seclevel = mac802154_del_seclevel,
+ .lock_table = mac802154_lock_table,
+ .get_table = mac802154_get_table,
+ .unlock_table = mac802154_unlock_table,
+};
+
+struct ieee802154_mlme_ops mac802154_mlme_wpan = {
+ .start_req = mac802154_mlme_start_req,
+ .get_pan_id = mac802154_dev_get_pan_id,
+ .get_short_addr = mac802154_dev_get_short_addr,
+ .get_dsn = mac802154_dev_get_dsn,
+
+ .llsec = &mac802154_llsec_ops,
+
+ .set_mac_params = mac802154_set_mac_params,
+ .get_mac_params = mac802154_get_mac_params,
+};
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
new file mode 100644
index 000000000..08cb32dc8
--- /dev/null
+++ b/net/mac802154/main.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2007-2012 Siemens AG
+ *
+ * Written by:
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+
+#include <net/netlink.h>
+#include <net/nl802154.h>
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/route.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154_i.h"
+#include "cfg.h"
+
+static void ieee802154_tasklet_handler(unsigned long data)
+{
+ struct ieee802154_local *local = (struct ieee802154_local *)data;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&local->skb_queue))) {
+ switch (skb->pkt_type) {
+ case IEEE802154_RX_MSG:
+ /* Clear skb->pkt_type in order to not confuse kernel
+ * netstack.
+ */
+ skb->pkt_type = 0;
+ ieee802154_rx(&local->hw, skb);
+ break;
+ default:
+ WARN(1, "mac802154: Packet is of unknown type %d\n",
+ skb->pkt_type);
+ kfree_skb(skb);
+ break;
+ }
+ }
+}
+
+struct ieee802154_hw *
+ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops)
+{
+ struct wpan_phy *phy;
+ struct ieee802154_local *local;
+ size_t priv_size;
+
+ if (!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed ||
+ !ops->start || !ops->stop || !ops->set_channel) {
+ pr_err("undefined IEEE802.15.4 device operations\n");
+ return NULL;
+ }
+
+ /* Ensure 32-byte alignment of our private data and hw private data.
+ * We use the wpan_phy priv data for both our ieee802154_local and for
+ * the driver's private data
+ *
+ * in memory it'll be like this:
+ *
+ * +-------------------------+
+ * | struct wpan_phy |
+ * +-------------------------+
+ * | struct ieee802154_local |
+ * +-------------------------+
+ * | driver's private data |
+ * +-------------------------+
+ *
+ * Due to ieee802154 layer isn't aware of driver and MAC structures,
+ * so lets align them here.
+ */
+
+ priv_size = ALIGN(sizeof(*local), NETDEV_ALIGN) + priv_data_len;
+
+ phy = wpan_phy_new(&mac802154_config_ops, priv_size);
+ if (!phy) {
+ pr_err("failure to allocate master IEEE802.15.4 device\n");
+ return NULL;
+ }
+
+ phy->privid = mac802154_wpan_phy_privid;
+
+ local = wpan_phy_priv(phy);
+ local->phy = phy;
+ local->hw.phy = local->phy;
+ local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN);
+ local->ops = ops;
+
+ INIT_LIST_HEAD(&local->interfaces);
+ mutex_init(&local->iflist_mtx);
+
+ tasklet_init(&local->tasklet,
+ ieee802154_tasklet_handler,
+ (unsigned long)local);
+
+ skb_queue_head_init(&local->skb_queue);
+
+ return &local->hw;
+}
+EXPORT_SYMBOL(ieee802154_alloc_hw);
+
+void ieee802154_free_hw(struct ieee802154_hw *hw)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+
+ BUG_ON(!list_empty(&local->interfaces));
+
+ mutex_destroy(&local->iflist_mtx);
+
+ wpan_phy_free(local->phy);
+}
+EXPORT_SYMBOL(ieee802154_free_hw);
+
+static void ieee802154_setup_wpan_phy_pib(struct wpan_phy *wpan_phy)
+{
+ /* TODO warn on empty symbol_duration
+ * Should be done when all drivers sets this value.
+ */
+
+ wpan_phy->lifs_period = IEEE802154_LIFS_PERIOD *
+ wpan_phy->symbol_duration;
+ wpan_phy->sifs_period = IEEE802154_SIFS_PERIOD *
+ wpan_phy->symbol_duration;
+}
+
+int ieee802154_register_hw(struct ieee802154_hw *hw)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+ struct net_device *dev;
+ int rc = -ENOSYS;
+
+ local->workqueue =
+ create_singlethread_workqueue(wpan_phy_name(local->phy));
+ if (!local->workqueue) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ hrtimer_init(&local->ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ local->ifs_timer.function = ieee802154_xmit_ifs_timer;
+
+ wpan_phy_set_dev(local->phy, local->hw.parent);
+
+ ieee802154_setup_wpan_phy_pib(local->phy);
+
+ rc = wpan_phy_register(local->phy);
+ if (rc < 0)
+ goto out_wq;
+
+ rtnl_lock();
+
+ dev = ieee802154_if_add(local, "wpan%d", NET_NAME_ENUM,
+ NL802154_IFTYPE_NODE,
+ cpu_to_le64(0x0000000000000000ULL));
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ rc = PTR_ERR(dev);
+ goto out_phy;
+ }
+
+ rtnl_unlock();
+
+ return 0;
+
+out_phy:
+ wpan_phy_unregister(local->phy);
+out_wq:
+ destroy_workqueue(local->workqueue);
+out:
+ return rc;
+}
+EXPORT_SYMBOL(ieee802154_register_hw);
+
+void ieee802154_unregister_hw(struct ieee802154_hw *hw)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+
+ tasklet_kill(&local->tasklet);
+ flush_workqueue(local->workqueue);
+ destroy_workqueue(local->workqueue);
+
+ rtnl_lock();
+
+ ieee802154_remove_interfaces(local);
+
+ rtnl_unlock();
+
+ wpan_phy_unregister(local->phy);
+}
+EXPORT_SYMBOL(ieee802154_unregister_hw);
+
+static int __init ieee802154_init(void)
+{
+ return ieee802154_iface_init();
+}
+
+static void __exit ieee802154_exit(void)
+{
+ ieee802154_iface_exit();
+
+ rcu_barrier();
+}
+
+subsys_initcall(ieee802154_init);
+module_exit(ieee802154_exit);
+
+MODULE_DESCRIPTION("IEEE 802.15.4 subsystem");
+MODULE_LICENSE("GPL v2");
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
new file mode 100644
index 000000000..5cf019a57
--- /dev/null
+++ b/net/mac802154/mib.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#include <linux/if_arp.h>
+
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154_i.h"
+#include "driver-ops.h"
+
+void mac802154_dev_set_short_addr(struct net_device *dev, __le16 val)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ spin_lock_bh(&sdata->mib_lock);
+ sdata->wpan_dev.short_addr = val;
+ spin_unlock_bh(&sdata->mib_lock);
+}
+
+__le16 mac802154_dev_get_short_addr(const struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ __le16 ret;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ spin_lock_bh(&sdata->mib_lock);
+ ret = sdata->wpan_dev.short_addr;
+ spin_unlock_bh(&sdata->mib_lock);
+
+ return ret;
+}
+
+__le16 mac802154_dev_get_pan_id(const struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ __le16 ret;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ spin_lock_bh(&sdata->mib_lock);
+ ret = sdata->wpan_dev.pan_id;
+ spin_unlock_bh(&sdata->mib_lock);
+
+ return ret;
+}
+
+void mac802154_dev_set_pan_id(struct net_device *dev, __le16 val)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ spin_lock_bh(&sdata->mib_lock);
+ sdata->wpan_dev.pan_id = val;
+ spin_unlock_bh(&sdata->mib_lock);
+}
+
+u8 mac802154_dev_get_dsn(const struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ return sdata->wpan_dev.dsn++;
+}
+
+void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ struct ieee802154_local *local = sdata->local;
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ res = drv_set_channel(local, page, chan);
+ if (res) {
+ pr_debug("set_channel failed\n");
+ } else {
+ mutex_lock(&local->phy->pib_lock);
+ local->phy->current_channel = chan;
+ local->phy->current_page = page;
+ mutex_unlock(&local->phy->pib_lock);
+ }
+}
+
+int mac802154_get_params(struct net_device *dev,
+ struct ieee802154_llsec_params *params)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_get_params(&sdata->sec, params);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_set_params(struct net_device *dev,
+ const struct ieee802154_llsec_params *params,
+ int changed)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_set_params(&sdata->sec, params, changed);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_add_key(struct net_device *dev,
+ const struct ieee802154_llsec_key_id *id,
+ const struct ieee802154_llsec_key *key)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_key_add(&sdata->sec, id, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_del_key(struct net_device *dev,
+ const struct ieee802154_llsec_key_id *id)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_key_del(&sdata->sec, id);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_add_dev(struct net_device *dev,
+ const struct ieee802154_llsec_device *llsec_dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_dev_add(&sdata->sec, llsec_dev);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_del_dev(struct net_device *dev, __le64 dev_addr)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_dev_del(&sdata->sec, dev_addr);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_add_devkey(struct net_device *dev,
+ __le64 device_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_devkey_add(&sdata->sec, device_addr, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_del_devkey(struct net_device *dev,
+ __le64 device_addr,
+ const struct ieee802154_llsec_device_key *key)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_devkey_del(&sdata->sec, device_addr, key);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_add_seclevel(struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_seclevel_add(&sdata->sec, sl);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+int mac802154_del_seclevel(struct net_device *dev,
+ const struct ieee802154_llsec_seclevel *sl)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int res;
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+ res = mac802154_llsec_seclevel_del(&sdata->sec, sl);
+ mutex_unlock(&sdata->sec_mtx);
+
+ return res;
+}
+
+void mac802154_lock_table(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_lock(&sdata->sec_mtx);
+}
+
+void mac802154_get_table(struct net_device *dev,
+ struct ieee802154_llsec_table **t)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ *t = &sdata->sec.table;
+}
+
+void mac802154_unlock_table(struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+ mutex_unlock(&sdata->sec_mtx);
+}
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
new file mode 100644
index 000000000..c0d67b2b4
--- /dev/null
+++ b/net/mac802154/rx.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/crc-ccitt.h>
+#include <asm/unaligned.h>
+
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/nl802154.h>
+
+#include "ieee802154_i.h"
+
+static int ieee802154_deliver_skb(struct sk_buff *skb)
+{
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ return netif_receive_skb(skb);
+}
+
+static int
+ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata,
+ struct sk_buff *skb, const struct ieee802154_hdr *hdr)
+{
+ struct wpan_dev *wpan_dev = &sdata->wpan_dev;
+ __le16 span, sshort;
+ int rc;
+
+ pr_debug("getting packet via slave interface %s\n", sdata->dev->name);
+
+ spin_lock_bh(&sdata->mib_lock);
+
+ span = wpan_dev->pan_id;
+ sshort = wpan_dev->short_addr;
+
+ switch (mac_cb(skb)->dest.mode) {
+ case IEEE802154_ADDR_NONE:
+ if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE)
+ /* FIXME: check if we are PAN coordinator */
+ skb->pkt_type = PACKET_OTHERHOST;
+ else
+ /* ACK comes with both addresses empty */
+ skb->pkt_type = PACKET_HOST;
+ break;
+ case IEEE802154_ADDR_LONG:
+ if (mac_cb(skb)->dest.pan_id != span &&
+ mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST))
+ skb->pkt_type = PACKET_OTHERHOST;
+ else if (mac_cb(skb)->dest.extended_addr == wpan_dev->extended_addr)
+ skb->pkt_type = PACKET_HOST;
+ else
+ skb->pkt_type = PACKET_OTHERHOST;
+ break;
+ case IEEE802154_ADDR_SHORT:
+ if (mac_cb(skb)->dest.pan_id != span &&
+ mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST))
+ skb->pkt_type = PACKET_OTHERHOST;
+ else if (mac_cb(skb)->dest.short_addr == sshort)
+ skb->pkt_type = PACKET_HOST;
+ else if (mac_cb(skb)->dest.short_addr ==
+ cpu_to_le16(IEEE802154_ADDR_BROADCAST))
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_OTHERHOST;
+ break;
+ default:
+ spin_unlock_bh(&sdata->mib_lock);
+ pr_debug("invalid dest mode\n");
+ goto fail;
+ }
+
+ spin_unlock_bh(&sdata->mib_lock);
+
+ skb->dev = sdata->dev;
+
+ rc = mac802154_llsec_decrypt(&sdata->sec, skb);
+ if (rc) {
+ pr_debug("decryption failed: %i\n", rc);
+ goto fail;
+ }
+
+ sdata->dev->stats.rx_packets++;
+ sdata->dev->stats.rx_bytes += skb->len;
+
+ switch (mac_cb(skb)->type) {
+ case IEEE802154_FC_TYPE_DATA:
+ return ieee802154_deliver_skb(skb);
+ default:
+ pr_warn("ieee802154: bad frame received (type = %d)\n",
+ mac_cb(skb)->type);
+ goto fail;
+ }
+
+fail:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void
+ieee802154_print_addr(const char *name, const struct ieee802154_addr *addr)
+{
+ if (addr->mode == IEEE802154_ADDR_NONE)
+ pr_debug("%s not present\n", name);
+
+ pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id));
+ if (addr->mode == IEEE802154_ADDR_SHORT) {
+ pr_debug("%s is short: %04x\n", name,
+ le16_to_cpu(addr->short_addr));
+ } else {
+ u64 hw = swab64((__force u64)addr->extended_addr);
+
+ pr_debug("%s is hardware: %8phC\n", name, &hw);
+ }
+}
+
+static int
+ieee802154_parse_frame_start(struct sk_buff *skb, struct ieee802154_hdr *hdr)
+{
+ int hlen;
+ struct ieee802154_mac_cb *cb = mac_cb_init(skb);
+
+ skb_reset_mac_header(skb);
+
+ hlen = ieee802154_hdr_pull(skb, hdr);
+ if (hlen < 0)
+ return -EINVAL;
+
+ skb->mac_len = hlen;
+
+ pr_debug("fc: %04x dsn: %02x\n", le16_to_cpup((__le16 *)&hdr->fc),
+ hdr->seq);
+
+ cb->type = hdr->fc.type;
+ cb->ackreq = hdr->fc.ack_request;
+ cb->secen = hdr->fc.security_enabled;
+
+ ieee802154_print_addr("destination", &hdr->dest);
+ ieee802154_print_addr("source", &hdr->source);
+
+ cb->source = hdr->source;
+ cb->dest = hdr->dest;
+
+ if (hdr->fc.security_enabled) {
+ u64 key;
+
+ pr_debug("seclevel %i\n", hdr->sec.level);
+
+ switch (hdr->sec.key_id_mode) {
+ case IEEE802154_SCF_KEY_IMPLICIT:
+ pr_debug("implicit key\n");
+ break;
+
+ case IEEE802154_SCF_KEY_INDEX:
+ pr_debug("key %02x\n", hdr->sec.key_id);
+ break;
+
+ case IEEE802154_SCF_KEY_SHORT_INDEX:
+ pr_debug("key %04x:%04x %02x\n",
+ le32_to_cpu(hdr->sec.short_src) >> 16,
+ le32_to_cpu(hdr->sec.short_src) & 0xffff,
+ hdr->sec.key_id);
+ break;
+
+ case IEEE802154_SCF_KEY_HW_INDEX:
+ key = swab64((__force u64)hdr->sec.extended_src);
+ pr_debug("key source %8phC %02x\n", &key,
+ hdr->sec.key_id);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void
+__ieee802154_rx_handle_packet(struct ieee802154_local *local,
+ struct sk_buff *skb)
+{
+ int ret;
+ struct ieee802154_sub_if_data *sdata;
+ struct ieee802154_hdr hdr;
+
+ ret = ieee802154_parse_frame_start(skb, &hdr);
+ if (ret) {
+ pr_debug("got invalid frame\n");
+ kfree_skb(skb);
+ return;
+ }
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL802154_IFTYPE_NODE ||
+ !netif_running(sdata->dev))
+ continue;
+
+ ieee802154_subif_frame(sdata, skb, &hdr);
+ skb = NULL;
+ break;
+ }
+
+ if (skb)
+ kfree_skb(skb);
+}
+
+static void
+ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb)
+{
+ struct sk_buff *skb2;
+ struct ieee802154_sub_if_data *sdata;
+
+ skb_reset_mac_header(skb);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = htons(ETH_P_IEEE802154);
+
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (sdata->vif.type != NL802154_IFTYPE_MONITOR)
+ continue;
+
+ if (!ieee802154_sdata_running(sdata))
+ continue;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ skb2->dev = sdata->dev;
+ ieee802154_deliver_skb(skb2);
+
+ sdata->dev->stats.rx_packets++;
+ sdata->dev->stats.rx_bytes += skb->len;
+ }
+ }
+}
+
+void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+ u16 crc;
+
+ WARN_ON_ONCE(softirq_count() == 0);
+
+ /* TODO: When a transceiver omits the checksum here, we
+ * add an own calculated one. This is currently an ugly
+ * solution because the monitor needs a crc here.
+ */
+ if (local->hw.flags & IEEE802154_HW_RX_OMIT_CKSUM) {
+ crc = crc_ccitt(0, skb->data, skb->len);
+ put_unaligned_le16(crc, skb_put(skb, 2));
+ }
+
+ rcu_read_lock();
+
+ ieee802154_monitors_rx(local, skb);
+
+ /* Check if transceiver doesn't validate the checksum.
+ * If not we validate the checksum here.
+ */
+ if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM) {
+ crc = crc_ccitt(0, skb->data, skb->len);
+ if (crc) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return;
+ }
+ }
+ /* remove crc */
+ skb_trim(skb, skb->len - 2);
+
+ __ieee802154_rx_handle_packet(local, skb);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee802154_rx);
+
+void
+ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+
+ mac_cb(skb)->lqi = lqi;
+ skb->pkt_type = IEEE802154_RX_MSG;
+ skb_queue_tail(&local->skb_queue, skb);
+ tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee802154_rx_irqsafe);
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
new file mode 100644
index 000000000..c62e95695
--- /dev/null
+++ b/net/mac802154/tx.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2007-2012 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Written by:
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/crc-ccitt.h>
+#include <asm/unaligned.h>
+
+#include <net/rtnetlink.h>
+#include <net/ieee802154_netdev.h>
+#include <net/mac802154.h>
+#include <net/cfg802154.h>
+
+#include "ieee802154_i.h"
+#include "driver-ops.h"
+
+/* IEEE 802.15.4 transceivers can sleep during the xmit session, so process
+ * packets through the workqueue.
+ */
+struct ieee802154_xmit_cb {
+ struct sk_buff *skb;
+ struct work_struct work;
+ struct ieee802154_local *local;
+};
+
+static struct ieee802154_xmit_cb ieee802154_xmit_cb;
+
+static void ieee802154_xmit_worker(struct work_struct *work)
+{
+ struct ieee802154_xmit_cb *cb =
+ container_of(work, struct ieee802154_xmit_cb, work);
+ struct ieee802154_local *local = cb->local;
+ struct sk_buff *skb = cb->skb;
+ struct net_device *dev = skb->dev;
+ int res;
+
+ rtnl_lock();
+
+ /* check if ifdown occurred while schedule */
+ if (!netif_running(dev))
+ goto err_tx;
+
+ res = drv_xmit_sync(local, skb);
+ if (res)
+ goto err_tx;
+
+ ieee802154_xmit_complete(&local->hw, skb, false);
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+
+ rtnl_unlock();
+
+ return;
+
+err_tx:
+ /* Restart the netif queue on each sub_if_data object. */
+ ieee802154_wake_queue(&local->hw);
+ rtnl_unlock();
+ kfree_skb(skb);
+ netdev_dbg(dev, "transmission failed\n");
+}
+
+static netdev_tx_t
+ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ int ret;
+
+ if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) {
+ u16 crc = crc_ccitt(0, skb->data, skb->len);
+
+ put_unaligned_le16(crc, skb_put(skb, 2));
+ }
+
+ if (skb_cow_head(skb, local->hw.extra_tx_headroom))
+ goto err_tx;
+
+ /* Stop the netif queue on each sub_if_data object. */
+ ieee802154_stop_queue(&local->hw);
+
+ /* async is priority, otherwise sync is fallback */
+ if (local->ops->xmit_async) {
+ ret = drv_xmit_async(local, skb);
+ if (ret) {
+ ieee802154_wake_queue(&local->hw);
+ goto err_tx;
+ }
+
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += skb->len;
+ } else {
+ INIT_WORK(&ieee802154_xmit_cb.work, ieee802154_xmit_worker);
+ ieee802154_xmit_cb.skb = skb;
+ ieee802154_xmit_cb.local = local;
+
+ queue_work(local->workqueue, &ieee802154_xmit_cb.work);
+ }
+
+ return NETDEV_TX_OK;
+
+err_tx:
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+netdev_tx_t
+ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+
+ skb->skb_iif = dev->ifindex;
+
+ return ieee802154_tx(sdata->local, skb);
+}
+
+netdev_tx_t
+ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev);
+ int rc;
+
+ rc = mac802154_llsec_encrypt(&sdata->sec, skb);
+ if (rc) {
+ netdev_warn(dev, "encryption failed: %i\n", rc);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ skb->skb_iif = dev->ifindex;
+
+ return ieee802154_tx(sdata->local, skb);
+}
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
new file mode 100644
index 000000000..150bf807e
--- /dev/null
+++ b/net/mac802154/util.c
@@ -0,0 +1,95 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * Alexander Aring <aar@pengutronix.de>
+ *
+ * Based on: net/mac80211/util.c
+ */
+
+#include "ieee802154_i.h"
+
+/* privid for wpan_phys to determine whether they belong to us or not */
+const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid;
+
+void ieee802154_wake_queue(struct ieee802154_hw *hw)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+ struct ieee802154_sub_if_data *sdata;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!sdata->dev)
+ continue;
+
+ netif_wake_queue(sdata->dev);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee802154_wake_queue);
+
+void ieee802154_stop_queue(struct ieee802154_hw *hw)
+{
+ struct ieee802154_local *local = hw_to_local(hw);
+ struct ieee802154_sub_if_data *sdata;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!sdata->dev)
+ continue;
+
+ netif_stop_queue(sdata->dev);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee802154_stop_queue);
+
+enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer)
+{
+ struct ieee802154_local *local =
+ container_of(timer, struct ieee802154_local, ifs_timer);
+
+ ieee802154_wake_queue(&local->hw);
+
+ return HRTIMER_NORESTART;
+}
+
+void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
+ bool ifs_handling)
+{
+ if (ifs_handling) {
+ struct ieee802154_local *local = hw_to_local(hw);
+ u8 max_sifs_size;
+
+ /* If transceiver sets CRC on his own we need to use lifs
+ * threshold len above 16 otherwise 18, because it's not
+ * part of skb->len.
+ */
+ if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM)
+ max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE -
+ IEEE802154_FCS_LEN;
+ else
+ max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE;
+
+ if (skb->len > max_sifs_size)
+ hrtimer_start(&local->ifs_timer,
+ ktime_set(0, hw->phy->lifs_period * NSEC_PER_USEC),
+ HRTIMER_MODE_REL);
+ else
+ hrtimer_start(&local->ifs_timer,
+ ktime_set(0, hw->phy->sifs_period * NSEC_PER_USEC),
+ HRTIMER_MODE_REL);
+
+ consume_skb(skb);
+ } else {
+ ieee802154_wake_queue(hw);
+ consume_skb(skb);
+ }
+}
+EXPORT_SYMBOL(ieee802154_xmit_complete);
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
new file mode 100644
index 000000000..17bde799c
--- /dev/null
+++ b/net/mpls/Kconfig
@@ -0,0 +1,30 @@
+#
+# MPLS configuration
+#
+
+menuconfig MPLS
+ bool "MultiProtocol Label Switching"
+ default n
+ ---help---
+ MultiProtocol Label Switching routes packets through logical
+ circuits. Originally conceived as a way of routing packets at
+ hardware speeds (before hardware was capable of routing ipv4 packets),
+ MPLS remains a simple way of making tunnels.
+
+ If you have not heard of MPLS you probably want to say N here.
+
+if MPLS
+
+config NET_MPLS_GSO
+ tristate "MPLS: GSO support"
+ help
+ This is helper module to allow segmentation of non-MPLS GSO packets
+ that have had MPLS stack entries pushed onto them and thus
+ become MPLS GSO packets.
+
+config MPLS_ROUTING
+ tristate "MPLS: routing support"
+ help
+ Add support for forwarding of mpls packets.
+
+endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
new file mode 100644
index 000000000..65bbe68c7
--- /dev/null
+++ b/net/mpls/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for MPLS.
+#
+obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
+obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+
+mpls_router-y := af_mpls.o
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
new file mode 100644
index 000000000..1f93a5978
--- /dev/null
+++ b/net/mpls/af_mpls.c
@@ -0,0 +1,1153 @@
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include "internal.h"
+
+#define LABEL_NOT_SPECIFIED (1<<20)
+#define MAX_NEW_LABELS 2
+
+/* This maximum ha length copied from the definition of struct neighbour */
+#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
+
+struct mpls_route { /* next hop label forwarding entry */
+ struct net_device __rcu *rt_dev;
+ struct rcu_head rt_rcu;
+ u32 rt_label[MAX_NEW_LABELS];
+ u8 rt_protocol; /* routing protocol that set this entry */
+ u8 rt_labels;
+ u8 rt_via_alen;
+ u8 rt_via_table;
+ u8 rt_via[0];
+};
+
+static int zero = 0;
+static int label_limit = (1 << 20) - 1;
+
+static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
+ struct nlmsghdr *nlh, struct net *net, u32 portid,
+ unsigned int nlm_flags);
+
+static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
+{
+ struct mpls_route *rt = NULL;
+
+ if (index < net->mpls.platform_labels) {
+ struct mpls_route __rcu **platform_label =
+ rcu_dereference(net->mpls.platform_label);
+ rt = rcu_dereference(platform_label[index]);
+ }
+ return rt;
+}
+
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+ return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
+static bool mpls_output_possible(const struct net_device *dev)
+{
+ return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
+}
+
+static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+{
+ /* The size of the layer 2.5 labels to be added for this route */
+ return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+}
+
+static unsigned int mpls_dev_mtu(const struct net_device *dev)
+{
+ /* The amount of data the layer 2 frame can hold */
+ return dev->mtu;
+}
+
+static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+ return false;
+
+ return true;
+}
+
+static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
+ struct mpls_entry_decoded dec)
+{
+ /* RFC4385 and RFC5586 encode other packets in mpls such that
+ * they don't conflict with the ip version number, making
+ * decoding by examining the ip version correct in everything
+ * except for the strangest cases.
+ *
+ * The strange cases if we choose to support them will require
+ * manual configuration.
+ */
+ struct iphdr *hdr4;
+ bool success = true;
+
+ /* The IPv4 code below accesses through the IPv4 header
+ * checksum, which is 12 bytes into the packet.
+ * The IPv6 code below accesses through the IPv6 hop limit
+ * which is 8 bytes into the packet.
+ *
+ * For all supported cases there should always be at least 12
+ * bytes of packet data present. The IPv4 header is 20 bytes
+ * without options and the IPv6 header is always 40 bytes
+ * long.
+ */
+ if (!pskb_may_pull(skb, 12))
+ return false;
+
+ /* Use ip_hdr to find the ip protocol version */
+ hdr4 = ip_hdr(skb);
+ if (hdr4->version == 4) {
+ skb->protocol = htons(ETH_P_IP);
+ csum_replace2(&hdr4->check,
+ htons(hdr4->ttl << 8),
+ htons(dec.ttl << 8));
+ hdr4->ttl = dec.ttl;
+ }
+ else if (hdr4->version == 6) {
+ struct ipv6hdr *hdr6 = ipv6_hdr(skb);
+ skb->protocol = htons(ETH_P_IPV6);
+ hdr6->hop_limit = dec.ttl;
+ }
+ else
+ /* version 0 and version 1 are used by pseudo wires */
+ success = false;
+ return success;
+}
+
+static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+ struct mpls_shim_hdr *hdr;
+ struct mpls_route *rt;
+ struct mpls_entry_decoded dec;
+ struct net_device *out_dev;
+ struct mpls_dev *mdev;
+ unsigned int hh_len;
+ unsigned int new_header_size;
+ unsigned int mtu;
+ int err;
+
+ /* Careful this entire function runs inside of an rcu critical section */
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev || !mdev->input_enabled)
+ goto drop;
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto drop;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr)))
+ goto drop;
+
+ /* Read and decode the label */
+ hdr = mpls_hdr(skb);
+ dec = mpls_entry_decode(hdr);
+
+ /* Pop the label */
+ skb_pull(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+
+ skb_orphan(skb);
+
+ rt = mpls_route_input_rcu(net, dec.label);
+ if (!rt)
+ goto drop;
+
+ /* Find the output device */
+ out_dev = rcu_dereference(rt->rt_dev);
+ if (!mpls_output_possible(out_dev))
+ goto drop;
+
+ if (skb_warn_if_lro(skb))
+ goto drop;
+
+ skb_forward_csum(skb);
+
+ /* Verify ttl is valid */
+ if (dec.ttl <= 1)
+ goto drop;
+ dec.ttl -= 1;
+
+ /* Verify the destination can hold the packet */
+ new_header_size = mpls_rt_header_size(rt);
+ mtu = mpls_dev_mtu(out_dev);
+ if (mpls_pkt_too_big(skb, mtu - new_header_size))
+ goto drop;
+
+ hh_len = LL_RESERVED_SPACE(out_dev);
+ if (!out_dev->header_ops)
+ hh_len = 0;
+
+ /* Ensure there is enough space for the headers in the skb */
+ if (skb_cow(skb, hh_len + new_header_size))
+ goto drop;
+
+ skb->dev = out_dev;
+ skb->protocol = htons(ETH_P_MPLS_UC);
+
+ if (unlikely(!new_header_size && dec.bos)) {
+ /* Penultimate hop popping */
+ if (!mpls_egress(rt, skb, dec))
+ goto drop;
+ } else {
+ bool bos;
+ int i;
+ skb_push(skb, new_header_size);
+ skb_reset_network_header(skb);
+ /* Push the new labels */
+ hdr = mpls_hdr(skb);
+ bos = dec.bos;
+ for (i = rt->rt_labels - 1; i >= 0; i--) {
+ hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
+ bos = false;
+ }
+ }
+
+ err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
+ if (err)
+ net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+ __func__, err);
+ return 0;
+
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct packet_type mpls_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MPLS_UC),
+ .func = mpls_forward,
+};
+
+static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_OIF] = { .type = NLA_U32 },
+};
+
+struct mpls_route_config {
+ u32 rc_protocol;
+ u32 rc_ifindex;
+ u16 rc_via_table;
+ u16 rc_via_alen;
+ u8 rc_via[MAX_VIA_ALEN];
+ u32 rc_label;
+ u32 rc_output_labels;
+ u32 rc_output_label[MAX_NEW_LABELS];
+ u32 rc_nlflags;
+ struct nl_info rc_nlinfo;
+};
+
+static struct mpls_route *mpls_rt_alloc(size_t alen)
+{
+ struct mpls_route *rt;
+
+ rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
+ if (rt)
+ rt->rt_via_alen = alen;
+ return rt;
+}
+
+static void mpls_rt_free(struct mpls_route *rt)
+{
+ if (rt)
+ kfree_rcu(rt, rt_rcu);
+}
+
+static void mpls_notify_route(struct net *net, unsigned index,
+ struct mpls_route *old, struct mpls_route *new,
+ const struct nl_info *info)
+{
+ struct nlmsghdr *nlh = info ? info->nlh : NULL;
+ unsigned portid = info ? info->portid : 0;
+ int event = new ? RTM_NEWROUTE : RTM_DELROUTE;
+ struct mpls_route *rt = new ? new : old;
+ unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0;
+ /* Ignore reserved labels for now */
+ if (rt && (index >= 16))
+ rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags);
+}
+
+static void mpls_route_update(struct net *net, unsigned index,
+ struct net_device *dev, struct mpls_route *new,
+ const struct nl_info *info)
+{
+ struct mpls_route __rcu **platform_label;
+ struct mpls_route *rt, *old = NULL;
+
+ ASSERT_RTNL();
+
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ rt = rtnl_dereference(platform_label[index]);
+ if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
+ rcu_assign_pointer(platform_label[index], new);
+ old = rt;
+ }
+
+ mpls_notify_route(net, index, old, new, info);
+
+ /* If we removed a route free it now */
+ mpls_rt_free(old);
+}
+
+static unsigned find_free_label(struct net *net)
+{
+ struct mpls_route __rcu **platform_label;
+ size_t platform_labels;
+ unsigned index;
+
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ platform_labels = net->mpls.platform_labels;
+ for (index = 16; index < platform_labels; index++) {
+ if (!rtnl_dereference(platform_label[index]))
+ return index;
+ }
+ return LABEL_NOT_SPECIFIED;
+}
+
+static int mpls_route_add(struct mpls_route_config *cfg)
+{
+ struct mpls_route __rcu **platform_label;
+ struct net *net = cfg->rc_nlinfo.nl_net;
+ struct net_device *dev = NULL;
+ struct mpls_route *rt, *old;
+ unsigned index;
+ int i;
+ int err = -EINVAL;
+
+ index = cfg->rc_label;
+
+ /* If a label was not specified during insert pick one */
+ if ((index == LABEL_NOT_SPECIFIED) &&
+ (cfg->rc_nlflags & NLM_F_CREATE)) {
+ index = find_free_label(net);
+ }
+
+ /* The first 16 labels are reserved, and may not be set */
+ if (index < 16)
+ goto errout;
+
+ /* The full 20 bit range may not be supported. */
+ if (index >= net->mpls.platform_labels)
+ goto errout;
+
+ /* Ensure only a supported number of labels are present */
+ if (cfg->rc_output_labels > MAX_NEW_LABELS)
+ goto errout;
+
+ err = -ENODEV;
+ dev = dev_get_by_index(net, cfg->rc_ifindex);
+ if (!dev)
+ goto errout;
+
+ /* Ensure this is a supported device */
+ err = -EINVAL;
+ if (!mpls_dev_get(dev))
+ goto errout;
+
+ err = -EINVAL;
+ if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
+ (dev->addr_len != cfg->rc_via_alen))
+ goto errout;
+
+ /* Append makes no sense with mpls */
+ err = -EOPNOTSUPP;
+ if (cfg->rc_nlflags & NLM_F_APPEND)
+ goto errout;
+
+ err = -EEXIST;
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ old = rtnl_dereference(platform_label[index]);
+ if ((cfg->rc_nlflags & NLM_F_EXCL) && old)
+ goto errout;
+
+ err = -EEXIST;
+ if (!(cfg->rc_nlflags & NLM_F_REPLACE) && old)
+ goto errout;
+
+ err = -ENOENT;
+ if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
+ goto errout;
+
+ err = -ENOMEM;
+ rt = mpls_rt_alloc(cfg->rc_via_alen);
+ if (!rt)
+ goto errout;
+
+ rt->rt_labels = cfg->rc_output_labels;
+ for (i = 0; i < rt->rt_labels; i++)
+ rt->rt_label[i] = cfg->rc_output_label[i];
+ rt->rt_protocol = cfg->rc_protocol;
+ RCU_INIT_POINTER(rt->rt_dev, dev);
+ rt->rt_via_table = cfg->rc_via_table;
+ memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
+
+ mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
+
+ dev_put(dev);
+ return 0;
+
+errout:
+ if (dev)
+ dev_put(dev);
+ return err;
+}
+
+static int mpls_route_del(struct mpls_route_config *cfg)
+{
+ struct net *net = cfg->rc_nlinfo.nl_net;
+ unsigned index;
+ int err = -EINVAL;
+
+ index = cfg->rc_label;
+
+ /* The first 16 labels are reserved, and may not be removed */
+ if (index < 16)
+ goto errout;
+
+ /* The full 20 bit range may not be supported */
+ if (index >= net->mpls.platform_labels)
+ goto errout;
+
+ mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
+
+ err = 0;
+errout:
+ return err;
+}
+
+#define MPLS_PERDEV_SYSCTL_OFFSET(field) \
+ (&((struct mpls_dev *)0)->field)
+
+static const struct ctl_table mpls_dev_table[] = {
+ {
+ .procname = "input",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
+ },
+ { }
+};
+
+static int mpls_dev_sysctl_register(struct net_device *dev,
+ struct mpls_dev *mdev)
+{
+ char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+ struct ctl_table *table;
+ int i;
+
+ table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ /* Table data contains only offsets relative to the base of
+ * the mdev at this point, so make them absolute.
+ */
+ for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+ table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+
+ snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
+
+ mdev->sysctl = register_net_sysctl(dev_net(dev), path, table);
+ if (!mdev->sysctl)
+ goto free;
+
+ return 0;
+
+free:
+ kfree(table);
+out:
+ return -ENOBUFS;
+}
+
+static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev)
+{
+ struct ctl_table *table;
+
+ table = mdev->sysctl->ctl_table_arg;
+ unregister_net_sysctl_table(mdev->sysctl);
+ kfree(table);
+}
+
+static struct mpls_dev *mpls_add_dev(struct net_device *dev)
+{
+ struct mpls_dev *mdev;
+ int err = -ENOMEM;
+
+ ASSERT_RTNL();
+
+ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+ if (!mdev)
+ return ERR_PTR(err);
+
+ err = mpls_dev_sysctl_register(dev, mdev);
+ if (err)
+ goto free;
+
+ rcu_assign_pointer(dev->mpls_ptr, mdev);
+
+ return mdev;
+
+free:
+ kfree(mdev);
+ return ERR_PTR(err);
+}
+
+static void mpls_ifdown(struct net_device *dev)
+{
+ struct mpls_route __rcu **platform_label;
+ struct net *net = dev_net(dev);
+ struct mpls_dev *mdev;
+ unsigned index;
+
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ for (index = 0; index < net->mpls.platform_labels; index++) {
+ struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+ if (!rt)
+ continue;
+ if (rtnl_dereference(rt->rt_dev) != dev)
+ continue;
+ rt->rt_dev = NULL;
+ }
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ return;
+
+ mpls_dev_sysctl_unregister(mdev);
+
+ RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+
+ kfree_rcu(mdev, rcu);
+}
+
+static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct mpls_dev *mdev;
+
+ switch(event) {
+ case NETDEV_REGISTER:
+ /* For now just support ethernet devices */
+ if ((dev->type == ARPHRD_ETHER) ||
+ (dev->type == ARPHRD_LOOPBACK)) {
+ mdev = mpls_add_dev(dev);
+ if (IS_ERR(mdev))
+ return notifier_from_errno(PTR_ERR(mdev));
+ }
+ break;
+
+ case NETDEV_UNREGISTER:
+ mpls_ifdown(dev);
+ break;
+ case NETDEV_CHANGENAME:
+ mdev = mpls_dev_get(dev);
+ if (mdev) {
+ int err;
+
+ mpls_dev_sysctl_unregister(mdev);
+ err = mpls_dev_sysctl_register(dev, mdev);
+ if (err)
+ return notifier_from_errno(err);
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mpls_dev_notifier = {
+ .notifier_call = mpls_dev_notify,
+};
+
+static int nla_put_via(struct sk_buff *skb,
+ u8 table, const void *addr, int alen)
+{
+ static const int table_to_family[NEIGH_NR_TABLES + 1] = {
+ AF_INET, AF_INET6, AF_DECnet, AF_PACKET,
+ };
+ struct nlattr *nla;
+ struct rtvia *via;
+ int family = AF_UNSPEC;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (table <= NEIGH_NR_TABLES)
+ family = table_to_family[table];
+
+ via = nla_data(nla);
+ via->rtvia_family = family;
+ memcpy(via->rtvia_addr, addr, alen);
+ return 0;
+}
+
+int nla_put_labels(struct sk_buff *skb, int attrtype,
+ u8 labels, const u32 label[])
+{
+ struct nlattr *nla;
+ struct mpls_shim_hdr *nla_label;
+ bool bos;
+ int i;
+ nla = nla_reserve(skb, attrtype, labels*4);
+ if (!nla)
+ return -EMSGSIZE;
+
+ nla_label = nla_data(nla);
+ bos = true;
+ for (i = labels - 1; i >= 0; i--) {
+ nla_label[i] = mpls_entry_encode(label[i], 0, 0, bos);
+ bos = false;
+ }
+
+ return 0;
+}
+
+int nla_get_labels(const struct nlattr *nla,
+ u32 max_labels, u32 *labels, u32 label[])
+{
+ unsigned len = nla_len(nla);
+ unsigned nla_labels;
+ struct mpls_shim_hdr *nla_label;
+ bool bos;
+ int i;
+
+ /* len needs to be an even multiple of 4 (the label size) */
+ if (len & 3)
+ return -EINVAL;
+
+ /* Limit the number of new labels allowed */
+ nla_labels = len/4;
+ if (nla_labels > max_labels)
+ return -EINVAL;
+
+ nla_label = nla_data(nla);
+ bos = true;
+ for (i = nla_labels - 1; i >= 0; i--, bos = false) {
+ struct mpls_entry_decoded dec;
+ dec = mpls_entry_decode(nla_label + i);
+
+ /* Ensure the bottom of stack flag is properly set
+ * and ttl and tc are both clear.
+ */
+ if ((dec.bos != bos) || dec.ttl || dec.tc)
+ return -EINVAL;
+
+ switch (dec.label) {
+ case MPLS_LABEL_IMPLNULL:
+ /* RFC3032: This is a label that an LSR may
+ * assign and distribute, but which never
+ * actually appears in the encapsulation.
+ */
+ return -EINVAL;
+ }
+
+ label[i] = dec.label;
+ }
+ *labels = nla_labels;
+ return 0;
+}
+
+static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct mpls_route_config *cfg)
+{
+ struct rtmsg *rtm;
+ struct nlattr *tb[RTA_MAX+1];
+ int index;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ rtm = nlmsg_data(nlh);
+ memset(cfg, 0, sizeof(*cfg));
+
+ if (rtm->rtm_family != AF_MPLS)
+ goto errout;
+ if (rtm->rtm_dst_len != 20)
+ goto errout;
+ if (rtm->rtm_src_len != 0)
+ goto errout;
+ if (rtm->rtm_tos != 0)
+ goto errout;
+ if (rtm->rtm_table != RT_TABLE_MAIN)
+ goto errout;
+ /* Any value is acceptable for rtm_protocol */
+
+ /* As mpls uses destination specific addresses
+ * (or source specific address in the case of multicast)
+ * all addresses have universal scope.
+ */
+ if (rtm->rtm_scope != RT_SCOPE_UNIVERSE)
+ goto errout;
+ if (rtm->rtm_type != RTN_UNICAST)
+ goto errout;
+ if (rtm->rtm_flags != 0)
+ goto errout;
+
+ cfg->rc_label = LABEL_NOT_SPECIFIED;
+ cfg->rc_protocol = rtm->rtm_protocol;
+ cfg->rc_nlflags = nlh->nlmsg_flags;
+ cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid;
+ cfg->rc_nlinfo.nlh = nlh;
+ cfg->rc_nlinfo.nl_net = sock_net(skb->sk);
+
+ for (index = 0; index <= RTA_MAX; index++) {
+ struct nlattr *nla = tb[index];
+ if (!nla)
+ continue;
+
+ switch(index) {
+ case RTA_OIF:
+ cfg->rc_ifindex = nla_get_u32(nla);
+ break;
+ case RTA_NEWDST:
+ if (nla_get_labels(nla, MAX_NEW_LABELS,
+ &cfg->rc_output_labels,
+ cfg->rc_output_label))
+ goto errout;
+ break;
+ case RTA_DST:
+ {
+ u32 label_count;
+ if (nla_get_labels(nla, 1, &label_count,
+ &cfg->rc_label))
+ goto errout;
+
+ /* The first 16 labels are reserved, and may not be set */
+ if (cfg->rc_label < 16)
+ goto errout;
+
+ break;
+ }
+ case RTA_VIA:
+ {
+ struct rtvia *via = nla_data(nla);
+ if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+ goto errout;
+ cfg->rc_via_alen = nla_len(nla) -
+ offsetof(struct rtvia, rtvia_addr);
+ if (cfg->rc_via_alen > MAX_VIA_ALEN)
+ goto errout;
+
+ /* Validate the address family */
+ switch(via->rtvia_family) {
+ case AF_PACKET:
+ cfg->rc_via_table = NEIGH_LINK_TABLE;
+ break;
+ case AF_INET:
+ cfg->rc_via_table = NEIGH_ARP_TABLE;
+ if (cfg->rc_via_alen != 4)
+ goto errout;
+ break;
+ case AF_INET6:
+ cfg->rc_via_table = NEIGH_ND_TABLE;
+ if (cfg->rc_via_alen != 16)
+ goto errout;
+ break;
+ default:
+ /* Unsupported address family */
+ goto errout;
+ }
+
+ memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
+ break;
+ }
+ default:
+ /* Unsupported attribute */
+ goto errout;
+ }
+ }
+
+ err = 0;
+errout:
+ return err;
+}
+
+static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct mpls_route_config cfg;
+ int err;
+
+ err = rtm_to_route_config(skb, nlh, &cfg);
+ if (err < 0)
+ return err;
+
+ return mpls_route_del(&cfg);
+}
+
+
+static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct mpls_route_config cfg;
+ int err;
+
+ err = rtm_to_route_config(skb, nlh, &cfg);
+ if (err < 0)
+ return err;
+
+ return mpls_route_add(&cfg);
+}
+
+static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
+ u32 label, struct mpls_route *rt, int flags)
+{
+ struct net_device *dev;
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_MPLS;
+ rtm->rtm_dst_len = 20;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = RT_TABLE_MAIN;
+ rtm->rtm_protocol = rt->rt_protocol;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ rtm->rtm_type = RTN_UNICAST;
+ rtm->rtm_flags = 0;
+
+ if (rt->rt_labels &&
+ nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
+ goto nla_put_failure;
+ if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
+ goto nla_put_failure;
+ dev = rtnl_dereference(rt->rt_dev);
+ if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ if (nla_put_labels(skb, RTA_DST, 1, &label))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mpls_route __rcu **platform_label;
+ size_t platform_labels;
+ unsigned int index;
+
+ ASSERT_RTNL();
+
+ index = cb->args[0];
+ if (index < 16)
+ index = 16;
+
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ platform_labels = net->mpls.platform_labels;
+ for (; index < platform_labels; index++) {
+ struct mpls_route *rt;
+ rt = rtnl_dereference(platform_label[index]);
+ if (!rt)
+ continue;
+
+ if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ index, rt, NLM_F_MULTI) < 0)
+ break;
+ }
+ cb->args[0] = index;
+
+ return skb->len;
+}
+
+static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
+{
+ size_t payload =
+ NLMSG_ALIGN(sizeof(struct rtmsg))
+ + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */
+ + nla_total_size(4); /* RTA_DST */
+ if (rt->rt_labels) /* RTA_NEWDST */
+ payload += nla_total_size(rt->rt_labels * 4);
+ if (rt->rt_dev) /* RTA_OIF */
+ payload += nla_total_size(4);
+ return payload;
+}
+
+static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
+ struct nlmsghdr *nlh, struct net *net, u32 portid,
+ unsigned int nlm_flags)
+{
+ struct sk_buff *skb;
+ u32 seq = nlh ? nlh->nlmsg_seq : 0;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(lfib_nlmsg_size(rt), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+
+ err = mpls_dump_route(skb, portid, seq, event, label, rt, nlm_flags);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in lfib_nlmsg_size */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, portid, RTNLGRP_MPLS_ROUTE, nlh, GFP_KERNEL);
+
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err);
+}
+
+static int resize_platform_label_table(struct net *net, size_t limit)
+{
+ size_t size = sizeof(struct mpls_route *) * limit;
+ size_t old_limit;
+ size_t cp_size;
+ struct mpls_route __rcu **labels = NULL, **old;
+ struct mpls_route *rt0 = NULL, *rt2 = NULL;
+ unsigned index;
+
+ if (size) {
+ labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ if (!labels)
+ labels = vzalloc(size);
+
+ if (!labels)
+ goto nolabels;
+ }
+
+ /* In case the predefined labels need to be populated */
+ if (limit > MPLS_LABEL_IPV4NULL) {
+ struct net_device *lo = net->loopback_dev;
+ rt0 = mpls_rt_alloc(lo->addr_len);
+ if (!rt0)
+ goto nort0;
+ RCU_INIT_POINTER(rt0->rt_dev, lo);
+ rt0->rt_protocol = RTPROT_KERNEL;
+ rt0->rt_via_table = NEIGH_LINK_TABLE;
+ memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
+ }
+ if (limit > MPLS_LABEL_IPV6NULL) {
+ struct net_device *lo = net->loopback_dev;
+ rt2 = mpls_rt_alloc(lo->addr_len);
+ if (!rt2)
+ goto nort2;
+ RCU_INIT_POINTER(rt2->rt_dev, lo);
+ rt2->rt_protocol = RTPROT_KERNEL;
+ rt2->rt_via_table = NEIGH_LINK_TABLE;
+ memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
+ }
+
+ rtnl_lock();
+ /* Remember the original table */
+ old = rtnl_dereference(net->mpls.platform_label);
+ old_limit = net->mpls.platform_labels;
+
+ /* Free any labels beyond the new table */
+ for (index = limit; index < old_limit; index++)
+ mpls_route_update(net, index, NULL, NULL, NULL);
+
+ /* Copy over the old labels */
+ cp_size = size;
+ if (old_limit < limit)
+ cp_size = old_limit * sizeof(struct mpls_route *);
+
+ memcpy(labels, old, cp_size);
+
+ /* If needed set the predefined labels */
+ if ((old_limit <= MPLS_LABEL_IPV6NULL) &&
+ (limit > MPLS_LABEL_IPV6NULL)) {
+ RCU_INIT_POINTER(labels[MPLS_LABEL_IPV6NULL], rt2);
+ rt2 = NULL;
+ }
+
+ if ((old_limit <= MPLS_LABEL_IPV4NULL) &&
+ (limit > MPLS_LABEL_IPV4NULL)) {
+ RCU_INIT_POINTER(labels[MPLS_LABEL_IPV4NULL], rt0);
+ rt0 = NULL;
+ }
+
+ /* Update the global pointers */
+ net->mpls.platform_labels = limit;
+ rcu_assign_pointer(net->mpls.platform_label, labels);
+
+ rtnl_unlock();
+
+ mpls_rt_free(rt2);
+ mpls_rt_free(rt0);
+
+ if (old) {
+ synchronize_rcu();
+ kvfree(old);
+ }
+ return 0;
+
+nort2:
+ mpls_rt_free(rt0);
+nort0:
+ kvfree(labels);
+nolabels:
+ return -ENOMEM;
+}
+
+static int mpls_platform_labels(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = table->data;
+ int platform_labels = net->mpls.platform_labels;
+ int ret;
+ struct ctl_table tmp = {
+ .procname = table->procname,
+ .data = &platform_labels,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ .extra1 = &zero,
+ .extra2 = &label_limit,
+ };
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0)
+ ret = resize_platform_label_table(net, platform_labels);
+
+ return ret;
+}
+
+static const struct ctl_table mpls_table[] = {
+ {
+ .procname = "platform_labels",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = mpls_platform_labels,
+ },
+ { }
+};
+
+static int mpls_net_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ net->mpls.platform_labels = 0;
+ net->mpls.platform_label = NULL;
+
+ table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL);
+ if (table == NULL)
+ return -ENOMEM;
+
+ table[0].data = net;
+ net->mpls.ctl = register_net_sysctl(net, "net/mpls", table);
+ if (net->mpls.ctl == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void mpls_net_exit(struct net *net)
+{
+ struct mpls_route __rcu **platform_label;
+ size_t platform_labels;
+ struct ctl_table *table;
+ unsigned int index;
+
+ table = net->mpls.ctl->ctl_table_arg;
+ unregister_net_sysctl_table(net->mpls.ctl);
+ kfree(table);
+
+ /* An rcu grace period has passed since there was a device in
+ * the network namespace (and thus the last in flight packet)
+ * left this network namespace. This is because
+ * unregister_netdevice_many and netdev_run_todo has completed
+ * for each network device that was in this network namespace.
+ *
+ * As such no additional rcu synchronization is necessary when
+ * freeing the platform_label table.
+ */
+ rtnl_lock();
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ platform_labels = net->mpls.platform_labels;
+ for (index = 0; index < platform_labels; index++) {
+ struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+ RCU_INIT_POINTER(platform_label[index], NULL);
+ mpls_rt_free(rt);
+ }
+ rtnl_unlock();
+
+ kvfree(platform_label);
+}
+
+static struct pernet_operations mpls_net_ops = {
+ .init = mpls_net_init,
+ .exit = mpls_net_exit,
+};
+
+static int __init mpls_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4);
+
+ err = register_pernet_subsys(&mpls_net_ops);
+ if (err)
+ goto out;
+
+ err = register_netdevice_notifier(&mpls_dev_notifier);
+ if (err)
+ goto out_unregister_pernet;
+
+ dev_add_pack(&mpls_packet_type);
+
+ rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL);
+ err = 0;
+out:
+ return err;
+
+out_unregister_pernet:
+ unregister_pernet_subsys(&mpls_net_ops);
+ goto out;
+}
+module_init(mpls_init);
+
+static void __exit mpls_exit(void)
+{
+ rtnl_unregister_all(PF_MPLS);
+ dev_remove_pack(&mpls_packet_type);
+ unregister_netdevice_notifier(&mpls_dev_notifier);
+ unregister_pernet_subsys(&mpls_net_ops);
+}
+module_exit(mpls_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_NETPROTO(PF_MPLS);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
new file mode 100644
index 000000000..8cabeb5a1
--- /dev/null
+++ b/net/mpls/internal.h
@@ -0,0 +1,56 @@
+#ifndef MPLS_INTERNAL_H
+#define MPLS_INTERNAL_H
+
+struct mpls_shim_hdr {
+ __be32 label_stack_entry;
+};
+
+struct mpls_entry_decoded {
+ u32 label;
+ u8 ttl;
+ u8 tc;
+ u8 bos;
+};
+
+struct mpls_dev {
+ int input_enabled;
+
+ struct ctl_table_header *sysctl;
+ struct rcu_head rcu;
+};
+
+struct sk_buff;
+
+static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
+{
+ return (struct mpls_shim_hdr *)skb_network_header(skb);
+}
+
+static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
+{
+ struct mpls_shim_hdr result;
+ result.label_stack_entry =
+ cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) |
+ (tc << MPLS_LS_TC_SHIFT) |
+ (bos ? (1 << MPLS_LS_S_SHIFT) : 0) |
+ (ttl << MPLS_LS_TTL_SHIFT));
+ return result;
+}
+
+static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr)
+{
+ struct mpls_entry_decoded result;
+ unsigned entry = be32_to_cpu(hdr->label_stack_entry);
+
+ result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+ result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+ result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
+
+ return result;
+}
+
+int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]);
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]);
+
+#endif /* MPLS_INTERNAL_H */
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
new file mode 100644
index 000000000..809df534a
--- /dev/null
+++ b/net/mpls/mpls_gso.c
@@ -0,0 +1,98 @@
+/*
+ * MPLS GSO Support
+ *
+ * Authors: Simon Horman (horms@verge.net.au)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Based on: GSO portions of net/ipv4/gre.c
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/netdev_features.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ netdev_features_t mpls_features;
+ __be16 mpls_protocol;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN)))
+ goto out;
+
+ /* Setup inner SKB. */
+ mpls_protocol = skb->protocol;
+ skb->protocol = skb->inner_protocol;
+
+ /* Push back the mac header that skb_mac_gso_segment() has pulled.
+ * It will be re-pulled by the call to skb_mac_gso_segment() below
+ */
+ __skb_push(skb, skb->mac_len);
+
+ /* Segment inner packet. */
+ mpls_features = skb->dev->mpls_features & features;
+ segs = skb_mac_gso_segment(skb, mpls_features);
+
+
+ /* Restore outer protocol. */
+ skb->protocol = mpls_protocol;
+
+ /* Re-pull the mac header that the call to skb_mac_gso_segment()
+ * above pulled. It will be re-pushed after returning
+ * skb_mac_gso_segment(), an indirect caller of this function.
+ */
+ __skb_pull(skb, skb->data - skb_mac_header(skb));
+out:
+ return segs;
+}
+
+static struct packet_offload mpls_mc_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MPLS_MC),
+ .callbacks = {
+ .gso_segment = mpls_gso_segment,
+ },
+};
+
+static struct packet_offload mpls_uc_offload __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MPLS_UC),
+ .callbacks = {
+ .gso_segment = mpls_gso_segment,
+ },
+};
+
+static int __init mpls_gso_init(void)
+{
+ pr_info("MPLS GSO support\n");
+
+ dev_add_offload(&mpls_uc_offload);
+ dev_add_offload(&mpls_mc_offload);
+
+ return 0;
+}
+
+static void __exit mpls_gso_exit(void)
+{
+ dev_remove_offload(&mpls_uc_offload);
+ dev_remove_offload(&mpls_mc_offload);
+}
+
+module_init(mpls_gso_init);
+module_exit(mpls_gso_exit);
+
+MODULE_DESCRIPTION("MPLS GSO support");
+MODULE_AUTHOR("Simon Horman (horms@verge.net.au)");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 000000000..a0f3e6a3c
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,1447 @@
+menu "Core Netfilter Configuration"
+ depends on NET && INET && NETFILTER
+
+config NETFILTER_NETLINK
+ tristate
+
+config NETFILTER_NETLINK_ACCT
+tristate "Netfilter NFACCT over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for extended accounting via NFNETLINK.
+
+config NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter NFQUEUE over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for queueing packets via NFNETLINK.
+
+config NETFILTER_NETLINK_LOG
+ tristate "Netfilter LOG over NFNETLINK interface"
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for logging packets via NFNETLINK.
+
+ This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms,
+ and is also scheduled to replace the old syslog-based ipt_LOG
+ and ip6t_LOG modules.
+
+config NF_CONNTRACK
+ tristate "Netfilter connection tracking support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is required to do Masquerading or other kinds of Network
+ Address Translation. It can also be used to enhance packet
+ filtering (see `Connection state match support' below).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_LOG_COMMON
+ tristate
+
+if NF_CONNTRACK
+
+config NF_CONNTRACK_MARK
+ bool 'Connection mark tracking support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection marks, used by the
+ `CONNMARK' target and `connmark' match. Similar to the mark value
+ of packets, but this mark value is kept in the conntrack session
+ instead of the individual packets.
+
+config NF_CONNTRACK_SECMARK
+ bool 'Connection tracking security mark support'
+ depends on NETWORK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enables security markings to be applied to
+ connections. Typically they are copied to connections from
+ packets using the CONNSECMARK target and copied back from
+ connections to packets with the same target, with the packets
+ being originally labeled via SECMARK.
+
+ If unsure, say 'N'.
+
+config NF_CONNTRACK_ZONES
+ bool 'Connection tracking zones'
+ depends on NETFILTER_ADVANCED
+ depends on NETFILTER_XT_TARGET_CT
+ help
+ This option enables support for connection tracking zones.
+ Normally, each connection needs to have a unique system wide
+ identity. Connection tracking zones allow to have multiple
+ connections using the same identity, as long as they are
+ contained in different zones.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_PROCFS
+ bool "Supply CT list in procfs (OBSOLETE)"
+ default y
+ depends on PROC_FS
+ ---help---
+ This option enables for the list of known conntrack entries
+ to be shown in procfs under net/netfilter/nf_conntrack. This
+ is considered obsolete in favor of using the conntrack(8)
+ tool which uses Netlink.
+
+config NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on NETFILTER_ADVANCED
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified about changes in the connection tracking state.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_TIMEOUT
+ bool 'Connection tracking timeout'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ extension. This allows you to attach timeout policies to flow
+ via the CT target.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_TIMESTAMP
+ bool 'Connection tracking timestamping'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timestamping.
+ This allows you to store the flow start-time and to obtain
+ the flow-stop time (once it has been destroyed) via Connection
+ tracking events.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_LABELS
+ bool
+ help
+ This option enables support for assigning user-defined flag bits
+ to connection tracking entries. It selected by the connlabel match.
+
+config NF_CT_PROTO_DCCP
+ tristate 'DCCP protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ default IP_DCCP
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on DCCP connections.
+
+ If unsure, say 'N'.
+
+config NF_CT_PROTO_GRE
+ tristate
+
+config NF_CT_PROTO_SCTP
+ tristate 'SCTP protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ default IP_SCTP
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on SCTP connections.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NF_CT_PROTO_UDPLITE
+ tristate 'UDP-Lite protocol connection tracking support'
+ depends on NETFILTER_ADVANCED
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on UDP-Lite
+ connections.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_AMANDA
+ tristate "Amanda backup protocol support"
+ depends on NETFILTER_ADVANCED
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ help
+ If you are running the Amanda backup package <http://www.amanda.org/>
+ on this machine or machines that will be MASQUERADED through this
+ machine, then you may want to enable this feature. This allows the
+ connection tracking and natting code to allow the sub-channels that
+ Amanda requires for communication of the backup data, messages and
+ index.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_FTP
+ tristate "FTP protocol support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ Tracking FTP connections is problematic: special helpers are
+ required for tracking them, and doing masquerading and other forms
+ of Network Address Translation on them.
+
+ This is FTP support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_H323
+ tristate "H.323 protocol support"
+ depends on (IPV6 || IPV6=n)
+ depends on NETFILTER_ADVANCED
+ help
+ H.323 is a VoIP signalling protocol from ITU-T. As one of the most
+ important VoIP protocols, it is widely used by voice hardware and
+ software including voice gateways, IP phones, Netmeeting, OpenPhone,
+ Gnomemeeting, etc.
+
+ With this module you can support H.323 on a connection tracking/NAT
+ firewall.
+
+ This module supports RAS, Fast Start, H.245 Tunnelling, Call
+ Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+ whiteboard, file transfer, etc. For more information, please
+ visit http://nath323.sourceforge.net/.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_IRC
+ tristate "IRC protocol support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ There is a commonly-used extension to IRC called
+ Direct Client-to-Client Protocol (DCC). This enables users to send
+ files to each other, and also chat to each other without the need
+ of a server. DCC Sending is used anywhere you send files over IRC,
+ and DCC Chat is most commonly used by Eggdrop bots. If you are
+ using NAT, this extension will enable you to send files and initiate
+ chats. Note that you do NOT need this extension to get files or
+ have others initiate chats, or everything else in IRC.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_BROADCAST
+ tristate
+
+config NF_CONNTRACK_NETBIOS_NS
+ tristate "NetBIOS name service protocol support"
+ select NF_CONNTRACK_BROADCAST
+ help
+ NetBIOS name service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating NetBIOS name service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address. When properly configured, the output
+ of "ip address show" should look similar to this:
+
+ $ ip -4 address show eth0
+ 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+ inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SNMP
+ tristate "SNMP service protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_BROADCAST
+ help
+ SNMP service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating SNMP service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_PPTP
+ tristate "PPtP protocol support"
+ depends on NETFILTER_ADVANCED
+ select NF_CT_PROTO_GRE
+ help
+ This module adds support for PPTP (Point to Point Tunnelling
+ Protocol, RFC2637) connection tracking and NAT.
+
+ If you are running PPTP sessions over a stateful firewall or NAT
+ box, you may want to enable this feature.
+
+ Please note that not all PPTP modes of operation are supported yet.
+ Specifically these limitations exist:
+ - Blindly assumes that control connections are always established
+ in PNS->PAC direction. This is a violation of RFC2637.
+ - Only supports a single call within each session
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SANE
+ tristate "SANE protocol support"
+ depends on NETFILTER_ADVANCED
+ help
+ SANE is a protocol for remote access to scanners as implemented
+ by the 'saned' daemon. Like FTP, it uses separate control and
+ data connections.
+
+ With this module you can support SANE on a connection tracking
+ firewall.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SIP
+ tristate "SIP protocol support"
+ default m if NETFILTER_ADVANCED=n
+ help
+ SIP is an application-layer control protocol that can establish,
+ modify, and terminate multimedia sessions (conferences) such as
+ Internet telephony calls. With the ip_conntrack_sip and
+ the nf_nat_sip modules you can support the protocol on a connection
+ tracking/NATing firewall.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_TFTP
+ tristate "TFTP protocol support"
+ depends on NETFILTER_ADVANCED
+ help
+ TFTP connection tracking helper, this is required depending
+ on how restrictive your ruleset is.
+ If you are using a tftp client behind -j SNAT or -j MASQUERADING
+ you will need this.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CT_NETLINK
+ tristate 'Connection tracking netlink interface'
+ select NETFILTER_NETLINK
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enables support for a netlink-based userspace interface
+
+config NF_CT_NETLINK_TIMEOUT
+ tristate 'Connection tracking timeout tuning via Netlink'
+ select NETFILTER_NETLINK
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for connection tracking timeout
+ fine-grain tuning. This allows you to attach specific timeout
+ policies to flows, instead of using the global timeout policy.
+
+ If unsure, say `N'.
+
+config NF_CT_NETLINK_HELPER
+ tristate 'Connection tracking helpers in user-space via Netlink'
+ select NETFILTER_NETLINK
+ depends on NF_CT_NETLINK
+ depends on NETFILTER_NETLINK_QUEUE
+ depends on NETFILTER_NETLINK_QUEUE_CT
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables the user-space connection tracking helpers
+ infrastructure.
+
+ If unsure, say `N'.
+
+config NETFILTER_NETLINK_QUEUE_CT
+ bool "NFQUEUE integration with Connection Tracking"
+ default n
+ depends on NETFILTER_NETLINK_QUEUE
+ help
+ If this option is enabled, NFQUEUE can include Connection Tracking
+ information together with the packet is the enqueued via NFNETLINK.
+
+config NF_NAT
+ tristate
+
+config NF_NAT_NEEDED
+ bool
+ depends on NF_NAT
+ default y
+
+config NF_NAT_PROTO_DCCP
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_DCCP
+ default NF_NAT && NF_CT_PROTO_DCCP
+
+config NF_NAT_PROTO_UDPLITE
+ tristate
+ depends on NF_NAT && NF_CT_PROTO_UDPLITE
+ default NF_NAT && NF_CT_PROTO_UDPLITE
+
+config NF_NAT_PROTO_SCTP
+ tristate
+ default NF_NAT && NF_CT_PROTO_SCTP
+ depends on NF_NAT && NF_CT_PROTO_SCTP
+ select LIBCRC32C
+
+config NF_NAT_AMANDA
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_AMANDA
+
+config NF_NAT_FTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_FTP
+
+config NF_NAT_IRC
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_IRC
+
+config NF_NAT_SIP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_SIP
+
+config NF_NAT_TFTP
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_TFTP
+
+config NF_NAT_REDIRECT
+ tristate "IPv4/IPv6 redirect support"
+ depends on NF_NAT
+ help
+ This is the kernel functionality to redirect packets to local
+ machine through NAT.
+
+config NETFILTER_SYNPROXY
+ tristate
+
+endif # NF_CONNTRACK
+
+config NF_TABLES
+ select NETFILTER_NETLINK
+ tristate "Netfilter nf_tables support"
+ help
+ nftables is the new packet classification framework that intends to
+ replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
+ provides a pseudo-state machine with an extensible instruction-set
+ (also known as expressions) that the userspace 'nft' utility
+ (http://www.netfilter.org/projects/nftables) uses to build the
+ rule-set. It also comes with the generic set infrastructure that
+ allows you to construct mappings between matchings and actions
+ for performance lookups.
+
+ To compile it as a module, choose M here.
+
+if NF_TABLES
+
+config NF_TABLES_INET
+ depends on IPV6
+ select NF_TABLES_IPV4
+ select NF_TABLES_IPV6
+ tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support"
+ help
+ This option enables support for a mixed IPv4/IPv6 "inet" table.
+
+config NFT_EXTHDR
+ tristate "Netfilter nf_tables IPv6 exthdr module"
+ help
+ This option adds the "exthdr" expression that you can use to match
+ IPv6 extension headers.
+
+config NFT_META
+ tristate "Netfilter nf_tables meta module"
+ help
+ This option adds the "meta" expression that you can use to match and
+ to set packet metainformation such as the packet mark.
+
+config NFT_CT
+ depends on NF_CONNTRACK
+ tristate "Netfilter nf_tables conntrack module"
+ help
+ This option adds the "meta" expression that you can use to match
+ connection tracking information such as the flow state.
+
+config NFT_RBTREE
+ tristate "Netfilter nf_tables rbtree set module"
+ help
+ This option adds the "rbtree" set type (Red Black tree) that is used
+ to build interval-based sets.
+
+config NFT_HASH
+ tristate "Netfilter nf_tables hash set module"
+ help
+ This option adds the "hash" set type that is used to build one-way
+ mappings between matchings and actions.
+
+config NFT_COUNTER
+ tristate "Netfilter nf_tables counter module"
+ help
+ This option adds the "counter" expression that you can use to
+ include packet and byte counters in a rule.
+
+config NFT_LOG
+ tristate "Netfilter nf_tables log module"
+ help
+ This option adds the "log" expression that you can use to log
+ packets matching some criteria.
+
+config NFT_LIMIT
+ tristate "Netfilter nf_tables limit module"
+ help
+ This option adds the "limit" expression that you can use to
+ ratelimit rule matchings.
+
+config NFT_MASQ
+ depends on NF_CONNTRACK
+ depends on NF_NAT
+ tristate "Netfilter nf_tables masquerade support"
+ help
+ This option adds the "masquerade" expression that you can use
+ to perform NAT in the masquerade flavour.
+
+config NFT_REDIR
+ depends on NF_CONNTRACK
+ depends on NF_NAT
+ tristate "Netfilter nf_tables redirect support"
+ help
+ This options adds the "redirect" expression that you can use
+ to perform NAT in the redirect flavour.
+
+config NFT_NAT
+ depends on NF_CONNTRACK
+ select NF_NAT
+ tristate "Netfilter nf_tables nat module"
+ help
+ This option adds the "nat" expression that you can use to perform
+ typical Network Address Translation (NAT) packet transformations.
+
+config NFT_QUEUE
+ depends on NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter nf_tables queue module"
+ help
+ This is required if you intend to use the userspace queueing
+ infrastructure (also known as NFQUEUE) from nftables.
+
+config NFT_REJECT
+ default m if NETFILTER_ADVANCED=n
+ tristate "Netfilter nf_tables reject support"
+ help
+ This option adds the "reject" expression that you can use to
+ explicitly deny and notify via TCP reset/ICMP informational errors
+ unallowed traffic.
+
+config NFT_REJECT_INET
+ depends on NF_TABLES_INET
+ default NFT_REJECT
+ tristate
+
+config NFT_COMPAT
+ depends on NETFILTER_XTABLES
+ tristate "Netfilter x_tables over nf_tables module"
+ help
+ This is required if you intend to use any of existing
+ x_tables match/target extensions over the nf_tables
+ framework.
+
+endif # NF_TABLES
+
+config NETFILTER_XTABLES
+ tristate "Netfilter Xtables support (required for ip_tables)"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This is required if you intend to use any of ip_tables,
+ ip6_tables or arp_tables.
+
+if NETFILTER_XTABLES
+
+comment "Xtables combined modules"
+
+config NETFILTER_XT_MARK
+ tristate 'nfmark target and match support'
+ default m if NETFILTER_ADVANCED=n
+ ---help---
+ This option adds the "MARK" target and "mark" match.
+
+ Netfilter mark matching allows you to match packets based on the
+ "nfmark" value in the packet.
+ The target allows you to create rules in the "mangle" table which alter
+ the netfilter mark (nfmark) field associated with the packet.
+
+ Prior to routing, the nfmark can influence the routing method (see
+ "Use netfilter MARK value as routing key") and can also be used by
+ other subsystems to change their behavior.
+
+config NETFILTER_XT_CONNMARK
+ tristate 'ctmark target and match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NF_CONNTRACK_MARK
+ ---help---
+ This option adds the "CONNMARK" target and "connmark" match.
+
+ Netfilter allows you to store a mark value per connection (a.k.a.
+ ctmark), similarly to the packet mark (nfmark). Using this
+ target and match, you can set and match on this mark.
+
+config NETFILTER_XT_SET
+ tristate 'set target and match support'
+ depends on IP_SET
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds the "SET" target and "set" match.
+
+ Using this target and match, you can add/delete and match
+ elements in the sets created by ipset(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# alphabetically ordered list of targets
+
+comment "Xtables targets"
+
+config NETFILTER_XT_TARGET_AUDIT
+ tristate "AUDIT target support"
+ depends on AUDIT
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a 'AUDIT' target, which can be used to create
+ audit records for packets dropped/accepted.
+
+ To compileit as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CHECKSUM
+ tristate "CHECKSUM target support"
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+ table.
+
+ You can use this target to compute and fill in the checksum in
+ a packet that lacks a checksum. This is particularly useful,
+ if you need to work around old applications such as dhcp clients,
+ that do not work well with checksum offloads, but don't want to disable
+ checksum offload in your device.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CLASSIFY
+ tristate '"CLASSIFY" target support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `CLASSIFY' target, which enables the user to set
+ the priority of a packet. Some qdiscs can use this value for
+ classification, among these are:
+
+ atm, cbq, dsmark, pfifo_fast, htb, prio
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CONNMARK
+ tristate '"CONNMARK" target support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_CONNMARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_TARGET_CONNSECMARK
+ tristate '"CONNSECMARK" target support'
+ depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ The CONNSECMARK target copies security markings from packets
+ to connections, and restores security markings from connections
+ to packets (if the packets are not already marked). This would
+ normally be used in conjunction with the SECMARK target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CT
+ tristate '"CT" target support'
+ depends on NF_CONNTRACK
+ depends on IP_NF_RAW || IP6_NF_RAW
+ depends on NETFILTER_ADVANCED
+ help
+ This options adds a `CT' target, which allows to specify initial
+ connection tracking parameters like events to be delivered and
+ the helper to be used.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_DSCP
+ tristate '"DSCP" and "TOS" target support'
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `DSCP' target, which allows you to manipulate
+ the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+ The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+ It also adds the "TOS" target, which allows you to create rules in
+ the "mangle" table which alter the Type Of Service field of an IPv4
+ or the Priority field of an IPv6 packet, prior to routing.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_HL
+ tristate '"HL" hoplimit target support'
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
+ targets, which enable the user to change the
+ hoplimit/time-to-live value of the IP header.
+
+ While it is safe to decrement the hoplimit/TTL value, the
+ modules also allow to increment and set the hoplimit value of
+ the header to arbitrary values. This is EXTREMELY DANGEROUS
+ since you can easily create immortal packets that loop
+ forever on the network.
+
+config NETFILTER_XT_TARGET_HMARK
+ tristate '"HMARK" target support'
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds the "HMARK" target.
+
+ The target allows you to create rules in the "raw" and "mangle" tables
+ which set the skbuff mark by means of hash calculation within a given
+ range. The nfmark can influence the routing method (see "Use netfilter
+ MARK value as routing key") and can also be used by other subsystems to
+ change their behaviour.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_IDLETIMER
+ tristate "IDLETIMER target support"
+ depends on NETFILTER_ADVANCED
+ help
+
+ This option adds the `IDLETIMER' target. Each matching packet
+ resets the timer associated with label specified when the rule is
+ added. When the timer expires, it triggers a sysfs notification.
+ The remaining time for expiration can be read via sysfs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_LED
+ tristate '"LED" target support'
+ depends on LEDS_CLASS && LEDS_TRIGGERS
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `LED' target, which allows you to blink LEDs in
+ response to particular packets passing through your machine.
+
+ This can be used to turn a spare LED into a network activity LED,
+ which only flashes in response to FTP transfers, for example. Or
+ you could have an LED which lights up for a minute or two every time
+ somebody connects to your machine via SSH.
+
+ You will need support for the "led" class to make this work.
+
+ To create an LED trigger for incoming SSH traffic:
+ iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000
+
+ Then attach the new trigger to an LED on your system:
+ echo netfilter-ssh > /sys/class/leds/<ledname>/trigger
+
+ For more information on the LEDs available on your system, see
+ Documentation/leds/leds-class.txt
+
+config NETFILTER_XT_TARGET_LOG
+ tristate "LOG target support"
+ select NF_LOG_COMMON
+ select NF_LOG_IPV4
+ select NF_LOG_IPV6 if IPV6
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option adds a `LOG' target, which allows you to create rules in
+ any iptables table which records the packet header to the syslog.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_MARK
+ tristate '"MARK" target support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_NAT
+ tristate '"SNAT and DNAT" targets support'
+ depends on NF_NAT
+ ---help---
+ This option enables the SNAT and DNAT targets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_NETMAP
+ tristate '"NETMAP" target support'
+ depends on NF_NAT
+ ---help---
+ NETMAP is an implementation of static 1:1 NAT mapping of network
+ addresses. It maps the network address part, while keeping the host
+ address part intact.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_NFLOG
+ tristate '"NFLOG" target support'
+ default m if NETFILTER_ADVANCED=n
+ select NETFILTER_NETLINK_LOG
+ help
+ This option enables the NFLOG target, which allows to LOG
+ messages through nfnetlink_log.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_NFQUEUE
+ tristate '"NFQUEUE" target Support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_QUEUE
+ help
+ This target replaced the old obsolete QUEUE target.
+
+ As opposed to QUEUE, it supports 65535 different queues,
+ not just one.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_NOTRACK
+ tristate '"NOTRACK" target support (DEPRECATED)'
+ depends on NF_CONNTRACK
+ depends on IP_NF_RAW || IP6_NF_RAW
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_CT
+
+config NETFILTER_XT_TARGET_RATEEST
+ tristate '"RATEEST" target support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `RATEEST' target, which allows to measure
+ rates similar to TC estimators. The `rateest' match can be
+ used to match on the measured rates.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_REDIRECT
+ tristate "REDIRECT target support"
+ depends on NF_NAT
+ select NF_NAT_REDIRECT
+ ---help---
+ REDIRECT is a special case of NAT: all incoming connections are
+ mapped onto the incoming interface's address, causing the packets to
+ come to the local machine instead of passing through. This is
+ useful for transparent proxies.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TEE
+ tristate '"TEE" - packet cloning to alternate destination'
+ depends on NETFILTER_ADVANCED
+ depends on (IPV6 || IPV6=n)
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ ---help---
+ This option adds a "TEE" target with which a packet can be cloned and
+ this clone be rerouted to another nexthop.
+
+config NETFILTER_XT_TARGET_TPROXY
+ tristate '"TPROXY" target transparent proxying support'
+ depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ depends on (IPV6 || IPV6=n)
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on IP_NF_MANGLE
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ help
+ This option adds a `TPROXY' target, which is somewhat similar to
+ REDIRECT. It can only be used in the mangle table and is useful
+ to redirect traffic to a transparent proxy. It does _not_ depend
+ on Netfilter connection tracking and NAT, unlike REDIRECT.
+ For it to work you will have to configure certain iptables rules
+ and use policy routing. For more information on how to set it up
+ see Documentation/networking/tproxy.txt.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TRACE
+ tristate '"TRACE" target support'
+ depends on IP_NF_RAW || IP6_NF_RAW
+ depends on NETFILTER_ADVANCED
+ help
+ The TRACE target allows you to mark packets so that the kernel
+ will log every rule which match the packets as those traverse
+ the tables, chains, rules.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_TARGET_SECMARK
+ tristate '"SECMARK" target support'
+ depends on NETWORK_SECMARK
+ default m if NETFILTER_ADVANCED=n
+ help
+ The SECMARK target allows security marking of network
+ packets, for use with security subsystems.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPMSS
+ tristate '"TCPMSS" target support'
+ depends on (IPV6 || IPV6=n)
+ default m if NETFILTER_ADVANCED=n
+ ---help---
+ This option adds a `TCPMSS' target, which allows you to alter the
+ MSS value of TCP SYN packets, to control the maximum size for that
+ connection (usually limiting it to your outgoing interface's MTU
+ minus 40).
+
+ This is used to overcome criminally braindead ISPs or servers which
+ block ICMP Fragmentation Needed packets. The symptoms of this
+ problem are that everything works fine from your Linux
+ firewall/router, but machines behind it can never exchange large
+ packets:
+ 1) Web browsers connect, then hang with no data received.
+ 2) Small mail works fine, but large emails hang.
+ 3) ssh works fine, but scp hangs after initial handshaking.
+
+ Workaround: activate this option and add a rule to your firewall
+ configuration like:
+
+ iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+ -j TCPMSS --clamp-mss-to-pmtu
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPOPTSTRIP
+ tristate '"TCPOPTSTRIP" target support'
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a "TCPOPTSTRIP" target, which allows you to strip
+ TCP options from TCP packets.
+
+# alphabetically ordered list of matches
+
+comment "Xtables matches"
+
+config NETFILTER_XT_MATCH_ADDRTYPE
+ tristate '"addrtype" address type match support'
+ default m if NETFILTER_ADVANCED=n
+ ---help---
+ This option allows you to match what routing thinks of an address,
+ eg. UNICAST, LOCAL, BROADCAST, ...
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_BPF
+ tristate '"bpf" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ BPF matching applies a linux socket filter to each packet and
+ accepts those for which the filter returns non-zero.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_CGROUP
+ tristate '"control group" match support'
+ depends on NETFILTER_ADVANCED
+ depends on CGROUPS
+ select CGROUP_NET_CLASSID
+ ---help---
+ Socket/process control group matching allows you to match locally
+ generated packets based on which net_cls control group processes
+ belong to.
+
+config NETFILTER_XT_MATCH_CLUSTER
+ tristate '"cluster" match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option allows you to build work-load-sharing clusters of
+ network servers/stateful firewalls without having a dedicated
+ load-balancing router/server/switch. Basically, this match returns
+ true when the packet must be handled by this cluster node. Thus,
+ all nodes see all packets and this match decides which node handles
+ what packets. The work-load sharing algorithm is based on source
+ address hashing.
+
+ If you say Y or M here, try `iptables -m cluster --help` for
+ more information.
+
+config NETFILTER_XT_MATCH_COMMENT
+ tristate '"comment" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `comment' dummy-match, which allows you to put
+ comments in your iptables ruleset.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNBYTES
+ tristate '"connbytes" per-connection counter match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `connbytes' match, which allows you to match the
+ number of bytes and/or packets for each direction within a connection.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNLABEL
+ tristate '"connlabel" match support'
+ select NF_CONNTRACK_LABELS
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match allows you to test and assign userspace-defined labels names
+ to a connection. The kernel only stores bit values - mapping
+ names to bits is done by userspace.
+
+ Unlike connmark, more than 32 flag bits may be assigned to a
+ connection simultaneously.
+
+config NETFILTER_XT_MATCH_CONNLIMIT
+ tristate '"connlimit" match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match allows you to match against the number of parallel
+ connections to a server per client IP address (or address block).
+
+config NETFILTER_XT_MATCH_CONNMARK
+ tristate '"connmark" connection mark match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_CONNMARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_MATCH_CONNTRACK
+ tristate '"conntrack" connection tracking match support'
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ help
+ This is a general conntrack match module, a superset of the state match.
+
+ It allows matching on additional conntrack information, which is
+ useful in complex configurations, such as NAT gateways with multiple
+ internet links or tunnels.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_CPU
+ tristate '"cpu" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ CPU matching allows you to match packets based on the CPU
+ currently handling the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_DCCP
+ tristate '"dccp" protocol match support'
+ depends on NETFILTER_ADVANCED
+ default IP_DCCP
+ help
+ With this option enabled, you will be able to use the iptables
+ `dccp' match in order to match on DCCP source/destination ports
+ and DCCP flags.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_DEVGROUP
+ tristate '"devgroup" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This options adds a `devgroup' match, which allows to match on the
+ device group a network device is assigned to.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_DSCP
+ tristate '"dscp" and "tos" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `DSCP' match, which allows you to match against
+ the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+ The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+ It will also add a "tos" match, which allows you to match packets
+ based on the Type Of Service fields of the IPv4 packet (which share
+ the same bits as DSCP).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_ECN
+ tristate '"ecn" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds an "ECN" match, which allows you to match against
+ the IPv4 and TCP header ECN fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_ESP
+ tristate '"esp" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of SPIs
+ inside ESP header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_HASHLIMIT
+ tristate '"hashlimit" match support'
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `hashlimit' match.
+
+ As opposed to `limit', this match dynamically creates a hash table
+ of limit buckets, based on your selection of source/destination
+ addresses and/or ports.
+
+ It enables you to express policies like `10kpps for any given
+ destination address' or `500pps from any given source address'
+ with a single rule.
+
+config NETFILTER_XT_MATCH_HELPER
+ tristate '"helper" match support'
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ help
+ Helper matching allows you to match packets in dynamic connections
+ tracked by a conntrack-helper, ie. ip_conntrack_ftp
+
+ To compile it as a module, choose M here. If unsure, say Y.
+
+config NETFILTER_XT_MATCH_HL
+ tristate '"hl" hoplimit/TTL match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ HL matching allows you to match packets based on the hoplimit
+ in the IPv6 header, or the time-to-live field in the IPv4
+ header of the packet.
+
+config NETFILTER_XT_MATCH_IPCOMP
+ tristate '"ipcomp" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This match extension allows you to match a range of CPIs(16 bits)
+ inside IPComp header of IPSec packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_IPRANGE
+ tristate '"iprange" address range match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a "iprange" match, which allows you to match based on
+ an IP address range. (Normal iptables only matches on single addresses
+ with an optional mask.)
+
+ If unsure, say M.
+
+config NETFILTER_XT_MATCH_IPVS
+ tristate '"ipvs" match support'
+ depends on IP_VS
+ depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK
+ help
+ This option allows you to match against IPVS properties of a packet.
+
+ If unsure, say N.
+
+config NETFILTER_XT_MATCH_L2TP
+ tristate '"l2tp" match support'
+ depends on NETFILTER_ADVANCED
+ default L2TP
+ ---help---
+ This option adds an "L2TP" match, which allows you to match against
+ L2TP protocol header fields.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_LENGTH
+ tristate '"length" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option allows you to match the length of a packet against a
+ specific value or range of values.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_LIMIT
+ tristate '"limit" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ limit matching allows you to control the rate at which a rule can be
+ matched: mainly useful in combination with the LOG target ("LOG
+ target support", below) and to avoid some Denial of Service attacks.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_MAC
+ tristate '"mac" address match support'
+ depends on NETFILTER_ADVANCED
+ help
+ MAC matching allows you to match packets based on the source
+ Ethernet address of the packet.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_MARK
+ tristate '"mark" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_MARK
+ ---help---
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_MATCH_MULTIPORT
+ tristate '"multiport" Multiple port match support'
+ depends on NETFILTER_ADVANCED
+ help
+ Multiport matching allows you to match TCP or UDP packets based on
+ a series of source or destination ports: normally a rule can only
+ match a single range of ports.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_NFACCT
+ tristate '"nfacct" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_ACCT
+ help
+ This option allows you to use the extended accounting through
+ nfnetlink_acct.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_OSF
+ tristate '"osf" Passive OS fingerprint match'
+ depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+ help
+ This option selects the Passive OS Fingerprinting match module
+ that allows to passively match the remote operating system by
+ analyzing incoming TCP SYN packets.
+
+ Rules and loading software can be downloaded from
+ http://www.ioremap.net/projects/osf
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_OWNER
+ tristate '"owner" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ Socket owner matching allows you to match locally-generated packets
+ based on who created the socket: the user or group. It is also
+ possible to check whether a socket actually exists.
+
+config NETFILTER_XT_MATCH_POLICY
+ tristate 'IPsec "policy" match support'
+ depends on XFRM
+ default m if NETFILTER_ADVANCED=n
+ help
+ Policy matching allows you to match packets based on the
+ IPsec policy that was used during decapsulation/will
+ be used during encapsulation.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_PHYSDEV
+ tristate '"physdev" match support'
+ depends on BRIDGE && BRIDGE_NETFILTER
+ depends on NETFILTER_ADVANCED
+ help
+ Physdev packet matching matches against the physical bridge ports
+ the IP packet arrived on or will leave by.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_PKTTYPE
+ tristate '"pkttype" packet type match support'
+ depends on NETFILTER_ADVANCED
+ help
+ Packet type matching allows you to match a packet by
+ its "class", eg. BROADCAST, MULTICAST, ...
+
+ Typical usage:
+ iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_QUOTA
+ tristate '"quota" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `quota' match, which allows to match on a
+ byte counter.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RATEEST
+ tristate '"rateest" match support'
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_XT_TARGET_RATEEST
+ help
+ This option adds a `rateest' match, which allows to match on the
+ rate estimated by the RATEEST target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_REALM
+ tristate '"realm" match support'
+ depends on NETFILTER_ADVANCED
+ select IP_ROUTE_CLASSID
+ help
+ This option adds a `realm' match, which allows you to use the realm
+ key from the routing subsystem inside iptables.
+
+ This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
+ in tc world.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RECENT
+ tristate '"recent" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This match is used for creating one or many lists of recently
+ used addresses and then matching against that/those list(s).
+
+ Short options are available by using 'iptables -m recent -h'
+ Official Website: <http://snowman.net/projects/ipt_recent/>
+
+config NETFILTER_XT_MATCH_SCTP
+ tristate '"sctp" protocol match support'
+ depends on NETFILTER_ADVANCED
+ default IP_SCTP
+ help
+ With this option enabled, you will be able to use the
+ `sctp' match in order to match on SCTP source/destination ports
+ and SCTP chunk types.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_SOCKET
+ tristate '"socket" match support'
+ depends on NETFILTER_XTABLES
+ depends on NETFILTER_ADVANCED
+ depends on !NF_CONNTRACK || NF_CONNTRACK
+ depends on (IPV6 || IPV6=n)
+ depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ help
+ This option adds a `socket' match, which can be used to match
+ packets for which a TCP or UDP socket lookup finds a valid socket.
+ It can be used in combination with the MARK target and policy
+ routing to implement full featured non-locally bound sockets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_STATE
+ tristate '"state" match support'
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ help
+ Connection state matching allows you to match packets based on their
+ relationship to a tracked connection (ie. previous packets). This
+ is a powerful tool for packet classification.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_STATISTIC
+ tristate '"statistic" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `statistic' match, which allows you to match
+ on packets periodically or randomly with a given percentage.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_STRING
+ tristate '"string" match support'
+ depends on NETFILTER_ADVANCED
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ help
+ This option adds a `string' match, which allows you to look for
+ pattern matchings in packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_TCPMSS
+ tristate '"tcpmss" match support'
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `tcpmss' match, which allows you to examine the
+ MSS value of TCP SYN packets, which control the maximum packet size
+ for that connection.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_MATCH_TIME
+ tristate '"time" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds a "time" match, which allows you to match based on
+ the packet arrival time (at the machine which netfilter is running)
+ on) or departure time/date (for locally generated packets).
+
+ If you say Y here, try `iptables -m time --help` for
+ more information.
+
+ If you want to compile it as a module, say M here.
+ If unsure, say N.
+
+config NETFILTER_XT_MATCH_U32
+ tristate '"u32" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ u32 allows you to extract quantities of up to 4 bytes from a packet,
+ AND them with specified masks, shift them by specified amounts and
+ test whether the results are in any of a set of specified ranges.
+ The specification of what to extract is general enough to skip over
+ headers with lengths stored in the packet, as in IP or TCP header
+ lengths.
+
+ Details and examples are in the kernel module source.
+
+endif # NETFILTER_XTABLES
+
+endmenu
+
+source "net/netfilter/ipset/Kconfig"
+
+source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 000000000..a87d8b8ec
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,177 @@
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+
+nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+
+obj-$(CONFIG_NETFILTER) = netfilter.o
+
+obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
+nfnetlink_queue-y := nfnetlink_queue_core.o
+nfnetlink_queue-$(CONFIG_NETFILTER_NETLINK_QUEUE_CT) += nfnetlink_queue_ct.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
+
+# netlink interface for nf_conntrack
+obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
+obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o
+obj-$(CONFIG_NF_CT_NETLINK_HELPER) += nfnetlink_cthelper.o
+
+# connection tracking helpers
+nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o
+
+obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
+obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
+obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
+obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
+obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
+obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
+obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
+obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
+
+nf_nat-y := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
+ nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
+
+# generic transport layer logging
+obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
+
+obj-$(CONFIG_NF_NAT) += nf_nat.o
+obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
+# NAT helpers
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
+obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
+obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
+obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
+obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
+
+# SYNPROXY
+obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+
+# nf_tables
+nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
+nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
+
+obj-$(CONFIG_NF_TABLES) += nf_tables.o
+obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o
+obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
+obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o
+obj-$(CONFIG_NFT_META) += nft_meta.o
+obj-$(CONFIG_NFT_CT) += nft_ct.o
+obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
+obj-$(CONFIG_NFT_NAT) += nft_nat.o
+obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
+obj-$(CONFIG_NFT_REJECT) += nft_reject.o
+obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o
+obj-$(CONFIG_NFT_HASH) += nft_hash.o
+obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
+obj-$(CONFIG_NFT_LOG) += nft_log.o
+obj-$(CONFIG_NFT_MASQ) += nft_masq.o
+obj-$(CONFIG_NFT_REDIR) += nft_redir.o
+
+# generic X tables
+obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
+
+# combos
+obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
+obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o
+
+# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
+
+# matches
+obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_BPF) += xt_bpf.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLABEL) += xt_connlabel.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_L2TP) += xt_l2tp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
+# IPVS
+obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 000000000..e6163017c
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,317 @@
+/* netfilter.c: look after the filters for various protocols.
+ * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
+ *
+ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
+ * way.
+ *
+ * Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
+ */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+static DEFINE_MUTEX(afinfo_mutex);
+
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+EXPORT_SYMBOL(nf_afinfo);
+const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ipv6_ops);
+
+int nf_register_afinfo(const struct nf_afinfo *afinfo)
+{
+ mutex_lock(&afinfo_mutex);
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
+ mutex_unlock(&afinfo_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_afinfo);
+
+void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
+{
+ mutex_lock(&afinfo_mutex);
+ RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL);
+ mutex_unlock(&afinfo_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
+
+struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
+EXPORT_SYMBOL(nf_hooks);
+
+#ifdef HAVE_JUMP_LABEL
+struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks_needed);
+#endif
+
+static DEFINE_MUTEX(nf_hook_mutex);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ struct nf_hook_ops *elem;
+
+ mutex_lock(&nf_hook_mutex);
+ list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+ if (reg->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&reg->list, elem->list.prev);
+ mutex_unlock(&nf_hook_mutex);
+#ifdef HAVE_JUMP_LABEL
+ static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ mutex_lock(&nf_hook_mutex);
+ list_del_rcu(&reg->list);
+ mutex_unlock(&nf_hook_mutex);
+#ifdef HAVE_JUMP_LABEL
+ static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
+#endif
+ synchronize_net();
+}
+EXPORT_SYMBOL(nf_unregister_hook);
+
+int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = nf_register_hook(&reg[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ nf_unregister_hooks(reg, i);
+ return err;
+}
+EXPORT_SYMBOL(nf_register_hooks);
+
+void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+ while (n-- > 0)
+ nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_hooks);
+
+unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff *skb,
+ struct nf_hook_state *state,
+ struct nf_hook_ops **elemp)
+{
+ unsigned int verdict;
+
+ /*
+ * The caller must not block between calls to this
+ * function because of risk of continuing from deleted element.
+ */
+ list_for_each_entry_continue_rcu((*elemp), head, list) {
+ if (state->thresh > (*elemp)->priority)
+ continue;
+
+ /* Optimization: we don't need to hold module
+ reference here, since function can't sleep. --RR */
+repeat:
+ verdict = (*elemp)->hook(*elemp, skb, state);
+ if (verdict != NF_ACCEPT) {
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (unlikely((verdict & NF_VERDICT_MASK)
+ > NF_MAX_VERDICT)) {
+ NFDEBUG("Evil return from %p(%u).\n",
+ (*elemp)->hook, state->hook);
+ continue;
+ }
+#endif
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ }
+ return NF_ACCEPT;
+}
+
+
+/* Returns 1 if okfn() needs to be executed by the caller,
+ * -EPERM for NF_DROP, 0 otherwise. */
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
+{
+ struct nf_hook_ops *elem;
+ unsigned int verdict;
+ int ret = 0;
+
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+
+ elem = list_entry_rcu(&nf_hooks[state->pf][state->hook],
+ struct nf_hook_ops, list);
+next_hook:
+ verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state,
+ &elem);
+ if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+ ret = 1;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
+ kfree_skb(skb);
+ ret = NF_DROP_GETERR(verdict);
+ if (ret == 0)
+ ret = -EPERM;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+ int err = nf_queue(skb, elem, state,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_hook_slow);
+
+
+int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
+{
+ if (writable_len > skb->len)
+ return 0;
+
+ /* Not exclusive use of packet? Must copy. */
+ if (!skb_cloned(skb)) {
+ if (writable_len <= skb_headlen(skb))
+ return 1;
+ } else if (skb_clone_writable(skb, writable_len))
+ return 1;
+
+ if (writable_len <= skb_headlen(skb))
+ writable_len = 0;
+ else
+ writable_len -= skb_headlen(skb);
+
+ return !!__pskb_pull_tail(skb, writable_len);
+}
+EXPORT_SYMBOL(skb_make_writable);
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+/* This does not belong here, but locally generated errors need it if connection
+ tracking in use: without this, connection may not be in hash table, and hence
+ manufactured ICMP or RST packets will not be associated with it. */
+void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
+ __rcu __read_mostly;
+EXPORT_SYMBOL(ip_ct_attach);
+
+void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
+{
+ void (*attach)(struct sk_buff *, const struct sk_buff *);
+
+ if (skb->nfct) {
+ rcu_read_lock();
+ attach = rcu_dereference(ip_ct_attach);
+ if (attach)
+ attach(new, skb);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(nf_ct_attach);
+
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
+EXPORT_SYMBOL(nf_ct_destroy);
+
+void nf_conntrack_destroy(struct nf_conntrack *nfct)
+{
+ void (*destroy)(struct nf_conntrack *);
+
+ rcu_read_lock();
+ destroy = rcu_dereference(nf_ct_destroy);
+ BUG_ON(destroy == NULL);
+ destroy(nfct);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_conntrack_destroy);
+
+struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_hook);
+
+struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfq_ct_nat_hook);
+
+#endif /* CONFIG_NF_CONNTRACK */
+
+#ifdef CONFIG_NF_NAT_NEEDED
+void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(nf_nat_decode_session_hook);
+#endif
+
+static int __net_init netfilter_net_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+ net->proc_net);
+ if (!net->nf.proc_netfilter) {
+ if (!net_eq(net, &init_net))
+ pr_err("cannot create netfilter proc entry");
+
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+static void __net_exit netfilter_net_exit(struct net *net)
+{
+ remove_proc_entry("netfilter", net->proc_net);
+}
+
+static struct pernet_operations netfilter_net_ops = {
+ .init = netfilter_net_init,
+ .exit = netfilter_net_exit,
+};
+
+int __init netfilter_init(void)
+{
+ int i, h, ret;
+
+ for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+ INIT_LIST_HEAD(&nf_hooks[i][h]);
+ }
+
+ ret = register_pernet_subsys(&netfilter_net_ops);
+ if (ret < 0)
+ goto err;
+
+ ret = netfilter_log_init();
+ if (ret < 0)
+ goto err_pernet;
+
+ return 0;
+err_pernet:
+ unregister_pernet_subsys(&netfilter_net_ops);
+err:
+ return ret;
+}
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 000000000..234a8ec82
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,168 @@
+menuconfig IP_SET
+ tristate "IP set support"
+ depends on INET && NETFILTER
+ select NETFILTER_NETLINK
+ help
+ This option adds IP set support to the kernel.
+ In order to define and use the sets, you need the userspace utility
+ ipset(8). You can use the sets in netfilter via the "set" match
+ and "SET" target.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+ int "Maximum number of IP sets"
+ default 256
+ range 2 65534
+ depends on IP_SET
+ help
+ You can define here default value of the maximum number
+ of IP sets for the kernel.
+
+ The value can be overridden by the 'max_sets' module
+ parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+ tristate "bitmap:ip set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip set type support, by which one
+ can store IPv4 addresses (or network addresse) from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+ tristate "bitmap:ip,mac set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:ip,mac set type support, by which one
+ can store IPv4 address and (source) MAC address pairs from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+ tristate "bitmap:port set support"
+ depends on IP_SET
+ help
+ This option adds the bitmap:port set type support, by which one
+ can store TCP/UDP port numbers from a range.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IP
+ tristate "hash:ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip set type support, by which one
+ can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+ in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPMARK
+ tristate "hash:ip,mark set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,mark set type support, by which one
+ can store IPv4/IPv6 address and mark pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+ tristate "hash:ip,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port set type support, by which one
+ can store IPv4/IPv6 address and protocol/port pairs.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+ tristate "hash:ip,port,ip set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,ip set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ address triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+ tristate "hash:ip,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:ip,port,net set type support, by which
+ one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+ network address/prefix triples in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_MAC
+ tristate "hash:mac set support"
+ depends on IP_SET
+ help
+ This option adds the hash:mac set type support, by which
+ one can store MAC (ethernet address) elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORTNET
+ tristate "hash:net,port,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port,net set type support, by which
+ one can store two IPv4/IPv6 subnets, and a protocol/port in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NET
+ tristate "hash:net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net set type support, by which
+ one can store IPv4/IPv6 network address/prefix elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETNET
+ tristate "hash:net,net set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,net set type support, by which
+ one can store IPv4/IPv6 network address/prefix pairs in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+ tristate "hash:net,port set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,port set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ protocol/port pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_HASH_NETIFACE
+ tristate "hash:net,iface set support"
+ depends on IP_SET
+ help
+ This option adds the hash:net,iface set type support, by which
+ one can store IPv4/IPv6 network address/prefix and
+ interface name pairs as elements in a set.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config IP_SET_LIST_SET
+ tristate "list:set set support"
+ depends on IP_SET
+ help
+ This option adds the list:set set type support. In this
+ kind of set one can store the name of other sets and it forms
+ an ordered union of the member sets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 000000000..3dbd5e958
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,29 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPMARK) += ip_set_hash_ipmark.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_MAC) += ip_set_hash_mac.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o
+obj-$(CONFIG_IP_SET_HASH_NETNET) += ip_set_hash_netnet.o
+obj-$(CONFIG_IP_SET_HASH_NETPORTNET) += ip_set_hash_netportnet.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
new file mode 100644
index 000000000..6f024a8a1
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -0,0 +1,293 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __IP_SET_BITMAP_IP_GEN_H
+#define __IP_SET_BITMAP_IP_GEN_H
+
+#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
+#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
+#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
+#define mtype_do_add IPSET_TOKEN(MTYPE, _do_add)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_do_del IPSET_TOKEN(MTYPE, _do_del)
+#define mtype_do_list IPSET_TOKEN(MTYPE, _do_list)
+#define mtype_do_head IPSET_TOKEN(MTYPE, _do_head)
+#define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem)
+#define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype MTYPE
+
+#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct mtype *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+mtype_ext_cleanup(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+ u32 id;
+
+ for (id = 0; id < map->elements; id++)
+ if (test_bit(id, map->members))
+ ip_set_ext_destroy(set, get_ext(set, map, id));
+}
+
+static void
+mtype_destroy(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+
+ ip_set_free(map->members);
+ if (set->dsize) {
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map->extensions);
+ }
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static void
+mtype_flush(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ memset(map->members, 0, map->memsize);
+}
+
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct mtype *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (mtype_do_head(skb, map) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) +
+ map->memsize +
+ set->dsize * map->elements)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_test(e, map, set->dsize);
+
+ if (ret <= 0)
+ return ret;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return 0;
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(x, set), ext, mext, flags);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_get_skbinfo(ext_skbinfo(x, set), ext, mext, flags);
+ return 1;
+}
+
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+ int ret = mtype_do_add(e, map, flags, set->dsize);
+
+ if (ret == IPSET_ADD_FAILED) {
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ ret = 0;
+ else if (!(flags & IPSET_FLAG_EXIST))
+ return -IPSET_ERR_EXIST;
+ /* Element is re-added, cleanup extensions */
+ ip_set_ext_destroy(set, x);
+ }
+
+ if (SET_WITH_TIMEOUT(set))
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret);
+#else
+ ip_set_timeout_set(ext_timeout(x, set), ext->timeout);
+#endif
+
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(x, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(x, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
+ return 0;
+}
+
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(set, map, e->id);
+
+ if (mtype_do_del(e, map))
+ return -IPSET_ERR_EXIST;
+
+ ip_set_ext_destroy(set, x);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, set)))
+ return -IPSET_ERR_EXIST;
+
+ return 0;
+}
+
+#ifndef IP_SET_BITMAP_STORED_TIMEOUT
+static inline bool
+mtype_is_filled(const struct mtype_elem *x)
+{
+ return true;
+}
+#endif
+
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct mtype *map = set->data;
+ struct nlattr *adt, *nested;
+ void *x;
+ u32 id, first = cb->args[IPSET_CB_ARG0];
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->elements;
+ cb->args[IPSET_CB_ARG0]++) {
+ id = cb->args[IPSET_CB_ARG0];
+ x = get_ext(set, map, id);
+ if (!test_bit(id, map->members) ||
+ (SET_WITH_TIMEOUT(set) &&
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_is_filled((const struct mtype_elem *) x) &&
+#endif
+ ip_set_timeout_expired(ext_timeout(x, set))))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_do_list(skb, map, id, set->dsize))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, x,
+ mtype_is_filled((const struct mtype_elem *) x)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ if (unlikely(id == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, adt);
+ return 0;
+}
+
+static void
+mtype_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct mtype *map = set->data;
+ void *x;
+ u32 id;
+
+ /* We run parallel with other readers (test element)
+ * but adding/deleting new entries is locked out */
+ read_lock_bh(&set->lock);
+ for (id = 0; id < map->elements; id++)
+ if (mtype_gc_test(id, map, set->dsize)) {
+ x = get_ext(set, map, id);
+ if (ip_set_timeout_expired(ext_timeout(x, set))) {
+ clear_bit(id, map->members);
+ ip_set_ext_destroy(set, x);
+ }
+ }
+ read_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static const struct ip_set_type_variant mtype = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .same_set = mtype_same_set,
+};
+
+#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 000000000..55b083ec5
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,384 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+/* 2 Comment support added */
+#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+#define MTYPE bitmap_ip
+
+/* Type structure */
+struct bitmap_ip {
+ void *members; /* the set members */
+ void *extensions; /* data extensions */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ u32 hosts; /* number of hosts in a subnet */
+ size_t memsize; /* members size */
+ u8 netmask; /* subnet netmask */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_ip_adt_elem {
+ u16 id;
+};
+
+static inline u32
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+ return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+}
+
+/* Common functions */
+
+static inline int
+bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e,
+ struct bitmap_ip *map, size_t dsize)
+{
+ return !!test_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)
+{
+ return !!test_bit(id, map->members);
+}
+
+static inline int
+bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
+ u32 flags, size_t dsize)
+{
+ return !!test_and_set_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
+ size_t dsize)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
+}
+
+static inline int
+bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) ||
+ (map->netmask != 32 &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask));
+}
+
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ip_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ u32 ip;
+
+ ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = ip_to_id(map, ip);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct bitmap_ip *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ u32 ip = 0, ip_to = 0;
+ struct bitmap_ip_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ if (adt == IPSET_TEST) {
+ e.id = ip_to_id(map, ip);
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to) {
+ swap(ip, ip_to);
+ if (ip < map->first_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ } else
+ ip_to = ip;
+
+ if (ip_to > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; !before(ip_to, ip); ip += map->hosts) {
+ e.id = ip_to_id(map, ip);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ip *x = a->data;
+ const struct bitmap_ip *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ x->netmask == y->netmask &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_ip_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+ u32 first_ip, u32 last_ip,
+ u32 elements, u32 hosts, u8 netmask)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ map->hosts = hosts;
+ map->netmask = netmask;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_IPV4;
+
+ return true;
+}
+
+static int
+bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ struct bitmap_ip *map;
+ u32 first_ip = 0, last_ip = 0, hosts;
+ u64 elements;
+ u8 netmask = 32;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(first_ip, last_ip, cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if (netmask > 32)
+ return -IPSET_ERR_INVALID_NETMASK;
+
+ first_ip &= ip_set_hostmask(netmask);
+ last_ip |= ~ip_set_hostmask(netmask);
+ }
+
+ if (netmask == 32) {
+ hosts = 1;
+ elements = (u64)last_ip - first_ip + 1;
+ } else {
+ u8 mask_bits;
+ u32 mask;
+
+ mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+ if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+ netmask <= mask_bits)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+ hosts = 2 << (32 - netmask - 1);
+ elements = 2 << (netmask - mask_bits - 1);
+ }
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ pr_debug("hosts %u, elements %llu\n",
+ hosts, (unsigned long long)elements);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ip;
+ set->dsize = ip_set_elem_len(set, tb, 0);
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_ip_gc_init(set, bitmap_ip_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = {
+ .name = "bitmap:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ip_init(void)
+{
+ return ip_set_type_register(&bitmap_ip_type);
+}
+
+static void __exit
+bitmap_ip_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 000000000..86104744b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,421 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+/* 2 Comment support added */
+#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+#define MTYPE bitmap_ipmac
+#define IP_SET_BITMAP_STORED_TIMEOUT
+
+enum {
+ MAC_UNSET, /* element is set, without MAC */
+ MAC_FILLED, /* element is set with MAC */
+};
+
+/* Type structure */
+struct bitmap_ipmac {
+ void *members; /* the set members */
+ void *extensions; /* MAC + data extensions */
+ u32 first_ip; /* host byte order, included in range */
+ u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size */
+ struct timer_list gc; /* garbage collector */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_ipmac_adt_elem {
+ u16 id;
+ unsigned char *ether;
+};
+
+struct bitmap_ipmac_elem {
+ unsigned char ether[ETH_ALEN];
+ unsigned char filled;
+} __attribute__ ((aligned));
+
+static inline u32
+ip_to_id(const struct bitmap_ipmac *m, u32 ip)
+{
+ return ip - m->first_ip;
+}
+
+static inline struct bitmap_ipmac_elem *
+get_elem(void *extensions, u16 id, size_t dsize)
+{
+ return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
+}
+
+/* Common functions */
+
+static inline int
+bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
+ const struct bitmap_ipmac *map, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem;
+
+ if (!test_bit(e->id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, e->id, dsize);
+ if (elem->filled == MAC_FILLED)
+ return e->ether == NULL ||
+ ether_addr_equal(e->ether, elem->ether);
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
+}
+
+static inline int
+bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem;
+
+ if (!test_bit(id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, id, dsize);
+ /* Timer not started for the incomplete elements */
+ return elem->filled == MAC_FILLED;
+}
+
+static inline int
+bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
+{
+ return elem->filled == MAC_FILLED;
+}
+
+static inline int
+bitmap_ipmac_add_timeout(unsigned long *timeout,
+ const struct bitmap_ipmac_adt_elem *e,
+ const struct ip_set_ext *ext, struct ip_set *set,
+ struct bitmap_ipmac *map, int mode)
+{
+ u32 t = ext->timeout;
+
+ if (mode == IPSET_ADD_START_STORED_TIMEOUT) {
+ if (t == set->timeout)
+ /* Timeout was not specified, get stored one */
+ t = *timeout;
+ ip_set_timeout_set(timeout, t);
+ } else {
+ /* If MAC is unset yet, we store plain timeout value
+ * because the timer is not activated yet
+ * and we can reuse it later when MAC is filled out,
+ * possibly by the kernel */
+ if (e->ether)
+ ip_set_timeout_set(timeout, t);
+ else
+ *timeout = t;
+ }
+ return 0;
+}
+
+static inline int
+bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map, u32 flags, size_t dsize)
+{
+ struct bitmap_ipmac_elem *elem;
+
+ elem = get_elem(map->extensions, e->id, dsize);
+ if (test_and_set_bit(e->id, map->members)) {
+ if (elem->filled == MAC_FILLED) {
+ if (e->ether && (flags & IPSET_FLAG_EXIST))
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ return IPSET_ADD_FAILED;
+ } else if (!e->ether)
+ /* Already added without ethernet address */
+ return IPSET_ADD_FAILED;
+ /* Fill the MAC address and trigger the timer activation */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return IPSET_ADD_START_STORED_TIMEOUT;
+ } else if (e->ether) {
+ /* We can store MAC too */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return 0;
+ } else {
+ elem->filled = MAC_UNSET;
+ /* MAC is not stored yet, don't start timer */
+ return IPSET_ADD_STORE_PLAIN_TIMEOUT;
+ }
+}
+
+static inline int
+bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
+ u32 id, size_t dsize)
+{
+ const struct bitmap_ipmac_elem *elem =
+ get_elem(map->extensions, id, dsize);
+
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id)) ||
+ (elem->filled == MAC_FILLED &&
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether));
+}
+
+static inline int
+bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+}
+
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ipmac_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ u32 ip;
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_TWO_SRC))
+ return 0;
+
+ ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ /* Backward compatibility: we don't check the second flag */
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ e.id = ip_to_id(map, ip);
+ e.ether = eth_hdr(skb)->h_source;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct bitmap_ipmac *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ipmac_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (ip < map->first_ip || ip > map->last_ip)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = ip_to_id(map, ip);
+ if (tb[IPSET_ATTR_ETHER])
+ e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+ else
+ e.ether = NULL;
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_ipmac *x = a->data;
+ const struct bitmap_ipmac *y = b->data;
+
+ return x->first_ip == y->first_ip &&
+ x->last_ip == y->last_ip &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip,mac type of sets */
+
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+ u32 first_ip, u32 last_ip, u32 elements)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_ip = first_ip;
+ map->last_ip = last_ip;
+ map->elements = elements;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_IPV4;
+
+ return true;
+}
+
+static int
+bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 first_ip = 0, last_ip = 0;
+ u64 elements;
+ struct bitmap_ipmac *map;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+ if (ret)
+ return ret;
+ if (first_ip > last_ip) {
+ u32 tmp = first_ip;
+
+ first_ip = last_ip;
+ last_ip = tmp;
+ }
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (cidr >= 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(first_ip, last_ip, cidr);
+ } else
+ return -IPSET_ERR_PROTOCOL;
+
+ elements = (u64)last_ip - first_ip + 1;
+
+ if (elements > IPSET_BITMAP_MAX_RANGE + 1)
+ return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ipmac;
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct bitmap_ipmac_elem));
+ if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_ipmac_type = {
+ .name = "bitmap:ip,mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_IPV4,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_ipmac_create,
+ .create_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_ipmac_init(void)
+{
+ return ip_set_type_register(&bitmap_ipmac_type);
+}
+
+static void __exit
+bitmap_ipmac_fini(void)
+{
+ ip_set_type_unregister(&bitmap_ipmac_type);
+}
+
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 000000000..005dd3644
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,318 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counter support added */
+/* 2 Comment support added */
+#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_bitmap:port");
+
+#define MTYPE bitmap_port
+
+/* Type structure */
+struct bitmap_port {
+ void *members; /* the set members */
+ void *extensions; /* data extensions */
+ u16 first_port; /* host byte order, included in range */
+ u16 last_port; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
+ size_t memsize; /* members size */
+ struct timer_list gc; /* garbage collection */
+};
+
+/* ADT structure for generic function args */
+struct bitmap_port_adt_elem {
+ u16 id;
+};
+
+static inline u16
+port_to_id(const struct bitmap_port *m, u16 port)
+{
+ return port - m->first_port;
+}
+
+/* Common functions */
+
+static inline int
+bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
+ const struct bitmap_port *map, size_t dsize)
+{
+ return !!test_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)
+{
+ return !!test_bit(id, map->members);
+}
+
+static inline int
+bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map, u32 flags, size_t dsize)
+{
+ return !!test_and_set_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
+
+static inline int
+bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
+ size_t dsize)
+{
+ return nla_put_net16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
+}
+
+static inline int
+bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
+{
+ return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
+}
+
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_port_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ __be16 __port;
+ u16 port = 0;
+
+ if (!ip_set_get_ip_port(skb, opt->family,
+ opt->flags & IPSET_DIM_ONE_SRC, &__port))
+ return -EINVAL;
+
+ port = ntohs(__port);
+
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ e.id = port_to_id(map, port);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct bitmap_port *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_port_adt_elem e = { .id = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port; /* wraparound */
+ u16 port_to;
+ int ret = 0;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ if (port < map->first_port || port > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (adt == IPSET_TEST) {
+ e.id = port_to_id(map, port);
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to) {
+ swap(port, port_to);
+ if (port < map->first_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+ }
+ } else
+ port_to = port;
+
+ if (port_to > map->last_port)
+ return -IPSET_ERR_BITMAP_RANGE;
+
+ for (; port <= port_to; port++) {
+ e.id = port_to_id(map, port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct bitmap_port *x = a->data;
+ const struct bitmap_port *y = b->data;
+
+ return x->first_port == y->first_port &&
+ x->last_port == y->last_port &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+/* Plain variant */
+
+struct bitmap_port_elem {
+};
+
+#include "ip_set_bitmap_gen.h"
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+ u16 first_port, u16 last_port)
+{
+ map->members = ip_set_alloc(map->memsize);
+ if (!map->members)
+ return false;
+ if (set->dsize) {
+ map->extensions = ip_set_alloc(set->dsize * map->elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
+ map->first_port = first_port;
+ map->last_port = last_port;
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ set->data = map;
+ set->family = NFPROTO_UNSPEC;
+
+ return true;
+}
+
+static int
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ struct bitmap_port *map;
+ u16 first_port, last_port;
+
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+ last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (first_port > last_port) {
+ u16 tmp = first_port;
+
+ first_port = last_port;
+ last_port = tmp;
+ }
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ map->elements = last_port - first_port + 1;
+ map->memsize = bitmap_bytes(0, map->elements);
+ set->variant = &bitmap_port;
+ set->dsize = ip_set_elem_len(set, tb, 0);
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ bitmap_port_gc_init(set, bitmap_port_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type bitmap_port_type = {
+ .name = "bitmap:port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = bitmap_port_create,
+ .create_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+bitmap_port_init(void)
+{
+ return ip_set_type_register(&bitmap_port_type);
+}
+
+static void __exit
+bitmap_port_fini(void)
+{
+ ip_set_type_unregister(&bitmap_port_type);
+}
+
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 000000000..d259da3ce
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,2042 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/rculist.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list); /* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */
+static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */
+
+struct ip_set_net {
+ struct ip_set * __rcu *ip_set_list; /* all individual sets */
+ ip_set_id_t ip_set_max; /* max number of sets */
+ int is_deleted; /* deleted by ip_set_net_exit */
+};
+static int ip_set_net_id __read_mostly;
+
+static inline struct ip_set_net *ip_set_pernet(struct net *net)
+{
+ return net_generic(net, ip_set_net_id);
+}
+
+#define IP_SET_INC 64
+#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+static unsigned int max_sets;
+
+module_param(max_sets, int, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/* When the nfnl mutex is held: */
+#define ip_set_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ip_set(inst, id) \
+ ip_set_dereference((inst)->ip_set_list)[id]
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ */
+
+static inline void
+ip_set_type_lock(void)
+{
+ mutex_lock(&ip_set_type_mutex);
+}
+
+static inline void
+ip_set_type_unlock(void)
+{
+ mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+ struct ip_set_type *type;
+
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family ||
+ type->family == NFPROTO_UNSPEC) &&
+ revision >= type->revision_min &&
+ revision <= type->revision_max)
+ return type;
+ return NULL;
+}
+
+/* Unlock, try to load a set type module and lock again */
+static bool
+load_settype(const char *name)
+{
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ pr_debug("try to load ip_set_%s\n", name);
+ if (request_module("ip_set_%s", name) < 0) {
+ pr_warn("Can't find ip_set type %s\n", name);
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ return false;
+ }
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ return true;
+}
+
+/* Find a set type and reference it */
+#define find_set_type_get(name, family, revision, found) \
+ __find_set_type_get(name, family, revision, found, false)
+
+static int
+__find_set_type_get(const char *name, u8 family, u8 revision,
+ struct ip_set_type **found, bool retry)
+{
+ struct ip_set_type *type;
+ int err;
+
+ if (retry && !load_settype(name))
+ return -IPSET_ERR_FIND_TYPE;
+
+ rcu_read_lock();
+ *found = find_set_type(name, family, revision);
+ if (*found) {
+ err = !try_module_get((*found)->me) ? -EFAULT : 0;
+ goto unlock;
+ }
+ /* Make sure the type is already loaded
+ * but we don't support the revision */
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name)) {
+ err = -IPSET_ERR_FIND_TYPE;
+ goto unlock;
+ }
+ rcu_read_unlock();
+
+ return retry ? -IPSET_ERR_FIND_TYPE :
+ __find_set_type_get(name, family, revision, found, true);
+
+unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+/* Find a given set type by name and family.
+ * If we succeeded, the supported minimal and maximum revisions are
+ * filled out.
+ */
+#define find_set_type_minmax(name, family, min, max) \
+ __find_set_type_minmax(name, family, min, max, false)
+
+static int
+__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max,
+ bool retry)
+{
+ struct ip_set_type *type;
+ bool found = false;
+
+ if (retry && !load_settype(name))
+ return -IPSET_ERR_FIND_TYPE;
+
+ *min = 255; *max = 0;
+ rcu_read_lock();
+ list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ if (STREQ(type->name, name) &&
+ (type->family == family ||
+ type->family == NFPROTO_UNSPEC)) {
+ found = true;
+ if (type->revision_min < *min)
+ *min = type->revision_min;
+ if (type->revision_max > *max)
+ *max = type->revision_max;
+ }
+ rcu_read_unlock();
+ if (found)
+ return 0;
+
+ return retry ? -IPSET_ERR_FIND_TYPE :
+ __find_set_type_minmax(name, family, min, max, true);
+}
+
+#define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \
+ (f) == NFPROTO_IPV6 ? "inet6" : "any")
+
+/* Register a set type structure. The type is identified by
+ * the unique triple of name, family and revision.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+ int ret = 0;
+
+ if (type->protocol != IPSET_PROTOCOL) {
+ pr_warn("ip_set type %s, family %s, revision %u:%u uses wrong protocol version %u (want %u)\n",
+ type->name, family_name(type->family),
+ type->revision_min, type->revision_max,
+ type->protocol, IPSET_PROTOCOL);
+ return -EINVAL;
+ }
+
+ ip_set_type_lock();
+ if (find_set_type(type->name, type->family, type->revision_min)) {
+ /* Duplicate! */
+ pr_warn("ip_set type %s, family %s with revision min %u already registered!\n",
+ type->name, family_name(type->family),
+ type->revision_min);
+ ret = -EINVAL;
+ goto unlock;
+ }
+ list_add_rcu(&type->list, &ip_set_type_list);
+ pr_debug("type %s, family %s, revision %u:%u registered.\n",
+ type->name, family_name(type->family),
+ type->revision_min, type->revision_max);
+unlock:
+ ip_set_type_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type. There's a small race with ip_set_create */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+ ip_set_type_lock();
+ if (!find_set_type(type->name, type->family, type->revision_min)) {
+ pr_warn("ip_set type %s, family %s with revision min %u not registered\n",
+ type->name, family_name(type->family),
+ type->revision_min);
+ goto unlock;
+ }
+ list_del_rcu(&type->list);
+ pr_debug("type %s, family %s with revision min %u unregistered.\n",
+ type->name, family_name(type->family), type->revision_min);
+unlock:
+ ip_set_type_unlock();
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+void *
+ip_set_alloc(size_t size)
+{
+ void *members = NULL;
+
+ if (size < KMALLOC_MAX_SIZE)
+ members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+
+ if (members) {
+ pr_debug("%p: allocated with kmalloc\n", members);
+ return members;
+ }
+
+ members = vzalloc(size);
+ if (!members)
+ return NULL;
+ pr_debug("%p: allocated with vmalloc\n", members);
+
+ return members;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+void
+ip_set_free(void *members)
+{
+ pr_debug("%p: free with %s\n", members,
+ is_vmalloc_addr(members) ? "vfree" : "kfree");
+ kvfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+static inline bool
+flag_nested(const struct nlattr *nla)
+{
+ return nla->nla_type & NLA_F_NESTED;
+}
+
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
+ [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
+ [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+};
+
+int
+ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
+ return -IPSET_ERR_PROTOCOL;
+
+ *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+ struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+ if (unlikely(!flag_nested(nla)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
+ return -IPSET_ERR_PROTOCOL;
+
+ memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
+ sizeof(struct in6_addr));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+typedef void (*destroyer)(void *);
+/* ipset data extension types, in size order */
+
+const struct ip_set_ext_type ip_set_extensions[] = {
+ [IPSET_EXT_ID_COUNTER] = {
+ .type = IPSET_EXT_COUNTER,
+ .flag = IPSET_FLAG_WITH_COUNTERS,
+ .len = sizeof(struct ip_set_counter),
+ .align = __alignof__(struct ip_set_counter),
+ },
+ [IPSET_EXT_ID_TIMEOUT] = {
+ .type = IPSET_EXT_TIMEOUT,
+ .len = sizeof(unsigned long),
+ .align = __alignof__(unsigned long),
+ },
+ [IPSET_EXT_ID_SKBINFO] = {
+ .type = IPSET_EXT_SKBINFO,
+ .flag = IPSET_FLAG_WITH_SKBINFO,
+ .len = sizeof(struct ip_set_skbinfo),
+ .align = __alignof__(struct ip_set_skbinfo),
+ },
+ [IPSET_EXT_ID_COMMENT] = {
+ .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY,
+ .flag = IPSET_FLAG_WITH_COMMENT,
+ .len = sizeof(struct ip_set_comment),
+ .align = __alignof__(struct ip_set_comment),
+ .destroy = (destroyer) ip_set_comment_free,
+ },
+};
+EXPORT_SYMBOL_GPL(ip_set_extensions);
+
+static inline bool
+add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
+{
+ return ip_set_extensions[id].flag ?
+ (flags & ip_set_extensions[id].flag) :
+ !!tb[IPSET_ATTR_TIMEOUT];
+}
+
+size_t
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+{
+ enum ip_set_ext_id id;
+ size_t offset = 0;
+ u32 cadt_flags = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
+ set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+ for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
+ if (!add_extension(id, cadt_flags, tb))
+ continue;
+ offset += ALIGN(len + offset, ip_set_extensions[id].align);
+ set->offset[id] = offset;
+ set->extensions |= ip_set_extensions[id].type;
+ offset += ip_set_extensions[id].len;
+ }
+ return len + offset;
+}
+EXPORT_SYMBOL_GPL(ip_set_elem_len);
+
+int
+ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
+ struct ip_set_ext *ext)
+{
+ u64 fullmark;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ if (!(set->extensions & IPSET_EXT_TIMEOUT))
+ return -IPSET_ERR_TIMEOUT;
+ ext->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ }
+ if (tb[IPSET_ATTR_BYTES] || tb[IPSET_ATTR_PACKETS]) {
+ if (!(set->extensions & IPSET_EXT_COUNTER))
+ return -IPSET_ERR_COUNTER;
+ if (tb[IPSET_ATTR_BYTES])
+ ext->bytes = be64_to_cpu(nla_get_be64(
+ tb[IPSET_ATTR_BYTES]));
+ if (tb[IPSET_ATTR_PACKETS])
+ ext->packets = be64_to_cpu(nla_get_be64(
+ tb[IPSET_ATTR_PACKETS]));
+ }
+ if (tb[IPSET_ATTR_COMMENT]) {
+ if (!(set->extensions & IPSET_EXT_COMMENT))
+ return -IPSET_ERR_COMMENT;
+ ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]);
+ }
+ if (tb[IPSET_ATTR_SKBMARK]) {
+ if (!(set->extensions & IPSET_EXT_SKBINFO))
+ return -IPSET_ERR_SKBINFO;
+ fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
+ ext->skbmark = fullmark >> 32;
+ ext->skbmarkmask = fullmark & 0xffffffff;
+ }
+ if (tb[IPSET_ATTR_SKBPRIO]) {
+ if (!(set->extensions & IPSET_EXT_SKBINFO))
+ return -IPSET_ERR_SKBINFO;
+ ext->skbprio = be32_to_cpu(nla_get_be32(
+ tb[IPSET_ATTR_SKBPRIO]));
+ }
+ if (tb[IPSET_ATTR_SKBQUEUE]) {
+ if (!(set->extensions & IPSET_EXT_SKBINFO))
+ return -IPSET_ERR_SKBINFO;
+ ext->skbqueue = be16_to_cpu(nla_get_be16(
+ tb[IPSET_ATTR_SKBQUEUE]));
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+static inline void
+__ip_set_get(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static inline void
+__ip_set_put(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ BUG_ON(set->ref == 0);
+ set->ref--;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our foot.
+ */
+
+static inline struct ip_set *
+ip_set_rcu_get(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ rcu_read_lock();
+ /* ip_set_list itself needs to be protected */
+ set = rcu_dereference(inst->ip_set_list)[index];
+ rcu_read_unlock();
+
+ return set;
+}
+
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret = 0;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return 0;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
+ read_unlock_bh(&set->lock);
+
+ if (ret == -EAGAIN) {
+ /* Type requests element to be completed */
+ pr_debug("element must be completed, ADD is triggered\n");
+ write_lock_bh(&set->lock);
+ set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+ write_unlock_bh(&set->lock);
+ ret = 1;
+ } else {
+ /* --return-nomatch: invert matched element */
+ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
+ (set->type->features & IPSET_TYPE_NOMATCH) &&
+ (ret > 0 || ret == -ENOTEMPTY))
+ ret = -ret;
+ }
+
+ /* Convert error codes to nomatch */
+ return (ret < 0 ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+{
+ struct ip_set *set = ip_set_rcu_get(
+ dev_net(par->in ? par->in : par->out), index);
+ int ret = 0;
+
+ BUG_ON(set == NULL);
+ pr_debug("set %s, index %u\n", set->name, index);
+
+ if (opt->dim < set->type->dimension ||
+ !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ write_lock_bh(&set->lock);
+ ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+ write_unlock_bh(&set->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ */
+ip_set_id_t
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
+{
+ ip_set_id_t i, index = IPSET_INVALID_ID;
+ struct ip_set *s;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ rcu_read_lock();
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = rcu_dereference(inst->ip_set_list)[i];
+ if (s != NULL && STREQ(s->name, name)) {
+ __ip_set_get(s);
+ index = i;
+ *set = s;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ */
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
+{
+ struct ip_set *set;
+
+ rcu_read_lock();
+ set = rcu_dereference(inst->ip_set_list)[index];
+ if (set != NULL)
+ __ip_set_put(set);
+ rcu_read_unlock();
+}
+
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ __ip_set_put_byindex(inst, index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ */
+const char *
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
+{
+ const struct ip_set *set = ip_set_rcu_get(net, index);
+
+ BUG_ON(set == NULL);
+ BUG_ON(set->ref == 0);
+
+ /* Referenced, so it's safe */
+ return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ if (index >= inst->ip_set_max)
+ return IPSET_INVALID_ID;
+
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ set = ip_set(inst, index);
+ if (set)
+ __ip_set_get(set);
+ else
+ index = IPSET_INVALID_ID;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+
+ return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ */
+void
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
+{
+ struct ip_set *set;
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+ set = ip_set(inst, index);
+ if (set != NULL)
+ __ip_set_put(set);
+ }
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * The commands are serialized by the nfnl mutex.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+ return !tb[IPSET_ATTR_PROTOCOL] ||
+ nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+ return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
+}
+
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
+ enum ipset_cmd cmd)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
+ sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ return NULL;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = NFPROTO_IPV4;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ return nlh;
+}
+
+/* Create a set */
+
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1},
+ [IPSET_ATTR_REVISION] = { .type = NLA_U8 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+};
+
+static struct ip_set *
+find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
+{
+ struct ip_set *set = NULL;
+ ip_set_id_t i;
+
+ *id = IPSET_INVALID_ID;
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set != NULL && STREQ(set->name, name)) {
+ *id = i;
+ break;
+ }
+ }
+ return (*id == IPSET_INVALID_ID ? NULL : set);
+}
+
+static inline struct ip_set *
+find_set(struct ip_set_net *inst, const char *name)
+{
+ ip_set_id_t id;
+
+ return find_set_and_id(inst, name, &id);
+}
+
+static int
+find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
+ struct ip_set **set)
+{
+ struct ip_set *s;
+ ip_set_id_t i;
+
+ *index = IPSET_INVALID_ID;
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s == NULL) {
+ if (*index == IPSET_INVALID_ID)
+ *index = i;
+ } else if (STREQ(name, s->name)) {
+ /* Name clash */
+ *set = s;
+ return -EEXIST;
+ }
+ }
+ if (*index == IPSET_INVALID_ID)
+ /* No free slot remained */
+ return -IPSET_ERR_MAX_SETS;
+ return 0;
+}
+
+static int
+ip_set_none(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ return -EOPNOTSUPP;
+}
+
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct net *net = sock_net(ctnl);
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set *set, *clash = NULL;
+ ip_set_id_t index = IPSET_INVALID_ID;
+ struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+ const char *name, *typename;
+ u8 family, revision;
+ u32 flags = flag_exist(nlh);
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_REVISION] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA]))))
+ return -IPSET_ERR_PROTOCOL;
+
+ name = nla_data(attr[IPSET_ATTR_SETNAME]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+ pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+ name, typename, family_name(family), revision);
+
+ /*
+ * First, and without any locks, allocate and initialize
+ * a normal base set structure.
+ */
+ set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+ if (!set)
+ return -ENOMEM;
+ rwlock_init(&set->lock);
+ strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ set->family = family;
+ set->revision = revision;
+
+ /*
+ * Next, check that we know the type, and take
+ * a reference on the type, to make sure it stays available
+ * while constructing our new set.
+ *
+ * After referencing the type, we try to create the type
+ * specific part of the set without holding any locks.
+ */
+ ret = find_set_type_get(typename, family, revision, &(set->type));
+ if (ret)
+ goto out;
+
+ /*
+ * Without holding any locks, create private part.
+ */
+ if (attr[IPSET_ATTR_DATA] &&
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy)) {
+ ret = -IPSET_ERR_PROTOCOL;
+ goto put_out;
+ }
+
+ ret = set->type->create(net, set, tb, flags);
+ if (ret != 0)
+ goto put_out;
+
+ /* BTW, ret==0 here. */
+
+ /*
+ * Here, we have a valid, constructed set and we are protected
+ * by the nfnl mutex. Find the first free index in ip_set_list
+ * and check clashing.
+ */
+ ret = find_free_id(inst, set->name, &index, &clash);
+ if (ret == -EEXIST) {
+ /* If this is the same set and requested, ignore error */
+ if ((flags & IPSET_FLAG_EXIST) &&
+ STREQ(set->type->name, clash->type->name) &&
+ set->type->family == clash->type->family &&
+ set->type->revision_min == clash->type->revision_min &&
+ set->type->revision_max == clash->type->revision_max &&
+ set->variant->same_set(set, clash))
+ ret = 0;
+ goto cleanup;
+ } else if (ret == -IPSET_ERR_MAX_SETS) {
+ struct ip_set **list, **tmp;
+ ip_set_id_t i = inst->ip_set_max + IP_SET_INC;
+
+ if (i < inst->ip_set_max || i == IPSET_INVALID_ID)
+ /* Wraparound */
+ goto cleanup;
+
+ list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+ if (!list)
+ goto cleanup;
+ /* nfnl mutex is held, both lists are valid */
+ tmp = ip_set_dereference(inst->ip_set_list);
+ memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
+ rcu_assign_pointer(inst->ip_set_list, list);
+ /* Make sure all current packets have passed through */
+ synchronize_net();
+ /* Use new list */
+ index = inst->ip_set_max;
+ inst->ip_set_max = i;
+ kfree(tmp);
+ ret = 0;
+ } else if (ret)
+ goto cleanup;
+
+ /*
+ * Finally! Add our shiny new set to the list, and be done.
+ */
+ pr_debug("create: '%s' created with index %u!\n", set->name, index);
+ ip_set(inst, index) = set;
+
+ return ret;
+
+cleanup:
+ set->variant->destroy(set);
+put_out:
+ module_put(set->type->me);
+out:
+ kfree(set);
+ return ret;
+}
+
+/* Destroy sets */
+
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static void
+ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
+{
+ struct ip_set *set = ip_set(inst, index);
+
+ pr_debug("set: %s\n", set->name);
+ ip_set(inst, index) = NULL;
+
+ /* Must call it without holding any lock */
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+}
+
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *s;
+ ip_set_id_t i;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ /* Commands are serialized and references are
+ * protected by the ip_set_ref_lock.
+ * External systems (i.e. xt_set) must call
+ * ip_set_put|get_nfnl_* functions, that way we
+ * can safely check references here.
+ *
+ * list:set timer can only decrement the reference
+ * counter, so if it's already zero, we can proceed
+ * without holding the lock.
+ */
+ read_lock_bh(&ip_set_ref_lock);
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL && s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL)
+ ip_set_destroy_set(inst, i);
+ }
+ } else {
+ s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &i);
+ if (s == NULL) {
+ ret = -ENOENT;
+ goto out;
+ } else if (s->ref) {
+ ret = -IPSET_ERR_BUSY;
+ goto out;
+ }
+ read_unlock_bh(&ip_set_ref_lock);
+
+ ip_set_destroy_set(inst, i);
+ }
+ return 0;
+out:
+ read_unlock_bh(&ip_set_ref_lock);
+ return ret;
+}
+
+/* Flush sets */
+
+static void
+ip_set_flush_set(struct ip_set *set)
+{
+ pr_debug("set: %s\n", set->name);
+
+ write_lock_bh(&set->lock);
+ set->variant->flush(set);
+ write_unlock_bh(&set->lock);
+}
+
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *s;
+ ip_set_id_t i;
+
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (!attr[IPSET_ATTR_SETNAME]) {
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL)
+ ip_set_flush_set(s);
+ }
+ } else {
+ s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (s == NULL)
+ return -ENOENT;
+
+ ip_set_flush_set(s);
+ }
+
+ return 0;
+}
+
+/* Rename a set */
+
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+};
+
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set, *s;
+ const char *name2;
+ ip_set_id_t i;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ read_lock_bh(&ip_set_ref_lock);
+ if (set->ref != 0) {
+ ret = -IPSET_ERR_REFERENCED;
+ goto out;
+ }
+
+ name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
+ for (i = 0; i < inst->ip_set_max; i++) {
+ s = ip_set(inst, i);
+ if (s != NULL && STREQ(s->name, name2)) {
+ ret = -IPSET_ERR_EXIST_SETNAME2;
+ goto out;
+ }
+ }
+ strncpy(set->name, name2, IPSET_MAXNAMELEN);
+
+out:
+ read_unlock_bh(&ip_set_ref_lock);
+ return ret;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * The commands are serialized by the nfnl mutex and references are
+ * protected by the ip_set_ref_lock. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *from, *to;
+ ip_set_id_t from_id, to_id;
+ char from_name[IPSET_MAXNAMELEN];
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_SETNAME2] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+ &from_id);
+ if (from == NULL)
+ return -ENOENT;
+
+ to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
+ &to_id);
+ if (to == NULL)
+ return -IPSET_ERR_EXIST_SETNAME2;
+
+ /* Features must not change.
+ * Not an artificial restriction anymore, as we must prevent
+ * possible loops created by swapping in setlist type of sets. */
+ if (!(from->type->features == to->type->features &&
+ from->family == to->family))
+ return -IPSET_ERR_TYPE_MISMATCH;
+
+ strncpy(from_name, from->name, IPSET_MAXNAMELEN);
+ strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+ strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+
+ write_lock_bh(&ip_set_ref_lock);
+ swap(from->ref, to->ref);
+ ip_set(inst, from_id) = to;
+ ip_set(inst, to_id) = from;
+ write_unlock_bh(&ip_set_ref_lock);
+
+ return 0;
+}
+
+/* List/save set data */
+
+#define DUMP_INIT 0
+#define DUMP_ALL 1
+#define DUMP_ONE 2
+#define DUMP_LAST 3
+
+#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF)
+#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16)
+
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+ struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
+ if (cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n",
+ ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
+ __ip_set_put_byindex(inst,
+ (ip_set_id_t) cb->args[IPSET_CB_INDEX]);
+ }
+ return 0;
+}
+
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+ const struct nlattr *attr;
+ int rem;
+
+ pr_debug("dump nlmsg\n");
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) {
+ pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len);
+ }
+}
+
+static int
+dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ u32 dump_type;
+ ip_set_id_t index;
+
+ /* Second pass, so parser can't fail */
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
+
+ /* cb->args[IPSET_CB_NET]: net namespace
+ * [IPSET_CB_DUMP]: dump single set/all sets
+ * [IPSET_CB_INDEX]: set index
+ * [IPSET_CB_ARG0]: type specific
+ */
+
+ if (cda[IPSET_ATTR_SETNAME]) {
+ struct ip_set *set;
+
+ set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
+ &index);
+ if (set == NULL)
+ return -ENOENT;
+
+ dump_type = DUMP_ONE;
+ cb->args[IPSET_CB_INDEX] = index;
+ } else
+ dump_type = DUMP_ALL;
+
+ if (cda[IPSET_ATTR_FLAGS]) {
+ u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+ dump_type |= (f << 16);
+ }
+ cb->args[IPSET_CB_NET] = (unsigned long)inst;
+ cb->args[IPSET_CB_DUMP] = dump_type;
+
+ return 0;
+}
+
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ ip_set_id_t index = IPSET_INVALID_ID, max;
+ struct ip_set *set = NULL;
+ struct nlmsghdr *nlh = NULL;
+ unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
+ u32 dump_type, dump_flags;
+ int ret = 0;
+
+ if (!cb->args[IPSET_CB_DUMP]) {
+ ret = dump_init(cb, inst);
+ if (ret < 0) {
+ nlh = nlmsg_hdr(cb->skb);
+ /* We have to create and send the error message
+ * manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret);
+ return ret;
+ }
+ }
+
+ if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
+ goto out;
+
+ dump_type = DUMP_TYPE(cb->args[IPSET_CB_DUMP]);
+ dump_flags = DUMP_FLAGS(cb->args[IPSET_CB_DUMP]);
+ max = dump_type == DUMP_ONE ? cb->args[IPSET_CB_INDEX] + 1
+ : inst->ip_set_max;
+dump_last:
+ pr_debug("dump type, flag: %u %u index: %ld\n",
+ dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
+ for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
+ index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+ set = ip_set(inst, index);
+ if (set == NULL) {
+ if (dump_type == DUMP_ONE) {
+ ret = -ENOENT;
+ goto out;
+ }
+ continue;
+ }
+ /* When dumping all sets, we must dump "sorted"
+ * so that lists (unions of sets) are dumped last.
+ */
+ if (dump_type != DUMP_ONE &&
+ ((dump_type == DUMP_ALL) ==
+ !!(set->type->features & IPSET_DUMP_LAST)))
+ continue;
+ pr_debug("List set: %s\n", set->name);
+ if (!cb->args[IPSET_CB_ARG0]) {
+ /* Start listing: make sure set won't be destroyed */
+ pr_debug("reference set\n");
+ __ip_set_get(set);
+ }
+ nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ IPSET_CMD_LIST);
+ if (!nlh) {
+ ret = -EMSGSIZE;
+ goto release_refcount;
+ }
+ if (nla_put_u8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb, IPSET_ATTR_SETNAME, set->name))
+ goto nla_put_failure;
+ if (dump_flags & IPSET_FLAG_LIST_SETNAME)
+ goto next_set;
+ switch (cb->args[IPSET_CB_ARG0]) {
+ case 0:
+ /* Core header data */
+ if (nla_put_string(skb, IPSET_ATTR_TYPENAME,
+ set->type->name) ||
+ nla_put_u8(skb, IPSET_ATTR_FAMILY,
+ set->family) ||
+ nla_put_u8(skb, IPSET_ATTR_REVISION,
+ set->revision))
+ goto nla_put_failure;
+ ret = set->variant->head(set, skb);
+ if (ret < 0)
+ goto release_refcount;
+ if (dump_flags & IPSET_FLAG_LIST_HEADER)
+ goto next_set;
+ /* Fall through and add elements */
+ default:
+ read_lock_bh(&set->lock);
+ ret = set->variant->list(set, skb, cb);
+ read_unlock_bh(&set->lock);
+ if (!cb->args[IPSET_CB_ARG0])
+ /* Set is done, proceed with next one */
+ goto next_set;
+ goto release_refcount;
+ }
+ }
+ /* If we dump all sets, continue with dumping last ones */
+ if (dump_type == DUMP_ALL) {
+ dump_type = DUMP_LAST;
+ cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
+ cb->args[IPSET_CB_INDEX] = 0;
+ goto dump_last;
+ }
+ goto out;
+
+nla_put_failure:
+ ret = -EFAULT;
+next_set:
+ if (dump_type == DUMP_ONE)
+ cb->args[IPSET_CB_INDEX] = IPSET_INVALID_ID;
+ else
+ cb->args[IPSET_CB_INDEX]++;
+release_refcount:
+ /* If there was an error or set is done, release set */
+ if (ret || !cb->args[IPSET_CB_ARG0]) {
+ pr_debug("release set %s\n", ip_set(inst, index)->name);
+ __ip_set_put_byindex(inst, index);
+ cb->args[IPSET_CB_ARG0] = 0;
+ }
+out:
+ if (nlh) {
+ nlmsg_end(skb, nlh);
+ pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+ dump_attrs(nlh);
+ }
+
+ return ret < 0 ? ret : skb->len;
+}
+
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ if (unlikely(protocol_failed(attr)))
+ return -IPSET_ERR_PROTOCOL;
+
+ {
+ struct netlink_dump_control c = {
+ .dump = ip_set_dump_start,
+ .done = ip_set_dump_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+}
+
+/* Add, del and test */
+
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_DATA] = { .type = NLA_NESTED },
+ [IPSET_ATTR_ADT] = { .type = NLA_NESTED },
+};
+
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+ struct nlattr *tb[], enum ipset_adt adt,
+ u32 flags, bool use_lineno)
+{
+ int ret;
+ u32 lineno = 0;
+ bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
+
+ do {
+ write_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
+ write_unlock_bh(&set->lock);
+ retried = true;
+ } while (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried)) == 0);
+
+ if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+ return 0;
+ if (lineno && use_lineno) {
+ /* Error in restore/batch mode: send back lineno */
+ struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ struct nlmsgerr *errmsg;
+ size_t payload = min(SIZE_MAX,
+ sizeof(*errmsg) + nlmsg_len(nlh));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+ struct nlattr *cmdattr;
+ u32 *errline;
+
+ skb2 = nlmsg_new(payload, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+ rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = ret;
+ memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ cmdattr = (void *)&errmsg->msg + min_len;
+
+ nla_parse(cda, IPSET_ATTR_CMD_MAX,
+ cmdattr, nlh->nlmsg_len - min_len,
+ ip_set_adt_policy);
+
+ errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+ *errline = lineno;
+
+ netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ /* Signal netlink not to send its ACK/errmsg. */
+ return -EINTR;
+ }
+
+ return ret;
+}
+
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ const struct nlattr *nla;
+ u32 flags = flag_exist(nlh);
+ bool use_lineno;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ !((attr[IPSET_ATTR_DATA] != NULL) ^
+ (attr[IPSET_ATTR_ADT] != NULL)) ||
+ (attr[IPSET_ATTR_DATA] != NULL &&
+ !flag_nested(attr[IPSET_ATTR_DATA])) ||
+ (attr[IPSET_ATTR_ADT] != NULL &&
+ (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+ attr[IPSET_ATTR_LINENO] == NULL))))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ use_lineno = !!attr[IPSET_ATTR_LINENO];
+ if (attr[IPSET_ATTR_DATA]) {
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+ use_lineno);
+ } else {
+ int nla_rem;
+
+ nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+ memset(tb, 0, sizeof(*tb));
+ if (nla_type(nla) != IPSET_ATTR_DATA ||
+ !flag_nested(nla) ||
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+ ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+ flags, use_lineno);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set *set;
+ struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL ||
+ attr[IPSET_ATTR_DATA] == NULL ||
+ !flag_nested(attr[IPSET_ATTR_DATA])))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+ set->type->adt_policy))
+ return -IPSET_ERR_PROTOCOL;
+
+ read_lock_bh(&set->lock);
+ ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+ read_unlock_bh(&set->lock);
+ /* Userspace can't trigger element to be re-added */
+ if (ret == -EAGAIN)
+ ret = 1;
+
+ return ret > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get headed data of a set */
+
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ const struct ip_set *set;
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_SETNAME] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
+ if (set == NULL)
+ return -ENOENT;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_HEADER);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb2, IPSET_ATTR_SETNAME, set->name) ||
+ nla_put_string(skb2, IPSET_ATTR_TYPENAME, set->type->name) ||
+ nla_put_u8(skb2, IPSET_ATTR_FAMILY, set->family) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION, set->revision))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get type data */
+
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ u8 family, min, max;
+ const char *typename;
+ int ret = 0;
+
+ if (unlikely(protocol_failed(attr) ||
+ attr[IPSET_ATTR_TYPENAME] == NULL ||
+ attr[IPSET_ATTR_FAMILY] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+ typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+ ret = find_set_type_minmax(typename, family, &min, &max);
+ if (ret)
+ return ret;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_TYPE);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL) ||
+ nla_put_string(skb2, IPSET_ATTR_TYPENAME, typename) ||
+ nla_put_u8(skb2, IPSET_ATTR_FAMILY, family) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION, max) ||
+ nla_put_u8(skb2, IPSET_ATTR_REVISION_MIN, min))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+};
+
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
+{
+ struct sk_buff *skb2;
+ struct nlmsghdr *nlh2;
+ int ret = 0;
+
+ if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+ return -IPSET_ERR_PROTOCOL;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ IPSET_CMD_PROTOCOL);
+ if (!nlh2)
+ goto nlmsg_failure;
+ if (nla_put_u8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL))
+ goto nla_put_failure;
+ nlmsg_end(skb2, nlh2);
+
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+ kfree_skb(skb2);
+ return -EMSGSIZE;
+}
+
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+ [IPSET_CMD_NONE] = {
+ .call = ip_set_none,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ },
+ [IPSET_CMD_CREATE] = {
+ .call = ip_set_create,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_create_policy,
+ },
+ [IPSET_CMD_DESTROY] = {
+ .call = ip_set_destroy,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_FLUSH] = {
+ .call = ip_set_flush,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_RENAME] = {
+ .call = ip_set_rename,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_SWAP] = {
+ .call = ip_set_swap,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname2_policy,
+ },
+ [IPSET_CMD_LIST] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_SAVE] = {
+ .call = ip_set_dump,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_ADD] = {
+ .call = ip_set_uadd,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_DEL] = {
+ .call = ip_set_udel,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_TEST] = {
+ .call = ip_set_utest,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_adt_policy,
+ },
+ [IPSET_CMD_HEADER] = {
+ .call = ip_set_header,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_setname_policy,
+ },
+ [IPSET_CMD_TYPE] = {
+ .call = ip_set_type,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_type_policy,
+ },
+ [IPSET_CMD_PROTOCOL] = {
+ .call = ip_set_protocol,
+ .attr_count = IPSET_ATTR_CMD_MAX,
+ .policy = ip_set_protocol_policy,
+ },
+};
+
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+ .name = "ip_set",
+ .subsys_id = NFNL_SUBSYS_IPSET,
+ .cb_count = IPSET_MSG_MAX,
+ .cb = ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+ unsigned int *op;
+ void *data;
+ int copylen = *len, ret = 0;
+ struct net *net = sock_net(sk);
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (optval != SO_IP_SET)
+ return -EBADF;
+ if (*len < sizeof(unsigned int))
+ return -EINVAL;
+
+ data = vmalloc(*len);
+ if (!data)
+ return -ENOMEM;
+ if (copy_from_user(data, user, *len) != 0) {
+ ret = -EFAULT;
+ goto done;
+ }
+ op = (unsigned int *) data;
+
+ if (*op < IP_SET_OP_VERSION) {
+ /* Check the version at the beginning of operations */
+ struct ip_set_req_version *req_version = data;
+
+ if (*len < sizeof(struct ip_set_req_version)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ if (req_version->version != IPSET_PROTOCOL) {
+ ret = -EPROTO;
+ goto done;
+ }
+ }
+
+ switch (*op) {
+ case IP_SET_OP_VERSION: {
+ struct ip_set_req_version *req_version = data;
+
+ if (*len != sizeof(struct ip_set_req_version)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ req_version->version = IPSET_PROTOCOL;
+ ret = copy_to_user(user, req_version,
+ sizeof(struct ip_set_req_version));
+ goto done;
+ }
+ case IP_SET_OP_GET_BYNAME: {
+ struct ip_set_req_get_set *req_get = data;
+ ip_set_id_t id;
+
+ if (*len != sizeof(struct ip_set_req_get_set)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ find_set_and_id(inst, req_get->set.name, &id);
+ req_get->set.index = id;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ case IP_SET_OP_GET_FNAME: {
+ struct ip_set_req_get_set_family *req_get = data;
+ ip_set_id_t id;
+
+ if (*len != sizeof(struct ip_set_req_get_set_family)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ find_set_and_id(inst, req_get->set.name, &id);
+ req_get->set.index = id;
+ if (id != IPSET_INVALID_ID)
+ req_get->family = ip_set(inst, id)->family;
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ case IP_SET_OP_GET_BYINDEX: {
+ struct ip_set_req_get_set *req_get = data;
+ struct ip_set *set;
+
+ if (*len != sizeof(struct ip_set_req_get_set) ||
+ req_get->set.index >= inst->ip_set_max) {
+ ret = -EINVAL;
+ goto done;
+ }
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ set = ip_set(inst, req_get->set.index);
+ strncpy(req_get->set.name, set ? set->name : "",
+ IPSET_MAXNAMELEN);
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ goto copy;
+ }
+ default:
+ ret = -EBADMSG;
+ goto done;
+ } /* end of switch(op) */
+
+copy:
+ ret = copy_to_user(user, data, copylen);
+
+done:
+ vfree(data);
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static struct nf_sockopt_ops so_set __read_mostly = {
+ .pf = PF_INET,
+ .get_optmin = SO_IP_SET,
+ .get_optmax = SO_IP_SET + 1,
+ .get = &ip_set_sockfn_get,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init
+ip_set_net_init(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set **list;
+
+ inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
+ if (inst->ip_set_max >= IPSET_INVALID_ID)
+ inst->ip_set_max = IPSET_INVALID_ID - 1;
+
+ list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+ if (!list)
+ return -ENOMEM;
+ inst->is_deleted = 0;
+ rcu_assign_pointer(inst->ip_set_list, list);
+ return 0;
+}
+
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ struct ip_set *set = NULL;
+ ip_set_id_t i;
+
+ inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set != NULL)
+ ip_set_destroy_set(inst, i);
+ }
+ kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+}
+
+static struct pernet_operations ip_set_net_ops = {
+ .init = ip_set_net_init,
+ .exit = ip_set_net_exit,
+ .id = &ip_set_net_id,
+ .size = sizeof(struct ip_set_net)
+};
+
+
+static int __init
+ip_set_init(void)
+{
+ int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+ if (ret != 0) {
+ pr_err("ip_set: cannot register with nfnetlink.\n");
+ return ret;
+ }
+ ret = nf_register_sockopt(&so_set);
+ if (ret != 0) {
+ pr_err("SO_SET registry failed: %d\n", ret);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ return ret;
+ }
+ ret = register_pernet_subsys(&ip_set_net_ops);
+ if (ret) {
+ pr_err("ip_set: cannot register pernet_subsys.\n");
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ return ret;
+ }
+ pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
+ return 0;
+}
+
+static void __exit
+ip_set_fini(void)
+{
+ unregister_pernet_subsys(&ip_set_net_ops);
+ nf_unregister_sockopt(&so_set);
+ nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+ pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 000000000..29fb01ddf
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,174 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/sctp.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/export.h>
+
+/* We must handle non-linear skbs */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+ bool src, __be16 *port, u8 *proto)
+{
+ switch (protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? th->source : th->dest;
+ break;
+ }
+ case IPPROTO_SCTP: {
+ sctp_sctphdr_t _sh;
+ const sctp_sctphdr_t *sh;
+
+ sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
+ if (sh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? sh->source : sh->dest;
+ break;
+ }
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ /* No choice either */
+ return false;
+
+ *port = src ? uh->source : uh->dest;
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr _ich;
+ const struct icmphdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)htons((ic->type << 8) | ic->code);
+ break;
+ }
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _ich;
+ const struct icmp6hdr *ic;
+
+ ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
+ if (ic == NULL)
+ return false;
+
+ *port = (__force __be16)
+ htons((ic->icmp6_type << 8) | ic->icmp6_code);
+ break;
+ }
+ default:
+ break;
+ }
+ *proto = protocol;
+
+ return true;
+}
+
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ unsigned int protooff = ip_hdrlen(skb);
+ int protocol = iph->protocol;
+
+ /* See comments at tcp_match in ip_tables.c */
+ if (protocol <= 0)
+ return false;
+
+ if (ntohs(iph->frag_off) & IP_OFFSET)
+ switch (protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_ICMP:
+ /* Port info not available for fragment offset > 0 */
+ return false;
+ default:
+ /* Other protocols doesn't have ports,
+ so we can match fragments */
+ *proto = protocol;
+ return true;
+ }
+
+ return get_port(skb, protocol, protooff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+ __be16 *port, u8 *proto)
+{
+ int protoff;
+ u8 nexthdr;
+ __be16 frag_off = 0;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return false;
+
+ return get_port(skb, nexthdr, protoff, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+ bool ret;
+ u8 proto;
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ ret = ip_set_get_ip4_port(skb, src, port, &proto);
+ break;
+ case NFPROTO_IPV6:
+ ret = ip_set_get_ip6_port(skb, src, port, &proto);
+ break;
+ default:
+ return false;
+ }
+ if (!ret)
+ return ret;
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
new file mode 100644
index 000000000..974ff386d
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -0,0 +1,1167 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _IP_SET_HASH_GEN_H
+#define _IP_SET_HASH_GEN_H
+
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#ifndef rcu_dereference_bh
+#define rcu_dereference_bh(p) rcu_dereference(p)
+#endif
+
+#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+
+/* Hashing which uses arrays to resolve clashing. The hash table is resized
+ * (doubled) when searching becomes too long.
+ * Internally jhash is used with the assumption that the size of the
+ * stored data is a multiple of sizeof(u32). If storage supports timeout,
+ * the timeout field must be the last one in the data structure - that field
+ * is ignored when computing the hash key.
+ *
+ * Readers and resizing
+ *
+ * Resizing can be triggered by userspace command only, and those
+ * are serialized by the nfnl mutex. During resizing the set is
+ * read-locked, so the only possible concurrent operations are
+ * the kernel side readers. Those must be protected by proper RCU locking.
+ */
+
+/* Number of elements to store in an initial array block */
+#define AHASH_INIT_SIZE 4
+/* Max number of elements to store in an array block */
+#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+
+/* Max number of elements can be tuned */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define AHASH_MAX(h) ((h)->ahash_max)
+
+static inline u8
+tune_ahash_max(u8 curr, u32 multi)
+{
+ u32 n;
+
+ if (multi < curr)
+ return curr;
+
+ n = curr + AHASH_INIT_SIZE;
+ /* Currently, at listing one hash bucket must fit into a message.
+ * Therefore we have a hard limit here.
+ */
+ return n > curr && n <= 64 ? n : curr;
+}
+#define TUNE_AHASH_MAX(h, multi) \
+ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#else
+#define AHASH_MAX(h) AHASH_MAX_SIZE
+#define TUNE_AHASH_MAX(h, multi)
+#endif
+
+/* A hash bucket */
+struct hbucket {
+ void *value; /* the array of the values */
+ u8 size; /* size of the array */
+ u8 pos; /* position of the first free entry */
+};
+
+/* The hash table: the table size stored here in order to make resizing easy */
+struct htable {
+ u8 htable_bits; /* size of hash table == 2^htable_bits */
+ struct hbucket bucket[0]; /* hashtable buckets */
+};
+
+#define hbucket(h, i) (&((h)->bucket[i]))
+
+#ifndef IPSET_NET_COUNT
+#define IPSET_NET_COUNT 1
+#endif
+
+/* Book-keeping of the prefixes added to the set */
+struct net_prefixes {
+ u32 nets[IPSET_NET_COUNT]; /* number of elements per cidr */
+ u8 cidr[IPSET_NET_COUNT]; /* the different cidr values in the set */
+};
+
+/* Compute the hash table size */
+static size_t
+htable_size(u8 hbits)
+{
+ size_t hsize;
+
+ /* We must fit both into u32 in jhash and size_t */
+ if (hbits > 31)
+ return 0;
+ hsize = jhash_size(hbits);
+ if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket)
+ < hsize)
+ return 0;
+
+ return hsize * sizeof(struct hbucket) + sizeof(struct htable);
+}
+
+/* Compute htable_bits from the user input parameter hashsize */
+static u8
+htable_bits(u32 hashsize)
+{
+ /* Assume that hashsize == 2^htable_bits */
+ u8 bits = fls(hashsize - 1);
+ if (jhash_size(bits) != hashsize)
+ /* Round up to the first 2^n value */
+ bits = fls(hashsize);
+
+ return bits;
+}
+
+static int
+hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
+{
+ if (n->pos >= n->size) {
+ void *tmp;
+
+ if (n->size >= ahash_max)
+ /* Trigger rehashing */
+ return -EAGAIN;
+
+ tmp = kzalloc((n->size + AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ if (n->size) {
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ }
+ n->value = tmp;
+ n->size += AHASH_INIT_SIZE;
+ }
+ return 0;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+#if IPSET_NET_COUNT > 1
+#define __CIDR(cidr, i) (cidr[i])
+#else
+#define __CIDR(cidr, i) (cidr)
+#endif
+
+/* cidr + 1 is stored in net_prefixes to support /0 */
+#define SCIDR(cidr, i) (__CIDR(cidr, i) + 1)
+
+#ifdef IP_SET_HASH_WITH_NETS_PACKED
+/* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */
+#define GCIDR(cidr, i) (__CIDR(cidr, i) + 1)
+#define NCIDR(cidr) (cidr)
+#else
+#define GCIDR(cidr, i) (__CIDR(cidr, i))
+#define NCIDR(cidr) (cidr - 1)
+#endif
+
+#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
+
+#ifdef IP_SET_HASH_WITH_NET0
+#define NLEN(family) (SET_HOST_MASK(family) + 1)
+#else
+#define NLEN(family) SET_HOST_MASK(family)
+#endif
+
+#else
+#define NLEN(family) 0
+#endif /* IP_SET_HASH_WITH_NETS */
+
+#endif /* _IP_SET_HASH_GEN_H */
+
+/* Family dependent templates */
+
+#undef ahash_data
+#undef mtype_data_equal
+#undef mtype_do_data_match
+#undef mtype_data_set_flags
+#undef mtype_data_reset_flags
+#undef mtype_data_netmask
+#undef mtype_data_list
+#undef mtype_data_next
+#undef mtype_elem
+
+#undef mtype_ahash_destroy
+#undef mtype_ext_cleanup
+#undef mtype_add_cidr
+#undef mtype_del_cidr
+#undef mtype_ahash_memsize
+#undef mtype_flush
+#undef mtype_destroy
+#undef mtype_gc_init
+#undef mtype_same_set
+#undef mtype_kadt
+#undef mtype_uadt
+#undef mtype
+
+#undef mtype_add
+#undef mtype_del
+#undef mtype_test_cidrs
+#undef mtype_test
+#undef mtype_expire
+#undef mtype_resize
+#undef mtype_head
+#undef mtype_list
+#undef mtype_gc
+#undef mtype_gc_init
+#undef mtype_variant
+#undef mtype_data_match
+
+#undef HKEY
+
+#define mtype_data_equal IPSET_TOKEN(MTYPE, _data_equal)
+#ifdef IP_SET_HASH_WITH_NETS
+#define mtype_do_data_match IPSET_TOKEN(MTYPE, _do_data_match)
+#else
+#define mtype_do_data_match(d) 1
+#endif
+#define mtype_data_set_flags IPSET_TOKEN(MTYPE, _data_set_flags)
+#define mtype_data_reset_elem IPSET_TOKEN(MTYPE, _data_reset_elem)
+#define mtype_data_reset_flags IPSET_TOKEN(MTYPE, _data_reset_flags)
+#define mtype_data_netmask IPSET_TOKEN(MTYPE, _data_netmask)
+#define mtype_data_list IPSET_TOKEN(MTYPE, _data_list)
+#define mtype_data_next IPSET_TOKEN(MTYPE, _data_next)
+#define mtype_elem IPSET_TOKEN(MTYPE, _elem)
+#define mtype_ahash_destroy IPSET_TOKEN(MTYPE, _ahash_destroy)
+#define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup)
+#define mtype_add_cidr IPSET_TOKEN(MTYPE, _add_cidr)
+#define mtype_del_cidr IPSET_TOKEN(MTYPE, _del_cidr)
+#define mtype_ahash_memsize IPSET_TOKEN(MTYPE, _ahash_memsize)
+#define mtype_flush IPSET_TOKEN(MTYPE, _flush)
+#define mtype_destroy IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_same_set IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_kadt IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt IPSET_TOKEN(MTYPE, _uadt)
+#define mtype MTYPE
+
+#define mtype_add IPSET_TOKEN(MTYPE, _add)
+#define mtype_del IPSET_TOKEN(MTYPE, _del)
+#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
+#define mtype_test IPSET_TOKEN(MTYPE, _test)
+#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
+#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_head IPSET_TOKEN(MTYPE, _head)
+#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
+#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
+
+#ifndef HKEY_DATALEN
+#define HKEY_DATALEN sizeof(struct mtype_elem)
+#endif
+
+#define HKEY(data, initval, htable_bits) \
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+ & jhash_mask(htable_bits))
+
+#ifndef htype
+#define htype HTYPE
+
+/* The generic hash structure */
+struct htype {
+ struct htable __rcu *table; /* the hash table */
+ u32 maxelem; /* max elements in the hash */
+ u32 elements; /* current element (vs timeout) */
+ u32 initval; /* random jhash init value */
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ u32 markmask; /* markmask value for mark mask to store */
+#endif
+ struct timer_list gc; /* garbage collection when timeout enabled */
+ struct mtype_elem next; /* temporary storage for uadd */
+#ifdef IP_SET_HASH_WITH_MULTI
+ u8 ahash_max; /* max elements in an array block */
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask; /* netmask value for subnets to store */
+#endif
+#ifdef IP_SET_HASH_WITH_RBTREE
+ struct rb_root rbtree;
+#endif
+#ifdef IP_SET_HASH_WITH_NETS
+ struct net_prefixes nets[0]; /* book-keeping of prefixes */
+#endif
+};
+#endif
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Network cidr size book keeping when the hash stores different
+ * sized networks */
+static void
+mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+ int i, j;
+
+ /* Add in increasing prefix order, so larger cidr first */
+ for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
+ if (j != -1)
+ continue;
+ else if (h->nets[i].cidr[n] < cidr)
+ j = i;
+ else if (h->nets[i].cidr[n] == cidr) {
+ h->nets[cidr - 1].nets[n]++;
+ return;
+ }
+ }
+ if (j != -1) {
+ for (; i > j; i--)
+ h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
+ }
+ h->nets[i].cidr[n] = cidr;
+ h->nets[cidr - 1].nets[n] = 1;
+}
+
+static void
+mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
+{
+ u8 i, j, net_end = nets_length - 1;
+
+ for (i = 0; i < nets_length; i++) {
+ if (h->nets[i].cidr[n] != cidr)
+ continue;
+ h->nets[cidr -1].nets[n]--;
+ if (h->nets[cidr -1].nets[n] > 0)
+ return;
+ for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
+ h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+ h->nets[j].cidr[n] = 0;
+ return;
+ }
+}
+#endif
+
+/* Calculate the actual memory size of the set data */
+static size_t
+mtype_ahash_memsize(const struct htype *h, const struct htable *t,
+ u8 nets_length, size_t dsize)
+{
+ u32 i;
+ size_t memsize = sizeof(*h)
+ + sizeof(*t)
+#ifdef IP_SET_HASH_WITH_NETS
+ + sizeof(struct net_prefixes) * nets_length
+#endif
+ + jhash_size(t->htable_bits) * sizeof(struct hbucket);
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++)
+ memsize += t->bucket[i].size * dsize;
+
+ return memsize;
+}
+
+/* Get the ith element from the array block n */
+#define ahash_data(n, i, dsize) \
+ ((struct mtype_elem *)((n)->value + ((i) * (dsize))))
+
+static void
+mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
+{
+ int i;
+
+ for (i = 0; i < n->pos; i++)
+ ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
+}
+
+/* Flush a hash type of set: destroy all elements */
+static void
+mtype_flush(struct ip_set *set)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ struct hbucket *n;
+ u32 i;
+
+ t = rcu_dereference_bh_nfnl(h->table);
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size) {
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ n->size = n->pos = 0;
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+ }
+#ifdef IP_SET_HASH_WITH_NETS
+ memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
+#endif
+ h->elements = 0;
+}
+
+/* Destroy the hashtable part of the set */
+static void
+mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
+{
+ struct hbucket *n;
+ u32 i;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size) {
+ if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+ }
+
+ ip_set_free(t);
+}
+
+/* Destroy a hash type of set */
+static void
+mtype_destroy(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (set->extensions & IPSET_EXT_TIMEOUT)
+ del_timer_sync(&h->gc);
+
+ mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table), true);
+#ifdef IP_SET_HASH_WITH_RBTREE
+ rbtree_destroy(&h->rbtree);
+#endif
+ kfree(h);
+
+ set->data = NULL;
+}
+
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct htype *h = set->data;
+
+ init_timer(&h->gc);
+ h->gc.data = (unsigned long) set;
+ h->gc.function = gc;
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&h->gc);
+ pr_debug("gc initialized, run in every %u\n",
+ IPSET_GC_PERIOD(set->timeout));
+}
+
+static bool
+mtype_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct htype *x = a->data;
+ const struct htype *y = b->data;
+
+ /* Resizing changes htable_bits, so we ignore it */
+ return x->maxelem == y->maxelem &&
+ a->timeout == b->timeout &&
+#ifdef IP_SET_HASH_WITH_NETMASK
+ x->netmask == y->netmask &&
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ x->markmask == y->markmask &&
+#endif
+ a->extensions == b->extensions;
+}
+
+/* Delete expired elements from the hashtable */
+static void
+mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
+{
+ struct htable *t;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ u32 i;
+ int j;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 k;
+#endif
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, dsize);
+ if (ip_set_timeout_expired(ext_timeout(data, set))) {
+ pr_debug("expired %u/%u\n", i, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (k = 0; k < IPSET_NET_COUNT; k++)
+ mtype_del_cidr(h, SCIDR(data->cidr, k),
+ nets_length, k);
+#endif
+ ip_set_ext_destroy(set, data);
+ if (j != n->pos - 1)
+ /* Not last one */
+ memcpy(data,
+ ahash_data(n, n->pos - 1, dsize),
+ dsize);
+ n->pos--;
+ h->elements--;
+ }
+ }
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ /* Still try to delete expired elements */
+ continue;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ }
+ rcu_read_unlock_bh();
+}
+
+static void
+mtype_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct htype *h = set->data;
+
+ pr_debug("called\n");
+ write_lock_bh(&set->lock);
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+ write_unlock_bh(&set->lock);
+
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&h->gc);
+}
+
+/* Resize a hash: create a new hash table with doubling the hashsize
+ * and inserting the elements to it. Repeat until we succeed or
+ * fail due to memory pressures. */
+static int
+mtype_resize(struct ip_set *set, bool retried)
+{
+ struct htype *h = set->data;
+ struct htable *t, *orig = rcu_dereference_bh_nfnl(h->table);
+ u8 htable_bits = orig->htable_bits;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 flags;
+#endif
+ struct mtype_elem *data;
+ struct mtype_elem *d;
+ struct hbucket *n, *m;
+ u32 i, j;
+ int ret;
+
+ /* Try to cleanup once */
+ if (SET_WITH_TIMEOUT(set) && !retried) {
+ i = h->elements;
+ write_lock_bh(&set->lock);
+ mtype_expire(set, set->data, NLEN(set->family), set->dsize);
+ write_unlock_bh(&set->lock);
+ if (h->elements < i)
+ return 0;
+ }
+
+retry:
+ ret = 0;
+ htable_bits++;
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
+ if (!htable_bits) {
+ /* In case we have plenty of memory :-) */
+ pr_warn("Cannot increase the hashsize of set %s further\n",
+ set->name);
+ return -IPSET_ERR_HASH_FULL;
+ }
+ t = ip_set_alloc(sizeof(*t)
+ + jhash_size(htable_bits) * sizeof(struct hbucket));
+ if (!t)
+ return -ENOMEM;
+ t->htable_bits = htable_bits;
+
+ read_lock_bh(&set->lock);
+ for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+ n = hbucket(orig, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ flags = 0;
+ mtype_data_reset_flags(data, &flags);
+#endif
+ m = hbucket(t, HKEY(data, h->initval, htable_bits));
+ ret = hbucket_elem_add(m, AHASH_MAX(h), set->dsize);
+ if (ret < 0) {
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(data, &flags);
+#endif
+ read_unlock_bh(&set->lock);
+ mtype_ahash_destroy(set, t, false);
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+ }
+ d = ahash_data(m, m->pos++, set->dsize);
+ memcpy(d, data, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_reset_flags(d, &flags);
+#endif
+ }
+ }
+
+ rcu_assign_pointer(h->table, t);
+ read_unlock_bh(&set->lock);
+
+ /* Give time to other readers of the set */
+ synchronize_rcu_bh();
+
+ pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+ orig->htable_bits, orig, t->htable_bits, t);
+ mtype_ahash_destroy(set, orig, false);
+
+ return 0;
+}
+
+/* Add an element to a hash and update the internal counters when succeeded,
+ * otherwise report the proper error code. */
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ const struct mtype_elem *d = value;
+ struct mtype_elem *data;
+ struct hbucket *n;
+ int i, ret = 0;
+ int j = AHASH_MAX(h) + 1;
+ bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 key, multi = 0;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (mtype_data_equal(data, d, &multi)) {
+ if (flag_exist ||
+ (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))) {
+ /* Just the extensions could be overwritten */
+ j = i;
+ goto reuse_slot;
+ } else {
+ ret = -IPSET_ERR_EXIST;
+ goto out;
+ }
+ }
+ /* Reuse first timed out entry */
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)) &&
+ j != AHASH_MAX(h) + 1)
+ j = i;
+ }
+ if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set) && n->pos) {
+ /* Choosing the first entry in the array to replace */
+ j = 0;
+ goto reuse_slot;
+ }
+ if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
+ /* FIXME: when set is full, we slow down here */
+ mtype_expire(set, h, NLEN(set->family), set->dsize);
+
+ if (h->elements >= h->maxelem) {
+ if (net_ratelimit())
+ pr_warn("Set %s is full, maxelem %u reached\n",
+ set->name, h->maxelem);
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
+ }
+
+reuse_slot:
+ if (j != AHASH_MAX(h) + 1) {
+ /* Fill out reused slot */
+ data = ahash_data(n, j, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (i = 0; i < IPSET_NET_COUNT; i++) {
+ mtype_del_cidr(h, SCIDR(data->cidr, i),
+ NLEN(set->family), i);
+ mtype_add_cidr(h, SCIDR(d->cidr, i),
+ NLEN(set->family), i);
+ }
+#endif
+ ip_set_ext_destroy(set, data);
+ } else {
+ /* Use/create a new slot */
+ TUNE_AHASH_MAX(h, multi);
+ ret = hbucket_elem_add(n, AHASH_MAX(h), set->dsize);
+ if (ret != 0) {
+ if (ret == -EAGAIN)
+ mtype_data_next(&h->next, d);
+ goto out;
+ }
+ data = ahash_data(n, n->pos++, set->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family),
+ i);
+#endif
+ h->elements++;
+ }
+ memcpy(data, d, sizeof(struct mtype_elem));
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_data_set_flags(data, flags);
+#endif
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(data, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(data, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(data, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
+
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+/* Delete an element from the hash: swap it with the last element
+ * and free up space if possible.
+ */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ const struct mtype_elem *d = value;
+ struct mtype_elem *data;
+ struct hbucket *n;
+ int i, ret = -IPSET_ERR_EXIST;
+#ifdef IP_SET_HASH_WITH_NETS
+ u8 j;
+#endif
+ u32 key, multi = 0;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))
+ goto out;
+ if (i != n->pos - 1)
+ /* Not last one */
+ memcpy(data, ahash_data(n, n->pos - 1, set->dsize),
+ set->dsize);
+
+ n->pos--;
+ h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+ for (j = 0; j < IPSET_NET_COUNT; j++)
+ mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family),
+ j);
+#endif
+ ip_set_ext_destroy(set, data);
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * set->dsize,
+ GFP_ATOMIC);
+ if (!tmp) {
+ ret = 0;
+ goto out;
+ }
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * set->dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ ret = 0;
+ goto out;
+ }
+
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+static inline int
+mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, struct ip_set *set, u32 flags)
+{
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(data, set),
+ ext, mext, flags);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_get_skbinfo(ext_skbinfo(data, set),
+ ext, mext, flags);
+ return mtype_do_data_match(data);
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Special test function which takes into account the different network
+ * sizes added to the set */
+static int
+mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = rcu_dereference_bh(h->table);
+ struct hbucket *n;
+ struct mtype_elem *data;
+#if IPSET_NET_COUNT == 2
+ struct mtype_elem orig = *d;
+ int i, j = 0, k;
+#else
+ int i, j = 0;
+#endif
+ u32 key, multi = 0;
+ u8 nets_length = NLEN(set->family);
+
+ pr_debug("test by nets\n");
+ for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
+#if IPSET_NET_COUNT == 2
+ mtype_data_reset_elem(d, &orig);
+ mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false);
+ for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
+ k++) {
+ mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true);
+#else
+ mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]));
+#endif
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set)) {
+ if (!ip_set_timeout_expired(
+ ext_timeout(data, set)))
+ return mtype_data_match(data, ext,
+ mext, set,
+ flags);
+#ifdef IP_SET_HASH_WITH_MULTI
+ multi = 0;
+#endif
+ } else
+ return mtype_data_match(data, ext,
+ mext, set, flags);
+ }
+#if IPSET_NET_COUNT == 2
+ }
+#endif
+ }
+ return 0;
+}
+#endif
+
+/* Test whether the element is added to the set */
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t;
+ struct mtype_elem *d = value;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ int i, ret = 0;
+ u32 key, multi = 0;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+#ifdef IP_SET_HASH_WITH_NETS
+ /* If we test an IP address and not a network address,
+ * try all possible network sizes */
+ for (i = 0; i < IPSET_NET_COUNT; i++)
+ if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family))
+ break;
+ if (i == IPSET_NET_COUNT) {
+ ret = mtype_test_cidrs(set, d, ext, mext, flags);
+ goto out;
+ }
+#endif
+
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, set->dsize);
+ if (mtype_data_equal(data, d, &multi) &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, set)))) {
+ ret = mtype_data_match(data, ext, mext, set, flags);
+ goto out;
+ }
+ }
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+/* Reply a HEADER request: fill out the header part of the set */
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t;
+ struct nlattr *nested;
+ size_t memsize;
+
+ t = rcu_dereference_bh_nfnl(h->table);
+ memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
+ htonl(jhash_size(t->htable_bits))) ||
+ nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
+ goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (h->netmask != HOST_MASK &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ goto nla_put_failure;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
+ goto nla_put_failure;
+#endif
+ if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Reply a LIST/SAVE request: dump the elements of the specified set */
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct mtype_elem *e;
+ u32 first = cb->args[IPSET_CB_ARG0];
+ /* We assume that one hash bucket fills into one page */
+ void *incomplete;
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ pr_debug("list hash set %s\n", set->name);
+ for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
+ cb->args[IPSET_CB_ARG0]++) {
+ incomplete = skb_tail_pointer(skb);
+ n = hbucket(t, cb->args[IPSET_CB_ARG0]);
+ pr_debug("cb->arg bucket: %lu, t %p n %p\n",
+ cb->args[IPSET_CB_ARG0], t, n);
+ for (i = 0; i < n->pos; i++) {
+ e = ahash_data(n, i, set->dsize);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+ cb->args[IPSET_CB_ARG0], n, i, e);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[IPSET_CB_ARG0] == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_data_list(skb, e))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete);
+ if (unlikely(first == cb->args[IPSET_CB_ARG0])) {
+ pr_warn("Can't list set %s: one bucket does not fit into a message. Please report it!\n",
+ set->name);
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+
+static int
+IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt);
+
+static int
+IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+
+static const struct ip_set_type_variant mtype_variant = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .resize = mtype_resize,
+ .same_set = mtype_same_set,
+};
+
+#ifdef IP_SET_EMIT_CREATE
+static int
+IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
+ struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ u32 markmask;
+#endif
+ u8 hbits;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask;
+#endif
+ size_t hsize;
+ struct HTYPE *h;
+ struct htable *t;
+
+#ifndef IP_SET_PROTO_UNDEF
+ if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
+ return -IPSET_ERR_INVALID_FAMILY;
+#endif
+
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ markmask = 0xffffffff;
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+#endif
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MARKMASK) ||
+#endif
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE;
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
+ (set->family == NFPROTO_IPV6 && netmask > 128) ||
+ netmask == 0)
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ if (tb[IPSET_ATTR_MARKMASK]) {
+ markmask = ntohl(nla_get_u32(tb[IPSET_ATTR_MARKMASK]));
+
+ if (markmask == 0)
+ return -IPSET_ERR_INVALID_MARKMASK;
+ }
+#endif
+
+ hsize = sizeof(*h);
+#ifdef IP_SET_HASH_WITH_NETS
+ hsize += sizeof(struct net_prefixes) * NLEN(set->family);
+#endif
+ h = kzalloc(hsize, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ h->netmask = netmask;
+#endif
+#ifdef IP_SET_HASH_WITH_MARKMASK
+ h->markmask = markmask;
+#endif
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ set->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ hsize = htable_size(hbits);
+ if (hsize == 0) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ t = ip_set_alloc(hsize);
+ if (!t) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ t->htable_bits = hbits;
+ rcu_assign_pointer(h->table, t);
+
+ set->data = h;
+#ifndef IP_SET_PROTO_UNDEF
+ if (set->family == NFPROTO_IPV4) {
+#endif
+ set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+#ifndef IP_SET_PROTO_UNDEF
+ } else {
+ set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+ }
+#endif
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+#ifndef IP_SET_PROTO_UNDEF
+ if (set->family == NFPROTO_IPV4)
+#endif
+ IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+ IPSET_TOKEN(HTYPE, 4_gc));
+#ifndef IP_SET_PROTO_UNDEF
+ else
+ IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+ IPSET_TOKEN(HTYPE, 6_gc));
+#endif
+ }
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(t->htable_bits),
+ t->htable_bits, h->maxelem, set->data, t);
+
+ return 0;
+}
+#endif /* IP_SET_EMIT_CREATE */
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 000000000..76959d79e
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,325 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support */
+/* 2 Comments support */
+/* 3 Forceadd support */
+#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define HTYPE hash_ip
+#define IP_SET_HASH_WITH_NETMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ip4_elem {
+ /* Zero valued IP addresses cannot be stored */
+ __be32 ip;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *e1,
+ const struct hash_ip4_elem *e2,
+ u32 *multi)
+{
+ return e1->ip == e2->ip;
+}
+
+static inline bool
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
+{
+ next->ip = e->ip;
+}
+
+#define MTYPE hash_ip4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip4_elem e = { 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ __be32 ip;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
+ ip &= ip_set_netmask(h->netmask);
+ if (ip == 0)
+ return -EINVAL;
+
+ e.ip = ip;
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip4_elem e = { 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, hosts;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ip &= ip_set_hostmask(h->netmask);
+
+ if (adt == IPSET_TEST) {
+ e.ip = htonl(ip);
+ if (e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ return adtfn(set, &e, &ext, &ext, flags);
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip += hosts) {
+ e.ip = htonl(ip);
+ if (e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+/* Member elements */
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+ const struct hash_ip6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
+}
+
+static inline void
+hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+ ip6_netmask(ip, prefix);
+}
+
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ip6
+#define PF 6
+#define HOST_MASK 128
+
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip6_elem e = { { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip6_elem e = { { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -IPSET_ERR_HASH_ELEM;
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_ip_type __read_mostly = {
+ .name = "hash:ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ip_init(void)
+{
+ return ip_set_type_register(&hash_ip_type);
+}
+
+static void __exit
+hash_ip_fini(void)
+{
+ ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
new file mode 100644
index 000000000..7abf9788c
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -0,0 +1,331 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,mark type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Forceadd support */
+#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
+IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,mark");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipmark
+#define IP_SET_HASH_WITH_MARKMASK
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipmark4_elem {
+ __be32 ip;
+ __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
+ const struct hash_ipmark4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark4_data_list(struct sk_buff *skb,
+ const struct hash_ipmark4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
+ const struct hash_ipmark4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_ipmark4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipmark4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.mark = skb->mark;
+ e.mark &= h->markmask;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark &= h->markmask;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ e.ip = htonl(ip);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipmark6_elem {
+ union nf_inet_addr ip;
+ __u32 mark;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
+ const struct hash_ipmark6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->mark == ip2->mark;
+}
+
+static bool
+hash_ipmark6_data_list(struct sk_buff *skb,
+ const struct hash_ipmark6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
+ const struct hash_ipmark6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipmark6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipmark6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+
+static int
+hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.mark = skb->mark;
+ e.mark &= h->markmask;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipmark *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipmark6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_MARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ e.mark = ntohl(nla_get_u32(tb[IPSET_ATTR_MARK]));
+ e.mark &= h->markmask;
+
+ if (adt == IPSET_TEST) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static struct ip_set_type hash_ipmark_type __read_mostly = {
+ .name = "hash:ip,mark",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_MARK,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipmark_create,
+ .create_policy = {
+ [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_MARK] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipmark_init(void)
+{
+ return ip_set_type_register(&hash_ipmark_type);
+}
+
+static void __exit
+hash_ipmark_fini(void)
+{
+ ip_set_type_unregister(&hash_ipmark_type);
+}
+
+module_init(hash_ipmark_init);
+module_exit(hash_ipmark_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 000000000..dcbcceb9a
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,400 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+/* 4 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipport
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+ const struct hash_ipport4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipport4_data_list(struct sk_buff *skb,
+ const struct hash_ipport4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipport4_data_next(struct hash_ipport4_elem *next,
+ const struct hash_ipport4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+#define MTYPE hash_ipport4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem e = { .ip = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport4_elem e = { .ip = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0, p = 0, port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+ const struct hash_ipport6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipport6_data_list(struct sk_buff *skb,
+ const struct hash_ipport6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipport6_data_next(struct hash_ipport4_elem *next,
+ const struct hash_ipport6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_ipport6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = {
+ .name = "hash:ip,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipport_init(void)
+{
+ return ip_set_type_register(&hash_ipport_type);
+}
+
+static void __exit
+hash_ipport_fini(void)
+{
+ ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 000000000..7ef93fc88
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,412 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Counters support added */
+/* 3 Comments support added */
+/* 4 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipportip
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipportip4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+static inline bool
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+ const struct hash_ipportip4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipportip4_data_list(struct sk_buff *skb,
+ const struct hash_ipportip4_elem *data)
+{
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
+ const struct hash_ipportip4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+/* Common functions */
+#define MTYPE hash_ipportip4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem e = { .ip = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip4_elem e = { .ip = 0 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip, ip_to = 0, p = 0, port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+ tb[IPSET_ATTR_PORT_TO])) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip = ntohl(e.ip);
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipportip6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+ const struct hash_ipportip6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static bool
+hash_ipportip6_data_list(struct sk_buff *skb,
+ const struct hash_ipportip6_elem *data)
+{
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportip6_data_next(struct hash_ipportip4_elem *next,
+ const struct hash_ipportip6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportip6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportip *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipportip_type __read_mostly = {
+ .name = "hash:ip,port,ip",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportip_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportip_init(void)
+{
+ return ip_set_type_register(&hash_ipportip_type);
+}
+
+static void __exit
+hash_ipportip_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 000000000..b6012ad92
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,571 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+/* 6 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_ipportnet
+
+/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
+ * However this way we have to store internally cidr - 1,
+ * dancing back and forth.
+ */
+#define IP_SET_HASH_WITH_NETS_PACKED
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_ipportnet4_elem {
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+ const struct hash_ipportnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->ip2 == ip2->ip2 &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+{
+ elem->ip2 &= ip_set_netmask(cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_ipportnet4_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+ next->ip2 = d->ip2;
+}
+
+#define MTYPE hash_ipportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ e.ip2 &= ip_set_netmask(e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, p = 0, port, port_to;
+ u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports ||
+ tb[IPSET_ATTR_IP2_TO])) {
+ e.ip = htonl(ip);
+ e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ } else if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!cidr || cidr > 32)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip_set_mask_from_to(ip, ip_to, cidr);
+ }
+
+ port_to = port = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_from > ip2_to)
+ swap(ip2_from, ip2_to);
+ if (ip2_from + UINT_MAX == ip2_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ for (; !before(ip_to, ip); ip++) {
+ e.ip = htonl(ip);
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ip2 = retried &&
+ ip == ntohl(h->next.ip) &&
+ p == ntohs(h->next.port)
+ ? ntohl(h->next.ip2) : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip2 = htonl(ip2);
+ ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+ &cidr);
+ e.cidr = cidr - 1;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = ip2_last + 1;
+ }
+ }
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_ipportnet6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+ const struct hash_ipportnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) &&
+ ip1->cidr == ip2->cidr &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip2, cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_ipportnet6_data_list(struct sk_buff *skb,
+ const struct hash_ipportnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_ipportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ ip6_netmask(&e.ip2, e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_ipportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE) ||
+ tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_CIDR]))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ ip6_netmask(&e.ip2, e.cidr + 1);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_ipportnet_type __read_mostly = {
+ .name = "hash:ip,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_ipportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_ipportnet_init(void)
+{
+ return ip_set_type_register(&hash_ipportnet_type);
+}
+
+static void __exit
+hash_ipportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_ipportnet_type);
+}
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
new file mode 100644
index 000000000..65690b52a
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -0,0 +1,173 @@
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:mac type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+#define IPSET_TYPE_REV_MAX 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:mac");
+
+/* Type specific function prefix */
+#define HTYPE hash_mac
+
+/* Member elements */
+struct hash_mac4_elem {
+ /* Zero valued IP addresses cannot be stored */
+ union {
+ unsigned char ether[ETH_ALEN];
+ __be32 foo[2];
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_mac4_data_equal(const struct hash_mac4_elem *e1,
+ const struct hash_mac4_elem *e2,
+ u32 *multi)
+{
+ return ether_addr_equal(e1->ether, e2->ether);
+}
+
+static inline bool
+hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e)
+{
+ return nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether);
+}
+
+static inline void
+hash_mac4_data_next(struct hash_mac4_elem *next,
+ const struct hash_mac4_elem *e)
+{
+}
+
+#define MTYPE hash_mac4
+#define PF 4
+#define HOST_MASK 32
+#define IP_SET_EMIT_CREATE
+#define IP_SET_PROTO_UNDEF
+#include "ip_set_hash_gen.h"
+
+/* Zero valued element is not supported */
+static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
+
+static int
+hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ /* MAC can be src only */
+ if (!(opt->flags & IPSET_DIM_ONE_SRC))
+ return 0;
+
+ if (skb_mac_header(skb) < skb->head ||
+ (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+ return -EINVAL;
+
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+ if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
+ return -EINVAL;
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_ETHER] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
+ return -IPSET_ERR_HASH_ELEM;
+
+ return adtfn(set, &e, &ext, &ext, flags);
+}
+
+static struct ip_set_type hash_mac_type __read_mostly = {
+ .name = "hash:mac",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_MAC,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_mac_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
+ .len = ETH_ALEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_mac_init(void)
+{
+ return ip_set_type_register(&hash_mac_type);
+}
+
+static void __exit
+hash_mac_fini(void)
+{
+ ip_set_type_unregister(&hash_mac_type);
+}
+
+module_init(hash_mac_init);
+module_exit(hash_mac_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 000000000..6b3ac10ac
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,407 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Range as input support for IPv4 added */
+/* 2 nomatch flag support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+/* 5 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define HTYPE hash_net
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_net4_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
+
+/* Common functions */
+
+static inline bool
+hash_net4_data_equal(const struct hash_net4_elem *ip1,
+ const struct hash_net4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_net4_do_data_match(const struct hash_net4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_net4_data_next(struct hash_net4_elem *next,
+ const struct hash_net4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_net4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net4_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret:
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ }
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_net6_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
+
+/* Common functions */
+
+static inline bool
+hash_net6_data_equal(const struct hash_net6_elem *ip1,
+ const struct hash_net6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_net6_do_data_match(const struct hash_net6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_net6_data_next(struct hash_net4_elem *next,
+ const struct hash_net6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_net6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_net *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_net6_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (!e.cidr || e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip, e.cidr);
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_net_type __read_mostly = {
+ .name = "hash:net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_net_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_net_init(void)
+{
+ return ip_set_type_register(&hash_net_type);
+}
+
+static void __exit
+hash_net_fini(void)
+{
+ ip_set_type_unregister(&hash_net_type);
+}
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
new file mode 100644
index 000000000..380ef5148
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -0,0 +1,637 @@
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,iface type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 nomatch flag support added */
+/* 2 /0 support added */
+/* 3 Counters support added */
+/* 4 Comments support added */
+/* 5 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,iface");
+
+/* Interface name rbtree */
+
+struct iface_node {
+ struct rb_node node;
+ char iface[IFNAMSIZ];
+};
+
+#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface)
+
+static void
+rbtree_destroy(struct rb_root *root)
+{
+ struct iface_node *node, *next;
+
+ rbtree_postorder_for_each_entry_safe(node, next, root, node)
+ kfree(node);
+
+ *root = RB_ROOT;
+}
+
+static int
+iface_test(struct rb_root *root, const char **iface)
+{
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ const char *d = iface_data(n);
+ int res = strcmp(*iface, d);
+
+ if (res < 0)
+ n = n->rb_left;
+ else if (res > 0)
+ n = n->rb_right;
+ else {
+ *iface = d;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+iface_add(struct rb_root *root, const char **iface)
+{
+ struct rb_node **n = &(root->rb_node), *p = NULL;
+ struct iface_node *d;
+
+ while (*n) {
+ char *ifname = iface_data(*n);
+ int res = strcmp(*iface, ifname);
+
+ p = *n;
+ if (res < 0)
+ n = &((*n)->rb_left);
+ else if (res > 0)
+ n = &((*n)->rb_right);
+ else {
+ *iface = ifname;
+ return 0;
+ }
+ }
+
+ d = kzalloc(sizeof(*d), GFP_ATOMIC);
+ if (!d)
+ return -ENOMEM;
+ strcpy(d->iface, *iface);
+
+ rb_link_node(&d->node, p, n);
+ rb_insert_color(&d->node, root);
+
+ *iface = d->iface;
+ return 0;
+}
+
+/* Type specific function prefix */
+#define HTYPE hash_netiface
+#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_RBTREE
+#define IP_SET_HASH_WITH_MULTI
+#define IP_SET_HASH_WITH_NET0
+
+#define STREQ(a, b) (strcmp(a, b) == 0)
+
+/* IPv4 variant */
+
+struct hash_netiface4_elem_hashed {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+};
+
+/* Member elements */
+struct hash_netiface4_elem {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
+ const struct hash_netiface4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->cidr == ip2->cidr &&
+ (++*multi) &&
+ ip1->physdev == ip2->physdev &&
+ ip1->iface == ip2->iface;
+}
+
+static inline int
+hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netiface4_data_list(struct sk_buff *skb,
+ const struct hash_netiface4_elem *data)
+{
+ u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+ if (data->nomatch)
+ flags |= IPSET_FLAG_NOMATCH;
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netiface4_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface4_elem *d)
+{
+ next->ip = d->ip;
+}
+
+#define MTYPE hash_netiface4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
+#include "ip_set_hash_gen.h"
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+static const char *get_physindev_name(const struct sk_buff *skb)
+{
+ struct net_device *dev = nf_bridge_get_physindev(skb);
+
+ return dev ? dev->name : NULL;
+}
+
+static const char *get_phyoutdev_name(const struct sk_buff *skb)
+{
+ struct net_device *dev = nf_bridge_get_physoutdev(skb);
+
+ return dev ? dev->name : NULL;
+}
+#endif
+
+static int
+hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .elem = 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret;
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
+
+#define IFACE(dir) (par->dir ? par->dir->name : NULL)
+#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC)
+
+ if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ e.iface = SRCDIR ? get_physindev_name(skb) :
+ get_phyoutdev_name(skb);
+
+ if (!e.iface)
+ return -EINVAL;
+ e.physdev = 1;
+#else
+ e.iface = NULL;
+#endif
+ } else
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+ if (!e.iface)
+ return -EINVAL;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ char iface[IFNAMSIZ];
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_IFACE] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ }
+
+ strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_PHYSDEV)
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+ if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netiface6_elem_hashed {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+};
+
+struct hash_netiface6_elem {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
+ const struct hash_netiface6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->cidr == ip2->cidr &&
+ (++*multi) &&
+ ip1->physdev == ip2->physdev &&
+ ip1->iface == ip2->iface;
+}
+
+static inline int
+hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr;
+}
+
+static bool
+hash_netiface6_data_list(struct sk_buff *skb,
+ const struct hash_netiface6_elem *data)
+{
+ u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+
+ if (data->nomatch)
+ flags |= IPSET_FLAG_NOMATCH;
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
+ nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netiface6_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+#undef HKEY_DATALEN
+
+#define MTYPE hash_netiface6
+#define PF 6
+#define HOST_MASK 128
+#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+ .elem = 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ int ret;
+
+ if (e.cidr == 0)
+ return -EINVAL;
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
+
+ if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ e.iface = SRCDIR ? get_physindev_name(skb) :
+ get_phyoutdev_name(skb);
+ if (!e.iface)
+ return -EINVAL;
+
+ e.physdev = 1;
+#else
+ e.iface = NULL;
+#endif
+ } else
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
+
+ if (!e.iface)
+ return -EINVAL;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct hash_netiface *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ char iface[IFNAMSIZ];
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !tb[IPSET_ATTR_IFACE] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ ip6_netmask(&e.ip, e.cidr);
+
+ strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
+ if (adt == IPSET_ADD) {
+ if (!ret) {
+ ret = iface_add(&h->rbtree, &e.iface);
+ if (ret)
+ return ret;
+ }
+ } else if (!ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_PHYSDEV)
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netiface_type __read_mostly = {
+ .name = "hash:net,iface",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netiface_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netiface_init(void)
+{
+ return ip_set_type_register(&hash_netiface_type);
+}
+
+static void __exit
+hash_netiface_fini(void)
+{
+ ip_set_type_unregister(&hash_netiface_type);
+}
+
+module_init(hash_netiface_init);
+module_exit(hash_netiface_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
new file mode 100644
index 000000000..ea8772afb
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -0,0 +1,494 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netnet
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variants */
+
+/* Member elements */
+struct hash_netnet4_elem {
+ union {
+ __be32 ip[2];
+ __be64 ipcmp;
+ };
+ u8 nomatch;
+ u8 padding;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
+ const struct hash_netnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ipcmp == ip2->ipcmp &&
+ ip1->ccmp == ip2->ccmp;
+}
+
+static inline int
+hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
+ struct hash_netnet4_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
+{
+ if (inner) {
+ elem->ip[1] &= ip_set_netmask(cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ elem->ip[0] &= ip_set_netmask(cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netnet4_data_list(struct sk_buff *skb,
+ const struct hash_netnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_netnet4_data_next(struct hash_netnet4_elem *next,
+ const struct hash_netnet4_elem *d)
+{
+ next->ipcmp = d->ipcmp;
+}
+
+#define MTYPE hash_netnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
+ e.ip[0] &= ip_set_netmask(e.cidr[0]);
+ e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, last;
+ u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
+ u8 cidr, cidr2;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[0] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr2 = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr2 || cidr2 > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[1] = cidr2;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
+ tb[IPSET_ATTR_IP2_TO])) {
+ e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (unlikely(ip + UINT_MAX == ip_to))
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_to < ip2_from)
+ swap(ip2_from, ip2_to);
+ if (unlikely(ip2_from + UINT_MAX == ip2_to))
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+
+ if (retried)
+ ip = ntohl(h->next.ip[0]);
+
+ while (!after(ip, ip_to)) {
+ e.ip[0] = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr[0] = cidr;
+ ip2 = (retried &&
+ ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
+ : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip[1] = htonl(ip2);
+ last2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr2);
+ e.cidr[1] = cidr2;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = last2 + 1;
+ }
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variants */
+
+struct hash_netnet6_elem {
+ union nf_inet_addr ip[2];
+ u8 nomatch;
+ u8 padding;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+};
+
+/* Common functions */
+
+static inline bool
+hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
+ const struct hash_netnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
+ ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
+ ip1->ccmp == ip2->ccmp;
+}
+
+static inline int
+hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
+}
+
+static inline void
+hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
+ struct hash_netnet6_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
+{
+ if (inner) {
+ ip6_netmask(&elem->ip[1], cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ ip6_netmask(&elem->ip[0], cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netnet6_data_list(struct sk_buff *skb,
+ const struct hash_netnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return false;
+
+nla_put_failure:
+ return true;
+}
+
+static inline void
+hash_netnet6_data_next(struct hash_netnet4_elem *next,
+ const struct hash_netnet6_elem *d)
+{
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
+ ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (tb[IPSET_ATTR_CIDR2])
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+ e.cidr[1] > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static struct ip_set_type hash_netnet_type __read_mostly = {
+ .name = "hash:net,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netnet_init(void)
+{
+ return ip_set_type_register(&hash_netnet_type);
+}
+
+static void __exit
+hash_netnet_fini(void)
+{
+ ip_set_type_unregister(&hash_netnet_type);
+}
+
+module_init(hash_netnet_init);
+module_exit(hash_netnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 000000000..c0ddb58d1
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,519 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 SCTP and UDPLITE support added */
+/* 2 Range as input support for IPv4 added */
+/* 3 nomatch flag support added */
+/* 4 Counters support added */
+/* 5 Comments support added */
+/* 6 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define HTYPE hash_netport
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
+ * However this way we have to store internally cidr - 1,
+ * dancing back and forth.
+ */
+#define IP_SET_HASH_WITH_NETS_PACKED
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+ const struct hash_netport4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ip == ip2->ip &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+ elem->ip &= ip_set_netmask(cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+ const struct hash_netport4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netport4_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport4_elem *d)
+{
+ next->ip = d->ip;
+ next->port = d->port;
+}
+
+#define MTYPE hash_netport4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to, p = 0, ip = 0, ip_to = 0, last;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = port_to = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port_to < port)
+ swap(port, port_to);
+ }
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip_to < ip)
+ swap(ip, ip_to);
+ if (ip + UINT_MAX == ip_to)
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+
+ if (retried)
+ ip = ntohl(h->next.ip);
+ while (!after(ip, ip_to)) {
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr = cidr - 1;
+ p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ ip = last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+ const struct hash_netport6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto &&
+ ip1->cidr == ip2->cidr;
+}
+
+static inline int
+hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+ ip6_netmask(&elem->ip, cidr);
+ elem->cidr = cidr - 1;
+}
+
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+ const struct hash_netport6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netport6_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netport6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem e = {
+ .cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+ };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ if (adt == IPSET_TEST)
+ e.cidr = HOST_MASK - 1;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr + 1);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netport *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ u8 cidr;
+ int ret;
+
+ if (unlikely(!tb[IPSET_ATTR_IP] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr = cidr - 1;
+ }
+ ip6_netmask(&e.ip, e.cidr + 1);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_netport_type __read_mostly = {
+ .name = "hash:net,port",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_TWO,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netport_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netport_init(void)
+{
+ return ip_set_type_register(&hash_netport_type);
+}
+
+static void __exit
+hash_netport_fini(void)
+{
+ ip_set_type_unregister(&hash_netport_type);
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
new file mode 100644
index 000000000..bfaa94c7b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -0,0 +1,601 @@
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 0 Comments support added */
+/* 1 Forceadd support added */
+#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
+IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_hash:net,port,net");
+
+/* Type specific function prefix */
+#define HTYPE hash_netportnet
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+#define IPSET_NET_COUNT 2
+
+/* IPv4 variant */
+
+/* Member elements */
+struct hash_netportnet4_elem {
+ union {
+ __be32 ip[2];
+ __be64 ipcmp;
+ };
+ __be16 port;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+ u16 padding;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
+ const struct hash_netportnet4_elem *ip2,
+ u32 *multi)
+{
+ return ip1->ipcmp == ip2->ipcmp &&
+ ip1->ccmp == ip2->ccmp &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
+ struct hash_netportnet4_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
+ u8 cidr, bool inner)
+{
+ if (inner) {
+ elem->ip[1] &= ip_set_netmask(cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ elem->ip[0] &= ip_set_netmask(cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netportnet4_data_list(struct sk_buff *skb,
+ const struct hash_netportnet4_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet4_elem *d)
+{
+ next->ipcmp = d->ipcmp;
+ next->port = d->port;
+}
+
+#define MTYPE hash_netportnet4
+#define PF 4
+#define HOST_MASK 32
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
+
+ if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]);
+ e.ip[0] &= ip_set_netmask(e.cidr[0]);
+ e.ip[1] &= ip_set_netmask(e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
+ u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
+ bool with_ports = false;
+ u8 cidr, cidr2;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[0] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_CIDR2]) {
+ cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+ if (!cidr || cidr > HOST_MASK)
+ return -IPSET_ERR_INVALID_CIDR;
+ e.cidr[1] = cidr;
+ }
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
+ if (adt == IPSET_TEST ||
+ !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) {
+ e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ ip_to = ip;
+ if (tb[IPSET_ATTR_IP_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+ if (ret)
+ return ret;
+ if (ip > ip_to)
+ swap(ip, ip_to);
+ if (unlikely(ip + UINT_MAX == ip_to))
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+
+ port_to = port = ntohs(e.port);
+ if (tb[IPSET_ATTR_PORT_TO]) {
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+ }
+
+ ip2_to = ip2_from;
+ if (tb[IPSET_ATTR_IP2_TO]) {
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
+ if (ret)
+ return ret;
+ if (ip2_from > ip2_to)
+ swap(ip2_from, ip2_to);
+ if (unlikely(ip2_from + UINT_MAX == ip2_to))
+ return -IPSET_ERR_HASH_RANGE;
+ } else
+ ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+
+ if (retried)
+ ip = ntohl(h->next.ip[0]);
+
+ while (!after(ip, ip_to)) {
+ e.ip[0] = htonl(ip);
+ ip_last = ip_set_range_to_cidr(ip, ip_to, &cidr);
+ e.cidr[0] = cidr;
+ p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
+ : port;
+ for (; p <= port_to; p++) {
+ e.port = htons(p);
+ ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
+ p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
+ : ip2_from;
+ while (!after(ip2, ip2_to)) {
+ e.ip[1] = htonl(ip2);
+ ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
+ &cidr2);
+ e.cidr[1] = cidr2;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ ip2 = ip2_last + 1;
+ }
+ }
+ ip = ip_last + 1;
+ }
+ return ret;
+}
+
+/* IPv6 variant */
+
+struct hash_netportnet6_elem {
+ union nf_inet_addr ip[2];
+ __be16 port;
+ union {
+ u8 cidr[2];
+ u16 ccmp;
+ };
+ u16 padding;
+ u8 nomatch:1;
+ u8 proto;
+};
+
+/* Common functions */
+
+static inline bool
+hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
+ const struct hash_netportnet6_elem *ip2,
+ u32 *multi)
+{
+ return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
+ ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
+ ip1->ccmp == ip2->ccmp &&
+ ip1->port == ip2->port &&
+ ip1->proto == ip2->proto;
+}
+
+static inline int
+hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem)
+{
+ return elem->nomatch ? -ENOTEMPTY : 1;
+}
+
+static inline void
+hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags)
+{
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
+}
+
+static inline void
+hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
+{
+ swap(*flags, elem->nomatch);
+}
+
+static inline void
+hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
+ struct hash_netportnet6_elem *orig)
+{
+ elem->ip[1] = orig->ip[1];
+}
+
+static inline void
+hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
+ u8 cidr, bool inner)
+{
+ if (inner) {
+ ip6_netmask(&elem->ip[1], cidr);
+ elem->cidr[1] = cidr;
+ } else {
+ ip6_netmask(&elem->ip[0], cidr);
+ elem->cidr[0] = cidr;
+ }
+}
+
+static bool
+hash_netportnet6_data_list(struct sk_buff *skb,
+ const struct hash_netportnet6_elem *data)
+{
+ u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
+
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
+ nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
+ nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
+ nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
+ (flags &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return 1;
+}
+
+static inline void
+hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
+ const struct hash_netportnet6_elem *d)
+{
+ next->port = d->port;
+}
+
+#undef MTYPE
+#undef PF
+#undef HOST_MASK
+
+#define MTYPE hash_netportnet6
+#define PF 6
+#define HOST_MASK 128
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
+
+static int
+hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+ e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+ if (adt == IPSET_TEST)
+ e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
+
+ if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
+ &e.port, &e.proto))
+ return -EINVAL;
+
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6);
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
+}
+
+static int
+hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ const struct hash_netportnet *h = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_netportnet6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ u32 port, port_to;
+ bool with_ports = false;
+ int ret;
+
+ e.cidr[0] = e.cidr[1] = HOST_MASK;
+ if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+ !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+ if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
+ return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]) ||
+ ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]) ||
+ ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+
+ if (tb[IPSET_ATTR_CIDR])
+ e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+ if (tb[IPSET_ATTR_CIDR2])
+ e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+ if (unlikely(!e.cidr[0] || e.cidr[0] > HOST_MASK || !e.cidr[1] ||
+ e.cidr[1] > HOST_MASK))
+ return -IPSET_ERR_INVALID_CIDR;
+
+ ip6_netmask(&e.ip[0], e.cidr[0]);
+ ip6_netmask(&e.ip[1], e.cidr[1]);
+
+ if (tb[IPSET_ATTR_PORT])
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ else
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_PROTO]) {
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
+
+ if (e.proto == 0)
+ return -IPSET_ERR_INVALID_PROTO;
+ } else
+ return -IPSET_ERR_MISSING_PROTO;
+
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
+ }
+
+ if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt, set) ? -ret :
+ ip_set_eexist(ret, flags) ? 0 : ret;
+ }
+
+ port = ntohs(e.port);
+ port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+ if (port > port_to)
+ swap(port, port_to);
+
+ if (retried)
+ port = ntohs(h->next.port);
+ for (; port <= port_to; port++) {
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+ if (ret && !ip_set_eexist(ret, flags))
+ return ret;
+ else
+ ret = 0;
+ }
+ return ret;
+}
+
+static struct ip_set_type hash_netportnet_type __read_mostly = {
+ .name = "hash:net,port,net",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 |
+ IPSET_TYPE_NOMATCH,
+ .dimension = IPSET_DIM_THREE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = hash_netportnet_create,
+ .create_policy = {
+ [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
+ [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_IP] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2] = { .type = NLA_NESTED },
+ [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED },
+ [IPSET_ATTR_PORT] = { .type = NLA_U16 },
+ [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
+ [IPSET_ATTR_CIDR] = { .type = NLA_U8 },
+ [IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
+ [IPSET_ATTR_PROTO] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+hash_netportnet_init(void)
+{
+ return ip_set_type_register(&hash_netportnet_type);
+}
+
+static void __exit
+hash_netportnet_fini(void)
+{
+ ip_set_type_unregister(&hash_netportnet_type);
+}
+
+module_init(hash_netportnet_init);
+module_exit(hash_netportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 000000000..f8f682806
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,702 @@
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+#define IPSET_TYPE_REV_MIN 0
+/* 1 Counters support added */
+/* 2 Comments support added */
+#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
+MODULE_ALIAS("ip_set_list:set");
+
+/* Member elements */
+struct set_elem {
+ ip_set_id_t id;
+};
+
+struct set_adt_elem {
+ ip_set_id_t id;
+ ip_set_id_t refid;
+ int before;
+};
+
+/* Type structure */
+struct list_set {
+ u32 size; /* size of set list array */
+ struct timer_list gc; /* garbage collection */
+ struct net *net; /* namespace */
+ struct set_elem members[0]; /* the set members */
+};
+
+#define list_set_elem(set, map, id) \
+ (struct set_elem *)((void *)(map)->members + (id) * (set)->dsize)
+
+static int
+list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i, cmdflags = opt->cmdflags;
+ int ret;
+
+ /* Don't lookup sub-counters at all */
+ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
+ if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
+ opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_test(e->id, skb, par, opt);
+ if (ret > 0) {
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(e, set),
+ ext, &opt->ext,
+ cmdflags);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_get_skbinfo(ext_skbinfo(e, set),
+ ext, &opt->ext,
+ cmdflags);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int
+list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_add(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ ret = ip_set_del(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+
+ switch (adt) {
+ case IPSET_TEST:
+ return list_set_ktest(set, skb, par, opt, &ext);
+ case IPSET_ADD:
+ return list_set_kadd(set, skb, par, opt, &ext);
+ case IPSET_DEL:
+ return list_set_kdel(set, skb, par, opt, &ext);
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+static bool
+id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
+{
+ const struct list_set *map = set->data;
+ const struct set_elem *e;
+
+ if (i >= map->size)
+ return 0;
+
+ e = list_set_elem(set, map, i);
+ return !!(e->id == id &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set))));
+}
+
+static int
+list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
+ const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ if (e->id != IPSET_INVALID_ID) {
+ if (i == map->size - 1) {
+ /* Last element replaced: e.g. add new,before,last */
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ } else {
+ struct set_elem *x = list_set_elem(set, map,
+ map->size - 1);
+
+ /* Last element pushed off */
+ if (x->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, x->id);
+ ip_set_ext_destroy(set, x);
+ }
+ memmove(list_set_elem(set, map, i + 1), e,
+ set->dsize * (map->size - (i + 1)));
+ /* Extensions must be initialized to zero */
+ memset(e, 0, set->dsize);
+ }
+ }
+
+ e->id = d->id;
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
+ return 0;
+}
+
+static int
+list_set_del(struct ip_set *set, u32 i)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(set, map, i);
+
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+
+ if (i < map->size - 1)
+ memmove(e, list_set_elem(set, map, i + 1),
+ set->dsize * (map->size - (i + 1)));
+
+ /* Last element */
+ e = list_set_elem(set, map, map->size - 1);
+ e->id = IPSET_INVALID_ID;
+ return 0;
+}
+
+static void
+set_cleanup_entries(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i = 0;
+
+ while (i < map->size) {
+ e = list_set_elem(set, map, i);
+ if (e->id != IPSET_INVALID_ID &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ list_set_del(set, i);
+ /* Check element moved to position i in next loop */
+ else
+ i++;
+ }
+}
+
+static int
+list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ u32 i;
+ int ret;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return 1;
+ else if (d->before > 0)
+ ret = id_eq(set, i + 1, d->refid);
+ else
+ ret = i > 0 && id_eq(set, i - 1, d->refid);
+ return ret;
+ }
+ return 0;
+}
+
+
+static int
+list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 i, ret = 0;
+
+ if (SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
+
+ /* Check already added element */
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto insert;
+ else if (e->id != d->id)
+ continue;
+
+ if ((d->before > 1 && !id_eq(set, i + 1, d->refid)) ||
+ (d->before < 0 &&
+ (i == 0 || !id_eq(set, i - 1, d->refid))))
+ /* Before/after doesn't match */
+ return -IPSET_ERR_REF_EXIST;
+ if (!flag_exist)
+ /* Can't re-add */
+ return -IPSET_ERR_EXIST;
+ /* Update extensions */
+ ip_set_ext_destroy(set, e);
+
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, set), ext);
+ if (SET_WITH_COMMENT(set))
+ ip_set_init_comment(ext_comment(e, set), ext);
+ if (SET_WITH_SKBINFO(set))
+ ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
+ /* Set is already added to the list */
+ ip_set_put_byindex(map->net, d->id);
+ return 0;
+ }
+insert:
+ ret = -IPSET_ERR_LIST_FULL;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : list_set_add(set, i, d, ext);
+ else if (e->id != d->refid)
+ continue;
+ else if (d->before > 0)
+ ret = list_set_add(set, i, d, ext);
+ else if (i + 1 < map->size)
+ ret = list_set_add(set, i + 1, d, ext);
+ }
+
+ return ret;
+}
+
+static int
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : -IPSET_ERR_EXIST;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return list_set_del(set, i);
+ else if (d->before > 0) {
+ if (!id_eq(set, i + 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ return list_set_del(set, i);
+ } else if (i == 0 || !id_eq(set, i - 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ else
+ return list_set_del(set, i);
+ }
+ return -IPSET_ERR_EXIST;
+}
+
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+{
+ struct list_set *map = set->data;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct set_adt_elem e = { .refid = IPSET_INVALID_ID };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ struct ip_set *s;
+ int ret = 0;
+
+ if (unlikely(!tb[IPSET_ATTR_NAME] ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_LINENO])
+ *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (e.id == IPSET_INVALID_ID)
+ return -IPSET_ERR_NAME;
+ /* "Loop detection" */
+ if (s->type->features & IPSET_TYPE_NAME) {
+ ret = -IPSET_ERR_LOOP;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
+ u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ e.before = f & IPSET_FLAG_BEFORE;
+ }
+
+ if (e.before && !tb[IPSET_ATTR_NAMEREF]) {
+ ret = -IPSET_ERR_BEFORE;
+ goto finish;
+ }
+
+ if (tb[IPSET_ATTR_NAMEREF]) {
+ e.refid = ip_set_get_byname(map->net,
+ nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (e.refid == IPSET_INVALID_ID) {
+ ret = -IPSET_ERR_NAMEREF;
+ goto finish;
+ }
+ if (!e.before)
+ e.before = -1;
+ }
+ if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
+
+ ret = adtfn(set, &e, &ext, &ext, flags);
+
+finish:
+ if (e.refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(map->net, e.refid);
+ if (adt != IPSET_ADD || ret)
+ ip_set_put_byindex(map->net, e.id);
+
+ return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+list_set_flush(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(set, map, i);
+ if (e->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(map->net, e->id);
+ ip_set_ext_destroy(set, e);
+ e->id = IPSET_INVALID_ID;
+ }
+ }
+}
+
+static void
+list_set_destroy(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+ list_set_flush(set);
+ kfree(map);
+
+ set->data = NULL;
+}
+
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *nested;
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+ htonl(sizeof(*map) + map->size * set->dsize)))
+ goto nla_put_failure;
+ if (unlikely(ip_set_put_flags(skb, set)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int
+list_set_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct list_set *map = set->data;
+ struct nlattr *atd, *nested;
+ u32 i, first = cb->args[IPSET_CB_ARG0];
+ const struct set_elem *e;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[IPSET_CB_ARG0] < map->size;
+ cb->args[IPSET_CB_ARG0]++) {
+ i = cb->args[IPSET_CB_ARG0];
+ e = list_set_elem(set, map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto finish;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (i == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (nla_put_string(skb, IPSET_ATTR_NAME,
+ ip_set_name_byindex(map->net, e->id)))
+ goto nla_put_failure;
+ if (ip_set_put_extensions(skb, set, e, true))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+finish:
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[IPSET_CB_ARG0] = 0;
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nested);
+ if (unlikely(i == first)) {
+ cb->args[IPSET_CB_ARG0] = 0;
+ return -EMSGSIZE;
+ }
+ ipset_nest_end(skb, atd);
+ return 0;
+}
+
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+ const struct list_set *x = a->data;
+ const struct list_set *y = b->data;
+
+ return x->size == y->size &&
+ a->timeout == b->timeout &&
+ a->extensions == b->extensions;
+}
+
+static const struct ip_set_type_variant set_variant = {
+ .kadt = list_set_kadt,
+ .uadt = list_set_uadt,
+ .adt = {
+ [IPSET_ADD] = list_set_uadd,
+ [IPSET_DEL] = list_set_udel,
+ [IPSET_TEST] = list_set_utest,
+ },
+ .destroy = list_set_destroy,
+ .flush = list_set_flush,
+ .head = list_set_head,
+ .list = list_set_list,
+ .same_set = list_set_same_set,
+};
+
+static void
+list_set_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct list_set *map = set->data;
+
+ write_lock_bh(&set->lock);
+ set_cleanup_entries(set);
+ write_unlock_bh(&set->lock);
+
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+static void
+list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+ struct list_set *map = set->data;
+
+ init_timer(&map->gc);
+ map->gc.data = (unsigned long) set;
+ map->gc.function = gc;
+ map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+ add_timer(&map->gc);
+}
+
+/* Create list:set type of sets */
+
+static bool
+init_list_set(struct net *net, struct ip_set *set, u32 size)
+{
+ struct list_set *map;
+ struct set_elem *e;
+ u32 i;
+
+ map = kzalloc(sizeof(*map) +
+ min_t(u32, size, IP_SET_LIST_MAX_SIZE) * set->dsize,
+ GFP_KERNEL);
+ if (!map)
+ return false;
+
+ map->size = size;
+ map->net = net;
+ set->data = map;
+
+ for (i = 0; i < size; i++) {
+ e = list_set_elem(set, map, i);
+ e->id = IPSET_INVALID_ID;
+ }
+
+ return true;
+}
+
+static int
+list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+ u32 flags)
+{
+ u32 size = IP_SET_LIST_DEFAULT_SIZE;
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_SIZE])
+ size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]);
+ if (size < IP_SET_LIST_MIN_SIZE)
+ size = IP_SET_LIST_MIN_SIZE;
+
+ set->variant = &set_variant;
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ if (!init_list_set(net, set, size))
+ return -ENOMEM;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ list_set_gc_init(set, list_set_gc);
+ }
+ return 0;
+}
+
+static struct ip_set_type list_set_type __read_mostly = {
+ .name = "list:set",
+ .protocol = IPSET_PROTOCOL,
+ .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+ .dimension = IPSET_DIM_ONE,
+ .family = NFPROTO_UNSPEC,
+ .revision_min = IPSET_TYPE_REV_MIN,
+ .revision_max = IPSET_TYPE_REV_MAX,
+ .create = list_set_create,
+ .create_policy = {
+ [IPSET_ATTR_SIZE] = { .type = NLA_U32 },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ },
+ .adt_policy = {
+ [IPSET_ATTR_NAME] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING,
+ .len = IPSET_MAXNAMELEN },
+ [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
+ [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING },
+ [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
+ [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
+ [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
+ },
+ .me = THIS_MODULE,
+};
+
+static int __init
+list_set_init(void)
+{
+ return ip_set_type_register(&list_set_type);
+}
+
+static void __exit
+list_set_fini(void)
+{
+ ip_set_type_unregister(&list_set_type);
+}
+
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 000000000..04d15fdc9
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,313 @@
+#include <linux/export.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+#define E(a, b, c, d) \
+ {.ip6 = { \
+ htonl(a), htonl(b), \
+ htonl(c), htonl(d), \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_netmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+#undef E
+#define E(a, b, c, d) \
+ {.ip6 = { (__force __be32) a, (__force __be32) b, \
+ (__force __be32) c, (__force __be32) d, \
+ } }
+
+/*
+ * This table works for both IPv4 and IPv6;
+ * just use prefixlen_hostmask_map[prefixlength].ip.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+ E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+ E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
+
+/* Find the largest network which matches the range from left, in host order. */
+u32
+ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+ u32 last;
+ u8 i;
+
+ for (i = 1; i < 32; i++) {
+ if ((from & ip_set_hostmask(i)) != from)
+ continue;
+ last = from | ~ip_set_hostmask(i);
+ if (!after(last, to)) {
+ *cidr = i;
+ return last;
+ }
+ }
+ *cidr = 32;
+ return from;
+}
+EXPORT_SYMBOL_GPL(ip_set_range_to_cidr);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 000000000..3b6929dec
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,291 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+ tristate "IP virtual server support"
+ depends on NET && INET && NETFILTER
+ depends on (NF_CONNTRACK || NF_CONNTRACK=n)
+ ---help---
+ IP Virtual Server support will let you build a high-performance
+ virtual server based on cluster of two or more real servers. This
+ option must be enabled for at least one of the clustered computers
+ that will take care of intercepting incoming connections to a
+ single IP address and scheduling them to real servers.
+
+ Three request dispatching techniques are implemented, they are
+ virtual server via NAT, virtual server via tunneling and virtual
+ server via direct routing. The several scheduling algorithms can
+ be used to choose which server the connection is directed to,
+ thus load balancing can be achieved among the servers. For more
+ information and its administration program, please visit the
+ following URL: <http://www.linuxvirtualserver.org/>.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config IP_VS_IPV6
+ bool "IPv6 support for IPVS"
+ depends on IPV6 = y || IP_VS = IPV6
+ select IP6_NF_IPTABLES
+ ---help---
+ Add IPv6 support to IPVS.
+
+ Say Y if unsure.
+
+config IP_VS_DEBUG
+ bool "IP virtual server debugging"
+ ---help---
+ Say Y here if you want to get additional messages useful in
+ debugging the IP virtual server code. You can change the debug
+ level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+ int "IPVS connection table size (the Nth power of 2)"
+ range 8 20
+ default 12
+ ---help---
+ The IPVS connection hash table uses the chaining scheme to handle
+ hash collisions. Using a big IPVS connection hash table will greatly
+ reduce conflicts when there are hundreds of thousands of connections
+ in the hash table.
+
+ Note the table size must be power of 2. The table size will be the
+ value of 2 to the your input number power. The number to choose is
+ from 8 to 20, the default number is 12, which means the table size
+ is 4096. Don't input the number too small, otherwise you will lose
+ performance on it. You can adapt the table size yourself, according
+ to your virtual server application. It is good to set the table size
+ not far less than the number of connections per second multiplying
+ average lasting time of connection in the table. For example, your
+ virtual server gets 200 connections per second, the connection lasts
+ for 200 seconds in average in the connection table, the table size
+ should be not far less than 200x200, it is good to set the table
+ size 32768 (2**15).
+
+ Another note that each connection occupies 128 bytes effectively and
+ each hash entry uses 8 bytes, so you can estimate how much memory is
+ needed for your box.
+
+ You can overwrite this number setting conn_tab_bits module parameter
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line
+ if IP VS was compiled built-in.
+
+comment "IPVS transport protocol load balancing support"
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ ---help---
+ This option enables support for load balancing TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ ---help---
+ This option enables support for load balancing UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH_ESP
+ def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ ---help---
+ This option enables support for load balancing ESP (Encapsulation
+ Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ ---help---
+ This option enables support for load balancing AH (Authentication
+ Header) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_SCTP
+ bool "SCTP load balancing support"
+ select LIBCRC32C
+ ---help---
+ This option enables support for load balancing SCTP transport
+ protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ ---help---
+ The robin-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ ---help---
+ The weighted robin-robin scheduling algorithm directs network
+ connections to different real servers based on server weights
+ in a round-robin manner. Servers with higher weights receive
+ new connections first than those with less weights, and servers
+ with higher weights get more connections than those with less
+ weights and servers with equal weights get equal connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling"
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_FO
+ tristate "weighted failover scheduling"
+ ---help---
+ The weighted failover scheduling algorithm directs network
+ connections to the server with the highest weight that is
+ currently available.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection scheduling"
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in cache cluster.
+ This algorithm usually directs packet destined for an IP address to
+ its server if the server is alive and under load. If the server is
+ overloaded (its active connection numbers is larger than its weight)
+ and there is a server in its half load, then allocate the weighted
+ least-connection server to this IP address.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication scheduling"
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in cache cluster. It differs from the LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the node in the server set are over loaded,
+ it picks up a least-connection node in the cluster and adds it
+ in the sever set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid high degree of replication.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their destination IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their source IP addresses.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the ith server and Ui is the fixed service rate (weight)
+ of the ith server.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimize its expected delay (The Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled. When using weights to
+ allow destinations to receive more connections, the table is
+ tiled an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
+comment 'IPVS application helper'
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+ NF_CONNTRACK_FTP
+ select IP_VS_NFCT
+ ---help---
+ FTP is a protocol that transfers IP address and/or port number in
+ the payload. In the virtual server via Network Address Translation,
+ the IP address and port number of real servers cannot be sent to
+ clients in ftp connections directly, so FTP protocol helper is
+ required for tracking the connection and mangling it back to that of
+ virtual service.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 000000000..38b2723b2
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,41 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 000000000..dfd7b65b3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,627 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ * IP_MASQ_APP application masquerading module
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+/*
+ * Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+ return try_module_get(app->module);
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+ module_put(app->module);
+}
+
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
+
+/*
+ * Allocate/initialize app incarnation and register it in proto apps.
+ */
+static int
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
+{
+ struct ip_vs_protocol *pp;
+ struct ip_vs_app *inc;
+ int ret;
+
+ if (!(pp = ip_vs_proto_get(proto)))
+ return -EPROTONOSUPPORT;
+
+ if (!pp->unregister_app)
+ return -EOPNOTSUPP;
+
+ inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+ if (!inc)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&inc->p_list);
+ INIT_LIST_HEAD(&inc->incs_list);
+ inc->app = app;
+ inc->port = htons(port);
+ atomic_set(&inc->usecnt, 0);
+
+ if (app->timeouts) {
+ inc->timeout_table =
+ ip_vs_create_timeout_table(app->timeouts,
+ app->timeouts_size);
+ if (!inc->timeout_table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = pp->register_app(net, inc);
+ if (ret)
+ goto out;
+
+ list_add(&inc->a_list, &app->incs_list);
+ IP_VS_DBG(9, "%s App %s:%u registered\n",
+ pp->name, inc->name, ntohs(inc->port));
+
+ return 0;
+
+ out:
+ ip_vs_app_inc_destroy(inc);
+ return ret;
+}
+
+
+/*
+ * Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_protocol *pp;
+
+ if (!(pp = ip_vs_proto_get(inc->protocol)))
+ return;
+
+ if (pp->unregister_app)
+ pp->unregister_app(net, inc);
+
+ IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+ pp->name, inc->name, ntohs(inc->port));
+
+ list_del(&inc->a_list);
+
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
+}
+
+
+/*
+ * Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+ int result;
+
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
+ return result;
+}
+
+
+/*
+ * Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+ atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
+}
+
+
+/*
+ * Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
+{
+ int result;
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ result = ip_vs_app_inc_new(net, app, proto, port);
+
+ mutex_unlock(&__ip_vs_app_mutex);
+
+ return result;
+}
+
+
+/* Register application for netns */
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a;
+ int err = 0;
+
+ if (!ipvs)
+ return ERR_PTR(-ENOENT);
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ list_for_each_entry(a, &ipvs->app_list, a_list) {
+ if (!strcmp(app->name, a->name)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ a = kmemdup(app, sizeof(*app), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ INIT_LIST_HEAD(&a->incs_list);
+ list_add(&a->a_list, &ipvs->app_list);
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+out_unlock:
+ mutex_unlock(&__ip_vs_app_mutex);
+
+ return err ? ERR_PTR(err) : a;
+}
+
+
+/*
+ * ip_vs_app unregistration routine
+ * We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
+ */
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a, *anxt, *inc, *nxt;
+
+ if (!ipvs)
+ return;
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
+ if (app && strcmp(app->name, a->name))
+ continue;
+ list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
+ ip_vs_app_inc_release(net, inc);
+ }
+
+ list_del(&a->a_list);
+ kfree(a);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+ }
+
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+
+/*
+ * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ return pp->app_conn_bind(cp);
+}
+
+
+/*
+ * Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+ struct ip_vs_app *inc = cp->app;
+
+ if (!inc)
+ return;
+
+ if (inc->unbind_conn)
+ inc->unbind_conn(inc, cp);
+ if (inc->done_conn)
+ inc->done_conn(inc, cp);
+ ip_vs_app_inc_put(inc);
+ cp->app = NULL;
+}
+
+
+/*
+ * Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 seq = ntohl(th->seq);
+
+ /*
+ * Adjust seq with delta-offset for all packets after
+ * the most recent resized pkt seq and with previous_delta offset
+ * for all packets before most recent resized pkt seq.
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ if(after(seq, vseq->init_seq)) {
+ th->seq = htonl(seq + vseq->delta);
+ IP_VS_DBG(9, "%s(): added delta (%d) to seq\n",
+ __func__, vseq->delta);
+ } else {
+ th->seq = htonl(seq + vseq->previous_delta);
+ IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n",
+ __func__, vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 ack_seq = ntohl(th->ack_seq);
+
+ /*
+ * Adjust ack_seq with delta-offset for
+ * the packets AFTER most recent resized pkt has caused a shift
+ * for packets before most recent resized pkt, use previous_delta
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ /* since ack_seq is the number of octet that is expected
+ to receive next, so compare it with init_seq+delta */
+ if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+ th->ack_seq = htonl(ack_seq - vseq->delta);
+ IP_VS_DBG(9, "%s(): subtracted delta "
+ "(%d) from ack_seq\n", __func__, vseq->delta);
+
+ } else {
+ th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+ IP_VS_DBG(9, "%s(): subtracted "
+ "previous_delta (%d) from ack_seq\n",
+ __func__, vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Updates ip_vs_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+ unsigned int flag, __u32 seq, int diff)
+{
+ /* spinlock is to keep updating cp->flags atomic */
+ spin_lock_bh(&cp->lock);
+ if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+ vseq->previous_delta = vseq->delta;
+ vseq->delta += diff;
+ vseq->init_seq = seq;
+ cp->flags |= flag;
+ }
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_seq(&cp->out_seq, th);
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_ack_seq(&cp->in_seq, th);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ if (!app->pkt_out(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->out_seq,
+ IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Output pkt hook. Will call bound ip_vs_app specific function
+ * called by ipvs packet handler, assumes previously checked cp!=NULL
+ * returns false if it can't handle packet (oom)
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_out(cp, skb, app);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ return app->pkt_out(app, cp, skb, NULL);
+}
+
+
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_seq(&cp->in_seq, th);
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_ack_seq(&cp->out_seq, th);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ if (!app->pkt_in(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->in_seq,
+ IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Input pkt hook. Will call bound ip_vs_app specific function
+ * called by ipvs packet handler, assumes previously checked cp!=NULL.
+ * returns false if can't handle packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_in(cp, skb, app);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ return app->pkt_in(app, cp, skb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ * /proc/net/ip_vs_app entry function
+ */
+
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
+{
+ struct ip_vs_app *app, *inc;
+
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ if (pos-- == 0)
+ return inc;
+ }
+ }
+ return NULL;
+
+}
+
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_app *inc, *app;
+ struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_app_idx(ipvs, 0);
+
+ inc = v;
+ app = inc->app;
+
+ if ((e = inc->a_list.next) != &app->incs_list)
+ return list_entry(e, struct ip_vs_app, a_list);
+
+ /* go on to next application */
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
+ app = list_entry(e, struct ip_vs_app, a_list);
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ return inc;
+ }
+ }
+ return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "prot port usecnt name\n");
+ else {
+ const struct ip_vs_app *inc = v;
+
+ seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
+ ip_vs_proto_name(inc->protocol),
+ ntohs(inc->port),
+ atomic_read(&inc->usecnt),
+ inc->name);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_app_seq_ops = {
+ .start = ip_vs_app_seq_start,
+ .next = ip_vs_app_seq_next,
+ .stop = ip_vs_app_seq_stop,
+ .show = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ip_vs_app_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_app_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+int __net_init ip_vs_app_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+ return 0;
+}
+
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
+{
+ unregister_ip_vs_app(net, NULL /* all */);
+ remove_proc_entry("ip_vs_app", net->proc_net);
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000..b0f7b626b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1418 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h> /* for proc_net_* */
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+*/
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
+
+/*
+ * Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
+
+/* SLAB cache for IPVS connections */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/* counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd __read_mostly;
+
+/*
+ * Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS 5
+#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
+
+/* We need an addrstrlen that works with or without v6 */
+#ifdef CONFIG_IP_VS_IPV6
+#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
+#else
+#define IP_VS_ADDRSTRLEN (8+1)
+#endif
+
+struct ip_vs_aligned_lock
+{
+ spinlock_t l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_write_lock_bh(unsigned int key)
+{
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned int key)
+{
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ * Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr,
+ __be16 port)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+#endif
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+}
+
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+ bool inverse)
+{
+ const union nf_inet_addr *addr;
+ __be16 port;
+
+ if (p->pe_data && p->pe->hashkey_raw)
+ return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+ ip_vs_conn_tab_mask;
+
+ if (likely(!inverse)) {
+ addr = p->caddr;
+ port = p->cport;
+ } else {
+ addr = p->vaddr;
+ port = p->vport;
+ }
+
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
+
+ if (cp->pe) {
+ p.pe = cp->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
+
+/*
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return 0;
+
+ /* Hash by protocol, client address and port */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
+ ret = 1;
+ } else {
+ pr_err("%s(): request for already hashed, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ ret = 0;
+ }
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * UNhashes ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success. Caller should hold conn reference.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ /* unhash it and decrease its reference counter */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ atomic_dec(&cp->refcnt);
+ ret = 1;
+ } else
+ ret = 0;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from OUTside-to-INside.
+ * p->caddr, p->cport: pkt source address (foreign host)
+ * p->vaddr, p->vport: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+ ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ rcu_read_unlock();
+ return cp;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ struct ip_vs_conn *cp;
+
+ cp = __ip_vs_conn_in_get(p);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+ struct ip_vs_conn_param cport_zero_p = *p;
+ cport_zero_p.cport = 0;
+ cp = __ip_vs_conn_in_get(&cport_zero_p);
+ }
+
+ IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ int inverse, struct ip_vs_conn_param *p)
+{
+ __be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return 1;
+
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
+ else
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
+ return 0;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
+/* Get reference to connection template */
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ continue;
+ }
+
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * p->vaddr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+ p->af, p->vaddr, &cp->vaddr) &&
+ p->vport == cp->vport && p->cport == cp->cport &&
+ cp->flags & IP_VS_CONN_F_TEMPLATE &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ }
+ cp = NULL;
+
+ out:
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * p->caddr, p->cport: pkt source address (inside host)
+ * p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp, *ret=NULL;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_vs_conn_hashkey_param(p, true);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ ret = cp;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+
+/*
+ * Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
+ 0 : cp->timeout;
+ mod_timer(&cp->timer, jiffies+t);
+
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+ if (ip_vs_conn_unhash(cp)) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ cp->cport = cport;
+ }
+ spin_unlock_bh(&cp->lock);
+
+ /* hash on new dport */
+ ip_vs_conn_hash(cp);
+ }
+}
+
+
+/*
+ * Bind a connection entry with the corresponding packet_xmit.
+ * Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+ else
+#endif
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit;
+ break;
+ }
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ if (cp->daf == AF_INET6)
+ cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+ else
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit_v6;
+ break;
+ }
+}
+#endif
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
+
+/*
+ * Bind a connection entry with a virtual service destination
+ * Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+ unsigned int conn_flags;
+ __u32 flags;
+
+ /* if dest is NULL, then return directly */
+ if (!dest)
+ return;
+
+ /* Increase the refcnt counter of the dest */
+ ip_vs_dest_hold(dest);
+
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+ flags = cp->flags;
+ /* Bind with the destination and its corresponding transmitter */
+ if (flags & IP_VS_CONN_F_SYNC) {
+ /* if the connection is not template and is created
+ * by sync, preserve the activity flag.
+ */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* connections inherit forwarding method from dest */
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
+ }
+ flags |= conn_flags;
+ cp->flags = flags;
+ cp->dest = dest;
+
+ IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+ "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
+ atomic_inc(&dest->activeconns);
+ else
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the persistent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ if (dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Check if there is a destination for the connection, if so
+ * bind the connection to the destination.
+ */
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+
+ /* This function is only invoked by the synchronization code. We do
+ * not currently support heterogeneous pools with synchronization,
+ * so we can make the assumption that the svc_af is the same as the
+ * dest_af
+ */
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock_bh(&cp->lock);
+ if (cp->dest) {
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
+ ip_vs_bind_dest(cp, dest);
+ spin_unlock_bh(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ * Unbind a connection entry with its VS destination
+ * Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (!dest)
+ return;
+
+ IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+ "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so decrease the inactconns
+ or activeconns counter */
+ if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->inactconns);
+ } else {
+ atomic_dec(&dest->activeconns);
+ }
+ } else {
+ /* It is a persistent connection/template, so decrease
+ the persistent connection counter */
+ atomic_dec(&dest->persistconns);
+ }
+
+ if (dest->l_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else if (dest->u_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ }
+
+ ip_vs_dest_put(dest);
+}
+
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+ return ipvs->sysctl_expire_quiescent_template &&
+ (atomic_read(&dest->weight) == 0);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Checking if the destination of a connection template is available.
+ * If available, return 1, otherwise invalidate this connection
+ * template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+ struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+
+ /*
+ * Checking the dest server status.
+ */
+ if ((dest == NULL) ||
+ !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+ expire_quiescent_template(ipvs, dest)) {
+ IP_VS_DBG_BUF(9, "check_template: dest not available for "
+ "protocol %s s:%s:%d v:%s:%d "
+ "-> d:%s:%d\n",
+ ip_vs_proto_name(ct->protocol),
+ IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+ ntohs(ct->cport),
+ IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+ ntohs(ct->vport),
+ IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
+ ntohs(ct->dport));
+
+ /*
+ * Invalidate the connection template
+ */
+ if (ct->vport != htons(0xffff)) {
+ if (ip_vs_conn_unhash(ct)) {
+ ct->dport = htons(0xffff);
+ ct->vport = htons(0xffff);
+ ct->cport = 0;
+ ip_vs_conn_hash(ct);
+ }
+ }
+
+ /*
+ * Simply decrease the refcnt of the template,
+ * don't restart its timer.
+ */
+ __ip_vs_conn_put(ct);
+ return 0;
+ }
+ return 1;
+}
+
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
+
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /*
+ * do I control anybody?
+ */
+ if (atomic_read(&cp->n_control))
+ goto expire_later;
+
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
+ /* delete the timer if it is activated by other users */
+ del_timer(&cp->timer);
+
+ /* does anybody control me? */
+ if (cp->control)
+ ip_vs_control_del(cp);
+
+ if (cp->flags & IP_VS_CONN_F_NFCT) {
+ /* Do not access conntracks during subsys cleanup
+ * because nf_conntrack_find_get can not be used after
+ * conntrack cleanup for the net.
+ */
+ smp_rmb();
+ if (ipvs->enable)
+ ip_vs_conn_drop_conntrack(cp);
+ }
+
+ if (unlikely(cp->app != NULL))
+ ip_vs_unbind_app(cp);
+ ip_vs_unbind_dest(cp);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ atomic_dec(&ipvs->conn_count);
+ return;
+ }
+
+ expire_later:
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
+ atomic_read(&cp->n_control));
+
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
+ ip_vs_conn_put(cp);
+}
+
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
+}
+
+
+/*
+ * Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
+ struct ip_vs_dest *dest, __u32 fwmark)
+{
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
+
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ if (cp == NULL) {
+ IP_VS_ERR_RL("%s(): no memory\n", __func__);
+ return NULL;
+ }
+
+ INIT_HLIST_NODE(&cp->c_list);
+ setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
+ cp->af = p->af;
+ cp->daf = dest_af;
+ cp->protocol = p->protocol;
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
+ cp->cport = p->cport;
+ /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->vaddr, p->vaddr);
+ cp->vport = p->vport;
+ ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
+ cp->dport = dport;
+ cp->flags = flags;
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
+ }
+ spin_lock_init(&cp->lock);
+
+ /*
+ * Set the entry is referenced by the current thread before hashing
+ * it in the table, so that other thread run ip_vs_random_dropentry
+ * but cannot drop this entry.
+ */
+ atomic_set(&cp->refcnt, 1);
+
+ cp->control = NULL;
+ atomic_set(&cp->n_control, 0);
+ atomic_set(&cp->in_pkts, 0);
+
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
+ atomic_inc(&ipvs->conn_count);
+ if (flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+ /* Bind the connection with a destination server */
+ cp->dest = NULL;
+ ip_vs_bind_dest(cp, dest);
+
+ /* Set its state and timeout */
+ cp->state = 0;
+ cp->old_state = 0;
+ cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
+
+ /* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+ if (p->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
+
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled(ipvs))
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
+ /* Hash it in the ip_vs_conn_tab finally */
+ ip_vs_conn_hash(cp);
+
+ return cp;
+}
+
+/*
+ * /proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct hlist_head *l;
+};
+
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
+ if (pos-- == 0) {
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ }
+ cond_resched_rcu();
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
+ rcu_read_lock();
+ return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_conn *cp = v;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
+ struct hlist_head *l = iter->l;
+ int idx;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_conn_array(seq, 0);
+
+ /* more on same hash chain? */
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
+
+ idx = l - ip_vs_conn_tab;
+ while (++idx < ip_vs_conn_tab_size) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ cond_resched_rcu();
+ }
+ iter->l = NULL;
+ return NULL;
+}
+
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+ char dbuf[IP_VS_ADDRSTRLEN];
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->pe->show_pe_data(cp, pe_data + len);
+ }
+ pe_data[len] = '\0';
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
+ else
+#endif
+ snprintf(dbuf, sizeof(dbuf), "%08X",
+ ntohl(cp->daddr.ip));
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%s %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X"
+ " %s %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_conn_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_seq_show,
+};
+
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+static const struct file_operations ip_vs_conn_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const char *ip_vs_origin_name(unsigned int flags)
+{
+ if (flags & IP_VS_CONN_F_SYNC)
+ return "SYNC";
+ else
+ return "LOCAL";
+}
+
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+ char dbuf[IP_VS_ADDRSTRLEN];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
+ else
+#endif
+ snprintf(dbuf, sizeof(dbuf), "%08X",
+ ntohl(cp->daddr.ip));
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%s %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X "
+ "%s %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_sync_seq_show,
+};
+
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+static const struct file_operations ip_vs_conn_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_sync_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+
+/*
+ * Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+ /*
+ * The drop rate array needs tuning for real environments.
+ * Called from timer bh only => no locking
+ */
+ static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static char todrop_counter[9] = {0};
+ int i;
+
+ /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+ This will leave enough time for normal connection to get
+ through. */
+ if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+ return 0;
+
+ /* Don't drop the entry if its number of incoming packets is not
+ located in [0, 8] */
+ i = atomic_read(&cp->in_pkts);
+ if (i > 8 || i < 0) return 0;
+
+ if (!todrop_rate[i]) return 0;
+ if (--todrop_counter[i] > 0) return 0;
+
+ todrop_counter[i] = todrop_rate[i];
+ return 1;
+}
+
+/* Called from keventd and must protect itself from softirqs */
+void ip_vs_random_dropentry(struct net *net)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+
+ rcu_read_lock();
+ /*
+ * Randomly scan 1/32 of the whole table every second
+ */
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ /* connection template */
+ continue;
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
+ if (cp->protocol == IPPROTO_TCP) {
+ switch(cp->state) {
+ case IP_VS_TCP_S_SYN_RECV:
+ case IP_VS_TCP_S_SYNACK:
+ break;
+
+ case IP_VS_TCP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+
+ default:
+ continue;
+ }
+ } else if (cp->protocol == IPPROTO_SCTP) {
+ switch (cp->state) {
+ case IP_VS_SCTP_S_INIT1:
+ case IP_VS_SCTP_S_INIT:
+ break;
+ case IP_VS_SCTP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+ default:
+ continue;
+ }
+ } else {
+ if (!todrop_entry(cp))
+ continue;
+ }
+
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
+ }
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ * Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(struct net *net)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+flush_again:
+ rcu_read_lock();
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
+ }
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+
+ /* the counter may be not NULL, because maybe some conn entries
+ are run by slow timer handler or unhashed but still referred */
+ if (atomic_read(&ipvs->conn_count) != 0) {
+ schedule();
+ goto flush_again;
+ }
+}
+/*
+ * per netns init and exit
+ */
+int __net_init ip_vs_conn_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ return 0;
+}
+
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ remove_proc_entry("ip_vs_conn", net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+}
+
+int __init ip_vs_conn_init(void)
+{
+ int idx;
+
+ /* Compute size and mask */
+ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+ ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
+ /*
+ * Allocate the connection hash table and initialize its list heads
+ */
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
+ if (!ip_vs_conn_tab)
+ return -ENOMEM;
+
+ /* Allocate ip_vs_conn slab cache */
+ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ sizeof(struct ip_vs_conn), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ip_vs_conn_cachep) {
+ vfree(ip_vs_conn_tab);
+ return -ENOMEM;
+ }
+
+ pr_info("Connection hash table configured "
+ "(size=%d, memory=%ldKbytes)\n",
+ ip_vs_conn_tab_size,
+ (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+ IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+ sizeof(struct ip_vs_conn));
+
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+ INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
+
+ for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ }
+
+ /* calculate the random value for connection hash */
+ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+ return 0;
+}
+
+void ip_vs_conn_cleanup(void)
+{
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
+ /* Release the empty cache */
+ kmem_cache_destroy(ip_vs_conn_cachep);
+ vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 000000000..5d2b806a8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,2136 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ * Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sctp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h>
+#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+static int ip_vs_net_id __read_mostly;
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph) (((icmph)->un).echo.id)
+#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
+
+const char *ip_vs_proto_name(unsigned int proto)
+{
+ static char buf[20];
+
+ switch (proto) {
+ case IPPROTO_IP:
+ return "IP";
+ case IPPROTO_UDP:
+ return "UDP";
+ case IPPROTO_TCP:
+ return "TCP";
+ case IPPROTO_SCTP:
+ return "SCTP";
+ case IPPROTO_ICMP:
+ return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+ case IPPROTO_ICMPV6:
+ return "ICMPv6";
+#endif
+ default:
+ sprintf(buf, "IP_%u", proto);
+ return buf;
+ }
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+ while (--rows >= 0)
+ INIT_LIST_HEAD(&table[rows]);
+}
+
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ }
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ }
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+}
+
+
+static inline void
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ if (likely(pd->pp->state_transition))
+ pd->pp->state_transition(cp, direction, skb, pd);
+}
+
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+ struct sk_buff *skb, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
+ p->pe = rcu_dereference(svc->pe);
+ if (p->pe && p->pe->fill_param)
+ return p->pe->fill_param(p, skb);
+
+ return 0;
+}
+
+/*
+ * IPVS persistent scheduling function
+ * It creates a connection entry according to its template if exists,
+ * or selects a server and creates a connection entry plus a template.
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ * Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+ struct sk_buff *skb, __be16 src_port, __be16 dst_port,
+ int *ignored, struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *ct;
+ __be16 dport = 0; /* destination port to forward */
+ unsigned int flags;
+ struct ip_vs_conn_param param;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ union nf_inet_addr snet; /* source network of the client,
+ after masking */
+
+ /* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
+ else
+#endif
+ snet.ip = iph->saddr.ip & svc->netmask;
+
+ IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+ "mnet %s\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, &snet));
+
+ /*
+ * As far as we know, FTP is a very complicated network protocol, and
+ * it uses control connection and data connections. For active FTP,
+ * FTP server initialize data connection to the client, its source port
+ * is often 20. For passive FTP, FTP server tells the clients the port
+ * that it passively listens to, and the client issues the data
+ * connection. In the tunneling or direct routing mode, the load
+ * balancer is on the client-to-server half of connection, the port
+ * number is unknown to the load balancer. So, a conn template like
+ * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+ * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+ * is created for other persistent services.
+ */
+ {
+ int protocol = iph->protocol;
+ const union nf_inet_addr *vaddr = &iph->daddr;
+ __be16 vport = 0;
+
+ if (dst_port == svc->port) {
+ /* non-FTP template:
+ * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+ * FTP template:
+ * <protocol, caddr, 0, vaddr, 0, daddr, 0>
+ */
+ if (svc->port != FTPPORT)
+ vport = dst_port;
+ } else {
+ /* Note: persistent fwmark-based services and
+ * persistent port zero service are handled here.
+ * fwmark template:
+ * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template:
+ * <protocol,caddr,0,vaddr,0,daddr,0>
+ */
+ if (svc->fwmark) {
+ protocol = IPPROTO_IP;
+ vaddr = &fwmark;
+ }
+ }
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ /* Check if a template already exists */
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * No template found or the dest of the connection
+ * template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
+ */
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
+ *ignored = 0;
+ return NULL;
+ }
+
+ if (dst_port == svc->port && svc->port != FTPPORT)
+ dport = dest->port;
+
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
+ ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+ if (ct == NULL) {
+ kfree(param.pe_data);
+ *ignored = -1;
+ return NULL;
+ }
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ kfree(param.pe_data);
+ }
+
+ dport = dst_port;
+ if (dport == svc->port && dest->port)
+ dport = dest->port;
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
+ /*
+ * Create a new connection according to the template
+ */
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+ src_port, &iph->daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
+ skb->mark);
+ if (cp == NULL) {
+ ip_vs_conn_put(ct);
+ *ignored = -1;
+ return NULL;
+ }
+
+ /*
+ * Add its control
+ */
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * IPVS main scheduling function
+ * It selects a server according to the virtual service, and
+ * creates a connection entry.
+ * Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, int *ignored,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_dest *dest;
+ __be16 _ports[2], *pptr;
+ unsigned int flags;
+
+ *ignored = 1;
+ /*
+ * IPv6 frags, only the first hit here.
+ */
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return NULL;
+
+ /*
+ * FTPDATA needs this check when using local real server.
+ * Never schedule Active FTPDATA connections from real server.
+ * For LVS-NAT they must be already created. For other methods
+ * with persistence the connection is created on SYN+ACK.
+ */
+ if (pptr[0] == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling FTPDATA");
+ return NULL;
+ }
+
+ /*
+ * Do not schedule replies from local real server.
+ */
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling reply for existing connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
+
+ /*
+ * Persistent service
+ */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ iph);
+
+ *ignored = 0;
+
+ /*
+ * Non-persistent service
+ */
+ if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->port)
+ pr_err("Schedule: port zero only supported "
+ "in persistent services, "
+ "check your ipvs configuration\n");
+ return NULL;
+ }
+
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "Schedule: no dest found.\n");
+ return NULL;
+ }
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
+ /*
+ * Create a connection entry.
+ */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0], &iph->daddr,
+ pptr[1], &p);
+ cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
+ dest->port ? dest->port : pptr[1],
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * Pass or drop the packet.
+ * Called by ip_vs_in, when the virtual service is available but
+ * no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
+{
+ __be16 _ports[2], *pptr;
+#ifdef CONFIG_SYSCTL
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ int unicast;
+#endif
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL) {
+ return NF_DROP;
+ }
+
+#ifdef CONFIG_SYSCTL
+ net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
+ else
+#endif
+ unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+
+ /* if it is fwmark-based service, the cache_bypass sysctl is up
+ and the destination is a non-local unicast, then create
+ a cache_bypass connection entry */
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ int ret;
+ struct ip_vs_conn *cp;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+ union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
+
+ /* create a new connection entry */
+ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
+ IP_VS_CONN_F_BYPASS | flags,
+ NULL, skb->mark);
+ if (!cp)
+ return NF_DROP;
+ }
+
+ /* statistics */
+ ip_vs_in_stats(cp, skb);
+
+ /* set state */
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+ /* transmit the first SYN packet */
+ ret = cp->packet_xmit(skb, cp, pd->pp, iph);
+ /* do not touch skb anymore */
+
+ atomic_inc(&cp->in_pkts);
+ ip_vs_conn_put(cp);
+ return ret;
+ }
+#endif
+
+ /*
+ * When the virtual ftp service is presented, packets destined
+ * for other services on the VIP may get here (except services
+ * listed in the ipvs table), pass the packets, because it is
+ * not ipvs job to decide to drop the packets.
+ */
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+ return NF_ACCEPT;
+
+ /*
+ * Notify the client that the destination is unreachable, and
+ * release the socket buffer.
+ * Since it is in IP layer, the TCP socket is not actually
+ * created, the TCP RST packet cannot be sent, instead that
+ * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+ */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6) {
+ if (!skb->dev) {
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net_->loopback_dev;
+ }
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+ } else
+#endif
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ return NF_DROP;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ return ipvs->sysctl_snat_reroute;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ return ipvs->sysctl_nat_icmp_send;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+ return ipvs->sysctl_expire_nodest_conn;
+}
+
+#else
+
+static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
+static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
+
+#endif
+
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+ return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+}
+
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+ if (NF_INET_LOCAL_IN == hooknum)
+ return IP_DEFRAG_VS_IN;
+ if (NF_INET_FORWARD == hooknum)
+ return IP_DEFRAG_VS_FWD;
+ return IP_DEFRAG_VS_OUT;
+}
+
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ int err;
+
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
+ if (!err)
+ ip_send_check(ip_hdr(skb));
+
+ return err;
+}
+
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
+ unsigned int hooknum)
+{
+ if (!sysctl_snat_reroute(skb))
+ return 0;
+ /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
+ if (NF_INET_LOCAL_IN == hooknum)
+ return 0;
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
+ ip6_route_me_harder(skb) != 0)
+ return 1;
+ } else
+#endif
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int icmp_offset = iph->ihl*4;
+ struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
+ icmp_offset);
+ struct iphdr *ciph = (struct iphdr *)(icmph + 1);
+
+ if (inout) {
+ iph->saddr = cp->vaddr.ip;
+ ip_send_check(iph);
+ ciph->daddr = cp->vaddr.ip;
+ ip_send_check(ciph);
+ } else {
+ iph->daddr = cp->daddr.ip;
+ ip_send_check(iph);
+ ciph->saddr = cp->daddr.ip;
+ ip_send_check(ciph);
+ }
+
+ /* the TCP/UDP/SCTP port */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
+ IPPROTO_SCTP == ciph->protocol) {
+ __be16 *ports = (void *)ciph + ciph->ihl*4;
+
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMP");
+ else
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ unsigned int icmp_offset = 0;
+ unsigned int offs = 0; /* header offset*/
+ int protocol;
+ struct icmp6hdr *icmph;
+ struct ipv6hdr *ciph;
+ unsigned short fragoffs;
+
+ ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
+ icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
+ offs = icmp_offset + sizeof(struct icmp6hdr);
+ ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
+
+ protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
+
+ if (inout) {
+ iph->saddr = cp->vaddr.in6;
+ ciph->daddr = cp->vaddr.in6;
+ } else {
+ iph->daddr = cp->daddr.in6;
+ ciph->saddr = cp->daddr.in6;
+ }
+
+ /* the TCP/UDP/SCTP port */
+ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)) {
+ __be16 *ports = (void *)(skb_network_header(skb) + offs);
+
+ IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
+ ntohs(inout ? ports[1] : ports[0]),
+ ntohs(inout ? cp->vport : cp->dport));
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum */
+ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+ skb->len - icmp_offset,
+ IPPROTO_ICMPV6, 0);
+ skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+ skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMPv6");
+ else
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+ union nf_inet_addr *snet,
+ __u8 protocol, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp,
+ unsigned int offset, unsigned int ihl,
+ unsigned int hooknum)
+{
+ unsigned int verdict = NF_DROP;
+
+ if (IP_VS_FWD_METHOD(cp) != 0) {
+ pr_err("shouldn't reach here, because the box is on the "
+ "half connection in the tun/dr module.\n");
+ }
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+ IP_VS_DBG_ADDR(af, snet));
+ goto out;
+ }
+
+ if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)
+ offset += 2 * sizeof(__u16);
+ if (!skb_make_writable(skb, offset))
+ goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+ else
+#endif
+ ip_vs_nat_icmp(skb, pp, cp, 1);
+
+ if (ip_vs_route_me_harder(af, skb, hooknum))
+ goto out;
+
+ /* do the statistics and put it back */
+ ip_vs_out_stats(cp, skb);
+
+ skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ verdict = NF_ACCEPT;
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+/*
+ * Handle ICMP messages in the inside-to-outside direction (outgoing).
+ * Find any that might be relevant, check against existing connections.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
+{
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ unsigned int offset, ihl;
+ union nf_inet_addr snet;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb);
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ pp = ip_vs_proto_get(cih->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking outgoing ICMP for");
+
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.ip = iph->saddr;
+ return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+ pp, ciph.len, ihl, hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
+{
+ struct icmp6hdr _icmph, *ic;
+ struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ union nf_inet_addr snet;
+ unsigned int writable;
+
+ *related = 1;
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (ipvsh->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &ipvsh->saddr, &ipvsh->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = ipvsh->len + sizeof(_icmph);
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ pp = ip_vs_proto_get(ciph.protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.in6 = ciph.saddr.in6;
+ writable = ciph.len;
+ return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+ pp, writable, sizeof(struct ipv6hdr),
+ hooknum);
+}
+#endif
+
+/*
+ * Check if sctp chunc is ABORT chunk
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+ sctp_chunkhdr_t *sch, schunk;
+ sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return 0;
+ if (sch->type == SCTP_CID_ABORT)
+ return 1;
+ return 0;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+ return th->rst;
+}
+
+static inline bool is_new_conn(const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+ return th->syn;
+ }
+ case IPPROTO_SCTP: {
+ sctp_chunkhdr_t *sch, schunk;
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return false;
+ return sch->type == SCTP_CID_INIT;
+ }
+ default:
+ return false;
+ }
+}
+
+static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
+ int conn_reuse_mode)
+{
+ /* Controlled (FTP DATA or persistence)? */
+ if (cp->control)
+ return false;
+
+ switch (cp->protocol) {
+ case IPPROTO_TCP:
+ return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+ ((conn_reuse_mode & 2) &&
+ (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
+ (cp->flags & IP_VS_CONN_F_NOOUTPUT));
+ case IPPROTO_SCTP:
+ return cp->state == IP_VS_SCTP_S_CLOSED;
+ default:
+ return false;
+ }
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
+ unsigned int hooknum)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+ if (!skb_make_writable(skb, iph->len))
+ goto drop;
+
+ /* mangle the packet */
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
+ goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+ else
+#endif
+ {
+ ip_hdr(skb)->saddr = cp->vaddr.ip;
+ ip_send_check(ip_hdr(skb));
+ }
+
+ /*
+ * nf_iterate does not expect change in the skb->dst->dev.
+ * It looks like it is not fatal to enable this code for hooks
+ * where our handlers are at the end of the chain list and
+ * when all next handlers use skb->dst->dev and not outdev.
+ * It will definitely route properly the inout NAT traffic
+ * when multiple paths are used.
+ */
+
+ /* For policy routing, packets originating from this
+ * machine itself may be routed differently to packets
+ * passing through. We want this packet to be routed as
+ * if it came from this machine itself. So re-compute
+ * the routing information.
+ */
+ if (ip_vs_route_me_harder(af, skb, hooknum))
+ goto drop;
+
+ IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+
+ ip_vs_out_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
+ skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ ip_vs_conn_put(cp);
+
+ LeaveFunction(11);
+ return NF_ACCEPT;
+
+drop:
+ ip_vs_conn_put(cp);
+ kfree_skb(skb);
+ LeaveFunction(11);
+ return NF_STOLEN;
+}
+
+/*
+ * Check if outgoing packet belongs to the established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net = NULL;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+
+ EnterFunction(11);
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(!skb_dst(skb)))
+ return NF_ACCEPT;
+
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_out_icmp_v6(skb, &related,
+ hooknum, &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* reassemble IP fragments */
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET)
+#endif
+ if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
+ if (ip_vs_gather_frags(skb,
+ ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+
+ ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
+ }
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(af, skb, &iph, 0);
+
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph, hooknum);
+ if (sysctl_nat_icmp_send(net) &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP ||
+ pp->protocol == IPPROTO_SCTP)) {
+ __be16 _ports[2], *pptr;
+
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
+ /*
+ * Notify the real server: there is no
+ * existing entry if it is not RST
+ * packet or not TCP packet.
+ */
+ if ((iph.protocol != IPPROTO_TCP &&
+ iph.protocol != IPPROTO_SCTP)
+ || ((iph.protocol == IPPROTO_TCP
+ && !is_tcp_reset(skb, iph.len))
+ || (iph.protocol == IPPROTO_SCTP
+ && !is_sctp_abort(skb,
+ iph.len)))) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ icmpv6_send(skb,
+ ICMPV6_DEST_UNREACH,
+ ICMPV6_PORT_UNREACH,
+ 0);
+ } else
+#endif
+ icmp_send(skb,
+ ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
+ }
+ }
+ }
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_out: packet continues traversal as normal");
+ return NF_ACCEPT;
+}
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
+}
+
+#endif
+
+/*
+ * Handle ICMP messages in the outside-to-inside direction (incoming).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+ struct net *net = NULL;
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offset, offset2, ihl, verdict;
+ bool ipip;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb);
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ net = skb_net(skb);
+
+ /* Special case for errors for IPIP packets */
+ ipip = false;
+ if (cih->protocol == IPPROTO_IPIP) {
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ /* Error for our IPIP must arrive at LOCAL_IN */
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+ return NF_ACCEPT;
+ offset += cih->ihl * 4;
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ipip = true;
+ }
+
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking incoming ICMP for");
+
+ offset2 = offset;
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ offset = ciph.len;
+ /* The embedded headers contain source and dest in reverse order.
+ * For IPIP this is error for request, not for reply.
+ */
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ verdict = NF_DROP;
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
+ &iph->saddr);
+ goto out;
+ }
+
+ if (ipip) {
+ __be32 info = ic->un.gateway;
+ __u8 type = ic->type;
+ __u8 code = ic->code;
+
+ /* Update the MTU */
+ if (ic->type == ICMP_DEST_UNREACH &&
+ ic->code == ICMP_FRAG_NEEDED) {
+ struct ip_vs_dest *dest = cp->dest;
+ u32 mtu = ntohs(ic->un.frag.mtu);
+ __be16 frag_off = cih->frag_off;
+
+ /* Strip outer IP and ICMP, go to IPIP header */
+ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
+ goto ignore_ipip;
+ offset2 -= ihl + sizeof(_icmph);
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ ipv4_update_pmtu(skb, dev_net(skb->dev),
+ mtu, 0, 0, 0, 0);
+ /* Client uses PMTUD? */
+ if (!(frag_off & htons(IP_DF)))
+ goto ignore_ipip;
+ /* Prefer the resulting PMTU */
+ if (dest) {
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
+ }
+ if (mtu > 68 + sizeof(struct iphdr))
+ mtu -= sizeof(struct iphdr);
+ info = htonl(mtu);
+ }
+ /* Strip outer IP, ICMP and IPIP, go to IP header of
+ * original request.
+ */
+ if (pskb_pull(skb, offset2) == NULL)
+ goto ignore_ipip;
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ type, code, ntohl(info));
+ icmp_send(skb, type, code, info);
+ /* ICMP can be shorter but anyways, account it */
+ ip_vs_out_stats(cp, skb);
+
+ignore_ipip:
+ consume_skb(skb);
+ verdict = NF_STOLEN;
+ goto out;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+ IPPROTO_SCTP == cih->protocol)
+ offset += 2 * sizeof(__u16);
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *iph)
+{
+ struct net *net = NULL;
+ struct ipv6hdr _ip6h, *ip6h;
+ struct icmp6hdr _icmph, *ic;
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offs_ciph, writable, verdict;
+
+ *related = 1;
+
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (iph->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = iph->len + sizeof(_icmph);
+ offs_ciph = ciph.len; /* Save ip header offset */
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, ciph.protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Cannot handle fragmented embedded protocol */
+ if (ciph.fragoffs)
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
+ "Checking incoming ICMPv6 for");
+
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+
+ if (!cp)
+ return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
+ return NF_ACCEPT;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+
+ /* Need to mangle contained IPv6 header in ICMPv6 packet */
+ writable = ciph.len;
+ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
+ IPPROTO_SCTP == ciph.protocol)
+ writable += 2 * sizeof(__u16); /* Also mangle ports */
+
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
+
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+#endif
+
+
+/*
+ * Check if it's for virtual services, look it up,
+ * and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+ int ret, pkts;
+ struct netns_ipvs *ipvs;
+ int conn_reuse_mode;
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /*
+ * Big tappo:
+ * - remote client: only PACKET_HOST
+ * - route: used for struct net when skb->dev is unset
+ */
+ if (unlikely((skb->pkt_type != PACKET_HOST &&
+ hooknum != NF_INET_LOCAL_OUT) ||
+ !skb_dst(skb))) {
+ ip_vs_fill_iph_skb(af, skb, &iph);
+ IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+ " ignored in hook %u\n",
+ skb->pkt_type, iph.protocol,
+ IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
+ return NF_ACCEPT;
+ }
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+ &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ /* Protocol supported? */
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+ /*
+ * Check if the packet belongs to an existing connection entry
+ */
+ cp = pp->conn_in_get(af, skb, &iph, 0);
+
+ conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+ if (conn_reuse_mode && !iph.fragoffs &&
+ is_new_conn(skb, &iph) && cp &&
+ ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight))) ||
+ unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
+ if (!atomic_read(&cp->n_control))
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
+ if (unlikely(!cp) && !iph.fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
+ int v;
+
+ /* Schedule and create new connection entry into &cp */
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+ return v;
+ }
+
+ if (unlikely(!cp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+ /* Check the server status */
+ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ /* the destination server is not available */
+
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ /* try to expire the connection immediately */
+ ip_vs_conn_expire_now(cp);
+ }
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
+ }
+
+ ip_vs_in_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+ if (cp->packet_xmit)
+ ret = cp->packet_xmit(skb, cp, pp, &iph);
+ /* do not touch skb anymore */
+ else {
+ IP_VS_DBG_RL("warning: packet_xmit is null");
+ ret = NF_ACCEPT;
+ }
+
+ /* Increase its packet counter and check if it is needed
+ * to be synchronized
+ *
+ * Sync connection if it is about to close to
+ * encorage the standby servers to update the connections timeout
+ *
+ * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+ */
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = sysctl_sync_threshold(ipvs);
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
+
+ ip_vs_conn_put(cp);
+ return ret;
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
+}
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
+}
+
+#endif
+
+
+/*
+ * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
+ * related packets destined for 0.0.0.0/0.
+ * When fwmark-based virtual service is used, such as transparent
+ * cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
+ * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
+ * and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
+
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp(skb, &r, ops->hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static unsigned int
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ struct ip_vs_iphdr iphdr;
+
+ ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
+
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
+}
+#endif
+
+
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC - 2,
+ },
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_remote_request4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC - 1,
+ },
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST + 1,
+ },
+ /* After mangle, schedule and forward local requests */
+ {
+ .hook = ip_vs_local_request4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST + 2,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+#ifdef CONFIG_IP_VS_IPV6
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC - 2,
+ },
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_remote_request6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC - 1,
+ },
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST + 1,
+ },
+ /* After mangle, schedule and forward local requests */
+ {
+ .hook = ip_vs_local_request6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST + 2,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp_v6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+#endif
+};
+/*
+ * Initialize IP Virtual Server netns mem.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL)
+ return -ENOMEM;
+
+ /* Hold the beast until a service is registerd */
+ ipvs->enable = 0;
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+
+ if (ip_vs_estimator_net_init(net) < 0)
+ goto estimator_fail;
+
+ if (ip_vs_control_net_init(net) < 0)
+ goto control_fail;
+
+ if (ip_vs_protocol_net_init(net) < 0)
+ goto protocol_fail;
+
+ if (ip_vs_app_net_init(net) < 0)
+ goto app_fail;
+
+ if (ip_vs_conn_net_init(net) < 0)
+ goto conn_fail;
+
+ if (ip_vs_sync_net_init(net) < 0)
+ goto sync_fail;
+
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+ ip_vs_conn_net_cleanup(net);
+conn_fail:
+ ip_vs_app_net_cleanup(net);
+app_fail:
+ ip_vs_protocol_net_cleanup(net);
+protocol_fail:
+ ip_vs_control_net_cleanup(net);
+control_fail:
+ ip_vs_estimator_net_cleanup(net);
+estimator_fail:
+ net->ipvs = NULL;
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(net);
+ ip_vs_app_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
+ ip_vs_control_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
+ IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ net->ipvs = NULL;
+}
+
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ net_ipvs(net)->enable = 0; /* Disable packet reception */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(net);
+ LeaveFunction(2);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id,
+ .size = sizeof(struct netns_ipvs),
+};
+
+static struct pernet_operations ipvs_core_dev_ops = {
+ .exit = __ip_vs_dev_cleanup,
+};
+
+/*
+ * Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+ int ret;
+
+ ret = ip_vs_control_init();
+ if (ret < 0) {
+ pr_err("can't setup control.\n");
+ goto exit;
+ }
+
+ ip_vs_protocol_init();
+
+ ret = ip_vs_conn_init();
+ if (ret < 0) {
+ pr_err("can't setup connection table.\n");
+ goto cleanup_protocol;
+ }
+
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ goto cleanup_conn;
+
+ ret = register_pernet_device(&ipvs_core_dev_ops);
+ if (ret < 0)
+ goto cleanup_sub;
+
+ ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ if (ret < 0) {
+ pr_err("can't register hooks.\n");
+ goto cleanup_dev;
+ }
+
+ ret = ip_vs_register_nl_ioctl();
+ if (ret < 0) {
+ pr_err("can't register netlink/ioctl.\n");
+ goto cleanup_hooks;
+ }
+
+ pr_info("ipvs loaded.\n");
+
+ return ret;
+
+cleanup_hooks:
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+cleanup_dev:
+ unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+ unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_conn:
+ ip_vs_conn_cleanup();
+cleanup_protocol:
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+exit:
+ return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+ ip_vs_unregister_nl_ioctl();
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ unregister_pernet_device(&ipvs_core_dev_ops);
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
+ ip_vs_conn_cleanup();
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+ pr_info("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000..285eae3a1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3956 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* sysctl variables */
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+ return sysctl_ip_vs_debug_level;
+}
+#endif
+
+
+/* Protos */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
+
+
+#ifdef CONFIG_IP_VS_IPV6
+/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
+{
+ struct flowi6 fl6 = {
+ .daddr = *addr,
+ };
+ struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+ bool is_local;
+
+ is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
+
+ dst_release(dst);
+ return is_local;
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/*
+ * update_defense_level is called from keventd and from sysctl,
+ * so it needs to protect itself from softirqs
+ */
+static void update_defense_level(struct netns_ipvs *ipvs)
+{
+ struct sysinfo i;
+ static int old_secure_tcp = 0;
+ int availmem;
+ int nomem;
+ int to_change = -1;
+
+ /* we only count free and buffered memory (in pages) */
+ si_meminfo(&i);
+ availmem = i.freeram + i.bufferram;
+ /* however in linux 2.5 the i.bufferram is total page cache size,
+ we need adjust it */
+ /* si_swapinfo(&i); */
+ /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+ nomem = (availmem < ipvs->sysctl_amemthresh);
+
+ local_bh_disable();
+
+ /* drop_entry */
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
+ case 0:
+ atomic_set(&ipvs->dropentry, 0);
+ break;
+ case 1:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ }
+ break;
+ case 2:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
+ };
+ break;
+ case 3:
+ atomic_set(&ipvs->dropentry, 1);
+ break;
+ }
+ spin_unlock(&ipvs->dropentry_lock);
+
+ /* drop_packet */
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
+ case 0:
+ ipvs->drop_rate = 0;
+ break;
+ case 1:
+ if (nomem) {
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
+ } else {
+ ipvs->drop_rate = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ } else {
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
+ }
+ break;
+ case 3:
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
+ break;
+ }
+ spin_unlock(&ipvs->droppacket_lock);
+
+ /* secure_tcp */
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
+ case 0:
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ break;
+ case 1:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ ipvs->sysctl_secure_tcp = 2;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ ipvs->sysctl_secure_tcp = 1;
+ }
+ break;
+ case 3:
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ break;
+ }
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
+ if (to_change >= 0)
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
+
+ local_bh_enable();
+}
+
+
+/*
+ * Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD 1*HZ
+
+static void defense_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
+
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+}
+#endif
+
+int
+ip_vs_use_count_inc(void)
+{
+ return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+ module_put(THIS_MODULE);
+}
+
+
+/*
+ * Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+
+/*
+ * Returns hash value for virtual service
+ */
+static inline unsigned int
+ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ register unsigned int porth = ntohs(port);
+ __be32 addr_fold = addr->ip;
+ __u32 ahash;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
+
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Returns hash value of fwmark for virtual service lookup
+ */
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+{
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
+ * or in the ip_vs_svc_fwm_table by fwmark.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+ unsigned int hash;
+
+ if (svc->flags & IP_VS_SVC_F_HASHED) {
+ pr_err("%s(): request for already hashed, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /*
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
+ */
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
+ } else {
+ /*
+ * Hash it by fwmark in svc_fwm_table
+ */
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ }
+
+ svc->flags |= IP_VS_SVC_F_HASHED;
+ /* increase its refcnt because it is referenced by the svc table */
+ atomic_inc(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Unhashes a service from svc_table / svc_fwm_table.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+ if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+ pr_err("%s(): request for unhash flagged, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /* Remove it from the svc_table table */
+ hlist_del_rcu(&svc->s_list);
+ } else {
+ /* Remove it from the svc_fwm_table table */
+ hlist_del_rcu(&svc->f_list);
+ }
+
+ svc->flags &= ~IP_VS_SVC_F_HASHED;
+ atomic_dec(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Get service by {netns, proto,addr,port} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
+{
+ unsigned int hash;
+ struct ip_vs_service *svc;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
+
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
+ if ((svc->af == af)
+ && ip_vs_addr_equal(af, &svc->addr, vaddr)
+ && (svc->port == vport)
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
+ /* HIT */
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Get service by {fwmark} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+{
+ unsigned int hash;
+ struct ip_vs_service *svc;
+
+ /* Check for fwmark addressed entries */
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
+
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
+ /* HIT */
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+/* Find service, called under RCU lock */
+struct ip_vs_service *
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
+{
+ struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /*
+ * Check the table hashed by fwmark first
+ */
+ if (fwmark) {
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (svc)
+ goto out;
+ }
+
+ /*
+ * Check the table hashed by <protocol,addr,port>
+ * for "full" addressed entries
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+
+ if (svc == NULL
+ && protocol == IPPROTO_TCP
+ && atomic_read(&ipvs->ftpsvc_counter)
+ && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+ /*
+ * Check if ftp service entry exists, the packet
+ * might belong to FTP data connections.
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
+ }
+
+ if (svc == NULL
+ && atomic_read(&ipvs->nullsvc_counter)) {
+ /*
+ * Check if the catch-all port (port zero) exists
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
+ }
+
+ out:
+ IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+ fwmark, ip_vs_proto_name(protocol),
+ IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+ svc ? "hit" : "not hit");
+
+ return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ atomic_inc(&svc->refcnt);
+ rcu_assign_pointer(dest->svc, svc);
+}
+
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+ free_percpu(svc->stats.cpustats);
+ kfree(svc);
+}
+
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_service *svc;
+
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
+
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port));
+ if (do_delay)
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+ else
+ ip_vs_service_free(svc);
+ }
+}
+
+
+/*
+ * Returns hash value for real service
+ */
+static inline unsigned int ip_vs_rs_hashkey(int af,
+ const union nf_inet_addr *addr,
+ __be16 port)
+{
+ register unsigned int porth = ntohs(port);
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+
+ return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
+ & IP_VS_RTAB_MASK;
+}
+
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+{
+ unsigned int hash;
+
+ if (dest->in_rs_table)
+ return;
+
+ /*
+ * Hash by proto,addr,port,
+ * which are the parameters of the real service.
+ */
+ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
+}
+
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+ /*
+ * Remove it from the rs_table table.
+ */
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
+ }
+}
+
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash;
+ struct ip_vs_dest *dest;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
+ /* HIT */
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct ip_vs_dest *dest;
+
+ /*
+ * Find the destination for the given service
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if ((dest->af == dest_af) &&
+ ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
+ (dest->port == dport)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ * Called under RCU lock, no refcnt is returned.
+ */
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
+ const union nf_inet_addr *daddr,
+ __be16 dport,
+ const union nf_inet_addr *vaddr,
+ __be16 vport, __u16 protocol, __u32 fwmark,
+ __u32 flags)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_service *svc;
+ __be16 port = dport;
+
+ svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport);
+ if (!svc)
+ return NULL;
+ if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+ port = 0;
+ dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
+ if (!dest)
+ dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
+ return dest;
+}
+
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst, 1);
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
+}
+
+/*
+ * Lookup dest by {svc,addr,port} in the destination trash.
+ * The destination trash is used to hold the destinations that are removed
+ * from the service table but are still referenced by some conn entries.
+ * The reason to add the destination trash is when the dest is temporary
+ * down (either by administrator or by monitor program), the dest can be
+ * picked back from the trash, the remaining connections to the dest can
+ * continue, and the counting information of the dest is also useful for
+ * scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct ip_vs_dest *dest;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+ /*
+ * Find the destination in trash
+ */
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+ "dest->refcnt=%d\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (dest->af == dest_af &&
+ ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
+ dest->port == dport &&
+ dest->vfwmark == svc->fwmark &&
+ dest->protocol == svc->protocol &&
+ (svc->fwmark ||
+ (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+ dest->vport == svc->port))) {
+ /* HIT */
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
+ }
+ }
+
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
+}
+
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
+
+ __ip_vs_dst_cache_reset(dest);
+ __ip_vs_svc_put(svc, false);
+ free_percpu(dest->stats.cpustats);
+ ip_vs_dest_put_and_free(dest);
+}
+
+/*
+ * Clean up all the destinations in the trash
+ * Called by the ip_vs_control_cleanup()
+ *
+ * When the ip_vs_control_clearup is activated by ipvs module exit,
+ * the service tables must have been flushed and all the connections
+ * are expired, and the refcnt of each destination in the trash must
+ * be 0, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(struct net *net)
+{
+ struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+}
+
+static void
+ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
+{
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
+
+ spin_lock_bh(&src->lock);
+
+ IP_VS_SHOW_STATS_COUNTER(conns);
+ IP_VS_SHOW_STATS_COUNTER(inpkts);
+ IP_VS_SHOW_STATS_COUNTER(outpkts);
+ IP_VS_SHOW_STATS_COUNTER(inbytes);
+ IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+ ip_vs_read_estimator(dst, src);
+
+ spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
+{
+ dst->conns = (u32)src->conns;
+ dst->inpkts = (u32)src->inpkts;
+ dst->outpkts = (u32)src->outpkts;
+ dst->inbytes = src->inbytes;
+ dst->outbytes = src->outbytes;
+ dst->cps = (u32)src->cps;
+ dst->inpps = (u32)src->inpps;
+ dst->outpps = (u32)src->outpps;
+ dst->inbps = (u32)src->inbps;
+ dst->outbps = (u32)src->outbps;
+}
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+ spin_lock_bh(&stats->lock);
+
+ /* get current counters as zero point, rates are zeroed */
+
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
+
+ IP_VS_ZERO_STATS_COUNTER(conns);
+ IP_VS_ZERO_STATS_COUNTER(inpkts);
+ IP_VS_ZERO_STATS_COUNTER(outpkts);
+ IP_VS_ZERO_STATS_COUNTER(inbytes);
+ IP_VS_ZERO_STATS_COUNTER(outbytes);
+
+ ip_vs_zero_estimator(stats);
+
+ spin_unlock_bh(&stats->lock);
+}
+
+/*
+ * Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
+{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
+ struct ip_vs_scheduler *sched;
+ int conn_flags;
+
+ /* We cannot modify an address and change the address family */
+ BUG_ON(!add && udest->af != dest->af);
+
+ if (add && udest->af != svc->af)
+ ipvs->mixed_address_family_dests++;
+
+ /* set the weight and the flags */
+ atomic_set(&dest->weight, udest->weight);
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
+
+ /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
+ conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+ } else {
+ /*
+ * Put the real service in rs_table if not present.
+ * For now only for NAT!
+ */
+ ip_vs_rs_hash(ipvs, dest);
+ }
+ atomic_set(&dest->conn_flags, conn_flags);
+
+ /* bind the service */
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
+ __ip_vs_bind_svc(dest, svc);
+ } else {
+ if (old_svc != svc) {
+ ip_vs_zero_stats(&dest->stats);
+ __ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
+ }
+ }
+
+ /* set the dest status flags */
+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ dest->u_threshold = udest->u_threshold;
+ dest->l_threshold = udest->l_threshold;
+
+ dest->af = udest->af;
+
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (add) {
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
+ }
+}
+
+
+/*
+ * Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+ struct ip_vs_dest **dest_p)
+{
+ struct ip_vs_dest *dest;
+ unsigned int atype, i;
+
+ EnterFunction(2);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (udest->af == AF_INET6) {
+ atype = ipv6_addr_type(&udest->addr.in6);
+ if ((!(atype & IPV6_ADDR_UNICAST) ||
+ atype & IPV6_ADDR_LINKLOCAL) &&
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+ return -EINVAL;
+ } else
+#endif
+ {
+ atype = inet_addr_type(svc->net, udest->addr.ip);
+ if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+ return -EINVAL;
+ }
+
+ dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
+ if (dest == NULL)
+ return -ENOMEM;
+
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats)
+ goto err_alloc;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_dest_stats;
+ ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
+ u64_stats_init(&ip_vs_dest_stats->syncp);
+ }
+
+ dest->af = udest->af;
+ dest->protocol = svc->protocol;
+ dest->vaddr = svc->addr;
+ dest->vport = svc->port;
+ dest->vfwmark = svc->fwmark;
+ ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
+ dest->port = udest->port;
+
+ atomic_set(&dest->activeconns, 0);
+ atomic_set(&dest->inactconns, 0);
+ atomic_set(&dest->persistconns, 0);
+ atomic_set(&dest->refcnt, 1);
+
+ INIT_HLIST_NODE(&dest->d_list);
+ spin_lock_init(&dest->dst_lock);
+ spin_lock_init(&dest->stats.lock);
+ __ip_vs_update_dest(svc, dest, udest, 1);
+
+ *dest_p = dest;
+
+ LeaveFunction(2);
+ return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
+}
+
+
+/*
+ * Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ union nf_inet_addr daddr;
+ __be16 dport = udest->port;
+ int ret;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ pr_err("%s(): server weight less than zero\n", __func__);
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ pr_err("%s(): lower threshold is higher than upper threshold\n",
+ __func__);
+ return -ERANGE;
+ }
+
+ ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
+ rcu_read_unlock();
+
+ if (dest != NULL) {
+ IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
+ return -EEXIST;
+ }
+
+ /*
+ * Check if the dest already exists in the trash and
+ * is from the same service
+ */
+ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
+
+ if (dest != NULL) {
+ IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+ "dest->refcnt=%d, service %u/%s:%u\n",
+ IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
+ atomic_read(&dest->refcnt),
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+ ntohs(dest->vport));
+
+ __ip_vs_update_dest(svc, dest, udest, 1);
+ ret = 0;
+ } else {
+ /*
+ * Allocate and initialize the dest structure
+ */
+ ret = ip_vs_new_dest(svc, udest, &dest);
+ }
+ LeaveFunction(2);
+
+ return ret;
+}
+
+
+/*
+ * Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ union nf_inet_addr daddr;
+ __be16 dport = udest->port;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ pr_err("%s(): server weight less than zero\n", __func__);
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ pr_err("%s(): lower threshold is higher than upper threshold\n",
+ __func__);
+ return -ERANGE;
+ }
+
+ ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
+ rcu_read_unlock();
+
+ if (dest == NULL) {
+ IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
+ return -ENOENT;
+ }
+
+ __ip_vs_update_dest(svc, dest, udest, 0);
+ LeaveFunction(2);
+
+ return 0;
+}
+
+/*
+ * Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_stop_estimator(net, &dest->stats);
+
+ /*
+ * Remove it from the d-linked list with the real services.
+ */
+ ip_vs_rs_unhash(dest);
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = 0;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
+}
+
+
+/*
+ * Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ int svcupd)
+{
+ dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+ /*
+ * Remove it from the d-linked destination list.
+ */
+ list_del_rcu(&dest->n_list);
+ svc->num_dests--;
+
+ if (dest->af != svc->af)
+ net_ipvs(svc->net)->mixed_address_family_dests--;
+
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
+}
+
+
+/*
+ * Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ __be16 dport = udest->port;
+
+ EnterFunction(2);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
+ rcu_read_unlock();
+
+ if (dest == NULL) {
+ IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
+ return -ENOENT;
+ }
+
+ /*
+ * Unlink dest from the service
+ */
+ __ip_vs_unlink_dest(svc, dest, 1);
+
+ /*
+ * Delete the destination
+ */
+ __ip_vs_del_dest(svc->net, dest, false);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+ struct net *net = (struct net *) data;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest *dest, *next;
+ unsigned long now = jiffies;
+
+ spin_lock(&ipvs->dest_trash_lock);
+ list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+ if (atomic_read(&dest->refcnt) > 0)
+ continue;
+ if (dest->idle_start) {
+ if (time_before(now, dest->idle_start +
+ IP_VS_DEST_TRASH_PERIOD))
+ continue;
+ } else {
+ dest->idle_start = max(1UL, now);
+ continue;
+ }
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+ if (!list_empty(&ipvs->dest_trash))
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ spin_unlock(&ipvs->dest_trash_lock);
+}
+
+/*
+ * Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+ struct ip_vs_service **svc_p)
+{
+ int ret = 0, i;
+ struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_pe *pe = NULL;
+ struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Lookup the scheduler by 'u->sched_name' */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ }
+#endif
+
+ svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
+ if (svc == NULL) {
+ IP_VS_DBG(1, "%s(): no memory\n", __func__);
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_stats;
+ ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+ u64_stats_init(&ip_vs_stats->syncp);
+ }
+
+
+ /* I'm the first user of the service */
+ atomic_set(&svc->refcnt, 0);
+
+ svc->af = u->af;
+ svc->protocol = u->protocol;
+ ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+ svc->port = u->port;
+ svc->fwmark = u->fwmark;
+ svc->flags = u->flags;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+ svc->net = net;
+
+ INIT_LIST_HEAD(&svc->destinations);
+ spin_lock_init(&svc->sched_lock);
+ spin_lock_init(&svc->stats.lock);
+
+ /* Bind the scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
+
+ /* Bind the ct retriever */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
+
+ /* Update the virtual service counters */
+ if (svc->port == FTPPORT)
+ atomic_inc(&ipvs->ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_inc(&ipvs->nullsvc_counter);
+
+ ip_vs_start_estimator(net, &svc->stats);
+
+ /* Count only IPv4 services for old get/setsockopt interface */
+ if (svc->af == AF_INET)
+ ipvs->num_services++;
+
+ /* Hash the service into the service table */
+ ip_vs_svc_hash(svc);
+
+ *svc_p = svc;
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
+ return 0;
+
+
+ out_err:
+ if (svc != NULL) {
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
+ }
+ ip_vs_scheduler_put(sched);
+ ip_vs_pe_put(pe);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+/*
+ * Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+ struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_pe *pe = NULL, *old_pe = NULL;
+ int ret = 0;
+
+ /*
+ * Lookup the scheduler, by 'u->sched_name'
+ */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ return -ENOENT;
+ }
+ old_sched = sched;
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out;
+ }
+ old_pe = pe;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+#endif
+
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
+
+ /*
+ * Set the flags and timeout value
+ */
+ svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
+
+out:
+ ip_vs_scheduler_put(old_sched);
+ ip_vs_pe_put(old_pe);
+ return ret;
+}
+
+/*
+ * Delete a service from the service list
+ * - The service must be unlinked, unlocked and not referenced!
+ * - We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
+{
+ struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_scheduler *old_sched;
+ struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+ pr_info("%s: enter\n", __func__);
+
+ /* Count only IPv4 services for old get/setsockopt interface */
+ if (svc->af == AF_INET)
+ ipvs->num_services--;
+
+ ip_vs_stop_estimator(svc->net, &svc->stats);
+
+ /* Unbind scheduler */
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ ip_vs_pe_put(old_pe);
+
+ /*
+ * Unlink the whole destination list
+ */
+ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+ __ip_vs_unlink_dest(svc, dest, 0);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
+ }
+
+ /*
+ * Update the virtual service counters
+ */
+ if (svc->port == FTPPORT)
+ atomic_dec(&ipvs->ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_dec(&ipvs->nullsvc_counter);
+
+ /*
+ * Free the service if nobody refers to it
+ */
+ __ip_vs_svc_put(svc, true);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
+
+/*
+ * Unlink a service from list and try to delete it if its refcnt reached 0
+ */
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
+{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
+ /*
+ * Unhash it from the service table
+ */
+ ip_vs_svc_unhash(svc);
+
+ __ip_vs_del_service(svc, cleanup);
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+ ip_vs_unlink_service(svc, false);
+
+ return 0;
+}
+
+
+/*
+ * Flush all the virtual services
+ */
+static int ip_vs_flush(struct net *net, bool cleanup)
+{
+ int idx;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
+
+ /*
+ * Flush the service table hashed by <netns,protocol,addr,port>
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
+ }
+ }
+
+ /*
+ * Flush the service table hashed by fwmark
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Delete service by {netns} in the service table.
+ * Called by __ip_vs_cleanup()
+ */
+void ip_vs_service_net_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ /* Check for "full" addressed entries */
+ mutex_lock(&__ip_vs_mutex);
+ ip_vs_flush(net, true);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+}
+
+/* Put all references for device (dst_cache) */
+static inline void
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
+{
+ struct ip_vs_dest_dst *dest_dst;
+
+ spin_lock_bh(&dest->dst_lock);
+ dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
+ if (dest_dst && dest_dst->dst_cache->dev == dev) {
+ IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
+ dev->name,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ __ip_vs_dst_cache_reset(dest);
+ }
+ spin_unlock_bh(&dest->dst_lock);
+
+}
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
+ */
+static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ unsigned int idx;
+
+ if (event != NETDEV_DOWN || !ipvs)
+ return NOTIFY_DONE;
+ IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+ EnterFunction(2);
+ mutex_lock(&__ip_vs_mutex);
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+ }
+
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+
+ }
+ }
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ ip_vs_zero_stats(&dest->stats);
+ }
+ ip_vs_zero_stats(&svc->stats);
+ return 0;
+}
+
+static int ip_vs_zero_all(struct net *net)
+{
+ int idx;
+ struct ip_vs_service *svc;
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
+
+static int
+proc_do_defense_mode(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 3)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ update_defense_level(net_ipvs(net));
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_threshold(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val[2];
+ int rc;
+
+ /* backup the value first */
+ memcpy(val, valp, sizeof(val));
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (valp[0] < 0 || valp[1] < 0 ||
+ (valp[0] >= valp[1] && valp[1]))) {
+ /* Restore the correct value */
+ memcpy(valp, val, sizeof(val));
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_mode(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if (*valp < 1 || !is_power_of_2(*valp)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
+
+/*
+ * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change order or insert new entries without
+ * align with netns init in ip_vs_control_net_init()
+ */
+
+static struct ctl_table vs_vars[] = {
+ {
+ .procname = "amemthresh",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "am_droprate",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "drop_entry",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+ {
+ .procname = "drop_packet",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+#ifdef CONFIG_IP_VS_NFCT
+ {
+ .procname = "conntrack",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .procname = "secure_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+ {
+ .procname = "snat_reroute",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "sync_ports",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_ports,
+ },
+ {
+ .procname = "sync_persist_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_qlen_max",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "sync_sock_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_sctp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "sync_refresh_period",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "sync_retries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "pmtu_disc",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "backup_only",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "conn_reuse_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ { }
+};
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+ struct seq_net_private p; /* Do not move this, netns depends upon it*/
+ struct hlist_head *table;
+ int bucket;
+};
+
+/*
+ * Write the contents of the VS rule table to a PROCfs file.
+ * (It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned int flags)
+{
+ switch (flags & IP_VS_CONN_F_FWD_MASK) {
+ case IP_VS_CONN_F_LOCALNODE:
+ return "Local";
+ case IP_VS_CONN_F_TUNNEL:
+ return "Tunnel";
+ case IP_VS_CONN_F_DROUTE:
+ return "Route";
+ default:
+ return "Masq";
+ }
+}
+
+
+/* Get the Nth entry in the two lists */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_vs_iter *iter = seq->private;
+ int idx;
+ struct ip_vs_service *svc;
+
+ /* look in hash by protocol */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
+ iter->table = ip_vs_svc_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ /* keep looking in fwmark */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct hlist_node *e;
+ struct ip_vs_iter *iter;
+ struct ip_vs_service *svc;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_info_array(seq,0);
+
+ svc = v;
+ iter = seq->private;
+
+ if (iter->table == ip_vs_svc_table) {
+ /* next service in table hashed by protocol */
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
+
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
+ return svc;
+ }
+ }
+
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = -1;
+ goto scan_fwmark;
+ }
+
+ /* next service in hashed by fwmark */
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
+ return svc;
+ }
+
+ return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq,
+ "IP Virtual Server version %d.%d.%d (size=%d)\n",
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ seq_puts(seq,
+ "Prot LocalAddress:Port Scheduler Flags\n");
+ seq_puts(seq,
+ " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+ } else {
+ const struct ip_vs_service *svc = v;
+ const struct ip_vs_iter *iter = seq->private;
+ const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+
+ if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ seq_printf(seq, "%s [%pI6]:%04X %s ",
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6,
+ ntohs(svc->port),
+ sched->name);
+ else
+#endif
+ seq_printf(seq, "%s %08X:%04X %s %s ",
+ ip_vs_proto_name(svc->protocol),
+ ntohl(svc->addr.ip),
+ ntohs(svc->port),
+ sched->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+ } else {
+ seq_printf(seq, "FWM %08X %s %s",
+ svc->fwmark, sched->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+ }
+
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ seq_printf(seq, "persistent %d %08X\n",
+ svc->timeout,
+ ntohl(svc->netmask));
+ else
+ seq_putc(seq, '\n');
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (dest->af == AF_INET6)
+ seq_printf(seq,
+ " -> [%pI6]:%04X"
+ " %-7s %-6d %-10d %-10d\n",
+ &dest->addr.in6,
+ ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+ else
+#endif
+ seq_printf(seq,
+ " -> %08X:%04X "
+ "%-7s %-6d %-10d %-10d\n",
+ ntohl(dest->addr.ip),
+ ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+
+ }
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_info_seq_ops = {
+ .start = ip_vs_info_seq_start,
+ .next = ip_vs_info_seq_next,
+ .stop = ip_vs_info_seq_stop,
+ .show = ip_vs_info_seq_show,
+};
+
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
+ sizeof(struct ip_vs_iter));
+}
+
+static const struct file_operations ip_vs_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_kstats show;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ " Conns Packets Packets Bytes Bytes\n");
+
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
+ (unsigned long long)show.conns,
+ (unsigned long long)show.inpkts,
+ (unsigned long long)show.outpkts,
+ (unsigned long long)show.inbytes,
+ (unsigned long long)show.outbytes);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
+ (unsigned long long)show.cps,
+ (unsigned long long)show.inpps,
+ (unsigned long long)show.outpps,
+ (unsigned long long)show.inbps,
+ (unsigned long long)show.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_show);
+}
+
+static const struct file_operations ip_vs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
+ struct ip_vs_kstats kstats;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
+ unsigned int start;
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&u->syncp);
+ conns = u->cnt.conns;
+ inpkts = u->cnt.inpkts;
+ outpkts = u->cnt.outpkts;
+ inbytes = u->cnt.inbytes;
+ outbytes = u->cnt.outbytes;
+ } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+
+ seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
+ i, (u64)conns, (u64)inpkts,
+ (u64)outpkts, (u64)inbytes,
+ (u64)outbytes);
+ }
+
+ ip_vs_copy_stats(&kstats, tot_stats);
+
+ seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
+ (unsigned long long)kstats.conns,
+ (unsigned long long)kstats.inpkts,
+ (unsigned long long)kstats.outpkts,
+ (unsigned long long)kstats.inbytes,
+ (unsigned long long)kstats.outbytes);
+
+/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
+ kstats.cps,
+ kstats.inpps,
+ kstats.outpps,
+ kstats.inbps,
+ kstats.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+#endif
+
+/*
+ * Set timeout values for tcp tcpfin udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
+ IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+ u->tcp_timeout,
+ u->tcp_fin_timeout,
+ u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (u->tcp_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ = u->tcp_timeout * HZ;
+ }
+
+ if (u->tcp_fin_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ = u->tcp_fin_timeout * HZ;
+ }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (u->udp_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
+ = u->udp_timeout * HZ;
+ }
+#endif
+ return 0;
+}
+
+#define CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+
+struct ip_vs_svcdest_user {
+ struct ip_vs_service_user s;
+ struct ip_vs_dest_user d;
+};
+
+static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
+ [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
+ [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
+ [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user),
+ [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user),
+};
+
+union ip_vs_set_arglen {
+ struct ip_vs_service_user field_IP_VS_SO_SET_ADD;
+ struct ip_vs_service_user field_IP_VS_SO_SET_EDIT;
+ struct ip_vs_service_user field_IP_VS_SO_SET_DEL;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST;
+ struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT;
+ struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON;
+ struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON;
+ struct ip_vs_service_user field_IP_VS_SO_SET_ZERO;
+};
+
+#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen)
+
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+ struct ip_vs_service_user *usvc_compat)
+{
+ memset(usvc, 0, sizeof(*usvc));
+
+ usvc->af = AF_INET;
+ usvc->protocol = usvc_compat->protocol;
+ usvc->addr.ip = usvc_compat->addr;
+ usvc->port = usvc_compat->port;
+ usvc->fwmark = usvc_compat->fwmark;
+
+ /* Deep copy of sched_name is not needed here */
+ usvc->sched_name = usvc_compat->sched_name;
+
+ usvc->flags = usvc_compat->flags;
+ usvc->timeout = usvc_compat->timeout;
+ usvc->netmask = usvc_compat->netmask;
+}
+
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+ struct ip_vs_dest_user *udest_compat)
+{
+ memset(udest, 0, sizeof(*udest));
+
+ udest->addr.ip = udest_compat->addr;
+ udest->port = udest_compat->port;
+ udest->conn_flags = udest_compat->conn_flags;
+ udest->weight = udest_compat->weight;
+ udest->u_threshold = udest_compat->u_threshold;
+ udest->l_threshold = udest_compat->l_threshold;
+ udest->af = AF_INET;
+}
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ struct net *net = sock_net(sk);
+ int ret;
+ unsigned char arg[MAX_SET_ARGLEN];
+ struct ip_vs_service_user *usvc_compat;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
+ struct ip_vs_dest_user *udest_compat;
+ struct ip_vs_dest_user_kern udest;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ BUILD_BUG_ON(sizeof(arg) > 255);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+ return -EINVAL;
+ if (len != set_arglen[CMDID(cmd)]) {
+ IP_VS_DBG(1, "set_ctl: len %u != %u\n",
+ len, set_arglen[CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, len) != 0)
+ return -EFAULT;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Handle daemons since they have another lock */
+ if (cmd == IP_VS_SO_SET_STARTDAEMON ||
+ cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+ mutex_lock(&ipvs->sync_mutex);
+ if (cmd == IP_VS_SO_SET_STARTDAEMON)
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
+ else
+ ret = stop_sync_thread(net, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ goto out_dec;
+ }
+
+ mutex_lock(&__ip_vs_mutex);
+ if (cmd == IP_VS_SO_SET_FLUSH) {
+ /* Flush the virtual service */
+ ret = ip_vs_flush(net, false);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+ /* Set timeout values for (tcp tcpfin udp) */
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+ goto out_unlock;
+ }
+
+ usvc_compat = (struct ip_vs_service_user *)arg;
+ udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+ /* We only use the new structs internally, so copy userspace compat
+ * structs to extended internal versions */
+ ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+ ip_vs_copy_udest_compat(&udest, udest_compat);
+
+ if (cmd == IP_VS_SO_SET_ZERO) {
+ /* if no service address is set, zero counters in all */
+ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+ ret = ip_vs_zero_all(net);
+ goto out_unlock;
+ }
+ }
+
+ /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
+ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
+ usvc.protocol != IPPROTO_SCTP) {
+ pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
+ usvc.protocol, &usvc.addr.ip,
+ ntohs(usvc.port), usvc.sched_name);
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
+ if (usvc.fwmark == 0)
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+ &usvc.addr, usvc.port);
+ else
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
+
+ if (cmd != IP_VS_SO_SET_ADD
+ && (svc == NULL || svc->protocol != usvc.protocol)) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case IP_VS_SO_SET_ADD:
+ if (svc != NULL)
+ ret = -EEXIST;
+ else
+ ret = ip_vs_add_service(net, &usvc, &svc);
+ break;
+ case IP_VS_SO_SET_EDIT:
+ ret = ip_vs_edit_service(svc, &usvc);
+ break;
+ case IP_VS_SO_SET_DEL:
+ ret = ip_vs_del_service(svc);
+ if (!ret)
+ goto out_unlock;
+ break;
+ case IP_VS_SO_SET_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ case IP_VS_SO_SET_ADDDEST:
+ ret = ip_vs_add_dest(svc, &udest);
+ break;
+ case IP_VS_SO_SET_EDITDEST:
+ ret = ip_vs_edit_dest(svc, &udest);
+ break;
+ case IP_VS_SO_SET_DELDEST:
+ ret = ip_vs_del_dest(svc, &udest);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ out_unlock:
+ mutex_unlock(&__ip_vs_mutex);
+ out_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_kstats kstats;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
+ dst->protocol = src->protocol;
+ dst->addr = src->addr.ip;
+ dst->port = src->port;
+ dst->fwmark = src->fwmark;
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
+ dst->flags = src->flags;
+ dst->timeout = src->timeout / HZ;
+ dst->netmask = src->netmask;
+ dst->num_dests = src->num_dests;
+ ip_vs_copy_stats(&kstats, &src->stats);
+ ip_vs_export_stats_user(&dst->stats, &kstats);
+}
+
+static inline int
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
+ struct ip_vs_get_services __user *uptr)
+{
+ int idx, count=0;
+ struct ip_vs_service *svc;
+ struct ip_vs_service_entry entry;
+ int ret = 0;
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ /* Only expose IPv4 entries to old interface */
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
+ continue;
+
+ if (count >= get->num_services)
+ goto out;
+ memset(&entry, 0, sizeof(entry));
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ /* Only expose IPv4 entries to old interface */
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
+ continue;
+
+ if (count >= get->num_services)
+ goto out;
+ memset(&entry, 0, sizeof(entry));
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+out:
+ return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+ struct ip_vs_get_dests __user *uptr)
+{
+ struct ip_vs_service *svc;
+ union nf_inet_addr addr = { .ip = get->addr };
+ int ret = 0;
+
+ rcu_read_lock();
+ if (get->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
+ else
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+ get->port);
+ rcu_read_unlock();
+
+ if (svc) {
+ int count = 0;
+ struct ip_vs_dest *dest;
+ struct ip_vs_dest_entry entry;
+ struct ip_vs_kstats kstats;
+
+ memset(&entry, 0, sizeof(entry));
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (count >= get->num_dests)
+ break;
+
+ /* Cannot expose heterogeneous members via sockopt
+ * interface
+ */
+ if (dest->af != svc->af)
+ continue;
+
+ entry.addr = dest->addr.ip;
+ entry.port = dest->port;
+ entry.conn_flags = atomic_read(&dest->conn_flags);
+ entry.weight = atomic_read(&dest->weight);
+ entry.u_threshold = dest->u_threshold;
+ entry.l_threshold = dest->l_threshold;
+ entry.activeconns = atomic_read(&dest->activeconns);
+ entry.inactconns = atomic_read(&dest->inactconns);
+ entry.persistconns = atomic_read(&dest->persistconns);
+ ip_vs_copy_stats(&kstats, &dest->stats);
+ ip_vs_export_stats_user(&entry.stats, &kstats);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ break;
+ }
+ count++;
+ }
+ } else
+ ret = -ESRCH;
+ return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
+ memset(u, 0, sizeof (*u));
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ u->udp_timeout =
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
+ [CMDID(IP_VS_SO_GET_VERSION)] = 64,
+ [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo),
+ [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
+ [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry),
+ [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests),
+ [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
+ [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user),
+};
+
+union ip_vs_get_arglen {
+ char field_IP_VS_SO_GET_VERSION[64];
+ struct ip_vs_getinfo field_IP_VS_SO_GET_INFO;
+ struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES;
+ struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE;
+ struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS;
+ struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT;
+ struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2];
+};
+
+#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen)
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ unsigned char arg[MAX_GET_ARGLEN];
+ int ret = 0;
+ unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ BUG_ON(!net);
+ BUILD_BUG_ON(sizeof(arg) > 255);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+ return -EINVAL;
+
+ copylen = get_arglen[CMDID(cmd)];
+ if (*len < (int) copylen) {
+ IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, copylen) != 0)
+ return -EFAULT;
+ /*
+ * Handle daemons first since it has its own locking
+ */
+ if (cmd == IP_VS_SO_GET_DAEMON) {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ mutex_lock(&ipvs->sync_mutex);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
+ }
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
+ }
+
+ mutex_lock(&__ip_vs_mutex);
+ switch (cmd) {
+ case IP_VS_SO_GET_VERSION:
+ {
+ char buf[64];
+
+ sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *len = strlen(buf)+1;
+ }
+ break;
+
+ case IP_VS_SO_GET_INFO:
+ {
+ struct ip_vs_getinfo info;
+ info.version = IP_VS_VERSION_CODE;
+ info.size = ip_vs_conn_tab_size;
+ info.num_services = ipvs->num_services;
+ if (copy_to_user(user, &info, sizeof(info)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICES:
+ {
+ struct ip_vs_get_services *get;
+ int size;
+
+ get = (struct ip_vs_get_services *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_service_entry) * get->num_services;
+ if (*len != size) {
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_service_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICE:
+ {
+ struct ip_vs_service_entry *entry;
+ struct ip_vs_service *svc;
+ union nf_inet_addr addr;
+
+ entry = (struct ip_vs_service_entry *)arg;
+ addr.ip = entry->addr;
+ rcu_read_lock();
+ if (entry->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+ else
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
+ rcu_read_unlock();
+ if (svc) {
+ ip_vs_copy_service(entry, svc);
+ if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+ ret = -EFAULT;
+ } else
+ ret = -ESRCH;
+ }
+ break;
+
+ case IP_VS_SO_GET_DESTS:
+ {
+ struct ip_vs_get_dests *get;
+ int size;
+
+ get = (struct ip_vs_get_dests *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ if (*len != size) {
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_dest_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_TIMEOUT:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+ if (copy_to_user(user, &t, sizeof(t)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+ return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IP_VS_BASE_CTL,
+ .set_optmax = IP_VS_SO_SET_MAX+1,
+ .set = do_ip_vs_set_ctl,
+ .get_optmin = IP_VS_BASE_CTL,
+ .get_optmax = IP_VS_SO_GET_MAX+1,
+ .get = do_ip_vs_get_ctl,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/* IPVS genetlink family */
+static struct genl_family ip_vs_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = IPVS_GENL_NAME,
+ .version = IPVS_GENL_VERSION,
+ .maxattr = IPVS_CMD_MAX,
+ .netnsok = true, /* Make ipvsadm to work on netns */
+};
+
+/* Policy used for first-level command attributes */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+ [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
+ [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
+ [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+ [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_IFNAME_MAXLEN },
+ [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+ [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(union nf_inet_addr) },
+ [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_SCHEDNAME_MAXLEN },
+ [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_PENAME_MAXLEN },
+ [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
+ .len = sizeof(struct ip_vs_flags) },
+ [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+ [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(union nf_inet_addr) },
+ [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
+ [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
+};
+
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+ struct ip_vs_kstats *kstats)
+{
+ struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+
+ if (!nl_stats)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_stats);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_stats);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
+ struct ip_vs_kstats *kstats)
+{
+ struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+
+ if (!nl_stats)
+ return -EMSGSIZE;
+
+ if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_stats);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_stats);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+ struct ip_vs_service *svc)
+{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_pe *pe;
+ struct nlattr *nl_service;
+ struct ip_vs_flags flags = { .flags = svc->flags,
+ .mask = ~0 };
+ struct ip_vs_kstats kstats;
+
+ nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+ if (!nl_service)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
+ goto nla_put_failure;
+ if (svc->fwmark) {
+ if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
+ nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
+ nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+ goto nla_put_failure;
+ }
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ pe = rcu_dereference_protected(svc->pe, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
+ nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
+ nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
+ nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+ goto nla_put_failure;
+ ip_vs_copy_stats(&kstats, &svc->stats);
+ if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
+ goto nla_put_failure;
+ if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nl_service);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_service);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+ struct ip_vs_service *svc,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_SERVICE);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_service(skb, svc) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int idx = 0, i;
+ int start = cb->args[0];
+ struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
+
+ mutex_lock(&__ip_vs_mutex);
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+ }
+
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+ }
+
+nla_put_failure:
+ mutex_unlock(&__ip_vs_mutex);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
+ struct nlattr *nla, int full_entry,
+ struct ip_vs_service **ret_svc)
+{
+ struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+ struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+ struct ip_vs_service *svc;
+
+ /* Parse mandatory identifying service fields first */
+ if (nla == NULL ||
+ nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+ return -EINVAL;
+
+ nla_af = attrs[IPVS_SVC_ATTR_AF];
+ nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
+ nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
+ nla_port = attrs[IPVS_SVC_ATTR_PORT];
+ nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
+
+ if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+ return -EINVAL;
+
+ memset(usvc, 0, sizeof(*usvc));
+
+ usvc->af = nla_get_u16(nla_af);
+#ifdef CONFIG_IP_VS_IPV6
+ if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+ if (usvc->af != AF_INET)
+#endif
+ return -EAFNOSUPPORT;
+
+ if (nla_fwmark) {
+ usvc->protocol = IPPROTO_TCP;
+ usvc->fwmark = nla_get_u32(nla_fwmark);
+ } else {
+ usvc->protocol = nla_get_u16(nla_protocol);
+ nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+ usvc->port = nla_get_be16(nla_port);
+ usvc->fwmark = 0;
+ }
+
+ rcu_read_lock();
+ if (usvc->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+ else
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+ &usvc->addr, usvc->port);
+ rcu_read_unlock();
+ *ret_svc = svc;
+
+ /* If a full entry was requested, check for the additional fields */
+ if (full_entry) {
+ struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
+ *nla_netmask;
+ struct ip_vs_flags flags;
+
+ nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+ nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
+ nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+ nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+ nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+ if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+ return -EINVAL;
+
+ nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+ /* prefill flags from service if it already exists */
+ if (svc)
+ usvc->flags = svc->flags;
+
+ /* set new flags from userland */
+ usvc->flags = (usvc->flags & ~flags.mask) |
+ (flags.flags & flags.mask);
+ usvc->sched_name = nla_data(nla_sched);
+ usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
+ usvc->timeout = nla_get_u32(nla_timeout);
+ usvc->netmask = nla_get_be32(nla_netmask);
+ }
+
+ return 0;
+}
+
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
+{
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
+ int ret;
+
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
+ return ret ? ERR_PTR(ret) : svc;
+}
+
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+ struct nlattr *nl_dest;
+ struct ip_vs_kstats kstats;
+
+ nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+ if (!nl_dest)
+ return -EMSGSIZE;
+
+ if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+ (atomic_read(&dest->conn_flags) &
+ IP_VS_CONN_F_FWD_MASK)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
+ atomic_read(&dest->weight)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+ atomic_read(&dest->activeconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+ atomic_read(&dest->inactconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+ atomic_read(&dest->persistconns)) ||
+ nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
+ goto nla_put_failure;
+ ip_vs_copy_stats(&kstats, &dest->stats);
+ if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
+ goto nla_put_failure;
+ if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nl_dest);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_dest);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_DEST);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_dest(skb, dest) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int idx = 0;
+ int start = cb->args[0];
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Try to find the service for which to dump destinations */
+ if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+ IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+ goto out_err;
+
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
+ if (IS_ERR(svc) || svc == NULL)
+ goto out_err;
+
+ /* Dump the destinations */
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (++idx <= start)
+ continue;
+ if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+
+nla_put_failure:
+ cb->args[0] = idx;
+
+out_err:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return skb->len;
+}
+
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+ struct nlattr *nla, int full_entry)
+{
+ struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+ struct nlattr *nla_addr, *nla_port;
+ struct nlattr *nla_addr_family;
+
+ /* Parse mandatory identifying destination fields first */
+ if (nla == NULL ||
+ nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+ return -EINVAL;
+
+ nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
+ nla_port = attrs[IPVS_DEST_ATTR_PORT];
+ nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
+
+ if (!(nla_addr && nla_port))
+ return -EINVAL;
+
+ memset(udest, 0, sizeof(*udest));
+
+ nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+ udest->port = nla_get_be16(nla_port);
+
+ if (nla_addr_family)
+ udest->af = nla_get_u16(nla_addr_family);
+ else
+ udest->af = 0;
+
+ /* If a full entry was requested, check for the additional fields */
+ if (full_entry) {
+ struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+ *nla_l_thresh;
+
+ nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
+ nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
+ nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
+ nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
+
+ if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+ return -EINVAL;
+
+ udest->conn_flags = nla_get_u32(nla_fwd)
+ & IP_VS_CONN_F_FWD_MASK;
+ udest->weight = nla_get_u32(nla_weight);
+ udest->u_threshold = nla_get_u32(nla_u_thresh);
+ udest->l_threshold = nla_get_u32(nla_l_thresh);
+ }
+
+ return 0;
+}
+
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid)
+{
+ struct nlattr *nl_daemon;
+
+ nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+ if (!nl_daemon)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_daemon);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_daemon);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_DAEMON);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = skb_sknet(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
+ goto nla_put_failure;
+
+ cb->args[0] = 1;
+ }
+
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
+ goto nla_put_failure;
+
+ cb->args[1] = 1;
+ }
+
+nla_put_failure:
+ mutex_unlock(&ipvs->sync_mutex);
+
+ return skb->len;
+}
+
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+{
+ if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
+ attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
+ attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
+ return -EINVAL;
+
+ /* The synchronization protocol is incompatible with mixed family
+ * services
+ */
+ if (net_ipvs(net)->mixed_address_family_dests > 0)
+ return -EINVAL;
+
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+}
+
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+{
+ if (!attrs[IPVS_DAEMON_ATTR_STATE])
+ return -EINVAL;
+
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+}
+
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+{
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
+ t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
+ t.tcp_fin_timeout =
+ nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
+ t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
+
+ return ip_vs_set_timeout(net, &t);
+}
+
+static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret = 0, cmd;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
+ cmd = info->genlhdr->cmd;
+
+ if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
+ struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+ mutex_lock(&ipvs->sync_mutex);
+ if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+ nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+ info->attrs[IPVS_CMD_ATTR_DAEMON],
+ ip_vs_daemon_policy)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cmd == IPVS_CMD_NEW_DAEMON)
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+ else
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+out:
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+ return ret;
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ip_vs_service *svc = NULL;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_dest_user_kern udest;
+ int ret = 0, cmd;
+ int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ if (cmd == IPVS_CMD_FLUSH) {
+ ret = ip_vs_flush(net, false);
+ goto out;
+ } else if (cmd == IPVS_CMD_SET_CONFIG) {
+ ret = ip_vs_genl_set_config(net, info->attrs);
+ goto out;
+ } else if (cmd == IPVS_CMD_ZERO &&
+ !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+ ret = ip_vs_zero_all(net);
+ goto out;
+ }
+
+ /* All following commands require a service argument, so check if we
+ * received a valid one. We need a full service specification when
+ * adding / editing a service. Only identifying members otherwise. */
+ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+ need_full_svc = 1;
+
+ ret = ip_vs_genl_parse_service(net, &usvc,
+ info->attrs[IPVS_CMD_ATTR_SERVICE],
+ need_full_svc, &svc);
+ if (ret)
+ goto out;
+
+ /* Unless we're adding a new service, the service must already exist */
+ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ /* Destination commands require a valid destination argument. For
+ * adding / editing a destination, we need a full destination
+ * specification. */
+ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+ cmd == IPVS_CMD_DEL_DEST) {
+ if (cmd != IPVS_CMD_DEL_DEST)
+ need_full_dest = 1;
+
+ ret = ip_vs_genl_parse_dest(&udest,
+ info->attrs[IPVS_CMD_ATTR_DEST],
+ need_full_dest);
+ if (ret)
+ goto out;
+
+ /* Old protocols did not allow the user to specify address
+ * family, so we set it to zero instead. We also didn't
+ * allow heterogeneous pools in the old code, so it's safe
+ * to assume that this will have the same address family as
+ * the service.
+ */
+ if (udest.af == 0)
+ udest.af = svc->af;
+
+ if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
+ /* The synchronization protocol is incompatible
+ * with mixed family services
+ */
+ if (net_ipvs(net)->sync_state) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Which connection types do we support? */
+ switch (udest.conn_flags) {
+ case IP_VS_CONN_F_TUNNEL:
+ /* We are able to forward this */
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ switch (cmd) {
+ case IPVS_CMD_NEW_SERVICE:
+ if (svc == NULL)
+ ret = ip_vs_add_service(net, &usvc, &svc);
+ else
+ ret = -EEXIST;
+ break;
+ case IPVS_CMD_SET_SERVICE:
+ ret = ip_vs_edit_service(svc, &usvc);
+ break;
+ case IPVS_CMD_DEL_SERVICE:
+ ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
+ break;
+ case IPVS_CMD_NEW_DEST:
+ ret = ip_vs_add_dest(svc, &udest);
+ break;
+ case IPVS_CMD_SET_DEST:
+ ret = ip_vs_edit_dest(svc, &udest);
+ break;
+ case IPVS_CMD_DEL_DEST:
+ ret = ip_vs_del_dest(svc, &udest);
+ break;
+ case IPVS_CMD_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *reply;
+ int ret, cmd, reply_cmd;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ if (cmd == IPVS_CMD_GET_SERVICE)
+ reply_cmd = IPVS_CMD_NEW_SERVICE;
+ else if (cmd == IPVS_CMD_GET_INFO)
+ reply_cmd = IPVS_CMD_SET_INFO;
+ else if (cmd == IPVS_CMD_GET_CONFIG)
+ reply_cmd = IPVS_CMD_SET_CONFIG;
+ else {
+ pr_err("unknown Generic Netlink command\n");
+ return -EINVAL;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+ if (reply == NULL)
+ goto nla_put_failure;
+
+ switch (cmd) {
+ case IPVS_CMD_GET_SERVICE:
+ {
+ struct ip_vs_service *svc;
+
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ if (IS_ERR(svc)) {
+ ret = PTR_ERR(svc);
+ goto out_err;
+ } else if (svc) {
+ ret = ip_vs_genl_fill_service(msg, svc);
+ if (ret)
+ goto nla_put_failure;
+ } else {
+ ret = -ESRCH;
+ goto out_err;
+ }
+
+ break;
+ }
+
+ case IPVS_CMD_GET_CONFIG:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
+ t.tcp_timeout) ||
+ nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+ t.tcp_fin_timeout))
+ goto nla_put_failure;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
+ goto nla_put_failure;
+#endif
+
+ break;
+ }
+
+ case IPVS_CMD_GET_INFO:
+ if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
+ IP_VS_VERSION_CODE) ||
+ nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+ ip_vs_conn_tab_size))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ goto out;
+
+nla_put_failure:
+ pr_err("not enough space in Netlink message\n");
+ ret = -EMSGSIZE;
+
+out_err:
+ nlmsg_free(msg);
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+
+
+static const struct genl_ops ip_vs_genl_ops[] = {
+ {
+ .cmd = IPVS_CMD_NEW_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ .dumpit = ip_vs_genl_dump_services,
+ .policy = ip_vs_cmd_policy,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .dumpit = ip_vs_genl_dump_dests,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .dumpit = ip_vs_genl_dump_daemons,
+ },
+ {
+ .cmd = IPVS_CMD_SET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_INFO,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_ZERO,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_FLUSH,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_set_cmd,
+ },
+};
+
+static int __init ip_vs_genl_register(void)
+{
+ return genl_register_family_with_ops(&ip_vs_genl_family,
+ ip_vs_genl_ops);
+}
+
+static void ip_vs_genl_unregister(void)
+{
+ genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+/*
+ * per netns intit/exit func.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+ ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+ ipvs->sysctl_pmtu_disc = 1;
+ tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
+ ipvs->sysctl_conn_reuse_mode = 1;
+ tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
+
+
+ ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return -ENOMEM;
+ }
+ ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+
+ return 0;
+}
+
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ ip_vs_stop_estimator(net, &ipvs->tot_stats);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->sysctl_tbl);
+}
+
+#else
+
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+
+#endif
+
+static struct notifier_block ip_vs_dst_notifier = {
+ .notifier_call = ip_vs_dst_event,
+};
+
+int __net_init ip_vs_control_net_init(struct net *net)
+{
+ int i, idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!ipvs->tot_stats.cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ipvs_tot_stats;
+ ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+ u64_stats_init(&ipvs_tot_stats->syncp);
+ }
+
+ spin_lock_init(&ipvs->tot_stats.lock);
+
+ proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ &ip_vs_stats_percpu_fops);
+
+ if (ip_vs_control_net_init_sysctl(net))
+ goto err;
+
+ return 0;
+
+err:
+ free_percpu(ipvs->tot_stats.cpustats);
+ return -ENOMEM;
+}
+
+void __net_exit ip_vs_control_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_control_net_cleanup_sysctl(net);
+ remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+ remove_proc_entry("ip_vs_stats", net->proc_net);
+ remove_proc_entry("ip_vs", net->proc_net);
+ free_percpu(ipvs->tot_stats.cpustats);
+}
+
+int __init ip_vs_register_nl_ioctl(void)
+{
+ int ret;
+
+ ret = nf_register_sockopt(&ip_vs_sockopts);
+ if (ret) {
+ pr_err("cannot register sockopt.\n");
+ goto err_sock;
+ }
+
+ ret = ip_vs_genl_register();
+ if (ret) {
+ pr_err("cannot register Generic Netlink interface.\n");
+ goto err_genl;
+ }
+ return 0;
+
+err_genl:
+ nf_unregister_sockopt(&ip_vs_sockopts);
+err_sock:
+ return ret;
+}
+
+void ip_vs_unregister_nl_ioctl(void)
+{
+ ip_vs_genl_unregister();
+ nf_unregister_sockopt(&ip_vs_sockopts);
+}
+
+int __init ip_vs_control_init(void)
+{
+ int idx;
+ int ret;
+
+ EnterFunction(2);
+
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+
+ smp_wmb(); /* Do we really need it now ? */
+
+ ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+ if (ret < 0)
+ return ret;
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+ EnterFunction(2);
+ unregister_netdevice_notifier(&ip_vs_dst_notifier);
+ LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000..6be5c538b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,276 @@
+/*
+ * IPVS: Destination Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * Inspired by the consistent hashing scheduler patch from
+ * Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[dest_ip];
+ * if (n is dead) OR
+ * (n is overloaded) OR (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
+
+/*
+ * Returns hash value for IPVS DH entry
+ */
+static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
+{
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
+
+ b = &s->buckets[0];
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_state *s;
+
+ /* allocate the DH table for this service */
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = s;
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
+
+ return 0;
+}
+
+
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(s);
+
+ /* release the table itself */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+}
+
+
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/*
+ * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ * consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_dh_state *s;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph->daddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+ .name = "dh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+ .init_service = ip_vs_dh_init_svc,
+ .done_service = ip_vs_dh_done_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
+ .schedule = ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 000000000..ef0eb0a8d
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,209 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats()) do the per cpu summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+ This code is to estimate rate in a shorter interval (such as 8
+ seconds) for virtual services and real servers. For measure rate in a
+ long interval, it is easy to implement a user level daemon which
+ periodically reads those statistical counters and measure rate.
+
+ Currently, the measurement is activated by slow timer handler. Hope
+ this measurement will not introduce too much load.
+
+ We measure rate during the last 8 seconds every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.
+
+ * Netlink users can see 64-bit values but sockopt users are restricted
+ to 32-bit values for conns, packets, bps, cps and pps.
+
+ * A lot of code is taken from net/core/gen_estimator.c
+ */
+
+
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
+ struct ip_vs_cpu_stats __percpu *stats)
+{
+ int i;
+ bool add = false;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+
+ if (add) {
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ conns = s->cnt.conns;
+ inpkts = s->cnt.inpkts;
+ outpkts = s->cnt.outpkts;
+ inbytes = s->cnt.inbytes;
+ outbytes = s->cnt.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ sum->conns += conns;
+ sum->inpkts += inpkts;
+ sum->outpkts += outpkts;
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else {
+ add = true;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ sum->conns = s->cnt.conns;
+ sum->inpkts = s->cnt.inpkts;
+ sum->outpkts = s->cnt.outpkts;
+ sum->inbytes = s->cnt.inbytes;
+ sum->outbytes = s->cnt.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ }
+ }
+}
+
+
+static void estimation_timer(unsigned long arg)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_stats *s;
+ u64 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_ipvs(net);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
+ s = container_of(e, struct ip_vs_stats, est);
+
+ spin_lock(&s->lock);
+ ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+ /* scaled by 2^10, but divided 2 seconds */
+ rate = (s->kstats.conns - e->last_conns) << 9;
+ e->last_conns = s->kstats.conns;
+ e->cps += ((s64)rate - (s64)e->cps) >> 2;
+
+ rate = (s->kstats.inpkts - e->last_inpkts) << 9;
+ e->last_inpkts = s->kstats.inpkts;
+ e->inpps += ((s64)rate - (s64)e->inpps) >> 2;
+
+ rate = (s->kstats.outpkts - e->last_outpkts) << 9;
+ e->last_outpkts = s->kstats.outpkts;
+ e->outpps += ((s64)rate - (s64)e->outpps) >> 2;
+
+ /* scaled by 2^5, but divided 2 seconds */
+ rate = (s->kstats.inbytes - e->last_inbytes) << 4;
+ e->last_inbytes = s->kstats.inbytes;
+ e->inbps += ((s64)rate - (s64)e->inbps) >> 2;
+
+ rate = (s->kstats.outbytes - e->last_outbytes) << 4;
+ e->last_outbytes = s->kstats.outbytes;
+ e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
+ spin_unlock(&s->lock);
+ }
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+}
+
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ INIT_LIST_HEAD(&est->list);
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_del(&est->list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_kstats *k = &stats->kstats;
+
+ /* reset counters, caller must hold the stats->lock lock */
+ est->last_inbytes = k->inbytes;
+ est->last_outbytes = k->outbytes;
+ est->last_conns = k->conns;
+ est->last_inpkts = k->inpkts;
+ est->last_outpkts = k->outpkts;
+ est->cps = 0;
+ est->inpps = 0;
+ est->outpps = 0;
+ est->inbps = 0;
+ est->outbps = 0;
+}
+
+/* Get decoded rates */
+void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e = &stats->est;
+
+ dst->cps = (e->cps + 0x1FF) >> 10;
+ dst->inpps = (e->inpps + 0x1FF) >> 10;
+ dst->outpps = (e->outpps + 0x1FF) >> 10;
+ dst->inbps = (e->inbps + 0xF) >> 5;
+ dst->outbps = (e->outbps + 0xF) >> 5;
+}
+
+int __net_init ip_vs_estimator_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ return 0;
+}
+
+void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
+{
+ del_timer_sync(&net_ipvs(net)->est_timer);
+}
diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c
new file mode 100644
index 000000000..e09874d02
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_fo.c
@@ -0,0 +1,79 @@
+/*
+ * IPVS: Weighted Fail Over module
+ *
+ * Authors: Kenny Mathis <kmathis@chokepoint.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Kenny Mathis : added initial functionality based on weight
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* Weighted Fail Over Module */
+static struct ip_vs_dest *
+ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *hweight = NULL;
+ int hw = 0; /* Track highest weight */
+
+ IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n");
+
+ /* Basic failover functionality
+ * Find virtual server with highest weight and send it traffic
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > hw) {
+ hweight = dest;
+ hw = atomic_read(&dest->weight);
+ }
+ }
+
+ if (hweight) {
+ IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n",
+ IP_VS_DBG_ADDR(hweight->af, &hweight->addr),
+ ntohs(hweight->port),
+ atomic_read(&hweight->activeconns),
+ atomic_read(&hweight->weight));
+ return hweight;
+ }
+
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+}
+
+static struct ip_vs_scheduler ip_vs_fo_scheduler = {
+ .name = "fo",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list),
+ .schedule = ip_vs_fo_schedule,
+};
+
+static int __init ip_vs_fo_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_fo_scheduler);
+}
+
+static void __exit ip_vs_fo_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_fo_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_fo_init);
+module_exit(ip_vs_fo_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000..5d3daae98
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,503 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/gfp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static unsigned int ports_count = 1;
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, &ports_count, 0444);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/* Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ /* We use connection tracking for the command connection */
+ cp->flags |= IP_VS_CONN_F_NFCT;
+ return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern", ignoring before "skip" and terminated with
+ * the "term" character.
+ * <addr,port> is in network order.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+ const char *pattern, size_t plen,
+ char skip, char term,
+ __be32 *addr, __be16 *port,
+ char **start, char **end)
+{
+ char *s, c;
+ unsigned char p[6];
+ int i = 0;
+
+ if (data_limit - data < plen) {
+ /* check if there is partial match */
+ if (strncasecmp(data, pattern, data_limit - data) == 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ if (strncasecmp(data, pattern, plen) != 0) {
+ return 0;
+ }
+ s = data + plen;
+ if (skip) {
+ int found = 0;
+
+ for (;; s++) {
+ if (s == data_limit)
+ return -1;
+ if (!found) {
+ if (*s == skip)
+ found = 1;
+ } else if (*s != skip) {
+ break;
+ }
+ }
+ }
+
+ for (data = s; ; data++) {
+ if (data == data_limit)
+ return -1;
+ if (*data == term)
+ break;
+ }
+ *end = data;
+
+ memset(p, 0, sizeof(p));
+ for (data = s; ; data++) {
+ c = *data;
+ if (c == term)
+ break;
+ if (c >= '0' && c <= '9') {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
+ i++;
+ } else {
+ /* unexpected character */
+ return -1;
+ }
+ }
+
+ if (i != 5)
+ return -1;
+
+ *start = s;
+ *addr = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
+ return 1;
+}
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port. Mark the current connection entry as a control channel
+ * of the new entry. All this work is just to make the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ char *start, *end;
+ union nf_inet_addr from;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+ unsigned int buf_len;
+ int ret = 0;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct net *net;
+
+ *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (cp->app_data == &ip_vs_ftp_pasv) {
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING,
+ sizeof(SERVER_STRING)-1,
+ '(', ')',
+ &from.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ &from.ip, ntohs(port), &cp->caddr.ip, 0);
+
+ /*
+ * Now update or create an connection entry for it
+ */
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ /* As above, this is ipv4 only */
+ n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Replace the old passive address with the new one
+ */
+ from.ip = n_cp->vaddr.ip;
+ port = n_cp->vport;
+ snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&from.ip)[0],
+ ((unsigned char *)&from.ip)[1],
+ ((unsigned char *)&from.ip)[2],
+ ((unsigned char *)&from.ip)[3],
+ ntohs(port) >> 8,
+ ntohs(port) & 0xFF);
+
+ buf_len = strlen(buf);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
+ */
+ rcu_read_lock();
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ iph->ihl * 4,
+ start-data, end-start,
+ buf, buf_len);
+ rcu_read_unlock();
+ if (ret) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
+ }
+
+ /*
+ * Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ net = skb_net(skb);
+ cp->app_data = NULL;
+ ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_start, *data_limit;
+ char *start, *end;
+ union nf_inet_addr to;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ struct net *net;
+
+ /* no diff required for incoming packets */
+ *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ /*
+ * Detecting whether it is passive
+ */
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /* Since there may be OPTIONS in the TCP packet and the HLEN is
+ the length of the header in 32-bit multiples, it is accurate
+ to calculate data address by th+HLEN*4 */
+ data = data_start = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ while (data <= data_limit - 6) {
+ if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+ /* Passive mode on */
+ IP_VS_DBG(7, "got PASV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = &ip_vs_ftp_pasv;
+ return 1;
+ }
+ data++;
+ }
+
+ /*
+ * To support virtual FTP server, the scenerio is as follows:
+ * FTP client ----> Load Balancer ----> FTP server
+ * First detect the port number in the application data,
+ * then create a new connection entry for the coming data
+ * connection.
+ */
+ if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+ ' ', '\r', &to.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+
+ /* Passive mode off */
+ cp->app_data = NULL;
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
+ ip_vs_proto_name(iph->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
+ n_cp = ip_vs_conn_in_get(&p);
+ if (!n_cp) {
+ /* This is ipv4 only */
+ n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+ htons(ntohs(cp->dport)-1),
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+ }
+
+ /*
+ * Move tunnel to listen state
+ */
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_conn_put(n_cp);
+
+ return 1;
+}
+
+
+static struct ip_vs_app ip_vs_ftp = {
+ .name = "ftp",
+ .type = IP_VS_APP_TYPE_FTP,
+ .protocol = IPPROTO_TCP,
+ .module = THIS_MODULE,
+ .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+ .init_conn = ip_vs_ftp_init_conn,
+ .done_conn = ip_vs_ftp_done_conn,
+ .bind_conn = NULL,
+ .unbind_conn = NULL,
+ .pkt_out = ip_vs_ftp_out,
+ .pkt_in = ip_vs_ftp_in,
+};
+
+/*
+ * per netns ip_vs_ftp initialization
+ */
+static int __net_init __ip_vs_ftp_init(struct net *net)
+{
+ int i, ret;
+ struct ip_vs_app *app;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ app = register_ip_vs_app(net, &ip_vs_ftp);
+ if (IS_ERR(app))
+ return PTR_ERR(app);
+
+ for (i = 0; i < ports_count; i++) {
+ if (!ports[i])
+ continue;
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+ if (ret)
+ goto err_unreg;
+ pr_info("%s: loaded support on port[%d] = %d\n",
+ app->name, i, ports[i]);
+ }
+ return 0;
+
+err_unreg:
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+ return ret;
+}
+/*
+ * netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+
+static int __init ip_vs_ftp_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
+ return rv;
+}
+
+/*
+ * ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000..127f14046
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,633 @@
+/*
+ * IPVS: Locality-Based Least-Connection scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Martin Hamilton : fixed the terrible locking bugs
+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
+ * Wensong Zhang : fixed the uninitialized tbl->lock bug
+ * Wensong Zhang : added doing full expiration check to
+ * collect stale entries of 24+ hours when
+ * no partial expire check in a half hour
+ * Julian Anastasov : replaced del_timer call with del_timer_sync
+ * to avoid the possible race between timer
+ * handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ * if cachenode[dest_ip] is null then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- cachenode[dest_ip];
+ * if (n is dead) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ * return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblc entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+
+/*
+ * for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
+#endif
+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ * IPVS lblc entry represents an association between destination
+ * IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+ struct hlist_node list;
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest *dest; /* real server (cache) */
+ unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
+};
+
+
+/*
+ * IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead;
+};
+
+
+/*
+ * IPVS LBLC sysctl table
+ */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblc_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_lblc_entry *en = container_of(head,
+ struct ip_vs_lblc_entry,
+ rcu_head);
+
+ ip_vs_dest_put_and_free(en->dest);
+ kfree(en);
+}
+
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
+
+/*
+ * Returns hash value for IPVS LBLC entry
+ */
+static inline unsigned int
+ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblc_table.
+ * returns bool success.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+ unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
+
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+}
+
+
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
+ const union nf_inet_addr *addr)
+{
+ unsigned int hash = ip_vs_lblc_hashkey(af, addr);
+ struct ip_vs_lblc_entry *en;
+
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
+ if (ip_vs_addr_equal(af, &en->addr, addr))
+ return en;
+
+ return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under spin lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
+ u16 af, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblc_entry *en;
+
+ en = ip_vs_lblc_get(af, tbl, daddr);
+ if (en) {
+ if (en->dest == dest)
+ return en;
+ ip_vs_lblc_del(en);
+ }
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->af = af;
+ ip_vs_addr_copy(af, &en->addr, daddr);
+ en->lastuse = jiffies;
+
+ ip_vs_dest_hold(dest);
+ en->dest = dest;
+
+ ip_vs_lblc_hash(tbl, en);
+
+ return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+ int i;
+
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ }
+ }
+ spin_unlock_bh(&svc->sched_lock);
+}
+
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblc_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+ unsigned long now = jiffies;
+ int i, j;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now,
+ en->lastuse +
+ sysctl_lblc_expiration(svc)))
+ continue;
+
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock(&svc->sched_lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblc table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblc_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblc_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblc_table for this service
+ */
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+ "current service\n", sizeof(*tbl));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+ tbl->dead = 0;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+ return 0;
+}
+
+
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblc_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest = NULL;
+ struct ip_vs_lblc_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ /* We only hold a read lock, but this is atomic */
+ en->lastuse = jiffies;
+
+ /*
+ * If the destination is not available, i.e. it's in the trash,
+ * we must ignore it, as it may be removed from under our feet,
+ * if someone drops our reference count. Our caller only makes
+ * sure that destinations, that are not in the trash, are not
+ * moved to the trash, while we are scheduling. But anyone can
+ * free up entries from the trash at any time.
+ */
+
+ dest = en->dest;
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
+ }
+
+ /* No cache entry or it is invalid, time to schedule */
+ dest = __ip_vs_lblc_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
+ .name = "lblc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+ .init_service = ip_vs_lblc_init_svc,
+ .done_service = ip_vs_lblc_done_svc,
+ .schedule = ip_vs_lblc_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblc_ctl_table[0].procname = NULL;
+
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+ ipvs->lblc_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
+
+static int __init ip_vs_lblc_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_vs_lblc_ops);
+ if (ret)
+ return ret;
+
+ ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ if (ret)
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ return ret;
+}
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000..2229d2d8b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,818 @@
+/*
+ * IPVS: Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Julian Anastasov : Added the missing (dest->weight>0)
+ * condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ * if serverSet[dest_ip] is null then
+ * n, serverSet[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
+ * if (n is null) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n <- {weighted least-conn node};
+ * add n to serverSet[dest_ip];
+ * if |serverSet[dest_ip]| > 1 AND
+ * now - serverSet[dest_ip].lastMod > T then
+ * m <- {most conn node in serverSet[dest_ip]};
+ * remove m from serverSet[dest_ip];
+ * if serverSet[dest_ip] changed then
+ * serverSet[dest_ip].lastMod <- now;
+ *
+ * return n;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblcr entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+/*
+ * for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
+#endif
+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ * IPVS destination set structure and operations
+ */
+struct ip_vs_dest_set_elem {
+ struct list_head list; /* list link */
+ struct ip_vs_dest *dest; /* destination server */
+ struct rcu_head rcu_head;
+};
+
+struct ip_vs_dest_set {
+ atomic_t size; /* set size */
+ unsigned long lastmod; /* last modified time */
+ struct list_head list; /* destination list */
+};
+
+
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest)
+ return;
+ }
+ }
+
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
+ if (e == NULL)
+ return;
+
+ ip_vs_dest_hold(dest);
+ e->dest = dest;
+
+ list_add_rcu(&e->list, &set->list);
+ atomic_inc(&set->size);
+
+ set->lastmod = jiffies;
+}
+
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put_and_free(e->dest);
+ kfree(e);
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest) {
+ /* HIT */
+ atomic_dec(&set->size);
+ set->lastmod = jiffies;
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ break;
+ }
+ }
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+ struct ip_vs_dest_set_elem *e, *ep;
+
+ list_for_each_entry_safe(e, ep, &set->list, list) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ }
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry_rcu(e, &set->list, list) {
+ least = e->dest;
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if ((atomic_read(&least->weight) > 0)
+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted least load */
+ nextstage:
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
+ dest = e->dest;
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+ return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *most;
+ int moh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry(e, &set->list, list) {
+ most = e->dest;
+ if (atomic_read(&most->weight) > 0) {
+ moh = ip_vs_dest_conn_overhead(most);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted most load */
+ nextstage:
+ list_for_each_entry_continue(e, &set->list, list) {
+ dest = e->dest;
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
+ && (atomic_read(&dest->weight) > 0)) {
+ most = dest;
+ moh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
+ atomic_read(&most->activeconns),
+ atomic_read(&most->refcnt),
+ atomic_read(&most->weight), moh);
+ return most;
+}
+
+
+/*
+ * IPVS lblcr entry represents an association between destination
+ * IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+ struct hlist_node list;
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest_set set; /* destination server set */
+ unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
+};
+
+
+/*
+ * IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead;
+};
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ * IPVS LBLCR sysctl table
+ */
+
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblcr_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ ip_vs_dest_set_eraseall(&en->set);
+ kfree_rcu(en, rcu_head);
+}
+
+
+/*
+ * Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned int
+ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblcr_table.
+ * returns bool success.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+ unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+}
+
+
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
+ const union nf_inet_addr *addr)
+{
+ unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
+ struct ip_vs_lblcr_entry *en;
+
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
+ if (ip_vs_addr_equal(af, &en->addr, addr))
+ return en;
+
+ return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under spin lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
+ u16 af, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = ip_vs_lblcr_get(af, tbl, daddr);
+ if (!en) {
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->af = af;
+ ip_vs_addr_copy(af, &en->addr, daddr);
+ en->lastuse = jiffies;
+
+ /* initialize its dest set */
+ atomic_set(&(en->set.size), 0);
+ INIT_LIST_HEAD(&en->set.list);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
+
+ ip_vs_lblcr_hash(tbl, en);
+ return en;
+ }
+
+ ip_vs_dest_set_insert(&en->set, dest, true);
+
+ return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ int i;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblcr_free(en);
+ }
+ }
+ spin_unlock_bh(&svc->sched_lock);
+}
+
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblcr_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_after(en->lastuse +
+ sysctl_lblcr_expiration(svc), now))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock(&svc->sched_lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblcr table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblcr_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblcr_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblcr_table for this service
+ */
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+ "current service\n", sizeof(*tbl));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+ tbl->dead = 0;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+ return 0;
+}
+
+
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblcr_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblcr_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ en->lastuse = jiffies;
+
+ /* Get the least loaded destination */
+ dest = ip_vs_dest_set_min(&en->set);
+
+ /* More than one destination + enough time passed by, cleanup */
+ if (atomic_read(&en->set.size) > 1 &&
+ time_after(jiffies, en->set.lastmod +
+ sysctl_lblcr_expiration(svc))) {
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
+
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
+ }
+
+ /* If the destination is not overloaded, use it */
+ if (dest && !is_overloaded(dest, svc))
+ goto out;
+
+ /* The cache entry is invalid, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Update our cache entry */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
+ goto out;
+ }
+
+ /* No cache entry, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+ .name = "lblcr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+ .init_service = ip_vs_lblcr_init_svc,
+ .done_service = ip_vs_lblcr_done_svc,
+ .schedule = ip_vs_lblcr_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblcr_ctl_table[0].procname = NULL;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
+
+static int __init ip_vs_lblcr_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+ if (ret)
+ return ret;
+
+ ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ if (ret)
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ return ret;
+}
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000..19a0769a9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,93 @@
+/*
+ * IPVS: Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : added the ip_vs_lc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ unsigned int loh = 0, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * Simply select the server with the least number of
+ * (activeconns<<5) + inactconns
+ * Except whose weight is equal to zero.
+ * If the weight is equal to zero, it means that the server is
+ * quiesced, the existing connections to the server still get
+ * served, but no new connection is assigned to the server.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->weight) == 0)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ if (!least || doh < loh) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least)
+ ip_vs_scheduler_err(svc, "no destination available");
+ else
+ IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
+ "inactconns %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->inactconns));
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+ .name = "lc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+ .schedule = ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000..5882bbfd1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,299 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /* Applications may adjust TCP seqs */
+ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP &&
+ !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct))
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(&p);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000..a8b63401e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,143 @@
+/*
+ * IPVS: Never Queue scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimize its expected delay (The Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ int loh = 0, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+ !atomic_read(&dest->weight))
+ continue;
+
+ doh = ip_vs_nq_dest_overhead(dest);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&dest->activeconns) == 0) {
+ least = dest;
+ loh = doh;
+ goto out;
+ }
+
+ if (!least ||
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ out:
+ IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+ .name = "nq",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+ .schedule = ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 000000000..0df17caa8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,110 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
+
+/* Get pe in the pe list by name */
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
+{
+ struct ip_vs_pe *pe;
+
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
+ pe_name);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
+ /* Test and get the modules atomically */
+ if (pe->module &&
+ !try_module_get(pe->module)) {
+ /* This pe is just deleted */
+ continue;
+ }
+ if (strcmp(pe_name, pe->name)==0) {
+ /* HIT */
+ rcu_read_unlock();
+ return pe;
+ }
+ module_put(pe->module);
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
+{
+ struct ip_vs_pe *pe;
+
+ /* Search for the pe by name */
+ pe = __ip_vs_pe_getbyname(name);
+
+ /* If pe not found, load the module and search again */
+ if (!pe) {
+ request_module("ip_vs_pe_%s", name);
+ pe = __ip_vs_pe_getbyname(name);
+ }
+
+ return pe;
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ struct ip_vs_pe *tmp;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Make sure that the pe with this name doesn't exist
+ * in the pe list.
+ */
+ list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+ if (strcmp(tmp->name, pe->name) == 0) {
+ mutex_unlock(&ip_vs_pe_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already existed "
+ "in the system\n", __func__, pe->name);
+ return -EINVAL;
+ }
+ }
+ /* Add it into the d-linked pe list */
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ pr_info("[%s] pe registered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Remove it from the d-linked pe list */
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] pe unregistered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 000000000..bed5f7042
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,171 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+ const char *callid, size_t callid_len,
+ int *idx)
+{
+ size_t max_len = 64;
+ size_t len = min3(max_len, callid_len, buf_len - *idx - 1);
+ memcpy(buf + *idx, callid, len);
+ buf[*idx+len] = '\0';
+ *idx += len + 1;
+ return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+#endif
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+ unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ /* Find callid */
+ while (1) {
+ int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+ SIP_HDR_CALL_ID, matchoff,
+ matchlen);
+ if (ret > 0)
+ break;
+ if (!ret)
+ return -EINVAL;
+ dataoff += *matchoff;
+ }
+
+ /* Too large is useless */
+ if (*matchlen > IP_VS_PEDATA_MAXLEN)
+ return -EINVAL;
+
+ /* SIP headers are always followed by a line terminator */
+ if (*matchoff + *matchlen == datalen)
+ return -EINVAL;
+
+ /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+ * RFC 3261 allows only CRLF, we support both. */
+ if (*(dptr + *matchoff + *matchlen) != '\r' &&
+ *(dptr + *matchoff + *matchlen) != '\n')
+ return -EINVAL;
+
+ IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+ IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+ *matchlen);
+ return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+ struct ip_vs_iphdr iph;
+ unsigned int dataoff, datalen, matchoff, matchlen;
+ const char *dptr;
+ int retc;
+
+ ip_vs_fill_iph_skb(p->af, skb, &iph);
+
+ /* Only useful with UDP */
+ if (iph.protocol != IPPROTO_UDP)
+ return -EINVAL;
+ /* todo: IPv6 fragments:
+ * I think this only should be done for the first fragment. /HS
+ */
+ dataoff = iph.len + sizeof(struct udphdr);
+
+ if (dataoff >= skb->len)
+ return -EINVAL;
+ retc = skb_linearize(skb);
+ if (retc < 0)
+ return retc;
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+
+ if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+ return -EINVAL;
+
+ /* N.B: pe_data is only set on success,
+ * this allows fallback to the default persistence logic on failure
+ */
+ p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
+ p->pe_data_len = matchlen;
+
+ return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct)
+
+{
+ bool ret = false;
+
+ if (ct->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * d_addr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ p->vaddr, &ct->vaddr) &&
+ ct->vport == p->vport &&
+ ct->flags & IP_VS_CONN_F_TEMPLATE &&
+ ct->protocol == p->protocol &&
+ ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+ !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+ ret = true;
+
+ IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+ u32 initval, bool inverse)
+{
+ return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+ memcpy(buf, cp->pe_data, cp->pe_data_len);
+ return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+ return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000..939f7fbe9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,409 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ * register an ipvs protocol
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp->next = ip_vs_proto_table[hash];
+ ip_vs_proto_table[hash] = pp;
+
+ if (pp->init != NULL)
+ pp->init(pp);
+
+ return 0;
+}
+
+/*
+ * register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd =
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
+
+ if (!pd)
+ return -ENOMEM;
+
+ pd->pp = pp; /* For speed issues */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL) {
+ int ret = pp->init_netns(net, pd);
+ if (ret) {
+ /* unlink an free proto data */
+ ipvs->proto_data_table[hash] = pd->next;
+ kfree(pd);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * unregister an ipvs protocol
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ struct ip_vs_protocol **pp_p;
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp_p = &ip_vs_proto_table[hash];
+ for (; *pp_p; pp_p = &(*pp_p)->next) {
+ if (*pp_p == pp) {
+ *pp_p = pp->next;
+ if (pp->exit != NULL)
+ pp->exit(pp);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+/*
+ * unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) {
+ if (*pd_p == pd) {
+ *pd_p = pd->next;
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+/*
+ * get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+ struct ip_vs_protocol *pp;
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
+
+ for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+ if (pp->protocol == proto)
+ return pp;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(ip_vs_proto_get);
+
+/*
+ * get ip_vs_protocol object data by netns and proto
+ */
+static struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ struct ip_vs_proto_data *pd;
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
+
+ for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+ if (pd->pp->protocol == proto)
+ return pd;
+ }
+
+ return NULL;
+}
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ return __ipvs_proto_data_get(ipvs, proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
+
+/*
+ * Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
+{
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
+ if (pd->pp->timeout_change)
+ pd->pp->timeout_change(pd, flags);
+ }
+ }
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+ return kmemdup(table, size, GFP_KERNEL);
+}
+
+
+/*
+ * Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, const char *const *names,
+ const char *name, int to)
+{
+ int i;
+
+ if (!table || !name || !to)
+ return -EINVAL;
+
+ for (i = 0; i < num; i++) {
+ if (strcmp(names[i], name))
+ continue;
+ table[i] = to * HZ;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ if (pp == NULL || pp->state_name == NULL)
+ return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
+ return pp->state_name(state);
+}
+
+
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[128];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->frag_off & htons(IP_OFFSET))
+ sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI4->%pI4",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI4:%u->%pI4:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[192];
+ struct ipv6hdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->nexthdr == IPPROTO_FRAGMENT)
+ sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI6c->%pI6c",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI6c:%u->%pI6c:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+#endif
+
+
+void
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+ else
+#endif
+ ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+/*
+ * per network name-space init
+ */
+int __net_init ip_vs_protocol_net_init(struct net *net)
+{
+ int i, ret;
+ static struct ip_vs_protocol *protos[] = {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ &ip_vs_protocol_tcp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ &ip_vs_protocol_udp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ &ip_vs_protocol_sctp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ &ip_vs_protocol_ah,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ &ip_vs_protocol_esp,
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(protos); i++) {
+ ret = register_ip_vs_proto_netns(net, protos[i]);
+ if (ret < 0)
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ ip_vs_protocol_net_cleanup(net);
+ return ret;
+}
+
+void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
+
+int __init ip_vs_protocol_init(void)
+{
+ char protocols[64];
+#define REGISTER_PROTOCOL(p) \
+ do { \
+ register_ip_vs_protocol(p); \
+ strcat(protocols, ", "); \
+ strcat(protocols, (p)->name); \
+ } while (0)
+
+ protocols[0] = '\0';
+ protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+ pr_info("Registered protocols (%s)\n", &protocols[2]);
+
+ return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ /* unregister all the ipvs protocols */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pp = ip_vs_proto_table[i]) != NULL)
+ unregister_ip_vs_protocol(pp);
+ }
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 000000000..5de3dd312
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,165 @@
+/*
+ * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
+static void
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p)
+{
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ &iph->saddr, htons(PORT_ISAKMP),
+ &iph->daddr, htons(PORT_ISAKMP), p);
+ else
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ &iph->daddr, htons(PORT_ISAKMP),
+ &iph->saddr, htons(PORT_ISAKMP), p);
+}
+
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ int inverse)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
+
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ cp = ip_vs_conn_in_get(&p);
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %s->%s\n",
+ inverse ? "ICMP+" : "",
+ ip_vs_proto_get(iph->protocol)->name,
+ IP_VS_DBG_ADDR(af, &iph->saddr),
+ IP_VS_DBG_ADDR(af, &iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
+
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (!cp) {
+ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %s->%s\n",
+ inverse ? "ICMP+" : "",
+ ip_vs_proto_get(iph->protocol)->name,
+ IP_VS_DBG_ADDR(af, &iph->saddr),
+ IP_VS_DBG_ADDR(af, &iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ /*
+ * AH/ESP is only related traffic. Pass the packet to IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+struct ip_vs_protocol ip_vs_protocol_ah = {
+ .name = "AH",
+ .protocol = IPPROTO_AH,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+struct ip_vs_protocol ip_vs_protocol_esp = {
+ .name = "ESP",
+ .protocol = IPPROTO_ESP,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
new file mode 100644
index 000000000..5b84c0b56
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -0,0 +1,591 @@
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/sctp/checksum.h>
+#include <net/ip_vs.h>
+
+static int
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs;
+ sctp_chunkhdr_t _schunkh, *sch;
+ sctp_sctphdr_t *sh, _sctph;
+
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(_schunkh), &_schunkh);
+ if (sch == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ rcu_read_lock();
+ if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
+ unsigned int sctphoff)
+{
+ sctph->checksum = sctp_compute_cksum(skb, sctphoff);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
+static int
+sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+static int
+sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+static int
+sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int sctphoff;
+ struct sctphdr *sh, _sctph;
+ __le32 cmp, val;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ sctphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ sctphoff = ip_hdrlen(skb);
+
+ sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return 0;
+
+ cmp = sh->checksum;
+ val = sctp_compute_cksum(skb, sctphoff);
+
+ if (val != cmp) {
+ /* CRC failure, dump it. */
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ return 1;
+}
+
+enum ipvs_sctp_event_t {
+ IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */
+ IP_VS_SCTP_INIT,
+ IP_VS_SCTP_INIT_ACK,
+ IP_VS_SCTP_COOKIE_ECHO,
+ IP_VS_SCTP_COOKIE_ACK,
+ IP_VS_SCTP_SHUTDOWN,
+ IP_VS_SCTP_SHUTDOWN_ACK,
+ IP_VS_SCTP_SHUTDOWN_COMPLETE,
+ IP_VS_SCTP_ERROR,
+ IP_VS_SCTP_ABORT,
+ IP_VS_SCTP_EVENT_LAST
+};
+
+/* RFC 2960, 3.2 Chunk Field Descriptions */
+static __u8 sctp_events[] = {
+ [SCTP_CID_DATA] = IP_VS_SCTP_DATA,
+ [SCTP_CID_INIT] = IP_VS_SCTP_INIT,
+ [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK,
+ [SCTP_CID_SACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT,
+ [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN,
+ [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK,
+ [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR,
+ [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO,
+ [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK,
+ [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA,
+ [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE,
+};
+
+/* SCTP States:
+ * See RFC 2960, 4. SCTP Association State Diagram
+ *
+ * New states (not in diagram):
+ * - INIT1 state: use shorter timeout for dropped INIT packets
+ * - REJECTED state: use shorter timeout if INIT is rejected with ABORT
+ * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging
+ *
+ * The states are as seen in real server. In the diagram, INIT1, INIT,
+ * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state.
+ *
+ * States as per packets from client (C) and server (S):
+ *
+ * Setup of client connection:
+ * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK
+ *
+ * Setup of server connection:
+ * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK
+ */
+
+#define sNO IP_VS_SCTP_S_NONE
+#define sI1 IP_VS_SCTP_S_INIT1
+#define sIN IP_VS_SCTP_S_INIT
+#define sCS IP_VS_SCTP_S_COOKIE_SENT
+#define sCR IP_VS_SCTP_S_COOKIE_REPLIED
+#define sCW IP_VS_SCTP_S_COOKIE_WAIT
+#define sCO IP_VS_SCTP_S_COOKIE
+#define sCE IP_VS_SCTP_S_COOKIE_ECHOED
+#define sES IP_VS_SCTP_S_ESTABLISHED
+#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT
+#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED
+#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
+#define sRJ IP_VS_SCTP_S_REJECTED
+#define sCL IP_VS_SCTP_S_CLOSED
+
+static const __u8 sctp_states
+ [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = {
+ { /* INPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* OUTPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW},
+/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL},
+/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* INPUT-ONLY */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+};
+
+#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ)
+
+/* Timeout table[state] */
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = 2 * HZ,
+ [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_LAST] = 2 * HZ,
+};
+
+static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = "NONE",
+ [IP_VS_SCTP_S_INIT1] = "INIT1",
+ [IP_VS_SCTP_S_INIT] = "INIT",
+ [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT",
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED",
+ [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT",
+ [IP_VS_SCTP_S_COOKIE] = "COOKIE",
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED",
+ [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT",
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED",
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT",
+ [IP_VS_SCTP_S_REJECTED] = "REJECTED",
+ [IP_VS_SCTP_S_CLOSED] = "CLOSED",
+ [IP_VS_SCTP_S_LAST] = "BUG!",
+};
+
+
+static const char *sctp_state_name(int state)
+{
+ if (state >= IP_VS_SCTP_S_LAST)
+ return "ERR!";
+ if (sctp_state_name_table[state])
+ return sctp_state_name_table[state];
+ return "?";
+}
+
+static inline void
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+ int direction, const struct sk_buff *skb)
+{
+ sctp_chunkhdr_t _sctpch, *sch;
+ unsigned char chunk_type;
+ int event, next_state;
+ int ihl, cofs;
+
+#ifdef CONFIG_IP_VS_IPV6
+ ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ ihl = ip_hdrlen(skb);
+#endif
+
+ cofs = ihl + sizeof(sctp_sctphdr_t);
+ sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
+ if (sch == NULL)
+ return;
+
+ chunk_type = sch->type;
+ /*
+ * Section 3: Multiple chunks can be bundled into one SCTP packet
+ * up to the MTU size, except for the INIT, INIT ACK, and
+ * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with
+ * any other chunk in a packet.
+ *
+ * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control
+ * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be
+ * bundled with an ABORT, but they MUST be placed before the ABORT
+ * in the SCTP packet or they will be ignored by the receiver.
+ */
+ if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
+ (sch->type == SCTP_CID_COOKIE_ACK)) {
+ int clen = ntohs(sch->length);
+
+ if (clen >= sizeof(sctp_chunkhdr_t)) {
+ sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+ sizeof(_sctpch), &_sctpch);
+ if (sch && sch->type == SCTP_CID_ABORT)
+ chunk_type = sch->type;
+ }
+ }
+
+ event = (chunk_type < sizeof(sctp_events)) ?
+ sctp_events[chunk_type] : IP_VS_SCTP_DATA;
+
+ /* Update direction to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (direction == IP_VS_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ direction = IP_VS_DIR_INPUT_ONLY;
+ }
+
+ next_state = sctp_states[direction][event][cp->state];
+
+ if (next_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG_BUF(8, "%s %s %s:%d->"
+ "%s:%d state: %s->%s conn->refcnt:%d\n",
+ pd->pp->name,
+ ((direction == IP_VS_DIR_OUTPUT) ?
+ "output " : "input "),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ sctp_state_name(cp->state),
+ sctp_state_name(next_state),
+ atomic_read(&cp->refcnt));
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (next_state != IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (next_state == IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+ else /* What to do ? */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
+}
+
+static void
+sctp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
+{
+ spin_lock_bh(&cp->lock);
+ set_sctp_state(pd, cp, direction, skb);
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline __u16 sctp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
+ & SCTP_APP_TAB_MASK;
+}
+
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ hash = sctp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+out:
+
+ return ret;
+}
+
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+static int sctp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+ /* Lookup application incarnations and bind the right one */
+ hash = sctp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+out:
+ return result;
+}
+
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+struct ip_vs_protocol ip_vs_protocol_sctp = {
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
+ .unregister_app = sctp_unregister_app,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
+ .state_transition = sctp_state_transition,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000..8e92beb0c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,712 @@
+/*
+ * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * tcp_timeouts table has copy per netns in a hash table per
+ * protocol ip_vs_proto_data and is handled by netns
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct tcphdr _tcph, *th;
+ struct netns_ipvs *ipvs;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+ rcu_read_lock();
+ if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcph->check =
+ csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(tcph->check))));
+ else
+#endif
+ tcph->check =
+ csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(tcph->check))));
+}
+
+
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcph->check =
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(tcph->check))));
+ else
+#endif
+ tcph->check =
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(tcph->check))));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->source = cp->vport;
+
+ /* Adjust TCP checksums */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, tcph->check,
+ (char*)&(tcph->check) - (char*)tcph);
+ }
+ return 1;
+}
+
+
+static int
+tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->dest = cp->dport;
+
+ /*
+ * Adjust TCP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ tcphoff = ip_hdrlen(skb);
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+
+ return 1;
+}
+
+
+#define TCP_DIR_INPUT 0
+#define TCP_DIR_OUTPUT 4
+#define TCP_DIR_INPUT_ONLY 8
+
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+ [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
+ [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
+ [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ * Timeout table[state]
+ */
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 120*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = "NONE",
+ [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
+ [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
+ [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
+ [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
+ [IP_VS_TCP_S_CLOSE] = "CLOSE",
+ [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
+ [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
+ [IP_VS_TCP_S_LISTEN] = "LISTEN",
+ [IP_VS_TCP_S_SYNACK] = "SYNACK",
+ [IP_VS_TCP_S_LAST] = "BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+ int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+ if (state >= IP_VS_TCP_S_LAST)
+ return "ERR!";
+ return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+static struct tcp_states_t tcp_states [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
+{
+ int on = (flags & 1); /* secure_tcp */
+
+ /*
+ ** FIXME: change secure_tcp to independent sysctl var
+ ** or make it per-service or per-app because it is valid
+ ** for most if not for all of the applications. Something
+ ** like "capabilities" (flags) for each object.
+ */
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+ if (th->rst)
+ return 3;
+ if (th->syn)
+ return 0;
+ if (th->fin)
+ return 1;
+ if (th->ack)
+ return 2;
+ return -1;
+}
+
+static inline void
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+ int direction, struct tcphdr *th)
+{
+ int state_idx;
+ int new_state = IP_VS_TCP_S_CLOSE;
+ int state_off = tcp_state_off[direction];
+
+ /*
+ * Update state offset to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (state_off == TCP_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ state_off = TCP_DIR_INPUT_ONLY;
+ }
+
+ if ((state_idx = tcp_state_idx(th)) < 0) {
+ IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+ tcp_state_out:
+ if (new_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+ "%s:%d state: %s->%s conn->refcnt:%d\n",
+ pd->pp->name,
+ ((state_off == TCP_DIR_OUTPUT) ?
+ "output " : "input "),
+ th->syn ? 'S' : '.',
+ th->fin ? 'F' : '.',
+ th->ack ? 'A' : '.',
+ th->rst ? 'R' : '.',
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ tcp_state_name(cp->state),
+ tcp_state_name(new_state),
+ atomic_read(&cp->refcnt));
+
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
+}
+
+/*
+ * Handle state transitions
+ */
+static void
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ struct tcphdr _tcph, *th;
+
+#ifdef CONFIG_IP_VS_IPV6
+ int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ int ihl = ip_hdrlen(skb);
+#endif
+
+ th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return;
+
+ spin_lock_bh(&cp->lock);
+ set_tcp_state(pd, cp, direction, th);
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
+ & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ hash = tcp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+
+ out:
+ return ret;
+}
+
+
+static void
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = tcp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
+ out:
+ return result;
+}
+
+
+/*
+ * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&cp->lock);
+ cp->state = IP_VS_TCP_S_LISTEN;
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
+ spin_unlock_bh(&cp->lock);
+}
+
+/* ---------------------------------------------
+ * timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ pd->tcp_state_table = tcp_states;
+ return 0;
+}
+
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+ .name = "TCP",
+ .protocol = IPPROTO_TCP,
+ .num_states = IP_VS_TCP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
+ .register_app = tcp_register_app,
+ .unregister_app = tcp_unregister_app,
+ .conn_schedule = tcp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = tcp_snat_handler,
+ .dnat_handler = tcp_dnat_handler,
+ .csum_check = tcp_csum_check,
+ .state_name = tcp_state_name,
+ .state_transition = tcp_state_transition,
+ .app_conn_bind = tcp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = tcp_timeout_change,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000..b62a3c0ff
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,499 @@
+/*
+ * ip_vs_proto_udp.c: UDP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct udphdr _udph, *uh;
+
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+ net = skb_net(skb);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
+ if (svc) {
+ int ignored;
+
+ if (ip_vs_todrop(net_ipvs(net))) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ uhdr->check =
+ csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(uhdr->check))));
+ else
+#endif
+ uhdr->check =
+ csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(uhdr->check))));
+ if (!uhdr->check)
+ uhdr->check = CSUM_MANGLED_0;
+}
+
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ uhdr->check =
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(uhdr->check))));
+ else
+#endif
+ uhdr->check =
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(uhdr->check))));
+}
+
+
+static int
+udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Call application helper if needed
+ */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->source = cp->vport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, udph->check,
+ (char*)&(udph->check) - (char*)udph);
+ }
+ return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->dest = cp->dport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ struct udphdr _udph, *uh;
+ unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ udphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ udphoff = ip_hdrlen(skb);
+
+ uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ return 0;
+
+ if (uh->check != 0) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, udphoff,
+ skb->len - udphoff, 0);
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+ }
+ return 1;
+}
+
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
+ & UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ hash = udp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+
+ out:
+ return ret;
+}
+
+
+static void
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = udp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
+ out:
+ return result;
+}
+
+
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
+ [IP_VS_UDP_S_LAST] = 2*HZ,
+};
+
+static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!",
+};
+
+static const char * udp_state_name(int state)
+{
+ if (state >= IP_VS_UDP_S_LAST)
+ return "ERR!";
+ return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static void
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+}
+
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+ .name = "UDP",
+ .protocol = IPPROTO_UDP,
+ .num_states = IP_VS_UDP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
+ .conn_schedule = udp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = udp_snat_handler,
+ .dnat_handler = udp_dnat_handler,
+ .csum_check = udp_csum_check,
+ .state_transition = udp_state_transition,
+ .state_name = udp_state_name,
+ .register_app = udp_register_app,
+ .unregister_app = udp_unregister_app,
+ .app_conn_bind = udp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000..58bacfc46
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,130 @@
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ do {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
+ }
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ out:
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
+ IP_VS_DBG_BUF(6, "RR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+ .name = "rr", /* name */
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+ .init_service = ip_vs_rr_init_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
+ .schedule = ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000..199760c71
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,254 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(ip_vs_scheduler_err);
+/*
+ * IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* semaphore for schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
+
+
+/*
+ * Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *scheduler)
+{
+ int ret;
+
+ if (scheduler->init_service) {
+ ret = scheduler->init_service(svc);
+ if (ret) {
+ pr_err("%s(): init error\n", __func__);
+ return ret;
+ }
+ }
+ rcu_assign_pointer(svc->scheduler, scheduler);
+ return 0;
+}
+
+
+/*
+ * Unbind a service with its scheduler
+ */
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
+{
+ struct ip_vs_scheduler *cur_sched;
+
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
+
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
+}
+
+
+/*
+ * Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
+
+ mutex_lock(&ip_vs_sched_mutex);
+
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ /*
+ * Test and get the modules atomically
+ */
+ if (sched->module && !try_module_get(sched->module)) {
+ /*
+ * This scheduler is just deleted
+ */
+ continue;
+ }
+ if (strcmp(sched_name, sched->name)==0) {
+ /* HIT */
+ mutex_unlock(&ip_vs_sched_mutex);
+ return sched;
+ }
+ module_put(sched->module);
+ }
+
+ mutex_unlock(&ip_vs_sched_mutex);
+ return NULL;
+}
+
+
+/*
+ * Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * Search for the scheduler by sched_name
+ */
+ sched = ip_vs_sched_getbyname(sched_name);
+
+ /*
+ * If scheduler not found, load the module and search again
+ */
+ if (sched == NULL) {
+ request_module("ip_vs_%s", sched_name);
+ sched = ip_vs_sched_getbyname(sched_name);
+ }
+
+ return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+ if (scheduler && scheduler->module)
+ module_put(scheduler->module);
+}
+
+/*
+ * Common error output helper for schedulers
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ sched->name, svc->fwmark, svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
+
+/*
+ * Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (!scheduler) {
+ pr_err("%s(): NULL arg\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!scheduler->name) {
+ pr_err("%s(): NULL scheduler_name\n", __func__);
+ return -EINVAL;
+ }
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_sched_mutex);
+
+ if (!list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already linked\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure that the scheduler with this name doesn't exist
+ * in the scheduler list.
+ */
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ if (strcmp(scheduler->name, sched->name) == 0) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already existed "
+ "in the system\n", __func__, scheduler->name);
+ return -EINVAL;
+ }
+ }
+ /*
+ * Add it into the d-linked scheduler list
+ */
+ list_add(&scheduler->n_list, &ip_vs_schedulers);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ pr_info("[%s] scheduler registered.\n", scheduler->name);
+
+ return 0;
+}
+
+
+/*
+ * Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ if (!scheduler) {
+ pr_err("%s(): NULL arg\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&ip_vs_sched_mutex);
+ if (list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ pr_err("%s(): [%s] scheduler is not in the list. failed\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Remove it from the d-linked scheduler list
+ */
+ list_del(&scheduler->n_list);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] scheduler unregistered.\n", scheduler->name);
+
+ return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000..f8e2d00f5
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,144 @@
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "SED: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+ .schedule = ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000..98a13433b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,385 @@
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ * The weight destination attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
+
+/*
+ * IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/*
+ * Returns hash value for IPVS SH entry
+ */
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, unsigned int offset)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
+ dest = rcu_dereference(s->buckets[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ int d_count;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ d_count = 0;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+ i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ atomic_read(&dest->weight));
+
+ /* Don't move to next dest until filling weight */
+ if (++d_count >= atomic_read(&dest->weight)) {
+ p = p->next;
+ d_count = 0;
+ }
+
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
+
+ b = &s->buckets[0];
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_state *s;
+
+ /* allocate the SH table for this service */
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = s;
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
+
+ return 0;
+}
+
+
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(s);
+
+ /* release the table itself */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+}
+
+
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+ __be16 port;
+ struct tcphdr _tcph, *th;
+ struct udphdr _udph, *uh;
+ sctp_sctphdr_t _sctph, *sh;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (unlikely(th == NULL))
+ return 0;
+ port = th->source;
+ break;
+ case IPPROTO_UDP:
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (unlikely(uh == NULL))
+ return 0;
+ port = uh->source;
+ break;
+ case IPPROTO_SCTP:
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (unlikely(sh == NULL))
+ return 0;
+ port = sh->source;
+ break;
+ default:
+ port = 0;
+ }
+
+ return port;
+}
+
+
+/*
+ * Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_sh_state *s;
+ __be16 port = 0;
+
+ IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+ port = ip_vs_sh_get_port(skb, iph);
+
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+ dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ else
+ dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+ .name = "sh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+ .init_service = ip_vs_sh_init_svc,
+ .done_service = ip_vs_sh_done_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
+ .schedule = ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000..19b9cce6c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,1975 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version 1, is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handle IPv6, Persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ * Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
+
+static struct lock_class_key __ipvs_sync_key;
+/*
+ * IPVS sync connection entry
+ * Version 0, i.e. original version.
+ */
+struct ip_vs_sync_conn_v0 {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __be16 flags; /* status flags */
+ __be16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
+struct ip_vs_sync_thread_data {
+ struct net *net;
+ struct socket *sock;
+ char *buf;
+ int id;
+};
+
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+ The master mulitcasts messages (Datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ ~ . ~
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
+*/
+
+#define SYNC_MESG_HEADER_LEN 4
+#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
+ __u8 nr_conns;
+ __u8 syncid;
+ __be16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
+
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __be16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
+
+struct ip_vs_sync_buff {
+ struct list_head list;
+ unsigned long firstuse;
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg;
+ unsigned char *head;
+ unsigned char *end;
+};
+
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ms->sync_queue)) {
+ sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ } else {
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
+ list);
+ list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+
+ return sb;
+}
+
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
+ sb->mesg->nr_conns = 0;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+ kfree(sb->mesg);
+ kfree(sb);
+}
+
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
+ ip_vs_sync_buff_release(sb);
+ spin_unlock(&ipvs->sync_lock);
+}
+
+/*
+ * Get the current sync buffer if it has been created for more
+ * than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
+ } else
+ sb = NULL;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return sb;
+}
+
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
+{
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
+}
+
+/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+/* Check if connection is controlled by persistence */
+static inline bool in_persistence(struct ip_vs_conn *cp)
+{
+ for (cp = cp->control; cp; cp = cp->control) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ return true;
+ }
+ return false;
+}
+
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
+ return 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
+ (1 << IP_VS_SCTP_S_CLOSED))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
+/*
+ * Version 0 , could be switched in by sys_ctl.
+ * Add an ip_vs_conn information into the current sync_buff.
+ */
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ int len;
+
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ }
+
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
+
+ /* copy members */
+ s->reserved = 0;
+ s->protocol = cp->protocol;
+ s->cport = cp->cport;
+ s->vport = cp->vport;
+ s->dport = cp->dport;
+ s->caddr = cp->caddr.ip;
+ s->vaddr = cp->vaddr.ip;
+ s->daddr = cp->daddr.ip;
+ s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->state = htons(cp->state);
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ struct ip_vs_sync_conn_options *opt =
+ (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(opt, &cp->in_seq, sizeof(*opt));
+ }
+
+ m->nr_conns++;
+ m->size = htons(ntohs(m->size) + len);
+ buff->head += len;
+
+ /* check if there is a space for next one */
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (sysctl_sync_ver(ipvs) == 0) {
+ ip_vs_sync_conn_v0(net, cp, pkts);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ m = buff->mesg;
+ }
+
+ p = buff->head;
+ buff->head += pad + len;
+ m->size = htons(ntohs(m->size) + pad + len);
+ /* Add ev. padding from prev. sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ s->v6.caddr = cp->caddr.in6;
+ s->v6.vaddr = cp->vaddr.in6;
+ s->v6.daddr = cp->daddr.in6;
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ goto sloop;
+}
+
+/*
+ * fill_param used by version 1
+ */
+static inline int
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ p->pe_data_len = pe_data_len;
+ }
+ return 0;
+}
+
+/*
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
+ */
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ cp = ip_vs_conn_in_get(param);
+ if (cp && ((cp->dport != dport) ||
+ !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
+ if (!(flags & IP_VS_CONN_F_INACTIVE)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ } else {
+ /* This is the expiration message for the
+ * connection that was already replaced, so we
+ * just ignore it.
+ */
+ __ip_vs_conn_put(cp);
+ kfree(param->pe_data);
+ return;
+ }
+ }
+ } else {
+ cp = ip_vs_ct_in_get(param);
+ }
+
+ if (cp) {
+ /* Free pe_data */
+ kfree(param->pe_data);
+
+ dest = cp->dest;
+ spin_lock_bh(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
+ } else {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ rcu_read_lock();
+ /* This function is only invoked by the synchronization
+ * code. We do not currently support heterogeneous pools
+ * with synchronization, so we can make the assumption that
+ * the svc_af is the same as the dest_af
+ */
+ dest = ip_vs_find_dest(net, type, type, daddr, dport,
+ param->vaddr, param->vport, protocol,
+ fwmark, flags);
+
+ cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
+ fwmark);
+ rcu_read_unlock();
+ if (!cp) {
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ kfree(param->pe_data);
+ }
+
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 messages style
+ * - Not possible to recover the right timeout for templates
+ * - can not find the right fwmark
+ * virtual service. If needed, we can do it for
+ * non-fwmark persistent services.
+ * Ver 1 messages style.
+ * - No problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
+ }
+ ip_vs_conn_put(cp);
+}
+
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
+ for (i=0; i<m->nr_conns; i++) {
+ unsigned int flags, state;
+
+ if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
+ return;
+ }
+ s = (struct ip_vs_sync_conn_v0 *) p;
+ flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+ flags &= ~IP_VS_CONN_F_HASHED;
+ if (flags & IP_VS_CONN_F_SEQ_MASK) {
+ opt = (struct ip_vs_sync_conn_options *)&s[1];
+ p += FULL_CONN_SIZE;
+ if (p > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
+ return;
+ }
+ } else {
+ opt = NULL;
+ p += SIMPLE_CONN_SIZE;
+ }
+
+ state = ntohs(s->state);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->protocol);
+ if (!pp) {
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
+ s->protocol);
+ continue;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
+ pp->name, state);
+ continue;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
+ }
+ }
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4, just silent skip IPv6 */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ ip_vs_pe_put(param.pe);
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+
+ if (buflen != ntohs(m2->size)) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned int size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ return;
+ }
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+ return;
+ }
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
+ }
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
+ }
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
+ }
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
+ }
+}
+
+
+/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
+ * Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+ lock_sock(sk);
+ inet->mc_loop = loop ? 1 : 0;
+ release_sock(sk);
+}
+
+/*
+ * Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+ lock_sock(sk);
+ inet->mc_ttl = ttl;
+ release_sock(sk);
+}
+
+/*
+ * Specifiy default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+ struct net_device *dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet->mc_index = dev->ifindex;
+ /* inet->mc_addr = 0; */
+ release_sock(sk);
+
+ return 0;
+}
+
+
+/*
+ * Set the maximum length of sync message according to the
+ * specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net_device *dev;
+ int num;
+
+ if (sync_state == IP_VS_STATE_MASTER) {
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ num = (dev->mtu - sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+ IP_VS_DBG(7, "setting the maximum length of sync sending "
+ "message %d.\n", ipvs->send_mesg_maxlen);
+ } else if (sync_state == IP_VS_STATE_BACKUP) {
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ ipvs->recv_mesg_maxlen = dev->mtu -
+ sizeof(struct iphdr) - sizeof(struct udphdr);
+ IP_VS_DBG(7, "setting the maximum length of sync receiving "
+ "message %d.\n", ipvs->recv_mesg_maxlen);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Join a multicast group.
+ * the group is specified by a class D multicast address 224.0.0.0/8
+ * in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+ struct net *net = sock_net(sk);
+ struct ip_mreqn mreq;
+ struct net_device *dev;
+ int ret;
+
+ memset(&mreq, 0, sizeof(mreq));
+ memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ mreq.imr_ifindex = dev->ifindex;
+
+ rtnl_lock();
+ lock_sock(sk);
+ ret = ip_mc_join_group(sk, &mreq);
+ release_sock(sk);
+ rtnl_unlock();
+
+ return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+ struct net *net = sock_net(sock->sk);
+ struct net_device *dev;
+ __be32 addr;
+ struct sockaddr_in sin;
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+
+ addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ if (!addr)
+ pr_err("You probably need to specify IP address on "
+ "multicast interface.\n");
+
+ IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
+ ifname, &addr);
+
+ /* Now bind the socket with the address of multicast interface */
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+ sin.sin_port = 0;
+
+ return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ * Set up sending multicast socket over UDP
+ */
+static struct socket *make_send_sock(struct net *net, int id)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
+ struct socket *sock;
+ int result;
+
+ /* First create a socket move it to right name space later */
+ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (result < 0) {
+ pr_err("Error during creation of socket; terminating\n");
+ return ERR_PTR(result);
+ }
+ /*
+ * Kernel sockets that are a part of a namespace, should not
+ * hold a reference to a namespace in order to allow to stop it.
+ * After sk_change_net should be released using sk_release_kernel.
+ */
+ sk_change_net(sock->sk, net);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error setting outbound mcast interface\n");
+ goto error;
+ }
+
+ set_mcast_loop(sock->sk, 0);
+ set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
+
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error binding address of the mcast interface\n");
+ goto error;
+ }
+
+ result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+ sizeof(struct sockaddr), 0);
+ if (result < 0) {
+ pr_err("Error connecting to the multicast addr\n");
+ goto error;
+ }
+
+ return sock;
+
+error:
+ sk_release_kernel(sock->sk);
+ return ERR_PTR(result);
+}
+
+
+/*
+ * Set up receiving multicast socket over UDP
+ */
+static struct socket *make_receive_sock(struct net *net, int id)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
+ struct socket *sock;
+ int result;
+
+ /* First create a socket */
+ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (result < 0) {
+ pr_err("Error during creation of socket; terminating\n");
+ return ERR_PTR(result);
+ }
+ /*
+ * Kernel sockets that are a part of a namespace, should not
+ * hold a reference to a namespace in order to allow to stop it.
+ * After sk_change_net should be released using sk_release_kernel.
+ */
+ sk_change_net(sock->sk, net);
+ /* it is equivalent to the REUSEADDR option in user-space */
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
+
+ result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+ sizeof(struct sockaddr));
+ if (result < 0) {
+ pr_err("Error binding to the multicast addr\n");
+ goto error;
+ }
+
+ /* join the multicast group */
+ result = join_mcast_group(sock->sk,
+ (struct in_addr *) &mcast_addr.sin_addr,
+ ipvs->backup_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error joining to the multicast group\n");
+ goto error;
+ }
+
+ return sock;
+
+error:
+ sk_release_kernel(sock->sk);
+ return ERR_PTR(result);
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+ struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+ iov.iov_base = (void *)buffer;
+ iov.iov_len = length;
+
+ len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+ LeaveFunction(7);
+ return len;
+}
+
+static int
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+ int msize;
+ int ret;
+
+ msize = ntohs(msg->size);
+
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
+}
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+ struct msghdr msg = {NULL,};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+
+ /* Receive a packet */
+ iov.iov_base = buffer;
+ iov.iov_len = (size_t)buflen;
+
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
+
+ if (len < 0)
+ return len;
+
+ LeaveFunction(7);
+ return len;
+}
+
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
+
+static int sync_thread_master(void *data)
+{
+ struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
+ struct ip_vs_sync_buff *sb;
+
+ pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
+ }
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ /* (Ab)use interruptible sleep to avoid increasing
+ * the load avg.
+ */
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop());
+ if (unlikely(kthread_should_stop()))
+ goto done;
+ }
+ ip_vs_sync_buff_release(sb);
+ }
+
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* clean up the sync_buff queue */
+ while ((sb = sb_dequeue(ipvs, ms)))
+ ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
+
+ /* clean up the current sync_buff */
+ sb = get_curr_sync_buff(ipvs, ms, 0);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* release the sending multicast socket */
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo);
+
+ return 0;
+}
+
+
+static int sync_thread_backup(void *data)
+{
+ struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ int len;
+
+ pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
+ !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
+ || kthread_should_stop());
+
+ /* do we have data now? */
+ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+ len = ip_vs_receive(tinfo->sock, tinfo->buf,
+ ipvs->recv_mesg_maxlen);
+ if (len <= 0) {
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
+ break;
+ }
+
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
+ }
+ }
+
+ /* release the sending multicast socket */
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+
+ return 0;
+}
+
+
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+{
+ struct ip_vs_sync_thread_data *tinfo;
+ struct task_struct **array = NULL, *task;
+ struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ char *name;
+ int (*threadfn)(void *data);
+ int id, count;
+ int result = -ENOMEM;
+
+ IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+ IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+ sizeof(struct ip_vs_sync_conn_v0));
+
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
+
+ if (state == IP_VS_STATE_MASTER) {
+ if (ipvs->ms)
+ return -EEXIST;
+
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ name = "ipvs-m:%d:%d";
+ threadfn = sync_thread_master;
+ } else if (state == IP_VS_STATE_BACKUP) {
+ if (ipvs->backup_threads)
+ return -EEXIST;
+
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ name = "ipvs-b:%d:%d";
+ threadfn = sync_thread_backup;
+ } else {
+ return -EINVAL;
+ }
+
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
+
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
+ }
+ set_sync_mesg_maxlen(net, state);
+
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ } else {
+ tinfo->buf = NULL;
+ }
+ tinfo->id = id;
+
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
+ }
+
+ /* mark as active */
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ return 0;
+
+outsocket:
+ sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
+out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
+ return result;
+}
+
+
+int stop_sync_thread(struct net *net, int state)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
+ int retc = -EINVAL;
+
+ IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+
+ if (state == IP_VS_STATE_MASTER) {
+ if (!ipvs->ms)
+ return -ESRCH;
+
+ /*
+ * The lock synchronizes with sb_queue_tail(), so that we don't
+ * add sync buffers to the queue, when we are already in
+ * progress of stopping the master sync daemon.
+ */
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ } else if (state == IP_VS_STATE_BACKUP) {
+ if (!ipvs->backup_threads)
+ return -ESRCH;
+
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
+ }
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return retc;
+}
+
+/*
+ * Initialize data struct for each netns
+ */
+int __net_init ip_vs_sync_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
+ return 0;
+}
+
+void ip_vs_sync_net_cleanup(struct net *net)
+{
+ int retc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Master Daemon\n");
+
+ retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Backup Daemon\n");
+ mutex_unlock(&ipvs->sync_mutex);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000..6b366fd90
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,116 @@
+/*
+ * IPVS: Weighted Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+ .name = "wlc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+ .schedule = ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000..17e6d4406
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,270 @@
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/* The WRR algorithm depends on some caclulations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di but we run in parallel with
+ * weight changes, it is possible some dest weight to be reduced
+ * below di, bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated, now it is reduced with (di - 1),
+ * so that last cw is 1 to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+ struct ip_vs_dest *cl; /* current dest or head */
+ int cw; /* current weight */
+ int mw; /* maximum weight */
+ int di; /* decreasing interval */
+ struct rcu_head rcu_head;
+};
+
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g ? g : 1;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int new_weight, weight = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ new_weight = atomic_read(&dest->weight);
+ if (new_weight > weight)
+ weight = new_weight;
+ }
+
+ return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark;
+
+ /*
+ * Allocate the mark variable for WRR scheduling
+ */
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
+ if (mark == NULL)
+ return -ENOMEM;
+
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
+ svc->sched_data = mark;
+
+ return 0;
+}
+
+
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ /*
+ * Release the mark variable
+ */
+ kfree_rcu(mark, rcu_head);
+}
+
+
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *last, *stop = NULL;
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+ bool last_pass = false, restarted = false;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
+ while (1) {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
+ }
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
+ }
+ }
+
+found:
+ IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt),
+ atomic_read(&dest->weight));
+ mark->cl = dest;
+
+ out:
+ spin_unlock_bh(&svc->sched_lock);
+ return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+ .name = "wrr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+ .init_service = ip_vs_wrr_init_svc,
+ .done_service = ip_vs_wrr_done_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
+ .schedule = ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000..19986ec5f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1380 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
+#include <net/addrconf.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
+ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
+};
+
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
+
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
+
+ if (!dest_dst)
+ return NULL;
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
+ return NULL;
+ return dest_dst;
+}
+
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+ if (IP6CB(skb)->frag_max_size) {
+ /* frag_max_size tell us that, this packet have been
+ * defragmented by netfilter IPv6 conntrack module.
+ */
+ if (IP6CB(skb)->frag_max_size > mtu)
+ return true; /* largest fragment violate MTU */
+ }
+ else if (skb->len > mtu && !skb_is_gso(skb)) {
+ return true; /* Packet size violate MTU size */
+ }
+ return false;
+}
+
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+ FLOWI_FLAG_KNOWN_NH : 0;
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+ return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
+}
+#endif
+
+static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
+ int rt_mode,
+ bool new_rt_is_local)
+{
+ bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
+ bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
+ bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
+ bool source_is_loopback;
+ bool old_rt_is_local;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
+
+ source_is_loopback =
+ (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (addr_type & IPV6_ADDR_LOOPBACK);
+ old_rt_is_local = __ip_vs_is_local_route6(
+ (struct rt6_info *)skb_dst(skb));
+ } else
+#endif
+ {
+ source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
+ old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ }
+
+ if (unlikely(new_rt_is_local)) {
+ if (!rt_mode_allow_local)
+ return true;
+ if (!rt_mode_allow_redirect && !old_rt_is_local)
+ return true;
+ } else {
+ if (!rt_mode_allow_non_local)
+ return true;
+ if (source_is_loopback)
+ return true;
+ }
+ return false;
+}
+
+static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *ort = skb_rtable(skb);
+
+ if (!skb->dev && sk && sk_fullsock(sk))
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+}
+
+static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
+ struct ip_vs_iphdr *ipvsh,
+ struct sk_buff *skb, int mtu)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n",
+ &ipv6_hdr(skb)->saddr);
+ return false;
+ }
+ } else
+#endif
+ {
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ /* If we're going to tunnel the packet and pmtu discovery
+ * is disabled, we'll just fragment it anyway
+ */
+ if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
+ return true;
+
+ if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n",
+ &ip_hdr(skb)->saddr);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Get route to destination or remote server */
+static int
+__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+ __be32 daddr, int rt_mode, __be32 *ret_saddr,
+ struct ip_vs_iphdr *ipvsh)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rtable *rt; /* Route to the other host */
+ int mtu;
+ int local, noref = 1;
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
+ if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
+ } else {
+ __be32 saddr = htonl(INADDR_ANY);
+
+ noref = 0;
+
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ if (!rt)
+ goto err_unreach;
+ if (ret_saddr)
+ *ret_saddr = saddr;
+ }
+
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
+ if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
+ local))) {
+ IP_VS_DBG_RL("We are crossing local and non-local addresses"
+ " daddr=%pI4\n", &daddr);
+ goto err_put;
+ }
+
+ if (unlikely(local)) {
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
+ }
+
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ } else {
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
+ }
+ maybe_update_pmtu(skb_af, skb, mtu);
+ }
+
+ if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ goto err_put;
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+ struct in6_addr *ret_saddr, int do_xfrm)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error)
+ goto out_err;
+ if (!ret_saddr)
+ return dst;
+ if (ipv6_addr_any(&fl6.saddr) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6.daddr, 0, &fl6.saddr) < 0)
+ goto out_err;
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ *ret_saddr = fl6.saddr;
+ return dst;
+
+out_err:
+ dst_release(dst);
+ IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+ return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ */
+static int
+__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct in6_addr *daddr, struct in6_addr *ret_saddr,
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rt6_info *rt; /* Route to the other host */
+ struct dst_entry *dst;
+ int mtu;
+ int local, noref = 1;
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
+ u32 cookie;
+
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+ &dest_dst->dst_saddr.in6,
+ do_xfrm);
+ if (!dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ rt = (struct rt6_info *) dst;
+ cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.in6;
+ } else {
+ noref = 0;
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ if (!dst)
+ goto err_unreach;
+ rt = (struct rt6_info *) dst;
+ }
+
+ local = __ip_vs_is_local_route6(rt);
+
+ if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
+ local))) {
+ IP_VS_DBG_RL("We are crossing local and non-local addresses"
+ " daddr=%pI6\n", daddr);
+ goto err_put;
+ }
+
+ if (unlikely(local)) {
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
+ }
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ maybe_update_pmtu(skb_af, skb, mtu);
+ }
+
+ if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ goto err_put;
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+#endif
+
+
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
+{
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output_sk);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output_sk);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
+
+
+/*
+ * NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ /* we do not touch skb and do not need pskb ptr */
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+}
+
+
+/*
+ * Bypass transmitter
+ * Let packets bypass the destination when the destination is not
+ * available, it may be only used in transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr,
+ IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
+ goto tx_error;
+
+ ip_send_check(iph);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ EnterFunction(10);
+
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
+ goto tx_error;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+/*
+ * NAT transmitter (only for outside-to-inside nat forwarding)
+ * Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rtable *rt; /* Route to the other host */
+ int local, rc, was_input;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ __be16 _pt, *p;
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ip_hdr(skb)->daddr = cp->daddr.ip;
+ ip_send_check(ip_hdr(skb));
+
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int local, rc;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
+ __be16 _pt, *p;
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+tx_error:
+ LeaveFunction(10);
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NF_STOLEN;
+}
+#endif
+
+/* When forwarding a packet, we must ensure that we've got enough headroom
+ * for the encapsulation packet in the skb. This also gives us an
+ * opportunity to figure out what the payload_len, dsfield, ttl, and df
+ * values should be, so that we won't need to look at the old ip header
+ * again
+ */
+static struct sk_buff *
+ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
+ unsigned int max_headroom, __u8 *next_protocol,
+ __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
+ __be16 *df)
+{
+ struct sk_buff *new_skb = NULL;
+ struct iphdr *old_iph = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ struct ipv6hdr *old_ipv6h = NULL;
+#endif
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
+ new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb)
+ goto error;
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ consume_skb(skb);
+ skb = new_skb;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ old_ipv6h = ipv6_hdr(skb);
+ *next_protocol = IPPROTO_IPV6;
+ if (payload_len)
+ *payload_len =
+ ntohs(old_ipv6h->payload_len) +
+ sizeof(*old_ipv6h);
+ *dsfield = ipv6_get_dsfield(old_ipv6h);
+ *ttl = old_ipv6h->hop_limit;
+ if (df)
+ *df = 0;
+ } else
+#endif
+ {
+ old_iph = ip_hdr(skb);
+ /* Copy DF, reset fragment offset and MF */
+ if (df)
+ *df = (old_iph->frag_off & htons(IP_DF));
+ *next_protocol = IPPROTO_IPIP;
+
+ /* fix old IP header checksum */
+ ip_send_check(old_iph);
+ *dsfield = ipv4_get_dsfield(old_iph);
+ *ttl = old_iph->ttl;
+ if (payload_len)
+ *payload_len = ntohs(old_iph->tot_len);
+ }
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(-ENOMEM);
+}
+
+static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
+{
+ if (encaps_af == AF_INET) {
+ if (orig_af == AF_INET)
+ return SKB_GSO_IPIP;
+
+ return SKB_GSO_SIT;
+ }
+
+ /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
+ * SKB_GSO_SIT/IPV6
+ */
+ return 0;
+}
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, its
+ * destination will be set to cp->daddr. Most code of this function
+ * is taken from ipip.c.
+ *
+ * It is used in VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ * the encapsulated packet, it will decapsulate the packet, processe
+ * the request and return the response packets directly to the client
+ * without passing the load balancer. This can greatly increase the
+ * scalability of virtual server.
+ *
+ * Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct net *net = skb_net(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct rtable *rt; /* Route to the other host */
+ __be32 saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ __u8 next_protocol = 0;
+ __u8 dsfield = 0;
+ __u8 ttl = 0;
+ __be16 df = 0;
+ __be16 *dfp = NULL;
+ struct iphdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+ }
+
+ rt = skb_rtable(skb);
+ tdev = rt->dst.dev;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+ /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
+ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
+ skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
+ &next_protocol, NULL, &dsfield,
+ &ttl, dfp);
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(
+ skb, false, __tun_gso_type_mask(AF_INET, cp->af));
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb->transport_header = skb->network_header;
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = next_protocol;
+ iph->tos = dsfield;
+ iph->daddr = cp->daddr.ip;
+ iph->saddr = saddr;
+ iph->ttl = ttl;
+ ip_select_ident(net, skb, NULL);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+ tx_error:
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ struct in6_addr saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ __u8 next_protocol = 0;
+ __u32 payload_len = 0;
+ __u8 dsfield = 0;
+ __u8 ttl = 0;
+ struct ipv6hdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+ }
+
+ rt = (struct rt6_info *) skb_dst(skb);
+ tdev = rt->dst.dev;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+ skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
+ &next_protocol, &payload_len,
+ &dsfield, &ttl, NULL);
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(
+ skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb->transport_header = skb->network_header;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ipv6_hdr(skb);
+ iph->version = 6;
+ iph->nexthdr = next_protocol;
+ iph->payload_len = htons(payload_len);
+ memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+ ipv6_change_dsfield(iph, 0, dsfield);
+ iph->daddr = cp->daddr.in6;
+ iph->saddr = saddr;
+ iph->hop_limit = ttl;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+tx_error:
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * Direct Routing transmitter
+ * Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ int local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+ }
+
+ ip_send_check(ip_hdr(skb));
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ int local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+ }
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * ICMP packet transmitter
+ * called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
+{
+ struct rtable *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode, was_input;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, iph);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+ was_input = rt_is_input_route(skb_rtable(skb));
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
+ NULL, iph);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+ out:
+ LeaveFunction(10);
+ return rc;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+out:
+ LeaveFunction(10);
+ return rc;
+}
+#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
new file mode 100644
index 000000000..45da11afa
--- /dev/null
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -0,0 +1,135 @@
+/* Accouting handling for netfilter. */
+
+/*
+ * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+static bool nf_ct_acct __read_mostly;
+
+module_param_named(acct, nf_ct_acct, bool, 0644);
+MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table acct_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_acct",
+ .data = &init_net.ct.sysctl_acct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+ struct nf_conn_acct *acct;
+ struct nf_conn_counter *counter;
+
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
+ return 0;
+
+ counter = acct->counter;
+ seq_printf(s, "packets=%llu bytes=%llu ",
+ (unsigned long long)atomic64_read(&counter[dir].packets),
+ (unsigned long long)atomic64_read(&counter[dir].bytes));
+
+ return 0;
+};
+EXPORT_SYMBOL_GPL(seq_print_acct);
+
+static struct nf_ct_ext_type acct_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_acct),
+ .align = __alignof__(struct nf_conn_acct),
+ .id = NF_CT_EXT_ACCT,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_acct;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
+ table);
+ if (!net->ct.acct_sysctl_header) {
+ printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.acct_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_acct_pernet_init(struct net *net)
+{
+ net->ct.sysctl_acct = nf_ct_acct;
+ return nf_conntrack_acct_init_sysctl(net);
+}
+
+void nf_conntrack_acct_pernet_fini(struct net *net)
+{
+ nf_conntrack_acct_fini_sysctl(net);
+}
+
+int nf_conntrack_acct_init(void)
+{
+ int ret = nf_ct_extend_register(&acct_extend);
+ if (ret < 0)
+ pr_err("nf_conntrack_acct: Unable to register extension\n");
+ return ret;
+}
+
+void nf_conntrack_acct_fini(void)
+{
+ nf_ct_extend_unregister(&acct_extend);
+}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
new file mode 100644
index 000000000..57a26cc90
--- /dev/null
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -0,0 +1,237 @@
+/* Amanda extension for IP connection tracking
+ *
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on HW's ip_conntrack_irc.c as well as other modules
+ * (C) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/textsearch.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+#include <linux/gfp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+static unsigned int master_timeout __read_mostly = 300;
+static char *ts_algo = "kmp";
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda connection tracking module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_amanda");
+MODULE_ALIAS_NFCT_HELPER("amanda");
+
+module_param(master_timeout, uint, 0600);
+MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+module_param(ts_algo, charp, 0400);
+MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
+
+unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+ __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_amanda_hook);
+
+enum amanda_strings {
+ SEARCH_CONNECT,
+ SEARCH_NEWLINE,
+ SEARCH_DATA,
+ SEARCH_MESG,
+ SEARCH_INDEX,
+};
+
+static struct {
+ const char *string;
+ size_t len;
+ struct ts_config *ts;
+} search[] __read_mostly = {
+ [SEARCH_CONNECT] = {
+ .string = "CONNECT ",
+ .len = 8,
+ },
+ [SEARCH_NEWLINE] = {
+ .string = "\n",
+ .len = 1,
+ },
+ [SEARCH_DATA] = {
+ .string = "DATA ",
+ .len = 5,
+ },
+ [SEARCH_MESG] = {
+ .string = "MESG ",
+ .len = 5,
+ },
+ [SEARCH_INDEX] = {
+ .string = "INDEX ",
+ .len = 6,
+ },
+};
+
+static int amanda_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ unsigned int dataoff, start, stop, off, i;
+ char pbuf[sizeof("65535")], *tmp;
+ u_int16_t len;
+ __be16 port;
+ int ret = NF_ACCEPT;
+ typeof(nf_nat_amanda_hook) nf_nat_amanda;
+
+ /* Only look at packets from the Amanda server */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ /* increase the UDP timeout of the master connection as replies from
+ * Amanda clients to the server can be quite delayed */
+ nf_ct_refresh(ct, skb, master_timeout * HZ);
+
+ /* No data? */
+ dataoff = protoff + sizeof(struct udphdr);
+ if (dataoff >= skb->len) {
+ net_err_ratelimited("amanda_help: skblen = %u\n", skb->len);
+ return NF_ACCEPT;
+ }
+
+ start = skb_find_text(skb, dataoff, skb->len,
+ search[SEARCH_CONNECT].ts);
+ if (start == UINT_MAX)
+ goto out;
+ start += dataoff + search[SEARCH_CONNECT].len;
+
+ stop = skb_find_text(skb, start, skb->len,
+ search[SEARCH_NEWLINE].ts);
+ if (stop == UINT_MAX)
+ goto out;
+ stop += start;
+
+ for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
+ off = skb_find_text(skb, start, stop, search[i].ts);
+ if (off == UINT_MAX)
+ continue;
+ off += start + search[i].len;
+
+ len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);
+ if (skb_copy_bits(skb, off, pbuf, len))
+ break;
+ pbuf[len] = '\0';
+
+ port = htons(simple_strtoul(pbuf, &tmp, 10));
+ len = tmp - pbuf;
+ if (port == 0 || len > 5)
+ break;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &port);
+
+ nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
+ if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_amanda(skb, ctinfo, protoff,
+ off - dataoff, len, exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+ nf_ct_expect_put(exp);
+ }
+
+out:
+ return ret;
+}
+
+static const struct nf_conntrack_expect_policy amanda_exp_policy = {
+ .max_expected = 3,
+ .timeout = 180,
+};
+
+static struct nf_conntrack_helper amanda_helper[2] __read_mostly = {
+ {
+ .name = "amanda",
+ .me = THIS_MODULE,
+ .help = amanda_help,
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(10080),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .expect_policy = &amanda_exp_policy,
+ },
+ {
+ .name = "amanda",
+ .me = THIS_MODULE,
+ .help = amanda_help,
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.udp.port = cpu_to_be16(10080),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .expect_policy = &amanda_exp_policy,
+ },
+};
+
+static void __exit nf_conntrack_amanda_fini(void)
+{
+ int i;
+
+ nf_conntrack_helper_unregister(&amanda_helper[0]);
+ nf_conntrack_helper_unregister(&amanda_helper[1]);
+ for (i = 0; i < ARRAY_SIZE(search); i++)
+ textsearch_destroy(search[i].ts);
+}
+
+static int __init nf_conntrack_amanda_init(void)
+{
+ int ret, i;
+
+ for (i = 0; i < ARRAY_SIZE(search); i++) {
+ search[i].ts = textsearch_prepare(ts_algo, search[i].string,
+ search[i].len,
+ GFP_KERNEL, TS_AUTOLOAD);
+ if (IS_ERR(search[i].ts)) {
+ ret = PTR_ERR(search[i].ts);
+ goto err1;
+ }
+ }
+ ret = nf_conntrack_helper_register(&amanda_helper[0]);
+ if (ret < 0)
+ goto err1;
+ ret = nf_conntrack_helper_register(&amanda_helper[1]);
+ if (ret < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_conntrack_helper_unregister(&amanda_helper[0]);
+err1:
+ while (--i >= 0)
+ textsearch_destroy(search[i].ts);
+
+ return ret;
+}
+
+module_init(nf_conntrack_amanda_init);
+module_exit(nf_conntrack_amanda_fini);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 000000000..4e99cca61
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ * broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int timeout)
+{
+ struct nf_conntrack_expect *exp;
+ struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt = skb_rtable(skb);
+ struct in_device *in_dev;
+ struct nf_conn_help *help = nfct_help(ct);
+ __be32 mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if (skb->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+
+ exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ exp->mask.src.u3.ip = mask;
+ exp->mask.src.u.udp.port = htons(0xFFFF);
+
+ exp->expectfn = NULL;
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
+ exp->helper = NULL;
+
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+
+ nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
new file mode 100644
index 000000000..13fad8668
--- /dev/null
+++ b/net/netfilter/nf_conntrack_core.c
@@ -0,0 +1,1821 @@
+/* Connection state tracking for netfilter. This is separated from,
+ but required by, the NAT layer; it can also be used by an iptables
+ extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/rculist_nulls.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+#define NF_CONNTRACK_VERSION "0.5.0"
+
+int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr) __read_mostly;
+EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
+
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+ h1 %= CONNTRACK_LOCKS;
+ h2 %= CONNTRACK_LOCKS;
+ spin_unlock(&nf_conntrack_locks[h1]);
+ if (h1 != h2)
+ spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+ unsigned int h2, unsigned int sequence)
+{
+ h1 %= CONNTRACK_LOCKS;
+ h2 %= CONNTRACK_LOCKS;
+ if (h1 <= h2) {
+ spin_lock(&nf_conntrack_locks[h1]);
+ if (h1 != h2)
+ spin_lock_nested(&nf_conntrack_locks[h2],
+ SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&nf_conntrack_locks[h2]);
+ spin_lock_nested(&nf_conntrack_locks[h1],
+ SINGLE_DEPTH_NESTING);
+ }
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ nf_conntrack_double_unlock(h1, h2);
+ return true;
+ }
+ return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+ int i;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+ int i;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_unlock(&nf_conntrack_locks[i]);
+}
+
+unsigned int nf_conntrack_htable_size __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
+
+unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
+
+DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
+
+unsigned int nf_conntrack_hash_rnd __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
+
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
+{
+ unsigned int n;
+
+ /* The direction must be ignored, so we hash everything up to the
+ * destination ports (which is a multiple of 4) and treat the last
+ * three bytes manually.
+ */
+ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
+ return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ (((__force __u16)tuple->dst.u.all << 16) |
+ tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+ return reciprocal_scale(hash, size);
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+ return __hash_bucket(hash, net->ct.htable_size);
+}
+
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
+{
+ return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
+}
+
+static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
+}
+
+bool
+nf_ct_get_tuple(const struct sk_buff *skb,
+ unsigned int nhoff,
+ unsigned int dataoff,
+ u_int16_t l3num,
+ u_int8_t protonum,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_l4proto *l4proto)
+{
+ memset(tuple, 0, sizeof(*tuple));
+
+ tuple->src.l3num = l3num;
+ if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+ return false;
+
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ return l4proto->pkt_to_tuple(skb, dataoff, tuple);
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
+
+bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
+ u_int16_t l3num, struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ unsigned int protoff;
+ u_int8_t protonum;
+ int ret;
+
+ rcu_read_lock();
+
+ l3proto = __nf_ct_l3proto_find(l3num);
+ ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
+ if (ret != NF_ACCEPT) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ l4proto = __nf_ct_l4proto_find(l3num, protonum);
+
+ ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
+ l3proto, l4proto);
+
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
+
+bool
+nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_l4proto *l4proto)
+{
+ memset(inverse, 0, sizeof(*inverse));
+
+ inverse->src.l3num = orig->src.l3num;
+ if (l3proto->invert_tuple(inverse, orig) == 0)
+ return false;
+
+ inverse->dst.dir = !orig->dst.dir;
+
+ inverse->dst.protonum = orig->dst.protonum;
+ return l4proto->invert_tuple(inverse, orig);
+}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
+
+static void
+clean_from_lists(struct nf_conn *ct)
+{
+ pr_debug("clean_from_lists(%p)\n", ct);
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+
+ /* Destroy all pending expectations */
+ nf_ct_remove_expectations(ct);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* add this conntrack to the (per cpu) dying list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->dying);
+ spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* add this conntrack to the (per cpu) unconfirmed list */
+ ct->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->unconfirmed);
+ spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
+{
+ struct ct_pcpu *pcpu;
+
+ /* We overload first tuple to link into unconfirmed or dying list.*/
+ pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+ spin_lock(&pcpu->lock);
+ BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&pcpu->lock);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+ struct nf_conn *ct = (struct nf_conn *)nfct;
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_l4proto *l4proto;
+
+ pr_debug("destroy_conntrack(%p)\n", ct);
+ NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+ NF_CT_ASSERT(!timer_pending(&ct->timeout));
+
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto && l4proto->destroy)
+ l4proto->destroy(ct);
+
+ rcu_read_unlock();
+
+ local_bh_disable();
+ /* Expectations will have been removed in clean_from_lists,
+ * except TFTP can create an expectation on the first packet,
+ * before connection is in the list, so we need to clean here,
+ * too.
+ */
+ nf_ct_remove_expectations(ct);
+
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
+
+ NF_CT_STAT_INC(net, delete);
+ local_bh_enable();
+
+ if (ct->master)
+ nf_ct_put(ct->master);
+
+ pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ nf_conntrack_free(ct);
+}
+
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned int hash, reply_hash;
+ u16 zone = nf_ct_zone(ct);
+ unsigned int sequence;
+
+ nf_ct_helper_destroy(ct);
+
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
+ clean_from_lists(ct);
+ nf_conntrack_double_unlock(hash, reply_hash);
+
+ nf_ct_add_to_dying_list(ct);
+
+ NF_CT_STAT_INC(net, delete_list);
+ local_bh_enable();
+}
+
+bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
+{
+ struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp && tstamp->stop == 0)
+ tstamp->stop = ktime_get_real_ns();
+
+ if (nf_ct_is_dying(ct))
+ goto delete;
+
+ if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+ portid, report) < 0) {
+ /* destroy event was not delivered */
+ nf_ct_delete_from_lists(ct);
+ nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
+ return false;
+ }
+
+ nf_conntrack_ecache_work(nf_ct_net(ct));
+ set_bit(IPS_DYING_BIT, &ct->status);
+ delete:
+ nf_ct_delete_from_lists(ct);
+ nf_ct_put(ct);
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
+}
+
+static inline bool
+nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
+ const struct nf_conntrack_tuple *tuple,
+ u16 zone)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* A conntrack can be recreated with the equal tuple,
+ * so we need to check that the conntrack is confirmed
+ */
+ return nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone &&
+ nf_ct_is_confirmed(ct);
+}
+
+/*
+ * Warning :
+ * - Caller must take a reference on returned object
+ * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
+ */
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ unsigned int bucket = hash_bucket(hash, net);
+
+ /* Disable BHs the entire time since we normally need to disable them
+ * at least once for the stats anyway.
+ */
+ local_bh_disable();
+begin:
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
+ if (nf_ct_key_equal(h, tuple, zone)) {
+ NF_CT_STAT_INC(net, found);
+ local_bh_enable();
+ return h;
+ }
+ NF_CT_STAT_INC(net, searched);
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(n) != bucket) {
+ NF_CT_STAT_INC(net, search_restart);
+ goto begin;
+ }
+ local_bh_enable();
+
+ return NULL;
+}
+
+/* Find a connection corresponding to a tuple. */
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+
+ rcu_read_lock();
+begin:
+ h = ____nf_conntrack_find(net, zone, tuple, hash);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (unlikely(nf_ct_is_dying(ct) ||
+ !atomic_inc_not_zero(&ct->ct_general.use)))
+ h = NULL;
+ else {
+ if (unlikely(!nf_ct_key_equal(h, tuple, zone))) {
+ nf_ct_put(ct);
+ goto begin;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return h;
+}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
+
+static void __nf_conntrack_hash_insert(struct nf_conn *ct,
+ unsigned int hash,
+ unsigned int reply_hash)
+{
+ struct net *net = nf_ct_net(ct);
+
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &net->ct.hash[hash]);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+ &net->ct.hash[reply_hash]);
+}
+
+int
+nf_conntrack_hash_check_insert(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned int hash, reply_hash;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ u16 zone;
+ unsigned int sequence;
+
+ zone = nf_ct_zone(ct);
+
+ local_bh_disable();
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
+ /* See if there's one in the list already, including reverse */
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+
+ add_timer(&ct->timeout);
+ smp_wmb();
+ /* The caller holds a reference to this object */
+ atomic_set(&ct->ct_general.use, 2);
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert);
+ local_bh_enable();
+ return 0;
+
+out:
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert_failed);
+ local_bh_enable();
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
+
+/* deletion from this larval template list happens via nf_ct_put() */
+void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
+{
+ struct ct_pcpu *pcpu;
+
+ __set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
+ __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+ nf_conntrack_get(&tmpl->ct_general);
+
+ /* add this conntrack to the (per cpu) tmpl list */
+ local_bh_disable();
+ tmpl->cpu = smp_processor_id();
+ pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+ spin_lock(&pcpu->lock);
+ /* Overload tuple linked list to put us in template list. */
+ hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &pcpu->tmpl);
+ spin_unlock_bh(&pcpu->lock);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
+
+/* Confirm a connection given skb; places it in hash table */
+int
+__nf_conntrack_confirm(struct sk_buff *skb)
+{
+ unsigned int hash, reply_hash;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conn_help *help;
+ struct nf_conn_tstamp *tstamp;
+ struct hlist_nulls_node *n;
+ enum ip_conntrack_info ctinfo;
+ struct net *net;
+ u16 zone;
+ unsigned int sequence;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ net = nf_ct_net(ct);
+
+ /* ipt_REJECT uses nf_conntrack_attach to attach related
+ ICMP/TCP RST packets in other direction. Actual packet
+ which created connection will be IP_CT_NEW or for an
+ expected connection, IP_CT_RELATED. */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ zone = nf_ct_zone(ct);
+ local_bh_disable();
+
+ do {
+ sequence = read_seqcount_begin(&net->ct.generation);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ reply_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
+ /* We're not in hash table, and we refuse to set up related
+ * connections for unconfirmed conns. But packet copies and
+ * REJECT will give spurious warnings here.
+ */
+ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+ /* No external references means no one else could have
+ * confirmed us.
+ */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+ pr_debug("Confirming conntrack %p\n", ct);
+ /* We have to check the DYING flag after unlink to prevent
+ * a race against nf_ct_get_next_corpse() possibly called from
+ * user context, else we insert an already 'dead' hash, blocking
+ * further use of that particular connection -JM.
+ */
+ nf_ct_del_from_dying_or_unconfirmed_list(ct);
+
+ if (unlikely(nf_ct_is_dying(ct)))
+ goto out;
+
+ /* See if there's one in the list already, including reverse:
+ NAT could have grabbed it without realizing, since we're
+ not in the hash. If there is, we lost race. */
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &h->tuple) &&
+ zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+ goto out;
+
+ /* Timer relative to confirmation time, not original
+ setting time, otherwise we'd get timer wrap in
+ weird delay cases. */
+ ct->timeout.expires += jiffies;
+ add_timer(&ct->timeout);
+ atomic_inc(&ct->ct_general.use);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp(skb);
+
+ tstamp->start = ktime_to_ns(skb->tstamp);
+ }
+ /* Since the lookup is lockless, hash insertion must be done after
+ * starting the timer and setting the CONFIRMED bit. The RCU barriers
+ * guarantee that no other CPU can find the conntrack before the above
+ * stores are visible.
+ */
+ __nf_conntrack_hash_insert(ct, hash, reply_hash);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert);
+ local_bh_enable();
+
+ help = nfct_help(ct);
+ if (help && help->helper)
+ nf_conntrack_event_cache(IPCT_HELPER, ct);
+
+ nf_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, ct);
+ return NF_ACCEPT;
+
+out:
+ nf_ct_add_to_dying_list(ct);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ NF_CT_STAT_INC(net, insert_failed);
+ local_bh_enable();
+ return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
+
+/* Returns true if a connection correspondings to the tuple (required
+ for NAT). */
+int
+nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ struct net *net = nf_ct_net(ignored_conntrack);
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *ct;
+ u16 zone = nf_ct_zone(ignored_conntrack);
+ unsigned int hash = hash_conntrack(net, zone, tuple);
+
+ /* Disable BHs the entire time since we need to disable them at
+ * least once for the stats anyway.
+ */
+ rcu_read_lock_bh();
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (ct != ignored_conntrack &&
+ nf_ct_tuple_equal(tuple, &h->tuple) &&
+ nf_ct_zone(ct) == zone) {
+ NF_CT_STAT_INC(net, found);
+ rcu_read_unlock_bh();
+ return 1;
+ }
+ NF_CT_STAT_INC(net, searched);
+ }
+ rcu_read_unlock_bh();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+
+#define NF_CT_EVICTION_RANGE 8
+
+/* There's a small race here where we may free a just-assured
+ connection. Too bad: we're in trouble anyway. */
+static noinline int early_drop(struct net *net, unsigned int _hash)
+{
+ /* Use oldest entry, which is roughly LRU */
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct = NULL, *tmp;
+ struct hlist_nulls_node *n;
+ unsigned int i = 0, cnt = 0;
+ int dropped = 0;
+ unsigned int hash, sequence;
+ spinlock_t *lockp;
+
+ local_bh_disable();
+restart:
+ sequence = read_seqcount_begin(&net->ct.generation);
+ hash = hash_bucket(_hash, net);
+ for (; i < net->ct.htable_size; i++) {
+ lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (read_seqcount_retry(&net->ct.generation, sequence)) {
+ spin_unlock(lockp);
+ goto restart;
+ }
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
+ hnnode) {
+ tmp = nf_ct_tuplehash_to_ctrack(h);
+ if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+ !nf_ct_is_dying(tmp) &&
+ atomic_inc_not_zero(&tmp->ct_general.use)) {
+ ct = tmp;
+ break;
+ }
+ cnt++;
+ }
+
+ hash = (hash + 1) % net->ct.htable_size;
+ spin_unlock(lockp);
+
+ if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ break;
+
+ }
+ local_bh_enable();
+
+ if (!ct)
+ return dropped;
+
+ if (del_timer(&ct->timeout)) {
+ if (nf_ct_delete(ct, 0, 0)) {
+ dropped = 1;
+ NF_CT_STAT_INC_ATOMIC(net, early_drop);
+ }
+ }
+ nf_ct_put(ct);
+ return dropped;
+}
+
+void init_nf_conntrack_hash_rnd(void)
+{
+ unsigned int rand;
+
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+}
+
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp, u32 hash)
+{
+ struct nf_conn *ct;
+
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ init_nf_conntrack_hash_rnd();
+ /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+ hash = hash_conntrack_raw(orig, zone);
+ }
+
+ /* We don't want any race condition at early drop stage */
+ atomic_inc(&net->ct.count);
+
+ if (nf_conntrack_max &&
+ unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+ if (!early_drop(net, hash)) {
+ atomic_dec(&net->ct.count);
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ /*
+ * Do not use kmem_cache_zalloc(), as this cache uses
+ * SLAB_DESTROY_BY_RCU.
+ */
+ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ if (ct == NULL) {
+ atomic_dec(&net->ct.count);
+ return ERR_PTR(-ENOMEM);
+ }
+ spin_lock_init(&ct->lock);
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+ /* save hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
+ ct->status = 0;
+ /* Don't set timer yet: wait for confirmation */
+ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
+ write_pnet(&ct->ct_net, net);
+ memset(&ct->__nfct_init_offset[0], 0,
+ offsetof(struct nf_conn, proto) -
+ offsetof(struct nf_conn, __nfct_init_offset[0]));
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ if (zone) {
+ struct nf_conntrack_zone *nf_ct_zone;
+
+ nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
+ if (!nf_ct_zone)
+ goto out_free;
+ nf_ct_zone->id = zone;
+ }
+#endif
+ /* Because we use RCU lookups, we set ct_general.use to zero before
+ * this is inserted in any list.
+ */
+ atomic_set(&ct->ct_general.use, 0);
+ return ct;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+out_free:
+ atomic_dec(&net->ct.count);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ return ERR_PTR(-ENOMEM);
+#endif
+}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
+{
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
+
+void nf_conntrack_free(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ /* A freed object has refcnt == 0, that's
+ * the golden rule for SLAB_DESTROY_BY_RCU
+ */
+ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
+
+ nf_ct_ext_destroy(ct);
+ nf_ct_ext_free(ct);
+ kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+ smp_mb__before_atomic();
+ atomic_dec(&net->ct.count);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_free);
+
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ failed due to stress. Otherwise it really is unclassifiable. */
+static struct nf_conntrack_tuple_hash *
+init_conntrack(struct net *net, struct nf_conn *tmpl,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto,
+ struct nf_conntrack_l4proto *l4proto,
+ struct sk_buff *skb,
+ unsigned int dataoff, u32 hash)
+{
+ struct nf_conn *ct;
+ struct nf_conn_help *help;
+ struct nf_conntrack_tuple repl_tuple;
+ struct nf_conntrack_ecache *ecache;
+ struct nf_conntrack_expect *exp = NULL;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ struct nf_conn_timeout *timeout_ext;
+ unsigned int *timeouts;
+
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+ pr_debug("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ hash);
+ if (IS_ERR(ct))
+ return (struct nf_conntrack_tuple_hash *)ct;
+
+ if (tmpl && nfct_synproxy(tmpl)) {
+ nfct_seqadj_ext_add(ct);
+ nfct_synproxy_ext_add(ct);
+ }
+
+ timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
+ if (timeout_ext)
+ timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
+ else
+ timeouts = l4proto->get_timeouts(net);
+
+ if (!l4proto->new(ct, skb, dataoff, timeouts)) {
+ nf_conntrack_free(ct);
+ pr_debug("init conntrack: can't track with proto module\n");
+ return NULL;
+ }
+
+ if (timeout_ext)
+ nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC);
+
+ nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
+
+ ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
+ nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC);
+
+ local_bh_disable();
+ if (net->ct.expect_count) {
+ spin_lock(&nf_conntrack_expect_lock);
+ exp = nf_ct_find_expectation(net, zone, tuple);
+ if (exp) {
+ pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ ct, exp);
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ __set_bit(IPS_EXPECTED_BIT, &ct->status);
+ /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
+ ct->master = exp->master;
+ if (exp->helper) {
+ help = nf_ct_helper_ext_add(ct, exp->helper,
+ GFP_ATOMIC);
+ if (help)
+ rcu_assign_pointer(help->helper, exp->helper);
+ }
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ ct->mark = exp->master->mark;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ ct->secmark = exp->master->secmark;
+#endif
+ NF_CT_STAT_INC(net, expect_new);
+ }
+ spin_unlock(&nf_conntrack_expect_lock);
+ }
+ if (!exp) {
+ __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
+ NF_CT_STAT_INC(net, new);
+ }
+
+ /* Now it is inserted into the unconfirmed list, bump refcount */
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_add_to_unconfirmed_list(ct);
+
+ local_bh_enable();
+
+ if (exp) {
+ if (exp->expectfn)
+ exp->expectfn(ct, exp);
+ nf_ct_expect_put(exp);
+ }
+
+ return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct nf_conn *
+resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ u_int16_t l3num,
+ u_int8_t protonum,
+ struct nf_conntrack_l3proto *l3proto,
+ struct nf_conntrack_l4proto *l4proto,
+ int *set_reply,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 hash;
+
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
+ dataoff, l3num, protonum, &tuple, l3proto,
+ l4proto)) {
+ pr_debug("resolve_normal_ct: Can't get tuple\n");
+ return NULL;
+ }
+
+ /* look for tuple match */
+ hash = hash_conntrack_raw(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, hash);
+ if (!h) {
+ h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+ skb, dataoff, hash);
+ if (!h)
+ return NULL;
+ if (IS_ERR(h))
+ return (void *)h;
+ }
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* It exists; we have (non-exclusive) reference. */
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
+ *ctinfo = IP_CT_ESTABLISHED_REPLY;
+ /* Please set reply bit if this packet OK */
+ *set_reply = 1;
+ } else {
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ *ctinfo = IP_CT_ESTABLISHED;
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+ pr_debug("nf_conntrack_in: related packet for %p\n",
+ ct);
+ *ctinfo = IP_CT_RELATED;
+ } else {
+ pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ *ctinfo = IP_CT_NEW;
+ }
+ *set_reply = 0;
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
+}
+
+unsigned int
+nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ struct nf_conn *ct, *tmpl = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ unsigned int *timeouts;
+ unsigned int dataoff;
+ u_int8_t protonum;
+ int set_reply = 0;
+ int ret;
+
+ if (skb->nfct) {
+ /* Previously seen (loopback or untracked)? Ignore. */
+ tmpl = (struct nf_conn *)skb->nfct;
+ if (!nf_ct_is_template(tmpl)) {
+ NF_CT_STAT_INC_ATOMIC(net, ignore);
+ return NF_ACCEPT;
+ }
+ skb->nfct = NULL;
+ }
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ l3proto = __nf_ct_l3proto_find(pf);
+ ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
+ &dataoff, &protonum);
+ if (ret <= 0) {
+ pr_debug("not prepared to track yet or error occurred\n");
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
+ }
+
+ l4proto = __nf_ct_l4proto_find(pf, protonum);
+
+ /* It may be an special packet, error, unclean...
+ * inverse of the return code tells to the netfilter
+ * core what to do with the packet. */
+ if (l4proto->error != NULL) {
+ ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
+ pf, hooknum);
+ if (ret <= 0) {
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
+ }
+ /* ICMP[v6] protocol trackers may assign one conntrack. */
+ if (skb->nfct)
+ goto out;
+ }
+
+ ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
+ l3proto, l4proto, &set_reply, &ctinfo);
+ if (!ct) {
+ /* Not valid part of a connection */
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = NF_ACCEPT;
+ goto out;
+ }
+
+ if (IS_ERR(ct)) {
+ /* Too stressed to deal. */
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = NF_DROP;
+ goto out;
+ }
+
+ NF_CT_ASSERT(skb->nfct);
+
+ /* Decide what timeout policy we want to apply to this flow. */
+ timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
+
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
+ if (ret <= 0) {
+ /* Invalid: inverse of the return code tells
+ * the netfilter core what to do */
+ pr_debug("nf_conntrack_in: Can't track with proto module\n");
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ if (ret == -NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = -ret;
+ goto out;
+ }
+
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_REPLY, ct);
+out:
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp. */
+ if (ret == NF_REPEAT)
+ skb->nfct = (struct nf_conntrack *)tmpl;
+ else
+ nf_ct_put(tmpl);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_in);
+
+bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
+ const struct nf_conntrack_tuple *orig)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = nf_ct_invert_tuple(inverse, orig,
+ __nf_ct_l3proto_find(orig->src.l3num),
+ __nf_ct_l4proto_find(orig->src.l3num,
+ orig->dst.protonum));
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
+
+/* Alter reply tuple (maybe alter helper). This is for NAT, and is
+ implicitly racy: see __nf_conntrack_confirm */
+void nf_conntrack_alter_reply(struct nf_conn *ct,
+ const struct nf_conntrack_tuple *newreply)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+
+ /* Should be unconfirmed, so not in hash table yet */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+ pr_debug("Altering reply tuple of %p to ", ct);
+ nf_ct_dump_tuple(newreply);
+
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ if (ct->master || (help && !hlist_empty(&help->expectations)))
+ return;
+
+ rcu_read_lock();
+ __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
+
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __nf_ct_refresh_acct(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ unsigned long extra_jiffies,
+ int do_acct)
+{
+ NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
+ NF_CT_ASSERT(skb);
+
+ /* Only update if this is not a fixed timeout */
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ goto acct;
+
+ /* If not in hash table, timer will not be active yet */
+ if (!nf_ct_is_confirmed(ct)) {
+ ct->timeout.expires = extra_jiffies;
+ } else {
+ unsigned long newtime = jiffies + extra_jiffies;
+
+ /* Only update the timeout if the new timeout is at least
+ HZ jiffies from the old timeout. Need del_timer for race
+ avoidance (may already be dying). */
+ if (newtime - ct->timeout.expires >= HZ)
+ mod_timer_pending(&ct->timeout, newtime);
+ }
+
+acct:
+ if (do_acct) {
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
+
+bool __nf_ct_kill_acct(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ int do_acct)
+{
+ if (do_acct) {
+ struct nf_conn_acct *acct;
+
+ acct = nf_conn_acct_find(ct);
+ if (acct) {
+ struct nf_conn_counter *counter = acct->counter;
+
+ atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
+ atomic64_add(skb->len - skb_network_offset(skb),
+ &counter[CTINFO2DIR(ctinfo)].bytes);
+ }
+ }
+
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.function((unsigned long)ct);
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
+ .len = sizeof(struct nf_conntrack_zone),
+ .align = __alignof__(struct nf_conntrack_zone),
+ .id = NF_CT_EXT_ZONE,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/mutex.h>
+
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
+ nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
+
+const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
+ [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
+};
+EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
+
+int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
+ return -EINVAL;
+
+ t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+ t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
+
+int nf_ct_port_nlattr_tuple_size(void)
+{
+ return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
+#endif
+
+/* Used by ipt_REJECT and ip6t_REJECT. */
+static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This ICMP is in reverse direction to the packet which caused it */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ ctinfo = IP_CT_RELATED_REPLY;
+ else
+ ctinfo = IP_CT_RELATED;
+
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
+}
+
+/* Bring out ya dead! */
+static struct nf_conn *
+get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
+ void *data, unsigned int *bucket)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+ int cpu;
+ spinlock_t *lockp;
+
+ for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+ local_bh_disable();
+ spin_lock(lockp);
+ if (*bucket < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ goto found;
+ }
+ }
+ spin_unlock(lockp);
+ local_bh_enable();
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ set_bit(IPS_DYING_BIT, &ct->status);
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+ return NULL;
+found:
+ atomic_inc(&ct->ct_general.use);
+ spin_unlock(lockp);
+ local_bh_enable();
+ return ct;
+}
+
+void nf_ct_iterate_cleanup(struct net *net,
+ int (*iter)(struct nf_conn *i, void *data),
+ void *data, u32 portid, int report)
+{
+ struct nf_conn *ct;
+ unsigned int bucket = 0;
+
+ while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
+ /* Time to push up daises... */
+ if (del_timer(&ct->timeout))
+ nf_ct_delete(ct, portid, report);
+
+ /* ... else the timer will get him soon. */
+
+ nf_ct_put(ct);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
+
+static int kill_all(struct nf_conn *i, void *data)
+{
+ return 1;
+}
+
+void nf_ct_free_hashtable(void *hash, unsigned int size)
+{
+ if (is_vmalloc_addr(hash))
+ vfree(hash);
+ else
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct hlist_head) * size));
+}
+EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
+
+static int untrack_refs(void)
+{
+ int cnt = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+
+ cnt += atomic_read(&ct->ct_general.use) - 1;
+ }
+ return cnt;
+}
+
+void nf_conntrack_cleanup_start(void)
+{
+ RCU_INIT_POINTER(ip_ct_attach, NULL);
+}
+
+void nf_conntrack_cleanup_end(void)
+{
+ RCU_INIT_POINTER(nf_ct_destroy, NULL);
+ while (untrack_refs() > 0)
+ schedule();
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ nf_ct_extend_unregister(&nf_ct_zone_extend);
+#endif
+ nf_conntrack_proto_fini();
+ nf_conntrack_seqadj_fini();
+ nf_conntrack_labels_fini();
+ nf_conntrack_helper_fini();
+ nf_conntrack_timeout_fini();
+ nf_conntrack_ecache_fini();
+ nf_conntrack_tstamp_fini();
+ nf_conntrack_acct_fini();
+ nf_conntrack_expect_fini();
+}
+
+/*
+ * Mishearing the voices in his head, our hero wonders how he's
+ * supposed to kill the mall.
+ */
+void nf_conntrack_cleanup_net(struct net *net)
+{
+ LIST_HEAD(single);
+
+ list_add(&net->exit_list, &single);
+ nf_conntrack_cleanup_net_list(&single);
+}
+
+void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
+{
+ int busy;
+ struct net *net;
+
+ /*
+ * This makes sure all current packets have passed through
+ * netfilter framework. Roll on, two-stage module
+ * delete...
+ */
+ synchronize_net();
+i_see_dead_people:
+ busy = 0;
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
+ if (atomic_read(&net->ct.count) != 0)
+ busy = 1;
+ }
+ if (busy) {
+ schedule();
+ goto i_see_dead_people;
+ }
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_proto_pernet_fini(net);
+ nf_conntrack_helper_pernet_fini(net);
+ nf_conntrack_ecache_pernet_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+ nf_conntrack_acct_pernet_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
+ free_percpu(net->ct.stat);
+ free_percpu(net->ct.pcpu_lists);
+ }
+}
+
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
+{
+ struct hlist_nulls_head *hash;
+ unsigned int nr_slots, i;
+ size_t sz;
+
+ BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
+ nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+ sz = nr_slots * sizeof(struct hlist_nulls_head);
+ hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+ get_order(sz));
+ if (!hash) {
+ printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+ hash = vzalloc(sz);
+ }
+
+ if (hash && nulls)
+ for (i = 0; i < nr_slots; i++)
+ INIT_HLIST_NULLS_HEAD(&hash[i], i);
+
+ return hash;
+}
+EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
+
+int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, rc;
+ unsigned int hashsize, old_size;
+ struct hlist_nulls_head *hash, *old_hash;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+
+ if (current->nsproxy->net_ns != &init_net)
+ return -EOPNOTSUPP;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!nf_conntrack_htable_size)
+ return param_set_uint(val, kp);
+
+ rc = kstrtouint(val, 0, &hashsize);
+ if (rc)
+ return rc;
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = nf_ct_alloc_hashtable(&hashsize, 1);
+ if (!hash)
+ return -ENOMEM;
+
+ local_bh_disable();
+ nf_conntrack_all_lock();
+ write_seqcount_begin(&init_net.ct.generation);
+
+ /* Lookups in the old hash might happen in parallel, which means we
+ * might get false negatives during connection lookup. New connections
+ * created because of a false negative won't make it into the hash
+ * though since that required taking the locks.
+ */
+
+ for (i = 0; i < init_net.ct.htable_size; i++) {
+ while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
+ h = hlist_nulls_entry(init_net.ct.hash[i].first,
+ struct nf_conntrack_tuple_hash, hnnode);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ hlist_nulls_del_rcu(&h->hnnode);
+ bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
+ hashsize);
+ hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
+ }
+ }
+ old_size = init_net.ct.htable_size;
+ old_hash = init_net.ct.hash;
+
+ init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
+ init_net.ct.hash = hash;
+
+ write_seqcount_end(&init_net.ct.generation);
+ nf_conntrack_all_unlock();
+ local_bh_enable();
+
+ nf_ct_free_hashtable(old_hash, old_size);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
+
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+void nf_ct_untracked_status_or(unsigned long bits)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(nf_conntrack_untracked, cpu).status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
+int nf_conntrack_init_start(void)
+{
+ int max_factor = 8;
+ int i, ret, cpu;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
+ spin_lock_init(&nf_conntrack_locks[i]);
+
+ if (!nf_conntrack_htable_size) {
+ /* Idea from tcp.c: use 1/16384 of memory.
+ * On i386: 32MB machine has 512 buckets.
+ * >= 1GB machines have 16384 buckets.
+ * >= 4GB machines have 65536 buckets.
+ */
+ nf_conntrack_htable_size
+ = (((totalram_pages << PAGE_SHIFT) / 16384)
+ / sizeof(struct hlist_head));
+ if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+ nf_conntrack_htable_size = 65536;
+ else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ nf_conntrack_htable_size = 16384;
+ if (nf_conntrack_htable_size < 32)
+ nf_conntrack_htable_size = 32;
+
+ /* Use a max. factor of four by default to get the same max as
+ * with the old struct list_heads. When a table size is given
+ * we use the old value of 8 to avoid reducing the max.
+ * entries. */
+ max_factor = 4;
+ }
+ nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+
+ printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
+ NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
+ nf_conntrack_max);
+
+ ret = nf_conntrack_expect_init();
+ if (ret < 0)
+ goto err_expect;
+
+ ret = nf_conntrack_acct_init();
+ if (ret < 0)
+ goto err_acct;
+
+ ret = nf_conntrack_tstamp_init();
+ if (ret < 0)
+ goto err_tstamp;
+
+ ret = nf_conntrack_ecache_init();
+ if (ret < 0)
+ goto err_ecache;
+
+ ret = nf_conntrack_timeout_init();
+ if (ret < 0)
+ goto err_timeout;
+
+ ret = nf_conntrack_helper_init();
+ if (ret < 0)
+ goto err_helper;
+
+ ret = nf_conntrack_labels_init();
+ if (ret < 0)
+ goto err_labels;
+
+ ret = nf_conntrack_seqadj_init();
+ if (ret < 0)
+ goto err_seqadj;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ ret = nf_ct_extend_register(&nf_ct_zone_extend);
+ if (ret < 0)
+ goto err_extend;
+#endif
+ ret = nf_conntrack_proto_init();
+ if (ret < 0)
+ goto err_proto;
+
+ /* Set up fake conntrack: to never be deleted, not in any hashes */
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+ write_pnet(&ct->ct_net, &init_net);
+ atomic_set(&ct->ct_general.use, 1);
+ }
+ /* - and look it like as a confirmed connection */
+ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+ return 0;
+
+err_proto:
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ nf_ct_extend_unregister(&nf_ct_zone_extend);
+err_extend:
+#endif
+ nf_conntrack_seqadj_fini();
+err_seqadj:
+ nf_conntrack_labels_fini();
+err_labels:
+ nf_conntrack_helper_fini();
+err_helper:
+ nf_conntrack_timeout_fini();
+err_timeout:
+ nf_conntrack_ecache_fini();
+err_ecache:
+ nf_conntrack_tstamp_fini();
+err_tstamp:
+ nf_conntrack_acct_fini();
+err_acct:
+ nf_conntrack_expect_fini();
+err_expect:
+ return ret;
+}
+
+void nf_conntrack_init_end(void)
+{
+ /* For use by REJECT target */
+ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
+ RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+}
+
+/*
+ * We need to use special "null" values, not used in hash table
+ */
+#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
+#define DYING_NULLS_VAL ((1<<30)+1)
+#define TEMPLATE_NULLS_VAL ((1<<30)+2)
+
+int nf_conntrack_init_net(struct net *net)
+{
+ int ret = -ENOMEM;
+ int cpu;
+
+ atomic_set(&net->ct.count, 0);
+ seqcount_init(&net->ct.generation);
+
+ net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+ if (!net->ct.pcpu_lists)
+ goto err_stat;
+
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_init(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
+ }
+
+ net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+ if (!net->ct.stat)
+ goto err_pcpu_lists;
+
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ if (!net->ct.slabname)
+ goto err_slabname;
+
+ net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ goto err_cache;
+ }
+
+ net->ct.htable_size = nf_conntrack_htable_size;
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
+ if (!net->ct.hash) {
+ printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+ goto err_hash;
+ }
+ ret = nf_conntrack_expect_pernet_init(net);
+ if (ret < 0)
+ goto err_expect;
+ ret = nf_conntrack_acct_pernet_init(net);
+ if (ret < 0)
+ goto err_acct;
+ ret = nf_conntrack_tstamp_pernet_init(net);
+ if (ret < 0)
+ goto err_tstamp;
+ ret = nf_conntrack_ecache_pernet_init(net);
+ if (ret < 0)
+ goto err_ecache;
+ ret = nf_conntrack_helper_pernet_init(net);
+ if (ret < 0)
+ goto err_helper;
+ ret = nf_conntrack_proto_pernet_init(net);
+ if (ret < 0)
+ goto err_proto;
+ return 0;
+
+err_proto:
+ nf_conntrack_helper_pernet_fini(net);
+err_helper:
+ nf_conntrack_ecache_pernet_fini(net);
+err_ecache:
+ nf_conntrack_tstamp_pernet_fini(net);
+err_tstamp:
+ nf_conntrack_acct_pernet_fini(net);
+err_acct:
+ nf_conntrack_expect_pernet_fini(net);
+err_expect:
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+ kfree(net->ct.slabname);
+err_slabname:
+ free_percpu(net->ct.stat);
+err_pcpu_lists:
+ free_percpu(net->ct.pcpu_lists);
+err_stat:
+ return ret;
+}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
new file mode 100644
index 000000000..4e78c57b8
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -0,0 +1,337 @@
+/* Event cache for netfilter. */
+
+/*
+ * (C) 2005 Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 Patrick McHardy <kaber@trash.net>
+ * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static DEFINE_MUTEX(nf_ct_ecache_mutex);
+
+#define ECACHE_RETRY_WAIT (HZ/10)
+
+enum retry_state {
+ STATE_CONGESTED,
+ STATE_RESTART,
+ STATE_DONE,
+};
+
+static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+{
+ struct nf_conn *refs[16];
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ unsigned int evicted = 0;
+ enum retry_state ret = STATE_DONE;
+
+ spin_lock(&pcpu->lock);
+
+ hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (nf_ct_is_dying(ct))
+ continue;
+
+ if (nf_conntrack_event(IPCT_DESTROY, ct)) {
+ ret = STATE_CONGESTED;
+ break;
+ }
+
+ /* we've got the event delivered, now it's dying */
+ set_bit(IPS_DYING_BIT, &ct->status);
+ refs[evicted] = ct;
+
+ if (++evicted >= ARRAY_SIZE(refs)) {
+ ret = STATE_RESTART;
+ break;
+ }
+ }
+
+ spin_unlock(&pcpu->lock);
+
+ /* can't _put while holding lock */
+ while (evicted)
+ nf_ct_put(refs[--evicted]);
+
+ return ret;
+}
+
+static void ecache_work(struct work_struct *work)
+{
+ struct netns_ct *ctnet =
+ container_of(work, struct netns_ct, ecache_dwork.work);
+ int cpu, delay = -1;
+ struct ct_pcpu *pcpu;
+
+ local_bh_disable();
+
+ for_each_possible_cpu(cpu) {
+ enum retry_state ret;
+
+ pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
+
+ ret = ecache_work_evict_list(pcpu);
+
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_WAIT;
+ goto out;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
+ }
+ }
+
+ out:
+ local_bh_enable();
+
+ ctnet->ecache_dwork_pending = delay > 0;
+ if (delay >= 0)
+ schedule_delayed_work(&ctnet->ecache_dwork, delay);
+}
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned long events, missed;
+ struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ int ret;
+
+ rcu_read_lock();
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (notify == NULL)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ goto out_unlock;
+
+ events = xchg(&e->cache, 0);
+
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events)
+ goto out_unlock;
+
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ missed = e->missed;
+
+ if (!((events | missed) & e->ctmask))
+ goto out_unlock;
+
+ item.ct = ct;
+ item.portid = 0;
+ item.report = 0;
+
+ ret = notify->fcn(events | missed, &item);
+
+ if (likely(ret >= 0 && !missed))
+ goto out_unlock;
+
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+
+out_unlock:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+
+int nf_conntrack_register_notifier(struct net *net,
+ struct nf_ct_event_notifier *new)
+{
+ int ret;
+ struct nf_ct_event_notifier *notify;
+
+ mutex_lock(&nf_ct_ecache_mutex);
+ notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
+ lockdep_is_held(&nf_ct_ecache_mutex));
+ if (notify != NULL) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
+ ret = 0;
+
+out_unlock:
+ mutex_unlock(&nf_ct_ecache_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+
+void nf_conntrack_unregister_notifier(struct net *net,
+ struct nf_ct_event_notifier *new)
+{
+ struct nf_ct_event_notifier *notify;
+
+ mutex_lock(&nf_ct_ecache_mutex);
+ notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
+ lockdep_is_held(&nf_ct_ecache_mutex));
+ BUG_ON(notify != new);
+ RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
+ mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
+
+int nf_ct_expect_register_notifier(struct net *net,
+ struct nf_exp_event_notifier *new)
+{
+ int ret;
+ struct nf_exp_event_notifier *notify;
+
+ mutex_lock(&nf_ct_ecache_mutex);
+ notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
+ lockdep_is_held(&nf_ct_ecache_mutex));
+ if (notify != NULL) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
+ ret = 0;
+
+out_unlock:
+ mutex_unlock(&nf_ct_ecache_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
+
+void nf_ct_expect_unregister_notifier(struct net *net,
+ struct nf_exp_event_notifier *new)
+{
+ struct nf_exp_event_notifier *notify;
+
+ mutex_lock(&nf_ct_ecache_mutex);
+ notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
+ lockdep_is_held(&nf_ct_ecache_mutex));
+ BUG_ON(notify != new);
+ RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
+ mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+#define NF_CT_EVENTS_DEFAULT 1
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table event_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_events",
+ .data = &init_net.ct.sysctl_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type event_extend __read_mostly = {
+ .len = sizeof(struct nf_conntrack_ecache),
+ .align = __alignof__(struct nf_conntrack_ecache),
+ .id = NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_events;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.event_sysctl_header =
+ register_net_sysctl(net, "net/netfilter", table);
+ if (!net->ct.event_sysctl_header) {
+ printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.event_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.event_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+int nf_conntrack_ecache_pernet_init(struct net *net)
+{
+ net->ct.sysctl_events = nf_ct_events;
+ INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
+ return nf_conntrack_event_init_sysctl(net);
+}
+
+void nf_conntrack_ecache_pernet_fini(struct net *net)
+{
+ cancel_delayed_work_sync(&net->ct.ecache_dwork);
+ nf_conntrack_event_fini_sysctl(net);
+}
+
+int nf_conntrack_ecache_init(void)
+{
+ int ret = nf_ct_extend_register(&event_extend);
+ if (ret < 0)
+ pr_err("nf_ct_event: Unable to register event extension.\n");
+ return ret;
+}
+
+void nf_conntrack_ecache_fini(void)
+{
+ nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
new file mode 100644
index 000000000..7a17070c5
--- /dev/null
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -0,0 +1,659 @@
+/* Expectation handling for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+unsigned int nf_ct_expect_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+
+unsigned int nf_ct_expect_max __read_mostly;
+
+static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+
+/* nf_conntrack_expect helper functions */
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+ u32 portid, int report)
+{
+ struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct net *net = nf_ct_exp_net(exp);
+
+ NF_CT_ASSERT(master_help);
+ NF_CT_ASSERT(!timer_pending(&exp->timeout));
+
+ hlist_del_rcu(&exp->hnode);
+ net->ct.expect_count--;
+
+ hlist_del(&exp->lnode);
+ master_help->expecting[exp->class]--;
+
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
+ nf_ct_expect_put(exp);
+
+ NF_CT_STAT_INC(net, expect_delete);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
+
+static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+{
+ struct nf_conntrack_expect *exp = (void *)ul_expect;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ nf_ct_unlink_expect(exp);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ nf_ct_expect_put(exp);
+}
+
+static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+{
+ unsigned int hash;
+
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ init_nf_conntrack_hash_rnd();
+ }
+
+ hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
+ (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
+ (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
+
+ return reciprocal_scale(hash, nf_ct_expect_hsize);
+}
+
+struct nf_conntrack_expect *
+__nf_ct_expect_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+ unsigned int h;
+
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
+ if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone)
+ return i;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
+
+/* Just find a expectation corresponding to a tuple. */
+struct nf_conntrack_expect *
+nf_ct_expect_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+
+ rcu_read_lock();
+ i = __nf_ct_expect_find(net, zone, tuple);
+ if (i && !atomic_inc_not_zero(&i->use))
+ i = NULL;
+ rcu_read_unlock();
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
+
+/* If an expectation for this connection is found, it gets delete from
+ * global list then returned. */
+struct nf_conntrack_expect *
+nf_ct_find_expectation(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i, *exp = NULL;
+ unsigned int h;
+
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
+ if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
+ nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone) {
+ exp = i;
+ break;
+ }
+ }
+ if (!exp)
+ return NULL;
+
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (!nf_ct_is_confirmed(exp->master))
+ return NULL;
+
+ /* Avoid race with other CPUs, that for exp->master ct, is
+ * about to invoke ->destroy(), or nf_ct_delete() via timeout
+ * or early_drop().
+ *
+ * The atomic_inc_not_zero() check tells: If that fails, we
+ * know that the ct is being destroyed. If it succeeds, we
+ * can be sure the ct cannot disappear underneath.
+ */
+ if (unlikely(nf_ct_is_dying(exp->master) ||
+ !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ return NULL;
+
+ if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ atomic_inc(&exp->use);
+ return exp;
+ } else if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ return exp;
+ }
+ /* Undo exp->master refcnt increase, if del_timer() failed */
+ nf_ct_put(exp->master);
+
+ return NULL;
+}
+
+/* delete all expectations for this conntrack */
+void nf_ct_remove_expectations(struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *next;
+
+ /* Optimization: most connection never expect any others. */
+ if (!help)
+ return;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ /* Part covered by intersection of masks must be unequal,
+ otherwise they clash */
+ struct nf_conntrack_tuple_mask intersect_mask;
+ int count;
+
+ intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
+
+ for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
+ intersect_mask.src.u3.all[count] =
+ a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
+ }
+
+ return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+}
+
+static inline int expect_matches(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ return a->master == b->master && a->class == b->class &&
+ nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+ nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ nf_ct_zone(a->master) == nf_ct_zone(b->master);
+}
+
+/* Generally a bad idea to call this: could have matched already. */
+void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
+
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
+struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
+{
+ struct nf_conntrack_expect *new;
+
+ new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+
+ new->master = me;
+ atomic_set(&new->use, 1);
+ return new;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
+
+void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
+ u_int8_t family,
+ const union nf_inet_addr *saddr,
+ const union nf_inet_addr *daddr,
+ u_int8_t proto, const __be16 *src, const __be16 *dst)
+{
+ int len;
+
+ if (family == AF_INET)
+ len = 4;
+ else
+ len = 16;
+
+ exp->flags = 0;
+ exp->class = class;
+ exp->expectfn = NULL;
+ exp->helper = NULL;
+ exp->tuple.src.l3num = family;
+ exp->tuple.dst.protonum = proto;
+
+ if (saddr) {
+ memcpy(&exp->tuple.src.u3, saddr, len);
+ if (sizeof(exp->tuple.src.u3) > len)
+ /* address needs to be cleared for nf_ct_tuple_equal */
+ memset((void *)&exp->tuple.src.u3 + len, 0x00,
+ sizeof(exp->tuple.src.u3) - len);
+ memset(&exp->mask.src.u3, 0xFF, len);
+ if (sizeof(exp->mask.src.u3) > len)
+ memset((void *)&exp->mask.src.u3 + len, 0x00,
+ sizeof(exp->mask.src.u3) - len);
+ } else {
+ memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
+ memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
+ }
+
+ if (src) {
+ exp->tuple.src.u.all = *src;
+ exp->mask.src.u.all = htons(0xFFFF);
+ } else {
+ exp->tuple.src.u.all = 0;
+ exp->mask.src.u.all = 0;
+ }
+
+ memcpy(&exp->tuple.dst.u3, daddr, len);
+ if (sizeof(exp->tuple.dst.u3) > len)
+ /* address needs to be cleared for nf_ct_tuple_equal */
+ memset((void *)&exp->tuple.dst.u3 + len, 0x00,
+ sizeof(exp->tuple.dst.u3) - len);
+
+ exp->tuple.dst.u.all = *dst;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+ memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
+ memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_init);
+
+static void nf_ct_expect_free_rcu(struct rcu_head *head)
+{
+ struct nf_conntrack_expect *exp;
+
+ exp = container_of(head, struct nf_conntrack_expect, rcu);
+ kmem_cache_free(nf_ct_expect_cachep, exp);
+}
+
+void nf_ct_expect_put(struct nf_conntrack_expect *exp)
+{
+ if (atomic_dec_and_test(&exp->use))
+ call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_put);
+
+static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+{
+ struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct nf_conntrack_helper *helper;
+ struct net *net = nf_ct_exp_net(exp);
+ unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+
+ /* two references : one for hash insert, one for the timer */
+ atomic_add(2, &exp->use);
+
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
+
+ hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+ net->ct.expect_count++;
+
+ setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
+ (unsigned long)exp);
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ if (helper) {
+ exp->timeout.expires = jiffies +
+ helper->expect_policy[exp->class].timeout * HZ;
+ }
+ add_timer(&exp->timeout);
+
+ NF_CT_STAT_INC(net, expect_create);
+ return 0;
+}
+
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct nf_conn *master,
+ struct nf_conntrack_expect *new)
+{
+ struct nf_conn_help *master_help = nfct_help(master);
+ struct nf_conntrack_expect *exp, *last = NULL;
+
+ hlist_for_each_entry(exp, &master_help->expectations, lnode) {
+ if (exp->class == new->class)
+ last = exp;
+ }
+
+ if (last && del_timer(&last->timeout)) {
+ nf_ct_unlink_expect(last);
+ nf_ct_expect_put(last);
+ }
+}
+
+static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
+{
+ const struct nf_conntrack_expect_policy *p;
+ struct nf_conntrack_expect *i;
+ struct nf_conn *master = expect->master;
+ struct nf_conn_help *master_help = nfct_help(master);
+ struct nf_conntrack_helper *helper;
+ struct net *net = nf_ct_exp_net(expect);
+ struct hlist_node *next;
+ unsigned int h;
+ int ret = 1;
+
+ if (!master_help) {
+ ret = -ESHUTDOWN;
+ goto out;
+ }
+ h = nf_ct_expect_dst_hash(&expect->tuple);
+ hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
+ if (expect_matches(i, expect)) {
+ if (del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ nf_ct_expect_put(i);
+ break;
+ }
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ /* Will be over limit? */
+ helper = rcu_dereference_protected(master_help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock));
+ if (helper) {
+ p = &helper->expect_policy[expect->class];
+ if (p->max_expected &&
+ master_help->expecting[expect->class] >= p->max_expected) {
+ evict_oldest_expect(master, expect);
+ if (master_help->expecting[expect->class]
+ >= p->max_expected) {
+ ret = -EMFILE;
+ goto out;
+ }
+ }
+ }
+
+ if (net->ct.expect_count >= nf_ct_expect_max) {
+ net_warn_ratelimited("nf_conntrack: expectation table full\n");
+ ret = -EMFILE;
+ }
+out:
+ return ret;
+}
+
+int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+ u32 portid, int report)
+{
+ int ret;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ ret = __nf_ct_expect_check(expect);
+ if (ret <= 0)
+ goto out;
+
+ ret = nf_ct_expect_insert(expect);
+ if (ret < 0)
+ goto out;
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
+ return ret;
+out:
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+struct ct_expect_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+ struct hlist_node *n;
+
+ for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ if (n)
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+ struct hlist_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_expect_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_next_rcu(head));
+ while (head == NULL) {
+ if (++st->bucket >= nf_ct_expect_hsize)
+ return NULL;
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head = ct_expect_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_expect_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return ct_expect_get_idx(seq, *pos);
+}
+
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_expect_get_next(seq, v);
+}
+
+static void exp_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_expect *expect;
+ struct nf_conntrack_helper *helper;
+ struct hlist_node *n = v;
+ char *delim = "";
+
+ expect = hlist_entry(n, struct nf_conntrack_expect, hnode);
+
+ if (expect->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+ ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+ seq_printf(s, "l3proto = %u proto=%u ",
+ expect->tuple.src.l3num,
+ expect->tuple.dst.protonum);
+ print_tuple(s, &expect->tuple,
+ __nf_ct_l3proto_find(expect->tuple.src.l3num),
+ __nf_ct_l4proto_find(expect->tuple.src.l3num,
+ expect->tuple.dst.protonum));
+
+ if (expect->flags & NF_CT_EXPECT_PERMANENT) {
+ seq_printf(s, "PERMANENT");
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_INACTIVE) {
+ seq_printf(s, "%sINACTIVE", delim);
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_USERSPACE)
+ seq_printf(s, "%sUSERSPACE", delim);
+
+ helper = rcu_dereference(nfct_help(expect->master)->helper);
+ if (helper) {
+ seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
+ if (helper->expect_policy[expect->class].name)
+ seq_printf(s, "/%s",
+ helper->expect_policy[expect->class].name);
+ }
+
+ seq_putc(s, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &exp_seq_ops,
+ sizeof(struct ct_expect_iter_state));
+}
+
+static const struct file_operations exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+
+static int exp_proc_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+ struct proc_dir_entry *proc;
+
+ proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
+ &exp_file_ops);
+ if (!proc)
+ return -ENOMEM;
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+ return 0;
+}
+
+static void exp_proc_remove(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+ remove_proc_entry("nf_conntrack_expect", net->proc_net);
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+}
+
+module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
+
+int nf_conntrack_expect_pernet_init(struct net *net)
+{
+ int err = -ENOMEM;
+
+ net->ct.expect_count = 0;
+ net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+ if (net->ct.expect_hash == NULL)
+ goto err1;
+
+ err = exp_proc_init(net);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+err2:
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+err1:
+ return err;
+}
+
+void nf_conntrack_expect_pernet_fini(struct net *net)
+{
+ exp_proc_remove(net);
+ nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+}
+
+int nf_conntrack_expect_init(void)
+{
+ if (!nf_ct_expect_hsize) {
+ nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
+ if (!nf_ct_expect_hsize)
+ nf_ct_expect_hsize = 1;
+ }
+ nf_ct_expect_max = nf_ct_expect_hsize * 4;
+ nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+ sizeof(struct nf_conntrack_expect),
+ 0, 0, NULL);
+ if (!nf_ct_expect_cachep)
+ return -ENOMEM;
+ return 0;
+}
+
+void nf_conntrack_expect_fini(void)
+{
+ rcu_barrier(); /* Wait for call_rcu() before destroy */
+ kmem_cache_destroy(nf_ct_expect_cachep);
+}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
new file mode 100644
index 000000000..1a9545965
--- /dev/null
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -0,0 +1,191 @@
+/* Structure dynamic extension infrastructure
+ * Copyright (C) 2004 Rusty Russell IBM Corporation
+ * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
+static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+
+void __nf_ct_ext_destroy(struct nf_conn *ct)
+{
+ unsigned int i;
+ struct nf_ct_ext_type *t;
+ struct nf_ct_ext *ext = ct->ext;
+
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(ext, i))
+ continue;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+
+ /* Here the nf_ct_ext_type might have been unregisterd.
+ * I.e., it has responsible to cleanup private
+ * area in all conntracks when it is unregisterd.
+ */
+ if (t && t->destroy)
+ t->destroy(ct);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(__nf_ct_ext_destroy);
+
+static void *
+nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp)
+{
+ unsigned int off, len;
+ struct nf_ct_ext_type *t;
+ size_t alloc_size;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL);
+ off = ALIGN(sizeof(struct nf_ct_ext), t->align);
+ len = off + t->len + var_alloc_len;
+ alloc_size = t->alloc_size + var_alloc_len;
+ rcu_read_unlock();
+
+ *ext = kzalloc(alloc_size, gfp);
+ if (!*ext)
+ return NULL;
+
+ (*ext)->offset[id] = off;
+ (*ext)->len = len;
+
+ return (void *)(*ext) + off;
+}
+
+void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id,
+ size_t var_alloc_len, gfp_t gfp)
+{
+ struct nf_ct_ext *old, *new;
+ int i, newlen, newoff;
+ struct nf_ct_ext_type *t;
+
+ /* Conntrack must not be confirmed to avoid races on reallocation. */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+ old = ct->ext;
+ if (!old)
+ return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp);
+
+ if (__nf_ct_ext_exist(old, id))
+ return NULL;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL);
+
+ newoff = ALIGN(old->len, t->align);
+ newlen = newoff + t->len + var_alloc_len;
+ rcu_read_unlock();
+
+ new = __krealloc(old, newlen, gfp);
+ if (!new)
+ return NULL;
+
+ if (new != old) {
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(old, i))
+ continue;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+ if (t && t->move)
+ t->move((void *)new + new->offset[i],
+ (void *)old + old->offset[i]);
+ rcu_read_unlock();
+ }
+ kfree_rcu(old, rcu);
+ ct->ext = new;
+ }
+
+ new->offset[id] = newoff;
+ new->len = newlen;
+ memset((void *)new + newoff, 0, newlen - newoff);
+ return (void *)new + newoff;
+}
+EXPORT_SYMBOL(__nf_ct_ext_add_length);
+
+static void update_alloc_size(struct nf_ct_ext_type *type)
+{
+ int i, j;
+ struct nf_ct_ext_type *t1, *t2;
+ enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1;
+
+ /* unnecessary to update all types */
+ if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) {
+ min = type->id;
+ max = type->id;
+ }
+
+ /* This assumes that extended areas in conntrack for the types
+ whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
+ for (i = min; i <= max; i++) {
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (!t1)
+ continue;
+
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+ t1->len;
+ for (j = 0; j < NF_CT_EXT_NUM; j++) {
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (t2 == NULL || t2 == t1 ||
+ (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
+ continue;
+
+ t1->alloc_size = ALIGN(t1->alloc_size, t2->align)
+ + t2->len;
+ }
+ }
+}
+
+/* This MUST be called in process context. */
+int nf_ct_extend_register(struct nf_ct_ext_type *type)
+{
+ int ret = 0;
+
+ mutex_lock(&nf_ct_ext_type_mutex);
+ if (nf_ct_ext_types[type->id]) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* This ensures that nf_ct_ext_create() can allocate enough area
+ before updating alloc_size */
+ type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align)
+ + type->len;
+ rcu_assign_pointer(nf_ct_ext_types[type->id], type);
+ update_alloc_size(type);
+out:
+ mutex_unlock(&nf_ct_ext_type_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+
+/* This MUST be called in process context. */
+void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
+{
+ mutex_lock(&nf_ct_ext_type_mutex);
+ RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
+ update_alloc_size(type);
+ mutex_unlock(&nf_ct_ext_type_mutex);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
new file mode 100644
index 000000000..b666959f1
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -0,0 +1,646 @@
+/* FTP extension for connection tracking. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/ipv6.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp connection tracking helper");
+MODULE_ALIAS("ip_conntrack_ftp");
+MODULE_ALIAS_NFCT_HELPER("ftp");
+
+/* This is slow, but it's simple. --RR */
+static char *ftp_buffer;
+
+static DEFINE_SPINLOCK(nf_ftp_lock);
+
+#define MAX_PORTS 8
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+static bool loose;
+module_param(loose, bool, 0600);
+
+unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ enum nf_ct_ftp_type type,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
+
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
+ char, unsigned int *);
+
+static struct ftp_search {
+ const char *pattern;
+ size_t plen;
+ char skip;
+ char term;
+ enum nf_ct_ftp_type ftptype;
+ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);
+} search[IP_CT_DIR_MAX][2] = {
+ [IP_CT_DIR_ORIGINAL] = {
+ {
+ .pattern = "PORT",
+ .plen = sizeof("PORT") - 1,
+ .skip = ' ',
+ .term = '\r',
+ .ftptype = NF_CT_FTP_PORT,
+ .getnum = try_rfc959,
+ },
+ {
+ .pattern = "EPRT",
+ .plen = sizeof("EPRT") - 1,
+ .skip = ' ',
+ .term = '\r',
+ .ftptype = NF_CT_FTP_EPRT,
+ .getnum = try_eprt,
+ },
+ },
+ [IP_CT_DIR_REPLY] = {
+ {
+ .pattern = "227 ",
+ .plen = sizeof("227 ") - 1,
+ .ftptype = NF_CT_FTP_PASV,
+ .getnum = try_rfc1123,
+ },
+ {
+ .pattern = "229 ",
+ .plen = sizeof("229 ") - 1,
+ .skip = '(',
+ .term = ')',
+ .ftptype = NF_CT_FTP_EPSV,
+ .getnum = try_epsv_response,
+ },
+ },
+};
+
+static int
+get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
+{
+ const char *end;
+ int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end);
+ if (ret > 0)
+ return (int)(end - src);
+ return 0;
+}
+
+static int try_number(const char *data, size_t dlen, u_int32_t array[],
+ int array_size, char sep, char term)
+{
+ u_int32_t i, len;
+
+ memset(array, 0, sizeof(array[0])*array_size);
+
+ /* Keep data pointing at next char. */
+ for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
+ if (*data >= '0' && *data <= '9') {
+ array[i] = array[i]*10 + *data - '0';
+ }
+ else if (*data == sep)
+ i++;
+ else {
+ /* Unexpected character; true if it's the
+ terminator (or we don't care about one)
+ and we're finished. */
+ if ((*data == term || !term) && i == array_size - 1)
+ return len;
+
+ pr_debug("Char %u (got %u nums) `%u' unexpected\n",
+ len, i, *data);
+ return 0;
+ }
+ }
+ pr_debug("Failed to fill %u numbers separated by %c\n",
+ array_size, sep);
+ return 0;
+}
+
+/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
+static int try_rfc959(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
+{
+ int length;
+ u_int32_t array[6];
+
+ length = try_number(data, dlen, array, 6, ',', term);
+ if (length == 0)
+ return 0;
+
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
+ (array[2] << 8) | array[3]);
+ cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
+ return length;
+}
+
+/*
+ * From RFC 1123:
+ * The format of the 227 reply to a PASV command is not
+ * well standardized. In particular, an FTP client cannot
+ * assume that the parentheses shown on page 40 of RFC-959
+ * will be present (and in fact, Figure 3 on page 43 omits
+ * them). Therefore, a User-FTP program that interprets
+ * the PASV reply must scan the reply for the first digit
+ * of the host and port numbers.
+ */
+static int try_rfc1123(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
+{
+ int i;
+ for (i = 0; i < dlen; i++)
+ if (isdigit(data[i]))
+ break;
+
+ if (i == dlen)
+ return 0;
+
+ *offset += i;
+
+ return try_rfc959(data + i, dlen - i, cmd, 0, offset);
+}
+
+/* Grab port: number up to delimiter */
+static int get_port(const char *data, int start, size_t dlen, char delim,
+ __be16 *port)
+{
+ u_int16_t tmp_port = 0;
+ int i;
+
+ for (i = start; i < dlen; i++) {
+ /* Finished? */
+ if (data[i] == delim) {
+ if (tmp_port == 0)
+ break;
+ *port = htons(tmp_port);
+ pr_debug("get_port: return %d\n", tmp_port);
+ return i + 1;
+ }
+ else if (data[i] >= '0' && data[i] <= '9')
+ tmp_port = tmp_port*10 + data[i] - '0';
+ else { /* Some other crap */
+ pr_debug("get_port: invalid char.\n");
+ break;
+ }
+ }
+ return 0;
+}
+
+/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
+static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
+ char term, unsigned int *offset)
+{
+ char delim;
+ int length;
+
+ /* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
+ then delimiter again. */
+ if (dlen <= 3) {
+ pr_debug("EPRT: too short\n");
+ return 0;
+ }
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
+ pr_debug("try_eprt: invalid delimitter.\n");
+ return 0;
+ }
+
+ if ((cmd->l3num == PF_INET && data[1] != '1') ||
+ (cmd->l3num == PF_INET6 && data[1] != '2')) {
+ pr_debug("EPRT: invalid protocol number.\n");
+ return 0;
+ }
+
+ pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim);
+
+ if (data[1] == '1') {
+ u_int32_t array[4];
+
+ /* Now we have IP address. */
+ length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
+ if (length != 0)
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16)
+ | (array[2] << 8) | array[3]);
+ } else {
+ /* Now we have IPv6 address. */
+ length = get_ipv6_addr(data + 3, dlen - 3,
+ (struct in6_addr *)cmd->u3.ip6, delim);
+ }
+
+ if (length == 0)
+ return 0;
+ pr_debug("EPRT: Got IP address!\n");
+ /* Start offset includes initial "|1|", and trailing delimiter */
+ return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
+}
+
+/* Returns 0, or length of numbers: |||6446| */
+static int try_epsv_response(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term,
+ unsigned int *offset)
+{
+ char delim;
+
+ /* Three delimiters. */
+ if (dlen <= 3) return 0;
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126 ||
+ data[1] != delim || data[2] != delim)
+ return 0;
+
+ return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
+}
+
+/* Return 1 for match, 0 for accept, -1 for partial. */
+static int find_pattern(const char *data, size_t dlen,
+ const char *pattern, size_t plen,
+ char skip, char term,
+ unsigned int *numoff,
+ unsigned int *numlen,
+ struct nf_conntrack_man *cmd,
+ int (*getnum)(const char *, size_t,
+ struct nf_conntrack_man *, char,
+ unsigned int *))
+{
+ size_t i = plen;
+
+ pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+ if (dlen == 0)
+ return 0;
+
+ if (dlen <= plen) {
+ /* Short packet: try for partial? */
+ if (strncasecmp(data, pattern, dlen) == 0)
+ return -1;
+ else return 0;
+ }
+
+ if (strncasecmp(data, pattern, plen) != 0) {
+#if 0
+ size_t i;
+
+ pr_debug("ftp: string mismatch\n");
+ for (i = 0; i < plen; i++) {
+ pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+ i, data[i], data[i],
+ pattern[i], pattern[i]);
+ }
+#endif
+ return 0;
+ }
+
+ pr_debug("Pattern matches!\n");
+ /* Now we've found the constant string, try to skip
+ to the 'skip' character */
+ if (skip) {
+ for (i = plen; data[i] != skip; i++)
+ if (i == dlen - 1) return -1;
+
+ /* Skip over the last character */
+ i++;
+ }
+
+ pr_debug("Skipped up to `%c'!\n", skip);
+
+ *numoff = i;
+ *numlen = getnum(data + i, dlen - i, cmd, term, numoff);
+ if (!*numlen)
+ return -1;
+
+ pr_debug("Match succeeded!\n");
+ return 1;
+}
+
+/* Look up to see if we're just after a \n. */
+static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
+ if (info->seq_aft_nl[dir][i] == seq)
+ return 1;
+ return 0;
+}
+
+/* We don't update if it's older than what we have. */
+static void update_nl_seq(struct nf_conn *ct, u32 nl_seq,
+ struct nf_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
+{
+ unsigned int i, oldest;
+
+ /* Look for oldest: if we find exact match, we're done. */
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
+ if (info->seq_aft_nl[dir][i] == nl_seq)
+ return;
+ }
+
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
+ info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
+ } else {
+ if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1]))
+ oldest = 0;
+ else
+ oldest = 1;
+
+ if (after(nl_seq, info->seq_aft_nl[dir][oldest]))
+ info->seq_aft_nl[dir][oldest] = nl_seq;
+ }
+}
+
+static int help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ const char *fb_ptr;
+ int ret;
+ u32 seq;
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+ struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
+ struct nf_conntrack_expect *exp;
+ union nf_inet_addr *daddr;
+ struct nf_conntrack_man cmd = {};
+ unsigned int i;
+ int found = 0, ends_in_nl;
+ typeof(nf_nat_ftp_hook) nf_nat_ftp;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY) {
+ pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ dataoff = protoff + th->doff * 4;
+ /* No data? */
+ if (dataoff >= skb->len) {
+ pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
+ skb->len);
+ return NF_ACCEPT;
+ }
+ datalen = skb->len - dataoff;
+
+ spin_lock_bh(&nf_ftp_lock);
+ fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
+ BUG_ON(fb_ptr == NULL);
+
+ ends_in_nl = (fb_ptr[datalen - 1] == '\n');
+ seq = ntohl(th->seq) + datalen;
+
+ /* Look up to see if we're just after a \n. */
+ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+ /* We're picking up this, clear flags and let it continue */
+ if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) {
+ ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP;
+ goto skip_nl_seq;
+ }
+
+ /* Now if this ends in \n, update ftp info. */
+ pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
+ ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][0],
+ ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][1]);
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+skip_nl_seq:
+ /* Initialize IP/IPv6 addr to expected address (it's not mentioned
+ in EPSV responses) */
+ cmd.l3num = nf_ct_l3num(ct);
+ memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+ sizeof(cmd.u3.all));
+
+ for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
+ found = find_pattern(fb_ptr, datalen,
+ search[dir][i].pattern,
+ search[dir][i].plen,
+ search[dir][i].skip,
+ search[dir][i].term,
+ &matchoff, &matchlen,
+ &cmd,
+ search[dir][i].getnum);
+ if (found) break;
+ }
+ if (found == -1) {
+ /* We don't usually drop packets. After all, this is
+ connection tracking, not packet filtering.
+ However, it is necessary for accurate tracking in
+ this case. */
+ nf_ct_helper_log(skb, ct, "partial matching of `%s'",
+ search[dir][i].pattern);
+ ret = NF_DROP;
+ goto out;
+ } else if (found == 0) { /* No match */
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+ pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+ matchlen, fb_ptr + matchoff,
+ matchlen, ntohl(th->seq) + matchoff);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+
+ /* We refer to the reverse direction ("!dir") tuples here,
+ * because we're expecting something in the other direction.
+ * Doesn't matter unless NAT is happening. */
+ daddr = &ct->tuplehash[!dir].tuple.dst.u3;
+
+ /* Update the ftp info */
+ if ((cmd.l3num == nf_ct_l3num(ct)) &&
+ memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+ sizeof(cmd.u3.all))) {
+ /* Enrico Scholz's passive FTP to partially RNAT'd ftp
+ server: it really wants us to connect to a
+ different IP address. Simply don't record it for
+ NAT. */
+ if (cmd.l3num == PF_INET) {
+ pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n",
+ &cmd.u3.ip,
+ &ct->tuplehash[dir].tuple.src.u3.ip);
+ } else {
+ pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
+ cmd.u3.ip6,
+ ct->tuplehash[dir].tuple.src.u3.ip6);
+ }
+
+ /* Thanks to Cristiano Lincoln Mattos
+ <lincoln@cesar.org.br> for reporting this potential
+ problem (DMZ machines opening holes to internal
+ networks, or the packet filter itself). */
+ if (!loose) {
+ ret = NF_ACCEPT;
+ goto out_put_expect;
+ }
+ daddr = &cmd.u3;
+ }
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
+ &ct->tuplehash[!dir].tuple.src.u3, daddr,
+ IPPROTO_TCP, NULL, &cmd.u.tcp.port);
+
+ /* Now, NAT might want to mangle the packet, and register the
+ * (possibly changed) expectation itself. */
+ nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
+ if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
+ protoff, matchoff, matchlen, exp);
+ else {
+ /* Can't expect this? Best to drop packet now. */
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ } else
+ ret = NF_ACCEPT;
+ }
+
+out_put_expect:
+ nf_ct_expect_put(exp);
+
+out_update_nl:
+ /* Now if this ends in \n, update ftp info. Seq may have been
+ * adjusted by NAT code. */
+ if (ends_in_nl)
+ update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
+ out:
+ spin_unlock_bh(&nf_ftp_lock);
+ return ret;
+}
+
+static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
+
+ /* This conntrack has been injected from user-space, always pick up
+ * sequence tracking. Otherwise, the first FTP command after the
+ * failover breaks.
+ */
+ ftp->flags[IP_CT_DIR_ORIGINAL] |= NF_CT_FTP_SEQ_PICKUP;
+ ftp->flags[IP_CT_DIR_REPLY] |= NF_CT_FTP_SEQ_PICKUP;
+ return 0;
+}
+
+static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
+
+static const struct nf_conntrack_expect_policy ftp_exp_policy = {
+ .max_expected = 1,
+ .timeout = 5 * 60,
+};
+
+/* don't make this __exit, since it's called from __init ! */
+static void nf_conntrack_ftp_fini(void)
+{
+ int i, j;
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++) {
+ if (ftp[i][j].me == NULL)
+ continue;
+
+ pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_helper_unregister(&ftp[i][j]);
+ }
+ }
+
+ kfree(ftp_buffer);
+}
+
+static int __init nf_conntrack_ftp_init(void)
+{
+ int i, j = -1, ret = 0;
+
+ ftp_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!ftp_buffer)
+ return -ENOMEM;
+
+ if (ports_c == 0)
+ ports[ports_c++] = FTP_PORT;
+
+ /* FIXME should be configurable whether IPv4 and IPv6 FTP connections
+ are tracked or not - YK */
+ for (i = 0; i < ports_c; i++) {
+ ftp[i][0].tuple.src.l3num = PF_INET;
+ ftp[i][1].tuple.src.l3num = PF_INET6;
+ for (j = 0; j < 2; j++) {
+ ftp[i][j].data_len = sizeof(struct nf_ct_ftp_master);
+ ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+ ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
+ ftp[i][j].expect_policy = &ftp_exp_policy;
+ ftp[i][j].me = THIS_MODULE;
+ ftp[i][j].help = help;
+ ftp[i][j].from_nlattr = nf_ct_ftp_from_nlattr;
+ if (ports[i] == FTP_PORT)
+ sprintf(ftp[i][j].name, "ftp");
+ else
+ sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
+
+ pr_debug("nf_ct_ftp: registering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ ret = nf_conntrack_helper_register(&ftp[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_ftp: failed to register"
+ " helper for pf: %d port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_ftp_fini();
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+module_init(nf_conntrack_ftp_init);
+module_exit(nf_conntrack_ftp_fini);
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
new file mode 100644
index 000000000..bcd5ed6b7
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -0,0 +1,888 @@
+/****************************************************************************
+ * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
+ * conntrack/NAT module.
+ *
+ * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * See ip_conntrack_helper_h323_asn1.h for details.
+ *
+ ****************************************************************************/
+
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#else
+#include <stdio.h>
+#endif
+#include <linux/netfilter/nf_conntrack_h323_asn1.h>
+
+/* Trace Flag */
+#ifndef H323_TRACE
+#define H323_TRACE 0
+#endif
+
+#if H323_TRACE
+#define TAB_SIZE 4
+#define IFTHEN(cond, act) if(cond){act;}
+#ifdef __KERNEL__
+#define PRINT printk
+#else
+#define PRINT printf
+#endif
+#define FNAME(name) name,
+#else
+#define IFTHEN(cond, act)
+#define PRINT(fmt, args...)
+#define FNAME(name)
+#endif
+
+/* ASN.1 Types */
+#define NUL 0
+#define BOOL 1
+#define OID 2
+#define INT 3
+#define ENUM 4
+#define BITSTR 5
+#define NUMSTR 6
+#define NUMDGT 6
+#define TBCDSTR 6
+#define OCTSTR 7
+#define PRTSTR 7
+#define IA5STR 7
+#define GENSTR 7
+#define BMPSTR 8
+#define SEQ 9
+#define SET 9
+#define SEQOF 10
+#define SETOF 10
+#define CHOICE 11
+
+/* Constraint Types */
+#define FIXD 0
+/* #define BITS 1-8 */
+#define BYTE 9
+#define WORD 10
+#define CONS 11
+#define SEMI 12
+#define UNCO 13
+
+/* ASN.1 Type Attributes */
+#define SKIP 0
+#define STOP 1
+#define DECODE 2
+#define EXT 4
+#define OPEN 8
+#define OPT 16
+
+
+/* ASN.1 Field Structure */
+typedef struct field_t {
+#if H323_TRACE
+ char *name;
+#endif
+ unsigned char type;
+ unsigned char sz;
+ unsigned char lb;
+ unsigned char ub;
+ unsigned short attr;
+ unsigned short offset;
+ const struct field_t *fields;
+} field_t;
+
+/* Bit Stream */
+typedef struct {
+ unsigned char *buf;
+ unsigned char *beg;
+ unsigned char *end;
+ unsigned char *cur;
+ unsigned int bit;
+} bitstr_t;
+
+/* Tool Functions */
+#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;}
+#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;}
+#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;}
+#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND)
+static unsigned int get_len(bitstr_t *bs);
+static unsigned int get_bit(bitstr_t *bs);
+static unsigned int get_bits(bitstr_t *bs, unsigned int b);
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b);
+static unsigned int get_uint(bitstr_t *bs, int b);
+
+/* Decoder Functions */
+static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level);
+
+/* Decoder Functions Vector */
+typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int);
+static const decoder_t Decoders[] = {
+ decode_nul,
+ decode_bool,
+ decode_oid,
+ decode_int,
+ decode_enum,
+ decode_bitstr,
+ decode_numstr,
+ decode_octstr,
+ decode_bmpstr,
+ decode_seq,
+ decode_seqof,
+ decode_choice,
+};
+
+/****************************************************************************
+ * H.323 Types
+ ****************************************************************************/
+#include "nf_conntrack_h323_types.c"
+
+/****************************************************************************
+ * Functions
+ ****************************************************************************/
+/* Assume bs is aligned && v < 16384 */
+static unsigned int get_len(bitstr_t *bs)
+{
+ unsigned int v;
+
+ v = *bs->cur++;
+
+ if (v & 0x80) {
+ v &= 0x3f;
+ v <<= 8;
+ v += *bs->cur++;
+ }
+
+ return v;
+}
+
+/****************************************************************************/
+static unsigned int get_bit(bitstr_t *bs)
+{
+ unsigned int b = (*bs->cur) & (0x80 >> bs->bit);
+
+ INC_BIT(bs);
+
+ return b;
+}
+
+/****************************************************************************/
+/* Assume b <= 8 */
+static unsigned int get_bits(bitstr_t *bs, unsigned int b)
+{
+ unsigned int v, l;
+
+ v = (*bs->cur) & (0xffU >> bs->bit);
+ l = b + bs->bit;
+
+ if (l < 8) {
+ v >>= 8 - l;
+ bs->bit = l;
+ } else if (l == 8) {
+ bs->cur++;
+ bs->bit = 0;
+ } else { /* l > 8 */
+
+ v <<= 8;
+ v += *(++bs->cur);
+ v >>= 16 - l;
+ bs->bit = l - 8;
+ }
+
+ return v;
+}
+
+/****************************************************************************/
+/* Assume b <= 32 */
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b)
+{
+ unsigned int v, l, shift, bytes;
+
+ if (!b)
+ return 0;
+
+ l = bs->bit + b;
+
+ if (l < 8) {
+ v = (unsigned int)(*bs->cur) << (bs->bit + 24);
+ bs->bit = l;
+ } else if (l == 8) {
+ v = (unsigned int)(*bs->cur++) << (bs->bit + 24);
+ bs->bit = 0;
+ } else {
+ for (bytes = l >> 3, shift = 24, v = 0; bytes;
+ bytes--, shift -= 8)
+ v |= (unsigned int)(*bs->cur++) << shift;
+
+ if (l < 32) {
+ v |= (unsigned int)(*bs->cur) << shift;
+ v <<= bs->bit;
+ } else if (l > 32) {
+ v <<= bs->bit;
+ v |= (*bs->cur) >> (8 - bs->bit);
+ }
+
+ bs->bit = l & 0x7;
+ }
+
+ v &= 0xffffffff << (32 - b);
+
+ return v;
+}
+
+/****************************************************************************
+ * Assume bs is aligned and sizeof(unsigned int) == 4
+ ****************************************************************************/
+static unsigned int get_uint(bitstr_t *bs, int b)
+{
+ unsigned int v = 0;
+
+ switch (b) {
+ case 4:
+ v |= *bs->cur++;
+ v <<= 8;
+ case 3:
+ v |= *bs->cur++;
+ v <<= 8;
+ case 2:
+ v |= *bs->cur++;
+ v <<= 8;
+ case 1:
+ v |= *bs->cur++;
+ break;
+ }
+ return v;
+}
+
+/****************************************************************************/
+static int decode_nul(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_bool(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ INC_BIT(bs);
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_oid(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 1);
+ len = *bs->cur++;
+ bs->cur += len;
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_int(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+ switch (f->sz) {
+ case BYTE: /* Range == 256 */
+ BYTE_ALIGN(bs);
+ bs->cur++;
+ break;
+ case WORD: /* 257 <= Range <= 64K */
+ BYTE_ALIGN(bs);
+ bs->cur += 2;
+ break;
+ case CONS: /* 64K < Range < 4G */
+ len = get_bits(bs, 2) + 1;
+ BYTE_ALIGN(bs);
+ if (base && (f->attr & DECODE)) { /* timeToLive */
+ unsigned int v = get_uint(bs, len) + f->lb;
+ PRINT(" = %u", v);
+ *((unsigned int *)(base + f->offset)) = v;
+ }
+ bs->cur += len;
+ break;
+ case UNCO:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ bs->cur += len;
+ break;
+ default: /* 2 <= Range <= 255 */
+ INC_BITS(bs, f->sz);
+ break;
+ }
+
+ PRINT("\n");
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_enum(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ if ((f->attr & EXT) && get_bit(bs)) {
+ INC_BITS(bs, 7);
+ } else {
+ INC_BITS(bs, f->sz);
+ }
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ BYTE_ALIGN(bs);
+ switch (f->sz) {
+ case FIXD: /* fixed length > 16 */
+ len = f->lb;
+ break;
+ case WORD: /* 2-byte length */
+ CHECK_BOUND(bs, 2);
+ len = (*bs->cur++) << 8;
+ len += (*bs->cur++) + f->lb;
+ break;
+ case SEMI:
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ break;
+ default:
+ len = 0;
+ break;
+ }
+
+ bs->cur += len >> 3;
+ bs->bit = len & 7;
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_numstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ /* 2 <= Range <= 255 */
+ len = get_bits(bs, f->sz) + f->lb;
+
+ BYTE_ALIGN(bs);
+ INC_BITS(bs, (len << 2));
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_octstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+ switch (f->sz) {
+ case FIXD: /* Range == 1 */
+ if (f->lb > 2) {
+ BYTE_ALIGN(bs);
+ if (base && (f->attr & DECODE)) {
+ /* The IP Address */
+ IFTHEN(f->lb == 4,
+ PRINT(" = %d.%d.%d.%d:%d",
+ bs->cur[0], bs->cur[1],
+ bs->cur[2], bs->cur[3],
+ bs->cur[4] * 256 + bs->cur[5]));
+ *((unsigned int *)(base + f->offset)) =
+ bs->cur - bs->buf;
+ }
+ }
+ len = f->lb;
+ break;
+ case BYTE: /* Range == 256 */
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 1);
+ len = (*bs->cur++) + f->lb;
+ break;
+ case SEMI:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs) + f->lb;
+ break;
+ default: /* 2 <= Range <= 255 */
+ len = get_bits(bs, f->sz) + f->lb;
+ BYTE_ALIGN(bs);
+ break;
+ }
+
+ bs->cur += len;
+
+ PRINT("\n");
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int len;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ switch (f->sz) {
+ case BYTE: /* Range == 256 */
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 1);
+ len = (*bs->cur++) + f->lb;
+ break;
+ default: /* 2 <= Range <= 255 */
+ len = get_bits(bs, f->sz) + f->lb;
+ BYTE_ALIGN(bs);
+ break;
+ }
+
+ bs->cur += len << 1;
+
+ CHECK_BOUND(bs, 0);
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_seq(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len;
+ int err;
+ const struct field_t *son;
+ unsigned char *beg = NULL;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ /* Decode? */
+ base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+ /* Extensible? */
+ ext = (f->attr & EXT) ? get_bit(bs) : 0;
+
+ /* Get fields bitmap */
+ bmp = get_bitmap(bs, f->sz);
+ if (base)
+ *(unsigned int *)base = bmp;
+
+ /* Decode the root components */
+ for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) {
+ if (son->attr & STOP) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+ son->name);
+ return H323_ERROR_STOP;
+ }
+
+ if (son->attr & OPT) { /* Optional component */
+ if (!((0x80000000U >> (opt++)) & bmp)) /* Not exist */
+ continue;
+ }
+
+ /* Decode */
+ if (son->attr & OPEN) { /* Open field */
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ if (!base || !(son->attr & DECODE)) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+ " ", son->name);
+ bs->cur += len;
+ continue;
+ }
+ beg = bs->cur;
+
+ /* Decode */
+ if ((err = (Decoders[son->type]) (bs, son, base,
+ level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ bs->cur = beg + len;
+ bs->bit = 0;
+ } else if ((err = (Decoders[son->type]) (bs, son, base,
+ level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+ }
+
+ /* No extension? */
+ if (!ext)
+ return H323_ERROR_NONE;
+
+ /* Get the extension bitmap */
+ bmp2_len = get_bits(bs, 7) + 1;
+ CHECK_BOUND(bs, (bmp2_len + 7) >> 3);
+ bmp2 = get_bitmap(bs, bmp2_len);
+ bmp |= bmp2 >> f->sz;
+ if (base)
+ *(unsigned int *)base = bmp;
+ BYTE_ALIGN(bs);
+
+ /* Decode the extension components */
+ for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
+ /* Check Range */
+ if (i >= f->ub) { /* Newer Version? */
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ bs->cur += len;
+ continue;
+ }
+
+ if (son->attr & STOP) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+ son->name);
+ return H323_ERROR_STOP;
+ }
+
+ if (!((0x80000000 >> opt) & bmp2)) /* Not present */
+ continue;
+
+ CHECK_BOUND(bs, 2);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ if (!base || !(son->attr & DECODE)) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+ son->name);
+ bs->cur += len;
+ continue;
+ }
+ beg = bs->cur;
+
+ if ((err = (Decoders[son->type]) (bs, son, base,
+ level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ bs->cur = beg + len;
+ bs->bit = 0;
+ }
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+static int decode_seqof(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int count, effective_count = 0, i, len = 0;
+ int err;
+ const struct field_t *son;
+ unsigned char *beg = NULL;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ /* Decode? */
+ base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+ /* Decode item count */
+ switch (f->sz) {
+ case BYTE:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 1);
+ count = *bs->cur++;
+ break;
+ case WORD:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 2);
+ count = *bs->cur++;
+ count <<= 8;
+ count += *bs->cur++;
+ break;
+ case SEMI:
+ BYTE_ALIGN(bs);
+ CHECK_BOUND(bs, 2);
+ count = get_len(bs);
+ break;
+ default:
+ count = get_bits(bs, f->sz);
+ break;
+ }
+ count += f->lb;
+
+ /* Write Count */
+ if (base) {
+ effective_count = count > f->ub ? f->ub : count;
+ *(unsigned int *)base = effective_count;
+ base += sizeof(unsigned int);
+ }
+
+ /* Decode nested field */
+ son = f->fields;
+ if (base)
+ base -= son->offset;
+ for (i = 0; i < count; i++) {
+ if (son->attr & OPEN) {
+ BYTE_ALIGN(bs);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ if (!base || !(son->attr & DECODE)) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+ " ", son->name);
+ bs->cur += len;
+ continue;
+ }
+ beg = bs->cur;
+
+ if ((err = (Decoders[son->type]) (bs, son,
+ i <
+ effective_count ?
+ base : NULL,
+ level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ bs->cur = beg + len;
+ bs->bit = 0;
+ } else
+ if ((err = (Decoders[son->type]) (bs, son,
+ i <
+ effective_count ?
+ base : NULL,
+ level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ if (base)
+ base += son->offset;
+ }
+
+ return H323_ERROR_NONE;
+}
+
+
+/****************************************************************************/
+static int decode_choice(bitstr_t *bs, const struct field_t *f,
+ char *base, int level)
+{
+ unsigned int type, ext, len = 0;
+ int err;
+ const struct field_t *son;
+ unsigned char *beg = NULL;
+
+ PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+ /* Decode? */
+ base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+ /* Decode the choice index number */
+ if ((f->attr & EXT) && get_bit(bs)) {
+ ext = 1;
+ type = get_bits(bs, 7) + f->lb;
+ } else {
+ ext = 0;
+ type = get_bits(bs, f->sz);
+ if (type >= f->lb)
+ return H323_ERROR_RANGE;
+ }
+
+ /* Write Type */
+ if (base)
+ *(unsigned int *)base = type;
+
+ /* Check Range */
+ if (type >= f->ub) { /* Newer version? */
+ BYTE_ALIGN(bs);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ bs->cur += len;
+ return H323_ERROR_NONE;
+ }
+
+ /* Transfer to son level */
+ son = &f->fields[type];
+ if (son->attr & STOP) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name);
+ return H323_ERROR_STOP;
+ }
+
+ if (ext || (son->attr & OPEN)) {
+ BYTE_ALIGN(bs);
+ len = get_len(bs);
+ CHECK_BOUND(bs, len);
+ if (!base || !(son->attr & DECODE)) {
+ PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+ son->name);
+ bs->cur += len;
+ return H323_ERROR_NONE;
+ }
+ beg = bs->cur;
+
+ if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ bs->cur = beg + len;
+ bs->bit = 0;
+ } else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+ H323_ERROR_NONE)
+ return err;
+
+ return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras)
+{
+ static const struct field_t ras_message = {
+ FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT,
+ 0, _RasMessage
+ };
+ bitstr_t bs;
+
+ bs.buf = bs.beg = bs.cur = buf;
+ bs.end = buf + sz;
+ bs.bit = 0;
+
+ return decode_choice(&bs, &ras_message, (char *) ras, 0);
+}
+
+/****************************************************************************/
+static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
+ size_t sz, H323_UserInformation *uuie)
+{
+ static const struct field_t h323_userinformation = {
+ FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT,
+ 0, _H323_UserInformation
+ };
+ bitstr_t bs;
+
+ bs.buf = buf;
+ bs.beg = bs.cur = beg;
+ bs.end = beg + sz;
+ bs.bit = 0;
+
+ return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0);
+}
+
+/****************************************************************************/
+int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
+ MultimediaSystemControlMessage *
+ mscm)
+{
+ static const struct field_t multimediasystemcontrolmessage = {
+ FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4,
+ DECODE | EXT, 0, _MultimediaSystemControlMessage
+ };
+ bitstr_t bs;
+
+ bs.buf = bs.beg = bs.cur = buf;
+ bs.end = buf + sz;
+ bs.bit = 0;
+
+ return decode_choice(&bs, &multimediasystemcontrolmessage,
+ (char *) mscm, 0);
+}
+
+/****************************************************************************/
+int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931)
+{
+ unsigned char *p = buf;
+ int len;
+
+ if (!p || sz < 1)
+ return H323_ERROR_BOUND;
+
+ /* Protocol Discriminator */
+ if (*p != 0x08) {
+ PRINT("Unknown Protocol Discriminator\n");
+ return H323_ERROR_RANGE;
+ }
+ p++;
+ sz--;
+
+ /* CallReferenceValue */
+ if (sz < 1)
+ return H323_ERROR_BOUND;
+ len = *p++;
+ sz--;
+ if (sz < len)
+ return H323_ERROR_BOUND;
+ p += len;
+ sz -= len;
+
+ /* Message Type */
+ if (sz < 1)
+ return H323_ERROR_BOUND;
+ q931->MessageType = *p++;
+ PRINT("MessageType = %02X\n", q931->MessageType);
+ if (*p & 0x80) {
+ p++;
+ sz--;
+ }
+
+ /* Decode Information Elements */
+ while (sz > 0) {
+ if (*p == 0x7e) { /* UserUserIE */
+ if (sz < 3)
+ break;
+ p++;
+ len = *p++ << 8;
+ len |= *p++;
+ sz -= 3;
+ if (sz < len)
+ break;
+ p++;
+ len--;
+ return DecodeH323_UserInformation(buf, p, len,
+ &q931->UUIE);
+ }
+ p++;
+ sz--;
+ if (sz < 1)
+ break;
+ len = *p++;
+ if (sz < len)
+ break;
+ p += len;
+ sz -= len;
+ }
+
+ PRINT("Q.931 UUIE not found\n");
+
+ return H323_ERROR_BOUND;
+}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
new file mode 100644
index 000000000..1d69f5b97
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -0,0 +1,1906 @@
+/*
+ * H.323 connection tracking helper
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 connection tracking module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * For more information, please see http://nath323.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip6_route.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/* Parameters */
+static unsigned int default_rrq_ttl __read_mostly = 300;
+module_param(default_rrq_ttl, uint, 0600);
+MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ");
+
+static int gkrouted_only __read_mostly = 1;
+module_param(gkrouted_only, int, 0600);
+MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
+
+static bool callforward_filter __read_mostly = true;
+module_param(callforward_filter, bool, 0600);
+MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
+ "if both endpoints are on different sides "
+ "(determined by routing information)");
+
+/* Hooks for NAT */
+int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+ __read_mostly;
+int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 port)
+ __read_mostly;
+int (*set_sig_addr_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count) __read_mostly;
+int (*set_ras_addr_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count) __read_mostly;
+int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr,
+ __be16 port, __be16 rtp_port,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp) __read_mostly;
+int (*nat_t120_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_h245_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_callforwarding_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr, __be16 port,
+ struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_q931_hook) (struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, TransportAddress *taddr, int idx,
+ __be16 port, struct nf_conntrack_expect *exp)
+ __read_mostly;
+
+static DEFINE_SPINLOCK(nf_h323_lock);
+static char *h323_buffer;
+
+static struct nf_conntrack_helper nf_conntrack_helper_h245;
+static struct nf_conntrack_helper nf_conntrack_helper_q931[];
+static struct nf_conntrack_helper nf_conntrack_helper_ras[];
+
+/****************************************************************************/
+static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned char **data, int *datalen, int *dataoff)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ int tcpdatalen;
+ int tcpdataoff;
+ unsigned char *tpkt;
+ int tpktlen;
+ int tpktoff;
+
+ /* Get TCP header */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+
+ /* Get TCP data offset */
+ tcpdataoff = protoff + th->doff * 4;
+
+ /* Get TCP data length */
+ tcpdatalen = skb->len - tcpdataoff;
+ if (tcpdatalen <= 0) /* No TCP data */
+ goto clear_out;
+
+ if (*data == NULL) { /* first TPKT */
+ /* Get first TPKT pointer */
+ tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
+ h323_buffer);
+ BUG_ON(tpkt == NULL);
+
+ /* Validate TPKT identifier */
+ if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
+ /* Netmeeting sends TPKT header and data separately */
+ if (info->tpkt_len[dir] > 0) {
+ pr_debug("nf_ct_h323: previous packet "
+ "indicated separate TPKT data of %hu "
+ "bytes\n", info->tpkt_len[dir]);
+ if (info->tpkt_len[dir] <= tcpdatalen) {
+ /* Yes, there was a TPKT header
+ * received */
+ *data = tpkt;
+ *datalen = info->tpkt_len[dir];
+ *dataoff = 0;
+ goto out;
+ }
+
+ /* Fragmented TPKT */
+ pr_debug("nf_ct_h323: fragmented TPKT\n");
+ goto clear_out;
+ }
+
+ /* It is not even a TPKT */
+ return 0;
+ }
+ tpktoff = 0;
+ } else { /* Next TPKT */
+ tpktoff = *dataoff + *datalen;
+ tcpdatalen -= tpktoff;
+ if (tcpdatalen <= 4) /* No more TPKT */
+ goto clear_out;
+ tpkt = *data + *datalen;
+
+ /* Validate TPKT identifier */
+ if (tpkt[0] != 0x03 || tpkt[1] != 0)
+ goto clear_out;
+ }
+
+ /* Validate TPKT length */
+ tpktlen = tpkt[2] * 256 + tpkt[3];
+ if (tpktlen < 4)
+ goto clear_out;
+ if (tpktlen > tcpdatalen) {
+ if (tcpdatalen == 4) { /* Separate TPKT header */
+ /* Netmeeting sends TPKT header and data separately */
+ pr_debug("nf_ct_h323: separate TPKT header indicates "
+ "there will be TPKT data of %hu bytes\n",
+ tpktlen - 4);
+ info->tpkt_len[dir] = tpktlen - 4;
+ return 0;
+ }
+
+ pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n");
+ goto clear_out;
+ }
+
+ /* This is the encapsulated data */
+ *data = tpkt + 4;
+ *datalen = tpktlen - 4;
+ *dataoff = tpktoff + 4;
+
+ out:
+ /* Clear TPKT length */
+ info->tpkt_len[dir] = 0;
+ return 1;
+
+ clear_out:
+ info->tpkt_len[dir] = 0;
+ return 0;
+}
+
+/****************************************************************************/
+static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
+ H245_TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 *port)
+{
+ const unsigned char *p;
+ int len;
+
+ if (taddr->choice != eH245_TransportAddress_unicastAddress)
+ return 0;
+
+ switch (taddr->unicastAddress.choice) {
+ case eUnicastAddress_iPAddress:
+ if (nf_ct_l3num(ct) != AF_INET)
+ return 0;
+ p = data + taddr->unicastAddress.iPAddress.network;
+ len = 4;
+ break;
+ case eUnicastAddress_iP6Address:
+ if (nf_ct_l3num(ct) != AF_INET6)
+ return 0;
+ p = data + taddr->unicastAddress.iP6Address.network;
+ len = 16;
+ break;
+ default:
+ return 0;
+ }
+
+ memcpy(addr, p, len);
+ memset((void *)addr + len, 0, sizeof(*addr) - len);
+ memcpy(port, p + len, sizeof(__be16));
+
+ return 1;
+}
+
+/****************************************************************************/
+static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ __be16 rtp_port, rtcp_port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *rtp_exp;
+ struct nf_conntrack_expect *rtcp_exp;
+ typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
+
+ /* Read RTP or RTCP address */
+ if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) ||
+ port == 0)
+ return 0;
+
+ /* RTP port is even */
+ rtp_port = port & ~htons(1);
+ rtcp_port = port | htons(1);
+
+ /* Create expect for RTP */
+ if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_UDP, NULL, &rtp_port);
+
+ /* Create expect for RTCP */
+ if ((rtcp_exp = nf_ct_expect_alloc(ct)) == NULL) {
+ nf_ct_expect_put(rtp_exp);
+ return -1;
+ }
+ nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_UDP, NULL, &rtcp_port);
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* NAT needed */
+ ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(rtp_exp) == 0) {
+ if (nf_ct_expect_related(rtcp_exp) == 0) {
+ pr_debug("nf_ct_h323: expect RTP ");
+ nf_ct_dump_tuple(&rtp_exp->tuple);
+ pr_debug("nf_ct_h323: expect RTCP ");
+ nf_ct_dump_tuple(&rtcp_exp->tuple);
+ } else {
+ nf_ct_unexpect_related(rtp_exp);
+ ret = -1;
+ }
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(rtp_exp);
+ nf_ct_expect_put(rtcp_exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int expect_t120(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H245_TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ typeof(nat_t120_hook) nat_t120;
+
+ /* Read T.120 address */
+ if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) ||
+ port == 0)
+ return 0;
+
+ /* Create expect for T.120 connections */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_TCP, NULL, &port);
+ exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* NAT needed */
+ ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
+ port, exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_h323: expect T.120 ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_h245_channel(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ H2250LogicalChannelParameters *channel)
+{
+ int ret;
+
+ if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {
+ /* RTP */
+ ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ &channel->mediaChannel);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (channel->
+ options & eH2250LogicalChannelParameters_mediaControlChannel) {
+ /* RTCP */
+ ret = expect_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ &channel->mediaControlChannel);
+ if (ret < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ OpenLogicalChannel *olc)
+{
+ int ret;
+
+ pr_debug("nf_ct_h323: OpenLogicalChannel\n");
+
+ if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
+ eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
+ {
+ ret = process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &olc->
+ forwardLogicalChannelParameters.
+ multiplexParameters.
+ h2250LogicalChannelParameters);
+ if (ret < 0)
+ return -1;
+ }
+
+ if ((olc->options &
+ eOpenLogicalChannel_reverseLogicalChannelParameters) &&
+ (olc->reverseLogicalChannelParameters.options &
+ eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters)
+ && (olc->reverseLogicalChannelParameters.multiplexParameters.
+ choice ==
+ eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
+ {
+ ret =
+ process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &olc->
+ reverseLogicalChannelParameters.
+ multiplexParameters.
+ h2250LogicalChannelParameters);
+ if (ret < 0)
+ return -1;
+ }
+
+ if ((olc->options & eOpenLogicalChannel_separateStack) &&
+ olc->forwardLogicalChannelParameters.dataType.choice ==
+ eDataType_data &&
+ olc->forwardLogicalChannelParameters.dataType.data.application.
+ choice == eDataApplicationCapability_application_t120 &&
+ olc->forwardLogicalChannelParameters.dataType.data.application.
+ t120.choice == eDataProtocolCapability_separateLANStack &&
+ olc->separateStack.networkAddress.choice ==
+ eNetworkAccessParameters_networkAddress_localAreaAddress) {
+ ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
+ &olc->separateStack.networkAddress.
+ localAreaAddress);
+ if (ret < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ OpenLogicalChannelAck *olca)
+{
+ H2250LogicalChannelAckParameters *ack;
+ int ret;
+
+ pr_debug("nf_ct_h323: OpenLogicalChannelAck\n");
+
+ if ((olca->options &
+ eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
+ (olca->reverseLogicalChannelParameters.options &
+ eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters)
+ && (olca->reverseLogicalChannelParameters.multiplexParameters.
+ choice ==
+ eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
+ {
+ ret = process_h245_channel(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &olca->
+ reverseLogicalChannelParameters.
+ multiplexParameters.
+ h2250LogicalChannelParameters);
+ if (ret < 0)
+ return -1;
+ }
+
+ if ((olca->options &
+ eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
+ (olca->forwardMultiplexAckParameters.choice ==
+ eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters))
+ {
+ ack = &olca->forwardMultiplexAckParameters.
+ h2250LogicalChannelAckParameters;
+ if (ack->options &
+ eH2250LogicalChannelAckParameters_mediaChannel) {
+ /* RTP */
+ ret = expect_rtp_rtcp(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &ack->mediaChannel);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (ack->options &
+ eH2250LogicalChannelAckParameters_mediaControlChannel) {
+ /* RTCP */
+ ret = expect_rtp_rtcp(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &ack->mediaControlChannel);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ if ((olca->options & eOpenLogicalChannelAck_separateStack) &&
+ olca->separateStack.networkAddress.choice ==
+ eNetworkAccessParameters_networkAddress_localAreaAddress) {
+ ret = expect_t120(skb, ct, ctinfo, protoff, data, dataoff,
+ &olca->separateStack.networkAddress.
+ localAreaAddress);
+ if (ret < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ MultimediaSystemControlMessage *mscm)
+{
+ switch (mscm->choice) {
+ case eMultimediaSystemControlMessage_request:
+ if (mscm->request.choice ==
+ eRequestMessage_openLogicalChannel) {
+ return process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &mscm->request.openLogicalChannel);
+ }
+ pr_debug("nf_ct_h323: H.245 Request %d\n",
+ mscm->request.choice);
+ break;
+ case eMultimediaSystemControlMessage_response:
+ if (mscm->response.choice ==
+ eResponseMessage_openLogicalChannelAck) {
+ return process_olca(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &mscm->response.
+ openLogicalChannelAck);
+ }
+ pr_debug("nf_ct_h323: H.245 Response %d\n",
+ mscm->response.choice);
+ break;
+ default:
+ pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice);
+ break;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int h245_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ static MultimediaSystemControlMessage mscm;
+ unsigned char *data = NULL;
+ int datalen;
+ int dataoff;
+ int ret;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ pr_debug("nf_ct_h245: skblen = %u\n", skb->len);
+
+ spin_lock_bh(&nf_h323_lock);
+
+ /* Process each TPKT */
+ while (get_tpkt_data(skb, protoff, ct, ctinfo,
+ &data, &datalen, &dataoff)) {
+ pr_debug("nf_ct_h245: TPKT len=%d ", datalen);
+ nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+ /* Decode H.245 signal */
+ ret = DecodeMultimediaSystemControlMessage(data, datalen,
+ &mscm);
+ if (ret < 0) {
+ pr_debug("nf_ct_h245: decoding error: %s\n",
+ ret == H323_ERROR_BOUND ?
+ "out of bound" : "out of range");
+ /* We don't drop when decoding error */
+ break;
+ }
+
+ /* Process H.245 signal */
+ if (process_h245(skb, ct, ctinfo, protoff,
+ &data, dataoff, &mscm) < 0)
+ goto drop;
+ }
+
+ spin_unlock_bh(&nf_h323_lock);
+ return NF_ACCEPT;
+
+ drop:
+ spin_unlock_bh(&nf_h323_lock);
+ nf_ct_helper_log(skb, ct, "cannot process H.245 message");
+ return NF_DROP;
+}
+
+/****************************************************************************/
+static const struct nf_conntrack_expect_policy h245_exp_policy = {
+ .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */,
+ .timeout = 240,
+};
+
+static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
+ .name = "H.245",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_UNSPEC,
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = h245_help,
+ .expect_policy = &h245_exp_policy,
+};
+
+/****************************************************************************/
+int get_h225_addr(struct nf_conn *ct, unsigned char *data,
+ TransportAddress *taddr,
+ union nf_inet_addr *addr, __be16 *port)
+{
+ const unsigned char *p;
+ int len;
+
+ switch (taddr->choice) {
+ case eTransportAddress_ipAddress:
+ if (nf_ct_l3num(ct) != AF_INET)
+ return 0;
+ p = data + taddr->ipAddress.ip;
+ len = 4;
+ break;
+ case eTransportAddress_ip6Address:
+ if (nf_ct_l3num(ct) != AF_INET6)
+ return 0;
+ p = data + taddr->ip6Address.ip;
+ len = 16;
+ break;
+ default:
+ return 0;
+ }
+
+ memcpy(addr, p, len);
+ memset((void *)addr + len, 0, sizeof(*addr) - len);
+ memcpy(port, p + len, sizeof(__be16));
+
+ return 1;
+}
+
+/****************************************************************************/
+static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ typeof(nat_h245_hook) nat_h245;
+
+ /* Read h245Address */
+ if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) ||
+ port == 0)
+ return 0;
+
+ /* Create expect for h245 connection */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_TCP, NULL, &port);
+ exp->helper = &nf_conntrack_helper_h245;
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* NAT needed */
+ ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
+ port, exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_q931: expect H.245 ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/* If the calling party is on the same side of the forward-to party,
+ * we don't need to track the second call */
+static int callforward_do_filter(struct net *net,
+ const union nf_inet_addr *src,
+ const union nf_inet_addr *dst,
+ u_int8_t family)
+{
+ const struct nf_afinfo *afinfo;
+ int ret = 0;
+
+ /* rcu_read_lock()ed by nf_hook_slow() */
+ afinfo = nf_get_afinfo(family);
+ if (!afinfo)
+ return 0;
+
+ switch (family) {
+ case AF_INET: {
+ struct flowi4 fl1, fl2;
+ struct rtable *rt1, *rt2;
+
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->ip;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->ip;
+ if (!afinfo->route(net, (struct dst_entry **)&rt1,
+ flowi4_to_flowi(&fl1), false)) {
+ if (!afinfo->route(net, (struct dst_entry **)&rt2,
+ flowi4_to_flowi(&fl2), false)) {
+ if (rt_nexthop(rt1, fl1.daddr) ==
+ rt_nexthop(rt2, fl2.daddr) &&
+ rt1->dst.dev == rt2->dst.dev)
+ ret = 1;
+ dst_release(&rt2->dst);
+ }
+ dst_release(&rt1->dst);
+ }
+ break;
+ }
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+ case AF_INET6: {
+ struct flowi6 fl1, fl2;
+ struct rt6_info *rt1, *rt2;
+
+ memset(&fl1, 0, sizeof(fl1));
+ fl1.daddr = src->in6;
+
+ memset(&fl2, 0, sizeof(fl2));
+ fl2.daddr = dst->in6;
+ if (!afinfo->route(net, (struct dst_entry **)&rt1,
+ flowi6_to_flowi(&fl1), false)) {
+ if (!afinfo->route(net, (struct dst_entry **)&rt2,
+ flowi6_to_flowi(&fl2), false)) {
+ if (ipv6_addr_equal(rt6_nexthop(rt1),
+ rt6_nexthop(rt2)) &&
+ rt1->dst.dev == rt2->dst.dev)
+ ret = 1;
+ dst_release(&rt2->dst);
+ }
+ dst_release(&rt1->dst);
+ }
+ break;
+ }
+#endif
+ }
+ return ret;
+
+}
+
+/****************************************************************************/
+static int expect_callforwarding(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ TransportAddress *taddr)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ struct net *net = nf_ct_net(ct);
+ typeof(nat_callforwarding_hook) nat_callforwarding;
+
+ /* Read alternativeAddress */
+ if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
+ return 0;
+
+ /* If the calling party is on the same side of the forward-to party,
+ * we don't need to track the second call */
+ if (callforward_filter &&
+ callforward_do_filter(net, &addr, &ct->tuplehash[!dir].tuple.src.u3,
+ nf_ct_l3num(ct))) {
+ pr_debug("nf_ct_q931: Call Forwarding not tracked\n");
+ return 0;
+ }
+
+ /* Create expect for the second call leg */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3, &addr,
+ IPPROTO_TCP, NULL, &port);
+ exp->helper = nf_conntrack_helper_q931;
+
+ if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
+ (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* Need NAT */
+ ret = nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ taddr, port, exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_q931: expect Call Forwarding ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ Setup_UUIE *setup)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret;
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+ typeof(set_h225_addr_hook) set_h225_addr;
+
+ pr_debug("nf_ct_q931: Setup\n");
+
+ if (setup->options & eSetup_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &setup->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
+ get_h225_addr(ct, *data, &setup->destCallSignalAddress,
+ &addr, &port) &&
+ memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {
+ pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+ &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
+ ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
+ ret = set_h225_addr(skb, protoff, data, dataoff,
+ &setup->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ ct->tuplehash[!dir].tuple.src.u.tcp.port);
+ if (ret < 0)
+ return -1;
+ }
+
+ if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
+ get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
+ &addr, &port) &&
+ memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {
+ pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+ &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
+ ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
+ ret = set_h225_addr(skb, protoff, data, dataoff,
+ &setup->sourceCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (setup->options & eSetup_UUIE_fastStart) {
+ for (i = 0; i < setup->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &setup->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_callproceeding(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ CallProceeding_UUIE *callproc)
+{
+ int ret;
+ int i;
+
+ pr_debug("nf_ct_q931: CallProceeding\n");
+
+ if (callproc->options & eCallProceeding_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &callproc->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (callproc->options & eCallProceeding_UUIE_fastStart) {
+ for (i = 0; i < callproc->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &callproc->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ Connect_UUIE *connect)
+{
+ int ret;
+ int i;
+
+ pr_debug("nf_ct_q931: Connect\n");
+
+ if (connect->options & eConnect_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &connect->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (connect->options & eConnect_UUIE_fastStart) {
+ for (i = 0; i < connect->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &connect->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ Alerting_UUIE *alert)
+{
+ int ret;
+ int i;
+
+ pr_debug("nf_ct_q931: Alerting\n");
+
+ if (alert->options & eAlerting_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &alert->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (alert->options & eAlerting_UUIE_fastStart) {
+ for (i = 0; i < alert->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &alert->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ Facility_UUIE *facility)
+{
+ int ret;
+ int i;
+
+ pr_debug("nf_ct_q931: Facility\n");
+
+ if (facility->reason.choice == eFacilityReason_callForwarded) {
+ if (facility->options & eFacility_UUIE_alternativeAddress)
+ return expect_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &facility->
+ alternativeAddress);
+ return 0;
+ }
+
+ if (facility->options & eFacility_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &facility->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (facility->options & eFacility_UUIE_fastStart) {
+ for (i = 0; i < facility->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &facility->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, int dataoff,
+ Progress_UUIE *progress)
+{
+ int ret;
+ int i;
+
+ pr_debug("nf_ct_q931: Progress\n");
+
+ if (progress->options & eProgress_UUIE_h245Address) {
+ ret = expect_h245(skb, ct, ctinfo, protoff, data, dataoff,
+ &progress->h245Address);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (progress->options & eProgress_UUIE_fastStart) {
+ for (i = 0; i < progress->fastStart.count; i++) {
+ ret = process_olc(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &progress->fastStart.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data, int dataoff,
+ Q931 *q931)
+{
+ H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
+ int i;
+ int ret = 0;
+
+ switch (pdu->h323_message_body.choice) {
+ case eH323_UU_PDU_h323_message_body_setup:
+ ret = process_setup(skb, ct, ctinfo, protoff, data, dataoff,
+ &pdu->h323_message_body.setup);
+ break;
+ case eH323_UU_PDU_h323_message_body_callProceeding:
+ ret = process_callproceeding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &pdu->h323_message_body.
+ callProceeding);
+ break;
+ case eH323_UU_PDU_h323_message_body_connect:
+ ret = process_connect(skb, ct, ctinfo, protoff, data, dataoff,
+ &pdu->h323_message_body.connect);
+ break;
+ case eH323_UU_PDU_h323_message_body_alerting:
+ ret = process_alerting(skb, ct, ctinfo, protoff, data, dataoff,
+ &pdu->h323_message_body.alerting);
+ break;
+ case eH323_UU_PDU_h323_message_body_facility:
+ ret = process_facility(skb, ct, ctinfo, protoff, data, dataoff,
+ &pdu->h323_message_body.facility);
+ break;
+ case eH323_UU_PDU_h323_message_body_progress:
+ ret = process_progress(skb, ct, ctinfo, protoff, data, dataoff,
+ &pdu->h323_message_body.progress);
+ break;
+ default:
+ pr_debug("nf_ct_q931: Q.931 signal %d\n",
+ pdu->h323_message_body.choice);
+ break;
+ }
+
+ if (ret < 0)
+ return -1;
+
+ if (pdu->options & eH323_UU_PDU_h245Control) {
+ for (i = 0; i < pdu->h245Control.count; i++) {
+ ret = process_h245(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ &pdu->h245Control.item[i]);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int q931_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ static Q931 q931;
+ unsigned char *data = NULL;
+ int datalen;
+ int dataoff;
+ int ret;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ pr_debug("nf_ct_q931: skblen = %u\n", skb->len);
+
+ spin_lock_bh(&nf_h323_lock);
+
+ /* Process each TPKT */
+ while (get_tpkt_data(skb, protoff, ct, ctinfo,
+ &data, &datalen, &dataoff)) {
+ pr_debug("nf_ct_q931: TPKT len=%d ", datalen);
+ nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+ /* Decode Q.931 signal */
+ ret = DecodeQ931(data, datalen, &q931);
+ if (ret < 0) {
+ pr_debug("nf_ct_q931: decoding error: %s\n",
+ ret == H323_ERROR_BOUND ?
+ "out of bound" : "out of range");
+ /* We don't drop when decoding error */
+ break;
+ }
+
+ /* Process Q.931 signal */
+ if (process_q931(skb, ct, ctinfo, protoff,
+ &data, dataoff, &q931) < 0)
+ goto drop;
+ }
+
+ spin_unlock_bh(&nf_h323_lock);
+ return NF_ACCEPT;
+
+ drop:
+ spin_unlock_bh(&nf_h323_lock);
+ nf_ct_helper_log(skb, ct, "cannot process Q.931 message");
+ return NF_DROP;
+}
+
+/****************************************************************************/
+static const struct nf_conntrack_expect_policy q931_exp_policy = {
+ /* T.120 and H.245 */
+ .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4,
+ .timeout = 240,
+};
+
+static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
+ {
+ .name = "Q.931",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = q931_help,
+ .expect_policy = &q931_exp_policy,
+ },
+ {
+ .name = "Q.931",
+ .me = THIS_MODULE,
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = q931_help,
+ .expect_policy = &q931_exp_policy,
+ },
+};
+
+/****************************************************************************/
+static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
+ int *datalen)
+{
+ const struct udphdr *uh;
+ struct udphdr _uh;
+ int dataoff;
+
+ uh = skb_header_pointer(skb, protoff, sizeof(_uh), &_uh);
+ if (uh == NULL)
+ return NULL;
+ dataoff = protoff + sizeof(_uh);
+ if (dataoff >= skb->len)
+ return NULL;
+ *datalen = skb->len - dataoff;
+ return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
+}
+
+/****************************************************************************/
+static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
+ union nf_inet_addr *addr,
+ __be16 port)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple tuple;
+
+ memset(&tuple.src.u3, 0, sizeof(tuple.src.u3));
+ tuple.src.u.tcp.port = 0;
+ memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3));
+ tuple.dst.u.tcp.port = port;
+ tuple.dst.protonum = IPPROTO_TCP;
+
+ exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+ if (exp && exp->master == ct)
+ return exp;
+ return NULL;
+}
+
+/****************************************************************************/
+static int set_expect_timeout(struct nf_conntrack_expect *exp,
+ unsigned int timeout)
+{
+ if (!exp || !del_timer(&exp->timeout))
+ return 0;
+
+ exp->timeout.expires = jiffies + timeout * HZ;
+ add_timer(&exp->timeout);
+
+ return 1;
+}
+
+/****************************************************************************/
+static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff, unsigned char **data,
+ TransportAddress *taddr, int count)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ int i;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ typeof(nat_q931_hook) nat_q931;
+
+ /* Look for the first related address */
+ for (i = 0; i < count; i++) {
+ if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
+ memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+ sizeof(addr)) == 0 && port != 0)
+ break;
+ }
+
+ if (i >= count) /* Not found */
+ return 0;
+
+ /* Create expect for Q.931 */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ gkrouted_only ? /* only accept calls from GK? */
+ &ct->tuplehash[!dir].tuple.src.u3 : NULL,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ IPPROTO_TCP, NULL, &port);
+ exp->helper = nf_conntrack_helper_q931;
+ exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
+
+ nat_q931 = rcu_dereference(nat_q931_hook);
+ if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) { /* Need NAT */
+ ret = nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
+ } else { /* Conntrack only */
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_ras: expect Q.931 ");
+ nf_ct_dump_tuple(&exp->tuple);
+
+ /* Save port for looking up expect in processing RCF */
+ info->sig_port[dir] = port;
+ } else
+ ret = -1;
+ }
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, GatekeeperRequest *grq)
+{
+ typeof(set_ras_addr_hook) set_ras_addr;
+
+ pr_debug("nf_ct_ras: GRQ\n");
+
+ set_ras_addr = rcu_dereference(set_ras_addr_hook);
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) /* NATed */
+ return set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &grq->rasAddress, 1);
+ return 0;
+}
+
+/****************************************************************************/
+static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, GatekeeperConfirm *gcf)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+
+ pr_debug("nf_ct_ras: GCF\n");
+
+ if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port))
+ return 0;
+
+ /* Registration port is the same as discovery port */
+ if (!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
+ port == ct->tuplehash[dir].tuple.src.u.udp.port)
+ return 0;
+
+ /* Avoid RAS expectation loops. A GCF is never expected. */
+ if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+ return 0;
+
+ /* Need new expect */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3, &addr,
+ IPPROTO_UDP, NULL, &port);
+ exp->helper = nf_conntrack_helper_ras;
+
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_ras: expect RAS ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, RegistrationRequest *rrq)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int ret;
+ typeof(set_ras_addr_hook) set_ras_addr;
+
+ pr_debug("nf_ct_ras: RRQ\n");
+
+ ret = expect_q931(skb, ct, ctinfo, protoff, data,
+ rrq->callSignalAddress.item,
+ rrq->callSignalAddress.count);
+ if (ret < 0)
+ return -1;
+
+ set_ras_addr = rcu_dereference(set_ras_addr_hook);
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
+ rrq->rasAddress.item,
+ rrq->rasAddress.count);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (rrq->options & eRegistrationRequest_timeToLive) {
+ pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
+ info->timeout = rrq->timeToLive;
+ } else
+ info->timeout = default_rrq_ttl;
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, RegistrationConfirm *rcf)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int ret;
+ struct nf_conntrack_expect *exp;
+ typeof(set_sig_addr_hook) set_sig_addr;
+
+ pr_debug("nf_ct_ras: RCF\n");
+
+ set_sig_addr = rcu_dereference(set_sig_addr_hook);
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
+ rcf->callSignalAddress.item,
+ rcf->callSignalAddress.count);
+ if (ret < 0)
+ return -1;
+ }
+
+ if (rcf->options & eRegistrationConfirm_timeToLive) {
+ pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
+ info->timeout = rcf->timeToLive;
+ }
+
+ if (info->timeout > 0) {
+ pr_debug("nf_ct_ras: set RAS connection timeout to "
+ "%u seconds\n", info->timeout);
+ nf_ct_refresh(ct, skb, info->timeout * HZ);
+
+ /* Set expect timeout */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
+ info->sig_port[!dir]);
+ if (exp) {
+ pr_debug("nf_ct_ras: set Q.931 expect "
+ "timeout to %u seconds for",
+ info->timeout);
+ nf_ct_dump_tuple(&exp->tuple);
+ set_expect_timeout(exp, info->timeout);
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, UnregistrationRequest *urq)
+{
+ struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ int ret;
+ typeof(set_sig_addr_hook) set_sig_addr;
+
+ pr_debug("nf_ct_ras: URQ\n");
+
+ set_sig_addr = rcu_dereference(set_sig_addr_hook);
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
+ urq->callSignalAddress.item,
+ urq->callSignalAddress.count);
+ if (ret < 0)
+ return -1;
+ }
+
+ /* Clear old expect */
+ nf_ct_remove_expectations(ct);
+ info->sig_port[dir] = 0;
+ info->sig_port[!dir] = 0;
+
+ /* Give it 30 seconds for UCF or URJ */
+ nf_ct_refresh(ct, skb, 30 * HZ);
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, AdmissionRequest *arq)
+{
+ const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ int dir = CTINFO2DIR(ctinfo);
+ __be16 port;
+ union nf_inet_addr addr;
+ typeof(set_h225_addr_hook) set_h225_addr;
+
+ pr_debug("nf_ct_ras: ARQ\n");
+
+ set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
+ get_h225_addr(ct, *data, &arq->destCallSignalAddress,
+ &addr, &port) &&
+ !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
+ port == info->sig_port[dir] &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ set_h225_addr && ct->status & IPS_NAT_MASK) {
+ /* Answering ARQ */
+ return set_h225_addr(skb, protoff, data, 0,
+ &arq->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
+ }
+
+ if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
+ get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
+ &addr, &port) &&
+ !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
+ set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ /* Calling ARQ */
+ return set_h225_addr(skb, protoff, data, 0,
+ &arq->srcCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ port);
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, AdmissionConfirm *acf)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+ typeof(set_sig_addr_hook) set_sig_addr;
+
+ pr_debug("nf_ct_ras: ACF\n");
+
+ if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress,
+ &addr, &port))
+ return 0;
+
+ if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+ /* Answering ACF */
+ set_sig_addr = rcu_dereference(set_sig_addr_hook);
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
+ return set_sig_addr(skb, ct, ctinfo, protoff, data,
+ &acf->destCallSignalAddress, 1);
+ return 0;
+ }
+
+ /* Need new expect */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3, &addr,
+ IPPROTO_TCP, NULL, &port);
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->helper = nf_conntrack_helper_q931;
+
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_ras: expect Q.931 ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+
+ nf_ct_expect_put(exp);
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, LocationRequest *lrq)
+{
+ typeof(set_ras_addr_hook) set_ras_addr;
+
+ pr_debug("nf_ct_ras: LRQ\n");
+
+ set_ras_addr = rcu_dereference(set_ras_addr_hook);
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
+ return set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &lrq->replyAddress, 1);
+ return 0;
+}
+
+/****************************************************************************/
+static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, LocationConfirm *lcf)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ int ret = 0;
+ __be16 port;
+ union nf_inet_addr addr;
+ struct nf_conntrack_expect *exp;
+
+ pr_debug("nf_ct_ras: LCF\n");
+
+ if (!get_h225_addr(ct, *data, &lcf->callSignalAddress,
+ &addr, &port))
+ return 0;
+
+ /* Need new expect for call signal */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return -1;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &ct->tuplehash[!dir].tuple.src.u3, &addr,
+ IPPROTO_TCP, NULL, &port);
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->helper = nf_conntrack_helper_q931;
+
+ if (nf_ct_expect_related(exp) == 0) {
+ pr_debug("nf_ct_ras: expect Q.931 ");
+ nf_ct_dump_tuple(&exp->tuple);
+ } else
+ ret = -1;
+
+ nf_ct_expect_put(exp);
+
+ /* Ignore rasAddress */
+
+ return ret;
+}
+
+/****************************************************************************/
+static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, InfoRequestResponse *irr)
+{
+ int ret;
+ typeof(set_ras_addr_hook) set_ras_addr;
+ typeof(set_sig_addr_hook) set_sig_addr;
+
+ pr_debug("nf_ct_ras: IRR\n");
+
+ set_ras_addr = rcu_dereference(set_ras_addr_hook);
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &irr->rasAddress, 1);
+ if (ret < 0)
+ return -1;
+ }
+
+ set_sig_addr = rcu_dereference(set_sig_addr_hook);
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
+ ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
+ irr->callSignalAddress.item,
+ irr->callSignalAddress.count);
+ if (ret < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned char **data, RasMessage *ras)
+{
+ switch (ras->choice) {
+ case eRasMessage_gatekeeperRequest:
+ return process_grq(skb, ct, ctinfo, protoff, data,
+ &ras->gatekeeperRequest);
+ case eRasMessage_gatekeeperConfirm:
+ return process_gcf(skb, ct, ctinfo, protoff, data,
+ &ras->gatekeeperConfirm);
+ case eRasMessage_registrationRequest:
+ return process_rrq(skb, ct, ctinfo, protoff, data,
+ &ras->registrationRequest);
+ case eRasMessage_registrationConfirm:
+ return process_rcf(skb, ct, ctinfo, protoff, data,
+ &ras->registrationConfirm);
+ case eRasMessage_unregistrationRequest:
+ return process_urq(skb, ct, ctinfo, protoff, data,
+ &ras->unregistrationRequest);
+ case eRasMessage_admissionRequest:
+ return process_arq(skb, ct, ctinfo, protoff, data,
+ &ras->admissionRequest);
+ case eRasMessage_admissionConfirm:
+ return process_acf(skb, ct, ctinfo, protoff, data,
+ &ras->admissionConfirm);
+ case eRasMessage_locationRequest:
+ return process_lrq(skb, ct, ctinfo, protoff, data,
+ &ras->locationRequest);
+ case eRasMessage_locationConfirm:
+ return process_lcf(skb, ct, ctinfo, protoff, data,
+ &ras->locationConfirm);
+ case eRasMessage_infoRequestResponse:
+ return process_irr(skb, ct, ctinfo, protoff, data,
+ &ras->infoRequestResponse);
+ default:
+ pr_debug("nf_ct_ras: RAS message %d\n", ras->choice);
+ break;
+ }
+
+ return 0;
+}
+
+/****************************************************************************/
+static int ras_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ static RasMessage ras;
+ unsigned char *data;
+ int datalen = 0;
+ int ret;
+
+ pr_debug("nf_ct_ras: skblen = %u\n", skb->len);
+
+ spin_lock_bh(&nf_h323_lock);
+
+ /* Get UDP data */
+ data = get_udp_data(skb, protoff, &datalen);
+ if (data == NULL)
+ goto accept;
+ pr_debug("nf_ct_ras: RAS message len=%d ", datalen);
+ nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+ /* Decode RAS message */
+ ret = DecodeRasMessage(data, datalen, &ras);
+ if (ret < 0) {
+ pr_debug("nf_ct_ras: decoding error: %s\n",
+ ret == H323_ERROR_BOUND ?
+ "out of bound" : "out of range");
+ goto accept;
+ }
+
+ /* Process RAS message */
+ if (process_ras(skb, ct, ctinfo, protoff, &data, &ras) < 0)
+ goto drop;
+
+ accept:
+ spin_unlock_bh(&nf_h323_lock);
+ return NF_ACCEPT;
+
+ drop:
+ spin_unlock_bh(&nf_h323_lock);
+ nf_ct_helper_log(skb, ct, "cannot process RAS message");
+ return NF_DROP;
+}
+
+/****************************************************************************/
+static const struct nf_conntrack_expect_policy ras_exp_policy = {
+ .max_expected = 32,
+ .timeout = 240,
+};
+
+static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
+ {
+ .name = "RAS",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = ras_help,
+ .expect_policy = &ras_exp_policy,
+ },
+ {
+ .name = "RAS",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_h323_master),
+ .tuple.src.l3num = AF_INET6,
+ .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .help = ras_help,
+ .expect_policy = &ras_exp_policy,
+ },
+};
+
+/****************************************************************************/
+static void __exit nf_conntrack_h323_fini(void)
+{
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[1]);
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+ kfree(h323_buffer);
+ pr_debug("nf_ct_h323: fini\n");
+}
+
+/****************************************************************************/
+static int __init nf_conntrack_h323_init(void)
+{
+ int ret;
+
+ h323_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!h323_buffer)
+ return -ENOMEM;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245);
+ if (ret < 0)
+ goto err1;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
+ if (ret < 0)
+ goto err2;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
+ if (ret < 0)
+ goto err3;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
+ if (ret < 0)
+ goto err4;
+ ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+ if (ret < 0)
+ goto err5;
+ pr_debug("nf_ct_h323: init success\n");
+ return 0;
+
+err5:
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
+err4:
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
+err3:
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+err2:
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+err1:
+ kfree(h323_buffer);
+ return ret;
+}
+
+/****************************************************************************/
+module_init(nf_conntrack_h323_init);
+module_exit(nf_conntrack_h323_fini);
+
+EXPORT_SYMBOL_GPL(get_h225_addr);
+EXPORT_SYMBOL_GPL(set_h245_addr_hook);
+EXPORT_SYMBOL_GPL(set_h225_addr_hook);
+EXPORT_SYMBOL_GPL(set_sig_addr_hook);
+EXPORT_SYMBOL_GPL(set_ras_addr_hook);
+EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
+EXPORT_SYMBOL_GPL(nat_t120_hook);
+EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
+EXPORT_SYMBOL_GPL(nat_q931_hook);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_h323");
+MODULE_ALIAS_NFCT_HELPER("RAS");
+MODULE_ALIAS_NFCT_HELPER("Q.931");
+MODULE_ALIAS_NFCT_HELPER("H.245");
diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c
new file mode 100644
index 000000000..d880f3523
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_types.c
@@ -0,0 +1,1922 @@
+/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ */
+
+static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE,
+ offsetof(TransportAddress_ipAddress, ip), NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */
+ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */
+ {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _TransportAddress_ipSourceRoute_route},
+ {FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _TransportAddress_ipSourceRoute_routing},
+};
+
+static const struct field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */
+ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+ {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */
+ {FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE,
+ offsetof(TransportAddress_ip6Address, ip), NULL},
+ {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H221NonStandard[] = { /* SEQUENCE */
+ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _NonStandardIdentifier[] = { /* CHOICE */
+ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _H221NonStandard},
+};
+
+static const struct field_t _NonStandardParameter[] = { /* SEQUENCE */
+ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _NonStandardIdentifier},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress[] = { /* CHOICE */
+ {FNAME("ipAddress") SEQ, 0, 2, 2, DECODE,
+ offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress},
+ {FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0,
+ _TransportAddress_ipSourceRoute},
+ {FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0,
+ _TransportAddress_ipxAddress},
+ {FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(TransportAddress, ip6Address),
+ _TransportAddress_ip6Address},
+ {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0,
+ _NonStandardParameter},
+};
+
+static const struct field_t _AliasAddress[] = { /* CHOICE */
+ {FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL},
+ {FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL},
+ {FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _VendorIdentifier[] = { /* SEQUENCE */
+ {FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard},
+ {FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+static const struct field_t _H310Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H320Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H321Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H322Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H323Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H324Caps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _VoiceCaps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T120OnlyCaps[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SupportedProtocols[] = { /* CHOICE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0,
+ _NonStandardParameter},
+ {FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps},
+ {FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps},
+ {FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps},
+ {FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps},
+ {FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps},
+ {FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps},
+ {FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps},
+ {FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps},
+ {FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols},
+};
+
+static const struct field_t _GatewayInfo[] = { /* SEQUENCE */
+ {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _GatewayInfo_protocol},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+static const struct field_t _McuInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _TerminalInfo[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+};
+
+static const struct field_t _EndpointType[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0,
+ _VendorIdentifier},
+ {FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0,
+ _GatekeeperInfo},
+ {FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo},
+ {FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo},
+ {FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo},
+ {FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT,
+ 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */
+ {FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */
+ {FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP,
+ 0, NULL},
+};
+
+static const struct field_t _Q954Details[] = { /* SEQUENCE */
+ {FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _QseriesOptions[] = { /* SEQUENCE */
+ {FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details},
+};
+
+static const struct field_t _CallType[] = { /* CHOICE */
+ {FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */
+ {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_NonStandardIdentifier[] = { /* CHOICE */
+ {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0,
+ _H245_NonStandardIdentifier_h221NonStandard},
+};
+
+static const struct field_t _H245_NonStandardParameter[] = { /* SEQUENCE */
+ {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0,
+ _H245_NonStandardIdentifier},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H261VideoCapability[] = { /* SEQUENCE */
+ {FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H262VideoCapability[] = { /* SEQUENCE */
+ {FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H263VideoCapability[] = { /* SEQUENCE */
+ {FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+ NULL},
+ {FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("bppMaxKb") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowSqcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _IS11172VideoCapability[] = { /* SEQUENCE */
+ {FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _VideoCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0,
+ _H261VideoCapability},
+ {FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0,
+ _H262VideoCapability},
+ {FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0,
+ _H263VideoCapability},
+ {FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0,
+ _IS11172VideoCapability},
+ {FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _AudioCapability_g7231[] = { /* SEQUENCE */
+ {FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _IS11172AudioCapability[] = { /* SEQUENCE */
+ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _IS13818AudioCapability[] = { /* SEQUENCE */
+ {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _AudioCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231},
+ {FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0,
+ _IS11172AudioCapability},
+ {FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0,
+ _IS13818AudioCapability},
+ {FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+ {FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+ {FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _DataProtocolCapability[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL},
+ {FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */
+ {FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T84Profile[] = { /* CHOICE */
+ {FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0,
+ _T84Profile_t84Restricted},
+};
+
+static const struct field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE */
+ {FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile},
+};
+
+static const struct field_t _DataApplicationCapability_application_nlpid[] = { /* SEQUENCE */
+ {FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability_application[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT,
+ offsetof(DataApplicationCapability_application, t120),
+ _DataProtocolCapability},
+ {FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t84") SEQ, 0, 2, 2, SKIP, 0,
+ _DataApplicationCapability_application_t84},
+ {FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0,
+ _DataApplicationCapability_application_nlpid},
+ {FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+ _DataProtocolCapability},
+ {FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+ {FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+ {FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability[] = { /* SEQUENCE */
+ {FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT,
+ offsetof(DataApplicationCapability, application),
+ _DataApplicationCapability_application},
+ {FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _EncryptionMode[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataType[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability},
+ {FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0,
+ _AudioCapability},
+ {FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data),
+ _DataApplicationCapability},
+ {FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _EncryptionMode},
+ {FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+ {FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE */
+ {FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL},
+ {FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+ _H245_NonStandardParameter},
+ {FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("al3") SEQ, 0, 2, 2, SKIP, 0,
+ _H223LogicalChannelParameters_adaptationLayerType_al3},
+ {FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL},
+ {FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+ {FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0,
+ _H223LogicalChannelParameters_adaptationLayerType},
+ {FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CRCLength[] = { /* CHOICE */
+ {FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76HDLCParameters[] = { /* SEQUENCE */
+ {FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength},
+ {FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE */
+ {FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE */
+ {FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE */
+ {FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode_eRM_recovery},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE */
+ {FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode_eRM},
+ {FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V75Parameters[] = { /* SEQUENCE */
+ {FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _V76HDLCParameters},
+ {FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_suspendResume},
+ {FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("mode") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _V76LogicalChannelParameters_mode},
+ {FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters},
+};
+
+static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE */
+ {FNAME("network") OCTSTR, FIXD, 4, 0, DECODE,
+ offsetof(UnicastAddress_iPAddress, network), NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE */
+ {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+ {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE */
+ {FNAME("network") OCTSTR, FIXD, 16, 0, DECODE,
+ offsetof(UnicastAddress_iP6Address, network), NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE */
+ {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF */
+ {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */
+ {FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0,
+ _UnicastAddress_iPSourceRouteAddress_routing},
+ {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _UnicastAddress_iPSourceRouteAddress_route},
+};
+
+static const struct field_t _UnicastAddress[] = { /* CHOICE */
+ {FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress},
+ {FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0,
+ _UnicastAddress_iPXAddress},
+ {FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT,
+ offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address},
+ {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0,
+ _UnicastAddress_iPSourceRouteAddress},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE */
+ {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE */
+ {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress[] = { /* CHOICE */
+ {FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _MulticastAddress_iPAddress},
+ {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _MulticastAddress_iP6Address},
+ {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_TransportAddress[] = { /* CHOICE */
+ {FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT,
+ offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress},
+ {FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0,
+ _MulticastAddress},
+};
+
+static const struct field_t _H2250LogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _H2250LogicalChannelParameters_nonStandard},
+ {FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelParameters, mediaChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelParameters, mediaControlChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT,
+ 0, NULL},
+ {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL},
+ {FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */
+ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+ _H222LogicalChannelParameters},
+ {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _H223LogicalChannelParameters},
+ {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+ _V76LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+ {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT,
+ offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+ dataType), _DataType},
+ {FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT,
+ offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters},
+ {FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+ 0, NULL},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */
+ {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+ _H223LogicalChannelParameters},
+ {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+ _V76LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType},
+ {FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannel_reverseLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters},
+ {FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+ 0, NULL},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_distribution[] = { /* CHOICE */
+ {FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address_address[] = { /* CHOICE */
+ {FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL},
+ {FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address[] = { /* SEQUENCE */
+ {FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+ _Q2931Address_address},
+ {FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */
+ {FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address},
+ {FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT,
+ offsetof(NetworkAccessParameters_networkAddress, localAreaAddress),
+ _H245_TransportAddress},
+};
+
+static const struct field_t _NetworkAccessParameters[] = { /* SEQUENCE */
+ {FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0,
+ _NetworkAccessParameters_distribution},
+ {FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT,
+ offsetof(NetworkAccessParameters, networkAddress),
+ _NetworkAccessParameters_networkAddress},
+ {FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _OpenLogicalChannel[] = { /* SEQUENCE */
+ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT,
+ offsetof(OpenLogicalChannel, forwardLogicalChannelParameters),
+ _OpenLogicalChannel_forwardLogicalChannelParameters},
+ {FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannel,
+ reverseLogicalChannelParameters),
+ _OpenLogicalChannel_reverseLogicalChannelParameters},
+ {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannel, separateStack),
+ _NetworkAccessParameters},
+ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Setup_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, h245Address), _TransportAddress},
+ {FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_sourceAddress},
+ {FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destinationAddress},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destExtraCallInfo},
+ {FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Setup_UUIE_destExtraCRV},
+ {FNAME("activeMC") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0,
+ _Setup_UUIE_conferenceGoal},
+ {FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0,
+ _QseriesOptions},
+ {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+ {FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress},
+ {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart},
+ {FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _CallProceeding_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(CallProceeding_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(CallProceeding_UUIE, fastStart),
+ _CallProceeding_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Connect_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Connect_UUIE, h245Address), _TransportAddress},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Alerting_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Alerting_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Information_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ReleaseCompleteReason[] = { /* CHOICE */
+ {FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL},
+ {FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0,
+ NULL},
+ {FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0,
+ _ReleaseCompleteReason},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _FacilityReason[] = { /* CHOICE */
+ {FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Facility_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
+ {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Facility_UUIE_alternativeAliasAddress},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+ {FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT,
+ offsetof(Facility_UUIE, reason), _FacilityReason},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Facility_UUIE, h245Address), _TransportAddress},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ NULL},
+};
+
+static const struct field_t _CallIdentifier[] = { /* SEQUENCE */
+ {FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityServiceMode[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+ {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityCapabilities[] = { /* SEQUENCE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _SecurityServiceMode},
+ {FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _SecurityServiceMode},
+ {FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+ _SecurityServiceMode},
+};
+
+static const struct field_t _H245Security[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+ {FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+ {FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+};
+
+static const struct field_t _DHset[] = { /* SEQUENCE */
+ {FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+ {FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TypedCertificate[] = { /* SEQUENCE */
+ {FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H235_NonStandardParameter[] = { /* SEQUENCE */
+ {FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ClearToken[] = { /* SEQUENCE */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset},
+ {FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL},
+ {FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0,
+ _TypedCertificate},
+ {FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _H235_NonStandardParameter},
+ {FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL},
+ {FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+};
+
+static const struct field_t _Params[] = { /* SEQUENCE */
+ {FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL},
+ {FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE */
+ {FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE */
+ {FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoEncryptedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE */
+ {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("token") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoToken_cryptoSignedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE */
+ {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+ {FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoHashedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE */
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+ {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken[] = { /* CHOICE */
+ {FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0,
+ _CryptoToken_cryptoEncryptedToken},
+ {FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0,
+ _CryptoToken_cryptoSignedToken},
+ {FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoHashedToken},
+ {FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoToken_cryptoPwdEncr},
+};
+
+static const struct field_t _CryptoH323Token[] = { /* CHOICE */
+ {FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdHash},
+ {FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdHash},
+ {FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoEPPwdEncr},
+ {FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+ _CryptoH323Token_cryptoGKPwdEncr},
+ {FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoEPCert},
+ {FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoGKCert},
+ {FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0,
+ _CryptoH323Token_cryptoFastStart},
+ {FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0,
+ _CryptoToken},
+};
+
+static const struct field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token},
+};
+
+static const struct field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+ sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+ ,
+};
+
+static const struct field_t _Progress_UUIE[] = { /* SEQUENCE */
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+ _EndpointType},
+ {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(Progress_UUIE, h245Address), _TransportAddress},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0,
+ _CallIdentifier},
+ {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+ _H245Security},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Progress_UUIE_tokens},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _Progress_UUIE_cryptoTokens},
+ {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+ offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */
+ {FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE},
+ {FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, callProceeding),
+ _CallProceeding_UUIE},
+ {FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE},
+ {FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE},
+ {FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE},
+ {FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0,
+ _ReleaseComplete_UUIE},
+ {FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE},
+ {FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT,
+ offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE},
+ {FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+ {FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _RequestMessage[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT,
+ offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel},
+ {FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL},
+ {FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL},
+ {FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+ {FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0,
+ NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */
+ {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+ _H222LogicalChannelParameters},
+ {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+ offsetof
+ (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters,
+ h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE */
+ {FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters,
+ multiplexParameters),
+ _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters},
+ {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF */
+ {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */
+ {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _H2250LogicalChannelAckParameters_nonStandard},
+ {FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelAckParameters, mediaChannel),
+ _H245_TransportAddress},
+ {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+ offsetof(H2250LogicalChannelAckParameters, mediaControlChannel),
+ _H245_TransportAddress},
+ {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL},
+ {FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE */
+ {FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT,
+ offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters,
+ h2250LogicalChannelAckParameters),
+ _H2250LogicalChannelAckParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */
+ {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+ reverseLogicalChannelParameters),
+ _OpenLogicalChannelAck_reverseLogicalChannelParameters},
+ {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+ offsetof(OpenLogicalChannelAck, separateStack),
+ _NetworkAccessParameters},
+ {FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1,
+ DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+ forwardMultiplexAckParameters),
+ _OpenLogicalChannelAck_forwardMultiplexAckParameters},
+ {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ResponseMessage[] = { /* CHOICE */
+ {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT,
+ offsetof(ResponseMessage, openLogicalChannelAck),
+ _OpenLogicalChannelAck},
+ {FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+ NULL},
+ {FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+ {FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0,
+ NULL},
+ {FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+ {FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0,
+ NULL},
+ {FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _MultimediaSystemControlMessage[] = { /* CHOICE */
+ {FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT,
+ offsetof(MultimediaSystemControlMessage, request), _RequestMessage},
+ {FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT,
+ offsetof(MultimediaSystemControlMessage, response),
+ _ResponseMessage},
+ {FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL},
+ {FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT,
+ sizeof(MultimediaSystemControlMessage),
+ _MultimediaSystemControlMessage}
+ ,
+};
+
+static const struct field_t _H323_UU_PDU[] = { /* SEQUENCE */
+ {FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT,
+ offsetof(H323_UU_PDU, h323_message_body),
+ _H323_UU_PDU_h323_message_body},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ NULL},
+ {FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT,
+ offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control},
+ {FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT,
+ 0, NULL},
+ {FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UserInformation[] = { /* SEQUENCE */
+ {FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT,
+ offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU},
+ {FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperRequest[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(GatekeeperRequest, rasAddress), _TransportAddress},
+ {FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperConfirm[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(GatekeeperConfirm, rasAddress), _TransportAddress},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationRequest[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationRequest, callSignalAddress),
+ _RegistrationRequest_callSignalAddress},
+ {FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationRequest, rasAddress),
+ _RegistrationRequest_rasAddress},
+ {FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _RegistrationRequest_terminalAlias},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0,
+ _VendorIdentifier},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+ offsetof(RegistrationRequest, timeToLive), NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationConfirm[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(RegistrationConfirm, callSignalAddress),
+ _RegistrationConfirm_callSignalAddress},
+ {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _RegistrationConfirm_terminalAlias},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+ {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+ offsetof(RegistrationConfirm, timeToLive), NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _UnregistrationRequest[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(UnregistrationRequest, callSignalAddress),
+ _UnregistrationRequest_callSignalAddress},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL},
+ {FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _CallModel[] = { /* CHOICE */
+ {FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+ {FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+ {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _AdmissionRequest_destinationInfo},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(AdmissionRequest, destCallSignalAddress),
+ _TransportAddress},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+ _AdmissionRequest_destExtraCallInfo},
+ {FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _AdmissionRequest_srcInfo},
+ {FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+ offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress},
+ {FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL},
+ {FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+ {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL},
+ {FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+ {FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _AdmissionConfirm[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL},
+ {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel},
+ {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(AdmissionConfirm, destCallSignalAddress),
+ _TransportAddress},
+ {FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL},
+ {FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _LocationRequest[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+ _LocationRequest_destinationInfo},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("replyAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationRequest, replyAddress), _TransportAddress},
+ {FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+ NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _LocationConfirm[] = { /* SEQUENCE */
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationConfirm, callSignalAddress), _TransportAddress},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(LocationConfirm, rasAddress), _TransportAddress},
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+ {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+ NULL},
+ {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+ 0, NULL},
+ {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+ {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */
+ {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+ sizeof(TransportAddress), _TransportAddress}
+ ,
+};
+
+static const struct field_t _InfoRequestResponse[] = { /* SEQUENCE */
+ {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+ _NonStandardParameter},
+ {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+ {FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+ {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+ {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+ offsetof(InfoRequestResponse, rasAddress), _TransportAddress},
+ {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+ offsetof(InfoRequestResponse, callSignalAddress),
+ _InfoRequestResponse_callSignalAddress},
+ {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+ {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+ {FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+ {FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+ {FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+ {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RasMessage[] = { /* CHOICE */
+ {FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT,
+ offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest},
+ {FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT,
+ offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm},
+ {FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+ {FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT,
+ offsetof(RasMessage, registrationRequest), _RegistrationRequest},
+ {FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT,
+ offsetof(RasMessage, registrationConfirm), _RegistrationConfirm},
+ {FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+ {FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT,
+ offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest},
+ {FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL},
+ {FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT,
+ offsetof(RasMessage, admissionRequest), _AdmissionRequest},
+ {FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT,
+ offsetof(RasMessage, admissionConfirm), _AdmissionConfirm},
+ {FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL},
+ {FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL},
+ {FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL},
+ {FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+ {FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT,
+ offsetof(RasMessage, locationRequest), _LocationRequest},
+ {FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT,
+ offsetof(RasMessage, locationConfirm), _LocationConfirm},
+ {FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL},
+ {FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL},
+ {FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT,
+ offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse},
+ {FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL},
+ {FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL},
+ {FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL},
+ {FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0,
+ NULL},
+ {FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0,
+ NULL},
+ {FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL},
+ {FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL},
+ {FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0,
+ NULL},
+ {FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL},
+};
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
new file mode 100644
index 000000000..bd9d31537
--- /dev/null
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -0,0 +1,502 @@
+/* Helper handling for netfilter. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/random.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_log.h>
+
+static DEFINE_MUTEX(nf_ct_helper_mutex);
+struct hlist_head *nf_ct_helper_hash __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hash);
+unsigned int nf_ct_helper_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
+static unsigned int nf_ct_helper_count __read_mostly;
+
+static bool nf_ct_auto_assign_helper __read_mostly = true;
+module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
+MODULE_PARM_DESC(nf_conntrack_helper,
+ "Enable automatic conntrack helper assignment (default 1)");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table helper_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_helper",
+ .data = &init_net.ct.sysctl_auto_assign_helper,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(helper_sysctl_table, sizeof(helper_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_auto_assign_helper;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.helper_sysctl_header =
+ register_net_sysctl(net, "net/netfilter", table);
+
+ if (!net->ct.helper_sysctl_header) {
+ pr_err("nf_conntrack_helper: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.helper_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.helper_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_helper_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_helper_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/* Stupid hash, but collision free for the default registrations of the
+ * helpers currently in the kernel. */
+static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
+{
+ return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^
+ (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
+}
+
+static struct nf_conntrack_helper *
+__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
+ unsigned int h;
+
+ if (!nf_ct_helper_count)
+ return NULL;
+
+ h = helper_hash(tuple);
+ hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
+ if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
+ return helper;
+ }
+ return NULL;
+}
+
+struct nf_conntrack_helper *
+__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
+{
+ struct nf_conntrack_helper *h;
+ unsigned int i;
+
+ for (i = 0; i < nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) {
+ if (!strcmp(h->name, name) &&
+ h->tuple.src.l3num == l3num &&
+ h->tuple.dst.protonum == protonum)
+ return h;
+ }
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find);
+
+struct nf_conntrack_helper *
+nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
+{
+ struct nf_conntrack_helper *h;
+
+ h = __nf_conntrack_helper_find(name, l3num, protonum);
+#ifdef CONFIG_MODULES
+ if (h == NULL) {
+ if (request_module("nfct-helper-%s", name) == 0)
+ h = __nf_conntrack_helper_find(name, l3num, protonum);
+ }
+#endif
+ if (h != NULL && !try_module_get(h->me))
+ h = NULL;
+
+ return h;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
+
+struct nf_conn_help *
+nf_ct_helper_ext_add(struct nf_conn *ct,
+ struct nf_conntrack_helper *helper, gfp_t gfp)
+{
+ struct nf_conn_help *help;
+
+ help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER,
+ helper->data_len, gfp);
+ if (help)
+ INIT_HLIST_HEAD(&help->expectations);
+ else
+ pr_debug("failed to add helper extension area");
+ return help;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
+
+int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
+ gfp_t flags)
+{
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conn_help *help;
+ struct net *net = nf_ct_net(ct);
+ int ret = 0;
+
+ /* We already got a helper explicitly attached. The function
+ * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
+ * the helper up again. Since now the user is in full control of
+ * making consistent helper configurations, skip this automatic
+ * re-lookup, otherwise we'll lose the helper.
+ */
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ return 0;
+
+ if (tmpl != NULL) {
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = help->helper;
+ set_bit(IPS_HELPER_BIT, &ct->status);
+ }
+ }
+
+ help = nfct_help(ct);
+ if (net->ct.sysctl_auto_assign_helper && helper == NULL) {
+ helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ if (unlikely(!net->ct.auto_assign_helper_warned && helper)) {
+ pr_info("nf_conntrack: automatic helper "
+ "assignment is deprecated and it will "
+ "be removed soon. Use the iptables CT target "
+ "to attach helpers instead.\n");
+ net->ct.auto_assign_helper_warned = true;
+ }
+ }
+
+ if (helper == NULL) {
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ goto out;
+ }
+
+ if (help == NULL) {
+ help = nf_ct_helper_ext_add(ct, helper, flags);
+ if (help == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ /* We only allow helper re-assignment of the same sort since
+ * we cannot reallocate the helper extension area.
+ */
+ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper);
+
+ if (tmp && tmp->help != helper->help) {
+ RCU_INIT_POINTER(help->helper, NULL);
+ goto out;
+ }
+ }
+
+ rcu_assign_pointer(help->helper, helper);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+
+/* appropriate ct lock protecting must be taken by caller */
+static inline int unhelp(struct nf_conntrack_tuple_hash *i,
+ const struct nf_conntrack_helper *me)
+{
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+ struct nf_conn_help *help = nfct_help(ct);
+
+ if (help && rcu_dereference_raw(help->helper) == me) {
+ nf_conntrack_event(IPCT_HELPER, ct);
+ RCU_INIT_POINTER(help->helper, NULL);
+ }
+ return 0;
+}
+
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_helper *helper;
+
+ if (help) {
+ rcu_read_lock();
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->destroy)
+ helper->destroy(ct);
+ rcu_read_unlock();
+ }
+}
+
+static LIST_HEAD(nf_ct_helper_expectfn_list);
+
+void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_add_rcu(&n->head, &nf_ct_helper_expectfn_list);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register);
+
+void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n)
+{
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ list_del_rcu(&n->head);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister);
+
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_name(const char *name)
+{
+ struct nf_ct_helper_expectfn *cur;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
+ if (!strcmp(cur->name, name)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? cur : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name);
+
+struct nf_ct_helper_expectfn *
+nf_ct_helper_expectfn_find_by_symbol(const void *symbol)
+{
+ struct nf_ct_helper_expectfn *cur;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) {
+ if (cur->expectfn == symbol) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return found ? cur : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol);
+
+__printf(3, 4)
+void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
+ const char *fmt, ...)
+{
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ /* Called from the helper function, this call never fails */
+ help = nfct_help(ct);
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+
+ nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+ "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_log);
+
+int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
+{
+ int ret = 0;
+ struct nf_conntrack_helper *cur;
+ unsigned int h = helper_hash(&me->tuple);
+
+ BUG_ON(me->expect_policy == NULL);
+ BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
+ BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+
+ mutex_lock(&nf_ct_helper_mutex);
+ hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
+ if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 &&
+ cur->tuple.src.l3num == me->tuple.src.l3num &&
+ cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
+ nf_ct_helper_count++;
+out:
+ mutex_unlock(&nf_ct_helper_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
+
+static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
+ struct net *net)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_expect *exp;
+ const struct hlist_node *next;
+ const struct hlist_nulls_node *nn;
+ unsigned int i;
+ int cpu;
+
+ /* Get rid of expectations */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i], hnode) {
+ struct nf_conn_help *help = nfct_help(exp->master);
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_expect_lock)
+ ) == me || exp->helper == me) &&
+ del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ for_each_possible_cpu(cpu) {
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ spin_lock_bh(&pcpu->lock);
+ hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode)
+ unhelp(h, me);
+ spin_unlock_bh(&pcpu->lock);
+ }
+ local_bh_disable();
+ for (i = 0; i < net->ct.htable_size; i++) {
+ spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ if (i < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+ unhelp(h, me);
+ }
+ spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ }
+ local_bh_enable();
+}
+
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+ struct net *net;
+
+ mutex_lock(&nf_ct_helper_mutex);
+ hlist_del_rcu(&me->hnode);
+ nf_ct_helper_count--;
+ mutex_unlock(&nf_ct_helper_mutex);
+
+ /* Make sure every nothing is still using the helper unless its a
+ * connection in the hash.
+ */
+ synchronize_rcu();
+
+ rtnl_lock();
+ for_each_net(net)
+ __nf_conntrack_helper_unregister(me, net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
+
+static struct nf_ct_ext_type helper_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_help),
+ .align = __alignof__(struct nf_conn_help),
+ .id = NF_CT_EXT_HELPER,
+};
+
+int nf_conntrack_helper_pernet_init(struct net *net)
+{
+ net->ct.auto_assign_helper_warned = false;
+ net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
+ return nf_conntrack_helper_init_sysctl(net);
+}
+
+void nf_conntrack_helper_pernet_fini(struct net *net)
+{
+ nf_conntrack_helper_fini_sysctl(net);
+}
+
+int nf_conntrack_helper_init(void)
+{
+ int ret;
+ nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
+ nf_ct_helper_hash =
+ nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
+ if (!nf_ct_helper_hash)
+ return -ENOMEM;
+
+ ret = nf_ct_extend_register(&helper_extend);
+ if (ret < 0) {
+ pr_err("nf_ct_helper: Unable to register helper extension.\n");
+ goto out_extend;
+ }
+
+ return 0;
+out_extend:
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ return ret;
+}
+
+void nf_conntrack_helper_fini(void)
+{
+ nf_ct_extend_unregister(&helper_extend);
+ nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
new file mode 100644
index 000000000..0fd2976db
--- /dev/null
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -0,0 +1,292 @@
+/* IRC extension for IP connection tracking, Version 1.21
+ * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
+ * based on RR's ip_conntrack_ftp.c
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+static unsigned int max_dcc_channels = 8;
+static unsigned int dcc_timeout __read_mostly = 300;
+/* This is slow, but it's simple. --RR */
+static char *irc_buffer;
+static DEFINE_SPINLOCK(irc_buffer_lock);
+
+unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_irc");
+MODULE_ALIAS_NFCT_HELPER("irc");
+
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of IRC servers");
+module_param(max_dcc_channels, uint, 0400);
+MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per "
+ "IRC session");
+module_param(dcc_timeout, uint, 0400);
+MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
+
+static const char *const dccprotos[] = {
+ "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT "
+};
+
+#define MINMATCHLEN 5
+
+/* tries to get the ip_addr and port out of a dcc command
+ * return value: -1 on failure, 0 on success
+ * data pointer to first byte of DCC command data
+ * data_end pointer to last byte of dcc command data
+ * ip returns parsed ip of dcc command
+ * port returns parsed port of dcc command
+ * ad_beg_p returns pointer to first byte of addr data
+ * ad_end_p returns pointer to last byte of addr data
+ */
+static int parse_dcc(char *data, const char *data_end, __be32 *ip,
+ u_int16_t *port, char **ad_beg_p, char **ad_end_p)
+{
+ char *tmp;
+
+ /* at least 12: "AAAAAAAA P\1\n" */
+ while (*data++ != ' ')
+ if (data > data_end - 12)
+ return -1;
+
+ /* Make sure we have a newline character within the packet boundaries
+ * because simple_strtoul parses until the first invalid character. */
+ for (tmp = data; tmp <= data_end; tmp++)
+ if (*tmp == '\n')
+ break;
+ if (tmp > data_end || *tmp != '\n')
+ return -1;
+
+ *ad_beg_p = data;
+ *ip = cpu_to_be32(simple_strtoul(data, &data, 10));
+
+ /* skip blanks between ip and port */
+ while (*data == ' ') {
+ if (data >= data_end)
+ return -1;
+ data++;
+ }
+
+ *port = simple_strtoul(data, &data, 10);
+ *ad_end_p = data;
+
+ return 0;
+}
+
+static int help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff;
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ const char *data_limit;
+ char *data, *ib_ptr;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ __be32 dcc_ip;
+ u_int16_t dcc_port;
+ __be16 port;
+ int i, ret = NF_ACCEPT;
+ char *addr_beg_p, *addr_end_p;
+ typeof(nf_nat_irc_hook) nf_nat_irc;
+
+ /* If packet is coming from IRC server */
+ if (dir == IP_CT_DIR_REPLY)
+ return NF_ACCEPT;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* Not a full tcp header? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ /* No data? */
+ dataoff = protoff + th->doff*4;
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ spin_lock_bh(&irc_buffer_lock);
+ ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ irc_buffer);
+ BUG_ON(ib_ptr == NULL);
+
+ data = ib_ptr;
+ data_limit = ib_ptr + skb->len - dataoff;
+
+ /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
+ * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
+ while (data < data_limit - (19 + MINMATCHLEN)) {
+ if (memcmp(data, "\1DCC ", 5)) {
+ data++;
+ continue;
+ }
+ data += 5;
+ /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+
+ iph = ip_hdr(skb);
+ pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest));
+
+ for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
+ if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
+ /* no match */
+ continue;
+ }
+ data += strlen(dccprotos[i]);
+ pr_debug("DCC %s detected\n", dccprotos[i]);
+
+ /* we have at least
+ * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * data left (== 14/13 bytes) */
+ if (parse_dcc(data, data_limit, &dcc_ip,
+ &dcc_port, &addr_beg_p, &addr_end_p)) {
+ pr_debug("unable to parse dcc command\n");
+ continue;
+ }
+
+ pr_debug("DCC bound ip/port: %pI4:%u\n",
+ &dcc_ip, dcc_port);
+
+ /* dcc_ip can be the internal OR external (NAT'ed) IP */
+ tuple = &ct->tuplehash[dir].tuple;
+ if (tuple->src.u3.ip != dcc_ip &&
+ tuple->dst.u3.ip != dcc_ip) {
+ net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
+ &tuple->src.u3.ip,
+ &dcc_ip, dcc_port);
+ continue;
+ }
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct,
+ "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+ tuple = &ct->tuplehash[!dir].tuple;
+ port = htons(dcc_port);
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ tuple->src.l3num,
+ NULL, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &port);
+
+ nf_nat_irc = rcu_dereference(nf_nat_irc_hook);
+ if (nf_nat_irc && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_irc(skb, ctinfo, protoff,
+ addr_beg_p - ib_ptr,
+ addr_end_p - addr_beg_p,
+ exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation");
+ ret = NF_DROP;
+ }
+ nf_ct_expect_put(exp);
+ goto out;
+ }
+ }
+ out:
+ spin_unlock_bh(&irc_buffer_lock);
+ return ret;
+}
+
+static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
+static struct nf_conntrack_expect_policy irc_exp_policy;
+
+static void nf_conntrack_irc_fini(void);
+
+static int __init nf_conntrack_irc_init(void)
+{
+ int i, ret;
+
+ if (max_dcc_channels < 1) {
+ printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n");
+ return -EINVAL;
+ }
+
+ irc_exp_policy.max_expected = max_dcc_channels;
+ irc_exp_policy.timeout = dcc_timeout;
+
+ irc_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!irc_buffer)
+ return -ENOMEM;
+
+ /* If no port given, default to standard irc port */
+ if (ports_c == 0)
+ ports[ports_c++] = IRC_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ irc[i].tuple.src.l3num = AF_INET;
+ irc[i].tuple.src.u.tcp.port = htons(ports[i]);
+ irc[i].tuple.dst.protonum = IPPROTO_TCP;
+ irc[i].expect_policy = &irc_exp_policy;
+ irc[i].me = THIS_MODULE;
+ irc[i].help = help;
+
+ if (ports[i] == IRC_PORT)
+ sprintf(irc[i].name, "irc");
+ else
+ sprintf(irc[i].name, "irc-%u", i);
+
+ ret = nf_conntrack_helper_register(&irc[i]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_irc: failed to register helper "
+ "for pf: %u port: %u\n",
+ irc[i].tuple.src.l3num, ports[i]);
+ nf_conntrack_irc_fini();
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/* This function is intentionally _NOT_ defined as __exit, because
+ * it is needed by the init function */
+static void nf_conntrack_irc_fini(void)
+{
+ int i;
+
+ for (i = 0; i < ports_c; i++)
+ nf_conntrack_helper_unregister(&irc[i]);
+ kfree(irc_buffer);
+}
+
+module_init(nf_conntrack_irc_init);
+module_exit(nf_conntrack_irc_fini);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
new file mode 100644
index 000000000..cf9ace70b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -0,0 +1,73 @@
+/*
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * Based largely upon the original ip_conntrack code which
+ * had the following copyright information:
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+
+static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+ memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+
+ return true;
+}
+
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+ memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+
+ return true;
+}
+
+static void generic_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+}
+
+static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ unsigned int *dataoff, u_int8_t *protonum)
+{
+ /* Never track !!! */
+ return -NF_ACCEPT;
+}
+
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
+ .l3proto = PF_UNSPEC,
+ .name = "unknown",
+ .pkt_to_tuple = generic_pkt_to_tuple,
+ .invert_tuple = generic_invert_tuple,
+ .print_tuple = generic_print_tuple,
+ .get_l4proto = generic_get_l4proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
new file mode 100644
index 000000000..bb53f120e
--- /dev/null
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -0,0 +1,108 @@
+/*
+ * test/set flag bits stored in conntrack extension area.
+ *
+ * (C) 2013 Astaro GmbH & Co KG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+static unsigned int label_bits(const struct nf_conn_labels *l)
+{
+ unsigned int longs = l->words;
+ return longs * BITS_PER_LONG;
+}
+
+bool nf_connlabel_match(const struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return false;
+
+ return bit < label_bits(labels) && test_bit(bit, labels->bits);
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_match);
+
+int nf_connlabel_set(struct nf_conn *ct, u16 bit)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels || bit >= label_bits(labels))
+ return -ENOSPC;
+
+ if (test_bit(bit, labels->bits))
+ return 0;
+
+ if (!test_and_set_bit(bit, labels->bits))
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabel_set);
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static void replace_u32(u32 *address, u32 mask, u32 new)
+{
+ u32 old, tmp;
+
+ do {
+ old = *address;
+ tmp = (old & mask) ^ new;
+ } while (cmpxchg(address, old, tmp) != old);
+}
+
+int nf_connlabels_replace(struct nf_conn *ct,
+ const u32 *data,
+ const u32 *mask, unsigned int words32)
+{
+ struct nf_conn_labels *labels;
+ unsigned int size, i;
+ u32 *dst;
+
+ labels = nf_ct_labels_find(ct);
+ if (!labels)
+ return -ENOSPC;
+
+ size = labels->words * sizeof(long);
+ if (size < (words32 * sizeof(u32)))
+ words32 = size / sizeof(u32);
+
+ dst = (u32 *) labels->bits;
+ if (words32) {
+ for (i = 0; i < words32; i++)
+ replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]);
+ }
+
+ size /= sizeof(u32);
+ for (i = words32; i < size; i++) /* pad */
+ replace_u32(&dst[i], 0, 0);
+
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_replace);
+#endif
+
+static struct nf_ct_ext_type labels_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_labels),
+ .align = __alignof__(struct nf_conn_labels),
+ .id = NF_CT_EXT_LABELS,
+};
+
+int nf_conntrack_labels_init(void)
+{
+ return nf_ct_extend_register(&labels_extend);
+}
+
+void nf_conntrack_labels_fini(void)
+{
+ nf_ct_extend_unregister(&labels_extend);
+}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
new file mode 100644
index 000000000..4c8f30a3d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -0,0 +1,71 @@
+/*
+ * NetBIOS name service broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/*
+ * This helper tracks locally originating NetBIOS name service
+ * requests by issuing permanent expectations (valid until
+ * timing out) matching all reply connections from the
+ * destination network. The only NetBIOS specific thing is
+ * actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define NMBD_PORT 137
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_netbios_ns");
+MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+
+static unsigned int timeout __read_mostly = 3;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "netbios-ns",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = netbios_ns_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_netbios_ns_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_netbios_ns_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_netbios_ns_init);
+module_exit(nf_conntrack_netbios_ns_fini);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
new file mode 100644
index 000000000..d1c23940a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -0,0 +1,3288 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_helper.h>
+#endif
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.93";
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ int ret = 0;
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum))
+ goto nla_put_failure;
+
+ if (likely(l4proto->tuple_to_nlattr))
+ ret = l4proto->tuple_to_nlattr(skb, tuple);
+
+ nla_nest_end(skb, nest_parms);
+
+ return ret;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples_ip(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto)
+{
+ int ret = 0;
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (likely(l3proto->tuple_to_nlattr))
+ ret = l3proto->tuple_to_nlattr(skb, tuple);
+
+ nla_nest_end(skb, nest_parms);
+
+ return ret;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ int ret;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
+
+ if (timeout < 0)
+ timeout = 0;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+{
+ struct nf_conntrack_l4proto *l4proto;
+ struct nlattr *nest_proto;
+ int ret;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (!l4proto->to_nlattr)
+ return 0;
+
+ nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED);
+ if (!nest_proto)
+ goto nla_put_failure;
+
+ ret = l4proto->to_nlattr(skb, nest_proto, ct);
+
+ nla_nest_end(skb, nest_proto);
+
+ return ret;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_helper;
+ const struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_helper *helper;
+
+ if (!help)
+ return 0;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ goto out;
+
+ nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
+ if (!nest_helper)
+ goto nla_put_failure;
+ if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
+ goto nla_put_failure;
+
+ if (helper->to_nlattr)
+ helper->to_nlattr(skb, ct);
+
+ nla_nest_end(skb, nest_helper);
+out:
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
+ enum ip_conntrack_dir dir, int type)
+{
+ enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nf_conn_counter *counter = acct->counter;
+ struct nlattr *nest_count;
+ u64 pkts, bytes;
+
+ if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&counter[dir].packets, 0);
+ bytes = atomic64_xchg(&counter[dir].bytes, 0);
+ } else {
+ pkts = atomic64_read(&counter[dir].packets);
+ bytes = atomic64_read(&counter[dir].bytes);
+ }
+
+ nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
+{
+ struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+
+ if (!acct)
+ return 0;
+
+ if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0)
+ return -1;
+ if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_count;
+ const struct nf_conn_tstamp *tstamp;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (!tstamp)
+ return 0;
+
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+ if (!nest_count)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) ||
+ (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP,
+ cpu_to_be64(tstamp->stop))))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_count);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static inline int
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_secctx;
+ int len, ret;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return 0;
+
+ ret = -1;
+ nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+ if (!nest_secctx)
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_secctx);
+
+ ret = 0;
+nla_put_failure:
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+#define ctnetlink_dump_secctx(a, b) (0)
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+static int ctnetlink_label_size(const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+
+ if (!labels)
+ return 0;
+ return nla_total_size(labels->words * sizeof(long));
+}
+
+static int
+ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+ unsigned int len, i;
+
+ if (!labels)
+ return 0;
+
+ len = labels->words * sizeof(long);
+ i = 0;
+ do {
+ if (labels->bits[i] != 0)
+ return nla_put(skb, CTA_LABELS, len, labels->bits);
+ i++;
+ } while (i < labels->words);
+
+ return 0;
+}
+#else
+#define ctnetlink_dump_labels(a, b) (0)
+#define ctnetlink_label_size(a) (0)
+#endif
+
+#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+
+static inline int
+ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ if (!(ct->status & IPS_EXPECTED))
+ return 0;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
+ htonl(seq->correction_pos)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
+ htonl(seq->offset_before)) ||
+ nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
+ htonl(seq->offset_after)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *seq;
+
+ if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
+ return 0;
+
+ seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
+ return -1;
+
+ seq = &seqadj->seq[IP_CT_DIR_REPLY];
+ if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
+ return -1;
+
+ return 0;
+}
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct nf_conn *ct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nlattr *nest_parms;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = nf_ct_l3num(ct);
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_timeout(skb, ct) < 0 ||
+ ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
+ ctnetlink_dump_labels(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0 ||
+ ctnetlink_dump_master(skb, ct) < 0 ||
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static inline size_t
+ctnetlink_proto_size(const struct nf_conn *ct)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ size_t len = 0;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+ len += l3proto->nla_size;
+
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ len += l4proto->nla_size;
+ rcu_read_unlock();
+
+ return len;
+}
+
+static inline size_t
+ctnetlink_acct_size(const struct nf_conn *ct)
+{
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
+ return 0;
+ return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
+ + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
+ + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */
+ ;
+}
+
+static inline int
+ctnetlink_secctx_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ int len, ret;
+
+ ret = security_secid_to_secctx(ct->secmark, NULL, &len);
+ if (ret)
+ return 0;
+
+ return nla_total_size(0) /* CTA_SECCTX */
+ + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ return 0;
+ return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#else
+ return 0;
+#endif
+}
+
+static inline size_t
+ctnetlink_nlmsg_size(const struct nf_conn *ct)
+{
+ return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+ + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ + nla_total_size(0) /* CTA_PROTOINFO */
+ + nla_total_size(0) /* CTA_HELP */
+ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+ + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ + ctnetlink_proto_size(ct)
+ + ctnetlink_label_size(ct)
+ ;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+{
+ struct net *net;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nlattr *nest_parms;
+ struct nf_conn *ct = item->ct;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned int flags = 0, group;
+ int err;
+
+ /* ignore our fake conntrack entry */
+ if (nf_ct_is_untracked(ct))
+ return 0;
+
+ if (events & (1 << IPCT_DESTROY)) {
+ type = IPCTNL_MSG_CT_DELETE;
+ group = NFNLGRP_CONNTRACK_DESTROY;
+ } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
+ type = IPCTNL_MSG_CT_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ group = NFNLGRP_CONNTRACK_NEW;
+ } else if (events) {
+ type = IPCTNL_MSG_CT_NEW;
+ group = NFNLGRP_CONNTRACK_UPDATE;
+ } else
+ return 0;
+
+ net = nf_ct_net(ct);
+ if (!item->report && !nfnetlink_has_listeners(net, group))
+ return 0;
+
+ skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = nf_ct_l3num(ct);
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ rcu_read_lock();
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct) &&
+ nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_id(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
+ goto nla_put_failure;
+ } else {
+ if (ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_PROTOINFO)
+ && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
+ && ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ if ((events & (1 << IPCT_SECMARK) || ct->secmark)
+ && ctnetlink_dump_secctx(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (events & (1 << IPCT_LABEL) &&
+ ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_RELATED) &&
+ ctnetlink_dump_master(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (events & (1 << IPCT_SEQADJ) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+ }
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if ((events & (1 << IPCT_MARK) || ct->mark)
+ && ctnetlink_dump_mark(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ rcu_read_unlock();
+
+ nlmsg_end(skb, nlh);
+ err = nfnetlink_send(skb, net, item->portid, group, item->report,
+ GFP_ATOMIC);
+ if (err == -ENOBUFS || err == -EAGAIN)
+ return -ENOBUFS;
+
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+ kfree_skb(skb);
+errout:
+ if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
+ return -ENOBUFS;
+
+ return 0;
+}
+#endif /* CONFIG_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_put((struct nf_conn *)cb->args[1]);
+ kfree(cb->data);
+ return 0;
+}
+
+struct ctnetlink_filter {
+ struct {
+ u_int32_t val;
+ u_int32_t mask;
+ } mark;
+};
+
+static struct ctnetlink_filter *
+ctnetlink_alloc_filter(const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ struct ctnetlink_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ if (filter == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
+ filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+
+ return filter;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
+{
+ struct ctnetlink_filter *filter = data;
+
+ if (filter == NULL)
+ return 1;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if ((ct->mark & filter->mark.mask) == filter->mark.val)
+ return 1;
+#endif
+
+ return 0;
+}
+
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nf_conn *ct, *last;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ spinlock_t *lockp;
+
+ last = (struct nf_conn *)cb->args[1];
+
+ local_bh_disable();
+ for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+restart:
+ lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+ spin_lock(lockp);
+ if (cb->args[0] >= net->ct.htable_size) {
+ spin_unlock(lockp);
+ goto out;
+ }
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
+ hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Dump entries of a given L3 protocol number.
+ * If it is not specified, ie. l3proto == 0,
+ * then dump everything. */
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (ct != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (!ctnetlink_filter_match(ct, cb->data))
+ continue;
+
+ rcu_read_lock();
+ res =
+ ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ nf_conntrack_get(&ct->ct_general);
+ cb->args[1] = (unsigned long)ct;
+ spin_unlock(lockp);
+ goto out;
+ }
+ }
+ spin_unlock(lockp);
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ }
+out:
+ local_bh_enable();
+ if (last)
+ nf_ct_put(last);
+
+ return skb->len;
+}
+
+static inline int
+ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
+{
+ struct nlattr *tb[CTA_IP_MAX+1];
+ struct nf_conntrack_l3proto *l3proto;
+ int ret = 0;
+
+ ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
+ if (ret < 0)
+ return ret;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+
+ if (likely(l3proto->nlattr_to_tuple)) {
+ ret = nla_validate_nested(attr, CTA_IP_MAX,
+ l3proto->nla_policy);
+ if (ret == 0)
+ ret = l3proto->nlattr_to_tuple(tb, tuple);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_NUM] = { .type = NLA_U8 },
+};
+
+static inline int
+ctnetlink_parse_tuple_proto(struct nlattr *attr,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct nlattr *tb[CTA_PROTO_MAX+1];
+ struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+
+ ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[CTA_PROTO_NUM])
+ return -EINVAL;
+ tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
+
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+
+ if (likely(l4proto->nlattr_to_tuple)) {
+ ret = nla_validate_nested(attr, CTA_PROTO_MAX,
+ l4proto->nla_policy);
+ if (ret == 0)
+ ret = l4proto->nlattr_to_tuple(tb, tuple);
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
+ [CTA_TUPLE_IP] = { .type = NLA_NESTED },
+ [CTA_TUPLE_PROTO] = { .type = NLA_NESTED },
+};
+
+static int
+ctnetlink_parse_tuple(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple,
+ enum ctattr_type type, u_int8_t l3num)
+{
+ struct nlattr *tb[CTA_TUPLE_MAX+1];
+ int err;
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_TUPLE_IP])
+ return -EINVAL;
+
+ tuple->src.l3num = l3num;
+
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_TUPLE_PROTO])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
+ if (err < 0)
+ return err;
+
+ /* orig and expect tuples get DIR_ORIGINAL */
+ if (type == CTA_TUPLE_REPLY)
+ tuple->dst.dir = IP_CT_DIR_REPLY;
+ else
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ return 0;
+}
+
+static int
+ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
+{
+ if (attr)
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ *zone = ntohs(nla_get_be16(attr));
+#else
+ return -EOPNOTSUPP;
+#endif
+ else
+ *zone = 0;
+
+ return 0;
+}
+
+static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
+ [CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
+};
+
+static inline int
+ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
+ struct nlattr **helpinfo)
+{
+ int err;
+ struct nlattr *tb[CTA_HELP_MAX+1];
+
+ err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_HELP_NAME])
+ return -EINVAL;
+
+ *helper_name = nla_data(tb[CTA_HELP_NAME]);
+
+ if (tb[CTA_HELP_INFO])
+ *helpinfo = tb[CTA_HELP_INFO];
+
+ return 0;
+}
+
+static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
+ [CTA_TUPLE_ORIG] = { .type = NLA_NESTED },
+ [CTA_TUPLE_REPLY] = { .type = NLA_NESTED },
+ [CTA_STATUS] = { .type = NLA_U32 },
+ [CTA_PROTOINFO] = { .type = NLA_NESTED },
+ [CTA_HELP] = { .type = NLA_NESTED },
+ [CTA_NAT_SRC] = { .type = NLA_NESTED },
+ [CTA_TIMEOUT] = { .type = NLA_U32 },
+ [CTA_MARK] = { .type = NLA_U32 },
+ [CTA_ID] = { .type = NLA_U32 },
+ [CTA_NAT_DST] = { .type = NLA_NESTED },
+ [CTA_TUPLE_MASTER] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED },
+ [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
+ [CTA_ZONE] = { .type = NLA_U16 },
+ [CTA_MARK_MASK] = { .type = NLA_U32 },
+ [CTA_LABELS] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
+ [CTA_LABELS_MASK] = { .type = NLA_BINARY,
+ .len = NF_CT_LABELS_MAX_SIZE },
+};
+
+static int ctnetlink_flush_conntrack(struct net *net,
+ const struct nlattr * const cda[],
+ u32 portid, int report)
+{
+ struct ctnetlink_filter *filter = NULL;
+
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
+ filter = ctnetlink_alloc_filter(cda);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+ }
+
+ nf_ct_iterate_cleanup(net, ctnetlink_filter_match, filter,
+ portid, report);
+ kfree(filter);
+
+ return 0;
+}
+
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ u16 zone;
+ int err;
+
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_TUPLE_ORIG])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ else if (cda[CTA_TUPLE_REPLY])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ else {
+ return ctnetlink_flush_conntrack(net, cda,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+
+ if (err < 0)
+ return err;
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (cda[CTA_ID]) {
+ u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
+ if (id != (u32)(unsigned long)ct) {
+ nf_ct_put(ct);
+ return -ENOENT;
+ }
+ }
+
+ if (del_timer(&ct->timeout))
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+
+ nf_ct_put(ct);
+
+ return 0;
+}
+
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct;
+ struct sk_buff *skb2 = NULL;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ u16 zone;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_table,
+ .done = ctnetlink_done,
+ };
+
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
+ struct ctnetlink_filter *filter;
+
+ filter = ctnetlink_alloc_filter(cda);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+
+ c.data = filter;
+ }
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_TUPLE_ORIG])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+ else if (cda[CTA_TUPLE_REPLY])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ err = -ENOMEM;
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ nf_ct_put(ct);
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
+ rcu_read_unlock();
+ nf_ct_put(ct);
+ if (err <= 0)
+ goto free;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ return 0;
+
+free:
+ kfree_skb(skb2);
+out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static int ctnetlink_done_list(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_put((struct nf_conn *)cb->args[1]);
+ return 0;
+}
+
+static int
+ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+{
+ struct nf_conn *ct, *last;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+ int res;
+ int cpu;
+ struct hlist_nulls_head *list;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_conn *)cb->args[1];
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ struct ct_pcpu *pcpu;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+ spin_lock_bh(&pcpu->lock);
+ list = dying ? &pcpu->dying : &pcpu->unconfirmed;
+restart:
+ hlist_nulls_for_each_entry(h, n, list, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (ct != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ rcu_read_lock();
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct);
+ rcu_read_unlock();
+ if (res < 0) {
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ continue;
+ cb->args[0] = cpu;
+ cb->args[1] = (unsigned long)ct;
+ spin_unlock_bh(&pcpu->lock);
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ spin_unlock_bh(&pcpu->lock);
+ }
+ cb->args[2] = 1;
+out:
+ if (last)
+ nf_ct_put(last);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, true);
+}
+
+static int
+ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_dying,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return ctnetlink_dump_list(skb, cb, false);
+}
+
+static int
+ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_dump_unconfirmed,
+ .done = ctnetlink_done_list,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+static int
+ctnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+ int err;
+
+ parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
+ if (!parse_nat_setup) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat") < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ return -EOPNOTSUPP;
+ }
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ if (nfnetlink_parse_nat_setup_hook)
+ return -EAGAIN;
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ err = parse_nat_setup(ct, manip, attr);
+ if (err == -EAGAIN) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+ return -EOPNOTSUPP;
+ }
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ rcu_read_lock();
+#else
+ err = -EOPNOTSUPP;
+#endif
+ }
+ return err;
+}
+#endif
+
+static int
+ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+ unsigned long d;
+ unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ /* Be careful here, modifying NAT bits can screw up things,
+ * so don't let users modify them directly if they don't pass
+ * nf_nat_range. */
+ ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+ return 0;
+}
+
+static int
+ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+ int ret;
+
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST,
+ cda[CTA_NAT_DST]);
+ if (ret < 0)
+ return ret;
+
+ ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
+ cda[CTA_NAT_SRC]);
+ return ret;
+#else
+ if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+ return 0;
+ return -EOPNOTSUPP;
+#endif
+}
+
+static inline int
+ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help = nfct_help(ct);
+ char *helpname = NULL;
+ struct nlattr *helpinfo = NULL;
+ int err;
+
+ /* don't change helper of sibling connections */
+ if (ct->master)
+ return -EBUSY;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
+ if (err < 0)
+ return err;
+
+ if (!strcmp(helpname, "")) {
+ if (help && help->helper) {
+ /* we had a helper before ... */
+ nf_ct_remove_expectations(ct);
+ RCU_INIT_POINTER(help->helper, NULL);
+ }
+
+ return 0;
+ }
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+#ifdef CONFIG_MODULES
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper)
+ return -EAGAIN;
+#endif
+ return -EOPNOTSUPP;
+ }
+
+ if (help) {
+ if (help->helper == helper) {
+ /* update private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
+ return 0;
+ } else
+ return -EBUSY;
+ }
+
+ /* we cannot set a helper for an existing conntrack */
+ return -EOPNOTSUPP;
+}
+
+static inline int
+ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+ u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+
+ if (!del_timer(&ct->timeout))
+ return -ETIME;
+
+ ct->timeout.expires = jiffies + timeout * HZ;
+ add_timer(&ct->timeout);
+
+ return 0;
+}
+
+static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
+ [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
+ [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
+ [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
+};
+
+static inline int
+ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+ const struct nlattr *attr = cda[CTA_PROTOINFO];
+ struct nlattr *tb[CTA_PROTOINFO_MAX+1];
+ struct nf_conntrack_l4proto *l4proto;
+ int err = 0;
+
+ err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
+ if (err < 0)
+ return err;
+
+ rcu_read_lock();
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->from_nlattr)
+ err = l4proto->from_nlattr(tb, ct);
+ rcu_read_unlock();
+
+ return err;
+}
+
+static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
+ [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 },
+ [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 },
+};
+
+static inline int
+change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
+{
+ int err;
+ struct nlattr *cda[CTA_SEQADJ_MAX+1];
+
+ err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy);
+ if (err < 0)
+ return err;
+
+ if (!cda[CTA_SEQADJ_CORRECTION_POS])
+ return -EINVAL;
+
+ seq->correction_pos =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
+
+ if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
+ return -EINVAL;
+
+ seq->offset_before =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
+
+ if (!cda[CTA_SEQADJ_OFFSET_AFTER])
+ return -EINVAL;
+
+ seq->offset_after =
+ ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));
+
+ return 0;
+}
+
+static int
+ctnetlink_change_seq_adj(struct nf_conn *ct,
+ const struct nlattr * const cda[])
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ int ret = 0;
+
+ if (!seqadj)
+ return 0;
+
+ if (cda[CTA_SEQ_ADJ_ORIG]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
+ cda[CTA_SEQ_ADJ_ORIG]);
+ if (ret < 0)
+ return ret;
+
+ ct->status |= IPS_SEQ_ADJUST;
+ }
+
+ if (cda[CTA_SEQ_ADJ_REPLY]) {
+ ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
+ cda[CTA_SEQ_ADJ_REPLY]);
+ if (ret < 0)
+ return ret;
+
+ ct->status |= IPS_SEQ_ADJUST;
+ }
+
+ return 0;
+}
+
+static int
+ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ size_t len = nla_len(cda[CTA_LABELS]);
+ const void *mask = cda[CTA_LABELS_MASK];
+
+ if (len & (sizeof(u32)-1)) /* must be multiple of u32 */
+ return -EINVAL;
+
+ if (mask) {
+ if (nla_len(cda[CTA_LABELS_MASK]) == 0 ||
+ nla_len(cda[CTA_LABELS_MASK]) != len)
+ return -EINVAL;
+ mask = nla_data(cda[CTA_LABELS_MASK]);
+ }
+
+ len /= sizeof(u32);
+
+ return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int
+ctnetlink_change_conntrack(struct nf_conn *ct,
+ const struct nlattr * const cda[])
+{
+ int err;
+
+ /* only allow NAT changes and master assignation for new conntracks */
+ if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
+ return -EOPNOTSUPP;
+
+ if (cda[CTA_HELP]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TIMEOUT]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_PROTOINFO]) {
+ err = ctnetlink_change_protoinfo(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK])
+ ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+#endif
+
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static struct nf_conn *
+ctnetlink_create_conntrack(struct net *net, u16 zone,
+ const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *otuple,
+ struct nf_conntrack_tuple *rtuple,
+ u8 u3)
+{
+ struct nf_conn *ct;
+ int err = -EINVAL;
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_tstamp *tstamp;
+
+ ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
+ if (IS_ERR(ct))
+ return ERR_PTR(-ENOMEM);
+
+ if (!cda[CTA_TIMEOUT])
+ goto err1;
+ ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+
+ ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+
+ rcu_read_lock();
+ if (cda[CTA_HELP]) {
+ char *helpname = NULL;
+ struct nlattr *helpinfo = NULL;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
+ if (err < 0)
+ goto err2;
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+ rcu_read_unlock();
+#ifdef CONFIG_MODULES
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ err = -EOPNOTSUPP;
+ goto err1;
+ }
+
+ rcu_read_lock();
+ helper = __nf_conntrack_helper_find(helpname,
+ nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper) {
+ err = -EAGAIN;
+ goto err2;
+ }
+ rcu_read_unlock();
+#endif
+ err = -EOPNOTSUPP;
+ goto err1;
+ } else {
+ struct nf_conn_help *help;
+
+ help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
+ if (help == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ /* set private helper data if allowed. */
+ if (helper->from_nlattr)
+ helper->from_nlattr(helpinfo, ct);
+
+ /* not in hash table yet so not strictly necessary */
+ RCU_INIT_POINTER(help->helper, helper);
+ }
+ } else {
+ /* try an implicit helper assignation */
+ err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+ if (err < 0)
+ goto err2;
+ }
+
+ err = ctnetlink_setup_nat(ct, cda);
+ if (err < 0)
+ goto err2;
+
+ nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+ nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
+ nf_ct_labels_ext_add(ct);
+
+ /* we must add conntrack extensions before confirmation. */
+ ct->status |= IPS_CONFIRMED;
+
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ goto err2;
+ }
+
+ if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+ err = ctnetlink_change_seq_adj(ct, cda);
+ if (err < 0)
+ goto err2;
+ }
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ if (cda[CTA_PROTOINFO]) {
+ err = ctnetlink_change_protoinfo(ct, cda);
+ if (err < 0)
+ goto err2;
+ }
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK])
+ ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+#endif
+
+ /* setup master conntrack: this is a confirmed expectation */
+ if (cda[CTA_TUPLE_MASTER]) {
+ struct nf_conntrack_tuple master;
+ struct nf_conntrack_tuple_hash *master_h;
+ struct nf_conn *master_ct;
+
+ err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+ if (err < 0)
+ goto err2;
+
+ master_h = nf_conntrack_find_get(net, zone, &master);
+ if (master_h == NULL) {
+ err = -ENOENT;
+ goto err2;
+ }
+ master_ct = nf_ct_tuplehash_to_ctrack(master_h);
+ __set_bit(IPS_EXPECTED_BIT, &ct->status);
+ ct->master = master_ct;
+ }
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_get_real_ns();
+
+ err = nf_conntrack_hash_check_insert(ct);
+ if (err < 0)
+ goto err2;
+
+ rcu_read_unlock();
+
+ return ct;
+
+err2:
+ rcu_read_unlock();
+err1:
+ nf_conntrack_free(ct);
+ return ERR_PTR(err);
+}
+
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conntrack_tuple_hash *h = NULL;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nf_conn *ct;
+ u_int8_t u3 = nfmsg->nfgen_family;
+ u16 zone;
+ int err;
+
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_TUPLE_ORIG]) {
+ err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TUPLE_REPLY]) {
+ err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TUPLE_ORIG])
+ h = nf_conntrack_find_get(net, zone, &otuple);
+ else if (cda[CTA_TUPLE_REPLY])
+ h = nf_conntrack_find_get(net, zone, &rtuple);
+
+ if (h == NULL) {
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ enum ip_conntrack_events events;
+
+ if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
+ return -EINVAL;
+
+ ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+ &rtuple, u3);
+ if (IS_ERR(ct))
+ return PTR_ERR(ct);
+
+ err = 0;
+ if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+ events = IPCT_RELATED;
+ else
+ events = IPCT_NEW;
+
+ if (cda[CTA_LABELS] &&
+ ctnetlink_attach_labels(ct, cda) == 0)
+ events |= (1 << IPCT_LABEL);
+
+ nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+ (1 << IPCT_ASSURED) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_SEQADJ) |
+ (1 << IPCT_MARK) | events,
+ ct, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_put(ct);
+ }
+
+ return err;
+ }
+ /* implicit 'else' */
+
+ err = -EEXIST;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ err = ctnetlink_change_conntrack(ct, cda);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ if (err == 0) {
+ nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+ (1 << IPCT_ASSURED) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_LABEL) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_SEQADJ) |
+ (1 << IPCT_MARK),
+ ct, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+ }
+
+ nf_ct_put(ct);
+ return err;
+}
+
+static int
+ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ __u16 cpu, const struct ip_conntrack_stat *st)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(cpu);
+
+ if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) ||
+ nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
+ nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) ||
+ nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
+ nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) ||
+ nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
+ nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
+ htonl(st->insert_failed)) ||
+ nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) ||
+ nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
+ nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
+ nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
+ htonl(st->search_restart)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int cpu;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[0] == nr_cpu_ids)
+ return 0;
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ const struct ip_conntrack_stat *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(net->ct.stat, cpu);
+ if (ctnetlink_ct_stat_cpu_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ cpu, st) < 0)
+ break;
+ }
+ cb->args[0] = cpu;
+
+ return skb->len;
+}
+
+static int
+ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_ct_stat_cpu_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return 0;
+}
+
+static int
+ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ struct net *net)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct sk_buff *skb2;
+ int err;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ sock_net(skb->sk));
+ if (err <= 0)
+ goto free;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ return 0;
+
+free:
+ kfree_skb(skb2);
+out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+ [CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
+ [CTA_EXPECT_MASK] = { .type = NLA_NESTED },
+ [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [CTA_EXPECT_ID] = { .type = NLA_U32 },
+ [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN - 1 },
+ [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
+ [CTA_EXPECT_CLASS] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT] = { .type = NLA_NESTED },
+ [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING },
+};
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask);
+
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+static size_t
+ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
+{
+ return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+ + nla_total_size(0) /* CTA_PROTOINFO */
+ + nla_total_size(0) /* CTA_HELP */
+ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+ + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+ + ctnetlink_proto_size(ct)
+ ;
+}
+
+static int
+ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ rcu_read_lock();
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ if (nf_ct_zone(ct)) {
+ if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct))))
+ goto nla_put_failure;
+ }
+
+ if (ctnetlink_dump_id(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_status(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
+ goto nla_put_failure;
+
+ if ((ct->status & IPS_SEQ_ADJUST) &&
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ goto nla_put_failure;
+#endif
+ if (ctnetlink_dump_labels(skb, ct) < 0)
+ goto nla_put_failure;
+ rcu_read_unlock();
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ return -ENOSPC;
+}
+
+static int
+ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
+{
+ int err;
+
+ if (cda[CTA_TIMEOUT]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_STATUS]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_HELP]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+ if (cda[CTA_LABELS]) {
+ err = ctnetlink_attach_labels(ct, cda);
+ if (err < 0)
+ return err;
+ }
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK]) {
+ u32 mask = 0, mark, newmark;
+ if (cda[CTA_MARK_MASK])
+ mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+
+ mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+ newmark = (ct->mark & mask) ^ mark;
+ if (newmark != ct->mark)
+ ct->mark = newmark;
+ }
+#endif
+ return 0;
+}
+
+static int
+ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nlattr *cda[CTA_MAX+1];
+ int ret;
+
+ ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ return ret;
+}
+
+static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
+ const struct nf_conn *ct,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ int err;
+
+ err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
+ nf_ct_l3num(ct));
+ if (err < 0)
+ return err;
+
+ return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
+ nf_ct_l3num(ct));
+}
+
+static int
+ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+ u32 portid, u32 report)
+{
+ struct nlattr *cda[CTA_EXPECT_MAX+1];
+ struct nf_conntrack_tuple tuple, mask;
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conntrack_expect *exp;
+ int err;
+
+ err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy);
+ if (err < 0)
+ return err;
+
+ err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
+ ct, &tuple, &mask);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+ nf_ct_protonum(ct));
+ if (helper == NULL)
+ return -EOPNOTSUPP;
+ }
+
+ exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
+ helper, &tuple, &mask);
+ if (IS_ERR(exp))
+ return PTR_ERR(exp);
+
+ err = nf_ct_expect_related_report(exp, portid, report);
+ if (err < 0) {
+ nf_ct_expect_put(exp);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
+ .build_size = ctnetlink_nfqueue_build_size,
+ .build = ctnetlink_nfqueue_build,
+ .parse = ctnetlink_nfqueue_parse,
+ .attach_expect = ctnetlink_nfqueue_attach_expect,
+ .seq_adjust = nf_ct_tcp_seqadj_set,
+};
+#endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
+
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ enum ctattr_expect type)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (ctnetlink_dump_tuples(skb, tuple) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_exp_dump_mask(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple_mask *mask)
+{
+ int ret;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple m;
+ struct nlattr *nest_parms;
+
+ memset(&m, 0xFF, sizeof(m));
+ memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
+ m.src.u.all = mask->src.u.all;
+ m.dst.protonum = tuple->dst.protonum;
+
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
+ if (ret >= 0) {
+ l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
+ tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
+ }
+ rcu_read_unlock();
+
+ if (unlikely(ret < 0))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const union nf_inet_addr any_addr;
+
+static int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+ const struct nf_conntrack_expect *exp)
+{
+ struct nf_conn *master = exp->master;
+ long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
+ struct nf_conn_help *help;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *nest_parms;
+ struct nf_conntrack_tuple nat_tuple = {};
+#endif
+ struct nf_ct_helper_expectfn *expfn;
+
+ if (timeout < 0)
+ timeout = 0;
+
+ if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+ goto nla_put_failure;
+ if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
+ goto nla_put_failure;
+ if (ctnetlink_exp_dump_tuple(skb,
+ &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ CTA_EXPECT_MASTER) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
+ exp->saved_proto.all) {
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
+ goto nla_put_failure;
+
+ nat_tuple.src.l3num = nf_ct_l3num(master);
+ nat_tuple.src.u3 = exp->saved_addr;
+ nat_tuple.dst.protonum = nf_ct_protonum(master);
+ nat_tuple.src.u = exp->saved_proto;
+
+ if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
+ CTA_EXPECT_NAT_TUPLE) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+ }
+#endif
+ if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
+ nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
+ nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
+ goto nla_put_failure;
+ help = nfct_help(master);
+ if (help) {
+ struct nf_conntrack_helper *helper;
+
+ helper = rcu_dereference(help->helper);
+ if (helper &&
+ nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
+ goto nla_put_failure;
+ }
+ expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
+ if (expfn != NULL &&
+ nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
+ int event, const struct nf_conntrack_expect *exp)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+ event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = exp->tuple.src.l3num;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+{
+ struct nf_conntrack_expect *exp = item->exp;
+ struct net *net = nf_ct_exp_net(exp);
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct sk_buff *skb;
+ unsigned int type, group;
+ int flags = 0;
+
+ if (events & (1 << IPEXP_DESTROY)) {
+ type = IPCTNL_MSG_EXP_DELETE;
+ group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+ } else if (events & (1 << IPEXP_NEW)) {
+ type = IPCTNL_MSG_EXP_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ group = NFNLGRP_CONNTRACK_EXP_NEW;
+ } else
+ return 0;
+
+ if (!item->report && !nfnetlink_has_listeners(net, group))
+ return 0;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+ nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = exp->tuple.src.l3num;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ rcu_read_lock();
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nla_put_failure;
+ rcu_read_unlock();
+
+ nlmsg_end(skb, nlh);
+ nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC);
+ return 0;
+
+nla_put_failure:
+ rcu_read_unlock();
+ nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+ kfree_skb(skb);
+errout:
+ nfnetlink_set_err(net, 0, 0, -ENOBUFS);
+ return 0;
+}
+#endif
+static int ctnetlink_exp_done(struct netlink_callback *cb)
+{
+ if (cb->args[1])
+ nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
+ return 0;
+}
+
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+ for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+restart:
+ hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
+ hnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ }
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ if (cb->args[0])
+ return 0;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_expect *)cb->args[1];
+restart:
+ hlist_for_each_entry(exp, &help->expectations, lnode) {
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ cb->args[0] = 1;
+out:
+ rcu_read_unlock();
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int err;
+ struct net *net = sock_net(ctnl);
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ u16 zone = 0;
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_ct_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+ }
+
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return -ENOENT;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ c.data = ct;
+
+ err = netlink_dump_start(ctnl, skb, nlh, &c);
+ nf_ct_put(ct);
+
+ return err;
+}
+
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_expect *exp;
+ struct sk_buff *skb2;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ u16 zone;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (cda[CTA_EXPECT_MASTER])
+ return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+ }
+
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ if (cda[CTA_EXPECT_TUPLE])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ else if (cda[CTA_EXPECT_MASTER])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ exp = nf_ct_expect_find_get(net, zone, &tuple);
+ if (!exp)
+ return -ENOENT;
+
+ if (cda[CTA_EXPECT_ID]) {
+ __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+ if (ntohl(id) != (u32)(unsigned long)exp) {
+ nf_ct_expect_put(exp);
+ return -ENOENT;
+ }
+ }
+
+ err = -ENOMEM;
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ nf_ct_expect_put(exp);
+ goto out;
+ }
+
+ rcu_read_lock();
+ err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+ rcu_read_unlock();
+ nf_ct_expect_put(exp);
+ if (err <= 0)
+ goto free;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ return 0;
+
+free:
+ kfree_skb(skb2);
+out:
+ /* this avoids a loop in nfnetlink. */
+ return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple tuple;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct hlist_node *next;
+ u_int8_t u3 = nfmsg->nfgen_family;
+ unsigned int i;
+ u16 zone;
+ int err;
+
+ if (cda[CTA_EXPECT_TUPLE]) {
+ /* delete a single expect by tuple */
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ if (err < 0)
+ return err;
+
+ /* bump usage count to 2 */
+ exp = nf_ct_expect_find_get(net, zone, &tuple);
+ if (!exp)
+ return -ENOENT;
+
+ if (cda[CTA_EXPECT_ID]) {
+ __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+ if (ntohl(id) != (u32)(unsigned long)exp) {
+ nf_ct_expect_put(exp);
+ return -ENOENT;
+ }
+ }
+
+ /* after list removal, usage count == 1 */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ /* have to put what we 'get' above.
+ * after this line usage count == 0 */
+ nf_ct_expect_put(exp);
+ } else if (cda[CTA_EXPECT_HELP_NAME]) {
+ char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+ struct nf_conn_help *m_help;
+
+ /* delete all expectations for this helper */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i],
+ hnode) {
+ m_help = nfct_help(exp->master);
+ if (!strcmp(m_help->helper->name, name) &&
+ del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ } else {
+ /* This basically means we have to flush everything*/
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ for (i = 0; i < nf_ct_expect_hsize; i++) {
+ hlist_for_each_entry_safe(exp, next,
+ &net->ct.expect_hash[i],
+ hnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ }
+
+ return 0;
+}
+static int
+ctnetlink_change_expect(struct nf_conntrack_expect *x,
+ const struct nlattr * const cda[])
+{
+ if (cda[CTA_EXPECT_TIMEOUT]) {
+ if (!del_timer(&x->timeout))
+ return -ETIME;
+
+ x->timeout.expires = jiffies +
+ ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+ add_timer(&x->timeout);
+ }
+ return 0;
+}
+
+static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
+ [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
+ [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
+};
+
+static int
+ctnetlink_parse_expect_nat(const struct nlattr *attr,
+ struct nf_conntrack_expect *exp,
+ u_int8_t u3)
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
+ struct nf_conntrack_tuple nat_tuple = {};
+ int err;
+
+ err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple((const struct nlattr * const *)tb,
+ &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3);
+ if (err < 0)
+ return err;
+
+ exp->saved_addr = nat_tuple.src.u3;
+ exp->saved_proto = nat_tuple.src.u;
+ exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR]));
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
+ struct nf_conntrack_helper *helper,
+ struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *mask)
+{
+ u_int32_t class = 0;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn_help *help;
+ int err;
+
+ if (cda[CTA_EXPECT_CLASS] && helper) {
+ class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
+ if (class > helper->expect_class_max)
+ return ERR_PTR(-EINVAL);
+ }
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return ERR_PTR(-ENOMEM);
+
+ help = nfct_help(ct);
+ if (!help) {
+ if (!cda[CTA_EXPECT_TIMEOUT]) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ exp->timeout.expires =
+ jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+ exp->flags = NF_CT_EXPECT_USERSPACE;
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags |=
+ ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ }
+ } else {
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+ } else
+ exp->flags = 0;
+ }
+ if (cda[CTA_EXPECT_FN]) {
+ const char *name = nla_data(cda[CTA_EXPECT_FN]);
+ struct nf_ct_helper_expectfn *expfn;
+
+ expfn = nf_ct_helper_expectfn_find_by_name(name);
+ if (expfn == NULL) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ exp->expectfn = expfn->expectfn;
+ } else
+ exp->expectfn = NULL;
+
+ exp->class = class;
+ exp->master = ct;
+ exp->helper = helper;
+ exp->tuple = *tuple;
+ exp->mask.src.u3 = mask->src.u3;
+ exp->mask.src.u.all = mask->src.u.all;
+
+ if (cda[CTA_EXPECT_NAT]) {
+ err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
+ exp, nf_ct_l3num(ct));
+ if (err < 0)
+ goto err_out;
+ }
+ return exp;
+err_out:
+ nf_ct_expect_put(exp);
+ return ERR_PTR(err);
+}
+
+static int
+ctnetlink_create_expect(struct net *net, u16 zone,
+ const struct nlattr * const cda[],
+ u_int8_t u3, u32 portid, int report)
+{
+ struct nf_conntrack_tuple tuple, mask, master_tuple;
+ struct nf_conntrack_tuple_hash *h = NULL;
+ struct nf_conntrack_helper *helper = NULL;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn *ct;
+ int err;
+
+ /* caller guarantees that those three CTA_EXPECT_* exist */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+ if (err < 0)
+ return err;
+
+ /* Look for master conntrack of this expectation */
+ h = nf_conntrack_find_get(net, zone, &master_tuple);
+ if (!h)
+ return -ENOENT;
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (cda[CTA_EXPECT_HELP_NAME]) {
+ const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper == NULL) {
+#ifdef CONFIG_MODULES
+ if (request_module("nfct-helper-%s", helpname) < 0) {
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
+ helper = __nf_conntrack_helper_find(helpname, u3,
+ nf_ct_protonum(ct));
+ if (helper) {
+ err = -EAGAIN;
+ goto err_ct;
+ }
+#endif
+ err = -EOPNOTSUPP;
+ goto err_ct;
+ }
+ }
+
+ exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
+ if (IS_ERR(exp)) {
+ err = PTR_ERR(exp);
+ goto err_ct;
+ }
+
+ err = nf_ct_expect_related_report(exp, portid, report);
+ if (err < 0)
+ goto err_exp;
+
+ return 0;
+err_exp:
+ nf_ct_expect_put(exp);
+err_ct:
+ nf_ct_put(ct);
+ return err;
+}
+
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ struct net *net = sock_net(ctnl);
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_expect *exp;
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int8_t u3 = nfmsg->nfgen_family;
+ u16 zone;
+ int err;
+
+ if (!cda[CTA_EXPECT_TUPLE]
+ || !cda[CTA_EXPECT_MASK]
+ || !cda[CTA_EXPECT_MASTER])
+ return -EINVAL;
+
+ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+ if (err < 0)
+ return err;
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+ if (err < 0)
+ return err;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ exp = __nf_ct_expect_find(net, zone, &tuple);
+
+ if (!exp) {
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ err = ctnetlink_create_expect(net, zone, cda,
+ u3,
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+ return err;
+ }
+
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_expect(exp, cda);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+ return err;
+}
+
+static int
+ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
+ const struct ip_conntrack_stat *st)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+
+ event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU);
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(cpu);
+
+ if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
+ nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
+ nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int cpu;
+ struct net *net = sock_net(skb->sk);
+
+ if (cb->args[0] == nr_cpu_ids)
+ return 0;
+
+ for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
+ const struct ip_conntrack_stat *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(net->ct.stat, cpu);
+ if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ cpu, st) < 0)
+ break;
+ }
+ cb->args[0] = cpu;
+
+ return skb->len;
+}
+
+static int
+ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_stat_cpu_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static struct nf_ct_event_notifier ctnl_notifier = {
+ .fcn = ctnetlink_conntrack_event,
+};
+
+static struct nf_exp_event_notifier ctnl_notifier_exp = {
+ .fcn = ctnetlink_expect_event,
+};
+#endif
+
+static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+ [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
+ [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
+ [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
+};
+
+static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+ [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
+ [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
+ [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
+};
+
+static const struct nfnetlink_subsystem ctnl_subsys = {
+ .name = "conntrack",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK,
+ .cb_count = IPCTNL_MSG_MAX,
+ .cb = ctnl_cb,
+};
+
+static const struct nfnetlink_subsystem ctnl_exp_subsys = {
+ .name = "conntrack_expect",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
+ .cb_count = IPCTNL_MSG_EXP_MAX,
+ .cb = ctnl_exp_cb,
+};
+
+MODULE_ALIAS("ip_conntrack_netlink");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
+
+static int __net_init ctnetlink_net_init(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int ret;
+
+ ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot register notifier.\n");
+ goto err_out;
+ }
+
+ ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot expect register notifier.\n");
+ goto err_unreg_notifier;
+ }
+#endif
+ return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+ nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+err_out:
+ return ret;
+#endif
+}
+
+static void ctnetlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
+ nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+#endif
+}
+
+static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ctnetlink_net_exit(net);
+}
+
+static struct pernet_operations ctnetlink_net_ops = {
+ .init = ctnetlink_net_init,
+ .exit_batch = ctnetlink_net_exit_batch,
+};
+
+static int __init ctnetlink_init(void)
+{
+ int ret;
+
+ pr_info("ctnetlink v%s: registering with nfnetlink.\n", version);
+ ret = nfnetlink_subsys_register(&ctnl_subsys);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+
+ ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
+ goto err_unreg_subsys;
+ }
+
+ ret = register_pernet_subsys(&ctnetlink_net_ops);
+ if (ret < 0) {
+ pr_err("ctnetlink_init: cannot register pernet operations\n");
+ goto err_unreg_exp_subsys;
+ }
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ /* setup interaction between nf_queue and nf_conntrack_netlink. */
+ RCU_INIT_POINTER(nfq_ct_hook, &ctnetlink_nfqueue_hook);
+#endif
+ return 0;
+
+err_unreg_exp_subsys:
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+err_unreg_subsys:
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+ return ret;
+}
+
+static void __exit ctnetlink_exit(void)
+{
+ pr_info("ctnetlink: unregistering from nfnetlink.\n");
+
+ unregister_pernet_subsys(&ctnetlink_net_ops);
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+#ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
+ RCU_INIT_POINTER(nfq_ct_hook, NULL);
+#endif
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
new file mode 100644
index 000000000..825c3e3f8
--- /dev/null
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -0,0 +1,619 @@
+/*
+ * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft. PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702. Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Limitations:
+ * - We blindly assume that control connections are always
+ * established in PNS->PAC direction. This is a violation
+ * of RFC 2637
+ * - We can only support one single call within each session
+ * TODO:
+ * - testing of incoming PPTP calls
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_CT_PPTP_VERSION "3.1"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
+MODULE_ALIAS("ip_conntrack_pptp");
+MODULE_ALIAS_NFCT_HELPER("pptp");
+
+static DEFINE_SPINLOCK(nf_pptp_lock);
+
+int
+(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
+
+int
+(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff, struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
+
+void
+(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
+ struct nf_conntrack_expect *expect_reply)
+ __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
+
+void
+(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+
+#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+/* PptpControlMessageType names */
+const char *const pptp_msg_name[] = {
+ "UNKNOWN_MESSAGE",
+ "START_SESSION_REQUEST",
+ "START_SESSION_REPLY",
+ "STOP_SESSION_REQUEST",
+ "STOP_SESSION_REPLY",
+ "ECHO_REQUEST",
+ "ECHO_REPLY",
+ "OUT_CALL_REQUEST",
+ "OUT_CALL_REPLY",
+ "IN_CALL_REQUEST",
+ "IN_CALL_REPLY",
+ "IN_CALL_CONNECT",
+ "CALL_CLEAR_REQUEST",
+ "CALL_DISCONNECT_NOTIFY",
+ "WAN_ERROR_NOTIFY",
+ "SET_LINK_INFO"
+};
+EXPORT_SYMBOL(pptp_msg_name);
+#endif
+
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+
+#define PPTP_GRE_TIMEOUT (10 MINS)
+#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS)
+
+static void pptp_expectfn(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct net *net = nf_ct_net(ct);
+ typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
+ pr_debug("increasing timeouts\n");
+
+ /* increase timeout of GRE data channel conntrack entry */
+ ct->proto.gre.timeout = PPTP_GRE_TIMEOUT;
+ ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
+
+ /* Can you see how rusty this code is, compared with the pre-2.6.11
+ * one? That's what happened to my shiny newnat of 2002 ;( -HW */
+
+ rcu_read_lock();
+ nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
+ if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
+ nf_nat_pptp_expectfn(ct, exp);
+ else {
+ struct nf_conntrack_tuple inv_t;
+ struct nf_conntrack_expect *exp_other;
+
+ /* obviously this tuple inversion only works until you do NAT */
+ nf_ct_invert_tuplepr(&inv_t, &exp->tuple);
+ pr_debug("trying to unexpect other dir: ");
+ nf_ct_dump_tuple(&inv_t);
+
+ exp_other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &inv_t);
+ if (exp_other) {
+ /* delete other expectation. */
+ pr_debug("found\n");
+ nf_ct_unexpect_related(exp_other);
+ nf_ct_expect_put(exp_other);
+ } else {
+ pr_debug("not found\n");
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
+ const struct nf_conntrack_tuple *t)
+{
+ const struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_expect *exp;
+ struct nf_conn *sibling;
+ u16 zone = nf_ct_zone(ct);
+
+ pr_debug("trying to timeout ct or exp for tuple ");
+ nf_ct_dump_tuple(t);
+
+ h = nf_conntrack_find_get(net, zone, t);
+ if (h) {
+ sibling = nf_ct_tuplehash_to_ctrack(h);
+ pr_debug("setting timeout of conntrack %p to 0\n", sibling);
+ sibling->proto.gre.timeout = 0;
+ sibling->proto.gre.stream_timeout = 0;
+ if (del_timer(&sibling->timeout))
+ sibling->timeout.function((unsigned long)sibling);
+ nf_ct_put(sibling);
+ return 1;
+ } else {
+ exp = nf_ct_expect_find_get(net, zone, t);
+ if (exp) {
+ pr_debug("unexpect_related of expect %p\n", exp);
+ nf_ct_unexpect_related(exp);
+ nf_ct_expect_put(exp);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* timeout GRE data connections */
+static void pptp_destroy_siblings(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
+ struct nf_conntrack_tuple t;
+
+ nf_ct_gre_keymap_destroy(ct);
+
+ /* try original (pns->pac) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = ct_pptp_info->pns_call_id;
+ t.dst.u.gre.key = ct_pptp_info->pac_call_id;
+ if (!destroy_sibling_or_exp(net, ct, &t))
+ pr_debug("failed to timeout original pns->pac ct/exp\n");
+
+ /* try reply (pac->pns) tuple */
+ memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
+ t.dst.protonum = IPPROTO_GRE;
+ t.src.u.gre.key = ct_pptp_info->pac_call_id;
+ t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+ if (!destroy_sibling_or_exp(net, ct, &t))
+ pr_debug("failed to timeout reply pac->pns ct/exp\n");
+}
+
+/* expect GRE connections (PNS->PAC and PAC->PNS direction) */
+static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
+{
+ struct nf_conntrack_expect *exp_orig, *exp_reply;
+ enum ip_conntrack_dir dir;
+ int ret = 1;
+ typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
+
+ exp_orig = nf_ct_expect_alloc(ct);
+ if (exp_orig == NULL)
+ goto out;
+
+ exp_reply = nf_ct_expect_alloc(ct);
+ if (exp_reply == NULL)
+ goto out_put_orig;
+
+ /* original direction, PNS->PAC */
+ dir = IP_CT_DIR_ORIGINAL;
+ nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[dir].tuple.dst.u3,
+ IPPROTO_GRE, &peer_callid, &callid);
+ exp_orig->expectfn = pptp_expectfn;
+
+ /* reply direction, PAC->PNS */
+ dir = IP_CT_DIR_REPLY;
+ nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[dir].tuple.dst.u3,
+ IPPROTO_GRE, &callid, &peer_callid);
+ exp_reply->expectfn = pptp_expectfn;
+
+ nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
+ if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
+ nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ if (nf_ct_expect_related(exp_orig) != 0)
+ goto out_put_both;
+ if (nf_ct_expect_related(exp_reply) != 0)
+ goto out_unexpect_orig;
+
+ /* Add GRE keymap entries */
+ if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0)
+ goto out_unexpect_both;
+ if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) {
+ nf_ct_gre_keymap_destroy(ct);
+ goto out_unexpect_both;
+ }
+ ret = 0;
+
+out_put_both:
+ nf_ct_expect_put(exp_reply);
+out_put_orig:
+ nf_ct_expect_put(exp_orig);
+out:
+ return ret;
+
+out_unexpect_both:
+ nf_ct_unexpect_related(exp_reply);
+out_unexpect_orig:
+ nf_ct_unexpect_related(exp_orig);
+ goto out_put_both;
+}
+
+static inline int
+pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq,
+ unsigned int reqlen,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ u_int16_t msg;
+ __be16 cid = 0, pcid = 0;
+ typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
+
+ msg = ntohs(ctlh->messageType);
+ pr_debug("inbound control message %s\n", pptp_msg_name[msg]);
+
+ switch (msg) {
+ case PPTP_START_SESSION_REPLY:
+ /* server confirms new control session */
+ if (info->sstate < PPTP_SESSION_REQUESTED)
+ goto invalid;
+ if (pptpReq->srep.resultCode == PPTP_START_OK)
+ info->sstate = PPTP_SESSION_CONFIRMED;
+ else
+ info->sstate = PPTP_SESSION_ERROR;
+ break;
+
+ case PPTP_STOP_SESSION_REPLY:
+ /* server confirms end of control session */
+ if (info->sstate > PPTP_SESSION_STOPREQ)
+ goto invalid;
+ if (pptpReq->strep.resultCode == PPTP_STOP_OK)
+ info->sstate = PPTP_SESSION_NONE;
+ else
+ info->sstate = PPTP_SESSION_ERROR;
+ break;
+
+ case PPTP_OUT_CALL_REPLY:
+ /* server accepted call, we now expect GRE frames */
+ if (info->sstate != PPTP_SESSION_CONFIRMED)
+ goto invalid;
+ if (info->cstate != PPTP_CALL_OUT_REQ &&
+ info->cstate != PPTP_CALL_OUT_CONF)
+ goto invalid;
+
+ cid = pptpReq->ocack.callID;
+ pcid = pptpReq->ocack.peersCallID;
+ if (info->pns_call_id != pcid)
+ goto invalid;
+ pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+ ntohs(cid), ntohs(pcid));
+
+ if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
+ info->cstate = PPTP_CALL_OUT_CONF;
+ info->pac_call_id = cid;
+ exp_gre(ct, cid, pcid);
+ } else
+ info->cstate = PPTP_CALL_NONE;
+ break;
+
+ case PPTP_IN_CALL_REQUEST:
+ /* server tells us about incoming call request */
+ if (info->sstate != PPTP_SESSION_CONFIRMED)
+ goto invalid;
+
+ cid = pptpReq->icreq.callID;
+ pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ info->cstate = PPTP_CALL_IN_REQ;
+ info->pac_call_id = cid;
+ break;
+
+ case PPTP_IN_CALL_CONNECT:
+ /* server tells us about incoming call established */
+ if (info->sstate != PPTP_SESSION_CONFIRMED)
+ goto invalid;
+ if (info->cstate != PPTP_CALL_IN_REP &&
+ info->cstate != PPTP_CALL_IN_CONF)
+ goto invalid;
+
+ pcid = pptpReq->iccon.peersCallID;
+ cid = info->pac_call_id;
+
+ if (info->pns_call_id != pcid)
+ goto invalid;
+
+ pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+ info->cstate = PPTP_CALL_IN_CONF;
+
+ /* we expect a GRE connection from PAC to PNS */
+ exp_gre(ct, cid, pcid);
+ break;
+
+ case PPTP_CALL_DISCONNECT_NOTIFY:
+ /* server confirms disconnect */
+ cid = pptpReq->disc.callID;
+ pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ info->cstate = PPTP_CALL_NONE;
+
+ /* untrack this call id, unexpect GRE packets */
+ pptp_destroy_siblings(ct);
+ break;
+
+ case PPTP_WAN_ERROR_NOTIFY:
+ case PPTP_SET_LINK_INFO:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* I don't have to explain these ;) */
+ break;
+
+ default:
+ goto invalid;
+ }
+
+ nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
+ if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
+ return nf_nat_pptp_inbound(skb, ct, ctinfo,
+ protoff, ctlh, pptpReq);
+ return NF_ACCEPT;
+
+invalid:
+ pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+ "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
+ ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+ return NF_ACCEPT;
+}
+
+static inline int
+pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
+ struct PptpControlHeader *ctlh,
+ union pptp_ctrl_union *pptpReq,
+ unsigned int reqlen,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ u_int16_t msg;
+ __be16 cid = 0, pcid = 0;
+ typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
+
+ msg = ntohs(ctlh->messageType);
+ pr_debug("outbound control message %s\n", pptp_msg_name[msg]);
+
+ switch (msg) {
+ case PPTP_START_SESSION_REQUEST:
+ /* client requests for new control session */
+ if (info->sstate != PPTP_SESSION_NONE)
+ goto invalid;
+ info->sstate = PPTP_SESSION_REQUESTED;
+ break;
+
+ case PPTP_STOP_SESSION_REQUEST:
+ /* client requests end of control session */
+ info->sstate = PPTP_SESSION_STOPREQ;
+ break;
+
+ case PPTP_OUT_CALL_REQUEST:
+ /* client initiating connection to server */
+ if (info->sstate != PPTP_SESSION_CONFIRMED)
+ goto invalid;
+ info->cstate = PPTP_CALL_OUT_REQ;
+ /* track PNS call id */
+ cid = pptpReq->ocreq.callID;
+ pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ info->pns_call_id = cid;
+ break;
+
+ case PPTP_IN_CALL_REPLY:
+ /* client answers incoming call */
+ if (info->cstate != PPTP_CALL_IN_REQ &&
+ info->cstate != PPTP_CALL_IN_REP)
+ goto invalid;
+
+ cid = pptpReq->icack.callID;
+ pcid = pptpReq->icack.peersCallID;
+ if (info->pac_call_id != pcid)
+ goto invalid;
+ pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+ ntohs(cid), ntohs(pcid));
+
+ if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
+ /* part two of the three-way handshake */
+ info->cstate = PPTP_CALL_IN_REP;
+ info->pns_call_id = cid;
+ } else
+ info->cstate = PPTP_CALL_NONE;
+ break;
+
+ case PPTP_CALL_CLEAR_REQUEST:
+ /* client requests hangup of call */
+ if (info->sstate != PPTP_SESSION_CONFIRMED)
+ goto invalid;
+ /* FUTURE: iterate over all calls and check if
+ * call ID is valid. We don't do this without newnat,
+ * because we only know about last call */
+ info->cstate = PPTP_CALL_CLEAR_REQ;
+ break;
+
+ case PPTP_SET_LINK_INFO:
+ case PPTP_ECHO_REQUEST:
+ case PPTP_ECHO_REPLY:
+ /* I don't have to explain these ;) */
+ break;
+
+ default:
+ goto invalid;
+ }
+
+ nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
+ if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
+ return nf_nat_pptp_outbound(skb, ct, ctinfo,
+ protoff, ctlh, pptpReq);
+ return NF_ACCEPT;
+
+invalid:
+ pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+ "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+ msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
+ ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+ return NF_ACCEPT;
+}
+
+static const unsigned int pptp_msg_size[] = {
+ [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest),
+ [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply),
+ [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest),
+ [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply),
+ [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest),
+ [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply),
+ [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest),
+ [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply),
+ [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected),
+ [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest),
+ [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify),
+ [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify),
+ [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo),
+};
+
+/* track caller id inside control connection, call expect_related */
+static int
+conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+
+{
+ int dir = CTINFO2DIR(ctinfo);
+ const struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct tcphdr *tcph;
+ struct tcphdr _tcph;
+ const struct pptp_pkt_hdr *pptph;
+ struct pptp_pkt_hdr _pptph;
+ struct PptpControlHeader _ctlh, *ctlh;
+ union pptp_ctrl_union _pptpReq, *pptpReq;
+ unsigned int tcplen = skb->len - protoff;
+ unsigned int datalen, reqlen, nexthdr_off;
+ int oldsstate, oldcstate;
+ int ret;
+ u_int16_t msg;
+
+ /* don't do any tracking before tcp handshake complete */
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ nexthdr_off = protoff;
+ tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
+ BUG_ON(!tcph);
+ nexthdr_off += tcph->doff * 4;
+ datalen = tcplen - tcph->doff * 4;
+
+ pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph);
+ if (!pptph) {
+ pr_debug("no full PPTP header, can't track\n");
+ return NF_ACCEPT;
+ }
+ nexthdr_off += sizeof(_pptph);
+ datalen -= sizeof(_pptph);
+
+ /* if it's not a control message we can't do anything with it */
+ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
+ ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
+ pr_debug("not a control packet\n");
+ return NF_ACCEPT;
+ }
+
+ ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+ if (!ctlh)
+ return NF_ACCEPT;
+ nexthdr_off += sizeof(_ctlh);
+ datalen -= sizeof(_ctlh);
+
+ reqlen = datalen;
+ msg = ntohs(ctlh->messageType);
+ if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg])
+ return NF_ACCEPT;
+ if (reqlen > sizeof(*pptpReq))
+ reqlen = sizeof(*pptpReq);
+
+ pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq);
+ if (!pptpReq)
+ return NF_ACCEPT;
+
+ oldsstate = info->sstate;
+ oldcstate = info->cstate;
+
+ spin_lock_bh(&nf_pptp_lock);
+
+ /* FIXME: We just blindly assume that the control connection is always
+ * established from PNS->PAC. However, RFC makes no guarantee */
+ if (dir == IP_CT_DIR_ORIGINAL)
+ /* client -> server (PNS -> PAC) */
+ ret = pptp_outbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
+ ctinfo);
+ else
+ /* server -> client (PAC -> PNS) */
+ ret = pptp_inbound_pkt(skb, protoff, ctlh, pptpReq, reqlen, ct,
+ ctinfo);
+ pr_debug("sstate: %d->%d, cstate: %d->%d\n",
+ oldsstate, info->sstate, oldcstate, info->cstate);
+ spin_unlock_bh(&nf_pptp_lock);
+
+ return ret;
+}
+
+static const struct nf_conntrack_expect_policy pptp_exp_policy = {
+ .max_expected = 2,
+ .timeout = 5 * 60,
+};
+
+/* control protocol helper */
+static struct nf_conntrack_helper pptp __read_mostly = {
+ .name = "pptp",
+ .me = THIS_MODULE,
+ .data_len = sizeof(struct nf_ct_pptp_master),
+ .tuple.src.l3num = AF_INET,
+ .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT),
+ .tuple.dst.protonum = IPPROTO_TCP,
+ .help = conntrack_pptp_help,
+ .destroy = pptp_destroy_siblings,
+ .expect_policy = &pptp_exp_policy,
+};
+
+static int __init nf_conntrack_pptp_init(void)
+{
+ return nf_conntrack_helper_register(&pptp);
+}
+
+static void __exit nf_conntrack_pptp_fini(void)
+{
+ nf_conntrack_helper_unregister(&pptp);
+}
+
+module_init(nf_conntrack_pptp_init);
+module_exit(nf_conntrack_pptp_fini);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
new file mode 100644
index 000000000..b65d5864b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -0,0 +1,523 @@
+/* L3/L4 protocol support for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_l3protos);
+
+static DEFINE_MUTEX(nf_ct_proto_mutex);
+
+#ifdef CONFIG_SYSCTL
+static int
+nf_ct_register_sysctl(struct net *net,
+ struct ctl_table_header **header,
+ const char *path,
+ struct ctl_table *table)
+{
+ if (*header == NULL) {
+ *header = register_net_sysctl(net, path, table);
+ if (*header == NULL)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void
+nf_ct_unregister_sysctl(struct ctl_table_header **header,
+ struct ctl_table **table,
+ unsigned int users)
+{
+ if (users > 0)
+ return;
+
+ unregister_net_sysctl_table(*header);
+ kfree(*table);
+ *header = NULL;
+ *table = NULL;
+}
+#endif
+
+struct nf_conntrack_l4proto *
+__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+{
+ if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
+ return &nf_conntrack_l4proto_generic;
+
+ return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
+}
+EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct nf_conntrack_l3proto *
+nf_ct_l3proto_find_get(u_int16_t l3proto)
+{
+ struct nf_conntrack_l3proto *p;
+
+ rcu_read_lock();
+ p = __nf_ct_l3proto_find(l3proto);
+ if (!try_module_get(p->me))
+ p = &nf_conntrack_l3proto_generic;
+ rcu_read_unlock();
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
+
+int
+nf_ct_l3proto_try_module_get(unsigned short l3proto)
+{
+ int ret;
+ struct nf_conntrack_l3proto *p;
+
+retry: p = nf_ct_l3proto_find_get(l3proto);
+ if (p == &nf_conntrack_l3proto_generic) {
+ ret = request_module("nf_conntrack-%d", l3proto);
+ if (!ret)
+ goto retry;
+
+ return -EPROTOTYPE;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
+
+void nf_ct_l3proto_module_put(unsigned short l3proto)
+{
+ struct nf_conntrack_l3proto *p;
+
+ /* rcu_read_lock not necessary since the caller holds a reference, but
+ * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
+ */
+ rcu_read_lock();
+ p = __nf_ct_l3proto_find(l3proto);
+ module_put(p->me);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+
+struct nf_conntrack_l4proto *
+nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
+{
+ struct nf_conntrack_l4proto *p;
+
+ rcu_read_lock();
+ p = __nf_ct_l4proto_find(l3num, l4num);
+ if (!try_module_get(p->me))
+ p = &nf_conntrack_l4proto_generic;
+ rcu_read_unlock();
+
+ return p;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get);
+
+void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p)
+{
+ module_put(p->me);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
+
+static int kill_l3proto(struct nf_conn *i, void *data)
+{
+ return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto;
+}
+
+static int kill_l4proto(struct nf_conn *i, void *data)
+{
+ struct nf_conntrack_l4proto *l4proto;
+ l4proto = (struct nf_conntrack_l4proto *)data;
+ return nf_ct_protonum(i) == l4proto->l4proto &&
+ nf_ct_l3num(i) == l4proto->l3proto;
+}
+
+static struct nf_ip_net *nf_ct_l3proto_net(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
+{
+ if (l3proto->l3proto == PF_INET)
+ return &net->ct.nf_ct_proto;
+ else
+ return NULL;
+}
+
+static int nf_ct_l3proto_register_sysctl(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
+{
+ int err = 0;
+ struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+ /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */
+ if (in == NULL)
+ return 0;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ if (in->ctl_table != NULL) {
+ err = nf_ct_register_sysctl(net,
+ &in->ctl_table_header,
+ l3proto->ctl_table_path,
+ in->ctl_table);
+ if (err < 0) {
+ kfree(in->ctl_table);
+ in->ctl_table = NULL;
+ }
+ }
+#endif
+ return err;
+}
+
+static void nf_ct_l3proto_unregister_sysctl(struct net *net,
+ struct nf_conntrack_l3proto *l3proto)
+{
+ struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto);
+
+ if (in == NULL)
+ return;
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ if (in->ctl_table_header != NULL)
+ nf_ct_unregister_sysctl(&in->ctl_table_header,
+ &in->ctl_table,
+ 0);
+#endif
+}
+
+int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto)
+{
+ int ret = 0;
+ struct nf_conntrack_l3proto *old;
+
+ if (proto->l3proto >= AF_MAX)
+ return -EBUSY;
+
+ if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size)
+ return -EINVAL;
+
+ mutex_lock(&nf_ct_proto_mutex);
+ old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex));
+ if (old != &nf_conntrack_l3proto_generic) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (proto->nlattr_tuple_size)
+ proto->nla_size = 3 * proto->nlattr_tuple_size();
+
+ rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
+
+out_unlock:
+ mutex_unlock(&nf_ct_proto_mutex);
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
+
+int nf_ct_l3proto_pernet_register(struct net *net,
+ struct nf_conntrack_l3proto *proto)
+{
+ int ret = 0;
+
+ if (proto->init_net) {
+ ret = proto->init_net(net);
+ if (ret < 0)
+ return ret;
+ }
+
+ return nf_ct_l3proto_register_sysctl(net, proto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register);
+
+void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+{
+ BUG_ON(proto->l3proto >= AF_MAX);
+
+ mutex_lock(&nf_ct_proto_mutex);
+ BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != proto);
+ rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
+ &nf_conntrack_l3proto_generic);
+ mutex_unlock(&nf_ct_proto_mutex);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
+
+void nf_ct_l3proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l3proto *proto)
+{
+ nf_ct_l3proto_unregister_sysctl(net, proto);
+
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister);
+
+static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ if (l4proto->get_net_proto) {
+ /* statically built-in protocols use static per-net */
+ return l4proto->get_net_proto(net);
+ } else if (l4proto->net_id) {
+ /* ... and loadable protocols use dynamic per-net */
+ return net_generic(net, *l4proto->net_id);
+ }
+ return NULL;
+}
+
+static
+int nf_ct_l4proto_register_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ int err = 0;
+
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table != NULL) {
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_table_header,
+ "net/netfilter",
+ pn->ctl_table);
+ if (err < 0) {
+ if (!pn->users) {
+ kfree(pn->ctl_table);
+ pn->ctl_table = NULL;
+ }
+ }
+ }
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) {
+ if (err < 0) {
+ nf_ct_kfree_compat_sysctl_table(pn);
+ goto out;
+ }
+ err = nf_ct_register_sysctl(net,
+ &pn->ctl_compat_header,
+ "net/ipv4/netfilter",
+ pn->ctl_compat_table);
+ if (err == 0)
+ goto out;
+
+ nf_ct_kfree_compat_sysctl_table(pn);
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
+ }
+out:
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+ return err;
+}
+
+static
+void nf_ct_l4proto_unregister_sysctl(struct net *net,
+ struct nf_proto_net *pn,
+ struct nf_conntrack_l4proto *l4proto)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_table_header,
+ &pn->ctl_table,
+ pn->users);
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL)
+ nf_ct_unregister_sysctl(&pn->ctl_compat_header,
+ &pn->ctl_compat_table,
+ 0);
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+ them. --RR */
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+{
+ int ret = 0;
+
+ if (l4proto->l3proto >= PF_MAX)
+ return -EBUSY;
+
+ if ((l4proto->to_nlattr && !l4proto->nlattr_size)
+ || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
+ return -EINVAL;
+
+ mutex_lock(&nf_ct_proto_mutex);
+ if (!nf_ct_protos[l4proto->l3proto]) {
+ /* l3proto may be loaded latter. */
+ struct nf_conntrack_l4proto __rcu **proto_array;
+ int i;
+
+ proto_array = kmalloc(MAX_NF_CT_PROTO *
+ sizeof(struct nf_conntrack_l4proto *),
+ GFP_KERNEL);
+ if (proto_array == NULL) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < MAX_NF_CT_PROTO; i++)
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
+
+ /* Before making proto_array visible to lockless readers,
+ * we must make sure its content is committed to memory.
+ */
+ smp_wmb();
+
+ nf_ct_protos[l4proto->l3proto] = proto_array;
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ l4proto->nla_size = 0;
+ if (l4proto->nlattr_size)
+ l4proto->nla_size += l4proto->nlattr_size();
+ if (l4proto->nlattr_tuple_size)
+ l4proto->nla_size += 3 * l4proto->nlattr_tuple_size();
+
+ rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ l4proto);
+out_unlock:
+ mutex_unlock(&nf_ct_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
+
+int nf_ct_l4proto_pernet_register(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ int ret = 0;
+ struct nf_proto_net *pn = NULL;
+
+ if (l4proto->init_net) {
+ ret = l4proto->init_net(net, l4proto->l3proto);
+ if (ret < 0)
+ goto out;
+ }
+
+ pn = nf_ct_l4proto_net(net, l4proto);
+ if (pn == NULL)
+ goto out;
+
+ ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto);
+ if (ret < 0)
+ goto out;
+
+ pn->users++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+{
+ BUG_ON(l4proto->l3proto >= PF_MAX);
+
+ mutex_lock(&nf_ct_proto_mutex);
+ BUG_ON(rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != l4proto);
+ rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ &nf_conntrack_l4proto_generic);
+ mutex_unlock(&nf_ct_proto_mutex);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
+
+void nf_ct_l4proto_pernet_unregister(struct net *net,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ struct nf_proto_net *pn = NULL;
+
+ pn = nf_ct_l4proto_net(net, l4proto);
+ if (pn == NULL)
+ return;
+
+ pn->users--;
+ nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
+
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
+
+int nf_conntrack_proto_pernet_init(struct net *net)
+{
+ int err;
+ struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+ &nf_conntrack_l4proto_generic);
+
+ err = nf_conntrack_l4proto_generic.init_net(net,
+ nf_conntrack_l4proto_generic.l3proto);
+ if (err < 0)
+ return err;
+ err = nf_ct_l4proto_register_sysctl(net,
+ pn,
+ &nf_conntrack_l4proto_generic);
+ if (err < 0)
+ return err;
+
+ pn->users++;
+ return 0;
+}
+
+void nf_conntrack_proto_pernet_fini(struct net *net)
+{
+ struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+ &nf_conntrack_l4proto_generic);
+
+ pn->users--;
+ nf_ct_l4proto_unregister_sysctl(net,
+ pn,
+ &nf_conntrack_l4proto_generic);
+}
+
+int nf_conntrack_proto_init(void)
+{
+ unsigned int i;
+ for (i = 0; i < AF_MAX; i++)
+ rcu_assign_pointer(nf_ct_l3protos[i],
+ &nf_conntrack_l3proto_generic);
+ return 0;
+}
+
+void nf_conntrack_proto_fini(void)
+{
+ unsigned int i;
+ /* free l3proto protocol tables */
+ for (i = 0; i < PF_MAX; i++)
+ kfree(nf_ct_protos[i]);
+}
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
new file mode 100644
index 000000000..6dd995c7c
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -0,0 +1,1006 @@
+/*
+ * DCCP connection tracking protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+/* Timeouts are based on values from RFC4340:
+ *
+ * - REQUEST:
+ *
+ * 8.1.2. Client Request
+ *
+ * A client MAY give up on its DCCP-Requests after some time
+ * (3 minutes, for example).
+ *
+ * - RESPOND:
+ *
+ * 8.1.3. Server Response
+ *
+ * It MAY also leave the RESPOND state for CLOSED after a timeout of
+ * not less than 4MSL (8 minutes);
+ *
+ * - PARTOPEN:
+ *
+ * 8.1.5. Handshake Completion
+ *
+ * If the client remains in PARTOPEN for more than 4MSL (8 minutes),
+ * it SHOULD reset the connection with Reset Code 2, "Aborted".
+ *
+ * - OPEN:
+ *
+ * The DCCP timestamp overflows after 11.9 hours. If the connection
+ * stays idle this long the sequence number won't be recognized
+ * as valid anymore.
+ *
+ * - CLOSEREQ/CLOSING:
+ *
+ * 8.3. Termination
+ *
+ * The retransmission timer should initially be set to go off in two
+ * round-trip times and should back off to not less than once every
+ * 64 seconds ...
+ *
+ * - TIMEWAIT:
+ *
+ * 4.3. States
+ *
+ * A server or client socket remains in this state for 2MSL (4 minutes)
+ * after the connection has been town down, ...
+ */
+
+#define DCCP_MSL (2 * 60 * HZ)
+
+static const char * const dccp_state_names[] = {
+ [CT_DCCP_NONE] = "NONE",
+ [CT_DCCP_REQUEST] = "REQUEST",
+ [CT_DCCP_RESPOND] = "RESPOND",
+ [CT_DCCP_PARTOPEN] = "PARTOPEN",
+ [CT_DCCP_OPEN] = "OPEN",
+ [CT_DCCP_CLOSEREQ] = "CLOSEREQ",
+ [CT_DCCP_CLOSING] = "CLOSING",
+ [CT_DCCP_TIMEWAIT] = "TIMEWAIT",
+ [CT_DCCP_IGNORE] = "IGNORE",
+ [CT_DCCP_INVALID] = "INVALID",
+};
+
+#define sNO CT_DCCP_NONE
+#define sRQ CT_DCCP_REQUEST
+#define sRS CT_DCCP_RESPOND
+#define sPO CT_DCCP_PARTOPEN
+#define sOP CT_DCCP_OPEN
+#define sCR CT_DCCP_CLOSEREQ
+#define sCG CT_DCCP_CLOSING
+#define sTW CT_DCCP_TIMEWAIT
+#define sIG CT_DCCP_IGNORE
+#define sIV CT_DCCP_INVALID
+
+/*
+ * DCCP state transition table
+ *
+ * The assumption is the same as for TCP tracking:
+ *
+ * We are the man in the middle. All the packets go through us but might
+ * get lost in transit to the destination. It is assumed that the destination
+ * can't receive segments we haven't seen.
+ *
+ * The following states exist:
+ *
+ * NONE: Initial state, expecting Request
+ * REQUEST: Request seen, waiting for Response from server
+ * RESPOND: Response from server seen, waiting for Ack from client
+ * PARTOPEN: Ack after Response seen, waiting for packet other than Response,
+ * Reset or Sync from server
+ * OPEN: Packet other than Response, Reset or Sync seen
+ * CLOSEREQ: CloseReq from server seen, expecting Close from client
+ * CLOSING: Close seen, expecting Reset
+ * TIMEWAIT: Reset seen
+ * IGNORE: Not determinable whether packet is valid
+ *
+ * Some states exist only on one side of the connection: REQUEST, RESPOND,
+ * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
+ * the one it was in before.
+ *
+ * Packets are marked as ignored (sIG) if we don't know if they're valid
+ * (for example a reincarnation of a connection we didn't notice is dead
+ * already) and the server may send back a connection closing Reset or a
+ * Response. They're also used for Sync/SyncAck packets, which we don't
+ * care about.
+ */
+static const u_int8_t
+dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
+ [CT_DCCP_ROLE_CLIENT] = {
+ [DCCP_PKT_REQUEST] = {
+ /*
+ * sNO -> sRQ Regular Request
+ * sRQ -> sRQ Retransmitted Request or reincarnation
+ * sRS -> sRS Retransmitted Request (apparently Response
+ * got lost after we saw it) or reincarnation
+ * sPO -> sIG Ignore, conntrack might be out of sync
+ * sOP -> sIG Ignore, conntrack might be out of sync
+ * sCR -> sIG Ignore, conntrack might be out of sync
+ * sCG -> sIG Ignore, conntrack might be out of sync
+ * sTW -> sRQ Reincarnation
+ *
+ * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */
+ sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
+ },
+ [DCCP_PKT_RESPONSE] = {
+ /*
+ * sNO -> sIV Invalid
+ * sRQ -> sIG Ignore, might be response to ignored Request
+ * sRS -> sIG Ignore, might be response to ignored Request
+ * sPO -> sIG Ignore, might be response to ignored Request
+ * sOP -> sIG Ignore, might be response to ignored Request
+ * sCR -> sIG Ignore, might be response to ignored Request
+ * sCG -> sIG Ignore, might be response to ignored Request
+ * sTW -> sIV Invalid, reincarnation in reverse direction
+ * goes through sRQ
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
+ },
+ [DCCP_PKT_ACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
+ * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN
+ * sOP -> sOP Regular ACK, remain in OPEN
+ * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+ },
+ [DCCP_PKT_DATA] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.)
+ * sOP -> sOP Regular Data packet
+ * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
+ },
+ [DCCP_PKT_DATAACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
+ * sPO -> sPO Remain in PARTOPEN state
+ * sOP -> sOP Regular DataAck packet in OPEN state
+ * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.)
+ * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+ },
+ [DCCP_PKT_CLOSEREQ] = {
+ /*
+ * CLOSEREQ may only be sent by the server.
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
+ },
+ [DCCP_PKT_CLOSE] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sCG Client-initiated close
+ * sOP -> sCG Client-initiated close
+ * sCR -> sCG Close in response to CloseReq (8.3.)
+ * sCG -> sCG Retransmit
+ * sTW -> sIV Late retransmit, already in TIME_WAIT
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV
+ },
+ [DCCP_PKT_RESET] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.)
+ * sRS -> sTW Response received without Request
+ * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.)
+ * sOP -> sTW Connection reset
+ * sCR -> sTW Connection reset
+ * sCG -> sTW Connection reset
+ * sTW -> sIG Ignore (don't refresh timer)
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+ },
+ [DCCP_PKT_SYNC] = {
+ /*
+ * We currently ignore Sync packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ [DCCP_PKT_SYNCACK] = {
+ /*
+ * We currently ignore SyncAck packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ },
+ [CT_DCCP_ROLE_SERVER] = {
+ [DCCP_PKT_REQUEST] = {
+ /*
+ * sNO -> sIV Invalid
+ * sRQ -> sIG Ignore, conntrack might be out of sync
+ * sRS -> sIG Ignore, conntrack might be out of sync
+ * sPO -> sIG Ignore, conntrack might be out of sync
+ * sOP -> sIG Ignore, conntrack might be out of sync
+ * sCR -> sIG Ignore, conntrack might be out of sync
+ * sCG -> sIG Ignore, conntrack might be out of sync
+ * sTW -> sRQ Reincarnation, must reverse roles
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
+ },
+ [DCCP_PKT_RESPONSE] = {
+ /*
+ * sNO -> sIV Response without Request
+ * sRQ -> sRS Response to clients Request
+ * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT)
+ * sPO -> sIG Response to an ignored Request or late retransmit
+ * sOP -> sIG Ignore, might be response to ignored Request
+ * sCR -> sIG Ignore, might be response to ignored Request
+ * sCG -> sIG Ignore, might be response to ignored Request
+ * sTW -> sIV Invalid, Request from client in sTW moves to sRQ
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
+ },
+ [DCCP_PKT_ACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular Ack in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_DATA] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular Data packet in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_DATAACK] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP Enter OPEN state (8.1.5.)
+ * sOP -> sOP Regular DataAck in OPEN state
+ * sCR -> sIV Waiting for Close from client
+ * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
+ * sTW -> sIV
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+ },
+ [DCCP_PKT_CLOSEREQ] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.)
+ * sOP -> sCR CloseReq in OPEN state
+ * sCR -> sCR Retransmit
+ * sCG -> sCR Simultaneous close, client sends another Close
+ * sTW -> sIV Already closed
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
+ },
+ [DCCP_PKT_CLOSE] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sIV No connection
+ * sRS -> sIV No connection
+ * sPO -> sOP -> sCG Move direcly to CLOSING
+ * sOP -> sCG Move to CLOSING
+ * sCR -> sIV Close after CloseReq is invalid
+ * sCG -> sCG Retransmit
+ * sTW -> sIV Already closed
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
+ },
+ [DCCP_PKT_RESET] = {
+ /*
+ * sNO -> sIV No connection
+ * sRQ -> sTW Reset in response to Request
+ * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.)
+ * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.)
+ * sOP -> sTW
+ * sCR -> sTW
+ * sCG -> sTW
+ * sTW -> sIG Ignore (don't refresh timer)
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */
+ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+ },
+ [DCCP_PKT_SYNC] = {
+ /*
+ * We currently ignore Sync packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ [DCCP_PKT_SYNCACK] = {
+ /*
+ * We currently ignore SyncAck packets
+ *
+ * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+ sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ },
+ },
+};
+
+/* this module per-net specifics */
+static int dccp_net_id __read_mostly;
+struct dccp_net {
+ struct nf_proto_net pn;
+ int dccp_loose;
+ unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+};
+
+static inline struct dccp_net *dccp_pernet(struct net *net)
+{
+ return net_generic(net, dccp_net_id);
+}
+
+static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct dccp_hdr _hdr, *dh;
+
+ dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (dh == NULL)
+ return false;
+
+ tuple->src.u.dccp.port = dh->dccph_sport;
+ tuple->dst.u.dccp.port = dh->dccph_dport;
+ return true;
+}
+
+static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
+ const struct nf_conntrack_tuple *tuple)
+{
+ inv->src.u.dccp.port = tuple->dst.u.dccp.port;
+ inv->dst.u.dccp.port = tuple->src.u.dccp.port;
+ return true;
+}
+
+static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ struct net *net = nf_ct_net(ct);
+ struct dccp_net *dn;
+ struct dccp_hdr _dh, *dh;
+ const char *msg;
+ u_int8_t state;
+
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+ BUG_ON(dh == NULL);
+
+ state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
+ switch (state) {
+ default:
+ dn = dccp_pernet(net);
+ if (dn->dccp_loose == 0) {
+ msg = "nf_ct_dccp: not picking up existing connection ";
+ goto out_invalid;
+ }
+ case CT_DCCP_REQUEST:
+ break;
+ case CT_DCCP_INVALID:
+ msg = "nf_ct_dccp: invalid state transition ";
+ goto out_invalid;
+ }
+
+ ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
+ ct->proto.dccp.state = CT_DCCP_NONE;
+ ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+ ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+ ct->proto.dccp.handshake_seq = 0;
+ return true;
+
+out_invalid:
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+ NULL, "%s", msg);
+ return false;
+}
+
+static u64 dccp_ack_seq(const struct dccp_hdr *dh)
+{
+ const struct dccp_hdr_ack_bits *dhack;
+
+ dhack = (void *)dh + __dccp_basic_hdr_len(dh);
+ return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
+ ntohl(dhack->dccph_ack_nr_low);
+}
+
+static unsigned int *dccp_get_timeouts(struct net *net)
+{
+ return dccp_pernet(net)->dccp_timeout;
+}
+
+static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, enum ip_conntrack_info ctinfo,
+ u_int8_t pf, unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ struct net *net = nf_ct_net(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct dccp_hdr _dh, *dh;
+ u_int8_t type, old_state, new_state;
+ enum ct_dccp_roles role;
+
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+ BUG_ON(dh == NULL);
+ type = dh->dccph_type;
+
+ if (type == DCCP_PKT_RESET &&
+ !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ /* Tear down connection immediately if only reply is a RESET */
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_ACCEPT;
+ }
+
+ spin_lock_bh(&ct->lock);
+
+ role = ct->proto.dccp.role[dir];
+ old_state = ct->proto.dccp.state;
+ new_state = dccp_state_table[role][type][old_state];
+
+ switch (new_state) {
+ case CT_DCCP_REQUEST:
+ if (old_state == CT_DCCP_TIMEWAIT &&
+ role == CT_DCCP_ROLE_SERVER) {
+ /* Reincarnation in the reverse direction: reopen and
+ * reverse client/server roles. */
+ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
+ }
+ break;
+ case CT_DCCP_RESPOND:
+ if (old_state == CT_DCCP_REQUEST)
+ ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+ break;
+ case CT_DCCP_PARTOPEN:
+ if (old_state == CT_DCCP_RESPOND &&
+ type == DCCP_PKT_ACK &&
+ dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ break;
+ case CT_DCCP_IGNORE:
+ /*
+ * Connection tracking might be out of sync, so we ignore
+ * packets that might establish a new connection and resync
+ * if the server responds with a valid Response.
+ */
+ if (ct->proto.dccp.last_dir == !dir &&
+ ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
+ type == DCCP_PKT_RESPONSE) {
+ ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
+ ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+ new_state = CT_DCCP_RESPOND;
+ break;
+ }
+ ct->proto.dccp.last_dir = dir;
+ ct->proto.dccp.last_pkt = type;
+
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_dccp: invalid packet ignored ");
+ return NF_ACCEPT;
+ case CT_DCCP_INVALID:
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_dccp: invalid state transition ");
+ return -NF_ACCEPT;
+ }
+
+ ct->proto.dccp.last_dir = dir;
+ ct->proto.dccp.last_pkt = type;
+ ct->proto.dccp.state = new_state;
+ spin_unlock_bh(&ct->lock);
+
+ if (new_state != old_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
+
+ return NF_ACCEPT;
+}
+
+static int dccp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ u_int8_t pf, unsigned int hooknum)
+{
+ struct dccp_hdr _dh, *dh;
+ unsigned int dccp_len = skb->len - dataoff;
+ unsigned int cscov;
+ const char *msg;
+
+ dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+ if (dh == NULL) {
+ msg = "nf_ct_dccp: short packet ";
+ goto out_invalid;
+ }
+
+ if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
+ dh->dccph_doff * 4 > dccp_len) {
+ msg = "nf_ct_dccp: truncated/malformed packet ";
+ goto out_invalid;
+ }
+
+ cscov = dccp_len;
+ if (dh->dccph_cscov) {
+ cscov = (dh->dccph_cscov - 1) * 4;
+ if (cscov > dccp_len) {
+ msg = "nf_ct_dccp: bad checksum coverage ";
+ goto out_invalid;
+ }
+ }
+
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
+ pf)) {
+ msg = "nf_ct_dccp: bad checksum ";
+ goto out_invalid;
+ }
+
+ if (dh->dccph_type >= DCCP_PKT_INVALID) {
+ msg = "nf_ct_dccp: reserved packet type ";
+ goto out_invalid;
+ }
+
+ return NF_ACCEPT;
+
+out_invalid:
+ if (LOG_INVALID(net, IPPROTO_DCCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, "%s", msg);
+ return -NF_ACCEPT;
+}
+
+static void dccp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.dccp.port),
+ ntohs(tuple->dst.u.dccp.port));
+}
+
+static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+ seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+ struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ spin_lock_bh(&ct->lock);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
+ nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
+ ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
+ nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
+ cpu_to_be64(ct->proto.dccp.handshake_seq)))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest_parms);
+ spin_unlock_bh(&ct->lock);
+ return 0;
+
+nla_put_failure:
+ spin_unlock_bh(&ct->lock);
+ return -1;
+}
+
+static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
+ [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
+};
+
+static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
+{
+ struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
+ struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
+ int err;
+
+ if (!attr)
+ return 0;
+
+ err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr,
+ dccp_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
+ !tb[CTA_PROTOINFO_DCCP_ROLE] ||
+ nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
+ nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&ct->lock);
+ ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
+ if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
+ ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
+ ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
+ } else {
+ ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
+ ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
+ }
+ if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
+ ct->proto.dccp.handshake_seq =
+ be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
+ }
+ spin_unlock_bh(&ct->lock);
+ return 0;
+}
+
+static int dccp_nlattr_size(void)
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_DCCP */
+ + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1);
+}
+
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ struct dccp_net *dn = dccp_pernet(net);
+ unsigned int *timeouts = data;
+ int i;
+
+ /* set default DCCP timeouts. */
+ for (i=0; i<CT_DCCP_MAX; i++)
+ timeouts[i] = dn->dccp_timeout[i];
+
+ /* there's a 1:1 mapping between attributes and protocol states. */
+ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
+ if (tb[i]) {
+ timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
+ }
+ }
+ return 0;
+}
+
+static int
+dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+ int i;
+
+ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
+ if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
+ [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+/* template, data assigned later */
+static struct ctl_table dccp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_dccp_timeout_request",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_respond",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_partopen",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_open",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_closereq",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_closing",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_timeout_timewait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_dccp_loose",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
+ struct dccp_net *dn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(dccp_sysctl_table,
+ sizeof(dccp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
+ pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
+ pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
+ pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
+ pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
+ pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
+ pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
+ pn->ctl_table[7].data = &dn->dccp_loose;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ pn->ctl_table[0].procname = NULL;
+#endif
+ return 0;
+}
+
+static int dccp_init_net(struct net *net, u_int16_t proto)
+{
+ struct dccp_net *dn = dccp_pernet(net);
+ struct nf_proto_net *pn = &dn->pn;
+
+ if (!pn->users) {
+ /* default values */
+ dn->dccp_loose = 1;
+ dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
+ dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
+ dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
+ dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
+ dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
+ }
+
+ return dccp_kmemdup_sysctl_table(net, pn, dn);
+}
+
+static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+ .l3proto = AF_INET,
+ .l4proto = IPPROTO_DCCP,
+ .name = "dccp",
+ .pkt_to_tuple = dccp_pkt_to_tuple,
+ .invert_tuple = dccp_invert_tuple,
+ .new = dccp_new,
+ .packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
+ .error = dccp_error,
+ .print_tuple = dccp_print_tuple,
+ .print_conntrack = dccp_print_conntrack,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = dccp_to_nlattr,
+ .nlattr_size = dccp_nlattr_size,
+ .from_nlattr = nlattr_to_dccp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
+};
+
+static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+ .l3proto = AF_INET6,
+ .l4proto = IPPROTO_DCCP,
+ .name = "dccp",
+ .pkt_to_tuple = dccp_pkt_to_tuple,
+ .invert_tuple = dccp_invert_tuple,
+ .new = dccp_new,
+ .packet = dccp_packet,
+ .get_timeouts = dccp_get_timeouts,
+ .error = dccp_error,
+ .print_tuple = dccp_print_tuple,
+ .print_conntrack = dccp_print_conntrack,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = dccp_to_nlattr,
+ .nlattr_size = dccp_nlattr_size,
+ .from_nlattr = nlattr_to_dccp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
+ .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
+ .nla_policy = dccp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &dccp_net_id,
+ .init_net = dccp_init_net,
+};
+
+static __net_init int dccp_net_init(struct net *net)
+{
+ int ret = 0;
+ ret = nf_ct_l4proto_pernet_register(net, &dccp_proto4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_dccp4: pernet registration failed.\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &dccp_proto6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_dccp6: pernet registration failed.\n");
+ goto cleanup_dccp4;
+ }
+ return 0;
+cleanup_dccp4:
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
+out:
+ return ret;
+}
+
+static __net_exit void dccp_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto6);
+ nf_ct_l4proto_pernet_unregister(net, &dccp_proto4);
+}
+
+static struct pernet_operations dccp_net_ops = {
+ .init = dccp_net_init,
+ .exit = dccp_net_exit,
+ .id = &dccp_net_id,
+ .size = sizeof(struct dccp_net),
+};
+
+static int __init nf_conntrack_proto_dccp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&dccp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&dccp_proto4);
+ if (ret < 0)
+ goto out_dccp4;
+
+ ret = nf_ct_l4proto_register(&dccp_proto6);
+ if (ret < 0)
+ goto out_dccp6;
+
+ return 0;
+out_dccp6:
+ nf_ct_l4proto_unregister(&dccp_proto4);
+out_dccp4:
+ unregister_pernet_subsys(&dccp_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit nf_conntrack_proto_dccp_fini(void)
+{
+ nf_ct_l4proto_unregister(&dccp_proto6);
+ nf_ct_l4proto_unregister(&dccp_proto4);
+ unregister_pernet_subsys(&dccp_net_ops);
+}
+
+module_init(nf_conntrack_proto_dccp_init);
+module_exit(nf_conntrack_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
new file mode 100644
index 000000000..60865f110
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -0,0 +1,239 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+
+static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ;
+
+static bool nf_generic_should_process(u8 proto)
+{
+ switch (proto) {
+#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE
+ case IPPROTO_SCTP:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE
+ case IPPROTO_DCCP:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE
+ case IPPROTO_GRE:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE
+ case IPPROTO_UDPLITE:
+ return false;
+#endif
+ default:
+ return true;
+ }
+}
+
+static inline struct nf_generic_net *generic_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.generic;
+}
+
+static bool generic_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return true;
+}
+
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void generic_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+}
+
+static unsigned int *generic_get_timeouts(struct net *net)
+{
+ return &(generic_pernet(net)->timeout);
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int generic_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeout)
+{
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ return nf_generic_should_process(nf_ct_protonum(ct));
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeout = data;
+ struct nf_generic_net *gn = generic_pernet(net);
+
+ if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
+ *timeout =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ;
+ else {
+ /* Set default generic timeout. */
+ *timeout = gn->timeout;
+ }
+
+ return 0;
+}
+
+static int
+generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeout = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
+ [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table generic_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_generic_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table generic_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_generic_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_generic_net *gn)
+{
+#ifdef CONFIG_SYSCTL
+ pn->ctl_table = kmemdup(generic_sysctl_table,
+ sizeof(generic_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &gn->timeout;
+#endif
+ return 0;
+}
+
+static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_generic_net *gn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
+ sizeof(generic_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &gn->timeout;
+#endif
+#endif
+ return 0;
+}
+
+static int generic_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_generic_net *gn = generic_pernet(net);
+ struct nf_proto_net *pn = &gn->pn;
+
+ gn->timeout = nf_ct_generic_timeout;
+
+ ret = generic_kmemdup_compat_sysctl_table(pn, gn);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_kmemdup_sysctl_table(pn, gn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+
+ return ret;
+}
+
+static struct nf_proto_net *generic_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.generic.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
+{
+ .l3proto = PF_UNSPEC,
+ .l4proto = 255,
+ .name = "unknown",
+ .pkt_to_tuple = generic_pkt_to_tuple,
+ .invert_tuple = generic_invert_tuple,
+ .print_tuple = generic_print_tuple,
+ .packet = generic_packet,
+ .get_timeouts = generic_get_timeouts,
+ .new = generic_new,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = generic_timeout_nlattr_to_obj,
+ .obj_to_nlattr = generic_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GENERIC_MAX,
+ .obj_size = sizeof(unsigned int),
+ .nla_policy = generic_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = generic_init_net,
+ .get_net_proto = generic_get_net_proto,
+};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
new file mode 100644
index 000000000..7648674f2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -0,0 +1,447 @@
+/*
+ * ip_conntrack_proto_gre.c - Version 3.0
+ *
+ * Connection tracking protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+enum grep_conntrack {
+ GRE_CT_UNREPLIED,
+ GRE_CT_REPLIED,
+ GRE_CT_MAX
+};
+
+static unsigned int gre_timeouts[GRE_CT_MAX] = {
+ [GRE_CT_UNREPLIED] = 30*HZ,
+ [GRE_CT_REPLIED] = 180*HZ,
+};
+
+static int proto_gre_net_id __read_mostly;
+struct netns_proto_gre {
+ struct nf_proto_net nf;
+ rwlock_t keymap_lock;
+ struct list_head keymap_list;
+ unsigned int gre_timeouts[GRE_CT_MAX];
+};
+
+static inline struct netns_proto_gre *gre_pernet(struct net *net)
+{
+ return net_generic(net, proto_gre_net_id);
+}
+
+static void nf_ct_gre_keymap_flush(struct net *net)
+{
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_gre_keymap *km, *tmp;
+
+ write_lock_bh(&net_gre->keymap_lock);
+ list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
+ list_del(&km->list);
+ kfree(km);
+ }
+ write_unlock_bh(&net_gre->keymap_lock);
+}
+
+static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
+ const struct nf_conntrack_tuple *t)
+{
+ return km->tuple.src.l3num == t->src.l3num &&
+ !memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) &&
+ !memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) &&
+ km->tuple.dst.protonum == t->dst.protonum &&
+ km->tuple.dst.u.all == t->dst.u.all;
+}
+
+/* look up the source key for a given tuple */
+static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
+{
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_gre_keymap *km;
+ __be16 key = 0;
+
+ read_lock_bh(&net_gre->keymap_lock);
+ list_for_each_entry(km, &net_gre->keymap_list, list) {
+ if (gre_key_cmpfn(km, t)) {
+ key = km->tuple.src.u.gre.key;
+ break;
+ }
+ }
+ read_unlock_bh(&net_gre->keymap_lock);
+
+ pr_debug("lookup src key 0x%x for ", key);
+ nf_ct_dump_tuple(t);
+
+ return key;
+}
+
+/* add a single keymap entry, associate with specified master ct */
+int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
+ struct nf_conntrack_tuple *t)
+{
+ struct net *net = nf_ct_net(ct);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
+ struct nf_ct_gre_keymap **kmp, *km;
+
+ kmp = &ct_pptp_info->keymap[dir];
+ if (*kmp) {
+ /* check whether it's a retransmission */
+ read_lock_bh(&net_gre->keymap_lock);
+ list_for_each_entry(km, &net_gre->keymap_list, list) {
+ if (gre_key_cmpfn(km, t) && km == *kmp) {
+ read_unlock_bh(&net_gre->keymap_lock);
+ return 0;
+ }
+ }
+ read_unlock_bh(&net_gre->keymap_lock);
+ pr_debug("trying to override keymap_%s for ct %p\n",
+ dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
+ return -EEXIST;
+ }
+
+ km = kmalloc(sizeof(*km), GFP_ATOMIC);
+ if (!km)
+ return -ENOMEM;
+ memcpy(&km->tuple, t, sizeof(*t));
+ *kmp = km;
+
+ pr_debug("adding new entry %p: ", km);
+ nf_ct_dump_tuple(&km->tuple);
+
+ write_lock_bh(&net_gre->keymap_lock);
+ list_add_tail(&km->list, &net_gre->keymap_list);
+ write_unlock_bh(&net_gre->keymap_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
+
+/* destroy the keymap entries associated with specified master ct */
+void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir;
+
+ pr_debug("entering for ct %p\n", ct);
+
+ write_lock_bh(&net_gre->keymap_lock);
+ for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+ if (ct_pptp_info->keymap[dir]) {
+ pr_debug("removing %p from list\n",
+ ct_pptp_info->keymap[dir]);
+ list_del(&ct_pptp_info->keymap[dir]->list);
+ kfree(ct_pptp_info->keymap[dir]);
+ ct_pptp_info->keymap[dir] = NULL;
+ }
+ }
+ write_unlock_bh(&net_gre->keymap_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
+
+/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
+
+/* invert gre part of tuple */
+static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->dst.u.gre.key = orig->src.u.gre.key;
+ tuple->src.u.gre.key = orig->dst.u.gre.key;
+ return true;
+}
+
+/* gre hdr info to tuple */
+static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev);
+ const struct gre_hdr_pptp *pgrehdr;
+ struct gre_hdr_pptp _pgrehdr;
+ __be16 srckey;
+ const struct gre_hdr *grehdr;
+ struct gre_hdr _grehdr;
+
+ /* first only delinearize old RFC1701 GRE header */
+ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
+ if (!grehdr || grehdr->version != GRE_VERSION_PPTP) {
+ /* try to behave like "nf_conntrack_proto_generic" */
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+ return true;
+ }
+
+ /* PPTP header is variable length, only need up to the call_id field */
+ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
+ if (!pgrehdr)
+ return true;
+
+ if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) {
+ pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+ return false;
+ }
+
+ tuple->dst.u.gre.key = pgrehdr->call_id;
+ srckey = gre_keymap_lookup(net, tuple);
+ tuple->src.u.gre.key = srckey;
+
+ return true;
+}
+
+/* print gre part of tuple */
+static void gre_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "srckey=0x%x dstkey=0x%x ",
+ ntohs(tuple->src.u.gre.key),
+ ntohs(tuple->dst.u.gre.key));
+}
+
+/* print private data for conntrack */
+static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+ seq_printf(s, "timeout=%u, stream_timeout=%u ",
+ (ct->proto.gre.timeout / HZ),
+ (ct->proto.gre.stream_timeout / HZ));
+}
+
+static unsigned int *gre_get_timeouts(struct net *net)
+{
+ return gre_pernet(net)->gre_timeouts;
+}
+
+/* Returns verdict for packet, and may modify conntrack */
+static int gre_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ /* If we've seen traffic both ways, this is a GRE connection.
+ * Extend timeout. */
+ if (ct->status & IPS_SEEN_REPLY) {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ ct->proto.gre.stream_timeout);
+ /* Also, more likely to be important, and not a probe. */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ } else
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ ct->proto.gre.timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ pr_debug(": ");
+ nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+
+ /* initialize to sane value. Ideally a conntrack helper
+ * (e.g. in case of pptp) is increasing them */
+ ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
+ ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
+
+ return true;
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be deleted from memory */
+static void gre_destroy(struct nf_conn *ct)
+{
+ struct nf_conn *master = ct->master;
+ pr_debug(" entering\n");
+
+ if (!master)
+ pr_debug("no master !?!\n");
+ else
+ nf_ct_gre_keymap_destroy(master);
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+
+ /* set default timeouts for GRE. */
+ timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
+ timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) {
+ timeouts[GRE_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_GRE_REPLIED]) {
+ timeouts[GRE_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED,
+ htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED,
+ htonl(timeouts[GRE_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
+ [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+static int gre_init_net(struct net *net, u_int16_t proto)
+{
+ struct netns_proto_gre *net_gre = gre_pernet(net);
+ int i;
+
+ rwlock_init(&net_gre->keymap_lock);
+ INIT_LIST_HEAD(&net_gre->keymap_list);
+ for (i = 0; i < GRE_CT_MAX; i++)
+ net_gre->gre_timeouts[i] = gre_timeouts[i];
+
+ return 0;
+}
+
+/* protocol helper struct */
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
+ .l3proto = AF_INET,
+ .l4proto = IPPROTO_GRE,
+ .name = "gre",
+ .pkt_to_tuple = gre_pkt_to_tuple,
+ .invert_tuple = gre_invert_tuple,
+ .print_tuple = gre_print_tuple,
+ .print_conntrack = gre_print_conntrack,
+ .get_timeouts = gre_get_timeouts,
+ .packet = gre_packet,
+ .new = gre_new,
+ .destroy = gre_destroy,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = gre_timeout_nlattr_to_obj,
+ .obj_to_nlattr = gre_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GRE_MAX,
+ .obj_size = sizeof(unsigned int) * GRE_CT_MAX,
+ .nla_policy = gre_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &proto_gre_net_id,
+ .init_net = gre_init_net,
+};
+
+static int proto_gre_net_init(struct net *net)
+{
+ int ret = 0;
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
+ if (ret < 0)
+ pr_err("nf_conntrack_gre4: pernet registration failed.\n");
+ return ret;
+}
+
+static void proto_gre_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
+ nf_ct_gre_keymap_flush(net);
+}
+
+static struct pernet_operations proto_gre_net_ops = {
+ .init = proto_gre_net_init,
+ .exit = proto_gre_net_exit,
+ .id = &proto_gre_net_id,
+ .size = sizeof(struct netns_proto_gre),
+};
+
+static int __init nf_ct_proto_gre_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&proto_gre_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4);
+ if (ret < 0)
+ goto out_gre4;
+
+ return 0;
+out_gre4:
+ unregister_pernet_subsys(&proto_gre_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit nf_ct_proto_gre_fini(void)
+{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+ unregister_pernet_subsys(&proto_gre_net_ops);
+}
+
+module_init(nf_ct_proto_gre_init);
+module_exit(nf_ct_proto_gre_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
new file mode 100644
index 000000000..b45da90fa
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -0,0 +1,928 @@
+/*
+ * Connection tracking protocol helper module for SCTP.
+ *
+ * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
+ * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * SCTP is defined in RFC 2960. References to various sections in this code
+ * are to this RFC.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR
+
+ And so for me for SCTP :D -Kiran */
+
+static const char *const sctp_conntrack_names[] = {
+ "NONE",
+ "CLOSED",
+ "COOKIE_WAIT",
+ "COOKIE_ECHOED",
+ "ESTABLISHED",
+ "SHUTDOWN_SENT",
+ "SHUTDOWN_RECD",
+ "SHUTDOWN_ACK_SENT",
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
+ [SCTP_CONNTRACK_CLOSED] = 10 SECS,
+ [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
+ [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
+};
+
+#define sNO SCTP_CONNTRACK_NONE
+#define sCL SCTP_CONNTRACK_CLOSED
+#define sCW SCTP_CONNTRACK_COOKIE_WAIT
+#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
+#define sES SCTP_CONNTRACK_ESTABLISHED
+#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
+#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
+#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sIV SCTP_CONNTRACK_MAX
+
+/*
+ These are the descriptions of the states:
+
+NOTE: These state names are tantalizingly similar to the states of an
+SCTP endpoint. But the interpretation of the states is a little different,
+considering that these are the states of the connection and not of an end
+point. Please note the subtleties. -Kiran
+
+NONE - Nothing so far.
+COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
+ an INIT_ACK chunk in the reply direction.
+COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
+ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
+SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
+ to that of the SHUTDOWN chunk.
+CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
+ the SHUTDOWN chunk. Connection is closed.
+*/
+
+/* TODO
+ - I have assumed that the first INIT is in the original direction.
+ This messes things when an INIT comes in the reply direction in CLOSED
+ state.
+ - Check the error type in the reply dir before transitioning from
+cookie echoed to closed.
+ - Sec 5.2.4 of RFC 2960
+ - Multi Homing support.
+*/
+
+/* SCTP conntrack state transitions */
+static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ },
+ {
+/* REPLY */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ }
+};
+
+static int sctp_net_id __read_mostly;
+struct sctp_net {
+ struct nf_proto_net pn;
+ unsigned int timeouts[SCTP_CONNTRACK_MAX];
+};
+
+static inline struct sctp_net *sctp_pernet(struct net *net)
+{
+ return net_generic(net, sctp_net_id);
+}
+
+static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct sctphdr *hp;
+ struct sctphdr _hdr;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->src.u.sctp.port = hp->source;
+ tuple->dst.u.sctp.port = hp->dest;
+ return true;
+}
+
+static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.sctp.port = orig->dst.u.sctp.port;
+ tuple->dst.u.sctp.port = orig->src.u.sctp.port;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void sctp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.sctp.port),
+ ntohs(tuple->dst.u.sctp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+ enum sctp_conntrack state;
+
+ spin_lock_bh(&ct->lock);
+ state = ct->proto.sctp.state;
+ spin_unlock_bh(&ct->lock);
+
+ seq_printf(s, "%s ", sctp_conntrack_names[state]);
+}
+
+#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
+for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \
+ (offset) < (skb)->len && \
+ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \
+ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)
+
+/* Some validity checks to make sure the chunks are fine */
+static int do_basic_checks(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ unsigned long *map)
+{
+ u_int32_t offset, count;
+ sctp_chunkhdr_t _sch, *sch;
+ int flag;
+
+ flag = 0;
+
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ pr_debug("Chunk Num: %d Type: %d\n", count, sch->type);
+
+ if (sch->type == SCTP_CID_INIT ||
+ sch->type == SCTP_CID_INIT_ACK ||
+ sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
+ flag = 1;
+
+ /*
+ * Cookie Ack/Echo chunks not the first OR
+ * Init / Init Ack / Shutdown compl chunks not the only chunks
+ * OR zero-length.
+ */
+ if (((sch->type == SCTP_CID_COOKIE_ACK ||
+ sch->type == SCTP_CID_COOKIE_ECHO ||
+ flag) &&
+ count != 0) || !sch->length) {
+ pr_debug("Basic checks failed\n");
+ return 1;
+ }
+
+ if (map)
+ set_bit(sch->type, map);
+ }
+
+ pr_debug("Basic checks passed\n");
+ return count == 0;
+}
+
+static int sctp_new_state(enum ip_conntrack_dir dir,
+ enum sctp_conntrack cur_state,
+ int chunk_type)
+{
+ int i;
+
+ pr_debug("Chunk type: %d\n", chunk_type);
+
+ switch (chunk_type) {
+ case SCTP_CID_INIT:
+ pr_debug("SCTP_CID_INIT\n");
+ i = 0;
+ break;
+ case SCTP_CID_INIT_ACK:
+ pr_debug("SCTP_CID_INIT_ACK\n");
+ i = 1;
+ break;
+ case SCTP_CID_ABORT:
+ pr_debug("SCTP_CID_ABORT\n");
+ i = 2;
+ break;
+ case SCTP_CID_SHUTDOWN:
+ pr_debug("SCTP_CID_SHUTDOWN\n");
+ i = 3;
+ break;
+ case SCTP_CID_SHUTDOWN_ACK:
+ pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
+ i = 4;
+ break;
+ case SCTP_CID_ERROR:
+ pr_debug("SCTP_CID_ERROR\n");
+ i = 5;
+ break;
+ case SCTP_CID_COOKIE_ECHO:
+ pr_debug("SCTP_CID_COOKIE_ECHO\n");
+ i = 6;
+ break;
+ case SCTP_CID_COOKIE_ACK:
+ pr_debug("SCTP_CID_COOKIE_ACK\n");
+ i = 7;
+ break;
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
+ i = 8;
+ break;
+ default:
+ /* Other chunks like DATA, SACK, HEARTBEAT and
+ its ACK do not cause a change in state */
+ pr_debug("Unknown chunk type, Will stay in %s\n",
+ sctp_conntrack_names[cur_state]);
+ return cur_state;
+ }
+
+ pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
+ dir, sctp_conntrack_names[cur_state], chunk_type,
+ sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+
+ return sctp_conntracks[dir][i][cur_state];
+}
+
+static unsigned int *sctp_get_timeouts(struct net *net)
+{
+ return sctp_pernet(net)->timeouts;
+}
+
+/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
+static int sctp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ enum sctp_conntrack new_state, old_state;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ const struct sctphdr *sh;
+ struct sctphdr _sctph;
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+ u_int32_t offset, count;
+ unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+
+ sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ goto out;
+
+ if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ goto out;
+
+ /* Check the verification tag (Sec 8.5) */
+ if (!test_bit(SCTP_CID_INIT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+ !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+ !test_bit(SCTP_CID_ABORT, map) &&
+ !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) {
+ pr_debug("Verification tag check failed\n");
+ goto out;
+ }
+
+ old_state = new_state = SCTP_CONNTRACK_NONE;
+ spin_lock_bh(&ct->lock);
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ /* Special cases of Verification tag check (Sec 8.5.1) */
+ if (sch->type == SCTP_CID_INIT) {
+ /* Sec 8.5.1 (A) */
+ if (sh->vtag != 0)
+ goto out_unlock;
+ } else if (sch->type == SCTP_CID_ABORT) {
+ /* Sec 8.5.1 (B) */
+ if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+ sh->vtag != ct->proto.sctp.vtag[!dir])
+ goto out_unlock;
+ } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ /* Sec 8.5.1 (C) */
+ if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+ sh->vtag != ct->proto.sctp.vtag[!dir] &&
+ sch->flags & SCTP_CHUNK_FLAG_T)
+ goto out_unlock;
+ } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
+ /* Sec 8.5.1 (D) */
+ if (sh->vtag != ct->proto.sctp.vtag[dir])
+ goto out_unlock;
+ }
+
+ old_state = ct->proto.sctp.state;
+ new_state = sctp_new_state(dir, old_state, sch->type);
+
+ /* Invalid */
+ if (new_state == SCTP_CONNTRACK_MAX) {
+ pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
+ "conntrack=%u\n",
+ dir, sch->type, old_state);
+ goto out_unlock;
+ }
+
+ /* If it is an INIT or an INIT ACK note down the vtag */
+ if (sch->type == SCTP_CID_INIT ||
+ sch->type == SCTP_CID_INIT_ACK) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL)
+ goto out_unlock;
+ pr_debug("Setting vtag %x for dir %d\n",
+ ih->init_tag, !dir);
+ ct->proto.sctp.vtag[!dir] = ih->init_tag;
+ }
+
+ ct->proto.sctp.state = new_state;
+ if (old_state != new_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ }
+ spin_unlock_bh(&ct->lock);
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
+
+ if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
+ dir == IP_CT_DIR_REPLY &&
+ new_state == SCTP_CONNTRACK_ESTABLISHED) {
+ pr_debug("Setting assured bit\n");
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ }
+
+ return NF_ACCEPT;
+
+out_unlock:
+ spin_unlock_bh(&ct->lock);
+out:
+ return -NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ enum sctp_conntrack new_state;
+ const struct sctphdr *sh;
+ struct sctphdr _sctph;
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+ u_int32_t offset, count;
+ unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+
+ sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return false;
+
+ if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ return false;
+
+ /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+ if (test_bit(SCTP_CID_ABORT, map) ||
+ test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
+ test_bit(SCTP_CID_COOKIE_ACK, map))
+ return false;
+
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+ new_state = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ /* Don't need lock here: this conntrack not in circulation yet */
+ new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
+ SCTP_CONNTRACK_NONE, sch->type);
+
+ /* Invalid: delete conntrack */
+ if (new_state == SCTP_CONNTRACK_NONE ||
+ new_state == SCTP_CONNTRACK_MAX) {
+ pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
+ return false;
+ }
+
+ /* Copy the vtag into the state info */
+ if (sch->type == SCTP_CID_INIT) {
+ if (sh->vtag == 0) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL)
+ return false;
+
+ pr_debug("Setting vtag %x for new conn\n",
+ ih->init_tag);
+
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ ih->init_tag;
+ } else {
+ /* Sec 8.5.1 (A) */
+ return false;
+ }
+ }
+ /* If it is a shutdown ack OOTB packet, we expect a return
+ shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+ else {
+ pr_debug("Setting vtag %x for new conn OOTB\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+ }
+
+ ct->proto.sctp.state = new_state;
+ }
+
+ return true;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+ struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+
+ spin_lock_bh(&ct->lock);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
+ nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
+ goto nla_put_failure;
+
+ spin_unlock_bh(&ct->lock);
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ spin_unlock_bh(&ct->lock);
+ return -1;
+}
+
+static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = {
+ [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 },
+ [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 },
+};
+
+static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
+{
+ struct nlattr *attr = cda[CTA_PROTOINFO_SCTP];
+ struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1];
+ int err;
+
+ /* updates may not contain the internal protocol info, skip parsing */
+ if (!attr)
+ return 0;
+
+ err = nla_parse_nested(tb,
+ CTA_PROTOINFO_SCTP_MAX,
+ attr,
+ sctp_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_PROTOINFO_SCTP_STATE] ||
+ !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] ||
+ !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY])
+ return -EINVAL;
+
+ spin_lock_bh(&ct->lock);
+ ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] =
+ nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]);
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]);
+ spin_unlock_bh(&ct->lock);
+
+ return 0;
+}
+
+static int sctp_nlattr_size(void)
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_SCTP */
+ + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct sctp_net *sn = sctp_pernet(net);
+ int i;
+
+ /* set default SCTP timeouts. */
+ for (i=0; i<SCTP_CONNTRACK_MAX; i++)
+ timeouts[i] = sn->timeouts[i];
+
+ /* there's a 1:1 mapping between attributes and protocol states. */
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (tb[i]) {
+ timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
+ }
+ }
+ return 0;
+}
+
+static int
+sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+ int i;
+
+ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) {
+ if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
+ goto nla_put_failure;
+ }
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
+ [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sctp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_sctp_timeout_closed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_cookie_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_cookie_echoed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_recd",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table sctp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_sctp_timeout_closed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_cookie_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_cookie_echoed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_recd",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif
+
+static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(sctp_sysctl_table,
+ sizeof(sctp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+#endif
+ return 0;
+}
+
+static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct sctp_net *sn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
+ sizeof(sctp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
+ pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
+ pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
+ pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
+ pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
+ pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
+#endif
+#endif
+ return 0;
+}
+
+static int sctp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct sctp_net *sn = sctp_pernet(net);
+ struct nf_proto_net *pn = &sn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
+ sn->timeouts[i] = sctp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = sctp_kmemdup_compat_sysctl_table(pn, sn);
+ if (ret < 0)
+ return ret;
+
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = sctp_kmemdup_sysctl_table(pn, sn);
+
+ return ret;
+}
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
+ .new = sctp_new,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = sctp_to_nlattr,
+ .nlattr_size = sctp_nlattr_size,
+ .from_nlattr = nlattr_to_sctp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
+};
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .get_timeouts = sctp_get_timeouts,
+ .new = sctp_new,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = sctp_to_nlattr,
+ .nlattr_size = sctp_nlattr_size,
+ .from_nlattr = nlattr_to_sctp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
+ .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
+ .nla_policy = sctp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif
+ .net_id = &sctp_net_id,
+ .init_net = sctp_init_net,
+};
+
+static int sctp_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp4: pernet registration failed.\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_sctp6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_sctp6: pernet registration failed.\n");
+ goto cleanup_sctp4;
+ }
+ return 0;
+
+cleanup_sctp4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+out:
+ return ret;
+}
+
+static void sctp_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_sctp4);
+}
+
+static struct pernet_operations sctp_net_ops = {
+ .init = sctp_net_init,
+ .exit = sctp_net_exit,
+ .id = &sctp_net_id,
+ .size = sizeof(struct sctp_net),
+};
+
+static int __init nf_conntrack_proto_sctp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&sctp_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4);
+ if (ret < 0)
+ goto out_sctp4;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp6);
+ if (ret < 0)
+ goto out_sctp6;
+
+ return 0;
+out_sctp6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+out_sctp4:
+ unregister_pernet_subsys(&sctp_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit nf_conntrack_proto_sctp_fini(void)
+{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+ unregister_pernet_subsys(&sctp_net_ops);
+}
+
+module_init(nf_conntrack_proto_sctp_init);
+module_exit(nf_conntrack_proto_sctp_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
+MODULE_ALIAS("ip_conntrack_proto_sctp");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
new file mode 100644
index 000000000..70383de72
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -0,0 +1,1739 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <asm/unaligned.h>
+
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+
+/* "Be conservative in what you do,
+ be liberal in what you accept from others."
+ If it's non-zero, we mark only out of window RST segments as INVALID. */
+static int nf_ct_tcp_be_liberal __read_mostly = 0;
+
+/* If it is set to zero, we disable picking up already established
+ connections. */
+static int nf_ct_tcp_loose __read_mostly = 1;
+
+/* Max number of the retransmitted packets without receiving an (acceptable)
+ ACK from the destination. If this number is reached, a shorter timer
+ will be started. */
+static int nf_ct_tcp_max_retrans __read_mostly = 3;
+
+ /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR */
+
+static const char *const tcp_conntrack_names[] = {
+ "NONE",
+ "SYN_SENT",
+ "SYN_RECV",
+ "ESTABLISHED",
+ "FIN_WAIT",
+ "CLOSE_WAIT",
+ "LAST_ACK",
+ "TIME_WAIT",
+ "CLOSE",
+ "SYN_SENT2",
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
+ [TCP_CONNTRACK_SYN_SENT] = 2 MINS,
+ [TCP_CONNTRACK_SYN_RECV] = 60 SECS,
+ [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS,
+ [TCP_CONNTRACK_FIN_WAIT] = 2 MINS,
+ [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS,
+ [TCP_CONNTRACK_LAST_ACK] = 30 SECS,
+ [TCP_CONNTRACK_TIME_WAIT] = 2 MINS,
+ [TCP_CONNTRACK_CLOSE] = 10 SECS,
+ [TCP_CONNTRACK_SYN_SENT2] = 2 MINS,
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+ Linux uses 15 packets as limit, which corresponds
+ to ~13-30min depending on RTO. */
+ [TCP_CONNTRACK_RETRANS] = 5 MINS,
+ [TCP_CONNTRACK_UNACK] = 5 MINS,
+};
+
+#define sNO TCP_CONNTRACK_NONE
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sS2 TCP_CONNTRACK_SYN_SENT2
+#define sIV TCP_CONNTRACK_MAX
+#define sIG TCP_CONNTRACK_IGNORE
+
+/* What TCP flags are set from RST/SYN/FIN/ACK. */
+enum tcp_bit_set {
+ TCP_SYN_SET,
+ TCP_SYNACK_SET,
+ TCP_FIN_SET,
+ TCP_ACK_SET,
+ TCP_RST_SET,
+ TCP_NONE_SET,
+};
+
+/*
+ * The TCP state transition table needs a few words...
+ *
+ * We are the man in the middle. All the packets go through us
+ * but might get lost in transit to the destination.
+ * It is assumed that the destinations can't receive segments
+ * we haven't seen.
+ *
+ * The checked segment is in window, but our windows are *not*
+ * equivalent with the ones of the sender/receiver. We always
+ * try to guess the state of the current sender.
+ *
+ * The meaning of the states are:
+ *
+ * NONE: initial state
+ * SYN_SENT: SYN-only packet seen
+ * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open
+ * SYN_RECV: SYN-ACK packet seen
+ * ESTABLISHED: ACK packet seen
+ * FIN_WAIT: FIN packet seen
+ * CLOSE_WAIT: ACK seen (after FIN)
+ * LAST_ACK: FIN seen (after FIN)
+ * TIME_WAIT: last ACK seen
+ * CLOSE: closed connection (RST)
+ *
+ * Packets marked as IGNORED (sIG):
+ * if they may be either invalid or valid
+ * and the receiver may send back a connection
+ * closing RST or a SYN/ACK.
+ *
+ * Packets marked as INVALID (sIV):
+ * if we regard them as truly invalid packets
+ */
+static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
+/*
+ * sNO -> sSS Initialize a new connection
+ * sSS -> sSS Retransmitted SYN
+ * sS2 -> sS2 Late retransmitted SYN
+ * sSR -> sIG
+ * sES -> sIG Error: SYNs in window outside the SYN_SENT state
+ * are errors. Receiver will reply with RST
+ * and close the connection.
+ * Or we are not in sync and hold a dead connection.
+ * sFW -> sIG
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sSS Reopened connection (RFC 1122).
+ * sCL -> sSS
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
+/*
+ * sNO -> sIV Too late and no reason to do anything
+ * sSS -> sIV Client can't send SYN and then SYN/ACK
+ * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open
+ * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open
+ * sES -> sIV Invalid SYN/ACK packets sent by the client
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sIV
+ * sCL -> sIV
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sNO -> sIV Too late and no reason to do anything...
+ * sSS -> sIV Client migth not send FIN in this state:
+ * we enforce waiting for a SYN/ACK reply first.
+ * sS2 -> sIV
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions, waiting for
+ * the last ACK.
+ * Migth be a retransmitted FIN as well...
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN. Remain in the same state.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ * sNO -> sES Assumed.
+ * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
+ * sS2 -> sIV
+ * sSR -> sES Established state is reached.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected (RFC5961 challenged)
+ * sTW -> sTW Retransmitted last ACK. Remain in the same state.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ },
+ {
+/* REPLY */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
+/*
+ * sNO -> sIV Never reached.
+ * sSS -> sS2 Simultaneous open
+ * sS2 -> sS2 Retransmitted simultaneous SYN
+ * sSR -> sIV Invalid SYN packets sent by the server
+ * sES -> sIV
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sSS Reopened connection, but server may have switched role
+ * sCL -> sIV
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*
+ * sSS -> sSR Standard open.
+ * sS2 -> sSR Simultaneous open
+ * sSR -> sIG Retransmitted SYN/ACK, ignore it.
+ * sES -> sIG Late retransmitted SYN/ACK?
+ * sFW -> sIG Might be SYN/ACK answering ignored SYN
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sIG
+ * sCL -> sIG
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sSS -> sIV Server might not send FIN in this state.
+ * sS2 -> sIV
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions.
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
+/*
+ * sSS -> sIG Might be a half-open connection.
+ * sS2 -> sIG
+ * sSR -> sSR Might answer late resent SYN.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected (RFC5961 challenged)
+ * sTW -> sTW Retransmitted last ACK.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ }
+};
+
+static inline struct nf_tcp_net *tcp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp;
+}
+
+static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct tcphdr *hp;
+ struct tcphdr _hdr;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->src.u.tcp.port = hp->source;
+ tuple->dst.u.tcp.port = hp->dest;
+
+ return true;
+}
+
+static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.tcp.port = orig->dst.u.tcp.port;
+ tuple->dst.u.tcp.port = orig->src.u.tcp.port;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void tcp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.tcp.port),
+ ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+ enum tcp_conntrack state;
+
+ spin_lock_bh(&ct->lock);
+ state = ct->proto.tcp.state;
+ spin_unlock_bh(&ct->lock);
+
+ seq_printf(s, "%s ", tcp_conntrack_names[state]);
+}
+
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+ if (tcph->rst) return TCP_RST_SET;
+ else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
+ else if (tcph->fin) return TCP_FIN_SET;
+ else if (tcph->ack) return TCP_ACK_SET;
+ else return TCP_NONE_SET;
+}
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+ in IP Filter' by Guido van Rooij.
+
+ http://www.sane.nl/events/sane2000/papers.html
+ http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
+
+ The boundaries and the conditions are changed according to RFC793:
+ the packet must intersect the window (i.e. segments may be
+ after the right or before the left edge) and thus receivers may ACK
+ segments after the right edge of the window.
+
+ td_maxend = max(sack + max(win,1)) seen in reply packets
+ td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+ td_maxwin += seq + len - sender.td_maxend
+ if seq + len > sender.td_maxend
+ td_end = max(seq + len) seen in sent packets
+
+ I. Upper bound for valid data: seq <= sender.td_maxend
+ II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
+ III. Upper bound for valid (s)ack: sack <= receiver.td_end
+ IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW
+
+ where sack is the highest right edge of sack block found in the packet
+ or ack in the case of packet without SACK option.
+
+ The upper bound limit for a valid (s)ack is not ignored -
+ we doesn't have to deal with fragments.
+*/
+
+static inline __u32 segment_seq_plus_len(__u32 seq,
+ size_t len,
+ unsigned int dataoff,
+ const struct tcphdr *tcph)
+{
+ /* XXX Should I use payload length field in IP/IPv6 header ?
+ * - YK */
+ return (seq + len - dataoff - tcph->doff*4
+ + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
+}
+
+/* Fixme: what about big packets? */
+#define MAXACKWINCONST 66000
+#define MAXACKWINDOW(sender) \
+ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
+ : MAXACKWINCONST)
+
+/*
+ * Simplified tcp_parse_options routine from tcp_input.c
+ */
+static void tcp_options(const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ struct ip_ct_tcp_state *state)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ const unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ state->td_scale =
+ state->flags = 0;
+
+ while (length > 0) {
+ int opcode=*ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK_PERM
+ && opsize == TCPOLEN_SACK_PERM)
+ state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+ else if (opcode == TCPOPT_WINDOW
+ && opsize == TCPOLEN_WINDOW) {
+ state->td_scale = *(u_int8_t *)ptr;
+
+ if (state->td_scale > 14) {
+ /* See RFC1323 */
+ state->td_scale = 14;
+ }
+ state->flags |=
+ IP_CT_TCP_FLAG_WINDOW_SCALE;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
+ const struct tcphdr *tcph, __u32 *sack)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ const unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+ __u32 tmp;
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ /* Fast path for timestamp-only option */
+ if (length == TCPOLEN_TSTAMP_ALIGNED
+ && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
+ | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize, i;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK
+ && opsize >= (TCPOLEN_SACK_BASE
+ + TCPOLEN_SACK_PERBLOCK)
+ && !((opsize - TCPOLEN_SACK_BASE)
+ % TCPOLEN_SACK_PERBLOCK)) {
+ for (i = 0;
+ i < (opsize - TCPOLEN_SACK_BASE);
+ i += TCPOLEN_SACK_PERBLOCK) {
+ tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
+
+ if (after(tmp, *sack))
+ *sack = tmp;
+ }
+ return;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static bool tcp_in_window(const struct nf_conn *ct,
+ struct ip_ct_tcp *state,
+ enum ip_conntrack_dir dir,
+ unsigned int index,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u_int8_t pf)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct ip_ct_tcp_state *sender = &state->seen[dir];
+ struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+ const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
+ __u32 seq, ack, sack, end, win, swin;
+ s32 receiver_offset;
+ bool res, in_recv_win;
+
+ /*
+ * Get the required data from the packet.
+ */
+ seq = ntohl(tcph->seq);
+ ack = sack = ntohl(tcph->ack_seq);
+ win = ntohs(tcph->window);
+ end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
+
+ if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+ tcp_sack(skb, dataoff, tcph, &sack);
+
+ /* Take into account NAT sequence number mangling */
+ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
+ ack -= receiver_offset;
+ sack -= receiver_offset;
+
+ pr_debug("tcp_in_window: START\n");
+ pr_debug("tcp_in_window: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+ seq, ack, receiver_offset, sack, receiver_offset, win, end);
+ pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ if (sender->td_maxwin == 0) {
+ /*
+ * Initialize sender data.
+ */
+ if (tcph->syn) {
+ /*
+ * SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /*
+ * RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+ && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+ sender->td_scale =
+ receiver->td_scale = 0;
+ if (!tcph->ack)
+ /* Simultaneous open */
+ return true;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ sender->td_end = end;
+ swin = win << sender->td_scale;
+ sender->td_maxwin = (swin == 0 ? 1 : swin);
+ sender->td_maxend = end + sender->td_maxwin;
+ /*
+ * We haven't seen traffic in the other direction yet
+ * but we have to tweak window tracking to pass III
+ * and IV until that happens.
+ */
+ if (receiver->td_maxwin == 0)
+ receiver->td_end = receiver->td_maxend = sack;
+ }
+ } else if (((state->state == TCP_CONNTRACK_SYN_SENT
+ && dir == IP_CT_DIR_ORIGINAL)
+ || (state->state == TCP_CONNTRACK_SYN_RECV
+ && dir == IP_CT_DIR_REPLY))
+ && after(end, sender->td_end)) {
+ /*
+ * RFC 793: "if a TCP is reinitialized ... then it need
+ * not wait at all; it must only be sure to use sequence
+ * numbers larger than those recently used."
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ }
+
+ if (!(tcph->ack)) {
+ /*
+ * If there is no ACK, just pretend it was set and OK.
+ */
+ ack = sack = receiver->td_end;
+ } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+ (TCP_FLAG_ACK|TCP_FLAG_RST))
+ && (ack == 0)) {
+ /*
+ * Broken TCP stacks, that set ACK in RST packets as well
+ * with zero ack value.
+ */
+ ack = sack = receiver->td_end;
+ }
+
+ if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
+ /*
+ * RST sent answering SYN.
+ */
+ seq = end = sender->td_end;
+
+ pr_debug("tcp_in_window: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+ seq, ack, receiver_offset, sack, receiver_offset, win, end);
+ pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ /* Is the ending sequence in the receive window (if available)? */
+ in_recv_win = !receiver->td_maxwin ||
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+ before(seq, sender->td_maxend + 1),
+ (in_recv_win ? 1 : 0),
+ before(sack, receiver->td_end + 1),
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+
+ if (before(seq, sender->td_maxend + 1) &&
+ in_recv_win &&
+ before(sack, receiver->td_end + 1) &&
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
+ /*
+ * Take into account window scaling (RFC 1323).
+ */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /*
+ * Update sender data.
+ */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end)) {
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack))
+ sender->td_maxack = ack;
+ }
+
+ /*
+ * Update receiver data.
+ */
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ /*
+ * Check retransmissions.
+ */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir
+ && state->last_seq == seq
+ && state->last_ack == ack
+ && state->last_end == end
+ && state->last_win == win)
+ state->retrans++;
+ else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win;
+ state->retrans = 0;
+ }
+ }
+ res = true;
+ } else {
+ res = false;
+ if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
+ tn->tcp_be_liberal)
+ res = true;
+ if (!res && LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: %s ",
+ before(seq, sender->td_maxend + 1) ?
+ in_recv_win ?
+ before(sack, receiver->td_end + 1) ?
+ after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
+ : "ACK is under the lower bound (possible overly delayed ACK)"
+ : "ACK is over the upper bound (ACKed data not seen yet)"
+ : "SEQ is under the lower bound (already ACKed data retransmitted)"
+ : "SEQ is over the upper bound (over the window of the receiver)");
+ }
+
+ pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
+ "receiver end=%u maxend=%u maxwin=%u\n",
+ res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+ return res;
+}
+
+/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
+static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
+ TCPHDR_URG) + 1] =
+{
+ [TCPHDR_SYN] = 1,
+ [TCPHDR_SYN|TCPHDR_URG] = 1,
+ [TCPHDR_SYN|TCPHDR_ACK] = 1,
+ [TCPHDR_RST] = 1,
+ [TCPHDR_RST|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK] = 1,
+ [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1,
+ [TCPHDR_ACK] = 1,
+ [TCPHDR_ACK|TCPHDR_URG] = 1,
+};
+
+/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
+static int tcp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum)
+{
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ unsigned int tcplen = skb->len - dataoff;
+ u_int8_t tcpflags;
+
+ /* Smaller that minimal TCP header? */
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Not whole TCP header or malformed packet */
+ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the checksum is assumed to be correct.
+ */
+ /* FIXME: Source route IP option packets --RR */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: bad TCP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /* Check TCP flags. */
+ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
+ if (!tcp_valid_flags[tcpflags]) {
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid TCP flag combination ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int *tcp_get_timeouts(struct net *net)
+{
+ return tcp_pernet(net)->timeouts;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int tcp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct nf_conntrack_tuple *tuple;
+ enum tcp_conntrack new_state, old_state;
+ enum ip_conntrack_dir dir;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ unsigned long timeout;
+ unsigned int index;
+
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ spin_lock_bh(&ct->lock);
+ old_state = ct->proto.tcp.state;
+ dir = CTINFO2DIR(ctinfo);
+ index = get_conntrack_index(th);
+ new_state = tcp_conntracks[dir][index][old_state];
+ tuple = &ct->tuplehash[dir].tuple;
+
+ switch (new_state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ break;
+ /* RFC 1122: "When a connection is closed actively,
+ * it MUST linger in TIME-WAIT state for a time 2xMSL
+ * (Maximum Segment Lifetime). However, it MAY accept
+ * a new SYN from the remote TCP to reopen the connection
+ * directly from TIME-WAIT state, if..."
+ * We ignore the conditions because we are in the
+ * TIME-WAIT state anyway.
+ *
+ * Handle aborted connections: we and the server
+ * think there is an existing connection but the client
+ * aborts it and starts a new one.
+ */
+ if (((ct->proto.tcp.seen[dir].flags
+ | ct->proto.tcp.seen[!dir].flags)
+ & IP_CT_TCP_FLAG_CLOSE_INIT)
+ || (ct->proto.tcp.last_dir == dir
+ && ct->proto.tcp.last_index == TCP_RST_SET)) {
+ /* Attempt to reopen a closed/aborted connection.
+ * Delete this connection and look up again. */
+ spin_unlock_bh(&ct->lock);
+
+ /* Only repeat if we can actually remove the timer.
+ * Destruction may already be in progress in process
+ * context and we must give it a chance to terminate.
+ */
+ if (nf_ct_kill(ct))
+ return -NF_REPEAT;
+ return NF_DROP;
+ }
+ /* Fall through */
+ case TCP_CONNTRACK_IGNORE:
+ /* Ignored packets:
+ *
+ * Our connection entry may be out of sync, so ignore
+ * packets which may signal the real connection between
+ * the client and the server.
+ *
+ * a) SYN in ORIGINAL
+ * b) SYN/ACK in REPLY
+ * c) ACK in reply direction after initial SYN in original.
+ *
+ * If the ignored packet is invalid, the receiver will send
+ * a RST we'll catch below.
+ */
+ if (index == TCP_SYNACK_SET
+ && ct->proto.tcp.last_index == TCP_SYN_SET
+ && ct->proto.tcp.last_dir != dir
+ && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
+ /* b) This SYN/ACK acknowledges a SYN that we earlier
+ * ignored as invalid. This means that the client and
+ * the server are both in sync, while the firewall is
+ * not. We get in sync from the previously annotated
+ * values.
+ */
+ old_state = TCP_CONNTRACK_SYN_SENT;
+ new_state = TCP_CONNTRACK_SYN_RECV;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+ ct->proto.tcp.last_end;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+ ct->proto.tcp.last_end;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+ ct->proto.tcp.last_win == 0 ?
+ 1 : ct->proto.tcp.last_win;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+ ct->proto.tcp.last_wscale;
+ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
+ ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+ ct->proto.tcp.last_flags;
+ memset(&ct->proto.tcp.seen[dir], 0,
+ sizeof(struct ip_ct_tcp_state));
+ break;
+ }
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ ct->proto.tcp.last_seq = ntohl(th->seq);
+ ct->proto.tcp.last_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
+ ct->proto.tcp.last_win = ntohs(th->window);
+
+ /* a) This is a SYN in ORIGINAL. The client and the server
+ * may be in sync but we are not. In that case, we annotate
+ * the TCP options and let the packet go through. If it is a
+ * valid SYN packet, the server will reply with a SYN/ACK, and
+ * then we'll get in sync. Otherwise, the server potentially
+ * responds with a challenge ACK if implementing RFC5961.
+ */
+ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+ struct ip_ct_tcp_state seen = {};
+
+ ct->proto.tcp.last_flags =
+ ct->proto.tcp.last_wscale = 0;
+ tcp_options(skb, dataoff, th, &seen);
+ if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+ ct->proto.tcp.last_flags |=
+ IP_CT_TCP_FLAG_WINDOW_SCALE;
+ ct->proto.tcp.last_wscale = seen.td_scale;
+ }
+ if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+ ct->proto.tcp.last_flags |=
+ IP_CT_TCP_FLAG_SACK_PERM;
+ }
+ /* Mark the potential for RFC5961 challenge ACK,
+ * this pose a special problem for LAST_ACK state
+ * as ACK is intrepretated as ACKing last FIN.
+ */
+ if (old_state == TCP_CONNTRACK_LAST_ACK)
+ ct->proto.tcp.last_flags |=
+ IP_CT_EXP_CHALLENGE_ACK;
+ }
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid packet ignored in "
+ "state %s ", tcp_conntrack_names[old_state]);
+ return NF_ACCEPT;
+ case TCP_CONNTRACK_MAX:
+ /* Special case for SYN proxy: when the SYN to the server or
+ * the SYN/ACK from the server is lost, the client may transmit
+ * a keep-alive packet while in SYN_SENT state. This needs to
+ * be associated with the original conntrack entry in order to
+ * generate a new SYN with the correct sequence number.
+ */
+ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
+ index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
+ ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
+ pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ }
+
+ /* Invalid packet */
+ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+ dir, get_conntrack_index(th), old_state);
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid state ");
+ return -NF_ACCEPT;
+ case TCP_CONNTRACK_TIME_WAIT:
+ /* RFC5961 compliance cause stack to send "challenge-ACK"
+ * e.g. in response to spurious SYNs. Conntrack MUST
+ * not believe this ACK is acking last FIN.
+ */
+ if (old_state == TCP_CONNTRACK_LAST_ACK &&
+ index == TCP_ACK_SET &&
+ ct->proto.tcp.last_dir != dir &&
+ ct->proto.tcp.last_index == TCP_SYN_SET &&
+ (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
+ /* Detected RFC5961 challenge ACK */
+ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: challenge-ACK ignored ");
+ return NF_ACCEPT; /* Don't change state */
+ }
+ break;
+ case TCP_CONNTRACK_CLOSE:
+ if (index == TCP_RST_SET
+ && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+ && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+ /* Invalid RST */
+ spin_unlock_bh(&ct->lock);
+ if (LOG_INVALID(net, IPPROTO_TCP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid RST ");
+ return -NF_ACCEPT;
+ }
+ if (index == TCP_RST_SET
+ && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
+ && ct->proto.tcp.last_index == TCP_SYN_SET)
+ || (!test_bit(IPS_ASSURED_BIT, &ct->status)
+ && ct->proto.tcp.last_index == TCP_ACK_SET))
+ && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
+ /* RST sent to invalid SYN or ACK we had let through
+ * at a) and c) above:
+ *
+ * a) SYN was in window then
+ * c) we hold a half-open connection.
+ *
+ * Delete our connection entry.
+ * We skip window checking, because packet might ACK
+ * segments we ignored. */
+ goto in_window;
+ }
+ /* Just fall through */
+ default:
+ /* Keep compilers happy. */
+ break;
+ }
+
+ if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
+ skb, dataoff, th, pf)) {
+ spin_unlock_bh(&ct->lock);
+ return -NF_ACCEPT;
+ }
+ in_window:
+ /* From now on we have got in-window packets */
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+
+ pr_debug("tcp_conntracks: ");
+ nf_ct_dump_tuple(tuple);
+ pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+ (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+ (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+ old_state, new_state);
+
+ ct->proto.tcp.state = new_state;
+ if (old_state != new_state
+ && new_state == TCP_CONNTRACK_FIN_WAIT)
+ ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
+ timeout = timeouts[TCP_CONNTRACK_RETRANS];
+ else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
+ IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
+ timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
+ else
+ timeout = timeouts[new_state];
+ spin_unlock_bh(&ct->lock);
+
+ if (new_state != old_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+ if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ /* If only reply is a RST, we can consider ourselves not to
+ have an established connection: this is a fairly common
+ problem case, so we can delete the conntrack
+ immediately. --RR */
+ if (th->rst) {
+ nf_ct_kill_acct(ct, ctinfo, skb);
+ return NF_ACCEPT;
+ }
+ /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
+ * pickup with loose=1. Avoid large ESTABLISHED timeout.
+ */
+ if (new_state == TCP_CONNTRACK_ESTABLISHED &&
+ timeout > timeouts[TCP_CONNTRACK_UNACK])
+ timeout = timeouts[TCP_CONNTRACK_UNACK];
+ } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
+ && (old_state == TCP_CONNTRACK_SYN_RECV
+ || old_state == TCP_CONNTRACK_ESTABLISHED)
+ && new_state == TCP_CONNTRACK_ESTABLISHED) {
+ /* Set ASSURED if we see see valid ack in ESTABLISHED
+ after SYN_RECV or a valid answer for a picked up
+ connection. */
+ set_bit(IPS_ASSURED_BIT, &ct->status);
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ }
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ enum tcp_conntrack new_state;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ struct net *net = nf_ct_net(ct);
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
+ const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
+
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ /* Don't need lock here: this conntrack not in circulation yet */
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
+
+ /* Invalid: delete conntrack */
+ if (new_state >= TCP_CONNTRACK_MAX) {
+ pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ return false;
+ }
+
+ if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+ /* SYN packet */
+ ct->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end;
+
+ tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+ } else if (tn->tcp_loose == 0) {
+ /* Don't try to pick up connections. */
+ return false;
+ } else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ ct->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end +
+ ct->proto.tcp.seen[0].td_maxwin;
+
+ /* We assume SACK and liberal window checking to handle
+ * window scaling */
+ ct->proto.tcp.seen[0].flags =
+ ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
+ IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ /* tcp_packet will set them */
+ ct->proto.tcp.last_index = TCP_NONE_SET;
+
+ pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+ return true;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+ struct nf_conn *ct)
+{
+ struct nlattr *nest_parms;
+ struct nf_ct_tcp_flags tmp = {};
+
+ spin_lock_bh(&ct->lock);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ ct->proto.tcp.seen[0].td_scale) ||
+ nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
+ ct->proto.tcp.seen[1].td_scale))
+ goto nla_put_failure;
+
+ tmp.flags = ct->proto.tcp.seen[0].flags;
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
+
+ tmp.flags = ct->proto.tcp.seen[1].flags;
+ if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
+ sizeof(struct nf_ct_tcp_flags), &tmp))
+ goto nla_put_failure;
+ spin_unlock_bh(&ct->lock);
+
+ nla_nest_end(skb, nest_parms);
+
+ return 0;
+
+nla_put_failure:
+ spin_unlock_bh(&ct->lock);
+ return -1;
+}
+
+static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
+ [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 },
+ [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) },
+ [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
+};
+
+static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
+{
+ struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
+ struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
+ int err;
+
+ /* updates could not contain anything about the private
+ * protocol info, in that case skip the parsing */
+ if (!pattr)
+ return 0;
+
+ err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[CTA_PROTOINFO_TCP_STATE] &&
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
+ return -EINVAL;
+
+ spin_lock_bh(&ct->lock);
+ if (tb[CTA_PROTOINFO_TCP_STATE])
+ ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
+
+ if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
+ struct nf_ct_tcp_flags *attr =
+ nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
+ ct->proto.tcp.seen[0].flags &= ~attr->mask;
+ ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
+ }
+
+ if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
+ struct nf_ct_tcp_flags *attr =
+ nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
+ ct->proto.tcp.seen[1].flags &= ~attr->mask;
+ ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
+ }
+
+ if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
+ tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
+ ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+ ct->proto.tcp.seen[0].td_scale =
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
+ ct->proto.tcp.seen[1].td_scale =
+ nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
+ }
+ spin_unlock_bh(&ct->lock);
+
+ return 0;
+}
+
+static int tcp_nlattr_size(void)
+{
+ return nla_total_size(0) /* CTA_PROTOINFO_TCP */
+ + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
+}
+
+static int tcp_nlattr_tuple_size(void)
+{
+ return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+#endif
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ int i;
+
+ /* set default TCP timeouts. */
+ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ timeouts[i] = tn->timeouts[i];
+
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
+ timeouts[TCP_CONNTRACK_SYN_RECV] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
+ timeouts[TCP_CONNTRACK_ESTABLISHED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
+ timeouts[TCP_CONNTRACK_FIN_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
+ timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
+ timeouts[TCP_CONNTRACK_LAST_ACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
+ timeouts[TCP_CONNTRACK_TIME_WAIT] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
+ timeouts[TCP_CONNTRACK_CLOSE] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
+ timeouts[TCP_CONNTRACK_SYN_SENT2] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
+ timeouts[TCP_CONNTRACK_RETRANS] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
+ }
+ if (tb[CTA_TIMEOUT_TCP_UNACK]) {
+ timeouts[TCP_CONNTRACK_UNACK] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
+ }
+ return 0;
+}
+
+static int
+tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
+ htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
+ htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
+ htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
+ htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
+ htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
+ htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
+ htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
+ htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
+ [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tcp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_tcp_timeout_syn_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_syn_recv",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_fin_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_close_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_last_ack",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_time_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_close",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_timeout_unacknowledged",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_tcp_loose",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_tcp_be_liberal",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_tcp_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table tcp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_sent2",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_syn_recv",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_fin_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_close_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_last_ack",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_time_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_close",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_timeout_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_tcp_loose",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_tcp_be_liberal",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ip_conntrack_tcp_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(tcp_sysctl_table,
+ sizeof(tcp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK];
+ pn->ctl_table[10].data = &tn->tcp_loose;
+ pn->ctl_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_table[12].data = &tn->tcp_max_retrans;
+#endif
+ return 0;
+}
+
+static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_tcp_net *tn)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
+ sizeof(tcp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
+ pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
+ pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
+ pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+ pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
+ pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
+ pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
+ pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
+ pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
+ pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
+ pn->ctl_compat_table[10].data = &tn->tcp_loose;
+ pn->ctl_compat_table[11].data = &tn->tcp_be_liberal;
+ pn->ctl_compat_table[12].data = &tn->tcp_max_retrans;
+#endif
+#endif
+ return 0;
+}
+
+static int tcp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_tcp_net *tn = tcp_pernet(net);
+ struct nf_proto_net *pn = &tn->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ tn->timeouts[i] = tcp_timeouts[i];
+
+ tn->tcp_loose = nf_ct_tcp_loose;
+ tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
+ tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+ }
+
+ if (proto == AF_INET) {
+ ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
+ if (ret < 0)
+ return ret;
+
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = tcp_kmemdup_sysctl_table(pn, tn);
+
+ return ret;
+}
+
+static struct nf_proto_net *tcp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.tcp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_TCP,
+ .name = "tcp",
+ .pkt_to_tuple = tcp_pkt_to_tuple,
+ .invert_tuple = tcp_invert_tuple,
+ .print_tuple = tcp_print_tuple,
+ .print_conntrack = tcp_print_conntrack,
+ .packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
+ .new = tcp_new,
+ .error = tcp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = tcp_to_nlattr,
+ .nlattr_size = tcp_nlattr_size,
+ .from_nlattr = nlattr_to_tcp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = tcp_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_TCP,
+ .name = "tcp",
+ .pkt_to_tuple = tcp_pkt_to_tuple,
+ .invert_tuple = tcp_invert_tuple,
+ .print_tuple = tcp_print_tuple,
+ .print_conntrack = tcp_print_conntrack,
+ .packet = tcp_packet,
+ .get_timeouts = tcp_get_timeouts,
+ .new = tcp_new,
+ .error = tcp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .to_nlattr = tcp_to_nlattr,
+ .nlattr_size = tcp_nlattr_size,
+ .from_nlattr = nlattr_to_tcp,
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = tcp_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_TCP_MAX,
+ .obj_size = sizeof(unsigned int) *
+ TCP_CONNTRACK_TIMEOUT_MAX,
+ .nla_policy = tcp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = tcp_init_net,
+ .get_net_proto = tcp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
new file mode 100644
index 000000000..6957281ff
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -0,0 +1,368 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+
+static unsigned int udp_timeouts[UDP_CT_MAX] = {
+ [UDP_CT_UNREPLIED] = 30*HZ,
+ [UDP_CT_REPLIED] = 180*HZ,
+};
+
+static inline struct nf_udp_net *udp_pernet(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp;
+}
+
+static bool udp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct udphdr *hp;
+ struct udphdr _hdr;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->src.u.udp.port = hp->source;
+ tuple->dst.u.udp.port = hp->dest;
+
+ return true;
+}
+
+static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void udp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.udp.port),
+ ntohs(tuple->dst.u.udp.port));
+}
+
+static unsigned int *udp_get_timeouts(struct net *net)
+{
+ return udp_pernet(net)->timeouts;
+}
+
+/* Returns verdict for packet, and may modify conntracktype */
+static int udp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ /* If we've seen traffic both ways, this is some kind of UDP
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_REPLIED]);
+ /* Also, more likely to be important, and not a probe */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_UNREPLIED]);
+ }
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ return true;
+}
+
+static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum)
+{
+ unsigned int udplen = skb->len - dataoff;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hdr == NULL) {
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Truncated/malformed packets */
+ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Packet with no checksum */
+ if (!hdr->check)
+ return NF_ACCEPT;
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the checksum is assumed to be correct.
+ * FIXME: Source route IP option packets --RR */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
+ if (LOG_INVALID(net, IPPROTO_UDP))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: bad UDP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct nf_udp_net *un = udp_pernet(net);
+
+ /* set default timeouts for UDP. */
+ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
+ timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) {
+ timeouts[UDP_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_UDP_REPLIED]) {
+ timeouts[UDP_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_UDP_UNREPLIED,
+ htonl(timeouts[UDP_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_UDP_REPLIED,
+ htonl(timeouts[UDP_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
+ [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table udp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_udp_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table udp_compat_sysctl_table[] = {
+ {
+ .procname = "ip_conntrack_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "ip_conntrack_udp_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+ pn->ctl_table = kmemdup(udp_sysctl_table,
+ sizeof(udp_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+ pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+ return 0;
+}
+
+static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
+ struct nf_udp_net *un)
+{
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
+ sizeof(udp_compat_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_compat_table)
+ return -ENOMEM;
+
+ pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
+ pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED];
+#endif
+#endif
+ return 0;
+}
+
+static int udp_init_net(struct net *net, u_int16_t proto)
+{
+ int ret;
+ struct nf_udp_net *un = udp_pernet(net);
+ struct nf_proto_net *pn = &un->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0; i < UDP_CT_MAX; i++)
+ un->timeouts[i] = udp_timeouts[i];
+ }
+
+ if (proto == AF_INET) {
+ ret = udp_kmemdup_compat_sysctl_table(pn, un);
+ if (ret < 0)
+ return ret;
+
+ ret = udp_kmemdup_sysctl_table(pn, un);
+ if (ret < 0)
+ nf_ct_kfree_compat_sysctl_table(pn);
+ } else
+ ret = udp_kmemdup_sysctl_table(pn, un);
+
+ return ret;
+}
+
+static struct nf_proto_net *udp_get_net_proto(struct net *net)
+{
+ return &net->ct.nf_ct_proto.udp.pn;
+}
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_UDP,
+ .name = "udp",
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
+ .new = udp_new,
+ .error = udp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_UDP,
+ .name = "udp",
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .packet = udp_packet,
+ .get_timeouts = udp_get_timeouts,
+ .new = udp_new,
+ .error = udp_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udp_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udp_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDP_MAX,
+ .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
+ .nla_policy = udp_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .init_net = udp_init_net,
+ .get_net_proto = udp_get_net_proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
new file mode 100644
index 000000000..c5903d164
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -0,0 +1,405 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+enum udplite_conntrack {
+ UDPLITE_CT_UNREPLIED,
+ UDPLITE_CT_REPLIED,
+ UDPLITE_CT_MAX
+};
+
+static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = {
+ [UDPLITE_CT_UNREPLIED] = 30*HZ,
+ [UDPLITE_CT_REPLIED] = 180*HZ,
+};
+
+static int udplite_net_id __read_mostly;
+struct udplite_net {
+ struct nf_proto_net pn;
+ unsigned int timeouts[UDPLITE_CT_MAX];
+};
+
+static inline struct udplite_net *udplite_pernet(struct net *net)
+{
+ return net_generic(net, udplite_net_id);
+}
+
+static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ const struct udphdr *hp;
+ struct udphdr _hdr;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ tuple->src.u.udp.port = hp->source;
+ tuple->dst.u.udp.port = hp->dest;
+ return true;
+}
+
+static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static void udplite_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.udp.port),
+ ntohs(tuple->dst.u.udp.port));
+}
+
+static unsigned int *udplite_get_timeouts(struct net *net)
+{
+ return udplite_pernet(net)->timeouts;
+}
+
+/* Returns verdict for packet, and may modify conntracktype */
+static int udplite_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum,
+ unsigned int *timeouts)
+{
+ /* If we've seen traffic both ways, this is some kind of UDP
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDPLITE_CT_REPLIED]);
+ /* Also, more likely to be important, and not a probe */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDPLITE_CT_UNREPLIED]);
+ }
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff, unsigned int *timeouts)
+{
+ return true;
+}
+
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ u_int8_t pf,
+ unsigned int hooknum)
+{
+ unsigned int udplen = skb->len - dataoff;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
+ unsigned int cscov;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hdr == NULL) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ cscov = ntohs(hdr->len);
+ if (cscov == 0)
+ cscov = udplen;
+ else if (cscov < sizeof(*hdr) || cscov > udplen) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: invalid checksum coverage ");
+ return -NF_ACCEPT;
+ }
+
+ /* UDPLITE mandates checksums */
+ if (!hdr->check) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: checksum missing ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore. */
+ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+ nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+ pf)) {
+ if (LOG_INVALID(net, IPPROTO_UDPLITE))
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udplite: bad UDPLite checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[],
+ struct net *net, void *data)
+{
+ unsigned int *timeouts = data;
+ struct udplite_net *un = udplite_pernet(net);
+
+ /* set default timeouts for UDPlite. */
+ timeouts[UDPLITE_CT_UNREPLIED] = un->timeouts[UDPLITE_CT_UNREPLIED];
+ timeouts[UDPLITE_CT_REPLIED] = un->timeouts[UDPLITE_CT_REPLIED];
+
+ if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) {
+ timeouts[UDPLITE_CT_UNREPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ;
+ }
+ if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) {
+ timeouts[UDPLITE_CT_REPLIED] =
+ ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ;
+ }
+ return 0;
+}
+
+static int
+udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
+{
+ const unsigned int *timeouts = data;
+
+ if (nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED,
+ htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)) ||
+ nla_put_be32(skb, CTA_TIMEOUT_UDPLITE_REPLIED,
+ htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy
+udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = {
+ [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 },
+ [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 },
+};
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table udplite_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_udplite_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "nf_conntrack_udplite_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL */
+
+static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn,
+ struct udplite_net *un)
+{
+#ifdef CONFIG_SYSCTL
+ if (pn->ctl_table)
+ return 0;
+
+ pn->ctl_table = kmemdup(udplite_sysctl_table,
+ sizeof(udplite_sysctl_table),
+ GFP_KERNEL);
+ if (!pn->ctl_table)
+ return -ENOMEM;
+
+ pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
+ pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
+#endif
+ return 0;
+}
+
+static int udplite_init_net(struct net *net, u_int16_t proto)
+{
+ struct udplite_net *un = udplite_pernet(net);
+ struct nf_proto_net *pn = &un->pn;
+
+ if (!pn->users) {
+ int i;
+
+ for (i = 0 ; i < UDPLITE_CT_MAX; i++)
+ un->timeouts[i] = udplite_timeouts[i];
+ }
+
+ return udplite_kmemdup_sysctl_table(pn, un);
+}
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{
+ .l3proto = PF_INET,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .pkt_to_tuple = udplite_pkt_to_tuple,
+ .invert_tuple = udplite_invert_tuple,
+ .print_tuple = udplite_print_tuple,
+ .packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
+ .new = udplite_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
+};
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{
+ .l3proto = PF_INET6,
+ .l4proto = IPPROTO_UDPLITE,
+ .name = "udplite",
+ .pkt_to_tuple = udplite_pkt_to_tuple,
+ .invert_tuple = udplite_invert_tuple,
+ .print_tuple = udplite_print_tuple,
+ .packet = udplite_packet,
+ .get_timeouts = udplite_get_timeouts,
+ .new = udplite_new,
+ .error = udplite_error,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = udplite_timeout_nlattr_to_obj,
+ .obj_to_nlattr = udplite_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX,
+ .obj_size = sizeof(unsigned int) *
+ CTA_TIMEOUT_UDPLITE_MAX,
+ .nla_policy = udplite_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &udplite_net_id,
+ .init_net = udplite_init_net,
+};
+
+static int udplite_net_init(struct net *net)
+{
+ int ret = 0;
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite4);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udplite4: pernet registration failed.\n");
+ goto out;
+ }
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udplite6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_udplite6: pernet registration failed.\n");
+ goto cleanup_udplite4;
+ }
+ return 0;
+
+cleanup_udplite4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
+out:
+ return ret;
+}
+
+static void udplite_net_exit(struct net *net)
+{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udplite4);
+}
+
+static struct pernet_operations udplite_net_ops = {
+ .init = udplite_net_init,
+ .exit = udplite_net_exit,
+ .id = &udplite_net_id,
+ .size = sizeof(struct udplite_net),
+};
+
+static int __init nf_conntrack_proto_udplite_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&udplite_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4);
+ if (ret < 0)
+ goto out_udplite4;
+
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite6);
+ if (ret < 0)
+ goto out_udplite6;
+
+ return 0;
+out_udplite6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+out_udplite4:
+ unregister_pernet_subsys(&udplite_net_ops);
+out_pernet:
+ return ret;
+}
+
+static void __exit nf_conntrack_proto_udplite_exit(void)
+{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+ unregister_pernet_subsys(&udplite_net_ops);
+}
+
+module_init(nf_conntrack_proto_udplite_init);
+module_exit(nf_conntrack_proto_udplite_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
new file mode 100644
index 000000000..4a2134fd3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -0,0 +1,237 @@
+/* SANE connection tracking helper
+ * (SANE = Scanner Access Now Easy)
+ * For documentation about the SANE network protocol see
+ * http://www.sane-project.org/html/doc015.html
+ */
+
+/* Copyright (C) 2007 Red Hat, Inc.
+ * Author: Michal Schmidt <mschmidt@redhat.com>
+ * Based on the FTP conntrack helper (net/netfilter/nf_conntrack_ftp.c):
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2003 Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_sane.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
+MODULE_DESCRIPTION("SANE connection tracking helper");
+MODULE_ALIAS_NFCT_HELPER("sane");
+
+static char *sane_buffer;
+
+static DEFINE_SPINLOCK(nf_sane_lock);
+
+#define MAX_PORTS 8
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+struct sane_request {
+ __be32 RPC_code;
+#define SANE_NET_START 7 /* RPC code */
+
+ __be32 handle;
+};
+
+struct sane_reply_net_start {
+ __be32 status;
+#define SANE_STATUS_SUCCESS 0
+
+ __be16 zero;
+ __be16 port;
+ /* other fields aren't interesting for conntrack */
+};
+
+static int help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ void *sb_ptr;
+ int ret = NF_ACCEPT;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ struct sane_request *req;
+ struct sane_reply_net_start *reply;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* Not a full tcp header? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ /* No data? */
+ dataoff = protoff + th->doff * 4;
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ datalen = skb->len - dataoff;
+
+ spin_lock_bh(&nf_sane_lock);
+ sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
+ BUG_ON(sb_ptr == NULL);
+
+ if (dir == IP_CT_DIR_ORIGINAL) {
+ if (datalen != sizeof(struct sane_request))
+ goto out;
+
+ req = sb_ptr;
+ if (req->RPC_code != htonl(SANE_NET_START)) {
+ /* Not an interesting command */
+ ct_sane_info->state = SANE_STATE_NORMAL;
+ goto out;
+ }
+
+ /* We're interested in the next reply */
+ ct_sane_info->state = SANE_STATE_START_REQUESTED;
+ goto out;
+ }
+
+ /* Is it a reply to an uninteresting command? */
+ if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
+ goto out;
+
+ /* It's a reply to SANE_NET_START. */
+ ct_sane_info->state = SANE_STATE_NORMAL;
+
+ if (datalen < sizeof(struct sane_reply_net_start)) {
+ pr_debug("nf_ct_sane: NET_START reply too short\n");
+ goto out;
+ }
+
+ reply = sb_ptr;
+ if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
+ /* saned refused the command */
+ pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
+ ntohl(reply->status));
+ goto out;
+ }
+
+ /* Invalid saned reply? Ignore it. */
+ if (reply->zero != 0)
+ goto out;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ ret = NF_DROP;
+ goto out;
+ }
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_TCP, NULL, &reply->port);
+
+ pr_debug("nf_ct_sane: expect: ");
+ nf_ct_dump_tuple(&exp->tuple);
+
+ /* Can't expect this? Best to drop packet now. */
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+
+ nf_ct_expect_put(exp);
+
+out:
+ spin_unlock_bh(&nf_sane_lock);
+ return ret;
+}
+
+static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
+
+static const struct nf_conntrack_expect_policy sane_exp_policy = {
+ .max_expected = 1,
+ .timeout = 5 * 60,
+};
+
+/* don't make this __exit, since it's called from __init ! */
+static void nf_conntrack_sane_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++) {
+ pr_debug("nf_ct_sane: unregistering helper for pf: %d "
+ "port: %d\n",
+ sane[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_helper_unregister(&sane[i][j]);
+ }
+ }
+
+ kfree(sane_buffer);
+}
+
+static int __init nf_conntrack_sane_init(void)
+{
+ int i, j = -1, ret = 0;
+
+ sane_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!sane_buffer)
+ return -ENOMEM;
+
+ if (ports_c == 0)
+ ports[ports_c++] = SANE_PORT;
+
+ /* FIXME should be configurable whether IPv4 and IPv6 connections
+ are tracked or not - YK */
+ for (i = 0; i < ports_c; i++) {
+ sane[i][0].tuple.src.l3num = PF_INET;
+ sane[i][1].tuple.src.l3num = PF_INET6;
+ for (j = 0; j < 2; j++) {
+ sane[i][j].data_len = sizeof(struct nf_ct_sane_master);
+ sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+ sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
+ sane[i][j].expect_policy = &sane_exp_policy;
+ sane[i][j].me = THIS_MODULE;
+ sane[i][j].help = help;
+ if (ports[i] == SANE_PORT)
+ sprintf(sane[i][j].name, "sane");
+ else
+ sprintf(sane[i][j].name, "sane-%d", ports[i]);
+
+ pr_debug("nf_ct_sane: registering helper for pf: %d "
+ "port: %d\n",
+ sane[i][j].tuple.src.l3num, ports[i]);
+ ret = nf_conntrack_helper_register(&sane[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_sane: failed to "
+ "register helper for pf: %d port: %d\n",
+ sane[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_sane_fini();
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+module_init(nf_conntrack_sane_init);
+module_exit(nf_conntrack_sane_fini);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
new file mode 100644
index 000000000..ce3e840c8
--- /dev/null
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -0,0 +1,243 @@
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+
+int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ s32 off)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_conn_seqadj *seqadj;
+ struct nf_ct_seqadj *this_way;
+
+ if (off == 0)
+ return 0;
+
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+ seqadj = nfct_seqadj(ct);
+ this_way = &seqadj->seq[dir];
+ this_way->offset_before = off;
+ this_way->offset_after = off;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);
+
+int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ __be32 seq, s32 off)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_seqadj *this_way;
+
+ if (off == 0)
+ return 0;
+
+ if (unlikely(!seqadj)) {
+ WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+ return 0;
+ }
+
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+ spin_lock_bh(&ct->lock);
+ this_way = &seqadj->seq[dir];
+ if (this_way->offset_before == this_way->offset_after ||
+ before(this_way->correction_pos, ntohl(seq))) {
+ this_way->correction_pos = ntohl(seq);
+ this_way->offset_before = this_way->offset_after;
+ this_way->offset_after += off;
+ }
+ spin_unlock_bh(&ct->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);
+
+void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ s32 off)
+{
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP)
+ return;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);
+
+/* Adjust one found SACK option including checksum correction */
+static void nf_ct_sack_block_adjust(struct sk_buff *skb,
+ struct tcphdr *tcph,
+ unsigned int sackoff,
+ unsigned int sackend,
+ struct nf_ct_seqadj *seq)
+{
+ while (sackoff < sackend) {
+ struct tcp_sack_block_wire *sack;
+ __be32 new_start_seq, new_end_seq;
+
+ sack = (void *)skb->data + sackoff;
+ if (after(ntohl(sack->start_seq) - seq->offset_before,
+ seq->correction_pos))
+ new_start_seq = htonl(ntohl(sack->start_seq) -
+ seq->offset_after);
+ else
+ new_start_seq = htonl(ntohl(sack->start_seq) -
+ seq->offset_before);
+
+ if (after(ntohl(sack->end_seq) - seq->offset_before,
+ seq->correction_pos))
+ new_end_seq = htonl(ntohl(sack->end_seq) -
+ seq->offset_after);
+ else
+ new_end_seq = htonl(ntohl(sack->end_seq) -
+ seq->offset_before);
+
+ pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n",
+ ntohl(sack->start_seq), ntohl(new_start_seq),
+ ntohl(sack->end_seq), ntohl(new_end_seq));
+
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->start_seq, new_start_seq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb,
+ sack->end_seq, new_end_seq, 0);
+ sack->start_seq = new_start_seq;
+ sack->end_seq = new_end_seq;
+ sackoff += sizeof(*sack);
+ }
+}
+
+/* TCP SACK sequence number adjustment */
+static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
+ unsigned int protoff,
+ struct tcphdr *tcph,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dir, optoff, optend;
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+
+ optoff = protoff + sizeof(struct tcphdr);
+ optend = protoff + tcph->doff * 4;
+
+ if (!skb_make_writable(skb, optend))
+ return 0;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ while (optoff < optend) {
+ /* Usually: option, length. */
+ unsigned char *op = skb->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ /* no partial options */
+ if (optoff + 1 == optend ||
+ optoff + op[1] > optend ||
+ op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_SACK &&
+ op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+ ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+ nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
+ optoff+op[1],
+ &seqadj->seq[!dir]);
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+
+/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */
+int nf_ct_seq_adjust(struct sk_buff *skb,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ unsigned int protoff)
+{
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct tcphdr *tcph;
+ __be32 newseq, newack;
+ s32 seqoff, ackoff;
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *this_way, *other_way;
+ int res;
+
+ this_way = &seqadj->seq[dir];
+ other_way = &seqadj->seq[!dir];
+
+ if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ return 0;
+
+ tcph = (void *)skb->data + protoff;
+ spin_lock_bh(&ct->lock);
+ if (after(ntohl(tcph->seq), this_way->correction_pos))
+ seqoff = this_way->offset_after;
+ else
+ seqoff = this_way->offset_before;
+
+ if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+ other_way->correction_pos))
+ ackoff = other_way->offset_after;
+ else
+ ackoff = other_way->offset_before;
+
+ newseq = htonl(ntohl(tcph->seq) + seqoff);
+ newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+ inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+ pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+ ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+ ntohl(newack));
+
+ tcph->seq = newseq;
+ tcph->ack_seq = newack;
+
+ res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+ spin_unlock_bh(&ct->lock);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);
+
+s32 nf_ct_seq_offset(const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ u32 seq)
+{
+ struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+ struct nf_ct_seqadj *this_way;
+
+ if (!seqadj)
+ return 0;
+
+ this_way = &seqadj->seq[dir];
+ return after(seq, this_way->correction_pos) ?
+ this_way->offset_after : this_way->offset_before;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
+
+static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_seqadj),
+ .align = __alignof__(struct nf_conn_seqadj),
+ .id = NF_CT_EXT_SEQADJ,
+};
+
+int nf_conntrack_seqadj_init(void)
+{
+ return nf_ct_extend_register(&nf_ct_seqadj_extend);
+}
+
+void nf_conntrack_seqadj_fini(void)
+{
+ nf_ct_extend_unregister(&nf_ct_seqadj_extend);
+}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
new file mode 100644
index 000000000..885b4aba3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -0,0 +1,1680 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+MODULE_ALIAS("ip_conntrack_sip");
+MODULE_ALIAS_NFCT_HELPER("sip");
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of SIP servers");
+
+static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+static int sip_direct_signalling __read_mostly = 1;
+module_param(sip_direct_signalling, int, 0600);
+MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar "
+ "only (default 1)");
+
+static int sip_direct_media __read_mostly = 1;
+module_param(sip_direct_media, int, 0600);
+MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "
+ "endpoints only (default 1)");
+
+const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
+
+static int string_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len = 0;
+
+ while (dptr < limit && isalpha(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int digits_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len = 0;
+ while (dptr < limit && isdigit(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int iswordc(const char c)
+{
+ if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+ (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+ c == '{' || c == '}' || c == '~')
+ return 1;
+ return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+ int len = 0;
+ while (dptr < limit && iswordc(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len, domain_len;
+
+ len = word_len(dptr, limit);
+ dptr += len;
+ if (!len || dptr == limit || *dptr != '@')
+ return len;
+ dptr++;
+ len++;
+
+ domain_len = word_len(dptr, limit);
+ if (!domain_len)
+ return 0;
+ return len + domain_len;
+}
+
+/* get media type + port length */
+static int media_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len = string_len(ct, dptr, limit, shift);
+
+ dptr += len;
+ if (dptr >= limit || *dptr != ' ')
+ return 0;
+ len++;
+ dptr++;
+
+ return len + digits_len(ct, dptr, limit, shift);
+}
+
+static int sip_parse_addr(const struct nf_conn *ct, const char *cp,
+ const char **endp, union nf_inet_addr *addr,
+ const char *limit, bool delim)
+{
+ const char *end;
+ int ret;
+
+ if (!ct)
+ return 0;
+
+ memset(addr, 0, sizeof(*addr));
+ switch (nf_ct_l3num(ct)) {
+ case AF_INET:
+ ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
+ if (ret == 0)
+ return 0;
+ break;
+ case AF_INET6:
+ if (cp < limit && *cp == '[')
+ cp++;
+ else if (delim)
+ return 0;
+
+ ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
+ if (ret == 0)
+ return 0;
+
+ if (end < limit && *end == ']')
+ end++;
+ else if (delim)
+ return 0;
+ break;
+ default:
+ BUG();
+ }
+
+ if (endp)
+ *endp = end;
+ return 1;
+}
+
+/* skip ip address. returns its length. */
+static int epaddr_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ union nf_inet_addr addr;
+ const char *aux = dptr;
+
+ if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) {
+ pr_debug("ip: %s parse failed.!\n", dptr);
+ return 0;
+ }
+
+ /* Port number */
+ if (*dptr == ':') {
+ dptr++;
+ dptr += digits_len(ct, dptr, limit, shift);
+ }
+ return dptr - aux;
+}
+
+/* get address length, skiping user info. */
+static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ const char *start = dptr;
+ int s = *shift;
+
+ /* Search for @, but stop at the end of the line.
+ * We are inside a sip: URI, so we don't need to worry about
+ * continuation lines. */
+ while (dptr < limit &&
+ *dptr != '@' && *dptr != '\r' && *dptr != '\n') {
+ (*shift)++;
+ dptr++;
+ }
+
+ if (dptr < limit && *dptr == '@') {
+ dptr++;
+ (*shift)++;
+ } else {
+ dptr = start;
+ *shift = s;
+ }
+
+ return epaddr_len(ct, dptr, limit, shift);
+}
+
+/* Parse a SIP request line of the form:
+ *
+ * Request-Line = Method SP Request-URI SP SIP-Version CRLF
+ *
+ * and return the offset and length of the address contained in the Request-URI.
+ */
+int ct_sip_parse_request(const struct nf_conn *ct,
+ const char *dptr, unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen,
+ union nf_inet_addr *addr, __be16 *port)
+{
+ const char *start = dptr, *limit = dptr + datalen, *end;
+ unsigned int mlen;
+ unsigned int p;
+ int shift = 0;
+
+ /* Skip method and following whitespace */
+ mlen = string_len(ct, dptr, limit, NULL);
+ if (!mlen)
+ return 0;
+ dptr += mlen;
+ if (++dptr >= limit)
+ return 0;
+
+ /* Find SIP URI */
+ for (; dptr < limit - strlen("sip:"); dptr++) {
+ if (*dptr == '\r' || *dptr == '\n')
+ return -1;
+ if (strncasecmp(dptr, "sip:", strlen("sip:")) == 0) {
+ dptr += strlen("sip:");
+ break;
+ }
+ }
+ if (!skp_epaddr_len(ct, dptr, limit, &shift))
+ return 0;
+ dptr += shift;
+
+ if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))
+ return -1;
+ if (end < limit && *end == ':') {
+ end++;
+ p = simple_strtoul(end, (char **)&end, 10);
+ if (p < 1024 || p > 65535)
+ return -1;
+ *port = htons(p);
+ } else
+ *port = htons(SIP_PORT);
+
+ if (end == dptr)
+ return 0;
+ *matchoff = dptr - start;
+ *matchlen = end - dptr;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_request);
+
+/* SIP header parsing: SIP headers are located at the beginning of a line, but
+ * may span several lines, in which case the continuation lines begin with a
+ * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or
+ * CRLF, RFC 3261 allows only CRLF, we support both.
+ *
+ * Headers are followed by (optionally) whitespace, a colon, again (optionally)
+ * whitespace and the values. Whitespace in this context means any amount of
+ * tabs, spaces and continuation lines, which are treated as a single whitespace
+ * character.
+ *
+ * Some headers may appear multiple times. A comma separated list of values is
+ * equivalent to multiple headers.
+ */
+static const struct sip_header ct_sip_hdrs[] = {
+ [SIP_HDR_CSEQ] = SIP_HDR("CSeq", NULL, NULL, digits_len),
+ [SIP_HDR_FROM] = SIP_HDR("From", "f", "sip:", skp_epaddr_len),
+ [SIP_HDR_TO] = SIP_HDR("To", "t", "sip:", skp_epaddr_len),
+ [SIP_HDR_CONTACT] = SIP_HDR("Contact", "m", "sip:", skp_epaddr_len),
+ [SIP_HDR_VIA_UDP] = SIP_HDR("Via", "v", "UDP ", epaddr_len),
+ [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
+ [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
+ [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
+ [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
+};
+
+static const char *sip_follow_continuation(const char *dptr, const char *limit)
+{
+ /* Walk past newline */
+ if (++dptr >= limit)
+ return NULL;
+
+ /* Skip '\n' in CR LF */
+ if (*(dptr - 1) == '\r' && *dptr == '\n') {
+ if (++dptr >= limit)
+ return NULL;
+ }
+
+ /* Continuation line? */
+ if (*dptr != ' ' && *dptr != '\t')
+ return NULL;
+
+ /* skip leading whitespace */
+ for (; dptr < limit; dptr++) {
+ if (*dptr != ' ' && *dptr != '\t')
+ break;
+ }
+ return dptr;
+}
+
+static const char *sip_skip_whitespace(const char *dptr, const char *limit)
+{
+ for (; dptr < limit; dptr++) {
+ if (*dptr == ' ')
+ continue;
+ if (*dptr != '\r' && *dptr != '\n')
+ break;
+ dptr = sip_follow_continuation(dptr, limit);
+ if (dptr == NULL)
+ return NULL;
+ }
+ return dptr;
+}
+
+/* Search within a SIP header value, dealing with continuation lines */
+static const char *ct_sip_header_search(const char *dptr, const char *limit,
+ const char *needle, unsigned int len)
+{
+ for (limit -= len; dptr < limit; dptr++) {
+ if (*dptr == '\r' || *dptr == '\n') {
+ dptr = sip_follow_continuation(dptr, limit);
+ if (dptr == NULL)
+ break;
+ continue;
+ }
+
+ if (strncasecmp(dptr, needle, len) == 0)
+ return dptr;
+ }
+ return NULL;
+}
+
+int ct_sip_get_header(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sip_header_types type,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const struct sip_header *hdr = &ct_sip_hdrs[type];
+ const char *start = dptr, *limit = dptr + datalen;
+ int shift = 0;
+
+ for (dptr += dataoff; dptr < limit; dptr++) {
+ /* Find beginning of line */
+ if (*dptr != '\r' && *dptr != '\n')
+ continue;
+ if (++dptr >= limit)
+ break;
+ if (*(dptr - 1) == '\r' && *dptr == '\n') {
+ if (++dptr >= limit)
+ break;
+ }
+
+ /* Skip continuation lines */
+ if (*dptr == ' ' || *dptr == '\t')
+ continue;
+
+ /* Find header. Compact headers must be followed by a
+ * non-alphabetic character to avoid mismatches. */
+ if (limit - dptr >= hdr->len &&
+ strncasecmp(dptr, hdr->name, hdr->len) == 0)
+ dptr += hdr->len;
+ else if (hdr->cname && limit - dptr >= hdr->clen + 1 &&
+ strncasecmp(dptr, hdr->cname, hdr->clen) == 0 &&
+ !isalpha(*(dptr + hdr->clen)))
+ dptr += hdr->clen;
+ else
+ continue;
+
+ /* Find and skip colon */
+ dptr = sip_skip_whitespace(dptr, limit);
+ if (dptr == NULL)
+ break;
+ if (*dptr != ':' || ++dptr >= limit)
+ break;
+
+ /* Skip whitespace after colon */
+ dptr = sip_skip_whitespace(dptr, limit);
+ if (dptr == NULL)
+ break;
+
+ *matchoff = dptr - start;
+ if (hdr->search) {
+ dptr = ct_sip_header_search(dptr, limit, hdr->search,
+ hdr->slen);
+ if (!dptr)
+ return -1;
+ dptr += hdr->slen;
+ }
+
+ *matchlen = hdr->match_len(ct, dptr, limit, &shift);
+ if (!*matchlen)
+ return -1;
+ *matchoff = dptr - start + shift;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_header);
+
+/* Get next header field in a list of comma separated values */
+static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sip_header_types type,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const struct sip_header *hdr = &ct_sip_hdrs[type];
+ const char *start = dptr, *limit = dptr + datalen;
+ int shift = 0;
+
+ dptr += dataoff;
+
+ dptr = ct_sip_header_search(dptr, limit, ",", strlen(","));
+ if (!dptr)
+ return 0;
+
+ dptr = ct_sip_header_search(dptr, limit, hdr->search, hdr->slen);
+ if (!dptr)
+ return 0;
+ dptr += hdr->slen;
+
+ *matchoff = dptr - start;
+ *matchlen = hdr->match_len(ct, dptr, limit, &shift);
+ if (!*matchlen)
+ return -1;
+ *matchoff += shift;
+ return 1;
+}
+
+/* Walk through headers until a parsable one is found or no header of the
+ * given type is left. */
+static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sip_header_types type, int *in_header,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ int ret;
+
+ if (in_header && *in_header) {
+ while (1) {
+ ret = ct_sip_next_header(ct, dptr, dataoff, datalen,
+ type, matchoff, matchlen);
+ if (ret > 0)
+ return ret;
+ if (ret == 0)
+ break;
+ dataoff += *matchoff;
+ }
+ *in_header = 0;
+ }
+
+ while (1) {
+ ret = ct_sip_get_header(ct, dptr, dataoff, datalen,
+ type, matchoff, matchlen);
+ if (ret > 0)
+ break;
+ if (ret == 0)
+ return ret;
+ dataoff += *matchoff;
+ }
+
+ if (in_header)
+ *in_header = 1;
+ return 1;
+}
+
+/* Locate a SIP header, parse the URI and return the offset and length of
+ * the address as well as the address and port themselves. A stream of
+ * headers can be parsed by handing in a non-NULL datalen and in_header
+ * pointer.
+ */
+int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
+ unsigned int *dataoff, unsigned int datalen,
+ enum sip_header_types type, int *in_header,
+ unsigned int *matchoff, unsigned int *matchlen,
+ union nf_inet_addr *addr, __be16 *port)
+{
+ const char *c, *limit = dptr + datalen;
+ unsigned int p;
+ int ret;
+
+ ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen,
+ type, in_header, matchoff, matchlen);
+ WARN_ON(ret < 0);
+ if (ret == 0)
+ return ret;
+
+ if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))
+ return -1;
+ if (*c == ':') {
+ c++;
+ p = simple_strtoul(c, (char **)&c, 10);
+ if (p < 1024 || p > 65535)
+ return -1;
+ *port = htons(p);
+ } else
+ *port = htons(SIP_PORT);
+
+ if (dataoff)
+ *dataoff = c - dptr;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri);
+
+static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ const char *name,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const char *limit = dptr + datalen;
+ const char *start;
+ const char *end;
+
+ limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+ if (!limit)
+ limit = dptr + datalen;
+
+ start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name));
+ if (!start)
+ return 0;
+ start += strlen(name);
+
+ end = ct_sip_header_search(start, limit, ";", strlen(";"));
+ if (!end)
+ end = limit;
+
+ *matchoff = start - dptr;
+ *matchlen = end - start;
+ return 1;
+}
+
+/* Parse address from header parameter and return address, offset and length */
+int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ const char *name,
+ unsigned int *matchoff, unsigned int *matchlen,
+ union nf_inet_addr *addr, bool delim)
+{
+ const char *limit = dptr + datalen;
+ const char *start, *end;
+
+ limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+ if (!limit)
+ limit = dptr + datalen;
+
+ start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name));
+ if (!start)
+ return 0;
+
+ start += strlen(name);
+ if (!sip_parse_addr(ct, start, &end, addr, limit, delim))
+ return 0;
+ *matchoff = start - dptr;
+ *matchlen = end - start;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_address_param);
+
+/* Parse numerical header parameter and return value, offset and length */
+int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ const char *name,
+ unsigned int *matchoff, unsigned int *matchlen,
+ unsigned int *val)
+{
+ const char *limit = dptr + datalen;
+ const char *start;
+ char *end;
+
+ limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+ if (!limit)
+ limit = dptr + datalen;
+
+ start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name));
+ if (!start)
+ return 0;
+
+ start += strlen(name);
+ *val = simple_strtoul(start, &end, 0);
+ if (start == end)
+ return 0;
+ if (matchoff && matchlen) {
+ *matchoff = start - dptr;
+ *matchlen = end - start;
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param);
+
+static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ u8 *proto)
+{
+ unsigned int matchoff, matchlen;
+
+ if (ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=",
+ &matchoff, &matchlen)) {
+ if (!strncasecmp(dptr + matchoff, "TCP", strlen("TCP")))
+ *proto = IPPROTO_TCP;
+ else if (!strncasecmp(dptr + matchoff, "UDP", strlen("UDP")))
+ *proto = IPPROTO_UDP;
+ else
+ return 0;
+
+ if (*proto != nf_ct_protonum(ct))
+ return 0;
+ } else
+ *proto = nf_ct_protonum(ct);
+
+ return 1;
+}
+
+static int sdp_parse_addr(const struct nf_conn *ct, const char *cp,
+ const char **endp, union nf_inet_addr *addr,
+ const char *limit)
+{
+ const char *end;
+ int ret;
+
+ memset(addr, 0, sizeof(*addr));
+ switch (nf_ct_l3num(ct)) {
+ case AF_INET:
+ ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
+ break;
+ case AF_INET6:
+ ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
+ break;
+ default:
+ BUG();
+ }
+
+ if (ret == 0)
+ return 0;
+ if (endp)
+ *endp = end;
+ return 1;
+}
+
+/* skip ip address. returns its length. */
+static int sdp_addr_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ union nf_inet_addr addr;
+ const char *aux = dptr;
+
+ if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) {
+ pr_debug("ip: %s parse failed.!\n", dptr);
+ return 0;
+ }
+
+ return dptr - aux;
+}
+
+/* SDP header parsing: a SDP session description contains an ordered set of
+ * headers, starting with a section containing general session parameters,
+ * optionally followed by multiple media descriptions.
+ *
+ * SDP headers always start at the beginning of a line. According to RFC 2327:
+ * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should
+ * be tolerant and also accept records terminated with a single newline
+ * character". We handle both cases.
+ */
+static const struct sip_header ct_sdp_hdrs_v4[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+};
+
+static const struct sip_header ct_sdp_hdrs_v6[] = {
+ [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
+ [SDP_HDR_OWNER] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_CONNECTION] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len),
+ [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
+};
+
+/* Linear string search within SDP header values */
+static const char *ct_sdp_header_search(const char *dptr, const char *limit,
+ const char *needle, unsigned int len)
+{
+ for (limit -= len; dptr < limit; dptr++) {
+ if (*dptr == '\r' || *dptr == '\n')
+ break;
+ if (strncmp(dptr, needle, len) == 0)
+ return dptr;
+ }
+ return NULL;
+}
+
+/* Locate a SDP header (optionally a substring within the header value),
+ * optionally stopping at the first occurrence of the term header, parse
+ * it and return the offset and length of the data we're interested in.
+ */
+int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ const struct sip_header *hdrs, *hdr, *thdr;
+ const char *start = dptr, *limit = dptr + datalen;
+ int shift = 0;
+
+ hdrs = nf_ct_l3num(ct) == NFPROTO_IPV4 ? ct_sdp_hdrs_v4 : ct_sdp_hdrs_v6;
+ hdr = &hdrs[type];
+ thdr = &hdrs[term];
+
+ for (dptr += dataoff; dptr < limit; dptr++) {
+ /* Find beginning of line */
+ if (*dptr != '\r' && *dptr != '\n')
+ continue;
+ if (++dptr >= limit)
+ break;
+ if (*(dptr - 1) == '\r' && *dptr == '\n') {
+ if (++dptr >= limit)
+ break;
+ }
+
+ if (term != SDP_HDR_UNSPEC &&
+ limit - dptr >= thdr->len &&
+ strncasecmp(dptr, thdr->name, thdr->len) == 0)
+ break;
+ else if (limit - dptr >= hdr->len &&
+ strncasecmp(dptr, hdr->name, hdr->len) == 0)
+ dptr += hdr->len;
+ else
+ continue;
+
+ *matchoff = dptr - start;
+ if (hdr->search) {
+ dptr = ct_sdp_header_search(dptr, limit, hdr->search,
+ hdr->slen);
+ if (!dptr)
+ return -1;
+ dptr += hdr->slen;
+ }
+
+ *matchlen = hdr->match_len(ct, dptr, limit, &shift);
+ if (!*matchlen)
+ return -1;
+ *matchoff = dptr - start + shift;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header);
+
+static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,
+ unsigned int dataoff, unsigned int datalen,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ unsigned int *matchoff, unsigned int *matchlen,
+ union nf_inet_addr *addr)
+{
+ int ret;
+
+ ret = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term,
+ matchoff, matchlen);
+ if (ret <= 0)
+ return ret;
+
+ if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr,
+ dptr + *matchoff + *matchlen))
+ return -1;
+ return 1;
+}
+
+static int refresh_signalling_expectation(struct nf_conn *ct,
+ union nf_inet_addr *addr,
+ u8 proto, __be16 port,
+ unsigned int expires)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *next;
+ int found = 0;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+ if (exp->class != SIP_EXPECT_SIGNALLING ||
+ !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) ||
+ exp->tuple.dst.protonum != proto ||
+ exp->tuple.dst.u.udp.port != port)
+ continue;
+ if (!del_timer(&exp->timeout))
+ continue;
+ exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+ exp->timeout.expires = jiffies + expires * HZ;
+ add_timer(&exp->timeout);
+ found = 1;
+ break;
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+ return found;
+}
+
+static void flush_expectations(struct nf_conn *ct, bool media)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *next;
+
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
+ if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
+ continue;
+ if (!del_timer(&exp->timeout))
+ continue;
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ if (!media)
+ break;
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+}
+
+static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ union nf_inet_addr *daddr, __be16 port,
+ enum sip_expectation_classes class,
+ unsigned int mediaoff, unsigned int medialen)
+{
+ struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct net *net = nf_ct_net(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ union nf_inet_addr *saddr;
+ struct nf_conntrack_tuple tuple;
+ int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;
+ u_int16_t base_port;
+ __be16 rtp_port, rtcp_port;
+ const struct nf_nat_sip_hooks *hooks;
+
+ saddr = NULL;
+ if (sip_direct_media) {
+ if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3))
+ return NF_ACCEPT;
+ saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ }
+
+ /* We need to check whether the registration exists before attempting
+ * to register it since we can see the same media description multiple
+ * times on different connections in case multiple endpoints receive
+ * the same call.
+ *
+ * RTP optimization: if we find a matching media channel expectation
+ * and both the expectation and this connection are SNATed, we assume
+ * both sides can reach each other directly and use the final
+ * destination address from the expectation. We still need to keep
+ * the NATed expectations for media that might arrive from the
+ * outside, and additionally need to expect the direct RTP stream
+ * in case it passes through us even without NAT.
+ */
+ memset(&tuple, 0, sizeof(tuple));
+ if (saddr)
+ tuple.src.u3 = *saddr;
+ tuple.src.l3num = nf_ct_l3num(ct);
+ tuple.dst.protonum = IPPROTO_UDP;
+ tuple.dst.u3 = *daddr;
+ tuple.dst.u.udp.port = port;
+
+ rcu_read_lock();
+ do {
+ exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+
+ if (!exp || exp->master == ct ||
+ nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
+ exp->class != class)
+ break;
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (!direct_rtp &&
+ (!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||
+ exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
+ ct->status & IPS_NAT_MASK) {
+ *daddr = exp->saved_addr;
+ tuple.dst.u3 = exp->saved_addr;
+ tuple.dst.u.udp.port = exp->saved_proto.udp.port;
+ direct_rtp = 1;
+ } else
+#endif
+ skip_expect = 1;
+ } while (!skip_expect);
+
+ base_port = ntohs(tuple.dst.u.udp.port) & ~1;
+ rtp_port = htons(base_port);
+ rtcp_port = htons(base_port + 1);
+
+ if (direct_rtp) {
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks &&
+ !hooks->sdp_port(skb, protoff, dataoff, dptr, datalen,
+ mediaoff, medialen, ntohs(rtp_port)))
+ goto err1;
+ }
+
+ if (skip_expect) {
+ rcu_read_unlock();
+ return NF_ACCEPT;
+ }
+
+ rtp_exp = nf_ct_expect_alloc(ct);
+ if (rtp_exp == NULL)
+ goto err1;
+ nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+ IPPROTO_UDP, NULL, &rtp_port);
+
+ rtcp_exp = nf_ct_expect_alloc(ct);
+ if (rtcp_exp == NULL)
+ goto err2;
+ nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+ IPPROTO_UDP, NULL, &rtcp_port);
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK && !direct_rtp)
+ ret = hooks->sdp_media(skb, protoff, dataoff, dptr,
+ datalen, rtp_exp, rtcp_exp,
+ mediaoff, medialen, daddr);
+ else {
+ if (nf_ct_expect_related(rtp_exp) == 0) {
+ if (nf_ct_expect_related(rtcp_exp) != 0)
+ nf_ct_unexpect_related(rtp_exp);
+ else
+ ret = NF_ACCEPT;
+ }
+ }
+ nf_ct_expect_put(rtcp_exp);
+err2:
+ nf_ct_expect_put(rtp_exp);
+err1:
+ rcu_read_unlock();
+ return ret;
+}
+
+static const struct sdp_media_type sdp_media_types[] = {
+ SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO),
+ SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO),
+ SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE),
+};
+
+static const struct sdp_media_type *sdp_media_type(const char *dptr,
+ unsigned int matchoff,
+ unsigned int matchlen)
+{
+ const struct sdp_media_type *t;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(sdp_media_types); i++) {
+ t = &sdp_media_types[i];
+ if (matchlen < t->len ||
+ strncmp(dptr + matchoff, t->name, t->len))
+ continue;
+ return t;
+ }
+ return NULL;
+}
+
+static int process_sdp(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen;
+ unsigned int mediaoff, medialen;
+ unsigned int sdpoff;
+ unsigned int caddr_len, maddr_len;
+ unsigned int i;
+ union nf_inet_addr caddr, maddr, rtp_addr;
+ const struct nf_nat_sip_hooks *hooks;
+ unsigned int port;
+ const struct sdp_media_type *t;
+ int ret = NF_ACCEPT;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+
+ /* Find beginning of session description */
+ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+ SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+ &matchoff, &matchlen) <= 0)
+ return NF_ACCEPT;
+ sdpoff = matchoff;
+
+ /* The connection information is contained in the session description
+ * and/or once per media description. The first media description marks
+ * the end of the session description. */
+ caddr_len = 0;
+ if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ &matchoff, &matchlen, &caddr) > 0)
+ caddr_len = matchlen;
+
+ mediaoff = sdpoff;
+ for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) {
+ if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen,
+ SDP_HDR_MEDIA, SDP_HDR_UNSPEC,
+ &mediaoff, &medialen) <= 0)
+ break;
+
+ /* Get media type and port number. A media port value of zero
+ * indicates an inactive stream. */
+ t = sdp_media_type(*dptr, mediaoff, medialen);
+ if (!t) {
+ mediaoff += medialen;
+ continue;
+ }
+ mediaoff += t->len;
+ medialen -= t->len;
+
+ port = simple_strtoul(*dptr + mediaoff, NULL, 10);
+ if (port == 0)
+ continue;
+ if (port < 1024 || port > 65535) {
+ nf_ct_helper_log(skb, ct, "wrong port %u", port);
+ return NF_DROP;
+ }
+
+ /* The media description overrides the session description. */
+ maddr_len = 0;
+ if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ &matchoff, &matchlen, &maddr) > 0) {
+ maddr_len = matchlen;
+ memcpy(&rtp_addr, &maddr, sizeof(rtp_addr));
+ } else if (caddr_len)
+ memcpy(&rtp_addr, &caddr, sizeof(rtp_addr));
+ else {
+ nf_ct_helper_log(skb, ct, "cannot parse SDP message");
+ return NF_DROP;
+ }
+
+ ret = set_expected_rtp_rtcp(skb, protoff, dataoff,
+ dptr, datalen,
+ &rtp_addr, htons(port), t->class,
+ mediaoff, medialen);
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct,
+ "cannot add expectation for voice");
+ return ret;
+ }
+
+ /* Update media connection address if present */
+ if (maddr_len && hooks && ct->status & IPS_NAT_MASK) {
+ ret = hooks->sdp_addr(skb, protoff, dataoff,
+ dptr, datalen, mediaoff,
+ SDP_HDR_CONNECTION,
+ SDP_HDR_MEDIA,
+ &rtp_addr);
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP");
+ return ret;
+ }
+ }
+ i++;
+ }
+
+ /* Update session connection and owner addresses */
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->sdp_session(skb, protoff, dataoff,
+ dptr, datalen, sdpoff,
+ &rtp_addr);
+
+ return ret;
+}
+static int process_invite_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq, unsigned int code)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+ if ((code >= 100 && code <= 199) ||
+ (code >= 200 && code <= 299))
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
+ flush_expectations(ct, true);
+ return NF_ACCEPT;
+}
+
+static int process_update_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq, unsigned int code)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+ if ((code >= 100 && code <= 199) ||
+ (code >= 200 && code <= 299))
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
+ flush_expectations(ct, true);
+ return NF_ACCEPT;
+}
+
+static int process_prack_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq, unsigned int code)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+
+ if ((code >= 100 && code <= 199) ||
+ (code >= 200 && code <= 299))
+ return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ else if (ct_sip_info->invite_cseq == cseq)
+ flush_expectations(ct, true);
+ return NF_ACCEPT;
+}
+
+static int process_invite_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ unsigned int ret;
+
+ flush_expectations(ct, true);
+ ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
+ if (ret == NF_ACCEPT)
+ ct_sip_info->invite_cseq = cseq;
+ return ret;
+}
+
+static int process_bye_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ flush_expectations(ct, true);
+ return NF_ACCEPT;
+}
+
+/* Parse a REGISTER request and create a permanent expectation for incoming
+ * signalling connections. The expectation is marked inactive and is activated
+ * when receiving a response indicating success from the registrar.
+ */
+static int process_register_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int matchoff, matchlen;
+ struct nf_conntrack_expect *exp;
+ union nf_inet_addr *saddr, daddr;
+ const struct nf_nat_sip_hooks *hooks;
+ __be16 port;
+ u8 proto;
+ unsigned int expires = 0;
+ int ret;
+
+ /* Expected connections can not register again. */
+ if (ct->status & IPS_EXPECTED)
+ return NF_ACCEPT;
+
+ /* We must check the expiration time: a value of zero signals the
+ * registrar to release the binding. We'll remove our expectation
+ * when receiving the new bindings in the response, but we don't
+ * want to create new ones.
+ *
+ * The expiration time may be contained in Expires: header, the
+ * Contact: header parameters or the URI parameters.
+ */
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+ &matchoff, &matchlen) > 0)
+ expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+ ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ SIP_HDR_CONTACT, NULL,
+ &matchoff, &matchlen, &daddr, &port);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
+ return NF_DROP;
+ } else if (ret == 0)
+ return NF_ACCEPT;
+
+ /* We don't support third-party registrations */
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr))
+ return NF_ACCEPT;
+
+ if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen,
+ &proto) == 0)
+ return NF_ACCEPT;
+
+ if (ct_sip_parse_numerical_param(ct, *dptr,
+ matchoff + matchlen, *datalen,
+ "expires=", NULL, NULL, &expires) < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
+ return NF_DROP;
+ }
+
+ if (expires == 0) {
+ ret = NF_ACCEPT;
+ goto store_cseq;
+ }
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ return NF_DROP;
+ }
+
+ saddr = NULL;
+ if (sip_direct_signalling)
+ saddr = &ct->tuplehash[!dir].tuple.src.u3;
+
+ nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
+ saddr, &daddr, proto, NULL, &port);
+ exp->timeout.expires = sip_timeout * HZ;
+ exp->helper = nfct_help(ct)->helper;
+ exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && ct->status & IPS_NAT_MASK)
+ ret = hooks->expect(skb, protoff, dataoff, dptr, datalen,
+ exp, matchoff, matchlen);
+ else {
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ } else
+ ret = NF_ACCEPT;
+ }
+ nf_ct_expect_put(exp);
+
+store_cseq:
+ if (ret == NF_ACCEPT)
+ ct_sip_info->register_cseq = cseq;
+ return ret;
+}
+
+static int process_register_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int cseq, unsigned int code)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ union nf_inet_addr addr;
+ __be16 port;
+ u8 proto;
+ unsigned int matchoff, matchlen, coff = 0;
+ unsigned int expires = 0;
+ int in_contact = 0, ret;
+
+ /* According to RFC 3261, "UAs MUST NOT send a new registration until
+ * they have received a final response from the registrar for the
+ * previous one or the previous REGISTER request has timed out".
+ *
+ * However, some servers fail to detect retransmissions and send late
+ * responses, so we store the sequence number of the last valid
+ * request and compare it here.
+ */
+ if (ct_sip_info->register_cseq != cseq)
+ return NF_ACCEPT;
+
+ if (code >= 100 && code <= 199)
+ return NF_ACCEPT;
+ if (code < 200 || code > 299)
+ goto flush;
+
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+ &matchoff, &matchlen) > 0)
+ expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+ while (1) {
+ unsigned int c_expires = expires;
+
+ ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+ SIP_HDR_CONTACT, &in_contact,
+ &matchoff, &matchlen,
+ &addr, &port);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse contact");
+ return NF_DROP;
+ } else if (ret == 0)
+ break;
+
+ /* We don't support third-party registrations */
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr))
+ continue;
+
+ if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen,
+ *datalen, &proto) == 0)
+ continue;
+
+ ret = ct_sip_parse_numerical_param(ct, *dptr,
+ matchoff + matchlen,
+ *datalen, "expires=",
+ NULL, NULL, &c_expires);
+ if (ret < 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse expires");
+ return NF_DROP;
+ }
+ if (c_expires == 0)
+ break;
+ if (refresh_signalling_expectation(ct, &addr, proto, port,
+ c_expires))
+ return NF_ACCEPT;
+ }
+
+flush:
+ flush_expectations(ct, false);
+ return NF_ACCEPT;
+}
+
+static const struct sip_handler sip_handlers[] = {
+ SIP_HANDLER("INVITE", process_invite_request, process_invite_response),
+ SIP_HANDLER("UPDATE", process_sdp, process_update_response),
+ SIP_HANDLER("ACK", process_sdp, NULL),
+ SIP_HANDLER("PRACK", process_sdp, process_prack_response),
+ SIP_HANDLER("BYE", process_bye_request, NULL),
+ SIP_HANDLER("REGISTER", process_register_request, process_register_response),
+};
+
+static int process_sip_response(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen, matchend;
+ unsigned int code, cseq, i;
+
+ if (*datalen < strlen("SIP/2.0 200"))
+ return NF_ACCEPT;
+ code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10);
+ if (!code) {
+ nf_ct_helper_log(skb, ct, "cannot get code");
+ return NF_DROP;
+ }
+
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+ &matchoff, &matchlen) <= 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse cseq");
+ return NF_DROP;
+ }
+ cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
+ if (!cseq) {
+ nf_ct_helper_log(skb, ct, "cannot get cseq");
+ return NF_DROP;
+ }
+ matchend = matchoff + matchlen + 1;
+
+ for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
+ const struct sip_handler *handler;
+
+ handler = &sip_handlers[i];
+ if (handler->response == NULL)
+ continue;
+ if (*datalen < matchend + handler->len ||
+ strncasecmp(*dptr + matchend, handler->method, handler->len))
+ continue;
+ return handler->response(skb, protoff, dataoff, dptr, datalen,
+ cseq, code);
+ }
+ return NF_ACCEPT;
+}
+
+static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned int matchoff, matchlen;
+ unsigned int cseq, i;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ /* Many Cisco IP phones use a high source port for SIP requests, but
+ * listen for the response on port 5060. If we are the local
+ * router for one of these phones, save the port number from the
+ * Via: header so that nf_nat_sip can redirect the responses to
+ * the correct port.
+ */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ SIP_HDR_VIA_UDP, NULL, &matchoff,
+ &matchlen, &addr, &port) > 0 &&
+ port != ct->tuplehash[dir].tuple.src.u.udp.port &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3))
+ ct_sip_info->forced_dport = port;
+
+ for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
+ const struct sip_handler *handler;
+
+ handler = &sip_handlers[i];
+ if (handler->request == NULL)
+ continue;
+ if (*datalen < handler->len ||
+ strncasecmp(*dptr, handler->method, handler->len))
+ continue;
+
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+ &matchoff, &matchlen) <= 0) {
+ nf_ct_helper_log(skb, ct, "cannot parse cseq");
+ return NF_DROP;
+ }
+ cseq = simple_strtoul(*dptr + matchoff, NULL, 10);
+ if (!cseq) {
+ nf_ct_helper_log(skb, ct, "cannot get cseq");
+ return NF_DROP;
+ }
+
+ return handler->request(skb, protoff, dataoff, dptr, datalen,
+ cseq);
+ }
+ return NF_ACCEPT;
+}
+
+static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,
+ unsigned int protoff, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ const struct nf_nat_sip_hooks *hooks;
+ int ret;
+
+ if (strncasecmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0)
+ ret = process_sip_request(skb, protoff, dataoff, dptr, datalen);
+ else
+ ret = process_sip_response(skb, protoff, dataoff, dptr, datalen);
+
+ if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks && !hooks->msg(skb, protoff, dataoff,
+ dptr, datalen)) {
+ nf_ct_helper_log(skb, ct, "cannot NAT SIP message");
+ ret = NF_DROP;
+ }
+ }
+
+ return ret;
+}
+
+static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ struct tcphdr *th, _tcph;
+ unsigned int dataoff, datalen;
+ unsigned int matchoff, matchlen, clen;
+ unsigned int msglen, origlen;
+ const char *dptr, *end;
+ s16 diff, tdiff = 0;
+ int ret = NF_ACCEPT;
+ bool term;
+
+ if (ctinfo != IP_CT_ESTABLISHED &&
+ ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return NF_ACCEPT;
+
+ /* No Data ? */
+ th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+ dataoff = protoff + th->doff * 4;
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+ if (datalen < strlen("SIP/2.0 200"))
+ return NF_ACCEPT;
+
+ while (1) {
+ if (ct_sip_get_header(ct, dptr, 0, datalen,
+ SIP_HDR_CONTENT_LENGTH,
+ &matchoff, &matchlen) <= 0)
+ break;
+
+ clen = simple_strtoul(dptr + matchoff, (char **)&end, 10);
+ if (dptr + matchoff == end)
+ break;
+
+ term = false;
+ for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) {
+ if (end[0] == '\r' && end[1] == '\n' &&
+ end[2] == '\r' && end[3] == '\n') {
+ term = true;
+ break;
+ }
+ }
+ if (!term)
+ break;
+ end += strlen("\r\n\r\n") + clen;
+
+ msglen = origlen = end - dptr;
+ if (msglen > datalen)
+ return NF_ACCEPT;
+
+ ret = process_sip_msg(skb, ct, protoff, dataoff,
+ &dptr, &msglen);
+ /* process_sip_* functions report why this packet is dropped */
+ if (ret != NF_ACCEPT)
+ break;
+ diff = msglen - origlen;
+ tdiff += diff;
+
+ dataoff += msglen;
+ dptr += msglen;
+ datalen = datalen + diff - msglen;
+ }
+
+ if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ const struct nf_nat_sip_hooks *hooks;
+
+ hooks = rcu_dereference(nf_nat_sip_hooks);
+ if (hooks)
+ hooks->seq_adjust(skb, protoff, tdiff);
+ }
+
+ return ret;
+}
+
+static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ const char *dptr;
+
+ /* No Data ? */
+ dataoff = protoff + sizeof(struct udphdr);
+ if (dataoff >= skb->len)
+ return NF_ACCEPT;
+
+ nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+ if (datalen < strlen("SIP/2.0 200"))
+ return NF_ACCEPT;
+
+ return process_sip_msg(skb, ct, protoff, dataoff, &dptr, &datalen);
+}
+
+static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
+
+static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
+ [SIP_EXPECT_SIGNALLING] = {
+ .name = "signalling",
+ .max_expected = 1,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_AUDIO] = {
+ .name = "audio",
+ .max_expected = 2 * IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_VIDEO] = {
+ .name = "video",
+ .max_expected = 2 * IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+ [SIP_EXPECT_IMAGE] = {
+ .name = "image",
+ .max_expected = IP_CT_DIR_MAX,
+ .timeout = 3 * 60,
+ },
+};
+
+static void nf_conntrack_sip_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+ if (sip[i][j].me == NULL)
+ continue;
+ nf_conntrack_helper_unregister(&sip[i][j]);
+ }
+ }
+}
+
+static int __init nf_conntrack_sip_init(void)
+{
+ int i, j, ret;
+
+ if (ports_c == 0)
+ ports[ports_c++] = SIP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ memset(&sip[i], 0, sizeof(sip[i]));
+
+ sip[i][0].tuple.src.l3num = AF_INET;
+ sip[i][0].tuple.dst.protonum = IPPROTO_UDP;
+ sip[i][0].help = sip_help_udp;
+ sip[i][1].tuple.src.l3num = AF_INET;
+ sip[i][1].tuple.dst.protonum = IPPROTO_TCP;
+ sip[i][1].help = sip_help_tcp;
+
+ sip[i][2].tuple.src.l3num = AF_INET6;
+ sip[i][2].tuple.dst.protonum = IPPROTO_UDP;
+ sip[i][2].help = sip_help_udp;
+ sip[i][3].tuple.src.l3num = AF_INET6;
+ sip[i][3].tuple.dst.protonum = IPPROTO_TCP;
+ sip[i][3].help = sip_help_tcp;
+
+ for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+ sip[i][j].data_len = sizeof(struct nf_ct_sip_master);
+ sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
+ sip[i][j].expect_policy = sip_exp_policy;
+ sip[i][j].expect_class_max = SIP_EXPECT_MAX;
+ sip[i][j].me = THIS_MODULE;
+
+ if (ports[i] == SIP_PORT)
+ sprintf(sip[i][j].name, "sip");
+ else
+ sprintf(sip[i][j].name, "sip-%u", i);
+
+ pr_debug("port #%u: %u\n", i, ports[i]);
+
+ ret = nf_conntrack_helper_register(&sip[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_sip: failed to register"
+ " helper for pf: %u port: %u\n",
+ sip[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_sip_fini();
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+module_init(nf_conntrack_sip_init);
+module_exit(nf_conntrack_sip_fini);
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 000000000..87b95a2c2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,78 @@
+/*
+ * SNMP service broadcast connection tracking helper
+ *
+ * (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+#define SNMP_PORT 161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+ nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+ nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+ if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+ return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+ return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+ .max_expected = 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+ .name = "snmp",
+ .tuple.src.l3num = NFPROTO_IPV4,
+ .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .help = snmp_conntrack_help,
+ .expect_policy = &exp_policy,
+};
+
+static int __init nf_conntrack_snmp_init(void)
+{
+ exp_policy.timeout = timeout;
+ return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_snmp_fini(void)
+{
+ nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
new file mode 100644
index 000000000..fc823fa5d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -0,0 +1,613 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
+
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_NF_CONNTRACK_PROCFS
+void
+print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_l4proto *l4proto)
+{
+ l3proto->print_tuple(s, tuple);
+ l4proto->print_tuple(s, tuple);
+}
+EXPORT_SYMBOL_GPL(print_tuple);
+
+struct ct_iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+ u_int64_t time_now;
+};
+
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+ struct hlist_nulls_node *n;
+
+ for (st->bucket = 0;
+ st->bucket < net->ct.htable_size;
+ st->bucket++) {
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ if (!is_a_nulls(n))
+ return n;
+ }
+ return NULL;
+}
+
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+ struct hlist_nulls_node *head)
+{
+ struct net *net = seq_file_net(seq);
+ struct ct_iter_state *st = seq->private;
+
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
+ while (is_a_nulls(head)) {
+ if (likely(get_nulls_value(head) == st->bucket)) {
+ if (++st->bucket >= net->ct.htable_size)
+ return NULL;
+ }
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
+ }
+ return head;
+}
+
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_nulls_node *head = ct_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct ct_iter_state *st = seq->private;
+
+ st->time_now = ktime_get_real_ns();
+ rcu_read_lock();
+ return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_get_next(s, v);
+}
+
+static void ct_seq_stop(struct seq_file *s, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return;
+
+ seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+}
+#else
+static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+}
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+ struct ct_iter_state *st = s->private;
+ struct nf_conn_tstamp *tstamp;
+ s64 delta_time;
+
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp) {
+ delta_time = st->time_now - tstamp->start;
+ if (delta_time > 0)
+ delta_time = div_s64(delta_time, NSEC_PER_SEC);
+ else
+ delta_time = 0;
+
+ seq_printf(s, "delta-time=%llu ",
+ (unsigned long long)delta_time);
+ }
+ return;
+}
+#else
+static inline void
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+}
+#endif
+
+/* return 0 on success, 1 in case of error */
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_tuple_hash *hash = v;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+
+ NF_CT_ASSERT(ct);
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
+
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ goto release;
+
+ l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+ NF_CT_ASSERT(l3proto);
+ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ NF_CT_ASSERT(l4proto);
+
+ ret = -ENOSPC;
+ seq_printf(s, "%-8s %u %-8s %u %ld ",
+ l3proto->name, nf_ct_l3num(ct),
+ l4proto->name, nf_ct_protonum(ct),
+ timer_pending(&ct->timeout)
+ ? (long)(ct->timeout.expires - jiffies)/HZ : 0);
+
+ if (l4proto->print_conntrack)
+ l4proto->print_conntrack(s, ct);
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, l4proto);
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+ goto release;
+
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
+ seq_printf(s, "[UNREPLIED] ");
+
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, l4proto);
+
+ if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+ goto release;
+
+ if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ seq_printf(s, "[ASSURED] ");
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ seq_printf(s, "mark=%u ", ct->mark);
+#endif
+
+ ct_show_secctx(s, ct);
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ seq_printf(s, "zone=%u ", nf_ct_zone(ct));
+#endif
+
+ ct_show_delta_time(s, ct);
+
+ seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+
+ if (seq_has_overflowed(s))
+ goto release;
+
+ ret = 0;
+release:
+ nf_ct_put(ct);
+ return ret;
+}
+
+static const struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_seq_ops,
+ sizeof(struct ct_iter_state));
+}
+
+static const struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ct.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ const struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ nr_conntracks,
+ st->searched,
+ st->found,
+ st->new,
+ st->invalid,
+ st->ignore,
+ st->delete,
+ st->delete_list,
+ st->insert,
+ st->insert_failed,
+ st->drop,
+ st->early_drop,
+ st->error,
+
+ st->expect_new,
+ st->expect_create,
+ st->expect_delete,
+ st->search_restart
+ );
+ return 0;
+}
+
+static const struct seq_operations ct_cpu_seq_ops = {
+ .start = ct_cpu_seq_start,
+ .next = ct_cpu_seq_next,
+ .stop = ct_cpu_seq_stop,
+ .show = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ct_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ct_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ct_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
+ if (!pde)
+ goto out_nf_conntrack;
+
+ pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+ &ct_cpu_seq_fops);
+ if (!pde)
+ goto out_stat_nf_conntrack;
+ return 0;
+
+out_stat_nf_conntrack:
+ remove_proc_entry("nf_conntrack", net->proc_net);
+out_nf_conntrack:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{
+ remove_proc_entry("nf_conntrack", net->proc_net_stat);
+ remove_proc_entry("nf_conntrack", net->proc_net);
+}
+#else
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{
+}
+#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
+/* Log invalid packets of a given protocol */
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table_header *nf_ct_netfilter_header;
+
+static struct ctl_table nf_ct_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_max",
+ .data = &nf_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_count",
+ .data = &init_net.ct.count,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_buckets",
+ .data = &init_net.ct.htable_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_checksum",
+ .data = &init_net.ct.sysctl_checksum,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nf_conntrack_log_invalid",
+ .data = &init_net.ct.sysctl_log_invalid,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &log_invalid_proto_min,
+ .extra2 = &log_invalid_proto_max,
+ },
+ {
+ .procname = "nf_conntrack_expect_max",
+ .data = &nf_ct_expect_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+#define NET_NF_CONNTRACK_MAX 2089
+
+static struct ctl_table nf_ct_netfilter_table[] = {
+ {
+ .procname = "nf_conntrack_max",
+ .data = &nf_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out_kmemdup;
+
+ table[1].data = &net->ct.count;
+ table[2].data = &net->ct.htable_size;
+ table[3].data = &net->ct.sysctl_checksum;
+ table[4].data = &net->ct.sysctl_log_invalid;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
+ if (!net->ct.sysctl_header)
+ goto out_unregister_netfilter;
+
+ return 0;
+
+out_unregister_netfilter:
+ kfree(table);
+out_kmemdup:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+static int nf_conntrack_pernet_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_conntrack_init_net(net);
+ if (ret < 0)
+ goto out_init;
+
+ ret = nf_conntrack_standalone_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ net->ct.sysctl_checksum = 1;
+ net->ct.sysctl_log_invalid = 0;
+ ret = nf_conntrack_standalone_init_sysctl(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+ nf_conntrack_standalone_fini_proc(net);
+out_proc:
+ nf_conntrack_cleanup_net(net);
+out_init:
+ return ret;
+}
+
+static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_conntrack_standalone_fini_sysctl(net);
+ nf_conntrack_standalone_fini_proc(net);
+ }
+ nf_conntrack_cleanup_net_list(net_exit_list);
+}
+
+static struct pernet_operations nf_conntrack_net_ops = {
+ .init = nf_conntrack_pernet_init,
+ .exit_batch = nf_conntrack_pernet_exit,
+};
+
+static int __init nf_conntrack_standalone_init(void)
+{
+ int ret = nf_conntrack_init_start();
+ if (ret < 0)
+ goto out_start;
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_netfilter_header =
+ register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
+ if (!nf_ct_netfilter_header) {
+ pr_err("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto out_sysctl;
+ }
+#endif
+
+ ret = register_pernet_subsys(&nf_conntrack_net_ops);
+ if (ret < 0)
+ goto out_pernet;
+
+ nf_conntrack_init_end();
+ return 0;
+
+out_pernet:
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+out_sysctl:
+#endif
+ nf_conntrack_cleanup_end();
+out_start:
+ return ret;
+}
+
+static void __exit nf_conntrack_standalone_fini(void)
+{
+ nf_conntrack_cleanup_start();
+ unregister_pernet_subsys(&nf_conntrack_net_ops);
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(nf_ct_netfilter_header);
+#endif
+ nf_conntrack_cleanup_end();
+}
+
+module_init(nf_conntrack_standalone_init);
+module_exit(nf_conntrack_standalone_fini);
+
+/* Some modules need us, but don't depend directly on any symbol.
+ They should call this. */
+void need_conntrack(void)
+{
+}
+EXPORT_SYMBOL_GPL(need_conntrack);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
new file mode 100644
index 000000000..e68ab4fbd
--- /dev/null
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -0,0 +1,153 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_tftp");
+MODULE_ALIAS_NFCT_HELPER("tftp");
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "Port numbers of TFTP servers");
+
+unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_tftp_hook);
+
+static int tftp_help(struct sk_buff *skb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ const struct tftphdr *tfh;
+ struct tftphdr _tftph;
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_tuple *tuple;
+ unsigned int ret = NF_ACCEPT;
+ typeof(nf_nat_tftp_hook) nf_nat_tftp;
+
+ tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr),
+ sizeof(_tftph), &_tftph);
+ if (tfh == NULL)
+ return NF_ACCEPT;
+
+ switch (ntohs(tfh->opcode)) {
+ case TFTP_OPCODE_READ:
+ case TFTP_OPCODE_WRITE:
+ /* RRQ and WRQ works the same way */
+ nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ nf_ct_helper_log(skb, ct, "cannot alloc expectation");
+ return NF_DROP;
+ }
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+ nf_ct_l3num(ct),
+ &tuple->src.u3, &tuple->dst.u3,
+ IPPROTO_UDP, NULL, &tuple->dst.u.udp.port);
+
+ pr_debug("expect: ");
+ nf_ct_dump_tuple(&exp->tuple);
+
+ nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
+ if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
+ ret = nf_nat_tftp(skb, ctinfo, exp);
+ else if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, ct, "cannot add expectation");
+ ret = NF_DROP;
+ }
+ nf_ct_expect_put(exp);
+ break;
+ case TFTP_OPCODE_DATA:
+ case TFTP_OPCODE_ACK:
+ pr_debug("Data/ACK opcode\n");
+ break;
+ case TFTP_OPCODE_ERROR:
+ pr_debug("Error opcode\n");
+ break;
+ default:
+ pr_debug("Unknown opcode\n");
+ }
+ return ret;
+}
+
+static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
+
+static const struct nf_conntrack_expect_policy tftp_exp_policy = {
+ .max_expected = 1,
+ .timeout = 5 * 60,
+};
+
+static void nf_conntrack_tftp_fini(void)
+{
+ int i, j;
+
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++)
+ nf_conntrack_helper_unregister(&tftp[i][j]);
+ }
+}
+
+static int __init nf_conntrack_tftp_init(void)
+{
+ int i, j, ret;
+
+ if (ports_c == 0)
+ ports[ports_c++] = TFTP_PORT;
+
+ for (i = 0; i < ports_c; i++) {
+ memset(&tftp[i], 0, sizeof(tftp[i]));
+
+ tftp[i][0].tuple.src.l3num = AF_INET;
+ tftp[i][1].tuple.src.l3num = AF_INET6;
+ for (j = 0; j < 2; j++) {
+ tftp[i][j].tuple.dst.protonum = IPPROTO_UDP;
+ tftp[i][j].tuple.src.u.udp.port = htons(ports[i]);
+ tftp[i][j].expect_policy = &tftp_exp_policy;
+ tftp[i][j].me = THIS_MODULE;
+ tftp[i][j].help = tftp_help;
+
+ if (ports[i] == TFTP_PORT)
+ sprintf(tftp[i][j].name, "tftp");
+ else
+ sprintf(tftp[i][j].name, "tftp-%u", i);
+
+ ret = nf_conntrack_helper_register(&tftp[i][j]);
+ if (ret) {
+ printk(KERN_ERR "nf_ct_tftp: failed to register"
+ " helper for pf: %u port: %u\n",
+ tftp[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_tftp_fini();
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+module_init(nf_conntrack_tftp_init);
+module_exit(nf_conntrack_tftp_fini);
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
new file mode 100644
index 000000000..93da609d9
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -0,0 +1,51 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+struct ctnl_timeout *
+(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
+
+void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+
+static struct nf_ct_ext_type timeout_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_timeout),
+ .align = __alignof__(struct nf_conn_timeout),
+ .id = NF_CT_EXT_TIMEOUT,
+};
+
+int nf_conntrack_timeout_init(void)
+{
+ int ret = nf_ct_extend_register(&timeout_extend);
+ if (ret < 0)
+ pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
+ return ret;
+}
+
+void nf_conntrack_timeout_fini(void)
+{
+ nf_ct_extend_unregister(&timeout_extend);
+}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 000000000..7a394df0d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,114 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+static bool nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table tstamp_sysctl_table[] = {
+ {
+ .procname = "nf_conntrack_timestamp",
+ .data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_tstamp),
+ .align = __alignof__(struct nf_conn_tstamp),
+ .id = NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_tstamp;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter",
+ table);
+ if (!net->ct.tstamp_sysctl_header) {
+ printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_tstamp_pernet_init(struct net *net)
+{
+ net->ct.sysctl_tstamp = nf_ct_tstamp;
+ return nf_conntrack_tstamp_init_sysctl(net);
+}
+
+void nf_conntrack_tstamp_pernet_fini(struct net *net)
+{
+ nf_conntrack_tstamp_fini_sysctl(net);
+}
+
+int nf_conntrack_tstamp_init(void)
+{
+ int ret;
+ ret = nf_ct_extend_register(&tstamp_extend);
+ if (ret < 0)
+ pr_err("nf_ct_tstamp: Unable to register extension\n");
+ return ret;
+}
+
+void nf_conntrack_tstamp_fini(void)
+{
+ nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 000000000..ea7f36784
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,27 @@
+#ifndef _NF_INTERNALS_H
+#define _NF_INTERNALS_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args)
+#else
+#define NFDEBUG(format, args...)
+#endif
+
+
+/* core.c */
+unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
+ struct nf_hook_state *state, struct nf_hook_ops **elemp);
+
+/* nf_queue.c */
+int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem,
+ struct nf_hook_state *state, unsigned int queuenum);
+int __init netfilter_queue_init(void);
+
+/* nf_log.c */
+int __init netfilter_log_init(void);
+
+#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 000000000..675d12c69
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,534 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_log.h>
+
+#include "nf_internals.h"
+
+/* Internal logging interface, which relies on the real
+ LOG target modules */
+
+#define NF_LOG_PREFIXLEN 128
+#define NFLOGGER_NAME_LEN 64
+
+static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+static DEFINE_MUTEX(nf_log_mutex);
+
+#define nft_log_dereference(logger) \
+ rcu_dereference_protected(logger, lockdep_is_held(&nf_log_mutex))
+
+static struct nf_logger *__find_logger(int pf, const char *str_logger)
+{
+ struct nf_logger *log;
+ int i;
+
+ for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+ if (loggers[pf][i] == NULL)
+ continue;
+
+ log = nft_log_dereference(loggers[pf][i]);
+ if (!strncasecmp(str_logger, log->name, strlen(log->name)))
+ return log;
+ }
+
+ return NULL;
+}
+
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+ const struct nf_logger *log;
+
+ if (pf == NFPROTO_UNSPEC)
+ return;
+
+ mutex_lock(&nf_log_mutex);
+ log = nft_log_dereference(net->nf.nf_loggers[pf]);
+ if (log == NULL)
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
+{
+ int i;
+ const struct nf_logger *log;
+
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ log = nft_log_dereference(net->nf.nf_loggers[i]);
+ if (log == logger)
+ RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL);
+ }
+ mutex_unlock(&nf_log_mutex);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
+/* return EEXIST if the same logger is registered, 0 on success. */
+int nf_log_register(u_int8_t pf, struct nf_logger *logger)
+{
+ int i;
+ int ret = 0;
+
+ if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
+ return -EINVAL;
+
+ mutex_lock(&nf_log_mutex);
+
+ if (pf == NFPROTO_UNSPEC) {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ if (rcu_access_pointer(loggers[i][logger->type])) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+ rcu_assign_pointer(loggers[i][logger->type], logger);
+ } else {
+ if (rcu_access_pointer(loggers[pf][logger->type])) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+ rcu_assign_pointer(loggers[pf][logger->type], logger);
+ }
+
+unlock:
+ mutex_unlock(&nf_log_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(nf_log_register);
+
+void nf_log_unregister(struct nf_logger *logger)
+{
+ int i;
+
+ mutex_lock(&nf_log_mutex);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ RCU_INIT_POINTER(loggers[i][logger->type], NULL);
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_unregister);
+
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+ const struct nf_logger *logger)
+{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return -EINVAL;
+ mutex_lock(&nf_log_mutex);
+ if (__find_logger(pf, logger->name) == NULL) {
+ mutex_unlock(&nf_log_mutex);
+ return -ENOENT;
+ }
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+ mutex_unlock(&nf_log_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(nf_log_bind_pf);
+
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
+{
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
+ return;
+ mutex_lock(&nf_log_mutex);
+ RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
+ mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_unbind_pf);
+
+void nf_logger_request_module(int pf, enum nf_log_type type)
+{
+ if (loggers[pf][type] == NULL)
+ request_module("nf-logger-%u-%u", pf, type);
+}
+EXPORT_SYMBOL_GPL(nf_logger_request_module);
+
+int nf_logger_find_get(int pf, enum nf_log_type type)
+{
+ struct nf_logger *logger;
+ int ret = -ENOENT;
+
+ if (rcu_access_pointer(loggers[pf][type]) == NULL)
+ request_module("nf-logger-%u-%u", pf, type);
+
+ rcu_read_lock();
+ logger = rcu_dereference(loggers[pf][type]);
+ if (logger == NULL)
+ goto out;
+
+ if (logger && try_module_get(logger->me))
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_logger_find_get);
+
+void nf_logger_put(int pf, enum nf_log_type type)
+{
+ struct nf_logger *logger;
+
+ BUG_ON(loggers[pf][type] == NULL);
+
+ rcu_read_lock();
+ logger = rcu_dereference(loggers[pf][type]);
+ module_put(logger->me);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_logger_put);
+
+void nf_log_packet(struct net *net,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *fmt, ...)
+{
+ va_list args;
+ char prefix[NF_LOG_PREFIXLEN];
+ const struct nf_logger *logger;
+
+ rcu_read_lock();
+ if (loginfo != NULL)
+ logger = rcu_dereference(loggers[pf][loginfo->type]);
+ else
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
+
+ if (logger) {
+ va_start(args, fmt);
+ vsnprintf(prefix, sizeof(prefix), fmt, args);
+ va_end(args);
+ logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_packet);
+
+void nf_log_trace(struct net *net,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo, const char *fmt, ...)
+{
+ va_list args;
+ char prefix[NF_LOG_PREFIXLEN];
+ const struct nf_logger *logger;
+
+ rcu_read_lock();
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
+ if (logger) {
+ va_start(args, fmt);
+ vsnprintf(prefix, sizeof(prefix), fmt, args);
+ va_end(args);
+ logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_trace);
+
+#define S_SIZE (1024 - (sizeof(unsigned int) + 1))
+
+struct nf_log_buf {
+ unsigned int count;
+ char buf[S_SIZE + 1];
+};
+static struct nf_log_buf emergency, *emergency_ptr = &emergency;
+
+__printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...)
+{
+ va_list args;
+ int len;
+
+ if (likely(m->count < S_SIZE)) {
+ va_start(args, f);
+ len = vsnprintf(m->buf + m->count, S_SIZE - m->count, f, args);
+ va_end(args);
+ if (likely(m->count + len < S_SIZE)) {
+ m->count += len;
+ return 0;
+ }
+ }
+ m->count = S_SIZE;
+ printk_once(KERN_ERR KBUILD_MODNAME " please increase S_SIZE\n");
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nf_log_buf_add);
+
+struct nf_log_buf *nf_log_buf_open(void)
+{
+ struct nf_log_buf *m = kmalloc(sizeof(*m), GFP_ATOMIC);
+
+ if (unlikely(!m)) {
+ local_bh_disable();
+ do {
+ m = xchg(&emergency_ptr, NULL);
+ } while (!m);
+ }
+ m->count = 0;
+ return m;
+}
+EXPORT_SYMBOL_GPL(nf_log_buf_open);
+
+void nf_log_buf_close(struct nf_log_buf *m)
+{
+ m->buf[m->count] = 0;
+ printk("%s\n", m->buf);
+
+ if (likely(m != &emergency))
+ kfree(m);
+ else {
+ emergency_ptr = m;
+ local_bh_enable();
+ }
+}
+EXPORT_SYMBOL_GPL(nf_log_buf_close);
+
+#ifdef CONFIG_PROC_FS
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+
+ mutex_lock(&nf_log_mutex);
+
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
+ return NULL;
+
+ return pos;
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(s);
+
+ (*pos)++;
+
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
+ return NULL;
+
+ return pos;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+ mutex_unlock(&nf_log_mutex);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ const struct nf_logger *logger;
+ int i;
+ struct net *net = seq_file_net(s);
+
+ logger = nft_log_dereference(net->nf.nf_loggers[*pos]);
+
+ if (!logger)
+ seq_printf(s, "%2lld NONE (", *pos);
+ else
+ seq_printf(s, "%2lld %s (", *pos, logger->name);
+
+ if (seq_has_overflowed(s))
+ return -ENOSPC;
+
+ for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+ if (loggers[*pos][i] == NULL)
+ continue;
+
+ logger = nft_log_dereference(loggers[*pos][i]);
+ seq_printf(s, "%s", logger->name);
+ if (i == 0 && loggers[*pos][i + 1] != NULL)
+ seq_printf(s, ",");
+
+ if (seq_has_overflowed(s))
+ return -ENOSPC;
+ }
+
+ seq_printf(s, ")\n");
+
+ if (seq_has_overflowed(s))
+ return -ENOSPC;
+ return 0;
+}
+
+static const struct seq_operations nflog_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nflog_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &nflog_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations nflog_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nflog_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+#endif /* PROC_FS */
+
+#ifdef CONFIG_SYSCTL
+static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+
+static int nf_log_proc_dostring(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ const struct nf_logger *logger;
+ char buf[NFLOGGER_NAME_LEN];
+ size_t size = *lenp;
+ int r = 0;
+ int tindex = (unsigned long)table->extra1;
+ struct net *net = current->nsproxy->net_ns;
+
+ if (write) {
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+ if (copy_from_user(buf, buffer, size))
+ return -EFAULT;
+
+ if (!strcmp(buf, "NONE")) {
+ nf_log_unbind_pf(net, tindex);
+ return 0;
+ }
+ mutex_lock(&nf_log_mutex);
+ logger = __find_logger(tindex, buf);
+ if (logger == NULL) {
+ mutex_unlock(&nf_log_mutex);
+ return -ENOENT;
+ }
+ rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
+ mutex_unlock(&nf_log_mutex);
+ } else {
+ mutex_lock(&nf_log_mutex);
+ logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
+ if (!logger)
+ table->data = "NONE";
+ else
+ table->data = logger->name;
+ r = proc_dostring(table, write, buffer, lenp, ppos);
+ mutex_unlock(&nf_log_mutex);
+ }
+
+ return r;
+}
+
+static int netfilter_log_sysctl_init(struct net *net)
+{
+ int i;
+ struct ctl_table *table;
+
+ table = nf_log_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_log_sysctl_table,
+ sizeof(nf_log_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ } else {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ snprintf(nf_log_sysctl_fnames[i],
+ 3, "%d", i);
+ nf_log_sysctl_table[i].procname =
+ nf_log_sysctl_fnames[i];
+ nf_log_sysctl_table[i].data = NULL;
+ nf_log_sysctl_table[i].maxlen = NFLOGGER_NAME_LEN;
+ nf_log_sysctl_table[i].mode = 0644;
+ nf_log_sysctl_table[i].proc_handler =
+ nf_log_proc_dostring;
+ nf_log_sysctl_table[i].extra1 =
+ (void *)(unsigned long) i;
+ }
+ }
+
+ net->nf.nf_log_dir_header = register_net_sysctl(net,
+ "net/netfilter/nf_log",
+ table);
+ if (!net->nf.nf_log_dir_header)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf.nf_log_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+#else
+static int netfilter_log_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+static int __net_init nf_log_net_init(struct net *net)
+{
+ int ret = -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nf_log", S_IRUGO,
+ net->nf.proc_netfilter, &nflog_file_ops))
+ return ret;
+#endif
+ ret = netfilter_log_sysctl_init(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+ return ret;
+}
+
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+ netfilter_log_sysctl_exit(net);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nf_log_net_ops = {
+ .init = nf_log_net_init,
+ .exit = nf_log_net_exit,
+};
+
+int __init netfilter_log_init(void)
+{
+ return register_pernet_subsys(&nf_log_net_ops);
+}
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
new file mode 100644
index 000000000..a5aa5967b
--- /dev/null
+++ b/net/netfilter/nf_log_common.c
@@ -0,0 +1,188 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+ u8 proto, int fragment, unsigned int offset)
+{
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ if (proto == IPPROTO_UDP)
+ /* Max length: 10 "PROTO=UDP " */
+ nf_log_buf_add(m, "PROTO=UDP ");
+ else /* Max length: 14 "PROTO=UDPLITE " */
+ nf_log_buf_add(m, "PROTO=UDPLITE ");
+
+ if (fragment)
+ goto out;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
+ ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
+
+out:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_udp_header);
+
+int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+ u8 proto, int fragment, unsigned int offset,
+ unsigned int logflags)
+{
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* Max length: 10 "PROTO=TCP " */
+ nf_log_buf_add(m, "PROTO=TCP ");
+
+ if (fragment)
+ return 0;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u ",
+ ntohs(th->source), ntohs(th->dest));
+ /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ if (logflags & XT_LOG_TCPSEQ) {
+ nf_log_buf_add(m, "SEQ=%u ACK=%u ",
+ ntohl(th->seq), ntohl(th->ack_seq));
+ }
+
+ /* Max length: 13 "WINDOW=65535 " */
+ nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
+ /* Max length: 9 "RES=0x3C " */
+ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+ TCP_RESERVED_BITS) >> 22));
+ /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->cwr)
+ nf_log_buf_add(m, "CWR ");
+ if (th->ece)
+ nf_log_buf_add(m, "ECE ");
+ if (th->urg)
+ nf_log_buf_add(m, "URG ");
+ if (th->ack)
+ nf_log_buf_add(m, "ACK ");
+ if (th->psh)
+ nf_log_buf_add(m, "PSH ");
+ if (th->rst)
+ nf_log_buf_add(m, "RST ");
+ if (th->syn)
+ nf_log_buf_add(m, "SYN ");
+ if (th->fin)
+ nf_log_buf_add(m, "FIN ");
+ /* Max length: 11 "URGP=65535 " */
+ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+ if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+ u_int8_t _opt[60 - sizeof(struct tcphdr)];
+ const u_int8_t *op;
+ unsigned int i;
+ unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
+
+ op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+ optsize, _opt);
+ if (op == NULL) {
+ nf_log_buf_add(m, "OPT (TRUNCATED)");
+ return 1;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+
+ nf_log_buf_add(m, ") ");
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
+
+void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
+{
+ if (!sk || !sk_fullsock(sk))
+ return;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ const struct cred *cred = sk->sk_socket->file->f_cred;
+ nf_log_buf_add(m, "UID=%u GID=%u ",
+ from_kuid_munged(&init_user_ns, cred->fsuid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid);
+
+void
+nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo, const char *prefix)
+{
+ nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "",
+ out ? out->name : "");
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (skb->nf_bridge) {
+ const struct net_device *physindev;
+ const struct net_device *physoutdev;
+
+ physindev = nf_bridge_get_physindev(skb);
+ if (physindev && in != physindev)
+ nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+ physoutdev = nf_bridge_get_physoutdev(skb);
+ if (physoutdev && out != physoutdev)
+ nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
+ }
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+
+static int __init nf_log_common_init(void)
+{
+ return 0;
+}
+
+static void __exit nf_log_common_exit(void) {}
+
+module_init(nf_log_common_init);
+module_exit(nf_log_common_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
new file mode 100644
index 000000000..eb772380a
--- /dev/null
+++ b/net/netfilter/nf_nat_amanda.c
@@ -0,0 +1,90 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_amanda");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ char buffer[sizeof("65535")];
+ u_int16_t port;
+ unsigned int ret;
+
+ /* Connection comes from client. */
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_ORIGINAL;
+
+ /* When you see the packet, we need to NAT it the same as the
+ * this one (ie. same IP: it will be TCP and master is UDP). */
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int res;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ res = nf_ct_expect_related(exp);
+ if (res == 0)
+ break;
+ else if (res != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
+ return NF_DROP;
+ }
+
+ sprintf(buffer, "%u", port);
+ ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, strlen(buffer));
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ }
+ return ret;
+}
+
+static void __exit nf_nat_amanda_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_amanda_init(void)
+{
+ BUG_ON(nf_nat_amanda_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_amanda_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_amanda_init);
+module_exit(nf_nat_amanda_fini);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
new file mode 100644
index 000000000..4e0b47831
--- /dev/null
+++ b/net/netfilter/nf_nat_core.c
@@ -0,0 +1,899 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <net/xfrm.h>
+#include <linux/jhash.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_nat.h>
+
+static DEFINE_SPINLOCK(nf_nat_lock);
+
+static DEFINE_MUTEX(nf_nat_proto_mutex);
+static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
+ __read_mostly;
+
+
+inline const struct nf_nat_l3proto *
+__nf_nat_l3proto_find(u8 family)
+{
+ return rcu_dereference(nf_nat_l3protos[family]);
+}
+
+inline const struct nf_nat_l4proto *
+__nf_nat_l4proto_find(u8 family, u8 protonum)
+{
+ return rcu_dereference(nf_nat_l4protos[family][protonum]);
+}
+EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);
+
+#ifdef CONFIG_XFRM
+static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ unsigned long statusbit;
+ u8 family;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return;
+
+ family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ rcu_read_lock();
+ l3proto = __nf_nat_l3proto_find(family);
+ if (l3proto == NULL)
+ goto out;
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir == IP_CT_DIR_ORIGINAL)
+ statusbit = IPS_DST_NAT;
+ else
+ statusbit = IPS_SRC_NAT;
+
+ l3proto->decode_session(skb, ct, dir, statusbit, fl);
+out:
+ rcu_read_unlock();
+}
+
+int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
+{
+ struct flowi fl;
+ unsigned int hh_len;
+ struct dst_entry *dst;
+ int err;
+
+ err = xfrm_decode_session(skb, &fl, family);
+ if (err < 0)
+ return err;
+
+ dst = skb_dst(skb);
+ if (dst->xfrm)
+ dst = ((struct xfrm_dst *)dst)->route;
+ dst_hold(dst);
+
+ dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(nf_xfrm_me_harder);
+#endif /* CONFIG_XFRM */
+
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ unsigned int hash;
+
+ /* Original src, to ensure we map it consistently if poss. */
+ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+ tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd);
+
+ return reciprocal_scale(hash, net->ct.nat_htable_size);
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ /* Conntrack tracking doesn't keep track of outgoing tuples; only
+ * incoming ones. NAT means they don't have a fixed mapping,
+ * so we invert the tuple and look for the incoming reply.
+ *
+ * We could keep a separate hash if this proves too slow.
+ */
+ struct nf_conntrack_tuple reply;
+
+ nf_ct_invert_tuplepr(&reply, tuple);
+ return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+EXPORT_SYMBOL(nf_nat_used_tuple);
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range.
+ */
+static int in_range(const struct nf_nat_l3proto *l3proto,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range)
+{
+ /* If we are supposed to map IPs, then we must be in the
+ * range specified, otherwise let this drag us onto a new src IP.
+ */
+ if (range->flags & NF_NAT_RANGE_MAP_IPS &&
+ !l3proto->in_range(tuple, range))
+ return 0;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
+ l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
+ &range->min_proto, &range->max_proto))
+ return 1;
+
+ return 0;
+}
+
+static inline int
+same_src(const struct nf_conn *ct,
+ const struct nf_conntrack_tuple *tuple)
+{
+ const struct nf_conntrack_tuple *t;
+
+ t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ return (t->dst.protonum == tuple->dst.protonum &&
+ nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
+ t->src.u.all == tuple->src.u.all);
+}
+
+/* Only called for SRC manip */
+static int
+find_appropriate_src(struct net *net, u16 zone,
+ const struct nf_nat_l3proto *l3proto,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple *result,
+ const struct nf_nat_range *range)
+{
+ unsigned int h = hash_by_src(net, zone, tuple);
+ const struct nf_conn_nat *nat;
+ const struct nf_conn *ct;
+
+ hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
+ ct = nat->ct;
+ if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+ /* Copy source part from reply tuple. */
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ if (in_range(l3proto, l4proto, result, range))
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ * 1-65535, we don't do pro-rata allocation based on ports; we choose
+ * the ip with the lowest src-ip/dst-ip/proto usage.
+ */
+static void
+find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ const struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ union nf_inet_addr *var_ipp;
+ unsigned int i, max;
+ /* Host order */
+ u32 minip, maxip, j, dist;
+ bool full_range;
+
+ /* No IP mapping? Do nothing. */
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ var_ipp = &tuple->src.u3;
+ else
+ var_ipp = &tuple->dst.u3;
+
+ /* Fast path: only one choice. */
+ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
+ *var_ipp = range->min_addr;
+ return;
+ }
+
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
+ else
+ max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;
+
+ /* Hashing source and destination IPs gives a fairly even
+ * spread in practice (if there are a small number of IPs
+ * involved, there usually aren't that many connections
+ * anyway). The consistency means that servers see the same
+ * client coming from the same IP (some Internet Banking sites
+ * like this), even across reboots.
+ */
+ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
+ range->flags & NF_NAT_RANGE_PERSISTENT ?
+ 0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
+
+ full_range = false;
+ for (i = 0; i <= max; i++) {
+ /* If first bytes of the address are at the maximum, use the
+ * distance. Otherwise use the full range.
+ */
+ if (!full_range) {
+ minip = ntohl((__force __be32)range->min_addr.all[i]);
+ maxip = ntohl((__force __be32)range->max_addr.all[i]);
+ dist = maxip - minip + 1;
+ } else {
+ minip = 0;
+ dist = ~0;
+ }
+
+ var_ipp->all[i] = (__force __u32)
+ htonl(minip + reciprocal_scale(j, dist));
+ if (var_ipp->all[i] != range->max_addr.all[i])
+ full_range = true;
+
+ if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
+ j ^= (__force u32)tuple->dst.u3.all[i];
+ }
+}
+
+/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
+ * we change the source to map into the range. For NF_INET_PRE_ROUTING
+ * and NF_INET_LOCAL_OUT, we change the destination to map into the
+ * range. It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig_tuple,
+ const struct nf_nat_range *range,
+ struct nf_conn *ct,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ u16 zone = nf_ct_zone(ct);
+
+ rcu_read_lock();
+ l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
+ l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
+ orig_tuple->dst.protonum);
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ * and that same mapping gives a unique tuple within the given
+ * range, use that.
+ *
+ * This is only required for source (ie. NAT/masq) mappings.
+ * So far, we don't do local source mappings, so multiple
+ * manips not an issue.
+ */
+ if (maniptype == NF_NAT_MANIP_SRC &&
+ !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ /* try the original tuple first */
+ if (in_range(l3proto, l4proto, orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ goto out;
+ }
+ } else if (find_appropriate_src(net, zone, l3proto, l4proto,
+ orig_tuple, tuple, range)) {
+ pr_debug("get_unique_tuple: Found current src map\n");
+ if (!nf_nat_used_tuple(tuple, ct))
+ goto out;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given range */
+ *tuple = *orig_tuple;
+ find_best_ips_proto(zone, tuple, range, ct, maniptype);
+
+ /* 3) The per-protocol part of the manip is made to map into
+ * the range to make a unique tuple.
+ */
+
+ /* Only bother mapping if it's not already in range and unique */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
+ if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
+ if (l4proto->in_range(tuple, maniptype,
+ &range->min_proto,
+ &range->max_proto) &&
+ (range->min_proto.all == range->max_proto.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
+
+ /* Last change: get protocol to try to obtain unique tuple. */
+ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
+out:
+ rcu_read_unlock();
+}
+
+struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+ struct nf_conn_nat *nat = nfct_nat(ct);
+ if (nat)
+ return nat;
+
+ if (!nf_ct_is_confirmed(ct))
+ nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+ return nat;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
+
+unsigned int
+nf_nat_setup_info(struct nf_conn *ct,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_tuple curr_tuple, new_tuple;
+ struct nf_conn_nat *nat;
+
+ /* nat helper or nfctnetlink also setup binding */
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat == NULL)
+ return NF_ACCEPT;
+
+ NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
+ maniptype == NF_NAT_MANIP_DST);
+ BUG_ON(nf_nat_initialized(ct, maniptype));
+
+ /* What we've got will look like inverse of reply. Normally
+ * this is what is in the conntrack, except for prior
+ * manipulations (future optimization: if num_manips == 0,
+ * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+ */
+ nf_ct_invert_tuplepr(&curr_tuple,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+
+ if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+ struct nf_conntrack_tuple reply;
+
+ /* Alter conntrack table so will recognize replies. */
+ nf_ct_invert_tuplepr(&reply, &new_tuple);
+ nf_conntrack_alter_reply(ct, &reply);
+
+ /* Non-atomic: we own this at the moment. */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ct->status |= IPS_SRC_NAT;
+ else
+ ct->status |= IPS_DST_NAT;
+
+ if (nfct_help(ct))
+ nfct_seqadj_ext_add(ct);
+ }
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ unsigned int srchash;
+
+ srchash = hash_by_src(net, nf_ct_zone(ct),
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension aera */
+ nat = nfct_nat(ct);
+ nat->ct = ct;
+ hlist_add_head_rcu(&nat->bysource,
+ &net->ct.nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
+ }
+
+ /* It's done. */
+ if (maniptype == NF_NAT_MANIP_DST)
+ ct->status |= IPS_DST_NAT_DONE;
+ else
+ ct->status |= IPS_SRC_NAT_DONE;
+
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL(nf_nat_setup_info);
+
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
+{
+ /* Force range to this IP; let proto decide mapping for
+ * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+ * Use reply in case it's already been mangled (eg local packet).
+ */
+ union nf_inet_addr ip =
+ (manip == NF_NAT_MANIP_SRC ?
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
+ struct nf_nat_range range = {
+ .flags = NF_NAT_RANGE_MAP_IPS,
+ .min_addr = ip,
+ .max_addr = ip,
+ };
+ return nf_nat_setup_info(ct, &range, manip);
+}
+
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+ return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
+}
+EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
+
+/* Do packet manipulations according to nf_nat_setup_info. */
+unsigned int nf_nat_packet(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ const struct nf_nat_l3proto *l3proto;
+ const struct nf_nat_l4proto *l4proto;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ unsigned long statusbit;
+ enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+ if (mtype == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply dir. */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ /* Non-atomic: these bits don't change. */
+ if (ct->status & statusbit) {
+ struct nf_conntrack_tuple target;
+
+ /* We are aiming to look like inverse of other direction. */
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+ l3proto = __nf_nat_l3proto_find(target.src.l3num);
+ l4proto = __nf_nat_l4proto_find(target.src.l3num,
+ target.dst.protonum);
+ if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+struct nf_nat_proto_clean {
+ u8 l3proto;
+ u8 l4proto;
+};
+
+/* kill conntracks with affected NAT section */
+static int nf_nat_proto_remove(struct nf_conn *i, void *data)
+{
+ const struct nf_nat_proto_clean *clean = data;
+ struct nf_conn_nat *nat = nfct_nat(i);
+
+ if (!nat)
+ return 0;
+
+ if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
+ (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
+ return 0;
+
+ return i->status & IPS_NAT_MASK ? 1 : 0;
+}
+
+static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
+{
+ struct nf_conn_nat *nat = nfct_nat(ct);
+
+ if (nf_nat_proto_remove(ct, data))
+ return 1;
+
+ if (!nat || !nat->ct)
+ return 0;
+
+ /* This netns is being destroyed, and conntrack has nat null binding.
+ * Remove it from bysource hash, as the table will be freed soon.
+ *
+ * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
+ * will delete entry from already-freed table.
+ */
+ if (!del_timer(&ct->timeout))
+ return 1;
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&nat->bysource);
+ ct->status &= ~IPS_NAT_DONE_MASK;
+ nat->ct = NULL;
+ spin_unlock_bh(&nf_nat_lock);
+
+ add_timer(&ct->timeout);
+
+ /* don't delete conntrack. Although that would make things a lot
+ * simpler, we'd end up flushing all conntracks on nat rmmod.
+ */
+ return 0;
+}
+
+static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
+{
+ struct nf_nat_proto_clean clean = {
+ .l3proto = l3proto,
+ .l4proto = l4proto,
+ };
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+ rtnl_unlock();
+}
+
+static void nf_nat_l3proto_clean(u8 l3proto)
+{
+ struct nf_nat_proto_clean clean = {
+ .l3proto = l3proto,
+ };
+ struct net *net;
+
+ rtnl_lock();
+
+ for_each_net(net)
+ nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
+ rtnl_unlock();
+}
+
+/* Protocol registration. */
+int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ const struct nf_nat_l4proto **l4protos;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ if (nf_nat_l4protos[l3proto] == NULL) {
+ l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
+ GFP_KERNEL);
+ if (l4protos == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < IPPROTO_MAX; i++)
+ RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
+
+ /* Before making proto_array visible to lockless readers,
+ * we must make sure its content is committed to memory.
+ */
+ smp_wmb();
+
+ nf_nat_l4protos[l3proto] = l4protos;
+ }
+
+ if (rcu_dereference_protected(
+ nf_nat_l4protos[l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_nat_proto_mutex)
+ ) != &nf_nat_l4proto_unknown) {
+ ret = -EBUSY;
+ goto out;
+ }
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
+ out:
+ mutex_unlock(&nf_nat_proto_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
+
+/* No one stores the protocol anywhere; simply delete it. */
+void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
+ &nf_nat_l4proto_unknown);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
+
+int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
+{
+ int err;
+
+ err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
+ if (err < 0)
+ return err;
+
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
+ &nf_nat_l4proto_tcp);
+ RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
+ &nf_nat_l4proto_udp);
+ mutex_unlock(&nf_nat_proto_mutex);
+
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
+
+void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
+{
+ mutex_lock(&nf_nat_proto_mutex);
+ RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
+ mutex_unlock(&nf_nat_proto_mutex);
+ synchronize_rcu();
+
+ nf_nat_l3proto_clean(l3proto->l3proto);
+ nf_ct_l3proto_module_put(l3proto->l3proto);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
+
+/* No one using conntrack by the time this called. */
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
+
+ if (nat == NULL || nat->ct == NULL)
+ return;
+
+ NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&nat->bysource);
+ spin_unlock_bh(&nf_nat_lock);
+}
+
+static void nf_nat_move_storage(void *new, void *old)
+{
+ struct nf_conn_nat *new_nat = new;
+ struct nf_conn_nat *old_nat = old;
+ struct nf_conn *ct = old_nat->ct;
+
+ if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
+ return;
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
+ spin_unlock_bh(&nf_nat_lock);
+}
+
+static struct nf_ct_ext_type nat_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_nat),
+ .align = __alignof__(struct nf_conn_nat),
+ .destroy = nf_nat_cleanup_conntrack,
+ .move = nf_nat_move_storage,
+ .id = NF_CT_EXT_NAT,
+ .flags = NF_CT_EXT_F_PREALLOC,
+};
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
+ [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
+ [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
+};
+
+static int nfnetlink_parse_nat_proto(struct nlattr *attr,
+ const struct nf_conn *ct,
+ struct nf_nat_range *range)
+{
+ struct nlattr *tb[CTA_PROTONAT_MAX+1];
+ const struct nf_nat_l4proto *l4proto;
+ int err;
+
+ err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
+ if (err < 0)
+ return err;
+
+ l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ if (l4proto->nlattr_to_range)
+ err = l4proto->nlattr_to_range(tb, range);
+
+ return err;
+}
+
+static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
+ [CTA_NAT_V4_MINIP] = { .type = NLA_U32 },
+ [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 },
+ [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_PROTO] = { .type = NLA_NESTED },
+};
+
+static int
+nfnetlink_parse_nat(const struct nlattr *nat,
+ const struct nf_conn *ct, struct nf_nat_range *range,
+ const struct nf_nat_l3proto *l3proto)
+{
+ struct nlattr *tb[CTA_NAT_MAX+1];
+ int err;
+
+ memset(range, 0, sizeof(*range));
+
+ err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
+ if (err < 0)
+ return err;
+
+ err = l3proto->nlattr_to_range(tb, range);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_NAT_PROTO])
+ return 0;
+
+ return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
+}
+
+/* This function is called under rcu_read_lock() */
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ struct nf_nat_range range;
+ const struct nf_nat_l3proto *l3proto;
+ int err;
+
+ /* Should not happen, restricted to creating new conntracks
+ * via ctnetlink.
+ */
+ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
+ return -EEXIST;
+
+ /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to
+ * attach the null binding, otherwise this may oops.
+ */
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ if (l3proto == NULL)
+ return -EAGAIN;
+
+ /* No NAT information has been passed, allocate the null-binding */
+ if (attr == NULL)
+ return __nf_nat_alloc_null_binding(ct, manip);
+
+ err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
+ if (err < 0)
+ return err;
+
+ return nf_nat_setup_info(ct, &range, manip);
+}
+#else
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+ enum nf_nat_manip_type manip,
+ const struct nlattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static int __net_init nf_nat_net_init(struct net *net)
+{
+ /* Leave them the same for the moment. */
+ net->ct.nat_htable_size = net->ct.htable_size;
+ net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
+ if (!net->ct.nat_bysource)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit nf_nat_net_exit(struct net *net)
+{
+ struct nf_nat_proto_clean clean = {};
+
+ nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
+ synchronize_rcu();
+ nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
+}
+
+static struct pernet_operations nf_nat_net_ops = {
+ .init = nf_nat_net_init,
+ .exit = nf_nat_net_exit,
+};
+
+static struct nf_ct_helper_expectfn follow_master_nat = {
+ .name = "nat-follow-master",
+ .expectfn = nf_nat_follow_master,
+};
+
+static int __init nf_nat_init(void)
+{
+ int ret;
+
+ ret = nf_ct_extend_register(&nat_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+ return ret;
+ }
+
+ ret = register_pernet_subsys(&nf_nat_net_ops);
+ if (ret < 0)
+ goto cleanup_extend;
+
+ nf_ct_helper_expectfn_register(&follow_master_nat);
+
+ /* Initialize fake conntrack so that NAT will skip it */
+ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
+
+ BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
+ RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
+ nfnetlink_parse_nat_setup);
+#ifdef CONFIG_XFRM
+ BUG_ON(nf_nat_decode_session_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
+#endif
+ return 0;
+
+ cleanup_extend:
+ nf_ct_extend_unregister(&nat_extend);
+ return ret;
+}
+
+static void __exit nf_nat_cleanup(void)
+{
+ unsigned int i;
+
+ unregister_pernet_subsys(&nf_nat_net_ops);
+ nf_ct_extend_unregister(&nat_extend);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
+#ifdef CONFIG_XFRM
+ RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
+#endif
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ kfree(nf_nat_l4protos[i]);
+ synchronize_net();
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(nf_nat_init);
+module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
new file mode 100644
index 000000000..e84a578db
--- /dev/null
+++ b/net/netfilter/nf_nat_ftp.c
@@ -0,0 +1,146 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+MODULE_ALIAS("ip_nat_ftp");
+
+/* FIXME: Time out? --RR */
+
+static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
+ char *buffer, size_t buflen,
+ union nf_inet_addr *addr, u16 port)
+{
+ switch (type) {
+ case NF_CT_FTP_PORT:
+ case NF_CT_FTP_PASV:
+ return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&addr->ip)[0],
+ ((unsigned char *)&addr->ip)[1],
+ ((unsigned char *)&addr->ip)[2],
+ ((unsigned char *)&addr->ip)[3],
+ port >> 8,
+ port & 0xFF);
+ case NF_CT_FTP_EPRT:
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return snprintf(buffer, buflen, "|1|%pI4|%u|",
+ &addr->ip, port);
+ else
+ return snprintf(buffer, buflen, "|2|%pI6|%u|",
+ &addr->ip6, port);
+ case NF_CT_FTP_EPSV:
+ return snprintf(buffer, buflen, "|||%u|", port);
+ }
+
+ return 0;
+}
+
+/* So, this packet has hit the connection tracking matching code.
+ Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_ftp(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ enum nf_ct_ftp_type type,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_conn *ct = exp->master;
+ char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+ /* Connection will come from wherever this packet goes, hence !dir */
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = !dir;
+
+ /* When you see the packet, we need to NAT it the same as the
+ * this one. */
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use");
+ return NF_DROP;
+ }
+
+ buflen = nf_nat_ftp_fmt_cmd(ct, type, buffer, sizeof(buffer),
+ &newaddr, port);
+ if (!buflen)
+ goto out;
+
+ pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+ matchlen, buffer, buflen))
+ goto out;
+
+ return NF_ACCEPT;
+
+out:
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
+}
+
+static void __exit nf_nat_ftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_ftp_init(void)
+{
+ BUG_ON(nf_nat_ftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_ftp_init);
+module_exit(nf_nat_ftp_fini);
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
new file mode 100644
index 000000000..2840abb5b
--- /dev/null
+++ b/net/netfilter/nf_nat_helper.c
@@ -0,0 +1,212 @@
+/* nf_nat_helper.c - generic support functions for NAT helpers
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+/* Frobs data inside this packet, which is linear. */
+static void mangle_contents(struct sk_buff *skb,
+ unsigned int dataoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ unsigned char *data;
+
+ BUG_ON(skb_is_nonlinear(skb));
+ data = skb_network_header(skb) + dataoff;
+
+ /* move post-replacement */
+ memmove(data + match_offset + rep_len,
+ data + match_offset + match_len,
+ skb_tail_pointer(skb) - (skb_network_header(skb) + dataoff +
+ match_offset + match_len));
+
+ /* insert data from buffer */
+ memcpy(data + match_offset, rep_buffer, rep_len);
+
+ /* update skb info */
+ if (rep_len > match_len) {
+ pr_debug("nf_nat_mangle_packet: Extending packet by "
+ "%u from %u bytes\n", rep_len - match_len, skb->len);
+ skb_put(skb, rep_len - match_len);
+ } else {
+ pr_debug("nf_nat_mangle_packet: Shrinking packet from "
+ "%u from %u bytes\n", match_len - rep_len, skb->len);
+ __skb_trim(skb, skb->len + rep_len - match_len);
+ }
+
+ if (nf_ct_l3num((struct nf_conn *)skb->nfct) == NFPROTO_IPV4) {
+ /* fix IP hdr checksum information */
+ ip_hdr(skb)->tot_len = htons(skb->len);
+ ip_send_check(ip_hdr(skb));
+ } else
+ ipv6_hdr(skb)->payload_len =
+ htons(skb->len - sizeof(struct ipv6hdr));
+}
+
+/* Unusual, but possible case. */
+static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
+{
+ if (skb->len + extra > 65535)
+ return 0;
+
+ if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
+ return 0;
+
+ return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * */
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len, bool adjust)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct tcphdr *tcph;
+ int oldlen, datalen;
+
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ SKB_LINEAR_ASSERT(skb);
+
+ tcph = (void *)skb->data + protoff;
+
+ oldlen = skb->len - protoff;
+ mangle_contents(skb, protoff + tcph->doff*4,
+ match_offset, match_len, rep_buffer, rep_len);
+
+ datalen = skb->len - protoff;
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check,
+ datalen, oldlen);
+
+ if (adjust && rep_len != match_len)
+ nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
+ (int)rep_len - (int)match_len);
+
+ return 1;
+}
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
+ * should be fairly easy to do.
+ */
+int
+nf_nat_mangle_udp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len)
+{
+ const struct nf_nat_l3proto *l3proto;
+ struct udphdr *udph;
+ int datalen, oldlen;
+
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (rep_len > match_len &&
+ rep_len - match_len > skb_tailroom(skb) &&
+ !enlarge_skb(skb, rep_len - match_len))
+ return 0;
+
+ udph = (void *)skb->data + protoff;
+
+ oldlen = skb->len - protoff;
+ mangle_contents(skb, protoff + sizeof(*udph),
+ match_offset, match_len, rep_buffer, rep_len);
+
+ /* update the length of the UDP packet */
+ datalen = skb->len - protoff;
+ udph->len = htons(datalen);
+
+ /* fix udp checksum if udp checksum was previously calculated */
+ if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
+ return 1;
+
+ l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
+ l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check,
+ datalen, oldlen);
+
+ return 1;
+}
+EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
+
+/* Setup NAT on this expected conntrack so it follows master. */
+/* If we fail to get a free NAT slot, we'll get dropped on confirm */
+void nf_nat_follow_master(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.src.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
new file mode 100644
index 000000000..1fb2258c3
--- /dev/null
+++ b/net/netfilter/nf_nat_irc.c
@@ -0,0 +1,119 @@
+/* IRC extension for TCP NAT alteration.
+ *
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_irc");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ unsigned int protoff,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp)
+{
+ char buffer[sizeof("4294967296 65635")];
+ struct nf_conn *ct = exp->master;
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ unsigned int ret;
+
+ /* Reply comes from server. */
+ newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
+
+ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = nf_nat_follow_master;
+
+ /* Try to get same port: if not, try to change it. */
+ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use");
+ return NF_DROP;
+ }
+
+ /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+ * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+ * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+ * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+ *
+ * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+ * 255.255.255.255==4294967296, 10 digits)
+ * P: bound port (min 1 d, max 5d (65635))
+ * F: filename (min 1 d )
+ * S: size (min 1 d )
+ * 0x01, \n: terminators
+ */
+ /* AAA = "us", ie. where server normally talks to. */
+ snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port);
+ pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+ buffer, &newaddr.ip, port);
+
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
+ matchlen, buffer, strlen(buffer));
+ if (ret != NF_ACCEPT) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ nf_ct_unexpect_related(exp);
+ }
+
+ return ret;
+}
+
+static void __exit nf_nat_irc_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_irc_init(void)
+{
+ BUG_ON(nf_nat_irc_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_irc_hook, help);
+ return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+ return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_irc_init);
+module_exit(nf_nat_irc_fini);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
new file mode 100644
index 000000000..fbce552a7
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -0,0 +1,114 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/netfilter.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ __be16 port;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ port = tuple->src.u.all;
+ else
+ port = tuple->dst.u.all;
+
+ return ntohs(port) >= ntohs(min->all) &&
+ ntohs(port) <= ntohs(max->all);
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
+
+void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct,
+ u16 *rover)
+{
+ unsigned int range_size, min, i;
+ __be16 *portptr;
+ u_int16_t off;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.all;
+ else
+ portptr = &tuple->dst.u.all;
+
+ /* If no range specified... */
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == NF_NAT_MANIP_DST)
+ return;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr) < 512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min_proto.all);
+ range_size = ntohs(range->max_proto.all) - min + 1;
+ }
+
+ if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
+ off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
+ ? tuple->dst.u.all
+ : tuple->src.u.all);
+ } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
+ off = prandom_u32();
+ } else {
+ off = *rover;
+ }
+
+ for (i = 0; ; ++off) {
+ *portptr = htons(min + off % range_size);
+ if (++i != range_size && nf_nat_used_tuple(tuple, ct))
+ continue;
+ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+ *rover = off;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
+
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_PROTONAT_PORT_MIN]) {
+ range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
+ range->max_proto.all = range->min_proto.all;
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ if (tb[CTA_PROTONAT_PORT_MAX]) {
+ range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
+ range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_l4proto_nlattr_to_range);
+#endif
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
new file mode 100644
index 000000000..b8067b53f
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -0,0 +1,116 @@
+/*
+ * DCCP NAT protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t dccp_port_rover;
+
+static void
+dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &dccp_port_rover);
+}
+
+static bool
+dccp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct dccp_hdr *hdr;
+ __be16 *portptr, oldport, newport;
+ int hdrsize = 8; /* DCCP connection tracking guarantees this much */
+
+ if (skb->len >= hdroff + sizeof(struct dccp_hdr))
+ hdrsize = sizeof(struct dccp_hdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct dccp_hdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ newport = tuple->src.u.dccp.port;
+ portptr = &hdr->dccph_sport;
+ } else {
+ newport = tuple->dst.u.dccp.port;
+ portptr = &hdr->dccph_dport;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
+ 0);
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+ .l4proto = IPPROTO_DCCP,
+ .manip_pkt = dccp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = dccp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_dccp_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_dccp_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
+
+}
+
+module_init(nf_nat_proto_dccp_init);
+module_exit(nf_nat_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP NAT protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
new file mode 100644
index 000000000..cbc7ade14
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sctp.h>
+#include <linux/module.h>
+#include <net/sctp/checksum.h>
+
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u_int16_t nf_sctp_port_rover;
+
+static void
+sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &nf_sctp_port_rover);
+}
+
+static bool
+sctp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ sctp_sctphdr_t *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct sctphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ hdr->source = tuple->src.u.sctp.port;
+ } else {
+ /* Get rid of dst port */
+ hdr->dest = tuple->dst.u.sctp.port;
+ }
+
+ hdr->checksum = sctp_compute_cksum(skb, hdroff);
+
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+ .l4proto = IPPROTO_SCTP,
+ .manip_pkt = sctp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = sctp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_sctp_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_sctp_exit(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
+}
+
+module_init(nf_nat_proto_sctp_init);
+module_exit(nf_nat_proto_sctp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SCTP NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
new file mode 100644
index 000000000..37f5505f4
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -0,0 +1,85 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static u16 tcp_port_rover;
+
+static void
+tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &tcp_port_rover);
+}
+
+static bool
+tcp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct tcphdr *hdr;
+ __be16 *portptr, newport, oldport;
+ int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+ /* this could be a inner header returned in icmp packet; in such
+ cases we cannot update the checksum field since it is outside of
+ the 8 bytes of transport layer headers we are guaranteed */
+ if (skb->len >= hdroff + sizeof(struct tcphdr))
+ hdrsize = sizeof(struct tcphdr);
+
+ if (!skb_make_writable(skb, hdroff + hdrsize))
+ return false;
+
+ hdr = (struct tcphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.tcp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.tcp.port;
+ portptr = &hdr->dest;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return true;
+
+ l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_tcp = {
+ .l4proto = IPPROTO_TCP,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = tcp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
new file mode 100644
index 000000000..b0ede2f0d
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -0,0 +1,76 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udp_port_rover;
+
+static void
+udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &udp_port_rover);
+}
+
+static bool
+udp_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ __be16 *portptr, newport;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+ hdr = (struct udphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of src port */
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+ if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ l3proto->csum_update(skb, iphdroff, &hdr->check,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
+ 0);
+ if (!hdr->check)
+ hdr->check = CSUM_MANGLED_0;
+ }
+ *portptr = newport;
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_udp = {
+ .l4proto = IPPROTO_UDP,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = udp_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
new file mode 100644
index 000000000..368f14e01
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -0,0 +1,106 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static u16 udplite_port_rover;
+
+static void
+udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ nf_nat_l4proto_unique_tuple(l3proto, tuple, range, maniptype, ct,
+ &udplite_port_rover);
+}
+
+static bool
+udplite_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct udphdr *hdr;
+ __be16 *portptr, newport;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct udphdr *)(skb->data + hdroff);
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ /* Get rid of source port */
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst port */
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+
+ l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+ if (!hdr->check)
+ hdr->check = CSUM_MANGLED_0;
+
+ *portptr = newport;
+ return true;
+}
+
+static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+ .l4proto = IPPROTO_UDPLITE,
+ .manip_pkt = udplite_manip_pkt,
+ .in_range = nf_nat_l4proto_in_range,
+ .unique_tuple = udplite_unique_tuple,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_udplite_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_proto_udplite_fini(void)
+{
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
+ nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
+}
+
+module_init(nf_nat_proto_udplite_init);
+module_exit(nf_nat_proto_udplite_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
new file mode 100644
index 000000000..6e494d584
--- /dev/null
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -0,0 +1,54 @@
+/* The "unknown" protocol. This is what is used for protocols we
+ * don't understand. It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type manip_type,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return true;
+}
+
+static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ /* Sorry: we can't help you; if it's not unique, we can't frob
+ * anything.
+ */
+ return;
+}
+
+static bool
+unknown_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_unknown = {
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+};
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
new file mode 100644
index 000000000..97b75f9bf
--- /dev/null
+++ b/net/netfilter/nf_nat_redirect.c
@@ -0,0 +1,127 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/addrconf.h>
+#include <net/checksum.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
+
+unsigned int
+nf_nat_redirect_ipv4(struct sk_buff *skb,
+ const struct nf_nat_ipv4_multi_range_compat *mr,
+ unsigned int hooknum)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ __be32 newdst;
+ struct nf_nat_range newrange;
+
+ NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING ||
+ hooknum == NF_INET_LOCAL_OUT);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ /* Local packets: make them go to loopback */
+ if (hooknum == NF_INET_LOCAL_OUT) {
+ newdst = htonl(0x7F000001);
+ } else {
+ struct in_device *indev;
+ struct in_ifaddr *ifa;
+
+ newdst = 0;
+
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ if (indev != NULL) {
+ ifa = indev->ifa_list;
+ newdst = ifa->ifa_local;
+ }
+ rcu_read_unlock();
+
+ if (!newdst)
+ return NF_DROP;
+ }
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newdst;
+ newrange.max_addr.ip = newdst;
+ newrange.min_proto = mr->range[0].min;
+ newrange.max_proto = mr->range[0].max;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
+
+static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
+
+unsigned int
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+ unsigned int hooknum)
+{
+ struct nf_nat_range newrange;
+ struct in6_addr newdst;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (hooknum == NF_INET_LOCAL_OUT) {
+ newdst = loopback_addr;
+ } else {
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ bool addr = false;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(skb->dev);
+ if (idev != NULL) {
+ list_for_each_entry(ifa, &idev->addr_list, if_list) {
+ newdst = ifa->addr;
+ addr = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!addr)
+ return NF_DROP;
+ }
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.in6 = newdst;
+ newrange.max_addr.in6 = newdst;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
new file mode 100644
index 000000000..791fac4fd
--- /dev/null
+++ b/net/netfilter/nf_nat_sip.c
@@ -0,0 +1,653 @@
+/* SIP extension for NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+MODULE_ALIAS("ip_nat_sip");
+
+
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ const char *buffer, unsigned int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct tcphdr *th;
+ unsigned int baseoff;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+ th = (struct tcphdr *)(skb->data + protoff);
+ baseoff = protoff + th->doff * 4;
+ matchoff += dataoff - baseoff;
+
+ if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen, false))
+ return 0;
+ } else {
+ baseoff = protoff + sizeof(struct udphdr);
+ matchoff += dataoff - baseoff;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ protoff, matchoff, matchlen,
+ buffer, buflen))
+ return 0;
+ }
+
+ /* Reload data pointer and adjust datalen value */
+ *dptr = skb->data + dataoff;
+ *datalen += buflen - matchlen;
+ return 1;
+}
+
+static int sip_sprintf_addr(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, bool delim)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4", &addr->ip);
+ else {
+ if (delim)
+ return sprintf(buffer, "[%pI6c]", &addr->ip6);
+ else
+ return sprintf(buffer, "%pI6c", &addr->ip6);
+ }
+}
+
+static int sip_sprintf_addr_port(const struct nf_conn *ct, char *buffer,
+ const union nf_inet_addr *addr, u16 port)
+{
+ if (nf_ct_l3num(ct) == NFPROTO_IPV4)
+ return sprintf(buffer, "%pI4:%u", &addr->ip, port);
+ else
+ return sprintf(buffer, "[%pI6c]:%u", &addr->ip6, port);
+}
+
+static int map_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff, unsigned int matchlen,
+ union nf_inet_addr *addr, __be16 port)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+ union nf_inet_addr newaddr;
+ __be16 newport;
+
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) &&
+ ct->tuplehash[dir].tuple.src.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+ newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+ } else if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, addr) &&
+ ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
+ newaddr = ct->tuplehash[!dir].tuple.src.u3;
+ newport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[!dir].tuple.src.u.udp.port;
+ } else
+ return 1;
+
+ if (nf_inet_addr_cmp(&newaddr, addr) && newport == port)
+ return 1;
+
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, ntohs(newport));
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+static int map_sip_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ enum sip_header_types type)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+ union nf_inet_addr addr;
+ __be16 port;
+
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
+ &matchoff, &matchlen, &addr, &port) <= 0)
+ return 1;
+ return map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port);
+}
+
+static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ unsigned int coff, matchoff, matchlen;
+ enum sip_header_types hdr;
+ union nf_inet_addr addr;
+ __be16 port;
+ int request, in_header;
+
+ /* Basic rules: requests and responses. */
+ if (strncasecmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
+ if (ct_sip_parse_request(ct, *dptr, *datalen,
+ &matchoff, &matchlen,
+ &addr, &port) > 0 &&
+ !map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP message");
+ return NF_DROP;
+ }
+ request = 1;
+ } else
+ request = 0;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP)
+ hdr = SIP_HDR_VIA_TCP;
+ else
+ hdr = SIP_HDR_VIA_UDP;
+
+ /* Translate topmost Via header and parameters */
+ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+ hdr, NULL, &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ unsigned int olen, matchend, poff, plen, buflen, n;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+
+ /* We're only interested in headers related to this
+ * connection */
+ if (request) {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.src.u3) ||
+ port != ct->tuplehash[dir].tuple.src.u.udp.port)
+ goto next;
+ } else {
+ if (!nf_inet_addr_cmp(&addr,
+ &ct->tuplehash[dir].tuple.dst.u3) ||
+ port != ct->tuplehash[dir].tuple.dst.u.udp.port)
+ goto next;
+ }
+
+ olen = *datalen;
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle Via header");
+ return NF_DROP;
+ }
+
+ matchend = matchoff + matchlen + *datalen - olen;
+
+ /* The maddr= parameter (RFC 2361) specifies where to send
+ * the reply. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "maddr=", &poff, &plen,
+ &addr, true) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ true);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle maddr");
+ return NF_DROP;
+ }
+ }
+
+ /* The received= parameter (RFC 2361) contains the address
+ * from which the server received the request. */
+ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+ "received=", &poff, &plen,
+ &addr, false) > 0 &&
+ nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.dst.u3) &&
+ !nf_inet_addr_cmp(&addr, &ct->tuplehash[!dir].tuple.src.u3)) {
+ buflen = sip_sprintf_addr(ct, buffer,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ false);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle received");
+ return NF_DROP;
+ }
+ }
+
+ /* The rport= parameter (RFC 3581) contains the port number
+ * from which the server received the request. */
+ if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
+ "rport=", &poff, &plen,
+ &n) > 0 &&
+ htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
+ htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
+ __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
+ buflen = sprintf(buffer, "%u", ntohs(p));
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle rport");
+ return NF_DROP;
+ }
+ }
+ }
+
+next:
+ /* Translate Contact headers */
+ coff = 0;
+ in_header = 0;
+ while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+ SIP_HDR_CONTACT, &in_header,
+ &matchoff, &matchlen,
+ &addr, &port) > 0) {
+ if (!map_addr(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen,
+ &addr, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle contact");
+ return NF_DROP;
+ }
+ }
+
+ if (!map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+ !map_sip_addr(skb, protoff, dataoff, dptr, datalen, SIP_HDR_TO)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SIP from/to");
+ return NF_DROP;
+ }
+
+ /* Mangle destination port for Cisco phones, then fix up checksums */
+ if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
+ struct udphdr *uh;
+
+ if (!skb_make_writable(skb, skb->len)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+
+ uh = (void *)skb->data + protoff;
+ uh->dest = ct_sip_info->forced_dport;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, protoff,
+ 0, 0, NULL, 0)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ return NF_DROP;
+ }
+ }
+
+ return NF_ACCEPT;
+}
+
+static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
+ s16 off)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+ return;
+
+ th = (struct tcphdr *)(skb->data + protoff);
+ nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+
+/* Handles expected signalling connections and media streams */
+static void nf_nat_sip_expected(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr = exp->saved_addr;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+
+ /* Change src to where master sends to, but only if the connection
+ * actually came from the same source. */
+ if (nf_inet_addr_cmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+ &ct->master->tuplehash[exp->dir].tuple.src.u3)) {
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr
+ = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+ }
+}
+
+static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *exp,
+ unsigned int matchoff,
+ unsigned int matchlen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ union nf_inet_addr newaddr;
+ u_int16_t port;
+ __be16 srcport;
+ char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
+ unsigned int buflen;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ newaddr = exp->tuple.dst.u3;
+ else
+ newaddr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ /* If the signalling port matches the connection's source port in the
+ * original direction, try to use the destination port in the opposite
+ * direction. */
+ srcport = ct_sip_info->forced_dport ? :
+ ct->tuplehash[dir].tuple.src.u.udp.port;
+ if (exp->tuple.dst.u.udp.port == srcport)
+ port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
+ else
+ port = ntohs(exp->tuple.dst.u.udp.port);
+
+ exp->saved_addr = exp->tuple.dst.u3;
+ exp->tuple.dst.u3 = newaddr;
+ exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+ exp->dir = !dir;
+ exp->expectfn = nf_nat_sip_expected;
+
+ for (; port != 0; port++) {
+ int ret;
+
+ exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SIP");
+ return NF_DROP;
+ }
+
+ if (!nf_inet_addr_cmp(&exp->tuple.dst.u3, &exp->saved_addr) ||
+ exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
+ buflen = sip_sprintf_addr_port(ct, buffer, &newaddr, port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle packet");
+ goto err;
+ }
+ }
+ return NF_ACCEPT;
+
+err:
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
+}
+
+static int mangle_content_len(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchoff, matchlen;
+ char buffer[sizeof("65536")];
+ int buflen, c_len;
+
+ /* Get actual SDP length */
+ if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+ SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+ c_len = *datalen - matchoff + strlen("v=");
+
+ /* Now, update SDP length */
+ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
+ &matchoff, &matchlen) <= 0)
+ return 0;
+
+ buflen = sprintf(buffer, "%u", c_len);
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen);
+}
+
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ char *buffer, int buflen)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ unsigned int matchlen, matchoff;
+
+ if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
+ &matchoff, &matchlen) <= 0)
+ return -ENOENT;
+ return mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL;
+}
+
+static unsigned int nf_nat_sdp_addr(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ enum sdp_header_types type,
+ enum sdp_header_types term,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen,
+ sdpoff, type, term, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+static unsigned int nf_nat_sdp_port(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ u_int16_t port)
+{
+ char buffer[sizeof("nnnnn")];
+ unsigned int buflen;
+
+ buflen = sprintf(buffer, "%u", port);
+ if (!mangle_packet(skb, protoff, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen))
+ return 0;
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+static unsigned int nf_nat_sdp_session(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
+ const union nf_inet_addr *addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ char buffer[INET6_ADDRSTRLEN];
+ unsigned int buflen;
+
+ /* Mangle session description owner and contact addresses */
+ buflen = sip_sprintf_addr(ct, buffer, addr, false);
+ if (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_OWNER, SDP_HDR_MEDIA, buffer, buflen))
+ return 0;
+
+ switch (mangle_sdp_packet(skb, protoff, dataoff, dptr, datalen, sdpoff,
+ SDP_HDR_CONNECTION, SDP_HDR_MEDIA,
+ buffer, buflen)) {
+ case 0:
+ /*
+ * RFC 2327:
+ *
+ * Session description
+ *
+ * c=* (connection information - not required if included in all media)
+ */
+ case -ENOENT:
+ break;
+ default:
+ return 0;
+ }
+
+ return mangle_content_len(skb, protoff, dataoff, dptr, datalen);
+}
+
+/* So, this packet has hit the connection tracking matching code.
+ Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
+ unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ struct nf_conntrack_expect *rtp_exp,
+ struct nf_conntrack_expect *rtcp_exp,
+ unsigned int mediaoff,
+ unsigned int medialen,
+ union nf_inet_addr *rtp_addr)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ u_int16_t port;
+
+ /* Connection will come from reply */
+ if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3))
+ *rtp_addr = rtp_exp->tuple.dst.u3;
+ else
+ *rtp_addr = ct->tuplehash[!dir].tuple.dst.u3;
+
+ rtp_exp->saved_addr = rtp_exp->tuple.dst.u3;
+ rtp_exp->tuple.dst.u3 = *rtp_addr;
+ rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+ rtp_exp->dir = !dir;
+ rtp_exp->expectfn = nf_nat_sip_expected;
+
+ rtcp_exp->saved_addr = rtcp_exp->tuple.dst.u3;
+ rtcp_exp->tuple.dst.u3 = *rtp_addr;
+ rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+ rtcp_exp->dir = !dir;
+ rtcp_exp->expectfn = nf_nat_sip_expected;
+
+ /* Try to get same pair of ports: if not, try to change them. */
+ for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+ port != 0; port += 2) {
+ int ret;
+
+ rtp_exp->tuple.dst.u.udp.port = htons(port);
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == -EBUSY)
+ continue;
+ else if (ret < 0) {
+ port = 0;
+ break;
+ }
+ rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret == -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ continue;
+ } else if (ret < 0) {
+ nf_ct_unexpect_related(rtp_exp);
+ port = 0;
+ break;
+ }
+ }
+
+ if (port == 0) {
+ nf_ct_helper_log(skb, ct, "all ports in use for SDP media");
+ goto err1;
+ }
+
+ /* Update media port. */
+ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
+ !nf_nat_sdp_port(skb, protoff, dataoff, dptr, datalen,
+ mediaoff, medialen, port)) {
+ nf_ct_helper_log(skb, ct, "cannot mangle SDP message");
+ goto err2;
+ }
+
+ return NF_ACCEPT;
+
+err2:
+ nf_ct_unexpect_related(rtp_exp);
+ nf_ct_unexpect_related(rtcp_exp);
+err1:
+ return NF_DROP;
+}
+
+static struct nf_ct_helper_expectfn sip_nat = {
+ .name = "sip",
+ .expectfn = nf_nat_sip_expected,
+};
+
+static void __exit nf_nat_sip_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_sip_hooks, NULL);
+
+ nf_ct_helper_expectfn_unregister(&sip_nat);
+ synchronize_rcu();
+}
+
+static const struct nf_nat_sip_hooks sip_hooks = {
+ .msg = nf_nat_sip,
+ .seq_adjust = nf_nat_sip_seq_adjust,
+ .expect = nf_nat_sip_expect,
+ .sdp_addr = nf_nat_sdp_addr,
+ .sdp_port = nf_nat_sdp_port,
+ .sdp_session = nf_nat_sdp_session,
+ .sdp_media = nf_nat_sdp_media,
+};
+
+static int __init nf_nat_sip_init(void)
+{
+ BUG_ON(nf_nat_sip_hooks != NULL);
+ RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks);
+ nf_ct_helper_expectfn_register(&sip_nat);
+ return 0;
+}
+
+module_init(nf_nat_sip_init);
+module_exit(nf_nat_sip_fini);
diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c
new file mode 100644
index 000000000..7f67e1d53
--- /dev/null
+++ b/net/netfilter/nf_nat_tftp.c
@@ -0,0 +1,52 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_tftp");
+
+static unsigned int help(struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_expect *exp)
+{
+ const struct nf_conn *ct = exp->master;
+
+ exp->saved_proto.udp.port
+ = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ exp->dir = IP_CT_DIR_REPLY;
+ exp->expectfn = nf_nat_follow_master;
+ if (nf_ct_expect_related(exp) != 0) {
+ nf_ct_helper_log(skb, exp->master, "cannot add expectation");
+ return NF_DROP;
+ }
+ return NF_ACCEPT;
+}
+
+static void __exit nf_nat_tftp_fini(void)
+{
+ RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
+ synchronize_rcu();
+}
+
+static int __init nf_nat_tftp_init(void)
+{
+ BUG_ON(nf_nat_tftp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_tftp_hook, help);
+ return 0;
+}
+
+module_init(nf_nat_tftp_init);
+module_exit(nf_nat_tftp_fini);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 000000000..2e88032cd
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,230 @@
+/*
+ * Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
+
+#include "nf_internals.h"
+
+/*
+ * Hook for nfnetlink_queue to register its queue handler.
+ * We do this so that most of the NFQUEUE code can be modular.
+ *
+ * Once the queue is registered it must reinject all packets it
+ * receives, no matter what.
+ */
+static const struct nf_queue_handler __rcu *queue_handler __read_mostly;
+
+/* return EBUSY when somebody else is registered, return EEXIST if the
+ * same handler is registered, return 0 in case of success. */
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
+{
+ /* should never happen, we only have one queueing backend in kernel */
+ WARN_ON(rcu_access_pointer(queue_handler));
+ rcu_assign_pointer(queue_handler, qh);
+}
+EXPORT_SYMBOL(nf_register_queue_handler);
+
+/* The caller must flush their queue before this */
+void nf_unregister_queue_handler(void)
+{
+ RCU_INIT_POINTER(queue_handler, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_unregister_queue_handler);
+
+void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+{
+ struct nf_hook_state *state = &entry->state;
+
+ /* Release those devices we held, or Alexey will kill me. */
+ if (state->in)
+ dev_put(state->in);
+ if (state->out)
+ dev_put(state->out);
+ if (state->sk)
+ sock_put(state->sk);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->skb->nf_bridge) {
+ struct net_device *physdev;
+
+ physdev = nf_bridge_get_physindev(entry->skb);
+ if (physdev)
+ dev_put(physdev);
+ physdev = nf_bridge_get_physoutdev(entry->skb);
+ if (physdev)
+ dev_put(physdev);
+ }
+#endif
+ /* Drop reference to owner of hook which queued us. */
+ module_put(entry->elem->owner);
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+
+/* Bump dev refs so they don't vanish while packet is out */
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+{
+ struct nf_hook_state *state = &entry->state;
+
+ if (!try_module_get(entry->elem->owner))
+ return false;
+
+ if (state->in)
+ dev_hold(state->in);
+ if (state->out)
+ dev_hold(state->out);
+ if (state->sk)
+ sock_hold(state->sk);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->skb->nf_bridge) {
+ struct net_device *physdev;
+
+ physdev = nf_bridge_get_physindev(entry->skb);
+ if (physdev)
+ dev_hold(physdev);
+ physdev = nf_bridge_get_physoutdev(entry->skb);
+ if (physdev)
+ dev_hold(physdev);
+ }
+#endif
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
+
+/*
+ * Any packet that leaves via this function must come back
+ * through nf_reinject().
+ */
+int nf_queue(struct sk_buff *skb,
+ struct nf_hook_ops *elem,
+ struct nf_hook_state *state,
+ unsigned int queuenum)
+{
+ int status = -ENOENT;
+ struct nf_queue_entry *entry = NULL;
+ const struct nf_afinfo *afinfo;
+ const struct nf_queue_handler *qh;
+
+ /* QUEUE == DROP if no one is waiting, to be safe. */
+ rcu_read_lock();
+
+ qh = rcu_dereference(queue_handler);
+ if (!qh) {
+ status = -ESRCH;
+ goto err_unlock;
+ }
+
+ afinfo = nf_get_afinfo(state->pf);
+ if (!afinfo)
+ goto err_unlock;
+
+ entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
+ if (!entry) {
+ status = -ENOMEM;
+ goto err_unlock;
+ }
+
+ *entry = (struct nf_queue_entry) {
+ .skb = skb,
+ .elem = elem,
+ .state = *state,
+ .size = sizeof(*entry) + afinfo->route_key_size,
+ };
+
+ if (!nf_queue_entry_get_refs(entry)) {
+ status = -ECANCELED;
+ goto err_unlock;
+ }
+ skb_dst_force(skb);
+ afinfo->saveroute(skb, entry);
+ status = qh->outfn(entry, queuenum);
+
+ rcu_read_unlock();
+
+ if (status < 0) {
+ nf_queue_entry_release_refs(entry);
+ goto err;
+ }
+
+ return 0;
+
+err_unlock:
+ rcu_read_unlock();
+err:
+ kfree(entry);
+ return status;
+}
+
+void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ struct sk_buff *skb = entry->skb;
+ struct nf_hook_ops *elem = entry->elem;
+ const struct nf_afinfo *afinfo;
+ int err;
+
+ rcu_read_lock();
+
+ nf_queue_entry_release_refs(entry);
+
+ /* Continue traversal iff userspace said ok... */
+ if (verdict == NF_REPEAT) {
+ elem = list_entry(elem->list.prev, struct nf_hook_ops, list);
+ verdict = NF_ACCEPT;
+ }
+
+ if (verdict == NF_ACCEPT) {
+ afinfo = nf_get_afinfo(entry->state.pf);
+ if (!afinfo || afinfo->reroute(skb, entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ entry->state.thresh = INT_MIN;
+
+ if (verdict == NF_ACCEPT) {
+ next_hook:
+ verdict = nf_iterate(&nf_hooks[entry->state.pf][entry->state.hook],
+ skb, &entry->state, &elem);
+ }
+
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_STOP:
+ local_bh_disable();
+ entry->state.okfn(entry->state.sk, skb);
+ local_bh_enable();
+ break;
+ case NF_QUEUE:
+ err = nf_queue(skb, elem, &entry->state,
+ verdict >> NF_VERDICT_QBITS);
+ if (err < 0) {
+ if (err == -ECANCELED)
+ goto next_hook;
+ if (err == -ESRCH &&
+ (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+ goto next_hook;
+ kfree_skb(skb);
+ }
+ break;
+ case NF_STOLEN:
+ break;
+ default:
+ kfree_skb(skb);
+ }
+ rcu_read_unlock();
+ kfree(entry);
+}
+EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 000000000..c68c1e58b
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,165 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* Sockopts only registered and called from user context, so
+ net locking would be overkill. Also, [gs]etsockopt calls may
+ sleep. */
+static DEFINE_MUTEX(nf_sockopt_mutex);
+static LIST_HEAD(nf_sockopts);
+
+/* Do exclusive ranges overlap? */
+static inline int overlap(int min1, int max1, int min2, int max2)
+{
+ return max1 > min2 && min1 < max2;
+}
+
+/* Functions to register sockopt ranges (exclusive). */
+int nf_register_sockopt(struct nf_sockopt_ops *reg)
+{
+ struct nf_sockopt_ops *ops;
+ int ret = 0;
+
+ mutex_lock(&nf_sockopt_mutex);
+ list_for_each_entry(ops, &nf_sockopts, list) {
+ if (ops->pf == reg->pf
+ && (overlap(ops->set_optmin, ops->set_optmax,
+ reg->set_optmin, reg->set_optmax)
+ || overlap(ops->get_optmin, ops->get_optmax,
+ reg->get_optmin, reg->get_optmax))) {
+ NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
+ ops->set_optmin, ops->set_optmax,
+ ops->get_optmin, ops->get_optmax,
+ reg->set_optmin, reg->set_optmax,
+ reg->get_optmin, reg->get_optmax);
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ list_add(&reg->list, &nf_sockopts);
+out:
+ mutex_unlock(&nf_sockopt_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_sockopt);
+
+void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
+{
+ mutex_lock(&nf_sockopt_mutex);
+ list_del(&reg->list);
+ mutex_unlock(&nf_sockopt_mutex);
+}
+EXPORT_SYMBOL(nf_unregister_sockopt);
+
+static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
+ int val, int get)
+{
+ struct nf_sockopt_ops *ops;
+
+ mutex_lock(&nf_sockopt_mutex);
+ list_for_each_entry(ops, &nf_sockopts, list) {
+ if (ops->pf == pf) {
+ if (!try_module_get(ops->owner))
+ goto out_nosup;
+
+ if (get) {
+ if (val >= ops->get_optmin &&
+ val < ops->get_optmax)
+ goto out;
+ } else {
+ if (val >= ops->set_optmin &&
+ val < ops->set_optmax)
+ goto out;
+ }
+ module_put(ops->owner);
+ }
+ }
+out_nosup:
+ ops = ERR_PTR(-ENOPROTOOPT);
+out:
+ mutex_unlock(&nf_sockopt_mutex);
+ return ops;
+}
+
+/* Call get/setsockopt() */
+static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ ops = nf_sockopt_find(sk, pf, val, get);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ if (get)
+ ret = ops->get(sk, val, opt, len);
+ else
+ ret = ops->set(sk, val, opt, *len);
+
+ module_put(ops->owner);
+ return ret;
+}
+
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+ unsigned int len)
+{
+ return nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(nf_setsockopt);
+
+int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+ int *len)
+{
+ return nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(nf_getsockopt);
+
+#ifdef CONFIG_COMPAT
+static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ ops = nf_sockopt_find(sk, pf, val, get);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ if (get) {
+ if (ops->compat_get)
+ ret = ops->compat_get(sk, val, opt, len);
+ else
+ ret = ops->get(sk, val, opt, len);
+ } else {
+ if (ops->compat_set)
+ ret = ops->compat_set(sk, val, opt, *len);
+ else
+ ret = ops->set(sk, val, opt, *len);
+ }
+
+ module_put(ops->owner);
+ return ret;
+}
+
+int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
+ int val, char __user *opt, unsigned int len)
+{
+ return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(compat_nf_setsockopt);
+
+int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
+ int val, char __user *opt, int *len)
+{
+ return compat_nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(compat_nf_getsockopt);
+#endif
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
new file mode 100644
index 000000000..52e20c9a4
--- /dev/null
+++ b/net/netfilter/nf_synproxy_core.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+int synproxy_net_id;
+EXPORT_SYMBOL_GPL(synproxy_net_id);
+
+bool
+synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+ const struct tcphdr *th, struct synproxy_options *opts)
+{
+ int length = (th->doff * 4) - sizeof(*th);
+ u8 buf[40], *ptr;
+
+ ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
+ if (ptr == NULL)
+ return false;
+
+ opts->options = 0;
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return true;
+ case TCPOPT_NOP:
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2)
+ return true;
+ if (opsize > length)
+ return true;
+
+ switch (opcode) {
+ case TCPOPT_MSS:
+ if (opsize == TCPOLEN_MSS) {
+ opts->mss = get_unaligned_be16(ptr);
+ opts->options |= XT_SYNPROXY_OPT_MSS;
+ }
+ break;
+ case TCPOPT_WINDOW:
+ if (opsize == TCPOLEN_WINDOW) {
+ opts->wscale = *ptr;
+ if (opts->wscale > 14)
+ opts->wscale = 14;
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ }
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opsize == TCPOLEN_TIMESTAMP) {
+ opts->tsval = get_unaligned_be32(ptr);
+ opts->tsecr = get_unaligned_be32(ptr + 4);
+ opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ }
+ break;
+ case TCPOPT_SACK_PERM:
+ if (opsize == TCPOLEN_SACK_PERM)
+ opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ break;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_parse_options);
+
+unsigned int synproxy_options_size(const struct synproxy_options *opts)
+{
+ unsigned int size = 0;
+
+ if (opts->options & XT_SYNPROXY_OPT_MSS)
+ size += TCPOLEN_MSS_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ size += TCPOLEN_TSTAMP_ALIGNED;
+ else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ size += TCPOLEN_SACKPERM_ALIGNED;
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ size += TCPOLEN_WSCALE_ALIGNED;
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(synproxy_options_size);
+
+void
+synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
+{
+ __be32 *ptr = (__be32 *)(th + 1);
+ u8 options = opts->options;
+
+ if (options & XT_SYNPROXY_OPT_MSS)
+ *ptr++ = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ opts->mss);
+
+ if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ else
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+
+ *ptr++ = htonl(opts->tsval);
+ *ptr++ = htonl(opts->tsecr);
+ } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
+
+ if (options & XT_SYNPROXY_OPT_WSCALE)
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->wscale);
+}
+EXPORT_SYMBOL_GPL(synproxy_build_options);
+
+void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+ struct synproxy_options *opts)
+{
+ opts->tsecr = opts->tsval;
+ opts->tsval = tcp_time_stamp & ~0x3f;
+
+ if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ opts->tsval |= opts->wscale;
+ opts->wscale = info->wscale;
+ } else
+ opts->tsval |= 0xf;
+
+ if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ opts->tsval |= 1 << 4;
+
+ if (opts->options & XT_SYNPROXY_OPT_ECN)
+ opts->tsval |= 1 << 5;
+}
+EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
+
+void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+{
+ opts->wscale = opts->tsecr & 0xf;
+ if (opts->wscale != 0xf)
+ opts->options |= XT_SYNPROXY_OPT_WSCALE;
+
+ opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+
+ opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+}
+EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
+
+unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+ unsigned int protoff,
+ struct tcphdr *th,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
+{
+ unsigned int optoff, optend;
+ u32 *ptr, old;
+
+ if (synproxy->tsoff == 0)
+ return 1;
+
+ optoff = protoff + sizeof(struct tcphdr);
+ optend = protoff + th->doff * 4;
+
+ if (!skb_make_writable(skb, optend))
+ return 0;
+
+ while (optoff < optend) {
+ unsigned char *op = skb->data + optoff;
+
+ switch (op[0]) {
+ case TCPOPT_EOL:
+ return 1;
+ case TCPOPT_NOP:
+ optoff++;
+ continue;
+ default:
+ if (optoff + 1 == optend ||
+ optoff + op[1] > optend ||
+ op[1] < 2)
+ return 0;
+ if (op[0] == TCPOPT_TIMESTAMP &&
+ op[1] == TCPOLEN_TIMESTAMP) {
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ ptr = (u32 *)&op[2];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) -
+ synproxy->tsoff);
+ } else {
+ ptr = (u32 *)&op[6];
+ old = *ptr;
+ *ptr = htonl(ntohl(*ptr) +
+ synproxy->tsoff);
+ }
+ inet_proto_csum_replace4(&th->check, skb,
+ old, *ptr, 0);
+ return 1;
+ }
+ optoff += op[1];
+ }
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
+
+static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_synproxy),
+ .align = __alignof__(struct nf_conn_synproxy),
+ .id = NF_CT_EXT_SYNPROXY,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(snet->stats, cpu);
+ }
+
+ return NULL;
+}
+
+static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ struct synproxy_stats *stats = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries\t\tsyn_received\t"
+ "cookie_invalid\tcookie_valid\t"
+ "cookie_retrans\tconn_reopened\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
+ stats->syn_received,
+ stats->cookie_invalid,
+ stats->cookie_valid,
+ stats->cookie_retrans,
+ stats->conn_reopened);
+
+ return 0;
+}
+
+static const struct seq_operations synproxy_cpu_seq_ops = {
+ .start = synproxy_cpu_seq_start,
+ .next = synproxy_cpu_seq_next,
+ .stop = synproxy_cpu_seq_stop,
+ .show = synproxy_cpu_seq_show,
+};
+
+static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations synproxy_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = synproxy_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+ &synproxy_cpu_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ remove_proc_entry("synproxy", net->proc_net_stat);
+}
+#else
+static int __net_init synproxy_proc_init(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+ return;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init synproxy_net_init(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int err = -ENOMEM;
+
+ memset(&t, 0, sizeof(t));
+ ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+ if (IS_ERR(ct)) {
+ err = PTR_ERR(ct);
+ goto err1;
+ }
+
+ if (!nfct_seqadj_ext_add(ct))
+ goto err2;
+ if (!nfct_synproxy_ext_add(ct))
+ goto err2;
+
+ nf_conntrack_tmpl_insert(net, ct);
+ snet->tmpl = ct;
+
+ snet->stats = alloc_percpu(struct synproxy_stats);
+ if (snet->stats == NULL)
+ goto err2;
+
+ err = synproxy_proc_init(net);
+ if (err < 0)
+ goto err3;
+
+ return 0;
+
+err3:
+ free_percpu(snet->stats);
+err2:
+ nf_conntrack_free(ct);
+err1:
+ return err;
+}
+
+static void __net_exit synproxy_net_exit(struct net *net)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+
+ nf_ct_put(snet->tmpl);
+ synproxy_proc_exit(net);
+ free_percpu(snet->stats);
+}
+
+static struct pernet_operations synproxy_net_ops = {
+ .init = synproxy_net_init,
+ .exit = synproxy_net_exit,
+ .id = &synproxy_net_id,
+ .size = sizeof(struct synproxy_net),
+};
+
+static int __init synproxy_core_init(void)
+{
+ int err;
+
+ err = nf_ct_extend_register(&nf_ct_synproxy_extend);
+ if (err < 0)
+ goto err1;
+
+ err = register_pernet_subsys(&synproxy_net_ops);
+ if (err < 0)
+ goto err2;
+
+ return 0;
+
+err2:
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+err1:
+ return err;
+}
+
+static void __exit synproxy_core_exit(void)
+{
+ unregister_pernet_subsys(&synproxy_net_ops);
+ nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+}
+
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
new file mode 100644
index 000000000..34ded0931
--- /dev/null
+++ b/net/netfilter/nf_tables_api.c
@@ -0,0 +1,4568 @@
+/*
+ * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+static LIST_HEAD(nf_tables_expressions);
+
+/**
+ * nft_register_afinfo - register nf_tables address family info
+ *
+ * @afi: address family info to register
+ *
+ * Register the address family for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+{
+ INIT_LIST_HEAD(&afi->tables);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&afi->list, &net->nft.af_info);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_afinfo);
+
+/**
+ * nft_unregister_afinfo - unregister nf_tables address family info
+ *
+ * @afi: address family info to unregister
+ *
+ * Unregister the address family for use with nf_tables.
+ */
+void nft_unregister_afinfo(struct nft_af_info *afi)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&afi->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
+
+static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
+{
+ struct nft_af_info *afi;
+
+ list_for_each_entry(afi, &net->nft.af_info, list) {
+ if (afi->family == family)
+ return afi;
+ }
+ return NULL;
+}
+
+static struct nft_af_info *
+nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
+{
+ struct nft_af_info *afi;
+
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return afi;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-afinfo-%u", family);
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ afi = nft_afinfo_lookup(net, family);
+ if (afi != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+static void nft_ctx_init(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nft_af_info *afi,
+ struct nft_table *table,
+ struct nft_chain *chain,
+ const struct nlattr * const *nla)
+{
+ ctx->net = sock_net(skb->sk);
+ ctx->afi = afi;
+ ctx->table = table;
+ ctx->chain = chain;
+ ctx->nla = nla;
+ ctx->portid = NETLINK_CB(skb).portid;
+ ctx->report = nlmsg_report(nlh);
+ ctx->seq = nlh->nlmsg_seq;
+}
+
+static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
+ u32 size)
+{
+ struct nft_trans *trans;
+
+ trans = kzalloc(sizeof(struct nft_trans) + size, GFP_KERNEL);
+ if (trans == NULL)
+ return NULL;
+
+ trans->msg_type = msg_type;
+ trans->ctx = *ctx;
+
+ return trans;
+}
+
+static void nft_trans_destroy(struct nft_trans *trans)
+{
+ list_del(&trans->list);
+ kfree(trans);
+}
+
+static void nf_tables_unregister_hooks(const struct nft_table *table,
+ const struct nft_chain *chain,
+ unsigned int hook_nops)
+{
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN)
+ nf_unregister_hooks(nft_base_chain(chain)->ops, hook_nops);
+}
+
+/* Internal table flags */
+#define NFT_TABLE_INACTIVE (1 << 15)
+
+static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWTABLE)
+ ctx->table->flags |= NFT_TABLE_INACTIVE;
+
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+}
+
+static int nft_deltable(struct nft_ctx *ctx)
+{
+ int err;
+
+ err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE);
+ if (err < 0)
+ return err;
+
+ list_del_rcu(&ctx->table->list);
+ return err;
+}
+
+static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWCHAIN)
+ ctx->chain->flags |= NFT_CHAIN_INACTIVE;
+
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+}
+
+static int nft_delchain(struct nft_ctx *ctx)
+{
+ int err;
+
+ err = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN);
+ if (err < 0)
+ return err;
+
+ ctx->table->use--;
+ list_del_rcu(&ctx->chain->list);
+
+ return err;
+}
+
+static inline bool
+nft_rule_is_active(struct net *net, const struct nft_rule *rule)
+{
+ return (rule->genmask & nft_genmask_cur(net)) == 0;
+}
+
+static inline int
+nft_rule_is_active_next(struct net *net, const struct nft_rule *rule)
+{
+ return (rule->genmask & nft_genmask_next(net)) == 0;
+}
+
+static inline void
+nft_rule_activate_next(struct net *net, struct nft_rule *rule)
+{
+ /* Now inactive, will be active in the future */
+ rule->genmask = nft_genmask_cur(net);
+}
+
+static inline void
+nft_rule_deactivate_next(struct net *net, struct nft_rule *rule)
+{
+ rule->genmask = nft_genmask_next(net);
+}
+
+static inline void nft_rule_clear(struct net *net, struct nft_rule *rule)
+{
+ rule->genmask &= ~nft_genmask_next(net);
+}
+
+static int
+nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+ /* You cannot delete the same rule twice */
+ if (nft_rule_is_active_next(ctx->net, rule)) {
+ nft_rule_deactivate_next(ctx->net, rule);
+ ctx->chain->use--;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_rule *rule)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule));
+ if (trans == NULL)
+ return NULL;
+
+ nft_trans_rule(trans) = rule;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return trans;
+}
+
+static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
+{
+ struct nft_trans *trans;
+ int err;
+
+ trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule);
+ if (trans == NULL)
+ return -ENOMEM;
+
+ err = nf_tables_delrule_deactivate(ctx, rule);
+ if (err < 0) {
+ nft_trans_destroy(trans);
+ return err;
+ }
+
+ return 0;
+}
+
+static int nft_delrule_by_chain(struct nft_ctx *ctx)
+{
+ struct nft_rule *rule;
+ int err;
+
+ list_for_each_entry(rule, &ctx->chain->rules, list) {
+ err = nft_delrule(ctx, rule);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+/* Internal set flag */
+#define NFT_SET_INACTIVE (1 << 15)
+
+static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+ nft_trans_set_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
+ set->flags |= NFT_SET_INACTIVE;
+ }
+ nft_trans_set(trans) = set;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+}
+
+static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
+{
+ int err;
+
+ err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set);
+ if (err < 0)
+ return err;
+
+ list_del_rcu(&set->list);
+ ctx->table->use--;
+
+ return err;
+}
+
+/*
+ * Tables
+ */
+
+static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ list_for_each_entry(table, &afi->tables, list) {
+ if (!nla_strcmp(nla, table->name))
+ return table;
+ }
+ return NULL;
+}
+
+static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla)
+{
+ struct nft_table *table;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ table = nft_table_lookup(afi, nla);
+ if (table != NULL)
+ return table;
+
+ return ERR_PTR(-ENOENT);
+}
+
+static inline u64 nf_tables_alloc_handle(struct nft_table *table)
+{
+ return ++table->hgenerator;
+}
+
+static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+static const struct nf_chain_type *
+__nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+ int i;
+
+ for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
+ if (chain_type[family][i] != NULL &&
+ !nla_strcmp(nla, chain_type[family][i]->name))
+ return chain_type[family][i];
+ }
+ return NULL;
+}
+
+static const struct nf_chain_type *
+nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+ const struct nlattr *nla,
+ bool autoload)
+{
+ const struct nf_chain_type *type;
+
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return type;
+#ifdef CONFIG_MODULES
+ if (autoload) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-chain-%u-%.*s", afi->family,
+ nla_len(nla), (const char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ type = __nf_tables_chain_type_lookup(afi->family, nla);
+ if (type != NULL)
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
+ [NFTA_TABLE_NAME] = { .type = NLA_STRING,
+ .len = NFT_TABLE_MAXNAMELEN - 1 },
+ [NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event, u32 flags,
+ int family, const struct nft_table *table)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
+ nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
+ nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_table_notify(const struct nft_ctx *ctx, int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
+ event, 0, ctx->afi->family, ctx->table);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_tables(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_table_info(skb, net,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWTABLE,
+ NLM_F_MULTI,
+ afi->family, table) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_tables,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
+ family, table);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_table_enable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+ int err, i = 0;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err;
+
+ i++;
+ }
+ return 0;
+err:
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ continue;
+
+ if (i-- <= 0)
+ break;
+
+ nf_unregister_hooks(nft_base_chain(chain)->ops, afi->nops);
+ }
+ return err;
+}
+
+static void nf_tables_table_disable(const struct nft_af_info *afi,
+ struct nft_table *table)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->flags & NFT_BASE_CHAIN)
+ nf_unregister_hooks(nft_base_chain(chain)->ops,
+ afi->nops);
+ }
+}
+
+static int nf_tables_updtable(struct nft_ctx *ctx)
+{
+ struct nft_trans *trans;
+ u32 flags;
+ int ret = 0;
+
+ if (!ctx->nla[NFTA_TABLE_FLAGS])
+ return 0;
+
+ flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+
+ if (flags == ctx->table->flags)
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
+ sizeof(struct nft_trans_table));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ if ((flags & NFT_TABLE_F_DORMANT) &&
+ !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_trans_table_enable(trans) = false;
+ } else if (!(flags & NFT_TABLE_F_DORMANT) &&
+ ctx->table->flags & NFT_TABLE_F_DORMANT) {
+ ret = nf_tables_table_enable(ctx->afi, ctx->table);
+ if (ret >= 0) {
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ nft_trans_table_enable(trans) = true;
+ }
+ }
+ if (ret < 0)
+ goto err;
+
+ nft_trans_table_update(trans) = true;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+err:
+ nft_trans_destroy(trans);
+ return ret;
+}
+
+static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nlattr *name;
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ u32 flags = 0;
+ struct nft_ctx ctx;
+ int err;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ name = nla[NFTA_TABLE_NAME];
+ table = nf_tables_table_lookup(afi, name);
+ if (IS_ERR(table)) {
+ if (PTR_ERR(table) != -ENOENT)
+ return PTR_ERR(table);
+ table = NULL;
+ }
+
+ if (table != NULL) {
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ return nf_tables_updtable(&ctx);
+ }
+
+ if (nla[NFTA_TABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
+ if (flags & ~NFT_TABLE_F_DORMANT)
+ return -EINVAL;
+ }
+
+ if (!try_module_get(afi->owner))
+ return -EAFNOSUPPORT;
+
+ err = -ENOMEM;
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (table == NULL)
+ goto err1;
+
+ nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
+ INIT_LIST_HEAD(&table->chains);
+ INIT_LIST_HEAD(&table->sets);
+ table->flags = flags;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&table->list, &afi->tables);
+ return 0;
+err2:
+ kfree(table);
+err1:
+ module_put(afi->owner);
+ return err;
+}
+
+static int nft_flush_table(struct nft_ctx *ctx)
+{
+ int err;
+ struct nft_chain *chain, *nc;
+ struct nft_set *set, *ns;
+
+ list_for_each_entry(chain, &ctx->table->chains, list) {
+ ctx->chain = chain;
+
+ err = nft_delrule_by_chain(ctx);
+ if (err < 0)
+ goto out;
+ }
+
+ list_for_each_entry_safe(set, ns, &ctx->table->sets, list) {
+ if (set->flags & NFT_SET_ANONYMOUS &&
+ !list_empty(&set->bindings))
+ continue;
+
+ err = nft_delset(ctx, set);
+ if (err < 0)
+ goto out;
+ }
+
+ list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
+ ctx->chain = chain;
+
+ err = nft_delchain(ctx);
+ if (err < 0)
+ goto out;
+ }
+
+ err = nft_deltable(ctx);
+out:
+ return err;
+}
+
+static int nft_flush(struct nft_ctx *ctx, int family)
+{
+ struct nft_af_info *afi;
+ struct nft_table *table, *nt;
+ const struct nlattr * const *nla = ctx->nla;
+ int err = 0;
+
+ list_for_each_entry(afi, &ctx->net->nft.af_info, list) {
+ if (family != AF_UNSPEC && afi->family != family)
+ continue;
+
+ ctx->afi = afi;
+ list_for_each_entry_safe(table, nt, &afi->tables, list) {
+ if (nla[NFTA_TABLE_NAME] &&
+ nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
+ continue;
+
+ ctx->table = table;
+
+ err = nft_flush_table(ctx);
+ if (err < 0)
+ goto out;
+ }
+ }
+out:
+ return err;
+}
+
+static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_ctx ctx;
+
+ nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla);
+ if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
+ return nft_flush(&ctx, family);
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ ctx.afi = afi;
+ ctx.table = table;
+
+ return nft_flush_table(&ctx);
+}
+
+static void nf_tables_table_destroy(struct nft_ctx *ctx)
+{
+ BUG_ON(ctx->table->use > 0);
+
+ kfree(ctx->table);
+ module_put(ctx->afi->owner);
+}
+
+int nft_register_chain_type(const struct nf_chain_type *ctype)
+{
+ int err = 0;
+
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (chain_type[ctype->family][ctype->type] != NULL) {
+ err = -EBUSY;
+ goto out;
+ }
+ chain_type[ctype->family][ctype->type] = ctype;
+out:
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
+
+void nft_unregister_chain_type(const struct nf_chain_type *ctype)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ chain_type[ctype->family][ctype->type] = NULL;
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
+
+/*
+ * Chains
+ */
+
+static struct nft_chain *
+nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle)
+{
+ struct nft_chain *chain;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (chain->handle == handle)
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_chain *chain;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nla_strcmp(nla, chain->name))
+ return chain;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
+ [NFTA_CHAIN_TABLE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 },
+ [NFTA_CHAIN_NAME] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_CHAIN_HOOK] = { .type = NLA_NESTED },
+ [NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
+ [NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
+ [NFTA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFTA_HOOK_PRIORITY] = { .type = NLA_U32 },
+};
+
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+ struct nft_stats *cpu_stats, total;
+ struct nlattr *nest;
+ unsigned int seq;
+ u64 pkts, bytes;
+ int cpu;
+
+ memset(&total, 0, sizeof(total));
+ for_each_possible_cpu(cpu) {
+ cpu_stats = per_cpu_ptr(stats, cpu);
+ do {
+ seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ pkts = cpu_stats->pkts;
+ bytes = cpu_stats->bytes;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ total.pkts += pkts;
+ total.bytes += bytes;
+ }
+ nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) ||
+ nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event, u32 flags,
+ int family, const struct nft_table *table,
+ const struct nft_chain *chain)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle)))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
+ goto nla_put_failure;
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain = nft_base_chain(chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
+ htonl(basechain->policy)))
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
+ goto nla_put_failure;
+
+ if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
+ event, 0, ctx->afi->family, ctx->table,
+ ctx->chain);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_chains(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(chain, &table->chains, list) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_chain_info(skb, net,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWCHAIN,
+ NLM_F_MULTI,
+ afi->family, table, chain) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_chains,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
+ family, table, chain);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+ [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
+ [NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
+};
+
+static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
+{
+ struct nlattr *tb[NFTA_COUNTER_MAX+1];
+ struct nft_stats __percpu *newstats;
+ struct nft_stats *stats;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+ return ERR_PTR(-EINVAL);
+
+ newstats = netdev_alloc_pcpu_stats(struct nft_stats);
+ if (newstats == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ /* Restore old counters on this cpu, no problem. Per-cpu statistics
+ * are not exposed to userspace.
+ */
+ preempt_disable();
+ stats = this_cpu_ptr(newstats);
+ stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ preempt_enable();
+
+ return newstats;
+}
+
+static void nft_chain_stats_replace(struct nft_base_chain *chain,
+ struct nft_stats __percpu *newstats)
+{
+ if (newstats == NULL)
+ return;
+
+ if (chain->stats) {
+ struct nft_stats __percpu *oldstats =
+ nft_dereference(chain->stats);
+
+ rcu_assign_pointer(chain->stats, newstats);
+ synchronize_rcu();
+ free_percpu(oldstats);
+ } else
+ rcu_assign_pointer(chain->stats, newstats);
+}
+
+static void nf_tables_chain_destroy(struct nft_chain *chain)
+{
+ BUG_ON(chain->use > 0);
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ module_put(nft_base_chain(chain)->type->owner);
+ free_percpu(nft_base_chain(chain)->stats);
+ kfree(nft_base_chain(chain));
+ } else {
+ kfree(chain);
+ }
+}
+
+static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nlattr * uninitialized_var(name);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_base_chain *basechain = NULL;
+ struct nlattr *ha[NFTA_HOOK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ u8 policy = NF_ACCEPT;
+ u64 handle = 0;
+ unsigned int i;
+ struct nft_stats __percpu *stats;
+ int err;
+ bool create;
+ struct nft_ctx ctx;
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, family, true);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ chain = NULL;
+ name = nla[NFTA_CHAIN_NAME];
+
+ if (nla[NFTA_CHAIN_HANDLE]) {
+ handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+ chain = nf_tables_chain_lookup_byhandle(table, handle);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ } else {
+ chain = nf_tables_chain_lookup(table, name);
+ if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) != -ENOENT)
+ return PTR_ERR(chain);
+ chain = NULL;
+ }
+ }
+
+ if (nla[NFTA_CHAIN_POLICY]) {
+ if ((chain != NULL &&
+ !(chain->flags & NFT_BASE_CHAIN)))
+ return -EOPNOTSUPP;
+
+ if (chain == NULL &&
+ nla[NFTA_CHAIN_HOOK] == NULL)
+ return -EOPNOTSUPP;
+
+ policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
+ switch (policy) {
+ case NF_DROP:
+ case NF_ACCEPT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (chain != NULL) {
+ struct nft_stats *stats = NULL;
+ struct nft_trans *trans;
+
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (nla[NFTA_CHAIN_HANDLE] && name &&
+ !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
+ return -EEXIST;
+
+ if (nla[NFTA_CHAIN_COUNTERS]) {
+ if (!(chain->flags & NFT_BASE_CHAIN))
+ return -EOPNOTSUPP;
+
+ stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+ if (IS_ERR(stats))
+ return PTR_ERR(stats);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
+ sizeof(struct nft_trans_chain));
+ if (trans == NULL) {
+ free_percpu(stats);
+ return -ENOMEM;
+ }
+
+ nft_trans_chain_stats(trans) = stats;
+ nft_trans_chain_update(trans) = true;
+
+ if (nla[NFTA_CHAIN_POLICY])
+ nft_trans_chain_policy(trans) = policy;
+ else
+ nft_trans_chain_policy(trans) = -1;
+
+ if (nla[NFTA_CHAIN_HANDLE] && name) {
+ nla_strlcpy(nft_trans_chain_name(trans), name,
+ NFT_CHAIN_MAXNAMELEN);
+ }
+ list_add_tail(&trans->list, &net->nft.commit_list);
+ return 0;
+ }
+
+ if (table->use == UINT_MAX)
+ return -EOVERFLOW;
+
+ if (nla[NFTA_CHAIN_HOOK]) {
+ const struct nf_chain_type *type;
+ struct nf_hook_ops *ops;
+ nf_hookfn *hookfn;
+ u32 hooknum, priority;
+
+ type = chain_type[family][NFT_CHAIN_T_DEFAULT];
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(afi,
+ nla[NFTA_CHAIN_TYPE],
+ create);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+ }
+
+ err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
+ nft_hook_policy);
+ if (err < 0)
+ return err;
+ if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
+ ha[NFTA_HOOK_PRIORITY] == NULL)
+ return -EINVAL;
+
+ hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hooknum >= afi->nhooks)
+ return -EINVAL;
+ priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+
+ if (!(type->hook_mask & (1 << hooknum)))
+ return -EOPNOTSUPP;
+ if (!try_module_get(type->owner))
+ return -ENOENT;
+ hookfn = type->hooks[hooknum];
+
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ if (basechain == NULL) {
+ module_put(type->owner);
+ return -ENOMEM;
+ }
+
+ if (nla[NFTA_CHAIN_COUNTERS]) {
+ stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
+ if (IS_ERR(stats)) {
+ module_put(type->owner);
+ kfree(basechain);
+ return PTR_ERR(stats);
+ }
+ basechain->stats = stats;
+ } else {
+ stats = netdev_alloc_pcpu_stats(struct nft_stats);
+ if (stats == NULL) {
+ module_put(type->owner);
+ kfree(basechain);
+ return -ENOMEM;
+ }
+ rcu_assign_pointer(basechain->stats, stats);
+ }
+
+ write_pnet(&basechain->pnet, net);
+ basechain->type = type;
+ chain = &basechain->chain;
+
+ for (i = 0; i < afi->nops; i++) {
+ ops = &basechain->ops[i];
+ ops->pf = family;
+ ops->owner = afi->owner;
+ ops->hooknum = hooknum;
+ ops->priority = priority;
+ ops->priv = chain;
+ ops->hook = afi->hooks[ops->hooknum];
+ if (hookfn)
+ ops->hook = hookfn;
+ if (afi->hook_ops_init)
+ afi->hook_ops_init(ops, i);
+ }
+
+ chain->flags |= NFT_BASE_CHAIN;
+ basechain->policy = policy;
+ } else {
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (chain == NULL)
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&chain->rules);
+ chain->handle = nf_tables_alloc_handle(table);
+ chain->table = table;
+ nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ chain->flags & NFT_BASE_CHAIN) {
+ err = nf_register_hooks(nft_base_chain(chain)->ops, afi->nops);
+ if (err < 0)
+ goto err1;
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
+ if (err < 0)
+ goto err2;
+
+ table->use++;
+ list_add_tail_rcu(&chain->list, &table->chains);
+ return 0;
+err2:
+ nf_tables_unregister_hooks(table, chain, afi->nops);
+err1:
+ nf_tables_chain_destroy(chain);
+ return err;
+}
+
+static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ struct nft_ctx ctx;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+ if (chain->use > 0)
+ return -EBUSY;
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+ return nft_delchain(&ctx);
+}
+
+/*
+ * Expressions
+ */
+
+/**
+ * nft_register_expr - register nf_tables expr type
+ * @ops: expr type
+ *
+ * Registers the expr type for use with nf_tables. Returns zero on
+ * success or a negative errno code otherwise.
+ */
+int nft_register_expr(struct nft_expr_type *type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (type->family == NFPROTO_UNSPEC)
+ list_add_tail_rcu(&type->list, &nf_tables_expressions);
+ else
+ list_add_rcu(&type->list, &nf_tables_expressions);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_expr);
+
+/**
+ * nft_unregister_expr - unregister nf_tables expr type
+ * @ops: expr type
+ *
+ * Unregisters the expr typefor use with nf_tables.
+ */
+void nft_unregister_expr(struct nft_expr_type *type)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&type->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_expr);
+
+static const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ list_for_each_entry(type, &nf_tables_expressions, list) {
+ if (!nla_strcmp(nla, type->name) &&
+ (!type->family || type->family == family))
+ return type;
+ }
+ return NULL;
+}
+
+static const struct nft_expr_type *nft_expr_type_get(u8 family,
+ struct nlattr *nla)
+{
+ const struct nft_expr_type *type;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ type = __nft_expr_type_get(family, nla);
+ if (type != NULL && try_module_get(type->owner))
+ return type;
+
+#ifdef CONFIG_MODULES
+ if (type == NULL) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(family, nla))
+ return ERR_PTR(-EAGAIN);
+
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-expr-%.*s",
+ nla_len(nla), (char *)nla_data(nla));
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (__nft_expr_type_get(family, nla))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ return ERR_PTR(-ENOENT);
+}
+
+static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
+ [NFTA_EXPR_NAME] = { .type = NLA_STRING },
+ [NFTA_EXPR_DATA] = { .type = NLA_NESTED },
+};
+
+static int nf_tables_fill_expr_info(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
+ goto nla_put_failure;
+
+ if (expr->ops->dump) {
+ struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+ if (data == NULL)
+ goto nla_put_failure;
+ if (expr->ops->dump(skb, expr) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, data);
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+};
+
+int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
+ const struct nft_expr *expr)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ goto nla_put_failure;
+ if (nf_tables_fill_expr_info(skb, expr) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+ const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ const struct nft_expr_type *type;
+ const struct nft_expr_ops *ops;
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy);
+ if (err < 0)
+ return err;
+
+ type = nft_expr_type_get(ctx->afi->family, tb[NFTA_EXPR_NAME]);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ if (tb[NFTA_EXPR_DATA]) {
+ err = nla_parse_nested(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA], type->policy);
+ if (err < 0)
+ goto err1;
+ } else
+ memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
+
+ if (type->select_ops != NULL) {
+ ops = type->select_ops(ctx,
+ (const struct nlattr * const *)info->tb);
+ if (IS_ERR(ops)) {
+ err = PTR_ERR(ops);
+ goto err1;
+ }
+ } else
+ ops = type->ops;
+
+ info->ops = ops;
+ return 0;
+
+err1:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_newexpr(const struct nft_ctx *ctx,
+ const struct nft_expr_info *info,
+ struct nft_expr *expr)
+{
+ const struct nft_expr_ops *ops = info->ops;
+ int err;
+
+ expr->ops = ops;
+ if (ops->init) {
+ err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ if (err < 0)
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ expr->ops = NULL;
+ return err;
+}
+
+static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{
+ if (expr->ops->destroy)
+ expr->ops->destroy(ctx, expr);
+ module_put(expr->ops->type->owner);
+}
+
+struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
+{
+ struct nft_expr_info info;
+ struct nft_expr *expr;
+ int err;
+
+ err = nf_tables_expr_parse(ctx, nla, &info);
+ if (err < 0)
+ goto err1;
+
+ err = -ENOMEM;
+ expr = kzalloc(info.ops->size, GFP_KERNEL);
+ if (expr == NULL)
+ goto err2;
+
+ err = nf_tables_newexpr(ctx, &info, expr);
+ if (err < 0)
+ goto err2;
+
+ return expr;
+err2:
+ module_put(info.ops->type->owner);
+err1:
+ return ERR_PTR(err);
+}
+
+void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
+{
+ nf_tables_expr_destroy(ctx, expr);
+ kfree(expr);
+}
+
+/*
+ * Rules
+ */
+
+static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
+ u64 handle)
+{
+ struct nft_rule *rule;
+
+ // FIXME: this sucks
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (handle == rule->handle)
+ return rule;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
+ const struct nlattr *nla)
+{
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+}
+
+static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
+ [NFTA_RULE_TABLE] = { .type = NLA_STRING },
+ [NFTA_RULE_CHAIN] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_RULE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
+ [NFTA_RULE_POSITION] = { .type = NLA_U64 },
+ [NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
+};
+
+static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, int event,
+ u32 flags, int family,
+ const struct nft_table *table,
+ const struct nft_chain *chain,
+ const struct nft_rule *rule)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ const struct nft_expr *expr, *next;
+ struct nlattr *list;
+ const struct nft_rule *prule;
+ int type = event | NFNL_SUBSYS_NFTABLES << 8;
+
+ nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle)))
+ goto nla_put_failure;
+
+ if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
+ prule = list_entry(rule->list.prev, struct nft_rule, list);
+ if (nla_put_be64(skb, NFTA_RULE_POSITION,
+ cpu_to_be64(prule->handle)))
+ goto nla_put_failure;
+ }
+
+ list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+ if (list == NULL)
+ goto nla_put_failure;
+ nft_rule_for_each_expr(expr, next, rule) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, list);
+
+ if (rule->udata) {
+ struct nft_userdata *udata = nft_userdata(rule);
+ if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1,
+ udata->data) < 0)
+ goto nla_put_failure;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_rule_notify(const struct nft_ctx *ctx,
+ const struct nft_rule *rule,
+ int event)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
+ event, 0, ctx->afi->family, ctx->table,
+ ctx->chain, rule);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ ctx->report, GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_dump_rules(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ unsigned int idx = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (family != NFPROTO_UNSPEC && family != afi->family)
+ continue;
+
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ list_for_each_entry_rcu(chain, &table->chains, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list) {
+ if (!nft_rule_is_active(net, rule))
+ goto cont;
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
+ if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWRULE,
+ NLM_F_MULTI | NLM_F_APPEND,
+ afi->family, table, chain, rule) < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ }
+ }
+ }
+done:
+ rcu_read_unlock();
+
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_af_info *afi;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ struct sk_buff *skb2;
+ struct net *net = sock_net(skb->sk);
+ int family = nfmsg->nfgen_family;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_rules,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_CHAIN_INACTIVE)
+ return -ENOENT;
+
+ rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ return -ENOMEM;
+
+ err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+ family, table, chain, rule);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
+ struct nft_rule *rule)
+{
+ struct nft_expr *expr;
+
+ /*
+ * Careful: some expressions might not be initialized in case this
+ * is called on error from nf_tables_newrule().
+ */
+ expr = nft_expr_first(rule);
+ while (expr->ops && expr != nft_expr_last(rule)) {
+ nf_tables_expr_destroy(ctx, expr);
+ expr = nft_expr_next(expr);
+ }
+ kfree(rule);
+}
+
+#define NFT_RULE_MAXEXPRS 128
+
+static struct nft_expr_info *info;
+
+static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_rule *rule, *old_rule = NULL;
+ struct nft_userdata *udata;
+ struct nft_trans *trans = NULL;
+ struct nft_expr *expr;
+ struct nft_ctx ctx;
+ struct nlattr *tmp;
+ unsigned int size, i, n, ulen = 0, usize = 0;
+ int err, rem;
+ bool create;
+ u64 handle, pos_handle;
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+
+ if (nla[NFTA_RULE_HANDLE]) {
+ handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
+ rule = __nf_tables_rule_lookup(chain, handle);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ old_rule = rule;
+ else
+ return -EOPNOTSUPP;
+ } else {
+ if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EINVAL;
+ handle = nf_tables_alloc_handle(table);
+
+ if (chain->use == UINT_MAX)
+ return -EOVERFLOW;
+ }
+
+ if (nla[NFTA_RULE_POSITION]) {
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -EOPNOTSUPP;
+
+ pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
+ old_rule = __nf_tables_rule_lookup(chain, pos_handle);
+ if (IS_ERR(old_rule))
+ return PTR_ERR(old_rule);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+ n = 0;
+ size = 0;
+ if (nla[NFTA_RULE_EXPRESSIONS]) {
+ nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
+ err = -EINVAL;
+ if (nla_type(tmp) != NFTA_LIST_ELEM)
+ goto err1;
+ if (n == NFT_RULE_MAXEXPRS)
+ goto err1;
+ err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
+ if (err < 0)
+ goto err1;
+ size += info[n].ops->size;
+ n++;
+ }
+ }
+ /* Check for overflow of dlen field */
+ err = -EFBIG;
+ if (size >= 1 << 12)
+ goto err1;
+
+ if (nla[NFTA_RULE_USERDATA]) {
+ ulen = nla_len(nla[NFTA_RULE_USERDATA]);
+ if (ulen > 0)
+ usize = sizeof(struct nft_userdata) + ulen;
+ }
+
+ err = -ENOMEM;
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ if (rule == NULL)
+ goto err1;
+
+ nft_rule_activate_next(net, rule);
+
+ rule->handle = handle;
+ rule->dlen = size;
+ rule->udata = ulen ? 1 : 0;
+
+ if (ulen) {
+ udata = nft_userdata(rule);
+ udata->len = ulen - 1;
+ nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
+ }
+
+ expr = nft_expr_first(rule);
+ for (i = 0; i < n; i++) {
+ err = nf_tables_newexpr(&ctx, &info[i], expr);
+ if (err < 0)
+ goto err2;
+ info[i].ops = NULL;
+ expr = nft_expr_next(expr);
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (nft_rule_is_active_next(net, old_rule)) {
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE,
+ old_rule);
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+ nft_rule_deactivate_next(net, old_rule);
+ chain->use--;
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ } else {
+ err = -ENOENT;
+ goto err2;
+ }
+ } else if (nlh->nlmsg_flags & NLM_F_APPEND)
+ if (old_rule)
+ list_add_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_tail_rcu(&rule->list, &chain->rules);
+ else {
+ if (old_rule)
+ list_add_tail_rcu(&rule->list, &old_rule->list);
+ else
+ list_add_rcu(&rule->list, &chain->rules);
+ }
+
+ if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ err = -ENOMEM;
+ goto err3;
+ }
+ chain->use++;
+ return 0;
+
+err3:
+ list_del_rcu(&rule->list);
+err2:
+ nf_tables_rule_destroy(&ctx, rule);
+err1:
+ for (i = 0; i < n; i++) {
+ if (info[i].ops != NULL)
+ module_put(info[i].ops->type->owner);
+ }
+ return err;
+}
+
+static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_chain *chain = NULL;
+ struct nft_rule *rule;
+ int family = nfmsg->nfgen_family, err = 0;
+ struct nft_ctx ctx;
+
+ afi = nf_tables_afinfo_lookup(net, family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+
+ if (nla[NFTA_RULE_CHAIN]) {
+ chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ }
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
+ if (chain) {
+ if (nla[NFTA_RULE_HANDLE]) {
+ rule = nf_tables_rule_lookup(chain,
+ nla[NFTA_RULE_HANDLE]);
+ if (IS_ERR(rule))
+ return PTR_ERR(rule);
+
+ err = nft_delrule(&ctx, rule);
+ } else {
+ err = nft_delrule_by_chain(&ctx);
+ }
+ } else {
+ list_for_each_entry(chain, &table->chains, list) {
+ ctx.chain = chain;
+ err = nft_delrule_by_chain(&ctx);
+ if (err < 0)
+ break;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Sets
+ */
+
+static LIST_HEAD(nf_tables_set_ops);
+
+int nft_register_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_add_tail_rcu(&ops->list, &nf_tables_set_ops);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_set);
+
+void nft_unregister_set(struct nft_set_ops *ops)
+{
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_del_rcu(&ops->list);
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_set);
+
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given, in that case the amount of memory per element is used.
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc,
+ enum nft_set_policies policy)
+{
+ const struct nft_set_ops *ops, *bops;
+ struct nft_set_estimate est, best;
+ u32 features;
+
+#ifdef CONFIG_MODULES
+ if (list_empty(&nf_tables_set_ops)) {
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ request_module("nft-set");
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ if (!list_empty(&nf_tables_set_ops))
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ features = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT;
+ }
+
+ bops = NULL;
+ best.size = ~0;
+ best.class = ~0;
+
+ list_for_each_entry(ops, &nf_tables_set_ops, list) {
+ if ((ops->features & features) != features)
+ continue;
+ if (!ops->estimate(desc, features, &est))
+ continue;
+
+ switch (policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ if (est.class < best.class)
+ break;
+ if (est.class == best.class && est.size < best.size)
+ break;
+ continue;
+ case NFT_SET_POL_MEMORY:
+ if (est.size < best.size)
+ break;
+ if (est.size == best.size && est.class < best.class)
+ break;
+ continue;
+ default:
+ break;
+ }
+
+ if (!try_module_get(ops->owner))
+ continue;
+ if (bops != NULL)
+ module_put(bops->owner);
+
+ bops = ops;
+ best = est;
+ }
+
+ if (bops != NULL)
+ return bops;
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
+ [NFTA_SET_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_NAME] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFTA_SET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_POLICY] = { .type = NLA_U32 },
+ [NFTA_SET_DESC] = { .type = NLA_NESTED },
+ [NFTA_SET_ID] = { .type = NLA_U32 },
+ [NFTA_SET_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+ [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi = NULL;
+ struct nft_table *table = NULL;
+
+ if (nfmsg->nfgen_family != NFPROTO_UNSPEC) {
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+ }
+
+ if (nla[NFTA_SET_TABLE] != NULL) {
+ if (afi == NULL)
+ return -EAFNOSUPPORT;
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
+ }
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
+ const struct nlattr *nla)
+{
+ struct nft_set *set;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(set, &table->sets, list) {
+ if (!nla_strcmp(nla, set->name))
+ return set;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
+ const struct nlattr *nla)
+{
+ struct nft_trans *trans;
+ u32 id = ntohl(nla_get_be32(nla));
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ if (trans->msg_type == NFT_MSG_NEWSET &&
+ id == nft_trans_set_id(trans))
+ return nft_trans_set(trans);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+ const char *name)
+{
+ const struct nft_set *i;
+ const char *p;
+ unsigned long *inuse;
+ unsigned int n = 0, min = 0;
+
+ p = strnchr(name, IFNAMSIZ, '%');
+ if (p != NULL) {
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
+ inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (inuse == NULL)
+ return -ENOMEM;
+cont:
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ int tmp;
+
+ if (!sscanf(i->name, name, &tmp))
+ continue;
+ if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
+ continue;
+
+ set_bit(tmp - min, inuse);
+ }
+
+ n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
+ if (n >= BITS_PER_BYTE * PAGE_SIZE) {
+ min += BITS_PER_BYTE * PAGE_SIZE;
+ memset(inuse, 0, PAGE_SIZE);
+ goto cont;
+ }
+ free_page((unsigned long)inuse);
+ }
+
+ snprintf(set->name, sizeof(set->name), name, min + n);
+ list_for_each_entry(i, &ctx->table->sets, list) {
+ if (!strcmp(set->name, i->name))
+ return -ENFILE;
+ }
+ return 0;
+}
+
+static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
+ const struct nft_set *set, u16 event, u16 flags)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *desc;
+ u32 portid = ctx->portid;
+ u32 seq = ctx->seq;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx->afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_NAME, set->name))
+ goto nla_put_failure;
+ if (set->flags != 0)
+ if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
+ goto nla_put_failure;
+ if (set->flags & NFT_SET_MAP) {
+ if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
+ goto nla_put_failure;
+ }
+
+ if (set->timeout &&
+ nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout)))
+ goto nla_put_failure;
+ if (set->gc_int &&
+ nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int)))
+ goto nla_put_failure;
+
+ if (set->policy != NFT_SET_POL_PERFORMANCE) {
+ if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy)))
+ goto nla_put_failure;
+ }
+
+ desc = nla_nest_start(skb, NFTA_SET_DESC);
+ if (desc == NULL)
+ goto nla_put_failure;
+ if (set->size &&
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ goto nla_put_failure;
+ nla_nest_end(skb, desc);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_set_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ int event, gfp_t gfp_flags)
+{
+ struct sk_buff *skb;
+ u32 portid = ctx->portid;
+ int err;
+
+ if (!ctx->report &&
+ !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_set(skb, ctx, set, event, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES,
+ ctx->report, gfp_flags);
+err:
+ if (err < 0)
+ nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
+ return err;
+}
+
+static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ unsigned int idx, s_idx = cb->args[0];
+ struct nft_af_info *afi;
+ struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
+ struct net *net = sock_net(skb->sk);
+ int cur_family = cb->args[3];
+ struct nft_ctx *ctx = cb->data, ctx_set;
+
+ if (cb->args[1])
+ return skb->len;
+
+ rcu_read_lock();
+ cb->seq = net->nft.base_seq;
+
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (ctx->afi && ctx->afi != afi)
+ continue;
+
+ if (cur_family) {
+ if (afi->family != cur_family)
+ continue;
+
+ cur_family = 0;
+ }
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (ctx->table && ctx->table != table)
+ continue;
+
+ if (cur_table) {
+ if (cur_table != table)
+ continue;
+
+ cur_table = NULL;
+ }
+ idx = 0;
+ list_for_each_entry_rcu(set, &table->sets, list) {
+ if (idx < s_idx)
+ goto cont;
+
+ ctx_set = *ctx;
+ ctx_set.table = table;
+ ctx_set.afi = afi;
+ if (nf_tables_fill_set(skb, &ctx_set, set,
+ NFT_MSG_NEWSET,
+ NLM_F_MULTI) < 0) {
+ cb->args[0] = idx;
+ cb->args[2] = (unsigned long) table;
+ cb->args[3] = afi->family;
+ goto done;
+ }
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+ idx++;
+ }
+ if (s_idx)
+ s_idx = 0;
+ }
+ }
+ cb->args[1] = 1;
+done:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int nf_tables_dump_sets_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
+static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+ struct sk_buff *skb2;
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ int err;
+
+ /* Verify existence before starting dump */
+ err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ if (err < 0)
+ return err;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_sets,
+ .done = nf_tables_dump_sets_done,
+ };
+ struct nft_ctx *ctx_dump;
+
+ ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+ if (ctx_dump == NULL)
+ return -ENOMEM;
+
+ *ctx_dump = ctx;
+ c.data = ctx_dump;
+
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+
+ /* Only accept unspec with dump */
+ if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ return -EAFNOSUPPORT;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+ struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ struct nlattr *da[NFTA_SET_DESC_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+ if (err < 0)
+ return err;
+
+ if (da[NFTA_SET_DESC_SIZE] != NULL)
+ desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+
+ return 0;
+}
+
+static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ const struct nft_set_ops *ops;
+ struct nft_af_info *afi;
+ struct net *net = sock_net(skb->sk);
+ struct nft_table *table;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ char name[IFNAMSIZ];
+ unsigned int size;
+ bool create;
+ u64 timeout;
+ u32 ktype, dtype, flags, policy, gc_int;
+ struct nft_set_desc desc;
+ int err;
+
+ if (nla[NFTA_SET_TABLE] == NULL ||
+ nla[NFTA_SET_NAME] == NULL ||
+ nla[NFTA_SET_KEY_LEN] == NULL ||
+ nla[NFTA_SET_ID] == NULL)
+ return -EINVAL;
+
+ memset(&desc, 0, sizeof(desc));
+
+ ktype = NFT_DATA_VALUE;
+ if (nla[NFTA_SET_KEY_TYPE] != NULL) {
+ ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+ if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+ return -EINVAL;
+ }
+
+ desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+ if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN)
+ return -EINVAL;
+
+ flags = 0;
+ if (nla[NFTA_SET_FLAGS] != NULL) {
+ flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
+ if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
+ NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
+ NFT_SET_MAP | NFT_SET_EVAL))
+ return -EINVAL;
+ /* Only one of both operations is supported */
+ if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) ==
+ (NFT_SET_MAP | NFT_SET_EVAL))
+ return -EOPNOTSUPP;
+ }
+
+ dtype = 0;
+ if (nla[NFTA_SET_DATA_TYPE] != NULL) {
+ if (!(flags & NFT_SET_MAP))
+ return -EINVAL;
+
+ dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+ if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+ dtype != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ if (dtype != NFT_DATA_VERDICT) {
+ if (nla[NFTA_SET_DATA_LEN] == NULL)
+ return -EINVAL;
+ desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
+ if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN)
+ return -EINVAL;
+ } else
+ desc.dlen = sizeof(struct nft_verdict);
+ } else if (flags & NFT_SET_MAP)
+ return -EINVAL;
+
+ timeout = 0;
+ if (nla[NFTA_SET_TIMEOUT] != NULL) {
+ if (!(flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT]));
+ }
+ gc_int = 0;
+ if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
+ if (!(flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
+ }
+
+ policy = NFT_SET_POL_PERFORMANCE;
+ if (nla[NFTA_SET_POLICY] != NULL)
+ policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+
+ if (nla[NFTA_SET_DESC] != NULL) {
+ err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+ if (err < 0)
+ return err;
+ }
+
+ create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+
+ nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+
+ set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set)) {
+ if (PTR_ERR(set) != -ENOENT)
+ return PTR_ERR(set);
+ set = NULL;
+ }
+
+ if (set != NULL) {
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+ return 0;
+ }
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -ENOENT;
+
+ ops = nft_select_set_ops(nla, &desc, policy);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ size = 0;
+ if (ops->privsize != NULL)
+ size = ops->privsize(nla);
+
+ err = -ENOMEM;
+ set = kzalloc(sizeof(*set) + size, GFP_KERNEL);
+ if (set == NULL)
+ goto err1;
+
+ nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name));
+ err = nf_tables_set_alloc_name(&ctx, set, name);
+ if (err < 0)
+ goto err2;
+
+ INIT_LIST_HEAD(&set->bindings);
+ write_pnet(&set->pnet, net);
+ set->ops = ops;
+ set->ktype = ktype;
+ set->klen = desc.klen;
+ set->dtype = dtype;
+ set->dlen = desc.dlen;
+ set->flags = flags;
+ set->size = desc.size;
+ set->policy = policy;
+ set->timeout = timeout;
+ set->gc_int = gc_int;
+
+ err = ops->init(set, &desc, nla);
+ if (err < 0)
+ goto err2;
+
+ err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
+ if (err < 0)
+ goto err2;
+
+ list_add_tail_rcu(&set->list, &table->sets);
+ table->use++;
+ return 0;
+
+err2:
+ kfree(set);
+err1:
+ module_put(ops->owner);
+ return err;
+}
+
+static void nft_set_destroy(struct nft_set *set)
+{
+ set->ops->destroy(set);
+ module_put(set->ops->owner);
+ kfree(set);
+}
+
+static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ list_del_rcu(&set->list);
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+ nft_set_destroy(set);
+}
+
+static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int err;
+
+ if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ return -EAFNOSUPPORT;
+ if (nla[NFTA_SET_TABLE] == NULL)
+ return -EINVAL;
+
+ err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+ if (!list_empty(&set->bindings))
+ return -EBUSY;
+
+ return nft_delset(&ctx, set);
+}
+
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ enum nft_registers dreg;
+
+ dreg = nft_type_to_reg(set->dtype);
+ return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext),
+ set->dtype == NFT_DATA_VERDICT ?
+ NFT_DATA_VERDICT : NFT_DATA_VALUE,
+ set->dlen);
+}
+
+int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ struct nft_set_binding *i;
+ struct nft_set_iter iter;
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS)
+ return -EBUSY;
+
+ if (binding->flags & NFT_SET_MAP) {
+ /* If the set is already bound to the same chain all
+ * jumps are already validated for that chain.
+ */
+ list_for_each_entry(i, &set->bindings, list) {
+ if (binding->flags & NFT_SET_MAP &&
+ i->chain == binding->chain)
+ goto bind;
+ }
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_bind_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0) {
+ /* Destroy anonymous sets if binding fails */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ nf_tables_set_destroy(ctx, set);
+
+ return iter.err;
+ }
+ }
+bind:
+ binding->chain = ctx->chain;
+ list_add_tail_rcu(&binding->list, &set->bindings);
+ return 0;
+}
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
+ list_del_rcu(&binding->list);
+
+ if (list_empty(&set->bindings) && set->flags & NFT_SET_ANONYMOUS &&
+ !(set->flags & NFT_SET_INACTIVE))
+ nf_tables_set_destroy(ctx, set);
+}
+
+const struct nft_set_ext_type nft_set_ext_types[] = {
+ [NFT_SET_EXT_KEY] = {
+ .align = __alignof__(u32),
+ },
+ [NFT_SET_EXT_DATA] = {
+ .align = __alignof__(u32),
+ },
+ [NFT_SET_EXT_EXPR] = {
+ .align = __alignof__(struct nft_expr),
+ },
+ [NFT_SET_EXT_FLAGS] = {
+ .len = sizeof(u8),
+ .align = __alignof__(u8),
+ },
+ [NFT_SET_EXT_TIMEOUT] = {
+ .len = sizeof(u64),
+ .align = __alignof__(u64),
+ },
+ [NFT_SET_EXT_EXPIRATION] = {
+ .len = sizeof(unsigned long),
+ .align = __alignof__(unsigned long),
+ },
+ [NFT_SET_EXT_USERDATA] = {
+ .len = sizeof(struct nft_userdata),
+ .align = __alignof__(struct nft_userdata),
+ },
+};
+EXPORT_SYMBOL_GPL(nft_set_ext_types);
+
+/*
+ * Set elements
+ */
+
+static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
+ [NFTA_SET_ELEM_KEY] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
+ [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
+};
+
+static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
+ [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
+};
+
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[],
+ bool trans)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_af_info *afi;
+ struct nft_table *table;
+ struct net *net = sock_net(skb->sk);
+
+ afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
+ if (IS_ERR(afi))
+ return PTR_ERR(afi);
+
+ table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ if (IS_ERR(table))
+ return PTR_ERR(table);
+ if (!trans && (table->flags & NFT_TABLE_INACTIVE))
+ return -ENOENT;
+
+ nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ return 0;
+}
+
+static int nf_tables_fill_setelem(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+ NFT_DATA_VALUE, set->klen) < 0)
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
+ set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
+ set->dlen) < 0)
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) &&
+ nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
+ htonl(*nft_set_ext_flags(ext))))
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
+ nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
+ cpu_to_be64(*nft_set_ext_timeout(ext))))
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
+ unsigned long expires, now = jiffies;
+
+ expires = *nft_set_ext_expiration(ext);
+ if (time_before(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+
+ if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
+ cpu_to_be64(jiffies_to_msecs(expires))))
+ goto nla_put_failure;
+ }
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
+ struct nft_userdata *udata;
+
+ udata = nft_set_ext_userdata(ext);
+ if (nla_put(skb, NFTA_SET_ELEM_USERDATA,
+ udata->len + 1, udata->data))
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+struct nft_set_dump_args {
+ const struct netlink_callback *cb;
+ struct nft_set_iter iter;
+ struct sk_buff *skb;
+};
+
+static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ struct nft_set_dump_args *args;
+
+ args = container_of(iter, struct nft_set_dump_args, iter);
+ return nf_tables_fill_setelem(args->skb, set, elem);
+}
+
+static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct nft_set *set;
+ struct nft_set_dump_args args;
+ struct nft_ctx ctx;
+ struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ u32 portid, seq;
+ int event, err;
+
+ err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla,
+ NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy);
+ if (err < 0)
+ return err;
+
+ err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
+ false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ event = NFT_MSG_NEWSETELEM;
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ portid = NETLINK_CB(cb->skb).portid;
+ seq = cb->nlh->nlmsg_seq;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ NLM_F_MULTI);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx.afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ args.cb = cb;
+ args.skb = skb;
+ args.iter.skip = cb->args[0];
+ args.iter.count = 0;
+ args.iter.err = 0;
+ args.iter.fn = nf_tables_dump_setelem;
+ set->ops->walk(&ctx, set, &args.iter);
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+
+ if (args.iter.err && args.iter.err != -EMSGSIZE)
+ return args.iter.err;
+ if (args.iter.count == cb->args[0])
+ return 0;
+
+ cb->args[0] = args.iter.count;
+ return skb->len;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+ int err;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (set->flags & NFT_SET_INACTIVE)
+ return -ENOENT;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nf_tables_dump_set,
+ };
+ return netlink_dump_start(nlsk, skb, nlh, &c);
+ }
+ return -EOPNOTSUPP;
+}
+
+static int nf_tables_fill_setelem_info(struct sk_buff *skb,
+ const struct nft_ctx *ctx, u32 seq,
+ u32 portid, int event, u16 flags,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ int err;
+
+ event |= NFNL_SUBSYS_NFTABLES << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
+ flags);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = ctx->afi->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
+
+ if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_SET_NAME, set->name))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ err = nf_tables_fill_setelem(skb, set, elem);
+ if (err < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -1;
+}
+
+static int nf_tables_setelem_notify(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ int event, u16 flags)
+{
+ struct net *net = ctx->net;
+ u32 portid = ctx->portid;
+ struct sk_buff *skb;
+ int err;
+
+ if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb == NULL)
+ goto err;
+
+ err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
+ set, elem);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
+ GFP_KERNEL);
+err:
+ if (err < 0)
+ nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+ return err;
+}
+
+static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+ int msg_type,
+ struct nft_set *set)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+ if (trans == NULL)
+ return NULL;
+
+ nft_trans_elem_set(trans) = set;
+ return trans;
+}
+
+void *nft_set_elem_init(const struct nft_set *set,
+ const struct nft_set_ext_tmpl *tmpl,
+ const u32 *key, const u32 *data,
+ u64 timeout, gfp_t gfp)
+{
+ struct nft_set_ext *ext;
+ void *elem;
+
+ elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
+ if (elem == NULL)
+ return NULL;
+
+ ext = nft_set_elem_ext(set, elem);
+ nft_set_ext_init(ext, tmpl);
+
+ memcpy(nft_set_ext_key(ext), key, set->klen);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ memcpy(nft_set_ext_data(ext), data, set->dlen);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
+ *nft_set_ext_expiration(ext) =
+ jiffies + msecs_to_jiffies(timeout);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
+ *nft_set_ext_timeout(ext) = timeout;
+
+ return elem;
+}
+
+void nft_set_elem_destroy(const struct nft_set *set, void *elem)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+
+ nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_uninit(nft_set_ext_data(ext), set->dtype);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+
+ kfree(elem);
+}
+EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+
+static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_data_desc d1, d2;
+ struct nft_set_ext_tmpl tmpl;
+ struct nft_set_ext *ext;
+ struct nft_set_elem elem;
+ struct nft_set_binding *binding;
+ struct nft_userdata *udata;
+ struct nft_data data;
+ enum nft_registers dreg;
+ struct nft_trans *trans;
+ u64 timeout;
+ u32 flags;
+ u8 ulen;
+ int err;
+
+ err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy);
+ if (err < 0)
+ return err;
+
+ if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ return -EINVAL;
+
+ nft_set_ext_prepare(&tmpl);
+
+ flags = 0;
+ if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
+ flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
+ if (flags & ~NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ if (!(set->flags & NFT_SET_INTERVAL) &&
+ flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ if (flags != 0)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ }
+
+ if (set->flags & NFT_SET_MAP) {
+ if (nla[NFTA_SET_ELEM_DATA] == NULL &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ if (nla[NFTA_SET_ELEM_DATA] != NULL &&
+ flags & NFT_SET_ELEM_INTERVAL_END)
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_DATA] != NULL)
+ return -EINVAL;
+ }
+
+ timeout = 0;
+ if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT]));
+ } else if (set->flags & NFT_SET_TIMEOUT) {
+ timeout = set->timeout;
+ }
+
+ err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err1;
+ err = -EINVAL;
+ if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
+ goto err2;
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len);
+ if (timeout > 0) {
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
+ if (timeout != set->timeout)
+ nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ }
+
+ if (nla[NFTA_SET_ELEM_DATA] != NULL) {
+ err = nft_data_init(ctx, &data, sizeof(data), &d2,
+ nla[NFTA_SET_ELEM_DATA]);
+ if (err < 0)
+ goto err2;
+
+ err = -EINVAL;
+ if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
+ goto err3;
+
+ dreg = nft_type_to_reg(set->dtype);
+ list_for_each_entry(binding, &set->bindings, list) {
+ struct nft_ctx bind_ctx = {
+ .afi = ctx->afi,
+ .table = ctx->table,
+ .chain = (struct nft_chain *)binding->chain,
+ };
+
+ if (!(binding->flags & NFT_SET_MAP))
+ continue;
+
+ err = nft_validate_register_store(&bind_ctx, dreg,
+ &data,
+ d2.type, d2.len);
+ if (err < 0)
+ goto err3;
+ }
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
+ }
+
+ /* The full maximum length of userdata can exceed the maximum
+ * offset value (U8_MAX) for following extensions, therefor it
+ * must be the last extension added.
+ */
+ ulen = 0;
+ if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
+ ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
+ if (ulen > 0)
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+ ulen);
+ }
+
+ err = -ENOMEM;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
+ timeout, GFP_KERNEL);
+ if (elem.priv == NULL)
+ goto err3;
+
+ ext = nft_set_elem_ext(set, elem.priv);
+ if (flags)
+ *nft_set_ext_flags(ext) = flags;
+ if (ulen > 0) {
+ udata = nft_set_ext_userdata(ext);
+ udata->len = ulen - 1;
+ nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
+ }
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
+ if (trans == NULL)
+ goto err4;
+
+ ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+ err = set->ops->insert(set, &elem);
+ if (err < 0)
+ goto err5;
+
+ nft_trans_elem(trans) = elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+
+err5:
+ kfree(trans);
+err4:
+ kfree(elem.priv);
+err3:
+ if (nla[NFTA_SET_ELEM_DATA] != NULL)
+ nft_data_uninit(&data, d2.type);
+err2:
+ nft_data_uninit(&elem.key.val, d1.type);
+err1:
+ return err;
+}
+
+static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nlattr *attr;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int rem, err = 0;
+
+ if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
+ return -EINVAL;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set)) {
+ if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
+ set = nf_tables_set_lookup_byid(net,
+ nla[NFTA_SET_ELEM_LIST_SET_ID]);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ return -EBUSY;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ if (set->size &&
+ !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact))
+ return -ENFILE;
+
+ err = nft_add_set_elem(&ctx, set, attr);
+ if (err < 0) {
+ atomic_dec(&set->nelems);
+ break;
+ }
+ }
+ return err;
+}
+
+static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+ struct nft_data_desc desc;
+ struct nft_set_elem elem;
+ struct nft_trans *trans;
+ int err;
+
+ err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy);
+ if (err < 0)
+ goto err1;
+
+ err = -EINVAL;
+ if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ goto err1;
+
+ err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err1;
+
+ err = -EINVAL;
+ if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
+ goto err2;
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ elem.priv = set->ops->deactivate(set, &elem);
+ if (elem.priv == NULL) {
+ err = -ENOENT;
+ goto err3;
+ }
+
+ nft_trans_elem(trans) = elem;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ return 0;
+
+err3:
+ kfree(trans);
+err2:
+ nft_data_uninit(&elem.key.val, desc.type);
+err1:
+ return err;
+}
+
+static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ const struct nlattr *attr;
+ struct nft_set *set;
+ struct nft_ctx ctx;
+ int rem, err = 0;
+
+ if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
+ return -EINVAL;
+
+ err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ if (err < 0)
+ return err;
+
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ return -EBUSY;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_del_setelem(&ctx, set, attr);
+ if (err < 0)
+ break;
+
+ set->ndeact++;
+ }
+ return err;
+}
+
+void nft_set_gc_batch_release(struct rcu_head *rcu)
+{
+ struct nft_set_gc_batch *gcb;
+ unsigned int i;
+
+ gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
+ for (i = 0; i < gcb->head.cnt; i++)
+ nft_set_elem_destroy(gcb->head.set, gcb->elems[i]);
+ kfree(gcb);
+}
+EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
+
+struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
+ gfp_t gfp)
+{
+ struct nft_set_gc_batch *gcb;
+
+ gcb = kzalloc(sizeof(*gcb), gfp);
+ if (gcb == NULL)
+ return gcb;
+ gcb->head.set = set;
+ return gcb;
+}
+EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
+
+static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
+ if (nlh == NULL)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
+
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ struct sk_buff *skb2;
+ int err;
+
+ if (nlmsg_report(nlh) &&
+ !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+ return 0;
+
+ err = -ENOBUFS;
+ skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ goto err;
+
+ err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq);
+ if (err < 0) {
+ kfree_skb(skb2);
+ goto err;
+ }
+
+ err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid,
+ NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL);
+err:
+ if (err < 0) {
+ nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
+ err);
+ }
+ return err;
+}
+
+static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nla[])
+{
+ struct net *net = sock_net(skb->sk);
+ struct sk_buff *skb2;
+ int err;
+
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq);
+ if (err < 0)
+ goto err;
+
+ return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+ kfree_skb(skb2);
+ return err;
+}
+
+static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
+ [NFT_MSG_NEWTABLE] = {
+ .call_batch = nf_tables_newtable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_GETTABLE] = {
+ .call = nf_tables_gettable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_DELTABLE] = {
+ .call_batch = nf_tables_deltable,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_NEWCHAIN] = {
+ .call_batch = nf_tables_newchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_GETCHAIN] = {
+ .call = nf_tables_getchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_DELCHAIN] = {
+ .call_batch = nf_tables_delchain,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_NEWRULE] = {
+ .call_batch = nf_tables_newrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_GETRULE] = {
+ .call = nf_tables_getrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_DELRULE] = {
+ .call_batch = nf_tables_delrule,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_NEWSET] = {
+ .call_batch = nf_tables_newset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_GETSET] = {
+ .call = nf_tables_getset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_DELSET] = {
+ .call_batch = nf_tables_delset,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_NEWSETELEM] = {
+ .call_batch = nf_tables_newsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETSETELEM] = {
+ .call = nf_tables_getsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_DELSETELEM] = {
+ .call_batch = nf_tables_delsetelem,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETGEN] = {
+ .call = nf_tables_getgen,
+ },
+};
+
+static void nft_chain_commit_update(struct nft_trans *trans)
+{
+ struct nft_base_chain *basechain;
+
+ if (nft_trans_chain_name(trans)[0])
+ strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans));
+
+ if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN))
+ return;
+
+ basechain = nft_base_chain(trans->ctx.chain);
+ nft_chain_stats_replace(basechain, nft_trans_chain_stats(trans));
+
+ switch (nft_trans_chain_policy(trans)) {
+ case NF_DROP:
+ case NF_ACCEPT:
+ basechain->policy = nft_trans_chain_policy(trans);
+ break;
+ }
+}
+
+static void nf_tables_commit_release(struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_DELRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_DELSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ case NFT_MSG_DELSETELEM:
+ nft_set_elem_destroy(nft_trans_elem_set(trans),
+ nft_trans_elem(trans).priv);
+ break;
+ }
+ kfree(trans);
+}
+
+static int nf_tables_commit(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_trans_elem *te;
+
+ /* Bump generation counter, invalidate any dump in progress */
+ while (++net->nft.base_seq == 0);
+
+ /* A new generation has just started */
+ net->nft.gencursor = nft_gencursor_next(net);
+
+ /* Make sure all packets have left the previous generation before
+ * purging old rules.
+ */
+ synchronize_rcu();
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ if (!nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ } else {
+ trans->ctx.table->flags &= ~NFT_TABLE_INACTIVE;
+ }
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELTABLE:
+ nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans))
+ nft_chain_commit_update(trans);
+ else
+ trans->ctx.chain->flags &= ~NFT_CHAIN_INACTIVE;
+
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELCHAIN:
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
+ nf_tables_unregister_hooks(trans->ctx.table,
+ trans->ctx.chain,
+ trans->ctx.afi->nops);
+ break;
+ case NFT_MSG_NEWRULE:
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_NEWRULE);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELRULE:
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ nf_tables_rule_notify(&trans->ctx,
+ nft_trans_rule(trans),
+ NFT_MSG_DELRULE);
+ break;
+ case NFT_MSG_NEWSET:
+ nft_trans_set(trans)->flags &= ~NFT_SET_INACTIVE;
+ /* This avoids hitting -EBUSY when deleting the table
+ * from the transaction.
+ */
+ if (nft_trans_set(trans)->flags & NFT_SET_ANONYMOUS &&
+ !list_empty(&nft_trans_set(trans)->bindings))
+ trans->ctx.table->use--;
+
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_NEWSET, GFP_KERNEL);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSET:
+ nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ NFT_MSG_DELSET, GFP_KERNEL);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ te = (struct nft_trans_elem *)trans->data;
+
+ te->set->ops->activate(te->set, &te->elem);
+ nf_tables_setelem_notify(&trans->ctx, te->set,
+ &te->elem,
+ NFT_MSG_NEWSETELEM, 0);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_DELSETELEM:
+ te = (struct nft_trans_elem *)trans->data;
+
+ nf_tables_setelem_notify(&trans->ctx, te->set,
+ &te->elem,
+ NFT_MSG_DELSETELEM, 0);
+ te->set->ops->remove(te->set, &te->elem);
+ atomic_dec(&te->set->nelems);
+ te->set->ndeact--;
+ break;
+ }
+ }
+
+ synchronize_rcu();
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ nf_tables_commit_release(trans);
+ }
+
+ nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+
+ return 0;
+}
+
+static void nf_tables_abort_release(struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ nf_tables_table_destroy(&trans->ctx);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ nf_tables_chain_destroy(trans->ctx.chain);
+ break;
+ case NFT_MSG_NEWRULE:
+ nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ break;
+ case NFT_MSG_NEWSET:
+ nft_set_destroy(nft_trans_set(trans));
+ break;
+ case NFT_MSG_NEWSETELEM:
+ nft_set_elem_destroy(nft_trans_elem_set(trans),
+ nft_trans_elem(trans).priv);
+ break;
+ }
+ kfree(trans);
+}
+
+static int nf_tables_abort(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nft_trans *trans, *next;
+ struct nft_trans_elem *te;
+
+ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWTABLE:
+ if (nft_trans_table_update(trans)) {
+ if (nft_trans_table_enable(trans)) {
+ nf_tables_table_disable(trans->ctx.afi,
+ trans->ctx.table);
+ trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ }
+ nft_trans_destroy(trans);
+ } else {
+ list_del_rcu(&trans->ctx.table->list);
+ }
+ break;
+ case NFT_MSG_DELTABLE:
+ list_add_tail_rcu(&trans->ctx.table->list,
+ &trans->ctx.afi->tables);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ free_percpu(nft_trans_chain_stats(trans));
+
+ nft_trans_destroy(trans);
+ } else {
+ trans->ctx.table->use--;
+ list_del_rcu(&trans->ctx.chain->list);
+ nf_tables_unregister_hooks(trans->ctx.table,
+ trans->ctx.chain,
+ trans->ctx.afi->nops);
+ }
+ break;
+ case NFT_MSG_DELCHAIN:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&trans->ctx.chain->list,
+ &trans->ctx.table->chains);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWRULE:
+ trans->ctx.chain->use--;
+ list_del_rcu(&nft_trans_rule(trans)->list);
+ break;
+ case NFT_MSG_DELRULE:
+ trans->ctx.chain->use++;
+ nft_rule_clear(trans->ctx.net, nft_trans_rule(trans));
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSET:
+ trans->ctx.table->use--;
+ list_del_rcu(&nft_trans_set(trans)->list);
+ break;
+ case NFT_MSG_DELSET:
+ trans->ctx.table->use++;
+ list_add_tail_rcu(&nft_trans_set(trans)->list,
+ &trans->ctx.table->sets);
+ nft_trans_destroy(trans);
+ break;
+ case NFT_MSG_NEWSETELEM:
+ te = (struct nft_trans_elem *)trans->data;
+
+ te->set->ops->remove(te->set, &te->elem);
+ atomic_dec(&te->set->nelems);
+ break;
+ case NFT_MSG_DELSETELEM:
+ te = (struct nft_trans_elem *)trans->data;
+
+ te->set->ops->activate(te->set, &te->elem);
+ te->set->ndeact--;
+
+ nft_trans_destroy(trans);
+ break;
+ }
+ }
+
+ synchronize_rcu();
+
+ list_for_each_entry_safe_reverse(trans, next,
+ &net->nft.commit_list, list) {
+ list_del(&trans->list);
+ nf_tables_abort_release(trans);
+ }
+
+ return 0;
+}
+
+static const struct nfnetlink_subsystem nf_tables_subsys = {
+ .name = "nf_tables",
+ .subsys_id = NFNL_SUBSYS_NFTABLES,
+ .cb_count = NFT_MSG_MAX,
+ .cb = nf_tables_cb,
+ .commit = nf_tables_commit,
+ .abort = nf_tables_abort,
+};
+
+int nft_chain_validate_dependency(const struct nft_chain *chain,
+ enum nft_chain_type type)
+{
+ const struct nft_base_chain *basechain;
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ basechain = nft_base_chain(chain);
+ if (basechain->type->type != type)
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate_dependency);
+
+int nft_chain_validate_hooks(const struct nft_chain *chain,
+ unsigned int hook_flags)
+{
+ struct nft_base_chain *basechain;
+
+ if (chain->flags & NFT_BASE_CHAIN) {
+ basechain = nft_base_chain(chain);
+
+ if ((1 << basechain->ops[0].hooknum) & hook_flags)
+ return 0;
+
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
+
+/*
+ * Loop detection - walk through the ruleset beginning at the destination chain
+ * of a new jump until either the source chain is reached (loop) or all
+ * reachable chains have been traversed.
+ *
+ * The loop check is performed whenever a new jump verdict is added to an
+ * expression or verdict map or a verdict map is bound to a new chain.
+ */
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain);
+
+static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_set_iter *iter,
+ const struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_data *data;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ return nf_tables_check_loops(ctx, data->verdict.chain);
+ default:
+ return 0;
+ }
+}
+
+static int nf_tables_check_loops(const struct nft_ctx *ctx,
+ const struct nft_chain *chain)
+{
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ const struct nft_set *set;
+ struct nft_set_binding *binding;
+ struct nft_set_iter iter;
+
+ if (ctx->chain == chain)
+ return -ELOOP;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ nft_rule_for_each_expr(expr, last, rule) {
+ const struct nft_data *data = NULL;
+ int err;
+
+ if (!expr->ops->validate)
+ continue;
+
+ err = expr->ops->validate(ctx, expr, &data);
+ if (err < 0)
+ return err;
+
+ if (data == NULL)
+ continue;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nf_tables_check_loops(ctx,
+ data->verdict.chain);
+ if (err < 0)
+ return err;
+ default:
+ break;
+ }
+ }
+ }
+
+ list_for_each_entry(set, &ctx->table->sets, list) {
+ if (!(set->flags & NFT_SET_MAP) ||
+ set->dtype != NFT_DATA_VERDICT)
+ continue;
+
+ list_for_each_entry(binding, &set->bindings, list) {
+ if (!(binding->flags & NFT_SET_MAP) ||
+ binding->chain != chain)
+ continue;
+
+ iter.skip = 0;
+ iter.count = 0;
+ iter.err = 0;
+ iter.fn = nf_tables_loop_check_setelem;
+
+ set->ops->walk(ctx, set, &iter);
+ if (iter.err < 0)
+ return iter.err;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * nft_parse_register - parse a register value from a netlink attribute
+ *
+ * @attr: netlink attribute
+ *
+ * Parse and translate a register value from a netlink attribute.
+ * Registers used to be 128 bit wide, these register numbers will be
+ * mapped to the corresponding 32 bit register numbers.
+ */
+unsigned int nft_parse_register(const struct nlattr *attr)
+{
+ unsigned int reg;
+
+ reg = ntohl(nla_get_be32(attr));
+ switch (reg) {
+ case NFT_REG_VERDICT...NFT_REG_4:
+ return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ default:
+ return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_parse_register);
+
+/**
+ * nft_dump_register - dump a register value to a netlink attribute
+ *
+ * @skb: socket buffer
+ * @attr: attribute number
+ * @reg: register number
+ *
+ * Construct a netlink attribute containing the register number. For
+ * compatibility reasons, register numbers being a multiple of 4 are
+ * translated to the corresponding 128 bit register numbers.
+ */
+int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
+{
+ if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE) == 0)
+ reg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE);
+ else
+ reg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00;
+
+ return nla_put_be32(skb, attr, htonl(reg));
+}
+EXPORT_SYMBOL_GPL(nft_dump_register);
+
+/**
+ * nft_validate_register_load - validate a load from a register
+ *
+ * @reg: the register number
+ * @len: the length of the data
+ *
+ * Validate that the input register is one of the general purpose
+ * registers and that the length of the load is within the bounds.
+ */
+int nft_validate_register_load(enum nft_registers reg, unsigned int len)
+{
+ if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
+ return -EINVAL;
+ if (len == 0)
+ return -EINVAL;
+ if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data))
+ return -ERANGE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_validate_register_load);
+
+/**
+ * nft_validate_register_store - validate an expressions' register store
+ *
+ * @ctx: context of the expression performing the load
+ * @reg: the destination register number
+ * @data: the data to load
+ * @type: the data type
+ * @len: the length of the data
+ *
+ * Validate that a data load uses the appropriate data type for
+ * the destination register and the length is within the bounds.
+ * A value of NULL for the data means that its runtime gathered
+ * data.
+ */
+int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ int err;
+
+ switch (reg) {
+ case NFT_REG_VERDICT:
+ if (type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ if (data != NULL &&
+ (data->verdict.code == NFT_GOTO ||
+ data->verdict.code == NFT_JUMP)) {
+ err = nf_tables_check_loops(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+
+ if (ctx->chain->level + 1 >
+ data->verdict.chain->level) {
+ if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
+ return -EMLINK;
+ data->verdict.chain->level = ctx->chain->level + 1;
+ }
+ }
+
+ return 0;
+ default:
+ if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
+ return -EINVAL;
+ if (len == 0)
+ return -EINVAL;
+ if (reg * NFT_REG32_SIZE + len >
+ FIELD_SIZEOF(struct nft_regs, data))
+ return -ERANGE;
+
+ if (data != NULL && type != NFT_DATA_VALUE)
+ return -EINVAL;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_validate_register_store);
+
+static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
+ [NFTA_VERDICT_CODE] = { .type = NLA_U32 },
+ [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
+ .len = NFT_CHAIN_MAXNAMELEN - 1 },
+};
+
+static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ struct nlattr *tb[NFTA_VERDICT_MAX + 1];
+ struct nft_chain *chain;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_VERDICT_CODE])
+ return -EINVAL;
+ data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
+
+ switch (data->verdict.code) {
+ default:
+ switch (data->verdict.code & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* fall through */
+ case NFT_CONTINUE:
+ case NFT_BREAK:
+ case NFT_RETURN:
+ break;
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (!tb[NFTA_VERDICT_CHAIN])
+ return -EINVAL;
+ chain = nf_tables_chain_lookup(ctx->table,
+ tb[NFTA_VERDICT_CHAIN]);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ if (chain->flags & NFT_BASE_CHAIN)
+ return -EOPNOTSUPP;
+
+ chain->use++;
+ data->verdict.chain = chain;
+ break;
+ }
+
+ desc->len = sizeof(data->verdict);
+ desc->type = NFT_DATA_VERDICT;
+ return 0;
+}
+
+static void nft_verdict_uninit(const struct nft_data *data)
+{
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ data->verdict.chain->use--;
+ break;
+ }
+}
+
+static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict.code)))
+ goto nla_put_failure;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ if (nla_put_string(skb, NFTA_VERDICT_CHAIN,
+ data->verdict.chain->name))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_value_init(const struct nft_ctx *ctx,
+ struct nft_data *data, unsigned int size,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ unsigned int len;
+
+ len = nla_len(nla);
+ if (len == 0)
+ return -EINVAL;
+ if (len > size)
+ return -EOVERFLOW;
+
+ nla_memcpy(data->data, nla, len);
+ desc->type = NFT_DATA_VALUE;
+ desc->len = len;
+ return 0;
+}
+
+static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
+ unsigned int len)
+{
+ return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
+}
+
+static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
+ [NFTA_DATA_VALUE] = { .type = NLA_BINARY },
+ [NFTA_DATA_VERDICT] = { .type = NLA_NESTED },
+};
+
+/**
+ * nft_data_init - parse nf_tables data netlink attributes
+ *
+ * @ctx: context of the expression using the data
+ * @data: destination struct nft_data
+ * @size: maximum data length
+ * @desc: data description
+ * @nla: netlink attribute containing data
+ *
+ * Parse the netlink data attributes and initialize a struct nft_data.
+ * The type and length of data are returned in the data description.
+ *
+ * The caller can indicate that it only wants to accept data of type
+ * NFT_DATA_VALUE by passing NULL for the ctx argument.
+ */
+int nft_data_init(const struct nft_ctx *ctx,
+ struct nft_data *data, unsigned int size,
+ struct nft_data_desc *desc, const struct nlattr *nla)
+{
+ struct nlattr *tb[NFTA_DATA_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DATA_VALUE])
+ return nft_value_init(ctx, data, size, desc,
+ tb[NFTA_DATA_VALUE]);
+ if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
+ return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(nft_data_init);
+
+/**
+ * nft_data_uninit - release a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ *
+ * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ * all others need to be released by calling this function.
+ */
+void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+{
+ if (type < NFT_DATA_VERDICT)
+ return;
+ switch (type) {
+ case NFT_DATA_VERDICT:
+ return nft_verdict_uninit(data);
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL_GPL(nft_data_uninit);
+
+int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start(skb, attr);
+ if (nest == NULL)
+ return -1;
+
+ switch (type) {
+ case NFT_DATA_VALUE:
+ err = nft_value_dump(skb, data, len);
+ break;
+ case NFT_DATA_VERDICT:
+ err = nft_verdict_dump(skb, data);
+ break;
+ default:
+ err = -EINVAL;
+ WARN_ON(1);
+ }
+
+ nla_nest_end(skb, nest);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nft_data_dump);
+
+static int nf_tables_init_net(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nft.af_info);
+ INIT_LIST_HEAD(&net->nft.commit_list);
+ net->nft.base_seq = 1;
+ return 0;
+}
+
+static struct pernet_operations nf_tables_net_ops = {
+ .init = nf_tables_init_net,
+};
+
+static int __init nf_tables_module_init(void)
+{
+ int err;
+
+ info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
+ GFP_KERNEL);
+ if (info == NULL) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = nf_tables_core_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nfnetlink_subsys_register(&nf_tables_subsys);
+ if (err < 0)
+ goto err3;
+
+ pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
+ return register_pernet_subsys(&nf_tables_net_ops);
+err3:
+ nf_tables_core_module_exit();
+err2:
+ kfree(info);
+err1:
+ return err;
+}
+
+static void __exit nf_tables_module_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_net_ops);
+ nfnetlink_subsys_unregister(&nf_tables_subsys);
+ rcu_barrier();
+ nf_tables_core_module_exit();
+ kfree(info);
+}
+
+module_init(nf_tables_module_init);
+module_exit(nf_tables_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
new file mode 100644
index 000000000..f153b0707
--- /dev/null
+++ b/net/netfilter/nf_tables_core.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+
+enum nft_trace {
+ NFT_TRACE_RULE,
+ NFT_TRACE_RETURN,
+ NFT_TRACE_POLICY,
+};
+
+static const char *const comments[] = {
+ [NFT_TRACE_RULE] = "rule",
+ [NFT_TRACE_RETURN] = "return",
+ [NFT_TRACE_POLICY] = "policy",
+};
+
+static struct nf_loginfo trace_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_WARNING,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+static void __nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace type)
+{
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
+ chain->table->name, chain->name, comments[type],
+ rulenum);
+}
+
+static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace type)
+{
+ if (unlikely(pkt->skb->nf_trace))
+ __nft_trace_packet(pkt, chain, rulenum, type);
+}
+
+static void nft_cmp_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ u32 mask = nft_cmp_fast_mask(priv->len);
+
+ if ((regs->data[priv->sreg] & mask) == priv->data)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
+static bool nft_payload_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ unsigned char *ptr;
+
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
+ ptr = skb_network_header(skb);
+ else
+ ptr = skb_network_header(skb) + pkt->xt.thoff;
+
+ ptr += priv->offset;
+
+ if (unlikely(ptr + priv->len >= skb_tail_pointer(skb)))
+ return false;
+
+ *dest = 0;
+ if (priv->len == 2)
+ *(u16 *)dest = *(u16 *)ptr;
+ else if (priv->len == 4)
+ *(u32 *)dest = *(u32 *)ptr;
+ else
+ *(u8 *)dest = *(u8 *)ptr;
+ return true;
+}
+
+struct nft_jumpstack {
+ const struct nft_chain *chain;
+ const struct nft_rule *rule;
+ int rulenum;
+};
+
+unsigned int
+nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
+{
+ const struct nft_chain *chain = ops->priv, *basechain = chain;
+ const struct net *net = read_pnet(&nft_base_chain(basechain)->pnet);
+ const struct nft_rule *rule;
+ const struct nft_expr *expr, *last;
+ struct nft_regs regs;
+ unsigned int stackptr = 0;
+ struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
+ struct nft_stats *stats;
+ int rulenum;
+ unsigned int gencursor = nft_genmask_cur(net);
+
+do_chain:
+ rulenum = 0;
+ rule = list_entry(&chain->rules, struct nft_rule, list);
+next_rule:
+ regs.verdict.code = NFT_CONTINUE;
+ list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
+
+ /* This rule is not active, skip. */
+ if (unlikely(rule->genmask & (1 << gencursor)))
+ continue;
+
+ rulenum++;
+
+ nft_rule_for_each_expr(expr, last, rule) {
+ if (expr->ops == &nft_cmp_fast_ops)
+ nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops != &nft_payload_fast_ops ||
+ !nft_payload_fast_eval(expr, &regs, pkt))
+ expr->ops->eval(expr, &regs, pkt);
+
+ if (regs.verdict.code != NFT_CONTINUE)
+ break;
+ }
+
+ switch (regs.verdict.code) {
+ case NFT_BREAK:
+ regs.verdict.code = NFT_CONTINUE;
+ continue;
+ case NFT_CONTINUE:
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ continue;
+ }
+ break;
+ }
+
+ switch (regs.verdict.code & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ return regs.verdict.code;
+ }
+
+ switch (regs.verdict.code) {
+ case NFT_JUMP:
+ BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
+ jumpstack[stackptr].chain = chain;
+ jumpstack[stackptr].rule = rule;
+ jumpstack[stackptr].rulenum = rulenum;
+ stackptr++;
+ /* fall through */
+ case NFT_GOTO:
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+
+ chain = regs.verdict.chain;
+ goto do_chain;
+ case NFT_CONTINUE:
+ rulenum++;
+ /* fall through */
+ case NFT_RETURN:
+ nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ if (stackptr > 0) {
+ stackptr--;
+ chain = jumpstack[stackptr].chain;
+ rule = jumpstack[stackptr].rule;
+ rulenum = jumpstack[stackptr].rulenum;
+ goto next_rule;
+ }
+
+ nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY);
+
+ rcu_read_lock_bh();
+ stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
+ u64_stats_update_begin(&stats->syncp);
+ stats->pkts++;
+ stats->bytes += pkt->skb->len;
+ u64_stats_update_end(&stats->syncp);
+ rcu_read_unlock_bh();
+
+ return nft_base_chain(basechain)->policy;
+}
+EXPORT_SYMBOL_GPL(nft_do_chain);
+
+int __init nf_tables_core_module_init(void)
+{
+ int err;
+
+ err = nft_immediate_module_init();
+ if (err < 0)
+ goto err1;
+
+ err = nft_cmp_module_init();
+ if (err < 0)
+ goto err2;
+
+ err = nft_lookup_module_init();
+ if (err < 0)
+ goto err3;
+
+ err = nft_bitwise_module_init();
+ if (err < 0)
+ goto err4;
+
+ err = nft_byteorder_module_init();
+ if (err < 0)
+ goto err5;
+
+ err = nft_payload_module_init();
+ if (err < 0)
+ goto err6;
+
+ err = nft_dynset_module_init();
+ if (err < 0)
+ goto err7;
+
+ return 0;
+
+err7:
+ nft_payload_module_exit();
+err6:
+ nft_byteorder_module_exit();
+err5:
+ nft_bitwise_module_exit();
+err4:
+ nft_lookup_module_exit();
+err3:
+ nft_cmp_module_exit();
+err2:
+ nft_immediate_module_exit();
+err1:
+ return err;
+}
+
+void nf_tables_core_module_exit(void)
+{
+ nft_dynset_module_exit();
+ nft_payload_module_exit();
+ nft_byteorder_module_exit();
+ nft_bitwise_module_exit();
+ nft_lookup_module_exit();
+ nft_cmp_module_exit();
+ nft_immediate_module_exit();
+}
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
new file mode 100644
index 000000000..9dd2d216c
--- /dev/null
+++ b/net/netfilter/nf_tables_inet.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/ip.h>
+
+static void nft_inet_hook_ops_init(struct nf_hook_ops *ops, unsigned int n)
+{
+ struct nft_af_info *afi;
+
+ if (n == 1)
+ afi = &nft_af_ipv4;
+ else
+ afi = &nft_af_ipv6;
+
+ ops->pf = afi->family;
+ if (afi->hooks[ops->hooknum])
+ ops->hook = afi->hooks[ops->hooknum];
+}
+
+static struct nft_af_info nft_af_inet __read_mostly = {
+ .family = NFPROTO_INET,
+ .nhooks = NF_INET_NUMHOOKS,
+ .owner = THIS_MODULE,
+ .nops = 2,
+ .hook_ops_init = nft_inet_hook_ops_init,
+};
+
+static int __net_init nf_tables_inet_init_net(struct net *net)
+{
+ net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+ if (net->nft.inet == NULL)
+ return -ENOMEM;
+ memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet));
+
+ if (nft_register_afinfo(net, net->nft.inet) < 0)
+ goto err;
+
+ return 0;
+
+err:
+ kfree(net->nft.inet);
+ return -ENOMEM;
+}
+
+static void __net_exit nf_tables_inet_exit_net(struct net *net)
+{
+ nft_unregister_afinfo(net->nft.inet);
+ kfree(net->nft.inet);
+}
+
+static struct pernet_operations nf_tables_inet_net_ops = {
+ .init = nf_tables_inet_init_net,
+ .exit = nf_tables_inet_exit_net,
+};
+
+static const struct nf_chain_type filter_inet = {
+ .name = "filter",
+ .type = NFT_CHAIN_T_DEFAULT,
+ .family = NFPROTO_INET,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING),
+};
+
+static int __init nf_tables_inet_init(void)
+{
+ int ret;
+
+ nft_register_chain_type(&filter_inet);
+ ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+ if (ret < 0)
+ nft_unregister_chain_type(&filter_inet);
+
+ return ret;
+}
+
+static void __exit nf_tables_inet_exit(void)
+{
+ unregister_pernet_subsys(&nf_tables_inet_net_ops);
+ nft_unregister_chain_type(&filter_inet);
+}
+
+module_init(nf_tables_inet_init);
+module_exit(nf_tables_inet_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_FAMILY(1);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 000000000..8b117c90e
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,540 @@
+/* Netfilter messages via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>,
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Initial netfilter messages via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <linux/init.h>
+
+#include <net/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+
+static char __initdata nfversion[] = "0.30";
+
+static struct {
+ struct mutex mutex;
+ const struct nfnetlink_subsystem __rcu *subsys;
+} table[NFNL_SUBSYS_COUNT];
+
+static const int nfnl_group2type[NFNLGRP_MAX+1] = {
+ [NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK,
+ [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+ [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES,
+ [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT,
+};
+
+void nfnl_lock(__u8 subsys_id)
+{
+ mutex_lock(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(nfnl_lock);
+
+void nfnl_unlock(__u8 subsys_id)
+{
+ mutex_unlock(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(nfnl_unlock);
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_nfnl_is_held(u8 subsys_id)
+{
+ return lockdep_is_held(&table[subsys_id].mutex);
+}
+EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
+#endif
+
+int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
+{
+ nfnl_lock(n->subsys_id);
+ if (table[n->subsys_id].subsys) {
+ nfnl_unlock(n->subsys_id);
+ return -EBUSY;
+ }
+ rcu_assign_pointer(table[n->subsys_id].subsys, n);
+ nfnl_unlock(n->subsys_id);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
+
+int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
+{
+ nfnl_lock(n->subsys_id);
+ table[n->subsys_id].subsys = NULL;
+ nfnl_unlock(n->subsys_id);
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
+
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+{
+ u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+
+ if (subsys_id >= NFNL_SUBSYS_COUNT)
+ return NULL;
+
+ return rcu_dereference(table[subsys_id].subsys);
+}
+
+static inline const struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
+{
+ u_int8_t cb_id = NFNL_MSG_TYPE(type);
+
+ if (cb_id >= ss->cb_count)
+ return NULL;
+
+ return &ss->cb[cb_id];
+}
+
+int nfnetlink_has_listeners(struct net *net, unsigned int group)
+{
+ return netlink_has_listeners(net->nfnl, group);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
+
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+ u32 dst_portid, gfp_t gfp_mask)
+{
+ return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
+
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
+ unsigned int group, int echo, gfp_t flags)
+{
+ return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_send);
+
+int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
+{
+ return netlink_set_err(net->nfnl, portid, group, error);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_set_err);
+
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
+ int flags)
+{
+ return netlink_unicast(net->nfnl, skb, portid, flags);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+
+/* Process one complete nfnetlink message. */
+static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ const struct nfnl_callback *nc;
+ const struct nfnetlink_subsystem *ss;
+ int type, err;
+
+ /* All the messages must at least contain nfgenmsg */
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
+ return 0;
+
+ type = nlh->nlmsg_type;
+replay:
+ rcu_read_lock();
+ ss = nfnetlink_get_subsys(type);
+ if (!ss) {
+#ifdef CONFIG_MODULES
+ rcu_read_unlock();
+ request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+ rcu_read_lock();
+ ss = nfnetlink_get_subsys(type);
+ if (!ss)
+#endif
+ {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ {
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ int attrlen = nlh->nlmsg_len - min_len;
+ __u8 subsys_id = NFNL_SUBSYS_ID(type);
+
+ err = nla_parse(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen, ss->cb[cb_id].policy);
+ if (err < 0) {
+ rcu_read_unlock();
+ return err;
+ }
+
+ if (nc->call_rcu) {
+ err = nc->call_rcu(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ nfnl_lock(subsys_id);
+ if (rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex)) != ss ||
+ nfnetlink_find_client(type, ss) != nc)
+ err = -EAGAIN;
+ else if (nc->call)
+ err = nc->call(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ else
+ err = -EINVAL;
+ nfnl_unlock(subsys_id);
+ }
+ if (err == -EAGAIN)
+ goto replay;
+ return err;
+ }
+}
+
+struct nfnl_err {
+ struct list_head head;
+ struct nlmsghdr *nlh;
+ int err;
+};
+
+static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err)
+{
+ struct nfnl_err *nfnl_err;
+
+ nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL);
+ if (nfnl_err == NULL)
+ return -ENOMEM;
+
+ nfnl_err->nlh = nlh;
+ nfnl_err->err = err;
+ list_add_tail(&nfnl_err->head, list);
+
+ return 0;
+}
+
+static void nfnl_err_del(struct nfnl_err *nfnl_err)
+{
+ list_del(&nfnl_err->head);
+ kfree(nfnl_err);
+}
+
+static void nfnl_err_reset(struct list_head *err_list)
+{
+ struct nfnl_err *nfnl_err, *next;
+
+ list_for_each_entry_safe(nfnl_err, next, err_list, head)
+ nfnl_err_del(nfnl_err);
+}
+
+static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb)
+{
+ struct nfnl_err *nfnl_err, *next;
+
+ list_for_each_entry_safe(nfnl_err, next, err_list, head) {
+ netlink_ack(skb, nfnl_err->nlh, nfnl_err->err);
+ nfnl_err_del(nfnl_err);
+ }
+}
+
+static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
+ u_int16_t subsys_id)
+{
+ struct sk_buff *oskb = skb;
+ struct net *net = sock_net(skb->sk);
+ const struct nfnetlink_subsystem *ss;
+ const struct nfnl_callback *nc;
+ bool success = true, done = false;
+ static LIST_HEAD(err_list);
+ int err;
+
+ if (subsys_id >= NFNL_SUBSYS_COUNT)
+ return netlink_ack(skb, nlh, -EINVAL);
+replay:
+ skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ if (!skb)
+ return netlink_ack(oskb, nlh, -ENOMEM);
+
+ skb->sk = oskb->sk;
+
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(subsys_id);
+ request_module("nfnetlink-subsys-%d", subsys_id);
+ nfnl_lock(subsys_id);
+ ss = rcu_dereference_protected(table[subsys_id].subsys,
+ lockdep_is_held(&table[subsys_id].mutex));
+ if (!ss)
+#endif
+ {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(skb);
+ }
+ }
+
+ if (!ss->commit || !ss->abort) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(skb, nlh, -EOPNOTSUPP);
+ return kfree_skb(skb);
+ }
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen, type;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
+ skb->len < nlh->nlmsg_len) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* Only requests are handled by the kernel */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ type = nlh->nlmsg_type;
+ if (type == NFNL_MSG_BATCH_BEGIN) {
+ /* Malformed: Batch begin twice */
+ nfnl_err_reset(&err_list);
+ success = false;
+ goto done;
+ } else if (type == NFNL_MSG_BATCH_END) {
+ done = true;
+ goto done;
+ } else if (type < NLMSG_MIN_TYPE) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ /* We only accept a batch with messages for the same
+ * subsystem.
+ */
+ if (NFNL_SUBSYS_ID(type) != subsys_id) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ err = -EINVAL;
+ goto ack;
+ }
+
+ {
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+ struct nlattr *attr = (void *)nlh + min_len;
+ int attrlen = nlh->nlmsg_len - min_len;
+
+ err = nla_parse(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen, ss->cb[cb_id].policy);
+ if (err < 0)
+ goto ack;
+
+ if (nc->call_batch) {
+ err = nc->call_batch(net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
+ }
+
+ /* The lock was released to autoload some module, we
+ * have to abort and start from scratch using the
+ * original skb.
+ */
+ if (err == -EAGAIN) {
+ nfnl_err_reset(&err_list);
+ ss->abort(oskb);
+ nfnl_unlock(subsys_id);
+ kfree_skb(skb);
+ goto replay;
+ }
+ }
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err) {
+ /* Errors are delivered once the full batch has been
+ * processed, this avoids that the same error is
+ * reported several times when replaying the batch.
+ */
+ if (nfnl_err_add(&err_list, nlh, err) < 0) {
+ /* We failed to enqueue an error, reset the
+ * list of errors and send OOM to userspace
+ * pointing to the batch header.
+ */
+ nfnl_err_reset(&err_list);
+ netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
+ success = false;
+ goto done;
+ }
+ /* We don't stop processing the batch on errors, thus,
+ * userspace gets all the errors that the batch
+ * triggers.
+ */
+ if (err)
+ success = false;
+ }
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+done:
+ if (success && done)
+ ss->commit(oskb);
+ else
+ ss->abort(oskb);
+
+ nfnl_err_deliver(&err_list, oskb);
+ nfnl_unlock(subsys_id);
+ kfree_skb(skb);
+}
+
+static void nfnetlink_rcv(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ int msglen;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len)
+ return;
+
+ if (!netlink_net_capable(skb, CAP_NET_ADMIN)) {
+ netlink_ack(skb, nlh, -EPERM);
+ return;
+ }
+
+ if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) {
+ struct nfgenmsg *nfgenmsg;
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
+ return;
+
+ nfgenmsg = nlmsg_data(nlh);
+ skb_pull(skb, msglen);
+ nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+ } else {
+ netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+ }
+}
+
+#ifdef CONFIG_MODULES
+static int nfnetlink_bind(struct net *net, int group)
+{
+ const struct nfnetlink_subsystem *ss;
+ int type;
+
+ if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+ return 0;
+
+ type = nfnl_group2type[group];
+
+ rcu_read_lock();
+ ss = nfnetlink_get_subsys(type);
+ rcu_read_unlock();
+ if (!ss)
+ request_module("nfnetlink-subsys-%d", type);
+ return 0;
+}
+#endif
+
+static int __net_init nfnetlink_net_init(struct net *net)
+{
+ struct sock *nfnl;
+ struct netlink_kernel_cfg cfg = {
+ .groups = NFNLGRP_MAX,
+ .input = nfnetlink_rcv,
+#ifdef CONFIG_MODULES
+ .bind = nfnetlink_bind,
+#endif
+ };
+
+ nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
+ if (!nfnl)
+ return -ENOMEM;
+ net->nfnl_stash = nfnl;
+ rcu_assign_pointer(net->nfnl, nfnl);
+ return 0;
+}
+
+static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list)
+ RCU_INIT_POINTER(net->nfnl, NULL);
+ synchronize_net();
+ list_for_each_entry(net, net_exit_list, exit_list)
+ netlink_kernel_release(net->nfnl_stash);
+}
+
+static struct pernet_operations nfnetlink_net_ops = {
+ .init = nfnetlink_net_init,
+ .exit_batch = nfnetlink_net_exit_batch,
+};
+
+static int __init nfnetlink_init(void)
+{
+ int i;
+
+ for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
+ BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
+
+ for (i=0; i<NFNL_SUBSYS_COUNT; i++)
+ mutex_init(&table[i].mutex);
+
+ pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
+ return register_pernet_subsys(&nfnetlink_net_ops);
+}
+
+static void __exit nfnetlink_exit(void)
+{
+ pr_info("Removing netfilter NETLINK layer.\n");
+ unregister_pernet_subsys(&nfnetlink_net_ops);
+}
+module_init(nfnetlink_init);
+module_exit(nfnetlink_exit);
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
new file mode 100644
index 000000000..c18af2f63
--- /dev/null
+++ b/net/netfilter/nfnetlink_acct.c
@@ -0,0 +1,512 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/atomic.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
+
+static LIST_HEAD(nfnl_acct_list);
+
+struct nf_acct {
+ atomic64_t pkts;
+ atomic64_t bytes;
+ unsigned long flags;
+ struct list_head head;
+ atomic_t refcnt;
+ char name[NFACCT_NAME_MAX];
+ struct rcu_head rcu_head;
+ char data[0];
+};
+
+struct nfacct_filter {
+ u32 value;
+ u32 mask;
+};
+
+#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
+#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
+
+static int
+nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ struct nf_acct *nfacct, *matching = NULL;
+ char *acct_name;
+ unsigned int size = 0;
+ u32 flags = 0;
+
+ if (!tb[NFACCT_NAME])
+ return -EINVAL;
+
+ acct_name = nla_data(tb[NFACCT_NAME]);
+ if (strlen(acct_name) == 0)
+ return -EINVAL;
+
+ list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+ if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ matching = nfacct;
+ break;
+ }
+
+ if (matching) {
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ /* reset counters if you request a replacement. */
+ atomic64_set(&matching->pkts, 0);
+ atomic64_set(&matching->bytes, 0);
+ smp_mb__before_atomic();
+ /* reset overquota flag if quota is enabled. */
+ if ((matching->flags & NFACCT_F_QUOTA))
+ clear_bit(NFACCT_OVERQUOTA_BIT,
+ &matching->flags);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
+ if (tb[NFACCT_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS]));
+ if (flags & ~NFACCT_F_QUOTA)
+ return -EOPNOTSUPP;
+ if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA)
+ return -EINVAL;
+ if (flags & NFACCT_F_OVERQUOTA)
+ return -EINVAL;
+
+ size += sizeof(u64);
+ }
+
+ nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL);
+ if (nfacct == NULL)
+ return -ENOMEM;
+
+ if (flags & NFACCT_F_QUOTA) {
+ u64 *quota = (u64 *)nfacct->data;
+
+ *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA]));
+ nfacct->flags = flags;
+ }
+
+ strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+
+ if (tb[NFACCT_BYTES]) {
+ atomic64_set(&nfacct->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES])));
+ }
+ if (tb[NFACCT_PKTS]) {
+ atomic64_set(&nfacct->pkts,
+ be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
+ }
+ atomic_set(&nfacct->refcnt, 1);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+ return 0;
+}
+
+static int
+nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct nf_acct *acct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ u64 pkts, bytes;
+ u32 old_flags;
+
+ event |= NFNL_SUBSYS_ACCT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFACCT_NAME, acct->name))
+ goto nla_put_failure;
+
+ old_flags = acct->flags;
+ if (type == NFNL_MSG_ACCT_GET_CTRZERO) {
+ pkts = atomic64_xchg(&acct->pkts, 0);
+ bytes = atomic64_xchg(&acct->bytes, 0);
+ smp_mb__before_atomic();
+ if (acct->flags & NFACCT_F_QUOTA)
+ clear_bit(NFACCT_OVERQUOTA_BIT, &acct->flags);
+ } else {
+ pkts = atomic64_read(&acct->pkts);
+ bytes = atomic64_read(&acct->bytes);
+ }
+ if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) ||
+ nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) ||
+ nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))))
+ goto nla_put_failure;
+ if (acct->flags & NFACCT_F_QUOTA) {
+ u64 *quota = (u64 *)acct->data;
+
+ if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) ||
+ nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota)))
+ goto nla_put_failure;
+ }
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_acct *cur, *last;
+ const struct nfacct_filter *filter = cb->data;
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct nf_acct *)cb->args[1];
+ if (cb->args[1])
+ cb->args[1] = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ if (last) {
+ if (cur != last)
+ continue;
+
+ last = NULL;
+ }
+
+ if (filter && (cur->flags & filter->mask) != filter->value)
+ continue;
+
+ if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ break;
+ }
+ }
+ if (!cb->args[1])
+ cb->args[2] = 1;
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int nfnl_acct_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
+static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = {
+ [NFACCT_FILTER_MASK] = { .type = NLA_U32 },
+ [NFACCT_FILTER_VALUE] = { .type = NLA_U32 },
+};
+
+static struct nfacct_filter *
+nfacct_filter_alloc(const struct nlattr * const attr)
+{
+ struct nfacct_filter *filter;
+ struct nlattr *tb[NFACCT_FILTER_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
+ if (!filter)
+ return ERR_PTR(-ENOMEM);
+
+ filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK]));
+ filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE]));
+
+ return filter;
+}
+
+static int
+nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = -ENOENT;
+ struct nf_acct *cur;
+ char *acct_name;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nfnl_acct_dump,
+ .done = nfnl_acct_done,
+ };
+
+ if (tb[NFACCT_FILTER]) {
+ struct nfacct_filter *filter;
+
+ filter = nfacct_filter_alloc(tb[NFACCT_FILTER]);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+
+ c.data = filter;
+ }
+ return netlink_dump_start(nfnl, skb, nlh, &c);
+ }
+
+ if (!tb[NFACCT_NAME])
+ return -EINVAL;
+ acct_name = nla_data(tb[NFACCT_NAME]);
+
+ list_for_each_entry(cur, &nfnl_acct_list, head) {
+ struct sk_buff *skb2;
+
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ return ret;
+}
+
+/* try to delete object, fail if it is still in use. */
+static int nfnl_acct_try_del(struct nf_acct *cur)
+{
+ int ret = 0;
+
+ /* we want to avoid races with nfnl_acct_find_get. */
+ if (atomic_dec_and_test(&cur->refcnt)) {
+ /* We are protected by nfnl mutex. */
+ list_del_rcu(&cur->head);
+ kfree_rcu(cur, rcu_head);
+ } else {
+ /* still in use, restore reference counter. */
+ atomic_inc(&cur->refcnt);
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+static int
+nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ char *acct_name;
+ struct nf_acct *cur;
+ int ret = -ENOENT;
+
+ if (!tb[NFACCT_NAME]) {
+ list_for_each_entry(cur, &nfnl_acct_list, head)
+ nfnl_acct_try_del(cur);
+
+ return 0;
+ }
+ acct_name = nla_data(tb[NFACCT_NAME]);
+
+ list_for_each_entry(cur, &nfnl_acct_list, head) {
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
+ continue;
+
+ ret = nfnl_acct_try_del(cur);
+ if (ret < 0)
+ return ret;
+
+ break;
+ }
+ return ret;
+}
+
+static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
+ [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 },
+ [NFACCT_BYTES] = { .type = NLA_U64 },
+ [NFACCT_PKTS] = { .type = NLA_U64 },
+ [NFACCT_FLAGS] = { .type = NLA_U32 },
+ [NFACCT_QUOTA] = { .type = NLA_U64 },
+ [NFACCT_FILTER] = {.type = NLA_NESTED },
+};
+
+static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
+ [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_acct_subsys = {
+ .name = "acct",
+ .subsys_id = NFNL_SUBSYS_ACCT,
+ .cb_count = NFNL_MSG_ACCT_MAX,
+ .cb = nfnl_acct_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
+
+struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+{
+ struct nf_acct *cur, *acct = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+ if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+ continue;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err;
+
+ if (!atomic_inc_not_zero(&cur->refcnt)) {
+ module_put(THIS_MODULE);
+ goto err;
+ }
+
+ acct = cur;
+ break;
+ }
+err:
+ rcu_read_unlock();
+ return acct;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
+
+void nfnl_acct_put(struct nf_acct *acct)
+{
+ atomic_dec(&acct->refcnt);
+ module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_put);
+
+void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+ atomic64_inc(&nfacct->pkts);
+ atomic64_add(skb->len, &nfacct->bytes);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_update);
+
+static void nfnl_overquota_report(struct nf_acct *nfacct)
+{
+ int ret;
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0,
+ nfacct);
+ if (ret <= 0) {
+ kfree_skb(skb);
+ return;
+ }
+ netlink_broadcast(init_net.nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
+ GFP_ATOMIC);
+}
+
+int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+ u64 now;
+ u64 *quota;
+ int ret = NFACCT_UNDERQUOTA;
+
+ /* no place here if we don't have a quota */
+ if (!(nfacct->flags & NFACCT_F_QUOTA))
+ return NFACCT_NO_QUOTA;
+
+ quota = (u64 *)nfacct->data;
+ now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ?
+ atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes);
+
+ ret = now > *quota;
+
+ if (now >= *quota &&
+ !test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) {
+ nfnl_overquota_report(nfacct);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
+
+static int __init nfnl_acct_init(void)
+{
+ int ret;
+
+ pr_info("nfnl_acct: registering with nfnetlink.\n");
+ ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
+ if (ret < 0) {
+ pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+ return 0;
+err_out:
+ return ret;
+}
+
+static void __exit nfnl_acct_exit(void)
+{
+ struct nf_acct *cur, *tmp;
+
+ pr_info("nfnl_acct: unregistering from nfnetlink.\n");
+ nfnetlink_subsys_unregister(&nfnl_acct_subsys);
+
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt. */
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+module_init(nfnl_acct_init);
+module_exit(nfnl_acct_exit);
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
new file mode 100644
index 000000000..54330fb5e
--- /dev/null
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -0,0 +1,683 @@
+/*
+ * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ *
+ * This software has been sponsored by Vyatta Inc. <http://www.vyatta.com>
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nfnetlink_cthelper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers");
+
+static int
+nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ const struct nf_conn_help *help;
+ struct nf_conntrack_helper *helper;
+
+ help = nfct_help(ct);
+ if (help == NULL)
+ return NF_DROP;
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(help->helper);
+ if (helper == NULL)
+ return NF_DROP;
+
+ /* This is an user-space helper not yet configured, skip. */
+ if ((helper->flags &
+ (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) ==
+ NF_CT_HELPER_F_USERSPACE)
+ return NF_ACCEPT;
+
+ /* If the user-space helper is not available, don't block traffic. */
+ return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS;
+}
+
+static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = {
+ [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, },
+ [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, },
+};
+
+static int
+nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nlattr *attr)
+{
+ int err;
+ struct nlattr *tb[NFCTH_TUPLE_MAX+1];
+
+ err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM])
+ return -EINVAL;
+
+ /* Not all fields are initialized so first zero the tuple */
+ memset(tuple, 0, sizeof(struct nf_conntrack_tuple));
+
+ tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM]));
+ tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]);
+
+ return 0;
+}
+
+static int
+nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+
+ if (attr == NULL)
+ return -EINVAL;
+
+ if (help->helper->data_len == 0)
+ return -EINVAL;
+
+ memcpy(help->data, nla_data(attr), help->helper->data_len);
+ return 0;
+}
+
+static int
+nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
+{
+ const struct nf_conn_help *help = nfct_help(ct);
+
+ if (help->helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -ENOSPC;
+}
+
+static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = {
+ [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN-1 },
+ [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, },
+ [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
+ const struct nlattr *attr)
+{
+ int err;
+ struct nlattr *tb[NFCTH_POLICY_MAX+1];
+
+ err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFCTH_POLICY_NAME] ||
+ !tb[NFCTH_POLICY_EXPECT_MAX] ||
+ !tb[NFCTH_POLICY_EXPECT_TIMEOUT])
+ return -EINVAL;
+
+ strncpy(expect_policy->name,
+ nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN);
+ expect_policy->max_expected =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+ expect_policy->timeout =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT]));
+
+ return 0;
+}
+
+static const struct nla_policy
+nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = {
+ [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, },
+};
+
+static int
+nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
+ const struct nlattr *attr)
+{
+ int i, ret;
+ struct nf_conntrack_expect_policy *expect_policy;
+ struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
+
+ ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
+ nfnl_cthelper_expect_policy_set);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[NFCTH_POLICY_SET_NUM])
+ return -EINVAL;
+
+ helper->expect_class_max =
+ ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM]));
+
+ if (helper->expect_class_max != 0 &&
+ helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES)
+ return -EOVERFLOW;
+
+ expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) *
+ helper->expect_class_max, GFP_KERNEL);
+ if (expect_policy == NULL)
+ return -ENOMEM;
+
+ for (i=0; i<helper->expect_class_max; i++) {
+ if (!tb[NFCTH_POLICY_SET+i])
+ goto err;
+
+ ret = nfnl_cthelper_expect_policy(&expect_policy[i],
+ tb[NFCTH_POLICY_SET+i]);
+ if (ret < 0)
+ goto err;
+ }
+ helper->expect_policy = expect_policy;
+ return 0;
+err:
+ kfree(expect_policy);
+ return -EINVAL;
+}
+
+static int
+nfnl_cthelper_create(const struct nlattr * const tb[],
+ struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_helper *helper;
+ int ret;
+
+ if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN])
+ return -EINVAL;
+
+ helper = kzalloc(sizeof(struct nf_conntrack_helper), GFP_KERNEL);
+ if (helper == NULL)
+ return -ENOMEM;
+
+ ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]);
+ if (ret < 0)
+ goto err;
+
+ strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN);
+ helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+ helper->flags |= NF_CT_HELPER_F_USERSPACE;
+ memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
+
+ helper->me = THIS_MODULE;
+ helper->help = nfnl_userspace_cthelper;
+ helper->from_nlattr = nfnl_cthelper_from_nlattr;
+ helper->to_nlattr = nfnl_cthelper_to_nlattr;
+
+ /* Default to queue number zero, this can be updated at any time. */
+ if (tb[NFCTH_QUEUE_NUM])
+ helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+ if (tb[NFCTH_STATUS]) {
+ int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+ switch(status) {
+ case NFCT_HELPER_STATUS_ENABLED:
+ helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+ break;
+ case NFCT_HELPER_STATUS_DISABLED:
+ helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+ break;
+ }
+ }
+
+ ret = nf_conntrack_helper_register(helper);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(helper);
+ return ret;
+}
+
+static int
+nfnl_cthelper_update(const struct nlattr * const tb[],
+ struct nf_conntrack_helper *helper)
+{
+ int ret;
+
+ if (tb[NFCTH_PRIV_DATA_LEN])
+ return -EBUSY;
+
+ if (tb[NFCTH_POLICY]) {
+ ret = nfnl_cthelper_parse_expect_policy(helper,
+ tb[NFCTH_POLICY]);
+ if (ret < 0)
+ return ret;
+ }
+ if (tb[NFCTH_QUEUE_NUM])
+ helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM]));
+
+ if (tb[NFCTH_STATUS]) {
+ int status = ntohl(nla_get_be32(tb[NFCTH_STATUS]));
+
+ switch(status) {
+ case NFCT_HELPER_STATUS_ENABLED:
+ helper->flags |= NF_CT_HELPER_F_CONFIGURED;
+ break;
+ case NFCT_HELPER_STATUS_DISABLED:
+ helper->flags &= ~NF_CT_HELPER_F_CONFIGURED;
+ break;
+ }
+ }
+ return 0;
+}
+
+static int
+nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ const char *helper_name;
+ struct nf_conntrack_helper *cur, *helper = NULL;
+ struct nf_conntrack_tuple tuple;
+ int ret = 0, i;
+
+ if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE])
+ return -EINVAL;
+
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ rcu_read_lock();
+ for (i = 0; i < nf_ct_helper_hsize && !helper; i++) {
+ hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0)
+ continue;
+
+ if ((tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ ret = -EEXIST;
+ goto err;
+ }
+ helper = cur;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (helper == NULL)
+ ret = nfnl_cthelper_create(tb, &tuple);
+ else
+ ret = nfnl_cthelper_update(tb, helper);
+
+ return ret;
+err:
+ rcu_read_unlock();
+ return ret;
+}
+
+static int
+nfnl_cthelper_dump_tuple(struct sk_buff *skb,
+ struct nf_conntrack_helper *helper)
+{
+ struct nlattr *nest_parms;
+
+ nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED);
+ if (nest_parms == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM,
+ htons(helper->tuple.src.l3num)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+nfnl_cthelper_dump_policy(struct sk_buff *skb,
+ struct nf_conntrack_helper *helper)
+{
+ int i;
+ struct nlattr *nest_parms1, *nest_parms2;
+
+ nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED);
+ if (nest_parms1 == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM,
+ htonl(helper->expect_class_max)))
+ goto nla_put_failure;
+
+ for (i=0; i<helper->expect_class_max; i++) {
+ nest_parms2 = nla_nest_start(skb,
+ (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
+ if (nest_parms2 == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_string(skb, NFCTH_POLICY_NAME,
+ helper->expect_policy[i].name))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX,
+ htonl(helper->expect_policy[i].max_expected)))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT,
+ htonl(helper->expect_policy[i].timeout)))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms2);
+ }
+ nla_nest_end(skb, nest_parms1);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int
+nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct nf_conntrack_helper *helper)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ int status;
+
+ event |= NFNL_SUBSYS_CTHELPER << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFCTH_NAME, helper->name))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num)))
+ goto nla_put_failure;
+
+ if (nfnl_cthelper_dump_tuple(skb, helper) < 0)
+ goto nla_put_failure;
+
+ if (nfnl_cthelper_dump_policy(skb, helper) < 0)
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len)))
+ goto nla_put_failure;
+
+ if (helper->flags & NF_CT_HELPER_F_CONFIGURED)
+ status = NFCT_HELPER_STATUS_ENABLED;
+ else
+ status = NFCT_HELPER_STATUS_DISABLED;
+
+ if (nla_put_be32(skb, NFCTH_STATUS, htonl(status)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_helper *cur, *last;
+
+ rcu_read_lock();
+ last = (struct nf_conntrack_helper *)cb->args[1];
+ for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) {
+restart:
+ hlist_for_each_entry_rcu(cur,
+ &nf_ct_helper_hash[cb->args[0]], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (cb->args[1]) {
+ if (cur != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (nfnl_cthelper_fill_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ NFNL_MSG_CTHELPER_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ goto out;
+ }
+ }
+ }
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+out:
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int
+nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = -ENOENT, i;
+ struct nf_conntrack_helper *cur;
+ struct sk_buff *skb2;
+ char *helper_name = NULL;
+ struct nf_conntrack_tuple tuple;
+ bool tuple_set = false;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = nfnl_cthelper_dump_table,
+ };
+ return netlink_dump_start(nfnl, skb, nlh, &c);
+ }
+
+ if (tb[NFCTH_NAME])
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ if (tb[NFCTH_TUPLE]) {
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ tuple_set = true;
+ }
+
+ for (i = 0; i < nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[i], hnode) {
+
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ if (helper_name && strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0) {
+ continue;
+ }
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_CTHELPER_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ }
+ return ret;
+}
+
+static int
+nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ char *helper_name = NULL;
+ struct nf_conntrack_helper *cur;
+ struct hlist_node *tmp;
+ struct nf_conntrack_tuple tuple;
+ bool tuple_set = false, found = false;
+ int i, j = 0, ret;
+
+ if (tb[NFCTH_NAME])
+ helper_name = nla_data(tb[NFCTH_NAME]);
+
+ if (tb[NFCTH_TUPLE]) {
+ ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]);
+ if (ret < 0)
+ return ret;
+
+ tuple_set = true;
+ }
+
+ for (i = 0; i < nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+ hnode) {
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ j++;
+
+ if (helper_name && strncmp(cur->name, helper_name,
+ NF_CT_HELPER_NAME_LEN) != 0) {
+ continue;
+ }
+ if (tuple_set &&
+ (tuple.src.l3num != cur->tuple.src.l3num ||
+ tuple.dst.protonum != cur->tuple.dst.protonum))
+ continue;
+
+ found = true;
+ nf_conntrack_helper_unregister(cur);
+ }
+ }
+ /* Make sure we return success if we flush and there is no helpers */
+ return (found || j == 0) ? 0 : -ENOENT;
+}
+
+static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
+ [NFCTH_NAME] = { .type = NLA_NUL_STRING,
+ .len = NF_CT_HELPER_NAME_LEN-1 },
+ [NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+};
+
+static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
+ [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
+ .name = "cthelper",
+ .subsys_id = NFNL_SUBSYS_CTHELPER,
+ .cb_count = NFNL_MSG_CTHELPER_MAX,
+ .cb = nfnl_cthelper_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER);
+
+static int __init nfnl_cthelper_init(void)
+{
+ int ret;
+
+ ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys);
+ if (ret < 0) {
+ pr_err("nfnl_cthelper: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+ return 0;
+err_out:
+ return ret;
+}
+
+static void __exit nfnl_cthelper_exit(void)
+{
+ struct nf_conntrack_helper *cur;
+ struct hlist_node *tmp;
+ int i;
+
+ nfnetlink_subsys_unregister(&nfnl_cthelper_subsys);
+
+ for (i=0; i<nf_ct_helper_hsize; i++) {
+ hlist_for_each_entry_safe(cur, tmp, &nf_ct_helper_hash[i],
+ hnode) {
+ /* skip non-userspace conntrack helpers. */
+ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE))
+ continue;
+
+ nf_conntrack_helper_unregister(cur);
+ }
+ }
+}
+
+module_init(nfnl_cthelper_init);
+module_exit(nfnl_cthelper_exit);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
new file mode 100644
index 000000000..476accd17
--- /dev/null
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -0,0 +1,585 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
+
+static LIST_HEAD(cttimeout_list);
+
+static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
+ [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING,
+ .len = CTNL_TIMEOUT_NAME_MAX - 1},
+ [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 },
+ [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 },
+ [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
+};
+
+static int
+ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto,
+ struct net *net, const struct nlattr *attr)
+{
+ int ret = 0;
+
+ if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
+ struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
+
+ ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+ attr, l4proto->ctnl_timeout.nla_policy);
+ if (ret < 0)
+ return ret;
+
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+ }
+ return ret;
+}
+
+static int
+cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct ctnl_timeout *timeout, *matching = NULL;
+ struct net *net = sock_net(skb->sk);
+ char *name;
+ int ret;
+
+ if (!cda[CTA_TIMEOUT_NAME] ||
+ !cda[CTA_TIMEOUT_L3PROTO] ||
+ !cda[CTA_TIMEOUT_L4PROTO] ||
+ !cda[CTA_TIMEOUT_DATA])
+ return -EINVAL;
+
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+
+ list_for_each_entry(timeout, &cttimeout_list, head) {
+ if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ matching = timeout;
+ break;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supportted, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err_proto_put;
+ }
+
+ if (matching) {
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ /* You cannot replace one timeout policy by another of
+ * different kind, sorry.
+ */
+ if (matching->l3num != l3num ||
+ matching->l4proto->l4proto != l4num) {
+ ret = -EINVAL;
+ goto err_proto_put;
+ }
+
+ ret = ctnl_timeout_parse_policy(&matching->data,
+ l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ return ret;
+ }
+ ret = -EBUSY;
+ goto err_proto_put;
+ }
+
+ timeout = kzalloc(sizeof(struct ctnl_timeout) +
+ l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
+ if (timeout == NULL) {
+ ret = -ENOMEM;
+ goto err_proto_put;
+ }
+
+ ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ if (ret < 0)
+ goto err;
+
+ strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME]));
+ timeout->l3num = l3num;
+ timeout->l4proto = l4proto;
+ atomic_set(&timeout->refcnt, 1);
+ list_add_tail_rcu(&timeout->head, &cttimeout_list);
+
+ return 0;
+err:
+ kfree(timeout);
+err_proto_put:
+ nf_ct_l4proto_put(l4proto);
+ return ret;
+}
+
+static int
+ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, struct ctnl_timeout *timeout)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+ struct nf_conntrack_l4proto *l4proto = timeout->l4proto;
+
+ event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
+ nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) ||
+ nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
+ nla_put_be32(skb, CTA_TIMEOUT_USE,
+ htonl(atomic_read(&timeout->refcnt))))
+ goto nla_put_failure;
+
+ if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+ struct nlattr *nest_parms;
+ int ret;
+
+ nest_parms = nla_nest_start(skb,
+ CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ }
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ctnl_timeout *cur, *last;
+
+ if (cb->args[2])
+ return 0;
+
+ last = (struct ctnl_timeout *)cb->args[1];
+ if (cb->args[1])
+ cb->args[1] = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cur, &cttimeout_list, head) {
+ if (last) {
+ if (cur != last)
+ continue;
+
+ last = NULL;
+ }
+ if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) {
+ cb->args[1] = (unsigned long)cur;
+ break;
+ }
+ }
+ if (!cb->args[1])
+ cb->args[2] = 1;
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static int
+cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ int ret = -ENOENT;
+ char *name;
+ struct ctnl_timeout *cur;
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = ctnl_timeout_dump,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
+
+ if (!cda[CTA_TIMEOUT_NAME])
+ return -EINVAL;
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+ list_for_each_entry(cur, &cttimeout_list, head) {
+ struct sk_buff *skb2;
+
+ if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_NEW, cur);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ break;
+ }
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+ }
+ return ret;
+}
+
+/* try to delete object, fail if it is still in use. */
+static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
+{
+ int ret = 0;
+
+ /* we want to avoid races with nf_ct_timeout_find_get. */
+ if (atomic_dec_and_test(&timeout->refcnt)) {
+ /* We are protected by nfnl mutex. */
+ list_del_rcu(&timeout->head);
+ nf_ct_l4proto_put(timeout->l4proto);
+ kfree_rcu(timeout, rcu_head);
+ } else {
+ /* still in use, restore reference counter. */
+ atomic_inc(&timeout->refcnt);
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+static int
+cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ char *name;
+ struct ctnl_timeout *cur;
+ int ret = -ENOENT;
+
+ if (!cda[CTA_TIMEOUT_NAME]) {
+ list_for_each_entry(cur, &cttimeout_list, head)
+ ctnl_timeout_try_del(cur);
+
+ return 0;
+ }
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+
+ list_for_each_entry(cur, &cttimeout_list, head) {
+ if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ ret = ctnl_timeout_try_del(cur);
+ if (ret < 0)
+ return ret;
+
+ break;
+ }
+ return ret;
+}
+
+static int
+cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct net *net = sock_net(skb->sk);
+ unsigned int *timeouts;
+ int ret;
+
+ if (!cda[CTA_TIMEOUT_L3PROTO] ||
+ !cda[CTA_TIMEOUT_L4PROTO] ||
+ !cda[CTA_TIMEOUT_DATA])
+ return -EINVAL;
+
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supported, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
+
+ timeouts = l4proto->get_timeouts(net);
+
+ ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+ cda[CTA_TIMEOUT_DATA]);
+ if (ret < 0)
+ goto err;
+
+ nf_ct_l4proto_put(l4proto);
+ return 0;
+err:
+ nf_ct_l4proto_put(l4proto);
+ return ret;
+}
+
+static int
+cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
+ u32 seq, u32 type, int event,
+ struct nf_conntrack_l4proto *l4proto)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+ event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = AF_UNSPEC;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) ||
+ nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
+ goto nla_put_failure;
+
+ if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
+ struct nlattr *nest_parms;
+ unsigned int *timeouts = l4proto->get_timeouts(net);
+ int ret;
+
+ nest_parms = nla_nest_start(skb,
+ CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+ }
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct net *net = sock_net(skb->sk);
+ struct sk_buff *skb2;
+ int ret, err;
+
+ if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
+ return -EINVAL;
+
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supported, skip. */
+ if (l4proto->l4proto != l4num) {
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+ l4proto);
+ if (ret <= 0) {
+ kfree_skb(skb2);
+ err = -ENOMEM;
+ goto err;
+ }
+ ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ /* this avoids a loop in nfnetlink. */
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+err:
+ nf_ct_l4proto_put(l4proto);
+ return err;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)
+{
+ struct ctnl_timeout *timeout, *matching = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(timeout, &cttimeout_list, head) {
+ if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err;
+
+ if (!atomic_inc_not_zero(&timeout->refcnt)) {
+ module_put(THIS_MODULE);
+ goto err;
+ }
+ matching = timeout;
+ break;
+ }
+err:
+ rcu_read_unlock();
+ return matching;
+}
+
+static void ctnl_timeout_put(struct ctnl_timeout *timeout)
+{
+ atomic_dec(&timeout->refcnt);
+ module_put(THIS_MODULE);
+}
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+
+static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
+ [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy },
+};
+
+static const struct nfnetlink_subsystem cttimeout_subsys = {
+ .name = "conntrack_timeout",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT,
+ .cb_count = IPCTNL_MSG_TIMEOUT_MAX,
+ .cb = cttimeout_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
+
+static int __init cttimeout_init(void)
+{
+ int ret;
+
+ ret = nfnetlink_subsys_register(&cttimeout_subsys);
+ if (ret < 0) {
+ pr_err("cttimeout_init: cannot register cttimeout with "
+ "nfnetlink.\n");
+ goto err_out;
+ }
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
+ RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+ return 0;
+
+err_out:
+ return ret;
+}
+
+static void __exit cttimeout_exit(void)
+{
+ struct ctnl_timeout *cur, *tmp;
+
+ pr_info("cttimeout: unregistering from nfnetlink.\n");
+
+ nfnetlink_subsys_unregister(&cttimeout_subsys);
+ list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt.
+ */
+ nf_ct_l4proto_put(cur->l4proto);
+ kfree_rcu(cur, rcu_head);
+ }
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
+ RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+}
+
+module_init(cttimeout_init);
+module_exit(cttimeout_exit);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 000000000..4ef1fae84
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1125 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nfnetlink_log.h>
+
+#include <linux/atomic.h>
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
+#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
+#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
+/* max packet size is limited by 16-bit struct nfattr nfa_len field */
+#define NFULNL_COPY_RANGE_MAX (0xFFFF - NLA_HDRLEN)
+
+#define PRINTR(x, args...) do { if (net_ratelimit()) \
+ printk(x, ## args); } while (0);
+
+struct nfulnl_instance {
+ struct hlist_node hlist; /* global list of instances */
+ spinlock_t lock;
+ atomic_t use; /* use count */
+
+ unsigned int qlen; /* number of nlmsgs in skb */
+ struct sk_buff *skb; /* pre-allocatd skb */
+ struct timer_list timer;
+ struct net *net;
+ struct user_namespace *peer_user_ns; /* User namespace of the peer process */
+ u32 peer_portid; /* PORTID of the peer process */
+
+ /* configurable parameters */
+ unsigned int flushtimeout; /* timeout until queue flush */
+ unsigned int nlbufsiz; /* netlink buffer allocation size */
+ unsigned int qthreshold; /* threshold of the queue */
+ u_int32_t copy_range;
+ u_int32_t seq; /* instance-local sequential counter */
+ u_int16_t group_num; /* number of this queue */
+ u_int16_t flags;
+ u_int8_t copy_mode;
+ struct rcu_head rcu;
+};
+
+#define INSTANCE_BUCKETS 16
+
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+ atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_log_net_id);
+}
+
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+ return ((group_num & 0xff) % INSTANCE_BUCKETS);
+}
+
+static struct nfulnl_instance *
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
+{
+ struct hlist_head *head;
+ struct nfulnl_instance *inst;
+
+ head = &log->instance_table[instance_hashfn(group_num)];
+ hlist_for_each_entry_rcu(inst, head, hlist) {
+ if (inst->group_num == group_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static inline void
+instance_get(struct nfulnl_instance *inst)
+{
+ atomic_inc(&inst->use);
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
+{
+ struct nfulnl_instance *inst;
+
+ rcu_read_lock_bh();
+ inst = __instance_lookup(log, group_num);
+ if (inst && !atomic_inc_not_zero(&inst->use))
+ inst = NULL;
+ rcu_read_unlock_bh();
+
+ return inst;
+}
+
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+ struct nfulnl_instance *inst =
+ container_of(head, struct nfulnl_instance, rcu);
+
+ put_net(inst->net);
+ kfree(inst);
+ module_put(THIS_MODULE);
+}
+
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+ if (inst && atomic_dec_and_test(&inst->use))
+ call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
+}
+
+static void nfulnl_timer(unsigned long data);
+
+static struct nfulnl_instance *
+instance_create(struct net *net, u_int16_t group_num,
+ u32 portid, struct user_namespace *user_ns)
+{
+ struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+ int err;
+
+ spin_lock_bh(&log->instances_lock);
+ if (__instance_lookup(log, group_num)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ if (!try_module_get(THIS_MODULE)) {
+ kfree(inst);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ INIT_HLIST_NODE(&inst->hlist);
+ spin_lock_init(&inst->lock);
+ /* needs to be two, since we _put() after creation */
+ atomic_set(&inst->use, 2);
+
+ setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+
+ inst->net = get_net(net);
+ inst->peer_user_ns = user_ns;
+ inst->peer_portid = portid;
+ inst->group_num = group_num;
+
+ inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
+ inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT;
+ inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT;
+ inst->copy_mode = NFULNL_COPY_PACKET;
+ inst->copy_range = NFULNL_COPY_RANGE_MAX;
+
+ hlist_add_head_rcu(&inst->hlist,
+ &log->instance_table[instance_hashfn(group_num)]);
+
+
+ spin_unlock_bh(&log->instances_lock);
+
+ return inst;
+
+out_unlock:
+ spin_unlock_bh(&log->instances_lock);
+ return ERR_PTR(err);
+}
+
+static void __nfulnl_flush(struct nfulnl_instance *inst);
+
+/* called with BH disabled */
+static void
+__instance_destroy(struct nfulnl_instance *inst)
+{
+ /* first pull it out of the global list */
+ hlist_del_rcu(&inst->hlist);
+
+ /* then flush all pending packets from skb */
+
+ spin_lock(&inst->lock);
+
+ /* lockless readers wont be able to use us */
+ inst->copy_mode = NFULNL_COPY_DISABLED;
+
+ if (inst->skb)
+ __nfulnl_flush(inst);
+ spin_unlock(&inst->lock);
+
+ /* and finally put the refcount */
+ instance_put(inst);
+}
+
+static inline void
+instance_destroy(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst)
+{
+ spin_lock_bh(&log->instances_lock);
+ __instance_destroy(inst);
+ spin_unlock_bh(&log->instances_lock);
+}
+
+static int
+nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
+ unsigned int range)
+{
+ int status = 0;
+
+ spin_lock_bh(&inst->lock);
+
+ switch (mode) {
+ case NFULNL_COPY_NONE:
+ case NFULNL_COPY_META:
+ inst->copy_mode = mode;
+ inst->copy_range = 0;
+ break;
+
+ case NFULNL_COPY_PACKET:
+ inst->copy_mode = mode;
+ if (range == 0)
+ range = NFULNL_COPY_RANGE_MAX;
+ inst->copy_range = min_t(unsigned int,
+ range, NFULNL_COPY_RANGE_MAX);
+ break;
+
+ default:
+ status = -EINVAL;
+ break;
+ }
+
+ spin_unlock_bh(&inst->lock);
+
+ return status;
+}
+
+static int
+nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
+{
+ int status;
+
+ spin_lock_bh(&inst->lock);
+ if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT)
+ status = -ERANGE;
+ else if (nlbufsiz > 131072)
+ status = -ERANGE;
+ else {
+ inst->nlbufsiz = nlbufsiz;
+ status = 0;
+ }
+ spin_unlock_bh(&inst->lock);
+
+ return status;
+}
+
+static int
+nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
+{
+ spin_lock_bh(&inst->lock);
+ inst->flushtimeout = timeout;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static int
+nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
+{
+ spin_lock_bh(&inst->lock);
+ inst->qthreshold = qthresh;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static int
+nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
+{
+ spin_lock_bh(&inst->lock);
+ inst->flags = flags;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static struct sk_buff *
+nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
+ unsigned int pkt_size)
+{
+ struct sk_buff *skb;
+ unsigned int n;
+
+ /* alloc skb which should be big enough for a whole multipart
+ * message. WARNING: has to be <= 128k due to slab restrictions */
+
+ n = max(inst_size, pkt_size);
+ skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
+ if (!skb) {
+ if (n > pkt_size) {
+ /* try to allocate only as much as we need for current
+ * packet */
+
+ skb = nfnetlink_alloc_skb(net, pkt_size,
+ peer_portid, GFP_ATOMIC);
+ }
+ }
+
+ return skb;
+}
+
+static void
+__nfulnl_send(struct nfulnl_instance *inst)
+{
+ if (inst->qlen > 1) {
+ struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0,
+ NLMSG_DONE,
+ sizeof(struct nfgenmsg),
+ 0);
+ if (WARN_ONCE(!nlh, "bad nlskb size: %u, tailroom %d\n",
+ inst->skb->len, skb_tailroom(inst->skb))) {
+ kfree_skb(inst->skb);
+ goto out;
+ }
+ }
+ nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
+ MSG_DONTWAIT);
+out:
+ inst->qlen = 0;
+ inst->skb = NULL;
+}
+
+static void
+__nfulnl_flush(struct nfulnl_instance *inst)
+{
+ /* timer holds a reference */
+ if (del_timer(&inst->timer))
+ instance_put(inst);
+ if (inst->skb)
+ __nfulnl_send(inst);
+}
+
+static void
+nfulnl_timer(unsigned long data)
+{
+ struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+
+ spin_lock_bh(&inst->lock);
+ if (inst->skb)
+ __nfulnl_send(inst);
+ spin_unlock_bh(&inst->lock);
+ instance_put(inst);
+}
+
+/* This is an inline function, we don't really care about a long
+ * list of arguments */
+static inline int
+__build_packet_message(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst,
+ const struct sk_buff *skb,
+ unsigned int data_len,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ const char *prefix, unsigned int plen)
+{
+ struct nfulnl_msg_packet_hdr pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ sk_buff_data_t old_tail = inst->skb->tail;
+ struct sock *sk;
+ const unsigned char *hwhdrp;
+
+ nlh = nlmsg_put(inst->skb, 0, 0,
+ NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh)
+ return -1;
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(inst->group_num);
+
+ memset(&pmsg, 0, sizeof(pmsg));
+ pmsg.hw_protocol = skb->protocol;
+ pmsg.hook = hooknum;
+
+ if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg))
+ goto nla_put_failure;
+
+ if (prefix &&
+ nla_put(inst->skb, NFULA_PREFIX, plen, prefix))
+ goto nla_put_failure;
+
+ if (indev) {
+#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ struct net_device *physindev;
+
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+
+ physindev = nf_bridge_get_physindev(skb);
+ if (physindev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ htonl(physindev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (outdev) {
+#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ struct net_device *physoutdev;
+
+ /* Case 2: indev is a bridge group, we need to look
+ * for physical device (when called from ipv4) */
+ if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+
+ physoutdev = nf_bridge_get_physoutdev(skb);
+ if (physoutdev &&
+ nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ htonl(physoutdev->ifindex)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (skb->mark &&
+ nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark)))
+ goto nla_put_failure;
+
+ if (indev && skb->dev &&
+ skb->mac_header != skb->network_header) {
+ struct nfulnl_msg_packet_hw phw;
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(skb, phw.hw_addr);
+ if (len > 0) {
+ phw.hw_addrlen = htons(len);
+ if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
+ }
+ }
+
+ if (indev && skb_mac_header_was_set(skb)) {
+ if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) ||
+ nla_put_be16(inst->skb, NFULA_HWLEN,
+ htons(skb->dev->hard_header_len)))
+ goto nla_put_failure;
+
+ hwhdrp = skb_mac_header(skb);
+
+ if (skb->dev->type == ARPHRD_SIT)
+ hwhdrp -= ETH_HLEN;
+
+ if (hwhdrp >= skb->head &&
+ nla_put(inst->skb, NFULA_HWHEADER,
+ skb->dev->hard_header_len, hwhdrp))
+ goto nla_put_failure;
+ }
+
+ if (skb->tstamp.tv64) {
+ struct nfulnl_msg_packet_timestamp ts;
+ struct timeval tv = ktime_to_timeval(skb->tstamp);
+ ts.sec = cpu_to_be64(tv.tv_sec);
+ ts.usec = cpu_to_be64(tv.tv_usec);
+
+ if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
+ }
+
+ /* UID */
+ sk = skb->sk;
+ if (sk && sk_fullsock(sk)) {
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ struct file *file = sk->sk_socket->file;
+ const struct cred *cred = file->f_cred;
+ struct user_namespace *user_ns = inst->peer_user_ns;
+ __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid));
+ __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid));
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
+ nla_put_be32(inst->skb, NFULA_GID, gid))
+ goto nla_put_failure;
+ } else
+ read_unlock_bh(&sk->sk_callback_lock);
+ }
+
+ /* local sequence number */
+ if ((inst->flags & NFULNL_CFG_F_SEQ) &&
+ nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++)))
+ goto nla_put_failure;
+
+ /* global sequence number */
+ if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
+ nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
+ htonl(atomic_inc_return(&log->global_seq))))
+ goto nla_put_failure;
+
+ if (data_len) {
+ struct nlattr *nla;
+ int size = nla_attr_size(data_len);
+
+ if (skb_tailroom(inst->skb) < nla_total_size(data_len))
+ goto nla_put_failure;
+
+ nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len));
+ nla->nla_type = NFULA_PAYLOAD;
+ nla->nla_len = size;
+
+ if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
+ BUG();
+ }
+
+ nlh->nlmsg_len = inst->skb->tail - old_tail;
+ return 0;
+
+nla_put_failure:
+ PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+ return -1;
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_ULOG,
+ .u = {
+ .ulog = {
+ .copy_len = 0xffff,
+ .group = 0,
+ .qthreshold = 1,
+ },
+ },
+};
+
+/* log handler for internal netfilter logging api */
+void
+nfulnl_log_packet(struct net *net,
+ u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *li_user,
+ const char *prefix)
+{
+ unsigned int size, data_len;
+ struct nfulnl_instance *inst;
+ const struct nf_loginfo *li;
+ unsigned int qthreshold;
+ unsigned int plen;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+
+ if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
+ li = li_user;
+ else
+ li = &default_loginfo;
+
+ inst = instance_lookup_get(log, li->u.ulog.group);
+ if (!inst)
+ return;
+
+ plen = 0;
+ if (prefix)
+ plen = strlen(prefix) + 1;
+
+ /* FIXME: do we want to make the size calculation conditional based on
+ * what is actually present? way more branches and checks, but more
+ * memory efficient... */
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* uid */
+ + nla_total_size(sizeof(u_int32_t)) /* gid */
+ + nla_total_size(plen) /* prefix */
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
+ + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp))
+ + nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */
+
+ if (in && skb_mac_header_was_set(skb)) {
+ size += nla_total_size(skb->dev->hard_header_len)
+ + nla_total_size(sizeof(u_int16_t)) /* hwtype */
+ + nla_total_size(sizeof(u_int16_t)); /* hwlen */
+ }
+
+ spin_lock_bh(&inst->lock);
+
+ if (inst->flags & NFULNL_CFG_F_SEQ)
+ size += nla_total_size(sizeof(u_int32_t));
+ if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
+ size += nla_total_size(sizeof(u_int32_t));
+
+ qthreshold = inst->qthreshold;
+ /* per-rule qthreshold overrides per-instance */
+ if (li->u.ulog.qthreshold)
+ if (qthreshold > li->u.ulog.qthreshold)
+ qthreshold = li->u.ulog.qthreshold;
+
+
+ switch (inst->copy_mode) {
+ case NFULNL_COPY_META:
+ case NFULNL_COPY_NONE:
+ data_len = 0;
+ break;
+
+ case NFULNL_COPY_PACKET:
+ if (inst->copy_range > skb->len)
+ data_len = skb->len;
+ else
+ data_len = inst->copy_range;
+
+ size += nla_total_size(data_len);
+ break;
+
+ case NFULNL_COPY_DISABLED:
+ default:
+ goto unlock_and_release;
+ }
+
+ if (inst->skb && size > skb_tailroom(inst->skb)) {
+ /* either the queue len is too high or we don't have
+ * enough room in the skb left. flush to userspace. */
+ __nfulnl_flush(inst);
+ }
+
+ if (!inst->skb) {
+ inst->skb = nfulnl_alloc_skb(net, inst->peer_portid,
+ inst->nlbufsiz, size);
+ if (!inst->skb)
+ goto alloc_failure;
+ }
+
+ inst->qlen++;
+
+ __build_packet_message(log, inst, skb, data_len, pf,
+ hooknum, in, out, prefix, plen);
+
+ if (inst->qlen >= qthreshold)
+ __nfulnl_flush(inst);
+ /* timer_pending always called within inst->lock, so there
+ * is no chance of a race here */
+ else if (!timer_pending(&inst->timer)) {
+ instance_get(inst);
+ inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
+ add_timer(&inst->timer);
+ }
+
+unlock_and_release:
+ spin_unlock_bh(&inst->lock);
+ instance_put(inst);
+ return;
+
+alloc_failure:
+ /* FIXME: statistics */
+ goto unlock_and_release;
+}
+EXPORT_SYMBOL_GPL(nfulnl_log_packet);
+
+static int
+nfulnl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+ struct nfnl_log_net *log = nfnl_log_pernet(n->net);
+
+ if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
+ int i;
+
+ /* destroy all instances for this portid */
+ spin_lock_bh(&log->instances_lock);
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *t2;
+ struct nfulnl_instance *inst;
+ struct hlist_head *head = &log->instance_table[i];
+
+ hlist_for_each_entry_safe(inst, t2, head, hlist) {
+ if (n->portid == inst->peer_portid)
+ __instance_destroy(inst);
+ }
+ }
+ spin_unlock_bh(&log->instances_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfulnl_rtnl_notifier = {
+ .notifier_call = nfulnl_rcv_nl_event,
+};
+
+static int
+nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ return -ENOTSUPP;
+}
+
+static struct nf_logger nfulnl_logger __read_mostly = {
+ .name = "nfnetlink_log",
+ .type = NF_LOG_TYPE_ULOG,
+ .logfn = &nfulnl_log_packet,
+ .me = THIS_MODULE,
+};
+
+static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
+ [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) },
+ [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) },
+ [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 },
+ [NFULA_CFG_QTHRESH] = { .type = NLA_U32 },
+ [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 },
+ [NFULA_CFG_FLAGS] = { .type = NLA_U16 },
+};
+
+static int
+nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfula[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int16_t group_num = ntohs(nfmsg->res_id);
+ struct nfulnl_instance *inst;
+ struct nfulnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+ int ret = 0;
+
+ if (nfula[NFULA_CFG_CMD]) {
+ u_int8_t pf = nfmsg->nfgen_family;
+ cmd = nla_data(nfula[NFULA_CFG_CMD]);
+
+ /* Commands without queue context */
+ switch (cmd->command) {
+ case NFULNL_CFG_CMD_PF_BIND:
+ return nf_log_bind_pf(net, pf, &nfulnl_logger);
+ case NFULNL_CFG_CMD_PF_UNBIND:
+ nf_log_unbind_pf(net, pf);
+ return 0;
+ }
+ }
+
+ inst = instance_lookup_get(log, group_num);
+ if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
+ ret = -EPERM;
+ goto out_put;
+ }
+
+ if (cmd != NULL) {
+ switch (cmd->command) {
+ case NFULNL_CFG_CMD_BIND:
+ if (inst) {
+ ret = -EBUSY;
+ goto out_put;
+ }
+
+ inst = instance_create(net, group_num,
+ NETLINK_CB(skb).portid,
+ sk_user_ns(NETLINK_CB(skb).sk));
+ if (IS_ERR(inst)) {
+ ret = PTR_ERR(inst);
+ goto out;
+ }
+ break;
+ case NFULNL_CFG_CMD_UNBIND:
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ instance_destroy(log, inst);
+ goto out_put;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ }
+
+ if (nfula[NFULA_CFG_MODE]) {
+ struct nfulnl_msg_config_mode *params;
+ params = nla_data(nfula[NFULA_CFG_MODE]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_mode(inst, params->copy_mode,
+ ntohl(params->copy_range));
+ }
+
+ if (nfula[NFULA_CFG_TIMEOUT]) {
+ __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_timeout(inst, ntohl(timeout));
+ }
+
+ if (nfula[NFULA_CFG_NLBUFSIZ]) {
+ __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
+ }
+
+ if (nfula[NFULA_CFG_QTHRESH]) {
+ __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_qthresh(inst, ntohl(qthresh));
+ }
+
+ if (nfula[NFULA_CFG_FLAGS]) {
+ __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]);
+
+ if (!inst) {
+ ret = -ENODEV;
+ goto out;
+ }
+ nfulnl_set_flags(inst, ntohs(flags));
+ }
+
+out_put:
+ instance_put(inst);
+out:
+ return ret;
+}
+
+static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+ [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
+ .attr_count = NFULA_MAX, },
+ [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
+ .attr_count = NFULA_CFG_MAX,
+ .policy = nfula_cfg_policy },
+};
+
+static const struct nfnetlink_subsystem nfulnl_subsys = {
+ .name = "log",
+ .subsys_id = NFNL_SUBSYS_ULOG,
+ .cb_count = NFULNL_MSG_MAX,
+ .cb = nfulnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
+{
+ struct nfnl_log_net *log;
+ if (!st)
+ return NULL;
+
+ log = nfnl_log_pernet(net);
+
+ for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+ struct hlist_head *head = &log->instance_table[st->bucket];
+
+ if (!hlist_empty(head))
+ return rcu_dereference_bh(hlist_first_rcu(head));
+ }
+ return NULL;
+}
+
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+ struct hlist_node *h)
+{
+ h = rcu_dereference_bh(hlist_next_rcu(h));
+ while (!h) {
+ struct nfnl_log_net *log;
+ struct hlist_head *head;
+
+ if (++st->bucket >= INSTANCE_BUCKETS)
+ return NULL;
+
+ log = nfnl_log_pernet(net);
+ head = &log->instance_table[st->bucket];
+ h = rcu_dereference_bh(hlist_first_rcu(head));
+ }
+ return h;
+}
+
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+ loff_t pos)
+{
+ struct hlist_node *head;
+ head = get_first(net, st);
+
+ if (head)
+ while (pos && (head = get_next(net, st, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(rcu_bh)
+{
+ rcu_read_lock_bh();
+ return get_idx(seq_file_net(s), s->private, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return get_next(seq_file_net(s), s->private, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+ __releases(rcu_bh)
+{
+ rcu_read_unlock_bh();
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ const struct nfulnl_instance *inst = v;
+
+ seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n",
+ inst->group_num,
+ inst->peer_portid, inst->qlen,
+ inst->copy_mode, inst->copy_range,
+ inst->flushtimeout, atomic_read(&inst->use));
+
+ return 0;
+}
+
+static const struct seq_operations nful_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nful_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &nful_seq_ops,
+ sizeof(struct iter_state));
+}
+
+static const struct file_operations nful_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nful_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif /* PROC_FS */
+
+static int __net_init nfnl_log_net_init(struct net *net)
+{
+ unsigned int i;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&log->instance_table[i]);
+ spin_lock_init(&log->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+#endif
+ nf_log_unset(net, &nfulnl_logger);
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+ .init = nfnl_log_net_init,
+ .exit = nfnl_log_net_exit,
+ .id = &nfnl_log_net_id,
+ .size = sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+ int status;
+
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("failed to register pernet ops\n");
+ goto out;
+ }
+
+ netlink_register_notifier(&nfulnl_rtnl_notifier);
+ status = nfnetlink_subsys_register(&nfulnl_subsys);
+ if (status < 0) {
+ pr_err("failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+ status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
+ if (status < 0) {
+ pr_err("failed to register logger\n");
+ goto cleanup_subsys;
+ }
+
+ return status;
+
+cleanup_subsys:
+ nfnetlink_subsys_unregister(&nfulnl_subsys);
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_log_net_ops);
+out:
+ return status;
+}
+
+static void __exit nfnetlink_log_fini(void)
+{
+ nf_log_unregister(&nfulnl_logger);
+ nfnetlink_subsys_unregister(&nfulnl_subsys);
+ netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_log_net_ops);
+}
+
+MODULE_DESCRIPTION("netfilter userspace logging");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
+
+module_init(nfnetlink_log_init);
+module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
new file mode 100644
index 000000000..11c7682fa
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -0,0 +1,1362 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2007 by Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+#include <linux/atomic.h>
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
+ * includes the header length. Thus, the maximum packet length that we
+ * support is 65531 bytes. We send truncated packets if the specified length
+ * is larger than that. Userspace can check for presence of NFQA_CAP_LEN
+ * attribute to detect truncation.
+ */
+#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
+
+struct nfqnl_instance {
+ struct hlist_node hlist; /* global list of queues */
+ struct rcu_head rcu;
+
+ u32 peer_portid;
+ unsigned int queue_maxlen;
+ unsigned int copy_range;
+ unsigned int queue_dropped;
+ unsigned int queue_user_dropped;
+
+
+ u_int16_t queue_num; /* number of this queue */
+ u_int8_t copy_mode;
+ u_int32_t flags; /* Set using NFQA_CFG_FLAGS */
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+ spinlock_t lock;
+ unsigned int queue_total;
+ unsigned int id_sequence; /* 'sequence' of pkt ids */
+ struct list_head queue_list; /* packets in queue */
+};
+
+typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+static int nfnl_queue_net_id __read_mostly;
+
+#define INSTANCE_BUCKETS 16
+struct nfnl_queue_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+};
+
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_queue_net_id);
+}
+
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+ return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
+}
+
+static struct nfqnl_instance *
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
+{
+ struct hlist_head *head;
+ struct nfqnl_instance *inst;
+
+ head = &q->instance_table[instance_hashfn(queue_num)];
+ hlist_for_each_entry_rcu(inst, head, hlist) {
+ if (inst->queue_num == queue_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static struct nfqnl_instance *
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
+{
+ struct nfqnl_instance *inst;
+ unsigned int h;
+ int err;
+
+ spin_lock(&q->instances_lock);
+ if (instance_lookup(q, queue_num)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ inst->queue_num = queue_num;
+ inst->peer_portid = portid;
+ inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+ inst->copy_range = NFQNL_MAX_COPY_RANGE;
+ inst->copy_mode = NFQNL_COPY_NONE;
+ spin_lock_init(&inst->lock);
+ INIT_LIST_HEAD(&inst->queue_list);
+
+ if (!try_module_get(THIS_MODULE)) {
+ err = -EAGAIN;
+ goto out_free;
+ }
+
+ h = instance_hashfn(queue_num);
+ hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
+
+ spin_unlock(&q->instances_lock);
+
+ return inst;
+
+out_free:
+ kfree(inst);
+out_unlock:
+ spin_unlock(&q->instances_lock);
+ return ERR_PTR(err);
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+ unsigned long data);
+
+static void
+instance_destroy_rcu(struct rcu_head *head)
+{
+ struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
+ rcu);
+
+ nfqnl_flush(inst, NULL, 0);
+ kfree(inst);
+ module_put(THIS_MODULE);
+}
+
+static void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+ hlist_del_rcu(&inst->hlist);
+ call_rcu(&inst->rcu, instance_destroy_rcu);
+}
+
+static void
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
+{
+ spin_lock(&q->instances_lock);
+ __instance_destroy(inst);
+ spin_unlock(&q->instances_lock);
+}
+
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+ list_add_tail(&entry->list, &queue->queue_list);
+ queue->queue_total++;
+}
+
+static void
+__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+ list_del(&entry->list);
+ queue->queue_total--;
+}
+
+static struct nf_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+{
+ struct nf_queue_entry *entry = NULL, *i;
+
+ spin_lock_bh(&queue->lock);
+
+ list_for_each_entry(i, &queue->queue_list, list) {
+ if (i->id == id) {
+ entry = i;
+ break;
+ }
+ }
+
+ if (entry)
+ __dequeue_entry(queue, entry);
+
+ spin_unlock_bh(&queue->lock);
+
+ return entry;
+}
+
+static void
+nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
+{
+ struct nf_queue_entry *entry, *next;
+
+ spin_lock_bh(&queue->lock);
+ list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
+ if (!cmpfn || cmpfn(entry, data)) {
+ list_del(&entry->list);
+ queue->queue_total--;
+ nf_reinject(entry, NF_DROP);
+ }
+ }
+ spin_unlock_bh(&queue->lock);
+}
+
+static int
+nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
+ bool csum_verify)
+{
+ __u32 flags = 0;
+
+ if (packet->ip_summed == CHECKSUM_PARTIAL)
+ flags = NFQA_SKB_CSUMNOTREADY;
+ else if (csum_verify)
+ flags = NFQA_SKB_CSUM_NOTVERIFIED;
+
+ if (skb_is_gso(packet))
+ flags |= NFQA_SKB_GSO;
+
+ return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0;
+}
+
+static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
+{
+ const struct cred *cred;
+
+ if (!sk_fullsock(sk))
+ return 0;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ cred = sk->sk_socket->file->f_cred;
+ if (nla_put_be32(skb, NFQA_UID,
+ htonl(from_kuid_munged(&init_user_ns, cred->fsuid))))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFQA_GID,
+ htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
+ goto nla_put_failure;
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+
+nla_put_failure:
+ read_unlock_bh(&sk->sk_callback_lock);
+ return -1;
+}
+
+static struct sk_buff *
+nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry,
+ __be32 **packet_id_ptr)
+{
+ size_t size;
+ size_t data_len = 0, cap_len = 0;
+ unsigned int hlen = 0;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ struct nfqnl_msg_packet_hdr *pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct sk_buff *entskb = entry->skb;
+ struct net_device *indev;
+ struct net_device *outdev;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
+ bool csum_verify;
+
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ + nla_total_size(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ + nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ + nla_total_size(sizeof(u_int32_t)); /* cap_len */
+
+ if (entskb->tstamp.tv64)
+ size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+
+ if (entry->state.hook <= NF_INET_FORWARD ||
+ (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
+ csum_verify = !skb_csum_unnecessary(entskb);
+ else
+ csum_verify = false;
+
+ outdev = entry->state.out;
+
+ switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
+ case NFQNL_COPY_META:
+ case NFQNL_COPY_NONE:
+ break;
+
+ case NFQNL_COPY_PACKET:
+ if (!(queue->flags & NFQA_CFG_F_GSO) &&
+ entskb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(entskb))
+ return NULL;
+
+ data_len = ACCESS_ONCE(queue->copy_range);
+ if (data_len > entskb->len)
+ data_len = entskb->len;
+
+ hlen = skb_zerocopy_headlen(entskb);
+ hlen = min_t(unsigned int, hlen, data_len);
+ size += sizeof(struct nlattr) + hlen;
+ cap_len = entskb->len;
+ break;
+ }
+
+ if (queue->flags & NFQA_CFG_F_CONNTRACK)
+ ct = nfqnl_ct_get(entskb, &size, &ctinfo);
+
+ if (queue->flags & NFQA_CFG_F_UID_GID) {
+ size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ + nla_total_size(sizeof(u_int32_t))); /* gid */
+ }
+
+ skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+ GFP_ATOMIC);
+ if (!skb) {
+ skb_tx_error(entskb);
+ return NULL;
+ }
+
+ nlh = nlmsg_put(skb, 0, 0,
+ NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+ sizeof(struct nfgenmsg), 0);
+ if (!nlh) {
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ return NULL;
+ }
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = entry->state.pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(queue->queue_num);
+
+ nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
+ pmsg = nla_data(nla);
+ pmsg->hw_protocol = entskb->protocol;
+ pmsg->hook = entry->state.hook;
+ *packet_id_ptr = &pmsg->packet_id;
+
+ indev = entry->state.in;
+ if (indev) {
+#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->state.pf == PF_BRIDGE) {
+ /* Case 1: indev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(indev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ int physinif;
+
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
+ htonl(indev->ifindex)))
+ goto nla_put_failure;
+
+ physinif = nf_bridge_get_physinif(entskb);
+ if (physinif &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
+ htonl(physinif)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (outdev) {
+#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
+ goto nla_put_failure;
+#else
+ if (entry->state.pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(outdev->ifindex)) ||
+ /* this is the bridge group "brX" */
+ /* rcu_read_lock()ed by __nf_queue */
+ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
+ goto nla_put_failure;
+ } else {
+ int physoutif;
+
+ /* Case 2: outdev is bridge group, we need to look for
+ * physical output device (when called from ipv4) */
+ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
+ htonl(outdev->ifindex)))
+ goto nla_put_failure;
+
+ physoutif = nf_bridge_get_physoutif(entskb);
+ if (physoutif &&
+ nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ htonl(physoutif)))
+ goto nla_put_failure;
+ }
+#endif
+ }
+
+ if (entskb->mark &&
+ nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
+ goto nla_put_failure;
+
+ if (indev && entskb->dev &&
+ entskb->mac_header != entskb->network_header) {
+ struct nfqnl_msg_packet_hw phw;
+ int len;
+
+ memset(&phw, 0, sizeof(phw));
+ len = dev_parse_header(entskb, phw.hw_addr);
+ if (len) {
+ phw.hw_addrlen = htons(len);
+ if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
+ goto nla_put_failure;
+ }
+ }
+
+ if (entskb->tstamp.tv64) {
+ struct nfqnl_msg_packet_timestamp ts;
+ struct timeval tv = ktime_to_timeval(entskb->tstamp);
+ ts.sec = cpu_to_be64(tv.tv_sec);
+ ts.usec = cpu_to_be64(tv.tv_usec);
+
+ if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
+ goto nla_put_failure;
+ }
+
+ if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
+ nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ goto nla_put_failure;
+
+ if (cap_len > data_len &&
+ nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+ goto nla_put_failure;
+
+ if (nfqnl_put_packet_info(skb, entskb, csum_verify))
+ goto nla_put_failure;
+
+ if (data_len) {
+ struct nlattr *nla;
+
+ if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+ goto nla_put_failure;
+
+ nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
+ nla->nla_type = NFQA_PAYLOAD;
+ nla->nla_len = nla_attr_size(data_len);
+
+ if (skb_zerocopy(skb, entskb, data_len, hlen))
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb->len;
+ return skb;
+
+nla_put_failure:
+ skb_tx_error(entskb);
+ kfree_skb(skb);
+ net_err_ratelimited("nf_queue: error creating packet message\n");
+ return NULL;
+}
+
+static int
+__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry)
+{
+ struct sk_buff *nskb;
+ int err = -ENOBUFS;
+ __be32 *packet_id_ptr;
+ int failopen = 0;
+
+ nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
+ if (nskb == NULL) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ spin_lock_bh(&queue->lock);
+
+ if (queue->queue_total >= queue->queue_maxlen) {
+ if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+ failopen = 1;
+ err = 0;
+ } else {
+ queue->queue_dropped++;
+ net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
+ queue->queue_total);
+ }
+ goto err_out_free_nskb;
+ }
+ entry->id = ++queue->id_sequence;
+ *packet_id_ptr = htonl(entry->id);
+
+ /* nfnetlink_unicast will either free the nskb or add it to a socket */
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+ if (err < 0) {
+ queue->queue_user_dropped++;
+ goto err_out_unlock;
+ }
+
+ __enqueue_entry(queue, entry);
+
+ spin_unlock_bh(&queue->lock);
+ return 0;
+
+err_out_free_nskb:
+ kfree_skb(nskb);
+err_out_unlock:
+ spin_unlock_bh(&queue->lock);
+ if (failopen)
+ nf_reinject(entry, NF_ACCEPT);
+err_out:
+ return err;
+}
+
+static struct nf_queue_entry *
+nf_queue_entry_dup(struct nf_queue_entry *e)
+{
+ struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
+ if (entry) {
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+ kfree(entry);
+ }
+ return NULL;
+}
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+/* When called from bridge netfilter, skb->data must point to MAC header
+ * before calling skb_gso_segment(). Else, original MAC header is lost
+ * and segmented skbs will be sent to wrong destination.
+ */
+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_push(skb, skb->network_header - skb->mac_header);
+}
+
+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_pull(skb, skb->network_header - skb->mac_header);
+}
+#else
+#define nf_bridge_adjust_skb_data(s) do {} while (0)
+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
+#endif
+
+static void free_entry(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
+}
+
+static int
+__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
+ struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ int ret = -ENOMEM;
+ struct nf_queue_entry *entry_seg;
+
+ nf_bridge_adjust_segmented_data(skb);
+
+ if (skb->next == NULL) { /* last packet, no need to copy entry */
+ struct sk_buff *gso_skb = entry->skb;
+ entry->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry);
+ if (ret)
+ entry->skb = gso_skb;
+ return ret;
+ }
+
+ skb->next = NULL;
+
+ entry_seg = nf_queue_entry_dup(entry);
+ if (entry_seg) {
+ entry_seg->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
+ if (ret)
+ free_entry(entry_seg);
+ }
+ return ret;
+}
+
+static int
+nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+ unsigned int queued;
+ struct nfqnl_instance *queue;
+ struct sk_buff *skb, *segs;
+ int err = -ENOBUFS;
+ struct net *net = dev_net(entry->state.in ?
+ entry->state.in : entry->state.out);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ /* rcu_read_lock()ed by nf_hook_slow() */
+ queue = instance_lookup(q, queuenum);
+ if (!queue)
+ return -ESRCH;
+
+ if (queue->copy_mode == NFQNL_COPY_NONE)
+ return -EINVAL;
+
+ skb = entry->skb;
+
+ switch (entry->state.pf) {
+ case NFPROTO_IPV4:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case NFPROTO_IPV6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ }
+
+ if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+ return __nfqnl_enqueue_packet(net, queue, entry);
+
+ nf_bridge_adjust_skb_data(skb);
+ segs = skb_gso_segment(skb, 0);
+ /* Does not use PTR_ERR to limit the number of error codes that can be
+ * returned by nf_queue. For instance, callers rely on -ECANCELED to
+ * mean 'ignore this hook'.
+ */
+ if (IS_ERR_OR_NULL(segs))
+ goto out_err;
+ queued = 0;
+ err = 0;
+ do {
+ struct sk_buff *nskb = segs->next;
+ if (err == 0)
+ err = __nfqnl_enqueue_packet_gso(net, queue,
+ segs, entry);
+ if (err == 0)
+ queued++;
+ else
+ kfree_skb(segs);
+ segs = nskb;
+ } while (segs);
+
+ if (queued) {
+ if (err) /* some segments are already queued */
+ free_entry(entry);
+ kfree_skb(skb);
+ return 0;
+ }
+ out_err:
+ nf_bridge_adjust_segmented_data(skb);
+ return err;
+}
+
+static int
+nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+{
+ struct sk_buff *nskb;
+
+ if (diff < 0) {
+ if (pskb_trim(e->skb, data_len))
+ return -ENOMEM;
+ } else if (diff > 0) {
+ if (data_len > 0xFFFF)
+ return -EINVAL;
+ if (diff > skb_tailroom(e->skb)) {
+ nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
+ diff, GFP_ATOMIC);
+ if (!nskb) {
+ printk(KERN_WARNING "nf_queue: OOM "
+ "in mangle, dropping packet\n");
+ return -ENOMEM;
+ }
+ kfree_skb(e->skb);
+ e->skb = nskb;
+ }
+ skb_put(e->skb, diff);
+ }
+ if (!skb_make_writable(e->skb, data_len))
+ return -ENOMEM;
+ skb_copy_to_linear_data(e->skb, data, data_len);
+ e->skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+ unsigned char mode, unsigned int range)
+{
+ int status = 0;
+
+ spin_lock_bh(&queue->lock);
+ switch (mode) {
+ case NFQNL_COPY_NONE:
+ case NFQNL_COPY_META:
+ queue->copy_mode = mode;
+ queue->copy_range = 0;
+ break;
+
+ case NFQNL_COPY_PACKET:
+ queue->copy_mode = mode;
+ if (range == 0 || range > NFQNL_MAX_COPY_RANGE)
+ queue->copy_range = NFQNL_MAX_COPY_RANGE;
+ else
+ queue->copy_range = range;
+ break;
+
+ default:
+ status = -EINVAL;
+
+ }
+ spin_unlock_bh(&queue->lock);
+
+ return status;
+}
+
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+ if (entry->state.in)
+ if (entry->state.in->ifindex == ifindex)
+ return 1;
+ if (entry->state.out)
+ if (entry->state.out->ifindex == ifindex)
+ return 1;
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->skb->nf_bridge) {
+ int physinif, physoutif;
+
+ physinif = nf_bridge_get_physinif(entry->skb);
+ physoutif = nf_bridge_get_physoutif(entry->skb);
+
+ if (physinif == ifindex || physoutif == ifindex)
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/* drop all packets with either indev or outdev == ifindex from all queue
+ * instances */
+static void
+nfqnl_dev_drop(struct net *net, int ifindex)
+{
+ int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ rcu_read_lock();
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_rcu(inst, head, hlist)
+ nfqnl_flush(inst, dev_cmp, ifindex);
+ }
+
+ rcu_read_unlock();
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Drop any packets associated with the downed device */
+ if (event == NETDEV_DOWN)
+ nfqnl_dev_drop(dev_net(dev), dev->ifindex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_dev_notifier = {
+ .notifier_call = nfqnl_rcv_dev_event,
+};
+
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
+
+ if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
+ int i;
+
+ /* destroy all instances for this portid */
+ spin_lock(&q->instances_lock);
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *t2;
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &q->instance_table[i];
+
+ hlist_for_each_entry_safe(inst, t2, head, hlist) {
+ if (n->portid == inst->peer_portid)
+ __instance_destroy(inst);
+ }
+ }
+ spin_unlock(&q->instances_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_rtnl_notifier = {
+ .notifier_call = nfqnl_rcv_nl_event,
+};
+
+static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PAYLOAD] = { .type = NLA_UNSPEC },
+ [NFQA_CT] = { .type = NLA_UNSPEC },
+ [NFQA_EXP] = { .type = NLA_UNSPEC },
+};
+
+static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
+ [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+ [NFQA_MARK] = { .type = NLA_U32 },
+};
+
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid)
+{
+ struct nfqnl_instance *queue;
+
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ return ERR_PTR(-ENODEV);
+
+ if (queue->peer_portid != nlportid)
+ return ERR_PTR(-EPERM);
+
+ return queue;
+}
+
+static struct nfqnl_msg_verdict_hdr*
+verdicthdr_get(const struct nlattr * const nfqa[])
+{
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ unsigned int verdict;
+
+ if (!nfqa[NFQA_VERDICT_HDR])
+ return NULL;
+
+ vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
+ verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK;
+ if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN)
+ return NULL;
+ return vhdr;
+}
+
+static int nfq_id_after(unsigned int id, unsigned int max)
+{
+ return (int)(id - max) > 0;
+}
+
+static int
+nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nf_queue_entry *entry, *tmp;
+ unsigned int verdict, maxid;
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ struct nfqnl_instance *queue;
+ LIST_HEAD(batch_list);
+ u16 queue_num = ntohs(nfmsg->res_id);
+
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
+
+ vhdr = verdicthdr_get(nfqa);
+ if (!vhdr)
+ return -EINVAL;
+
+ verdict = ntohl(vhdr->verdict);
+ maxid = ntohl(vhdr->id);
+
+ spin_lock_bh(&queue->lock);
+
+ list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
+ if (nfq_id_after(entry->id, maxid))
+ break;
+ __dequeue_entry(queue, entry);
+ list_add_tail(&entry->list, &batch_list);
+ }
+
+ spin_unlock_bh(&queue->lock);
+
+ if (list_empty(&batch_list))
+ return -ENOENT;
+
+ list_for_each_entry_safe(entry, tmp, &batch_list, list) {
+ if (nfqa[NFQA_MARK])
+ entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ nf_reinject(entry, verdict);
+ }
+ return 0;
+}
+
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ struct nfqnl_instance *queue;
+ unsigned int verdict;
+ struct nf_queue_entry *entry;
+ enum ip_conntrack_info uninitialized_var(ctinfo);
+ struct nf_conn *ct = NULL;
+
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
+
+ vhdr = verdicthdr_get(nfqa);
+ if (!vhdr)
+ return -EINVAL;
+
+ verdict = ntohl(vhdr->verdict);
+
+ entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+ if (entry == NULL)
+ return -ENOENT;
+
+ if (nfqa[NFQA_CT]) {
+ ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
+ if (ct && nfqa[NFQA_EXP]) {
+ nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
+ NETLINK_CB(skb).portid,
+ nlmsg_report(nlh));
+ }
+ }
+
+ if (nfqa[NFQA_PAYLOAD]) {
+ u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
+ int diff = payload_len - entry->skb->len;
+
+ if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
+ payload_len, entry, diff) < 0)
+ verdict = NF_DROP;
+
+ if (ct)
+ nfqnl_ct_seq_adjust(entry->skb, ct, ctinfo, diff);
+ }
+
+ if (nfqa[NFQA_MARK])
+ entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+
+ nf_reinject(entry, verdict);
+ return 0;
+}
+
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ return -ENOTSUPP;
+}
+
+static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
+ [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) },
+ [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) },
+};
+
+static const struct nf_queue_handler nfqh = {
+ .outfn = &nfqnl_enqueue_packet,
+};
+
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfqnl_instance *queue;
+ struct nfqnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ int ret = 0;
+
+ if (nfqa[NFQA_CFG_CMD]) {
+ cmd = nla_data(nfqa[NFQA_CFG_CMD]);
+
+ /* Obsolete commands without queue context */
+ switch (cmd->command) {
+ case NFQNL_CFG_CMD_PF_BIND: return 0;
+ case NFQNL_CFG_CMD_PF_UNBIND: return 0;
+ }
+ }
+
+ rcu_read_lock();
+ queue = instance_lookup(q, queue_num);
+ if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
+ ret = -EPERM;
+ goto err_out_unlock;
+ }
+
+ if (cmd != NULL) {
+ switch (cmd->command) {
+ case NFQNL_CFG_CMD_BIND:
+ if (queue) {
+ ret = -EBUSY;
+ goto err_out_unlock;
+ }
+ queue = instance_create(q, queue_num,
+ NETLINK_CB(skb).portid);
+ if (IS_ERR(queue)) {
+ ret = PTR_ERR(queue);
+ goto err_out_unlock;
+ }
+ break;
+ case NFQNL_CFG_CMD_UNBIND:
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ instance_destroy(q, queue);
+ break;
+ case NFQNL_CFG_CMD_PF_BIND:
+ case NFQNL_CFG_CMD_PF_UNBIND:
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ }
+
+ if (nfqa[NFQA_CFG_PARAMS]) {
+ struct nfqnl_msg_config_params *params;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ params = nla_data(nfqa[NFQA_CFG_PARAMS]);
+ nfqnl_set_mode(queue, params->copy_mode,
+ ntohl(params->copy_range));
+ }
+
+ if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
+ __be32 *queue_maxlen;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+ queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
+ spin_lock_bh(&queue->lock);
+ queue->queue_maxlen = ntohl(*queue_maxlen);
+ spin_unlock_bh(&queue->lock);
+ }
+
+ if (nfqa[NFQA_CFG_FLAGS]) {
+ __u32 flags, mask;
+
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+
+ if (!nfqa[NFQA_CFG_MASK]) {
+ /* A mask is needed to specify which flags are being
+ * changed.
+ */
+ ret = -EINVAL;
+ goto err_out_unlock;
+ }
+
+ flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
+ mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
+
+ if (flags >= NFQA_CFG_F_MAX) {
+ ret = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+
+ spin_lock_bh(&queue->lock);
+ queue->flags &= ~mask;
+ queue->flags |= flags & mask;
+ spin_unlock_bh(&queue->lock);
+ }
+
+err_out_unlock:
+ rcu_read_unlock();
+ return ret;
+}
+
+static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+ [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
+ .attr_count = NFQA_MAX, },
+ [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy },
+ [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy },
+ [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy },
+};
+
+static const struct nfnetlink_subsystem nfqnl_subsys = {
+ .name = "nf_queue",
+ .subsys_id = NFNL_SUBSYS_QUEUE,
+ .cb_count = NFQNL_MSG_MAX,
+ .cb = nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+ struct seq_net_private p;
+ unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+ struct iter_state *st = seq->private;
+ struct net *net;
+ struct nfnl_queue_net *q;
+
+ if (!st)
+ return NULL;
+
+ net = seq_file_net(seq);
+ q = nfnl_queue_pernet(net);
+ for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+ if (!hlist_empty(&q->instance_table[st->bucket]))
+ return q->instance_table[st->bucket].first;
+ }
+ return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+ struct iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
+
+ h = h->next;
+ while (!h) {
+ struct nfnl_queue_net *q;
+
+ if (++st->bucket >= INSTANCE_BUCKETS)
+ return NULL;
+
+ q = nfnl_queue_pernet(net);
+ h = q->instance_table[st->bucket].first;
+ }
+ return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head;
+ head = get_first(seq);
+
+ if (head)
+ while (pos && (head = get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+ spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+ return get_idx(s, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+ __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
+{
+ spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ const struct nfqnl_instance *inst = v;
+
+ seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n",
+ inst->queue_num,
+ inst->peer_portid, inst->queue_total,
+ inst->copy_mode, inst->copy_range,
+ inst->queue_dropped, inst->queue_user_dropped,
+ inst->id_sequence, 1);
+ return seq_has_overflowed(s);
+}
+
+static const struct seq_operations nfqnl_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &nfqnl_seq_ops,
+ sizeof(struct iter_state));
+}
+
+static const struct file_operations nfqnl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqnl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif /* PROC_FS */
+
+static int __net_init nfnl_queue_net_init(struct net *net)
+{
+ unsigned int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&q->instance_table[i]);
+
+ spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_queue", 0440,
+ net->nf.proc_netfilter, &nfqnl_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nfnl_queue_net_ops = {
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
+};
+
+static int __init nfnetlink_queue_init(void)
+{
+ int status;
+
+ status = register_pernet_subsys(&nfnl_queue_net_ops);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register pernet ops\n");
+ goto out;
+ }
+
+ netlink_register_notifier(&nfqnl_rtnl_notifier);
+ status = nfnetlink_subsys_register(&nfqnl_subsys);
+ if (status < 0) {
+ pr_err("nf_queue: failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+ register_netdevice_notifier(&nfqnl_dev_notifier);
+ nf_register_queue_handler(&nfqh);
+ return status;
+
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+out:
+ return status;
+}
+
+static void __exit nfnetlink_queue_fini(void)
+{
+ nf_unregister_queue_handler();
+ unregister_netdevice_notifier(&nfqnl_dev_notifier);
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
+ netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(nfnetlink_queue_init);
+module_exit(nfnetlink_queue_fini);
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
new file mode 100644
index 000000000..96cac50e0
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -0,0 +1,113 @@
+/*
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nfnetlink_queue.h>
+
+struct nf_conn *nfqnl_ct_get(struct sk_buff *entskb, size_t *size,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nf_conn *ct;
+
+ /* rcu_read_lock()ed by __nf_queue already. */
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return NULL;
+
+ ct = nf_ct_get(entskb, ctinfo);
+ if (ct) {
+ if (!nf_ct_is_untracked(ct))
+ *size += nfq_ct->build_size(ct);
+ else
+ ct = NULL;
+ }
+ return ct;
+}
+
+struct nf_conn *
+nfqnl_ct_parse(const struct sk_buff *skb, const struct nlattr *attr,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nf_conn *ct;
+
+ /* rcu_read_lock()ed by __nf_queue already. */
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return NULL;
+
+ ct = nf_ct_get(skb, ctinfo);
+ if (ct && !nf_ct_is_untracked(ct))
+ nfq_ct->parse(attr, ct);
+
+ return ct;
+}
+
+int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct nfq_ct_hook *nfq_ct;
+ struct nlattr *nest_parms;
+ u_int32_t tmp;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return 0;
+
+ nest_parms = nla_nest_start(skb, NFQA_CT | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
+
+ if (nfq_ct->build(skb, ct) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest_parms);
+
+ tmp = ctinfo;
+ if (nla_put_be32(skb, NFQA_CT_INFO, htonl(tmp)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int diff)
+{
+ struct nfq_ct_hook *nfq_ct;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return;
+
+ if ((ct->status & IPS_NAT_MASK) && diff)
+ nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
+}
+
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+ u32 portid, u32 report)
+{
+ struct nfq_ct_hook *nfq_ct;
+
+ if (nf_ct_is_untracked(ct))
+ return 0;
+
+ nfq_ct = rcu_dereference(nfq_ct_hook);
+ if (nfq_ct == NULL)
+ return -EOPNOTSUPP;
+
+ return nfq_ct->attach_expect(attr, ct, portid, report);
+}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
new file mode 100644
index 000000000..d71cc18fa
--- /dev/null
+++ b/net/netfilter/nft_bitwise.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_bitwise {
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ u8 len;
+ struct nft_data mask;
+ struct nft_data xor;
+};
+
+static void nft_bitwise_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+ unsigned int i;
+
+ for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
+}
+
+static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
+ [NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_DREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_LEN] = { .type = NLA_U32 },
+ [NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
+ [NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
+};
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_bitwise *priv = nft_expr_priv(expr);
+ struct nft_data_desc d1, d2;
+ int err;
+
+ if (tb[NFTA_BITWISE_SREG] == NULL ||
+ tb[NFTA_BITWISE_DREG] == NULL ||
+ tb[NFTA_BITWISE_LEN] == NULL ||
+ tb[NFTA_BITWISE_MASK] == NULL ||
+ tb[NFTA_BITWISE_XOR] == NULL)
+ return -EINVAL;
+
+ priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN]));
+ priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
+ err = nft_validate_register_load(priv->sreg, priv->len);
+ if (err < 0)
+ return err;
+
+ priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, priv->len);
+ if (err < 0)
+ return err;
+
+ err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
+ tb[NFTA_BITWISE_MASK]);
+ if (err < 0)
+ return err;
+ if (d1.len != priv->len)
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
+ tb[NFTA_BITWISE_XOR]);
+ if (err < 0)
+ return err;
+ if (d2.len != priv->len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_bitwise_type;
+static const struct nft_expr_ops nft_bitwise_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
+ .eval = nft_bitwise_eval,
+ .init = nft_bitwise_init,
+ .dump = nft_bitwise_dump,
+};
+
+static struct nft_expr_type nft_bitwise_type __read_mostly = {
+ .name = "bitwise",
+ .ops = &nft_bitwise_ops,
+ .policy = nft_bitwise_policy,
+ .maxattr = NFTA_BITWISE_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_bitwise_module_init(void)
+{
+ return nft_register_expr(&nft_bitwise_type);
+}
+
+void nft_bitwise_module_exit(void)
+{
+ nft_unregister_expr(&nft_bitwise_type);
+}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
new file mode 100644
index 000000000..fde5145f2
--- /dev/null
+++ b/net/netfilter/nft_byteorder.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_byteorder {
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ enum nft_byteorder_ops op:8;
+ u8 len;
+ u8 size;
+};
+
+static void nft_byteorder_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_byteorder *priv = nft_expr_priv(expr);
+ u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+ union { u32 u32; u16 u16; } *s, *d;
+ unsigned int i;
+
+ s = (void *)src;
+ d = (void *)dst;
+
+ switch (priv->size) {
+ case 4:
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ for (i = 0; i < priv->len / 4; i++)
+ d[i].u32 = ntohl((__force __be32)s[i].u32);
+ break;
+ case NFT_BYTEORDER_HTON:
+ for (i = 0; i < priv->len / 4; i++)
+ d[i].u32 = (__force __u32)htonl(s[i].u32);
+ break;
+ }
+ break;
+ case 2:
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ for (i = 0; i < priv->len / 2; i++)
+ d[i].u16 = ntohs((__force __be16)s[i].u16);
+ break;
+ case NFT_BYTEORDER_HTON:
+ for (i = 0; i < priv->len / 2; i++)
+ d[i].u16 = (__force __u16)htons(s[i].u16);
+ break;
+ }
+ break;
+ }
+}
+
+static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
+ [NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+};
+
+static int nft_byteorder_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_byteorder *priv = nft_expr_priv(expr);
+ int err;
+
+ if (tb[NFTA_BYTEORDER_SREG] == NULL ||
+ tb[NFTA_BYTEORDER_DREG] == NULL ||
+ tb[NFTA_BYTEORDER_LEN] == NULL ||
+ tb[NFTA_BYTEORDER_SIZE] == NULL ||
+ tb[NFTA_BYTEORDER_OP] == NULL)
+ return -EINVAL;
+
+ priv->op = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_OP]));
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ case NFT_BYTEORDER_HTON:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE]));
+ switch (priv->size) {
+ case 2:
+ case 4:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
+ priv->len = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN]));
+ err = nft_validate_register_load(priv->sreg, priv->len);
+ if (err < 0)
+ return err;
+
+ priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, priv->len);
+}
+
+static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_byteorder *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_BYTEORDER_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_BYTEORDER_DREG, priv->dreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_OP, htonl(priv->op)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_BYTEORDER_SIZE, htonl(priv->size)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_byteorder_type;
+static const struct nft_expr_ops nft_byteorder_ops = {
+ .type = &nft_byteorder_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
+ .eval = nft_byteorder_eval,
+ .init = nft_byteorder_init,
+ .dump = nft_byteorder_dump,
+};
+
+static struct nft_expr_type nft_byteorder_type __read_mostly = {
+ .name = "byteorder",
+ .ops = &nft_byteorder_ops,
+ .policy = nft_byteorder_policy,
+ .maxattr = NFTA_BYTEORDER_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_byteorder_module_init(void)
+{
+ return nft_register_expr(&nft_byteorder_type);
+}
+
+void nft_byteorder_module_exit(void)
+{
+ nft_unregister_expr(&nft_byteorder_type);
+}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
new file mode 100644
index 000000000..e25b35d70
--- /dev/null
+++ b/net/netfilter/nft_cmp.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_cmp_expr {
+ struct nft_data data;
+ enum nft_registers sreg:8;
+ u8 len;
+ enum nft_cmp_ops op:8;
+};
+
+static void nft_cmp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+ int d;
+
+ d = memcmp(&regs->data[priv->sreg], &priv->data, priv->len);
+ switch (priv->op) {
+ case NFT_CMP_EQ:
+ if (d != 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_NEQ:
+ if (d == 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_LT:
+ if (d == 0)
+ goto mismatch;
+ case NFT_CMP_LTE:
+ if (d > 0)
+ goto mismatch;
+ break;
+ case NFT_CMP_GT:
+ if (d == 0)
+ goto mismatch;
+ case NFT_CMP_GTE:
+ if (d < 0)
+ goto mismatch;
+ break;
+ }
+ return;
+
+mismatch:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = {
+ [NFTA_CMP_SREG] = { .type = NLA_U32 },
+ [NFTA_CMP_OP] = { .type = NLA_U32 },
+ [NFTA_CMP_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ int err;
+
+ err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
+ tb[NFTA_CMP_DATA]);
+ BUG_ON(err < 0);
+
+ priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
+ err = nft_validate_register_load(priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+ priv->len = desc.len;
+ return 0;
+}
+
+static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_cmp_type;
+static const struct nft_expr_ops nft_cmp_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
+ .eval = nft_cmp_eval,
+ .init = nft_cmp_init,
+ .dump = nft_cmp_dump,
+};
+
+static int nft_cmp_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ struct nft_data data;
+ u32 mask;
+ int err;
+
+ err = nft_data_init(NULL, &data, sizeof(data), &desc,
+ tb[NFTA_CMP_DATA]);
+ BUG_ON(err < 0);
+
+ priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
+ err = nft_validate_register_load(priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ desc.len *= BITS_PER_BYTE;
+ mask = nft_cmp_fast_mask(desc.len);
+
+ priv->data = data.data[0] & mask;
+ priv->len = desc.len;
+ return 0;
+}
+
+static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data data;
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ goto nla_put_failure;
+
+ data.data[0] = priv->data;
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &data,
+ NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+const struct nft_expr_ops nft_cmp_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp_fast_init,
+ .dump = nft_cmp_fast_dump,
+};
+
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
+{
+ struct nft_data_desc desc;
+ struct nft_data data;
+ enum nft_cmp_ops op;
+ int err;
+
+ if (tb[NFTA_CMP_SREG] == NULL ||
+ tb[NFTA_CMP_OP] == NULL ||
+ tb[NFTA_CMP_DATA] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ op = ntohl(nla_get_be32(tb[NFTA_CMP_OP]));
+ switch (op) {
+ case NFT_CMP_EQ:
+ case NFT_CMP_NEQ:
+ case NFT_CMP_LT:
+ case NFT_CMP_LTE:
+ case NFT_CMP_GT:
+ case NFT_CMP_GTE:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = nft_data_init(NULL, &data, sizeof(data), &desc,
+ tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
+ return &nft_cmp_fast_ops;
+ else
+ return &nft_cmp_ops;
+}
+
+static struct nft_expr_type nft_cmp_type __read_mostly = {
+ .name = "cmp",
+ .select_ops = nft_cmp_select_ops,
+ .policy = nft_cmp_policy,
+ .maxattr = NFTA_CMP_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_cmp_module_init(void)
+{
+ return nft_register_expr(&nft_cmp_type);
+}
+
+void nft_cmp_module_exit(void)
+{
+ nft_unregister_expr(&nft_cmp_type);
+}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 000000000..7f29cfc76
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,814 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static int nft_compat_chain_validate_dependency(const char *tablename,
+ const struct nft_chain *chain)
+{
+ const struct nft_base_chain *basechain;
+
+ if (!tablename || !(chain->flags & NFT_BASE_CHAIN))
+ return 0;
+
+ basechain = nft_base_chain(chain);
+ if (strcmp(tablename, "nat") == 0 &&
+ basechain->type->type != NFT_CHAIN_T_NAT)
+ return -EINVAL;
+
+ return 0;
+}
+
+union nft_entry {
+ struct ipt_entry e4;
+ struct ip6t_entry e6;
+ struct ebt_entry ebt;
+ struct arpt_entry arp;
+};
+
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+ par->target = xt;
+ par->targinfo = xt_info;
+ par->hotdrop = false;
+}
+
+static void nft_target_eval_xt(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_target *target = expr->ops->data;
+ struct sk_buff *skb = pkt->skb;
+ int ret;
+
+ nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+ ret = target->target(skb, &pkt->xt);
+
+ if (pkt->xt.hotdrop)
+ ret = NF_DROP;
+
+ switch (ret) {
+ case XT_CONTINUE:
+ regs->verdict.code = NFT_CONTINUE;
+ break;
+ default:
+ regs->verdict.code = ret;
+ break;
+ }
+}
+
+static void nft_target_eval_bridge(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_target *target = expr->ops->data;
+ struct sk_buff *skb = pkt->skb;
+ int ret;
+
+ nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+ ret = target->target(skb, &pkt->xt);
+
+ if (pkt->xt.hotdrop)
+ ret = NF_DROP;
+
+ switch (ret) {
+ case EBT_ACCEPT:
+ regs->verdict.code = NF_ACCEPT;
+ break;
+ case EBT_DROP:
+ regs->verdict.code = NF_DROP;
+ break;
+ case EBT_CONTINUE:
+ regs->verdict.code = NFT_CONTINUE;
+ break;
+ case EBT_RETURN:
+ regs->verdict.code = NFT_RETURN;
+ break;
+ default:
+ regs->verdict.code = ret;
+ break;
+ }
+}
+
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+ [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_INFO] = { .type = NLA_BINARY },
+};
+
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+ const struct nft_ctx *ctx,
+ struct xt_target *target, void *info,
+ union nft_entry *entry, u16 proto, bool inv)
+{
+ par->net = ctx->net;
+ par->table = ctx->table->name;
+ switch (ctx->afi->family) {
+ case AF_INET:
+ entry->e4.ip.proto = proto;
+ entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+ break;
+ case AF_INET6:
+ if (proto)
+ entry->e6.ipv6.flags |= IP6T_F_PROTO;
+
+ entry->e6.ipv6.proto = proto;
+ entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+ break;
+ case NFPROTO_BRIDGE:
+ entry->ebt.ethproto = (__force __be16)proto;
+ entry->ebt.invflags = inv ? EBT_IPROTO : 0;
+ break;
+ case NFPROTO_ARP:
+ break;
+ }
+ par->entryinfo = entry;
+ par->target = target;
+ par->targinfo = info;
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ par->hook_mask = 1 << ops->hooknum;
+ } else {
+ par->hook_mask = 0;
+ }
+ par->family = ctx->afi->family;
+}
+
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+ int pad;
+
+ memcpy(out, in, t->targetsize);
+ pad = XT_ALIGN(t->targetsize) - t->targetsize;
+ if (pad > 0)
+ memset(out + t->targetsize, 0, pad);
+}
+
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+ [NFTA_RULE_COMPAT_PROTO] = { .type = NLA_U32 },
+ [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
+{
+ struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+ u32 flags;
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+ nft_rule_compat_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+ if (flags & ~NFT_RULE_COMPAT_F_MASK)
+ return -EINVAL;
+ if (flags & NFT_RULE_COMPAT_F_INV)
+ *inv = true;
+
+ *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ return 0;
+}
+
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_target *target = expr->ops->data;
+ struct xt_tgchk_param par;
+ size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+ u16 proto = 0;
+ bool inv = false;
+ union nft_entry e = {};
+ int ret;
+
+ ret = nft_compat_chain_validate_dependency(target->table, ctx->chain);
+ if (ret < 0)
+ goto err;
+
+ target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+ if (ctx->nla[NFTA_RULE_COMPAT]) {
+ ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+ if (ret < 0)
+ goto err;
+ }
+
+ nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+ ret = xt_check_target(&par, size, proto, inv);
+ if (ret < 0)
+ goto err;
+
+ /* The standard target cannot be used */
+ if (target->target == NULL) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ module_put(target->me);
+ return ret;
+}
+
+static void
+nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_tgdtor_param par;
+
+ par.net = ctx->net;
+ par.target = target;
+ par.targinfo = info;
+ par.family = ctx->afi->family;
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+
+ module_put(target->me);
+}
+
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct xt_target *target = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+
+ if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
+ nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
+ nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_target_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ struct xt_target *target = expr->ops->data;
+ unsigned int hook_mask = 0;
+ int ret;
+
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ hook_mask = 1 << ops->hooknum;
+ if (!(hook_mask & target->hooks))
+ return -EINVAL;
+
+ ret = nft_compat_chain_validate_dependency(target->table,
+ ctx->chain);
+ if (ret < 0)
+ return ret;
+ }
+ return 0;
+}
+
+static void nft_match_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+ struct sk_buff *skb = pkt->skb;
+ bool ret;
+
+ nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+
+ ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+
+ if (pkt->xt.hotdrop) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ switch (ret ? 1 : 0) {
+ case 1:
+ regs->verdict.code = NFT_CONTINUE;
+ break;
+ case 0:
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+ [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_INFO] = { .type = NLA_BINARY },
+};
+
+/* struct xt_mtchk_param and xt_tgchk_param look very similar */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+ struct xt_match *match, void *info,
+ union nft_entry *entry, u16 proto, bool inv)
+{
+ par->net = ctx->net;
+ par->table = ctx->table->name;
+ switch (ctx->afi->family) {
+ case AF_INET:
+ entry->e4.ip.proto = proto;
+ entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+ break;
+ case AF_INET6:
+ if (proto)
+ entry->e6.ipv6.flags |= IP6T_F_PROTO;
+
+ entry->e6.ipv6.proto = proto;
+ entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+ break;
+ case NFPROTO_BRIDGE:
+ entry->ebt.ethproto = (__force __be16)proto;
+ entry->ebt.invflags = inv ? EBT_IPROTO : 0;
+ break;
+ case NFPROTO_ARP:
+ break;
+ }
+ par->entryinfo = entry;
+ par->match = match;
+ par->matchinfo = info;
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ par->hook_mask = 1 << ops->hooknum;
+ } else {
+ par->hook_mask = 0;
+ }
+ par->family = ctx->afi->family;
+}
+
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+ int pad;
+
+ memcpy(out, in, m->matchsize);
+ pad = XT_ALIGN(m->matchsize) - m->matchsize;
+ if (pad > 0)
+ memset(out + m->matchsize, 0, pad);
+}
+
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+ struct xt_mtchk_param par;
+ size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+ u16 proto = 0;
+ bool inv = false;
+ union nft_entry e = {};
+ int ret;
+
+ ret = nft_compat_chain_validate_dependency(match->table, ctx->chain);
+ if (ret < 0)
+ goto err;
+
+ match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+ if (ctx->nla[NFTA_RULE_COMPAT]) {
+ ret = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &proto, &inv);
+ if (ret < 0)
+ goto err;
+ }
+
+ nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+ ret = xt_check_match(&par, size, proto, inv);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ module_put(match->me);
+ return ret;
+}
+
+static void
+nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ struct xt_match *match = expr->ops->data;
+ void *info = nft_expr_priv(expr);
+ struct xt_mtdtor_param par;
+
+ par.net = ctx->net;
+ par.match = match;
+ par.matchinfo = info;
+ par.family = ctx->afi->family;
+ if (par.match->destroy != NULL)
+ par.match->destroy(&par);
+
+ module_put(match->me);
+}
+
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ void *info = nft_expr_priv(expr);
+ struct xt_match *match = expr->ops->data;
+
+ if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
+ nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
+ nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_match_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ struct xt_match *match = expr->ops->data;
+ unsigned int hook_mask = 0;
+ int ret;
+
+ if (ctx->chain->flags & NFT_BASE_CHAIN) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+ const struct nf_hook_ops *ops = &basechain->ops[0];
+
+ hook_mask = 1 << ops->hooknum;
+ if (!(hook_mask & match->hooks))
+ return -EINVAL;
+
+ ret = nft_compat_chain_validate_dependency(match->table,
+ ctx->chain);
+ if (ret < 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+ int event, u16 family, const char *name,
+ int rev, int target)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+ event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+ if (nlh == NULL)
+ goto nlmsg_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
+ nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
+ nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+ return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -1;
+}
+
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+ int ret = 0, target;
+ struct nfgenmsg *nfmsg;
+ const char *fmt;
+ const char *name;
+ u32 rev;
+ struct sk_buff *skb2;
+
+ if (tb[NFTA_COMPAT_NAME] == NULL ||
+ tb[NFTA_COMPAT_REV] == NULL ||
+ tb[NFTA_COMPAT_TYPE] == NULL)
+ return -EINVAL;
+
+ name = nla_data(tb[NFTA_COMPAT_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+ target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+ nfmsg = nlmsg_data(nlh);
+
+ switch(nfmsg->nfgen_family) {
+ case AF_INET:
+ fmt = "ipt_%s";
+ break;
+ case AF_INET6:
+ fmt = "ip6t_%s";
+ break;
+ case NFPROTO_BRIDGE:
+ fmt = "ebt_%s";
+ break;
+ case NFPROTO_ARP:
+ fmt = "arpt_%s";
+ break;
+ default:
+ pr_err("nft_compat: unsupported protocol %d\n",
+ nfmsg->nfgen_family);
+ return -EINVAL;
+ }
+
+ try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+ rev, target, &ret),
+ fmt, name);
+
+ if (ret < 0)
+ return ret;
+
+ skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb2 == NULL)
+ return -ENOMEM;
+
+ /* include the best revision for this extension in the message */
+ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(nlh->nlmsg_type),
+ NFNL_MSG_COMPAT_GET,
+ nfmsg->nfgen_family,
+ name, ret, target) <= 0) {
+ kfree_skb(skb2);
+ return -ENOSPC;
+ }
+
+ ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+ MSG_DONTWAIT);
+ if (ret > 0)
+ ret = 0;
+
+ return ret == -EAGAIN ? -ENOBUFS : ret;
+}
+
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
+ [NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
+ .len = NFT_COMPAT_NAME_MAX-1 },
+ [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
+};
+
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
+ [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get },
+};
+
+static const struct nfnetlink_subsystem nfnl_compat_subsys = {
+ .name = "nft-compat",
+ .subsys_id = NFNL_SUBSYS_NFT_COMPAT,
+ .cb_count = NFNL_MSG_COMPAT_MAX,
+ .cb = nfnl_nft_compat_cb,
+};
+
+static LIST_HEAD(nft_match_list);
+
+struct nft_xt {
+ struct list_head head;
+ struct nft_expr_ops ops;
+};
+
+static struct nft_expr_type nft_match_type;
+
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ struct nft_xt *nft_match;
+ struct xt_match *match;
+ char *mt_name;
+ __u32 rev, family;
+
+ if (tb[NFTA_MATCH_NAME] == NULL ||
+ tb[NFTA_MATCH_REV] == NULL ||
+ tb[NFTA_MATCH_INFO] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+ family = ctx->afi->family;
+
+ /* Re-use the existing match if it's already loaded. */
+ list_for_each_entry(nft_match, &nft_match_list, head) {
+ struct xt_match *match = nft_match->ops.data;
+
+ if (strcmp(match->name, mt_name) == 0 &&
+ match->revision == rev && match->family == family) {
+ if (!try_module_get(match->me))
+ return ERR_PTR(-ENOENT);
+
+ return &nft_match->ops;
+ }
+ }
+
+ match = xt_request_find_match(family, mt_name, rev);
+ if (IS_ERR(match))
+ return ERR_PTR(-ENOENT);
+
+ /* This is the first time we use this match, allocate operations */
+ nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+ if (nft_match == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ nft_match->ops.type = &nft_match_type;
+ nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
+ nft_match->ops.eval = nft_match_eval;
+ nft_match->ops.init = nft_match_init;
+ nft_match->ops.destroy = nft_match_destroy;
+ nft_match->ops.dump = nft_match_dump;
+ nft_match->ops.validate = nft_match_validate;
+ nft_match->ops.data = match;
+
+ list_add(&nft_match->head, &nft_match_list);
+
+ return &nft_match->ops;
+}
+
+static void nft_match_release(void)
+{
+ struct nft_xt *nft_match, *tmp;
+
+ list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
+ kfree(nft_match);
+}
+
+static struct nft_expr_type nft_match_type __read_mostly = {
+ .name = "match",
+ .select_ops = nft_match_select_ops,
+ .policy = nft_match_policy,
+ .maxattr = NFTA_MATCH_MAX,
+ .owner = THIS_MODULE,
+};
+
+static LIST_HEAD(nft_target_list);
+
+static struct nft_expr_type nft_target_type;
+
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ struct nft_xt *nft_target;
+ struct xt_target *target;
+ char *tg_name;
+ __u32 rev, family;
+
+ if (tb[NFTA_TARGET_NAME] == NULL ||
+ tb[NFTA_TARGET_REV] == NULL ||
+ tb[NFTA_TARGET_INFO] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+ rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+ family = ctx->afi->family;
+
+ /* Re-use the existing target if it's already loaded. */
+ list_for_each_entry(nft_target, &nft_target_list, head) {
+ struct xt_target *target = nft_target->ops.data;
+
+ if (strcmp(target->name, tg_name) == 0 &&
+ target->revision == rev && target->family == family) {
+ if (!try_module_get(target->me))
+ return ERR_PTR(-ENOENT);
+
+ return &nft_target->ops;
+ }
+ }
+
+ target = xt_request_find_target(family, tg_name, rev);
+ if (IS_ERR(target))
+ return ERR_PTR(-ENOENT);
+
+ /* This is the first time we use this target, allocate operations */
+ nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
+ if (nft_target == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ nft_target->ops.type = &nft_target_type;
+ nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
+ nft_target->ops.init = nft_target_init;
+ nft_target->ops.destroy = nft_target_destroy;
+ nft_target->ops.dump = nft_target_dump;
+ nft_target->ops.validate = nft_target_validate;
+ nft_target->ops.data = target;
+
+ if (family == NFPROTO_BRIDGE)
+ nft_target->ops.eval = nft_target_eval_bridge;
+ else
+ nft_target->ops.eval = nft_target_eval_xt;
+
+ list_add(&nft_target->head, &nft_target_list);
+
+ return &nft_target->ops;
+}
+
+static void nft_target_release(void)
+{
+ struct nft_xt *nft_target, *tmp;
+
+ list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
+ kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = {
+ .name = "target",
+ .select_ops = nft_target_select_ops,
+ .policy = nft_target_policy,
+ .maxattr = NFTA_TARGET_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_compat_module_init(void)
+{
+ int ret;
+
+ ret = nft_register_expr(&nft_match_type);
+ if (ret < 0)
+ return ret;
+
+ ret = nft_register_expr(&nft_target_type);
+ if (ret < 0)
+ goto err_match;
+
+ ret = nfnetlink_subsys_register(&nfnl_compat_subsys);
+ if (ret < 0) {
+ pr_err("nft_compat: cannot register with nfnetlink.\n");
+ goto err_target;
+ }
+
+ pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+ return ret;
+
+err_target:
+ nft_unregister_expr(&nft_target_type);
+err_match:
+ nft_unregister_expr(&nft_match_type);
+ return ret;
+}
+
+static void __exit nft_compat_module_exit(void)
+{
+ nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+ nft_unregister_expr(&nft_target_type);
+ nft_unregister_expr(&nft_match_type);
+ nft_match_release();
+ nft_target_release();
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
new file mode 100644
index 000000000..175912392
--- /dev/null
+++ b/net/netfilter/nft_counter.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_counter {
+ seqlock_t lock;
+ u64 bytes;
+ u64 packets;
+};
+
+static void nft_counter_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+
+ write_seqlock_bh(&priv->lock);
+ priv->bytes += pkt->skb->len;
+ priv->packets++;
+ write_sequnlock_bh(&priv->lock);
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+ unsigned int seq;
+ u64 bytes;
+ u64 packets;
+
+ do {
+ seq = read_seqbegin(&priv->lock);
+ bytes = priv->bytes;
+ packets = priv->packets;
+ } while (read_seqretry(&priv->lock, seq));
+
+ if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes)))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+ [NFTA_COUNTER_PACKETS] = { .type = NLA_U64 },
+ [NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
+};
+
+static int nft_counter_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_counter *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_COUNTER_PACKETS])
+ priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ if (tb[NFTA_COUNTER_BYTES])
+ priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+
+ seqlock_init(&priv->lock);
+ return 0;
+}
+
+static struct nft_expr_type nft_counter_type;
+static const struct nft_expr_ops nft_counter_ops = {
+ .type = &nft_counter_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)),
+ .eval = nft_counter_eval,
+ .init = nft_counter_init,
+ .dump = nft_counter_dump,
+};
+
+static struct nft_expr_type nft_counter_type __read_mostly = {
+ .name = "counter",
+ .ops = &nft_counter_ops,
+ .policy = nft_counter_policy,
+ .maxattr = NFTA_COUNTER_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_counter_module_init(void)
+{
+ return nft_register_expr(&nft_counter_type);
+}
+
+static void __exit nft_counter_module_exit(void)
+{
+ nft_unregister_expr(&nft_counter_type);
+}
+
+module_init(nft_counter_module_init);
+module_exit(nft_counter_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("counter");
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
new file mode 100644
index 000000000..8cbca3432
--- /dev/null
+++ b/net/netfilter/nft_ct.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+struct nft_ct {
+ enum nft_ct_keys key:8;
+ enum ip_conntrack_dir dir:8;
+ union {
+ enum nft_registers dreg:8;
+ enum nft_registers sreg:8;
+ };
+};
+
+static void nft_ct_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_tuple *tuple;
+ const struct nf_conntrack_helper *helper;
+ long diff;
+ unsigned int state;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ if (ct == NULL)
+ state = NF_CT_STATE_INVALID_BIT;
+ else if (nf_ct_is_untracked(ct))
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ else
+ state = NF_CT_STATE_BIT(ctinfo);
+ *dest = state;
+ return;
+ default:
+ break;
+ }
+
+ if (ct == NULL)
+ goto err;
+
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ *dest = CTINFO2DIR(ctinfo);
+ return;
+ case NFT_CT_STATUS:
+ *dest = ct->status;
+ return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ *dest = ct->mark;
+ return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ *dest = ct->secmark;
+ return;
+#endif
+ case NFT_CT_EXPIRATION:
+ diff = (long)jiffies - (long)ct->timeout.expires;
+ if (diff < 0)
+ diff = 0;
+ *dest = jiffies_to_msecs(diff);
+ return;
+ case NFT_CT_HELPER:
+ if (ct->master == NULL)
+ goto err;
+ help = nfct_help(ct->master);
+ if (help == NULL)
+ goto err;
+ helper = rcu_dereference(help->helper);
+ if (helper == NULL)
+ goto err;
+ strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
+ return;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS: {
+ struct nf_conn_labels *labels = nf_ct_labels_find(ct);
+ unsigned int size;
+
+ if (!labels) {
+ memset(dest, 0, NF_CT_LABELS_MAX_SIZE);
+ return;
+ }
+
+ size = labels->words * sizeof(long);
+ memcpy(dest, labels->bits, size);
+ if (size < NF_CT_LABELS_MAX_SIZE)
+ memset(((char *) dest) + size, 0,
+ NF_CT_LABELS_MAX_SIZE - size);
+ return;
+ }
+#endif
+ default:
+ break;
+ }
+
+ tuple = &ct->tuplehash[priv->dir].tuple;
+ switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
+ *dest = nf_ct_l3num(ct);
+ return;
+ case NFT_CT_SRC:
+ memcpy(dest, tuple->src.u3.all,
+ nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ return;
+ case NFT_CT_DST:
+ memcpy(dest, tuple->dst.u3.all,
+ nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16);
+ return;
+ case NFT_CT_PROTOCOL:
+ *dest = nf_ct_protonum(ct);
+ return;
+ case NFT_CT_PROTO_SRC:
+ *dest = (__force __u16)tuple->src.u.all;
+ return;
+ case NFT_CT_PROTO_DST:
+ *dest = (__force __u16)tuple->dst.u.all;
+ return;
+ default:
+ break;
+ }
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_ct_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ u32 value = regs->data[priv->sreg];
+#endif
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return;
+
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ if (ct->mark != value) {
+ ct->mark = value;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
+ [NFTA_CT_DREG] = { .type = NLA_U32 },
+ [NFTA_CT_KEY] = { .type = NLA_U32 },
+ [NFTA_CT_DIRECTION] = { .type = NLA_U8 },
+ [NFTA_CT_SREG] = { .type = NLA_U32 },
+};
+
+static int nft_ct_l3proto_try_module_get(uint8_t family)
+{
+ int err;
+
+ if (family == NFPROTO_INET) {
+ err = nf_ct_l3proto_try_module_get(NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_l3proto_try_module_get(NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ } else {
+ err = nf_ct_l3proto_try_module_get(family);
+ if (err < 0)
+ goto err1;
+ }
+ return 0;
+
+err2:
+ nf_ct_l3proto_module_put(NFPROTO_IPV4);
+err1:
+ return err;
+}
+
+static void nft_ct_l3proto_module_put(uint8_t family)
+{
+ if (family == NFPROTO_INET) {
+ nf_ct_l3proto_module_put(NFPROTO_IPV4);
+ nf_ct_l3proto_module_put(NFPROTO_IPV6);
+ } else
+ nf_ct_l3proto_module_put(family);
+}
+
+static int nft_ct_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ct *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ if (tb[NFTA_CT_DIRECTION] != NULL)
+ return -EINVAL;
+ len = sizeof(u8);
+ break;
+ case NFT_CT_STATE:
+ case NFT_CT_STATUS:
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+#endif
+ case NFT_CT_EXPIRATION:
+ if (tb[NFTA_CT_DIRECTION] != NULL)
+ return -EINVAL;
+ len = sizeof(u32);
+ break;
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ case NFT_CT_LABELS:
+ if (tb[NFTA_CT_DIRECTION] != NULL)
+ return -EINVAL;
+ len = NF_CT_LABELS_MAX_SIZE;
+ break;
+#endif
+ case NFT_CT_HELPER:
+ if (tb[NFTA_CT_DIRECTION] != NULL)
+ return -EINVAL;
+ len = NF_CT_HELPER_NAME_LEN;
+ break;
+
+ case NFT_CT_L3PROTOCOL:
+ case NFT_CT_PROTOCOL:
+ if (tb[NFTA_CT_DIRECTION] == NULL)
+ return -EINVAL;
+ len = sizeof(u8);
+ break;
+ case NFT_CT_SRC:
+ case NFT_CT_DST:
+ if (tb[NFTA_CT_DIRECTION] == NULL)
+ return -EINVAL;
+
+ switch (ctx->afi->family) {
+ case NFPROTO_IPV4:
+ len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ src.u3.ip);
+ break;
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ src.u3.ip6);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ break;
+ case NFT_CT_PROTO_SRC:
+ case NFT_CT_PROTO_DST:
+ if (tb[NFTA_CT_DIRECTION] == NULL)
+ return -EINVAL;
+ len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[NFTA_CT_DIRECTION] != NULL) {
+ priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]);
+ switch (priv->dir) {
+ case IP_CT_DIR_ORIGINAL:
+ case IP_CT_DIR_REPLY:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+ if (err < 0)
+ return err;
+
+ err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_ct_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_ct *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+ switch (priv->key) {
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ len = FIELD_SIZEOF(struct nf_conn, mark);
+ break;
+#endif
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
+ err = nft_validate_register_load(priv->sreg, len);
+ if (err < 0)
+ return err;
+
+ err = nft_ct_l3proto_try_module_get(ctx->afi->family);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static void nft_ct_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_ct_l3proto_module_put(ctx->afi->family);
+}
+
+static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+
+ switch (priv->key) {
+ case NFT_CT_PROTOCOL:
+ case NFT_CT_SRC:
+ case NFT_CT_DST:
+ case NFT_CT_PROTO_SRC:
+ case NFT_CT_PROTO_DST:
+ if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+ goto nla_put_failure;
+ default:
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_ct_type;
+static const struct nft_expr_ops nft_ct_get_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_get_eval,
+ .init = nft_ct_get_init,
+ .destroy = nft_ct_destroy,
+ .dump = nft_ct_get_dump,
+};
+
+static const struct nft_expr_ops nft_ct_set_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_set_eval,
+ .init = nft_ct_set_init,
+ .destroy = nft_ct_destroy,
+ .dump = nft_ct_set_dump,
+};
+
+static const struct nft_expr_ops *
+nft_ct_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_CT_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_CT_DREG])
+ return &nft_ct_get_ops;
+
+ if (tb[NFTA_CT_SREG])
+ return &nft_ct_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_ct_type __read_mostly = {
+ .name = "ct",
+ .select_ops = &nft_ct_select_ops,
+ .policy = nft_ct_policy,
+ .maxattr = NFTA_CT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_ct_module_init(void)
+{
+ return nft_register_expr(&nft_ct_type);
+}
+
+static void __exit nft_ct_module_exit(void)
+{
+ nft_unregister_expr(&nft_ct_type);
+}
+
+module_init(nft_ct_module_init);
+module_exit(nft_ct_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("ct");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
new file mode 100644
index 000000000..513a8ef60
--- /dev/null
+++ b/net/netfilter/nft_dynset.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2015 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_dynset {
+ struct nft_set *set;
+ struct nft_set_ext_tmpl tmpl;
+ enum nft_dynset_ops op:8;
+ enum nft_registers sreg_key:8;
+ enum nft_registers sreg_data:8;
+ u64 timeout;
+ struct nft_expr *expr;
+ struct nft_set_binding binding;
+};
+
+static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_dynset *priv = nft_expr_priv(expr);
+ struct nft_set_ext *ext;
+ u64 timeout;
+ void *elem;
+
+ if (set->size && !atomic_add_unless(&set->nelems, 1, set->size))
+ return NULL;
+
+ timeout = priv->timeout ? : set->timeout;
+ elem = nft_set_elem_init(set, &priv->tmpl,
+ &regs->data[priv->sreg_key],
+ &regs->data[priv->sreg_data],
+ timeout, GFP_ATOMIC);
+ if (elem == NULL) {
+ if (set->size)
+ atomic_dec(&set->nelems);
+ return NULL;
+ }
+
+ ext = nft_set_elem_ext(set, elem);
+ if (priv->expr != NULL)
+ nft_expr_clone(nft_set_ext_expr(ext), priv->expr);
+
+ return elem;
+}
+
+static void nft_dynset_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_dynset *priv = nft_expr_priv(expr);
+ struct nft_set *set = priv->set;
+ const struct nft_set_ext *ext;
+ const struct nft_expr *sexpr;
+ u64 timeout;
+
+ if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
+ expr, regs, &ext)) {
+ sexpr = NULL;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ sexpr = nft_set_ext_expr(ext);
+
+ if (priv->op == NFT_DYNSET_OP_UPDATE &&
+ nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
+ timeout = priv->timeout ? : set->timeout;
+ *nft_set_ext_expiration(ext) = jiffies + timeout;
+ } else if (sexpr == NULL)
+ goto out;
+
+ if (sexpr != NULL)
+ sexpr->ops->eval(sexpr, regs, pkt);
+ return;
+ }
+out:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
+ [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING },
+ [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
+ [NFTA_DYNSET_OP] = { .type = NLA_U32 },
+ [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
+ [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
+ [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
+};
+
+static int nft_dynset_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dynset *priv = nft_expr_priv(expr);
+ struct nft_set *set;
+ u64 timeout;
+ int err;
+
+ if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
+ tb[NFTA_DYNSET_OP] == NULL ||
+ tb[NFTA_DYNSET_SREG_KEY] == NULL)
+ return -EINVAL;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME]);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_DYNSET_SET_ID])
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_DYNSET_SET_ID]);
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (set->flags & NFT_SET_CONSTANT)
+ return -EBUSY;
+
+ priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP]));
+ switch (priv->op) {
+ case NFT_DYNSET_OP_ADD:
+ break;
+ case NFT_DYNSET_OP_UPDATE:
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ timeout = 0;
+ if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ timeout = be64_to_cpu(nla_get_be64(tb[NFTA_DYNSET_TIMEOUT]));
+ }
+
+ priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
+ err = nft_validate_register_load(priv->sreg_key, set->klen);;
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_DYNSET_SREG_DATA] != NULL) {
+ if (!(set->flags & NFT_SET_MAP))
+ return -EINVAL;
+ if (set->dtype == NFT_DATA_VERDICT)
+ return -EOPNOTSUPP;
+
+ priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]);
+ err = nft_validate_register_load(priv->sreg_data, set->dlen);
+ if (err < 0)
+ return err;
+ } else if (set->flags & NFT_SET_MAP)
+ return -EINVAL;
+
+ if (tb[NFTA_DYNSET_EXPR] != NULL) {
+ if (!(set->flags & NFT_SET_EVAL))
+ return -EINVAL;
+ if (!(set->flags & NFT_SET_ANONYMOUS))
+ return -EOPNOTSUPP;
+
+ priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
+ if (IS_ERR(priv->expr))
+ return PTR_ERR(priv->expr);
+
+ err = -EOPNOTSUPP;
+ if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err1;
+ } else if (set->flags & NFT_SET_EVAL)
+ return -EINVAL;
+
+ nft_set_ext_prepare(&priv->tmpl);
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (set->flags & NFT_SET_MAP)
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen);
+ if (priv->expr != NULL)
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR,
+ priv->expr->ops->size);
+ if (set->flags & NFT_SET_TIMEOUT) {
+ if (timeout || set->timeout)
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
+ }
+
+ priv->timeout = timeout;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ goto err1;
+
+ priv->set = set;
+ return 0;
+
+err1:
+ if (priv->expr != NULL)
+ nft_expr_destroy(ctx, priv->expr);
+ return err;
+}
+
+static void nft_dynset_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_dynset *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ if (priv->expr != NULL)
+ nft_expr_destroy(ctx, priv->expr);
+}
+
+static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_dynset *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
+ goto nla_put_failure;
+ if (priv->set->flags & NFT_SET_MAP &&
+ nft_dump_register(skb, NFTA_DYNSET_SREG_DATA, priv->sreg_data))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_DYNSET_OP, htonl(priv->op)))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout)))
+ goto nla_put_failure;
+ if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dynset_type;
+static const struct nft_expr_ops nft_dynset_ops = {
+ .type = &nft_dynset_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)),
+ .eval = nft_dynset_eval,
+ .init = nft_dynset_init,
+ .destroy = nft_dynset_destroy,
+ .dump = nft_dynset_dump,
+};
+
+static struct nft_expr_type nft_dynset_type __read_mostly = {
+ .name = "dynset",
+ .ops = &nft_dynset_ops,
+ .policy = nft_dynset_policy,
+ .maxattr = NFTA_DYNSET_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_dynset_module_init(void)
+{
+ return nft_register_expr(&nft_dynset_type);
+}
+
+void nft_dynset_module_exit(void)
+{
+ nft_unregister_expr(&nft_dynset_type);
+}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
new file mode 100644
index 000000000..ba7aed13e
--- /dev/null
+++ b/net/netfilter/nft_exthdr.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+// FIXME:
+#include <net/ipv6.h>
+
+struct nft_exthdr {
+ u8 type;
+ u8 offset;
+ u8 len;
+ enum nft_registers dreg:8;
+};
+
+static void nft_exthdr_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ unsigned int offset = 0;
+ int err;
+
+ err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
+ if (err < 0)
+ goto err;
+ offset += priv->offset;
+
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ goto err;
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
+ [NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
+ [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+};
+
+static int nft_exthdr_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_EXTHDR_DREG] == NULL ||
+ tb[NFTA_EXTHDR_TYPE] == NULL ||
+ tb[NFTA_EXTHDR_OFFSET] == NULL ||
+ tb[NFTA_EXTHDR_LEN] == NULL)
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN]));
+ priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, priv->len);
+}
+
+static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_exthdr_type;
+static const struct nft_expr_ops nft_exthdr_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+};
+
+static struct nft_expr_type nft_exthdr_type __read_mostly = {
+ .name = "exthdr",
+ .ops = &nft_exthdr_ops,
+ .policy = nft_exthdr_policy,
+ .maxattr = NFTA_EXTHDR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_exthdr_module_init(void)
+{
+ return nft_register_expr(&nft_exthdr_type);
+}
+
+static void __exit nft_exthdr_module_exit(void)
+{
+ nft_unregister_expr(&nft_exthdr_type);
+}
+
+module_init(nft_exthdr_module_init);
+module_exit(nft_exthdr_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("exthdr");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
new file mode 100644
index 000000000..3f9d45d3d
--- /dev/null
+++ b/net/netfilter/nft_hash.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/jhash.h>
+#include <linux/netlink.h>
+#include <linux/workqueue.h>
+#include <linux/rhashtable.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+/* We target a hash table size of 4, element hint is 75% of final size */
+#define NFT_HASH_ELEMENT_HINT 3
+
+struct nft_hash {
+ struct rhashtable ht;
+ struct delayed_work gc_work;
+};
+
+struct nft_hash_elem {
+ struct rhash_head node;
+ struct nft_set_ext ext;
+};
+
+struct nft_hash_cmp_arg {
+ const struct nft_set *set;
+ const u32 *key;
+ u8 genmask;
+};
+
+static const struct rhashtable_params nft_hash_params;
+
+static inline u32 nft_hash_key(const void *data, u32 len, u32 seed)
+{
+ const struct nft_hash_cmp_arg *arg = data;
+
+ return jhash(arg->key, len, seed);
+}
+
+static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct nft_hash_elem *he = data;
+
+ return jhash(nft_set_ext_key(&he->ext), len, seed);
+}
+
+static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nft_hash_cmp_arg *x = arg->key;
+ const struct nft_hash_elem *he = ptr;
+
+ if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
+ return 1;
+ if (nft_set_elem_expired(&he->ext))
+ return 1;
+ if (!nft_set_elem_active(&he->ext, x->genmask))
+ return 1;
+ return 0;
+}
+
+static bool nft_hash_lookup(const struct nft_set *set, const u32 *key,
+ const struct nft_set_ext **ext)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_elem *he;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_cur(read_pnet(&set->pnet)),
+ .set = set,
+ .key = key,
+ };
+
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL)
+ *ext = &he->ext;
+
+ return !!he;
+}
+
+static bool nft_hash_update(struct nft_set *set, const u32 *key,
+ void *(*new)(struct nft_set *,
+ const struct nft_expr *,
+ struct nft_regs *regs),
+ const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_set_ext **ext)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = NFT_GENMASK_ANY,
+ .set = set,
+ .key = key,
+ };
+
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL)
+ goto out;
+
+ he = new(set, expr, regs);
+ if (he == NULL)
+ goto err1;
+ if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
+ nft_hash_params))
+ goto err2;
+out:
+ *ext = &he->ext;
+ return true;
+
+err2:
+ nft_set_elem_destroy(set, he);
+err1:
+ return false;
+}
+
+static int nft_hash_insert(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_next(read_pnet(&set->pnet)),
+ .set = set,
+ .key = elem->key.val.data,
+ };
+
+ return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node,
+ nft_hash_params);
+}
+
+static void nft_hash_activate(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash_elem *he = elem->priv;
+
+ nft_set_elem_change_active(set, &he->ext);
+ nft_set_elem_clear_busy(&he->ext);
+}
+
+static void *nft_hash_deactivate(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he;
+ struct nft_hash_cmp_arg arg = {
+ .genmask = nft_genmask_next(read_pnet(&set->pnet)),
+ .set = set,
+ .key = elem->key.val.data,
+ };
+
+ rcu_read_lock();
+ he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
+ if (he != NULL) {
+ if (!nft_set_elem_mark_busy(&he->ext))
+ nft_set_elem_change_active(set, &he->ext);
+ else
+ he = NULL;
+ }
+ rcu_read_unlock();
+
+ return he;
+}
+
+static void nft_hash_remove(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he = elem->priv;
+
+ rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+}
+
+static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_elem *he;
+ struct rhashtable_iter hti;
+ struct nft_set_elem elem;
+ u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ int err;
+
+ err = rhashtable_walk_init(&priv->ht, &hti);
+ iter->err = err;
+ if (err)
+ return;
+
+ err = rhashtable_walk_start(&hti);
+ if (err && err != -EAGAIN) {
+ iter->err = err;
+ goto out;
+ }
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ err = PTR_ERR(he);
+ if (err != -EAGAIN) {
+ iter->err = err;
+ goto out;
+ }
+
+ continue;
+ }
+
+ if (iter->count < iter->skip)
+ goto cont;
+ if (nft_set_elem_expired(&he->ext))
+ goto cont;
+ if (!nft_set_elem_active(&he->ext, genmask))
+ goto cont;
+
+ elem.priv = he;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0)
+ goto out;
+
+cont:
+ iter->count++;
+ }
+
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+}
+
+static void nft_hash_gc(struct work_struct *work)
+{
+ struct nft_set *set;
+ struct nft_hash_elem *he;
+ struct nft_hash *priv;
+ struct nft_set_gc_batch *gcb = NULL;
+ struct rhashtable_iter hti;
+ int err;
+
+ priv = container_of(work, struct nft_hash, gc_work.work);
+ set = nft_set_container_of(priv);
+
+ err = rhashtable_walk_init(&priv->ht, &hti);
+ if (err)
+ goto schedule;
+
+ err = rhashtable_walk_start(&hti);
+ if (err && err != -EAGAIN)
+ goto out;
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN)
+ goto out;
+ continue;
+ }
+
+ if (!nft_set_elem_expired(&he->ext))
+ continue;
+ if (nft_set_elem_mark_busy(&he->ext))
+ continue;
+
+ gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+ if (gcb == NULL)
+ goto out;
+ rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
+ atomic_dec(&set->nelems);
+ nft_set_gc_batch_add(gcb, he);
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ nft_set_gc_batch_complete(gcb);
+schedule:
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+}
+
+static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
+{
+ return sizeof(struct nft_hash);
+}
+
+static const struct rhashtable_params nft_hash_params = {
+ .head_offset = offsetof(struct nft_hash_elem, node),
+ .hashfn = nft_hash_key,
+ .obj_hashfn = nft_hash_obj,
+ .obj_cmpfn = nft_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+static int nft_hash_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const tb[])
+{
+ struct nft_hash *priv = nft_set_priv(set);
+ struct rhashtable_params params = nft_hash_params;
+ int err;
+
+ params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT;
+ params.key_len = set->klen;
+
+ err = rhashtable_init(&priv->ht, &params);
+ if (err < 0)
+ return err;
+
+ INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc);
+ if (set->flags & NFT_SET_TIMEOUT)
+ queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+ nft_set_gc_interval(set));
+ return 0;
+}
+
+static void nft_hash_elem_destroy(void *ptr, void *arg)
+{
+ nft_set_elem_destroy((const struct nft_set *)arg, ptr);
+}
+
+static void nft_hash_destroy(const struct nft_set *set)
+{
+ struct nft_hash *priv = nft_set_priv(set);
+
+ cancel_delayed_work_sync(&priv->gc_work);
+ rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy,
+ (void *)set);
+}
+
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int esize;
+
+ esize = sizeof(struct nft_hash_elem);
+ if (desc->size) {
+ est->size = sizeof(struct nft_hash) +
+ roundup_pow_of_two(desc->size * 4 / 3) *
+ sizeof(struct nft_hash_elem *) +
+ desc->size * esize;
+ } else {
+ /* Resizing happens when the load drops below 30% or goes
+ * above 75%. The average of 52.5% load (approximated by 50%)
+ * is used for the size estimation of the hash buckets,
+ * meaning we calculate two buckets per element.
+ */
+ est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+ }
+
+ est->class = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
+static struct nft_set_ops nft_hash_ops __read_mostly = {
+ .privsize = nft_hash_privsize,
+ .elemsize = offsetof(struct nft_hash_elem, ext),
+ .estimate = nft_hash_estimate,
+ .init = nft_hash_init,
+ .destroy = nft_hash_destroy,
+ .insert = nft_hash_insert,
+ .activate = nft_hash_activate,
+ .deactivate = nft_hash_deactivate,
+ .remove = nft_hash_remove,
+ .lookup = nft_hash_lookup,
+ .update = nft_hash_update,
+ .walk = nft_hash_walk,
+ .features = NFT_SET_MAP | NFT_SET_TIMEOUT,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_hash_module_init(void)
+{
+ return nft_register_set(&nft_hash_ops);
+}
+
+static void __exit nft_hash_module_exit(void)
+{
+ nft_unregister_set(&nft_hash_ops);
+}
+
+module_init(nft_hash_module_init);
+module_exit(nft_hash_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
new file mode 100644
index 000000000..db3b74685
--- /dev/null
+++ b/net/netfilter/nft_immediate.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_immediate_expr {
+ struct nft_data data;
+ enum nft_registers dreg:8;
+ u8 dlen;
+};
+
+static void nft_immediate_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen);
+}
+
+static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
+ [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 },
+ [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
+};
+
+static int nft_immediate_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc;
+ int err;
+
+ if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
+ tb[NFTA_IMMEDIATE_DATA] == NULL)
+ return -EINVAL;
+
+ err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
+ tb[NFTA_IMMEDIATE_DATA]);
+ if (err < 0)
+ return err;
+ priv->dlen = desc.len;
+
+ priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, &priv->data,
+ desc.type, desc.len);
+ if (err < 0)
+ goto err1;
+
+ return 0;
+
+err1:
+ nft_data_uninit(&priv->data, desc.type);
+ return err;
+}
+
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+}
+
+static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_IMMEDIATE_DREG, priv->dreg))
+ goto nla_put_failure;
+
+ return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data,
+ nft_dreg_to_type(priv->dreg), priv->dlen);
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_immediate_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ *data = &priv->data;
+
+ return 0;
+}
+
+static struct nft_expr_type nft_imm_type;
+static const struct nft_expr_ops nft_imm_ops = {
+ .type = &nft_imm_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
+ .eval = nft_immediate_eval,
+ .init = nft_immediate_init,
+ .destroy = nft_immediate_destroy,
+ .dump = nft_immediate_dump,
+ .validate = nft_immediate_validate,
+};
+
+static struct nft_expr_type nft_imm_type __read_mostly = {
+ .name = "immediate",
+ .ops = &nft_imm_ops,
+ .policy = nft_immediate_policy,
+ .maxattr = NFTA_IMMEDIATE_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_immediate_module_init(void)
+{
+ return nft_register_expr(&nft_imm_type);
+}
+
+void nft_immediate_module_exit(void)
+{
+ nft_unregister_expr(&nft_imm_type);
+}
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
new file mode 100644
index 000000000..435c1ccd6
--- /dev/null
+++ b/net/netfilter/nft_limit.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(limit_lock);
+
+struct nft_limit {
+ u64 tokens;
+ u64 rate;
+ u64 unit;
+ unsigned long stamp;
+};
+
+static void nft_limit_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ spin_lock_bh(&limit_lock);
+ if (time_after_eq(jiffies, priv->stamp)) {
+ priv->tokens = priv->rate;
+ priv->stamp = jiffies + priv->unit * HZ;
+ }
+
+ if (priv->tokens >= 1) {
+ priv->tokens--;
+ spin_unlock_bh(&limit_lock);
+ return;
+ }
+ spin_unlock_bh(&limit_lock);
+
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
+ [NFTA_LIMIT_RATE] = { .type = NLA_U64 },
+ [NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
+};
+
+static int nft_limit_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_limit *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_LIMIT_RATE] == NULL ||
+ tb[NFTA_LIMIT_UNIT] == NULL)
+ return -EINVAL;
+
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+ priv->stamp = jiffies + priv->unit * HZ;
+ priv->tokens = priv->rate;
+ return 0;
+}
+
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_limit *priv = nft_expr_priv(expr);
+
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)))
+ goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_limit_type;
+static const struct nft_expr_ops nft_limit_ops = {
+ .type = &nft_limit_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .eval = nft_limit_eval,
+ .init = nft_limit_init,
+ .dump = nft_limit_dump,
+};
+
+static struct nft_expr_type nft_limit_type __read_mostly = {
+ .name = "limit",
+ .ops = &nft_limit_ops,
+ .policy = nft_limit_policy,
+ .maxattr = NFTA_LIMIT_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_limit_module_init(void)
+{
+ return nft_register_expr(&nft_limit_type);
+}
+
+static void __exit nft_limit_module_exit(void)
+{
+ nft_unregister_expr(&nft_limit_type);
+}
+
+module_init(nft_limit_module_init);
+module_exit(nft_limit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("limit");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
new file mode 100644
index 000000000..a13d6a386
--- /dev/null
+++ b/net/netfilter/nft_log.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2014 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netdevice.h>
+
+static const char *nft_log_null_prefix = "";
+
+struct nft_log {
+ struct nf_loginfo loginfo;
+ char *prefix;
+};
+
+static void nft_log_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_log *priv = nft_expr_priv(expr);
+ struct net *net = dev_net(pkt->in ? pkt->in : pkt->out);
+
+ nf_log_packet(net, pkt->ops->pf, pkt->ops->hooknum, pkt->skb, pkt->in,
+ pkt->out, &priv->loginfo, "%s", priv->prefix);
+}
+
+static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
+ [NFTA_LOG_GROUP] = { .type = NLA_U16 },
+ [NFTA_LOG_PREFIX] = { .type = NLA_STRING },
+ [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 },
+ [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 },
+ [NFTA_LOG_LEVEL] = { .type = NLA_U32 },
+ [NFTA_LOG_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nft_log_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_log *priv = nft_expr_priv(expr);
+ struct nf_loginfo *li = &priv->loginfo;
+ const struct nlattr *nla;
+ int ret;
+
+ nla = tb[NFTA_LOG_PREFIX];
+ if (nla != NULL) {
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ if (priv->prefix == NULL)
+ return -ENOMEM;
+ nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ } else {
+ priv->prefix = (char *)nft_log_null_prefix;
+ }
+
+ li->type = NF_LOG_TYPE_LOG;
+ if (tb[NFTA_LOG_LEVEL] != NULL &&
+ tb[NFTA_LOG_GROUP] != NULL)
+ return -EINVAL;
+ if (tb[NFTA_LOG_GROUP] != NULL)
+ li->type = NF_LOG_TYPE_ULOG;
+
+ switch (li->type) {
+ case NF_LOG_TYPE_LOG:
+ if (tb[NFTA_LOG_LEVEL] != NULL) {
+ li->u.log.level =
+ ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL]));
+ } else {
+ li->u.log.level = LOGLEVEL_WARNING;
+ }
+ if (tb[NFTA_LOG_FLAGS] != NULL) {
+ li->u.log.logflags =
+ ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS]));
+ }
+ break;
+ case NF_LOG_TYPE_ULOG:
+ li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP]));
+ if (tb[NFTA_LOG_SNAPLEN] != NULL) {
+ li->u.ulog.copy_len =
+ ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN]));
+ }
+ if (tb[NFTA_LOG_QTHRESHOLD] != NULL) {
+ li->u.ulog.qthreshold =
+ ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD]));
+ }
+ break;
+ }
+
+ if (ctx->afi->family == NFPROTO_INET) {
+ ret = nf_logger_find_get(NFPROTO_IPV4, li->type);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_logger_find_get(NFPROTO_IPV6, li->type);
+ if (ret < 0) {
+ nf_logger_put(NFPROTO_IPV4, li->type);
+ return ret;
+ }
+ return 0;
+ }
+
+ return nf_logger_find_get(ctx->afi->family, li->type);
+}
+
+static void nft_log_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_log *priv = nft_expr_priv(expr);
+ struct nf_loginfo *li = &priv->loginfo;
+
+ if (priv->prefix != nft_log_null_prefix)
+ kfree(priv->prefix);
+
+ if (ctx->afi->family == NFPROTO_INET) {
+ nf_logger_put(NFPROTO_IPV4, li->type);
+ nf_logger_put(NFPROTO_IPV6, li->type);
+ } else {
+ nf_logger_put(ctx->afi->family, li->type);
+ }
+}
+
+static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_log *priv = nft_expr_priv(expr);
+ const struct nf_loginfo *li = &priv->loginfo;
+
+ if (priv->prefix != nft_log_null_prefix)
+ if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix))
+ goto nla_put_failure;
+ switch (li->type) {
+ case NF_LOG_TYPE_LOG:
+ if (nla_put_be32(skb, NFTA_LOG_LEVEL, htonl(li->u.log.level)))
+ goto nla_put_failure;
+
+ if (li->u.log.logflags) {
+ if (nla_put_be32(skb, NFTA_LOG_FLAGS,
+ htonl(li->u.log.logflags)))
+ goto nla_put_failure;
+ }
+ break;
+ case NF_LOG_TYPE_ULOG:
+ if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group)))
+ goto nla_put_failure;
+
+ if (li->u.ulog.copy_len) {
+ if (nla_put_be32(skb, NFTA_LOG_SNAPLEN,
+ htonl(li->u.ulog.copy_len)))
+ goto nla_put_failure;
+ }
+ if (li->u.ulog.qthreshold) {
+ if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD,
+ htons(li->u.ulog.qthreshold)))
+ goto nla_put_failure;
+ }
+ break;
+ }
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_log_type;
+static const struct nft_expr_ops nft_log_ops = {
+ .type = &nft_log_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_log)),
+ .eval = nft_log_eval,
+ .init = nft_log_init,
+ .destroy = nft_log_destroy,
+ .dump = nft_log_dump,
+};
+
+static struct nft_expr_type nft_log_type __read_mostly = {
+ .name = "log",
+ .ops = &nft_log_ops,
+ .policy = nft_log_policy,
+ .maxattr = NFTA_LOG_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_log_module_init(void)
+{
+ return nft_register_expr(&nft_log_type);
+}
+
+static void __exit nft_log_module_exit(void)
+{
+ nft_unregister_expr(&nft_log_type);
+}
+
+module_init(nft_log_module_init);
+module_exit(nft_log_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("log");
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
new file mode 100644
index 000000000..b3c31ef80
--- /dev/null
+++ b/net/netfilter/nft_lookup.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+
+struct nft_lookup {
+ struct nft_set *set;
+ enum nft_registers sreg:8;
+ enum nft_registers dreg:8;
+ struct nft_set_binding binding;
+};
+
+static void nft_lookup_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+ const struct nft_set *set = priv->set;
+ const struct nft_set_ext *ext;
+
+ if (set->ops->lookup(set, &regs->data[priv->sreg], &ext)) {
+ if (set->flags & NFT_SET_MAP)
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), set->dlen);
+ return;
+ }
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
+ [NFTA_LOOKUP_SET] = { .type = NLA_STRING },
+ [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
+};
+
+static int nft_lookup_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+ struct nft_set *set;
+ int err;
+
+ if (tb[NFTA_LOOKUP_SET] == NULL ||
+ tb[NFTA_LOOKUP_SREG] == NULL)
+ return -EINVAL;
+
+ set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET]);
+ if (IS_ERR(set)) {
+ if (tb[NFTA_LOOKUP_SET_ID]) {
+ set = nf_tables_set_lookup_byid(ctx->net,
+ tb[NFTA_LOOKUP_SET_ID]);
+ }
+ if (IS_ERR(set))
+ return PTR_ERR(set);
+ }
+
+ if (set->flags & NFT_SET_EVAL)
+ return -EOPNOTSUPP;
+
+ priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
+ err = nft_validate_register_load(priv->sreg, set->klen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_LOOKUP_DREG] != NULL) {
+ if (!(set->flags & NFT_SET_MAP))
+ return -EINVAL;
+
+ priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ set->dtype, set->dlen);
+ if (err < 0)
+ return err;
+ } else if (set->flags & NFT_SET_MAP)
+ return -EINVAL;
+
+ priv->binding.flags = set->flags & NFT_SET_MAP;
+
+ err = nf_tables_bind_set(ctx, set, &priv->binding);
+ if (err < 0)
+ return err;
+
+ priv->set = set;
+ return 0;
+}
+
+static void nft_lookup_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+
+ if (nla_put_string(skb, NFTA_LOOKUP_SET, priv->set->name))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (priv->set->flags & NFT_SET_MAP)
+ if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_lookup_type;
+static const struct nft_expr_ops nft_lookup_ops = {
+ .type = &nft_lookup_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
+ .eval = nft_lookup_eval,
+ .init = nft_lookup_init,
+ .destroy = nft_lookup_destroy,
+ .dump = nft_lookup_dump,
+};
+
+static struct nft_expr_type nft_lookup_type __read_mostly = {
+ .name = "lookup",
+ .ops = &nft_lookup_ops,
+ .policy = nft_lookup_policy,
+ .maxattr = NFTA_LOOKUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_lookup_module_init(void)
+{
+ return nft_register_expr(&nft_lookup_type);
+}
+
+void nft_lookup_module_exit(void)
+{
+ nft_unregister_expr(&nft_lookup_type);
+}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
new file mode 100644
index 000000000..9aea747b4
--- /dev/null
+++ b/net/netfilter/nft_masq.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nft_masq.h>
+
+const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
+ [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL_GPL(nft_masq_policy);
+
+int nft_masq_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ int err;
+
+ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+ if (err < 0)
+ return err;
+
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_POST_ROUTING));
+}
+EXPORT_SYMBOL_GPL(nft_masq_validate);
+
+int nft_masq_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_masq *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_masq_validate(ctx, expr, NULL);
+ if (err)
+ return err;
+
+ if (tb[NFTA_MASQ_FLAGS] == NULL)
+ return 0;
+
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+ if (priv->flags & ~NF_NAT_RANGE_MASK)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_masq_init);
+
+int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_masq *priv = nft_expr_priv(expr);
+
+ if (priv->flags == 0)
+ return 0;
+
+ if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_masq_dump);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
new file mode 100644
index 000000000..52561e1c3
--- /dev/null
+++ b/net/netfilter/nft_meta.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/smp.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+
+void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ const struct net_device *in = pkt->in, *out = pkt->out;
+ u32 *dest = &regs->data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_LEN:
+ *dest = skb->len;
+ break;
+ case NFT_META_PROTOCOL:
+ *dest = 0;
+ *(__be16 *)dest = skb->protocol;
+ break;
+ case NFT_META_NFPROTO:
+ *dest = pkt->ops->pf;
+ break;
+ case NFT_META_L4PROTO:
+ *dest = pkt->tprot;
+ break;
+ case NFT_META_PRIORITY:
+ *dest = skb->priority;
+ break;
+ case NFT_META_MARK:
+ *dest = skb->mark;
+ break;
+ case NFT_META_IIF:
+ if (in == NULL)
+ goto err;
+ *dest = in->ifindex;
+ break;
+ case NFT_META_OIF:
+ if (out == NULL)
+ goto err;
+ *dest = out->ifindex;
+ break;
+ case NFT_META_IIFNAME:
+ if (in == NULL)
+ goto err;
+ strncpy((char *)dest, in->name, IFNAMSIZ);
+ break;
+ case NFT_META_OIFNAME:
+ if (out == NULL)
+ goto err;
+ strncpy((char *)dest, out->name, IFNAMSIZ);
+ break;
+ case NFT_META_IIFTYPE:
+ if (in == NULL)
+ goto err;
+ *dest = 0;
+ *(u16 *)dest = in->type;
+ break;
+ case NFT_META_OIFTYPE:
+ if (out == NULL)
+ goto err;
+ *dest = 0;
+ *(u16 *)dest = out->type;
+ break;
+ case NFT_META_SKUID:
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ goto err;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket == NULL ||
+ skb->sk->sk_socket->file == NULL) {
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ goto err;
+ }
+
+ *dest = from_kuid_munged(&init_user_ns,
+ skb->sk->sk_socket->file->f_cred->fsuid);
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ break;
+ case NFT_META_SKGID:
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ goto err;
+
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket == NULL ||
+ skb->sk->sk_socket->file == NULL) {
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ goto err;
+ }
+ *dest = from_kgid_munged(&init_user_ns,
+ skb->sk->sk_socket->file->f_cred->fsgid);
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ break;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_META_RTCLASSID: {
+ const struct dst_entry *dst = skb_dst(skb);
+
+ if (dst == NULL)
+ goto err;
+ *dest = dst->tclassid;
+ break;
+ }
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+ *dest = skb->secmark;
+ break;
+#endif
+ case NFT_META_PKTTYPE:
+ if (skb->pkt_type != PACKET_LOOPBACK) {
+ *dest = skb->pkt_type;
+ break;
+ }
+
+ switch (pkt->ops->pf) {
+ case NFPROTO_IPV4:
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+ *dest = PACKET_MULTICAST;
+ else
+ *dest = PACKET_BROADCAST;
+ break;
+ case NFPROTO_IPV6:
+ if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+ *dest = PACKET_MULTICAST;
+ else
+ *dest = PACKET_BROADCAST;
+ break;
+ default:
+ WARN_ON(1);
+ goto err;
+ }
+ break;
+ case NFT_META_CPU:
+ *dest = raw_smp_processor_id();
+ break;
+ case NFT_META_IIFGROUP:
+ if (in == NULL)
+ goto err;
+ *dest = in->group;
+ break;
+ case NFT_META_OIFGROUP:
+ if (out == NULL)
+ goto err;
+ *dest = out->group;
+ break;
+ case NFT_META_CGROUP:
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ goto err;
+ *dest = skb->sk->sk_classid;
+ break;
+ default:
+ WARN_ON(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_eval);
+
+void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *meta = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ u32 value = regs->data[meta->sreg];
+
+ switch (meta->key) {
+ case NFT_META_MARK:
+ skb->mark = value;
+ break;
+ case NFT_META_PRIORITY:
+ skb->priority = value;
+ break;
+ case NFT_META_NFTRACE:
+ skb->nf_trace = 1;
+ break;
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_eval);
+
+const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+ [NFTA_META_DREG] = { .type = NLA_U32 },
+ [NFTA_META_KEY] = { .type = NLA_U32 },
+ [NFTA_META_SREG] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL_GPL(nft_meta_policy);
+
+int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ case NFT_META_IIFTYPE:
+ case NFT_META_OIFTYPE:
+ len = sizeof(u16);
+ break;
+ case NFT_META_NFPROTO:
+ case NFT_META_L4PROTO:
+ case NFT_META_LEN:
+ case NFT_META_PRIORITY:
+ case NFT_META_MARK:
+ case NFT_META_IIF:
+ case NFT_META_OIF:
+ case NFT_META_SKUID:
+ case NFT_META_SKGID:
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ case NFT_META_RTCLASSID:
+#endif
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+#endif
+ case NFT_META_PKTTYPE:
+ case NFT_META_CPU:
+ case NFT_META_IIFGROUP:
+ case NFT_META_OIFGROUP:
+ case NFT_META_CGROUP:
+ len = sizeof(u32);
+ break;
+ case NFT_META_IIFNAME:
+ case NFT_META_OIFNAME:
+ len = IFNAMSIZ;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_init);
+
+int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_MARK:
+ case NFT_META_PRIORITY:
+ len = sizeof(u32);
+ break;
+ case NFT_META_NFTRACE:
+ len = sizeof(u8);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
+ err = nft_validate_register_load(priv->sreg, len);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_init);
+
+int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_META_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_dump);
+
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_META_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_META_SREG, priv->sreg))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_dump);
+
+static struct nft_expr_type nft_meta_type;
+static const struct nft_expr_ops nft_meta_get_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_get_eval,
+ .init = nft_meta_get_init,
+ .dump = nft_meta_get_dump,
+};
+
+static const struct nft_expr_ops nft_meta_set_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_set_eval,
+ .init = nft_meta_set_init,
+ .dump = nft_meta_set_dump,
+};
+
+static const struct nft_expr_ops *
+nft_meta_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_type __read_mostly = {
+ .name = "meta",
+ .select_ops = &nft_meta_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_module_init(void)
+{
+ return nft_register_expr(&nft_meta_type);
+}
+
+static void __exit nft_meta_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_type);
+}
+
+module_init(nft_meta_module_init);
+module_exit(nft_meta_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_EXPR("meta");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
new file mode 100644
index 000000000..ee2d71753
--- /dev/null
+++ b/net/netfilter/nft_nat.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+ enum nft_registers sreg_addr_min:8;
+ enum nft_registers sreg_addr_max:8;
+ enum nft_registers sreg_proto_min:8;
+ enum nft_registers sreg_proto_max:8;
+ enum nf_nat_manip_type type:8;
+ u8 family;
+ u16 flags;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+ struct nf_nat_range range;
+
+ memset(&range, 0, sizeof(range));
+ if (priv->sreg_addr_min) {
+ if (priv->family == AF_INET) {
+ range.min_addr.ip = (__force __be32)
+ regs->data[priv->sreg_addr_min];
+ range.max_addr.ip = (__force __be32)
+ regs->data[priv->sreg_addr_max];
+
+ } else {
+ memcpy(range.min_addr.ip6,
+ &regs->data[priv->sreg_addr_min],
+ sizeof(range.min_addr.ip6));
+ memcpy(range.max_addr.ip6,
+ &regs->data[priv->sreg_addr_max],
+ sizeof(range.max_addr.ip6));
+ }
+ range.flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ }
+
+ range.flags |= priv->flags;
+
+ regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+ [NFTA_NAT_TYPE] = { .type = NLA_U32 },
+ [NFTA_NAT_FAMILY] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
+ [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
+ [NFTA_NAT_FLAGS] = { .type = NLA_U32 },
+};
+
+static int nft_nat_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ struct nft_nat *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+ if (err < 0)
+ return err;
+
+ switch (priv->type) {
+ case NFT_NAT_SNAT:
+ err = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN));
+ break;
+ case NFT_NAT_DNAT:
+ err = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT));
+ break;
+ }
+
+ return err;
+}
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_nat *priv = nft_expr_priv(expr);
+ unsigned int alen, plen;
+ u32 family;
+ int err;
+
+ if (tb[NFTA_NAT_TYPE] == NULL ||
+ (tb[NFTA_NAT_REG_ADDR_MIN] == NULL &&
+ tb[NFTA_NAT_REG_PROTO_MIN] == NULL))
+ return -EINVAL;
+
+ switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+ case NFT_NAT_SNAT:
+ priv->type = NF_NAT_MANIP_SRC;
+ break;
+ case NFT_NAT_DNAT:
+ priv->type = NF_NAT_MANIP_DST;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = nft_nat_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_NAT_FAMILY] == NULL)
+ return -EINVAL;
+
+ family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+ if (family != ctx->afi->family)
+ return -EOPNOTSUPP;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip);
+ break;
+ case NFPROTO_IPV6:
+ alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ priv->family = family;
+
+ if (tb[NFTA_NAT_REG_ADDR_MIN]) {
+ priv->sreg_addr_min =
+ nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]);
+ err = nft_validate_register_load(priv->sreg_addr_min, alen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_NAT_REG_ADDR_MAX]) {
+ priv->sreg_addr_max =
+ nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]);
+
+ err = nft_validate_register_load(priv->sreg_addr_max,
+ alen);
+ if (err < 0)
+ return err;
+ } else {
+ priv->sreg_addr_max = priv->sreg_addr_min;
+ }
+ }
+
+ plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ if (tb[NFTA_NAT_REG_PROTO_MIN]) {
+ priv->sreg_proto_min =
+ nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]);
+
+ err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_NAT_REG_PROTO_MAX]) {
+ priv->sreg_proto_max =
+ nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]);
+
+ err = nft_validate_register_load(priv->sreg_proto_max,
+ plen);
+ if (err < 0)
+ return err;
+ } else {
+ priv->sreg_proto_max = priv->sreg_proto_min;
+ }
+ }
+
+ if (tb[NFTA_NAT_FLAGS]) {
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
+ if (priv->flags & ~NF_NAT_RANGE_MASK)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ switch (priv->type) {
+ case NF_NAT_MANIP_SRC:
+ if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+ goto nla_put_failure;
+ break;
+ case NF_NAT_MANIP_DST:
+ if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family)))
+ goto nla_put_failure;
+
+ if (priv->sreg_addr_min) {
+ if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN,
+ priv->sreg_addr_min) ||
+ nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX,
+ priv->sreg_addr_max))
+ goto nla_put_failure;
+ }
+
+ if (priv->sreg_proto_min) {
+ if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN,
+ priv->sreg_proto_min) ||
+ nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX,
+ priv->sreg_proto_max))
+ goto nla_put_failure;
+ }
+
+ if (priv->flags != 0) {
+ if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags)))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+ .type = &nft_nat_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+ .eval = nft_nat_eval,
+ .init = nft_nat_init,
+ .dump = nft_nat_dump,
+ .validate = nft_nat_validate,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+ .name = "nat",
+ .ops = &nft_nat_ops,
+ .policy = nft_nat_policy,
+ .maxattr = NFTA_NAT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_nat_module_init(void)
+{
+ return nft_register_expr(&nft_nat_type);
+}
+
+static void __exit nft_nat_module_exit(void)
+{
+ nft_unregister_expr(&nft_nat_type);
+}
+
+module_init(nft_nat_module_init);
+module_exit(nft_nat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
new file mode 100644
index 000000000..94fb3b27a
--- /dev/null
+++ b/net/netfilter/nft_payload.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+static void nft_payload_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ int offset;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!skb_mac_header_was_set(skb))
+ goto err;
+ offset = skb_mac_header(skb) - skb->data;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ offset = skb_network_offset(skb);
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ offset = pkt->xt.thoff;
+ break;
+ default:
+ BUG();
+ }
+ offset += priv->offset;
+
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
+ goto err;
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
+ [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+};
+
+static int nft_payload_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+
+ priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, priv->len);
+}
+
+static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_PAYLOAD_DREG, priv->dreg) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_payload_type;
+static const struct nft_expr_ops nft_payload_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .eval = nft_payload_eval,
+ .init = nft_payload_init,
+ .dump = nft_payload_dump,
+};
+
+const struct nft_expr_ops nft_payload_fast_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .eval = nft_payload_eval,
+ .init = nft_payload_init,
+ .dump = nft_payload_dump,
+};
+
+static const struct nft_expr_ops *
+nft_payload_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ enum nft_payload_bases base;
+ unsigned int offset, len;
+
+ if (tb[NFTA_PAYLOAD_DREG] == NULL ||
+ tb[NFTA_PAYLOAD_BASE] == NULL ||
+ tb[NFTA_PAYLOAD_OFFSET] == NULL ||
+ tb[NFTA_PAYLOAD_LEN] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
+ base != NFT_PAYLOAD_LL_HEADER)
+ return &nft_payload_fast_ops;
+ else
+ return &nft_payload_ops;
+}
+
+static struct nft_expr_type nft_payload_type __read_mostly = {
+ .name = "payload",
+ .select_ops = nft_payload_select_ops,
+ .policy = nft_payload_policy,
+ .maxattr = NFTA_PAYLOAD_MAX,
+ .owner = THIS_MODULE,
+};
+
+int __init nft_payload_module_init(void)
+{
+ return nft_register_expr(&nft_payload_type);
+}
+
+void nft_payload_module_exit(void)
+{
+ nft_unregister_expr(&nft_payload_type);
+}
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
new file mode 100644
index 000000000..96805d21d
--- /dev/null
+++ b/net/netfilter/nft_queue.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code partly funded by OISF
+ * (http://www.openinfosecfoundation.org/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_queue.h>
+
+static u32 jhash_initval __read_mostly;
+
+struct nft_queue {
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
+};
+
+static void nft_queue_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+ u32 queue = priv->queuenum;
+ u32 ret;
+
+ if (priv->queues_total > 1) {
+ if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) {
+ int cpu = smp_processor_id();
+
+ queue = priv->queuenum + cpu % priv->queues_total;
+ } else {
+ queue = nfqueue_hash(pkt->skb, queue,
+ priv->queues_total, pkt->ops->pf,
+ jhash_initval);
+ }
+ }
+
+ ret = NF_QUEUE_NR(queue);
+ if (priv->flags & NFT_QUEUE_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ regs->verdict.code = ret;
+}
+
+static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
+ [NFTA_QUEUE_NUM] = { .type = NLA_U16 },
+ [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
+ [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 },
+};
+
+static int nft_queue_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_QUEUE_NUM] == NULL)
+ return -EINVAL;
+
+ init_hashrandom(&jhash_initval);
+ priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM]));
+
+ if (tb[NFTA_QUEUE_TOTAL] != NULL)
+ priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL]));
+ if (tb[NFTA_QUEUE_FLAGS] != NULL) {
+ priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS]));
+ if (priv->flags & ~NFT_QUEUE_FLAG_MASK)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_queue *priv = nft_expr_priv(expr);
+
+ if (nla_put_be16(skb, NFTA_QUEUE_NUM, htons(priv->queuenum)) ||
+ nla_put_be16(skb, NFTA_QUEUE_TOTAL, htons(priv->queues_total)) ||
+ nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_queue_type;
+static const struct nft_expr_ops nft_queue_ops = {
+ .type = &nft_queue_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)),
+ .eval = nft_queue_eval,
+ .init = nft_queue_init,
+ .dump = nft_queue_dump,
+};
+
+static struct nft_expr_type nft_queue_type __read_mostly = {
+ .name = "queue",
+ .ops = &nft_queue_ops,
+ .policy = nft_queue_policy,
+ .maxattr = NFTA_QUEUE_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_queue_module_init(void)
+{
+ return nft_register_expr(&nft_queue_type);
+}
+
+static void __exit nft_queue_module_exit(void)
+{
+ nft_unregister_expr(&nft_queue_type);
+}
+
+module_init(nft_queue_module_init);
+module_exit(nft_queue_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Leblond <eric@regit.org>");
+MODULE_ALIAS_NFT_EXPR("queue");
diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c
new file mode 100644
index 000000000..1c30f41cf
--- /dev/null
+++ b/net/netfilter/nft_rbtree.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+static DEFINE_SPINLOCK(nft_rbtree_lock);
+
+struct nft_rbtree {
+ struct rb_root root;
+};
+
+struct nft_rbtree_elem {
+ struct rb_node node;
+ struct nft_set_ext ext;
+};
+
+
+static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
+ const struct nft_set_ext **ext)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ const struct nft_rbtree_elem *rbe, *interval = NULL;
+ const struct rb_node *parent;
+ u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ int d;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ parent = priv->root.rb_node;
+ while (parent != NULL) {
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+ d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
+ if (d < 0) {
+ parent = parent->rb_left;
+ interval = rbe;
+ } else if (d > 0)
+ parent = parent->rb_right;
+ else {
+found:
+ if (!nft_set_elem_active(&rbe->ext, genmask)) {
+ parent = parent->rb_left;
+ continue;
+ }
+ if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(&rbe->ext) &
+ NFT_SET_ELEM_INTERVAL_END)
+ goto out;
+ spin_unlock_bh(&nft_rbtree_lock);
+
+ *ext = &rbe->ext;
+ return true;
+ }
+ }
+
+ if (set->flags & NFT_SET_INTERVAL && interval != NULL) {
+ rbe = interval;
+ goto found;
+ }
+out:
+ spin_unlock_bh(&nft_rbtree_lock);
+ return false;
+}
+
+static int __nft_rbtree_insert(const struct nft_set *set,
+ struct nft_rbtree_elem *new)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *parent, **p;
+ u8 genmask = nft_genmask_next(read_pnet(&set->pnet));
+ int d;
+
+ parent = NULL;
+ p = &priv->root.rb_node;
+ while (*p != NULL) {
+ parent = *p;
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ d = memcmp(nft_set_ext_key(&rbe->ext),
+ nft_set_ext_key(&new->ext),
+ set->klen);
+ if (d < 0)
+ p = &parent->rb_left;
+ else if (d > 0)
+ p = &parent->rb_right;
+ else {
+ if (nft_set_elem_active(&rbe->ext, genmask))
+ return -EEXIST;
+ p = &parent->rb_left;
+ }
+ }
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &priv->root);
+ return 0;
+}
+
+static int nft_rbtree_insert(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_rbtree_elem *rbe = elem->priv;
+ int err;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ err = __nft_rbtree_insert(set, rbe);
+ spin_unlock_bh(&nft_rbtree_lock);
+
+ return err;
+}
+
+static void nft_rbtree_remove(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe = elem->priv;
+
+ spin_lock_bh(&nft_rbtree_lock);
+ rb_erase(&rbe->node, &priv->root);
+ spin_unlock_bh(&nft_rbtree_lock);
+}
+
+static void nft_rbtree_activate(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_rbtree_elem *rbe = elem->priv;
+
+ nft_set_elem_change_active(set, &rbe->ext);
+}
+
+static void *nft_rbtree_deactivate(const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ const struct rb_node *parent = priv->root.rb_node;
+ struct nft_rbtree_elem *rbe;
+ u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+ int d;
+
+ while (parent != NULL) {
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+
+ d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
+ set->klen);
+ if (d < 0)
+ parent = parent->rb_left;
+ else if (d > 0)
+ parent = parent->rb_right;
+ else {
+ if (!nft_set_elem_active(&rbe->ext, genmask)) {
+ parent = parent->rb_left;
+ continue;
+ }
+ nft_set_elem_change_active(set, &rbe->ext);
+ return rbe;
+ }
+ }
+ return NULL;
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ const struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct nft_set_elem elem;
+ struct rb_node *node;
+ u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
+
+ spin_lock_bh(&nft_rbtree_lock);
+ for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ if (iter->count < iter->skip)
+ goto cont;
+ if (!nft_set_elem_active(&rbe->ext, genmask))
+ goto cont;
+
+ elem.priv = rbe;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0) {
+ spin_unlock_bh(&nft_rbtree_lock);
+ return;
+ }
+cont:
+ iter->count++;
+ }
+ spin_unlock_bh(&nft_rbtree_lock);
+}
+
+static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
+{
+ return sizeof(struct nft_rbtree);
+}
+
+static int nft_rbtree_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ priv->root = RB_ROOT;
+ return 0;
+}
+
+static void nft_rbtree_destroy(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *node;
+
+ while ((node = priv->root.rb_node) != NULL) {
+ rb_erase(node, &priv->root);
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ nft_set_elem_destroy(set, rbe);
+ }
+}
+
+static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int nsize;
+
+ nsize = sizeof(struct nft_rbtree_elem);
+ if (desc->size)
+ est->size = sizeof(struct nft_rbtree) + desc->size * nsize;
+ else
+ est->size = nsize;
+
+ est->class = NFT_SET_CLASS_O_LOG_N;
+
+ return true;
+}
+
+static struct nft_set_ops nft_rbtree_ops __read_mostly = {
+ .privsize = nft_rbtree_privsize,
+ .elemsize = offsetof(struct nft_rbtree_elem, ext),
+ .estimate = nft_rbtree_estimate,
+ .init = nft_rbtree_init,
+ .destroy = nft_rbtree_destroy,
+ .insert = nft_rbtree_insert,
+ .remove = nft_rbtree_remove,
+ .deactivate = nft_rbtree_deactivate,
+ .activate = nft_rbtree_activate,
+ .lookup = nft_rbtree_lookup,
+ .walk = nft_rbtree_walk,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_rbtree_module_init(void)
+{
+ return nft_register_set(&nft_rbtree_ops);
+}
+
+static void __exit nft_rbtree_module_exit(void)
+{
+ nft_unregister_set(&nft_rbtree_ops);
+}
+
+module_init(nft_rbtree_module_init);
+module_exit(nft_rbtree_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
new file mode 100644
index 000000000..03f7bf40a
--- /dev/null
+++ b/net/netfilter/nft_redir.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_redir.h>
+
+const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
+ [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 },
+ [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 },
+ [NFTA_REDIR_FLAGS] = { .type = NLA_U32 },
+};
+EXPORT_SYMBOL_GPL(nft_redir_policy);
+
+int nft_redir_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ int err;
+
+ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+ if (err < 0)
+ return err;
+
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+EXPORT_SYMBOL_GPL(nft_redir_validate);
+
+int nft_redir_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_redir *priv = nft_expr_priv(expr);
+ unsigned int plen;
+ int err;
+
+ err = nft_redir_validate(ctx, expr, NULL);
+ if (err < 0)
+ return err;
+
+ plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
+ priv->sreg_proto_min =
+ nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]);
+
+ err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
+ priv->sreg_proto_max =
+ nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]);
+
+ err = nft_validate_register_load(priv->sreg_proto_max,
+ plen);
+ if (err < 0)
+ return err;
+ } else {
+ priv->sreg_proto_max = priv->sreg_proto_min;
+ }
+ }
+
+ if (tb[NFTA_REDIR_FLAGS]) {
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS]));
+ if (priv->flags & ~NF_NAT_RANGE_MASK)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_redir_init);
+
+int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_redir *priv = nft_expr_priv(expr);
+
+ if (priv->sreg_proto_min) {
+ if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MIN,
+ priv->sreg_proto_min))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_REDIR_REG_PROTO_MAX,
+ priv->sreg_proto_max))
+ goto nla_put_failure;
+ }
+
+ if (priv->flags != 0 &&
+ nla_put_be32(skb, NFTA_REDIR_FLAGS, htonl(priv->flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_redir_dump);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
new file mode 100644
index 000000000..0522fc9bf
--- /dev/null
+++ b/net/netfilter/nft_reject.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2013 Eric Leblond <eric@regit.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+
+const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
+ [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
+ [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
+};
+EXPORT_SYMBOL_GPL(nft_reject_policy);
+
+int nft_reject_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_REJECT_TYPE] == NULL)
+ return -EINVAL;
+
+ priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+ return -EINVAL;
+ priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ case NFT_REJECT_TCP_RST:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_reject_init);
+
+int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
+ goto nla_put_failure;
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nft_reject_dump);
+
+static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = {
+ [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH,
+ [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH,
+ [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH,
+ [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED,
+};
+
+int nft_reject_icmp_code(u8 code)
+{
+ BUG_ON(code > NFT_REJECT_ICMPX_MAX);
+
+ return icmp_code_v4[code];
+}
+
+EXPORT_SYMBOL_GPL(nft_reject_icmp_code);
+
+
+static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = {
+ [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE,
+ [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH,
+ [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH,
+ [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED,
+};
+
+int nft_reject_icmpv6_code(u8 code)
+{
+ BUG_ON(code > NFT_REJECT_ICMPX_MAX);
+
+ return icmp_code_v6[code];
+}
+
+EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
new file mode 100644
index 000000000..635dbba93
--- /dev/null
+++ b/net/netfilter/nft_reject_inet.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2014 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
+
+ switch (pkt->ops->pf) {
+ case NFPROTO_IPV4:
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach(pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset(pkt->skb, pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nf_send_unreach(pkt->skb,
+ nft_reject_icmp_code(priv->icmp_code),
+ pkt->ops->hooknum);
+ break;
+ }
+ break;
+ case NFPROTO_IPV6:
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nf_send_unreach6(net, pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nf_send_unreach6(net, pkt->skb,
+ nft_reject_icmpv6_code(priv->icmp_code),
+ pkt->ops->hooknum);
+ break;
+ }
+ break;
+ }
+
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_inet_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code;
+
+ if (tb[NFTA_REJECT_TYPE] == NULL)
+ return -EINVAL;
+
+ priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
+ if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
+ return -EINVAL;
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
+ case NFT_REJECT_TCP_RST:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nft_reject_inet_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_reject *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
+ goto nla_put_failure;
+
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
+ if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_reject_inet_type;
+static const struct nft_expr_ops nft_reject_inet_ops = {
+ .type = &nft_reject_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_inet_eval,
+ .init = nft_reject_inet_init,
+ .dump = nft_reject_inet_dump,
+};
+
+static struct nft_expr_type nft_reject_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "reject",
+ .ops = &nft_reject_inet_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_inet_module_init(void)
+{
+ return nft_register_expr(&nft_reject_inet_type);
+}
+
+static void __exit nft_reject_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_inet_type);
+}
+
+module_init(nft_reject_inet_module_init);
+module_exit(nft_reject_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS_NFT_AF_EXPR(1, "reject");
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
new file mode 100644
index 000000000..51a459c3c
--- /dev/null
+++ b/net/netfilter/x_tables.c
@@ -0,0 +1,1360 @@
+/*
+ * x_tables core - Backend for {ip,ip6,arp}_tables
+ *
+ * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org>
+ * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on existing ip_tables code which is
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
+
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+struct compat_delta {
+ unsigned int offset; /* offset in kernel */
+ int delta; /* delta in 32bit user land */
+};
+
+struct xt_af {
+ struct mutex mutex;
+ struct list_head match;
+ struct list_head target;
+#ifdef CONFIG_COMPAT
+ struct mutex compat_mutex;
+ struct compat_delta *compat_tab;
+ unsigned int number; /* number of slots in compat_tab[] */
+ unsigned int cur; /* number of used slots in compat_tab[] */
+#endif
+};
+
+static struct xt_af *xt;
+
+static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
+ [NFPROTO_UNSPEC] = "x",
+ [NFPROTO_IPV4] = "ip",
+ [NFPROTO_ARP] = "arp",
+ [NFPROTO_BRIDGE] = "eb",
+ [NFPROTO_IPV6] = "ip6",
+};
+
+/* Allow this many total (re)entries. */
+static const unsigned int xt_jumpstack_multiplier = 2;
+
+/* Registration hooks for targets. */
+int xt_register_target(struct xt_target *target)
+{
+ u_int8_t af = target->family;
+
+ mutex_lock(&xt[af].mutex);
+ list_add(&target->list, &xt[af].target);
+ mutex_unlock(&xt[af].mutex);
+ return 0;
+}
+EXPORT_SYMBOL(xt_register_target);
+
+void
+xt_unregister_target(struct xt_target *target)
+{
+ u_int8_t af = target->family;
+
+ mutex_lock(&xt[af].mutex);
+ list_del(&target->list);
+ mutex_unlock(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_target);
+
+int
+xt_register_targets(struct xt_target *target, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = xt_register_target(&target[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ xt_unregister_targets(target, i);
+ return err;
+}
+EXPORT_SYMBOL(xt_register_targets);
+
+void
+xt_unregister_targets(struct xt_target *target, unsigned int n)
+{
+ while (n-- > 0)
+ xt_unregister_target(&target[n]);
+}
+EXPORT_SYMBOL(xt_unregister_targets);
+
+int xt_register_match(struct xt_match *match)
+{
+ u_int8_t af = match->family;
+
+ mutex_lock(&xt[af].mutex);
+ list_add(&match->list, &xt[af].match);
+ mutex_unlock(&xt[af].mutex);
+ return 0;
+}
+EXPORT_SYMBOL(xt_register_match);
+
+void
+xt_unregister_match(struct xt_match *match)
+{
+ u_int8_t af = match->family;
+
+ mutex_lock(&xt[af].mutex);
+ list_del(&match->list);
+ mutex_unlock(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_match);
+
+int
+xt_register_matches(struct xt_match *match, unsigned int n)
+{
+ unsigned int i;
+ int err = 0;
+
+ for (i = 0; i < n; i++) {
+ err = xt_register_match(&match[i]);
+ if (err)
+ goto err;
+ }
+ return err;
+
+err:
+ if (i > 0)
+ xt_unregister_matches(match, i);
+ return err;
+}
+EXPORT_SYMBOL(xt_register_matches);
+
+void
+xt_unregister_matches(struct xt_match *match, unsigned int n)
+{
+ while (n-- > 0)
+ xt_unregister_match(&match[n]);
+}
+EXPORT_SYMBOL(xt_unregister_matches);
+
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use.
+ */
+
+/* Find match, grabs ref. Returns ERR_PTR() on error. */
+struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
+{
+ struct xt_match *m;
+ int err = -ENOENT;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(m, &xt[af].match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision == revision) {
+ if (try_module_get(m->me)) {
+ mutex_unlock(&xt[af].mutex);
+ return m;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+
+ if (af != NFPROTO_UNSPEC)
+ /* Try searching again in the family-independent list */
+ return xt_find_match(NFPROTO_UNSPEC, name, revision);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_match);
+
+struct xt_match *
+xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)
+{
+ struct xt_match *match;
+
+ match = xt_find_match(nfproto, name, revision);
+ if (IS_ERR(match)) {
+ request_module("%st_%s", xt_prefix[nfproto], name);
+ match = xt_find_match(nfproto, name, revision);
+ }
+
+ return match;
+}
+EXPORT_SYMBOL_GPL(xt_request_find_match);
+
+/* Find target, grabs ref. Returns ERR_PTR() on error. */
+struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
+{
+ struct xt_target *t;
+ int err = -ENOENT;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt[af].target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision == revision) {
+ if (try_module_get(t->me)) {
+ mutex_unlock(&xt[af].mutex);
+ return t;
+ }
+ } else
+ err = -EPROTOTYPE; /* Found something. */
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+
+ if (af != NFPROTO_UNSPEC)
+ /* Try searching again in the family-independent list */
+ return xt_find_target(NFPROTO_UNSPEC, name, revision);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_target);
+
+struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
+{
+ struct xt_target *target;
+
+ target = xt_find_target(af, name, revision);
+ if (IS_ERR(target)) {
+ request_module("%st_%s", xt_prefix[af], name);
+ target = xt_find_target(af, name, revision);
+ }
+
+ return target;
+}
+EXPORT_SYMBOL_GPL(xt_request_find_target);
+
+static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
+{
+ const struct xt_match *m;
+ int have_rev = 0;
+
+ list_for_each_entry(m, &xt[af].match, list) {
+ if (strcmp(m->name, name) == 0) {
+ if (m->revision > *bestp)
+ *bestp = m->revision;
+ if (m->revision == revision)
+ have_rev = 1;
+ }
+ }
+
+ if (af != NFPROTO_UNSPEC && !have_rev)
+ return match_revfn(NFPROTO_UNSPEC, name, revision, bestp);
+
+ return have_rev;
+}
+
+static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
+{
+ const struct xt_target *t;
+ int have_rev = 0;
+
+ list_for_each_entry(t, &xt[af].target, list) {
+ if (strcmp(t->name, name) == 0) {
+ if (t->revision > *bestp)
+ *bestp = t->revision;
+ if (t->revision == revision)
+ have_rev = 1;
+ }
+ }
+
+ if (af != NFPROTO_UNSPEC && !have_rev)
+ return target_revfn(NFPROTO_UNSPEC, name, revision, bestp);
+
+ return have_rev;
+}
+
+/* Returns true or false (if no such extension at all) */
+int xt_find_revision(u8 af, const char *name, u8 revision, int target,
+ int *err)
+{
+ int have_rev, best = -1;
+
+ mutex_lock(&xt[af].mutex);
+ if (target == 1)
+ have_rev = target_revfn(af, name, revision, &best);
+ else
+ have_rev = match_revfn(af, name, revision, &best);
+ mutex_unlock(&xt[af].mutex);
+
+ /* Nothing at all? Return 0 to try loading module. */
+ if (best == -1) {
+ *err = -ENOENT;
+ return 0;
+ }
+
+ *err = best;
+ if (!have_rev)
+ *err = -EPROTONOSUPPORT;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(xt_find_revision);
+
+static char *
+textify_hooks(char *buf, size_t size, unsigned int mask, uint8_t nfproto)
+{
+ static const char *const inetbr_names[] = {
+ "PREROUTING", "INPUT", "FORWARD",
+ "OUTPUT", "POSTROUTING", "BROUTING",
+ };
+ static const char *const arp_names[] = {
+ "INPUT", "FORWARD", "OUTPUT",
+ };
+ const char *const *names;
+ unsigned int i, max;
+ char *p = buf;
+ bool np = false;
+ int res;
+
+ names = (nfproto == NFPROTO_ARP) ? arp_names : inetbr_names;
+ max = (nfproto == NFPROTO_ARP) ? ARRAY_SIZE(arp_names) :
+ ARRAY_SIZE(inetbr_names);
+ *p = '\0';
+ for (i = 0; i < max; ++i) {
+ if (!(mask & (1 << i)))
+ continue;
+ res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]);
+ if (res > 0) {
+ size -= res;
+ p += res;
+ }
+ np = true;
+ }
+
+ return buf;
+}
+
+int xt_check_match(struct xt_mtchk_param *par,
+ unsigned int size, u_int8_t proto, bool inv_proto)
+{
+ int ret;
+
+ if (XT_ALIGN(par->match->matchsize) != size &&
+ par->match->matchsize != -1) {
+ /*
+ * ebt_among is exempt from centralized matchsize checking
+ * because it uses a dynamic-size data set.
+ */
+ pr_err("%s_tables: %s.%u match: invalid size "
+ "%u (kernel) != (user) %u\n",
+ xt_prefix[par->family], par->match->name,
+ par->match->revision,
+ XT_ALIGN(par->match->matchsize), size);
+ return -EINVAL;
+ }
+ if (par->match->table != NULL &&
+ strcmp(par->match->table, par->table) != 0) {
+ pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+ xt_prefix[par->family], par->match->name,
+ par->match->table, par->table);
+ return -EINVAL;
+ }
+ if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
+ char used[64], allow[64];
+
+ pr_err("%s_tables: %s match: used from hooks %s, but only "
+ "valid from %s\n",
+ xt_prefix[par->family], par->match->name,
+ textify_hooks(used, sizeof(used), par->hook_mask,
+ par->family),
+ textify_hooks(allow, sizeof(allow), par->match->hooks,
+ par->family));
+ return -EINVAL;
+ }
+ if (par->match->proto && (par->match->proto != proto || inv_proto)) {
+ pr_err("%s_tables: %s match: only valid for protocol %u\n",
+ xt_prefix[par->family], par->match->name,
+ par->match->proto);
+ return -EINVAL;
+ }
+ if (par->match->checkentry != NULL) {
+ ret = par->match->checkentry(par);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ /* Flag up potential errors. */
+ return -EIO;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_match);
+
+#ifdef CONFIG_COMPAT
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
+{
+ struct xt_af *xp = &xt[af];
+
+ if (!xp->compat_tab) {
+ if (!xp->number)
+ return -EINVAL;
+ xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+ if (!xp->compat_tab)
+ return -ENOMEM;
+ xp->cur = 0;
+ }
+
+ if (xp->cur >= xp->number)
+ return -EINVAL;
+
+ if (xp->cur)
+ delta += xp->compat_tab[xp->cur - 1].delta;
+ xp->compat_tab[xp->cur].offset = offset;
+ xp->compat_tab[xp->cur].delta = delta;
+ xp->cur++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_add_offset);
+
+void xt_compat_flush_offsets(u_int8_t af)
+{
+ if (xt[af].compat_tab) {
+ vfree(xt[af].compat_tab);
+ xt[af].compat_tab = NULL;
+ xt[af].number = 0;
+ xt[af].cur = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
+
+int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
+{
+ struct compat_delta *tmp = xt[af].compat_tab;
+ int mid, left = 0, right = xt[af].cur - 1;
+
+ while (left <= right) {
+ mid = (left + right) >> 1;
+ if (offset > tmp[mid].offset)
+ left = mid + 1;
+ else if (offset < tmp[mid].offset)
+ right = mid - 1;
+ else
+ return mid ? tmp[mid - 1].delta : 0;
+ }
+ return left ? tmp[left - 1].delta : 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
+
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+ xt[af].number = number;
+ xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
+int xt_compat_match_offset(const struct xt_match *match)
+{
+ u_int16_t csize = match->compatsize ? : match->matchsize;
+ return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_offset);
+
+int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+ unsigned int *size)
+{
+ const struct xt_match *match = m->u.kernel.match;
+ struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
+ int pad, off = xt_compat_match_offset(match);
+ u_int16_t msize = cm->u.user.match_size;
+
+ m = *dstptr;
+ memcpy(m, cm, sizeof(*cm));
+ if (match->compat_from_user)
+ match->compat_from_user(m->data, cm->data);
+ else
+ memcpy(m->data, cm->data, msize - sizeof(*cm));
+ pad = XT_ALIGN(match->matchsize) - match->matchsize;
+ if (pad > 0)
+ memset(m->data + match->matchsize, 0, pad);
+
+ msize += off;
+ m->u.user.match_size = msize;
+
+ *size += off;
+ *dstptr += msize;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
+
+int xt_compat_match_to_user(const struct xt_entry_match *m,
+ void __user **dstptr, unsigned int *size)
+{
+ const struct xt_match *match = m->u.kernel.match;
+ struct compat_xt_entry_match __user *cm = *dstptr;
+ int off = xt_compat_match_offset(match);
+ u_int16_t msize = m->u.user.match_size - off;
+
+ if (copy_to_user(cm, m, sizeof(*cm)) ||
+ put_user(msize, &cm->u.user.match_size) ||
+ copy_to_user(cm->u.user.name, m->u.kernel.match->name,
+ strlen(m->u.kernel.match->name) + 1))
+ return -EFAULT;
+
+ if (match->compat_to_user) {
+ if (match->compat_to_user((void __user *)cm->data, m->data))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+ return -EFAULT;
+ }
+
+ *size -= off;
+ *dstptr += msize;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+#endif /* CONFIG_COMPAT */
+
+int xt_check_target(struct xt_tgchk_param *par,
+ unsigned int size, u_int8_t proto, bool inv_proto)
+{
+ int ret;
+
+ if (XT_ALIGN(par->target->targetsize) != size) {
+ pr_err("%s_tables: %s.%u target: invalid size "
+ "%u (kernel) != (user) %u\n",
+ xt_prefix[par->family], par->target->name,
+ par->target->revision,
+ XT_ALIGN(par->target->targetsize), size);
+ return -EINVAL;
+ }
+ if (par->target->table != NULL &&
+ strcmp(par->target->table, par->table) != 0) {
+ pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
+ xt_prefix[par->family], par->target->name,
+ par->target->table, par->table);
+ return -EINVAL;
+ }
+ if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
+ char used[64], allow[64];
+
+ pr_err("%s_tables: %s target: used from hooks %s, but only "
+ "usable from %s\n",
+ xt_prefix[par->family], par->target->name,
+ textify_hooks(used, sizeof(used), par->hook_mask,
+ par->family),
+ textify_hooks(allow, sizeof(allow), par->target->hooks,
+ par->family));
+ return -EINVAL;
+ }
+ if (par->target->proto && (par->target->proto != proto || inv_proto)) {
+ pr_err("%s_tables: %s target: only valid for protocol %u\n",
+ xt_prefix[par->family], par->target->name,
+ par->target->proto);
+ return -EINVAL;
+ }
+ if (par->target->checkentry != NULL) {
+ ret = par->target->checkentry(par);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ /* Flag up potential errors. */
+ return -EIO;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_target);
+
+#ifdef CONFIG_COMPAT
+int xt_compat_target_offset(const struct xt_target *target)
+{
+ u_int16_t csize = target->compatsize ? : target->targetsize;
+ return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_offset);
+
+void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
+ unsigned int *size)
+{
+ const struct xt_target *target = t->u.kernel.target;
+ struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
+ int pad, off = xt_compat_target_offset(target);
+ u_int16_t tsize = ct->u.user.target_size;
+
+ t = *dstptr;
+ memcpy(t, ct, sizeof(*ct));
+ if (target->compat_from_user)
+ target->compat_from_user(t->data, ct->data);
+ else
+ memcpy(t->data, ct->data, tsize - sizeof(*ct));
+ pad = XT_ALIGN(target->targetsize) - target->targetsize;
+ if (pad > 0)
+ memset(t->data + target->targetsize, 0, pad);
+
+ tsize += off;
+ t->u.user.target_size = tsize;
+
+ *size += off;
+ *dstptr += tsize;
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_from_user);
+
+int xt_compat_target_to_user(const struct xt_entry_target *t,
+ void __user **dstptr, unsigned int *size)
+{
+ const struct xt_target *target = t->u.kernel.target;
+ struct compat_xt_entry_target __user *ct = *dstptr;
+ int off = xt_compat_target_offset(target);
+ u_int16_t tsize = t->u.user.target_size - off;
+
+ if (copy_to_user(ct, t, sizeof(*ct)) ||
+ put_user(tsize, &ct->u.user.target_size) ||
+ copy_to_user(ct->u.user.name, t->u.kernel.target->name,
+ strlen(t->u.kernel.target->name) + 1))
+ return -EFAULT;
+
+ if (target->compat_to_user) {
+ if (target->compat_to_user((void __user *)ct->data, t->data))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+ return -EFAULT;
+ }
+
+ *size -= off;
+ *dstptr += tsize;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
+#endif
+
+struct xt_table_info *xt_alloc_table_info(unsigned int size)
+{
+ struct xt_table_info *newinfo;
+ int cpu;
+
+ /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
+ return NULL;
+
+ newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_possible_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+
+ if (newinfo->entries[cpu] == NULL) {
+ xt_free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+EXPORT_SYMBOL(xt_alloc_table_info);
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ kvfree(info->entries[cpu]);
+
+ if (info->jumpstack != NULL) {
+ for_each_possible_cpu(cpu)
+ kvfree(info->jumpstack[cpu]);
+ kvfree(info->jumpstack);
+ }
+
+ free_percpu(info->stackptr);
+
+ kfree(info);
+}
+EXPORT_SYMBOL(xt_free_table_info);
+
+/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
+struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
+ const char *name)
+{
+ struct xt_table *t;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &net->xt.tables[af], list)
+ if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+ return t;
+ mutex_unlock(&xt[af].mutex);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xt_find_table_lock);
+
+void xt_table_unlock(struct xt_table *table)
+{
+ mutex_unlock(&xt[table->af].mutex);
+}
+EXPORT_SYMBOL_GPL(xt_table_unlock);
+
+#ifdef CONFIG_COMPAT
+void xt_compat_lock(u_int8_t af)
+{
+ mutex_lock(&xt[af].compat_mutex);
+}
+EXPORT_SYMBOL_GPL(xt_compat_lock);
+
+void xt_compat_unlock(u_int8_t af)
+{
+ mutex_unlock(&xt[af].compat_mutex);
+}
+EXPORT_SYMBOL_GPL(xt_compat_unlock);
+#endif
+
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+
+static int xt_jumpstack_alloc(struct xt_table_info *i)
+{
+ unsigned int size;
+ int cpu;
+
+ i->stackptr = alloc_percpu(unsigned int);
+ if (i->stackptr == NULL)
+ return -ENOMEM;
+
+ size = sizeof(void **) * nr_cpu_ids;
+ if (size > PAGE_SIZE)
+ i->jumpstack = vzalloc(size);
+ else
+ i->jumpstack = kzalloc(size, GFP_KERNEL);
+ if (i->jumpstack == NULL)
+ return -ENOMEM;
+
+ i->stacksize *= xt_jumpstack_multiplier;
+ size = sizeof(void *) * i->stacksize;
+ for_each_possible_cpu(cpu) {
+ if (size > PAGE_SIZE)
+ i->jumpstack[cpu] = vmalloc_node(size,
+ cpu_to_node(cpu));
+ else
+ i->jumpstack[cpu] = kmalloc_node(size,
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (i->jumpstack[cpu] == NULL)
+ /*
+ * Freeing will be done later on by the callers. The
+ * chain is: xt_replace_table -> __do_replace ->
+ * do_replace -> xt_free_table_info.
+ */
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+struct xt_table_info *
+xt_replace_table(struct xt_table *table,
+ unsigned int num_counters,
+ struct xt_table_info *newinfo,
+ int *error)
+{
+ struct xt_table_info *private;
+ int ret;
+
+ ret = xt_jumpstack_alloc(newinfo);
+ if (ret < 0) {
+ *error = ret;
+ return NULL;
+ }
+
+ /* Do the substitution. */
+ local_bh_disable();
+ private = table->private;
+
+ /* Check inside lock: is the old number correct? */
+ if (num_counters != private->number) {
+ pr_debug("num_counters != table->private->number (%u/%u)\n",
+ num_counters, private->number);
+ local_bh_enable();
+ *error = -EAGAIN;
+ return NULL;
+ }
+
+ newinfo->initial_entries = private->initial_entries;
+ /*
+ * Ensure contents of newinfo are visible before assigning to
+ * private.
+ */
+ smp_wmb();
+ table->private = newinfo;
+
+ /*
+ * Even though table entries have now been swapped, other CPU's
+ * may still be using the old entries. This is okay, because
+ * resynchronization happens because of the locking done
+ * during the get_counters() routine.
+ */
+ local_bh_enable();
+
+#ifdef CONFIG_AUDIT
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_NETFILTER_CFG);
+ if (ab) {
+ audit_log_format(ab, "table=%s family=%u entries=%u",
+ table->name, table->af,
+ private->number);
+ audit_log_end(ab);
+ }
+ }
+#endif
+
+ return private;
+}
+EXPORT_SYMBOL_GPL(xt_replace_table);
+
+struct xt_table *xt_register_table(struct net *net,
+ const struct xt_table *input_table,
+ struct xt_table_info *bootstrap,
+ struct xt_table_info *newinfo)
+{
+ int ret;
+ struct xt_table_info *private;
+ struct xt_table *t, *table;
+
+ /* Don't add one object to multiple lists. */
+ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
+ if (!table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&xt[table->af].mutex);
+ /* Don't autoload: we'd eat our tail... */
+ list_for_each_entry(t, &net->xt.tables[table->af], list) {
+ if (strcmp(t->name, table->name) == 0) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ /* Simplifies replace_table code. */
+ table->private = bootstrap;
+
+ if (!xt_replace_table(table, 0, newinfo, &ret))
+ goto unlock;
+
+ private = table->private;
+ pr_debug("table->private->number = %u\n", private->number);
+
+ /* save number of initial entries */
+ private->initial_entries = private->number;
+
+ list_add(&table->list, &net->xt.tables[table->af]);
+ mutex_unlock(&xt[table->af].mutex);
+ return table;
+
+unlock:
+ mutex_unlock(&xt[table->af].mutex);
+ kfree(table);
+out:
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(xt_register_table);
+
+void *xt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+
+ mutex_lock(&xt[table->af].mutex);
+ private = table->private;
+ list_del(&table->list);
+ mutex_unlock(&xt[table->af].mutex);
+ kfree(table);
+
+ return private;
+}
+EXPORT_SYMBOL_GPL(xt_unregister_table);
+
+#ifdef CONFIG_PROC_FS
+struct xt_names_priv {
+ struct seq_net_private p;
+ u_int8_t af;
+};
+static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct xt_names_priv *priv = seq->private;
+ struct net *net = seq_file_net(seq);
+ u_int8_t af = priv->af;
+
+ mutex_lock(&xt[af].mutex);
+ return seq_list_start(&net->xt.tables[af], *pos);
+}
+
+static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct xt_names_priv *priv = seq->private;
+ struct net *net = seq_file_net(seq);
+ u_int8_t af = priv->af;
+
+ return seq_list_next(v, &net->xt.tables[af], pos);
+}
+
+static void xt_table_seq_stop(struct seq_file *seq, void *v)
+{
+ struct xt_names_priv *priv = seq->private;
+ u_int8_t af = priv->af;
+
+ mutex_unlock(&xt[af].mutex);
+}
+
+static int xt_table_seq_show(struct seq_file *seq, void *v)
+{
+ struct xt_table *table = list_entry(v, struct xt_table, list);
+
+ if (strlen(table->name)) {
+ seq_printf(seq, "%s\n", table->name);
+ return seq_has_overflowed(seq);
+ } else
+ return 0;
+}
+
+static const struct seq_operations xt_table_seq_ops = {
+ .start = xt_table_seq_start,
+ .next = xt_table_seq_next,
+ .stop = xt_table_seq_stop,
+ .show = xt_table_seq_show,
+};
+
+static int xt_table_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct xt_names_priv *priv;
+
+ ret = seq_open_net(inode, file, &xt_table_seq_ops,
+ sizeof(struct xt_names_priv));
+ if (!ret) {
+ priv = ((struct seq_file *)file->private_data)->private;
+ priv->af = (unsigned long)PDE_DATA(inode);
+ }
+ return ret;
+}
+
+static const struct file_operations xt_table_ops = {
+ .owner = THIS_MODULE,
+ .open = xt_table_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/*
+ * Traverse state for ip{,6}_{tables,matches} for helping crossing
+ * the multi-AF mutexes.
+ */
+struct nf_mttg_trav {
+ struct list_head *head, *curr;
+ uint8_t class, nfproto;
+};
+
+enum {
+ MTTG_TRAV_INIT,
+ MTTG_TRAV_NFP_UNSPEC,
+ MTTG_TRAV_NFP_SPEC,
+ MTTG_TRAV_DONE,
+};
+
+static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
+ bool is_target)
+{
+ static const uint8_t next_class[] = {
+ [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
+ [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
+ };
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_INIT:
+ trav->class = MTTG_TRAV_NFP_UNSPEC;
+ mutex_lock(&xt[NFPROTO_UNSPEC].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match;
+ break;
+ case MTTG_TRAV_NFP_UNSPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ mutex_lock(&xt[trav->nfproto].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[trav->nfproto].target : &xt[trav->nfproto].match;
+ trav->class = next_class[trav->class];
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ /* fallthru, _stop will unlock */
+ default:
+ return NULL;
+ }
+
+ if (ppos != NULL)
+ ++*ppos;
+ return trav;
+}
+
+static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
+ bool is_target)
+{
+ struct nf_mttg_trav *trav = seq->private;
+ unsigned int j;
+
+ trav->class = MTTG_TRAV_INIT;
+ for (j = 0; j < *pos; ++j)
+ if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL)
+ return NULL;
+ return trav;
+}
+
+static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
+{
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ mutex_unlock(&xt[trav->nfproto].mutex);
+ break;
+ }
+}
+
+static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return xt_mttg_seq_start(seq, pos, false);
+}
+
+static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ return xt_mttg_seq_next(seq, v, ppos, false);
+}
+
+static int xt_match_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_match *match;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ match = list_entry(trav->curr, struct xt_match, list);
+ if (*match->name == '\0')
+ return 0;
+ seq_printf(seq, "%s\n", match->name);
+ return seq_has_overflowed(seq);
+ }
+ return 0;
+}
+
+static const struct seq_operations xt_match_seq_ops = {
+ .start = xt_match_seq_start,
+ .next = xt_match_seq_next,
+ .stop = xt_mttg_seq_stop,
+ .show = xt_match_seq_show,
+};
+
+static int xt_match_open(struct inode *inode, struct file *file)
+{
+ struct nf_mttg_trav *trav;
+ trav = __seq_open_private(file, &xt_match_seq_ops, sizeof(*trav));
+ if (!trav)
+ return -ENOMEM;
+
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
+ return 0;
+}
+
+static const struct file_operations xt_match_ops = {
+ .owner = THIS_MODULE,
+ .open = xt_match_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return xt_mttg_seq_start(seq, pos, true);
+}
+
+static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ return xt_mttg_seq_next(seq, v, ppos, true);
+}
+
+static int xt_target_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_target *target;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ target = list_entry(trav->curr, struct xt_target, list);
+ if (*target->name == '\0')
+ return 0;
+ seq_printf(seq, "%s\n", target->name);
+ return seq_has_overflowed(seq);
+ }
+ return 0;
+}
+
+static const struct seq_operations xt_target_seq_ops = {
+ .start = xt_target_seq_start,
+ .next = xt_target_seq_next,
+ .stop = xt_mttg_seq_stop,
+ .show = xt_target_seq_show,
+};
+
+static int xt_target_open(struct inode *inode, struct file *file)
+{
+ struct nf_mttg_trav *trav;
+ trav = __seq_open_private(file, &xt_target_seq_ops, sizeof(*trav));
+ if (!trav)
+ return -ENOMEM;
+
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
+ return 0;
+}
+
+static const struct file_operations xt_target_ops = {
+ .owner = THIS_MODULE,
+ .open = xt_target_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#define FORMAT_TABLES "_tables_names"
+#define FORMAT_MATCHES "_tables_matches"
+#define FORMAT_TARGETS "_tables_targets"
+
+#endif /* CONFIG_PROC_FS */
+
+/**
+ * xt_hook_link - set up hooks for a new table
+ * @table: table with metadata needed to set up hooks
+ * @fn: Hook function
+ *
+ * This function will take care of creating and registering the necessary
+ * Netfilter hooks for XT tables.
+ */
+struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+{
+ unsigned int hook_mask = table->valid_hooks;
+ uint8_t i, num_hooks = hweight32(hook_mask);
+ uint8_t hooknum;
+ struct nf_hook_ops *ops;
+ int ret;
+
+ ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
+ if (ops == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0;
+ hook_mask >>= 1, ++hooknum) {
+ if (!(hook_mask & 1))
+ continue;
+ ops[i].hook = fn;
+ ops[i].owner = table->me;
+ ops[i].pf = table->af;
+ ops[i].hooknum = hooknum;
+ ops[i].priority = table->priority;
+ ++i;
+ }
+
+ ret = nf_register_hooks(ops, num_hooks);
+ if (ret < 0) {
+ kfree(ops);
+ return ERR_PTR(ret);
+ }
+
+ return ops;
+}
+EXPORT_SYMBOL_GPL(xt_hook_link);
+
+/**
+ * xt_hook_unlink - remove hooks for a table
+ * @ops: nf_hook_ops array as returned by nf_hook_link
+ * @hook_mask: the very same mask that was passed to nf_hook_link
+ */
+void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
+{
+ nf_unregister_hooks(ops, hweight32(table->valid_hooks));
+ kfree(ops);
+}
+EXPORT_SYMBOL_GPL(xt_hook_unlink);
+
+int xt_proto_init(struct net *net, u_int8_t af)
+{
+#ifdef CONFIG_PROC_FS
+ char buf[XT_FUNCTION_MAXNAMELEN];
+ struct proc_dir_entry *proc;
+#endif
+
+ if (af >= ARRAY_SIZE(xt_prefix))
+ return -EINVAL;
+
+
+#ifdef CONFIG_PROC_FS
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_TABLES, sizeof(buf));
+ proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
+ (void *)(unsigned long)af);
+ if (!proc)
+ goto out;
+
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+ proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops,
+ (void *)(unsigned long)af);
+ if (!proc)
+ goto out_remove_tables;
+
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+ proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops,
+ (void *)(unsigned long)af);
+ if (!proc)
+ goto out_remove_matches;
+#endif
+
+ return 0;
+
+#ifdef CONFIG_PROC_FS
+out_remove_matches:
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+ remove_proc_entry(buf, net->proc_net);
+
+out_remove_tables:
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_TABLES, sizeof(buf));
+ remove_proc_entry(buf, net->proc_net);
+out:
+ return -1;
+#endif
+}
+EXPORT_SYMBOL_GPL(xt_proto_init);
+
+void xt_proto_fini(struct net *net, u_int8_t af)
+{
+#ifdef CONFIG_PROC_FS
+ char buf[XT_FUNCTION_MAXNAMELEN];
+
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_TABLES, sizeof(buf));
+ remove_proc_entry(buf, net->proc_net);
+
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+ remove_proc_entry(buf, net->proc_net);
+
+ strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+ remove_proc_entry(buf, net->proc_net);
+#endif /*CONFIG_PROC_FS*/
+}
+EXPORT_SYMBOL_GPL(xt_proto_fini);
+
+static int __net_init xt_net_init(struct net *net)
+{
+ int i;
+
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ INIT_LIST_HEAD(&net->xt.tables[i]);
+ return 0;
+}
+
+static struct pernet_operations xt_net_ops = {
+ .init = xt_net_init,
+};
+
+static int __init xt_init(void)
+{
+ unsigned int i;
+ int rv;
+
+ for_each_possible_cpu(i) {
+ seqcount_init(&per_cpu(xt_recseq, i));
+ }
+
+ xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
+ if (!xt)
+ return -ENOMEM;
+
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ mutex_init(&xt[i].mutex);
+#ifdef CONFIG_COMPAT
+ mutex_init(&xt[i].compat_mutex);
+ xt[i].compat_tab = NULL;
+#endif
+ INIT_LIST_HEAD(&xt[i].target);
+ INIT_LIST_HEAD(&xt[i].match);
+ }
+ rv = register_pernet_subsys(&xt_net_ops);
+ if (rv < 0)
+ kfree(xt);
+ return rv;
+}
+
+static void __exit xt_fini(void)
+{
+ unregister_pernet_subsys(&xt_net_ops);
+ kfree(xt);
+}
+
+module_init(xt_init);
+module_exit(xt_fini);
+
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 000000000..4973cbddc
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,231 @@
+/*
+ * Creates audit record for dropped/accepted packets
+ *
+ * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
+ * (C) 2010-2011 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_AUDIT.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
+MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
+MODULE_ALIAS("ipt_AUDIT");
+MODULE_ALIAS("ip6t_AUDIT");
+MODULE_ALIAS("ebt_AUDIT");
+MODULE_ALIAS("arpt_AUDIT");
+
+static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
+ unsigned int proto, unsigned int offset)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE: {
+ const __be16 *pptr;
+ __be16 _ports[2];
+
+ pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " sport=%hu dport=%hu",
+ ntohs(pptr[0]), ntohs(pptr[1]));
+ }
+ break;
+
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6: {
+ const u8 *iptr;
+ u8 _ih[2];
+
+ iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih);
+ if (iptr == NULL) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
+ iptr[0], iptr[1]);
+
+ }
+ break;
+ }
+}
+
+static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+
+ ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
+ &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
+
+ if (ntohs(ih->frag_off) & IP_OFFSET) {
+ audit_log_format(ab, " frag=1");
+ return;
+ }
+
+ audit_proto(ab, skb, ih->protocol, ih->ihl * 4);
+}
+
+static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+ struct ipv6hdr _ip6h;
+ const struct ipv6hdr *ih;
+ u8 nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ audit_log_format(ab, " truncated=1");
+ return;
+ }
+
+ nexthdr = ih->nexthdr;
+ offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
+ &nexthdr, &frag_off);
+
+ audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+ &ih->saddr, &ih->daddr, nexthdr);
+
+ if (offset)
+ audit_proto(ab, skb, nexthdr, offset);
+}
+
+static unsigned int
+audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+ struct audit_buffer *ab;
+
+ if (audit_enabled == 0)
+ goto errout;
+
+ ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+ if (ab == NULL)
+ goto errout;
+
+ audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
+ info->type, par->hooknum, skb->len,
+ par->in ? par->in->name : "?",
+ par->out ? par->out->name : "?");
+
+ if (skb->mark)
+ audit_log_format(ab, " mark=%#x", skb->mark);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+
+ if (par->family == NFPROTO_BRIDGE) {
+ switch (eth_hdr(skb)->h_proto) {
+ case htons(ETH_P_IP):
+ audit_ip4(ab, skb);
+ break;
+
+ case htons(ETH_P_IPV6):
+ audit_ip6(ab, skb);
+ break;
+ }
+ }
+ }
+
+ switch (par->family) {
+ case NFPROTO_IPV4:
+ audit_ip4(ab, skb);
+ break;
+
+ case NFPROTO_IPV6:
+ audit_ip6(ab, skb);
+ break;
+ }
+
+#ifdef CONFIG_NETWORK_SECMARK
+ if (skb->secmark)
+ audit_log_secctx(ab, skb->secmark);
+#endif
+
+ audit_log_end(ab);
+
+errout:
+ return XT_CONTINUE;
+}
+
+static unsigned int
+audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ audit_tg(skb, par);
+ return EBT_CONTINUE;
+}
+
+static int audit_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_audit_info *info = par->targinfo;
+
+ if (info->type > XT_AUDIT_TYPE_MAX) {
+ pr_info("Audit type out of range (valid range: 0..%hhu)\n",
+ XT_AUDIT_TYPE_MAX);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static struct xt_target audit_tg_reg[] __read_mostly = {
+ {
+ .name = "AUDIT",
+ .family = NFPROTO_UNSPEC,
+ .target = audit_tg,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "AUDIT",
+ .family = NFPROTO_BRIDGE,
+ .target = audit_tg_ebt,
+ .targetsize = sizeof(struct xt_audit_info),
+ .checkentry = audit_tg_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init audit_tg_init(void)
+{
+ return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+static void __exit audit_tg_exit(void)
+{
+ xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+module_init(audit_tg_init);
+module_exit(audit_tg_exit);
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 000000000..0f642ef8c
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ skb_checksum_help(skb);
+
+ return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+ if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+ pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+ return -EINVAL;
+ }
+ if (!einfo->operation) {
+ pr_info("no CHECKSUM operation enabled\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+ .name = "CHECKSUM",
+ .family = NFPROTO_UNSPEC,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+ return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+ xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
new file mode 100644
index 000000000..af9c4dadf
--- /dev/null
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -0,0 +1,73 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Qdisc classification");
+MODULE_ALIAS("ipt_CLASSIFY");
+MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
+
+static unsigned int
+classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_classify_target_info *clinfo = par->targinfo;
+
+ skb->priority = clinfo->priority;
+ return XT_CONTINUE;
+}
+
+static struct xt_target classify_tg_reg[] __read_mostly = {
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_ARP,
+ .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init classify_tg_init(void)
+{
+ return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
+}
+
+static void __exit classify_tg_exit(void)
+{
+ xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
+}
+
+module_init(classify_tg_init);
+module_exit(classify_tg_exit);
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 000000000..e04dc282e
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,143 @@
+/*
+ * This module is used to copy security markings from packets
+ * to connections, and restore security markings from connections
+ * back to packets. This would normally be performed in conjunction
+ * with the SECMARK target and state match.
+ *
+ * Based somewhat on CONNMARK:
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNSECMARK.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark");
+MODULE_ALIAS("ipt_CONNSECMARK");
+MODULE_ALIAS("ip6t_CONNSECMARK");
+
+/*
+ * If the packet has a security mark and the connection does not, copy
+ * the security mark from the packet to the connection.
+ */
+static void secmark_save(const struct sk_buff *skb)
+{
+ if (skb->secmark) {
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !ct->secmark) {
+ ct->secmark = skb->secmark;
+ nf_conntrack_event_cache(IPCT_SECMARK, ct);
+ }
+ }
+}
+
+/*
+ * If packet has no security mark, and the connection does, restore the
+ * security mark from the connection to the packet.
+ */
+static void secmark_restore(struct sk_buff *skb)
+{
+ if (!skb->secmark) {
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && ct->secmark)
+ skb->secmark = ct->secmark;
+ }
+}
+
+static unsigned int
+connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_connsecmark_target_info *info = par->targinfo;
+
+ switch (info->mode) {
+ case CONNSECMARK_SAVE:
+ secmark_save(skb);
+ break;
+
+ case CONNSECMARK_RESTORE:
+ secmark_restore(skb);
+ break;
+
+ default:
+ BUG();
+ }
+
+ return XT_CONTINUE;
+}
+
+static int connsecmark_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_connsecmark_target_info *info = par->targinfo;
+ int ret;
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "security") != 0) {
+ pr_info("target only valid in the \'mangle\' "
+ "or \'security\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ switch (info->mode) {
+ case CONNSECMARK_SAVE:
+ case CONNSECMARK_RESTORE:
+ break;
+
+ default:
+ pr_info("invalid mode: %hu\n", info->mode);
+ return -EINVAL;
+ }
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target connsecmark_tg_reg __read_mostly = {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+};
+
+static int __init connsecmark_tg_init(void)
+{
+ return xt_register_target(&connsecmark_tg_reg);
+}
+
+static void __exit connsecmark_tg_exit(void)
+{
+ xt_unregister_target(&connsecmark_tg_reg);
+}
+
+module_init(connsecmark_tg_init);
+module_exit(connsecmark_tg_exit);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
new file mode 100644
index 000000000..75747aecd
--- /dev/null
+++ b/net/netfilter/xt_CT.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CT.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->nfct != NULL)
+ return XT_CONTINUE;
+
+ /* special case the untracked ct : we want the percpu object */
+ if (!ct)
+ ct = nf_ct_untracked_get();
+ atomic_inc(&ct->ct_general.use);
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+
+ return XT_CONTINUE;
+}
+
+static unsigned int xt_ct_target_v0(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
+static unsigned int xt_ct_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct xt_ct_target_info_v1 *info = par->targinfo;
+ struct nf_conn *ct = info->ct;
+
+ return xt_ct_target(skb, ct);
+}
+
+static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
+{
+ if (par->family == NFPROTO_IPV4) {
+ const struct ipt_entry *e = par->entryinfo;
+
+ if (e->ip.invflags & IPT_INV_PROTO)
+ return 0;
+ return e->ip.proto;
+ } else if (par->family == NFPROTO_IPV6) {
+ const struct ip6t_entry *e = par->entryinfo;
+
+ if (e->ipv6.invflags & IP6T_INV_PROTO)
+ return 0;
+ return e->ipv6.proto;
+ } else
+ return 0;
+}
+
+static int
+xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
+ const struct xt_tgchk_param *par)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ u8 proto;
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ pr_info("You must specify a L4 protocol, and not use "
+ "inversions on it.\n");
+ return -ENOENT;
+ }
+
+ helper = nf_conntrack_helper_try_module_get(helper_name, par->family,
+ proto);
+ if (helper == NULL) {
+ pr_info("No such helper \"%s\"\n", helper_name);
+ return -ENOENT;
+ }
+
+ help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+ if (help == NULL) {
+ module_put(helper->me);
+ return -ENOMEM;
+ }
+
+ help->helper = helper;
+ return 0;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout)
+{
+ typeof(nf_ct_timeout_put_hook) timeout_put;
+
+ timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ if (timeout_put)
+ timeout_put(timeout);
+}
+#endif
+
+static int
+xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
+ const char *timeout_name)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ struct ctnl_timeout *timeout;
+ struct nf_conn_timeout *timeout_ext;
+ struct nf_conntrack_l4proto *l4proto;
+ int ret = 0;
+ u8 proto;
+
+ rcu_read_lock();
+ timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
+ if (timeout_find_get == NULL) {
+ ret = -ENOENT;
+ pr_info("Timeout policy base is empty\n");
+ goto out;
+ }
+
+ proto = xt_ct_find_proto(par);
+ if (!proto) {
+ ret = -EINVAL;
+ pr_info("You must specify a L4 protocol, and not use "
+ "inversions on it.\n");
+ goto out;
+ }
+
+ timeout = timeout_find_get(timeout_name);
+ if (timeout == NULL) {
+ ret = -ENOENT;
+ pr_info("No such timeout policy \"%s\"\n", timeout_name);
+ goto out;
+ }
+
+ if (timeout->l3num != par->family) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L3 protocol "
+ "number %d\n", timeout_name, timeout->l3num);
+ goto err_put_timeout;
+ }
+ /* Make sure the timeout policy matches any existing protocol tracker,
+ * otherwise default to generic.
+ */
+ l4proto = __nf_ct_l4proto_find(par->family, proto);
+ if (timeout->l4proto->l4proto != l4proto->l4proto) {
+ ret = -EINVAL;
+ pr_info("Timeout policy `%s' can only be used by L4 protocol "
+ "number %d\n",
+ timeout_name, timeout->l4proto->l4proto);
+ goto err_put_timeout;
+ }
+ timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
+ if (timeout_ext == NULL)
+ ret = -ENOMEM;
+
+err_put_timeout:
+ __xt_ct_tg_timeout_put(timeout);
+out:
+ rcu_read_unlock();
+ return ret;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int xt_ct_tg_check(const struct xt_tgchk_param *par,
+ struct xt_ct_target_info_v1 *info)
+{
+ struct nf_conntrack_tuple t;
+ struct nf_conn *ct;
+ int ret = -EOPNOTSUPP;
+
+ if (info->flags & XT_CT_NOTRACK) {
+ ct = NULL;
+ goto out;
+ }
+
+#ifndef CONFIG_NF_CONNTRACK_ZONES
+ if (info->zone)
+ goto err1;
+#endif
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ goto err1;
+
+ memset(&t, 0, sizeof(t));
+ ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+ ret = PTR_ERR(ct);
+ if (IS_ERR(ct))
+ goto err2;
+
+ ret = 0;
+ if ((info->ct_events || info->exp_events) &&
+ !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
+ GFP_KERNEL)) {
+ ret = -EINVAL;
+ goto err3;
+ }
+
+ if (info->helper[0]) {
+ ret = xt_ct_set_helper(ct, info->helper, par);
+ if (ret < 0)
+ goto err3;
+ }
+
+ if (info->timeout[0]) {
+ ret = xt_ct_set_timeout(ct, par, info->timeout);
+ if (ret < 0)
+ goto err3;
+ }
+
+ nf_conntrack_tmpl_insert(par->net, ct);
+out:
+ info->ct = ct;
+ return 0;
+
+err3:
+ nf_conntrack_free(ct);
+err2:
+ nf_ct_l3proto_module_put(par->family);
+err1:
+ return ret;
+}
+
+static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info *info = par->targinfo;
+ struct xt_ct_target_info_v1 info_v1 = {
+ .flags = info->flags,
+ .zone = info->zone,
+ .ct_events = info->ct_events,
+ .exp_events = info->exp_events,
+ };
+ int ret;
+
+ if (info->flags & ~XT_CT_NOTRACK)
+ return -EINVAL;
+
+ memcpy(info_v1.helper, info->helper, sizeof(info->helper));
+
+ ret = xt_ct_tg_check(par, &info_v1);
+ if (ret < 0)
+ return ret;
+
+ info->ct = info_v1.ct;
+
+ return ret;
+}
+
+static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info_v1 *info = par->targinfo;
+
+ if (info->flags & ~XT_CT_NOTRACK)
+ return -EINVAL;
+
+ return xt_ct_tg_check(par, par->targinfo);
+}
+
+static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
+{
+ struct xt_ct_target_info_v1 *info = par->targinfo;
+
+ if (info->flags & ~XT_CT_MASK)
+ return -EINVAL;
+
+ return xt_ct_tg_check(par, par->targinfo);
+}
+
+static void xt_ct_destroy_timeout(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ struct nf_conn_timeout *timeout_ext;
+ typeof(nf_ct_timeout_put_hook) timeout_put;
+
+ rcu_read_lock();
+ timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+
+ if (timeout_put) {
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (timeout_ext)
+ timeout_put(timeout_ext->timeout);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
+ struct xt_ct_target_info_v1 *info)
+{
+ struct nf_conn *ct = info->ct;
+ struct nf_conn_help *help;
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ help = nfct_help(ct);
+ if (help)
+ module_put(help->helper->me);
+
+ nf_ct_l3proto_module_put(par->family);
+
+ xt_ct_destroy_timeout(ct);
+ nf_ct_put(info->ct);
+ }
+}
+
+static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par)
+{
+ struct xt_ct_target_info *info = par->targinfo;
+ struct xt_ct_target_info_v1 info_v1 = {
+ .flags = info->flags,
+ .zone = info->zone,
+ .ct_events = info->ct_events,
+ .exp_events = info->exp_events,
+ .ct = info->ct,
+ };
+ memcpy(info_v1.helper, info->helper, sizeof(info->helper));
+
+ xt_ct_tg_destroy(par, &info_v1);
+}
+
+static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+ xt_ct_tg_destroy(par, par->targinfo);
+}
+
+static struct xt_target xt_ct_tg_reg[] __read_mostly = {
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .targetsize = sizeof(struct xt_ct_target_info),
+ .checkentry = xt_ct_tg_check_v0,
+ .destroy = xt_ct_tg_destroy_v0,
+ .target = xt_ct_target_v0,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .checkentry = xt_ct_tg_check_v1,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_UNSPEC,
+ .revision = 2,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .checkentry = xt_ct_tg_check_v2,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+};
+
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->nfct != NULL)
+ return XT_CONTINUE;
+
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+
+ return XT_CONTINUE;
+}
+
+static int notrack_chk(const struct xt_tgchk_param *par)
+{
+ if (!par->net->xt.notrack_deprecated_warning) {
+ pr_info("netfilter: NOTRACK target is deprecated, "
+ "use CT instead or upgrade iptables\n");
+ par->net->xt.notrack_deprecated_warning = true;
+ }
+ return 0;
+}
+
+static struct xt_target notrack_tg_reg __read_mostly = {
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = notrack_chk,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+};
+
+static int __init xt_ct_tg_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&notrack_tg_reg);
+ if (ret < 0)
+ return ret;
+
+ ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
+ if (ret < 0) {
+ xt_unregister_target(&notrack_tg_reg);
+ return ret;
+ }
+ return 0;
+}
+
+static void __exit xt_ct_tg_exit(void)
+{
+ xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
+ xt_unregister_target(&notrack_tg_reg);
+}
+
+module_init(xt_ct_tg_init);
+module_exit(xt_ct_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: connection tracking target");
+MODULE_ALIAS("ipt_CT");
+MODULE_ALIAS("ip6t_CT");
+MODULE_ALIAS("ipt_NOTRACK");
+MODULE_ALIAS("ip6t_NOTRACK");
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
new file mode 100644
index 000000000..3f83d38c4
--- /dev/null
+++ b/net/netfilter/xt_DSCP.c
@@ -0,0 +1,166 @@
+/* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_DSCP.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_DSCP");
+MODULE_ALIAS("ip6t_DSCP");
+MODULE_ALIAS("ipt_TOS");
+MODULE_ALIAS("ip6t_TOS");
+
+static unsigned int
+dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_DSCP_info *dinfo = par->targinfo;
+ u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
+
+ if (dscp != dinfo->dscp) {
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return NF_DROP;
+
+ ipv4_change_dsfield(ip_hdr(skb),
+ (__force __u8)(~XT_DSCP_MASK),
+ dinfo->dscp << XT_DSCP_SHIFT);
+
+ }
+ return XT_CONTINUE;
+}
+
+static unsigned int
+dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_DSCP_info *dinfo = par->targinfo;
+ u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
+
+ if (dscp != dinfo->dscp) {
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ return NF_DROP;
+
+ ipv6_change_dsfield(ipv6_hdr(skb),
+ (__force __u8)(~XT_DSCP_MASK),
+ dinfo->dscp << XT_DSCP_SHIFT);
+ }
+ return XT_CONTINUE;
+}
+
+static int dscp_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_DSCP_info *info = par->targinfo;
+
+ if (info->dscp > XT_DSCP_MAX) {
+ pr_info("dscp %x out of range\n", info->dscp);
+ return -EDOM;
+ }
+ return 0;
+}
+
+static unsigned int
+tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tos_target_info *info = par->targinfo;
+ struct iphdr *iph = ip_hdr(skb);
+ u_int8_t orig, nv;
+
+ orig = ipv4_get_dsfield(iph);
+ nv = (orig & ~info->tos_mask) ^ info->tos_value;
+
+ if (orig != nv) {
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return NF_DROP;
+ iph = ip_hdr(skb);
+ ipv4_change_dsfield(iph, 0, nv);
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int
+tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tos_target_info *info = par->targinfo;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ u_int8_t orig, nv;
+
+ orig = ipv6_get_dsfield(iph);
+ nv = (orig & ~info->tos_mask) ^ info->tos_value;
+
+ if (orig != nv) {
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return NF_DROP;
+ iph = ipv6_hdr(skb);
+ ipv6_change_dsfield(iph, 0, nv);
+ }
+
+ return XT_CONTINUE;
+}
+
+static struct xt_target dscp_tg_reg[] __read_mostly = {
+ {
+ .name = "DSCP",
+ .family = NFPROTO_IPV4,
+ .checkentry = dscp_tg_check,
+ .target = dscp_tg,
+ .targetsize = sizeof(struct xt_DSCP_info),
+ .table = "mangle",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DSCP",
+ .family = NFPROTO_IPV6,
+ .checkentry = dscp_tg_check,
+ .target = dscp_tg6,
+ .targetsize = sizeof(struct xt_DSCP_info),
+ .table = "mangle",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TOS",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tos_tg,
+ .targetsize = sizeof(struct xt_tos_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TOS",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tos_tg6,
+ .targetsize = sizeof(struct xt_tos_target_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init dscp_tg_init(void)
+{
+ return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg));
+}
+
+static void __exit dscp_tg_exit(void)
+{
+ xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg));
+}
+
+module_init(dscp_tg_init);
+module_exit(dscp_tg_exit);
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c
new file mode 100644
index 000000000..1535e87ed
--- /dev/null
+++ b/net/netfilter/xt_HL.c
@@ -0,0 +1,169 @@
+/*
+ * TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Hop Limit modification target for ip6tables
+ * Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+#include <linux/netfilter_ipv6/ip6t_HL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct iphdr *iph;
+ const struct ipt_TTL_info *info = par->targinfo;
+ int new_ttl;
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ iph = ip_hdr(skb);
+
+ switch (info->mode) {
+ case IPT_TTL_SET:
+ new_ttl = info->ttl;
+ break;
+ case IPT_TTL_INC:
+ new_ttl = iph->ttl + info->ttl;
+ if (new_ttl > 255)
+ new_ttl = 255;
+ break;
+ case IPT_TTL_DEC:
+ new_ttl = iph->ttl - info->ttl;
+ if (new_ttl < 0)
+ new_ttl = 0;
+ break;
+ default:
+ new_ttl = iph->ttl;
+ break;
+ }
+
+ if (new_ttl != iph->ttl) {
+ csum_replace2(&iph->check, htons(iph->ttl << 8),
+ htons(new_ttl << 8));
+ iph->ttl = new_ttl;
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int
+hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct ipv6hdr *ip6h;
+ const struct ip6t_HL_info *info = par->targinfo;
+ int new_hl;
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ ip6h = ipv6_hdr(skb);
+
+ switch (info->mode) {
+ case IP6T_HL_SET:
+ new_hl = info->hop_limit;
+ break;
+ case IP6T_HL_INC:
+ new_hl = ip6h->hop_limit + info->hop_limit;
+ if (new_hl > 255)
+ new_hl = 255;
+ break;
+ case IP6T_HL_DEC:
+ new_hl = ip6h->hop_limit - info->hop_limit;
+ if (new_hl < 0)
+ new_hl = 0;
+ break;
+ default:
+ new_hl = ip6h->hop_limit;
+ break;
+ }
+
+ ip6h->hop_limit = new_hl;
+
+ return XT_CONTINUE;
+}
+
+static int ttl_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_TTL_info *info = par->targinfo;
+
+ if (info->mode > IPT_TTL_MAXMODE) {
+ pr_info("TTL: invalid or unknown mode %u\n", info->mode);
+ return -EINVAL;
+ }
+ if (info->mode != IPT_TTL_SET && info->ttl == 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int hl_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_HL_info *info = par->targinfo;
+
+ if (info->mode > IP6T_HL_MAXMODE) {
+ pr_info("invalid or unknown mode %u\n", info->mode);
+ return -EINVAL;
+ }
+ if (info->mode != IP6T_HL_SET && info->hop_limit == 0) {
+ pr_info("increment/decrement does not "
+ "make sense with value 0\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target hl_tg_reg[] __read_mostly = {
+ {
+ .name = "TTL",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = ttl_tg,
+ .targetsize = sizeof(struct ipt_TTL_info),
+ .table = "mangle",
+ .checkentry = ttl_tg_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "HL",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = hl_tg6,
+ .targetsize = sizeof(struct ip6t_HL_info),
+ .table = "mangle",
+ .checkentry = hl_tg6_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init hl_tg_init(void)
+{
+ return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg));
+}
+
+static void __exit hl_tg_exit(void)
+{
+ xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg));
+}
+
+module_init(hl_tg_init);
+module_exit(hl_tg_exit);
+MODULE_ALIAS("ipt_TTL");
+MODULE_ALIAS("ip6t_HL");
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
new file mode 100644
index 000000000..02afaf48a
--- /dev/null
+++ b/net/netfilter/xt_HMARK.c
@@ -0,0 +1,372 @@
+/*
+ * xt_HMARK - Netfilter module to set mark by means of hashing
+ *
+ * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_HMARK.h>
+
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>");
+MODULE_DESCRIPTION("Xtables: packet marking using hash calculation");
+MODULE_ALIAS("ipt_HMARK");
+MODULE_ALIAS("ip6t_HMARK");
+
+struct hmark_tuple {
+ __be32 src;
+ __be32 dst;
+ union hmark_ports uports;
+ u8 proto;
+};
+
+static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask)
+{
+ return (addr32[0] & mask[0]) ^
+ (addr32[1] & mask[1]) ^
+ (addr32[2] & mask[2]) ^
+ (addr32[3] & mask[3]);
+}
+
+static inline __be32
+hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask)
+{
+ switch (l3num) {
+ case AF_INET:
+ return *addr32 & *mask;
+ case AF_INET6:
+ return hmark_addr6_mask(addr32, mask);
+ }
+ return 0;
+}
+
+static inline void hmark_swap_ports(union hmark_ports *uports,
+ const struct xt_hmark_info *info)
+{
+ union hmark_ports hp;
+ u16 src, dst;
+
+ hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32;
+ src = ntohs(hp.b16.src);
+ dst = ntohs(hp.b16.dst);
+
+ if (dst > src)
+ uports->v32 = (dst << 16) | src;
+ else
+ uports->v32 = (src << 16) | dst;
+}
+
+static int
+hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple *otuple;
+ struct nf_conntrack_tuple *rtuple;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return -1;
+
+ otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6,
+ info->src_mask.ip6);
+ t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6,
+ info->dst_mask.ip6);
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = nf_ct_protonum(ct);
+ if (t->proto != IPPROTO_ICMP) {
+ t->uports.b16.src = otuple->src.u.all;
+ t->uports.b16.dst = rtuple->src.u.all;
+ hmark_swap_ports(&t->uports, info);
+ }
+
+ return 0;
+#else
+ return -1;
+#endif
+}
+
+/* This hash function is endian independent, to ensure consistent hashing if
+ * the cluster is composed of big and little endian systems. */
+static inline u32
+hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+ u32 hash;
+ u32 src = ntohl(t->src);
+ u32 dst = ntohl(t->dst);
+
+ if (dst < src)
+ swap(src, dst);
+
+ hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd);
+ hash = hash ^ (t->proto & info->proto_mask);
+
+ return reciprocal_scale(hash, info->hmodulus) + info->hoffset;
+}
+
+static void
+hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff,
+ struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+ int protoff;
+
+ protoff = proto_ports_offset(t->proto);
+ if (protoff < 0)
+ return;
+
+ nhoff += protoff;
+ if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0)
+ return;
+
+ hmark_swap_ports(&t->uports, info);
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int get_inner6_hdr(const struct sk_buff *skb, int *offset)
+{
+ struct icmp6hdr *icmp6h, _ih6;
+
+ icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6);
+ if (icmp6h == NULL)
+ return 0;
+
+ if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) {
+ *offset += sizeof(struct icmp6hdr);
+ return 1;
+ }
+ return 0;
+}
+
+static int
+hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+ struct ipv6hdr *ip6, _ip6;
+ int flag = IP6_FH_F_AUTH;
+ unsigned int nhoff = 0;
+ u16 fragoff = 0;
+ int nexthdr;
+
+ ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb));
+ nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+ if (nexthdr < 0)
+ return 0;
+ /* No need to check for icmp errors on fragments */
+ if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6))
+ goto noicmp;
+ /* Use inner header in case of ICMP errors */
+ if (get_inner6_hdr(skb, &nhoff)) {
+ ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6);
+ if (ip6 == NULL)
+ return -1;
+ /* If AH present, use SPI like in ESP. */
+ flag = IP6_FH_F_AUTH;
+ nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+ if (nexthdr < 0)
+ return -1;
+ }
+noicmp:
+ t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6);
+ t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6);
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = nexthdr;
+ if (t->proto == IPPROTO_ICMPV6)
+ return 0;
+
+ if (flag & IP6_FH_F_FRAG)
+ return 0;
+
+ hmark_set_tuple_ports(skb, nhoff, t, info);
+ return 0;
+}
+
+static unsigned int
+hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+ struct hmark_tuple t;
+
+ memset(&t, 0, sizeof(struct hmark_tuple));
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) {
+ if (hmark_ct_set_htuple(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ } else {
+ if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ }
+
+ skb->mark = hmark_hash(&t, info);
+ return XT_CONTINUE;
+}
+#endif
+
+static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff)
+{
+ const struct icmphdr *icmph;
+ struct icmphdr _ih;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih);
+ if (icmph == NULL || icmph->type > NR_ICMP_TYPES)
+ return 0;
+
+ /* Error message? */
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB &&
+ icmph->type != ICMP_REDIRECT)
+ return 0;
+
+ *nhoff += iphsz + sizeof(_ih);
+ return 1;
+}
+
+static int
+hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
+ const struct xt_hmark_info *info)
+{
+ struct iphdr *ip, _ip;
+ int nhoff = skb_network_offset(skb);
+
+ ip = (struct iphdr *) (skb->data + nhoff);
+ if (ip->protocol == IPPROTO_ICMP) {
+ /* Use inner header in case of ICMP errors */
+ if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) {
+ ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip);
+ if (ip == NULL)
+ return -1;
+ }
+ }
+
+ t->src = ip->saddr & info->src_mask.ip;
+ t->dst = ip->daddr & info->dst_mask.ip;
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
+ return 0;
+
+ t->proto = ip->protocol;
+
+ /* ICMP has no ports, skip */
+ if (t->proto == IPPROTO_ICMP)
+ return 0;
+
+ /* follow-up fragments don't contain ports, skip all fragments */
+ if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ return 0;
+
+ hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
+
+ return 0;
+}
+
+static unsigned int
+hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+ struct hmark_tuple t;
+
+ memset(&t, 0, sizeof(struct hmark_tuple));
+
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) {
+ if (hmark_ct_set_htuple(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ } else {
+ if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0)
+ return XT_CONTINUE;
+ }
+
+ skb->mark = hmark_hash(&t, info);
+ return XT_CONTINUE;
+}
+
+static int hmark_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_hmark_info *info = par->targinfo;
+
+ if (!info->hmodulus) {
+ pr_info("xt_HMARK: hash modulus can't be zero\n");
+ return -EINVAL;
+ }
+ if (info->proto_mask &&
+ (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) {
+ pr_info("xt_HMARK: proto mask must be zero with L3 mode\n");
+ return -EINVAL;
+ }
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) &&
+ (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) |
+ XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) {
+ pr_info("xt_HMARK: spi-mask and port-mask can't be combined\n");
+ return -EINVAL;
+ }
+ if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) &&
+ (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) |
+ XT_HMARK_FLAG(XT_HMARK_DPORT)))) {
+ pr_info("xt_HMARK: spi-set and port-set can't be combined\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target hmark_tg_reg[] __read_mostly = {
+ {
+ .name = "HMARK",
+ .family = NFPROTO_IPV4,
+ .target = hmark_tg_v4,
+ .targetsize = sizeof(struct xt_hmark_info),
+ .checkentry = hmark_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "HMARK",
+ .family = NFPROTO_IPV6,
+ .target = hmark_tg_v6,
+ .targetsize = sizeof(struct xt_hmark_info),
+ .checkentry = hmark_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init hmark_tg_init(void)
+{
+ return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+static void __exit hmark_tg_exit(void)
+{
+ xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+module_init(hmark_tg_init);
+module_exit(hmark_tg_exit);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
new file mode 100644
index 000000000..f407ebc13
--- /dev/null
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -0,0 +1,315 @@
+/*
+ * linux/net/netfilter/xt_IDLETIMER.c
+ *
+ * Netfilter module to trigger a timer when packet matches.
+ * After timer expires a kevent will be sent.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and reworked for upstream inclusion
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IDLETIMER.h>
+#include <linux/kdev_t.h>
+#include <linux/kobject.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+
+struct idletimer_tg_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+};
+
+struct idletimer_tg {
+ struct list_head entry;
+ struct timer_list timer;
+ struct work_struct work;
+
+ struct kobject *kobj;
+ struct idletimer_tg_attr attr;
+
+ unsigned int refcnt;
+};
+
+static LIST_HEAD(idletimer_tg_list);
+static DEFINE_MUTEX(list_mutex);
+
+static struct kobject *idletimer_tg_kobj;
+
+static
+struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
+{
+ struct idletimer_tg *entry;
+
+ BUG_ON(!label);
+
+ list_for_each_entry(entry, &idletimer_tg_list, entry) {
+ if (!strcmp(label, entry->attr.attr.name))
+ return entry;
+ }
+
+ return NULL;
+}
+
+static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct idletimer_tg *timer;
+ unsigned long expires = 0;
+
+ mutex_lock(&list_mutex);
+
+ timer = __idletimer_tg_find_by_label(attr->name);
+ if (timer)
+ expires = timer->timer.expires;
+
+ mutex_unlock(&list_mutex);
+
+ if (time_after(expires, jiffies))
+ return sprintf(buf, "%u\n",
+ jiffies_to_msecs(expires - jiffies) / 1000);
+
+ return sprintf(buf, "0\n");
+}
+
+static void idletimer_tg_work(struct work_struct *work)
+{
+ struct idletimer_tg *timer = container_of(work, struct idletimer_tg,
+ work);
+
+ sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+}
+
+static void idletimer_tg_expired(unsigned long data)
+{
+ struct idletimer_tg *timer = (struct idletimer_tg *) data;
+
+ pr_debug("timer %s expired\n", timer->attr.attr.name);
+
+ schedule_work(&timer->work);
+}
+
+static int idletimer_tg_create(struct idletimer_tg_info *info)
+{
+ int ret;
+
+ info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ if (!info->timer) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+ if (!info->timer->attr.attr.name) {
+ ret = -ENOMEM;
+ goto out_free_timer;
+ }
+ info->timer->attr.attr.mode = S_IRUGO;
+ info->timer->attr.show = idletimer_tg_show;
+
+ ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs");
+ goto out_free_attr;
+ }
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+
+ setup_timer(&info->timer->timer, idletimer_tg_expired,
+ (unsigned long) info->timer);
+ info->timer->refcnt = 1;
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+
+ BUG_ON(!info->timer);
+
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ if (info->timeout == 0) {
+ pr_debug("timeout value is zero\n");
+ return -EINVAL;
+ }
+
+ if (info->label[0] == '\0' ||
+ strnlen(info->label,
+ MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+ pr_debug("label is empty or not nul-terminated\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ info->timer->refcnt++;
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt == 0) {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ del_timer_sync(&info->timer->timer);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+ } else {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ }
+
+ mutex_unlock(&list_mutex);
+}
+
+static struct xt_target idletimer_tg __read_mostly = {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+static int __init idletimer_tg_init(void)
+{
+ int err;
+
+ idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ err = PTR_ERR(idletimer_tg_class);
+ if (IS_ERR(idletimer_tg_class)) {
+ pr_debug("couldn't register device class\n");
+ goto out;
+ }
+
+ idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+ MKDEV(0, 0), NULL, "timers");
+ err = PTR_ERR(idletimer_tg_device);
+ if (IS_ERR(idletimer_tg_device)) {
+ pr_debug("couldn't register system device\n");
+ goto out_class;
+ }
+
+ idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
+ err = xt_register_target(&idletimer_tg);
+ if (err < 0) {
+ pr_debug("couldn't register xt target\n");
+ goto out_dev;
+ }
+
+ return 0;
+out_dev:
+ device_destroy(idletimer_tg_class, MKDEV(0, 0));
+out_class:
+ class_destroy(idletimer_tg_class);
+out:
+ return err;
+}
+
+static void __exit idletimer_tg_exit(void)
+{
+ xt_unregister_target(&idletimer_tg);
+
+ device_destroy(idletimer_tg_class, MKDEV(0, 0));
+ class_destroy(idletimer_tg_class);
+}
+
+module_init(idletimer_tg_init);
+module_exit(idletimer_tg_exit);
+
+MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
+MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
+MODULE_DESCRIPTION("Xtables: idle time monitor");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ipt_IDLETIMER");
+MODULE_ALIAS("ip6t_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
new file mode 100644
index 000000000..3ba31c194
--- /dev/null
+++ b/net/netfilter/xt_LED.c
@@ -0,0 +1,217 @@
+/*
+ * xt_LED.c - netfilter target to make LEDs blink upon packet matches
+ *
+ * Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/slab.h>
+#include <linux/leds.h>
+#include <linux/mutex.h>
+
+#include <linux/netfilter/xt_LED.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
+MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
+MODULE_ALIAS("ipt_LED");
+MODULE_ALIAS("ip6t_LED");
+
+static LIST_HEAD(xt_led_triggers);
+static DEFINE_MUTEX(xt_led_mutex);
+
+/*
+ * This is declared in here (the kernel module) only, to avoid having these
+ * dependencies in userspace code. This is what xt_led_info.internal_data
+ * points to.
+ */
+struct xt_led_info_internal {
+ struct list_head list;
+ int refcnt;
+ char *trigger_id;
+ struct led_trigger netfilter_led_trigger;
+ struct timer_list timer;
+};
+
+#define XT_LED_BLINK_DELAY 50 /* ms */
+
+static unsigned int
+led_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_led_info *ledinfo = par->targinfo;
+ struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
+ unsigned long led_delay = XT_LED_BLINK_DELAY;
+
+ /*
+ * If "always blink" is enabled, and there's still some time until the
+ * LED will switch off, briefly switch it off now.
+ */
+ if ((ledinfo->delay > 0) && ledinfo->always_blink &&
+ timer_pending(&ledinternal->timer))
+ led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger,
+ &led_delay, &led_delay, 1);
+ else
+ led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL);
+
+ /* If there's a positive delay, start/update the timer */
+ if (ledinfo->delay > 0) {
+ mod_timer(&ledinternal->timer,
+ jiffies + msecs_to_jiffies(ledinfo->delay));
+
+ /* Otherwise if there was no delay given, blink as fast as possible */
+ } else if (ledinfo->delay == 0) {
+ led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
+ }
+
+ /* else the delay is negative, which means switch on and stay on */
+
+ return XT_CONTINUE;
+}
+
+static void led_timeout_callback(unsigned long data)
+{
+ struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data;
+
+ led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
+}
+
+static struct xt_led_info_internal *led_trigger_lookup(const char *name)
+{
+ struct xt_led_info_internal *ledinternal;
+
+ list_for_each_entry(ledinternal, &xt_led_triggers, list) {
+ if (!strcmp(name, ledinternal->netfilter_led_trigger.name)) {
+ return ledinternal;
+ }
+ }
+ return NULL;
+}
+
+static int led_tg_check(const struct xt_tgchk_param *par)
+{
+ struct xt_led_info *ledinfo = par->targinfo;
+ struct xt_led_info_internal *ledinternal;
+ int err;
+
+ if (ledinfo->id[0] == '\0') {
+ pr_info("No 'id' parameter given.\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&xt_led_mutex);
+
+ ledinternal = led_trigger_lookup(ledinfo->id);
+ if (ledinternal) {
+ ledinternal->refcnt++;
+ goto out;
+ }
+
+ err = -ENOMEM;
+ ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL);
+ if (!ledinternal)
+ goto exit_mutex_only;
+
+ ledinternal->trigger_id = kstrdup(ledinfo->id, GFP_KERNEL);
+ if (!ledinternal->trigger_id)
+ goto exit_internal_alloc;
+
+ ledinternal->refcnt = 1;
+ ledinternal->netfilter_led_trigger.name = ledinternal->trigger_id;
+
+ err = led_trigger_register(&ledinternal->netfilter_led_trigger);
+ if (err) {
+ pr_err("Trigger name is already in use.\n");
+ goto exit_alloc;
+ }
+
+ /* See if we need to set up a timer */
+ if (ledinfo->delay > 0)
+ setup_timer(&ledinternal->timer, led_timeout_callback,
+ (unsigned long)ledinternal);
+
+ list_add_tail(&ledinternal->list, &xt_led_triggers);
+
+out:
+ mutex_unlock(&xt_led_mutex);
+
+ ledinfo->internal_data = ledinternal;
+
+ return 0;
+
+exit_alloc:
+ kfree(ledinternal->trigger_id);
+
+exit_internal_alloc:
+ kfree(ledinternal);
+
+exit_mutex_only:
+ mutex_unlock(&xt_led_mutex);
+
+ return err;
+}
+
+static void led_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_led_info *ledinfo = par->targinfo;
+ struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
+
+ mutex_lock(&xt_led_mutex);
+
+ if (--ledinternal->refcnt) {
+ mutex_unlock(&xt_led_mutex);
+ return;
+ }
+
+ list_del(&ledinternal->list);
+
+ if (ledinfo->delay > 0)
+ del_timer_sync(&ledinternal->timer);
+
+ led_trigger_unregister(&ledinternal->netfilter_led_trigger);
+
+ mutex_unlock(&xt_led_mutex);
+
+ kfree(ledinternal->trigger_id);
+ kfree(ledinternal);
+}
+
+static struct xt_target led_tg_reg __read_mostly = {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init led_tg_init(void)
+{
+ return xt_register_target(&led_tg_reg);
+}
+
+static void __exit led_tg_exit(void)
+{
+ xt_unregister_target(&led_tg_reg);
+}
+
+module_init(led_tg_init);
+module_exit(led_tg_exit);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
new file mode 100644
index 000000000..c13b79440
--- /dev/null
+++ b/net/netfilter/xt_LOG.c
@@ -0,0 +1,113 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int
+log_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_log_info *loginfo = par->targinfo;
+ struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
+
+ nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out,
+ &li, "%s", loginfo->prefix);
+ return XT_CONTINUE;
+}
+
+static int log_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_log_info *loginfo = par->targinfo;
+
+ if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
+ return -EINVAL;
+
+ if (loginfo->level >= 8) {
+ pr_debug("level %u >= 8\n", loginfo->level);
+ return -EINVAL;
+ }
+
+ if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+ pr_debug("prefix is not null-terminated\n");
+ return -EINVAL;
+ }
+
+ return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+}
+
+static void log_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_logger_put(par->family, NF_LOG_TYPE_LOG);
+}
+
+static struct xt_target log_tg_regs[] __read_mostly = {
+ {
+ .name = "LOG",
+ .family = NFPROTO_IPV4,
+ .target = log_tg,
+ .targetsize = sizeof(struct xt_log_info),
+ .checkentry = log_tg_check,
+ .destroy = log_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "LOG",
+ .family = NFPROTO_IPV6,
+ .target = log_tg,
+ .targetsize = sizeof(struct xt_log_info),
+ .checkentry = log_tg_check,
+ .destroy = log_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init log_tg_init(void)
+{
+ return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
+}
+
+static void __exit log_tg_exit(void)
+{
+ xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
+}
+
+module_init(log_tg_init);
+module_exit(log_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
+MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging");
+MODULE_ALIAS("ipt_LOG");
+MODULE_ALIAS("ip6t_LOG");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
new file mode 100644
index 000000000..b253e07cb
--- /dev/null
+++ b/net/netfilter/xt_NETMAP.c
@@ -0,0 +1,165 @@
+/*
+ * (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+
+static unsigned int
+netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ struct nf_nat_range newrange;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ union nf_inet_addr new_addr, netmask;
+ unsigned int i;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++)
+ netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
+ range->max_addr.ip6[i]);
+
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT)
+ new_addr.in6 = ipv6_hdr(skb)->daddr;
+ else
+ new_addr.in6 = ipv6_hdr(skb)->saddr;
+
+ for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) {
+ new_addr.ip6[i] &= ~netmask.ip6[i];
+ new_addr.ip6[i] |= range->min_addr.ip6[i] &
+ netmask.ip6[i];
+ }
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = new_addr;
+ newrange.max_addr = new_addr;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+}
+
+static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int
+netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ __be32 new_ip, netmask;
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range newrange;
+
+ NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_POST_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT ||
+ par->hooknum == NF_INET_LOCAL_IN);
+ ct = nf_ct_get(skb, &ctinfo);
+
+ netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT)
+ new_ip = ip_hdr(skb)->daddr & ~netmask;
+ else
+ new_ip = ip_hdr(skb)->saddr & ~netmask;
+ new_ip |= mr->range[0].min_ip & netmask;
+
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = new_ip;
+ newrange.max_addr.ip = new_ip;
+ newrange.min_proto = mr->range[0].min;
+ newrange.max_proto = mr->range[0].max;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+}
+
+static int netmap_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_target netmap_tg_reg[] __read_mostly = {
+ {
+ .name = "NETMAP",
+ .family = NFPROTO_IPV6,
+ .revision = 0,
+ .target = netmap_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .checkentry = netmap_tg6_checkentry,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NETMAP",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .target = netmap_tg4,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .checkentry = netmap_tg4_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init netmap_tg_init(void)
+{
+ return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg));
+}
+
+static void netmap_tg_exit(void)
+{
+ xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg));
+}
+
+module_init(netmap_tg_init);
+module_exit(netmap_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ip6t_NETMAP");
+MODULE_ALIAS("ipt_NETMAP");
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
new file mode 100644
index 000000000..fb7497c92
--- /dev/null
+++ b/net/netfilter/xt_NFLOG.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_NFLOG.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nfnetlink_log.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NFLOG");
+MODULE_ALIAS("ip6t_NFLOG");
+
+static unsigned int
+nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_nflog_info *info = par->targinfo;
+ struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
+
+ li.type = NF_LOG_TYPE_ULOG;
+ li.u.ulog.copy_len = info->len;
+ li.u.ulog.group = info->group;
+ li.u.ulog.qthreshold = info->threshold;
+
+ nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
+ par->out, &li, info->prefix);
+ return XT_CONTINUE;
+}
+
+static int nflog_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_nflog_info *info = par->targinfo;
+
+ if (info->flags & ~XT_NFLOG_MASK)
+ return -EINVAL;
+ if (info->prefix[sizeof(info->prefix) - 1] != '\0')
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target nflog_tg_reg __read_mostly = {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nflog_tg_check,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+};
+
+static int __init nflog_tg_init(void)
+{
+ return xt_register_target(&nflog_tg_reg);
+}
+
+static void __exit nflog_tg_exit(void)
+{
+ xt_unregister_target(&nflog_tg_reg);
+}
+
+module_init(nflog_tg_init);
+module_exit(nflog_tg_exit);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
new file mode 100644
index 000000000..8f1779ff7
--- /dev/null
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -0,0 +1,160 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_NFQUEUE.h>
+
+#include <net/netfilter/nf_queue.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet forwarding to netlink");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NFQUEUE");
+MODULE_ALIAS("ip6t_NFQUEUE");
+MODULE_ALIAS("arpt_NFQUEUE");
+
+static u32 jhash_initval __read_mostly;
+
+static unsigned int
+nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info *tinfo = par->targinfo;
+
+ return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static unsigned int
+nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v1 *info = par->targinfo;
+ u32 queue = info->queuenum;
+
+ if (info->queues_total > 1) {
+ queue = nfqueue_hash(skb, queue, info->queues_total,
+ par->family, jhash_initval);
+ }
+ return NF_QUEUE_NR(queue);
+}
+
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v2 *info = par->targinfo;
+ unsigned int ret = nfqueue_tg_v1(skb, par);
+
+ if (info->bypass)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+ return ret;
+}
+
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
+ u32 maxid;
+
+ init_hashrandom(&jhash_initval);
+
+ if (info->queues_total == 0) {
+ pr_err("NFQUEUE: number of total queues is 0\n");
+ return -EINVAL;
+ }
+ maxid = info->queues_total - 1 + info->queuenum;
+ if (maxid > 0xffff) {
+ pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n",
+ info->queues_total, maxid);
+ return -ERANGE;
+ }
+ if (par->target->revision == 2 && info->flags > 1)
+ return -EINVAL;
+ if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK)
+ return -EINVAL;
+
+ return 0;
+}
+
+static unsigned int
+nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
+ u32 queue = info->queuenum;
+ int ret;
+
+ if (info->queues_total > 1) {
+ if (info->flags & NFQ_FLAG_CPU_FANOUT) {
+ int cpu = smp_processor_id();
+
+ queue = info->queuenum + cpu % info->queues_total;
+ } else {
+ queue = nfqueue_hash(skb, queue, info->queues_total,
+ par->family, jhash_initval);
+ }
+ }
+
+ ret = NF_QUEUE_NR(queue);
+ if (info->flags & NFQ_FLAG_BYPASS)
+ ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
+
+ return ret;
+}
+
+static struct xt_target nfqueue_tg_reg[] __read_mostly = {
+ {
+ .name = "NFQUEUE",
+ .family = NFPROTO_UNSPEC,
+ .target = nfqueue_tg,
+ .targetsize = sizeof(struct xt_NFQ_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NFQUEUE",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v1,
+ .targetsize = sizeof(struct xt_NFQ_info_v1),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NFQUEUE",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v2,
+ .targetsize = sizeof(struct xt_NFQ_info_v2),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "NFQUEUE",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v3,
+ .targetsize = sizeof(struct xt_NFQ_info_v3),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init nfqueue_tg_init(void)
+{
+ return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg));
+}
+
+static void __exit nfqueue_tg_exit(void)
+{
+ xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg));
+}
+
+module_init(nfqueue_tg_init);
+module_exit(nfqueue_tg_exit);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
new file mode 100644
index 000000000..604df6fae
--- /dev/null
+++ b/net/netfilter/xt_RATEEST.c
@@ -0,0 +1,194 @@
+/*
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/gen_stats.h>
+#include <linux/jhash.h>
+#include <linux/rtnetlink.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/gen_stats.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_RATEEST.h>
+#include <net/netfilter/xt_rateest.h>
+
+static DEFINE_MUTEX(xt_rateest_mutex);
+
+#define RATEEST_HSIZE 16
+static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
+static unsigned int jhash_rnd __read_mostly;
+static bool rnd_inited __read_mostly;
+
+static unsigned int xt_rateest_hash(const char *name)
+{
+ return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) &
+ (RATEEST_HSIZE - 1);
+}
+
+static void xt_rateest_hash_insert(struct xt_rateest *est)
+{
+ unsigned int h;
+
+ h = xt_rateest_hash(est->name);
+ hlist_add_head(&est->list, &rateest_hash[h]);
+}
+
+struct xt_rateest *xt_rateest_lookup(const char *name)
+{
+ struct xt_rateest *est;
+ unsigned int h;
+
+ h = xt_rateest_hash(name);
+ mutex_lock(&xt_rateest_mutex);
+ hlist_for_each_entry(est, &rateest_hash[h], list) {
+ if (strcmp(est->name, name) == 0) {
+ est->refcnt++;
+ mutex_unlock(&xt_rateest_mutex);
+ return est;
+ }
+ }
+ mutex_unlock(&xt_rateest_mutex);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xt_rateest_lookup);
+
+void xt_rateest_put(struct xt_rateest *est)
+{
+ mutex_lock(&xt_rateest_mutex);
+ if (--est->refcnt == 0) {
+ hlist_del(&est->list);
+ gen_kill_estimator(&est->bstats, &est->rstats);
+ /*
+ * gen_estimator est_timer() might access est->lock or bstats,
+ * wait a RCU grace period before freeing 'est'
+ */
+ kfree_rcu(est, rcu);
+ }
+ mutex_unlock(&xt_rateest_mutex);
+}
+EXPORT_SYMBOL_GPL(xt_rateest_put);
+
+static unsigned int
+xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_rateest_target_info *info = par->targinfo;
+ struct gnet_stats_basic_packed *stats = &info->est->bstats;
+
+ spin_lock_bh(&info->est->lock);
+ stats->bytes += skb->len;
+ stats->packets++;
+ spin_unlock_bh(&info->est->lock);
+
+ return XT_CONTINUE;
+}
+
+static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct xt_rateest_target_info *info = par->targinfo;
+ struct xt_rateest *est;
+ struct {
+ struct nlattr opt;
+ struct gnet_estimator est;
+ } cfg;
+ int ret;
+
+ if (unlikely(!rnd_inited)) {
+ get_random_bytes(&jhash_rnd, sizeof(jhash_rnd));
+ rnd_inited = true;
+ }
+
+ est = xt_rateest_lookup(info->name);
+ if (est) {
+ /*
+ * If estimator parameters are specified, they must match the
+ * existing estimator.
+ */
+ if ((!info->interval && !info->ewma_log) ||
+ (info->interval != est->params.interval ||
+ info->ewma_log != est->params.ewma_log)) {
+ xt_rateest_put(est);
+ return -EINVAL;
+ }
+ info->est = est;
+ return 0;
+ }
+
+ ret = -ENOMEM;
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
+ if (!est)
+ goto err1;
+
+ strlcpy(est->name, info->name, sizeof(est->name));
+ spin_lock_init(&est->lock);
+ est->refcnt = 1;
+ est->params.interval = info->interval;
+ est->params.ewma_log = info->ewma_log;
+
+ cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est));
+ cfg.opt.nla_type = TCA_STATS_RATE_EST;
+ cfg.est.interval = info->interval;
+ cfg.est.ewma_log = info->ewma_log;
+
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+ &est->lock, &cfg.opt);
+ if (ret < 0)
+ goto err2;
+
+ info->est = est;
+ xt_rateest_hash_insert(est);
+ return 0;
+
+err2:
+ kfree(est);
+err1:
+ return ret;
+}
+
+static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ struct xt_rateest_target_info *info = par->targinfo;
+
+ xt_rateest_put(info->est);
+}
+
+static struct xt_target xt_rateest_tg_reg __read_mostly = {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .me = THIS_MODULE,
+};
+
+static int __init xt_rateest_tg_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(rateest_hash); i++)
+ INIT_HLIST_HEAD(&rateest_hash[i]);
+
+ return xt_register_target(&xt_rateest_tg_reg);
+}
+
+static void __exit xt_rateest_tg_fini(void)
+{
+ xt_unregister_target(&xt_rateest_tg_reg);
+}
+
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: packet rate estimator");
+MODULE_ALIAS("ipt_RATEEST");
+MODULE_ALIAS("ip6t_RATEEST");
+module_init(xt_rateest_tg_init);
+module_exit(xt_rateest_tg_fini);
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
new file mode 100644
index 000000000..03f0b370e
--- /dev/null
+++ b/net/netfilter/xt_REDIRECT.c
@@ -0,0 +1,113 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/addrconf.h>
+#include <net/checksum.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
+
+static unsigned int
+redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum);
+}
+
+static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+ return 0;
+}
+
+/* FIXME: Take multiple ranges --RR */
+static int redirect_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static unsigned int
+redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum);
+}
+
+static struct xt_target redirect_tg_reg[] __read_mostly = {
+ {
+ .name = "REDIRECT",
+ .family = NFPROTO_IPV6,
+ .revision = 0,
+ .table = "nat",
+ .checkentry = redirect_tg6_checkentry,
+ .target = redirect_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "REDIRECT",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .table = "nat",
+ .target = redirect_tg4,
+ .checkentry = redirect_tg4_check,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init redirect_tg_init(void)
+{
+ return xt_register_targets(redirect_tg_reg,
+ ARRAY_SIZE(redirect_tg_reg));
+}
+
+static void __exit redirect_tg_exit(void)
+{
+ xt_unregister_targets(redirect_tg_reg, ARRAY_SIZE(redirect_tg_reg));
+}
+
+module_init(redirect_tg_init);
+module_exit(redirect_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
+MODULE_ALIAS("ip6t_REDIRECT");
+MODULE_ALIAS("ipt_REDIRECT");
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 000000000..9faf5e050
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,147 @@
+/*
+ * Module for modifying the secmark field of the skb, for use by
+ * security subsystems.
+ *
+ * Based on the nfmark match by:
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SECMARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("Xtables: packet security mark modification");
+MODULE_ALIAS("ipt_SECMARK");
+MODULE_ALIAS("ip6t_SECMARK");
+
+#define PFX "SECMARK: "
+
+static u8 mode;
+
+static unsigned int
+secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ u32 secmark = 0;
+ const struct xt_secmark_target_info *info = par->targinfo;
+
+ BUG_ON(info->mode != mode);
+
+ switch (mode) {
+ case SECMARK_MODE_SEL:
+ secmark = info->secid;
+ break;
+ default:
+ BUG();
+ }
+
+ skb->secmark = secmark;
+ return XT_CONTINUE;
+}
+
+static int checkentry_lsm(struct xt_secmark_target_info *info)
+{
+ int err;
+
+ info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+ info->secid = 0;
+
+ err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+ &info->secid);
+ if (err) {
+ if (err == -EINVAL)
+ pr_info("invalid security context \'%s\'\n", info->secctx);
+ return err;
+ }
+
+ if (!info->secid) {
+ pr_info("unable to map security context \'%s\'\n", info->secctx);
+ return -ENOENT;
+ }
+
+ err = security_secmark_relabel_packet(info->secid);
+ if (err) {
+ pr_info("unable to obtain relabeling permission\n");
+ return err;
+ }
+
+ security_secmark_refcount_inc();
+ return 0;
+}
+
+static int secmark_tg_check(const struct xt_tgchk_param *par)
+{
+ struct xt_secmark_target_info *info = par->targinfo;
+ int err;
+
+ if (strcmp(par->table, "mangle") != 0 &&
+ strcmp(par->table, "security") != 0) {
+ pr_info("target only valid in the \'mangle\' "
+ "or \'security\' tables, not \'%s\'.\n", par->table);
+ return -EINVAL;
+ }
+
+ if (mode && mode != info->mode) {
+ pr_info("mode already set to %hu cannot mix with "
+ "rules for mode %hu\n", mode, info->mode);
+ return -EINVAL;
+ }
+
+ switch (info->mode) {
+ case SECMARK_MODE_SEL:
+ break;
+ default:
+ pr_info("invalid mode: %hu\n", info->mode);
+ return -EINVAL;
+ }
+
+ err = checkentry_lsm(info);
+ if (err)
+ return err;
+
+ if (!mode)
+ mode = info->mode;
+ return 0;
+}
+
+static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ switch (mode) {
+ case SECMARK_MODE_SEL:
+ security_secmark_refcount_dec();
+ }
+}
+
+static struct xt_target secmark_tg_reg __read_mostly = {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = secmark_tg_check,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+};
+
+static int __init secmark_tg_init(void)
+{
+ return xt_register_target(&secmark_tg_reg);
+}
+
+static void __exit secmark_tg_exit(void)
+{
+ xt_unregister_target(&secmark_tg_reg);
+}
+
+module_init(secmark_tg_init);
+module_exit(secmark_tg_exit);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
new file mode 100644
index 000000000..e762de5ee
--- /dev/null
+++ b/net/netfilter/xt_TCPMSS.c
@@ -0,0 +1,344 @@
+/*
+ * This is a module which is used for setting the MSS option in TCP packets.
+ *
+ * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Copyright (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/gfp.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/ipv6.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_TCPMSS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment");
+MODULE_ALIAS("ipt_TCPMSS");
+MODULE_ALIAS("ip6t_TCPMSS");
+
+static inline unsigned int
+optlen(const u_int8_t *opt, unsigned int offset)
+{
+ /* Beware zero-length options: make finite progress */
+ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0)
+ return 1;
+ else
+ return opt[offset+1];
+}
+
+static u_int32_t tcpmss_reverse_mtu(struct net *net,
+ const struct sk_buff *skb,
+ unsigned int family)
+{
+ struct flowi fl;
+ const struct nf_afinfo *ai;
+ struct rtable *rt = NULL;
+ u_int32_t mtu = ~0U;
+
+ if (family == PF_INET) {
+ struct flowi4 *fl4 = &fl.u.ip4;
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->daddr = ip_hdr(skb)->saddr;
+ } else {
+ struct flowi6 *fl6 = &fl.u.ip6;
+
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->daddr = ipv6_hdr(skb)->saddr;
+ }
+ rcu_read_lock();
+ ai = nf_get_afinfo(family);
+ if (ai != NULL)
+ ai->route(net, (struct dst_entry **)&rt, &fl, false);
+ rcu_read_unlock();
+
+ if (rt != NULL) {
+ mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
+ }
+ return mtu;
+}
+
+static int
+tcpmss_mangle_packet(struct sk_buff *skb,
+ const struct xt_action_param *par,
+ unsigned int family,
+ unsigned int tcphoff,
+ unsigned int minlen)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+ struct tcphdr *tcph;
+ int len, tcp_hdrlen;
+ unsigned int i;
+ __be16 oldval;
+ u16 newmss;
+ u8 *opt;
+
+ /* This is a fragment, no TCP header is available */
+ if (par->fragoff != 0)
+ return 0;
+
+ if (!skb_make_writable(skb, skb->len))
+ return -1;
+
+ len = skb->len - tcphoff;
+ if (len < (int)sizeof(struct tcphdr))
+ return -1;
+
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ tcp_hdrlen = tcph->doff * 4;
+
+ if (len < tcp_hdrlen)
+ return -1;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
+
+ if (dst_mtu(skb_dst(skb)) <= minlen) {
+ net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
+ dst_mtu(skb_dst(skb)));
+ return -1;
+ }
+ if (in_mtu <= minlen) {
+ net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
+ in_mtu);
+ return -1;
+ }
+ newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
+ } else
+ newmss = info->mss;
+
+ opt = (u_int8_t *)tcph;
+ for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) {
+ if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) {
+ u_int16_t oldmss;
+
+ oldmss = (opt[i+2] << 8) | opt[i+3];
+
+ /* Never increase MSS, even when setting it, as
+ * doing so results in problems for hosts that rely
+ * on MSS being set correctly.
+ */
+ if (oldmss <= newmss)
+ return 0;
+
+ opt[i+2] = (newmss & 0xff00) >> 8;
+ opt[i+3] = newmss & 0x00ff;
+
+ inet_proto_csum_replace2(&tcph->check, skb,
+ htons(oldmss), htons(newmss),
+ 0);
+ return 0;
+ }
+ }
+
+ /* There is data after the header so the option can't be added
+ * without moving it, and doing so may make the SYN packet
+ * itself too large. Accept the packet unmodified instead.
+ */
+ if (len > tcp_hdrlen)
+ return 0;
+
+ /*
+ * MSS Option not found ?! add it..
+ */
+ if (skb_tailroom(skb) < TCPOLEN_MSS) {
+ if (pskb_expand_head(skb, 0,
+ TCPOLEN_MSS - skb_tailroom(skb),
+ GFP_ATOMIC))
+ return -1;
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ }
+
+ skb_put(skb, TCPOLEN_MSS);
+
+ /*
+ * IPv4: RFC 1122 states "If an MSS option is not received at
+ * connection setup, TCP MUST assume a default send MSS of 536".
+ * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum
+ * length IPv6 header of 60, ergo the default MSS value is 1220
+ * Since no MSS was provided, we must use the default values
+ */
+ if (par->family == NFPROTO_IPV4)
+ newmss = min(newmss, (u16)536);
+ else
+ newmss = min(newmss, (u16)1220);
+
+ opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
+ memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr));
+
+ inet_proto_csum_replace2(&tcph->check, skb,
+ htons(len), htons(len + TCPOLEN_MSS), 1);
+ opt[0] = TCPOPT_MSS;
+ opt[1] = TCPOLEN_MSS;
+ opt[2] = (newmss & 0xff00) >> 8;
+ opt[3] = newmss & 0x00ff;
+
+ inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0);
+
+ oldval = ((__be16 *)tcph)[6];
+ tcph->doff += TCPOLEN_MSS/4;
+ inet_proto_csum_replace2(&tcph->check, skb,
+ oldval, ((__be16 *)tcph)[6], 0);
+ return TCPOLEN_MSS;
+}
+
+static unsigned int
+tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ __be16 newlen;
+ int ret;
+
+ ret = tcpmss_mangle_packet(skb, par,
+ PF_INET,
+ iph->ihl * 4,
+ sizeof(*iph) + sizeof(struct tcphdr));
+ if (ret < 0)
+ return NF_DROP;
+ if (ret > 0) {
+ iph = ip_hdr(skb);
+ newlen = htons(ntohs(iph->tot_len) + ret);
+ csum_replace2(&iph->check, iph->tot_len, newlen);
+ iph->tot_len = newlen;
+ }
+ return XT_CONTINUE;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static unsigned int
+tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ u8 nexthdr;
+ __be16 frag_off;
+ int tcphoff;
+ int ret;
+
+ nexthdr = ipv6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
+ if (tcphoff < 0)
+ return NF_DROP;
+ ret = tcpmss_mangle_packet(skb, par,
+ PF_INET6,
+ tcphoff,
+ sizeof(*ipv6h) + sizeof(struct tcphdr));
+ if (ret < 0)
+ return NF_DROP;
+ if (ret > 0) {
+ ipv6h = ipv6_hdr(skb);
+ ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret);
+ }
+ return XT_CONTINUE;
+}
+#endif
+
+/* Must specify -p tcp --syn */
+static inline bool find_syn_match(const struct xt_entry_match *m)
+{
+ const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data;
+
+ if (strcmp(m->u.kernel.match->name, "tcp") == 0 &&
+ tcpinfo->flg_cmp & TCPHDR_SYN &&
+ !(tcpinfo->invflags & XT_TCP_INV_FLAGS))
+ return true;
+
+ return false;
+}
+
+static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+ const struct ipt_entry *e = par->entryinfo;
+ const struct xt_entry_match *ematch;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
+ (par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING))) != 0) {
+ pr_info("path-MTU clamping only supported in "
+ "FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return -EINVAL;
+ }
+ xt_ematch_foreach(ematch, e)
+ if (find_syn_match(ematch))
+ return 0;
+ pr_info("Only works on TCP SYN packets\n");
+ return -EINVAL;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct xt_tcpmss_info *info = par->targinfo;
+ const struct ip6t_entry *e = par->entryinfo;
+ const struct xt_entry_match *ematch;
+
+ if (info->mss == XT_TCPMSS_CLAMP_PMTU &&
+ (par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING))) != 0) {
+ pr_info("path-MTU clamping only supported in "
+ "FORWARD, OUTPUT and POSTROUTING hooks\n");
+ return -EINVAL;
+ }
+ xt_ematch_foreach(ematch, e)
+ if (find_syn_match(ematch))
+ return 0;
+ pr_info("Only works on TCP SYN packets\n");
+ return -EINVAL;
+}
+#endif
+
+static struct xt_target tcpmss_tg_reg[] __read_mostly = {
+ {
+ .family = NFPROTO_IPV4,
+ .name = "TCPMSS",
+ .checkentry = tcpmss_tg4_check,
+ .target = tcpmss_tg4,
+ .targetsize = sizeof(struct xt_tcpmss_info),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .family = NFPROTO_IPV6,
+ .name = "TCPMSS",
+ .checkentry = tcpmss_tg6_check,
+ .target = tcpmss_tg6,
+ .targetsize = sizeof(struct xt_tcpmss_info),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init tcpmss_tg_init(void)
+{
+ return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg));
+}
+
+static void __exit tcpmss_tg_exit(void)
+{
+ xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg));
+}
+
+module_init(tcpmss_tg_init);
+module_exit(tcpmss_tg_exit);
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
new file mode 100644
index 000000000..625fa1d63
--- /dev/null
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -0,0 +1,158 @@
+/*
+ * A module for stripping a specific TCP option from TCP packets.
+ *
+ * Copyright (C) 2007 Sven Schnelle <svens@bitebene.org>
+ * Copyright © CC Computer Consultants GmbH, 2007
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TCPOPTSTRIP.h>
+
+static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
+{
+ /* Beware zero-length options: make finite progress */
+ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0)
+ return 1;
+ else
+ return opt[offset+1];
+}
+
+static unsigned int
+tcpoptstrip_mangle_packet(struct sk_buff *skb,
+ const struct xt_action_param *par,
+ unsigned int tcphoff, unsigned int minlen)
+{
+ const struct xt_tcpoptstrip_target_info *info = par->targinfo;
+ unsigned int optl, i, j;
+ struct tcphdr *tcph;
+ u_int16_t n, o;
+ u_int8_t *opt;
+ int len, tcp_hdrlen;
+
+ /* This is a fragment, no TCP header is available */
+ if (par->fragoff != 0)
+ return XT_CONTINUE;
+
+ if (!skb_make_writable(skb, skb->len))
+ return NF_DROP;
+
+ len = skb->len - tcphoff;
+ if (len < (int)sizeof(struct tcphdr))
+ return NF_DROP;
+
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ tcp_hdrlen = tcph->doff * 4;
+
+ if (len < tcp_hdrlen)
+ return NF_DROP;
+
+ opt = (u_int8_t *)tcph;
+
+ /*
+ * Walk through all TCP options - if we find some option to remove,
+ * set all octets to %TCPOPT_NOP and adjust checksum.
+ */
+ for (i = sizeof(struct tcphdr); i < tcp_hdrlen - 1; i += optl) {
+ optl = optlen(opt, i);
+
+ if (i + optl > tcp_hdrlen)
+ break;
+
+ if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i]))
+ continue;
+
+ for (j = 0; j < optl; ++j) {
+ o = opt[i+j];
+ n = TCPOPT_NOP;
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, skb, htons(o),
+ htons(n), 0);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int
+tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
+ sizeof(struct iphdr) + sizeof(struct tcphdr));
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+static unsigned int
+tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ int tcphoff;
+ u_int8_t nexthdr;
+ __be16 frag_off;
+
+ nexthdr = ipv6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off);
+ if (tcphoff < 0)
+ return NF_DROP;
+
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff,
+ sizeof(*ipv6h) + sizeof(struct tcphdr));
+}
+#endif
+
+static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
+ {
+ .name = "TCPOPTSTRIP",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .proto = IPPROTO_TCP,
+ .target = tcpoptstrip_tg4,
+ .targetsize = sizeof(struct xt_tcpoptstrip_target_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+ {
+ .name = "TCPOPTSTRIP",
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .proto = IPPROTO_TCP,
+ .target = tcpoptstrip_tg6,
+ .targetsize = sizeof(struct xt_tcpoptstrip_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init tcpoptstrip_tg_init(void)
+{
+ return xt_register_targets(tcpoptstrip_tg_reg,
+ ARRAY_SIZE(tcpoptstrip_tg_reg));
+}
+
+static void __exit tcpoptstrip_tg_exit(void)
+{
+ xt_unregister_targets(tcpoptstrip_tg_reg,
+ ARRAY_SIZE(tcpoptstrip_tg_reg));
+}
+
+module_init(tcpoptstrip_tg_init);
+module_exit(tcpoptstrip_tg_exit);
+MODULE_AUTHOR("Sven Schnelle <svens@bitebene.org>, Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: TCP option stripping");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TCPOPTSTRIP");
+MODULE_ALIAS("ip6t_TCPOPTSTRIP");
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
new file mode 100644
index 000000000..292934d23
--- /dev/null
+++ b/net/netfilter/xt_TEE.c
@@ -0,0 +1,309 @@
+/*
+ * "TEE" target extension for Xtables
+ * Copyright © Sebastian Claßen, 2007
+ * Jan Engelhardt, 2007-2010
+ *
+ * based on ipt_ROUTE.c from Cédric de Launois
+ * <delaunois@info.ucl.be>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 or later, as published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/notifier.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TEE.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+# define WITH_CONNTRACK 1
+# include <net/netfilter/nf_conntrack.h>
+#endif
+
+struct xt_tee_priv {
+ struct notifier_block notifier;
+ struct xt_tee_tginfo *tginfo;
+ int oif;
+};
+
+static const union nf_inet_addr tee_zero_address;
+static DEFINE_PER_CPU(bool, tee_active);
+
+static struct net *pick_net(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_NS
+ const struct dst_entry *dst;
+
+ if (skb->dev != NULL)
+ return dev_net(skb->dev);
+ dst = skb_dst(skb);
+ if (dst != NULL && dst->dev != NULL)
+ return dev_net(dst->dev);
+#endif
+ return &init_net;
+}
+
+static bool
+tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct net *net = pick_net(skb);
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ memset(&fl4, 0, sizeof(fl4));
+ if (info->priv) {
+ if (info->priv->oif == -1)
+ return false;
+ fl4.flowi4_oif = info->priv->oif;
+ }
+ fl4.daddr = info->gw.ip;
+ fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+ fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt))
+ return false;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ skb->dev = rt->dst.dev;
+ skb->protocol = htons(ETH_P_IP);
+ return true;
+}
+
+static unsigned int
+tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tee_tginfo *info = par->targinfo;
+ struct iphdr *iph;
+
+ if (__this_cpu_read(tee_active))
+ return XT_CONTINUE;
+ /*
+ * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+ * the original skb, which should continue on its way as if nothing has
+ * happened. The copy should be independently delivered to the TEE
+ * --gateway.
+ */
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+ /* Avoid counting cloned packets towards the original connection. */
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+#endif
+ /*
+ * If we are in PREROUTING/INPUT, the checksum must be recalculated
+ * since the length could have changed as a result of defragmentation.
+ *
+ * We also decrease the TTL to mitigate potential TEE loops
+ * between two hosts.
+ *
+ * Set %IP_DF so that the original source is notified of a potentially
+ * decreased MTU on the clone route. IPv6 does this too.
+ */
+ iph = ip_hdr(skb);
+ iph->frag_off |= htons(IP_DF);
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_IN)
+ --iph->ttl;
+ ip_send_check(iph);
+
+ if (tee_tg_route4(skb, info)) {
+ __this_cpu_write(tee_active, true);
+ ip_local_out(skb);
+ __this_cpu_write(tee_active, false);
+ } else {
+ kfree_skb(skb);
+ }
+ return XT_CONTINUE;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static bool
+tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct net *net = pick_net(skb);
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ if (info->priv) {
+ if (info->priv->oif == -1)
+ return false;
+ fl6.flowi6_oif = info->priv->oif;
+ }
+ fl6.daddr = info->gw.in6;
+ fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+ (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return false;
+ }
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ skb->dev = dst->dev;
+ skb->protocol = htons(ETH_P_IPV6);
+ return true;
+}
+
+static unsigned int
+tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tee_tginfo *info = par->targinfo;
+
+ if (__this_cpu_read(tee_active))
+ return XT_CONTINUE;
+ skb = pskb_copy(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+#endif
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_IN) {
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ --iph->hop_limit;
+ }
+ if (tee_tg_route6(skb, info)) {
+ __this_cpu_write(tee_active, true);
+ ip6_local_out(skb);
+ __this_cpu_write(tee_active, false);
+ } else {
+ kfree_skb(skb);
+ }
+ return XT_CONTINUE;
+}
+#endif
+
+static int tee_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct xt_tee_priv *priv;
+
+ priv = container_of(this, struct xt_tee_priv, notifier);
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!strcmp(dev->name, priv->tginfo->oif))
+ priv->oif = dev->ifindex;
+ break;
+ case NETDEV_UNREGISTER:
+ if (dev->ifindex == priv->oif)
+ priv->oif = -1;
+ break;
+ case NETDEV_CHANGENAME:
+ if (!strcmp(dev->name, priv->tginfo->oif))
+ priv->oif = dev->ifindex;
+ else if (dev->ifindex == priv->oif)
+ priv->oif = -1;
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int tee_tg_check(const struct xt_tgchk_param *par)
+{
+ struct xt_tee_tginfo *info = par->targinfo;
+ struct xt_tee_priv *priv;
+
+ /* 0.0.0.0 and :: not allowed */
+ if (memcmp(&info->gw, &tee_zero_address,
+ sizeof(tee_zero_address)) == 0)
+ return -EINVAL;
+
+ if (info->oif[0]) {
+ if (info->oif[sizeof(info->oif)-1] != '\0')
+ return -EINVAL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv == NULL)
+ return -ENOMEM;
+
+ priv->tginfo = info;
+ priv->oif = -1;
+ priv->notifier.notifier_call = tee_netdev_event;
+ info->priv = priv;
+
+ register_netdevice_notifier(&priv->notifier);
+ } else
+ info->priv = NULL;
+
+ return 0;
+}
+
+static void tee_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ struct xt_tee_tginfo *info = par->targinfo;
+
+ if (info->priv) {
+ unregister_netdevice_notifier(&info->priv->notifier);
+ kfree(info->priv);
+ }
+}
+
+static struct xt_target tee_tg_reg[] __read_mostly = {
+ {
+ .name = "TEE",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = tee_tg4,
+ .targetsize = sizeof(struct xt_tee_tginfo),
+ .checkentry = tee_tg_check,
+ .destroy = tee_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IPV6)
+ {
+ .name = "TEE",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .target = tee_tg6,
+ .targetsize = sizeof(struct xt_tee_tginfo),
+ .checkentry = tee_tg_check,
+ .destroy = tee_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init tee_tg_init(void)
+{
+ return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+static void __exit tee_tg_exit(void)
+{
+ xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+module_init(tee_tg_init);
+module_exit(tee_tg_exit);
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Reroute packet copy");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TEE");
+MODULE_ALIAS("ip6t_TEE");
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
new file mode 100644
index 000000000..cca96cec1
--- /dev/null
+++ b/net/netfilter/xt_TPROXY.c
@@ -0,0 +1,605 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
+ * Author: Balazs Scheidler, Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#define XT_TPROXY_HAVE_IPV6 1
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <net/inet6_hashtables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+#include <linux/netfilter/xt_TPROXY.h>
+
+enum nf_tproxy_lookup_t {
+ NFT_LOOKUP_LISTENER,
+ NFT_LOOKUP_ESTABLISHED,
+};
+
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
+ if (inet_twsk(sk)->tw_transparent)
+ return true;
+ break;
+ case TCP_NEW_SYN_RECV:
+ if (inet_rsk(inet_reqsk(sk))->no_srccheck)
+ return true;
+ break;
+ default:
+ if (inet_sk(sk)->transparent)
+ return true;
+ }
+
+ sock_gen_put(sk);
+ return false;
+}
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ for_primary_ifa(indev) {
+ laddr = ifa->ifa_local;
+ break;
+ } endfor_ifa(indev);
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/*
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are assumed
+ * matching in preference order:
+ *
+ * - match: if there's a fully established connection matching the
+ * _packet_ tuple, it is returned, assuming the redirection
+ * already took place and we process a packet belonging to an
+ * established connection
+ *
+ * - match: if there's a listening socket matching the redirection
+ * (e.g. on-port & on-ip of the connection), it is returned,
+ * regardless if it was bound to 0.0.0.0 or an explicit
+ * address. The reasoning is that if there's an explicit rule, it
+ * does not really matter if the listener is bound to an interface
+ * or to 0. The user already stated that he wants redirection
+ * (since he added the rule).
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally if you have both rules the
+ * "socket" match will be the first one, effectively all packets
+ * belonging to established connections going through that one.
+ */
+static inline struct sock *
+nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NFT_LOOKUP_LISTENER:
+ sk = inet_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
+ daddr, dport,
+ in->ifindex);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NFT_LOOKUP_ESTABLISHED:
+ sk = inet_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+ protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+static inline struct sock *
+nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in,
+ const enum nf_tproxy_lookup_t lookup_type)
+{
+ struct sock *sk;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ switch (lookup_type) {
+ case NFT_LOOKUP_LISTENER:
+ sk = inet6_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
+ daddr, ntohs(dport),
+ in->ifindex);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ break;
+ case NFT_LOOKUP_ESTABLISHED:
+ sk = __inet6_lookup_established(net, &tcp_hashinfo,
+ saddr, sport, daddr, ntohs(dport),
+ in->ifindex);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case IPPROTO_UDP:
+ sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ if (sk) {
+ int connected = (sk->sk_state == TCP_ESTABLISHED);
+ int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+
+ /* NOTE: we return listeners even if bound to
+ * 0.0.0.0, those are filtered out in
+ * xt_socket, since xt_TPROXY needs 0 bound
+ * listeners too
+ */
+ if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+ (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+ sock_put(sk);
+ sk = NULL;
+ }
+ }
+ break;
+ default:
+ WARN_ON(1);
+ sk = NULL;
+ }
+
+ pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+ protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+ return sk;
+}
+#endif
+
+/**
+ * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @laddr: IPv4 address to redirect to or zero.
+ * @lport: TCP port to redirect to or zero.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk));
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+/* assign a socket to the skb -- consumes sk */
+static void
+nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_edemux;
+}
+
+static unsigned int
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ u_int32_t mark_mask, u_int32_t mark_value)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr _hdr, *hp;
+ struct sock *sk;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NF_DROP;
+
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
+ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, iph->daddr,
+ hp->source, hp->dest,
+ skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ if (!lport)
+ lport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+ else if (!sk)
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr,
+ hp->source, lport,
+ skb->dev, NFT_LOOKUP_LISTENER);
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && tproxy_sk_is_transparent(sk)) {
+ /* This should be in a separate target, but we don't do multiple
+ targets on the same rule yet */
+ skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+ pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->daddr, ntohs(hp->dest),
+ &laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
+ return NF_ACCEPT;
+ }
+
+ pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+ return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ rcu_read_lock();
+ indev = __in6_dev_get(skb->dev);
+ if (indev)
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @tproto: Transport protocol.
+ * @thoff: Transport protocol header offset.
+ * @par: Iptables target parameters.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ const struct xt_action_param *par,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk));
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ struct udphdr _hdr, *hp;
+ struct sock *sk;
+ const struct in6_addr *laddr;
+ __be16 lport;
+ int thoff = 0;
+ int tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ par->in, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ lport = tgi->lport ? tgi->lport : hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ else if (!sk)
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, laddr,
+ hp->source, lport,
+ par->in, NFT_LOOKUP_LISTENER);
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && tproxy_sk_is_transparent(sk)) {
+ /* This should be in a separate target, but we don't do multiple
+ targets on the same rule yet */
+ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
+
+ pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ laddr, ntohs(lport), skb->mark);
+
+ nf_tproxy_assign_sock(skb, sk);
+ return NF_ACCEPT;
+ }
+
+ pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+
+ return NF_DROP;
+}
+
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_ip6 *i = par->entryinfo;
+
+ if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) &&
+ !(i->invflags & IP6T_INV_PROTO))
+ return 0;
+
+ pr_info("Can be used only in combination with "
+ "either -p tcp or -p udp\n");
+ return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+ const struct ipt_ip *i = par->entryinfo;
+
+ if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+ && !(i->invflags & IPT_INV_PROTO))
+ return 0;
+
+ pr_info("Can be used only in combination with "
+ "either -p tcp or -p udp\n");
+ return -EINVAL;
+}
+
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v0,
+ .revision = 0,
+ .targetsize = sizeof(struct xt_tproxy_target_info),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#ifdef XT_TPROXY_HAVE_IPV6
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tproxy_tg6_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg6_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#endif
+
+};
+
+static int __init tproxy_tg_init(void)
+{
+ nf_defrag_ipv4_enable();
+#ifdef XT_TPROXY_HAVE_IPV6
+ nf_defrag_ipv6_enable();
+#endif
+
+ return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
+}
+
+static void __exit tproxy_tg_exit(void)
+{
+ xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
+}
+
+module_init(tproxy_tg_init);
+module_exit(tproxy_tg_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
+MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
new file mode 100644
index 000000000..df48967af
--- /dev/null
+++ b/net/netfilter/xt_TRACE.c
@@ -0,0 +1,40 @@
+/* This is a module which is used to mark packets for tracing.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+
+MODULE_DESCRIPTION("Xtables: packet flow tracing");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TRACE");
+MODULE_ALIAS("ip6t_TRACE");
+
+static unsigned int
+trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ skb->nf_trace = 1;
+ return XT_CONTINUE;
+}
+
+static struct xt_target trace_tg_reg __read_mostly = {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .table = "raw",
+ .target = trace_tg,
+ .me = THIS_MODULE,
+};
+
+static int __init trace_tg_init(void)
+{
+ return xt_register_target(&trace_tg_reg);
+}
+
+static void __exit trace_tg_exit(void)
+{
+ xt_unregister_target(&trace_tg_reg);
+}
+
+module_init(trace_tg_init);
+module_exit(trace_tg_exit);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
new file mode 100644
index 000000000..fab6eea1b
--- /dev/null
+++ b/net/netfilter/xt_addrtype.c
@@ -0,0 +1,248 @@
+/*
+ * iptables module to match inet_addr_type() of an ip.
+ *
+ * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip6_fib.h>
+#endif
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_addrtype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: address type match");
+MODULE_ALIAS("ipt_addrtype");
+MODULE_ALIAS("ip6t_addrtype");
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
+ const struct in6_addr *addr, u16 mask)
+{
+ const struct nf_afinfo *afinfo;
+ struct flowi6 flow;
+ struct rt6_info *rt;
+ u32 ret = 0;
+ int route_err;
+
+ memset(&flow, 0, sizeof(flow));
+ flow.daddr = *addr;
+ if (dev)
+ flow.flowi6_oif = dev->ifindex;
+
+ rcu_read_lock();
+
+ afinfo = nf_get_afinfo(NFPROTO_IPV6);
+ if (afinfo != NULL) {
+ const struct nf_ipv6_ops *v6ops;
+
+ if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
+ v6ops = nf_get_ipv6_ops();
+ if (v6ops && v6ops->chk_addr(net, addr, dev, true))
+ ret = XT_ADDRTYPE_LOCAL;
+ }
+ route_err = afinfo->route(net, (struct dst_entry **)&rt,
+ flowi6_to_flowi(&flow), false);
+ } else {
+ route_err = 1;
+ }
+ rcu_read_unlock();
+
+ if (route_err)
+ return XT_ADDRTYPE_UNREACHABLE;
+
+ if (rt->rt6i_flags & RTF_REJECT)
+ ret = XT_ADDRTYPE_UNREACHABLE;
+
+ if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
+ ret |= XT_ADDRTYPE_LOCAL;
+ if (rt->rt6i_flags & RTF_ANYCAST)
+ ret |= XT_ADDRTYPE_ANYCAST;
+
+ dst_release(&rt->dst);
+ return ret;
+}
+
+static bool match_type6(struct net *net, const struct net_device *dev,
+ const struct in6_addr *addr, u16 mask)
+{
+ int addr_type = ipv6_addr_type(addr);
+
+ if ((mask & XT_ADDRTYPE_MULTICAST) &&
+ !(addr_type & IPV6_ADDR_MULTICAST))
+ return false;
+ if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST))
+ return false;
+ if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY)
+ return false;
+
+ if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST |
+ XT_ADDRTYPE_UNREACHABLE) & mask)
+ return !!(mask & match_lookup_rt6(net, dev, addr, mask));
+ return true;
+}
+
+static bool
+addrtype_mt6(struct net *net, const struct net_device *dev,
+ const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool ret = true;
+
+ if (info->source)
+ ret &= match_type6(net, dev, &iph->saddr, info->source) ^
+ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+ if (ret && info->dest)
+ ret &= match_type6(net, dev, &iph->daddr, info->dest) ^
+ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+ return ret;
+}
+#endif
+
+static inline bool match_type(struct net *net, const struct net_device *dev,
+ __be32 addr, u_int16_t mask)
+{
+ return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
+}
+
+static bool
+addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ const struct xt_addrtype_info *info = par->matchinfo;
+ const struct iphdr *iph = ip_hdr(skb);
+ bool ret = true;
+
+ if (info->source)
+ ret &= match_type(net, NULL, iph->saddr, info->source) ^
+ info->invert_source;
+ if (info->dest)
+ ret &= match_type(net, NULL, iph->daddr, info->dest) ^
+ info->invert_dest;
+
+ return ret;
+}
+
+static bool
+addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ const struct xt_addrtype_info_v1 *info = par->matchinfo;
+ const struct iphdr *iph;
+ const struct net_device *dev = NULL;
+ bool ret = true;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
+ dev = par->in;
+ else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+ dev = par->out;
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ if (par->family == NFPROTO_IPV6)
+ return addrtype_mt6(net, dev, skb, info);
+#endif
+ iph = ip_hdr(skb);
+ if (info->source)
+ ret &= match_type(net, dev, iph->saddr, info->source) ^
+ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+ if (ret && info->dest)
+ ret &= match_type(net, dev, iph->daddr, info->dest) ^
+ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+ return ret;
+}
+
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+ if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ pr_info("both incoming and outgoing "
+ "interface limitation cannot be selected\n");
+ return -EINVAL;
+ }
+
+ if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN)) &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+ pr_info("output interface limitation "
+ "not valid in PREROUTING and INPUT\n");
+ return -EINVAL;
+ }
+
+ if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT)) &&
+ info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) {
+ pr_info("input interface limitation "
+ "not valid in POSTROUTING and OUTPUT\n");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ if (par->family == NFPROTO_IPV6) {
+ if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
+ pr_err("ipv6 BLACKHOLE matching not supported\n");
+ return -EINVAL;
+ }
+ if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) {
+ pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n");
+ return -EINVAL;
+ }
+ if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) {
+ pr_err("ipv6 does not support BROADCAST matching\n");
+ return -EINVAL;
+ }
+ }
+#endif
+ return 0;
+}
+
+static struct xt_match addrtype_mt_reg[] __read_mostly = {
+ {
+ .name = "addrtype",
+ .family = NFPROTO_IPV4,
+ .match = addrtype_mt_v0,
+ .matchsize = sizeof(struct xt_addrtype_info),
+ .me = THIS_MODULE
+ },
+ {
+ .name = "addrtype",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct xt_addrtype_info_v1),
+ .me = THIS_MODULE
+ }
+};
+
+static int __init addrtype_mt_init(void)
+{
+ return xt_register_matches(addrtype_mt_reg,
+ ARRAY_SIZE(addrtype_mt_reg));
+}
+
+static void __exit addrtype_mt_exit(void)
+{
+ xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
+}
+
+module_init(addrtype_mt_init);
+module_exit(addrtype_mt_exit);
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
new file mode 100644
index 000000000..dffee9d47
--- /dev/null
+++ b/net/netfilter/xt_bpf.c
@@ -0,0 +1,74 @@
+/* Xtables module to match packets using a BPF filter.
+ * Copyright 2013 Google Inc.
+ * Written by Willem de Bruijn <willemb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+
+#include <linux/netfilter/xt_bpf.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Willem de Bruijn <willemb@google.com>");
+MODULE_DESCRIPTION("Xtables: BPF filter match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_bpf");
+MODULE_ALIAS("ip6t_bpf");
+
+static int bpf_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_bpf_info *info = par->matchinfo;
+ struct sock_fprog_kern program;
+
+ program.len = info->bpf_program_num_elem;
+ program.filter = info->bpf_program;
+
+ if (bpf_prog_create(&info->filter, &program)) {
+ pr_info("bpf: check failed: parse error\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_bpf_info *info = par->matchinfo;
+
+ return BPF_PROG_RUN(info->filter, skb);
+}
+
+static void bpf_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_bpf_info *info = par->matchinfo;
+ bpf_prog_destroy(info->filter);
+}
+
+static struct xt_match bpf_mt_reg __read_mostly = {
+ .name = "bpf",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = bpf_mt_check,
+ .match = bpf_mt,
+ .destroy = bpf_mt_destroy,
+ .matchsize = sizeof(struct xt_bpf_info),
+ .me = THIS_MODULE,
+};
+
+static int __init bpf_mt_init(void)
+{
+ return xt_register_match(&bpf_mt_reg);
+}
+
+static void __exit bpf_mt_exit(void)
+{
+ xt_unregister_match(&bpf_mt_reg);
+}
+
+module_init(bpf_mt_init);
+module_exit(bpf_mt_exit);
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
new file mode 100644
index 000000000..a1d126f29
--- /dev/null
+++ b/net/netfilter/xt_cgroup.c
@@ -0,0 +1,72 @@
+/*
+ * Xtables module to match the process control group.
+ *
+ * Might be used to implement individual "per-application" firewall
+ * policies in contrast to global policies based on control groups.
+ * Matching is based upon processes tagged to net_cls' classid marker.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_cgroup.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("Xtables: process control group matching");
+MODULE_ALIAS("ipt_cgroup");
+MODULE_ALIAS("ip6t_cgroup");
+
+static int cgroup_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_cgroup_info *info = par->matchinfo;
+
+ if (info->invert & ~1)
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool
+cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cgroup_info *info = par->matchinfo;
+
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ return false;
+
+ return (info->id == skb->sk->sk_classid) ^ info->invert;
+}
+
+static struct xt_match cgroup_mt_reg __read_mostly = {
+ .name = "cgroup",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cgroup_mt_check,
+ .match = cgroup_mt,
+ .matchsize = sizeof(struct xt_cgroup_info),
+ .me = THIS_MODULE,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+};
+
+static int __init cgroup_mt_init(void)
+{
+ return xt_register_match(&cgroup_mt_reg);
+}
+
+static void __exit cgroup_mt_exit(void)
+{
+ xt_unregister_match(&cgroup_mt_reg);
+}
+
+module_init(cgroup_mt_init);
+module_exit(cgroup_mt_exit);
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
new file mode 100644
index 000000000..96fa26b20
--- /dev/null
+++ b/net/netfilter/xt_cluster.c
@@ -0,0 +1,179 @@
+/*
+ * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/xt_cluster.h>
+
+static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct)
+{
+ return (__force u32)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+}
+
+static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct)
+{
+ return (__force u32 *)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6;
+}
+
+static inline u_int32_t
+xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info)
+{
+ return jhash_1word(ip, info->hash_seed);
+}
+
+static inline u_int32_t
+xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info)
+{
+ return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed);
+}
+
+static inline u_int32_t
+xt_cluster_hash(const struct nf_conn *ct,
+ const struct xt_cluster_match_info *info)
+{
+ u_int32_t hash = 0;
+
+ switch(nf_ct_l3num(ct)) {
+ case AF_INET:
+ hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info);
+ break;
+ case AF_INET6:
+ hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+
+ return reciprocal_scale(hash, info->total_nodes);
+}
+
+static inline bool
+xt_cluster_ipv6_is_multicast(const struct in6_addr *addr)
+{
+ __be32 st = addr->s6_addr32[0];
+ return ((st & htonl(0xFF000000)) == htonl(0xFF000000));
+}
+
+static inline bool
+xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
+{
+ bool is_multicast = false;
+
+ switch(family) {
+ case NFPROTO_IPV4:
+ is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr);
+ break;
+ case NFPROTO_IPV6:
+ is_multicast =
+ xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+ return is_multicast;
+}
+
+static bool
+xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct sk_buff *pskb = (struct sk_buff *)skb;
+ const struct xt_cluster_match_info *info = par->matchinfo;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned long hash;
+
+ /* This match assumes that all nodes see the same packets. This can be
+ * achieved if the switch that connects the cluster nodes support some
+ * sort of 'port mirroring'. However, if your switch does not support
+ * this, your cluster nodes can reply ARP request using a multicast MAC
+ * address. Thus, your switch will flood the same packets to the
+ * cluster nodes with the same multicast MAC address. Using a multicast
+ * link address is a RFC 1812 (section 3.3.2) violation, but this works
+ * fine in practise.
+ *
+ * Unfortunately, if you use the multicast MAC address, the link layer
+ * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted
+ * by TCP and others for packets coming to this node. For that reason,
+ * this match mangles skbuff's pkt_type if it detects a packet
+ * addressed to a unicast address but using PACKET_MULTICAST. Yes, I
+ * know, matches should not alter packets, but we are doing this here
+ * because we would need to add a PKTTYPE target for this sole purpose.
+ */
+ if (!xt_cluster_is_multicast_addr(skb, par->family) &&
+ skb->pkt_type == PACKET_MULTICAST) {
+ pskb->pkt_type = PACKET_HOST;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return false;
+
+ if (nf_ct_is_untracked(ct))
+ return false;
+
+ if (ct->master)
+ hash = xt_cluster_hash(ct->master, info);
+ else
+ hash = xt_cluster_hash(ct, info);
+
+ return !!((1 << hash) & info->node_mask) ^
+ !!(info->flags & XT_CLUSTER_F_INV);
+}
+
+static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_cluster_match_info *info = par->matchinfo;
+
+ if (info->total_nodes > XT_CLUSTER_NODES_MAX) {
+ pr_info("you have exceeded the maximum "
+ "number of cluster nodes (%u > %u)\n",
+ info->total_nodes, XT_CLUSTER_NODES_MAX);
+ return -EINVAL;
+ }
+ if (info->node_mask >= (1ULL << info->total_nodes)) {
+ pr_info("this node mask cannot be "
+ "higher than the total number of nodes\n");
+ return -EDOM;
+ }
+ return 0;
+}
+
+static struct xt_match xt_cluster_match __read_mostly = {
+ .name = "cluster",
+ .family = NFPROTO_UNSPEC,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init xt_cluster_mt_init(void)
+{
+ return xt_register_match(&xt_cluster_match);
+}
+
+static void __exit xt_cluster_mt_fini(void)
+{
+ xt_unregister_match(&xt_cluster_match);
+}
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: hash-based cluster match");
+MODULE_ALIAS("ipt_cluster");
+MODULE_ALIAS("ip6t_cluster");
+module_init(xt_cluster_mt_init);
+module_exit(xt_cluster_mt_fini);
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
new file mode 100644
index 000000000..5c861d2f2
--- /dev/null
+++ b/net/netfilter/xt_comment.c
@@ -0,0 +1,45 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_comment");
+MODULE_ALIAS("ip6t_comment");
+
+static bool
+comment_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ /* We always match */
+ return true;
+}
+
+static struct xt_match comment_mt_reg __read_mostly = {
+ .name = "comment",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = comment_mt,
+ .matchsize = sizeof(struct xt_comment_info),
+ .me = THIS_MODULE,
+};
+
+static int __init comment_mt_init(void)
+{
+ return xt_register_match(&comment_mt_reg);
+}
+
+static void __exit comment_mt_exit(void)
+{
+ xt_unregister_match(&comment_mt_reg);
+}
+
+module_init(comment_mt_init);
+module_exit(comment_mt_exit);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
new file mode 100644
index 000000000..d4bec261e
--- /dev/null
+++ b/net/netfilter/xt_connbytes.c
@@ -0,0 +1,157 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/math64.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connbytes.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching");
+MODULE_ALIAS("ipt_connbytes");
+MODULE_ALIAS("ip6t_connbytes");
+
+static bool
+connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_connbytes_info *sinfo = par->matchinfo;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int64_t what = 0; /* initialize to make gcc happy */
+ u_int64_t bytes = 0;
+ u_int64_t pkts = 0;
+ const struct nf_conn_acct *acct;
+ const struct nf_conn_counter *counters;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ acct = nf_conn_acct_find(ct);
+ if (!acct)
+ return false;
+
+ counters = acct->counter;
+ switch (sinfo->what) {
+ case XT_CONNBYTES_PKTS:
+ switch (sinfo->direction) {
+ case XT_CONNBYTES_DIR_ORIGINAL:
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
+ break;
+ case XT_CONNBYTES_DIR_REPLY:
+ what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
+ break;
+ case XT_CONNBYTES_DIR_BOTH:
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
+ what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
+ break;
+ }
+ break;
+ case XT_CONNBYTES_BYTES:
+ switch (sinfo->direction) {
+ case XT_CONNBYTES_DIR_ORIGINAL:
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
+ break;
+ case XT_CONNBYTES_DIR_REPLY:
+ what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ break;
+ case XT_CONNBYTES_DIR_BOTH:
+ what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
+ what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ break;
+ }
+ break;
+ case XT_CONNBYTES_AVGPKT:
+ switch (sinfo->direction) {
+ case XT_CONNBYTES_DIR_ORIGINAL:
+ bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets);
+ break;
+ case XT_CONNBYTES_DIR_REPLY:
+ bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
+ break;
+ case XT_CONNBYTES_DIR_BOTH:
+ bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) +
+ atomic64_read(&counters[IP_CT_DIR_REPLY].bytes);
+ pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) +
+ atomic64_read(&counters[IP_CT_DIR_REPLY].packets);
+ break;
+ }
+ if (pkts != 0)
+ what = div64_u64(bytes, pkts);
+ break;
+ }
+
+ if (sinfo->count.to >= sinfo->count.from)
+ return what <= sinfo->count.to && what >= sinfo->count.from;
+ else /* inverted */
+ return what < sinfo->count.to || what > sinfo->count.from;
+}
+
+static int connbytes_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_connbytes_info *sinfo = par->matchinfo;
+ int ret;
+
+ if (sinfo->what != XT_CONNBYTES_PKTS &&
+ sinfo->what != XT_CONNBYTES_BYTES &&
+ sinfo->what != XT_CONNBYTES_AVGPKT)
+ return -EINVAL;
+
+ if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL &&
+ sinfo->direction != XT_CONNBYTES_DIR_REPLY &&
+ sinfo->direction != XT_CONNBYTES_DIR_BOTH)
+ return -EINVAL;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+
+ /*
+ * This filter cannot function correctly unless connection tracking
+ * accounting is enabled, so complain in the hope that someone notices.
+ */
+ if (!nf_ct_acct_enabled(par->net)) {
+ pr_warn("Forcing CT accounting to be enabled\n");
+ nf_ct_set_acct(par->net, true);
+ }
+
+ return ret;
+}
+
+static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match connbytes_mt_reg __read_mostly = {
+ .name = "connbytes",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connbytes_mt_check,
+ .match = connbytes_mt,
+ .destroy = connbytes_mt_destroy,
+ .matchsize = sizeof(struct xt_connbytes_info),
+ .me = THIS_MODULE,
+};
+
+static int __init connbytes_mt_init(void)
+{
+ return xt_register_match(&connbytes_mt_reg);
+}
+
+static void __exit connbytes_mt_exit(void)
+{
+ xt_unregister_match(&connbytes_mt_reg);
+}
+
+module_init(connbytes_mt_init);
+module_exit(connbytes_mt_exit);
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
new file mode 100644
index 000000000..9f8719df2
--- /dev/null
+++ b/net/netfilter/xt_connlabel.c
@@ -0,0 +1,99 @@
+/*
+ * (C) 2013 Astaro GmbH & Co KG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
+MODULE_ALIAS("ipt_connlabel");
+MODULE_ALIAS("ip6t_connlabel");
+
+static bool
+connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_connlabel_mtinfo *info = par->matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ bool invert = info->options & XT_CONNLABEL_OP_INVERT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return invert;
+
+ if (info->options & XT_CONNLABEL_OP_SET)
+ return (nf_connlabel_set(ct, info->bit) == 0) ^ invert;
+
+ return nf_connlabel_match(ct, info->bit) ^ invert;
+}
+
+static int connlabel_mt_check(const struct xt_mtchk_param *par)
+{
+ const int options = XT_CONNLABEL_OP_INVERT |
+ XT_CONNLABEL_OP_SET;
+ struct xt_connlabel_mtinfo *info = par->matchinfo;
+ int ret;
+ size_t words;
+
+ if (info->bit > XT_CONNLABEL_MAXBIT)
+ return -ERANGE;
+
+ if (info->options & ~options) {
+ pr_err("Unknown options in mask %x\n", info->options);
+ return -EINVAL;
+ }
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0) {
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+
+ par->net->ct.labels_used++;
+ words = BITS_TO_LONGS(info->bit+1);
+ if (words > par->net->ct.label_words)
+ par->net->ct.label_words = words;
+
+ return ret;
+}
+
+static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ par->net->ct.labels_used--;
+ if (par->net->ct.labels_used == 0)
+ par->net->ct.label_words = 0;
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match connlabels_mt_reg __read_mostly = {
+ .name = "connlabel",
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlabel_mt_check,
+ .match = connlabel_mt,
+ .matchsize = sizeof(struct xt_connlabel_mtinfo),
+ .destroy = connlabel_mt_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init connlabel_mt_init(void)
+{
+ return xt_register_match(&connlabels_mt_reg);
+}
+
+static void __exit connlabel_mt_exit(void)
+{
+ xt_unregister_match(&connlabels_mt_reg);
+}
+
+module_init(connlabel_mt_init);
+module_exit(connlabel_mt_exit);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
new file mode 100644
index 000000000..29ba6218a
--- /dev/null
+++ b/net/netfilter/xt_connlimit.c
@@ -0,0 +1,487 @@
+/*
+ * netfilter module to limit the number of parallel tcp
+ * connections per IP address.
+ * (c) 2000 Gerd Knorr <kraxel@bytesex.org>
+ * Nov 2002: Martin Bene <martin.bene@icomedias.com>:
+ * only ignore TIME_WAIT or gone connections
+ * (C) CC Computer Consultants GmbH, 2007
+ *
+ * based on ...
+ *
+ * Kernel module to match connection tracking information.
+ * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au).
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connlimit.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+#define CONNLIMIT_SLOTS 256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNLIMIT_LOCK_SLOTS 8U
+#else
+#define CONNLIMIT_LOCK_SLOTS 256U
+#endif
+
+#define CONNLIMIT_GC_MAX_NODES 8
+
+/* we will save the tuples of all connections we care about */
+struct xt_connlimit_conn {
+ struct hlist_node node;
+ struct nf_conntrack_tuple tuple;
+ union nf_inet_addr addr;
+};
+
+struct xt_connlimit_rb {
+ struct rb_node node;
+ struct hlist_head hhead; /* connections/hosts in same subnet */
+ union nf_inet_addr addr; /* search key */
+};
+
+static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
+
+struct xt_connlimit_data {
+ struct rb_root climit_root4[CONNLIMIT_SLOTS];
+ struct rb_root climit_root6[CONNLIMIT_SLOTS];
+};
+
+static u_int32_t connlimit_rnd __read_mostly;
+static struct kmem_cache *connlimit_rb_cachep __read_mostly;
+static struct kmem_cache *connlimit_conn_cachep __read_mostly;
+
+static inline unsigned int connlimit_iphash(__be32 addr)
+{
+ return jhash_1word((__force __u32)addr,
+ connlimit_rnd) % CONNLIMIT_SLOTS;
+}
+
+static inline unsigned int
+connlimit_iphash6(const union nf_inet_addr *addr,
+ const union nf_inet_addr *mask)
+{
+ union nf_inet_addr res;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
+ res.ip6[i] = addr->ip6[i] & mask->ip6[i];
+
+ return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6),
+ connlimit_rnd) % CONNLIMIT_SLOTS;
+}
+
+static inline bool already_closed(const struct nf_conn *conn)
+{
+ if (nf_ct_protonum(conn) == IPPROTO_TCP)
+ return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+ conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
+ else
+ return 0;
+}
+
+static int
+same_source_net(const union nf_inet_addr *addr,
+ const union nf_inet_addr *mask,
+ const union nf_inet_addr *u3, u_int8_t family)
+{
+ if (family == NFPROTO_IPV4) {
+ return ntohl(addr->ip & mask->ip) -
+ ntohl(u3->ip & mask->ip);
+ } else {
+ union nf_inet_addr lh, rh;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) {
+ lh.ip6[i] = addr->ip6[i] & mask->ip6[i];
+ rh.ip6[i] = u3->ip6[i] & mask->ip6[i];
+ }
+
+ return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6));
+ }
+}
+
+static bool add_hlist(struct hlist_head *head,
+ const struct nf_conntrack_tuple *tuple,
+ const union nf_inet_addr *addr)
+{
+ struct xt_connlimit_conn *conn;
+
+ conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL)
+ return false;
+ conn->tuple = *tuple;
+ conn->addr = *addr;
+ hlist_add_head(&conn->node, head);
+ return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+ struct hlist_head *head,
+ const struct nf_conntrack_tuple *tuple,
+ u16 zone,
+ bool *addit)
+{
+ const struct nf_conntrack_tuple_hash *found;
+ struct xt_connlimit_conn *conn;
+ struct hlist_node *n;
+ struct nf_conn *found_ct;
+ unsigned int length = 0;
+
+ *addit = true;
+ rcu_read_lock();
+
+ /* check the saved connections */
+ hlist_for_each_entry_safe(conn, n, head, node) {
+ found = nf_conntrack_find_get(net, zone, &conn->tuple);
+ if (found == NULL) {
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ continue;
+ }
+
+ found_ct = nf_ct_tuplehash_to_ctrack(found);
+
+ if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
+ /*
+ * Just to be sure we have it only once in the list.
+ * We should not see tuples twice unless someone hooks
+ * this into a table without "-p tcp --syn".
+ */
+ *addit = false;
+ } else if (already_closed(found_ct)) {
+ /*
+ * we do not care about connections which are
+ * closed already -> ditch it
+ */
+ nf_ct_put(found_ct);
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ continue;
+ }
+
+ nf_ct_put(found_ct);
+ length++;
+ }
+
+ rcu_read_unlock();
+
+ return length;
+}
+
+static void tree_nodes_free(struct rb_root *root,
+ struct xt_connlimit_rb *gc_nodes[],
+ unsigned int gc_count)
+{
+ struct xt_connlimit_rb *rbconn;
+
+ while (gc_count) {
+ rbconn = gc_nodes[--gc_count];
+ rb_erase(&rbconn->node, root);
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ }
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+ const struct nf_conntrack_tuple *tuple,
+ const union nf_inet_addr *addr, const union nf_inet_addr *mask,
+ u8 family, u16 zone)
+{
+ struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
+ struct rb_node **rbnode, *parent;
+ struct xt_connlimit_rb *rbconn;
+ struct xt_connlimit_conn *conn;
+ unsigned int gc_count;
+ bool no_gc = false;
+
+ restart:
+ gc_count = 0;
+ parent = NULL;
+ rbnode = &(root->rb_node);
+ while (*rbnode) {
+ int diff;
+ bool addit;
+
+ rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+
+ parent = *rbnode;
+ diff = same_source_net(addr, mask, &rbconn->addr, family);
+ if (diff < 0) {
+ rbnode = &((*rbnode)->rb_left);
+ } else if (diff > 0) {
+ rbnode = &((*rbnode)->rb_right);
+ } else {
+ /* same source network -> be counted! */
+ unsigned int count;
+ count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+ tree_nodes_free(root, gc_nodes, gc_count);
+ if (!addit)
+ return count;
+
+ if (!add_hlist(&rbconn->hhead, tuple, addr))
+ return 0; /* hotdrop */
+
+ return count + 1;
+ }
+
+ if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+ continue;
+
+ /* only used for GC on hhead, retval and 'addit' ignored */
+ check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+ if (hlist_empty(&rbconn->hhead))
+ gc_nodes[gc_count++] = rbconn;
+ }
+
+ if (gc_count) {
+ no_gc = true;
+ tree_nodes_free(root, gc_nodes, gc_count);
+ /* tree_node_free before new allocation permits
+ * allocator to re-use newly free'd object.
+ *
+ * This is a rare event; in most cases we will find
+ * existing node to re-use. (or gc_count is 0).
+ */
+ goto restart;
+ }
+
+ /* no match, need to insert new node */
+ rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
+ if (rbconn == NULL)
+ return 0;
+
+ conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL) {
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ return 0;
+ }
+
+ conn->tuple = *tuple;
+ conn->addr = *addr;
+ rbconn->addr = *addr;
+
+ INIT_HLIST_HEAD(&rbconn->hhead);
+ hlist_add_head(&conn->node, &rbconn->hhead);
+
+ rb_link_node(&rbconn->node, parent, rbnode);
+ rb_insert_color(&rbconn->node, root);
+ return 1;
+}
+
+static int count_them(struct net *net,
+ struct xt_connlimit_data *data,
+ const struct nf_conntrack_tuple *tuple,
+ const union nf_inet_addr *addr,
+ const union nf_inet_addr *mask,
+ u_int8_t family, u16 zone)
+{
+ struct rb_root *root;
+ int count;
+ u32 hash;
+
+ if (family == NFPROTO_IPV6) {
+ hash = connlimit_iphash6(addr, mask);
+ root = &data->climit_root6[hash];
+ } else {
+ hash = connlimit_iphash(addr->ip & mask->ip);
+ root = &data->climit_root4[hash];
+ }
+
+ spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+ count = count_tree(net, root, tuple, addr, mask, family, zone);
+
+ spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
+
+ return count;
+}
+
+static bool
+connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ const struct xt_connlimit_info *info = par->matchinfo;
+ union nf_inet_addr addr;
+ struct nf_conntrack_tuple tuple;
+ const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int connections;
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct != NULL) {
+ tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ zone = nf_ct_zone(ct);
+ } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ par->family, &tuple)) {
+ goto hotdrop;
+ }
+
+ if (par->family == NFPROTO_IPV6) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
+ &iph->daddr : &iph->saddr, sizeof(addr.ip6));
+ } else {
+ const struct iphdr *iph = ip_hdr(skb);
+ addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+ iph->daddr : iph->saddr;
+ }
+
+ connections = count_them(net, info->data, tuple_ptr, &addr,
+ &info->mask, par->family, zone);
+ if (connections == 0)
+ /* kmalloc failed, drop it entirely */
+ goto hotdrop;
+
+ return (connections > info->limit) ^
+ !!(info->flags & XT_CONNLIMIT_INVERT);
+
+ hotdrop:
+ par->hotdrop = true;
+ return false;
+}
+
+static int connlimit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_connlimit_info *info = par->matchinfo;
+ unsigned int i;
+ int ret;
+
+ if (unlikely(!connlimit_rnd)) {
+ u_int32_t rand;
+
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&connlimit_rnd, 0, rand);
+ }
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0) {
+ pr_info("cannot load conntrack support for "
+ "address family %u\n", par->family);
+ return ret;
+ }
+
+ /* init private data */
+ info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
+ if (info->data == NULL) {
+ nf_ct_l3proto_module_put(par->family);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+ info->data->climit_root4[i] = RB_ROOT;
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+ info->data->climit_root6[i] = RB_ROOT;
+
+ return 0;
+}
+
+static void destroy_tree(struct rb_root *r)
+{
+ struct xt_connlimit_conn *conn;
+ struct xt_connlimit_rb *rbconn;
+ struct hlist_node *n;
+ struct rb_node *node;
+
+ while ((node = rb_first(r)) != NULL) {
+ rbconn = container_of(node, struct xt_connlimit_rb, node);
+
+ rb_erase(node, r);
+
+ hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
+ kmem_cache_free(connlimit_conn_cachep, conn);
+
+ kmem_cache_free(connlimit_rb_cachep, rbconn);
+ }
+}
+
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_connlimit_info *info = par->matchinfo;
+ unsigned int i;
+
+ nf_ct_l3proto_module_put(par->family);
+
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+ destroy_tree(&info->data->climit_root4[i]);
+ for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+ destroy_tree(&info->data->climit_root6[i]);
+
+ kfree(info->data);
+}
+
+static struct xt_match connlimit_mt_reg __read_mostly = {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init connlimit_mt_init(void)
+{
+ int ret, i;
+
+ BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
+ BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
+
+ for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
+ spin_lock_init(&xt_connlimit_locks[i]);
+
+ connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
+ sizeof(struct xt_connlimit_conn),
+ 0, 0, NULL);
+ if (!connlimit_conn_cachep)
+ return -ENOMEM;
+
+ connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
+ sizeof(struct xt_connlimit_rb),
+ 0, 0, NULL);
+ if (!connlimit_rb_cachep) {
+ kmem_cache_destroy(connlimit_conn_cachep);
+ return -ENOMEM;
+ }
+ ret = xt_register_match(&connlimit_mt_reg);
+ if (ret != 0) {
+ kmem_cache_destroy(connlimit_conn_cachep);
+ kmem_cache_destroy(connlimit_rb_cachep);
+ }
+ return ret;
+}
+
+static void __exit connlimit_mt_exit(void)
+{
+ xt_unregister_match(&connlimit_mt_reg);
+ kmem_cache_destroy(connlimit_conn_cachep);
+ kmem_cache_destroy(connlimit_rb_cachep);
+}
+
+module_init(connlimit_mt_init);
+module_exit(connlimit_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Number of connections matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_connlimit");
+MODULE_ALIAS("ip6t_connlimit");
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
new file mode 100644
index 000000000..69f78e96f
--- /dev/null
+++ b/net/netfilter/xt_connmark.c
@@ -0,0 +1,166 @@
+/*
+ * xt_connmark - Netfilter module to operate on connection marks
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ * Jan Engelhardt <jengelh@medozas.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connmark.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
+MODULE_DESCRIPTION("Xtables: connection mark operations");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_CONNMARK");
+MODULE_ALIAS("ip6t_CONNMARK");
+MODULE_ALIAS("ipt_connmark");
+MODULE_ALIAS("ip6t_connmark");
+
+static unsigned int
+connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_connmark_tginfo1 *info = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u_int32_t newmark;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return XT_CONTINUE;
+
+ switch (info->mode) {
+ case XT_CONNMARK_SET:
+ newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+ if (ct->mark != newmark) {
+ ct->mark = newmark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+ case XT_CONNMARK_SAVE:
+ newmark = (ct->mark & ~info->ctmask) ^
+ (skb->mark & info->nfmask);
+ if (ct->mark != newmark) {
+ ct->mark = newmark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+ case XT_CONNMARK_RESTORE:
+ newmark = (skb->mark & ~info->nfmask) ^
+ (ct->mark & info->ctmask);
+ skb->mark = newmark;
+ break;
+ }
+
+ return XT_CONTINUE;
+}
+
+static int connmark_tg_check(const struct xt_tgchk_param *par)
+{
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static bool
+connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_connmark_mtinfo1 *info = par->matchinfo;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return false;
+
+ return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int connmark_mt_check(const struct xt_mtchk_param *par)
+{
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target connmark_tg_reg __read_mostly = {
+ .name = "CONNMARK",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg,
+ .targetsize = sizeof(struct xt_connmark_tginfo1),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct xt_match connmark_mt_reg __read_mostly = {
+ .name = "connmark",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_mt_check,
+ .match = connmark_mt,
+ .matchsize = sizeof(struct xt_connmark_mtinfo1),
+ .destroy = connmark_mt_destroy,
+ .me = THIS_MODULE,
+};
+
+static int __init connmark_mt_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&connmark_tg_reg);
+ if (ret < 0)
+ return ret;
+ ret = xt_register_match(&connmark_mt_reg);
+ if (ret < 0) {
+ xt_unregister_target(&connmark_tg_reg);
+ return ret;
+ }
+ return 0;
+}
+
+static void __exit connmark_mt_exit(void)
+{
+ xt_unregister_match(&connmark_mt_reg);
+ xt_unregister_target(&connmark_tg_reg);
+}
+
+module_init(connmark_mt_init);
+module_exit(connmark_mt_exit);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
new file mode 100644
index 000000000..188404b9b
--- /dev/null
+++ b/net/netfilter/xt_conntrack.c
@@ -0,0 +1,333 @@
+/*
+ * xt_conntrack - Netfilter module to match connection tracking
+ * information. (Superset of Rusty's minimalistic state match.)
+ *
+ * (C) 2001 Marc Boucher (marc@mbsi.ca).
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_conntrack.h>
+#include <net/netfilter/nf_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: connection tracking state match");
+MODULE_ALIAS("ipt_conntrack");
+MODULE_ALIAS("ip6t_conntrack");
+
+static bool
+conntrack_addrcmp(const union nf_inet_addr *kaddr,
+ const union nf_inet_addr *uaddr,
+ const union nf_inet_addr *umask, unsigned int l3proto)
+{
+ if (l3proto == NFPROTO_IPV4)
+ return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+ else if (l3proto == NFPROTO_IPV6)
+ return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+ &uaddr->in6) == 0;
+ else
+ return false;
+}
+
+static inline bool
+conntrack_mt_origsrc(const struct nf_conn *ct,
+ const struct xt_conntrack_mtinfo2 *info,
+ u_int8_t family)
+{
+ return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+ &info->origsrc_addr, &info->origsrc_mask, family);
+}
+
+static inline bool
+conntrack_mt_origdst(const struct nf_conn *ct,
+ const struct xt_conntrack_mtinfo2 *info,
+ u_int8_t family)
+{
+ return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3,
+ &info->origdst_addr, &info->origdst_mask, family);
+}
+
+static inline bool
+conntrack_mt_replsrc(const struct nf_conn *ct,
+ const struct xt_conntrack_mtinfo2 *info,
+ u_int8_t family)
+{
+ return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
+ &info->replsrc_addr, &info->replsrc_mask, family);
+}
+
+static inline bool
+conntrack_mt_repldst(const struct nf_conn *ct,
+ const struct xt_conntrack_mtinfo2 *info,
+ u_int8_t family)
+{
+ return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
+ &info->repldst_addr, &info->repldst_mask, family);
+}
+
+static inline bool
+ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
+ const struct nf_conn *ct)
+{
+ const struct nf_conntrack_tuple *tuple;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+ (nf_ct_protonum(ct) == info->l4proto) ^
+ !(info->invert_flags & XT_CONNTRACK_PROTO))
+ return false;
+
+ /* Shortcut to match all recognized protocols by using ->src.all. */
+ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+ (tuple->src.u.all == info->origsrc_port) ^
+ !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+ (tuple->dst.u.all == info->origdst_port) ^
+ !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+ return false;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+ (tuple->src.u.all == info->replsrc_port) ^
+ !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+ (tuple->dst.u.all == info->repldst_port) ^
+ !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+ return false;
+
+ return true;
+}
+
+static inline bool
+port_match(u16 min, u16 max, u16 port, bool invert)
+{
+ return (port >= min && port <= max) ^ invert;
+}
+
+static inline bool
+ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
+ const struct nf_conn *ct)
+{
+ const struct nf_conntrack_tuple *tuple;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+ (nf_ct_protonum(ct) == info->l4proto) ^
+ !(info->invert_flags & XT_CONNTRACK_PROTO))
+ return false;
+
+ /* Shortcut to match all recognized protocols by using ->src.all. */
+ if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+ !port_match(info->origsrc_port, info->origsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+ !port_match(info->origdst_port, info->origdst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+ return false;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+ !port_match(info->replsrc_port, info->replsrc_port_high,
+ ntohs(tuple->src.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+ return false;
+
+ if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+ !port_match(info->repldst_port, info->repldst_port_high,
+ ntohs(tuple->dst.u.all),
+ info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+ return false;
+
+ return true;
+}
+
+static bool
+conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
+ u16 state_mask, u16 status_mask)
+{
+ const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int statebit;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct) {
+ if (nf_ct_is_untracked(ct))
+ statebit = XT_CONNTRACK_STATE_UNTRACKED;
+ else
+ statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+ } else
+ statebit = XT_CONNTRACK_STATE_INVALID;
+
+ if (info->match_flags & XT_CONNTRACK_STATE) {
+ if (ct != NULL) {
+ if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+ statebit |= XT_CONNTRACK_STATE_SNAT;
+ if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+ statebit |= XT_CONNTRACK_STATE_DNAT;
+ }
+ if (!!(state_mask & statebit) ^
+ !(info->invert_flags & XT_CONNTRACK_STATE))
+ return false;
+ }
+
+ if (ct == NULL)
+ return info->match_flags & XT_CONNTRACK_STATE;
+ if ((info->match_flags & XT_CONNTRACK_DIRECTION) &&
+ (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^
+ !(info->invert_flags & XT_CONNTRACK_DIRECTION))
+ return false;
+
+ if (info->match_flags & XT_CONNTRACK_ORIGSRC)
+ if (conntrack_mt_origsrc(ct, info, par->family) ^
+ !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
+ return false;
+
+ if (info->match_flags & XT_CONNTRACK_ORIGDST)
+ if (conntrack_mt_origdst(ct, info, par->family) ^
+ !(info->invert_flags & XT_CONNTRACK_ORIGDST))
+ return false;
+
+ if (info->match_flags & XT_CONNTRACK_REPLSRC)
+ if (conntrack_mt_replsrc(ct, info, par->family) ^
+ !(info->invert_flags & XT_CONNTRACK_REPLSRC))
+ return false;
+
+ if (info->match_flags & XT_CONNTRACK_REPLDST)
+ if (conntrack_mt_repldst(ct, info, par->family) ^
+ !(info->invert_flags & XT_CONNTRACK_REPLDST))
+ return false;
+
+ if (par->match->revision != 3) {
+ if (!ct_proto_port_check(info, ct))
+ return false;
+ } else {
+ if (!ct_proto_port_check_v3(par->matchinfo, ct))
+ return false;
+ }
+
+ if ((info->match_flags & XT_CONNTRACK_STATUS) &&
+ (!!(status_mask & ct->status) ^
+ !(info->invert_flags & XT_CONNTRACK_STATUS)))
+ return false;
+
+ if (info->match_flags & XT_CONNTRACK_EXPIRES) {
+ unsigned long expires = 0;
+
+ if (timer_pending(&ct->timeout))
+ expires = (ct->timeout.expires - jiffies) / HZ;
+ if ((expires >= info->expires_min &&
+ expires <= info->expires_max) ^
+ !(info->invert_flags & XT_CONNTRACK_EXPIRES))
+ return false;
+ }
+ return true;
+}
+
+static bool
+conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo1 *info = par->matchinfo;
+
+ return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static bool
+conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
+
+ return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static bool
+conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_conntrack_mtinfo3 *info = par->matchinfo;
+
+ return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static int conntrack_mt_check(const struct xt_mtchk_param *par)
+{
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match conntrack_mt_reg[] __read_mostly = {
+ {
+ .name = "conntrack",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo1),
+ .match = conntrack_mt_v1,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "conntrack",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo2),
+ .match = conntrack_mt_v2,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "conntrack",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .matchsize = sizeof(struct xt_conntrack_mtinfo3),
+ .match = conntrack_mt_v3,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init conntrack_mt_init(void)
+{
+ return xt_register_matches(conntrack_mt_reg,
+ ARRAY_SIZE(conntrack_mt_reg));
+}
+
+static void __exit conntrack_mt_exit(void)
+{
+ xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg));
+}
+
+module_init(conntrack_mt_init);
+module_exit(conntrack_mt_exit);
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
new file mode 100644
index 000000000..c7a2e5466
--- /dev/null
+++ b/net/netfilter/xt_cpu.c
@@ -0,0 +1,65 @@
+/* Kernel module to match running CPU */
+
+/*
+ * Might be used to distribute connections on several daemons, if
+ * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable,
+ * each RX queue IRQ affined to one CPU (1:1 mapping)
+ *
+ */
+
+/* (C) 2010 Eric Dumazet
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_cpu.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
+MODULE_DESCRIPTION("Xtables: CPU match");
+MODULE_ALIAS("ipt_cpu");
+MODULE_ALIAS("ip6t_cpu");
+
+static int cpu_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_cpu_info *info = par->matchinfo;
+
+ if (info->invert & ~1)
+ return -EINVAL;
+ return 0;
+}
+
+static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cpu_info *info = par->matchinfo;
+
+ return (info->cpu == smp_processor_id()) ^ info->invert;
+}
+
+static struct xt_match cpu_mt_reg __read_mostly = {
+ .name = "cpu",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cpu_mt_check,
+ .match = cpu_mt,
+ .matchsize = sizeof(struct xt_cpu_info),
+ .me = THIS_MODULE,
+};
+
+static int __init cpu_mt_init(void)
+{
+ return xt_register_match(&cpu_mt_reg);
+}
+
+static void __exit cpu_mt_exit(void)
+{
+ xt_unregister_match(&cpu_mt_reg);
+}
+
+module_init(cpu_mt_init);
+module_exit(cpu_mt_exit);
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
new file mode 100644
index 000000000..b63d2a3d8
--- /dev/null
+++ b/net/netfilter/xt_dccp.c
@@ -0,0 +1,188 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: DCCP protocol packet match");
+MODULE_ALIAS("ipt_dccp");
+MODULE_ALIAS("ip6t_dccp");
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+static inline bool
+dccp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ unsigned int protoff,
+ const struct dccp_hdr *dh,
+ bool *hotdrop)
+{
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ const unsigned char *op;
+ unsigned int optoff = __dccp_hdr_len(dh);
+ unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+ unsigned int i;
+
+ if (dh->dccph_doff * 4 < __dccp_hdr_len(dh))
+ goto invalid;
+
+ if (!optlen)
+ return false;
+
+ spin_lock_bh(&dccp_buflock);
+ op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf);
+ if (op == NULL) {
+ /* If we don't have the whole header, drop packet. */
+ goto partial;
+ }
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) {
+ spin_unlock_bh(&dccp_buflock);
+ return true;
+ }
+
+ if (op[i] < 2)
+ i++;
+ else
+ i += op[i+1]?:1;
+ }
+
+ spin_unlock_bh(&dccp_buflock);
+ return false;
+
+partial:
+ spin_unlock_bh(&dccp_buflock);
+invalid:
+ *hotdrop = true;
+ return false;
+}
+
+
+static inline bool
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+ return typemask & (1 << dh->dccph_type);
+}
+
+static inline bool
+match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
+ const struct dccp_hdr *dh, bool *hotdrop)
+{
+ return dccp_find_option(option, skb, protoff, dh, hotdrop);
+}
+
+static bool
+dccp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_dccp_info *info = par->matchinfo;
+ const struct dccp_hdr *dh;
+ struct dccp_hdr _dh;
+
+ if (par->fragoff != 0)
+ return false;
+
+ dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh);
+ if (dh == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
+
+ return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0]
+ && ntohs(dh->dccph_sport) <= info->spts[1],
+ XT_DCCP_SRC_PORTS, info->flags, info->invflags)
+ && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0]
+ && ntohs(dh->dccph_dport) <= info->dpts[1],
+ XT_DCCP_DEST_PORTS, info->flags, info->invflags)
+ && DCCHECK(match_types(dh, info->typemask),
+ XT_DCCP_TYPE, info->flags, info->invflags)
+ && DCCHECK(match_option(info->option, skb, par->thoff, dh,
+ &par->hotdrop),
+ XT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int dccp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_dccp_info *info = par->matchinfo;
+
+ if (info->flags & ~XT_DCCP_VALID_FLAGS)
+ return -EINVAL;
+ if (info->invflags & ~XT_DCCP_VALID_FLAGS)
+ return -EINVAL;
+ if (info->invflags & ~info->flags)
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_match dccp_mt_reg[] __read_mostly = {
+ {
+ .name = "dccp",
+ .family = NFPROTO_IPV4,
+ .checkentry = dccp_mt_check,
+ .match = dccp_mt,
+ .matchsize = sizeof(struct xt_dccp_info),
+ .proto = IPPROTO_DCCP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "dccp",
+ .family = NFPROTO_IPV6,
+ .checkentry = dccp_mt_check,
+ .match = dccp_mt,
+ .matchsize = sizeof(struct xt_dccp_info),
+ .proto = IPPROTO_DCCP,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init dccp_mt_init(void)
+{
+ int ret;
+
+ /* doff is 8 bits, so the maximum option size is (4*256). Don't put
+ * this in BSS since DaveM is worried about locked TLB's for kernel
+ * BSS. */
+ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+ if (!dccp_optbuf)
+ return -ENOMEM;
+ ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg));
+ if (ret)
+ goto out_kfree;
+ return ret;
+
+out_kfree:
+ kfree(dccp_optbuf);
+ return ret;
+}
+
+static void __exit dccp_mt_exit(void)
+{
+ xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg));
+ kfree(dccp_optbuf);
+}
+
+module_init(dccp_mt_init);
+module_exit(dccp_mt_exit);
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
new file mode 100644
index 000000000..d9202cdd2
--- /dev/null
+++ b/net/netfilter/xt_devgroup.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/xt_devgroup.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Device group match");
+MODULE_ALIAS("ipt_devgroup");
+MODULE_ALIAS("ip6t_devgroup");
+
+static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
+ return false;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+ ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
+ return false;
+
+ return true;
+}
+
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_devgroup_info *info = par->matchinfo;
+
+ if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+ XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_SRC &&
+ par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD)))
+ return -EINVAL;
+
+ if (info->flags & XT_DEVGROUP_MATCH_DST &&
+ par->hook_mask & ~((1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match devgroup_mt_reg __read_mostly = {
+ .name = "devgroup",
+ .match = devgroup_mt,
+ .checkentry = devgroup_mt_checkentry,
+ .matchsize = sizeof(struct xt_devgroup_info),
+ .family = NFPROTO_UNSPEC,
+ .me = THIS_MODULE
+};
+
+static int __init devgroup_mt_init(void)
+{
+ return xt_register_match(&devgroup_mt_reg);
+}
+
+static void __exit devgroup_mt_exit(void)
+{
+ xt_unregister_match(&devgroup_mt_reg);
+}
+
+module_init(devgroup_mt_init);
+module_exit(devgroup_mt_exit);
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
new file mode 100644
index 000000000..64670fc5d
--- /dev/null
+++ b/net/netfilter/xt_dscp.c
@@ -0,0 +1,115 @@
+/* IP tables module for matching the value of the IPv4/IPv6 DSCP field
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_dscp.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: DSCP/TOS field match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_dscp");
+MODULE_ALIAS("ip6t_dscp");
+MODULE_ALIAS("ipt_tos");
+MODULE_ALIAS("ip6t_tos");
+
+static bool
+dscp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_dscp_info *info = par->matchinfo;
+ u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
+
+ return (dscp == info->dscp) ^ !!info->invert;
+}
+
+static bool
+dscp_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_dscp_info *info = par->matchinfo;
+ u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
+
+ return (dscp == info->dscp) ^ !!info->invert;
+}
+
+static int dscp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_dscp_info *info = par->matchinfo;
+
+ if (info->dscp > XT_DSCP_MAX) {
+ pr_info("dscp %x out of range\n", info->dscp);
+ return -EDOM;
+ }
+
+ return 0;
+}
+
+static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_tos_match_info *info = par->matchinfo;
+
+ if (par->family == NFPROTO_IPV4)
+ return ((ip_hdr(skb)->tos & info->tos_mask) ==
+ info->tos_value) ^ !!info->invert;
+ else
+ return ((ipv6_get_dsfield(ipv6_hdr(skb)) & info->tos_mask) ==
+ info->tos_value) ^ !!info->invert;
+}
+
+static struct xt_match dscp_mt_reg[] __read_mostly = {
+ {
+ .name = "dscp",
+ .family = NFPROTO_IPV4,
+ .checkentry = dscp_mt_check,
+ .match = dscp_mt,
+ .matchsize = sizeof(struct xt_dscp_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "dscp",
+ .family = NFPROTO_IPV6,
+ .checkentry = dscp_mt_check,
+ .match = dscp_mt6,
+ .matchsize = sizeof(struct xt_dscp_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "tos",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = tos_mt,
+ .matchsize = sizeof(struct xt_tos_match_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "tos",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = tos_mt,
+ .matchsize = sizeof(struct xt_tos_match_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init dscp_mt_init(void)
+{
+ return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg));
+}
+
+static void __exit dscp_mt_exit(void)
+{
+ xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg));
+}
+
+module_init(dscp_mt_init);
+module_exit(dscp_mt_exit);
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
new file mode 100644
index 000000000..3c831a8ef
--- /dev/null
+++ b/net/netfilter/xt_ecn.c
@@ -0,0 +1,179 @@
+/*
+ * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ecn.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ecn");
+MODULE_ALIAS("ip6t_ecn");
+
+static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *einfo = par->matchinfo;
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* In practice, TCP match does this, so can't fail. But let's
+ * be good citizens.
+ */
+ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+
+ if (einfo->operation & XT_ECN_OP_MATCH_ECE) {
+ if (einfo->invert & XT_ECN_OP_MATCH_ECE) {
+ if (th->ece == 1)
+ return false;
+ } else {
+ if (th->ece == 0)
+ return false;
+ }
+ }
+
+ if (einfo->operation & XT_ECN_OP_MATCH_CWR) {
+ if (einfo->invert & XT_ECN_OP_MATCH_CWR) {
+ if (th->cwr == 1)
+ return false;
+ } else {
+ if (th->cwr == 0)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static inline bool match_ip(const struct sk_buff *skb,
+ const struct xt_ecn_info *einfo)
+{
+ return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^
+ !!(einfo->invert & XT_ECN_OP_MATCH_IP);
+}
+
+static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info))
+ return false;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ !match_tcp(skb, par))
+ return false;
+
+ return true;
+}
+
+static int ecn_mt_check4(const struct xt_mtchk_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+ const struct ipt_ip *ip = par->entryinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->invert & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) {
+ pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline bool match_ipv6(const struct sk_buff *skb,
+ const struct xt_ecn_info *einfo)
+{
+ return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) ==
+ einfo->ip_ect) ^
+ !!(einfo->invert & XT_ECN_OP_MATCH_IP);
+}
+
+static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info))
+ return false;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ !match_tcp(skb, par))
+ return false;
+
+ return true;
+}
+
+static int ecn_mt_check6(const struct xt_mtchk_param *par)
+{
+ const struct xt_ecn_info *info = par->matchinfo;
+ const struct ip6t_ip6 *ip = par->entryinfo;
+
+ if (info->operation & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->invert & XT_ECN_OP_MATCH_MASK)
+ return -EINVAL;
+
+ if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) &&
+ (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) {
+ pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match ecn_mt_reg[] __read_mostly = {
+ {
+ .name = "ecn",
+ .family = NFPROTO_IPV4,
+ .match = ecn_mt4,
+ .matchsize = sizeof(struct xt_ecn_info),
+ .checkentry = ecn_mt_check4,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "ecn",
+ .family = NFPROTO_IPV6,
+ .match = ecn_mt6,
+ .matchsize = sizeof(struct xt_ecn_info),
+ .checkentry = ecn_mt_check6,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init ecn_mt_init(void)
+{
+ return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg));
+}
+
+static void __exit ecn_mt_exit(void)
+{
+ xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg));
+}
+
+module_init(ecn_mt_init);
+module_exit(ecn_mt_exit);
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
new file mode 100644
index 000000000..171ba82b5
--- /dev/null
+++ b/net/netfilter/xt_esp.c
@@ -0,0 +1,107 @@
+/* Kernel module to match ESP parameters. */
+
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter/xt_esp.h>
+#include <linux/netfilter/x_tables.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match");
+MODULE_ALIAS("ipt_esp");
+MODULE_ALIAS("ip6t_esp");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r = (spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ip_esp_hdr *eh;
+ struct ip_esp_hdr _esp;
+ const struct xt_esp *espinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp);
+ if (eh == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil ESP tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi),
+ !!(espinfo->invflags & XT_ESP_INV_SPI));
+}
+
+static int esp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_esp *espinfo = par->matchinfo;
+
+ if (espinfo->invflags & ~XT_ESP_INV_MASK) {
+ pr_debug("unknown flags %X\n", espinfo->invflags);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match esp_mt_reg[] __read_mostly = {
+ {
+ .name = "esp",
+ .family = NFPROTO_IPV4,
+ .checkentry = esp_mt_check,
+ .match = esp_mt,
+ .matchsize = sizeof(struct xt_esp),
+ .proto = IPPROTO_ESP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "esp",
+ .family = NFPROTO_IPV6,
+ .checkentry = esp_mt_check,
+ .match = esp_mt,
+ .matchsize = sizeof(struct xt_esp),
+ .proto = IPPROTO_ESP,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init esp_mt_init(void)
+{
+ return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg));
+}
+
+static void __exit esp_mt_exit(void)
+{
+ xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg));
+}
+
+module_init(esp_mt_init);
+module_exit(esp_mt_exit);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
new file mode 100644
index 000000000..178696852
--- /dev/null
+++ b/net/netfilter/xt_hashlimit.c
@@ -0,0 +1,967 @@
+/*
+ * xt_hashlimit - Netfilter module to limit the number of packets per time
+ * separately for each hashbucket (sourceip/sourceport/dstip/dstport)
+ *
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * Development of this code was funded by Astaro AG, http://www.astaro.com/
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/xt_hashlimit.h>
+#include <linux/mutex.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match");
+MODULE_ALIAS("ipt_hashlimit");
+MODULE_ALIAS("ip6t_hashlimit");
+
+struct hashlimit_net {
+ struct hlist_head htables;
+ struct proc_dir_entry *ipt_hashlimit;
+ struct proc_dir_entry *ip6t_hashlimit;
+};
+
+static int hashlimit_net_id;
+static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
+{
+ return net_generic(net, hashlimit_net_id);
+}
+
+/* need to declare this at the top */
+static const struct file_operations dl_file_ops;
+
+/* hash table crap */
+struct dsthash_dst {
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ip;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ struct {
+ __be32 src[4];
+ __be32 dst[4];
+ } ip6;
+#endif
+ };
+ __be16 src_port;
+ __be16 dst_port;
+};
+
+struct dsthash_ent {
+ /* static / read-only parts in the beginning */
+ struct hlist_node node;
+ struct dsthash_dst dst;
+
+ /* modified structure members in the end */
+ spinlock_t lock;
+ unsigned long expires; /* precalculated expiry time */
+ struct {
+ unsigned long prev; /* last modification */
+ u_int32_t credit;
+ u_int32_t credit_cap, cost;
+ } rateinfo;
+ struct rcu_head rcu;
+};
+
+struct xt_hashlimit_htable {
+ struct hlist_node node; /* global list of all htables */
+ int use;
+ u_int8_t family;
+ bool rnd_initialized;
+
+ struct hashlimit_cfg1 cfg; /* config */
+
+ /* used internally */
+ spinlock_t lock; /* lock for list_head */
+ u_int32_t rnd; /* random seed for hash */
+ unsigned int count; /* number entries in table */
+ struct delayed_work gc_work;
+
+ /* seq_file stuff */
+ struct proc_dir_entry *pde;
+ const char *name;
+ struct net *net;
+
+ struct hlist_head hash[0]; /* hashtable itself */
+};
+
+static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */
+static struct kmem_cache *hashlimit_cachep __read_mostly;
+
+static inline bool dst_cmp(const struct dsthash_ent *ent,
+ const struct dsthash_dst *b)
+{
+ return !memcmp(&ent->dst, b, sizeof(ent->dst));
+}
+
+static u_int32_t
+hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst)
+{
+ u_int32_t hash = jhash2((const u32 *)dst,
+ sizeof(*dst)/sizeof(u32),
+ ht->rnd);
+ /*
+ * Instead of returning hash % ht->cfg.size (implying a divide)
+ * we return the high 32 bits of the (hash * ht->cfg.size) that will
+ * give results between [0 and cfg.size-1] and same hash distribution,
+ * but using a multiply, less expensive than a divide
+ */
+ return reciprocal_scale(hash, ht->cfg.size);
+}
+
+static struct dsthash_ent *
+dsthash_find(const struct xt_hashlimit_htable *ht,
+ const struct dsthash_dst *dst)
+{
+ struct dsthash_ent *ent;
+ u_int32_t hash = hash_dst(ht, dst);
+
+ if (!hlist_empty(&ht->hash[hash])) {
+ hlist_for_each_entry_rcu(ent, &ht->hash[hash], node)
+ if (dst_cmp(ent, dst)) {
+ spin_lock(&ent->lock);
+ return ent;
+ }
+ }
+ return NULL;
+}
+
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+ const struct dsthash_dst *dst, bool *race)
+{
+ struct dsthash_ent *ent;
+
+ spin_lock(&ht->lock);
+
+ /* Two or more packets may race to create the same entry in the
+ * hashtable, double check if this packet lost race.
+ */
+ ent = dsthash_find(ht, dst);
+ if (ent != NULL) {
+ spin_unlock(&ht->lock);
+ *race = true;
+ return ent;
+ }
+
+ /* initialize hash with random val at the time we allocate
+ * the first hashtable entry */
+ if (unlikely(!ht->rnd_initialized)) {
+ get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+ ht->rnd_initialized = true;
+ }
+
+ if (ht->cfg.max && ht->count >= ht->cfg.max) {
+ /* FIXME: do something. question is what.. */
+ net_err_ratelimited("max count of %u reached\n", ht->cfg.max);
+ ent = NULL;
+ } else
+ ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+ if (ent) {
+ memcpy(&ent->dst, dst, sizeof(ent->dst));
+ spin_lock_init(&ent->lock);
+
+ spin_lock(&ent->lock);
+ hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+ ht->count++;
+ }
+ spin_unlock(&ht->lock);
+ return ent;
+}
+
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+ struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+ kmem_cache_free(hashlimit_cachep, ent);
+}
+
+static inline void
+dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
+{
+ hlist_del_rcu(&ent->node);
+ call_rcu_bh(&ent->rcu, dsthash_free_rcu);
+ ht->count--;
+}
+static void htable_gc(struct work_struct *work);
+
+static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
+ u_int8_t family)
+{
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+ struct xt_hashlimit_htable *hinfo;
+ unsigned int size;
+ unsigned int i;
+
+ if (minfo->cfg.size) {
+ size = minfo->cfg.size;
+ } else {
+ size = (totalram_pages << PAGE_SHIFT) / 16384 /
+ sizeof(struct list_head);
+ if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
+ size = 8192;
+ if (size < 16)
+ size = 16;
+ }
+ /* FIXME: don't use vmalloc() here or anywhere else -HW */
+ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
+ sizeof(struct list_head) * size);
+ if (hinfo == NULL)
+ return -ENOMEM;
+ minfo->hinfo = hinfo;
+
+ /* copy match config into hashtable config */
+ memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+ hinfo->cfg.size = size;
+ if (hinfo->cfg.max == 0)
+ hinfo->cfg.max = 8 * hinfo->cfg.size;
+ else if (hinfo->cfg.max < hinfo->cfg.size)
+ hinfo->cfg.max = hinfo->cfg.size;
+
+ for (i = 0; i < hinfo->cfg.size; i++)
+ INIT_HLIST_HEAD(&hinfo->hash[i]);
+
+ hinfo->use = 1;
+ hinfo->count = 0;
+ hinfo->family = family;
+ hinfo->rnd_initialized = false;
+ hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+ if (!hinfo->name) {
+ vfree(hinfo);
+ return -ENOMEM;
+ }
+ spin_lock_init(&hinfo->lock);
+
+ hinfo->pde = proc_create_data(minfo->name, 0,
+ (family == NFPROTO_IPV4) ?
+ hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
+ &dl_file_ops, hinfo);
+ if (hinfo->pde == NULL) {
+ kfree(hinfo->name);
+ vfree(hinfo);
+ return -ENOMEM;
+ }
+ hinfo->net = net;
+
+ INIT_DEFERRABLE_WORK(&hinfo->gc_work, htable_gc);
+ queue_delayed_work(system_power_efficient_wq, &hinfo->gc_work,
+ msecs_to_jiffies(hinfo->cfg.gc_interval));
+
+ hlist_add_head(&hinfo->node, &hashlimit_net->htables);
+
+ return 0;
+}
+
+static bool select_all(const struct xt_hashlimit_htable *ht,
+ const struct dsthash_ent *he)
+{
+ return 1;
+}
+
+static bool select_gc(const struct xt_hashlimit_htable *ht,
+ const struct dsthash_ent *he)
+{
+ return time_after_eq(jiffies, he->expires);
+}
+
+static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
+ bool (*select)(const struct xt_hashlimit_htable *ht,
+ const struct dsthash_ent *he))
+{
+ unsigned int i;
+
+ for (i = 0; i < ht->cfg.size; i++) {
+ struct dsthash_ent *dh;
+ struct hlist_node *n;
+
+ spin_lock_bh(&ht->lock);
+ hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
+ if ((*select)(ht, dh))
+ dsthash_free(ht, dh);
+ }
+ spin_unlock_bh(&ht->lock);
+ cond_resched();
+ }
+}
+
+static void htable_gc(struct work_struct *work)
+{
+ struct xt_hashlimit_htable *ht;
+
+ ht = container_of(work, struct xt_hashlimit_htable, gc_work.work);
+
+ htable_selective_cleanup(ht, select_gc);
+
+ queue_delayed_work(system_power_efficient_wq,
+ &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval));
+}
+
+static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
+{
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net);
+ struct proc_dir_entry *parent;
+
+ if (hinfo->family == NFPROTO_IPV4)
+ parent = hashlimit_net->ipt_hashlimit;
+ else
+ parent = hashlimit_net->ip6t_hashlimit;
+
+ if (parent != NULL)
+ remove_proc_entry(hinfo->name, parent);
+}
+
+static void htable_destroy(struct xt_hashlimit_htable *hinfo)
+{
+ cancel_delayed_work_sync(&hinfo->gc_work);
+ htable_remove_proc_entry(hinfo);
+ htable_selective_cleanup(hinfo, select_all);
+ kfree(hinfo->name);
+ vfree(hinfo);
+}
+
+static struct xt_hashlimit_htable *htable_find_get(struct net *net,
+ const char *name,
+ u_int8_t family)
+{
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+ struct xt_hashlimit_htable *hinfo;
+
+ hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
+ if (!strcmp(name, hinfo->name) &&
+ hinfo->family == family) {
+ hinfo->use++;
+ return hinfo;
+ }
+ }
+ return NULL;
+}
+
+static void htable_put(struct xt_hashlimit_htable *hinfo)
+{
+ mutex_lock(&hashlimit_mutex);
+ if (--hinfo->use == 0) {
+ hlist_del(&hinfo->node);
+ htable_destroy(hinfo);
+ }
+ mutex_unlock(&hashlimit_mutex);
+}
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If you get credit balance more than this, the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To get the maximum range, we multiply by this factor (ie. you get N
+ credits per jiffy). We want to allow a rate as low as 1 per day
+ (slowest userspace tool allows), which means
+ CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* in byte mode, the lowest possible rate is one packet/second.
+ * credit_cap is used as a counter that tells us how many times we can
+ * refill the "credits available" counter when it becomes empty.
+ */
+#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ)
+#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES)
+
+static u32 xt_hashlimit_len_to_chunks(u32 len)
+{
+ return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1;
+}
+
+/* Precision saver. */
+static u32 user2credits(u32 user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+}
+
+static u32 user2credits_byte(u32 user)
+{
+ u64 us = user;
+ us *= HZ * CREDITS_PER_JIFFY_BYTES;
+ return (u32) (us >> 32);
+}
+
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+{
+ unsigned long delta = now - dh->rateinfo.prev;
+ u32 cap;
+
+ if (delta == 0)
+ return;
+
+ dh->rateinfo.prev = now;
+
+ if (mode & XT_HASHLIMIT_BYTES) {
+ u32 tmp = dh->rateinfo.credit;
+ dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
+ cap = CREDITS_PER_JIFFY_BYTES * HZ;
+ if (tmp >= dh->rateinfo.credit) {/* overflow */
+ dh->rateinfo.credit = cap;
+ return;
+ }
+ } else {
+ dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+ cap = dh->rateinfo.credit_cap;
+ }
+ if (dh->rateinfo.credit > cap)
+ dh->rateinfo.credit = cap;
+}
+
+static void rateinfo_init(struct dsthash_ent *dh,
+ struct xt_hashlimit_htable *hinfo)
+{
+ dh->rateinfo.prev = jiffies;
+ if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = hinfo->cfg.burst;
+ } else {
+ dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
+ hinfo->cfg.burst);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = dh->rateinfo.credit;
+ }
+}
+
+static inline __be32 maskl(__be32 a, unsigned int l)
+{
+ return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)
+{
+ switch (p) {
+ case 0 ... 31:
+ i[0] = maskl(i[0], p);
+ i[1] = i[2] = i[3] = 0;
+ break;
+ case 32 ... 63:
+ i[1] = maskl(i[1], p - 32);
+ i[2] = i[3] = 0;
+ break;
+ case 64 ... 95:
+ i[2] = maskl(i[2], p - 64);
+ i[3] = 0;
+ break;
+ case 96 ... 127:
+ i[3] = maskl(i[3], p - 96);
+ break;
+ case 128:
+ break;
+ }
+}
+#endif
+
+static int
+hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
+ struct dsthash_dst *dst,
+ const struct sk_buff *skb, unsigned int protoff)
+{
+ __be16 _ports[2], *ports;
+ u8 nexthdr;
+ int poff;
+
+ memset(dst, 0, sizeof(*dst));
+
+ switch (hinfo->family) {
+ case NFPROTO_IPV4:
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP)
+ dst->ip.dst = maskl(ip_hdr(skb)->daddr,
+ hinfo->cfg.dstmask);
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP)
+ dst->ip.src = maskl(ip_hdr(skb)->saddr,
+ hinfo->cfg.srcmask);
+
+ if (!(hinfo->cfg.mode &
+ (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
+ return 0;
+ nexthdr = ip_hdr(skb)->protocol;
+ break;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ case NFPROTO_IPV6:
+ {
+ __be16 frag_off;
+
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {
+ memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,
+ sizeof(dst->ip6.dst));
+ hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask);
+ }
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) {
+ memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr,
+ sizeof(dst->ip6.src));
+ hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask);
+ }
+
+ if (!(hinfo->cfg.mode &
+ (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
+ return 0;
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off);
+ if ((int)protoff < 0)
+ return -1;
+ break;
+ }
+#endif
+ default:
+ BUG();
+ return 0;
+ }
+
+ poff = proto_ports_offset(nexthdr);
+ if (poff >= 0) {
+ ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
+ &_ports);
+ } else {
+ _ports[0] = _ports[1] = 0;
+ ports = _ports;
+ }
+ if (!ports)
+ return -1;
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT)
+ dst->src_port = ports[0];
+ if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT)
+ dst->dst_port = ports[1];
+ return 0;
+}
+
+static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
+{
+ u64 tmp = xt_hashlimit_len_to_chunks(len);
+ tmp = tmp * dh->rateinfo.cost;
+
+ if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ))
+ tmp = CREDITS_PER_JIFFY_BYTES * HZ;
+
+ if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) {
+ dh->rateinfo.credit_cap--;
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ }
+ return (u32) tmp;
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ struct xt_hashlimit_htable *hinfo = info->hinfo;
+ unsigned long now = jiffies;
+ struct dsthash_ent *dh;
+ struct dsthash_dst dst;
+ bool race = false;
+ u32 cost;
+
+ if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
+ goto hotdrop;
+
+ rcu_read_lock_bh();
+ dh = dsthash_find(hinfo, &dst);
+ if (dh == NULL) {
+ dh = dsthash_alloc_init(hinfo, &dst, &race);
+ if (dh == NULL) {
+ rcu_read_unlock_bh();
+ goto hotdrop;
+ } else if (race) {
+ /* Already got an entry, update expiration timeout */
+ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ } else {
+ dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
+ rateinfo_init(dh, hinfo);
+ }
+ } else {
+ /* update expiration timeout */
+ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode);
+ }
+
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+ cost = hashlimit_byte_cost(skb->len, dh);
+ else
+ cost = dh->rateinfo.cost;
+
+ if (dh->rateinfo.credit >= cost) {
+ /* below the limit */
+ dh->rateinfo.credit -= cost;
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
+ return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+ }
+
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
+ /* default match is underlimit - so over the limit, we need to invert */
+ return info->cfg.mode & XT_HASHLIMIT_INVERT;
+
+ hotdrop:
+ par->hotdrop = true;
+ return false;
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct net *net = par->net;
+ struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+ int ret;
+
+ if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
+ return -EINVAL;
+ if (info->name[sizeof(info->name)-1] != '\0')
+ return -EINVAL;
+ if (par->family == NFPROTO_IPV4) {
+ if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+ return -EINVAL;
+ } else {
+ if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+ return -EINVAL;
+ }
+
+ if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+ pr_info("Unknown mode mask %X, kernel too old?\n",
+ info->cfg.mode);
+ return -EINVAL;
+ }
+
+ /* Check for overflow. */
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
+ if (user2credits_byte(info->cfg.avg) == 0) {
+ pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+ return -EINVAL;
+ }
+ } else if (info->cfg.burst == 0 ||
+ user2credits(info->cfg.avg * info->cfg.burst) <
+ user2credits(info->cfg.avg)) {
+ pr_info("overflow, try lower: %u/%u\n",
+ info->cfg.avg, info->cfg.burst);
+ return -ERANGE;
+ }
+
+ mutex_lock(&hashlimit_mutex);
+ info->hinfo = htable_find_get(net, info->name, par->family);
+ if (info->hinfo == NULL) {
+ ret = htable_create(net, info, par->family);
+ if (ret < 0) {
+ mutex_unlock(&hashlimit_mutex);
+ return ret;
+ }
+ }
+ mutex_unlock(&hashlimit_mutex);
+ return 0;
+}
+
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+
+ htable_put(info->hinfo);
+}
+
+static struct xt_match hashlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "hashlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check,
+ .destroy = hashlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "hashlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = hashlimit_mt,
+ .matchsize = sizeof(struct xt_hashlimit_mtinfo1),
+ .checkentry = hashlimit_mt_check,
+ .destroy = hashlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+/* PROC stuff */
+static void *dl_seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(htable->lock)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket;
+
+ spin_lock_bh(&htable->lock);
+ if (*pos >= htable->cfg.size)
+ return NULL;
+
+ bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
+ if (!bucket)
+ return ERR_PTR(-ENOMEM);
+
+ *bucket = *pos;
+ return bucket;
+}
+
+static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket = (unsigned int *)v;
+
+ *pos = ++(*bucket);
+ if (*pos >= htable->cfg.size) {
+ kfree(v);
+ return NULL;
+ }
+ return bucket;
+}
+
+static void dl_seq_stop(struct seq_file *s, void *v)
+ __releases(htable->lock)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket = (unsigned int *)v;
+
+ if (!IS_ERR(bucket))
+ kfree(bucket);
+ spin_unlock_bh(&htable->lock);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+ struct seq_file *s)
+{
+ const struct xt_hashlimit_htable *ht = s->private;
+
+ spin_lock(&ent->lock);
+ /* recalculate to show accurate numbers */
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode);
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+ (long)(ent->expires - jiffies)/HZ,
+ &ent->dst.ip.src,
+ ntohs(ent->dst.src_port),
+ &ent->dst.ip.dst,
+ ntohs(ent->dst.dst_port),
+ ent->rateinfo.credit, ent->rateinfo.credit_cap,
+ ent->rateinfo.cost);
+ break;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ case NFPROTO_IPV6:
+ seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+ (long)(ent->expires - jiffies)/HZ,
+ &ent->dst.ip6.src,
+ ntohs(ent->dst.src_port),
+ &ent->dst.ip6.dst,
+ ntohs(ent->dst.dst_port),
+ ent->rateinfo.credit, ent->rateinfo.credit_cap,
+ ent->rateinfo.cost);
+ break;
+#endif
+ default:
+ BUG();
+ }
+ spin_unlock(&ent->lock);
+ return seq_has_overflowed(s);
+}
+
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+ struct xt_hashlimit_htable *htable = s->private;
+ unsigned int *bucket = (unsigned int *)v;
+ struct dsthash_ent *ent;
+
+ if (!hlist_empty(&htable->hash[*bucket])) {
+ hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+ if (dl_seq_real_show(ent, htable->family, s))
+ return -1;
+ }
+ return 0;
+}
+
+static const struct seq_operations dl_seq_ops = {
+ .start = dl_seq_start,
+ .next = dl_seq_next,
+ .stop = dl_seq_stop,
+ .show = dl_seq_show
+};
+
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &dl_seq_ops);
+
+ if (!ret) {
+ struct seq_file *sf = file->private_data;
+ sf->private = PDE_DATA(inode);
+ }
+ return ret;
+}
+
+static const struct file_operations dl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = dl_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static int __net_init hashlimit_proc_net_init(struct net *net)
+{
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+ hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
+ if (!hashlimit_net->ipt_hashlimit)
+ return -ENOMEM;
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
+ if (!hashlimit_net->ip6t_hashlimit) {
+ remove_proc_entry("ipt_hashlimit", net->proc_net);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+static void __net_exit hashlimit_proc_net_exit(struct net *net)
+{
+ struct xt_hashlimit_htable *hinfo;
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+ /* hashlimit_net_exit() is called before hashlimit_mt_destroy().
+ * Make sure that the parent ipt_hashlimit and ip6t_hashlimit proc
+ * entries is empty before trying to remove it.
+ */
+ mutex_lock(&hashlimit_mutex);
+ hlist_for_each_entry(hinfo, &hashlimit_net->htables, node)
+ htable_remove_proc_entry(hinfo);
+ hashlimit_net->ipt_hashlimit = NULL;
+ hashlimit_net->ip6t_hashlimit = NULL;
+ mutex_unlock(&hashlimit_mutex);
+
+ remove_proc_entry("ipt_hashlimit", net->proc_net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ remove_proc_entry("ip6t_hashlimit", net->proc_net);
+#endif
+}
+
+static int __net_init hashlimit_net_init(struct net *net)
+{
+ struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+ INIT_HLIST_HEAD(&hashlimit_net->htables);
+ return hashlimit_proc_net_init(net);
+}
+
+static void __net_exit hashlimit_net_exit(struct net *net)
+{
+ hashlimit_proc_net_exit(net);
+}
+
+static struct pernet_operations hashlimit_net_ops = {
+ .init = hashlimit_net_init,
+ .exit = hashlimit_net_exit,
+ .id = &hashlimit_net_id,
+ .size = sizeof(struct hashlimit_net),
+};
+
+static int __init hashlimit_mt_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&hashlimit_net_ops);
+ if (err < 0)
+ return err;
+ err = xt_register_matches(hashlimit_mt_reg,
+ ARRAY_SIZE(hashlimit_mt_reg));
+ if (err < 0)
+ goto err1;
+
+ err = -ENOMEM;
+ hashlimit_cachep = kmem_cache_create("xt_hashlimit",
+ sizeof(struct dsthash_ent), 0, 0,
+ NULL);
+ if (!hashlimit_cachep) {
+ pr_warn("unable to create slab cache\n");
+ goto err2;
+ }
+ return 0;
+
+err2:
+ xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
+err1:
+ unregister_pernet_subsys(&hashlimit_net_ops);
+ return err;
+
+}
+
+static void __exit hashlimit_mt_exit(void)
+{
+ xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
+ unregister_pernet_subsys(&hashlimit_net_ops);
+
+ rcu_barrier_bh();
+ kmem_cache_destroy(hashlimit_cachep);
+}
+
+module_init(hashlimit_mt_init);
+module_exit(hashlimit_mt_exit);
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
new file mode 100644
index 000000000..9f4ab00c8
--- /dev/null
+++ b/net/netfilter/xt_helper.c
@@ -0,0 +1,99 @@
+/* iptables module to match on related connections */
+/*
+ * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Related connection matching");
+MODULE_ALIAS("ipt_helper");
+MODULE_ALIAS("ip6t_helper");
+
+
+static bool
+helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_helper_info *info = par->matchinfo;
+ const struct nf_conn *ct;
+ const struct nf_conn_help *master_help;
+ const struct nf_conntrack_helper *helper;
+ enum ip_conntrack_info ctinfo;
+ bool ret = info->invert;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || !ct->master)
+ return ret;
+
+ master_help = nfct_help(ct->master);
+ if (!master_help)
+ return ret;
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ helper = rcu_dereference(master_help->helper);
+ if (!helper)
+ return ret;
+
+ if (info->name[0] == '\0')
+ ret = !ret;
+ else
+ ret ^= !strncmp(helper->name, info->name,
+ strlen(helper->name));
+ return ret;
+}
+
+static int helper_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_helper_info *info = par->matchinfo;
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0) {
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+ info->name[29] = '\0';
+ return 0;
+}
+
+static void helper_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match helper_mt_reg __read_mostly = {
+ .name = "helper",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = helper_mt_check,
+ .match = helper_mt,
+ .destroy = helper_mt_destroy,
+ .matchsize = sizeof(struct xt_helper_info),
+ .me = THIS_MODULE,
+};
+
+static int __init helper_mt_init(void)
+{
+ return xt_register_match(&helper_mt_reg);
+}
+
+static void __exit helper_mt_exit(void)
+{
+ xt_unregister_match(&helper_mt_reg);
+}
+
+module_init(helper_mt_init);
+module_exit(helper_mt_exit);
diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c
new file mode 100644
index 000000000..003951149
--- /dev/null
+++ b/net/netfilter/xt_hl.c
@@ -0,0 +1,96 @@
+/*
+ * IP tables module for matching the value of the TTL
+ * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
+ *
+ * Hop Limit matching module
+ * (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ipt_ttl.h>
+#include <linux/netfilter_ipv6/ip6t_hl.h>
+
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ttl");
+MODULE_ALIAS("ip6t_hl");
+
+static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ipt_ttl_info *info = par->matchinfo;
+ const u8 ttl = ip_hdr(skb)->ttl;
+
+ switch (info->mode) {
+ case IPT_TTL_EQ:
+ return ttl == info->ttl;
+ case IPT_TTL_NE:
+ return ttl != info->ttl;
+ case IPT_TTL_LT:
+ return ttl < info->ttl;
+ case IPT_TTL_GT:
+ return ttl > info->ttl;
+ }
+
+ return false;
+}
+
+static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct ip6t_hl_info *info = par->matchinfo;
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+ switch (info->mode) {
+ case IP6T_HL_EQ:
+ return ip6h->hop_limit == info->hop_limit;
+ case IP6T_HL_NE:
+ return ip6h->hop_limit != info->hop_limit;
+ case IP6T_HL_LT:
+ return ip6h->hop_limit < info->hop_limit;
+ case IP6T_HL_GT:
+ return ip6h->hop_limit > info->hop_limit;
+ }
+
+ return false;
+}
+
+static struct xt_match hl_mt_reg[] __read_mostly = {
+ {
+ .name = "ttl",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = ttl_mt,
+ .matchsize = sizeof(struct ipt_ttl_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "hl",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = hl_mt6,
+ .matchsize = sizeof(struct ip6t_hl_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init hl_mt_init(void)
+{
+ return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg));
+}
+
+static void __exit hl_mt_exit(void)
+{
+ xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg));
+}
+
+module_init(hl_mt_init);
+module_exit(hl_mt_exit);
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
new file mode 100644
index 000000000..89d53104c
--- /dev/null
+++ b/net/netfilter/xt_ipcomp.c
@@ -0,0 +1,111 @@
+/* Kernel module to match IPComp parameters for IPv4 and IPv6
+ *
+ * Copyright (C) 2013 WindRiver
+ *
+ * Author:
+ * Fan Du <fan.du@windriver.com>
+ *
+ * Based on:
+ * net/netfilter/xt_esp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter/xt_ipcomp.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fan Du <fan.du@windriver.com>");
+MODULE_DESCRIPTION("Xtables: IPv4/6 IPsec-IPComp SPI match");
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+ bool r;
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
+ r = (spi >= min && spi <= max) ^ invert;
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
+ return r;
+}
+
+static bool comp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ip_comp_hdr _comphdr;
+ const struct ip_comp_hdr *chdr;
+ const struct xt_ipcomp *compinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ chdr = skb_header_pointer(skb, par->thoff, sizeof(_comphdr), &_comphdr);
+ if (chdr == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil IPComp tinygram.\n");
+ par->hotdrop = true;
+ return 0;
+ }
+
+ return spi_match(compinfo->spis[0], compinfo->spis[1],
+ ntohs(chdr->cpi),
+ !!(compinfo->invflags & XT_IPCOMP_INV_SPI));
+}
+
+static int comp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_ipcomp *compinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ if (compinfo->invflags & ~XT_IPCOMP_INV_MASK) {
+ pr_err("unknown flags %X\n", compinfo->invflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match comp_mt_reg[] __read_mostly = {
+ {
+ .name = "ipcomp",
+ .family = NFPROTO_IPV4,
+ .match = comp_mt,
+ .matchsize = sizeof(struct xt_ipcomp),
+ .proto = IPPROTO_COMP,
+ .checkentry = comp_mt_check,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "ipcomp",
+ .family = NFPROTO_IPV6,
+ .match = comp_mt,
+ .matchsize = sizeof(struct xt_ipcomp),
+ .proto = IPPROTO_COMP,
+ .checkentry = comp_mt_check,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init comp_mt_init(void)
+{
+ return xt_register_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg));
+}
+
+static void __exit comp_mt_exit(void)
+{
+ xt_unregister_matches(comp_mt_reg, ARRAY_SIZE(comp_mt_reg));
+}
+
+module_init(comp_mt_init);
+module_exit(comp_mt_exit);
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
new file mode 100644
index 000000000..b46626cdd
--- /dev/null
+++ b/net/netfilter/xt_iprange.c
@@ -0,0 +1,140 @@
+/*
+ * xt_iprange - Netfilter module to match IP address ranges
+ *
+ * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) CC Computer Consultants GmbH, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_iprange.h>
+
+static bool
+iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_iprange_mtinfo *info = par->matchinfo;
+ const struct iphdr *iph = ip_hdr(skb);
+ bool m;
+
+ if (info->flags & IPRANGE_SRC) {
+ m = ntohl(iph->saddr) < ntohl(info->src_min.ip);
+ m |= ntohl(iph->saddr) > ntohl(info->src_max.ip);
+ m ^= !!(info->flags & IPRANGE_SRC_INV);
+ if (m) {
+ pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+ &iph->saddr,
+ (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+ &info->src_min.ip,
+ &info->src_max.ip);
+ return false;
+ }
+ }
+ if (info->flags & IPRANGE_DST) {
+ m = ntohl(iph->daddr) < ntohl(info->dst_min.ip);
+ m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip);
+ m ^= !!(info->flags & IPRANGE_DST_INV);
+ if (m) {
+ pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+ &iph->daddr,
+ (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+ &info->dst_min.ip,
+ &info->dst_max.ip);
+ return false;
+ }
+ }
+ return true;
+}
+
+static inline int
+iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)
+{
+ unsigned int i;
+
+ for (i = 0; i < 4; ++i) {
+ if (a->s6_addr32[i] != b->s6_addr32[i])
+ return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]);
+ }
+
+ return 0;
+}
+
+static bool
+iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_iprange_mtinfo *info = par->matchinfo;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ bool m;
+
+ if (info->flags & IPRANGE_SRC) {
+ m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6);
+ m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);
+ m ^= !!(info->flags & IPRANGE_SRC_INV);
+ if (m) {
+ pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->saddr,
+ (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+ &info->src_min.in6,
+ &info->src_max.in6);
+ return false;
+ }
+ }
+ if (info->flags & IPRANGE_DST) {
+ m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6);
+ m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);
+ m ^= !!(info->flags & IPRANGE_DST_INV);
+ if (m) {
+ pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n",
+ &iph->daddr,
+ (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+ &info->dst_min.in6,
+ &info->dst_max.in6);
+ return false;
+ }
+ }
+ return true;
+}
+
+static struct xt_match iprange_mt_reg[] __read_mostly = {
+ {
+ .name = "iprange",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = iprange_mt4,
+ .matchsize = sizeof(struct xt_iprange_mtinfo),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "iprange",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = iprange_mt6,
+ .matchsize = sizeof(struct xt_iprange_mtinfo),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init iprange_mt_init(void)
+{
+ return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg));
+}
+
+static void __exit iprange_mt_exit(void)
+{
+ xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg));
+}
+
+module_init(iprange_mt_init);
+module_exit(iprange_mt_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
+MODULE_ALIAS("ipt_iprange");
+MODULE_ALIAS("ip6t_iprange");
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
new file mode 100644
index 000000000..8d47c3780
--- /dev/null
+++ b/net/netfilter/xt_ipvs.c
@@ -0,0 +1,188 @@
+/*
+ * xt_ipvs - kernel module to match IPVS connection properties
+ *
+ * Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+ const union nf_inet_addr *uaddr,
+ const union nf_inet_addr *umask,
+ unsigned int l3proto)
+{
+ if (l3proto == NFPROTO_IPV4)
+ return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+ else if (l3proto == NFPROTO_IPV6)
+ return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+ &uaddr->in6) == 0;
+#endif
+ else
+ return false;
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_ipvs_mtinfo *data = par->matchinfo;
+ /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+ const u_int8_t family = par->family;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn *cp;
+ bool match = true;
+
+ if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+ match = skb->ipvs_property ^
+ !!(data->invert & XT_IPVS_IPVS_PROPERTY);
+ goto out;
+ }
+
+ /* other flags than XT_IPVS_IPVS_PROPERTY are set */
+ if (!skb->ipvs_property) {
+ match = false;
+ goto out;
+ }
+
+ ip_vs_fill_iph_skb(family, skb, &iph);
+
+ if (data->bitmask & XT_IPVS_PROTO)
+ if ((iph.protocol == data->l4proto) ^
+ !(data->invert & XT_IPVS_PROTO)) {
+ match = false;
+ goto out;
+ }
+
+ pp = ip_vs_proto_get(iph.protocol);
+ if (unlikely(!pp)) {
+ match = false;
+ goto out;
+ }
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(family, skb, &iph, 1 /* inverse */);
+ if (unlikely(cp == NULL)) {
+ match = false;
+ goto out;
+ }
+
+ /*
+ * We found a connection, i.e. ct != 0, make sure to call
+ * __ip_vs_conn_put before returning. In our case jump to out_put_con.
+ */
+
+ if (data->bitmask & XT_IPVS_VPORT)
+ if ((cp->vport == data->vport) ^
+ !(data->invert & XT_IPVS_VPORT)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_VPORTCTL)
+ if ((cp->control != NULL &&
+ cp->control->vport == data->vportctl) ^
+ !(data->invert & XT_IPVS_VPORTCTL)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_DIR) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct == NULL || nf_ct_is_untracked(ct)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if ((ctinfo >= IP_CT_IS_REPLY) ^
+ !!(data->invert & XT_IPVS_DIR)) {
+ match = false;
+ goto out_put_cp;
+ }
+ }
+
+ if (data->bitmask & XT_IPVS_METHOD)
+ if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+ !(data->invert & XT_IPVS_METHOD)) {
+ match = false;
+ goto out_put_cp;
+ }
+
+ if (data->bitmask & XT_IPVS_VADDR) {
+ if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+ &data->vmask, family) ^
+ !(data->invert & XT_IPVS_VADDR)) {
+ match = false;
+ goto out_put_cp;
+ }
+ }
+
+out_put_cp:
+ __ip_vs_conn_put(cp);
+out:
+ pr_debug("match=%d\n", match);
+ return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{
+ if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+ && par->family != NFPROTO_IPV6
+#endif
+ ) {
+ pr_info("protocol family %u not supported\n", par->family);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {
+ .name = "ipvs",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = ipvs_mt,
+ .checkentry = ipvs_mt_check,
+ .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .me = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{
+ return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{
+ xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);
diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c
new file mode 100644
index 000000000..8aee57277
--- /dev/null
+++ b/net/netfilter/xt_l2tp.c
@@ -0,0 +1,354 @@
+/* Kernel module to match L2TP header parameters. */
+
+/* (C) 2013 James Chapman <jchapman@katalix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <linux/l2tp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_l2tp.h>
+
+/* L2TP header masks */
+#define L2TP_HDR_T_BIT 0x8000
+#define L2TP_HDR_L_BIT 0x4000
+#define L2TP_HDR_VER 0x000f
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("Xtables: L2TP header match");
+MODULE_ALIAS("ipt_l2tp");
+MODULE_ALIAS("ip6t_l2tp");
+
+/* The L2TP fields that can be matched */
+struct l2tp_data {
+ u32 tid;
+ u32 sid;
+ u8 type;
+ u8 version;
+};
+
+union l2tp_val {
+ __be16 val16[2];
+ __be32 val32;
+};
+
+static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data)
+{
+ if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type))
+ return false;
+
+ if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version))
+ return false;
+
+ /* Check tid only for L2TPv3 control or any L2TPv2 packets */
+ if ((info->flags & XT_L2TP_TID) &&
+ ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) &&
+ (info->tid != data->tid))
+ return false;
+
+ /* Check sid only for L2TP data packets */
+ if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) &&
+ (info->sid != data->sid))
+ return false;
+
+ return true;
+}
+
+/* Parse L2TP header fields when UDP encapsulation is used. Handles
+ * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a
+ * different format. See
+ * RFC2661, Section 3.1, L2TPv2 Header Format
+ * RFC3931, Section 3.2.1, L2TPv3 Control Message Header
+ * RFC3931, Section 3.2.2, L2TPv3 Data Message Header
+ * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP
+ */
+static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ int uhlen = sizeof(struct udphdr);
+ int offs = thoff + uhlen;
+ union l2tp_val *lh;
+ union l2tp_val lhbuf;
+ u16 flags;
+ struct l2tp_data data = { 0, };
+
+ if (par->fragoff != 0)
+ return false;
+
+ /* Extract L2TP header fields. The flags in the first 16 bits
+ * tell us where the other fields are.
+ */
+ lh = skb_header_pointer(skb, offs, 2, &lhbuf);
+ if (lh == NULL)
+ return false;
+
+ flags = ntohs(lh->val16[0]);
+ if (flags & L2TP_HDR_T_BIT)
+ data.type = XT_L2TP_TYPE_CONTROL;
+ else
+ data.type = XT_L2TP_TYPE_DATA;
+ data.version = (u8) flags & L2TP_HDR_VER;
+
+ /* Now extract the L2TP tid/sid. These are in different places
+ * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we
+ * must also check to see if the length field is present,
+ * since this affects the offsets into the packet of the
+ * tid/sid fields.
+ */
+ if (data.version == 3) {
+ lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf);
+ if (lh == NULL)
+ return false;
+ if (data.type == XT_L2TP_TYPE_CONTROL)
+ data.tid = ntohl(lh->val32);
+ else
+ data.sid = ntohl(lh->val32);
+ } else if (data.version == 2) {
+ if (flags & L2TP_HDR_L_BIT)
+ offs += 2;
+ lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf);
+ if (lh == NULL)
+ return false;
+ data.tid = (u32) ntohs(lh->val16[0]);
+ data.sid = (u32) ntohs(lh->val16[1]);
+ } else
+ return false;
+
+ return l2tp_match(info, &data);
+}
+
+/* Parse L2TP header fields for IP encapsulation (no UDP header).
+ * L2TPv3 data packets have a different form with IP encap. See
+ * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP.
+ * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP.
+ */
+static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ union l2tp_val *lh;
+ union l2tp_val lhbuf;
+ struct l2tp_data data = { 0, };
+
+ /* For IP encap, the L2TP sid is the first 32-bits. */
+ lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf);
+ if (lh == NULL)
+ return false;
+ if (lh->val32 == 0) {
+ /* Must be a control packet. The L2TP tid is further
+ * into the packet.
+ */
+ data.type = XT_L2TP_TYPE_CONTROL;
+ lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf),
+ &lhbuf);
+ if (lh == NULL)
+ return false;
+ data.tid = ntohl(lh->val32);
+ } else {
+ data.sid = ntohl(lh->val32);
+ data.type = XT_L2TP_TYPE_DATA;
+ }
+
+ data.version = 3;
+
+ return l2tp_match(info, &data);
+}
+
+static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ u8 ipproto = iph->protocol;
+
+ /* l2tp_mt_check4 already restricts the transport protocol */
+ switch (ipproto) {
+ case IPPROTO_UDP:
+ return l2tp_udp_mt(skb, par, par->thoff);
+ case IPPROTO_L2TP:
+ return l2tp_ip_mt(skb, par, par->thoff);
+ }
+
+ return false;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ unsigned int thoff = 0;
+ unsigned short fragoff = 0;
+ int ipproto;
+
+ ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL);
+ if (fragoff != 0)
+ return false;
+
+ /* l2tp_mt_check6 already restricts the transport protocol */
+ switch (ipproto) {
+ case IPPROTO_UDP:
+ return l2tp_udp_mt(skb, par, thoff);
+ case IPPROTO_L2TP:
+ return l2tp_ip_mt(skb, par, thoff);
+ }
+
+ return false;
+}
+#endif
+
+static int l2tp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+
+ /* Check for invalid flags */
+ if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION |
+ XT_L2TP_TYPE)) {
+ pr_info("unknown flags: %x\n", info->flags);
+ return -EINVAL;
+ }
+
+ /* At least one of tid, sid or type=control must be specified */
+ if ((!(info->flags & XT_L2TP_TID)) &&
+ (!(info->flags & XT_L2TP_SID)) &&
+ ((!(info->flags & XT_L2TP_TYPE)) ||
+ (info->type != XT_L2TP_TYPE_CONTROL))) {
+ pr_info("invalid flags combination: %x\n", info->flags);
+ return -EINVAL;
+ }
+
+ /* If version 2 is specified, check that incompatible params
+ * are not supplied
+ */
+ if (info->flags & XT_L2TP_VERSION) {
+ if ((info->version < 2) || (info->version > 3)) {
+ pr_info("wrong L2TP version: %u\n", info->version);
+ return -EINVAL;
+ }
+
+ if (info->version == 2) {
+ if ((info->flags & XT_L2TP_TID) &&
+ (info->tid > 0xffff)) {
+ pr_info("v2 tid > 0xffff: %u\n", info->tid);
+ return -EINVAL;
+ }
+ if ((info->flags & XT_L2TP_SID) &&
+ (info->sid > 0xffff)) {
+ pr_info("v2 sid > 0xffff: %u\n", info->sid);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int l2tp_mt_check4(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ const struct ipt_entry *e = par->entryinfo;
+ const struct ipt_ip *ip = &e->ip;
+ int ret;
+
+ ret = l2tp_mt_check(par);
+ if (ret != 0)
+ return ret;
+
+ if ((ip->proto != IPPROTO_UDP) &&
+ (ip->proto != IPPROTO_L2TP)) {
+ pr_info("missing protocol rule (udp|l2tpip)\n");
+ return -EINVAL;
+ }
+
+ if ((ip->proto == IPPROTO_L2TP) &&
+ (info->version == 2)) {
+ pr_info("v2 doesn't support IP mode\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int l2tp_mt_check6(const struct xt_mtchk_param *par)
+{
+ const struct xt_l2tp_info *info = par->matchinfo;
+ const struct ip6t_entry *e = par->entryinfo;
+ const struct ip6t_ip6 *ip = &e->ipv6;
+ int ret;
+
+ ret = l2tp_mt_check(par);
+ if (ret != 0)
+ return ret;
+
+ if ((ip->proto != IPPROTO_UDP) &&
+ (ip->proto != IPPROTO_L2TP)) {
+ pr_info("missing protocol rule (udp|l2tpip)\n");
+ return -EINVAL;
+ }
+
+ if ((ip->proto == IPPROTO_L2TP) &&
+ (info->version == 2)) {
+ pr_info("v2 doesn't support IP mode\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
+
+static struct xt_match l2tp_mt_reg[] __read_mostly = {
+ {
+ .name = "l2tp",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = l2tp_mt4,
+ .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)),
+ .checkentry = l2tp_mt_check4,
+ .hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD)),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "l2tp",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = l2tp_mt6,
+ .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)),
+ .checkentry = l2tp_mt_check6,
+ .hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD)),
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init l2tp_mt_init(void)
+{
+ return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg));
+}
+
+static void __exit l2tp_mt_exit(void)
+{
+ xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg));
+}
+
+module_init(l2tp_mt_init);
+module_exit(l2tp_mt_exit);
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
new file mode 100644
index 000000000..176e5570a
--- /dev/null
+++ b/net/netfilter/xt_length.c
@@ -0,0 +1,70 @@
+/* Kernel module to match packet length. */
+/* (C) 1999-2001 James Morris <jmorros@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+
+#include <linux/netfilter/xt_length.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_length");
+MODULE_ALIAS("ip6t_length");
+
+static bool
+length_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_length_info *info = par->matchinfo;
+ u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
+
+ return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static bool
+length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_length_info *info = par->matchinfo;
+ const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
+ sizeof(struct ipv6hdr);
+
+ return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static struct xt_match length_mt_reg[] __read_mostly = {
+ {
+ .name = "length",
+ .family = NFPROTO_IPV4,
+ .match = length_mt,
+ .matchsize = sizeof(struct xt_length_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "length",
+ .family = NFPROTO_IPV6,
+ .match = length_mt6,
+ .matchsize = sizeof(struct xt_length_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init length_mt_init(void)
+{
+ return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg));
+}
+
+static void __exit length_mt_exit(void)
+{
+ xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg));
+}
+
+module_init(length_mt_init);
+module_exit(length_mt_exit);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
new file mode 100644
index 000000000..bef850596
--- /dev/null
+++ b/net/netfilter/xt_limit.c
@@ -0,0 +1,210 @@
+/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_limit.h>
+
+struct xt_limit_priv {
+ unsigned long prev;
+ uint32_t credit;
+};
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+MODULE_DESCRIPTION("Xtables: rate-limit match");
+MODULE_ALIAS("ipt_limit");
+MODULE_ALIAS("ip6t_limit");
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static DEFINE_SPINLOCK(limit_lock);
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If you get credit balance more than this, the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To get the maxmum range, we multiply by this factor (ie. you get N
+ credits per jiffy). We want to allow a rate as low as 1 per day
+ (slowest userspace tool allows), which means
+ CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+static bool
+limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rateinfo *r = par->matchinfo;
+ struct xt_limit_priv *priv = r->master;
+ unsigned long now = jiffies;
+
+ spin_lock_bh(&limit_lock);
+ priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+ if (priv->credit > r->credit_cap)
+ priv->credit = r->credit_cap;
+
+ if (priv->credit >= r->cost) {
+ /* We're not limited. */
+ priv->credit -= r->cost;
+ spin_unlock_bh(&limit_lock);
+ return true;
+ }
+
+ spin_unlock_bh(&limit_lock);
+ return false;
+}
+
+/* Precision saver. */
+static u32 user2credits(u32 user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE;
+}
+
+static int limit_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_rateinfo *r = par->matchinfo;
+ struct xt_limit_priv *priv;
+
+ /* Check for overflow. */
+ if (r->burst == 0
+ || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+ pr_info("Overflow, try lower: %u/%u\n",
+ r->avg, r->burst);
+ return -ERANGE;
+ }
+
+ priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv == NULL)
+ return -ENOMEM;
+
+ /* For SMP, we only want to use one set of state. */
+ r->master = priv;
+ /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
+ 128. */
+ priv->prev = jiffies;
+ priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
+ if (r->cost == 0) {
+ r->credit_cap = priv->credit; /* Credits full. */
+ r->cost = user2credits(r->avg);
+ }
+ return 0;
+}
+
+static void limit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_rateinfo *info = par->matchinfo;
+
+ kfree(info->master);
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_rateinfo {
+ u_int32_t avg;
+ u_int32_t burst;
+
+ compat_ulong_t prev;
+ u_int32_t credit;
+ u_int32_t credit_cap, cost;
+
+ u_int32_t master;
+};
+
+/* To keep the full "prev" timestamp, the upper 32 bits are stored in the
+ * master pointer, which does not need to be preserved. */
+static void limit_mt_compat_from_user(void *dst, const void *src)
+{
+ const struct compat_xt_rateinfo *cm = src;
+ struct xt_rateinfo m = {
+ .avg = cm->avg,
+ .burst = cm->burst,
+ .prev = cm->prev | (unsigned long)cm->master << 32,
+ .credit = cm->credit,
+ .credit_cap = cm->credit_cap,
+ .cost = cm->cost,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int limit_mt_compat_to_user(void __user *dst, const void *src)
+{
+ const struct xt_rateinfo *m = src;
+ struct compat_xt_rateinfo cm = {
+ .avg = m->avg,
+ .burst = m->burst,
+ .prev = m->prev,
+ .credit = m->credit,
+ .credit_cap = m->credit_cap,
+ .cost = m->cost,
+ .master = m->prev >> 32,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match limit_mt_reg __read_mostly = {
+ .name = "limit",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = limit_mt,
+ .checkentry = limit_mt_check,
+ .destroy = limit_mt_destroy,
+ .matchsize = sizeof(struct xt_rateinfo),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_rateinfo),
+ .compat_from_user = limit_mt_compat_from_user,
+ .compat_to_user = limit_mt_compat_to_user,
+#endif
+ .me = THIS_MODULE,
+};
+
+static int __init limit_mt_init(void)
+{
+ return xt_register_match(&limit_mt_reg);
+}
+
+static void __exit limit_mt_exit(void)
+{
+ xt_unregister_match(&limit_mt_reg);
+}
+
+module_init(limit_mt_init);
+module_exit(limit_mt_exit);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
new file mode 100644
index 000000000..d5b4fd4f9
--- /dev/null
+++ b/net/netfilter/xt_mac.c
@@ -0,0 +1,66 @@
+/* Kernel module to match MAC address parameters. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_mac.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: MAC address match");
+MODULE_ALIAS("ipt_mac");
+MODULE_ALIAS("ip6t_mac");
+
+static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_mac_info *info = par->matchinfo;
+ bool ret;
+
+ if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER)
+ return false;
+ if (skb_mac_header(skb) < skb->head)
+ return false;
+ if (skb_mac_header(skb) + ETH_HLEN > skb->data)
+ return false;
+ ret = ether_addr_equal(eth_hdr(skb)->h_source, info->srcaddr);
+ ret ^= info->invert;
+ return ret;
+}
+
+static struct xt_match mac_mt_reg __read_mostly = {
+ .name = "mac",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = mac_mt,
+ .matchsize = sizeof(struct xt_mac_info),
+ .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD),
+ .me = THIS_MODULE,
+};
+
+static int __init mac_mt_init(void)
+{
+ return xt_register_match(&mac_mt_reg);
+}
+
+static void __exit mac_mt_exit(void)
+{
+ xt_unregister_match(&mac_mt_reg);
+}
+
+module_init(mac_mt_init);
+module_exit(mac_mt_exit);
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
new file mode 100644
index 000000000..233452387
--- /dev/null
+++ b/net/netfilter/xt_mark.c
@@ -0,0 +1,84 @@
+/*
+ * xt_mark - Netfilter module to match NFMARK value
+ *
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ * Jan Engelhardt <jengelh@medozas.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/xt_mark.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("Xtables: packet mark operations");
+MODULE_ALIAS("ipt_mark");
+MODULE_ALIAS("ip6t_mark");
+MODULE_ALIAS("ipt_MARK");
+MODULE_ALIAS("ip6t_MARK");
+
+static unsigned int
+mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_mark_tginfo2 *info = par->targinfo;
+
+ skb->mark = (skb->mark & ~info->mask) ^ info->mark;
+ return XT_CONTINUE;
+}
+
+static bool
+mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_mark_mtinfo1 *info = par->matchinfo;
+
+ return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static struct xt_target mark_tg_reg __read_mostly = {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+};
+
+static struct xt_match mark_mt_reg __read_mostly = {
+ .name = "mark",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = mark_mt,
+ .matchsize = sizeof(struct xt_mark_mtinfo1),
+ .me = THIS_MODULE,
+};
+
+static int __init mark_mt_init(void)
+{
+ int ret;
+
+ ret = xt_register_target(&mark_tg_reg);
+ if (ret < 0)
+ return ret;
+ ret = xt_register_match(&mark_mt_reg);
+ if (ret < 0) {
+ xt_unregister_target(&mark_tg_reg);
+ return ret;
+ }
+ return 0;
+}
+
+static void __exit mark_mt_exit(void)
+{
+ xt_unregister_match(&mark_mt_reg);
+ xt_unregister_target(&mark_tg_reg);
+}
+
+module_init(mark_mt_init);
+module_exit(mark_mt_exit);
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
new file mode 100644
index 000000000..ac1d3c3d0
--- /dev/null
+++ b/net/netfilter/xt_multiport.c
@@ -0,0 +1,165 @@
+/* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports:
+ ports are in the same place so we can treat them as equal. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+
+#include <linux/netfilter/xt_multiport.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP");
+MODULE_ALIAS("ipt_multiport");
+MODULE_ALIAS("ip6t_multiport");
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
+static inline bool
+ports_match_v1(const struct xt_multiport_v1 *minfo,
+ u_int16_t src, u_int16_t dst)
+{
+ unsigned int i;
+ u_int16_t s, e;
+
+ for (i = 0; i < minfo->count; i++) {
+ s = minfo->ports[i];
+
+ if (minfo->pflags[i]) {
+ /* range port matching */
+ e = minfo->ports[++i];
+ pr_debug("src or dst matches with %d-%d?\n", s, e);
+
+ if (minfo->flags == XT_MULTIPORT_SOURCE
+ && src >= s && src <= e)
+ return true ^ minfo->invert;
+ if (minfo->flags == XT_MULTIPORT_DESTINATION
+ && dst >= s && dst <= e)
+ return true ^ minfo->invert;
+ if (minfo->flags == XT_MULTIPORT_EITHER
+ && ((dst >= s && dst <= e)
+ || (src >= s && src <= e)))
+ return true ^ minfo->invert;
+ } else {
+ /* exact port matching */
+ pr_debug("src or dst matches with %d?\n", s);
+
+ if (minfo->flags == XT_MULTIPORT_SOURCE
+ && src == s)
+ return true ^ minfo->invert;
+ if (minfo->flags == XT_MULTIPORT_DESTINATION
+ && dst == s)
+ return true ^ minfo->invert;
+ if (minfo->flags == XT_MULTIPORT_EITHER
+ && (src == s || dst == s))
+ return true ^ minfo->invert;
+ }
+ }
+
+ return minfo->invert;
+}
+
+static bool
+multiport_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const __be16 *pptr;
+ __be16 _ports[2];
+ const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+ if (par->fragoff != 0)
+ return false;
+
+ pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports);
+ if (pptr == NULL) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ pr_debug("Dropping evil offset=0 tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+static inline bool
+check(u_int16_t proto,
+ u_int8_t ip_invflags,
+ u_int8_t match_flags,
+ u_int8_t count)
+{
+ /* Must specify supported protocol, no unknown flags or bad count */
+ return (proto == IPPROTO_TCP || proto == IPPROTO_UDP
+ || proto == IPPROTO_UDPLITE
+ || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP)
+ && !(ip_invflags & XT_INV_PROTO)
+ && (match_flags == XT_MULTIPORT_SOURCE
+ || match_flags == XT_MULTIPORT_DESTINATION
+ || match_flags == XT_MULTIPORT_EITHER)
+ && count <= XT_MULTI_PORTS;
+}
+
+static int multiport_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct ipt_ip *ip = par->entryinfo;
+ const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+ return check(ip->proto, ip->invflags, multiinfo->flags,
+ multiinfo->count) ? 0 : -EINVAL;
+}
+
+static int multiport_mt6_check(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_ip6 *ip = par->entryinfo;
+ const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+ return check(ip->proto, ip->invflags, multiinfo->flags,
+ multiinfo->count) ? 0 : -EINVAL;
+}
+
+static struct xt_match multiport_mt_reg[] __read_mostly = {
+ {
+ .name = "multiport",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .checkentry = multiport_mt_check,
+ .match = multiport_mt,
+ .matchsize = sizeof(struct xt_multiport_v1),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "multiport",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .checkentry = multiport_mt6_check,
+ .match = multiport_mt,
+ .matchsize = sizeof(struct xt_multiport_v1),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init multiport_mt_init(void)
+{
+ return xt_register_matches(multiport_mt_reg,
+ ARRAY_SIZE(multiport_mt_reg));
+}
+
+static void __exit multiport_mt_exit(void)
+{
+ xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg));
+}
+
+module_init(multiport_mt_init);
+module_exit(multiport_mt_exit);
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
new file mode 100644
index 000000000..bea7464cc
--- /dev/null
+++ b/net/netfilter/xt_nat.c
@@ -0,0 +1,170 @@
+/*
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->rangesize != 1) {
+ pr_info("%s: multiple ranges no longer supported\n",
+ par->target->name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void xt_nat_convert_range(struct nf_nat_range *dst,
+ const struct nf_nat_ipv4_range *src)
+{
+ memset(&dst->min_addr, 0, sizeof(dst->min_addr));
+ memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+
+ dst->flags = src->flags;
+ dst->min_addr.ip = src->min_ip;
+ dst->max_addr.ip = src->max_ip;
+ dst->min_proto = src->min;
+ dst->max_proto = src->max;
+}
+
+static unsigned int
+xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ xt_nat_convert_range(&range, &mr->range[0]);
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range range;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ xt_nat_convert_range(&range, &mr->range[0]);
+ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct != NULL &&
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST);
+}
+
+static struct xt_target xt_nat_target_reg[] __read_mostly = {
+ {
+ .name = "SNAT",
+ .revision = 0,
+ .checkentry = xt_nat_checkentry_v0,
+ .target = xt_snat_target_v0,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .family = NFPROTO_IPV4,
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNAT",
+ .revision = 0,
+ .checkentry = xt_nat_checkentry_v0,
+ .target = xt_dnat_target_v0,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .family = NFPROTO_IPV4,
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SNAT",
+ .revision = 1,
+ .target = xt_snat_target_v1,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "DNAT",
+ .revision = 1,
+ .target = xt_dnat_target_v1,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init xt_nat_init(void)
+{
+ return xt_register_targets(xt_nat_target_reg,
+ ARRAY_SIZE(xt_nat_target_reg));
+}
+
+static void __exit xt_nat_exit(void)
+{
+ xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg));
+}
+
+module_init(xt_nat_init);
+module_exit(xt_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_ALIAS("ipt_SNAT");
+MODULE_ALIAS("ipt_DNAT");
+MODULE_ALIAS("ip6t_SNAT");
+MODULE_ALIAS("ip6t_DNAT");
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
new file mode 100644
index 000000000..8c646ed9c
--- /dev/null
+++ b/net/netfilter/xt_nfacct.c
@@ -0,0 +1,79 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
+ * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 (or any
+ * later at your option) as published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/nfnetlink_acct.h>
+#include <linux/netfilter/xt_nfacct.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: match for the extended accounting infrastructure");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_nfacct");
+MODULE_ALIAS("ip6t_nfacct");
+
+static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ int overquota;
+ const struct xt_nfacct_match_info *info = par->targinfo;
+
+ nfnl_acct_update(skb, info->nfacct);
+
+ overquota = nfnl_acct_overquota(skb, info->nfacct);
+
+ return overquota == NFACCT_UNDERQUOTA ? false : true;
+}
+
+static int
+nfacct_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_nfacct_match_info *info = par->matchinfo;
+ struct nf_acct *nfacct;
+
+ nfacct = nfnl_acct_find_get(info->name);
+ if (nfacct == NULL) {
+ pr_info("xt_nfacct: accounting object with name `%s' "
+ "does not exists\n", info->name);
+ return -ENOENT;
+ }
+ info->nfacct = nfacct;
+ return 0;
+}
+
+static void
+nfacct_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_nfacct_match_info *info = par->matchinfo;
+
+ nfnl_acct_put(info->nfacct);
+}
+
+static struct xt_match nfacct_mt_reg __read_mostly = {
+ .name = "nfacct",
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init nfacct_mt_init(void)
+{
+ return xt_register_match(&nfacct_mt_reg);
+}
+
+static void __exit nfacct_mt_exit(void)
+{
+ xt_unregister_match(&nfacct_mt_reg);
+}
+
+module_init(nfacct_mt_init);
+module_exit(nfacct_mt_exit);
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
new file mode 100644
index 000000000..0778855ea
--- /dev/null
+++ b/net/netfilter/xt_osf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/xt_osf.h>
+
+struct xt_osf_finger {
+ struct rcu_head rcu_head;
+ struct list_head finger_entry;
+ struct xt_osf_user_finger finger;
+};
+
+enum osf_fmatch_states {
+ /* Packet does not match the fingerprint */
+ FMATCH_WRONG = 0,
+ /* Packet matches the fingerprint */
+ FMATCH_OK,
+ /* Options do not match the fingerprint, but header does */
+ FMATCH_OPT_WRONG,
+};
+
+/*
+ * Indexed by dont-fragment bit.
+ * It is the only constant value in the fingerprint.
+ */
+static struct list_head xt_osf_fingers[2];
+
+static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
+ [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
+};
+
+static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[])
+{
+ struct xt_osf_user_finger *f;
+ struct xt_osf_finger *kf = NULL, *sf;
+ int err = 0;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
+ if (!kf)
+ return -ENOMEM;
+
+ memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
+
+ list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
+ continue;
+
+ kfree(kf);
+ kf = NULL;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ err = -EEXIST;
+ break;
+ }
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ if (kf)
+ list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
+
+ return err;
+}
+
+static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[])
+{
+ struct xt_osf_user_finger *f;
+ struct xt_osf_finger *sf;
+ int err = -ENOENT;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
+ continue;
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ list_del_rcu(&sf->finger_entry);
+ kfree_rcu(sf, rcu_head);
+
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
+ [OSF_MSG_ADD] = {
+ .call = xt_osf_add_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = xt_osf_policy,
+ },
+ [OSF_MSG_REMOVE] = {
+ .call = xt_osf_remove_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = xt_osf_policy,
+ },
+};
+
+static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
+ .name = "osf",
+ .subsys_id = NFNL_SUBSYS_OSF,
+ .cb_count = OSF_MSG_MAX,
+ .cb = xt_osf_nfnetlink_callbacks,
+};
+
+static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
+ unsigned char f_ttl)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+
+ if (info->flags & XT_OSF_TTL) {
+ if (info->ttl == XT_OSF_TTL_TRUE)
+ return ip->ttl == f_ttl;
+ if (info->ttl == XT_OSF_TTL_NOCHECK)
+ return 1;
+ else if (ip->ttl <= f_ttl)
+ return 1;
+ else {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ int ret = 0;
+
+ for_ifa(in_dev) {
+ if (inet_ifa_match(ip->saddr, ifa)) {
+ ret = (ip->ttl == f_ttl);
+ break;
+ }
+ }
+ endfor_ifa(in_dev);
+
+ return ret;
+ }
+ }
+
+ return ip->ttl == f_ttl;
+}
+
+static bool
+xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
+{
+ const struct xt_osf_info *info = p->matchinfo;
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+ int fmatch = FMATCH_WRONG, fcount = 0;
+ unsigned int optsize = 0, check_WSS = 0;
+ u16 window, totlen, mss = 0;
+ bool df;
+ const unsigned char *optp = NULL, *_optp = NULL;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct xt_osf_finger *kf;
+ const struct xt_osf_user_finger *f;
+ struct net *net = dev_net(p->in ? p->in : p->out);
+
+ if (!info)
+ return false;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ if (!tcp)
+ return false;
+
+ if (!tcp->syn)
+ return false;
+
+ totlen = ntohs(ip->tot_len);
+ df = ntohs(ip->frag_off) & IP_DF;
+ window = ntohs(tcp->window);
+
+ if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+ optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+ _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+ sizeof(struct tcphdr), optsize, opts);
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
+ int foptsize, optnum;
+
+ f = &kf->finger;
+
+ if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
+ continue;
+
+ optp = _optp;
+ fmatch = FMATCH_WRONG;
+
+ if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
+ continue;
+
+ /*
+ * Should not happen if userspace parser was written correctly.
+ */
+ if (f->wss.wc >= OSF_WSS_MAX)
+ continue;
+
+ /* Check options */
+
+ foptsize = 0;
+ for (optnum = 0; optnum < f->opt_num; ++optnum)
+ foptsize += f->opt[optnum].length;
+
+ if (foptsize > MAX_IPOPTLEN ||
+ optsize > MAX_IPOPTLEN ||
+ optsize != foptsize)
+ continue;
+
+ check_WSS = f->wss.wc;
+
+ for (optnum = 0; optnum < f->opt_num; ++optnum) {
+ if (f->opt[optnum].kind == (*optp)) {
+ __u32 len = f->opt[optnum].length;
+ const __u8 *optend = optp + len;
+ int loop_cont = 0;
+
+ fmatch = FMATCH_OK;
+
+ switch (*optp) {
+ case OSFOPT_MSS:
+ mss = optp[3];
+ mss <<= 8;
+ mss |= optp[2];
+
+ mss = ntohs((__force __be16)mss);
+ break;
+ case OSFOPT_TS:
+ loop_cont = 1;
+ break;
+ }
+
+ optp = optend;
+ } else
+ fmatch = FMATCH_OPT_WRONG;
+
+ if (fmatch != FMATCH_OK)
+ break;
+ }
+
+ if (fmatch != FMATCH_OPT_WRONG) {
+ fmatch = FMATCH_WRONG;
+
+ switch (check_WSS) {
+ case OSF_WSS_PLAIN:
+ if (f->wss.val == 0 || window == f->wss.val)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MSS:
+ /*
+ * Some smart modems decrease mangle MSS to
+ * SMART_MSS_2, so we check standard, decreased
+ * and the one provided in the fingerprint MSS
+ * values.
+ */
+#define SMART_MSS_1 1460
+#define SMART_MSS_2 1448
+ if (window == f->wss.val * mss ||
+ window == f->wss.val * SMART_MSS_1 ||
+ window == f->wss.val * SMART_MSS_2)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MTU:
+ if (window == f->wss.val * (mss + 40) ||
+ window == f->wss.val * (SMART_MSS_1 + 40) ||
+ window == f->wss.val * (SMART_MSS_2 + 40))
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MODULO:
+ if ((window % f->wss.val) == 0)
+ fmatch = FMATCH_OK;
+ break;
+ }
+ }
+
+ if (fmatch != FMATCH_OK)
+ continue;
+
+ fcount++;
+
+ if (info->flags & XT_OSF_LOG)
+ nf_log_packet(net, p->family, p->hooknum, skb,
+ p->in, p->out, NULL,
+ "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+ f->genre, f->version, f->subtype,
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest),
+ f->ttl - ip->ttl);
+
+ if ((info->flags & XT_OSF_LOG) &&
+ info->loglevel == XT_OSF_LOGLEVEL_FIRST)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!fcount && (info->flags & XT_OSF_LOG))
+ nf_log_packet(net, p->family, p->hooknum, skb, p->in,
+ p->out, NULL,
+ "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest));
+
+ if (fcount)
+ fmatch = FMATCH_OK;
+
+ return fmatch == FMATCH_OK;
+}
+
+static struct xt_match xt_osf_match = {
+ .name = "osf",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .proto = IPPROTO_TCP,
+ .hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD),
+ .match = xt_osf_match_packet,
+ .matchsize = sizeof(struct xt_osf_info),
+ .me = THIS_MODULE,
+};
+
+static int __init xt_osf_init(void)
+{
+ int err = -EINVAL;
+ int i;
+
+ for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i)
+ INIT_LIST_HEAD(&xt_osf_fingers[i]);
+
+ err = nfnetlink_subsys_register(&xt_osf_nfnetlink);
+ if (err < 0) {
+ pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
+ goto err_out_exit;
+ }
+
+ err = xt_register_match(&xt_osf_match);
+ if (err) {
+ pr_err("Failed to register OS fingerprint "
+ "matching module (%d)\n", err);
+ goto err_out_remove;
+ }
+
+ return 0;
+
+err_out_remove:
+ nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
+err_out_exit:
+ return err;
+}
+
+static void __exit xt_osf_fini(void)
+{
+ struct xt_osf_finger *f;
+ int i;
+
+ nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
+ xt_unregister_match(&xt_osf_match);
+
+ rcu_read_lock();
+ for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
+
+ list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
+ list_del_rcu(&f->finger_entry);
+ kfree_rcu(f, rcu_head);
+ }
+ }
+ rcu_read_unlock();
+
+ rcu_barrier();
+}
+
+module_init(xt_osf_init);
+module_exit(xt_osf_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_DESCRIPTION("Passive OS fingerprint matching.");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
new file mode 100644
index 000000000..ca2e577ed
--- /dev/null
+++ b/net/netfilter/xt_owner.c
@@ -0,0 +1,100 @@
+/*
+ * Kernel module to match various things tied to sockets associated with
+ * locally generated outgoing packets.
+ *
+ * (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_owner.h>
+
+static int owner_check(const struct xt_mtchk_param *par)
+{
+ struct xt_owner_match_info *info = par->matchinfo;
+
+ /* For now only allow adding matches from the initial user namespace */
+ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
+ (current_user_ns() != &init_user_ns))
+ return -EINVAL;
+ return 0;
+}
+
+static bool
+owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_owner_match_info *info = par->matchinfo;
+ const struct file *filp;
+
+ if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ return (info->match ^ info->invert) == 0;
+ else if (info->match & info->invert & XT_OWNER_SOCKET)
+ /*
+ * Socket exists but user wanted ! --socket-exists.
+ * (Single ampersands intended.)
+ */
+ return false;
+
+ filp = skb->sk->sk_socket->file;
+ if (filp == NULL)
+ return ((info->match ^ info->invert) &
+ (XT_OWNER_UID | XT_OWNER_GID)) == 0;
+
+ if (info->match & XT_OWNER_UID) {
+ kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+ kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+ if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
+ uid_lte(filp->f_cred->fsuid, uid_max)) ^
+ !(info->invert & XT_OWNER_UID))
+ return false;
+ }
+
+ if (info->match & XT_OWNER_GID) {
+ kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+ kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+ if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max)) ^
+ !(info->invert & XT_OWNER_GID))
+ return false;
+ }
+
+ return true;
+}
+
+static struct xt_match owner_mt_reg __read_mostly = {
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = owner_check,
+ .match = owner_mt,
+ .matchsize = sizeof(struct xt_owner_match_info),
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+};
+
+static int __init owner_mt_init(void)
+{
+ return xt_register_match(&owner_mt_reg);
+}
+
+static void __exit owner_mt_exit(void)
+{
+ xt_unregister_match(&owner_mt_reg);
+}
+
+module_init(owner_mt_init);
+module_exit(owner_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: socket owner matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
new file mode 100644
index 000000000..1caaccbc3
--- /dev/null
+++ b/net/netfilter/xt_physdev.c
@@ -0,0 +1,140 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter/xt_physdev.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/br_netfilter.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("Xtables: Bridge physical device match");
+MODULE_ALIAS("ipt_physdev");
+MODULE_ALIAS("ip6t_physdev");
+
+
+static bool
+physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_physdev_info *info = par->matchinfo;
+ const struct net_device *physdev;
+ unsigned long ret;
+ const char *indev, *outdev;
+
+ /* Not a bridged IP packet or no info available yet:
+ * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+ * the destination device will be a bridge. */
+ if (!skb->nf_bridge) {
+ /* Return MATCH if the invert flags of the used options are on */
+ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+ !(info->invert & XT_PHYSDEV_OP_BRIDGED))
+ return false;
+ if ((info->bitmask & XT_PHYSDEV_OP_ISIN) &&
+ !(info->invert & XT_PHYSDEV_OP_ISIN))
+ return false;
+ if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) &&
+ !(info->invert & XT_PHYSDEV_OP_ISOUT))
+ return false;
+ if ((info->bitmask & XT_PHYSDEV_OP_IN) &&
+ !(info->invert & XT_PHYSDEV_OP_IN))
+ return false;
+ if ((info->bitmask & XT_PHYSDEV_OP_OUT) &&
+ !(info->invert & XT_PHYSDEV_OP_OUT))
+ return false;
+ return true;
+ }
+
+ physdev = nf_bridge_get_physoutdev(skb);
+ outdev = physdev ? physdev->name : NULL;
+
+ /* This only makes sense in the FORWARD and POSTROUTING chains */
+ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+ (!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
+ return false;
+
+ physdev = nf_bridge_get_physindev(skb);
+ indev = physdev ? physdev->name : NULL;
+
+ if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
+ (!indev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) ||
+ (info->bitmask & XT_PHYSDEV_OP_ISOUT &&
+ (!outdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT))))
+ return false;
+
+ if (!(info->bitmask & XT_PHYSDEV_OP_IN))
+ goto match_outdev;
+
+ if (indev) {
+ ret = ifname_compare_aligned(indev, info->physindev,
+ info->in_mask);
+
+ if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN))
+ return false;
+ }
+
+match_outdev:
+ if (!(info->bitmask & XT_PHYSDEV_OP_OUT))
+ return true;
+
+ if (!outdev)
+ return false;
+
+ ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask);
+
+ return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT));
+}
+
+static int physdev_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_physdev_info *info = par->matchinfo;
+
+ br_netfilter_enable();
+
+ if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+ info->bitmask & ~XT_PHYSDEV_OP_MASK)
+ return -EINVAL;
+ if (info->bitmask & XT_PHYSDEV_OP_OUT &&
+ (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
+ info->invert & XT_PHYSDEV_OP_BRIDGED) &&
+ par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
+ pr_info("using --physdev-out in the OUTPUT, FORWARD and "
+ "POSTROUTING chains for non-bridged traffic is not "
+ "supported anymore.\n");
+ if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match physdev_mt_reg __read_mostly = {
+ .name = "physdev",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = physdev_mt_check,
+ .match = physdev_mt,
+ .matchsize = sizeof(struct xt_physdev_info),
+ .me = THIS_MODULE,
+};
+
+static int __init physdev_mt_init(void)
+{
+ return xt_register_match(&physdev_mt_reg);
+}
+
+static void __exit physdev_mt_exit(void)
+{
+ xt_unregister_match(&physdev_mt_reg);
+}
+
+module_init(physdev_mt_init);
+module_exit(physdev_mt_exit);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
new file mode 100644
index 000000000..5b645cb59
--- /dev/null
+++ b/net/netfilter/xt_pkttype.c
@@ -0,0 +1,65 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/netfilter/xt_pkttype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("Xtables: link layer packet type match");
+MODULE_ALIAS("ipt_pkttype");
+MODULE_ALIAS("ip6t_pkttype");
+
+static bool
+pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_pkttype_info *info = par->matchinfo;
+ u_int8_t type;
+
+ if (skb->pkt_type != PACKET_LOOPBACK)
+ type = skb->pkt_type;
+ else if (par->family == NFPROTO_IPV4 &&
+ ipv4_is_multicast(ip_hdr(skb)->daddr))
+ type = PACKET_MULTICAST;
+ else if (par->family == NFPROTO_IPV6 &&
+ ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
+ type = PACKET_MULTICAST;
+ else
+ type = PACKET_BROADCAST;
+
+ return (type == info->pkttype) ^ info->invert;
+}
+
+static struct xt_match pkttype_mt_reg __read_mostly = {
+ .name = "pkttype",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = pkttype_mt,
+ .matchsize = sizeof(struct xt_pkttype_info),
+ .me = THIS_MODULE,
+};
+
+static int __init pkttype_mt_init(void)
+{
+ return xt_register_match(&pkttype_mt_reg);
+}
+
+static void __exit pkttype_mt_exit(void)
+{
+ xt_unregister_match(&pkttype_mt_reg);
+}
+
+module_init(pkttype_mt_init);
+module_exit(pkttype_mt_exit);
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
new file mode 100644
index 000000000..f23e97bb4
--- /dev/null
+++ b/net/netfilter/xt_policy.c
@@ -0,0 +1,188 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_policy.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: IPsec policy match");
+MODULE_LICENSE("GPL");
+
+static inline bool
+xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m,
+ const union nf_inet_addr *a2, unsigned short family)
+{
+ switch (family) {
+ case NFPROTO_IPV4:
+ return ((a1->ip ^ a2->ip) & m->ip) == 0;
+ case NFPROTO_IPV6:
+ return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0;
+ }
+ return false;
+}
+
+static bool
+match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e,
+ unsigned short family)
+{
+#define MATCH_ADDR(x,y,z) (!e->match.x || \
+ (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \
+ ^ e->invert.x))
+#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+ return MATCH_ADDR(saddr, smask, &x->props.saddr) &&
+ MATCH_ADDR(daddr, dmask, &x->id.daddr) &&
+ MATCH(proto, x->id.proto) &&
+ MATCH(mode, x->props.mode) &&
+ MATCH(spi, x->id.spi) &&
+ MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info,
+ unsigned short family)
+{
+ const struct xt_policy_elem *e;
+ const struct sec_path *sp = skb->sp;
+ int strict = info->flags & XT_POLICY_MATCH_STRICT;
+ int i, pos;
+
+ if (sp == NULL)
+ return -1;
+ if (strict && info->len != sp->len)
+ return 0;
+
+ for (i = sp->len - 1; i >= 0; i--) {
+ pos = strict ? i - sp->len + 1 : 0;
+ if (pos >= info->len)
+ return 0;
+ e = &info->pol[pos];
+
+ if (match_xfrm_state(sp->xvec[i], e, family)) {
+ if (!strict)
+ return 1;
+ } else if (strict)
+ return 0;
+ }
+
+ return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
+ unsigned short family)
+{
+ const struct xt_policy_elem *e;
+ const struct dst_entry *dst = skb_dst(skb);
+ int strict = info->flags & XT_POLICY_MATCH_STRICT;
+ int i, pos;
+
+ if (dst->xfrm == NULL)
+ return -1;
+
+ for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+ pos = strict ? i : 0;
+ if (pos >= info->len)
+ return 0;
+ e = &info->pol[pos];
+
+ if (match_xfrm_state(dst->xfrm, e, family)) {
+ if (!strict)
+ return 1;
+ } else if (strict)
+ return 0;
+ }
+
+ return strict ? i == info->len : 0;
+}
+
+static bool
+policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_policy_info *info = par->matchinfo;
+ int ret;
+
+ if (info->flags & XT_POLICY_MATCH_IN)
+ ret = match_policy_in(skb, info, par->family);
+ else
+ ret = match_policy_out(skb, info, par->family);
+
+ if (ret < 0)
+ ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
+ else if (info->flags & XT_POLICY_MATCH_NONE)
+ ret = false;
+
+ return ret;
+}
+
+static int policy_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_policy_info *info = par->matchinfo;
+
+ if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) {
+ pr_info("neither incoming nor outgoing policy selected\n");
+ return -EINVAL;
+ }
+ if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
+ pr_info("output policy not valid in PREROUTING and INPUT\n");
+ return -EINVAL;
+ }
+ if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) {
+ pr_info("input policy not valid in POSTROUTING and OUTPUT\n");
+ return -EINVAL;
+ }
+ if (info->len > XT_POLICY_MAX_ELEM) {
+ pr_info("too many policy elements\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match policy_mt_reg[] __read_mostly = {
+ {
+ .name = "policy",
+ .family = NFPROTO_IPV4,
+ .checkentry = policy_mt_check,
+ .match = policy_mt,
+ .matchsize = sizeof(struct xt_policy_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "policy",
+ .family = NFPROTO_IPV6,
+ .checkentry = policy_mt_check,
+ .match = policy_mt,
+ .matchsize = sizeof(struct xt_policy_info),
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init policy_mt_init(void)
+{
+ return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg));
+}
+
+static void __exit policy_mt_exit(void)
+{
+ xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg));
+}
+
+module_init(policy_mt_init);
+module_exit(policy_mt_exit);
+MODULE_ALIAS("ipt_policy");
+MODULE_ALIAS("ip6t_policy");
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 000000000..44c8eb4c9
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,90 @@
+/*
+ * netfilter module to enforce network quotas
+ *
+ * Sam Johnston <samj@samj.net>
+ */
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota.h>
+#include <linux/module.h>
+
+struct xt_quota_priv {
+ spinlock_t lock;
+ uint64_t quota;
+};
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_DESCRIPTION("Xtables: countdown quota match");
+MODULE_ALIAS("ipt_quota");
+MODULE_ALIAS("ip6t_quota");
+
+static bool
+quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct xt_quota_info *q = (void *)par->matchinfo;
+ struct xt_quota_priv *priv = q->master;
+ bool ret = q->flags & XT_QUOTA_INVERT;
+
+ spin_lock_bh(&priv->lock);
+ if (priv->quota >= skb->len) {
+ priv->quota -= skb->len;
+ ret = !ret;
+ } else {
+ /* we do not allow even small packets from now on */
+ priv->quota = 0;
+ }
+ spin_unlock_bh(&priv->lock);
+
+ return ret;
+}
+
+static int quota_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_quota_info *q = par->matchinfo;
+
+ if (q->flags & ~XT_QUOTA_MASK)
+ return -EINVAL;
+
+ q->master = kmalloc(sizeof(*q->master), GFP_KERNEL);
+ if (q->master == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&q->master->lock);
+ q->master->quota = q->quota;
+ return 0;
+}
+
+static void quota_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_quota_info *q = par->matchinfo;
+
+ kfree(q->master);
+}
+
+static struct xt_match quota_mt_reg __read_mostly = {
+ .name = "quota",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = quota_mt,
+ .checkentry = quota_mt_check,
+ .destroy = quota_mt_destroy,
+ .matchsize = sizeof(struct xt_quota_info),
+ .me = THIS_MODULE,
+};
+
+static int __init quota_mt_init(void)
+{
+ return xt_register_match(&quota_mt_reg);
+}
+
+static void __exit quota_mt_exit(void)
+{
+ xt_unregister_match(&quota_mt_reg);
+}
+
+module_init(quota_mt_init);
+module_exit(quota_mt_exit);
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
new file mode 100644
index 000000000..7720b036d
--- /dev/null
+++ b/net/netfilter/xt_rateest.c
@@ -0,0 +1,157 @@
+/*
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/gen_stats.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_rateest.h>
+#include <net/netfilter/xt_rateest.h>
+
+
+static bool
+xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_rateest_match_info *info = par->matchinfo;
+ struct gnet_stats_rate_est64 *r;
+ u_int32_t bps1, bps2, pps1, pps2;
+ bool ret = true;
+
+ spin_lock_bh(&info->est1->lock);
+ r = &info->est1->rstats;
+ if (info->flags & XT_RATEEST_MATCH_DELTA) {
+ bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
+ pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+ } else {
+ bps1 = r->bps;
+ pps1 = r->pps;
+ }
+ spin_unlock_bh(&info->est1->lock);
+
+ if (info->flags & XT_RATEEST_MATCH_ABS) {
+ bps2 = info->bps2;
+ pps2 = info->pps2;
+ } else {
+ spin_lock_bh(&info->est2->lock);
+ r = &info->est2->rstats;
+ if (info->flags & XT_RATEEST_MATCH_DELTA) {
+ bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
+ pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+ } else {
+ bps2 = r->bps;
+ pps2 = r->pps;
+ }
+ spin_unlock_bh(&info->est2->lock);
+ }
+
+ switch (info->mode) {
+ case XT_RATEEST_MATCH_LT:
+ if (info->flags & XT_RATEEST_MATCH_BPS)
+ ret &= bps1 < bps2;
+ if (info->flags & XT_RATEEST_MATCH_PPS)
+ ret &= pps1 < pps2;
+ break;
+ case XT_RATEEST_MATCH_GT:
+ if (info->flags & XT_RATEEST_MATCH_BPS)
+ ret &= bps1 > bps2;
+ if (info->flags & XT_RATEEST_MATCH_PPS)
+ ret &= pps1 > pps2;
+ break;
+ case XT_RATEEST_MATCH_EQ:
+ if (info->flags & XT_RATEEST_MATCH_BPS)
+ ret &= bps1 == bps2;
+ if (info->flags & XT_RATEEST_MATCH_PPS)
+ ret &= pps1 == pps2;
+ break;
+ }
+
+ ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false;
+ return ret;
+}
+
+static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_rateest_match_info *info = par->matchinfo;
+ struct xt_rateest *est1, *est2;
+ int ret = -EINVAL;
+
+ if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS |
+ XT_RATEEST_MATCH_REL)) != 1)
+ goto err1;
+
+ if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS)))
+ goto err1;
+
+ switch (info->mode) {
+ case XT_RATEEST_MATCH_EQ:
+ case XT_RATEEST_MATCH_LT:
+ case XT_RATEEST_MATCH_GT:
+ break;
+ default:
+ goto err1;
+ }
+
+ ret = -ENOENT;
+ est1 = xt_rateest_lookup(info->name1);
+ if (!est1)
+ goto err1;
+
+ est2 = NULL;
+ if (info->flags & XT_RATEEST_MATCH_REL) {
+ est2 = xt_rateest_lookup(info->name2);
+ if (!est2)
+ goto err2;
+ }
+
+ info->est1 = est1;
+ info->est2 = est2;
+ return 0;
+
+err2:
+ xt_rateest_put(est1);
+err1:
+ return ret;
+}
+
+static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_rateest_match_info *info = par->matchinfo;
+
+ xt_rateest_put(info->est1);
+ if (info->est2)
+ xt_rateest_put(info->est2);
+}
+
+static struct xt_match xt_rateest_mt_reg __read_mostly = {
+ .name = "rateest",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = xt_rateest_mt,
+ .checkentry = xt_rateest_mt_checkentry,
+ .destroy = xt_rateest_mt_destroy,
+ .matchsize = sizeof(struct xt_rateest_match_info),
+ .me = THIS_MODULE,
+};
+
+static int __init xt_rateest_mt_init(void)
+{
+ return xt_register_match(&xt_rateest_mt_reg);
+}
+
+static void __exit xt_rateest_mt_fini(void)
+{
+ xt_unregister_match(&xt_rateest_mt_reg);
+}
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("xtables rate estimator match");
+MODULE_ALIAS("ipt_rateest");
+MODULE_ALIAS("ip6t_rateest");
+module_init(xt_rateest_mt_init);
+module_exit(xt_rateest_mt_fini);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
new file mode 100644
index 000000000..459a7b256
--- /dev/null
+++ b/net/netfilter/xt_realm.c
@@ -0,0 +1,54 @@
+/* IP tables module for matching the routing realm
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/xt_realm.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Routing realm match");
+MODULE_ALIAS("ipt_realm");
+
+static bool
+realm_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_realm_info *info = par->matchinfo;
+ const struct dst_entry *dst = skb_dst(skb);
+
+ return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
+}
+
+static struct xt_match realm_mt_reg __read_mostly = {
+ .name = "realm",
+ .match = realm_mt,
+ .matchsize = sizeof(struct xt_realm_info),
+ .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
+ .family = NFPROTO_UNSPEC,
+ .me = THIS_MODULE
+};
+
+static int __init realm_mt_init(void)
+{
+ return xt_register_match(&realm_mt_reg);
+}
+
+static void __exit realm_mt_exit(void)
+{
+ xt_unregister_match(&realm_mt_reg);
+}
+
+module_init(realm_mt_init);
+module_exit(realm_mt_exit);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
new file mode 100644
index 000000000..45e1b30e4
--- /dev/null
+++ b/net/netfilter/xt_recent.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This is a replacement of the old ipt_recent module, which carried the
+ * following copyright notice:
+ *
+ * Author: Stephen Frost <sfrost@snowman.net>
+ * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_recent.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_recent");
+MODULE_ALIAS("ip6t_recent");
+
+static unsigned int ip_list_tot __read_mostly = 100;
+static unsigned int ip_list_hash_size __read_mostly;
+static unsigned int ip_list_perms __read_mostly = 0644;
+static unsigned int ip_list_uid __read_mostly;
+static unsigned int ip_list_gid __read_mostly;
+module_param(ip_list_tot, uint, 0400);
+module_param(ip_list_hash_size, uint, 0400);
+module_param(ip_list_perms, uint, 0400);
+module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR);
+module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
+MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files");
+
+/* retained for backwards compatibility */
+static unsigned int ip_pkt_list_tot __read_mostly;
+module_param(ip_pkt_list_tot, uint, 0400);
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
+
+#define XT_RECENT_MAX_NSTAMPS 256
+
+struct recent_entry {
+ struct list_head list;
+ struct list_head lru_list;
+ union nf_inet_addr addr;
+ u_int16_t family;
+ u_int8_t ttl;
+ u_int8_t index;
+ u_int16_t nstamps;
+ unsigned long stamps[0];
+};
+
+struct recent_table {
+ struct list_head list;
+ char name[XT_RECENT_NAME_LEN];
+ union nf_inet_addr mask;
+ unsigned int refcnt;
+ unsigned int entries;
+ u8 nstamps_max_mask;
+ struct list_head lru_list;
+ struct list_head iphash[0];
+};
+
+struct recent_net {
+ struct list_head tables;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *xt_recent;
+#endif
+};
+
+static int recent_net_id __read_mostly;
+
+static inline struct recent_net *recent_pernet(struct net *net)
+{
+ return net_generic(net, recent_net_id);
+}
+
+static DEFINE_SPINLOCK(recent_lock);
+static DEFINE_MUTEX(recent_mutex);
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations recent_old_fops, recent_mt_fops;
+#endif
+
+static u_int32_t hash_rnd __read_mostly;
+static bool hash_rnd_inited __read_mostly;
+
+static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
+{
+ return jhash_1word((__force u32)addr->ip, hash_rnd) &
+ (ip_list_hash_size - 1);
+}
+
+static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr)
+{
+ return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) &
+ (ip_list_hash_size - 1);
+}
+
+static struct recent_entry *
+recent_entry_lookup(const struct recent_table *table,
+ const union nf_inet_addr *addrp, u_int16_t family,
+ u_int8_t ttl)
+{
+ struct recent_entry *e;
+ unsigned int h;
+
+ if (family == NFPROTO_IPV4)
+ h = recent_entry_hash4(addrp);
+ else
+ h = recent_entry_hash6(addrp);
+
+ list_for_each_entry(e, &table->iphash[h], list)
+ if (e->family == family &&
+ memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 &&
+ (ttl == e->ttl || ttl == 0 || e->ttl == 0))
+ return e;
+ return NULL;
+}
+
+static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
+{
+ list_del(&e->list);
+ list_del(&e->lru_list);
+ kfree(e);
+ t->entries--;
+}
+
+/*
+ * Drop entries with timestamps older then 'time'.
+ */
+static void recent_entry_reap(struct recent_table *t, unsigned long time)
+{
+ struct recent_entry *e;
+
+ /*
+ * The head of the LRU list is always the oldest entry.
+ */
+ e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
+
+ /*
+ * The last time stamp is the most recent.
+ */
+ if (time_after(time, e->stamps[e->index-1]))
+ recent_entry_remove(t, e);
+}
+
+static struct recent_entry *
+recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr,
+ u_int16_t family, u_int8_t ttl)
+{
+ struct recent_entry *e;
+ unsigned int nstamps_max = t->nstamps_max_mask;
+
+ if (t->entries >= ip_list_tot) {
+ e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
+ recent_entry_remove(t, e);
+ }
+
+ nstamps_max += 1;
+ e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * nstamps_max,
+ GFP_ATOMIC);
+ if (e == NULL)
+ return NULL;
+ memcpy(&e->addr, addr, sizeof(e->addr));
+ e->ttl = ttl;
+ e->stamps[0] = jiffies;
+ e->nstamps = 1;
+ e->index = 1;
+ e->family = family;
+ if (family == NFPROTO_IPV4)
+ list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]);
+ else
+ list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]);
+ list_add_tail(&e->lru_list, &t->lru_list);
+ t->entries++;
+ return e;
+}
+
+static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
+{
+ e->index &= t->nstamps_max_mask;
+ e->stamps[e->index++] = jiffies;
+ if (e->index > e->nstamps)
+ e->nstamps = e->index;
+ list_move_tail(&e->lru_list, &t->lru_list);
+}
+
+static struct recent_table *recent_table_lookup(struct recent_net *recent_net,
+ const char *name)
+{
+ struct recent_table *t;
+
+ list_for_each_entry(t, &recent_net->tables, list)
+ if (!strcmp(t->name, name))
+ return t;
+ return NULL;
+}
+
+static void recent_table_flush(struct recent_table *t)
+{
+ struct recent_entry *e, *next;
+ unsigned int i;
+
+ for (i = 0; i < ip_list_hash_size; i++)
+ list_for_each_entry_safe(e, next, &t->iphash[i], list)
+ recent_entry_remove(t, e);
+}
+
+static bool
+recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct net *net = dev_net(par->in ? par->in : par->out);
+ struct recent_net *recent_net = recent_pernet(net);
+ const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
+ struct recent_table *t;
+ struct recent_entry *e;
+ union nf_inet_addr addr = {}, addr_mask;
+ u_int8_t ttl;
+ bool ret = info->invert;
+
+ if (par->family == NFPROTO_IPV4) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (info->side == XT_RECENT_DEST)
+ addr.ip = iph->daddr;
+ else
+ addr.ip = iph->saddr;
+
+ ttl = iph->ttl;
+ } else {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ if (info->side == XT_RECENT_DEST)
+ memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6));
+ else
+ memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6));
+
+ ttl = iph->hop_limit;
+ }
+
+ /* use TTL as seen before forwarding */
+ if (par->out != NULL && skb->sk == NULL)
+ ttl++;
+
+ spin_lock_bh(&recent_lock);
+ t = recent_table_lookup(recent_net, info->name);
+
+ nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
+
+ e = recent_entry_lookup(t, &addr_mask, par->family,
+ (info->check_set & XT_RECENT_TTL) ? ttl : 0);
+ if (e == NULL) {
+ if (!(info->check_set & XT_RECENT_SET))
+ goto out;
+ e = recent_entry_init(t, &addr_mask, par->family, ttl);
+ if (e == NULL)
+ par->hotdrop = true;
+ ret = !ret;
+ goto out;
+ }
+
+ if (info->check_set & XT_RECENT_SET)
+ ret = !ret;
+ else if (info->check_set & XT_RECENT_REMOVE) {
+ recent_entry_remove(t, e);
+ ret = !ret;
+ } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) {
+ unsigned long time = jiffies - info->seconds * HZ;
+ unsigned int i, hits = 0;
+
+ for (i = 0; i < e->nstamps; i++) {
+ if (info->seconds && time_after(time, e->stamps[i]))
+ continue;
+ if (!info->hit_count || ++hits >= info->hit_count) {
+ ret = !ret;
+ break;
+ }
+ }
+
+ /* info->seconds must be non-zero */
+ if (info->check_set & XT_RECENT_REAP)
+ recent_entry_reap(t, time);
+ }
+
+ if (info->check_set & XT_RECENT_SET ||
+ (info->check_set & XT_RECENT_UPDATE && ret)) {
+ recent_entry_update(t, e);
+ e->ttl = ttl;
+ }
+out:
+ spin_unlock_bh(&recent_lock);
+ return ret;
+}
+
+static void recent_table_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int recent_mt_check(const struct xt_mtchk_param *par,
+ const struct xt_recent_mtinfo_v1 *info)
+{
+ struct recent_net *recent_net = recent_pernet(par->net);
+ struct recent_table *t;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde;
+ kuid_t uid;
+ kgid_t gid;
+#endif
+ unsigned int nstamp_mask;
+ unsigned int i;
+ int ret = -EINVAL;
+ size_t sz;
+
+ if (unlikely(!hash_rnd_inited)) {
+ get_random_bytes(&hash_rnd, sizeof(hash_rnd));
+ hash_rnd_inited = true;
+ }
+ if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
+ pr_info("Unsupported user space flags (%08x)\n",
+ info->check_set);
+ return -EINVAL;
+ }
+ if (hweight8(info->check_set &
+ (XT_RECENT_SET | XT_RECENT_REMOVE |
+ XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1)
+ return -EINVAL;
+ if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) &&
+ (info->seconds || info->hit_count ||
+ (info->check_set & XT_RECENT_MODIFIERS)))
+ return -EINVAL;
+ if ((info->check_set & XT_RECENT_REAP) && !info->seconds)
+ return -EINVAL;
+ if (info->hit_count >= XT_RECENT_MAX_NSTAMPS) {
+ pr_info("hitcount (%u) is larger than allowed maximum (%u)\n",
+ info->hit_count, XT_RECENT_MAX_NSTAMPS - 1);
+ return -EINVAL;
+ }
+ if (info->name[0] == '\0' ||
+ strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
+ return -EINVAL;
+
+ if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot)
+ nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1;
+ else if (info->hit_count)
+ nstamp_mask = roundup_pow_of_two(info->hit_count) - 1;
+ else
+ nstamp_mask = 32 - 1;
+
+ mutex_lock(&recent_mutex);
+ t = recent_table_lookup(recent_net, info->name);
+ if (t != NULL) {
+ if (nstamp_mask > t->nstamps_max_mask) {
+ spin_lock_bh(&recent_lock);
+ recent_table_flush(t);
+ t->nstamps_max_mask = nstamp_mask;
+ spin_unlock_bh(&recent_lock);
+ }
+
+ t->refcnt++;
+ ret = 0;
+ goto out;
+ }
+
+ sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size;
+ if (sz <= PAGE_SIZE)
+ t = kzalloc(sz, GFP_KERNEL);
+ else
+ t = vzalloc(sz);
+ if (t == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->refcnt = 1;
+ t->nstamps_max_mask = nstamp_mask;
+
+ memcpy(&t->mask, &info->mask, sizeof(t->mask));
+ strcpy(t->name, info->name);
+ INIT_LIST_HEAD(&t->lru_list);
+ for (i = 0; i < ip_list_hash_size; i++)
+ INIT_LIST_HEAD(&t->iphash[i]);
+#ifdef CONFIG_PROC_FS
+ uid = make_kuid(&init_user_ns, ip_list_uid);
+ gid = make_kgid(&init_user_ns, ip_list_gid);
+ if (!uid_valid(uid) || !gid_valid(gid)) {
+ recent_table_free(t);
+ ret = -EINVAL;
+ goto out;
+ }
+ pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
+ &recent_mt_fops, t);
+ if (pde == NULL) {
+ recent_table_free(t);
+ ret = -ENOMEM;
+ goto out;
+ }
+ proc_set_user(pde, uid, gid);
+#endif
+ spin_lock_bh(&recent_lock);
+ list_add_tail(&t->list, &recent_net->tables);
+ spin_unlock_bh(&recent_lock);
+ ret = 0;
+out:
+ mutex_unlock(&recent_mutex);
+ return ret;
+}
+
+static int recent_mt_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo;
+ struct xt_recent_mtinfo_v1 info_v1;
+
+ /* Copy revision 0 structure to revision 1 */
+ memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo));
+ /* Set default mask to ensure backward compatible behaviour */
+ memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all));
+
+ return recent_mt_check(par, &info_v1);
+}
+
+static int recent_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ return recent_mt_check(par, par->matchinfo);
+}
+
+static void recent_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ struct recent_net *recent_net = recent_pernet(par->net);
+ const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
+ struct recent_table *t;
+
+ mutex_lock(&recent_mutex);
+ t = recent_table_lookup(recent_net, info->name);
+ if (--t->refcnt == 0) {
+ spin_lock_bh(&recent_lock);
+ list_del(&t->list);
+ spin_unlock_bh(&recent_lock);
+#ifdef CONFIG_PROC_FS
+ if (recent_net->xt_recent != NULL)
+ remove_proc_entry(t->name, recent_net->xt_recent);
+#endif
+ recent_table_flush(t);
+ recent_table_free(t);
+ }
+ mutex_unlock(&recent_mutex);
+}
+
+#ifdef CONFIG_PROC_FS
+struct recent_iter_state {
+ const struct recent_table *table;
+ unsigned int bucket;
+};
+
+static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(recent_lock)
+{
+ struct recent_iter_state *st = seq->private;
+ const struct recent_table *t = st->table;
+ struct recent_entry *e;
+ loff_t p = *pos;
+
+ spin_lock_bh(&recent_lock);
+
+ for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
+ list_for_each_entry(e, &t->iphash[st->bucket], list)
+ if (p-- == 0)
+ return e;
+ return NULL;
+}
+
+static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct recent_iter_state *st = seq->private;
+ const struct recent_table *t = st->table;
+ const struct recent_entry *e = v;
+ const struct list_head *head = e->list.next;
+
+ while (head == &t->iphash[st->bucket]) {
+ if (++st->bucket >= ip_list_hash_size)
+ return NULL;
+ head = t->iphash[st->bucket].next;
+ }
+ (*pos)++;
+ return list_entry(head, struct recent_entry, list);
+}
+
+static void recent_seq_stop(struct seq_file *s, void *v)
+ __releases(recent_lock)
+{
+ spin_unlock_bh(&recent_lock);
+}
+
+static int recent_seq_show(struct seq_file *seq, void *v)
+{
+ const struct recent_entry *e = v;
+ struct recent_iter_state *st = seq->private;
+ const struct recent_table *t = st->table;
+ unsigned int i;
+
+ i = (e->index - 1) & t->nstamps_max_mask;
+
+ if (e->family == NFPROTO_IPV4)
+ seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u",
+ &e->addr.ip, e->ttl, e->stamps[i], e->index);
+ else
+ seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u",
+ &e->addr.in6, e->ttl, e->stamps[i], e->index);
+ for (i = 0; i < e->nstamps; i++)
+ seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
+ seq_printf(seq, "\n");
+ return 0;
+}
+
+static const struct seq_operations recent_seq_ops = {
+ .start = recent_seq_start,
+ .next = recent_seq_next,
+ .stop = recent_seq_stop,
+ .show = recent_seq_show,
+};
+
+static int recent_seq_open(struct inode *inode, struct file *file)
+{
+ struct recent_iter_state *st;
+
+ st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
+ if (st == NULL)
+ return -ENOMEM;
+
+ st->table = PDE_DATA(inode);
+ return 0;
+}
+
+static ssize_t
+recent_mt_proc_write(struct file *file, const char __user *input,
+ size_t size, loff_t *loff)
+{
+ struct recent_table *t = PDE_DATA(file_inode(file));
+ struct recent_entry *e;
+ char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+ const char *c = buf;
+ union nf_inet_addr addr = {};
+ u_int16_t family;
+ bool add, succ;
+
+ if (size == 0)
+ return 0;
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+ if (copy_from_user(buf, input, size) != 0)
+ return -EFAULT;
+
+ /* Strict protocol! */
+ if (*loff != 0)
+ return -ESPIPE;
+ switch (*c) {
+ case '/': /* flush table */
+ spin_lock_bh(&recent_lock);
+ recent_table_flush(t);
+ spin_unlock_bh(&recent_lock);
+ return size;
+ case '-': /* remove address */
+ add = false;
+ break;
+ case '+': /* add address */
+ add = true;
+ break;
+ default:
+ pr_info("Need \"+ip\", \"-ip\" or \"/\"\n");
+ return -EINVAL;
+ }
+
+ ++c;
+ --size;
+ if (strnchr(c, size, ':') != NULL) {
+ family = NFPROTO_IPV6;
+ succ = in6_pton(c, size, (void *)&addr, '\n', NULL);
+ } else {
+ family = NFPROTO_IPV4;
+ succ = in4_pton(c, size, (void *)&addr, '\n', NULL);
+ }
+
+ if (!succ) {
+ pr_info("illegal address written to procfs\n");
+ return -EINVAL;
+ }
+
+ spin_lock_bh(&recent_lock);
+ e = recent_entry_lookup(t, &addr, family, 0);
+ if (e == NULL) {
+ if (add)
+ recent_entry_init(t, &addr, family, 0);
+ } else {
+ if (add)
+ recent_entry_update(t, e);
+ else
+ recent_entry_remove(t, e);
+ }
+ spin_unlock_bh(&recent_lock);
+ /* Note we removed one above */
+ *loff += size + 1;
+ return size + 1;
+}
+
+static const struct file_operations recent_mt_fops = {
+ .open = recent_seq_open,
+ .read = seq_read,
+ .write = recent_mt_proc_write,
+ .release = seq_release_private,
+ .owner = THIS_MODULE,
+ .llseek = seq_lseek,
+};
+
+static int __net_init recent_proc_net_init(struct net *net)
+{
+ struct recent_net *recent_net = recent_pernet(net);
+
+ recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
+ if (!recent_net->xt_recent)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __net_exit recent_proc_net_exit(struct net *net)
+{
+ struct recent_net *recent_net = recent_pernet(net);
+ struct recent_table *t;
+
+ /* recent_net_exit() is called before recent_mt_destroy(). Make sure
+ * that the parent xt_recent proc entry is is empty before trying to
+ * remove it.
+ */
+ spin_lock_bh(&recent_lock);
+ list_for_each_entry(t, &recent_net->tables, list)
+ remove_proc_entry(t->name, recent_net->xt_recent);
+
+ recent_net->xt_recent = NULL;
+ spin_unlock_bh(&recent_lock);
+
+ remove_proc_entry("xt_recent", net->proc_net);
+}
+#else
+static inline int recent_proc_net_init(struct net *net)
+{
+ return 0;
+}
+
+static inline void recent_proc_net_exit(struct net *net)
+{
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init recent_net_init(struct net *net)
+{
+ struct recent_net *recent_net = recent_pernet(net);
+
+ INIT_LIST_HEAD(&recent_net->tables);
+ return recent_proc_net_init(net);
+}
+
+static void __net_exit recent_net_exit(struct net *net)
+{
+ recent_proc_net_exit(net);
+}
+
+static struct pernet_operations recent_net_ops = {
+ .init = recent_net_init,
+ .exit = recent_net_exit,
+ .id = &recent_net_id,
+ .size = sizeof(struct recent_net),
+};
+
+static struct xt_match recent_mt_reg[] __read_mostly = {
+ {
+ .name = "recent",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo),
+ .checkentry = recent_mt_check_v0,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "recent",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo),
+ .checkentry = recent_mt_check_v0,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "recent",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo_v1),
+ .checkentry = recent_mt_check_v1,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "recent",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = recent_mt,
+ .matchsize = sizeof(struct xt_recent_mtinfo_v1),
+ .checkentry = recent_mt_check_v1,
+ .destroy = recent_mt_destroy,
+ .me = THIS_MODULE,
+ }
+};
+
+static int __init recent_mt_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON_NOT_POWER_OF_2(XT_RECENT_MAX_NSTAMPS);
+
+ if (!ip_list_tot || ip_pkt_list_tot >= XT_RECENT_MAX_NSTAMPS)
+ return -EINVAL;
+ ip_list_hash_size = 1 << fls(ip_list_tot);
+
+ err = register_pernet_subsys(&recent_net_ops);
+ if (err)
+ return err;
+ err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ if (err)
+ unregister_pernet_subsys(&recent_net_ops);
+ return err;
+}
+
+static void __exit recent_mt_exit(void)
+{
+ xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ unregister_pernet_subsys(&recent_net_ops);
+}
+
+module_init(recent_mt_init);
+module_exit(recent_mt_exit);
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
new file mode 100644
index 000000000..8fd324116
--- /dev/null
+++ b/net/netfilter/xt_repldata.h
@@ -0,0 +1,47 @@
+/*
+ * Today's hack: quantum tunneling in structs
+ *
+ * 'entries' and 'term' are never anywhere referenced by word in code. In fact,
+ * they serve as the hanging-off data accessed through repl.data[].
+ */
+
+/* tbl has the following structure equivalent, but is C99 compliant:
+ * struct {
+ * struct type##_replace repl;
+ * struct type##_standard entries[nhooks];
+ * struct type##_error term;
+ * } *tbl;
+ */
+
+#define xt_alloc_initial_table(type, typ2) ({ \
+ unsigned int hook_mask = info->valid_hooks; \
+ unsigned int nhooks = hweight32(hook_mask); \
+ unsigned int bytes = 0, hooknum = 0, i = 0; \
+ struct { \
+ struct type##_replace repl; \
+ struct type##_standard entries[]; \
+ } *tbl; \
+ struct type##_error *term; \
+ size_t term_offset = (offsetof(typeof(*tbl), entries[nhooks]) + \
+ __alignof__(*term) - 1) & ~(__alignof__(*term) - 1); \
+ tbl = kzalloc(term_offset + sizeof(*term), GFP_KERNEL); \
+ if (tbl == NULL) \
+ return NULL; \
+ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
+ strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+ *term = (struct type##_error)typ2##_ERROR_INIT; \
+ tbl->repl.valid_hooks = hook_mask; \
+ tbl->repl.num_entries = nhooks + 1; \
+ tbl->repl.size = nhooks * sizeof(struct type##_standard) + \
+ sizeof(struct type##_error); \
+ for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
+ if (!(hook_mask & 1)) \
+ continue; \
+ tbl->repl.hook_entry[hooknum] = bytes; \
+ tbl->repl.underflow[hooknum] = bytes; \
+ tbl->entries[i++] = (struct type##_standard) \
+ typ2##_STANDARD_INIT(NF_ACCEPT); \
+ bytes += sizeof(struct type##_standard); \
+ } \
+ tbl; \
+})
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
new file mode 100644
index 000000000..ef36a56a0
--- /dev/null
+++ b/net/netfilter/xt_sctp.c
@@ -0,0 +1,198 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <linux/sctp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_sctp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Xtables: SCTP protocol packet match");
+MODULE_ALIAS("ipt_sctp");
+MODULE_ALIAS("ip6t_sctp");
+
+#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
+
+static bool
+match_flags(const struct xt_sctp_flag_info *flag_info,
+ const int flag_count,
+ u_int8_t chunktype,
+ u_int8_t chunkflags)
+{
+ int i;
+
+ for (i = 0; i < flag_count; i++)
+ if (flag_info[i].chunktype == chunktype)
+ return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
+
+ return true;
+}
+
+static inline bool
+match_packet(const struct sk_buff *skb,
+ unsigned int offset,
+ const struct xt_sctp_info *info,
+ bool *hotdrop)
+{
+ u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
+ const sctp_chunkhdr_t *sch;
+ sctp_chunkhdr_t _sch;
+ int chunk_match_type = info->chunk_match_type;
+ const struct xt_sctp_flag_info *flag_info = info->flag_info;
+ int flag_count = info->flag_count;
+
+#ifdef DEBUG
+ int i = 0;
+#endif
+
+ if (chunk_match_type == SCTP_CHUNK_MATCH_ALL)
+ SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap);
+
+ do {
+ sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+ if (sch == NULL || sch->length == 0) {
+ pr_debug("Dropping invalid SCTP packet.\n");
+ *hotdrop = true;
+ return false;
+ }
+#ifdef DEBUG
+ pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d"
+ "\tflags: %x\n",
+ ++i, offset, sch->type, htons(sch->length),
+ sch->flags);
+#endif
+ offset += WORD_ROUND(ntohs(sch->length));
+
+ pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
+
+ if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) {
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ANY:
+ if (match_flags(flag_info, flag_count,
+ sch->type, sch->flags)) {
+ return true;
+ }
+ break;
+
+ case SCTP_CHUNK_MATCH_ALL:
+ if (match_flags(flag_info, flag_count,
+ sch->type, sch->flags))
+ SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
+ break;
+
+ case SCTP_CHUNK_MATCH_ONLY:
+ if (!match_flags(flag_info, flag_count,
+ sch->type, sch->flags))
+ return false;
+ break;
+ }
+ } else {
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ONLY:
+ return false;
+ }
+ }
+ } while (offset < skb->len);
+
+ switch (chunk_match_type) {
+ case SCTP_CHUNK_MATCH_ALL:
+ return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy);
+ case SCTP_CHUNK_MATCH_ANY:
+ return false;
+ case SCTP_CHUNK_MATCH_ONLY:
+ return true;
+ }
+
+ /* This will never be reached, but required to stop compiler whine */
+ return false;
+}
+
+static bool
+sctp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_sctp_info *info = par->matchinfo;
+ const sctp_sctphdr_t *sh;
+ sctp_sctphdr_t _sh;
+
+ if (par->fragoff != 0) {
+ pr_debug("Dropping non-first fragment.. FIXME\n");
+ return false;
+ }
+
+ sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh);
+ if (sh == NULL) {
+ pr_debug("Dropping evil TCP offset=0 tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+ pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
+
+ return SCCHECK(ntohs(sh->source) >= info->spts[0]
+ && ntohs(sh->source) <= info->spts[1],
+ XT_SCTP_SRC_PORTS, info->flags, info->invflags)
+ && SCCHECK(ntohs(sh->dest) >= info->dpts[0]
+ && ntohs(sh->dest) <= info->dpts[1],
+ XT_SCTP_DEST_PORTS, info->flags, info->invflags)
+ && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t),
+ info, &par->hotdrop),
+ XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
+}
+
+static int sctp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_sctp_info *info = par->matchinfo;
+
+ if (info->flags & ~XT_SCTP_VALID_FLAGS)
+ return -EINVAL;
+ if (info->invflags & ~XT_SCTP_VALID_FLAGS)
+ return -EINVAL;
+ if (info->invflags & ~info->flags)
+ return -EINVAL;
+ if (!(info->flags & XT_SCTP_CHUNK_TYPES))
+ return 0;
+ if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL |
+ SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY))
+ return 0;
+ return -EINVAL;
+}
+
+static struct xt_match sctp_mt_reg[] __read_mostly = {
+ {
+ .name = "sctp",
+ .family = NFPROTO_IPV4,
+ .checkentry = sctp_mt_check,
+ .match = sctp_mt,
+ .matchsize = sizeof(struct xt_sctp_info),
+ .proto = IPPROTO_SCTP,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "sctp",
+ .family = NFPROTO_IPV6,
+ .checkentry = sctp_mt_check,
+ .match = sctp_mt,
+ .matchsize = sizeof(struct xt_sctp_info),
+ .proto = IPPROTO_SCTP,
+ .me = THIS_MODULE
+ },
+};
+
+static int __init sctp_mt_init(void)
+{
+ return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg));
+}
+
+static void __exit sctp_mt_exit(void)
+{
+ xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg));
+}
+
+module_init(sctp_mt_init);
+module_exit(sctp_mt_exit);
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
new file mode 100644
index 000000000..89045982e
--- /dev/null
+++ b/net/netfilter/xt_set.c
@@ -0,0 +1,741 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ * Patrick Schaaf <bof@bof.de>
+ * Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module which implements the set match and SET target
+ * for netfilter/iptables. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("Xtables: IP set match and target module");
+MODULE_ALIAS("xt_SET");
+MODULE_ALIAS("ipt_set");
+MODULE_ALIAS("ip6t_set");
+MODULE_ALIAS("ipt_SET");
+MODULE_ALIAS("ip6t_SET");
+
+static inline int
+match_set(ip_set_id_t index, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, int inv)
+{
+ if (ip_set_test(index, skb, par, opt))
+ inv = !inv;
+ return inv;
+}
+
+#define ADT_OPT(n, f, d, fs, cfs, t) \
+struct ip_set_adt_opt n = { \
+ .family = f, \
+ .dim = d, \
+ .flags = fs, \
+ .cmdflags = cfs, \
+ .ext.timeout = t, \
+}
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static bool
+set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v0 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+ info->match_set.u.compat.flags, 0, UINT_MAX);
+
+ return match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.u.compat.flags & IPSET_INV_MATCH);
+}
+
+static void
+compat_flags(struct xt_set_info_v0 *info)
+{
+ u_int8_t i;
+
+ /* Fill out compatibility data according to enum ip_set_kopt */
+ info->u.compat.dim = IPSET_DIM_ZERO;
+ if (info->u.flags[0] & IPSET_MATCH_INV)
+ info->u.compat.flags |= IPSET_INV_MATCH;
+ for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+ info->u.compat.dim++;
+ if (info->u.flags[i] & IPSET_SRC)
+ info->u.compat.flags |= (1<<info->u.compat.dim);
+ }
+}
+
+static int
+set_match_v0_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find set identified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warn("Protocol error: set match dimension is over the limit!\n");
+ ip_set_nfnl_put(par->net, info->match_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->match_set);
+
+ return 0;
+}
+
+static void
+set_match_v0_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v0 *info = par->matchinfo;
+
+ ip_set_nfnl_put(par->net, info->match_set.index);
+}
+
+/* Revision 1 match */
+
+static bool
+set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v1 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.dim,
+ info->match_set.flags, 0, UINT_MAX);
+
+ if (opt.flags & IPSET_RETURN_NOMATCH)
+ opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
+
+ return match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.flags & IPSET_INV_MATCH);
+}
+
+static int
+set_match_v1_checkentry(const struct xt_mtchk_param *par)
+{
+ struct xt_set_info_match_v1 *info = par->matchinfo;
+ ip_set_id_t index;
+
+ index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
+
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find set identified by id %u to match\n",
+ info->match_set.index);
+ return -ENOENT;
+ }
+ if (info->match_set.dim > IPSET_DIM_MAX) {
+ pr_warn("Protocol error: set match dimension is over the limit!\n");
+ ip_set_nfnl_put(par->net, info->match_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_match_v1_destroy(const struct xt_mtdtor_param *par)
+{
+ struct xt_set_info_match_v1 *info = par->matchinfo;
+
+ ip_set_nfnl_put(par->net, info->match_set.index);
+}
+
+/* Revision 3 match */
+
+static bool
+match_counter0(u64 counter, const struct ip_set_counter_match0 *info)
+{
+ switch (info->op) {
+ case IPSET_COUNTER_NONE:
+ return true;
+ case IPSET_COUNTER_EQ:
+ return counter == info->value;
+ case IPSET_COUNTER_NE:
+ return counter != info->value;
+ case IPSET_COUNTER_LT:
+ return counter < info->value;
+ case IPSET_COUNTER_GT:
+ return counter > info->value;
+ }
+ return false;
+}
+
+static bool
+set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v3 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.dim,
+ info->match_set.flags, info->flags, UINT_MAX);
+ int ret;
+
+ if (info->packets.op != IPSET_COUNTER_NONE ||
+ info->bytes.op != IPSET_COUNTER_NONE)
+ opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
+
+ ret = match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.flags & IPSET_INV_MATCH);
+
+ if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
+ return ret;
+
+ if (!match_counter0(opt.ext.packets, &info->packets))
+ return false;
+ return match_counter0(opt.ext.bytes, &info->bytes);
+}
+
+#define set_match_v3_checkentry set_match_v1_checkentry
+#define set_match_v3_destroy set_match_v1_destroy
+
+/* Revision 4 match */
+
+static bool
+match_counter(u64 counter, const struct ip_set_counter_match *info)
+{
+ switch (info->op) {
+ case IPSET_COUNTER_NONE:
+ return true;
+ case IPSET_COUNTER_EQ:
+ return counter == info->value;
+ case IPSET_COUNTER_NE:
+ return counter != info->value;
+ case IPSET_COUNTER_LT:
+ return counter < info->value;
+ case IPSET_COUNTER_GT:
+ return counter > info->value;
+ }
+ return false;
+}
+
+static bool
+set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_set_info_match_v4 *info = par->matchinfo;
+ ADT_OPT(opt, par->family, info->match_set.dim,
+ info->match_set.flags, info->flags, UINT_MAX);
+ int ret;
+
+ if (info->packets.op != IPSET_COUNTER_NONE ||
+ info->bytes.op != IPSET_COUNTER_NONE)
+ opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
+
+ ret = match_set(info->match_set.index, skb, par, &opt,
+ info->match_set.flags & IPSET_INV_MATCH);
+
+ if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
+ return ret;
+
+ if (!match_counter(opt.ext.packets, &info->packets))
+ return false;
+ return match_counter(opt.ext.bytes, &info->bytes);
+}
+
+#define set_match_v4_checkentry set_match_v1_checkentry
+#define set_match_v4_destroy set_match_v1_destroy
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static unsigned int
+set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+ info->add_set.u.compat.flags, 0, UINT_MAX);
+ ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+ info->del_set.u.compat.flags, 0, UINT_MAX);
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_v0_checkentry(const struct xt_tgchk_param *par)
+{
+ struct xt_set_info_target_v0 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
+ info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+ pr_warn("Protocol error: SET target dimension is over the limit!\n");
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ return -ERANGE;
+ }
+
+ /* Fill out compatibility data */
+ compat_flags(&info->add_set);
+ compat_flags(&info->del_set);
+
+ return 0;
+}
+
+static void
+set_target_v0_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v0 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+}
+
+/* Revision 1 target */
+
+static unsigned int
+set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, 0, UINT_MAX);
+ ADT_OPT(del_opt, par->family, info->del_set.dim,
+ info->del_set.flags, 0, UINT_MAX);
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+static int
+set_target_v1_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return -ENOENT;
+ }
+ }
+ if (info->add_set.dim > IPSET_DIM_MAX ||
+ info->del_set.dim > IPSET_DIM_MAX) {
+ pr_warn("Protocol error: SET target dimension is over the limit!\n");
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_target_v1_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v1 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+}
+
+/* Revision 2 target */
+
+static unsigned int
+set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v2 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, info->flags, info->timeout);
+ ADT_OPT(del_opt, par->family, info->del_set.dim,
+ info->del_set.flags, 0, UINT_MAX);
+
+ /* Normalize to fit into jiffies */
+ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
+ add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+
+ return XT_CONTINUE;
+}
+
+#define set_target_v2_checkentry set_target_v1_checkentry
+#define set_target_v2_destroy set_target_v1_destroy
+
+/* Revision 3 target */
+
+static unsigned int
+set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, info->flags, info->timeout);
+ ADT_OPT(del_opt, par->family, info->del_set.dim,
+ info->del_set.flags, 0, UINT_MAX);
+ ADT_OPT(map_opt, par->family, info->map_set.dim,
+ info->map_set.flags, 0, UINT_MAX);
+
+ int ret;
+
+ /* Normalize to fit into jiffies */
+ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
+ add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_add(info->add_set.index, skb, par, &add_opt);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_del(info->del_set.index, skb, par, &del_opt);
+ if (info->map_set.index != IPSET_INVALID_ID) {
+ map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK |
+ IPSET_FLAG_MAP_SKBPRIO |
+ IPSET_FLAG_MAP_SKBQUEUE);
+ ret = match_set(info->map_set.index, skb, par, &map_opt,
+ info->map_set.flags & IPSET_INV_MATCH);
+ if (!ret)
+ return XT_CONTINUE;
+ if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK)
+ skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask))
+ ^ (map_opt.ext.skbmark);
+ if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO)
+ skb->priority = map_opt.ext.skbprio;
+ if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) &&
+ skb->dev &&
+ skb->dev->real_num_tx_queues > map_opt.ext.skbqueue)
+ skb_set_queue_mapping(skb, map_opt.ext.skbqueue);
+ }
+ return XT_CONTINUE;
+}
+
+
+static int
+set_target_v3_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+ ip_set_id_t index;
+
+ if (info->add_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net,
+ info->add_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find add_set index %u as target\n",
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->del_set.index != IPSET_INVALID_ID) {
+ index = ip_set_nfnl_get_byindex(par->net,
+ info->del_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find del_set index %u as target\n",
+ info->del_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net,
+ info->add_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->map_set.index != IPSET_INVALID_ID) {
+ if (strncmp(par->table, "mangle", 7)) {
+ pr_warn("--map-set only usable from mangle table\n");
+ return -EINVAL;
+ }
+ if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
+ (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
+ !(par->hook_mask & (1 << NF_INET_FORWARD |
+ 1 << NF_INET_LOCAL_OUT |
+ 1 << NF_INET_POST_ROUTING))) {
+ pr_warn("mapping of prio or/and queue is allowed only"
+ "from OUTPUT/FORWARD/POSTROUTING chains\n");
+ return -EINVAL;
+ }
+ index = ip_set_nfnl_get_byindex(par->net,
+ info->map_set.index);
+ if (index == IPSET_INVALID_ID) {
+ pr_warn("Cannot find map_set index %u as target\n",
+ info->map_set.index);
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net,
+ info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net,
+ info->del_set.index);
+ return -ENOENT;
+ }
+ }
+
+ if (info->add_set.dim > IPSET_DIM_MAX ||
+ info->del_set.dim > IPSET_DIM_MAX ||
+ info->map_set.dim > IPSET_DIM_MAX) {
+ pr_warn("Protocol error: SET target dimension "
+ "is over the limit!\n");
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ if (info->map_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->map_set.index);
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static void
+set_target_v3_destroy(const struct xt_tgdtor_param *par)
+{
+ const struct xt_set_info_target_v3 *info = par->targinfo;
+
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+ if (info->map_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->map_set.index);
+}
+
+
+static struct xt_match set_matches[] __read_mostly = {
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 0,
+ .match = set_match_v0,
+ .matchsize = sizeof(struct xt_set_info_match_v0),
+ .checkentry = set_match_v0_checkentry,
+ .destroy = set_match_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* --return-nomatch flag support */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 2,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 2,
+ .match = set_match_v1,
+ .matchsize = sizeof(struct xt_set_info_match_v1),
+ .checkentry = set_match_v1_checkentry,
+ .destroy = set_match_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* counters support: update, match */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
+ /* new revision for counters support: update, match */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 4,
+ .match = set_match_v4,
+ .matchsize = sizeof(struct xt_set_info_match_v4),
+ .checkentry = set_match_v4_checkentry,
+ .destroy = set_match_v4_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 4,
+ .match = set_match_v4,
+ .matchsize = sizeof(struct xt_set_info_match_v4),
+ .checkentry = set_match_v4_checkentry,
+ .destroy = set_match_v4_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static struct xt_target set_targets[] __read_mostly = {
+ {
+ .name = "SET",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v0,
+ .targetsize = sizeof(struct xt_set_info_target_v0),
+ .checkentry = set_target_v0_checkentry,
+ .destroy = set_target_v0_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v1,
+ .targetsize = sizeof(struct xt_set_info_target_v1),
+ .checkentry = set_target_v1_checkentry,
+ .destroy = set_target_v1_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .target = set_target_v1,
+ .targetsize = sizeof(struct xt_set_info_target_v1),
+ .checkentry = set_target_v1_checkentry,
+ .destroy = set_target_v1_destroy,
+ .me = THIS_MODULE
+ },
+ /* --timeout and --exist flags support */
+ {
+ .name = "SET",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v2,
+ .targetsize = sizeof(struct xt_set_info_target_v2),
+ .checkentry = set_target_v2_checkentry,
+ .destroy = set_target_v2_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .target = set_target_v2,
+ .targetsize = sizeof(struct xt_set_info_target_v2),
+ .checkentry = set_target_v2_checkentry,
+ .destroy = set_target_v2_destroy,
+ .me = THIS_MODULE
+ },
+ /* --map-set support */
+ {
+ .name = "SET",
+ .revision = 3,
+ .family = NFPROTO_IPV4,
+ .target = set_target_v3,
+ .targetsize = sizeof(struct xt_set_info_target_v3),
+ .checkentry = set_target_v3_checkentry,
+ .destroy = set_target_v3_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "SET",
+ .revision = 3,
+ .family = NFPROTO_IPV6,
+ .target = set_target_v3,
+ .targetsize = sizeof(struct xt_set_info_target_v3),
+ .checkentry = set_target_v3_checkentry,
+ .destroy = set_target_v3_destroy,
+ .me = THIS_MODULE
+ },
+};
+
+static int __init xt_set_init(void)
+{
+ int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches));
+
+ if (!ret) {
+ ret = xt_register_targets(set_targets,
+ ARRAY_SIZE(set_targets));
+ if (ret)
+ xt_unregister_matches(set_matches,
+ ARRAY_SIZE(set_matches));
+ }
+ return ret;
+}
+
+static void __exit xt_set_fini(void)
+{
+ xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+ xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets));
+}
+
+module_init(xt_set_init);
+module_exit(xt_set_fini);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
new file mode 100644
index 000000000..e092cb046
--- /dev/null
+++ b/net/netfilter/xt_socket.c
@@ -0,0 +1,513 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#define XT_SOCKET_HAVE_IPV6 1
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/inet6_hashtables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#define XT_SOCKET_HAVE_CONNTRACK 1
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+static int
+extract_icmp4_fields(const struct sk_buff *skb,
+ u8 *protocol,
+ __be32 *raddr,
+ __be32 *laddr,
+ __be16 *rport,
+ __be16 *lport)
+{
+ unsigned int outside_hdrlen = ip_hdrlen(skb);
+ struct iphdr *inside_iph, _inside_iph;
+ struct icmphdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_REDIRECT:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return 1;
+ }
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr),
+ sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+
+ if (inside_iph->protocol != IPPROTO_TCP &&
+ inside_iph->protocol != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, outside_hdrlen +
+ sizeof(struct icmphdr) +
+ (inside_iph->ihl << 2),
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_iph->protocol;
+ *laddr = inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+/* "socket" match based redirection (no specific rule)
+ * ===================================================
+ *
+ * There are connections with dynamic endpoints (e.g. FTP data
+ * connection) that the user is unable to add explicit rules
+ * for. These are taken care of by a generic "socket" rule. It is
+ * assumed that the proxy application is trusted to open such
+ * connections without explicit iptables rule (except of course the
+ * generic 'socket' rule). In this case the following sockets are
+ * matched in preference order:
+ *
+ * - match: if there's a fully established connection matching the
+ * _packet_ tuple
+ *
+ * - match: if there's a non-zero bound listener (possibly with a
+ * non-local address) We don't accept zero-bound listeners, since
+ * then local services could intercept traffic going through the
+ * box.
+ */
+static struct sock *
+xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+ const __be32 saddr, const __be32 daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return __inet_lookup(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+ return NULL;
+}
+
+static bool xt_socket_sk_is_transparent(struct sock *sk)
+{
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
+ return inet_twsk(sk)->tw_transparent;
+
+ case TCP_NEW_SYN_RECV:
+ return inet_rsk(inet_reqsk(sk))->no_srccheck;
+
+ default:
+ return inet_sk(sk)->transparent;
+ }
+}
+
+static struct sock *xt_socket_lookup_slow_v4(const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __be32 uninitialized_var(daddr), uninitialized_var(saddr);
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ u8 uninitialized_var(protocol);
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+ struct nf_conn const *ct;
+ enum ip_conntrack_info ctinfo;
+#endif
+
+ if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ protocol = iph->protocol;
+ saddr = iph->saddr;
+ sport = hp->source;
+ daddr = iph->daddr;
+ dport = hp->dest;
+
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+ &sport, &dport))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) &&
+ ((iph->protocol != IPPROTO_ICMP &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (iph->protocol == IPPROTO_ICMP &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+
+ daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+ dport = (iph->protocol == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
+ return xt_socket_get_sock_v4(dev_net(skb->dev), protocol, saddr, daddr,
+ sport, dport, indev);
+}
+
+static bool
+socket_match(const struct sk_buff *skb, struct xt_action_param *par,
+ const struct xt_socket_mtinfo1 *info)
+{
+ struct sock *sk = skb->sk;
+
+ if (!sk)
+ sk = xt_socket_lookup_slow_v4(skb, par->in);
+ if (sk) {
+ bool wildcard;
+ bool transparent = true;
+
+ /* Ignore sockets listening on INADDR_ANY,
+ * unless XT_SOCKET_NOWILDCARD is set
+ */
+ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+ sk_fullsock(sk) &&
+ inet_sk(sk)->inet_rcv_saddr == 0);
+
+ /* Ignore non-transparent sockets,
+ * if XT_SOCKET_TRANSPARENT is used
+ */
+ if (info->flags & XT_SOCKET_TRANSPARENT)
+ transparent = xt_socket_sk_is_transparent(sk);
+
+ if (sk != skb->sk)
+ sock_gen_put(sk);
+
+ if (wildcard || !transparent)
+ sk = NULL;
+ }
+
+ return sk != NULL;
+}
+
+static bool
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ static struct xt_socket_mtinfo1 xt_info_v0 = {
+ .flags = 0,
+ };
+
+ return socket_match(skb, par, &xt_info_v0);
+}
+
+static bool
+socket_mt4_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ return socket_match(skb, par, par->matchinfo);
+}
+
+#ifdef XT_SOCKET_HAVE_IPV6
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ int *protocol,
+ const struct in6_addr **raddr,
+ const struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport,
+ struct ipv6hdr *ipv6_var)
+{
+ const struct ipv6hdr *inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ __be16 inside_fragoff;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+ sizeof(*ipv6_var), ipv6_var);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
+ sizeof(*ipv6_var),
+ &inside_nexthdr, &inside_fragoff);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static struct sock *
+xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+ const struct in6_addr *saddr, const struct in6_addr *daddr,
+ const __be16 sport, const __be16 dport,
+ const struct net_device *in)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ return inet6_lookup(net, &tcp_hashinfo,
+ saddr, sport, daddr, dport,
+ in->ifindex);
+ case IPPROTO_UDP:
+ return udp6_lib_lookup(net, saddr, sport, daddr, dport,
+ in->ifindex);
+ }
+
+ return NULL;
+}
+
+static struct sock *xt_socket_lookup_slow_v6(const struct sk_buff *skb,
+ const struct net_device *indev)
+{
+ __be16 uninitialized_var(dport), uninitialized_var(sport);
+ const struct in6_addr *daddr = NULL, *saddr = NULL;
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ int thoff = 0, tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NULL;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ struct udphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ struct ipv6hdr ipv6_var;
+
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport, &ipv6_var))
+ return NULL;
+ } else {
+ return NULL;
+ }
+
+ return xt_socket_get_sock_v6(dev_net(skb->dev), tproto, saddr, daddr,
+ sport, dport, indev);
+}
+
+static bool
+socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+ struct sock *sk = skb->sk;
+
+ if (!sk)
+ sk = xt_socket_lookup_slow_v6(skb, par->in);
+ if (sk) {
+ bool wildcard;
+ bool transparent = true;
+
+ /* Ignore sockets listening on INADDR_ANY
+ * unless XT_SOCKET_NOWILDCARD is set
+ */
+ wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
+ sk_fullsock(sk) &&
+ ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+
+ /* Ignore non-transparent sockets,
+ * if XT_SOCKET_TRANSPARENT is used
+ */
+ if (info->flags & XT_SOCKET_TRANSPARENT)
+ transparent = xt_socket_sk_is_transparent(sk);
+
+ if (sk != skb->sk)
+ sock_gen_put(sk);
+
+ if (wildcard || !transparent)
+ sk = NULL;
+ }
+
+ return sk != NULL;
+}
+#endif
+
+static int socket_mt_v1_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V1) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int socket_mt_v2_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_socket_mtinfo2 *info = (struct xt_socket_mtinfo2 *) par->matchinfo;
+
+ if (info->flags & ~XT_SOCKET_FLAGS_V2) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct xt_match socket_mt_reg[] __read_mostly = {
+ {
+ .name = "socket",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v0,
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "socket",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v1_v2,
+ .checkentry = socket_mt_v1_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1_v2,
+ .checkentry = socket_mt_v1_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
+ {
+ .name = "socket",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .match = socket_mt4_v1_v2,
+ .checkentry = socket_mt_v2_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#ifdef XT_SOCKET_HAVE_IPV6
+ {
+ .name = "socket",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1_v2,
+ .checkentry = socket_mt_v2_check,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
+};
+
+static int __init socket_mt_init(void)
+{
+ nf_defrag_ipv4_enable();
+#ifdef XT_SOCKET_HAVE_IPV6
+ nf_defrag_ipv6_enable();
+#endif
+
+ return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
+}
+
+static void __exit socket_mt_exit(void)
+{
+ xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
+}
+
+module_init(socket_mt_init);
+module_exit(socket_mt_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("x_tables socket match module");
+MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
new file mode 100644
index 000000000..a507922d8
--- /dev/null
+++ b/net/netfilter/xt_state.c
@@ -0,0 +1,79 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module");
+MODULE_ALIAS("ipt_state");
+MODULE_ALIAS("ip6t_state");
+
+static bool
+state_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_state_info *sinfo = par->matchinfo;
+ enum ip_conntrack_info ctinfo;
+ unsigned int statebit;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (!ct)
+ statebit = XT_STATE_INVALID;
+ else {
+ if (nf_ct_is_untracked(ct))
+ statebit = XT_STATE_UNTRACKED;
+ else
+ statebit = XT_STATE_BIT(ctinfo);
+ }
+ return (sinfo->statemask & statebit);
+}
+
+static int state_mt_check(const struct xt_mtchk_param *par)
+{
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void state_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match state_mt_reg __read_mostly = {
+ .name = "state",
+ .family = NFPROTO_UNSPEC,
+ .checkentry = state_mt_check,
+ .match = state_mt,
+ .destroy = state_mt_destroy,
+ .matchsize = sizeof(struct xt_state_info),
+ .me = THIS_MODULE,
+};
+
+static int __init state_mt_init(void)
+{
+ return xt_register_match(&state_mt_reg);
+}
+
+static void __exit state_mt_exit(void)
+{
+ xt_unregister_match(&state_mt_reg);
+}
+
+module_init(state_mt_init);
+module_exit(state_mt_exit);
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 000000000..11de55e7a
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
+ */
+
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter/xt_statistic.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/module.h>
+
+struct xt_statistic_priv {
+ atomic_t count;
+} ____cacheline_aligned_in_smp;
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)");
+MODULE_ALIAS("ipt_statistic");
+MODULE_ALIAS("ip6t_statistic");
+
+static bool
+statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_statistic_info *info = par->matchinfo;
+ bool ret = info->flags & XT_STATISTIC_INVERT;
+ int nval, oval;
+
+ switch (info->mode) {
+ case XT_STATISTIC_MODE_RANDOM:
+ if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ ret = !ret;
+ break;
+ case XT_STATISTIC_MODE_NTH:
+ do {
+ oval = atomic_read(&info->master->count);
+ nval = (oval == info->u.nth.every) ? 0 : oval + 1;
+ } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval);
+ if (nval == 0)
+ ret = !ret;
+ break;
+ }
+
+ return ret;
+}
+
+static int statistic_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_statistic_info *info = par->matchinfo;
+
+ if (info->mode > XT_STATISTIC_MODE_MAX ||
+ info->flags & ~XT_STATISTIC_MASK)
+ return -EINVAL;
+
+ info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
+ if (info->master == NULL)
+ return -ENOMEM;
+ atomic_set(&info->master->count, info->u.nth.count);
+
+ return 0;
+}
+
+static void statistic_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ const struct xt_statistic_info *info = par->matchinfo;
+
+ kfree(info->master);
+}
+
+static struct xt_match xt_statistic_mt_reg __read_mostly = {
+ .name = "statistic",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = statistic_mt,
+ .checkentry = statistic_mt_check,
+ .destroy = statistic_mt_destroy,
+ .matchsize = sizeof(struct xt_statistic_info),
+ .me = THIS_MODULE,
+};
+
+static int __init statistic_mt_init(void)
+{
+ return xt_register_match(&xt_statistic_mt_reg);
+}
+
+static void __exit statistic_mt_exit(void)
+{
+ xt_unregister_match(&xt_statistic_mt_reg);
+}
+
+module_init(statistic_mt_init);
+module_exit(statistic_mt_exit);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
new file mode 100644
index 000000000..0bc346031
--- /dev/null
+++ b/net/netfilter/xt_string.c
@@ -0,0 +1,94 @@
+/* String matching match for iptables
+ *
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("Xtables: string-based matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_string");
+MODULE_ALIAS("ip6t_string");
+
+static bool
+string_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_string_info *conf = par->matchinfo;
+ bool invert;
+
+ invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT;
+
+ return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
+ conf->to_offset, conf->config)
+ != UINT_MAX) ^ invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m))
+
+static int string_mt_check(const struct xt_mtchk_param *par)
+{
+ struct xt_string_info *conf = par->matchinfo;
+ struct ts_config *ts_conf;
+ int flags = TS_AUTOLOAD;
+
+ /* Damn, can't handle this case properly with iptables... */
+ if (conf->from_offset > conf->to_offset)
+ return -EINVAL;
+ if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0')
+ return -EINVAL;
+ if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE)
+ return -EINVAL;
+ if (conf->u.v1.flags &
+ ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT))
+ return -EINVAL;
+ if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE)
+ flags |= TS_IGNORECASE;
+ ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+ GFP_KERNEL, flags);
+ if (IS_ERR(ts_conf))
+ return PTR_ERR(ts_conf);
+
+ conf->config = ts_conf;
+ return 0;
+}
+
+static void string_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config);
+}
+
+static struct xt_match xt_string_mt_reg __read_mostly = {
+ .name = "string",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = string_mt_check,
+ .match = string_mt,
+ .destroy = string_mt_destroy,
+ .matchsize = sizeof(struct xt_string_info),
+ .me = THIS_MODULE,
+};
+
+static int __init string_mt_init(void)
+{
+ return xt_register_match(&xt_string_mt_reg);
+}
+
+static void __exit string_mt_exit(void)
+{
+ xt_unregister_match(&xt_string_mt_reg);
+}
+
+module_init(string_mt_init);
+module_exit(string_mt_exit);
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
new file mode 100644
index 000000000..c53d4d18e
--- /dev/null
+++ b/net/netfilter/xt_tcpmss.c
@@ -0,0 +1,110 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Portions (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/xt_tcpmss.h>
+#include <linux/netfilter/x_tables.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("Xtables: TCP MSS match");
+MODULE_ALIAS("ipt_tcpmss");
+MODULE_ALIAS("ip6t_tcpmss");
+
+static bool
+tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_tcpmss_match_info *info = par->matchinfo;
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ const u_int8_t *op;
+ u8 _opt[15 * 4 - sizeof(_tcph)];
+ unsigned int i, optlen;
+
+ /* If we don't have the whole header, drop packet. */
+ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ goto dropit;
+
+ /* Malformed. */
+ if (th->doff*4 < sizeof(*th))
+ goto dropit;
+
+ optlen = th->doff*4 - sizeof(*th);
+ if (!optlen)
+ goto out;
+
+ /* Truncated options. */
+ op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt);
+ if (op == NULL)
+ goto dropit;
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == TCPOPT_MSS
+ && (optlen - i) >= TCPOLEN_MSS
+ && op[i+1] == TCPOLEN_MSS) {
+ u_int16_t mssval;
+
+ mssval = (op[i+2] << 8) | op[i+3];
+
+ return (mssval >= info->mss_min &&
+ mssval <= info->mss_max) ^ info->invert;
+ }
+ if (op[i] < 2)
+ i++;
+ else
+ i += op[i+1] ? : 1;
+ }
+out:
+ return info->invert;
+
+dropit:
+ par->hotdrop = true;
+ return false;
+}
+
+static struct xt_match tcpmss_mt_reg[] __read_mostly = {
+ {
+ .name = "tcpmss",
+ .family = NFPROTO_IPV4,
+ .match = tcpmss_mt,
+ .matchsize = sizeof(struct xt_tcpmss_match_info),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "tcpmss",
+ .family = NFPROTO_IPV6,
+ .match = tcpmss_mt,
+ .matchsize = sizeof(struct xt_tcpmss_match_info),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init tcpmss_mt_init(void)
+{
+ return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg));
+}
+
+static void __exit tcpmss_mt_exit(void)
+{
+ xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg));
+}
+
+module_init(tcpmss_mt_init);
+module_exit(tcpmss_mt_exit);
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
new file mode 100644
index 000000000..c14d4645d
--- /dev/null
+++ b/net/netfilter/xt_tcpudp.c
@@ -0,0 +1,234 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xt_tcp");
+MODULE_ALIAS("xt_udp");
+MODULE_ALIAS("ipt_udp");
+MODULE_ALIAS("ipt_tcp");
+MODULE_ALIAS("ip6t_udp");
+MODULE_ALIAS("ip6t_tcp");
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+static inline bool
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert)
+{
+ return (port >= min && port <= max) ^ invert;
+}
+
+static bool
+tcp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ unsigned int protoff,
+ unsigned int optlen,
+ bool invert,
+ bool *hotdrop)
+{
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ const u_int8_t *op;
+ u_int8_t _opt[60 - sizeof(struct tcphdr)];
+ unsigned int i;
+
+ pr_debug("finding option\n");
+
+ if (!optlen)
+ return invert;
+
+ /* If we don't have the whole header, drop packet. */
+ op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr),
+ optlen, _opt);
+ if (op == NULL) {
+ *hotdrop = true;
+ return false;
+ }
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) return !invert;
+ if (op[i] < 2) i++;
+ else i += op[i+1]?:1;
+ }
+
+ return invert;
+}
+
+static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct tcphdr *th;
+ struct tcphdr _tcph;
+ const struct xt_tcp *tcpinfo = par->matchinfo;
+
+ if (par->fragoff != 0) {
+ /* To quote Alan:
+
+ Don't allow a fragment of TCP 8 bytes in. Nobody normal
+ causes this. Its a cracker trying to break in by doing a
+ flag overwrite to pass the direction checks.
+ */
+ if (par->fragoff == 1) {
+ pr_debug("Dropping evil TCP offset=1 frag.\n");
+ par->hotdrop = true;
+ }
+ /* Must not be a fragment. */
+ return false;
+ }
+
+#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg)))
+
+ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ pr_debug("Dropping evil TCP offset=0 tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+ ntohs(th->source),
+ !!(tcpinfo->invflags & XT_TCP_INV_SRCPT)))
+ return false;
+ if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+ ntohs(th->dest),
+ !!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
+ return false;
+ if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
+ == tcpinfo->flg_cmp,
+ XT_TCP_INV_FLAGS))
+ return false;
+ if (tcpinfo->option) {
+ if (th->doff * 4 < sizeof(_tcph)) {
+ par->hotdrop = true;
+ return false;
+ }
+ if (!tcp_find_option(tcpinfo->option, skb, par->thoff,
+ th->doff*4 - sizeof(_tcph),
+ tcpinfo->invflags & XT_TCP_INV_OPTION,
+ &par->hotdrop))
+ return false;
+ }
+ return true;
+}
+
+static int tcp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_tcp *tcpinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0;
+}
+
+static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct udphdr *uh;
+ struct udphdr _udph;
+ const struct xt_udp *udpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ pr_debug("Dropping evil UDP tinygram.\n");
+ par->hotdrop = true;
+ return false;
+ }
+
+ return port_match(udpinfo->spts[0], udpinfo->spts[1],
+ ntohs(uh->source),
+ !!(udpinfo->invflags & XT_UDP_INV_SRCPT))
+ && port_match(udpinfo->dpts[0], udpinfo->dpts[1],
+ ntohs(uh->dest),
+ !!(udpinfo->invflags & XT_UDP_INV_DSTPT));
+}
+
+static int udp_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_udp *udpinfo = par->matchinfo;
+
+ /* Must specify no unknown invflags */
+ return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0;
+}
+
+static struct xt_match tcpudp_mt_reg[] __read_mostly = {
+ {
+ .name = "tcp",
+ .family = NFPROTO_IPV4,
+ .checkentry = tcp_mt_check,
+ .match = tcp_mt,
+ .matchsize = sizeof(struct xt_tcp),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "tcp",
+ .family = NFPROTO_IPV6,
+ .checkentry = tcp_mt_check,
+ .match = tcp_mt,
+ .matchsize = sizeof(struct xt_tcp),
+ .proto = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "udp",
+ .family = NFPROTO_IPV4,
+ .checkentry = udp_mt_check,
+ .match = udp_mt,
+ .matchsize = sizeof(struct xt_udp),
+ .proto = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "udp",
+ .family = NFPROTO_IPV6,
+ .checkentry = udp_mt_check,
+ .match = udp_mt,
+ .matchsize = sizeof(struct xt_udp),
+ .proto = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "udplite",
+ .family = NFPROTO_IPV4,
+ .checkentry = udp_mt_check,
+ .match = udp_mt,
+ .matchsize = sizeof(struct xt_udp),
+ .proto = IPPROTO_UDPLITE,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "udplite",
+ .family = NFPROTO_IPV6,
+ .checkentry = udp_mt_check,
+ .match = udp_mt,
+ .matchsize = sizeof(struct xt_udp),
+ .proto = IPPROTO_UDPLITE,
+ .me = THIS_MODULE,
+ },
+};
+
+static int __init tcpudp_mt_init(void)
+{
+ return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg));
+}
+
+static void __exit tcpudp_mt_exit(void)
+{
+ xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg));
+}
+
+module_init(tcpudp_mt_init);
+module_exit(tcpudp_mt_exit);
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
new file mode 100644
index 000000000..0ae55a36f
--- /dev/null
+++ b/net/netfilter/xt_time.c
@@ -0,0 +1,291 @@
+/*
+ * xt_time
+ * Copyright © CC Computer Consultants GmbH, 2007
+ *
+ * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org>
+ * This is a module which is used for time matching
+ * It is using some modified code from dietlibc (localtime() function)
+ * that you can find at http://www.fefe.de/dietlibc/
+ * This file is distributed under the terms of the GNU General Public
+ * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
+ */
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_time.h>
+
+struct xtm {
+ u_int8_t month; /* (1-12) */
+ u_int8_t monthday; /* (1-31) */
+ u_int8_t weekday; /* (1-7) */
+ u_int8_t hour; /* (0-23) */
+ u_int8_t minute; /* (0-59) */
+ u_int8_t second; /* (0-59) */
+ unsigned int dse;
+};
+
+extern struct timezone sys_tz; /* ouch */
+
+static const u_int16_t days_since_year[] = {
+ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334,
+};
+
+static const u_int16_t days_since_leapyear[] = {
+ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335,
+};
+
+/*
+ * Since time progresses forward, it is best to organize this array in reverse,
+ * to minimize lookup time.
+ */
+enum {
+ DSE_FIRST = 2039,
+ SECONDS_PER_DAY = 86400,
+};
+static const u_int16_t days_since_epoch[] = {
+ /* 2039 - 2030 */
+ 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915,
+ /* 2029 - 2020 */
+ 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262,
+ /* 2019 - 2010 */
+ 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610,
+ /* 2009 - 2000 */
+ 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957,
+ /* 1999 - 1990 */
+ 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305,
+ /* 1989 - 1980 */
+ 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652,
+ /* 1979 - 1970 */
+ 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0,
+};
+
+static inline bool is_leap(unsigned int y)
+{
+ return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0);
+}
+
+/*
+ * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp.
+ * Since we match against days and daytime, the SSTE value needs to be
+ * computed back into human-readable dates.
+ *
+ * This is done in three separate functions so that the most expensive
+ * calculations are done last, in case a "simple match" can be found earlier.
+ */
+static inline unsigned int localtime_1(struct xtm *r, time_t time)
+{
+ unsigned int v, w;
+
+ /* Each day has 86400s, so finding the hour/minute is actually easy. */
+ v = time % SECONDS_PER_DAY;
+ r->second = v % 60;
+ w = v / 60;
+ r->minute = w % 60;
+ r->hour = w / 60;
+ return v;
+}
+
+static inline void localtime_2(struct xtm *r, time_t time)
+{
+ /*
+ * Here comes the rest (weekday, monthday). First, divide the SSTE
+ * by seconds-per-day to get the number of _days_ since the epoch.
+ */
+ r->dse = time / 86400;
+
+ /*
+ * 1970-01-01 (w=0) was a Thursday (4).
+ * -1 and +1 map Sunday properly onto 7.
+ */
+ r->weekday = (4 + r->dse - 1) % 7 + 1;
+}
+
+static void localtime_3(struct xtm *r, time_t time)
+{
+ unsigned int year, i, w = r->dse;
+
+ /*
+ * In each year, a certain number of days-since-the-epoch have passed.
+ * Find the year that is closest to said days.
+ *
+ * Consider, for example, w=21612 (2029-03-04). Loop will abort on
+ * dse[i] <= w, which happens when dse[i] == 21550. This implies
+ * year == 2009. w will then be 62.
+ */
+ for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w;
+ ++i, --year)
+ /* just loop */;
+
+ w -= days_since_epoch[i];
+
+ /*
+ * By now we have the current year, and the day of the year.
+ * r->yearday = w;
+ *
+ * On to finding the month (like above). In each month, a certain
+ * number of days-since-New Year have passed, and find the closest
+ * one.
+ *
+ * Consider w=62 (in a non-leap year). Loop will abort on
+ * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2).
+ * Concludes i == 2, i.e. 3rd month => March.
+ *
+ * (A different approach to use would be to subtract a monthlength
+ * from w repeatedly while counting.)
+ */
+ if (is_leap(year)) {
+ /* use days_since_leapyear[] in a leap year */
+ for (i = ARRAY_SIZE(days_since_leapyear) - 1;
+ i > 0 && days_since_leapyear[i] > w; --i)
+ /* just loop */;
+ r->monthday = w - days_since_leapyear[i] + 1;
+ } else {
+ for (i = ARRAY_SIZE(days_since_year) - 1;
+ i > 0 && days_since_year[i] > w; --i)
+ /* just loop */;
+ r->monthday = w - days_since_year[i] + 1;
+ }
+
+ r->month = i + 1;
+}
+
+static bool
+time_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_time_info *info = par->matchinfo;
+ unsigned int packet_time;
+ struct xtm current_time;
+ s64 stamp;
+
+ /*
+ * We cannot use get_seconds() instead of __net_timestamp() here.
+ * Suppose you have two rules:
+ * 1. match before 13:00
+ * 2. match after 13:00
+ * If you match against processing time (get_seconds) it
+ * may happen that the same packet matches both rules if
+ * it arrived at the right moment before 13:00.
+ */
+ if (skb->tstamp.tv64 == 0)
+ __net_timestamp((struct sk_buff *)skb);
+
+ stamp = ktime_to_ns(skb->tstamp);
+ stamp = div_s64(stamp, NSEC_PER_SEC);
+
+ if (info->flags & XT_TIME_LOCAL_TZ)
+ /* Adjust for local timezone */
+ stamp -= 60 * sys_tz.tz_minuteswest;
+
+ /*
+ * xt_time will match when _all_ of the following hold:
+ * - 'now' is in the global time range date_start..date_end
+ * - 'now' is in the monthday mask
+ * - 'now' is in the weekday mask
+ * - 'now' is in the daytime range time_start..time_end
+ * (and by default, libxt_time will set these so as to match)
+ */
+
+ if (stamp < info->date_start || stamp > info->date_stop)
+ return false;
+
+ packet_time = localtime_1(&current_time, stamp);
+
+ if (info->daytime_start < info->daytime_stop) {
+ if (packet_time < info->daytime_start ||
+ packet_time > info->daytime_stop)
+ return false;
+ } else {
+ if (packet_time < info->daytime_start &&
+ packet_time > info->daytime_stop)
+ return false;
+
+ /** if user asked to ignore 'next day', then e.g.
+ * '1 PM Wed, August 1st' should be treated
+ * like 'Tue 1 PM July 31st'.
+ *
+ * This also causes
+ * 'Monday, "23:00 to 01:00", to match for 2 hours, starting
+ * Monday 23:00 to Tuesday 01:00.
+ */
+ if ((info->flags & XT_TIME_CONTIGUOUS) &&
+ packet_time <= info->daytime_stop)
+ stamp -= SECONDS_PER_DAY;
+ }
+
+ localtime_2(&current_time, stamp);
+
+ if (!(info->weekdays_match & (1 << current_time.weekday)))
+ return false;
+
+ /* Do not spend time computing monthday if all days match anyway */
+ if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) {
+ localtime_3(&current_time, stamp);
+ if (!(info->monthdays_match & (1 << current_time.monthday)))
+ return false;
+ }
+
+ return true;
+}
+
+static int time_mt_check(const struct xt_mtchk_param *par)
+{
+ const struct xt_time_info *info = par->matchinfo;
+
+ if (info->daytime_start > XT_TIME_MAX_DAYTIME ||
+ info->daytime_stop > XT_TIME_MAX_DAYTIME) {
+ pr_info("invalid argument - start or "
+ "stop time greater than 23:59:59\n");
+ return -EDOM;
+ }
+
+ if (info->flags & ~XT_TIME_ALL_FLAGS) {
+ pr_info("unknown flags 0x%x\n", info->flags & ~XT_TIME_ALL_FLAGS);
+ return -EINVAL;
+ }
+
+ if ((info->flags & XT_TIME_CONTIGUOUS) &&
+ info->daytime_start < info->daytime_stop)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_match xt_time_mt_reg __read_mostly = {
+ .name = "time",
+ .family = NFPROTO_UNSPEC,
+ .match = time_mt,
+ .checkentry = time_mt_check,
+ .matchsize = sizeof(struct xt_time_info),
+ .me = THIS_MODULE,
+};
+
+static int __init time_mt_init(void)
+{
+ int minutes = sys_tz.tz_minuteswest;
+
+ if (minutes < 0) /* east of Greenwich */
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel timezone is +%02d%02d\n",
+ -minutes / 60, -minutes % 60);
+ else /* west of Greenwich */
+ printk(KERN_INFO KBUILD_MODNAME
+ ": kernel timezone is -%02d%02d\n",
+ minutes / 60, minutes % 60);
+
+ return xt_register_match(&xt_time_mt_reg);
+}
+
+static void __exit time_mt_exit(void)
+{
+ xt_unregister_match(&xt_time_mt_reg);
+}
+
+module_init(time_mt_init);
+module_exit(time_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: time-based matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_time");
+MODULE_ALIAS("ip6t_time");
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
new file mode 100644
index 000000000..a95b50342
--- /dev/null
+++ b/net/netfilter/xt_u32.c
@@ -0,0 +1,123 @@
+/*
+ * xt_u32 - kernel module to match u32 packet content
+ *
+ * Original author: Don Cohen <don@isis.cs3-inc.com>
+ * (C) CC Computer Consultants GmbH, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_u32.h>
+
+static bool u32_match_it(const struct xt_u32 *data,
+ const struct sk_buff *skb)
+{
+ const struct xt_u32_test *ct;
+ unsigned int testind;
+ unsigned int nnums;
+ unsigned int nvals;
+ unsigned int i;
+ __be32 n;
+ u_int32_t pos;
+ u_int32_t val;
+ u_int32_t at;
+
+ /*
+ * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17"
+ * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands.
+ */
+ for (testind = 0; testind < data->ntests; ++testind) {
+ ct = &data->tests[testind];
+ at = 0;
+ pos = ct->location[0].number;
+
+ if (skb->len < 4 || pos > skb->len - 4)
+ return false;
+
+ if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0)
+ BUG();
+ val = ntohl(n);
+ nnums = ct->nnums;
+
+ /* Inner loop runs over "&", "<<", ">>" and "@" operands */
+ for (i = 1; i < nnums; ++i) {
+ u_int32_t number = ct->location[i].number;
+ switch (ct->location[i].nextop) {
+ case XT_U32_AND:
+ val &= number;
+ break;
+ case XT_U32_LEFTSH:
+ val <<= number;
+ break;
+ case XT_U32_RIGHTSH:
+ val >>= number;
+ break;
+ case XT_U32_AT:
+ if (at + val < at)
+ return false;
+ at += val;
+ pos = number;
+ if (at + 4 < at || skb->len < at + 4 ||
+ pos > skb->len - at - 4)
+ return false;
+
+ if (skb_copy_bits(skb, at + pos, &n,
+ sizeof(n)) < 0)
+ BUG();
+ val = ntohl(n);
+ break;
+ }
+ }
+
+ /* Run over the "," and ":" operands */
+ nvals = ct->nvalues;
+ for (i = 0; i < nvals; ++i)
+ if (ct->value[i].min <= val && val <= ct->value[i].max)
+ break;
+
+ if (i >= ct->nvalues)
+ return false;
+ }
+
+ return true;
+}
+
+static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_u32 *data = par->matchinfo;
+ bool ret;
+
+ ret = u32_match_it(data, skb);
+ return ret ^ data->invert;
+}
+
+static struct xt_match xt_u32_mt_reg __read_mostly = {
+ .name = "u32",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = u32_mt,
+ .matchsize = sizeof(struct xt_u32),
+ .me = THIS_MODULE,
+};
+
+static int __init u32_mt_init(void)
+{
+ return xt_register_match(&xt_u32_mt_reg);
+}
+
+static void __exit u32_mt_exit(void)
+{
+ xt_unregister_match(&xt_u32_mt_reg);
+}
+
+module_init(u32_mt_init);
+module_exit(u32_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: arbitrary byte matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_u32");
+MODULE_ALIAS("ip6t_u32");
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
new file mode 100644
index 000000000..56958c85f
--- /dev/null
+++ b/net/netlabel/Kconfig
@@ -0,0 +1,17 @@
+#
+# NetLabel configuration
+#
+
+config NETLABEL
+ bool "NetLabel subsystem support"
+ depends on SECURITY
+ default n
+ ---help---
+ NetLabel provides support for explicit network packet labeling
+ protocols such as CIPSO and RIPSO. For more information see
+ Documentation/netlabel as well as the NetLabel SourceForge project
+ for configuration tools and additional documentation.
+
+ * http://netlabel.sf.net
+
+ If you are unsure, say N.
diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
new file mode 100644
index 000000000..d2732fc95
--- /dev/null
+++ b/net/netlabel/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the NetLabel subsystem.
+#
+
+# base objects
+obj-y := netlabel_user.o netlabel_kapi.o
+obj-y += netlabel_domainhash.o netlabel_addrlist.o
+
+# management objects
+obj-y += netlabel_mgmt.o
+
+# protocol modules
+obj-y += netlabel_unlabeled.o
+obj-y += netlabel_cipso_v4.o
+
diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c
new file mode 100644
index 000000000..d0a3acfa5
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.c
@@ -0,0 +1,383 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem. The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/audit.h>
+
+#include "netlabel_addrlist.h"
+
+/*
+ * Address List Functions
+ */
+
+/**
+ * netlbl_af4list_search - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv4 address list given by @head. If a matching address entry
+ * is found it is returned, otherwise NULL is returned. The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+ struct list_head *head)
+{
+ struct netlbl_af4list *iter;
+
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid && (addr & iter->mask) == iter->addr)
+ return iter;
+
+ return NULL;
+}
+
+/**
+ * netlbl_af4list_search_exact - Search for an exact IPv4 address entry
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv4 address list given by @head. If an exact match if found
+ * it is returned, otherwise NULL is returned. The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+ __be32 mask,
+ struct list_head *head)
+{
+ struct netlbl_af4list *iter;
+
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid && iter->addr == addr && iter->mask == mask)
+ return iter;
+
+ return NULL;
+}
+
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_af6list_search - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv6 address list given by @head. If a matching address entry
+ * is found it is returned, otherwise NULL is returned. The caller is
+ * responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+ struct list_head *head)
+{
+ struct netlbl_af6list *iter;
+
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid &&
+ ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0)
+ return iter;
+
+ return NULL;
+}
+
+/**
+ * netlbl_af6list_search_exact - Search for an exact IPv6 address entry
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Searches the IPv6 address list given by @head. If an exact match if found
+ * it is returned, otherwise NULL is returned. The caller is responsible for
+ * calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct list_head *head)
+{
+ struct netlbl_af6list *iter;
+
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid &&
+ ipv6_addr_equal(&iter->addr, addr) &&
+ ipv6_addr_equal(&iter->mask, mask))
+ return iter;
+
+ return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_add - Add a new IPv4 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head. On success zero is
+ * returned, otherwise a negative value is returned. The caller is responsible
+ * for calling the necessary locking functions.
+ *
+ */
+int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head)
+{
+ struct netlbl_af4list *iter;
+
+ iter = netlbl_af4list_search(entry->addr, head);
+ if (iter != NULL &&
+ iter->addr == entry->addr && iter->mask == entry->mask)
+ return -EEXIST;
+
+ /* in order to speed up address searches through the list (the common
+ * case) we need to keep the list in order based on the size of the
+ * address mask such that the entry with the widest mask (smallest
+ * numerical value) appears first in the list */
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid &&
+ ntohl(entry->mask) > ntohl(iter->mask)) {
+ __list_add_rcu(&entry->list,
+ iter->list.prev,
+ &iter->list);
+ return 0;
+ }
+ list_add_tail_rcu(&entry->list, head);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_af6list_add - Add a new IPv6 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head. On success zero is
+ * returned, otherwise a negative value is returned. The caller is responsible
+ * for calling the necessary locking functions.
+ *
+ */
+int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head)
+{
+ struct netlbl_af6list *iter;
+
+ iter = netlbl_af6list_search(&entry->addr, head);
+ if (iter != NULL &&
+ ipv6_addr_equal(&iter->addr, &entry->addr) &&
+ ipv6_addr_equal(&iter->mask, &entry->mask))
+ return -EEXIST;
+
+ /* in order to speed up address searches through the list (the common
+ * case) we need to keep the list in order based on the size of the
+ * address mask such that the entry with the widest mask (smallest
+ * numerical value) appears first in the list */
+ list_for_each_entry_rcu(iter, head, list)
+ if (iter->valid &&
+ ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+ __list_add_rcu(&entry->list,
+ iter->list.prev,
+ &iter->list);
+ return 0;
+ }
+ list_add_tail_rcu(&entry->list, head);
+ return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_remove_entry - Remove an IPv4 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry. The caller is responsible for
+ * calling the necessary locking functions.
+ *
+ */
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry)
+{
+ entry->valid = 0;
+ list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af4list_remove - Remove an IPv4 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Remove an IP address entry from the list pointed to by @head. Returns the
+ * entry on success, NULL on failure. The caller is responsible for calling
+ * the necessary locking functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+ struct list_head *head)
+{
+ struct netlbl_af4list *entry;
+
+ entry = netlbl_af4list_search_exact(addr, mask, head);
+ if (entry == NULL)
+ return NULL;
+ netlbl_af4list_remove_entry(entry);
+ return entry;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_af6list_remove_entry - Remove an IPv6 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry. The caller is responsible for
+ * calling the necessary locking functions.
+ *
+ */
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry)
+{
+ entry->valid = 0;
+ list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af6list_remove - Remove an IPv6 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Remove an IP address entry from the list pointed to by @head. Returns the
+ * entry on success, NULL on failure. The caller is responsible for calling
+ * the necessary locking functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct list_head *head)
+{
+ struct netlbl_af6list *entry;
+
+ entry = netlbl_af6list_search_exact(addr, mask, head);
+ if (entry == NULL)
+ return NULL;
+ netlbl_af6list_remove_entry(entry);
+ return entry;
+}
+#endif /* IPv6 */
+
+/*
+ * Audit Helper Functions
+ */
+
+#ifdef CONFIG_AUDIT
+/**
+ * netlbl_af4list_audit_addr - Audit an IPv4 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv4 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+ int src, const char *dev,
+ __be32 addr, __be32 mask)
+{
+ u32 mask_val = ntohl(mask);
+ char *dir = (src ? "src" : "dst");
+
+ if (dev != NULL)
+ audit_log_format(audit_buf, " netif=%s", dev);
+ audit_log_format(audit_buf, " %s=%pI4", dir, &addr);
+ if (mask_val != 0xffffffff) {
+ u32 mask_len = 0;
+ while (mask_val > 0) {
+ mask_val <<= 1;
+ mask_len++;
+ }
+ audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len);
+ }
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_af6list_audit_addr - Audit an IPv6 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the IPv6 address and address mask, if necessary, to @audit_buf.
+ *
+ */
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+ int src,
+ const char *dev,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask)
+{
+ char *dir = (src ? "src" : "dst");
+
+ if (dev != NULL)
+ audit_log_format(audit_buf, " netif=%s", dev);
+ audit_log_format(audit_buf, " %s=%pI6", dir, addr);
+ if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
+ u32 mask_len = 0;
+ u32 mask_val;
+ int iter = -1;
+ while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
+ mask_len += 32;
+ mask_val = ntohl(mask->s6_addr32[iter]);
+ while (mask_val > 0) {
+ mask_val <<= 1;
+ mask_len++;
+ }
+ audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len);
+ }
+}
+#endif /* IPv6 */
+#endif /* CONFIG_AUDIT */
diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h
new file mode 100644
index 000000000..d0f38bc9a
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.h
@@ -0,0 +1,208 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem. The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_ADDRLIST_H
+#define _NETLABEL_ADDRLIST_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/in6.h>
+#include <linux/audit.h>
+
+/**
+ * struct netlbl_af4list - NetLabel IPv4 address list
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @valid: valid flag
+ * @list: list structure, used internally
+ */
+struct netlbl_af4list {
+ __be32 addr;
+ __be32 mask;
+
+ u32 valid;
+ struct list_head list;
+};
+
+/**
+ * struct netlbl_af6list - NetLabel IPv6 address list
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @valid: valid flag
+ * @list: list structure, used internally
+ */
+struct netlbl_af6list {
+ struct in6_addr addr;
+ struct in6_addr mask;
+
+ u32 valid;
+ struct list_head list;
+};
+
+#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list)
+
+static inline struct netlbl_af4list *__af4list_valid(struct list_head *s,
+ struct list_head *h)
+{
+ struct list_head *i = s;
+ struct netlbl_af4list *n = __af4list_entry(s);
+ while (i != h && !n->valid) {
+ i = i->next;
+ n = __af4list_entry(i);
+ }
+ return n;
+}
+
+static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s,
+ struct list_head *h)
+{
+ struct list_head *i = s;
+ struct netlbl_af4list *n = __af4list_entry(s);
+ while (i != h && !n->valid) {
+ i = rcu_dereference(i->next);
+ n = __af4list_entry(i);
+ }
+ return n;
+}
+
+#define netlbl_af4list_foreach(iter, head) \
+ for (iter = __af4list_valid((head)->next, head); \
+ &iter->list != (head); \
+ iter = __af4list_valid(iter->list.next, head))
+
+#define netlbl_af4list_foreach_rcu(iter, head) \
+ for (iter = __af4list_valid_rcu((head)->next, head); \
+ &iter->list != (head); \
+ iter = __af4list_valid_rcu(iter->list.next, head))
+
+#define netlbl_af4list_foreach_safe(iter, tmp, head) \
+ for (iter = __af4list_valid((head)->next, head), \
+ tmp = __af4list_valid(iter->list.next, head); \
+ &iter->list != (head); \
+ iter = tmp, tmp = __af4list_valid(iter->list.next, head))
+
+int netlbl_af4list_add(struct netlbl_af4list *entry,
+ struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+ struct list_head *head);
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+ struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+ __be32 mask,
+ struct list_head *head);
+
+#ifdef CONFIG_AUDIT
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+ int src, const char *dev,
+ __be32 addr, __be32 mask);
+#else
+static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+ int src, const char *dev,
+ __be32 addr, __be32 mask)
+{
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list)
+
+static inline struct netlbl_af6list *__af6list_valid(struct list_head *s,
+ struct list_head *h)
+{
+ struct list_head *i = s;
+ struct netlbl_af6list *n = __af6list_entry(s);
+ while (i != h && !n->valid) {
+ i = i->next;
+ n = __af6list_entry(i);
+ }
+ return n;
+}
+
+static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s,
+ struct list_head *h)
+{
+ struct list_head *i = s;
+ struct netlbl_af6list *n = __af6list_entry(s);
+ while (i != h && !n->valid) {
+ i = rcu_dereference(i->next);
+ n = __af6list_entry(i);
+ }
+ return n;
+}
+
+#define netlbl_af6list_foreach(iter, head) \
+ for (iter = __af6list_valid((head)->next, head); \
+ &iter->list != (head); \
+ iter = __af6list_valid(iter->list.next, head))
+
+#define netlbl_af6list_foreach_rcu(iter, head) \
+ for (iter = __af6list_valid_rcu((head)->next, head); \
+ &iter->list != (head); \
+ iter = __af6list_valid_rcu(iter->list.next, head))
+
+#define netlbl_af6list_foreach_safe(iter, tmp, head) \
+ for (iter = __af6list_valid((head)->next, head), \
+ tmp = __af6list_valid(iter->list.next, head); \
+ &iter->list != (head); \
+ iter = tmp, tmp = __af6list_valid(iter->list.next, head))
+
+int netlbl_af6list_add(struct netlbl_af6list *entry,
+ struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct list_head *head);
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry);
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+ struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct list_head *head);
+
+#ifdef CONFIG_AUDIT
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+ int src,
+ const char *dev,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask);
+#else
+static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+ int src,
+ const char *dev,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask)
+{
+}
+#endif
+#endif /* IPV6 */
+
+#endif
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
new file mode 100644
index 000000000..7fd1104ba
--- /dev/null
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -0,0 +1,786 @@
+/*
+ * NetLabel CIPSO/IPv4 Support
+ *
+ * This file defines the CIPSO/IPv4 functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
+
+/* Argument struct for cipso_v4_doi_walk() */
+struct netlbl_cipsov4_doiwalk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+ struct netlbl_audit *audit_info;
+ u32 doi;
+};
+
+/* NetLabel Generic NETLINK CIPSOv4 family */
+static struct genl_family netlbl_cipsov4_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CIPSOV4_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CIPSOV4_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
+ [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 },
+ [NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED },
+ [NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED },
+ [NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED },
+ [NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 },
+ [NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED },
+ [NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED },
+};
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * netlbl_cipsov4_add_common - Parse the common sections of a ADD message
+ * @info: the Generic NETLINK info block
+ * @doi_def: the CIPSO V4 DOI definition
+ *
+ * Description:
+ * Parse the common sections of a ADD message and fill in the related values
+ * in @doi_def. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_add_common(struct genl_info *info,
+ struct cipso_v4_doi *doi_def)
+{
+ struct nlattr *nla;
+ int nla_rem;
+ u32 iter = 0;
+
+ doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+
+ if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_TAGLST],
+ NLBL_CIPSOV4_A_MAX,
+ netlbl_cipsov4_genl_policy) != 0)
+ return -EINVAL;
+
+ nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem)
+ if (nla_type(nla) == NLBL_CIPSOV4_A_TAG) {
+ if (iter >= CIPSO_V4_TAG_MAXCNT)
+ return -EINVAL;
+ doi_def->tags[iter++] = nla_get_u8(nla);
+ }
+ while (iter < CIPSO_V4_TAG_MAXCNT)
+ doi_def->tags[iter++] = CIPSO_V4_TAG_INVALID;
+
+ return 0;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD
+ * message and add it to the CIPSO V4 engine. Return zero on success and
+ * non-zero on error.
+ *
+ */
+static int netlbl_cipsov4_add_std(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ struct cipso_v4_doi *doi_def = NULL;
+ struct nlattr *nla_a;
+ struct nlattr *nla_b;
+ int nla_a_rem;
+ int nla_b_rem;
+ u32 iter;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_TAGLST] ||
+ !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST])
+ return -EINVAL;
+
+ if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+ NLBL_CIPSOV4_A_MAX,
+ netlbl_cipsov4_genl_policy) != 0)
+ return -EINVAL;
+
+ doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+ if (doi_def == NULL)
+ return -ENOMEM;
+ doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL);
+ if (doi_def->map.std == NULL) {
+ ret_val = -ENOMEM;
+ goto add_std_failure;
+ }
+ doi_def->type = CIPSO_V4_MAP_TRANS;
+
+ ret_val = netlbl_cipsov4_add_common(info, doi_def);
+ if (ret_val != 0)
+ goto add_std_failure;
+ ret_val = -EINVAL;
+
+ nla_for_each_nested(nla_a,
+ info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+ nla_a_rem)
+ if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) {
+ if (nla_validate_nested(nla_a,
+ NLBL_CIPSOV4_A_MAX,
+ netlbl_cipsov4_genl_policy) != 0)
+ goto add_std_failure;
+ nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+ switch (nla_type(nla_b)) {
+ case NLBL_CIPSOV4_A_MLSLVLLOC:
+ if (nla_get_u32(nla_b) >
+ CIPSO_V4_MAX_LOC_LVLS)
+ goto add_std_failure;
+ if (nla_get_u32(nla_b) >=
+ doi_def->map.std->lvl.local_size)
+ doi_def->map.std->lvl.local_size =
+ nla_get_u32(nla_b) + 1;
+ break;
+ case NLBL_CIPSOV4_A_MLSLVLREM:
+ if (nla_get_u32(nla_b) >
+ CIPSO_V4_MAX_REM_LVLS)
+ goto add_std_failure;
+ if (nla_get_u32(nla_b) >=
+ doi_def->map.std->lvl.cipso_size)
+ doi_def->map.std->lvl.cipso_size =
+ nla_get_u32(nla_b) + 1;
+ break;
+ }
+ }
+ doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size,
+ sizeof(u32),
+ GFP_KERNEL);
+ if (doi_def->map.std->lvl.local == NULL) {
+ ret_val = -ENOMEM;
+ goto add_std_failure;
+ }
+ doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size,
+ sizeof(u32),
+ GFP_KERNEL);
+ if (doi_def->map.std->lvl.cipso == NULL) {
+ ret_val = -ENOMEM;
+ goto add_std_failure;
+ }
+ for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++)
+ doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL;
+ for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++)
+ doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL;
+ nla_for_each_nested(nla_a,
+ info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+ nla_a_rem)
+ if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) {
+ struct nlattr *lvl_loc;
+ struct nlattr *lvl_rem;
+
+ lvl_loc = nla_find_nested(nla_a,
+ NLBL_CIPSOV4_A_MLSLVLLOC);
+ lvl_rem = nla_find_nested(nla_a,
+ NLBL_CIPSOV4_A_MLSLVLREM);
+ if (lvl_loc == NULL || lvl_rem == NULL)
+ goto add_std_failure;
+ doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] =
+ nla_get_u32(lvl_rem);
+ doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] =
+ nla_get_u32(lvl_loc);
+ }
+
+ if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) {
+ if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+ NLBL_CIPSOV4_A_MAX,
+ netlbl_cipsov4_genl_policy) != 0)
+ goto add_std_failure;
+
+ nla_for_each_nested(nla_a,
+ info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+ nla_a_rem)
+ if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) {
+ if (nla_validate_nested(nla_a,
+ NLBL_CIPSOV4_A_MAX,
+ netlbl_cipsov4_genl_policy) != 0)
+ goto add_std_failure;
+ nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+ switch (nla_type(nla_b)) {
+ case NLBL_CIPSOV4_A_MLSCATLOC:
+ if (nla_get_u32(nla_b) >
+ CIPSO_V4_MAX_LOC_CATS)
+ goto add_std_failure;
+ if (nla_get_u32(nla_b) >=
+ doi_def->map.std->cat.local_size)
+ doi_def->map.std->cat.local_size =
+ nla_get_u32(nla_b) + 1;
+ break;
+ case NLBL_CIPSOV4_A_MLSCATREM:
+ if (nla_get_u32(nla_b) >
+ CIPSO_V4_MAX_REM_CATS)
+ goto add_std_failure;
+ if (nla_get_u32(nla_b) >=
+ doi_def->map.std->cat.cipso_size)
+ doi_def->map.std->cat.cipso_size =
+ nla_get_u32(nla_b) + 1;
+ break;
+ }
+ }
+ doi_def->map.std->cat.local = kcalloc(
+ doi_def->map.std->cat.local_size,
+ sizeof(u32),
+ GFP_KERNEL);
+ if (doi_def->map.std->cat.local == NULL) {
+ ret_val = -ENOMEM;
+ goto add_std_failure;
+ }
+ doi_def->map.std->cat.cipso = kcalloc(
+ doi_def->map.std->cat.cipso_size,
+ sizeof(u32),
+ GFP_KERNEL);
+ if (doi_def->map.std->cat.cipso == NULL) {
+ ret_val = -ENOMEM;
+ goto add_std_failure;
+ }
+ for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++)
+ doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT;
+ for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++)
+ doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT;
+ nla_for_each_nested(nla_a,
+ info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+ nla_a_rem)
+ if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) {
+ struct nlattr *cat_loc;
+ struct nlattr *cat_rem;
+
+ cat_loc = nla_find_nested(nla_a,
+ NLBL_CIPSOV4_A_MLSCATLOC);
+ cat_rem = nla_find_nested(nla_a,
+ NLBL_CIPSOV4_A_MLSCATREM);
+ if (cat_loc == NULL || cat_rem == NULL)
+ goto add_std_failure;
+ doi_def->map.std->cat.local[
+ nla_get_u32(cat_loc)] =
+ nla_get_u32(cat_rem);
+ doi_def->map.std->cat.cipso[
+ nla_get_u32(cat_rem)] =
+ nla_get_u32(cat_loc);
+ }
+ }
+
+ ret_val = cipso_v4_doi_add(doi_def, audit_info);
+ if (ret_val != 0)
+ goto add_std_failure;
+ return 0;
+
+add_std_failure:
+ cipso_v4_doi_free(doi_def);
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CIPSO_V4_MAP_PASS DOI definition based on the given ADD message
+ * and add it to the CIPSO V4 engine. Return zero on success and non-zero on
+ * error.
+ *
+ */
+static int netlbl_cipsov4_add_pass(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct cipso_v4_doi *doi_def = NULL;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_TAGLST])
+ return -EINVAL;
+
+ doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+ if (doi_def == NULL)
+ return -ENOMEM;
+ doi_def->type = CIPSO_V4_MAP_PASS;
+
+ ret_val = netlbl_cipsov4_add_common(info, doi_def);
+ if (ret_val != 0)
+ goto add_pass_failure;
+
+ ret_val = cipso_v4_doi_add(doi_def, audit_info);
+ if (ret_val != 0)
+ goto add_pass_failure;
+ return 0;
+
+add_pass_failure:
+ cipso_v4_doi_free(doi_def);
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD
+ * message and add it to the CIPSO V4 engine. Return zero on success and
+ * non-zero on error.
+ *
+ */
+static int netlbl_cipsov4_add_local(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct cipso_v4_doi *doi_def = NULL;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_TAGLST])
+ return -EINVAL;
+
+ doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+ if (doi_def == NULL)
+ return -ENOMEM;
+ doi_def->type = CIPSO_V4_MAP_LOCAL;
+
+ ret_val = netlbl_cipsov4_add_common(info, doi_def);
+ if (ret_val != 0)
+ goto add_local_failure;
+
+ ret_val = cipso_v4_doi_add(doi_def, audit_info);
+ if (ret_val != 0)
+ goto add_local_failure;
+ return 0;
+
+add_local_failure:
+ cipso_v4_doi_free(doi_def);
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Create a new DOI definition based on the given ADD message and add it to the
+ * CIPSO V4 engine. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
+
+{
+ int ret_val = -EINVAL;
+ struct netlbl_audit audit_info;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_DOI] ||
+ !info->attrs[NLBL_CIPSOV4_A_MTYPE])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) {
+ case CIPSO_V4_MAP_TRANS:
+ ret_val = netlbl_cipsov4_add_std(info, &audit_info);
+ break;
+ case CIPSO_V4_MAP_PASS:
+ ret_val = netlbl_cipsov4_add_pass(info, &audit_info);
+ break;
+ case CIPSO_V4_MAP_LOCAL:
+ ret_val = netlbl_cipsov4_add_local(info, &audit_info);
+ break;
+ }
+ if (ret_val == 0)
+ atomic_inc(&netlabel_mgmt_protocount);
+
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond accordingly. While the
+ * response message generated by the kernel is straightforward, determining
+ * before hand the size of the buffer to allocate is not (we have to generate
+ * the message to know the size). In order to keep this function sane what we
+ * do is allocate a buffer of NLMSG_GOODSIZE and try to fit the response in
+ * that size, if we fail then we restart with a larger buffer and try again.
+ * We continue in this manner until we hit a limit of failed attempts then we
+ * give up and just send an error message. Returns zero on success and
+ * negative values on error.
+ *
+ */
+static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val;
+ struct sk_buff *ans_skb = NULL;
+ u32 nlsze_mult = 1;
+ void *data;
+ u32 doi;
+ struct nlattr *nla_a;
+ struct nlattr *nla_b;
+ struct cipso_v4_doi *doi_def;
+ u32 iter;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_DOI]) {
+ ret_val = -EINVAL;
+ goto list_failure;
+ }
+
+list_start:
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE * nlsze_mult, GFP_KERNEL);
+ if (ans_skb == NULL) {
+ ret_val = -ENOMEM;
+ goto list_failure;
+ }
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_cipsov4_gnl_family,
+ 0, NLBL_CIPSOV4_C_LIST);
+ if (data == NULL) {
+ ret_val = -ENOMEM;
+ goto list_failure;
+ }
+
+ doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+
+ rcu_read_lock();
+ doi_def = cipso_v4_doi_getdef(doi);
+ if (doi_def == NULL) {
+ ret_val = -EINVAL;
+ goto list_failure_lock;
+ }
+
+ ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type);
+ if (ret_val != 0)
+ goto list_failure_lock;
+
+ nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_TAGLST);
+ if (nla_a == NULL) {
+ ret_val = -ENOMEM;
+ goto list_failure_lock;
+ }
+ for (iter = 0;
+ iter < CIPSO_V4_TAG_MAXCNT &&
+ doi_def->tags[iter] != CIPSO_V4_TAG_INVALID;
+ iter++) {
+ ret_val = nla_put_u8(ans_skb,
+ NLBL_CIPSOV4_A_TAG,
+ doi_def->tags[iter]);
+ if (ret_val != 0)
+ goto list_failure_lock;
+ }
+ nla_nest_end(ans_skb, nla_a);
+
+ switch (doi_def->type) {
+ case CIPSO_V4_MAP_TRANS:
+ nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST);
+ if (nla_a == NULL) {
+ ret_val = -ENOMEM;
+ goto list_failure_lock;
+ }
+ for (iter = 0;
+ iter < doi_def->map.std->lvl.local_size;
+ iter++) {
+ if (doi_def->map.std->lvl.local[iter] ==
+ CIPSO_V4_INV_LVL)
+ continue;
+
+ nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVL);
+ if (nla_b == NULL) {
+ ret_val = -ENOMEM;
+ goto list_retry;
+ }
+ ret_val = nla_put_u32(ans_skb,
+ NLBL_CIPSOV4_A_MLSLVLLOC,
+ iter);
+ if (ret_val != 0)
+ goto list_retry;
+ ret_val = nla_put_u32(ans_skb,
+ NLBL_CIPSOV4_A_MLSLVLREM,
+ doi_def->map.std->lvl.local[iter]);
+ if (ret_val != 0)
+ goto list_retry;
+ nla_nest_end(ans_skb, nla_b);
+ }
+ nla_nest_end(ans_skb, nla_a);
+
+ nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCATLST);
+ if (nla_a == NULL) {
+ ret_val = -ENOMEM;
+ goto list_retry;
+ }
+ for (iter = 0;
+ iter < doi_def->map.std->cat.local_size;
+ iter++) {
+ if (doi_def->map.std->cat.local[iter] ==
+ CIPSO_V4_INV_CAT)
+ continue;
+
+ nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCAT);
+ if (nla_b == NULL) {
+ ret_val = -ENOMEM;
+ goto list_retry;
+ }
+ ret_val = nla_put_u32(ans_skb,
+ NLBL_CIPSOV4_A_MLSCATLOC,
+ iter);
+ if (ret_val != 0)
+ goto list_retry;
+ ret_val = nla_put_u32(ans_skb,
+ NLBL_CIPSOV4_A_MLSCATREM,
+ doi_def->map.std->cat.local[iter]);
+ if (ret_val != 0)
+ goto list_retry;
+ nla_nest_end(ans_skb, nla_b);
+ }
+ nla_nest_end(ans_skb, nla_a);
+
+ break;
+ }
+ rcu_read_unlock();
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+list_retry:
+ /* XXX - this limit is a guesstimate */
+ if (nlsze_mult < 4) {
+ rcu_read_unlock();
+ kfree_skb(ans_skb);
+ nlsze_mult *= 2;
+ goto list_start;
+ }
+list_failure_lock:
+ rcu_read_unlock();
+list_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL
+ * @doi_def: the CIPSOv4 DOI definition
+ * @arg: the netlbl_cipsov4_doiwalk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * cipso_v4_doi_walk() function for use in generating a response for a LISTALL
+ * message. Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_cipsov4_doiwalk_arg *cb_arg = arg;
+ void *data;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+ cb_arg->seq, &netlbl_cipsov4_gnl_family,
+ NLM_F_MULTI, NLBL_CIPSOV4_C_LISTALL);
+ if (data == NULL)
+ goto listall_cb_failure;
+
+ ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_DOI, doi_def->doi);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+ ret_val = nla_put_u32(cb_arg->skb,
+ NLBL_CIPSOV4_A_MTYPE,
+ doi_def->type);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+
+ genlmsg_end(cb_arg->skb, data);
+ return 0;
+
+listall_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated LISTALL message and respond accordingly. Returns
+ * zero on success and negative values on error.
+ *
+ */
+static int netlbl_cipsov4_listall(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_cipsov4_doiwalk_arg cb_arg;
+ u32 doi_skip = cb->args[0];
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ cipso_v4_doi_walk(&doi_skip, netlbl_cipsov4_listall_cb, &cb_arg);
+
+ cb->args[0] = doi_skip;
+ return skb->len;
+}
+
+/**
+ * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is intended for use by netlbl_cipsov4_remove() as the callback
+ * for the netlbl_domhsh_walk() function; it removes LSM domain map entries
+ * which are associated with the CIPSO DOI specified in @arg. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+ struct netlbl_domhsh_walk_arg *cb_arg = arg;
+
+ if (entry->def.type == NETLBL_NLTYPE_CIPSOV4 &&
+ entry->def.cipso->doi == cb_arg->doi)
+ return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info);
+
+ return 0;
+}
+
+/**
+ * netlbl_cipsov4_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVE message and respond accordingly. Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -EINVAL;
+ struct netlbl_domhsh_walk_arg cb_arg;
+ struct netlbl_audit audit_info;
+ u32 skip_bkt = 0;
+ u32 skip_chain = 0;
+
+ if (!info->attrs[NLBL_CIPSOV4_A_DOI])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+ cb_arg.audit_info = &audit_info;
+ ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain,
+ netlbl_cipsov4_remove_cb, &cb_arg);
+ if (ret_val == 0 || ret_val == -ENOENT) {
+ ret_val = cipso_v4_doi_remove(cb_arg.doi, &audit_info);
+ if (ret_val == 0)
+ atomic_dec(&netlabel_mgmt_protocount);
+ }
+
+ return ret_val;
+}
+
+/*
+ * NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_cipsov4_ops[] = {
+ {
+ .cmd = NLBL_CIPSOV4_C_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_cipsov4_genl_policy,
+ .doit = netlbl_cipsov4_add,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CIPSOV4_C_REMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_cipsov4_genl_policy,
+ .doit = netlbl_cipsov4_remove,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CIPSOV4_C_LIST,
+ .flags = 0,
+ .policy = netlbl_cipsov4_genl_policy,
+ .doit = netlbl_cipsov4_list,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CIPSOV4_C_LISTALL,
+ .flags = 0,
+ .policy = netlbl_cipsov4_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_cipsov4_listall,
+ },
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_cipsov4_genl_init - Register the CIPSOv4 NetLabel component
+ *
+ * Description:
+ * Register the CIPSOv4 packet NetLabel component with the Generic NETLINK
+ * mechanism. Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_cipsov4_genl_init(void)
+{
+ return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
+ netlbl_cipsov4_ops);
+}
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
new file mode 100644
index 000000000..875826808
--- /dev/null
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -0,0 +1,169 @@
+/*
+ * NetLabel CIPSO/IPv4 Support
+ *
+ * This file defines the CIPSO/IPv4 functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_CIPSO_V4
+#define _NETLABEL_CIPSO_V4
+
+#include <net/netlabel.h>
+
+/*
+ * The following NetLabel payloads are supported by the CIPSO subsystem.
+ *
+ * o ADD:
+ * Sent by an application to add a new DOI mapping table.
+ *
+ * Required attributes:
+ *
+ * NLBL_CIPSOV4_A_DOI
+ * NLBL_CIPSOV4_A_MTYPE
+ * NLBL_CIPSOV4_A_TAGLST
+ *
+ * If using CIPSO_V4_MAP_TRANS the following attributes are required:
+ *
+ * NLBL_CIPSOV4_A_MLSLVLLST
+ * NLBL_CIPSOV4_A_MLSCATLST
+ *
+ * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ * are required.
+ *
+ * o REMOVE:
+ * Sent by an application to remove a specific DOI mapping table from the
+ * CIPSO V4 system.
+ *
+ * Required attributes:
+ *
+ * NLBL_CIPSOV4_A_DOI
+ *
+ * o LIST:
+ * Sent by an application to list the details of a DOI definition. On
+ * success the kernel should send a response using the following format.
+ *
+ * Required attributes:
+ *
+ * NLBL_CIPSOV4_A_DOI
+ *
+ * The valid response message format depends on the type of the DOI mapping,
+ * the defined formats are shown below.
+ *
+ * Required attributes:
+ *
+ * NLBL_CIPSOV4_A_MTYPE
+ * NLBL_CIPSOV4_A_TAGLST
+ *
+ * If using CIPSO_V4_MAP_TRANS the following attributes are required:
+ *
+ * NLBL_CIPSOV4_A_MLSLVLLST
+ * NLBL_CIPSOV4_A_MLSCATLST
+ *
+ * If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ * are required.
+ *
+ * o LISTALL:
+ * This message is sent by an application to list the valid DOIs on the
+ * system. When sent by an application there is no payload and the
+ * NLM_F_DUMP flag should be set. The kernel should respond with a series of
+ * the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_CIPSOV4_A_DOI
+ * NLBL_CIPSOV4_A_MTYPE
+ *
+ */
+
+/* NetLabel CIPSOv4 commands */
+enum {
+ NLBL_CIPSOV4_C_UNSPEC,
+ NLBL_CIPSOV4_C_ADD,
+ NLBL_CIPSOV4_C_REMOVE,
+ NLBL_CIPSOV4_C_LIST,
+ NLBL_CIPSOV4_C_LISTALL,
+ __NLBL_CIPSOV4_C_MAX,
+};
+
+/* NetLabel CIPSOv4 attributes */
+enum {
+ NLBL_CIPSOV4_A_UNSPEC,
+ NLBL_CIPSOV4_A_DOI,
+ /* (NLA_U32)
+ * the DOI value */
+ NLBL_CIPSOV4_A_MTYPE,
+ /* (NLA_U32)
+ * the mapping table type (defined in the cipso_ipv4.h header as
+ * CIPSO_V4_MAP_*) */
+ NLBL_CIPSOV4_A_TAG,
+ /* (NLA_U8)
+ * a CIPSO tag type, meant to be used within a NLBL_CIPSOV4_A_TAGLST
+ * attribute */
+ NLBL_CIPSOV4_A_TAGLST,
+ /* (NLA_NESTED)
+ * the CIPSO tag list for the DOI, there must be at least one
+ * NLBL_CIPSOV4_A_TAG attribute, tags listed first are given higher
+ * priorirty when sending packets */
+ NLBL_CIPSOV4_A_MLSLVLLOC,
+ /* (NLA_U32)
+ * the local MLS sensitivity level */
+ NLBL_CIPSOV4_A_MLSLVLREM,
+ /* (NLA_U32)
+ * the remote MLS sensitivity level */
+ NLBL_CIPSOV4_A_MLSLVL,
+ /* (NLA_NESTED)
+ * a MLS sensitivity level mapping, must contain only one attribute of
+ * each of the following types: NLBL_CIPSOV4_A_MLSLVLLOC and
+ * NLBL_CIPSOV4_A_MLSLVLREM */
+ NLBL_CIPSOV4_A_MLSLVLLST,
+ /* (NLA_NESTED)
+ * the CIPSO level mappings, there must be at least one
+ * NLBL_CIPSOV4_A_MLSLVL attribute */
+ NLBL_CIPSOV4_A_MLSCATLOC,
+ /* (NLA_U32)
+ * the local MLS category */
+ NLBL_CIPSOV4_A_MLSCATREM,
+ /* (NLA_U32)
+ * the remote MLS category */
+ NLBL_CIPSOV4_A_MLSCAT,
+ /* (NLA_NESTED)
+ * a MLS category mapping, must contain only one attribute of each of
+ * the following types: NLBL_CIPSOV4_A_MLSCATLOC and
+ * NLBL_CIPSOV4_A_MLSCATREM */
+ NLBL_CIPSOV4_A_MLSCATLST,
+ /* (NLA_NESTED)
+ * the CIPSO category mappings, there must be at least one
+ * NLBL_CIPSOV4_A_MLSCAT attribute */
+ __NLBL_CIPSOV4_A_MAX,
+};
+#define NLBL_CIPSOV4_A_MAX (__NLBL_CIPSOV4_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_cipsov4_genl_init(void);
+
+/* Free the memory associated with a CIPSOv4 DOI definition */
+void netlbl_cipsov4_doi_free(struct rcu_head *entry);
+
+#endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
new file mode 100644
index 000000000..f0cb92f3d
--- /dev/null
+++ b/net/netlabel/netlabel_domainhash.c
@@ -0,0 +1,790 @@
+/*
+ * NetLabel Domain Hash Table
+ *
+ * This file manages the domain hash table that NetLabel uses to determine
+ * which network labeling protocol to use for a given domain. The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/bug.h>
+
+#include "netlabel_mgmt.h"
+#include "netlabel_addrlist.h"
+#include "netlabel_domainhash.h"
+#include "netlabel_user.h"
+
+struct netlbl_domhsh_tbl {
+ struct list_head *tbl;
+ u32 size;
+};
+
+/* Domain hash table */
+/* updates should be so rare that having one spinlock for the entire hash table
+ * should be okay */
+static DEFINE_SPINLOCK(netlbl_domhsh_lock);
+#define netlbl_domhsh_rcu_deref(p) \
+ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
+static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
+static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
+
+/*
+ * Domain Hash Table Helper Functions
+ */
+
+/**
+ * netlbl_domhsh_free_entry - Frees a domain hash table entry
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to a hash table entry can be released
+ * safely.
+ *
+ */
+static void netlbl_domhsh_free_entry(struct rcu_head *entry)
+{
+ struct netlbl_dom_map *ptr;
+ struct netlbl_af4list *iter4;
+ struct netlbl_af4list *tmp4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
+
+ ptr = container_of(entry, struct netlbl_dom_map, rcu);
+ if (ptr->def.type == NETLBL_NLTYPE_ADDRSELECT) {
+ netlbl_af4list_foreach_safe(iter4, tmp4,
+ &ptr->def.addrsel->list4) {
+ netlbl_af4list_remove_entry(iter4);
+ kfree(netlbl_domhsh_addr4_entry(iter4));
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_safe(iter6, tmp6,
+ &ptr->def.addrsel->list6) {
+ netlbl_af6list_remove_entry(iter6);
+ kfree(netlbl_domhsh_addr6_entry(iter6));
+ }
+#endif /* IPv6 */
+ }
+ kfree(ptr->domain);
+ kfree(ptr);
+}
+
+/**
+ * netlbl_domhsh_hash - Hashing function for the domain hash table
+ * @domain: the domain name to hash
+ *
+ * Description:
+ * This is the hashing function for the domain hash table, it returns the
+ * correct bucket number for the domain. The caller is responsible for
+ * ensuring that the hash table is protected with either a RCU read lock or the
+ * hash table lock.
+ *
+ */
+static u32 netlbl_domhsh_hash(const char *key)
+{
+ u32 iter;
+ u32 val;
+ u32 len;
+
+ /* This is taken (with slight modification) from
+ * security/selinux/ss/symtab.c:symhash() */
+
+ for (iter = 0, val = 0, len = strlen(key); iter < len; iter++)
+ val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter];
+ return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
+}
+
+/**
+ * netlbl_domhsh_search - Search for a domain entry
+ * @domain: the domain
+ *
+ * Description:
+ * Searches the domain hash table and returns a pointer to the hash table
+ * entry if found, otherwise NULL is returned. The caller is responsible for
+ * ensuring that the hash table is protected with either a RCU read lock or the
+ * hash table lock.
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
+{
+ u32 bkt;
+ struct list_head *bkt_list;
+ struct netlbl_dom_map *iter;
+
+ if (domain != NULL) {
+ bkt = netlbl_domhsh_hash(domain);
+ bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
+ list_for_each_entry_rcu(iter, bkt_list, list)
+ if (iter->valid && strcmp(iter->domain, domain) == 0)
+ return iter;
+ }
+
+ return NULL;
+}
+
+/**
+ * netlbl_domhsh_search_def - Search for a domain entry
+ * @domain: the domain
+ * @def: return default if no match is found
+ *
+ * Description:
+ * Searches the domain hash table and returns a pointer to the hash table
+ * entry if an exact match is found, if an exact match is not present in the
+ * hash table then the default entry is returned if valid otherwise NULL is
+ * returned. The caller is responsible ensuring that the hash table is
+ * protected with either a RCU read lock or the hash table lock.
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+{
+ struct netlbl_dom_map *entry;
+
+ entry = netlbl_domhsh_search(domain);
+ if (entry == NULL) {
+ entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
+ if (entry != NULL && !entry->valid)
+ entry = NULL;
+ }
+
+ return entry;
+}
+
+/**
+ * netlbl_domhsh_audit_add - Generate an audit entry for an add event
+ * @entry: the entry being added
+ * @addr4: the IPv4 address information
+ * @addr6: the IPv6 address information
+ * @result: the result code
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Generate an audit record for adding a new NetLabel/LSM mapping entry with
+ * the given information. Caller is responsible for holding the necessary
+ * locks.
+ *
+ */
+static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
+ struct netlbl_af4list *addr4,
+ struct netlbl_af6list *addr6,
+ int result,
+ struct netlbl_audit *audit_info)
+{
+ struct audit_buffer *audit_buf;
+ struct cipso_v4_doi *cipsov4 = NULL;
+ u32 type;
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
+ if (audit_buf != NULL) {
+ audit_log_format(audit_buf, " nlbl_domain=%s",
+ entry->domain ? entry->domain : "(default)");
+ if (addr4 != NULL) {
+ struct netlbl_domaddr4_map *map4;
+ map4 = netlbl_domhsh_addr4_entry(addr4);
+ type = map4->def.type;
+ cipsov4 = map4->def.cipso;
+ netlbl_af4list_audit_addr(audit_buf, 0, NULL,
+ addr4->addr, addr4->mask);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (addr6 != NULL) {
+ struct netlbl_domaddr6_map *map6;
+ map6 = netlbl_domhsh_addr6_entry(addr6);
+ type = map6->def.type;
+ netlbl_af6list_audit_addr(audit_buf, 0, NULL,
+ &addr6->addr, &addr6->mask);
+#endif /* IPv6 */
+ } else {
+ type = entry->def.type;
+ cipsov4 = entry->def.cipso;
+ }
+ switch (type) {
+ case NETLBL_NLTYPE_UNLABELED:
+ audit_log_format(audit_buf, " nlbl_protocol=unlbl");
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ BUG_ON(cipsov4 == NULL);
+ audit_log_format(audit_buf,
+ " nlbl_protocol=cipsov4 cipso_doi=%u",
+ cipsov4->doi);
+ break;
+ }
+ audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+}
+
+/**
+ * netlbl_domhsh_validate - Validate a new domain mapping entry
+ * @entry: the entry to validate
+ *
+ * This function validates the new domain mapping entry to ensure that it is
+ * a valid entry. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
+{
+ struct netlbl_af4list *iter4;
+ struct netlbl_domaddr4_map *map4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_domaddr6_map *map6;
+#endif /* IPv6 */
+
+ if (entry == NULL)
+ return -EINVAL;
+
+ switch (entry->def.type) {
+ case NETLBL_NLTYPE_UNLABELED:
+ if (entry->def.cipso != NULL || entry->def.addrsel != NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ if (entry->def.cipso == NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_ADDRSELECT:
+ netlbl_af4list_foreach(iter4, &entry->def.addrsel->list4) {
+ map4 = netlbl_domhsh_addr4_entry(iter4);
+ switch (map4->def.type) {
+ case NETLBL_NLTYPE_UNLABELED:
+ if (map4->def.cipso != NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ if (map4->def.cipso == NULL)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach(iter6, &entry->def.addrsel->list6) {
+ map6 = netlbl_domhsh_addr6_entry(iter6);
+ switch (map6->def.type) {
+ case NETLBL_NLTYPE_UNLABELED:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+#endif /* IPv6 */
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Domain Hash Table Functions
+ */
+
+/**
+ * netlbl_domhsh_init - Init for the domain hash
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the domain hash table, should be called only by
+ * netlbl_user_init() during initialization. Returns zero on success, non-zero
+ * values on error.
+ *
+ */
+int __init netlbl_domhsh_init(u32 size)
+{
+ u32 iter;
+ struct netlbl_domhsh_tbl *hsh_tbl;
+
+ if (size == 0)
+ return -EINVAL;
+
+ hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+ if (hsh_tbl == NULL)
+ return -ENOMEM;
+ hsh_tbl->size = 1 << size;
+ hsh_tbl->tbl = kcalloc(hsh_tbl->size,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (hsh_tbl->tbl == NULL) {
+ kfree(hsh_tbl);
+ return -ENOMEM;
+ }
+ for (iter = 0; iter < hsh_tbl->size; iter++)
+ INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+ spin_lock(&netlbl_domhsh_lock);
+ rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
+ spin_unlock(&netlbl_domhsh_lock);
+
+ return 0;
+}
+
+/**
+ * netlbl_domhsh_add - Adds a entry to the domain hash table
+ * @entry: the entry to add
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the domain hash table and handles any updates to the
+ * lower level protocol handler (i.e. CIPSO). Returns zero on success,
+ * negative on failure.
+ *
+ */
+int netlbl_domhsh_add(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = 0;
+ struct netlbl_dom_map *entry_old;
+ struct netlbl_af4list *iter4;
+ struct netlbl_af4list *tmp4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
+
+ ret_val = netlbl_domhsh_validate(entry);
+ if (ret_val != 0)
+ return ret_val;
+
+ /* XXX - we can remove this RCU read lock as the spinlock protects the
+ * entire function, but before we do we need to fixup the
+ * netlbl_af[4,6]list RCU functions to do "the right thing" with
+ * respect to rcu_dereference() when only a spinlock is held. */
+ rcu_read_lock();
+ spin_lock(&netlbl_domhsh_lock);
+ if (entry->domain != NULL)
+ entry_old = netlbl_domhsh_search(entry->domain);
+ else
+ entry_old = netlbl_domhsh_search_def(entry->domain);
+ if (entry_old == NULL) {
+ entry->valid = 1;
+
+ if (entry->domain != NULL) {
+ u32 bkt = netlbl_domhsh_hash(entry->domain);
+ list_add_tail_rcu(&entry->list,
+ &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
+ } else {
+ INIT_LIST_HEAD(&entry->list);
+ rcu_assign_pointer(netlbl_domhsh_def, entry);
+ }
+
+ if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
+ netlbl_af4list_foreach_rcu(iter4,
+ &entry->def.addrsel->list4)
+ netlbl_domhsh_audit_add(entry, iter4, NULL,
+ ret_val, audit_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6,
+ &entry->def.addrsel->list6)
+ netlbl_domhsh_audit_add(entry, NULL, iter6,
+ ret_val, audit_info);
+#endif /* IPv6 */
+ } else
+ netlbl_domhsh_audit_add(entry, NULL, NULL,
+ ret_val, audit_info);
+ } else if (entry_old->def.type == NETLBL_NLTYPE_ADDRSELECT &&
+ entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
+ struct list_head *old_list4;
+ struct list_head *old_list6;
+
+ old_list4 = &entry_old->def.addrsel->list4;
+ old_list6 = &entry_old->def.addrsel->list6;
+
+ /* we only allow the addition of address selectors if all of
+ * the selectors do not exist in the existing domain map */
+ netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4)
+ if (netlbl_af4list_search_exact(iter4->addr,
+ iter4->mask,
+ old_list4)) {
+ ret_val = -EEXIST;
+ goto add_return;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6)
+ if (netlbl_af6list_search_exact(&iter6->addr,
+ &iter6->mask,
+ old_list6)) {
+ ret_val = -EEXIST;
+ goto add_return;
+ }
+#endif /* IPv6 */
+
+ netlbl_af4list_foreach_safe(iter4, tmp4,
+ &entry->def.addrsel->list4) {
+ netlbl_af4list_remove_entry(iter4);
+ iter4->valid = 1;
+ ret_val = netlbl_af4list_add(iter4, old_list4);
+ netlbl_domhsh_audit_add(entry_old, iter4, NULL,
+ ret_val, audit_info);
+ if (ret_val != 0)
+ goto add_return;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_safe(iter6, tmp6,
+ &entry->def.addrsel->list6) {
+ netlbl_af6list_remove_entry(iter6);
+ iter6->valid = 1;
+ ret_val = netlbl_af6list_add(iter6, old_list6);
+ netlbl_domhsh_audit_add(entry_old, NULL, iter6,
+ ret_val, audit_info);
+ if (ret_val != 0)
+ goto add_return;
+ }
+#endif /* IPv6 */
+ } else
+ ret_val = -EINVAL;
+
+add_return:
+ spin_unlock(&netlbl_domhsh_lock);
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
+ * @entry: the entry to add
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new default entry to the domain hash table and handles any updates
+ * to the lower level protocol handler (i.e. CIPSO). Returns zero on success,
+ * negative on failure.
+ *
+ */
+int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info)
+{
+ return netlbl_domhsh_add(entry, audit_info);
+}
+
+/**
+ * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
+ * @entry: the entry to remove
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an entry from the domain hash table and handles any updates to the
+ * lower level protocol handler (i.e. CIPSO). Caller is responsible for
+ * ensuring that the RCU read lock is held. Returns zero on success, negative
+ * on failure.
+ *
+ */
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = 0;
+ struct audit_buffer *audit_buf;
+
+ if (entry == NULL)
+ return -ENOENT;
+
+ spin_lock(&netlbl_domhsh_lock);
+ if (entry->valid) {
+ entry->valid = 0;
+ if (entry != rcu_dereference(netlbl_domhsh_def))
+ list_del_rcu(&entry->list);
+ else
+ RCU_INIT_POINTER(netlbl_domhsh_def, NULL);
+ } else
+ ret_val = -ENOENT;
+ spin_unlock(&netlbl_domhsh_lock);
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
+ if (audit_buf != NULL) {
+ audit_log_format(audit_buf,
+ " nlbl_domain=%s res=%u",
+ entry->domain ? entry->domain : "(default)",
+ ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ if (ret_val == 0) {
+ struct netlbl_af4list *iter4;
+ struct netlbl_domaddr4_map *map4;
+
+ switch (entry->def.type) {
+ case NETLBL_NLTYPE_ADDRSELECT:
+ netlbl_af4list_foreach_rcu(iter4,
+ &entry->def.addrsel->list4) {
+ map4 = netlbl_domhsh_addr4_entry(iter4);
+ cipso_v4_doi_putdef(map4->def.cipso);
+ }
+ /* no need to check the IPv6 list since we currently
+ * support only unlabeled protocols for IPv6 */
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ cipso_v4_doi_putdef(entry->def.cipso);
+ break;
+ }
+ call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
+ }
+
+ return ret_val;
+}
+
+/**
+ * netlbl_domhsh_remove_af4 - Removes an address selector entry
+ * @domain: the domain
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an individual address selector from a domain mapping and potentially
+ * the entire mapping if it is empty. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_domhsh_remove_af4(const char *domain,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ struct netlbl_dom_map *entry_map;
+ struct netlbl_af4list *entry_addr;
+ struct netlbl_af4list *iter4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+#endif /* IPv6 */
+ struct netlbl_domaddr4_map *entry;
+
+ rcu_read_lock();
+
+ if (domain)
+ entry_map = netlbl_domhsh_search(domain);
+ else
+ entry_map = netlbl_domhsh_search_def(domain);
+ if (entry_map == NULL ||
+ entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
+ goto remove_af4_failure;
+
+ spin_lock(&netlbl_domhsh_lock);
+ entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
+ &entry_map->def.addrsel->list4);
+ spin_unlock(&netlbl_domhsh_lock);
+
+ if (entry_addr == NULL)
+ goto remove_af4_failure;
+ netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
+ goto remove_af4_single_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
+ goto remove_af4_single_addr;
+#endif /* IPv6 */
+ /* the domain mapping is empty so remove it from the mapping table */
+ netlbl_domhsh_remove_entry(entry_map, audit_info);
+
+remove_af4_single_addr:
+ rcu_read_unlock();
+ /* yick, we can't use call_rcu here because we don't have a rcu head
+ * pointer but hopefully this should be a rare case so the pause
+ * shouldn't be a problem */
+ synchronize_rcu();
+ entry = netlbl_domhsh_addr4_entry(entry_addr);
+ cipso_v4_doi_putdef(entry->def.cipso);
+ kfree(entry);
+ return 0;
+
+remove_af4_failure:
+ rcu_read_unlock();
+ return -ENOENT;
+}
+
+/**
+ * netlbl_domhsh_remove - Removes an entry from the domain hash table
+ * @domain: the domain to remove
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an entry from the domain hash table and handles any updates to the
+ * lower level protocol handler (i.e. CIPSO). Returns zero on success,
+ * negative on failure.
+ *
+ */
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct netlbl_dom_map *entry;
+
+ rcu_read_lock();
+ if (domain)
+ entry = netlbl_domhsh_search(domain);
+ else
+ entry = netlbl_domhsh_search_def(domain);
+ ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+ rcu_read_unlock();
+
+ return ret_val;
+}
+
+/**
+ * netlbl_domhsh_remove_default - Removes the default entry from the table
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes/resets the default entry for the domain hash table and handles any
+ * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on
+ * success, non-zero on failure.
+ *
+ */
+int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
+{
+ return netlbl_domhsh_remove(NULL, audit_info);
+}
+
+/**
+ * netlbl_domhsh_getentry - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain,
+ * return a pointer to a copy of the entry or NULL. The caller is responsible
+ * for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
+{
+ return netlbl_domhsh_search_def(domain);
+}
+
+/**
+ * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to a copy of the entry or NULL. The caller is
+ * responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
+ __be32 addr)
+{
+ struct netlbl_dom_map *dom_iter;
+ struct netlbl_af4list *addr_iter;
+
+ dom_iter = netlbl_domhsh_search_def(domain);
+ if (dom_iter == NULL)
+ return NULL;
+
+ if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT)
+ return &dom_iter->def;
+ addr_iter = netlbl_af4list_search(addr, &dom_iter->def.addrsel->list4);
+ if (addr_iter == NULL)
+ return NULL;
+ return &(netlbl_domhsh_addr4_entry(addr_iter)->def);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to a copy of the entry or NULL. The caller is
+ * responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
+ const struct in6_addr *addr)
+{
+ struct netlbl_dom_map *dom_iter;
+ struct netlbl_af6list *addr_iter;
+
+ dom_iter = netlbl_domhsh_search_def(domain);
+ if (dom_iter == NULL)
+ return NULL;
+
+ if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT)
+ return &dom_iter->def;
+ addr_iter = netlbl_af6list_search(addr, &dom_iter->def.addrsel->list6);
+ if (addr_iter == NULL)
+ return NULL;
+ return &(netlbl_domhsh_addr6_entry(addr_iter)->def);
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_domhsh_walk - Iterate through the domain mapping hash table
+ * @skip_bkt: the number of buckets to skip at the start
+ * @skip_chain: the number of entries to skip in the first iterated bucket
+ * @callback: callback for each entry
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Interate over the domain mapping hash table, skipping the first @skip_bkt
+ * buckets and @skip_chain entries. For each entry in the table call
+ * @callback, if @callback returns a negative value stop 'walking' through the
+ * table and return. Updates the values in @skip_bkt and @skip_chain on
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_domhsh_walk(u32 *skip_bkt,
+ u32 *skip_chain,
+ int (*callback) (struct netlbl_dom_map *entry, void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 iter_bkt;
+ struct list_head *iter_list;
+ struct netlbl_dom_map *iter_entry;
+ u32 chain_cnt = 0;
+
+ rcu_read_lock();
+ for (iter_bkt = *skip_bkt;
+ iter_bkt < rcu_dereference(netlbl_domhsh)->size;
+ iter_bkt++, chain_cnt = 0) {
+ iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
+ list_for_each_entry_rcu(iter_entry, iter_list, list)
+ if (iter_entry->valid) {
+ if (chain_cnt++ < *skip_chain)
+ continue;
+ ret_val = callback(iter_entry, cb_arg);
+ if (ret_val < 0) {
+ chain_cnt--;
+ goto walk_return;
+ }
+ }
+ }
+
+walk_return:
+ rcu_read_unlock();
+ *skip_bkt = iter_bkt;
+ *skip_chain = chain_cnt;
+ return ret_val;
+}
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
new file mode 100644
index 000000000..680caf4df
--- /dev/null
+++ b/net/netlabel/netlabel_domainhash.h
@@ -0,0 +1,109 @@
+/*
+ * NetLabel Domain Hash Table
+ *
+ * This file manages the domain hash table that NetLabel uses to determine
+ * which network labeling protocol to use for a given domain. The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_DOMAINHASH_H
+#define _NETLABEL_DOMAINHASH_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+
+#include "netlabel_addrlist.h"
+
+/* Domain hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_DOMHSH_BITSIZE 7
+
+/* Domain mapping definition structures */
+struct netlbl_domaddr_map {
+ struct list_head list4;
+ struct list_head list6;
+};
+struct netlbl_dommap_def {
+ u32 type;
+ union {
+ struct netlbl_domaddr_map *addrsel;
+ struct cipso_v4_doi *cipso;
+ };
+};
+#define netlbl_domhsh_addr4_entry(iter) \
+ container_of(iter, struct netlbl_domaddr4_map, list)
+struct netlbl_domaddr4_map {
+ struct netlbl_dommap_def def;
+
+ struct netlbl_af4list list;
+};
+#define netlbl_domhsh_addr6_entry(iter) \
+ container_of(iter, struct netlbl_domaddr6_map, list)
+struct netlbl_domaddr6_map {
+ struct netlbl_dommap_def def;
+
+ struct netlbl_af6list list;
+};
+
+struct netlbl_dom_map {
+ char *domain;
+ struct netlbl_dommap_def def;
+
+ u32 valid;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+/* init function */
+int netlbl_domhsh_init(u32 size);
+
+/* Manipulate the domain hash table */
+int netlbl_domhsh_add(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_af4(const char *domain,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
+struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
+ __be32 addr);
+#if IS_ENABLED(CONFIG_IPV6)
+struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
+ const struct in6_addr *addr);
+#endif /* IPv6 */
+
+int netlbl_domhsh_walk(u32 *skip_bkt,
+ u32 *skip_chain,
+ int (*callback) (struct netlbl_dom_map *entry, void *arg),
+ void *cb_arg);
+
+#endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
new file mode 100644
index 000000000..28cddc85b
--- /dev/null
+++ b/net/netlabel/netlabel_kapi.c
@@ -0,0 +1,1211 @@
+/*
+ * NetLabel Kernel API
+ *
+ * This file defines the kernel API for the NetLabel system. The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/bug.h>
+#include <linux/atomic.h>
+
+#include "netlabel_domainhash.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_user.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_addrlist.h"
+
+/*
+ * Configuration Functions
+ */
+
+/**
+ * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
+ * @domain: the domain mapping to remove
+ * @family: address family
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the
+ * default domain mapping to be removed. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+int netlbl_cfg_map_del(const char *domain,
+ u16 family,
+ const void *addr,
+ const void *mask,
+ struct netlbl_audit *audit_info)
+{
+ if (addr == NULL && mask == NULL) {
+ return netlbl_domhsh_remove(domain, audit_info);
+ } else if (addr != NULL && mask != NULL) {
+ switch (family) {
+ case AF_INET:
+ return netlbl_domhsh_remove_af4(domain, addr, mask,
+ audit_info);
+ default:
+ return -EPFNOSUPPORT;
+ }
+ } else
+ return -EINVAL;
+}
+
+/**
+ * netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping
+ * @domain: the domain mapping to add
+ * @family: address family
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL
+ * causes a new default domain mapping to be added. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_map_add(const char *domain,
+ u16 family,
+ const void *addr,
+ const void *mask,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_dom_map *entry;
+ struct netlbl_domaddr_map *addrmap = NULL;
+ struct netlbl_domaddr4_map *map4 = NULL;
+ struct netlbl_domaddr6_map *map6 = NULL;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ return -ENOMEM;
+ if (domain != NULL) {
+ entry->domain = kstrdup(domain, GFP_ATOMIC);
+ if (entry->domain == NULL)
+ goto cfg_unlbl_map_add_failure;
+ }
+
+ if (addr == NULL && mask == NULL)
+ entry->def.type = NETLBL_NLTYPE_UNLABELED;
+ else if (addr != NULL && mask != NULL) {
+ addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+ if (addrmap == NULL)
+ goto cfg_unlbl_map_add_failure;
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ switch (family) {
+ case AF_INET: {
+ const struct in_addr *addr4 = addr;
+ const struct in_addr *mask4 = mask;
+ map4 = kzalloc(sizeof(*map4), GFP_ATOMIC);
+ if (map4 == NULL)
+ goto cfg_unlbl_map_add_failure;
+ map4->def.type = NETLBL_NLTYPE_UNLABELED;
+ map4->list.addr = addr4->s_addr & mask4->s_addr;
+ map4->list.mask = mask4->s_addr;
+ map4->list.valid = 1;
+ ret_val = netlbl_af4list_add(&map4->list,
+ &addrmap->list4);
+ if (ret_val != 0)
+ goto cfg_unlbl_map_add_failure;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ const struct in6_addr *addr6 = addr;
+ const struct in6_addr *mask6 = mask;
+ map6 = kzalloc(sizeof(*map6), GFP_ATOMIC);
+ if (map6 == NULL)
+ goto cfg_unlbl_map_add_failure;
+ map6->def.type = NETLBL_NLTYPE_UNLABELED;
+ map6->list.addr = *addr6;
+ map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0];
+ map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1];
+ map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2];
+ map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3];
+ map6->list.mask = *mask6;
+ map6->list.valid = 1;
+ ret_val = netlbl_af6list_add(&map6->list,
+ &addrmap->list6);
+ if (ret_val != 0)
+ goto cfg_unlbl_map_add_failure;
+ break;
+ }
+#endif /* IPv6 */
+ default:
+ goto cfg_unlbl_map_add_failure;
+ }
+
+ entry->def.addrsel = addrmap;
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ } else {
+ ret_val = -EINVAL;
+ goto cfg_unlbl_map_add_failure;
+ }
+
+ ret_val = netlbl_domhsh_add(entry, audit_info);
+ if (ret_val != 0)
+ goto cfg_unlbl_map_add_failure;
+
+ return 0;
+
+cfg_unlbl_map_add_failure:
+ kfree(entry->domain);
+ kfree(entry);
+ kfree(addrmap);
+ kfree(map4);
+ kfree(map6);
+ return ret_val;
+}
+
+
+/**
+ * netlbl_cfg_unlbl_static_add - Adds a new static label
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order (struct in[6]_addr)
+ * @mask: address mask in network byte order (struct in[6]_addr)
+ * @family: address family
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new NetLabel static label to be used when protocol provided labels
+ * are not present on incoming traffic. If @dev_name is NULL then the default
+ * interface will be used. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_static_add(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u16 family,
+ u32 secid,
+ struct netlbl_audit *audit_info)
+{
+ u32 addr_len;
+
+ switch (family) {
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+#endif /* IPv6 */
+ default:
+ return -EPFNOSUPPORT;
+ }
+
+ return netlbl_unlhsh_add(net,
+ dev_name, addr, mask, addr_len,
+ secid, audit_info);
+}
+
+/**
+ * netlbl_cfg_unlbl_static_del - Removes an existing static label
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order (struct in[6]_addr)
+ * @mask: address mask in network byte order (struct in[6]_addr)
+ * @family: address family
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing NetLabel static label used when protocol provided labels
+ * are not present on incoming traffic. If @dev_name is NULL then the default
+ * interface will be used. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_static_del(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u16 family,
+ struct netlbl_audit *audit_info)
+{
+ u32 addr_len;
+
+ switch (family) {
+ case AF_INET:
+ addr_len = sizeof(struct in_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ addr_len = sizeof(struct in6_addr);
+ break;
+#endif /* IPv6 */
+ default:
+ return -EPFNOSUPPORT;
+ }
+
+ return netlbl_unlhsh_remove(net,
+ dev_name, addr, mask, addr_len,
+ audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
+ * @doi_def: CIPSO DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CIPSO DOI definition as defined by @doi_def. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ return cipso_v4_doi_add(doi_def, audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
+ * @doi: CIPSO DOI
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an existing CIPSO DOI definition matching @doi. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
+{
+ cipso_v4_doi_remove(doi, audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping
+ * @doi: the CIPSO DOI
+ * @domain: the domain mapping to add
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel
+ * subsystem. A @domain value of NULL adds a new default domain mapping.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_cipsov4_map_add(u32 doi,
+ const char *domain,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMEM;
+ struct cipso_v4_doi *doi_def;
+ struct netlbl_dom_map *entry;
+ struct netlbl_domaddr_map *addrmap = NULL;
+ struct netlbl_domaddr4_map *addrinfo = NULL;
+
+ doi_def = cipso_v4_doi_getdef(doi);
+ if (doi_def == NULL)
+ return -ENOENT;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ goto out_entry;
+ if (domain != NULL) {
+ entry->domain = kstrdup(domain, GFP_ATOMIC);
+ if (entry->domain == NULL)
+ goto out_domain;
+ }
+
+ if (addr == NULL && mask == NULL) {
+ entry->def.cipso = doi_def;
+ entry->def.type = NETLBL_NLTYPE_CIPSOV4;
+ } else if (addr != NULL && mask != NULL) {
+ addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+ if (addrmap == NULL)
+ goto out_addrmap;
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
+ if (addrinfo == NULL)
+ goto out_addrinfo;
+ addrinfo->def.cipso = doi_def;
+ addrinfo->def.type = NETLBL_NLTYPE_CIPSOV4;
+ addrinfo->list.addr = addr->s_addr & mask->s_addr;
+ addrinfo->list.mask = mask->s_addr;
+ addrinfo->list.valid = 1;
+ ret_val = netlbl_af4list_add(&addrinfo->list, &addrmap->list4);
+ if (ret_val != 0)
+ goto cfg_cipsov4_map_add_failure;
+
+ entry->def.addrsel = addrmap;
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ } else {
+ ret_val = -EINVAL;
+ goto out_addrmap;
+ }
+
+ ret_val = netlbl_domhsh_add(entry, audit_info);
+ if (ret_val != 0)
+ goto cfg_cipsov4_map_add_failure;
+
+ return 0;
+
+cfg_cipsov4_map_add_failure:
+ kfree(addrinfo);
+out_addrinfo:
+ kfree(addrmap);
+out_addrmap:
+ kfree(entry->domain);
+out_domain:
+ kfree(entry);
+out_entry:
+ cipso_v4_doi_putdef(doi_def);
+ return ret_val;
+}
+
+/*
+ * Security Attribute Functions
+ */
+
+#define _CM_F_NONE 0x00000000
+#define _CM_F_ALLOC 0x00000001
+#define _CM_F_WALK 0x00000002
+
+/**
+ * _netlbl_catmap_getnode - Get a individual node from a catmap
+ * @catmap: pointer to the category bitmap
+ * @offset: the requested offset
+ * @cm_flags: catmap flags, see _CM_F_*
+ * @gfp_flags: memory allocation flags
+ *
+ * Description:
+ * Iterate through the catmap looking for the node associated with @offset.
+ * If the _CM_F_ALLOC flag is set in @cm_flags and there is no associated node,
+ * one will be created and inserted into the catmap. If the _CM_F_WALK flag is
+ * set in @cm_flags and there is no associated node, the next highest node will
+ * be returned. Returns a pointer to the node on success, NULL on failure.
+ *
+ */
+static struct netlbl_lsm_catmap *_netlbl_catmap_getnode(
+ struct netlbl_lsm_catmap **catmap,
+ u32 offset,
+ unsigned int cm_flags,
+ gfp_t gfp_flags)
+{
+ struct netlbl_lsm_catmap *iter = *catmap;
+ struct netlbl_lsm_catmap *prev = NULL;
+
+ if (iter == NULL)
+ goto catmap_getnode_alloc;
+ if (offset < iter->startbit)
+ goto catmap_getnode_walk;
+ while (iter && offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) {
+ prev = iter;
+ iter = iter->next;
+ }
+ if (iter == NULL || offset < iter->startbit)
+ goto catmap_getnode_walk;
+
+ return iter;
+
+catmap_getnode_walk:
+ if (cm_flags & _CM_F_WALK)
+ return iter;
+catmap_getnode_alloc:
+ if (!(cm_flags & _CM_F_ALLOC))
+ return NULL;
+
+ iter = netlbl_catmap_alloc(gfp_flags);
+ if (iter == NULL)
+ return NULL;
+ iter->startbit = offset & ~(NETLBL_CATMAP_SIZE - 1);
+
+ if (prev == NULL) {
+ iter->next = *catmap;
+ *catmap = iter;
+ } else {
+ iter->next = prev->next;
+ prev->next = iter;
+ }
+
+ return iter;
+}
+
+/**
+ * netlbl_catmap_walk - Walk a LSM secattr catmap looking for a bit
+ * @catmap: the category bitmap
+ * @offset: the offset to start searching at, in bits
+ *
+ * Description:
+ * This function walks a LSM secattr category bitmap starting at @offset and
+ * returns the spot of the first set bit or -ENOENT if no bits are set.
+ *
+ */
+int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
+{
+ struct netlbl_lsm_catmap *iter = catmap;
+ u32 idx;
+ u32 bit;
+ NETLBL_CATMAP_MAPTYPE bitmap;
+
+ iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
+ if (iter == NULL)
+ return -ENOENT;
+ if (offset > iter->startbit) {
+ offset -= iter->startbit;
+ idx = offset / NETLBL_CATMAP_MAPSIZE;
+ bit = offset % NETLBL_CATMAP_MAPSIZE;
+ } else {
+ idx = 0;
+ bit = 0;
+ }
+ bitmap = iter->bitmap[idx] >> bit;
+
+ for (;;) {
+ if (bitmap != 0) {
+ while ((bitmap & NETLBL_CATMAP_BIT) == 0) {
+ bitmap >>= 1;
+ bit++;
+ }
+ return iter->startbit +
+ (NETLBL_CATMAP_MAPSIZE * idx) + bit;
+ }
+ if (++idx >= NETLBL_CATMAP_MAPCNT) {
+ if (iter->next != NULL) {
+ iter = iter->next;
+ idx = 0;
+ } else
+ return -ENOENT;
+ }
+ bitmap = iter->bitmap[idx];
+ bit = 0;
+ }
+
+ return -ENOENT;
+}
+
+/**
+ * netlbl_catmap_walkrng - Find the end of a string of set bits
+ * @catmap: the category bitmap
+ * @offset: the offset to start searching at, in bits
+ *
+ * Description:
+ * This function walks a LSM secattr category bitmap starting at @offset and
+ * returns the spot of the first cleared bit or -ENOENT if the offset is past
+ * the end of the bitmap.
+ *
+ */
+int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset)
+{
+ struct netlbl_lsm_catmap *iter;
+ struct netlbl_lsm_catmap *prev = NULL;
+ u32 idx;
+ u32 bit;
+ NETLBL_CATMAP_MAPTYPE bitmask;
+ NETLBL_CATMAP_MAPTYPE bitmap;
+
+ iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
+ if (iter == NULL)
+ return -ENOENT;
+ if (offset > iter->startbit) {
+ offset -= iter->startbit;
+ idx = offset / NETLBL_CATMAP_MAPSIZE;
+ bit = offset % NETLBL_CATMAP_MAPSIZE;
+ } else {
+ idx = 0;
+ bit = 0;
+ }
+ bitmask = NETLBL_CATMAP_BIT << bit;
+
+ for (;;) {
+ bitmap = iter->bitmap[idx];
+ while (bitmask != 0 && (bitmap & bitmask) != 0) {
+ bitmask <<= 1;
+ bit++;
+ }
+
+ if (prev && idx == 0 && bit == 0)
+ return prev->startbit + NETLBL_CATMAP_SIZE - 1;
+ else if (bitmask != 0)
+ return iter->startbit +
+ (NETLBL_CATMAP_MAPSIZE * idx) + bit - 1;
+ else if (++idx >= NETLBL_CATMAP_MAPCNT) {
+ if (iter->next == NULL)
+ return iter->startbit + NETLBL_CATMAP_SIZE - 1;
+ prev = iter;
+ iter = iter->next;
+ idx = 0;
+ }
+ bitmask = NETLBL_CATMAP_BIT;
+ bit = 0;
+ }
+
+ return -ENOENT;
+}
+
+/**
+ * netlbl_catmap_getlong - Export an unsigned long bitmap
+ * @catmap: pointer to the category bitmap
+ * @offset: pointer to the requested offset
+ * @bitmap: the exported bitmap
+ *
+ * Description:
+ * Export a bitmap with an offset greater than or equal to @offset and return
+ * it in @bitmap. The @offset must be aligned to an unsigned long and will be
+ * updated on return if different from what was requested; if the catmap is
+ * empty at the requested offset and beyond, the @offset is set to (u32)-1.
+ * Returns zero on sucess, negative values on failure.
+ *
+ */
+int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
+ u32 *offset,
+ unsigned long *bitmap)
+{
+ struct netlbl_lsm_catmap *iter;
+ u32 off = *offset;
+ u32 idx;
+
+ /* only allow aligned offsets */
+ if ((off & (BITS_PER_LONG - 1)) != 0)
+ return -EINVAL;
+
+ if (off < catmap->startbit) {
+ off = catmap->startbit;
+ *offset = off;
+ }
+ iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_NONE, 0);
+ if (iter == NULL) {
+ *offset = (u32)-1;
+ return 0;
+ }
+
+ if (off < iter->startbit) {
+ off = iter->startbit;
+ *offset = off;
+ } else
+ off -= iter->startbit;
+
+ idx = off / NETLBL_CATMAP_MAPSIZE;
+ *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_SIZE);
+
+ return 0;
+}
+
+/**
+ * netlbl_catmap_setbit - Set a bit in a LSM secattr catmap
+ * @catmap: pointer to the category bitmap
+ * @bit: the bit to set
+ * @flags: memory allocation flags
+ *
+ * Description:
+ * Set the bit specified by @bit in @catmap. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
+ u32 bit,
+ gfp_t flags)
+{
+ struct netlbl_lsm_catmap *iter;
+ u32 idx;
+
+ iter = _netlbl_catmap_getnode(catmap, bit, _CM_F_ALLOC, flags);
+ if (iter == NULL)
+ return -ENOMEM;
+
+ bit -= iter->startbit;
+ idx = bit / NETLBL_CATMAP_MAPSIZE;
+ iter->bitmap[idx] |= NETLBL_CATMAP_BIT << (bit % NETLBL_CATMAP_MAPSIZE);
+
+ return 0;
+}
+
+/**
+ * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
+ * @catmap: pointer to the category bitmap
+ * @start: the starting bit
+ * @end: the last bit in the string
+ * @flags: memory allocation flags
+ *
+ * Description:
+ * Set a range of bits, starting at @start and ending with @end. Returns zero
+ * on success, negative values on failure.
+ *
+ */
+int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
+ u32 start,
+ u32 end,
+ gfp_t flags)
+{
+ int rc = 0;
+ u32 spot = start;
+
+ while (rc == 0 && spot <= end) {
+ if (((spot & (BITS_PER_LONG - 1)) != 0) &&
+ ((end - spot) > BITS_PER_LONG)) {
+ rc = netlbl_catmap_setlong(catmap,
+ spot,
+ (unsigned long)-1,
+ flags);
+ spot += BITS_PER_LONG;
+ } else
+ rc = netlbl_catmap_setbit(catmap, spot++, flags);
+ }
+
+ return rc;
+}
+
+/**
+ * netlbl_catmap_setlong - Import an unsigned long bitmap
+ * @catmap: pointer to the category bitmap
+ * @offset: offset to the start of the imported bitmap
+ * @bitmap: the bitmap to import
+ * @flags: memory allocation flags
+ *
+ * Description:
+ * Import the bitmap specified in @bitmap into @catmap, using the offset
+ * in @offset. The offset must be aligned to an unsigned long. Returns zero
+ * on success, negative values on failure.
+ *
+ */
+int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
+ u32 offset,
+ unsigned long bitmap,
+ gfp_t flags)
+{
+ struct netlbl_lsm_catmap *iter;
+ u32 idx;
+
+ /* only allow aligned offsets */
+ if ((offset & (BITS_PER_LONG - 1)) != 0)
+ return -EINVAL;
+
+ iter = _netlbl_catmap_getnode(catmap, offset, _CM_F_ALLOC, flags);
+ if (iter == NULL)
+ return -ENOMEM;
+
+ offset -= iter->startbit;
+ idx = offset / NETLBL_CATMAP_MAPSIZE;
+ iter->bitmap[idx] |= bitmap << (offset % NETLBL_CATMAP_MAPSIZE);
+
+ return 0;
+}
+
+/*
+ * LSM Functions
+ */
+
+/**
+ * netlbl_enabled - Determine if the NetLabel subsystem is enabled
+ *
+ * Description:
+ * The LSM can use this function to determine if it should use NetLabel
+ * security attributes in it's enforcement mechanism. Currently, NetLabel is
+ * considered to be enabled when it's configuration contains a valid setup for
+ * at least one labeled protocol (i.e. NetLabel can understand incoming
+ * labeled packets of at least one type); otherwise NetLabel is considered to
+ * be disabled.
+ *
+ */
+int netlbl_enabled(void)
+{
+ /* At some point we probably want to expose this mechanism to the user
+ * as well so that admins can toggle NetLabel regardless of the
+ * configuration */
+ return (atomic_read(&netlabel_mgmt_protocount) > 0);
+}
+
+/**
+ * netlbl_sock_setattr - Label a socket using the correct protocol
+ * @sk: the socket to label
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given socket using the security attributes
+ * specified in @secattr. This function requires exclusive access to @sk,
+ * which means it either needs to be in the process of being created or locked.
+ * Returns zero on success, -EDESTADDRREQ if the domain is configured to use
+ * network address selectors (can't blindly label the socket), and negative
+ * values on all other failures.
+ *
+ */
+int netlbl_sock_setattr(struct sock *sk,
+ u16 family,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct netlbl_dom_map *dom_entry;
+
+ rcu_read_lock();
+ dom_entry = netlbl_domhsh_getentry(secattr->domain);
+ if (dom_entry == NULL) {
+ ret_val = -ENOENT;
+ goto socket_setattr_return;
+ }
+ switch (family) {
+ case AF_INET:
+ switch (dom_entry->def.type) {
+ case NETLBL_NLTYPE_ADDRSELECT:
+ ret_val = -EDESTADDRREQ;
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = cipso_v4_sock_setattr(sk,
+ dom_entry->def.cipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ /* since we don't support any IPv6 labeling protocols right
+ * now we can optimize everything away until we do */
+ ret_val = 0;
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EPROTONOSUPPORT;
+ }
+
+socket_setattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * netlbl_sock_delattr - Delete all the NetLabel labels on a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Remove all the NetLabel labeling from @sk. The caller is responsible for
+ * ensuring that @sk is locked.
+ *
+ */
+void netlbl_sock_delattr(struct sock *sk)
+{
+ cipso_v4_sock_delattr(sk);
+}
+
+/**
+ * netlbl_sock_getattr - Determine the security attributes of a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Examines the given sock to see if any NetLabel style labeling has been
+ * applied to the sock, if so it parses the socket label and returns the
+ * security attributes in @secattr. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_sock_getattr(struct sock *sk,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ ret_val = cipso_v4_sock_getattr(sk, secattr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ret_val = -ENOMSG;
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EPROTONOSUPPORT;
+ }
+
+ return ret_val;
+}
+
+/**
+ * netlbl_conn_setattr - Label a connected socket using the correct protocol
+ * @sk: the socket to label
+ * @addr: the destination address
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given connected socket using the security
+ * attributes specified in @secattr. The caller is responsible for ensuring
+ * that @sk is locked. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_conn_setattr(struct sock *sk,
+ struct sockaddr *addr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct sockaddr_in *addr4;
+ struct netlbl_dommap_def *entry;
+
+ rcu_read_lock();
+ switch (addr->sa_family) {
+ case AF_INET:
+ addr4 = (struct sockaddr_in *)addr;
+ entry = netlbl_domhsh_getentry_af4(secattr->domain,
+ addr4->sin_addr.s_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto conn_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = cipso_v4_sock_setattr(sk,
+ entry->cipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ cipso_v4_sock_delattr(sk);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ /* since we don't support any IPv6 labeling protocols right
+ * now we can optimize everything away until we do */
+ ret_val = 0;
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EPROTONOSUPPORT;
+ }
+
+conn_setattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * netlbl_req_setattr - Label a request socket using the correct protocol
+ * @req: the request socket to label
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given socket using the security attributes
+ * specified in @secattr. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_req_setattr(struct request_sock *req,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct netlbl_dommap_def *entry;
+
+ rcu_read_lock();
+ switch (req->rsk_ops->family) {
+ case AF_INET:
+ entry = netlbl_domhsh_getentry_af4(secattr->domain,
+ inet_rsk(req)->ir_rmt_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto req_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = cipso_v4_req_setattr(req,
+ entry->cipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ cipso_v4_req_delattr(req);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ /* since we don't support any IPv6 labeling protocols right
+ * now we can optimize everything away until we do */
+ ret_val = 0;
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EPROTONOSUPPORT;
+ }
+
+req_setattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+* netlbl_req_delattr - Delete all the NetLabel labels on a socket
+* @req: the socket
+*
+* Description:
+* Remove all the NetLabel labeling from @req.
+*
+*/
+void netlbl_req_delattr(struct request_sock *req)
+{
+ cipso_v4_req_delattr(req);
+}
+
+/**
+ * netlbl_skbuff_setattr - Label a packet using the correct protocol
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given packet using the security attributes
+ * specified in @secattr. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_skbuff_setattr(struct sk_buff *skb,
+ u16 family,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct iphdr *hdr4;
+ struct netlbl_dommap_def *entry;
+
+ rcu_read_lock();
+ switch (family) {
+ case AF_INET:
+ hdr4 = ip_hdr(skb);
+ entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto skbuff_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = cipso_v4_skbuff_setattr(skb, entry->cipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ ret_val = cipso_v4_skbuff_delattr(skb);
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ /* since we don't support any IPv6 labeling protocols right
+ * now we can optimize everything away until we do */
+ ret_val = 0;
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EPROTONOSUPPORT;
+ }
+
+skbuff_setattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/**
+ * netlbl_skbuff_getattr - Determine the security attributes of a packet
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Examines the given packet to see if a recognized form of packet labeling
+ * is present, if so it parses the packet label and returns the security
+ * attributes in @secattr. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+int netlbl_skbuff_getattr(const struct sk_buff *skb,
+ u16 family,
+ struct netlbl_lsm_secattr *secattr)
+{
+ unsigned char *ptr;
+
+ switch (family) {
+ case AF_INET:
+ ptr = cipso_v4_optptr(skb);
+ if (ptr && cipso_v4_getattr(ptr, secattr) == 0)
+ return 0;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ break;
+#endif /* IPv6 */
+ }
+
+ return netlbl_unlabel_getattr(skb, family, secattr);
+}
+
+/**
+ * netlbl_skbuff_err - Handle a LSM error on a sk_buff
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: true if host is acting as a gateway, false otherwise
+ *
+ * Description:
+ * Deal with a LSM problem when handling the packet in @skb, typically this is
+ * a permission denied problem (-EACCES). The correct action is determined
+ * according to the packet's labeling protocol.
+ *
+ */
+void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
+{
+ if (cipso_v4_optptr(skb))
+ cipso_v4_error(skb, error, gateway);
+}
+
+/**
+ * netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches
+ *
+ * Description:
+ * For all of the NetLabel protocols that support some form of label mapping
+ * cache, invalidate the cache. Returns zero on success, negative values on
+ * error.
+ *
+ */
+void netlbl_cache_invalidate(void)
+{
+ cipso_v4_cache_invalidate();
+}
+
+/**
+ * netlbl_cache_add - Add an entry to a NetLabel protocol cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add the LSM security attributes for the given packet to the underlying
+ * NetLabel protocol's label mapping cache. Returns zero on success, negative
+ * values on error.
+ *
+ */
+int netlbl_cache_add(const struct sk_buff *skb,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ unsigned char *ptr;
+
+ if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0)
+ return -ENOMSG;
+
+ ptr = cipso_v4_optptr(skb);
+ if (ptr)
+ return cipso_v4_cache_add(ptr, secattr);
+
+ return -ENOMSG;
+}
+
+/*
+ * Protocol Engine Functions
+ */
+
+/**
+ * netlbl_audit_start - Start an audit message
+ * @type: audit message type
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Start an audit message using the type specified in @type and fill the audit
+ * message with some fields common to all NetLabel audit messages. This
+ * function should only be used by protocol engines, not LSMs. Returns a
+ * pointer to the audit buffer on success, NULL on failure.
+ *
+ */
+struct audit_buffer *netlbl_audit_start(int type,
+ struct netlbl_audit *audit_info)
+{
+ return netlbl_audit_start_common(type, audit_info);
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * netlbl_init - Initialize NetLabel
+ *
+ * Description:
+ * Perform the required NetLabel initialization before first use.
+ *
+ */
+static int __init netlbl_init(void)
+{
+ int ret_val;
+
+ printk(KERN_INFO "NetLabel: Initializing\n");
+ printk(KERN_INFO "NetLabel: domain hash size = %u\n",
+ (1 << NETLBL_DOMHSH_BITSIZE));
+ printk(KERN_INFO "NetLabel: protocols ="
+ " UNLABELED"
+ " CIPSOv4"
+ "\n");
+
+ ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
+ if (ret_val != 0)
+ goto init_failure;
+
+ ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
+ if (ret_val != 0)
+ goto init_failure;
+
+ ret_val = netlbl_netlink_init();
+ if (ret_val != 0)
+ goto init_failure;
+
+ ret_val = netlbl_unlabel_defconf();
+ if (ret_val != 0)
+ goto init_failure;
+ printk(KERN_INFO "NetLabel: unlabeled traffic allowed by default\n");
+
+ return 0;
+
+init_failure:
+ panic("NetLabel: failed to initialize properly (%d)\n", ret_val);
+}
+
+subsys_initcall(netlbl_init);
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
new file mode 100644
index 000000000..13f777f20
--- /dev/null
+++ b/net/netlabel/netlabel_mgmt.c
@@ -0,0 +1,778 @@
+/*
+ * NetLabel Management Support
+ *
+ * This file defines the management functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <linux/atomic.h>
+
+#include "netlabel_domainhash.h"
+#include "netlabel_user.h"
+#include "netlabel_mgmt.h"
+
+/* NetLabel configured protocol counter */
+atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
+
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* NetLabel Generic NETLINK CIPSOv4 family */
+static struct genl_family netlbl_mgmt_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_MGMT_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_MGMT_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
+ [NLBL_MGMT_A_DOMAIN] = { .type = NLA_NUL_STRING },
+ [NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 },
+ [NLBL_MGMT_A_VERSION] = { .type = NLA_U32 },
+ [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
+};
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * netlbl_mgmt_add - Handle an ADD message
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Helper function for the ADD and ADDDEF messages to add the domain mappings
+ * from the message to the hash table. See netlabel.h for a description of the
+ * message format. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_add_common(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ struct netlbl_domaddr_map *addrmap = NULL;
+ struct cipso_v4_doi *cipsov4 = NULL;
+ u32 tmp_val;
+ struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+ if (!entry)
+ return -ENOMEM;
+ entry->def.type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
+ if (info->attrs[NLBL_MGMT_A_DOMAIN]) {
+ size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]);
+ entry->domain = kmalloc(tmp_size, GFP_KERNEL);
+ if (entry->domain == NULL) {
+ ret_val = -ENOMEM;
+ goto add_free_entry;
+ }
+ nla_strlcpy(entry->domain,
+ info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
+ }
+
+ /* NOTE: internally we allow/use a entry->def.type value of
+ * NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users
+ * to pass that as a protocol value because we need to know the
+ * "real" protocol */
+
+ switch (entry->def.type) {
+ case NETLBL_NLTYPE_UNLABELED:
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ if (!info->attrs[NLBL_MGMT_A_CV4DOI])
+ goto add_free_domain;
+
+ tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
+ cipsov4 = cipso_v4_doi_getdef(tmp_val);
+ if (cipsov4 == NULL)
+ goto add_free_domain;
+ entry->def.cipso = cipsov4;
+ break;
+ default:
+ goto add_free_domain;
+ }
+
+ if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) {
+ struct in_addr *addr;
+ struct in_addr *mask;
+ struct netlbl_domaddr4_map *map;
+
+ addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+ if (addrmap == NULL) {
+ ret_val = -ENOMEM;
+ goto add_doi_put_def;
+ }
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) !=
+ sizeof(struct in_addr)) {
+ ret_val = -EINVAL;
+ goto add_free_addrmap;
+ }
+ if (nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) !=
+ sizeof(struct in_addr)) {
+ ret_val = -EINVAL;
+ goto add_free_addrmap;
+ }
+ addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]);
+ mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (map == NULL) {
+ ret_val = -ENOMEM;
+ goto add_free_addrmap;
+ }
+ map->list.addr = addr->s_addr & mask->s_addr;
+ map->list.mask = mask->s_addr;
+ map->list.valid = 1;
+ map->def.type = entry->def.type;
+ if (cipsov4)
+ map->def.cipso = cipsov4;
+
+ ret_val = netlbl_af4list_add(&map->list, &addrmap->list4);
+ if (ret_val != 0) {
+ kfree(map);
+ goto add_free_addrmap;
+ }
+
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ entry->def.addrsel = addrmap;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) {
+ struct in6_addr *addr;
+ struct in6_addr *mask;
+ struct netlbl_domaddr6_map *map;
+
+ addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+ if (addrmap == NULL) {
+ ret_val = -ENOMEM;
+ goto add_doi_put_def;
+ }
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) !=
+ sizeof(struct in6_addr)) {
+ ret_val = -EINVAL;
+ goto add_free_addrmap;
+ }
+ if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) !=
+ sizeof(struct in6_addr)) {
+ ret_val = -EINVAL;
+ goto add_free_addrmap;
+ }
+ addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]);
+ mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]);
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (map == NULL) {
+ ret_val = -ENOMEM;
+ goto add_free_addrmap;
+ }
+ map->list.addr = *addr;
+ map->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+ map->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+ map->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+ map->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+ map->list.mask = *mask;
+ map->list.valid = 1;
+ map->def.type = entry->def.type;
+
+ ret_val = netlbl_af6list_add(&map->list, &addrmap->list6);
+ if (ret_val != 0) {
+ kfree(map);
+ goto add_free_addrmap;
+ }
+
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ entry->def.addrsel = addrmap;
+#endif /* IPv6 */
+ }
+
+ ret_val = netlbl_domhsh_add(entry, audit_info);
+ if (ret_val != 0)
+ goto add_free_addrmap;
+
+ return 0;
+
+add_free_addrmap:
+ kfree(addrmap);
+add_doi_put_def:
+ cipso_v4_doi_putdef(cipsov4);
+add_free_domain:
+ kfree(entry->domain);
+add_free_entry:
+ kfree(entry);
+ return ret_val;
+}
+
+/**
+ * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry
+ * @skb: the NETLINK buffer
+ * @entry: the map entry
+ *
+ * Description:
+ * This function is a helper function used by the LISTALL and LISTDEF command
+ * handlers. The caller is responsible for ensuring that the RCU read lock
+ * is held. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_listentry(struct sk_buff *skb,
+ struct netlbl_dom_map *entry)
+{
+ int ret_val = 0;
+ struct nlattr *nla_a;
+ struct nlattr *nla_b;
+ struct netlbl_af4list *iter4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+#endif
+
+ if (entry->domain != NULL) {
+ ret_val = nla_put_string(skb,
+ NLBL_MGMT_A_DOMAIN, entry->domain);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ switch (entry->def.type) {
+ case NETLBL_NLTYPE_ADDRSELECT:
+ nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST);
+ if (nla_a == NULL)
+ return -ENOMEM;
+
+ netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) {
+ struct netlbl_domaddr4_map *map4;
+ struct in_addr addr_struct;
+
+ nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+ if (nla_b == NULL)
+ return -ENOMEM;
+
+ addr_struct.s_addr = iter4->addr;
+ ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4ADDR,
+ addr_struct.s_addr);
+ if (ret_val != 0)
+ return ret_val;
+ addr_struct.s_addr = iter4->mask;
+ ret_val = nla_put_in_addr(skb, NLBL_MGMT_A_IPV4MASK,
+ addr_struct.s_addr);
+ if (ret_val != 0)
+ return ret_val;
+ map4 = netlbl_domhsh_addr4_entry(iter4);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ map4->def.type);
+ if (ret_val != 0)
+ return ret_val;
+ switch (map4->def.type) {
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+ map4->def.cipso->doi);
+ if (ret_val != 0)
+ return ret_val;
+ break;
+ }
+
+ nla_nest_end(skb, nla_b);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) {
+ struct netlbl_domaddr6_map *map6;
+
+ nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+ if (nla_b == NULL)
+ return -ENOMEM;
+
+ ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6ADDR,
+ &iter6->addr);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = nla_put_in6_addr(skb, NLBL_MGMT_A_IPV6MASK,
+ &iter6->mask);
+ if (ret_val != 0)
+ return ret_val;
+ map6 = netlbl_domhsh_addr6_entry(iter6);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ map6->def.type);
+ if (ret_val != 0)
+ return ret_val;
+
+ nla_nest_end(skb, nla_b);
+ }
+#endif /* IPv6 */
+
+ nla_nest_end(skb, nla_a);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ break;
+ case NETLBL_NLTYPE_CIPSOV4:
+ ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+ entry->def.cipso->doi);
+ break;
+ }
+
+ return ret_val;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_mgmt_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated ADD message and add the domains from the message
+ * to the hash table. See netlabel.h for a description of the message format.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlbl_audit audit_info;
+
+ if ((!info->attrs[NLBL_MGMT_A_DOMAIN]) ||
+ (!info->attrs[NLBL_MGMT_A_PROTOCOL]) ||
+ (info->attrs[NLBL_MGMT_A_IPV4ADDR] &&
+ info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+ (info->attrs[NLBL_MGMT_A_IPV4MASK] &&
+ info->attrs[NLBL_MGMT_A_IPV6MASK]) ||
+ ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^
+ (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) ||
+ ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^
+ (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL)))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ return netlbl_mgmt_add_common(info, &audit_info);
+}
+
+/**
+ * netlbl_mgmt_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVE message and remove the specified domain
+ * mappings. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ char *domain;
+ struct netlbl_audit audit_info;
+
+ if (!info->attrs[NLBL_MGMT_A_DOMAIN])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]);
+ return netlbl_domhsh_remove(domain, &audit_info);
+}
+
+/**
+ * netlbl_mgmt_listall_cb - netlbl_domhsh_walk() callback for LISTALL
+ * @entry: the domain mapping hash table entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * netlbl_domhsh_walk() function for use in generating a response for a LISTALL
+ * message. Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_domhsh_walk_arg *cb_arg = arg;
+ void *data;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+ cb_arg->seq, &netlbl_mgmt_gnl_family,
+ NLM_F_MULTI, NLBL_MGMT_C_LISTALL);
+ if (data == NULL)
+ goto listall_cb_failure;
+
+ ret_val = netlbl_mgmt_listentry(cb_arg->skb, entry);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+
+ cb_arg->seq++;
+ genlmsg_end(cb_arg->skb, data);
+ return 0;
+
+listall_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_mgmt_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated LISTALL message and dumps the domain hash table in
+ * a form suitable for use in a kernel generated LISTALL message. Returns zero
+ * on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_listall(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_domhsh_walk_arg cb_arg;
+ u32 skip_bkt = cb->args[0];
+ u32 skip_chain = cb->args[1];
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ netlbl_domhsh_walk(&skip_bkt,
+ &skip_chain,
+ netlbl_mgmt_listall_cb,
+ &cb_arg);
+
+ cb->args[0] = skip_bkt;
+ cb->args[1] = skip_chain;
+ return skb->len;
+}
+
+/**
+ * netlbl_mgmt_adddef - Handle an ADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated ADDDEF message and respond accordingly. Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlbl_audit audit_info;
+
+ if ((!info->attrs[NLBL_MGMT_A_PROTOCOL]) ||
+ (info->attrs[NLBL_MGMT_A_IPV4ADDR] &&
+ info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+ (info->attrs[NLBL_MGMT_A_IPV4MASK] &&
+ info->attrs[NLBL_MGMT_A_IPV6MASK]) ||
+ ((info->attrs[NLBL_MGMT_A_IPV4ADDR] != NULL) ^
+ (info->attrs[NLBL_MGMT_A_IPV4MASK] != NULL)) ||
+ ((info->attrs[NLBL_MGMT_A_IPV6ADDR] != NULL) ^
+ (info->attrs[NLBL_MGMT_A_IPV6MASK] != NULL)))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ return netlbl_mgmt_add_common(info, &audit_info);
+}
+
+/**
+ * netlbl_mgmt_removedef - Handle a REMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVEDEF message and remove the default domain
+ * mapping. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlbl_audit audit_info;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ return netlbl_domhsh_remove_default(&audit_info);
+}
+
+/**
+ * netlbl_mgmt_listdef - Handle a LISTDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LISTDEF message and dumps the default domain
+ * mapping in a form suitable for use in a kernel generated LISTDEF message.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -ENOMEM;
+ struct sk_buff *ans_skb = NULL;
+ void *data;
+ struct netlbl_dom_map *entry;
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (ans_skb == NULL)
+ return -ENOMEM;
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family,
+ 0, NLBL_MGMT_C_LISTDEF);
+ if (data == NULL)
+ goto listdef_failure;
+
+ rcu_read_lock();
+ entry = netlbl_domhsh_getentry(NULL);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto listdef_failure_lock;
+ }
+ ret_val = netlbl_mgmt_listentry(ans_skb, entry);
+ rcu_read_unlock();
+ if (ret_val != 0)
+ goto listdef_failure;
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+listdef_failure_lock:
+ rcu_read_unlock();
+listdef_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+/**
+ * netlbl_mgmt_protocols_cb - Write an individual PROTOCOL message response
+ * @skb: the skb to write to
+ * @cb: the NETLINK callback
+ * @protocol: the NetLabel protocol to use in the message
+ *
+ * Description:
+ * This function is to be used in conjunction with netlbl_mgmt_protocols() to
+ * answer a application's PROTOCOLS message. Returns the size of the message
+ * on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_protocols_cb(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u32 protocol)
+{
+ int ret_val = -ENOMEM;
+ void *data;
+
+ data = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &netlbl_mgmt_gnl_family, NLM_F_MULTI,
+ NLBL_MGMT_C_PROTOCOLS);
+ if (data == NULL)
+ goto protocols_cb_failure;
+
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, protocol);
+ if (ret_val != 0)
+ goto protocols_cb_failure;
+
+ genlmsg_end(skb, data);
+ return 0;
+
+protocols_cb_failure:
+ genlmsg_cancel(skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_mgmt_protocols - Handle a PROTOCOLS message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated PROTOCOLS message and respond accordingly.
+ *
+ */
+static int netlbl_mgmt_protocols(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ u32 protos_sent = cb->args[0];
+
+ if (protos_sent == 0) {
+ if (netlbl_mgmt_protocols_cb(skb,
+ cb,
+ NETLBL_NLTYPE_UNLABELED) < 0)
+ goto protocols_return;
+ protos_sent++;
+ }
+ if (protos_sent == 1) {
+ if (netlbl_mgmt_protocols_cb(skb,
+ cb,
+ NETLBL_NLTYPE_CIPSOV4) < 0)
+ goto protocols_return;
+ protos_sent++;
+ }
+
+protocols_return:
+ cb->args[0] = protos_sent;
+ return skb->len;
+}
+
+/**
+ * netlbl_mgmt_version - Handle a VERSION message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated VERSION message and respond accordingly. Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -ENOMEM;
+ struct sk_buff *ans_skb = NULL;
+ void *data;
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (ans_skb == NULL)
+ return -ENOMEM;
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_mgmt_gnl_family,
+ 0, NLBL_MGMT_C_VERSION);
+ if (data == NULL)
+ goto version_failure;
+
+ ret_val = nla_put_u32(ans_skb,
+ NLBL_MGMT_A_VERSION,
+ NETLBL_PROTO_VERSION);
+ if (ret_val != 0)
+ goto version_failure;
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+version_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+
+/*
+ * NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_mgmt_genl_ops[] = {
+ {
+ .cmd = NLBL_MGMT_C_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_add,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_MGMT_C_REMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_remove,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_MGMT_C_LISTALL,
+ .flags = 0,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_mgmt_listall,
+ },
+ {
+ .cmd = NLBL_MGMT_C_ADDDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_adddef,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_MGMT_C_REMOVEDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_removedef,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_MGMT_C_LISTDEF,
+ .flags = 0,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_listdef,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_MGMT_C_PROTOCOLS,
+ .flags = 0,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_mgmt_protocols,
+ },
+ {
+ .cmd = NLBL_MGMT_C_VERSION,
+ .flags = 0,
+ .policy = netlbl_mgmt_genl_policy,
+ .doit = netlbl_mgmt_version,
+ .dumpit = NULL,
+ },
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_mgmt_genl_init - Register the NetLabel management component
+ *
+ * Description:
+ * Register the NetLabel management component with the Generic NETLINK
+ * mechanism. Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_mgmt_genl_init(void)
+{
+ return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
+ netlbl_mgmt_genl_ops);
+}
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
new file mode 100644
index 000000000..8b6e1ab62
--- /dev/null
+++ b/net/netlabel/netlabel_mgmt.h
@@ -0,0 +1,222 @@
+/*
+ * NetLabel Management Support
+ *
+ * This file defines the management functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_MGMT_H
+#define _NETLABEL_MGMT_H
+
+#include <net/netlabel.h>
+#include <linux/atomic.h>
+
+/*
+ * The following NetLabel payloads are supported by the management interface.
+ *
+ * o ADD:
+ * Sent by an application to add a domain mapping to the NetLabel system.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_DOMAIN
+ * NLBL_MGMT_A_PROTOCOL
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_MGMT_A_IPV4ADDR
+ * NLBL_MGMT_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_MGMT_A_IPV6ADDR
+ * NLBL_MGMT_A_IPV6MASK
+ *
+ * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *
+ * NLBL_MGMT_A_CV4DOI
+ *
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *
+ * o REMOVE:
+ * Sent by an application to remove a domain mapping from the NetLabel
+ * system.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_DOMAIN
+ *
+ * o LISTALL:
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated LISTALL message. When sent by an
+ * application there is no payload and the NLM_F_DUMP flag should be set.
+ * The kernel should respond with a series of the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_DOMAIN
+ *
+ * If the IP address selectors are not used the following attribute is
+ * required:
+ *
+ * NLBL_MGMT_A_PROTOCOL
+ *
+ * If the IP address selectors are used then the following attritbute is
+ * required:
+ *
+ * NLBL_MGMT_A_SELECTORLIST
+ *
+ * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ * attributes are required:
+ *
+ * NLBL_MGMT_A_CV4DOI
+ *
+ * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ * attributes are required.
+ *
+ * o ADDDEF:
+ * Sent by an application to set the default domain mapping for the NetLabel
+ * system.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_PROTOCOL
+ *
+ * If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *
+ * NLBL_MGMT_A_CV4DOI
+ *
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *
+ * o REMOVEDEF:
+ * Sent by an application to remove the default domain mapping from the
+ * NetLabel system, there is no payload.
+ *
+ * o LISTDEF:
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated LISTDEF message. When sent by an
+ * application there is no payload. On success the kernel should send a
+ * response using the following format.
+ *
+ * If the IP address selectors are not used the following attribute is
+ * required:
+ *
+ * NLBL_MGMT_A_PROTOCOL
+ *
+ * If the IP address selectors are used then the following attritbute is
+ * required:
+ *
+ * NLBL_MGMT_A_SELECTORLIST
+ *
+ * If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ * attributes are required:
+ *
+ * NLBL_MGMT_A_CV4DOI
+ *
+ * If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ * attributes are required.
+ *
+ * o PROTOCOLS:
+ * Sent by an application to request a list of configured NetLabel protocols
+ * in the kernel. When sent by an application there is no payload and the
+ * NLM_F_DUMP flag should be set. The kernel should respond with a series of
+ * the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_PROTOCOL
+ *
+ * o VERSION:
+ * Sent by an application to request the NetLabel version. When sent by an
+ * application there is no payload. This message type is also used by the
+ * kernel to respond to an VERSION request.
+ *
+ * Required attributes:
+ *
+ * NLBL_MGMT_A_VERSION
+ *
+ */
+
+/* NetLabel Management commands */
+enum {
+ NLBL_MGMT_C_UNSPEC,
+ NLBL_MGMT_C_ADD,
+ NLBL_MGMT_C_REMOVE,
+ NLBL_MGMT_C_LISTALL,
+ NLBL_MGMT_C_ADDDEF,
+ NLBL_MGMT_C_REMOVEDEF,
+ NLBL_MGMT_C_LISTDEF,
+ NLBL_MGMT_C_PROTOCOLS,
+ NLBL_MGMT_C_VERSION,
+ __NLBL_MGMT_C_MAX,
+};
+
+/* NetLabel Management attributes */
+enum {
+ NLBL_MGMT_A_UNSPEC,
+ NLBL_MGMT_A_DOMAIN,
+ /* (NLA_NUL_STRING)
+ * the NULL terminated LSM domain string */
+ NLBL_MGMT_A_PROTOCOL,
+ /* (NLA_U32)
+ * the NetLabel protocol type (defined by NETLBL_NLTYPE_*) */
+ NLBL_MGMT_A_VERSION,
+ /* (NLA_U32)
+ * the NetLabel protocol version number (defined by
+ * NETLBL_PROTO_VERSION) */
+ NLBL_MGMT_A_CV4DOI,
+ /* (NLA_U32)
+ * the CIPSOv4 DOI value */
+ NLBL_MGMT_A_IPV6ADDR,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address */
+ NLBL_MGMT_A_IPV6MASK,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address mask */
+ NLBL_MGMT_A_IPV4ADDR,
+ /* (NLA_BINARY, struct in_addr)
+ * an IPv4 address */
+ NLBL_MGMT_A_IPV4MASK,
+ /* (NLA_BINARY, struct in_addr)
+ * and IPv4 address mask */
+ NLBL_MGMT_A_ADDRSELECTOR,
+ /* (NLA_NESTED)
+ * an IP address selector, must contain an address, mask, and protocol
+ * attribute plus any protocol specific attributes */
+ NLBL_MGMT_A_SELECTORLIST,
+ /* (NLA_NESTED)
+ * the selector list, there must be at least one
+ * NLBL_MGMT_A_ADDRSELECTOR attribute */
+ __NLBL_MGMT_A_MAX,
+};
+#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_mgmt_genl_init(void);
+
+/* NetLabel configured protocol reference counter */
+extern atomic_t netlabel_mgmt_protocount;
+
+#endif
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
new file mode 100644
index 000000000..b0380927f
--- /dev/null
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -0,0 +1,1548 @@
+/*
+ * NetLabel Unlabeled Support
+ *
+ * This file defines functions for dealing with unlabeled packets for the
+ * NetLabel system. The NetLabel system manages static and dynamic label
+ * mappings for network protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
+#include <net/netlabel.h>
+#include <asm/bug.h>
+#include <linux/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_addrlist.h"
+#include "netlabel_domainhash.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_mgmt.h"
+
+/* NOTE: at present we always use init's network namespace since we don't
+ * presently support different namespaces even though the majority of
+ * the functions in this file are "namespace safe" */
+
+/* The unlabeled connection hash table which we use to map network interfaces
+ * and addresses of unlabeled packets to a user specified secid value for the
+ * LSM. The hash table is used to lookup the network interface entry
+ * (struct netlbl_unlhsh_iface) and then the interface entry is used to
+ * lookup an IP address match from an ordered list. If a network interface
+ * match can not be found in the hash table then the default entry
+ * (netlbl_unlhsh_def) is used. The IP address entry list
+ * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
+ * larger netmask come first.
+ */
+struct netlbl_unlhsh_tbl {
+ struct list_head *tbl;
+ u32 size;
+};
+#define netlbl_unlhsh_addr4_entry(iter) \
+ container_of(iter, struct netlbl_unlhsh_addr4, list)
+struct netlbl_unlhsh_addr4 {
+ u32 secid;
+
+ struct netlbl_af4list list;
+ struct rcu_head rcu;
+};
+#define netlbl_unlhsh_addr6_entry(iter) \
+ container_of(iter, struct netlbl_unlhsh_addr6, list)
+struct netlbl_unlhsh_addr6 {
+ u32 secid;
+
+ struct netlbl_af6list list;
+ struct rcu_head rcu;
+};
+struct netlbl_unlhsh_iface {
+ int ifindex;
+ struct list_head addr4_list;
+ struct list_head addr6_list;
+
+ u32 valid;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+/* Argument struct for netlbl_unlhsh_walk() */
+struct netlbl_unlhsh_walk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* Unlabeled connection hash table */
+/* updates should be so rare that having one spinlock for the entire
+ * hash table should be okay */
+static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
+#define netlbl_unlhsh_rcu_deref(p) \
+ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
+
+/* Accept unlabeled packets flag */
+static u8 netlabel_unlabel_acceptflg = 0;
+
+/* NetLabel Generic NETLINK unlabeled family */
+static struct genl_family netlbl_unlabel_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_UNLABELED_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_UNLABEL_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
+ [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
+ [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
+ .len = sizeof(struct in6_addr) },
+ [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(struct in_addr) },
+ [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
+ .len = sizeof(struct in_addr) },
+ [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
+};
+
+/*
+ * Unlabeled Connection Hash Table Functions
+ */
+
+/**
+ * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely. It is important to note that this function does not free
+ * the IPv4 and IPv6 address lists contained as part of an interface entry. It
+ * is up to the rest of the code to make sure an interface entry is only freed
+ * once it's address lists are empty.
+ *
+ */
+static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
+{
+ struct netlbl_unlhsh_iface *iface;
+ struct netlbl_af4list *iter4;
+ struct netlbl_af4list *tmp4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
+
+ iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
+
+ /* no need for locks here since we are the only one with access to this
+ * structure */
+
+ netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
+ netlbl_af4list_remove_entry(iter4);
+ kfree(netlbl_unlhsh_addr4_entry(iter4));
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
+ netlbl_af6list_remove_entry(iter6);
+ kfree(netlbl_unlhsh_addr6_entry(iter6));
+ }
+#endif /* IPv6 */
+ kfree(iface);
+}
+
+/**
+ * netlbl_unlhsh_hash - Hashing function for the hash table
+ * @ifindex: the network interface/device to hash
+ *
+ * Description:
+ * This is the hashing function for the unlabeled hash table, it returns the
+ * bucket number for the given device/interface. The caller is responsible for
+ * ensuring that the hash table is protected with either a RCU read lock or
+ * the hash table lock.
+ *
+ */
+static u32 netlbl_unlhsh_hash(int ifindex)
+{
+ return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1);
+}
+
+/**
+ * netlbl_unlhsh_search_iface - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex, otherwise NULL is returned. The
+ * caller is responsible for ensuring that the hash table is protected with
+ * either a RCU read lock or the hash table lock.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
+{
+ u32 bkt;
+ struct list_head *bkt_list;
+ struct netlbl_unlhsh_iface *iter;
+
+ bkt = netlbl_unlhsh_hash(ifindex);
+ bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
+ list_for_each_entry_rcu(iter, bkt_list, list)
+ if (iter->valid && iter->ifindex == ifindex)
+ return iter;
+
+ return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv4 address in network byte order
+ * @mask: IPv4 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface. On success zero is returned, otherwise
+ * a negative value is returned.
+ *
+ */
+static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ u32 secid)
+{
+ int ret_val;
+ struct netlbl_unlhsh_addr4 *entry;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ entry->list.addr = addr->s_addr & mask->s_addr;
+ entry->list.mask = mask->s_addr;
+ entry->list.valid = 1;
+ entry->secid = secid;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ if (ret_val != 0)
+ kfree(entry);
+ return ret_val;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv6 address in network byte order
+ * @mask: IPv6 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface. On success zero is returned, otherwise
+ * a negative value is returned.
+ *
+ */
+static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ u32 secid)
+{
+ int ret_val;
+ struct netlbl_unlhsh_addr6 *entry;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ entry->list.addr = *addr;
+ entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+ entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+ entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+ entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+ entry->list.mask = *mask;
+ entry->list.valid = 1;
+ entry->secid = secid;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ if (ret_val != 0)
+ kfree(entry);
+ return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
+ * @ifindex: network interface
+ *
+ * Description:
+ * Add a new, empty, interface entry into the unlabeled connection hash table.
+ * On success a pointer to the new interface entry is returned, on failure NULL
+ * is returned.
+ *
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
+{
+ u32 bkt;
+ struct netlbl_unlhsh_iface *iface;
+
+ iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
+ if (iface == NULL)
+ return NULL;
+
+ iface->ifindex = ifindex;
+ INIT_LIST_HEAD(&iface->addr4_list);
+ INIT_LIST_HEAD(&iface->addr6_list);
+ iface->valid = 1;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ if (ifindex > 0) {
+ bkt = netlbl_unlhsh_hash(ifindex);
+ if (netlbl_unlhsh_search_iface(ifindex) != NULL)
+ goto add_iface_failure;
+ list_add_tail_rcu(&iface->list,
+ &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]);
+ } else {
+ INIT_LIST_HEAD(&iface->list);
+ if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL)
+ goto add_iface_failure;
+ rcu_assign_pointer(netlbl_unlhsh_def, iface);
+ }
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ return iface;
+
+add_iface_failure:
+ spin_unlock(&netlbl_unlhsh_lock);
+ kfree(iface);
+ return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the unlabeled connection hash table. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int netlbl_unlhsh_add(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ u32 secid,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ int ifindex;
+ struct net_device *dev;
+ struct netlbl_unlhsh_iface *iface;
+ struct audit_buffer *audit_buf = NULL;
+ char *secctx = NULL;
+ u32 secctx_len;
+
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != sizeof(struct in6_addr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (dev_name != NULL) {
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (dev == NULL) {
+ ret_val = -ENODEV;
+ goto unlhsh_add_return;
+ }
+ ifindex = dev->ifindex;
+ iface = netlbl_unlhsh_search_iface(ifindex);
+ } else {
+ ifindex = 0;
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ }
+ if (iface == NULL) {
+ iface = netlbl_unlhsh_add_iface(ifindex);
+ if (iface == NULL) {
+ ret_val = -ENOMEM;
+ goto unlhsh_add_return;
+ }
+ }
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
+ audit_info);
+ switch (addr_len) {
+ case sizeof(struct in_addr): {
+ const struct in_addr *addr4 = addr;
+ const struct in_addr *mask4 = mask;
+
+ ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
+ if (audit_buf != NULL)
+ netlbl_af4list_audit_addr(audit_buf, 1,
+ dev_name,
+ addr4->s_addr,
+ mask4->s_addr);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case sizeof(struct in6_addr): {
+ const struct in6_addr *addr6 = addr;
+ const struct in6_addr *mask6 = mask;
+
+ ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
+ if (audit_buf != NULL)
+ netlbl_af6list_audit_addr(audit_buf, 1,
+ dev_name,
+ addr6, mask6);
+ break;
+ }
+#endif /* IPv6 */
+ default:
+ ret_val = -EINVAL;
+ }
+ if (ret_val == 0)
+ atomic_inc(&netlabel_mgmt_protocount);
+
+unlhsh_add_return:
+ rcu_read_unlock();
+ if (audit_buf != NULL) {
+ if (security_secid_to_secctx(secid,
+ &secctx,
+ &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+ return ret_val;
+}
+
+/**
+ * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_remove_addr4(struct net *net,
+ struct netlbl_unlhsh_iface *iface,
+ const struct in_addr *addr,
+ const struct in_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ struct netlbl_af4list *list_entry;
+ struct netlbl_unlhsh_addr4 *entry;
+ struct audit_buffer *audit_buf;
+ struct net_device *dev;
+ char *secctx;
+ u32 secctx_len;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
+ &iface->addr4_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ if (list_entry != NULL)
+ entry = netlbl_unlhsh_addr4_entry(list_entry);
+ else
+ entry = NULL;
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+ audit_info);
+ if (audit_buf != NULL) {
+ dev = dev_get_by_index(net, iface->ifindex);
+ netlbl_af4list_audit_addr(audit_buf, 1,
+ (dev != NULL ? dev->name : NULL),
+ addr->s_addr, mask->s_addr);
+ if (dev != NULL)
+ dev_put(dev);
+ if (entry != NULL &&
+ security_secid_to_secctx(entry->secid,
+ &secctx, &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ if (entry == NULL)
+ return -ENOENT;
+
+ kfree_rcu(entry, rcu);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlhsh_remove_addr6(struct net *net,
+ struct netlbl_unlhsh_iface *iface,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ struct netlbl_af6list *list_entry;
+ struct netlbl_unlhsh_addr6 *entry;
+ struct audit_buffer *audit_buf;
+ struct net_device *dev;
+ char *secctx;
+ u32 secctx_len;
+
+ spin_lock(&netlbl_unlhsh_lock);
+ list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
+ spin_unlock(&netlbl_unlhsh_lock);
+ if (list_entry != NULL)
+ entry = netlbl_unlhsh_addr6_entry(list_entry);
+ else
+ entry = NULL;
+
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+ audit_info);
+ if (audit_buf != NULL) {
+ dev = dev_get_by_index(net, iface->ifindex);
+ netlbl_af6list_audit_addr(audit_buf, 1,
+ (dev != NULL ? dev->name : NULL),
+ addr, mask);
+ if (dev != NULL)
+ dev_put(dev);
+ if (entry != NULL &&
+ security_secid_to_secctx(entry->secid,
+ &secctx, &secctx_len) == 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+ audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ if (entry == NULL)
+ return -ENOENT;
+
+ kfree_rcu(entry, rcu);
+ return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_condremove_iface - Remove an interface entry
+ * @iface: the interface entry
+ *
+ * Description:
+ * Remove an interface entry from the unlabeled connection hash table if it is
+ * empty. An interface entry is considered to be empty if there are no
+ * address entries assigned to it.
+ *
+ */
+static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
+{
+ struct netlbl_af4list *iter4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+#endif /* IPv6 */
+
+ spin_lock(&netlbl_unlhsh_lock);
+ netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
+ goto unlhsh_condremove_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
+ goto unlhsh_condremove_failure;
+#endif /* IPv6 */
+ iface->valid = 0;
+ if (iface->ifindex > 0)
+ list_del_rcu(&iface->list);
+ else
+ RCU_INIT_POINTER(netlbl_unlhsh_def, NULL);
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+ return;
+
+unlhsh_condremove_failure:
+ spin_unlock(&netlbl_unlhsh_lock);
+}
+
+/**
+ * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes and existing entry from the unlabeled connection hash table.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_unlhsh_remove(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct net_device *dev;
+ struct netlbl_unlhsh_iface *iface;
+
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != sizeof(struct in6_addr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (dev_name != NULL) {
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (dev == NULL) {
+ ret_val = -ENODEV;
+ goto unlhsh_remove_return;
+ }
+ iface = netlbl_unlhsh_search_iface(dev->ifindex);
+ } else
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ if (iface == NULL) {
+ ret_val = -ENOENT;
+ goto unlhsh_remove_return;
+ }
+ switch (addr_len) {
+ case sizeof(struct in_addr):
+ ret_val = netlbl_unlhsh_remove_addr4(net,
+ iface, addr, mask,
+ audit_info);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case sizeof(struct in6_addr):
+ ret_val = netlbl_unlhsh_remove_addr6(net,
+ iface, addr, mask,
+ audit_info);
+ break;
+#endif /* IPv6 */
+ default:
+ ret_val = -EINVAL;
+ }
+ if (ret_val == 0) {
+ netlbl_unlhsh_condremove_iface(iface);
+ atomic_dec(&netlabel_mgmt_protocount);
+ }
+
+unlhsh_remove_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/*
+ * General Helper Functions
+ */
+
+/**
+ * netlbl_unlhsh_netdev_handler - Network device notification handler
+ * @this: notifier block
+ * @event: the event
+ * @ptr: the netdevice notifier info (cast to void)
+ *
+ * Description:
+ * Handle network device events, although at present all we care about is a
+ * network device going away. In the case of a device going away we clear any
+ * related entries from the unlabeled connection hash table.
+ *
+ */
+static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netlbl_unlhsh_iface *iface = NULL;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
+ if (event == NETDEV_DOWN) {
+ spin_lock(&netlbl_unlhsh_lock);
+ iface = netlbl_unlhsh_search_iface(dev->ifindex);
+ if (iface != NULL && iface->valid) {
+ iface->valid = 0;
+ list_del_rcu(&iface->list);
+ } else
+ iface = NULL;
+ spin_unlock(&netlbl_unlhsh_lock);
+ }
+
+ if (iface != NULL)
+ call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+
+ return NOTIFY_DONE;
+}
+
+/**
+ * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
+ * @value: desired value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Set the value of the unlabeled accept flag to @value.
+ *
+ */
+static void netlbl_unlabel_acceptflg_set(u8 value,
+ struct netlbl_audit *audit_info)
+{
+ struct audit_buffer *audit_buf;
+ u8 old_val;
+
+ old_val = netlabel_unlabel_acceptflg;
+ netlabel_unlabel_acceptflg = value;
+ audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
+ audit_info);
+ if (audit_buf != NULL) {
+ audit_log_format(audit_buf,
+ " unlbl_accept=%u old=%u", value, old_val);
+ audit_log_end(audit_buf);
+ }
+}
+
+/**
+ * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
+ * @info: the Generic NETLINK info block
+ * @addr: the IP address
+ * @mask: the IP address mask
+ * @len: the address length
+ *
+ * Description:
+ * Examine the Generic NETLINK message and extract the IP address information.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
+ void **addr,
+ void **mask,
+ u32 *len)
+{
+ u32 addr_len;
+
+ if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+ addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+ if (addr_len != sizeof(struct in_addr) &&
+ addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
+ return -EINVAL;
+ *len = addr_len;
+ *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
+ *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
+ return 0;
+ } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) {
+ addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+ if (addr_len != sizeof(struct in6_addr) &&
+ addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
+ return -EINVAL;
+ *len = addr_len;
+ *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
+ *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_unlabel_accept - Handle an ACCEPT message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated ACCEPT message and set the accept flag accordingly.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
+{
+ u8 value;
+ struct netlbl_audit audit_info;
+
+ if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) {
+ value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
+ if (value == 1 || value == 0) {
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ netlbl_unlabel_acceptflg_set(value, &audit_info);
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * netlbl_unlabel_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond with the current status.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -EINVAL;
+ struct sk_buff *ans_skb;
+ void *data;
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (ans_skb == NULL)
+ goto list_failure;
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family,
+ 0, NLBL_UNLABEL_C_LIST);
+ if (data == NULL) {
+ ret_val = -ENOMEM;
+ goto list_failure;
+ }
+
+ ret_val = nla_put_u8(ans_skb,
+ NLBL_UNLABEL_A_ACPTFLG,
+ netlabel_unlabel_acceptflg);
+ if (ret_val != 0)
+ goto list_failure;
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+list_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticadd - Handle a STATICADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADD message and add a new unlabeled
+ * connection entry to the hash table. Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticadd(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ char *dev_name;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ u32 secid;
+ struct netlbl_audit audit_info;
+
+ /* Don't allow users to add both IPv4 and IPv6 addresses for a
+ * single entry. However, allow users to create two entries, one each
+ * for IPv4 and IPv4, with the same LSM security context which should
+ * achieve the same result. */
+ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+ !info->attrs[NLBL_UNLABEL_A_IFACE] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+ ret_val = security_secctx_to_secid(
+ nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ &secid);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_add(&init_net,
+ dev_name, addr, mask, addr_len, secid,
+ &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADDDEF message and add a new default
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ u32 secid;
+ struct netlbl_audit audit_info;
+
+ /* Don't allow users to add both IPv4 and IPv6 addresses for a
+ * single entry. However, allow users to create two entries, one each
+ * for IPv4 and IPv6, with the same LSM security context which should
+ * achieve the same result. */
+ if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = security_secctx_to_secid(
+ nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+ &secid);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_add(&init_net,
+ NULL, addr, mask, addr_len, secid,
+ &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVE message and remove the specified
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremove(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ char *dev_name;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ struct netlbl_audit audit_info;
+
+ /* See the note in netlbl_unlabel_staticadd() about not allowing both
+ * IPv4 and IPv6 in the same entry. */
+ if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
+ !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+ dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+
+ return netlbl_unlhsh_remove(&init_net,
+ dev_name, addr, mask, addr_len,
+ &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVEDEF message and remove the default
+ * unlabeled connection entry. Returns zero on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ int ret_val;
+ void *addr;
+ void *mask;
+ u32 addr_len;
+ struct netlbl_audit audit_info;
+
+ /* See the note in netlbl_unlabel_staticadd() about not allowing both
+ * IPv4 and IPv6 in the same entry. */
+ if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+ (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+ !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+
+ ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlhsh_remove(&init_net,
+ NULL, addr, mask, addr_len,
+ &audit_info);
+}
+
+
+/**
+ * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
+ * @cmd: command/message
+ * @iface: the interface entry
+ * @addr4: the IPv4 address entry
+ * @addr6: the IPv6 address entry
+ * @arg: the netlbl_unlhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used to generate a response for a
+ * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6
+ * can be specified, not both, the other unspecified entry should be set to
+ * NULL by the caller. Returns the size of the message on success, negative
+ * values on failure.
+ *
+ */
+static int netlbl_unlabel_staticlist_gen(u32 cmd,
+ const struct netlbl_unlhsh_iface *iface,
+ const struct netlbl_unlhsh_addr4 *addr4,
+ const struct netlbl_unlhsh_addr6 *addr6,
+ void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_unlhsh_walk_arg *cb_arg = arg;
+ struct net_device *dev;
+ void *data;
+ u32 secid;
+ char *secctx;
+ u32 secctx_len;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+ cb_arg->seq, &netlbl_unlabel_gnl_family,
+ NLM_F_MULTI, cmd);
+ if (data == NULL)
+ goto list_cb_failure;
+
+ if (iface->ifindex > 0) {
+ dev = dev_get_by_index(&init_net, iface->ifindex);
+ if (!dev) {
+ ret_val = -ENODEV;
+ goto list_cb_failure;
+ }
+ ret_val = nla_put_string(cb_arg->skb,
+ NLBL_UNLABEL_A_IFACE, dev->name);
+ dev_put(dev);
+ if (ret_val != 0)
+ goto list_cb_failure;
+ }
+
+ if (addr4) {
+ struct in_addr addr_struct;
+
+ addr_struct.s_addr = addr4->list.addr;
+ ret_val = nla_put_in_addr(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV4ADDR,
+ addr_struct.s_addr);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ addr_struct.s_addr = addr4->list.mask;
+ ret_val = nla_put_in_addr(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV4MASK,
+ addr_struct.s_addr);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ secid = addr4->secid;
+ } else {
+ ret_val = nla_put_in6_addr(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV6ADDR,
+ &addr6->list.addr);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ ret_val = nla_put_in6_addr(cb_arg->skb,
+ NLBL_UNLABEL_A_IPV6MASK,
+ &addr6->list.mask);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ secid = addr6->secid;
+ }
+
+ ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
+ if (ret_val != 0)
+ goto list_cb_failure;
+ ret_val = nla_put(cb_arg->skb,
+ NLBL_UNLABEL_A_SECCTX,
+ secctx_len,
+ secctx);
+ security_release_secctx(secctx, secctx_len);
+ if (ret_val != 0)
+ goto list_cb_failure;
+
+ cb_arg->seq++;
+ genlmsg_end(cb_arg->skb, data);
+ return 0;
+
+list_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticlist - Handle a STATICLIST message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLIST message and dump the unlabeled
+ * connection hash table in a form suitable for use in a kernel generated
+ * STATICLIST message. Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlist(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_unlhsh_walk_arg cb_arg;
+ u32 skip_bkt = cb->args[0];
+ u32 skip_chain = cb->args[1];
+ u32 iter_bkt;
+ u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+ struct netlbl_unlhsh_iface *iface;
+ struct list_head *iter_list;
+ struct netlbl_af4list *addr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *addr6;
+#endif
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ for (iter_bkt = skip_bkt;
+ iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
+ iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) {
+ iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
+ list_for_each_entry_rcu(iface, iter_list, list) {
+ if (!iface->valid ||
+ iter_chain++ < skip_chain)
+ continue;
+ netlbl_af4list_foreach_rcu(addr4,
+ &iface->addr4_list) {
+ if (iter_addr4++ < cb->args[2])
+ continue;
+ if (netlbl_unlabel_staticlist_gen(
+ NLBL_UNLABEL_C_STATICLIST,
+ iface,
+ netlbl_unlhsh_addr4_entry(addr4),
+ NULL,
+ &cb_arg) < 0) {
+ iter_addr4--;
+ iter_chain--;
+ goto unlabel_staticlist_return;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(addr6,
+ &iface->addr6_list) {
+ if (iter_addr6++ < cb->args[3])
+ continue;
+ if (netlbl_unlabel_staticlist_gen(
+ NLBL_UNLABEL_C_STATICLIST,
+ iface,
+ NULL,
+ netlbl_unlhsh_addr6_entry(addr6),
+ &cb_arg) < 0) {
+ iter_addr6--;
+ iter_chain--;
+ goto unlabel_staticlist_return;
+ }
+ }
+#endif /* IPv6 */
+ }
+ }
+
+unlabel_staticlist_return:
+ rcu_read_unlock();
+ cb->args[0] = iter_bkt;
+ cb->args[1] = iter_chain;
+ cb->args[2] = iter_addr4;
+ cb->args[3] = iter_addr6;
+ return skb->len;
+}
+
+/**
+ * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated STATICLISTDEF message and dump the default
+ * unlabeled connection entry in a form suitable for use in a kernel generated
+ * STATICLISTDEF message. Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_unlhsh_walk_arg cb_arg;
+ struct netlbl_unlhsh_iface *iface;
+ u32 iter_addr4 = 0, iter_addr6 = 0;
+ struct netlbl_af4list *addr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *addr6;
+#endif
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ if (iface == NULL || !iface->valid)
+ goto unlabel_staticlistdef_return;
+
+ netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
+ if (iter_addr4++ < cb->args[0])
+ continue;
+ if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+ iface,
+ netlbl_unlhsh_addr4_entry(addr4),
+ NULL,
+ &cb_arg) < 0) {
+ iter_addr4--;
+ goto unlabel_staticlistdef_return;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
+ if (iter_addr6++ < cb->args[1])
+ continue;
+ if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+ iface,
+ NULL,
+ netlbl_unlhsh_addr6_entry(addr6),
+ &cb_arg) < 0) {
+ iter_addr6--;
+ goto unlabel_staticlistdef_return;
+ }
+ }
+#endif /* IPv6 */
+
+unlabel_staticlistdef_return:
+ rcu_read_unlock();
+ cb->args[0] = iter_addr4;
+ cb->args[1] = iter_addr6;
+ return skb->len;
+}
+
+/*
+ * NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_unlabel_genl_ops[] = {
+ {
+ .cmd = NLBL_UNLABEL_C_STATICADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticadd,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_STATICREMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticremove,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_STATICLIST,
+ .flags = 0,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_unlabel_staticlist,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_STATICADDDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticadddef,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_staticremovedef,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
+ .flags = 0,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_unlabel_staticlistdef,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_ACCEPT,
+ .flags = GENL_ADMIN_PERM,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_accept,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_UNLABEL_C_LIST,
+ .flags = 0,
+ .policy = netlbl_unlabel_genl_policy,
+ .doit = netlbl_unlabel_list,
+ .dumpit = NULL,
+ },
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component
+ *
+ * Description:
+ * Register the unlabeled packet NetLabel component with the Generic NETLINK
+ * mechanism. Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_unlabel_genl_init(void)
+{
+ return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
+ netlbl_unlabel_genl_ops);
+}
+
+/*
+ * NetLabel KAPI Hooks
+ */
+
+static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+ .notifier_call = netlbl_unlhsh_netdev_handler,
+};
+
+/**
+ * netlbl_unlabel_init - Initialize the unlabeled connection hash table
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the unlabeled connection hash table and registers a network
+ * device notification handler. This function should only be called by the
+ * NetLabel subsystem itself during initialization. Returns zero on success,
+ * non-zero values on error.
+ *
+ */
+int __init netlbl_unlabel_init(u32 size)
+{
+ u32 iter;
+ struct netlbl_unlhsh_tbl *hsh_tbl;
+
+ if (size == 0)
+ return -EINVAL;
+
+ hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+ if (hsh_tbl == NULL)
+ return -ENOMEM;
+ hsh_tbl->size = 1 << size;
+ hsh_tbl->tbl = kcalloc(hsh_tbl->size,
+ sizeof(struct list_head),
+ GFP_KERNEL);
+ if (hsh_tbl->tbl == NULL) {
+ kfree(hsh_tbl);
+ return -ENOMEM;
+ }
+ for (iter = 0; iter < hsh_tbl->size; iter++)
+ INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+ spin_lock(&netlbl_unlhsh_lock);
+ rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
+ spin_unlock(&netlbl_unlhsh_lock);
+
+ register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
+
+ return 0;
+}
+
+/**
+ * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Determine the security attributes, if any, for an unlabled packet and return
+ * them in @secattr. Returns zero on success and negative values on failure.
+ *
+ */
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+ u16 family,
+ struct netlbl_lsm_secattr *secattr)
+{
+ struct netlbl_unlhsh_iface *iface;
+
+ rcu_read_lock();
+ iface = netlbl_unlhsh_search_iface(skb->skb_iif);
+ if (iface == NULL)
+ iface = rcu_dereference(netlbl_unlhsh_def);
+ if (iface == NULL || !iface->valid)
+ goto unlabel_getattr_nolabel;
+ switch (family) {
+ case PF_INET: {
+ struct iphdr *hdr4;
+ struct netlbl_af4list *addr4;
+
+ hdr4 = ip_hdr(skb);
+ addr4 = netlbl_af4list_search(hdr4->saddr,
+ &iface->addr4_list);
+ if (addr4 == NULL)
+ goto unlabel_getattr_nolabel;
+ secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid;
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6: {
+ struct ipv6hdr *hdr6;
+ struct netlbl_af6list *addr6;
+
+ hdr6 = ipv6_hdr(skb);
+ addr6 = netlbl_af6list_search(&hdr6->saddr,
+ &iface->addr6_list);
+ if (addr6 == NULL)
+ goto unlabel_getattr_nolabel;
+ secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid;
+ break;
+ }
+#endif /* IPv6 */
+ default:
+ goto unlabel_getattr_nolabel;
+ }
+ rcu_read_unlock();
+
+ secattr->flags |= NETLBL_SECATTR_SECID;
+ secattr->type = NETLBL_NLTYPE_UNLABELED;
+ return 0;
+
+unlabel_getattr_nolabel:
+ rcu_read_unlock();
+ if (netlabel_unlabel_acceptflg == 0)
+ return -ENOMSG;
+ secattr->type = NETLBL_NLTYPE_UNLABELED;
+ return 0;
+}
+
+/**
+ * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets
+ *
+ * Description:
+ * Set the default NetLabel configuration to allow incoming unlabeled packets
+ * and to send unlabeled network traffic by default.
+ *
+ */
+int __init netlbl_unlabel_defconf(void)
+{
+ int ret_val;
+ struct netlbl_dom_map *entry;
+ struct netlbl_audit audit_info;
+
+ /* Only the kernel is allowed to call this function and the only time
+ * it is called is at bootup before the audit subsystem is reporting
+ * messages so don't worry to much about these values. */
+ security_task_getsecid(current, &audit_info.secid);
+ audit_info.loginuid = GLOBAL_ROOT_UID;
+ audit_info.sessionid = 0;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (entry == NULL)
+ return -ENOMEM;
+ entry->def.type = NETLBL_NLTYPE_UNLABELED;
+ ret_val = netlbl_domhsh_add_default(entry, &audit_info);
+ if (ret_val != 0)
+ return ret_val;
+
+ netlbl_unlabel_acceptflg_set(1, &audit_info);
+
+ return 0;
+}
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
new file mode 100644
index 000000000..3a9e5dc95
--- /dev/null
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -0,0 +1,245 @@
+/*
+ * NetLabel Unlabeled Support
+ *
+ * This file defines functions for dealing with unlabeled packets for the
+ * NetLabel system. The NetLabel system manages static and dynamic label
+ * mappings for network protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_UNLABELED_H
+#define _NETLABEL_UNLABELED_H
+
+#include <net/netlabel.h>
+
+/*
+ * The following NetLabel payloads are supported by the Unlabeled subsystem.
+ *
+ * o STATICADD
+ * This message is sent from an application to add a new static label for
+ * incoming unlabeled connections.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVE
+ * This message is sent from an application to remove an existing static
+ * label for incoming unlabeled connections.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLIST
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated STATICLIST message. When sent by an
+ * application there is no payload and the NLM_F_DUMP flag should be set.
+ * The kernel should response with a series of the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_IFACE
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICADDDEF
+ * This message is sent from an application to set the default static
+ * label for incoming unlabeled connections.
+ *
+ * Required attribute:
+ *
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVEDEF
+ * This message is sent from an application to remove the existing default
+ * static label for incoming unlabeled connections.
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLISTDEF
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated STATICLISTDEF message. When sent by
+ * an application there is no payload and the NLM_F_DUMP flag should be set.
+ * The kernel should response with the following message.
+ *
+ * Required attribute:
+ *
+ * NLBL_UNLABEL_A_SECCTX
+ *
+ * If IPv4 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV4ADDR
+ * NLBL_UNLABEL_A_IPV4MASK
+ *
+ * If IPv6 is specified the following attributes are required:
+ *
+ * NLBL_UNLABEL_A_IPV6ADDR
+ * NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o ACCEPT
+ * This message is sent from an application to specify if the kernel should
+ * allow unlabled packets to pass if they do not match any of the static
+ * mappings defined in the unlabeled module.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_ACPTFLG
+ *
+ * o LIST
+ * This message can be sent either from an application or by the kernel in
+ * response to an application generated LIST message. When sent by an
+ * application there is no payload. The kernel should respond to a LIST
+ * message with a LIST message on success.
+ *
+ * Required attributes:
+ *
+ * NLBL_UNLABEL_A_ACPTFLG
+ *
+ */
+
+/* NetLabel Unlabeled commands */
+enum {
+ NLBL_UNLABEL_C_UNSPEC,
+ NLBL_UNLABEL_C_ACCEPT,
+ NLBL_UNLABEL_C_LIST,
+ NLBL_UNLABEL_C_STATICADD,
+ NLBL_UNLABEL_C_STATICREMOVE,
+ NLBL_UNLABEL_C_STATICLIST,
+ NLBL_UNLABEL_C_STATICADDDEF,
+ NLBL_UNLABEL_C_STATICREMOVEDEF,
+ NLBL_UNLABEL_C_STATICLISTDEF,
+ __NLBL_UNLABEL_C_MAX,
+};
+
+/* NetLabel Unlabeled attributes */
+enum {
+ NLBL_UNLABEL_A_UNSPEC,
+ NLBL_UNLABEL_A_ACPTFLG,
+ /* (NLA_U8)
+ * if true then unlabeled packets are allowed to pass, else unlabeled
+ * packets are rejected */
+ NLBL_UNLABEL_A_IPV6ADDR,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address */
+ NLBL_UNLABEL_A_IPV6MASK,
+ /* (NLA_BINARY, struct in6_addr)
+ * an IPv6 address mask */
+ NLBL_UNLABEL_A_IPV4ADDR,
+ /* (NLA_BINARY, struct in_addr)
+ * an IPv4 address */
+ NLBL_UNLABEL_A_IPV4MASK,
+ /* (NLA_BINARY, struct in_addr)
+ * and IPv4 address mask */
+ NLBL_UNLABEL_A_IFACE,
+ /* (NLA_NULL_STRING)
+ * network interface */
+ NLBL_UNLABEL_A_SECCTX,
+ /* (NLA_BINARY)
+ * a LSM specific security context */
+ __NLBL_UNLABEL_A_MAX,
+};
+#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_unlabel_genl_init(void);
+
+/* Unlabeled connection hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_UNLHSH_BITSIZE 7
+
+/* General Unlabeled init function */
+int netlbl_unlabel_init(u32 size);
+
+/* Static/Fallback label management functions */
+int netlbl_unlhsh_add(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ u32 secid,
+ struct netlbl_audit *audit_info);
+int netlbl_unlhsh_remove(struct net *net,
+ const char *dev_name,
+ const void *addr,
+ const void *mask,
+ u32 addr_len,
+ struct netlbl_audit *audit_info);
+
+/* Process Unlabeled incoming network packets */
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+ u16 family,
+ struct netlbl_lsm_secattr *secattr);
+
+/* Set the default configuration to allow Unlabeled packets */
+int netlbl_unlabel_defconf(void);
+
+#endif
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
new file mode 100644
index 000000000..adf8b7900
--- /dev/null
+++ b/net/netlabel/netlabel_user.c
@@ -0,0 +1,119 @@
+/*
+ * NetLabel NETLINK Interface
+ *
+ * This file defines the NETLINK interface for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/audit.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <asm/bug.h>
+
+#include "netlabel_mgmt.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_user.h"
+
+/*
+ * NetLabel NETLINK Setup Functions
+ */
+
+/**
+ * netlbl_netlink_init - Initialize the NETLINK communication channel
+ *
+ * Description:
+ * Call out to the NetLabel components so they can register their families and
+ * commands with the Generic NETLINK mechanism. Returns zero on success and
+ * non-zero on failure.
+ *
+ */
+int __init netlbl_netlink_init(void)
+{
+ int ret_val;
+
+ ret_val = netlbl_mgmt_genl_init();
+ if (ret_val != 0)
+ return ret_val;
+
+ ret_val = netlbl_cipsov4_genl_init();
+ if (ret_val != 0)
+ return ret_val;
+
+ return netlbl_unlabel_genl_init();
+}
+
+/*
+ * NetLabel Audit Functions
+ */
+
+/**
+ * netlbl_audit_start_common - Start an audit message
+ * @type: audit message type
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Start an audit message using the type specified in @type and fill the audit
+ * message with some fields common to all NetLabel audit messages. Returns
+ * a pointer to the audit buffer on success, NULL on failure.
+ *
+ */
+struct audit_buffer *netlbl_audit_start_common(int type,
+ struct netlbl_audit *audit_info)
+{
+ struct audit_buffer *audit_buf;
+ char *secctx;
+ u32 secctx_len;
+
+ if (audit_enabled == 0)
+ return NULL;
+
+ audit_buf = audit_log_start(current->audit_context, GFP_ATOMIC, type);
+ if (audit_buf == NULL)
+ return NULL;
+
+ audit_log_format(audit_buf, "netlabel: auid=%u ses=%u",
+ from_kuid(&init_user_ns, audit_info->loginuid),
+ audit_info->sessionid);
+
+ if (audit_info->secid != 0 &&
+ security_secid_to_secctx(audit_info->secid,
+ &secctx,
+ &secctx_len) == 0) {
+ audit_log_format(audit_buf, " subj=%s", secctx);
+ security_release_secctx(secctx, secctx_len);
+ }
+
+ return audit_buf;
+}
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
new file mode 100644
index 000000000..4a397cde1
--- /dev/null
+++ b/net/netlabel/netlabel_user.h
@@ -0,0 +1,65 @@
+/*
+ * NetLabel NETLINK Interface
+ *
+ * This file defines the NETLINK interface for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul@paul-moore.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_USER_H
+#define _NETLABEL_USER_H
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/capability.h>
+#include <linux/audit.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+
+/* NetLabel NETLINK helper functions */
+
+/**
+ * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg
+ * @skb: the packet
+ * @audit_info: NetLabel audit information
+ */
+static inline void netlbl_netlink_auditinfo(struct sk_buff *skb,
+ struct netlbl_audit *audit_info)
+{
+ security_task_getsecid(current, &audit_info->secid);
+ audit_info->loginuid = audit_get_loginuid(current);
+ audit_info->sessionid = audit_get_sessionid(current);
+}
+
+/* NetLabel NETLINK I/O functions */
+
+int netlbl_netlink_init(void);
+
+/* NetLabel Audit Functions */
+
+struct audit_buffer *netlbl_audit_start_common(int type,
+ struct netlbl_audit *audit_info);
+
+#endif
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
new file mode 100644
index 000000000..2c5e95e9b
--- /dev/null
+++ b/net/netlink/Kconfig
@@ -0,0 +1,19 @@
+#
+# Netlink Sockets
+#
+
+config NETLINK_MMAP
+ bool "NETLINK: mmaped IO"
+ ---help---
+ This option enables support for memory mapped netlink IO. This
+ reduces overhead by avoiding copying data between kernel- and
+ userspace.
+
+ If unsure, say N.
+
+config NETLINK_DIAG
+ tristate "NETLINK: socket monitoring interface"
+ default n
+ ---help---
+ Support for NETLINK socket monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/netlink/Makefile b/net/netlink/Makefile
new file mode 100644
index 000000000..e837917f6
--- /dev/null
+++ b/net/netlink/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the netlink driver.
+#
+
+obj-y := af_netlink.o genetlink.o
+
+obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o
+netlink_diag-y := diag.o
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
new file mode 100644
index 000000000..bf6e76643
--- /dev/null
+++ b/net/netlink/af_netlink.c
@@ -0,0 +1,3184 @@
+/*
+ * NETLINK Kernel-user communication protocol.
+ *
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
+ * added netlink_proto_exit
+ * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
+ * use nlk_sk, as sk->protinfo is on a diet 8)
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ * - inc module use count of module that owns
+ * the kernel socket in case userspace opens
+ * socket of same protocol
+ * - remove all module support, since netlink is
+ * mandatory if CONFIG_NET=y these days
+ */
+
+#include <linux/module.h>
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/security.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/if_arp.h>
+#include <linux/rhashtable.h>
+#include <asm/cacheflush.h>
+#include <linux/hash.h>
+#include <linux/genetlink.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/netlink.h>
+
+#include "af_netlink.h"
+
+struct listeners {
+ struct rcu_head rcu;
+ unsigned long masks[0];
+};
+
+/* state bits */
+#define NETLINK_CONGESTED 0x0
+
+/* flags */
+#define NETLINK_KERNEL_SOCKET 0x1
+#define NETLINK_RECV_PKTINFO 0x2
+#define NETLINK_BROADCAST_SEND_ERROR 0x4
+#define NETLINK_RECV_NO_ENOBUFS 0x8
+
+static inline int netlink_is_kernel(struct sock *sk)
+{
+ return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
+}
+
+struct netlink_table *nl_table __read_mostly;
+EXPORT_SYMBOL_GPL(nl_table);
+
+static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
+
+static int netlink_dump(struct sock *sk);
+static void netlink_skb_destructor(struct sk_buff *skb);
+
+/* nl_table locking explained:
+ * Lookup and traversal are protected with an RCU read-side lock. Insertion
+ * and removal are protected with per bucket lock while using RCU list
+ * modification primitives and may run in parallel to RCU protected lookups.
+ * Destruction of the Netlink socket may only occur *after* nl_table_lock has
+ * been acquired * either during or after the socket has been removed from
+ * the list and after an RCU grace period.
+ */
+DEFINE_RWLOCK(nl_table_lock);
+EXPORT_SYMBOL_GPL(nl_table_lock);
+static atomic_t nl_table_users = ATOMIC_INIT(0);
+
+#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
+
+static ATOMIC_NOTIFIER_HEAD(netlink_chain);
+
+static DEFINE_SPINLOCK(netlink_tap_lock);
+static struct list_head netlink_tap_all __read_mostly;
+
+static const struct rhashtable_params netlink_rhashtable_params;
+
+static inline u32 netlink_group_mask(u32 group)
+{
+ return group ? 1 << (group - 1) : 0;
+}
+
+int netlink_add_tap(struct netlink_tap *nt)
+{
+ if (unlikely(nt->dev->type != ARPHRD_NETLINK))
+ return -EINVAL;
+
+ spin_lock(&netlink_tap_lock);
+ list_add_rcu(&nt->list, &netlink_tap_all);
+ spin_unlock(&netlink_tap_lock);
+
+ __module_get(nt->module);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netlink_add_tap);
+
+static int __netlink_remove_tap(struct netlink_tap *nt)
+{
+ bool found = false;
+ struct netlink_tap *tmp;
+
+ spin_lock(&netlink_tap_lock);
+
+ list_for_each_entry(tmp, &netlink_tap_all, list) {
+ if (nt == tmp) {
+ list_del_rcu(&nt->list);
+ found = true;
+ goto out;
+ }
+ }
+
+ pr_warn("__netlink_remove_tap: %p not found\n", nt);
+out:
+ spin_unlock(&netlink_tap_lock);
+
+ if (found && nt->module)
+ module_put(nt->module);
+
+ return found ? 0 : -ENODEV;
+}
+
+int netlink_remove_tap(struct netlink_tap *nt)
+{
+ int ret;
+
+ ret = __netlink_remove_tap(nt);
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(netlink_remove_tap);
+
+static bool netlink_filter_tap(const struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ /* We take the more conservative approach and
+ * whitelist socket protocols that may pass.
+ */
+ switch (sk->sk_protocol) {
+ case NETLINK_ROUTE:
+ case NETLINK_USERSOCK:
+ case NETLINK_SOCK_DIAG:
+ case NETLINK_NFLOG:
+ case NETLINK_XFRM:
+ case NETLINK_FIB_LOOKUP:
+ case NETLINK_NETFILTER:
+ case NETLINK_GENERIC:
+ return true;
+ }
+
+ return false;
+}
+
+static int __netlink_deliver_tap_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct sk_buff *nskb;
+ struct sock *sk = skb->sk;
+ int ret = -ENOMEM;
+
+ dev_hold(dev);
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (nskb) {
+ nskb->dev = dev;
+ nskb->protocol = htons((u16) sk->sk_protocol);
+ nskb->pkt_type = netlink_is_kernel(sk) ?
+ PACKET_KERNEL : PACKET_USER;
+ skb_reset_network_header(nskb);
+ ret = dev_queue_xmit(nskb);
+ if (unlikely(ret > 0))
+ ret = net_xmit_errno(ret);
+ }
+
+ dev_put(dev);
+ return ret;
+}
+
+static void __netlink_deliver_tap(struct sk_buff *skb)
+{
+ int ret;
+ struct netlink_tap *tmp;
+
+ if (!netlink_filter_tap(skb))
+ return;
+
+ list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+ ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+ if (unlikely(ret))
+ break;
+ }
+}
+
+static void netlink_deliver_tap(struct sk_buff *skb)
+{
+ rcu_read_lock();
+
+ if (unlikely(!list_empty(&netlink_tap_all)))
+ __netlink_deliver_tap(skb);
+
+ rcu_read_unlock();
+}
+
+static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
+ struct sk_buff *skb)
+{
+ if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
+ netlink_deliver_tap(skb);
+}
+
+static void netlink_overrun(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
+ if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
+ sk->sk_err = ENOBUFS;
+ sk->sk_error_report(sk);
+ }
+ }
+ atomic_inc(&sk->sk_drops);
+}
+
+static void netlink_rcv_wake(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (skb_queue_empty(&sk->sk_receive_queue))
+ clear_bit(NETLINK_CONGESTED, &nlk->state);
+ if (!test_bit(NETLINK_CONGESTED, &nlk->state))
+ wake_up_interruptible(&nlk->wait);
+}
+
+#ifdef CONFIG_NETLINK_MMAP
+static bool netlink_skb_is_mmaped(const struct sk_buff *skb)
+{
+ return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
+}
+
+static bool netlink_rx_is_mmaped(struct sock *sk)
+{
+ return nlk_sk(sk)->rx_ring.pg_vec != NULL;
+}
+
+static bool netlink_tx_is_mmaped(struct sock *sk)
+{
+ return nlk_sk(sk)->tx_ring.pg_vec != NULL;
+}
+
+static __pure struct page *pgvec_to_page(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ else
+ return virt_to_page(addr);
+}
+
+static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++) {
+ if (pg_vec[i] != NULL) {
+ if (is_vmalloc_addr(pg_vec[i]))
+ vfree(pg_vec[i]);
+ else
+ free_pages((unsigned long)pg_vec[i], order);
+ }
+ }
+ kfree(pg_vec);
+}
+
+static void *alloc_one_pg_vec_page(unsigned long order)
+{
+ void *buffer;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
+ __GFP_NOWARN | __GFP_NORETRY;
+
+ buffer = (void *)__get_free_pages(gfp_flags, order);
+ if (buffer != NULL)
+ return buffer;
+
+ buffer = vzalloc((1 << order) * PAGE_SIZE);
+ if (buffer != NULL)
+ return buffer;
+
+ gfp_flags &= ~__GFP_NORETRY;
+ return (void *)__get_free_pages(gfp_flags, order);
+}
+
+static void **alloc_pg_vec(struct netlink_sock *nlk,
+ struct nl_mmap_req *req, unsigned int order)
+{
+ unsigned int block_nr = req->nm_block_nr;
+ unsigned int i;
+ void **pg_vec;
+
+ pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
+ if (pg_vec == NULL)
+ return NULL;
+
+ for (i = 0; i < block_nr; i++) {
+ pg_vec[i] = alloc_one_pg_vec_page(order);
+ if (pg_vec[i] == NULL)
+ goto err1;
+ }
+
+ return pg_vec;
+err1:
+ free_pg_vec(pg_vec, order, block_nr);
+ return NULL;
+}
+
+static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
+ bool closing, bool tx_ring)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_ring *ring;
+ struct sk_buff_head *queue;
+ void **pg_vec = NULL;
+ unsigned int order = 0;
+ int err;
+
+ ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+ queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+ if (!closing) {
+ if (atomic_read(&nlk->mapped))
+ return -EBUSY;
+ if (atomic_read(&ring->pending))
+ return -EBUSY;
+ }
+
+ if (req->nm_block_nr) {
+ if (ring->pg_vec != NULL)
+ return -EBUSY;
+
+ if ((int)req->nm_block_size <= 0)
+ return -EINVAL;
+ if (!PAGE_ALIGNED(req->nm_block_size))
+ return -EINVAL;
+ if (req->nm_frame_size < NL_MMAP_HDRLEN)
+ return -EINVAL;
+ if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
+ return -EINVAL;
+
+ ring->frames_per_block = req->nm_block_size /
+ req->nm_frame_size;
+ if (ring->frames_per_block == 0)
+ return -EINVAL;
+ if (ring->frames_per_block * req->nm_block_nr !=
+ req->nm_frame_nr)
+ return -EINVAL;
+
+ order = get_order(req->nm_block_size);
+ pg_vec = alloc_pg_vec(nlk, req, order);
+ if (pg_vec == NULL)
+ return -ENOMEM;
+ } else {
+ if (req->nm_frame_nr)
+ return -EINVAL;
+ }
+
+ err = -EBUSY;
+ mutex_lock(&nlk->pg_vec_lock);
+ if (closing || atomic_read(&nlk->mapped) == 0) {
+ err = 0;
+ spin_lock_bh(&queue->lock);
+
+ ring->frame_max = req->nm_frame_nr - 1;
+ ring->head = 0;
+ ring->frame_size = req->nm_frame_size;
+ ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
+
+ swap(ring->pg_vec_len, req->nm_block_nr);
+ swap(ring->pg_vec_order, order);
+ swap(ring->pg_vec, pg_vec);
+
+ __skb_queue_purge(queue);
+ spin_unlock_bh(&queue->lock);
+
+ WARN_ON(atomic_read(&nlk->mapped));
+ }
+ mutex_unlock(&nlk->pg_vec_lock);
+
+ if (pg_vec)
+ free_pg_vec(pg_vec, order, req->nm_block_nr);
+ return err;
+}
+
+static void netlink_mm_open(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct socket *sock = file->private_data;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_inc(&nlk_sk(sk)->mapped);
+}
+
+static void netlink_mm_close(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct socket *sock = file->private_data;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_dec(&nlk_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct netlink_mmap_ops = {
+ .open = netlink_mm_open,
+ .close = netlink_mm_close,
+};
+
+static int netlink_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_ring *ring;
+ unsigned long start, size, expected;
+ unsigned int i;
+ int err = -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ mutex_lock(&nlk->pg_vec_lock);
+
+ expected = 0;
+ for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+ if (ring->pg_vec == NULL)
+ continue;
+ expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
+ }
+
+ if (expected == 0)
+ goto out;
+
+ size = vma->vm_end - vma->vm_start;
+ if (size != expected)
+ goto out;
+
+ start = vma->vm_start;
+ for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
+ if (ring->pg_vec == NULL)
+ continue;
+
+ for (i = 0; i < ring->pg_vec_len; i++) {
+ struct page *page;
+ void *kaddr = ring->pg_vec[i];
+ unsigned int pg_num;
+
+ for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
+ page = pgvec_to_page(kaddr);
+ err = vm_insert_page(vma, start, page);
+ if (err < 0)
+ goto out;
+ start += PAGE_SIZE;
+ kaddr += PAGE_SIZE;
+ }
+ }
+ }
+
+ atomic_inc(&nlk->mapped);
+ vma->vm_ops = &netlink_mmap_ops;
+ err = 0;
+out:
+ mutex_unlock(&nlk->pg_vec_lock);
+ return err;
+}
+
+static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
+{
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+ struct page *p_start, *p_end;
+
+ /* First page is flushed through netlink_{get,set}_status */
+ p_start = pgvec_to_page(hdr + PAGE_SIZE);
+ p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
+ while (p_start <= p_end) {
+ flush_dcache_page(p_start);
+ p_start++;
+ }
+#endif
+}
+
+static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
+{
+ smp_rmb();
+ flush_dcache_page(pgvec_to_page(hdr));
+ return hdr->nm_status;
+}
+
+static void netlink_set_status(struct nl_mmap_hdr *hdr,
+ enum nl_mmap_status status)
+{
+ smp_mb();
+ hdr->nm_status = status;
+ flush_dcache_page(pgvec_to_page(hdr));
+}
+
+static struct nl_mmap_hdr *
+__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
+{
+ unsigned int pg_vec_pos, frame_off;
+
+ pg_vec_pos = pos / ring->frames_per_block;
+ frame_off = pos % ring->frames_per_block;
+
+ return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
+}
+
+static struct nl_mmap_hdr *
+netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
+ enum nl_mmap_status status)
+{
+ struct nl_mmap_hdr *hdr;
+
+ hdr = __netlink_lookup_frame(ring, pos);
+ if (netlink_get_status(hdr) != status)
+ return NULL;
+
+ return hdr;
+}
+
+static struct nl_mmap_hdr *
+netlink_current_frame(const struct netlink_ring *ring,
+ enum nl_mmap_status status)
+{
+ return netlink_lookup_frame(ring, ring->head, status);
+}
+
+static struct nl_mmap_hdr *
+netlink_previous_frame(const struct netlink_ring *ring,
+ enum nl_mmap_status status)
+{
+ unsigned int prev;
+
+ prev = ring->head ? ring->head - 1 : ring->frame_max;
+ return netlink_lookup_frame(ring, prev, status);
+}
+
+static void netlink_increment_head(struct netlink_ring *ring)
+{
+ ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
+}
+
+static void netlink_forward_ring(struct netlink_ring *ring)
+{
+ unsigned int head = ring->head, pos = head;
+ const struct nl_mmap_hdr *hdr;
+
+ do {
+ hdr = __netlink_lookup_frame(ring, pos);
+ if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
+ break;
+ if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
+ break;
+ netlink_increment_head(ring);
+ } while (ring->head != head);
+}
+
+static bool netlink_dump_space(struct netlink_sock *nlk)
+{
+ struct netlink_ring *ring = &nlk->rx_ring;
+ struct nl_mmap_hdr *hdr;
+ unsigned int n;
+
+ hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+ if (hdr == NULL)
+ return false;
+
+ n = ring->head + ring->frame_max / 2;
+ if (n > ring->frame_max)
+ n -= ring->frame_max;
+
+ hdr = __netlink_lookup_frame(ring, n);
+
+ return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
+}
+
+static unsigned int netlink_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ unsigned int mask;
+ int err;
+
+ if (nlk->rx_ring.pg_vec != NULL) {
+ /* Memory mapped sockets don't call recvmsg(), so flow control
+ * for dumps is performed here. A dump is allowed to continue
+ * if at least half the ring is unused.
+ */
+ while (nlk->cb_running && netlink_dump_space(nlk)) {
+ err = netlink_dump(sk);
+ if (err < 0) {
+ sk->sk_err = -err;
+ sk->sk_error_report(sk);
+ break;
+ }
+ }
+ netlink_rcv_wake(sk);
+ }
+
+ mask = datagram_poll(file, sock, wait);
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (nlk->rx_ring.pg_vec) {
+ netlink_forward_ring(&nlk->rx_ring);
+ if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED))
+ mask |= POLLIN | POLLRDNORM;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ spin_lock_bh(&sk->sk_write_queue.lock);
+ if (nlk->tx_ring.pg_vec) {
+ if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ spin_unlock_bh(&sk->sk_write_queue.lock);
+
+ return mask;
+}
+
+static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
+{
+ return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
+}
+
+static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
+ struct netlink_ring *ring,
+ struct nl_mmap_hdr *hdr)
+{
+ unsigned int size;
+ void *data;
+
+ size = ring->frame_size - NL_MMAP_HDRLEN;
+ data = (void *)hdr + NL_MMAP_HDRLEN;
+
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+ skb->len = 0;
+
+ skb->destructor = netlink_skb_destructor;
+ NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
+ NETLINK_CB(skb).sk = sk;
+}
+
+static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
+ u32 dst_portid, u32 dst_group,
+ struct scm_cookie *scm)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_ring *ring;
+ struct nl_mmap_hdr *hdr;
+ struct sk_buff *skb;
+ unsigned int maxlen;
+ int err = 0, len = 0;
+
+ mutex_lock(&nlk->pg_vec_lock);
+
+ ring = &nlk->tx_ring;
+ maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+
+ do {
+ unsigned int nm_len;
+
+ hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
+ if (hdr == NULL) {
+ if (!(msg->msg_flags & MSG_DONTWAIT) &&
+ atomic_read(&nlk->tx_ring.pending))
+ schedule();
+ continue;
+ }
+
+ nm_len = ACCESS_ONCE(hdr->nm_len);
+ if (nm_len > maxlen) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ netlink_frame_flush_dcache(hdr, nm_len);
+
+ skb = alloc_skb(nm_len, GFP_KERNEL);
+ if (skb == NULL) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ __skb_put(skb, nm_len);
+ memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
+ netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+
+ netlink_increment_head(ring);
+
+ NETLINK_CB(skb).portid = nlk->portid;
+ NETLINK_CB(skb).dst_group = dst_group;
+ NETLINK_CB(skb).creds = scm->creds;
+
+ err = security_netlink_send(sk, skb);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ if (unlikely(dst_group)) {
+ atomic_inc(&skb->users);
+ netlink_broadcast(sk, skb, dst_portid, dst_group,
+ GFP_KERNEL);
+ }
+ err = netlink_unicast(sk, skb, dst_portid,
+ msg->msg_flags & MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+ len += err;
+
+ } while (hdr != NULL ||
+ (!(msg->msg_flags & MSG_DONTWAIT) &&
+ atomic_read(&nlk->tx_ring.pending)));
+
+ if (len > 0)
+ err = len;
+out:
+ mutex_unlock(&nlk->pg_vec_lock);
+ return err;
+}
+
+static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct nl_mmap_hdr *hdr;
+
+ hdr = netlink_mmap_hdr(skb);
+ hdr->nm_len = skb->len;
+ hdr->nm_group = NETLINK_CB(skb).dst_group;
+ hdr->nm_pid = NETLINK_CB(skb).creds.pid;
+ hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
+ hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
+ netlink_frame_flush_dcache(hdr, hdr->nm_len);
+ netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+
+ NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
+ kfree_skb(skb);
+}
+
+static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_ring *ring = &nlk->rx_ring;
+ struct nl_mmap_hdr *hdr;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+ if (hdr == NULL) {
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ kfree_skb(skb);
+ netlink_overrun(sk);
+ return;
+ }
+ netlink_increment_head(ring);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ hdr->nm_len = skb->len;
+ hdr->nm_group = NETLINK_CB(skb).dst_group;
+ hdr->nm_pid = NETLINK_CB(skb).creds.pid;
+ hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
+ hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
+ netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
+}
+
+#else /* CONFIG_NETLINK_MMAP */
+#define netlink_skb_is_mmaped(skb) false
+#define netlink_rx_is_mmaped(sk) false
+#define netlink_tx_is_mmaped(sk) false
+#define netlink_mmap sock_no_mmap
+#define netlink_poll datagram_poll
+#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0
+#endif /* CONFIG_NETLINK_MMAP */
+
+static void netlink_skb_destructor(struct sk_buff *skb)
+{
+#ifdef CONFIG_NETLINK_MMAP
+ struct nl_mmap_hdr *hdr;
+ struct netlink_ring *ring;
+ struct sock *sk;
+
+ /* If a packet from the kernel to userspace was freed because of an
+ * error without being delivered to userspace, the kernel must reset
+ * the status. In the direction userspace to kernel, the status is
+ * always reset here after the packet was processed and freed.
+ */
+ if (netlink_skb_is_mmaped(skb)) {
+ hdr = netlink_mmap_hdr(skb);
+ sk = NETLINK_CB(skb).sk;
+
+ if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+ netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+ ring = &nlk_sk(sk)->tx_ring;
+ } else {
+ if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+ hdr->nm_len = 0;
+ netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+ }
+ ring = &nlk_sk(sk)->rx_ring;
+ }
+
+ WARN_ON(atomic_read(&ring->pending) == 0);
+ atomic_dec(&ring->pending);
+ sock_put(sk);
+
+ skb->head = NULL;
+ }
+#endif
+ if (is_vmalloc_addr(skb->head)) {
+ if (!skb->cloned ||
+ !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+ vfree(skb->head);
+
+ skb->head = NULL;
+ }
+ if (skb->sk != NULL)
+ sock_rfree(skb);
+}
+
+static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ WARN_ON(skb->sk != NULL);
+ skb->sk = sk;
+ skb->destructor = netlink_skb_destructor;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, skb->truesize);
+}
+
+static void netlink_sock_destruct(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (nlk->cb_running) {
+ if (nlk->cb.done)
+ nlk->cb.done(&nlk->cb);
+
+ module_put(nlk->cb.module);
+ kfree_skb(nlk->cb.skb);
+ }
+
+ skb_queue_purge(&sk->sk_receive_queue);
+#ifdef CONFIG_NETLINK_MMAP
+ if (1) {
+ struct nl_mmap_req req;
+
+ memset(&req, 0, sizeof(req));
+ if (nlk->rx_ring.pg_vec)
+ netlink_set_ring(sk, &req, true, false);
+ memset(&req, 0, sizeof(req));
+ if (nlk->tx_ring.pg_vec)
+ netlink_set_ring(sk, &req, true, true);
+ }
+#endif /* CONFIG_NETLINK_MMAP */
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
+ return;
+ }
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(nlk_sk(sk)->groups);
+}
+
+/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
+ * SMP. Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines.
+ */
+
+void netlink_table_grab(void)
+ __acquires(nl_table_lock)
+{
+ might_sleep();
+
+ write_lock_irq(&nl_table_lock);
+
+ if (atomic_read(&nl_table_users)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(&nl_table_wait, &wait);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&nl_table_users) == 0)
+ break;
+ write_unlock_irq(&nl_table_lock);
+ schedule();
+ write_lock_irq(&nl_table_lock);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&nl_table_wait, &wait);
+ }
+}
+
+void netlink_table_ungrab(void)
+ __releases(nl_table_lock)
+{
+ write_unlock_irq(&nl_table_lock);
+ wake_up(&nl_table_wait);
+}
+
+static inline void
+netlink_lock_table(void)
+{
+ /* read_lock() synchronizes us to netlink_table_grab */
+
+ read_lock(&nl_table_lock);
+ atomic_inc(&nl_table_users);
+ read_unlock(&nl_table_lock);
+}
+
+static inline void
+netlink_unlock_table(void)
+{
+ if (atomic_dec_and_test(&nl_table_users))
+ wake_up(&nl_table_wait);
+}
+
+struct netlink_compare_arg
+{
+ possible_net_t pnet;
+ u32 portid;
+};
+
+/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
+#define netlink_compare_arg_len \
+ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32))
+
+static inline int netlink_compare(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct netlink_compare_arg *x = arg->key;
+ const struct netlink_sock *nlk = ptr;
+
+ return nlk->portid != x->portid ||
+ !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet));
+}
+
+static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
+ struct net *net, u32 portid)
+{
+ memset(arg, 0, sizeof(*arg));
+ write_pnet(&arg->pnet, net);
+ arg->portid = portid;
+}
+
+static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
+ struct net *net)
+{
+ struct netlink_compare_arg arg;
+
+ netlink_compare_arg_init(&arg, net, portid);
+ return rhashtable_lookup_fast(&table->hash, &arg,
+ netlink_rhashtable_params);
+}
+
+static int __netlink_insert(struct netlink_table *table, struct sock *sk)
+{
+ struct netlink_compare_arg arg;
+
+ netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
+ return rhashtable_lookup_insert_key(&table->hash, &arg,
+ &nlk_sk(sk)->node,
+ netlink_rhashtable_params);
+}
+
+static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
+{
+ struct netlink_table *table = &nl_table[protocol];
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = __netlink_lookup(table, portid, net);
+ if (sk)
+ sock_hold(sk);
+ rcu_read_unlock();
+
+ return sk;
+}
+
+static const struct proto_ops netlink_ops;
+
+static void
+netlink_update_listeners(struct sock *sk)
+{
+ struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+ unsigned long mask;
+ unsigned int i;
+ struct listeners *listeners;
+
+ listeners = nl_deref_protected(tbl->listeners);
+ if (!listeners)
+ return;
+
+ for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
+ mask = 0;
+ sk_for_each_bound(sk, &tbl->mc_list) {
+ if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
+ mask |= nlk_sk(sk)->groups[i];
+ }
+ listeners->masks[i] = mask;
+ }
+ /* this function is only called with the netlink table "grabbed", which
+ * makes sure updates are visible before bind or setsockopt return. */
+}
+
+static int netlink_insert(struct sock *sk, u32 portid)
+{
+ struct netlink_table *table = &nl_table[sk->sk_protocol];
+ int err;
+
+ lock_sock(sk);
+
+ err = -EBUSY;
+ if (nlk_sk(sk)->portid)
+ goto err;
+
+ err = -ENOMEM;
+ if (BITS_PER_LONG > 32 &&
+ unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
+ goto err;
+
+ nlk_sk(sk)->portid = portid;
+ sock_hold(sk);
+
+ err = __netlink_insert(table, sk);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ nlk_sk(sk)->portid = 0;
+ sock_put(sk);
+ }
+
+err:
+ release_sock(sk);
+ return err;
+}
+
+static void netlink_remove(struct sock *sk)
+{
+ struct netlink_table *table;
+
+ table = &nl_table[sk->sk_protocol];
+ if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
+ netlink_rhashtable_params)) {
+ WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+ __sock_put(sk);
+ }
+
+ netlink_table_grab();
+ if (nlk_sk(sk)->subscriptions) {
+ __sk_del_bind_node(sk);
+ netlink_update_listeners(sk);
+ }
+ if (sk->sk_protocol == NETLINK_GENERIC)
+ atomic_inc(&genl_sk_destructing_cnt);
+ netlink_table_ungrab();
+}
+
+static struct proto netlink_proto = {
+ .name = "NETLINK",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct netlink_sock),
+};
+
+static int __netlink_create(struct net *net, struct socket *sock,
+ struct mutex *cb_mutex, int protocol)
+{
+ struct sock *sk;
+ struct netlink_sock *nlk;
+
+ sock->ops = &netlink_ops;
+
+ sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+
+ nlk = nlk_sk(sk);
+ if (cb_mutex) {
+ nlk->cb_mutex = cb_mutex;
+ } else {
+ nlk->cb_mutex = &nlk->cb_def_mutex;
+ mutex_init(nlk->cb_mutex);
+ }
+ init_waitqueue_head(&nlk->wait);
+#ifdef CONFIG_NETLINK_MMAP
+ mutex_init(&nlk->pg_vec_lock);
+#endif
+
+ sk->sk_destruct = netlink_sock_destruct;
+ sk->sk_protocol = protocol;
+ return 0;
+}
+
+static int netlink_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct module *module = NULL;
+ struct mutex *cb_mutex;
+ struct netlink_sock *nlk;
+ int (*bind)(struct net *net, int group);
+ void (*unbind)(struct net *net, int group);
+ int err = 0;
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol < 0 || protocol >= MAX_LINKS)
+ return -EPROTONOSUPPORT;
+
+ netlink_lock_table();
+#ifdef CONFIG_MODULES
+ if (!nl_table[protocol].registered) {
+ netlink_unlock_table();
+ request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
+ netlink_lock_table();
+ }
+#endif
+ if (nl_table[protocol].registered &&
+ try_module_get(nl_table[protocol].module))
+ module = nl_table[protocol].module;
+ else
+ err = -EPROTONOSUPPORT;
+ cb_mutex = nl_table[protocol].cb_mutex;
+ bind = nl_table[protocol].bind;
+ unbind = nl_table[protocol].unbind;
+ netlink_unlock_table();
+
+ if (err < 0)
+ goto out;
+
+ err = __netlink_create(net, sock, cb_mutex, protocol);
+ if (err < 0)
+ goto out_module;
+
+ local_bh_disable();
+ sock_prot_inuse_add(net, &netlink_proto, 1);
+ local_bh_enable();
+
+ nlk = nlk_sk(sock->sk);
+ nlk->module = module;
+ nlk->netlink_bind = bind;
+ nlk->netlink_unbind = unbind;
+out:
+ return err;
+
+out_module:
+ module_put(module);
+ goto out;
+}
+
+static void deferred_put_nlk_sk(struct rcu_head *head)
+{
+ struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+
+ sock_put(&nlk->sk);
+}
+
+static int netlink_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk;
+
+ if (!sk)
+ return 0;
+
+ netlink_remove(sk);
+ sock_orphan(sk);
+ nlk = nlk_sk(sk);
+
+ /*
+ * OK. Socket is unlinked, any packets that arrive now
+ * will be purged.
+ */
+
+ /* must not acquire netlink_table_lock in any way again before unbind
+ * and notifying genetlink is done as otherwise it might deadlock
+ */
+ if (nlk->netlink_unbind) {
+ int i;
+
+ for (i = 0; i < nlk->ngroups; i++)
+ if (test_bit(i, nlk->groups))
+ nlk->netlink_unbind(sock_net(sk), i + 1);
+ }
+ if (sk->sk_protocol == NETLINK_GENERIC &&
+ atomic_dec_return(&genl_sk_destructing_cnt) == 0)
+ wake_up(&genl_sk_destructing_waitq);
+
+ sock->sk = NULL;
+ wake_up_interruptible_all(&nlk->wait);
+
+ skb_queue_purge(&sk->sk_write_queue);
+
+ if (nlk->portid) {
+ struct netlink_notify n = {
+ .net = sock_net(sk),
+ .protocol = sk->sk_protocol,
+ .portid = nlk->portid,
+ };
+ atomic_notifier_call_chain(&netlink_chain,
+ NETLINK_URELEASE, &n);
+ }
+
+ module_put(nlk->module);
+
+ if (netlink_is_kernel(sk)) {
+ netlink_table_grab();
+ BUG_ON(nl_table[sk->sk_protocol].registered == 0);
+ if (--nl_table[sk->sk_protocol].registered == 0) {
+ struct listeners *old;
+
+ old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
+ RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
+ kfree_rcu(old, rcu);
+ nl_table[sk->sk_protocol].module = NULL;
+ nl_table[sk->sk_protocol].bind = NULL;
+ nl_table[sk->sk_protocol].unbind = NULL;
+ nl_table[sk->sk_protocol].flags = 0;
+ nl_table[sk->sk_protocol].registered = 0;
+ }
+ netlink_table_ungrab();
+ }
+
+ kfree(nlk->groups);
+ nlk->groups = NULL;
+
+ local_bh_disable();
+ sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
+ local_bh_enable();
+ call_rcu(&nlk->rcu, deferred_put_nlk_sk);
+ return 0;
+}
+
+static int netlink_autobind(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct netlink_table *table = &nl_table[sk->sk_protocol];
+ s32 portid = task_tgid_vnr(current);
+ int err;
+ static s32 rover = -4097;
+
+retry:
+ cond_resched();
+ rcu_read_lock();
+ if (__netlink_lookup(table, portid, net)) {
+ /* Bind collision, search negative portid values. */
+ portid = rover--;
+ if (rover > -4097)
+ rover = -4097;
+ rcu_read_unlock();
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ err = netlink_insert(sk, portid);
+ if (err == -EADDRINUSE)
+ goto retry;
+
+ /* If 2 threads race to autobind, that is fine. */
+ if (err == -EBUSY)
+ err = 0;
+
+ return err;
+}
+
+/**
+ * __netlink_ns_capable - General netlink message capability test
+ * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket we received the message
+ * from had when the netlink socket was created and the sender of the
+ * message has has the capability @cap in the user namespace @user_ns.
+ */
+bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
+ struct user_namespace *user_ns, int cap)
+{
+ return ((nsp->flags & NETLINK_SKB_DST) ||
+ file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
+ ns_capable(user_ns, cap);
+}
+EXPORT_SYMBOL(__netlink_ns_capable);
+
+/**
+ * netlink_ns_capable - General netlink message capability test
+ * @skb: socket buffer holding a netlink command from userspace
+ * @user_ns: The user namespace of the capability to use
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket we received the message
+ * from had when the netlink socket was created and the sender of the
+ * message has has the capability @cap in the user namespace @user_ns.
+ */
+bool netlink_ns_capable(const struct sk_buff *skb,
+ struct user_namespace *user_ns, int cap)
+{
+ return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap);
+}
+EXPORT_SYMBOL(netlink_ns_capable);
+
+/**
+ * netlink_capable - Netlink global message capability test
+ * @skb: socket buffer holding a netlink command from userspace
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket we received the message
+ * from had when the netlink socket was created and the sender of the
+ * message has has the capability @cap in all user namespaces.
+ */
+bool netlink_capable(const struct sk_buff *skb, int cap)
+{
+ return netlink_ns_capable(skb, &init_user_ns, cap);
+}
+EXPORT_SYMBOL(netlink_capable);
+
+/**
+ * netlink_net_capable - Netlink network namespace message capability test
+ * @skb: socket buffer holding a netlink command from userspace
+ * @cap: The capability to use
+ *
+ * Test to see if the opener of the socket we received the message
+ * from had when the netlink socket was created and the sender of the
+ * message has has the capability @cap over the network namespace of
+ * the socket we received the message from.
+ */
+bool netlink_net_capable(const struct sk_buff *skb, int cap)
+{
+ return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap);
+}
+EXPORT_SYMBOL(netlink_net_capable);
+
+static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
+{
+ return (nl_table[sock->sk->sk_protocol].flags & flag) ||
+ ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
+}
+
+static void
+netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (nlk->subscriptions && !subscriptions)
+ __sk_del_bind_node(sk);
+ else if (!nlk->subscriptions && subscriptions)
+ sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+ nlk->subscriptions = subscriptions;
+}
+
+static int netlink_realloc_groups(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ unsigned int groups;
+ unsigned long *new_groups;
+ int err = 0;
+
+ netlink_table_grab();
+
+ groups = nl_table[sk->sk_protocol].groups;
+ if (!nl_table[sk->sk_protocol].registered) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (nlk->ngroups >= groups)
+ goto out_unlock;
+
+ new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
+ if (new_groups == NULL) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
+ NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
+
+ nlk->groups = new_groups;
+ nlk->ngroups = groups;
+ out_unlock:
+ netlink_table_ungrab();
+ return err;
+}
+
+static void netlink_undo_bind(int group, long unsigned int groups,
+ struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int undo;
+
+ if (!nlk->netlink_unbind)
+ return;
+
+ for (undo = 0; undo < group; undo++)
+ if (test_bit(undo, &groups))
+ nlk->netlink_unbind(sock_net(sk), undo + 1);
+}
+
+static int netlink_bind(struct socket *sock, struct sockaddr *addr,
+ int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
+ int err;
+ long unsigned int groups = nladdr->nl_groups;
+
+ if (addr_len < sizeof(struct sockaddr_nl))
+ return -EINVAL;
+
+ if (nladdr->nl_family != AF_NETLINK)
+ return -EINVAL;
+
+ /* Only superuser is allowed to listen multicasts */
+ if (groups) {
+ if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
+ return -EPERM;
+ err = netlink_realloc_groups(sk);
+ if (err)
+ return err;
+ }
+
+ if (nlk->portid)
+ if (nladdr->nl_pid != nlk->portid)
+ return -EINVAL;
+
+ if (nlk->netlink_bind && groups) {
+ int group;
+
+ for (group = 0; group < nlk->ngroups; group++) {
+ if (!test_bit(group, &groups))
+ continue;
+ err = nlk->netlink_bind(net, group + 1);
+ if (!err)
+ continue;
+ netlink_undo_bind(group, groups, sk);
+ return err;
+ }
+ }
+
+ if (!nlk->portid) {
+ err = nladdr->nl_pid ?
+ netlink_insert(sk, nladdr->nl_pid) :
+ netlink_autobind(sock);
+ if (err) {
+ netlink_undo_bind(nlk->ngroups, groups, sk);
+ return err;
+ }
+ }
+
+ if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
+ return 0;
+
+ netlink_table_grab();
+ netlink_update_subscriptions(sk, nlk->subscriptions +
+ hweight32(groups) -
+ hweight32(nlk->groups[0]));
+ nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
+ netlink_update_listeners(sk);
+ netlink_table_ungrab();
+
+ return 0;
+}
+
+static int netlink_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ int err = 0;
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
+
+ if (alen < sizeof(addr->sa_family))
+ return -EINVAL;
+
+ if (addr->sa_family == AF_UNSPEC) {
+ sk->sk_state = NETLINK_UNCONNECTED;
+ nlk->dst_portid = 0;
+ nlk->dst_group = 0;
+ return 0;
+ }
+ if (addr->sa_family != AF_NETLINK)
+ return -EINVAL;
+
+ if ((nladdr->nl_groups || nladdr->nl_pid) &&
+ !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
+ return -EPERM;
+
+ if (!nlk->portid)
+ err = netlink_autobind(sock);
+
+ if (err == 0) {
+ sk->sk_state = NETLINK_CONNECTED;
+ nlk->dst_portid = nladdr->nl_pid;
+ nlk->dst_group = ffs(nladdr->nl_groups);
+ }
+
+ return err;
+}
+
+static int netlink_getname(struct socket *sock, struct sockaddr *addr,
+ int *addr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
+
+ nladdr->nl_family = AF_NETLINK;
+ nladdr->nl_pad = 0;
+ *addr_len = sizeof(*nladdr);
+
+ if (peer) {
+ nladdr->nl_pid = nlk->dst_portid;
+ nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
+ } else {
+ nladdr->nl_pid = nlk->portid;
+ nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
+ }
+ return 0;
+}
+
+static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
+{
+ struct sock *sock;
+ struct netlink_sock *nlk;
+
+ sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
+ if (!sock)
+ return ERR_PTR(-ECONNREFUSED);
+
+ /* Don't bother queuing skb if kernel socket has no input function */
+ nlk = nlk_sk(sock);
+ if (sock->sk_state == NETLINK_CONNECTED &&
+ nlk->dst_portid != nlk_sk(ssk)->portid) {
+ sock_put(sock);
+ return ERR_PTR(-ECONNREFUSED);
+ }
+ return sock;
+}
+
+struct sock *netlink_getsockbyfilp(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct sock *sock;
+
+ if (!S_ISSOCK(inode->i_mode))
+ return ERR_PTR(-ENOTSOCK);
+
+ sock = SOCKET_I(inode)->sk;
+ if (sock->sk_family != AF_NETLINK)
+ return ERR_PTR(-EINVAL);
+
+ sock_hold(sock);
+ return sock;
+}
+
+static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
+ int broadcast)
+{
+ struct sk_buff *skb;
+ void *data;
+
+ if (size <= NLMSG_GOODSIZE || broadcast)
+ return alloc_skb(size, GFP_KERNEL);
+
+ size = SKB_DATA_ALIGN(size) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ data = vmalloc(size);
+ if (data == NULL)
+ return NULL;
+
+ skb = __build_skb(data, size);
+ if (skb == NULL)
+ vfree(data);
+ else
+ skb->destructor = netlink_skb_destructor;
+
+ return skb;
+}
+
+/*
+ * Attach a skb to a netlink socket.
+ * The caller must hold a reference to the destination socket. On error, the
+ * reference is dropped. The skb is not send to the destination, just all
+ * all error checks are performed and memory in the queue is reserved.
+ * Return values:
+ * < 0: error. skb freed, reference to sock dropped.
+ * 0: continue
+ * 1: repeat lookup - reference dropped while waiting for socket memory.
+ */
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
+ long *timeo, struct sock *ssk)
+{
+ struct netlink_sock *nlk;
+
+ nlk = nlk_sk(sk);
+
+ if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+ !netlink_skb_is_mmaped(skb)) {
+ DECLARE_WAITQUEUE(wait, current);
+ if (!*timeo) {
+ if (!ssk || netlink_is_kernel(ssk))
+ netlink_overrun(sk);
+ sock_put(sk);
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&nlk->wait, &wait);
+
+ if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ test_bit(NETLINK_CONGESTED, &nlk->state)) &&
+ !sock_flag(sk, SOCK_DEAD))
+ *timeo = schedule_timeout(*timeo);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&nlk->wait, &wait);
+ sock_put(sk);
+
+ if (signal_pending(current)) {
+ kfree_skb(skb);
+ return sock_intr_errno(*timeo);
+ }
+ return 1;
+ }
+ netlink_skb_set_owner_r(skb, sk);
+ return 0;
+}
+
+static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
+{
+ int len = skb->len;
+
+ netlink_deliver_tap(skb);
+
+#ifdef CONFIG_NETLINK_MMAP
+ if (netlink_skb_is_mmaped(skb))
+ netlink_queue_mmaped_skb(sk, skb);
+ else if (netlink_rx_is_mmaped(sk))
+ netlink_ring_set_copied(sk, skb);
+ else
+#endif /* CONFIG_NETLINK_MMAP */
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ return len;
+}
+
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
+{
+ int len = __netlink_sendskb(sk, skb);
+
+ sock_put(sk);
+ return len;
+}
+
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ sock_put(sk);
+}
+
+static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
+{
+ int delta;
+
+ WARN_ON(skb->sk != NULL);
+ if (netlink_skb_is_mmaped(skb))
+ return skb;
+
+ delta = skb->end - skb->tail;
+ if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
+ return skb;
+
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, allocation);
+ if (!nskb)
+ return skb;
+ consume_skb(skb);
+ skb = nskb;
+ }
+
+ if (!pskb_expand_head(skb, 0, -delta, allocation))
+ skb->truesize -= delta;
+
+ return skb;
+}
+
+static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
+ struct sock *ssk)
+{
+ int ret;
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ ret = -ECONNREFUSED;
+ if (nlk->netlink_rcv != NULL) {
+ ret = skb->len;
+ netlink_skb_set_owner_r(skb, sk);
+ NETLINK_CB(skb).sk = ssk;
+ netlink_deliver_tap_kernel(sk, ssk, skb);
+ nlk->netlink_rcv(skb);
+ consume_skb(skb);
+ } else {
+ kfree_skb(skb);
+ }
+ sock_put(sk);
+ return ret;
+}
+
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
+ u32 portid, int nonblock)
+{
+ struct sock *sk;
+ int err;
+ long timeo;
+
+ skb = netlink_trim(skb, gfp_any());
+
+ timeo = sock_sndtimeo(ssk, nonblock);
+retry:
+ sk = netlink_getsockbyportid(ssk, portid);
+ if (IS_ERR(sk)) {
+ kfree_skb(skb);
+ return PTR_ERR(sk);
+ }
+ if (netlink_is_kernel(sk))
+ return netlink_unicast_kernel(sk, skb, ssk);
+
+ if (sk_filter(sk, skb)) {
+ err = skb->len;
+ kfree_skb(skb);
+ sock_put(sk);
+ return err;
+ }
+
+ err = netlink_attachskb(sk, skb, &timeo, ssk);
+ if (err == 1)
+ goto retry;
+ if (err)
+ return err;
+
+ return netlink_sendskb(sk, skb);
+}
+EXPORT_SYMBOL(netlink_unicast);
+
+struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
+ u32 dst_portid, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NETLINK_MMAP
+ struct sock *sk = NULL;
+ struct sk_buff *skb;
+ struct netlink_ring *ring;
+ struct nl_mmap_hdr *hdr;
+ unsigned int maxlen;
+
+ sk = netlink_getsockbyportid(ssk, dst_portid);
+ if (IS_ERR(sk))
+ goto out;
+
+ ring = &nlk_sk(sk)->rx_ring;
+ /* fast-path without atomic ops for common case: non-mmaped receiver */
+ if (ring->pg_vec == NULL)
+ goto out_put;
+
+ if (ring->frame_size - NL_MMAP_HDRLEN < size)
+ goto out_put;
+
+ skb = alloc_skb_head(gfp_mask);
+ if (skb == NULL)
+ goto err1;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ /* check again under lock */
+ if (ring->pg_vec == NULL)
+ goto out_free;
+
+ /* check again under lock */
+ maxlen = ring->frame_size - NL_MMAP_HDRLEN;
+ if (maxlen < size)
+ goto out_free;
+
+ netlink_forward_ring(ring);
+ hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
+ if (hdr == NULL)
+ goto err2;
+ netlink_ring_setup_skb(skb, sk, ring, hdr);
+ netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
+ atomic_inc(&ring->pending);
+ netlink_increment_head(ring);
+
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return skb;
+
+err2:
+ kfree_skb(skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ netlink_overrun(sk);
+err1:
+ sock_put(sk);
+ return NULL;
+
+out_free:
+ kfree_skb(skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+out_put:
+ sock_put(sk);
+out:
+#endif
+ return alloc_skb(size, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+
+int netlink_has_listeners(struct sock *sk, unsigned int group)
+{
+ int res = 0;
+ struct listeners *listeners;
+
+ BUG_ON(!netlink_is_kernel(sk));
+
+ rcu_read_lock();
+ listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
+
+ if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
+ res = test_bit(group - 1, listeners->masks);
+
+ rcu_read_unlock();
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(netlink_has_listeners);
+
+static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ !test_bit(NETLINK_CONGESTED, &nlk->state)) {
+ netlink_skb_set_owner_r(skb, sk);
+ __netlink_sendskb(sk, skb);
+ return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
+ }
+ return -1;
+}
+
+struct netlink_broadcast_data {
+ struct sock *exclude_sk;
+ struct net *net;
+ u32 portid;
+ u32 group;
+ int failure;
+ int delivery_failure;
+ int congested;
+ int delivered;
+ gfp_t allocation;
+ struct sk_buff *skb, *skb2;
+ int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
+ void *tx_data;
+};
+
+static void do_one_broadcast(struct sock *sk,
+ struct netlink_broadcast_data *p)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int val;
+
+ if (p->exclude_sk == sk)
+ return;
+
+ if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
+ return;
+
+ if (!net_eq(sock_net(sk), p->net))
+ return;
+
+ if (p->failure) {
+ netlink_overrun(sk);
+ return;
+ }
+
+ sock_hold(sk);
+ if (p->skb2 == NULL) {
+ if (skb_shared(p->skb)) {
+ p->skb2 = skb_clone(p->skb, p->allocation);
+ } else {
+ p->skb2 = skb_get(p->skb);
+ /*
+ * skb ownership may have been set when
+ * delivered to a previous socket.
+ */
+ skb_orphan(p->skb2);
+ }
+ }
+ if (p->skb2 == NULL) {
+ netlink_overrun(sk);
+ /* Clone failed. Notify ALL listeners. */
+ p->failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
+ } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
+ kfree_skb(p->skb2);
+ p->skb2 = NULL;
+ } else if (sk_filter(sk, p->skb2)) {
+ kfree_skb(p->skb2);
+ p->skb2 = NULL;
+ } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
+ netlink_overrun(sk);
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
+ } else {
+ p->congested |= val;
+ p->delivered = 1;
+ p->skb2 = NULL;
+ }
+ sock_put(sk);
+}
+
+int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid,
+ u32 group, gfp_t allocation,
+ int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
+ void *filter_data)
+{
+ struct net *net = sock_net(ssk);
+ struct netlink_broadcast_data info;
+ struct sock *sk;
+
+ skb = netlink_trim(skb, allocation);
+
+ info.exclude_sk = ssk;
+ info.net = net;
+ info.portid = portid;
+ info.group = group;
+ info.failure = 0;
+ info.delivery_failure = 0;
+ info.congested = 0;
+ info.delivered = 0;
+ info.allocation = allocation;
+ info.skb = skb;
+ info.skb2 = NULL;
+ info.tx_filter = filter;
+ info.tx_data = filter_data;
+
+ /* While we sleep in clone, do not allow to change socket list */
+
+ netlink_lock_table();
+
+ sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ do_one_broadcast(sk, &info);
+
+ consume_skb(skb);
+
+ netlink_unlock_table();
+
+ if (info.delivery_failure) {
+ kfree_skb(info.skb2);
+ return -ENOBUFS;
+ }
+ consume_skb(info.skb2);
+
+ if (info.delivered) {
+ if (info.congested && (allocation & __GFP_WAIT))
+ yield();
+ return 0;
+ }
+ return -ESRCH;
+}
+EXPORT_SYMBOL(netlink_broadcast_filtered);
+
+int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
+ u32 group, gfp_t allocation)
+{
+ return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
+ NULL, NULL);
+}
+EXPORT_SYMBOL(netlink_broadcast);
+
+struct netlink_set_err_data {
+ struct sock *exclude_sk;
+ u32 portid;
+ u32 group;
+ int code;
+};
+
+static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int ret = 0;
+
+ if (sk == p->exclude_sk)
+ goto out;
+
+ if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
+ goto out;
+
+ if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
+ goto out;
+
+ if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
+ ret = 1;
+ goto out;
+ }
+
+ sk->sk_err = p->code;
+ sk->sk_error_report(sk);
+out:
+ return ret;
+}
+
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @portid: the PORTID of a process that we want to skip (if any)
+ * @group: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ *
+ * This function returns the number of broadcast listeners that have set the
+ * NETLINK_RECV_NO_ENOBUFS socket option.
+ */
+int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
+{
+ struct netlink_set_err_data info;
+ struct sock *sk;
+ int ret = 0;
+
+ info.exclude_sk = ssk;
+ info.portid = portid;
+ info.group = group;
+ /* sk->sk_err wants a positive error value */
+ info.code = -code;
+
+ read_lock(&nl_table_lock);
+
+ sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ ret += do_one_set_err(sk, &info);
+
+ read_unlock(&nl_table_lock);
+ return ret;
+}
+EXPORT_SYMBOL(netlink_set_err);
+
+/* must be called with netlink table grabbed */
+static void netlink_update_socket_mc(struct netlink_sock *nlk,
+ unsigned int group,
+ int is_new)
+{
+ int old, new = !!is_new, subscriptions;
+
+ old = test_bit(group - 1, nlk->groups);
+ subscriptions = nlk->subscriptions - old + new;
+ if (new)
+ __set_bit(group - 1, nlk->groups);
+ else
+ __clear_bit(group - 1, nlk->groups);
+ netlink_update_subscriptions(&nlk->sk, subscriptions);
+ netlink_update_listeners(&nlk->sk);
+}
+
+static int netlink_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ unsigned int val = 0;
+ int err;
+
+ if (level != SOL_NETLINK)
+ return -ENOPROTOOPT;
+
+ if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
+ optlen >= sizeof(int) &&
+ get_user(val, (unsigned int __user *)optval))
+ return -EFAULT;
+
+ switch (optname) {
+ case NETLINK_PKTINFO:
+ if (val)
+ nlk->flags |= NETLINK_RECV_PKTINFO;
+ else
+ nlk->flags &= ~NETLINK_RECV_PKTINFO;
+ err = 0;
+ break;
+ case NETLINK_ADD_MEMBERSHIP:
+ case NETLINK_DROP_MEMBERSHIP: {
+ if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
+ return -EPERM;
+ err = netlink_realloc_groups(sk);
+ if (err)
+ return err;
+ if (!val || val - 1 >= nlk->ngroups)
+ return -EINVAL;
+ if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
+ err = nlk->netlink_bind(sock_net(sk), val);
+ if (err)
+ return err;
+ }
+ netlink_table_grab();
+ netlink_update_socket_mc(nlk, val,
+ optname == NETLINK_ADD_MEMBERSHIP);
+ netlink_table_ungrab();
+ if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
+ nlk->netlink_unbind(sock_net(sk), val);
+
+ err = 0;
+ break;
+ }
+ case NETLINK_BROADCAST_ERROR:
+ if (val)
+ nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+ else
+ nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+ err = 0;
+ break;
+ case NETLINK_NO_ENOBUFS:
+ if (val) {
+ nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
+ clear_bit(NETLINK_CONGESTED, &nlk->state);
+ wake_up_interruptible(&nlk->wait);
+ } else {
+ nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
+ }
+ err = 0;
+ break;
+#ifdef CONFIG_NETLINK_MMAP
+ case NETLINK_RX_RING:
+ case NETLINK_TX_RING: {
+ struct nl_mmap_req req;
+
+ /* Rings might consume more memory than queue limits, require
+ * CAP_NET_ADMIN.
+ */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (optlen < sizeof(req))
+ return -EINVAL;
+ if (copy_from_user(&req, optval, sizeof(req)))
+ return -EFAULT;
+ err = netlink_set_ring(sk, &req, false,
+ optname == NETLINK_TX_RING);
+ break;
+ }
+#endif /* CONFIG_NETLINK_MMAP */
+ default:
+ err = -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static int netlink_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int len, val, err;
+
+ if (level != SOL_NETLINK)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case NETLINK_PKTINFO:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
+ case NETLINK_BROADCAST_ERROR:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
+ case NETLINK_NO_ENOBUFS:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct nl_pktinfo info;
+
+ info.group = NETLINK_CB(skb).dst_group;
+ put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
+}
+
+static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
+ u32 dst_portid;
+ u32 dst_group;
+ struct sk_buff *skb;
+ int err;
+ struct scm_cookie scm;
+ u32 netlink_skb_flags = 0;
+
+ if (msg->msg_flags&MSG_OOB)
+ return -EOPNOTSUPP;
+
+ err = scm_send(sock, msg, &scm, true);
+ if (err < 0)
+ return err;
+
+ if (msg->msg_namelen) {
+ err = -EINVAL;
+ if (addr->nl_family != AF_NETLINK)
+ goto out;
+ dst_portid = addr->nl_pid;
+ dst_group = ffs(addr->nl_groups);
+ err = -EPERM;
+ if ((dst_group || dst_portid) &&
+ !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
+ goto out;
+ netlink_skb_flags |= NETLINK_SKB_DST;
+ } else {
+ dst_portid = nlk->dst_portid;
+ dst_group = nlk->dst_group;
+ }
+
+ if (!nlk->portid) {
+ err = netlink_autobind(sock);
+ if (err)
+ goto out;
+ }
+
+ /* It's a really convoluted way for userland to ask for mmaped
+ * sendmsg(), but that's what we've got...
+ */
+ if (netlink_tx_is_mmaped(sk) &&
+ msg->msg_iter.type == ITER_IOVEC &&
+ msg->msg_iter.nr_segs == 1 &&
+ msg->msg_iter.iov->iov_base == NULL) {
+ err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
+ &scm);
+ goto out;
+ }
+
+ err = -EMSGSIZE;
+ if (len > sk->sk_sndbuf - 32)
+ goto out;
+ err = -ENOBUFS;
+ skb = netlink_alloc_large_skb(len, dst_group);
+ if (skb == NULL)
+ goto out;
+
+ NETLINK_CB(skb).portid = nlk->portid;
+ NETLINK_CB(skb).dst_group = dst_group;
+ NETLINK_CB(skb).creds = scm.creds;
+ NETLINK_CB(skb).flags = netlink_skb_flags;
+
+ err = -EFAULT;
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ err = security_netlink_send(sk, skb);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ if (dst_group) {
+ atomic_inc(&skb->users);
+ netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
+ }
+ err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT);
+
+out:
+ scm_destroy(&scm);
+ return err;
+}
+
+static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct scm_cookie scm;
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int noblock = flags&MSG_DONTWAIT;
+ size_t copied;
+ struct sk_buff *skb, *data_skb;
+ int err, ret;
+
+ if (flags&MSG_OOB)
+ return -EOPNOTSUPP;
+
+ copied = 0;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (skb == NULL)
+ goto out;
+
+ data_skb = skb;
+
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+ if (unlikely(skb_shinfo(skb)->frag_list)) {
+ /*
+ * If this skb has a frag_list, then here that means that we
+ * will have to use the frag_list skb's data for compat tasks
+ * and the regular skb's data for normal (non-compat) tasks.
+ *
+ * If we need to send the compat skb, assign it to the
+ * 'data_skb' variable so that it will be used below for data
+ * copying. We keep 'skb' for everything else, including
+ * freeing both later.
+ */
+ if (flags & MSG_CMSG_COMPAT)
+ data_skb = skb_shinfo(skb)->frag_list;
+ }
+#endif
+
+ /* Record the max length of recvmsg() calls for future allocations */
+ nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len);
+ nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
+ 16384);
+
+ copied = data_skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ skb_reset_transport_header(data_skb);
+ err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
+
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
+ addr->nl_family = AF_NETLINK;
+ addr->nl_pad = 0;
+ addr->nl_pid = NETLINK_CB(skb).portid;
+ addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
+ msg->msg_namelen = sizeof(*addr);
+ }
+
+ if (nlk->flags & NETLINK_RECV_PKTINFO)
+ netlink_cmsg_recv_pktinfo(msg, skb);
+
+ memset(&scm, 0, sizeof(scm));
+ scm.creds = *NETLINK_CREDS(skb);
+ if (flags & MSG_TRUNC)
+ copied = data_skb->len;
+
+ skb_free_datagram(sk, skb);
+
+ if (nlk->cb_running &&
+ atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
+ ret = netlink_dump(sk);
+ if (ret) {
+ sk->sk_err = -ret;
+ sk->sk_error_report(sk);
+ }
+ }
+
+ scm_recv(sock, msg, &scm, flags);
+out:
+ netlink_rcv_wake(sk);
+ return err ? : copied;
+}
+
+static void netlink_data_ready(struct sock *sk)
+{
+ BUG();
+}
+
+/*
+ * We export these functions to other modules. They provide a
+ * complete set of kernel non-blocking support for message
+ * queueing.
+ */
+
+struct sock *
+__netlink_kernel_create(struct net *net, int unit, struct module *module,
+ struct netlink_kernel_cfg *cfg)
+{
+ struct socket *sock;
+ struct sock *sk;
+ struct netlink_sock *nlk;
+ struct listeners *listeners = NULL;
+ struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL;
+ unsigned int groups;
+
+ BUG_ON(!nl_table);
+
+ if (unit < 0 || unit >= MAX_LINKS)
+ return NULL;
+
+ if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
+ return NULL;
+
+ /*
+ * We have to just have a reference on the net from sk, but don't
+ * get_net it. Besides, we cannot get and then put the net here.
+ * So we create one inside init_net and the move it to net.
+ */
+
+ if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+ goto out_sock_release_nosk;
+
+ sk = sock->sk;
+ sk_change_net(sk, net);
+
+ if (!cfg || cfg->groups < 32)
+ groups = 32;
+ else
+ groups = cfg->groups;
+
+ listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
+ if (!listeners)
+ goto out_sock_release;
+
+ sk->sk_data_ready = netlink_data_ready;
+ if (cfg && cfg->input)
+ nlk_sk(sk)->netlink_rcv = cfg->input;
+
+ if (netlink_insert(sk, 0))
+ goto out_sock_release;
+
+ nlk = nlk_sk(sk);
+ nlk->flags |= NETLINK_KERNEL_SOCKET;
+
+ netlink_table_grab();
+ if (!nl_table[unit].registered) {
+ nl_table[unit].groups = groups;
+ rcu_assign_pointer(nl_table[unit].listeners, listeners);
+ nl_table[unit].cb_mutex = cb_mutex;
+ nl_table[unit].module = module;
+ if (cfg) {
+ nl_table[unit].bind = cfg->bind;
+ nl_table[unit].unbind = cfg->unbind;
+ nl_table[unit].flags = cfg->flags;
+ if (cfg->compare)
+ nl_table[unit].compare = cfg->compare;
+ }
+ nl_table[unit].registered = 1;
+ } else {
+ kfree(listeners);
+ nl_table[unit].registered++;
+ }
+ netlink_table_ungrab();
+ return sk;
+
+out_sock_release:
+ kfree(listeners);
+ netlink_kernel_release(sk);
+ return NULL;
+
+out_sock_release_nosk:
+ sock_release(sock);
+ return NULL;
+}
+EXPORT_SYMBOL(__netlink_kernel_create);
+
+void
+netlink_kernel_release(struct sock *sk)
+{
+ sk_release_kernel(sk);
+}
+EXPORT_SYMBOL(netlink_kernel_release);
+
+int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+ struct listeners *new, *old;
+ struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+
+ if (groups < 32)
+ groups = 32;
+
+ if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
+ new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
+ if (!new)
+ return -ENOMEM;
+ old = nl_deref_protected(tbl->listeners);
+ memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
+ rcu_assign_pointer(tbl->listeners, new);
+
+ kfree_rcu(old, rcu);
+ }
+ tbl->groups = groups;
+
+ return 0;
+}
+
+/**
+ * netlink_change_ngroups - change number of multicast groups
+ *
+ * This changes the number of multicast groups that are available
+ * on a certain netlink family. Note that it is not possible to
+ * change the number of groups to below 32. Also note that it does
+ * not implicitly call netlink_clear_multicast_users() when the
+ * number of groups is reduced.
+ *
+ * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
+ * @groups: The new number of groups.
+ */
+int netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+ int err;
+
+ netlink_table_grab();
+ err = __netlink_change_ngroups(sk, groups);
+ netlink_table_ungrab();
+
+ return err;
+}
+
+void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+ struct sock *sk;
+ struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
+
+ sk_for_each_bound(sk, &tbl->mc_list)
+ netlink_update_socket_mc(nlk_sk(sk), group, 0);
+}
+
+struct nlmsghdr *
+__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
+{
+ struct nlmsghdr *nlh;
+ int size = nlmsg_msg_size(len);
+
+ nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_ALIGN(size));
+ nlh->nlmsg_type = type;
+ nlh->nlmsg_len = size;
+ nlh->nlmsg_flags = flags;
+ nlh->nlmsg_pid = portid;
+ nlh->nlmsg_seq = seq;
+ if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
+ memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
+ return nlh;
+}
+EXPORT_SYMBOL(__nlmsg_put);
+
+/*
+ * It looks a bit ugly.
+ * It would be better to create kernel thread.
+ */
+
+static int netlink_dump(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_callback *cb;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ int len, err = -ENOBUFS;
+ int alloc_size;
+
+ mutex_lock(nlk->cb_mutex);
+ if (!nlk->cb_running) {
+ err = -EINVAL;
+ goto errout_skb;
+ }
+
+ cb = &nlk->cb;
+ alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
+
+ if (!netlink_rx_is_mmaped(sk) &&
+ atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ goto errout_skb;
+
+ /* NLMSG_GOODSIZE is small to avoid high order allocations being
+ * required, but it makes sense to _attempt_ a 16K bytes allocation
+ * to reduce number of system calls on dump operations, if user
+ * ever provided a big enough buffer.
+ */
+ if (alloc_size < nlk->max_recvmsg_len) {
+ skb = netlink_alloc_skb(sk,
+ nlk->max_recvmsg_len,
+ nlk->portid,
+ GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
+ /* available room should be exact amount to avoid MSG_TRUNC */
+ if (skb)
+ skb_reserve(skb, skb_tailroom(skb) -
+ nlk->max_recvmsg_len);
+ }
+ if (!skb)
+ skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
+ GFP_KERNEL);
+ if (!skb)
+ goto errout_skb;
+ netlink_skb_set_owner_r(skb, sk);
+
+ len = cb->dump(skb, cb);
+
+ if (len > 0) {
+ mutex_unlock(nlk->cb_mutex);
+
+ if (sk_filter(sk, skb))
+ kfree_skb(skb);
+ else
+ __netlink_sendskb(sk, skb);
+ return 0;
+ }
+
+ nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
+ if (!nlh)
+ goto errout_skb;
+
+ nl_dump_check_consistent(cb, nlh);
+
+ memcpy(nlmsg_data(nlh), &len, sizeof(len));
+
+ if (sk_filter(sk, skb))
+ kfree_skb(skb);
+ else
+ __netlink_sendskb(sk, skb);
+
+ if (cb->done)
+ cb->done(cb);
+
+ nlk->cb_running = false;
+ mutex_unlock(nlk->cb_mutex);
+ module_put(cb->module);
+ consume_skb(cb->skb);
+ return 0;
+
+errout_skb:
+ mutex_unlock(nlk->cb_mutex);
+ kfree_skb(skb);
+ return err;
+}
+
+int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *control)
+{
+ struct netlink_callback *cb;
+ struct sock *sk;
+ struct netlink_sock *nlk;
+ int ret;
+
+ /* Memory mapped dump requests need to be copied to avoid looping
+ * on the pending state in netlink_mmap_sendmsg() while the CB hold
+ * a reference to the skb.
+ */
+ if (netlink_skb_is_mmaped(skb)) {
+ skb = skb_copy(skb, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOBUFS;
+ } else
+ atomic_inc(&skb->users);
+
+ sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
+ if (sk == NULL) {
+ ret = -ECONNREFUSED;
+ goto error_free;
+ }
+
+ nlk = nlk_sk(sk);
+ mutex_lock(nlk->cb_mutex);
+ /* A dump is in progress... */
+ if (nlk->cb_running) {
+ ret = -EBUSY;
+ goto error_unlock;
+ }
+ /* add reference of module which cb->dump belongs to */
+ if (!try_module_get(control->module)) {
+ ret = -EPROTONOSUPPORT;
+ goto error_unlock;
+ }
+
+ cb = &nlk->cb;
+ memset(cb, 0, sizeof(*cb));
+ cb->dump = control->dump;
+ cb->done = control->done;
+ cb->nlh = nlh;
+ cb->data = control->data;
+ cb->module = control->module;
+ cb->min_dump_alloc = control->min_dump_alloc;
+ cb->skb = skb;
+
+ nlk->cb_running = true;
+
+ mutex_unlock(nlk->cb_mutex);
+
+ ret = netlink_dump(sk);
+ sock_put(sk);
+
+ if (ret)
+ return ret;
+
+ /* We successfully started a dump, by returning -EINTR we
+ * signal not to send ACK even if it was requested.
+ */
+ return -EINTR;
+
+error_unlock:
+ sock_put(sk);
+ mutex_unlock(nlk->cb_mutex);
+error_free:
+ kfree_skb(skb);
+ return ret;
+}
+EXPORT_SYMBOL(__netlink_dump_start);
+
+void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *rep;
+ struct nlmsgerr *errmsg;
+ size_t payload = sizeof(*errmsg);
+
+ /* error messages get the original request appened */
+ if (err)
+ payload += nlmsg_len(nlh);
+
+ skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
+ NETLINK_CB(in_skb).portid, GFP_KERNEL);
+ if (!skb) {
+ struct sock *sk;
+
+ sk = netlink_lookup(sock_net(in_skb->sk),
+ in_skb->sk->sk_protocol,
+ NETLINK_CB(in_skb).portid);
+ if (sk) {
+ sk->sk_err = ENOBUFS;
+ sk->sk_error_report(sk);
+ sock_put(sk);
+ }
+ return;
+ }
+
+ rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+ NLMSG_ERROR, payload, 0);
+ errmsg = nlmsg_data(rep);
+ errmsg->error = err;
+ memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
+ netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
+}
+EXPORT_SYMBOL(netlink_ack);
+
+int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+ struct nlmsghdr *))
+{
+ struct nlmsghdr *nlh;
+ int err;
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return 0;
+
+ /* Only requests are handled by the kernel */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
+ goto ack;
+
+ /* Skip control messages */
+ if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+ goto ack;
+
+ err = cb(skb, nlh);
+ if (err == -EINTR)
+ goto skip;
+
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err)
+ netlink_ack(skb, nlh, err);
+
+skip:
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netlink_rcv_skb);
+
+/**
+ * nlmsg_notify - send a notification netlink message
+ * @sk: netlink socket to use
+ * @skb: notification message
+ * @portid: destination netlink portid for reports or 0
+ * @group: destination multicast group or 0
+ * @report: 1 to report back, 0 to disable
+ * @flags: allocation flags
+ */
+int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
+ unsigned int group, int report, gfp_t flags)
+{
+ int err = 0;
+
+ if (group) {
+ int exclude_portid = 0;
+
+ if (report) {
+ atomic_inc(&skb->users);
+ exclude_portid = portid;
+ }
+
+ /* errors reported via destination sk->sk_err, but propagate
+ * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
+ }
+
+ if (report) {
+ int err2;
+
+ err2 = nlmsg_unicast(sk, skb, portid);
+ if (!err || err == -ESRCH)
+ err = err2;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(nlmsg_notify);
+
+#ifdef CONFIG_PROC_FS
+struct nl_seq_iter {
+ struct seq_net_private p;
+ struct rhashtable_iter hti;
+ int link;
+};
+
+static int netlink_walk_start(struct nl_seq_iter *iter)
+{
+ int err;
+
+ err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
+ if (err) {
+ iter->link = MAX_LINKS;
+ return err;
+ }
+
+ err = rhashtable_walk_start(&iter->hti);
+ return err == -EAGAIN ? 0 : err;
+}
+
+static void netlink_walk_stop(struct nl_seq_iter *iter)
+{
+ rhashtable_walk_stop(&iter->hti);
+ rhashtable_walk_exit(&iter->hti);
+}
+
+static void *__netlink_seq_next(struct seq_file *seq)
+{
+ struct nl_seq_iter *iter = seq->private;
+ struct netlink_sock *nlk;
+
+ do {
+ for (;;) {
+ int err;
+
+ nlk = rhashtable_walk_next(&iter->hti);
+
+ if (IS_ERR(nlk)) {
+ if (PTR_ERR(nlk) == -EAGAIN)
+ continue;
+
+ return nlk;
+ }
+
+ if (nlk)
+ break;
+
+ netlink_walk_stop(iter);
+ if (++iter->link >= MAX_LINKS)
+ return NULL;
+
+ err = netlink_walk_start(iter);
+ if (err)
+ return ERR_PTR(err);
+ }
+ } while (sock_net(&nlk->sk) != seq_file_net(seq));
+
+ return nlk;
+}
+
+static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
+{
+ struct nl_seq_iter *iter = seq->private;
+ void *obj = SEQ_START_TOKEN;
+ loff_t pos;
+ int err;
+
+ iter->link = 0;
+
+ err = netlink_walk_start(iter);
+ if (err)
+ return ERR_PTR(err);
+
+ for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
+ obj = __netlink_seq_next(seq);
+
+ return obj;
+}
+
+static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return __netlink_seq_next(seq);
+}
+
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+{
+ struct nl_seq_iter *iter = seq->private;
+
+ if (iter->link >= MAX_LINKS)
+ return;
+
+ netlink_walk_stop(iter);
+}
+
+
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "sk Eth Pid Groups "
+ "Rmem Wmem Dump Locks Drops Inode\n");
+ } else {
+ struct sock *s = v;
+ struct netlink_sock *nlk = nlk_sk(s);
+
+ seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n",
+ s,
+ s->sk_protocol,
+ nlk->portid,
+ nlk->groups ? (u32)nlk->groups[0] : 0,
+ sk_rmem_alloc_get(s),
+ sk_wmem_alloc_get(s),
+ nlk->cb_running,
+ atomic_read(&s->sk_refcnt),
+ atomic_read(&s->sk_drops),
+ sock_i_ino(s)
+ );
+
+ }
+ return 0;
+}
+
+static const struct seq_operations netlink_seq_ops = {
+ .start = netlink_seq_start,
+ .next = netlink_seq_next,
+ .stop = netlink_seq_stop,
+ .show = netlink_seq_show,
+};
+
+
+static int netlink_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &netlink_seq_ops,
+ sizeof(struct nl_seq_iter));
+}
+
+static const struct file_operations netlink_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = netlink_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+int netlink_register_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&netlink_chain, nb);
+}
+EXPORT_SYMBOL(netlink_register_notifier);
+
+int netlink_unregister_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&netlink_chain, nb);
+}
+EXPORT_SYMBOL(netlink_unregister_notifier);
+
+static const struct proto_ops netlink_ops = {
+ .family = PF_NETLINK,
+ .owner = THIS_MODULE,
+ .release = netlink_release,
+ .bind = netlink_bind,
+ .connect = netlink_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = netlink_getname,
+ .poll = netlink_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = netlink_setsockopt,
+ .getsockopt = netlink_getsockopt,
+ .sendmsg = netlink_sendmsg,
+ .recvmsg = netlink_recvmsg,
+ .mmap = netlink_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct net_proto_family netlink_family_ops = {
+ .family = PF_NETLINK,
+ .create = netlink_create,
+ .owner = THIS_MODULE, /* for consistency 8) */
+};
+
+static int __net_init netlink_net_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit netlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("netlink", net->proc_net);
+#endif
+}
+
+static void __init netlink_add_usersock_entry(void)
+{
+ struct listeners *listeners;
+ int groups = 32;
+
+ listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
+ if (!listeners)
+ panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
+
+ netlink_table_grab();
+
+ nl_table[NETLINK_USERSOCK].groups = groups;
+ rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
+ nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
+ nl_table[NETLINK_USERSOCK].registered = 1;
+ nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;
+
+ netlink_table_ungrab();
+}
+
+static struct pernet_operations __net_initdata netlink_net_ops = {
+ .init = netlink_net_init,
+ .exit = netlink_net_exit,
+};
+
+static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
+{
+ const struct netlink_sock *nlk = data;
+ struct netlink_compare_arg arg;
+
+ netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);
+ return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed);
+}
+
+static const struct rhashtable_params netlink_rhashtable_params = {
+ .head_offset = offsetof(struct netlink_sock, node),
+ .key_len = netlink_compare_arg_len,
+ .obj_hashfn = netlink_hash,
+ .obj_cmpfn = netlink_compare,
+ .automatic_shrinking = true,
+};
+
+static int __init netlink_proto_init(void)
+{
+ int i;
+ int err = proto_register(&netlink_proto, 0);
+
+ if (err != 0)
+ goto out;
+
+ BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
+
+ nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
+ if (!nl_table)
+ goto panic;
+
+ for (i = 0; i < MAX_LINKS; i++) {
+ if (rhashtable_init(&nl_table[i].hash,
+ &netlink_rhashtable_params) < 0) {
+ while (--i > 0)
+ rhashtable_destroy(&nl_table[i].hash);
+ kfree(nl_table);
+ goto panic;
+ }
+ }
+
+ INIT_LIST_HEAD(&netlink_tap_all);
+
+ netlink_add_usersock_entry();
+
+ sock_register(&netlink_family_ops);
+ register_pernet_subsys(&netlink_net_ops);
+ /* The netlink device handler may be needed early. */
+ rtnetlink_init();
+out:
+ return err;
+panic:
+ panic("netlink_init: Cannot allocate nl_table\n");
+}
+
+core_initcall(netlink_proto_init);
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
new file mode 100644
index 000000000..89008405d
--- /dev/null
+++ b/net/netlink/af_netlink.h
@@ -0,0 +1,79 @@
+#ifndef _AF_NETLINK_H
+#define _AF_NETLINK_H
+
+#include <linux/rhashtable.h>
+#include <linux/atomic.h>
+#include <net/sock.h>
+
+#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
+#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
+
+struct netlink_ring {
+ void **pg_vec;
+ unsigned int head;
+ unsigned int frames_per_block;
+ unsigned int frame_size;
+ unsigned int frame_max;
+
+ unsigned int pg_vec_order;
+ unsigned int pg_vec_pages;
+ unsigned int pg_vec_len;
+
+ atomic_t pending;
+};
+
+struct netlink_sock {
+ /* struct sock has to be the first member of netlink_sock */
+ struct sock sk;
+ u32 portid;
+ u32 dst_portid;
+ u32 dst_group;
+ u32 flags;
+ u32 subscriptions;
+ u32 ngroups;
+ unsigned long *groups;
+ unsigned long state;
+ size_t max_recvmsg_len;
+ wait_queue_head_t wait;
+ bool cb_running;
+ struct netlink_callback cb;
+ struct mutex *cb_mutex;
+ struct mutex cb_def_mutex;
+ void (*netlink_rcv)(struct sk_buff *skb);
+ int (*netlink_bind)(struct net *net, int group);
+ void (*netlink_unbind)(struct net *net, int group);
+ struct module *module;
+#ifdef CONFIG_NETLINK_MMAP
+ struct mutex pg_vec_lock;
+ struct netlink_ring rx_ring;
+ struct netlink_ring tx_ring;
+ atomic_t mapped;
+#endif /* CONFIG_NETLINK_MMAP */
+
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static inline struct netlink_sock *nlk_sk(struct sock *sk)
+{
+ return container_of(sk, struct netlink_sock, sk);
+}
+
+struct netlink_table {
+ struct rhashtable hash;
+ struct hlist_head mc_list;
+ struct listeners __rcu *listeners;
+ unsigned int flags;
+ unsigned int groups;
+ struct mutex *cb_mutex;
+ struct module *module;
+ int (*bind)(struct net *net, int group);
+ void (*unbind)(struct net *net, int group);
+ bool (*compare)(struct net *net, struct sock *sock);
+ int registered;
+};
+
+extern struct netlink_table *nl_table;
+extern rwlock_t nl_table_lock;
+
+#endif
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
new file mode 100644
index 000000000..3ee63a3cf
--- /dev/null
+++ b/net/netlink/diag.c
@@ -0,0 +1,238 @@
+#include <linux/module.h>
+
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/sock_diag.h>
+#include <linux/netlink_diag.h>
+#include <linux/rhashtable.h>
+
+#include "af_netlink.h"
+
+#ifdef CONFIG_NETLINK_MMAP
+static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
+ struct sk_buff *nlskb)
+{
+ struct netlink_diag_ring ndr;
+
+ ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
+ ndr.ndr_block_nr = ring->pg_vec_len;
+ ndr.ndr_frame_size = ring->frame_size;
+ ndr.ndr_frame_nr = ring->frame_max + 1;
+
+ return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
+}
+
+static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int ret;
+
+ mutex_lock(&nlk->pg_vec_lock);
+ ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
+ if (!ret)
+ ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
+ nlskb);
+ mutex_unlock(&nlk->pg_vec_lock);
+
+ return ret;
+}
+#else
+static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
+{
+ return 0;
+}
+#endif
+
+static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (nlk->groups == NULL)
+ return 0;
+
+ return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups),
+ nlk->groups);
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_diag_req *req,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct nlmsghdr *nlh;
+ struct netlink_diag_msg *rep;
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rep = nlmsg_data(nlh);
+ rep->ndiag_family = AF_NETLINK;
+ rep->ndiag_type = sk->sk_type;
+ rep->ndiag_protocol = sk->sk_protocol;
+ rep->ndiag_state = sk->sk_state;
+
+ rep->ndiag_ino = sk_ino;
+ rep->ndiag_portid = nlk->portid;
+ rep->ndiag_dst_portid = nlk->dst_portid;
+ rep->ndiag_dst_group = nlk->dst_group;
+ sock_diag_save_cookie(sk, rep->ndiag_cookie);
+
+ if ((req->ndiag_show & NDIAG_SHOW_GROUPS) &&
+ sk_diag_dump_groups(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
+ sk_diag_put_rings_cfg(sk, skb))
+ goto out_nlmsg_trim;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+out_nlmsg_trim:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ int protocol, int s_num)
+{
+ struct netlink_table *tbl = &nl_table[protocol];
+ struct rhashtable *ht = &tbl->hash;
+ const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
+ struct net *net = sock_net(skb->sk);
+ struct netlink_diag_req *req;
+ struct netlink_sock *nlsk;
+ struct sock *sk;
+ int ret = 0, num = 0, i;
+
+ req = nlmsg_data(cb->nlh);
+
+ for (i = 0; i < htbl->size; i++) {
+ struct rhash_head *pos;
+
+ rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
+ sk = (struct sock *)nlsk;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (sk_diag_fill(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ ret = 1;
+ goto done;
+ }
+
+ num++;
+ }
+ }
+
+ sk_for_each_bound(sk, &tbl->mc_list) {
+ if (sk_hashed(sk))
+ continue;
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (sk_diag_fill(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ sock_i_ino(sk)) < 0) {
+ ret = 1;
+ goto done;
+ }
+ num++;
+ }
+done:
+ cb->args[0] = num;
+ cb->args[1] = protocol;
+
+ return ret;
+}
+
+static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netlink_diag_req *req;
+ int s_num = cb->args[0];
+
+ req = nlmsg_data(cb->nlh);
+
+ rcu_read_lock();
+ read_lock(&nl_table_lock);
+
+ if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
+ int i;
+
+ for (i = cb->args[1]; i < MAX_LINKS; i++) {
+ if (__netlink_diag_dump(skb, cb, i, s_num))
+ break;
+ s_num = 0;
+ }
+ } else {
+ if (req->sdiag_protocol >= MAX_LINKS) {
+ read_unlock(&nl_table_lock);
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
+ }
+
+ read_unlock(&nl_table_lock);
+ rcu_read_unlock();
+
+ return skb->len;
+}
+
+static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct netlink_diag_req);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = netlink_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ } else
+ return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler netlink_diag_handler = {
+ .family = AF_NETLINK,
+ .dump = netlink_diag_handler_dump,
+};
+
+static int __init netlink_diag_init(void)
+{
+ return sock_diag_register(&netlink_diag_handler);
+}
+
+static void __exit netlink_diag_exit(void)
+{
+ sock_diag_unregister(&netlink_diag_handler);
+}
+
+module_init(netlink_diag_init);
+module_exit(netlink_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
new file mode 100644
index 000000000..2ed5f9647
--- /dev/null
+++ b/net/netlink/genetlink.c
@@ -0,0 +1,1154 @@
+/*
+ * NETLINK Generic Netlink Family
+ *
+ * Authors: Jamal Hadi Salim
+ * Thomas Graf <tgraf@suug.ch>
+ * Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/rwsem.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
+static DECLARE_RWSEM(cb_lock);
+
+atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq);
+
+void genl_lock(void)
+{
+ mutex_lock(&genl_mutex);
+}
+EXPORT_SYMBOL(genl_lock);
+
+void genl_unlock(void)
+{
+ mutex_unlock(&genl_mutex);
+}
+EXPORT_SYMBOL(genl_unlock);
+
+#ifdef CONFIG_LOCKDEP
+int lockdep_genl_is_held(void)
+{
+ return lockdep_is_held(&genl_mutex);
+}
+EXPORT_SYMBOL(lockdep_genl_is_held);
+#endif
+
+static void genl_lock_all(void)
+{
+ down_write(&cb_lock);
+ genl_lock();
+}
+
+static void genl_unlock_all(void)
+{
+ genl_unlock();
+ up_write(&cb_lock);
+}
+
+#define GENL_FAM_TAB_SIZE 16
+#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
+
+static struct list_head family_ht[GENL_FAM_TAB_SIZE];
+/*
+ * Bitmap of multicast groups that are currently in use.
+ *
+ * To avoid an allocation at boot of just one unsigned long,
+ * declare it global instead.
+ * Bit 0 is marked as already used since group 0 is invalid.
+ * Bit 1 is marked as already used since the drop-monitor code
+ * abuses the API and thinks it can statically use group 1.
+ * That group will typically conflict with other groups that
+ * any proper users use.
+ * Bit 16 is marked as used since it's used for generic netlink
+ * and the code no longer marks pre-reserved IDs as used.
+ * Bit 17 is marked as already used since the VFS quota code
+ * also abused this API and relied on family == group ID, we
+ * cater to that by giving it a static family and group ID.
+ * Bit 18 is marked as already used since the PMCRAID driver
+ * did the same thing as the VFS quota code (maybe copied?)
+ */
+static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
+ BIT(GENL_ID_VFS_DQUOT) |
+ BIT(GENL_ID_PMCRAID);
+static unsigned long *mc_groups = &mc_group_start;
+static unsigned long mc_groups_longs = 1;
+
+static int genl_ctrl_event(int event, struct genl_family *family,
+ const struct genl_multicast_group *grp,
+ int grp_id);
+
+static inline unsigned int genl_family_hash(unsigned int id)
+{
+ return id & GENL_FAM_TAB_MASK;
+}
+
+static inline struct list_head *genl_family_chain(unsigned int id)
+{
+ return &family_ht[genl_family_hash(id)];
+}
+
+static struct genl_family *genl_family_find_byid(unsigned int id)
+{
+ struct genl_family *f;
+
+ list_for_each_entry(f, genl_family_chain(id), family_list)
+ if (f->id == id)
+ return f;
+
+ return NULL;
+}
+
+static struct genl_family *genl_family_find_byname(char *name)
+{
+ struct genl_family *f;
+ int i;
+
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
+ list_for_each_entry(f, genl_family_chain(i), family_list)
+ if (strcmp(f->name, name) == 0)
+ return f;
+
+ return NULL;
+}
+
+static const struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
+{
+ int i;
+
+ for (i = 0; i < family->n_ops; i++)
+ if (family->ops[i].cmd == cmd)
+ return &family->ops[i];
+
+ return NULL;
+}
+
+/* Of course we are going to have problems once we hit
+ * 2^16 alive types, but that can only happen by year 2K
+*/
+static u16 genl_generate_id(void)
+{
+ static u16 id_gen_idx = GENL_MIN_ID;
+ int i;
+
+ for (i = 0; i <= GENL_MAX_ID - GENL_MIN_ID; i++) {
+ if (id_gen_idx != GENL_ID_VFS_DQUOT &&
+ id_gen_idx != GENL_ID_PMCRAID &&
+ !genl_family_find_byid(id_gen_idx))
+ return id_gen_idx;
+ if (++id_gen_idx > GENL_MAX_ID)
+ id_gen_idx = GENL_MIN_ID;
+ }
+
+ return 0;
+}
+
+static int genl_allocate_reserve_groups(int n_groups, int *first_id)
+{
+ unsigned long *new_groups;
+ int start = 0;
+ int i;
+ int id;
+ bool fits;
+
+ do {
+ if (start == 0)
+ id = find_first_zero_bit(mc_groups,
+ mc_groups_longs *
+ BITS_PER_LONG);
+ else
+ id = find_next_zero_bit(mc_groups,
+ mc_groups_longs * BITS_PER_LONG,
+ start);
+
+ fits = true;
+ for (i = id;
+ i < min_t(int, id + n_groups,
+ mc_groups_longs * BITS_PER_LONG);
+ i++) {
+ if (test_bit(i, mc_groups)) {
+ start = i;
+ fits = false;
+ break;
+ }
+ }
+
+ if (id >= mc_groups_longs * BITS_PER_LONG) {
+ unsigned long new_longs = mc_groups_longs +
+ BITS_TO_LONGS(n_groups);
+ size_t nlen = new_longs * sizeof(unsigned long);
+
+ if (mc_groups == &mc_group_start) {
+ new_groups = kzalloc(nlen, GFP_KERNEL);
+ if (!new_groups)
+ return -ENOMEM;
+ mc_groups = new_groups;
+ *mc_groups = mc_group_start;
+ } else {
+ new_groups = krealloc(mc_groups, nlen,
+ GFP_KERNEL);
+ if (!new_groups)
+ return -ENOMEM;
+ mc_groups = new_groups;
+ for (i = 0; i < BITS_TO_LONGS(n_groups); i++)
+ mc_groups[mc_groups_longs + i] = 0;
+ }
+ mc_groups_longs = new_longs;
+ }
+ } while (!fits);
+
+ for (i = id; i < id + n_groups; i++)
+ set_bit(i, mc_groups);
+ *first_id = id;
+ return 0;
+}
+
+static struct genl_family genl_ctrl;
+
+static int genl_validate_assign_mc_groups(struct genl_family *family)
+{
+ int first_id;
+ int n_groups = family->n_mcgrps;
+ int err = 0, i;
+ bool groups_allocated = false;
+
+ if (!n_groups)
+ return 0;
+
+ for (i = 0; i < n_groups; i++) {
+ const struct genl_multicast_group *grp = &family->mcgrps[i];
+
+ if (WARN_ON(grp->name[0] == '\0'))
+ return -EINVAL;
+ if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL))
+ return -EINVAL;
+ }
+
+ /* special-case our own group and hacks */
+ if (family == &genl_ctrl) {
+ first_id = GENL_ID_CTRL;
+ BUG_ON(n_groups != 1);
+ } else if (strcmp(family->name, "NET_DM") == 0) {
+ first_id = 1;
+ BUG_ON(n_groups != 1);
+ } else if (family->id == GENL_ID_VFS_DQUOT) {
+ first_id = GENL_ID_VFS_DQUOT;
+ BUG_ON(n_groups != 1);
+ } else if (family->id == GENL_ID_PMCRAID) {
+ first_id = GENL_ID_PMCRAID;
+ BUG_ON(n_groups != 1);
+ } else {
+ groups_allocated = true;
+ err = genl_allocate_reserve_groups(n_groups, &first_id);
+ if (err)
+ return err;
+ }
+
+ family->mcgrp_offset = first_id;
+
+ /* if still initializing, can't and don't need to to realloc bitmaps */
+ if (!init_net.genl_sock)
+ return 0;
+
+ if (family->netnsok) {
+ struct net *net;
+
+ netlink_table_grab();
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ err = __netlink_change_ngroups(net->genl_sock,
+ mc_groups_longs * BITS_PER_LONG);
+ if (err) {
+ /*
+ * No need to roll back, can only fail if
+ * memory allocation fails and then the
+ * number of _possible_ groups has been
+ * increased on some sockets which is ok.
+ */
+ break;
+ }
+ }
+ rcu_read_unlock();
+ netlink_table_ungrab();
+ } else {
+ err = netlink_change_ngroups(init_net.genl_sock,
+ mc_groups_longs * BITS_PER_LONG);
+ }
+
+ if (groups_allocated && err) {
+ for (i = 0; i < family->n_mcgrps; i++)
+ clear_bit(family->mcgrp_offset + i, mc_groups);
+ }
+
+ return err;
+}
+
+static void genl_unregister_mc_groups(struct genl_family *family)
+{
+ struct net *net;
+ int i;
+
+ netlink_table_grab();
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ for (i = 0; i < family->n_mcgrps; i++)
+ __netlink_clear_multicast_users(
+ net->genl_sock, family->mcgrp_offset + i);
+ }
+ rcu_read_unlock();
+ netlink_table_ungrab();
+
+ for (i = 0; i < family->n_mcgrps; i++) {
+ int grp_id = family->mcgrp_offset + i;
+
+ if (grp_id != 1)
+ clear_bit(grp_id, mc_groups);
+ genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family,
+ &family->mcgrps[i], grp_id);
+ }
+}
+
+static int genl_validate_ops(const struct genl_family *family)
+{
+ const struct genl_ops *ops = family->ops;
+ unsigned int n_ops = family->n_ops;
+ int i, j;
+
+ if (WARN_ON(n_ops && !ops))
+ return -EINVAL;
+
+ if (!n_ops)
+ return 0;
+
+ for (i = 0; i < n_ops; i++) {
+ if (ops[i].dumpit == NULL && ops[i].doit == NULL)
+ return -EINVAL;
+ for (j = i + 1; j < n_ops; j++)
+ if (ops[i].cmd == ops[j].cmd)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * __genl_register_family - register a generic netlink family
+ * @family: generic netlink family
+ *
+ * Registers the specified family after validating it first. Only one
+ * family may be registered with the same family name or identifier.
+ * The family id may equal GENL_ID_GENERATE causing an unique id to
+ * be automatically generated and assigned.
+ *
+ * The family's ops array must already be assigned, you can use the
+ * genl_register_family_with_ops() helper function.
+ *
+ * Return 0 on success or a negative error code.
+ */
+int __genl_register_family(struct genl_family *family)
+{
+ int err = -EINVAL, i;
+
+ if (family->id && family->id < GENL_MIN_ID)
+ goto errout;
+
+ if (family->id > GENL_MAX_ID)
+ goto errout;
+
+ err = genl_validate_ops(family);
+ if (err)
+ return err;
+
+ genl_lock_all();
+
+ if (genl_family_find_byname(family->name)) {
+ err = -EEXIST;
+ goto errout_locked;
+ }
+
+ if (family->id == GENL_ID_GENERATE) {
+ u16 newid = genl_generate_id();
+
+ if (!newid) {
+ err = -ENOMEM;
+ goto errout_locked;
+ }
+
+ family->id = newid;
+ } else if (genl_family_find_byid(family->id)) {
+ err = -EEXIST;
+ goto errout_locked;
+ }
+
+ if (family->maxattr && !family->parallel_ops) {
+ family->attrbuf = kmalloc((family->maxattr+1) *
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (family->attrbuf == NULL) {
+ err = -ENOMEM;
+ goto errout_locked;
+ }
+ } else
+ family->attrbuf = NULL;
+
+ err = genl_validate_assign_mc_groups(family);
+ if (err)
+ goto errout_locked;
+
+ list_add_tail(&family->family_list, genl_family_chain(family->id));
+ genl_unlock_all();
+
+ /* send all events */
+ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0);
+ for (i = 0; i < family->n_mcgrps; i++)
+ genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family,
+ &family->mcgrps[i], family->mcgrp_offset + i);
+
+ return 0;
+
+errout_locked:
+ genl_unlock_all();
+errout:
+ return err;
+}
+EXPORT_SYMBOL(__genl_register_family);
+
+/**
+ * genl_unregister_family - unregister generic netlink family
+ * @family: generic netlink family
+ *
+ * Unregisters the specified family.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int genl_unregister_family(struct genl_family *family)
+{
+ struct genl_family *rc;
+
+ genl_lock_all();
+
+ list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
+ if (family->id != rc->id || strcmp(rc->name, family->name))
+ continue;
+
+ genl_unregister_mc_groups(family);
+
+ list_del(&rc->family_list);
+ family->n_ops = 0;
+ up_write(&cb_lock);
+ wait_event(genl_sk_destructing_waitq,
+ atomic_read(&genl_sk_destructing_cnt) == 0);
+ genl_unlock();
+
+ kfree(family->attrbuf);
+ genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);
+ return 0;
+ }
+
+ genl_unlock_all();
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(genl_unregister_family);
+
+/**
+ * genlmsg_new_unicast - Allocate generic netlink message for unicast
+ * @payload: size of the message payload
+ * @info: information on destination
+ * @flags: the type of memory to allocate
+ *
+ * Allocates a new sk_buff large enough to cover the specified payload
+ * plus required Netlink headers. Will check receiving socket for
+ * memory mapped i/o capability and use it if enabled. Will fall back
+ * to non-mapped skb if message size exceeds the frame size of the ring.
+ */
+struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
+ gfp_t flags)
+{
+ size_t len = nlmsg_total_size(genlmsg_total_size(payload));
+
+ return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags);
+}
+EXPORT_SYMBOL_GPL(genlmsg_new_unicast);
+
+/**
+ * genlmsg_put - Add generic netlink header to netlink message
+ * @skb: socket buffer holding the message
+ * @portid: netlink portid the message is addressed to
+ * @seq: sequence number (usually the one of the sender)
+ * @family: generic netlink family
+ * @flags: netlink message flags
+ * @cmd: generic netlink command
+ *
+ * Returns pointer to user specific header
+ */
+void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
+ struct genl_family *family, int flags, u8 cmd)
+{
+ struct nlmsghdr *nlh;
+ struct genlmsghdr *hdr;
+
+ nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN +
+ family->hdrsize, flags);
+ if (nlh == NULL)
+ return NULL;
+
+ hdr = nlmsg_data(nlh);
+ hdr->cmd = cmd;
+ hdr->version = family->version;
+ hdr->reserved = 0;
+
+ return (char *) hdr + GENL_HDRLEN;
+}
+EXPORT_SYMBOL(genlmsg_put);
+
+static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ /* our ops are always const - netlink API doesn't propagate that */
+ const struct genl_ops *ops = cb->data;
+ int rc;
+
+ genl_lock();
+ rc = ops->dumpit(skb, cb);
+ genl_unlock();
+ return rc;
+}
+
+static int genl_lock_done(struct netlink_callback *cb)
+{
+ /* our ops are always const - netlink API doesn't propagate that */
+ const struct genl_ops *ops = cb->data;
+ int rc = 0;
+
+ if (ops->done) {
+ genl_lock();
+ rc = ops->done(cb);
+ genl_unlock();
+ }
+ return rc;
+}
+
+static int genl_family_rcv_msg(struct genl_family *family,
+ struct sk_buff *skb,
+ struct nlmsghdr *nlh)
+{
+ const struct genl_ops *ops;
+ struct net *net = sock_net(skb->sk);
+ struct genl_info info;
+ struct genlmsghdr *hdr = nlmsg_data(nlh);
+ struct nlattr **attrbuf;
+ int hdrlen, err;
+
+ /* this family doesn't exist in this netns */
+ if (!family->netnsok && !net_eq(net, &init_net))
+ return -ENOENT;
+
+ hdrlen = GENL_HDRLEN + family->hdrsize;
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return -EINVAL;
+
+ ops = genl_get_cmd(hdr->cmd, family);
+ if (ops == NULL)
+ return -EOPNOTSUPP;
+
+ if ((ops->flags & GENL_ADMIN_PERM) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
+ int rc;
+
+ if (ops->dumpit == NULL)
+ return -EOPNOTSUPP;
+
+ if (!family->parallel_ops) {
+ struct netlink_dump_control c = {
+ .module = family->module,
+ /* we have const, but the netlink API doesn't */
+ .data = (void *)ops,
+ .dump = genl_lock_dumpit,
+ .done = genl_lock_done,
+ };
+
+ genl_unlock();
+ rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
+ genl_lock();
+
+ } else {
+ struct netlink_dump_control c = {
+ .module = family->module,
+ .dump = ops->dumpit,
+ .done = ops->done,
+ };
+
+ rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
+ }
+
+ return rc;
+ }
+
+ if (ops->doit == NULL)
+ return -EOPNOTSUPP;
+
+ if (family->maxattr && family->parallel_ops) {
+ attrbuf = kmalloc((family->maxattr+1) *
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (attrbuf == NULL)
+ return -ENOMEM;
+ } else
+ attrbuf = family->attrbuf;
+
+ if (attrbuf) {
+ err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
+ ops->policy);
+ if (err < 0)
+ goto out;
+ }
+
+ info.snd_seq = nlh->nlmsg_seq;
+ info.snd_portid = NETLINK_CB(skb).portid;
+ info.nlhdr = nlh;
+ info.genlhdr = nlmsg_data(nlh);
+ info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
+ info.attrs = attrbuf;
+ info.dst_sk = skb->sk;
+ genl_info_net_set(&info, net);
+ memset(&info.user_ptr, 0, sizeof(info.user_ptr));
+
+ if (family->pre_doit) {
+ err = family->pre_doit(ops, skb, &info);
+ if (err)
+ goto out;
+ }
+
+ err = ops->doit(skb, &info);
+
+ if (family->post_doit)
+ family->post_doit(ops, skb, &info);
+
+out:
+ if (family->parallel_ops)
+ kfree(attrbuf);
+
+ return err;
+}
+
+static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct genl_family *family;
+ int err;
+
+ family = genl_family_find_byid(nlh->nlmsg_type);
+ if (family == NULL)
+ return -ENOENT;
+
+ if (!family->parallel_ops)
+ genl_lock();
+
+ err = genl_family_rcv_msg(family, skb, nlh);
+
+ if (!family->parallel_ops)
+ genl_unlock();
+
+ return err;
+}
+
+static void genl_rcv(struct sk_buff *skb)
+{
+ down_read(&cb_lock);
+ netlink_rcv_skb(skb, &genl_rcv_msg);
+ up_read(&cb_lock);
+}
+
+/**************************************************************************
+ * Controller
+ **************************************************************************/
+
+static struct genl_family genl_ctrl = {
+ .id = GENL_ID_CTRL,
+ .name = "nlctrl",
+ .version = 0x2,
+ .maxattr = CTRL_ATTR_MAX,
+ .netnsok = true,
+};
+
+static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd);
+ if (hdr == NULL)
+ return -1;
+
+ if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) ||
+ nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) ||
+ nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) ||
+ nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) ||
+ nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr))
+ goto nla_put_failure;
+
+ if (family->n_ops) {
+ struct nlattr *nla_ops;
+ int i;
+
+ nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS);
+ if (nla_ops == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < family->n_ops; i++) {
+ struct nlattr *nest;
+ const struct genl_ops *ops = &family->ops[i];
+ u32 op_flags = ops->flags;
+
+ if (ops->dumpit)
+ op_flags |= GENL_CMD_CAP_DUMP;
+ if (ops->doit)
+ op_flags |= GENL_CMD_CAP_DO;
+ if (ops->policy)
+ op_flags |= GENL_CMD_CAP_HASPOL;
+
+ nest = nla_nest_start(skb, i + 1);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, CTRL_ATTR_OP_ID, ops->cmd) ||
+ nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
+
+ nla_nest_end(skb, nla_ops);
+ }
+
+ if (family->n_mcgrps) {
+ struct nlattr *nla_grps;
+ int i;
+
+ nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+ if (nla_grps == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < family->n_mcgrps; i++) {
+ struct nlattr *nest;
+ const struct genl_multicast_group *grp;
+
+ grp = &family->mcgrps[i];
+
+ nest = nla_nest_start(skb, i + 1);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID,
+ family->mcgrp_offset + i) ||
+ nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
+ grp->name))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
+ nla_nest_end(skb, nla_grps);
+ }
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ctrl_fill_mcgrp_info(struct genl_family *family,
+ const struct genl_multicast_group *grp,
+ int grp_id, u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+ struct nlattr *nla_grps;
+ struct nlattr *nest;
+
+ hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd);
+ if (hdr == NULL)
+ return -1;
+
+ if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) ||
+ nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id))
+ goto nla_put_failure;
+
+ nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+ if (nla_grps == NULL)
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, 1);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) ||
+ nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
+ grp->name))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ nla_nest_end(skb, nla_grps);
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
+{
+
+ int i, n = 0;
+ struct genl_family *rt;
+ struct net *net = sock_net(skb->sk);
+ int chains_to_skip = cb->args[0];
+ int fams_to_skip = cb->args[1];
+
+ for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) {
+ n = 0;
+ list_for_each_entry(rt, genl_family_chain(i), family_list) {
+ if (!rt->netnsok && !net_eq(net, &init_net))
+ continue;
+ if (++n < fams_to_skip)
+ continue;
+ if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, CTRL_CMD_NEWFAMILY) < 0)
+ goto errout;
+ }
+
+ fams_to_skip = 0;
+ }
+
+errout:
+ cb->args[0] = i;
+ cb->args[1] = n;
+
+ return skb->len;
+}
+
+static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
+ u32 portid, int seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ err = ctrl_fill_info(family, portid, seq, 0, skb, cmd);
+ if (err < 0) {
+ nlmsg_free(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *
+ctrl_build_mcgrp_msg(struct genl_family *family,
+ const struct genl_multicast_group *grp,
+ int grp_id, u32 portid, int seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (skb == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid,
+ seq, 0, skb, cmd);
+ if (err < 0) {
+ nlmsg_free(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
+ [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 },
+ [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING,
+ .len = GENL_NAMSIZ - 1 },
+};
+
+static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct genl_family *res = NULL;
+ int err = -EINVAL;
+
+ if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
+ u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
+ res = genl_family_find_byid(id);
+ err = -ENOENT;
+ }
+
+ if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
+ char *name;
+
+ name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]);
+ res = genl_family_find_byname(name);
+#ifdef CONFIG_MODULES
+ if (res == NULL) {
+ genl_unlock();
+ up_read(&cb_lock);
+ request_module("net-pf-%d-proto-%d-family-%s",
+ PF_NETLINK, NETLINK_GENERIC, name);
+ down_read(&cb_lock);
+ genl_lock();
+ res = genl_family_find_byname(name);
+ }
+#endif
+ err = -ENOENT;
+ }
+
+ if (res == NULL)
+ return err;
+
+ if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) {
+ /* family doesn't exist here */
+ return -ENOENT;
+ }
+
+ msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq,
+ CTRL_CMD_NEWFAMILY);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ return genlmsg_reply(msg, info);
+}
+
+static int genl_ctrl_event(int event, struct genl_family *family,
+ const struct genl_multicast_group *grp,
+ int grp_id)
+{
+ struct sk_buff *msg;
+
+ /* genl is still initialising */
+ if (!init_net.genl_sock)
+ return 0;
+
+ switch (event) {
+ case CTRL_CMD_NEWFAMILY:
+ case CTRL_CMD_DELFAMILY:
+ WARN_ON(grp);
+ msg = ctrl_build_family_msg(family, 0, 0, event);
+ break;
+ case CTRL_CMD_NEWMCAST_GRP:
+ case CTRL_CMD_DELMCAST_GRP:
+ BUG_ON(!grp);
+ msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ if (!family->netnsok) {
+ genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0,
+ 0, GFP_KERNEL);
+ } else {
+ rcu_read_lock();
+ genlmsg_multicast_allns(&genl_ctrl, msg, 0,
+ 0, GFP_ATOMIC);
+ rcu_read_unlock();
+ }
+
+ return 0;
+}
+
+static struct genl_ops genl_ctrl_ops[] = {
+ {
+ .cmd = CTRL_CMD_GETFAMILY,
+ .doit = ctrl_getfamily,
+ .dumpit = ctrl_dumpfamily,
+ .policy = ctrl_policy,
+ },
+};
+
+static struct genl_multicast_group genl_ctrl_groups[] = {
+ { .name = "notify", },
+};
+
+static int genl_bind(struct net *net, int group)
+{
+ int i, err = -ENOENT;
+
+ down_read(&cb_lock);
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
+ struct genl_family *f;
+
+ list_for_each_entry(f, genl_family_chain(i), family_list) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
+
+ if (!f->netnsok && net != &init_net)
+ err = -ENOENT;
+ else if (f->mcast_bind)
+ err = f->mcast_bind(net, fam_grp);
+ else
+ err = 0;
+ break;
+ }
+ }
+ }
+ up_read(&cb_lock);
+
+ return err;
+}
+
+static void genl_unbind(struct net *net, int group)
+{
+ int i;
+
+ down_read(&cb_lock);
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
+ struct genl_family *f;
+
+ list_for_each_entry(f, genl_family_chain(i), family_list) {
+ if (group >= f->mcgrp_offset &&
+ group < f->mcgrp_offset + f->n_mcgrps) {
+ int fam_grp = group - f->mcgrp_offset;
+
+ if (f->mcast_unbind)
+ f->mcast_unbind(net, fam_grp);
+ break;
+ }
+ }
+ }
+ up_read(&cb_lock);
+}
+
+static int __net_init genl_pernet_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = genl_rcv,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .bind = genl_bind,
+ .unbind = genl_unbind,
+ };
+
+ /* we'll bump the group number right afterwards */
+ net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg);
+
+ if (!net->genl_sock && net_eq(net, &init_net))
+ panic("GENL: Cannot initialize generic netlink\n");
+
+ if (!net->genl_sock)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit genl_pernet_exit(struct net *net)
+{
+ netlink_kernel_release(net->genl_sock);
+ net->genl_sock = NULL;
+}
+
+static struct pernet_operations genl_pernet_ops = {
+ .init = genl_pernet_init,
+ .exit = genl_pernet_exit,
+};
+
+static int __init genl_init(void)
+{
+ int i, err;
+
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
+ INIT_LIST_HEAD(&family_ht[i]);
+
+ err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops,
+ genl_ctrl_groups);
+ if (err < 0)
+ goto problem;
+
+ err = register_pernet_subsys(&genl_pernet_ops);
+ if (err)
+ goto problem;
+
+ return 0;
+
+problem:
+ panic("GENL: Cannot register controller: %d\n", err);
+}
+
+subsys_initcall(genl_init);
+
+static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
+ gfp_t flags)
+{
+ struct sk_buff *tmp;
+ struct net *net, *prev = NULL;
+ int err;
+
+ for_each_net_rcu(net) {
+ if (prev) {
+ tmp = skb_clone(skb, flags);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto error;
+ }
+ err = nlmsg_multicast(prev->genl_sock, tmp,
+ portid, group, flags);
+ if (err)
+ goto error;
+ }
+
+ prev = net;
+ }
+
+ return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
+ error:
+ kfree_skb(skb);
+ return err;
+}
+
+int genlmsg_multicast_allns(struct genl_family *family, struct sk_buff *skb,
+ u32 portid, unsigned int group, gfp_t flags)
+{
+ if (WARN_ON_ONCE(group >= family->n_mcgrps))
+ return -EINVAL;
+ group = family->mcgrp_offset + group;
+ return genlmsg_mcast(skb, portid, group, flags);
+}
+EXPORT_SYMBOL(genlmsg_multicast_allns);
+
+void genl_notify(struct genl_family *family,
+ struct sk_buff *skb, struct net *net, u32 portid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
+{
+ struct sock *sk = net->genl_sock;
+ int report = 0;
+
+ if (nlh)
+ report = nlmsg_report(nlh);
+
+ if (WARN_ON_ONCE(group >= family->n_mcgrps))
+ return;
+ group = family->mcgrp_offset + group;
+ nlmsg_notify(sk, skb, portid, group, report, flags);
+}
+EXPORT_SYMBOL(genl_notify);
diff --git a/net/netrom/Makefile b/net/netrom/Makefile
new file mode 100644
index 000000000..2660f5a16
--- /dev/null
+++ b/net/netrom/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux NET/ROM layer.
+#
+
+obj-$(CONFIG_NETROM) += netrom.o
+
+netrom-y := af_netrom.o nr_dev.o nr_in.o nr_loopback.o \
+ nr_out.o nr_route.o nr_subr.o nr_timer.o
+netrom-$(CONFIG_SYSCTL) += sysctl_net_netrom.o
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
new file mode 100644
index 000000000..b987fd56c
--- /dev/null
+++ b/net/netrom/af_netrom.c
@@ -0,0 +1,1511 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/stat.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/netrom.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
+#include <net/arp.h>
+#include <linux/init.h>
+
+static int nr_ndevs = 4;
+
+int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL;
+int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS;
+int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL;
+int sysctl_netrom_transport_timeout = NR_DEFAULT_T1;
+int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2;
+int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2;
+int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4;
+int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW;
+int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE;
+int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING;
+int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS;
+int sysctl_netrom_reset_circuit = NR_DEFAULT_RESET;
+
+static unsigned short circuit = 0x101;
+
+static HLIST_HEAD(nr_list);
+static DEFINE_SPINLOCK(nr_list_lock);
+
+static const struct proto_ops nr_proto_ops;
+
+/*
+ * NETROM network devices are virtual network devices encapsulating NETROM
+ * frames into AX.25 which will be sent through an AX.25 device, so form a
+ * special "super class" of normal net devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key nr_netdev_xmit_lock_key;
+static struct lock_class_key nr_netdev_addr_lock_key;
+
+static void nr_set_lockdep_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key);
+}
+
+static void nr_set_lockdep_key(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key);
+ netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL);
+}
+
+/*
+ * Socket removal during an interrupt is now safe.
+ */
+static void nr_remove_socket(struct sock *sk)
+{
+ spin_lock_bh(&nr_list_lock);
+ sk_del_node_init(sk);
+ spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ * Kill all bound sockets on a dropped device.
+ */
+static void nr_kill_by_device(struct net_device *dev)
+{
+ struct sock *s;
+
+ spin_lock_bh(&nr_list_lock);
+ sk_for_each(s, &nr_list)
+ if (nr_sk(s)->device == dev)
+ nr_disconnect(s, ENETUNREACH);
+ spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ * Handle device status changes.
+ */
+static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ nr_kill_by_device(dev);
+ nr_rt_device_down(dev);
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Add a socket to the bound sockets list.
+ */
+static void nr_insert_socket(struct sock *sk)
+{
+ spin_lock_bh(&nr_list_lock);
+ sk_add_node(sk, &nr_list);
+ spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ * Find a socket that wants to accept the Connect Request we just
+ * received.
+ */
+static struct sock *nr_find_listener(ax25_address *addr)
+{
+ struct sock *s;
+
+ spin_lock_bh(&nr_list_lock);
+ sk_for_each(s, &nr_list)
+ if (!ax25cmp(&nr_sk(s)->source_addr, addr) &&
+ s->sk_state == TCP_LISTEN) {
+ bh_lock_sock(s);
+ goto found;
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&nr_list_lock);
+ return s;
+}
+
+/*
+ * Find a connected NET/ROM socket given my circuit IDs.
+ */
+static struct sock *nr_find_socket(unsigned char index, unsigned char id)
+{
+ struct sock *s;
+
+ spin_lock_bh(&nr_list_lock);
+ sk_for_each(s, &nr_list) {
+ struct nr_sock *nr = nr_sk(s);
+
+ if (nr->my_index == index && nr->my_id == id) {
+ bh_lock_sock(s);
+ goto found;
+ }
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&nr_list_lock);
+ return s;
+}
+
+/*
+ * Find a connected NET/ROM socket given their circuit IDs.
+ */
+static struct sock *nr_find_peer(unsigned char index, unsigned char id,
+ ax25_address *dest)
+{
+ struct sock *s;
+
+ spin_lock_bh(&nr_list_lock);
+ sk_for_each(s, &nr_list) {
+ struct nr_sock *nr = nr_sk(s);
+
+ if (nr->your_index == index && nr->your_id == id &&
+ !ax25cmp(&nr->dest_addr, dest)) {
+ bh_lock_sock(s);
+ goto found;
+ }
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&nr_list_lock);
+ return s;
+}
+
+/*
+ * Find next free circuit ID.
+ */
+static unsigned short nr_find_next_circuit(void)
+{
+ unsigned short id = circuit;
+ unsigned char i, j;
+ struct sock *sk;
+
+ for (;;) {
+ i = id / 256;
+ j = id % 256;
+
+ if (i != 0 && j != 0) {
+ if ((sk=nr_find_socket(i, j)) == NULL)
+ break;
+ bh_unlock_sock(sk);
+ }
+
+ id++;
+ }
+
+ return id;
+}
+
+/*
+ * Deferred destroy.
+ */
+void nr_destroy_socket(struct sock *);
+
+/*
+ * Handler for deferred kills.
+ */
+static void nr_destroy_timer(unsigned long data)
+{
+ struct sock *sk=(struct sock *)data;
+ bh_lock_sock(sk);
+ sock_hold(sk);
+ nr_destroy_socket(sk);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * This is called from user mode and the timers. Thus it protects itself
+ * against interrupt users but doesn't worry about being called during
+ * work. Once it is removed from the queue no interrupt or bottom half
+ * will touch it and we are (fairly 8-) ) safe.
+ */
+void nr_destroy_socket(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ nr_remove_socket(sk);
+
+ nr_stop_heartbeat(sk);
+ nr_stop_t1timer(sk);
+ nr_stop_t2timer(sk);
+ nr_stop_t4timer(sk);
+ nr_stop_idletimer(sk);
+
+ nr_clear_queues(sk); /* Flush the queues */
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (skb->sk != sk) { /* A pending connection */
+ /* Queue the unaccepted socket for death */
+ sock_set_flag(skb->sk, SOCK_DEAD);
+ nr_start_heartbeat(skb->sk);
+ nr_sk(skb->sk)->state = NR_STATE_0;
+ }
+
+ kfree_skb(skb);
+ }
+
+ if (sk_has_allocations(sk)) {
+ /* Defer: outstanding buffers */
+ sk->sk_timer.function = nr_destroy_timer;
+ sk->sk_timer.expires = jiffies + 2 * HZ;
+ add_timer(&sk->sk_timer);
+ } else
+ sock_put(sk);
+}
+
+/*
+ * Handling for system calls applied via the various interfaces to a
+ * NET/ROM socket object.
+ */
+
+static int nr_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+ unsigned long opt;
+
+ if (level != SOL_NETROM)
+ return -ENOPROTOOPT;
+
+ if (optlen < sizeof(unsigned int))
+ return -EINVAL;
+
+ if (get_user(opt, (unsigned int __user *)optval))
+ return -EFAULT;
+
+ switch (optname) {
+ case NETROM_T1:
+ if (opt < 1 || opt > ULONG_MAX / HZ)
+ return -EINVAL;
+ nr->t1 = opt * HZ;
+ return 0;
+
+ case NETROM_T2:
+ if (opt < 1 || opt > ULONG_MAX / HZ)
+ return -EINVAL;
+ nr->t2 = opt * HZ;
+ return 0;
+
+ case NETROM_N2:
+ if (opt < 1 || opt > 31)
+ return -EINVAL;
+ nr->n2 = opt;
+ return 0;
+
+ case NETROM_T4:
+ if (opt < 1 || opt > ULONG_MAX / HZ)
+ return -EINVAL;
+ nr->t4 = opt * HZ;
+ return 0;
+
+ case NETROM_IDLE:
+ if (opt > ULONG_MAX / (60 * HZ))
+ return -EINVAL;
+ nr->idle = opt * 60 * HZ;
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int nr_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+ int val = 0;
+ int len;
+
+ if (level != SOL_NETROM)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case NETROM_T1:
+ val = nr->t1 / HZ;
+ break;
+
+ case NETROM_T2:
+ val = nr->t2 / HZ;
+ break;
+
+ case NETROM_N2:
+ val = nr->n2;
+ break;
+
+ case NETROM_T4:
+ val = nr->t4 / HZ;
+ break;
+
+ case NETROM_IDLE:
+ val = nr->idle / (60 * HZ);
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return copy_to_user(optval, &val, len) ? -EFAULT : 0;
+}
+
+static int nr_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN) {
+ memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN);
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ release_sock(sk);
+ return 0;
+ }
+ release_sock(sk);
+
+ return -EOPNOTSUPP;
+}
+
+static struct proto nr_proto = {
+ .name = "NETROM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct nr_sock),
+};
+
+static int nr_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct nr_sock *nr;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ if (sock->type != SOCK_SEQPACKET || protocol != 0)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ nr = nr_sk(sk);
+
+ sock_init_data(sock, sk);
+
+ sock->ops = &nr_proto_ops;
+ sk->sk_protocol = protocol;
+
+ skb_queue_head_init(&nr->ack_queue);
+ skb_queue_head_init(&nr->reseq_queue);
+ skb_queue_head_init(&nr->frag_queue);
+
+ nr_init_timers(sk);
+
+ nr->t1 =
+ msecs_to_jiffies(sysctl_netrom_transport_timeout);
+ nr->t2 =
+ msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay);
+ nr->n2 =
+ msecs_to_jiffies(sysctl_netrom_transport_maximum_tries);
+ nr->t4 =
+ msecs_to_jiffies(sysctl_netrom_transport_busy_delay);
+ nr->idle =
+ msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout);
+ nr->window = sysctl_netrom_transport_requested_window_size;
+
+ nr->bpqext = 1;
+ nr->state = NR_STATE_0;
+
+ return 0;
+}
+
+static struct sock *nr_make_new(struct sock *osk)
+{
+ struct sock *sk;
+ struct nr_sock *nr, *onr;
+
+ if (osk->sk_type != SOCK_SEQPACKET)
+ return NULL;
+
+ sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot);
+ if (sk == NULL)
+ return NULL;
+
+ nr = nr_sk(sk);
+
+ sock_init_data(NULL, sk);
+
+ sk->sk_type = osk->sk_type;
+ sk->sk_priority = osk->sk_priority;
+ sk->sk_protocol = osk->sk_protocol;
+ sk->sk_rcvbuf = osk->sk_rcvbuf;
+ sk->sk_sndbuf = osk->sk_sndbuf;
+ sk->sk_state = TCP_ESTABLISHED;
+ sock_copy_flags(sk, osk);
+
+ skb_queue_head_init(&nr->ack_queue);
+ skb_queue_head_init(&nr->reseq_queue);
+ skb_queue_head_init(&nr->frag_queue);
+
+ nr_init_timers(sk);
+
+ onr = nr_sk(osk);
+
+ nr->t1 = onr->t1;
+ nr->t2 = onr->t2;
+ nr->n2 = onr->n2;
+ nr->t4 = onr->t4;
+ nr->idle = onr->idle;
+ nr->window = onr->window;
+
+ nr->device = onr->device;
+ nr->bpqext = onr->bpqext;
+
+ return sk;
+}
+
+static int nr_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr;
+
+ if (sk == NULL) return 0;
+
+ sock_hold(sk);
+ sock_orphan(sk);
+ lock_sock(sk);
+ nr = nr_sk(sk);
+
+ switch (nr->state) {
+ case NR_STATE_0:
+ case NR_STATE_1:
+ case NR_STATE_2:
+ nr_disconnect(sk, 0);
+ nr_destroy_socket(sk);
+ break;
+
+ case NR_STATE_3:
+ nr_clear_queues(sk);
+ nr->n2count = 0;
+ nr_write_internal(sk, NR_DISCREQ);
+ nr_start_t1timer(sk);
+ nr_stop_t2timer(sk);
+ nr_stop_t4timer(sk);
+ nr_stop_idletimer(sk);
+ nr->state = NR_STATE_2;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DESTROY);
+ break;
+
+ default:
+ break;
+ }
+
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+ struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
+ struct net_device *dev;
+ ax25_uid_assoc *user;
+ ax25_address *source;
+
+ lock_sock(sk);
+ if (!sock_flag(sk, SOCK_ZAPPED)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if (addr->fsa_ax25.sax25_family != AF_NETROM) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+ if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) {
+ release_sock(sk);
+ return -EADDRNOTAVAIL;
+ }
+
+ /*
+ * Only the super user can set an arbitrary user callsign.
+ */
+ if (addr->fsa_ax25.sax25_ndigis == 1) {
+ if (!capable(CAP_NET_BIND_SERVICE)) {
+ dev_put(dev);
+ release_sock(sk);
+ return -EPERM;
+ }
+ nr->user_addr = addr->fsa_digipeater[0];
+ nr->source_addr = addr->fsa_ax25.sax25_call;
+ } else {
+ source = &addr->fsa_ax25.sax25_call;
+
+ user = ax25_findbyuid(current_euid());
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
+ release_sock(sk);
+ dev_put(dev);
+ return -EPERM;
+ }
+ nr->user_addr = *source;
+ }
+
+ nr->source_addr = *source;
+ }
+
+ nr->device = dev;
+ nr_insert_socket(sk);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ dev_put(dev);
+ release_sock(sk);
+
+ return 0;
+}
+
+static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+ struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
+ ax25_address *source = NULL;
+ ax25_uid_assoc *user;
+ struct net_device *dev;
+ int err = 0;
+
+ lock_sock(sk);
+ if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+ sock->state = SS_CONNECTED;
+ goto out_release; /* Connect completed during a ERESTARTSYS event */
+ }
+
+ if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+ sock->state = SS_UNCONNECTED;
+ err = -ECONNREFUSED;
+ goto out_release;
+ }
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ err = -EISCONN; /* No reconnect on a seqpacket socket */
+ goto out_release;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) {
+ err = -EINVAL;
+ goto out_release;
+ }
+ if (addr->sax25_family != AF_NETROM) {
+ err = -EINVAL;
+ goto out_release;
+ }
+ if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ if ((dev = nr_dev_first()) == NULL) {
+ err = -ENETUNREACH;
+ goto out_release;
+ }
+ source = (ax25_address *)dev->dev_addr;
+
+ user = ax25_findbyuid(current_euid());
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
+ dev_put(dev);
+ err = -EPERM;
+ goto out_release;
+ }
+ nr->user_addr = *source;
+ }
+
+ nr->source_addr = *source;
+ nr->device = dev;
+
+ dev_put(dev);
+ nr_insert_socket(sk); /* Finish the bind */
+ }
+
+ nr->dest_addr = addr->sax25_call;
+
+ release_sock(sk);
+ circuit = nr_find_next_circuit();
+ lock_sock(sk);
+
+ nr->my_index = circuit / 256;
+ nr->my_id = circuit % 256;
+
+ circuit++;
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+
+ nr_establish_data_link(sk);
+
+ nr->state = NR_STATE_1;
+
+ nr_start_heartbeat(sk);
+
+ /* Now the loop */
+ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+ err = -EINPROGRESS;
+ goto out_release;
+ }
+
+ /*
+ * A Connect Ack with Choke or timeout or failed routing will go to
+ * closed.
+ */
+ if (sk->sk_state == TCP_SYN_SENT) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (err)
+ goto out_release;
+ }
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk); /* Always set at this point */
+ goto out_release;
+ }
+
+ sock->state = SS_CONNECTED;
+
+out_release:
+ release_sock(sk);
+
+ return err;
+}
+
+static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sk_buff *skb;
+ struct sock *newsk;
+ DEFINE_WAIT(wait);
+ struct sock *sk;
+ int err = 0;
+
+ if ((sk = sock->sk) == NULL)
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EOPNOTSUPP;
+ goto out_release;
+ }
+
+ if (sk->sk_state != TCP_LISTEN) {
+ err = -EINVAL;
+ goto out_release;
+ }
+
+ /*
+ * The write queue this time is holding sockets ready to use
+ * hooked into the SABM we saved
+ */
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb)
+ break;
+
+ if (flags & O_NONBLOCK) {
+ err = -EWOULDBLOCK;
+ break;
+ }
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (err)
+ goto out_release;
+
+ newsk = skb->sk;
+ sock_graft(newsk, newsock);
+
+ /* Now attach up the new socket */
+ kfree_skb(skb);
+ sk_acceptq_removed(sk);
+
+out_release:
+ release_sock(sk);
+
+ return err;
+}
+
+static int nr_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr;
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+
+ memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25));
+
+ lock_sock(sk);
+ if (peer != 0) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+ sax->fsa_ax25.sax25_family = AF_NETROM;
+ sax->fsa_ax25.sax25_ndigis = 1;
+ sax->fsa_ax25.sax25_call = nr->user_addr;
+ memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater));
+ sax->fsa_digipeater[0] = nr->dest_addr;
+ *uaddr_len = sizeof(struct full_sockaddr_ax25);
+ } else {
+ sax->fsa_ax25.sax25_family = AF_NETROM;
+ sax->fsa_ax25.sax25_ndigis = 0;
+ sax->fsa_ax25.sax25_call = nr->source_addr;
+ *uaddr_len = sizeof(struct sockaddr_ax25);
+ }
+ release_sock(sk);
+
+ return 0;
+}
+
+int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
+{
+ struct sock *sk;
+ struct sock *make;
+ struct nr_sock *nr_make;
+ ax25_address *src, *dest, *user;
+ unsigned short circuit_index, circuit_id;
+ unsigned short peer_circuit_index, peer_circuit_id;
+ unsigned short frametype, flags, window, timeout;
+ int ret;
+
+ skb->sk = NULL; /* Initially we don't know who it's for */
+
+ /*
+ * skb->data points to the netrom frame start
+ */
+
+ src = (ax25_address *)(skb->data + 0);
+ dest = (ax25_address *)(skb->data + 7);
+
+ circuit_index = skb->data[15];
+ circuit_id = skb->data[16];
+ peer_circuit_index = skb->data[17];
+ peer_circuit_id = skb->data[18];
+ frametype = skb->data[19] & 0x0F;
+ flags = skb->data[19] & 0xF0;
+
+ /*
+ * Check for an incoming IP over NET/ROM frame.
+ */
+ if (frametype == NR_PROTOEXT &&
+ circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
+ skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+ skb_reset_transport_header(skb);
+
+ return nr_rx_ip(skb, dev);
+ }
+
+ /*
+ * Find an existing socket connection, based on circuit ID, if it's
+ * a Connect Request base it on their circuit ID.
+ *
+ * Circuit ID 0/0 is not valid but it could still be a "reset" for a
+ * circuit that no longer exists at the other end ...
+ */
+
+ sk = NULL;
+
+ if (circuit_index == 0 && circuit_id == 0) {
+ if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG)
+ sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src);
+ } else {
+ if (frametype == NR_CONNREQ)
+ sk = nr_find_peer(circuit_index, circuit_id, src);
+ else
+ sk = nr_find_socket(circuit_index, circuit_id);
+ }
+
+ if (sk != NULL) {
+ skb_reset_transport_header(skb);
+
+ if (frametype == NR_CONNACK && skb->len == 22)
+ nr_sk(sk)->bpqext = 1;
+ else
+ nr_sk(sk)->bpqext = 0;
+
+ ret = nr_process_rx_frame(sk, skb);
+ bh_unlock_sock(sk);
+ return ret;
+ }
+
+ /*
+ * Now it should be a CONNREQ.
+ */
+ if (frametype != NR_CONNREQ) {
+ /*
+ * Here it would be nice to be able to send a reset but
+ * NET/ROM doesn't have one. We've tried to extend the protocol
+ * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that
+ * apparently kills BPQ boxes... :-(
+ * So now we try to follow the established behaviour of
+ * G8PZT's Xrouter which is sending packets with command type 7
+ * as an extension of the protocol.
+ */
+ if (sysctl_netrom_reset_circuit &&
+ (frametype != NR_RESET || flags != 0))
+ nr_transmit_reset(skb, 1);
+
+ return 0;
+ }
+
+ sk = nr_find_listener(dest);
+
+ user = (ax25_address *)(skb->data + 21);
+
+ if (sk == NULL || sk_acceptq_is_full(sk) ||
+ (make = nr_make_new(sk)) == NULL) {
+ nr_transmit_refusal(skb, 0);
+ if (sk)
+ bh_unlock_sock(sk);
+ return 0;
+ }
+
+ window = skb->data[20];
+
+ skb->sk = make;
+ make->sk_state = TCP_ESTABLISHED;
+
+ /* Fill in his circuit details */
+ nr_make = nr_sk(make);
+ nr_make->source_addr = *dest;
+ nr_make->dest_addr = *src;
+ nr_make->user_addr = *user;
+
+ nr_make->your_index = circuit_index;
+ nr_make->your_id = circuit_id;
+
+ bh_unlock_sock(sk);
+ circuit = nr_find_next_circuit();
+ bh_lock_sock(sk);
+
+ nr_make->my_index = circuit / 256;
+ nr_make->my_id = circuit % 256;
+
+ circuit++;
+
+ /* Window negotiation */
+ if (window < nr_make->window)
+ nr_make->window = window;
+
+ /* L4 timeout negotiation */
+ if (skb->len == 37) {
+ timeout = skb->data[36] * 256 + skb->data[35];
+ if (timeout * HZ < nr_make->t1)
+ nr_make->t1 = timeout * HZ;
+ nr_make->bpqext = 1;
+ } else {
+ nr_make->bpqext = 0;
+ }
+
+ nr_write_internal(make, NR_CONNACK);
+
+ nr_make->condition = 0x00;
+ nr_make->vs = 0;
+ nr_make->va = 0;
+ nr_make->vr = 0;
+ nr_make->vl = 0;
+ nr_make->state = NR_STATE_3;
+ sk_acceptq_added(sk);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ bh_unlock_sock(sk);
+
+ nr_insert_socket(make);
+
+ nr_start_heartbeat(make);
+ nr_start_idletimer(make);
+
+ return 1;
+}
+
+static int nr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct nr_sock *nr = nr_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, usax, msg->msg_name);
+ int err;
+ struct sockaddr_ax25 sax;
+ struct sk_buff *skb;
+ unsigned char *asmptr;
+ int size;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_ZAPPED)) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+ goto out;
+ }
+
+ if (nr->device == NULL) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+
+ if (usax) {
+ if (msg->msg_namelen < sizeof(sax)) {
+ err = -EINVAL;
+ goto out;
+ }
+ sax = *usax;
+ if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) {
+ err = -EISCONN;
+ goto out;
+ }
+ if (sax.sax25_family != AF_NETROM) {
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+ sax.sax25_family = AF_NETROM;
+ sax.sax25_call = nr->dest_addr;
+ }
+
+ /* Build a packet - the conventional user limit is 236 bytes. We can
+ do ludicrously large NetROM frames but must not overflow */
+ if (len > 65536) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+
+ if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
+ goto out;
+
+ skb_reserve(skb, size - len);
+ skb_reset_transport_header(skb);
+
+ /*
+ * Push down the NET/ROM header
+ */
+
+ asmptr = skb_push(skb, NR_TRANSPORT_LEN);
+
+ /* Build a NET/ROM Transport header */
+
+ *asmptr++ = nr->your_index;
+ *asmptr++ = nr->your_id;
+ *asmptr++ = 0; /* To be filled in later */
+ *asmptr++ = 0; /* Ditto */
+ *asmptr++ = NR_INFO;
+
+ /*
+ * Put the data on the end
+ */
+ skb_put(skb, len);
+
+ /* User data follows immediately after the NET/ROM transport header */
+ if (memcpy_from_msg(skb_transport_header(skb), msg, len)) {
+ kfree_skb(skb);
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ kfree_skb(skb);
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ nr_output(sk, skb); /* Shove it onto the queue */
+
+ err = len;
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int nr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ DECLARE_SOCKADDR(struct sockaddr_ax25 *, sax, msg->msg_name);
+ size_t copied;
+ struct sk_buff *skb;
+ int er;
+
+ /*
+ * This works for seqpacket too. The receiver has ordered the queue for
+ * us! We do one quick check first though
+ */
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ /* Now we can treat all alike */
+ if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) {
+ release_sock(sk);
+ return er;
+ }
+
+ skb_reset_transport_header(skb);
+ copied = skb->len;
+
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ er = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (er < 0) {
+ skb_free_datagram(sk, skb);
+ release_sock(sk);
+ return er;
+ }
+
+ if (sax != NULL) {
+ memset(sax, 0, sizeof(*sax));
+ sax->sax25_family = AF_NETROM;
+ skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call,
+ AX25_ADDR_LEN);
+ msg->msg_namelen = sizeof(*sax);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ release_sock(sk);
+ return copied;
+}
+
+
+static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ void __user *argp = (void __user *)arg;
+ int ret;
+
+ switch (cmd) {
+ case TIOCOUTQ: {
+ long amount;
+
+ lock_sock(sk);
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ release_sock(sk);
+ return put_user(amount, (int __user *)argp);
+ }
+
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ long amount = 0L;
+
+ lock_sock(sk);
+ /* These two are safe on a single CPU system as only user tasks fiddle here */
+ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+ amount = skb->len;
+ release_sock(sk);
+ return put_user(amount, (int __user *)argp);
+ }
+
+ case SIOCGSTAMP:
+ lock_sock(sk);
+ ret = sock_get_timestamp(sk, argp);
+ release_sock(sk);
+ return ret;
+
+ case SIOCGSTAMPNS:
+ lock_sock(sk);
+ ret = sock_get_timestampns(sk, argp);
+ release_sock(sk);
+ return ret;
+
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ return -EINVAL;
+
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCNRDECOBS:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return nr_rt_ioctl(cmd, argp);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *nr_info_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_bh(&nr_list_lock);
+ return seq_hlist_start_head(&nr_list, *pos);
+}
+
+static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &nr_list, pos);
+}
+
+static void nr_info_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_bh(&nr_list_lock);
+}
+
+static int nr_info_show(struct seq_file *seq, void *v)
+{
+ struct sock *s = sk_entry(v);
+ struct net_device *dev;
+ struct nr_sock *nr;
+ const char *devname;
+ char buf[11];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+"user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n");
+
+ else {
+
+ bh_lock_sock(s);
+ nr = nr_sk(s);
+
+ if ((dev = nr->device) == NULL)
+ devname = "???";
+ else
+ devname = dev->name;
+
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr));
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr));
+ seq_printf(seq,
+"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n",
+ ax2asc(buf, &nr->source_addr),
+ devname,
+ nr->my_index,
+ nr->my_id,
+ nr->your_index,
+ nr->your_id,
+ nr->state,
+ nr->vs,
+ nr->vr,
+ nr->va,
+ ax25_display_timer(&nr->t1timer) / HZ,
+ nr->t1 / HZ,
+ ax25_display_timer(&nr->t2timer) / HZ,
+ nr->t2 / HZ,
+ ax25_display_timer(&nr->t4timer) / HZ,
+ nr->t4 / HZ,
+ ax25_display_timer(&nr->idletimer) / (60 * HZ),
+ nr->idle / (60 * HZ),
+ nr->n2count,
+ nr->n2,
+ nr->window,
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);
+
+ bh_unlock_sock(s);
+ }
+ return 0;
+}
+
+static const struct seq_operations nr_info_seqops = {
+ .start = nr_info_start,
+ .next = nr_info_next,
+ .stop = nr_info_stop,
+ .show = nr_info_show,
+};
+
+static int nr_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nr_info_seqops);
+}
+
+static const struct file_operations nr_info_fops = {
+ .owner = THIS_MODULE,
+ .open = nr_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+static const struct net_proto_family nr_family_ops = {
+ .family = PF_NETROM,
+ .create = nr_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops nr_proto_ops = {
+ .family = PF_NETROM,
+ .owner = THIS_MODULE,
+ .release = nr_release,
+ .bind = nr_bind,
+ .connect = nr_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = nr_accept,
+ .getname = nr_getname,
+ .poll = datagram_poll,
+ .ioctl = nr_ioctl,
+ .listen = nr_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = nr_setsockopt,
+ .getsockopt = nr_getsockopt,
+ .sendmsg = nr_sendmsg,
+ .recvmsg = nr_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct notifier_block nr_dev_notifier = {
+ .notifier_call = nr_device_event,
+};
+
+static struct net_device **dev_nr;
+
+static struct ax25_protocol nr_pid = {
+ .pid = AX25_P_NETROM,
+ .func = nr_route_frame
+};
+
+static struct ax25_linkfail nr_linkfail_notifier = {
+ .func = nr_link_failed,
+};
+
+static int __init nr_proto_init(void)
+{
+ int i;
+ int rc = proto_register(&nr_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) {
+ printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n");
+ return -1;
+ }
+
+ dev_nr = kzalloc(nr_ndevs * sizeof(struct net_device *), GFP_KERNEL);
+ if (dev_nr == NULL) {
+ printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n");
+ return -1;
+ }
+
+ for (i = 0; i < nr_ndevs; i++) {
+ char name[IFNAMSIZ];
+ struct net_device *dev;
+
+ sprintf(name, "nr%d", i);
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup);
+ if (!dev) {
+ printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n");
+ goto fail;
+ }
+
+ dev->base_addr = i;
+ if (register_netdev(dev)) {
+ printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n");
+ free_netdev(dev);
+ goto fail;
+ }
+ nr_set_lockdep_key(dev);
+ dev_nr[i] = dev;
+ }
+
+ if (sock_register(&nr_family_ops)) {
+ printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n");
+ goto fail;
+ }
+
+ register_netdevice_notifier(&nr_dev_notifier);
+
+ ax25_register_pid(&nr_pid);
+ ax25_linkfail_register(&nr_linkfail_notifier);
+
+#ifdef CONFIG_SYSCTL
+ nr_register_sysctl();
+#endif
+
+ nr_loopback_init();
+
+ proc_create("nr", S_IRUGO, init_net.proc_net, &nr_info_fops);
+ proc_create("nr_neigh", S_IRUGO, init_net.proc_net, &nr_neigh_fops);
+ proc_create("nr_nodes", S_IRUGO, init_net.proc_net, &nr_nodes_fops);
+out:
+ return rc;
+fail:
+ while (--i >= 0) {
+ unregister_netdev(dev_nr[i]);
+ free_netdev(dev_nr[i]);
+ }
+ kfree(dev_nr);
+ proto_unregister(&nr_proto);
+ rc = -1;
+ goto out;
+}
+
+module_init(nr_proto_init);
+
+module_param(nr_ndevs, int, 0);
+MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices");
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_NETROM);
+
+static void __exit nr_exit(void)
+{
+ int i;
+
+ remove_proc_entry("nr", init_net.proc_net);
+ remove_proc_entry("nr_neigh", init_net.proc_net);
+ remove_proc_entry("nr_nodes", init_net.proc_net);
+ nr_loopback_clear();
+
+ nr_rt_free();
+
+#ifdef CONFIG_SYSCTL
+ nr_unregister_sysctl();
+#endif
+
+ ax25_linkfail_release(&nr_linkfail_notifier);
+ ax25_protocol_release(AX25_P_NETROM);
+
+ unregister_netdevice_notifier(&nr_dev_notifier);
+
+ sock_unregister(PF_NETROM);
+
+ for (i = 0; i < nr_ndevs; i++) {
+ struct net_device *dev = dev_nr[i];
+ if (dev) {
+ unregister_netdev(dev);
+ free_netdev(dev);
+ }
+ }
+
+ kfree(dev_nr);
+ proto_unregister(&nr_proto);
+}
+module_exit(nr_exit);
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
new file mode 100644
index 000000000..988f54248
--- /dev/null
+++ b/net/netrom/nr_dev.c
@@ -0,0 +1,181 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/if_ether.h> /* For the statistics structure. */
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/io.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/arp.h>
+
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+/*
+ * Only allow IP over NET/ROM frames through if the netrom device is up.
+ */
+
+int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device_stats *stats = &dev->stats;
+
+ if (!netif_running(dev)) {
+ stats->rx_dropped++;
+ return 0;
+ }
+
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+
+ skb->protocol = htons(ETH_P_IP);
+
+ /* Spoof incoming device */
+ skb->dev = dev;
+ skb->mac_header = skb->network_header;
+ skb_reset_network_header(skb);
+ skb->pkt_type = PACKET_HOST;
+
+ netif_rx(skb);
+
+ return 1;
+}
+
+static int nr_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+
+ memcpy(buff, (saddr != NULL) ? saddr : dev->dev_addr, dev->addr_len);
+ buff[6] &= ~AX25_CBIT;
+ buff[6] &= ~AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ if (daddr != NULL)
+ memcpy(buff, daddr, dev->addr_len);
+ buff[6] &= ~AX25_CBIT;
+ buff[6] |= AX25_EBIT;
+ buff[6] |= AX25_SSSID_SPARE;
+ buff += AX25_ADDR_LEN;
+
+ *buff++ = sysctl_netrom_network_ttl_initialiser;
+
+ *buff++ = NR_PROTO_IP;
+ *buff++ = NR_PROTO_IP;
+ *buff++ = 0;
+ *buff++ = 0;
+ *buff++ = NR_PROTOEXT;
+
+ if (daddr != NULL)
+ return 37;
+
+ return -37;
+}
+
+static int __must_check nr_set_mac_address(struct net_device *dev, void *addr)
+{
+ struct sockaddr *sa = addr;
+ int err;
+
+ if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len))
+ return 0;
+
+ if (dev->flags & IFF_UP) {
+ err = ax25_listen_register((ax25_address *)sa->sa_data, NULL);
+ if (err)
+ return err;
+
+ ax25_listen_release((ax25_address *)dev->dev_addr, NULL);
+ }
+
+ memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+
+ return 0;
+}
+
+static int nr_open(struct net_device *dev)
+{
+ int err;
+
+ err = ax25_listen_register((ax25_address *)dev->dev_addr, NULL);
+ if (err)
+ return err;
+
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+static int nr_close(struct net_device *dev)
+{
+ ax25_listen_release((ax25_address *)dev->dev_addr, NULL);
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device_stats *stats = &dev->stats;
+ unsigned int len = skb->len;
+
+ if (!nr_route_frame(skb, NULL)) {
+ kfree_skb(skb);
+ stats->tx_errors++;
+ return NETDEV_TX_OK;
+ }
+
+ stats->tx_packets++;
+ stats->tx_bytes += len;
+
+ return NETDEV_TX_OK;
+}
+
+static const struct header_ops nr_header_ops = {
+ .create = nr_header,
+};
+
+static const struct net_device_ops nr_netdev_ops = {
+ .ndo_open = nr_open,
+ .ndo_stop = nr_close,
+ .ndo_start_xmit = nr_xmit,
+ .ndo_set_mac_address = nr_set_mac_address,
+};
+
+void nr_setup(struct net_device *dev)
+{
+ dev->mtu = NR_MAX_PACKET_SIZE;
+ dev->netdev_ops = &nr_netdev_ops;
+ dev->header_ops = &nr_header_ops;
+ dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+ dev->addr_len = AX25_ADDR_LEN;
+ dev->type = ARPHRD_NETROM;
+
+ /* New-style flags. */
+ dev->flags = IFF_NOARP;
+}
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
new file mode 100644
index 000000000..80dbd0beb
--- /dev/null
+++ b/net/netrom/nr_in.c
@@ -0,0 +1,305 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
+{
+ struct sk_buff *skbo, *skbn = skb;
+ struct nr_sock *nr = nr_sk(sk);
+
+ skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+
+ nr_start_idletimer(sk);
+
+ if (more) {
+ nr->fraglen += skb->len;
+ skb_queue_tail(&nr->frag_queue, skb);
+ return 0;
+ }
+
+ if (!more && nr->fraglen > 0) { /* End of fragment */
+ nr->fraglen += skb->len;
+ skb_queue_tail(&nr->frag_queue, skb);
+
+ if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL)
+ return 1;
+
+ skb_reset_transport_header(skbn);
+
+ while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) {
+ skb_copy_from_linear_data(skbo,
+ skb_put(skbn, skbo->len),
+ skbo->len);
+ kfree_skb(skbo);
+ }
+
+ nr->fraglen = 0;
+ }
+
+ return sock_queue_rcv_skb(sk, skbn);
+}
+
+/*
+ * State machine for state 1, Awaiting Connection State.
+ * The handling of the timer(s) is in file nr_timer.c.
+ * Handling of state 0 and connection release is in netrom.c.
+ */
+static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
+ int frametype)
+{
+ switch (frametype) {
+ case NR_CONNACK: {
+ struct nr_sock *nr = nr_sk(sk);
+
+ nr_stop_t1timer(sk);
+ nr_start_idletimer(sk);
+ nr->your_index = skb->data[17];
+ nr->your_id = skb->data[18];
+ nr->vs = 0;
+ nr->va = 0;
+ nr->vr = 0;
+ nr->vl = 0;
+ nr->state = NR_STATE_3;
+ nr->n2count = 0;
+ nr->window = skb->data[20];
+ sk->sk_state = TCP_ESTABLISHED;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ break;
+ }
+
+ case NR_CONNACK | NR_CHOKE_FLAG:
+ nr_disconnect(sk, ECONNREFUSED);
+ break;
+
+ case NR_RESET:
+ if (sysctl_netrom_reset_circuit)
+ nr_disconnect(sk, ECONNRESET);
+ break;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Release State.
+ * The handling of the timer(s) is in file nr_timer.c
+ * Handling of state 0 and connection release is in netrom.c.
+ */
+static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
+ int frametype)
+{
+ switch (frametype) {
+ case NR_CONNACK | NR_CHOKE_FLAG:
+ nr_disconnect(sk, ECONNRESET);
+ break;
+
+ case NR_DISCREQ:
+ nr_write_internal(sk, NR_DISCACK);
+
+ case NR_DISCACK:
+ nr_disconnect(sk, 0);
+ break;
+
+ case NR_RESET:
+ if (sysctl_netrom_reset_circuit)
+ nr_disconnect(sk, ECONNRESET);
+ break;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file nr_timer.c
+ * Handling of state 0 and connection release is in netrom.c.
+ */
+static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct nr_sock *nrom = nr_sk(sk);
+ struct sk_buff_head temp_queue;
+ struct sk_buff *skbn;
+ unsigned short save_vr;
+ unsigned short nr, ns;
+ int queued = 0;
+
+ nr = skb->data[18];
+ ns = skb->data[17];
+
+ switch (frametype) {
+ case NR_CONNREQ:
+ nr_write_internal(sk, NR_CONNACK);
+ break;
+
+ case NR_DISCREQ:
+ nr_write_internal(sk, NR_DISCACK);
+ nr_disconnect(sk, 0);
+ break;
+
+ case NR_CONNACK | NR_CHOKE_FLAG:
+ case NR_DISCACK:
+ nr_disconnect(sk, ECONNRESET);
+ break;
+
+ case NR_INFOACK:
+ case NR_INFOACK | NR_CHOKE_FLAG:
+ case NR_INFOACK | NR_NAK_FLAG:
+ case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG:
+ if (frametype & NR_CHOKE_FLAG) {
+ nrom->condition |= NR_COND_PEER_RX_BUSY;
+ nr_start_t4timer(sk);
+ } else {
+ nrom->condition &= ~NR_COND_PEER_RX_BUSY;
+ nr_stop_t4timer(sk);
+ }
+ if (!nr_validate_nr(sk, nr)) {
+ break;
+ }
+ if (frametype & NR_NAK_FLAG) {
+ nr_frames_acked(sk, nr);
+ nr_send_nak_frame(sk);
+ } else {
+ if (nrom->condition & NR_COND_PEER_RX_BUSY) {
+ nr_frames_acked(sk, nr);
+ } else {
+ nr_check_iframes_acked(sk, nr);
+ }
+ }
+ break;
+
+ case NR_INFO:
+ case NR_INFO | NR_NAK_FLAG:
+ case NR_INFO | NR_CHOKE_FLAG:
+ case NR_INFO | NR_MORE_FLAG:
+ case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG:
+ case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG:
+ case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG:
+ case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG:
+ if (frametype & NR_CHOKE_FLAG) {
+ nrom->condition |= NR_COND_PEER_RX_BUSY;
+ nr_start_t4timer(sk);
+ } else {
+ nrom->condition &= ~NR_COND_PEER_RX_BUSY;
+ nr_stop_t4timer(sk);
+ }
+ if (nr_validate_nr(sk, nr)) {
+ if (frametype & NR_NAK_FLAG) {
+ nr_frames_acked(sk, nr);
+ nr_send_nak_frame(sk);
+ } else {
+ if (nrom->condition & NR_COND_PEER_RX_BUSY) {
+ nr_frames_acked(sk, nr);
+ } else {
+ nr_check_iframes_acked(sk, nr);
+ }
+ }
+ }
+ queued = 1;
+ skb_queue_head(&nrom->reseq_queue, skb);
+ if (nrom->condition & NR_COND_OWN_RX_BUSY)
+ break;
+ skb_queue_head_init(&temp_queue);
+ do {
+ save_vr = nrom->vr;
+ while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) {
+ ns = skbn->data[17];
+ if (ns == nrom->vr) {
+ if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) {
+ nrom->vr = (nrom->vr + 1) % NR_MODULUS;
+ } else {
+ nrom->condition |= NR_COND_OWN_RX_BUSY;
+ skb_queue_tail(&temp_queue, skbn);
+ }
+ } else if (nr_in_rx_window(sk, ns)) {
+ skb_queue_tail(&temp_queue, skbn);
+ } else {
+ kfree_skb(skbn);
+ }
+ }
+ while ((skbn = skb_dequeue(&temp_queue)) != NULL) {
+ skb_queue_tail(&nrom->reseq_queue, skbn);
+ }
+ } while (save_vr != nrom->vr);
+ /*
+ * Window is full, ack it immediately.
+ */
+ if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) {
+ nr_enquiry_response(sk);
+ } else {
+ if (!(nrom->condition & NR_COND_ACK_PENDING)) {
+ nrom->condition |= NR_COND_ACK_PENDING;
+ nr_start_t2timer(sk);
+ }
+ }
+ break;
+
+ case NR_RESET:
+ if (sysctl_netrom_reset_circuit)
+ nr_disconnect(sk, ECONNRESET);
+ break;
+
+ default:
+ break;
+ }
+ return queued;
+}
+
+/* Higher level upcall for a LAPB frame - called with sk locked */
+int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ int queued = 0, frametype;
+
+ if (nr->state == NR_STATE_0)
+ return 0;
+
+ frametype = skb->data[19];
+
+ switch (nr->state) {
+ case NR_STATE_1:
+ queued = nr_state1_machine(sk, skb, frametype);
+ break;
+ case NR_STATE_2:
+ queued = nr_state2_machine(sk, skb, frametype);
+ break;
+ case NR_STATE_3:
+ queued = nr_state3_machine(sk, skb, frametype);
+ break;
+ }
+
+ nr_kick(sk);
+
+ return queued;
+}
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
new file mode 100644
index 000000000..94d4e922a
--- /dev/null
+++ b/net/netrom/nr_loopback.c
@@ -0,0 +1,77 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <net/ax25.h>
+#include <linux/skbuff.h>
+#include <net/netrom.h>
+#include <linux/init.h>
+
+static void nr_loopback_timer(unsigned long);
+
+static struct sk_buff_head loopback_queue;
+static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0);
+
+void __init nr_loopback_init(void)
+{
+ skb_queue_head_init(&loopback_queue);
+}
+
+static inline int nr_loopback_running(void)
+{
+ return timer_pending(&loopback_timer);
+}
+
+int nr_loopback_queue(struct sk_buff *skb)
+{
+ struct sk_buff *skbn;
+
+ if ((skbn = alloc_skb(skb->len, GFP_ATOMIC)) != NULL) {
+ skb_copy_from_linear_data(skb, skb_put(skbn, skb->len), skb->len);
+ skb_reset_transport_header(skbn);
+
+ skb_queue_tail(&loopback_queue, skbn);
+
+ if (!nr_loopback_running())
+ mod_timer(&loopback_timer, jiffies + 10);
+ }
+
+ kfree_skb(skb);
+ return 1;
+}
+
+static void nr_loopback_timer(unsigned long param)
+{
+ struct sk_buff *skb;
+ ax25_address *nr_dest;
+ struct net_device *dev;
+
+ if ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+ nr_dest = (ax25_address *)(skb->data + 7);
+
+ dev = nr_dev_get(nr_dest);
+
+ if (dev == NULL || nr_rx_frame(skb, dev) == 0)
+ kfree_skb(skb);
+
+ if (dev != NULL)
+ dev_put(dev);
+
+ if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running())
+ mod_timer(&loopback_timer, jiffies + 10);
+ }
+}
+
+void __exit nr_loopback_clear(void)
+{
+ del_timer_sync(&loopback_timer);
+ skb_queue_purge(&loopback_queue);
+}
diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c
new file mode 100644
index 000000000..00fbf1419
--- /dev/null
+++ b/net/netrom/nr_out.c
@@ -0,0 +1,273 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+/*
+ * This is where all NET/ROM frames pass, except for IP-over-NET/ROM which
+ * cannot be fragmented in this manner.
+ */
+void nr_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *skbn;
+ unsigned char transport[NR_TRANSPORT_LEN];
+ int err, frontlen, len;
+
+ if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) {
+ /* Save a copy of the Transport Header */
+ skb_copy_from_linear_data(skb, transport, NR_TRANSPORT_LEN);
+ skb_pull(skb, NR_TRANSPORT_LEN);
+
+ frontlen = skb_headroom(skb);
+
+ while (skb->len > 0) {
+ if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL)
+ return;
+
+ skb_reserve(skbn, frontlen);
+
+ len = (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE;
+
+ /* Copy the user data */
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+ skb_pull(skb, len);
+
+ /* Duplicate the Transport Header */
+ skb_push(skbn, NR_TRANSPORT_LEN);
+ skb_copy_to_linear_data(skbn, transport,
+ NR_TRANSPORT_LEN);
+ if (skb->len > 0)
+ skbn->data[4] |= NR_MORE_FLAG;
+
+ skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */
+ }
+
+ kfree_skb(skb);
+ } else {
+ skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */
+ }
+
+ nr_kick(sk);
+}
+
+/*
+ * This procedure is passed a buffer descriptor for an iframe. It builds
+ * the rest of the control part of the frame and then writes it out.
+ */
+static void nr_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ if (skb == NULL)
+ return;
+
+ skb->data[2] = nr->vs;
+ skb->data[3] = nr->vr;
+
+ if (nr->condition & NR_COND_OWN_RX_BUSY)
+ skb->data[4] |= NR_CHOKE_FLAG;
+
+ nr_start_idletimer(sk);
+
+ nr_transmit_buffer(sk, skb);
+}
+
+void nr_send_nak_frame(struct sock *sk)
+{
+ struct sk_buff *skb, *skbn;
+ struct nr_sock *nr = nr_sk(sk);
+
+ if ((skb = skb_peek(&nr->ack_queue)) == NULL)
+ return;
+
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL)
+ return;
+
+ skbn->data[2] = nr->va;
+ skbn->data[3] = nr->vr;
+
+ if (nr->condition & NR_COND_OWN_RX_BUSY)
+ skbn->data[4] |= NR_CHOKE_FLAG;
+
+ nr_transmit_buffer(sk, skbn);
+
+ nr->condition &= ~NR_COND_ACK_PENDING;
+ nr->vl = nr->vr;
+
+ nr_stop_t1timer(sk);
+}
+
+void nr_kick(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ struct sk_buff *skb, *skbn;
+ unsigned short start, end;
+
+ if (nr->state != NR_STATE_3)
+ return;
+
+ if (nr->condition & NR_COND_PEER_RX_BUSY)
+ return;
+
+ if (!skb_peek(&sk->sk_write_queue))
+ return;
+
+ start = (skb_peek(&nr->ack_queue) == NULL) ? nr->va : nr->vs;
+ end = (nr->va + nr->window) % NR_MODULUS;
+
+ if (start == end)
+ return;
+
+ nr->vs = start;
+
+ /*
+ * Transmit data until either we're out of data to send or
+ * the window is full.
+ */
+
+ /*
+ * Dequeue the frame and copy it.
+ */
+ skb = skb_dequeue(&sk->sk_write_queue);
+
+ do {
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skb_queue_head(&sk->sk_write_queue, skb);
+ break;
+ }
+
+ skb_set_owner_w(skbn, sk);
+
+ /*
+ * Transmit the frame copy.
+ */
+ nr_send_iframe(sk, skbn);
+
+ nr->vs = (nr->vs + 1) % NR_MODULUS;
+
+ /*
+ * Requeue the original data frame.
+ */
+ skb_queue_tail(&nr->ack_queue, skb);
+
+ } while (nr->vs != end &&
+ (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+
+ nr->vl = nr->vr;
+ nr->condition &= ~NR_COND_ACK_PENDING;
+
+ if (!nr_t1timer_running(sk))
+ nr_start_t1timer(sk);
+}
+
+void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ unsigned char *dptr;
+
+ /*
+ * Add the protocol byte and network header.
+ */
+ dptr = skb_push(skb, NR_NETWORK_LEN);
+
+ memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] &= ~AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+
+ memcpy(dptr, &nr->dest_addr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] |= AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+
+ *dptr++ = sysctl_netrom_network_ttl_initialiser;
+
+ if (!nr_route_frame(skb, NULL)) {
+ kfree_skb(skb);
+ nr_disconnect(sk, ENETUNREACH);
+ }
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void nr_establish_data_link(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ nr->condition = 0x00;
+ nr->n2count = 0;
+
+ nr_write_internal(sk, NR_CONNREQ);
+
+ nr_stop_t2timer(sk);
+ nr_stop_t4timer(sk);
+ nr_stop_idletimer(sk);
+ nr_start_t1timer(sk);
+}
+
+/*
+ * Never send a NAK when we are CHOKEd.
+ */
+void nr_enquiry_response(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ int frametype = NR_INFOACK;
+
+ if (nr->condition & NR_COND_OWN_RX_BUSY) {
+ frametype |= NR_CHOKE_FLAG;
+ } else {
+ if (skb_peek(&nr->reseq_queue) != NULL)
+ frametype |= NR_NAK_FLAG;
+ }
+
+ nr_write_internal(sk, frametype);
+
+ nr->vl = nr->vr;
+ nr->condition &= ~NR_COND_ACK_PENDING;
+}
+
+void nr_check_iframes_acked(struct sock *sk, unsigned short nr)
+{
+ struct nr_sock *nrom = nr_sk(sk);
+
+ if (nrom->vs == nr) {
+ nr_frames_acked(sk, nr);
+ nr_stop_t1timer(sk);
+ nrom->n2count = 0;
+ } else {
+ if (nrom->va != nr) {
+ nr_frames_acked(sk, nr);
+ nr_start_t1timer(sk);
+ }
+ }
+}
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
new file mode 100644
index 000000000..96b64d2f6
--- /dev/null
+++ b/net/netrom/nr_route.c
@@ -0,0 +1,1030 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/arp.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/netfilter.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <net/netrom.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+static unsigned int nr_neigh_no = 1;
+
+static HLIST_HEAD(nr_node_list);
+static DEFINE_SPINLOCK(nr_node_list_lock);
+static HLIST_HEAD(nr_neigh_list);
+static DEFINE_SPINLOCK(nr_neigh_list_lock);
+
+static struct nr_node *nr_node_get(ax25_address *callsign)
+{
+ struct nr_node *found = NULL;
+ struct nr_node *nr_node;
+
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each(nr_node, &nr_node_list)
+ if (ax25cmp(callsign, &nr_node->callsign) == 0) {
+ nr_node_hold(nr_node);
+ found = nr_node;
+ break;
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+ return found;
+}
+
+static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign,
+ struct net_device *dev)
+{
+ struct nr_neigh *found = NULL;
+ struct nr_neigh *nr_neigh;
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ nr_neigh_for_each(nr_neigh, &nr_neigh_list)
+ if (ax25cmp(callsign, &nr_neigh->callsign) == 0 &&
+ nr_neigh->dev == dev) {
+ nr_neigh_hold(nr_neigh);
+ found = nr_neigh;
+ break;
+ }
+ spin_unlock_bh(&nr_neigh_list_lock);
+ return found;
+}
+
+static void nr_remove_neigh(struct nr_neigh *);
+
+/*
+ * Add a new route to a node, and in the process add the node and the
+ * neighbour if it is new.
+ */
+static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic,
+ ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev,
+ int quality, int obs_count)
+{
+ struct nr_node *nr_node;
+ struct nr_neigh *nr_neigh;
+ struct nr_route nr_route;
+ int i, found;
+ struct net_device *odev;
+
+ if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */
+ dev_put(odev);
+ return -EINVAL;
+ }
+
+ nr_node = nr_node_get(nr);
+
+ nr_neigh = nr_neigh_get_dev(ax25, dev);
+
+ /*
+ * The L2 link to a neighbour has failed in the past
+ * and now a frame comes from this neighbour. We assume
+ * it was a temporary trouble with the link and reset the
+ * routes now (and not wait for a node broadcast).
+ */
+ if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) {
+ struct nr_node *nr_nodet;
+
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each(nr_nodet, &nr_node_list) {
+ nr_node_lock(nr_nodet);
+ for (i = 0; i < nr_nodet->count; i++)
+ if (nr_nodet->routes[i].neighbour == nr_neigh)
+ if (i < nr_nodet->which)
+ nr_nodet->which = i;
+ nr_node_unlock(nr_nodet);
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+ }
+
+ if (nr_neigh != NULL)
+ nr_neigh->failed = 0;
+
+ if (quality == 0 && nr_neigh != NULL && nr_node != NULL) {
+ nr_neigh_put(nr_neigh);
+ nr_node_put(nr_node);
+ return 0;
+ }
+
+ if (nr_neigh == NULL) {
+ if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) {
+ if (nr_node)
+ nr_node_put(nr_node);
+ return -ENOMEM;
+ }
+
+ nr_neigh->callsign = *ax25;
+ nr_neigh->digipeat = NULL;
+ nr_neigh->ax25 = NULL;
+ nr_neigh->dev = dev;
+ nr_neigh->quality = sysctl_netrom_default_path_quality;
+ nr_neigh->locked = 0;
+ nr_neigh->count = 0;
+ nr_neigh->number = nr_neigh_no++;
+ nr_neigh->failed = 0;
+ atomic_set(&nr_neigh->refcount, 1);
+
+ if (ax25_digi != NULL && ax25_digi->ndigi > 0) {
+ nr_neigh->digipeat = kmemdup(ax25_digi,
+ sizeof(*ax25_digi),
+ GFP_KERNEL);
+ if (nr_neigh->digipeat == NULL) {
+ kfree(nr_neigh);
+ if (nr_node)
+ nr_node_put(nr_node);
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list);
+ nr_neigh_hold(nr_neigh);
+ spin_unlock_bh(&nr_neigh_list_lock);
+ }
+
+ if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked)
+ nr_neigh->quality = quality;
+
+ if (nr_node == NULL) {
+ if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) {
+ if (nr_neigh)
+ nr_neigh_put(nr_neigh);
+ return -ENOMEM;
+ }
+
+ nr_node->callsign = *nr;
+ strcpy(nr_node->mnemonic, mnemonic);
+
+ nr_node->which = 0;
+ nr_node->count = 1;
+ atomic_set(&nr_node->refcount, 1);
+ spin_lock_init(&nr_node->node_lock);
+
+ nr_node->routes[0].quality = quality;
+ nr_node->routes[0].obs_count = obs_count;
+ nr_node->routes[0].neighbour = nr_neigh;
+
+ nr_neigh_hold(nr_neigh);
+ nr_neigh->count++;
+
+ spin_lock_bh(&nr_node_list_lock);
+ hlist_add_head(&nr_node->node_node, &nr_node_list);
+ /* refcount initialized at 1 */
+ spin_unlock_bh(&nr_node_list_lock);
+
+ return 0;
+ }
+ nr_node_lock(nr_node);
+
+ if (quality != 0)
+ strcpy(nr_node->mnemonic, mnemonic);
+
+ for (found = 0, i = 0; i < nr_node->count; i++) {
+ if (nr_node->routes[i].neighbour == nr_neigh) {
+ nr_node->routes[i].quality = quality;
+ nr_node->routes[i].obs_count = obs_count;
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ /* We have space at the bottom, slot it in */
+ if (nr_node->count < 3) {
+ nr_node->routes[2] = nr_node->routes[1];
+ nr_node->routes[1] = nr_node->routes[0];
+
+ nr_node->routes[0].quality = quality;
+ nr_node->routes[0].obs_count = obs_count;
+ nr_node->routes[0].neighbour = nr_neigh;
+
+ nr_node->which++;
+ nr_node->count++;
+ nr_neigh_hold(nr_neigh);
+ nr_neigh->count++;
+ } else {
+ /* It must be better than the worst */
+ if (quality > nr_node->routes[2].quality) {
+ nr_node->routes[2].neighbour->count--;
+ nr_neigh_put(nr_node->routes[2].neighbour);
+
+ if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked)
+ nr_remove_neigh(nr_node->routes[2].neighbour);
+
+ nr_node->routes[2].quality = quality;
+ nr_node->routes[2].obs_count = obs_count;
+ nr_node->routes[2].neighbour = nr_neigh;
+
+ nr_neigh_hold(nr_neigh);
+ nr_neigh->count++;
+ }
+ }
+ }
+
+ /* Now re-sort the routes in quality order */
+ switch (nr_node->count) {
+ case 3:
+ if (nr_node->routes[1].quality > nr_node->routes[0].quality) {
+ switch (nr_node->which) {
+ case 0:
+ nr_node->which = 1;
+ break;
+ case 1:
+ nr_node->which = 0;
+ break;
+ }
+ nr_route = nr_node->routes[0];
+ nr_node->routes[0] = nr_node->routes[1];
+ nr_node->routes[1] = nr_route;
+ }
+ if (nr_node->routes[2].quality > nr_node->routes[1].quality) {
+ switch (nr_node->which) {
+ case 1: nr_node->which = 2;
+ break;
+
+ case 2: nr_node->which = 1;
+ break;
+
+ default:
+ break;
+ }
+ nr_route = nr_node->routes[1];
+ nr_node->routes[1] = nr_node->routes[2];
+ nr_node->routes[2] = nr_route;
+ }
+ case 2:
+ if (nr_node->routes[1].quality > nr_node->routes[0].quality) {
+ switch (nr_node->which) {
+ case 0: nr_node->which = 1;
+ break;
+
+ case 1: nr_node->which = 0;
+ break;
+
+ default: break;
+ }
+ nr_route = nr_node->routes[0];
+ nr_node->routes[0] = nr_node->routes[1];
+ nr_node->routes[1] = nr_route;
+ }
+ case 1:
+ break;
+ }
+
+ for (i = 0; i < nr_node->count; i++) {
+ if (nr_node->routes[i].neighbour == nr_neigh) {
+ if (i < nr_node->which)
+ nr_node->which = i;
+ break;
+ }
+ }
+
+ nr_neigh_put(nr_neigh);
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+ return 0;
+}
+
+static inline void __nr_remove_node(struct nr_node *nr_node)
+{
+ hlist_del_init(&nr_node->node_node);
+ nr_node_put(nr_node);
+}
+
+#define nr_remove_node_locked(__node) \
+ __nr_remove_node(__node)
+
+static void nr_remove_node(struct nr_node *nr_node)
+{
+ spin_lock_bh(&nr_node_list_lock);
+ __nr_remove_node(nr_node);
+ spin_unlock_bh(&nr_node_list_lock);
+}
+
+static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh)
+{
+ hlist_del_init(&nr_neigh->neigh_node);
+ nr_neigh_put(nr_neigh);
+}
+
+#define nr_remove_neigh_locked(__neigh) \
+ __nr_remove_neigh(__neigh)
+
+static void nr_remove_neigh(struct nr_neigh *nr_neigh)
+{
+ spin_lock_bh(&nr_neigh_list_lock);
+ __nr_remove_neigh(nr_neigh);
+ spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+/*
+ * "Delete" a node. Strictly speaking remove a route to a node. The node
+ * is only deleted if no routes are left to it.
+ */
+static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev)
+{
+ struct nr_node *nr_node;
+ struct nr_neigh *nr_neigh;
+ int i;
+
+ nr_node = nr_node_get(callsign);
+
+ if (nr_node == NULL)
+ return -EINVAL;
+
+ nr_neigh = nr_neigh_get_dev(neighbour, dev);
+
+ if (nr_neigh == NULL) {
+ nr_node_put(nr_node);
+ return -EINVAL;
+ }
+
+ nr_node_lock(nr_node);
+ for (i = 0; i < nr_node->count; i++) {
+ if (nr_node->routes[i].neighbour == nr_neigh) {
+ nr_neigh->count--;
+ nr_neigh_put(nr_neigh);
+
+ if (nr_neigh->count == 0 && !nr_neigh->locked)
+ nr_remove_neigh(nr_neigh);
+ nr_neigh_put(nr_neigh);
+
+ nr_node->count--;
+
+ if (nr_node->count == 0) {
+ nr_remove_node(nr_node);
+ } else {
+ switch (i) {
+ case 0:
+ nr_node->routes[0] = nr_node->routes[1];
+ case 1:
+ nr_node->routes[1] = nr_node->routes[2];
+ case 2:
+ break;
+ }
+ nr_node_put(nr_node);
+ }
+ nr_node_unlock(nr_node);
+
+ return 0;
+ }
+ }
+ nr_neigh_put(nr_neigh);
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+
+ return -EINVAL;
+}
+
+/*
+ * Lock a neighbour with a quality.
+ */
+static int __must_check nr_add_neigh(ax25_address *callsign,
+ ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality)
+{
+ struct nr_neigh *nr_neigh;
+
+ nr_neigh = nr_neigh_get_dev(callsign, dev);
+ if (nr_neigh) {
+ nr_neigh->quality = quality;
+ nr_neigh->locked = 1;
+ nr_neigh_put(nr_neigh);
+ return 0;
+ }
+
+ if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL)
+ return -ENOMEM;
+
+ nr_neigh->callsign = *callsign;
+ nr_neigh->digipeat = NULL;
+ nr_neigh->ax25 = NULL;
+ nr_neigh->dev = dev;
+ nr_neigh->quality = quality;
+ nr_neigh->locked = 1;
+ nr_neigh->count = 0;
+ nr_neigh->number = nr_neigh_no++;
+ nr_neigh->failed = 0;
+ atomic_set(&nr_neigh->refcount, 1);
+
+ if (ax25_digi != NULL && ax25_digi->ndigi > 0) {
+ nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi),
+ GFP_KERNEL);
+ if (nr_neigh->digipeat == NULL) {
+ kfree(nr_neigh);
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list);
+ /* refcount is initialized at 1 */
+ spin_unlock_bh(&nr_neigh_list_lock);
+
+ return 0;
+}
+
+/*
+ * "Delete" a neighbour. The neighbour is only removed if the number
+ * of nodes that may use it is zero.
+ */
+static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality)
+{
+ struct nr_neigh *nr_neigh;
+
+ nr_neigh = nr_neigh_get_dev(callsign, dev);
+
+ if (nr_neigh == NULL) return -EINVAL;
+
+ nr_neigh->quality = quality;
+ nr_neigh->locked = 0;
+
+ if (nr_neigh->count == 0)
+ nr_remove_neigh(nr_neigh);
+ nr_neigh_put(nr_neigh);
+
+ return 0;
+}
+
+/*
+ * Decrement the obsolescence count by one. If a route is reduced to a
+ * count of zero, remove it. Also remove any unlocked neighbours with
+ * zero nodes routing via it.
+ */
+static int nr_dec_obs(void)
+{
+ struct nr_neigh *nr_neigh;
+ struct nr_node *s;
+ struct hlist_node *nodet;
+ int i;
+
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each_safe(s, nodet, &nr_node_list) {
+ nr_node_lock(s);
+ for (i = 0; i < s->count; i++) {
+ switch (s->routes[i].obs_count) {
+ case 0: /* A locked entry */
+ break;
+
+ case 1: /* From 1 -> 0 */
+ nr_neigh = s->routes[i].neighbour;
+
+ nr_neigh->count--;
+ nr_neigh_put(nr_neigh);
+
+ if (nr_neigh->count == 0 && !nr_neigh->locked)
+ nr_remove_neigh(nr_neigh);
+
+ s->count--;
+
+ switch (i) {
+ case 0:
+ s->routes[0] = s->routes[1];
+ /* Fallthrough */
+ case 1:
+ s->routes[1] = s->routes[2];
+ case 2:
+ break;
+ }
+ break;
+
+ default:
+ s->routes[i].obs_count--;
+ break;
+
+ }
+ }
+
+ if (s->count <= 0)
+ nr_remove_node_locked(s);
+ nr_node_unlock(s);
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+
+ return 0;
+}
+
+/*
+ * A device has been removed. Remove its routes and neighbours.
+ */
+void nr_rt_device_down(struct net_device *dev)
+{
+ struct nr_neigh *s;
+ struct hlist_node *nodet, *node2t;
+ struct nr_node *t;
+ int i;
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) {
+ if (s->dev == dev) {
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each_safe(t, node2t, &nr_node_list) {
+ nr_node_lock(t);
+ for (i = 0; i < t->count; i++) {
+ if (t->routes[i].neighbour == s) {
+ t->count--;
+
+ switch (i) {
+ case 0:
+ t->routes[0] = t->routes[1];
+ case 1:
+ t->routes[1] = t->routes[2];
+ case 2:
+ break;
+ }
+ }
+ }
+
+ if (t->count <= 0)
+ nr_remove_node_locked(t);
+ nr_node_unlock(t);
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+
+ nr_remove_neigh_locked(s);
+ }
+ }
+ spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+/*
+ * Check that the device given is a valid AX.25 interface that is "up".
+ * Or a valid ethernet interface with an AX.25 callsign binding.
+ */
+static struct net_device *nr_ax25_dev_get(char *devname)
+{
+ struct net_device *dev;
+
+ if ((dev = dev_get_by_name(&init_net, devname)) == NULL)
+ return NULL;
+
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25)
+ return dev;
+
+ dev_put(dev);
+ return NULL;
+}
+
+/*
+ * Find the first active NET/ROM device, usually "nr0".
+ */
+struct net_device *nr_dev_first(void)
+{
+ struct net_device *dev, *first = NULL;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM)
+ if (first == NULL || strncmp(dev->name, first->name, 3) < 0)
+ first = dev;
+ }
+ if (first)
+ dev_hold(first);
+ rcu_read_unlock();
+
+ return first;
+}
+
+/*
+ * Find the NET/ROM device for the given callsign.
+ */
+struct net_device *nr_dev_get(ax25_address *addr)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM &&
+ ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) {
+ dev_hold(dev);
+ goto out;
+ }
+ }
+ dev = NULL;
+out:
+ rcu_read_unlock();
+ return dev;
+}
+
+static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis,
+ ax25_address *digipeaters)
+{
+ int i;
+
+ if (ndigis == 0)
+ return NULL;
+
+ for (i = 0; i < ndigis; i++) {
+ digi->calls[i] = digipeaters[i];
+ digi->repeated[i] = 0;
+ }
+
+ digi->ndigi = ndigis;
+ digi->lastrepeat = -1;
+
+ return digi;
+}
+
+/*
+ * Handle the ioctls that control the routing functions.
+ */
+int nr_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct nr_route_struct nr_route;
+ struct net_device *dev;
+ ax25_digi digi;
+ int ret;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)))
+ return -EFAULT;
+ if (nr_route.ndigis > AX25_MAX_DIGIS)
+ return -EINVAL;
+ if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL)
+ return -EINVAL;
+ switch (nr_route.type) {
+ case NETROM_NODE:
+ if (strnlen(nr_route.mnemonic, 7) == 7) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = nr_add_node(&nr_route.callsign,
+ nr_route.mnemonic,
+ &nr_route.neighbour,
+ nr_call_to_digi(&digi, nr_route.ndigis,
+ nr_route.digipeaters),
+ dev, nr_route.quality,
+ nr_route.obs_count);
+ break;
+ case NETROM_NEIGH:
+ ret = nr_add_neigh(&nr_route.callsign,
+ nr_call_to_digi(&digi, nr_route.ndigis,
+ nr_route.digipeaters),
+ dev, nr_route.quality);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ dev_put(dev);
+ return ret;
+
+ case SIOCDELRT:
+ if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)))
+ return -EFAULT;
+ if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL)
+ return -EINVAL;
+ switch (nr_route.type) {
+ case NETROM_NODE:
+ ret = nr_del_node(&nr_route.callsign,
+ &nr_route.neighbour, dev);
+ break;
+ case NETROM_NEIGH:
+ ret = nr_del_neigh(&nr_route.callsign,
+ dev, nr_route.quality);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ dev_put(dev);
+ return ret;
+
+ case SIOCNRDECOBS:
+ return nr_dec_obs();
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * A level 2 link has timed out, therefore it appears to be a poor link,
+ * then don't use that neighbour until it is reset.
+ */
+void nr_link_failed(ax25_cb *ax25, int reason)
+{
+ struct nr_neigh *s, *nr_neigh = NULL;
+ struct nr_node *nr_node = NULL;
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ nr_neigh_for_each(s, &nr_neigh_list) {
+ if (s->ax25 == ax25) {
+ nr_neigh_hold(s);
+ nr_neigh = s;
+ break;
+ }
+ }
+ spin_unlock_bh(&nr_neigh_list_lock);
+
+ if (nr_neigh == NULL)
+ return;
+
+ nr_neigh->ax25 = NULL;
+ ax25_cb_put(ax25);
+
+ if (++nr_neigh->failed < sysctl_netrom_link_fails_count) {
+ nr_neigh_put(nr_neigh);
+ return;
+ }
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each(nr_node, &nr_node_list) {
+ nr_node_lock(nr_node);
+ if (nr_node->which < nr_node->count &&
+ nr_node->routes[nr_node->which].neighbour == nr_neigh)
+ nr_node->which++;
+ nr_node_unlock(nr_node);
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+ nr_neigh_put(nr_neigh);
+}
+
+/*
+ * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb
+ * indicates an internally generated frame.
+ */
+int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25)
+{
+ ax25_address *nr_src, *nr_dest;
+ struct nr_neigh *nr_neigh;
+ struct nr_node *nr_node;
+ struct net_device *dev;
+ unsigned char *dptr;
+ ax25_cb *ax25s;
+ int ret;
+ struct sk_buff *skbn;
+
+
+ nr_src = (ax25_address *)(skb->data + 0);
+ nr_dest = (ax25_address *)(skb->data + 7);
+
+ if (ax25 != NULL) {
+ ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat,
+ ax25->ax25_dev->dev, 0,
+ sysctl_netrom_obsolescence_count_initialiser);
+ if (ret)
+ return ret;
+ }
+
+ if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */
+ if (ax25 == NULL) /* Its from me */
+ ret = nr_loopback_queue(skb);
+ else
+ ret = nr_rx_frame(skb, dev);
+ dev_put(dev);
+ return ret;
+ }
+
+ if (!sysctl_netrom_routing_control && ax25 != NULL)
+ return 0;
+
+ /* Its Time-To-Live has expired */
+ if (skb->data[14] == 1) {
+ return 0;
+ }
+
+ nr_node = nr_node_get(nr_dest);
+ if (nr_node == NULL)
+ return 0;
+ nr_node_lock(nr_node);
+
+ if (nr_node->which >= nr_node->count) {
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+ return 0;
+ }
+
+ nr_neigh = nr_node->routes[nr_node->which].neighbour;
+
+ if ((dev = nr_dev_first()) == NULL) {
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+ return 0;
+ }
+
+ /* We are going to change the netrom headers so we should get our
+ own skb, we also did not know until now how much header space
+ we had to reserve... - RXQ */
+ if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) {
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+ dev_put(dev);
+ return 0;
+ }
+ kfree_skb(skb);
+ skb=skbn;
+ skb->data[14]--;
+
+ dptr = skb_push(skb, 1);
+ *dptr = AX25_P_NETROM;
+
+ ax25s = nr_neigh->ax25;
+ nr_neigh->ax25 = ax25_send_frame(skb, 256,
+ (ax25_address *)dev->dev_addr,
+ &nr_neigh->callsign,
+ nr_neigh->digipeat, nr_neigh->dev);
+ if (ax25s)
+ ax25_cb_put(ax25s);
+
+ dev_put(dev);
+ ret = (nr_neigh->ax25 != NULL);
+ nr_node_unlock(nr_node);
+ nr_node_put(nr_node);
+
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *nr_node_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_bh(&nr_node_list_lock);
+ return seq_hlist_start_head(&nr_node_list, *pos);
+}
+
+static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &nr_node_list, pos);
+}
+
+static void nr_node_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_bh(&nr_node_list_lock);
+}
+
+static int nr_node_show(struct seq_file *seq, void *v)
+{
+ char buf[11];
+ int i;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n");
+ else {
+ struct nr_node *nr_node = hlist_entry(v, struct nr_node,
+ node_node);
+
+ nr_node_lock(nr_node);
+ seq_printf(seq, "%-9s %-7s %d %d",
+ ax2asc(buf, &nr_node->callsign),
+ (nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic,
+ nr_node->which + 1,
+ nr_node->count);
+
+ for (i = 0; i < nr_node->count; i++) {
+ seq_printf(seq, " %3d %d %05d",
+ nr_node->routes[i].quality,
+ nr_node->routes[i].obs_count,
+ nr_node->routes[i].neighbour->number);
+ }
+ nr_node_unlock(nr_node);
+
+ seq_puts(seq, "\n");
+ }
+ return 0;
+}
+
+static const struct seq_operations nr_node_seqops = {
+ .start = nr_node_start,
+ .next = nr_node_next,
+ .stop = nr_node_stop,
+ .show = nr_node_show,
+};
+
+static int nr_node_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nr_node_seqops);
+}
+
+const struct file_operations nr_nodes_fops = {
+ .owner = THIS_MODULE,
+ .open = nr_node_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *nr_neigh_start(struct seq_file *seq, loff_t *pos)
+{
+ spin_lock_bh(&nr_neigh_list_lock);
+ return seq_hlist_start_head(&nr_neigh_list, *pos);
+}
+
+static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &nr_neigh_list, pos);
+}
+
+static void nr_neigh_stop(struct seq_file *seq, void *v)
+{
+ spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+static int nr_neigh_show(struct seq_file *seq, void *v)
+{
+ char buf[11];
+ int i;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n");
+ else {
+ struct nr_neigh *nr_neigh;
+
+ nr_neigh = hlist_entry(v, struct nr_neigh, neigh_node);
+ seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d",
+ nr_neigh->number,
+ ax2asc(buf, &nr_neigh->callsign),
+ nr_neigh->dev ? nr_neigh->dev->name : "???",
+ nr_neigh->quality,
+ nr_neigh->locked,
+ nr_neigh->count,
+ nr_neigh->failed);
+
+ if (nr_neigh->digipeat != NULL) {
+ for (i = 0; i < nr_neigh->digipeat->ndigi; i++)
+ seq_printf(seq, " %s",
+ ax2asc(buf, &nr_neigh->digipeat->calls[i]));
+ }
+
+ seq_puts(seq, "\n");
+ }
+ return 0;
+}
+
+static const struct seq_operations nr_neigh_seqops = {
+ .start = nr_neigh_start,
+ .next = nr_neigh_next,
+ .stop = nr_neigh_stop,
+ .show = nr_neigh_show,
+};
+
+static int nr_neigh_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nr_neigh_seqops);
+}
+
+const struct file_operations nr_neigh_fops = {
+ .owner = THIS_MODULE,
+ .open = nr_neigh_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif
+
+/*
+ * Free all memory associated with the nodes and routes lists.
+ */
+void __exit nr_rt_free(void)
+{
+ struct nr_neigh *s = NULL;
+ struct nr_node *t = NULL;
+ struct hlist_node *nodet;
+
+ spin_lock_bh(&nr_neigh_list_lock);
+ spin_lock_bh(&nr_node_list_lock);
+ nr_node_for_each_safe(t, nodet, &nr_node_list) {
+ nr_node_lock(t);
+ nr_remove_node_locked(t);
+ nr_node_unlock(t);
+ }
+ nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) {
+ while(s->count) {
+ s->count--;
+ nr_neigh_put(s);
+ }
+ nr_remove_neigh_locked(s);
+ }
+ spin_unlock_bh(&nr_node_list_lock);
+ spin_unlock_bh(&nr_neigh_list_lock);
+}
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
new file mode 100644
index 000000000..029c8bb90
--- /dev/null
+++ b/net/netrom/nr_subr.c
@@ -0,0 +1,281 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+/*
+ * This routine purges all of the queues of frames.
+ */
+void nr_clear_queues(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ skb_queue_purge(&sk->sk_write_queue);
+ skb_queue_purge(&nr->ack_queue);
+ skb_queue_purge(&nr->reseq_queue);
+ skb_queue_purge(&nr->frag_queue);
+}
+
+/*
+ * This routine purges the input queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+ */
+void nr_frames_acked(struct sock *sk, unsigned short nr)
+{
+ struct nr_sock *nrom = nr_sk(sk);
+ struct sk_buff *skb;
+
+ /*
+ * Remove all the ack-ed frames from the ack queue.
+ */
+ if (nrom->va != nr) {
+ while (skb_peek(&nrom->ack_queue) != NULL && nrom->va != nr) {
+ skb = skb_dequeue(&nrom->ack_queue);
+ kfree_skb(skb);
+ nrom->va = (nrom->va + 1) % NR_MODULUS;
+ }
+ }
+}
+
+/*
+ * Requeue all the un-ack-ed frames on the output queue to be picked
+ * up by nr_kick called from the timer. This arrangement handles the
+ * possibility of an empty output queue.
+ */
+void nr_requeue_frames(struct sock *sk)
+{
+ struct sk_buff *skb, *skb_prev = NULL;
+
+ while ((skb = skb_dequeue(&nr_sk(sk)->ack_queue)) != NULL) {
+ if (skb_prev == NULL)
+ skb_queue_head(&sk->sk_write_queue, skb);
+ else
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
+ skb_prev = skb;
+ }
+}
+
+/*
+ * Validate that the value of nr is between va and vs. Return true or
+ * false for testing.
+ */
+int nr_validate_nr(struct sock *sk, unsigned short nr)
+{
+ struct nr_sock *nrom = nr_sk(sk);
+ unsigned short vc = nrom->va;
+
+ while (vc != nrom->vs) {
+ if (nr == vc) return 1;
+ vc = (vc + 1) % NR_MODULUS;
+ }
+
+ return nr == nrom->vs;
+}
+
+/*
+ * Check that ns is within the receive window.
+ */
+int nr_in_rx_window(struct sock *sk, unsigned short ns)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ unsigned short vc = nr->vr;
+ unsigned short vt = (nr->vl + nr->window) % NR_MODULUS;
+
+ while (vc != vt) {
+ if (ns == vc) return 1;
+ vc = (vc + 1) % NR_MODULUS;
+ }
+
+ return 0;
+}
+
+/*
+ * This routine is called when the HDLC layer internally generates a
+ * control frame.
+ */
+void nr_write_internal(struct sock *sk, int frametype)
+{
+ struct nr_sock *nr = nr_sk(sk);
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ int len, timeout;
+
+ len = NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+
+ switch (frametype & 0x0F) {
+ case NR_CONNREQ:
+ len += 17;
+ break;
+ case NR_CONNACK:
+ len += (nr->bpqext) ? 2 : 1;
+ break;
+ case NR_DISCREQ:
+ case NR_DISCACK:
+ case NR_INFOACK:
+ break;
+ default:
+ printk(KERN_ERR "NET/ROM: nr_write_internal - invalid frame type %d\n", frametype);
+ return;
+ }
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ /*
+ * Space for AX.25 and NET/ROM network header
+ */
+ skb_reserve(skb, NR_NETWORK_LEN);
+
+ dptr = skb_put(skb, skb_tailroom(skb));
+
+ switch (frametype & 0x0F) {
+ case NR_CONNREQ:
+ timeout = nr->t1 / HZ;
+ *dptr++ = nr->my_index;
+ *dptr++ = nr->my_id;
+ *dptr++ = 0;
+ *dptr++ = 0;
+ *dptr++ = frametype;
+ *dptr++ = nr->window;
+ memcpy(dptr, &nr->user_addr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] &= ~AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+ memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] &= ~AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+ *dptr++ = timeout % 256;
+ *dptr++ = timeout / 256;
+ break;
+
+ case NR_CONNACK:
+ *dptr++ = nr->your_index;
+ *dptr++ = nr->your_id;
+ *dptr++ = nr->my_index;
+ *dptr++ = nr->my_id;
+ *dptr++ = frametype;
+ *dptr++ = nr->window;
+ if (nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser;
+ break;
+
+ case NR_DISCREQ:
+ case NR_DISCACK:
+ *dptr++ = nr->your_index;
+ *dptr++ = nr->your_id;
+ *dptr++ = 0;
+ *dptr++ = 0;
+ *dptr++ = frametype;
+ break;
+
+ case NR_INFOACK:
+ *dptr++ = nr->your_index;
+ *dptr++ = nr->your_id;
+ *dptr++ = 0;
+ *dptr++ = nr->vr;
+ *dptr++ = frametype;
+ break;
+ }
+
+ nr_transmit_buffer(sk, skb);
+}
+
+/*
+ * This routine is called to send an error reply.
+ */
+void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags)
+{
+ struct sk_buff *skbn;
+ unsigned char *dptr;
+ int len;
+
+ len = NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1;
+
+ if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skbn, 0);
+
+ dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+
+ skb_copy_from_linear_data_offset(skb, 7, dptr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] &= ~AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+
+ skb_copy_from_linear_data(skb, dptr, AX25_ADDR_LEN);
+ dptr[6] &= ~AX25_CBIT;
+ dptr[6] |= AX25_EBIT;
+ dptr[6] |= AX25_SSSID_SPARE;
+ dptr += AX25_ADDR_LEN;
+
+ *dptr++ = sysctl_netrom_network_ttl_initialiser;
+
+ if (mine) {
+ *dptr++ = 0;
+ *dptr++ = 0;
+ *dptr++ = skb->data[15];
+ *dptr++ = skb->data[16];
+ } else {
+ *dptr++ = skb->data[15];
+ *dptr++ = skb->data[16];
+ *dptr++ = 0;
+ *dptr++ = 0;
+ }
+
+ *dptr++ = cmdflags;
+ *dptr++ = 0;
+
+ if (!nr_route_frame(skbn, NULL))
+ kfree_skb(skbn);
+}
+
+void nr_disconnect(struct sock *sk, int reason)
+{
+ nr_stop_t1timer(sk);
+ nr_stop_t2timer(sk);
+ nr_stop_t4timer(sk);
+ nr_stop_idletimer(sk);
+
+ nr_clear_queues(sk);
+
+ nr_sk(sk)->state = NR_STATE_0;
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = reason;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+}
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
new file mode 100644
index 000000000..94d05806a
--- /dev/null
+++ b/net/netrom/nr_timer.c
@@ -0,0 +1,248 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+static void nr_heartbeat_expiry(unsigned long);
+static void nr_t1timer_expiry(unsigned long);
+static void nr_t2timer_expiry(unsigned long);
+static void nr_t4timer_expiry(unsigned long);
+static void nr_idletimer_expiry(unsigned long);
+
+void nr_init_timers(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk);
+ setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk);
+ setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk);
+ setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk);
+
+ /* initialized by sock_init_data */
+ sk->sk_timer.data = (unsigned long)sk;
+ sk->sk_timer.function = &nr_heartbeat_expiry;
+}
+
+void nr_start_t1timer(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ mod_timer(&nr->t1timer, jiffies + nr->t1);
+}
+
+void nr_start_t2timer(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ mod_timer(&nr->t2timer, jiffies + nr->t2);
+}
+
+void nr_start_t4timer(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ mod_timer(&nr->t4timer, jiffies + nr->t4);
+}
+
+void nr_start_idletimer(struct sock *sk)
+{
+ struct nr_sock *nr = nr_sk(sk);
+
+ if (nr->idle > 0)
+ mod_timer(&nr->idletimer, jiffies + nr->idle);
+}
+
+void nr_start_heartbeat(struct sock *sk)
+{
+ mod_timer(&sk->sk_timer, jiffies + 5 * HZ);
+}
+
+void nr_stop_t1timer(struct sock *sk)
+{
+ del_timer(&nr_sk(sk)->t1timer);
+}
+
+void nr_stop_t2timer(struct sock *sk)
+{
+ del_timer(&nr_sk(sk)->t2timer);
+}
+
+void nr_stop_t4timer(struct sock *sk)
+{
+ del_timer(&nr_sk(sk)->t4timer);
+}
+
+void nr_stop_idletimer(struct sock *sk)
+{
+ del_timer(&nr_sk(sk)->idletimer);
+}
+
+void nr_stop_heartbeat(struct sock *sk)
+{
+ del_timer(&sk->sk_timer);
+}
+
+int nr_t1timer_running(struct sock *sk)
+{
+ return timer_pending(&nr_sk(sk)->t1timer);
+}
+
+static void nr_heartbeat_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct nr_sock *nr = nr_sk(sk);
+
+ bh_lock_sock(sk);
+ switch (nr->state) {
+ case NR_STATE_0:
+ /* Magic here: If we listen() and a new link dies before it
+ is accepted() it isn't 'dead' so doesn't get removed. */
+ if (sock_flag(sk, SOCK_DESTROY) ||
+ (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+ sock_hold(sk);
+ bh_unlock_sock(sk);
+ nr_destroy_socket(sk);
+ sock_put(sk);
+ return;
+ }
+ break;
+
+ case NR_STATE_3:
+ /*
+ * Check for the state of the receive buffer.
+ */
+ if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
+ (nr->condition & NR_COND_OWN_RX_BUSY)) {
+ nr->condition &= ~NR_COND_OWN_RX_BUSY;
+ nr->condition &= ~NR_COND_ACK_PENDING;
+ nr->vl = nr->vr;
+ nr_write_internal(sk, NR_INFOACK);
+ break;
+ }
+ break;
+ }
+
+ nr_start_heartbeat(sk);
+ bh_unlock_sock(sk);
+}
+
+static void nr_t2timer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct nr_sock *nr = nr_sk(sk);
+
+ bh_lock_sock(sk);
+ if (nr->condition & NR_COND_ACK_PENDING) {
+ nr->condition &= ~NR_COND_ACK_PENDING;
+ nr_enquiry_response(sk);
+ }
+ bh_unlock_sock(sk);
+}
+
+static void nr_t4timer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+
+ bh_lock_sock(sk);
+ nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY;
+ bh_unlock_sock(sk);
+}
+
+static void nr_idletimer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct nr_sock *nr = nr_sk(sk);
+
+ bh_lock_sock(sk);
+
+ nr_clear_queues(sk);
+
+ nr->n2count = 0;
+ nr_write_internal(sk, NR_DISCREQ);
+ nr->state = NR_STATE_2;
+
+ nr_start_t1timer(sk);
+ nr_stop_t2timer(sk);
+ nr_stop_t4timer(sk);
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = 0;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+ bh_unlock_sock(sk);
+}
+
+static void nr_t1timer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct nr_sock *nr = nr_sk(sk);
+
+ bh_lock_sock(sk);
+ switch (nr->state) {
+ case NR_STATE_1:
+ if (nr->n2count == nr->n2) {
+ nr_disconnect(sk, ETIMEDOUT);
+ bh_unlock_sock(sk);
+ return;
+ } else {
+ nr->n2count++;
+ nr_write_internal(sk, NR_CONNREQ);
+ }
+ break;
+
+ case NR_STATE_2:
+ if (nr->n2count == nr->n2) {
+ nr_disconnect(sk, ETIMEDOUT);
+ bh_unlock_sock(sk);
+ return;
+ } else {
+ nr->n2count++;
+ nr_write_internal(sk, NR_DISCREQ);
+ }
+ break;
+
+ case NR_STATE_3:
+ if (nr->n2count == nr->n2) {
+ nr_disconnect(sk, ETIMEDOUT);
+ bh_unlock_sock(sk);
+ return;
+ } else {
+ nr->n2count++;
+ nr_requeue_frames(sk);
+ }
+ break;
+ }
+
+ nr_start_t1timer(sk);
+ bh_unlock_sock(sk);
+}
diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c
new file mode 100644
index 000000000..ba1c368b3
--- /dev/null
+++ b/net/netrom/sysctl_net_netrom.c
@@ -0,0 +1,157 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+/*
+ * Values taken from NET/ROM documentation.
+ */
+static int min_quality[] = {0}, max_quality[] = {255};
+static int min_obs[] = {0}, max_obs[] = {255};
+static int min_ttl[] = {0}, max_ttl[] = {255};
+static int min_t1[] = {5 * HZ};
+static int max_t1[] = {600 * HZ};
+static int min_n2[] = {2}, max_n2[] = {127};
+static int min_t2[] = {1 * HZ};
+static int max_t2[] = {60 * HZ};
+static int min_t4[] = {1 * HZ};
+static int max_t4[] = {1000 * HZ};
+static int min_window[] = {1}, max_window[] = {127};
+static int min_idle[] = {0 * HZ};
+static int max_idle[] = {65535 * HZ};
+static int min_route[] = {0}, max_route[] = {1};
+static int min_fails[] = {1}, max_fails[] = {10};
+static int min_reset[] = {0}, max_reset[] = {1};
+
+static struct ctl_table_header *nr_table_header;
+
+static struct ctl_table nr_table[] = {
+ {
+ .procname = "default_path_quality",
+ .data = &sysctl_netrom_default_path_quality,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_quality,
+ .extra2 = &max_quality
+ },
+ {
+ .procname = "obsolescence_count_initialiser",
+ .data = &sysctl_netrom_obsolescence_count_initialiser,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_obs,
+ .extra2 = &max_obs
+ },
+ {
+ .procname = "network_ttl_initialiser",
+ .data = &sysctl_netrom_network_ttl_initialiser,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ttl,
+ .extra2 = &max_ttl
+ },
+ {
+ .procname = "transport_timeout",
+ .data = &sysctl_netrom_transport_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t1,
+ .extra2 = &max_t1
+ },
+ {
+ .procname = "transport_maximum_tries",
+ .data = &sysctl_netrom_transport_maximum_tries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_n2,
+ .extra2 = &max_n2
+ },
+ {
+ .procname = "transport_acknowledge_delay",
+ .data = &sysctl_netrom_transport_acknowledge_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t2,
+ .extra2 = &max_t2
+ },
+ {
+ .procname = "transport_busy_delay",
+ .data = &sysctl_netrom_transport_busy_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_t4,
+ .extra2 = &max_t4
+ },
+ {
+ .procname = "transport_requested_window_size",
+ .data = &sysctl_netrom_transport_requested_window_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_window,
+ .extra2 = &max_window
+ },
+ {
+ .procname = "transport_no_activity_timeout",
+ .data = &sysctl_netrom_transport_no_activity_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_idle,
+ .extra2 = &max_idle
+ },
+ {
+ .procname = "routing_control",
+ .data = &sysctl_netrom_routing_control,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_route,
+ .extra2 = &max_route
+ },
+ {
+ .procname = "link_fails_count",
+ .data = &sysctl_netrom_link_fails_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_fails,
+ .extra2 = &max_fails
+ },
+ {
+ .procname = "reset",
+ .data = &sysctl_netrom_reset_circuit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_reset,
+ .extra2 = &max_reset
+ },
+ { }
+};
+
+void __init nr_register_sysctl(void)
+{
+ nr_table_header = register_net_sysctl(&init_net, "net/netrom", nr_table);
+}
+
+void nr_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(nr_table_header);
+}
diff --git a/net/nfc/Kconfig b/net/nfc/Kconfig
new file mode 100644
index 000000000..6e0fa0cce
--- /dev/null
+++ b/net/nfc/Kconfig
@@ -0,0 +1,34 @@
+#
+# NFC sybsystem configuration
+#
+
+menuconfig NFC
+ depends on NET
+ depends on RFKILL || !RFKILL
+ tristate "NFC subsystem support"
+ default n
+ help
+ Say Y here if you want to build support for NFC (Near field
+ communication) devices.
+
+ To compile this support as a module, choose M here: the module will
+ be called nfc.
+
+config NFC_DIGITAL
+ depends on NFC
+ select CRC_CCITT
+ select CRC_ITU_T
+ tristate "NFC Digital Protocol stack support"
+ default n
+ help
+ Say Y if you want to build NFC digital protocol stack support.
+ This is needed by NFC chipsets whose firmware only implement
+ the NFC analog layer.
+
+ To compile this support as a module, choose M here: the module will
+ be called nfc_digital.
+
+source "net/nfc/nci/Kconfig"
+source "net/nfc/hci/Kconfig"
+
+source "drivers/nfc/Kconfig"
diff --git a/net/nfc/Makefile b/net/nfc/Makefile
new file mode 100644
index 000000000..2555ff8e7
--- /dev/null
+++ b/net/nfc/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the Linux NFC subsystem.
+#
+
+obj-$(CONFIG_NFC) += nfc.o
+obj-$(CONFIG_NFC_NCI) += nci/
+obj-$(CONFIG_NFC_HCI) += hci/
+obj-$(CONFIG_NFC_DIGITAL) += nfc_digital.o
+
+nfc-objs := core.o netlink.o af_nfc.o rawsock.o llcp_core.o llcp_commands.o \
+ llcp_sock.o
+
+nfc_digital-objs := digital_core.o digital_technology.o digital_dep.o
diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c
new file mode 100644
index 000000000..2277276f5
--- /dev/null
+++ b/net/nfc/af_nfc.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ * Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ * Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/nfc.h>
+#include <linux/module.h>
+
+#include "nfc.h"
+
+static DEFINE_RWLOCK(proto_tab_lock);
+static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX];
+
+static int nfc_sock_create(struct net *net, struct socket *sock, int proto,
+ int kern)
+{
+ int rc = -EPROTONOSUPPORT;
+
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
+ if (proto < 0 || proto >= NFC_SOCKPROTO_MAX)
+ return -EINVAL;
+
+ read_lock(&proto_tab_lock);
+ if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) {
+ rc = proto_tab[proto]->create(net, sock, proto_tab[proto]);
+ module_put(proto_tab[proto]->owner);
+ }
+ read_unlock(&proto_tab_lock);
+
+ return rc;
+}
+
+static struct net_proto_family nfc_sock_family_ops = {
+ .owner = THIS_MODULE,
+ .family = PF_NFC,
+ .create = nfc_sock_create,
+};
+
+int nfc_proto_register(const struct nfc_protocol *nfc_proto)
+{
+ int rc;
+
+ if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX)
+ return -EINVAL;
+
+ rc = proto_register(nfc_proto->proto, 0);
+ if (rc)
+ return rc;
+
+ write_lock(&proto_tab_lock);
+ if (proto_tab[nfc_proto->id])
+ rc = -EBUSY;
+ else
+ proto_tab[nfc_proto->id] = nfc_proto;
+ write_unlock(&proto_tab_lock);
+
+ return rc;
+}
+EXPORT_SYMBOL(nfc_proto_register);
+
+void nfc_proto_unregister(const struct nfc_protocol *nfc_proto)
+{
+ write_lock(&proto_tab_lock);
+ proto_tab[nfc_proto->id] = NULL;
+ write_unlock(&proto_tab_lock);
+
+ proto_unregister(nfc_proto->proto);
+}
+EXPORT_SYMBOL(nfc_proto_unregister);
+
+int __init af_nfc_init(void)
+{
+ return sock_register(&nfc_sock_family_ops);
+}
+
+void af_nfc_exit(void)
+{
+ sock_unregister(PF_NFC);
+}
diff --git a/net/nfc/core.c b/net/nfc/core.c
new file mode 100644
index 000000000..cff3f1614
--- /dev/null
+++ b/net/nfc/core.c
@@ -0,0 +1,1233 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ * Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ * Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rfkill.h>
+#include <linux/nfc.h>
+
+#include <net/genetlink.h>
+
+#include "nfc.h"
+
+#define VERSION "0.1"
+
+#define NFC_CHECK_PRES_FREQ_MS 2000
+
+int nfc_devlist_generation;
+DEFINE_MUTEX(nfc_devlist_mutex);
+
+/* NFC device ID bitmap */
+static DEFINE_IDA(nfc_index_ida);
+
+int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name)
+{
+ int rc = 0;
+
+ pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->dev_up) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ if (!dev->ops->fw_download) {
+ rc = -EOPNOTSUPP;
+ goto error;
+ }
+
+ dev->fw_download_in_progress = true;
+ rc = dev->ops->fw_download(dev, firmware_name);
+ if (rc)
+ dev->fw_download_in_progress = false;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+/**
+ * nfc_fw_download_done - inform that a firmware download was completed
+ *
+ * @dev: The nfc device to which firmware was downloaded
+ * @firmware_name: The firmware filename
+ * @result: The positive value of a standard errno value
+ */
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+ u32 result)
+{
+ dev->fw_download_in_progress = false;
+
+ return nfc_genl_fw_download_done(dev, firmware_name, result);
+}
+EXPORT_SYMBOL(nfc_fw_download_done);
+
+/**
+ * nfc_dev_up - turn on the NFC device
+ *
+ * @dev: The nfc device to be turned on
+ *
+ * The device remains up until the nfc_dev_down function is called.
+ */
+int nfc_dev_up(struct nfc_dev *dev)
+{
+ int rc = 0;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ device_lock(&dev->dev);
+
+ if (dev->rfkill && rfkill_blocked(dev->rfkill)) {
+ rc = -ERFKILL;
+ goto error;
+ }
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->fw_download_in_progress) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ if (dev->dev_up) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ if (dev->ops->dev_up)
+ rc = dev->ops->dev_up(dev);
+
+ if (!rc)
+ dev->dev_up = true;
+
+ /* We have to enable the device before discovering SEs */
+ if (dev->ops->discover_se && dev->ops->discover_se(dev))
+ pr_err("SE discovery failed\n");
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+/**
+ * nfc_dev_down - turn off the NFC device
+ *
+ * @dev: The nfc device to be turned off
+ */
+int nfc_dev_down(struct nfc_dev *dev)
+{
+ int rc = 0;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->dev_up) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ if (dev->polling || dev->active_target) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ if (dev->ops->dev_down)
+ dev->ops->dev_down(dev);
+
+ dev->dev_up = false;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+static int nfc_rfkill_set_block(void *data, bool blocked)
+{
+ struct nfc_dev *dev = data;
+
+ pr_debug("%s blocked %d", dev_name(&dev->dev), blocked);
+
+ if (!blocked)
+ return 0;
+
+ nfc_dev_down(dev);
+
+ return 0;
+}
+
+static const struct rfkill_ops nfc_rfkill_ops = {
+ .set_block = nfc_rfkill_set_block,
+};
+
+/**
+ * nfc_start_poll - start polling for nfc targets
+ *
+ * @dev: The nfc device that must start polling
+ * @protocols: bitset of nfc protocols that must be used for polling
+ *
+ * The device remains polling for targets until a target is found or
+ * the nfc_stop_poll function is called.
+ */
+int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols)
+{
+ int rc;
+
+ pr_debug("dev_name %s initiator protocols 0x%x target protocols 0x%x\n",
+ dev_name(&dev->dev), im_protocols, tm_protocols);
+
+ if (!im_protocols && !tm_protocols)
+ return -EINVAL;
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->dev_up) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->polling) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ rc = dev->ops->start_poll(dev, im_protocols, tm_protocols);
+ if (!rc) {
+ dev->polling = true;
+ dev->rf_mode = NFC_RF_NONE;
+ }
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+/**
+ * nfc_stop_poll - stop polling for nfc targets
+ *
+ * @dev: The nfc device that must stop polling
+ */
+int nfc_stop_poll(struct nfc_dev *dev)
+{
+ int rc = 0;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->polling) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ dev->ops->stop_poll(dev);
+ dev->polling = false;
+ dev->rf_mode = NFC_RF_NONE;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx)
+{
+ int i;
+
+ for (i = 0; i < dev->n_targets; i++) {
+ if (dev->targets[i].idx == target_idx)
+ return &dev->targets[i];
+ }
+
+ return NULL;
+}
+
+int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode)
+{
+ int rc = 0;
+ u8 *gb;
+ size_t gb_len;
+ struct nfc_target *target;
+
+ pr_debug("dev_name=%s comm %d\n", dev_name(&dev->dev), comm_mode);
+
+ if (!dev->ops->dep_link_up)
+ return -EOPNOTSUPP;
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->dep_link_up == true) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ gb = nfc_llcp_general_bytes(dev, &gb_len);
+ if (gb_len > NFC_MAX_GT_LEN) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ target = nfc_find_target(dev, target_index);
+ if (target == NULL) {
+ rc = -ENOTCONN;
+ goto error;
+ }
+
+ rc = dev->ops->dep_link_up(dev, target, comm_mode, gb, gb_len);
+ if (!rc) {
+ dev->active_target = target;
+ dev->rf_mode = NFC_RF_INITIATOR;
+ }
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+int nfc_dep_link_down(struct nfc_dev *dev)
+{
+ int rc = 0;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ if (!dev->ops->dep_link_down)
+ return -EOPNOTSUPP;
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->dep_link_up == false) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ rc = dev->ops->dep_link_down(dev);
+ if (!rc) {
+ dev->dep_link_up = false;
+ dev->active_target = NULL;
+ dev->rf_mode = NFC_RF_NONE;
+ nfc_llcp_mac_is_down(dev);
+ nfc_genl_dep_link_down_event(dev);
+ }
+
+error:
+ device_unlock(&dev->dev);
+
+ return rc;
+}
+
+int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx,
+ u8 comm_mode, u8 rf_mode)
+{
+ dev->dep_link_up = true;
+
+ if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) {
+ struct nfc_target *target;
+
+ target = nfc_find_target(dev, target_idx);
+ if (target == NULL)
+ return -ENOTCONN;
+
+ dev->active_target = target;
+ }
+
+ dev->polling = false;
+ dev->rf_mode = rf_mode;
+
+ nfc_llcp_mac_is_up(dev, target_idx, comm_mode, rf_mode);
+
+ return nfc_genl_dep_link_up_event(dev, target_idx, comm_mode, rf_mode);
+}
+EXPORT_SYMBOL(nfc_dep_link_is_up);
+
+/**
+ * nfc_activate_target - prepare the target for data exchange
+ *
+ * @dev: The nfc device that found the target
+ * @target_idx: index of the target that must be activated
+ * @protocol: nfc protocol that will be used for data exchange
+ */
+int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol)
+{
+ int rc;
+ struct nfc_target *target;
+
+ pr_debug("dev_name=%s target_idx=%u protocol=%u\n",
+ dev_name(&dev->dev), target_idx, protocol);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->active_target) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ target = nfc_find_target(dev, target_idx);
+ if (target == NULL) {
+ rc = -ENOTCONN;
+ goto error;
+ }
+
+ rc = dev->ops->activate_target(dev, target, protocol);
+ if (!rc) {
+ dev->active_target = target;
+ dev->rf_mode = NFC_RF_INITIATOR;
+
+ if (dev->ops->check_presence && !dev->shutting_down)
+ mod_timer(&dev->check_pres_timer, jiffies +
+ msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
+ }
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+/**
+ * nfc_deactivate_target - deactivate a nfc target
+ *
+ * @dev: The nfc device that found the target
+ * @target_idx: index of the target that must be deactivated
+ */
+int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx)
+{
+ int rc = 0;
+
+ pr_debug("dev_name=%s target_idx=%u\n",
+ dev_name(&dev->dev), target_idx);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->active_target == NULL) {
+ rc = -ENOTCONN;
+ goto error;
+ }
+
+ if (dev->active_target->idx != target_idx) {
+ rc = -ENOTCONN;
+ goto error;
+ }
+
+ if (dev->ops->check_presence)
+ del_timer_sync(&dev->check_pres_timer);
+
+ dev->ops->deactivate_target(dev, dev->active_target);
+ dev->active_target = NULL;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+/**
+ * nfc_data_exchange - transceive data
+ *
+ * @dev: The nfc device that found the target
+ * @target_idx: index of the target
+ * @skb: data to be sent
+ * @cb: callback called when the response is received
+ * @cb_context: parameter for the callback function
+ *
+ * The user must wait for the callback before calling this function again.
+ */
+int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb,
+ data_exchange_cb_t cb, void *cb_context)
+{
+ int rc;
+
+ pr_debug("dev_name=%s target_idx=%u skb->len=%u\n",
+ dev_name(&dev->dev), target_idx, skb->len);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ if (dev->rf_mode == NFC_RF_INITIATOR && dev->active_target != NULL) {
+ if (dev->active_target->idx != target_idx) {
+ rc = -EADDRNOTAVAIL;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ if (dev->ops->check_presence)
+ del_timer_sync(&dev->check_pres_timer);
+
+ rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb,
+ cb_context);
+
+ if (!rc && dev->ops->check_presence && !dev->shutting_down)
+ mod_timer(&dev->check_pres_timer, jiffies +
+ msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
+ } else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) {
+ rc = dev->ops->tm_send(dev, skb);
+ } else {
+ rc = -ENOTCONN;
+ kfree_skb(skb);
+ goto error;
+ }
+
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx)
+{
+ struct nfc_se *se;
+
+ list_for_each_entry(se, &dev->secure_elements, list)
+ if (se->idx == se_idx)
+ return se;
+
+ return NULL;
+}
+EXPORT_SYMBOL(nfc_find_se);
+
+int nfc_enable_se(struct nfc_dev *dev, u32 se_idx)
+{
+ struct nfc_se *se;
+ int rc;
+
+ pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->dev_up) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (dev->polling) {
+ rc = -EBUSY;
+ goto error;
+ }
+
+ if (!dev->ops->enable_se || !dev->ops->disable_se) {
+ rc = -EOPNOTSUPP;
+ goto error;
+ }
+
+ se = nfc_find_se(dev, se_idx);
+ if (!se) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ if (se->state == NFC_SE_ENABLED) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ rc = dev->ops->enable_se(dev, se_idx);
+ if (rc >= 0)
+ se->state = NFC_SE_ENABLED;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+int nfc_disable_se(struct nfc_dev *dev, u32 se_idx)
+{
+ struct nfc_se *se;
+ int rc;
+
+ pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->dev_up) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->ops->enable_se || !dev->ops->disable_se) {
+ rc = -EOPNOTSUPP;
+ goto error;
+ }
+
+ se = nfc_find_se(dev, se_idx);
+ if (!se) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ if (se->state == NFC_SE_DISABLED) {
+ rc = -EALREADY;
+ goto error;
+ }
+
+ rc = dev->ops->disable_se(dev, se_idx);
+ if (rc >= 0)
+ se->state = NFC_SE_DISABLED;
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len)
+{
+ pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len);
+
+ return nfc_llcp_set_remote_gb(dev, gb, gb_len);
+}
+EXPORT_SYMBOL(nfc_set_remote_general_bytes);
+
+u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len)
+{
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ return nfc_llcp_general_bytes(dev, gb_len);
+}
+EXPORT_SYMBOL(nfc_get_local_general_bytes);
+
+int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb)
+{
+ /* Only LLCP target mode for now */
+ if (dev->dep_link_up == false) {
+ kfree_skb(skb);
+ return -ENOLINK;
+ }
+
+ return nfc_llcp_data_received(dev, skb);
+}
+EXPORT_SYMBOL(nfc_tm_data_received);
+
+int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode,
+ u8 *gb, size_t gb_len)
+{
+ int rc;
+
+ device_lock(&dev->dev);
+
+ dev->polling = false;
+
+ if (gb != NULL) {
+ rc = nfc_set_remote_general_bytes(dev, gb, gb_len);
+ if (rc < 0)
+ goto out;
+ }
+
+ dev->rf_mode = NFC_RF_TARGET;
+
+ if (protocol == NFC_PROTO_NFC_DEP_MASK)
+ nfc_dep_link_is_up(dev, 0, comm_mode, NFC_RF_TARGET);
+
+ rc = nfc_genl_tm_activated(dev, protocol);
+
+out:
+ device_unlock(&dev->dev);
+
+ return rc;
+}
+EXPORT_SYMBOL(nfc_tm_activated);
+
+int nfc_tm_deactivated(struct nfc_dev *dev)
+{
+ dev->dep_link_up = false;
+ dev->rf_mode = NFC_RF_NONE;
+
+ return nfc_genl_tm_deactivated(dev);
+}
+EXPORT_SYMBOL(nfc_tm_deactivated);
+
+/**
+ * nfc_alloc_send_skb - allocate a skb for data exchange responses
+ *
+ * @size: size to allocate
+ * @gfp: gfp flags
+ */
+struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
+ unsigned int flags, unsigned int size,
+ unsigned int *err)
+{
+ struct sk_buff *skb;
+ unsigned int total_size;
+
+ total_size = size +
+ dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;
+
+ skb = sock_alloc_send_skb(sk, total_size, flags & MSG_DONTWAIT, err);
+ if (skb)
+ skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);
+
+ return skb;
+}
+
+/**
+ * nfc_alloc_recv_skb - allocate a skb for data exchange responses
+ *
+ * @size: size to allocate
+ * @gfp: gfp flags
+ */
+struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp)
+{
+ struct sk_buff *skb;
+ unsigned int total_size;
+
+ total_size = size + 1;
+ skb = alloc_skb(total_size, gfp);
+
+ if (skb)
+ skb_reserve(skb, 1);
+
+ return skb;
+}
+EXPORT_SYMBOL(nfc_alloc_recv_skb);
+
+/**
+ * nfc_targets_found - inform that targets were found
+ *
+ * @dev: The nfc device that found the targets
+ * @targets: array of nfc targets found
+ * @ntargets: targets array size
+ *
+ * The device driver must call this function when one or many nfc targets
+ * are found. After calling this function, the device driver must stop
+ * polling for targets.
+ * NOTE: This function can be called with targets=NULL and n_targets=0 to
+ * notify a driver error, meaning that the polling operation cannot complete.
+ * IMPORTANT: this function must not be called from an atomic context.
+ * In addition, it must also not be called from a context that would prevent
+ * the NFC Core to call other nfc ops entry point concurrently.
+ */
+int nfc_targets_found(struct nfc_dev *dev,
+ struct nfc_target *targets, int n_targets)
+{
+ int i;
+
+ pr_debug("dev_name=%s n_targets=%d\n", dev_name(&dev->dev), n_targets);
+
+ for (i = 0; i < n_targets; i++)
+ targets[i].idx = dev->target_next_idx++;
+
+ device_lock(&dev->dev);
+
+ if (dev->polling == false) {
+ device_unlock(&dev->dev);
+ return 0;
+ }
+
+ dev->polling = false;
+
+ dev->targets_generation++;
+
+ kfree(dev->targets);
+ dev->targets = NULL;
+
+ if (targets) {
+ dev->targets = kmemdup(targets,
+ n_targets * sizeof(struct nfc_target),
+ GFP_ATOMIC);
+
+ if (!dev->targets) {
+ dev->n_targets = 0;
+ device_unlock(&dev->dev);
+ return -ENOMEM;
+ }
+ }
+
+ dev->n_targets = n_targets;
+ device_unlock(&dev->dev);
+
+ nfc_genl_targets_found(dev);
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_targets_found);
+
+/**
+ * nfc_target_lost - inform that an activated target went out of field
+ *
+ * @dev: The nfc device that had the activated target in field
+ * @target_idx: the nfc index of the target
+ *
+ * The device driver must call this function when the activated target
+ * goes out of the field.
+ * IMPORTANT: this function must not be called from an atomic context.
+ * In addition, it must also not be called from a context that would prevent
+ * the NFC Core to call other nfc ops entry point concurrently.
+ */
+int nfc_target_lost(struct nfc_dev *dev, u32 target_idx)
+{
+ struct nfc_target *tg;
+ int i;
+
+ pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx);
+
+ device_lock(&dev->dev);
+
+ for (i = 0; i < dev->n_targets; i++) {
+ tg = &dev->targets[i];
+ if (tg->idx == target_idx)
+ break;
+ }
+
+ if (i == dev->n_targets) {
+ device_unlock(&dev->dev);
+ return -EINVAL;
+ }
+
+ dev->targets_generation++;
+ dev->n_targets--;
+ dev->active_target = NULL;
+
+ if (dev->n_targets) {
+ memcpy(&dev->targets[i], &dev->targets[i + 1],
+ (dev->n_targets - i) * sizeof(struct nfc_target));
+ } else {
+ kfree(dev->targets);
+ dev->targets = NULL;
+ }
+
+ device_unlock(&dev->dev);
+
+ nfc_genl_target_lost(dev, target_idx);
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_target_lost);
+
+inline void nfc_driver_failure(struct nfc_dev *dev, int err)
+{
+ nfc_targets_found(dev, NULL, 0);
+}
+EXPORT_SYMBOL(nfc_driver_failure);
+
+int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type)
+{
+ struct nfc_se *se;
+ int rc;
+
+ pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+ se = nfc_find_se(dev, se_idx);
+ if (se)
+ return -EALREADY;
+
+ se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL);
+ if (!se)
+ return -ENOMEM;
+
+ se->idx = se_idx;
+ se->type = type;
+ se->state = NFC_SE_DISABLED;
+ INIT_LIST_HEAD(&se->list);
+
+ list_add(&se->list, &dev->secure_elements);
+
+ rc = nfc_genl_se_added(dev, se_idx, type);
+ if (rc < 0) {
+ list_del(&se->list);
+ kfree(se);
+
+ return rc;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_add_se);
+
+int nfc_remove_se(struct nfc_dev *dev, u32 se_idx)
+{
+ struct nfc_se *se, *n;
+ int rc;
+
+ pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+ list_for_each_entry_safe(se, n, &dev->secure_elements, list)
+ if (se->idx == se_idx) {
+ rc = nfc_genl_se_removed(dev, se_idx);
+ if (rc < 0)
+ return rc;
+
+ list_del(&se->list);
+ kfree(se);
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(nfc_remove_se);
+
+int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx,
+ struct nfc_evt_transaction *evt_transaction)
+{
+ int rc;
+
+ pr_debug("transaction: %x\n", se_idx);
+
+ device_lock(&dev->dev);
+
+ if (!evt_transaction) {
+ rc = -EPROTO;
+ goto out;
+ }
+
+ rc = nfc_genl_se_transaction(dev, se_idx, evt_transaction);
+out:
+ device_unlock(&dev->dev);
+ return rc;
+}
+EXPORT_SYMBOL(nfc_se_transaction);
+
+static void nfc_release(struct device *d)
+{
+ struct nfc_dev *dev = to_nfc_dev(d);
+ struct nfc_se *se, *n;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ nfc_genl_data_exit(&dev->genl_data);
+ kfree(dev->targets);
+
+ list_for_each_entry_safe(se, n, &dev->secure_elements, list) {
+ nfc_genl_se_removed(dev, se->idx);
+ list_del(&se->list);
+ kfree(se);
+ }
+
+ kfree(dev);
+}
+
+static void nfc_check_pres_work(struct work_struct *work)
+{
+ struct nfc_dev *dev = container_of(work, struct nfc_dev,
+ check_pres_work);
+ int rc;
+
+ device_lock(&dev->dev);
+
+ if (dev->active_target && timer_pending(&dev->check_pres_timer) == 0) {
+ rc = dev->ops->check_presence(dev, dev->active_target);
+ if (rc == -EOPNOTSUPP)
+ goto exit;
+ if (rc) {
+ u32 active_target_idx = dev->active_target->idx;
+ device_unlock(&dev->dev);
+ nfc_target_lost(dev, active_target_idx);
+ return;
+ }
+
+ if (!dev->shutting_down)
+ mod_timer(&dev->check_pres_timer, jiffies +
+ msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS));
+ }
+
+exit:
+ device_unlock(&dev->dev);
+}
+
+static void nfc_check_pres_timeout(unsigned long data)
+{
+ struct nfc_dev *dev = (struct nfc_dev *)data;
+
+ schedule_work(&dev->check_pres_work);
+}
+
+struct class nfc_class = {
+ .name = "nfc",
+ .dev_release = nfc_release,
+};
+EXPORT_SYMBOL(nfc_class);
+
+static int match_idx(struct device *d, const void *data)
+{
+ struct nfc_dev *dev = to_nfc_dev(d);
+ const unsigned int *idx = data;
+
+ return dev->idx == *idx;
+}
+
+struct nfc_dev *nfc_get_device(unsigned int idx)
+{
+ struct device *d;
+
+ d = class_find_device(&nfc_class, NULL, &idx, match_idx);
+ if (!d)
+ return NULL;
+
+ return to_nfc_dev(d);
+}
+
+/**
+ * nfc_allocate_device - allocate a new nfc device
+ *
+ * @ops: device operations
+ * @supported_protocols: NFC protocols supported by the device
+ */
+struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
+ u32 supported_protocols,
+ int tx_headroom, int tx_tailroom)
+{
+ struct nfc_dev *dev;
+
+ if (!ops->start_poll || !ops->stop_poll || !ops->activate_target ||
+ !ops->deactivate_target || !ops->im_transceive)
+ return NULL;
+
+ if (!supported_protocols)
+ return NULL;
+
+ dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL);
+ if (!dev)
+ return NULL;
+
+ dev->ops = ops;
+ dev->supported_protocols = supported_protocols;
+ dev->tx_headroom = tx_headroom;
+ dev->tx_tailroom = tx_tailroom;
+ INIT_LIST_HEAD(&dev->secure_elements);
+
+ nfc_genl_data_init(&dev->genl_data);
+
+ dev->rf_mode = NFC_RF_NONE;
+
+ /* first generation must not be 0 */
+ dev->targets_generation = 1;
+
+ if (ops->check_presence) {
+ init_timer(&dev->check_pres_timer);
+ dev->check_pres_timer.data = (unsigned long)dev;
+ dev->check_pres_timer.function = nfc_check_pres_timeout;
+
+ INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL(nfc_allocate_device);
+
+/**
+ * nfc_register_device - register a nfc device in the nfc subsystem
+ *
+ * @dev: The nfc device to register
+ */
+int nfc_register_device(struct nfc_dev *dev)
+{
+ int rc;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ dev->idx = ida_simple_get(&nfc_index_ida, 0, 0, GFP_KERNEL);
+ if (dev->idx < 0)
+ return dev->idx;
+
+ dev->dev.class = &nfc_class;
+ dev_set_name(&dev->dev, "nfc%d", dev->idx);
+ device_initialize(&dev->dev);
+
+ mutex_lock(&nfc_devlist_mutex);
+ nfc_devlist_generation++;
+ rc = device_add(&dev->dev);
+ mutex_unlock(&nfc_devlist_mutex);
+
+ if (rc < 0)
+ return rc;
+
+ rc = nfc_llcp_register_device(dev);
+ if (rc)
+ pr_err("Could not register llcp device\n");
+
+ rc = nfc_genl_device_added(dev);
+ if (rc)
+ pr_debug("The userspace won't be notified that the device %s was added\n",
+ dev_name(&dev->dev));
+
+ dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev,
+ RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev);
+ if (dev->rfkill) {
+ if (rfkill_register(dev->rfkill) < 0) {
+ rfkill_destroy(dev->rfkill);
+ dev->rfkill = NULL;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_register_device);
+
+/**
+ * nfc_unregister_device - unregister a nfc device in the nfc subsystem
+ *
+ * @dev: The nfc device to unregister
+ */
+void nfc_unregister_device(struct nfc_dev *dev)
+{
+ int rc, id;
+
+ pr_debug("dev_name=%s\n", dev_name(&dev->dev));
+
+ id = dev->idx;
+
+ if (dev->rfkill) {
+ rfkill_unregister(dev->rfkill);
+ rfkill_destroy(dev->rfkill);
+ }
+
+ if (dev->ops->check_presence) {
+ device_lock(&dev->dev);
+ dev->shutting_down = true;
+ device_unlock(&dev->dev);
+ del_timer_sync(&dev->check_pres_timer);
+ cancel_work_sync(&dev->check_pres_work);
+ }
+
+ rc = nfc_genl_device_removed(dev);
+ if (rc)
+ pr_debug("The userspace won't be notified that the device %s "
+ "was removed\n", dev_name(&dev->dev));
+
+ nfc_llcp_unregister_device(dev);
+
+ mutex_lock(&nfc_devlist_mutex);
+ nfc_devlist_generation++;
+ device_del(&dev->dev);
+ mutex_unlock(&nfc_devlist_mutex);
+
+ ida_simple_remove(&nfc_index_ida, id);
+}
+EXPORT_SYMBOL(nfc_unregister_device);
+
+static int __init nfc_init(void)
+{
+ int rc;
+
+ pr_info("NFC Core ver %s\n", VERSION);
+
+ rc = class_register(&nfc_class);
+ if (rc)
+ return rc;
+
+ rc = nfc_genl_init();
+ if (rc)
+ goto err_genl;
+
+ /* the first generation must not be 0 */
+ nfc_devlist_generation = 1;
+
+ rc = rawsock_init();
+ if (rc)
+ goto err_rawsock;
+
+ rc = nfc_llcp_init();
+ if (rc)
+ goto err_llcp_sock;
+
+ rc = af_nfc_init();
+ if (rc)
+ goto err_af_nfc;
+
+ return 0;
+
+err_af_nfc:
+ nfc_llcp_exit();
+err_llcp_sock:
+ rawsock_exit();
+err_rawsock:
+ nfc_genl_exit();
+err_genl:
+ class_unregister(&nfc_class);
+ return rc;
+}
+
+static void __exit nfc_exit(void)
+{
+ af_nfc_exit();
+ nfc_llcp_exit();
+ rawsock_exit();
+ nfc_genl_exit();
+ class_unregister(&nfc_class);
+}
+
+subsys_initcall(nfc_init);
+module_exit(nfc_exit);
+
+MODULE_AUTHOR("Lauro Ramos Venancio <lauro.venancio@openbossa.org>");
+MODULE_DESCRIPTION("NFC Core ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_NFC);
+MODULE_ALIAS_GENL_FAMILY(NFC_GENL_NAME);
diff --git a/net/nfc/digital.h b/net/nfc/digital.h
new file mode 100644
index 000000000..3c39c72eb
--- /dev/null
+++ b/net/nfc/digital.h
@@ -0,0 +1,180 @@
+/*
+ * NFC Digital Protocol stack
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __DIGITAL_H
+#define __DIGITAL_H
+
+#include <net/nfc/nfc.h>
+#include <net/nfc/digital.h>
+
+#include <linux/crc-ccitt.h>
+#include <linux/crc-itu-t.h>
+
+#define PROTOCOL_ERR(req) pr_err("%d: NFC Digital Protocol error: %s\n", \
+ __LINE__, req)
+
+#define DIGITAL_CMD_IN_SEND 0
+#define DIGITAL_CMD_TG_SEND 1
+#define DIGITAL_CMD_TG_LISTEN 2
+#define DIGITAL_CMD_TG_LISTEN_MDAA 3
+#define DIGITAL_CMD_TG_LISTEN_MD 4
+
+#define DIGITAL_MAX_HEADER_LEN 7
+#define DIGITAL_CRC_LEN 2
+
+#define DIGITAL_SENSF_NFCID2_NFC_DEP_B1 0x01
+#define DIGITAL_SENSF_NFCID2_NFC_DEP_B2 0xFE
+
+#define DIGITAL_SENS_RES_NFC_DEP 0x0100
+#define DIGITAL_SEL_RES_NFC_DEP 0x40
+#define DIGITAL_SENSF_FELICA_SC 0xFFFF
+
+#define DIGITAL_DRV_CAPS_IN_CRC(ddev) \
+ ((ddev)->driver_capabilities & NFC_DIGITAL_DRV_CAPS_IN_CRC)
+#define DIGITAL_DRV_CAPS_TG_CRC(ddev) \
+ ((ddev)->driver_capabilities & NFC_DIGITAL_DRV_CAPS_TG_CRC)
+
+struct digital_data_exch {
+ data_exchange_cb_t cb;
+ void *cb_context;
+};
+
+struct sk_buff *digital_skb_alloc(struct nfc_digital_dev *ddev,
+ unsigned int len);
+
+int digital_send_cmd(struct nfc_digital_dev *ddev, u8 cmd_type,
+ struct sk_buff *skb, struct digital_tg_mdaa_params *params,
+ u16 timeout, nfc_digital_cmd_complete_t cmd_cb,
+ void *cb_context);
+
+int digital_in_configure_hw(struct nfc_digital_dev *ddev, int type, int param);
+static inline int digital_in_send_cmd(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb, u16 timeout,
+ nfc_digital_cmd_complete_t cmd_cb,
+ void *cb_context)
+{
+ return digital_send_cmd(ddev, DIGITAL_CMD_IN_SEND, skb, NULL, timeout,
+ cmd_cb, cb_context);
+}
+
+void digital_poll_next_tech(struct nfc_digital_dev *ddev);
+
+int digital_in_send_sens_req(struct nfc_digital_dev *ddev, u8 rf_tech);
+int digital_in_send_sensb_req(struct nfc_digital_dev *ddev, u8 rf_tech);
+int digital_in_send_sensf_req(struct nfc_digital_dev *ddev, u8 rf_tech);
+int digital_in_send_iso15693_inv_req(struct nfc_digital_dev *ddev, u8 rf_tech);
+
+int digital_in_iso_dep_pull_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb);
+int digital_in_iso_dep_push_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb);
+
+int digital_target_found(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, u8 protocol);
+
+int digital_in_recv_mifare_res(struct sk_buff *resp);
+
+int digital_in_send_atr_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, __u8 comm_mode, __u8 *gb,
+ size_t gb_len);
+int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, struct sk_buff *skb,
+ struct digital_data_exch *data_exch);
+
+int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param);
+static inline int digital_tg_send_cmd(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb, u16 timeout,
+ nfc_digital_cmd_complete_t cmd_cb, void *cb_context)
+{
+ return digital_send_cmd(ddev, DIGITAL_CMD_TG_SEND, skb, NULL, timeout,
+ cmd_cb, cb_context);
+}
+
+void digital_tg_recv_sens_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+
+void digital_tg_recv_sensf_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+
+static inline int digital_tg_listen(struct nfc_digital_dev *ddev, u16 timeout,
+ nfc_digital_cmd_complete_t cb, void *arg)
+{
+ return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN, NULL, NULL,
+ timeout, cb, arg);
+}
+
+void digital_tg_recv_atr_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+
+int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb);
+
+int digital_tg_listen_nfca(struct nfc_digital_dev *ddev, u8 rf_tech);
+int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech);
+void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+
+typedef u16 (*crc_func_t)(u16, const u8 *, size_t);
+
+#define CRC_A_INIT 0x6363
+#define CRC_B_INIT 0xFFFF
+#define CRC_F_INIT 0x0000
+
+void digital_skb_add_crc(struct sk_buff *skb, crc_func_t crc_func, u16 init,
+ u8 bitwise_inv, u8 msb_first);
+
+static inline void digital_skb_add_crc_a(struct sk_buff *skb)
+{
+ digital_skb_add_crc(skb, crc_ccitt, CRC_A_INIT, 0, 0);
+}
+
+static inline void digital_skb_add_crc_b(struct sk_buff *skb)
+{
+ digital_skb_add_crc(skb, crc_ccitt, CRC_B_INIT, 1, 0);
+}
+
+static inline void digital_skb_add_crc_f(struct sk_buff *skb)
+{
+ digital_skb_add_crc(skb, crc_itu_t, CRC_F_INIT, 0, 1);
+}
+
+static inline void digital_skb_add_crc_none(struct sk_buff *skb)
+{
+ return;
+}
+
+int digital_skb_check_crc(struct sk_buff *skb, crc_func_t crc_func,
+ u16 crc_init, u8 bitwise_inv, u8 msb_first);
+
+static inline int digital_skb_check_crc_a(struct sk_buff *skb)
+{
+ return digital_skb_check_crc(skb, crc_ccitt, CRC_A_INIT, 0, 0);
+}
+
+static inline int digital_skb_check_crc_b(struct sk_buff *skb)
+{
+ return digital_skb_check_crc(skb, crc_ccitt, CRC_B_INIT, 1, 0);
+}
+
+static inline int digital_skb_check_crc_f(struct sk_buff *skb)
+{
+ return digital_skb_check_crc(skb, crc_itu_t, CRC_F_INIT, 0, 1);
+}
+
+static inline int digital_skb_check_crc_none(struct sk_buff *skb)
+{
+ return 0;
+}
+
+#endif /* __DIGITAL_H */
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
new file mode 100644
index 000000000..009bcf317
--- /dev/null
+++ b/net/nfc/digital_core.c
@@ -0,0 +1,845 @@
+/*
+ * NFC Digital Protocol stack
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) "digital: %s: " fmt, __func__
+
+#include <linux/module.h>
+
+#include "digital.h"
+
+#define DIGITAL_PROTO_NFCA_RF_TECH \
+ (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | NFC_PROTO_NFC_DEP_MASK)
+
+#define DIGITAL_PROTO_NFCB_RF_TECH NFC_PROTO_ISO14443_B_MASK
+
+#define DIGITAL_PROTO_NFCF_RF_TECH \
+ (NFC_PROTO_FELICA_MASK | NFC_PROTO_NFC_DEP_MASK)
+
+#define DIGITAL_PROTO_ISO15693_RF_TECH NFC_PROTO_ISO15693_MASK
+
+struct digital_cmd {
+ struct list_head queue;
+
+ u8 type;
+ u8 pending;
+
+ u16 timeout;
+ struct sk_buff *req;
+ struct sk_buff *resp;
+ struct digital_tg_mdaa_params *mdaa_params;
+
+ nfc_digital_cmd_complete_t cmd_cb;
+ void *cb_context;
+};
+
+struct sk_buff *digital_skb_alloc(struct nfc_digital_dev *ddev,
+ unsigned int len)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(len + ddev->tx_headroom + ddev->tx_tailroom,
+ GFP_KERNEL);
+ if (skb)
+ skb_reserve(skb, ddev->tx_headroom);
+
+ return skb;
+}
+
+void digital_skb_add_crc(struct sk_buff *skb, crc_func_t crc_func, u16 init,
+ u8 bitwise_inv, u8 msb_first)
+{
+ u16 crc;
+
+ crc = crc_func(init, skb->data, skb->len);
+
+ if (bitwise_inv)
+ crc = ~crc;
+
+ if (msb_first)
+ crc = __fswab16(crc);
+
+ *skb_put(skb, 1) = crc & 0xFF;
+ *skb_put(skb, 1) = (crc >> 8) & 0xFF;
+}
+
+int digital_skb_check_crc(struct sk_buff *skb, crc_func_t crc_func,
+ u16 crc_init, u8 bitwise_inv, u8 msb_first)
+{
+ int rc;
+ u16 crc;
+
+ if (skb->len <= 2)
+ return -EIO;
+
+ crc = crc_func(crc_init, skb->data, skb->len - 2);
+
+ if (bitwise_inv)
+ crc = ~crc;
+
+ if (msb_first)
+ crc = __swab16(crc);
+
+ rc = (skb->data[skb->len - 2] - (crc & 0xFF)) +
+ (skb->data[skb->len - 1] - ((crc >> 8) & 0xFF));
+
+ if (rc)
+ return -EIO;
+
+ skb_trim(skb, skb->len - 2);
+
+ return 0;
+}
+
+static inline void digital_switch_rf(struct nfc_digital_dev *ddev, bool on)
+{
+ ddev->ops->switch_rf(ddev, on);
+}
+
+static inline void digital_abort_cmd(struct nfc_digital_dev *ddev)
+{
+ ddev->ops->abort_cmd(ddev);
+}
+
+static void digital_wq_cmd_complete(struct work_struct *work)
+{
+ struct digital_cmd *cmd;
+ struct nfc_digital_dev *ddev = container_of(work,
+ struct nfc_digital_dev,
+ cmd_complete_work);
+
+ mutex_lock(&ddev->cmd_lock);
+
+ cmd = list_first_entry_or_null(&ddev->cmd_queue, struct digital_cmd,
+ queue);
+ if (!cmd) {
+ mutex_unlock(&ddev->cmd_lock);
+ return;
+ }
+
+ list_del(&cmd->queue);
+
+ mutex_unlock(&ddev->cmd_lock);
+
+ if (!IS_ERR(cmd->resp))
+ print_hex_dump_debug("DIGITAL RX: ", DUMP_PREFIX_NONE, 16, 1,
+ cmd->resp->data, cmd->resp->len, false);
+
+ cmd->cmd_cb(ddev, cmd->cb_context, cmd->resp);
+
+ kfree(cmd->mdaa_params);
+ kfree(cmd);
+
+ schedule_work(&ddev->cmd_work);
+}
+
+static void digital_send_cmd_complete(struct nfc_digital_dev *ddev,
+ void *arg, struct sk_buff *resp)
+{
+ struct digital_cmd *cmd = arg;
+
+ cmd->resp = resp;
+
+ schedule_work(&ddev->cmd_complete_work);
+}
+
+static void digital_wq_cmd(struct work_struct *work)
+{
+ int rc;
+ struct digital_cmd *cmd;
+ struct digital_tg_mdaa_params *params;
+ struct nfc_digital_dev *ddev = container_of(work,
+ struct nfc_digital_dev,
+ cmd_work);
+
+ mutex_lock(&ddev->cmd_lock);
+
+ cmd = list_first_entry_or_null(&ddev->cmd_queue, struct digital_cmd,
+ queue);
+ if (!cmd || cmd->pending) {
+ mutex_unlock(&ddev->cmd_lock);
+ return;
+ }
+
+ mutex_unlock(&ddev->cmd_lock);
+
+ if (cmd->req)
+ print_hex_dump_debug("DIGITAL TX: ", DUMP_PREFIX_NONE, 16, 1,
+ cmd->req->data, cmd->req->len, false);
+
+ switch (cmd->type) {
+ case DIGITAL_CMD_IN_SEND:
+ rc = ddev->ops->in_send_cmd(ddev, cmd->req, cmd->timeout,
+ digital_send_cmd_complete, cmd);
+ break;
+
+ case DIGITAL_CMD_TG_SEND:
+ rc = ddev->ops->tg_send_cmd(ddev, cmd->req, cmd->timeout,
+ digital_send_cmd_complete, cmd);
+ break;
+
+ case DIGITAL_CMD_TG_LISTEN:
+ rc = ddev->ops->tg_listen(ddev, cmd->timeout,
+ digital_send_cmd_complete, cmd);
+ break;
+
+ case DIGITAL_CMD_TG_LISTEN_MDAA:
+ params = cmd->mdaa_params;
+
+ rc = ddev->ops->tg_listen_mdaa(ddev, params, cmd->timeout,
+ digital_send_cmd_complete, cmd);
+ break;
+
+ case DIGITAL_CMD_TG_LISTEN_MD:
+ rc = ddev->ops->tg_listen_md(ddev, cmd->timeout,
+ digital_send_cmd_complete, cmd);
+ break;
+
+ default:
+ pr_err("Unknown cmd type %d\n", cmd->type);
+ return;
+ }
+
+ if (!rc)
+ return;
+
+ pr_err("in_send_command returned err %d\n", rc);
+
+ mutex_lock(&ddev->cmd_lock);
+ list_del(&cmd->queue);
+ mutex_unlock(&ddev->cmd_lock);
+
+ kfree_skb(cmd->req);
+ kfree(cmd->mdaa_params);
+ kfree(cmd);
+
+ schedule_work(&ddev->cmd_work);
+}
+
+int digital_send_cmd(struct nfc_digital_dev *ddev, u8 cmd_type,
+ struct sk_buff *skb, struct digital_tg_mdaa_params *params,
+ u16 timeout, nfc_digital_cmd_complete_t cmd_cb,
+ void *cb_context)
+{
+ struct digital_cmd *cmd;
+
+ cmd = kzalloc(sizeof(struct digital_cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd->type = cmd_type;
+ cmd->timeout = timeout;
+ cmd->req = skb;
+ cmd->mdaa_params = params;
+ cmd->cmd_cb = cmd_cb;
+ cmd->cb_context = cb_context;
+ INIT_LIST_HEAD(&cmd->queue);
+
+ mutex_lock(&ddev->cmd_lock);
+ list_add_tail(&cmd->queue, &ddev->cmd_queue);
+ mutex_unlock(&ddev->cmd_lock);
+
+ schedule_work(&ddev->cmd_work);
+
+ return 0;
+}
+
+int digital_in_configure_hw(struct nfc_digital_dev *ddev, int type, int param)
+{
+ int rc;
+
+ rc = ddev->ops->in_configure_hw(ddev, type, param);
+ if (rc)
+ pr_err("in_configure_hw failed: %d\n", rc);
+
+ return rc;
+}
+
+int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param)
+{
+ int rc;
+
+ rc = ddev->ops->tg_configure_hw(ddev, type, param);
+ if (rc)
+ pr_err("tg_configure_hw failed: %d\n", rc);
+
+ return rc;
+}
+
+static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ struct digital_tg_mdaa_params *params;
+
+ params = kzalloc(sizeof(struct digital_tg_mdaa_params), GFP_KERNEL);
+ if (!params)
+ return -ENOMEM;
+
+ params->sens_res = DIGITAL_SENS_RES_NFC_DEP;
+ get_random_bytes(params->nfcid1, sizeof(params->nfcid1));
+ params->sel_res = DIGITAL_SEL_RES_NFC_DEP;
+
+ params->nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1;
+ params->nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2;
+ get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2);
+ params->sc = DIGITAL_SENSF_FELICA_SC;
+
+ return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params,
+ 500, digital_tg_recv_atr_req, NULL);
+}
+
+static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MD, NULL, NULL, 500,
+ digital_tg_recv_md_req, NULL);
+}
+
+int digital_target_found(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, u8 protocol)
+{
+ int rc;
+ u8 framing;
+ u8 rf_tech;
+ u8 poll_tech_count;
+ int (*check_crc)(struct sk_buff *skb);
+ void (*add_crc)(struct sk_buff *skb);
+
+ rf_tech = ddev->poll_techs[ddev->poll_tech_index].rf_tech;
+
+ switch (protocol) {
+ case NFC_PROTO_JEWEL:
+ framing = NFC_DIGITAL_FRAMING_NFCA_T1T;
+ check_crc = digital_skb_check_crc_b;
+ add_crc = digital_skb_add_crc_b;
+ break;
+
+ case NFC_PROTO_MIFARE:
+ framing = NFC_DIGITAL_FRAMING_NFCA_T2T;
+ check_crc = digital_skb_check_crc_a;
+ add_crc = digital_skb_add_crc_a;
+ break;
+
+ case NFC_PROTO_FELICA:
+ framing = NFC_DIGITAL_FRAMING_NFCF_T3T;
+ check_crc = digital_skb_check_crc_f;
+ add_crc = digital_skb_add_crc_f;
+ break;
+
+ case NFC_PROTO_NFC_DEP:
+ if (rf_tech == NFC_DIGITAL_RF_TECH_106A) {
+ framing = NFC_DIGITAL_FRAMING_NFCA_NFC_DEP;
+ check_crc = digital_skb_check_crc_a;
+ add_crc = digital_skb_add_crc_a;
+ } else {
+ framing = NFC_DIGITAL_FRAMING_NFCF_NFC_DEP;
+ check_crc = digital_skb_check_crc_f;
+ add_crc = digital_skb_add_crc_f;
+ }
+ break;
+
+ case NFC_PROTO_ISO15693:
+ framing = NFC_DIGITAL_FRAMING_ISO15693_T5T;
+ check_crc = digital_skb_check_crc_b;
+ add_crc = digital_skb_add_crc_b;
+ break;
+
+ case NFC_PROTO_ISO14443:
+ framing = NFC_DIGITAL_FRAMING_NFCA_T4T;
+ check_crc = digital_skb_check_crc_a;
+ add_crc = digital_skb_add_crc_a;
+ break;
+
+ case NFC_PROTO_ISO14443_B:
+ framing = NFC_DIGITAL_FRAMING_NFCB_T4T;
+ check_crc = digital_skb_check_crc_b;
+ add_crc = digital_skb_add_crc_b;
+ break;
+
+ default:
+ pr_err("Invalid protocol %d\n", protocol);
+ return -EINVAL;
+ }
+
+ pr_debug("rf_tech=%d, protocol=%d\n", rf_tech, protocol);
+
+ ddev->curr_rf_tech = rf_tech;
+
+ if (DIGITAL_DRV_CAPS_IN_CRC(ddev)) {
+ ddev->skb_add_crc = digital_skb_add_crc_none;
+ ddev->skb_check_crc = digital_skb_check_crc_none;
+ } else {
+ ddev->skb_add_crc = add_crc;
+ ddev->skb_check_crc = check_crc;
+ }
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING, framing);
+ if (rc)
+ return rc;
+
+ target->supported_protocols = (1 << protocol);
+
+ poll_tech_count = ddev->poll_tech_count;
+ ddev->poll_tech_count = 0;
+
+ rc = nfc_targets_found(ddev->nfc_dev, target, 1);
+ if (rc) {
+ ddev->poll_tech_count = poll_tech_count;
+ return rc;
+ }
+
+ return 0;
+}
+
+void digital_poll_next_tech(struct nfc_digital_dev *ddev)
+{
+ u8 rand_mod;
+
+ digital_switch_rf(ddev, 0);
+
+ mutex_lock(&ddev->poll_lock);
+
+ if (!ddev->poll_tech_count) {
+ mutex_unlock(&ddev->poll_lock);
+ return;
+ }
+
+ get_random_bytes(&rand_mod, sizeof(rand_mod));
+ ddev->poll_tech_index = rand_mod % ddev->poll_tech_count;
+
+ mutex_unlock(&ddev->poll_lock);
+
+ schedule_work(&ddev->poll_work);
+}
+
+static void digital_wq_poll(struct work_struct *work)
+{
+ int rc;
+ struct digital_poll_tech *poll_tech;
+ struct nfc_digital_dev *ddev = container_of(work,
+ struct nfc_digital_dev,
+ poll_work);
+ mutex_lock(&ddev->poll_lock);
+
+ if (!ddev->poll_tech_count) {
+ mutex_unlock(&ddev->poll_lock);
+ return;
+ }
+
+ poll_tech = &ddev->poll_techs[ddev->poll_tech_index];
+
+ mutex_unlock(&ddev->poll_lock);
+
+ rc = poll_tech->poll_func(ddev, poll_tech->rf_tech);
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+static void digital_add_poll_tech(struct nfc_digital_dev *ddev, u8 rf_tech,
+ digital_poll_t poll_func)
+{
+ struct digital_poll_tech *poll_tech;
+
+ if (ddev->poll_tech_count >= NFC_DIGITAL_POLL_MODE_COUNT_MAX)
+ return;
+
+ poll_tech = &ddev->poll_techs[ddev->poll_tech_count++];
+
+ poll_tech->rf_tech = rf_tech;
+ poll_tech->poll_func = poll_func;
+}
+
+/**
+ * start_poll operation
+ *
+ * For every supported protocol, the corresponding polling function is added
+ * to the table of polling technologies (ddev->poll_techs[]) using
+ * digital_add_poll_tech().
+ * When a polling function fails (by timeout or protocol error) the next one is
+ * schedule by digital_poll_next_tech() on the poll workqueue (ddev->poll_work).
+ */
+static int digital_start_poll(struct nfc_dev *nfc_dev, __u32 im_protocols,
+ __u32 tm_protocols)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+ u32 matching_im_protocols, matching_tm_protocols;
+
+ pr_debug("protocols: im 0x%x, tm 0x%x, supported 0x%x\n", im_protocols,
+ tm_protocols, ddev->protocols);
+
+ matching_im_protocols = ddev->protocols & im_protocols;
+ matching_tm_protocols = ddev->protocols & tm_protocols;
+
+ if (!matching_im_protocols && !matching_tm_protocols) {
+ pr_err("Unknown protocol\n");
+ return -EINVAL;
+ }
+
+ if (ddev->poll_tech_count) {
+ pr_err("Already polling\n");
+ return -EBUSY;
+ }
+
+ if (ddev->curr_protocol) {
+ pr_err("A target is already active\n");
+ return -EBUSY;
+ }
+
+ ddev->poll_tech_count = 0;
+ ddev->poll_tech_index = 0;
+
+ if (matching_im_protocols & DIGITAL_PROTO_NFCA_RF_TECH)
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106A,
+ digital_in_send_sens_req);
+
+ if (matching_im_protocols & DIGITAL_PROTO_NFCB_RF_TECH)
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106B,
+ digital_in_send_sensb_req);
+
+ if (matching_im_protocols & DIGITAL_PROTO_NFCF_RF_TECH) {
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_212F,
+ digital_in_send_sensf_req);
+
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_424F,
+ digital_in_send_sensf_req);
+ }
+
+ if (matching_im_protocols & DIGITAL_PROTO_ISO15693_RF_TECH)
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_ISO15693,
+ digital_in_send_iso15693_inv_req);
+
+ if (matching_tm_protocols & NFC_PROTO_NFC_DEP_MASK) {
+ if (ddev->ops->tg_listen_mdaa) {
+ digital_add_poll_tech(ddev, 0,
+ digital_tg_listen_mdaa);
+ } else if (ddev->ops->tg_listen_md) {
+ digital_add_poll_tech(ddev, 0,
+ digital_tg_listen_md);
+ } else {
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_106A,
+ digital_tg_listen_nfca);
+
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_212F,
+ digital_tg_listen_nfcf);
+
+ digital_add_poll_tech(ddev, NFC_DIGITAL_RF_TECH_424F,
+ digital_tg_listen_nfcf);
+ }
+ }
+
+ if (!ddev->poll_tech_count) {
+ pr_err("Unsupported protocols: im=0x%x, tm=0x%x\n",
+ matching_im_protocols, matching_tm_protocols);
+ return -EINVAL;
+ }
+
+ schedule_work(&ddev->poll_work);
+
+ return 0;
+}
+
+static void digital_stop_poll(struct nfc_dev *nfc_dev)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ mutex_lock(&ddev->poll_lock);
+
+ if (!ddev->poll_tech_count) {
+ pr_err("Polling operation was not running\n");
+ mutex_unlock(&ddev->poll_lock);
+ return;
+ }
+
+ ddev->poll_tech_count = 0;
+
+ mutex_unlock(&ddev->poll_lock);
+
+ cancel_work_sync(&ddev->poll_work);
+
+ digital_abort_cmd(ddev);
+}
+
+static int digital_dev_up(struct nfc_dev *nfc_dev)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ digital_switch_rf(ddev, 1);
+
+ return 0;
+}
+
+static int digital_dev_down(struct nfc_dev *nfc_dev)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ digital_switch_rf(ddev, 0);
+
+ return 0;
+}
+
+static int digital_dep_link_up(struct nfc_dev *nfc_dev,
+ struct nfc_target *target,
+ __u8 comm_mode, __u8 *gb, size_t gb_len)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+ int rc;
+
+ rc = digital_in_send_atr_req(ddev, target, comm_mode, gb, gb_len);
+
+ if (!rc)
+ ddev->curr_protocol = NFC_PROTO_NFC_DEP;
+
+ return rc;
+}
+
+static int digital_dep_link_down(struct nfc_dev *nfc_dev)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ ddev->curr_protocol = 0;
+
+ return 0;
+}
+
+static int digital_activate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target, __u32 protocol)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ if (ddev->poll_tech_count) {
+ pr_err("Can't activate a target while polling\n");
+ return -EBUSY;
+ }
+
+ if (ddev->curr_protocol) {
+ pr_err("A target is already active\n");
+ return -EBUSY;
+ }
+
+ ddev->curr_protocol = protocol;
+
+ return 0;
+}
+
+static void digital_deactivate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+
+ if (!ddev->curr_protocol) {
+ pr_err("No active target\n");
+ return;
+ }
+
+ ddev->curr_protocol = 0;
+}
+
+static int digital_tg_send(struct nfc_dev *dev, struct sk_buff *skb)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(dev);
+
+ return digital_tg_send_dep_res(ddev, skb);
+}
+
+static void digital_in_send_complete(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct digital_data_exch *data_exch = arg;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto done;
+ }
+
+ if (ddev->curr_protocol == NFC_PROTO_MIFARE) {
+ rc = digital_in_recv_mifare_res(resp);
+ /* crc check is done in digital_in_recv_mifare_res() */
+ goto done;
+ }
+
+ if ((ddev->curr_protocol == NFC_PROTO_ISO14443) ||
+ (ddev->curr_protocol == NFC_PROTO_ISO14443_B)) {
+ rc = digital_in_iso_dep_pull_sod(ddev, resp);
+ if (rc)
+ goto done;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+
+done:
+ if (rc) {
+ kfree_skb(resp);
+ resp = NULL;
+ }
+
+ data_exch->cb(data_exch->cb_context, resp, rc);
+
+ kfree(data_exch);
+}
+
+static int digital_in_send(struct nfc_dev *nfc_dev, struct nfc_target *target,
+ struct sk_buff *skb, data_exchange_cb_t cb,
+ void *cb_context)
+{
+ struct nfc_digital_dev *ddev = nfc_get_drvdata(nfc_dev);
+ struct digital_data_exch *data_exch;
+ int rc;
+
+ data_exch = kzalloc(sizeof(struct digital_data_exch), GFP_KERNEL);
+ if (!data_exch) {
+ pr_err("Failed to allocate data_exch struct\n");
+ return -ENOMEM;
+ }
+
+ data_exch->cb = cb;
+ data_exch->cb_context = cb_context;
+
+ if (ddev->curr_protocol == NFC_PROTO_NFC_DEP) {
+ rc = digital_in_send_dep_req(ddev, target, skb, data_exch);
+ goto exit;
+ }
+
+ if ((ddev->curr_protocol == NFC_PROTO_ISO14443) ||
+ (ddev->curr_protocol == NFC_PROTO_ISO14443_B)) {
+ rc = digital_in_iso_dep_push_sod(ddev, skb);
+ if (rc)
+ goto exit;
+ }
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 500, digital_in_send_complete,
+ data_exch);
+
+exit:
+ if (rc)
+ kfree(data_exch);
+
+ return rc;
+}
+
+static struct nfc_ops digital_nfc_ops = {
+ .dev_up = digital_dev_up,
+ .dev_down = digital_dev_down,
+ .start_poll = digital_start_poll,
+ .stop_poll = digital_stop_poll,
+ .dep_link_up = digital_dep_link_up,
+ .dep_link_down = digital_dep_link_down,
+ .activate_target = digital_activate_target,
+ .deactivate_target = digital_deactivate_target,
+ .tm_send = digital_tg_send,
+ .im_transceive = digital_in_send,
+};
+
+struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops,
+ __u32 supported_protocols,
+ __u32 driver_capabilities,
+ int tx_headroom, int tx_tailroom)
+{
+ struct nfc_digital_dev *ddev;
+
+ if (!ops->in_configure_hw || !ops->in_send_cmd || !ops->tg_listen ||
+ !ops->tg_configure_hw || !ops->tg_send_cmd || !ops->abort_cmd ||
+ !ops->switch_rf || (ops->tg_listen_md && !ops->tg_get_rf_tech))
+ return NULL;
+
+ ddev = kzalloc(sizeof(struct nfc_digital_dev), GFP_KERNEL);
+ if (!ddev)
+ return NULL;
+
+ ddev->driver_capabilities = driver_capabilities;
+ ddev->ops = ops;
+
+ mutex_init(&ddev->cmd_lock);
+ INIT_LIST_HEAD(&ddev->cmd_queue);
+
+ INIT_WORK(&ddev->cmd_work, digital_wq_cmd);
+ INIT_WORK(&ddev->cmd_complete_work, digital_wq_cmd_complete);
+
+ mutex_init(&ddev->poll_lock);
+ INIT_WORK(&ddev->poll_work, digital_wq_poll);
+
+ if (supported_protocols & NFC_PROTO_JEWEL_MASK)
+ ddev->protocols |= NFC_PROTO_JEWEL_MASK;
+ if (supported_protocols & NFC_PROTO_MIFARE_MASK)
+ ddev->protocols |= NFC_PROTO_MIFARE_MASK;
+ if (supported_protocols & NFC_PROTO_FELICA_MASK)
+ ddev->protocols |= NFC_PROTO_FELICA_MASK;
+ if (supported_protocols & NFC_PROTO_NFC_DEP_MASK)
+ ddev->protocols |= NFC_PROTO_NFC_DEP_MASK;
+ if (supported_protocols & NFC_PROTO_ISO15693_MASK)
+ ddev->protocols |= NFC_PROTO_ISO15693_MASK;
+ if (supported_protocols & NFC_PROTO_ISO14443_MASK)
+ ddev->protocols |= NFC_PROTO_ISO14443_MASK;
+ if (supported_protocols & NFC_PROTO_ISO14443_B_MASK)
+ ddev->protocols |= NFC_PROTO_ISO14443_B_MASK;
+
+ ddev->tx_headroom = tx_headroom + DIGITAL_MAX_HEADER_LEN;
+ ddev->tx_tailroom = tx_tailroom + DIGITAL_CRC_LEN;
+
+ ddev->nfc_dev = nfc_allocate_device(&digital_nfc_ops, ddev->protocols,
+ ddev->tx_headroom,
+ ddev->tx_tailroom);
+ if (!ddev->nfc_dev) {
+ pr_err("nfc_allocate_device failed\n");
+ goto free_dev;
+ }
+
+ nfc_set_drvdata(ddev->nfc_dev, ddev);
+
+ return ddev;
+
+free_dev:
+ kfree(ddev);
+
+ return NULL;
+}
+EXPORT_SYMBOL(nfc_digital_allocate_device);
+
+void nfc_digital_free_device(struct nfc_digital_dev *ddev)
+{
+ nfc_free_device(ddev->nfc_dev);
+ kfree(ddev);
+}
+EXPORT_SYMBOL(nfc_digital_free_device);
+
+int nfc_digital_register_device(struct nfc_digital_dev *ddev)
+{
+ return nfc_register_device(ddev->nfc_dev);
+}
+EXPORT_SYMBOL(nfc_digital_register_device);
+
+void nfc_digital_unregister_device(struct nfc_digital_dev *ddev)
+{
+ struct digital_cmd *cmd, *n;
+
+ nfc_unregister_device(ddev->nfc_dev);
+
+ mutex_lock(&ddev->poll_lock);
+ ddev->poll_tech_count = 0;
+ mutex_unlock(&ddev->poll_lock);
+
+ cancel_work_sync(&ddev->poll_work);
+ cancel_work_sync(&ddev->cmd_work);
+ cancel_work_sync(&ddev->cmd_complete_work);
+
+ list_for_each_entry_safe(cmd, n, &ddev->cmd_queue, queue) {
+ list_del(&cmd->queue);
+ kfree(cmd->mdaa_params);
+ kfree(cmd);
+ }
+}
+EXPORT_SYMBOL(nfc_digital_unregister_device);
+
+MODULE_LICENSE("GPL");
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
new file mode 100644
index 000000000..f72be7433
--- /dev/null
+++ b/net/nfc/digital_dep.c
@@ -0,0 +1,1542 @@
+/*
+ * NFC Digital Protocol stack
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) "digital: %s: " fmt, __func__
+
+#include "digital.h"
+
+#define DIGITAL_NFC_DEP_N_RETRY_NACK 2
+#define DIGITAL_NFC_DEP_N_RETRY_ATN 2
+
+#define DIGITAL_NFC_DEP_FRAME_DIR_OUT 0xD4
+#define DIGITAL_NFC_DEP_FRAME_DIR_IN 0xD5
+
+#define DIGITAL_NFC_DEP_NFCA_SOD_SB 0xF0
+
+#define DIGITAL_CMD_ATR_REQ 0x00
+#define DIGITAL_CMD_ATR_RES 0x01
+#define DIGITAL_CMD_PSL_REQ 0x04
+#define DIGITAL_CMD_PSL_RES 0x05
+#define DIGITAL_CMD_DEP_REQ 0x06
+#define DIGITAL_CMD_DEP_RES 0x07
+
+#define DIGITAL_ATR_REQ_MIN_SIZE 16
+#define DIGITAL_ATR_REQ_MAX_SIZE 64
+
+#define DIGITAL_DID_MAX 14
+
+#define DIGITAL_PAYLOAD_SIZE_MAX 254
+#define DIGITAL_PAYLOAD_BITS_TO_PP(s) (((s) & 0x3) << 4)
+#define DIGITAL_PAYLOAD_PP_TO_BITS(s) (((s) >> 4) & 0x3)
+#define DIGITAL_PAYLOAD_BITS_TO_FSL(s) ((s) & 0x3)
+#define DIGITAL_PAYLOAD_FSL_TO_BITS(s) ((s) & 0x3)
+
+#define DIGITAL_GB_BIT 0x02
+
+#define DIGITAL_NFC_DEP_REQ_RES_HEADROOM 2 /* SoD: [SB (NFC-A)] + LEN */
+#define DIGITAL_NFC_DEP_REQ_RES_TAILROOM 2 /* EoD: 2-byte CRC */
+
+#define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0)
+
+#define DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT 0x10
+#define DIGITAL_NFC_DEP_PFB_MI_BIT 0x10
+#define DIGITAL_NFC_DEP_PFB_NACK_BIT 0x10
+#define DIGITAL_NFC_DEP_PFB_DID_BIT 0x04
+
+#define DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb) \
+ ((pfb) & DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT)
+#define DIGITAL_NFC_DEP_MI_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_MI_BIT)
+#define DIGITAL_NFC_DEP_NACK_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_NACK_BIT)
+#define DIGITAL_NFC_DEP_NAD_BIT_SET(pfb) ((pfb) & 0x08)
+#define DIGITAL_NFC_DEP_DID_BIT_SET(pfb) ((pfb) & DIGITAL_NFC_DEP_PFB_DID_BIT)
+#define DIGITAL_NFC_DEP_PFB_PNI(pfb) ((pfb) & 0x03)
+
+#define DIGITAL_NFC_DEP_PFB_I_PDU 0x00
+#define DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU 0x40
+#define DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU 0x80
+
+struct digital_atr_req {
+ u8 dir;
+ u8 cmd;
+ u8 nfcid3[10];
+ u8 did;
+ u8 bs;
+ u8 br;
+ u8 pp;
+ u8 gb[0];
+} __packed;
+
+struct digital_atr_res {
+ u8 dir;
+ u8 cmd;
+ u8 nfcid3[10];
+ u8 did;
+ u8 bs;
+ u8 br;
+ u8 to;
+ u8 pp;
+ u8 gb[0];
+} __packed;
+
+struct digital_psl_req {
+ u8 dir;
+ u8 cmd;
+ u8 did;
+ u8 brs;
+ u8 fsl;
+} __packed;
+
+struct digital_psl_res {
+ u8 dir;
+ u8 cmd;
+ u8 did;
+} __packed;
+
+struct digital_dep_req_res {
+ u8 dir;
+ u8 cmd;
+ u8 pfb;
+} __packed;
+
+static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp);
+
+static const u8 digital_payload_bits_map[4] = {
+ [0] = 64,
+ [1] = 128,
+ [2] = 192,
+ [3] = 254
+};
+
+static u8 digital_payload_bits_to_size(u8 payload_bits)
+{
+ if (payload_bits >= ARRAY_SIZE(digital_payload_bits_map))
+ return 0;
+
+ return digital_payload_bits_map[payload_bits];
+}
+
+static u8 digital_payload_size_to_bits(u8 payload_size)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(digital_payload_bits_map); i++)
+ if (digital_payload_bits_map[i] == payload_size)
+ return i;
+
+ return 0xff;
+}
+
+static void digital_skb_push_dep_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb)
+{
+ skb_push(skb, sizeof(u8));
+
+ skb->data[0] = skb->len;
+
+ if (ddev->curr_rf_tech == NFC_DIGITAL_RF_TECH_106A)
+ *skb_push(skb, sizeof(u8)) = DIGITAL_NFC_DEP_NFCA_SOD_SB;
+}
+
+static int digital_skb_pull_dep_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb)
+{
+ u8 size;
+
+ if (skb->len < 2)
+ return -EIO;
+
+ if (ddev->curr_rf_tech == NFC_DIGITAL_RF_TECH_106A)
+ skb_pull(skb, sizeof(u8));
+
+ size = skb->data[0];
+ if (size != skb->len)
+ return -EIO;
+
+ skb_pull(skb, sizeof(u8));
+
+ return 0;
+}
+
+static struct sk_buff *
+digital_send_dep_data_prep(struct nfc_digital_dev *ddev, struct sk_buff *skb,
+ struct digital_dep_req_res *dep_req_res,
+ struct digital_data_exch *data_exch)
+{
+ struct sk_buff *new_skb;
+
+ if (skb->len > ddev->remote_payload_max) {
+ dep_req_res->pfb |= DIGITAL_NFC_DEP_PFB_MI_BIT;
+
+ new_skb = digital_skb_alloc(ddev, ddev->remote_payload_max);
+ if (!new_skb) {
+ kfree_skb(ddev->chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ return ERR_PTR(-ENOMEM);
+ }
+
+ skb_reserve(new_skb, ddev->tx_headroom + NFC_HEADER_SIZE +
+ DIGITAL_NFC_DEP_REQ_RES_HEADROOM);
+ memcpy(skb_put(new_skb, ddev->remote_payload_max), skb->data,
+ ddev->remote_payload_max);
+ skb_pull(skb, ddev->remote_payload_max);
+
+ ddev->chaining_skb = skb;
+ ddev->data_exch = data_exch;
+ } else {
+ ddev->chaining_skb = NULL;
+ new_skb = skb;
+ }
+
+ return new_skb;
+}
+
+static struct sk_buff *
+digital_recv_dep_data_gather(struct nfc_digital_dev *ddev, u8 pfb,
+ struct sk_buff *resp,
+ int (*send_ack)(struct nfc_digital_dev *ddev,
+ struct digital_data_exch
+ *data_exch),
+ struct digital_data_exch *data_exch)
+{
+ struct sk_buff *new_skb;
+ int rc;
+
+ if (DIGITAL_NFC_DEP_MI_BIT_SET(pfb) && (!ddev->chaining_skb)) {
+ ddev->chaining_skb =
+ nfc_alloc_recv_skb(8 * ddev->local_payload_max,
+ GFP_KERNEL);
+ if (!ddev->chaining_skb) {
+ rc = -ENOMEM;
+ goto error;
+ }
+ }
+
+ if (ddev->chaining_skb) {
+ if (resp->len > skb_tailroom(ddev->chaining_skb)) {
+ new_skb = skb_copy_expand(ddev->chaining_skb,
+ skb_headroom(
+ ddev->chaining_skb),
+ 8 * ddev->local_payload_max,
+ GFP_KERNEL);
+ if (!new_skb) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ kfree_skb(ddev->chaining_skb);
+ ddev->chaining_skb = new_skb;
+ }
+
+ memcpy(skb_put(ddev->chaining_skb, resp->len), resp->data,
+ resp->len);
+
+ kfree_skb(resp);
+ resp = NULL;
+
+ if (DIGITAL_NFC_DEP_MI_BIT_SET(pfb)) {
+ rc = send_ack(ddev, data_exch);
+ if (rc)
+ goto error;
+
+ return NULL;
+ }
+
+ resp = ddev->chaining_skb;
+ ddev->chaining_skb = NULL;
+ }
+
+ return resp;
+
+error:
+ kfree_skb(resp);
+
+ kfree_skb(ddev->chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ return ERR_PTR(rc);
+}
+
+static void digital_in_recv_psl_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ struct digital_psl_res *psl_res;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.6");
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ psl_res = (struct digital_psl_res *)resp->data;
+
+ if ((resp->len != sizeof(*psl_res)) ||
+ (psl_res->dir != DIGITAL_NFC_DEP_FRAME_DIR_IN) ||
+ (psl_res->cmd != DIGITAL_CMD_PSL_RES)) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH,
+ NFC_DIGITAL_RF_TECH_424F);
+ if (rc)
+ goto exit;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCF_NFC_DEP);
+ if (rc)
+ goto exit;
+
+ if (!DIGITAL_DRV_CAPS_IN_CRC(ddev) &&
+ (ddev->curr_rf_tech == NFC_DIGITAL_RF_TECH_106A)) {
+ ddev->skb_add_crc = digital_skb_add_crc_f;
+ ddev->skb_check_crc = digital_skb_check_crc_f;
+ }
+
+ ddev->curr_rf_tech = NFC_DIGITAL_RF_TECH_424F;
+
+ nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE,
+ NFC_RF_INITIATOR);
+
+ ddev->curr_nfc_dep_pni = 0;
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc)
+ ddev->curr_protocol = 0;
+}
+
+static int digital_in_send_psl_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target)
+{
+ struct sk_buff *skb;
+ struct digital_psl_req *psl_req;
+ int rc;
+ u8 payload_size, payload_bits;
+
+ skb = digital_skb_alloc(ddev, sizeof(*psl_req));
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(*psl_req));
+
+ psl_req = (struct digital_psl_req *)skb->data;
+
+ psl_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ psl_req->cmd = DIGITAL_CMD_PSL_REQ;
+ psl_req->did = 0;
+ psl_req->brs = (0x2 << 3) | 0x2; /* 424F both directions */
+
+ payload_size = min(ddev->local_payload_max, ddev->remote_payload_max);
+ payload_bits = digital_payload_size_to_bits(payload_size);
+ psl_req->fsl = DIGITAL_PAYLOAD_BITS_TO_FSL(payload_bits);
+
+ ddev->local_payload_max = payload_size;
+ ddev->remote_payload_max = payload_size;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_psl_res,
+ target);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_atr_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ struct digital_atr_res *atr_res;
+ u8 gb_len, payload_bits;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.6");
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ if (resp->len < sizeof(struct digital_atr_res)) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ gb_len = resp->len - sizeof(struct digital_atr_res);
+
+ atr_res = (struct digital_atr_res *)resp->data;
+
+ payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_res->pp);
+ ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits);
+
+ if (!ddev->remote_payload_max) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = nfc_set_remote_general_bytes(ddev->nfc_dev, atr_res->gb, gb_len);
+ if (rc)
+ goto exit;
+
+ if ((ddev->protocols & NFC_PROTO_FELICA_MASK) &&
+ (ddev->curr_rf_tech != NFC_DIGITAL_RF_TECH_424F)) {
+ rc = digital_in_send_psl_req(ddev, target);
+ if (!rc)
+ goto exit;
+ }
+
+ rc = nfc_dep_link_is_up(ddev->nfc_dev, target->idx, NFC_COMM_ACTIVE,
+ NFC_RF_INITIATOR);
+
+ ddev->curr_nfc_dep_pni = 0;
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc)
+ ddev->curr_protocol = 0;
+}
+
+int digital_in_send_atr_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, __u8 comm_mode, __u8 *gb,
+ size_t gb_len)
+{
+ struct sk_buff *skb;
+ struct digital_atr_req *atr_req;
+ uint size;
+ int rc;
+ u8 payload_bits;
+
+ size = DIGITAL_ATR_REQ_MIN_SIZE + gb_len;
+
+ if (size > DIGITAL_ATR_REQ_MAX_SIZE) {
+ PROTOCOL_ERR("14.6.1.1");
+ return -EINVAL;
+ }
+
+ skb = digital_skb_alloc(ddev, size);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(struct digital_atr_req));
+
+ atr_req = (struct digital_atr_req *)skb->data;
+ memset(atr_req, 0, sizeof(struct digital_atr_req));
+
+ atr_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ atr_req->cmd = DIGITAL_CMD_ATR_REQ;
+ if (target->nfcid2_len)
+ memcpy(atr_req->nfcid3, target->nfcid2, NFC_NFCID2_MAXSIZE);
+ else
+ get_random_bytes(atr_req->nfcid3, NFC_NFCID3_MAXSIZE);
+
+ atr_req->did = 0;
+ atr_req->bs = 0;
+ atr_req->br = 0;
+
+ ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX;
+ payload_bits = digital_payload_size_to_bits(ddev->local_payload_max);
+ atr_req->pp = DIGITAL_PAYLOAD_BITS_TO_PP(payload_bits);
+
+ if (gb_len) {
+ atr_req->pp |= DIGITAL_GB_BIT;
+ memcpy(skb_put(skb, gb_len), gb, gb_len);
+ }
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 500, digital_in_recv_atr_res,
+ target);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static int digital_in_send_ack(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch)
+{
+ struct digital_dep_req_res *dep_req;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_req = (struct digital_dep_req_res *)skb->data;
+
+ dep_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ dep_req->cmd = DIGITAL_CMD_DEP_REQ;
+ dep_req->pfb = DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU |
+ ddev->curr_nfc_dep_pni;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ ddev->saved_skb = skb_get(skb);
+ ddev->saved_skb_len = skb->len;
+
+ rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
+ data_exch);
+ if (rc) {
+ kfree_skb(skb);
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+ }
+
+ return rc;
+}
+
+static int digital_in_send_nack(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch)
+{
+ struct digital_dep_req_res *dep_req;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_req = (struct digital_dep_req_res *)skb->data;
+
+ dep_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ dep_req->cmd = DIGITAL_CMD_DEP_REQ;
+ dep_req->pfb = DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU |
+ DIGITAL_NFC_DEP_PFB_NACK_BIT | ddev->curr_nfc_dep_pni;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
+ data_exch);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static int digital_in_send_atn(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch)
+{
+ struct digital_dep_req_res *dep_req;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_req = (struct digital_dep_req_res *)skb->data;
+
+ dep_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ dep_req->cmd = DIGITAL_CMD_DEP_REQ;
+ dep_req->pfb = DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
+ data_exch);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static int digital_in_send_rtox(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch, u8 rtox)
+{
+ struct digital_dep_req_res *dep_req;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ *skb_put(skb, 1) = rtox;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_req = (struct digital_dep_req_res *)skb->data;
+
+ dep_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ dep_req->cmd = DIGITAL_CMD_DEP_REQ;
+ dep_req->pfb = DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU |
+ DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ ddev->saved_skb = skb_get(skb);
+ ddev->saved_skb_len = skb->len;
+
+ rc = digital_in_send_cmd(ddev, skb, 1500, digital_in_recv_dep_res,
+ data_exch);
+ if (rc) {
+ kfree_skb(skb);
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+ }
+
+ return rc;
+}
+
+static int digital_in_send_saved_skb(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch)
+{
+ skb_get(ddev->saved_skb);
+ skb_push(ddev->saved_skb, ddev->saved_skb_len);
+
+ return digital_in_send_cmd(ddev, ddev->saved_skb, 1500,
+ digital_in_recv_dep_res, data_exch);
+}
+
+static void digital_in_recv_dep_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct digital_data_exch *data_exch = arg;
+ struct digital_dep_req_res *dep_res;
+ u8 pfb;
+ uint size;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+
+ if (((rc != -ETIMEDOUT) || ddev->nack_count) &&
+ (ddev->nack_count++ < DIGITAL_NFC_DEP_N_RETRY_NACK)) {
+ ddev->atn_count = 0;
+
+ rc = digital_in_send_nack(ddev, data_exch);
+ if (rc)
+ goto error;
+
+ return;
+ } else if ((rc == -ETIMEDOUT) &&
+ (ddev->atn_count++ < DIGITAL_NFC_DEP_N_RETRY_ATN)) {
+ ddev->nack_count = 0;
+
+ rc = digital_in_send_atn(ddev, data_exch);
+ if (rc)
+ goto error;
+
+ return;
+ }
+
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ if ((resp->len >= 4) &&
+ (ddev->nack_count++ < DIGITAL_NFC_DEP_N_RETRY_NACK)) {
+ ddev->atn_count = 0;
+
+ rc = digital_in_send_nack(ddev, data_exch);
+ if (rc)
+ goto error;
+
+ kfree_skb(resp);
+
+ return;
+ }
+
+ PROTOCOL_ERR("14.4.1.6");
+ goto error;
+ }
+
+ ddev->atn_count = 0;
+ ddev->nack_count = 0;
+
+ if (resp->len > ddev->local_payload_max) {
+ rc = -EMSGSIZE;
+ goto exit;
+ }
+
+ size = sizeof(struct digital_dep_req_res);
+ dep_res = (struct digital_dep_req_res *)resp->data;
+
+ if (resp->len < size || dep_res->dir != DIGITAL_NFC_DEP_FRAME_DIR_IN ||
+ dep_res->cmd != DIGITAL_CMD_DEP_RES) {
+ rc = -EIO;
+ goto error;
+ }
+
+ pfb = dep_res->pfb;
+
+ if (DIGITAL_NFC_DEP_DID_BIT_SET(pfb)) {
+ PROTOCOL_ERR("14.8.2.1");
+ rc = -EIO;
+ goto error;
+ }
+
+ if (DIGITAL_NFC_DEP_NAD_BIT_SET(pfb)) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (size > resp->len) {
+ rc = -EIO;
+ goto error;
+ }
+
+ skb_pull(resp, size);
+
+ switch (DIGITAL_NFC_DEP_PFB_TYPE(pfb)) {
+ case DIGITAL_NFC_DEP_PFB_I_PDU:
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) {
+ PROTOCOL_ERR("14.12.3.3");
+ rc = -EIO;
+ goto error;
+ }
+
+ ddev->curr_nfc_dep_pni =
+ DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1);
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ resp = digital_recv_dep_data_gather(ddev, pfb, resp,
+ digital_in_send_ack,
+ data_exch);
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto error;
+ }
+
+ /* If resp is NULL then we're still chaining so return and
+ * wait for the next part of the PDU. Else, the PDU is
+ * complete so pass it up.
+ */
+ if (!resp)
+ return;
+
+ rc = 0;
+ break;
+
+ case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU:
+ if (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni) {
+ PROTOCOL_ERR("14.12.3.3");
+ rc = -EIO;
+ goto exit;
+ }
+
+ ddev->curr_nfc_dep_pni =
+ DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1);
+
+ if (ddev->chaining_skb && !DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) {
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ rc = digital_in_send_dep_req(ddev, NULL,
+ ddev->chaining_skb,
+ ddev->data_exch);
+ if (rc)
+ goto error;
+
+ return;
+ }
+
+ pr_err("Received a ACK/NACK PDU\n");
+ rc = -EINVAL;
+ goto exit;
+
+ case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU:
+ if (!DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) { /* ATN */
+ rc = digital_in_send_saved_skb(ddev, data_exch);
+ if (rc) {
+ kfree_skb(ddev->saved_skb);
+ goto error;
+ }
+
+ return;
+ }
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ rc = digital_in_send_rtox(ddev, data_exch, resp->data[0]);
+ if (rc)
+ goto error;
+
+ kfree_skb(resp);
+ return;
+ }
+
+exit:
+ data_exch->cb(data_exch->cb_context, resp, rc);
+
+error:
+ kfree(data_exch);
+
+ kfree_skb(ddev->chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ if (rc)
+ kfree_skb(resp);
+}
+
+int digital_in_send_dep_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target, struct sk_buff *skb,
+ struct digital_data_exch *data_exch)
+{
+ struct digital_dep_req_res *dep_req;
+ struct sk_buff *chaining_skb, *tmp_skb;
+ int rc;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_req = (struct digital_dep_req_res *)skb->data;
+
+ dep_req->dir = DIGITAL_NFC_DEP_FRAME_DIR_OUT;
+ dep_req->cmd = DIGITAL_CMD_DEP_REQ;
+ dep_req->pfb = ddev->curr_nfc_dep_pni;
+
+ ddev->atn_count = 0;
+ ddev->nack_count = 0;
+
+ chaining_skb = ddev->chaining_skb;
+
+ tmp_skb = digital_send_dep_data_prep(ddev, skb, dep_req, data_exch);
+ if (IS_ERR(tmp_skb))
+ return PTR_ERR(tmp_skb);
+
+ digital_skb_push_dep_sod(ddev, tmp_skb);
+
+ ddev->skb_add_crc(tmp_skb);
+
+ ddev->saved_skb = skb_get(tmp_skb);
+ ddev->saved_skb_len = tmp_skb->len;
+
+ rc = digital_in_send_cmd(ddev, tmp_skb, 1500, digital_in_recv_dep_res,
+ data_exch);
+ if (rc) {
+ if (tmp_skb != skb)
+ kfree_skb(tmp_skb);
+
+ kfree_skb(chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+ }
+
+ return rc;
+}
+
+static void digital_tg_set_rf_tech(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ ddev->curr_rf_tech = rf_tech;
+
+ ddev->skb_add_crc = digital_skb_add_crc_none;
+ ddev->skb_check_crc = digital_skb_check_crc_none;
+
+ if (DIGITAL_DRV_CAPS_TG_CRC(ddev))
+ return;
+
+ switch (ddev->curr_rf_tech) {
+ case NFC_DIGITAL_RF_TECH_106A:
+ ddev->skb_add_crc = digital_skb_add_crc_a;
+ ddev->skb_check_crc = digital_skb_check_crc_a;
+ break;
+
+ case NFC_DIGITAL_RF_TECH_212F:
+ case NFC_DIGITAL_RF_TECH_424F:
+ ddev->skb_add_crc = digital_skb_add_crc_f;
+ ddev->skb_check_crc = digital_skb_check_crc_f;
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int digital_tg_send_ack(struct nfc_digital_dev *ddev,
+ struct digital_data_exch *data_exch)
+{
+ struct digital_dep_req_res *dep_res;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_res = (struct digital_dep_req_res *)skb->data;
+
+ dep_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
+ dep_res->cmd = DIGITAL_CMD_DEP_RES;
+ dep_res->pfb = DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU |
+ ddev->curr_nfc_dep_pni;
+
+ if (ddev->did) {
+ dep_res->pfb |= DIGITAL_NFC_DEP_PFB_DID_BIT;
+
+ memcpy(skb_put(skb, sizeof(ddev->did)), &ddev->did,
+ sizeof(ddev->did));
+ }
+
+ ddev->curr_nfc_dep_pni =
+ DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1);
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ ddev->saved_skb = skb_get(skb);
+ ddev->saved_skb_len = skb->len;
+
+ rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req,
+ data_exch);
+ if (rc) {
+ kfree_skb(skb);
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+ }
+
+ return rc;
+}
+
+static int digital_tg_send_atn(struct nfc_digital_dev *ddev)
+{
+ struct digital_dep_req_res *dep_res;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_res = (struct digital_dep_req_res *)skb->data;
+
+ dep_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
+ dep_res->cmd = DIGITAL_CMD_DEP_RES;
+ dep_res->pfb = DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU;
+
+ if (ddev->did) {
+ dep_res->pfb |= DIGITAL_NFC_DEP_PFB_DID_BIT;
+
+ memcpy(skb_put(skb, sizeof(ddev->did)), &ddev->did,
+ sizeof(ddev->did));
+ }
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ rc = digital_tg_send_cmd(ddev, skb, 1500, digital_tg_recv_dep_req,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static int digital_tg_send_saved_skb(struct nfc_digital_dev *ddev)
+{
+ skb_get(ddev->saved_skb);
+ skb_push(ddev->saved_skb, ddev->saved_skb_len);
+
+ return digital_tg_send_cmd(ddev, ddev->saved_skb, 1500,
+ digital_tg_recv_dep_req, NULL);
+}
+
+static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ int rc;
+ struct digital_dep_req_res *dep_req;
+ u8 pfb;
+ size_t size;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.6");
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ if (resp->len > ddev->local_payload_max) {
+ rc = -EMSGSIZE;
+ goto exit;
+ }
+
+ size = sizeof(struct digital_dep_req_res);
+ dep_req = (struct digital_dep_req_res *)resp->data;
+
+ if (resp->len < size || dep_req->dir != DIGITAL_NFC_DEP_FRAME_DIR_OUT ||
+ dep_req->cmd != DIGITAL_CMD_DEP_REQ) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ pfb = dep_req->pfb;
+
+ if (DIGITAL_NFC_DEP_DID_BIT_SET(pfb)) {
+ if (ddev->did && (ddev->did == resp->data[3])) {
+ size++;
+ } else {
+ rc = -EIO;
+ goto exit;
+ }
+ } else if (ddev->did) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (DIGITAL_NFC_DEP_NAD_BIT_SET(pfb)) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (size > resp->len) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ skb_pull(resp, size);
+
+ switch (DIGITAL_NFC_DEP_PFB_TYPE(pfb)) {
+ case DIGITAL_NFC_DEP_PFB_I_PDU:
+ pr_debug("DIGITAL_NFC_DEP_PFB_I_PDU\n");
+
+ if ((ddev->atn_count && (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) !=
+ ddev->curr_nfc_dep_pni)) ||
+ (DIGITAL_NFC_DEP_PFB_PNI(pfb) != ddev->curr_nfc_dep_pni)) {
+ PROTOCOL_ERR("14.12.3.4");
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (ddev->atn_count) {
+ ddev->atn_count = 0;
+
+ rc = digital_tg_send_saved_skb(ddev);
+ if (rc)
+ goto exit;
+
+ return;
+ }
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ resp = digital_recv_dep_data_gather(ddev, pfb, resp,
+ digital_tg_send_ack, NULL);
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ /* If resp is NULL then we're still chaining so return and
+ * wait for the next part of the PDU. Else, the PDU is
+ * complete so pass it up.
+ */
+ if (!resp)
+ return;
+
+ rc = 0;
+ break;
+ case DIGITAL_NFC_DEP_PFB_ACK_NACK_PDU:
+ if (!DIGITAL_NFC_DEP_NACK_BIT_SET(pfb)) { /* ACK */
+ if ((ddev->atn_count &&
+ (DIGITAL_NFC_DEP_PFB_PNI(pfb - 1) !=
+ ddev->curr_nfc_dep_pni)) ||
+ (DIGITAL_NFC_DEP_PFB_PNI(pfb) !=
+ ddev->curr_nfc_dep_pni) ||
+ !ddev->chaining_skb || !ddev->saved_skb) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (ddev->atn_count) {
+ ddev->atn_count = 0;
+
+ rc = digital_tg_send_saved_skb(ddev);
+ if (rc)
+ goto exit;
+
+ return;
+ }
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ rc = digital_tg_send_dep_res(ddev, ddev->chaining_skb);
+ if (rc)
+ goto exit;
+ } else { /* NACK */
+ if ((DIGITAL_NFC_DEP_PFB_PNI(pfb + 1) !=
+ ddev->curr_nfc_dep_pni) ||
+ !ddev->saved_skb) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ ddev->atn_count = 0;
+
+ rc = digital_tg_send_saved_skb(ddev);
+ if (rc) {
+ kfree_skb(ddev->saved_skb);
+ goto exit;
+ }
+ }
+
+ return;
+ case DIGITAL_NFC_DEP_PFB_SUPERVISOR_PDU:
+ if (DIGITAL_NFC_DEP_PFB_IS_TIMEOUT(pfb)) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = digital_tg_send_atn(ddev);
+ if (rc)
+ goto exit;
+
+ ddev->atn_count++;
+
+ kfree_skb(resp);
+ return;
+ }
+
+ rc = nfc_tm_data_received(ddev->nfc_dev, resp);
+
+exit:
+ kfree_skb(ddev->chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ ddev->atn_count = 0;
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+
+ if (rc)
+ kfree_skb(resp);
+}
+
+int digital_tg_send_dep_res(struct nfc_digital_dev *ddev, struct sk_buff *skb)
+{
+ struct digital_dep_req_res *dep_res;
+ struct sk_buff *chaining_skb, *tmp_skb;
+ int rc;
+
+ skb_push(skb, sizeof(struct digital_dep_req_res));
+
+ dep_res = (struct digital_dep_req_res *)skb->data;
+
+ dep_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
+ dep_res->cmd = DIGITAL_CMD_DEP_RES;
+ dep_res->pfb = ddev->curr_nfc_dep_pni;
+
+ if (ddev->did) {
+ dep_res->pfb |= DIGITAL_NFC_DEP_PFB_DID_BIT;
+
+ memcpy(skb_put(skb, sizeof(ddev->did)), &ddev->did,
+ sizeof(ddev->did));
+ }
+
+ ddev->curr_nfc_dep_pni =
+ DIGITAL_NFC_DEP_PFB_PNI(ddev->curr_nfc_dep_pni + 1);
+
+ chaining_skb = ddev->chaining_skb;
+
+ tmp_skb = digital_send_dep_data_prep(ddev, skb, dep_res, NULL);
+ if (IS_ERR(tmp_skb))
+ return PTR_ERR(tmp_skb);
+
+ digital_skb_push_dep_sod(ddev, tmp_skb);
+
+ ddev->skb_add_crc(tmp_skb);
+
+ ddev->saved_skb = skb_get(tmp_skb);
+ ddev->saved_skb_len = tmp_skb->len;
+
+ rc = digital_tg_send_cmd(ddev, tmp_skb, 1500, digital_tg_recv_dep_req,
+ NULL);
+ if (rc) {
+ if (tmp_skb != skb)
+ kfree_skb(tmp_skb);
+
+ kfree_skb(chaining_skb);
+ ddev->chaining_skb = NULL;
+
+ kfree_skb(ddev->saved_skb);
+ ddev->saved_skb = NULL;
+ }
+
+ return rc;
+}
+
+static void digital_tg_send_psl_res_complete(struct nfc_digital_dev *ddev,
+ void *arg, struct sk_buff *resp)
+{
+ u8 rf_tech = (unsigned long)arg;
+
+ if (IS_ERR(resp))
+ return;
+
+ digital_tg_set_rf_tech(ddev, rf_tech);
+
+ digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech);
+
+ digital_tg_listen(ddev, 1500, digital_tg_recv_dep_req, NULL);
+
+ dev_kfree_skb(resp);
+}
+
+static int digital_tg_send_psl_res(struct nfc_digital_dev *ddev, u8 did,
+ u8 rf_tech)
+{
+ struct digital_psl_res *psl_res;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, sizeof(struct digital_psl_res));
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(struct digital_psl_res));
+
+ psl_res = (struct digital_psl_res *)skb->data;
+
+ psl_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
+ psl_res->cmd = DIGITAL_CMD_PSL_RES;
+ psl_res->did = did;
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ ddev->curr_nfc_dep_pni = 0;
+
+ rc = digital_tg_send_cmd(ddev, skb, 0, digital_tg_send_psl_res_complete,
+ (void *)(unsigned long)rf_tech);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_tg_recv_psl_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ int rc;
+ struct digital_psl_req *psl_req;
+ u8 rf_tech;
+ u8 dsi, payload_size, payload_bits;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.6");
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ psl_req = (struct digital_psl_req *)resp->data;
+
+ if (resp->len != sizeof(struct digital_psl_req) ||
+ psl_req->dir != DIGITAL_NFC_DEP_FRAME_DIR_OUT ||
+ psl_req->cmd != DIGITAL_CMD_PSL_REQ) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ dsi = (psl_req->brs >> 3) & 0x07;
+ switch (dsi) {
+ case 0:
+ rf_tech = NFC_DIGITAL_RF_TECH_106A;
+ break;
+ case 1:
+ rf_tech = NFC_DIGITAL_RF_TECH_212F;
+ break;
+ case 2:
+ rf_tech = NFC_DIGITAL_RF_TECH_424F;
+ break;
+ default:
+ pr_err("Unsupported dsi value %d\n", dsi);
+ goto exit;
+ }
+
+ payload_bits = DIGITAL_PAYLOAD_FSL_TO_BITS(psl_req->fsl);
+ payload_size = digital_payload_bits_to_size(payload_bits);
+
+ if (!payload_size || (payload_size > min(ddev->local_payload_max,
+ ddev->remote_payload_max))) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ ddev->local_payload_max = payload_size;
+ ddev->remote_payload_max = payload_size;
+
+ rc = digital_tg_send_psl_res(ddev, psl_req->did, rf_tech);
+
+exit:
+ kfree_skb(resp);
+}
+
+static void digital_tg_send_atr_res_complete(struct nfc_digital_dev *ddev,
+ void *arg, struct sk_buff *resp)
+{
+ int offset;
+
+ if (IS_ERR(resp)) {
+ digital_poll_next_tech(ddev);
+ return;
+ }
+
+ offset = 2;
+ if (resp->data[0] == DIGITAL_NFC_DEP_NFCA_SOD_SB)
+ offset++;
+
+ ddev->atn_count = 0;
+
+ if (resp->data[offset] == DIGITAL_CMD_PSL_REQ)
+ digital_tg_recv_psl_req(ddev, arg, resp);
+ else
+ digital_tg_recv_dep_req(ddev, arg, resp);
+}
+
+static int digital_tg_send_atr_res(struct nfc_digital_dev *ddev,
+ struct digital_atr_req *atr_req)
+{
+ struct digital_atr_res *atr_res;
+ struct sk_buff *skb;
+ u8 *gb, payload_bits;
+ size_t gb_len;
+ int rc;
+
+ gb = nfc_get_local_general_bytes(ddev->nfc_dev, &gb_len);
+ if (!gb)
+ gb_len = 0;
+
+ skb = digital_skb_alloc(ddev, sizeof(struct digital_atr_res) + gb_len);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(struct digital_atr_res));
+ atr_res = (struct digital_atr_res *)skb->data;
+
+ memset(atr_res, 0, sizeof(struct digital_atr_res));
+
+ atr_res->dir = DIGITAL_NFC_DEP_FRAME_DIR_IN;
+ atr_res->cmd = DIGITAL_CMD_ATR_RES;
+ memcpy(atr_res->nfcid3, atr_req->nfcid3, sizeof(atr_req->nfcid3));
+ atr_res->to = 8;
+
+ ddev->local_payload_max = DIGITAL_PAYLOAD_SIZE_MAX;
+ payload_bits = digital_payload_size_to_bits(ddev->local_payload_max);
+ atr_res->pp = DIGITAL_PAYLOAD_BITS_TO_PP(payload_bits);
+
+ if (gb_len) {
+ skb_put(skb, gb_len);
+
+ atr_res->pp |= DIGITAL_GB_BIT;
+ memcpy(atr_res->gb, gb, gb_len);
+ }
+
+ digital_skb_push_dep_sod(ddev, skb);
+
+ ddev->skb_add_crc(skb);
+
+ ddev->curr_nfc_dep_pni = 0;
+
+ rc = digital_tg_send_cmd(ddev, skb, 999,
+ digital_tg_send_atr_res_complete, NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+void digital_tg_recv_atr_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ int rc;
+ struct digital_atr_req *atr_req;
+ size_t gb_len, min_size;
+ u8 poll_tech_count, payload_bits;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (!resp->len) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (resp->data[0] == DIGITAL_NFC_DEP_NFCA_SOD_SB) {
+ min_size = DIGITAL_ATR_REQ_MIN_SIZE + 2;
+ digital_tg_set_rf_tech(ddev, NFC_DIGITAL_RF_TECH_106A);
+ } else {
+ min_size = DIGITAL_ATR_REQ_MIN_SIZE + 1;
+ digital_tg_set_rf_tech(ddev, NFC_DIGITAL_RF_TECH_212F);
+ }
+
+ if (resp->len < min_size) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ ddev->curr_protocol = NFC_PROTO_NFC_DEP_MASK;
+
+ rc = ddev->skb_check_crc(resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.6");
+ goto exit;
+ }
+
+ rc = digital_skb_pull_dep_sod(ddev, resp);
+ if (rc) {
+ PROTOCOL_ERR("14.4.1.2");
+ goto exit;
+ }
+
+ atr_req = (struct digital_atr_req *)resp->data;
+
+ if (atr_req->dir != DIGITAL_NFC_DEP_FRAME_DIR_OUT ||
+ atr_req->cmd != DIGITAL_CMD_ATR_REQ ||
+ atr_req->did > DIGITAL_DID_MAX) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ payload_bits = DIGITAL_PAYLOAD_PP_TO_BITS(atr_req->pp);
+ ddev->remote_payload_max = digital_payload_bits_to_size(payload_bits);
+
+ if (!ddev->remote_payload_max) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ ddev->did = atr_req->did;
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFC_DEP_ACTIVATED);
+ if (rc)
+ goto exit;
+
+ rc = digital_tg_send_atr_res(ddev, atr_req);
+ if (rc)
+ goto exit;
+
+ gb_len = resp->len - sizeof(struct digital_atr_req);
+
+ poll_tech_count = ddev->poll_tech_count;
+ ddev->poll_tech_count = 0;
+
+ rc = nfc_tm_activated(ddev->nfc_dev, NFC_PROTO_NFC_DEP_MASK,
+ NFC_COMM_PASSIVE, atr_req->gb, gb_len);
+ if (rc) {
+ ddev->poll_tech_count = poll_tech_count;
+ goto exit;
+ }
+
+ rc = 0;
+exit:
+ if (rc)
+ digital_poll_next_tech(ddev);
+
+ dev_kfree_skb(resp);
+}
diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c
new file mode 100644
index 000000000..fb58ed2dd
--- /dev/null
+++ b/net/nfc/digital_technology.c
@@ -0,0 +1,1315 @@
+/*
+ * NFC Digital Protocol stack
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) "digital: %s: " fmt, __func__
+
+#include "digital.h"
+
+#define DIGITAL_CMD_SENS_REQ 0x26
+#define DIGITAL_CMD_ALL_REQ 0x52
+#define DIGITAL_CMD_SEL_REQ_CL1 0x93
+#define DIGITAL_CMD_SEL_REQ_CL2 0x95
+#define DIGITAL_CMD_SEL_REQ_CL3 0x97
+
+#define DIGITAL_SDD_REQ_SEL_PAR 0x20
+
+#define DIGITAL_SDD_RES_CT 0x88
+#define DIGITAL_SDD_RES_LEN 5
+
+#define DIGITAL_SEL_RES_NFCID1_COMPLETE(sel_res) (!((sel_res) & 0x04))
+#define DIGITAL_SEL_RES_IS_T2T(sel_res) (!((sel_res) & 0x60))
+#define DIGITAL_SEL_RES_IS_T4T(sel_res) ((sel_res) & 0x20)
+#define DIGITAL_SEL_RES_IS_NFC_DEP(sel_res) ((sel_res) & 0x40)
+
+#define DIGITAL_SENS_RES_IS_T1T(sens_res) (((sens_res) & 0x0C00) == 0x0C00)
+#define DIGITAL_SENS_RES_IS_VALID(sens_res) \
+ ((!((sens_res) & 0x001F) && (((sens_res) & 0x0C00) == 0x0C00)) || \
+ (((sens_res) & 0x001F) && ((sens_res) & 0x0C00) != 0x0C00))
+
+#define DIGITAL_MIFARE_READ_RES_LEN 16
+#define DIGITAL_MIFARE_ACK_RES 0x0A
+
+#define DIGITAL_CMD_SENSB_REQ 0x05
+#define DIGITAL_SENSB_ADVANCED BIT(5)
+#define DIGITAL_SENSB_EXTENDED BIT(4)
+#define DIGITAL_SENSB_ALLB_REQ BIT(3)
+#define DIGITAL_SENSB_N(n) ((n) & 0x7)
+
+#define DIGITAL_CMD_SENSB_RES 0x50
+
+#define DIGITAL_CMD_ATTRIB_REQ 0x1D
+#define DIGITAL_ATTRIB_P1_TR0_DEFAULT (0x0 << 6)
+#define DIGITAL_ATTRIB_P1_TR1_DEFAULT (0x0 << 4)
+#define DIGITAL_ATTRIB_P1_SUPRESS_EOS BIT(3)
+#define DIGITAL_ATTRIB_P1_SUPRESS_SOS BIT(2)
+#define DIGITAL_ATTRIB_P2_LISTEN_POLL_1 (0x0 << 6)
+#define DIGITAL_ATTRIB_P2_POLL_LISTEN_1 (0x0 << 4)
+#define DIGITAL_ATTRIB_P2_MAX_FRAME_256 0x8
+#define DIGITAL_ATTRIB_P4_DID(n) ((n) & 0xf)
+
+#define DIGITAL_CMD_SENSF_REQ 0x00
+#define DIGITAL_CMD_SENSF_RES 0x01
+
+#define DIGITAL_SENSF_RES_MIN_LENGTH 17
+#define DIGITAL_SENSF_RES_RD_AP_B1 0x00
+#define DIGITAL_SENSF_RES_RD_AP_B2 0x8F
+
+#define DIGITAL_SENSF_REQ_RC_NONE 0
+#define DIGITAL_SENSF_REQ_RC_SC 1
+#define DIGITAL_SENSF_REQ_RC_AP 2
+
+#define DIGITAL_CMD_ISO15693_INVENTORY_REQ 0x01
+
+#define DIGITAL_ISO15693_REQ_FLAG_DATA_RATE BIT(1)
+#define DIGITAL_ISO15693_REQ_FLAG_INVENTORY BIT(2)
+#define DIGITAL_ISO15693_REQ_FLAG_NB_SLOTS BIT(5)
+#define DIGITAL_ISO15693_RES_FLAG_ERROR BIT(0)
+#define DIGITAL_ISO15693_RES_IS_VALID(flags) \
+ (!((flags) & DIGITAL_ISO15693_RES_FLAG_ERROR))
+
+#define DIGITAL_ISO_DEP_I_PCB 0x02
+#define DIGITAL_ISO_DEP_PNI(pni) ((pni) & 0x01)
+
+#define DIGITAL_ISO_DEP_PCB_TYPE(pcb) ((pcb) & 0xC0)
+
+#define DIGITAL_ISO_DEP_I_BLOCK 0x00
+
+#define DIGITAL_ISO_DEP_BLOCK_HAS_DID(pcb) ((pcb) & 0x08)
+
+static const u8 digital_ats_fsc[] = {
+ 16, 24, 32, 40, 48, 64, 96, 128,
+};
+
+#define DIGITAL_ATS_FSCI(t0) ((t0) & 0x0F)
+#define DIGITAL_SENSB_FSCI(pi2) (((pi2) & 0xF0) >> 4)
+#define DIGITAL_ATS_MAX_FSC 256
+
+#define DIGITAL_RATS_BYTE1 0xE0
+#define DIGITAL_RATS_PARAM 0x80
+
+struct digital_sdd_res {
+ u8 nfcid1[4];
+ u8 bcc;
+} __packed;
+
+struct digital_sel_req {
+ u8 sel_cmd;
+ u8 b2;
+ u8 nfcid1[4];
+ u8 bcc;
+} __packed;
+
+struct digital_sensb_req {
+ u8 cmd;
+ u8 afi;
+ u8 param;
+} __packed;
+
+struct digital_sensb_res {
+ u8 cmd;
+ u8 nfcid0[4];
+ u8 app_data[4];
+ u8 proto_info[3];
+} __packed;
+
+struct digital_attrib_req {
+ u8 cmd;
+ u8 nfcid0[4];
+ u8 param1;
+ u8 param2;
+ u8 param3;
+ u8 param4;
+} __packed;
+
+struct digital_attrib_res {
+ u8 mbli_did;
+} __packed;
+
+struct digital_sensf_req {
+ u8 cmd;
+ u8 sc1;
+ u8 sc2;
+ u8 rc;
+ u8 tsn;
+} __packed;
+
+struct digital_sensf_res {
+ u8 cmd;
+ u8 nfcid2[8];
+ u8 pad0[2];
+ u8 pad1[3];
+ u8 mrti_check;
+ u8 mrti_update;
+ u8 pad2;
+ u8 rd[2];
+} __packed;
+
+struct digital_iso15693_inv_req {
+ u8 flags;
+ u8 cmd;
+ u8 mask_len;
+ u64 mask;
+} __packed;
+
+struct digital_iso15693_inv_res {
+ u8 flags;
+ u8 dsfid;
+ u64 uid;
+} __packed;
+
+static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target);
+
+int digital_in_iso_dep_pull_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb)
+{
+ u8 pcb;
+ u8 block_type;
+
+ if (skb->len < 1)
+ return -EIO;
+
+ pcb = *skb->data;
+ block_type = DIGITAL_ISO_DEP_PCB_TYPE(pcb);
+
+ /* No support fo R-block nor S-block */
+ if (block_type != DIGITAL_ISO_DEP_I_BLOCK) {
+ pr_err("ISO_DEP R-block and S-block not supported\n");
+ return -EIO;
+ }
+
+ if (DIGITAL_ISO_DEP_BLOCK_HAS_DID(pcb)) {
+ pr_err("DID field in ISO_DEP PCB not supported\n");
+ return -EIO;
+ }
+
+ skb_pull(skb, 1);
+
+ return 0;
+}
+
+int digital_in_iso_dep_push_sod(struct nfc_digital_dev *ddev,
+ struct sk_buff *skb)
+{
+ /*
+ * Chaining not supported so skb->len + 1 PCB byte + 2 CRC bytes must
+ * not be greater than remote FSC
+ */
+ if (skb->len + 3 > ddev->target_fsc)
+ return -EIO;
+
+ skb_push(skb, 1);
+
+ *skb->data = DIGITAL_ISO_DEP_I_PCB | ddev->curr_nfc_dep_pni;
+
+ ddev->curr_nfc_dep_pni =
+ DIGITAL_ISO_DEP_PNI(ddev->curr_nfc_dep_pni + 1);
+
+ return 0;
+}
+
+static void digital_in_recv_ats(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ u8 fsdi;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len < 2) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ fsdi = DIGITAL_ATS_FSCI(resp->data[1]);
+ if (fsdi >= 8)
+ ddev->target_fsc = DIGITAL_ATS_MAX_FSC;
+ else
+ ddev->target_fsc = digital_ats_fsc[fsdi];
+
+ ddev->curr_nfc_dep_pni = 0;
+
+ rc = digital_target_found(ddev, target, NFC_PROTO_ISO14443);
+
+exit:
+ dev_kfree_skb(resp);
+ kfree(target);
+
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+static int digital_in_send_rats(struct nfc_digital_dev *ddev,
+ struct nfc_target *target)
+{
+ int rc;
+ struct sk_buff *skb;
+
+ skb = digital_skb_alloc(ddev, 2);
+ if (!skb)
+ return -ENOMEM;
+
+ *skb_put(skb, 1) = DIGITAL_RATS_BYTE1;
+ *skb_put(skb, 1) = DIGITAL_RATS_PARAM;
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_ats,
+ target);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_sel_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ int rc;
+ u8 sel_res;
+ u8 nfc_proto;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (!DIGITAL_DRV_CAPS_IN_CRC(ddev)) {
+ rc = digital_skb_check_crc_a(resp);
+ if (rc) {
+ PROTOCOL_ERR("4.4.1.3");
+ goto exit;
+ }
+ }
+
+ if (!resp->len) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ sel_res = resp->data[0];
+
+ if (!DIGITAL_SEL_RES_NFCID1_COMPLETE(sel_res)) {
+ rc = digital_in_send_sdd_req(ddev, target);
+ if (rc)
+ goto exit;
+
+ goto exit_free_skb;
+ }
+
+ target->sel_res = sel_res;
+
+ if (DIGITAL_SEL_RES_IS_T2T(sel_res)) {
+ nfc_proto = NFC_PROTO_MIFARE;
+ } else if (DIGITAL_SEL_RES_IS_NFC_DEP(sel_res)) {
+ nfc_proto = NFC_PROTO_NFC_DEP;
+ } else if (DIGITAL_SEL_RES_IS_T4T(sel_res)) {
+ rc = digital_in_send_rats(ddev, target);
+ if (rc)
+ goto exit;
+ /*
+ * Skip target_found and don't free it for now. This will be
+ * done when receiving the ATS
+ */
+ goto exit_free_skb;
+ } else {
+ rc = -EOPNOTSUPP;
+ goto exit;
+ }
+
+ rc = digital_target_found(ddev, target, nfc_proto);
+
+exit:
+ kfree(target);
+
+exit_free_skb:
+ dev_kfree_skb(resp);
+
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+static int digital_in_send_sel_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target,
+ struct digital_sdd_res *sdd_res)
+{
+ struct sk_buff *skb;
+ struct digital_sel_req *sel_req;
+ u8 sel_cmd;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, sizeof(struct digital_sel_req));
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(struct digital_sel_req));
+ sel_req = (struct digital_sel_req *)skb->data;
+
+ if (target->nfcid1_len <= 4)
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL1;
+ else if (target->nfcid1_len < 10)
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL2;
+ else
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL3;
+
+ sel_req->sel_cmd = sel_cmd;
+ sel_req->b2 = 0x70;
+ memcpy(sel_req->nfcid1, sdd_res->nfcid1, 4);
+ sel_req->bcc = sdd_res->bcc;
+
+ if (DIGITAL_DRV_CAPS_IN_CRC(ddev)) {
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A);
+ if (rc)
+ goto exit;
+ } else {
+ digital_skb_add_crc_a(skb);
+ }
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sel_res,
+ target);
+exit:
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_sdd_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ struct digital_sdd_res *sdd_res;
+ int rc;
+ u8 offset, size;
+ u8 i, bcc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len < DIGITAL_SDD_RES_LEN) {
+ PROTOCOL_ERR("4.7.2.8");
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ sdd_res = (struct digital_sdd_res *)resp->data;
+
+ for (i = 0, bcc = 0; i < 4; i++)
+ bcc ^= sdd_res->nfcid1[i];
+
+ if (bcc != sdd_res->bcc) {
+ PROTOCOL_ERR("4.7.2.6");
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ if (sdd_res->nfcid1[0] == DIGITAL_SDD_RES_CT) {
+ offset = 1;
+ size = 3;
+ } else {
+ offset = 0;
+ size = 4;
+ }
+
+ memcpy(target->nfcid1 + target->nfcid1_len, sdd_res->nfcid1 + offset,
+ size);
+ target->nfcid1_len += size;
+
+ rc = digital_in_send_sel_req(ddev, target, sdd_res);
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc) {
+ kfree(target);
+ digital_poll_next_tech(ddev);
+ }
+}
+
+static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target)
+{
+ int rc;
+ struct sk_buff *skb;
+ u8 sel_cmd;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_STANDARD);
+ if (rc)
+ return rc;
+
+ skb = digital_skb_alloc(ddev, 2);
+ if (!skb)
+ return -ENOMEM;
+
+ if (target->nfcid1_len == 0)
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL1;
+ else if (target->nfcid1_len == 3)
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL2;
+ else
+ sel_cmd = DIGITAL_CMD_SEL_REQ_CL3;
+
+ *skb_put(skb, sizeof(u8)) = sel_cmd;
+ *skb_put(skb, sizeof(u8)) = DIGITAL_SDD_REQ_SEL_PAR;
+
+ return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res,
+ target);
+}
+
+static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = NULL;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len < sizeof(u16)) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ target = kzalloc(sizeof(struct nfc_target), GFP_KERNEL);
+ if (!target) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ target->sens_res = __le16_to_cpu(*(__le16 *)resp->data);
+
+ if (!DIGITAL_SENS_RES_IS_VALID(target->sens_res)) {
+ PROTOCOL_ERR("4.6.3.3");
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ if (DIGITAL_SENS_RES_IS_T1T(target->sens_res))
+ rc = digital_target_found(ddev, target, NFC_PROTO_JEWEL);
+ else
+ rc = digital_in_send_sdd_req(ddev, target);
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc) {
+ kfree(target);
+ digital_poll_next_tech(ddev);
+ }
+}
+
+int digital_in_send_sens_req(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ struct sk_buff *skb;
+ int rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH,
+ NFC_DIGITAL_RF_TECH_106A);
+ if (rc)
+ return rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_SHORT);
+ if (rc)
+ return rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ *skb_put(skb, sizeof(u8)) = DIGITAL_CMD_SENS_REQ;
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sens_res, NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+int digital_in_recv_mifare_res(struct sk_buff *resp)
+{
+ /* Successful READ command response is 16 data bytes + 2 CRC bytes long.
+ * Since the driver can't differentiate a ACK/NACK response from a valid
+ * READ response, the CRC calculation must be handled at digital level
+ * even if the driver supports it for this technology.
+ */
+ if (resp->len == DIGITAL_MIFARE_READ_RES_LEN + DIGITAL_CRC_LEN) {
+ if (digital_skb_check_crc_a(resp)) {
+ PROTOCOL_ERR("9.4.1.2");
+ return -EIO;
+ }
+
+ return 0;
+ }
+
+ /* ACK response (i.e. successful WRITE). */
+ if (resp->len == 1 && resp->data[0] == DIGITAL_MIFARE_ACK_RES) {
+ resp->data[0] = 0;
+ return 0;
+ }
+
+ /* NACK and any other responses are treated as error. */
+ return -EIO;
+}
+
+static void digital_in_recv_attrib_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = arg;
+ struct digital_attrib_res *attrib_res;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len < sizeof(*attrib_res)) {
+ PROTOCOL_ERR("12.6.2");
+ rc = -EIO;
+ goto exit;
+ }
+
+ attrib_res = (struct digital_attrib_res *)resp->data;
+
+ if (attrib_res->mbli_did & 0x0f) {
+ PROTOCOL_ERR("12.6.2.1");
+ rc = -EIO;
+ goto exit;
+ }
+
+ rc = digital_target_found(ddev, target, NFC_PROTO_ISO14443_B);
+
+exit:
+ dev_kfree_skb(resp);
+ kfree(target);
+
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+static int digital_in_send_attrib_req(struct nfc_digital_dev *ddev,
+ struct nfc_target *target,
+ struct digital_sensb_res *sensb_res)
+{
+ struct digital_attrib_req *attrib_req;
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, sizeof(*attrib_req));
+ if (!skb)
+ return -ENOMEM;
+
+ attrib_req = (struct digital_attrib_req *)skb_put(skb,
+ sizeof(*attrib_req));
+
+ attrib_req->cmd = DIGITAL_CMD_ATTRIB_REQ;
+ memcpy(attrib_req->nfcid0, sensb_res->nfcid0,
+ sizeof(attrib_req->nfcid0));
+ attrib_req->param1 = DIGITAL_ATTRIB_P1_TR0_DEFAULT |
+ DIGITAL_ATTRIB_P1_TR1_DEFAULT;
+ attrib_req->param2 = DIGITAL_ATTRIB_P2_LISTEN_POLL_1 |
+ DIGITAL_ATTRIB_P2_POLL_LISTEN_1 |
+ DIGITAL_ATTRIB_P2_MAX_FRAME_256;
+ attrib_req->param3 = sensb_res->proto_info[1] & 0x07;
+ attrib_req->param4 = DIGITAL_ATTRIB_P4_DID(0);
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_attrib_res,
+ target);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_sensb_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct nfc_target *target = NULL;
+ struct digital_sensb_res *sensb_res;
+ u8 fsci;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len != sizeof(*sensb_res)) {
+ PROTOCOL_ERR("5.6.2.1");
+ rc = -EIO;
+ goto exit;
+ }
+
+ sensb_res = (struct digital_sensb_res *)resp->data;
+
+ if (sensb_res->cmd != DIGITAL_CMD_SENSB_RES) {
+ PROTOCOL_ERR("5.6.2");
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (!(sensb_res->proto_info[1] & BIT(0))) {
+ PROTOCOL_ERR("5.6.2.12");
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (sensb_res->proto_info[1] & BIT(3)) {
+ PROTOCOL_ERR("5.6.2.16");
+ rc = -EIO;
+ goto exit;
+ }
+
+ fsci = DIGITAL_SENSB_FSCI(sensb_res->proto_info[1]);
+ if (fsci >= 8)
+ ddev->target_fsc = DIGITAL_ATS_MAX_FSC;
+ else
+ ddev->target_fsc = digital_ats_fsc[fsci];
+
+ target = kzalloc(sizeof(struct nfc_target), GFP_KERNEL);
+ if (!target) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ rc = digital_in_send_attrib_req(ddev, target, sensb_res);
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc) {
+ kfree(target);
+ digital_poll_next_tech(ddev);
+ }
+}
+
+int digital_in_send_sensb_req(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ struct digital_sensb_req *sensb_req;
+ struct sk_buff *skb;
+ int rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH,
+ NFC_DIGITAL_RF_TECH_106B);
+ if (rc)
+ return rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCB);
+ if (rc)
+ return rc;
+
+ skb = digital_skb_alloc(ddev, sizeof(*sensb_req));
+ if (!skb)
+ return -ENOMEM;
+
+ sensb_req = (struct digital_sensb_req *)skb_put(skb,
+ sizeof(*sensb_req));
+
+ sensb_req->cmd = DIGITAL_CMD_SENSB_REQ;
+ sensb_req->afi = 0x00; /* All families and sub-families */
+ sensb_req->param = DIGITAL_SENSB_N(0);
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sensb_res,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_sensf_res(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ int rc;
+ u8 proto;
+ struct nfc_target target;
+ struct digital_sensf_res *sensf_res;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (resp->len < DIGITAL_SENSF_RES_MIN_LENGTH) {
+ rc = -EIO;
+ goto exit;
+ }
+
+ if (!DIGITAL_DRV_CAPS_IN_CRC(ddev)) {
+ rc = digital_skb_check_crc_f(resp);
+ if (rc) {
+ PROTOCOL_ERR("6.4.1.8");
+ goto exit;
+ }
+ }
+
+ skb_pull(resp, 1);
+
+ memset(&target, 0, sizeof(struct nfc_target));
+
+ sensf_res = (struct digital_sensf_res *)resp->data;
+
+ memcpy(target.sensf_res, sensf_res, resp->len);
+ target.sensf_res_len = resp->len;
+
+ memcpy(target.nfcid2, sensf_res->nfcid2, NFC_NFCID2_MAXSIZE);
+ target.nfcid2_len = NFC_NFCID2_MAXSIZE;
+
+ if (target.nfcid2[0] == DIGITAL_SENSF_NFCID2_NFC_DEP_B1 &&
+ target.nfcid2[1] == DIGITAL_SENSF_NFCID2_NFC_DEP_B2)
+ proto = NFC_PROTO_NFC_DEP;
+ else
+ proto = NFC_PROTO_FELICA;
+
+ rc = digital_target_found(ddev, &target, proto);
+
+exit:
+ dev_kfree_skb(resp);
+
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+int digital_in_send_sensf_req(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ struct digital_sensf_req *sensf_req;
+ struct sk_buff *skb;
+ int rc;
+ u8 size;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech);
+ if (rc)
+ return rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCF);
+ if (rc)
+ return rc;
+
+ size = sizeof(struct digital_sensf_req);
+
+ skb = digital_skb_alloc(ddev, size);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, size);
+
+ sensf_req = (struct digital_sensf_req *)skb->data;
+ sensf_req->cmd = DIGITAL_CMD_SENSF_REQ;
+ sensf_req->sc1 = 0xFF;
+ sensf_req->sc2 = 0xFF;
+ sensf_req->rc = 0;
+ sensf_req->tsn = 0;
+
+ *skb_push(skb, 1) = size + 1;
+
+ if (!DIGITAL_DRV_CAPS_IN_CRC(ddev))
+ digital_skb_add_crc_f(skb);
+
+ rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sensf_res,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_in_recv_iso15693_inv_res(struct nfc_digital_dev *ddev,
+ void *arg, struct sk_buff *resp)
+{
+ struct digital_iso15693_inv_res *res;
+ struct nfc_target *target = NULL;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto out_free_skb;
+ }
+
+ if (resp->len != sizeof(*res)) {
+ rc = -EIO;
+ goto out_free_skb;
+ }
+
+ res = (struct digital_iso15693_inv_res *)resp->data;
+
+ if (!DIGITAL_ISO15693_RES_IS_VALID(res->flags)) {
+ PROTOCOL_ERR("ISO15693 - 10.3.1");
+ rc = -EINVAL;
+ goto out_free_skb;
+ }
+
+ target = kzalloc(sizeof(*target), GFP_KERNEL);
+ if (!target) {
+ rc = -ENOMEM;
+ goto out_free_skb;
+ }
+
+ target->is_iso15693 = 1;
+ target->iso15693_dsfid = res->dsfid;
+ memcpy(target->iso15693_uid, &res->uid, sizeof(target->iso15693_uid));
+
+ rc = digital_target_found(ddev, target, NFC_PROTO_ISO15693);
+
+ kfree(target);
+
+out_free_skb:
+ dev_kfree_skb(resp);
+
+ if (rc)
+ digital_poll_next_tech(ddev);
+}
+
+int digital_in_send_iso15693_inv_req(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ struct digital_iso15693_inv_req *req;
+ struct sk_buff *skb;
+ int rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH,
+ NFC_DIGITAL_RF_TECH_ISO15693);
+ if (rc)
+ return rc;
+
+ rc = digital_in_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_ISO15693_INVENTORY);
+ if (rc)
+ return rc;
+
+ skb = digital_skb_alloc(ddev, sizeof(*req));
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(*req) - sizeof(req->mask)); /* No mask */
+ req = (struct digital_iso15693_inv_req *)skb->data;
+
+ /* Single sub-carrier, high data rate, no AFI, single slot
+ * Inventory command
+ */
+ req->flags = DIGITAL_ISO15693_REQ_FLAG_DATA_RATE |
+ DIGITAL_ISO15693_REQ_FLAG_INVENTORY |
+ DIGITAL_ISO15693_REQ_FLAG_NB_SLOTS;
+ req->cmd = DIGITAL_CMD_ISO15693_INVENTORY_REQ;
+ req->mask_len = 0;
+
+ rc = digital_in_send_cmd(ddev, skb, 30,
+ digital_in_recv_iso15693_inv_res, NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static int digital_tg_send_sel_res(struct nfc_digital_dev *ddev)
+{
+ struct sk_buff *skb;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 1);
+ if (!skb)
+ return -ENOMEM;
+
+ *skb_put(skb, 1) = DIGITAL_SEL_RES_NFC_DEP;
+
+ if (!DIGITAL_DRV_CAPS_TG_CRC(ddev))
+ digital_skb_add_crc_a(skb);
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_ANTICOL_COMPLETE);
+ if (rc) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_atr_req,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_tg_recv_sel_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (!DIGITAL_DRV_CAPS_TG_CRC(ddev)) {
+ rc = digital_skb_check_crc_a(resp);
+ if (rc) {
+ PROTOCOL_ERR("4.4.1.3");
+ goto exit;
+ }
+ }
+
+ /* Silently ignore SEL_REQ content and send a SEL_RES for NFC-DEP */
+
+ rc = digital_tg_send_sel_res(ddev);
+
+exit:
+ if (rc)
+ digital_poll_next_tech(ddev);
+
+ dev_kfree_skb(resp);
+}
+
+static int digital_tg_send_sdd_res(struct nfc_digital_dev *ddev)
+{
+ struct sk_buff *skb;
+ struct digital_sdd_res *sdd_res;
+ int rc, i;
+
+ skb = digital_skb_alloc(ddev, sizeof(struct digital_sdd_res));
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, sizeof(struct digital_sdd_res));
+ sdd_res = (struct digital_sdd_res *)skb->data;
+
+ sdd_res->nfcid1[0] = 0x08;
+ get_random_bytes(sdd_res->nfcid1 + 1, 3);
+
+ sdd_res->bcc = 0;
+ for (i = 0; i < 4; i++)
+ sdd_res->bcc ^= sdd_res->nfcid1[i];
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_STANDARD_WITH_CRC_A);
+ if (rc) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_sel_req,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+static void digital_tg_recv_sdd_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ u8 *sdd_req;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ sdd_req = resp->data;
+
+ if (resp->len < 2 || sdd_req[0] != DIGITAL_CMD_SEL_REQ_CL1 ||
+ sdd_req[1] != DIGITAL_SDD_REQ_SEL_PAR) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = digital_tg_send_sdd_res(ddev);
+
+exit:
+ if (rc)
+ digital_poll_next_tech(ddev);
+
+ dev_kfree_skb(resp);
+}
+
+static int digital_tg_send_sens_res(struct nfc_digital_dev *ddev)
+{
+ struct sk_buff *skb;
+ u8 *sens_res;
+ int rc;
+
+ skb = digital_skb_alloc(ddev, 2);
+ if (!skb)
+ return -ENOMEM;
+
+ sens_res = skb_put(skb, 2);
+
+ sens_res[0] = (DIGITAL_SENS_RES_NFC_DEP >> 8) & 0xFF;
+ sens_res[1] = DIGITAL_SENS_RES_NFC_DEP & 0xFF;
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_STANDARD);
+ if (rc) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ rc = digital_tg_send_cmd(ddev, skb, 300, digital_tg_recv_sdd_req,
+ NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+void digital_tg_recv_sens_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ u8 sens_req;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ sens_req = resp->data[0];
+
+ if (!resp->len || (sens_req != DIGITAL_CMD_SENS_REQ &&
+ sens_req != DIGITAL_CMD_ALL_REQ)) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = digital_tg_send_sens_res(ddev);
+
+exit:
+ if (rc)
+ digital_poll_next_tech(ddev);
+
+ dev_kfree_skb(resp);
+}
+
+static void digital_tg_recv_atr_or_sensf_req(struct nfc_digital_dev *ddev,
+ void *arg, struct sk_buff *resp)
+{
+ if (!IS_ERR(resp) && (resp->len >= 2) &&
+ (resp->data[1] == DIGITAL_CMD_SENSF_REQ))
+ digital_tg_recv_sensf_req(ddev, arg, resp);
+ else
+ digital_tg_recv_atr_req(ddev, arg, resp);
+
+ return;
+}
+
+static int digital_tg_send_sensf_res(struct nfc_digital_dev *ddev,
+ struct digital_sensf_req *sensf_req)
+{
+ struct sk_buff *skb;
+ u8 size;
+ int rc;
+ struct digital_sensf_res *sensf_res;
+
+ size = sizeof(struct digital_sensf_res);
+
+ if (sensf_req->rc == DIGITAL_SENSF_REQ_RC_NONE)
+ size -= sizeof(sensf_res->rd);
+
+ skb = digital_skb_alloc(ddev, size);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_put(skb, size);
+
+ sensf_res = (struct digital_sensf_res *)skb->data;
+
+ memset(sensf_res, 0, size);
+
+ sensf_res->cmd = DIGITAL_CMD_SENSF_RES;
+ sensf_res->nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1;
+ sensf_res->nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2;
+ get_random_bytes(&sensf_res->nfcid2[2], 6);
+
+ switch (sensf_req->rc) {
+ case DIGITAL_SENSF_REQ_RC_SC:
+ sensf_res->rd[0] = sensf_req->sc1;
+ sensf_res->rd[1] = sensf_req->sc2;
+ break;
+ case DIGITAL_SENSF_REQ_RC_AP:
+ sensf_res->rd[0] = DIGITAL_SENSF_RES_RD_AP_B1;
+ sensf_res->rd[1] = DIGITAL_SENSF_RES_RD_AP_B2;
+ break;
+ }
+
+ *skb_push(skb, sizeof(u8)) = size + 1;
+
+ if (!DIGITAL_DRV_CAPS_TG_CRC(ddev))
+ digital_skb_add_crc_f(skb);
+
+ rc = digital_tg_send_cmd(ddev, skb, 300,
+ digital_tg_recv_atr_or_sensf_req, NULL);
+ if (rc)
+ kfree_skb(skb);
+
+ return rc;
+}
+
+void digital_tg_recv_sensf_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ struct digital_sensf_req *sensf_req;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ rc = PTR_ERR(resp);
+ resp = NULL;
+ goto exit;
+ }
+
+ if (!DIGITAL_DRV_CAPS_TG_CRC(ddev)) {
+ rc = digital_skb_check_crc_f(resp);
+ if (rc) {
+ PROTOCOL_ERR("6.4.1.8");
+ goto exit;
+ }
+ }
+
+ if (resp->len != sizeof(struct digital_sensf_req) + 1) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ skb_pull(resp, 1);
+ sensf_req = (struct digital_sensf_req *)resp->data;
+
+ if (sensf_req->cmd != DIGITAL_CMD_SENSF_REQ) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = digital_tg_send_sensf_res(ddev, sensf_req);
+
+exit:
+ if (rc)
+ digital_poll_next_tech(ddev);
+
+ dev_kfree_skb(resp);
+}
+
+static int digital_tg_config_nfca(struct nfc_digital_dev *ddev)
+{
+ int rc;
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH,
+ NFC_DIGITAL_RF_TECH_106A);
+ if (rc)
+ return rc;
+
+ return digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCA_NFC_DEP);
+}
+
+int digital_tg_listen_nfca(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ int rc;
+
+ rc = digital_tg_config_nfca(ddev);
+ if (rc)
+ return rc;
+
+ return digital_tg_listen(ddev, 300, digital_tg_recv_sens_req, NULL);
+}
+
+static int digital_tg_config_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ int rc;
+
+ rc = digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_RF_TECH, rf_tech);
+ if (rc)
+ return rc;
+
+ return digital_tg_configure_hw(ddev, NFC_DIGITAL_CONFIG_FRAMING,
+ NFC_DIGITAL_FRAMING_NFCF_NFC_DEP);
+}
+
+int digital_tg_listen_nfcf(struct nfc_digital_dev *ddev, u8 rf_tech)
+{
+ int rc;
+ u8 *nfcid2;
+
+ rc = digital_tg_config_nfcf(ddev, rf_tech);
+ if (rc)
+ return rc;
+
+ nfcid2 = kzalloc(NFC_NFCID2_MAXSIZE, GFP_KERNEL);
+ if (!nfcid2)
+ return -ENOMEM;
+
+ nfcid2[0] = DIGITAL_SENSF_NFCID2_NFC_DEP_B1;
+ nfcid2[1] = DIGITAL_SENSF_NFCID2_NFC_DEP_B2;
+ get_random_bytes(nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2);
+
+ return digital_tg_listen(ddev, 300, digital_tg_recv_sensf_req, nfcid2);
+}
+
+void digital_tg_recv_md_req(struct nfc_digital_dev *ddev, void *arg,
+ struct sk_buff *resp)
+{
+ u8 rf_tech;
+ int rc;
+
+ if (IS_ERR(resp)) {
+ resp = NULL;
+ goto exit_free_skb;
+ }
+
+ rc = ddev->ops->tg_get_rf_tech(ddev, &rf_tech);
+ if (rc)
+ goto exit_free_skb;
+
+ switch (rf_tech) {
+ case NFC_DIGITAL_RF_TECH_106A:
+ rc = digital_tg_config_nfca(ddev);
+ if (rc)
+ goto exit_free_skb;
+ digital_tg_recv_sens_req(ddev, arg, resp);
+ break;
+ case NFC_DIGITAL_RF_TECH_212F:
+ case NFC_DIGITAL_RF_TECH_424F:
+ rc = digital_tg_config_nfcf(ddev, rf_tech);
+ if (rc)
+ goto exit_free_skb;
+ digital_tg_recv_sensf_req(ddev, arg, resp);
+ break;
+ default:
+ goto exit_free_skb;
+ }
+
+ return;
+
+exit_free_skb:
+ digital_poll_next_tech(ddev);
+ dev_kfree_skb(resp);
+}
diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig
new file mode 100644
index 000000000..fd67f51d1
--- /dev/null
+++ b/net/nfc/hci/Kconfig
@@ -0,0 +1,17 @@
+config NFC_HCI
+ depends on NFC
+ tristate "NFC HCI implementation"
+ default n
+ help
+ Say Y here if you want to build support for a kernel NFC HCI
+ implementation. This is mostly needed for devices that only process
+ HCI frames, like for example the NXP pn544.
+
+config NFC_SHDLC
+ depends on NFC_HCI
+ select CRC_CCITT
+ bool "SHDLC link layer for HCI based NFC drivers"
+ default n
+ ---help---
+ Say yes if you use an NFC HCI driver that requires SHDLC link layer.
+ If unsure, say N here.
diff --git a/net/nfc/hci/Makefile b/net/nfc/hci/Makefile
new file mode 100644
index 000000000..c5dbb6891
--- /dev/null
+++ b/net/nfc/hci/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux NFC HCI layer.
+#
+
+obj-$(CONFIG_NFC_HCI) += hci.o
+
+hci-y := core.o hcp.o command.o llc.o llc_nop.o
+hci-$(CONFIG_NFC_SHDLC) += llc_shdlc.o
diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c
new file mode 100644
index 000000000..844673cb7
--- /dev/null
+++ b/net/nfc/hci/command.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "hci: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <net/nfc/hci.h>
+
+#include "hci.h"
+
+#define MAX_FWI 4949
+
+static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
+ const u8 *param, size_t param_len,
+ data_exchange_cb_t cb, void *cb_context)
+{
+ pr_debug("exec cmd async through pipe=%d, cmd=%d, plen=%zd\n", pipe,
+ cmd, param_len);
+
+ /* TODO: Define hci cmd execution delay. Should it be the same
+ * for all commands?
+ */
+ return nfc_hci_hcp_message_tx(hdev, pipe, NFC_HCI_HCP_COMMAND, cmd,
+ param, param_len, cb, cb_context, MAX_FWI);
+}
+
+/*
+ * HCI command execution completion callback.
+ * err will be a standard linux error (may be converted from HCI response)
+ * skb contains the response data and must be disposed, or may be NULL if
+ * an error occured
+ */
+static void nfc_hci_execute_cb(void *context, struct sk_buff *skb, int err)
+{
+ struct hcp_exec_waiter *hcp_ew = (struct hcp_exec_waiter *)context;
+
+ pr_debug("HCI Cmd completed with result=%d\n", err);
+
+ hcp_ew->exec_result = err;
+ if (hcp_ew->exec_result == 0)
+ hcp_ew->result_skb = skb;
+ else
+ kfree_skb(skb);
+ hcp_ew->exec_complete = true;
+
+ wake_up(hcp_ew->wq);
+}
+
+static int nfc_hci_execute_cmd(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
+ const u8 *param, size_t param_len,
+ struct sk_buff **skb)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(ew_wq);
+ struct hcp_exec_waiter hcp_ew;
+ hcp_ew.wq = &ew_wq;
+ hcp_ew.exec_complete = false;
+ hcp_ew.result_skb = NULL;
+
+ pr_debug("exec cmd sync through pipe=%d, cmd=%d, plen=%zd\n", pipe,
+ cmd, param_len);
+
+ /* TODO: Define hci cmd execution delay. Should it be the same
+ * for all commands?
+ */
+ hcp_ew.exec_result = nfc_hci_hcp_message_tx(hdev, pipe,
+ NFC_HCI_HCP_COMMAND, cmd,
+ param, param_len,
+ nfc_hci_execute_cb, &hcp_ew,
+ MAX_FWI);
+ if (hcp_ew.exec_result < 0)
+ return hcp_ew.exec_result;
+
+ wait_event(ew_wq, hcp_ew.exec_complete == true);
+
+ if (hcp_ew.exec_result == 0) {
+ if (skb)
+ *skb = hcp_ew.result_skb;
+ else
+ kfree_skb(hcp_ew.result_skb);
+ }
+
+ return hcp_ew.exec_result;
+}
+
+int nfc_hci_send_event(struct nfc_hci_dev *hdev, u8 gate, u8 event,
+ const u8 *param, size_t param_len)
+{
+ u8 pipe;
+
+ pr_debug("%d to gate %d\n", event, gate);
+
+ pipe = hdev->gate2pipe[gate];
+ if (pipe == NFC_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ return nfc_hci_hcp_message_tx(hdev, pipe, NFC_HCI_HCP_EVENT, event,
+ param, param_len, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(nfc_hci_send_event);
+
+/*
+ * Execute an hci command sent to gate.
+ * skb will contain response data if success. skb can be NULL if you are not
+ * interested by the response.
+ */
+int nfc_hci_send_cmd(struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
+ const u8 *param, size_t param_len, struct sk_buff **skb)
+{
+ u8 pipe;
+
+ pr_debug("\n");
+
+ pipe = hdev->gate2pipe[gate];
+ if (pipe == NFC_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ return nfc_hci_execute_cmd(hdev, pipe, cmd, param, param_len, skb);
+}
+EXPORT_SYMBOL(nfc_hci_send_cmd);
+
+int nfc_hci_send_cmd_async(struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
+ const u8 *param, size_t param_len,
+ data_exchange_cb_t cb, void *cb_context)
+{
+ u8 pipe;
+
+ pr_debug("\n");
+
+ pipe = hdev->gate2pipe[gate];
+ if (pipe == NFC_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ return nfc_hci_execute_cmd_async(hdev, pipe, cmd, param, param_len,
+ cb, cb_context);
+}
+EXPORT_SYMBOL(nfc_hci_send_cmd_async);
+
+int nfc_hci_set_param(struct nfc_hci_dev *hdev, u8 gate, u8 idx,
+ const u8 *param, size_t param_len)
+{
+ int r;
+ u8 *tmp;
+
+ /* TODO ELa: reg idx must be inserted before param, but we don't want
+ * to ask the caller to do it to keep a simpler API.
+ * For now, just create a new temporary param buffer. This is far from
+ * optimal though, and the plan is to modify APIs to pass idx down to
+ * nfc_hci_hcp_message_tx where the frame is actually built, thereby
+ * eliminating the need for the temp allocation-copy here.
+ */
+
+ pr_debug("idx=%d to gate %d\n", idx, gate);
+
+ tmp = kmalloc(1 + param_len, GFP_KERNEL);
+ if (tmp == NULL)
+ return -ENOMEM;
+
+ *tmp = idx;
+ memcpy(tmp + 1, param, param_len);
+
+ r = nfc_hci_send_cmd(hdev, gate, NFC_HCI_ANY_SET_PARAMETER,
+ tmp, param_len + 1, NULL);
+
+ kfree(tmp);
+
+ return r;
+}
+EXPORT_SYMBOL(nfc_hci_set_param);
+
+int nfc_hci_get_param(struct nfc_hci_dev *hdev, u8 gate, u8 idx,
+ struct sk_buff **skb)
+{
+ pr_debug("gate=%d regidx=%d\n", gate, idx);
+
+ return nfc_hci_send_cmd(hdev, gate, NFC_HCI_ANY_GET_PARAMETER,
+ &idx, 1, skb);
+}
+EXPORT_SYMBOL(nfc_hci_get_param);
+
+static int nfc_hci_open_pipe(struct nfc_hci_dev *hdev, u8 pipe)
+{
+ struct sk_buff *skb;
+ int r;
+
+ pr_debug("pipe=%d\n", pipe);
+
+ r = nfc_hci_execute_cmd(hdev, pipe, NFC_HCI_ANY_OPEN_PIPE,
+ NULL, 0, &skb);
+ if (r == 0) {
+ /* dest host other than host controller will send
+ * number of pipes already open on this gate before
+ * execution. The number can be found in skb->data[0]
+ */
+ kfree_skb(skb);
+ }
+
+ return r;
+}
+
+static int nfc_hci_close_pipe(struct nfc_hci_dev *hdev, u8 pipe)
+{
+ pr_debug("\n");
+
+ return nfc_hci_execute_cmd(hdev, pipe, NFC_HCI_ANY_CLOSE_PIPE,
+ NULL, 0, NULL);
+}
+
+static u8 nfc_hci_create_pipe(struct nfc_hci_dev *hdev, u8 dest_host,
+ u8 dest_gate, int *result)
+{
+ struct sk_buff *skb;
+ struct hci_create_pipe_params params;
+ struct hci_create_pipe_resp *resp;
+ u8 pipe;
+
+ pr_debug("gate=%d\n", dest_gate);
+
+ params.src_gate = NFC_HCI_ADMIN_GATE;
+ params.dest_host = dest_host;
+ params.dest_gate = dest_gate;
+
+ *result = nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE,
+ NFC_HCI_ADM_CREATE_PIPE,
+ (u8 *) &params, sizeof(params), &skb);
+ if (*result < 0)
+ return NFC_HCI_INVALID_PIPE;
+
+ resp = (struct hci_create_pipe_resp *)skb->data;
+ pipe = resp->pipe;
+ kfree_skb(skb);
+
+ pr_debug("pipe created=%d\n", pipe);
+
+ return pipe;
+}
+
+static int nfc_hci_delete_pipe(struct nfc_hci_dev *hdev, u8 pipe)
+{
+ pr_debug("\n");
+
+ return nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE,
+ NFC_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL);
+}
+
+static int nfc_hci_clear_all_pipes(struct nfc_hci_dev *hdev)
+{
+ u8 param[2];
+ size_t param_len = 2;
+
+ /* TODO: Find out what the identity reference data is
+ * and fill param with it. HCI spec 6.1.3.5 */
+
+ pr_debug("\n");
+
+ if (test_bit(NFC_HCI_QUIRK_SHORT_CLEAR, &hdev->quirks))
+ param_len = 0;
+
+ return nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE,
+ NFC_HCI_ADM_CLEAR_ALL_PIPE, param, param_len,
+ NULL);
+}
+
+int nfc_hci_disconnect_gate(struct nfc_hci_dev *hdev, u8 gate)
+{
+ int r;
+ u8 pipe = hdev->gate2pipe[gate];
+
+ pr_debug("\n");
+
+ if (pipe == NFC_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ r = nfc_hci_close_pipe(hdev, pipe);
+ if (r < 0)
+ return r;
+
+ if (pipe != NFC_HCI_LINK_MGMT_PIPE && pipe != NFC_HCI_ADMIN_PIPE) {
+ r = nfc_hci_delete_pipe(hdev, pipe);
+ if (r < 0)
+ return r;
+ }
+
+ hdev->gate2pipe[gate] = NFC_HCI_INVALID_PIPE;
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_hci_disconnect_gate);
+
+int nfc_hci_disconnect_all_gates(struct nfc_hci_dev *hdev)
+{
+ int r;
+
+ pr_debug("\n");
+
+ r = nfc_hci_clear_all_pipes(hdev);
+ if (r < 0)
+ return r;
+
+ nfc_hci_reset_pipes(hdev);
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_hci_disconnect_all_gates);
+
+int nfc_hci_connect_gate(struct nfc_hci_dev *hdev, u8 dest_host, u8 dest_gate,
+ u8 pipe)
+{
+ bool pipe_created = false;
+ int r;
+
+ pr_debug("\n");
+
+ if (pipe == NFC_HCI_DO_NOT_CREATE_PIPE)
+ return 0;
+
+ if (hdev->gate2pipe[dest_gate] != NFC_HCI_INVALID_PIPE)
+ return -EADDRINUSE;
+
+ if (pipe != NFC_HCI_INVALID_PIPE)
+ goto open_pipe;
+
+ switch (dest_gate) {
+ case NFC_HCI_LINK_MGMT_GATE:
+ pipe = NFC_HCI_LINK_MGMT_PIPE;
+ break;
+ case NFC_HCI_ADMIN_GATE:
+ pipe = NFC_HCI_ADMIN_PIPE;
+ break;
+ default:
+ pipe = nfc_hci_create_pipe(hdev, dest_host, dest_gate, &r);
+ if (pipe == NFC_HCI_INVALID_PIPE)
+ return r;
+ pipe_created = true;
+ break;
+ }
+
+open_pipe:
+ r = nfc_hci_open_pipe(hdev, pipe);
+ if (r < 0) {
+ if (pipe_created)
+ if (nfc_hci_delete_pipe(hdev, pipe) < 0) {
+ /* TODO: Cannot clean by deleting pipe...
+ * -> inconsistent state */
+ }
+ return r;
+ }
+
+ hdev->pipes[pipe].gate = dest_gate;
+ hdev->pipes[pipe].dest_host = dest_host;
+ hdev->gate2pipe[dest_gate] = pipe;
+
+ return 0;
+}
+EXPORT_SYMBOL(nfc_hci_connect_gate);
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
new file mode 100644
index 000000000..6e061da22
--- /dev/null
+++ b/net/nfc/hci/core.c
@@ -0,0 +1,1095 @@
+/*
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "hci: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/nfc.h>
+
+#include <net/nfc/nfc.h>
+#include <net/nfc/hci.h>
+#include <net/nfc/llc.h>
+
+#include "hci.h"
+
+/* Largest headroom needed for outgoing HCI commands */
+#define HCI_CMDS_HEADROOM 1
+
+int nfc_hci_result_to_errno(u8 result)
+{
+ switch (result) {
+ case NFC_HCI_ANY_OK:
+ return 0;
+ case NFC_HCI_ANY_E_REG_PAR_UNKNOWN:
+ return -EOPNOTSUPP;
+ case NFC_HCI_ANY_E_TIMEOUT:
+ return -ETIME;
+ default:
+ return -1;
+ }
+}
+EXPORT_SYMBOL(nfc_hci_result_to_errno);
+
+void nfc_hci_reset_pipes(struct nfc_hci_dev *hdev)
+{
+ int i = 0;
+
+ for (i = 0; i < NFC_HCI_MAX_PIPES; i++) {
+ hdev->pipes[i].gate = NFC_HCI_INVALID_GATE;
+ hdev->pipes[i].dest_host = NFC_HCI_INVALID_HOST;
+ }
+ memset(hdev->gate2pipe, NFC_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe));
+}
+EXPORT_SYMBOL(nfc_hci_reset_pipes);
+
+void nfc_hci_reset_pipes_per_host(struct nfc_hci_dev *hdev, u8 host)
+{
+ int i = 0;
+
+ for (i = 0; i < NFC_HCI_MAX_PIPES; i++) {
+ if (hdev->pipes[i].dest_host != host)
+ continue;
+
+ hdev->pipes[i].gate = NFC_HCI_INVALID_GATE;
+ hdev->pipes[i].dest_host = NFC_HCI_INVALID_HOST;
+ }
+}
+EXPORT_SYMBOL(nfc_hci_reset_pipes_per_host);
+
+static void nfc_hci_msg_tx_work(struct work_struct *work)
+{
+ struct nfc_hci_dev *hdev = container_of(work, struct nfc_hci_dev,
+ msg_tx_work);
+ struct hci_msg *msg;
+ struct sk_buff *skb;
+ int r = 0;
+
+ mutex_lock(&hdev->msg_tx_mutex);
+ if (hdev->shutting_down)
+ goto exit;
+
+ if (hdev->cmd_pending_msg) {
+ if (timer_pending(&hdev->cmd_timer) == 0) {
+ if (hdev->cmd_pending_msg->cb)
+ hdev->cmd_pending_msg->cb(hdev->
+ cmd_pending_msg->
+ cb_context,
+ NULL,
+ -ETIME);
+ kfree(hdev->cmd_pending_msg);
+ hdev->cmd_pending_msg = NULL;
+ } else {
+ goto exit;
+ }
+ }
+
+next_msg:
+ if (list_empty(&hdev->msg_tx_queue))
+ goto exit;
+
+ msg = list_first_entry(&hdev->msg_tx_queue, struct hci_msg, msg_l);
+ list_del(&msg->msg_l);
+
+ pr_debug("msg_tx_queue has a cmd to send\n");
+ while ((skb = skb_dequeue(&msg->msg_frags)) != NULL) {
+ r = nfc_llc_xmit_from_hci(hdev->llc, skb);
+ if (r < 0) {
+ kfree_skb(skb);
+ skb_queue_purge(&msg->msg_frags);
+ if (msg->cb)
+ msg->cb(msg->cb_context, NULL, r);
+ kfree(msg);
+ break;
+ }
+ }
+
+ if (r)
+ goto next_msg;
+
+ if (msg->wait_response == false) {
+ kfree(msg);
+ goto next_msg;
+ }
+
+ hdev->cmd_pending_msg = msg;
+ mod_timer(&hdev->cmd_timer, jiffies +
+ msecs_to_jiffies(hdev->cmd_pending_msg->completion_delay));
+
+exit:
+ mutex_unlock(&hdev->msg_tx_mutex);
+}
+
+static void nfc_hci_msg_rx_work(struct work_struct *work)
+{
+ struct nfc_hci_dev *hdev = container_of(work, struct nfc_hci_dev,
+ msg_rx_work);
+ struct sk_buff *skb;
+ struct hcp_message *message;
+ u8 pipe;
+ u8 type;
+ u8 instruction;
+
+ while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) {
+ pipe = skb->data[0];
+ skb_pull(skb, NFC_HCI_HCP_PACKET_HEADER_LEN);
+ message = (struct hcp_message *)skb->data;
+ type = HCP_MSG_GET_TYPE(message->header);
+ instruction = HCP_MSG_GET_CMD(message->header);
+ skb_pull(skb, NFC_HCI_HCP_MESSAGE_HEADER_LEN);
+
+ nfc_hci_hcp_message_rx(hdev, pipe, type, instruction, skb);
+ }
+}
+
+static void __nfc_hci_cmd_completion(struct nfc_hci_dev *hdev, int err,
+ struct sk_buff *skb)
+{
+ del_timer_sync(&hdev->cmd_timer);
+
+ if (hdev->cmd_pending_msg->cb)
+ hdev->cmd_pending_msg->cb(hdev->cmd_pending_msg->cb_context,
+ skb, err);
+ else
+ kfree_skb(skb);
+
+ kfree(hdev->cmd_pending_msg);
+ hdev->cmd_pending_msg = NULL;
+
+ schedule_work(&hdev->msg_tx_work);
+}
+
+void nfc_hci_resp_received(struct nfc_hci_dev *hdev, u8 result,
+ struct sk_buff *skb)
+{
+ mutex_lock(&hdev->msg_tx_mutex);
+
+ if (hdev->cmd_pending_msg == NULL) {
+ kfree_skb(skb);
+ goto exit;
+ }
+
+ __nfc_hci_cmd_completion(hdev, nfc_hci_result_to_errno(result), skb);
+
+exit:
+ mutex_unlock(&hdev->msg_tx_mutex);
+}
+
+void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
+ struct sk_buff *skb)
+{
+ u8 gate = hdev->pipes[pipe].gate;
+ u8 status = NFC_HCI_ANY_OK;
+ struct hci_create_pipe_resp *create_info;
+ struct hci_delete_pipe_noti *delete_info;
+ struct hci_all_pipe_cleared_noti *cleared_info;
+
+ pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd);
+
+ switch (cmd) {
+ case NFC_HCI_ADM_NOTIFY_PIPE_CREATED:
+ if (skb->len != 5) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ create_info = (struct hci_create_pipe_resp *)skb->data;
+
+ /* Save the new created pipe and bind with local gate,
+ * the description for skb->data[3] is destination gate id
+ * but since we received this cmd from host controller, we
+ * are the destination and it is our local gate
+ */
+ hdev->gate2pipe[create_info->dest_gate] = create_info->pipe;
+ hdev->pipes[create_info->pipe].gate = create_info->dest_gate;
+ hdev->pipes[create_info->pipe].dest_host =
+ create_info->src_host;
+ break;
+ case NFC_HCI_ANY_OPEN_PIPE:
+ if (gate == NFC_HCI_INVALID_GATE) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ break;
+ case NFC_HCI_ADM_NOTIFY_PIPE_DELETED:
+ if (skb->len != 1) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ delete_info = (struct hci_delete_pipe_noti *)skb->data;
+
+ hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE;
+ hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST;
+ break;
+ case NFC_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED:
+ if (skb->len != 1) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ cleared_info = (struct hci_all_pipe_cleared_noti *)skb->data;
+
+ nfc_hci_reset_pipes_per_host(hdev, cleared_info->host);
+ break;
+ default:
+ pr_info("Discarded unknown cmd %x to gate %x\n", cmd, gate);
+ break;
+ }
+
+ if (hdev->ops->cmd_received)
+ hdev->ops->cmd_received(hdev, pipe, cmd, skb);
+
+exit:
+ nfc_hci_hcp_message_tx(hdev, pipe, NFC_HCI_HCP_RESPONSE,
+ status, NULL, 0, NULL, NULL, 0);
+
+ kfree_skb(skb);
+}
+
+u32 nfc_hci_sak_to_protocol(u8 sak)
+{
+ switch (NFC_HCI_TYPE_A_SEL_PROT(sak)) {
+ case NFC_HCI_TYPE_A_SEL_PROT_MIFARE:
+ return NFC_PROTO_MIFARE_MASK;
+ case NFC_HCI_TYPE_A_SEL_PROT_ISO14443:
+ return NFC_PROTO_ISO14443_MASK;
+ case NFC_HCI_TYPE_A_SEL_PROT_DEP:
+ return NFC_PROTO_NFC_DEP_MASK;
+ case NFC_HCI_TYPE_A_SEL_PROT_ISO14443_DEP:
+ return NFC_PROTO_ISO14443_MASK | NFC_PROTO_NFC_DEP_MASK;
+ default:
+ return 0xffffffff;
+ }
+}
+EXPORT_SYMBOL(nfc_hci_sak_to_protocol);
+
+int nfc_hci_target_discovered(struct nfc_hci_dev *hdev, u8 gate)
+{
+ struct nfc_target *targets;
+ struct sk_buff *atqa_skb = NULL;
+ struct sk_buff *sak_skb = NULL;
+ struct sk_buff *uid_skb = NULL;
+ int r;
+
+ pr_debug("from gate %d\n", gate);
+
+ targets = kzalloc(sizeof(struct nfc_target), GFP_KERNEL);
+ if (targets == NULL)
+ return -ENOMEM;
+
+ switch (gate) {
+ case NFC_HCI_RF_READER_A_GATE:
+ r = nfc_hci_get_param(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_RF_READER_A_ATQA, &atqa_skb);
+ if (r < 0)
+ goto exit;
+
+ r = nfc_hci_get_param(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_RF_READER_A_SAK, &sak_skb);
+ if (r < 0)
+ goto exit;
+
+ if (atqa_skb->len != 2 || sak_skb->len != 1) {
+ r = -EPROTO;
+ goto exit;
+ }
+
+ targets->supported_protocols =
+ nfc_hci_sak_to_protocol(sak_skb->data[0]);
+ if (targets->supported_protocols == 0xffffffff) {
+ r = -EPROTO;
+ goto exit;
+ }
+
+ targets->sens_res = be16_to_cpu(*(__be16 *)atqa_skb->data);
+ targets->sel_res = sak_skb->data[0];
+
+ r = nfc_hci_get_param(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_RF_READER_A_UID, &uid_skb);
+ if (r < 0)
+ goto exit;
+
+ if (uid_skb->len == 0 || uid_skb->len > NFC_NFCID1_MAXSIZE) {
+ r = -EPROTO;
+ goto exit;
+ }
+
+ memcpy(targets->nfcid1, uid_skb->data, uid_skb->len);
+ targets->nfcid1_len = uid_skb->len;
+
+ if (hdev->ops->complete_target_discovered) {
+ r = hdev->ops->complete_target_discovered(hdev, gate,
+ targets);
+ if (r < 0)
+ goto exit;
+ }
+ break;
+ case NFC_HCI_RF_READER_B_GATE:
+ targets->supported_protocols = NFC_PROTO_ISO14443_B_MASK;
+ break;
+ default:
+ if (hdev->ops->target_from_gate)
+ r = hdev->ops->target_from_gate(hdev, gate, targets);
+ else
+ r = -EPROTO;
+ if (r < 0)
+ goto exit;
+
+ if (hdev->ops->complete_target_discovered) {
+ r = hdev->ops->complete_target_discovered(hdev, gate,
+ targets);
+ if (r < 0)
+ goto exit;
+ }
+ break;
+ }
+
+ /* if driver set the new gate, we will skip the old one */
+ if (targets->hci_reader_gate == 0x00)
+ targets->hci_reader_gate = gate;
+
+ r = nfc_targets_found(hdev->ndev, targets, 1);
+
+exit:
+ kfree(targets);
+ kfree_skb(atqa_skb);
+ kfree_skb(sak_skb);
+ kfree_skb(uid_skb);
+
+ return r;
+}
+EXPORT_SYMBOL(nfc_hci_target_discovered);
+
+void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event,
+ struct sk_buff *skb)
+{
+ int r = 0;
+ u8 gate = hdev->pipes[pipe].gate;
+
+ if (gate == NFC_HCI_INVALID_GATE) {
+ pr_err("Discarded event %x to unopened pipe %x\n", event, pipe);
+ goto exit;
+ }
+
+ if (hdev->ops->event_received) {
+ r = hdev->ops->event_received(hdev, pipe, event, skb);
+ if (r <= 0)
+ goto exit_noskb;
+ }
+
+ switch (event) {
+ case NFC_HCI_EVT_TARGET_DISCOVERED:
+ if (skb->len < 1) { /* no status data? */
+ r = -EPROTO;
+ goto exit;
+ }
+
+ if (skb->data[0] == 3) {
+ /* TODO: Multiple targets in field, none activated
+ * poll is supposedly stopped, but there is no
+ * single target to activate, so nothing to report
+ * up.
+ * if we need to restart poll, we must save the
+ * protocols from the initial poll and reuse here.
+ */
+ }
+
+ if (skb->data[0] != 0) {
+ r = -EPROTO;
+ goto exit;
+ }
+
+ r = nfc_hci_target_discovered(hdev, gate);
+ break;
+ default:
+ pr_info("Discarded unknown event %x to gate %x\n", event, gate);
+ r = -EINVAL;
+ break;
+ }
+
+exit:
+ kfree_skb(skb);
+
+exit_noskb:
+ if (r)
+ nfc_hci_driver_failure(hdev, r);
+}
+
+static void nfc_hci_cmd_timeout(unsigned long data)
+{
+ struct nfc_hci_dev *hdev = (struct nfc_hci_dev *)data;
+
+ schedule_work(&hdev->msg_tx_work);
+}
+
+static int hci_dev_connect_gates(struct nfc_hci_dev *hdev, u8 gate_count,
+ struct nfc_hci_gate *gates)
+{
+ int r;
+ while (gate_count--) {
+ r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID,
+ gates->gate, gates->pipe);
+ if (r < 0)
+ return r;
+ gates++;
+ }
+
+ return 0;
+}
+
+static int hci_dev_session_init(struct nfc_hci_dev *hdev)
+{
+ struct sk_buff *skb = NULL;
+ int r;
+
+ if (hdev->init_data.gates[0].gate != NFC_HCI_ADMIN_GATE)
+ return -EPROTO;
+
+ r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID,
+ hdev->init_data.gates[0].gate,
+ hdev->init_data.gates[0].pipe);
+ if (r < 0)
+ goto exit;
+
+ r = nfc_hci_get_param(hdev, NFC_HCI_ADMIN_GATE,
+ NFC_HCI_ADMIN_SESSION_IDENTITY, &skb);
+ if (r < 0)
+ goto disconnect_all;
+
+ if (skb->len && skb->len == strlen(hdev->init_data.session_id) &&
+ (memcmp(hdev->init_data.session_id, skb->data,
+ skb->len) == 0) && hdev->ops->load_session) {
+ /* Restore gate<->pipe table from some proprietary location. */
+
+ r = hdev->ops->load_session(hdev);
+
+ if (r < 0)
+ goto disconnect_all;
+ } else {
+
+ r = nfc_hci_disconnect_all_gates(hdev);
+ if (r < 0)
+ goto exit;
+
+ r = hci_dev_connect_gates(hdev, hdev->init_data.gate_count,
+ hdev->init_data.gates);
+ if (r < 0)
+ goto disconnect_all;
+
+ r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE,
+ NFC_HCI_ADMIN_SESSION_IDENTITY,
+ hdev->init_data.session_id,
+ strlen(hdev->init_data.session_id));
+ }
+ if (r == 0)
+ goto exit;
+
+disconnect_all:
+ nfc_hci_disconnect_all_gates(hdev);
+
+exit:
+ kfree_skb(skb);
+
+ return r;
+}
+
+static int hci_dev_version(struct nfc_hci_dev *hdev)
+{
+ int r;
+ struct sk_buff *skb;
+
+ r = nfc_hci_get_param(hdev, NFC_HCI_ID_MGMT_GATE,
+ NFC_HCI_ID_MGMT_VERSION_SW, &skb);
+ if (r == -EOPNOTSUPP) {
+ pr_info("Software/Hardware info not available\n");
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ if (skb->len != 3) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ hdev->sw_romlib = (skb->data[0] & 0xf0) >> 4;
+ hdev->sw_patch = skb->data[0] & 0x0f;
+ hdev->sw_flashlib_major = skb->data[1];
+ hdev->sw_flashlib_minor = skb->data[2];
+
+ kfree_skb(skb);
+
+ r = nfc_hci_get_param(hdev, NFC_HCI_ID_MGMT_GATE,
+ NFC_HCI_ID_MGMT_VERSION_HW, &skb);
+ if (r < 0)
+ return r;
+
+ if (skb->len != 3) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ hdev->hw_derivative = (skb->data[0] & 0xe0) >> 5;
+ hdev->hw_version = skb->data[0] & 0x1f;
+ hdev->hw_mpw = (skb->data[1] & 0xc0) >> 6;
+ hdev->hw_software = skb->data[1] & 0x3f;
+ hdev->hw_bsid = skb->data[2];
+
+ kfree_skb(skb);
+
+ pr_info("SOFTWARE INFO:\n");
+ pr_info("RomLib : %d\n", hdev->sw_romlib);
+ pr_info("Patch : %d\n", hdev->sw_patch);
+ pr_info("FlashLib Major : %d\n", hdev->sw_flashlib_major);
+ pr_info("FlashLib Minor : %d\n", hdev->sw_flashlib_minor);
+ pr_info("HARDWARE INFO:\n");
+ pr_info("Derivative : %d\n", hdev->hw_derivative);
+ pr_info("HW Version : %d\n", hdev->hw_version);
+ pr_info("#MPW : %d\n", hdev->hw_mpw);
+ pr_info("Software : %d\n", hdev->hw_software);
+ pr_info("BSID Version : %d\n", hdev->hw_bsid);
+
+ return 0;
+}
+
+static int hci_dev_up(struct nfc_dev *nfc_dev)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+ int r = 0;
+
+ if (hdev->ops->open) {
+ r = hdev->ops->open(hdev);
+ if (r < 0)
+ return r;
+ }
+
+ r = nfc_llc_start(hdev->llc);
+ if (r < 0)
+ goto exit_close;
+
+ r = hci_dev_session_init(hdev);
+ if (r < 0)
+ goto exit_llc;
+
+ r = nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_EVT_END_OPERATION, NULL, 0);
+ if (r < 0)
+ goto exit_llc;
+
+ if (hdev->ops->hci_ready) {
+ r = hdev->ops->hci_ready(hdev);
+ if (r < 0)
+ goto exit_llc;
+ }
+
+ r = hci_dev_version(hdev);
+ if (r < 0)
+ goto exit_llc;
+
+ return 0;
+
+exit_llc:
+ nfc_llc_stop(hdev->llc);
+
+exit_close:
+ if (hdev->ops->close)
+ hdev->ops->close(hdev);
+
+ return r;
+}
+
+static int hci_dev_down(struct nfc_dev *nfc_dev)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ nfc_llc_stop(hdev->llc);
+
+ if (hdev->ops->close)
+ hdev->ops->close(hdev);
+
+ nfc_hci_reset_pipes(hdev);
+
+ return 0;
+}
+
+static int hci_start_poll(struct nfc_dev *nfc_dev,
+ u32 im_protocols, u32 tm_protocols)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->start_poll)
+ return hdev->ops->start_poll(hdev, im_protocols, tm_protocols);
+ else
+ return nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_EVT_READER_REQUESTED,
+ NULL, 0);
+}
+
+static void hci_stop_poll(struct nfc_dev *nfc_dev)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->stop_poll)
+ hdev->ops->stop_poll(hdev);
+ else
+ nfc_hci_send_event(hdev, NFC_HCI_RF_READER_A_GATE,
+ NFC_HCI_EVT_END_OPERATION, NULL, 0);
+}
+
+static int hci_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target,
+ __u8 comm_mode, __u8 *gb, size_t gb_len)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (!hdev->ops->dep_link_up)
+ return 0;
+
+ return hdev->ops->dep_link_up(hdev, target, comm_mode,
+ gb, gb_len);
+}
+
+static int hci_dep_link_down(struct nfc_dev *nfc_dev)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (!hdev->ops->dep_link_down)
+ return 0;
+
+ return hdev->ops->dep_link_down(hdev);
+}
+
+static int hci_activate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target, u32 protocol)
+{
+ return 0;
+}
+
+static void hci_deactivate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target)
+{
+}
+
+#define HCI_CB_TYPE_TRANSCEIVE 1
+
+static void hci_transceive_cb(void *context, struct sk_buff *skb, int err)
+{
+ struct nfc_hci_dev *hdev = context;
+
+ switch (hdev->async_cb_type) {
+ case HCI_CB_TYPE_TRANSCEIVE:
+ /*
+ * TODO: Check RF Error indicator to make sure data is valid.
+ * It seems that HCI cmd can complete without error, but data
+ * can be invalid if an RF error occured? Ignore for now.
+ */
+ if (err == 0)
+ skb_trim(skb, skb->len - 1); /* RF Err ind */
+
+ hdev->async_cb(hdev->async_cb_context, skb, err);
+ break;
+ default:
+ if (err == 0)
+ kfree_skb(skb);
+ break;
+ }
+}
+
+static int hci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target,
+ struct sk_buff *skb, data_exchange_cb_t cb,
+ void *cb_context)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+ int r;
+
+ pr_debug("target_idx=%d\n", target->idx);
+
+ switch (target->hci_reader_gate) {
+ case NFC_HCI_RF_READER_A_GATE:
+ case NFC_HCI_RF_READER_B_GATE:
+ if (hdev->ops->im_transceive) {
+ r = hdev->ops->im_transceive(hdev, target, skb, cb,
+ cb_context);
+ if (r <= 0) /* handled */
+ break;
+ }
+
+ *skb_push(skb, 1) = 0; /* CTR, see spec:10.2.2.1 */
+
+ hdev->async_cb_type = HCI_CB_TYPE_TRANSCEIVE;
+ hdev->async_cb = cb;
+ hdev->async_cb_context = cb_context;
+
+ r = nfc_hci_send_cmd_async(hdev, target->hci_reader_gate,
+ NFC_HCI_WR_XCHG_DATA, skb->data,
+ skb->len, hci_transceive_cb, hdev);
+ break;
+ default:
+ if (hdev->ops->im_transceive) {
+ r = hdev->ops->im_transceive(hdev, target, skb, cb,
+ cb_context);
+ if (r == 1)
+ r = -ENOTSUPP;
+ } else {
+ r = -ENOTSUPP;
+ }
+ break;
+ }
+
+ kfree_skb(skb);
+
+ return r;
+}
+
+static int hci_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (!hdev->ops->tm_send) {
+ kfree_skb(skb);
+ return -ENOTSUPP;
+ }
+
+ return hdev->ops->tm_send(hdev, skb);
+}
+
+static int hci_check_presence(struct nfc_dev *nfc_dev,
+ struct nfc_target *target)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (!hdev->ops->check_presence)
+ return 0;
+
+ return hdev->ops->check_presence(hdev, target);
+}
+
+static int hci_discover_se(struct nfc_dev *nfc_dev)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->discover_se)
+ return hdev->ops->discover_se(hdev);
+
+ return 0;
+}
+
+static int hci_enable_se(struct nfc_dev *nfc_dev, u32 se_idx)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->enable_se)
+ return hdev->ops->enable_se(hdev, se_idx);
+
+ return 0;
+}
+
+static int hci_disable_se(struct nfc_dev *nfc_dev, u32 se_idx)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->disable_se)
+ return hdev->ops->disable_se(hdev, se_idx);
+
+ return 0;
+}
+
+static int hci_se_io(struct nfc_dev *nfc_dev, u32 se_idx,
+ u8 *apdu, size_t apdu_length,
+ se_io_cb_t cb, void *cb_context)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (hdev->ops->se_io)
+ return hdev->ops->se_io(hdev, se_idx, apdu,
+ apdu_length, cb, cb_context);
+
+ return 0;
+}
+
+static void nfc_hci_failure(struct nfc_hci_dev *hdev, int err)
+{
+ mutex_lock(&hdev->msg_tx_mutex);
+
+ if (hdev->cmd_pending_msg == NULL) {
+ nfc_driver_failure(hdev->ndev, err);
+ goto exit;
+ }
+
+ __nfc_hci_cmd_completion(hdev, err, NULL);
+
+exit:
+ mutex_unlock(&hdev->msg_tx_mutex);
+}
+
+static void nfc_hci_llc_failure(struct nfc_hci_dev *hdev, int err)
+{
+ nfc_hci_failure(hdev, err);
+}
+
+static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hcp_packet *packet;
+ u8 type;
+ u8 instruction;
+ struct sk_buff *hcp_skb;
+ u8 pipe;
+ struct sk_buff *frag_skb;
+ int msg_len;
+
+ packet = (struct hcp_packet *)skb->data;
+ if ((packet->header & ~NFC_HCI_FRAGMENT) == 0) {
+ skb_queue_tail(&hdev->rx_hcp_frags, skb);
+ return;
+ }
+
+ /* it's the last fragment. Does it need re-aggregation? */
+ if (skb_queue_len(&hdev->rx_hcp_frags)) {
+ pipe = packet->header & NFC_HCI_FRAGMENT;
+ skb_queue_tail(&hdev->rx_hcp_frags, skb);
+
+ msg_len = 0;
+ skb_queue_walk(&hdev->rx_hcp_frags, frag_skb) {
+ msg_len += (frag_skb->len -
+ NFC_HCI_HCP_PACKET_HEADER_LEN);
+ }
+
+ hcp_skb = nfc_alloc_recv_skb(NFC_HCI_HCP_PACKET_HEADER_LEN +
+ msg_len, GFP_KERNEL);
+ if (hcp_skb == NULL) {
+ nfc_hci_failure(hdev, -ENOMEM);
+ return;
+ }
+
+ *skb_put(hcp_skb, NFC_HCI_HCP_PACKET_HEADER_LEN) = pipe;
+
+ skb_queue_walk(&hdev->rx_hcp_frags, frag_skb) {
+ msg_len = frag_skb->len - NFC_HCI_HCP_PACKET_HEADER_LEN;
+ memcpy(skb_put(hcp_skb, msg_len),
+ frag_skb->data + NFC_HCI_HCP_PACKET_HEADER_LEN,
+ msg_len);
+ }
+
+ skb_queue_purge(&hdev->rx_hcp_frags);
+ } else {
+ packet->header &= NFC_HCI_FRAGMENT;
+ hcp_skb = skb;
+ }
+
+ /* if this is a response, dispatch immediately to
+ * unblock waiting cmd context. Otherwise, enqueue to dispatch
+ * in separate context where handler can also execute command.
+ */
+ packet = (struct hcp_packet *)hcp_skb->data;
+ type = HCP_MSG_GET_TYPE(packet->message.header);
+ if (type == NFC_HCI_HCP_RESPONSE) {
+ pipe = packet->header;
+ instruction = HCP_MSG_GET_CMD(packet->message.header);
+ skb_pull(hcp_skb, NFC_HCI_HCP_PACKET_HEADER_LEN +
+ NFC_HCI_HCP_MESSAGE_HEADER_LEN);
+ nfc_hci_hcp_message_rx(hdev, pipe, type, instruction, hcp_skb);
+ } else {
+ skb_queue_tail(&hdev->msg_rx_queue, hcp_skb);
+ schedule_work(&hdev->msg_rx_work);
+ }
+}
+
+static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name)
+{
+ struct nfc_hci_dev *hdev = nfc_get_drvdata(nfc_dev);
+
+ if (!hdev->ops->fw_download)
+ return -ENOTSUPP;
+
+ return hdev->ops->fw_download(hdev, firmware_name);
+}
+
+static struct nfc_ops hci_nfc_ops = {
+ .dev_up = hci_dev_up,
+ .dev_down = hci_dev_down,
+ .start_poll = hci_start_poll,
+ .stop_poll = hci_stop_poll,
+ .dep_link_up = hci_dep_link_up,
+ .dep_link_down = hci_dep_link_down,
+ .activate_target = hci_activate_target,
+ .deactivate_target = hci_deactivate_target,
+ .im_transceive = hci_transceive,
+ .tm_send = hci_tm_send,
+ .check_presence = hci_check_presence,
+ .fw_download = hci_fw_download,
+ .discover_se = hci_discover_se,
+ .enable_se = hci_enable_se,
+ .disable_se = hci_disable_se,
+ .se_io = hci_se_io,
+};
+
+struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops,
+ struct nfc_hci_init_data *init_data,
+ unsigned long quirks,
+ u32 protocols,
+ const char *llc_name,
+ int tx_headroom,
+ int tx_tailroom,
+ int max_link_payload)
+{
+ struct nfc_hci_dev *hdev;
+
+ if (ops->xmit == NULL)
+ return NULL;
+
+ if (protocols == 0)
+ return NULL;
+
+ hdev = kzalloc(sizeof(struct nfc_hci_dev), GFP_KERNEL);
+ if (hdev == NULL)
+ return NULL;
+
+ hdev->llc = nfc_llc_allocate(llc_name, hdev, ops->xmit,
+ nfc_hci_recv_from_llc, tx_headroom,
+ tx_tailroom, nfc_hci_llc_failure);
+ if (hdev->llc == NULL) {
+ kfree(hdev);
+ return NULL;
+ }
+
+ hdev->ndev = nfc_allocate_device(&hci_nfc_ops, protocols,
+ tx_headroom + HCI_CMDS_HEADROOM,
+ tx_tailroom);
+ if (!hdev->ndev) {
+ nfc_llc_free(hdev->llc);
+ kfree(hdev);
+ return NULL;
+ }
+
+ hdev->ops = ops;
+ hdev->max_data_link_payload = max_link_payload;
+ hdev->init_data = *init_data;
+
+ nfc_set_drvdata(hdev->ndev, hdev);
+
+ nfc_hci_reset_pipes(hdev);
+
+ hdev->quirks = quirks;
+
+ return hdev;
+}
+EXPORT_SYMBOL(nfc_hci_allocate_device);
+
+void nfc_hci_free_device(struct nfc_hci_dev *hdev)
+{
+ nfc_free_device(hdev->ndev);
+ nfc_llc_free(hdev->llc);
+ kfree(hdev);
+}
+EXPORT_SYMBOL(nfc_hci_free_device);
+
+int nfc_hci_register_device(struct nfc_hci_dev *hdev)
+{
+ mutex_init(&hdev->msg_tx_mutex);
+
+ INIT_LIST_HEAD(&hdev->msg_tx_queue);
+
+ INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work);
+
+ init_timer(&hdev->cmd_timer);
+ hdev->cmd_timer.data = (unsigned long)hdev;
+ hdev->cmd_timer.function = nfc_hci_cmd_timeout;
+
+ skb_queue_head_init(&hdev->rx_hcp_frags);
+
+ INIT_WORK(&hdev->msg_rx_work, nfc_hci_msg_rx_work);
+
+ skb_queue_head_init(&hdev->msg_rx_queue);
+
+ return nfc_register_device(hdev->ndev);
+}
+EXPORT_SYMBOL(nfc_hci_register_device);
+
+void nfc_hci_unregister_device(struct nfc_hci_dev *hdev)
+{
+ struct hci_msg *msg, *n;
+
+ mutex_lock(&hdev->msg_tx_mutex);
+
+ if (hdev->cmd_pending_msg) {
+ if (hdev->cmd_pending_msg->cb)
+ hdev->cmd_pending_msg->cb(
+ hdev->cmd_pending_msg->cb_context,
+ NULL, -ESHUTDOWN);
+ kfree(hdev->cmd_pending_msg);
+ hdev->cmd_pending_msg = NULL;
+ }
+
+ hdev->shutting_down = true;
+
+ mutex_unlock(&hdev->msg_tx_mutex);
+
+ del_timer_sync(&hdev->cmd_timer);
+ cancel_work_sync(&hdev->msg_tx_work);
+
+ cancel_work_sync(&hdev->msg_rx_work);
+
+ nfc_unregister_device(hdev->ndev);
+
+ skb_queue_purge(&hdev->rx_hcp_frags);
+ skb_queue_purge(&hdev->msg_rx_queue);
+
+ list_for_each_entry_safe(msg, n, &hdev->msg_tx_queue, msg_l) {
+ list_del(&msg->msg_l);
+ skb_queue_purge(&msg->msg_frags);
+ kfree(msg);
+ }
+}
+EXPORT_SYMBOL(nfc_hci_unregister_device);
+
+void nfc_hci_set_clientdata(struct nfc_hci_dev *hdev, void *clientdata)
+{
+ hdev->clientdata = clientdata;
+}
+EXPORT_SYMBOL(nfc_hci_set_clientdata);
+
+void *nfc_hci_get_clientdata(struct nfc_hci_dev *hdev)
+{
+ return hdev->clientdata;
+}
+EXPORT_SYMBOL(nfc_hci_get_clientdata);
+
+void nfc_hci_driver_failure(struct nfc_hci_dev *hdev, int err)
+{
+ nfc_hci_failure(hdev, err);
+}
+EXPORT_SYMBOL(nfc_hci_driver_failure);
+
+void nfc_hci_recv_frame(struct nfc_hci_dev *hdev, struct sk_buff *skb)
+{
+ nfc_llc_rcv_from_drv(hdev->llc, skb);
+}
+EXPORT_SYMBOL(nfc_hci_recv_frame);
+
+static int __init nfc_hci_init(void)
+{
+ return nfc_llc_init();
+}
+
+static void __exit nfc_hci_exit(void)
+{
+ nfc_llc_exit();
+}
+
+subsys_initcall(nfc_hci_init);
+module_exit(nfc_hci_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NFC HCI Core");
diff --git a/net/nfc/hci/hci.h b/net/nfc/hci/hci.h
new file mode 100644
index 000000000..ab4c8e80b
--- /dev/null
+++ b/net/nfc/hci/hci.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LOCAL_HCI_H
+#define __LOCAL_HCI_H
+
+#include <net/nfc/hci.h>
+
+struct gate_pipe_map {
+ u8 gate;
+ u8 pipe;
+};
+
+struct hcp_message {
+ u8 header; /* type -cmd,evt,rsp- + instruction */
+ u8 data[];
+} __packed;
+
+struct hcp_packet {
+ u8 header; /* cbit+pipe */
+ struct hcp_message message;
+} __packed;
+
+struct hcp_exec_waiter {
+ wait_queue_head_t *wq;
+ bool exec_complete;
+ int exec_result;
+ struct sk_buff *result_skb;
+};
+
+struct hci_msg {
+ struct list_head msg_l;
+ struct sk_buff_head msg_frags;
+ bool wait_response;
+ data_exchange_cb_t cb;
+ void *cb_context;
+ unsigned long completion_delay;
+};
+
+struct hci_create_pipe_params {
+ u8 src_gate;
+ u8 dest_host;
+ u8 dest_gate;
+} __packed;
+
+struct hci_create_pipe_resp {
+ u8 src_host;
+ u8 src_gate;
+ u8 dest_host;
+ u8 dest_gate;
+ u8 pipe;
+} __packed;
+
+struct hci_delete_pipe_noti {
+ u8 pipe;
+} __packed;
+
+struct hci_all_pipe_cleared_noti {
+ u8 host;
+} __packed;
+
+#define NFC_HCI_FRAGMENT 0x7f
+
+#define HCP_HEADER(type, instr) ((((type) & 0x03) << 6) | ((instr) & 0x3f))
+#define HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6)
+#define HCP_MSG_GET_CMD(header) (header & 0x3f)
+
+int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe,
+ u8 type, u8 instruction,
+ const u8 *payload, size_t payload_len,
+ data_exchange_cb_t cb, void *cb_context,
+ unsigned long completion_delay);
+
+void nfc_hci_hcp_message_rx(struct nfc_hci_dev *hdev, u8 pipe, u8 type,
+ u8 instruction, struct sk_buff *skb);
+
+/* HCP headers */
+#define NFC_HCI_HCP_PACKET_HEADER_LEN 1
+#define NFC_HCI_HCP_MESSAGE_HEADER_LEN 1
+#define NFC_HCI_HCP_HEADER_LEN 2
+
+/* HCP types */
+#define NFC_HCI_HCP_COMMAND 0x00
+#define NFC_HCI_HCP_EVENT 0x01
+#define NFC_HCI_HCP_RESPONSE 0x02
+
+/* Generic commands */
+#define NFC_HCI_ANY_SET_PARAMETER 0x01
+#define NFC_HCI_ANY_GET_PARAMETER 0x02
+#define NFC_HCI_ANY_OPEN_PIPE 0x03
+#define NFC_HCI_ANY_CLOSE_PIPE 0x04
+
+/* Reader RF commands */
+#define NFC_HCI_WR_XCHG_DATA 0x10
+
+/* Admin commands */
+#define NFC_HCI_ADM_CREATE_PIPE 0x10
+#define NFC_HCI_ADM_DELETE_PIPE 0x11
+#define NFC_HCI_ADM_NOTIFY_PIPE_CREATED 0x12
+#define NFC_HCI_ADM_NOTIFY_PIPE_DELETED 0x13
+#define NFC_HCI_ADM_CLEAR_ALL_PIPE 0x14
+#define NFC_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15
+
+/* Generic responses */
+#define NFC_HCI_ANY_OK 0x00
+#define NFC_HCI_ANY_E_NOT_CONNECTED 0x01
+#define NFC_HCI_ANY_E_CMD_PAR_UNKNOWN 0x02
+#define NFC_HCI_ANY_E_NOK 0x03
+#define NFC_HCI_ANY_E_PIPES_FULL 0x04
+#define NFC_HCI_ANY_E_REG_PAR_UNKNOWN 0x05
+#define NFC_HCI_ANY_E_PIPE_NOT_OPENED 0x06
+#define NFC_HCI_ANY_E_CMD_NOT_SUPPORTED 0x07
+#define NFC_HCI_ANY_E_INHIBITED 0x08
+#define NFC_HCI_ANY_E_TIMEOUT 0x09
+#define NFC_HCI_ANY_E_REG_ACCESS_DENIED 0x0a
+#define NFC_HCI_ANY_E_PIPE_ACCESS_DENIED 0x0b
+
+#endif /* __LOCAL_HCI_H */
diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c
new file mode 100644
index 000000000..1fe725d66
--- /dev/null
+++ b/net/nfc/hci/hcp.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "hci: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <net/nfc/hci.h>
+
+#include "hci.h"
+
+/*
+ * Payload is the HCP message data only. Instruction will be prepended.
+ * Guarantees that cb will be called upon completion or timeout delay
+ * counted from the moment the cmd is sent to the transport.
+ */
+int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe,
+ u8 type, u8 instruction,
+ const u8 *payload, size_t payload_len,
+ data_exchange_cb_t cb, void *cb_context,
+ unsigned long completion_delay)
+{
+ struct nfc_dev *ndev = hdev->ndev;
+ struct hci_msg *cmd;
+ const u8 *ptr = payload;
+ int hci_len, err;
+ bool firstfrag = true;
+
+ cmd = kzalloc(sizeof(struct hci_msg), GFP_KERNEL);
+ if (cmd == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&cmd->msg_l);
+ skb_queue_head_init(&cmd->msg_frags);
+ cmd->wait_response = (type == NFC_HCI_HCP_COMMAND) ? true : false;
+ cmd->cb = cb;
+ cmd->cb_context = cb_context;
+ cmd->completion_delay = completion_delay;
+
+ hci_len = payload_len + 1;
+ while (hci_len > 0) {
+ struct sk_buff *skb;
+ int skb_len, data_link_len;
+ struct hcp_packet *packet;
+
+ if (NFC_HCI_HCP_PACKET_HEADER_LEN + hci_len <=
+ hdev->max_data_link_payload)
+ data_link_len = hci_len;
+ else
+ data_link_len = hdev->max_data_link_payload -
+ NFC_HCI_HCP_PACKET_HEADER_LEN;
+
+ skb_len = ndev->tx_headroom + NFC_HCI_HCP_PACKET_HEADER_LEN +
+ data_link_len + ndev->tx_tailroom;
+ hci_len -= data_link_len;
+
+ skb = alloc_skb(skb_len, GFP_KERNEL);
+ if (skb == NULL) {
+ err = -ENOMEM;
+ goto out_skb_err;
+ }
+ skb_reserve(skb, ndev->tx_headroom);
+
+ skb_put(skb, NFC_HCI_HCP_PACKET_HEADER_LEN + data_link_len);
+
+ /* Only the last fragment will have the cb bit set to 1 */
+ packet = (struct hcp_packet *)skb->data;
+ packet->header = pipe;
+ if (firstfrag) {
+ firstfrag = false;
+ packet->message.header = HCP_HEADER(type, instruction);
+ if (ptr) {
+ memcpy(packet->message.data, ptr,
+ data_link_len - 1);
+ ptr += data_link_len - 1;
+ }
+ } else {
+ memcpy(&packet->message, ptr, data_link_len);
+ ptr += data_link_len;
+ }
+
+ /* This is the last fragment, set the cb bit */
+ if (hci_len == 0)
+ packet->header |= ~NFC_HCI_FRAGMENT;
+
+ skb_queue_tail(&cmd->msg_frags, skb);
+ }
+
+ mutex_lock(&hdev->msg_tx_mutex);
+
+ if (hdev->shutting_down) {
+ err = -ESHUTDOWN;
+ mutex_unlock(&hdev->msg_tx_mutex);
+ goto out_skb_err;
+ }
+
+ list_add_tail(&cmd->msg_l, &hdev->msg_tx_queue);
+ mutex_unlock(&hdev->msg_tx_mutex);
+
+ schedule_work(&hdev->msg_tx_work);
+
+ return 0;
+
+out_skb_err:
+ skb_queue_purge(&cmd->msg_frags);
+ kfree(cmd);
+
+ return err;
+}
+
+/*
+ * Receive hcp message for pipe, with type and cmd.
+ * skb contains optional message data only.
+ */
+void nfc_hci_hcp_message_rx(struct nfc_hci_dev *hdev, u8 pipe, u8 type,
+ u8 instruction, struct sk_buff *skb)
+{
+ switch (type) {
+ case NFC_HCI_HCP_RESPONSE:
+ nfc_hci_resp_received(hdev, instruction, skb);
+ break;
+ case NFC_HCI_HCP_COMMAND:
+ nfc_hci_cmd_received(hdev, pipe, instruction, skb);
+ break;
+ case NFC_HCI_HCP_EVENT:
+ nfc_hci_event_received(hdev, pipe, instruction, skb);
+ break;
+ default:
+ pr_err("UNKNOWN MSG Type %d, instruction=%d\n",
+ type, instruction);
+ kfree_skb(skb);
+ break;
+ }
+}
diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c
new file mode 100644
index 000000000..1b90c0531
--- /dev/null
+++ b/net/nfc/hci/llc.c
@@ -0,0 +1,166 @@
+/*
+ * Link Layer Control manager
+ *
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <net/nfc/llc.h>
+
+#include "llc.h"
+
+static LIST_HEAD(llc_engines);
+
+int nfc_llc_init(void)
+{
+ int r;
+
+ r = nfc_llc_nop_register();
+ if (r)
+ goto exit;
+
+ r = nfc_llc_shdlc_register();
+ if (r)
+ goto exit;
+
+ return 0;
+
+exit:
+ nfc_llc_exit();
+ return r;
+}
+
+void nfc_llc_exit(void)
+{
+ struct nfc_llc_engine *llc_engine, *n;
+
+ list_for_each_entry_safe(llc_engine, n, &llc_engines, entry) {
+ list_del(&llc_engine->entry);
+ kfree(llc_engine->name);
+ kfree(llc_engine);
+ }
+}
+
+int nfc_llc_register(const char *name, struct nfc_llc_ops *ops)
+{
+ struct nfc_llc_engine *llc_engine;
+
+ llc_engine = kzalloc(sizeof(struct nfc_llc_engine), GFP_KERNEL);
+ if (llc_engine == NULL)
+ return -ENOMEM;
+
+ llc_engine->name = kstrdup(name, GFP_KERNEL);
+ if (llc_engine->name == NULL) {
+ kfree(llc_engine);
+ return -ENOMEM;
+ }
+ llc_engine->ops = ops;
+
+ INIT_LIST_HEAD(&llc_engine->entry);
+ list_add_tail(&llc_engine->entry, &llc_engines);
+
+ return 0;
+}
+
+static struct nfc_llc_engine *nfc_llc_name_to_engine(const char *name)
+{
+ struct nfc_llc_engine *llc_engine;
+
+ list_for_each_entry(llc_engine, &llc_engines, entry) {
+ if (strcmp(llc_engine->name, name) == 0)
+ return llc_engine;
+ }
+
+ return NULL;
+}
+
+void nfc_llc_unregister(const char *name)
+{
+ struct nfc_llc_engine *llc_engine;
+
+ llc_engine = nfc_llc_name_to_engine(name);
+ if (llc_engine == NULL)
+ return;
+
+ list_del(&llc_engine->entry);
+ kfree(llc_engine->name);
+ kfree(llc_engine);
+}
+
+struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev,
+ xmit_to_drv_t xmit_to_drv,
+ rcv_to_hci_t rcv_to_hci, int tx_headroom,
+ int tx_tailroom, llc_failure_t llc_failure)
+{
+ struct nfc_llc_engine *llc_engine;
+ struct nfc_llc *llc;
+
+ llc_engine = nfc_llc_name_to_engine(name);
+ if (llc_engine == NULL)
+ return NULL;
+
+ llc = kzalloc(sizeof(struct nfc_llc), GFP_KERNEL);
+ if (llc == NULL)
+ return NULL;
+
+ llc->data = llc_engine->ops->init(hdev, xmit_to_drv, rcv_to_hci,
+ tx_headroom, tx_tailroom,
+ &llc->rx_headroom, &llc->rx_tailroom,
+ llc_failure);
+ if (llc->data == NULL) {
+ kfree(llc);
+ return NULL;
+ }
+ llc->ops = llc_engine->ops;
+
+ return llc;
+}
+
+void nfc_llc_free(struct nfc_llc *llc)
+{
+ llc->ops->deinit(llc);
+ kfree(llc);
+}
+
+inline void nfc_llc_get_rx_head_tail_room(struct nfc_llc *llc, int *rx_headroom,
+ int *rx_tailroom)
+{
+ *rx_headroom = llc->rx_headroom;
+ *rx_tailroom = llc->rx_tailroom;
+}
+
+inline int nfc_llc_start(struct nfc_llc *llc)
+{
+ return llc->ops->start(llc);
+}
+
+inline int nfc_llc_stop(struct nfc_llc *llc)
+{
+ return llc->ops->stop(llc);
+}
+
+inline void nfc_llc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ llc->ops->rcv_from_drv(llc, skb);
+}
+
+inline int nfc_llc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ return llc->ops->xmit_from_hci(llc, skb);
+}
+
+inline void *nfc_llc_get_data(struct nfc_llc *llc)
+{
+ return llc->data;
+}
diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h
new file mode 100644
index 000000000..5dad4c57f
--- /dev/null
+++ b/net/nfc/hci/llc.h
@@ -0,0 +1,67 @@
+/*
+ * Link Layer Control manager
+ *
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LOCAL_LLC_H_
+#define __LOCAL_LLC_H_
+
+#include <net/nfc/hci.h>
+#include <net/nfc/llc.h>
+#include <linux/skbuff.h>
+
+struct nfc_llc_ops {
+ void *(*init) (struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
+ rcv_to_hci_t rcv_to_hci, int tx_headroom,
+ int tx_tailroom, int *rx_headroom, int *rx_tailroom,
+ llc_failure_t llc_failure);
+ void (*deinit) (struct nfc_llc *llc);
+ int (*start) (struct nfc_llc *llc);
+ int (*stop) (struct nfc_llc *llc);
+ void (*rcv_from_drv) (struct nfc_llc *llc, struct sk_buff *skb);
+ int (*xmit_from_hci) (struct nfc_llc *llc, struct sk_buff *skb);
+};
+
+struct nfc_llc_engine {
+ const char *name;
+ struct nfc_llc_ops *ops;
+ struct list_head entry;
+};
+
+struct nfc_llc {
+ void *data;
+ struct nfc_llc_ops *ops;
+ int rx_headroom;
+ int rx_tailroom;
+};
+
+void *nfc_llc_get_data(struct nfc_llc *llc);
+
+int nfc_llc_register(const char *name, struct nfc_llc_ops *ops);
+void nfc_llc_unregister(const char *name);
+
+int nfc_llc_nop_register(void);
+
+#if defined(CONFIG_NFC_SHDLC)
+int nfc_llc_shdlc_register(void);
+#else
+static inline int nfc_llc_shdlc_register(void)
+{
+ return 0;
+}
+#endif
+
+#endif /* __LOCAL_LLC_H_ */
diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c
new file mode 100644
index 000000000..d0435d5a1
--- /dev/null
+++ b/net/nfc/hci/llc_nop.c
@@ -0,0 +1,97 @@
+/*
+ * nop (passthrough) Link Layer Control
+ *
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/types.h>
+
+#include "llc.h"
+
+struct llc_nop {
+ struct nfc_hci_dev *hdev;
+ xmit_to_drv_t xmit_to_drv;
+ rcv_to_hci_t rcv_to_hci;
+ int tx_headroom;
+ int tx_tailroom;
+ llc_failure_t llc_failure;
+};
+
+static void *llc_nop_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
+ rcv_to_hci_t rcv_to_hci, int tx_headroom,
+ int tx_tailroom, int *rx_headroom, int *rx_tailroom,
+ llc_failure_t llc_failure)
+{
+ struct llc_nop *llc_nop;
+
+ *rx_headroom = 0;
+ *rx_tailroom = 0;
+
+ llc_nop = kzalloc(sizeof(struct llc_nop), GFP_KERNEL);
+ if (llc_nop == NULL)
+ return NULL;
+
+ llc_nop->hdev = hdev;
+ llc_nop->xmit_to_drv = xmit_to_drv;
+ llc_nop->rcv_to_hci = rcv_to_hci;
+ llc_nop->tx_headroom = tx_headroom;
+ llc_nop->tx_tailroom = tx_tailroom;
+ llc_nop->llc_failure = llc_failure;
+
+ return llc_nop;
+}
+
+static void llc_nop_deinit(struct nfc_llc *llc)
+{
+ kfree(nfc_llc_get_data(llc));
+}
+
+static int llc_nop_start(struct nfc_llc *llc)
+{
+ return 0;
+}
+
+static int llc_nop_stop(struct nfc_llc *llc)
+{
+ return 0;
+}
+
+static void llc_nop_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ struct llc_nop *llc_nop = nfc_llc_get_data(llc);
+
+ llc_nop->rcv_to_hci(llc_nop->hdev, skb);
+}
+
+static int llc_nop_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ struct llc_nop *llc_nop = nfc_llc_get_data(llc);
+
+ return llc_nop->xmit_to_drv(llc_nop->hdev, skb);
+}
+
+static struct nfc_llc_ops llc_nop_ops = {
+ .init = llc_nop_init,
+ .deinit = llc_nop_deinit,
+ .start = llc_nop_start,
+ .stop = llc_nop_stop,
+ .rcv_from_drv = llc_nop_rcv_from_drv,
+ .xmit_from_hci = llc_nop_xmit_from_hci,
+};
+
+int nfc_llc_nop_register(void)
+{
+ return nfc_llc_register(LLC_NOP_NAME, &llc_nop_ops);
+}
diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c
new file mode 100644
index 000000000..401c7e255
--- /dev/null
+++ b/net/nfc/hci/llc_shdlc.c
@@ -0,0 +1,854 @@
+/*
+ * shdlc Link Layer Control
+ *
+ * Copyright (C) 2012 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "shdlc: %s: " fmt, __func__
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+
+#include "llc.h"
+
+enum shdlc_state {
+ SHDLC_DISCONNECTED = 0,
+ SHDLC_CONNECTING = 1,
+ SHDLC_NEGOTIATING = 2,
+ SHDLC_HALF_CONNECTED = 3,
+ SHDLC_CONNECTED = 4
+};
+
+struct llc_shdlc {
+ struct nfc_hci_dev *hdev;
+ xmit_to_drv_t xmit_to_drv;
+ rcv_to_hci_t rcv_to_hci;
+
+ struct mutex state_mutex;
+ enum shdlc_state state;
+ int hard_fault;
+
+ wait_queue_head_t *connect_wq;
+ int connect_tries;
+ int connect_result;
+ struct timer_list connect_timer;/* aka T3 in spec 10.6.1 */
+
+ u8 w; /* window size */
+ bool srej_support;
+
+ struct timer_list t1_timer; /* send ack timeout */
+ bool t1_active;
+
+ struct timer_list t2_timer; /* guard/retransmit timeout */
+ bool t2_active;
+
+ int ns; /* next seq num for send */
+ int nr; /* next expected seq num for receive */
+ int dnr; /* oldest sent unacked seq num */
+
+ struct sk_buff_head rcv_q;
+
+ struct sk_buff_head send_q;
+ bool rnr; /* other side is not ready to receive */
+
+ struct sk_buff_head ack_pending_q;
+
+ struct work_struct sm_work;
+
+ int tx_headroom;
+ int tx_tailroom;
+
+ llc_failure_t llc_failure;
+};
+
+#define SHDLC_LLC_HEAD_ROOM 2
+
+#define SHDLC_MAX_WINDOW 4
+#define SHDLC_SREJ_SUPPORT false
+
+#define SHDLC_CONTROL_HEAD_MASK 0xe0
+#define SHDLC_CONTROL_HEAD_I 0x80
+#define SHDLC_CONTROL_HEAD_I2 0xa0
+#define SHDLC_CONTROL_HEAD_S 0xc0
+#define SHDLC_CONTROL_HEAD_U 0xe0
+
+#define SHDLC_CONTROL_NS_MASK 0x38
+#define SHDLC_CONTROL_NR_MASK 0x07
+#define SHDLC_CONTROL_TYPE_MASK 0x18
+
+#define SHDLC_CONTROL_M_MASK 0x1f
+
+enum sframe_type {
+ S_FRAME_RR = 0x00,
+ S_FRAME_REJ = 0x01,
+ S_FRAME_RNR = 0x02,
+ S_FRAME_SREJ = 0x03
+};
+
+enum uframe_modifier {
+ U_FRAME_UA = 0x06,
+ U_FRAME_RSET = 0x19
+};
+
+#define SHDLC_CONNECT_VALUE_MS 5
+#define SHDLC_T1_VALUE_MS(w) ((5 * w) / 4)
+#define SHDLC_T2_VALUE_MS 300
+
+#define SHDLC_DUMP_SKB(info, skb) \
+do { \
+ pr_debug("%s:\n", info); \
+ print_hex_dump(KERN_DEBUG, "shdlc: ", DUMP_PREFIX_OFFSET, \
+ 16, 1, skb->data, skb->len, 0); \
+} while (0)
+
+/* checks x < y <= z modulo 8 */
+static bool llc_shdlc_x_lt_y_lteq_z(int x, int y, int z)
+{
+ if (x < z)
+ return ((x < y) && (y <= z)) ? true : false;
+ else
+ return ((y > x) || (y <= z)) ? true : false;
+}
+
+/* checks x <= y < z modulo 8 */
+static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z)
+{
+ if (x <= z)
+ return ((x <= y) && (y < z)) ? true : false;
+ else /* x > z -> z+8 > x */
+ return ((y >= x) || (y < z)) ? true : false;
+}
+
+static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc,
+ int payload_len)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(shdlc->tx_headroom + SHDLC_LLC_HEAD_ROOM +
+ shdlc->tx_tailroom + payload_len, GFP_KERNEL);
+ if (skb)
+ skb_reserve(skb, shdlc->tx_headroom + SHDLC_LLC_HEAD_ROOM);
+
+ return skb;
+}
+
+/* immediately sends an S frame. */
+static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc,
+ enum sframe_type sframe_type, int nr)
+{
+ int r;
+ struct sk_buff *skb;
+
+ pr_debug("sframe_type=%d nr=%d\n", sframe_type, nr);
+
+ skb = llc_shdlc_alloc_skb(shdlc, 0);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ *skb_push(skb, 1) = SHDLC_CONTROL_HEAD_S | (sframe_type << 3) | nr;
+
+ r = shdlc->xmit_to_drv(shdlc->hdev, skb);
+
+ kfree_skb(skb);
+
+ return r;
+}
+
+/* immediately sends an U frame. skb may contain optional payload */
+static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc,
+ struct sk_buff *skb,
+ enum uframe_modifier uframe_modifier)
+{
+ int r;
+
+ pr_debug("uframe_modifier=%d\n", uframe_modifier);
+
+ *skb_push(skb, 1) = SHDLC_CONTROL_HEAD_U | uframe_modifier;
+
+ r = shdlc->xmit_to_drv(shdlc->hdev, skb);
+
+ kfree_skb(skb);
+
+ return r;
+}
+
+/*
+ * Free ack_pending frames until y_nr - 1, and reset t2 according to
+ * the remaining oldest ack_pending frame sent time
+ */
+static void llc_shdlc_reset_t2(struct llc_shdlc *shdlc, int y_nr)
+{
+ struct sk_buff *skb;
+ int dnr = shdlc->dnr; /* MUST initially be < y_nr */
+
+ pr_debug("release ack pending up to frame %d excluded\n", y_nr);
+
+ while (dnr != y_nr) {
+ pr_debug("release ack pending frame %d\n", dnr);
+
+ skb = skb_dequeue(&shdlc->ack_pending_q);
+ kfree_skb(skb);
+
+ dnr = (dnr + 1) % 8;
+ }
+
+ if (skb_queue_empty(&shdlc->ack_pending_q)) {
+ if (shdlc->t2_active) {
+ del_timer_sync(&shdlc->t2_timer);
+ shdlc->t2_active = false;
+
+ pr_debug
+ ("All sent frames acked. Stopped T2(retransmit)\n");
+ }
+ } else {
+ skb = skb_peek(&shdlc->ack_pending_q);
+
+ mod_timer(&shdlc->t2_timer, *(unsigned long *)skb->cb +
+ msecs_to_jiffies(SHDLC_T2_VALUE_MS));
+ shdlc->t2_active = true;
+
+ pr_debug
+ ("Start T2(retransmit) for remaining unacked sent frames\n");
+ }
+}
+
+/*
+ * Receive validated frames from lower layer. skb contains HCI payload only.
+ * Handle according to algorithm at spec:10.8.2
+ */
+static void llc_shdlc_rcv_i_frame(struct llc_shdlc *shdlc,
+ struct sk_buff *skb, int ns, int nr)
+{
+ int x_ns = ns;
+ int y_nr = nr;
+
+ pr_debug("recvd I-frame %d, remote waiting frame %d\n", ns, nr);
+
+ if (shdlc->state != SHDLC_CONNECTED)
+ goto exit;
+
+ if (x_ns != shdlc->nr) {
+ llc_shdlc_send_s_frame(shdlc, S_FRAME_REJ, shdlc->nr);
+ goto exit;
+ }
+
+ if (shdlc->t1_active == false) {
+ shdlc->t1_active = true;
+ mod_timer(&shdlc->t1_timer, jiffies +
+ msecs_to_jiffies(SHDLC_T1_VALUE_MS(shdlc->w)));
+ pr_debug("(re)Start T1(send ack)\n");
+ }
+
+ if (skb->len) {
+ shdlc->rcv_to_hci(shdlc->hdev, skb);
+ skb = NULL;
+ }
+
+ shdlc->nr = (shdlc->nr + 1) % 8;
+
+ if (llc_shdlc_x_lt_y_lteq_z(shdlc->dnr, y_nr, shdlc->ns)) {
+ llc_shdlc_reset_t2(shdlc, y_nr);
+
+ shdlc->dnr = y_nr;
+ }
+
+exit:
+ kfree_skb(skb);
+}
+
+static void llc_shdlc_rcv_ack(struct llc_shdlc *shdlc, int y_nr)
+{
+ pr_debug("remote acked up to frame %d excluded\n", y_nr);
+
+ if (llc_shdlc_x_lt_y_lteq_z(shdlc->dnr, y_nr, shdlc->ns)) {
+ llc_shdlc_reset_t2(shdlc, y_nr);
+ shdlc->dnr = y_nr;
+ }
+}
+
+static void llc_shdlc_requeue_ack_pending(struct llc_shdlc *shdlc)
+{
+ struct sk_buff *skb;
+
+ pr_debug("ns reset to %d\n", shdlc->dnr);
+
+ while ((skb = skb_dequeue_tail(&shdlc->ack_pending_q))) {
+ skb_pull(skb, 1); /* remove control field */
+ skb_queue_head(&shdlc->send_q, skb);
+ }
+ shdlc->ns = shdlc->dnr;
+}
+
+static void llc_shdlc_rcv_rej(struct llc_shdlc *shdlc, int y_nr)
+{
+ struct sk_buff *skb;
+
+ pr_debug("remote asks retransmission from frame %d\n", y_nr);
+
+ if (llc_shdlc_x_lteq_y_lt_z(shdlc->dnr, y_nr, shdlc->ns)) {
+ if (shdlc->t2_active) {
+ del_timer_sync(&shdlc->t2_timer);
+ shdlc->t2_active = false;
+ pr_debug("Stopped T2(retransmit)\n");
+ }
+
+ if (shdlc->dnr != y_nr) {
+ while ((shdlc->dnr = ((shdlc->dnr + 1) % 8)) != y_nr) {
+ skb = skb_dequeue(&shdlc->ack_pending_q);
+ kfree_skb(skb);
+ }
+ }
+
+ llc_shdlc_requeue_ack_pending(shdlc);
+ }
+}
+
+/* See spec RR:10.8.3 REJ:10.8.4 */
+static void llc_shdlc_rcv_s_frame(struct llc_shdlc *shdlc,
+ enum sframe_type s_frame_type, int nr)
+{
+ struct sk_buff *skb;
+
+ if (shdlc->state != SHDLC_CONNECTED)
+ return;
+
+ switch (s_frame_type) {
+ case S_FRAME_RR:
+ llc_shdlc_rcv_ack(shdlc, nr);
+ if (shdlc->rnr == true) { /* see SHDLC 10.7.7 */
+ shdlc->rnr = false;
+ if (shdlc->send_q.qlen == 0) {
+ skb = llc_shdlc_alloc_skb(shdlc, 0);
+ if (skb)
+ skb_queue_tail(&shdlc->send_q, skb);
+ }
+ }
+ break;
+ case S_FRAME_REJ:
+ llc_shdlc_rcv_rej(shdlc, nr);
+ break;
+ case S_FRAME_RNR:
+ llc_shdlc_rcv_ack(shdlc, nr);
+ shdlc->rnr = true;
+ break;
+ default:
+ break;
+ }
+}
+
+static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r)
+{
+ pr_debug("result=%d\n", r);
+
+ del_timer_sync(&shdlc->connect_timer);
+
+ if (r == 0) {
+ shdlc->ns = 0;
+ shdlc->nr = 0;
+ shdlc->dnr = 0;
+
+ shdlc->state = SHDLC_HALF_CONNECTED;
+ } else {
+ shdlc->state = SHDLC_DISCONNECTED;
+ }
+
+ shdlc->connect_result = r;
+
+ wake_up(shdlc->connect_wq);
+}
+
+static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc)
+{
+ struct sk_buff *skb;
+
+ pr_debug("\n");
+
+ skb = llc_shdlc_alloc_skb(shdlc, 2);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ *skb_put(skb, 1) = SHDLC_MAX_WINDOW;
+ *skb_put(skb, 1) = SHDLC_SREJ_SUPPORT ? 1 : 0;
+
+ return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET);
+}
+
+static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc)
+{
+ struct sk_buff *skb;
+
+ pr_debug("\n");
+
+ skb = llc_shdlc_alloc_skb(shdlc, 0);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_UA);
+}
+
+static void llc_shdlc_rcv_u_frame(struct llc_shdlc *shdlc,
+ struct sk_buff *skb,
+ enum uframe_modifier u_frame_modifier)
+{
+ u8 w = SHDLC_MAX_WINDOW;
+ bool srej_support = SHDLC_SREJ_SUPPORT;
+ int r;
+
+ pr_debug("u_frame_modifier=%d\n", u_frame_modifier);
+
+ switch (u_frame_modifier) {
+ case U_FRAME_RSET:
+ switch (shdlc->state) {
+ case SHDLC_NEGOTIATING:
+ case SHDLC_CONNECTING:
+ /*
+ * We sent RSET, but chip wants to negociate or we
+ * got RSET before we managed to send out our.
+ */
+ if (skb->len > 0)
+ w = skb->data[0];
+
+ if (skb->len > 1)
+ srej_support = skb->data[1] & 0x01 ? true :
+ false;
+
+ if ((w <= SHDLC_MAX_WINDOW) &&
+ (SHDLC_SREJ_SUPPORT || (srej_support == false))) {
+ shdlc->w = w;
+ shdlc->srej_support = srej_support;
+ r = llc_shdlc_connect_send_ua(shdlc);
+ llc_shdlc_connect_complete(shdlc, r);
+ }
+ break;
+ case SHDLC_HALF_CONNECTED:
+ /*
+ * Chip resent RSET due to its timeout - Ignote it
+ * as we already sent UA.
+ */
+ break;
+ case SHDLC_CONNECTED:
+ /*
+ * Chip wants to reset link. This is unexpected and
+ * unsupported.
+ */
+ shdlc->hard_fault = -ECONNRESET;
+ break;
+ default:
+ break;
+ }
+ break;
+ case U_FRAME_UA:
+ if ((shdlc->state == SHDLC_CONNECTING &&
+ shdlc->connect_tries > 0) ||
+ (shdlc->state == SHDLC_NEGOTIATING)) {
+ llc_shdlc_connect_complete(shdlc, 0);
+ shdlc->state = SHDLC_CONNECTED;
+ }
+ break;
+ default:
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+static void llc_shdlc_handle_rcv_queue(struct llc_shdlc *shdlc)
+{
+ struct sk_buff *skb;
+ u8 control;
+ int nr;
+ int ns;
+ enum sframe_type s_frame_type;
+ enum uframe_modifier u_frame_modifier;
+
+ if (shdlc->rcv_q.qlen)
+ pr_debug("rcvQlen=%d\n", shdlc->rcv_q.qlen);
+
+ while ((skb = skb_dequeue(&shdlc->rcv_q)) != NULL) {
+ control = skb->data[0];
+ skb_pull(skb, 1);
+ switch (control & SHDLC_CONTROL_HEAD_MASK) {
+ case SHDLC_CONTROL_HEAD_I:
+ case SHDLC_CONTROL_HEAD_I2:
+ if (shdlc->state == SHDLC_HALF_CONNECTED)
+ shdlc->state = SHDLC_CONNECTED;
+
+ ns = (control & SHDLC_CONTROL_NS_MASK) >> 3;
+ nr = control & SHDLC_CONTROL_NR_MASK;
+ llc_shdlc_rcv_i_frame(shdlc, skb, ns, nr);
+ break;
+ case SHDLC_CONTROL_HEAD_S:
+ if (shdlc->state == SHDLC_HALF_CONNECTED)
+ shdlc->state = SHDLC_CONNECTED;
+
+ s_frame_type = (control & SHDLC_CONTROL_TYPE_MASK) >> 3;
+ nr = control & SHDLC_CONTROL_NR_MASK;
+ llc_shdlc_rcv_s_frame(shdlc, s_frame_type, nr);
+ kfree_skb(skb);
+ break;
+ case SHDLC_CONTROL_HEAD_U:
+ u_frame_modifier = control & SHDLC_CONTROL_M_MASK;
+ llc_shdlc_rcv_u_frame(shdlc, skb, u_frame_modifier);
+ break;
+ default:
+ pr_err("UNKNOWN Control=%d\n", control);
+ kfree_skb(skb);
+ break;
+ }
+ }
+}
+
+static int llc_shdlc_w_used(int ns, int dnr)
+{
+ int unack_count;
+
+ if (dnr <= ns)
+ unack_count = ns - dnr;
+ else
+ unack_count = 8 - dnr + ns;
+
+ return unack_count;
+}
+
+/* Send frames according to algorithm at spec:10.8.1 */
+static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc)
+{
+ struct sk_buff *skb;
+ int r;
+ unsigned long time_sent;
+
+ if (shdlc->send_q.qlen)
+ pr_debug
+ ("sendQlen=%d ns=%d dnr=%d rnr=%s w_room=%d unackQlen=%d\n",
+ shdlc->send_q.qlen, shdlc->ns, shdlc->dnr,
+ shdlc->rnr == false ? "false" : "true",
+ shdlc->w - llc_shdlc_w_used(shdlc->ns, shdlc->dnr),
+ shdlc->ack_pending_q.qlen);
+
+ while (shdlc->send_q.qlen && shdlc->ack_pending_q.qlen < shdlc->w &&
+ (shdlc->rnr == false)) {
+
+ if (shdlc->t1_active) {
+ del_timer_sync(&shdlc->t1_timer);
+ shdlc->t1_active = false;
+ pr_debug("Stopped T1(send ack)\n");
+ }
+
+ skb = skb_dequeue(&shdlc->send_q);
+
+ *skb_push(skb, 1) = SHDLC_CONTROL_HEAD_I | (shdlc->ns << 3) |
+ shdlc->nr;
+
+ pr_debug("Sending I-Frame %d, waiting to rcv %d\n", shdlc->ns,
+ shdlc->nr);
+ SHDLC_DUMP_SKB("shdlc frame written", skb);
+
+ r = shdlc->xmit_to_drv(shdlc->hdev, skb);
+ if (r < 0) {
+ shdlc->hard_fault = r;
+ break;
+ }
+
+ shdlc->ns = (shdlc->ns + 1) % 8;
+
+ time_sent = jiffies;
+ *(unsigned long *)skb->cb = time_sent;
+
+ skb_queue_tail(&shdlc->ack_pending_q, skb);
+
+ if (shdlc->t2_active == false) {
+ shdlc->t2_active = true;
+ mod_timer(&shdlc->t2_timer, time_sent +
+ msecs_to_jiffies(SHDLC_T2_VALUE_MS));
+ pr_debug("Started T2 (retransmit)\n");
+ }
+ }
+}
+
+static void llc_shdlc_connect_timeout(unsigned long data)
+{
+ struct llc_shdlc *shdlc = (struct llc_shdlc *)data;
+
+ pr_debug("\n");
+
+ schedule_work(&shdlc->sm_work);
+}
+
+static void llc_shdlc_t1_timeout(unsigned long data)
+{
+ struct llc_shdlc *shdlc = (struct llc_shdlc *)data;
+
+ pr_debug("SoftIRQ: need to send ack\n");
+
+ schedule_work(&shdlc->sm_work);
+}
+
+static void llc_shdlc_t2_timeout(unsigned long data)
+{
+ struct llc_shdlc *shdlc = (struct llc_shdlc *)data;
+
+ pr_debug("SoftIRQ: need to retransmit\n");
+
+ schedule_work(&shdlc->sm_work);
+}
+
+static void llc_shdlc_sm_work(struct work_struct *work)
+{
+ struct llc_shdlc *shdlc = container_of(work, struct llc_shdlc, sm_work);
+ int r;
+
+ pr_debug("\n");
+
+ mutex_lock(&shdlc->state_mutex);
+
+ switch (shdlc->state) {
+ case SHDLC_DISCONNECTED:
+ skb_queue_purge(&shdlc->rcv_q);
+ skb_queue_purge(&shdlc->send_q);
+ skb_queue_purge(&shdlc->ack_pending_q);
+ break;
+ case SHDLC_CONNECTING:
+ if (shdlc->hard_fault) {
+ llc_shdlc_connect_complete(shdlc, shdlc->hard_fault);
+ break;
+ }
+
+ if (shdlc->connect_tries++ < 5)
+ r = llc_shdlc_connect_initiate(shdlc);
+ else
+ r = -ETIME;
+ if (r < 0) {
+ llc_shdlc_connect_complete(shdlc, r);
+ } else {
+ mod_timer(&shdlc->connect_timer, jiffies +
+ msecs_to_jiffies(SHDLC_CONNECT_VALUE_MS));
+
+ shdlc->state = SHDLC_NEGOTIATING;
+ }
+ break;
+ case SHDLC_NEGOTIATING:
+ if (timer_pending(&shdlc->connect_timer) == 0) {
+ shdlc->state = SHDLC_CONNECTING;
+ schedule_work(&shdlc->sm_work);
+ }
+
+ llc_shdlc_handle_rcv_queue(shdlc);
+
+ if (shdlc->hard_fault) {
+ llc_shdlc_connect_complete(shdlc, shdlc->hard_fault);
+ break;
+ }
+ break;
+ case SHDLC_HALF_CONNECTED:
+ case SHDLC_CONNECTED:
+ llc_shdlc_handle_rcv_queue(shdlc);
+ llc_shdlc_handle_send_queue(shdlc);
+
+ if (shdlc->t1_active && timer_pending(&shdlc->t1_timer) == 0) {
+ pr_debug
+ ("Handle T1(send ack) elapsed (T1 now inactive)\n");
+
+ shdlc->t1_active = false;
+ r = llc_shdlc_send_s_frame(shdlc, S_FRAME_RR,
+ shdlc->nr);
+ if (r < 0)
+ shdlc->hard_fault = r;
+ }
+
+ if (shdlc->t2_active && timer_pending(&shdlc->t2_timer) == 0) {
+ pr_debug
+ ("Handle T2(retransmit) elapsed (T2 inactive)\n");
+
+ shdlc->t2_active = false;
+
+ llc_shdlc_requeue_ack_pending(shdlc);
+ llc_shdlc_handle_send_queue(shdlc);
+ }
+
+ if (shdlc->hard_fault)
+ shdlc->llc_failure(shdlc->hdev, shdlc->hard_fault);
+ break;
+ default:
+ break;
+ }
+ mutex_unlock(&shdlc->state_mutex);
+}
+
+/*
+ * Called from syscall context to establish shdlc link. Sleeps until
+ * link is ready or failure.
+ */
+static int llc_shdlc_connect(struct llc_shdlc *shdlc)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(connect_wq);
+
+ pr_debug("\n");
+
+ mutex_lock(&shdlc->state_mutex);
+
+ shdlc->state = SHDLC_CONNECTING;
+ shdlc->connect_wq = &connect_wq;
+ shdlc->connect_tries = 0;
+ shdlc->connect_result = 1;
+
+ mutex_unlock(&shdlc->state_mutex);
+
+ schedule_work(&shdlc->sm_work);
+
+ wait_event(connect_wq, shdlc->connect_result != 1);
+
+ return shdlc->connect_result;
+}
+
+static void llc_shdlc_disconnect(struct llc_shdlc *shdlc)
+{
+ pr_debug("\n");
+
+ mutex_lock(&shdlc->state_mutex);
+
+ shdlc->state = SHDLC_DISCONNECTED;
+
+ mutex_unlock(&shdlc->state_mutex);
+
+ schedule_work(&shdlc->sm_work);
+}
+
+/*
+ * Receive an incoming shdlc frame. Frame has already been crc-validated.
+ * skb contains only LLC header and payload.
+ * If skb == NULL, it is a notification that the link below is dead.
+ */
+static void llc_shdlc_recv_frame(struct llc_shdlc *shdlc, struct sk_buff *skb)
+{
+ if (skb == NULL) {
+ pr_err("NULL Frame -> link is dead\n");
+ shdlc->hard_fault = -EREMOTEIO;
+ } else {
+ SHDLC_DUMP_SKB("incoming frame", skb);
+ skb_queue_tail(&shdlc->rcv_q, skb);
+ }
+
+ schedule_work(&shdlc->sm_work);
+}
+
+static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
+ rcv_to_hci_t rcv_to_hci, int tx_headroom,
+ int tx_tailroom, int *rx_headroom, int *rx_tailroom,
+ llc_failure_t llc_failure)
+{
+ struct llc_shdlc *shdlc;
+
+ *rx_headroom = SHDLC_LLC_HEAD_ROOM;
+ *rx_tailroom = 0;
+
+ shdlc = kzalloc(sizeof(struct llc_shdlc), GFP_KERNEL);
+ if (shdlc == NULL)
+ return NULL;
+
+ mutex_init(&shdlc->state_mutex);
+ shdlc->state = SHDLC_DISCONNECTED;
+
+ init_timer(&shdlc->connect_timer);
+ shdlc->connect_timer.data = (unsigned long)shdlc;
+ shdlc->connect_timer.function = llc_shdlc_connect_timeout;
+
+ init_timer(&shdlc->t1_timer);
+ shdlc->t1_timer.data = (unsigned long)shdlc;
+ shdlc->t1_timer.function = llc_shdlc_t1_timeout;
+
+ init_timer(&shdlc->t2_timer);
+ shdlc->t2_timer.data = (unsigned long)shdlc;
+ shdlc->t2_timer.function = llc_shdlc_t2_timeout;
+
+ shdlc->w = SHDLC_MAX_WINDOW;
+ shdlc->srej_support = SHDLC_SREJ_SUPPORT;
+
+ skb_queue_head_init(&shdlc->rcv_q);
+ skb_queue_head_init(&shdlc->send_q);
+ skb_queue_head_init(&shdlc->ack_pending_q);
+
+ INIT_WORK(&shdlc->sm_work, llc_shdlc_sm_work);
+
+ shdlc->hdev = hdev;
+ shdlc->xmit_to_drv = xmit_to_drv;
+ shdlc->rcv_to_hci = rcv_to_hci;
+ shdlc->tx_headroom = tx_headroom;
+ shdlc->tx_tailroom = tx_tailroom;
+ shdlc->llc_failure = llc_failure;
+
+ return shdlc;
+}
+
+static void llc_shdlc_deinit(struct nfc_llc *llc)
+{
+ struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+
+ skb_queue_purge(&shdlc->rcv_q);
+ skb_queue_purge(&shdlc->send_q);
+ skb_queue_purge(&shdlc->ack_pending_q);
+
+ kfree(shdlc);
+}
+
+static int llc_shdlc_start(struct nfc_llc *llc)
+{
+ struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+
+ return llc_shdlc_connect(shdlc);
+}
+
+static int llc_shdlc_stop(struct nfc_llc *llc)
+{
+ struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+
+ llc_shdlc_disconnect(shdlc);
+
+ return 0;
+}
+
+static void llc_shdlc_rcv_from_drv(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+
+ llc_shdlc_recv_frame(shdlc, skb);
+}
+
+static int llc_shdlc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb)
+{
+ struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+
+ skb_queue_tail(&shdlc->send_q, skb);
+
+ schedule_work(&shdlc->sm_work);
+
+ return 0;
+}
+
+static struct nfc_llc_ops llc_shdlc_ops = {
+ .init = llc_shdlc_init,
+ .deinit = llc_shdlc_deinit,
+ .start = llc_shdlc_start,
+ .stop = llc_shdlc_stop,
+ .rcv_from_drv = llc_shdlc_rcv_from_drv,
+ .xmit_from_hci = llc_shdlc_xmit_from_hci,
+};
+
+int nfc_llc_shdlc_register(void)
+{
+ return nfc_llc_register(LLC_SHDLC_NAME, &llc_shdlc_ops);
+}
diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h
new file mode 100644
index 000000000..de1789e3c
--- /dev/null
+++ b/net/nfc/llcp.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2011 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+enum llcp_state {
+ LLCP_CONNECTED = 1, /* wait_for_packet() wants that */
+ LLCP_CONNECTING,
+ LLCP_DISCONNECTING,
+ LLCP_CLOSED,
+ LLCP_BOUND,
+ LLCP_LISTEN,
+};
+
+#define LLCP_DEFAULT_LTO 100
+#define LLCP_DEFAULT_RW 1
+#define LLCP_DEFAULT_MIU 128
+
+#define LLCP_MAX_LTO 0xff
+#define LLCP_MAX_RW 15
+#define LLCP_MAX_MIUX 0x7ff
+#define LLCP_MAX_MIU (LLCP_MAX_MIUX + 128)
+
+#define LLCP_WKS_NUM_SAP 16
+#define LLCP_SDP_NUM_SAP 16
+#define LLCP_LOCAL_NUM_SAP 32
+#define LLCP_LOCAL_SAP_OFFSET (LLCP_WKS_NUM_SAP + LLCP_SDP_NUM_SAP)
+#define LLCP_MAX_SAP (LLCP_WKS_NUM_SAP + LLCP_SDP_NUM_SAP + LLCP_LOCAL_NUM_SAP)
+#define LLCP_SDP_UNBOUND (LLCP_MAX_SAP + 1)
+
+struct nfc_llcp_sock;
+
+struct llcp_sock_list {
+ struct hlist_head head;
+ rwlock_t lock;
+};
+
+struct nfc_llcp_sdp_tlv {
+ u8 *tlv;
+ u8 tlv_len;
+
+ char *uri;
+ u8 tid;
+ u8 sap;
+
+ unsigned long time;
+
+ struct hlist_node node;
+};
+
+struct nfc_llcp_local {
+ struct list_head list;
+ struct nfc_dev *dev;
+
+ struct kref ref;
+
+ struct mutex sdp_lock;
+
+ struct timer_list link_timer;
+ struct sk_buff_head tx_queue;
+ struct work_struct tx_work;
+ struct work_struct rx_work;
+ struct sk_buff *rx_pending;
+ struct work_struct timeout_work;
+
+ u32 target_idx;
+ u8 rf_mode;
+ u8 comm_mode;
+ u8 lto;
+ u8 rw;
+ __be16 miux;
+ unsigned long local_wks; /* Well known services */
+ unsigned long local_sdp; /* Local services */
+ unsigned long local_sap; /* Local SAPs, not available for discovery */
+ atomic_t local_sdp_cnt[LLCP_SDP_NUM_SAP];
+
+ /* local */
+ u8 gb[NFC_MAX_GT_LEN];
+ u8 gb_len;
+
+ /* remote */
+ u8 remote_gb[NFC_MAX_GT_LEN];
+ u8 remote_gb_len;
+
+ u8 remote_version;
+ u16 remote_miu;
+ u16 remote_lto;
+ u8 remote_opt;
+ u16 remote_wks;
+
+ struct mutex sdreq_lock;
+ struct hlist_head pending_sdreqs;
+ struct timer_list sdreq_timer;
+ struct work_struct sdreq_timeout_work;
+ u8 sdreq_next_tid;
+
+ /* sockets array */
+ struct llcp_sock_list sockets;
+ struct llcp_sock_list connecting_sockets;
+ struct llcp_sock_list raw_sockets;
+};
+
+struct nfc_llcp_sock {
+ struct sock sk;
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+ u32 target_idx;
+ u32 nfc_protocol;
+
+ /* Link parameters */
+ u8 ssap;
+ u8 dsap;
+ char *service_name;
+ size_t service_name_len;
+ u8 rw;
+ __be16 miux;
+
+
+ /* Remote link parameters */
+ u8 remote_rw;
+ u16 remote_miu;
+
+ /* Link variables */
+ u8 send_n;
+ u8 send_ack_n;
+ u8 recv_n;
+ u8 recv_ack_n;
+
+ /* Is the remote peer ready to receive */
+ u8 remote_ready;
+
+ /* Reserved source SAP */
+ u8 reserved_ssap;
+
+ struct sk_buff_head tx_queue;
+ struct sk_buff_head tx_pending_queue;
+
+ struct list_head accept_queue;
+ struct sock *parent;
+};
+
+struct nfc_llcp_ui_cb {
+ __u8 dsap;
+ __u8 ssap;
+};
+
+#define nfc_llcp_ui_skb_cb(__skb) ((struct nfc_llcp_ui_cb *)&((__skb)->cb[0]))
+
+#define nfc_llcp_sock(sk) ((struct nfc_llcp_sock *) (sk))
+#define nfc_llcp_dev(sk) (nfc_llcp_sock((sk))->dev)
+
+#define LLCP_HEADER_SIZE 2
+#define LLCP_SEQUENCE_SIZE 1
+#define LLCP_AGF_PDU_HEADER_SIZE 2
+
+/* LLCP versions: 1.1 is 1.0 plus SDP */
+#define LLCP_VERSION_10 0x10
+#define LLCP_VERSION_11 0x11
+
+/* LLCP PDU types */
+#define LLCP_PDU_SYMM 0x0
+#define LLCP_PDU_PAX 0x1
+#define LLCP_PDU_AGF 0x2
+#define LLCP_PDU_UI 0x3
+#define LLCP_PDU_CONNECT 0x4
+#define LLCP_PDU_DISC 0x5
+#define LLCP_PDU_CC 0x6
+#define LLCP_PDU_DM 0x7
+#define LLCP_PDU_FRMR 0x8
+#define LLCP_PDU_SNL 0x9
+#define LLCP_PDU_I 0xc
+#define LLCP_PDU_RR 0xd
+#define LLCP_PDU_RNR 0xe
+
+/* Parameters TLV types */
+#define LLCP_TLV_VERSION 0x1
+#define LLCP_TLV_MIUX 0x2
+#define LLCP_TLV_WKS 0x3
+#define LLCP_TLV_LTO 0x4
+#define LLCP_TLV_RW 0x5
+#define LLCP_TLV_SN 0x6
+#define LLCP_TLV_OPT 0x7
+#define LLCP_TLV_SDREQ 0x8
+#define LLCP_TLV_SDRES 0x9
+#define LLCP_TLV_MAX 0xa
+
+/* Well known LLCP SAP */
+#define LLCP_SAP_SDP 0x1
+#define LLCP_SAP_IP 0x2
+#define LLCP_SAP_OBEX 0x3
+#define LLCP_SAP_SNEP 0x4
+#define LLCP_SAP_MAX 0xff
+
+/* Disconnection reason code */
+#define LLCP_DM_DISC 0x00
+#define LLCP_DM_NOCONN 0x01
+#define LLCP_DM_NOBOUND 0x02
+#define LLCP_DM_REJ 0x03
+
+
+void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *s);
+void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *s);
+void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock);
+struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev);
+struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local);
+int nfc_llcp_local_put(struct nfc_llcp_local *local);
+u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local,
+ struct nfc_llcp_sock *sock);
+u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local);
+void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap);
+int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock);
+void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
+ struct sk_buff *skb, u8 direction);
+
+/* Sock API */
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp);
+void nfc_llcp_sock_free(struct nfc_llcp_sock *sock);
+void nfc_llcp_accept_unlink(struct sock *sk);
+void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk);
+struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock);
+
+/* TLV API */
+int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local,
+ u8 *tlv_array, u16 tlv_array_len);
+int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock,
+ u8 *tlv_array, u16 tlv_array_len);
+
+/* Commands API */
+void nfc_llcp_recv(void *data, struct sk_buff *skb, int err);
+u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length);
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap);
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri,
+ size_t uri_len);
+void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp);
+void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head);
+void nfc_llcp_recv(void *data, struct sk_buff *skb, int err);
+int nfc_llcp_send_symm(struct nfc_dev *dev);
+int nfc_llcp_send_connect(struct nfc_llcp_sock *sock);
+int nfc_llcp_send_cc(struct nfc_llcp_sock *sock);
+int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local,
+ struct hlist_head *tlv_list, size_t tlvs_len);
+int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local,
+ struct hlist_head *tlv_list, size_t tlvs_len);
+int nfc_llcp_send_dm(struct nfc_llcp_local *local, u8 ssap, u8 dsap, u8 reason);
+int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock);
+int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock,
+ struct msghdr *msg, size_t len);
+int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
+ struct msghdr *msg, size_t len);
+int nfc_llcp_send_rr(struct nfc_llcp_sock *sock);
+
+/* Socket API */
+int __init nfc_llcp_sock_init(void);
+void nfc_llcp_sock_exit(void);
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
new file mode 100644
index 000000000..3621a902c
--- /dev/null
+++ b/net/nfc/llcp_commands.c
@@ -0,0 +1,799 @@
+/*
+ * Copyright (C) 2011 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "llcp: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/nfc.h>
+
+#include <net/nfc/nfc.h>
+
+#include "nfc.h"
+#include "llcp.h"
+
+static u8 llcp_tlv_length[LLCP_TLV_MAX] = {
+ 0,
+ 1, /* VERSION */
+ 2, /* MIUX */
+ 2, /* WKS */
+ 1, /* LTO */
+ 1, /* RW */
+ 0, /* SN */
+ 1, /* OPT */
+ 0, /* SDREQ */
+ 2, /* SDRES */
+
+};
+
+static u8 llcp_tlv8(u8 *tlv, u8 type)
+{
+ if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]])
+ return 0;
+
+ return tlv[2];
+}
+
+static u16 llcp_tlv16(u8 *tlv, u8 type)
+{
+ if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]])
+ return 0;
+
+ return be16_to_cpu(*((__be16 *)(tlv + 2)));
+}
+
+
+static u8 llcp_tlv_version(u8 *tlv)
+{
+ return llcp_tlv8(tlv, LLCP_TLV_VERSION);
+}
+
+static u16 llcp_tlv_miux(u8 *tlv)
+{
+ return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff;
+}
+
+static u16 llcp_tlv_wks(u8 *tlv)
+{
+ return llcp_tlv16(tlv, LLCP_TLV_WKS);
+}
+
+static u16 llcp_tlv_lto(u8 *tlv)
+{
+ return llcp_tlv8(tlv, LLCP_TLV_LTO);
+}
+
+static u8 llcp_tlv_opt(u8 *tlv)
+{
+ return llcp_tlv8(tlv, LLCP_TLV_OPT);
+}
+
+static u8 llcp_tlv_rw(u8 *tlv)
+{
+ return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf;
+}
+
+u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length)
+{
+ u8 *tlv, length;
+
+ pr_debug("type %d\n", type);
+
+ if (type >= LLCP_TLV_MAX)
+ return NULL;
+
+ length = llcp_tlv_length[type];
+ if (length == 0 && value_length == 0)
+ return NULL;
+ else if (length == 0)
+ length = value_length;
+
+ *tlv_length = 2 + length;
+ tlv = kzalloc(2 + length, GFP_KERNEL);
+ if (tlv == NULL)
+ return tlv;
+
+ tlv[0] = type;
+ tlv[1] = length;
+ memcpy(tlv + 2, value, length);
+
+ return tlv;
+}
+
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap)
+{
+ struct nfc_llcp_sdp_tlv *sdres;
+ u8 value[2];
+
+ sdres = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL);
+ if (sdres == NULL)
+ return NULL;
+
+ value[0] = tid;
+ value[1] = sap;
+
+ sdres->tlv = nfc_llcp_build_tlv(LLCP_TLV_SDRES, value, 2,
+ &sdres->tlv_len);
+ if (sdres->tlv == NULL) {
+ kfree(sdres);
+ return NULL;
+ }
+
+ sdres->tid = tid;
+ sdres->sap = sap;
+
+ INIT_HLIST_NODE(&sdres->node);
+
+ return sdres;
+}
+
+struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri,
+ size_t uri_len)
+{
+ struct nfc_llcp_sdp_tlv *sdreq;
+
+ pr_debug("uri: %s, len: %zu\n", uri, uri_len);
+
+ sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL);
+ if (sdreq == NULL)
+ return NULL;
+
+ sdreq->tlv_len = uri_len + 3;
+
+ if (uri[uri_len - 1] == 0)
+ sdreq->tlv_len--;
+
+ sdreq->tlv = kzalloc(sdreq->tlv_len + 1, GFP_KERNEL);
+ if (sdreq->tlv == NULL) {
+ kfree(sdreq);
+ return NULL;
+ }
+
+ sdreq->tlv[0] = LLCP_TLV_SDREQ;
+ sdreq->tlv[1] = sdreq->tlv_len - 2;
+ sdreq->tlv[2] = tid;
+
+ sdreq->tid = tid;
+ sdreq->uri = sdreq->tlv + 3;
+ memcpy(sdreq->uri, uri, uri_len);
+
+ sdreq->time = jiffies;
+
+ INIT_HLIST_NODE(&sdreq->node);
+
+ return sdreq;
+}
+
+void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp)
+{
+ kfree(sdp->tlv);
+ kfree(sdp);
+}
+
+void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head)
+{
+ struct nfc_llcp_sdp_tlv *sdp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(sdp, n, head, node) {
+ hlist_del(&sdp->node);
+
+ nfc_llcp_free_sdp_tlv(sdp);
+ }
+}
+
+int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local,
+ u8 *tlv_array, u16 tlv_array_len)
+{
+ u8 *tlv = tlv_array, type, length, offset = 0;
+
+ pr_debug("TLV array length %d\n", tlv_array_len);
+
+ if (local == NULL)
+ return -ENODEV;
+
+ while (offset < tlv_array_len) {
+ type = tlv[0];
+ length = tlv[1];
+
+ pr_debug("type 0x%x length %d\n", type, length);
+
+ switch (type) {
+ case LLCP_TLV_VERSION:
+ local->remote_version = llcp_tlv_version(tlv);
+ break;
+ case LLCP_TLV_MIUX:
+ local->remote_miu = llcp_tlv_miux(tlv) + 128;
+ break;
+ case LLCP_TLV_WKS:
+ local->remote_wks = llcp_tlv_wks(tlv);
+ break;
+ case LLCP_TLV_LTO:
+ local->remote_lto = llcp_tlv_lto(tlv) * 10;
+ break;
+ case LLCP_TLV_OPT:
+ local->remote_opt = llcp_tlv_opt(tlv);
+ break;
+ default:
+ pr_err("Invalid gt tlv value 0x%x\n", type);
+ break;
+ }
+
+ offset += length + 2;
+ tlv += length + 2;
+ }
+
+ pr_debug("version 0x%x miu %d lto %d opt 0x%x wks 0x%x\n",
+ local->remote_version, local->remote_miu,
+ local->remote_lto, local->remote_opt,
+ local->remote_wks);
+
+ return 0;
+}
+
+int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock,
+ u8 *tlv_array, u16 tlv_array_len)
+{
+ u8 *tlv = tlv_array, type, length, offset = 0;
+
+ pr_debug("TLV array length %d\n", tlv_array_len);
+
+ if (sock == NULL)
+ return -ENOTCONN;
+
+ while (offset < tlv_array_len) {
+ type = tlv[0];
+ length = tlv[1];
+
+ pr_debug("type 0x%x length %d\n", type, length);
+
+ switch (type) {
+ case LLCP_TLV_MIUX:
+ sock->remote_miu = llcp_tlv_miux(tlv) + 128;
+ break;
+ case LLCP_TLV_RW:
+ sock->remote_rw = llcp_tlv_rw(tlv);
+ break;
+ case LLCP_TLV_SN:
+ break;
+ default:
+ pr_err("Invalid gt tlv value 0x%x\n", type);
+ break;
+ }
+
+ offset += length + 2;
+ tlv += length + 2;
+ }
+
+ pr_debug("sock %p rw %d miu %d\n", sock,
+ sock->remote_rw, sock->remote_miu);
+
+ return 0;
+}
+
+static struct sk_buff *llcp_add_header(struct sk_buff *pdu,
+ u8 dsap, u8 ssap, u8 ptype)
+{
+ u8 header[2];
+
+ pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap);
+
+ header[0] = (u8)((dsap << 2) | (ptype >> 2));
+ header[1] = (u8)((ptype << 6) | ssap);
+
+ pr_debug("header 0x%x 0x%x\n", header[0], header[1]);
+
+ memcpy(skb_put(pdu, LLCP_HEADER_SIZE), header, LLCP_HEADER_SIZE);
+
+ return pdu;
+}
+
+static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv,
+ u8 tlv_length)
+{
+ /* XXX Add an skb length check */
+
+ if (tlv == NULL)
+ return NULL;
+
+ memcpy(skb_put(pdu, tlv_length), tlv, tlv_length);
+
+ return pdu;
+}
+
+static struct sk_buff *llcp_allocate_pdu(struct nfc_llcp_sock *sock,
+ u8 cmd, u16 size)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (sock->ssap == 0)
+ return NULL;
+
+ skb = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT,
+ size + LLCP_HEADER_SIZE, &err);
+ if (skb == NULL) {
+ pr_err("Could not allocate PDU\n");
+ return NULL;
+ }
+
+ skb = llcp_add_header(skb, sock->dsap, sock->ssap, cmd);
+
+ return skb;
+}
+
+int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock)
+{
+ struct sk_buff *skb;
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+
+ pr_debug("Sending DISC\n");
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ dev = sock->dev;
+ if (dev == NULL)
+ return -ENODEV;
+
+ skb = llcp_allocate_pdu(sock, LLCP_PDU_DISC, 0);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ skb_queue_tail(&local->tx_queue, skb);
+
+ return 0;
+}
+
+int nfc_llcp_send_symm(struct nfc_dev *dev)
+{
+ struct sk_buff *skb;
+ struct nfc_llcp_local *local;
+ u16 size = 0;
+
+ pr_debug("Sending SYMM\n");
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL)
+ return -ENODEV;
+
+ size += LLCP_HEADER_SIZE;
+ size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);
+
+ skb = llcp_add_header(skb, 0, 0, LLCP_PDU_SYMM);
+
+ __net_timestamp(skb);
+
+ nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX);
+
+ return nfc_data_exchange(dev, local->target_idx, skb,
+ nfc_llcp_recv, local);
+}
+
+int nfc_llcp_send_connect(struct nfc_llcp_sock *sock)
+{
+ struct nfc_llcp_local *local;
+ struct sk_buff *skb;
+ u8 *service_name_tlv = NULL, service_name_tlv_length;
+ u8 *miux_tlv = NULL, miux_tlv_length;
+ u8 *rw_tlv = NULL, rw_tlv_length, rw;
+ int err;
+ u16 size = 0;
+ __be16 miux;
+
+ pr_debug("Sending CONNECT\n");
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ if (sock->service_name != NULL) {
+ service_name_tlv = nfc_llcp_build_tlv(LLCP_TLV_SN,
+ sock->service_name,
+ sock->service_name_len,
+ &service_name_tlv_length);
+ size += service_name_tlv_length;
+ }
+
+ /* If the socket parameters are not set, use the local ones */
+ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ?
+ local->miux : sock->miux;
+ rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw;
+
+ miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0,
+ &miux_tlv_length);
+ size += miux_tlv_length;
+
+ rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length);
+ size += rw_tlv_length;
+
+ pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len);
+
+ skb = llcp_allocate_pdu(sock, LLCP_PDU_CONNECT, size);
+ if (skb == NULL) {
+ err = -ENOMEM;
+ goto error_tlv;
+ }
+
+ if (service_name_tlv != NULL)
+ skb = llcp_add_tlv(skb, service_name_tlv,
+ service_name_tlv_length);
+
+ skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
+ skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
+
+ skb_queue_tail(&local->tx_queue, skb);
+
+ return 0;
+
+error_tlv:
+ pr_err("error %d\n", err);
+
+ kfree(service_name_tlv);
+ kfree(miux_tlv);
+ kfree(rw_tlv);
+
+ return err;
+}
+
+int nfc_llcp_send_cc(struct nfc_llcp_sock *sock)
+{
+ struct nfc_llcp_local *local;
+ struct sk_buff *skb;
+ u8 *miux_tlv = NULL, miux_tlv_length;
+ u8 *rw_tlv = NULL, rw_tlv_length, rw;
+ int err;
+ u16 size = 0;
+ __be16 miux;
+
+ pr_debug("Sending CC\n");
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ /* If the socket parameters are not set, use the local ones */
+ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ?
+ local->miux : sock->miux;
+ rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw;
+
+ miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0,
+ &miux_tlv_length);
+ size += miux_tlv_length;
+
+ rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length);
+ size += rw_tlv_length;
+
+ skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size);
+ if (skb == NULL) {
+ err = -ENOMEM;
+ goto error_tlv;
+ }
+
+ skb = llcp_add_tlv(skb, miux_tlv, miux_tlv_length);
+ skb = llcp_add_tlv(skb, rw_tlv, rw_tlv_length);
+
+ skb_queue_tail(&local->tx_queue, skb);
+
+ return 0;
+
+error_tlv:
+ pr_err("error %d\n", err);
+
+ kfree(miux_tlv);
+ kfree(rw_tlv);
+
+ return err;
+}
+
+static struct sk_buff *nfc_llcp_allocate_snl(struct nfc_llcp_local *local,
+ size_t tlv_length)
+{
+ struct sk_buff *skb;
+ struct nfc_dev *dev;
+ u16 size = 0;
+
+ if (local == NULL)
+ return ERR_PTR(-ENODEV);
+
+ dev = local->dev;
+ if (dev == NULL)
+ return ERR_PTR(-ENODEV);
+
+ size += LLCP_HEADER_SIZE;
+ size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;
+ size += tlv_length;
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (skb == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);
+
+ skb = llcp_add_header(skb, LLCP_SAP_SDP, LLCP_SAP_SDP, LLCP_PDU_SNL);
+
+ return skb;
+}
+
+int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local,
+ struct hlist_head *tlv_list, size_t tlvs_len)
+{
+ struct nfc_llcp_sdp_tlv *sdp;
+ struct hlist_node *n;
+ struct sk_buff *skb;
+
+ skb = nfc_llcp_allocate_snl(local, tlvs_len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ hlist_for_each_entry_safe(sdp, n, tlv_list, node) {
+ memcpy(skb_put(skb, sdp->tlv_len), sdp->tlv, sdp->tlv_len);
+
+ hlist_del(&sdp->node);
+
+ nfc_llcp_free_sdp_tlv(sdp);
+ }
+
+ skb_queue_tail(&local->tx_queue, skb);
+
+ return 0;
+}
+
+int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local,
+ struct hlist_head *tlv_list, size_t tlvs_len)
+{
+ struct nfc_llcp_sdp_tlv *sdreq;
+ struct hlist_node *n;
+ struct sk_buff *skb;
+
+ skb = nfc_llcp_allocate_snl(local, tlvs_len);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ mutex_lock(&local->sdreq_lock);
+
+ if (hlist_empty(&local->pending_sdreqs))
+ mod_timer(&local->sdreq_timer,
+ jiffies + msecs_to_jiffies(3 * local->remote_lto));
+
+ hlist_for_each_entry_safe(sdreq, n, tlv_list, node) {
+ pr_debug("tid %d for %s\n", sdreq->tid, sdreq->uri);
+
+ memcpy(skb_put(skb, sdreq->tlv_len), sdreq->tlv,
+ sdreq->tlv_len);
+
+ hlist_del(&sdreq->node);
+
+ hlist_add_head(&sdreq->node, &local->pending_sdreqs);
+ }
+
+ mutex_unlock(&local->sdreq_lock);
+
+ skb_queue_tail(&local->tx_queue, skb);
+
+ return 0;
+}
+
+int nfc_llcp_send_dm(struct nfc_llcp_local *local, u8 ssap, u8 dsap, u8 reason)
+{
+ struct sk_buff *skb;
+ struct nfc_dev *dev;
+ u16 size = 1; /* Reason code */
+
+ pr_debug("Sending DM reason 0x%x\n", reason);
+
+ if (local == NULL)
+ return -ENODEV;
+
+ dev = local->dev;
+ if (dev == NULL)
+ return -ENODEV;
+
+ size += LLCP_HEADER_SIZE;
+ size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);
+
+ skb = llcp_add_header(skb, dsap, ssap, LLCP_PDU_DM);
+
+ memcpy(skb_put(skb, 1), &reason, 1);
+
+ skb_queue_head(&local->tx_queue, skb);
+
+ return 0;
+}
+
+int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *pdu;
+ struct sock *sk = &sock->sk;
+ struct nfc_llcp_local *local;
+ size_t frag_len = 0, remaining_len;
+ u8 *msg_data, *msg_ptr;
+ u16 remote_miu;
+
+ pr_debug("Send I frame len %zd\n", len);
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ /* Remote is ready but has not acknowledged our frames */
+ if((sock->remote_ready &&
+ skb_queue_len(&sock->tx_pending_queue) >= sock->remote_rw &&
+ skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) {
+ pr_err("Pending queue is full %d frames\n",
+ skb_queue_len(&sock->tx_pending_queue));
+ return -ENOBUFS;
+ }
+
+ /* Remote is not ready and we've been queueing enough frames */
+ if ((!sock->remote_ready &&
+ skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) {
+ pr_err("Tx queue is full %d frames\n",
+ skb_queue_len(&sock->tx_queue));
+ return -ENOBUFS;
+ }
+
+ msg_data = kzalloc(len, GFP_KERNEL);
+ if (msg_data == NULL)
+ return -ENOMEM;
+
+ if (memcpy_from_msg(msg_data, msg, len)) {
+ kfree(msg_data);
+ return -EFAULT;
+ }
+
+ remaining_len = len;
+ msg_ptr = msg_data;
+
+ do {
+ remote_miu = sock->remote_miu > LLCP_MAX_MIU ?
+ LLCP_DEFAULT_MIU : sock->remote_miu;
+
+ frag_len = min_t(size_t, remote_miu, remaining_len);
+
+ pr_debug("Fragment %zd bytes remaining %zd",
+ frag_len, remaining_len);
+
+ pdu = llcp_allocate_pdu(sock, LLCP_PDU_I,
+ frag_len + LLCP_SEQUENCE_SIZE);
+ if (pdu == NULL) {
+ kfree(msg_data);
+ return -ENOMEM;
+ }
+
+ skb_put(pdu, LLCP_SEQUENCE_SIZE);
+
+ if (likely(frag_len > 0))
+ memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len);
+
+ skb_queue_tail(&sock->tx_queue, pdu);
+
+ lock_sock(sk);
+
+ nfc_llcp_queue_i_frames(sock);
+
+ release_sock(sk);
+
+ remaining_len -= frag_len;
+ msg_ptr += frag_len;
+ } while (remaining_len > 0);
+
+ kfree(msg_data);
+
+ return len;
+}
+
+int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
+ struct msghdr *msg, size_t len)
+{
+ struct sk_buff *pdu;
+ struct nfc_llcp_local *local;
+ size_t frag_len = 0, remaining_len;
+ u8 *msg_ptr, *msg_data;
+ u16 remote_miu;
+ int err;
+
+ pr_debug("Send UI frame len %zd\n", len);
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ msg_data = kzalloc(len, GFP_KERNEL);
+ if (msg_data == NULL)
+ return -ENOMEM;
+
+ if (memcpy_from_msg(msg_data, msg, len)) {
+ kfree(msg_data);
+ return -EFAULT;
+ }
+
+ remaining_len = len;
+ msg_ptr = msg_data;
+
+ do {
+ remote_miu = sock->remote_miu > LLCP_MAX_MIU ?
+ local->remote_miu : sock->remote_miu;
+
+ frag_len = min_t(size_t, remote_miu, remaining_len);
+
+ pr_debug("Fragment %zd bytes remaining %zd",
+ frag_len, remaining_len);
+
+ pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT,
+ frag_len + LLCP_HEADER_SIZE, &err);
+ if (pdu == NULL) {
+ pr_err("Could not allocate PDU\n");
+ continue;
+ }
+
+ pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI);
+
+ if (likely(frag_len > 0))
+ memcpy(skb_put(pdu, frag_len), msg_ptr, frag_len);
+
+ /* No need to check for the peer RW for UI frames */
+ skb_queue_tail(&local->tx_queue, pdu);
+
+ remaining_len -= frag_len;
+ msg_ptr += frag_len;
+ } while (remaining_len > 0);
+
+ kfree(msg_data);
+
+ return len;
+}
+
+int nfc_llcp_send_rr(struct nfc_llcp_sock *sock)
+{
+ struct sk_buff *skb;
+ struct nfc_llcp_local *local;
+
+ pr_debug("Send rr nr %d\n", sock->recv_n);
+
+ local = sock->local;
+ if (local == NULL)
+ return -ENODEV;
+
+ skb = llcp_allocate_pdu(sock, LLCP_PDU_RR, LLCP_SEQUENCE_SIZE);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ skb_put(skb, LLCP_SEQUENCE_SIZE);
+
+ skb->data[2] = sock->recv_n;
+
+ skb_queue_head(&local->tx_queue, skb);
+
+ return 0;
+}
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
new file mode 100644
index 000000000..b18f07ccb
--- /dev/null
+++ b/net/nfc/llcp_core.c
@@ -0,0 +1,1637 @@
+/*
+ * Copyright (C) 2011 Intel Corporation. All rights reserved.
+ * Copyright (C) 2014 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "llcp: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/nfc.h>
+
+#include "nfc.h"
+#include "llcp.h"
+
+static u8 llcp_magic[3] = {0x46, 0x66, 0x6d};
+
+static LIST_HEAD(llcp_devices);
+
+static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb);
+
+void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_add_node(sk, &l->head);
+ write_unlock(&l->lock);
+}
+
+void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_del_node_init(sk);
+ write_unlock(&l->lock);
+}
+
+void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock)
+{
+ sock->remote_rw = LLCP_DEFAULT_RW;
+ sock->remote_miu = LLCP_MAX_MIU + 1;
+}
+
+static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock)
+{
+ struct nfc_llcp_local *local = sock->local;
+ struct sk_buff *s, *tmp;
+
+ pr_debug("%p\n", &sock->sk);
+
+ skb_queue_purge(&sock->tx_queue);
+ skb_queue_purge(&sock->tx_pending_queue);
+
+ if (local == NULL)
+ return;
+
+ /* Search for local pending SKBs that are related to this socket */
+ skb_queue_walk_safe(&local->tx_queue, s, tmp) {
+ if (s->sk != &sock->sk)
+ continue;
+
+ skb_unlink(s, &local->tx_queue);
+ kfree_skb(s);
+ }
+}
+
+static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device,
+ int err)
+{
+ struct sock *sk;
+ struct hlist_node *tmp;
+ struct nfc_llcp_sock *llcp_sock;
+
+ skb_queue_purge(&local->tx_queue);
+
+ write_lock(&local->sockets.lock);
+
+ sk_for_each_safe(sk, tmp, &local->sockets.head) {
+ llcp_sock = nfc_llcp_sock(sk);
+
+ bh_lock_sock(sk);
+
+ nfc_llcp_socket_purge(llcp_sock);
+
+ if (sk->sk_state == LLCP_CONNECTED)
+ nfc_put_device(llcp_sock->dev);
+
+ if (sk->sk_state == LLCP_LISTEN) {
+ struct nfc_llcp_sock *lsk, *n;
+ struct sock *accept_sk;
+
+ list_for_each_entry_safe(lsk, n,
+ &llcp_sock->accept_queue,
+ accept_queue) {
+ accept_sk = &lsk->sk;
+ bh_lock_sock(accept_sk);
+
+ nfc_llcp_accept_unlink(accept_sk);
+
+ if (err)
+ accept_sk->sk_err = err;
+ accept_sk->sk_state = LLCP_CLOSED;
+ accept_sk->sk_state_change(sk);
+
+ bh_unlock_sock(accept_sk);
+ }
+ }
+
+ if (err)
+ sk->sk_err = err;
+ sk->sk_state = LLCP_CLOSED;
+ sk->sk_state_change(sk);
+
+ bh_unlock_sock(sk);
+
+ sk_del_node_init(sk);
+ }
+
+ write_unlock(&local->sockets.lock);
+
+ /* If we still have a device, we keep the RAW sockets alive */
+ if (device == true)
+ return;
+
+ write_lock(&local->raw_sockets.lock);
+
+ sk_for_each_safe(sk, tmp, &local->raw_sockets.head) {
+ llcp_sock = nfc_llcp_sock(sk);
+
+ bh_lock_sock(sk);
+
+ nfc_llcp_socket_purge(llcp_sock);
+
+ if (err)
+ sk->sk_err = err;
+ sk->sk_state = LLCP_CLOSED;
+ sk->sk_state_change(sk);
+
+ bh_unlock_sock(sk);
+
+ sk_del_node_init(sk);
+ }
+
+ write_unlock(&local->raw_sockets.lock);
+}
+
+struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local)
+{
+ kref_get(&local->ref);
+
+ return local;
+}
+
+static void local_cleanup(struct nfc_llcp_local *local)
+{
+ nfc_llcp_socket_release(local, false, ENXIO);
+ del_timer_sync(&local->link_timer);
+ skb_queue_purge(&local->tx_queue);
+ cancel_work_sync(&local->tx_work);
+ cancel_work_sync(&local->rx_work);
+ cancel_work_sync(&local->timeout_work);
+ kfree_skb(local->rx_pending);
+ del_timer_sync(&local->sdreq_timer);
+ cancel_work_sync(&local->sdreq_timeout_work);
+ nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs);
+}
+
+static void local_release(struct kref *ref)
+{
+ struct nfc_llcp_local *local;
+
+ local = container_of(ref, struct nfc_llcp_local, ref);
+
+ list_del(&local->list);
+ local_cleanup(local);
+ kfree(local);
+}
+
+int nfc_llcp_local_put(struct nfc_llcp_local *local)
+{
+ if (local == NULL)
+ return 0;
+
+ return kref_put(&local->ref, local_release);
+}
+
+static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local,
+ u8 ssap, u8 dsap)
+{
+ struct sock *sk;
+ struct nfc_llcp_sock *llcp_sock, *tmp_sock;
+
+ pr_debug("ssap dsap %d %d\n", ssap, dsap);
+
+ if (ssap == 0 && dsap == 0)
+ return NULL;
+
+ read_lock(&local->sockets.lock);
+
+ llcp_sock = NULL;
+
+ sk_for_each(sk, &local->sockets.head) {
+ tmp_sock = nfc_llcp_sock(sk);
+
+ if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) {
+ llcp_sock = tmp_sock;
+ break;
+ }
+ }
+
+ read_unlock(&local->sockets.lock);
+
+ if (llcp_sock == NULL)
+ return NULL;
+
+ sock_hold(&llcp_sock->sk);
+
+ return llcp_sock;
+}
+
+static void nfc_llcp_sock_put(struct nfc_llcp_sock *sock)
+{
+ sock_put(&sock->sk);
+}
+
+static void nfc_llcp_timeout_work(struct work_struct *work)
+{
+ struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
+ timeout_work);
+
+ nfc_dep_link_down(local->dev);
+}
+
+static void nfc_llcp_symm_timer(unsigned long data)
+{
+ struct nfc_llcp_local *local = (struct nfc_llcp_local *) data;
+
+ pr_err("SYMM timeout\n");
+
+ schedule_work(&local->timeout_work);
+}
+
+static void nfc_llcp_sdreq_timeout_work(struct work_struct *work)
+{
+ unsigned long time;
+ HLIST_HEAD(nl_sdres_list);
+ struct hlist_node *n;
+ struct nfc_llcp_sdp_tlv *sdp;
+ struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
+ sdreq_timeout_work);
+
+ mutex_lock(&local->sdreq_lock);
+
+ time = jiffies - msecs_to_jiffies(3 * local->remote_lto);
+
+ hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) {
+ if (time_after(sdp->time, time))
+ continue;
+
+ sdp->sap = LLCP_SDP_UNBOUND;
+
+ hlist_del(&sdp->node);
+
+ hlist_add_head(&sdp->node, &nl_sdres_list);
+ }
+
+ if (!hlist_empty(&local->pending_sdreqs))
+ mod_timer(&local->sdreq_timer,
+ jiffies + msecs_to_jiffies(3 * local->remote_lto));
+
+ mutex_unlock(&local->sdreq_lock);
+
+ if (!hlist_empty(&nl_sdres_list))
+ nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list);
+}
+
+static void nfc_llcp_sdreq_timer(unsigned long data)
+{
+ struct nfc_llcp_local *local = (struct nfc_llcp_local *) data;
+
+ schedule_work(&local->sdreq_timeout_work);
+}
+
+struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev)
+{
+ struct nfc_llcp_local *local;
+
+ list_for_each_entry(local, &llcp_devices, list)
+ if (local->dev == dev)
+ return local;
+
+ pr_debug("No device found\n");
+
+ return NULL;
+}
+
+static char *wks[] = {
+ NULL,
+ NULL, /* SDP */
+ "urn:nfc:sn:ip",
+ "urn:nfc:sn:obex",
+ "urn:nfc:sn:snep",
+};
+
+static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len)
+{
+ int sap, num_wks;
+
+ pr_debug("%s\n", service_name);
+
+ if (service_name == NULL)
+ return -EINVAL;
+
+ num_wks = ARRAY_SIZE(wks);
+
+ for (sap = 0; sap < num_wks; sap++) {
+ if (wks[sap] == NULL)
+ continue;
+
+ if (strncmp(wks[sap], service_name, service_name_len) == 0)
+ return sap;
+ }
+
+ return -EINVAL;
+}
+
+static
+struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local,
+ u8 *sn, size_t sn_len)
+{
+ struct sock *sk;
+ struct nfc_llcp_sock *llcp_sock, *tmp_sock;
+
+ pr_debug("sn %zd %p\n", sn_len, sn);
+
+ if (sn == NULL || sn_len == 0)
+ return NULL;
+
+ read_lock(&local->sockets.lock);
+
+ llcp_sock = NULL;
+
+ sk_for_each(sk, &local->sockets.head) {
+ tmp_sock = nfc_llcp_sock(sk);
+
+ pr_debug("llcp sock %p\n", tmp_sock);
+
+ if (tmp_sock->sk.sk_type == SOCK_STREAM &&
+ tmp_sock->sk.sk_state != LLCP_LISTEN)
+ continue;
+
+ if (tmp_sock->sk.sk_type == SOCK_DGRAM &&
+ tmp_sock->sk.sk_state != LLCP_BOUND)
+ continue;
+
+ if (tmp_sock->service_name == NULL ||
+ tmp_sock->service_name_len == 0)
+ continue;
+
+ if (tmp_sock->service_name_len != sn_len)
+ continue;
+
+ if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) {
+ llcp_sock = tmp_sock;
+ break;
+ }
+ }
+
+ read_unlock(&local->sockets.lock);
+
+ pr_debug("Found llcp sock %p\n", llcp_sock);
+
+ return llcp_sock;
+}
+
+u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local,
+ struct nfc_llcp_sock *sock)
+{
+ mutex_lock(&local->sdp_lock);
+
+ if (sock->service_name != NULL && sock->service_name_len > 0) {
+ int ssap = nfc_llcp_wks_sap(sock->service_name,
+ sock->service_name_len);
+
+ if (ssap > 0) {
+ pr_debug("WKS %d\n", ssap);
+
+ /* This is a WKS, let's check if it's free */
+ if (local->local_wks & BIT(ssap)) {
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_SAP_MAX;
+ }
+
+ set_bit(ssap, &local->local_wks);
+ mutex_unlock(&local->sdp_lock);
+
+ return ssap;
+ }
+
+ /*
+ * Check if there already is a non WKS socket bound
+ * to this service name.
+ */
+ if (nfc_llcp_sock_from_sn(local, sock->service_name,
+ sock->service_name_len) != NULL) {
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_SAP_MAX;
+ }
+
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_SDP_UNBOUND;
+
+ } else if (sock->ssap != 0 && sock->ssap < LLCP_WKS_NUM_SAP) {
+ if (!test_bit(sock->ssap, &local->local_wks)) {
+ set_bit(sock->ssap, &local->local_wks);
+ mutex_unlock(&local->sdp_lock);
+
+ return sock->ssap;
+ }
+ }
+
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_SAP_MAX;
+}
+
+u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local)
+{
+ u8 local_ssap;
+
+ mutex_lock(&local->sdp_lock);
+
+ local_ssap = find_first_zero_bit(&local->local_sap, LLCP_LOCAL_NUM_SAP);
+ if (local_ssap == LLCP_LOCAL_NUM_SAP) {
+ mutex_unlock(&local->sdp_lock);
+ return LLCP_SAP_MAX;
+ }
+
+ set_bit(local_ssap, &local->local_sap);
+
+ mutex_unlock(&local->sdp_lock);
+
+ return local_ssap + LLCP_LOCAL_SAP_OFFSET;
+}
+
+void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap)
+{
+ u8 local_ssap;
+ unsigned long *sdp;
+
+ if (ssap < LLCP_WKS_NUM_SAP) {
+ local_ssap = ssap;
+ sdp = &local->local_wks;
+ } else if (ssap < LLCP_LOCAL_NUM_SAP) {
+ atomic_t *client_cnt;
+
+ local_ssap = ssap - LLCP_WKS_NUM_SAP;
+ sdp = &local->local_sdp;
+ client_cnt = &local->local_sdp_cnt[local_ssap];
+
+ pr_debug("%d clients\n", atomic_read(client_cnt));
+
+ mutex_lock(&local->sdp_lock);
+
+ if (atomic_dec_and_test(client_cnt)) {
+ struct nfc_llcp_sock *l_sock;
+
+ pr_debug("No more clients for SAP %d\n", ssap);
+
+ clear_bit(local_ssap, sdp);
+
+ /* Find the listening sock and set it back to UNBOUND */
+ l_sock = nfc_llcp_sock_get(local, ssap, LLCP_SAP_SDP);
+ if (l_sock) {
+ l_sock->ssap = LLCP_SDP_UNBOUND;
+ nfc_llcp_sock_put(l_sock);
+ }
+ }
+
+ mutex_unlock(&local->sdp_lock);
+
+ return;
+ } else if (ssap < LLCP_MAX_SAP) {
+ local_ssap = ssap - LLCP_LOCAL_NUM_SAP;
+ sdp = &local->local_sap;
+ } else {
+ return;
+ }
+
+ mutex_lock(&local->sdp_lock);
+
+ clear_bit(local_ssap, sdp);
+
+ mutex_unlock(&local->sdp_lock);
+}
+
+static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local)
+{
+ u8 ssap;
+
+ mutex_lock(&local->sdp_lock);
+
+ ssap = find_first_zero_bit(&local->local_sdp, LLCP_SDP_NUM_SAP);
+ if (ssap == LLCP_SDP_NUM_SAP) {
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_SAP_MAX;
+ }
+
+ pr_debug("SDP ssap %d\n", LLCP_WKS_NUM_SAP + ssap);
+
+ set_bit(ssap, &local->local_sdp);
+
+ mutex_unlock(&local->sdp_lock);
+
+ return LLCP_WKS_NUM_SAP + ssap;
+}
+
+static int nfc_llcp_build_gb(struct nfc_llcp_local *local)
+{
+ u8 *gb_cur, *version_tlv, version, version_length;
+ u8 *lto_tlv, lto_length;
+ u8 *wks_tlv, wks_length;
+ u8 *miux_tlv, miux_length;
+ __be16 wks = cpu_to_be16(local->local_wks);
+ u8 gb_len = 0;
+ int ret = 0;
+
+ version = LLCP_VERSION_11;
+ version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version,
+ 1, &version_length);
+ gb_len += version_length;
+
+ lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, &lto_length);
+ gb_len += lto_length;
+
+ pr_debug("Local wks 0x%lx\n", local->local_wks);
+ wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length);
+ gb_len += wks_length;
+
+ miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0,
+ &miux_length);
+ gb_len += miux_length;
+
+ gb_len += ARRAY_SIZE(llcp_magic);
+
+ if (gb_len > NFC_MAX_GT_LEN) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ gb_cur = local->gb;
+
+ memcpy(gb_cur, llcp_magic, ARRAY_SIZE(llcp_magic));
+ gb_cur += ARRAY_SIZE(llcp_magic);
+
+ memcpy(gb_cur, version_tlv, version_length);
+ gb_cur += version_length;
+
+ memcpy(gb_cur, lto_tlv, lto_length);
+ gb_cur += lto_length;
+
+ memcpy(gb_cur, wks_tlv, wks_length);
+ gb_cur += wks_length;
+
+ memcpy(gb_cur, miux_tlv, miux_length);
+ gb_cur += miux_length;
+
+ local->gb_len = gb_len;
+
+out:
+ kfree(version_tlv);
+ kfree(lto_tlv);
+ kfree(wks_tlv);
+ kfree(miux_tlv);
+
+ return ret;
+}
+
+u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len)
+{
+ struct nfc_llcp_local *local;
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ *general_bytes_len = 0;
+ return NULL;
+ }
+
+ nfc_llcp_build_gb(local);
+
+ *general_bytes_len = local->gb_len;
+
+ return local->gb;
+}
+
+int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len)
+{
+ struct nfc_llcp_local *local;
+
+ if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN)
+ return -EINVAL;
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ pr_err("No LLCP device\n");
+ return -ENODEV;
+ }
+
+ memset(local->remote_gb, 0, NFC_MAX_GT_LEN);
+ memcpy(local->remote_gb, gb, gb_len);
+ local->remote_gb_len = gb_len;
+
+ if (memcmp(local->remote_gb, llcp_magic, 3)) {
+ pr_err("MAC does not support LLCP\n");
+ return -EINVAL;
+ }
+
+ return nfc_llcp_parse_gb_tlv(local,
+ &local->remote_gb[3],
+ local->remote_gb_len - 3);
+}
+
+static u8 nfc_llcp_dsap(struct sk_buff *pdu)
+{
+ return (pdu->data[0] & 0xfc) >> 2;
+}
+
+static u8 nfc_llcp_ptype(struct sk_buff *pdu)
+{
+ return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6);
+}
+
+static u8 nfc_llcp_ssap(struct sk_buff *pdu)
+{
+ return pdu->data[1] & 0x3f;
+}
+
+static u8 nfc_llcp_ns(struct sk_buff *pdu)
+{
+ return pdu->data[2] >> 4;
+}
+
+static u8 nfc_llcp_nr(struct sk_buff *pdu)
+{
+ return pdu->data[2] & 0xf;
+}
+
+static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu)
+{
+ pdu->data[2] = (sock->send_n << 4) | (sock->recv_n);
+ sock->send_n = (sock->send_n + 1) % 16;
+ sock->recv_ack_n = (sock->recv_n - 1) % 16;
+}
+
+void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
+ struct sk_buff *skb, u8 direction)
+{
+ struct sk_buff *skb_copy = NULL, *nskb;
+ struct sock *sk;
+ u8 *data;
+
+ read_lock(&local->raw_sockets.lock);
+
+ sk_for_each(sk, &local->raw_sockets.head) {
+ if (sk->sk_state != LLCP_BOUND)
+ continue;
+
+ if (skb_copy == NULL) {
+ skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE,
+ GFP_ATOMIC, true);
+
+ if (skb_copy == NULL)
+ continue;
+
+ data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE);
+
+ data[0] = local->dev ? local->dev->idx : 0xFF;
+ data[1] = direction & 0x01;
+ data[1] |= (RAW_PAYLOAD_LLCP << 1);
+ }
+
+ nskb = skb_clone(skb_copy, GFP_ATOMIC);
+ if (!nskb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, nskb))
+ kfree_skb(nskb);
+ }
+
+ read_unlock(&local->raw_sockets.lock);
+
+ kfree_skb(skb_copy);
+}
+
+static void nfc_llcp_tx_work(struct work_struct *work)
+{
+ struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
+ tx_work);
+ struct sk_buff *skb;
+ struct sock *sk;
+ struct nfc_llcp_sock *llcp_sock;
+
+ skb = skb_dequeue(&local->tx_queue);
+ if (skb != NULL) {
+ sk = skb->sk;
+ llcp_sock = nfc_llcp_sock(sk);
+
+ if (llcp_sock == NULL && nfc_llcp_ptype(skb) == LLCP_PDU_I) {
+ kfree_skb(skb);
+ nfc_llcp_send_symm(local->dev);
+ } else if (llcp_sock && !llcp_sock->remote_ready) {
+ skb_queue_head(&local->tx_queue, skb);
+ nfc_llcp_send_symm(local->dev);
+ } else {
+ struct sk_buff *copy_skb = NULL;
+ u8 ptype = nfc_llcp_ptype(skb);
+ int ret;
+
+ pr_debug("Sending pending skb\n");
+ print_hex_dump(KERN_DEBUG, "LLCP Tx: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ skb->data, skb->len, true);
+
+ if (ptype == LLCP_PDU_DISC && sk != NULL &&
+ sk->sk_state == LLCP_DISCONNECTING) {
+ nfc_llcp_sock_unlink(&local->sockets, sk);
+ sock_orphan(sk);
+ sock_put(sk);
+ }
+
+ if (ptype == LLCP_PDU_I)
+ copy_skb = skb_copy(skb, GFP_ATOMIC);
+
+ __net_timestamp(skb);
+
+ nfc_llcp_send_to_raw_sock(local, skb,
+ NFC_DIRECTION_TX);
+
+ ret = nfc_data_exchange(local->dev, local->target_idx,
+ skb, nfc_llcp_recv, local);
+
+ if (ret) {
+ kfree_skb(copy_skb);
+ goto out;
+ }
+
+ if (ptype == LLCP_PDU_I && copy_skb)
+ skb_queue_tail(&llcp_sock->tx_pending_queue,
+ copy_skb);
+ }
+ } else {
+ nfc_llcp_send_symm(local->dev);
+ }
+
+out:
+ mod_timer(&local->link_timer,
+ jiffies + msecs_to_jiffies(2 * local->remote_lto));
+}
+
+static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local *local,
+ u8 ssap)
+{
+ struct sock *sk;
+ struct nfc_llcp_sock *llcp_sock;
+
+ read_lock(&local->connecting_sockets.lock);
+
+ sk_for_each(sk, &local->connecting_sockets.head) {
+ llcp_sock = nfc_llcp_sock(sk);
+
+ if (llcp_sock->ssap == ssap) {
+ sock_hold(&llcp_sock->sk);
+ goto out;
+ }
+ }
+
+ llcp_sock = NULL;
+
+out:
+ read_unlock(&local->connecting_sockets.lock);
+
+ return llcp_sock;
+}
+
+static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local,
+ u8 *sn, size_t sn_len)
+{
+ struct nfc_llcp_sock *llcp_sock;
+
+ llcp_sock = nfc_llcp_sock_from_sn(local, sn, sn_len);
+
+ if (llcp_sock == NULL)
+ return NULL;
+
+ sock_hold(&llcp_sock->sk);
+
+ return llcp_sock;
+}
+
+static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len)
+{
+ u8 *tlv = &skb->data[2], type, length;
+ size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0;
+
+ while (offset < tlv_array_len) {
+ type = tlv[0];
+ length = tlv[1];
+
+ pr_debug("type 0x%x length %d\n", type, length);
+
+ if (type == LLCP_TLV_SN) {
+ *sn_len = length;
+ return &tlv[2];
+ }
+
+ offset += length + 2;
+ tlv += length + 2;
+ }
+
+ return NULL;
+}
+
+static void nfc_llcp_recv_ui(struct nfc_llcp_local *local,
+ struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ struct nfc_llcp_ui_cb *ui_cb;
+ u8 dsap, ssap;
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ ui_cb = nfc_llcp_ui_skb_cb(skb);
+ ui_cb->dsap = dsap;
+ ui_cb->ssap = ssap;
+
+ pr_debug("%d %d\n", dsap, ssap);
+
+ /* We're looking for a bound socket, not a client one */
+ llcp_sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP);
+ if (llcp_sock == NULL || llcp_sock->sk.sk_type != SOCK_DGRAM)
+ return;
+
+ /* There is no sequence with UI frames */
+ skb_pull(skb, LLCP_HEADER_SIZE);
+ if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) {
+ /*
+ * UI frames will be freed from the socket layer, so we
+ * need to keep them alive until someone receives them.
+ */
+ skb_get(skb);
+ } else {
+ pr_err("Receive queue is full\n");
+ }
+
+ nfc_llcp_sock_put(llcp_sock);
+}
+
+static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
+ struct sk_buff *skb)
+{
+ struct sock *new_sk, *parent;
+ struct nfc_llcp_sock *sock, *new_sock;
+ u8 dsap, ssap, reason;
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ pr_debug("%d %d\n", dsap, ssap);
+
+ if (dsap != LLCP_SAP_SDP) {
+ sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP);
+ if (sock == NULL || sock->sk.sk_state != LLCP_LISTEN) {
+ reason = LLCP_DM_NOBOUND;
+ goto fail;
+ }
+ } else {
+ u8 *sn;
+ size_t sn_len;
+
+ sn = nfc_llcp_connect_sn(skb, &sn_len);
+ if (sn == NULL) {
+ reason = LLCP_DM_NOBOUND;
+ goto fail;
+ }
+
+ pr_debug("Service name length %zu\n", sn_len);
+
+ sock = nfc_llcp_sock_get_sn(local, sn, sn_len);
+ if (sock == NULL) {
+ reason = LLCP_DM_NOBOUND;
+ goto fail;
+ }
+ }
+
+ lock_sock(&sock->sk);
+
+ parent = &sock->sk;
+
+ if (sk_acceptq_is_full(parent)) {
+ reason = LLCP_DM_REJ;
+ release_sock(&sock->sk);
+ sock_put(&sock->sk);
+ goto fail;
+ }
+
+ if (sock->ssap == LLCP_SDP_UNBOUND) {
+ u8 ssap = nfc_llcp_reserve_sdp_ssap(local);
+
+ pr_debug("First client, reserving %d\n", ssap);
+
+ if (ssap == LLCP_SAP_MAX) {
+ reason = LLCP_DM_REJ;
+ release_sock(&sock->sk);
+ sock_put(&sock->sk);
+ goto fail;
+ }
+
+ sock->ssap = ssap;
+ }
+
+ new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC);
+ if (new_sk == NULL) {
+ reason = LLCP_DM_REJ;
+ release_sock(&sock->sk);
+ sock_put(&sock->sk);
+ goto fail;
+ }
+
+ new_sock = nfc_llcp_sock(new_sk);
+ new_sock->dev = local->dev;
+ new_sock->local = nfc_llcp_local_get(local);
+ new_sock->rw = sock->rw;
+ new_sock->miux = sock->miux;
+ new_sock->nfc_protocol = sock->nfc_protocol;
+ new_sock->dsap = ssap;
+ new_sock->target_idx = local->target_idx;
+ new_sock->parent = parent;
+ new_sock->ssap = sock->ssap;
+ if (sock->ssap < LLCP_LOCAL_NUM_SAP && sock->ssap >= LLCP_WKS_NUM_SAP) {
+ atomic_t *client_count;
+
+ pr_debug("reserved_ssap %d for %p\n", sock->ssap, new_sock);
+
+ client_count =
+ &local->local_sdp_cnt[sock->ssap - LLCP_WKS_NUM_SAP];
+
+ atomic_inc(client_count);
+ new_sock->reserved_ssap = sock->ssap;
+ }
+
+ nfc_llcp_parse_connection_tlv(new_sock, &skb->data[LLCP_HEADER_SIZE],
+ skb->len - LLCP_HEADER_SIZE);
+
+ pr_debug("new sock %p sk %p\n", new_sock, &new_sock->sk);
+
+ nfc_llcp_sock_link(&local->sockets, new_sk);
+
+ nfc_llcp_accept_enqueue(&sock->sk, new_sk);
+
+ nfc_get_device(local->dev->idx);
+
+ new_sk->sk_state = LLCP_CONNECTED;
+
+ /* Wake the listening processes */
+ parent->sk_data_ready(parent);
+
+ /* Send CC */
+ nfc_llcp_send_cc(new_sock);
+
+ release_sock(&sock->sk);
+ sock_put(&sock->sk);
+
+ return;
+
+fail:
+ /* Send DM */
+ nfc_llcp_send_dm(local, dsap, ssap, reason);
+}
+
+int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock)
+{
+ int nr_frames = 0;
+ struct nfc_llcp_local *local = sock->local;
+
+ pr_debug("Remote ready %d tx queue len %d remote rw %d",
+ sock->remote_ready, skb_queue_len(&sock->tx_pending_queue),
+ sock->remote_rw);
+
+ /* Try to queue some I frames for transmission */
+ while (sock->remote_ready &&
+ skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) {
+ struct sk_buff *pdu;
+
+ pdu = skb_dequeue(&sock->tx_queue);
+ if (pdu == NULL)
+ break;
+
+ /* Update N(S)/N(R) */
+ nfc_llcp_set_nrns(sock, pdu);
+
+ skb_queue_tail(&local->tx_queue, pdu);
+ nr_frames++;
+ }
+
+ return nr_frames;
+}
+
+static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local,
+ struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ struct sock *sk;
+ u8 dsap, ssap, ptype, ns, nr;
+
+ ptype = nfc_llcp_ptype(skb);
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+ ns = nfc_llcp_ns(skb);
+ nr = nfc_llcp_nr(skb);
+
+ pr_debug("%d %d R %d S %d\n", dsap, ssap, nr, ns);
+
+ llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
+ if (llcp_sock == NULL) {
+ nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
+ return;
+ }
+
+ sk = &llcp_sock->sk;
+ lock_sock(sk);
+ if (sk->sk_state == LLCP_CLOSED) {
+ release_sock(sk);
+ nfc_llcp_sock_put(llcp_sock);
+ }
+
+ /* Pass the payload upstream */
+ if (ptype == LLCP_PDU_I) {
+ pr_debug("I frame, queueing on %p\n", &llcp_sock->sk);
+
+ if (ns == llcp_sock->recv_n)
+ llcp_sock->recv_n = (llcp_sock->recv_n + 1) % 16;
+ else
+ pr_err("Received out of sequence I PDU\n");
+
+ skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE);
+ if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) {
+ /*
+ * I frames will be freed from the socket layer, so we
+ * need to keep them alive until someone receives them.
+ */
+ skb_get(skb);
+ } else {
+ pr_err("Receive queue is full\n");
+ }
+ }
+
+ /* Remove skbs from the pending queue */
+ if (llcp_sock->send_ack_n != nr) {
+ struct sk_buff *s, *tmp;
+ u8 n;
+
+ llcp_sock->send_ack_n = nr;
+
+ /* Remove and free all skbs until ns == nr */
+ skb_queue_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) {
+ n = nfc_llcp_ns(s);
+
+ skb_unlink(s, &llcp_sock->tx_pending_queue);
+ kfree_skb(s);
+
+ if (n == nr)
+ break;
+ }
+
+ /* Re-queue the remaining skbs for transmission */
+ skb_queue_reverse_walk_safe(&llcp_sock->tx_pending_queue,
+ s, tmp) {
+ skb_unlink(s, &llcp_sock->tx_pending_queue);
+ skb_queue_head(&local->tx_queue, s);
+ }
+ }
+
+ if (ptype == LLCP_PDU_RR)
+ llcp_sock->remote_ready = true;
+ else if (ptype == LLCP_PDU_RNR)
+ llcp_sock->remote_ready = false;
+
+ if (nfc_llcp_queue_i_frames(llcp_sock) == 0 && ptype == LLCP_PDU_I)
+ nfc_llcp_send_rr(llcp_sock);
+
+ release_sock(sk);
+ nfc_llcp_sock_put(llcp_sock);
+}
+
+static void nfc_llcp_recv_disc(struct nfc_llcp_local *local,
+ struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ struct sock *sk;
+ u8 dsap, ssap;
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ if ((dsap == 0) && (ssap == 0)) {
+ pr_debug("Connection termination");
+ nfc_dep_link_down(local->dev);
+ return;
+ }
+
+ llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
+ if (llcp_sock == NULL) {
+ nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
+ return;
+ }
+
+ sk = &llcp_sock->sk;
+ lock_sock(sk);
+
+ nfc_llcp_socket_purge(llcp_sock);
+
+ if (sk->sk_state == LLCP_CLOSED) {
+ release_sock(sk);
+ nfc_llcp_sock_put(llcp_sock);
+ }
+
+ if (sk->sk_state == LLCP_CONNECTED) {
+ nfc_put_device(local->dev);
+ sk->sk_state = LLCP_CLOSED;
+ sk->sk_state_change(sk);
+ }
+
+ nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_DISC);
+
+ release_sock(sk);
+ nfc_llcp_sock_put(llcp_sock);
+}
+
+static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ struct sock *sk;
+ u8 dsap, ssap;
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ llcp_sock = nfc_llcp_connecting_sock_get(local, dsap);
+ if (llcp_sock == NULL) {
+ pr_err("Invalid CC\n");
+ nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN);
+
+ return;
+ }
+
+ sk = &llcp_sock->sk;
+
+ /* Unlink from connecting and link to the client array */
+ nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
+ nfc_llcp_sock_link(&local->sockets, sk);
+ llcp_sock->dsap = ssap;
+
+ nfc_llcp_parse_connection_tlv(llcp_sock, &skb->data[LLCP_HEADER_SIZE],
+ skb->len - LLCP_HEADER_SIZE);
+
+ sk->sk_state = LLCP_CONNECTED;
+ sk->sk_state_change(sk);
+
+ nfc_llcp_sock_put(llcp_sock);
+}
+
+static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ struct sock *sk;
+ u8 dsap, ssap, reason;
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+ reason = skb->data[2];
+
+ pr_debug("%d %d reason %d\n", ssap, dsap, reason);
+
+ switch (reason) {
+ case LLCP_DM_NOBOUND:
+ case LLCP_DM_REJ:
+ llcp_sock = nfc_llcp_connecting_sock_get(local, dsap);
+ break;
+
+ default:
+ llcp_sock = nfc_llcp_sock_get(local, dsap, ssap);
+ break;
+ }
+
+ if (llcp_sock == NULL) {
+ pr_debug("Already closed\n");
+ return;
+ }
+
+ sk = &llcp_sock->sk;
+
+ sk->sk_err = ENXIO;
+ sk->sk_state = LLCP_CLOSED;
+ sk->sk_state_change(sk);
+
+ nfc_llcp_sock_put(llcp_sock);
+}
+
+static void nfc_llcp_recv_snl(struct nfc_llcp_local *local,
+ struct sk_buff *skb)
+{
+ struct nfc_llcp_sock *llcp_sock;
+ u8 dsap, ssap, *tlv, type, length, tid, sap;
+ u16 tlv_len, offset;
+ char *service_name;
+ size_t service_name_len;
+ struct nfc_llcp_sdp_tlv *sdp;
+ HLIST_HEAD(llc_sdres_list);
+ size_t sdres_tlvs_len;
+ HLIST_HEAD(nl_sdres_list);
+
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ pr_debug("%d %d\n", dsap, ssap);
+
+ if (dsap != LLCP_SAP_SDP || ssap != LLCP_SAP_SDP) {
+ pr_err("Wrong SNL SAP\n");
+ return;
+ }
+
+ tlv = &skb->data[LLCP_HEADER_SIZE];
+ tlv_len = skb->len - LLCP_HEADER_SIZE;
+ offset = 0;
+ sdres_tlvs_len = 0;
+
+ while (offset < tlv_len) {
+ type = tlv[0];
+ length = tlv[1];
+
+ switch (type) {
+ case LLCP_TLV_SDREQ:
+ tid = tlv[2];
+ service_name = (char *) &tlv[3];
+ service_name_len = length - 1;
+
+ pr_debug("Looking for %.16s\n", service_name);
+
+ if (service_name_len == strlen("urn:nfc:sn:sdp") &&
+ !strncmp(service_name, "urn:nfc:sn:sdp",
+ service_name_len)) {
+ sap = 1;
+ goto add_snl;
+ }
+
+ llcp_sock = nfc_llcp_sock_from_sn(local, service_name,
+ service_name_len);
+ if (!llcp_sock) {
+ sap = 0;
+ goto add_snl;
+ }
+
+ /*
+ * We found a socket but its ssap has not been reserved
+ * yet. We need to assign it for good and send a reply.
+ * The ssap will be freed when the socket is closed.
+ */
+ if (llcp_sock->ssap == LLCP_SDP_UNBOUND) {
+ atomic_t *client_count;
+
+ sap = nfc_llcp_reserve_sdp_ssap(local);
+
+ pr_debug("Reserving %d\n", sap);
+
+ if (sap == LLCP_SAP_MAX) {
+ sap = 0;
+ goto add_snl;
+ }
+
+ client_count =
+ &local->local_sdp_cnt[sap -
+ LLCP_WKS_NUM_SAP];
+
+ atomic_inc(client_count);
+
+ llcp_sock->ssap = sap;
+ llcp_sock->reserved_ssap = sap;
+ } else {
+ sap = llcp_sock->ssap;
+ }
+
+ pr_debug("%p %d\n", llcp_sock, sap);
+
+add_snl:
+ sdp = nfc_llcp_build_sdres_tlv(tid, sap);
+ if (sdp == NULL)
+ goto exit;
+
+ sdres_tlvs_len += sdp->tlv_len;
+ hlist_add_head(&sdp->node, &llc_sdres_list);
+ break;
+
+ case LLCP_TLV_SDRES:
+ mutex_lock(&local->sdreq_lock);
+
+ pr_debug("LLCP_TLV_SDRES: searching tid %d\n", tlv[2]);
+
+ hlist_for_each_entry(sdp, &local->pending_sdreqs, node) {
+ if (sdp->tid != tlv[2])
+ continue;
+
+ sdp->sap = tlv[3];
+
+ pr_debug("Found: uri=%s, sap=%d\n",
+ sdp->uri, sdp->sap);
+
+ hlist_del(&sdp->node);
+
+ hlist_add_head(&sdp->node, &nl_sdres_list);
+
+ break;
+ }
+
+ mutex_unlock(&local->sdreq_lock);
+ break;
+
+ default:
+ pr_err("Invalid SNL tlv value 0x%x\n", type);
+ break;
+ }
+
+ offset += length + 2;
+ tlv += length + 2;
+ }
+
+exit:
+ if (!hlist_empty(&nl_sdres_list))
+ nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list);
+
+ if (!hlist_empty(&llc_sdres_list))
+ nfc_llcp_send_snl_sdres(local, &llc_sdres_list, sdres_tlvs_len);
+}
+
+static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb)
+{
+ u8 ptype;
+ u16 pdu_len;
+ struct sk_buff *new_skb;
+
+ if (skb->len <= LLCP_HEADER_SIZE) {
+ pr_err("Malformed AGF PDU\n");
+ return;
+ }
+
+ skb_pull(skb, LLCP_HEADER_SIZE);
+
+ while (skb->len > LLCP_AGF_PDU_HEADER_SIZE) {
+ pdu_len = skb->data[0] << 8 | skb->data[1];
+
+ skb_pull(skb, LLCP_AGF_PDU_HEADER_SIZE);
+
+ if (pdu_len < LLCP_HEADER_SIZE || pdu_len > skb->len) {
+ pr_err("Malformed AGF PDU\n");
+ return;
+ }
+
+ ptype = nfc_llcp_ptype(skb);
+
+ if (ptype == LLCP_PDU_SYMM || ptype == LLCP_PDU_AGF)
+ goto next;
+
+ new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL);
+ if (new_skb == NULL) {
+ pr_err("Could not allocate PDU\n");
+ return;
+ }
+
+ memcpy(skb_put(new_skb, pdu_len), skb->data, pdu_len);
+
+ nfc_llcp_rx_skb(local, new_skb);
+
+ kfree_skb(new_skb);
+next:
+ skb_pull(skb, pdu_len);
+ }
+}
+
+static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb)
+{
+ u8 dsap, ssap, ptype;
+
+ ptype = nfc_llcp_ptype(skb);
+ dsap = nfc_llcp_dsap(skb);
+ ssap = nfc_llcp_ssap(skb);
+
+ pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap);
+
+ if (ptype != LLCP_PDU_SYMM)
+ print_hex_dump(KERN_DEBUG, "LLCP Rx: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, skb->len, true);
+
+ switch (ptype) {
+ case LLCP_PDU_SYMM:
+ pr_debug("SYMM\n");
+ break;
+
+ case LLCP_PDU_UI:
+ pr_debug("UI\n");
+ nfc_llcp_recv_ui(local, skb);
+ break;
+
+ case LLCP_PDU_CONNECT:
+ pr_debug("CONNECT\n");
+ nfc_llcp_recv_connect(local, skb);
+ break;
+
+ case LLCP_PDU_DISC:
+ pr_debug("DISC\n");
+ nfc_llcp_recv_disc(local, skb);
+ break;
+
+ case LLCP_PDU_CC:
+ pr_debug("CC\n");
+ nfc_llcp_recv_cc(local, skb);
+ break;
+
+ case LLCP_PDU_DM:
+ pr_debug("DM\n");
+ nfc_llcp_recv_dm(local, skb);
+ break;
+
+ case LLCP_PDU_SNL:
+ pr_debug("SNL\n");
+ nfc_llcp_recv_snl(local, skb);
+ break;
+
+ case LLCP_PDU_I:
+ case LLCP_PDU_RR:
+ case LLCP_PDU_RNR:
+ pr_debug("I frame\n");
+ nfc_llcp_recv_hdlc(local, skb);
+ break;
+
+ case LLCP_PDU_AGF:
+ pr_debug("AGF frame\n");
+ nfc_llcp_recv_agf(local, skb);
+ break;
+ }
+}
+
+static void nfc_llcp_rx_work(struct work_struct *work)
+{
+ struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local,
+ rx_work);
+ struct sk_buff *skb;
+
+ skb = local->rx_pending;
+ if (skb == NULL) {
+ pr_debug("No pending SKB\n");
+ return;
+ }
+
+ __net_timestamp(skb);
+
+ nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_RX);
+
+ nfc_llcp_rx_skb(local, skb);
+
+ schedule_work(&local->tx_work);
+ kfree_skb(local->rx_pending);
+ local->rx_pending = NULL;
+}
+
+static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb)
+{
+ local->rx_pending = skb;
+ del_timer(&local->link_timer);
+ schedule_work(&local->rx_work);
+}
+
+void nfc_llcp_recv(void *data, struct sk_buff *skb, int err)
+{
+ struct nfc_llcp_local *local = (struct nfc_llcp_local *) data;
+
+ pr_debug("Received an LLCP PDU\n");
+ if (err < 0) {
+ pr_err("err %d\n", err);
+ return;
+ }
+
+ __nfc_llcp_recv(local, skb);
+}
+
+int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb)
+{
+ struct nfc_llcp_local *local;
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ __nfc_llcp_recv(local, skb);
+
+ return 0;
+}
+
+void nfc_llcp_mac_is_down(struct nfc_dev *dev)
+{
+ struct nfc_llcp_local *local;
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL)
+ return;
+
+ local->remote_miu = LLCP_DEFAULT_MIU;
+ local->remote_lto = LLCP_DEFAULT_LTO;
+
+ /* Close and purge all existing sockets */
+ nfc_llcp_socket_release(local, true, 0);
+}
+
+void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx,
+ u8 comm_mode, u8 rf_mode)
+{
+ struct nfc_llcp_local *local;
+
+ pr_debug("rf mode %d\n", rf_mode);
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL)
+ return;
+
+ local->target_idx = target_idx;
+ local->comm_mode = comm_mode;
+ local->rf_mode = rf_mode;
+
+ if (rf_mode == NFC_RF_INITIATOR) {
+ pr_debug("Queueing Tx work\n");
+
+ schedule_work(&local->tx_work);
+ } else {
+ mod_timer(&local->link_timer,
+ jiffies + msecs_to_jiffies(local->remote_lto));
+ }
+}
+
+int nfc_llcp_register_device(struct nfc_dev *ndev)
+{
+ struct nfc_llcp_local *local;
+
+ local = kzalloc(sizeof(struct nfc_llcp_local), GFP_KERNEL);
+ if (local == NULL)
+ return -ENOMEM;
+
+ local->dev = ndev;
+ INIT_LIST_HEAD(&local->list);
+ kref_init(&local->ref);
+ mutex_init(&local->sdp_lock);
+ init_timer(&local->link_timer);
+ local->link_timer.data = (unsigned long) local;
+ local->link_timer.function = nfc_llcp_symm_timer;
+
+ skb_queue_head_init(&local->tx_queue);
+ INIT_WORK(&local->tx_work, nfc_llcp_tx_work);
+
+ local->rx_pending = NULL;
+ INIT_WORK(&local->rx_work, nfc_llcp_rx_work);
+
+ INIT_WORK(&local->timeout_work, nfc_llcp_timeout_work);
+
+ rwlock_init(&local->sockets.lock);
+ rwlock_init(&local->connecting_sockets.lock);
+ rwlock_init(&local->raw_sockets.lock);
+
+ local->lto = 150; /* 1500 ms */
+ local->rw = LLCP_MAX_RW;
+ local->miux = cpu_to_be16(LLCP_MAX_MIUX);
+ local->local_wks = 0x1; /* LLC Link Management */
+
+ nfc_llcp_build_gb(local);
+
+ local->remote_miu = LLCP_DEFAULT_MIU;
+ local->remote_lto = LLCP_DEFAULT_LTO;
+
+ mutex_init(&local->sdreq_lock);
+ INIT_HLIST_HEAD(&local->pending_sdreqs);
+ init_timer(&local->sdreq_timer);
+ local->sdreq_timer.data = (unsigned long) local;
+ local->sdreq_timer.function = nfc_llcp_sdreq_timer;
+ INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work);
+
+ list_add(&local->list, &llcp_devices);
+
+ return 0;
+}
+
+void nfc_llcp_unregister_device(struct nfc_dev *dev)
+{
+ struct nfc_llcp_local *local = nfc_llcp_find_local(dev);
+
+ if (local == NULL) {
+ pr_debug("No such device\n");
+ return;
+ }
+
+ local_cleanup(local);
+
+ nfc_llcp_local_put(local);
+}
+
+int __init nfc_llcp_init(void)
+{
+ return nfc_llcp_sock_init();
+}
+
+void nfc_llcp_exit(void)
+{
+ nfc_llcp_sock_exit();
+}
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
new file mode 100644
index 000000000..9578bd6a4
--- /dev/null
+++ b/net/nfc/llcp_sock.c
@@ -0,0 +1,1034 @@
+/*
+ * Copyright (C) 2011 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) "llcp: %s: " fmt, __func__
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/nfc.h>
+
+#include "nfc.h"
+#include "llcp.h"
+
+static int sock_wait_state(struct sock *sk, int state, unsigned long timeo)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+
+ pr_debug("sk %p", sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (sk->sk_state != state) {
+ if (!timeo) {
+ err = -EINPROGRESS;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = sock_error(sk);
+ if (err)
+ break;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return err;
+}
+
+static struct proto llcp_sock_proto = {
+ .name = "NFC_LLCP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct nfc_llcp_sock),
+};
+
+static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ struct nfc_llcp_local *local;
+ struct nfc_dev *dev;
+ struct sockaddr_nfc_llcp llcp_addr;
+ int len, ret = 0;
+
+ if (!addr || addr->sa_family != AF_NFC)
+ return -EINVAL;
+
+ pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family);
+
+ memset(&llcp_addr, 0, sizeof(llcp_addr));
+ len = min_t(unsigned int, sizeof(llcp_addr), alen);
+ memcpy(&llcp_addr, addr, len);
+
+ /* This is going to be a listening socket, dsap must be 0 */
+ if (llcp_addr.dsap != 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != LLCP_CLOSED) {
+ ret = -EBADFD;
+ goto error;
+ }
+
+ dev = nfc_get_device(llcp_addr.dev_idx);
+ if (dev == NULL) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ ret = -ENODEV;
+ goto put_dev;
+ }
+
+ llcp_sock->dev = dev;
+ llcp_sock->local = nfc_llcp_local_get(local);
+ llcp_sock->nfc_protocol = llcp_addr.nfc_protocol;
+ llcp_sock->service_name_len = min_t(unsigned int,
+ llcp_addr.service_name_len,
+ NFC_LLCP_MAX_SERVICE_NAME);
+ llcp_sock->service_name = kmemdup(llcp_addr.service_name,
+ llcp_sock->service_name_len,
+ GFP_KERNEL);
+
+ llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock);
+ if (llcp_sock->ssap == LLCP_SAP_MAX) {
+ ret = -EADDRINUSE;
+ goto put_dev;
+ }
+
+ llcp_sock->reserved_ssap = llcp_sock->ssap;
+
+ nfc_llcp_sock_link(&local->sockets, sk);
+
+ pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap);
+
+ sk->sk_state = LLCP_BOUND;
+
+put_dev:
+ nfc_put_device(dev);
+
+error:
+ release_sock(sk);
+ return ret;
+}
+
+static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr,
+ int alen)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ struct nfc_llcp_local *local;
+ struct nfc_dev *dev;
+ struct sockaddr_nfc_llcp llcp_addr;
+ int len, ret = 0;
+
+ if (!addr || addr->sa_family != AF_NFC)
+ return -EINVAL;
+
+ pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family);
+
+ memset(&llcp_addr, 0, sizeof(llcp_addr));
+ len = min_t(unsigned int, sizeof(llcp_addr), alen);
+ memcpy(&llcp_addr, addr, len);
+
+ lock_sock(sk);
+
+ if (sk->sk_state != LLCP_CLOSED) {
+ ret = -EBADFD;
+ goto error;
+ }
+
+ dev = nfc_get_device(llcp_addr.dev_idx);
+ if (dev == NULL) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ ret = -ENODEV;
+ goto put_dev;
+ }
+
+ llcp_sock->dev = dev;
+ llcp_sock->local = nfc_llcp_local_get(local);
+ llcp_sock->nfc_protocol = llcp_addr.nfc_protocol;
+
+ nfc_llcp_sock_link(&local->raw_sockets, sk);
+
+ sk->sk_state = LLCP_BOUND;
+
+put_dev:
+ nfc_put_device(dev);
+
+error:
+ release_sock(sk);
+ return ret;
+}
+
+static int llcp_sock_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int ret = 0;
+
+ pr_debug("sk %p backlog %d\n", sk, backlog);
+
+ lock_sock(sk);
+
+ if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) ||
+ sk->sk_state != LLCP_BOUND) {
+ ret = -EBADFD;
+ goto error;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+
+ pr_debug("Socket listening\n");
+ sk->sk_state = LLCP_LISTEN;
+
+error:
+ release_sock(sk);
+
+ return ret;
+}
+
+static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ u32 opt;
+ int err = 0;
+
+ pr_debug("%p optname %d\n", sk, optname);
+
+ if (level != SOL_NFC)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case NFC_LLCP_RW:
+ if (sk->sk_state == LLCP_CONNECTED ||
+ sk->sk_state == LLCP_BOUND ||
+ sk->sk_state == LLCP_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt > LLCP_MAX_RW) {
+ err = -EINVAL;
+ break;
+ }
+
+ llcp_sock->rw = (u8) opt;
+
+ break;
+
+ case NFC_LLCP_MIUX:
+ if (sk->sk_state == LLCP_CONNECTED ||
+ sk->sk_state == LLCP_BOUND ||
+ sk->sk_state == LLCP_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (opt > LLCP_MAX_MIUX) {
+ err = -EINVAL;
+ break;
+ }
+
+ llcp_sock->miux = cpu_to_be16((u16) opt);
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+
+ pr_debug("%p rw %d miux %d\n", llcp_sock,
+ llcp_sock->rw, llcp_sock->miux);
+
+ return err;
+}
+
+static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct nfc_llcp_local *local;
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ int len, err = 0;
+ u16 miux, remote_miu;
+ u8 rw;
+
+ pr_debug("%p optname %d\n", sk, optname);
+
+ if (level != SOL_NFC)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ local = llcp_sock->local;
+ if (!local)
+ return -ENODEV;
+
+ len = min_t(u32, len, sizeof(u32));
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case NFC_LLCP_RW:
+ rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw;
+ if (put_user(rw, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case NFC_LLCP_MIUX:
+ miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ?
+ be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux);
+
+ if (put_user(miux, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case NFC_LLCP_REMOTE_MIU:
+ remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ?
+ local->remote_miu : llcp_sock->remote_miu;
+
+ if (put_user(remote_miu, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case NFC_LLCP_REMOTE_LTO:
+ if (put_user(local->remote_lto / 10, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ case NFC_LLCP_REMOTE_RW:
+ if (put_user(llcp_sock->remote_rw, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return err;
+}
+
+void nfc_llcp_accept_unlink(struct sock *sk)
+{
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+
+ pr_debug("state %d\n", sk->sk_state);
+
+ list_del_init(&llcp_sock->accept_queue);
+ sk_acceptq_removed(llcp_sock->parent);
+ llcp_sock->parent = NULL;
+
+ sock_put(sk);
+}
+
+void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ struct nfc_llcp_sock *llcp_sock_parent = nfc_llcp_sock(parent);
+
+ /* Lock will be free from unlink */
+ sock_hold(sk);
+
+ list_add_tail(&llcp_sock->accept_queue,
+ &llcp_sock_parent->accept_queue);
+ llcp_sock->parent = parent;
+ sk_acceptq_added(parent);
+}
+
+struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
+ struct socket *newsock)
+{
+ struct nfc_llcp_sock *lsk, *n, *llcp_parent;
+ struct sock *sk;
+
+ llcp_parent = nfc_llcp_sock(parent);
+
+ list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue,
+ accept_queue) {
+ sk = &lsk->sk;
+ lock_sock(sk);
+
+ if (sk->sk_state == LLCP_CLOSED) {
+ release_sock(sk);
+ nfc_llcp_accept_unlink(sk);
+ continue;
+ }
+
+ if (sk->sk_state == LLCP_CONNECTED || !newsock) {
+ list_del_init(&lsk->accept_queue);
+ sock_put(sk);
+
+ if (newsock)
+ sock_graft(sk, newsock);
+
+ release_sock(sk);
+
+ pr_debug("Returning sk state %d\n", sk->sk_state);
+
+ sk_acceptq_removed(parent);
+
+ return sk;
+ }
+
+ release_sock(sk);
+ }
+
+ return NULL;
+}
+
+static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
+ int flags)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct sock *sk = sock->sk, *new_sk;
+ long timeo;
+ int ret = 0;
+
+ pr_debug("parent %p\n", sk);
+
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ if (sk->sk_state != LLCP_LISTEN) {
+ ret = -EBADFD;
+ goto error;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* Wait for an incoming connection. */
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (ret)
+ goto error;
+
+ newsock->state = SS_CONNECTED;
+
+ pr_debug("new socket %p\n", new_sk);
+
+error:
+ release_sock(sk);
+
+ return ret;
+}
+
+static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr);
+
+ if (llcp_sock == NULL || llcp_sock->dev == NULL)
+ return -EBADFD;
+
+ pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx,
+ llcp_sock->dsap, llcp_sock->ssap);
+
+ memset(llcp_addr, 0, sizeof(*llcp_addr));
+ *len = sizeof(struct sockaddr_nfc_llcp);
+
+ llcp_addr->sa_family = AF_NFC;
+ llcp_addr->dev_idx = llcp_sock->dev->idx;
+ llcp_addr->target_idx = llcp_sock->target_idx;
+ llcp_addr->nfc_protocol = llcp_sock->nfc_protocol;
+ llcp_addr->dsap = llcp_sock->dsap;
+ llcp_addr->ssap = llcp_sock->ssap;
+ llcp_addr->service_name_len = llcp_sock->service_name_len;
+ memcpy(llcp_addr->service_name, llcp_sock->service_name,
+ llcp_addr->service_name_len);
+
+ return 0;
+}
+
+static inline unsigned int llcp_accept_poll(struct sock *parent)
+{
+ struct nfc_llcp_sock *llcp_sock, *parent_sock;
+ struct sock *sk;
+
+ parent_sock = nfc_llcp_sock(parent);
+
+ list_for_each_entry(llcp_sock, &parent_sock->accept_queue,
+ accept_queue) {
+ sk = &llcp_sock->sk;
+
+ if (sk->sk_state == LLCP_CONNECTED)
+ return POLLIN | POLLRDNORM;
+ }
+
+ return 0;
+}
+
+static unsigned int llcp_sock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+
+ pr_debug("%p\n", sk);
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+
+ if (sk->sk_state == LLCP_LISTEN)
+ return llcp_accept_poll(sk);
+
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sk->sk_state == LLCP_CLOSED)
+ mask |= POLLHUP;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED)
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ pr_debug("mask 0x%x\n", mask);
+
+ return mask;
+}
+
+static int llcp_sock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_local *local;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ int err = 0;
+
+ if (!sk)
+ return 0;
+
+ pr_debug("%p\n", sk);
+
+ local = llcp_sock->local;
+ if (local == NULL) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ lock_sock(sk);
+
+ /* Send a DISC */
+ if (sk->sk_state == LLCP_CONNECTED)
+ nfc_llcp_send_disconnect(llcp_sock);
+
+ if (sk->sk_state == LLCP_LISTEN) {
+ struct nfc_llcp_sock *lsk, *n;
+ struct sock *accept_sk;
+
+ list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue,
+ accept_queue) {
+ accept_sk = &lsk->sk;
+ lock_sock(accept_sk);
+
+ nfc_llcp_send_disconnect(lsk);
+ nfc_llcp_accept_unlink(accept_sk);
+
+ release_sock(accept_sk);
+ }
+ }
+
+ if (llcp_sock->reserved_ssap < LLCP_SAP_MAX)
+ nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap);
+
+ release_sock(sk);
+
+ /* Keep this sock alive and therefore do not remove it from the sockets
+ * list until the DISC PDU has been actually sent. Otherwise we would
+ * reply with DM PDUs before sending the DISC one.
+ */
+ if (sk->sk_state == LLCP_DISCONNECTING)
+ return err;
+
+ if (sock->type == SOCK_RAW)
+ nfc_llcp_sock_unlink(&local->raw_sockets, sk);
+ else
+ nfc_llcp_sock_unlink(&local->sockets, sk);
+
+out:
+ sock_orphan(sk);
+ sock_put(sk);
+
+ return err;
+}
+
+static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr,
+ int len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr;
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+ int ret = 0;
+
+ pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags);
+
+ if (!addr || len < sizeof(struct sockaddr_nfc) ||
+ addr->sa_family != AF_NFC)
+ return -EINVAL;
+
+ if (addr->service_name_len == 0 && addr->dsap == 0)
+ return -EINVAL;
+
+ pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx,
+ addr->target_idx, addr->nfc_protocol);
+
+ lock_sock(sk);
+
+ if (sk->sk_state == LLCP_CONNECTED) {
+ ret = -EISCONN;
+ goto error;
+ }
+
+ dev = nfc_get_device(addr->dev_idx);
+ if (dev == NULL) {
+ ret = -ENODEV;
+ goto error;
+ }
+
+ local = nfc_llcp_find_local(dev);
+ if (local == NULL) {
+ ret = -ENODEV;
+ goto put_dev;
+ }
+
+ device_lock(&dev->dev);
+ if (dev->dep_link_up == false) {
+ ret = -ENOLINK;
+ device_unlock(&dev->dev);
+ goto put_dev;
+ }
+ device_unlock(&dev->dev);
+
+ if (local->rf_mode == NFC_RF_INITIATOR &&
+ addr->target_idx != local->target_idx) {
+ ret = -ENOLINK;
+ goto put_dev;
+ }
+
+ llcp_sock->dev = dev;
+ llcp_sock->local = nfc_llcp_local_get(local);
+ llcp_sock->ssap = nfc_llcp_get_local_ssap(local);
+ if (llcp_sock->ssap == LLCP_SAP_MAX) {
+ ret = -ENOMEM;
+ goto put_dev;
+ }
+
+ llcp_sock->reserved_ssap = llcp_sock->ssap;
+
+ if (addr->service_name_len == 0)
+ llcp_sock->dsap = addr->dsap;
+ else
+ llcp_sock->dsap = LLCP_SAP_SDP;
+ llcp_sock->nfc_protocol = addr->nfc_protocol;
+ llcp_sock->service_name_len = min_t(unsigned int,
+ addr->service_name_len,
+ NFC_LLCP_MAX_SERVICE_NAME);
+ llcp_sock->service_name = kmemdup(addr->service_name,
+ llcp_sock->service_name_len,
+ GFP_KERNEL);
+
+ nfc_llcp_sock_link(&local->connecting_sockets, sk);
+
+ ret = nfc_llcp_send_connect(llcp_sock);
+ if (ret)
+ goto sock_unlink;
+
+ sk->sk_state = LLCP_CONNECTING;
+
+ ret = sock_wait_state(sk, LLCP_CONNECTED,
+ sock_sndtimeo(sk, flags & O_NONBLOCK));
+ if (ret && ret != -EINPROGRESS)
+ goto sock_unlink;
+
+ release_sock(sk);
+
+ return ret;
+
+sock_unlink:
+ nfc_llcp_put_ssap(local, llcp_sock->ssap);
+
+ nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
+
+put_dev:
+ nfc_put_device(dev);
+
+error:
+ release_sock(sk);
+ return ret;
+}
+
+static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+ int ret;
+
+ pr_debug("sock %p sk %p", sock, sk);
+
+ ret = sock_error(sk);
+ if (ret)
+ return ret;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ if (sk->sk_type == SOCK_DGRAM) {
+ DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr,
+ msg->msg_name);
+
+ if (msg->msg_namelen < sizeof(*addr)) {
+ release_sock(sk);
+ return -EINVAL;
+ }
+
+ release_sock(sk);
+
+ return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap,
+ msg, len);
+ }
+
+ if (sk->sk_state != LLCP_CONNECTED) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ release_sock(sk);
+
+ return nfc_llcp_send_i_frame(llcp_sock, msg, len);
+}
+
+static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ int noblock = flags & MSG_DONTWAIT;
+ struct sock *sk = sock->sk;
+ unsigned int copied, rlen;
+ struct sk_buff *skb, *cskb;
+ int err = 0;
+
+ pr_debug("%p %zu\n", sk, len);
+
+ lock_sock(sk);
+
+ if (sk->sk_state == LLCP_CLOSED &&
+ skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ return 0;
+ }
+
+ release_sock(sk);
+
+ if (flags & (MSG_OOB))
+ return -EOPNOTSUPP;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb) {
+ pr_err("Recv datagram failed state %d %d %d",
+ sk->sk_state, err, sock_error(sk));
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 0;
+
+ return err;
+ }
+
+ rlen = skb->len; /* real length of skb */
+ copied = min_t(unsigned int, rlen, len);
+
+ cskb = skb;
+ if (skb_copy_datagram_msg(cskb, 0, msg, copied)) {
+ if (!(flags & MSG_PEEK))
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ return -EFAULT;
+ }
+
+ sock_recv_timestamp(msg, sk, skb);
+
+ if (sk->sk_type == SOCK_DGRAM && msg->msg_name) {
+ struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb);
+ DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr,
+ msg->msg_name);
+
+ msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp);
+
+ pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap);
+
+ memset(sockaddr, 0, sizeof(*sockaddr));
+ sockaddr->sa_family = AF_NFC;
+ sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP;
+ sockaddr->dsap = ui_cb->dsap;
+ sockaddr->ssap = ui_cb->ssap;
+ }
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+
+ /* SOCK_STREAM: re-queue skb if it contains unreceived data */
+ if (sk->sk_type == SOCK_STREAM ||
+ sk->sk_type == SOCK_DGRAM ||
+ sk->sk_type == SOCK_RAW) {
+ skb_pull(skb, copied);
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ goto done;
+ }
+ }
+
+ kfree_skb(skb);
+ }
+
+ /* XXX Queue backlogged skbs */
+
+done:
+ /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */
+ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC))
+ copied = rlen;
+
+ return copied;
+}
+
+static const struct proto_ops llcp_sock_ops = {
+ .family = PF_NFC,
+ .owner = THIS_MODULE,
+ .bind = llcp_sock_bind,
+ .connect = llcp_sock_connect,
+ .release = llcp_sock_release,
+ .socketpair = sock_no_socketpair,
+ .accept = llcp_sock_accept,
+ .getname = llcp_sock_getname,
+ .poll = llcp_sock_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = llcp_sock_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = nfc_llcp_setsockopt,
+ .getsockopt = nfc_llcp_getsockopt,
+ .sendmsg = llcp_sock_sendmsg,
+ .recvmsg = llcp_sock_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static const struct proto_ops llcp_rawsock_ops = {
+ .family = PF_NFC,
+ .owner = THIS_MODULE,
+ .bind = llcp_raw_sock_bind,
+ .connect = sock_no_connect,
+ .release = llcp_sock_release,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = llcp_sock_getname,
+ .poll = llcp_sock_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = llcp_sock_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static void llcp_sock_destruct(struct sock *sk)
+{
+ struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
+
+ pr_debug("%p\n", sk);
+
+ if (sk->sk_state == LLCP_CONNECTED)
+ nfc_put_device(llcp_sock->dev);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ nfc_llcp_sock_free(llcp_sock);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Freeing alive NFC LLCP socket %p\n", sk);
+ return;
+ }
+}
+
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp)
+{
+ struct sock *sk;
+ struct nfc_llcp_sock *llcp_sock;
+
+ sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto);
+ if (!sk)
+ return NULL;
+
+ llcp_sock = nfc_llcp_sock(sk);
+
+ sock_init_data(sock, sk);
+ sk->sk_state = LLCP_CLOSED;
+ sk->sk_protocol = NFC_SOCKPROTO_LLCP;
+ sk->sk_type = type;
+ sk->sk_destruct = llcp_sock_destruct;
+
+ llcp_sock->ssap = 0;
+ llcp_sock->dsap = LLCP_SAP_SDP;
+ llcp_sock->rw = LLCP_MAX_RW + 1;
+ llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1);
+ llcp_sock->send_n = llcp_sock->send_ack_n = 0;
+ llcp_sock->recv_n = llcp_sock->recv_ack_n = 0;
+ llcp_sock->remote_ready = 1;
+ llcp_sock->reserved_ssap = LLCP_SAP_MAX;
+ nfc_llcp_socket_remote_param_init(llcp_sock);
+ skb_queue_head_init(&llcp_sock->tx_queue);
+ skb_queue_head_init(&llcp_sock->tx_pending_queue);
+ INIT_LIST_HEAD(&llcp_sock->accept_queue);
+
+ if (sock != NULL)
+ sock->state = SS_UNCONNECTED;
+
+ return sk;
+}
+
+void nfc_llcp_sock_free(struct nfc_llcp_sock *sock)
+{
+ kfree(sock->service_name);
+
+ skb_queue_purge(&sock->tx_queue);
+ skb_queue_purge(&sock->tx_pending_queue);
+
+ list_del_init(&sock->accept_queue);
+
+ sock->parent = NULL;
+
+ nfc_llcp_local_put(sock->local);
+}
+
+static int llcp_sock_create(struct net *net, struct socket *sock,
+ const struct nfc_protocol *nfc_proto)
+{
+ struct sock *sk;
+
+ pr_debug("%p\n", sock);
+
+ if (sock->type != SOCK_STREAM &&
+ sock->type != SOCK_DGRAM &&
+ sock->type != SOCK_RAW)
+ return -ESOCKTNOSUPPORT;
+
+ if (sock->type == SOCK_RAW)
+ sock->ops = &llcp_rawsock_ops;
+ else
+ sock->ops = &llcp_sock_ops;
+
+ sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static const struct nfc_protocol llcp_nfc_proto = {
+ .id = NFC_SOCKPROTO_LLCP,
+ .proto = &llcp_sock_proto,
+ .owner = THIS_MODULE,
+ .create = llcp_sock_create
+};
+
+int __init nfc_llcp_sock_init(void)
+{
+ return nfc_proto_register(&llcp_nfc_proto);
+}
+
+void nfc_llcp_sock_exit(void)
+{
+ nfc_proto_unregister(&llcp_nfc_proto);
+}
diff --git a/net/nfc/nci/Kconfig b/net/nfc/nci/Kconfig
new file mode 100644
index 000000000..a4f1e42e3
--- /dev/null
+++ b/net/nfc/nci/Kconfig
@@ -0,0 +1,21 @@
+config NFC_NCI
+ depends on NFC
+ tristate "NCI protocol support"
+ default n
+ help
+ NCI (NFC Controller Interface) is a communication protocol between
+ an NFC Controller (NFCC) and a Device Host (DH).
+
+ Say Y here to compile NCI support into the kernel or say M to
+ compile it as module (nci).
+
+config NFC_NCI_SPI
+ depends on NFC_NCI && SPI
+ select CRC_CCITT
+ bool "NCI over SPI protocol support"
+ default n
+ help
+ NCI (NFC Controller Interface) is a communication protocol between
+ an NFC Controller (NFCC) and a Device Host (DH).
+
+ Say yes if you use an NCI driver that requires SPI link layer.
diff --git a/net/nfc/nci/Makefile b/net/nfc/nci/Makefile
new file mode 100644
index 000000000..7ed894926
--- /dev/null
+++ b/net/nfc/nci/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux NFC NCI layer.
+#
+
+obj-$(CONFIG_NFC_NCI) += nci.o
+
+nci-objs := core.o data.o lib.o ntf.o rsp.o hci.o
+
+nci-$(CONFIG_NFC_NCI_SPI) += spi.o
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
new file mode 100644
index 000000000..49ff32106
--- /dev/null
+++ b/net/nfc/nci/core.c
@@ -0,0 +1,1281 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ * Copyright (C) 2014 Marvell International Ltd.
+ *
+ * Written by Ilan Elias <ilane@ti.com>
+ *
+ * Acknowledgements:
+ * This file is based on hci_core.c, which was written
+ * by Maxim Krasnyansky.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+
+#include "../nfc.h"
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include <linux/nfc.h>
+
+struct core_conn_create_data {
+ int length;
+ struct nci_core_conn_create_cmd *cmd;
+};
+
+static void nci_cmd_work(struct work_struct *work);
+static void nci_rx_work(struct work_struct *work);
+static void nci_tx_work(struct work_struct *work);
+
+struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev,
+ int conn_id)
+{
+ struct nci_conn_info *conn_info;
+
+ list_for_each_entry(conn_info, &ndev->conn_info_list, list) {
+ if (conn_info->conn_id == conn_id)
+ return conn_info;
+ }
+
+ return NULL;
+}
+
+/* ---- NCI requests ---- */
+
+void nci_req_complete(struct nci_dev *ndev, int result)
+{
+ if (ndev->req_status == NCI_REQ_PEND) {
+ ndev->req_result = result;
+ ndev->req_status = NCI_REQ_DONE;
+ complete(&ndev->req_completion);
+ }
+}
+
+static void nci_req_cancel(struct nci_dev *ndev, int err)
+{
+ if (ndev->req_status == NCI_REQ_PEND) {
+ ndev->req_result = err;
+ ndev->req_status = NCI_REQ_CANCELED;
+ complete(&ndev->req_completion);
+ }
+}
+
+/* Execute request and wait for completion. */
+static int __nci_request(struct nci_dev *ndev,
+ void (*req)(struct nci_dev *ndev, unsigned long opt),
+ unsigned long opt, __u32 timeout)
+{
+ int rc = 0;
+ long completion_rc;
+
+ ndev->req_status = NCI_REQ_PEND;
+
+ reinit_completion(&ndev->req_completion);
+ req(ndev, opt);
+ completion_rc =
+ wait_for_completion_interruptible_timeout(&ndev->req_completion,
+ timeout);
+
+ pr_debug("wait_for_completion return %ld\n", completion_rc);
+
+ if (completion_rc > 0) {
+ switch (ndev->req_status) {
+ case NCI_REQ_DONE:
+ rc = nci_to_errno(ndev->req_result);
+ break;
+
+ case NCI_REQ_CANCELED:
+ rc = -ndev->req_result;
+ break;
+
+ default:
+ rc = -ETIMEDOUT;
+ break;
+ }
+ } else {
+ pr_err("wait_for_completion_interruptible_timeout failed %ld\n",
+ completion_rc);
+
+ rc = ((completion_rc == 0) ? (-ETIMEDOUT) : (completion_rc));
+ }
+
+ ndev->req_status = ndev->req_result = 0;
+
+ return rc;
+}
+
+inline int nci_request(struct nci_dev *ndev,
+ void (*req)(struct nci_dev *ndev,
+ unsigned long opt),
+ unsigned long opt, __u32 timeout)
+{
+ int rc;
+
+ if (!test_bit(NCI_UP, &ndev->flags))
+ return -ENETDOWN;
+
+ /* Serialize all requests */
+ mutex_lock(&ndev->req_lock);
+ rc = __nci_request(ndev, req, opt, timeout);
+ mutex_unlock(&ndev->req_lock);
+
+ return rc;
+}
+
+static void nci_reset_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_core_reset_cmd cmd;
+
+ cmd.reset_type = NCI_RESET_TYPE_RESET_CONFIG;
+ nci_send_cmd(ndev, NCI_OP_CORE_RESET_CMD, 1, &cmd);
+}
+
+static void nci_init_req(struct nci_dev *ndev, unsigned long opt)
+{
+ nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, 0, NULL);
+}
+
+static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_rf_disc_map_cmd cmd;
+ struct disc_map_config *cfg = cmd.mapping_configs;
+ __u8 *num = &cmd.num_mapping_configs;
+ int i;
+
+ /* set rf mapping configurations */
+ *num = 0;
+
+ /* by default mapping is set to NCI_RF_INTERFACE_FRAME */
+ for (i = 0; i < ndev->num_supported_rf_interfaces; i++) {
+ if (ndev->supported_rf_interfaces[i] ==
+ NCI_RF_INTERFACE_ISO_DEP) {
+ cfg[*num].rf_protocol = NCI_RF_PROTOCOL_ISO_DEP;
+ cfg[*num].mode = NCI_DISC_MAP_MODE_POLL |
+ NCI_DISC_MAP_MODE_LISTEN;
+ cfg[*num].rf_interface = NCI_RF_INTERFACE_ISO_DEP;
+ (*num)++;
+ } else if (ndev->supported_rf_interfaces[i] ==
+ NCI_RF_INTERFACE_NFC_DEP) {
+ cfg[*num].rf_protocol = NCI_RF_PROTOCOL_NFC_DEP;
+ cfg[*num].mode = NCI_DISC_MAP_MODE_POLL |
+ NCI_DISC_MAP_MODE_LISTEN;
+ cfg[*num].rf_interface = NCI_RF_INTERFACE_NFC_DEP;
+ (*num)++;
+ }
+
+ if (*num == NCI_MAX_NUM_MAPPING_CONFIGS)
+ break;
+ }
+
+ nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_MAP_CMD,
+ (1 + ((*num) * sizeof(struct disc_map_config))), &cmd);
+}
+
+struct nci_set_config_param {
+ __u8 id;
+ size_t len;
+ __u8 *val;
+};
+
+static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_set_config_param *param = (struct nci_set_config_param *)opt;
+ struct nci_core_set_config_cmd cmd;
+
+ BUG_ON(param->len > NCI_MAX_PARAM_LEN);
+
+ cmd.num_params = 1;
+ cmd.param.id = param->id;
+ cmd.param.len = param->len;
+ memcpy(cmd.param.val, param->val, param->len);
+
+ nci_send_cmd(ndev, NCI_OP_CORE_SET_CONFIG_CMD, (3 + param->len), &cmd);
+}
+
+struct nci_rf_discover_param {
+ __u32 im_protocols;
+ __u32 tm_protocols;
+};
+
+static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_rf_discover_param *param =
+ (struct nci_rf_discover_param *)opt;
+ struct nci_rf_disc_cmd cmd;
+
+ cmd.num_disc_configs = 0;
+
+ if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) &&
+ (param->im_protocols & NFC_PROTO_JEWEL_MASK ||
+ param->im_protocols & NFC_PROTO_MIFARE_MASK ||
+ param->im_protocols & NFC_PROTO_ISO14443_MASK ||
+ param->im_protocols & NFC_PROTO_NFC_DEP_MASK)) {
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_A_PASSIVE_POLL_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ }
+
+ if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) &&
+ (param->im_protocols & NFC_PROTO_ISO14443_B_MASK)) {
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_B_PASSIVE_POLL_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ }
+
+ if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) &&
+ (param->im_protocols & NFC_PROTO_FELICA_MASK ||
+ param->im_protocols & NFC_PROTO_NFC_DEP_MASK)) {
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_F_PASSIVE_POLL_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ }
+
+ if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS) &&
+ (param->im_protocols & NFC_PROTO_ISO15693_MASK)) {
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_V_PASSIVE_POLL_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ }
+
+ if ((cmd.num_disc_configs < NCI_MAX_NUM_RF_CONFIGS - 1) &&
+ (param->tm_protocols & NFC_PROTO_NFC_DEP_MASK)) {
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_A_PASSIVE_LISTEN_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ cmd.disc_configs[cmd.num_disc_configs].rf_tech_and_mode =
+ NCI_NFC_F_PASSIVE_LISTEN_MODE;
+ cmd.disc_configs[cmd.num_disc_configs].frequency = 1;
+ cmd.num_disc_configs++;
+ }
+
+ nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_CMD,
+ (1 + (cmd.num_disc_configs * sizeof(struct disc_config))),
+ &cmd);
+}
+
+struct nci_rf_discover_select_param {
+ __u8 rf_discovery_id;
+ __u8 rf_protocol;
+};
+
+static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_rf_discover_select_param *param =
+ (struct nci_rf_discover_select_param *)opt;
+ struct nci_rf_discover_select_cmd cmd;
+
+ cmd.rf_discovery_id = param->rf_discovery_id;
+ cmd.rf_protocol = param->rf_protocol;
+
+ switch (cmd.rf_protocol) {
+ case NCI_RF_PROTOCOL_ISO_DEP:
+ cmd.rf_interface = NCI_RF_INTERFACE_ISO_DEP;
+ break;
+
+ case NCI_RF_PROTOCOL_NFC_DEP:
+ cmd.rf_interface = NCI_RF_INTERFACE_NFC_DEP;
+ break;
+
+ default:
+ cmd.rf_interface = NCI_RF_INTERFACE_FRAME;
+ break;
+ }
+
+ nci_send_cmd(ndev, NCI_OP_RF_DISCOVER_SELECT_CMD,
+ sizeof(struct nci_rf_discover_select_cmd), &cmd);
+}
+
+static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_rf_deactivate_cmd cmd;
+
+ cmd.type = opt;
+
+ nci_send_cmd(ndev, NCI_OP_RF_DEACTIVATE_CMD,
+ sizeof(struct nci_rf_deactivate_cmd), &cmd);
+}
+
+static int nci_open_device(struct nci_dev *ndev)
+{
+ int rc = 0;
+
+ mutex_lock(&ndev->req_lock);
+
+ if (test_bit(NCI_UP, &ndev->flags)) {
+ rc = -EALREADY;
+ goto done;
+ }
+
+ if (ndev->ops->open(ndev)) {
+ rc = -EIO;
+ goto done;
+ }
+
+ atomic_set(&ndev->cmd_cnt, 1);
+
+ set_bit(NCI_INIT, &ndev->flags);
+
+ rc = __nci_request(ndev, nci_reset_req, 0,
+ msecs_to_jiffies(NCI_RESET_TIMEOUT));
+
+ if (ndev->ops->setup)
+ ndev->ops->setup(ndev);
+
+ if (!rc) {
+ rc = __nci_request(ndev, nci_init_req, 0,
+ msecs_to_jiffies(NCI_INIT_TIMEOUT));
+ }
+
+ if (!rc) {
+ rc = __nci_request(ndev, nci_init_complete_req, 0,
+ msecs_to_jiffies(NCI_INIT_TIMEOUT));
+ }
+
+ clear_bit(NCI_INIT, &ndev->flags);
+
+ if (!rc) {
+ set_bit(NCI_UP, &ndev->flags);
+ nci_clear_target_list(ndev);
+ atomic_set(&ndev->state, NCI_IDLE);
+ } else {
+ /* Init failed, cleanup */
+ skb_queue_purge(&ndev->cmd_q);
+ skb_queue_purge(&ndev->rx_q);
+ skb_queue_purge(&ndev->tx_q);
+
+ ndev->ops->close(ndev);
+ ndev->flags = 0;
+ }
+
+done:
+ mutex_unlock(&ndev->req_lock);
+ return rc;
+}
+
+static int nci_close_device(struct nci_dev *ndev)
+{
+ nci_req_cancel(ndev, ENODEV);
+ mutex_lock(&ndev->req_lock);
+
+ if (!test_and_clear_bit(NCI_UP, &ndev->flags)) {
+ del_timer_sync(&ndev->cmd_timer);
+ del_timer_sync(&ndev->data_timer);
+ mutex_unlock(&ndev->req_lock);
+ return 0;
+ }
+
+ /* Drop RX and TX queues */
+ skb_queue_purge(&ndev->rx_q);
+ skb_queue_purge(&ndev->tx_q);
+
+ /* Flush RX and TX wq */
+ flush_workqueue(ndev->rx_wq);
+ flush_workqueue(ndev->tx_wq);
+
+ /* Reset device */
+ skb_queue_purge(&ndev->cmd_q);
+ atomic_set(&ndev->cmd_cnt, 1);
+
+ set_bit(NCI_INIT, &ndev->flags);
+ __nci_request(ndev, nci_reset_req, 0,
+ msecs_to_jiffies(NCI_RESET_TIMEOUT));
+ clear_bit(NCI_INIT, &ndev->flags);
+
+ del_timer_sync(&ndev->cmd_timer);
+
+ /* Flush cmd wq */
+ flush_workqueue(ndev->cmd_wq);
+
+ /* After this point our queues are empty
+ * and no works are scheduled. */
+ ndev->ops->close(ndev);
+
+ /* Clear flags */
+ ndev->flags = 0;
+
+ mutex_unlock(&ndev->req_lock);
+
+ return 0;
+}
+
+/* NCI command timer function */
+static void nci_cmd_timer(unsigned long arg)
+{
+ struct nci_dev *ndev = (void *) arg;
+
+ atomic_set(&ndev->cmd_cnt, 1);
+ queue_work(ndev->cmd_wq, &ndev->cmd_work);
+}
+
+/* NCI data exchange timer function */
+static void nci_data_timer(unsigned long arg)
+{
+ struct nci_dev *ndev = (void *) arg;
+
+ set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags);
+ queue_work(ndev->rx_wq, &ndev->rx_work);
+}
+
+static int nci_dev_up(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ return nci_open_device(ndev);
+}
+
+static int nci_dev_down(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ return nci_close_device(ndev);
+}
+
+int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val)
+{
+ struct nci_set_config_param param;
+
+ if (!val || !len)
+ return 0;
+
+ param.id = id;
+ param.len = len;
+ param.val = val;
+
+ return __nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_set_config);
+
+static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_nfcee_discover_cmd cmd;
+ __u8 action = opt;
+
+ cmd.discovery_action = action;
+
+ nci_send_cmd(ndev, NCI_OP_NFCEE_DISCOVER_CMD, 1, &cmd);
+}
+
+int nci_nfcee_discover(struct nci_dev *ndev, u8 action)
+{
+ return nci_request(ndev, nci_nfcee_discover_req, action,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_nfcee_discover);
+
+static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_nfcee_mode_set_cmd *cmd =
+ (struct nci_nfcee_mode_set_cmd *)opt;
+
+ nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD,
+ sizeof(struct nci_nfcee_mode_set_cmd), cmd);
+}
+
+int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode)
+{
+ struct nci_nfcee_mode_set_cmd cmd;
+
+ cmd.nfcee_id = nfcee_id;
+ cmd.nfcee_mode = nfcee_mode;
+
+ return nci_request(ndev, nci_nfcee_mode_set_req, (unsigned long)&cmd,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_nfcee_mode_set);
+
+static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct core_conn_create_data *data =
+ (struct core_conn_create_data *)opt;
+
+ nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd);
+}
+
+int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
+ u8 number_destination_params,
+ size_t params_len,
+ struct core_conn_create_dest_spec_params *params)
+{
+ int r;
+ struct nci_core_conn_create_cmd *cmd;
+ struct core_conn_create_data data;
+
+ data.length = params_len + sizeof(struct nci_core_conn_create_cmd);
+ cmd = kzalloc(data.length, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd->destination_type = destination_type;
+ cmd->number_destination_params = number_destination_params;
+ memcpy(cmd->params, params, params_len);
+
+ data.cmd = cmd;
+ ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX];
+
+ r = __nci_request(ndev, nci_core_conn_create_req,
+ (unsigned long)&data,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+ kfree(cmd);
+ return r;
+}
+EXPORT_SYMBOL(nci_core_conn_create);
+
+static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt)
+{
+ __u8 conn_id = opt;
+
+ nci_send_cmd(ndev, NCI_OP_CORE_CONN_CLOSE_CMD, 1, &conn_id);
+}
+
+int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
+{
+ return nci_request(ndev, nci_core_conn_close_req, conn_id,
+ msecs_to_jiffies(NCI_CMD_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_core_conn_close);
+
+static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ struct nci_set_config_param param;
+ int rc;
+
+ param.val = nfc_get_local_general_bytes(nfc_dev, &param.len);
+ if ((param.val == NULL) || (param.len == 0))
+ return 0;
+
+ if (param.len > NFC_MAX_GT_LEN)
+ return -EINVAL;
+
+ param.id = NCI_PN_ATR_REQ_GEN_BYTES;
+
+ rc = nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
+ if (rc)
+ return rc;
+
+ param.id = NCI_LN_ATR_RES_GEN_BYTES;
+
+ return nci_request(ndev, nci_set_config_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT));
+}
+
+static int nci_set_listen_parameters(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ int rc;
+ __u8 val;
+
+ val = NCI_LA_SEL_INFO_NFC_DEP_MASK;
+
+ rc = nci_set_config(ndev, NCI_LA_SEL_INFO, 1, &val);
+ if (rc)
+ return rc;
+
+ val = NCI_LF_PROTOCOL_TYPE_NFC_DEP_MASK;
+
+ rc = nci_set_config(ndev, NCI_LF_PROTOCOL_TYPE, 1, &val);
+ if (rc)
+ return rc;
+
+ val = NCI_LF_CON_BITR_F_212 | NCI_LF_CON_BITR_F_424;
+
+ return nci_set_config(ndev, NCI_LF_CON_BITR_F, 1, &val);
+}
+
+static int nci_start_poll(struct nfc_dev *nfc_dev,
+ __u32 im_protocols, __u32 tm_protocols)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ struct nci_rf_discover_param param;
+ int rc;
+
+ if ((atomic_read(&ndev->state) == NCI_DISCOVERY) ||
+ (atomic_read(&ndev->state) == NCI_W4_ALL_DISCOVERIES)) {
+ pr_err("unable to start poll, since poll is already active\n");
+ return -EBUSY;
+ }
+
+ if (ndev->target_active_prot) {
+ pr_err("there is an active target\n");
+ return -EBUSY;
+ }
+
+ if ((atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) ||
+ (atomic_read(&ndev->state) == NCI_POLL_ACTIVE)) {
+ pr_debug("target active or w4 select, implicitly deactivate\n");
+
+ rc = nci_request(ndev, nci_rf_deactivate_req,
+ NCI_DEACTIVATE_TYPE_IDLE_MODE,
+ msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
+ if (rc)
+ return -EBUSY;
+ }
+
+ if ((im_protocols | tm_protocols) & NFC_PROTO_NFC_DEP_MASK) {
+ rc = nci_set_local_general_bytes(nfc_dev);
+ if (rc) {
+ pr_err("failed to set local general bytes\n");
+ return rc;
+ }
+ }
+
+ if (tm_protocols & NFC_PROTO_NFC_DEP_MASK) {
+ rc = nci_set_listen_parameters(nfc_dev);
+ if (rc)
+ pr_err("failed to set listen parameters\n");
+ }
+
+ param.im_protocols = im_protocols;
+ param.tm_protocols = tm_protocols;
+ rc = nci_request(ndev, nci_rf_discover_req, (unsigned long)&param,
+ msecs_to_jiffies(NCI_RF_DISC_TIMEOUT));
+
+ if (!rc)
+ ndev->poll_prots = im_protocols;
+
+ return rc;
+}
+
+static void nci_stop_poll(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if ((atomic_read(&ndev->state) != NCI_DISCOVERY) &&
+ (atomic_read(&ndev->state) != NCI_W4_ALL_DISCOVERIES)) {
+ pr_err("unable to stop poll, since poll is not active\n");
+ return;
+ }
+
+ nci_request(ndev, nci_rf_deactivate_req, NCI_DEACTIVATE_TYPE_IDLE_MODE,
+ msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
+}
+
+static int nci_activate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target, __u32 protocol)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ struct nci_rf_discover_select_param param;
+ struct nfc_target *nci_target = NULL;
+ int i;
+ int rc = 0;
+
+ pr_debug("target_idx %d, protocol 0x%x\n", target->idx, protocol);
+
+ if ((atomic_read(&ndev->state) != NCI_W4_HOST_SELECT) &&
+ (atomic_read(&ndev->state) != NCI_POLL_ACTIVE)) {
+ pr_err("there is no available target to activate\n");
+ return -EINVAL;
+ }
+
+ if (ndev->target_active_prot) {
+ pr_err("there is already an active target\n");
+ return -EBUSY;
+ }
+
+ for (i = 0; i < ndev->n_targets; i++) {
+ if (ndev->targets[i].idx == target->idx) {
+ nci_target = &ndev->targets[i];
+ break;
+ }
+ }
+
+ if (!nci_target) {
+ pr_err("unable to find the selected target\n");
+ return -EINVAL;
+ }
+
+ if (!(nci_target->supported_protocols & (1 << protocol))) {
+ pr_err("target does not support the requested protocol 0x%x\n",
+ protocol);
+ return -EINVAL;
+ }
+
+ if (atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) {
+ param.rf_discovery_id = nci_target->logical_idx;
+
+ if (protocol == NFC_PROTO_JEWEL)
+ param.rf_protocol = NCI_RF_PROTOCOL_T1T;
+ else if (protocol == NFC_PROTO_MIFARE)
+ param.rf_protocol = NCI_RF_PROTOCOL_T2T;
+ else if (protocol == NFC_PROTO_FELICA)
+ param.rf_protocol = NCI_RF_PROTOCOL_T3T;
+ else if (protocol == NFC_PROTO_ISO14443 ||
+ protocol == NFC_PROTO_ISO14443_B)
+ param.rf_protocol = NCI_RF_PROTOCOL_ISO_DEP;
+ else
+ param.rf_protocol = NCI_RF_PROTOCOL_NFC_DEP;
+
+ rc = nci_request(ndev, nci_rf_discover_select_req,
+ (unsigned long)&param,
+ msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT));
+ }
+
+ if (!rc)
+ ndev->target_active_prot = protocol;
+
+ return rc;
+}
+
+static void nci_deactivate_target(struct nfc_dev *nfc_dev,
+ struct nfc_target *target)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ pr_debug("entry\n");
+
+ if (!ndev->target_active_prot) {
+ pr_err("unable to deactivate target, no active target\n");
+ return;
+ }
+
+ ndev->target_active_prot = 0;
+
+ if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) {
+ nci_request(ndev, nci_rf_deactivate_req,
+ NCI_DEACTIVATE_TYPE_SLEEP_MODE,
+ msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
+ }
+}
+
+static int nci_dep_link_up(struct nfc_dev *nfc_dev, struct nfc_target *target,
+ __u8 comm_mode, __u8 *gb, size_t gb_len)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ int rc;
+
+ pr_debug("target_idx %d, comm_mode %d\n", target->idx, comm_mode);
+
+ rc = nci_activate_target(nfc_dev, target, NFC_PROTO_NFC_DEP);
+ if (rc)
+ return rc;
+
+ rc = nfc_set_remote_general_bytes(nfc_dev, ndev->remote_gb,
+ ndev->remote_gb_len);
+ if (!rc)
+ rc = nfc_dep_link_is_up(nfc_dev, target->idx, NFC_COMM_PASSIVE,
+ NFC_RF_INITIATOR);
+
+ return rc;
+}
+
+static int nci_dep_link_down(struct nfc_dev *nfc_dev)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ int rc;
+
+ pr_debug("entry\n");
+
+ if (nfc_dev->rf_mode == NFC_RF_INITIATOR) {
+ nci_deactivate_target(nfc_dev, NULL);
+ } else {
+ if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE ||
+ atomic_read(&ndev->state) == NCI_DISCOVERY) {
+ nci_request(ndev, nci_rf_deactivate_req, 0,
+ msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT));
+ }
+
+ rc = nfc_tm_deactivated(nfc_dev);
+ if (rc)
+ pr_err("error when signaling tm deactivation\n");
+ }
+
+ return 0;
+}
+
+
+static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target,
+ struct sk_buff *skb,
+ data_exchange_cb_t cb, void *cb_context)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ int rc;
+ struct nci_conn_info *conn_info;
+
+ conn_info = ndev->rf_conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ pr_debug("target_idx %d, len %d\n", target->idx, skb->len);
+
+ if (!ndev->target_active_prot) {
+ pr_err("unable to exchange data, no active target\n");
+ return -EINVAL;
+ }
+
+ if (test_and_set_bit(NCI_DATA_EXCHANGE, &ndev->flags))
+ return -EBUSY;
+
+ /* store cb and context to be used on receiving data */
+ conn_info->data_exchange_cb = cb;
+ conn_info->data_exchange_cb_context = cb_context;
+
+ rc = nci_send_data(ndev, NCI_STATIC_RF_CONN_ID, skb);
+ if (rc)
+ clear_bit(NCI_DATA_EXCHANGE, &ndev->flags);
+
+ return rc;
+}
+
+static int nci_tm_send(struct nfc_dev *nfc_dev, struct sk_buff *skb)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+ int rc;
+
+ rc = nci_send_data(ndev, NCI_STATIC_RF_CONN_ID, skb);
+ if (rc)
+ pr_err("unable to send data\n");
+
+ return rc;
+}
+
+static int nci_enable_se(struct nfc_dev *nfc_dev, u32 se_idx)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if (ndev->ops->enable_se)
+ return ndev->ops->enable_se(ndev, se_idx);
+
+ return 0;
+}
+
+static int nci_disable_se(struct nfc_dev *nfc_dev, u32 se_idx)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if (ndev->ops->disable_se)
+ return ndev->ops->disable_se(ndev, se_idx);
+
+ return 0;
+}
+
+static int nci_discover_se(struct nfc_dev *nfc_dev)
+{
+ int r;
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if (ndev->ops->discover_se) {
+ r = nci_nfcee_discover(ndev, NCI_NFCEE_DISCOVERY_ACTION_ENABLE);
+ if (r != NCI_STATUS_OK)
+ return -EPROTO;
+
+ return ndev->ops->discover_se(ndev);
+ }
+
+ return 0;
+}
+
+static int nci_se_io(struct nfc_dev *nfc_dev, u32 se_idx,
+ u8 *apdu, size_t apdu_length,
+ se_io_cb_t cb, void *cb_context)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if (ndev->ops->se_io)
+ return ndev->ops->se_io(ndev, se_idx, apdu,
+ apdu_length, cb, cb_context);
+
+ return 0;
+}
+
+static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name)
+{
+ struct nci_dev *ndev = nfc_get_drvdata(nfc_dev);
+
+ if (!ndev->ops->fw_download)
+ return -ENOTSUPP;
+
+ return ndev->ops->fw_download(ndev, firmware_name);
+}
+
+static struct nfc_ops nci_nfc_ops = {
+ .dev_up = nci_dev_up,
+ .dev_down = nci_dev_down,
+ .start_poll = nci_start_poll,
+ .stop_poll = nci_stop_poll,
+ .dep_link_up = nci_dep_link_up,
+ .dep_link_down = nci_dep_link_down,
+ .activate_target = nci_activate_target,
+ .deactivate_target = nci_deactivate_target,
+ .im_transceive = nci_transceive,
+ .tm_send = nci_tm_send,
+ .enable_se = nci_enable_se,
+ .disable_se = nci_disable_se,
+ .discover_se = nci_discover_se,
+ .se_io = nci_se_io,
+ .fw_download = nci_fw_download,
+};
+
+/* ---- Interface to NCI drivers ---- */
+/**
+ * nci_allocate_device - allocate a new nci device
+ *
+ * @ops: device operations
+ * @supported_protocols: NFC protocols supported by the device
+ */
+struct nci_dev *nci_allocate_device(struct nci_ops *ops,
+ __u32 supported_protocols,
+ int tx_headroom, int tx_tailroom)
+{
+ struct nci_dev *ndev;
+
+ pr_debug("supported_protocols 0x%x\n", supported_protocols);
+
+ if (!ops->open || !ops->close || !ops->send)
+ return NULL;
+
+ if (!supported_protocols)
+ return NULL;
+
+ ndev = kzalloc(sizeof(struct nci_dev), GFP_KERNEL);
+ if (!ndev)
+ return NULL;
+
+ ndev->ops = ops;
+ ndev->tx_headroom = tx_headroom;
+ ndev->tx_tailroom = tx_tailroom;
+ init_completion(&ndev->req_completion);
+
+ ndev->nfc_dev = nfc_allocate_device(&nci_nfc_ops,
+ supported_protocols,
+ tx_headroom + NCI_DATA_HDR_SIZE,
+ tx_tailroom);
+ if (!ndev->nfc_dev)
+ goto free_nci;
+
+ ndev->hci_dev = nci_hci_allocate(ndev);
+ if (!ndev->hci_dev)
+ goto free_nfc;
+
+ nfc_set_drvdata(ndev->nfc_dev, ndev);
+
+ return ndev;
+
+free_nfc:
+ kfree(ndev->nfc_dev);
+
+free_nci:
+ kfree(ndev);
+ return NULL;
+}
+EXPORT_SYMBOL(nci_allocate_device);
+
+/**
+ * nci_free_device - deallocate nci device
+ *
+ * @ndev: The nci device to deallocate
+ */
+void nci_free_device(struct nci_dev *ndev)
+{
+ nfc_free_device(ndev->nfc_dev);
+ kfree(ndev);
+}
+EXPORT_SYMBOL(nci_free_device);
+
+/**
+ * nci_register_device - register a nci device in the nfc subsystem
+ *
+ * @dev: The nci device to register
+ */
+int nci_register_device(struct nci_dev *ndev)
+{
+ int rc;
+ struct device *dev = &ndev->nfc_dev->dev;
+ char name[32];
+
+ ndev->flags = 0;
+
+ INIT_WORK(&ndev->cmd_work, nci_cmd_work);
+ snprintf(name, sizeof(name), "%s_nci_cmd_wq", dev_name(dev));
+ ndev->cmd_wq = create_singlethread_workqueue(name);
+ if (!ndev->cmd_wq) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ INIT_WORK(&ndev->rx_work, nci_rx_work);
+ snprintf(name, sizeof(name), "%s_nci_rx_wq", dev_name(dev));
+ ndev->rx_wq = create_singlethread_workqueue(name);
+ if (!ndev->rx_wq) {
+ rc = -ENOMEM;
+ goto destroy_cmd_wq_exit;
+ }
+
+ INIT_WORK(&ndev->tx_work, nci_tx_work);
+ snprintf(name, sizeof(name), "%s_nci_tx_wq", dev_name(dev));
+ ndev->tx_wq = create_singlethread_workqueue(name);
+ if (!ndev->tx_wq) {
+ rc = -ENOMEM;
+ goto destroy_rx_wq_exit;
+ }
+
+ skb_queue_head_init(&ndev->cmd_q);
+ skb_queue_head_init(&ndev->rx_q);
+ skb_queue_head_init(&ndev->tx_q);
+
+ setup_timer(&ndev->cmd_timer, nci_cmd_timer,
+ (unsigned long) ndev);
+ setup_timer(&ndev->data_timer, nci_data_timer,
+ (unsigned long) ndev);
+
+ mutex_init(&ndev->req_lock);
+ INIT_LIST_HEAD(&ndev->conn_info_list);
+
+ rc = nfc_register_device(ndev->nfc_dev);
+ if (rc)
+ goto destroy_rx_wq_exit;
+
+ goto exit;
+
+destroy_rx_wq_exit:
+ destroy_workqueue(ndev->rx_wq);
+
+destroy_cmd_wq_exit:
+ destroy_workqueue(ndev->cmd_wq);
+
+exit:
+ return rc;
+}
+EXPORT_SYMBOL(nci_register_device);
+
+/**
+ * nci_unregister_device - unregister a nci device in the nfc subsystem
+ *
+ * @dev: The nci device to unregister
+ */
+void nci_unregister_device(struct nci_dev *ndev)
+{
+ struct nci_conn_info *conn_info, *n;
+
+ nci_close_device(ndev);
+
+ destroy_workqueue(ndev->cmd_wq);
+ destroy_workqueue(ndev->rx_wq);
+ destroy_workqueue(ndev->tx_wq);
+
+ list_for_each_entry_safe(conn_info, n, &ndev->conn_info_list, list) {
+ list_del(&conn_info->list);
+ /* conn_info is allocated with devm_kzalloc */
+ }
+
+ nfc_unregister_device(ndev->nfc_dev);
+}
+EXPORT_SYMBOL(nci_unregister_device);
+
+/**
+ * nci_recv_frame - receive frame from NCI drivers
+ *
+ * @ndev: The nci device
+ * @skb: The sk_buff to receive
+ */
+int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ pr_debug("len %d\n", skb->len);
+
+ if (!ndev || (!test_bit(NCI_UP, &ndev->flags) &&
+ !test_bit(NCI_INIT, &ndev->flags))) {
+ kfree_skb(skb);
+ return -ENXIO;
+ }
+
+ /* Queue frame for rx worker thread */
+ skb_queue_tail(&ndev->rx_q, skb);
+ queue_work(ndev->rx_wq, &ndev->rx_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(nci_recv_frame);
+
+static int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ pr_debug("len %d\n", skb->len);
+
+ if (!ndev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+
+ /* Get rid of skb owner, prior to sending to the driver. */
+ skb_orphan(skb);
+
+ /* Send copy to sniffer */
+ nfc_send_to_raw_sock(ndev->nfc_dev, skb,
+ RAW_PAYLOAD_NCI, NFC_DIRECTION_TX);
+
+ return ndev->ops->send(ndev, skb);
+}
+
+/* Send NCI command */
+int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload)
+{
+ struct nci_ctrl_hdr *hdr;
+ struct sk_buff *skb;
+
+ pr_debug("opcode 0x%x, plen %d\n", opcode, plen);
+
+ skb = nci_skb_alloc(ndev, (NCI_CTRL_HDR_SIZE + plen), GFP_KERNEL);
+ if (!skb) {
+ pr_err("no memory for command\n");
+ return -ENOMEM;
+ }
+
+ hdr = (struct nci_ctrl_hdr *) skb_put(skb, NCI_CTRL_HDR_SIZE);
+ hdr->gid = nci_opcode_gid(opcode);
+ hdr->oid = nci_opcode_oid(opcode);
+ hdr->plen = plen;
+
+ nci_mt_set((__u8 *)hdr, NCI_MT_CMD_PKT);
+ nci_pbf_set((__u8 *)hdr, NCI_PBF_LAST);
+
+ if (plen)
+ memcpy(skb_put(skb, plen), payload, plen);
+
+ skb_queue_tail(&ndev->cmd_q, skb);
+ queue_work(ndev->cmd_wq, &ndev->cmd_work);
+
+ return 0;
+}
+
+/* ---- NCI TX Data worker thread ---- */
+
+static void nci_tx_work(struct work_struct *work)
+{
+ struct nci_dev *ndev = container_of(work, struct nci_dev, tx_work);
+ struct nci_conn_info *conn_info;
+ struct sk_buff *skb;
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id);
+ if (!conn_info)
+ return;
+
+ pr_debug("credits_cnt %d\n", atomic_read(&conn_info->credits_cnt));
+
+ /* Send queued tx data */
+ while (atomic_read(&conn_info->credits_cnt)) {
+ skb = skb_dequeue(&ndev->tx_q);
+ if (!skb)
+ return;
+
+ /* Check if data flow control is used */
+ if (atomic_read(&conn_info->credits_cnt) !=
+ NCI_DATA_FLOW_CONTROL_NOT_USED)
+ atomic_dec(&conn_info->credits_cnt);
+
+ pr_debug("NCI TX: MT=data, PBF=%d, conn_id=%d, plen=%d\n",
+ nci_pbf(skb->data),
+ nci_conn_id(skb->data),
+ nci_plen(skb->data));
+
+ nci_send_frame(ndev, skb);
+
+ mod_timer(&ndev->data_timer,
+ jiffies + msecs_to_jiffies(NCI_DATA_TIMEOUT));
+ }
+}
+
+/* ----- NCI RX worker thread (data & control) ----- */
+
+static void nci_rx_work(struct work_struct *work)
+{
+ struct nci_dev *ndev = container_of(work, struct nci_dev, rx_work);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&ndev->rx_q))) {
+
+ /* Send copy to sniffer */
+ nfc_send_to_raw_sock(ndev->nfc_dev, skb,
+ RAW_PAYLOAD_NCI, NFC_DIRECTION_RX);
+
+ /* Process frame */
+ switch (nci_mt(skb->data)) {
+ case NCI_MT_RSP_PKT:
+ nci_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_MT_NTF_PKT:
+ nci_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_MT_DATA_PKT:
+ nci_rx_data_packet(ndev, skb);
+ break;
+
+ default:
+ pr_err("unknown MT 0x%x\n", nci_mt(skb->data));
+ kfree_skb(skb);
+ break;
+ }
+ }
+
+ /* check if a data exchange timout has occurred */
+ if (test_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags)) {
+ /* complete the data exchange transaction, if exists */
+ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags))
+ nci_data_exchange_complete(ndev, NULL,
+ ndev->cur_conn_id,
+ -ETIMEDOUT);
+
+ clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags);
+ }
+}
+
+/* ----- NCI TX CMD worker thread ----- */
+
+static void nci_cmd_work(struct work_struct *work)
+{
+ struct nci_dev *ndev = container_of(work, struct nci_dev, cmd_work);
+ struct sk_buff *skb;
+
+ pr_debug("cmd_cnt %d\n", atomic_read(&ndev->cmd_cnt));
+
+ /* Send queued command */
+ if (atomic_read(&ndev->cmd_cnt)) {
+ skb = skb_dequeue(&ndev->cmd_q);
+ if (!skb)
+ return;
+
+ atomic_dec(&ndev->cmd_cnt);
+
+ pr_debug("NCI TX: MT=cmd, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n",
+ nci_pbf(skb->data),
+ nci_opcode_gid(nci_opcode(skb->data)),
+ nci_opcode_oid(nci_opcode(skb->data)),
+ nci_plen(skb->data));
+
+ nci_send_frame(ndev, skb);
+
+ mod_timer(&ndev->cmd_timer,
+ jiffies + msecs_to_jiffies(NCI_CMD_TIMEOUT));
+ }
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c
new file mode 100644
index 000000000..566466d90
--- /dev/null
+++ b/net/nfc/nci/data.c
@@ -0,0 +1,298 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ * Copyright (C) 2014 Marvell International Ltd.
+ *
+ * Written by Ilan Elias <ilane@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+
+#include "../nfc.h"
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include <linux/nfc.h>
+
+/* Complete data exchange transaction and forward skb to nfc core */
+void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb,
+ __u8 conn_id, int err)
+{
+ struct nci_conn_info *conn_info;
+ data_exchange_cb_t cb;
+ void *cb_context;
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
+ if (!conn_info) {
+ kfree_skb(skb);
+ goto exit;
+ }
+
+ cb = conn_info->data_exchange_cb;
+ cb_context = conn_info->data_exchange_cb_context;
+
+ pr_debug("len %d, err %d\n", skb ? skb->len : 0, err);
+
+ /* data exchange is complete, stop the data timer */
+ del_timer_sync(&ndev->data_timer);
+ clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags);
+
+ if (cb) {
+ /* forward skb to nfc core */
+ cb(cb_context, skb, err);
+ } else if (skb) {
+ pr_err("no rx callback, dropping rx data...\n");
+
+ /* no waiting callback, free skb */
+ kfree_skb(skb);
+ }
+
+exit:
+ clear_bit(NCI_DATA_EXCHANGE, &ndev->flags);
+}
+
+/* ----------------- NCI TX Data ----------------- */
+
+static inline void nci_push_data_hdr(struct nci_dev *ndev,
+ __u8 conn_id,
+ struct sk_buff *skb,
+ __u8 pbf)
+{
+ struct nci_data_hdr *hdr;
+ int plen = skb->len;
+
+ hdr = (struct nci_data_hdr *) skb_push(skb, NCI_DATA_HDR_SIZE);
+ hdr->conn_id = conn_id;
+ hdr->rfu = 0;
+ hdr->plen = plen;
+
+ nci_mt_set((__u8 *)hdr, NCI_MT_DATA_PKT);
+ nci_pbf_set((__u8 *)hdr, pbf);
+}
+
+static int nci_queue_tx_data_frags(struct nci_dev *ndev,
+ __u8 conn_id,
+ struct sk_buff *skb) {
+ struct nci_conn_info *conn_info;
+ int total_len = skb->len;
+ unsigned char *data = skb->data;
+ unsigned long flags;
+ struct sk_buff_head frags_q;
+ struct sk_buff *skb_frag;
+ int frag_len;
+ int rc = 0;
+
+ pr_debug("conn_id 0x%x, total_len %d\n", conn_id, total_len);
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
+ if (!conn_info) {
+ rc = -EPROTO;
+ goto free_exit;
+ }
+
+ __skb_queue_head_init(&frags_q);
+
+ while (total_len) {
+ frag_len =
+ min_t(int, total_len, conn_info->max_pkt_payload_len);
+
+ skb_frag = nci_skb_alloc(ndev,
+ (NCI_DATA_HDR_SIZE + frag_len),
+ GFP_KERNEL);
+ if (skb_frag == NULL) {
+ rc = -ENOMEM;
+ goto free_exit;
+ }
+ skb_reserve(skb_frag, NCI_DATA_HDR_SIZE);
+
+ /* first, copy the data */
+ memcpy(skb_put(skb_frag, frag_len), data, frag_len);
+
+ /* second, set the header */
+ nci_push_data_hdr(ndev, conn_id, skb_frag,
+ ((total_len == frag_len) ?
+ (NCI_PBF_LAST) : (NCI_PBF_CONT)));
+
+ __skb_queue_tail(&frags_q, skb_frag);
+
+ data += frag_len;
+ total_len -= frag_len;
+
+ pr_debug("frag_len %d, remaining total_len %d\n",
+ frag_len, total_len);
+ }
+
+ /* queue all fragments atomically */
+ spin_lock_irqsave(&ndev->tx_q.lock, flags);
+
+ while ((skb_frag = __skb_dequeue(&frags_q)) != NULL)
+ __skb_queue_tail(&ndev->tx_q, skb_frag);
+
+ spin_unlock_irqrestore(&ndev->tx_q.lock, flags);
+
+ /* free the original skb */
+ kfree_skb(skb);
+
+ goto exit;
+
+free_exit:
+ while ((skb_frag = __skb_dequeue(&frags_q)) != NULL)
+ kfree_skb(skb_frag);
+
+exit:
+ return rc;
+}
+
+/* Send NCI data */
+int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ int rc = 0;
+
+ pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len);
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id);
+ if (!conn_info) {
+ rc = -EPROTO;
+ goto free_exit;
+ }
+
+ /* check if the packet need to be fragmented */
+ if (skb->len <= conn_info->max_pkt_payload_len) {
+ /* no need to fragment packet */
+ nci_push_data_hdr(ndev, conn_id, skb, NCI_PBF_LAST);
+
+ skb_queue_tail(&ndev->tx_q, skb);
+ } else {
+ /* fragment packet and queue the fragments */
+ rc = nci_queue_tx_data_frags(ndev, conn_id, skb);
+ if (rc) {
+ pr_err("failed to fragment tx data packet\n");
+ goto free_exit;
+ }
+ }
+
+ ndev->cur_conn_id = conn_id;
+ queue_work(ndev->tx_wq, &ndev->tx_work);
+
+ goto exit;
+
+free_exit:
+ kfree_skb(skb);
+
+exit:
+ return rc;
+}
+
+/* ----------------- NCI RX Data ----------------- */
+
+static void nci_add_rx_data_frag(struct nci_dev *ndev,
+ struct sk_buff *skb,
+ __u8 pbf, __u8 conn_id, __u8 status)
+{
+ int reassembly_len;
+ int err = 0;
+
+ if (status) {
+ err = status;
+ goto exit;
+ }
+
+ if (ndev->rx_data_reassembly) {
+ reassembly_len = ndev->rx_data_reassembly->len;
+
+ /* first, make enough room for the already accumulated data */
+ if (skb_cow_head(skb, reassembly_len)) {
+ pr_err("error adding room for accumulated rx data\n");
+
+ kfree_skb(skb);
+ skb = NULL;
+
+ kfree_skb(ndev->rx_data_reassembly);
+ ndev->rx_data_reassembly = NULL;
+
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ /* second, combine the two fragments */
+ memcpy(skb_push(skb, reassembly_len),
+ ndev->rx_data_reassembly->data,
+ reassembly_len);
+
+ /* third, free old reassembly */
+ kfree_skb(ndev->rx_data_reassembly);
+ ndev->rx_data_reassembly = NULL;
+ }
+
+ if (pbf == NCI_PBF_CONT) {
+ /* need to wait for next fragment, store skb and exit */
+ ndev->rx_data_reassembly = skb;
+ return;
+ }
+
+exit:
+ if (ndev->nfc_dev->rf_mode == NFC_RF_TARGET) {
+ /* Data received in Target mode, forward to nfc core */
+ err = nfc_tm_data_received(ndev->nfc_dev, skb);
+ if (err)
+ pr_err("unable to handle received data\n");
+ } else {
+ nci_data_exchange_complete(ndev, skb, conn_id, err);
+ }
+}
+
+/* Rx Data packet */
+void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ __u8 pbf = nci_pbf(skb->data);
+ __u8 status = 0;
+ __u8 conn_id = nci_conn_id(skb->data);
+ struct nci_conn_info *conn_info;
+
+ pr_debug("len %d\n", skb->len);
+
+ pr_debug("NCI RX: MT=data, PBF=%d, conn_id=%d, plen=%d\n",
+ nci_pbf(skb->data),
+ nci_conn_id(skb->data),
+ nci_plen(skb->data));
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev, nci_conn_id(skb->data));
+ if (!conn_info)
+ return;
+
+ /* strip the nci data header */
+ skb_pull(skb, NCI_DATA_HDR_SIZE);
+
+ if (ndev->target_active_prot == NFC_PROTO_MIFARE ||
+ ndev->target_active_prot == NFC_PROTO_JEWEL ||
+ ndev->target_active_prot == NFC_PROTO_FELICA ||
+ ndev->target_active_prot == NFC_PROTO_ISO15693) {
+ /* frame I/F => remove the status byte */
+ pr_debug("frame I/F => remove the status byte\n");
+ status = skb->data[skb->len - 1];
+ skb_trim(skb, (skb->len - 1));
+ }
+
+ nci_add_rx_data_frag(ndev, skb, pbf, conn_id, nci_to_errno(status));
+}
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
new file mode 100644
index 000000000..ed54ec533
--- /dev/null
+++ b/net/nfc/nci/hci.c
@@ -0,0 +1,694 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ * This is the HCI over NCI implementation, as specified in the 10.2
+ * section of the NCI 1.1 specification.
+ *
+ * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/skbuff.h>
+
+#include "../nfc.h"
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include <linux/nfc.h>
+
+struct nci_data {
+ u8 conn_id;
+ u8 pipe;
+ u8 cmd;
+ const u8 *data;
+ u32 data_len;
+} __packed;
+
+struct nci_hci_create_pipe_params {
+ u8 src_gate;
+ u8 dest_host;
+ u8 dest_gate;
+} __packed;
+
+struct nci_hci_create_pipe_resp {
+ u8 src_host;
+ u8 src_gate;
+ u8 dest_host;
+ u8 dest_gate;
+ u8 pipe;
+} __packed;
+
+struct nci_hci_delete_pipe_noti {
+ u8 pipe;
+} __packed;
+
+struct nci_hci_all_pipe_cleared_noti {
+ u8 host;
+} __packed;
+
+struct nci_hcp_message {
+ u8 header; /* type -cmd,evt,rsp- + instruction */
+ u8 data[];
+} __packed;
+
+struct nci_hcp_packet {
+ u8 header; /* cbit+pipe */
+ struct nci_hcp_message message;
+} __packed;
+
+#define NCI_HCI_ANY_SET_PARAMETER 0x01
+#define NCI_HCI_ANY_GET_PARAMETER 0x02
+#define NCI_HCI_ANY_CLOSE_PIPE 0x04
+
+#define NCI_HFP_NO_CHAINING 0x80
+
+#define NCI_NFCEE_ID_HCI 0x80
+
+#define NCI_EVT_HOT_PLUG 0x03
+
+#define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01
+
+/* HCP headers */
+#define NCI_HCI_HCP_PACKET_HEADER_LEN 1
+#define NCI_HCI_HCP_MESSAGE_HEADER_LEN 1
+#define NCI_HCI_HCP_HEADER_LEN 2
+
+/* HCP types */
+#define NCI_HCI_HCP_COMMAND 0x00
+#define NCI_HCI_HCP_EVENT 0x01
+#define NCI_HCI_HCP_RESPONSE 0x02
+
+#define NCI_HCI_ADM_NOTIFY_PIPE_CREATED 0x12
+#define NCI_HCI_ADM_NOTIFY_PIPE_DELETED 0x13
+#define NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15
+
+#define NCI_HCI_FRAGMENT 0x7f
+#define NCI_HCP_HEADER(type, instr) ((((type) & 0x03) << 6) |\
+ ((instr) & 0x3f))
+
+#define NCI_HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6)
+#define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f)
+#define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f)
+
+/* HCI core */
+static void nci_hci_reset_pipes(struct nci_hci_dev *hdev)
+{
+ int i;
+
+ for (i = 0; i < NCI_HCI_MAX_PIPES; i++) {
+ hdev->pipes[i].gate = NCI_HCI_INVALID_GATE;
+ hdev->pipes[i].host = NCI_HCI_INVALID_HOST;
+ }
+ memset(hdev->gate2pipe, NCI_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe));
+}
+
+static void nci_hci_reset_pipes_per_host(struct nci_dev *ndev, u8 host)
+{
+ int i;
+
+ for (i = 0; i < NCI_HCI_MAX_PIPES; i++) {
+ if (ndev->hci_dev->pipes[i].host == host) {
+ ndev->hci_dev->pipes[i].gate = NCI_HCI_INVALID_GATE;
+ ndev->hci_dev->pipes[i].host = NCI_HCI_INVALID_HOST;
+ }
+ }
+}
+
+/* Fragment HCI data over NCI packet.
+ * NFC Forum NCI 10.2.2 Data Exchange:
+ * The payload of the Data Packets sent on the Logical Connection SHALL be
+ * valid HCP packets, as defined within [ETSI_102622]. Each Data Packet SHALL
+ * contain a single HCP packet. NCI Segmentation and Reassembly SHALL NOT be
+ * applied to Data Messages in either direction. The HCI fragmentation mechanism
+ * is used if required.
+ */
+static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe,
+ const u8 data_type, const u8 *data,
+ size_t data_len)
+{
+ struct nci_conn_info *conn_info;
+ struct sk_buff *skb;
+ int len, i, r;
+ u8 cb = pipe;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ skb = nci_skb_alloc(ndev, 2 + conn_info->max_pkt_payload_len +
+ NCI_DATA_HDR_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, 2 + NCI_DATA_HDR_SIZE);
+ *skb_push(skb, 1) = data_type;
+
+ i = 0;
+ len = conn_info->max_pkt_payload_len;
+
+ do {
+ /* If last packet add NCI_HFP_NO_CHAINING */
+ if (i + conn_info->max_pkt_payload_len -
+ (skb->len + 1) >= data_len) {
+ cb |= NCI_HFP_NO_CHAINING;
+ len = data_len - i;
+ } else {
+ len = conn_info->max_pkt_payload_len - skb->len - 1;
+ }
+
+ *skb_push(skb, 1) = cb;
+
+ if (len > 0)
+ memcpy(skb_put(skb, len), data + i, len);
+
+ r = nci_send_data(ndev, conn_info->conn_id, skb);
+ if (r < 0)
+ return r;
+
+ i += len;
+ if (i < data_len) {
+ skb_trim(skb, 0);
+ skb_pull(skb, len);
+ }
+ } while (i < data_len);
+
+ return i;
+}
+
+static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt)
+{
+ struct nci_data *data = (struct nci_data *)opt;
+
+ nci_hci_send_data(ndev, data->pipe, data->cmd,
+ data->data, data->data_len);
+}
+
+int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event,
+ const u8 *param, size_t param_len)
+{
+ u8 pipe = ndev->hci_dev->gate2pipe[gate];
+
+ if (pipe == NCI_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ return nci_hci_send_data(ndev, pipe,
+ NCI_HCP_HEADER(NCI_HCI_HCP_EVENT, event),
+ param, param_len);
+}
+EXPORT_SYMBOL(nci_hci_send_event);
+
+int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd,
+ const u8 *param, size_t param_len,
+ struct sk_buff **skb)
+{
+ struct nci_conn_info *conn_info;
+ struct nci_data data;
+ int r;
+ u8 pipe = ndev->hci_dev->gate2pipe[gate];
+
+ if (pipe == NCI_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ data.conn_id = conn_info->conn_id;
+ data.pipe = pipe;
+ data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, cmd);
+ data.data = param;
+ data.data_len = param_len;
+
+ r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
+ msecs_to_jiffies(NCI_DATA_TIMEOUT));
+
+ if (r == NCI_STATUS_OK)
+ *skb = conn_info->rx_skb;
+
+ return r;
+}
+EXPORT_SYMBOL(nci_hci_send_cmd);
+
+static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe,
+ u8 event, struct sk_buff *skb)
+{
+ if (ndev->ops->hci_event_received)
+ ndev->ops->hci_event_received(ndev, pipe, event, skb);
+}
+
+static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe,
+ u8 cmd, struct sk_buff *skb)
+{
+ u8 gate = ndev->hci_dev->pipes[pipe].gate;
+ u8 status = NCI_HCI_ANY_OK | ~NCI_HCI_FRAGMENT;
+ u8 dest_gate, new_pipe;
+ struct nci_hci_create_pipe_resp *create_info;
+ struct nci_hci_delete_pipe_noti *delete_info;
+ struct nci_hci_all_pipe_cleared_noti *cleared_info;
+
+ pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd);
+
+ switch (cmd) {
+ case NCI_HCI_ADM_NOTIFY_PIPE_CREATED:
+ if (skb->len != 5) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ create_info = (struct nci_hci_create_pipe_resp *)skb->data;
+ dest_gate = create_info->dest_gate;
+ new_pipe = create_info->pipe;
+
+ /* Save the new created pipe and bind with local gate,
+ * the description for skb->data[3] is destination gate id
+ * but since we received this cmd from host controller, we
+ * are the destination and it is our local gate
+ */
+ ndev->hci_dev->gate2pipe[dest_gate] = new_pipe;
+ ndev->hci_dev->pipes[new_pipe].gate = dest_gate;
+ ndev->hci_dev->pipes[new_pipe].host =
+ create_info->src_host;
+ break;
+ case NCI_HCI_ANY_OPEN_PIPE:
+ /* If the pipe is not created report an error */
+ if (gate == NCI_HCI_INVALID_GATE) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ break;
+ case NCI_HCI_ADM_NOTIFY_PIPE_DELETED:
+ if (skb->len != 1) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
+ delete_info = (struct nci_hci_delete_pipe_noti *)skb->data;
+
+ ndev->hci_dev->pipes[delete_info->pipe].gate =
+ NCI_HCI_INVALID_GATE;
+ ndev->hci_dev->pipes[delete_info->pipe].host =
+ NCI_HCI_INVALID_HOST;
+ break;
+ case NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED:
+ if (skb->len != 1) {
+ status = NCI_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
+ cleared_info =
+ (struct nci_hci_all_pipe_cleared_noti *)skb->data;
+ nci_hci_reset_pipes_per_host(ndev, cleared_info->host);
+ break;
+ default:
+ pr_debug("Discarded unknown cmd %x to gate %x\n", cmd, gate);
+ break;
+ }
+
+ if (ndev->ops->hci_cmd_received)
+ ndev->ops->hci_cmd_received(ndev, pipe, cmd, skb);
+
+exit:
+ nci_hci_send_data(ndev, pipe, status, NULL, 0);
+
+ kfree_skb(skb);
+}
+
+static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe,
+ u8 result, struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ u8 status = result;
+
+ if (result != NCI_HCI_ANY_OK)
+ goto exit;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info) {
+ status = NCI_STATUS_REJECTED;
+ goto exit;
+ }
+
+ conn_info->rx_skb = skb;
+
+exit:
+ nci_req_complete(ndev, status);
+}
+
+/* Receive hcp message for pipe, with type and cmd.
+ * skb contains optional message data only.
+ */
+static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe,
+ u8 type, u8 instruction, struct sk_buff *skb)
+{
+ switch (type) {
+ case NCI_HCI_HCP_RESPONSE:
+ nci_hci_resp_received(ndev, pipe, instruction, skb);
+ break;
+ case NCI_HCI_HCP_COMMAND:
+ nci_hci_cmd_received(ndev, pipe, instruction, skb);
+ break;
+ case NCI_HCI_HCP_EVENT:
+ nci_hci_event_received(ndev, pipe, instruction, skb);
+ break;
+ default:
+ pr_err("UNKNOWN MSG Type %d, instruction=%d\n",
+ type, instruction);
+ kfree_skb(skb);
+ break;
+ }
+
+ nci_req_complete(ndev, 0);
+}
+
+static void nci_hci_msg_rx_work(struct work_struct *work)
+{
+ struct nci_hci_dev *hdev =
+ container_of(work, struct nci_hci_dev, msg_rx_work);
+ struct sk_buff *skb;
+ struct nci_hcp_message *message;
+ u8 pipe, type, instruction;
+
+ while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) {
+ pipe = skb->data[0];
+ skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN);
+ message = (struct nci_hcp_message *)skb->data;
+ type = NCI_HCP_MSG_GET_TYPE(message->header);
+ instruction = NCI_HCP_MSG_GET_CMD(message->header);
+ skb_pull(skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN);
+
+ nci_hci_hcp_message_rx(hdev->ndev, pipe,
+ type, instruction, skb);
+ }
+}
+
+void nci_hci_data_received_cb(void *context,
+ struct sk_buff *skb, int err)
+{
+ struct nci_dev *ndev = (struct nci_dev *)context;
+ struct nci_hcp_packet *packet;
+ u8 pipe, type, instruction;
+ struct sk_buff *hcp_skb;
+ struct sk_buff *frag_skb;
+ int msg_len;
+
+ pr_debug("\n");
+
+ if (err) {
+ nci_req_complete(ndev, err);
+ return;
+ }
+
+ packet = (struct nci_hcp_packet *)skb->data;
+ if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) {
+ skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb);
+ return;
+ }
+
+ /* it's the last fragment. Does it need re-aggregation? */
+ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) {
+ pipe = packet->header & NCI_HCI_FRAGMENT;
+ skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb);
+
+ msg_len = 0;
+ skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) {
+ msg_len += (frag_skb->len -
+ NCI_HCI_HCP_PACKET_HEADER_LEN);
+ }
+
+ hcp_skb = nfc_alloc_recv_skb(NCI_HCI_HCP_PACKET_HEADER_LEN +
+ msg_len, GFP_KERNEL);
+ if (!hcp_skb) {
+ nci_req_complete(ndev, -ENOMEM);
+ return;
+ }
+
+ *skb_put(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN) = pipe;
+
+ skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) {
+ msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN;
+ memcpy(skb_put(hcp_skb, msg_len), frag_skb->data +
+ NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len);
+ }
+
+ skb_queue_purge(&ndev->hci_dev->rx_hcp_frags);
+ } else {
+ packet->header &= NCI_HCI_FRAGMENT;
+ hcp_skb = skb;
+ }
+
+ /* if this is a response, dispatch immediately to
+ * unblock waiting cmd context. Otherwise, enqueue to dispatch
+ * in separate context where handler can also execute command.
+ */
+ packet = (struct nci_hcp_packet *)hcp_skb->data;
+ type = NCI_HCP_MSG_GET_TYPE(packet->message.header);
+ if (type == NCI_HCI_HCP_RESPONSE) {
+ pipe = packet->header;
+ instruction = NCI_HCP_MSG_GET_CMD(packet->message.header);
+ skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN +
+ NCI_HCI_HCP_MESSAGE_HEADER_LEN);
+ nci_hci_hcp_message_rx(ndev, pipe, type, instruction, hcp_skb);
+ } else {
+ skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb);
+ schedule_work(&ndev->hci_dev->msg_rx_work);
+ }
+}
+
+int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe)
+{
+ struct nci_data data;
+ struct nci_conn_info *conn_info;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ data.conn_id = conn_info->conn_id;
+ data.pipe = pipe;
+ data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND,
+ NCI_HCI_ANY_OPEN_PIPE);
+ data.data = NULL;
+ data.data_len = 0;
+
+ return nci_request(ndev, nci_hci_send_data_req,
+ (unsigned long)&data,
+ msecs_to_jiffies(NCI_DATA_TIMEOUT));
+}
+EXPORT_SYMBOL(nci_hci_open_pipe);
+
+int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx,
+ const u8 *param, size_t param_len)
+{
+ struct nci_conn_info *conn_info;
+ struct nci_data data;
+ int r;
+ u8 *tmp;
+ u8 pipe = ndev->hci_dev->gate2pipe[gate];
+
+ pr_debug("idx=%d to gate %d\n", idx, gate);
+
+ if (pipe == NCI_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ tmp = kmalloc(1 + param_len, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *tmp = idx;
+ memcpy(tmp + 1, param, param_len);
+
+ data.conn_id = conn_info->conn_id;
+ data.pipe = pipe;
+ data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND,
+ NCI_HCI_ANY_SET_PARAMETER);
+ data.data = tmp;
+ data.data_len = param_len + 1;
+
+ r = nci_request(ndev, nci_hci_send_data_req,
+ (unsigned long)&data,
+ msecs_to_jiffies(NCI_DATA_TIMEOUT));
+
+ kfree(tmp);
+ return r;
+}
+EXPORT_SYMBOL(nci_hci_set_param);
+
+int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx,
+ struct sk_buff **skb)
+{
+ struct nci_conn_info *conn_info;
+ struct nci_data data;
+ int r;
+ u8 pipe = ndev->hci_dev->gate2pipe[gate];
+
+ pr_debug("idx=%d to gate %d\n", idx, gate);
+
+ if (pipe == NCI_HCI_INVALID_PIPE)
+ return -EADDRNOTAVAIL;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ data.conn_id = conn_info->conn_id;
+ data.pipe = pipe;
+ data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND,
+ NCI_HCI_ANY_GET_PARAMETER);
+ data.data = &idx;
+ data.data_len = 1;
+
+ r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data,
+ msecs_to_jiffies(NCI_DATA_TIMEOUT));
+
+ if (r == NCI_STATUS_OK)
+ *skb = conn_info->rx_skb;
+
+ return r;
+}
+EXPORT_SYMBOL(nci_hci_get_param);
+
+int nci_hci_connect_gate(struct nci_dev *ndev,
+ u8 dest_host, u8 dest_gate, u8 pipe)
+{
+ int r;
+
+ if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE)
+ return 0;
+
+ if (ndev->hci_dev->gate2pipe[dest_gate] != NCI_HCI_INVALID_PIPE)
+ return -EADDRINUSE;
+
+ if (pipe != NCI_HCI_INVALID_PIPE)
+ goto open_pipe;
+
+ switch (dest_gate) {
+ case NCI_HCI_LINK_MGMT_GATE:
+ pipe = NCI_HCI_LINK_MGMT_PIPE;
+ break;
+ case NCI_HCI_ADMIN_GATE:
+ pipe = NCI_HCI_ADMIN_PIPE;
+ break;
+ }
+
+open_pipe:
+ r = nci_hci_open_pipe(ndev, pipe);
+ if (r < 0)
+ return r;
+
+ ndev->hci_dev->pipes[pipe].gate = dest_gate;
+ ndev->hci_dev->pipes[pipe].host = dest_host;
+ ndev->hci_dev->gate2pipe[dest_gate] = pipe;
+
+ return 0;
+}
+EXPORT_SYMBOL(nci_hci_connect_gate);
+
+static int nci_hci_dev_connect_gates(struct nci_dev *ndev,
+ u8 gate_count,
+ struct nci_hci_gate *gates)
+{
+ int r;
+
+ while (gate_count--) {
+ r = nci_hci_connect_gate(ndev, gates->dest_host,
+ gates->gate, gates->pipe);
+ if (r < 0)
+ return r;
+ gates++;
+ }
+
+ return 0;
+}
+
+int nci_hci_dev_session_init(struct nci_dev *ndev)
+{
+ struct nci_conn_info *conn_info;
+ struct sk_buff *skb;
+ int r;
+
+ ndev->hci_dev->count_pipes = 0;
+ ndev->hci_dev->expected_pipes = 0;
+
+ conn_info = ndev->hci_dev->conn_info;
+ if (!conn_info)
+ return -EPROTO;
+
+ conn_info->data_exchange_cb = nci_hci_data_received_cb;
+ conn_info->data_exchange_cb_context = ndev;
+
+ nci_hci_reset_pipes(ndev->hci_dev);
+
+ if (ndev->hci_dev->init_data.gates[0].gate != NCI_HCI_ADMIN_GATE)
+ return -EPROTO;
+
+ r = nci_hci_connect_gate(ndev,
+ ndev->hci_dev->init_data.gates[0].dest_host,
+ ndev->hci_dev->init_data.gates[0].gate,
+ ndev->hci_dev->init_data.gates[0].pipe);
+ if (r < 0)
+ goto exit;
+
+ r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE,
+ NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb);
+ if (r < 0)
+ goto exit;
+
+ if (skb->len &&
+ skb->len == strlen(ndev->hci_dev->init_data.session_id) &&
+ memcmp(ndev->hci_dev->init_data.session_id,
+ skb->data, skb->len) == 0 &&
+ ndev->ops->hci_load_session) {
+ /* Restore gate<->pipe table from some proprietary location. */
+ r = ndev->ops->hci_load_session(ndev);
+ if (r < 0)
+ goto exit;
+ } else {
+ r = nci_hci_dev_connect_gates(ndev,
+ ndev->hci_dev->init_data.gate_count,
+ ndev->hci_dev->init_data.gates);
+ if (r < 0)
+ goto exit;
+
+ r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE,
+ NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY,
+ ndev->hci_dev->init_data.session_id,
+ strlen(ndev->hci_dev->init_data.session_id));
+ }
+ if (r == 0)
+ goto exit;
+
+exit:
+ kfree_skb(skb);
+
+ return r;
+}
+EXPORT_SYMBOL(nci_hci_dev_session_init);
+
+struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev)
+{
+ struct nci_hci_dev *hdev;
+
+ hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+ if (!hdev)
+ return NULL;
+
+ skb_queue_head_init(&hdev->rx_hcp_frags);
+ INIT_WORK(&hdev->msg_rx_work, nci_hci_msg_rx_work);
+ skb_queue_head_init(&hdev->msg_rx_queue);
+ hdev->ndev = ndev;
+
+ return hdev;
+}
diff --git a/net/nfc/nci/lib.c b/net/nfc/nci/lib.c
new file mode 100644
index 000000000..ed774a2e9
--- /dev/null
+++ b/net/nfc/nci/lib.c
@@ -0,0 +1,85 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ *
+ * Written by Ilan Elias <ilane@ti.com>
+ *
+ * Acknowledgements:
+ * This file is based on lib.c, which was written
+ * by Maxim Krasnyansky.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+
+/* NCI status codes to Unix errno mapping */
+int nci_to_errno(__u8 code)
+{
+ switch (code) {
+ case NCI_STATUS_OK:
+ return 0;
+
+ case NCI_STATUS_REJECTED:
+ return -EBUSY;
+
+ case NCI_STATUS_RF_FRAME_CORRUPTED:
+ return -EBADMSG;
+
+ case NCI_STATUS_NOT_INITIALIZED:
+ return -EHOSTDOWN;
+
+ case NCI_STATUS_SYNTAX_ERROR:
+ case NCI_STATUS_SEMANTIC_ERROR:
+ case NCI_STATUS_INVALID_PARAM:
+ case NCI_STATUS_RF_PROTOCOL_ERROR:
+ case NCI_STATUS_NFCEE_PROTOCOL_ERROR:
+ return -EPROTO;
+
+ case NCI_STATUS_UNKNOWN_GID:
+ case NCI_STATUS_UNKNOWN_OID:
+ return -EBADRQC;
+
+ case NCI_STATUS_MESSAGE_SIZE_EXCEEDED:
+ return -EMSGSIZE;
+
+ case NCI_STATUS_DISCOVERY_ALREADY_STARTED:
+ return -EALREADY;
+
+ case NCI_STATUS_DISCOVERY_TARGET_ACTIVATION_FAILED:
+ case NCI_STATUS_NFCEE_INTERFACE_ACTIVATION_FAILED:
+ return -ECONNREFUSED;
+
+ case NCI_STATUS_RF_TRANSMISSION_ERROR:
+ case NCI_STATUS_NFCEE_TRANSMISSION_ERROR:
+ return -ECOMM;
+
+ case NCI_STATUS_RF_TIMEOUT_ERROR:
+ case NCI_STATUS_NFCEE_TIMEOUT_ERROR:
+ return -ETIMEDOUT;
+
+ case NCI_STATUS_FAILED:
+ default:
+ return -ENOSYS;
+ }
+}
+EXPORT_SYMBOL(nci_to_errno);
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
new file mode 100644
index 000000000..321807107
--- /dev/null
+++ b/net/nfc/nci/ntf.c
@@ -0,0 +1,800 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ *
+ * Copyright (C) 2014 Marvell International Ltd.
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ *
+ * Written by Ilan Elias <ilane@ti.com>
+ *
+ * Acknowledgements:
+ * This file is based on hci_event.c, which was written
+ * by Maxim Krasnyansky.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+
+#include "../nfc.h"
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include <linux/nfc.h>
+
+/* Handle NCI Notification packets */
+
+static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_core_conn_credit_ntf *ntf = (void *) skb->data;
+ struct nci_conn_info *conn_info;
+ int i;
+
+ pr_debug("num_entries %d\n", ntf->num_entries);
+
+ if (ntf->num_entries > NCI_MAX_NUM_CONN)
+ ntf->num_entries = NCI_MAX_NUM_CONN;
+
+ /* update the credits */
+ for (i = 0; i < ntf->num_entries; i++) {
+ ntf->conn_entries[i].conn_id =
+ nci_conn_id(&ntf->conn_entries[i].conn_id);
+
+ pr_debug("entry[%d]: conn_id %d, credits %d\n",
+ i, ntf->conn_entries[i].conn_id,
+ ntf->conn_entries[i].credits);
+
+ conn_info = nci_get_conn_info_by_conn_id(ndev,
+ ntf->conn_entries[i].conn_id);
+ if (!conn_info)
+ return;
+
+ atomic_add(ntf->conn_entries[i].credits,
+ &conn_info->credits_cnt);
+ }
+
+ /* trigger the next tx */
+ if (!skb_queue_empty(&ndev->tx_q))
+ queue_work(ndev->tx_wq, &ndev->tx_work);
+}
+
+static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+
+ if (atomic_read(&ndev->state) == NCI_W4_HOST_SELECT) {
+ /* Activation failed, so complete the request
+ (the state remains the same) */
+ nci_req_complete(ndev, status);
+ }
+}
+
+static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_core_intf_error_ntf *ntf = (void *) skb->data;
+
+ ntf->conn_id = nci_conn_id(&ntf->conn_id);
+
+ pr_debug("status 0x%x, conn_id %d\n", ntf->status, ntf->conn_id);
+
+ /* complete the data exchange transaction, if exists */
+ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags))
+ nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO);
+}
+
+static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev,
+ struct rf_tech_specific_params_nfca_poll *nfca_poll,
+ __u8 *data)
+{
+ nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data));
+ data += 2;
+
+ nfca_poll->nfcid1_len = min_t(__u8, *data++, NFC_NFCID1_MAXSIZE);
+
+ pr_debug("sens_res 0x%x, nfcid1_len %d\n",
+ nfca_poll->sens_res, nfca_poll->nfcid1_len);
+
+ memcpy(nfca_poll->nfcid1, data, nfca_poll->nfcid1_len);
+ data += nfca_poll->nfcid1_len;
+
+ nfca_poll->sel_res_len = *data++;
+
+ if (nfca_poll->sel_res_len != 0)
+ nfca_poll->sel_res = *data++;
+
+ pr_debug("sel_res_len %d, sel_res 0x%x\n",
+ nfca_poll->sel_res_len,
+ nfca_poll->sel_res);
+
+ return data;
+}
+
+static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev,
+ struct rf_tech_specific_params_nfcb_poll *nfcb_poll,
+ __u8 *data)
+{
+ nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE);
+
+ pr_debug("sensb_res_len %d\n", nfcb_poll->sensb_res_len);
+
+ memcpy(nfcb_poll->sensb_res, data, nfcb_poll->sensb_res_len);
+ data += nfcb_poll->sensb_res_len;
+
+ return data;
+}
+
+static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev,
+ struct rf_tech_specific_params_nfcf_poll *nfcf_poll,
+ __u8 *data)
+{
+ nfcf_poll->bit_rate = *data++;
+ nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE);
+
+ pr_debug("bit_rate %d, sensf_res_len %d\n",
+ nfcf_poll->bit_rate, nfcf_poll->sensf_res_len);
+
+ memcpy(nfcf_poll->sensf_res, data, nfcf_poll->sensf_res_len);
+ data += nfcf_poll->sensf_res_len;
+
+ return data;
+}
+
+static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev,
+ struct rf_tech_specific_params_nfcv_poll *nfcv_poll,
+ __u8 *data)
+{
+ ++data;
+ nfcv_poll->dsfid = *data++;
+ memcpy(nfcv_poll->uid, data, NFC_ISO15693_UID_MAXSIZE);
+ data += NFC_ISO15693_UID_MAXSIZE;
+ return data;
+}
+
+static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev,
+ struct rf_tech_specific_params_nfcf_listen *nfcf_listen,
+ __u8 *data)
+{
+ nfcf_listen->local_nfcid2_len = min_t(__u8, *data++,
+ NFC_NFCID2_MAXSIZE);
+ memcpy(nfcf_listen->local_nfcid2, data, nfcf_listen->local_nfcid2_len);
+ data += nfcf_listen->local_nfcid2_len;
+
+ return data;
+}
+
+static __u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol)
+{
+ if (ndev->ops->get_rfprotocol)
+ return ndev->ops->get_rfprotocol(ndev, rf_protocol);
+ return 0;
+}
+
+static int nci_add_new_protocol(struct nci_dev *ndev,
+ struct nfc_target *target,
+ __u8 rf_protocol,
+ __u8 rf_tech_and_mode,
+ void *params)
+{
+ struct rf_tech_specific_params_nfca_poll *nfca_poll;
+ struct rf_tech_specific_params_nfcb_poll *nfcb_poll;
+ struct rf_tech_specific_params_nfcf_poll *nfcf_poll;
+ struct rf_tech_specific_params_nfcv_poll *nfcv_poll;
+ __u32 protocol;
+
+ if (rf_protocol == NCI_RF_PROTOCOL_T1T)
+ protocol = NFC_PROTO_JEWEL_MASK;
+ else if (rf_protocol == NCI_RF_PROTOCOL_T2T)
+ protocol = NFC_PROTO_MIFARE_MASK;
+ else if (rf_protocol == NCI_RF_PROTOCOL_ISO_DEP)
+ if (rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE)
+ protocol = NFC_PROTO_ISO14443_MASK;
+ else
+ protocol = NFC_PROTO_ISO14443_B_MASK;
+ else if (rf_protocol == NCI_RF_PROTOCOL_T3T)
+ protocol = NFC_PROTO_FELICA_MASK;
+ else if (rf_protocol == NCI_RF_PROTOCOL_NFC_DEP)
+ protocol = NFC_PROTO_NFC_DEP_MASK;
+ else if (rf_protocol == NCI_RF_PROTOCOL_T5T)
+ protocol = NFC_PROTO_ISO15693_MASK;
+ else
+ protocol = nci_get_prop_rf_protocol(ndev, rf_protocol);
+
+ if (!(protocol & ndev->poll_prots)) {
+ pr_err("the target found does not have the desired protocol\n");
+ return -EPROTO;
+ }
+
+ if (rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) {
+ nfca_poll = (struct rf_tech_specific_params_nfca_poll *)params;
+
+ target->sens_res = nfca_poll->sens_res;
+ target->sel_res = nfca_poll->sel_res;
+ target->nfcid1_len = nfca_poll->nfcid1_len;
+ if (target->nfcid1_len > 0) {
+ memcpy(target->nfcid1, nfca_poll->nfcid1,
+ target->nfcid1_len);
+ }
+ } else if (rf_tech_and_mode == NCI_NFC_B_PASSIVE_POLL_MODE) {
+ nfcb_poll = (struct rf_tech_specific_params_nfcb_poll *)params;
+
+ target->sensb_res_len = nfcb_poll->sensb_res_len;
+ if (target->sensb_res_len > 0) {
+ memcpy(target->sensb_res, nfcb_poll->sensb_res,
+ target->sensb_res_len);
+ }
+ } else if (rf_tech_and_mode == NCI_NFC_F_PASSIVE_POLL_MODE) {
+ nfcf_poll = (struct rf_tech_specific_params_nfcf_poll *)params;
+
+ target->sensf_res_len = nfcf_poll->sensf_res_len;
+ if (target->sensf_res_len > 0) {
+ memcpy(target->sensf_res, nfcf_poll->sensf_res,
+ target->sensf_res_len);
+ }
+ } else if (rf_tech_and_mode == NCI_NFC_V_PASSIVE_POLL_MODE) {
+ nfcv_poll = (struct rf_tech_specific_params_nfcv_poll *)params;
+
+ target->is_iso15693 = 1;
+ target->iso15693_dsfid = nfcv_poll->dsfid;
+ memcpy(target->iso15693_uid, nfcv_poll->uid, NFC_ISO15693_UID_MAXSIZE);
+ } else {
+ pr_err("unsupported rf_tech_and_mode 0x%x\n", rf_tech_and_mode);
+ return -EPROTO;
+ }
+
+ target->supported_protocols |= protocol;
+
+ pr_debug("protocol 0x%x\n", protocol);
+
+ return 0;
+}
+
+static void nci_add_new_target(struct nci_dev *ndev,
+ struct nci_rf_discover_ntf *ntf)
+{
+ struct nfc_target *target;
+ int i, rc;
+
+ for (i = 0; i < ndev->n_targets; i++) {
+ target = &ndev->targets[i];
+ if (target->logical_idx == ntf->rf_discovery_id) {
+ /* This target already exists, add the new protocol */
+ nci_add_new_protocol(ndev, target, ntf->rf_protocol,
+ ntf->rf_tech_and_mode,
+ &ntf->rf_tech_specific_params);
+ return;
+ }
+ }
+
+ /* This is a new target, check if we've enough room */
+ if (ndev->n_targets == NCI_MAX_DISCOVERED_TARGETS) {
+ pr_debug("not enough room, ignoring new target...\n");
+ return;
+ }
+
+ target = &ndev->targets[ndev->n_targets];
+
+ rc = nci_add_new_protocol(ndev, target, ntf->rf_protocol,
+ ntf->rf_tech_and_mode,
+ &ntf->rf_tech_specific_params);
+ if (!rc) {
+ target->logical_idx = ntf->rf_discovery_id;
+ ndev->n_targets++;
+
+ pr_debug("logical idx %d, n_targets %d\n", target->logical_idx,
+ ndev->n_targets);
+ }
+}
+
+void nci_clear_target_list(struct nci_dev *ndev)
+{
+ memset(ndev->targets, 0,
+ (sizeof(struct nfc_target)*NCI_MAX_DISCOVERED_TARGETS));
+
+ ndev->n_targets = 0;
+}
+
+static void nci_rf_discover_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_rf_discover_ntf ntf;
+ __u8 *data = skb->data;
+ bool add_target = true;
+
+ ntf.rf_discovery_id = *data++;
+ ntf.rf_protocol = *data++;
+ ntf.rf_tech_and_mode = *data++;
+ ntf.rf_tech_specific_params_len = *data++;
+
+ pr_debug("rf_discovery_id %d\n", ntf.rf_discovery_id);
+ pr_debug("rf_protocol 0x%x\n", ntf.rf_protocol);
+ pr_debug("rf_tech_and_mode 0x%x\n", ntf.rf_tech_and_mode);
+ pr_debug("rf_tech_specific_params_len %d\n",
+ ntf.rf_tech_specific_params_len);
+
+ if (ntf.rf_tech_specific_params_len > 0) {
+ switch (ntf.rf_tech_and_mode) {
+ case NCI_NFC_A_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfca_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfca_poll), data);
+ break;
+
+ case NCI_NFC_B_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcb_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcb_poll), data);
+ break;
+
+ case NCI_NFC_F_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcf_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcf_poll), data);
+ break;
+
+ case NCI_NFC_V_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcv_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcv_poll), data);
+ break;
+
+ default:
+ pr_err("unsupported rf_tech_and_mode 0x%x\n",
+ ntf.rf_tech_and_mode);
+ data += ntf.rf_tech_specific_params_len;
+ add_target = false;
+ }
+ }
+
+ ntf.ntf_type = *data++;
+ pr_debug("ntf_type %d\n", ntf.ntf_type);
+
+ if (add_target == true)
+ nci_add_new_target(ndev, &ntf);
+
+ if (ntf.ntf_type == NCI_DISCOVER_NTF_TYPE_MORE) {
+ atomic_set(&ndev->state, NCI_W4_ALL_DISCOVERIES);
+ } else {
+ atomic_set(&ndev->state, NCI_W4_HOST_SELECT);
+ nfc_targets_found(ndev->nfc_dev, ndev->targets,
+ ndev->n_targets);
+ }
+}
+
+static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev,
+ struct nci_rf_intf_activated_ntf *ntf, __u8 *data)
+{
+ struct activation_params_nfca_poll_iso_dep *nfca_poll;
+ struct activation_params_nfcb_poll_iso_dep *nfcb_poll;
+
+ switch (ntf->activation_rf_tech_and_mode) {
+ case NCI_NFC_A_PASSIVE_POLL_MODE:
+ nfca_poll = &ntf->activation_params.nfca_poll_iso_dep;
+ nfca_poll->rats_res_len = min_t(__u8, *data++, 20);
+ pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len);
+ if (nfca_poll->rats_res_len > 0) {
+ memcpy(nfca_poll->rats_res,
+ data, nfca_poll->rats_res_len);
+ }
+ break;
+
+ case NCI_NFC_B_PASSIVE_POLL_MODE:
+ nfcb_poll = &ntf->activation_params.nfcb_poll_iso_dep;
+ nfcb_poll->attrib_res_len = min_t(__u8, *data++, 50);
+ pr_debug("attrib_res_len %d\n", nfcb_poll->attrib_res_len);
+ if (nfcb_poll->attrib_res_len > 0) {
+ memcpy(nfcb_poll->attrib_res,
+ data, nfcb_poll->attrib_res_len);
+ }
+ break;
+
+ default:
+ pr_err("unsupported activation_rf_tech_and_mode 0x%x\n",
+ ntf->activation_rf_tech_and_mode);
+ return NCI_STATUS_RF_PROTOCOL_ERROR;
+ }
+
+ return NCI_STATUS_OK;
+}
+
+static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev,
+ struct nci_rf_intf_activated_ntf *ntf, __u8 *data)
+{
+ struct activation_params_poll_nfc_dep *poll;
+ struct activation_params_listen_nfc_dep *listen;
+
+ switch (ntf->activation_rf_tech_and_mode) {
+ case NCI_NFC_A_PASSIVE_POLL_MODE:
+ case NCI_NFC_F_PASSIVE_POLL_MODE:
+ poll = &ntf->activation_params.poll_nfc_dep;
+ poll->atr_res_len = min_t(__u8, *data++,
+ NFC_ATR_RES_MAXSIZE - 2);
+ pr_debug("atr_res_len %d\n", poll->atr_res_len);
+ if (poll->atr_res_len > 0)
+ memcpy(poll->atr_res, data, poll->atr_res_len);
+ break;
+
+ case NCI_NFC_A_PASSIVE_LISTEN_MODE:
+ case NCI_NFC_F_PASSIVE_LISTEN_MODE:
+ listen = &ntf->activation_params.listen_nfc_dep;
+ listen->atr_req_len = min_t(__u8, *data++,
+ NFC_ATR_REQ_MAXSIZE - 2);
+ pr_debug("atr_req_len %d\n", listen->atr_req_len);
+ if (listen->atr_req_len > 0)
+ memcpy(listen->atr_req, data, listen->atr_req_len);
+ break;
+
+ default:
+ pr_err("unsupported activation_rf_tech_and_mode 0x%x\n",
+ ntf->activation_rf_tech_and_mode);
+ return NCI_STATUS_RF_PROTOCOL_ERROR;
+ }
+
+ return NCI_STATUS_OK;
+}
+
+static void nci_target_auto_activated(struct nci_dev *ndev,
+ struct nci_rf_intf_activated_ntf *ntf)
+{
+ struct nfc_target *target;
+ int rc;
+
+ target = &ndev->targets[ndev->n_targets];
+
+ rc = nci_add_new_protocol(ndev, target, ntf->rf_protocol,
+ ntf->activation_rf_tech_and_mode,
+ &ntf->rf_tech_specific_params);
+ if (rc)
+ return;
+
+ target->logical_idx = ntf->rf_discovery_id;
+ ndev->n_targets++;
+
+ pr_debug("logical idx %d, n_targets %d\n",
+ target->logical_idx, ndev->n_targets);
+
+ nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets);
+}
+
+static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev,
+ struct nci_rf_intf_activated_ntf *ntf)
+{
+ ndev->remote_gb_len = 0;
+
+ if (ntf->activation_params_len <= 0)
+ return NCI_STATUS_OK;
+
+ switch (ntf->activation_rf_tech_and_mode) {
+ case NCI_NFC_A_PASSIVE_POLL_MODE:
+ case NCI_NFC_F_PASSIVE_POLL_MODE:
+ ndev->remote_gb_len = min_t(__u8,
+ (ntf->activation_params.poll_nfc_dep.atr_res_len
+ - NFC_ATR_RES_GT_OFFSET),
+ NFC_ATR_RES_GB_MAXSIZE);
+ memcpy(ndev->remote_gb,
+ (ntf->activation_params.poll_nfc_dep.atr_res
+ + NFC_ATR_RES_GT_OFFSET),
+ ndev->remote_gb_len);
+ break;
+
+ case NCI_NFC_A_PASSIVE_LISTEN_MODE:
+ case NCI_NFC_F_PASSIVE_LISTEN_MODE:
+ ndev->remote_gb_len = min_t(__u8,
+ (ntf->activation_params.listen_nfc_dep.atr_req_len
+ - NFC_ATR_REQ_GT_OFFSET),
+ NFC_ATR_REQ_GB_MAXSIZE);
+ memcpy(ndev->remote_gb,
+ (ntf->activation_params.listen_nfc_dep.atr_req
+ + NFC_ATR_REQ_GT_OFFSET),
+ ndev->remote_gb_len);
+ break;
+
+ default:
+ pr_err("unsupported activation_rf_tech_and_mode 0x%x\n",
+ ntf->activation_rf_tech_and_mode);
+ return NCI_STATUS_RF_PROTOCOL_ERROR;
+ }
+
+ return NCI_STATUS_OK;
+}
+
+static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ struct nci_rf_intf_activated_ntf ntf;
+ __u8 *data = skb->data;
+ int err = NCI_STATUS_OK;
+
+ ntf.rf_discovery_id = *data++;
+ ntf.rf_interface = *data++;
+ ntf.rf_protocol = *data++;
+ ntf.activation_rf_tech_and_mode = *data++;
+ ntf.max_data_pkt_payload_size = *data++;
+ ntf.initial_num_credits = *data++;
+ ntf.rf_tech_specific_params_len = *data++;
+
+ pr_debug("rf_discovery_id %d\n", ntf.rf_discovery_id);
+ pr_debug("rf_interface 0x%x\n", ntf.rf_interface);
+ pr_debug("rf_protocol 0x%x\n", ntf.rf_protocol);
+ pr_debug("activation_rf_tech_and_mode 0x%x\n",
+ ntf.activation_rf_tech_and_mode);
+ pr_debug("max_data_pkt_payload_size 0x%x\n",
+ ntf.max_data_pkt_payload_size);
+ pr_debug("initial_num_credits 0x%x\n",
+ ntf.initial_num_credits);
+ pr_debug("rf_tech_specific_params_len %d\n",
+ ntf.rf_tech_specific_params_len);
+
+ /* If this contains a value of 0x00 (NFCEE Direct RF
+ * Interface) then all following parameters SHALL contain a
+ * value of 0 and SHALL be ignored.
+ */
+ if (ntf.rf_interface == NCI_RF_INTERFACE_NFCEE_DIRECT)
+ goto listen;
+
+ if (ntf.rf_tech_specific_params_len > 0) {
+ switch (ntf.activation_rf_tech_and_mode) {
+ case NCI_NFC_A_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfca_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfca_poll), data);
+ break;
+
+ case NCI_NFC_B_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcb_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcb_poll), data);
+ break;
+
+ case NCI_NFC_F_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcf_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcf_poll), data);
+ break;
+
+ case NCI_NFC_V_PASSIVE_POLL_MODE:
+ data = nci_extract_rf_params_nfcv_passive_poll(ndev,
+ &(ntf.rf_tech_specific_params.nfcv_poll), data);
+ break;
+
+ case NCI_NFC_A_PASSIVE_LISTEN_MODE:
+ /* no RF technology specific parameters */
+ break;
+
+ case NCI_NFC_F_PASSIVE_LISTEN_MODE:
+ data = nci_extract_rf_params_nfcf_passive_listen(ndev,
+ &(ntf.rf_tech_specific_params.nfcf_listen),
+ data);
+ break;
+
+ default:
+ pr_err("unsupported activation_rf_tech_and_mode 0x%x\n",
+ ntf.activation_rf_tech_and_mode);
+ err = NCI_STATUS_RF_PROTOCOL_ERROR;
+ goto exit;
+ }
+ }
+
+ ntf.data_exch_rf_tech_and_mode = *data++;
+ ntf.data_exch_tx_bit_rate = *data++;
+ ntf.data_exch_rx_bit_rate = *data++;
+ ntf.activation_params_len = *data++;
+
+ pr_debug("data_exch_rf_tech_and_mode 0x%x\n",
+ ntf.data_exch_rf_tech_and_mode);
+ pr_debug("data_exch_tx_bit_rate 0x%x\n", ntf.data_exch_tx_bit_rate);
+ pr_debug("data_exch_rx_bit_rate 0x%x\n", ntf.data_exch_rx_bit_rate);
+ pr_debug("activation_params_len %d\n", ntf.activation_params_len);
+
+ if (ntf.activation_params_len > 0) {
+ switch (ntf.rf_interface) {
+ case NCI_RF_INTERFACE_ISO_DEP:
+ err = nci_extract_activation_params_iso_dep(ndev,
+ &ntf, data);
+ break;
+
+ case NCI_RF_INTERFACE_NFC_DEP:
+ err = nci_extract_activation_params_nfc_dep(ndev,
+ &ntf, data);
+ break;
+
+ case NCI_RF_INTERFACE_FRAME:
+ /* no activation params */
+ break;
+
+ default:
+ pr_err("unsupported rf_interface 0x%x\n",
+ ntf.rf_interface);
+ err = NCI_STATUS_RF_PROTOCOL_ERROR;
+ break;
+ }
+ }
+
+exit:
+ if (err == NCI_STATUS_OK) {
+ conn_info = ndev->rf_conn_info;
+ if (!conn_info)
+ return;
+
+ conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size;
+ conn_info->initial_num_credits = ntf.initial_num_credits;
+
+ /* set the available credits to initial value */
+ atomic_set(&conn_info->credits_cnt,
+ conn_info->initial_num_credits);
+
+ /* store general bytes to be reported later in dep_link_up */
+ if (ntf.rf_interface == NCI_RF_INTERFACE_NFC_DEP) {
+ err = nci_store_general_bytes_nfc_dep(ndev, &ntf);
+ if (err != NCI_STATUS_OK)
+ pr_err("unable to store general bytes\n");
+ }
+ }
+
+ if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) {
+ /* Poll mode */
+ if (atomic_read(&ndev->state) == NCI_DISCOVERY) {
+ /* A single target was found and activated
+ * automatically */
+ atomic_set(&ndev->state, NCI_POLL_ACTIVE);
+ if (err == NCI_STATUS_OK)
+ nci_target_auto_activated(ndev, &ntf);
+ } else { /* ndev->state == NCI_W4_HOST_SELECT */
+ /* A selected target was activated, so complete the
+ * request */
+ atomic_set(&ndev->state, NCI_POLL_ACTIVE);
+ nci_req_complete(ndev, err);
+ }
+ } else {
+listen:
+ /* Listen mode */
+ atomic_set(&ndev->state, NCI_LISTEN_ACTIVE);
+ if (err == NCI_STATUS_OK &&
+ ntf.rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) {
+ err = nfc_tm_activated(ndev->nfc_dev,
+ NFC_PROTO_NFC_DEP_MASK,
+ NFC_COMM_PASSIVE,
+ ndev->remote_gb,
+ ndev->remote_gb_len);
+ if (err != NCI_STATUS_OK)
+ pr_err("error when signaling tm activation\n");
+ }
+ }
+}
+
+static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ struct nci_rf_deactivate_ntf *ntf = (void *) skb->data;
+
+ pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason);
+
+ conn_info = ndev->rf_conn_info;
+ if (!conn_info)
+ return;
+
+ /* drop tx data queue */
+ skb_queue_purge(&ndev->tx_q);
+
+ /* drop partial rx data packet */
+ if (ndev->rx_data_reassembly) {
+ kfree_skb(ndev->rx_data_reassembly);
+ ndev->rx_data_reassembly = NULL;
+ }
+
+ /* complete the data exchange transaction, if exists */
+ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags))
+ nci_data_exchange_complete(ndev, NULL, NCI_STATIC_RF_CONN_ID,
+ -EIO);
+
+ switch (ntf->type) {
+ case NCI_DEACTIVATE_TYPE_IDLE_MODE:
+ nci_clear_target_list(ndev);
+ atomic_set(&ndev->state, NCI_IDLE);
+ break;
+ case NCI_DEACTIVATE_TYPE_SLEEP_MODE:
+ case NCI_DEACTIVATE_TYPE_SLEEP_AF_MODE:
+ atomic_set(&ndev->state, NCI_W4_HOST_SELECT);
+ break;
+ case NCI_DEACTIVATE_TYPE_DISCOVERY:
+ nci_clear_target_list(ndev);
+ atomic_set(&ndev->state, NCI_DISCOVERY);
+ break;
+ }
+
+ nci_req_complete(ndev, NCI_STATUS_OK);
+}
+
+static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ u8 status = NCI_STATUS_OK;
+ struct nci_nfcee_discover_ntf *nfcee_ntf =
+ (struct nci_nfcee_discover_ntf *)skb->data;
+
+ pr_debug("\n");
+
+ /* NFCForum NCI 9.2.1 HCI Network Specific Handling
+ * If the NFCC supports the HCI Network, it SHALL return one,
+ * and only one, NFCEE_DISCOVER_NTF with a Protocol type of
+ * “HCI Access”, even if the HCI Network contains multiple NFCEEs.
+ */
+ ndev->hci_dev->nfcee_id = nfcee_ntf->nfcee_id;
+ ndev->cur_id = nfcee_ntf->nfcee_id;
+
+ nci_req_complete(ndev, status);
+}
+
+static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ pr_debug("\n");
+}
+
+void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ __u16 ntf_opcode = nci_opcode(skb->data);
+
+ pr_debug("NCI RX: MT=ntf, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n",
+ nci_pbf(skb->data),
+ nci_opcode_gid(ntf_opcode),
+ nci_opcode_oid(ntf_opcode),
+ nci_plen(skb->data));
+
+ /* strip the nci control header */
+ skb_pull(skb, NCI_CTRL_HDR_SIZE);
+
+ switch (ntf_opcode) {
+ case NCI_OP_CORE_CONN_CREDITS_NTF:
+ nci_core_conn_credits_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_GENERIC_ERROR_NTF:
+ nci_core_generic_error_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_INTF_ERROR_NTF:
+ nci_core_conn_intf_error_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DISCOVER_NTF:
+ nci_rf_discover_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_INTF_ACTIVATED_NTF:
+ nci_rf_intf_activated_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DEACTIVATE_NTF:
+ nci_rf_deactivate_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_NFCEE_DISCOVER_NTF:
+ nci_nfcee_discover_ntf_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_NFCEE_ACTION_NTF:
+ nci_nfcee_action_ntf_packet(ndev, skb);
+ break;
+
+ default:
+ pr_err("unknown ntf opcode 0x%x\n", ntf_opcode);
+ break;
+ }
+
+ kfree_skb(skb);
+}
diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c
new file mode 100644
index 000000000..02486bc2c
--- /dev/null
+++ b/net/nfc/nci/rsp.c
@@ -0,0 +1,355 @@
+/*
+ * The NFC Controller Interface is the communication protocol between an
+ * NFC Controller (NFCC) and a Device Host (DH).
+ *
+ * Copyright (C) 2011 Texas Instruments, Inc.
+ *
+ * Written by Ilan Elias <ilane@ti.com>
+ *
+ * Acknowledgements:
+ * This file is based on hci_event.c, which was written
+ * by Maxim Krasnyansky.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+
+#include "../nfc.h"
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+
+/* Handle NCI Response packets */
+
+static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ struct nci_core_reset_rsp *rsp = (void *) skb->data;
+
+ pr_debug("status 0x%x\n", rsp->status);
+
+ if (rsp->status == NCI_STATUS_OK) {
+ ndev->nci_ver = rsp->nci_ver;
+ pr_debug("nci_ver 0x%x, config_status 0x%x\n",
+ rsp->nci_ver, rsp->config_status);
+ }
+
+ nci_req_complete(ndev, rsp->status);
+}
+
+static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data;
+ struct nci_core_init_rsp_2 *rsp_2;
+
+ pr_debug("status 0x%x\n", rsp_1->status);
+
+ if (rsp_1->status != NCI_STATUS_OK)
+ goto exit;
+
+ ndev->nfcc_features = __le32_to_cpu(rsp_1->nfcc_features);
+ ndev->num_supported_rf_interfaces = rsp_1->num_supported_rf_interfaces;
+
+ if (ndev->num_supported_rf_interfaces >
+ NCI_MAX_SUPPORTED_RF_INTERFACES) {
+ ndev->num_supported_rf_interfaces =
+ NCI_MAX_SUPPORTED_RF_INTERFACES;
+ }
+
+ memcpy(ndev->supported_rf_interfaces,
+ rsp_1->supported_rf_interfaces,
+ ndev->num_supported_rf_interfaces);
+
+ rsp_2 = (void *) (skb->data + 6 + rsp_1->num_supported_rf_interfaces);
+
+ ndev->max_logical_connections = rsp_2->max_logical_connections;
+ ndev->max_routing_table_size =
+ __le16_to_cpu(rsp_2->max_routing_table_size);
+ ndev->max_ctrl_pkt_payload_len =
+ rsp_2->max_ctrl_pkt_payload_len;
+ ndev->max_size_for_large_params =
+ __le16_to_cpu(rsp_2->max_size_for_large_params);
+ ndev->manufact_id =
+ rsp_2->manufact_id;
+ ndev->manufact_specific_info =
+ __le32_to_cpu(rsp_2->manufact_specific_info);
+
+ pr_debug("nfcc_features 0x%x\n",
+ ndev->nfcc_features);
+ pr_debug("num_supported_rf_interfaces %d\n",
+ ndev->num_supported_rf_interfaces);
+ pr_debug("supported_rf_interfaces[0] 0x%x\n",
+ ndev->supported_rf_interfaces[0]);
+ pr_debug("supported_rf_interfaces[1] 0x%x\n",
+ ndev->supported_rf_interfaces[1]);
+ pr_debug("supported_rf_interfaces[2] 0x%x\n",
+ ndev->supported_rf_interfaces[2]);
+ pr_debug("supported_rf_interfaces[3] 0x%x\n",
+ ndev->supported_rf_interfaces[3]);
+ pr_debug("max_logical_connections %d\n",
+ ndev->max_logical_connections);
+ pr_debug("max_routing_table_size %d\n",
+ ndev->max_routing_table_size);
+ pr_debug("max_ctrl_pkt_payload_len %d\n",
+ ndev->max_ctrl_pkt_payload_len);
+ pr_debug("max_size_for_large_params %d\n",
+ ndev->max_size_for_large_params);
+ pr_debug("manufact_id 0x%x\n",
+ ndev->manufact_id);
+ pr_debug("manufact_specific_info 0x%x\n",
+ ndev->manufact_specific_info);
+
+exit:
+ nci_req_complete(ndev, rsp_1->status);
+}
+
+static void nci_core_set_config_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_core_set_config_rsp *rsp = (void *) skb->data;
+
+ pr_debug("status 0x%x\n", rsp->status);
+
+ nci_req_complete(ndev, rsp->status);
+}
+
+static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+
+ nci_req_complete(ndev, status);
+}
+
+static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+
+ if (status == NCI_STATUS_OK) {
+ atomic_set(&ndev->state, NCI_DISCOVERY);
+
+ conn_info = ndev->rf_conn_info;
+ if (!conn_info) {
+ conn_info = devm_kzalloc(&ndev->nfc_dev->dev,
+ sizeof(struct nci_conn_info),
+ GFP_KERNEL);
+ if (!conn_info) {
+ status = NCI_STATUS_REJECTED;
+ goto exit;
+ }
+ conn_info->conn_id = NCI_STATIC_RF_CONN_ID;
+ INIT_LIST_HEAD(&conn_info->list);
+ list_add(&conn_info->list, &ndev->conn_info_list);
+ ndev->rf_conn_info = conn_info;
+ }
+ }
+
+exit:
+ nci_req_complete(ndev, status);
+}
+
+static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+
+ /* Complete the request on intf_activated_ntf or generic_error_ntf */
+ if (status != NCI_STATUS_OK)
+ nci_req_complete(ndev, status);
+}
+
+static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+
+ /* If target was active, complete the request only in deactivate_ntf */
+ if ((status != NCI_STATUS_OK) ||
+ (atomic_read(&ndev->state) != NCI_POLL_ACTIVE)) {
+ nci_clear_target_list(ndev);
+ atomic_set(&ndev->state, NCI_IDLE);
+ nci_req_complete(ndev, status);
+ }
+}
+
+static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_nfcee_discover_rsp *discover_rsp;
+
+ if (skb->len != 2) {
+ nci_req_complete(ndev, NCI_STATUS_NFCEE_PROTOCOL_ERROR);
+ return;
+ }
+
+ discover_rsp = (struct nci_nfcee_discover_rsp *)skb->data;
+
+ if (discover_rsp->status != NCI_STATUS_OK ||
+ discover_rsp->num_nfcee == 0)
+ nci_req_complete(ndev, discover_rsp->status);
+}
+
+static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+ nci_req_complete(ndev, status);
+}
+
+static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ __u8 status = skb->data[0];
+ struct nci_conn_info *conn_info;
+ struct nci_core_conn_create_rsp *rsp;
+
+ pr_debug("status 0x%x\n", status);
+
+ if (status == NCI_STATUS_OK) {
+ rsp = (struct nci_core_conn_create_rsp *)skb->data;
+
+ conn_info = devm_kzalloc(&ndev->nfc_dev->dev,
+ sizeof(*conn_info), GFP_KERNEL);
+ if (!conn_info) {
+ status = NCI_STATUS_REJECTED;
+ goto exit;
+ }
+
+ conn_info->id = ndev->cur_id;
+ conn_info->conn_id = rsp->conn_id;
+
+ /* Note: data_exchange_cb and data_exchange_cb_context need to
+ * be specify out of nci_core_conn_create_rsp_packet
+ */
+
+ INIT_LIST_HEAD(&conn_info->list);
+ list_add(&conn_info->list, &ndev->conn_info_list);
+
+ if (ndev->cur_id == ndev->hci_dev->nfcee_id)
+ ndev->hci_dev->conn_info = conn_info;
+
+ conn_info->conn_id = rsp->conn_id;
+ conn_info->max_pkt_payload_len = rsp->max_ctrl_pkt_payload_len;
+ atomic_set(&conn_info->credits_cnt, rsp->credits_cnt);
+ }
+
+exit:
+ nci_req_complete(ndev, status);
+}
+
+static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
+{
+ struct nci_conn_info *conn_info;
+ __u8 status = skb->data[0];
+
+ pr_debug("status 0x%x\n", status);
+ if (status == NCI_STATUS_OK) {
+ conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_id);
+ if (conn_info) {
+ list_del(&conn_info->list);
+ devm_kfree(&ndev->nfc_dev->dev, conn_info);
+ }
+ }
+ nci_req_complete(ndev, status);
+}
+
+void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb)
+{
+ __u16 rsp_opcode = nci_opcode(skb->data);
+
+ /* we got a rsp, stop the cmd timer */
+ del_timer(&ndev->cmd_timer);
+
+ pr_debug("NCI RX: MT=rsp, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n",
+ nci_pbf(skb->data),
+ nci_opcode_gid(rsp_opcode),
+ nci_opcode_oid(rsp_opcode),
+ nci_plen(skb->data));
+
+ /* strip the nci control header */
+ skb_pull(skb, NCI_CTRL_HDR_SIZE);
+
+ switch (rsp_opcode) {
+ case NCI_OP_CORE_RESET_RSP:
+ nci_core_reset_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_INIT_RSP:
+ nci_core_init_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_SET_CONFIG_RSP:
+ nci_core_set_config_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_CONN_CREATE_RSP:
+ nci_core_conn_create_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_CORE_CONN_CLOSE_RSP:
+ nci_core_conn_close_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DISCOVER_MAP_RSP:
+ nci_rf_disc_map_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DISCOVER_RSP:
+ nci_rf_disc_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DISCOVER_SELECT_RSP:
+ nci_rf_disc_select_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_RF_DEACTIVATE_RSP:
+ nci_rf_deactivate_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_NFCEE_DISCOVER_RSP:
+ nci_nfcee_discover_rsp_packet(ndev, skb);
+ break;
+
+ case NCI_OP_NFCEE_MODE_SET_RSP:
+ nci_nfcee_mode_set_rsp_packet(ndev, skb);
+ break;
+
+ default:
+ pr_err("unknown rsp opcode 0x%x\n", rsp_opcode);
+ break;
+ }
+
+ kfree_skb(skb);
+
+ /* trigger the next cmd */
+ atomic_set(&ndev->cmd_cnt, 1);
+ if (!skb_queue_empty(&ndev->cmd_q))
+ queue_work(ndev->cmd_wq, &ndev->cmd_work);
+}
diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c
new file mode 100644
index 000000000..ec250e777
--- /dev/null
+++ b/net/nfc/nci/spi.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#define pr_fmt(fmt) "nci_spi: %s: " fmt, __func__
+
+#include <linux/export.h>
+#include <linux/spi/spi.h>
+#include <linux/crc-ccitt.h>
+#include <net/nfc/nci_core.h>
+
+#define NCI_SPI_ACK_SHIFT 6
+#define NCI_SPI_MSB_PAYLOAD_MASK 0x3F
+
+#define NCI_SPI_SEND_TIMEOUT (NCI_CMD_TIMEOUT > NCI_DATA_TIMEOUT ? \
+ NCI_CMD_TIMEOUT : NCI_DATA_TIMEOUT)
+
+#define NCI_SPI_DIRECT_WRITE 0x01
+#define NCI_SPI_DIRECT_READ 0x02
+
+#define ACKNOWLEDGE_NONE 0
+#define ACKNOWLEDGE_ACK 1
+#define ACKNOWLEDGE_NACK 2
+
+#define CRC_INIT 0xFFFF
+
+static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb,
+ int cs_change)
+{
+ struct spi_message m;
+ struct spi_transfer t;
+
+ memset(&t, 0, sizeof(struct spi_transfer));
+ /* a NULL skb means we just want the SPI chip select line to raise */
+ if (skb) {
+ t.tx_buf = skb->data;
+ t.len = skb->len;
+ } else {
+ /* still set tx_buf non NULL to make the driver happy */
+ t.tx_buf = &t;
+ t.len = 0;
+ }
+ t.cs_change = cs_change;
+ t.delay_usecs = nspi->xfer_udelay;
+
+ spi_message_init(&m);
+ spi_message_add_tail(&t, &m);
+
+ return spi_sync(nspi->spi, &m);
+}
+
+int nci_spi_send(struct nci_spi *nspi,
+ struct completion *write_handshake_completion,
+ struct sk_buff *skb)
+{
+ unsigned int payload_len = skb->len;
+ unsigned char *hdr;
+ int ret;
+ long completion_rc;
+
+ /* add the NCI SPI header to the start of the buffer */
+ hdr = skb_push(skb, NCI_SPI_HDR_LEN);
+ hdr[0] = NCI_SPI_DIRECT_WRITE;
+ hdr[1] = nspi->acknowledge_mode;
+ hdr[2] = payload_len >> 8;
+ hdr[3] = payload_len & 0xFF;
+
+ if (nspi->acknowledge_mode == NCI_SPI_CRC_ENABLED) {
+ u16 crc;
+
+ crc = crc_ccitt(CRC_INIT, skb->data, skb->len);
+ *skb_put(skb, 1) = crc >> 8;
+ *skb_put(skb, 1) = crc & 0xFF;
+ }
+
+ if (write_handshake_completion) {
+ /* Trick SPI driver to raise chip select */
+ ret = __nci_spi_send(nspi, NULL, 1);
+ if (ret)
+ goto done;
+
+ /* wait for NFC chip hardware handshake to complete */
+ if (wait_for_completion_timeout(write_handshake_completion,
+ msecs_to_jiffies(1000)) == 0) {
+ ret = -ETIME;
+ goto done;
+ }
+ }
+
+ ret = __nci_spi_send(nspi, skb, 0);
+ if (ret != 0 || nspi->acknowledge_mode == NCI_SPI_CRC_DISABLED)
+ goto done;
+
+ reinit_completion(&nspi->req_completion);
+ completion_rc = wait_for_completion_interruptible_timeout(
+ &nspi->req_completion,
+ NCI_SPI_SEND_TIMEOUT);
+
+ if (completion_rc <= 0 || nspi->req_result == ACKNOWLEDGE_NACK)
+ ret = -EIO;
+
+done:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nci_spi_send);
+
+/* ---- Interface to NCI SPI drivers ---- */
+
+/**
+ * nci_spi_allocate_spi - allocate a new nci spi
+ *
+ * @spi: SPI device
+ * @acknowledge_mode: Acknowledge mode used by the NFC device
+ * @delay: delay between transactions in us
+ * @ndev: nci dev to send incoming nci frames to
+ */
+struct nci_spi *nci_spi_allocate_spi(struct spi_device *spi,
+ u8 acknowledge_mode, unsigned int delay,
+ struct nci_dev *ndev)
+{
+ struct nci_spi *nspi;
+
+ nspi = devm_kzalloc(&spi->dev, sizeof(struct nci_spi), GFP_KERNEL);
+ if (!nspi)
+ return NULL;
+
+ nspi->acknowledge_mode = acknowledge_mode;
+ nspi->xfer_udelay = delay;
+
+ nspi->spi = spi;
+ nspi->ndev = ndev;
+ init_completion(&nspi->req_completion);
+
+ return nspi;
+}
+EXPORT_SYMBOL_GPL(nci_spi_allocate_spi);
+
+static int send_acknowledge(struct nci_spi *nspi, u8 acknowledge)
+{
+ struct sk_buff *skb;
+ unsigned char *hdr;
+ u16 crc;
+ int ret;
+
+ skb = nci_skb_alloc(nspi->ndev, 0, GFP_KERNEL);
+
+ /* add the NCI SPI header to the start of the buffer */
+ hdr = skb_push(skb, NCI_SPI_HDR_LEN);
+ hdr[0] = NCI_SPI_DIRECT_WRITE;
+ hdr[1] = NCI_SPI_CRC_ENABLED;
+ hdr[2] = acknowledge << NCI_SPI_ACK_SHIFT;
+ hdr[3] = 0;
+
+ crc = crc_ccitt(CRC_INIT, skb->data, skb->len);
+ *skb_put(skb, 1) = crc >> 8;
+ *skb_put(skb, 1) = crc & 0xFF;
+
+ ret = __nci_spi_send(nspi, skb, 0);
+
+ kfree_skb(skb);
+
+ return ret;
+}
+
+static struct sk_buff *__nci_spi_read(struct nci_spi *nspi)
+{
+ struct sk_buff *skb;
+ struct spi_message m;
+ unsigned char req[2], resp_hdr[2];
+ struct spi_transfer tx, rx;
+ unsigned short rx_len = 0;
+ int ret;
+
+ spi_message_init(&m);
+
+ memset(&tx, 0, sizeof(struct spi_transfer));
+ req[0] = NCI_SPI_DIRECT_READ;
+ req[1] = nspi->acknowledge_mode;
+ tx.tx_buf = req;
+ tx.len = 2;
+ tx.cs_change = 0;
+ spi_message_add_tail(&tx, &m);
+
+ memset(&rx, 0, sizeof(struct spi_transfer));
+ rx.rx_buf = resp_hdr;
+ rx.len = 2;
+ rx.cs_change = 1;
+ spi_message_add_tail(&rx, &m);
+
+ ret = spi_sync(nspi->spi, &m);
+ if (ret)
+ return NULL;
+
+ if (nspi->acknowledge_mode == NCI_SPI_CRC_ENABLED)
+ rx_len = ((resp_hdr[0] & NCI_SPI_MSB_PAYLOAD_MASK) << 8) +
+ resp_hdr[1] + NCI_SPI_CRC_LEN;
+ else
+ rx_len = (resp_hdr[0] << 8) | resp_hdr[1];
+
+ skb = nci_skb_alloc(nspi->ndev, rx_len, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ spi_message_init(&m);
+
+ memset(&rx, 0, sizeof(struct spi_transfer));
+ rx.rx_buf = skb_put(skb, rx_len);
+ rx.len = rx_len;
+ rx.cs_change = 0;
+ rx.delay_usecs = nspi->xfer_udelay;
+ spi_message_add_tail(&rx, &m);
+
+ ret = spi_sync(nspi->spi, &m);
+ if (ret)
+ goto receive_error;
+
+ if (nspi->acknowledge_mode == NCI_SPI_CRC_ENABLED) {
+ *skb_push(skb, 1) = resp_hdr[1];
+ *skb_push(skb, 1) = resp_hdr[0];
+ }
+
+ return skb;
+
+receive_error:
+ kfree_skb(skb);
+
+ return NULL;
+}
+
+static int nci_spi_check_crc(struct sk_buff *skb)
+{
+ u16 crc_data = (skb->data[skb->len - 2] << 8) |
+ skb->data[skb->len - 1];
+ int ret;
+
+ ret = (crc_ccitt(CRC_INIT, skb->data, skb->len - NCI_SPI_CRC_LEN)
+ == crc_data);
+
+ skb_trim(skb, skb->len - NCI_SPI_CRC_LEN);
+
+ return ret;
+}
+
+static u8 nci_spi_get_ack(struct sk_buff *skb)
+{
+ u8 ret;
+
+ ret = skb->data[0] >> NCI_SPI_ACK_SHIFT;
+
+ /* Remove NFCC part of the header: ACK, NACK and MSB payload len */
+ skb_pull(skb, 2);
+
+ return ret;
+}
+
+/**
+ * nci_spi_read - read frame from NCI SPI drivers
+ *
+ * @nspi: The nci spi
+ * Context: can sleep
+ *
+ * This call may only be used from a context that may sleep. The sleep
+ * is non-interruptible, and has no timeout.
+ *
+ * It returns an allocated skb containing the frame on success, or NULL.
+ */
+struct sk_buff *nci_spi_read(struct nci_spi *nspi)
+{
+ struct sk_buff *skb;
+
+ /* Retrieve frame from SPI */
+ skb = __nci_spi_read(nspi);
+ if (!skb)
+ goto done;
+
+ if (nspi->acknowledge_mode == NCI_SPI_CRC_ENABLED) {
+ if (!nci_spi_check_crc(skb)) {
+ send_acknowledge(nspi, ACKNOWLEDGE_NACK);
+ goto done;
+ }
+
+ /* In case of acknowledged mode: if ACK or NACK received,
+ * unblock completion of latest frame sent.
+ */
+ nspi->req_result = nci_spi_get_ack(skb);
+ if (nspi->req_result)
+ complete(&nspi->req_completion);
+ }
+
+ /* If there is no payload (ACK/NACK only frame),
+ * free the socket buffer
+ */
+ if (!skb->len) {
+ kfree_skb(skb);
+ skb = NULL;
+ goto done;
+ }
+
+ if (nspi->acknowledge_mode == NCI_SPI_CRC_ENABLED)
+ send_acknowledge(nspi, ACKNOWLEDGE_ACK);
+
+done:
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(nci_spi_read);
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
new file mode 100644
index 000000000..376303671
--- /dev/null
+++ b/net/nfc/netlink.c
@@ -0,0 +1,1689 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ * Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ * Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <net/genetlink.h>
+#include <linux/nfc.h>
+#include <linux/slab.h>
+
+#include "nfc.h"
+#include "llcp.h"
+
+static const struct genl_multicast_group nfc_genl_mcgrps[] = {
+ { .name = NFC_GENL_MCAST_EVENT_NAME, },
+};
+
+static struct genl_family nfc_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NFC_GENL_NAME,
+ .version = NFC_GENL_VERSION,
+ .maxattr = NFC_ATTR_MAX,
+};
+
+static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
+ [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
+ [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
+ .len = NFC_DEVICE_NAME_MAXSIZE },
+ [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 },
+ [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 },
+ [NFC_ATTR_RF_MODE] = { .type = NLA_U8 },
+ [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 },
+ [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 },
+ [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 },
+ [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 },
+ [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 },
+ [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 },
+ [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED },
+ [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
+ .len = NFC_FIRMWARE_NAME_MAXSIZE },
+ [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY },
+};
+
+static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = {
+ [NFC_SDP_ATTR_URI] = { .type = NLA_STRING },
+ [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 },
+};
+
+static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target,
+ struct netlink_callback *cb, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &nfc_genl_family, flags, NFC_CMD_GET_TARGET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+ if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) ||
+ nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) ||
+ nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) ||
+ nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res))
+ goto nla_put_failure;
+ if (target->nfcid1_len > 0 &&
+ nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len,
+ target->nfcid1))
+ goto nla_put_failure;
+ if (target->sensb_res_len > 0 &&
+ nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len,
+ target->sensb_res))
+ goto nla_put_failure;
+ if (target->sensf_res_len > 0 &&
+ nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len,
+ target->sensf_res))
+ goto nla_put_failure;
+
+ if (target->is_iso15693) {
+ if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID,
+ target->iso15693_dsfid) ||
+ nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID,
+ sizeof(target->iso15693_uid), target->iso15693_uid))
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+
+ rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize,
+ nfc_genl_family.attrbuf,
+ nfc_genl_family.maxattr,
+ nfc_genl_policy);
+ if (rc < 0)
+ return ERR_PTR(rc);
+
+ if (!nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX])
+ return ERR_PTR(-EINVAL);
+
+ idx = nla_get_u32(nfc_genl_family.attrbuf[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ return dev;
+}
+
+static int nfc_genl_dump_targets(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int i = cb->args[0];
+ struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+ int rc;
+
+ if (!dev) {
+ dev = __get_device_from_cb(cb);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ cb->args[1] = (long) dev;
+ }
+
+ device_lock(&dev->dev);
+
+ cb->seq = dev->targets_generation;
+
+ while (i < dev->n_targets) {
+ rc = nfc_genl_send_target(skb, &dev->targets[i], cb,
+ NLM_F_MULTI);
+ if (rc < 0)
+ break;
+
+ i++;
+ }
+
+ device_unlock(&dev->dev);
+
+ cb->args[0] = i;
+
+ return skb->len;
+}
+
+static int nfc_genl_dump_targets_done(struct netlink_callback *cb)
+{
+ struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+
+ if (dev)
+ nfc_put_device(dev);
+
+ return 0;
+}
+
+int nfc_genl_targets_found(struct nfc_dev *dev)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ dev->genl_data.poll_req_portid = 0;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_TARGETS_FOUND);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_TARGET_LOST);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) ||
+ nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_TM_ACTIVATED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_tm_deactivated(struct nfc_dev *dev)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_TM_DEACTIVATED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_device_added(struct nfc_dev *dev)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_DEVICE_ADDED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) ||
+ nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) ||
+ nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_device_removed(struct nfc_dev *dev)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_DEVICE_REMOVED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
+{
+ struct sk_buff *msg;
+ struct nlattr *sdp_attr, *uri_attr;
+ struct nfc_llcp_sdp_tlv *sdres;
+ struct hlist_node *n;
+ void *hdr;
+ int rc = -EMSGSIZE;
+ int i;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_LLC_SDRES);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ sdp_attr = nla_nest_start(msg, NFC_ATTR_LLC_SDP);
+ if (sdp_attr == NULL) {
+ rc = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ i = 1;
+ hlist_for_each_entry_safe(sdres, n, sdres_list, node) {
+ pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap);
+
+ uri_attr = nla_nest_start(msg, i++);
+ if (uri_attr == NULL) {
+ rc = -ENOMEM;
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, uri_attr);
+
+ hlist_del(&sdres->node);
+
+ nfc_llcp_free_sdp_tlv(sdres);
+ }
+
+ nla_nest_end(msg, sdp_attr);
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+
+free_msg:
+ nlmsg_free(msg);
+
+ nfc_llcp_free_sdp_tlv_list(sdres_list);
+
+ return rc;
+}
+
+int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_SE_ADDED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) ||
+ nla_put_u8(msg, NFC_ATTR_SE_TYPE, type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_SE_REMOVED);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
+ struct nfc_evt_transaction *evt_transaction)
+{
+ struct nfc_se *se;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_SE_TRANSACTION);
+ if (!hdr)
+ goto free_msg;
+
+ se = nfc_find_se(dev, se_idx);
+ if (!se)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) ||
+ nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) ||
+ nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len,
+ evt_transaction->aid) ||
+ nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len,
+ evt_transaction->params))
+ goto nla_put_failure;
+
+ /* evt_transaction is no more used */
+ devm_kfree(&dev->dev, evt_transaction);
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ /* evt_transaction is no more used */
+ devm_kfree(&dev->dev, evt_transaction);
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
+ u32 portid, u32 seq,
+ struct netlink_callback *cb,
+ int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags,
+ NFC_CMD_GET_DEVICE);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (cb)
+ genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+ if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) ||
+ nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) ||
+ nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) ||
+ nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_dump_devices(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+ struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+ bool first_call = false;
+
+ if (!iter) {
+ first_call = true;
+ iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+ cb->args[0] = (long) iter;
+ }
+
+ mutex_lock(&nfc_devlist_mutex);
+
+ cb->seq = nfc_devlist_generation;
+
+ if (first_call) {
+ nfc_device_iter_init(iter);
+ dev = nfc_device_iter_next(iter);
+ }
+
+ while (dev) {
+ int rc;
+
+ rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb, NLM_F_MULTI);
+ if (rc < 0)
+ break;
+
+ dev = nfc_device_iter_next(iter);
+ }
+
+ mutex_unlock(&nfc_devlist_mutex);
+
+ cb->args[1] = (long) dev;
+
+ return skb->len;
+}
+
+static int nfc_genl_dump_devices_done(struct netlink_callback *cb)
+{
+ struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+
+ nfc_device_iter_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
+ u8 comm_mode, u8 rf_mode)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ pr_debug("DEP link is up\n");
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+ if (rf_mode == NFC_RF_INITIATOR &&
+ nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) ||
+ nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ dev->dep_link_up = true;
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ pr_debug("DEP link is down\n");
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_CMD_DEP_LINK_DOWN);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct nfc_dev *dev;
+ u32 idx;
+ int rc = -ENOBUFS;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ rc = -ENOMEM;
+ goto out_putdev;
+ }
+
+ rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq,
+ NULL, 0);
+ if (rc < 0)
+ goto out_free;
+
+ nfc_put_device(dev);
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+out_putdev:
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_dev_up(dev);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_dev_down(dev);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+ u32 im_protocols = 0, tm_protocols = 0;
+
+ pr_debug("Poll start\n");
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] &&
+ !info->attrs[NFC_ATTR_PROTOCOLS]) &&
+ !info->attrs[NFC_ATTR_TM_PROTOCOLS]))
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ if (info->attrs[NFC_ATTR_TM_PROTOCOLS])
+ tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]);
+
+ if (info->attrs[NFC_ATTR_IM_PROTOCOLS])
+ im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]);
+ else if (info->attrs[NFC_ATTR_PROTOCOLS])
+ im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ mutex_lock(&dev->genl_data.genl_data_mutex);
+
+ rc = nfc_start_poll(dev, im_protocols, tm_protocols);
+ if (!rc)
+ dev->genl_data.poll_req_portid = info->snd_portid;
+
+ mutex_unlock(&dev->genl_data.genl_data_mutex);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ device_lock(&dev->dev);
+
+ if (!dev->polling) {
+ device_unlock(&dev->dev);
+ return -EINVAL;
+ }
+
+ device_unlock(&dev->dev);
+
+ mutex_lock(&dev->genl_data.genl_data_mutex);
+
+ if (dev->genl_data.poll_req_portid != info->snd_portid) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ rc = nfc_stop_poll(dev);
+ dev->genl_data.poll_req_portid = 0;
+
+out:
+ mutex_unlock(&dev->genl_data.genl_data_mutex);
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ u32 device_idx, target_idx, protocol;
+ int rc;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(device_idx);
+ if (!dev)
+ return -ENODEV;
+
+ target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);
+ protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);
+
+ nfc_deactivate_target(dev, target_idx);
+ rc = nfc_activate_target(dev, target_idx, protocol);
+
+ nfc_put_device(dev);
+ return 0;
+}
+
+static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc, tgt_idx;
+ u32 idx;
+ u8 comm;
+
+ pr_debug("DEP link up\n");
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_COMM_MODE])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+ if (!info->attrs[NFC_ATTR_TARGET_INDEX])
+ tgt_idx = NFC_TARGET_IDX_ANY;
+ else
+ tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);
+
+ comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]);
+
+ if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE)
+ return -EINVAL;
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_dep_link_up(dev, tgt_idx, comm);
+
+ nfc_put_device(dev);
+
+ return rc;
+}
+
+static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_dep_link_down(dev);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_send_params(struct sk_buff *msg,
+ struct nfc_llcp_local *local,
+ u32 portid, u32 seq)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0,
+ NFC_CMD_LLC_GET_PARAMS);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) ||
+ nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) ||
+ nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) ||
+ nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+ int rc = 0;
+ struct sk_buff *msg = NULL;
+ u32 idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ device_lock(&dev->dev);
+
+ local = nfc_llcp_find_local(dev);
+ if (!local) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq);
+
+exit:
+ device_unlock(&dev->dev);
+
+ nfc_put_device(dev);
+
+ if (rc < 0) {
+ if (msg)
+ nlmsg_free(msg);
+
+ return rc;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+ u8 rw = 0;
+ u16 miux = 0;
+ u32 idx;
+ int rc = 0;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] &&
+ !info->attrs[NFC_ATTR_LLC_PARAM_RW] &&
+ !info->attrs[NFC_ATTR_LLC_PARAM_MIUX]))
+ return -EINVAL;
+
+ if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) {
+ rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]);
+
+ if (rw > LLCP_MAX_RW)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) {
+ miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]);
+
+ if (miux > LLCP_MAX_MIUX)
+ return -EINVAL;
+ }
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ device_lock(&dev->dev);
+
+ local = nfc_llcp_find_local(dev);
+ if (!local) {
+ nfc_put_device(dev);
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) {
+ if (dev->dep_link_up) {
+ rc = -EINPROGRESS;
+ goto exit;
+ }
+
+ local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]);
+ }
+
+ if (info->attrs[NFC_ATTR_LLC_PARAM_RW])
+ local->rw = rw;
+
+ if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX])
+ local->miux = cpu_to_be16(miux);
+
+exit:
+ device_unlock(&dev->dev);
+
+ nfc_put_device(dev);
+
+ return rc;
+}
+
+static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ struct nfc_llcp_local *local;
+ struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1];
+ u32 idx;
+ u8 tid;
+ char *uri;
+ int rc = 0, rem;
+ size_t uri_len, tlvs_len;
+ struct hlist_head sdreq_list;
+ struct nfc_llcp_sdp_tlv *sdreq;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_LLC_SDP])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ device_lock(&dev->dev);
+
+ if (dev->dep_link_up == false) {
+ rc = -ENOLINK;
+ goto exit;
+ }
+
+ local = nfc_llcp_find_local(dev);
+ if (!local) {
+ nfc_put_device(dev);
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ INIT_HLIST_HEAD(&sdreq_list);
+
+ tlvs_len = 0;
+
+ nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) {
+ rc = nla_parse_nested(sdp_attrs, NFC_SDP_ATTR_MAX, attr,
+ nfc_sdp_genl_policy);
+
+ if (rc != 0) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ if (!sdp_attrs[NFC_SDP_ATTR_URI])
+ continue;
+
+ uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]);
+ if (uri_len == 0)
+ continue;
+
+ uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]);
+ if (uri == NULL || *uri == 0)
+ continue;
+
+ tid = local->sdreq_next_tid++;
+
+ sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len);
+ if (sdreq == NULL) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ tlvs_len += sdreq->tlv_len;
+
+ hlist_add_head(&sdreq->node, &sdreq_list);
+ }
+
+ if (hlist_empty(&sdreq_list)) {
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len);
+exit:
+ device_unlock(&dev->dev);
+
+ nfc_put_device(dev);
+
+ return rc;
+}
+
+static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx;
+ char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1];
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME],
+ sizeof(firmware_name));
+
+ rc = nfc_fw_download(dev, firmware_name);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+ u32 result)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_CMD_FW_DOWNLOAD);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) ||
+ nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) ||
+ nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx, se_idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_SE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+ se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_enable_se(dev, se_idx);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ int rc;
+ u32 idx, se_idx;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_SE_INDEX])
+ return -EINVAL;
+
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+ se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);
+
+ dev = nfc_get_device(idx);
+ if (!dev)
+ return -ENODEV;
+
+ rc = nfc_disable_se(dev, se_idx);
+
+ nfc_put_device(dev);
+ return rc;
+}
+
+static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev,
+ u32 portid, u32 seq,
+ struct netlink_callback *cb,
+ int flags)
+{
+ void *hdr;
+ struct nfc_se *se, *n;
+
+ list_for_each_entry_safe(se, n, &dev->secure_elements, list) {
+ hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags,
+ NFC_CMD_GET_SE);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (cb)
+ genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) ||
+ nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ }
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nfc_genl_dump_ses(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+ struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+ bool first_call = false;
+
+ if (!iter) {
+ first_call = true;
+ iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+ cb->args[0] = (long) iter;
+ }
+
+ mutex_lock(&nfc_devlist_mutex);
+
+ cb->seq = nfc_devlist_generation;
+
+ if (first_call) {
+ nfc_device_iter_init(iter);
+ dev = nfc_device_iter_next(iter);
+ }
+
+ while (dev) {
+ int rc;
+
+ rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb, NLM_F_MULTI);
+ if (rc < 0)
+ break;
+
+ dev = nfc_device_iter_next(iter);
+ }
+
+ mutex_unlock(&nfc_devlist_mutex);
+
+ cb->args[1] = (long) dev;
+
+ return skb->len;
+}
+
+static int nfc_genl_dump_ses_done(struct netlink_callback *cb)
+{
+ struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+
+ nfc_device_iter_exit(iter);
+ kfree(iter);
+
+ return 0;
+}
+
+static int nfc_se_io(struct nfc_dev *dev, u32 se_idx,
+ u8 *apdu, size_t apdu_length,
+ se_io_cb_t cb, void *cb_context)
+{
+ struct nfc_se *se;
+ int rc;
+
+ pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);
+
+ device_lock(&dev->dev);
+
+ if (!device_is_registered(&dev->dev)) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->dev_up) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!dev->ops->se_io) {
+ rc = -EOPNOTSUPP;
+ goto error;
+ }
+
+ se = nfc_find_se(dev, se_idx);
+ if (!se) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ if (se->state != NFC_SE_ENABLED) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ rc = dev->ops->se_io(dev, se_idx, apdu,
+ apdu_length, cb, cb_context);
+
+error:
+ device_unlock(&dev->dev);
+ return rc;
+}
+
+struct se_io_ctx {
+ u32 dev_idx;
+ u32 se_idx;
+};
+
+static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
+{
+ struct se_io_ctx *ctx = context;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ kfree(ctx);
+ return;
+ }
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_CMD_SE_IO);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) ||
+ nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ kfree(ctx);
+
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ kfree(ctx);
+
+ return;
+}
+
+static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nfc_dev *dev;
+ struct se_io_ctx *ctx;
+ u32 dev_idx, se_idx;
+ u8 *apdu;
+ size_t apdu_len;
+
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_SE_INDEX] ||
+ !info->attrs[NFC_ATTR_SE_APDU])
+ return -EINVAL;
+
+ dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
+ se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);
+
+ dev = nfc_get_device(dev_idx);
+ if (!dev)
+ return -ENODEV;
+
+ if (!dev->ops || !dev->ops->se_io)
+ return -ENOTSUPP;
+
+ apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]);
+ if (apdu_len == 0)
+ return -EINVAL;
+
+ apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]);
+ if (!apdu)
+ return -EINVAL;
+
+ ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->dev_idx = dev_idx;
+ ctx->se_idx = se_idx;
+
+ return nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx);
+}
+
+static const struct genl_ops nfc_genl_ops[] = {
+ {
+ .cmd = NFC_CMD_GET_DEVICE,
+ .doit = nfc_genl_get_device,
+ .dumpit = nfc_genl_dump_devices,
+ .done = nfc_genl_dump_devices_done,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_DEV_UP,
+ .doit = nfc_genl_dev_up,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_DEV_DOWN,
+ .doit = nfc_genl_dev_down,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_START_POLL,
+ .doit = nfc_genl_start_poll,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_STOP_POLL,
+ .doit = nfc_genl_stop_poll,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_DEP_LINK_UP,
+ .doit = nfc_genl_dep_link_up,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_DEP_LINK_DOWN,
+ .doit = nfc_genl_dep_link_down,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_GET_TARGET,
+ .dumpit = nfc_genl_dump_targets,
+ .done = nfc_genl_dump_targets_done,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_LLC_GET_PARAMS,
+ .doit = nfc_genl_llc_get_params,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_LLC_SET_PARAMS,
+ .doit = nfc_genl_llc_set_params,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_LLC_SDREQ,
+ .doit = nfc_genl_llc_sdreq,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_FW_DOWNLOAD,
+ .doit = nfc_genl_fw_download,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_ENABLE_SE,
+ .doit = nfc_genl_enable_se,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_DISABLE_SE,
+ .doit = nfc_genl_disable_se,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_GET_SE,
+ .dumpit = nfc_genl_dump_ses,
+ .done = nfc_genl_dump_ses_done,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_SE_IO,
+ .doit = nfc_genl_se_io,
+ .policy = nfc_genl_policy,
+ },
+ {
+ .cmd = NFC_CMD_ACTIVATE_TARGET,
+ .doit = nfc_genl_activate_target,
+ .policy = nfc_genl_policy,
+ },
+};
+
+
+struct urelease_work {
+ struct work_struct w;
+ u32 portid;
+};
+
+static void nfc_urelease_event_work(struct work_struct *work)
+{
+ struct urelease_work *w = container_of(work, struct urelease_work, w);
+ struct class_dev_iter iter;
+ struct nfc_dev *dev;
+
+ pr_debug("portid %d\n", w->portid);
+
+ mutex_lock(&nfc_devlist_mutex);
+
+ nfc_device_iter_init(&iter);
+ dev = nfc_device_iter_next(&iter);
+
+ while (dev) {
+ mutex_lock(&dev->genl_data.genl_data_mutex);
+
+ if (dev->genl_data.poll_req_portid == w->portid) {
+ nfc_stop_poll(dev);
+ dev->genl_data.poll_req_portid = 0;
+ }
+
+ mutex_unlock(&dev->genl_data.genl_data_mutex);
+
+ dev = nfc_device_iter_next(&iter);
+ }
+
+ nfc_device_iter_exit(&iter);
+
+ mutex_unlock(&nfc_devlist_mutex);
+
+ kfree(w);
+}
+
+static int nfc_genl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+ struct urelease_work *w;
+
+ if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC)
+ goto out;
+
+ pr_debug("NETLINK_URELEASE event from id %d\n", n->portid);
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (w) {
+ INIT_WORK((struct work_struct *) w, nfc_urelease_event_work);
+ w->portid = n->portid;
+ schedule_work((struct work_struct *) w);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+void nfc_genl_data_init(struct nfc_genl_data *genl_data)
+{
+ genl_data->poll_req_portid = 0;
+ mutex_init(&genl_data->genl_data_mutex);
+}
+
+void nfc_genl_data_exit(struct nfc_genl_data *genl_data)
+{
+ mutex_destroy(&genl_data->genl_data_mutex);
+}
+
+static struct notifier_block nl_notifier = {
+ .notifier_call = nfc_genl_rcv_nl_event,
+};
+
+/**
+ * nfc_genl_init() - Initialize netlink interface
+ *
+ * This initialization function registers the nfc netlink family.
+ */
+int __init nfc_genl_init(void)
+{
+ int rc;
+
+ rc = genl_register_family_with_ops_groups(&nfc_genl_family,
+ nfc_genl_ops,
+ nfc_genl_mcgrps);
+ if (rc)
+ return rc;
+
+ netlink_register_notifier(&nl_notifier);
+
+ return 0;
+}
+
+/**
+ * nfc_genl_exit() - Deinitialize netlink interface
+ *
+ * This exit function unregisters the nfc netlink family.
+ */
+void nfc_genl_exit(void)
+{
+ netlink_unregister_notifier(&nl_notifier);
+ genl_unregister_family(&nfc_genl_family);
+}
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
new file mode 100644
index 000000000..a8ce80b47
--- /dev/null
+++ b/net/nfc/nfc.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ * Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ * Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LOCAL_NFC_H
+#define __LOCAL_NFC_H
+
+#include <net/nfc/nfc.h>
+#include <net/sock.h>
+
+struct nfc_protocol {
+ int id;
+ struct proto *proto;
+ struct module *owner;
+ int (*create)(struct net *net, struct socket *sock,
+ const struct nfc_protocol *nfc_proto);
+};
+
+struct nfc_rawsock {
+ struct sock sk;
+ struct nfc_dev *dev;
+ u32 target_idx;
+ struct work_struct tx_work;
+ bool tx_work_scheduled;
+};
+
+struct nfc_sock_list {
+ struct hlist_head head;
+ rwlock_t lock;
+};
+
+#define nfc_rawsock(sk) ((struct nfc_rawsock *) sk)
+#define to_rawsock_sk(_tx_work) \
+ ((struct sock *) container_of(_tx_work, struct nfc_rawsock, tx_work))
+
+struct nfc_llcp_sdp_tlv;
+
+void nfc_llcp_mac_is_down(struct nfc_dev *dev);
+void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx,
+ u8 comm_mode, u8 rf_mode);
+int nfc_llcp_register_device(struct nfc_dev *dev);
+void nfc_llcp_unregister_device(struct nfc_dev *dev);
+int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len);
+u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len);
+int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb);
+struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev);
+int __init nfc_llcp_init(void);
+void nfc_llcp_exit(void);
+void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp);
+void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head);
+
+int __init rawsock_init(void);
+void rawsock_exit(void);
+
+int __init af_nfc_init(void);
+void af_nfc_exit(void);
+int nfc_proto_register(const struct nfc_protocol *nfc_proto);
+void nfc_proto_unregister(const struct nfc_protocol *nfc_proto);
+
+extern int nfc_devlist_generation;
+extern struct mutex nfc_devlist_mutex;
+
+int __init nfc_genl_init(void);
+void nfc_genl_exit(void);
+
+void nfc_genl_data_init(struct nfc_genl_data *genl_data);
+void nfc_genl_data_exit(struct nfc_genl_data *genl_data);
+
+int nfc_genl_targets_found(struct nfc_dev *dev);
+int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx);
+
+int nfc_genl_device_added(struct nfc_dev *dev);
+int nfc_genl_device_removed(struct nfc_dev *dev);
+
+int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
+ u8 comm_mode, u8 rf_mode);
+int nfc_genl_dep_link_down_event(struct nfc_dev *dev);
+
+int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol);
+int nfc_genl_tm_deactivated(struct nfc_dev *dev);
+
+int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list);
+
+int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type);
+int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx);
+int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
+ struct nfc_evt_transaction *evt_transaction);
+
+struct nfc_dev *nfc_get_device(unsigned int idx);
+
+static inline void nfc_put_device(struct nfc_dev *dev)
+{
+ put_device(&dev->dev);
+}
+
+static inline void nfc_device_iter_init(struct class_dev_iter *iter)
+{
+ class_dev_iter_init(iter, &nfc_class, NULL, NULL);
+}
+
+static inline struct nfc_dev *nfc_device_iter_next(struct class_dev_iter *iter)
+{
+ struct device *d = class_dev_iter_next(iter);
+ if (!d)
+ return NULL;
+
+ return to_nfc_dev(d);
+}
+
+static inline void nfc_device_iter_exit(struct class_dev_iter *iter)
+{
+ class_dev_iter_exit(iter);
+}
+
+int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name);
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+ u32 result);
+
+int nfc_dev_up(struct nfc_dev *dev);
+
+int nfc_dev_down(struct nfc_dev *dev);
+
+int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols);
+
+int nfc_stop_poll(struct nfc_dev *dev);
+
+int nfc_dep_link_up(struct nfc_dev *dev, int target_idx, u8 comm_mode);
+
+int nfc_dep_link_down(struct nfc_dev *dev);
+
+int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol);
+
+int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx);
+
+int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb,
+ data_exchange_cb_t cb, void *cb_context);
+
+int nfc_enable_se(struct nfc_dev *dev, u32 se_idx);
+int nfc_disable_se(struct nfc_dev *dev, u32 se_idx);
+
+#endif /* __LOCAL_NFC_H */
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
new file mode 100644
index 000000000..82b4e8024
--- /dev/null
+++ b/net/nfc/rawsock.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (C) 2011 Instituto Nokia de Tecnologia
+ *
+ * Authors:
+ * Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
+ * Lauro Ramos Venancio <lauro.venancio@openbossa.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__
+
+#include <net/tcp_states.h>
+#include <linux/nfc.h>
+#include <linux/export.h>
+
+#include "nfc.h"
+
+static struct nfc_sock_list raw_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(raw_sk_list.lock)
+};
+
+static void nfc_sock_link(struct nfc_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_add_node(sk, &l->head);
+ write_unlock(&l->lock);
+}
+
+static void nfc_sock_unlink(struct nfc_sock_list *l, struct sock *sk)
+{
+ write_lock(&l->lock);
+ sk_del_node_init(sk);
+ write_unlock(&l->lock);
+}
+
+static void rawsock_write_queue_purge(struct sock *sk)
+{
+ pr_debug("sk=%p\n", sk);
+
+ spin_lock_bh(&sk->sk_write_queue.lock);
+ __skb_queue_purge(&sk->sk_write_queue);
+ nfc_rawsock(sk)->tx_work_scheduled = false;
+ spin_unlock_bh(&sk->sk_write_queue.lock);
+}
+
+static void rawsock_report_error(struct sock *sk, int err)
+{
+ pr_debug("sk=%p err=%d\n", sk, err);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sk->sk_err = -err;
+ sk->sk_error_report(sk);
+
+ rawsock_write_queue_purge(sk);
+}
+
+static int rawsock_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ pr_debug("sock=%p sk=%p\n", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ if (sock->type == SOCK_RAW)
+ nfc_sock_unlink(&raw_sk_list, sk);
+
+ sock_orphan(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int rawsock_connect(struct socket *sock, struct sockaddr *_addr,
+ int len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_nfc *addr = (struct sockaddr_nfc *)_addr;
+ struct nfc_dev *dev;
+ int rc = 0;
+
+ pr_debug("sock=%p sk=%p flags=%d\n", sock, sk, flags);
+
+ if (!addr || len < sizeof(struct sockaddr_nfc) ||
+ addr->sa_family != AF_NFC)
+ return -EINVAL;
+
+ pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n",
+ addr->dev_idx, addr->target_idx, addr->nfc_protocol);
+
+ lock_sock(sk);
+
+ if (sock->state == SS_CONNECTED) {
+ rc = -EISCONN;
+ goto error;
+ }
+
+ dev = nfc_get_device(addr->dev_idx);
+ if (!dev) {
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (addr->target_idx > dev->target_next_idx - 1 ||
+ addr->target_idx < dev->target_next_idx - dev->n_targets) {
+ rc = -EINVAL;
+ goto error;
+ }
+
+ rc = nfc_activate_target(dev, addr->target_idx, addr->nfc_protocol);
+ if (rc)
+ goto put_dev;
+
+ nfc_rawsock(sk)->dev = dev;
+ nfc_rawsock(sk)->target_idx = addr->target_idx;
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ sk->sk_state_change(sk);
+
+ release_sock(sk);
+ return 0;
+
+put_dev:
+ nfc_put_device(dev);
+error:
+ release_sock(sk);
+ return rc;
+}
+
+static int rawsock_add_header(struct sk_buff *skb)
+{
+ *skb_push(skb, NFC_HEADER_SIZE) = 0;
+
+ return 0;
+}
+
+static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb,
+ int err)
+{
+ struct sock *sk = (struct sock *) context;
+
+ BUG_ON(in_irq());
+
+ pr_debug("sk=%p err=%d\n", sk, err);
+
+ if (err)
+ goto error;
+
+ err = rawsock_add_header(skb);
+ if (err)
+ goto error_skb;
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err)
+ goto error_skb;
+
+ spin_lock_bh(&sk->sk_write_queue.lock);
+ if (!skb_queue_empty(&sk->sk_write_queue))
+ schedule_work(&nfc_rawsock(sk)->tx_work);
+ else
+ nfc_rawsock(sk)->tx_work_scheduled = false;
+ spin_unlock_bh(&sk->sk_write_queue.lock);
+
+ sock_put(sk);
+ return;
+
+error_skb:
+ kfree_skb(skb);
+
+error:
+ rawsock_report_error(sk, err);
+ sock_put(sk);
+}
+
+static void rawsock_tx_work(struct work_struct *work)
+{
+ struct sock *sk = to_rawsock_sk(work);
+ struct nfc_dev *dev = nfc_rawsock(sk)->dev;
+ u32 target_idx = nfc_rawsock(sk)->target_idx;
+ struct sk_buff *skb;
+ int rc;
+
+ pr_debug("sk=%p target_idx=%u\n", sk, target_idx);
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ rawsock_write_queue_purge(sk);
+ return;
+ }
+
+ skb = skb_dequeue(&sk->sk_write_queue);
+
+ sock_hold(sk);
+ rc = nfc_data_exchange(dev, target_idx, skb,
+ rawsock_data_exchange_complete, sk);
+ if (rc) {
+ rawsock_report_error(sk, rc);
+ sock_put(sk);
+ }
+}
+
+static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct nfc_dev *dev = nfc_rawsock(sk)->dev;
+ struct sk_buff *skb;
+ int rc;
+
+ pr_debug("sock=%p sk=%p len=%zu\n", sock, sk, len);
+
+ if (msg->msg_namelen)
+ return -EOPNOTSUPP;
+
+ if (sock->state != SS_CONNECTED)
+ return -ENOTCONN;
+
+ skb = nfc_alloc_send_skb(dev, sk, msg->msg_flags, len, &rc);
+ if (skb == NULL)
+ return rc;
+
+ rc = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (rc < 0) {
+ kfree_skb(skb);
+ return rc;
+ }
+
+ spin_lock_bh(&sk->sk_write_queue.lock);
+ __skb_queue_tail(&sk->sk_write_queue, skb);
+ if (!nfc_rawsock(sk)->tx_work_scheduled) {
+ schedule_work(&nfc_rawsock(sk)->tx_work);
+ nfc_rawsock(sk)->tx_work_scheduled = true;
+ }
+ spin_unlock_bh(&sk->sk_write_queue.lock);
+
+ return len;
+}
+
+static int rawsock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ int noblock = flags & MSG_DONTWAIT;
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int copied;
+ int rc;
+
+ pr_debug("sock=%p sk=%p len=%zu flags=%d\n", sock, sk, len, flags);
+
+ skb = skb_recv_datagram(sk, flags, noblock, &rc);
+ if (!skb)
+ return rc;
+
+ copied = skb->len;
+ if (len < copied) {
+ msg->msg_flags |= MSG_TRUNC;
+ copied = len;
+ }
+
+ rc = skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ skb_free_datagram(sk, skb);
+
+ return rc ? : copied;
+}
+
+static const struct proto_ops rawsock_ops = {
+ .family = PF_NFC,
+ .owner = THIS_MODULE,
+ .release = rawsock_release,
+ .bind = sock_no_bind,
+ .connect = rawsock_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = rawsock_sendmsg,
+ .recvmsg = rawsock_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static const struct proto_ops rawsock_raw_ops = {
+ .family = PF_NFC,
+ .owner = THIS_MODULE,
+ .release = rawsock_release,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = sock_no_sendmsg,
+ .recvmsg = rawsock_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+static void rawsock_destruct(struct sock *sk)
+{
+ pr_debug("sk=%p\n", sk);
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ nfc_deactivate_target(nfc_rawsock(sk)->dev,
+ nfc_rawsock(sk)->target_idx);
+ nfc_put_device(nfc_rawsock(sk)->dev);
+ }
+
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Freeing alive NFC raw socket %p\n", sk);
+ return;
+ }
+}
+
+static int rawsock_create(struct net *net, struct socket *sock,
+ const struct nfc_protocol *nfc_proto)
+{
+ struct sock *sk;
+
+ pr_debug("sock=%p\n", sock);
+
+ if ((sock->type != SOCK_SEQPACKET) && (sock->type != SOCK_RAW))
+ return -ESOCKTNOSUPPORT;
+
+ if (sock->type == SOCK_RAW)
+ sock->ops = &rawsock_raw_ops;
+ else
+ sock->ops = &rawsock_ops;
+
+ sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sk->sk_protocol = nfc_proto->id;
+ sk->sk_destruct = rawsock_destruct;
+ sock->state = SS_UNCONNECTED;
+ if (sock->type == SOCK_RAW)
+ nfc_sock_link(&raw_sk_list, sk);
+ else {
+ INIT_WORK(&nfc_rawsock(sk)->tx_work, rawsock_tx_work);
+ nfc_rawsock(sk)->tx_work_scheduled = false;
+ }
+
+ return 0;
+}
+
+void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb,
+ u8 payload_type, u8 direction)
+{
+ struct sk_buff *skb_copy = NULL, *nskb;
+ struct sock *sk;
+ u8 *data;
+
+ read_lock(&raw_sk_list.lock);
+
+ sk_for_each(sk, &raw_sk_list.head) {
+ if (!skb_copy) {
+ skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE,
+ GFP_ATOMIC, true);
+ if (!skb_copy)
+ continue;
+
+ data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE);
+
+ data[0] = dev ? dev->idx : 0xFF;
+ data[1] = direction & 0x01;
+ data[1] |= (payload_type << 1);
+ }
+
+ nskb = skb_clone(skb_copy, GFP_ATOMIC);
+ if (!nskb)
+ continue;
+
+ if (sock_queue_rcv_skb(sk, nskb))
+ kfree_skb(nskb);
+ }
+
+ read_unlock(&raw_sk_list.lock);
+
+ kfree_skb(skb_copy);
+}
+EXPORT_SYMBOL(nfc_send_to_raw_sock);
+
+static struct proto rawsock_proto = {
+ .name = "NFC_RAW",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct nfc_rawsock),
+};
+
+static const struct nfc_protocol rawsock_nfc_proto = {
+ .id = NFC_SOCKPROTO_RAW,
+ .proto = &rawsock_proto,
+ .owner = THIS_MODULE,
+ .create = rawsock_create
+};
+
+int __init rawsock_init(void)
+{
+ int rc;
+
+ rc = nfc_proto_register(&rawsock_nfc_proto);
+
+ return rc;
+}
+
+void rawsock_exit(void)
+{
+ nfc_proto_unregister(&rawsock_nfc_proto);
+}
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
new file mode 100644
index 000000000..ed6b0f8dd
--- /dev/null
+++ b/net/openvswitch/Kconfig
@@ -0,0 +1,67 @@
+#
+# Open vSwitch
+#
+
+config OPENVSWITCH
+ tristate "Open vSwitch"
+ depends on INET
+ select LIBCRC32C
+ select MPLS
+ select NET_MPLS_GSO
+ ---help---
+ Open vSwitch is a multilayer Ethernet switch targeted at virtualized
+ environments. In addition to supporting a variety of features
+ expected in a traditional hardware switch, it enables fine-grained
+ programmatic extension and flow-based control of the network. This
+ control is useful in a wide variety of applications but is
+ particularly important in multi-server virtualization deployments,
+ which are often characterized by highly dynamic endpoints and the
+ need to maintain logical abstractions for multiple tenants.
+
+ The Open vSwitch datapath provides an in-kernel fast path for packet
+ forwarding. It is complemented by a userspace daemon, ovs-vswitchd,
+ which is able to accept configuration from a variety of sources and
+ translate it into packet processing rules.
+
+ See http://openvswitch.org for more information and userspace
+ utilities.
+
+ To compile this code as a module, choose M here: the module will be
+ called openvswitch.
+
+ If unsure, say N.
+
+config OPENVSWITCH_GRE
+ tristate "Open vSwitch GRE tunneling support"
+ depends on OPENVSWITCH
+ depends on NET_IPGRE_DEMUX
+ default OPENVSWITCH
+ ---help---
+ If you say Y here, then the Open vSwitch will be able create GRE
+ vport.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config OPENVSWITCH_VXLAN
+ tristate "Open vSwitch VXLAN tunneling support"
+ depends on OPENVSWITCH
+ depends on VXLAN
+ default OPENVSWITCH
+ ---help---
+ If you say Y here, then the Open vSwitch will be able create vxlan vport.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say Y.
+
+config OPENVSWITCH_GENEVE
+ tristate "Open vSwitch Geneve tunneling support"
+ depends on OPENVSWITCH
+ depends on GENEVE
+ default OPENVSWITCH
+ ---help---
+ If you say Y here, then the Open vSwitch will be able create geneve vport.
+
+ Say N to exclude this support and reduce the binary size.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
new file mode 100644
index 000000000..91b947841
--- /dev/null
+++ b/net/openvswitch/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for Open vSwitch.
+#
+
+obj-$(CONFIG_OPENVSWITCH) += openvswitch.o
+
+openvswitch-y := \
+ actions.o \
+ datapath.o \
+ dp_notify.o \
+ flow.o \
+ flow_netlink.o \
+ flow_table.o \
+ vport.o \
+ vport-internal_dev.o \
+ vport-netdev.o
+
+obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
+obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o
+obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
new file mode 100644
index 000000000..b491c1c29
--- /dev/null
+++ b/net/openvswitch/actions.c
@@ -0,0 +1,995 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/openvswitch.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/mpls.h>
+#include <net/sctp/checksum.h>
+
+#include "datapath.h"
+#include "flow.h"
+#include "vport.h"
+
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct nlattr *attr, int len);
+
+struct deferred_action {
+ struct sk_buff *skb;
+ const struct nlattr *actions;
+
+ /* Store pkt_key clone when creating deferred action. */
+ struct sw_flow_key pkt_key;
+};
+
+#define DEFERRED_ACTION_FIFO_SIZE 10
+struct action_fifo {
+ int head;
+ int tail;
+ /* Deferred action fifo queue storage. */
+ struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
+};
+
+static struct action_fifo __percpu *action_fifos;
+static DEFINE_PER_CPU(int, exec_actions_level);
+
+static void action_fifo_init(struct action_fifo *fifo)
+{
+ fifo->head = 0;
+ fifo->tail = 0;
+}
+
+static bool action_fifo_is_empty(const struct action_fifo *fifo)
+{
+ return (fifo->head == fifo->tail);
+}
+
+static struct deferred_action *action_fifo_get(struct action_fifo *fifo)
+{
+ if (action_fifo_is_empty(fifo))
+ return NULL;
+
+ return &fifo->fifo[fifo->tail++];
+}
+
+static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
+{
+ if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1)
+ return NULL;
+
+ return &fifo->fifo[fifo->head++];
+}
+
+/* Return true if fifo is not full */
+static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
+ const struct sw_flow_key *key,
+ const struct nlattr *attr)
+{
+ struct action_fifo *fifo;
+ struct deferred_action *da;
+
+ fifo = this_cpu_ptr(action_fifos);
+ da = action_fifo_put(fifo);
+ if (da) {
+ da->skb = skb;
+ da->actions = attr;
+ da->pkt_key = *key;
+ }
+
+ return da;
+}
+
+static void invalidate_flow_key(struct sw_flow_key *key)
+{
+ key->eth.type = htons(0);
+}
+
+static bool is_flow_key_valid(const struct sw_flow_key *key)
+{
+ return !!key->eth.type;
+}
+
+static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_action_push_mpls *mpls)
+{
+ __be32 *new_mpls_lse;
+ struct ethhdr *hdr;
+
+ /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ if (skb_cow_head(skb, MPLS_HLEN) < 0)
+ return -ENOMEM;
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+ skb_reset_mac_header(skb);
+
+ new_mpls_lse = (__be32 *)skb_mpls_header(skb);
+ *new_mpls_lse = mpls->mpls_lse;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
+ MPLS_HLEN, 0));
+
+ hdr = eth_hdr(skb);
+ hdr->h_proto = mpls->mpls_ethertype;
+
+ if (!skb->inner_protocol)
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb->protocol = mpls->mpls_ethertype;
+
+ invalidate_flow_key(key);
+ return 0;
+}
+
+static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
+ const __be16 ethertype)
+{
+ struct ethhdr *hdr;
+ int err;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN);
+
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+
+ /* skb_mpls_header() is used to locate the ethertype
+ * field correctly in the presence of VLAN tags.
+ */
+ hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
+ hdr->h_proto = ethertype;
+ if (eth_p_mpls(skb->protocol))
+ skb->protocol = ethertype;
+
+ invalidate_flow_key(key);
+ return 0;
+}
+
+/* 'KEY' must not have any bits set outside of the 'MASK' */
+#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
+#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK))
+
+static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const __be32 *mpls_lse, const __be32 *mask)
+{
+ __be32 *stack;
+ __be32 lse;
+ int err;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ stack = (__be32 *)skb_mpls_header(skb);
+ lse = MASKED(*stack, *mpls_lse, *mask);
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~(*stack), lse };
+
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ }
+
+ *stack = lse;
+ flow_key->mpls.top_lse = lse;
+ return 0;
+}
+
+static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int err;
+
+ err = skb_vlan_pop(skb);
+ if (skb_vlan_tag_present(skb))
+ invalidate_flow_key(key);
+ else
+ key->eth.tci = 0;
+ return err;
+}
+
+static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct ovs_action_push_vlan *vlan)
+{
+ if (skb_vlan_tag_present(skb))
+ invalidate_flow_key(key);
+ else
+ key->eth.tci = vlan->vlan_tci;
+ return skb_vlan_push(skb, vlan->vlan_tpid,
+ ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+}
+
+/* 'src' is already properly masked. */
+static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_)
+{
+ u16 *dst = (u16 *)dst_;
+ const u16 *src = (const u16 *)src_;
+ const u16 *mask = (const u16 *)mask_;
+
+ SET_MASKED(dst[0], src[0], mask[0]);
+ SET_MASKED(dst[1], src[1], mask[1]);
+ SET_MASKED(dst[2], src[2], mask[2]);
+}
+
+static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_ethernet *key,
+ const struct ovs_key_ethernet *mask)
+{
+ int err;
+
+ err = skb_ensure_writable(skb, ETH_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
+
+ ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
+ mask->eth_src);
+ ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
+ mask->eth_dst);
+
+ ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
+
+ ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
+ ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
+ return 0;
+}
+
+static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
+ __be32 *addr, __be32 new_addr)
+{
+ int transport_len = skb->len - skb_transport_offset(skb);
+
+ if (nh->protocol == IPPROTO_TCP) {
+ if (likely(transport_len >= sizeof(struct tcphdr)))
+ inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
+ *addr, new_addr, 1);
+ } else if (nh->protocol == IPPROTO_UDP) {
+ if (likely(transport_len >= sizeof(struct udphdr))) {
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&uh->check, skb,
+ *addr, new_addr, 1);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ }
+
+ csum_replace4(&nh->check, *addr, new_addr);
+ skb_clear_hash(skb);
+ *addr = new_addr;
+}
+
+static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
+ __be32 addr[4], const __be32 new_addr[4])
+{
+ int transport_len = skb->len - skb_transport_offset(skb);
+
+ if (l4_proto == NEXTHDR_TCP) {
+ if (likely(transport_len >= sizeof(struct tcphdr)))
+ inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
+ addr, new_addr, 1);
+ } else if (l4_proto == NEXTHDR_UDP) {
+ if (likely(transport_len >= sizeof(struct udphdr))) {
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace16(&uh->check, skb,
+ addr, new_addr, 1);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ } else if (l4_proto == NEXTHDR_ICMP) {
+ if (likely(transport_len >= sizeof(struct icmp6hdr)))
+ inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
+ skb, addr, new_addr, 1);
+ }
+}
+
+static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
+ const __be32 mask[4], __be32 masked[4])
+{
+ masked[0] = MASKED(old[0], addr[0], mask[0]);
+ masked[1] = MASKED(old[1], addr[1], mask[1]);
+ masked[2] = MASKED(old[2], addr[2], mask[2]);
+ masked[3] = MASKED(old[3], addr[3], mask[3]);
+}
+
+static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
+ __be32 addr[4], const __be32 new_addr[4],
+ bool recalculate_csum)
+{
+ if (recalculate_csum)
+ update_ipv6_checksum(skb, l4_proto, addr, new_addr);
+
+ skb_clear_hash(skb);
+ memcpy(addr, new_addr, sizeof(__be32[4]));
+}
+
+static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
+{
+ /* Bits 21-24 are always unmasked, so this retains their values. */
+ SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
+ SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
+ SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
+}
+
+static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
+ u8 mask)
+{
+ new_ttl = MASKED(nh->ttl, new_ttl, mask);
+
+ csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
+ nh->ttl = new_ttl;
+}
+
+static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_ipv4 *key,
+ const struct ovs_key_ipv4 *mask)
+{
+ struct iphdr *nh;
+ __be32 new_addr;
+ int err;
+
+ err = skb_ensure_writable(skb, skb_network_offset(skb) +
+ sizeof(struct iphdr));
+ if (unlikely(err))
+ return err;
+
+ nh = ip_hdr(skb);
+
+ /* Setting an IP addresses is typically only a side effect of
+ * matching on them in the current userspace implementation, so it
+ * makes sense to check if the value actually changed.
+ */
+ if (mask->ipv4_src) {
+ new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
+
+ if (unlikely(new_addr != nh->saddr)) {
+ set_ip_addr(skb, nh, &nh->saddr, new_addr);
+ flow_key->ipv4.addr.src = new_addr;
+ }
+ }
+ if (mask->ipv4_dst) {
+ new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
+
+ if (unlikely(new_addr != nh->daddr)) {
+ set_ip_addr(skb, nh, &nh->daddr, new_addr);
+ flow_key->ipv4.addr.dst = new_addr;
+ }
+ }
+ if (mask->ipv4_tos) {
+ ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos);
+ flow_key->ip.tos = nh->tos;
+ }
+ if (mask->ipv4_ttl) {
+ set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl);
+ flow_key->ip.ttl = nh->ttl;
+ }
+
+ return 0;
+}
+
+static bool is_ipv6_mask_nonzero(const __be32 addr[4])
+{
+ return !!(addr[0] | addr[1] | addr[2] | addr[3]);
+}
+
+static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_ipv6 *key,
+ const struct ovs_key_ipv6 *mask)
+{
+ struct ipv6hdr *nh;
+ int err;
+
+ err = skb_ensure_writable(skb, skb_network_offset(skb) +
+ sizeof(struct ipv6hdr));
+ if (unlikely(err))
+ return err;
+
+ nh = ipv6_hdr(skb);
+
+ /* Setting an IP addresses is typically only a side effect of
+ * matching on them in the current userspace implementation, so it
+ * makes sense to check if the value actually changed.
+ */
+ if (is_ipv6_mask_nonzero(mask->ipv6_src)) {
+ __be32 *saddr = (__be32 *)&nh->saddr;
+ __be32 masked[4];
+
+ mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked);
+
+ if (unlikely(memcmp(saddr, masked, sizeof(masked)))) {
+ set_ipv6_addr(skb, key->ipv6_proto, saddr, masked,
+ true);
+ memcpy(&flow_key->ipv6.addr.src, masked,
+ sizeof(flow_key->ipv6.addr.src));
+ }
+ }
+ if (is_ipv6_mask_nonzero(mask->ipv6_dst)) {
+ unsigned int offset = 0;
+ int flags = IP6_FH_F_SKIP_RH;
+ bool recalc_csum = true;
+ __be32 *daddr = (__be32 *)&nh->daddr;
+ __be32 masked[4];
+
+ mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked);
+
+ if (unlikely(memcmp(daddr, masked, sizeof(masked)))) {
+ if (ipv6_ext_hdr(nh->nexthdr))
+ recalc_csum = (ipv6_find_hdr(skb, &offset,
+ NEXTHDR_ROUTING,
+ NULL, &flags)
+ != NEXTHDR_ROUTING);
+
+ set_ipv6_addr(skb, key->ipv6_proto, daddr, masked,
+ recalc_csum);
+ memcpy(&flow_key->ipv6.addr.dst, masked,
+ sizeof(flow_key->ipv6.addr.dst));
+ }
+ }
+ if (mask->ipv6_tclass) {
+ ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass);
+ flow_key->ip.tos = ipv6_get_dsfield(nh);
+ }
+ if (mask->ipv6_label) {
+ set_ipv6_fl(nh, ntohl(key->ipv6_label),
+ ntohl(mask->ipv6_label));
+ flow_key->ipv6.label =
+ *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
+ }
+ if (mask->ipv6_hlimit) {
+ SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit);
+ flow_key->ip.ttl = nh->hop_limit;
+ }
+ return 0;
+}
+
+/* Must follow skb_ensure_writable() since that can move the skb data. */
+static void set_tp_port(struct sk_buff *skb, __be16 *port,
+ __be16 new_port, __sum16 *check)
+{
+ inet_proto_csum_replace2(check, skb, *port, new_port, 0);
+ *port = new_port;
+}
+
+static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_udp *key,
+ const struct ovs_key_udp *mask)
+{
+ struct udphdr *uh;
+ __be16 src, dst;
+ int err;
+
+ err = skb_ensure_writable(skb, skb_transport_offset(skb) +
+ sizeof(struct udphdr));
+ if (unlikely(err))
+ return err;
+
+ uh = udp_hdr(skb);
+ /* Either of the masks is non-zero, so do not bother checking them. */
+ src = MASKED(uh->source, key->udp_src, mask->udp_src);
+ dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst);
+
+ if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (likely(src != uh->source)) {
+ set_tp_port(skb, &uh->source, src, &uh->check);
+ flow_key->tp.src = src;
+ }
+ if (likely(dst != uh->dest)) {
+ set_tp_port(skb, &uh->dest, dst, &uh->check);
+ flow_key->tp.dst = dst;
+ }
+
+ if (unlikely(!uh->check))
+ uh->check = CSUM_MANGLED_0;
+ } else {
+ uh->source = src;
+ uh->dest = dst;
+ flow_key->tp.src = src;
+ flow_key->tp.dst = dst;
+ }
+
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_tcp *key,
+ const struct ovs_key_tcp *mask)
+{
+ struct tcphdr *th;
+ __be16 src, dst;
+ int err;
+
+ err = skb_ensure_writable(skb, skb_transport_offset(skb) +
+ sizeof(struct tcphdr));
+ if (unlikely(err))
+ return err;
+
+ th = tcp_hdr(skb);
+ src = MASKED(th->source, key->tcp_src, mask->tcp_src);
+ if (likely(src != th->source)) {
+ set_tp_port(skb, &th->source, src, &th->check);
+ flow_key->tp.src = src;
+ }
+ dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst);
+ if (likely(dst != th->dest)) {
+ set_tp_port(skb, &th->dest, dst, &th->check);
+ flow_key->tp.dst = dst;
+ }
+ skb_clear_hash(skb);
+
+ return 0;
+}
+
+static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
+ const struct ovs_key_sctp *key,
+ const struct ovs_key_sctp *mask)
+{
+ unsigned int sctphoff = skb_transport_offset(skb);
+ struct sctphdr *sh;
+ __le32 old_correct_csum, new_csum, old_csum;
+ int err;
+
+ err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr));
+ if (unlikely(err))
+ return err;
+
+ sh = sctp_hdr(skb);
+ old_csum = sh->checksum;
+ old_correct_csum = sctp_compute_cksum(skb, sctphoff);
+
+ sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src);
+ sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst);
+
+ new_csum = sctp_compute_cksum(skb, sctphoff);
+
+ /* Carry any checksum errors through. */
+ sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
+
+ skb_clear_hash(skb);
+ flow_key->tp.src = sh->source;
+ flow_key->tp.dst = sh->dest;
+
+ return 0;
+}
+
+static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+{
+ struct vport *vport = ovs_vport_rcu(dp, out_port);
+
+ if (likely(vport))
+ ovs_vport_send(vport, skb);
+ else
+ kfree_skb(skb);
+}
+
+static int output_userspace(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, const struct nlattr *attr)
+{
+ struct ovs_tunnel_info info;
+ struct dp_upcall_info upcall;
+ const struct nlattr *a;
+ int rem;
+
+ upcall.cmd = OVS_PACKET_CMD_ACTION;
+ upcall.userdata = NULL;
+ upcall.portid = 0;
+ upcall.egress_tun_info = NULL;
+
+ for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+ a = nla_next(a, &rem)) {
+ switch (nla_type(a)) {
+ case OVS_USERSPACE_ATTR_USERDATA:
+ upcall.userdata = a;
+ break;
+
+ case OVS_USERSPACE_ATTR_PID:
+ upcall.portid = nla_get_u32(a);
+ break;
+
+ case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
+ /* Get out tunnel info. */
+ struct vport *vport;
+
+ vport = ovs_vport_rcu(dp, nla_get_u32(a));
+ if (vport) {
+ int err;
+
+ err = ovs_vport_get_egress_tun_info(vport, skb,
+ &info);
+ if (!err)
+ upcall.egress_tun_info = &info;
+ }
+ break;
+ }
+
+ } /* End of switch. */
+ }
+
+ return ovs_dp_upcall(dp, skb, key, &upcall);
+}
+
+static int sample(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, const struct nlattr *attr)
+{
+ const struct nlattr *acts_list = NULL;
+ const struct nlattr *a;
+ int rem;
+
+ for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+ a = nla_next(a, &rem)) {
+ switch (nla_type(a)) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (prandom_u32() >= nla_get_u32(a))
+ return 0;
+ break;
+
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ acts_list = a;
+ break;
+ }
+ }
+
+ rem = nla_len(acts_list);
+ a = nla_data(acts_list);
+
+ /* Actions list is empty, do nothing */
+ if (unlikely(!rem))
+ return 0;
+
+ /* The only known usage of sample action is having a single user-space
+ * action. Treat this usage as a special case.
+ * The output_userspace() should clone the skb to be sent to the
+ * user space. This skb will be consumed by its caller.
+ */
+ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
+ nla_is_last(a, rem)))
+ return output_userspace(dp, skb, key, a);
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ /* Skip the sample action when out of memory. */
+ return 0;
+
+ if (!add_deferred_actions(skb, key, a)) {
+ if (net_ratelimit())
+ pr_warn("%s: deferred actions limit reached, dropping sample action\n",
+ ovs_dp_name(dp));
+
+ kfree_skb(skb);
+ }
+ return 0;
+}
+
+static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
+ const struct nlattr *attr)
+{
+ struct ovs_action_hash *hash_act = nla_data(attr);
+ u32 hash = 0;
+
+ /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */
+ hash = skb_get_hash(skb);
+ hash = jhash_1word(hash, hash_act->hash_basis);
+ if (!hash)
+ hash = 0x1;
+
+ key->ovs_flow_hash = hash;
+}
+
+static int execute_set_action(struct sk_buff *skb,
+ struct sw_flow_key *flow_key,
+ const struct nlattr *a)
+{
+ /* Only tunnel set execution is supported without a mask. */
+ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
+ OVS_CB(skb)->egress_tun_info = nla_data(a);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/* Mask is at the midpoint of the data. */
+#define get_mask(a, type) ((const type)nla_data(a) + 1)
+
+static int execute_masked_set_action(struct sk_buff *skb,
+ struct sw_flow_key *flow_key,
+ const struct nlattr *a)
+{
+ int err = 0;
+
+ switch (nla_type(a)) {
+ case OVS_KEY_ATTR_PRIORITY:
+ SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *));
+ flow_key->phy.priority = skb->priority;
+ break;
+
+ case OVS_KEY_ATTR_SKB_MARK:
+ SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
+ flow_key->phy.skb_mark = skb->mark;
+ break;
+
+ case OVS_KEY_ATTR_TUNNEL_INFO:
+ /* Masked data not supported for tunnel. */
+ err = -EINVAL;
+ break;
+
+ case OVS_KEY_ATTR_ETHERNET:
+ err = set_eth_addr(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_ethernet *));
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ err = set_ipv4(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_ipv4 *));
+ break;
+
+ case OVS_KEY_ATTR_IPV6:
+ err = set_ipv6(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_ipv6 *));
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ err = set_tcp(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_tcp *));
+ break;
+
+ case OVS_KEY_ATTR_UDP:
+ err = set_udp(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_udp *));
+ break;
+
+ case OVS_KEY_ATTR_SCTP:
+ err = set_sctp(skb, flow_key, nla_data(a),
+ get_mask(a, struct ovs_key_sctp *));
+ break;
+
+ case OVS_KEY_ATTR_MPLS:
+ err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
+ __be32 *));
+ break;
+ }
+
+ return err;
+}
+
+static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct nlattr *a, int rem)
+{
+ struct deferred_action *da;
+
+ if (!is_flow_key_valid(key)) {
+ int err;
+
+ err = ovs_flow_key_update(skb, key);
+ if (err)
+ return err;
+ }
+ BUG_ON(!is_flow_key_valid(key));
+
+ if (!nla_is_last(a, rem)) {
+ /* Recirc action is the not the last action
+ * of the action list, need to clone the skb.
+ */
+ skb = skb_clone(skb, GFP_ATOMIC);
+
+ /* Skip the recirc action when out of memory, but
+ * continue on with the rest of the action list.
+ */
+ if (!skb)
+ return 0;
+ }
+
+ da = add_deferred_actions(skb, key, NULL);
+ if (da) {
+ da->pkt_key.recirc_id = nla_get_u32(a);
+ } else {
+ kfree_skb(skb);
+
+ if (net_ratelimit())
+ pr_warn("%s: deferred action limit reached, drop recirc action\n",
+ ovs_dp_name(dp));
+ }
+
+ return 0;
+}
+
+/* Execute a list of actions against 'skb'. */
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ const struct nlattr *attr, int len)
+{
+ /* Every output action needs a separate clone of 'skb', but the common
+ * case is just a single output action, so that doing a clone and
+ * then freeing the original skbuff is wasteful. So the following code
+ * is slightly obscure just to avoid that.
+ */
+ int prev_port = -1;
+ const struct nlattr *a;
+ int rem;
+
+ for (a = attr, rem = len; rem > 0;
+ a = nla_next(a, &rem)) {
+ int err = 0;
+
+ if (unlikely(prev_port != -1)) {
+ struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
+
+ if (out_skb)
+ do_output(dp, out_skb, prev_port);
+
+ prev_port = -1;
+ }
+
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT:
+ prev_port = nla_get_u32(a);
+ break;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ output_userspace(dp, skb, key, a);
+ break;
+
+ case OVS_ACTION_ATTR_HASH:
+ execute_hash(skb, key, a);
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_MPLS:
+ err = push_mpls(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_MPLS:
+ err = pop_mpls(skb, key, nla_get_be16(a));
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ err = push_vlan(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ err = pop_vlan(skb, key);
+ break;
+
+ case OVS_ACTION_ATTR_RECIRC:
+ err = execute_recirc(dp, skb, key, a, rem);
+ if (nla_is_last(a, rem)) {
+ /* If this is the last action, the skb has
+ * been consumed or freed.
+ * Return immediately.
+ */
+ return err;
+ }
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = execute_set_action(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_SET_MASKED:
+ case OVS_ACTION_ATTR_SET_TO_MASKED:
+ err = execute_masked_set_action(skb, key, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample(dp, skb, key, a);
+ break;
+ }
+
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+
+ if (prev_port != -1)
+ do_output(dp, skb, prev_port);
+ else
+ consume_skb(skb);
+
+ return 0;
+}
+
+static void process_deferred_actions(struct datapath *dp)
+{
+ struct action_fifo *fifo = this_cpu_ptr(action_fifos);
+
+ /* Do not touch the FIFO in case there is no deferred actions. */
+ if (action_fifo_is_empty(fifo))
+ return;
+
+ /* Finishing executing all deferred actions. */
+ do {
+ struct deferred_action *da = action_fifo_get(fifo);
+ struct sk_buff *skb = da->skb;
+ struct sw_flow_key *key = &da->pkt_key;
+ const struct nlattr *actions = da->actions;
+
+ if (actions)
+ do_execute_actions(dp, skb, key, actions,
+ nla_len(actions));
+ else
+ ovs_dp_process_packet(skb, key);
+ } while (!action_fifo_is_empty(fifo));
+
+ /* Reset FIFO for the next packet. */
+ action_fifo_init(fifo);
+}
+
+/* Execute a list of actions against 'skb'. */
+int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_actions *acts,
+ struct sw_flow_key *key)
+{
+ int level = this_cpu_read(exec_actions_level);
+ int err;
+
+ this_cpu_inc(exec_actions_level);
+ OVS_CB(skb)->egress_tun_info = NULL;
+ err = do_execute_actions(dp, skb, key,
+ acts->actions, acts->actions_len);
+
+ if (!level)
+ process_deferred_actions(dp);
+
+ this_cpu_dec(exec_actions_level);
+ return err;
+}
+
+int action_fifos_init(void)
+{
+ action_fifos = alloc_percpu(struct action_fifo);
+ if (!action_fifos)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void action_fifos_exit(void)
+{
+ free_percpu(action_fifos);
+}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
new file mode 100644
index 000000000..096c6276e
--- /dev/null
+++ b/net/openvswitch/datapath.c
@@ -0,0 +1,2333 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/etherdevice.h>
+#include <linux/genetlink.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/ethtool.h>
+#include <linux/wait.h>
+#include <asm/div64.h>
+#include <linux/highmem.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/inetdevice.h>
+#include <linux/list.h>
+#include <linux/openvswitch.h>
+#include <linux/rculist.h>
+#include <linux/dmi.h>
+#include <net/genetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "datapath.h"
+#include "flow.h"
+#include "flow_table.h"
+#include "flow_netlink.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+int ovs_net_id __read_mostly;
+EXPORT_SYMBOL_GPL(ovs_net_id);
+
+static struct genl_family dp_packet_genl_family;
+static struct genl_family dp_flow_genl_family;
+static struct genl_family dp_datapath_genl_family;
+
+static const struct nla_policy flow_policy[];
+
+static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
+ .name = OVS_FLOW_MCGROUP,
+};
+
+static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
+ .name = OVS_DATAPATH_MCGROUP,
+};
+
+static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
+ .name = OVS_VPORT_MCGROUP,
+};
+
+/* Check if need to build a reply message.
+ * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */
+static bool ovs_must_notify(struct genl_family *family, struct genl_info *info,
+ unsigned int group)
+{
+ return info->nlhdr->nlmsg_flags & NLM_F_ECHO ||
+ genl_has_listeners(family, genl_info_net(info), group);
+}
+
+static void ovs_notify(struct genl_family *family,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ genl_notify(family, skb, genl_info_net(info), info->snd_portid,
+ 0, info->nlhdr, GFP_KERNEL);
+}
+
+/**
+ * DOC: Locking:
+ *
+ * All writes e.g. Writes to device state (add/remove datapath, port, set
+ * operations on vports, etc.), Writes to other state (flow table
+ * modifications, set miscellaneous datapath parameters, etc.) are protected
+ * by ovs_lock.
+ *
+ * Reads are protected by RCU.
+ *
+ * There are a few special cases (mostly stats) that have their own
+ * synchronization but they nest under all of above and don't interact with
+ * each other.
+ *
+ * The RTNL lock nests inside ovs_mutex.
+ */
+
+static DEFINE_MUTEX(ovs_mutex);
+
+void ovs_lock(void)
+{
+ mutex_lock(&ovs_mutex);
+}
+
+void ovs_unlock(void)
+{
+ mutex_unlock(&ovs_mutex);
+}
+
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovsl_is_held(void)
+{
+ if (debug_locks)
+ return lockdep_is_held(&ovs_mutex);
+ else
+ return 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held);
+#endif
+
+static struct vport *new_vport(const struct vport_parms *);
+static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
+ const struct sw_flow_key *,
+ const struct dp_upcall_info *);
+static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
+ const struct sw_flow_key *,
+ const struct dp_upcall_info *);
+
+/* Must be called with rcu_read_lock. */
+static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex)
+{
+ struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex);
+
+ if (dev) {
+ struct vport *vport = ovs_internal_dev_get_vport(dev);
+ if (vport)
+ return vport->dp;
+ }
+
+ return NULL;
+}
+
+/* The caller must hold either ovs_mutex or rcu_read_lock to keep the
+ * returned dp pointer valid.
+ */
+static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
+{
+ struct datapath *dp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+ rcu_read_lock();
+ dp = get_dp_rcu(net, dp_ifindex);
+ rcu_read_unlock();
+
+ return dp;
+}
+
+/* Must be called with rcu_read_lock or ovs_mutex. */
+const char *ovs_dp_name(const struct datapath *dp)
+{
+ struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
+ return vport->ops->get_name(vport);
+}
+
+static int get_dpifindex(const struct datapath *dp)
+{
+ struct vport *local;
+ int ifindex;
+
+ rcu_read_lock();
+
+ local = ovs_vport_rcu(dp, OVSP_LOCAL);
+ if (local)
+ ifindex = netdev_vport_priv(local)->dev->ifindex;
+ else
+ ifindex = 0;
+
+ rcu_read_unlock();
+
+ return ifindex;
+}
+
+static void destroy_dp_rcu(struct rcu_head *rcu)
+{
+ struct datapath *dp = container_of(rcu, struct datapath, rcu);
+
+ ovs_flow_tbl_destroy(&dp->table);
+ free_percpu(dp->stats_percpu);
+ kfree(dp->ports);
+ kfree(dp);
+}
+
+static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
+ u16 port_no)
+{
+ return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
+{
+ struct vport *vport;
+ struct hlist_head *head;
+
+ head = vport_hash_bucket(dp, port_no);
+ hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
+ if (vport->port_no == port_no)
+ return vport;
+ }
+ return NULL;
+}
+
+/* Called with ovs_mutex. */
+static struct vport *new_vport(const struct vport_parms *parms)
+{
+ struct vport *vport;
+
+ vport = ovs_vport_add(parms);
+ if (!IS_ERR(vport)) {
+ struct datapath *dp = parms->dp;
+ struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
+
+ hlist_add_head_rcu(&vport->dp_hash_node, head);
+ }
+ return vport;
+}
+
+void ovs_dp_detach_port(struct vport *p)
+{
+ ASSERT_OVSL();
+
+ /* First drop references to device. */
+ hlist_del_rcu(&p->dp_hash_node);
+
+ /* Then destroy it. */
+ ovs_vport_del(p);
+}
+
+/* Must be called with rcu_read_lock. */
+void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ const struct vport *p = OVS_CB(skb)->input_vport;
+ struct datapath *dp = p->dp;
+ struct sw_flow *flow;
+ struct sw_flow_actions *sf_acts;
+ struct dp_stats_percpu *stats;
+ u64 *stats_counter;
+ u32 n_mask_hit;
+
+ stats = this_cpu_ptr(dp->stats_percpu);
+
+ /* Look up flow. */
+ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
+ if (unlikely(!flow)) {
+ struct dp_upcall_info upcall;
+ int error;
+
+ upcall.cmd = OVS_PACKET_CMD_MISS;
+ upcall.userdata = NULL;
+ upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+ upcall.egress_tun_info = NULL;
+ error = ovs_dp_upcall(dp, skb, key, &upcall);
+ if (unlikely(error))
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ stats_counter = &stats->n_missed;
+ goto out;
+ }
+
+ ovs_flow_stats_update(flow, key->tp.flags, skb);
+ sf_acts = rcu_dereference(flow->sf_acts);
+ ovs_execute_actions(dp, skb, sf_acts, key);
+
+ stats_counter = &stats->n_hit;
+
+out:
+ /* Update datapath statistics. */
+ u64_stats_update_begin(&stats->syncp);
+ (*stats_counter)++;
+ stats->n_mask_hit += n_mask_hit;
+ u64_stats_update_end(&stats->syncp);
+}
+
+int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_key *key,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct dp_stats_percpu *stats;
+ int err;
+
+ if (upcall_info->portid == 0) {
+ err = -ENOTCONN;
+ goto err;
+ }
+
+ if (!skb_is_gso(skb))
+ err = queue_userspace_packet(dp, skb, key, upcall_info);
+ else
+ err = queue_gso_packets(dp, skb, key, upcall_info);
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ stats = this_cpu_ptr(dp->stats_percpu);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->n_lost++;
+ u64_stats_update_end(&stats->syncp);
+
+ return err;
+}
+
+static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_key *key,
+ const struct dp_upcall_info *upcall_info)
+{
+ unsigned short gso_type = skb_shinfo(skb)->gso_type;
+ struct sw_flow_key later_key;
+ struct sk_buff *segs, *nskb;
+ struct ovs_skb_cb ovs_cb;
+ int err;
+
+ ovs_cb = *OVS_CB(skb);
+ segs = __skb_gso_segment(skb, NETIF_F_SG, false);
+ *OVS_CB(skb) = ovs_cb;
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+ if (segs == NULL)
+ return -EINVAL;
+
+ if (gso_type & SKB_GSO_UDP) {
+ /* The initial flow key extracted by ovs_flow_key_extract()
+ * in this case is for a first fragment, so we need to
+ * properly mark later fragments.
+ */
+ later_key = *key;
+ later_key.ip.frag = OVS_FRAG_TYPE_LATER;
+ }
+
+ /* Queue all of the segments. */
+ skb = segs;
+ do {
+ *OVS_CB(skb) = ovs_cb;
+ if (gso_type & SKB_GSO_UDP && skb != segs)
+ key = &later_key;
+
+ err = queue_userspace_packet(dp, skb, key, upcall_info);
+ if (err)
+ break;
+
+ } while ((skb = skb->next));
+
+ /* Free all of the segments. */
+ skb = segs;
+ do {
+ nskb = skb->next;
+ if (err)
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
+ } while ((skb = nskb));
+ return err;
+}
+
+static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
+ unsigned int hdrlen)
+{
+ size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */
+
+ /* OVS_PACKET_ATTR_USERDATA */
+ if (upcall_info->userdata)
+ size += NLA_ALIGN(upcall_info->userdata->nla_len);
+
+ /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
+ if (upcall_info->egress_tun_info)
+ size += nla_total_size(ovs_tun_key_attr_size());
+
+ return size;
+}
+
+static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_key *key,
+ const struct dp_upcall_info *upcall_info)
+{
+ struct ovs_header *upcall;
+ struct sk_buff *nskb = NULL;
+ struct sk_buff *user_skb = NULL; /* to be queued to userspace */
+ struct nlattr *nla;
+ struct genl_info info = {
+ .dst_sk = ovs_dp_get_net(dp)->genl_sock,
+ .snd_portid = upcall_info->portid,
+ };
+ size_t len;
+ unsigned int hlen;
+ int err, dp_ifindex;
+
+ dp_ifindex = get_dpifindex(dp);
+ if (!dp_ifindex)
+ return -ENODEV;
+
+ if (skb_vlan_tag_present(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ nskb = __vlan_hwaccel_push_inside(nskb);
+ if (!nskb)
+ return -ENOMEM;
+
+ skb = nskb;
+ }
+
+ if (nla_attr_size(skb->len) > USHRT_MAX) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ /* Complete checksum if needed */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto out;
+
+ /* Older versions of OVS user space enforce alignment of the last
+ * Netlink attribute to NLA_ALIGNTO which would require extensive
+ * padding logic. Only perform zerocopy if padding is not required.
+ */
+ if (dp->user_features & OVS_DP_F_UNALIGNED)
+ hlen = skb_zerocopy_headlen(skb);
+ else
+ hlen = skb->len;
+
+ len = upcall_msg_size(upcall_info, hlen);
+ user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
+ if (!user_skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
+ 0, upcall_info->cmd);
+ upcall->dp_ifindex = dp_ifindex;
+
+ err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb);
+ BUG_ON(err);
+
+ if (upcall_info->userdata)
+ __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
+ nla_len(upcall_info->userdata),
+ nla_data(upcall_info->userdata));
+
+ if (upcall_info->egress_tun_info) {
+ nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
+ err = ovs_nla_put_egress_tunnel_key(user_skb,
+ upcall_info->egress_tun_info);
+ BUG_ON(err);
+ nla_nest_end(user_skb, nla);
+ }
+
+ /* Only reserve room for attribute header, packet data is added
+ * in skb_zerocopy() */
+ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ nla->nla_len = nla_attr_size(skb->len);
+
+ err = skb_zerocopy(user_skb, skb, skb->len, hlen);
+ if (err)
+ goto out;
+
+ /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
+ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
+ size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
+
+ if (plen > 0)
+ memset(skb_put(user_skb, plen), 0, plen);
+ }
+
+ ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
+
+ err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
+ user_skb = NULL;
+out:
+ if (err)
+ skb_tx_error(skb);
+ kfree_skb(user_skb);
+ kfree_skb(nskb);
+ return err;
+}
+
+static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ovs_header *ovs_header = info->userhdr;
+ struct nlattr **a = info->attrs;
+ struct sw_flow_actions *acts;
+ struct sk_buff *packet;
+ struct sw_flow *flow;
+ struct sw_flow_actions *sf_acts;
+ struct datapath *dp;
+ struct ethhdr *eth;
+ struct vport *input_vport;
+ int len;
+ int err;
+ bool log = !a[OVS_PACKET_ATTR_PROBE];
+
+ err = -EINVAL;
+ if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
+ !a[OVS_PACKET_ATTR_ACTIONS])
+ goto err;
+
+ len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
+ packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!packet)
+ goto err;
+ skb_reserve(packet, NET_IP_ALIGN);
+
+ nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
+
+ skb_reset_mac_header(packet);
+ eth = eth_hdr(packet);
+
+ /* Normally, setting the skb 'protocol' field would be handled by a
+ * call to eth_type_trans(), but it assumes there's a sending
+ * device, which we may not have. */
+ if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
+ packet->protocol = eth->h_proto;
+ else
+ packet->protocol = htons(ETH_P_802_2);
+
+ /* Build an sw_flow for sending this packet. */
+ flow = ovs_flow_alloc();
+ err = PTR_ERR(flow);
+ if (IS_ERR(flow))
+ goto err_kfree_skb;
+
+ err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet,
+ &flow->key, log);
+ if (err)
+ goto err_flow_free;
+
+ err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
+ &flow->key, &acts, log);
+ if (err)
+ goto err_flow_free;
+
+ rcu_assign_pointer(flow->sf_acts, acts);
+ OVS_CB(packet)->egress_tun_info = NULL;
+ packet->priority = flow->key.phy.priority;
+ packet->mark = flow->key.phy.skb_mark;
+
+ rcu_read_lock();
+ dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
+ err = -ENODEV;
+ if (!dp)
+ goto err_unlock;
+
+ input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port);
+ if (!input_vport)
+ input_vport = ovs_vport_rcu(dp, OVSP_LOCAL);
+
+ if (!input_vport)
+ goto err_unlock;
+
+ OVS_CB(packet)->input_vport = input_vport;
+ sf_acts = rcu_dereference(flow->sf_acts);
+
+ local_bh_disable();
+ err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
+ local_bh_enable();
+ rcu_read_unlock();
+
+ ovs_flow_free(flow, false);
+ return err;
+
+err_unlock:
+ rcu_read_unlock();
+err_flow_free:
+ ovs_flow_free(flow, false);
+err_kfree_skb:
+ kfree_skb(packet);
+err:
+ return err;
+}
+
+static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
+ [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
+ [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
+ [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
+ [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
+};
+
+static const struct genl_ops dp_packet_genl_ops[] = {
+ { .cmd = OVS_PACKET_CMD_EXECUTE,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = packet_policy,
+ .doit = ovs_packet_cmd_execute
+ }
+};
+
+static struct genl_family dp_packet_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_PACKET_FAMILY,
+ .version = OVS_PACKET_VERSION,
+ .maxattr = OVS_PACKET_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_packet_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+};
+
+static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
+ struct ovs_dp_megaflow_stats *mega_stats)
+{
+ int i;
+
+ memset(mega_stats, 0, sizeof(*mega_stats));
+
+ stats->n_flows = ovs_flow_tbl_count(&dp->table);
+ mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
+
+ stats->n_hit = stats->n_missed = stats->n_lost = 0;
+
+ for_each_possible_cpu(i) {
+ const struct dp_stats_percpu *percpu_stats;
+ struct dp_stats_percpu local_stats;
+ unsigned int start;
+
+ percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
+
+ do {
+ start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
+ local_stats = *percpu_stats;
+ } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
+
+ stats->n_hit += local_stats.n_hit;
+ stats->n_missed += local_stats.n_missed;
+ stats->n_lost += local_stats.n_lost;
+ mega_stats->n_mask_hit += local_stats.n_mask_hit;
+ }
+}
+
+static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
+{
+ return ovs_identifier_is_ufid(sfid) &&
+ !(ufid_flags & OVS_UFID_F_OMIT_KEY);
+}
+
+static bool should_fill_mask(uint32_t ufid_flags)
+{
+ return !(ufid_flags & OVS_UFID_F_OMIT_MASK);
+}
+
+static bool should_fill_actions(uint32_t ufid_flags)
+{
+ return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS);
+}
+
+static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
+ const struct sw_flow_id *sfid,
+ uint32_t ufid_flags)
+{
+ size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));
+
+ /* OVS_FLOW_ATTR_UFID */
+ if (sfid && ovs_identifier_is_ufid(sfid))
+ len += nla_total_size(sfid->ufid_len);
+
+ /* OVS_FLOW_ATTR_KEY */
+ if (!sfid || should_fill_key(sfid, ufid_flags))
+ len += nla_total_size(ovs_key_attr_size());
+
+ /* OVS_FLOW_ATTR_MASK */
+ if (should_fill_mask(ufid_flags))
+ len += nla_total_size(ovs_key_attr_size());
+
+ /* OVS_FLOW_ATTR_ACTIONS */
+ if (should_fill_actions(ufid_flags))
+ len += nla_total_size(acts->actions_len);
+
+ return len
+ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
+ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
+ + nla_total_size(8); /* OVS_FLOW_ATTR_USED */
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
+ struct sk_buff *skb)
+{
+ struct ovs_flow_stats stats;
+ __be16 tcp_flags;
+ unsigned long used;
+
+ ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
+
+ if (used &&
+ nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
+ return -EMSGSIZE;
+
+ if (stats.n_packets &&
+ nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats))
+ return -EMSGSIZE;
+
+ if ((u8)ntohs(tcp_flags) &&
+ nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags)))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
+ struct sk_buff *skb, int skb_orig_len)
+{
+ struct nlattr *start;
+ int err;
+
+ /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
+ * this is the first flow to be dumped into 'skb'. This is unusual for
+ * Netlink but individual action lists can be longer than
+ * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
+ * The userspace caller can always fetch the actions separately if it
+ * really wants them. (Most userspace callers in fact don't care.)
+ *
+ * This can only fail for dump operations because the skb is always
+ * properly sized for single flows.
+ */
+ start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
+ if (start) {
+ const struct sw_flow_actions *sf_acts;
+
+ sf_acts = rcu_dereference_ovsl(flow->sf_acts);
+ err = ovs_nla_put_actions(sf_acts->actions,
+ sf_acts->actions_len, skb);
+
+ if (!err)
+ nla_nest_end(skb, start);
+ else {
+ if (skb_orig_len)
+ return err;
+
+ nla_nest_cancel(skb, start);
+ }
+ } else if (skb_orig_len) {
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
+ struct sk_buff *skb, u32 portid,
+ u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
+{
+ const int skb_orig_len = skb->len;
+ struct ovs_header *ovs_header;
+ int err;
+
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family,
+ flags, cmd);
+ if (!ovs_header)
+ return -EMSGSIZE;
+
+ ovs_header->dp_ifindex = dp_ifindex;
+
+ err = ovs_nla_put_identifier(flow, skb);
+ if (err)
+ goto error;
+
+ if (should_fill_key(&flow->id, ufid_flags)) {
+ err = ovs_nla_put_masked_key(flow, skb);
+ if (err)
+ goto error;
+ }
+
+ if (should_fill_mask(ufid_flags)) {
+ err = ovs_nla_put_mask(flow, skb);
+ if (err)
+ goto error;
+ }
+
+ err = ovs_flow_cmd_fill_stats(flow, skb);
+ if (err)
+ goto error;
+
+ if (should_fill_actions(ufid_flags)) {
+ err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
+ if (err)
+ goto error;
+ }
+
+ genlmsg_end(skb, ovs_header);
+ return 0;
+
+error:
+ genlmsg_cancel(skb, ovs_header);
+ return err;
+}
+
+/* May not be called with RCU read lock. */
+static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts,
+ const struct sw_flow_id *sfid,
+ struct genl_info *info,
+ bool always,
+ uint32_t ufid_flags)
+{
+ struct sk_buff *skb;
+ size_t len;
+
+ if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0))
+ return NULL;
+
+ len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
+ skb = genlmsg_new_unicast(len, info, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ return skb;
+}
+
+/* Called with ovs_mutex. */
+static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
+ int dp_ifindex,
+ struct genl_info *info, u8 cmd,
+ bool always, u32 ufid_flags)
+{
+ struct sk_buff *skb;
+ int retval;
+
+ skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
+ &flow->id, info, always, ufid_flags);
+ if (IS_ERR_OR_NULL(skb))
+ return skb;
+
+ retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
+ info->snd_portid, info->snd_seq, 0,
+ cmd, ufid_flags);
+ BUG_ON(retval < 0);
+ return skb;
+}
+
+static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow *flow = NULL, *new_flow;
+ struct sw_flow_mask mask;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ struct sw_flow_key key;
+ struct sw_flow_actions *acts;
+ struct sw_flow_match match;
+ u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
+ int error;
+ bool log = !a[OVS_FLOW_ATTR_PROBE];
+
+ /* Must have key and actions. */
+ error = -EINVAL;
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR(log, "Flow key attr not present in new flow.");
+ goto error;
+ }
+ if (!a[OVS_FLOW_ATTR_ACTIONS]) {
+ OVS_NLERR(log, "Flow actions attr not present in new flow.");
+ goto error;
+ }
+
+ /* Most of the time we need to allocate a new flow, do it before
+ * locking.
+ */
+ new_flow = ovs_flow_alloc();
+ if (IS_ERR(new_flow)) {
+ error = PTR_ERR(new_flow);
+ goto error;
+ }
+
+ /* Extract key. */
+ ovs_match_init(&match, &key, &mask);
+ error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+ if (error)
+ goto err_kfree_flow;
+
+ ovs_flow_mask_key(&new_flow->key, &key, &mask);
+
+ /* Extract flow identifier. */
+ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
+ &key, log);
+ if (error)
+ goto err_kfree_flow;
+
+ /* Validate actions. */
+ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
+ &acts, log);
+ if (error) {
+ OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
+ goto err_kfree_flow;
+ }
+
+ reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false,
+ ufid_flags);
+ if (IS_ERR(reply)) {
+ error = PTR_ERR(reply);
+ goto err_kfree_acts;
+ }
+
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ error = -ENODEV;
+ goto err_unlock_ovs;
+ }
+
+ /* Check if this is a duplicate flow */
+ if (ovs_identifier_is_ufid(&new_flow->id))
+ flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
+ if (!flow)
+ flow = ovs_flow_tbl_lookup(&dp->table, &key);
+ if (likely(!flow)) {
+ rcu_assign_pointer(new_flow->sf_acts, acts);
+
+ /* Put flow in bucket. */
+ error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
+ if (unlikely(error)) {
+ acts = NULL;
+ goto err_unlock_ovs;
+ }
+
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(new_flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW,
+ ufid_flags);
+ BUG_ON(error < 0);
+ }
+ ovs_unlock();
+ } else {
+ struct sw_flow_actions *old_acts;
+
+ /* Bail out if we're not allowed to modify an existing flow.
+ * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
+ * because Generic Netlink treats the latter as a dump
+ * request. We also accept NLM_F_EXCL in case that bug ever
+ * gets fixed.
+ */
+ if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
+ | NLM_F_EXCL))) {
+ error = -EEXIST;
+ goto err_unlock_ovs;
+ }
+ /* The flow identifier has to be the same for flow updates.
+ * Look for any overlapping flow.
+ */
+ if (unlikely(!ovs_flow_cmp(flow, &match))) {
+ if (ovs_identifier_is_key(&flow->id))
+ flow = ovs_flow_tbl_lookup_exact(&dp->table,
+ &match);
+ else /* UFID matches but key is different */
+ flow = NULL;
+ if (!flow) {
+ error = -ENOENT;
+ goto err_unlock_ovs;
+ }
+ }
+ /* Update actions. */
+ old_acts = ovsl_dereference(flow->sf_acts);
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW,
+ ufid_flags);
+ BUG_ON(error < 0);
+ }
+ ovs_unlock();
+
+ ovs_nla_free_flow_actions(old_acts);
+ ovs_flow_free(new_flow, false);
+ }
+
+ if (reply)
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ return 0;
+
+err_unlock_ovs:
+ ovs_unlock();
+ kfree_skb(reply);
+err_kfree_acts:
+ kfree(acts);
+err_kfree_flow:
+ ovs_flow_free(new_flow, false);
+error:
+ return error;
+}
+
+/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
+static struct sw_flow_actions *get_flow_actions(const struct nlattr *a,
+ const struct sw_flow_key *key,
+ const struct sw_flow_mask *mask,
+ bool log)
+{
+ struct sw_flow_actions *acts;
+ struct sw_flow_key masked_key;
+ int error;
+
+ ovs_flow_mask_key(&masked_key, key, mask);
+ error = ovs_nla_copy_actions(a, &masked_key, &acts, log);
+ if (error) {
+ OVS_NLERR(log,
+ "Actions may not be safe on all matching packets");
+ return ERR_PTR(error);
+ }
+
+ return acts;
+}
+
+static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sw_flow *flow;
+ struct sw_flow_mask mask;
+ struct sk_buff *reply = NULL;
+ struct datapath *dp;
+ struct sw_flow_actions *old_acts = NULL, *acts = NULL;
+ struct sw_flow_match match;
+ struct sw_flow_id sfid;
+ u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
+ int error;
+ bool log = !a[OVS_FLOW_ATTR_PROBE];
+ bool ufid_present;
+
+ /* Extract key. */
+ error = -EINVAL;
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR(log, "Flow key attribute not present in set flow.");
+ goto error;
+ }
+
+ ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
+ ovs_match_init(&match, &key, &mask);
+ error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+ if (error)
+ goto error;
+
+ /* Validate actions. */
+ if (a[OVS_FLOW_ATTR_ACTIONS]) {
+ acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask,
+ log);
+ if (IS_ERR(acts)) {
+ error = PTR_ERR(acts);
+ goto error;
+ }
+
+ /* Can allocate before locking if have acts. */
+ reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
+ ufid_flags);
+ if (IS_ERR(reply)) {
+ error = PTR_ERR(reply);
+ goto err_kfree_acts;
+ }
+ }
+
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ error = -ENODEV;
+ goto err_unlock_ovs;
+ }
+ /* Check that the flow exists. */
+ if (ufid_present)
+ flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
+ else
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (unlikely(!flow)) {
+ error = -ENOENT;
+ goto err_unlock_ovs;
+ }
+
+ /* Update actions, if present. */
+ if (likely(acts)) {
+ old_acts = ovsl_dereference(flow->sf_acts);
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ if (unlikely(reply)) {
+ error = ovs_flow_cmd_fill_info(flow,
+ ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_NEW,
+ ufid_flags);
+ BUG_ON(error < 0);
+ }
+ } else {
+ /* Could not alloc without acts before locking. */
+ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
+ info, OVS_FLOW_CMD_NEW, false,
+ ufid_flags);
+
+ if (unlikely(IS_ERR(reply))) {
+ error = PTR_ERR(reply);
+ goto err_unlock_ovs;
+ }
+ }
+
+ /* Clear stats. */
+ if (a[OVS_FLOW_ATTR_CLEAR])
+ ovs_flow_stats_clear(flow);
+ ovs_unlock();
+
+ if (reply)
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ if (old_acts)
+ ovs_nla_free_flow_actions(old_acts);
+
+ return 0;
+
+err_unlock_ovs:
+ ovs_unlock();
+ kfree_skb(reply);
+err_kfree_acts:
+ kfree(acts);
+error:
+ return error;
+}
+
+static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sk_buff *reply;
+ struct sw_flow *flow;
+ struct datapath *dp;
+ struct sw_flow_match match;
+ struct sw_flow_id ufid;
+ u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
+ int err = 0;
+ bool log = !a[OVS_FLOW_ATTR_PROBE];
+ bool ufid_present;
+
+ ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
+ if (a[OVS_FLOW_ATTR_KEY]) {
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
+ log);
+ } else if (!ufid_present) {
+ OVS_NLERR(log,
+ "Flow get message rejected, Key attribute missing.");
+ err = -EINVAL;
+ }
+ if (err)
+ return err;
+
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ if (ufid_present)
+ flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
+ else
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (!flow) {
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
+ OVS_FLOW_CMD_NEW, true, ufid_flags);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ goto unlock;
+ }
+
+ ovs_unlock();
+ return genlmsg_reply(reply, info);
+unlock:
+ ovs_unlock();
+ return err;
+}
+
+static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sw_flow_key key;
+ struct sk_buff *reply;
+ struct sw_flow *flow = NULL;
+ struct datapath *dp;
+ struct sw_flow_match match;
+ struct sw_flow_id ufid;
+ u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
+ int err;
+ bool log = !a[OVS_FLOW_ATTR_PROBE];
+ bool ufid_present;
+
+ ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log);
+ if (a[OVS_FLOW_ATTR_KEY]) {
+ ovs_match_init(&match, &key, NULL);
+ err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL,
+ log);
+ if (unlikely(err))
+ return err;
+ }
+
+ ovs_lock();
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (unlikely(!dp)) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
+ err = ovs_flow_tbl_flush(&dp->table);
+ goto unlock;
+ }
+
+ if (ufid_present)
+ flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
+ else
+ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+ if (unlikely(!flow)) {
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ ovs_flow_tbl_remove(&dp->table, flow);
+ ovs_unlock();
+
+ reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
+ &flow->id, info, false, ufid_flags);
+ if (likely(reply)) {
+ if (likely(!IS_ERR(reply))) {
+ rcu_read_lock(); /*To keep RCU checker happy. */
+ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
+ reply, info->snd_portid,
+ info->snd_seq, 0,
+ OVS_FLOW_CMD_DEL,
+ ufid_flags);
+ rcu_read_unlock();
+ BUG_ON(err < 0);
+
+ ovs_notify(&dp_flow_genl_family, reply, info);
+ } else {
+ netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply));
+ }
+ }
+
+ ovs_flow_free(flow, true);
+ return 0;
+unlock:
+ ovs_unlock();
+ return err;
+}
+
+static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *a[__OVS_FLOW_ATTR_MAX];
+ struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct table_instance *ti;
+ struct datapath *dp;
+ u32 ufid_flags;
+ int err;
+
+ err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
+ OVS_FLOW_ATTR_MAX, flow_policy);
+ if (err)
+ return err;
+ ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
+
+ rcu_read_lock();
+ dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+
+ ti = rcu_dereference(dp->table.ti);
+ for (;;) {
+ struct sw_flow *flow;
+ u32 bucket, obj;
+
+ bucket = cb->args[0];
+ obj = cb->args[1];
+ flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj);
+ if (!flow)
+ break;
+
+ if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ OVS_FLOW_CMD_NEW, ufid_flags) < 0)
+ break;
+
+ cb->args[0] = bucket;
+ cb->args[1] = obj;
+ }
+ rcu_read_unlock();
+ return skb->len;
+}
+
+static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
+ [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
+ [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
+ [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG },
+ [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 },
+ [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops dp_flow_genl_ops[] = {
+ { .cmd = OVS_FLOW_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_new
+ },
+ { .cmd = OVS_FLOW_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_del
+ },
+ { .cmd = OVS_FLOW_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_get,
+ .dumpit = ovs_flow_cmd_dump
+ },
+ { .cmd = OVS_FLOW_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = flow_policy,
+ .doit = ovs_flow_cmd_set,
+ },
+};
+
+static struct genl_family dp_flow_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_FLOW_FAMILY,
+ .version = OVS_FLOW_VERSION,
+ .maxattr = OVS_FLOW_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_flow_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
+ .mcgrps = &ovs_dp_flow_multicast_group,
+ .n_mcgrps = 1,
+};
+
+static size_t ovs_dp_cmd_msg_size(void)
+{
+ size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
+
+ msgsize += nla_total_size(IFNAMSIZ);
+ msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
+ msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
+
+ return msgsize;
+}
+
+/* Called with ovs_mutex. */
+static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
+ u32 portid, u32 seq, u32 flags, u8 cmd)
+{
+ struct ovs_header *ovs_header;
+ struct ovs_dp_stats dp_stats;
+ struct ovs_dp_megaflow_stats dp_megaflow_stats;
+ int err;
+
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
+ flags, cmd);
+ if (!ovs_header)
+ goto error;
+
+ ovs_header->dp_ifindex = get_dpifindex(dp);
+
+ err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
+ if (err)
+ goto nla_put_failure;
+
+ get_dp_stats(dp, &dp_stats, &dp_megaflow_stats);
+ if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats),
+ &dp_stats))
+ goto nla_put_failure;
+
+ if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS,
+ sizeof(struct ovs_dp_megaflow_stats),
+ &dp_megaflow_stats))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features))
+ goto nla_put_failure;
+
+ genlmsg_end(skb, ovs_header);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, ovs_header);
+error:
+ return -EMSGSIZE;
+}
+
+static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
+{
+ return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
+}
+
+/* Called with rcu_read_lock or ovs_mutex. */
+static struct datapath *lookup_datapath(struct net *net,
+ const struct ovs_header *ovs_header,
+ struct nlattr *a[OVS_DP_ATTR_MAX + 1])
+{
+ struct datapath *dp;
+
+ if (!a[OVS_DP_ATTR_NAME])
+ dp = get_dp(net, ovs_header->dp_ifindex);
+ else {
+ struct vport *vport;
+
+ vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
+ dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
+ }
+ return dp ? dp : ERR_PTR(-ENODEV);
+}
+
+static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info)
+{
+ struct datapath *dp;
+
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ if (IS_ERR(dp))
+ return;
+
+ WARN(dp->user_features, "Dropping previously announced user features\n");
+ dp->user_features = 0;
+}
+
+static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
+{
+ if (a[OVS_DP_ATTR_USER_FEATURES])
+ dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+}
+
+static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct vport_parms parms;
+ struct sk_buff *reply;
+ struct datapath *dp;
+ struct vport *vport;
+ struct ovs_net *ovs_net;
+ int err, i;
+
+ err = -EINVAL;
+ if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
+ goto err;
+
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ err = -ENOMEM;
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (dp == NULL)
+ goto err_free_reply;
+
+ ovs_dp_set_net(dp, sock_net(skb->sk));
+
+ /* Allocate table. */
+ err = ovs_flow_tbl_init(&dp->table);
+ if (err)
+ goto err_free_dp;
+
+ dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
+ if (!dp->stats_percpu) {
+ err = -ENOMEM;
+ goto err_destroy_table;
+ }
+
+ dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!dp->ports) {
+ err = -ENOMEM;
+ goto err_destroy_percpu;
+ }
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(&dp->ports[i]);
+
+ /* Set up our datapath device. */
+ parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
+ parms.type = OVS_VPORT_TYPE_INTERNAL;
+ parms.options = NULL;
+ parms.dp = dp;
+ parms.port_no = OVSP_LOCAL;
+ parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
+
+ ovs_dp_change(dp, a);
+
+ /* So far only local changes have been made, now need the lock. */
+ ovs_lock();
+
+ vport = new_vport(&parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ if (err == -EBUSY)
+ err = -EEXIST;
+
+ if (err == -EEXIST) {
+ /* An outdated user space instance that does not understand
+ * the concept of user_features has attempted to create a new
+ * datapath and is likely to reuse it. Drop all user features.
+ */
+ if (info->genlhdr->version < OVS_DP_VER_FEATURES)
+ ovs_dp_reset_user_features(skb, info);
+ }
+
+ goto err_destroy_ports_array;
+ }
+
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
+
+ ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
+ list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
+
+ ovs_unlock();
+
+ ovs_notify(&dp_datapath_genl_family, reply, info);
+ return 0;
+
+err_destroy_ports_array:
+ ovs_unlock();
+ kfree(dp->ports);
+err_destroy_percpu:
+ free_percpu(dp->stats_percpu);
+err_destroy_table:
+ ovs_flow_tbl_destroy(&dp->table);
+err_free_dp:
+ kfree(dp);
+err_free_reply:
+ kfree_skb(reply);
+err:
+ return err;
+}
+
+/* Called with ovs_mutex. */
+static void __dp_destroy(struct datapath *dp)
+{
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
+ if (vport->port_no != OVSP_LOCAL)
+ ovs_dp_detach_port(vport);
+ }
+
+ list_del_rcu(&dp->list_node);
+
+ /* OVSP_LOCAL is datapath internal port. We need to make sure that
+ * all ports in datapath are destroyed first before freeing datapath.
+ */
+ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
+
+ /* RCU destroy the flow table */
+ call_rcu(&dp->rcu, destroy_dp_rcu);
+}
+
+static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err;
+
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ err = PTR_ERR(dp);
+ if (IS_ERR(dp))
+ goto err_unlock_free;
+
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_DEL);
+ BUG_ON(err < 0);
+
+ __dp_destroy(dp);
+ ovs_unlock();
+
+ ovs_notify(&dp_datapath_genl_family, reply, info);
+
+ return 0;
+
+err_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err;
+
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ err = PTR_ERR(dp);
+ if (IS_ERR(dp))
+ goto err_unlock_free;
+
+ ovs_dp_change(dp, info->attrs);
+
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
+
+ ovs_unlock();
+ ovs_notify(&dp_datapath_genl_family, reply, info);
+
+ return 0;
+
+err_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *reply;
+ struct datapath *dp;
+ int err;
+
+ reply = ovs_dp_cmd_alloc_info(info);
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
+ if (IS_ERR(dp)) {
+ err = PTR_ERR(dp);
+ goto err_unlock_free;
+ }
+ err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_DP_CMD_NEW);
+ BUG_ON(err < 0);
+ ovs_unlock();
+
+ return genlmsg_reply(reply, info);
+
+err_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+ struct datapath *dp;
+ int skip = cb->args[0];
+ int i = 0;
+
+ ovs_lock();
+ list_for_each_entry(dp, &ovs_net->dps, list_node) {
+ if (i >= skip &&
+ ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ OVS_DP_CMD_NEW) < 0)
+ break;
+ i++;
+ }
+ ovs_unlock();
+
+ cb->args[0] = i;
+
+ return skb->len;
+}
+
+static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
+ [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops dp_datapath_genl_ops[] = {
+ { .cmd = OVS_DP_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_new
+ },
+ { .cmd = OVS_DP_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_del
+ },
+ { .cmd = OVS_DP_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_get,
+ .dumpit = ovs_dp_cmd_dump
+ },
+ { .cmd = OVS_DP_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = datapath_policy,
+ .doit = ovs_dp_cmd_set,
+ },
+};
+
+static struct genl_family dp_datapath_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_DATAPATH_FAMILY,
+ .version = OVS_DATAPATH_VERSION,
+ .maxattr = OVS_DP_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_datapath_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
+ .mcgrps = &ovs_dp_datapath_multicast_group,
+ .n_mcgrps = 1,
+};
+
+/* Called with ovs_mutex or RCU read lock. */
+static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+ u32 portid, u32 seq, u32 flags, u8 cmd)
+{
+ struct ovs_header *ovs_header;
+ struct ovs_vport_stats vport_stats;
+ int err;
+
+ ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
+ flags, cmd);
+ if (!ovs_header)
+ return -EMSGSIZE;
+
+ ovs_header->dp_ifindex = get_dpifindex(vport->dp);
+
+ if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
+ nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
+ nla_put_string(skb, OVS_VPORT_ATTR_NAME,
+ vport->ops->get_name(vport)))
+ goto nla_put_failure;
+
+ ovs_vport_get_stats(vport, &vport_stats);
+ if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
+ &vport_stats))
+ goto nla_put_failure;
+
+ if (ovs_vport_get_upcall_portids(vport, skb))
+ goto nla_put_failure;
+
+ err = ovs_vport_get_options(vport, skb);
+ if (err == -EMSGSIZE)
+ goto error;
+
+ genlmsg_end(skb, ovs_header);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+error:
+ genlmsg_cancel(skb, ovs_header);
+ return err;
+}
+
+static struct sk_buff *ovs_vport_cmd_alloc_info(void)
+{
+ return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+}
+
+/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
+ u32 seq, u8 cmd)
+{
+ struct sk_buff *skb;
+ int retval;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
+ BUG_ON(retval < 0);
+
+ return skb;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+static struct vport *lookup_vport(struct net *net,
+ const struct ovs_header *ovs_header,
+ struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
+{
+ struct datapath *dp;
+ struct vport *vport;
+
+ if (a[OVS_VPORT_ATTR_NAME]) {
+ vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
+ if (!vport)
+ return ERR_PTR(-ENODEV);
+ if (ovs_header->dp_ifindex &&
+ ovs_header->dp_ifindex != get_dpifindex(vport->dp))
+ return ERR_PTR(-ENODEV);
+ return vport;
+ } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
+ u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+
+ if (port_no >= DP_MAX_PORTS)
+ return ERR_PTR(-EFBIG);
+
+ dp = get_dp(net, ovs_header->dp_ifindex);
+ if (!dp)
+ return ERR_PTR(-ENODEV);
+
+ vport = ovs_vport_ovsl_rcu(dp, port_no);
+ if (!vport)
+ return ERR_PTR(-ENODEV);
+ return vport;
+ } else
+ return ERR_PTR(-EINVAL);
+}
+
+static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct vport_parms parms;
+ struct sk_buff *reply;
+ struct vport *vport;
+ struct datapath *dp;
+ u32 port_no;
+ int err;
+
+ if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
+ !a[OVS_VPORT_ATTR_UPCALL_PID])
+ return -EINVAL;
+
+ port_no = a[OVS_VPORT_ATTR_PORT_NO]
+ ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0;
+ if (port_no >= DP_MAX_PORTS)
+ return -EFBIG;
+
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+restart:
+ dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
+ err = -ENODEV;
+ if (!dp)
+ goto exit_unlock_free;
+
+ if (port_no) {
+ vport = ovs_vport_ovsl(dp, port_no);
+ err = -EBUSY;
+ if (vport)
+ goto exit_unlock_free;
+ } else {
+ for (port_no = 1; ; port_no++) {
+ if (port_no >= DP_MAX_PORTS) {
+ err = -EFBIG;
+ goto exit_unlock_free;
+ }
+ vport = ovs_vport_ovsl(dp, port_no);
+ if (!vport)
+ break;
+ }
+ }
+
+ parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
+ parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
+ parms.options = a[OVS_VPORT_ATTR_OPTIONS];
+ parms.dp = dp;
+ parms.port_no = port_no;
+ parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
+
+ vport = new_vport(&parms);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport)) {
+ if (err == -EAGAIN)
+ goto restart;
+ goto exit_unlock_free;
+ }
+
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
+ ovs_unlock();
+
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
+
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock_free;
+
+ if (a[OVS_VPORT_ATTR_TYPE] &&
+ nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
+ err = -EINVAL;
+ goto exit_unlock_free;
+ }
+
+ if (a[OVS_VPORT_ATTR_OPTIONS]) {
+ err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
+ if (err)
+ goto exit_unlock_free;
+ }
+
+
+ if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
+ struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];
+
+ err = ovs_vport_set_upcall_portids(vport, ids);
+ if (err)
+ goto exit_unlock_free;
+ }
+
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
+
+ ovs_unlock();
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
+
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ ovs_lock();
+ vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock_free;
+
+ if (vport->port_no == OVSP_LOCAL) {
+ err = -EINVAL;
+ goto exit_unlock_free;
+ }
+
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_DEL);
+ BUG_ON(err < 0);
+ ovs_dp_detach_port(vport);
+ ovs_unlock();
+
+ ovs_notify(&dp_vport_genl_family, reply, info);
+ return 0;
+
+exit_unlock_free:
+ ovs_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr **a = info->attrs;
+ struct ovs_header *ovs_header = info->userhdr;
+ struct sk_buff *reply;
+ struct vport *vport;
+ int err;
+
+ reply = ovs_vport_cmd_alloc_info();
+ if (!reply)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
+ err = PTR_ERR(vport);
+ if (IS_ERR(vport))
+ goto exit_unlock_free;
+ err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
+ info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+ BUG_ON(err < 0);
+ rcu_read_unlock();
+
+ return genlmsg_reply(reply, info);
+
+exit_unlock_free:
+ rcu_read_unlock();
+ kfree_skb(reply);
+ return err;
+}
+
+static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+ struct datapath *dp;
+ int bucket = cb->args[0], skip = cb->args[1];
+ int i, j = 0;
+
+ rcu_read_lock();
+ dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex);
+ if (!dp) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+
+ j = 0;
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ if (j >= skip &&
+ ovs_vport_cmd_fill_info(vport, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ OVS_VPORT_CMD_NEW) < 0)
+ goto out;
+
+ j++;
+ }
+ skip = 0;
+ }
+out:
+ rcu_read_unlock();
+
+ cb->args[0] = i;
+ cb->args[1] = j;
+
+ return skb->len;
+}
+
+static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
+ [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+ [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
+ [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
+};
+
+static const struct genl_ops dp_vport_genl_ops[] = {
+ { .cmd = OVS_VPORT_CMD_NEW,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_new
+ },
+ { .cmd = OVS_VPORT_CMD_DEL,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_del
+ },
+ { .cmd = OVS_VPORT_CMD_GET,
+ .flags = 0, /* OK for unprivileged users. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_get,
+ .dumpit = ovs_vport_cmd_dump
+ },
+ { .cmd = OVS_VPORT_CMD_SET,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = vport_policy,
+ .doit = ovs_vport_cmd_set,
+ },
+};
+
+struct genl_family dp_vport_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = sizeof(struct ovs_header),
+ .name = OVS_VPORT_FAMILY,
+ .version = OVS_VPORT_VERSION,
+ .maxattr = OVS_VPORT_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = dp_vport_genl_ops,
+ .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
+ .mcgrps = &ovs_dp_vport_multicast_group,
+ .n_mcgrps = 1,
+};
+
+static struct genl_family * const dp_genl_families[] = {
+ &dp_datapath_genl_family,
+ &dp_vport_genl_family,
+ &dp_flow_genl_family,
+ &dp_packet_genl_family,
+};
+
+static void dp_unregister_genl(int n_families)
+{
+ int i;
+
+ for (i = 0; i < n_families; i++)
+ genl_unregister_family(dp_genl_families[i]);
+}
+
+static int dp_register_genl(void)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
+
+ err = genl_register_family(dp_genl_families[i]);
+ if (err)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ dp_unregister_genl(i);
+ return err;
+}
+
+static int __net_init ovs_init_net(struct net *net)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ INIT_LIST_HEAD(&ovs_net->dps);
+ INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
+ return 0;
+}
+
+static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
+ struct list_head *head)
+{
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ struct datapath *dp;
+
+ list_for_each_entry(dp, &ovs_net->dps, list_node) {
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+
+ hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
+ struct netdev_vport *netdev_vport;
+
+ if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
+ continue;
+
+ netdev_vport = netdev_vport_priv(vport);
+ if (dev_net(netdev_vport->dev) == dnet)
+ list_add(&vport->detach_list, head);
+ }
+ }
+ }
+}
+
+static void __net_exit ovs_exit_net(struct net *dnet)
+{
+ struct datapath *dp, *dp_next;
+ struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id);
+ struct vport *vport, *vport_next;
+ struct net *net;
+ LIST_HEAD(head);
+
+ ovs_lock();
+ list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
+ __dp_destroy(dp);
+
+ rtnl_lock();
+ for_each_net(net)
+ list_vports_from_net(net, dnet, &head);
+ rtnl_unlock();
+
+ /* Detach all vports from given namespace. */
+ list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
+ list_del(&vport->detach_list);
+ ovs_dp_detach_port(vport);
+ }
+
+ ovs_unlock();
+
+ cancel_work_sync(&ovs_net->dp_notify_work);
+}
+
+static struct pernet_operations ovs_net_ops = {
+ .init = ovs_init_net,
+ .exit = ovs_exit_net,
+ .id = &ovs_net_id,
+ .size = sizeof(struct ovs_net),
+};
+
+static int __init dp_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
+
+ pr_info("Open vSwitch switching datapath\n");
+
+ err = action_fifos_init();
+ if (err)
+ goto error;
+
+ err = ovs_internal_dev_rtnl_link_register();
+ if (err)
+ goto error_action_fifos_exit;
+
+ err = ovs_flow_init();
+ if (err)
+ goto error_unreg_rtnl_link;
+
+ err = ovs_vport_init();
+ if (err)
+ goto error_flow_exit;
+
+ err = register_pernet_device(&ovs_net_ops);
+ if (err)
+ goto error_vport_exit;
+
+ err = register_netdevice_notifier(&ovs_dp_device_notifier);
+ if (err)
+ goto error_netns_exit;
+
+ err = ovs_netdev_init();
+ if (err)
+ goto error_unreg_notifier;
+
+ err = dp_register_genl();
+ if (err < 0)
+ goto error_unreg_netdev;
+
+ return 0;
+
+error_unreg_netdev:
+ ovs_netdev_exit();
+error_unreg_notifier:
+ unregister_netdevice_notifier(&ovs_dp_device_notifier);
+error_netns_exit:
+ unregister_pernet_device(&ovs_net_ops);
+error_vport_exit:
+ ovs_vport_exit();
+error_flow_exit:
+ ovs_flow_exit();
+error_unreg_rtnl_link:
+ ovs_internal_dev_rtnl_link_unregister();
+error_action_fifos_exit:
+ action_fifos_exit();
+error:
+ return err;
+}
+
+static void dp_cleanup(void)
+{
+ dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
+ ovs_netdev_exit();
+ unregister_netdevice_notifier(&ovs_dp_device_notifier);
+ unregister_pernet_device(&ovs_net_ops);
+ rcu_barrier();
+ ovs_vport_exit();
+ ovs_flow_exit();
+ ovs_internal_dev_rtnl_link_unregister();
+ action_fifos_exit();
+}
+
+module_init(dp_init);
+module_exit(dp_cleanup);
+
+MODULE_DESCRIPTION("Open vSwitch switching datapath");
+MODULE_LICENSE("GPL");
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
new file mode 100644
index 000000000..4ec4a480b
--- /dev/null
+++ b/net/openvswitch/datapath.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef DATAPATH_H
+#define DATAPATH_H 1
+
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/u64_stats_sync.h>
+
+#include "flow.h"
+#include "flow_table.h"
+#include "vport.h"
+
+#define DP_MAX_PORTS USHRT_MAX
+#define DP_VPORT_HASH_BUCKETS 1024
+
+#define SAMPLE_ACTION_DEPTH 3
+
+/**
+ * struct dp_stats_percpu - per-cpu packet processing statistics for a given
+ * datapath.
+ * @n_hit: Number of received packets for which a matching flow was found in
+ * the flow table.
+ * @n_miss: Number of received packets that had no matching flow in the flow
+ * table. The sum of @n_hit and @n_miss is the number of packets that have
+ * been received by the datapath.
+ * @n_lost: Number of received packets that had no matching flow in the flow
+ * table that could not be sent to userspace (normally due to an overflow in
+ * one of the datapath's queues).
+ * @n_mask_hit: Number of masks looked up for flow match.
+ * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked
+ * up per packet.
+ */
+struct dp_stats_percpu {
+ u64 n_hit;
+ u64 n_missed;
+ u64 n_lost;
+ u64 n_mask_hit;
+ struct u64_stats_sync syncp;
+};
+
+/**
+ * struct datapath - datapath for flow-based packet switching
+ * @rcu: RCU callback head for deferred destruction.
+ * @list_node: Element in global 'dps' list.
+ * @table: flow table.
+ * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
+ * ovs_mutex and RCU.
+ * @stats_percpu: Per-CPU datapath statistics.
+ * @net: Reference to net namespace.
+ *
+ * Context: See the comment on locking at the top of datapath.c for additional
+ * locking information.
+ */
+struct datapath {
+ struct rcu_head rcu;
+ struct list_head list_node;
+
+ /* Flow table. */
+ struct flow_table table;
+
+ /* Switch ports. */
+ struct hlist_head *ports;
+
+ /* Stats. */
+ struct dp_stats_percpu __percpu *stats_percpu;
+
+ /* Network namespace ref. */
+ possible_net_t net;
+
+ u32 user_features;
+};
+
+/**
+ * struct ovs_skb_cb - OVS data in skb CB
+ * @egress_tun_key: Tunnel information about this packet on egress path.
+ * NULL if the packet is not being tunneled.
+ * @input_vport: The original vport packet came in on. This value is cached
+ * when a packet is received by OVS.
+ */
+struct ovs_skb_cb {
+ struct ovs_tunnel_info *egress_tun_info;
+ struct vport *input_vport;
+};
+#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
+
+/**
+ * struct dp_upcall - metadata to include with a packet to send to userspace
+ * @cmd: One of %OVS_PACKET_CMD_*.
+ * @userdata: If nonnull, its variable-length value is passed to userspace as
+ * %OVS_PACKET_ATTR_USERDATA.
+ * @portid: Netlink portid to which packet should be sent. If @portid is 0
+ * then no packet is sent and the packet is accounted in the datapath's @n_lost
+ * counter.
+ * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
+ */
+struct dp_upcall_info {
+ const struct ovs_tunnel_info *egress_tun_info;
+ const struct nlattr *userdata;
+ u32 portid;
+ u8 cmd;
+};
+
+/**
+ * struct ovs_net - Per net-namespace data for ovs.
+ * @dps: List of datapaths to enable dumping them all out.
+ * Protected by genl_mutex.
+ */
+struct ovs_net {
+ struct list_head dps;
+ struct work_struct dp_notify_work;
+ struct vport_net vport_net;
+};
+
+extern int ovs_net_id;
+void ovs_lock(void);
+void ovs_unlock(void);
+
+#ifdef CONFIG_LOCKDEP
+int lockdep_ovsl_is_held(void);
+#else
+#define lockdep_ovsl_is_held() 1
+#endif
+
+#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held())
+#define ovsl_dereference(p) \
+ rcu_dereference_protected(p, lockdep_ovsl_is_held())
+#define rcu_dereference_ovsl(p) \
+ rcu_dereference_check(p, lockdep_ovsl_is_held())
+
+static inline struct net *ovs_dp_get_net(const struct datapath *dp)
+{
+ return read_pnet(&dp->net);
+}
+
+static inline void ovs_dp_set_net(struct datapath *dp, struct net *net)
+{
+ write_pnet(&dp->net, net);
+}
+
+struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no);
+
+static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return ovs_lookup_vport(dp, port_no);
+}
+
+static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held());
+ return ovs_lookup_vport(dp, port_no);
+}
+
+static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no)
+{
+ ASSERT_OVSL();
+ return ovs_lookup_vport(dp, port_no);
+}
+
+extern struct notifier_block ovs_dp_device_notifier;
+extern struct genl_family dp_vport_genl_family;
+
+void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
+void ovs_dp_detach_port(struct vport *);
+int ovs_dp_upcall(struct datapath *, struct sk_buff *,
+ const struct sw_flow_key *, const struct dp_upcall_info *);
+
+const char *ovs_dp_name(const struct datapath *dp);
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
+ u8 cmd);
+
+int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
+ const struct sw_flow_actions *, struct sw_flow_key *);
+
+void ovs_dp_notify_wq(struct work_struct *work);
+
+int action_fifos_init(void);
+void action_fifos_exit(void);
+
+#define OVS_NLERR(logging_allowed, fmt, ...) \
+do { \
+ if (logging_allowed && net_ratelimit()) \
+ pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \
+} while (0)
+#endif /* datapath.h */
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
new file mode 100644
index 000000000..2c631fe76
--- /dev/null
+++ b/net/openvswitch/dp_notify.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2007-2012 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/netdevice.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+static void dp_detach_port_notify(struct vport *vport)
+{
+ struct sk_buff *notify;
+ struct datapath *dp;
+
+ dp = vport->dp;
+ notify = ovs_vport_cmd_build_info(vport, 0, 0,
+ OVS_VPORT_CMD_DEL);
+ ovs_dp_detach_port(vport);
+ if (IS_ERR(notify)) {
+ genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0,
+ 0, PTR_ERR(notify));
+ return;
+ }
+
+ genlmsg_multicast_netns(&dp_vport_genl_family,
+ ovs_dp_get_net(dp), notify, 0,
+ 0, GFP_KERNEL);
+}
+
+void ovs_dp_notify_wq(struct work_struct *work)
+{
+ struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work);
+ struct datapath *dp;
+
+ ovs_lock();
+ list_for_each_entry(dp, &ovs_net->dps, list_node) {
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ struct vport *vport;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
+ struct netdev_vport *netdev_vport;
+
+ if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
+ continue;
+
+ netdev_vport = netdev_vport_priv(vport);
+ if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH))
+ dp_detach_port_notify(vport);
+ }
+ }
+ }
+ ovs_unlock();
+}
+
+static int dp_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct ovs_net *ovs_net;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct vport *vport = NULL;
+
+ if (!ovs_is_internal_dev(dev))
+ vport = ovs_netdev_get_vport(dev);
+
+ if (!vport)
+ return NOTIFY_DONE;
+
+ if (event == NETDEV_UNREGISTER) {
+ /* upper_dev_unlink and decrement promisc immediately */
+ ovs_netdev_detach_dev(vport);
+
+ /* schedule vport destroy, dev_put and genl notification */
+ ovs_net = net_generic(dev_net(dev), ovs_net_id);
+ queue_work(system_wq, &ovs_net->dp_notify_work);
+ }
+
+ return NOTIFY_DONE;
+}
+
+struct notifier_block ovs_dp_device_notifier = {
+ .notifier_call = dp_device_event
+};
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
new file mode 100644
index 000000000..2dacc7b5a
--- /dev/null
+++ b/net/openvswitch/flow.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/mpls.h>
+#include <linux/sctp.h>
+#include <linux/smp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/ipv6.h>
+#include <net/mpls.h>
+#include <net/ndisc.h>
+
+#include "datapath.h"
+#include "flow.h"
+#include "flow_netlink.h"
+
+u64 ovs_flow_used_time(unsigned long flow_jiffies)
+{
+ struct timespec cur_ts;
+ u64 cur_ms, idle_ms;
+
+ ktime_get_ts(&cur_ts);
+ idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
+ cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
+ cur_ts.tv_nsec / NSEC_PER_MSEC;
+
+ return cur_ms - idle_ms;
+}
+
+#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
+
+void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
+ const struct sk_buff *skb)
+{
+ struct flow_stats *stats;
+ int node = numa_node_id();
+ int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+
+ stats = rcu_dereference(flow->stats[node]);
+
+ /* Check if already have node-specific stats. */
+ if (likely(stats)) {
+ spin_lock(&stats->lock);
+ /* Mark if we write on the pre-allocated stats. */
+ if (node == 0 && unlikely(flow->stats_last_writer != node))
+ flow->stats_last_writer = node;
+ } else {
+ stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
+ spin_lock(&stats->lock);
+
+ /* If the current NUMA-node is the only writer on the
+ * pre-allocated stats keep using them.
+ */
+ if (unlikely(flow->stats_last_writer != node)) {
+ /* A previous locker may have already allocated the
+ * stats, so we need to check again. If node-specific
+ * stats were already allocated, we update the pre-
+ * allocated stats as we have already locked them.
+ */
+ if (likely(flow->stats_last_writer != NUMA_NO_NODE)
+ && likely(!rcu_access_pointer(flow->stats[node]))) {
+ /* Try to allocate node-specific stats. */
+ struct flow_stats *new_stats;
+
+ new_stats =
+ kmem_cache_alloc_node(flow_stats_cache,
+ GFP_NOWAIT |
+ __GFP_THISNODE |
+ __GFP_NOWARN |
+ __GFP_NOMEMALLOC,
+ node);
+ if (likely(new_stats)) {
+ new_stats->used = jiffies;
+ new_stats->packet_count = 1;
+ new_stats->byte_count = len;
+ new_stats->tcp_flags = tcp_flags;
+ spin_lock_init(&new_stats->lock);
+
+ rcu_assign_pointer(flow->stats[node],
+ new_stats);
+ goto unlock;
+ }
+ }
+ flow->stats_last_writer = node;
+ }
+ }
+
+ stats->used = jiffies;
+ stats->packet_count++;
+ stats->byte_count += len;
+ stats->tcp_flags |= tcp_flags;
+unlock:
+ spin_unlock(&stats->lock);
+}
+
+/* Must be called with rcu_read_lock or ovs_mutex. */
+void ovs_flow_stats_get(const struct sw_flow *flow,
+ struct ovs_flow_stats *ovs_stats,
+ unsigned long *used, __be16 *tcp_flags)
+{
+ int node;
+
+ *used = 0;
+ *tcp_flags = 0;
+ memset(ovs_stats, 0, sizeof(*ovs_stats));
+
+ for_each_node(node) {
+ struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]);
+
+ if (stats) {
+ /* Local CPU may write on non-local stats, so we must
+ * block bottom-halves here.
+ */
+ spin_lock_bh(&stats->lock);
+ if (!*used || time_after(stats->used, *used))
+ *used = stats->used;
+ *tcp_flags |= stats->tcp_flags;
+ ovs_stats->n_packets += stats->packet_count;
+ ovs_stats->n_bytes += stats->byte_count;
+ spin_unlock_bh(&stats->lock);
+ }
+ }
+}
+
+/* Called with ovs_mutex. */
+void ovs_flow_stats_clear(struct sw_flow *flow)
+{
+ int node;
+
+ for_each_node(node) {
+ struct flow_stats *stats = ovsl_dereference(flow->stats[node]);
+
+ if (stats) {
+ spin_lock_bh(&stats->lock);
+ stats->used = 0;
+ stats->packet_count = 0;
+ stats->byte_count = 0;
+ stats->tcp_flags = 0;
+ spin_unlock_bh(&stats->lock);
+ }
+ }
+}
+
+static int check_header(struct sk_buff *skb, int len)
+{
+ if (unlikely(skb->len < len))
+ return -EINVAL;
+ if (unlikely(!pskb_may_pull(skb, len)))
+ return -ENOMEM;
+ return 0;
+}
+
+static bool arphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_network_offset(skb) +
+ sizeof(struct arp_eth_header));
+}
+
+static int check_iphdr(struct sk_buff *skb)
+{
+ unsigned int nh_ofs = skb_network_offset(skb);
+ unsigned int ip_len;
+ int err;
+
+ err = check_header(skb, nh_ofs + sizeof(struct iphdr));
+ if (unlikely(err))
+ return err;
+
+ ip_len = ip_hdrlen(skb);
+ if (unlikely(ip_len < sizeof(struct iphdr) ||
+ skb->len < nh_ofs + ip_len))
+ return -EINVAL;
+
+ skb_set_transport_header(skb, nh_ofs + ip_len);
+ return 0;
+}
+
+static bool tcphdr_ok(struct sk_buff *skb)
+{
+ int th_ofs = skb_transport_offset(skb);
+ int tcp_len;
+
+ if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
+ return false;
+
+ tcp_len = tcp_hdrlen(skb);
+ if (unlikely(tcp_len < sizeof(struct tcphdr) ||
+ skb->len < th_ofs + tcp_len))
+ return false;
+
+ return true;
+}
+
+static bool udphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct udphdr));
+}
+
+static bool sctphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct sctphdr));
+}
+
+static bool icmphdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct icmphdr));
+}
+
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ unsigned int nh_ofs = skb_network_offset(skb);
+ unsigned int nh_len;
+ int payload_ofs;
+ struct ipv6hdr *nh;
+ uint8_t nexthdr;
+ __be16 frag_off;
+ int err;
+
+ err = check_header(skb, nh_ofs + sizeof(*nh));
+ if (unlikely(err))
+ return err;
+
+ nh = ipv6_hdr(skb);
+ nexthdr = nh->nexthdr;
+ payload_ofs = (u8 *)(nh + 1) - skb->data;
+
+ key->ip.proto = NEXTHDR_NONE;
+ key->ip.tos = ipv6_get_dsfield(nh);
+ key->ip.ttl = nh->hop_limit;
+ key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
+ key->ipv6.addr.src = nh->saddr;
+ key->ipv6.addr.dst = nh->daddr;
+
+ payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
+ if (unlikely(payload_ofs < 0))
+ return -EINVAL;
+
+ if (frag_off) {
+ if (frag_off & htons(~0x7))
+ key->ip.frag = OVS_FRAG_TYPE_LATER;
+ else
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+ } else {
+ key->ip.frag = OVS_FRAG_TYPE_NONE;
+ }
+
+ nh_len = payload_ofs - nh_ofs;
+ skb_set_transport_header(skb, nh_ofs + nh_len);
+ key->ip.proto = nexthdr;
+ return nh_len;
+}
+
+static bool icmp6hdr_ok(struct sk_buff *skb)
+{
+ return pskb_may_pull(skb, skb_transport_offset(skb) +
+ sizeof(struct icmp6hdr));
+}
+
+static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ struct qtag_prefix {
+ __be16 eth_type; /* ETH_P_8021Q */
+ __be16 tci;
+ };
+ struct qtag_prefix *qp;
+
+ if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
+ return 0;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
+ sizeof(__be16))))
+ return -ENOMEM;
+
+ qp = (struct qtag_prefix *) skb->data;
+ key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
+ __skb_pull(skb, sizeof(struct qtag_prefix));
+
+ return 0;
+}
+
+static __be16 parse_ethertype(struct sk_buff *skb)
+{
+ struct llc_snap_hdr {
+ u8 dsap; /* Always 0xAA */
+ u8 ssap; /* Always 0xAA */
+ u8 ctrl;
+ u8 oui[3];
+ __be16 ethertype;
+ };
+ struct llc_snap_hdr *llc;
+ __be16 proto;
+
+ proto = *(__be16 *) skb->data;
+ __skb_pull(skb, sizeof(__be16));
+
+ if (ntohs(proto) >= ETH_P_802_3_MIN)
+ return proto;
+
+ if (skb->len < sizeof(struct llc_snap_hdr))
+ return htons(ETH_P_802_2);
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
+ return htons(0);
+
+ llc = (struct llc_snap_hdr *) skb->data;
+ if (llc->dsap != LLC_SAP_SNAP ||
+ llc->ssap != LLC_SAP_SNAP ||
+ (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
+ return htons(ETH_P_802_2);
+
+ __skb_pull(skb, sizeof(struct llc_snap_hdr));
+
+ if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
+ return llc->ethertype;
+
+ return htons(ETH_P_802_2);
+}
+
+static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
+ int nh_len)
+{
+ struct icmp6hdr *icmp = icmp6_hdr(skb);
+
+ /* The ICMPv6 type and code fields use the 16-bit transport port
+ * fields, so we need to store them in 16-bit network byte order.
+ */
+ key->tp.src = htons(icmp->icmp6_type);
+ key->tp.dst = htons(icmp->icmp6_code);
+ memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd));
+
+ if (icmp->icmp6_code == 0 &&
+ (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ int icmp_len = skb->len - skb_transport_offset(skb);
+ struct nd_msg *nd;
+ int offset;
+
+ /* In order to process neighbor discovery options, we need the
+ * entire packet.
+ */
+ if (unlikely(icmp_len < sizeof(*nd)))
+ return 0;
+
+ if (unlikely(skb_linearize(skb)))
+ return -ENOMEM;
+
+ nd = (struct nd_msg *)skb_transport_header(skb);
+ key->ipv6.nd.target = nd->target;
+
+ icmp_len -= sizeof(*nd);
+ offset = 0;
+ while (icmp_len >= 8) {
+ struct nd_opt_hdr *nd_opt =
+ (struct nd_opt_hdr *)(nd->opt + offset);
+ int opt_len = nd_opt->nd_opt_len * 8;
+
+ if (unlikely(!opt_len || opt_len > icmp_len))
+ return 0;
+
+ /* Store the link layer address if the appropriate
+ * option is provided. It is considered an error if
+ * the same link layer option is specified twice.
+ */
+ if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
+ && opt_len == 8) {
+ if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
+ goto invalid;
+ ether_addr_copy(key->ipv6.nd.sll,
+ &nd->opt[offset+sizeof(*nd_opt)]);
+ } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
+ && opt_len == 8) {
+ if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
+ goto invalid;
+ ether_addr_copy(key->ipv6.nd.tll,
+ &nd->opt[offset+sizeof(*nd_opt)]);
+ }
+
+ icmp_len -= opt_len;
+ offset += opt_len;
+ }
+ }
+
+ return 0;
+
+invalid:
+ memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
+ memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
+ memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
+
+ return 0;
+}
+
+/**
+ * key_extract - extracts a flow key from an Ethernet frame.
+ * @skb: sk_buff that contains the frame, with skb->data pointing to the
+ * Ethernet header
+ * @key: output flow key
+ *
+ * The caller must ensure that skb->len >= ETH_HLEN.
+ *
+ * Returns 0 if successful, otherwise a negative errno value.
+ *
+ * Initializes @skb header pointers as follows:
+ *
+ * - skb->mac_header: the Ethernet header.
+ *
+ * - skb->network_header: just past the Ethernet header, or just past the
+ * VLAN header, to the first byte of the Ethernet payload.
+ *
+ * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
+ * on output, then just past the IP header, if one is present and
+ * of a correct length, otherwise the same as skb->network_header.
+ * For other key->eth.type values it is left untouched.
+ */
+static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ int error;
+ struct ethhdr *eth;
+
+ /* Flags are always used as part of stats */
+ key->tp.flags = 0;
+
+ skb_reset_mac_header(skb);
+
+ /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
+ * header in the linear data area.
+ */
+ eth = eth_hdr(skb);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
+
+ __skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
+
+ key->eth.tci = 0;
+ if (skb_vlan_tag_present(skb))
+ key->eth.tci = htons(skb->vlan_tci);
+ else if (eth->h_proto == htons(ETH_P_8021Q))
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
+
+ key->eth.type = parse_ethertype(skb);
+ if (unlikely(key->eth.type == htons(0)))
+ return -ENOMEM;
+
+ skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ /* Network layer. */
+ if (key->eth.type == htons(ETH_P_IP)) {
+ struct iphdr *nh;
+ __be16 offset;
+
+ error = check_iphdr(skb);
+ if (unlikely(error)) {
+ memset(&key->ip, 0, sizeof(key->ip));
+ memset(&key->ipv4, 0, sizeof(key->ipv4));
+ if (error == -EINVAL) {
+ skb->transport_header = skb->network_header;
+ error = 0;
+ }
+ return error;
+ }
+
+ nh = ip_hdr(skb);
+ key->ipv4.addr.src = nh->saddr;
+ key->ipv4.addr.dst = nh->daddr;
+
+ key->ip.proto = nh->protocol;
+ key->ip.tos = nh->tos;
+ key->ip.ttl = nh->ttl;
+
+ offset = nh->frag_off & htons(IP_OFFSET);
+ if (offset) {
+ key->ip.frag = OVS_FRAG_TYPE_LATER;
+ return 0;
+ }
+ if (nh->frag_off & htons(IP_MF) ||
+ skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+ else
+ key->ip.frag = OVS_FRAG_TYPE_NONE;
+
+ /* Transport layer. */
+ if (key->ip.proto == IPPROTO_TCP) {
+ if (tcphdr_ok(skb)) {
+ struct tcphdr *tcp = tcp_hdr(skb);
+ key->tp.src = tcp->source;
+ key->tp.dst = tcp->dest;
+ key->tp.flags = TCP_FLAGS_BE16(tcp);
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+
+ } else if (key->ip.proto == IPPROTO_UDP) {
+ if (udphdr_ok(skb)) {
+ struct udphdr *udp = udp_hdr(skb);
+ key->tp.src = udp->source;
+ key->tp.dst = udp->dest;
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ } else if (key->ip.proto == IPPROTO_SCTP) {
+ if (sctphdr_ok(skb)) {
+ struct sctphdr *sctp = sctp_hdr(skb);
+ key->tp.src = sctp->source;
+ key->tp.dst = sctp->dest;
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ } else if (key->ip.proto == IPPROTO_ICMP) {
+ if (icmphdr_ok(skb)) {
+ struct icmphdr *icmp = icmp_hdr(skb);
+ /* The ICMP type and code fields use the 16-bit
+ * transport port fields, so we need to store
+ * them in 16-bit network byte order. */
+ key->tp.src = htons(icmp->type);
+ key->tp.dst = htons(icmp->code);
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ }
+
+ } else if (key->eth.type == htons(ETH_P_ARP) ||
+ key->eth.type == htons(ETH_P_RARP)) {
+ struct arp_eth_header *arp;
+ bool arp_available = arphdr_ok(skb);
+
+ arp = (struct arp_eth_header *)skb_network_header(skb);
+
+ if (arp_available &&
+ arp->ar_hrd == htons(ARPHRD_ETHER) &&
+ arp->ar_pro == htons(ETH_P_IP) &&
+ arp->ar_hln == ETH_ALEN &&
+ arp->ar_pln == 4) {
+
+ /* We only match on the lower 8 bits of the opcode. */
+ if (ntohs(arp->ar_op) <= 0xff)
+ key->ip.proto = ntohs(arp->ar_op);
+ else
+ key->ip.proto = 0;
+
+ memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
+ memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
+ ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
+ ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
+ } else {
+ memset(&key->ip, 0, sizeof(key->ip));
+ memset(&key->ipv4, 0, sizeof(key->ipv4));
+ }
+ } else if (eth_p_mpls(key->eth.type)) {
+ size_t stack_len = MPLS_HLEN;
+
+ /* In the presence of an MPLS label stack the end of the L2
+ * header and the beginning of the L3 header differ.
+ *
+ * Advance network_header to the beginning of the L3
+ * header. mac_len corresponds to the end of the L2 header.
+ */
+ while (1) {
+ __be32 lse;
+
+ error = check_header(skb, skb->mac_len + stack_len);
+ if (unlikely(error))
+ return 0;
+
+ memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+
+ if (stack_len == MPLS_HLEN)
+ memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+
+ skb_set_network_header(skb, skb->mac_len + stack_len);
+ if (lse & htonl(MPLS_LS_S_MASK))
+ break;
+
+ stack_len += MPLS_HLEN;
+ }
+ } else if (key->eth.type == htons(ETH_P_IPV6)) {
+ int nh_len; /* IPv6 Header + Extensions */
+
+ nh_len = parse_ipv6hdr(skb, key);
+ if (unlikely(nh_len < 0)) {
+ memset(&key->ip, 0, sizeof(key->ip));
+ memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
+ if (nh_len == -EINVAL) {
+ skb->transport_header = skb->network_header;
+ error = 0;
+ } else {
+ error = nh_len;
+ }
+ return error;
+ }
+
+ if (key->ip.frag == OVS_FRAG_TYPE_LATER)
+ return 0;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
+ key->ip.frag = OVS_FRAG_TYPE_FIRST;
+
+ /* Transport layer. */
+ if (key->ip.proto == NEXTHDR_TCP) {
+ if (tcphdr_ok(skb)) {
+ struct tcphdr *tcp = tcp_hdr(skb);
+ key->tp.src = tcp->source;
+ key->tp.dst = tcp->dest;
+ key->tp.flags = TCP_FLAGS_BE16(tcp);
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ } else if (key->ip.proto == NEXTHDR_UDP) {
+ if (udphdr_ok(skb)) {
+ struct udphdr *udp = udp_hdr(skb);
+ key->tp.src = udp->source;
+ key->tp.dst = udp->dest;
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ } else if (key->ip.proto == NEXTHDR_SCTP) {
+ if (sctphdr_ok(skb)) {
+ struct sctphdr *sctp = sctp_hdr(skb);
+ key->tp.src = sctp->source;
+ key->tp.dst = sctp->dest;
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ } else if (key->ip.proto == NEXTHDR_ICMP) {
+ if (icmp6hdr_ok(skb)) {
+ error = parse_icmpv6(skb, key, nh_len);
+ if (error)
+ return error;
+ } else {
+ memset(&key->tp, 0, sizeof(key->tp));
+ }
+ }
+ }
+ return 0;
+}
+
+int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ return key_extract(skb, key);
+}
+
+int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+ struct sk_buff *skb, struct sw_flow_key *key)
+{
+ /* Extract metadata from packet. */
+ if (tun_info) {
+ memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+
+ if (tun_info->options) {
+ BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
+ 8)) - 1
+ > sizeof(key->tun_opts));
+ memcpy(TUN_METADATA_OPTS(key, tun_info->options_len),
+ tun_info->options, tun_info->options_len);
+ key->tun_opts_len = tun_info->options_len;
+ } else {
+ key->tun_opts_len = 0;
+ }
+ } else {
+ key->tun_opts_len = 0;
+ memset(&key->tun_key, 0, sizeof(key->tun_key));
+ }
+
+ key->phy.priority = skb->priority;
+ key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
+ key->phy.skb_mark = skb->mark;
+ key->ovs_flow_hash = 0;
+ key->recirc_id = 0;
+
+ return key_extract(skb, key);
+}
+
+int ovs_flow_key_extract_userspace(const struct nlattr *attr,
+ struct sk_buff *skb,
+ struct sw_flow_key *key, bool log)
+{
+ int err;
+
+ memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE);
+
+ /* Extract metadata from netlink attributes. */
+ err = ovs_nla_get_flow_metadata(attr, key, log);
+ if (err)
+ return err;
+
+ return key_extract(skb, key);
+}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
new file mode 100644
index 000000000..a076e445c
--- /dev/null
+++ b/net/openvswitch/flow.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_H
+#define FLOW_H 1
+
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+#include <net/inet_ecn.h>
+
+struct sk_buff;
+
+/* Used to memset ovs_key_ipv4_tunnel padding. */
+#define OVS_TUNNEL_KEY_SIZE \
+ (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \
+ FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
+
+struct ovs_key_ipv4_tunnel {
+ __be64 tun_id;
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ __be16 tun_flags;
+ u8 ipv4_tos;
+ u8 ipv4_ttl;
+ __be16 tp_src;
+ __be16 tp_dst;
+} __packed __aligned(4); /* Minimize padding. */
+
+struct ovs_tunnel_info {
+ struct ovs_key_ipv4_tunnel tunnel;
+ const void *options;
+ u8 options_len;
+};
+
+/* Store options at the end of the array if they are less than the
+ * maximum size. This allows us to get the benefits of variable length
+ * matching for small options.
+ */
+#define TUN_METADATA_OFFSET(opt_len) \
+ (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len)
+#define TUN_METADATA_OPTS(flow_key, opt_len) \
+ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
+
+static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
+ __be32 saddr, __be32 daddr,
+ u8 tos, u8 ttl,
+ __be16 tp_src,
+ __be16 tp_dst,
+ __be64 tun_id,
+ __be16 tun_flags,
+ const void *opts,
+ u8 opts_len)
+{
+ tun_info->tunnel.tun_id = tun_id;
+ tun_info->tunnel.ipv4_src = saddr;
+ tun_info->tunnel.ipv4_dst = daddr;
+ tun_info->tunnel.ipv4_tos = tos;
+ tun_info->tunnel.ipv4_ttl = ttl;
+ tun_info->tunnel.tun_flags = tun_flags;
+
+ /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
+ * the upper tunnel are used.
+ * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
+ */
+ tun_info->tunnel.tp_src = tp_src;
+ tun_info->tunnel.tp_dst = tp_dst;
+
+ /* Clear struct padding. */
+ if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
+ memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
+ 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
+
+ tun_info->options = opts;
+ tun_info->options_len = opts_len;
+}
+
+static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
+ const struct iphdr *iph,
+ __be16 tp_src,
+ __be16 tp_dst,
+ __be64 tun_id,
+ __be16 tun_flags,
+ const void *opts,
+ u8 opts_len)
+{
+ __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
+ iph->tos, iph->ttl,
+ tp_src, tp_dst,
+ tun_id, tun_flags,
+ opts, opts_len);
+}
+
+#define OVS_SW_FLOW_KEY_METADATA_SIZE \
+ (offsetof(struct sw_flow_key, recirc_id) + \
+ FIELD_SIZEOF(struct sw_flow_key, recirc_id))
+
+struct sw_flow_key {
+ u8 tun_opts[255];
+ u8 tun_opts_len;
+ struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
+ struct {
+ u32 priority; /* Packet QoS priority. */
+ u32 skb_mark; /* SKB mark. */
+ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */
+ } __packed phy; /* Safe when right after 'tun_key'. */
+ u32 ovs_flow_hash; /* Datapath computed hash value. */
+ u32 recirc_id; /* Recirculation ID. */
+ struct {
+ u8 src[ETH_ALEN]; /* Ethernet source address. */
+ u8 dst[ETH_ALEN]; /* Ethernet destination address. */
+ __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
+ __be16 type; /* Ethernet frame type. */
+ } eth;
+ union {
+ struct {
+ __be32 top_lse; /* top label stack entry */
+ } mpls;
+ struct {
+ u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
+ u8 tos; /* IP ToS. */
+ u8 ttl; /* IP TTL/hop limit. */
+ u8 frag; /* One of OVS_FRAG_TYPE_*. */
+ } ip;
+ };
+ struct {
+ __be16 src; /* TCP/UDP/SCTP source port. */
+ __be16 dst; /* TCP/UDP/SCTP destination port. */
+ __be16 flags; /* TCP flags. */
+ } tp;
+ union {
+ struct {
+ struct {
+ __be32 src; /* IP source address. */
+ __be32 dst; /* IP destination address. */
+ } addr;
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
+ } ipv4;
+ struct {
+ struct {
+ struct in6_addr src; /* IPv6 source address. */
+ struct in6_addr dst; /* IPv6 destination address. */
+ } addr;
+ __be32 label; /* IPv6 flow label. */
+ struct {
+ struct in6_addr target; /* ND target address. */
+ u8 sll[ETH_ALEN]; /* ND source link layer address. */
+ u8 tll[ETH_ALEN]; /* ND target link layer address. */
+ } nd;
+ } ipv6;
+ };
+} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+
+struct sw_flow_key_range {
+ unsigned short int start;
+ unsigned short int end;
+};
+
+struct sw_flow_mask {
+ int ref_count;
+ struct rcu_head rcu;
+ struct list_head list;
+ struct sw_flow_key_range range;
+ struct sw_flow_key key;
+};
+
+struct sw_flow_match {
+ struct sw_flow_key *key;
+ struct sw_flow_key_range range;
+ struct sw_flow_mask *mask;
+};
+
+#define MAX_UFID_LENGTH 16 /* 128 bits */
+
+struct sw_flow_id {
+ u32 ufid_len;
+ union {
+ u32 ufid[MAX_UFID_LENGTH / 4];
+ struct sw_flow_key *unmasked_key;
+ };
+};
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ u32 actions_len;
+ struct nlattr actions[];
+};
+
+struct flow_stats {
+ u64 packet_count; /* Number of packets matched. */
+ u64 byte_count; /* Number of bytes matched. */
+ unsigned long used; /* Last used time (in jiffies). */
+ spinlock_t lock; /* Lock for atomic stats update. */
+ __be16 tcp_flags; /* Union of seen TCP flags. */
+};
+
+struct sw_flow {
+ struct rcu_head rcu;
+ struct {
+ struct hlist_node node[2];
+ u32 hash;
+ } flow_table, ufid_table;
+ int stats_last_writer; /* NUMA-node id of the last writer on
+ * 'stats[0]'.
+ */
+ struct sw_flow_key key;
+ struct sw_flow_id id;
+ struct sw_flow_mask *mask;
+ struct sw_flow_actions __rcu *sf_acts;
+ struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one
+ * is allocated at flow creation time,
+ * the rest are allocated on demand
+ * while holding the 'stats[0].lock'.
+ */
+};
+
+struct arp_eth_header {
+ __be16 ar_hrd; /* format of hardware address */
+ __be16 ar_pro; /* format of protocol address */
+ unsigned char ar_hln; /* length of hardware address */
+ unsigned char ar_pln; /* length of protocol address */
+ __be16 ar_op; /* ARP opcode (command) */
+
+ /* Ethernet+IPv4 specific members. */
+ unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
+ unsigned char ar_sip[4]; /* sender IP address */
+ unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
+ unsigned char ar_tip[4]; /* target IP address */
+} __packed;
+
+static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid)
+{
+ return sfid->ufid_len;
+}
+
+static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid)
+{
+ return !ovs_identifier_is_ufid(sfid);
+}
+
+void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
+ const struct sk_buff *);
+void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
+ unsigned long *used, __be16 *tcp_flags);
+void ovs_flow_stats_clear(struct sw_flow *);
+u64 ovs_flow_used_time(unsigned long flow_jiffies);
+
+int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+ struct sk_buff *skb,
+ struct sw_flow_key *key);
+/* Extract key from packet coming from userspace. */
+int ovs_flow_key_extract_userspace(const struct nlattr *attr,
+ struct sk_buff *skb,
+ struct sw_flow_key *key, bool log);
+
+#endif /* flow.h */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
new file mode 100644
index 000000000..c691b1a1e
--- /dev/null
+++ b/net/openvswitch/flow_netlink.c
@@ -0,0 +1,2309 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/geneve.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/mpls.h>
+
+#include "flow_netlink.h"
+#include "vport-vxlan.h"
+
+struct ovs_len_tbl {
+ int len;
+ const struct ovs_len_tbl *next;
+};
+
+#define OVS_ATTR_NESTED -1
+
+static void update_range(struct sw_flow_match *match,
+ size_t offset, size_t size, bool is_mask)
+{
+ struct sw_flow_key_range *range;
+ size_t start = rounddown(offset, sizeof(long));
+ size_t end = roundup(offset + size, sizeof(long));
+
+ if (!is_mask)
+ range = &match->range;
+ else
+ range = &match->mask->range;
+
+ if (range->start == range->end) {
+ range->start = start;
+ range->end = end;
+ return;
+ }
+
+ if (range->start > start)
+ range->start = start;
+
+ if (range->end < end)
+ range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+ do { \
+ update_range(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) \
+ (match)->mask->key.field = value; \
+ else \
+ (match)->key->field = value; \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \
+ do { \
+ update_range(match, offset, len, is_mask); \
+ if (is_mask) \
+ memcpy((u8 *)&(match)->mask->key + offset, value_p, \
+ len); \
+ else \
+ memcpy((u8 *)(match)->key + offset, value_p, len); \
+ } while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+ SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \
+ value_p, len, is_mask)
+
+#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \
+ do { \
+ update_range(match, offsetof(struct sw_flow_key, field), \
+ sizeof((match)->key->field), is_mask); \
+ if (is_mask) \
+ memset((u8 *)&(match)->mask->key.field, value, \
+ sizeof((match)->mask->key.field)); \
+ else \
+ memset((u8 *)&(match)->key->field, value, \
+ sizeof((match)->key->field)); \
+ } while (0)
+
+static bool match_validate(const struct sw_flow_match *match,
+ u64 key_attrs, u64 mask_attrs, bool log)
+{
+ u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+ u64 mask_allowed = key_attrs; /* At most allow all key attributes */
+
+ /* The following mask attributes allowed only if they
+ * pass the validation tests. */
+ mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_TCP)
+ | (1 << OVS_KEY_ATTR_TCP_FLAGS)
+ | (1 << OVS_KEY_ATTR_UDP)
+ | (1 << OVS_KEY_ATTR_SCTP)
+ | (1 << OVS_KEY_ATTR_ICMP)
+ | (1 << OVS_KEY_ATTR_ICMPV6)
+ | (1 << OVS_KEY_ATTR_ARP)
+ | (1 << OVS_KEY_ATTR_ND)
+ | (1 << OVS_KEY_ATTR_MPLS));
+
+ /* Always allowed mask fields. */
+ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+ | (1 << OVS_KEY_ATTR_IN_PORT)
+ | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+ /* Check key attributes. */
+ if (match->key->eth.type == htons(ETH_P_ARP)
+ || match->key->eth.type == htons(ETH_P_RARP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ARP;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+ }
+
+ if (eth_p_mpls(match->key->eth.type)) {
+ key_expected |= 1 << OVS_KEY_ATTR_MPLS;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_MPLS;
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IP)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMP) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+ }
+ }
+ }
+
+ if (match->key->eth.type == htons(ETH_P_IPV6)) {
+ key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+ if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+ if (match->key->ip.proto == IPPROTO_UDP) {
+ key_expected |= 1 << OVS_KEY_ATTR_UDP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_SCTP) {
+ key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+ }
+
+ if (match->key->ip.proto == IPPROTO_TCP) {
+ key_expected |= 1 << OVS_KEY_ATTR_TCP;
+ key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ if (match->mask && (match->mask->key.ip.proto == 0xff)) {
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+ mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS;
+ }
+ }
+
+ if (match->key->ip.proto == IPPROTO_ICMPV6) {
+ key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+ if (match->mask && (match->mask->key.ip.proto == 0xff))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+ if (match->key->tp.src ==
+ htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+ key_expected |= 1 << OVS_KEY_ATTR_ND;
+ if (match->mask && (match->mask->key.tp.src == htons(0xff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+ }
+ }
+ }
+ }
+
+ if ((key_attrs & key_expected) != key_expected) {
+ /* Key attributes check failed. */
+ OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)",
+ (unsigned long long)key_attrs,
+ (unsigned long long)key_expected);
+ return false;
+ }
+
+ if ((mask_attrs & mask_allowed) != mask_attrs) {
+ /* Mask attributes check failed. */
+ OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)",
+ (unsigned long long)mask_attrs,
+ (unsigned long long)mask_allowed);
+ return false;
+ }
+
+ return true;
+}
+
+size_t ovs_tun_key_attr_size(void)
+{
+ /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider
+ * updating this function.
+ */
+ return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */
+ + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
+ + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */
+ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */
+ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */
+ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */
+ + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
+ /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with
+ * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
+ */
+ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
+ + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+}
+
+size_t ovs_key_attr_size(void)
+{
+ /* Whenever adding new OVS_KEY_ FIELDS, we should consider
+ * updating this function.
+ */
+ BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22);
+
+ return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
+ + ovs_tun_key_attr_size()
+ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
+ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */
+ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */
+ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
+ + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */
+ + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */
+ + nla_total_size(28); /* OVS_KEY_ATTR_ND */
+}
+
+static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+ [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) },
+ [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) },
+ [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) },
+ [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 },
+ [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 },
+ [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) },
+ [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
+ [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
+ [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED },
+};
+
+/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
+static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED },
+ [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) },
+ [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) },
+ [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) },
+ [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) },
+ [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) },
+ [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) },
+ [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) },
+ [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) },
+ [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) },
+ [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) },
+ [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED,
+ .next = ovs_tunnel_key_lens, },
+ [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) },
+};
+
+static bool is_all_zero(const u8 *fp, size_t size)
+{
+ int i;
+
+ if (!fp)
+ return false;
+
+ for (i = 0; i < size; i++)
+ if (fp[i])
+ return false;
+
+ return true;
+}
+
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[],
+ u64 *attrsp, bool log, bool nz)
+{
+ const struct nlattr *nla;
+ u64 attrs;
+ int rem;
+
+ attrs = *attrsp;
+ nla_for_each_nested(nla, attr, rem) {
+ u16 type = nla_type(nla);
+ int expected_len;
+
+ if (type > OVS_KEY_ATTR_MAX) {
+ OVS_NLERR(log, "Key type %d is out of range max %d",
+ type, OVS_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (attrs & (1 << type)) {
+ OVS_NLERR(log, "Duplicate key (type %d).", type);
+ return -EINVAL;
+ }
+
+ expected_len = ovs_key_lens[type].len;
+ if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) {
+ OVS_NLERR(log, "Key %d has unexpected len %d expected %d",
+ type, nla_len(nla), expected_len);
+ return -EINVAL;
+ }
+
+ if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+ attrs |= 1 << type;
+ a[type] = nla;
+ }
+ }
+ if (rem) {
+ OVS_NLERR(log, "Message has %d unknown bytes.", rem);
+ return -EINVAL;
+ }
+
+ *attrsp = attrs;
+ return 0;
+}
+
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp,
+ bool log)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, log, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+ const struct nlattr *a[], u64 *attrsp,
+ bool log)
+{
+ return __parse_flow_nlattrs(attr, a, attrsp, log, false);
+}
+
+static int genev_tun_opt_from_nlattr(const struct nlattr *a,
+ struct sw_flow_match *match, bool is_mask,
+ bool log)
+{
+ unsigned long opt_key_offset;
+
+ if (nla_len(a) > sizeof(match->key->tun_opts)) {
+ OVS_NLERR(log, "Geneve option length err (len %d, max %zu).",
+ nla_len(a), sizeof(match->key->tun_opts));
+ return -EINVAL;
+ }
+
+ if (nla_len(a) % 4 != 0) {
+ OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.",
+ nla_len(a));
+ return -EINVAL;
+ }
+
+ /* We need to record the length of the options passed
+ * down, otherwise packets with the same format but
+ * additional options will be silently matched.
+ */
+ if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a),
+ false);
+ } else {
+ /* This is somewhat unusual because it looks at
+ * both the key and mask while parsing the
+ * attributes (and by extension assumes the key
+ * is parsed first). Normally, we would verify
+ * that each is the correct length and that the
+ * attributes line up in the validate function.
+ * However, that is difficult because this is
+ * variable length and we won't have the
+ * information later.
+ */
+ if (match->key->tun_opts_len != nla_len(a)) {
+ OVS_NLERR(log, "Geneve option len %d != mask len %d",
+ match->key->tun_opts_len, nla_len(a));
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+ }
+
+ opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
+ SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
+ nla_len(a), is_mask);
+ return 0;
+}
+
+static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = {
+ [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 },
+};
+
+static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
+ struct sw_flow_match *match, bool is_mask,
+ bool log)
+{
+ struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
+ unsigned long opt_key_offset;
+ struct ovs_vxlan_opts opts;
+ int err;
+
+ BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+ err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy);
+ if (err < 0)
+ return err;
+
+ memset(&opts, 0, sizeof(opts));
+
+ if (tb[OVS_VXLAN_EXT_GBP])
+ opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]);
+
+ if (!is_mask)
+ SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
+ else
+ SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+
+ opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+ SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+ is_mask);
+ return 0;
+}
+
+static int ipv4_tun_from_nlattr(const struct nlattr *attr,
+ struct sw_flow_match *match, bool is_mask,
+ bool log)
+{
+ struct nlattr *a;
+ int rem;
+ bool ttl = false;
+ __be16 tun_flags = 0;
+ int opts_type = 0;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ int err;
+
+ if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+ OVS_NLERR(log, "Tunnel attr %d out of range max %d",
+ type, OVS_TUNNEL_KEY_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (ovs_tunnel_key_lens[type].len != nla_len(a) &&
+ ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) {
+ OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d",
+ type, nla_len(a), ovs_tunnel_key_lens[type].len);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_TUNNEL_KEY_ATTR_ID:
+ SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+ nla_get_be64(a), is_mask);
+ tun_flags |= TUNNEL_KEY;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ nla_get_in_addr(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ nla_get_in_addr(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TOS:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ nla_get_u8(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TTL:
+ SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ nla_get_u8(a), is_mask);
+ ttl = true;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
+ tun_flags |= TUNNEL_DONT_FRAGMENT;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_CSUM:
+ tun_flags |= TUNNEL_CSUM;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TP_SRC:
+ SW_FLOW_KEY_PUT(match, tun_key.tp_src,
+ nla_get_be16(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_TP_DST:
+ SW_FLOW_KEY_PUT(match, tun_key.tp_dst,
+ nla_get_be16(a), is_mask);
+ break;
+ case OVS_TUNNEL_KEY_ATTR_OAM:
+ tun_flags |= TUNNEL_OAM;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+ if (opts_type) {
+ OVS_NLERR(log, "Multiple metadata blocks provided");
+ return -EINVAL;
+ }
+
+ err = genev_tun_opt_from_nlattr(a, match, is_mask, log);
+ if (err)
+ return err;
+
+ tun_flags |= TUNNEL_GENEVE_OPT;
+ opts_type = type;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ if (opts_type) {
+ OVS_NLERR(log, "Multiple metadata blocks provided");
+ return -EINVAL;
+ }
+
+ err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log);
+ if (err)
+ return err;
+
+ tun_flags |= TUNNEL_VXLAN_OPT;
+ opts_type = type;
+ break;
+ default:
+ OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d",
+ type);
+ return -EINVAL;
+ }
+ }
+
+ SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+
+ if (rem > 0) {
+ OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.",
+ rem);
+ return -EINVAL;
+ }
+
+ if (!is_mask) {
+ if (!match->key->tun_key.ipv4_dst) {
+ OVS_NLERR(log, "IPv4 tunnel dst address is zero");
+ return -EINVAL;
+ }
+
+ if (!ttl) {
+ OVS_NLERR(log, "IPv4 tunnel TTL not specified.");
+ return -EINVAL;
+ }
+ }
+
+ return opts_type;
+}
+
+static int vxlan_opt_to_nlattr(struct sk_buff *skb,
+ const void *tun_opts, int swkey_tun_opts_len)
+{
+ const struct ovs_vxlan_opts *opts = tun_opts;
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0)
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
+ const struct ovs_key_ipv4_tunnel *output,
+ const void *tun_opts, int swkey_tun_opts_len)
+{
+ if (output->tun_flags & TUNNEL_KEY &&
+ nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
+ return -EMSGSIZE;
+ if (output->ipv4_src &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
+ output->ipv4_src))
+ return -EMSGSIZE;
+ if (output->ipv4_dst &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
+ output->ipv4_dst))
+ return -EMSGSIZE;
+ if (output->ipv4_tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ return -EMSGSIZE;
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_CSUM) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
+ return -EMSGSIZE;
+ if (output->tp_src &&
+ nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src))
+ return -EMSGSIZE;
+ if (output->tp_dst &&
+ nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst))
+ return -EMSGSIZE;
+ if ((output->tun_flags & TUNNEL_OAM) &&
+ nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
+ return -EMSGSIZE;
+ if (tun_opts) {
+ if (output->tun_flags & TUNNEL_GENEVE_OPT &&
+ nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
+ swkey_tun_opts_len, tun_opts))
+ return -EMSGSIZE;
+ else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
+ vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static int ipv4_tun_to_nlattr(struct sk_buff *skb,
+ const struct ovs_key_ipv4_tunnel *output,
+ const void *tun_opts, int swkey_tun_opts_len)
+{
+ struct nlattr *nla;
+ int err;
+
+ nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
+ if (!nla)
+ return -EMSGSIZE;
+
+ err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len);
+ if (err)
+ return err;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
+ const struct ovs_tunnel_info *egress_tun_info)
+{
+ return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
+ egress_tun_info->options,
+ egress_tun_info->options_len);
+}
+
+static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
+ const struct nlattr **a, bool is_mask,
+ bool log)
+{
+ if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) {
+ u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]);
+
+ SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) {
+ u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]);
+
+ SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+ SW_FLOW_KEY_PUT(match, phy.priority,
+ nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+ u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
+
+ if (is_mask) {
+ in_port = 0xffffffff; /* Always exact match in_port. */
+ } else if (in_port >= DP_MAX_PORTS) {
+ OVS_NLERR(log, "Port %d exceeds max allowable %d",
+ in_port, DP_MAX_PORTS);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
+ }
+
+ if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+ uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+ SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+ *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+ }
+ if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+ if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+ is_mask, log) < 0)
+ return -EINVAL;
+ *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
+ }
+ return 0;
+}
+
+static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
+ const struct nlattr **a, bool is_mask,
+ bool log)
+{
+ int err;
+
+ err = metadata_from_nlattrs(match, &attrs, a, is_mask, log);
+ if (err)
+ return err;
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+ const struct ovs_key_ethernet *eth_key;
+
+ eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+ SW_FLOW_KEY_MEMCPY(match, eth.src,
+ eth_key->eth_src, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, eth.dst,
+ eth_key->eth_dst, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
+ __be16 tci;
+
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ if (is_mask)
+ OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.");
+ else
+ OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set.");
+
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
+ __be16 eth_type;
+
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+ if (is_mask) {
+ /* Always exact match EtherType. */
+ eth_type = htons(0xffff);
+ } else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ OVS_NLERR(log, "EtherType %x is less than min %x",
+ ntohs(eth_type), ETH_P_802_3_MIN);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ } else if (!is_mask) {
+ SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+ const struct ovs_key_ipv4 *ipv4_key;
+
+ ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
+ if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR(log, "IPv4 frag type %d is out of range max %d",
+ ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv4_key->ipv4_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv4_key->ipv4_tos, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv4_key->ipv4_ttl, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv4_key->ipv4_frag, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ ipv4_key->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ ipv4_key->ipv4_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+ const struct ovs_key_ipv6 *ipv6_key;
+
+ ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
+ if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+ OVS_NLERR(log, "IPv6 frag type %d is out of range max %d",
+ ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
+ return -EINVAL;
+ }
+
+ if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) {
+ OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x).\n",
+ ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ipv6.label,
+ ipv6_key->ipv6_label, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ipv6_key->ipv6_proto, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.tos,
+ ipv6_key->ipv6_tclass, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.ttl,
+ ipv6_key->ipv6_hlimit, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.frag,
+ ipv6_key->ipv6_frag, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+ ipv6_key->ipv6_src,
+ sizeof(match->key->ipv6.addr.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+ ipv6_key->ipv6_dst,
+ sizeof(match->key->ipv6.addr.dst),
+ is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
+ const struct ovs_key_arp *arp_key;
+
+ arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+ if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+ OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).",
+ arp_key->arp_op);
+ return -EINVAL;
+ }
+
+ SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+ arp_key->arp_sip, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+ arp_key->arp_tip, is_mask);
+ SW_FLOW_KEY_PUT(match, ip.proto,
+ ntohs(arp_key->arp_op), is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+ arp_key->arp_sha, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+ arp_key->arp_tha, ETH_ALEN, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
+ const struct ovs_key_mpls *mpls_key;
+
+ mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
+ SW_FLOW_KEY_PUT(match, mpls.top_lse,
+ mpls_key->mpls_lse, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_MPLS);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+ const struct ovs_key_tcp *tcp_key;
+
+ tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+ SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) {
+ SW_FLOW_KEY_PUT(match, tp.flags,
+ nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]),
+ is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+ const struct ovs_key_udp *udp_key;
+
+ udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+ SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+ const struct ovs_key_sctp *sctp_key;
+
+ sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+ SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+ const struct ovs_key_icmp *icmp_key;
+
+ icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+ SW_FLOW_KEY_PUT(match, tp.src,
+ htons(icmp_key->icmp_type), is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst,
+ htons(icmp_key->icmp_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+ const struct ovs_key_icmpv6 *icmpv6_key;
+
+ icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+ SW_FLOW_KEY_PUT(match, tp.src,
+ htons(icmpv6_key->icmpv6_type), is_mask);
+ SW_FLOW_KEY_PUT(match, tp.dst,
+ htons(icmpv6_key->icmpv6_code), is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+ }
+
+ if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+ const struct ovs_key_nd *nd_key;
+
+ nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+ nd_key->nd_target,
+ sizeof(match->key->ipv6.nd.target),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+ nd_key->nd_sll, ETH_ALEN, is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+ nd_key->nd_tll, ETH_ALEN, is_mask);
+ attrs &= ~(1 << OVS_KEY_ATTR_ND);
+ }
+
+ if (attrs != 0) {
+ OVS_NLERR(log, "Unknown key attributes %llx",
+ (unsigned long long)attrs);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void nlattr_set(struct nlattr *attr, u8 val,
+ const struct ovs_len_tbl *tbl)
+{
+ struct nlattr *nla;
+ int rem;
+
+ /* The nlattr stream should already have been validated */
+ nla_for_each_nested(nla, attr, rem) {
+ if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED)
+ nlattr_set(nla, val, tbl[nla_type(nla)].next);
+ else
+ memset(nla_data(nla), val, nla_len(nla));
+ }
+}
+
+static void mask_set_nlattr(struct nlattr *attr, u8 val)
+{
+ nlattr_set(attr, val, ovs_key_lens);
+}
+
+/**
+ * ovs_nla_get_match - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ * @log: Boolean to allow kernel error logging. Normally true, but when
+ * probing for feature compatibility this should be passed in as false to
+ * suppress unnecessary error logging.
+ */
+int ovs_nla_get_match(struct sw_flow_match *match,
+ const struct nlattr *nla_key,
+ const struct nlattr *nla_mask,
+ bool log)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ const struct nlattr *encap;
+ struct nlattr *newmask = NULL;
+ u64 key_attrs = 0;
+ u64 mask_attrs = 0;
+ bool encap_valid = false;
+ int err;
+
+ err = parse_flow_nlattrs(nla_key, a, &key_attrs, log);
+ if (err)
+ return err;
+
+ if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+ (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+ __be16 tci;
+
+ if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+ (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+ OVS_NLERR(log, "Invalid Vlan frame.");
+ return -EINVAL;
+ }
+
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ encap_valid = true;
+
+ if (tci & htons(VLAN_TAG_PRESENT)) {
+ err = parse_flow_nlattrs(encap, a, &key_attrs, log);
+ if (err)
+ return err;
+ } else if (!tci) {
+ /* Corner case for truncated 802.1Q header. */
+ if (nla_len(encap)) {
+ OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute.");
+ return -EINVAL;
+ }
+ } else {
+ OVS_NLERR(log, "Encap attr is set for non-VLAN frame");
+ return -EINVAL;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, key_attrs, a, false, log);
+ if (err)
+ return err;
+
+ if (match->mask) {
+ if (!nla_mask) {
+ /* Create an exact match mask. We need to set to 0xff
+ * all the 'match->mask' fields that have been touched
+ * in 'match->key'. We cannot simply memset
+ * 'match->mask', because padding bytes and fields not
+ * specified in 'match->key' should be left to 0.
+ * Instead, we use a stream of netlink attributes,
+ * copied from 'key' and set to 0xff.
+ * ovs_key_from_nlattrs() will take care of filling
+ * 'match->mask' appropriately.
+ */
+ newmask = kmemdup(nla_key,
+ nla_total_size(nla_len(nla_key)),
+ GFP_KERNEL);
+ if (!newmask)
+ return -ENOMEM;
+
+ mask_set_nlattr(newmask, 0xff);
+
+ /* The userspace does not send tunnel attributes that
+ * are 0, but we should not wildcard them nonetheless.
+ */
+ if (match->key->tun_key.ipv4_dst)
+ SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
+ 0xff, true);
+
+ nla_mask = newmask;
+ }
+
+ err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log);
+ if (err)
+ goto free_newmask;
+
+ /* Always match on tci. */
+ SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
+
+ if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) {
+ __be16 eth_type = 0;
+ __be16 tci = 0;
+
+ if (!encap_valid) {
+ OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame.");
+ err = -EINVAL;
+ goto free_newmask;
+ }
+
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+ if (a[OVS_KEY_ATTR_ETHERTYPE])
+ eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+ if (eth_type == htons(0xffff)) {
+ mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+ encap = a[OVS_KEY_ATTR_ENCAP];
+ err = parse_flow_mask_nlattrs(encap, a,
+ &mask_attrs, log);
+ if (err)
+ goto free_newmask;
+ } else {
+ OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).",
+ ntohs(eth_type));
+ err = -EINVAL;
+ goto free_newmask;
+ }
+
+ if (a[OVS_KEY_ATTR_VLAN])
+ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+ if (!(tci & htons(VLAN_TAG_PRESENT))) {
+ OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).",
+ ntohs(tci));
+ err = -EINVAL;
+ goto free_newmask;
+ }
+ }
+
+ err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log);
+ if (err)
+ goto free_newmask;
+ }
+
+ if (!match_validate(match, key_attrs, mask_attrs, log))
+ err = -EINVAL;
+
+free_newmask:
+ kfree(newmask);
+ return err;
+}
+
+static size_t get_ufid_len(const struct nlattr *attr, bool log)
+{
+ size_t len;
+
+ if (!attr)
+ return 0;
+
+ len = nla_len(attr);
+ if (len < 1 || len > MAX_UFID_LENGTH) {
+ OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)",
+ nla_len(attr), MAX_UFID_LENGTH);
+ return 0;
+ }
+
+ return len;
+}
+
+/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID,
+ * or false otherwise.
+ */
+bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr,
+ bool log)
+{
+ sfid->ufid_len = get_ufid_len(attr, log);
+ if (sfid->ufid_len)
+ memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len);
+
+ return sfid->ufid_len;
+}
+
+int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
+ const struct sw_flow_key *key, bool log)
+{
+ struct sw_flow_key *new_key;
+
+ if (ovs_nla_get_ufid(sfid, ufid, log))
+ return 0;
+
+ /* If UFID was not provided, use unmasked key. */
+ new_key = kmalloc(sizeof(*new_key), GFP_KERNEL);
+ if (!new_key)
+ return -ENOMEM;
+ memcpy(new_key, key, sizeof(*key));
+ sfid->unmasked_key = new_key;
+
+ return 0;
+}
+
+u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
+{
+ return attr ? nla_get_u32(attr) : 0;
+}
+
+/**
+ * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
+ * @key: Receives extracted in_port, priority, tun_key and skb_mark.
+ * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence.
+ * @log: Boolean to allow kernel error logging. Normally true, but when
+ * probing for feature compatibility this should be passed in as false to
+ * suppress unnecessary error logging.
+ *
+ * This parses a series of Netlink attributes that form a flow key, which must
+ * take the same form accepted by flow_from_nlattrs(), but only enough of it to
+ * get the metadata, that is, the parts of the flow key that cannot be
+ * extracted from the packet itself.
+ */
+
+int ovs_nla_get_flow_metadata(const struct nlattr *attr,
+ struct sw_flow_key *key,
+ bool log)
+{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ struct sw_flow_match match;
+ u64 attrs = 0;
+ int err;
+
+ err = parse_flow_nlattrs(attr, a, &attrs, log);
+ if (err)
+ return -EINVAL;
+
+ memset(&match, 0, sizeof(match));
+ match.key = key;
+
+ key->phy.in_port = DP_MAX_PORTS;
+
+ return metadata_from_nlattrs(&match, &attrs, a, false, log);
+}
+
+static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, bool is_mask,
+ struct sk_buff *skb)
+{
+ struct ovs_key_ethernet *eth_key;
+ struct nlattr *nla, *encap;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
+ goto nla_put_failure;
+
+ if ((swkey->tun_key.ipv4_dst || is_mask)) {
+ const void *opts = NULL;
+
+ if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
+ opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
+
+ if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
+ swkey->tun_opts_len))
+ goto nla_put_failure;
+ }
+
+ if (swkey->phy.in_port == DP_MAX_PORTS) {
+ if (is_mask && (output->phy.in_port == 0xffff))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+ goto nla_put_failure;
+ } else {
+ u16 upper_u16;
+ upper_u16 = !is_mask ? 0 : 0xffff;
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+ (upper_u16 << 16) | output->phy.in_port))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
+ if (!nla)
+ goto nla_put_failure;
+
+ eth_key = nla_data(nla);
+ ether_addr_copy(eth_key->eth_src, output->eth.src);
+ ether_addr_copy(eth_key->eth_dst, output->eth.dst);
+
+ if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
+ __be16 eth_type;
+ eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+ nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
+ goto nla_put_failure;
+ encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
+ if (!swkey->eth.tci)
+ goto unencap;
+ } else
+ encap = NULL;
+
+ if (swkey->eth.type == htons(ETH_P_802_2)) {
+ /*
+ * Ethertype 802.2 is represented in the netlink with omitted
+ * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+ * 0xffff in the mask attribute. Ethertype can also
+ * be wildcarded.
+ */
+ if (is_mask && output->eth.type)
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+ output->eth.type))
+ goto nla_put_failure;
+ goto unencap;
+ }
+
+ if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
+ goto nla_put_failure;
+
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ipv4 *ipv4_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv4_key = nla_data(nla);
+ ipv4_key->ipv4_src = output->ipv4.addr.src;
+ ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+ ipv4_key->ipv4_proto = output->ip.proto;
+ ipv4_key->ipv4_tos = output->ip.tos;
+ ipv4_key->ipv4_ttl = output->ip.ttl;
+ ipv4_key->ipv4_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ipv6 *ipv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ ipv6_key = nla_data(nla);
+ memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
+ sizeof(ipv6_key->ipv6_src));
+ memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
+ sizeof(ipv6_key->ipv6_dst));
+ ipv6_key->ipv6_label = output->ipv6.label;
+ ipv6_key->ipv6_proto = output->ip.proto;
+ ipv6_key->ipv6_tclass = output->ip.tos;
+ ipv6_key->ipv6_hlimit = output->ip.ttl;
+ ipv6_key->ipv6_frag = output->ip.frag;
+ } else if (swkey->eth.type == htons(ETH_P_ARP) ||
+ swkey->eth.type == htons(ETH_P_RARP)) {
+ struct ovs_key_arp *arp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
+ if (!nla)
+ goto nla_put_failure;
+ arp_key = nla_data(nla);
+ memset(arp_key, 0, sizeof(struct ovs_key_arp));
+ arp_key->arp_sip = output->ipv4.addr.src;
+ arp_key->arp_tip = output->ipv4.addr.dst;
+ arp_key->arp_op = htons(output->ip.proto);
+ ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
+ ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
+ } else if (eth_p_mpls(swkey->eth.type)) {
+ struct ovs_key_mpls *mpls_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+ if (!nla)
+ goto nla_put_failure;
+ mpls_key = nla_data(nla);
+ mpls_key->mpls_lse = output->mpls.top_lse;
+ }
+
+ if ((swkey->eth.type == htons(ETH_P_IP) ||
+ swkey->eth.type == htons(ETH_P_IPV6)) &&
+ swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
+
+ if (swkey->ip.proto == IPPROTO_TCP) {
+ struct ovs_key_tcp *tcp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
+ if (!nla)
+ goto nla_put_failure;
+ tcp_key = nla_data(nla);
+ tcp_key->tcp_src = output->tp.src;
+ tcp_key->tcp_dst = output->tp.dst;
+ if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS,
+ output->tp.flags))
+ goto nla_put_failure;
+ } else if (swkey->ip.proto == IPPROTO_UDP) {
+ struct ovs_key_udp *udp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
+ if (!nla)
+ goto nla_put_failure;
+ udp_key = nla_data(nla);
+ udp_key->udp_src = output->tp.src;
+ udp_key->udp_dst = output->tp.dst;
+ } else if (swkey->ip.proto == IPPROTO_SCTP) {
+ struct ovs_key_sctp *sctp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+ if (!nla)
+ goto nla_put_failure;
+ sctp_key = nla_data(nla);
+ sctp_key->sctp_src = output->tp.src;
+ sctp_key->sctp_dst = output->tp.dst;
+ } else if (swkey->eth.type == htons(ETH_P_IP) &&
+ swkey->ip.proto == IPPROTO_ICMP) {
+ struct ovs_key_icmp *icmp_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmp_key = nla_data(nla);
+ icmp_key->icmp_type = ntohs(output->tp.src);
+ icmp_key->icmp_code = ntohs(output->tp.dst);
+ } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
+ swkey->ip.proto == IPPROTO_ICMPV6) {
+ struct ovs_key_icmpv6 *icmpv6_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
+ sizeof(*icmpv6_key));
+ if (!nla)
+ goto nla_put_failure;
+ icmpv6_key = nla_data(nla);
+ icmpv6_key->icmpv6_type = ntohs(output->tp.src);
+ icmpv6_key->icmpv6_code = ntohs(output->tp.dst);
+
+ if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+ icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
+ struct ovs_key_nd *nd_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
+ if (!nla)
+ goto nla_put_failure;
+ nd_key = nla_data(nla);
+ memcpy(nd_key->nd_target, &output->ipv6.nd.target,
+ sizeof(nd_key->nd_target));
+ ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll);
+ ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll);
+ }
+ }
+ }
+
+unencap:
+ if (encap)
+ nla_nest_end(skb, encap);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+int ovs_nla_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, int attr, bool is_mask,
+ struct sk_buff *skb)
+{
+ int err;
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, attr);
+ if (!nla)
+ return -EMSGSIZE;
+ err = __ovs_nla_put_key(swkey, output, is_mask, skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, nla);
+
+ return 0;
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb)
+{
+ if (ovs_identifier_is_ufid(&flow->id))
+ return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len,
+ flow->id.ufid);
+
+ return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key,
+ OVS_FLOW_ATTR_KEY, false, skb);
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb)
+{
+ return ovs_nla_put_key(&flow->key, &flow->key,
+ OVS_FLOW_ATTR_KEY, false, skb);
+}
+
+/* Called with ovs_mutex or RCU read lock. */
+int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb)
+{
+ return ovs_nla_put_key(&flow->key, &flow->mask->key,
+ OVS_FLOW_ATTR_MASK, true, skb);
+}
+
+#define MAX_ACTIONS_BUFSIZE (32 * 1024)
+
+static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
+{
+ struct sw_flow_actions *sfa;
+
+ if (size > MAX_ACTIONS_BUFSIZE) {
+ OVS_NLERR(log, "Flow action size %u bytes exceeds max", size);
+ return ERR_PTR(-EINVAL);
+ }
+
+ sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->actions_len = 0;
+ return sfa;
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ kfree_rcu(sf_acts, rcu);
+}
+
+static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
+ int attr_len, bool log)
+{
+
+ struct sw_flow_actions *acts;
+ int new_acts_size;
+ int req_size = NLA_ALIGN(attr_len);
+ int next_offset = offsetof(struct sw_flow_actions, actions) +
+ (*sfa)->actions_len;
+
+ if (req_size <= (ksize(*sfa) - next_offset))
+ goto out;
+
+ new_acts_size = ksize(*sfa) * 2;
+
+ if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
+ if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
+ return ERR_PTR(-EMSGSIZE);
+ new_acts_size = MAX_ACTIONS_BUFSIZE;
+ }
+
+ acts = nla_alloc_flow_actions(new_acts_size, log);
+ if (IS_ERR(acts))
+ return (void *)acts;
+
+ memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
+ acts->actions_len = (*sfa)->actions_len;
+ kfree(*sfa);
+ *sfa = acts;
+
+out:
+ (*sfa)->actions_len += req_size;
+ return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
+}
+
+static struct nlattr *__add_action(struct sw_flow_actions **sfa,
+ int attrtype, void *data, int len, bool log)
+{
+ struct nlattr *a;
+
+ a = reserve_sfa_size(sfa, nla_attr_size(len), log);
+ if (IS_ERR(a))
+ return a;
+
+ a->nla_type = attrtype;
+ a->nla_len = nla_attr_size(len);
+
+ if (data)
+ memcpy(nla_data(a), data, len);
+ memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
+
+ return a;
+}
+
+static int add_action(struct sw_flow_actions **sfa, int attrtype,
+ void *data, int len, bool log)
+{
+ struct nlattr *a;
+
+ a = __add_action(sfa, attrtype, data, len, log);
+
+ return PTR_ERR_OR_ZERO(a);
+}
+
+static inline int add_nested_action_start(struct sw_flow_actions **sfa,
+ int attrtype, bool log)
+{
+ int used = (*sfa)->actions_len;
+ int err;
+
+ err = add_action(sfa, attrtype, NULL, 0, log);
+ if (err)
+ return err;
+
+ return used;
+}
+
+static inline void add_nested_action_end(struct sw_flow_actions *sfa,
+ int st_offset)
+{
+ struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions +
+ st_offset);
+
+ a->nla_len = sfa->actions_len - st_offset;
+}
+
+static int __ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth, struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci, bool log);
+
+static int validate_and_copy_sample(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci, bool log)
+{
+ const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+ const struct nlattr *probability, *actions;
+ const struct nlattr *a;
+ int rem, start, err, st_acts;
+
+ memset(attrs, 0, sizeof(attrs));
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+ return -EINVAL;
+ attrs[type] = a;
+ }
+ if (rem)
+ return -EINVAL;
+
+ probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+ if (!probability || nla_len(probability) != sizeof(u32))
+ return -EINVAL;
+
+ actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+ if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+ return -EINVAL;
+
+ /* validation done, copy sample action. */
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log);
+ if (start < 0)
+ return start;
+ err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
+ nla_data(probability), sizeof(u32), log);
+ if (err)
+ return err;
+ st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log);
+ if (st_acts < 0)
+ return st_acts;
+
+ err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa,
+ eth_type, vlan_tci, log);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, st_acts);
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct sw_flow_mask *mask)
+{
+ memset(match, 0, sizeof(*match));
+ match->key = key;
+ match->mask = mask;
+
+ memset(key, 0, sizeof(*key));
+
+ if (mask) {
+ memset(&mask->key, 0, sizeof(mask->key));
+ mask->range.start = mask->range.end = 0;
+ }
+}
+
+static int validate_geneve_opts(struct sw_flow_key *key)
+{
+ struct geneve_opt *option;
+ int opts_len = key->tun_opts_len;
+ bool crit_opt = false;
+
+ option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len);
+ while (opts_len > 0) {
+ int len;
+
+ if (opts_len < sizeof(*option))
+ return -EINVAL;
+
+ len = sizeof(*option) + option->length * 4;
+ if (len > opts_len)
+ return -EINVAL;
+
+ crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
+
+ option = (struct geneve_opt *)((u8 *)option + len);
+ opts_len -= len;
+ };
+
+ key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+
+ return 0;
+}
+
+static int validate_and_copy_set_tun(const struct nlattr *attr,
+ struct sw_flow_actions **sfa, bool log)
+{
+ struct sw_flow_match match;
+ struct sw_flow_key key;
+ struct ovs_tunnel_info *tun_info;
+ struct nlattr *a;
+ int err = 0, start, opts_type;
+
+ ovs_match_init(&match, &key, NULL);
+ opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
+ if (opts_type < 0)
+ return opts_type;
+
+ if (key.tun_opts_len) {
+ switch (opts_type) {
+ case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+ err = validate_geneve_opts(&key);
+ if (err < 0)
+ return err;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ break;
+ }
+ };
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
+ if (start < 0)
+ return start;
+
+ a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
+ sizeof(*tun_info) + key.tun_opts_len, log);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ tun_info = nla_data(a);
+ tun_info->tunnel = key.tun_key;
+ tun_info->options_len = key.tun_opts_len;
+
+ if (tun_info->options_len) {
+ /* We need to store the options in the action itself since
+ * everything else will go away after flow setup. We can append
+ * it to tun_info and then point there.
+ */
+ memcpy((tun_info + 1),
+ TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len);
+ tun_info->options = (tun_info + 1);
+ } else {
+ tun_info->options = NULL;
+ }
+
+ add_nested_action_end(*sfa, start);
+
+ return err;
+}
+
+/* Return false if there are any non-masked bits set.
+ * Mask follows data immediately, before any netlink padding.
+ */
+static bool validate_masked(u8 *data, int len)
+{
+ u8 *mask = data + len;
+
+ while (len--)
+ if (*data++ & ~*mask++)
+ return false;
+
+ return true;
+}
+
+static int validate_set(const struct nlattr *a,
+ const struct sw_flow_key *flow_key,
+ struct sw_flow_actions **sfa,
+ bool *skip_copy, __be16 eth_type, bool masked, bool log)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+ size_t key_len;
+
+ /* There can be only one key in a action */
+ if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ return -EINVAL;
+
+ key_len = nla_len(ovs_key);
+ if (masked)
+ key_len /= 2;
+
+ if (key_type > OVS_KEY_ATTR_MAX ||
+ (ovs_key_lens[key_type].len != key_len &&
+ ovs_key_lens[key_type].len != OVS_ATTR_NESTED))
+ return -EINVAL;
+
+ if (masked && !validate_masked(nla_data(ovs_key), key_len))
+ return -EINVAL;
+
+ switch (key_type) {
+ const struct ovs_key_ipv4 *ipv4_key;
+ const struct ovs_key_ipv6 *ipv6_key;
+ int err;
+
+ case OVS_KEY_ATTR_PRIORITY:
+ case OVS_KEY_ATTR_SKB_MARK:
+ case OVS_KEY_ATTR_ETHERNET:
+ break;
+
+ case OVS_KEY_ATTR_TUNNEL:
+ if (eth_p_mpls(eth_type))
+ return -EINVAL;
+
+ if (masked)
+ return -EINVAL; /* Masked tunnel set not supported. */
+
+ *skip_copy = true;
+ err = validate_and_copy_set_tun(a, sfa, log);
+ if (err)
+ return err;
+ break;
+
+ case OVS_KEY_ATTR_IPV4:
+ if (eth_type != htons(ETH_P_IP))
+ return -EINVAL;
+
+ ipv4_key = nla_data(ovs_key);
+
+ if (masked) {
+ const struct ovs_key_ipv4 *mask = ipv4_key + 1;
+
+ /* Non-writeable fields. */
+ if (mask->ipv4_proto || mask->ipv4_frag)
+ return -EINVAL;
+ } else {
+ if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+ return -EINVAL;
+ }
+ break;
+
+ case OVS_KEY_ATTR_IPV6:
+ if (eth_type != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ ipv6_key = nla_data(ovs_key);
+
+ if (masked) {
+ const struct ovs_key_ipv6 *mask = ipv6_key + 1;
+
+ /* Non-writeable fields. */
+ if (mask->ipv6_proto || mask->ipv6_frag)
+ return -EINVAL;
+
+ /* Invalid bits in the flow label mask? */
+ if (ntohl(mask->ipv6_label) & 0xFFF00000)
+ return -EINVAL;
+ } else {
+ if (ipv6_key->ipv6_proto != flow_key->ip.proto)
+ return -EINVAL;
+
+ if (ipv6_key->ipv6_frag != flow_key->ip.frag)
+ return -EINVAL;
+ }
+ if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_TCP:
+ if ((eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6)) ||
+ flow_key->ip.proto != IPPROTO_TCP)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_UDP:
+ if ((eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6)) ||
+ flow_key->ip.proto != IPPROTO_UDP)
+ return -EINVAL;
+
+ break;
+
+ case OVS_KEY_ATTR_MPLS:
+ if (!eth_p_mpls(eth_type))
+ return -EINVAL;
+ break;
+
+ case OVS_KEY_ATTR_SCTP:
+ if ((eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6)) ||
+ flow_key->ip.proto != IPPROTO_SCTP)
+ return -EINVAL;
+
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ /* Convert non-masked non-tunnel set actions to masked set actions. */
+ if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) {
+ int start, len = key_len * 2;
+ struct nlattr *at;
+
+ *skip_copy = true;
+
+ start = add_nested_action_start(sfa,
+ OVS_ACTION_ATTR_SET_TO_MASKED,
+ log);
+ if (start < 0)
+ return start;
+
+ at = __add_action(sfa, key_type, NULL, len, log);
+ if (IS_ERR(at))
+ return PTR_ERR(at);
+
+ memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */
+ memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */
+ /* Clear non-writeable bits from otherwise writeable fields. */
+ if (key_type == OVS_KEY_ATTR_IPV6) {
+ struct ovs_key_ipv6 *mask = nla_data(at) + key_len;
+
+ mask->ipv6_label &= htonl(0x000FFFFF);
+ }
+ add_nested_action_end(*sfa, start);
+ }
+
+ return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+ static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+ [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+ [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
+ [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 },
+ };
+ struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+ int error;
+
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+ attr, userspace_policy);
+ if (error)
+ return error;
+
+ if (!a[OVS_USERSPACE_ATTR_PID] ||
+ !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int copy_action(const struct nlattr *from,
+ struct sw_flow_actions **sfa, bool log)
+{
+ int totlen = NLA_ALIGN(from->nla_len);
+ struct nlattr *to;
+
+ to = reserve_sfa_size(sfa, from->nla_len, log);
+ if (IS_ERR(to))
+ return PTR_ERR(to);
+
+ memcpy(to, from, totlen);
+ return 0;
+}
+
+static int __ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth, struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci, bool log)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ if (depth >= SAMPLE_ACTION_DEPTH)
+ return -EOVERFLOW;
+
+ nla_for_each_nested(a, attr, rem) {
+ /* Expected argument lengths, (u32)-1 for variable length. */
+ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+ [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+ [OVS_ACTION_ATTR_RECIRC] = sizeof(u32),
+ [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
+ [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
+ [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+ [OVS_ACTION_ATTR_POP_VLAN] = 0,
+ [OVS_ACTION_ATTR_SET] = (u32)-1,
+ [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1,
+ [OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
+ [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash)
+ };
+ const struct ovs_action_push_vlan *vlan;
+ int type = nla_type(a);
+ bool skip_copy;
+
+ if (type > OVS_ACTION_ATTR_MAX ||
+ (action_lens[type] != nla_len(a) &&
+ action_lens[type] != (u32)-1))
+ return -EINVAL;
+
+ skip_copy = false;
+ switch (type) {
+ case OVS_ACTION_ATTR_UNSPEC:
+ return -EINVAL;
+
+ case OVS_ACTION_ATTR_USERSPACE:
+ err = validate_userspace(a);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_OUTPUT:
+ if (nla_get_u32(a) >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+ case OVS_ACTION_ATTR_HASH: {
+ const struct ovs_action_hash *act_hash = nla_data(a);
+
+ switch (act_hash->hash_alg) {
+ case OVS_HASH_ALG_L4:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ break;
+ }
+
+ case OVS_ACTION_ATTR_POP_VLAN:
+ vlan_tci = htons(0);
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ vlan = nla_data(a);
+ if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+ return -EINVAL;
+ if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+ return -EINVAL;
+ vlan_tci = vlan->vlan_tci;
+ break;
+
+ case OVS_ACTION_ATTR_RECIRC:
+ break;
+
+ case OVS_ACTION_ATTR_PUSH_MPLS: {
+ const struct ovs_action_push_mpls *mpls = nla_data(a);
+
+ if (!eth_p_mpls(mpls->mpls_ethertype))
+ return -EINVAL;
+ /* Prohibit push MPLS other than to a white list
+ * for packets that have a known tag order.
+ */
+ if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ (eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6) &&
+ eth_type != htons(ETH_P_ARP) &&
+ eth_type != htons(ETH_P_RARP) &&
+ !eth_p_mpls(eth_type)))
+ return -EINVAL;
+ eth_type = mpls->mpls_ethertype;
+ break;
+ }
+
+ case OVS_ACTION_ATTR_POP_MPLS:
+ if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ !eth_p_mpls(eth_type))
+ return -EINVAL;
+
+ /* Disallow subsequent L2.5+ set and mpls_pop actions
+ * as there is no check here to ensure that the new
+ * eth_type is valid and thus set actions could
+ * write off the end of the packet or otherwise
+ * corrupt it.
+ *
+ * Support for these actions is planned using packet
+ * recirculation.
+ */
+ eth_type = htons(0);
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ err = validate_set(a, key, sfa,
+ &skip_copy, eth_type, false, log);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SET_MASKED:
+ err = validate_set(a, key, sfa,
+ &skip_copy, eth_type, true, log);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = validate_and_copy_sample(a, key, depth, sfa,
+ eth_type, vlan_tci, log);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
+ default:
+ OVS_NLERR(log, "Unknown Action type %d", type);
+ return -EINVAL;
+ }
+ if (!skip_copy) {
+ err = copy_action(a, sfa, log);
+ if (err)
+ return err;
+ }
+ }
+
+ if (rem > 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* 'key' must be the masked key. */
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa, bool log)
+{
+ int err;
+
+ *sfa = nla_alloc_flow_actions(nla_len(attr), log);
+ if (IS_ERR(*sfa))
+ return PTR_ERR(*sfa);
+
+ err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
+ key->eth.tci, log);
+ if (err)
+ kfree(*sfa);
+
+ return err;
+}
+
+static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ struct nlattr *start;
+ int err = 0, rem;
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
+ if (!start)
+ return -EMSGSIZE;
+
+ nla_for_each_nested(a, attr, rem) {
+ int type = nla_type(a);
+ struct nlattr *st_sample;
+
+ switch (type) {
+ case OVS_SAMPLE_ATTR_PROBABILITY:
+ if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
+ sizeof(u32), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ case OVS_SAMPLE_ATTR_ACTIONS:
+ st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
+ if (!st_sample)
+ return -EMSGSIZE;
+ err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
+ if (err)
+ return err;
+ nla_nest_end(skb, st_sample);
+ break;
+ }
+ }
+
+ nla_nest_end(skb, start);
+ return err;
+}
+
+static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ int key_type = nla_type(ovs_key);
+ struct nlattr *start;
+ int err;
+
+ switch (key_type) {
+ case OVS_KEY_ATTR_TUNNEL_INFO: {
+ struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+ tun_info->options_len ?
+ tun_info->options : NULL,
+ tun_info->options_len);
+ if (err)
+ return err;
+ nla_nest_end(skb, start);
+ break;
+ }
+ default:
+ if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
+ return -EMSGSIZE;
+ break;
+ }
+
+ return 0;
+}
+
+static int masked_set_action_to_set_action_attr(const struct nlattr *a,
+ struct sk_buff *skb)
+{
+ const struct nlattr *ovs_key = nla_data(a);
+ struct nlattr *nla;
+ size_t key_len = nla_len(ovs_key) / 2;
+
+ /* Revert the conversion we did from a non-masked set action to
+ * masked set action.
+ */
+ nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key)))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
+{
+ const struct nlattr *a;
+ int rem, err;
+
+ nla_for_each_attr(a, attr, len, rem) {
+ int type = nla_type(a);
+
+ switch (type) {
+ case OVS_ACTION_ATTR_SET:
+ err = set_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SET_TO_MASKED:
+ err = masked_set_action_to_set_action_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ err = sample_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+ default:
+ if (nla_put(skb, type, nla_len(a), nla_data(a)))
+ return -EMSGSIZE;
+ break;
+ }
+ }
+
+ return 0;
+}
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
new file mode 100644
index 000000000..5c3d75bff
--- /dev/null
+++ b/net/openvswitch/flow_netlink.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+
+#ifndef FLOW_NETLINK_H
+#define FLOW_NETLINK_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+size_t ovs_tun_key_attr_size(void);
+size_t ovs_key_attr_size(void);
+
+void ovs_match_init(struct sw_flow_match *match,
+ struct sw_flow_key *key, struct sw_flow_mask *mask);
+
+int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
+ int attr, bool is_mask, struct sk_buff *);
+int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *,
+ bool log);
+
+int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
+int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
+int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
+
+int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
+ const struct nlattr *mask, bool log);
+int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
+ const struct ovs_tunnel_info *);
+
+bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
+int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
+ const struct sw_flow_key *key, bool log);
+u32 ovs_nla_get_ufid_flags(const struct nlattr *attr);
+
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa, bool log);
+int ovs_nla_put_actions(const struct nlattr *attr,
+ int len, struct sk_buff *skb);
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+
+#endif /* flow_netlink.h */
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
new file mode 100644
index 000000000..4613df8c8
--- /dev/null
+++ b/net/openvswitch/flow_table.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include "flow.h"
+#include "datapath.h"
+#include <linux/uaccess.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/sctp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+
+#define TBL_MIN_BUCKETS 1024
+#define REHASH_INTERVAL (10 * 60 * HZ)
+
+static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
+
+static u16 range_n_bytes(const struct sw_flow_key_range *range)
+{
+ return range->end - range->start;
+}
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask)
+{
+ const long *m = (const long *)((const u8 *)&mask->key +
+ mask->range.start);
+ const long *s = (const long *)((const u8 *)src +
+ mask->range.start);
+ long *d = (long *)((u8 *)dst + mask->range.start);
+ int i;
+
+ /* The memory outside of the 'mask->range' are not set since
+ * further operations on 'dst' only uses contents within
+ * 'mask->range'.
+ */
+ for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+ *d++ = *s++ & *m++;
+}
+
+struct sw_flow *ovs_flow_alloc(void)
+{
+ struct sw_flow *flow;
+ struct flow_stats *stats;
+ int node;
+
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ flow->sf_acts = NULL;
+ flow->mask = NULL;
+ flow->id.unmasked_key = NULL;
+ flow->id.ufid_len = 0;
+ flow->stats_last_writer = NUMA_NO_NODE;
+
+ /* Initialize the default stat node. */
+ stats = kmem_cache_alloc_node(flow_stats_cache,
+ GFP_KERNEL | __GFP_ZERO, 0);
+ if (!stats)
+ goto err;
+
+ spin_lock_init(&stats->lock);
+
+ RCU_INIT_POINTER(flow->stats[0], stats);
+
+ for_each_node(node)
+ if (node != 0)
+ RCU_INIT_POINTER(flow->stats[node], NULL);
+
+ return flow;
+err:
+ kmem_cache_free(flow_cache, flow);
+ return ERR_PTR(-ENOMEM);
+}
+
+int ovs_flow_tbl_count(const struct flow_table *table)
+{
+ return table->count;
+}
+
+static struct flex_array *alloc_buckets(unsigned int n_buckets)
+{
+ struct flex_array *buckets;
+ int i, err;
+
+ buckets = flex_array_alloc(sizeof(struct hlist_head),
+ n_buckets, GFP_KERNEL);
+ if (!buckets)
+ return NULL;
+
+ err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
+ if (err) {
+ flex_array_free(buckets);
+ return NULL;
+ }
+
+ for (i = 0; i < n_buckets; i++)
+ INIT_HLIST_HEAD((struct hlist_head *)
+ flex_array_get(buckets, i));
+
+ return buckets;
+}
+
+static void flow_free(struct sw_flow *flow)
+{
+ int node;
+
+ if (ovs_identifier_is_key(&flow->id))
+ kfree(flow->id.unmasked_key);
+ kfree((struct sw_flow_actions __force *)flow->sf_acts);
+ for_each_node(node)
+ if (flow->stats[node])
+ kmem_cache_free(flow_stats_cache,
+ (struct flow_stats __force *)flow->stats[node]);
+ kmem_cache_free(flow_cache, flow);
+}
+
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+
+ flow_free(flow);
+}
+
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
+{
+ if (!flow)
+ return;
+
+ if (deferred)
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+ else
+ flow_free(flow);
+}
+
+static void free_buckets(struct flex_array *buckets)
+{
+ flex_array_free(buckets);
+}
+
+
+static void __table_instance_destroy(struct table_instance *ti)
+{
+ free_buckets(ti->buckets);
+ kfree(ti);
+}
+
+static struct table_instance *table_instance_alloc(int new_size)
+{
+ struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+ if (!ti)
+ return NULL;
+
+ ti->buckets = alloc_buckets(new_size);
+
+ if (!ti->buckets) {
+ kfree(ti);
+ return NULL;
+ }
+ ti->n_buckets = new_size;
+ ti->node_ver = 0;
+ ti->keep_flows = false;
+ get_random_bytes(&ti->hash_seed, sizeof(u32));
+
+ return ti;
+}
+
+int ovs_flow_tbl_init(struct flow_table *table)
+{
+ struct table_instance *ti, *ufid_ti;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
+
+ if (!ti)
+ return -ENOMEM;
+
+ ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!ufid_ti)
+ goto free_ti;
+
+ rcu_assign_pointer(table->ti, ti);
+ rcu_assign_pointer(table->ufid_ti, ufid_ti);
+ INIT_LIST_HEAD(&table->mask_list);
+ table->last_rehash = jiffies;
+ table->count = 0;
+ table->ufid_count = 0;
+ return 0;
+
+free_ti:
+ __table_instance_destroy(ti);
+ return -ENOMEM;
+}
+
+static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
+{
+ struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
+
+ __table_instance_destroy(ti);
+}
+
+static void table_instance_destroy(struct table_instance *ti,
+ struct table_instance *ufid_ti,
+ bool deferred)
+{
+ int i;
+
+ if (!ti)
+ return;
+
+ BUG_ON(!ufid_ti);
+ if (ti->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < ti->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_node *n;
+ int ver = ti->node_ver;
+ int ufid_ver = ufid_ti->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
+ hlist_del_rcu(&flow->flow_table.node[ver]);
+ if (ovs_identifier_is_ufid(&flow->id))
+ hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
+ ovs_flow_free(flow, deferred);
+ }
+ }
+
+skip_flows:
+ if (deferred) {
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
+ } else {
+ __table_instance_destroy(ti);
+ __table_instance_destroy(ufid_ti);
+ }
+}
+
+/* No need for locking this function is called from RCU callback or
+ * error path.
+ */
+void ovs_flow_tbl_destroy(struct flow_table *table)
+{
+ struct table_instance *ti = rcu_dereference_raw(table->ti);
+ struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
+
+ table_instance_destroy(ti, ufid_ti, false);
+}
+
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
+ u32 *bucket, u32 *last)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ int ver;
+ int i;
+
+ ver = ti->node_ver;
+ while (*bucket < ti->n_buckets) {
+ i = 0;
+ head = flex_array_get(ti->buckets, *bucket);
+ hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
+ if (i < *last) {
+ i++;
+ continue;
+ }
+ *last = i + 1;
+ return flow;
+ }
+ (*bucket)++;
+ *last = 0;
+ }
+
+ return NULL;
+}
+
+static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
+{
+ hash = jhash_1word(hash, ti->hash_seed);
+ return flex_array_get(ti->buckets,
+ (hash & (ti->n_buckets - 1)));
+}
+
+static void table_instance_insert(struct table_instance *ti,
+ struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(ti, flow->flow_table.hash);
+ hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head);
+}
+
+static void ufid_table_instance_insert(struct table_instance *ti,
+ struct sw_flow *flow)
+{
+ struct hlist_head *head;
+
+ head = find_bucket(ti, flow->ufid_table.hash);
+ hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
+}
+
+static void flow_table_copy_flows(struct table_instance *old,
+ struct table_instance *new, bool ufid)
+{
+ int old_ver;
+ int i;
+
+ old_ver = old->node_ver;
+ new->node_ver = !old_ver;
+
+ /* Insert in new table. */
+ for (i = 0; i < old->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head;
+
+ head = flex_array_get(old->buckets, i);
+
+ if (ufid)
+ hlist_for_each_entry(flow, head,
+ ufid_table.node[old_ver])
+ ufid_table_instance_insert(new, flow);
+ else
+ hlist_for_each_entry(flow, head,
+ flow_table.node[old_ver])
+ table_instance_insert(new, flow);
+ }
+
+ old->keep_flows = true;
+}
+
+static struct table_instance *table_instance_rehash(struct table_instance *ti,
+ int n_buckets, bool ufid)
+{
+ struct table_instance *new_ti;
+
+ new_ti = table_instance_alloc(n_buckets);
+ if (!new_ti)
+ return NULL;
+
+ flow_table_copy_flows(ti, new_ti, ufid);
+
+ return new_ti;
+}
+
+int ovs_flow_tbl_flush(struct flow_table *flow_table)
+{
+ struct table_instance *old_ti, *new_ti;
+ struct table_instance *old_ufid_ti, *new_ufid_ti;
+
+ new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!new_ti)
+ return -ENOMEM;
+ new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ if (!new_ufid_ti)
+ goto err_free_ti;
+
+ old_ti = ovsl_dereference(flow_table->ti);
+ old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
+
+ rcu_assign_pointer(flow_table->ti, new_ti);
+ rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
+ flow_table->last_rehash = jiffies;
+ flow_table->count = 0;
+ flow_table->ufid_count = 0;
+
+ table_instance_destroy(old_ti, old_ufid_ti, true);
+ return 0;
+
+err_free_ti:
+ __table_instance_destroy(new_ti);
+ return -ENOMEM;
+}
+
+static u32 flow_hash(const struct sw_flow_key *key,
+ const struct sw_flow_key_range *range)
+{
+ int key_start = range->start;
+ int key_end = range->end;
+ const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
+ int hash_u32s = (key_end - key_start) >> 2;
+
+ /* Make sure number of hash bytes are multiple of u32. */
+ BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+
+ return jhash2(hash_key, hash_u32s, 0);
+}
+
+static int flow_key_start(const struct sw_flow_key *key)
+{
+ if (key->tun_key.ipv4_dst)
+ return 0;
+ else
+ return rounddown(offsetof(struct sw_flow_key, phy),
+ sizeof(long));
+}
+
+static bool cmp_key(const struct sw_flow_key *key1,
+ const struct sw_flow_key *key2,
+ int key_start, int key_end)
+{
+ const long *cp1 = (const long *)((const u8 *)key1 + key_start);
+ const long *cp2 = (const long *)((const u8 *)key2 + key_start);
+ long diffs = 0;
+ int i;
+
+ for (i = key_start; i < key_end; i += sizeof(long))
+ diffs |= *cp1++ ^ *cp2++;
+
+ return diffs == 0;
+}
+
+static bool flow_cmp_masked_key(const struct sw_flow *flow,
+ const struct sw_flow_key *key,
+ const struct sw_flow_key_range *range)
+{
+ return cmp_key(&flow->key, key, range->start, range->end);
+}
+
+static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+ const struct sw_flow_match *match)
+{
+ struct sw_flow_key *key = match->key;
+ int key_start = flow_key_start(key);
+ int key_end = match->range.end;
+
+ BUG_ON(ovs_identifier_is_ufid(&flow->id));
+ return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
+}
+
+static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
+ const struct sw_flow_key *unmasked,
+ const struct sw_flow_mask *mask)
+{
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ u32 hash;
+ struct sw_flow_key masked_key;
+
+ ovs_flow_mask_key(&masked_key, unmasked, mask);
+ hash = flow_hash(&masked_key, &mask->range);
+ head = find_bucket(ti, hash);
+ hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
+ if (flow->mask == mask && flow->flow_table.hash == hash &&
+ flow_cmp_masked_key(flow, &masked_key, &mask->range))
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ *n_mask_hit = 0;
+ list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
+ (*n_mask_hit)++;
+ flow = masked_flow_lookup(ti, key, mask);
+ if (flow) /* Found */
+ return flow;
+ }
+ return NULL;
+}
+
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
+ const struct sw_flow_key *key)
+{
+ u32 __always_unused n_mask_hit;
+
+ return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+}
+
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+ const struct sw_flow_match *match)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ /* Always called under ovs-mutex. */
+ list_for_each_entry(mask, &tbl->mask_list, list) {
+ flow = masked_flow_lookup(ti, match->key, mask);
+ if (flow && ovs_identifier_is_key(&flow->id) &&
+ ovs_flow_cmp_unmasked_key(flow, match))
+ return flow;
+ }
+ return NULL;
+}
+
+static u32 ufid_hash(const struct sw_flow_id *sfid)
+{
+ return jhash(sfid->ufid, sfid->ufid_len, 0);
+}
+
+static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
+ const struct sw_flow_id *sfid)
+{
+ if (flow->id.ufid_len != sfid->ufid_len)
+ return false;
+
+ return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
+}
+
+bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
+{
+ if (ovs_identifier_is_ufid(&flow->id))
+ return flow_cmp_masked_key(flow, match->key, &match->range);
+
+ return ovs_flow_cmp_unmasked_key(flow, match);
+}
+
+struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
+ const struct sw_flow_id *ufid)
+{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
+ struct sw_flow *flow;
+ struct hlist_head *head;
+ u32 hash;
+
+ hash = ufid_hash(ufid);
+ head = find_bucket(ti, hash);
+ hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
+ if (flow->ufid_table.hash == hash &&
+ ovs_flow_cmp_ufid(flow, ufid))
+ return flow;
+ }
+ return NULL;
+}
+
+int ovs_flow_tbl_num_masks(const struct flow_table *table)
+{
+ struct sw_flow_mask *mask;
+ int num = 0;
+
+ list_for_each_entry(mask, &table->mask_list, list)
+ num++;
+
+ return num;
+}
+
+static struct table_instance *table_instance_expand(struct table_instance *ti,
+ bool ufid)
+{
+ return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
+}
+
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+ if (mask) {
+ /* ovs-lock is required to protect mask-refcount and
+ * mask list.
+ */
+ ASSERT_OVSL();
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ kfree_rcu(mask, rcu);
+ }
+ }
+}
+
+/* Must be called with OVS mutex held. */
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *ti = ovsl_dereference(table->ti);
+ struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
+
+ BUG_ON(table->count == 0);
+ hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
+ table->count--;
+ if (ovs_identifier_is_ufid(&flow->id)) {
+ hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
+ table->ufid_count--;
+ }
+
+ /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
+ * accessible as long as the RCU read lock is held.
+ */
+ flow_mask_remove(table, flow->mask);
+}
+
+static struct sw_flow_mask *mask_alloc(void)
+{
+ struct sw_flow_mask *mask;
+
+ mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+ if (mask)
+ mask->ref_count = 1;
+
+ return mask;
+}
+
+static bool mask_equal(const struct sw_flow_mask *a,
+ const struct sw_flow_mask *b)
+{
+ const u8 *a_ = (const u8 *)&a->key + a->range.start;
+ const u8 *b_ = (const u8 *)&b->key + b->range.start;
+
+ return (a->range.end == b->range.end)
+ && (a->range.start == b->range.start)
+ && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
+}
+
+static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
+ const struct sw_flow_mask *mask)
+{
+ struct list_head *ml;
+
+ list_for_each(ml, &tbl->mask_list) {
+ struct sw_flow_mask *m;
+ m = container_of(ml, struct sw_flow_mask, list);
+ if (mask_equal(mask, m))
+ return m;
+ }
+
+ return NULL;
+}
+
+/* Add 'mask' into the mask list, if it is not already there. */
+static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
+ const struct sw_flow_mask *new)
+{
+ struct sw_flow_mask *mask;
+ mask = flow_mask_find(tbl, new);
+ if (!mask) {
+ /* Allocate a new mask if none exsits. */
+ mask = mask_alloc();
+ if (!mask)
+ return -ENOMEM;
+ mask->key = new->key;
+ mask->range = new->range;
+ list_add_rcu(&mask->list, &tbl->mask_list);
+ } else {
+ BUG_ON(!mask->ref_count);
+ mask->ref_count++;
+ }
+
+ flow->mask = mask;
+ return 0;
+}
+
+/* Must be called with OVS mutex held. */
+static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *new_ti = NULL;
+ struct table_instance *ti;
+
+ flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
+ ti = ovsl_dereference(table->ti);
+ table_instance_insert(ti, flow);
+ table->count++;
+
+ /* Expand table, if necessary, to make room. */
+ if (table->count > ti->n_buckets)
+ new_ti = table_instance_expand(ti, false);
+ else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
+ new_ti = table_instance_rehash(ti, ti->n_buckets, false);
+
+ if (new_ti) {
+ rcu_assign_pointer(table->ti, new_ti);
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ table->last_rehash = jiffies;
+ }
+}
+
+/* Must be called with OVS mutex held. */
+static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
+{
+ struct table_instance *ti;
+
+ flow->ufid_table.hash = ufid_hash(&flow->id);
+ ti = ovsl_dereference(table->ufid_ti);
+ ufid_table_instance_insert(ti, flow);
+ table->ufid_count++;
+
+ /* Expand table, if necessary, to make room. */
+ if (table->ufid_count > ti->n_buckets) {
+ struct table_instance *new_ti;
+
+ new_ti = table_instance_expand(ti, true);
+ if (new_ti) {
+ rcu_assign_pointer(table->ufid_ti, new_ti);
+ call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
+ }
+ }
+}
+
+/* Must be called with OVS mutex held. */
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ const struct sw_flow_mask *mask)
+{
+ int err;
+
+ err = flow_mask_insert(table, flow, mask);
+ if (err)
+ return err;
+ flow_key_insert(table, flow);
+ if (ovs_identifier_is_ufid(&flow->id))
+ flow_ufid_insert(table, flow);
+
+ return 0;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int ovs_flow_init(void)
+{
+ BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
+ BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
+
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+ + (num_possible_nodes()
+ * sizeof(struct flow_stats *)),
+ 0, 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ flow_stats_cache
+ = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (flow_stats_cache == NULL) {
+ kmem_cache_destroy(flow_cache);
+ flow_cache = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void ovs_flow_exit(void)
+{
+ kmem_cache_destroy(flow_stats_cache);
+ kmem_cache_destroy(flow_cache);
+}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
new file mode 100644
index 000000000..616eda10d
--- /dev/null
+++ b/net/openvswitch/flow_table.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2007-2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef FLOW_TABLE_H
+#define FLOW_TABLE_H 1
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/in6.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/flex_array.h>
+
+#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+
+#include "flow.h"
+
+struct table_instance {
+ struct flex_array *buckets;
+ unsigned int n_buckets;
+ struct rcu_head rcu;
+ int node_ver;
+ u32 hash_seed;
+ bool keep_flows;
+};
+
+struct flow_table {
+ struct table_instance __rcu *ti;
+ struct table_instance __rcu *ufid_ti;
+ struct list_head mask_list;
+ unsigned long last_rehash;
+ unsigned int count;
+ unsigned int ufid_count;
+};
+
+extern struct kmem_cache *flow_stats_cache;
+
+int ovs_flow_init(void);
+void ovs_flow_exit(void);
+
+struct sw_flow *ovs_flow_alloc(void);
+void ovs_flow_free(struct sw_flow *, bool deferred);
+
+int ovs_flow_tbl_init(struct flow_table *);
+int ovs_flow_tbl_count(const struct flow_table *table);
+void ovs_flow_tbl_destroy(struct flow_table *table);
+int ovs_flow_tbl_flush(struct flow_table *flow_table);
+
+int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
+ const struct sw_flow_mask *mask);
+void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
+int ovs_flow_tbl_num_masks(const struct flow_table *table);
+struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
+ u32 *bucket, u32 *idx);
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,
+ const struct sw_flow_key *,
+ u32 *n_mask_hit);
+struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
+ const struct sw_flow_key *);
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+ const struct sw_flow_match *match);
+struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *,
+ const struct sw_flow_id *);
+
+bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *);
+
+void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
+ const struct sw_flow_mask *mask);
+#endif /* flow_table.h */
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
new file mode 100644
index 000000000..bf02fd580
--- /dev/null
+++ b/net/openvswitch/vport-geneve.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+
+#include <net/geneve.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/xfrm.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+static struct vport_ops ovs_geneve_vport_ops;
+
+/**
+ * struct geneve_port - Keeps track of open UDP ports
+ * @gs: The socket created for this port number.
+ * @name: vport name.
+ */
+struct geneve_port {
+ struct geneve_sock *gs;
+ char name[IFNAMSIZ];
+};
+
+static LIST_HEAD(geneve_ports);
+
+static inline struct geneve_port *geneve_vport(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+ return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+ vni[0] = (__force __u8)(tun_id >> 16);
+ vni[1] = (__force __u8)(tun_id >> 8);
+ vni[2] = (__force __u8)tun_id;
+#else
+ vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+ vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+ vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
+/* Convert 24 bit VNI to 64 bit tunnel ID. */
+static __be64 vni_to_tunnel_id(const __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+ return (vni[0] << 16) | (vni[1] << 8) | vni[2];
+#else
+ return (__force __be64)(((__force u64)vni[0] << 40) |
+ ((__force u64)vni[1] << 48) |
+ ((__force u64)vni[2] << 56));
+#endif
+}
+
+static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
+{
+ struct vport *vport = gs->rcv_data;
+ struct genevehdr *geneveh = geneve_hdr(skb);
+ int opts_len;
+ struct ovs_tunnel_info tun_info;
+ __be64 key;
+ __be16 flags;
+
+ opts_len = geneveh->opt_len * 4;
+
+ flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
+ (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
+ (geneveh->oam ? TUNNEL_OAM : 0) |
+ (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
+
+ key = vni_to_tunnel_id(geneveh->vni);
+
+ ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
+ udp_hdr(skb)->source, udp_hdr(skb)->dest,
+ key, flags,
+ geneveh->options, opts_len);
+
+ ovs_vport_receive(vport, skb, &tun_info);
+}
+
+static int geneve_get_options(const struct vport *vport,
+ struct sk_buff *skb)
+{
+ struct geneve_port *geneve_port = geneve_vport(vport);
+ struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk);
+
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static void geneve_tnl_destroy(struct vport *vport)
+{
+ struct geneve_port *geneve_port = geneve_vport(vport);
+
+ geneve_sock_release(geneve_port->gs);
+
+ ovs_vport_deferred_free(vport);
+}
+
+static struct vport *geneve_tnl_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct nlattr *options = parms->options;
+ struct geneve_port *geneve_port;
+ struct geneve_sock *gs;
+ struct vport *vport;
+ struct nlattr *a;
+ int err;
+ u16 dst_port;
+
+ if (!options) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+ if (a && nla_len(a) == sizeof(u16)) {
+ dst_port = nla_get_u16(a);
+ } else {
+ /* Require destination port from userspace. */
+ err = -EINVAL;
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(sizeof(struct geneve_port),
+ &ovs_geneve_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ geneve_port = geneve_vport(vport);
+ strncpy(geneve_port->name, parms->name, IFNAMSIZ);
+
+ gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
+ if (IS_ERR(gs)) {
+ ovs_vport_free(vport);
+ return (void *)gs;
+ }
+ geneve_port->gs = gs;
+
+ return vport;
+error:
+ return ERR_PTR(err);
+}
+
+static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+ const struct ovs_key_ipv4_tunnel *tun_key;
+ struct ovs_tunnel_info *tun_info;
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct geneve_port *geneve_port = geneve_vport(vport);
+ __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
+ __be16 sport;
+ struct rtable *rt;
+ struct flowi4 fl;
+ u8 vni[3], opts_len, *opts;
+ __be16 df;
+ int err;
+
+ tun_info = OVS_CB(skb)->egress_tun_info;
+ if (unlikely(!tun_info)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ tun_key = &tun_info->tunnel;
+ rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ tunnel_id_to_vni(tun_key->tun_id, vni);
+ skb->ignore_df = 1;
+
+ if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) {
+ opts = (u8 *)tun_info->options;
+ opts_len = tun_info->options_len;
+ } else {
+ opts = NULL;
+ opts_len = 0;
+ }
+
+ err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
+ tun_key->ipv4_dst, tun_key->ipv4_tos,
+ tun_key->ipv4_ttl, df, sport, dport,
+ tun_key->tun_flags, vni, opts_len, opts,
+ !!(tun_key->tun_flags & TUNNEL_CSUM), false);
+ if (err < 0)
+ ip_rt_put(rt);
+ return err;
+
+error:
+ kfree_skb(skb);
+ return err;
+}
+
+static const char *geneve_get_name(const struct vport *vport)
+{
+ struct geneve_port *geneve_port = geneve_vport(vport);
+
+ return geneve_port->name;
+}
+
+static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ovs_tunnel_info *egress_tun_info)
+{
+ struct geneve_port *geneve_port = geneve_vport(vport);
+ struct net *net = ovs_dp_get_net(vport->dp);
+ __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport;
+ __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+
+ /* Get tp_src and tp_dst, refert to geneve_build_header().
+ */
+ return ovs_tunnel_get_egress_info(egress_tun_info,
+ ovs_dp_get_net(vport->dp),
+ OVS_CB(skb)->egress_tun_info,
+ IPPROTO_UDP, skb->mark, sport, dport);
+}
+
+static struct vport_ops ovs_geneve_vport_ops = {
+ .type = OVS_VPORT_TYPE_GENEVE,
+ .create = geneve_tnl_create,
+ .destroy = geneve_tnl_destroy,
+ .get_name = geneve_get_name,
+ .get_options = geneve_get_options,
+ .send = geneve_tnl_send,
+ .owner = THIS_MODULE,
+ .get_egress_tun_info = geneve_get_egress_tun_info,
+};
+
+static int __init ovs_geneve_tnl_init(void)
+{
+ return ovs_vport_ops_register(&ovs_geneve_vport_ops);
+}
+
+static void __exit ovs_geneve_tnl_exit(void)
+{
+ ovs_vport_ops_unregister(&ovs_geneve_vport_ops);
+}
+
+module_init(ovs_geneve_tnl_init);
+module_exit(ovs_geneve_tnl_exit);
+
+MODULE_DESCRIPTION("OVS: Geneve swiching port");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("vport-type-5");
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
new file mode 100644
index 000000000..f17ac9642
--- /dev/null
+++ b/net/openvswitch/vport-gre.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
+#include <net/gre.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/protocol.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+static struct vport_ops ovs_gre_vport_ops;
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 be64_get_low32(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
+static __be16 filter_tnl_flags(__be16 flags)
+{
+ return flags & (TUNNEL_CSUM | TUNNEL_KEY);
+}
+
+static struct sk_buff *__build_header(struct sk_buff *skb,
+ int tunnel_hlen)
+{
+ struct tnl_ptk_info tpi;
+ const struct ovs_key_ipv4_tunnel *tun_key;
+
+ tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+
+ skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb))
+ return skb;
+
+ tpi.flags = filter_tnl_flags(tun_key->tun_flags);
+ tpi.proto = htons(ETH_P_TEB);
+ tpi.key = be64_get_low32(tun_key->tun_id);
+ tpi.seq = 0;
+ gre_build_header(skb, &tpi, tunnel_hlen);
+
+ return skb;
+}
+
+static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
+#endif
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_rcv(struct sk_buff *skb,
+ const struct tnl_ptk_info *tpi)
+{
+ struct ovs_tunnel_info tun_info;
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+ __be64 key;
+
+ ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+ vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+ if (unlikely(!vport))
+ return PACKET_REJECT;
+
+ key = key_to_tunnel_id(tpi->key, tpi->seq);
+ ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
+ filter_tnl_flags(tpi->flags), NULL, 0);
+
+ ovs_vport_receive(vport, skb, &tun_info);
+ return PACKET_RCVD;
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+
+ ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
+ vport = rcu_dereference(ovs_net->vport_net.gre_vport);
+
+ if (unlikely(!vport))
+ return PACKET_REJECT;
+ else
+ return PACKET_RCVD;
+}
+
+static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ const struct ovs_key_ipv4_tunnel *tun_key;
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df;
+ int err;
+
+ if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
+ err = -EINVAL;
+ goto err_free_skb;
+ }
+
+ tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+ rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto err_free_skb;
+ }
+
+ tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ + tunnel_hlen + sizeof(struct iphdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb)) {
+ err = -ENOMEM;
+ goto err_free_rt;
+ }
+
+ /* Push Tunnel header. */
+ skb = __build_header(skb, tunnel_hlen);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+ htons(IP_DF) : 0;
+
+ skb->ignore_df = 1;
+
+ return iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ tun_key->ipv4_dst, IPPROTO_GRE,
+ tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ return err;
+}
+
+static struct gre_cisco_protocol gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .priority = 1,
+};
+
+static int gre_ports;
+static int gre_init(void)
+{
+ int err;
+
+ gre_ports++;
+ if (gre_ports > 1)
+ return 0;
+
+ err = gre_cisco_register(&gre_protocol);
+ if (err)
+ pr_warn("cannot register gre protocol handler\n");
+
+ return err;
+}
+
+static void gre_exit(void)
+{
+ gre_ports--;
+ if (gre_ports > 0)
+ return;
+
+ gre_cisco_unregister(&gre_protocol);
+}
+
+static const char *gre_get_name(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+static struct vport *gre_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct ovs_net *ovs_net;
+ struct vport *vport;
+ int err;
+
+ err = gre_init();
+ if (err)
+ return ERR_PTR(err);
+
+ ovs_net = net_generic(net, ovs_net_id);
+ if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
+ vport = ERR_PTR(-EEXIST);
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+ if (IS_ERR(vport))
+ goto error;
+
+ strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
+ rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
+ return vport;
+
+error:
+ gre_exit();
+ return vport;
+}
+
+static void gre_tnl_destroy(struct vport *vport)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct ovs_net *ovs_net;
+
+ ovs_net = net_generic(net, ovs_net_id);
+
+ RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
+ ovs_vport_deferred_free(vport);
+ gre_exit();
+}
+
+static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ovs_tunnel_info *egress_tun_info)
+{
+ return ovs_tunnel_get_egress_info(egress_tun_info,
+ ovs_dp_get_net(vport->dp),
+ OVS_CB(skb)->egress_tun_info,
+ IPPROTO_GRE, skb->mark, 0, 0);
+}
+
+static struct vport_ops ovs_gre_vport_ops = {
+ .type = OVS_VPORT_TYPE_GRE,
+ .create = gre_create,
+ .destroy = gre_tnl_destroy,
+ .get_name = gre_get_name,
+ .send = gre_tnl_send,
+ .get_egress_tun_info = gre_get_egress_tun_info,
+ .owner = THIS_MODULE,
+};
+
+static int __init ovs_gre_tnl_init(void)
+{
+ return ovs_vport_ops_register(&ovs_gre_vport_ops);
+}
+
+static void __exit ovs_gre_tnl_exit(void)
+{
+ ovs_vport_ops_unregister(&ovs_gre_vport_ops);
+}
+
+module_init(ovs_gre_tnl_init);
+module_exit(ovs_gre_tnl_exit);
+
+MODULE_DESCRIPTION("OVS: GRE switching port");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("vport-type-3");
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
new file mode 100644
index 000000000..6a55f7105
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2007-2012 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/hardirq.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/skbuff.h>
+
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/rtnetlink.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+struct internal_dev {
+ struct vport *vport;
+};
+
+static struct vport_ops ovs_internal_vport_ops;
+
+static struct internal_dev *internal_dev_priv(struct net_device *netdev)
+{
+ return netdev_priv(netdev);
+}
+
+/* This function is only called by the kernel network layer.*/
+static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct vport *vport = ovs_internal_dev_get_vport(netdev);
+ struct ovs_vport_stats vport_stats;
+
+ ovs_vport_get_stats(vport, &vport_stats);
+
+ /* The tx and rx stats need to be swapped because the
+ * switch and host OS have opposite perspectives. */
+ stats->rx_packets = vport_stats.tx_packets;
+ stats->tx_packets = vport_stats.rx_packets;
+ stats->rx_bytes = vport_stats.tx_bytes;
+ stats->tx_bytes = vport_stats.rx_bytes;
+ stats->rx_errors = vport_stats.tx_errors;
+ stats->tx_errors = vport_stats.rx_errors;
+ stats->rx_dropped = vport_stats.tx_dropped;
+ stats->tx_dropped = vport_stats.rx_dropped;
+
+ return stats;
+}
+
+/* Called with rcu_read_lock_bh. */
+static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ rcu_read_lock();
+ ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
+ rcu_read_unlock();
+ return 0;
+}
+
+static int internal_dev_open(struct net_device *netdev)
+{
+ netif_start_queue(netdev);
+ return 0;
+}
+
+static int internal_dev_stop(struct net_device *netdev)
+{
+ netif_stop_queue(netdev);
+ return 0;
+}
+
+static void internal_dev_getinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *info)
+{
+ strlcpy(info->driver, "openvswitch", sizeof(info->driver));
+}
+
+static const struct ethtool_ops internal_dev_ethtool_ops = {
+ .get_drvinfo = internal_dev_getinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ if (new_mtu < 68)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+ return 0;
+}
+
+static void internal_dev_destructor(struct net_device *dev)
+{
+ struct vport *vport = ovs_internal_dev_get_vport(dev);
+
+ ovs_vport_free(vport);
+ free_netdev(dev);
+}
+
+static const struct net_device_ops internal_dev_netdev_ops = {
+ .ndo_open = internal_dev_open,
+ .ndo_stop = internal_dev_stop,
+ .ndo_start_xmit = internal_dev_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_change_mtu = internal_dev_change_mtu,
+ .ndo_get_stats64 = internal_dev_get_stats,
+};
+
+static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
+ .kind = "openvswitch",
+};
+
+static void do_setup(struct net_device *netdev)
+{
+ ether_setup(netdev);
+
+ netdev->netdev_ops = &internal_dev_netdev_ops;
+
+ netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netdev->destructor = internal_dev_destructor;
+ netdev->ethtool_ops = &internal_dev_ethtool_ops;
+ netdev->rtnl_link_ops = &internal_dev_link_ops;
+ netdev->tx_queue_len = 0;
+
+ netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
+ NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
+ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL;
+
+ netdev->vlan_features = netdev->features;
+ netdev->hw_enc_features = netdev->features;
+ netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
+ netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
+
+ eth_hw_addr_random(netdev);
+}
+
+static struct vport *internal_dev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+ struct netdev_vport *netdev_vport;
+ struct internal_dev *internal_dev;
+ int err;
+
+ vport = ovs_vport_alloc(sizeof(struct netdev_vport),
+ &ovs_internal_vport_ops, parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ goto error;
+ }
+
+ netdev_vport = netdev_vport_priv(vport);
+
+ netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, NET_NAME_UNKNOWN,
+ do_setup);
+ if (!netdev_vport->dev) {
+ err = -ENOMEM;
+ goto error_free_vport;
+ }
+
+ dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
+ internal_dev = internal_dev_priv(netdev_vport->dev);
+ internal_dev->vport = vport;
+
+ /* Restrict bridge port to current netns. */
+ if (vport->port_no == OVSP_LOCAL)
+ netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+
+ rtnl_lock();
+ err = register_netdevice(netdev_vport->dev);
+ if (err)
+ goto error_free_netdev;
+
+ dev_set_promiscuity(netdev_vport->dev, 1);
+ rtnl_unlock();
+ netif_start_queue(netdev_vport->dev);
+
+ return vport;
+
+error_free_netdev:
+ rtnl_unlock();
+ free_netdev(netdev_vport->dev);
+error_free_vport:
+ ovs_vport_free(vport);
+error:
+ return ERR_PTR(err);
+}
+
+static void internal_dev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ netif_stop_queue(netdev_vport->dev);
+ rtnl_lock();
+ dev_set_promiscuity(netdev_vport->dev, -1);
+
+ /* unregister_netdevice() waits for an RCU grace period. */
+ unregister_netdevice(netdev_vport->dev);
+
+ rtnl_unlock();
+}
+
+static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
+{
+ struct net_device *netdev = netdev_vport_priv(vport)->dev;
+ int len;
+
+ if (unlikely(!(netdev->flags & IFF_UP))) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ len = skb->len;
+
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ secpath_reset(skb);
+
+ skb->dev = netdev;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, netdev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ netif_rx(skb);
+
+ return len;
+}
+
+static struct vport_ops ovs_internal_vport_ops = {
+ .type = OVS_VPORT_TYPE_INTERNAL,
+ .create = internal_dev_create,
+ .destroy = internal_dev_destroy,
+ .get_name = ovs_netdev_get_name,
+ .send = internal_dev_recv,
+};
+
+int ovs_is_internal_dev(const struct net_device *netdev)
+{
+ return netdev->netdev_ops == &internal_dev_netdev_ops;
+}
+
+struct vport *ovs_internal_dev_get_vport(struct net_device *netdev)
+{
+ if (!ovs_is_internal_dev(netdev))
+ return NULL;
+
+ return internal_dev_priv(netdev)->vport;
+}
+
+int ovs_internal_dev_rtnl_link_register(void)
+{
+ int err;
+
+ err = rtnl_link_register(&internal_dev_link_ops);
+ if (err < 0)
+ return err;
+
+ err = ovs_vport_ops_register(&ovs_internal_vport_ops);
+ if (err < 0)
+ rtnl_link_unregister(&internal_dev_link_ops);
+
+ return err;
+}
+
+void ovs_internal_dev_rtnl_link_unregister(void)
+{
+ ovs_vport_ops_unregister(&ovs_internal_vport_ops);
+ rtnl_link_unregister(&internal_dev_link_ops);
+}
diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h
new file mode 100644
index 000000000..1b179a190
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2007-2011 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_INTERNAL_DEV_H
+#define VPORT_INTERNAL_DEV_H 1
+
+#include "datapath.h"
+#include "vport.h"
+
+int ovs_is_internal_dev(const struct net_device *);
+struct vport *ovs_internal_dev_get_vport(struct net_device *);
+int ovs_internal_dev_rtnl_link_register(void);
+void ovs_internal_dev_rtnl_link_unregister(void);
+
+#endif /* vport-internal_dev.h */
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
new file mode 100644
index 000000000..33e6d6e29
--- /dev/null
+++ b/net/openvswitch/vport-netdev.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2007-2012 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if_arp.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/llc.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/openvswitch.h>
+
+#include <net/llc.h>
+
+#include "datapath.h"
+#include "vport-internal_dev.h"
+#include "vport-netdev.h"
+
+static struct vport_ops ovs_netdev_vport_ops;
+
+/* Must be called with rcu_read_lock. */
+static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
+{
+ if (unlikely(!vport))
+ goto error;
+
+ if (unlikely(skb_warn_if_lro(skb)))
+ goto error;
+
+ /* Make our own copy of the packet. Otherwise we will mangle the
+ * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
+ */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return;
+
+ skb_push(skb, ETH_HLEN);
+ ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+
+ ovs_vport_receive(vport, skb, NULL);
+ return;
+
+error:
+ kfree_skb(skb);
+}
+
+/* Called with rcu_read_lock and bottom-halves disabled. */
+static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct vport *vport;
+
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
+
+ vport = ovs_netdev_get_vport(skb->dev);
+
+ netdev_port_receive(vport, skb);
+
+ return RX_HANDLER_CONSUMED;
+}
+
+static struct net_device *get_dpdev(const struct datapath *dp)
+{
+ struct vport *local;
+
+ local = ovs_vport_ovsl(dp, OVSP_LOCAL);
+ BUG_ON(!local);
+ return netdev_vport_priv(local)->dev;
+}
+
+static struct vport *netdev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+ struct netdev_vport *netdev_vport;
+ int err;
+
+ vport = ovs_vport_alloc(sizeof(struct netdev_vport),
+ &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport)) {
+ err = PTR_ERR(vport);
+ goto error;
+ }
+
+ netdev_vport = netdev_vport_priv(vport);
+
+ netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
+ if (!netdev_vport->dev) {
+ err = -ENODEV;
+ goto error_free_vport;
+ }
+
+ if (netdev_vport->dev->flags & IFF_LOOPBACK ||
+ netdev_vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(netdev_vport->dev)) {
+ err = -EINVAL;
+ goto error_put;
+ }
+
+ rtnl_lock();
+ err = netdev_master_upper_dev_link(netdev_vport->dev,
+ get_dpdev(vport->dp));
+ if (err)
+ goto error_unlock;
+
+ err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ vport);
+ if (err)
+ goto error_master_upper_dev_unlink;
+
+ dev_disable_lro(netdev_vport->dev);
+ dev_set_promiscuity(netdev_vport->dev, 1);
+ netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ rtnl_unlock();
+
+ return vport;
+
+error_master_upper_dev_unlink:
+ netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+error_unlock:
+ rtnl_unlock();
+error_put:
+ dev_put(netdev_vport->dev);
+error_free_vport:
+ ovs_vport_free(vport);
+error:
+ return ERR_PTR(err);
+}
+
+static void free_port_rcu(struct rcu_head *rcu)
+{
+ struct netdev_vport *netdev_vport = container_of(rcu,
+ struct netdev_vport, rcu);
+
+ dev_put(netdev_vport->dev);
+ ovs_vport_free(vport_from_priv(netdev_vport));
+}
+
+void ovs_netdev_detach_dev(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ ASSERT_RTNL();
+ netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(netdev_vport->dev);
+ netdev_upper_dev_unlink(netdev_vport->dev,
+ netdev_master_upper_dev_get(netdev_vport->dev));
+ dev_set_promiscuity(netdev_vport->dev, -1);
+}
+
+static void netdev_destroy(struct vport *vport)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+
+ rtnl_lock();
+ if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
+ rtnl_unlock();
+
+ call_rcu(&netdev_vport->rcu, free_port_rcu);
+}
+
+const char *ovs_netdev_get_name(const struct vport *vport)
+{
+ const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ return netdev_vport->dev->name;
+}
+
+static unsigned int packet_length(const struct sk_buff *skb)
+{
+ unsigned int length = skb->len - ETH_HLEN;
+
+ if (skb->protocol == htons(ETH_P_8021Q))
+ length -= VLAN_HLEN;
+
+ return length;
+}
+
+static int netdev_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
+ int mtu = netdev_vport->dev->mtu;
+ int len;
+
+ if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
+ netdev_vport->dev->name,
+ packet_length(skb), mtu);
+ goto drop;
+ }
+
+ skb->dev = netdev_vport->dev;
+ len = skb->len;
+ dev_queue_xmit(skb);
+
+ return len;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* Returns null if this device is not attached to a datapath. */
+struct vport *ovs_netdev_get_vport(struct net_device *dev)
+{
+ if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
+ return (struct vport *)
+ rcu_dereference_rtnl(dev->rx_handler_data);
+ else
+ return NULL;
+}
+
+static struct vport_ops ovs_netdev_vport_ops = {
+ .type = OVS_VPORT_TYPE_NETDEV,
+ .create = netdev_create,
+ .destroy = netdev_destroy,
+ .get_name = ovs_netdev_get_name,
+ .send = netdev_send,
+};
+
+int __init ovs_netdev_init(void)
+{
+ return ovs_vport_ops_register(&ovs_netdev_vport_ops);
+}
+
+void ovs_netdev_exit(void)
+{
+ ovs_vport_ops_unregister(&ovs_netdev_vport_ops);
+}
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
new file mode 100644
index 000000000..6f7038e79
--- /dev/null
+++ b/net/openvswitch/vport-netdev.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2007-2011 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_NETDEV_H
+#define VPORT_NETDEV_H 1
+
+#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
+
+#include "vport.h"
+
+struct vport *ovs_netdev_get_vport(struct net_device *dev);
+
+struct netdev_vport {
+ struct rcu_head rcu;
+
+ struct net_device *dev;
+};
+
+static inline struct netdev_vport *
+netdev_vport_priv(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+const char *ovs_netdev_get_name(const struct vport *);
+void ovs_netdev_detach_dev(struct vport *);
+
+int __init ovs_netdev_init(void);
+void ovs_netdev_exit(void);
+
+#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
new file mode 100644
index 000000000..6d39766e7
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ * Copyright (c) 2013 Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+#include <linux/module.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/vxlan.h>
+
+#include "datapath.h"
+#include "vport.h"
+#include "vport-vxlan.h"
+
+/**
+ * struct vxlan_port - Keeps track of open UDP ports
+ * @vs: vxlan_sock created for the port.
+ * @name: vport name.
+ */
+struct vxlan_port {
+ struct vxlan_sock *vs;
+ char name[IFNAMSIZ];
+ u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
+};
+
+static struct vport_ops ovs_vxlan_vport_ops;
+
+static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
+{
+ return vport_priv(vport);
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
+{
+ struct ovs_tunnel_info tun_info;
+ struct vxlan_port *vxlan_port;
+ struct vport *vport = vs->data;
+ struct iphdr *iph;
+ struct ovs_vxlan_opts opts = {
+ .gbp = md->gbp,
+ };
+ __be64 key;
+ __be16 flags;
+
+ flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
+ vxlan_port = vxlan_vport(vport);
+ if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
+ flags |= TUNNEL_VXLAN_OPT;
+
+ /* Save outer tunnel values */
+ iph = ip_hdr(skb);
+ key = cpu_to_be64(ntohl(md->vni) >> 8);
+ ovs_flow_tun_info_init(&tun_info, iph,
+ udp_hdr(skb)->source, udp_hdr(skb)->dest,
+ key, flags, &opts, sizeof(opts));
+
+ ovs_vport_receive(vport, skb, &tun_info);
+}
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+ return -EMSGSIZE;
+
+ if (vxlan_port->exts) {
+ struct nlattr *exts;
+
+ exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
+ if (!exts)
+ return -EMSGSIZE;
+
+ if (vxlan_port->exts & VXLAN_F_GBP &&
+ nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, exts);
+ }
+
+ return 0;
+}
+
+static void vxlan_tnl_destroy(struct vport *vport)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+
+ vxlan_sock_release(vxlan_port->vs);
+
+ ovs_vport_deferred_free(vport);
+}
+
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+ [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
+};
+
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+{
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
+ struct vxlan_port *vxlan_port;
+ int err;
+
+ if (nla_len(attr) < sizeof(struct nlattr))
+ return -EINVAL;
+
+ err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+ if (err < 0)
+ return err;
+
+ vxlan_port = vxlan_vport(vport);
+
+ if (exts[OVS_VXLAN_EXT_GBP])
+ vxlan_port->exts |= VXLAN_F_GBP;
+
+ return 0;
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct nlattr *options = parms->options;
+ struct vxlan_port *vxlan_port;
+ struct vxlan_sock *vs;
+ struct vport *vport;
+ struct nlattr *a;
+ u16 dst_port;
+ int err;
+
+ if (!options) {
+ err = -EINVAL;
+ goto error;
+ }
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+ if (a && nla_len(a) == sizeof(u16)) {
+ dst_port = nla_get_u16(a);
+ } else {
+ /* Require destination port from userspace. */
+ err = -EINVAL;
+ goto error;
+ }
+
+ vport = ovs_vport_alloc(sizeof(struct vxlan_port),
+ &ovs_vxlan_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ vxlan_port = vxlan_vport(vport);
+ strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
+
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
+ if (a) {
+ err = vxlan_configure_exts(vport, a);
+ if (err) {
+ ovs_vport_free(vport);
+ goto error;
+ }
+ }
+
+ vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
+ vxlan_port->exts);
+ if (IS_ERR(vs)) {
+ ovs_vport_free(vport);
+ return (void *)vs;
+ }
+ vxlan_port->vs = vs;
+
+ return vport;
+
+error:
+ return ERR_PTR(err);
+}
+
+static int vxlan_ext_gbp(struct sk_buff *skb)
+{
+ const struct ovs_tunnel_info *tun_info;
+ const struct ovs_vxlan_opts *opts;
+
+ tun_info = OVS_CB(skb)->egress_tun_info;
+ opts = tun_info->options;
+
+ if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
+ tun_info->options_len >= sizeof(*opts))
+ return opts->gbp;
+ else
+ return 0;
+}
+
+static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ struct sock *sk = vxlan_port->vs->sock->sk;
+ __be16 dst_port = inet_sk(sk)->inet_sport;
+ const struct ovs_key_ipv4_tunnel *tun_key;
+ struct vxlan_metadata md = {0};
+ struct rtable *rt;
+ struct flowi4 fl;
+ __be16 src_port;
+ __be16 df;
+ int err;
+ u32 vxflags;
+
+ if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
+ rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+ htons(IP_DF) : 0;
+
+ skb->ignore_df = 1;
+
+ src_port = udp_flow_src_port(net, skb, 0, 0, true);
+ md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
+ md.gbp = vxlan_ext_gbp(skb);
+ vxflags = vxlan_port->exts |
+ (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
+
+ err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst,
+ tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
+ src_port, dst_port,
+ &md, false, vxflags);
+ if (err < 0)
+ ip_rt_put(rt);
+ return err;
+error:
+ kfree_skb(skb);
+ return err;
+}
+
+static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ovs_tunnel_info *egress_tun_info)
+{
+ struct net *net = ovs_dp_get_net(vport->dp);
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+ __be16 src_port;
+ int port_min;
+ int port_max;
+
+ inet_get_local_port_range(net, &port_min, &port_max);
+ src_port = udp_flow_src_port(net, skb, 0, 0, true);
+
+ return ovs_tunnel_get_egress_info(egress_tun_info, net,
+ OVS_CB(skb)->egress_tun_info,
+ IPPROTO_UDP, skb->mark,
+ src_port, dst_port);
+}
+
+static const char *vxlan_get_name(const struct vport *vport)
+{
+ struct vxlan_port *vxlan_port = vxlan_vport(vport);
+ return vxlan_port->name;
+}
+
+static struct vport_ops ovs_vxlan_vport_ops = {
+ .type = OVS_VPORT_TYPE_VXLAN,
+ .create = vxlan_tnl_create,
+ .destroy = vxlan_tnl_destroy,
+ .get_name = vxlan_get_name,
+ .get_options = vxlan_get_options,
+ .send = vxlan_tnl_send,
+ .get_egress_tun_info = vxlan_get_egress_tun_info,
+ .owner = THIS_MODULE,
+};
+
+static int __init ovs_vxlan_tnl_init(void)
+{
+ return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
+}
+
+static void __exit ovs_vxlan_tnl_exit(void)
+{
+ ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
+}
+
+module_init(ovs_vxlan_tnl_init);
+module_exit(ovs_vxlan_tnl_exit);
+
+MODULE_DESCRIPTION("OVS: VXLAN switching port");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("vport-type-4");
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
new file mode 100644
index 000000000..4b08233e7
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.h
@@ -0,0 +1,11 @@
+#ifndef VPORT_VXLAN_H
+#define VPORT_VXLAN_H 1
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+struct ovs_vxlan_opts {
+ __u32 gbp;
+};
+
+#endif
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
new file mode 100644
index 000000000..067a3fff1
--- /dev/null
+++ b/net/openvswitch/vport.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2007-2014 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/compat.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+
+#include "datapath.h"
+#include "vport.h"
+#include "vport-internal_dev.h"
+
+static void ovs_vport_record_error(struct vport *,
+ enum vport_err_type err_type);
+
+static LIST_HEAD(vport_ops_list);
+
+/* Protected by RCU read lock for reading, ovs_mutex for writing. */
+static struct hlist_head *dev_table;
+#define VPORT_HASH_BUCKETS 1024
+
+/**
+ * ovs_vport_init - initialize vport subsystem
+ *
+ * Called at module load time to initialize the vport subsystem.
+ */
+int ovs_vport_init(void)
+{
+ dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!dev_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * ovs_vport_exit - shutdown vport subsystem
+ *
+ * Called at module exit time to shutdown the vport subsystem.
+ */
+void ovs_vport_exit(void)
+{
+ kfree(dev_table);
+}
+
+static struct hlist_head *hash_bucket(const struct net *net, const char *name)
+{
+ unsigned int hash = jhash(name, strlen(name), (unsigned long) net);
+ return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
+}
+
+int ovs_vport_ops_register(struct vport_ops *ops)
+{
+ int err = -EEXIST;
+ struct vport_ops *o;
+
+ ovs_lock();
+ list_for_each_entry(o, &vport_ops_list, list)
+ if (ops->type == o->type)
+ goto errout;
+
+ list_add_tail(&ops->list, &vport_ops_list);
+ err = 0;
+errout:
+ ovs_unlock();
+ return err;
+}
+EXPORT_SYMBOL_GPL(ovs_vport_ops_register);
+
+void ovs_vport_ops_unregister(struct vport_ops *ops)
+{
+ ovs_lock();
+ list_del(&ops->list);
+ ovs_unlock();
+}
+EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister);
+
+/**
+ * ovs_vport_locate - find a port that has already been created
+ *
+ * @name: name of port to find
+ *
+ * Must be called with ovs or RCU read lock.
+ */
+struct vport *ovs_vport_locate(const struct net *net, const char *name)
+{
+ struct hlist_head *bucket = hash_bucket(net, name);
+ struct vport *vport;
+
+ hlist_for_each_entry_rcu(vport, bucket, hash_node)
+ if (!strcmp(name, vport->ops->get_name(vport)) &&
+ net_eq(ovs_dp_get_net(vport->dp), net))
+ return vport;
+
+ return NULL;
+}
+
+/**
+ * ovs_vport_alloc - allocate and initialize new vport
+ *
+ * @priv_size: Size of private data area to allocate.
+ * @ops: vport device ops
+ *
+ * Allocate and initialize a new vport defined by @ops. The vport will contain
+ * a private data area of size @priv_size that can be accessed using
+ * vport_priv(). vports that are no longer needed should be released with
+ * vport_free().
+ */
+struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
+ const struct vport_parms *parms)
+{
+ struct vport *vport;
+ size_t alloc_size;
+
+ alloc_size = sizeof(struct vport);
+ if (priv_size) {
+ alloc_size = ALIGN(alloc_size, VPORT_ALIGN);
+ alloc_size += priv_size;
+ }
+
+ vport = kzalloc(alloc_size, GFP_KERNEL);
+ if (!vport)
+ return ERR_PTR(-ENOMEM);
+
+ vport->dp = parms->dp;
+ vport->port_no = parms->port_no;
+ vport->ops = ops;
+ INIT_HLIST_NODE(&vport->dp_hash_node);
+
+ if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) {
+ kfree(vport);
+ return ERR_PTR(-EINVAL);
+ }
+
+ vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!vport->percpu_stats) {
+ kfree(vport);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return vport;
+}
+EXPORT_SYMBOL_GPL(ovs_vport_alloc);
+
+/**
+ * ovs_vport_free - uninitialize and free vport
+ *
+ * @vport: vport to free
+ *
+ * Frees a vport allocated with vport_alloc() when it is no longer needed.
+ *
+ * The caller must ensure that an RCU grace period has passed since the last
+ * time @vport was in a datapath.
+ */
+void ovs_vport_free(struct vport *vport)
+{
+ /* vport is freed from RCU callback or error path, Therefore
+ * it is safe to use raw dereference.
+ */
+ kfree(rcu_dereference_raw(vport->upcall_portids));
+ free_percpu(vport->percpu_stats);
+ kfree(vport);
+}
+EXPORT_SYMBOL_GPL(ovs_vport_free);
+
+static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms)
+{
+ struct vport_ops *ops;
+
+ list_for_each_entry(ops, &vport_ops_list, list)
+ if (ops->type == parms->type)
+ return ops;
+
+ return NULL;
+}
+
+/**
+ * ovs_vport_add - add vport device (for kernel callers)
+ *
+ * @parms: Information about new vport.
+ *
+ * Creates a new vport with the specified configuration (which is dependent on
+ * device type). ovs_mutex must be held.
+ */
+struct vport *ovs_vport_add(const struct vport_parms *parms)
+{
+ struct vport_ops *ops;
+ struct vport *vport;
+
+ ops = ovs_vport_lookup(parms);
+ if (ops) {
+ struct hlist_head *bucket;
+
+ if (!try_module_get(ops->owner))
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ vport = ops->create(parms);
+ if (IS_ERR(vport)) {
+ module_put(ops->owner);
+ return vport;
+ }
+
+ bucket = hash_bucket(ovs_dp_get_net(vport->dp),
+ vport->ops->get_name(vport));
+ hlist_add_head_rcu(&vport->hash_node, bucket);
+ return vport;
+ }
+
+ /* Unlock to attempt module load and return -EAGAIN if load
+ * was successful as we need to restart the port addition
+ * workflow.
+ */
+ ovs_unlock();
+ request_module("vport-type-%d", parms->type);
+ ovs_lock();
+
+ if (!ovs_vport_lookup(parms))
+ return ERR_PTR(-EAFNOSUPPORT);
+ else
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * ovs_vport_set_options - modify existing vport device (for kernel callers)
+ *
+ * @vport: vport to modify.
+ * @options: New configuration.
+ *
+ * Modifies an existing device with the specified configuration (which is
+ * dependent on device type). ovs_mutex must be held.
+ */
+int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
+{
+ if (!vport->ops->set_options)
+ return -EOPNOTSUPP;
+ return vport->ops->set_options(vport, options);
+}
+
+/**
+ * ovs_vport_del - delete existing vport device
+ *
+ * @vport: vport to delete.
+ *
+ * Detaches @vport from its datapath and destroys it. It is possible to fail
+ * for reasons such as lack of memory. ovs_mutex must be held.
+ */
+void ovs_vport_del(struct vport *vport)
+{
+ ASSERT_OVSL();
+
+ hlist_del_rcu(&vport->hash_node);
+ module_put(vport->ops->owner);
+ vport->ops->destroy(vport);
+}
+
+/**
+ * ovs_vport_get_stats - retrieve device stats
+ *
+ * @vport: vport from which to retrieve the stats
+ * @stats: location to store stats
+ *
+ * Retrieves transmit, receive, and error stats for the given device.
+ *
+ * Must be called with ovs_mutex or rcu_read_lock.
+ */
+void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+
+ /* We potentially have 2 sources of stats that need to be combined:
+ * those we have collected (split into err_stats and percpu_stats) from
+ * set_stats() and device error stats from netdev->get_stats() (for
+ * errors that happen downstream and therefore aren't reported through
+ * our vport_record_error() function).
+ * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS).
+ * netdev-stats can be directly read over netlink-ioctl.
+ */
+
+ stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors);
+ stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors);
+ stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped);
+ stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped);
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *percpu_stats;
+ struct pcpu_sw_netstats local_stats;
+ unsigned int start;
+
+ percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
+
+ do {
+ start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
+ local_stats = *percpu_stats;
+ } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
+
+ stats->rx_bytes += local_stats.rx_bytes;
+ stats->rx_packets += local_stats.rx_packets;
+ stats->tx_bytes += local_stats.tx_bytes;
+ stats->tx_packets += local_stats.tx_packets;
+ }
+}
+
+/**
+ * ovs_vport_get_options - retrieve device options
+ *
+ * @vport: vport from which to retrieve the options.
+ * @skb: sk_buff where options should be appended.
+ *
+ * Retrieves the configuration of the given device, appending an
+ * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested
+ * vport-specific attributes to @skb.
+ *
+ * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another
+ * negative error code if a real error occurred. If an error occurs, @skb is
+ * left unmodified.
+ *
+ * Must be called with ovs_mutex or rcu_read_lock.
+ */
+int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+ struct nlattr *nla;
+ int err;
+
+ if (!vport->ops->get_options)
+ return 0;
+
+ nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS);
+ if (!nla)
+ return -EMSGSIZE;
+
+ err = vport->ops->get_options(vport, skb);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ }
+
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/**
+ * ovs_vport_set_upcall_portids - set upcall portids of @vport.
+ *
+ * @vport: vport to modify.
+ * @ids: new configuration, an array of port ids.
+ *
+ * Sets the vport's upcall_portids to @ids.
+ *
+ * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed
+ * as an array of U32.
+ *
+ * Must be called with ovs_mutex.
+ */
+int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids)
+{
+ struct vport_portids *old, *vport_portids;
+
+ if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
+ return -EINVAL;
+
+ old = ovsl_dereference(vport->upcall_portids);
+
+ vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids),
+ GFP_KERNEL);
+ if (!vport_portids)
+ return -ENOMEM;
+
+ vport_portids->n_ids = nla_len(ids) / sizeof(u32);
+ vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids);
+ nla_memcpy(vport_portids->ids, ids, nla_len(ids));
+
+ rcu_assign_pointer(vport->upcall_portids, vport_portids);
+
+ if (old)
+ kfree_rcu(old, rcu);
+ return 0;
+}
+
+/**
+ * ovs_vport_get_upcall_portids - get the upcall_portids of @vport.
+ *
+ * @vport: vport from which to retrieve the portids.
+ * @skb: sk_buff where portids should be appended.
+ *
+ * Retrieves the configuration of the given vport, appending the
+ * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall
+ * portids to @skb.
+ *
+ * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room.
+ * If an error occurs, @skb is left unmodified. Must be called with
+ * ovs_mutex or rcu_read_lock.
+ */
+int ovs_vport_get_upcall_portids(const struct vport *vport,
+ struct sk_buff *skb)
+{
+ struct vport_portids *ids;
+
+ ids = rcu_dereference_ovsl(vport->upcall_portids);
+
+ if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS)
+ return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID,
+ ids->n_ids * sizeof(u32), (void *)ids->ids);
+ else
+ return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]);
+}
+
+/**
+ * ovs_vport_find_upcall_portid - find the upcall portid to send upcall.
+ *
+ * @vport: vport from which the missed packet is received.
+ * @skb: skb that the missed packet was received.
+ *
+ * Uses the skb_get_hash() to select the upcall portid to send the
+ * upcall.
+ *
+ * Returns the portid of the target socket. Must be called with rcu_read_lock.
+ */
+u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
+{
+ struct vport_portids *ids;
+ u32 ids_index;
+ u32 hash;
+
+ ids = rcu_dereference(vport->upcall_portids);
+
+ if (ids->n_ids == 1 && ids->ids[0] == 0)
+ return 0;
+
+ hash = skb_get_hash(skb);
+ ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids);
+ return ids->ids[ids_index];
+}
+
+/**
+ * ovs_vport_receive - pass up received packet to the datapath for processing
+ *
+ * @vport: vport that received the packet
+ * @skb: skb that was received
+ * @tun_key: tunnel (if any) that carried packet
+ *
+ * Must be called with rcu_read_lock. The packet cannot be shared and
+ * skb->data should point to the Ethernet header.
+ */
+void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+ const struct ovs_tunnel_info *tun_info)
+{
+ struct pcpu_sw_netstats *stats;
+ struct sw_flow_key key;
+ int error;
+
+ stats = this_cpu_ptr(vport->percpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len +
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ u64_stats_update_end(&stats->syncp);
+
+ OVS_CB(skb)->input_vport = vport;
+ OVS_CB(skb)->egress_tun_info = NULL;
+ /* Extract flow from 'skb' into 'key'. */
+ error = ovs_flow_key_extract(tun_info, skb, &key);
+ if (unlikely(error)) {
+ kfree_skb(skb);
+ return;
+ }
+ ovs_dp_process_packet(skb, &key);
+}
+EXPORT_SYMBOL_GPL(ovs_vport_receive);
+
+/**
+ * ovs_vport_send - send a packet on a device
+ *
+ * @vport: vport on which to send the packet
+ * @skb: skb to send
+ *
+ * Sends the given packet and returns the length of data sent. Either ovs
+ * lock or rcu_read_lock must be held.
+ */
+int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+{
+ int sent = vport->ops->send(vport, skb);
+
+ if (likely(sent > 0)) {
+ struct pcpu_sw_netstats *stats;
+
+ stats = this_cpu_ptr(vport->percpu_stats);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_packets++;
+ stats->tx_bytes += sent;
+ u64_stats_update_end(&stats->syncp);
+ } else if (sent < 0) {
+ ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
+ } else {
+ ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
+ }
+ return sent;
+}
+
+/**
+ * ovs_vport_record_error - indicate device error to generic stats layer
+ *
+ * @vport: vport that encountered the error
+ * @err_type: one of enum vport_err_type types to indicate the error type
+ *
+ * If using the vport generic stats layer indicate that an error of the given
+ * type has occurred.
+ */
+static void ovs_vport_record_error(struct vport *vport,
+ enum vport_err_type err_type)
+{
+ switch (err_type) {
+ case VPORT_E_RX_DROPPED:
+ atomic_long_inc(&vport->err_stats.rx_dropped);
+ break;
+
+ case VPORT_E_RX_ERROR:
+ atomic_long_inc(&vport->err_stats.rx_errors);
+ break;
+
+ case VPORT_E_TX_DROPPED:
+ atomic_long_inc(&vport->err_stats.tx_dropped);
+ break;
+
+ case VPORT_E_TX_ERROR:
+ atomic_long_inc(&vport->err_stats.tx_errors);
+ break;
+ }
+
+}
+
+static void free_vport_rcu(struct rcu_head *rcu)
+{
+ struct vport *vport = container_of(rcu, struct vport, rcu);
+
+ ovs_vport_free(vport);
+}
+
+void ovs_vport_deferred_free(struct vport *vport)
+{
+ if (!vport)
+ return;
+
+ call_rcu(&vport->rcu, free_vport_rcu);
+}
+EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
+
+int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+ struct net *net,
+ const struct ovs_tunnel_info *tun_info,
+ u8 ipproto,
+ u32 skb_mark,
+ __be16 tp_src,
+ __be16 tp_dst)
+{
+ const struct ovs_key_ipv4_tunnel *tun_key;
+ struct rtable *rt;
+ struct flowi4 fl;
+
+ if (unlikely(!tun_info))
+ return -EINVAL;
+
+ tun_key = &tun_info->tunnel;
+
+ /* Route lookup to get srouce IP address.
+ * The process may need to be changed if the corresponding process
+ * in vports ops changed.
+ */
+ rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ ip_rt_put(rt);
+
+ /* Generate egress_tun_info based on tun_info,
+ * saddr, tp_src and tp_dst
+ */
+ __ovs_flow_tun_info_init(egress_tun_info,
+ fl.saddr, tun_key->ipv4_dst,
+ tun_key->ipv4_tos,
+ tun_key->ipv4_ttl,
+ tp_src, tp_dst,
+ tun_key->tun_id,
+ tun_key->tun_flags,
+ tun_info->options,
+ tun_info->options_len);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
+
+int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ovs_tunnel_info *info)
+{
+ /* get_egress_tun_info() is only implemented on tunnel ports. */
+ if (unlikely(!vport->ops->get_egress_tun_info))
+ return -EINVAL;
+
+ return vport->ops->get_egress_tun_info(vport, skb, info);
+}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
new file mode 100644
index 000000000..bc85331a6
--- /dev/null
+++ b/net/openvswitch/vport.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2007-2012 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#ifndef VPORT_H
+#define VPORT_H 1
+
+#include <linux/if_tunnel.h>
+#include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/openvswitch.h>
+#include <linux/reciprocal_div.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/u64_stats_sync.h>
+
+#include "datapath.h"
+
+struct vport;
+struct vport_parms;
+
+/* The following definitions are for users of the vport subsytem: */
+
+struct vport_net {
+ struct vport __rcu *gre_vport;
+};
+
+int ovs_vport_init(void);
+void ovs_vport_exit(void);
+
+struct vport *ovs_vport_add(const struct vport_parms *);
+void ovs_vport_del(struct vport *);
+
+struct vport *ovs_vport_locate(const struct net *net, const char *name);
+
+void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
+
+int ovs_vport_set_options(struct vport *, struct nlattr *options);
+int ovs_vport_get_options(const struct vport *, struct sk_buff *);
+
+int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
+int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
+u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
+
+int ovs_vport_send(struct vport *, struct sk_buff *);
+
+int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+ struct net *net,
+ const struct ovs_tunnel_info *tun_info,
+ u8 ipproto,
+ u32 skb_mark,
+ __be16 tp_src,
+ __be16 tp_dst);
+int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct ovs_tunnel_info *info);
+
+/* The following definitions are for implementers of vport devices: */
+
+struct vport_err_stats {
+ atomic_long_t rx_dropped;
+ atomic_long_t rx_errors;
+ atomic_long_t tx_dropped;
+ atomic_long_t tx_errors;
+};
+/**
+ * struct vport_portids - array of netlink portids of a vport.
+ * must be protected by rcu.
+ * @rn_ids: The reciprocal value of @n_ids.
+ * @rcu: RCU callback head for deferred destruction.
+ * @n_ids: Size of @ids array.
+ * @ids: Array storing the Netlink socket pids to be used for packets received
+ * on this port that miss the flow table.
+ */
+struct vport_portids {
+ struct reciprocal_value rn_ids;
+ struct rcu_head rcu;
+ u32 n_ids;
+ u32 ids[];
+};
+
+/**
+ * struct vport - one port within a datapath
+ * @rcu: RCU callback head for deferred destruction.
+ * @dp: Datapath to which this port belongs.
+ * @upcall_portids: RCU protected 'struct vport_portids'.
+ * @port_no: Index into @dp's @ports array.
+ * @hash_node: Element in @dev_table hash table in vport.c.
+ * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
+ * @ops: Class structure.
+ * @percpu_stats: Points to per-CPU statistics used and maintained by vport
+ * @err_stats: Points to error statistics used and maintained by vport
+ * @detach_list: list used for detaching vport in net-exit call.
+ */
+struct vport {
+ struct rcu_head rcu;
+ struct datapath *dp;
+ struct vport_portids __rcu *upcall_portids;
+ u16 port_no;
+
+ struct hlist_node hash_node;
+ struct hlist_node dp_hash_node;
+ const struct vport_ops *ops;
+
+ struct pcpu_sw_netstats __percpu *percpu_stats;
+
+ struct vport_err_stats err_stats;
+ struct list_head detach_list;
+};
+
+/**
+ * struct vport_parms - parameters for creating a new vport
+ *
+ * @name: New vport's name.
+ * @type: New vport's type.
+ * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
+ * none was supplied.
+ * @dp: New vport's datapath.
+ * @port_no: New vport's port number.
+ */
+struct vport_parms {
+ const char *name;
+ enum ovs_vport_type type;
+ struct nlattr *options;
+
+ /* For ovs_vport_alloc(). */
+ struct datapath *dp;
+ u16 port_no;
+ struct nlattr *upcall_portids;
+};
+
+/**
+ * struct vport_ops - definition of a type of virtual port
+ *
+ * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
+ * @create: Create a new vport configured as specified. On success returns
+ * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
+ * @destroy: Destroys a vport. Must call vport_free() on the vport but not
+ * before an RCU grace period has elapsed.
+ * @set_options: Modify the configuration of an existing vport. May be %NULL
+ * if modification is not supported.
+ * @get_options: Appends vport-specific attributes for the configuration of an
+ * existing vport to a &struct sk_buff. May be %NULL for a vport that does not
+ * have any configuration.
+ * @get_name: Get the device's name.
+ * @send: Send a packet on the device. Returns the length of the packet sent,
+ * zero for dropped packets or negative for error.
+ * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
+ * a packet.
+ */
+struct vport_ops {
+ enum ovs_vport_type type;
+
+ /* Called with ovs_mutex. */
+ struct vport *(*create)(const struct vport_parms *);
+ void (*destroy)(struct vport *);
+
+ int (*set_options)(struct vport *, struct nlattr *);
+ int (*get_options)(const struct vport *, struct sk_buff *);
+
+ /* Called with rcu_read_lock or ovs_mutex. */
+ const char *(*get_name)(const struct vport *);
+
+ int (*send)(struct vport *, struct sk_buff *);
+ int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
+ struct ovs_tunnel_info *);
+
+ struct module *owner;
+ struct list_head list;
+};
+
+enum vport_err_type {
+ VPORT_E_RX_DROPPED,
+ VPORT_E_RX_ERROR,
+ VPORT_E_TX_DROPPED,
+ VPORT_E_TX_ERROR,
+};
+
+struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
+ const struct vport_parms *);
+void ovs_vport_free(struct vport *);
+void ovs_vport_deferred_free(struct vport *vport);
+
+#define VPORT_ALIGN 8
+
+/**
+ * vport_priv - access private data area of vport
+ *
+ * @vport: vport to access
+ *
+ * If a nonzero size was passed in priv_size of vport_alloc() a private data
+ * area was allocated on creation. This allows that area to be accessed and
+ * used for any purpose needed by the vport implementer.
+ */
+static inline void *vport_priv(const struct vport *vport)
+{
+ return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
+}
+
+/**
+ * vport_from_priv - lookup vport from private data pointer
+ *
+ * @priv: Start of private data area.
+ *
+ * It is sometimes useful to translate from a pointer to the private data
+ * area to the vport, such as in the case where the private data pointer is
+ * the result of a hash table lookup. @priv must point to the start of the
+ * private data area.
+ */
+static inline struct vport *vport_from_priv(void *priv)
+{
+ return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
+}
+
+void ovs_vport_receive(struct vport *, struct sk_buff *,
+ const struct ovs_tunnel_info *);
+
+static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
+ const void *start, unsigned int len)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
+}
+
+int ovs_vport_ops_register(struct vport_ops *ops);
+void ovs_vport_ops_unregister(struct vport_ops *ops);
+
+static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
+ const struct ovs_key_ipv4_tunnel *key,
+ u32 mark,
+ struct flowi4 *fl,
+ u8 protocol)
+{
+ struct rtable *rt;
+
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->ipv4_dst;
+ fl->saddr = key->ipv4_src;
+ fl->flowi4_tos = RT_TOS(key->ipv4_tos);
+ fl->flowi4_mark = mark;
+ fl->flowi4_proto = protocol;
+
+ rt = ip_route_output_key(net, fl);
+ return rt;
+}
+#endif /* vport.h */
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 000000000..cc55b35f8
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,24 @@
+#
+# Packet configuration
+#
+
+config PACKET
+ tristate "Packet socket"
+ ---help---
+ The Packet protocol is used by applications which communicate
+ directly with network devices without an intermediate network
+ protocol implemented in the kernel, e.g. tcpdump. If you want them
+ to work, choose Y.
+
+ To compile this driver as a module, choose M here: the module will
+ be called af_packet.
+
+ If unsure, say Y.
+
+config PACKET_DIAG
+ tristate "Packet: sockets monitoring interface"
+ depends on PACKET
+ default n
+ ---help---
+ Support for PF_PACKET sockets monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/packet/Makefile b/net/packet/Makefile
new file mode 100644
index 000000000..9df61347a
--- /dev/null
+++ b/net/packet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the packet AF.
+#
+
+obj-$(CONFIG_PACKET) += af_packet.o
+obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
+af_packet_diag-y += diag.o
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
new file mode 100644
index 000000000..fe1610dde
--- /dev/null
+++ b/net/packet/af_packet.c
@@ -0,0 +1,4176 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PACKET - implements raw packet sockets.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ * Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * Fixes:
+ * Alan Cox : verify_area() now used correctly
+ * Alan Cox : new skbuff lists, look ma no backlogs!
+ * Alan Cox : tidied skbuff lists.
+ * Alan Cox : Now uses generic datagram routines I
+ * added. Also fixed the peek/read crash
+ * from all old Linux datagram code.
+ * Alan Cox : Uses the improved datagram code.
+ * Alan Cox : Added NULL's for socket options.
+ * Alan Cox : Re-commented the code.
+ * Alan Cox : Use new kernel side addressing
+ * Rob Janssen : Correct MTU usage.
+ * Dave Platt : Counter leaks caused by incorrect
+ * interrupt locking and some slightly
+ * dubious gcc output. Can you read
+ * compiler: it said _VOLATILE_
+ * Richard Kooijman : Timestamp fixes.
+ * Alan Cox : New buffers. Use sk->mac.raw.
+ * Alan Cox : sendmsg/recvmsg support.
+ * Alan Cox : Protocol setting support
+ * Alexey Kuznetsov : Untied from IPv4 stack.
+ * Cyrus Durgin : Fixed kerneld for kmod.
+ * Michal Ostrowski : Module initialization cleanup.
+ * Ulises Alonso : Frame number limit removal and
+ * packet_set_ring memory leak.
+ * Eric Biederman : Allow for > 8 byte hardware addresses.
+ * The convention is that longer addresses
+ * will simply extend the hardware address
+ * byte arrays at the end of sockaddr_ll
+ * and packet_mreq.
+ * Johann Baudy : Added TX RING.
+ * Chetan Loke : Implemented TPACKET_V3 block abstraction
+ * layer.
+ * Copyright (C) 2011, <lokec@ccs.neu.edu>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <linux/wireless.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <asm/page.h>
+#include <asm/cacheflush.h>
+#include <asm/io.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/poll.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <linux/errqueue.h>
+#include <linux/net_tstamp.h>
+#include <linux/percpu.h>
+#ifdef CONFIG_INET
+#include <net/inet_common.h>
+#endif
+
+#include "internal.h"
+
+/*
+ Assumptions:
+ - if device has no dev->hard_header routine, it adds and removes ll header
+ inside itself. In this case ll header is invisible outside of device,
+ but higher levels still should reserve dev->hard_header_len.
+ Some devices are enough clever to reallocate skb, when header
+ will not fit to reserved space (tunnel), another ones are silly
+ (PPP).
+ - packet socket receives packets with pulled ll header,
+ so that SOCK_RAW should push it back.
+
+On receive:
+-----------
+
+Incoming, dev->hard_header!=NULL
+ mac_header -> ll header
+ data -> data
+
+Outgoing, dev->hard_header!=NULL
+ mac_header -> ll header
+ data -> ll header
+
+Incoming, dev->hard_header==NULL
+ mac_header -> UNKNOWN position. It is very likely, that it points to ll
+ header. PPP makes it, that is wrong, because introduce
+ assymetry between rx and tx paths.
+ data -> data
+
+Outgoing, dev->hard_header==NULL
+ mac_header -> data. ll header is still not built!
+ data -> data
+
+Resume
+ If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+
+
+On transmit:
+------------
+
+dev->hard_header != NULL
+ mac_header -> ll header
+ data -> ll header
+
+dev->hard_header == NULL (ll header is added by device, we cannot control it)
+ mac_header -> data
+ data -> data
+
+ We should set nh.raw on output to correct posistion,
+ packet classifier depends on it.
+ */
+
+/* Private packet socket structures. */
+
+/* identical to struct packet_mreq except it has
+ * a longer address field.
+ */
+struct packet_mreq_max {
+ int mr_ifindex;
+ unsigned short mr_type;
+ unsigned short mr_alen;
+ unsigned char mr_address[MAX_ADDR_LEN];
+};
+
+union tpacket_uhdr {
+ struct tpacket_hdr *h1;
+ struct tpacket2_hdr *h2;
+ struct tpacket3_hdr *h3;
+ void *raw;
+};
+
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
+ int closing, int tx_ring);
+
+#define V3_ALIGNMENT (8)
+
+#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
+
+#define BLK_PLUS_PRIV(sz_of_priv) \
+ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
+
+#define PGV_FROM_VMALLOC 1
+
+#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
+#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
+#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
+#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
+#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
+#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
+#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
+
+struct packet_sock;
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+
+static void *packet_previous_frame(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ int status);
+static void packet_increment_head(struct packet_ring_buffer *buff);
+static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
+ struct tpacket_block_desc *);
+static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
+ struct packet_sock *);
+static void prb_retire_current_block(struct tpacket_kbdq_core *,
+ struct packet_sock *, unsigned int status);
+static int prb_queue_frozen(struct tpacket_kbdq_core *);
+static void prb_open_block(struct tpacket_kbdq_core *,
+ struct tpacket_block_desc *);
+static void prb_retire_rx_blk_timer_expired(unsigned long);
+static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static void prb_init_blk_timer(struct packet_sock *,
+ struct tpacket_kbdq_core *,
+ void (*func) (unsigned long));
+static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
+static void prb_clear_rxhash(struct tpacket_kbdq_core *,
+ struct tpacket3_hdr *);
+static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
+ struct tpacket3_hdr *);
+static void packet_flush_mclist(struct sock *sk);
+
+struct packet_skb_cb {
+ union {
+ struct sockaddr_pkt pkt;
+ union {
+ /* Trick: alias skb original length with
+ * ll.sll_family and ll.protocol in order
+ * to save room.
+ */
+ unsigned int origlen;
+ struct sockaddr_ll ll;
+ };
+ } sa;
+};
+
+#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
+
+#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
+#define GET_PBLOCK_DESC(x, bid) \
+ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
+#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
+ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
+#define GET_NEXT_PRB_BLK_NUM(x) \
+ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
+ ((x)->kactive_blk_num+1) : 0)
+
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
+static void __fanout_link(struct sock *sk, struct packet_sock *po);
+
+static int packet_direct_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ netdev_features_t features;
+ struct netdev_queue *txq;
+ int ret = NETDEV_TX_BUSY;
+
+ if (unlikely(!netif_running(dev) ||
+ !netif_carrier_ok(dev)))
+ goto drop;
+
+ features = netif_skb_features(skb);
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto drop;
+
+ txq = skb_get_tx_queue(dev, skb);
+
+ local_bh_disable();
+
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_drv_stopped(txq))
+ ret = netdev_start_xmit(skb, dev, txq, false);
+ HARD_TX_UNLOCK(dev, txq);
+
+ local_bh_enable();
+
+ if (!dev_xmit_complete(ret))
+ kfree_skb(skb);
+
+ return ret;
+drop:
+ atomic_long_inc(&dev->tx_dropped);
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
+static struct net_device *packet_cached_dev_get(struct packet_sock *po)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = rcu_dereference(po->cached_dev);
+ if (likely(dev))
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ return dev;
+}
+
+static void packet_cached_dev_assign(struct packet_sock *po,
+ struct net_device *dev)
+{
+ rcu_assign_pointer(po->cached_dev, dev);
+}
+
+static void packet_cached_dev_reset(struct packet_sock *po)
+{
+ RCU_INIT_POINTER(po->cached_dev, NULL);
+}
+
+static bool packet_use_direct_xmit(const struct packet_sock *po)
+{
+ return po->xmit == packet_direct_xmit;
+}
+
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+
+static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ u16 queue_index;
+
+ if (ops->ndo_select_queue) {
+ queue_index = ops->ndo_select_queue(dev, skb, NULL,
+ __packet_pick_tx_queue);
+ queue_index = netdev_cap_txqueue(dev, queue_index);
+ } else {
+ queue_index = __packet_pick_tx_queue(dev, skb);
+ }
+
+ skb_set_queue_mapping(skb, queue_index);
+}
+
+/* register_prot_hook must be invoked with the po->bind_lock held,
+ * or from a context in which asynchronous accesses to the packet
+ * socket is not possible (packet_create()).
+ */
+static void register_prot_hook(struct sock *sk)
+{
+ struct packet_sock *po = pkt_sk(sk);
+
+ if (!po->running) {
+ if (po->fanout)
+ __fanout_link(sk, po);
+ else
+ dev_add_pack(&po->prot_hook);
+
+ sock_hold(sk);
+ po->running = 1;
+ }
+}
+
+/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
+ * held. If the sync parameter is true, we will temporarily drop
+ * the po->bind_lock and do a synchronize_net to make sure no
+ * asynchronous packet processing paths still refer to the elements
+ * of po->prot_hook. If the sync parameter is false, it is the
+ * callers responsibility to take care of this.
+ */
+static void __unregister_prot_hook(struct sock *sk, bool sync)
+{
+ struct packet_sock *po = pkt_sk(sk);
+
+ po->running = 0;
+
+ if (po->fanout)
+ __fanout_unlink(sk, po);
+ else
+ __dev_remove_pack(&po->prot_hook);
+
+ __sock_put(sk);
+
+ if (sync) {
+ spin_unlock(&po->bind_lock);
+ synchronize_net();
+ spin_lock(&po->bind_lock);
+ }
+}
+
+static void unregister_prot_hook(struct sock *sk, bool sync)
+{
+ struct packet_sock *po = pkt_sk(sk);
+
+ if (po->running)
+ __unregister_prot_hook(sk, sync);
+}
+
+static inline struct page * __pure pgv_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
+static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+{
+ union tpacket_uhdr h;
+
+ h.raw = frame;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ h.h1->tp_status = status;
+ flush_dcache_page(pgv_to_page(&h.h1->tp_status));
+ break;
+ case TPACKET_V2:
+ h.h2->tp_status = status;
+ flush_dcache_page(pgv_to_page(&h.h2->tp_status));
+ break;
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ }
+
+ smp_wmb();
+}
+
+static int __packet_get_status(struct packet_sock *po, void *frame)
+{
+ union tpacket_uhdr h;
+
+ smp_rmb();
+
+ h.raw = frame;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ flush_dcache_page(pgv_to_page(&h.h1->tp_status));
+ return h.h1->tp_status;
+ case TPACKET_V2:
+ flush_dcache_page(pgv_to_page(&h.h2->tp_status));
+ return h.h2->tp_status;
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ return 0;
+ }
+}
+
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
+ unsigned int flags)
+{
+ struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+
+ if (shhwtstamps &&
+ (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
+ return TP_STATUS_TS_RAW_HARDWARE;
+
+ if (ktime_to_timespec_cond(skb->tstamp, ts))
+ return TP_STATUS_TS_SOFTWARE;
+
+ return 0;
+}
+
+static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
+ struct sk_buff *skb)
+{
+ union tpacket_uhdr h;
+ struct timespec ts;
+ __u32 ts_status;
+
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ return 0;
+
+ h.raw = frame;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ h.h1->tp_sec = ts.tv_sec;
+ h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ break;
+ case TPACKET_V2:
+ h.h2->tp_sec = ts.tv_sec;
+ h.h2->tp_nsec = ts.tv_nsec;
+ break;
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ }
+
+ /* one flush is safe, as both fields always lie on the same cacheline */
+ flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
+ smp_wmb();
+
+ return ts_status;
+}
+
+static void *packet_lookup_frame(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ unsigned int position,
+ int status)
+{
+ unsigned int pg_vec_pos, frame_offset;
+ union tpacket_uhdr h;
+
+ pg_vec_pos = position / rb->frames_per_block;
+ frame_offset = position % rb->frames_per_block;
+
+ h.raw = rb->pg_vec[pg_vec_pos].buffer +
+ (frame_offset * rb->frame_size);
+
+ if (status != __packet_get_status(po, h.raw))
+ return NULL;
+
+ return h.raw;
+}
+
+static void *packet_current_frame(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ int status)
+{
+ return packet_lookup_frame(po, rb, rb->head, status);
+}
+
+static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
+{
+ del_timer_sync(&pkc->retire_blk_timer);
+}
+
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
+ int tx_ring,
+ struct sk_buff_head *rb_queue)
+{
+ struct tpacket_kbdq_core *pkc;
+
+ pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+ GET_PBDQC_FROM_RB(&po->rx_ring);
+
+ spin_lock_bh(&rb_queue->lock);
+ pkc->delete_blk_timer = 1;
+ spin_unlock_bh(&rb_queue->lock);
+
+ prb_del_retire_blk_timer(pkc);
+}
+
+static void prb_init_blk_timer(struct packet_sock *po,
+ struct tpacket_kbdq_core *pkc,
+ void (*func) (unsigned long))
+{
+ init_timer(&pkc->retire_blk_timer);
+ pkc->retire_blk_timer.data = (long)po;
+ pkc->retire_blk_timer.function = func;
+ pkc->retire_blk_timer.expires = jiffies;
+}
+
+static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+{
+ struct tpacket_kbdq_core *pkc;
+
+ if (tx_ring)
+ BUG();
+
+ pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
+ GET_PBDQC_FROM_RB(&po->rx_ring);
+ prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
+}
+
+static int prb_calc_retire_blk_tmo(struct packet_sock *po,
+ int blk_size_in_bytes)
+{
+ struct net_device *dev;
+ unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+ struct ethtool_cmd ecmd;
+ int err;
+ u32 speed;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
+ if (unlikely(!dev)) {
+ rtnl_unlock();
+ return DEFAULT_PRB_RETIRE_TOV;
+ }
+ err = __ethtool_get_settings(dev, &ecmd);
+ speed = ethtool_cmd_speed(&ecmd);
+ rtnl_unlock();
+ if (!err) {
+ /*
+ * If the link speed is so slow you don't really
+ * need to worry about perf anyways
+ */
+ if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
+ return DEFAULT_PRB_RETIRE_TOV;
+ } else {
+ msec = 1;
+ div = speed / 1000;
+ }
+ }
+
+ mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
+
+ if (div)
+ mbits /= div;
+
+ tmo = mbits * msec;
+
+ if (div)
+ return tmo+1;
+ return tmo;
+}
+
+static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
+ union tpacket_req_u *req_u)
+{
+ p1->feature_req_word = req_u->req3.tp_feature_req_word;
+}
+
+static void init_prb_bdqc(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ struct pgv *pg_vec,
+ union tpacket_req_u *req_u, int tx_ring)
+{
+ struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
+ struct tpacket_block_desc *pbd;
+
+ memset(p1, 0x0, sizeof(*p1));
+
+ p1->knxt_seq_num = 1;
+ p1->pkbdq = pg_vec;
+ pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
+ p1->pkblk_start = pg_vec[0].buffer;
+ p1->kblk_size = req_u->req3.tp_block_size;
+ p1->knum_blocks = req_u->req3.tp_block_nr;
+ p1->hdrlen = po->tp_hdrlen;
+ p1->version = po->tp_version;
+ p1->last_kactive_blk_num = 0;
+ po->stats.stats3.tp_freeze_q_cnt = 0;
+ if (req_u->req3.tp_retire_blk_tov)
+ p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+ else
+ p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
+ req_u->req3.tp_block_size);
+ p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+ p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+
+ p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
+ prb_init_ft_ops(p1, req_u);
+ prb_setup_retire_blk_timer(po, tx_ring);
+ prb_open_block(p1, pbd);
+}
+
+/* Do NOT update the last_blk_num first.
+ * Assumes sk_buff_head lock is held.
+ */
+static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
+{
+ mod_timer(&pkc->retire_blk_timer,
+ jiffies + pkc->tov_in_jiffies);
+ pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+}
+
+/*
+ * Timer logic:
+ * 1) We refresh the timer only when we open a block.
+ * By doing this we don't waste cycles refreshing the timer
+ * on packet-by-packet basis.
+ *
+ * With a 1MB block-size, on a 1Gbps line, it will take
+ * i) ~8 ms to fill a block + ii) memcpy etc.
+ * In this cut we are not accounting for the memcpy time.
+ *
+ * So, if the user sets the 'tmo' to 10ms then the timer
+ * will never fire while the block is still getting filled
+ * (which is what we want). However, the user could choose
+ * to close a block early and that's fine.
+ *
+ * But when the timer does fire, we check whether or not to refresh it.
+ * Since the tmo granularity is in msecs, it is not too expensive
+ * to refresh the timer, lets say every '8' msecs.
+ * Either the user can set the 'tmo' or we can derive it based on
+ * a) line-speed and b) block-size.
+ * prb_calc_retire_blk_tmo() calculates the tmo.
+ *
+ */
+static void prb_retire_rx_blk_timer_expired(unsigned long data)
+{
+ struct packet_sock *po = (struct packet_sock *)data;
+ struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
+ unsigned int frozen;
+ struct tpacket_block_desc *pbd;
+
+ spin_lock(&po->sk.sk_receive_queue.lock);
+
+ frozen = prb_queue_frozen(pkc);
+ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+ if (unlikely(pkc->delete_blk_timer))
+ goto out;
+
+ /* We only need to plug the race when the block is partially filled.
+ * tpacket_rcv:
+ * lock(); increment BLOCK_NUM_PKTS; unlock()
+ * copy_bits() is in progress ...
+ * timer fires on other cpu:
+ * we can't retire the current block because copy_bits
+ * is in progress.
+ *
+ */
+ if (BLOCK_NUM_PKTS(pbd)) {
+ while (atomic_read(&pkc->blk_fill_in_prog)) {
+ /* Waiting for skb_copy_bits to finish... */
+ cpu_relax();
+ }
+ }
+
+ if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
+ if (!frozen) {
+ if (!BLOCK_NUM_PKTS(pbd)) {
+ /* An empty block. Just refresh the timer. */
+ goto refresh_timer;
+ }
+ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+ if (!prb_dispatch_next_block(pkc, po))
+ goto refresh_timer;
+ else
+ goto out;
+ } else {
+ /* Case 1. Queue was frozen because user-space was
+ * lagging behind.
+ */
+ if (prb_curr_blk_in_use(pkc, pbd)) {
+ /*
+ * Ok, user-space is still behind.
+ * So just refresh the timer.
+ */
+ goto refresh_timer;
+ } else {
+ /* Case 2. queue was frozen,user-space caught up,
+ * now the link went idle && the timer fired.
+ * We don't have a block to close.So we open this
+ * block and restart the timer.
+ * opening a block thaws the queue,restarts timer
+ * Thawing/timer-refresh is a side effect.
+ */
+ prb_open_block(pkc, pbd);
+ goto out;
+ }
+ }
+ }
+
+refresh_timer:
+ _prb_refresh_rx_retire_blk_timer(pkc);
+
+out:
+ spin_unlock(&po->sk.sk_receive_queue.lock);
+}
+
+static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
+ struct tpacket_block_desc *pbd1, __u32 status)
+{
+ /* Flush everything minus the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+ u8 *start, *end;
+
+ start = (u8 *)pbd1;
+
+ /* Skip the block header(we know header WILL fit in 4K) */
+ start += PAGE_SIZE;
+
+ end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
+ for (; start < end; start += PAGE_SIZE)
+ flush_dcache_page(pgv_to_page(start));
+
+ smp_wmb();
+#endif
+
+ /* Now update the block status. */
+
+ BLOCK_STATUS(pbd1) = status;
+
+ /* Flush the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+ start = (u8 *)pbd1;
+ flush_dcache_page(pgv_to_page(start));
+
+ smp_wmb();
+#endif
+}
+
+/*
+ * Side effect:
+ *
+ * 1) flush the block
+ * 2) Increment active_blk_num
+ *
+ * Note:We DONT refresh the timer on purpose.
+ * Because almost always the next block will be opened.
+ */
+static void prb_close_block(struct tpacket_kbdq_core *pkc1,
+ struct tpacket_block_desc *pbd1,
+ struct packet_sock *po, unsigned int stat)
+{
+ __u32 status = TP_STATUS_USER | stat;
+
+ struct tpacket3_hdr *last_pkt;
+ struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
+ struct sock *sk = &po->sk;
+
+ if (po->stats.stats3.tp_drops)
+ status |= TP_STATUS_LOSING;
+
+ last_pkt = (struct tpacket3_hdr *)pkc1->prev;
+ last_pkt->tp_next_offset = 0;
+
+ /* Get the ts of the last pkt */
+ if (BLOCK_NUM_PKTS(pbd1)) {
+ h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
+ h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
+ } else {
+ /* Ok, we tmo'd - so get the current time.
+ *
+ * It shouldn't really happen as we don't close empty
+ * blocks. See prb_retire_rx_blk_timer_expired().
+ */
+ struct timespec ts;
+ getnstimeofday(&ts);
+ h1->ts_last_pkt.ts_sec = ts.tv_sec;
+ h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
+ }
+
+ smp_wmb();
+
+ /* Flush the block */
+ prb_flush_block(pkc1, pbd1, status);
+
+ sk->sk_data_ready(sk);
+
+ pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
+}
+
+static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
+{
+ pkc->reset_pending_on_curr_blk = 0;
+}
+
+/*
+ * Side effect of opening a block:
+ *
+ * 1) prb_queue is thawed.
+ * 2) retire_blk_timer is refreshed.
+ *
+ */
+static void prb_open_block(struct tpacket_kbdq_core *pkc1,
+ struct tpacket_block_desc *pbd1)
+{
+ struct timespec ts;
+ struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+ smp_rmb();
+
+ /* We could have just memset this but we will lose the
+ * flexibility of making the priv area sticky
+ */
+
+ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+ BLOCK_NUM_PKTS(pbd1) = 0;
+ BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+
+ getnstimeofday(&ts);
+
+ h1->ts_first_pkt.ts_sec = ts.tv_sec;
+ h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+
+ pkc1->pkblk_start = (char *)pbd1;
+ pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+
+ BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+ BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+
+ pbd1->version = pkc1->version;
+ pkc1->prev = pkc1->nxt_offset;
+ pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+
+ prb_thaw_queue(pkc1);
+ _prb_refresh_rx_retire_blk_timer(pkc1);
+
+ smp_wmb();
+}
+
+/*
+ * Queue freeze logic:
+ * 1) Assume tp_block_nr = 8 blocks.
+ * 2) At time 't0', user opens Rx ring.
+ * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
+ * 4) user-space is either sleeping or processing block '0'.
+ * 5) tpacket_rcv is currently filling block '7', since there is no space left,
+ * it will close block-7,loop around and try to fill block '0'.
+ * call-flow:
+ * __packet_lookup_frame_in_block
+ * prb_retire_current_block()
+ * prb_dispatch_next_block()
+ * |->(BLOCK_STATUS == USER) evaluates to true
+ * 5.1) Since block-0 is currently in-use, we just freeze the queue.
+ * 6) Now there are two cases:
+ * 6.1) Link goes idle right after the queue is frozen.
+ * But remember, the last open_block() refreshed the timer.
+ * When this timer expires,it will refresh itself so that we can
+ * re-open block-0 in near future.
+ * 6.2) Link is busy and keeps on receiving packets. This is a simple
+ * case and __packet_lookup_frame_in_block will check if block-0
+ * is free and can now be re-used.
+ */
+static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
+ struct packet_sock *po)
+{
+ pkc->reset_pending_on_curr_blk = 1;
+ po->stats.stats3.tp_freeze_q_cnt++;
+}
+
+#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
+
+/*
+ * If the next block is free then we will dispatch it
+ * and return a good offset.
+ * Else, we will freeze the queue.
+ * So, caller must check the return value.
+ */
+static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
+ struct packet_sock *po)
+{
+ struct tpacket_block_desc *pbd;
+
+ smp_rmb();
+
+ /* 1. Get current block num */
+ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+ /* 2. If this block is currently in_use then freeze the queue */
+ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
+ prb_freeze_queue(pkc, po);
+ return NULL;
+ }
+
+ /*
+ * 3.
+ * open this block and return the offset where the first packet
+ * needs to get stored.
+ */
+ prb_open_block(pkc, pbd);
+ return (void *)pkc->nxt_offset;
+}
+
+static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
+ struct packet_sock *po, unsigned int status)
+{
+ struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+ /* retire/close the current block */
+ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
+ /*
+ * Plug the case where copy_bits() is in progress on
+ * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
+ * have space to copy the pkt in the current block and
+ * called prb_retire_current_block()
+ *
+ * We don't need to worry about the TMO case because
+ * the timer-handler already handled this case.
+ */
+ if (!(status & TP_STATUS_BLK_TMO)) {
+ while (atomic_read(&pkc->blk_fill_in_prog)) {
+ /* Waiting for skb_copy_bits to finish... */
+ cpu_relax();
+ }
+ }
+ prb_close_block(pkc, pbd, po, status);
+ return;
+ }
+}
+
+static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
+ struct tpacket_block_desc *pbd)
+{
+ return TP_STATUS_USER & BLOCK_STATUS(pbd);
+}
+
+static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
+{
+ return pkc->reset_pending_on_curr_blk;
+}
+
+static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+{
+ struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
+ atomic_dec(&pkc->blk_fill_in_prog);
+}
+
+static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
+ struct tpacket3_hdr *ppd)
+{
+ ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
+}
+
+static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
+ struct tpacket3_hdr *ppd)
+{
+ ppd->hv1.tp_rxhash = 0;
+}
+
+static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
+ struct tpacket3_hdr *ppd)
+{
+ if (skb_vlan_tag_present(pkc->skb)) {
+ ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
+ ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
+ ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+ } else {
+ ppd->hv1.tp_vlan_tci = 0;
+ ppd->hv1.tp_vlan_tpid = 0;
+ ppd->tp_status = TP_STATUS_AVAILABLE;
+ }
+}
+
+static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
+ struct tpacket3_hdr *ppd)
+{
+ ppd->hv1.tp_padding = 0;
+ prb_fill_vlan_info(pkc, ppd);
+
+ if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
+ prb_fill_rxhash(pkc, ppd);
+ else
+ prb_clear_rxhash(pkc, ppd);
+}
+
+static void prb_fill_curr_block(char *curr,
+ struct tpacket_kbdq_core *pkc,
+ struct tpacket_block_desc *pbd,
+ unsigned int len)
+{
+ struct tpacket3_hdr *ppd;
+
+ ppd = (struct tpacket3_hdr *)curr;
+ ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
+ pkc->prev = curr;
+ pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
+ BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
+ BLOCK_NUM_PKTS(pbd) += 1;
+ atomic_inc(&pkc->blk_fill_in_prog);
+ prb_run_all_ft_ops(pkc, ppd);
+}
+
+/* Assumes caller has the sk->rx_queue.lock */
+static void *__packet_lookup_frame_in_block(struct packet_sock *po,
+ struct sk_buff *skb,
+ int status,
+ unsigned int len
+ )
+{
+ struct tpacket_kbdq_core *pkc;
+ struct tpacket_block_desc *pbd;
+ char *curr, *end;
+
+ pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
+ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+ /* Queue is frozen when user space is lagging behind */
+ if (prb_queue_frozen(pkc)) {
+ /*
+ * Check if that last block which caused the queue to freeze,
+ * is still in_use by user-space.
+ */
+ if (prb_curr_blk_in_use(pkc, pbd)) {
+ /* Can't record this packet */
+ return NULL;
+ } else {
+ /*
+ * Ok, the block was released by user-space.
+ * Now let's open that block.
+ * opening a block also thaws the queue.
+ * Thawing is a side effect.
+ */
+ prb_open_block(pkc, pbd);
+ }
+ }
+
+ smp_mb();
+ curr = pkc->nxt_offset;
+ pkc->skb = skb;
+ end = (char *)pbd + pkc->kblk_size;
+
+ /* first try the current block */
+ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
+ prb_fill_curr_block(curr, pkc, pbd, len);
+ return (void *)curr;
+ }
+
+ /* Ok, close the current block */
+ prb_retire_current_block(pkc, po, 0);
+
+ /* Now, try to dispatch the next block */
+ curr = (char *)prb_dispatch_next_block(pkc, po);
+ if (curr) {
+ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+ prb_fill_curr_block(curr, pkc, pbd, len);
+ return (void *)curr;
+ }
+
+ /*
+ * No free blocks are available.user_space hasn't caught up yet.
+ * Queue was just frozen and now this packet will get dropped.
+ */
+ return NULL;
+}
+
+static void *packet_current_rx_frame(struct packet_sock *po,
+ struct sk_buff *skb,
+ int status, unsigned int len)
+{
+ char *curr = NULL;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ curr = packet_lookup_frame(po, &po->rx_ring,
+ po->rx_ring.head, status);
+ return curr;
+ case TPACKET_V3:
+ return __packet_lookup_frame_in_block(po, skb, status, len);
+ default:
+ WARN(1, "TPACKET version not supported\n");
+ BUG();
+ return NULL;
+ }
+}
+
+static void *prb_lookup_block(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ unsigned int idx,
+ int status)
+{
+ struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
+ struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
+
+ if (status != BLOCK_STATUS(pbd))
+ return NULL;
+ return pbd;
+}
+
+static int prb_previous_blk_num(struct packet_ring_buffer *rb)
+{
+ unsigned int prev;
+ if (rb->prb_bdqc.kactive_blk_num)
+ prev = rb->prb_bdqc.kactive_blk_num-1;
+ else
+ prev = rb->prb_bdqc.knum_blocks-1;
+ return prev;
+}
+
+/* Assumes caller has held the rx_queue.lock */
+static void *__prb_previous_block(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ int status)
+{
+ unsigned int previous = prb_previous_blk_num(rb);
+ return prb_lookup_block(po, rb, previous, status);
+}
+
+static void *packet_previous_rx_frame(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ int status)
+{
+ if (po->tp_version <= TPACKET_V2)
+ return packet_previous_frame(po, rb, status);
+
+ return __prb_previous_block(po, rb, status);
+}
+
+static void packet_increment_rx_head(struct packet_sock *po,
+ struct packet_ring_buffer *rb)
+{
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ return packet_increment_head(rb);
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ return;
+ }
+}
+
+static void *packet_previous_frame(struct packet_sock *po,
+ struct packet_ring_buffer *rb,
+ int status)
+{
+ unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
+ return packet_lookup_frame(po, rb, previous, status);
+}
+
+static void packet_increment_head(struct packet_ring_buffer *buff)
+{
+ buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
+}
+
+static void packet_inc_pending(struct packet_ring_buffer *rb)
+{
+ this_cpu_inc(*rb->pending_refcnt);
+}
+
+static void packet_dec_pending(struct packet_ring_buffer *rb)
+{
+ this_cpu_dec(*rb->pending_refcnt);
+}
+
+static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
+{
+ unsigned int refcnt = 0;
+ int cpu;
+
+ /* We don't use pending refcount in rx_ring. */
+ if (rb->pending_refcnt == NULL)
+ return 0;
+
+ for_each_possible_cpu(cpu)
+ refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
+
+ return refcnt;
+}
+
+static int packet_alloc_pending(struct packet_sock *po)
+{
+ po->rx_ring.pending_refcnt = NULL;
+
+ po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
+ if (unlikely(po->tx_ring.pending_refcnt == NULL))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static void packet_free_pending(struct packet_sock *po)
+{
+ free_percpu(po->tx_ring.pending_refcnt);
+}
+
+static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+ struct sock *sk = &po->sk;
+ bool has_room;
+
+ if (po->prot_hook.func != tpacket_rcv)
+ return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
+ <= sk->sk_rcvbuf;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ if (po->tp_version == TPACKET_V3)
+ has_room = prb_lookup_block(po, &po->rx_ring,
+ po->rx_ring.prb_bdqc.kactive_blk_num,
+ TP_STATUS_KERNEL);
+ else
+ has_room = packet_lookup_frame(po, &po->rx_ring,
+ po->rx_ring.head,
+ TP_STATUS_KERNEL);
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ return has_room;
+}
+
+static void packet_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_error_queue);
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive packet socket: %p\n", sk);
+ return;
+ }
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static unsigned int fanout_demux_hash(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return reciprocal_scale(skb_get_hash(skb), num);
+}
+
+static unsigned int fanout_demux_lb(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ unsigned int val = atomic_inc_return(&f->rr_cur);
+
+ return val % num;
+}
+
+static unsigned int fanout_demux_cpu(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return smp_processor_id() % num;
+}
+
+static unsigned int fanout_demux_rnd(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return prandom_u32_max(num);
+}
+
+static unsigned int fanout_demux_rollover(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int idx, unsigned int skip,
+ unsigned int num)
+{
+ unsigned int i, j;
+
+ i = j = min_t(int, f->next[idx], num - 1);
+ do {
+ if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ if (i != j)
+ f->next[idx] = i;
+ return i;
+ }
+ if (++i == num)
+ i = 0;
+ } while (i != j);
+
+ return idx;
+}
+
+static unsigned int fanout_demux_qm(struct packet_fanout *f,
+ struct sk_buff *skb,
+ unsigned int num)
+{
+ return skb_get_queue_mapping(skb) % num;
+}
+
+static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
+{
+ return f->flags & (flag >> 8);
+}
+
+static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct packet_fanout *f = pt->af_packet_priv;
+ unsigned int num = READ_ONCE(f->num_members);
+ struct packet_sock *po;
+ unsigned int idx;
+
+ if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
+ !num) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
+ skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
+ if (!skb)
+ return 0;
+ }
+ switch (f->type) {
+ case PACKET_FANOUT_HASH:
+ default:
+ idx = fanout_demux_hash(f, skb, num);
+ break;
+ case PACKET_FANOUT_LB:
+ idx = fanout_demux_lb(f, skb, num);
+ break;
+ case PACKET_FANOUT_CPU:
+ idx = fanout_demux_cpu(f, skb, num);
+ break;
+ case PACKET_FANOUT_RND:
+ idx = fanout_demux_rnd(f, skb, num);
+ break;
+ case PACKET_FANOUT_QM:
+ idx = fanout_demux_qm(f, skb, num);
+ break;
+ case PACKET_FANOUT_ROLLOVER:
+ idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
+ break;
+ }
+
+ po = pkt_sk(f->arr[idx]);
+ if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
+ unlikely(!packet_rcv_has_room(po, skb))) {
+ idx = fanout_demux_rollover(f, skb, idx, idx, num);
+ po = pkt_sk(f->arr[idx]);
+ }
+
+ return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
+}
+
+DEFINE_MUTEX(fanout_mutex);
+EXPORT_SYMBOL_GPL(fanout_mutex);
+static LIST_HEAD(fanout_list);
+
+static void __fanout_link(struct sock *sk, struct packet_sock *po)
+{
+ struct packet_fanout *f = po->fanout;
+
+ spin_lock(&f->lock);
+ f->arr[f->num_members] = sk;
+ smp_wmb();
+ f->num_members++;
+ spin_unlock(&f->lock);
+}
+
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
+{
+ struct packet_fanout *f = po->fanout;
+ int i;
+
+ spin_lock(&f->lock);
+ for (i = 0; i < f->num_members; i++) {
+ if (f->arr[i] == sk)
+ break;
+ }
+ BUG_ON(i >= f->num_members);
+ f->arr[i] = f->arr[f->num_members - 1];
+ f->num_members--;
+ spin_unlock(&f->lock);
+}
+
+static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
+{
+ if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
+ return true;
+
+ return false;
+}
+
+static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_fanout *f, *match;
+ u8 type = type_flags & 0xff;
+ u8 flags = type_flags >> 8;
+ int err;
+
+ switch (type) {
+ case PACKET_FANOUT_ROLLOVER:
+ if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
+ return -EINVAL;
+ case PACKET_FANOUT_HASH:
+ case PACKET_FANOUT_LB:
+ case PACKET_FANOUT_CPU:
+ case PACKET_FANOUT_RND:
+ case PACKET_FANOUT_QM:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (!po->running)
+ return -EINVAL;
+
+ if (po->fanout)
+ return -EALREADY;
+
+ mutex_lock(&fanout_mutex);
+ match = NULL;
+ list_for_each_entry(f, &fanout_list, list) {
+ if (f->id == id &&
+ read_pnet(&f->net) == sock_net(sk)) {
+ match = f;
+ break;
+ }
+ }
+ err = -EINVAL;
+ if (match && match->flags != flags)
+ goto out;
+ if (!match) {
+ err = -ENOMEM;
+ match = kzalloc(sizeof(*match), GFP_KERNEL);
+ if (!match)
+ goto out;
+ write_pnet(&match->net, sock_net(sk));
+ match->id = id;
+ match->type = type;
+ match->flags = flags;
+ atomic_set(&match->rr_cur, 0);
+ INIT_LIST_HEAD(&match->list);
+ spin_lock_init(&match->lock);
+ atomic_set(&match->sk_ref, 0);
+ match->prot_hook.type = po->prot_hook.type;
+ match->prot_hook.dev = po->prot_hook.dev;
+ match->prot_hook.func = packet_rcv_fanout;
+ match->prot_hook.af_packet_priv = match;
+ match->prot_hook.id_match = match_fanout_group;
+ dev_add_pack(&match->prot_hook);
+ list_add(&match->list, &fanout_list);
+ }
+ err = -EINVAL;
+ if (match->type == type &&
+ match->prot_hook.type == po->prot_hook.type &&
+ match->prot_hook.dev == po->prot_hook.dev) {
+ err = -ENOSPC;
+ if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+ __dev_remove_pack(&po->prot_hook);
+ po->fanout = match;
+ atomic_inc(&match->sk_ref);
+ __fanout_link(sk, po);
+ err = 0;
+ }
+ }
+out:
+ mutex_unlock(&fanout_mutex);
+ return err;
+}
+
+static void fanout_release(struct sock *sk)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_fanout *f;
+
+ f = po->fanout;
+ if (!f)
+ return;
+
+ mutex_lock(&fanout_mutex);
+ po->fanout = NULL;
+
+ if (atomic_dec_and_test(&f->sk_ref)) {
+ list_del(&f->list);
+ dev_remove_pack(&f->prot_hook);
+ kfree(f);
+ }
+ mutex_unlock(&fanout_mutex);
+}
+
+static const struct proto_ops packet_ops;
+
+static const struct proto_ops packet_ops_spkt;
+
+static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct sock *sk;
+ struct sockaddr_pkt *spkt;
+
+ /*
+ * When we registered the protocol we saved the socket in the data
+ * field for just this event.
+ */
+
+ sk = pt->af_packet_priv;
+
+ /*
+ * Yank back the headers [hope the device set this
+ * right or kerboom...]
+ *
+ * Incoming packets have ll header pulled,
+ * push it back.
+ *
+ * For outgoing ones skb->data == skb_mac_header(skb)
+ * so that this procedure is noop.
+ */
+
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto out;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ goto out;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ goto oom;
+
+ /* drop any routing info */
+ skb_dst_drop(skb);
+
+ /* drop conntrack reference */
+ nf_reset(skb);
+
+ spkt = &PACKET_SKB_CB(skb)->sa.pkt;
+
+ skb_push(skb, skb->data - skb_mac_header(skb));
+
+ /*
+ * The SOCK_PACKET socket receives _all_ frames.
+ */
+
+ spkt->spkt_family = dev->type;
+ strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
+ spkt->spkt_protocol = skb->protocol;
+
+ /*
+ * Charge the memory to the socket. This is done specifically
+ * to prevent sockets using all the memory up.
+ */
+
+ if (sock_queue_rcv_skb(sk, skb) == 0)
+ return 0;
+
+out:
+ kfree_skb(skb);
+oom:
+ return 0;
+}
+
+
+/*
+ * Output a raw packet to a device layer. This bypasses all the other
+ * protocol layers and you must therefore supply it with a complete frame
+ */
+
+static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
+ struct sk_buff *skb = NULL;
+ struct net_device *dev;
+ __be16 proto = 0;
+ int err;
+ int extra_len = 0;
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (saddr) {
+ if (msg->msg_namelen < sizeof(struct sockaddr))
+ return -EINVAL;
+ if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
+ proto = saddr->spkt_protocol;
+ } else
+ return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
+
+ /*
+ * Find the device first to size check it
+ */
+
+ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
+retry:
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
+ err = -ENODEV;
+ if (dev == NULL)
+ goto out_unlock;
+
+ err = -ENETDOWN;
+ if (!(dev->flags & IFF_UP))
+ goto out_unlock;
+
+ /*
+ * You may not queue a frame bigger than the mtu. This is the lowest level
+ * raw protocol and you must do your own fragmentation at this level.
+ */
+
+ if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
+ if (!netif_supports_nofcs(dev)) {
+ err = -EPROTONOSUPPORT;
+ goto out_unlock;
+ }
+ extra_len = 4; /* We're doing our own CRC */
+ }
+
+ err = -EMSGSIZE;
+ if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
+ goto out_unlock;
+
+ if (!skb) {
+ size_t reserved = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
+
+ rcu_read_unlock();
+ skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
+ if (skb == NULL)
+ return -ENOBUFS;
+ /* FIXME: Save some space for broken drivers that write a hard
+ * header at transmission time by themselves. PPP is the notable
+ * one here. This should really be fixed at the driver level.
+ */
+ skb_reserve(skb, reserved);
+ skb_reset_network_header(skb);
+
+ /* Try to align data part correctly */
+ if (hhlen) {
+ skb->data -= hhlen;
+ skb->tail -= hhlen;
+ if (len < hhlen)
+ skb_reset_network_header(skb);
+ }
+ err = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (err)
+ goto out_free;
+ goto retry;
+ }
+
+ if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_unlock;
+ }
+ }
+
+ skb->protocol = proto;
+ skb->dev = dev;
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
+ if (unlikely(extra_len == 4))
+ skb->no_fcs = 1;
+
+ skb_probe_transport_header(skb, 0);
+
+ dev_queue_xmit(skb);
+ rcu_read_unlock();
+ return len;
+
+out_unlock:
+ rcu_read_unlock();
+out_free:
+ kfree_skb(skb);
+ return err;
+}
+
+static unsigned int run_filter(const struct sk_buff *skb,
+ const struct sock *sk,
+ unsigned int res)
+{
+ struct sk_filter *filter;
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ res = SK_RUN_FILTER(filter, skb);
+ rcu_read_unlock();
+
+ return res;
+}
+
+/*
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
+ */
+
+static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct sock *sk;
+ struct sockaddr_ll *sll;
+ struct packet_sock *po;
+ u8 *skb_head = skb->data;
+ int skb_len = skb->len;
+ unsigned int snaplen, res;
+
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto drop;
+
+ sk = pt->af_packet_priv;
+ po = pkt_sk(sk);
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ goto drop;
+
+ skb->dev = dev;
+
+ if (dev->header_ops) {
+ /* The device has an explicit notion of ll header,
+ * exported to higher levels.
+ *
+ * Otherwise, the device hides details of its frame
+ * structure, so that corresponding packet head is
+ * never delivered to user.
+ */
+ if (sk->sk_type != SOCK_DGRAM)
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else if (skb->pkt_type == PACKET_OUTGOING) {
+ /* Special case: outgoing packets have ll header at head */
+ skb_pull(skb, skb_network_offset(skb));
+ }
+ }
+
+ snaplen = skb->len;
+
+ res = run_filter(skb, sk, snaplen);
+ if (!res)
+ goto drop_n_restore;
+ if (snaplen > res)
+ snaplen = res;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ goto drop_n_acct;
+
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+ if (nskb == NULL)
+ goto drop_n_acct;
+
+ if (skb_head != skb->data) {
+ skb->data = skb_head;
+ skb->len = skb_len;
+ }
+ consume_skb(skb);
+ skb = nskb;
+ }
+
+ sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
+
+ sll = &PACKET_SKB_CB(skb)->sa.ll;
+ sll->sll_hatype = dev->type;
+ sll->sll_pkttype = skb->pkt_type;
+ if (unlikely(po->origdev))
+ sll->sll_ifindex = orig_dev->ifindex;
+ else
+ sll->sll_ifindex = dev->ifindex;
+
+ sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
+
+ /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
+ * Use their space for storing the original skb length.
+ */
+ PACKET_SKB_CB(skb)->sa.origlen = skb->len;
+
+ if (pskb_trim(skb, snaplen))
+ goto drop_n_acct;
+
+ skb_set_owner_r(skb, sk);
+ skb->dev = NULL;
+ skb_dst_drop(skb);
+
+ /* drop conntrack reference */
+ nf_reset(skb);
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ po->stats.stats1.tp_packets++;
+ sock_skb_set_dropcount(sk, skb);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ spin_unlock(&sk->sk_receive_queue.lock);
+ sk->sk_data_ready(sk);
+ return 0;
+
+drop_n_acct:
+ spin_lock(&sk->sk_receive_queue.lock);
+ po->stats.stats1.tp_drops++;
+ atomic_inc(&sk->sk_drops);
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+drop_n_restore:
+ if (skb_head != skb->data && skb_shared(skb)) {
+ skb->data = skb_head;
+ skb->len = skb_len;
+ }
+drop:
+ consume_skb(skb);
+ return 0;
+}
+
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct sock *sk;
+ struct packet_sock *po;
+ struct sockaddr_ll *sll;
+ union tpacket_uhdr h;
+ u8 *skb_head = skb->data;
+ int skb_len = skb->len;
+ unsigned int snaplen, res;
+ unsigned long status = TP_STATUS_USER;
+ unsigned short macoff, netoff, hdrlen;
+ struct sk_buff *copy_skb = NULL;
+ struct timespec ts;
+ __u32 ts_status;
+
+ /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
+ * We may add members to them until current aligned size without forcing
+ * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
+ */
+ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
+ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
+
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ goto drop;
+
+ sk = pt->af_packet_priv;
+ po = pkt_sk(sk);
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ goto drop;
+
+ if (dev->header_ops) {
+ if (sk->sk_type != SOCK_DGRAM)
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else if (skb->pkt_type == PACKET_OUTGOING) {
+ /* Special case: outgoing packets have ll header at head */
+ skb_pull(skb, skb_network_offset(skb));
+ }
+ }
+
+ snaplen = skb->len;
+
+ res = run_filter(skb, sk, snaplen);
+ if (!res)
+ goto drop_n_restore;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ status |= TP_STATUS_CSUMNOTREADY;
+ else if (skb->pkt_type != PACKET_OUTGOING &&
+ (skb->ip_summed == CHECKSUM_COMPLETE ||
+ skb_csum_unnecessary(skb)))
+ status |= TP_STATUS_CSUM_VALID;
+
+ if (snaplen > res)
+ snaplen = res;
+
+ if (sk->sk_type == SOCK_DGRAM) {
+ macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
+ po->tp_reserve;
+ } else {
+ unsigned int maclen = skb_network_offset(skb);
+ netoff = TPACKET_ALIGN(po->tp_hdrlen +
+ (maclen < 16 ? 16 : maclen)) +
+ po->tp_reserve;
+ macoff = netoff - maclen;
+ }
+ if (po->tp_version <= TPACKET_V2) {
+ if (macoff + snaplen > po->rx_ring.frame_size) {
+ if (po->copy_thresh &&
+ atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
+ if (skb_shared(skb)) {
+ copy_skb = skb_clone(skb, GFP_ATOMIC);
+ } else {
+ copy_skb = skb_get(skb);
+ skb_head = skb->data;
+ }
+ if (copy_skb)
+ skb_set_owner_r(copy_skb, sk);
+ }
+ snaplen = po->rx_ring.frame_size - macoff;
+ if ((int)snaplen < 0)
+ snaplen = 0;
+ }
+ } else if (unlikely(macoff + snaplen >
+ GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
+ u32 nval;
+
+ nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
+ pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
+ snaplen, nval, macoff);
+ snaplen = nval;
+ if (unlikely((int)snaplen < 0)) {
+ snaplen = 0;
+ macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
+ }
+ }
+ spin_lock(&sk->sk_receive_queue.lock);
+ h.raw = packet_current_rx_frame(po, skb,
+ TP_STATUS_KERNEL, (macoff+snaplen));
+ if (!h.raw)
+ goto ring_is_full;
+ if (po->tp_version <= TPACKET_V2) {
+ packet_increment_rx_head(po, &po->rx_ring);
+ /*
+ * LOSING will be reported till you read the stats,
+ * because it's COR - Clear On Read.
+ * Anyways, moving it for V1/V2 only as V3 doesn't need this
+ * at packet level.
+ */
+ if (po->stats.stats1.tp_drops)
+ status |= TP_STATUS_LOSING;
+ }
+ po->stats.stats1.tp_packets++;
+ if (copy_skb) {
+ status |= TP_STATUS_COPY;
+ __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
+ }
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
+
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ getnstimeofday(&ts);
+
+ status |= ts_status;
+
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ h.h1->tp_len = skb->len;
+ h.h1->tp_snaplen = snaplen;
+ h.h1->tp_mac = macoff;
+ h.h1->tp_net = netoff;
+ h.h1->tp_sec = ts.tv_sec;
+ h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ hdrlen = sizeof(*h.h1);
+ break;
+ case TPACKET_V2:
+ h.h2->tp_len = skb->len;
+ h.h2->tp_snaplen = snaplen;
+ h.h2->tp_mac = macoff;
+ h.h2->tp_net = netoff;
+ h.h2->tp_sec = ts.tv_sec;
+ h.h2->tp_nsec = ts.tv_nsec;
+ if (skb_vlan_tag_present(skb)) {
+ h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
+ h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
+ status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+ } else {
+ h.h2->tp_vlan_tci = 0;
+ h.h2->tp_vlan_tpid = 0;
+ }
+ memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
+ hdrlen = sizeof(*h.h2);
+ break;
+ case TPACKET_V3:
+ /* tp_nxt_offset,vlan are already populated above.
+ * So DONT clear those fields here
+ */
+ h.h3->tp_status |= status;
+ h.h3->tp_len = skb->len;
+ h.h3->tp_snaplen = snaplen;
+ h.h3->tp_mac = macoff;
+ h.h3->tp_net = netoff;
+ h.h3->tp_sec = ts.tv_sec;
+ h.h3->tp_nsec = ts.tv_nsec;
+ memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
+ hdrlen = sizeof(*h.h3);
+ break;
+ default:
+ BUG();
+ }
+
+ sll = h.raw + TPACKET_ALIGN(hdrlen);
+ sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
+ sll->sll_family = AF_PACKET;
+ sll->sll_hatype = dev->type;
+ sll->sll_protocol = skb->protocol;
+ sll->sll_pkttype = skb->pkt_type;
+ if (unlikely(po->origdev))
+ sll->sll_ifindex = orig_dev->ifindex;
+ else
+ sll->sll_ifindex = dev->ifindex;
+
+ smp_mb();
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+ if (po->tp_version <= TPACKET_V2) {
+ u8 *start, *end;
+
+ end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
+ macoff + snaplen);
+
+ for (start = h.raw; start < end; start += PAGE_SIZE)
+ flush_dcache_page(pgv_to_page(start));
+ }
+ smp_wmb();
+#endif
+
+ if (po->tp_version <= TPACKET_V2) {
+ __packet_set_status(po, h.raw, status);
+ sk->sk_data_ready(sk);
+ } else {
+ prb_clear_blk_fill_status(&po->rx_ring);
+ }
+
+drop_n_restore:
+ if (skb_head != skb->data && skb_shared(skb)) {
+ skb->data = skb_head;
+ skb->len = skb_len;
+ }
+drop:
+ kfree_skb(skb);
+ return 0;
+
+ring_is_full:
+ po->stats.stats1.tp_drops++;
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ sk->sk_data_ready(sk);
+ kfree_skb(copy_skb);
+ goto drop_n_restore;
+}
+
+static void tpacket_destruct_skb(struct sk_buff *skb)
+{
+ struct packet_sock *po = pkt_sk(skb->sk);
+
+ if (likely(po->tx_ring.pg_vec)) {
+ void *ph;
+ __u32 ts;
+
+ ph = skb_shinfo(skb)->destructor_arg;
+ packet_dec_pending(&po->tx_ring);
+
+ ts = __packet_set_timestamp(po, ph, skb);
+ __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+ }
+
+ sock_wfree(skb);
+}
+
+static bool ll_header_truncated(const struct net_device *dev, int len)
+{
+ /* net device doesn't like empty head */
+ if (unlikely(len <= dev->hard_header_len)) {
+ net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
+ current->comm, len, dev->hard_header_len);
+ return true;
+ }
+
+ return false;
+}
+
+static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
+ void *frame, struct net_device *dev, int size_max,
+ __be16 proto, unsigned char *addr, int hlen)
+{
+ union tpacket_uhdr ph;
+ int to_write, offset, len, tp_len, nr_frags, len_max;
+ struct socket *sock = po->sk.sk_socket;
+ struct page *page;
+ void *data;
+ int err;
+
+ ph.raw = frame;
+
+ skb->protocol = proto;
+ skb->dev = dev;
+ skb->priority = po->sk.sk_priority;
+ skb->mark = po->sk.sk_mark;
+ sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
+ skb_shinfo(skb)->destructor_arg = ph.raw;
+
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ tp_len = ph.h2->tp_len;
+ break;
+ default:
+ tp_len = ph.h1->tp_len;
+ break;
+ }
+ if (unlikely(tp_len > size_max)) {
+ pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+ return -EMSGSIZE;
+ }
+
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+
+ if (!packet_use_direct_xmit(po))
+ skb_probe_transport_header(skb, 0);
+ if (unlikely(po->tp_tx_has_off)) {
+ int off_min, off_max, off;
+ off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ off_max = po->tx_ring.frame_size - tp_len;
+ if (sock->type == SOCK_DGRAM) {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_net;
+ break;
+ default:
+ off = ph.h1->tp_net;
+ break;
+ }
+ } else {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_mac;
+ break;
+ default:
+ off = ph.h1->tp_mac;
+ break;
+ }
+ }
+ if (unlikely((off < off_min) || (off_max < off)))
+ return -EINVAL;
+ data = ph.raw + off;
+ } else {
+ data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ }
+ to_write = tp_len;
+
+ if (sock->type == SOCK_DGRAM) {
+ err = dev_hard_header(skb, dev, ntohs(proto), addr,
+ NULL, tp_len);
+ if (unlikely(err < 0))
+ return -EINVAL;
+ } else if (dev->hard_header_len) {
+ if (ll_header_truncated(dev, tp_len))
+ return -EINVAL;
+
+ skb_push(skb, dev->hard_header_len);
+ err = skb_store_bits(skb, 0, data,
+ dev->hard_header_len);
+ if (unlikely(err))
+ return err;
+
+ data += dev->hard_header_len;
+ to_write -= dev->hard_header_len;
+ }
+
+ offset = offset_in_page(data);
+ len_max = PAGE_SIZE - offset;
+ len = ((to_write > len_max) ? len_max : to_write);
+
+ skb->data_len = to_write;
+ skb->len += to_write;
+ skb->truesize += to_write;
+ atomic_add(to_write, &po->sk.sk_wmem_alloc);
+
+ while (likely(to_write)) {
+ nr_frags = skb_shinfo(skb)->nr_frags;
+
+ if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
+ pr_err("Packet exceed the number of skb frags(%lu)\n",
+ MAX_SKB_FRAGS);
+ return -EFAULT;
+ }
+
+ page = pgv_to_page(data);
+ data += len;
+ flush_dcache_page(page);
+ get_page(page);
+ skb_fill_page_desc(skb, nr_frags, page, offset, len);
+ to_write -= len;
+ offset = 0;
+ len_max = PAGE_SIZE;
+ len = ((to_write > len_max) ? len_max : to_write);
+ }
+
+ return tp_len;
+}
+
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ __be16 proto;
+ int err, reserve = 0;
+ void *ph;
+ DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+ bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+ int tp_len, size_max;
+ unsigned char *addr;
+ int len_sum = 0;
+ int status = TP_STATUS_AVAILABLE;
+ int hlen, tlen;
+
+ mutex_lock(&po->pg_vec_lock);
+
+ if (likely(saddr == NULL)) {
+ dev = packet_cached_dev_get(po);
+ proto = po->num;
+ addr = NULL;
+ } else {
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+ goto out;
+ if (msg->msg_namelen < (saddr->sll_halen
+ + offsetof(struct sockaddr_ll,
+ sll_addr)))
+ goto out;
+ proto = saddr->sll_protocol;
+ addr = saddr->sll_addr;
+ dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
+ }
+
+ err = -ENXIO;
+ if (unlikely(dev == NULL))
+ goto out;
+ err = -ENETDOWN;
+ if (unlikely(!(dev->flags & IFF_UP)))
+ goto out_put;
+
+ reserve = dev->hard_header_len + VLAN_HLEN;
+ size_max = po->tx_ring.frame_size
+ - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
+
+ if (size_max > dev->mtu + reserve)
+ size_max = dev->mtu + reserve;
+
+ do {
+ ph = packet_current_frame(po, &po->tx_ring,
+ TP_STATUS_SEND_REQUEST);
+ if (unlikely(ph == NULL)) {
+ if (need_wait && need_resched())
+ schedule();
+ continue;
+ }
+
+ status = TP_STATUS_SEND_REQUEST;
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = sock_alloc_send_skb(&po->sk,
+ hlen + tlen + sizeof(struct sockaddr_ll),
+ !need_wait, &err);
+
+ if (unlikely(skb == NULL)) {
+ /* we assume the socket was initially writeable ... */
+ if (likely(len_sum > 0))
+ err = len_sum;
+ goto out_status;
+ }
+ tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
+ addr, hlen);
+ if (tp_len > dev->mtu + dev->hard_header_len) {
+ struct ethhdr *ehdr;
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q))
+ tp_len = -EMSGSIZE;
+ }
+ if (unlikely(tp_len < 0)) {
+ if (po->tp_loss) {
+ __packet_set_status(po, ph,
+ TP_STATUS_AVAILABLE);
+ packet_increment_head(&po->tx_ring);
+ kfree_skb(skb);
+ continue;
+ } else {
+ status = TP_STATUS_WRONG_FORMAT;
+ err = tp_len;
+ goto out_status;
+ }
+ }
+
+ packet_pick_tx_queue(dev, skb);
+
+ skb->destructor = tpacket_destruct_skb;
+ __packet_set_status(po, ph, TP_STATUS_SENDING);
+ packet_inc_pending(&po->tx_ring);
+
+ status = TP_STATUS_SEND_REQUEST;
+ err = po->xmit(skb);
+ if (unlikely(err > 0)) {
+ err = net_xmit_errno(err);
+ if (err && __packet_get_status(po, ph) ==
+ TP_STATUS_AVAILABLE) {
+ /* skb was destructed already */
+ skb = NULL;
+ goto out_status;
+ }
+ /*
+ * skb was dropped but not destructed yet;
+ * let's treat it like congestion or err < 0
+ */
+ err = 0;
+ }
+ packet_increment_head(&po->tx_ring);
+ len_sum += tp_len;
+ } while (likely((ph != NULL) ||
+ /* Note: packet_read_pending() might be slow if we have
+ * to call it as it's per_cpu variable, but in fast-path
+ * we already short-circuit the loop with the first
+ * condition, and luckily don't have to go that path
+ * anyway.
+ */
+ (need_wait && packet_read_pending(&po->tx_ring))));
+
+ err = len_sum;
+ goto out_put;
+
+out_status:
+ __packet_set_status(po, ph, status);
+ kfree_skb(skb);
+out_put:
+ dev_put(dev);
+out:
+ mutex_unlock(&po->pg_vec_lock);
+ return err;
+}
+
+static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
+ size_t reserve, size_t len,
+ size_t linear, int noblock,
+ int *err)
+{
+ struct sk_buff *skb;
+
+ /* Under a page? Don't bother with paged skb. */
+ if (prepad + len < PAGE_SIZE || !linear)
+ linear = len;
+
+ skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+ err, 0);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, reserve);
+ skb_put(skb, linear);
+ skb->data_len = len - linear;
+ skb->len += len - linear;
+
+ return skb;
+}
+
+static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+ struct sk_buff *skb;
+ struct net_device *dev;
+ __be16 proto;
+ unsigned char *addr;
+ int err, reserve = 0;
+ struct virtio_net_hdr vnet_hdr = { 0 };
+ int offset = 0;
+ int vnet_hdr_len;
+ struct packet_sock *po = pkt_sk(sk);
+ unsigned short gso_type = 0;
+ int hlen, tlen;
+ int extra_len = 0;
+ ssize_t n;
+
+ /*
+ * Get and verify the address.
+ */
+
+ if (likely(saddr == NULL)) {
+ dev = packet_cached_dev_get(po);
+ proto = po->num;
+ addr = NULL;
+ } else {
+ err = -EINVAL;
+ if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+ goto out;
+ if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
+ goto out;
+ proto = saddr->sll_protocol;
+ addr = saddr->sll_addr;
+ dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
+ }
+
+ err = -ENXIO;
+ if (unlikely(dev == NULL))
+ goto out_unlock;
+ err = -ENETDOWN;
+ if (unlikely(!(dev->flags & IFF_UP)))
+ goto out_unlock;
+
+ if (sock->type == SOCK_RAW)
+ reserve = dev->hard_header_len;
+ if (po->has_vnet_hdr) {
+ vnet_hdr_len = sizeof(vnet_hdr);
+
+ err = -EINVAL;
+ if (len < vnet_hdr_len)
+ goto out_unlock;
+
+ len -= vnet_hdr_len;
+
+ err = -EFAULT;
+ n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
+ if (n != vnet_hdr_len)
+ goto out_unlock;
+
+ if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
+ __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
+ __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
+ vnet_hdr.hdr_len = __cpu_to_virtio16(false,
+ __virtio16_to_cpu(false, vnet_hdr.csum_start) +
+ __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
+
+ err = -EINVAL;
+ if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
+ goto out_unlock;
+
+ if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ gso_type = SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ gso_type = SKB_GSO_TCPV6;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ goto out_unlock;
+ }
+
+ if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ gso_type |= SKB_GSO_TCP_ECN;
+
+ if (vnet_hdr.gso_size == 0)
+ goto out_unlock;
+
+ }
+ }
+
+ if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
+ if (!netif_supports_nofcs(dev)) {
+ err = -EPROTONOSUPPORT;
+ goto out_unlock;
+ }
+ extra_len = 4; /* We're doing our own CRC */
+ }
+
+ err = -EMSGSIZE;
+ if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
+ goto out_unlock;
+
+ err = -ENOBUFS;
+ hlen = LL_RESERVED_SPACE(dev);
+ tlen = dev->needed_tailroom;
+ skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
+ __virtio16_to_cpu(false, vnet_hdr.hdr_len),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ goto out_unlock;
+
+ skb_set_network_header(skb, reserve);
+
+ err = -EINVAL;
+ if (sock->type == SOCK_DGRAM) {
+ offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
+ if (unlikely(offset < 0))
+ goto out_free;
+ } else {
+ if (ll_header_truncated(dev, len))
+ goto out_free;
+ }
+
+ /* Returns -EFAULT on error */
+ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
+ if (err)
+ goto out_free;
+
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
+ if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
+ /* Earlier code assumed this would be a VLAN pkt,
+ * double-check this now that we have the actual
+ * packet in hand.
+ */
+ struct ethhdr *ehdr;
+ skb_reset_mac_header(skb);
+ ehdr = eth_hdr(skb);
+ if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+ }
+
+ skb->protocol = proto;
+ skb->dev = dev;
+ skb->priority = sk->sk_priority;
+ skb->mark = sk->sk_mark;
+
+ packet_pick_tx_queue(dev, skb);
+
+ if (po->has_vnet_hdr) {
+ if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
+ u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
+ if (!skb_partial_csum_set(skb, s, o)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ }
+
+ skb_shinfo(skb)->gso_size =
+ __virtio16_to_cpu(false, vnet_hdr.gso_size);
+ skb_shinfo(skb)->gso_type = gso_type;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+
+ len += vnet_hdr_len;
+ }
+
+ if (!packet_use_direct_xmit(po))
+ skb_probe_transport_header(skb, reserve);
+ if (unlikely(extra_len == 4))
+ skb->no_fcs = 1;
+
+ err = po->xmit(skb);
+ if (err > 0 && (err = net_xmit_errno(err)) != 0)
+ goto out_unlock;
+
+ dev_put(dev);
+
+ return len;
+
+out_free:
+ kfree_skb(skb);
+out_unlock:
+ if (dev)
+ dev_put(dev);
+out:
+ return err;
+}
+
+static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+
+ if (po->tx_ring.pg_vec)
+ return tpacket_snd(po, msg);
+ else
+ return packet_snd(sock, msg, len);
+}
+
+/*
+ * Close a PACKET socket. This is fairly simple. We immediately go
+ * to 'closed' state and remove our protocol entry in the device list.
+ */
+
+static int packet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po;
+ struct net *net;
+ union tpacket_req_u req_u;
+
+ if (!sk)
+ return 0;
+
+ net = sock_net(sk);
+ po = pkt_sk(sk);
+
+ mutex_lock(&net->packet.sklist_lock);
+ sk_del_node_init_rcu(sk);
+ mutex_unlock(&net->packet.sklist_lock);
+
+ preempt_disable();
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ preempt_enable();
+
+ spin_lock(&po->bind_lock);
+ unregister_prot_hook(sk, false);
+ packet_cached_dev_reset(po);
+
+ if (po->prot_hook.dev) {
+ dev_put(po->prot_hook.dev);
+ po->prot_hook.dev = NULL;
+ }
+ spin_unlock(&po->bind_lock);
+
+ packet_flush_mclist(sk);
+
+ if (po->rx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
+ packet_set_ring(sk, &req_u, 1, 0);
+ }
+
+ if (po->tx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
+ packet_set_ring(sk, &req_u, 1, 1);
+ }
+
+ fanout_release(sk);
+
+ synchronize_net();
+ /*
+ * Now the socket is dead. No more input will appear.
+ */
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ /* Purge queues */
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ packet_free_pending(po);
+ sk_refcnt_debug_release(sk);
+
+ sock_put(sk);
+ return 0;
+}
+
+/*
+ * Attach a packet hook.
+ */
+
+static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ const struct net_device *dev_curr;
+ __be16 proto_curr;
+ bool need_rehook;
+
+ if (po->fanout) {
+ if (dev)
+ dev_put(dev);
+
+ return -EINVAL;
+ }
+
+ lock_sock(sk);
+ spin_lock(&po->bind_lock);
+
+ proto_curr = po->prot_hook.type;
+ dev_curr = po->prot_hook.dev;
+
+ need_rehook = proto_curr != proto || dev_curr != dev;
+
+ if (need_rehook) {
+ unregister_prot_hook(sk, true);
+
+ po->num = proto;
+ po->prot_hook.type = proto;
+
+ if (po->prot_hook.dev)
+ dev_put(po->prot_hook.dev);
+
+ po->prot_hook.dev = dev;
+
+ po->ifindex = dev ? dev->ifindex : 0;
+ packet_cached_dev_assign(po, dev);
+ }
+
+ if (proto == 0 || !need_rehook)
+ goto out_unlock;
+
+ if (!dev || (dev->flags & IFF_UP)) {
+ register_prot_hook(sk);
+ } else {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+
+out_unlock:
+ spin_unlock(&po->bind_lock);
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Bind a packet socket to a device
+ */
+
+static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sock *sk = sock->sk;
+ char name[15];
+ struct net_device *dev;
+ int err = -ENODEV;
+
+ /*
+ * Check legality
+ */
+
+ if (addr_len != sizeof(struct sockaddr))
+ return -EINVAL;
+ strlcpy(name, uaddr->sa_data, sizeof(name));
+
+ dev = dev_get_by_name(sock_net(sk), name);
+ if (dev)
+ err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
+ return err;
+}
+
+static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
+ struct sock *sk = sock->sk;
+ struct net_device *dev = NULL;
+ int err;
+
+
+ /*
+ * Check legality
+ */
+
+ if (addr_len < sizeof(struct sockaddr_ll))
+ return -EINVAL;
+ if (sll->sll_family != AF_PACKET)
+ return -EINVAL;
+
+ if (sll->sll_ifindex) {
+ err = -ENODEV;
+ dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
+ if (dev == NULL)
+ goto out;
+ }
+ err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
+
+out:
+ return err;
+}
+
+static struct proto packet_proto = {
+ .name = "PACKET",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct packet_sock),
+};
+
+/*
+ * Create a packet of type SOCK_PACKET.
+ */
+
+static int packet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct packet_sock *po;
+ __be16 proto = (__force __be16)protocol; /* weird, but documented */
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+ if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+ sock->type != SOCK_PACKET)
+ return -ESOCKTNOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ err = -ENOBUFS;
+ sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+ if (sk == NULL)
+ goto out;
+
+ sock->ops = &packet_ops;
+ if (sock->type == SOCK_PACKET)
+ sock->ops = &packet_ops_spkt;
+
+ sock_init_data(sock, sk);
+
+ po = pkt_sk(sk);
+ sk->sk_family = PF_PACKET;
+ po->num = proto;
+ po->xmit = dev_queue_xmit;
+
+ err = packet_alloc_pending(po);
+ if (err)
+ goto out2;
+
+ packet_cached_dev_reset(po);
+
+ sk->sk_destruct = packet_sock_destruct;
+ sk_refcnt_debug_inc(sk);
+
+ /*
+ * Attach a protocol block
+ */
+
+ spin_lock_init(&po->bind_lock);
+ mutex_init(&po->pg_vec_lock);
+ po->prot_hook.func = packet_rcv;
+
+ if (sock->type == SOCK_PACKET)
+ po->prot_hook.func = packet_rcv_spkt;
+
+ po->prot_hook.af_packet_priv = sk;
+
+ if (proto) {
+ po->prot_hook.type = proto;
+ register_prot_hook(sk);
+ }
+
+ mutex_lock(&net->packet.sklist_lock);
+ sk_add_node_rcu(sk, &net->packet.sklist);
+ mutex_unlock(&net->packet.sklist_lock);
+
+ preempt_disable();
+ sock_prot_inuse_add(net, &packet_proto, 1);
+ preempt_enable();
+
+ return 0;
+out2:
+ sk_free(sk);
+out:
+ return err;
+}
+
+/*
+ * Pull a packet from our receive queue and hand it to the user.
+ * If necessary we block.
+ */
+
+static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ int copied, err;
+ int vnet_hdr_len = 0;
+ unsigned int origlen = 0;
+
+ err = -EINVAL;
+ if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
+ goto out;
+
+#if 0
+ /* What error should we return now? EUNATTACH? */
+ if (pkt_sk(sk)->ifindex < 0)
+ return -ENODEV;
+#endif
+
+ if (flags & MSG_ERRQUEUE) {
+ err = sock_recv_errqueue(sk, msg, len,
+ SOL_PACKET, PACKET_TX_TIMESTAMP);
+ goto out;
+ }
+
+ /*
+ * Call the generic datagram receiver. This handles all sorts
+ * of horrible races and re-entrancy so we can forget about it
+ * in the protocol layers.
+ *
+ * Now it will return ENETDOWN, if device have just gone down,
+ * but then it will block.
+ */
+
+ skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+
+ /*
+ * An error occurred so return it. Because skb_recv_datagram()
+ * handles the blocking we don't see and worry about blocking
+ * retries.
+ */
+
+ if (skb == NULL)
+ goto out;
+
+ if (pkt_sk(sk)->has_vnet_hdr) {
+ struct virtio_net_hdr vnet_hdr = { 0 };
+
+ err = -EINVAL;
+ vnet_hdr_len = sizeof(vnet_hdr);
+ if (len < vnet_hdr_len)
+ goto out_free;
+
+ len -= vnet_hdr_len;
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ /* This is a hint as to how much should be linear. */
+ vnet_hdr.hdr_len =
+ __cpu_to_virtio16(false, skb_headlen(skb));
+ vnet_hdr.gso_size =
+ __cpu_to_virtio16(false, sinfo->gso_size);
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ else if (sinfo->gso_type & SKB_GSO_FCOE)
+ goto out_free;
+ else
+ BUG();
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+ } else
+ vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ vnet_hdr.csum_start = __cpu_to_virtio16(false,
+ skb_checksum_start_offset(skb));
+ vnet_hdr.csum_offset = __cpu_to_virtio16(false,
+ skb->csum_offset);
+ } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
+ } /* else everything is zero */
+
+ err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
+ if (err < 0)
+ goto out_free;
+ }
+
+ /* You lose any data beyond the buffer you gave. If it worries
+ * a user program they can ask the device for its MTU
+ * anyway.
+ */
+ copied = skb->len;
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (err)
+ goto out_free;
+
+ if (sock->type != SOCK_PACKET) {
+ struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+
+ /* Original length was stored in sockaddr_ll fields */
+ origlen = PACKET_SKB_CB(skb)->sa.origlen;
+ sll->sll_family = AF_PACKET;
+ sll->sll_protocol = skb->protocol;
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (msg->msg_name) {
+ /* If the address length field is there to be filled
+ * in, we fill it in now.
+ */
+ if (sock->type == SOCK_PACKET) {
+ __sockaddr_check_size(sizeof(struct sockaddr_pkt));
+ msg->msg_namelen = sizeof(struct sockaddr_pkt);
+ } else {
+ struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+
+ msg->msg_namelen = sll->sll_halen +
+ offsetof(struct sockaddr_ll, sll_addr);
+ }
+ memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
+ msg->msg_namelen);
+ }
+
+ if (pkt_sk(sk)->auxdata) {
+ struct tpacket_auxdata aux;
+
+ aux.tp_status = TP_STATUS_USER;
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ aux.tp_status |= TP_STATUS_CSUMNOTREADY;
+ else if (skb->pkt_type != PACKET_OUTGOING &&
+ (skb->ip_summed == CHECKSUM_COMPLETE ||
+ skb_csum_unnecessary(skb)))
+ aux.tp_status |= TP_STATUS_CSUM_VALID;
+
+ aux.tp_len = origlen;
+ aux.tp_snaplen = skb->len;
+ aux.tp_mac = 0;
+ aux.tp_net = skb_network_offset(skb);
+ if (skb_vlan_tag_present(skb)) {
+ aux.tp_vlan_tci = skb_vlan_tag_get(skb);
+ aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
+ aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+ } else {
+ aux.tp_vlan_tci = 0;
+ aux.tp_vlan_tpid = 0;
+ }
+ put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
+ }
+
+ /*
+ * Free or return the buffer as appropriate. Again this
+ * hides all the races and re-entrancy issues from us.
+ */
+ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
+
+out_free:
+ skb_free_datagram(sk, skb);
+out:
+ return err;
+}
+
+static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct net_device *dev;
+ struct sock *sk = sock->sk;
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ uaddr->sa_family = AF_PACKET;
+ memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
+ if (dev)
+ strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
+ rcu_read_unlock();
+ *uaddr_len = sizeof(*uaddr);
+
+ return 0;
+}
+
+static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct net_device *dev;
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
+
+ if (peer)
+ return -EOPNOTSUPP;
+
+ sll->sll_family = AF_PACKET;
+ sll->sll_ifindex = po->ifindex;
+ sll->sll_protocol = po->num;
+ sll->sll_pkttype = 0;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
+ if (dev) {
+ sll->sll_hatype = dev->type;
+ sll->sll_halen = dev->addr_len;
+ memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+ } else {
+ sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
+ sll->sll_halen = 0;
+ }
+ rcu_read_unlock();
+ *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
+
+ return 0;
+}
+
+static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
+ int what)
+{
+ switch (i->type) {
+ case PACKET_MR_MULTICAST:
+ if (i->alen != dev->addr_len)
+ return -EINVAL;
+ if (what > 0)
+ return dev_mc_add(dev, i->addr);
+ else
+ return dev_mc_del(dev, i->addr);
+ break;
+ case PACKET_MR_PROMISC:
+ return dev_set_promiscuity(dev, what);
+ case PACKET_MR_ALLMULTI:
+ return dev_set_allmulti(dev, what);
+ case PACKET_MR_UNICAST:
+ if (i->alen != dev->addr_len)
+ return -EINVAL;
+ if (what > 0)
+ return dev_uc_add(dev, i->addr);
+ else
+ return dev_uc_del(dev, i->addr);
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void packet_dev_mclist_delete(struct net_device *dev,
+ struct packet_mclist **mlp)
+{
+ struct packet_mclist *ml;
+
+ while ((ml = *mlp) != NULL) {
+ if (ml->ifindex == dev->ifindex) {
+ packet_dev_mc(dev, ml, -1);
+ *mlp = ml->next;
+ kfree(ml);
+ } else
+ mlp = &ml->next;
+ }
+}
+
+static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_mclist *ml, *i;
+ struct net_device *dev;
+ int err;
+
+ rtnl_lock();
+
+ err = -ENODEV;
+ dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
+ if (!dev)
+ goto done;
+
+ err = -EINVAL;
+ if (mreq->mr_alen > dev->addr_len)
+ goto done;
+
+ err = -ENOBUFS;
+ i = kmalloc(sizeof(*i), GFP_KERNEL);
+ if (i == NULL)
+ goto done;
+
+ err = 0;
+ for (ml = po->mclist; ml; ml = ml->next) {
+ if (ml->ifindex == mreq->mr_ifindex &&
+ ml->type == mreq->mr_type &&
+ ml->alen == mreq->mr_alen &&
+ memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+ ml->count++;
+ /* Free the new element ... */
+ kfree(i);
+ goto done;
+ }
+ }
+
+ i->type = mreq->mr_type;
+ i->ifindex = mreq->mr_ifindex;
+ i->alen = mreq->mr_alen;
+ memcpy(i->addr, mreq->mr_address, i->alen);
+ i->count = 1;
+ i->next = po->mclist;
+ po->mclist = i;
+ err = packet_dev_mc(dev, i, 1);
+ if (err) {
+ po->mclist = i->next;
+ kfree(i);
+ }
+
+done:
+ rtnl_unlock();
+ return err;
+}
+
+static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
+{
+ struct packet_mclist *ml, **mlp;
+
+ rtnl_lock();
+
+ for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
+ if (ml->ifindex == mreq->mr_ifindex &&
+ ml->type == mreq->mr_type &&
+ ml->alen == mreq->mr_alen &&
+ memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+ if (--ml->count == 0) {
+ struct net_device *dev;
+ *mlp = ml->next;
+ dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
+ if (dev)
+ packet_dev_mc(dev, ml, -1);
+ kfree(ml);
+ }
+ break;
+ }
+ }
+ rtnl_unlock();
+ return 0;
+}
+
+static void packet_flush_mclist(struct sock *sk)
+{
+ struct packet_sock *po = pkt_sk(sk);
+ struct packet_mclist *ml;
+
+ if (!po->mclist)
+ return;
+
+ rtnl_lock();
+ while ((ml = po->mclist) != NULL) {
+ struct net_device *dev;
+
+ po->mclist = ml->next;
+ dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
+ if (dev != NULL)
+ packet_dev_mc(dev, ml, -1);
+ kfree(ml);
+ }
+ rtnl_unlock();
+}
+
+static int
+packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ int ret;
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ switch (optname) {
+ case PACKET_ADD_MEMBERSHIP:
+ case PACKET_DROP_MEMBERSHIP:
+ {
+ struct packet_mreq_max mreq;
+ int len = optlen;
+ memset(&mreq, 0, sizeof(mreq));
+ if (len < sizeof(struct packet_mreq))
+ return -EINVAL;
+ if (len > sizeof(mreq))
+ len = sizeof(mreq);
+ if (copy_from_user(&mreq, optval, len))
+ return -EFAULT;
+ if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
+ return -EINVAL;
+ if (optname == PACKET_ADD_MEMBERSHIP)
+ ret = packet_mc_add(sk, &mreq);
+ else
+ ret = packet_mc_drop(sk, &mreq);
+ return ret;
+ }
+
+ case PACKET_RX_RING:
+ case PACKET_TX_RING:
+ {
+ union tpacket_req_u req_u;
+ int len;
+
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ len = sizeof(req_u.req);
+ break;
+ case TPACKET_V3:
+ default:
+ len = sizeof(req_u.req3);
+ break;
+ }
+ if (optlen < len)
+ return -EINVAL;
+ if (pkt_sk(sk)->has_vnet_hdr)
+ return -EINVAL;
+ if (copy_from_user(&req_u.req, optval, len))
+ return -EFAULT;
+ return packet_set_ring(sk, &req_u, 0,
+ optname == PACKET_TX_RING);
+ }
+ case PACKET_COPY_THRESH:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ pkt_sk(sk)->copy_thresh = val;
+ return 0;
+ }
+ case PACKET_VERSION:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ switch (val) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ case TPACKET_V3:
+ po->tp_version = val;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ }
+ case PACKET_RESERVE:
+ {
+ unsigned int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ po->tp_reserve = val;
+ return 0;
+ }
+ case PACKET_LOSS:
+ {
+ unsigned int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ po->tp_loss = !!val;
+ return 0;
+ }
+ case PACKET_AUXDATA:
+ {
+ int val;
+
+ if (optlen < sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->auxdata = !!val;
+ return 0;
+ }
+ case PACKET_ORIGDEV:
+ {
+ int val;
+
+ if (optlen < sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->origdev = !!val;
+ return 0;
+ }
+ case PACKET_VNET_HDR:
+ {
+ int val;
+
+ if (sock->type != SOCK_RAW)
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (optlen < sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->has_vnet_hdr = !!val;
+ return 0;
+ }
+ case PACKET_TIMESTAMP:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->tp_tstamp = val;
+ return 0;
+ }
+ case PACKET_FANOUT:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ return fanout_add(sk, val & 0xffff, val >> 16);
+ }
+ case PACKET_TX_HAS_OFF:
+ {
+ unsigned int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+ return -EBUSY;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ po->tp_tx_has_off = !!val;
+ return 0;
+ }
+ case PACKET_QDISC_BYPASS:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+ return 0;
+ }
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int packet_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int len;
+ int val, lv = sizeof(val);
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ void *data = &val;
+ union tpacket_stats_u st;
+
+ if (level != SOL_PACKET)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case PACKET_STATISTICS:
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ memcpy(&st, &po->stats, sizeof(st));
+ memset(&po->stats, 0, sizeof(po->stats));
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ if (po->tp_version == TPACKET_V3) {
+ lv = sizeof(struct tpacket_stats_v3);
+ st.stats3.tp_packets += st.stats3.tp_drops;
+ data = &st.stats3;
+ } else {
+ lv = sizeof(struct tpacket_stats);
+ st.stats1.tp_packets += st.stats1.tp_drops;
+ data = &st.stats1;
+ }
+
+ break;
+ case PACKET_AUXDATA:
+ val = po->auxdata;
+ break;
+ case PACKET_ORIGDEV:
+ val = po->origdev;
+ break;
+ case PACKET_VNET_HDR:
+ val = po->has_vnet_hdr;
+ break;
+ case PACKET_VERSION:
+ val = po->tp_version;
+ break;
+ case PACKET_HDRLEN:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ if (copy_from_user(&val, optval, len))
+ return -EFAULT;
+ switch (val) {
+ case TPACKET_V1:
+ val = sizeof(struct tpacket_hdr);
+ break;
+ case TPACKET_V2:
+ val = sizeof(struct tpacket2_hdr);
+ break;
+ case TPACKET_V3:
+ val = sizeof(struct tpacket3_hdr);
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case PACKET_RESERVE:
+ val = po->tp_reserve;
+ break;
+ case PACKET_LOSS:
+ val = po->tp_loss;
+ break;
+ case PACKET_TIMESTAMP:
+ val = po->tp_tstamp;
+ break;
+ case PACKET_FANOUT:
+ val = (po->fanout ?
+ ((u32)po->fanout->id |
+ ((u32)po->fanout->type << 16) |
+ ((u32)po->fanout->flags << 24)) :
+ 0);
+ break;
+ case PACKET_TX_HAS_OFF:
+ val = po->tp_tx_has_off;
+ break;
+ case PACKET_QDISC_BYPASS:
+ val = packet_use_direct_xmit(po);
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (len > lv)
+ len = lv;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, data, len))
+ return -EFAULT;
+ return 0;
+}
+
+
+static int packet_notifier(struct notifier_block *this,
+ unsigned long msg, void *ptr)
+{
+ struct sock *sk;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ rcu_read_lock();
+ sk_for_each_rcu(sk, &net->packet.sklist) {
+ struct packet_sock *po = pkt_sk(sk);
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ if (po->mclist)
+ packet_dev_mclist_delete(dev, &po->mclist);
+ /* fallthrough */
+
+ case NETDEV_DOWN:
+ if (dev->ifindex == po->ifindex) {
+ spin_lock(&po->bind_lock);
+ if (po->running) {
+ __unregister_prot_hook(sk, false);
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+ }
+ if (msg == NETDEV_UNREGISTER) {
+ packet_cached_dev_reset(po);
+ po->ifindex = -1;
+ if (po->prot_hook.dev)
+ dev_put(po->prot_hook.dev);
+ po->prot_hook.dev = NULL;
+ }
+ spin_unlock(&po->bind_lock);
+ }
+ break;
+ case NETDEV_UP:
+ if (dev->ifindex == po->ifindex) {
+ spin_lock(&po->bind_lock);
+ if (po->num)
+ register_prot_hook(sk);
+ spin_unlock(&po->bind_lock);
+ }
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return NOTIFY_DONE;
+}
+
+
+static int packet_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCOUTQ:
+ {
+ int amount = sk_wmem_alloc_get(sk);
+
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCINQ:
+ {
+ struct sk_buff *skb;
+ int amount = 0;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb->len;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ return put_user(amount, (int __user *)arg);
+ }
+ case SIOCGSTAMP:
+ return sock_get_timestamp(sk, (struct timeval __user *)arg);
+ case SIOCGSTAMPNS:
+ return sock_get_timestampns(sk, (struct timespec __user *)arg);
+
+#ifdef CONFIG_INET
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFFLAGS:
+ return inet_dgram_ops.ioctl(sock, cmd, arg);
+#endif
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return 0;
+}
+
+static unsigned int packet_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ unsigned int mask = datagram_poll(file, sock, wait);
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (po->rx_ring.pg_vec) {
+ if (!packet_previous_rx_frame(po, &po->rx_ring,
+ TP_STATUS_KERNEL))
+ mask |= POLLIN | POLLRDNORM;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ spin_lock_bh(&sk->sk_write_queue.lock);
+ if (po->tx_ring.pg_vec) {
+ if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ spin_unlock_bh(&sk->sk_write_queue.lock);
+ return mask;
+}
+
+
+/* Dirty? Well, I still did not learn better way to account
+ * for user mmaps.
+ */
+
+static void packet_mm_open(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct socket *sock = file->private_data;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_inc(&pkt_sk(sk)->mapped);
+}
+
+static void packet_mm_close(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct socket *sock = file->private_data;
+ struct sock *sk = sock->sk;
+
+ if (sk)
+ atomic_dec(&pkt_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct packet_mmap_ops = {
+ .open = packet_mm_open,
+ .close = packet_mm_close,
+};
+
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+ unsigned int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (likely(pg_vec[i].buffer)) {
+ if (is_vmalloc_addr(pg_vec[i].buffer))
+ vfree(pg_vec[i].buffer);
+ else
+ free_pages((unsigned long)pg_vec[i].buffer,
+ order);
+ pg_vec[i].buffer = NULL;
+ }
+ }
+ kfree(pg_vec);
+}
+
+static char *alloc_one_pg_vec_page(unsigned long order)
+{
+ char *buffer;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+ __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+ buffer = (char *) __get_free_pages(gfp_flags, order);
+ if (buffer)
+ return buffer;
+
+ /* __get_free_pages failed, fall back to vmalloc */
+ buffer = vzalloc((1 << order) * PAGE_SIZE);
+ if (buffer)
+ return buffer;
+
+ /* vmalloc failed, lets dig into swap here */
+ gfp_flags &= ~__GFP_NORETRY;
+ buffer = (char *) __get_free_pages(gfp_flags, order);
+ if (buffer)
+ return buffer;
+
+ /* complete and utter failure */
+ return NULL;
+}
+
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
+{
+ unsigned int block_nr = req->tp_block_nr;
+ struct pgv *pg_vec;
+ int i;
+
+ pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
+ if (unlikely(!pg_vec))
+ goto out;
+
+ for (i = 0; i < block_nr; i++) {
+ pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+ if (unlikely(!pg_vec[i].buffer))
+ goto out_free_pgvec;
+ }
+
+out:
+ return pg_vec;
+
+out_free_pgvec:
+ free_pg_vec(pg_vec, order, block_nr);
+ pg_vec = NULL;
+ goto out;
+}
+
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
+ int closing, int tx_ring)
+{
+ struct pgv *pg_vec = NULL;
+ struct packet_sock *po = pkt_sk(sk);
+ int was_running, order = 0;
+ struct packet_ring_buffer *rb;
+ struct sk_buff_head *rb_queue;
+ __be16 num;
+ int err = -EINVAL;
+ /* Added to avoid minimal code churn */
+ struct tpacket_req *req = &req_u->req;
+
+ /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
+ if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
+ WARN(1, "Tx-ring is not supported.\n");
+ goto out;
+ }
+
+ rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+ rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+ err = -EBUSY;
+ if (!closing) {
+ if (atomic_read(&po->mapped))
+ goto out;
+ if (packet_read_pending(rb))
+ goto out;
+ }
+
+ if (req->tp_block_nr) {
+ /* Sanity tests and some calculations */
+ err = -EBUSY;
+ if (unlikely(rb->pg_vec))
+ goto out;
+
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ po->tp_hdrlen = TPACKET_HDRLEN;
+ break;
+ case TPACKET_V2:
+ po->tp_hdrlen = TPACKET2_HDRLEN;
+ break;
+ case TPACKET_V3:
+ po->tp_hdrlen = TPACKET3_HDRLEN;
+ break;
+ }
+
+ err = -EINVAL;
+ if (unlikely((int)req->tp_block_size <= 0))
+ goto out;
+ if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+ goto out;
+ if (po->tp_version >= TPACKET_V3 &&
+ (int)(req->tp_block_size -
+ BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
+ goto out;
+ if (unlikely(req->tp_frame_size < po->tp_hdrlen +
+ po->tp_reserve))
+ goto out;
+ if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
+ goto out;
+
+ rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
+ if (unlikely(rb->frames_per_block <= 0))
+ goto out;
+ if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
+ req->tp_frame_nr))
+ goto out;
+
+ err = -ENOMEM;
+ order = get_order(req->tp_block_size);
+ pg_vec = alloc_pg_vec(req, order);
+ if (unlikely(!pg_vec))
+ goto out;
+ switch (po->tp_version) {
+ case TPACKET_V3:
+ /* Transmit path is not supported. We checked
+ * it above but just being paranoid
+ */
+ if (!tx_ring)
+ init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+ break;
+ default:
+ break;
+ }
+ }
+ /* Done */
+ else {
+ err = -EINVAL;
+ if (unlikely(req->tp_frame_nr))
+ goto out;
+ }
+
+ lock_sock(sk);
+
+ /* Detach socket from network */
+ spin_lock(&po->bind_lock);
+ was_running = po->running;
+ num = po->num;
+ if (was_running) {
+ po->num = 0;
+ __unregister_prot_hook(sk, false);
+ }
+ spin_unlock(&po->bind_lock);
+
+ synchronize_net();
+
+ err = -EBUSY;
+ mutex_lock(&po->pg_vec_lock);
+ if (closing || atomic_read(&po->mapped) == 0) {
+ err = 0;
+ spin_lock_bh(&rb_queue->lock);
+ swap(rb->pg_vec, pg_vec);
+ rb->frame_max = (req->tp_frame_nr - 1);
+ rb->head = 0;
+ rb->frame_size = req->tp_frame_size;
+ spin_unlock_bh(&rb_queue->lock);
+
+ swap(rb->pg_vec_order, order);
+ swap(rb->pg_vec_len, req->tp_block_nr);
+
+ rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+ po->prot_hook.func = (po->rx_ring.pg_vec) ?
+ tpacket_rcv : packet_rcv;
+ skb_queue_purge(rb_queue);
+ if (atomic_read(&po->mapped))
+ pr_err("packet_mmap: vma is busy: %d\n",
+ atomic_read(&po->mapped));
+ }
+ mutex_unlock(&po->pg_vec_lock);
+
+ spin_lock(&po->bind_lock);
+ if (was_running) {
+ po->num = num;
+ register_prot_hook(sk);
+ }
+ spin_unlock(&po->bind_lock);
+ if (closing && (po->tp_version > TPACKET_V2)) {
+ /* Because we don't support block-based V3 on tx-ring */
+ if (!tx_ring)
+ prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+ }
+ release_sock(sk);
+
+ if (pg_vec)
+ free_pg_vec(pg_vec, order, req->tp_block_nr);
+out:
+ return err;
+}
+
+static int packet_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ unsigned long size, expected_size;
+ struct packet_ring_buffer *rb;
+ unsigned long start;
+ int err = -EINVAL;
+ int i;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ mutex_lock(&po->pg_vec_lock);
+
+ expected_size = 0;
+ for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+ if (rb->pg_vec) {
+ expected_size += rb->pg_vec_len
+ * rb->pg_vec_pages
+ * PAGE_SIZE;
+ }
+ }
+
+ if (expected_size == 0)
+ goto out;
+
+ size = vma->vm_end - vma->vm_start;
+ if (size != expected_size)
+ goto out;
+
+ start = vma->vm_start;
+ for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+ if (rb->pg_vec == NULL)
+ continue;
+
+ for (i = 0; i < rb->pg_vec_len; i++) {
+ struct page *page;
+ void *kaddr = rb->pg_vec[i].buffer;
+ int pg_num;
+
+ for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+ page = pgv_to_page(kaddr);
+ err = vm_insert_page(vma, start, page);
+ if (unlikely(err))
+ goto out;
+ start += PAGE_SIZE;
+ kaddr += PAGE_SIZE;
+ }
+ }
+ }
+
+ atomic_inc(&po->mapped);
+ vma->vm_ops = &packet_mmap_ops;
+ err = 0;
+
+out:
+ mutex_unlock(&po->pg_vec_lock);
+ return err;
+}
+
+static const struct proto_ops packet_ops_spkt = {
+ .family = PF_PACKET,
+ .owner = THIS_MODULE,
+ .release = packet_release,
+ .bind = packet_bind_spkt,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = packet_getname_spkt,
+ .poll = datagram_poll,
+ .ioctl = packet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = packet_sendmsg_spkt,
+ .recvmsg = packet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops packet_ops = {
+ .family = PF_PACKET,
+ .owner = THIS_MODULE,
+ .release = packet_release,
+ .bind = packet_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = packet_getname,
+ .poll = packet_poll,
+ .ioctl = packet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = packet_setsockopt,
+ .getsockopt = packet_getsockopt,
+ .sendmsg = packet_sendmsg,
+ .recvmsg = packet_recvmsg,
+ .mmap = packet_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static const struct net_proto_family packet_family_ops = {
+ .family = PF_PACKET,
+ .create = packet_create,
+ .owner = THIS_MODULE,
+};
+
+static struct notifier_block packet_netdev_notifier = {
+ .notifier_call = packet_notifier,
+};
+
+#ifdef CONFIG_PROC_FS
+
+static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct net *net = seq_file_net(seq);
+
+ rcu_read_lock();
+ return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
+}
+
+static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
+}
+
+static void packet_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int packet_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
+ else {
+ struct sock *s = sk_entry(v);
+ const struct packet_sock *po = pkt_sk(s);
+
+ seq_printf(seq,
+ "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
+ s,
+ atomic_read(&s->sk_refcnt),
+ s->sk_type,
+ ntohs(po->num),
+ po->ifindex,
+ po->running,
+ atomic_read(&s->sk_rmem_alloc),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
+ sock_i_ino(s));
+ }
+
+ return 0;
+}
+
+static const struct seq_operations packet_seq_ops = {
+ .start = packet_seq_start,
+ .next = packet_seq_next,
+ .stop = packet_seq_stop,
+ .show = packet_seq_show,
+};
+
+static int packet_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &packet_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations packet_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = packet_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+static int __net_init packet_net_init(struct net *net)
+{
+ mutex_init(&net->packet.sklist_lock);
+ INIT_HLIST_HEAD(&net->packet.sklist);
+
+ if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit packet_net_exit(struct net *net)
+{
+ remove_proc_entry("packet", net->proc_net);
+}
+
+static struct pernet_operations packet_net_ops = {
+ .init = packet_net_init,
+ .exit = packet_net_exit,
+};
+
+
+static void __exit packet_exit(void)
+{
+ unregister_netdevice_notifier(&packet_netdev_notifier);
+ unregister_pernet_subsys(&packet_net_ops);
+ sock_unregister(PF_PACKET);
+ proto_unregister(&packet_proto);
+}
+
+static int __init packet_init(void)
+{
+ int rc = proto_register(&packet_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ sock_register(&packet_family_ops);
+ register_pernet_subsys(&packet_net_ops);
+ register_netdevice_notifier(&packet_netdev_notifier);
+out:
+ return rc;
+}
+
+module_init(packet_init);
+module_exit(packet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_PACKET);
diff --git a/net/packet/diag.c b/net/packet/diag.c
new file mode 100644
index 000000000..0ed68f023
--- /dev/null
+++ b/net/packet/diag.c
@@ -0,0 +1,265 @@
+#include <linux/module.h>
+#include <linux/sock_diag.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/packet_diag.h>
+#include <linux/percpu.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "internal.h"
+
+static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
+{
+ struct packet_diag_info pinfo;
+
+ pinfo.pdi_index = po->ifindex;
+ pinfo.pdi_version = po->tp_version;
+ pinfo.pdi_reserve = po->tp_reserve;
+ pinfo.pdi_copy_thresh = po->copy_thresh;
+ pinfo.pdi_tstamp = po->tp_tstamp;
+
+ pinfo.pdi_flags = 0;
+ if (po->running)
+ pinfo.pdi_flags |= PDI_RUNNING;
+ if (po->auxdata)
+ pinfo.pdi_flags |= PDI_AUXDATA;
+ if (po->origdev)
+ pinfo.pdi_flags |= PDI_ORIGDEV;
+ if (po->has_vnet_hdr)
+ pinfo.pdi_flags |= PDI_VNETHDR;
+ if (po->tp_loss)
+ pinfo.pdi_flags |= PDI_LOSS;
+
+ return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo);
+}
+
+static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb)
+{
+ struct nlattr *mca;
+ struct packet_mclist *ml;
+
+ mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST);
+ if (!mca)
+ return -EMSGSIZE;
+
+ rtnl_lock();
+ for (ml = po->mclist; ml; ml = ml->next) {
+ struct packet_diag_mclist *dml;
+
+ dml = nla_reserve_nohdr(nlskb, sizeof(*dml));
+ if (!dml) {
+ rtnl_unlock();
+ nla_nest_cancel(nlskb, mca);
+ return -EMSGSIZE;
+ }
+
+ dml->pdmc_index = ml->ifindex;
+ dml->pdmc_type = ml->type;
+ dml->pdmc_alen = ml->alen;
+ dml->pdmc_count = ml->count;
+ BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr));
+ memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr));
+ }
+
+ rtnl_unlock();
+ nla_nest_end(nlskb, mca);
+
+ return 0;
+}
+
+static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
+ struct sk_buff *nlskb)
+{
+ struct packet_diag_ring pdr;
+
+ if (!ring->pg_vec || ((ver > TPACKET_V2) &&
+ (nl_type == PACKET_DIAG_TX_RING)))
+ return 0;
+
+ pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
+ pdr.pdr_block_nr = ring->pg_vec_len;
+ pdr.pdr_frame_size = ring->frame_size;
+ pdr.pdr_frame_nr = ring->frame_max + 1;
+
+ if (ver > TPACKET_V2) {
+ pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+ pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
+ pdr.pdr_features = ring->prb_bdqc.feature_req_word;
+ } else {
+ pdr.pdr_retire_tmo = 0;
+ pdr.pdr_sizeof_priv = 0;
+ pdr.pdr_features = 0;
+ }
+
+ return nla_put(nlskb, nl_type, sizeof(pdr), &pdr);
+}
+
+static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb)
+{
+ int ret;
+
+ mutex_lock(&po->pg_vec_lock);
+ ret = pdiag_put_ring(&po->rx_ring, po->tp_version,
+ PACKET_DIAG_RX_RING, skb);
+ if (!ret)
+ ret = pdiag_put_ring(&po->tx_ring, po->tp_version,
+ PACKET_DIAG_TX_RING, skb);
+ mutex_unlock(&po->pg_vec_lock);
+
+ return ret;
+}
+
+static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb)
+{
+ int ret = 0;
+
+ mutex_lock(&fanout_mutex);
+ if (po->fanout) {
+ u32 val;
+
+ val = (u32)po->fanout->id | ((u32)po->fanout->type << 16);
+ ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val);
+ }
+ mutex_unlock(&fanout_mutex);
+
+ return ret;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct packet_diag_req *req,
+ bool may_report_filterinfo,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct nlmsghdr *nlh;
+ struct packet_diag_msg *rp;
+ struct packet_sock *po = pkt_sk(sk);
+
+ nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rp = nlmsg_data(nlh);
+ rp->pdiag_family = AF_PACKET;
+ rp->pdiag_type = sk->sk_type;
+ rp->pdiag_num = ntohs(po->num);
+ rp->pdiag_ino = sk_ino;
+ sock_diag_save_cookie(sk, rp->pdiag_cookie);
+
+ if ((req->pdiag_show & PACKET_SHOW_INFO) &&
+ pdiag_put_info(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_INFO) &&
+ nla_put_u32(skb, PACKET_DIAG_UID,
+ from_kuid_munged(user_ns, sock_i_uid(sk))))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
+ pdiag_put_mclist(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_RING_CFG) &&
+ pdiag_put_rings_cfg(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_FANOUT) &&
+ pdiag_put_fanout(po, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ if ((req->pdiag_show & PACKET_SHOW_FILTER) &&
+ sock_diag_put_filterinfo(may_report_filterinfo, sk, skb,
+ PACKET_DIAG_FILTER))
+ goto out_nlmsg_trim;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+out_nlmsg_trim:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int num = 0, s_num = cb->args[0];
+ struct packet_diag_req *req;
+ struct net *net;
+ struct sock *sk;
+ bool may_report_filterinfo;
+
+ net = sock_net(skb->sk);
+ req = nlmsg_data(cb->nlh);
+ may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+
+ mutex_lock(&net->packet.sklist_lock);
+ sk_for_each(sk, &net->packet.sklist) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+
+ if (sk_diag_fill(sk, skb, req,
+ may_report_filterinfo,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ sock_i_ino(sk)) < 0)
+ goto done;
+next:
+ num++;
+ }
+done:
+ mutex_unlock(&net->packet.sklist_lock);
+ cb->args[0] = num;
+
+ return skb->len;
+}
+
+static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct packet_diag_req);
+ struct net *net = sock_net(skb->sk);
+ struct packet_diag_req *req;
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ req = nlmsg_data(h);
+ /* Make it possible to support protocol filtering later */
+ if (req->sdiag_protocol)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = packet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ } else
+ return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler packet_diag_handler = {
+ .family = AF_PACKET,
+ .dump = packet_diag_handler_dump,
+};
+
+static int __init packet_diag_init(void)
+{
+ return sock_diag_register(&packet_diag_handler);
+}
+
+static void __exit packet_diag_exit(void)
+{
+ sock_diag_unregister(&packet_diag_handler);
+}
+
+module_init(packet_diag_init);
+module_exit(packet_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);
diff --git a/net/packet/internal.h b/net/packet/internal.h
new file mode 100644
index 000000000..fe6e20cae
--- /dev/null
+++ b/net/packet/internal.h
@@ -0,0 +1,125 @@
+#ifndef __PACKET_INTERNAL_H__
+#define __PACKET_INTERNAL_H__
+
+struct packet_mclist {
+ struct packet_mclist *next;
+ int ifindex;
+ int count;
+ unsigned short type;
+ unsigned short alen;
+ unsigned char addr[MAX_ADDR_LEN];
+};
+
+/* kbdq - kernel block descriptor queue */
+struct tpacket_kbdq_core {
+ struct pgv *pkbdq;
+ unsigned int feature_req_word;
+ unsigned int hdrlen;
+ unsigned char reset_pending_on_curr_blk;
+ unsigned char delete_blk_timer;
+ unsigned short kactive_blk_num;
+ unsigned short blk_sizeof_priv;
+
+ /* last_kactive_blk_num:
+ * trick to see if user-space has caught up
+ * in order to avoid refreshing timer when every single pkt arrives.
+ */
+ unsigned short last_kactive_blk_num;
+
+ char *pkblk_start;
+ char *pkblk_end;
+ int kblk_size;
+ unsigned int max_frame_len;
+ unsigned int knum_blocks;
+ uint64_t knxt_seq_num;
+ char *prev;
+ char *nxt_offset;
+ struct sk_buff *skb;
+
+ atomic_t blk_fill_in_prog;
+
+ /* Default is set to 8ms */
+#define DEFAULT_PRB_RETIRE_TOV (8)
+
+ unsigned short retire_blk_tov;
+ unsigned short version;
+ unsigned long tov_in_jiffies;
+
+ /* timer to retire an outstanding block */
+ struct timer_list retire_blk_timer;
+};
+
+struct pgv {
+ char *buffer;
+};
+
+struct packet_ring_buffer {
+ struct pgv *pg_vec;
+
+ unsigned int head;
+ unsigned int frames_per_block;
+ unsigned int frame_size;
+ unsigned int frame_max;
+
+ unsigned int pg_vec_order;
+ unsigned int pg_vec_pages;
+ unsigned int pg_vec_len;
+
+ unsigned int __percpu *pending_refcnt;
+
+ struct tpacket_kbdq_core prb_bdqc;
+};
+
+extern struct mutex fanout_mutex;
+#define PACKET_FANOUT_MAX 256
+
+struct packet_fanout {
+ possible_net_t net;
+ unsigned int num_members;
+ u16 id;
+ u8 type;
+ u8 flags;
+ atomic_t rr_cur;
+ struct list_head list;
+ struct sock *arr[PACKET_FANOUT_MAX];
+ int next[PACKET_FANOUT_MAX];
+ spinlock_t lock;
+ atomic_t sk_ref;
+ struct packet_type prot_hook ____cacheline_aligned_in_smp;
+};
+
+struct packet_sock {
+ /* struct sock has to be the first member of packet_sock */
+ struct sock sk;
+ struct packet_fanout *fanout;
+ union tpacket_stats_u stats;
+ struct packet_ring_buffer rx_ring;
+ struct packet_ring_buffer tx_ring;
+ int copy_thresh;
+ spinlock_t bind_lock;
+ struct mutex pg_vec_lock;
+ unsigned int running:1, /* prot_hook is attached*/
+ auxdata:1,
+ origdev:1,
+ has_vnet_hdr:1;
+ int ifindex; /* bound device */
+ __be16 num;
+ struct packet_mclist *mclist;
+ atomic_t mapped;
+ enum tpacket_versions tp_version;
+ unsigned int tp_hdrlen;
+ unsigned int tp_reserve;
+ unsigned int tp_loss:1;
+ unsigned int tp_tx_has_off:1;
+ unsigned int tp_tstamp;
+ struct net_device __rcu *cached_dev;
+ int (*xmit)(struct sk_buff *skb);
+ struct packet_type prot_hook ____cacheline_aligned_in_smp;
+};
+
+static struct packet_sock *pkt_sk(struct sock *sk)
+{
+ return (struct packet_sock *)sk;
+}
+
+#endif
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
new file mode 100644
index 000000000..6ec7d55b1
--- /dev/null
+++ b/net/phonet/Kconfig
@@ -0,0 +1,16 @@
+#
+# Phonet protocol
+#
+
+config PHONET
+ tristate "Phonet protocols family"
+ help
+ The Phone Network protocol (PhoNet) is a packet-oriented
+ communication protocol developed by Nokia for use with its modems.
+
+ This is required for Maemo to use cellular data connectivity (if
+ supported). It can also be used to control Nokia phones
+ from a Linux computer, although AT commands may be easier to use.
+
+ To compile this driver as a module, choose M here: the module
+ will be called phonet. If unsure, say N.
diff --git a/net/phonet/Makefile b/net/phonet/Makefile
new file mode 100644
index 000000000..e10b1b182
--- /dev/null
+++ b/net/phonet/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_PHONET) += phonet.o pn_pep.o
+
+phonet-y := \
+ pn_dev.o \
+ pn_netlink.o \
+ socket.o \
+ datagram.o \
+ sysctl.o \
+ af_phonet.o
+
+pn_pep-y := pep.o pep-gprs.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
new file mode 100644
index 000000000..32ab87d34
--- /dev/null
+++ b/net/phonet/af_phonet.c
@@ -0,0 +1,549 @@
+/*
+ * File: af_phonet.c
+ *
+ * Phonet protocols family
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Authors: Sakari Ailus <sakari.ailus@nokia.com>
+ * Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pn_dev.h>
+
+/* Transport protocol registration */
+static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+
+static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+{
+ struct phonet_protocol *pp;
+
+ if (protocol >= PHONET_NPROTO)
+ return NULL;
+
+ rcu_read_lock();
+ pp = rcu_dereference(proto_tab[protocol]);
+ if (pp && !try_module_get(pp->prot->owner))
+ pp = NULL;
+ rcu_read_unlock();
+
+ return pp;
+}
+
+static inline void phonet_proto_put(struct phonet_protocol *pp)
+{
+ module_put(pp->prot->owner);
+}
+
+/* protocol family functions */
+
+static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct pn_sock *pn;
+ struct phonet_protocol *pnp;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (protocol == 0) {
+ /* Default protocol selection */
+ switch (sock->type) {
+ case SOCK_DGRAM:
+ protocol = PN_PROTO_PHONET;
+ break;
+ case SOCK_SEQPACKET:
+ protocol = PN_PROTO_PIPE;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ }
+
+ pnp = phonet_proto_get(protocol);
+ if (pnp == NULL &&
+ request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0)
+ pnp = phonet_proto_get(protocol);
+
+ if (pnp == NULL)
+ return -EPROTONOSUPPORT;
+ if (sock->type != pnp->sock_type) {
+ err = -EPROTONOSUPPORT;
+ goto out;
+ }
+
+ sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+ if (sk == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sock_init_data(sock, sk);
+ sock->state = SS_UNCONNECTED;
+ sock->ops = pnp->ops;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ sk->sk_protocol = protocol;
+ pn = pn_sk(sk);
+ pn->sobject = 0;
+ pn->dobject = 0;
+ pn->resource = 0;
+ sk->sk_prot->init(sk);
+ err = 0;
+
+out:
+ phonet_proto_put(pnp);
+ return err;
+}
+
+static const struct net_proto_family phonet_proto_family = {
+ .family = PF_PHONET,
+ .create = pn_socket_create,
+ .owner = THIS_MODULE,
+};
+
+/* Phonet device header operations */
+static int pn_header_create(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type, const void *daddr,
+ const void *saddr, unsigned int len)
+{
+ u8 *media = skb_push(skb, 1);
+
+ if (type != ETH_P_PHONET)
+ return -1;
+
+ if (!saddr)
+ saddr = dev->dev_addr;
+ *media = *(const u8 *)saddr;
+ return 1;
+}
+
+static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+ const u8 *media = skb_mac_header(skb);
+ *haddr = *media;
+ return 1;
+}
+
+struct header_ops phonet_header_ops = {
+ .create = pn_header_create,
+ .parse = pn_header_parse,
+};
+EXPORT_SYMBOL(phonet_header_ops);
+
+/*
+ * Prepends an ISI header and sends a datagram.
+ */
+static int pn_send(struct sk_buff *skb, struct net_device *dev,
+ u16 dst, u16 src, u8 res, u8 irq)
+{
+ struct phonethdr *ph;
+ int err;
+
+ if (skb->len + 2 > 0xffff /* Phonet length field limit */ ||
+ skb->len + sizeof(struct phonethdr) > dev->mtu) {
+ err = -EMSGSIZE;
+ goto drop;
+ }
+
+ /* Broadcast sending is not implemented */
+ if (pn_addr(dst) == PNADDR_BROADCAST) {
+ err = -EOPNOTSUPP;
+ goto drop;
+ }
+
+ skb_reset_transport_header(skb);
+ WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */
+ skb_push(skb, sizeof(struct phonethdr));
+ skb_reset_network_header(skb);
+ ph = pn_hdr(skb);
+ ph->pn_rdev = pn_dev(dst);
+ ph->pn_sdev = pn_dev(src);
+ ph->pn_res = res;
+ ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph));
+ ph->pn_robj = pn_obj(dst);
+ ph->pn_sobj = pn_obj(src);
+
+ skb->protocol = htons(ETH_P_PHONET);
+ skb->priority = 0;
+ skb->dev = dev;
+
+ if (skb->pkt_type == PACKET_LOOPBACK) {
+ skb_reset_mac_header(skb);
+ skb_orphan(skb);
+ err = (irq ? netif_rx(skb) : netif_rx_ni(skb)) ? -ENOBUFS : 0;
+ } else {
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ NULL, NULL, skb->len);
+ if (err < 0) {
+ err = -EHOSTUNREACH;
+ goto drop;
+ }
+ err = dev_queue_xmit(skb);
+ if (unlikely(err > 0))
+ err = net_xmit_errno(err);
+ }
+
+ return err;
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int pn_raw_send(const void *data, int len, struct net_device *dev,
+ u16 dst, u16 src, u8 res)
+{
+ struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0)
+ skb->pkt_type = PACKET_LOOPBACK;
+
+ skb_reserve(skb, MAX_PHONET_HEADER);
+ __skb_put(skb, len);
+ skb_copy_to_linear_data(skb, data, len);
+ return pn_send(skb, dev, dst, src, res, 1);
+}
+
+/*
+ * Create a Phonet header for the skb and send it out. Returns
+ * non-zero error code if failed. The skb is freed then.
+ */
+int pn_skb_send(struct sock *sk, struct sk_buff *skb,
+ const struct sockaddr_pn *target)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
+ struct pn_sock *pn = pn_sk(sk);
+ int err;
+ u16 src, dst;
+ u8 daddr, saddr, res;
+
+ src = pn->sobject;
+ if (target != NULL) {
+ dst = pn_sockaddr_get_object(target);
+ res = pn_sockaddr_get_resource(target);
+ } else {
+ dst = pn->dobject;
+ res = pn->resource;
+ }
+ daddr = pn_addr(dst);
+
+ err = -EHOSTUNREACH;
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+ else if (phonet_address_lookup(net, daddr) == 0) {
+ dev = phonet_device_get(net);
+ skb->pkt_type = PACKET_LOOPBACK;
+ } else if (dst == 0) {
+ /* Resource routing (small race until phonet_rcv()) */
+ struct sock *sk = pn_find_sock_by_res(net, res);
+ if (sk) {
+ sock_put(sk);
+ dev = phonet_device_get(net);
+ skb->pkt_type = PACKET_LOOPBACK;
+ } else
+ dev = phonet_route_output(net, daddr);
+ } else
+ dev = phonet_route_output(net, daddr);
+
+ if (!dev || !(dev->flags & IFF_UP))
+ goto drop;
+
+ saddr = phonet_address_get(dev, daddr);
+ if (saddr == PN_NO_ADDR)
+ goto drop;
+
+ if (!pn_addr(src))
+ src = pn_object(saddr, pn_obj(src));
+
+ err = pn_send(skb, dev, dst, src, res, 0);
+ dev_put(dev);
+ return err;
+
+drop:
+ kfree_skb(skb);
+ if (dev)
+ dev_put(dev);
+ return err;
+}
+EXPORT_SYMBOL(pn_skb_send);
+
+/* Do not send an error message in response to an error message */
+static inline int can_respond(struct sk_buff *skb)
+{
+ const struct phonethdr *ph;
+ const struct phonetmsg *pm;
+ u8 submsg_id;
+
+ if (!pskb_may_pull(skb, 3))
+ return 0;
+
+ ph = pn_hdr(skb);
+ if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5))
+ return 0;
+ if (ph->pn_res == PN_COMMGR) /* indications */
+ return 0;
+
+ ph = pn_hdr(skb); /* re-acquires the pointer */
+ pm = pn_msg(skb);
+ if (pm->pn_msg_id != PN_COMMON_MESSAGE)
+ return 1;
+ submsg_id = (ph->pn_res == PN_PREFIX)
+ ? pm->pn_e_submsg_id : pm->pn_submsg_id;
+ if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP &&
+ pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP)
+ return 1;
+ return 0;
+}
+
+static int send_obj_unreachable(struct sk_buff *rskb)
+{
+ const struct phonethdr *oph = pn_hdr(rskb);
+ const struct phonetmsg *opm = pn_msg(rskb);
+ struct phonetmsg resp;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.pn_trans_id = opm->pn_trans_id;
+ resp.pn_msg_id = PN_COMMON_MESSAGE;
+ if (oph->pn_res == PN_PREFIX) {
+ resp.pn_e_res_id = opm->pn_e_res_id;
+ resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+ resp.pn_e_orig_msg_id = opm->pn_msg_id;
+ resp.pn_e_status = 0;
+ } else {
+ resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+ resp.pn_orig_msg_id = opm->pn_msg_id;
+ resp.pn_status = 0;
+ }
+ return pn_raw_send(&resp, sizeof(resp), rskb->dev,
+ pn_object(oph->pn_sdev, oph->pn_sobj),
+ pn_object(oph->pn_rdev, oph->pn_robj),
+ oph->pn_res);
+}
+
+static int send_reset_indications(struct sk_buff *rskb)
+{
+ struct phonethdr *oph = pn_hdr(rskb);
+ static const u8 data[4] = {
+ 0x00 /* trans ID */, 0x10 /* subscribe msg */,
+ 0x00 /* subscription count */, 0x00 /* dummy */
+ };
+
+ return pn_raw_send(data, sizeof(data), rskb->dev,
+ pn_object(oph->pn_sdev, 0x00),
+ pn_object(oph->pn_rdev, oph->pn_robj),
+ PN_COMMGR);
+}
+
+
+/* packet type functions */
+
+/*
+ * Stuff received packets to associated sockets.
+ * On error, returns non-zero and releases the skb.
+ */
+static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pkttype,
+ struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+ struct phonethdr *ph;
+ struct sockaddr_pn sa;
+ u16 len;
+
+ /* check we have at least a full Phonet header */
+ if (!pskb_pull(skb, sizeof(struct phonethdr)))
+ goto out;
+
+ /* check that the advertised length is correct */
+ ph = pn_hdr(skb);
+ len = get_unaligned_be16(&ph->pn_length);
+ if (len < 2)
+ goto out;
+ len -= 2;
+ if ((len > skb->len) || pskb_trim(skb, len))
+ goto out;
+ skb_reset_transport_header(skb);
+
+ pn_skb_get_dst_sockaddr(skb, &sa);
+
+ /* check if this is broadcasted */
+ if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) {
+ pn_deliver_sock_broadcast(net, skb);
+ goto out;
+ }
+
+ /* resource routing */
+ if (pn_sockaddr_get_object(&sa) == 0) {
+ struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
+ if (sk)
+ return sk_receive_skb(sk, skb, 0);
+ }
+
+ /* check if we are the destination */
+ if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
+ /* Phonet packet input */
+ struct sock *sk = pn_find_sock_by_sa(net, &sa);
+
+ if (sk)
+ return sk_receive_skb(sk, skb, 0);
+
+ if (can_respond(skb)) {
+ send_obj_unreachable(skb);
+ send_reset_indications(skb);
+ }
+ } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ goto out; /* Race between address deletion and loopback */
+ else {
+ /* Phonet packet routing */
+ struct net_device *out_dev;
+
+ out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa));
+ if (!out_dev) {
+ net_dbg_ratelimited("No Phonet route to %02X\n",
+ pn_sockaddr_get_addr(&sa));
+ goto out;
+ }
+
+ __skb_push(skb, sizeof(struct phonethdr));
+ skb->dev = out_dev;
+ if (out_dev == dev) {
+ net_dbg_ratelimited("Phonet loop to %02X on %s\n",
+ pn_sockaddr_get_addr(&sa),
+ dev->name);
+ goto out_dev;
+ }
+ /* Some drivers (e.g. TUN) do not allocate HW header space */
+ if (skb_cow_head(skb, out_dev->hard_header_len))
+ goto out_dev;
+
+ if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL,
+ skb->len) < 0)
+ goto out_dev;
+ dev_queue_xmit(skb);
+ dev_put(out_dev);
+ return NET_RX_SUCCESS;
+out_dev:
+ dev_put(out_dev);
+ }
+
+out:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static struct packet_type phonet_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_PHONET),
+ .func = phonet_rcv,
+};
+
+static DEFINE_MUTEX(proto_tab_lock);
+
+int __init_or_module phonet_proto_register(unsigned int protocol,
+ struct phonet_protocol *pp)
+{
+ int err = 0;
+
+ if (protocol >= PHONET_NPROTO)
+ return -EINVAL;
+
+ err = proto_register(pp->prot, 1);
+ if (err)
+ return err;
+
+ mutex_lock(&proto_tab_lock);
+ if (proto_tab[protocol])
+ err = -EBUSY;
+ else
+ rcu_assign_pointer(proto_tab[protocol], pp);
+ mutex_unlock(&proto_tab_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(phonet_proto_register);
+
+void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+{
+ mutex_lock(&proto_tab_lock);
+ BUG_ON(proto_tab[protocol] != pp);
+ RCU_INIT_POINTER(proto_tab[protocol], NULL);
+ mutex_unlock(&proto_tab_lock);
+ synchronize_rcu();
+ proto_unregister(pp->prot);
+}
+EXPORT_SYMBOL(phonet_proto_unregister);
+
+/* Module registration */
+static int __init phonet_init(void)
+{
+ int err;
+
+ err = phonet_device_init();
+ if (err)
+ return err;
+
+ pn_sock_init();
+ err = sock_register(&phonet_proto_family);
+ if (err) {
+ printk(KERN_ALERT
+ "phonet protocol family initialization failed\n");
+ goto err_sock;
+ }
+
+ dev_add_pack(&phonet_packet_type);
+ phonet_sysctl_init();
+
+ err = isi_register();
+ if (err)
+ goto err;
+ return 0;
+
+err:
+ phonet_sysctl_exit();
+ sock_unregister(PF_PHONET);
+ dev_remove_pack(&phonet_packet_type);
+err_sock:
+ phonet_device_exit();
+ return err;
+}
+
+static void __exit phonet_exit(void)
+{
+ isi_unregister();
+ phonet_sysctl_exit();
+ sock_unregister(PF_PHONET);
+ dev_remove_pack(&phonet_packet_type);
+ phonet_device_exit();
+}
+
+module_init(phonet_init);
+module_exit(phonet_exit);
+MODULE_DESCRIPTION("Phonet protocol stack for Linux");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_PHONET);
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
new file mode 100644
index 000000000..5e710435f
--- /dev/null
+++ b/net/phonet/datagram.c
@@ -0,0 +1,212 @@
+/*
+ * File: datagram.c
+ *
+ * Datagram (ISI) Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Authors: Sakari Ailus <sakari.ailus@nokia.com>
+ * Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <asm/ioctls.h>
+#include <net/sock.h>
+
+#include <linux/phonet.h>
+#include <linux/export.h>
+#include <net/phonet/phonet.h>
+
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
+/* associated socket ceases to exist */
+static void pn_sock_close(struct sock *sk, long timeout)
+{
+ sk_common_release(sk);
+}
+
+static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct sk_buff *skb;
+ int answ;
+
+ switch (cmd) {
+ case SIOCINQ:
+ lock_sock(sk);
+ skb = skb_peek(&sk->sk_receive_queue);
+ answ = skb ? skb->len : 0;
+ release_sock(sk);
+ return put_user(answ, (int __user *)arg);
+
+ case SIOCPNADDRESOURCE:
+ case SIOCPNDELRESOURCE: {
+ u32 res;
+ if (get_user(res, (u32 __user *)arg))
+ return -EFAULT;
+ if (res >= 256)
+ return -EINVAL;
+ if (cmd == SIOCPNADDRESOURCE)
+ return pn_sock_bind_res(sk, res);
+ else
+ return pn_sock_unbind_res(sk, res);
+ }
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+/* Destroy socket. All references are gone. */
+static void pn_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+}
+
+static int pn_init(struct sock *sk)
+{
+ sk->sk_destruct = pn_destruct;
+ return 0;
+}
+
+static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name);
+ struct sk_buff *skb;
+ int err;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|
+ MSG_CMSG_COMPAT))
+ return -EOPNOTSUPP;
+
+ if (target == NULL)
+ return -EDESTADDRREQ;
+
+ if (msg->msg_namelen < sizeof(struct sockaddr_pn))
+ return -EINVAL;
+
+ if (target->spn_family != AF_PHONET)
+ return -EAFNOSUPPORT;
+
+ skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len,
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (skb == NULL)
+ return err;
+ skb_reserve(skb, MAX_PHONET_HEADER);
+
+ err = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ /*
+ * Fill in the Phonet header and
+ * finally pass the packet forwards.
+ */
+ err = pn_skb_send(sk, skb, target);
+
+ /* If ok, return len. */
+ return (err >= 0) ? len : err;
+}
+
+static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct sk_buff *skb = NULL;
+ struct sockaddr_pn sa;
+ int rval = -EOPNOTSUPP;
+ int copylen;
+
+ if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL|
+ MSG_CMSG_COMPAT))
+ goto out_nofree;
+
+ skb = skb_recv_datagram(sk, flags, noblock, &rval);
+ if (skb == NULL)
+ goto out_nofree;
+
+ pn_skb_get_src_sockaddr(skb, &sa);
+
+ copylen = skb->len;
+ if (len < copylen) {
+ msg->msg_flags |= MSG_TRUNC;
+ copylen = len;
+ }
+
+ rval = skb_copy_datagram_msg(skb, 0, msg, copylen);
+ if (rval) {
+ rval = -EFAULT;
+ goto out;
+ }
+
+ rval = (flags & MSG_TRUNC) ? skb->len : copylen;
+
+ if (msg->msg_name != NULL) {
+ __sockaddr_check_size(sizeof(sa));
+ memcpy(msg->msg_name, &sa, sizeof(sa));
+ *addr_len = sizeof(sa);
+ }
+
+out:
+ skb_free_datagram(sk, skb);
+
+out_nofree:
+ return rval;
+}
+
+/* Queue an skb for a sock. */
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int err = sock_queue_rcv_skb(sk, skb);
+
+ if (err < 0)
+ kfree_skb(skb);
+ return err ? NET_RX_DROP : NET_RX_SUCCESS;
+}
+
+/* Module registration */
+static struct proto pn_proto = {
+ .close = pn_sock_close,
+ .ioctl = pn_ioctl,
+ .init = pn_init,
+ .sendmsg = pn_sendmsg,
+ .recvmsg = pn_recvmsg,
+ .backlog_rcv = pn_backlog_rcv,
+ .hash = pn_sock_hash,
+ .unhash = pn_sock_unhash,
+ .get_port = pn_sock_get_port,
+ .obj_size = sizeof(struct pn_sock),
+ .owner = THIS_MODULE,
+ .name = "PHONET",
+};
+
+static struct phonet_protocol pn_dgram_proto = {
+ .ops = &phonet_dgram_ops,
+ .prot = &pn_proto,
+ .sock_type = SOCK_DGRAM,
+};
+
+int __init isi_register(void)
+{
+ return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto);
+}
+
+void __exit isi_unregister(void)
+{
+ phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto);
+}
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
new file mode 100644
index 000000000..fa8237fdc
--- /dev/null
+++ b/net/phonet/pep-gprs.c
@@ -0,0 +1,327 @@
+/*
+ * File: pep-gprs.c
+ *
+ * GPRS over Phonet pipe end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <net/tcp_states.h>
+#include <net/phonet/gprs.h>
+
+#define GPRS_DEFAULT_MTU 1400
+
+struct gprs_dev {
+ struct sock *sk;
+ void (*old_state_change)(struct sock *);
+ void (*old_data_ready)(struct sock *);
+ void (*old_write_space)(struct sock *);
+
+ struct net_device *dev;
+};
+
+static __be16 gprs_type_trans(struct sk_buff *skb)
+{
+ const u8 *pvfc;
+ u8 buf;
+
+ pvfc = skb_header_pointer(skb, 0, 1, &buf);
+ if (!pvfc)
+ return htons(0);
+ /* Look at IP version field */
+ switch (*pvfc >> 4) {
+ case 4:
+ return htons(ETH_P_IP);
+ case 6:
+ return htons(ETH_P_IPV6);
+ }
+ return htons(0);
+}
+
+static void gprs_writeable(struct gprs_dev *gp)
+{
+ struct net_device *dev = gp->dev;
+
+ if (pep_writeable(gp->sk))
+ netif_wake_queue(dev);
+}
+
+/*
+ * Socket callbacks
+ */
+
+static void gprs_state_change(struct sock *sk)
+{
+ struct gprs_dev *gp = sk->sk_user_data;
+
+ if (sk->sk_state == TCP_CLOSE_WAIT) {
+ struct net_device *dev = gp->dev;
+
+ netif_stop_queue(dev);
+ netif_carrier_off(dev);
+ }
+}
+
+static int gprs_recv(struct gprs_dev *gp, struct sk_buff *skb)
+{
+ struct net_device *dev = gp->dev;
+ int err = 0;
+ __be16 protocol = gprs_type_trans(skb);
+
+ if (!protocol) {
+ err = -EINVAL;
+ goto drop;
+ }
+
+ if (skb_headroom(skb) & 3) {
+ struct sk_buff *rskb, *fs;
+ int flen = 0;
+
+ /* Phonet Pipe data header may be misaligned (3 bytes),
+ * so wrap the IP packet as a single fragment of an head-less
+ * socket buffer. The network stack will pull what it needs,
+ * but at least, the whole IP payload is not memcpy'd. */
+ rskb = netdev_alloc_skb(dev, 0);
+ if (!rskb) {
+ err = -ENOBUFS;
+ goto drop;
+ }
+ skb_shinfo(rskb)->frag_list = skb;
+ rskb->len += skb->len;
+ rskb->data_len += rskb->len;
+ rskb->truesize += rskb->len;
+
+ /* Avoid nested fragments */
+ skb_walk_frags(skb, fs)
+ flen += fs->len;
+ skb->next = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+ skb->len -= flen;
+ skb->data_len -= flen;
+ skb->truesize -= flen;
+
+ skb = rskb;
+ }
+
+ skb->protocol = protocol;
+ skb_reset_mac_header(skb);
+ skb->dev = dev;
+
+ if (likely(dev->flags & IFF_UP)) {
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
+ netif_rx(skb);
+ skb = NULL;
+ } else
+ err = -ENODEV;
+
+drop:
+ if (skb) {
+ dev_kfree_skb(skb);
+ dev->stats.rx_dropped++;
+ }
+ return err;
+}
+
+static void gprs_data_ready(struct sock *sk)
+{
+ struct gprs_dev *gp = sk->sk_user_data;
+ struct sk_buff *skb;
+
+ while ((skb = pep_read(sk)) != NULL) {
+ skb_orphan(skb);
+ gprs_recv(gp, skb);
+ }
+}
+
+static void gprs_write_space(struct sock *sk)
+{
+ struct gprs_dev *gp = sk->sk_user_data;
+
+ if (netif_running(gp->dev))
+ gprs_writeable(gp);
+}
+
+/*
+ * Network device callbacks
+ */
+
+static int gprs_open(struct net_device *dev)
+{
+ struct gprs_dev *gp = netdev_priv(dev);
+
+ gprs_writeable(gp);
+ return 0;
+}
+
+static int gprs_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct gprs_dev *gp = netdev_priv(dev);
+ struct sock *sk = gp->sk;
+ int len, err;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ skb_orphan(skb);
+ skb_set_owner_w(skb, sk);
+ len = skb->len;
+ err = pep_write(sk, skb);
+ if (err) {
+ net_dbg_ratelimited("%s: TX error (%d)\n", dev->name, err);
+ dev->stats.tx_aborted_errors++;
+ dev->stats.tx_errors++;
+ } else {
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
+ }
+
+ netif_stop_queue(dev);
+ if (pep_writeable(sk))
+ netif_wake_queue(dev);
+ return NETDEV_TX_OK;
+}
+
+static int gprs_set_mtu(struct net_device *dev, int new_mtu)
+{
+ if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops gprs_netdev_ops = {
+ .ndo_open = gprs_open,
+ .ndo_stop = gprs_close,
+ .ndo_start_xmit = gprs_xmit,
+ .ndo_change_mtu = gprs_set_mtu,
+};
+
+static void gprs_setup(struct net_device *dev)
+{
+ dev->features = NETIF_F_FRAGLIST;
+ dev->type = ARPHRD_PHONET_PIPE;
+ dev->flags = IFF_POINTOPOINT | IFF_NOARP;
+ dev->mtu = GPRS_DEFAULT_MTU;
+ dev->hard_header_len = 0;
+ dev->addr_len = 0;
+ dev->tx_queue_len = 10;
+
+ dev->netdev_ops = &gprs_netdev_ops;
+ dev->destructor = free_netdev;
+}
+
+/*
+ * External interface
+ */
+
+/*
+ * Attach a GPRS interface to a datagram socket.
+ * Returns the interface index on success, negative error code on error.
+ */
+int gprs_attach(struct sock *sk)
+{
+ static const char ifname[] = "gprs%d";
+ struct gprs_dev *gp;
+ struct net_device *dev;
+ int err;
+
+ if (unlikely(sk->sk_type == SOCK_STREAM))
+ return -EINVAL; /* need packet boundaries */
+
+ /* Create net device */
+ dev = alloc_netdev(sizeof(*gp), ifname, NET_NAME_UNKNOWN, gprs_setup);
+ if (!dev)
+ return -ENOMEM;
+ gp = netdev_priv(dev);
+ gp->sk = sk;
+ gp->dev = dev;
+
+ netif_stop_queue(dev);
+ err = register_netdev(dev);
+ if (err) {
+ free_netdev(dev);
+ return err;
+ }
+
+ lock_sock(sk);
+ if (unlikely(sk->sk_user_data)) {
+ err = -EBUSY;
+ goto out_rel;
+ }
+ if (unlikely((1 << sk->sk_state & (TCPF_CLOSE|TCPF_LISTEN)) ||
+ sock_flag(sk, SOCK_DEAD))) {
+ err = -EINVAL;
+ goto out_rel;
+ }
+ sk->sk_user_data = gp;
+ gp->old_state_change = sk->sk_state_change;
+ gp->old_data_ready = sk->sk_data_ready;
+ gp->old_write_space = sk->sk_write_space;
+ sk->sk_state_change = gprs_state_change;
+ sk->sk_data_ready = gprs_data_ready;
+ sk->sk_write_space = gprs_write_space;
+ release_sock(sk);
+ sock_hold(sk);
+
+ printk(KERN_DEBUG"%s: attached\n", dev->name);
+ return dev->ifindex;
+
+out_rel:
+ release_sock(sk);
+ unregister_netdev(dev);
+ return err;
+}
+
+void gprs_detach(struct sock *sk)
+{
+ struct gprs_dev *gp = sk->sk_user_data;
+ struct net_device *dev = gp->dev;
+
+ lock_sock(sk);
+ sk->sk_user_data = NULL;
+ sk->sk_state_change = gp->old_state_change;
+ sk->sk_data_ready = gp->old_data_ready;
+ sk->sk_write_space = gp->old_write_space;
+ release_sock(sk);
+
+ printk(KERN_DEBUG"%s: detached\n", dev->name);
+ unregister_netdev(dev);
+ sock_put(sk);
+}
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
new file mode 100644
index 000000000..6de2aeb98
--- /dev/null
+++ b/net/phonet/pep.c
@@ -0,0 +1,1373 @@
+/*
+ * File: pep.c
+ *
+ * Phonet pipe protocol end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/ioctls.h>
+
+#include <linux/phonet.h>
+#include <linux/module.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
+#include <net/phonet/gprs.h>
+
+/* sk_state values:
+ * TCP_CLOSE sock not in use yet
+ * TCP_CLOSE_WAIT disconnected pipe
+ * TCP_LISTEN listening pipe endpoint
+ * TCP_SYN_RECV connected pipe in disabled state
+ * TCP_ESTABLISHED connected pipe in enabled state
+ *
+ * pep_sock locking:
+ * - sk_state, hlist: sock lock needed
+ * - listener: read only
+ * - pipe_handle: read only
+ */
+
+#define CREDITS_MAX 10
+#define CREDITS_THR 7
+
+#define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */
+
+/* Get the next TLV sub-block. */
+static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen,
+ void *buf)
+{
+ void *data = NULL;
+ struct {
+ u8 sb_type;
+ u8 sb_len;
+ } *ph, h;
+ int buflen = *plen;
+
+ ph = skb_header_pointer(skb, 0, 2, &h);
+ if (ph == NULL || ph->sb_len < 2 || !pskb_may_pull(skb, ph->sb_len))
+ return NULL;
+ ph->sb_len -= 2;
+ *ptype = ph->sb_type;
+ *plen = ph->sb_len;
+
+ if (buflen > ph->sb_len)
+ buflen = ph->sb_len;
+ data = skb_header_pointer(skb, 2, buflen, buf);
+ __skb_pull(skb, 2 + ph->sb_len);
+ return data;
+}
+
+static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload,
+ int len, gfp_t priority)
+{
+ struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+ if (!skb)
+ return NULL;
+ skb_set_owner_w(skb, sk);
+
+ skb_reserve(skb, MAX_PNPIPE_HEADER);
+ __skb_put(skb, len);
+ skb_copy_to_linear_data(skb, payload, len);
+ __skb_push(skb, sizeof(struct pnpipehdr));
+ skb_reset_transport_header(skb);
+ return skb;
+}
+
+static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code,
+ const void *data, int len, gfp_t priority)
+{
+ const struct pnpipehdr *oph = pnp_hdr(oskb);
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+ struct sockaddr_pn peer;
+
+ skb = pep_alloc_skb(sk, data, len, priority);
+ if (!skb)
+ return -ENOMEM;
+
+ ph = pnp_hdr(skb);
+ ph->utid = oph->utid;
+ ph->message_id = oph->message_id + 1; /* REQ -> RESP */
+ ph->pipe_handle = oph->pipe_handle;
+ ph->error_code = code;
+
+ pn_skb_get_src_sockaddr(oskb, &peer);
+ return pn_skb_send(sk, skb, &peer);
+}
+
+static int pep_indicate(struct sock *sk, u8 id, u8 code,
+ const void *data, int len, gfp_t priority)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+
+ skb = pep_alloc_skb(sk, data, len, priority);
+ if (!skb)
+ return -ENOMEM;
+
+ ph = pnp_hdr(skb);
+ ph->utid = 0;
+ ph->message_id = id;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->data[0] = code;
+ return pn_skb_send(sk, skb, NULL);
+}
+
+#define PAD 0x00
+
+static int pipe_handler_request(struct sock *sk, u8 id, u8 code,
+ const void *data, int len)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+
+ skb = pep_alloc_skb(sk, data, len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ ph = pnp_hdr(skb);
+ ph->utid = id; /* whatever */
+ ph->message_id = id;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->data[0] = code;
+ return pn_skb_send(sk, skb, NULL);
+}
+
+static int pipe_handler_send_created_ind(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ u8 data[4] = {
+ PN_PIPE_SB_NEGOTIATED_FC, pep_sb_size(2),
+ pn->tx_fc, pn->rx_fc,
+ };
+
+ return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */,
+ data, 4, GFP_ATOMIC);
+}
+
+static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
+{
+ static const u8 data[20] = {
+ PAD, PAD, PAD, 2 /* sub-blocks */,
+ PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD,
+ PN_MULTI_CREDIT_FLOW_CONTROL,
+ PN_ONE_CREDIT_FLOW_CONTROL,
+ PN_LEGACY_FLOW_CONTROL,
+ PAD,
+ PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD,
+ PN_MULTI_CREDIT_FLOW_CONTROL,
+ PN_ONE_CREDIT_FLOW_CONTROL,
+ PN_LEGACY_FLOW_CONTROL,
+ PAD,
+ };
+
+ might_sleep();
+ return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data),
+ GFP_KERNEL);
+}
+
+static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code,
+ gfp_t priority)
+{
+ static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ };
+ WARN_ON(code == PN_PIPE_NO_ERROR);
+ return pep_reply(sk, skb, code, data, sizeof(data), priority);
+}
+
+/* Control requests are not sent by the pipe service and have a specific
+ * message format. */
+static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
+ gfp_t priority)
+{
+ const struct pnpipehdr *oph = pnp_hdr(oskb);
+ struct sk_buff *skb;
+ struct pnpipehdr *ph;
+ struct sockaddr_pn dst;
+ u8 data[4] = {
+ oph->data[0], /* PEP type */
+ code, /* error code, at an unusual offset */
+ PAD, PAD,
+ };
+
+ skb = pep_alloc_skb(sk, data, 4, priority);
+ if (!skb)
+ return -ENOMEM;
+
+ ph = pnp_hdr(skb);
+ ph->utid = oph->utid;
+ ph->message_id = PNS_PEP_CTRL_RESP;
+ ph->pipe_handle = oph->pipe_handle;
+ ph->data[0] = oph->data[1]; /* CTRL id */
+
+ pn_skb_get_src_sockaddr(oskb, &dst);
+ return pn_skb_send(sk, skb, &dst);
+}
+
+static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
+{
+ u8 data[4] = { type, PAD, PAD, status };
+
+ return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON,
+ data, 4, priority);
+}
+
+/* Send our RX flow control information to the sender.
+ * Socket must be locked. */
+static void pipe_grant_credits(struct sock *sk, gfp_t priority)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+ switch (pn->rx_fc) {
+ case PN_LEGACY_FLOW_CONTROL: /* TODO */
+ break;
+ case PN_ONE_CREDIT_FLOW_CONTROL:
+ if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
+ PEP_IND_READY, priority) == 0)
+ pn->rx_credits = 1;
+ break;
+ case PN_MULTI_CREDIT_FLOW_CONTROL:
+ if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX)
+ break;
+ if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
+ CREDITS_MAX - pn->rx_credits,
+ priority) == 0)
+ pn->rx_credits = CREDITS_MAX;
+ break;
+ }
+}
+
+static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr;
+ int wake = 0;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+ return -EINVAL;
+
+ hdr = pnp_hdr(skb);
+ if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
+ net_dbg_ratelimited("Phonet unknown PEP type: %u\n",
+ (unsigned int)hdr->data[0]);
+ return -EOPNOTSUPP;
+ }
+
+ switch (hdr->data[1]) {
+ case PN_PEP_IND_FLOW_CONTROL:
+ switch (pn->tx_fc) {
+ case PN_LEGACY_FLOW_CONTROL:
+ switch (hdr->data[4]) {
+ case PEP_IND_BUSY:
+ atomic_set(&pn->tx_credits, 0);
+ break;
+ case PEP_IND_READY:
+ atomic_set(&pn->tx_credits, wake = 1);
+ break;
+ }
+ break;
+ case PN_ONE_CREDIT_FLOW_CONTROL:
+ if (hdr->data[4] == PEP_IND_READY)
+ atomic_set(&pn->tx_credits, wake = 1);
+ break;
+ }
+ break;
+
+ case PN_PEP_IND_ID_MCFC_GRANT_CREDITS:
+ if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL)
+ break;
+ atomic_add(wake = hdr->data[4], &pn->tx_credits);
+ break;
+
+ default:
+ net_dbg_ratelimited("Phonet unknown PEP indication: %u\n",
+ (unsigned int)hdr->data[1]);
+ return -EOPNOTSUPP;
+ }
+ if (wake)
+ sk->sk_write_space(sk);
+ return 0;
+}
+
+static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+ u8 n_sb = hdr->data[0];
+
+ pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+ __skb_pull(skb, sizeof(*hdr));
+ while (n_sb > 0) {
+ u8 type, buf[2], len = sizeof(buf);
+ u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+ if (data == NULL)
+ return -EINVAL;
+ switch (type) {
+ case PN_PIPE_SB_NEGOTIATED_FC:
+ if (len < 2 || (data[0] | data[1]) > 3)
+ break;
+ pn->tx_fc = data[0] & 3;
+ pn->rx_fc = data[1] & 3;
+ break;
+ }
+ n_sb--;
+ }
+ return 0;
+}
+
+/* Queue an skb to a connected sock.
+ * Socket lock must be held. */
+static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+ struct sk_buff_head *queue;
+ int err = 0;
+
+ BUG_ON(sk->sk_state == TCP_CLOSE_WAIT);
+
+ switch (hdr->message_id) {
+ case PNS_PEP_CONNECT_REQ:
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_DISCONNECT_REQ:
+ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+ sk->sk_state = TCP_CLOSE_WAIT;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ break;
+
+ case PNS_PEP_ENABLE_REQ:
+ /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
+ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_RESET_REQ:
+ switch (hdr->state_after_reset) {
+ case PN_PIPE_DISABLE:
+ pn->init_enable = 0;
+ break;
+ case PN_PIPE_ENABLE:
+ pn->init_enable = 1;
+ break;
+ default: /* not allowed to send an error here!? */
+ err = -EINVAL;
+ goto out;
+ }
+ /* fall through */
+ case PNS_PEP_DISABLE_REQ:
+ atomic_set(&pn->tx_credits, 0);
+ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_CTRL_REQ:
+ if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
+ atomic_inc(&sk->sk_drops);
+ break;
+ }
+ __skb_pull(skb, 4);
+ queue = &pn->ctrlreq_queue;
+ goto queue;
+
+ case PNS_PIPE_ALIGNED_DATA:
+ __skb_pull(skb, 1);
+ /* fall through */
+ case PNS_PIPE_DATA:
+ __skb_pull(skb, 3); /* Pipe data header */
+ if (!pn_flow_safe(pn->rx_fc)) {
+ err = sock_queue_rcv_skb(sk, skb);
+ if (!err)
+ return NET_RX_SUCCESS;
+ err = -ENOBUFS;
+ break;
+ }
+
+ if (pn->rx_credits == 0) {
+ atomic_inc(&sk->sk_drops);
+ err = -ENOBUFS;
+ break;
+ }
+ pn->rx_credits--;
+ queue = &sk->sk_receive_queue;
+ goto queue;
+
+ case PNS_PEP_STATUS_IND:
+ pipe_rcv_status(sk, skb);
+ break;
+
+ case PNS_PIPE_REDIRECTED_IND:
+ err = pipe_rcv_created(sk, skb);
+ break;
+
+ case PNS_PIPE_CREATED_IND:
+ err = pipe_rcv_created(sk, skb);
+ if (err)
+ break;
+ /* fall through */
+ case PNS_PIPE_RESET_IND:
+ if (!pn->init_enable)
+ break;
+ /* fall through */
+ case PNS_PIPE_ENABLED_IND:
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ if (sk->sk_state == TCP_ESTABLISHED)
+ break; /* Nothing to do */
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_grant_credits(sk, GFP_ATOMIC);
+ break;
+
+ case PNS_PIPE_DISABLED_IND:
+ sk->sk_state = TCP_SYN_RECV;
+ pn->rx_credits = 0;
+ break;
+
+ default:
+ net_dbg_ratelimited("Phonet unknown PEP message: %u\n",
+ hdr->message_id);
+ err = -EINVAL;
+ }
+out:
+ kfree_skb(skb);
+ return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS;
+
+queue:
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return NET_RX_SUCCESS;
+}
+
+/* Destroy connected sock. */
+static void pipe_destruct(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&pn->ctrlreq_queue);
+}
+
+static u8 pipe_negotiate_fc(const u8 *fcs, unsigned int n)
+{
+ unsigned int i;
+ u8 final_fc = PN_NO_FLOW_CONTROL;
+
+ for (i = 0; i < n; i++) {
+ u8 fc = fcs[i];
+
+ if (fc > final_fc && fc < PN_MAX_FLOW_CONTROL)
+ final_fc = fc;
+ }
+ return final_fc;
+}
+
+static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr;
+ u8 n_sb;
+
+ if (!pskb_pull(skb, sizeof(*hdr) + 4))
+ return -EINVAL;
+
+ hdr = pnp_hdr(skb);
+ if (hdr->error_code != PN_PIPE_NO_ERROR)
+ return -ECONNREFUSED;
+
+ /* Parse sub-blocks */
+ n_sb = hdr->data[4];
+ while (n_sb > 0) {
+ u8 type, buf[6], len = sizeof(buf);
+ const u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+ if (data == NULL)
+ return -EINVAL;
+
+ switch (type) {
+ case PN_PIPE_SB_REQUIRED_FC_TX:
+ if (len < 2 || len < data[0])
+ break;
+ pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2);
+ break;
+
+ case PN_PIPE_SB_PREFERRED_FC_RX:
+ if (len < 2 || len < data[0])
+ break;
+ pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2);
+ break;
+
+ }
+ n_sb--;
+ }
+
+ return pipe_handler_send_created_ind(sk);
+}
+
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+
+ if (hdr->error_code != PN_PIPE_NO_ERROR)
+ return -ECONNREFUSED;
+
+ return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+ NULL, 0, GFP_ATOMIC);
+
+}
+
+static void pipe_start_flow_control(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ pipe_grant_credits(sk, GFP_ATOMIC);
+}
+
+/* Queue an skb to an actively connected sock.
+ * Socket lock must be held. */
+static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+ int err = NET_RX_SUCCESS;
+
+ switch (hdr->message_id) {
+ case PNS_PIPE_ALIGNED_DATA:
+ __skb_pull(skb, 1);
+ /* fall through */
+ case PNS_PIPE_DATA:
+ __skb_pull(skb, 3); /* Pipe data header */
+ if (!pn_flow_safe(pn->rx_fc)) {
+ err = sock_queue_rcv_skb(sk, skb);
+ if (!err)
+ return NET_RX_SUCCESS;
+ err = NET_RX_DROP;
+ break;
+ }
+
+ if (pn->rx_credits == 0) {
+ atomic_inc(&sk->sk_drops);
+ err = NET_RX_DROP;
+ break;
+ }
+ pn->rx_credits--;
+ skb->dev = NULL;
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return NET_RX_SUCCESS;
+
+ case PNS_PEP_CONNECT_RESP:
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ if (pep_connresp_rcv(sk, skb)) {
+ sk->sk_state = TCP_CLOSE_WAIT;
+ break;
+ }
+ if (pn->init_enable == PN_PIPE_DISABLE)
+ sk->sk_state = TCP_SYN_RECV;
+ else {
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_start_flow_control(sk);
+ }
+ break;
+
+ case PNS_PEP_ENABLE_RESP:
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+
+ if (pep_enableresp_rcv(sk, skb)) {
+ sk->sk_state = TCP_CLOSE_WAIT;
+ break;
+ }
+
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_start_flow_control(sk);
+ break;
+
+ case PNS_PEP_DISCONNECT_RESP:
+ /* sock should already be dead, nothing to do */
+ break;
+
+ case PNS_PEP_STATUS_IND:
+ pipe_rcv_status(sk, skb);
+ break;
+ }
+ kfree_skb(skb);
+ return err;
+}
+
+/* Listening sock must be locked */
+static struct sock *pep_find_pipe(const struct hlist_head *hlist,
+ const struct sockaddr_pn *dst,
+ u8 pipe_handle)
+{
+ struct sock *sknode;
+ u16 dobj = pn_sockaddr_get_object(dst);
+
+ sk_for_each(sknode, hlist) {
+ struct pep_sock *pnnode = pep_sk(sknode);
+
+ /* Ports match, but addresses might not: */
+ if (pnnode->pn_sk.sobject != dobj)
+ continue;
+ if (pnnode->pipe_handle != pipe_handle)
+ continue;
+ if (sknode->sk_state == TCP_CLOSE_WAIT)
+ continue;
+
+ sock_hold(sknode);
+ return sknode;
+ }
+ return NULL;
+}
+
+/*
+ * Deliver an skb to a listening sock.
+ * Socket lock must be held.
+ * We then queue the skb to the right connected sock (if any).
+ */
+static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct sock *sknode;
+ struct pnpipehdr *hdr;
+ struct sockaddr_pn dst;
+ u8 pipe_handle;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr)))
+ goto drop;
+
+ hdr = pnp_hdr(skb);
+ pipe_handle = hdr->pipe_handle;
+ if (pipe_handle == PN_PIPE_INVALID_HANDLE)
+ goto drop;
+
+ pn_skb_get_dst_sockaddr(skb, &dst);
+
+ /* Look for an existing pipe handle */
+ sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+ if (sknode)
+ return sk_receive_skb(sknode, skb, 1);
+
+ switch (hdr->message_id) {
+ case PNS_PEP_CONNECT_REQ:
+ if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) {
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE,
+ GFP_ATOMIC);
+ break;
+ }
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ sk_acceptq_added(sk);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ return NET_RX_SUCCESS;
+
+ case PNS_PEP_DISCONNECT_REQ:
+ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_CTRL_REQ:
+ pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC);
+ break;
+
+ case PNS_PEP_RESET_REQ:
+ case PNS_PEP_ENABLE_REQ:
+ case PNS_PEP_DISABLE_REQ:
+ /* invalid handle is not even allowed here! */
+ break;
+
+ default:
+ if ((1 << sk->sk_state)
+ & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT))
+ /* actively connected socket */
+ return pipe_handler_do_rcv(sk, skb);
+ }
+drop:
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+static int pipe_do_remove(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+
+ skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ ph = pnp_hdr(skb);
+ ph->utid = 0;
+ ph->message_id = PNS_PIPE_REMOVE_REQ;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->data[0] = PAD;
+ return pn_skb_send(sk, skb, NULL);
+}
+
+/* associated socket ceases to exist */
+static void pep_sock_close(struct sock *sk, long timeout)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int ifindex = 0;
+
+ sock_hold(sk); /* keep a reference after sk_common_release() */
+ sk_common_release(sk);
+
+ lock_sock(sk);
+ if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) {
+ if (sk->sk_backlog_rcv == pipe_do_rcv)
+ /* Forcefully remove dangling Phonet pipe */
+ pipe_do_remove(sk);
+ else
+ pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD,
+ NULL, 0);
+ }
+ sk->sk_state = TCP_CLOSE;
+
+ ifindex = pn->ifindex;
+ pn->ifindex = 0;
+ release_sock(sk);
+
+ if (ifindex)
+ gprs_detach(sk);
+ sock_put(sk);
+}
+
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
+{
+ struct pep_sock *pn = pep_sk(sk), *newpn;
+ struct sock *newsk = NULL;
+ struct sk_buff *skb;
+ struct pnpipehdr *hdr;
+ struct sockaddr_pn dst, src;
+ int err;
+ u16 peer_type;
+ u8 pipe_handle, enabled, n_sb;
+ u8 aligned = 0;
+
+ skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, errp);
+ if (!skb)
+ return NULL;
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN) {
+ err = -EINVAL;
+ goto drop;
+ }
+ sk_acceptq_removed(sk);
+
+ err = -EPROTO;
+ if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+ goto drop;
+
+ hdr = pnp_hdr(skb);
+ pipe_handle = hdr->pipe_handle;
+ switch (hdr->state_after_connect) {
+ case PN_PIPE_DISABLE:
+ enabled = 0;
+ break;
+ case PN_PIPE_ENABLE:
+ enabled = 1;
+ break;
+ default:
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM,
+ GFP_KERNEL);
+ goto drop;
+ }
+ peer_type = hdr->other_pep_type << 8;
+
+ /* Parse sub-blocks (options) */
+ n_sb = hdr->data[4];
+ while (n_sb > 0) {
+ u8 type, buf[1], len = sizeof(buf);
+ const u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+ if (data == NULL)
+ goto drop;
+ switch (type) {
+ case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
+ if (len < 1)
+ goto drop;
+ peer_type = (peer_type & 0xff00) | data[0];
+ break;
+ case PN_PIPE_SB_ALIGNED_DATA:
+ aligned = data[0] != 0;
+ break;
+ }
+ n_sb--;
+ }
+
+ /* Check for duplicate pipe handle */
+ newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+ if (unlikely(newsk)) {
+ __sock_put(newsk);
+ newsk = NULL;
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL);
+ goto drop;
+ }
+
+ /* Create a new to-be-accepted sock */
+ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+ if (!newsk) {
+ pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
+ err = -ENOBUFS;
+ goto drop;
+ }
+
+ sock_init_data(NULL, newsk);
+ newsk->sk_state = TCP_SYN_RECV;
+ newsk->sk_backlog_rcv = pipe_do_rcv;
+ newsk->sk_protocol = sk->sk_protocol;
+ newsk->sk_destruct = pipe_destruct;
+
+ newpn = pep_sk(newsk);
+ pn_skb_get_dst_sockaddr(skb, &dst);
+ pn_skb_get_src_sockaddr(skb, &src);
+ newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
+ newpn->pn_sk.dobject = pn_sockaddr_get_object(&src);
+ newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst);
+ sock_hold(sk);
+ newpn->listener = sk;
+ skb_queue_head_init(&newpn->ctrlreq_queue);
+ newpn->pipe_handle = pipe_handle;
+ atomic_set(&newpn->tx_credits, 0);
+ newpn->ifindex = 0;
+ newpn->peer_type = peer_type;
+ newpn->rx_credits = 0;
+ newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+ newpn->init_enable = enabled;
+ newpn->aligned = aligned;
+
+ err = pep_accept_conn(newsk, skb);
+ if (err) {
+ sock_put(newsk);
+ newsk = NULL;
+ goto drop;
+ }
+ sk_add_node(newsk, &pn->hlist);
+drop:
+ release_sock(sk);
+ kfree_skb(skb);
+ *errp = err;
+ return newsk;
+}
+
+static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int err;
+ u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
+
+ if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+ pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
+ err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
+ pn->init_enable, data, 4);
+ if (err) {
+ pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+ return err;
+ }
+
+ sk->sk_state = TCP_SYN_SENT;
+
+ return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+ int err;
+
+ err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+ NULL, 0);
+ if (err)
+ return err;
+
+ sk->sk_state = TCP_SYN_SENT;
+
+ return 0;
+}
+
+static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int answ;
+ int ret = -ENOIOCTLCMD;
+
+ switch (cmd) {
+ case SIOCINQ:
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = -EINVAL;
+ break;
+ }
+
+ lock_sock(sk);
+ if (sock_flag(sk, SOCK_URGINLINE) &&
+ !skb_queue_empty(&pn->ctrlreq_queue))
+ answ = skb_peek(&pn->ctrlreq_queue)->len;
+ else if (!skb_queue_empty(&sk->sk_receive_queue))
+ answ = skb_peek(&sk->sk_receive_queue)->len;
+ else
+ answ = 0;
+ release_sock(sk);
+ ret = put_user(answ, (int __user *)arg);
+ break;
+
+ case SIOCPNENABLEPIPE:
+ lock_sock(sk);
+ if (sk->sk_state == TCP_SYN_SENT)
+ ret = -EBUSY;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ ret = -EISCONN;
+ else
+ ret = pep_sock_enable(sk, NULL, 0);
+ release_sock(sk);
+ break;
+ }
+
+ return ret;
+}
+
+static int pep_init(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ sk->sk_destruct = pipe_destruct;
+ INIT_HLIST_HEAD(&pn->hlist);
+ pn->listener = NULL;
+ skb_queue_head_init(&pn->ctrlreq_queue);
+ atomic_set(&pn->tx_credits, 0);
+ pn->ifindex = 0;
+ pn->peer_type = 0;
+ pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+ pn->rx_credits = 0;
+ pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+ pn->init_enable = 1;
+ pn->aligned = 0;
+ return 0;
+}
+
+static int pep_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int val = 0, err = 0;
+
+ if (level != SOL_PNPIPE)
+ return -ENOPROTOOPT;
+ if (optlen >= sizeof(int)) {
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+ }
+
+ lock_sock(sk);
+ switch (optname) {
+ case PNPIPE_ENCAP:
+ if (val && val != PNPIPE_ENCAP_IP) {
+ err = -EINVAL;
+ break;
+ }
+ if (!pn->ifindex == !val)
+ break; /* Nothing to do! */
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (val) {
+ release_sock(sk);
+ err = gprs_attach(sk);
+ if (err > 0) {
+ pn->ifindex = err;
+ err = 0;
+ }
+ } else {
+ pn->ifindex = 0;
+ release_sock(sk);
+ gprs_detach(sk);
+ err = 0;
+ }
+ goto out_norel;
+
+ case PNPIPE_HANDLE:
+ if ((sk->sk_state == TCP_CLOSE) &&
+ (val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
+ pn->pipe_handle = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case PNPIPE_INITSTATE:
+ pn->init_enable = !!val;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ }
+ release_sock(sk);
+
+out_norel:
+ return err;
+}
+
+static int pep_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int len, val;
+
+ if (level != SOL_PNPIPE)
+ return -ENOPROTOOPT;
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ switch (optname) {
+ case PNPIPE_ENCAP:
+ val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
+ break;
+
+ case PNPIPE_IFINDEX:
+ val = pn->ifindex;
+ break;
+
+ case PNPIPE_HANDLE:
+ val = pn->pipe_handle;
+ if (val == PN_PIPE_INVALID_HANDLE)
+ return -EINVAL;
+ break;
+
+ case PNPIPE_INITSTATE:
+ val = pn->init_enable;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ len = min_t(unsigned int, sizeof(int), len);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (put_user(val, (int __user *) optval))
+ return -EFAULT;
+ return 0;
+}
+
+static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *ph;
+ int err;
+
+ if (pn_flow_safe(pn->tx_fc) &&
+ !atomic_add_unless(&pn->tx_credits, -1, 0)) {
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ skb_push(skb, 3 + pn->aligned);
+ skb_reset_transport_header(skb);
+ ph = pnp_hdr(skb);
+ ph->utid = 0;
+ if (pn->aligned) {
+ ph->message_id = PNS_PIPE_ALIGNED_DATA;
+ ph->data[0] = 0; /* padding */
+ } else
+ ph->message_id = PNS_PIPE_DATA;
+ ph->pipe_handle = pn->pipe_handle;
+ err = pn_skb_send(sk, skb, NULL);
+
+ if (err && pn_flow_safe(pn->tx_fc))
+ atomic_inc(&pn->tx_credits);
+ return err;
+
+}
+
+static int pep_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct sk_buff *skb;
+ long timeo;
+ int flags = msg->msg_flags;
+ int err, done;
+
+ if (len > USHRT_MAX)
+ return -EMSGSIZE;
+
+ if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|
+ MSG_CMSG_COMPAT)) ||
+ !(msg->msg_flags & MSG_EOR))
+ return -EOPNOTSUPP;
+
+ skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
+ flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned);
+ err = memcpy_from_msg(skb_put(skb, len), msg, len);
+ if (err < 0)
+ goto outfree;
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) {
+ err = -ENOTCONN;
+ goto out;
+ }
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ /* Wait until the pipe gets to enabled state */
+disabled:
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err)
+ goto out;
+
+ if (sk->sk_state == TCP_CLOSE_WAIT) {
+ err = -ECONNRESET;
+ goto out;
+ }
+ }
+ BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+ /* Wait until flow control allows TX */
+ done = atomic_read(&pn->tx_credits);
+ while (!done) {
+ DEFINE_WAIT(wait);
+
+ if (!timeo) {
+ err = -EAGAIN;
+ goto out;
+ }
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto disabled;
+ }
+
+ err = pipe_skb_send(sk, skb);
+ if (err >= 0)
+ err = len; /* success! */
+ skb = NULL;
+out:
+ release_sock(sk);
+outfree:
+ kfree_skb(skb);
+ return err;
+}
+
+int pep_writeable(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ return atomic_read(&pn->tx_credits);
+}
+
+int pep_write(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *rskb, *fs;
+ int flen = 0;
+
+ if (pep_sk(sk)->aligned)
+ return pipe_skb_send(sk, skb);
+
+ rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+ if (!rskb) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ skb_shinfo(rskb)->frag_list = skb;
+ rskb->len += skb->len;
+ rskb->data_len += rskb->len;
+ rskb->truesize += rskb->len;
+
+ /* Avoid nested fragments */
+ skb_walk_frags(skb, fs)
+ flen += fs->len;
+ skb->next = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+ skb->len -= flen;
+ skb->data_len -= flen;
+ skb->truesize -= flen;
+
+ skb_reserve(rskb, MAX_PHONET_HEADER + 3);
+ return pipe_skb_send(sk, rskb);
+}
+
+struct sk_buff *pep_read(struct sock *sk)
+{
+ struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
+
+ if (sk->sk_state == TCP_ESTABLISHED)
+ pipe_grant_credits(sk, GFP_ATOMIC);
+ return skb;
+}
+
+static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL|
+ MSG_NOSIGNAL|MSG_CMSG_COMPAT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -ENOTCONN;
+
+ if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) {
+ /* Dequeue and acknowledge control request */
+ struct pep_sock *pn = pep_sk(sk);
+
+ if (flags & MSG_PEEK)
+ return -EOPNOTSUPP;
+ skb = skb_dequeue(&pn->ctrlreq_queue);
+ if (skb) {
+ pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR,
+ GFP_KERNEL);
+ msg->msg_flags |= MSG_OOB;
+ goto copy;
+ }
+ if (flags & MSG_OOB)
+ return -EINVAL;
+ }
+
+ skb = skb_recv_datagram(sk, flags, noblock, &err);
+ lock_sock(sk);
+ if (skb == NULL) {
+ if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT)
+ err = -ECONNRESET;
+ release_sock(sk);
+ return err;
+ }
+
+ if (sk->sk_state == TCP_ESTABLISHED)
+ pipe_grant_credits(sk, GFP_KERNEL);
+ release_sock(sk);
+copy:
+ msg->msg_flags |= MSG_EOR;
+ if (skb->len > len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ len = skb->len;
+
+ err = skb_copy_datagram_msg(skb, 0, msg, len);
+ if (!err)
+ err = (flags & MSG_TRUNC) ? skb->len : len;
+
+ skb_free_datagram(sk, skb);
+ return err;
+}
+
+static void pep_sock_unhash(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct sock *skparent = NULL;
+
+ lock_sock(sk);
+
+ if (pn->listener != NULL) {
+ skparent = pn->listener;
+ pn->listener = NULL;
+ release_sock(sk);
+
+ pn = pep_sk(skparent);
+ lock_sock(skparent);
+ sk_del_node_init(sk);
+ sk = skparent;
+ }
+
+ /* Unhash a listening sock only when it is closed
+ * and all of its active connected pipes are closed. */
+ if (hlist_empty(&pn->hlist))
+ pn_sock_unhash(&pn->pn_sk.sk);
+ release_sock(sk);
+
+ if (skparent)
+ sock_put(skparent);
+}
+
+static struct proto pep_proto = {
+ .close = pep_sock_close,
+ .accept = pep_sock_accept,
+ .connect = pep_sock_connect,
+ .ioctl = pep_ioctl,
+ .init = pep_init,
+ .setsockopt = pep_setsockopt,
+ .getsockopt = pep_getsockopt,
+ .sendmsg = pep_sendmsg,
+ .recvmsg = pep_recvmsg,
+ .backlog_rcv = pep_do_rcv,
+ .hash = pn_sock_hash,
+ .unhash = pep_sock_unhash,
+ .get_port = pn_sock_get_port,
+ .obj_size = sizeof(struct pep_sock),
+ .owner = THIS_MODULE,
+ .name = "PNPIPE",
+};
+
+static struct phonet_protocol pep_pn_proto = {
+ .ops = &phonet_stream_ops,
+ .prot = &pep_proto,
+ .sock_type = SOCK_SEQPACKET,
+};
+
+static int __init pep_register(void)
+{
+ return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+static void __exit pep_unregister(void)
+{
+ phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+module_init(pep_register);
+module_exit(pep_unregister);
+MODULE_AUTHOR("Remi Denis-Courmont, Nokia");
+MODULE_DESCRIPTION("Phonet pipe protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
new file mode 100644
index 000000000..a58680016
--- /dev/null
+++ b/net/phonet/pn_dev.c
@@ -0,0 +1,431 @@
+/*
+ * File: pn_dev.c
+ *
+ * Phonet network device
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Authors: Sakari Ailus <sakari.ailus@nokia.com>
+ * Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/phonet.h>
+#include <linux/proc_fs.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+#include <net/netns/generic.h>
+#include <net/phonet/pn_dev.h>
+
+struct phonet_routes {
+ struct mutex lock;
+ struct net_device __rcu *table[64];
+};
+
+struct phonet_net {
+ struct phonet_device_list pndevs;
+ struct phonet_routes routes;
+};
+
+static int phonet_net_id __read_mostly;
+
+static struct phonet_net *phonet_pernet(struct net *net)
+{
+ BUG_ON(!net);
+
+ return net_generic(net, phonet_net_id);
+}
+
+struct phonet_device_list *phonet_device_list(struct net *net)
+{
+ struct phonet_net *pnn = phonet_pernet(net);
+ return &pnn->pndevs;
+}
+
+/* Allocate new Phonet device. */
+static struct phonet_device *__phonet_device_alloc(struct net_device *dev)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC);
+ if (pnd == NULL)
+ return NULL;
+ pnd->netdev = dev;
+ bitmap_zero(pnd->addrs, 64);
+
+ BUG_ON(!mutex_is_locked(&pndevs->lock));
+ list_add_rcu(&pnd->list, &pndevs->list);
+ return pnd;
+}
+
+static struct phonet_device *__phonet_get(struct net_device *dev)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd;
+
+ BUG_ON(!mutex_is_locked(&pndevs->lock));
+ list_for_each_entry(pnd, &pndevs->list, list) {
+ if (pnd->netdev == dev)
+ return pnd;
+ }
+ return NULL;
+}
+
+static struct phonet_device *__phonet_get_rcu(struct net_device *dev)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd;
+
+ list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+ if (pnd->netdev == dev)
+ return pnd;
+ }
+ return NULL;
+}
+
+static void phonet_device_destroy(struct net_device *dev)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd;
+
+ ASSERT_RTNL();
+
+ mutex_lock(&pndevs->lock);
+ pnd = __phonet_get(dev);
+ if (pnd)
+ list_del_rcu(&pnd->list);
+ mutex_unlock(&pndevs->lock);
+
+ if (pnd) {
+ u8 addr;
+
+ for_each_set_bit(addr, pnd->addrs, 64)
+ phonet_address_notify(RTM_DELADDR, dev, addr);
+ kfree(pnd);
+ }
+}
+
+struct net_device *phonet_device_get(struct net *net)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(net);
+ struct phonet_device *pnd;
+ struct net_device *dev = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+ dev = pnd->netdev;
+ BUG_ON(!dev);
+
+ if ((dev->reg_state == NETREG_REGISTERED) &&
+ ((pnd->netdev->flags & IFF_UP)) == IFF_UP)
+ break;
+ dev = NULL;
+ }
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+
+int phonet_address_add(struct net_device *dev, u8 addr)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd;
+ int err = 0;
+
+ mutex_lock(&pndevs->lock);
+ /* Find or create Phonet-specific device data */
+ pnd = __phonet_get(dev);
+ if (pnd == NULL)
+ pnd = __phonet_device_alloc(dev);
+ if (unlikely(pnd == NULL))
+ err = -ENOMEM;
+ else if (test_and_set_bit(addr >> 2, pnd->addrs))
+ err = -EEXIST;
+ mutex_unlock(&pndevs->lock);
+ return err;
+}
+
+int phonet_address_del(struct net_device *dev, u8 addr)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+ struct phonet_device *pnd;
+ int err = 0;
+
+ mutex_lock(&pndevs->lock);
+ pnd = __phonet_get(dev);
+ if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) {
+ err = -EADDRNOTAVAIL;
+ pnd = NULL;
+ } else if (bitmap_empty(pnd->addrs, 64))
+ list_del_rcu(&pnd->list);
+ else
+ pnd = NULL;
+ mutex_unlock(&pndevs->lock);
+
+ if (pnd)
+ kfree_rcu(pnd, rcu);
+
+ return err;
+}
+
+/* Gets a source address toward a destination, through a interface. */
+u8 phonet_address_get(struct net_device *dev, u8 daddr)
+{
+ struct phonet_device *pnd;
+ u8 saddr;
+
+ rcu_read_lock();
+ pnd = __phonet_get_rcu(dev);
+ if (pnd) {
+ BUG_ON(bitmap_empty(pnd->addrs, 64));
+
+ /* Use same source address as destination, if possible */
+ if (test_bit(daddr >> 2, pnd->addrs))
+ saddr = daddr;
+ else
+ saddr = find_first_bit(pnd->addrs, 64) << 2;
+ } else
+ saddr = PN_NO_ADDR;
+ rcu_read_unlock();
+
+ if (saddr == PN_NO_ADDR) {
+ /* Fallback to another device */
+ struct net_device *def_dev;
+
+ def_dev = phonet_device_get(dev_net(dev));
+ if (def_dev) {
+ if (def_dev != dev)
+ saddr = phonet_address_get(def_dev, daddr);
+ dev_put(def_dev);
+ }
+ }
+ return saddr;
+}
+
+int phonet_address_lookup(struct net *net, u8 addr)
+{
+ struct phonet_device_list *pndevs = phonet_device_list(net);
+ struct phonet_device *pnd;
+ int err = -EADDRNOTAVAIL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+ /* Don't allow unregistering devices! */
+ if ((pnd->netdev->reg_state != NETREG_REGISTERED) ||
+ ((pnd->netdev->flags & IFF_UP)) != IFF_UP)
+ continue;
+
+ if (test_bit(addr >> 2, pnd->addrs)) {
+ err = 0;
+ goto found;
+ }
+ }
+found:
+ rcu_read_unlock();
+ return err;
+}
+
+/* automatically configure a Phonet device, if supported */
+static int phonet_device_autoconf(struct net_device *dev)
+{
+ struct if_phonet_req req;
+ int ret;
+
+ if (!dev->netdev_ops->ndo_do_ioctl)
+ return -EOPNOTSUPP;
+
+ ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req,
+ SIOCPNGAUTOCONF);
+ if (ret < 0)
+ return ret;
+
+ ASSERT_RTNL();
+ ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device);
+ if (ret)
+ return ret;
+ phonet_address_notify(RTM_NEWADDR, dev,
+ req.ifr_phonet_autoconf.device);
+ return 0;
+}
+
+static void phonet_route_autodel(struct net_device *dev)
+{
+ struct phonet_net *pnn = phonet_pernet(dev_net(dev));
+ unsigned int i;
+ DECLARE_BITMAP(deleted, 64);
+
+ /* Remove left-over Phonet routes */
+ bitmap_zero(deleted, 64);
+ mutex_lock(&pnn->routes.lock);
+ for (i = 0; i < 64; i++)
+ if (rcu_access_pointer(pnn->routes.table[i]) == dev) {
+ RCU_INIT_POINTER(pnn->routes.table[i], NULL);
+ set_bit(i, deleted);
+ }
+ mutex_unlock(&pnn->routes.lock);
+
+ if (bitmap_empty(deleted, 64))
+ return; /* short-circuit RCU */
+ synchronize_rcu();
+ for_each_set_bit(i, deleted, 64) {
+ rtm_phonet_notify(RTM_DELROUTE, dev, i);
+ dev_put(dev);
+ }
+}
+
+/* notify Phonet of device events */
+static int phonet_device_notify(struct notifier_block *me, unsigned long what,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (what) {
+ case NETDEV_REGISTER:
+ if (dev->type == ARPHRD_PHONET)
+ phonet_device_autoconf(dev);
+ break;
+ case NETDEV_UNREGISTER:
+ phonet_device_destroy(dev);
+ phonet_route_autodel(dev);
+ break;
+ }
+ return 0;
+
+}
+
+static struct notifier_block phonet_device_notifier = {
+ .notifier_call = phonet_device_notify,
+ .priority = 0,
+};
+
+/* Per-namespace Phonet devices handling */
+static int __net_init phonet_init_net(struct net *net)
+{
+ struct phonet_net *pnn = phonet_pernet(net);
+
+ if (!proc_create("phonet", 0, net->proc_net, &pn_sock_seq_fops))
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pnn->pndevs.list);
+ mutex_init(&pnn->pndevs.lock);
+ mutex_init(&pnn->routes.lock);
+ return 0;
+}
+
+static void __net_exit phonet_exit_net(struct net *net)
+{
+ remove_proc_entry("phonet", net->proc_net);
+}
+
+static struct pernet_operations phonet_net_ops = {
+ .init = phonet_init_net,
+ .exit = phonet_exit_net,
+ .id = &phonet_net_id,
+ .size = sizeof(struct phonet_net),
+};
+
+/* Initialize Phonet devices list */
+int __init phonet_device_init(void)
+{
+ int err = register_pernet_subsys(&phonet_net_ops);
+ if (err)
+ return err;
+
+ proc_create("pnresource", 0, init_net.proc_net, &pn_res_seq_fops);
+ register_netdevice_notifier(&phonet_device_notifier);
+ err = phonet_netlink_register();
+ if (err)
+ phonet_device_exit();
+ return err;
+}
+
+void phonet_device_exit(void)
+{
+ rtnl_unregister_all(PF_PHONET);
+ unregister_netdevice_notifier(&phonet_device_notifier);
+ unregister_pernet_subsys(&phonet_net_ops);
+ remove_proc_entry("pnresource", init_net.proc_net);
+}
+
+int phonet_route_add(struct net_device *dev, u8 daddr)
+{
+ struct phonet_net *pnn = phonet_pernet(dev_net(dev));
+ struct phonet_routes *routes = &pnn->routes;
+ int err = -EEXIST;
+
+ daddr = daddr >> 2;
+ mutex_lock(&routes->lock);
+ if (routes->table[daddr] == NULL) {
+ rcu_assign_pointer(routes->table[daddr], dev);
+ dev_hold(dev);
+ err = 0;
+ }
+ mutex_unlock(&routes->lock);
+ return err;
+}
+
+int phonet_route_del(struct net_device *dev, u8 daddr)
+{
+ struct phonet_net *pnn = phonet_pernet(dev_net(dev));
+ struct phonet_routes *routes = &pnn->routes;
+
+ daddr = daddr >> 2;
+ mutex_lock(&routes->lock);
+ if (rcu_access_pointer(routes->table[daddr]) == dev)
+ RCU_INIT_POINTER(routes->table[daddr], NULL);
+ else
+ dev = NULL;
+ mutex_unlock(&routes->lock);
+
+ if (!dev)
+ return -ENOENT;
+ synchronize_rcu();
+ dev_put(dev);
+ return 0;
+}
+
+struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr)
+{
+ struct phonet_net *pnn = phonet_pernet(net);
+ struct phonet_routes *routes = &pnn->routes;
+ struct net_device *dev;
+
+ daddr >>= 2;
+ dev = rcu_dereference(routes->table[daddr]);
+ return dev;
+}
+
+struct net_device *phonet_route_output(struct net *net, u8 daddr)
+{
+ struct phonet_net *pnn = phonet_pernet(net);
+ struct phonet_routes *routes = &pnn->routes;
+ struct net_device *dev;
+
+ daddr >>= 2;
+ rcu_read_lock();
+ dev = rcu_dereference(routes->table[daddr]);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ if (!dev)
+ dev = phonet_device_get(net); /* Default route */
+ return dev;
+}
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
new file mode 100644
index 000000000..bc5ee5fbe
--- /dev/null
+++ b/net/phonet/pn_netlink.c
@@ -0,0 +1,310 @@
+/*
+ * File: pn_netlink.c
+ *
+ * Phonet netlink interface
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Authors: Sakari Ailus <sakari.ailus@nokia.com>
+ * Remi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/phonet.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/phonet/pn_dev.h>
+
+/* Device address handling */
+
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+ u32 portid, u32 seq, int event);
+
+void phonet_address_notify(int event, struct net_device *dev, u8 addr)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(1), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+ err = fill_addr(skb, dev, addr, 0, 0, event);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, dev_net(dev), 0,
+ RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
+}
+
+static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = {
+ [IFA_LOCAL] = { .type = NLA_U8 },
+};
+
+static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFA_MAX+1];
+ struct net_device *dev;
+ struct ifaddrmsg *ifm;
+ int err;
+ u8 pnaddr;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ASSERT_RTNL();
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy);
+ if (err < 0)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ if (tb[IFA_LOCAL] == NULL)
+ return -EINVAL;
+ pnaddr = nla_get_u8(tb[IFA_LOCAL]);
+ if (pnaddr & 3)
+ /* Phonet addresses only have 6 high-order bits */
+ return -EINVAL;
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ if (dev == NULL)
+ return -ENODEV;
+
+ if (nlh->nlmsg_type == RTM_NEWADDR)
+ err = phonet_address_add(dev, pnaddr);
+ else
+ err = phonet_address_del(dev, pnaddr);
+ if (!err)
+ phonet_address_notify(nlh->nlmsg_type, dev, pnaddr);
+ return err;
+}
+
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+ u32 portid, u32 seq, int event)
+{
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_PHONET;
+ ifm->ifa_prefixlen = 0;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_LINK;
+ ifm->ifa_index = dev->ifindex;
+ if (nla_put_u8(skb, IFA_LOCAL, addr))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct phonet_device_list *pndevs;
+ struct phonet_device *pnd;
+ int dev_idx = 0, dev_start_idx = cb->args[0];
+ int addr_idx = 0, addr_start_idx = cb->args[1];
+
+ pndevs = phonet_device_list(sock_net(skb->sk));
+ rcu_read_lock();
+ list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+ u8 addr;
+
+ if (dev_idx > dev_start_idx)
+ addr_start_idx = 0;
+ if (dev_idx++ < dev_start_idx)
+ continue;
+
+ addr_idx = 0;
+ for_each_set_bit(addr, pnd->addrs, 64) {
+ if (addr_idx++ < addr_start_idx)
+ continue;
+
+ if (fill_addr(skb, pnd->netdev, addr << 2,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0)
+ goto out;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ cb->args[0] = dev_idx;
+ cb->args[1] = addr_idx;
+
+ return skb->len;
+}
+
+/* Routes handling */
+
+static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst,
+ u32 portid, u32 seq, int event)
+{
+ struct rtmsg *rtm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = AF_PHONET;
+ rtm->rtm_dst_len = 6;
+ rtm->rtm_src_len = 0;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = RT_TABLE_MAIN;
+ rtm->rtm_protocol = RTPROT_STATIC;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ rtm->rtm_type = RTN_UNICAST;
+ rtm->rtm_flags = 0;
+ if (nla_put_u8(skb, RTA_DST, dst) ||
+ nla_put_u32(skb, RTA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+void rtm_phonet_notify(int event, struct net_device *dev, u8 dst)
+{
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(1) + nla_total_size(4), GFP_KERNEL);
+ if (skb == NULL)
+ goto errout;
+ err = fill_route(skb, dev, dst, 0, 0, event);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, dev_net(dev), 0,
+ RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL);
+ return;
+errout:
+ rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err);
+}
+
+static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = {
+ [RTA_DST] = { .type = NLA_U8 },
+ [RTA_OIF] = { .type = NLA_U32 },
+};
+
+static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[RTA_MAX+1];
+ struct net_device *dev;
+ struct rtmsg *rtm;
+ int err;
+ u8 dst;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ASSERT_RTNL();
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy);
+ if (err < 0)
+ return err;
+
+ rtm = nlmsg_data(nlh);
+ if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST)
+ return -EINVAL;
+ if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL)
+ return -EINVAL;
+ dst = nla_get_u8(tb[RTA_DST]);
+ if (dst & 3) /* Phonet addresses only have 6 high-order bits */
+ return -EINVAL;
+
+ dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+ if (dev == NULL)
+ return -ENODEV;
+
+ if (nlh->nlmsg_type == RTM_NEWROUTE)
+ err = phonet_route_add(dev, dst);
+ else
+ err = phonet_route_del(dev, dst);
+ if (!err)
+ rtm_phonet_notify(nlh->nlmsg_type, dev, dst);
+ return err;
+}
+
+static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ u8 addr;
+
+ rcu_read_lock();
+ for (addr = cb->args[0]; addr < 64; addr++) {
+ struct net_device *dev = phonet_route_get_rcu(net, addr << 2);
+
+ if (!dev)
+ continue;
+
+ if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE) < 0)
+ goto out;
+ }
+
+out:
+ rcu_read_unlock();
+ cb->args[0] = addr;
+
+ return skb->len;
+}
+
+int __init phonet_netlink_register(void)
+{
+ int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit,
+ NULL, NULL);
+ if (err)
+ return err;
+
+ /* Further __rtnl_register() cannot fail */
+ __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, NULL);
+ __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, NULL);
+ __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, NULL);
+ __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, NULL);
+ __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, NULL);
+ return 0;
+}
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
new file mode 100644
index 000000000..d575ef4e9
--- /dev/null
+++ b/net/phonet/socket.c
@@ -0,0 +1,823 @@
+/*
+ * File: socket.c
+ *
+ * Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Authors: Sakari Ailus <sakari.ailus@nokia.com>
+ * Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <linux/phonet.h>
+#include <linux/export.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
+#include <net/phonet/pn_dev.h>
+
+static int pn_socket_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, 0);
+ }
+ return 0;
+}
+
+#define PN_HASHSIZE 16
+#define PN_HASHMASK (PN_HASHSIZE-1)
+
+
+static struct {
+ struct hlist_head hlist[PN_HASHSIZE];
+ struct mutex lock;
+} pnsocks;
+
+void __init pn_sock_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < PN_HASHSIZE; i++)
+ INIT_HLIST_HEAD(pnsocks.hlist + i);
+ mutex_init(&pnsocks.lock);
+}
+
+static struct hlist_head *pn_hash_list(u16 obj)
+{
+ return pnsocks.hlist + (obj & PN_HASHMASK);
+}
+
+/*
+ * Find address based on socket address, match only certain fields.
+ * Also grab sock if it was found. Remember to sock_put it later.
+ */
+struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn)
+{
+ struct sock *sknode;
+ struct sock *rval = NULL;
+ u16 obj = pn_sockaddr_get_object(spn);
+ u8 res = spn->spn_resource;
+ struct hlist_head *hlist = pn_hash_list(obj);
+
+ rcu_read_lock();
+ sk_for_each_rcu(sknode, hlist) {
+ struct pn_sock *pn = pn_sk(sknode);
+ BUG_ON(!pn->sobject); /* unbound socket */
+
+ if (!net_eq(sock_net(sknode), net))
+ continue;
+ if (pn_port(obj)) {
+ /* Look up socket by port */
+ if (pn_port(pn->sobject) != pn_port(obj))
+ continue;
+ } else {
+ /* If port is zero, look up by resource */
+ if (pn->resource != res)
+ continue;
+ }
+ if (pn_addr(pn->sobject) &&
+ pn_addr(pn->sobject) != pn_addr(obj))
+ continue;
+
+ rval = sknode;
+ sock_hold(sknode);
+ break;
+ }
+ rcu_read_unlock();
+
+ return rval;
+}
+
+/* Deliver a broadcast packet (only in bottom-half) */
+void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
+{
+ struct hlist_head *hlist = pnsocks.hlist;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < PN_HASHSIZE; h++) {
+ struct sock *sknode;
+
+ sk_for_each(sknode, hlist) {
+ struct sk_buff *clone;
+
+ if (!net_eq(sock_net(sknode), net))
+ continue;
+ if (!sock_flag(sknode, SOCK_BROADCAST))
+ continue;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone) {
+ sock_hold(sknode);
+ sk_receive_skb(sknode, clone, 0);
+ }
+ }
+ hlist++;
+ }
+ rcu_read_unlock();
+}
+
+void pn_sock_hash(struct sock *sk)
+{
+ struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject);
+
+ mutex_lock(&pnsocks.lock);
+ sk_add_node_rcu(sk, hlist);
+ mutex_unlock(&pnsocks.lock);
+}
+EXPORT_SYMBOL(pn_sock_hash);
+
+void pn_sock_unhash(struct sock *sk)
+{
+ mutex_lock(&pnsocks.lock);
+ sk_del_node_init_rcu(sk);
+ mutex_unlock(&pnsocks.lock);
+ pn_sock_unbind_all_res(sk);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(pn_sock_unhash);
+
+static DEFINE_MUTEX(port_mutex);
+
+static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
+{
+ struct sock *sk = sock->sk;
+ struct pn_sock *pn = pn_sk(sk);
+ struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+ int err;
+ u16 handle;
+ u8 saddr;
+
+ if (sk->sk_prot->bind)
+ return sk->sk_prot->bind(sk, addr, len);
+
+ if (len < sizeof(struct sockaddr_pn))
+ return -EINVAL;
+ if (spn->spn_family != AF_PHONET)
+ return -EAFNOSUPPORT;
+
+ handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr);
+ saddr = pn_addr(handle);
+ if (saddr && phonet_address_lookup(sock_net(sk), saddr))
+ return -EADDRNOTAVAIL;
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) {
+ err = -EINVAL; /* attempt to rebind */
+ goto out;
+ }
+ WARN_ON(sk_hashed(sk));
+ mutex_lock(&port_mutex);
+ err = sk->sk_prot->get_port(sk, pn_port(handle));
+ if (err)
+ goto out_port;
+
+ /* get_port() sets the port, bind() sets the address if applicable */
+ pn->sobject = pn_object(saddr, pn_port(pn->sobject));
+ pn->resource = spn->spn_resource;
+
+ /* Enable RX on the socket */
+ sk->sk_prot->hash(sk);
+out_port:
+ mutex_unlock(&port_mutex);
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int pn_socket_autobind(struct socket *sock)
+{
+ struct sockaddr_pn sa;
+ int err;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.spn_family = AF_PHONET;
+ err = pn_socket_bind(sock, (struct sockaddr *)&sa,
+ sizeof(struct sockaddr_pn));
+ if (err != -EINVAL)
+ return err;
+ BUG_ON(!pn_port(pn_sk(sock->sk)->sobject));
+ return 0; /* socket was already bound */
+}
+
+static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
+ int len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct pn_sock *pn = pn_sk(sk);
+ struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+ struct task_struct *tsk = current;
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ int err;
+
+ if (pn_socket_autobind(sock))
+ return -ENOBUFS;
+ if (len < sizeof(struct sockaddr_pn))
+ return -EINVAL;
+ if (spn->spn_family != AF_PHONET)
+ return -EAFNOSUPPORT;
+
+ lock_sock(sk);
+
+ switch (sock->state) {
+ case SS_UNCONNECTED:
+ if (sk->sk_state != TCP_CLOSE) {
+ err = -EISCONN;
+ goto out;
+ }
+ break;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ goto out;
+ default:
+ err = -EISCONN;
+ goto out;
+ }
+
+ pn->dobject = pn_sockaddr_get_object(spn);
+ pn->resource = pn_sockaddr_get_resource(spn);
+ sock->state = SS_CONNECTING;
+
+ err = sk->sk_prot->connect(sk, addr, len);
+ if (err) {
+ sock->state = SS_UNCONNECTED;
+ pn->dobject = 0;
+ goto out;
+ }
+
+ while (sk->sk_state == TCP_SYN_SENT) {
+ DEFINE_WAIT(wait);
+
+ if (!timeo) {
+ err = -EINPROGRESS;
+ goto out;
+ }
+ if (signal_pending(tsk)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ finish_wait(sk_sleep(sk), &wait);
+ }
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
+ err = 0;
+ else if (sk->sk_state == TCP_CLOSE_WAIT)
+ err = -ECONNRESET;
+ else
+ err = -ECONNREFUSED;
+ sock->state = err ? SS_UNCONNECTED : SS_CONNECTED;
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int pn_socket_accept(struct socket *sock, struct socket *newsock,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sock *newsk;
+ int err;
+
+ if (unlikely(sk->sk_state != TCP_LISTEN))
+ return -EINVAL;
+
+ newsk = sk->sk_prot->accept(sk, flags, &err);
+ if (!newsk)
+ return err;
+
+ lock_sock(newsk);
+ sock_graft(newsk, newsock);
+ newsock->state = SS_CONNECTED;
+ release_sock(newsk);
+ return 0;
+}
+
+static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
+ int *sockaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct pn_sock *pn = pn_sk(sk);
+
+ memset(addr, 0, sizeof(struct sockaddr_pn));
+ addr->sa_family = AF_PHONET;
+ if (!peer) /* Race with bind() here is userland's problem. */
+ pn_sockaddr_set_object((struct sockaddr_pn *)addr,
+ pn->sobject);
+
+ *sockaddr_len = sizeof(struct sockaddr_pn);
+ return 0;
+}
+
+static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct pep_sock *pn = pep_sk(sk);
+ unsigned int mask = 0;
+
+ poll_wait(file, sk_sleep(sk), wait);
+
+ if (sk->sk_state == TCP_CLOSE)
+ return POLLERR;
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+ if (!skb_queue_empty(&pn->ctrlreq_queue))
+ mask |= POLLPRI;
+ if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
+ return POLLHUP;
+
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
+ atomic_read(&pn->tx_credits))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ return mask;
+}
+
+static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct pn_sock *pn = pn_sk(sk);
+
+ if (cmd == SIOCPNGETOBJECT) {
+ struct net_device *dev;
+ u16 handle;
+ u8 saddr;
+
+ if (get_user(handle, (__u16 __user *)arg))
+ return -EFAULT;
+
+ lock_sock(sk);
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(sock_net(sk),
+ sk->sk_bound_dev_if);
+ else
+ dev = phonet_device_get(sock_net(sk));
+ if (dev && (dev->flags & IFF_UP))
+ saddr = phonet_address_get(dev, pn_addr(handle));
+ else
+ saddr = PN_NO_ADDR;
+ release_sock(sk);
+
+ if (dev)
+ dev_put(dev);
+ if (saddr == PN_NO_ADDR)
+ return -EHOSTUNREACH;
+
+ handle = pn_object(saddr, pn_port(pn->sobject));
+ return put_user(handle, (__u16 __user *)arg);
+ }
+
+ return sk->sk_prot->ioctl(sk, cmd, arg);
+}
+
+static int pn_socket_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ if (pn_socket_autobind(sock))
+ return -ENOBUFS;
+
+ lock_sock(sk);
+ if (sock->state != SS_UNCONNECTED) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (sk->sk_state != TCP_LISTEN) {
+ sk->sk_state = TCP_LISTEN;
+ sk->sk_ack_backlog = 0;
+ }
+ sk->sk_max_ack_backlog = backlog;
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int pn_socket_sendmsg(struct socket *sock, struct msghdr *m,
+ size_t total_len)
+{
+ struct sock *sk = sock->sk;
+
+ if (pn_socket_autobind(sock))
+ return -EAGAIN;
+
+ return sk->sk_prot->sendmsg(sk, m, total_len);
+}
+
+const struct proto_ops phonet_dgram_ops = {
+ .family = AF_PHONET,
+ .owner = THIS_MODULE,
+ .release = pn_socket_release,
+ .bind = pn_socket_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = pn_socket_getname,
+ .poll = datagram_poll,
+ .ioctl = pn_socket_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = sock_no_setsockopt,
+ .compat_getsockopt = sock_no_getsockopt,
+#endif
+ .sendmsg = pn_socket_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+const struct proto_ops phonet_stream_ops = {
+ .family = AF_PHONET,
+ .owner = THIS_MODULE,
+ .release = pn_socket_release,
+ .bind = pn_socket_bind,
+ .connect = pn_socket_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = pn_socket_accept,
+ .getname = pn_socket_getname,
+ .poll = pn_socket_poll,
+ .ioctl = pn_socket_ioctl,
+ .listen = pn_socket_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+ .sendmsg = pn_socket_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+EXPORT_SYMBOL(phonet_stream_ops);
+
+/* allocate port for a socket */
+int pn_sock_get_port(struct sock *sk, unsigned short sport)
+{
+ static int port_cur;
+ struct net *net = sock_net(sk);
+ struct pn_sock *pn = pn_sk(sk);
+ struct sockaddr_pn try_sa;
+ struct sock *tmpsk;
+
+ memset(&try_sa, 0, sizeof(struct sockaddr_pn));
+ try_sa.spn_family = AF_PHONET;
+ WARN_ON(!mutex_is_locked(&port_mutex));
+ if (!sport) {
+ /* search free port */
+ int port, pmin, pmax;
+
+ phonet_get_local_port_range(&pmin, &pmax);
+ for (port = pmin; port <= pmax; port++) {
+ port_cur++;
+ if (port_cur < pmin || port_cur > pmax)
+ port_cur = pmin;
+
+ pn_sockaddr_set_port(&try_sa, port_cur);
+ tmpsk = pn_find_sock_by_sa(net, &try_sa);
+ if (tmpsk == NULL) {
+ sport = port_cur;
+ goto found;
+ } else
+ sock_put(tmpsk);
+ }
+ } else {
+ /* try to find specific port */
+ pn_sockaddr_set_port(&try_sa, sport);
+ tmpsk = pn_find_sock_by_sa(net, &try_sa);
+ if (tmpsk == NULL)
+ /* No sock there! We can use that port... */
+ goto found;
+ else
+ sock_put(tmpsk);
+ }
+ /* the port must be in use already */
+ return -EADDRINUSE;
+
+found:
+ pn->sobject = pn_object(pn_addr(pn->sobject), sport);
+ return 0;
+}
+EXPORT_SYMBOL(pn_sock_get_port);
+
+#ifdef CONFIG_PROC_FS
+static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct hlist_head *hlist = pnsocks.hlist;
+ struct sock *sknode;
+ unsigned int h;
+
+ for (h = 0; h < PN_HASHSIZE; h++) {
+ sk_for_each_rcu(sknode, hlist) {
+ if (!net_eq(net, sock_net(sknode)))
+ continue;
+ if (!pos)
+ return sknode;
+ pos--;
+ }
+ hlist++;
+ }
+ return NULL;
+}
+
+static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk)
+{
+ struct net *net = seq_file_net(seq);
+
+ do
+ sk = sk_next(sk);
+ while (sk && !net_eq(net, sock_net(sk)));
+
+ return sk;
+}
+
+static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return *pos ? pn_sock_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock *sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = pn_sock_get_idx(seq, 0);
+ else
+ sk = pn_sock_get_next(seq, v);
+ (*pos)++;
+ return sk;
+}
+
+static void pn_sock_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int pn_sock_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 127);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "pt loc rem rs st tx_queue rx_queue "
+ " uid inode ref pointer drops");
+ else {
+ struct sock *sk = v;
+ struct pn_sock *pn = pn_sk(sk);
+
+ seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
+ "%d %pK %d",
+ sk->sk_protocol, pn->sobject, pn->dobject,
+ pn->resource, sk->sk_state,
+ sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk),
+ atomic_read(&sk->sk_refcnt), sk,
+ atomic_read(&sk->sk_drops));
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct seq_operations pn_sock_seq_ops = {
+ .start = pn_sock_seq_start,
+ .next = pn_sock_seq_next,
+ .stop = pn_sock_seq_stop,
+ .show = pn_sock_seq_show,
+};
+
+static int pn_sock_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &pn_sock_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+const struct file_operations pn_sock_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pn_sock_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+static struct {
+ struct sock *sk[256];
+} pnres;
+
+/*
+ * Find and hold socket based on resource.
+ */
+struct sock *pn_find_sock_by_res(struct net *net, u8 res)
+{
+ struct sock *sk;
+
+ if (!net_eq(net, &init_net))
+ return NULL;
+
+ rcu_read_lock();
+ sk = rcu_dereference(pnres.sk[res]);
+ if (sk)
+ sock_hold(sk);
+ rcu_read_unlock();
+ return sk;
+}
+
+static DEFINE_MUTEX(resource_mutex);
+
+int pn_sock_bind_res(struct sock *sk, u8 res)
+{
+ int ret = -EADDRINUSE;
+
+ if (!net_eq(sock_net(sk), &init_net))
+ return -ENOIOCTLCMD;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (pn_socket_autobind(sk->sk_socket))
+ return -EAGAIN;
+
+ mutex_lock(&resource_mutex);
+ if (pnres.sk[res] == NULL) {
+ sock_hold(sk);
+ rcu_assign_pointer(pnres.sk[res], sk);
+ ret = 0;
+ }
+ mutex_unlock(&resource_mutex);
+ return ret;
+}
+
+int pn_sock_unbind_res(struct sock *sk, u8 res)
+{
+ int ret = -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&resource_mutex);
+ if (pnres.sk[res] == sk) {
+ RCU_INIT_POINTER(pnres.sk[res], NULL);
+ ret = 0;
+ }
+ mutex_unlock(&resource_mutex);
+
+ if (ret == 0) {
+ synchronize_rcu();
+ sock_put(sk);
+ }
+ return ret;
+}
+
+void pn_sock_unbind_all_res(struct sock *sk)
+{
+ unsigned int res, match = 0;
+
+ mutex_lock(&resource_mutex);
+ for (res = 0; res < 256; res++) {
+ if (pnres.sk[res] == sk) {
+ RCU_INIT_POINTER(pnres.sk[res], NULL);
+ match++;
+ }
+ }
+ mutex_unlock(&resource_mutex);
+
+ while (match > 0) {
+ __sock_put(sk);
+ match--;
+ }
+ /* Caller is responsible for RCU sync before final sock_put() */
+}
+
+#ifdef CONFIG_PROC_FS
+static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned int i;
+
+ if (!net_eq(net, &init_net))
+ return NULL;
+
+ for (i = 0; i < 256; i++) {
+ if (pnres.sk[i] == NULL)
+ continue;
+ if (!pos)
+ return pnres.sk + i;
+ pos--;
+ }
+ return NULL;
+}
+
+static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned int i;
+
+ BUG_ON(!net_eq(net, &init_net));
+
+ for (i = (sk - pnres.sk) + 1; i < 256; i++)
+ if (pnres.sk[i])
+ return pnres.sk + i;
+ return NULL;
+}
+
+static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(resource_mutex)
+{
+ mutex_lock(&resource_mutex);
+ return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock **sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = pn_res_get_idx(seq, 0);
+ else
+ sk = pn_res_get_next(seq, v);
+ (*pos)++;
+ return sk;
+}
+
+static void pn_res_seq_stop(struct seq_file *seq, void *v)
+ __releases(resource_mutex)
+{
+ mutex_unlock(&resource_mutex);
+}
+
+static int pn_res_seq_show(struct seq_file *seq, void *v)
+{
+ seq_setwidth(seq, 63);
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "rs uid inode");
+ else {
+ struct sock **psk = v;
+ struct sock *sk = *psk;
+
+ seq_printf(seq, "%02X %5u %lu",
+ (int) (psk - pnres.sk),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk));
+ }
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static const struct seq_operations pn_res_seq_ops = {
+ .start = pn_res_seq_start,
+ .next = pn_res_seq_next,
+ .stop = pn_res_seq_stop,
+ .show = pn_res_seq_show,
+};
+
+static int pn_res_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &pn_res_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+const struct file_operations pn_res_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pn_res_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
new file mode 100644
index 000000000..c02a8c4bc
--- /dev/null
+++ b/net/phonet/sysctl.c
@@ -0,0 +1,110 @@
+/*
+ * File: sysctl.c
+ *
+ * Phonet /proc/sys/net/phonet interface implementation
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/seqlock.h>
+#include <linux/sysctl.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <net/sock.h>
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+
+#define DYNAMIC_PORT_MIN 0x40
+#define DYNAMIC_PORT_MAX 0x7f
+
+static DEFINE_SEQLOCK(local_port_range_lock);
+static int local_port_range_min[2] = {0, 0};
+static int local_port_range_max[2] = {1023, 1023};
+static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX};
+static struct ctl_table_header *phonet_table_hrd;
+
+static void set_local_port_range(int range[2])
+{
+ write_seqlock(&local_port_range_lock);
+ local_port_range[0] = range[0];
+ local_port_range[1] = range[1];
+ write_sequnlock(&local_port_range_lock);
+}
+
+void phonet_get_local_port_range(int *min, int *max)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&local_port_range_lock);
+ if (min)
+ *min = local_port_range[0];
+ if (max)
+ *max = local_port_range[1];
+ } while (read_seqretry(&local_port_range_lock, seq));
+}
+
+static int proc_local_port_range(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+ int range[2] = {local_port_range[0], local_port_range[1]};
+ struct ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &local_port_range_min,
+ .extra2 = &local_port_range_max,
+ };
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (range[1] < range[0])
+ ret = -EINVAL;
+ else
+ set_local_port_range(range);
+ }
+
+ return ret;
+}
+
+static struct ctl_table phonet_table[] = {
+ {
+ .procname = "local_port_range",
+ .data = &local_port_range,
+ .maxlen = sizeof(local_port_range),
+ .mode = 0644,
+ .proc_handler = proc_local_port_range,
+ },
+ { }
+};
+
+int __init phonet_sysctl_init(void)
+{
+ phonet_table_hrd = register_net_sysctl(&init_net, "net/phonet", phonet_table);
+ return phonet_table_hrd == NULL ? -ENOMEM : 0;
+}
+
+void phonet_sysctl_exit(void)
+{
+ unregister_net_sysctl_table(phonet_table_hrd);
+}
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000..f2c670ba7
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,28 @@
+
+config RDS
+ tristate "The RDS Protocol"
+ depends on INET
+ ---help---
+ The RDS (Reliable Datagram Sockets) protocol provides reliable,
+ sequenced delivery of datagrams over Infiniband, iWARP,
+ or TCP.
+
+config RDS_RDMA
+ tristate "RDS over Infiniband and iWARP"
+ depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+ ---help---
+ Allow RDS to use Infiniband and iWARP as a transport.
+ This transport supports RDMA operations.
+
+config RDS_TCP
+ tristate "RDS over TCP"
+ depends on RDS
+ ---help---
+ Allow RDS to use TCP as a transport.
+ This transport does not support RDMA operations.
+
+config RDS_DEBUG
+ bool "RDS debugging messages"
+ depends on RDS
+ default n
+
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000..56d3f6023
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_RDS) += rds.o
+rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
+ recv.o send.o stats.o sysctl.o threads.o transport.o \
+ loop.o page.o rdma.o
+
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-y := rdma_transport.o \
+ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+ ib_sysctl.o ib_rdma.o \
+ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
+ iw_sysctl.o iw_rdma.o
+
+
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+ tcp_send.o tcp_stats.o
+
+ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG
+
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000..10443377f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+
+#include "rds.h"
+
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+ if ((index < elements) && array[index])
+ return array[index];
+ else
+ return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
+
+/* this is just used for stats gathering :/ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path. sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+static int rds_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs;
+
+ if (!sk)
+ goto out;
+
+ rs = rds_sk_to_rs(sk);
+
+ sock_orphan(sk);
+ /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
+ * that ensures the recv path has completed messing
+ * with the socket. */
+ rds_clear_recv_queue(rs);
+ rds_cong_remove_socket(rs);
+
+ /*
+ * the binding lookup hash uses rcu, we need to
+ * make sure we synchronize_rcu before we free our
+ * entry
+ */
+ rds_remove_bound(rs);
+ synchronize_rcu();
+
+ rds_send_drop_to(rs, NULL);
+ rds_rdma_drop_keys(rs);
+ rds_notify_queue_get(rs, NULL);
+
+ spin_lock_bh(&rds_sock_lock);
+ list_del_init(&rs->rs_item);
+ rds_sock_count--;
+ spin_unlock_bh(&rds_sock_lock);
+
+ rds_trans_put(rs->rs_transport);
+
+ sock->sk = NULL;
+ sock_put(sk);
+out:
+ return 0;
+}
+
+/*
+ * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ __rds_wake_sk_sleep(rds_rs_to_sk(rs));
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ /* racey, don't care */
+ if (peer) {
+ if (!rs->rs_conn_addr)
+ return -ENOTCONN;
+
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr;
+ } else {
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr;
+ }
+
+ sin->sin_family = AF_INET;
+
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ * - there is data on the receive queue.
+ * - to signal that a previously congested destination may have become
+ * uncongested
+ * - A notification has been queued to the socket (this can be a congestion
+ * update, or a RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ */
+static unsigned int rds_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ unsigned int mask = 0;
+ unsigned long flags;
+
+ poll_wait(file, sk_sleep(sk), wait);
+
+ if (rs->rs_seen_congestion)
+ poll_wait(file, &rds_poll_waitq, wait);
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!rs->rs_cong_monitor) {
+ /* When a congestion map was updated, we signal POLLIN for
+ * "historical" reasons. Applications can also poll for
+ * WRBAND instead. */
+ if (rds_cong_updated_since(&rs->rs_cong_track))
+ mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+ } else {
+ spin_lock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ mask |= (POLLIN | POLLRDNORM);
+ spin_unlock(&rs->rs_lock);
+ }
+ if (!list_empty(&rs->rs_recv_queue) ||
+ !list_empty(&rs->rs_notify_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+ mask |= (POLLOUT | POLLWRNORM);
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ /* clear state any time we wake a seen-congested socket */
+ if (mask)
+ rs->rs_seen_congestion = 0;
+
+ return mask;
+}
+
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
+static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
+ int len)
+{
+ struct sockaddr_in sin;
+ int ret = 0;
+
+ /* racing with another thread binding seems ok here */
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_from_user(&sin, optval, sizeof(sin))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_send_drop_to(rs, &sin);
+out:
+ return ret;
+}
+
+static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+ int optlen)
+{
+ int value;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(value, (int __user *) optval))
+ return -EFAULT;
+ *optvar = !!value;
+ return 0;
+}
+
+static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ int ret;
+
+ ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+ if (ret == 0) {
+ if (rs->rs_cong_monitor) {
+ rds_cong_add_socket(rs);
+ } else {
+ rds_cong_remove_socket(rs);
+ rs->rs_cong_mask = 0;
+ rs->rs_cong_notify = 0;
+ }
+ }
+ return ret;
+}
+
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret;
+
+ if (level != SOL_RDS) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_CANCEL_SENT_TO:
+ ret = rds_cancel_sent_to(rs, optval, optlen);
+ break;
+ case RDS_GET_MR:
+ ret = rds_get_mr(rs, optval, optlen);
+ break;
+ case RDS_GET_MR_FOR_DEST:
+ ret = rds_get_mr_for_dest(rs, optval, optlen);
+ break;
+ case RDS_FREE_MR:
+ ret = rds_free_mr(rs, optval, optlen);
+ break;
+ case RDS_RECVERR:
+ ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+ break;
+ case RDS_CONG_MONITOR:
+ ret = rds_cong_monitor(rs, optval, optlen);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+out:
+ return ret;
+}
+
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret = -ENOPROTOOPT, len;
+
+ if (level != SOL_RDS)
+ goto out;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_INFO_FIRST ... RDS_INFO_LAST:
+ ret = rds_info_getsockopt(sock, optname, optval,
+ optlen);
+ break;
+
+ case RDS_RECVERR:
+ if (len < sizeof(int))
+ ret = -EINVAL;
+ else
+ if (put_user(rs->rs_recverr, (int __user *) optval) ||
+ put_user(sizeof(int), optlen))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+out:
+ return ret;
+
+}
+
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (sin->sin_family != AF_INET) {
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ goto out;
+ }
+
+ rs->rs_conn_addr = sin->sin_addr.s_addr;
+ rs->rs_conn_port = sin->sin_port;
+
+out:
+ release_sock(sk);
+ return ret;
+}
+
+static struct proto rds_proto = {
+ .name = "RDS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rds_sock),
+};
+
+static const struct proto_ops rds_proto_ops = {
+ .family = AF_RDS,
+ .owner = THIS_MODULE,
+ .release = rds_release,
+ .bind = rds_bind,
+ .connect = rds_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = rds_getname,
+ .poll = rds_poll,
+ .ioctl = rds_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = rds_setsockopt,
+ .getsockopt = rds_getsockopt,
+ .sendmsg = rds_sendmsg,
+ .recvmsg = rds_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+ struct rds_sock *rs;
+
+ sock_init_data(sock, sk);
+ sock->ops = &rds_proto_ops;
+ sk->sk_protocol = protocol;
+
+ rs = rds_sk_to_rs(sk);
+ spin_lock_init(&rs->rs_lock);
+ rwlock_init(&rs->rs_recv_lock);
+ INIT_LIST_HEAD(&rs->rs_send_queue);
+ INIT_LIST_HEAD(&rs->rs_recv_queue);
+ INIT_LIST_HEAD(&rs->rs_notify_queue);
+ INIT_LIST_HEAD(&rs->rs_cong_list);
+ spin_lock_init(&rs->rs_rdma_lock);
+ rs->rs_rdma_keys = RB_ROOT;
+
+ spin_lock_bh(&rds_sock_lock);
+ list_add_tail(&rs->rs_item, &rds_sock_list);
+ rds_sock_count++;
+ spin_unlock_bh(&rds_sock_lock);
+
+ return 0;
+}
+
+static int rds_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ if (sock->type != SOCK_SEQPACKET || protocol)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ return __rds_create(sock, sk, protocol);
+}
+
+void rds_sock_addref(struct rds_sock *rs)
+{
+ sock_hold(rds_rs_to_sk(rs));
+}
+
+void rds_sock_put(struct rds_sock *rs)
+{
+ sock_put(rds_rs_to_sk(rs));
+}
+
+static const struct net_proto_family rds_family_ops = {
+ .family = AF_RDS,
+ .create = rds_create,
+ .owner = THIS_MODULE,
+};
+
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_sock *rs;
+ struct rds_incoming *inc;
+ unsigned int total = 0;
+
+ len /= sizeof(struct rds_info_message);
+
+ spin_lock_bh(&rds_sock_lock);
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ read_lock(&rs->rs_recv_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(inc, iter, inc->i_saddr,
+ rs->rs_bound_addr, 1);
+ }
+
+ read_unlock(&rs->rs_recv_lock);
+ }
+
+ spin_unlock_bh(&rds_sock_lock);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_sock_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_info_socket sinfo;
+ struct rds_sock *rs;
+
+ len /= sizeof(struct rds_info_socket);
+
+ spin_lock_bh(&rds_sock_lock);
+
+ if (len < rds_sock_count)
+ goto out;
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sinfo.sndbuf = rds_sk_sndbuf(rs);
+ sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+ sinfo.bound_addr = rs->rs_bound_addr;
+ sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_port = rs->rs_bound_port;
+ sinfo.connected_port = rs->rs_conn_port;
+ sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+ rds_info_copy(iter, &sinfo, sizeof(sinfo));
+ }
+
+out:
+ lens->nr = rds_sock_count;
+ lens->each = sizeof(struct rds_info_socket);
+
+ spin_unlock_bh(&rds_sock_lock);
+}
+
+static void rds_exit(void)
+{
+ sock_unregister(rds_family_ops.family);
+ proto_unregister(&rds_proto);
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_sysctl_exit();
+ rds_threads_exit();
+ rds_stats_exit();
+ rds_page_exit();
+ rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+}
+module_exit(rds_exit);
+
+static int rds_init(void)
+{
+ int ret;
+
+ ret = rds_conn_init();
+ if (ret)
+ goto out;
+ ret = rds_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rds_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rds_stats_init();
+ if (ret)
+ goto out_sysctl;
+ ret = proto_register(&rds_proto, 1);
+ if (ret)
+ goto out_stats;
+ ret = sock_register(&rds_family_ops);
+ if (ret)
+ goto out_proto;
+
+ rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+
+ goto out;
+
+out_proto:
+ proto_unregister(&rds_proto);
+out_stats:
+ rds_stats_exit();
+out_sysctl:
+ rds_sysctl_exit();
+out_threads:
+ rds_threads_exit();
+out_conn:
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_page_exit();
+out:
+ return ret;
+}
+module_init(rds_init);
+
+#define DRV_VERSION "4.0"
+#define DRV_RELDATE "Feb 12, 2009"
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+ " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000..a2e6562da
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/jhash.h>
+#include <linux/ratelimit.h>
+#include "rds.h"
+
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+static DEFINE_SPINLOCK(rds_bind_lock);
+
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+ return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+ (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+ struct rds_sock *insert)
+{
+ struct rds_sock *rs;
+ struct hlist_head *head = hash_to_bucket(addr, port);
+ u64 cmp;
+ u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
+ cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
+ be16_to_cpu(rs->rs_bound_port);
+
+ if (cmp == needle) {
+ rcu_read_unlock();
+ return rs;
+ }
+ }
+ rcu_read_unlock();
+
+ if (insert) {
+ /*
+ * make sure our addr and port are set before
+ * we are added to the list, other people
+ * in rcu will find us as soon as the
+ * hlist_add_head_rcu is done
+ */
+ insert->rs_bound_addr = addr;
+ insert->rs_bound_port = port;
+ rds_sock_addref(insert);
+
+ hlist_add_head_rcu(&insert->rs_bound_node, head);
+ }
+ return NULL;
+}
+
+/*
+ * Return the rds_sock bound at the given local address.
+ *
+ * The rx path can race with rds_release. We notice if rds_release() has
+ * marked this socket and don't return a rs ref to the rx path.
+ */
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+{
+ struct rds_sock *rs;
+
+ rs = rds_bind_lookup(addr, port, NULL);
+
+ if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
+ rds_sock_addref(rs);
+ else
+ rs = NULL;
+
+ rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
+ ntohs(port));
+ return rs;
+}
+
+/* returns -ve errno or +ve port */
+static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+{
+ unsigned long flags;
+ int ret = -EADDRINUSE;
+ u16 rover, last;
+
+ if (*port != 0) {
+ rover = be16_to_cpu(*port);
+ last = rover;
+ } else {
+ rover = max_t(u16, prandom_u32(), 2);
+ last = rover - 1;
+ }
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ do {
+ if (rover == 0)
+ rover++;
+ if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+ *port = rs->rs_bound_port;
+ ret = 0;
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
+ break;
+ }
+ } while (rover++ != last);
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+ return ret;
+}
+
+void rds_remove_bound(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ if (rs->rs_bound_addr) {
+ rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port));
+
+ hlist_del_init_rcu(&rs->rs_bound_node);
+ rds_sock_put(rs);
+ rs->rs_bound_addr = 0;
+ }
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+}
+
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct rds_transport *trans;
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in) ||
+ sin->sin_family != AF_INET ||
+ rs->rs_bound_addr ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ if (ret)
+ goto out;
+
+ trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ if (!trans) {
+ ret = -EADDRNOTAVAIL;
+ rds_remove_bound(rs);
+ printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
+ "load rds_tcp or rds_rdma?\n");
+ goto out;
+ }
+
+ rs->rs_transport = trans;
+ ret = 0;
+
+out:
+ release_sock(sk);
+
+ /* we might have called rds_remove_bound on error */
+ if (ret)
+ synchronize_rcu();
+ return ret;
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000..e6144b824
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the sockets SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should return block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs. An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested. As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up. This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently. This is much easier to implement than some
+ * finer-grained communication of per-port congestion. The sender does a very
+ * inexpensive bit test to test if the port it's about to send to is congested
+ * or not.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock. It's used so infrequently that it's worth keeping it
+ * global to simplify the locking. It's only used in the following
+ * circumstances:
+ *
+ * - on connection buildup to associate a conn with its maps
+ * - on map changes to inform conns of a new map to send
+ *
+ * It's sadly ordered under the socket callback lock and the connection lock.
+ * Receive paths can mark ports congested from interrupt context so the
+ * lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+ struct rds_cong_map *insert)
+{
+ struct rb_node **p = &rds_cong_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_cong_map *map;
+
+ while (*p) {
+ parent = *p;
+ map = rb_entry(parent, struct rds_cong_map, m_rb_node);
+
+ if (addr < map->m_addr)
+ p = &(*p)->rb_left;
+ else if (addr > map->m_addr)
+ p = &(*p)->rb_right;
+ else
+ return map;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->m_rb_node, parent, p);
+ rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+ }
+ return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address. Connections try and allocate
+ * these bitmaps in the process getting pointers to them. The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+{
+ struct rds_cong_map *map;
+ struct rds_cong_map *ret = NULL;
+ unsigned long zp;
+ unsigned long i;
+ unsigned long flags;
+
+ map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+ if (!map)
+ return NULL;
+
+ map->m_addr = addr;
+ init_waitqueue_head(&map->m_waitq);
+ INIT_LIST_HEAD(&map->m_conn_list);
+
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ zp = get_zeroed_page(GFP_KERNEL);
+ if (zp == 0)
+ goto out;
+ map->m_page_addrs[i] = zp;
+ }
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ ret = rds_cong_tree_walk(addr, map);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (!ret) {
+ ret = map;
+ map = NULL;
+ }
+
+out:
+ if (map) {
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+
+ rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+
+ return ret;
+}
+
+/*
+ * Put the conn on its local map's list. This is called when the conn is
+ * really added to the hash. It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_del_init(&conn->c_map_item);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+ conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+
+ if (!(conn->c_lcong && conn->c_fcong))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+ struct rds_connection *conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+
+ list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+ if (!test_and_set_bit(0, &conn->c_map_queued)) {
+ rds_stats_inc(s_cong_update_queued);
+ /* We cannot inline the call to rds_send_xmit() here
+ * for two reasons (both pertaining to a TCP transport):
+ * 1. When we get here from the receive path, we
+ * are already holding the sock_lock (held by
+ * tcp_v4_rcv()). So inlining calls to
+ * tcp_setsockopt and/or tcp_sendmsg will deadlock
+ * when it tries to get the sock_lock())
+ * 2. Interrupts are masked so that we can mark the
+ * the port congested from both send and recv paths.
+ * (See comment around declaration of rdc_cong_lock).
+ * An attempt to get the sock_lock() here will
+ * therefore trigger warnings.
+ * Defer the xmit to rds_send_worker() instead.
+ */
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+ rdsdebug("waking map %p for %pI4\n",
+ map, &map->m_addr);
+ rds_stats_inc(s_cong_update_received);
+ atomic_inc(&rds_cong_generation);
+ if (waitqueue_active(&map->m_waitq))
+ wake_up(&map->m_waitq);
+ if (waitqueue_active(&rds_poll_waitq))
+ wake_up_all(&rds_poll_waitq);
+
+ if (portmask && !list_empty(&rds_cong_monitor)) {
+ unsigned long flags;
+ struct rds_sock *rs;
+
+ read_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+ spin_lock(&rs->rs_lock);
+ rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+ rs->rs_cong_mask &= ~portmask;
+ spin_unlock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ rds_wake_sk_sleep(rs);
+ }
+ read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+ }
+}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
+
+int rds_cong_updated_since(unsigned long *recent)
+{
+ unsigned long gen = atomic_read(&rds_cong_generation);
+
+ if (likely(*recent == gen))
+ return 0;
+ *recent = gen;
+ return 1;
+}
+
+/*
+ * We're called under the locking that protects the sockets receive buffer
+ * consumption. This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("setting congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ __set_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("clearing congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ return test_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ if (list_empty(&rs->rs_cong_list))
+ list_add(&rs->rs_cong_list, &rds_cong_monitor);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+ struct rds_cong_map *map;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_del_init(&rs->rs_cong_list);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+ /* update congestion map for now-closed port */
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+ rds_cong_clear_bit(map, rs->rs_bound_port);
+ rds_cong_queue_updates(map);
+ }
+}
+
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+ struct rds_sock *rs)
+{
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ if (nonblock) {
+ if (rs && rs->rs_cong_monitor) {
+ unsigned long flags;
+
+ /* It would have been nice to have an atomic set_bit on
+ * a uint64_t. */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ /* Test again - a congestion update may have arrived in
+ * the meantime. */
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ }
+ rds_stats_inc(s_cong_send_error);
+ return -ENOBUFS;
+ }
+
+ rds_stats_inc(s_cong_send_blocked);
+ rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
+
+ return wait_event_interruptible(map->m_waitq,
+ !rds_cong_test_bit(map, port));
+}
+
+void rds_cong_exit(void)
+{
+ struct rb_node *node;
+ struct rds_cong_map *map;
+ unsigned long i;
+
+ while ((node = rb_first(&rds_cong_tree))) {
+ map = rb_entry(node, struct rds_cong_map, m_rb_node);
+ rdsdebug("freeing map %p\n", map);
+ rb_erase(&map->m_rb_node, &rds_cong_tree);
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+}
+
+/*
+ * Allocate a RDS message containing a congestion update.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+ struct rds_cong_map *map = conn->c_lcong;
+ struct rds_message *rm;
+
+ rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
+ if (!IS_ERR(rm))
+ rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+
+ return rm;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000..da6da57e5
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/inet_hashtables.h>
+
+#include "rds.h"
+#include "loop.h"
+
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+{
+ static u32 rds_hash_secret __read_mostly;
+
+ unsigned long hash;
+
+ net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+
+ /* Pass NULL, don't need struct net for hash */
+ hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
+ be32_to_cpu(faddr), 0,
+ rds_hash_secret);
+ return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
+}
+
+#define rds_conn_info_set(var, test, suffix) do { \
+ if (test) \
+ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
+} while (0)
+
+/* rcu read lock must be held or the connection spinlock */
+static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+ __be32 laddr, __be32 faddr,
+ struct rds_transport *trans)
+{
+ struct rds_connection *conn, *ret = NULL;
+
+ hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
+ conn->c_trans == trans) {
+ ret = conn;
+ break;
+ }
+ }
+ rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
+ &laddr, &faddr);
+ return ret;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future. It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+static void rds_conn_reset(struct rds_connection *conn)
+{
+ rdsdebug("connection %pI4 to %pI4 reset\n",
+ &conn->c_laddr, &conn->c_faddr);
+
+ rds_stats_inc(s_conn_reset);
+ rds_send_reset(conn);
+ conn->c_flags = 0;
+
+ /* Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS. */
+}
+
+/*
+ * There is only every one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int is_outgoing)
+{
+ struct rds_connection *conn, *parent = NULL;
+ struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+ struct rds_transport *loop_trans;
+ unsigned long flags;
+ int ret;
+ struct rds_transport *otrans = trans;
+
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ goto new_conn;
+ rcu_read_lock();
+ conn = rds_conn_lookup(head, laddr, faddr, trans);
+ if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
+ laddr == faddr && !is_outgoing) {
+ /* This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP. */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ rcu_read_unlock();
+ if (conn)
+ goto out;
+
+new_conn:
+ conn = kmem_cache_zalloc(rds_conn_slab, gfp);
+ if (!conn) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_laddr = laddr;
+ conn->c_faddr = faddr;
+ spin_lock_init(&conn->c_lock);
+ conn->c_next_tx_seq = 1;
+
+ init_waitqueue_head(&conn->c_waitq);
+ INIT_LIST_HEAD(&conn->c_send_queue);
+ INIT_LIST_HEAD(&conn->c_retrans);
+
+ ret = rds_cong_get_maps(conn);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+ * can bind to the destination address then we'd rather the messages
+ * flow through loopback rather than either transport.
+ */
+ loop_trans = rds_trans_get_preferred(faddr);
+ if (loop_trans) {
+ rds_trans_put(loop_trans);
+ conn->c_loopback = 1;
+ if (is_outgoing && trans->t_prefer_loopback) {
+ /* "outgoing" connection - and the transport
+ * says it wants the connection handled by the
+ * loopback transport. This is what TCP does.
+ */
+ trans = &rds_loop_transport;
+ }
+ }
+
+ conn->c_trans = trans;
+
+ ret = trans->conn_alloc(conn, gfp);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ atomic_set(&conn->c_state, RDS_CONN_DOWN);
+ conn->c_send_gen = 0;
+ conn->c_reconnect_jiffies = 0;
+ INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
+ INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
+ INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
+ INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
+ mutex_init(&conn->c_cm_lock);
+ conn->c_flags = 0;
+
+ rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
+ conn, &laddr, &faddr,
+ trans->t_name ? trans->t_name : "[unknown]",
+ is_outgoing ? "(outgoing)" : "");
+
+ /*
+ * Since we ran without holding the conn lock, someone could
+ * have created the same conn (either normal or passive) in the
+ * interim. We check while holding the lock. If we won, we complete
+ * init and return our conn. If we lost, we rollback and return the
+ * other one.
+ */
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ if (parent) {
+ /* Creating passive conn */
+ if (parent->c_passive) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = parent->c_passive;
+ } else {
+ parent->c_passive = conn;
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+ } else {
+ /* Creating normal conn */
+ struct rds_connection *found;
+
+ if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+ found = NULL;
+ else
+ found = rds_conn_lookup(head, laddr, faddr, trans);
+ if (found) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = found;
+ } else {
+ if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+ (otrans->t_type != RDS_TRANS_TCP)) {
+ /* Only the active side should be added to
+ * reconnect list for TCP.
+ */
+ hlist_add_head_rcu(&conn->c_hash_node, head);
+ }
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+ }
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+out:
+ return conn;
+}
+
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_create);
+
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
+
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+ * deadlocking with the CM handler. Instead, the CM event
+ * handler is supposed to check for state DISCONNECTING
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ wait_event(conn->c_waitq,
+ !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - eg when we're in the middle of tearing
+ * down the connection, and someone unloads the rds module.
+ * Quite reproduceable with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work_sync(&conn->c_conn_w);
+ rcu_read_lock();
+ if (!hlist_unhashed(&conn->c_hash_node)) {
+ rcu_read_unlock();
+ rds_queue_reconnect(conn);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances. It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
+void rds_conn_destroy(struct rds_connection *conn)
+{
+ struct rds_message *rm, *rtmp;
+ unsigned long flags;
+
+ rdsdebug("freeing conn %p for %pI4 -> "
+ "%pI4\n", conn, &conn->c_laddr,
+ &conn->c_faddr);
+
+ /* Ensure conn will not be scheduled for reconnect */
+ spin_lock_irq(&rds_conn_lock);
+ hlist_del_init_rcu(&conn->c_hash_node);
+ spin_unlock_irq(&rds_conn_lock);
+ synchronize_rcu();
+
+ /* shut the connection down */
+ rds_conn_drop(conn);
+ flush_work(&conn->c_down_w);
+
+ /* make sure lingering queued work won't try to ref the conn */
+ cancel_delayed_work_sync(&conn->c_send_w);
+ cancel_delayed_work_sync(&conn->c_recv_w);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &conn->c_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (conn->c_xmit_rm)
+ rds_message_put(conn->c_xmit_rm);
+
+ conn->c_trans->conn_free(conn->c_transport_data);
+
+ /*
+ * The congestion maps aren't freed up here. They're
+ * freed by rds_cong_exit() after all the connections
+ * have been freed.
+ */
+ rds_cong_remove_conn(conn);
+
+ BUG_ON(!list_empty(&conn->c_retrans));
+ kmem_cache_free(rds_conn_slab, conn);
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ rds_conn_count--;
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
+
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ struct hlist_head *head;
+ struct list_head *list;
+ struct rds_connection *conn;
+ struct rds_message *rm;
+ unsigned int total = 0;
+ unsigned long flags;
+ size_t i;
+
+ len /= sizeof(struct rds_info_message);
+
+ rcu_read_lock();
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ if (want_send)
+ list = &conn->c_send_queue;
+ else
+ list = &conn->c_retrans;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(&rm->m_inc, iter,
+ conn->c_laddr,
+ conn->c_faddr, 0);
+ }
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ }
+ }
+ rcu_read_unlock();
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 1);
+}
+
+static void rds_conn_message_info_retrans(struct socket *sock,
+ unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 0);
+}
+
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len)
+{
+ uint64_t buffer[(item_len + 7) / 8];
+ struct hlist_head *head;
+ struct rds_connection *conn;
+ size_t i;
+
+ rcu_read_lock();
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+
+ /* XXX no c_lock usage.. */
+ if (!visitor(conn, buffer))
+ continue;
+
+ /* We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer. */
+ if (len >= item_len) {
+ rds_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
+
+static int rds_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_connection *cinfo = buffer;
+
+ cinfo->next_tx_seq = conn->c_next_tx_seq;
+ cinfo->next_rx_seq = conn->c_next_rx_seq;
+ cinfo->laddr = conn->c_laddr;
+ cinfo->faddr = conn->c_faddr;
+ strncpy(cinfo->transport, conn->c_trans->t_name,
+ sizeof(cinfo->transport));
+ cinfo->flags = 0;
+
+ rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+ SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+ CONNECTING);
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_UP,
+ CONNECTED);
+ return 1;
+}
+
+static void rds_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_conn_info_visitor,
+ sizeof(struct rds_info_connection));
+}
+
+int rds_conn_init(void)
+{
+ rds_conn_slab = kmem_cache_create("rds_connection",
+ sizeof(struct rds_connection),
+ 0, 0, NULL);
+ if (!rds_conn_slab)
+ return -ENOMEM;
+
+ rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+
+ return 0;
+}
+
+void rds_conn_exit(void)
+{
+ rds_loop_exit();
+
+ WARN_ON(!hlist_empty(rds_conn_hash));
+
+ kmem_cache_destroy(rds_conn_slab);
+
+ rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+}
+
+/*
+ * Force a disconnect
+ */
+void rds_conn_drop(struct rds_connection *conn)
+{
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
+}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
+
+/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+ if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+ !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+
+ rds_conn_drop(conn);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000..ba2dffeff
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
+module_param(fmr_message_size, int, 0444);
+MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
+
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
+struct list_head rds_ib_devices;
+
+/* NOTE: if also grabbing ibdev lock, grab this first */
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+LIST_HEAD(ib_nodev_conns);
+
+static void rds_ib_nodev_connect(void)
+{
+ struct rds_ib_connection *ic;
+
+ spin_lock(&ib_nodev_conns_lock);
+ list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+ rds_conn_connect_if_down(ic->conn);
+ spin_unlock(&ib_nodev_conns_lock);
+}
+
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_connection *ic;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+ list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+ rds_conn_drop(ic->conn);
+ spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
+ struct rds_ib_device *rds_ibdev = container_of(work,
+ struct rds_ib_device, free_work);
+
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+ if (rds_ibdev->mr)
+ ib_dereg_mr(rds_ibdev->mr);
+ if (rds_ibdev->pd)
+ ib_dealloc_pd(rds_ibdev->pd);
+
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+ BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+ if (atomic_dec_and_test(&rds_ibdev->refcount))
+ queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+static void rds_ib_add_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle IB (no iWARP) devices */
+ if (device->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+ ibdev_to_node(device));
+ if (!rds_ibdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_ibdev->spinlock);
+ atomic_set(&rds_ibdev->refcount, 1);
+ INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+
+ rds_ibdev->max_wrs = dev_attr->max_qp_wr;
+ rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+
+ rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
+ rds_ibdev->max_fmrs = dev_attr->max_fmr ?
+ min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
+ fmr_pool_size;
+
+ rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
+ rds_ibdev->dev = device;
+ rds_ibdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_ibdev->pd)) {
+ rds_ibdev->pd = NULL;
+ goto put_dev;
+ }
+
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr)) {
+ rds_ibdev->mr = NULL;
+ goto put_dev;
+ }
+
+ rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
+ if (IS_ERR(rds_ibdev->mr_pool)) {
+ rds_ibdev->mr_pool = NULL;
+ goto put_dev;
+ }
+
+ INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+
+ down_write(&rds_ib_devices_lock);
+ list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+ up_write(&rds_ib_devices_lock);
+ atomic_inc(&rds_ibdev->refcount);
+
+ ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+ atomic_inc(&rds_ibdev->refcount);
+
+ rds_ib_nodev_connect();
+
+put_dev:
+ rds_ib_dev_put(rds_ibdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+/*
+ * New connections use this to find the device to associate with the
+ * connection. It's not in the fast path so we're not concerned about the
+ * performance of the IB call. (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period. The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ rcu_read_lock();
+ rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+ if (rds_ibdev)
+ atomic_inc(&rds_ibdev->refcount);
+ rcu_read_unlock();
+ return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away. This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+ if (!rds_ibdev)
+ return;
+
+ rds_ib_dev_shutdown(rds_ibdev);
+
+ /* stop connection attempts from getting a reference to this device. */
+ ib_set_client_data(device, &rds_ib_client, NULL);
+
+ down_write(&rds_ib_devices_lock);
+ list_del_rcu(&rds_ibdev->list);
+ up_write(&rds_ib_devices_lock);
+
+ /*
+ * This synchronize rcu is waiting for readers of both the ib
+ * client data and the devices list to finish before we drop
+ * both of those references.
+ */
+ synchronize_rcu();
+ rds_ib_dev_put(rds_ibdev);
+ rds_ib_dev_put(rds_ibdev);
+}
+
+struct ib_client rds_ib_client = {
+ .name = "rds_ib",
+ .add = rds_ib_add_one,
+ .remove = rds_ib_remove_one
+};
+
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_ib_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_ib_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_ib_device *rds_ibdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_ibdev = ic->rds_ibdev;
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_ibdev->max_sge;
+ rds_ib_get_mr_info(rds_ibdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_ib_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_ib_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support iWARP devices unless we
+ check node_type. */
+ if (ret || !cm_id->device ||
+ cm_id->device->node_type != RDMA_NODE_IB_CA)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+static void rds_ib_unregister_client(void)
+{
+ ib_unregister_client(&rds_ib_client);
+ /* wait for rds_ib_dev_free() to complete */
+ flush_workqueue(rds_wq);
+}
+
+void rds_ib_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+ rds_ib_unregister_client();
+ rds_ib_destroy_nodev_conns();
+ rds_ib_sysctl_exit();
+ rds_ib_recv_exit();
+ rds_trans_unregister(&rds_ib_transport);
+}
+
+struct rds_transport rds_ib_transport = {
+ .laddr_check = rds_ib_laddr_check,
+ .xmit_complete = rds_ib_xmit_complete,
+ .xmit = rds_ib_xmit,
+ .xmit_rdma = rds_ib_xmit_rdma,
+ .xmit_atomic = rds_ib_xmit_atomic,
+ .recv = rds_ib_recv,
+ .conn_alloc = rds_ib_conn_alloc,
+ .conn_free = rds_ib_conn_free,
+ .conn_connect = rds_ib_conn_connect,
+ .conn_shutdown = rds_ib_conn_shutdown,
+ .inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_free = rds_ib_inc_free,
+ .cm_initiate_connect = rds_ib_cm_initiate_connect,
+ .cm_handle_connect = rds_ib_cm_handle_connect,
+ .cm_connect_complete = rds_ib_cm_connect_complete,
+ .stats_info_copy = rds_ib_stats_info_copy,
+ .exit = rds_ib_exit,
+ .get_mr = rds_ib_get_mr,
+ .sync_mr = rds_ib_sync_mr,
+ .free_mr = rds_ib_free_mr,
+ .flush_mrs = rds_ib_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "infiniband",
+ .t_type = RDS_TRANS_IB
+};
+
+int rds_ib_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_ib_devices);
+
+ ret = ib_register_client(&rds_ib_client);
+ if (ret)
+ goto out;
+
+ ret = rds_ib_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_ib_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_ib_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_ib_recv_exit();
+out_sysctl:
+ rds_ib_sysctl_exit();
+out_ibreg:
+ rds_ib_unregister_client();
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000..c36d71322
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,372 @@
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FMR_SIZE 256
+#define RDS_FMR_POOL_SIZE 8192
+
+#define RDS_IB_MAX_SGE 8
+#define RDS_IB_RECV_SGE 2
+
+#define RDS_IB_DEFAULT_RECV_WR 1024
+#define RDS_IB_DEFAULT_SEND_WR 256
+
+#define RDS_IB_DEFAULT_RETRY_COUNT 2
+
+#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+#define RDS_IB_RECYCLE_BATCH_COUNT 32
+
+extern struct rw_semaphore rds_ib_devices_lock;
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up both the device and
+ * socket receive queues.
+ */
+struct rds_page_frag {
+ struct list_head f_item;
+ struct list_head f_cache_entry;
+ struct scatterlist f_sg;
+};
+
+struct rds_ib_incoming {
+ struct list_head ii_frags;
+ struct list_head ii_cache_entry;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_ib_cache_head {
+ struct list_head *first;
+ unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+ struct rds_ib_cache_head __percpu *percpu;
+ struct list_head *xfer;
+ struct list_head *ready;
+};
+
+struct rds_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_ib_send_work {
+ void *s_op;
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IB_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_ib_recv_work {
+ struct rds_ib_incoming *r_ibinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
+struct rds_ib_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_ib_device;
+
+struct rds_ib_connection {
+
+ struct list_head ib_node;
+ struct rds_ib_device *rds_ibdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_ib_work_ring i_send_ring;
+ struct rm_data_op *i_data_op;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_ib_send_work *i_sends;
+ atomic_t i_signaled_sends;
+
+ /* rx */
+ struct tasklet_struct i_recv_tasklet;
+ struct mutex i_recv_mutex;
+ struct rds_ib_work_ring i_recv_ring;
+ struct rds_ib_incoming *i_ibinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_ib_recv_work *i_recvs;
+ u64 i_ack_recv; /* last ACK received */
+ struct rds_ib_refill_cache i_cache_incs;
+ struct rds_ib_refill_cache i_cache_frags;
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t i_ack_next; /* next ACK to send */
+#else
+ spinlock_t i_ack_lock; /* protect i_ack_next */
+ u64 i_ack_next; /* next ACK to send */
+#endif
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair variables that we need to access
+ * atomically - one for the send credits, and one posted
+ * recv credits we need to transfer to remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
+
+struct rds_ib_ipaddr {
+ struct list_head list;
+ __be32 ipaddr;
+};
+
+struct rds_ib_device {
+ struct list_head list;
+ struct list_head ipaddr_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_ib_mr_pool *mr_pool;
+ unsigned int fmr_max_remaps;
+ unsigned int max_fmrs;
+ int max_sge;
+ unsigned int max_wrs;
+ unsigned int max_initiator_depth;
+ unsigned int max_responder_resources;
+ spinlock_t spinlock; /* protect the above */
+ atomic_t refcount;
+ struct work_struct free_work;
+};
+
+#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID (~(u64) 0)
+
+struct rds_ib_statistics {
+ uint64_t s_ib_connect_raced;
+ uint64_t s_ib_listen_closed_stale;
+ uint64_t s_ib_tx_cq_call;
+ uint64_t s_ib_tx_cq_event;
+ uint64_t s_ib_tx_ring_full;
+ uint64_t s_ib_tx_throttle;
+ uint64_t s_ib_tx_sg_mapping_failure;
+ uint64_t s_ib_tx_stalled;
+ uint64_t s_ib_tx_credit_updates;
+ uint64_t s_ib_rx_cq_call;
+ uint64_t s_ib_rx_cq_event;
+ uint64_t s_ib_rx_ring_empty;
+ uint64_t s_ib_rx_refill_from_cq;
+ uint64_t s_ib_rx_refill_from_thread;
+ uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_credit_updates;
+ uint64_t s_ib_ack_sent;
+ uint64_t s_ib_ack_send_failure;
+ uint64_t s_ib_ack_send_delayed;
+ uint64_t s_ib_ack_send_piggybacked;
+ uint64_t s_ib_ack_received;
+ uint64_t s_ib_rdma_mr_alloc;
+ uint64_t s_ib_rdma_mr_free;
+ uint64_t s_ib_rdma_mr_used;
+ uint64_t s_ib_rdma_mr_pool_flush;
+ uint64_t s_ib_rdma_mr_pool_wait;
+ uint64_t s_ib_rdma_mr_pool_depleted;
+ uint64_t s_ib_atomic_cswp;
+ uint64_t s_ib_atomic_fadd;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_connect(struct rds_connection *conn);
+void rds_ib_conn_shutdown(struct rds_connection *conn);
+void rds_ib_state_change(struct sock *sk);
+int rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_ib_conn_error(conn, fmt...) \
+ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_destroy_nodev_conns(void);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_tasklet_fn(unsigned long data);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
+void rds_ib_xmit_complete(struct rds_connection *conn);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000..8a09ee7db
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,838 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+ [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+ RDS_IB_EVENT_STRING(CQ_ERR),
+ RDS_IB_EVENT_STRING(QP_FATAL),
+ RDS_IB_EVENT_STRING(QP_REQ_ERR),
+ RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+ RDS_IB_EVENT_STRING(COMM_EST),
+ RDS_IB_EVENT_STRING(SQ_DRAINED),
+ RDS_IB_EVENT_STRING(PATH_MIG),
+ RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+ RDS_IB_EVENT_STRING(DEVICE_FATAL),
+ RDS_IB_EVENT_STRING(PORT_ACTIVE),
+ RDS_IB_EVENT_STRING(PORT_ERR),
+ RDS_IB_EVENT_STRING(LID_CHANGE),
+ RDS_IB_EVENT_STRING(PKEY_CHANGE),
+ RDS_IB_EVENT_STRING(SM_CHANGE),
+ RDS_IB_EVENT_STRING(SRQ_ERR),
+ RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+ RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+ RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+ return rds_str_array(rds_ib_event_type_strings,
+ ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (rds_ib_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_ib_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack spacv
+ * by allocation this twice.
+ */
+static void
+rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
+{
+ int ret;
+
+ attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+ ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+ if (ret)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_ib_connect_private *dp = NULL;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_qp_attr qp_attr;
+ int err;
+
+ if (event->param.conn.private_data_len >= sizeof(*dp)) {
+ dp = event->param.conn.private_data;
+
+ /* make sure it isn't empty data */
+ if (dp->dp_protocol_major) {
+ rds_ib_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+ }
+
+ if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+ " no longer supported\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
+ rds_conn_destroy(conn);
+ return;
+ } else {
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+ }
+
+ /*
+ * Init rings and fill recv. this needs to wait until protocol negotiation
+ * is complete, since ring layout is different from 3.0 to 3.1.
+ */
+ rds_ib_send_init_ring(ic);
+ rds_ib_recv_init_ring(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, 1);
+
+ /* Tune RNR behavior */
+ rds_ib_tune_rnr(ic, &qp_attr);
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ if (err)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
+
+ /* update ib_device with this local ipaddr */
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+ if (err)
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+ err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp) {
+ /* dp structure start is not guaranteed to be 8 bytes aligned.
+ * Since dp_ack_seq is 64-bit extended load operations can be
+ * used so go through get_unaligned to avoid unaligned errors.
+ */
+ __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+
+ if (dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+ NULL);
+ }
+
+ rds_connect_complete(conn);
+}
+
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_ib_connect_private *dp,
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+
+ conn_param->responder_resources =
+ min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+ conn_param->initiator_depth =
+ min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
+ conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+ conn_param->rnr_retry_count = 7;
+
+ if (dp) {
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u (%s) data %p\n",
+ event->event, rds_ib_event_str(event->event), data);
+}
+
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+ rds_ib_event_str(event->event));
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ default:
+ rdsdebug("Fatal QP Event %u (%s) "
+ "- connection %pI4->%pI4, reconnecting\n",
+ event->event, rds_ib_event_str(event->event),
+ &conn->c_laddr, &conn->c_faddr);
+ rds_conn_drop(conn);
+ break;
+ }
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_ib_device *rds_ibdev;
+ int ret;
+
+ /*
+ * It's normal to see a null device if an incoming connection races
+ * with device removal, so we don't print a warning.
+ */
+ rds_ibdev = rds_ib_get_client_data(dev);
+ if (!rds_ibdev)
+ return -EOPNOTSUPP;
+
+ /* add the conn now so that connection establishment has the dev */
+ rds_ib_add_conn(rds_ibdev, conn);
+
+ if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+ if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_ibdev->pd;
+ ic->i_mr = rds_ibdev->mr;
+
+ ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_send_ring.w_nr + 1, 0);
+ if (IS_ERR(ic->i_send_cq)) {
+ ret = PTR_ERR(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_recv_ring.w_nr, 0);
+ if (IS_ERR(ic->i_recv_cq)) {
+ ret = PTR_ERR(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+ rdsdebug("ib_create_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ /* XXX negotiate max send/recv with remote? */
+ memset(&attr, 0, sizeof(attr));
+ attr.event_handler = rds_ib_qp_event_handler;
+ attr.qp_context = conn;
+ /* + 1 to allow for the single ack message */
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+ attr.cap.max_send_sge = rds_ibdev->max_sge;
+ attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = ic->i_send_cq;
+ attr.recv_cq = ic->i_recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (!ic->i_send_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (!ic->i_recv_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (!ic->i_ack) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+ ibdev_to_node(dev));
+ if (!ic->i_sends) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+
+ ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+ ibdev_to_node(dev));
+ if (!ic->i_recvs) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_ib_recv_init_ack(ic);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ rds_ib_dev_put(rds_ibdev);
+ return ret;
+}
+
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+{
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+ u16 common;
+ u32 version = 0;
+
+ /*
+ * rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without telling us the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-(
+ */
+
+ /* Be paranoid. RDS always has privdata */
+ if (!event->param.conn.private_data_len) {
+ printk(KERN_NOTICE "RDS incoming connection has no private data, "
+ "rejecting\n");
+ return 0;
+ }
+
+ /* Even if len is crap *now* I still want to check it. -ASG */
+ if (event->param.conn.private_data_len < sizeof (*dp) ||
+ dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ return version;
+}
+
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+ __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+ struct rds_ib_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_ib_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ u32 version;
+ int err = 1, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_ib_protocol_compatible(event);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
+ "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+ (unsigned long long)be64_to_cpu(lguid),
+ (unsigned long long)be64_to_cpu(fguid));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exist, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_ib_stats_inc(s_ib_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_ib_stats_inc(s_ib_connect_raced);
+ }
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_ib_set_protocol(conn, version);
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ /* We got halfway through setting up the ib_connection, if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_ib_setup_qp(conn);
+ if (err) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ if (err)
+ rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+
+out:
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err)
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_ib_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_ib_setup_qp(conn);
+ if (ret) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+ UINT_MAX, UINT_MAX);
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_ib_conn_connect(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what affect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_ib_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int err = 0;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("failed to disconnect, cm: %p err %d\n",
+ ic->i_cm_id, err);
+ }
+
+ /*
+ * We want to wait for tx and rx completion to finish
+ * before we tear down the connection, but we have to be
+ * careful not to get stuck waiting on a send ring that
+ * only has unsignaled sends in it. We've shutdown new
+ * sends before getting here so by waiting for signaled
+ * sends to complete we're ensured that there will be no
+ * more tx processing.
+ */
+ wait_event(rds_ib_ring_empty_wait,
+ rds_ib_ring_empty(&ic->i_recv_ring) &&
+ (atomic_read(&ic->i_signaled_sends) == 0));
+ tasklet_kill(&ic->i_recv_tasklet);
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_ib_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_ib_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+ rdma_destroy_id(ic->i_cm_id);
+
+ /*
+ * Move connection back to the nodev list.
+ */
+ if (ic->rds_ibdev)
+ rds_ib_remove_conn(ic->rds_ibdev, conn);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_ibdev);
+
+ /* Clear pending transmit */
+ if (ic->i_data_op) {
+ struct rds_message *rm;
+
+ rm = container_of(ic->i_data_op, struct rds_message, data);
+ rds_message_put(rm);
+ ic->i_data_op = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_set(&ic->i_ack_next, 0);
+#else
+ ic->i_ack_next = 0;
+#endif
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ if (ic->i_ibinc) {
+ rds_inc_put(&ic->i_ibinc->ii_inc);
+ ic->i_ibinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+}
+
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_ib_connection *ic;
+ unsigned long flags;
+ int ret;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
+ if (!ic)
+ return -ENOMEM;
+
+ ret = rds_ib_recv_alloc_caches(ic);
+ if (ret) {
+ kfree(ic);
+ return ret;
+ }
+
+ INIT_LIST_HEAD(&ic->ib_node);
+ tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
+ (unsigned long) ic);
+ mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&ic->i_ack_lock);
+#endif
+ atomic_set(&ic->i_signaled_sends, 0);
+
+ /*
+ * rds_ib_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&ib_nodev_conns_lock, flags);
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+ spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
+
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void rds_ib_conn_free(void *arg)
+{
+ struct rds_ib_connection *ic = arg;
+ spinlock_t *lock_ptr;
+
+ rdsdebug("ic %p\n", ic);
+
+ /*
+ * Conn is either on a dev's list or on the nodev list.
+ * A race with shutdown() or connect() would cause problems
+ * (since rds_ibdev would change) but that should never happen.
+ */
+ lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
+
+ spin_lock_irq(lock_ptr);
+ list_del(&ic->ib_node);
+ spin_unlock_irq(lock_ptr);
+
+ rds_ib_recv_free_caches(ic);
+
+ kfree(ic);
+}
+
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000..273b8bff6
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/llist.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_ib_mr {
+ struct rds_ib_device *device;
+ struct rds_ib_mr_pool *pool;
+ struct ib_fmr *fmr;
+
+ struct llist_node llnode;
+
+ /* unmap_list is for freeing */
+ struct list_head unmap_list;
+ unsigned int remap_count;
+
+ struct scatterlist *sg;
+ unsigned int sg_len;
+ u64 *dma;
+ int sg_dma_len;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rds_ib_mr_pool {
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct delayed_work flush_worker; /* flush worker */
+
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # dirty of MRs */
+
+ struct llist_head drop_list; /* MRs that have reached their max_maps limit */
+ struct llist_head free_list; /* unused MRs */
+ struct llist_head clean_list; /* global unused & unamapped MRs */
+ wait_queue_head_t flush_wait;
+
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ struct ib_fmr_attr fmr_attr;
+};
+
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
+
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+ list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ atomic_inc(&rds_ibdev->refcount);
+ rcu_read_unlock();
+ return rds_ibdev;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
+ if (!i_ipaddr)
+ return -ENOMEM;
+
+ i_ipaddr->ipaddr = ipaddr;
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ return 0;
+}
+
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr;
+ struct rds_ib_ipaddr *to_free = NULL;
+
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ list_del_rcu(&i_ipaddr->list);
+ to_free = i_ipaddr;
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ if (to_free) {
+ synchronize_rcu();
+ kfree(to_free);
+ }
+}
+
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev_old;
+
+ rds_ibdev_old = rds_ib_get_device(ipaddr);
+ if (rds_ibdev_old) {
+ rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+ rds_ib_dev_put(rds_ibdev_old);
+ }
+
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+}
+
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ BUG_ON(list_empty(&ib_nodev_conns));
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+
+ spin_lock(&rds_ibdev->spinlock);
+ list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+ spin_unlock(&rds_ibdev->spinlock);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ ic->rds_ibdev = rds_ibdev;
+ atomic_inc(&rds_ibdev->refcount);
+}
+
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* place conn on nodev_conns_list */
+ spin_lock(&ib_nodev_conns_lock);
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+ spin_unlock(&ib_nodev_conns_lock);
+
+ ic->rds_ibdev = NULL;
+ rds_ib_dev_put(rds_ibdev);
+}
+
+void rds_ib_destroy_nodev_conns(void)
+{
+ struct rds_ib_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_splice(&ib_nodev_conns, &tmp_list);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+ rds_conn_destroy(ic->conn);
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ init_llist_head(&pool->free_list);
+ init_llist_head(&pool->drop_list);
+ init_llist_head(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ init_waitqueue_head(&pool->flush_wait);
+ INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+ pool->fmr_attr.max_pages = fmr_message_size;
+ pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+ pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed more than max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
+ pool->max_items = rds_ibdev->max_fmrs;
+
+ return pool;
+}
+
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+}
+
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ cancel_delayed_work_sync(&pool->flush_worker);
+ rds_ib_flush_mr_pool(pool, 1, NULL);
+ WARN_ON(atomic_read(&pool->item_count));
+ WARN_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct llist_node *ret;
+ unsigned long *flag;
+
+ preempt_disable();
+ flag = this_cpu_ptr(&clean_list_grace);
+ set_bit(CLEAN_LIST_BUSY_BIT, flag);
+ ret = llist_del_first(&pool->clean_list);
+ if (ret)
+ ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+
+ clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+ preempt_enable();
+ return ibmr;
+}
+
+static inline void wait_clean_list_grace(void)
+{
+ int cpu;
+ unsigned long *flag;
+
+ for_each_online_cpu(cpu) {
+ flag = &per_cpu(clean_list_grace, cpu);
+ while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+ cpu_relax();
+ }
+}
+
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ schedule_delayed_work(&pool->flush_worker, 10);
+
+ while (1) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flush any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+ rds_ib_flush_mr_pool(pool, 0, &ibmr);
+ if (ibmr)
+ return ibmr;
+ }
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ memset(ibmr, 0, sizeof(*ibmr));
+
+ ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_ATOMIC),
+ &pool->fmr_attr);
+ if (IS_ERR(ibmr->fmr)) {
+ err = PTR_ERR(ibmr->fmr);
+ ibmr->fmr = NULL;
+ printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
+ goto out_no_cigar;
+ }
+
+ rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (ibmr->fmr)
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents,
+ DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> PAGE_SHIFT;
+ if (page_cnt > fmr_message_size)
+ return -EINVAL;
+
+ dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+ rdsibdev_to_node(rds_ibdev));
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
+ dma_pages[page_cnt++] =
+ (dma_addr & PAGE_MASK) + j;
+ }
+
+ ret = ib_map_phys_fmr(ibmr->fmr,
+ dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping. */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ if (ibmr->sg_dma_len) {
+ ib_dma_unmap_sg(rds_ibdev->dev,
+ ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ }
+
+ /* Release the s/g list */
+ if (ibmr->sg_len) {
+ unsigned int i;
+
+ for (i = 0; i < ibmr->sg_len; ++i) {
+ struct page *page = sg_page(&ibmr->sg[i]);
+
+ /* FIXME we need a way to tell a r/w MR
+ * from a r/o MR */
+ BUG_ON(irqs_disabled());
+ set_page_dirty(page);
+ put_page(page);
+ }
+ kfree(ibmr->sg);
+
+ ibmr->sg = NULL;
+ ibmr->sg_len = 0;
+ }
+}
+
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ unsigned int pinned = ibmr->sg_len;
+
+ __rds_ib_teardown_mr(ibmr);
+ if (pinned) {
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ atomic_sub(pinned, &pool->free_pinned);
+ }
+}
+
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
+/*
+ * given an llist of mrs, put them all into the list_head for more processing
+ */
+static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
+{
+ struct rds_ib_mr *ibmr;
+ struct llist_node *node;
+ struct llist_node *next;
+
+ node = llist_del_all(llist);
+ while (node) {
+ next = node->next;
+ ibmr = llist_entry(node, struct rds_ib_mr, llnode);
+ list_add_tail(&ibmr->unmap_list, list);
+ node = next;
+ }
+}
+
+/*
+ * this takes a list head of mrs and turns it into linked llist nodes
+ * of clusters. Each cluster has linked llist nodes of
+ * MR_CLUSTER_SIZE mrs that are ready for reuse.
+ */
+static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
+ struct list_head *list,
+ struct llist_node **nodes_head,
+ struct llist_node **nodes_tail)
+{
+ struct rds_ib_mr *ibmr;
+ struct llist_node *cur = NULL;
+ struct llist_node **next = nodes_head;
+
+ list_for_each_entry(ibmr, list, unmap_list) {
+ cur = &ibmr->llnode;
+ *next = cur;
+ next = &cur->next;
+ }
+ *next = NULL;
+ *nodes_tail = cur;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+ int free_all, struct rds_ib_mr **ibmr_ret)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct llist_node *clean_nodes;
+ struct llist_node *clean_tail;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+ unsigned long unpinned = 0;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ int ret = 0;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+ if (ibmr_ret) {
+ DEFINE_WAIT(wait);
+ while(!mutex_trylock(&pool->flush_lock)) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ finish_wait(&pool->flush_wait, &wait);
+ goto out_nolock;
+ }
+
+ prepare_to_wait(&pool->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (llist_empty(&pool->clean_list))
+ schedule();
+
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ finish_wait(&pool->flush_wait, &wait);
+ goto out_nolock;
+ }
+ }
+ finish_wait(&pool->flush_wait, &wait);
+ } else
+ mutex_lock(&pool->flush_lock);
+
+ if (ibmr_ret) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ goto out;
+ }
+ }
+
+ /* Get the list of all MRs to be dropped. Ordering matters -
+ * we want to put drop_list ahead of free_list.
+ */
+ llist_append_to_list(&pool->drop_list, &unmap_list);
+ llist_append_to_list(&pool->free_list, &unmap_list);
+ if (free_all)
+ llist_append_to_list(&pool->clean_list, &unmap_list);
+
+ free_goal = rds_ib_flush_goal(pool, free_all);
+
+ if (list_empty(&unmap_list))
+ goto out;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, &unmap_list, unmap_list)
+ list_add(&ibmr->fmr->list, &fmr_list);
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+ unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
+ if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
+ rds_ib_stats_inc(s_ib_rdma_mr_free);
+ list_del(&ibmr->unmap_list);
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+ ncleaned++;
+ }
+
+ if (!list_empty(&unmap_list)) {
+ /* we have to make sure that none of the things we're about
+ * to put on the clean list would race with other cpus trying
+ * to pull items off. The llist would explode if we managed to
+ * remove something from the clean list and then add it back again
+ * while another CPU was spinning on that same item in llist_del_first.
+ *
+ * This is pretty unlikely, but just in case wait for an llist grace period
+ * here before adding anything back into the clean list.
+ */
+ wait_clean_list_grace();
+
+ list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+ if (ibmr_ret)
+ *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
+
+ /* more than one entry in llist nodes */
+ if (clean_nodes->next)
+ llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
+
+ }
+
+ atomic_sub(unpinned, &pool->free_pinned);
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+out:
+ mutex_unlock(&pool->flush_lock);
+ if (waitqueue_active(&pool->flush_wait))
+ wake_up(&pool->flush_wait);
+out_nolock:
+ return ret;
+}
+
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+
+ rds_ib_flush_mr_pool(pool, 0, NULL);
+}
+
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+ /* Return it to the pool's free list */
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ schedule_delayed_work(&pool->flush_worker, 10);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_ib_flush_mr_pool(pool, 0, NULL);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time. */
+ schedule_delayed_work(&pool->flush_worker, 10);
+ }
+ }
+
+ rds_ib_dev_put(rds_ibdev);
+}
+
+void rds_ib_flush_mrs(void)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ down_read(&rds_ib_devices_lock);
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ if (pool)
+ rds_ib_flush_mr_pool(pool, 0, NULL);
+ }
+ up_read(&rds_ib_devices_lock);
+}
+
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_mr *ibmr = NULL;
+ int ret;
+
+ rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+ if (!rds_ibdev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_ibdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->fmr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+
+ ibmr->device = rds_ibdev;
+ rds_ibdev = NULL;
+
+ out:
+ if (ret) {
+ if (ibmr)
+ rds_ib_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ if (rds_ibdev)
+ rds_ib_dev_put(rds_ibdev);
+ return ibmr;
+}
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000..1b981a4e4
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static struct kmem_cache *rds_ib_incoming_slab;
+static struct kmem_cache *rds_ib_frag_slab;
+static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_recv_work *recv;
+ u32 i;
+
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_ibinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+ sge = &recv->r_sge[0];
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = &recv->r_sge[1];
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = ic->i_mr->lkey;
+ }
+}
+
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+ struct list_head *to)
+{
+ struct list_head *from_last = from->prev;
+
+ list_splice_tail(from_last, to);
+ list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+ struct list_head *tmp;
+
+ tmp = xchg(&cache->xfer, NULL);
+ if (tmp) {
+ if (cache->ready)
+ list_splice_entire_tail(tmp, cache->ready);
+ else
+ cache->ready = tmp;
+ }
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+ struct rds_ib_cache_head *head;
+ int cpu;
+
+ cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+ if (!cache->percpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(cache->percpu, cpu);
+ head->first = NULL;
+ head->count = 0;
+ }
+ cache->xfer = NULL;
+ cache->ready = NULL;
+
+ return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+ int ret;
+
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+ if (!ret) {
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+ if (ret)
+ free_percpu(ic->i_cache_incs.percpu);
+ }
+
+ return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+ struct list_head *caller_list)
+{
+ struct rds_ib_cache_head *head;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(cache->percpu, cpu);
+ if (head->first) {
+ list_splice_entire_tail(head->first, caller_list);
+ head->first = NULL;
+ }
+ }
+
+ if (cache->ready) {
+ list_splice_entire_tail(cache->ready, caller_list);
+ cache->ready = NULL;
+ }
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+ struct rds_ib_incoming *inc;
+ struct rds_ib_incoming *inc_tmp;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *frag_tmp;
+ LIST_HEAD(list);
+
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+ free_percpu(ic->i_cache_incs.percpu);
+
+ list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+ list_del(&inc->ii_cache_entry);
+ WARN_ON(!list_empty(&inc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, inc);
+ }
+
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+ free_percpu(ic->i_cache_frags.percpu);
+
+ list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+ list_del(&frag->f_cache_entry);
+ WARN_ON(!list_empty(&frag->f_item));
+ kmem_cache_free(rds_ib_frag_slab, frag);
+ }
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+ struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+ rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+ struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+ /* Free attached frags */
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_free(ic, frag);
+ }
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ if (recv->r_ibinc) {
+ rds_inc_put(&recv->r_ibinc->ii_inc);
+ recv->r_ibinc = NULL;
+ }
+ if (recv->r_frag) {
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
+}
+
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+ gfp_t slab_mask)
+{
+ struct rds_ib_incoming *ibinc;
+ struct list_head *cache_item;
+ int avail_allocs;
+
+ cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+ if (cache_item) {
+ ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+ } else {
+ avail_allocs = atomic_add_unless(&rds_ib_allocation,
+ 1, rds_ib_sysctl_max_recv_allocation);
+ if (!avail_allocs) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ return NULL;
+ }
+ ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+ if (!ibinc) {
+ atomic_dec(&rds_ib_allocation);
+ return NULL;
+ }
+ }
+ INIT_LIST_HEAD(&ibinc->ii_frags);
+ rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+
+ return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+ gfp_t slab_mask, gfp_t page_mask)
+{
+ struct rds_page_frag *frag;
+ struct list_head *cache_item;
+ int ret;
+
+ cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+ if (cache_item) {
+ frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+ } else {
+ frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+ if (!frag)
+ return NULL;
+
+ sg_init_table(&frag->f_sg, 1);
+ ret = rds_page_remainder_alloc(&frag->f_sg,
+ RDS_FRAG_SIZE, page_mask);
+ if (ret) {
+ kmem_cache_free(rds_ib_frag_slab, frag);
+ return NULL;
+ }
+ }
+
+ INIT_LIST_HEAD(&frag->f_item);
+
+ return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, int prefill)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+ gfp_t slab_mask = GFP_NOWAIT;
+ gfp_t page_mask = GFP_NOWAIT;
+
+ if (prefill) {
+ slab_mask = GFP_KERNEL;
+ page_mask = GFP_HIGHUSER;
+ }
+
+ if (!ic->i_cache_incs.ready)
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+ if (!ic->i_cache_frags.ready)
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
+ /*
+ * ibinc was taken from recv if recv contained the start of a message.
+ * recvs that were continuations will still have this allocated.
+ */
+ if (!recv->r_ibinc) {
+ recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+ if (!recv->r_ibinc)
+ goto out;
+ }
+
+ WARN_ON(recv->r_frag); /* leak! */
+ recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+ if (!recv->r_frag)
+ goto out;
+
+ ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+ 1, DMA_FROM_DEVICE);
+ WARN_ON(ret != 1);
+
+ sge = &recv->r_sge[0];
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
+ sge = &recv->r_sge[1];
+ sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
+ sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+ while ((prefill || rds_conn_up(conn)) &&
+ rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_ib_recv_refill_one(conn, recv, prefill);
+ if (ret) {
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+ (long) ib_sg_dma_address(
+ ic->i_cm_id->device,
+ &recv->r_frag->f_sg),
+ ret);
+ if (ret) {
+ rds_ib_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_ib_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+}
+
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache)
+{
+ unsigned long flags;
+ struct list_head *old, *chpfirst;
+
+ local_irq_save(flags);
+
+ chpfirst = __this_cpu_read(cache->percpu->first);
+ if (!chpfirst)
+ INIT_LIST_HEAD(new_item);
+ else /* put on front */
+ list_add_tail(new_item, chpfirst);
+
+ __this_cpu_write(cache->percpu->first, new_item);
+ __this_cpu_inc(cache->percpu->count);
+
+ if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
+ goto end;
+
+ /*
+ * Return our per-cpu first list to the cache's xfer by atomically
+ * grabbing the current xfer list, appending it to our per-cpu list,
+ * and then atomically returning that entire list back to the
+ * cache's xfer list as long as it's still empty.
+ */
+ do {
+ old = xchg(&cache->xfer, NULL);
+ if (old)
+ list_splice_entire_tail(old, chpfirst);
+ old = cmpxchg(&cache->xfer, NULL, chpfirst);
+ } while (old);
+
+
+ __this_cpu_write(cache->percpu->first, NULL);
+ __this_cpu_write(cache->percpu->count, 0);
+end:
+ local_irq_restore(flags);
+}
+
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+{
+ struct list_head *head = cache->ready;
+
+ if (head) {
+ if (!list_empty(head)) {
+ cache->ready = head->next;
+ list_del_init(head);
+ } else
+ cache->ready = NULL;
+ }
+
+ return head;
+}
+
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (iov_iter_count(to) && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ to_copy = min_t(unsigned long, iov_iter_count(to),
+ RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ /* XXX needs + offset for multiple recvs per page */
+ rds_stats_add(s_copy_to_user, to_copy);
+ ret = copy_page_to_iter(sg_page(&frag->f_sg),
+ frag->f_sg.offset + frag_off,
+ to_copy,
+ to);
+ if (ret != to_copy)
+ return -EFAULT;
+
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IB_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by have a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+ int ack_required)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ ic->i_ack_next = seq;
+ if (ack_required)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ unsigned long flags;
+ u64 seq;
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ seq = ic->i_ack_next;
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+ return seq;
+}
+#else
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+ int ack_required)
+{
+ atomic64_set(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_atomic();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_atomic();
+
+ return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_ib_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_ib_stats_inc(s_ib_ack_send_failure);
+
+ rds_ib_conn_error(ic->conn, "sending ack failed\n");
+ } else
+ rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_ib_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_ib_stats_inc(s_ib_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_ib_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+ return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+ struct rds_ib_incoming *ibinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(sg_page(&frag->f_sg));
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, ie
+ * bits that changed from 0 to 1. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_ib_process_recv(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, u32 data_len,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_incoming *ibinc = ic->i_ibinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+ data_len);
+
+ if (data_len < sizeof(struct rds_header)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 didn't include a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ data_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+ /* This is an ACK-only packet. The fact that it gets
+ * special treatment here is that historically, ACKs
+ * were rather special beasts.
+ */
+ rds_ib_stats_inc(s_ib_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message.. copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (!ibinc) {
+ ibinc = recv->r_ibinc;
+ recv->r_ibinc = NULL;
+ ic->i_ibinc = ibinc;
+
+ hdr = &ibinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &ibinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence ||
+ hdr->h_len != ihdr->h_len ||
+ hdr->h_sport != ihdr->h_sport ||
+ hdr->h_dport != ihdr->h_dport) {
+ rds_ib_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_ibinc = NULL;
+
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_ib_cong_recv(conn, ibinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &ibinc->ii_inc, GFP_ATOMIC);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&ibinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_rx_cq_call);
+
+ tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+static inline void rds_poll_cq(struct rds_ib_connection *ic,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_connection *conn = ic->conn;
+ struct ib_wc wc;
+ struct rds_ib_recv_work *recv;
+
+ while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status,
+ rds_ib_wc_status_str(wc.status), wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, state);
+ } else {
+ /* We expect errors as the qp is drained during shutdown */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn))
+ rds_ib_conn_error(conn, "recv completion on %pI4 had "
+ "status %u (%s), disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status,
+ rds_ib_wc_status_str(wc.status));
+ }
+
+ /*
+ * It's very important that we only free this ring entry if we've truly
+ * freed the resources allocated to the entry. The refilling path can
+ * leak if we don't.
+ */
+ rds_ib_ring_free(&ic->i_recv_ring, 1);
+ }
+}
+
+void rds_ib_recv_tasklet_fn(unsigned long data)
+{
+ struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
+ struct rds_connection *conn = ic->conn;
+ struct rds_ib_ack_state state = { 0, };
+
+ rds_poll_cq(ic, &state);
+ ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ rds_poll_cq(ic, &state);
+
+ if (state.ack_next_valid)
+ rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_ib_ring_empty(&ic->i_recv_ring))
+ rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+ if (rds_ib_ring_low(&ic->i_recv_ring))
+ rds_ib_recv_refill(conn, 0);
+}
+
+int rds_ib_recv(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ return ret;
+}
+
+int rds_ib_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+ /* Default to 30% of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
+ sizeof(struct rds_ib_incoming),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!rds_ib_incoming_slab)
+ goto out;
+
+ rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+ sizeof(struct rds_page_frag),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!rds_ib_frag_slab)
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_ib_recv_exit(void)
+{
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000..ff97e8eda
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
+
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_ib_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) == 0;
+}
+
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_ib_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
+ if (__rds_ib_ring_empty(ring) &&
+ waitqueue_active(&rds_ib_ring_empty_wait))
+ wake_up(&rds_ib_ring_empty_wait);
+}
+
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_empty(ring);
+}
+
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
+}
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000..bd3825d38
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,1020 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+ [IB_WC_##foo] = __stringify(IB_WC_##foo)
+ RDS_IB_WC_STATUS_STR(SUCCESS),
+ RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+ RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+ RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+ RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+ RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+ RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+ RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+ RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+ RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+ RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+ RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+ RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+ RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+ RDS_IB_WC_STATUS_STR(FATAL_ERR),
+ RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+ RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+ return rds_str_array(rds_ib_wc_status_strings,
+ ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+ int wc_status,
+ void (*complete)(struct rds_message *rm, int status))
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+ struct rm_data_op *op,
+ int wc_status)
+{
+ if (op->op_nents)
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents,
+ DMA_TO_DEVICE);
+}
+
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+ struct rm_rdma_op *op,
+ int wc_status)
+{
+ if (op->op_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents,
+ op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->op_mapped = 0;
+ }
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * we would need to take an event for the rdma WR. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+ wc_status, rds_rdma_send_complete);
+
+ if (op->op_write)
+ rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
+}
+
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+ struct rm_atomic_op *op,
+ int wc_status)
+{
+ /* unmap atomic recvbuf */
+ if (op->op_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+ DMA_FROM_DEVICE);
+ op->op_mapped = 0;
+ }
+
+ rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+ wc_status, rds_atomic_send_complete);
+
+ if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+ rds_ib_stats_inc(s_ib_atomic_cswp);
+ else
+ rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = NULL;
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, data);
+ rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+ }
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, rdma);
+ rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+ }
+ break;
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, atomic);
+ rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+ }
+ break;
+ default:
+ printk_ratelimited(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
+ send->s_wr.opcode = 0xdead;
+
+ return rm;
+}
+
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_op = NULL;
+
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = &send->s_sge[0];
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ send->s_sge[1].lkey = ic->i_mr->lkey;
+ }
+}
+
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ if (send->s_op && send->s_wr.opcode != 0xdead)
+ rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+ }
+}
+
+/*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+ if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+ waitqueue_active(&rds_ib_ring_empty_wait))
+ wake_up(&rds_ib_ring_empty_wait);
+ BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_message *rm = NULL;
+ struct ib_wc wc;
+ struct rds_ib_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i = 0;
+ int ret;
+ int nr_sig = 0;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_ib_stats_inc(s_ib_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status,
+ rds_ib_wc_status_str(wc.status), wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_tx_cq_event);
+
+ if (wc.wr_id == RDS_IB_ACK_WR_ID) {
+ if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ rds_ib_stats_inc(s_ib_tx_stalled);
+ rds_ib_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
+
+ rm = rds_ib_send_unmap_op(ic, send, wc.status);
+
+ if (time_after(jiffies, send->s_queued + HZ/2))
+ rds_ib_stats_inc(s_ib_tx_stalled);
+
+ if (send->s_op) {
+ if (send->s_op == rm->m_final_op) {
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+ }
+ rds_message_put(rm);
+ send->s_op = NULL;
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_ib_ring_free(&ic->i_send_ring, completed);
+ rds_ib_sub_signaled(ic, nr_sig);
+ nr_sig = 0;
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_ib_conn_error(conn, "send completion on %pI4 had status "
+ "%u (%s), disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status,
+ rds_ib_wc_status_str(wc.status));
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overruning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("wanted=%u credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants
+ * the posted regardless of whether any send credits are
+ * available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, max_posted);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
+
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("credits=%u current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and has to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ bool notify)
+{
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0 || notify) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc = 0;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int bytes_sent = 0;
+ int ret;
+ int flow_controlled = 0;
+ int nr_sig = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* Do not send cong updates to IB loopback */
+ if (conn->c_loopback
+ && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+ scat = &rm->data.op_sg[sg];
+ ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
+ return sizeof(struct rds_header) + ret;
+ }
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (ic->i_flowctl) {
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled = 1;
+ }
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (!ic->i_data_op) {
+ if (rm->data.op_nents) {
+ rm->data.op_count = ib_dma_map_sg(dev,
+ rm->data.op_sg,
+ rm->data.op_nents,
+ DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+ if (rm->data.op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->data.op_count = 0;
+ }
+
+ rds_message_addref(rm);
+ ic->i_data_op = &rm->data;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->rdma.op_active) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_ib_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ if (ic->i_flowctl) {
+ rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ }
+ }
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->rdma.op_active && rm->rdma.op_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /* Each frag gets a header. Msgs may be 0 bytes */
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &ic->i_data_op->op_sg[sg];
+ i = 0;
+ do {
+ unsigned int len = 0;
+
+ /* Set up the header */
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 1;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ send->s_sge[0].addr = ic->i_send_hdrs_dma
+ + (pos * sizeof(struct rds_header));
+ send->s_sge[0].length = sizeof(struct rds_header);
+
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ /* Set up the data, if present */
+ if (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]) {
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ send->s_wr.num_sge = 2;
+
+ send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+ send->s_sge[1].length = len;
+
+ bytes_sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+ }
+
+ rds_ib_set_wr_signal_state(ic, send, 0);
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ if (ic->i_flowctl && adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_ib_stats_inc(s_ib_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ send = &ic->i_sends[pos];
+ i++;
+
+ } while (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]);
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ bytes_sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ prev->s_op = ic->i_data_op;
+ prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+ ic->i_data_op = NULL;
+ }
+
+ /* Put back wrs & credits we didn't use */
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_ib_send_add_credits(conn, credit_alloc - i);
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ if (prev->s_op) {
+ ic->i_data_op = prev->s_op;
+ prev->s_op = NULL;
+ }
+
+ rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+ goto out;
+ }
+
+ ret = bytes_sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
+
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
+ u32 pos;
+ u32 work_alloc;
+ int ret;
+ int nr_sig = 0;
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+ if (work_alloc != 1) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* address of send request in ring */
+ send = &ic->i_sends[pos];
+ send->s_queued = jiffies;
+
+ if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+ send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+ send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+ send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+ send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+ } else { /* FADD */
+ send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+ send->s_wr.wr.atomic.swap = 0;
+ send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+ send->s_wr.wr.atomic.swap_mask = 0;
+ }
+ nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+ send->s_wr.num_sge = 1;
+ send->s_wr.next = NULL;
+ send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+ send->s_wr.wr.atomic.rkey = op->op_rkey;
+ send->s_op = op;
+ rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+ /* map 8 byte retval buffer to the device */
+ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+ if (ret != 1) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ /* Convert our struct scatterlist to struct ib_sge */
+ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+ send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+ send->s_sge[0].lkey = ic->i_mr->lkey;
+
+ rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+ send->s_sge[0].addr, send->s_sge[0].length);
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ failed_wr = &send->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+ rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+ send, &send->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &send->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &send->s_wr)) {
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &send->s_wr);
+ }
+
+out:
+ return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->op_remote_addr;
+ u32 max_sge = ic->rds_ibdev->max_sge;
+ u32 pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+ int nr_sig = 0;
+
+ /* map the op the first time we see it */
+ if (!op->op_mapped) {
+ op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents, (op->op_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+ if (op->op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->op_mapped = 1;
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->op_count, max_sge);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &op->op_sg[0];
+ sent = 0;
+ num_sge = op->op_count;
+
+ for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+
+ send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->op_rkey;
+
+ if (num_sge > max_sge) {
+ send->s_wr.num_sge = max_sge;
+ num_sge -= max_sge;
+ } else {
+ send->s_wr.num_sge = num_sge;
+ }
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+ send->s_sge[j].addr =
+ ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = ic->i_mr->lkey;
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+ remote_addr += len;
+ scat++;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* give a reference to the last op */
+ if (scat == &op->op_sg[op->op_count]) {
+ prev->s_op = op;
+ rds_message_addref(container_of(op, struct rds_message, rdma));
+ }
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &first->s_wr)) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &first->s_wr);
+ }
+
+
+out:
+ return ret;
+}
+
+void rds_ib_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_ib_attempt_ack(ic);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000..2d5965d6e
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+
+static const char *const rds_ib_stat_names[] = {
+ "ib_connect_raced",
+ "ib_listen_closed_stale",
+ "ib_tx_cq_call",
+ "ib_tx_cq_event",
+ "ib_tx_ring_full",
+ "ib_tx_throttle",
+ "ib_tx_sg_mapping_failure",
+ "ib_tx_stalled",
+ "ib_tx_credit_updates",
+ "ib_rx_cq_call",
+ "ib_rx_cq_event",
+ "ib_rx_ring_empty",
+ "ib_rx_refill_from_cq",
+ "ib_rx_refill_from_thread",
+ "ib_rx_alloc_limit",
+ "ib_rx_credit_updates",
+ "ib_ack_sent",
+ "ib_ack_send_failure",
+ "ib_ack_send_delayed",
+ "ib_ack_send_piggybacked",
+ "ib_ack_received",
+ "ib_rdma_mr_alloc",
+ "ib_rdma_mr_free",
+ "ib_rdma_mr_used",
+ "ib_rdma_mr_pool_flush",
+ "ib_rdma_mr_pool_wait",
+ "ib_rdma_mr_pool_depleted",
+ "ib_atomic_cswp",
+ "ib_atomic_fadd",
+};
+
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_ib_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_ib_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
+ ARRAY_SIZE(rds_ib_stat_names));
+out:
+ return ARRAY_SIZE(rds_ib_stat_names);
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000..e4e41b3af
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+static struct ctl_table_header *rds_ib_sysctl_hdr;
+
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_ib_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
+
+static struct ctl_table rds_ib_sysctl_table[] = {
+ {
+ .procname = "max_send_wr",
+ .data = &rds_ib_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_recv_wr",
+ .data = &rds_ib_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_unsignaled_wr",
+ .data = &rds_ib_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
+ },
+ {
+ .procname = "max_recv_allocation",
+ .data = &rds_ib_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "flow_control",
+ .data = &rds_ib_sysctl_flow_control,
+ .maxlen = sizeof(rds_ib_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+void rds_ib_sysctl_exit(void)
+{
+ if (rds_ib_sysctl_hdr)
+ unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+int rds_ib_sysctl_init(void)
+{
+ rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
+ if (!rds_ib_sysctl_hdr)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000..9a6b4f661
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time. The structs are only copied if the user-specified
+ * buffer is big enough. The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+struct rds_info_iterator {
+ struct page **pages;
+ void *addr;
+ unsigned long offset;
+};
+
+static DEFINE_SPINLOCK(rds_info_lock);
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+void rds_info_register_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset]);
+ rds_info_funcs[offset] = func;
+ spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
+
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset] != func);
+ rds_info_funcs[offset] = NULL;
+ spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
+
+/*
+ * Typically we hold an atomic kmap across multiple rds_info_copy() calls
+ * because the kmap is so expensive. This must be called before using blocking
+ * operations while holding the mapping and as the iterator is torn down.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+ if (iter->addr) {
+ kunmap_atomic(iter->addr);
+ iter->addr = NULL;
+ }
+}
+
+/*
+ * get_user_pages() called flush_dcache_page() on the pages for us.
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes)
+{
+ unsigned long this;
+
+ while (bytes) {
+ if (!iter->addr)
+ iter->addr = kmap_atomic(*iter->pages);
+
+ this = min(bytes, PAGE_SIZE - iter->offset);
+
+ rdsdebug("page %p addr %p offset %lu this %lu data %p "
+ "bytes %lu\n", *iter->pages, iter->addr,
+ iter->offset, this, data, bytes);
+
+ memcpy(iter->addr + iter->offset, data, this);
+
+ data += this;
+ bytes -= this;
+ iter->offset += this;
+
+ if (iter->offset == PAGE_SIZE) {
+ kunmap_atomic(iter->addr);
+ iter->addr = NULL;
+ iter->offset = 0;
+ iter->pages++;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(rds_info_copy);
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace. @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct rds_info_iterator iter;
+ struct rds_info_lengths lens;
+ unsigned long nr_pages = 0;
+ unsigned long start;
+ unsigned long i;
+ rds_info_func func;
+ struct page **pages = NULL;
+ int ret;
+ int len;
+ int total;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* check for all kinds of wrapping and the like */
+ start = (unsigned long)optval;
+ if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* a 0 len call is just trying to probe its length */
+ if (len == 0)
+ goto call_func;
+
+ nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+ >> PAGE_SHIFT;
+
+ pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ if (ret != nr_pages) {
+ if (ret > 0)
+ nr_pages = ret;
+ else
+ nr_pages = 0;
+ ret = -EAGAIN; /* XXX ? */
+ goto out;
+ }
+
+ rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+ func = rds_info_funcs[optname - RDS_INFO_FIRST];
+ if (!func) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ iter.pages = pages;
+ iter.addr = NULL;
+ iter.offset = start & (PAGE_SIZE - 1);
+
+ func(sock, len, &iter, &lens);
+ BUG_ON(lens.each == 0);
+
+ total = lens.nr * lens.each;
+
+ rds_info_iter_unmap(&iter);
+
+ if (total > len) {
+ len = total;
+ ret = -ENOSPC;
+ } else {
+ len = total;
+ ret = lens.each;
+ }
+
+ if (put_user(len, optlen))
+ ret = -EFAULT;
+
+out:
+ for (i = 0; pages && i < nr_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+
+ return ret;
+}
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000..b6c052ca7
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+struct rds_info_lengths {
+ unsigned int nr;
+ unsigned int each;
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source. If the snapshot fits in @len then it
+ * should be copied using @iter. The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 000000000..589935661
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "rds.h"
+#include "iw.h"
+
+unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
+unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fastreg_pool_size, int, 0444);
+MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
+module_param(fastreg_message_size, int, 0444);
+MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
+
+struct list_head rds_iw_devices;
+
+/* NOTE: if also grabbing iwdev lock, grab this first */
+DEFINE_SPINLOCK(iw_nodev_conns_lock);
+LIST_HEAD(iw_nodev_conns);
+
+static void rds_iw_add_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle iwarp devices */
+ if (device->node_type != RDMA_NODE_RNIC)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
+ if (!rds_iwdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_iwdev->spinlock);
+
+ rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+ rds_iwdev->max_wrs = dev_attr->max_qp_wr;
+ rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+
+ rds_iwdev->dev = device;
+ rds_iwdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_iwdev->pd))
+ goto free_dev;
+
+ if (!rds_iwdev->dma_local_lkey) {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_iwdev->mr))
+ goto err_pd;
+ } else
+ rds_iwdev->mr = NULL;
+
+ rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
+ if (IS_ERR(rds_iwdev->mr_pool)) {
+ rds_iwdev->mr_pool = NULL;
+ goto err_mr;
+ }
+
+ INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
+ INIT_LIST_HEAD(&rds_iwdev->conn_list);
+ list_add_tail(&rds_iwdev->list, &rds_iw_devices);
+
+ ib_set_client_data(device, &rds_iw_client, rds_iwdev);
+
+ goto free_attr;
+
+err_mr:
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_iwdev->pd);
+free_dev:
+ kfree(rds_iwdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+static void rds_iw_remove_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_cm_id *i_cm_id, *next;
+
+ rds_iwdev = ib_get_client_data(device, &rds_iw_client);
+ if (!rds_iwdev)
+ return;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ rds_iw_destroy_conns(rds_iwdev);
+
+ if (rds_iwdev->mr_pool)
+ rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
+
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+
+ while (ib_dealloc_pd(rds_iwdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_iwdev->list);
+ kfree(rds_iwdev);
+}
+
+struct ib_client rds_iw_client = {
+ .name = "rds_iw",
+ .add = rds_iw_add_one,
+ .remove = rds_iw_remove_one
+};
+
+static int rds_iw_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_iw_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_iw_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_iw_device *rds_iwdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_iwdev->max_sge;
+ rds_iw_get_mr_info(rds_iwdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_iw_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_iw_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_iw_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support IB devices unless we
+ check node_type. */
+ if (ret || !cm_id->device ||
+ cm_id->device->node_type != RDMA_NODE_RNIC)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+void rds_iw_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+ rds_iw_destroy_nodev_conns();
+ ib_unregister_client(&rds_iw_client);
+ rds_iw_sysctl_exit();
+ rds_iw_recv_exit();
+ rds_trans_unregister(&rds_iw_transport);
+}
+
+struct rds_transport rds_iw_transport = {
+ .laddr_check = rds_iw_laddr_check,
+ .xmit_complete = rds_iw_xmit_complete,
+ .xmit = rds_iw_xmit,
+ .xmit_rdma = rds_iw_xmit_rdma,
+ .recv = rds_iw_recv,
+ .conn_alloc = rds_iw_conn_alloc,
+ .conn_free = rds_iw_conn_free,
+ .conn_connect = rds_iw_conn_connect,
+ .conn_shutdown = rds_iw_conn_shutdown,
+ .inc_copy_to_user = rds_iw_inc_copy_to_user,
+ .inc_free = rds_iw_inc_free,
+ .cm_initiate_connect = rds_iw_cm_initiate_connect,
+ .cm_handle_connect = rds_iw_cm_handle_connect,
+ .cm_connect_complete = rds_iw_cm_connect_complete,
+ .stats_info_copy = rds_iw_stats_info_copy,
+ .exit = rds_iw_exit,
+ .get_mr = rds_iw_get_mr,
+ .sync_mr = rds_iw_sync_mr,
+ .free_mr = rds_iw_free_mr,
+ .flush_mrs = rds_iw_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "iwarp",
+ .t_type = RDS_TRANS_IWARP,
+ .t_prefer_loopback = 1,
+};
+
+int rds_iw_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_iw_devices);
+
+ ret = ib_register_client(&rds_iw_client);
+ if (ret)
+ goto out;
+
+ ret = rds_iw_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_iw_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_iw_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_iw_recv_exit();
+out_sysctl:
+ rds_iw_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rds_iw_client);
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 000000000..cbe6674e3
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
+#ifndef _RDS_IW_H
+#define _RDS_IW_H
+
+#include <linux/interrupt.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FASTREG_SIZE 20
+#define RDS_FASTREG_POOL_SIZE 2048
+
+#define RDS_IW_MAX_SGE 8
+#define RDS_IW_RECV_SGE 2
+
+#define RDS_IW_DEFAULT_RECV_WR 1024
+#define RDS_IW_DEFAULT_SEND_WR 256
+
+#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+extern struct list_head rds_iw_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+ struct list_head f_item;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
+};
+
+struct rds_iw_incoming {
+ struct list_head ii_frags;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_iw_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_iw_scatterlist {
+ struct scatterlist *list;
+ unsigned int len;
+ int dma_len;
+ unsigned int dma_npages;
+ unsigned int bytes;
+};
+
+struct rds_iw_mapping {
+ spinlock_t m_lock; /* protect the mapping struct */
+ struct list_head m_list;
+ struct rds_iw_mr *m_mr;
+ uint32_t m_rkey;
+ struct rds_iw_scatterlist m_sg;
+};
+
+struct rds_iw_send_work {
+ struct rds_message *s_rm;
+
+ /* We should really put these into a union: */
+ struct rm_rdma_op *s_op;
+ struct rds_iw_mapping *s_mapping;
+ struct ib_mr *s_mr;
+ struct ib_fast_reg_page_list *s_page_list;
+ unsigned char s_remap_count;
+
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IW_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_iw_recv_work {
+ struct rds_iw_incoming *r_iwinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
+struct rds_iw_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_iw_device;
+
+struct rds_iw_connection {
+
+ struct list_head iw_node;
+ struct rds_iw_device *rds_iwdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_iw_work_ring i_send_ring;
+ struct rds_message *i_rm;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_iw_send_work *i_sends;
+
+ /* rx */
+ struct tasklet_struct i_recv_tasklet;
+ struct mutex i_recv_mutex;
+ struct rds_iw_work_ring i_recv_ring;
+ struct rds_iw_incoming *i_iwinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_iw_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
+ u64 i_ack_recv; /* last ACK received */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t i_ack_next; /* next ACK to send */
+#else
+ spinlock_t i_ack_lock; /* protect i_ack_next */
+ u64 i_ack_next; /* next ACK to send */
+#endif
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair variables that we need to access
+ * atomically - one for the send credits, and one posted
+ * recv credits we need to transfer to remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+ unsigned int i_dma_local_lkey:1;
+ unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
+
+struct rds_iw_cm_id {
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+};
+
+struct rds_iw_device {
+ struct list_head list;
+ struct list_head cm_id_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_iw_mr_pool *mr_pool;
+ int max_sge;
+ unsigned int max_wrs;
+ unsigned int dma_local_lkey:1;
+ spinlock_t spinlock; /* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
+#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
+#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
+
+struct rds_iw_statistics {
+ uint64_t s_iw_connect_raced;
+ uint64_t s_iw_listen_closed_stale;
+ uint64_t s_iw_tx_cq_call;
+ uint64_t s_iw_tx_cq_event;
+ uint64_t s_iw_tx_ring_full;
+ uint64_t s_iw_tx_throttle;
+ uint64_t s_iw_tx_sg_mapping_failure;
+ uint64_t s_iw_tx_stalled;
+ uint64_t s_iw_tx_credit_updates;
+ uint64_t s_iw_rx_cq_call;
+ uint64_t s_iw_rx_cq_event;
+ uint64_t s_iw_rx_ring_empty;
+ uint64_t s_iw_rx_refill_from_cq;
+ uint64_t s_iw_rx_refill_from_thread;
+ uint64_t s_iw_rx_alloc_limit;
+ uint64_t s_iw_rx_credit_updates;
+ uint64_t s_iw_ack_sent;
+ uint64_t s_iw_ack_send_failure;
+ uint64_t s_iw_ack_send_delayed;
+ uint64_t s_iw_ack_send_piggybacked;
+ uint64_t s_iw_ack_received;
+ uint64_t s_iw_rdma_mr_alloc;
+ uint64_t s_iw_rdma_mr_free;
+ uint64_t s_iw_rdma_mr_used;
+ uint64_t s_iw_rdma_mr_pool_flush;
+ uint64_t s_iw_rdma_mr_pool_wait;
+ uint64_t s_iw_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_iw_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
+
+static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
+
+static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
+{
+ return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
+}
+
+/* ib.c */
+extern struct rds_transport rds_iw_transport;
+extern struct ib_client rds_iw_client;
+
+extern unsigned int fastreg_pool_size;
+extern unsigned int fastreg_message_size;
+
+extern spinlock_t iw_nodev_conns_lock;
+extern struct list_head iw_nodev_conns;
+
+/* ib_cm.c */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_iw_conn_free(void *arg);
+int rds_iw_conn_connect(struct rds_connection *conn);
+void rds_iw_conn_shutdown(struct rds_connection *conn);
+void rds_iw_state_change(struct sock *sk);
+int rds_iw_listen_init(void);
+void rds_iw_listen_stop(void);
+void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_iw_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_iw_conn_error(conn, fmt...) \
+ __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
+
+/* ib_rdma.c */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_iw_destroy_nodev_conns(void)
+{
+ __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
+}
+static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
+{
+ __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
+}
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_iw_sync_mr(void *trans_private, int dir);
+void rds_iw_free_mr(void *trans_private, int invalidate);
+void rds_iw_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_iw_recv_init(void);
+void rds_iw_recv_exit(void);
+int rds_iw_recv(struct rds_connection *conn);
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_iw_inc_free(struct rds_incoming *inc);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_recv_tasklet_fn(unsigned long data);
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
+void rds_iw_attempt_ack(struct rds_iw_connection *ic);
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
+
+/* ib_ring.c */
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
+int rds_iw_ring_low(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_iw_ring_empty_wait;
+
+/* ib_send.c */
+void rds_iw_xmit_complete(struct rds_connection *conn);
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_send_init_ring(struct rds_iw_connection *ic);
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted, int max_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
+#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_iw_sysctl_init(void);
+void rds_iw_sysctl_exit(void);
+extern unsigned long rds_iw_sysctl_max_send_wr;
+extern unsigned long rds_iw_sysctl_max_recv_wr;
+extern unsigned long rds_iw_sysctl_max_unsig_wrs;
+extern unsigned long rds_iw_sysctl_max_unsig_bytes;
+extern unsigned long rds_iw_sysctl_max_recv_allocation;
+extern unsigned int rds_iw_sysctl_flow_control;
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
+
+#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 000000000..a6c2bea9f
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (rds_iw_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_iw_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = NULL;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ int err;
+
+ if (event->param.conn.private_data_len) {
+ dp = event->param.conn.private_data;
+
+ rds_iw_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
+ if (err)
+ printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
+ rds_iw_add_conn(rds_iwdev, conn);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ rds_connect_complete(conn);
+}
+
+static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_iw_connect_private *dp,
+ u32 protocol_version)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+
+ if (dp) {
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ case IB_EVENT_QP_REQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ default:
+ rdsdebug("Fatal QP Event %u "
+ "- connection %pI4->%pI4, reconnecting\n",
+ event->event, &conn->c_laddr,
+ &conn->c_faddr);
+ rds_conn_drop(conn);
+ break;
+ }
+}
+
+/*
+ * Create a QP
+ */
+static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
+ struct rds_iw_device *rds_iwdev,
+ struct rds_iw_work_ring *send_ring,
+ void (*send_cq_handler)(struct ib_cq *, void *),
+ struct rds_iw_work_ring *recv_ring,
+ void (*recv_cq_handler)(struct ib_cq *, void *),
+ void *context)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ unsigned int send_size, recv_size;
+ int ret;
+
+ /* The offset of 1 is to accommodate the additional ACK WR. */
+ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
+ recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
+ rds_iw_ring_resize(send_ring, send_size - 1);
+ rds_iw_ring_resize(recv_ring, recv_size - 1);
+
+ memset(attr, 0, sizeof(*attr));
+ attr->event_handler = rds_iw_qp_event_handler;
+ attr->qp_context = context;
+ attr->cap.max_send_wr = send_size;
+ attr->cap.max_recv_wr = recv_size;
+ attr->cap.max_send_sge = rds_iwdev->max_sge;
+ attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
+ attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr->qp_type = IB_QPT_RC;
+
+ attr->send_cq = ib_create_cq(dev, send_cq_handler,
+ rds_iw_cq_event_handler,
+ context, send_size, 0);
+ if (IS_ERR(attr->send_cq)) {
+ ret = PTR_ERR(attr->send_cq);
+ attr->send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
+ rds_iw_cq_event_handler,
+ context, recv_size, 0);
+ if (IS_ERR(attr->recv_cq)) {
+ ret = PTR_ERR(attr->recv_cq);
+ attr->recv_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (attr->send_cq)
+ ib_destroy_cq(attr->send_cq);
+ if (attr->recv_cq)
+ ib_destroy_cq(attr->recv_cq);
+ }
+ return ret;
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_iw_setup_qp(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_iw_device *rds_iwdev;
+ int ret;
+
+ /* rds_iw_add_one creates a rds_iw_device object per IB device,
+ * and allocates a protection domain, memory range and MR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_iwdev at all.
+ */
+ rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
+ if (!rds_iwdev) {
+ printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+ dev->name);
+ return -EOPNOTSUPP;
+ }
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_iwdev->pd;
+ ic->i_mr = rds_iwdev->mr;
+
+ ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
+ &ic->i_send_ring, rds_iw_send_cq_comp_handler,
+ &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
+ conn);
+ if (ret < 0)
+ goto out;
+
+ ic->i_send_cq = attr.send_cq;
+ ic->i_recv_cq = attr.recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (!ic->i_send_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (!ic->i_recv_hdrs) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (!ic->i_ack) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
+ if (!ic->i_sends) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+ rds_iw_send_init_ring(ic);
+
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
+ if (!ic->i_recvs) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_iw_recv_init_ring(ic);
+ rds_iw_recv_init_ack(ic);
+
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return ret;
+}
+
+static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
+{
+ u16 common;
+ u32 version = 0;
+
+ /* rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without telling us the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ }
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ return version;
+}
+
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = event->param.conn.private_data;
+ struct rds_iw_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_iw_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_device *rds_iwdev;
+ u32 version;
+ int err, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_iw_protocol_compatible(dp);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
+ &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exist, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_iw_stats_inc(s_iw_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_iw_stats_inc(s_iw_connect_raced);
+ }
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_iw_set_protocol(conn, version);
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ /* We got halfway through setting up the ib_connection, if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_iw_setup_qp(conn);
+ if (err) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
+ rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_iw_setup_qp(conn);
+ if (ret) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_iw_conn_connect(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what affect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ /* First, bind to the local address and device. */
+ ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
+ if (ret) {
+ rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
+ &conn->c_laddr, ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ goto out;
+ }
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int err = 0;
+ struct ib_qp_attr qp_attr;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("failed to disconnect, cm: %p err %d\n",
+ ic->i_cm_id, err);
+ }
+
+ if (ic->i_cm_id->qp) {
+ qp_attr.qp_state = IB_QPS_ERR;
+ ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ }
+
+ wait_event(rds_iw_ring_empty_wait,
+ rds_iw_ring_empty(&ic->i_send_ring) &&
+ rds_iw_ring_empty(&ic->i_recv_ring));
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_iw_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_iw_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+
+ /*
+ * If associated with an rds_iw_device:
+ * Move connection back to the nodev list.
+ * Remove cm_id from the device cm_id list.
+ */
+ if (ic->rds_iwdev)
+ rds_iw_remove_conn(ic->rds_iwdev, conn);
+
+ rdma_destroy_id(ic->i_cm_id);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_iwdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_set(&ic->i_ack_next, 0);
+#else
+ ic->i_ack_next = 0;
+#endif
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ if (ic->i_iwinc) {
+ rds_inc_put(&ic->i_iwinc->ii_inc);
+ ic->i_iwinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+ rdsdebug("shutdown complete\n");
+}
+
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_iw_connection *ic;
+ unsigned long flags;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
+ if (!ic)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ic->iw_node);
+ tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
+ (unsigned long) ic);
+ mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&ic->i_ack_lock);
+#endif
+
+ /*
+ * rds_iw_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+ spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void rds_iw_conn_free(void *arg)
+{
+ struct rds_iw_connection *ic = arg;
+ spinlock_t *lock_ptr;
+
+ rdsdebug("ic %p\n", ic);
+
+ /*
+ * Conn is either on a dev's list or on the nodev list.
+ * A race with shutdown() or connect() would cause problems
+ * (since rds_iwdev would change) but that should never happen.
+ */
+ lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
+
+ spin_lock_irq(lock_ptr);
+ list_del(&ic->iw_node);
+ spin_unlock_irq(lock_ptr);
+
+ kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 000000000..dba8d0864
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+ struct rds_iw_device *device;
+ struct rds_iw_mr_pool *pool;
+ struct rdma_cm_id *cm_id;
+
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *page_list;
+
+ struct rds_iw_mapping mapping;
+ unsigned char remap_count;
+};
+
+/*
+ * Our own little MR pool
+ */
+struct rds_iw_mr_pool {
+ struct rds_iw_device *device; /* back ptr to the device that owns us */
+
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct work_struct flush_worker; /* flush worker */
+
+ spinlock_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # dirty of MRs */
+ struct list_head dirty_list; /* dirty mappings */
+ struct list_head clean_list; /* unused & unamapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_message_size; /* in pages */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ int max_pages;
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list,
+ int *unpinned);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+
+static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
+ struct rds_iw_device **rds_iwdev,
+ struct rdma_cm_id **cm_id)
+{
+ struct rds_iw_device *iwdev;
+ struct rds_iw_cm_id *i_cm_id;
+
+ *rds_iwdev = NULL;
+ *cm_id = NULL;
+
+ list_for_each_entry(iwdev, &rds_iw_devices, list) {
+ spin_lock_irq(&iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+ struct sockaddr_in *src_addr, *dst_addr;
+
+ src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+ rdsdebug("local ipaddr = %x port %d, "
+ "remote ipaddr = %x port %d"
+ "..looking for %x port %d, "
+ "remote ipaddr = %x port %d\n",
+ src_addr->sin_addr.s_addr,
+ src_addr->sin_port,
+ dst_addr->sin_addr.s_addr,
+ dst_addr->sin_port,
+ src->sin_addr.s_addr,
+ src->sin_port,
+ dst->sin_addr.s_addr,
+ dst->sin_port);
+#ifdef WORKING_TUPLE_DETECTION
+ if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
+ src_addr->sin_port == src->sin_port &&
+ dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
+ dst_addr->sin_port == dst->sin_port) {
+#else
+ /* FIXME - needs to compare the local and remote
+ * ipaddr/port tuple, but the ipaddr is the only
+ * available information in the rds_sock (as the rest are
+ * zero'ed. It doesn't appear to be properly populated
+ * during connection setup...
+ */
+ if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
+#endif
+ spin_unlock_irq(&iwdev->spinlock);
+ *rds_iwdev = iwdev;
+ *cm_id = i_cm_id->cm_id;
+ return 0;
+ }
+ }
+ spin_unlock_irq(&iwdev->spinlock);
+ }
+
+ return 1;
+}
+
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
+ if (!i_cm_id)
+ return -ENOMEM;
+
+ i_cm_id->cm_id = cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ return 0;
+}
+
+static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
+ struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
+ if (i_cm_id->cm_id == cm_id) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct sockaddr_in *src_addr, *dst_addr;
+ struct rds_iw_device *rds_iwdev_old;
+ struct rdma_cm_id *pcm_id;
+ int rc;
+
+ src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+
+ rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
+ if (rc)
+ rds_iw_remove_cm_id(rds_iwdev, cm_id);
+
+ return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&iw_nodev_conns_lock);
+ BUG_ON(list_empty(&iw_nodev_conns));
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+
+ spin_lock(&rds_iwdev->spinlock);
+ list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+ spin_unlock(&rds_iwdev->spinlock);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+
+ ic->rds_iwdev = rds_iwdev;
+}
+
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* place conn on nodev_conns_list */
+ spin_lock(&iw_nodev_conns_lock);
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+
+ spin_unlock(&iw_nodev_conns_lock);
+
+ rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+ ic->rds_iwdev = NULL;
+}
+
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+{
+ struct rds_iw_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(list_lock);
+ list_splice(list, &tmp_list);
+ INIT_LIST_HEAD(list);
+ spin_unlock_irq(list_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
+ rds_conn_destroy(ic->conn);
+}
+
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+ struct scatterlist *list, unsigned int sg_len)
+{
+ sg->list = list;
+ sg->len = sg_len;
+ sg->dma_len = 0;
+ sg->dma_npages = 0;
+ sg->bytes = 0;
+}
+
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+ struct rds_iw_scatterlist *sg)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ u64 *dma_pages = NULL;
+ int i, j, ret;
+
+ WARN_ON(sg->dma_len);
+
+ sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg->dma_len)) {
+ printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+ return ERR_PTR(-EBUSY);
+ }
+
+ sg->bytes = 0;
+ sg->dma_npages = 0;
+
+ ret = -EINVAL;
+ for (i = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ sg->bytes += dma_len;
+
+ end_addr = dma_addr + dma_len;
+ if (dma_addr & PAGE_MASK) {
+ if (i > 0)
+ goto out_unmap;
+ dma_addr &= ~PAGE_MASK;
+ }
+ if (end_addr & PAGE_MASK) {
+ if (i < sg->dma_len - 1)
+ goto out_unmap;
+ end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
+ }
+
+ sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+ }
+
+ /* Now gather the dma addrs into one list */
+ if (sg->dma_npages > fastreg_message_size)
+ goto out_unmap;
+
+ dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+ if (!dma_pages) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ for (i = j = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ end_addr = dma_addr + dma_len;
+ dma_addr &= ~PAGE_MASK;
+ for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+ dma_pages[j++] = dma_addr;
+ BUG_ON(j > sg->dma_npages);
+ }
+
+ return dma_pages;
+
+out_unmap:
+ ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ sg->dma_len = 0;
+ kfree(dma_pages);
+ return ERR_PTR(ret);
+}
+
+
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->device = rds_iwdev;
+ INIT_LIST_HEAD(&pool->dirty_list);
+ INIT_LIST_HEAD(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
+
+ pool->max_message_size = fastreg_message_size;
+ pool->max_items = fastreg_pool_size;
+ pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
+ pool->max_pages = fastreg_message_size;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed more than max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = pool->max_items * 3 / 4;
+
+ return pool;
+}
+
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->max_pages;
+}
+
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+ flush_workqueue(rds_wq);
+ rds_iw_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+ struct rds_iw_mr *ibmr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
+ list_del_init(&ibmr->mapping.m_list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ return ibmr;
+}
+
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+ struct rds_iw_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ while (1) {
+ ibmr = rds_iw_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flush any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ spin_lock_init(&ibmr->mapping.m_lock);
+ INIT_LIST_HEAD(&ibmr->mapping.m_list);
+ ibmr->mapping.m_mr = ibmr;
+
+ err = rds_iw_init_fastreg(pool, ibmr);
+ if (err)
+ goto out_no_cigar;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_device *rds_iwdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+ struct rds_iw_mr *ibmr, *next;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(kill_list);
+ unsigned long flags;
+ unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+ int ret = 0;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+ mutex_lock(&pool->flush_lock);
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ /* Get the list of all mappings to be destroyed */
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ if (free_all)
+ list_splice_init(&pool->clean_list, &kill_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ /* Batched invalidate of dirty MRs.
+ * For FMR based MRs, the mappings on the unmap list are
+ * actually members of an ibmr (ibmr->mapping). They either
+ * migrate to the kill_list, or have been cleaned and should be
+ * moved to the clean_list.
+ * For fastregs, they will be dynamically allocated, and
+ * will be destroyed by the unmap function.
+ */
+ if (!list_empty(&unmap_list)) {
+ ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+ &kill_list, &unpinned);
+ /* If we've been asked to destroy all MRs, move those
+ * that were simply cleaned to the kill list */
+ if (free_all)
+ list_splice_init(&unmap_list, &kill_list);
+ }
+
+ /* Destroy any MRs that are past their best before date */
+ list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+ rds_iw_stats_inc(s_iw_rdma_mr_free);
+ list_del(&ibmr->mapping.m_list);
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+
+ /* Anything that remains are laundered ibmrs, which we can add
+ * back to the clean list. */
+ if (!list_empty(&unmap_list)) {
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ atomic_sub(unpinned, &pool->free_pinned);
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+ mutex_unlock(&pool->flush_lock);
+ return ret;
+}
+
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+ rds_iw_flush_mr_pool(pool, 0);
+}
+
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+ rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+ if (!pool)
+ return;
+
+ /* Return it to the pool's free list */
+ rds_iw_free_fastreg(pool, ibmr);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_iw_flush_mr_pool(pool, 0);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time. */
+ queue_work(rds_wq, &pool->flush_worker);
+ }
+ }
+}
+
+void rds_iw_flush_mrs(void)
+{
+ struct rds_iw_device *rds_iwdev;
+
+ list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ if (pool)
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+}
+
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_mr *ibmr = NULL;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in src = {
+ .sin_addr.s_addr = rs->rs_bound_addr,
+ .sin_port = rs->rs_bound_port,
+ };
+ struct sockaddr_in dst = {
+ .sin_addr.s_addr = rs->rs_conn_addr,
+ .sin_port = rs->rs_conn_port,
+ };
+ int ret;
+
+ ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
+ if (ret || !cm_id) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_iwdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_iw_alloc_mr(rds_iwdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->cm_id = cm_id;
+ ibmr->device = rds_iwdev;
+
+ ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->mr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+ if (ret) {
+ if (ibmr)
+ rds_iw_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ return ibmr;
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
+ * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched to the MR has a bit set showing that RDMa can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct ib_fast_reg_page_list *page_list = NULL;
+ struct ib_mr *mr;
+ int err;
+
+ mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+ return err;
+ }
+
+ /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+ * is not filled in.
+ */
+ page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+ if (IS_ERR(page_list)) {
+ err = PTR_ERR(page_list);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+ ib_dereg_mr(mr);
+ return err;
+ }
+
+ ibmr->page_list = page_list;
+ ibmr->mr = mr;
+ return 0;
+}
+
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+ struct rds_iw_mr *ibmr = mapping->m_mr;
+ struct ib_send_wr f_wr, *failed_wr;
+ int ret;
+
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * counter, which should guarantee uniqueness.
+ */
+ ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+ mapping->m_rkey = ibmr->mr->rkey;
+
+ memset(&f_wr, 0, sizeof(f_wr));
+ f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+ f_wr.opcode = IB_WR_FAST_REG_MR;
+ f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+ f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+ f_wr.wr.fast_reg.page_list = ibmr->page_list;
+ f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
+ f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ f_wr.wr.fast_reg.iova_start = 0;
+ f_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &f_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+ BUG_ON(failed_wr != &f_wr);
+ if (ret)
+ printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+}
+
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+ struct ib_send_wr s_wr, *failed_wr;
+ int ret = 0;
+
+ if (!ibmr->cm_id->qp || !ibmr->mr)
+ goto out;
+
+ memset(&s_wr, 0, sizeof(s_wr));
+ s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+ s_wr.opcode = IB_WR_LOCAL_INV;
+ s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+ s_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &s_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+ if (ret) {
+ printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg,
+ unsigned int sg_len)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct rds_iw_mapping *mapping = &ibmr->mapping;
+ u64 *dma_pages;
+ int i, ret = 0;
+
+ rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+ if (IS_ERR(dma_pages)) {
+ ret = PTR_ERR(dma_pages);
+ dma_pages = NULL;
+ goto out;
+ }
+
+ if (mapping->m_sg.dma_len > pool->max_message_size) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+ ibmr->page_list->page_list[i] = dma_pages[i];
+
+ ret = rds_iw_rdma_build_fastreg(mapping);
+ if (ret)
+ goto out;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+/*
+ * "Free" a fastreg MR.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ unsigned long flags;
+ int ret;
+
+ if (!ibmr->mapping.m_sg.dma_len)
+ return;
+
+ ret = rds_iw_rdma_fastreg_inv(ibmr);
+ if (ret)
+ return;
+
+ /* Try to post the LOCAL_INV WR to the queue. */
+ spin_lock_irqsave(&pool->list_lock, flags);
+
+ list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+ atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list,
+ int *unpinned)
+{
+ struct rds_iw_mapping *mapping, *next;
+ unsigned int ncleaned = 0;
+ LIST_HEAD(laundered);
+
+ /* Batched invalidation of fastreg MRs.
+ * Why do we do it this way, even though we could pipeline unmap
+ * and remap? The reason is the application semantics - when the
+ * application requests an invalidation of MRs, it expects all
+ * previously released R_Keys to become invalid.
+ *
+ * If we implement MR reuse naively, we risk memory corruption
+ * (this has actually been observed). So the default behavior
+ * requires that a MR goes through an explicit unmap operation before
+ * we can reuse it again.
+ *
+ * We could probably improve on this a little, by allowing immediate
+ * reuse of a MR on the same socket (eg you could add small
+ * cache of unused MRs to strct rds_socket - GET_MR could grab one
+ * of these without requiring an explicit invalidate).
+ */
+ while (!list_empty(unmap_list)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+ *unpinned += mapping->m_sg.len;
+ list_move(&mapping->m_list, &laundered);
+ ncleaned++;
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ /* Move all laundered mappings back to the unmap list.
+ * We do not kill any WRs right now - it doesn't seem the
+ * fastreg API has a max_remap limit. */
+ list_splice_init(&laundered, unmap_list);
+
+ return ncleaned;
+}
+
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ if (ibmr->page_list)
+ ib_free_fast_reg_page_list(ibmr->page_list);
+ if (ibmr->mr)
+ ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 000000000..a66d1794b
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,904 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;
+static struct kmem_cache *rds_iw_frag_slab;
+static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
+
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page);
+ kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get send completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
+
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_recv_work *recv;
+ u32 i;
+
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_iwinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+ }
+}
+
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ if (recv->r_iwinc) {
+ rds_inc_put(&recv->r_iwinc->ii_inc);
+ recv->r_iwinc = NULL;
+ }
+ if (recv->r_frag) {
+ rds_iw_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_iw_frag_drop_page(recv->r_frag);
+ rds_iw_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rds_iw_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+
+ if (!recv->r_iwinc) {
+ if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+ rds_iw_stats_inc(s_iw_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+ kptr_gfp);
+ if (!recv->r_iwinc) {
+ atomic_dec(&rds_iw_allocation);
+ goto out;
+ }
+ INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+ rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (!recv->r_frag) {
+ recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+ if (!recv->r_frag)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (!ic->i_frag.f_page) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (!ic->i_frag.f_page)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+ goto out;
+
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+ while ((prefill || rds_conn_up(conn)) &&
+ rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_iwinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
+ if (ret) {
+ rds_iw_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_iw_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
+}
+
+static void rds_iw_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
+
+ list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_iw_frag_drop_page(frag);
+ rds_iw_frag_free(frag);
+ }
+}
+
+void rds_iw_inc_free(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+
+ rds_iw_inc_purge(inc);
+ rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
+ BUG_ON(!list_empty(&iwinc->ii_frags));
+ kmem_cache_free(rds_iw_incoming_slab, iwinc);
+ atomic_dec(&rds_iw_allocation);
+ BUG_ON(atomic_read(&rds_iw_allocation) < 0);
+}
+
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (iov_iter_count(to) && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ to_copy = min_t(unsigned long, iov_iter_count(to),
+ RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ /* XXX needs + offset for multiple recvs per page */
+ rds_stats_add(s_copy_to_user, to_copy);
+ ret = copy_page_to_iter(frag->f_page,
+ frag->f_offset + frag_off,
+ to_copy,
+ to);
+ if (ret != to_copy)
+ return -EFAULT;
+
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IW_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by have a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+ int ack_required)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ ic->i_ack_next = seq;
+ if (ack_required)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+ unsigned long flags;
+ u64 seq;
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ spin_lock_irqsave(&ic->i_ack_lock, flags);
+ seq = ic->i_ack_next;
+ spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+ return seq;
+}
+#else
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+ int ack_required)
+{
+ atomic64_set(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_atomic();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_atomic();
+
+ return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_iw_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_iw_stats_inc(s_iw_ack_send_failure);
+
+ rds_iw_conn_error(ic->conn, "sending ack failed\n");
+ } else
+ rds_iw_stats_inc(s_iw_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_iw_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_iw_attempt_ack(struct rds_iw_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_iw_stats_inc(s_iw_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_iw_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_iw_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_iw_stats_inc(s_iw_ack_send_piggybacked);
+ return rds_iw_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_iw_cong_recv(struct rds_connection *conn,
+ struct rds_iw_incoming *iwinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(frag->f_page);
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, ie
+ * bits that changed from 0 to 1. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_iw_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_iw_process_recv(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv, u32 byte_len,
+ struct rds_iw_ack_state *state)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_incoming *iwinc = ic->i_iwinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
+ byte_len);
+
+ if (byte_len < sizeof(struct rds_header)) {
+ rds_iw_conn_error(conn, "incoming message "
+ "from %pI4 didn't include a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ byte_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_iw_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_iw_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ /* This is an ACK-only packet. The fact that it gets
+ * special treatment here is that historically, ACKs
+ * were rather special beasts.
+ */
+ rds_iw_stats_inc(s_iw_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page. We can leave the frag, though, it will be
+ * reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_iw_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message.. copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (!iwinc) {
+ iwinc = recv->r_iwinc;
+ recv->r_iwinc = NULL;
+ ic->i_iwinc = iwinc;
+
+ hdr = &iwinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &iwinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence ||
+ hdr->h_len != ihdr->h_len ||
+ hdr->h_sport != ihdr->h_sport ||
+ hdr->h_dport != ihdr->h_dport) {
+ rds_iw_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_iwinc = NULL;
+
+ if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_iw_cong_recv(conn, iwinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &iwinc->ii_inc, GFP_ATOMIC);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&iwinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_iw_stats_inc(s_iw_rx_cq_call);
+
+ tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+static inline void rds_poll_cq(struct rds_iw_connection *ic,
+ struct rds_iw_ack_state *state)
+{
+ struct rds_connection *conn = ic->conn;
+ struct ib_wc wc;
+ struct rds_iw_recv_work *recv;
+
+ while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
+
+ rds_iw_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_iw_process_recv(conn, recv, wc.byte_len, state);
+ } else {
+ rds_iw_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
+ }
+
+ rds_iw_ring_free(&ic->i_recv_ring, 1);
+ }
+}
+
+void rds_iw_recv_tasklet_fn(unsigned long data)
+{
+ struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
+ struct rds_connection *conn = ic->conn;
+ struct rds_iw_ack_state state = { 0, };
+
+ rds_poll_cq(ic, &state);
+ ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ rds_poll_cq(ic, &state);
+
+ if (state.ack_next_valid)
+ rds_iw_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_iw_ring_empty(&ic->i_recv_ring))
+ rds_iw_stats_inc(s_iw_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rds_iw_ring_low(&ic->i_recv_ring))
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_iw_recv(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_iw_stats_inc(s_iw_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ return ret;
+}
+
+int rds_iw_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+ /* Default to 30% of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
+ sizeof(struct rds_iw_incoming),
+ 0, 0, NULL);
+ if (!rds_iw_incoming_slab)
+ goto out;
+
+ rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
+ sizeof(struct rds_page_frag),
+ 0, 0, NULL);
+ if (!rds_iw_frag_slab)
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_iw_recv_exit(void)
+{
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ kmem_cache_destroy(rds_iw_frag_slab);
+}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 000000000..da8e3b63f
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
+
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
+
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_iw_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) == 0;
+}
+
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_iw_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
+ if (__rds_iw_ring_empty(ring) &&
+ waitqueue_active(&rds_iw_ring_empty_wait))
+ wake_up(&rds_iw_ring_empty_wait);
+}
+
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_empty(ring);
+}
+
+int rds_iw_ring_low(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
+}
+
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 000000000..13834780a
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static void rds_iw_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
+ struct rm_rdma_op *op)
+{
+ if (op->op_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents,
+ op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->op_mapped = 0;
+ }
+}
+
+static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->data.op_sg, rm->data.op_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->rdma.op_active) {
+ rds_iw_send_unmap_rdma(ic, &rm->rdma);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_iw_send_rdma_complete(rm, wc_status);
+
+ if (rm->rdma.op_write)
+ rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+ }
+
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+
+ rds_message_put(rm);
+ send->s_rm = NULL;
+}
+
+void rds_iw_send_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_rm = NULL;
+ send->s_op = NULL;
+ send->s_mapping = NULL;
+
+ send->s_wr.next = NULL;
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+
+ send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+ if (IS_ERR(send->s_mr)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+ break;
+ }
+
+ send->s_page_list = ib_alloc_fast_reg_page_list(
+ ic->i_cm_id->device, fastreg_message_size);
+ if (IS_ERR(send->s_page_list)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+ break;
+ }
+ }
+}
+
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ BUG_ON(!send->s_mr);
+ ib_dereg_mr(send->s_mr);
+ BUG_ON(!send->s_page_list);
+ ib_free_fast_reg_page_list(send->s_page_list);
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_iw_send_unmap_rdma(ic, send->s_op);
+ }
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i;
+ int ret;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_iw_stats_inc(s_iw_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_tx_cq_event);
+
+ if (wc.status != IB_WC_SUCCESS) {
+ printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
+ break;
+ }
+
+ if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
+ ic->i_fastreg_posted = 0;
+ continue;
+ }
+
+ if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+ ic->i_fastreg_posted = 1;
+ continue;
+ }
+
+ if (wc.wr_id == RDS_IW_ACK_WR_ID) {
+ if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ rds_iw_stats_inc(s_iw_tx_stalled);
+ rds_iw_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_iw_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_FAST_REG_MR:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ printk_ratelimited(KERN_NOTICE
+ "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (time_after(jiffies, send->s_queued + HZ/2))
+ rds_iw_stats_inc(s_iw_tx_stalled);
+
+ /* If a RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_iw_send_rdma_complete(rm, wc.status);
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_iw_ring_free(&ic->i_send_ring, completed);
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+ test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_iw_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overruning the receiver's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_iw_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("wanted=%u credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants
+ * the posted regardless of whether any send credits are
+ * available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, max_posted);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
+
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("credits=%u current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_iw_stats_inc(s_iw_rx_credit_updates);
+}
+
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and has to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
+{
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
+ }
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* Fastreg support */
+ if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (!ic->i_rm) {
+ /*
+ printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->data.op_nents) {
+ rm->data.op_count = ib_dma_map_sg(dev,
+ rm->data.op_sg,
+ rm->data.op_nents,
+ DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+ if (rm->data.op_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->data.op_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ rds_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->rdma.op_active) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_iw_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ }
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->data.op_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->rdma.op_active && rm->rdma.op_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
+
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_iw_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_iw_stats_inc(s_iw_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_iw_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+ goto out;
+ }
+
+ ret = sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
+
+static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+{
+ BUG_ON(nent > send->s_page_list->max_page_list_len);
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR.
+ */
+ send->s_wr.opcode = IB_WR_FAST_REG_MR;
+ send->s_wr.wr.fast_reg.length = len;
+ send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
+ send->s_wr.wr.fast_reg.page_list = send->s_page_list;
+ send->s_wr.wr.fast_reg.page_list_len = nent;
+ send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
+ send->s_wr.wr.fast_reg.iova_start = sg_addr;
+
+ ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+}
+
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct rds_iw_device *rds_iwdev;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->op_remote_addr;
+ u32 pos, fr_pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+
+ /* map the message the first time we see it */
+ if (!op->op_mapped) {
+ op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents, (op->op_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+ if (op->op_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->op_mapped = 1;
+ }
+
+ if (!op->op_write) {
+ /* Alloc space on the send queue for the fastreg */
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
+ if (work_alloc != 1) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->op_count, rds_iwdev->max_sge);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ if (!op->op_write) {
+ first = prev = &ic->i_sends[fr_pos];
+ } else {
+ first = send;
+ prev = NULL;
+ }
+ scat = &op->op_sg[0];
+ sent = 0;
+ num_sge = op->op_count;
+
+ for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ /* To avoid the need to have the plumbing to invalidate the fastreg_mr used
+ * for local access after RDS is finished with it, using
+ * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
+ */
+ if (op->op_write)
+ send->s_wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_op = op;
+
+ if (num_sge > rds_iwdev->max_sge) {
+ send->s_wr.num_sge = rds_iwdev->max_sge;
+ num_sge -= rds_iwdev->max_sge;
+ } else
+ send->s_wr.num_sge = num_sge;
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
+ send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ else {
+ send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
+ }
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+ remote_addr += len;
+
+ scat++;
+ }
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+ send->s_wr.num_sge = 1;
+ send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
+ send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
+ send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->op_sg[op->op_count])
+ first->s_wr.send_flags = IB_SEND_SIGNALED;
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
+ * recommended. Putting the lkey on the wire is a security hole, as it can
+ * allow for memory access to all of memory on the remote system. Some
+ * adapters do not allow using the lkey for this at all. To bypass this use a
+ * fastreg_mr (or possibly a dma_mr)
+ */
+ if (!op->op_write) {
+ rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
+ op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ work_alloc++;
+ }
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+void rds_iw_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_iw_attempt_ack(ic);
+}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 000000000..5fe67f6a1
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "iw.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+
+static const char *const rds_iw_stat_names[] = {
+ "iw_connect_raced",
+ "iw_listen_closed_stale",
+ "iw_tx_cq_call",
+ "iw_tx_cq_event",
+ "iw_tx_ring_full",
+ "iw_tx_throttle",
+ "iw_tx_sg_mapping_failure",
+ "iw_tx_stalled",
+ "iw_tx_credit_updates",
+ "iw_rx_cq_call",
+ "iw_rx_cq_event",
+ "iw_rx_ring_empty",
+ "iw_rx_refill_from_cq",
+ "iw_rx_refill_from_thread",
+ "iw_rx_alloc_limit",
+ "iw_rx_credit_updates",
+ "iw_ack_sent",
+ "iw_ack_send_failure",
+ "iw_ack_send_delayed",
+ "iw_ack_send_piggybacked",
+ "iw_ack_received",
+ "iw_rdma_mr_alloc",
+ "iw_rdma_mr_free",
+ "iw_rdma_mr_used",
+ "iw_rdma_mr_pool_flush",
+ "iw_rdma_mr_pool_wait",
+ "iw_rdma_mr_pool_depleted",
+};
+
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_iw_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_iw_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
+ ARRAY_SIZE(rds_iw_stat_names));
+out:
+ return ARRAY_SIZE(rds_iw_stat_names);
+}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 000000000..139239d2c
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "iw.h"
+
+static struct ctl_table_header *rds_iw_sysctl_hdr;
+
+unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
+unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
+unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_iw_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_iw_sysctl_flow_control = 1;
+
+static struct ctl_table rds_iw_sysctl_table[] = {
+ {
+ .procname = "max_send_wr",
+ .data = &rds_iw_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_recv_wr",
+ .data = &rds_iw_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .procname = "max_unsignaled_wr",
+ .data = &rds_iw_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
+ },
+ {
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_iw_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .procname = "max_recv_allocation",
+ .data = &rds_iw_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "flow_control",
+ .data = &rds_iw_sysctl_flow_control,
+ .maxlen = sizeof(rds_iw_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+void rds_iw_sysctl_exit(void)
+{
+ unregister_net_sysctl_table(rds_iw_sysctl_hdr);
+}
+
+int rds_iw_sysctl_init(void)
+{
+ rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
+ if (!rds_iw_sysctl_hdr)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000..6b12b6854
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static DEFINE_SPINLOCK(loop_conns_lock);
+static LIST_HEAD(loop_conns);
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport. At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver. In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg,
+ unsigned int off)
+{
+ struct scatterlist *sgp = &rm->data.op_sg[sg];
+ int ret = sizeof(struct rds_header) +
+ be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ /* Do not send cong updates to loopback */
+ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+ ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
+ goto out;
+ }
+
+ BUG_ON(hdr_off || sg || off);
+
+ rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ /* For the embedded inc. Matching put is in loop_inc_free() */
+ rds_message_addref(rm);
+
+ rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ GFP_KERNEL);
+
+ rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+ NULL);
+
+ rds_inc_put(&rm->m_inc);
+out:
+ return ret;
+}
+
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
+}
+
+/* we need to at least give the thread something to succeed */
+static int rds_loop_recv(struct rds_connection *conn)
+{
+ return 0;
+}
+
+struct rds_loop_connection {
+ struct list_head loop_node;
+ struct rds_connection *conn;
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rds_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_loop_connection *lc;
+ unsigned long flags;
+
+ lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
+ if (!lc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&lc->loop_node);
+ lc->conn = conn;
+ conn->c_transport_data = lc;
+
+ spin_lock_irqsave(&loop_conns_lock, flags);
+ list_add_tail(&lc->loop_node, &loop_conns);
+ spin_unlock_irqrestore(&loop_conns_lock, flags);
+
+ return 0;
+}
+
+static void rds_loop_conn_free(void *arg)
+{
+ struct rds_loop_connection *lc = arg;
+ unsigned long flags;
+
+ rdsdebug("lc %p\n", lc);
+ spin_lock_irqsave(&loop_conns_lock, flags);
+ list_del(&lc->loop_node);
+ spin_unlock_irqrestore(&loop_conns_lock, flags);
+ kfree(lc);
+}
+
+static int rds_loop_conn_connect(struct rds_connection *conn)
+{
+ rds_connect_complete(conn);
+ return 0;
+}
+
+static void rds_loop_conn_shutdown(struct rds_connection *conn)
+{
+}
+
+void rds_loop_exit(void)
+{
+ struct rds_loop_connection *lc, *_lc;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&loop_conns_lock);
+ list_splice(&loop_conns, &tmp_list);
+ INIT_LIST_HEAD(&loop_conns);
+ spin_unlock_irq(&loop_conns_lock);
+
+ list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
+ WARN_ON(lc->conn->c_passive);
+ rds_conn_destroy(lc->conn);
+ }
+}
+
+/*
+ * This is missing .xmit_* because loop doesn't go through generic
+ * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+ .xmit = rds_loop_xmit,
+ .recv = rds_loop_recv,
+ .conn_alloc = rds_loop_conn_alloc,
+ .conn_free = rds_loop_conn_free,
+ .conn_connect = rds_loop_conn_connect,
+ .conn_shutdown = rds_loop_conn_shutdown,
+ .inc_copy_to_user = rds_message_inc_copy_to_user,
+ .inc_free = rds_loop_inc_free,
+ .t_name = "loopback",
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000..f32b0939a
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c */
+extern struct rds_transport rds_loop_transport;
+
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000..756c73729
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
+[RDS_EXTHDR_NONE] = 0,
+[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+};
+
+
+void rds_message_addref(struct rds_message *rm)
+{
+ rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+ atomic_inc(&rm->m_refcount);
+}
+EXPORT_SYMBOL_GPL(rds_message_addref);
+
+/*
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+ unsigned long i;
+
+ if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+ return;
+
+ for (i = 0; i < rm->data.op_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
+ /* XXX will have to put_page for page refs */
+ __free_page(sg_page(&rm->data.op_sg[i]));
+ }
+ rm->data.op_nents = 0;
+
+ if (rm->rdma.op_active)
+ rds_rdma_free_op(&rm->rdma);
+ if (rm->rdma.op_rdma_mr)
+ rds_mr_put(rm->rdma.op_rdma_mr);
+
+ if (rm->atomic.op_active)
+ rds_atomic_free_op(&rm->atomic);
+ if (rm->atomic.op_rdma_mr)
+ rds_mr_put(rm->atomic.op_rdma_mr);
+}
+
+void rds_message_put(struct rds_message *rm)
+{
+ rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+ WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+ if (atomic_dec_and_test(&rm->m_refcount)) {
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ BUG_ON(!list_empty(&rm->m_conn_item));
+ rds_message_purge(rm);
+
+ kfree(rm);
+ }
+}
+EXPORT_SYMBOL_GPL(rds_message_put);
+
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq)
+{
+ hdr->h_flags = 0;
+ hdr->h_sport = sport;
+ hdr->h_dport = dport;
+ hdr->h_sequence = cpu_to_be64(seq);
+ hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+EXPORT_SYMBOL_GPL(rds_message_populate_header);
+
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+ const void *data, unsigned int len)
+{
+ unsigned int ext_len = sizeof(u8) + len;
+ unsigned char *dst;
+
+ /* For now, refuse to add more than one extension header */
+ if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
+ return 0;
+
+ if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ return 0;
+
+ if (ext_len >= RDS_HEADER_EXT_SPACE)
+ return 0;
+ dst = hdr->h_exthdr;
+
+ *dst++ = type;
+ memcpy(dst, data, len);
+
+ dst[len] = RDS_EXTHDR_NONE;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(rds_message_add_extension);
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ * buflen = sizeof(buffer);
+ * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ * if (type == RDS_EXTHDR_NONE)
+ * break;
+ * ...
+ * }
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen)
+{
+ unsigned int offset, ext_type, ext_len;
+ u8 *src = hdr->h_exthdr;
+
+ offset = *pos;
+ if (offset >= RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ /* Get the extension type and length. For now, the
+ * length is implied by the extension type. */
+ ext_type = src[offset++];
+
+ if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
+ goto none;
+ ext_len = rds_exthdr_size[ext_type];
+ if (offset + ext_len > RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ *pos = offset + ext_len;
+ if (ext_len < *buflen)
+ *buflen = ext_len;
+ memcpy(buf, src + offset, *buflen);
+ return ext_type;
+
+none:
+ *pos = RDS_HEADER_EXT_SPACE;
+ *buflen = 0;
+ return RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+ struct rds_ext_header_rdma_dest ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
+ ext_hdr.h_rdma_offset = cpu_to_be32(offset);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+}
+EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
+
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+{
+ struct rds_message *rm;
+
+ if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
+ return NULL;
+
+ rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+ if (!rm)
+ goto out;
+
+ rm->m_used_sgs = 0;
+ rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
+ atomic_set(&rm->m_refcount, 1);
+ INIT_LIST_HEAD(&rm->m_sock_item);
+ INIT_LIST_HEAD(&rm->m_conn_item);
+ spin_lock_init(&rm->m_rs_lock);
+ init_waitqueue_head(&rm->m_flush_wait);
+
+out:
+ return rm;
+}
+
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+ struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+ struct scatterlist *sg_ret;
+
+ WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+ WARN_ON(!nents);
+
+ if (rm->m_used_sgs + nents > rm->m_total_sgs)
+ return NULL;
+
+ sg_ret = &sg_first[rm->m_used_sgs];
+ sg_init_table(sg_ret, nents);
+ rm->m_used_sgs += nents;
+
+ return sg_ret;
+}
+
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+ struct rds_message *rm;
+ unsigned int i;
+ int num_sgs = ceil(total_len, PAGE_SIZE);
+ int extra_bytes = num_sgs * sizeof(struct scatterlist);
+
+ rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+ if (!rm)
+ return ERR_PTR(-ENOMEM);
+
+ set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+ rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ if (!rm->data.op_sg) {
+ rds_message_put(rm);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < rm->data.op_nents; ++i) {
+ sg_set_page(&rm->data.op_sg[i],
+ virt_to_page(page_addrs[i]),
+ PAGE_SIZE, 0);
+ }
+
+ return rm;
+}
+
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+{
+ unsigned long to_copy, nbytes;
+ unsigned long sg_off;
+ struct scatterlist *sg;
+ int ret = 0;
+
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+
+ /*
+ * now allocate and copy in the data payload.
+ */
+ sg = rm->data.op_sg;
+ sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+
+ while (iov_iter_count(from)) {
+ if (!sg_page(sg)) {
+ ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
+ GFP_HIGHUSER);
+ if (ret)
+ return ret;
+ rm->data.op_nents++;
+ sg_off = 0;
+ }
+
+ to_copy = min_t(unsigned long, iov_iter_count(from),
+ sg->length - sg_off);
+
+ rds_stats_add(s_copy_from_user, to_copy);
+ nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off,
+ to_copy, from);
+ if (nbytes != to_copy)
+ return -EFAULT;
+
+ sg_off += to_copy;
+
+ if (sg_off == sg->length)
+ sg++;
+ }
+
+ return ret;
+}
+
+int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+ struct rds_message *rm;
+ struct scatterlist *sg;
+ unsigned long to_copy;
+ unsigned long vec_off;
+ int copied;
+ int ret;
+ u32 len;
+
+ rm = container_of(inc, struct rds_message, m_inc);
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ sg = rm->data.op_sg;
+ vec_off = 0;
+ copied = 0;
+
+ while (iov_iter_count(to) && copied < len) {
+ to_copy = min_t(unsigned long, iov_iter_count(to),
+ sg->length - vec_off);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rds_stats_add(s_copy_to_user, to_copy);
+ ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off,
+ to_copy, to);
+ if (ret != to_copy)
+ return -EFAULT;
+
+ vec_off += to_copy;
+ copied += to_copy;
+
+ if (vec_off == sg->length) {
+ vec_off = 0;
+ sg++;
+ }
+ }
+
+ return copied;
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+ wait_event_interruptible(rm->m_flush_wait,
+ !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+void rds_message_unmapped(struct rds_message *rm)
+{
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ wake_up_interruptible(&rm->m_flush_wait);
+}
+EXPORT_SYMBOL_GPL(rds_message_unmapped);
+
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000..9005a2c92
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+struct rds_page_remainder {
+ struct page *r_page;
+ unsigned long r_offset;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
+ rds_page_remainders);
+
+/*
+ * returns 0 on success or -errno on failure.
+ *
+ * We don't have to worry about flush_dcache_page() as this only works
+ * with private pages. If, say, we were to do directed receive to pinned
+ * user pages we'd have to worry more about cache coherence. (Though
+ * the flush_dcache_page() in get_user_pages() would probably be enough).
+ */
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user)
+{
+ unsigned long ret;
+ void *addr;
+
+ addr = kmap(page);
+ if (to_user) {
+ rds_stats_add(s_copy_to_user, bytes);
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ } else {
+ rds_stats_add(s_copy_from_user, bytes);
+ ret = copy_from_user(addr + offset, ptr, bytes);
+ }
+ kunmap(page);
+
+ return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(rds_page_copy_user);
+
+/**
+ * rds_page_remainder_alloc - build up regions of a message.
+ *
+ * @scat: Scatter list for message
+ * @bytes: the number of bytes needed.
+ * @gfp: the waiting behaviour of the allocation
+ *
+ * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
+ * kmap the pages, etc.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page().
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure. Future partial-page allocations may be
+ * satisfied from that cached region. This lets us waste less memory on
+ * small allocations with minimal complexity. It works because the transmit
+ * path passes read-only page regions down to devices. They hold a page
+ * reference until they are done with the region.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp)
+{
+ struct rds_page_remainder *rem;
+ unsigned long flags;
+ struct page *page;
+ int ret;
+
+ gfp |= __GFP_HIGHMEM;
+
+ /* jump straight to allocation if we're trying for a huge page */
+ if (bytes >= PAGE_SIZE) {
+ page = alloc_page(gfp);
+ if (!page) {
+ ret = -ENOMEM;
+ } else {
+ sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ while (1) {
+ /* avoid a tiny region getting stuck by tossing it */
+ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+ rds_stats_inc(s_page_remainder_miss);
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+
+ /* hand out a fragment from the cached page */
+ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+ sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+ get_page(sg_page(scat));
+
+ if (rem->r_offset != 0)
+ rds_stats_inc(s_page_remainder_hit);
+
+ rem->r_offset += bytes;
+ if (rem->r_offset == PAGE_SIZE) {
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+ ret = 0;
+ break;
+ }
+
+ /* alloc if there is nothing for us to use */
+ local_irq_restore(flags);
+ put_cpu();
+
+ page = alloc_page(gfp);
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* did someone race to fill the remainder before us? */
+ if (rem->r_page) {
+ __free_page(page);
+ continue;
+ }
+
+ /* otherwise install our page and loop around to alloc */
+ rem->r_page = page;
+ rem->r_offset = 0;
+ }
+
+ local_irq_restore(flags);
+ put_cpu();
+out:
+ rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+ ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+ ret ? 0 : scat->length);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
+
+static int rds_page_remainder_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ struct rds_page_remainder *rem;
+ long cpu = (long)hcpu;
+
+ rem = &per_cpu(rds_page_remainders, cpu);
+
+ rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+
+ switch (action) {
+ case CPU_DEAD:
+ if (rem->r_page)
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block rds_page_remainder_nb = {
+ .notifier_call = rds_page_remainder_cpu_notify,
+};
+
+void rds_page_exit(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
+ (unsigned long)CPU_DEAD,
+ (void *)(long)i);
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000..40084d843
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rds.h"
+
+/*
+ * XXX
+ * - build with sparse
+ * - should we limit the size of a mr region? let transport return failure?
+ * - should we detect duplicate keys on a socket? hmm.
+ * - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid. It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int. This comes
+ * from being stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+ if ((vec->addr + vec->bytes <= vec->addr) ||
+ (vec->bytes > (u64)UINT_MAX))
+ return 0;
+
+ return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (vec->addr >> PAGE_SHIFT);
+}
+
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+ struct rds_mr *insert)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_mr *mr;
+
+ while (*p) {
+ parent = *p;
+ mr = rb_entry(parent, struct rds_mr, r_rb_node);
+
+ if (key < mr->r_key)
+ p = &(*p)->rb_left;
+ else if (key > mr->r_key)
+ p = &(*p)->rb_right;
+ else
+ return mr;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->r_rb_node, parent, p);
+ rb_insert_color(&insert->r_rb_node, root);
+ atomic_inc(&insert->r_refcount);
+ }
+ return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+ struct rds_sock *rs = mr->r_sock;
+ void *trans_private = NULL;
+ unsigned long flags;
+
+ rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+ mr->r_key, atomic_read(&mr->r_refcount));
+
+ if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
+ return;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ if (!RB_EMPTY_NODE(&mr->r_rb_node))
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ trans_private = mr->r_trans_private;
+ mr->r_trans_private = NULL;
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (trans_private)
+ mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+void __rds_put_mr_final(struct rds_mr *mr)
+{
+ rds_destroy_mr(mr);
+ kfree(mr);
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+ struct rds_mr *mr;
+ struct rb_node *node;
+ unsigned long flags;
+
+ /* Release any MRs associated with this socket */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ while ((node = rb_first(&rs->rs_rdma_keys))) {
+ mr = container_of(node, struct rds_mr, r_rb_node);
+ if (mr->r_trans == rs->rs_transport)
+ mr->r_invalidate = 0;
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (rs->rs_transport && rs->rs_transport->flush_mrs)
+ rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+ struct page **pages, int write)
+{
+ int ret;
+
+ ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+
+ if (ret >= 0 && ret < nr_pages) {
+ while (ret--)
+ put_page(pages[ret]);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+ u64 *cookie_ret, struct rds_mr **mr_ret)
+{
+ struct rds_mr *mr = NULL, *found;
+ unsigned int nr_pages;
+ struct page **pages = NULL;
+ struct scatterlist *sg;
+ void *trans_private;
+ unsigned long flags;
+ rds_rdma_cookie_t cookie;
+ unsigned int nents;
+ long i;
+ int ret;
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (!rs->rs_transport->get_mr) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ nr_pages = rds_pages_in_vec(&args->vec);
+ if (nr_pages == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+ args->vec.addr, args->vec.bytes, nr_pages);
+
+ /* XXX clamp nr_pages to limit the size of this alloc? */
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+ if (!mr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ atomic_set(&mr->r_refcount, 1);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ mr->r_trans = rs->rs_transport;
+ mr->r_sock = rs;
+
+ if (args->flags & RDS_RDMA_USE_ONCE)
+ mr->r_use_once = 1;
+ if (args->flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ if (args->flags & RDS_RDMA_READWRITE)
+ mr->r_write = 1;
+
+ /*
+ * Pin the pages that make up the user buffer and transfer the page
+ * pointers to the mr's sg array. We check to see if we've mapped
+ * the whole region after transferring the partial page references
+ * to the sg array so that we can have one page ref cleanup path.
+ *
+ * For now we have no flag that tells us whether the mapping is
+ * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
+ * the zero page.
+ */
+ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+ if (ret < 0)
+ goto out;
+
+ nents = ret;
+ sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ if (!sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ WARN_ON(!nents);
+ sg_init_table(sg, nents);
+
+ /* Stick all pages into the scatterlist */
+ for (i = 0 ; i < nents; i++)
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+ rdsdebug("RDS: trans_private nents is %u\n", nents);
+
+ /* Obtain a transport specific MR. If this succeeds, the
+ * s/g list is now owned by the MR.
+ * Note that dma_map() implies that pending writes are
+ * flushed to RAM, so no dma_sync is needed here. */
+ trans_private = rs->rs_transport->get_mr(sg, nents, rs,
+ &mr->r_key);
+
+ if (IS_ERR(trans_private)) {
+ for (i = 0 ; i < nents; i++)
+ put_page(sg_page(&sg[i]));
+ kfree(sg);
+ ret = PTR_ERR(trans_private);
+ goto out;
+ }
+
+ mr->r_trans_private = trans_private;
+
+ rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+ mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+ /* The user may pass us an unaligned address, but we can only
+ * map page aligned regions. So we keep the offset, and build
+ * a 64bit cookie containing <R_Key, offset> and pass that
+ * around. */
+ cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+ if (cookie_ret)
+ *cookie_ret = cookie;
+
+ if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Inserting the new MR into the rbtree bumps its
+ * reference count. */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ BUG_ON(found && found != mr);
+
+ rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+ if (mr_ret) {
+ atomic_inc(&mr->r_refcount);
+ *mr_ret = mr;
+ }
+
+ ret = 0;
+out:
+ kfree(pages);
+ if (mr)
+ rds_mr_put(mr);
+ return ret;
+}
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_get_mr_args args;
+
+ if (optlen != sizeof(struct rds_get_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
+ sizeof(struct rds_get_mr_args)))
+ return -EFAULT;
+
+ return __rds_rdma_map(rs, &args, NULL, NULL);
+}
+
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_get_mr_for_dest_args args;
+ struct rds_get_mr_args new_args;
+
+ if (optlen != sizeof(struct rds_get_mr_for_dest_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+ sizeof(struct rds_get_mr_for_dest_args)))
+ return -EFAULT;
+
+ /*
+ * Initially, just behave like get_mr().
+ * TODO: Implement get_mr as wrapper around this
+ * and deprecate it.
+ */
+ new_args.vec = args.vec;
+ new_args.cookie_addr = args.cookie_addr;
+ new_args.flags = args.flags;
+
+ return __rds_rdma_map(rs, &new_args, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ */
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_free_mr_args args;
+ struct rds_mr *mr;
+ unsigned long flags;
+
+ if (optlen != sizeof(struct rds_free_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
+ sizeof(struct rds_free_mr_args)))
+ return -EFAULT;
+
+ /* Special case - a null cookie means flush all unused MRs */
+ if (args.cookie == 0) {
+ if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+ return -EINVAL;
+ rs->rs_transport->flush_mrs();
+ return 0;
+ }
+
+ /* Look up the MR given its R_key and remove it from the rbtree
+ * so nobody else finds it.
+ * This should also prevent races with rds_rdma_unuse.
+ */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+ if (mr) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ if (args.flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (!mr)
+ return -EINVAL;
+
+ /*
+ * call rds_destroy_mr() ourselves so that we're sure it's done by the time
+ * we return. If we let rds_mr_put() do it it might not happen until
+ * someone else drops their ref.
+ */
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+ struct rds_mr *mr;
+ unsigned long flags;
+ int zot_me = 0;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (!mr) {
+ printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ return;
+ }
+
+ if (mr->r_use_once || force) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ zot_me = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ /* May have to issue a dma_sync on this memory region.
+ * Note we could avoid this if the operation was a RDMA READ,
+ * but at this point we can't tell. */
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+}
+
+void rds_rdma_free_op(struct rm_rdma_op *ro)
+{
+ unsigned int i;
+
+ for (i = 0; i < ro->op_nents; i++) {
+ struct page *page = sg_page(&ro->op_sg[i]);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory */
+ if (!ro->op_write) {
+ BUG_ON(irqs_disabled());
+ set_page_dirty(page);
+ }
+ put_page(page);
+ }
+
+ kfree(ro->op_notifier);
+ ro->op_notifier = NULL;
+ ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+ struct page *page = sg_page(ao->op_sg);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory */
+ set_page_dirty(page);
+ put_page(page);
+
+ kfree(ao->op_notifier);
+ ao->op_notifier = NULL;
+ ao->op_active = 0;
+}
+
+
+/*
+ * Count the number of pages needed to describe an incoming iovec array.
+ */
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+ int tot_pages = 0;
+ unsigned int nr_pages;
+ unsigned int i;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < nr_iovecs; i++) {
+ nr_pages = rds_pages_in_vec(&iov[i]);
+ if (nr_pages == 0)
+ return -EINVAL;
+
+ tot_pages += nr_pages;
+
+ /*
+ * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+ * so tot_pages cannot overflow without first going negative.
+ */
+ if (tot_pages < 0)
+ return -EINVAL;
+ }
+
+ return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+ struct rds_iovec vec;
+ struct rds_iovec __user *local_vec;
+ int tot_pages = 0;
+ unsigned int nr_pages;
+ unsigned int i;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec)))
+ return -EFAULT;
+
+ nr_pages = rds_pages_in_vec(&vec);
+ if (nr_pages == 0)
+ return -EINVAL;
+
+ tot_pages += nr_pages;
+
+ /*
+ * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+ * so tot_pages cannot overflow without first going negative.
+ */
+ if (tot_pages < 0)
+ return -EINVAL;
+ }
+
+ return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_args *args;
+ struct rm_rdma_op *op = &rm->rdma;
+ int nr_pages;
+ unsigned int nr_bytes;
+ struct page **pages = NULL;
+ struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+ int iov_size;
+ unsigned int i, j;
+ int ret = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->rdma.op_active)
+ return -EINVAL;
+
+ args = CMSG_DATA(cmsg);
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out_ret;
+ }
+
+ if (args->nr_local > UIO_MAXIOV) {
+ ret = -EMSGSIZE;
+ goto out_ret;
+ }
+
+ /* Check whether to allocate the iovec area */
+ iov_size = args->nr_local * sizeof(struct rds_iovec);
+ if (args->nr_local > UIO_FASTIOV) {
+ iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+ if (!iovs) {
+ ret = -ENOMEM;
+ goto out_ret;
+ }
+ }
+
+ if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr_pages = rds_rdma_pages(iovs, args->nr_local);
+ if (nr_pages < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+ op->op_active = 1;
+ op->op_recverr = rs->rs_recverr;
+ WARN_ON(!nr_pages);
+ op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+ if (!op->op_sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (op->op_notify || op->op_recverr) {
+ /* We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->op_notifier) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ op->op_notifier->n_user_token = args->user_token;
+ op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+ }
+
+ /* The cookie contains the R_Key of the remote memory region, and
+ * optionally an offset into it. This is how we implement RDMA into
+ * unaligned memory.
+ * When setting up the RDMA, we need to add that offset to the
+ * destination address (which is really an offset into the MR)
+ * FIXME: We may want to move this into ib_rdma.c
+ */
+ op->op_rkey = rds_rdma_cookie_key(args->cookie);
+ op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+ nr_bytes = 0;
+
+ rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+ (unsigned long long)args->nr_local,
+ (unsigned long long)args->remote_vec.addr,
+ op->op_rkey);
+
+ for (i = 0; i < args->nr_local; i++) {
+ struct rds_iovec *iov = &iovs[i];
+ /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+ unsigned int nr = rds_pages_in_vec(iov);
+
+ rs->rs_user_addr = iov->addr;
+ rs->rs_user_bytes = iov->bytes;
+
+ /* If it's a WRITE operation, we want to pin the pages for reading.
+ * If it's a READ operation, we need to pin the pages for writing.
+ */
+ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+ if (ret < 0)
+ goto out;
+
+ rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+ nr_bytes, nr, iov->bytes, iov->addr);
+
+ nr_bytes += iov->bytes;
+
+ for (j = 0; j < nr; j++) {
+ unsigned int offset = iov->addr & ~PAGE_MASK;
+ struct scatterlist *sg;
+
+ sg = &op->op_sg[op->op_nents + j];
+ sg_set_page(sg, pages[j],
+ min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+ offset);
+
+ rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+ sg->offset, sg->length, iov->addr, iov->bytes);
+
+ iov->addr += sg->length;
+ iov->bytes -= sg->length;
+ }
+
+ op->op_nents += nr;
+ }
+
+ if (nr_bytes > args->remote_vec.bytes) {
+ rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+ nr_bytes,
+ (unsigned int) args->remote_vec.bytes);
+ ret = -EINVAL;
+ goto out;
+ }
+ op->op_bytes = nr_bytes;
+
+out:
+ if (iovs != iovstack)
+ sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
+ kfree(pages);
+out_ret:
+ if (ret)
+ rds_rdma_free_op(op);
+ else
+ rds_stats_inc(s_send_rdma);
+
+ return ret;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ unsigned long flags;
+ struct rds_mr *mr;
+ u32 r_key;
+ int err = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
+ rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+ /* We are reusing a previously mapped MR here. Most likely, the
+ * application has written to the buffer, so we need to explicitly
+ * flush those writes to RAM. Otherwise the HCA may not see them
+ * when doing a DMA from that buffer.
+ */
+ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (!mr)
+ err = -EINVAL; /* invalid r_key */
+ else
+ atomic_inc(&mr->r_refcount);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (mr) {
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ rm->rdma.op_rdma_mr = mr;
+ }
+ return err;
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
+ rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct page *page = NULL;
+ struct rds_atomic_args *args;
+ int ret = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+ || rm->atomic.op_active)
+ return -EINVAL;
+
+ args = CMSG_DATA(cmsg);
+
+ /* Nonmasked & masked cmsg ops converted to masked hw ops */
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_ATOMIC_FADD:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+ rm->atomic.op_m_fadd.add = args->fadd.add;
+ rm->atomic.op_m_fadd.nocarry_mask = 0;
+ break;
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+ rm->atomic.op_m_fadd.add = args->m_fadd.add;
+ rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+ break;
+ case RDS_CMSG_ATOMIC_CSWP:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+ rm->atomic.op_m_cswp.compare = args->cswp.compare;
+ rm->atomic.op_m_cswp.swap = args->cswp.swap;
+ rm->atomic.op_m_cswp.compare_mask = ~0;
+ rm->atomic.op_m_cswp.swap_mask = ~0;
+ break;
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+ rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+ rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+ rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+ rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+ break;
+ default:
+ BUG(); /* should never happen */
+ }
+
+ rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+ rm->atomic.op_active = 1;
+ rm->atomic.op_recverr = rs->rs_recverr;
+ rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+ if (!rm->atomic.op_sg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /* verify 8 byte-aligned */
+ if (args->local_addr & 0x7) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+ if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+ /* We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+ if (!rm->atomic.op_notifier) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ rm->atomic.op_notifier->n_user_token = args->user_token;
+ rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+ }
+
+ rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+ rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+ return ret;
+err:
+ if (page)
+ put_page(page);
+ kfree(rm->atomic.op_notifier);
+
+ return ret;
+}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000..6cd9d1dea
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2009 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <rdma/rdma_cm.h>
+
+#include "rdma_transport.h"
+
+static struct rdma_cm_id *rds_rdma_listen_id;
+
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+ [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+ RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+ RDS_CM_EVENT_STRING(ADDR_ERROR),
+ RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+ RDS_CM_EVENT_STRING(ROUTE_ERROR),
+ RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+ RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+ RDS_CM_EVENT_STRING(CONNECT_ERROR),
+ RDS_CM_EVENT_STRING(UNREACHABLE),
+ RDS_CM_EVENT_STRING(REJECTED),
+ RDS_CM_EVENT_STRING(ESTABLISHED),
+ RDS_CM_EVENT_STRING(DISCONNECTED),
+ RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+ RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+ RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+ RDS_CM_EVENT_STRING(ADDR_CHANGE),
+ RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+ return rds_str_array(rds_cm_event_strings,
+ ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ /* this can be null in the listening path */
+ struct rds_connection *conn = cm_id->context;
+ struct rds_transport *trans;
+ int ret = 0;
+
+ rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+ event->event, rds_cm_event_str(event->event));
+
+ if (cm_id->device->node_type == RDMA_NODE_RNIC)
+ trans = &rds_iw_transport;
+ else
+ trans = &rds_ib_transport;
+
+ /* Prevent shutdown from tearing down the connection
+ * while we're executing. */
+ if (conn) {
+ mutex_lock(&conn->c_cm_lock);
+
+ /* If the connection is being shut down, bail out
+ * right away. We return 0 so cm_id doesn't get
+ * destroyed prematurely */
+ if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+ /* Reject incoming connections while we're tearing
+ * down an existing one. */
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ ret = 1;
+ goto out;
+ }
+ }
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ ret = trans->cm_handle_connect(cm_id, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ /* XXX do we need to clean up if this fails? */
+ ret = rdma_resolve_route(cm_id,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ /* XXX worry about racing with listen acceptance */
+ ret = trans->cm_initiate_connect(cm_id);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ trans->cm_connect_complete(conn, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ if (conn)
+ rds_conn_drop(conn);
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ rdsdebug("DISCONNECT event - dropping connection "
+ "%pI4->%pI4\n", &conn->c_laddr,
+ &conn->c_faddr);
+ rds_conn_drop(conn);
+ break;
+
+ default:
+ /* things like device disconnect? */
+ printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+ event->event, rds_cm_event_str(event->event));
+ break;
+ }
+
+out:
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+
+ rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+ rds_cm_event_str(event->event), ret);
+
+ return ret;
+}
+
+static int rds_rdma_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ "rdma_create_id() returned %d\n", ret);
+ return ret;
+ }
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+ sin.sin_port = (__force u16)htons(RDS_PORT);
+
+ /*
+ * XXX I bet this binds the cm_id to a device. If we want to support
+ * fail-over we'll have to take this into consideration.
+ */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ if (ret) {
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ "rdma_bind_addr() returned %d\n", ret);
+ goto out;
+ }
+
+ ret = rdma_listen(cm_id, 128);
+ if (ret) {
+ printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ "rdma_listen() returned %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
+
+ rds_rdma_listen_id = cm_id;
+ cm_id = NULL;
+out:
+ if (cm_id)
+ rdma_destroy_id(cm_id);
+ return ret;
+}
+
+static void rds_rdma_listen_stop(void)
+{
+ if (rds_rdma_listen_id) {
+ rdsdebug("cm %p\n", rds_rdma_listen_id);
+ rdma_destroy_id(rds_rdma_listen_id);
+ rds_rdma_listen_id = NULL;
+ }
+}
+
+static int rds_rdma_init(void)
+{
+ int ret;
+
+ ret = rds_rdma_listen_init();
+ if (ret)
+ goto out;
+
+ ret = rds_iw_init();
+ if (ret)
+ goto err_iw_init;
+
+ ret = rds_ib_init();
+ if (ret)
+ goto err_ib_init;
+
+ goto out;
+
+err_ib_init:
+ rds_iw_exit();
+err_iw_init:
+ rds_rdma_listen_stop();
+out:
+ return ret;
+}
+module_init(rds_rdma_init);
+
+static void rds_rdma_exit(void)
+{
+ /* stop listening first to ensure no new connections are attempted */
+ rds_rdma_listen_stop();
+ rds_ib_exit();
+ rds_iw_exit();
+}
+module_exit(rds_rdma_exit);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000..faba4e382
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,24 @@
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+/* from iw.c */
+extern struct rds_transport rds_iw_transport;
+int rds_iw_init(void);
+void rds_iw_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000..0d41155a2
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,809 @@
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0 0x0300
+#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
+#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * # 18464-18768 Unassigned
+ * We should do better. We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ */
+#define RDS_PORT 18634
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+#ifdef DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* sigh, pr_debug() causes unused variable warnings */
+static inline __printf(1, 2)
+void rdsdebug(char *fmt, ...)
+{
+}
+#endif
+
+/* XXX is there one of these somewhere? */
+#define ceil(x, y) \
+ ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+#define RDS_FRAG_SHIFT 12
+#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
+
+#define RDS_CONG_MAP_BYTES (65536 / 8)
+#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
+#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
+
+struct rds_cong_map {
+ struct rb_node m_rb_node;
+ __be32 m_addr;
+ wait_queue_head_t m_waitq;
+ struct list_head m_conn_list;
+ unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+ RDS_CONN_DOWN = 0,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DISCONNECTING,
+ RDS_CONN_UP,
+ RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDS_LL_SEND_FULL 0
+#define RDS_RECONNECT_PENDING 1
+#define RDS_IN_XMIT 2
+
+struct rds_connection {
+ struct hlist_node c_hash_node;
+ __be32 c_laddr;
+ __be32 c_faddr;
+ unsigned int c_loopback:1;
+ struct rds_connection *c_passive;
+
+ struct rds_cong_map *c_lcong;
+ struct rds_cong_map *c_fcong;
+
+ struct rds_message *c_xmit_rm;
+ unsigned long c_xmit_sg;
+ unsigned int c_xmit_hdr_off;
+ unsigned int c_xmit_data_off;
+ unsigned int c_xmit_atomic_sent;
+ unsigned int c_xmit_rdma_sent;
+ unsigned int c_xmit_data_sent;
+
+ spinlock_t c_lock; /* protect msg queues */
+ u64 c_next_tx_seq;
+ struct list_head c_send_queue;
+ struct list_head c_retrans;
+
+ u64 c_next_rx_seq;
+
+ struct rds_transport *c_trans;
+ void *c_transport_data;
+
+ atomic_t c_state;
+ unsigned long c_send_gen;
+ unsigned long c_flags;
+ unsigned long c_reconnect_jiffies;
+ struct delayed_work c_send_w;
+ struct delayed_work c_recv_w;
+ struct delayed_work c_conn_w;
+ struct work_struct c_down_w;
+ struct mutex c_cm_lock; /* protect conn state & cm */
+ wait_queue_head_t c_waitq;
+
+ struct list_head c_map_item;
+ unsigned long c_map_queued;
+
+ unsigned int c_unacked_packets;
+ unsigned int c_unacked_bytes;
+
+ /* Protocol version */
+ unsigned int c_version;
+};
+
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_MAX_ADV_CREDIT 255
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE 16
+
+struct rds_header {
+ __be64 h_sequence;
+ __be64 h_ack;
+ __be32 h_len;
+ __be16 h_sport;
+ __be16 h_dport;
+ u8 h_flags;
+ u8 h_credit;
+ u8 h_padding[4];
+ __sum16 h_csum;
+
+ u8 h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE 0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION 1
+struct rds_ext_header_version {
+ __be32 h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA 2
+struct rds_ext_header_rdma {
+ __be32 h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST 3
+struct rds_ext_header_rdma_dest {
+ __be32 h_rdma_rkey;
+ __be32 h_rdma_offset;
+};
+
+#define __RDS_EXTHDR_MAX 16 /* for now */
+
+struct rds_incoming {
+ atomic_t i_refcount;
+ struct list_head i_item;
+ struct rds_connection *i_conn;
+ struct rds_header i_hdr;
+ unsigned long i_rx_jiffies;
+ __be32 i_saddr;
+
+ rds_rdma_cookie_t i_rdma_cookie;
+};
+
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP 0
+#define RDS_ATOMIC_TYPE_FADD 1
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock. m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path. It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue. m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting. As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly. That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate. They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK 1
+#define RDS_MSG_ON_CONN 2
+#define RDS_MSG_HAS_ACK_SEQ 3
+#define RDS_MSG_ACK_REQUIRED 4
+#define RDS_MSG_RETRANSMITTED 5
+#define RDS_MSG_MAPPED 6
+#define RDS_MSG_PAGEVEC 7
+
+struct rds_message {
+ atomic_t m_refcount;
+ struct list_head m_sock_item;
+ struct list_head m_conn_item;
+ struct rds_incoming m_inc;
+ u64 m_ack_seq;
+ __be32 m_daddr;
+ unsigned long m_flags;
+
+ /* Never access m_rs without holding m_rs_lock.
+ * Lock nesting is
+ * rm->m_rs_lock
+ * -> rs->rs_lock
+ */
+ spinlock_t m_rs_lock;
+ wait_queue_head_t m_flush_wait;
+
+ struct rds_sock *m_rs;
+
+ /* cookie to send to remote, in rds header */
+ rds_rdma_cookie_t m_rdma_cookie;
+
+ unsigned int m_used_sgs;
+ unsigned int m_total_sgs;
+
+ void *m_final_op;
+
+ struct {
+ struct rm_atomic_op {
+ int op_type;
+ union {
+ struct {
+ uint64_t compare;
+ uint64_t swap;
+ uint64_t compare_mask;
+ uint64_t swap_mask;
+ } op_m_cswp;
+ struct {
+ uint64_t add;
+ uint64_t nocarry_mask;
+ } op_m_fadd;
+ };
+
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1;
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1;
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier;
+
+ struct rds_mr *op_rdma_mr;
+ } atomic;
+ struct rm_rdma_op {
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_write:1;
+ unsigned int op_fence:1;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1;
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1;
+ unsigned int op_bytes;
+ unsigned int op_nents;
+ unsigned int op_count;
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier;
+
+ struct rds_mr *op_rdma_mr;
+ } rdma;
+ struct rm_data_op {
+ unsigned int op_active:1;
+ unsigned int op_nents;
+ unsigned int op_count;
+ struct scatterlist *op_sg;
+ } data;
+ };
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rds_notifier {
+ struct list_head n_list;
+ uint64_t n_user_token;
+ int n_status;
+};
+
+/**
+ * struct rds_transport - transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
+ * it returns the connection can not call rds_recv_incoming().
+ * This will only be called once after conn_connect returns
+ * non-zero success and will The caller serializes this with
+ * the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rds_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ */
+
+#define RDS_TRANS_IB 0
+#define RDS_TRANS_IWARP 1
+#define RDS_TRANS_TCP 2
+#define RDS_TRANS_COUNT 3
+
+struct rds_transport {
+ char t_name[TRANSNAMSIZ];
+ struct list_head t_item;
+ struct module *t_owner;
+ unsigned int t_prefer_loopback:1;
+ unsigned int t_type;
+
+ int (*laddr_check)(__be32 addr);
+ int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+ void (*conn_free)(void *data);
+ int (*conn_connect)(struct rds_connection *conn);
+ void (*conn_shutdown)(struct rds_connection *conn);
+ void (*xmit_prepare)(struct rds_connection *conn);
+ void (*xmit_complete)(struct rds_connection *conn);
+ int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+ int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+ int (*recv)(struct rds_connection *conn);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
+ void (*inc_free)(struct rds_incoming *inc);
+
+ int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ void (*cm_connect_complete)(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+ unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+ unsigned int avail);
+ void (*exit)(void);
+ void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+ struct rds_sock *rs, u32 *key_ret);
+ void (*sync_mr)(void *trans_private, int direction);
+ void (*free_mr)(void *trans_private, int invalidate);
+ void (*flush_mrs)(void);
+};
+
+struct rds_sock {
+ struct sock rs_sk;
+
+ u64 rs_user_addr;
+ u64 rs_user_bytes;
+
+ /*
+ * bound_addr used for both incoming and outgoing, no INADDR_ANY
+ * support.
+ */
+ struct hlist_node rs_bound_node;
+ __be32 rs_bound_addr;
+ __be32 rs_conn_addr;
+ __be16 rs_bound_port;
+ __be16 rs_conn_port;
+ struct rds_transport *rs_transport;
+
+ /*
+ * rds_sendmsg caches the conn it used the last time around.
+ * This helps avoid costly lookups.
+ */
+ struct rds_connection *rs_conn;
+
+ /* flag indicating we were congested or not */
+ int rs_congested;
+ /* seen congestion (ENOBUFS) when sending? */
+ int rs_seen_congestion;
+
+ /* rs_lock protects all these adjacent members before the newline */
+ spinlock_t rs_lock;
+ struct list_head rs_send_queue;
+ u32 rs_snd_bytes;
+ int rs_rcv_bytes;
+ struct list_head rs_notify_queue; /* currently used for failed RDMAs */
+
+ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+ * to decide whether the application should be woken up.
+ * If not set, we use rs_cong_track to find out whether a cong map
+ * update arrived.
+ */
+ uint64_t rs_cong_mask;
+ uint64_t rs_cong_notify;
+ struct list_head rs_cong_list;
+ unsigned long rs_cong_track;
+
+ /*
+ * rs_recv_lock protects the receive queue, and is
+ * used to serialize with rds_release.
+ */
+ rwlock_t rs_recv_lock;
+ struct list_head rs_recv_queue;
+
+ /* just for stats reporting */
+ struct list_head rs_item;
+
+ /* these have their own lock */
+ spinlock_t rs_rdma_lock;
+ struct rb_root rs_rdma_keys;
+
+ /* Socket options - in case there will be more */
+ unsigned char rs_recverr,
+ rs_cong_monitor;
+};
+
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+ return container_of(sk, struct rds_sock, rs_sk);
+}
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+ return &rs->rs_sk;
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead. We don't account for overhead, we just apply
+ * the number of payload bytes to the specified value.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_sndbuf / 2;
+}
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
+}
+
+struct rds_statistics {
+ uint64_t s_conn_reset;
+ uint64_t s_recv_drop_bad_checksum;
+ uint64_t s_recv_drop_old_seq;
+ uint64_t s_recv_drop_no_sock;
+ uint64_t s_recv_drop_dead_sock;
+ uint64_t s_recv_deliver_raced;
+ uint64_t s_recv_delivered;
+ uint64_t s_recv_queued;
+ uint64_t s_recv_immediate_retry;
+ uint64_t s_recv_delayed_retry;
+ uint64_t s_recv_ack_required;
+ uint64_t s_recv_rdma_bytes;
+ uint64_t s_recv_ping;
+ uint64_t s_send_queue_empty;
+ uint64_t s_send_queue_full;
+ uint64_t s_send_lock_contention;
+ uint64_t s_send_lock_queue_raced;
+ uint64_t s_send_immediate_retry;
+ uint64_t s_send_delayed_retry;
+ uint64_t s_send_drop_acked;
+ uint64_t s_send_ack_required;
+ uint64_t s_send_queued;
+ uint64_t s_send_rdma;
+ uint64_t s_send_rdma_bytes;
+ uint64_t s_send_pong;
+ uint64_t s_page_remainder_hit;
+ uint64_t s_page_remainder_miss;
+ uint64_t s_copy_to_user;
+ uint64_t s_copy_from_user;
+ uint64_t s_cong_update_queued;
+ uint64_t s_cong_update_received;
+ uint64_t s_cong_send_error;
+ uint64_t s_cong_send_blocked;
+};
+
+/* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+ wait_queue_head_t *waitq = sk_sleep(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD) && waitq)
+ wake_up(waitq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* conn.c */
+int rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len);
+__printf(2, 3)
+void __rds_conn_error(struct rds_connection *conn, const char *, ...);
+#define rds_conn_error(conn, fmt...) \
+ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+
+static inline int
+rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+ return atomic_cmpxchg(&conn->c_state, old, new) == old;
+}
+
+static inline int
+rds_conn_state(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state);
+}
+
+static inline int
+rds_conn_up(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_UP;
+}
+
+static inline int
+rds_conn_connecting(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+ hdr->h_csum = 0;
+ hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
+}
+
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
+ return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp);
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user);
+#define rds_page_copy_to_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 1)
+#define rds_page_copy_from_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 0)
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp);
+int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip);
+
+/* send.c */
+int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
+void rds_send_reset(struct rds_connection *conn);
+int rds_send_xmit(struct rds_connection *conn);
+struct sockaddr_in;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
+struct rds_message *rds_send_get_message(struct rds_connection *,
+ struct rm_rdma_op *);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+
+void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
+
+/* stats.c */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+#define rds_stats_inc_which(which, member) do { \
+ per_cpu(which, get_cpu()).member++; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+#define rds_stats_add_which(which, member, count) do { \
+ per_cpu(which, get_cpu()).member += count; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
+int rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, const char *const *names,
+ size_t nr);
+
+/* sysctl.c */
+int rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int rds_sysctl_max_unacked_packets;
+extern unsigned int rds_sysctl_max_unacked_bytes;
+extern unsigned int rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int rds_sysctl_trace_level;
+
+/* threads.c */
+int rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_complete(struct rds_connection *conn);
+
+/* transport.c */
+int rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+int rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000..a00462b0d
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr)
+{
+ atomic_set(&inc->i_refcount, 1);
+ INIT_LIST_HEAD(&inc->i_item);
+ inc->i_conn = conn;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+}
+EXPORT_SYMBOL_GPL(rds_inc_init);
+
+static void rds_inc_addref(struct rds_incoming *inc)
+{
+ rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ atomic_inc(&inc->i_refcount);
+}
+
+void rds_inc_put(struct rds_incoming *inc)
+{
+ rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ if (atomic_dec_and_test(&inc->i_refcount)) {
+ BUG_ON(!list_empty(&inc->i_item));
+
+ inc->i_conn->c_trans->inc_free(inc);
+ }
+}
+EXPORT_SYMBOL_GPL(rds_inc_put);
+
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+ struct rds_cong_map *map,
+ int delta, __be16 port)
+{
+ int now_congested;
+
+ if (delta == 0)
+ return;
+
+ rs->rs_rcv_bytes += delta;
+ now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+ rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+ "now_cong %d delta %d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+ rds_sk_rcvbuf(rs), now_congested, delta);
+
+ /* wasn't -> am congested */
+ if (!rs->rs_congested && now_congested) {
+ rs->rs_congested = 1;
+ rds_cong_set_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+ /* was -> aren't congested */
+ /* Require more free space before reporting uncongested to prevent
+ bouncing cong/uncong state too often */
+ else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
+ rs->rs_congested = 0;
+ rds_cong_clear_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+
+ /* do nothing if no change in cong state */
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+ struct rds_header *hdr = &inc->i_hdr;
+ unsigned int pos = 0, type, len;
+ union {
+ struct rds_ext_header_version version;
+ struct rds_ext_header_rdma rdma;
+ struct rds_ext_header_rdma_dest rdma_dest;
+ } buffer;
+
+ while (1) {
+ len = sizeof(buffer);
+ type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+ if (type == RDS_EXTHDR_NONE)
+ break;
+ /* Process extension header here */
+ switch (type) {
+ case RDS_EXTHDR_RDMA:
+ rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+ break;
+
+ case RDS_EXTHDR_RDMA_DEST:
+ /* We ignore the size for now. We could stash it
+ * somewhere and use it for error checking. */
+ inc->i_rdma_cookie = rds_rdma_make_cookie(
+ be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+ be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+
+ break;
+ }
+ }
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time. This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival. It does mean
+ * that small messages will wait behind large ones. Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn. This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp)
+{
+ struct rds_sock *rs = NULL;
+ struct sock *sk;
+ unsigned long flags;
+
+ inc->i_conn = conn;
+ inc->i_rx_jiffies = jiffies;
+
+ rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+ "flags 0x%x rx_jiffies %lu\n", conn,
+ (unsigned long long)conn->c_next_rx_seq,
+ inc,
+ (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+ be32_to_cpu(inc->i_hdr.h_len),
+ be16_to_cpu(inc->i_hdr.h_sport),
+ be16_to_cpu(inc->i_hdr.h_dport),
+ inc->i_hdr.h_flags,
+ inc->i_rx_jiffies);
+
+ /*
+ * Sequence numbers should only increase. Messages get their
+ * sequence number as they're queued in a sending conn. They
+ * can be dropped, though, if the sending socket is closed before
+ * they hit the wire. So sequence numbers can skip forward
+ * under normal operation. They can also drop back in the conn
+ * failover case as previously sent messages are resent down the
+ * new instance of a conn. We drop those, otherwise we have
+ * to assume that the next valid seq does not come after a
+ * hole in the fragment stream.
+ *
+ * The headers don't give us a way to realize if fragments of
+ * a message have been dropped. We assume that frags that arrive
+ * to a flow are part of the current message on the flow that is
+ * being reassembled. This means that senders can't drop messages
+ * from the sending conn until all their frags are sent.
+ *
+ * XXX we could spend more on the wire to get more robust failure
+ * detection, arguably worth it to avoid data corruption.
+ */
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+ (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ rds_stats_inc(s_recv_drop_old_seq);
+ goto out;
+ }
+ conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+ if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ rds_stats_inc(s_recv_ping);
+ rds_send_pong(conn, inc->i_hdr.h_sport);
+ goto out;
+ }
+
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+ if (!rs) {
+ rds_stats_inc(s_recv_drop_no_sock);
+ goto out;
+ }
+
+ /* Process extension headers */
+ rds_recv_incoming_exthdrs(inc, rs);
+
+ /* We can be racing with rds_release() which marks the socket dead. */
+ sk = rds_rs_to_sk(rs);
+
+ /* serialize with rds_release -> sock_orphan */
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+ rds_stats_inc(s_recv_queued);
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ rds_inc_addref(inc);
+ list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+ __rds_wake_sk_sleep(sk);
+ } else {
+ rds_stats_inc(s_recv_drop_dead_sock);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+ if (rs)
+ rds_sock_put(rs);
+}
+EXPORT_SYMBOL_GPL(rds_recv_incoming);
+
+/*
+ * be very careful here. This is being called as the condition in
+ * wait_event_*() needs to cope with being called many times.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+ unsigned long flags;
+
+ if (!*inc) {
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&rs->rs_recv_queue)) {
+ *inc = list_entry(rs->rs_recv_queue.next,
+ struct rds_incoming,
+ i_item);
+ rds_inc_addref(*inc);
+ }
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+ }
+
+ return *inc != NULL;
+}
+
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+ int drop)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ int ret = 0;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&inc->i_item)) {
+ ret = 1;
+ if (drop) {
+ /* XXX make sure this i_conn is reliable */
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+ return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ struct rds_notifier *notifier;
+ struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+ unsigned int count = 0, max_messages = ~0U;
+ unsigned long flags;
+ LIST_HEAD(copy);
+ int err = 0;
+
+
+ /* put_cmsg copies to user space and thus may sleep. We can't do this
+ * with rs_lock held, so first grab as many notifications as we can stuff
+ * in the user provided cmsg buffer. We don't try to copy more, to avoid
+ * losing notifications - except when the buffer is so small that it wouldn't
+ * even hold a single notification. Then we give him as much of this single
+ * msg as we can squeeze in, and set MSG_CTRUNC.
+ */
+ if (msghdr) {
+ max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+ if (!max_messages)
+ max_messages = 1;
+ }
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+ notifier = list_entry(rs->rs_notify_queue.next,
+ struct rds_notifier, n_list);
+ list_move(&notifier->n_list, &copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (!count)
+ return 0;
+
+ while (!list_empty(&copy)) {
+ notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+ if (msghdr) {
+ cmsg.user_token = notifier->n_user_token;
+ cmsg.status = notifier->n_status;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+ sizeof(cmsg), &cmsg);
+ if (err)
+ break;
+ }
+
+ list_del_init(&notifier->n_list);
+ kfree(notifier);
+ }
+
+ /* If we bailed out because of an error in put_cmsg,
+ * we may be left with one or more notifications that we
+ * didn't process. Return them to the head of the list. */
+ if (!list_empty(&copy)) {
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ list_splice(&copy, &rs->rs_notify_queue);
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+ }
+
+ return err;
+}
+
+/*
+ * Queue a congestion notification
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ uint64_t notify = rs->rs_cong_notify;
+ unsigned long flags;
+ int err;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+ sizeof(notify), &notify);
+ if (err)
+ return err;
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_notify &= ~notify;
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ return 0;
+}
+
+/*
+ * Receive any control messages.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+ int ret = 0;
+
+ if (inc->i_rdma_cookie) {
+ ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+ sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int msg_flags)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ long timeo;
+ int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct rds_incoming *inc = NULL;
+
+ /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+ if (msg_flags & MSG_OOB)
+ goto out;
+
+ while (1) {
+ struct iov_iter save;
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ break;
+ }
+
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ break;
+ }
+
+ if (!rds_next_incoming(rs, &inc)) {
+ if (nonblock) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ (!list_empty(&rs->rs_notify_queue) ||
+ rs->rs_cong_notify ||
+ rds_next_incoming(rs, &inc)), timeo);
+ rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+ timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+ &inc->i_conn->c_faddr,
+ ntohs(inc->i_hdr.h_sport));
+ save = msg->msg_iter;
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
+ if (ret < 0)
+ break;
+
+ /*
+ * if the message we just copied isn't at the head of the
+ * recv queue then someone else raced us to return it, try
+ * to get the next message.
+ */
+ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+ rds_inc_put(inc);
+ inc = NULL;
+ rds_stats_inc(s_recv_deliver_raced);
+ msg->msg_iter = save;
+ continue;
+ }
+
+ if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+ if (msg_flags & MSG_TRUNC)
+ ret = be32_to_cpu(inc->i_hdr.h_len);
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (rds_cmsg_recv(inc, msg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_stats_inc(s_recv_delivered);
+
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr = inc->i_saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ msg->msg_namelen = sizeof(*sin);
+ }
+ break;
+ }
+
+ if (inc)
+ rds_inc_put(inc);
+
+out:
+ return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg. The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ struct rds_incoming *inc, *tmp;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip)
+{
+ struct rds_info_message minfo;
+
+ minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+ minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo.laddr = daddr;
+ minfo.faddr = saddr;
+ minfo.lport = inc->i_hdr.h_dport;
+ minfo.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo.laddr = saddr;
+ minfo.faddr = daddr;
+ minfo.lport = inc->i_hdr.h_sport;
+ minfo.fport = inc->i_hdr.h_dport;
+ }
+
+ rds_info_copy(iter, &minfo, sizeof(minfo));
+}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000..e9430f537
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1165 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/ratelimit.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the softlock watchdog
+ * will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
+/*
+ * Reset the send state. Callers must ensure that this doesn't race with
+ * rds_send_xmit().
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ if (conn->c_xmit_rm) {
+ rm = conn->c_xmit_rm;
+ conn->c_xmit_rm = NULL;
+ /* Tell the user the RDMA op is no longer mapped by the
+ * transport. This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory */
+ rds_message_unmapped(rm);
+ rds_message_put(rm);
+ }
+
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_atomic_sent = 0;
+ conn->c_xmit_rdma_sent = 0;
+ conn->c_xmit_data_sent = 0;
+
+ conn->c_map_queued = 0;
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+ }
+ list_splice_init(&conn->c_retrans, &conn->c_send_queue);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+ return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+ clear_bit(RDS_IN_XMIT, &conn->c_flags);
+ smp_mb__after_atomic();
+ /*
+ * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+ * hot path and finding waiters is very rare. We don't want to walk
+ * the system-wide hashed waitqueue buckets in the fast path only to
+ * almost never find waiters.
+ */
+ if (waitqueue_active(&conn->c_waitq))
+ wake_up_all(&conn->c_waitq);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int tmp;
+ struct scatterlist *sg;
+ int ret = 0;
+ LIST_HEAD(to_be_dropped);
+ int batch_count;
+ unsigned long send_gen = 0;
+
+restart:
+ batch_count = 0;
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ */
+ if (!acquire_in_xmit(conn)) {
+ rds_stats_inc(s_send_lock_contention);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * we record the send generation after doing the xmit acquire.
+ * if someone else manages to jump in and do some work, we'll use
+ * this to avoid a goto restart farther down.
+ *
+ * The acquire_in_xmit() check above ensures that only one
+ * caller can increment c_send_gen at any time.
+ */
+ conn->c_send_gen++;
+ send_gen = conn->c_send_gen;
+
+ /*
+ * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+ * we do the opposite to avoid races.
+ */
+ if (!rds_conn_up(conn)) {
+ release_in_xmit(conn);
+ ret = 0;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_prepare)
+ conn->c_trans->xmit_prepare(conn);
+
+ /*
+ * spin trying to push headers and data down the connection until
+ * the connection doesn't make forward progress.
+ */
+ while (1) {
+
+ rm = conn->c_xmit_rm;
+
+ /*
+ * If between sending messages, we can send a pending congestion
+ * map update.
+ */
+ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+ rm = rds_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+ rm->data.op_active = 1;
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * If not already working on one, grab the next message.
+ *
+ * c_xmit_rm holds a ref while we're sending this message down
+ * the connction. We can use this ref while holding the
+ * send_sem.. rds_send_reset() is serialized with it.
+ */
+ if (!rm) {
+ unsigned int len;
+
+ batch_count++;
+
+ /* we want to process as big a batch as we can, but
+ * we also want to avoid softlockups. If we've been
+ * through a lot of messages, lets back off and see
+ * if anyone else jumps in
+ */
+ if (batch_count >= 1024)
+ goto over_batch;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ if (!list_empty(&conn->c_send_queue)) {
+ rm = list_entry(conn->c_send_queue.next,
+ struct rds_message,
+ m_conn_item);
+ rds_message_addref(rm);
+
+ /*
+ * Move the message from the send queue to the retransmit
+ * list right away.
+ */
+ list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+ }
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ if (!rm)
+ break;
+
+ /* Unfortunately, the way Infiniband deals with
+ * RDMA to a bad MR key is by moving the entire
+ * queue pair to error state. We cold possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with RDMA ops.
+ */
+ if (rm->rdma.op_active &&
+ test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ list_move(&rm->m_conn_item, &to_be_dropped);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (conn->c_unacked_packets == 0 ||
+ conn->c_unacked_bytes < len) {
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ rds_stats_inc(s_send_ack_required);
+ } else {
+ conn->c_unacked_bytes -= len;
+ conn->c_unacked_packets--;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /* The transport either sends the whole rdma or none of it */
+ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+ rm->m_final_op = &rm->rdma;
+ ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+ if (ret)
+ break;
+ conn->c_xmit_rdma_sent = 1;
+
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ }
+
+ if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+ rm->m_final_op = &rm->atomic;
+ ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+ if (ret)
+ break;
+ conn->c_xmit_atomic_sent = 1;
+
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ }
+
+ /*
+ * A number of cases require an RDS header to be sent
+ * even if there is no data.
+ * We permit 0-byte sends; rds-ping depends on this.
+ * However, if there are exclusively attached silent ops,
+ * we skip the hdr/data send, to enable silent operation.
+ */
+ if (rm->data.op_nents == 0) {
+ int ops_present;
+ int all_ops_are_silent = 1;
+
+ ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+ if (rm->atomic.op_active && !rm->atomic.op_silent)
+ all_ops_are_silent = 0;
+ if (rm->rdma.op_active && !rm->rdma.op_silent)
+ all_ops_are_silent = 0;
+
+ if (ops_present && all_ops_are_silent
+ && !rm->m_rdma_cookie)
+ rm->data.op_active = 0;
+ }
+
+ if (rm->data.op_active && !conn->c_xmit_data_sent) {
+ rm->m_final_op = &rm->data;
+ ret = conn->c_trans->xmit(conn, rm,
+ conn->c_xmit_hdr_off,
+ conn->c_xmit_sg,
+ conn->c_xmit_data_off);
+ if (ret <= 0)
+ break;
+
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+ tmp = min_t(int, ret,
+ sizeof(struct rds_header) -
+ conn->c_xmit_hdr_off);
+ conn->c_xmit_hdr_off += tmp;
+ ret -= tmp;
+ }
+
+ sg = &rm->data.op_sg[conn->c_xmit_sg];
+ while (ret) {
+ tmp = min_t(int, ret, sg->length -
+ conn->c_xmit_data_off);
+ conn->c_xmit_data_off += tmp;
+ ret -= tmp;
+ if (conn->c_xmit_data_off == sg->length) {
+ conn->c_xmit_data_off = 0;
+ sg++;
+ conn->c_xmit_sg++;
+ BUG_ON(ret != 0 &&
+ conn->c_xmit_sg == rm->data.op_nents);
+ }
+ }
+
+ if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ (conn->c_xmit_sg == rm->data.op_nents))
+ conn->c_xmit_data_sent = 1;
+ }
+
+ /*
+ * A rm will only take multiple times through this loop
+ * if there is a data op. Thus, if the data is sent (or there was
+ * none), then we're done with the rm.
+ */
+ if (!rm->data.op_active || conn->c_xmit_data_sent) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+ conn->c_xmit_atomic_sent = 0;
+ conn->c_xmit_data_sent = 0;
+
+ rds_message_put(rm);
+ }
+ }
+
+over_batch:
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
+ release_in_xmit(conn);
+
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped)) {
+ /* irqs on here, so we can put(), unlike above */
+ list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+ rds_message_put(rm);
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+ }
+
+ /*
+ * Other senders can queue a message after we last test the send queue
+ * but before we clear RDS_IN_XMIT. In that case they'd back off and
+ * not try and send their newly queued message. We need to check the
+ * send queue after having cleared RDS_IN_XMIT so that their message
+ * doesn't get stuck on the send queue.
+ *
+ * If the transport cannot continue (i.e ret != 0), then it must
+ * call us when more room is available, such as from the tx
+ * completion handler.
+ *
+ * We have an extra generation check here so that if someone manages
+ * to jump in after our release_in_xmit, we'll see that they have done
+ * some work and we will skip our goto
+ */
+ if (ret == 0) {
+ smp_mb();
+ if (!list_empty(&conn->c_send_queue) &&
+ send_gen == conn->c_send_gen) {
+ rds_stats_inc(s_send_lock_queue_raced);
+ goto restart;
+ }
+ }
+out:
+ return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+ u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ assert_spin_locked(&rs->rs_lock);
+
+ BUG_ON(rs->rs_snd_bytes < len);
+ rs->rs_snd_bytes -= len;
+
+ if (rs->rs_snd_bytes == 0)
+ rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+ is_acked_func is_acked)
+{
+ if (is_acked)
+ return is_acked(rm, ack);
+ return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rm_rdma_op *ro;
+ struct rds_notifier *notifier;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ ro = &rm->rdma;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+ ro->op_active && ro->op_notify && ro->op_notifier) {
+ notifier = ro->op_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs));
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ro->op_notifier = NULL;
+ }
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+
+/*
+ * Just like above, except looks at atomic op
+ */
+void rds_atomic_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rm_atomic_op *ao;
+ struct rds_notifier *notifier;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ ao = &rm->atomic;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+ && ao->op_active && ao->op_notify && ao->op_notifier) {
+ notifier = ao->op_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs));
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ao->op_notifier = NULL;
+ }
+
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+ struct rm_rdma_op *ro;
+ struct rm_atomic_op *ao;
+
+ ro = &rm->rdma;
+ if (ro->op_active && ro->op_notify && ro->op_notifier) {
+ ro->op_notifier->n_status = status;
+ list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+ ro->op_notifier = NULL;
+ }
+
+ ao = &rm->atomic;
+ if (ao->op_active && ao->op_notify && ao->op_notifier) {
+ ao->op_notifier->n_status = status;
+ list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+ ao->op_notifier = NULL;
+ }
+
+ /* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+ struct rm_rdma_op *op)
+{
+ struct rds_message *rm, *tmp, *found = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (&rm->rdma == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ goto out;
+ }
+ }
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (&rm->rdma == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ break;
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ return found;
+}
+EXPORT_SYMBOL_GPL(rds_send_get_message);
+
+/*
+ * This removes messages from the socket's list if they're on it. The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks. The messages must have a reference held for their
+ * position on the list. This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ */
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+ unsigned long flags;
+ struct rds_sock *rs = NULL;
+ struct rds_message *rm;
+
+ while (!list_empty(messages)) {
+ int was_on_sock = 0;
+
+ rm = list_entry(messages->next, struct rds_message,
+ m_conn_item);
+ list_del_init(&rm->m_conn_item);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the sock. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ *
+ * The message spinlock makes sure nobody clears rm->m_rs
+ * while we're messing with it. It does not prevent the
+ * message from being removed from the socket, though.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+ if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+ goto unlock_and_drop;
+
+ if (rs != rm->m_rs) {
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ rs = rm->m_rs;
+ if (rs)
+ sock_hold(rds_rs_to_sk(rs));
+ }
+ if (!rs)
+ goto unlock_and_drop;
+ spin_lock(&rs->rs_lock);
+
+ if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+ struct rm_rdma_op *ro = &rm->rdma;
+ struct rds_notifier *notifier;
+
+ list_del_init(&rm->m_sock_item);
+ rds_send_sndbuf_remove(rs, rm);
+
+ if (ro->op_active && ro->op_notifier &&
+ (ro->op_notify || (ro->op_recverr && status))) {
+ notifier = ro->op_notifier;
+ list_add_tail(&notifier->n_list,
+ &rs->rs_notify_queue);
+ if (!notifier->n_status)
+ notifier->n_status = status;
+ rm->rdma.op_notifier = NULL;
+ }
+ was_on_sock = 1;
+ rm->m_rs = NULL;
+ }
+ spin_unlock(&rs->rs_lock);
+
+unlock_and_drop:
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ rds_message_put(rm);
+ if (was_on_sock)
+ rds_message_put(rm);
+ }
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number. Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (!rds_send_is_acked(rm, ack, is_acked))
+ break;
+
+ list_move(&rm->m_conn_item, &list);
+ clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ }
+
+ /* order flag updates with spin locks */
+ if (!list_empty(&list))
+ smp_mb__after_atomic();
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ /* now remove the messages from the sock list as needed */
+ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
+EXPORT_SYMBOL_GPL(rds_send_drop_acked);
+
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+{
+ struct rds_message *rm, *tmp;
+ struct rds_connection *conn;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ /* get all the messages we're dropping under the rs lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+ if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+ dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ continue;
+
+ list_move(&rm->m_sock_item, &list);
+ rds_send_sndbuf_remove(rs, rm);
+ clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ }
+
+ /* order flag updates with the rs lock */
+ smp_mb__after_atomic();
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (list_empty(&list))
+ return;
+
+ /* Remove the messages from the conn */
+ list_for_each_entry(rm, &list, m_sock_item) {
+
+ conn = rm->m_inc.i_conn;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+ /*
+ * Maybe someone else beat us to removing rm from the conn.
+ * If we race with their flag update we'll get the lock and
+ * then really see that the flag has been cleared.
+ */
+ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ continue;
+ }
+ list_del_init(&rm->m_conn_item);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ /*
+ * Couldn't grab m_rs_lock in top loop (lock ordering),
+ * but we can now.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
+
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ rds_message_put(rm);
+ }
+
+ rds_wake_sk_sleep(rs);
+
+ while (!list_empty(&list)) {
+ rm = list_entry(list.next, struct rds_message, m_sock_item);
+ list_del_init(&rm->m_sock_item);
+
+ rds_message_wait(rm);
+ rds_message_put(rm);
+ }
+}
+
+/*
+ * we only want this to fire once so we use the callers 'queued'. It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_message *rm, __be16 sport,
+ __be16 dport, int *queued)
+{
+ unsigned long flags;
+ u32 len;
+
+ if (*queued)
+ goto out;
+
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ /* this is the only place which holds both the socket's rs_lock
+ * and the connection's c_lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ /*
+ * If there is a little space in sndbuf, we don't queue anything,
+ * and userspace gets -EAGAIN. But poll() indicates there's send
+ * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+ * freed up by incoming acks. So we check the *old* value of
+ * rs_snd_bytes here to allow the last msg to exceed the buffer,
+ * and poll() now knows no more data can be sent.
+ */
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+ rs->rs_snd_bytes += len;
+
+ /* let recv side know we are close to send space exhaustion.
+ * This is probably not the optimal way to do it, as this
+ * means we set the flag on *all* messages as soon as our
+ * throughput hits a certain threshold.
+ */
+ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+ set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_rs = rs;
+
+ /* The code ordering is a little weird, but we're
+ trying to minimize the time we hold c_lock */
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+ rm->m_inc.i_conn = conn;
+ rds_message_addref(rm);
+
+ spin_lock(&conn->c_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ spin_unlock(&conn->c_lock);
+
+ rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+ rm, len, rs, rs->rs_snd_bytes,
+ (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+ *queued = 1;
+ }
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+ return *queued;
+}
+
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+ struct cmsghdr *cmsg;
+ int size = 0;
+ int cmsg_groups = 0;
+ int retval;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ cmsg_groups |= 1;
+ retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+ if (retval < 0)
+ return retval;
+ size += retval;
+
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ case RDS_CMSG_RDMA_MAP:
+ cmsg_groups |= 2;
+ /* these are valid but do no add any size */
+ break;
+
+ case RDS_CMSG_ATOMIC_CSWP:
+ case RDS_CMSG_ATOMIC_FADD:
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ cmsg_groups |= 1;
+ size += sizeof(struct scatterlist);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ }
+
+ size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+ /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+ if (cmsg_groups == 3)
+ return -EINVAL;
+
+ return size;
+}
+
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+ struct msghdr *msg, int *allocated_mr)
+{
+ struct cmsghdr *cmsg;
+ int ret = 0;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ /* As a side effect, RDMA_DEST and RDMA_MAP will set
+ * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+ */
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_MAP:
+ ret = rds_cmsg_rdma_map(rs, rm, cmsg);
+ if (!ret)
+ *allocated_mr = 1;
+ break;
+ case RDS_CMSG_ATOMIC_CSWP:
+ case RDS_CMSG_ATOMIC_FADD:
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ ret = rds_cmsg_atomic(rs, rm, cmsg);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ __be32 daddr;
+ __be16 dport;
+ struct rds_message *rm = NULL;
+ struct rds_connection *conn;
+ int ret = 0;
+ int queued = 0, allocated_mr = 0;
+ int nonblock = msg->msg_flags & MSG_DONTWAIT;
+ long timeo = sock_sndtimeo(sk, nonblock);
+
+ /* Mirror Linux UDP mirror of BSD error message compatibility */
+ /* XXX: Perhaps MSG_MORE someday */
+ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (msg->msg_namelen) {
+ /* XXX fail non-unicast destination IPs? */
+ if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+ ret = -EINVAL;
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ } else {
+ /* We only care about consistency with ->connect() */
+ lock_sock(sk);
+ daddr = rs->rs_conn_addr;
+ dport = rs->rs_conn_port;
+ release_sock(sk);
+ }
+
+ /* racing with another thread binding seems ok here */
+ if (daddr == 0 || rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ /* size of rm including all sgs */
+ ret = rds_rm_size(msg, payload_len);
+ if (ret < 0)
+ goto out;
+
+ rm = rds_message_alloc(ret, GFP_KERNEL);
+ if (!rm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Attach data to the rm */
+ if (payload_len) {
+ rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+ if (!rm->data.op_sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = rds_message_copy_from_user(rm, &msg->msg_iter);
+ if (ret)
+ goto out;
+ }
+ rm->data.op_active = 1;
+
+ rm->m_daddr = daddr;
+
+ /* rds_conn_create has a spinlock that runs with IRQ off.
+ * Caching the conn in the socket helps a lot. */
+ if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+ conn = rs->rs_conn;
+ else {
+ conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ rs->rs_transport,
+ sock->sk->sk_allocation);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+ rs->rs_conn = conn;
+ }
+
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
+ if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
+ printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ &rm->rdma, conn->c_trans->xmit_rdma);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+ printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+ &rm->atomic, conn->c_trans->xmit_atomic);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rds_conn_connect_if_down(conn);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+ if (ret) {
+ rs->rs_seen_congestion = 1;
+ goto out;
+ }
+
+ while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ dport, &queued)) {
+ rds_stats_inc(s_send_queue_full);
+ /* XXX make sure this is reasonable */
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ if (nonblock) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ rds_send_queue_rm(rs, conn, rm,
+ rs->rs_bound_port,
+ dport,
+ &queued),
+ timeo);
+ rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
+ /*
+ * By now we've committed to the send. We reuse rds_send_worker()
+ * to retry sends in the rds thread if the transport asks us to.
+ */
+ rds_stats_inc(s_send_queued);
+
+ if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ rds_send_xmit(conn);
+
+ rds_message_put(rm);
+ return payload_len;
+
+out:
+ /* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+ * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+ * or in any other way, we need to destroy the MR again */
+ if (allocated_mr)
+ rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ int ret = 0;
+
+ rm = rds_message_alloc(0, GFP_ATOMIC);
+ if (!rm) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = conn->c_faddr;
+ rm->data.op_active = 1;
+
+ rds_conn_connect_if_down(conn);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_inc.i_conn = conn;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+ conn->c_next_tx_seq);
+ conn->c_next_tx_seq++;
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ rds_stats_inc(s_send_queued);
+ rds_stats_inc(s_send_pong);
+
+ if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ rds_message_put(rm);
+ return 0;
+
+out:
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000..73be187d3
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static const char *const rds_stat_names[] = {
+ "conn_reset",
+ "recv_drop_bad_checksum",
+ "recv_drop_old_seq",
+ "recv_drop_no_sock",
+ "recv_drop_dead_sock",
+ "recv_deliver_raced",
+ "recv_delivered",
+ "recv_queued",
+ "recv_immediate_retry",
+ "recv_delayed_retry",
+ "recv_ack_required",
+ "recv_rdma_bytes",
+ "recv_ping",
+ "send_queue_empty",
+ "send_queue_full",
+ "send_lock_contention",
+ "send_lock_queue_raced",
+ "send_immediate_retry",
+ "send_delayed_retry",
+ "send_drop_acked",
+ "send_ack_required",
+ "send_queued",
+ "send_rdma",
+ "send_rdma_bytes",
+ "send_pong",
+ "page_remainder_hit",
+ "page_remainder_miss",
+ "copy_to_user",
+ "copy_from_user",
+ "cong_update_queued",
+ "cong_update_received",
+ "cong_send_error",
+ "cong_send_blocked",
+};
+
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, const char *const *names, size_t nr)
+{
+ struct rds_info_counter ctr;
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+ strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+ ctr.name[sizeof(ctr.name) - 1] = '\0';
+ ctr.value = values[i];
+
+ rds_info_copy(iter, &ctr, sizeof(ctr));
+ }
+}
+EXPORT_SYMBOL_GPL(rds_stats_info_copy);
+
+/*
+ * This gives global counters across all the transports. The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting. Some are pretty implementation dependent
+ * and may change over time. That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte granular
+ * length in userspace. It converts it to number of stat entries that the
+ * rest of the functions operate in.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+ unsigned int avail;
+
+ avail = len / sizeof(struct rds_info_counter);
+
+ if (avail < ARRAY_SIZE(rds_stat_names)) {
+ avail = 0;
+ goto trans;
+ }
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
+ ARRAY_SIZE(rds_stat_names));
+ avail -= ARRAY_SIZE(rds_stat_names);
+
+trans:
+ lens->each = sizeof(struct rds_info_counter);
+ lens->nr = rds_trans_stats_info_copy(iter, avail) +
+ ARRAY_SIZE(rds_stat_names);
+}
+
+void rds_stats_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+int rds_stats_init(void)
+{
+ rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+ return 0;
+}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000..c173f69e1
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int rds_sysctl_max_unacked_packets = 8;
+unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rds_sysctl_ping_enable = 1;
+
+static struct ctl_table rds_sysctl_rds_table[] = {
+ {
+ .procname = "reconnect_min_delay_ms",
+ .data = &rds_sysctl_reconnect_min_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min,
+ .extra2 = &rds_sysctl_reconnect_max_jiffies,
+ },
+ {
+ .procname = "reconnect_max_delay_ms",
+ .data = &rds_sysctl_reconnect_max_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min_jiffies,
+ .extra2 = &rds_sysctl_reconnect_max,
+ },
+ {
+ .procname = "max_unacked_packets",
+ .data = &rds_sysctl_max_unacked_packets,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "max_unacked_bytes",
+ .data = &rds_sysctl_max_unacked_bytes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "ping_enable",
+ .data = &rds_sysctl_ping_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+void rds_sysctl_exit(void)
+{
+ unregister_net_sysctl_table(rds_sysctl_reg_table);
+}
+
+int rds_sysctl_init(void)
+{
+ rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+ rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+ rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
+ if (!rds_sysctl_reg_table)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 000000000..edac9ef2b
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* only for info exporting */
+static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
+static LIST_HEAD(rds_tcp_tc_list);
+static unsigned int rds_tcp_tc_count;
+
+/* Track rds_tcp_connection structs so they can be cleaned up */
+static DEFINE_SPINLOCK(rds_tcp_conn_lock);
+static LIST_HEAD(rds_tcp_conn_list);
+
+static struct kmem_cache *rds_tcp_conn_slab;
+
+#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+
+/* doing it this way avoids calling tcp_sk() */
+void rds_tcp_nonagle(struct socket *sock)
+{
+ mm_segment_t oldfs = get_fs();
+ int val = 1;
+
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
+ sizeof(val));
+ set_fs(oldfs);
+}
+
+void rds_tcp_tune(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ rds_tcp_nonagle(sock);
+
+ /*
+ * We're trying to saturate gigabit with the default,
+ * see svc_sock_setbufsize().
+ */
+ lock_sock(sk);
+ sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
+ sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+ release_sock(sk);
+}
+
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
+{
+ return tcp_sk(tc->t_sock->sk)->snd_nxt;
+}
+
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
+{
+ return tcp_sk(tc->t_sock->sk)->snd_una;
+}
+
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc)
+{
+ rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+
+ /* done under the callback_lock to serialize with write_space */
+ spin_lock(&rds_tcp_tc_list_lock);
+ list_del_init(&tc->t_list_item);
+ rds_tcp_tc_count--;
+ spin_unlock(&rds_tcp_tc_list_lock);
+
+ tc->t_sock = NULL;
+
+ sock->sk->sk_write_space = tc->t_orig_write_space;
+ sock->sk->sk_data_ready = tc->t_orig_data_ready;
+ sock->sk->sk_state_change = tc->t_orig_state_change;
+ sock->sk->sk_user_data = NULL;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * This is the only path that sets tc->t_sock. Send and receive trust that
+ * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
+ * called while it isn't set.
+ */
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
+ write_lock_bh(&sock->sk->sk_callback_lock);
+
+ /* done under the callback_lock to serialize with write_space */
+ spin_lock(&rds_tcp_tc_list_lock);
+ list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
+ rds_tcp_tc_count++;
+ spin_unlock(&rds_tcp_tc_list_lock);
+
+ /* accepted sockets need our listen data ready undone */
+ if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
+ sock->sk->sk_data_ready = sock->sk->sk_user_data;
+
+ tc->t_sock = sock;
+ tc->conn = conn;
+ tc->t_orig_data_ready = sock->sk->sk_data_ready;
+ tc->t_orig_write_space = sock->sk->sk_write_space;
+ tc->t_orig_state_change = sock->sk->sk_state_change;
+
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = rds_tcp_data_ready;
+ sock->sk->sk_write_space = rds_tcp_write_space;
+ sock->sk->sk_state_change = rds_tcp_state_change;
+
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_info_tcp_socket tsinfo;
+ struct rds_tcp_connection *tc;
+ unsigned long flags;
+ struct sockaddr_in sin;
+ int sinlen;
+
+ spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+ if (len / sizeof(tsinfo) < rds_tcp_tc_count)
+ goto out;
+
+ list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+
+ sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
+ tsinfo.local_addr = sin.sin_addr.s_addr;
+ tsinfo.local_port = sin.sin_port;
+ sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
+ tsinfo.peer_addr = sin.sin_addr.s_addr;
+ tsinfo.peer_port = sin.sin_port;
+
+ tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
+ tsinfo.data_rem = tc->t_tinc_data_rem;
+ tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
+ tsinfo.last_expected_una = tc->t_last_expected_una;
+ tsinfo.last_seen_una = tc->t_last_seen_una;
+
+ rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
+ }
+
+out:
+ lens->nr = rds_tcp_tc_count;
+ lens->each = sizeof(tsinfo);
+
+ spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
+static int rds_tcp_laddr_check(__be32 addr)
+{
+ if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+ return 0;
+ return -EADDRNOTAVAIL;
+}
+
+static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_tcp_connection *tc;
+
+ tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+ if (!tc)
+ return -ENOMEM;
+
+ tc->t_sock = NULL;
+ tc->t_tinc = NULL;
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+
+ conn->c_transport_data = tc;
+
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+
+ rdsdebug("alloced tc %p\n", conn->c_transport_data);
+ return 0;
+}
+
+static void rds_tcp_conn_free(void *arg)
+{
+ struct rds_tcp_connection *tc = arg;
+ unsigned long flags;
+ rdsdebug("freeing tc %p\n", tc);
+
+ spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+ list_del(&tc->t_tcp_node);
+ spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+
+ kmem_cache_free(rds_tcp_conn_slab, tc);
+}
+
+static void rds_tcp_destroy_conns(void)
+{
+ struct rds_tcp_connection *tc, *_tc;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_splice(&rds_tcp_conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_tcp_conn_list);
+ spin_unlock_irq(&rds_tcp_conn_lock);
+
+ list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
+ if (tc->conn->c_passive)
+ rds_conn_destroy(tc->conn->c_passive);
+ rds_conn_destroy(tc->conn);
+ }
+}
+
+static void rds_tcp_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+ rds_tcp_listen_stop();
+ rds_tcp_destroy_conns();
+ rds_trans_unregister(&rds_tcp_transport);
+ rds_tcp_recv_exit();
+ kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
+struct rds_transport rds_tcp_transport = {
+ .laddr_check = rds_tcp_laddr_check,
+ .xmit_prepare = rds_tcp_xmit_prepare,
+ .xmit_complete = rds_tcp_xmit_complete,
+ .xmit = rds_tcp_xmit,
+ .recv = rds_tcp_recv,
+ .conn_alloc = rds_tcp_conn_alloc,
+ .conn_free = rds_tcp_conn_free,
+ .conn_connect = rds_tcp_conn_connect,
+ .conn_shutdown = rds_tcp_conn_shutdown,
+ .inc_copy_to_user = rds_tcp_inc_copy_to_user,
+ .inc_free = rds_tcp_inc_free,
+ .stats_info_copy = rds_tcp_stats_info_copy,
+ .exit = rds_tcp_exit,
+ .t_owner = THIS_MODULE,
+ .t_name = "tcp",
+ .t_type = RDS_TRANS_TCP,
+ .t_prefer_loopback = 1,
+};
+
+static int rds_tcp_init(void)
+{
+ int ret;
+
+ rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
+ sizeof(struct rds_tcp_connection),
+ 0, 0, NULL);
+ if (!rds_tcp_conn_slab) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = rds_tcp_recv_init();
+ if (ret)
+ goto out_slab;
+
+ ret = rds_trans_register(&rds_tcp_transport);
+ if (ret)
+ goto out_recv;
+
+ ret = rds_tcp_listen_init();
+ if (ret)
+ goto out_register;
+
+ rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+
+ goto out;
+
+out_register:
+ rds_trans_unregister(&rds_tcp_transport);
+out_recv:
+ rds_tcp_recv_exit();
+out_slab:
+ kmem_cache_destroy(rds_tcp_conn_slab);
+out:
+ return ret;
+}
+module_init(rds_tcp_init);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: TCP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 000000000..0dbdd3716
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,87 @@
+#ifndef _RDS_TCP_H
+#define _RDS_TCP_H
+
+#define RDS_TCP_PORT 16385
+
+struct rds_tcp_incoming {
+ struct rds_incoming ti_inc;
+ struct sk_buff_head ti_skb_list;
+};
+
+struct rds_tcp_connection {
+
+ struct list_head t_tcp_node;
+ struct rds_connection *conn;
+ struct socket *t_sock;
+ void *t_orig_write_space;
+ void *t_orig_data_ready;
+ void *t_orig_state_change;
+
+ struct rds_tcp_incoming *t_tinc;
+ size_t t_tinc_hdr_rem;
+ size_t t_tinc_data_rem;
+
+ /* XXX error report? */
+ struct work_struct t_conn_w;
+ struct work_struct t_send_w;
+ struct work_struct t_down_w;
+ struct work_struct t_recv_w;
+
+ /* for info exporting only */
+ struct list_head t_list_item;
+ u32 t_last_sent_nxt;
+ u32 t_last_expected_una;
+ u32 t_last_seen_una;
+};
+
+struct rds_tcp_statistics {
+ uint64_t s_tcp_data_ready_calls;
+ uint64_t s_tcp_write_space_calls;
+ uint64_t s_tcp_sndbuf_full;
+ uint64_t s_tcp_connect_raced;
+ uint64_t s_tcp_listen_closed_stale;
+};
+
+/* tcp.c */
+void rds_tcp_tune(struct socket *sock);
+void rds_tcp_nonagle(struct socket *sock);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_restore_callbacks(struct socket *sock,
+ struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
+u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
+extern struct rds_transport rds_tcp_transport;
+
+/* tcp_connect.c */
+int rds_tcp_conn_connect(struct rds_connection *conn);
+void rds_tcp_conn_shutdown(struct rds_connection *conn);
+void rds_tcp_state_change(struct sock *sk);
+
+/* tcp_listen.c */
+int rds_tcp_listen_init(void);
+void rds_tcp_listen_stop(void);
+void rds_tcp_listen_data_ready(struct sock *sk);
+
+/* tcp_recv.c */
+int rds_tcp_recv_init(void);
+void rds_tcp_recv_exit(void);
+void rds_tcp_data_ready(struct sock *sk);
+int rds_tcp_recv(struct rds_connection *conn);
+void rds_tcp_inc_free(struct rds_incoming *inc);
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+
+/* tcp_send.c */
+void rds_tcp_xmit_prepare(struct rds_connection *conn);
+void rds_tcp_xmit_complete(struct rds_connection *conn);
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_tcp_write_space(struct sock *sk);
+
+/* tcp_stats.c */
+DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
+#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 000000000..973109c7b
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+void rds_tcp_state_change(struct sock *sk)
+{
+ void (*state_change)(struct sock *sk);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (!conn) {
+ state_change = sk->sk_state_change;
+ goto out;
+ }
+ tc = conn->c_transport_data;
+ state_change = tc->t_orig_state_change;
+
+ rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
+
+ switch(sk->sk_state) {
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ rds_connect_complete(conn);
+ break;
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSE:
+ rds_conn_drop(conn);
+ default:
+ break;
+ }
+out:
+ read_unlock(&sk->sk_callback_lock);
+ state_change(sk);
+}
+
+int rds_tcp_conn_connect(struct rds_connection *conn)
+{
+ struct socket *sock = NULL;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_tune(sock);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+ if (ret) {
+ rdsdebug("bind failed with %d at address %pI4\n",
+ ret, &conn->c_laddr);
+ goto out;
+ }
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+ /*
+ * once we call connect() we can start getting callbacks and they
+ * own the socket
+ */
+ rds_tcp_set_callbacks(sock, conn);
+ ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
+ O_NONBLOCK);
+
+ rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+ if (ret == -EINPROGRESS)
+ ret = 0;
+ if (ret == 0)
+ sock = NULL;
+ else
+ rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+
+out:
+ if (sock)
+ sock_release(sock);
+ return ret;
+}
+
+/*
+ * Before killing the tcp socket this needs to serialize with callbacks. The
+ * caller has already grabbed the sending sem so we're serialized with other
+ * senders.
+ *
+ * TCP calls the callbacks with the sock lock so we hold it while we reset the
+ * callbacks to those set by TCP. Our callbacks won't execute again once we
+ * hold the sock lock.
+ */
+void rds_tcp_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+
+ rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+
+ if (sock) {
+ sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ lock_sock(sock->sk);
+ rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
+
+ release_sock(sock->sk);
+ sock_release(sock);
+ }
+
+ if (tc->t_tinc) {
+ rds_inc_put(&tc->t_tinc->ti_inc);
+ tc->t_tinc = NULL;
+ }
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 000000000..0da49e344
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * cheesy, but simple..
+ */
+static void rds_tcp_accept_worker(struct work_struct *work);
+static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
+static struct socket *rds_tcp_listen_sock;
+
+static int rds_tcp_keepalive(struct socket *sock)
+{
+ /* values below based on xs_udp_default_timeout */
+ int keepidle = 5; /* send a probe 'keepidle' secs after last data */
+ int keepcnt = 5; /* number of unack'ed probes before declaring dead */
+ int keepalive = 1;
+ int ret = 0;
+
+ ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&keepalive, sizeof(keepalive));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+ if (ret < 0)
+ goto bail;
+
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ if (ret < 0)
+ goto bail;
+
+ /* KEEPINTVL is the interval between successive probes. We follow
+ * the model in xs_tcp_finish_connecting() and re-use keepidle.
+ */
+ ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+bail:
+ return ret;
+}
+
+static int rds_tcp_accept_one(struct socket *sock)
+{
+ struct socket *new_sock = NULL;
+ struct rds_connection *conn;
+ int ret;
+ struct inet_sock *inet;
+ struct rds_tcp_connection *rs_tcp;
+
+ ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+ sock->sk->sk_protocol, &new_sock);
+ if (ret)
+ goto out;
+
+ new_sock->type = sock->type;
+ new_sock->ops = sock->ops;
+ ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+ if (ret < 0)
+ goto out;
+
+ ret = rds_tcp_keepalive(new_sock);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_tune(new_sock);
+
+ inet = inet_sk(new_sock->sk);
+
+ rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport));
+
+ conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+ &rds_tcp_transport, GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+ /* An incoming SYN request came in, and TCP just accepted it.
+ * We always create a new conn for listen side of TCP, and do not
+ * add it to the c_hash_list.
+ *
+ * If the client reboots, this conn will need to be cleaned up.
+ * rds_tcp_state_change() will do that cleanup
+ */
+ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+ WARN_ON(!rs_tcp || rs_tcp->t_sock);
+
+ /*
+ * see the comment above rds_queue_delayed_reconnect()
+ */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP)
+ rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+ else
+ rds_tcp_stats_inc(s_tcp_connect_raced);
+ rds_conn_drop(conn);
+ ret = 0;
+ goto out;
+ }
+
+ rds_tcp_set_callbacks(new_sock, conn);
+ rds_connect_complete(conn);
+ new_sock = NULL;
+ ret = 0;
+
+out:
+ if (new_sock)
+ sock_release(new_sock);
+ return ret;
+}
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+ while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
+ cond_resched();
+}
+
+void rds_tcp_listen_data_ready(struct sock *sk)
+{
+ void (*ready)(struct sock *sk);
+
+ rdsdebug("listen data ready sk %p\n", sk);
+
+ read_lock(&sk->sk_callback_lock);
+ ready = sk->sk_user_data;
+ if (!ready) { /* check for teardown race */
+ ready = sk->sk_data_ready;
+ goto out;
+ }
+
+ /*
+ * ->sk_data_ready is also called for a newly established child socket
+ * before it has been accepted and the accepter has set up their
+ * data_ready.. we only want to queue listen work for our listening
+ * socket
+ */
+ if (sk->sk_state == TCP_LISTEN)
+ queue_work(rds_wq, &rds_tcp_listen_work);
+
+out:
+ read_unlock(&sk->sk_callback_lock);
+ ready(sk);
+}
+
+int rds_tcp_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct socket *sock = NULL;
+ int ret;
+
+ ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret < 0)
+ goto out;
+
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ rds_tcp_nonagle(sock);
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_user_data = sock->sk->sk_data_ready;
+ sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
+ sin.sin_family = PF_INET;
+ sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+ sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+ ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+ if (ret < 0)
+ goto out;
+
+ ret = sock->ops->listen(sock, 64);
+ if (ret < 0)
+ goto out;
+
+ rds_tcp_listen_sock = sock;
+ sock = NULL;
+out:
+ if (sock)
+ sock_release(sock);
+ return ret;
+}
+
+void rds_tcp_listen_stop(void)
+{
+ struct socket *sock = rds_tcp_listen_sock;
+ struct sock *sk;
+
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+
+ /* serialize with and prevent further callbacks */
+ lock_sock(sk);
+ write_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_user_data) {
+ sk->sk_data_ready = sk->sk_user_data;
+ sk->sk_user_data = NULL;
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
+
+ /* wait for accepts to stop and close the socket */
+ flush_workqueue(rds_wq);
+ sock_release(sock);
+ rds_tcp_listen_sock = NULL;
+}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 000000000..fbc5ef88b
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static struct kmem_cache *rds_tcp_incoming_slab;
+
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_tcp_incoming *tinc;
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+ rdsdebug("purging tinc %p inc %p\n", tinc, inc);
+ skb_queue_purge(&tinc->ti_skb_list);
+}
+
+void rds_tcp_inc_free(struct rds_incoming *inc)
+{
+ struct rds_tcp_incoming *tinc;
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+ rds_tcp_inc_purge(inc);
+ rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
+ kmem_cache_free(rds_tcp_incoming_slab, tinc);
+}
+
+/*
+ * this is pretty lame, but, whatever.
+ */
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+{
+ struct rds_tcp_incoming *tinc;
+ struct sk_buff *skb;
+ int ret = 0;
+
+ if (!iov_iter_count(to))
+ goto out;
+
+ tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+ skb_queue_walk(&tinc->ti_skb_list, skb) {
+ unsigned long to_copy, skb_off;
+ for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
+ to_copy = iov_iter_count(to);
+ to_copy = min(to_copy, skb->len - skb_off);
+
+ if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
+ return -EFAULT;
+
+ rds_stats_add(s_copy_to_user, to_copy);
+ ret += to_copy;
+
+ if (!iov_iter_count(to))
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
+
+/*
+ * We have a series of skbs that have fragmented pieces of the congestion
+ * bitmap. They must add up to the exact size of the congestion bitmap. We
+ * use the skb helpers to copy those into the pages that make up the in-memory
+ * congestion bitmap for the remote address of this connection. We then tell
+ * the congestion core that the bitmap has been changed so that it can wake up
+ * sleepers.
+ *
+ * This is racing with sending paths which are using test_bit to see if the
+ * bitmap indicates that their recipient is congested.
+ */
+
+static void rds_tcp_cong_recv(struct rds_connection *conn,
+ struct rds_tcp_incoming *tinc)
+{
+ struct sk_buff *skb;
+ unsigned int to_copy, skb_off;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_cong_map *map;
+ int ret;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map_page = 0;
+ map_off = 0;
+ map = conn->c_fcong;
+
+ skb_queue_walk(&tinc->ti_skb_list, skb) {
+ skb_off = 0;
+ while (skb_off < skb->len) {
+ to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
+ skb->len - skb_off);
+
+ BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
+
+ /* only returns 0 or -error */
+ ret = skb_copy_bits(skb, skb_off,
+ (void *)map->m_page_addrs[map_page] + map_off,
+ to_copy);
+ BUG_ON(ret != 0);
+
+ skb_off += to_copy;
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+ }
+ }
+
+ rds_cong_map_updated(map, ~(u64) 0);
+}
+
+struct rds_tcp_desc_arg {
+ struct rds_connection *conn;
+ gfp_t gfp;
+};
+
+static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct rds_tcp_desc_arg *arg = desc->arg.data;
+ struct rds_connection *conn = arg->conn;
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct rds_tcp_incoming *tinc = tc->t_tinc;
+ struct sk_buff *clone;
+ size_t left = len, to_copy;
+
+ rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
+ len);
+
+ /*
+ * tcp_read_sock() interprets partial progress as an indication to stop
+ * processing.
+ */
+ while (left) {
+ if (!tinc) {
+ tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
+ arg->gfp);
+ if (!tinc) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+ tc->t_tinc = tinc;
+ rdsdebug("alloced tinc %p\n", tinc);
+ rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+ /*
+ * XXX * we might be able to use the __ variants when
+ * we've already serialized at a higher level.
+ */
+ skb_queue_head_init(&tinc->ti_skb_list);
+ }
+
+ if (left && tc->t_tinc_hdr_rem) {
+ to_copy = min(tc->t_tinc_hdr_rem, left);
+ rdsdebug("copying %zu header from skb %p\n", to_copy,
+ skb);
+ skb_copy_bits(skb, offset,
+ (char *)&tinc->ti_inc.i_hdr +
+ sizeof(struct rds_header) -
+ tc->t_tinc_hdr_rem,
+ to_copy);
+ tc->t_tinc_hdr_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+
+ if (tc->t_tinc_hdr_rem == 0) {
+ /* could be 0 for a 0 len message */
+ tc->t_tinc_data_rem =
+ be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+ }
+ }
+
+ if (left && tc->t_tinc_data_rem) {
+ clone = skb_clone(skb, arg->gfp);
+ if (!clone) {
+ desc->error = -ENOMEM;
+ goto out;
+ }
+
+ to_copy = min(tc->t_tinc_data_rem, left);
+ pskb_pull(clone, offset);
+ pskb_trim(clone, to_copy);
+ skb_queue_tail(&tinc->ti_skb_list, clone);
+
+ rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
+ "clone %p data %p len %d\n",
+ skb, skb->data, skb->len, offset, to_copy,
+ clone, clone->data, clone->len);
+
+ tc->t_tinc_data_rem -= to_copy;
+ left -= to_copy;
+ offset += to_copy;
+ }
+
+ if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+ if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_tcp_cong_recv(conn, tinc);
+ else
+ rds_recv_incoming(conn, conn->c_faddr,
+ conn->c_laddr, &tinc->ti_inc,
+ arg->gfp);
+
+ tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+ tc->t_tinc_data_rem = 0;
+ tc->t_tinc = NULL;
+ rds_inc_put(&tinc->ti_inc);
+ tinc = NULL;
+ }
+ }
+out:
+ rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
+ len, left, skb->len,
+ skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
+ return len - left;
+}
+
+/* the caller has to hold the sock lock */
+static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+ read_descriptor_t desc;
+ struct rds_tcp_desc_arg arg;
+
+ /* It's like glib in the kernel! */
+ arg.conn = conn;
+ arg.gfp = gfp;
+ desc.arg.data = &arg;
+ desc.error = 0;
+ desc.count = 1; /* give more than one skb per call */
+
+ tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
+ rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+ desc.error);
+
+ return desc.error;
+}
+
+/*
+ * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
+ * data_ready.
+ *
+ * if we fail to allocate we're in trouble.. blindly wait some time before
+ * trying again to see if the VM can free up something for us.
+ */
+int rds_tcp_recv(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ struct socket *sock = tc->t_sock;
+ int ret = 0;
+
+ rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+
+ lock_sock(sock->sk);
+ ret = rds_tcp_read_sock(conn, GFP_KERNEL);
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+void rds_tcp_data_ready(struct sock *sk)
+{
+ void (*ready)(struct sock *sk);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ rdsdebug("data ready sk %p\n", sk);
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (!conn) { /* check for teardown race */
+ ready = sk->sk_data_ready;
+ goto out;
+ }
+
+ tc = conn->c_transport_data;
+ ready = tc->t_orig_data_ready;
+ rds_tcp_stats_inc(s_tcp_data_ready_calls);
+
+ if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+out:
+ read_unlock(&sk->sk_callback_lock);
+ ready(sk);
+}
+
+int rds_tcp_recv_init(void)
+{
+ rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
+ sizeof(struct rds_tcp_incoming),
+ 0, 0, NULL);
+ if (!rds_tcp_incoming_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void rds_tcp_recv_exit(void)
+{
+ kmem_cache_destroy(rds_tcp_incoming_slab);
+}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 000000000..53b17ca0d
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static void rds_tcp_cork(struct socket *sock, int val)
+{
+ mm_segment_t oldfs;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
+ sizeof(val));
+ set_fs(oldfs);
+}
+
+void rds_tcp_xmit_prepare(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rds_tcp_cork(tc->t_sock, 1);
+}
+
+void rds_tcp_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ rds_tcp_cork(tc->t_sock, 0);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+{
+ struct kvec vec = {
+ .iov_base = data,
+ .iov_len = len,
+ };
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
+ };
+
+ return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+ int done = 0;
+ int ret = 0;
+
+ if (hdr_off == 0) {
+ /*
+ * m_ack_seq is set to the sequence number of the last byte of
+ * header and data. see rds_tcp_is_acked().
+ */
+ tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
+ rm->m_ack_seq = tc->t_last_sent_nxt +
+ sizeof(struct rds_header) +
+ be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
+ smp_mb__before_atomic();
+ set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
+ tc->t_last_expected_una = rm->m_ack_seq + 1;
+
+ rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
+ rm, rds_tcp_snd_nxt(tc),
+ (unsigned long long)rm->m_ack_seq);
+ }
+
+ if (hdr_off < sizeof(struct rds_header)) {
+ /* see rds_tcp_write_space() */
+ set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
+
+ ret = rds_tcp_sendmsg(tc->t_sock,
+ (void *)&rm->m_inc.i_hdr + hdr_off,
+ sizeof(rm->m_inc.i_hdr) - hdr_off);
+ if (ret < 0)
+ goto out;
+ done += ret;
+ if (hdr_off + done != sizeof(struct rds_header))
+ goto out;
+ }
+
+ while (sg < rm->data.op_nents) {
+ ret = tc->t_sock->ops->sendpage(tc->t_sock,
+ sg_page(&rm->data.op_sg[sg]),
+ rm->data.op_sg[sg].offset + off,
+ rm->data.op_sg[sg].length - off,
+ MSG_DONTWAIT|MSG_NOSIGNAL);
+ rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+ rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
+ ret);
+ if (ret <= 0)
+ break;
+
+ off += ret;
+ done += ret;
+ if (off == rm->data.op_sg[sg].length) {
+ off = 0;
+ sg++;
+ }
+ }
+
+out:
+ if (ret <= 0) {
+ /* write_space will hit after EAGAIN, all else fatal */
+ if (ret == -EAGAIN) {
+ rds_tcp_stats_inc(s_tcp_sndbuf_full);
+ ret = 0;
+ } else {
+ printk(KERN_WARNING "RDS/tcp: send to %pI4 "
+ "returned %d, disconnecting and reconnecting\n",
+ &conn->c_faddr, ret);
+ rds_conn_drop(conn);
+ }
+ }
+ if (done == 0)
+ done = ret;
+ return done;
+}
+
+/*
+ * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
+ * last byte of the message, including the header. This means that the
+ * entire message has been received if rm->m_ack_seq is "before" the next
+ * unacked byte of the TCP sequence space. We have to do very careful
+ * wrapping 32bit comparisons here.
+ */
+static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+{
+ if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
+ return 0;
+ return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
+}
+
+void rds_tcp_write_space(struct sock *sk)
+{
+ void (*write_space)(struct sock *sk);
+ struct rds_connection *conn;
+ struct rds_tcp_connection *tc;
+
+ read_lock(&sk->sk_callback_lock);
+ conn = sk->sk_user_data;
+ if (!conn) {
+ write_space = sk->sk_write_space;
+ goto out;
+ }
+
+ tc = conn->c_transport_data;
+ rdsdebug("write_space for tc %p\n", tc);
+ write_space = tc->t_orig_write_space;
+ rds_tcp_stats_inc(s_tcp_write_space_calls);
+
+ rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
+ tc->t_last_seen_una = rds_tcp_snd_una(tc);
+ rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+
+ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+out:
+ read_unlock(&sk->sk_callback_lock);
+
+ /*
+ * write_space is only called when data leaves tcp's send queue if
+ * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
+ * data in tcp's send queue because we use write_space to parse the
+ * sequence numbers and notice that rds messages have been fully
+ * received.
+ *
+ * tcp's write_space clears SOCK_NOSPACE if the send queue has more
+ * than a certain amount of space. So we need to set it again *after*
+ * we call tcp's write_space or else we might only get called on the
+ * first of a series of incoming tcp acks.
+ */
+ write_space(sk);
+
+ if (sk->sk_socket)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 000000000..f8a7954f1
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
+ ____cacheline_aligned;
+
+static const char * const rds_tcp_stat_names[] = {
+ "tcp_data_ready_calls",
+ "tcp_write_space_calls",
+ "tcp_sndbuf_full",
+ "tcp_connect_raced",
+ "tcp_listen_closed_stale",
+};
+
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_tcp_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_tcp_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
+ ARRAY_SIZE(rds_tcp_stat_names));
+out:
+ return ARRAY_SIZE(rds_tcp_stat_names);
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000..dc2402e87
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!) so we have
+ * a thread around to block allocating if the receive fast path sees an
+ * allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ * ANY -> ERROR
+ * UP -> DISCONNECTING
+ * ERROR -> DISCONNECTING
+ * DISCONNECTING -> DOWN
+ * DOWN -> CONNECTING
+ * CONNECTING -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ * - Inside the shutdown worker; synchronizes with xmit path
+ * through RDS_IN_XMIT, and with connection management callbacks
+ * via c_cm_lock.
+ *
+ * For receive callbacks, we rely on the underlying transport
+ * (TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq;
+EXPORT_SYMBOL_GPL(rds_wq);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ printk(KERN_WARNING "%s: Cannot transition to state UP, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ rds_conn_drop(conn);
+ return;
+ }
+
+ rdsdebug("conn %p for %pI4 to %pI4 complete\n",
+ conn, &conn->c_laddr, &conn->c_faddr);
+
+ conn->c_reconnect_jiffies = 0;
+ set_bit(0, &conn->c_map_queued);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_connect_complete);
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again. Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects. This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+void rds_queue_reconnect(struct rds_connection *conn)
+{
+ unsigned long rand;
+
+ rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
+ conn, &conn->c_laddr, &conn->c_faddr,
+ conn->c_reconnect_jiffies);
+
+ set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (conn->c_reconnect_jiffies == 0) {
+ conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ return;
+ }
+
+ get_random_bytes(&rand, sizeof(rand));
+ rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+ rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ conn, &conn->c_laddr, &conn->c_faddr);
+ queue_delayed_work(rds_wq, &conn->c_conn_w,
+ rand % conn->c_reconnect_jiffies);
+
+ conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ rds_sysctl_reconnect_max_jiffies);
+}
+
+void rds_connect_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+ int ret;
+
+ clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ ret = conn->c_trans->conn_connect(conn);
+ rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
+ conn, &conn->c_laddr, &conn->c_faddr, ret);
+
+ if (ret) {
+ if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+ rds_queue_reconnect(conn);
+ else
+ rds_conn_error(conn, "RDS: connect failed\n");
+ }
+ }
+}
+
+void rds_send_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = rds_send_xmit(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_send_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_send_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_recv_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = conn->c_trans->recv(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_recv_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_recv_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ rds_conn_shutdown(conn);
+}
+
+void rds_threads_exit(void)
+{
+ destroy_workqueue(rds_wq);
+}
+
+int rds_threads_init(void)
+{
+ rds_wq = create_singlethread_workqueue("krdsd");
+ if (!rds_wq)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000..7f2ac4fec
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static struct rds_transport *transports[RDS_TRANS_COUNT];
+static DECLARE_RWSEM(rds_trans_sem);
+
+int rds_trans_register(struct rds_transport *trans)
+{
+ BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
+
+ down_write(&rds_trans_sem);
+
+ if (transports[trans->t_type])
+ printk(KERN_ERR "RDS Transport type %d already registered\n",
+ trans->t_type);
+ else {
+ transports[trans->t_type] = trans;
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+ }
+
+ up_write(&rds_trans_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rds_trans_register);
+
+void rds_trans_unregister(struct rds_transport *trans)
+{
+ down_write(&rds_trans_sem);
+
+ transports[trans->t_type] = NULL;
+ printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
+
+ up_write(&rds_trans_sem);
+}
+EXPORT_SYMBOL_GPL(rds_trans_unregister);
+
+void rds_trans_put(struct rds_transport *trans)
+{
+ if (trans && trans->t_owner)
+ module_put(trans->t_owner);
+}
+
+struct rds_transport *rds_trans_get_preferred(__be32 addr)
+{
+ struct rds_transport *ret = NULL;
+ struct rds_transport *trans;
+ unsigned int i;
+
+ if (IN_LOOPBACK(ntohl(addr)))
+ return &rds_loop_transport;
+
+ down_read(&rds_trans_sem);
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
+ trans = transports[i];
+
+ if (trans && (trans->laddr_check(addr) == 0) &&
+ (!trans->t_owner || try_module_get(trans->t_owner))) {
+ ret = trans;
+ break;
+ }
+ }
+ up_read(&rds_trans_sem);
+
+ return ret;
+}
+
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them. The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+
+{
+ struct rds_transport *trans;
+ unsigned int total = 0;
+ unsigned int part;
+ int i;
+
+ rds_info_iter_unmap(iter);
+ down_read(&rds_trans_sem);
+
+ for (i = 0; i < RDS_TRANS_COUNT; i++)
+ {
+ trans = transports[i];
+ if (!trans || !trans->stats_info_copy)
+ continue;
+
+ part = trans->stats_info_copy(iter, avail);
+ avail -= min(avail, part);
+ total += part;
+ }
+
+ up_read(&rds_trans_sem);
+
+ return total;
+}
+
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
new file mode 100644
index 000000000..4c10e7e6c
--- /dev/null
+++ b/net/rfkill/Kconfig
@@ -0,0 +1,44 @@
+#
+# RF switch subsystem configuration
+#
+menuconfig RFKILL
+ tristate "RF switch subsystem support"
+ help
+ Say Y here if you want to have control over RF switches
+ found on many WiFi and Bluetooth cards.
+
+ To compile this driver as a module, choose M here: the
+ module will be called rfkill.
+
+# LED trigger support
+config RFKILL_LEDS
+ bool
+ depends on RFKILL
+ depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS
+ default y
+
+config RFKILL_INPUT
+ bool "RF switch input support" if EXPERT
+ depends on RFKILL
+ depends on INPUT = y || RFKILL = INPUT
+ default y if !EXPERT
+
+config RFKILL_REGULATOR
+ tristate "Generic rfkill regulator driver"
+ depends on RFKILL || !RFKILL
+ depends on REGULATOR
+ help
+ This options enable controlling radio transmitters connected to
+ voltage regulator using the regulator framework.
+
+ To compile this driver as a module, choose M here: the module will
+ be called rfkill-regulator.
+
+config RFKILL_GPIO
+ tristate "GPIO RFKILL driver"
+ depends on RFKILL && GPIOLIB
+ default n
+ help
+ If you say yes here you get support of a generic gpio RFKILL
+ driver. The platform should fill in the appropriate fields in the
+ rfkill_gpio_platform_data structure and pass that to the driver.
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
new file mode 100644
index 000000000..311768783
--- /dev/null
+++ b/net/rfkill/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the RF switch subsystem.
+#
+
+rfkill-y += core.o
+rfkill-$(CONFIG_RFKILL_INPUT) += input.o
+obj-$(CONFIG_RFKILL) += rfkill.o
+obj-$(CONFIG_RFKILL_REGULATOR) += rfkill-regulator.o
+obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
new file mode 100644
index 000000000..fa7cd7927
--- /dev/null
+++ b/net/rfkill/core.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (C) 2006 - 2007 Ivo van Doorn
+ * Copyright (C) 2007 Dmitry Torokhov
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/workqueue.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rfkill.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include "rfkill.h"
+
+#define POLL_INTERVAL (5 * HZ)
+
+#define RFKILL_BLOCK_HW BIT(0)
+#define RFKILL_BLOCK_SW BIT(1)
+#define RFKILL_BLOCK_SW_PREV BIT(2)
+#define RFKILL_BLOCK_ANY (RFKILL_BLOCK_HW |\
+ RFKILL_BLOCK_SW |\
+ RFKILL_BLOCK_SW_PREV)
+#define RFKILL_BLOCK_SW_SETCALL BIT(31)
+
+struct rfkill {
+ spinlock_t lock;
+
+ const char *name;
+ enum rfkill_type type;
+
+ unsigned long state;
+
+ u32 idx;
+
+ bool registered;
+ bool persistent;
+
+ const struct rfkill_ops *ops;
+ void *data;
+
+#ifdef CONFIG_RFKILL_LEDS
+ struct led_trigger led_trigger;
+ const char *ledtrigname;
+#endif
+
+ struct device dev;
+ struct list_head node;
+
+ struct delayed_work poll_work;
+ struct work_struct uevent_work;
+ struct work_struct sync_work;
+};
+#define to_rfkill(d) container_of(d, struct rfkill, dev)
+
+struct rfkill_int_event {
+ struct list_head list;
+ struct rfkill_event ev;
+};
+
+struct rfkill_data {
+ struct list_head list;
+ struct list_head events;
+ struct mutex mtx;
+ wait_queue_head_t read_wait;
+ bool input_handler;
+};
+
+
+MODULE_AUTHOR("Ivo van Doorn <IvDoorn@gmail.com>");
+MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
+MODULE_DESCRIPTION("RF switch support");
+MODULE_LICENSE("GPL");
+
+
+/*
+ * The locking here should be made much smarter, we currently have
+ * a bit of a stupid situation because drivers might want to register
+ * the rfkill struct under their own lock, and take this lock during
+ * rfkill method calls -- which will cause an AB-BA deadlock situation.
+ *
+ * To fix that, we need to rework this code here to be mostly lock-free
+ * and only use the mutex for list manipulations, not to protect the
+ * various other global variables. Then we can avoid holding the mutex
+ * around driver operations, and all is happy.
+ */
+static LIST_HEAD(rfkill_list); /* list of registered rf switches */
+static DEFINE_MUTEX(rfkill_global_mutex);
+static LIST_HEAD(rfkill_fds); /* list of open fds of /dev/rfkill */
+
+static unsigned int rfkill_default_state = 1;
+module_param_named(default_state, rfkill_default_state, uint, 0444);
+MODULE_PARM_DESC(default_state,
+ "Default initial state for all radio types, 0 = radio off");
+
+static struct {
+ bool cur, sav;
+} rfkill_global_states[NUM_RFKILL_TYPES];
+
+static bool rfkill_epo_lock_active;
+
+
+#ifdef CONFIG_RFKILL_LEDS
+static void rfkill_led_trigger_event(struct rfkill *rfkill)
+{
+ struct led_trigger *trigger;
+
+ if (!rfkill->registered)
+ return;
+
+ trigger = &rfkill->led_trigger;
+
+ if (rfkill->state & RFKILL_BLOCK_ANY)
+ led_trigger_event(trigger, LED_OFF);
+ else
+ led_trigger_event(trigger, LED_FULL);
+}
+
+static void rfkill_led_trigger_activate(struct led_classdev *led)
+{
+ struct rfkill *rfkill;
+
+ rfkill = container_of(led->trigger, struct rfkill, led_trigger);
+
+ rfkill_led_trigger_event(rfkill);
+}
+
+const char *rfkill_get_led_trigger_name(struct rfkill *rfkill)
+{
+ return rfkill->led_trigger.name;
+}
+EXPORT_SYMBOL(rfkill_get_led_trigger_name);
+
+void rfkill_set_led_trigger_name(struct rfkill *rfkill, const char *name)
+{
+ BUG_ON(!rfkill);
+
+ rfkill->ledtrigname = name;
+}
+EXPORT_SYMBOL(rfkill_set_led_trigger_name);
+
+static int rfkill_led_trigger_register(struct rfkill *rfkill)
+{
+ rfkill->led_trigger.name = rfkill->ledtrigname
+ ? : dev_name(&rfkill->dev);
+ rfkill->led_trigger.activate = rfkill_led_trigger_activate;
+ return led_trigger_register(&rfkill->led_trigger);
+}
+
+static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
+{
+ led_trigger_unregister(&rfkill->led_trigger);
+}
+#else
+static void rfkill_led_trigger_event(struct rfkill *rfkill)
+{
+}
+
+static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
+{
+ return 0;
+}
+
+static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
+{
+}
+#endif /* CONFIG_RFKILL_LEDS */
+
+static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
+ enum rfkill_operation op)
+{
+ unsigned long flags;
+
+ ev->idx = rfkill->idx;
+ ev->type = rfkill->type;
+ ev->op = op;
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW);
+ ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW |
+ RFKILL_BLOCK_SW_PREV));
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+}
+
+static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op)
+{
+ struct rfkill_data *data;
+ struct rfkill_int_event *ev;
+
+ list_for_each_entry(data, &rfkill_fds, list) {
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev)
+ continue;
+ rfkill_fill_event(&ev->ev, rfkill, op);
+ mutex_lock(&data->mtx);
+ list_add_tail(&ev->list, &data->events);
+ mutex_unlock(&data->mtx);
+ wake_up_interruptible(&data->read_wait);
+ }
+}
+
+static void rfkill_event(struct rfkill *rfkill)
+{
+ if (!rfkill->registered)
+ return;
+
+ kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE);
+
+ /* also send event to /dev/rfkill */
+ rfkill_send_events(rfkill, RFKILL_OP_CHANGE);
+}
+
+static bool __rfkill_set_hw_state(struct rfkill *rfkill,
+ bool blocked, bool *change)
+{
+ unsigned long flags;
+ bool prev, any;
+
+ BUG_ON(!rfkill);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ prev = !!(rfkill->state & RFKILL_BLOCK_HW);
+ if (blocked)
+ rfkill->state |= RFKILL_BLOCK_HW;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_HW;
+ *change = prev != blocked;
+ any = !!(rfkill->state & RFKILL_BLOCK_ANY);
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ rfkill_led_trigger_event(rfkill);
+
+ return any;
+}
+
+/**
+ * rfkill_set_block - wrapper for set_block method
+ *
+ * @rfkill: the rfkill struct to use
+ * @blocked: the new software state
+ *
+ * Calls the set_block method (when applicable) and handles notifications
+ * etc. as well.
+ */
+static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
+{
+ unsigned long flags;
+ bool prev, curr;
+ int err;
+
+ if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP))
+ return;
+
+ /*
+ * Some platforms (...!) generate input events which affect the
+ * _hard_ kill state -- whenever something tries to change the
+ * current software state query the hardware state too.
+ */
+ if (rfkill->ops->query)
+ rfkill->ops->query(rfkill, rfkill->data);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ prev = rfkill->state & RFKILL_BLOCK_SW;
+
+ if (rfkill->state & RFKILL_BLOCK_SW)
+ rfkill->state |= RFKILL_BLOCK_SW_PREV;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+
+ if (blocked)
+ rfkill->state |= RFKILL_BLOCK_SW;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_SW;
+
+ rfkill->state |= RFKILL_BLOCK_SW_SETCALL;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ err = rfkill->ops->set_block(rfkill->data, blocked);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ if (err) {
+ /*
+ * Failed -- reset status to _prev, this may be different
+ * from what set set _PREV to earlier in this function
+ * if rfkill_set_sw_state was invoked.
+ */
+ if (rfkill->state & RFKILL_BLOCK_SW_PREV)
+ rfkill->state |= RFKILL_BLOCK_SW;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_SW;
+ }
+ rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL;
+ rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+ curr = rfkill->state & RFKILL_BLOCK_SW;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ rfkill_led_trigger_event(rfkill);
+
+ if (prev != curr)
+ rfkill_event(rfkill);
+}
+
+#ifdef CONFIG_RFKILL_INPUT
+static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
+
+/**
+ * __rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new state
+ *
+ * This function sets the state of all switches of given type,
+ * unless a specific switch is claimed by userspace (in which case,
+ * that switch is left alone) or suspended.
+ *
+ * Caller must have acquired rfkill_global_mutex.
+ */
+static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
+{
+ struct rfkill *rfkill;
+
+ rfkill_global_states[type].cur = blocked;
+ list_for_each_entry(rfkill, &rfkill_list, node) {
+ if (rfkill->type != type && type != RFKILL_TYPE_ALL)
+ continue;
+
+ rfkill_set_block(rfkill, blocked);
+ }
+}
+
+/**
+ * rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new state
+ *
+ * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @state).
+ * Please refer to __rfkill_switch_all() for details.
+ *
+ * Does nothing if the EPO lock is active.
+ */
+void rfkill_switch_all(enum rfkill_type type, bool blocked)
+{
+ if (atomic_read(&rfkill_input_disabled))
+ return;
+
+ mutex_lock(&rfkill_global_mutex);
+
+ if (!rfkill_epo_lock_active)
+ __rfkill_switch_all(type, blocked);
+
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_epo - emergency power off all transmitters
+ *
+ * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED,
+ * ignoring everything in its path but rfkill_global_mutex and rfkill->mutex.
+ *
+ * The global state before the EPO is saved and can be restored later
+ * using rfkill_restore_states().
+ */
+void rfkill_epo(void)
+{
+ struct rfkill *rfkill;
+ int i;
+
+ if (atomic_read(&rfkill_input_disabled))
+ return;
+
+ mutex_lock(&rfkill_global_mutex);
+
+ rfkill_epo_lock_active = true;
+ list_for_each_entry(rfkill, &rfkill_list, node)
+ rfkill_set_block(rfkill, true);
+
+ for (i = 0; i < NUM_RFKILL_TYPES; i++) {
+ rfkill_global_states[i].sav = rfkill_global_states[i].cur;
+ rfkill_global_states[i].cur = true;
+ }
+
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_restore_states - restore global states
+ *
+ * Restore (and sync switches to) the global state from the
+ * states in rfkill_default_states. This can undo the effects of
+ * a call to rfkill_epo().
+ */
+void rfkill_restore_states(void)
+{
+ int i;
+
+ if (atomic_read(&rfkill_input_disabled))
+ return;
+
+ mutex_lock(&rfkill_global_mutex);
+
+ rfkill_epo_lock_active = false;
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ __rfkill_switch_all(i, rfkill_global_states[i].sav);
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_remove_epo_lock - unlock state changes
+ *
+ * Used by rfkill-input manually unlock state changes, when
+ * the EPO switch is deactivated.
+ */
+void rfkill_remove_epo_lock(void)
+{
+ if (atomic_read(&rfkill_input_disabled))
+ return;
+
+ mutex_lock(&rfkill_global_mutex);
+ rfkill_epo_lock_active = false;
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_is_epo_lock_active - returns true EPO is active
+ *
+ * Returns 0 (false) if there is NOT an active EPO contidion,
+ * and 1 (true) if there is an active EPO contition, which
+ * locks all radios in one of the BLOCKED states.
+ *
+ * Can be called in atomic context.
+ */
+bool rfkill_is_epo_lock_active(void)
+{
+ return rfkill_epo_lock_active;
+}
+
+/**
+ * rfkill_get_global_sw_state - returns global state for a type
+ * @type: the type to get the global state of
+ *
+ * Returns the current global state for a given wireless
+ * device type.
+ */
+bool rfkill_get_global_sw_state(const enum rfkill_type type)
+{
+ return rfkill_global_states[type].cur;
+}
+#endif
+
+
+bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
+{
+ bool ret, change;
+
+ ret = __rfkill_set_hw_state(rfkill, blocked, &change);
+
+ if (!rfkill->registered)
+ return ret;
+
+ if (change)
+ schedule_work(&rfkill->uevent_work);
+
+ return ret;
+}
+EXPORT_SYMBOL(rfkill_set_hw_state);
+
+static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+ u32 bit = RFKILL_BLOCK_SW;
+
+ /* if in a ops->set_block right now, use other bit */
+ if (rfkill->state & RFKILL_BLOCK_SW_SETCALL)
+ bit = RFKILL_BLOCK_SW_PREV;
+
+ if (blocked)
+ rfkill->state |= bit;
+ else
+ rfkill->state &= ~bit;
+}
+
+bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+ unsigned long flags;
+ bool prev, hwblock;
+
+ BUG_ON(!rfkill);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ prev = !!(rfkill->state & RFKILL_BLOCK_SW);
+ __rfkill_set_sw_state(rfkill, blocked);
+ hwblock = !!(rfkill->state & RFKILL_BLOCK_HW);
+ blocked = blocked || hwblock;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ if (!rfkill->registered)
+ return blocked;
+
+ if (prev != blocked && !hwblock)
+ schedule_work(&rfkill->uevent_work);
+
+ rfkill_led_trigger_event(rfkill);
+
+ return blocked;
+}
+EXPORT_SYMBOL(rfkill_set_sw_state);
+
+void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked)
+{
+ unsigned long flags;
+
+ BUG_ON(!rfkill);
+ BUG_ON(rfkill->registered);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ __rfkill_set_sw_state(rfkill, blocked);
+ rfkill->persistent = true;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+}
+EXPORT_SYMBOL(rfkill_init_sw_state);
+
+void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
+{
+ unsigned long flags;
+ bool swprev, hwprev;
+
+ BUG_ON(!rfkill);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+
+ /*
+ * No need to care about prev/setblock ... this is for uevent only
+ * and that will get triggered by rfkill_set_block anyway.
+ */
+ swprev = !!(rfkill->state & RFKILL_BLOCK_SW);
+ hwprev = !!(rfkill->state & RFKILL_BLOCK_HW);
+ __rfkill_set_sw_state(rfkill, sw);
+ if (hw)
+ rfkill->state |= RFKILL_BLOCK_HW;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_HW;
+
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ if (!rfkill->registered) {
+ rfkill->persistent = true;
+ } else {
+ if (swprev != sw || hwprev != hw)
+ schedule_work(&rfkill->uevent_work);
+
+ rfkill_led_trigger_event(rfkill);
+ }
+}
+EXPORT_SYMBOL(rfkill_set_states);
+
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%s\n", rfkill->name);
+}
+static DEVICE_ATTR_RO(name);
+
+static const char *rfkill_get_type_str(enum rfkill_type type)
+{
+ BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1);
+
+ switch (type) {
+ case RFKILL_TYPE_WLAN:
+ return "wlan";
+ case RFKILL_TYPE_BLUETOOTH:
+ return "bluetooth";
+ case RFKILL_TYPE_UWB:
+ return "ultrawideband";
+ case RFKILL_TYPE_WIMAX:
+ return "wimax";
+ case RFKILL_TYPE_WWAN:
+ return "wwan";
+ case RFKILL_TYPE_GPS:
+ return "gps";
+ case RFKILL_TYPE_FM:
+ return "fm";
+ case RFKILL_TYPE_NFC:
+ return "nfc";
+ default:
+ BUG();
+ }
+}
+
+static ssize_t type_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type));
+}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t index_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%d\n", rfkill->idx);
+}
+static DEVICE_ATTR_RO(index);
+
+static ssize_t persistent_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%d\n", rfkill->persistent);
+}
+static DEVICE_ATTR_RO(persistent);
+
+static ssize_t hard_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0 );
+}
+static DEVICE_ATTR_RO(hard);
+
+static ssize_t soft_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 1 : 0 );
+}
+
+static ssize_t soft_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+ unsigned long state;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = kstrtoul(buf, 0, &state);
+ if (err)
+ return err;
+
+ if (state > 1 )
+ return -EINVAL;
+
+ mutex_lock(&rfkill_global_mutex);
+ rfkill_set_block(rfkill, state);
+ mutex_unlock(&rfkill_global_mutex);
+
+ return count;
+}
+static DEVICE_ATTR_RW(soft);
+
+static u8 user_state_from_blocked(unsigned long state)
+{
+ if (state & RFKILL_BLOCK_HW)
+ return RFKILL_USER_STATE_HARD_BLOCKED;
+ if (state & RFKILL_BLOCK_SW)
+ return RFKILL_USER_STATE_SOFT_BLOCKED;
+
+ return RFKILL_USER_STATE_UNBLOCKED;
+}
+
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ return sprintf(buf, "%d\n", user_state_from_blocked(rfkill->state));
+}
+
+static ssize_t state_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+ unsigned long state;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = kstrtoul(buf, 0, &state);
+ if (err)
+ return err;
+
+ if (state != RFKILL_USER_STATE_SOFT_BLOCKED &&
+ state != RFKILL_USER_STATE_UNBLOCKED)
+ return -EINVAL;
+
+ mutex_lock(&rfkill_global_mutex);
+ rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED);
+ mutex_unlock(&rfkill_global_mutex);
+
+ return count;
+}
+static DEVICE_ATTR_RW(state);
+
+static ssize_t claim_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", 0);
+}
+static DEVICE_ATTR_RO(claim);
+
+static struct attribute *rfkill_dev_attrs[] = {
+ &dev_attr_name.attr,
+ &dev_attr_type.attr,
+ &dev_attr_index.attr,
+ &dev_attr_persistent.attr,
+ &dev_attr_state.attr,
+ &dev_attr_claim.attr,
+ &dev_attr_soft.attr,
+ &dev_attr_hard.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(rfkill_dev);
+
+static void rfkill_release(struct device *dev)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ kfree(rfkill);
+}
+
+static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+ unsigned long flags;
+ u32 state;
+ int error;
+
+ error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name);
+ if (error)
+ return error;
+ error = add_uevent_var(env, "RFKILL_TYPE=%s",
+ rfkill_get_type_str(rfkill->type));
+ if (error)
+ return error;
+ spin_lock_irqsave(&rfkill->lock, flags);
+ state = rfkill->state;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+ error = add_uevent_var(env, "RFKILL_STATE=%d",
+ user_state_from_blocked(state));
+ return error;
+}
+
+void rfkill_pause_polling(struct rfkill *rfkill)
+{
+ BUG_ON(!rfkill);
+
+ if (!rfkill->ops->poll)
+ return;
+
+ cancel_delayed_work_sync(&rfkill->poll_work);
+}
+EXPORT_SYMBOL(rfkill_pause_polling);
+
+void rfkill_resume_polling(struct rfkill *rfkill)
+{
+ BUG_ON(!rfkill);
+
+ if (!rfkill->ops->poll)
+ return;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &rfkill->poll_work, 0);
+}
+EXPORT_SYMBOL(rfkill_resume_polling);
+
+static int rfkill_suspend(struct device *dev, pm_message_t state)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+
+ rfkill_pause_polling(rfkill);
+
+ return 0;
+}
+
+static int rfkill_resume(struct device *dev)
+{
+ struct rfkill *rfkill = to_rfkill(dev);
+ bool cur;
+
+ if (!rfkill->persistent) {
+ cur = !!(rfkill->state & RFKILL_BLOCK_SW);
+ rfkill_set_block(rfkill, cur);
+ }
+
+ rfkill_resume_polling(rfkill);
+
+ return 0;
+}
+
+static struct class rfkill_class = {
+ .name = "rfkill",
+ .dev_release = rfkill_release,
+ .dev_groups = rfkill_dev_groups,
+ .dev_uevent = rfkill_dev_uevent,
+ .suspend = rfkill_suspend,
+ .resume = rfkill_resume,
+};
+
+bool rfkill_blocked(struct rfkill *rfkill)
+{
+ unsigned long flags;
+ u32 state;
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ state = rfkill->state;
+ spin_unlock_irqrestore(&rfkill->lock, flags);
+
+ return !!(state & RFKILL_BLOCK_ANY);
+}
+EXPORT_SYMBOL(rfkill_blocked);
+
+
+struct rfkill * __must_check rfkill_alloc(const char *name,
+ struct device *parent,
+ const enum rfkill_type type,
+ const struct rfkill_ops *ops,
+ void *ops_data)
+{
+ struct rfkill *rfkill;
+ struct device *dev;
+
+ if (WARN_ON(!ops))
+ return NULL;
+
+ if (WARN_ON(!ops->set_block))
+ return NULL;
+
+ if (WARN_ON(!name))
+ return NULL;
+
+ if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
+ return NULL;
+
+ rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+ if (!rfkill)
+ return NULL;
+
+ spin_lock_init(&rfkill->lock);
+ INIT_LIST_HEAD(&rfkill->node);
+ rfkill->type = type;
+ rfkill->name = name;
+ rfkill->ops = ops;
+ rfkill->data = ops_data;
+
+ dev = &rfkill->dev;
+ dev->class = &rfkill_class;
+ dev->parent = parent;
+ device_initialize(dev);
+
+ return rfkill;
+}
+EXPORT_SYMBOL(rfkill_alloc);
+
+static void rfkill_poll(struct work_struct *work)
+{
+ struct rfkill *rfkill;
+
+ rfkill = container_of(work, struct rfkill, poll_work.work);
+
+ /*
+ * Poll hardware state -- driver will use one of the
+ * rfkill_set{,_hw,_sw}_state functions and use its
+ * return value to update the current status.
+ */
+ rfkill->ops->poll(rfkill, rfkill->data);
+
+ queue_delayed_work(system_power_efficient_wq,
+ &rfkill->poll_work,
+ round_jiffies_relative(POLL_INTERVAL));
+}
+
+static void rfkill_uevent_work(struct work_struct *work)
+{
+ struct rfkill *rfkill;
+
+ rfkill = container_of(work, struct rfkill, uevent_work);
+
+ mutex_lock(&rfkill_global_mutex);
+ rfkill_event(rfkill);
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+static void rfkill_sync_work(struct work_struct *work)
+{
+ struct rfkill *rfkill;
+ bool cur;
+
+ rfkill = container_of(work, struct rfkill, sync_work);
+
+ mutex_lock(&rfkill_global_mutex);
+ cur = rfkill_global_states[rfkill->type].cur;
+ rfkill_set_block(rfkill, cur);
+ mutex_unlock(&rfkill_global_mutex);
+}
+
+int __must_check rfkill_register(struct rfkill *rfkill)
+{
+ static unsigned long rfkill_no;
+ struct device *dev = &rfkill->dev;
+ int error;
+
+ BUG_ON(!rfkill);
+
+ mutex_lock(&rfkill_global_mutex);
+
+ if (rfkill->registered) {
+ error = -EALREADY;
+ goto unlock;
+ }
+
+ rfkill->idx = rfkill_no;
+ dev_set_name(dev, "rfkill%lu", rfkill_no);
+ rfkill_no++;
+
+ list_add_tail(&rfkill->node, &rfkill_list);
+
+ error = device_add(dev);
+ if (error)
+ goto remove;
+
+ error = rfkill_led_trigger_register(rfkill);
+ if (error)
+ goto devdel;
+
+ rfkill->registered = true;
+
+ INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll);
+ INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work);
+ INIT_WORK(&rfkill->sync_work, rfkill_sync_work);
+
+ if (rfkill->ops->poll)
+ queue_delayed_work(system_power_efficient_wq,
+ &rfkill->poll_work,
+ round_jiffies_relative(POLL_INTERVAL));
+
+ if (!rfkill->persistent || rfkill_epo_lock_active) {
+ schedule_work(&rfkill->sync_work);
+ } else {
+#ifdef CONFIG_RFKILL_INPUT
+ bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW);
+
+ if (!atomic_read(&rfkill_input_disabled))
+ __rfkill_switch_all(rfkill->type, soft_blocked);
+#endif
+ }
+
+ rfkill_send_events(rfkill, RFKILL_OP_ADD);
+
+ mutex_unlock(&rfkill_global_mutex);
+ return 0;
+
+ devdel:
+ device_del(&rfkill->dev);
+ remove:
+ list_del_init(&rfkill->node);
+ unlock:
+ mutex_unlock(&rfkill_global_mutex);
+ return error;
+}
+EXPORT_SYMBOL(rfkill_register);
+
+void rfkill_unregister(struct rfkill *rfkill)
+{
+ BUG_ON(!rfkill);
+
+ if (rfkill->ops->poll)
+ cancel_delayed_work_sync(&rfkill->poll_work);
+
+ cancel_work_sync(&rfkill->uevent_work);
+ cancel_work_sync(&rfkill->sync_work);
+
+ rfkill->registered = false;
+
+ device_del(&rfkill->dev);
+
+ mutex_lock(&rfkill_global_mutex);
+ rfkill_send_events(rfkill, RFKILL_OP_DEL);
+ list_del_init(&rfkill->node);
+ mutex_unlock(&rfkill_global_mutex);
+
+ rfkill_led_trigger_unregister(rfkill);
+}
+EXPORT_SYMBOL(rfkill_unregister);
+
+void rfkill_destroy(struct rfkill *rfkill)
+{
+ if (rfkill)
+ put_device(&rfkill->dev);
+}
+EXPORT_SYMBOL(rfkill_destroy);
+
+static int rfkill_fop_open(struct inode *inode, struct file *file)
+{
+ struct rfkill_data *data;
+ struct rfkill *rfkill;
+ struct rfkill_int_event *ev, *tmp;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&data->events);
+ mutex_init(&data->mtx);
+ init_waitqueue_head(&data->read_wait);
+
+ mutex_lock(&rfkill_global_mutex);
+ mutex_lock(&data->mtx);
+ /*
+ * start getting events from elsewhere but hold mtx to get
+ * startup events added first
+ */
+
+ list_for_each_entry(rfkill, &rfkill_list, node) {
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev)
+ goto free;
+ rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD);
+ list_add_tail(&ev->list, &data->events);
+ }
+ list_add(&data->list, &rfkill_fds);
+ mutex_unlock(&data->mtx);
+ mutex_unlock(&rfkill_global_mutex);
+
+ file->private_data = data;
+
+ return nonseekable_open(inode, file);
+
+ free:
+ mutex_unlock(&data->mtx);
+ mutex_unlock(&rfkill_global_mutex);
+ mutex_destroy(&data->mtx);
+ list_for_each_entry_safe(ev, tmp, &data->events, list)
+ kfree(ev);
+ kfree(data);
+ return -ENOMEM;
+}
+
+static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait)
+{
+ struct rfkill_data *data = file->private_data;
+ unsigned int res = POLLOUT | POLLWRNORM;
+
+ poll_wait(file, &data->read_wait, wait);
+
+ mutex_lock(&data->mtx);
+ if (!list_empty(&data->events))
+ res = POLLIN | POLLRDNORM;
+ mutex_unlock(&data->mtx);
+
+ return res;
+}
+
+static bool rfkill_readable(struct rfkill_data *data)
+{
+ bool r;
+
+ mutex_lock(&data->mtx);
+ r = !list_empty(&data->events);
+ mutex_unlock(&data->mtx);
+
+ return r;
+}
+
+static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct rfkill_data *data = file->private_data;
+ struct rfkill_int_event *ev;
+ unsigned long sz;
+ int ret;
+
+ mutex_lock(&data->mtx);
+
+ while (list_empty(&data->events)) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ mutex_unlock(&data->mtx);
+ ret = wait_event_interruptible(data->read_wait,
+ rfkill_readable(data));
+ mutex_lock(&data->mtx);
+
+ if (ret)
+ goto out;
+ }
+
+ ev = list_first_entry(&data->events, struct rfkill_int_event,
+ list);
+
+ sz = min_t(unsigned long, sizeof(ev->ev), count);
+ ret = sz;
+ if (copy_to_user(buf, &ev->ev, sz))
+ ret = -EFAULT;
+
+ list_del(&ev->list);
+ kfree(ev);
+ out:
+ mutex_unlock(&data->mtx);
+ return ret;
+}
+
+static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct rfkill *rfkill;
+ struct rfkill_event ev;
+
+ /* we don't need the 'hard' variable but accept it */
+ if (count < RFKILL_EVENT_SIZE_V1 - 1)
+ return -EINVAL;
+
+ /*
+ * Copy as much data as we can accept into our 'ev' buffer,
+ * but tell userspace how much we've copied so it can determine
+ * our API version even in a write() call, if it cares.
+ */
+ count = min(count, sizeof(ev));
+ if (copy_from_user(&ev, buf, count))
+ return -EFAULT;
+
+ if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL)
+ return -EINVAL;
+
+ if (ev.type >= NUM_RFKILL_TYPES)
+ return -EINVAL;
+
+ mutex_lock(&rfkill_global_mutex);
+
+ if (ev.op == RFKILL_OP_CHANGE_ALL) {
+ if (ev.type == RFKILL_TYPE_ALL) {
+ enum rfkill_type i;
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ rfkill_global_states[i].cur = ev.soft;
+ } else {
+ rfkill_global_states[ev.type].cur = ev.soft;
+ }
+ }
+
+ list_for_each_entry(rfkill, &rfkill_list, node) {
+ if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL)
+ continue;
+
+ if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL)
+ continue;
+
+ rfkill_set_block(rfkill, ev.soft);
+ }
+ mutex_unlock(&rfkill_global_mutex);
+
+ return count;
+}
+
+static int rfkill_fop_release(struct inode *inode, struct file *file)
+{
+ struct rfkill_data *data = file->private_data;
+ struct rfkill_int_event *ev, *tmp;
+
+ mutex_lock(&rfkill_global_mutex);
+ list_del(&data->list);
+ mutex_unlock(&rfkill_global_mutex);
+
+ mutex_destroy(&data->mtx);
+ list_for_each_entry_safe(ev, tmp, &data->events, list)
+ kfree(ev);
+
+#ifdef CONFIG_RFKILL_INPUT
+ if (data->input_handler)
+ if (atomic_dec_return(&rfkill_input_disabled) == 0)
+ printk(KERN_DEBUG "rfkill: input handler enabled\n");
+#endif
+
+ kfree(data);
+
+ return 0;
+}
+
+#ifdef CONFIG_RFKILL_INPUT
+static long rfkill_fop_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct rfkill_data *data = file->private_data;
+
+ if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC)
+ return -ENOSYS;
+
+ if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT)
+ return -ENOSYS;
+
+ mutex_lock(&data->mtx);
+
+ if (!data->input_handler) {
+ if (atomic_inc_return(&rfkill_input_disabled) == 1)
+ printk(KERN_DEBUG "rfkill: input handler disabled\n");
+ data->input_handler = true;
+ }
+
+ mutex_unlock(&data->mtx);
+
+ return 0;
+}
+#endif
+
+static const struct file_operations rfkill_fops = {
+ .owner = THIS_MODULE,
+ .open = rfkill_fop_open,
+ .read = rfkill_fop_read,
+ .write = rfkill_fop_write,
+ .poll = rfkill_fop_poll,
+ .release = rfkill_fop_release,
+#ifdef CONFIG_RFKILL_INPUT
+ .unlocked_ioctl = rfkill_fop_ioctl,
+ .compat_ioctl = rfkill_fop_ioctl,
+#endif
+ .llseek = no_llseek,
+};
+
+static struct miscdevice rfkill_miscdev = {
+ .name = "rfkill",
+ .fops = &rfkill_fops,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init rfkill_init(void)
+{
+ int error;
+ int i;
+
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ rfkill_global_states[i].cur = !rfkill_default_state;
+
+ error = class_register(&rfkill_class);
+ if (error)
+ goto out;
+
+ error = misc_register(&rfkill_miscdev);
+ if (error) {
+ class_unregister(&rfkill_class);
+ goto out;
+ }
+
+#ifdef CONFIG_RFKILL_INPUT
+ error = rfkill_handler_init();
+ if (error) {
+ misc_deregister(&rfkill_miscdev);
+ class_unregister(&rfkill_class);
+ goto out;
+ }
+#endif
+
+ out:
+ return error;
+}
+subsys_initcall(rfkill_init);
+
+static void __exit rfkill_exit(void)
+{
+#ifdef CONFIG_RFKILL_INPUT
+ rfkill_handler_exit();
+#endif
+ misc_deregister(&rfkill_miscdev);
+ class_unregister(&rfkill_class);
+}
+module_exit(rfkill_exit);
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
new file mode 100644
index 000000000..b85107b5e
--- /dev/null
+++ b/net/rfkill/input.c
@@ -0,0 +1,348 @@
+/*
+ * Input layer to RF Kill interface connector
+ *
+ * Copyright (c) 2007 Dmitry Torokhov
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * If you ever run into a situation in which you have a SW_ type rfkill
+ * input device, then you can revive code that was removed in the patch
+ * "rfkill-input: remove unused code".
+ */
+
+#include <linux/input.h>
+#include <linux/slab.h>
+#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/init.h>
+#include <linux/rfkill.h>
+#include <linux/sched.h>
+
+#include "rfkill.h"
+
+enum rfkill_input_master_mode {
+ RFKILL_INPUT_MASTER_UNLOCK = 0,
+ RFKILL_INPUT_MASTER_RESTORE = 1,
+ RFKILL_INPUT_MASTER_UNBLOCKALL = 2,
+ NUM_RFKILL_INPUT_MASTER_MODES
+};
+
+/* Delay (in ms) between consecutive switch ops */
+#define RFKILL_OPS_DELAY 200
+
+static enum rfkill_input_master_mode rfkill_master_switch_mode =
+ RFKILL_INPUT_MASTER_UNBLOCKALL;
+module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0);
+MODULE_PARM_DESC(master_switch_mode,
+ "SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all");
+
+static spinlock_t rfkill_op_lock;
+static bool rfkill_op_pending;
+static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)];
+static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)];
+
+enum rfkill_sched_op {
+ RFKILL_GLOBAL_OP_EPO = 0,
+ RFKILL_GLOBAL_OP_RESTORE,
+ RFKILL_GLOBAL_OP_UNLOCK,
+ RFKILL_GLOBAL_OP_UNBLOCK,
+};
+
+static enum rfkill_sched_op rfkill_master_switch_op;
+static enum rfkill_sched_op rfkill_op;
+
+static void __rfkill_handle_global_op(enum rfkill_sched_op op)
+{
+ unsigned int i;
+
+ switch (op) {
+ case RFKILL_GLOBAL_OP_EPO:
+ rfkill_epo();
+ break;
+ case RFKILL_GLOBAL_OP_RESTORE:
+ rfkill_restore_states();
+ break;
+ case RFKILL_GLOBAL_OP_UNLOCK:
+ rfkill_remove_epo_lock();
+ break;
+ case RFKILL_GLOBAL_OP_UNBLOCK:
+ rfkill_remove_epo_lock();
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ rfkill_switch_all(i, false);
+ break;
+ default:
+ /* memory corruption or bug, fail safely */
+ rfkill_epo();
+ WARN(1, "Unknown requested operation %d! "
+ "rfkill Emergency Power Off activated\n",
+ op);
+ }
+}
+
+static void __rfkill_handle_normal_op(const enum rfkill_type type,
+ const bool complement)
+{
+ bool blocked;
+
+ blocked = rfkill_get_global_sw_state(type);
+ if (complement)
+ blocked = !blocked;
+
+ rfkill_switch_all(type, blocked);
+}
+
+static void rfkill_op_handler(struct work_struct *work)
+{
+ unsigned int i;
+ bool c;
+
+ spin_lock_irq(&rfkill_op_lock);
+ do {
+ if (rfkill_op_pending) {
+ enum rfkill_sched_op op = rfkill_op;
+ rfkill_op_pending = false;
+ memset(rfkill_sw_pending, 0,
+ sizeof(rfkill_sw_pending));
+ spin_unlock_irq(&rfkill_op_lock);
+
+ __rfkill_handle_global_op(op);
+
+ spin_lock_irq(&rfkill_op_lock);
+
+ /*
+ * handle global ops first -- during unlocked period
+ * we might have gotten a new global op.
+ */
+ if (rfkill_op_pending)
+ continue;
+ }
+
+ if (rfkill_is_epo_lock_active())
+ continue;
+
+ for (i = 0; i < NUM_RFKILL_TYPES; i++) {
+ if (__test_and_clear_bit(i, rfkill_sw_pending)) {
+ c = __test_and_clear_bit(i, rfkill_sw_state);
+ spin_unlock_irq(&rfkill_op_lock);
+
+ __rfkill_handle_normal_op(i, c);
+
+ spin_lock_irq(&rfkill_op_lock);
+ }
+ }
+ } while (rfkill_op_pending);
+ spin_unlock_irq(&rfkill_op_lock);
+}
+
+static DECLARE_DELAYED_WORK(rfkill_op_work, rfkill_op_handler);
+static unsigned long rfkill_last_scheduled;
+
+static unsigned long rfkill_ratelimit(const unsigned long last)
+{
+ const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
+ return time_after(jiffies, last + delay) ? 0 : delay;
+}
+
+static void rfkill_schedule_ratelimited(void)
+{
+ if (schedule_delayed_work(&rfkill_op_work,
+ rfkill_ratelimit(rfkill_last_scheduled)))
+ rfkill_last_scheduled = jiffies;
+}
+
+static void rfkill_schedule_global_op(enum rfkill_sched_op op)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rfkill_op_lock, flags);
+ rfkill_op = op;
+ rfkill_op_pending = true;
+ if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) {
+ /* bypass the limiter for EPO */
+ mod_delayed_work(system_wq, &rfkill_op_work, 0);
+ rfkill_last_scheduled = jiffies;
+ } else
+ rfkill_schedule_ratelimited();
+ spin_unlock_irqrestore(&rfkill_op_lock, flags);
+}
+
+static void rfkill_schedule_toggle(enum rfkill_type type)
+{
+ unsigned long flags;
+
+ if (rfkill_is_epo_lock_active())
+ return;
+
+ spin_lock_irqsave(&rfkill_op_lock, flags);
+ if (!rfkill_op_pending) {
+ __set_bit(type, rfkill_sw_pending);
+ __change_bit(type, rfkill_sw_state);
+ rfkill_schedule_ratelimited();
+ }
+ spin_unlock_irqrestore(&rfkill_op_lock, flags);
+}
+
+static void rfkill_schedule_evsw_rfkillall(int state)
+{
+ if (state)
+ rfkill_schedule_global_op(rfkill_master_switch_op);
+ else
+ rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO);
+}
+
+static void rfkill_event(struct input_handle *handle, unsigned int type,
+ unsigned int code, int data)
+{
+ if (type == EV_KEY && data == 1) {
+ switch (code) {
+ case KEY_WLAN:
+ rfkill_schedule_toggle(RFKILL_TYPE_WLAN);
+ break;
+ case KEY_BLUETOOTH:
+ rfkill_schedule_toggle(RFKILL_TYPE_BLUETOOTH);
+ break;
+ case KEY_UWB:
+ rfkill_schedule_toggle(RFKILL_TYPE_UWB);
+ break;
+ case KEY_WIMAX:
+ rfkill_schedule_toggle(RFKILL_TYPE_WIMAX);
+ break;
+ case KEY_RFKILL:
+ rfkill_schedule_toggle(RFKILL_TYPE_ALL);
+ break;
+ }
+ } else if (type == EV_SW && code == SW_RFKILL_ALL)
+ rfkill_schedule_evsw_rfkillall(data);
+}
+
+static int rfkill_connect(struct input_handler *handler, struct input_dev *dev,
+ const struct input_device_id *id)
+{
+ struct input_handle *handle;
+ int error;
+
+ handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
+ if (!handle)
+ return -ENOMEM;
+
+ handle->dev = dev;
+ handle->handler = handler;
+ handle->name = "rfkill";
+
+ /* causes rfkill_start() to be called */
+ error = input_register_handle(handle);
+ if (error)
+ goto err_free_handle;
+
+ error = input_open_device(handle);
+ if (error)
+ goto err_unregister_handle;
+
+ return 0;
+
+ err_unregister_handle:
+ input_unregister_handle(handle);
+ err_free_handle:
+ kfree(handle);
+ return error;
+}
+
+static void rfkill_start(struct input_handle *handle)
+{
+ /*
+ * Take event_lock to guard against configuration changes, we
+ * should be able to deal with concurrency with rfkill_event()
+ * just fine (which event_lock will also avoid).
+ */
+ spin_lock_irq(&handle->dev->event_lock);
+
+ if (test_bit(EV_SW, handle->dev->evbit) &&
+ test_bit(SW_RFKILL_ALL, handle->dev->swbit))
+ rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL,
+ handle->dev->sw));
+
+ spin_unlock_irq(&handle->dev->event_lock);
+}
+
+static void rfkill_disconnect(struct input_handle *handle)
+{
+ input_close_device(handle);
+ input_unregister_handle(handle);
+ kfree(handle);
+}
+
+static const struct input_device_id rfkill_ids[] = {
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ .keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) },
+ },
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ .keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) },
+ },
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ .keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) },
+ },
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ .keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) },
+ },
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+ .evbit = { BIT_MASK(EV_KEY) },
+ .keybit = { [BIT_WORD(KEY_RFKILL)] = BIT_MASK(KEY_RFKILL) },
+ },
+ {
+ .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT,
+ .evbit = { BIT(EV_SW) },
+ .swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) },
+ },
+ { }
+};
+
+static struct input_handler rfkill_handler = {
+ .name = "rfkill",
+ .event = rfkill_event,
+ .connect = rfkill_connect,
+ .start = rfkill_start,
+ .disconnect = rfkill_disconnect,
+ .id_table = rfkill_ids,
+};
+
+int __init rfkill_handler_init(void)
+{
+ switch (rfkill_master_switch_mode) {
+ case RFKILL_INPUT_MASTER_UNBLOCKALL:
+ rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNBLOCK;
+ break;
+ case RFKILL_INPUT_MASTER_RESTORE:
+ rfkill_master_switch_op = RFKILL_GLOBAL_OP_RESTORE;
+ break;
+ case RFKILL_INPUT_MASTER_UNLOCK:
+ rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNLOCK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_init(&rfkill_op_lock);
+
+ /* Avoid delay at first schedule */
+ rfkill_last_scheduled =
+ jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1;
+ return input_register_handler(&rfkill_handler);
+}
+
+void __exit rfkill_handler_exit(void)
+{
+ input_unregister_handler(&rfkill_handler);
+ cancel_delayed_work_sync(&rfkill_op_work);
+}
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
new file mode 100644
index 000000000..d978f2f46
--- /dev/null
+++ b/net/rfkill/rfkill-gpio.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2011, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rfkill.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/slab.h>
+#include <linux/acpi.h>
+#include <linux/gpio/consumer.h>
+
+#include <linux/rfkill-gpio.h>
+
+struct rfkill_gpio_data {
+ const char *name;
+ enum rfkill_type type;
+ struct gpio_desc *reset_gpio;
+ struct gpio_desc *shutdown_gpio;
+
+ struct rfkill *rfkill_dev;
+ struct clk *clk;
+
+ bool clk_enabled;
+};
+
+static int rfkill_gpio_set_power(void *data, bool blocked)
+{
+ struct rfkill_gpio_data *rfkill = data;
+
+ if (!blocked && !IS_ERR(rfkill->clk) && !rfkill->clk_enabled)
+ clk_enable(rfkill->clk);
+
+ gpiod_set_value_cansleep(rfkill->shutdown_gpio, !blocked);
+ gpiod_set_value_cansleep(rfkill->reset_gpio, !blocked);
+
+ if (blocked && !IS_ERR(rfkill->clk) && rfkill->clk_enabled)
+ clk_disable(rfkill->clk);
+
+ rfkill->clk_enabled = !blocked;
+
+ return 0;
+}
+
+static const struct rfkill_ops rfkill_gpio_ops = {
+ .set_block = rfkill_gpio_set_power,
+};
+
+static const struct acpi_gpio_params reset_gpios = { 0, 0, false };
+static const struct acpi_gpio_params shutdown_gpios = { 1, 0, false };
+
+static const struct acpi_gpio_mapping acpi_rfkill_default_gpios[] = {
+ { "reset-gpios", &reset_gpios, 1 },
+ { "shutdown-gpios", &shutdown_gpios, 1 },
+ { },
+};
+
+static int rfkill_gpio_acpi_probe(struct device *dev,
+ struct rfkill_gpio_data *rfkill)
+{
+ const struct acpi_device_id *id;
+
+ id = acpi_match_device(dev->driver->acpi_match_table, dev);
+ if (!id)
+ return -ENODEV;
+
+ rfkill->name = dev_name(dev);
+ rfkill->type = (unsigned)id->driver_data;
+
+ return acpi_dev_add_driver_gpios(ACPI_COMPANION(dev),
+ acpi_rfkill_default_gpios);
+}
+
+static int rfkill_gpio_probe(struct platform_device *pdev)
+{
+ struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
+ struct rfkill_gpio_data *rfkill;
+ struct gpio_desc *gpio;
+ int ret;
+
+ rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL);
+ if (!rfkill)
+ return -ENOMEM;
+
+ if (ACPI_HANDLE(&pdev->dev)) {
+ ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill);
+ if (ret)
+ return ret;
+ } else if (pdata) {
+ rfkill->name = pdata->name;
+ rfkill->type = pdata->type;
+ } else {
+ return -ENODEV;
+ }
+
+ rfkill->clk = devm_clk_get(&pdev->dev, NULL);
+
+ gpio = devm_gpiod_get(&pdev->dev, "reset");
+ if (!IS_ERR(gpio)) {
+ ret = gpiod_direction_output(gpio, 0);
+ if (ret)
+ return ret;
+ rfkill->reset_gpio = gpio;
+ }
+
+ gpio = devm_gpiod_get(&pdev->dev, "shutdown");
+ if (!IS_ERR(gpio)) {
+ ret = gpiod_direction_output(gpio, 0);
+ if (ret)
+ return ret;
+ rfkill->shutdown_gpio = gpio;
+ }
+
+ /* Make sure at-least one of the GPIO is defined and that
+ * a name is specified for this instance
+ */
+ if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) {
+ dev_err(&pdev->dev, "invalid platform data\n");
+ return -EINVAL;
+ }
+
+ rfkill->rfkill_dev = rfkill_alloc(rfkill->name, &pdev->dev,
+ rfkill->type, &rfkill_gpio_ops,
+ rfkill);
+ if (!rfkill->rfkill_dev)
+ return -ENOMEM;
+
+ ret = rfkill_register(rfkill->rfkill_dev);
+ if (ret < 0)
+ return ret;
+
+ platform_set_drvdata(pdev, rfkill);
+
+ dev_info(&pdev->dev, "%s device registered.\n", rfkill->name);
+
+ return 0;
+}
+
+static int rfkill_gpio_remove(struct platform_device *pdev)
+{
+ struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev);
+
+ rfkill_unregister(rfkill->rfkill_dev);
+ rfkill_destroy(rfkill->rfkill_dev);
+
+ acpi_dev_remove_driver_gpios(ACPI_COMPANION(&pdev->dev));
+
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id rfkill_acpi_match[] = {
+ { "BCM2E1A", RFKILL_TYPE_BLUETOOTH },
+ { "BCM2E39", RFKILL_TYPE_BLUETOOTH },
+ { "BCM2E3D", RFKILL_TYPE_BLUETOOTH },
+ { "BCM2E40", RFKILL_TYPE_BLUETOOTH },
+ { "BCM2E64", RFKILL_TYPE_BLUETOOTH },
+ { "BCM4752", RFKILL_TYPE_GPS },
+ { "LNV4752", RFKILL_TYPE_GPS },
+ { },
+};
+MODULE_DEVICE_TABLE(acpi, rfkill_acpi_match);
+#endif
+
+static struct platform_driver rfkill_gpio_driver = {
+ .probe = rfkill_gpio_probe,
+ .remove = rfkill_gpio_remove,
+ .driver = {
+ .name = "rfkill_gpio",
+ .acpi_match_table = ACPI_PTR(rfkill_acpi_match),
+ },
+};
+
+module_platform_driver(rfkill_gpio_driver);
+
+MODULE_DESCRIPTION("gpio rfkill");
+MODULE_AUTHOR("NVIDIA");
+MODULE_LICENSE("GPL");
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
new file mode 100644
index 000000000..50cd26a48
--- /dev/null
+++ b/net/rfkill/rfkill-regulator.c
@@ -0,0 +1,154 @@
+/*
+ * rfkill-regulator.c - Regulator consumer driver for rfkill
+ *
+ * Copyright (C) 2009 Guiming Zhuo <gmzhuo@gmail.com>
+ * Copyright (C) 2011 Antonio Ospite <ospite@studenti.unina.it>
+ *
+ * Implementation inspired by leds-regulator driver.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/rfkill.h>
+#include <linux/rfkill-regulator.h>
+
+struct rfkill_regulator_data {
+ struct rfkill *rf_kill;
+ bool reg_enabled;
+
+ struct regulator *vcc;
+};
+
+static int rfkill_regulator_set_block(void *data, bool blocked)
+{
+ struct rfkill_regulator_data *rfkill_data = data;
+ int ret = 0;
+
+ pr_debug("%s: blocked: %d\n", __func__, blocked);
+
+ if (blocked) {
+ if (rfkill_data->reg_enabled) {
+ regulator_disable(rfkill_data->vcc);
+ rfkill_data->reg_enabled = false;
+ }
+ } else {
+ if (!rfkill_data->reg_enabled) {
+ ret = regulator_enable(rfkill_data->vcc);
+ if (!ret)
+ rfkill_data->reg_enabled = true;
+ }
+ }
+
+ pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
+ regulator_is_enabled(rfkill_data->vcc));
+
+ return ret;
+}
+
+static struct rfkill_ops rfkill_regulator_ops = {
+ .set_block = rfkill_regulator_set_block,
+};
+
+static int rfkill_regulator_probe(struct platform_device *pdev)
+{
+ struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
+ struct rfkill_regulator_data *rfkill_data;
+ struct regulator *vcc;
+ struct rfkill *rf_kill;
+ int ret = 0;
+
+ if (pdata == NULL) {
+ dev_err(&pdev->dev, "no platform data\n");
+ return -ENODEV;
+ }
+
+ if (pdata->name == NULL || pdata->type == 0) {
+ dev_err(&pdev->dev, "invalid name or type in platform data\n");
+ return -EINVAL;
+ }
+
+ vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
+ if (IS_ERR(vcc)) {
+ dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
+ ret = PTR_ERR(vcc);
+ goto out;
+ }
+
+ rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
+ if (rfkill_data == NULL) {
+ ret = -ENOMEM;
+ goto err_data_alloc;
+ }
+
+ rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
+ pdata->type,
+ &rfkill_regulator_ops, rfkill_data);
+ if (rf_kill == NULL) {
+ ret = -ENOMEM;
+ goto err_rfkill_alloc;
+ }
+
+ if (regulator_is_enabled(vcc)) {
+ dev_dbg(&pdev->dev, "Regulator already enabled\n");
+ rfkill_data->reg_enabled = true;
+ }
+ rfkill_data->vcc = vcc;
+ rfkill_data->rf_kill = rf_kill;
+
+ ret = rfkill_register(rf_kill);
+ if (ret) {
+ dev_err(&pdev->dev, "Cannot register rfkill device\n");
+ goto err_rfkill_register;
+ }
+
+ platform_set_drvdata(pdev, rfkill_data);
+ dev_info(&pdev->dev, "%s initialized\n", pdata->name);
+
+ return 0;
+
+err_rfkill_register:
+ rfkill_destroy(rf_kill);
+err_rfkill_alloc:
+ kfree(rfkill_data);
+err_data_alloc:
+ regulator_put(vcc);
+out:
+ return ret;
+}
+
+static int rfkill_regulator_remove(struct platform_device *pdev)
+{
+ struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
+ struct rfkill *rf_kill = rfkill_data->rf_kill;
+
+ rfkill_unregister(rf_kill);
+ rfkill_destroy(rf_kill);
+ regulator_put(rfkill_data->vcc);
+ kfree(rfkill_data);
+
+ return 0;
+}
+
+static struct platform_driver rfkill_regulator_driver = {
+ .probe = rfkill_regulator_probe,
+ .remove = rfkill_regulator_remove,
+ .driver = {
+ .name = "rfkill-regulator",
+ },
+};
+
+module_platform_driver(rfkill_regulator_driver);
+
+MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
+MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
+MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rfkill/rfkill.h b/net/rfkill/rfkill.h
new file mode 100644
index 000000000..d1117cb6e
--- /dev/null
+++ b/net/rfkill/rfkill.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2007 Ivo van Doorn
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef __RFKILL_INPUT_H
+#define __RFKILL_INPUT_H
+
+/* core code */
+void rfkill_switch_all(const enum rfkill_type type, bool blocked);
+void rfkill_epo(void);
+void rfkill_restore_states(void);
+void rfkill_remove_epo_lock(void);
+bool rfkill_is_epo_lock_active(void);
+bool rfkill_get_global_sw_state(const enum rfkill_type type);
+
+/* input handler */
+int rfkill_handler_init(void);
+void rfkill_handler_exit(void);
+
+#endif /* __RFKILL_INPUT_H */
diff --git a/net/rose/Makefile b/net/rose/Makefile
new file mode 100644
index 000000000..fa248116f
--- /dev/null
+++ b/net/rose/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux Rose (X.25 PLP) layer.
+#
+
+obj-$(CONFIG_ROSE) += rose.o
+
+rose-y := af_rose.o rose_dev.o rose_in.o rose_link.o rose_loopback.o \
+ rose_out.o rose_route.o rose_subr.o rose_timer.o
+rose-$(CONFIG_SYSCTL) += sysctl_net_rose.o
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
new file mode 100644
index 000000000..8ae603069
--- /dev/null
+++ b/net/rose/af_rose.c
@@ -0,0 +1,1636 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
+ * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/stat.h>
+#include <net/net_namespace.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/rose.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/tcp_states.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+static int rose_ndevs = 10;
+
+int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0;
+int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1;
+int sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2;
+int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3;
+int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE;
+int sysctl_rose_ack_hold_back_timeout = ROSE_DEFAULT_HB;
+int sysctl_rose_routing_control = ROSE_DEFAULT_ROUTING;
+int sysctl_rose_link_fail_timeout = ROSE_DEFAULT_FAIL_TIMEOUT;
+int sysctl_rose_maximum_vcs = ROSE_DEFAULT_MAXVC;
+int sysctl_rose_window_size = ROSE_DEFAULT_WINDOW_SIZE;
+
+static HLIST_HEAD(rose_list);
+static DEFINE_SPINLOCK(rose_list_lock);
+
+static const struct proto_ops rose_proto_ops;
+
+ax25_address rose_callsign;
+
+/*
+ * ROSE network devices are virtual network devices encapsulating ROSE
+ * frames into AX.25 which will be sent through an AX.25 device, so form a
+ * special "super class" of normal net devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key rose_netdev_xmit_lock_key;
+static struct lock_class_key rose_netdev_addr_lock_key;
+
+static void rose_set_lockdep_one(struct net_device *dev,
+ struct netdev_queue *txq,
+ void *_unused)
+{
+ lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key);
+}
+
+static void rose_set_lockdep_key(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key);
+ netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL);
+}
+
+/*
+ * Convert a ROSE address into text.
+ */
+char *rose2asc(char *buf, const rose_address *addr)
+{
+ if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 &&
+ addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 &&
+ addr->rose_addr[4] == 0x00) {
+ strcpy(buf, "*");
+ } else {
+ sprintf(buf, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF,
+ addr->rose_addr[1] & 0xFF,
+ addr->rose_addr[2] & 0xFF,
+ addr->rose_addr[3] & 0xFF,
+ addr->rose_addr[4] & 0xFF);
+ }
+
+ return buf;
+}
+
+/*
+ * Compare two ROSE addresses, 0 == equal.
+ */
+int rosecmp(rose_address *addr1, rose_address *addr2)
+{
+ int i;
+
+ for (i = 0; i < 5; i++)
+ if (addr1->rose_addr[i] != addr2->rose_addr[i])
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Compare two ROSE addresses for only mask digits, 0 == equal.
+ */
+int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask)
+{
+ unsigned int i, j;
+
+ if (mask > 10)
+ return 1;
+
+ for (i = 0; i < mask; i++) {
+ j = i / 2;
+
+ if ((i % 2) != 0) {
+ if ((addr1->rose_addr[j] & 0x0F) != (addr2->rose_addr[j] & 0x0F))
+ return 1;
+ } else {
+ if ((addr1->rose_addr[j] & 0xF0) != (addr2->rose_addr[j] & 0xF0))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Socket removal during an interrupt is now safe.
+ */
+static void rose_remove_socket(struct sock *sk)
+{
+ spin_lock_bh(&rose_list_lock);
+ sk_del_node_init(sk);
+ spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ * Kill all bound sockets on a broken link layer connection to a
+ * particular neighbour.
+ */
+void rose_kill_by_neigh(struct rose_neigh *neigh)
+{
+ struct sock *s;
+
+ spin_lock_bh(&rose_list_lock);
+ sk_for_each(s, &rose_list) {
+ struct rose_sock *rose = rose_sk(s);
+
+ if (rose->neighbour == neigh) {
+ rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
+ rose->neighbour->use--;
+ rose->neighbour = NULL;
+ }
+ }
+ spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ * Kill all bound sockets on a dropped device.
+ */
+static void rose_kill_by_device(struct net_device *dev)
+{
+ struct sock *s;
+
+ spin_lock_bh(&rose_list_lock);
+ sk_for_each(s, &rose_list) {
+ struct rose_sock *rose = rose_sk(s);
+
+ if (rose->device == dev) {
+ rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
+ rose->neighbour->use--;
+ rose->device = NULL;
+ }
+ }
+ spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ * Handle device status changes.
+ */
+static int rose_device_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ switch (dev->type) {
+ case ARPHRD_ROSE:
+ rose_kill_by_device(dev);
+ break;
+ case ARPHRD_AX25:
+ rose_link_device_down(dev);
+ rose_rt_device_down(dev);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Add a socket to the bound sockets list.
+ */
+static void rose_insert_socket(struct sock *sk)
+{
+
+ spin_lock_bh(&rose_list_lock);
+ sk_add_node(sk, &rose_list);
+ spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ * Find a socket that wants to accept the Call Request we just
+ * received.
+ */
+static struct sock *rose_find_listener(rose_address *addr, ax25_address *call)
+{
+ struct sock *s;
+
+ spin_lock_bh(&rose_list_lock);
+ sk_for_each(s, &rose_list) {
+ struct rose_sock *rose = rose_sk(s);
+
+ if (!rosecmp(&rose->source_addr, addr) &&
+ !ax25cmp(&rose->source_call, call) &&
+ !rose->source_ndigis && s->sk_state == TCP_LISTEN)
+ goto found;
+ }
+
+ sk_for_each(s, &rose_list) {
+ struct rose_sock *rose = rose_sk(s);
+
+ if (!rosecmp(&rose->source_addr, addr) &&
+ !ax25cmp(&rose->source_call, &null_ax25_address) &&
+ s->sk_state == TCP_LISTEN)
+ goto found;
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&rose_list_lock);
+ return s;
+}
+
+/*
+ * Find a connected ROSE socket given my LCI and device.
+ */
+struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh)
+{
+ struct sock *s;
+
+ spin_lock_bh(&rose_list_lock);
+ sk_for_each(s, &rose_list) {
+ struct rose_sock *rose = rose_sk(s);
+
+ if (rose->lci == lci && rose->neighbour == neigh)
+ goto found;
+ }
+ s = NULL;
+found:
+ spin_unlock_bh(&rose_list_lock);
+ return s;
+}
+
+/*
+ * Find a unique LCI for a given device.
+ */
+unsigned int rose_new_lci(struct rose_neigh *neigh)
+{
+ int lci;
+
+ if (neigh->dce_mode) {
+ for (lci = 1; lci <= sysctl_rose_maximum_vcs; lci++)
+ if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL)
+ return lci;
+ } else {
+ for (lci = sysctl_rose_maximum_vcs; lci > 0; lci--)
+ if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL)
+ return lci;
+ }
+
+ return 0;
+}
+
+/*
+ * Deferred destroy.
+ */
+void rose_destroy_socket(struct sock *);
+
+/*
+ * Handler for deferred kills.
+ */
+static void rose_destroy_timer(unsigned long data)
+{
+ rose_destroy_socket((struct sock *)data);
+}
+
+/*
+ * This is called from user mode and the timers. Thus it protects itself
+ * against interrupt users but doesn't worry about being called during
+ * work. Once it is removed from the queue no interrupt or bottom half
+ * will touch it and we are (fairly 8-) ) safe.
+ */
+void rose_destroy_socket(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ rose_remove_socket(sk);
+ rose_stop_heartbeat(sk);
+ rose_stop_idletimer(sk);
+ rose_stop_timer(sk);
+
+ rose_clear_queues(sk); /* Flush the queues */
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (skb->sk != sk) { /* A pending connection */
+ /* Queue the unaccepted socket for death */
+ sock_set_flag(skb->sk, SOCK_DEAD);
+ rose_start_heartbeat(skb->sk);
+ rose_sk(skb->sk)->state = ROSE_STATE_0;
+ }
+
+ kfree_skb(skb);
+ }
+
+ if (sk_has_allocations(sk)) {
+ /* Defer: outstanding buffers */
+ setup_timer(&sk->sk_timer, rose_destroy_timer,
+ (unsigned long)sk);
+ sk->sk_timer.expires = jiffies + 10 * HZ;
+ add_timer(&sk->sk_timer);
+ } else
+ sock_put(sk);
+}
+
+/*
+ * Handling for system calls applied via the various interfaces to a
+ * ROSE socket object.
+ */
+
+static int rose_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ int opt;
+
+ if (level != SOL_ROSE)
+ return -ENOPROTOOPT;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(opt, (int __user *)optval))
+ return -EFAULT;
+
+ switch (optname) {
+ case ROSE_DEFER:
+ rose->defer = opt ? 1 : 0;
+ return 0;
+
+ case ROSE_T1:
+ if (opt < 1)
+ return -EINVAL;
+ rose->t1 = opt * HZ;
+ return 0;
+
+ case ROSE_T2:
+ if (opt < 1)
+ return -EINVAL;
+ rose->t2 = opt * HZ;
+ return 0;
+
+ case ROSE_T3:
+ if (opt < 1)
+ return -EINVAL;
+ rose->t3 = opt * HZ;
+ return 0;
+
+ case ROSE_HOLDBACK:
+ if (opt < 1)
+ return -EINVAL;
+ rose->hb = opt * HZ;
+ return 0;
+
+ case ROSE_IDLE:
+ if (opt < 0)
+ return -EINVAL;
+ rose->idle = opt * 60 * HZ;
+ return 0;
+
+ case ROSE_QBITINCL:
+ rose->qbitincl = opt ? 1 : 0;
+ return 0;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int rose_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ int val = 0;
+ int len;
+
+ if (level != SOL_ROSE)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case ROSE_DEFER:
+ val = rose->defer;
+ break;
+
+ case ROSE_T1:
+ val = rose->t1 / HZ;
+ break;
+
+ case ROSE_T2:
+ val = rose->t2 / HZ;
+ break;
+
+ case ROSE_T3:
+ val = rose->t3 / HZ;
+ break;
+
+ case ROSE_HOLDBACK:
+ val = rose->hb / HZ;
+ break;
+
+ case ROSE_IDLE:
+ val = rose->idle / (60 * HZ);
+ break;
+
+ case ROSE_QBITINCL:
+ val = rose->qbitincl;
+ break;
+
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return copy_to_user(optval, &val, len) ? -EFAULT : 0;
+}
+
+static int rose_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_state != TCP_LISTEN) {
+ struct rose_sock *rose = rose_sk(sk);
+
+ rose->dest_ndigis = 0;
+ memset(&rose->dest_addr, 0, ROSE_ADDR_LEN);
+ memset(&rose->dest_call, 0, AX25_ADDR_LEN);
+ memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS);
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ return 0;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static struct proto rose_proto = {
+ .name = "ROSE",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rose_sock),
+};
+
+static int rose_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct rose_sock *rose;
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ if (sock->type != SOCK_SEQPACKET || protocol != 0)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ rose = rose_sk(sk);
+
+ sock_init_data(sock, sk);
+
+ skb_queue_head_init(&rose->ack_queue);
+#ifdef M_BIT
+ skb_queue_head_init(&rose->frag_queue);
+ rose->fraglen = 0;
+#endif
+
+ sock->ops = &rose_proto_ops;
+ sk->sk_protocol = protocol;
+
+ init_timer(&rose->timer);
+ init_timer(&rose->idletimer);
+
+ rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout);
+ rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout);
+ rose->t3 = msecs_to_jiffies(sysctl_rose_clear_request_timeout);
+ rose->hb = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout);
+ rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout);
+
+ rose->state = ROSE_STATE_0;
+
+ return 0;
+}
+
+static struct sock *rose_make_new(struct sock *osk)
+{
+ struct sock *sk;
+ struct rose_sock *rose, *orose;
+
+ if (osk->sk_type != SOCK_SEQPACKET)
+ return NULL;
+
+ sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto);
+ if (sk == NULL)
+ return NULL;
+
+ rose = rose_sk(sk);
+
+ sock_init_data(NULL, sk);
+
+ skb_queue_head_init(&rose->ack_queue);
+#ifdef M_BIT
+ skb_queue_head_init(&rose->frag_queue);
+ rose->fraglen = 0;
+#endif
+
+ sk->sk_type = osk->sk_type;
+ sk->sk_priority = osk->sk_priority;
+ sk->sk_protocol = osk->sk_protocol;
+ sk->sk_rcvbuf = osk->sk_rcvbuf;
+ sk->sk_sndbuf = osk->sk_sndbuf;
+ sk->sk_state = TCP_ESTABLISHED;
+ sock_copy_flags(sk, osk);
+
+ init_timer(&rose->timer);
+ init_timer(&rose->idletimer);
+
+ orose = rose_sk(osk);
+ rose->t1 = orose->t1;
+ rose->t2 = orose->t2;
+ rose->t3 = orose->t3;
+ rose->hb = orose->hb;
+ rose->idle = orose->idle;
+ rose->defer = orose->defer;
+ rose->device = orose->device;
+ rose->qbitincl = orose->qbitincl;
+
+ return sk;
+}
+
+static int rose_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose;
+
+ if (sk == NULL) return 0;
+
+ sock_hold(sk);
+ sock_orphan(sk);
+ lock_sock(sk);
+ rose = rose_sk(sk);
+
+ switch (rose->state) {
+ case ROSE_STATE_0:
+ release_sock(sk);
+ rose_disconnect(sk, 0, -1, -1);
+ lock_sock(sk);
+ rose_destroy_socket(sk);
+ break;
+
+ case ROSE_STATE_2:
+ rose->neighbour->use--;
+ release_sock(sk);
+ rose_disconnect(sk, 0, -1, -1);
+ lock_sock(sk);
+ rose_destroy_socket(sk);
+ break;
+
+ case ROSE_STATE_1:
+ case ROSE_STATE_3:
+ case ROSE_STATE_4:
+ case ROSE_STATE_5:
+ rose_clear_queues(sk);
+ rose_stop_idletimer(sk);
+ rose_write_internal(sk, ROSE_CLEAR_REQUEST);
+ rose_start_t3timer(sk);
+ rose->state = ROSE_STATE_2;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_set_flag(sk, SOCK_DESTROY);
+ break;
+
+ default:
+ break;
+ }
+
+ sock->sk = NULL;
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
+ struct net_device *dev;
+ ax25_address *source;
+ ax25_uid_assoc *user;
+ int n;
+
+ if (!sock_flag(sk, SOCK_ZAPPED))
+ return -EINVAL;
+
+ if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
+ return -EINVAL;
+
+ if (addr->srose_family != AF_ROSE)
+ return -EINVAL;
+
+ if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
+ return -EINVAL;
+
+ if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
+ return -EINVAL;
+
+ if ((dev = rose_dev_get(&addr->srose_addr)) == NULL)
+ return -EADDRNOTAVAIL;
+
+ source = &addr->srose_call;
+
+ user = ax25_findbyuid(current_euid());
+ if (user) {
+ rose->source_call = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+ rose->source_call = *source;
+ }
+
+ rose->source_addr = addr->srose_addr;
+ rose->device = dev;
+ rose->source_ndigis = addr->srose_ndigis;
+
+ if (addr_len == sizeof(struct full_sockaddr_rose)) {
+ struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
+ for (n = 0 ; n < addr->srose_ndigis ; n++)
+ rose->source_digis[n] = full_addr->srose_digis[n];
+ } else {
+ if (rose->source_ndigis == 1) {
+ rose->source_digis[0] = addr->srose_digi;
+ }
+ }
+
+ rose_insert_socket(sk);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ return 0;
+}
+
+static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
+ unsigned char cause, diagnostic;
+ struct net_device *dev;
+ ax25_uid_assoc *user;
+ int n, err = 0;
+
+ if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
+ return -EINVAL;
+
+ if (addr->srose_family != AF_ROSE)
+ return -EINVAL;
+
+ if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
+ return -EINVAL;
+
+ if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
+ return -EINVAL;
+
+ /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
+ if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+ /* Connect completed during a ERESTARTSYS event */
+ sock->state = SS_CONNECTED;
+ goto out_release;
+ }
+
+ if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+ sock->state = SS_UNCONNECTED;
+ err = -ECONNREFUSED;
+ goto out_release;
+ }
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ /* No reconnect on a seqpacket socket */
+ err = -EISCONN;
+ goto out_release;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause,
+ &diagnostic, 0);
+ if (!rose->neighbour) {
+ err = -ENETUNREACH;
+ goto out_release;
+ }
+
+ rose->lci = rose_new_lci(rose->neighbour);
+ if (!rose->lci) {
+ err = -ENETUNREACH;
+ goto out_release;
+ }
+
+ if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ if ((dev = rose_dev_first()) == NULL) {
+ err = -ENETUNREACH;
+ goto out_release;
+ }
+
+ user = ax25_findbyuid(current_euid());
+ if (!user) {
+ err = -EINVAL;
+ goto out_release;
+ }
+
+ memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
+ rose->source_call = user->call;
+ rose->device = dev;
+ ax25_uid_put(user);
+
+ rose_insert_socket(sk); /* Finish the bind */
+ }
+ rose->dest_addr = addr->srose_addr;
+ rose->dest_call = addr->srose_call;
+ rose->rand = ((long)rose & 0xFFFF) + rose->lci;
+ rose->dest_ndigis = addr->srose_ndigis;
+
+ if (addr_len == sizeof(struct full_sockaddr_rose)) {
+ struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
+ for (n = 0 ; n < addr->srose_ndigis ; n++)
+ rose->dest_digis[n] = full_addr->srose_digis[n];
+ } else {
+ if (rose->dest_ndigis == 1) {
+ rose->dest_digis[0] = addr->srose_digi;
+ }
+ }
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+
+ rose->state = ROSE_STATE_1;
+
+ rose->neighbour->use++;
+
+ rose_write_internal(sk, ROSE_CALL_REQUEST);
+ rose_start_heartbeat(sk);
+ rose_start_t1timer(sk);
+
+ /* Now the loop */
+ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+ err = -EINPROGRESS;
+ goto out_release;
+ }
+
+ /*
+ * A Connect Ack with Choke or timeout or failed routing will go to
+ * closed.
+ */
+ if (sk->sk_state == TCP_SYN_SENT) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (err)
+ goto out_release;
+ }
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk); /* Always set at this point */
+ goto out_release;
+ }
+
+ sock->state = SS_CONNECTED;
+
+out_release:
+ release_sock(sk);
+
+ return err;
+}
+
+static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sk_buff *skb;
+ struct sock *newsk;
+ DEFINE_WAIT(wait);
+ struct sock *sk;
+ int err = 0;
+
+ if ((sk = sock->sk) == NULL)
+ return -EINVAL;
+
+ lock_sock(sk);
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = -EOPNOTSUPP;
+ goto out_release;
+ }
+
+ if (sk->sk_state != TCP_LISTEN) {
+ err = -EINVAL;
+ goto out_release;
+ }
+
+ /*
+ * The write queue this time is holding sockets ready to use
+ * hooked into the SABM we saved
+ */
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (skb)
+ break;
+
+ if (flags & O_NONBLOCK) {
+ err = -EWOULDBLOCK;
+ break;
+ }
+ if (!signal_pending(current)) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ continue;
+ }
+ err = -ERESTARTSYS;
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ if (err)
+ goto out_release;
+
+ newsk = skb->sk;
+ sock_graft(newsk, newsock);
+
+ /* Now attach up the new socket */
+ skb->sk = NULL;
+ kfree_skb(skb);
+ sk->sk_ack_backlog--;
+
+out_release:
+ release_sock(sk);
+
+ return err;
+}
+
+static int rose_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr;
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ int n;
+
+ memset(srose, 0, sizeof(*srose));
+ if (peer != 0) {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+ srose->srose_family = AF_ROSE;
+ srose->srose_addr = rose->dest_addr;
+ srose->srose_call = rose->dest_call;
+ srose->srose_ndigis = rose->dest_ndigis;
+ for (n = 0; n < rose->dest_ndigis; n++)
+ srose->srose_digis[n] = rose->dest_digis[n];
+ } else {
+ srose->srose_family = AF_ROSE;
+ srose->srose_addr = rose->source_addr;
+ srose->srose_call = rose->source_call;
+ srose->srose_ndigis = rose->source_ndigis;
+ for (n = 0; n < rose->source_ndigis; n++)
+ srose->srose_digis[n] = rose->source_digis[n];
+ }
+
+ *uaddr_len = sizeof(struct full_sockaddr_rose);
+ return 0;
+}
+
+int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci)
+{
+ struct sock *sk;
+ struct sock *make;
+ struct rose_sock *make_rose;
+ struct rose_facilities_struct facilities;
+ int n;
+
+ skb->sk = NULL; /* Initially we don't know who it's for */
+
+ /*
+ * skb->data points to the rose frame start
+ */
+ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct));
+
+ if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF,
+ skb->len - ROSE_CALL_REQ_FACILITIES_OFF,
+ &facilities)) {
+ rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76);
+ return 0;
+ }
+
+ sk = rose_find_listener(&facilities.source_addr, &facilities.source_call);
+
+ /*
+ * We can't accept the Call Request.
+ */
+ if (sk == NULL || sk_acceptq_is_full(sk) ||
+ (make = rose_make_new(sk)) == NULL) {
+ rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120);
+ return 0;
+ }
+
+ skb->sk = make;
+ make->sk_state = TCP_ESTABLISHED;
+ make_rose = rose_sk(make);
+
+ make_rose->lci = lci;
+ make_rose->dest_addr = facilities.dest_addr;
+ make_rose->dest_call = facilities.dest_call;
+ make_rose->dest_ndigis = facilities.dest_ndigis;
+ for (n = 0 ; n < facilities.dest_ndigis ; n++)
+ make_rose->dest_digis[n] = facilities.dest_digis[n];
+ make_rose->source_addr = facilities.source_addr;
+ make_rose->source_call = facilities.source_call;
+ make_rose->source_ndigis = facilities.source_ndigis;
+ for (n = 0 ; n < facilities.source_ndigis ; n++)
+ make_rose->source_digis[n] = facilities.source_digis[n];
+ make_rose->neighbour = neigh;
+ make_rose->device = dev;
+ make_rose->facilities = facilities;
+
+ make_rose->neighbour->use++;
+
+ if (rose_sk(sk)->defer) {
+ make_rose->state = ROSE_STATE_5;
+ } else {
+ rose_write_internal(make, ROSE_CALL_ACCEPTED);
+ make_rose->state = ROSE_STATE_3;
+ rose_start_idletimer(make);
+ }
+
+ make_rose->condition = 0x00;
+ make_rose->vs = 0;
+ make_rose->va = 0;
+ make_rose->vr = 0;
+ make_rose->vl = 0;
+ sk->sk_ack_backlog++;
+
+ rose_insert_socket(make);
+
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ rose_start_heartbeat(make);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ return 1;
+}
+
+static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_rose *, usrose, msg->msg_name);
+ int err;
+ struct full_sockaddr_rose srose;
+ struct sk_buff *skb;
+ unsigned char *asmptr;
+ int n, size, qbit = 0;
+
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ if (sock_flag(sk, SOCK_ZAPPED))
+ return -EADDRNOTAVAIL;
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ return -EPIPE;
+ }
+
+ if (rose->neighbour == NULL || rose->device == NULL)
+ return -ENETUNREACH;
+
+ if (usrose != NULL) {
+ if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose))
+ return -EINVAL;
+ memset(&srose, 0, sizeof(struct full_sockaddr_rose));
+ memcpy(&srose, usrose, msg->msg_namelen);
+ if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 ||
+ ax25cmp(&rose->dest_call, &srose.srose_call) != 0)
+ return -EISCONN;
+ if (srose.srose_ndigis != rose->dest_ndigis)
+ return -EISCONN;
+ if (srose.srose_ndigis == rose->dest_ndigis) {
+ for (n = 0 ; n < srose.srose_ndigis ; n++)
+ if (ax25cmp(&rose->dest_digis[n],
+ &srose.srose_digis[n]))
+ return -EISCONN;
+ }
+ if (srose.srose_family != AF_ROSE)
+ return -EINVAL;
+ } else {
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ srose.srose_family = AF_ROSE;
+ srose.srose_addr = rose->dest_addr;
+ srose.srose_call = rose->dest_call;
+ srose.srose_ndigis = rose->dest_ndigis;
+ for (n = 0 ; n < rose->dest_ndigis ; n++)
+ srose.srose_digis[n] = rose->dest_digis[n];
+ }
+
+ /* Build a packet */
+ /* Sanity check the packet size */
+ if (len > 65535)
+ return -EMSGSIZE;
+
+ size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN;
+
+ if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
+ return err;
+
+ skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN);
+
+ /*
+ * Put the data on the end
+ */
+
+ skb_reset_transport_header(skb);
+ skb_put(skb, len);
+
+ err = memcpy_from_msg(skb_transport_header(skb), msg, len);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ /*
+ * If the Q BIT Include socket option is in force, the first
+ * byte of the user data is the logical value of the Q Bit.
+ */
+ if (rose->qbitincl) {
+ qbit = skb->data[0];
+ skb_pull(skb, 1);
+ }
+
+ /*
+ * Push down the ROSE header
+ */
+ asmptr = skb_push(skb, ROSE_MIN_LEN);
+
+ /* Build a ROSE Network header */
+ asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI;
+ asmptr[1] = (rose->lci >> 0) & 0xFF;
+ asmptr[2] = ROSE_DATA;
+
+ if (qbit)
+ asmptr[0] |= ROSE_Q_BIT;
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ kfree_skb(skb);
+ return -ENOTCONN;
+ }
+
+#ifdef M_BIT
+#define ROSE_PACLEN (256-ROSE_MIN_LEN)
+ if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) {
+ unsigned char header[ROSE_MIN_LEN];
+ struct sk_buff *skbn;
+ int frontlen;
+ int lg;
+
+ /* Save a copy of the Header */
+ skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN);
+ skb_pull(skb, ROSE_MIN_LEN);
+
+ frontlen = skb_headroom(skb);
+
+ while (skb->len > 0) {
+ if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skbn->sk = sk;
+ skbn->free = 1;
+ skbn->arp = 1;
+
+ skb_reserve(skbn, frontlen);
+
+ lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN;
+
+ /* Copy the user data */
+ skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg);
+ skb_pull(skb, lg);
+
+ /* Duplicate the Header */
+ skb_push(skbn, ROSE_MIN_LEN);
+ skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN);
+
+ if (skb->len > 0)
+ skbn->data[2] |= M_BIT;
+
+ skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */
+ }
+
+ skb->free = 1;
+ kfree_skb(skb);
+ } else {
+ skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */
+ }
+#else
+ skb_queue_tail(&sk->sk_write_queue, skb); /* Shove it onto the queue */
+#endif
+
+ rose_kick(sk);
+
+ return len;
+}
+
+
+static int rose_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ size_t copied;
+ unsigned char *asmptr;
+ struct sk_buff *skb;
+ int n, er, qbit;
+
+ /*
+ * This works for seqpacket too. The receiver has ordered the queue for
+ * us! We do one quick check first though
+ */
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ /* Now we can treat all alike */
+ if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL)
+ return er;
+
+ qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT;
+
+ skb_pull(skb, ROSE_MIN_LEN);
+
+ if (rose->qbitincl) {
+ asmptr = skb_push(skb, 1);
+ *asmptr = qbit;
+ }
+
+ skb_reset_transport_header(skb);
+ copied = skb->len;
+
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ if (msg->msg_name) {
+ struct sockaddr_rose *srose;
+ DECLARE_SOCKADDR(struct full_sockaddr_rose *, full_srose,
+ msg->msg_name);
+
+ memset(msg->msg_name, 0, sizeof(struct full_sockaddr_rose));
+ srose = msg->msg_name;
+ srose->srose_family = AF_ROSE;
+ srose->srose_addr = rose->dest_addr;
+ srose->srose_call = rose->dest_call;
+ srose->srose_ndigis = rose->dest_ndigis;
+ for (n = 0 ; n < rose->dest_ndigis ; n++)
+ full_srose->srose_digis[n] = rose->dest_digis[n];
+ msg->msg_namelen = sizeof(struct full_sockaddr_rose);
+ }
+
+ skb_free_datagram(sk, skb);
+
+ return copied;
+}
+
+
+static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct rose_sock *rose = rose_sk(sk);
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case TIOCOUTQ: {
+ long amount;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ return put_user(amount, (unsigned int __user *) argp);
+ }
+
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ long amount = 0L;
+ /* These two are safe on a single CPU system as only user tasks fiddle here */
+ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+ amount = skb->len;
+ return put_user(amount, (unsigned int __user *) argp);
+ }
+
+ case SIOCGSTAMP:
+ return sock_get_timestamp(sk, (struct timeval __user *) argp);
+
+ case SIOCGSTAMPNS:
+ return sock_get_timestampns(sk, (struct timespec __user *) argp);
+
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ return -EINVAL;
+
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRSCLRRT:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ return rose_rt_ioctl(cmd, argp);
+
+ case SIOCRSGCAUSE: {
+ struct rose_cause_struct rose_cause;
+ rose_cause.cause = rose->cause;
+ rose_cause.diagnostic = rose->diagnostic;
+ return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0;
+ }
+
+ case SIOCRSSCAUSE: {
+ struct rose_cause_struct rose_cause;
+ if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct)))
+ return -EFAULT;
+ rose->cause = rose_cause.cause;
+ rose->diagnostic = rose_cause.diagnostic;
+ return 0;
+ }
+
+ case SIOCRSSL2CALL:
+ if (!capable(CAP_NET_ADMIN)) return -EPERM;
+ if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+ ax25_listen_release(&rose_callsign, NULL);
+ if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address)))
+ return -EFAULT;
+ if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+ return ax25_listen_register(&rose_callsign, NULL);
+
+ return 0;
+
+ case SIOCRSGL2CALL:
+ return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0;
+
+ case SIOCRSACCEPT:
+ if (rose->state == ROSE_STATE_5) {
+ rose_write_internal(sk, ROSE_CALL_ACCEPTED);
+ rose_start_idletimer(sk);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->va = 0;
+ rose->vr = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_3;
+ }
+ return 0;
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static void *rose_info_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rose_list_lock)
+{
+ spin_lock_bh(&rose_list_lock);
+ return seq_hlist_start_head(&rose_list, *pos);
+}
+
+static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &rose_list, pos);
+}
+
+static void rose_info_stop(struct seq_file *seq, void *v)
+ __releases(rose_list_lock)
+{
+ spin_unlock_bh(&rose_list_lock);
+}
+
+static int rose_info_show(struct seq_file *seq, void *v)
+{
+ char buf[11], rsbuf[11];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n");
+
+ else {
+ struct sock *s = sk_entry(v);
+ struct rose_sock *rose = rose_sk(s);
+ const char *devname, *callsign;
+ const struct net_device *dev = rose->device;
+
+ if (!dev)
+ devname = "???";
+ else
+ devname = dev->name;
+
+ seq_printf(seq, "%-10s %-9s ",
+ rose2asc(rsbuf, &rose->dest_addr),
+ ax2asc(buf, &rose->dest_call));
+
+ if (ax25cmp(&rose->source_call, &null_ax25_address) == 0)
+ callsign = "??????-?";
+ else
+ callsign = ax2asc(buf, &rose->source_call);
+
+ seq_printf(seq,
+ "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n",
+ rose2asc(rsbuf, &rose->source_addr),
+ callsign,
+ devname,
+ rose->lci & 0x0FFF,
+ (rose->neighbour) ? rose->neighbour->number : 0,
+ rose->state,
+ rose->vs,
+ rose->vr,
+ rose->va,
+ ax25_display_timer(&rose->timer) / HZ,
+ rose->t1 / HZ,
+ rose->t2 / HZ,
+ rose->t3 / HZ,
+ rose->hb / HZ,
+ ax25_display_timer(&rose->idletimer) / (60 * HZ),
+ rose->idle / (60 * HZ),
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations rose_info_seqops = {
+ .start = rose_info_start,
+ .next = rose_info_next,
+ .stop = rose_info_stop,
+ .show = rose_info_show,
+};
+
+static int rose_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rose_info_seqops);
+}
+
+static const struct file_operations rose_info_fops = {
+ .owner = THIS_MODULE,
+ .open = rose_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+static const struct net_proto_family rose_family_ops = {
+ .family = PF_ROSE,
+ .create = rose_create,
+ .owner = THIS_MODULE,
+};
+
+static const struct proto_ops rose_proto_ops = {
+ .family = PF_ROSE,
+ .owner = THIS_MODULE,
+ .release = rose_release,
+ .bind = rose_bind,
+ .connect = rose_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = rose_accept,
+ .getname = rose_getname,
+ .poll = datagram_poll,
+ .ioctl = rose_ioctl,
+ .listen = rose_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = rose_setsockopt,
+ .getsockopt = rose_getsockopt,
+ .sendmsg = rose_sendmsg,
+ .recvmsg = rose_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct notifier_block rose_dev_notifier = {
+ .notifier_call = rose_device_event,
+};
+
+static struct net_device **dev_rose;
+
+static struct ax25_protocol rose_pid = {
+ .pid = AX25_P_ROSE,
+ .func = rose_route_frame
+};
+
+static struct ax25_linkfail rose_linkfail_notifier = {
+ .func = rose_link_failed
+};
+
+static int __init rose_proto_init(void)
+{
+ int i;
+ int rc;
+
+ if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
+ printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = proto_register(&rose_proto, 0);
+ if (rc != 0)
+ goto out;
+
+ rose_callsign = null_ax25_address;
+
+ dev_rose = kzalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL);
+ if (dev_rose == NULL) {
+ printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n");
+ rc = -ENOMEM;
+ goto out_proto_unregister;
+ }
+
+ for (i = 0; i < rose_ndevs; i++) {
+ struct net_device *dev;
+ char name[IFNAMSIZ];
+
+ sprintf(name, "rose%d", i);
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup);
+ if (!dev) {
+ printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n");
+ rc = -ENOMEM;
+ goto fail;
+ }
+ rc = register_netdev(dev);
+ if (rc) {
+ printk(KERN_ERR "ROSE: netdevice registration failed\n");
+ free_netdev(dev);
+ goto fail;
+ }
+ rose_set_lockdep_key(dev);
+ dev_rose[i] = dev;
+ }
+
+ sock_register(&rose_family_ops);
+ register_netdevice_notifier(&rose_dev_notifier);
+
+ ax25_register_pid(&rose_pid);
+ ax25_linkfail_register(&rose_linkfail_notifier);
+
+#ifdef CONFIG_SYSCTL
+ rose_register_sysctl();
+#endif
+ rose_loopback_init();
+
+ rose_add_loopback_neigh();
+
+ proc_create("rose", S_IRUGO, init_net.proc_net, &rose_info_fops);
+ proc_create("rose_neigh", S_IRUGO, init_net.proc_net,
+ &rose_neigh_fops);
+ proc_create("rose_nodes", S_IRUGO, init_net.proc_net,
+ &rose_nodes_fops);
+ proc_create("rose_routes", S_IRUGO, init_net.proc_net,
+ &rose_routes_fops);
+out:
+ return rc;
+fail:
+ while (--i >= 0) {
+ unregister_netdev(dev_rose[i]);
+ free_netdev(dev_rose[i]);
+ }
+ kfree(dev_rose);
+out_proto_unregister:
+ proto_unregister(&rose_proto);
+ goto out;
+}
+module_init(rose_proto_init);
+
+module_param(rose_ndevs, int, 0);
+MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices");
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_ROSE);
+
+static void __exit rose_exit(void)
+{
+ int i;
+
+ remove_proc_entry("rose", init_net.proc_net);
+ remove_proc_entry("rose_neigh", init_net.proc_net);
+ remove_proc_entry("rose_nodes", init_net.proc_net);
+ remove_proc_entry("rose_routes", init_net.proc_net);
+ rose_loopback_clear();
+
+ rose_rt_free();
+
+ ax25_protocol_release(AX25_P_ROSE);
+ ax25_linkfail_release(&rose_linkfail_notifier);
+
+ if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+ ax25_listen_release(&rose_callsign, NULL);
+
+#ifdef CONFIG_SYSCTL
+ rose_unregister_sysctl();
+#endif
+ unregister_netdevice_notifier(&rose_dev_notifier);
+
+ sock_unregister(PF_ROSE);
+
+ for (i = 0; i < rose_ndevs; i++) {
+ struct net_device *dev = dev_rose[i];
+
+ if (dev) {
+ unregister_netdev(dev);
+ free_netdev(dev);
+ }
+ }
+
+ kfree(dev_rose);
+ proto_unregister(&rose_proto);
+}
+
+module_exit(rose_exit);
diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c
new file mode 100644
index 000000000..369ca81a8
--- /dev/null
+++ b/net/rose/rose_dev.c
@@ -0,0 +1,144 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/arp.h>
+
+#include <net/ax25.h>
+#include <net/rose.h>
+
+static int rose_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned int len)
+{
+ unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2);
+
+ if (daddr)
+ memcpy(buff + 7, daddr, dev->addr_len);
+
+ *buff++ = ROSE_GFI | ROSE_Q_BIT;
+ *buff++ = 0x00;
+ *buff++ = ROSE_DATA;
+ *buff++ = 0x7F;
+ *buff++ = AX25_P_IP;
+
+ if (daddr != NULL)
+ return 37;
+
+ return -37;
+}
+
+static int rose_set_mac_address(struct net_device *dev, void *addr)
+{
+ struct sockaddr *sa = addr;
+ int err;
+
+ if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len))
+ return 0;
+
+ if (dev->flags & IFF_UP) {
+ err = rose_add_loopback_node((rose_address *)sa->sa_data);
+ if (err)
+ return err;
+
+ rose_del_loopback_node((rose_address *)dev->dev_addr);
+ }
+
+ memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+
+ return 0;
+}
+
+static int rose_open(struct net_device *dev)
+{
+ int err;
+
+ err = rose_add_loopback_node((rose_address *)dev->dev_addr);
+ if (err)
+ return err;
+
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+static int rose_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ rose_del_loopback_node((rose_address *)dev->dev_addr);
+ return 0;
+}
+
+static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device_stats *stats = &dev->stats;
+ unsigned int len = skb->len;
+
+ if (!netif_running(dev)) {
+ printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n");
+ return NETDEV_TX_BUSY;
+ }
+
+ if (!rose_route_frame(skb, NULL)) {
+ dev_kfree_skb(skb);
+ stats->tx_errors++;
+ return NETDEV_TX_OK;
+ }
+
+ stats->tx_packets++;
+ stats->tx_bytes += len;
+ return NETDEV_TX_OK;
+}
+
+static const struct header_ops rose_header_ops = {
+ .create = rose_header,
+};
+
+static const struct net_device_ops rose_netdev_ops = {
+ .ndo_open = rose_open,
+ .ndo_stop = rose_close,
+ .ndo_start_xmit = rose_xmit,
+ .ndo_set_mac_address = rose_set_mac_address,
+};
+
+void rose_setup(struct net_device *dev)
+{
+ dev->mtu = ROSE_MAX_PACKET_SIZE - 2;
+ dev->netdev_ops = &rose_netdev_ops;
+
+ dev->header_ops = &rose_header_ops;
+ dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN;
+ dev->addr_len = ROSE_ADDR_LEN;
+ dev->type = ARPHRD_ROSE;
+
+ /* New-style flags. */
+ dev->flags = IFF_NOARP;
+}
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
new file mode 100644
index 000000000..79c4abcfa
--- /dev/null
+++ b/net/rose/rose_in.c
@@ -0,0 +1,294 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ *
+ * Most of this code is based on the SDL diagrams published in the 7th ARRL
+ * Computer Networking Conference papers. The diagrams have mistakes in them,
+ * but are mostly correct. Before you modify the code could you read the SDL
+ * diagrams as the code is not obvious and probably very easy to break.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+/*
+ * State machine for state 1, Awaiting Call Accepted State.
+ * The handling of the timer(s) is in file rose_timer.c.
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ switch (frametype) {
+ case ROSE_CALL_ACCEPTED:
+ rose_stop_timer(sk);
+ rose_start_idletimer(sk);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->va = 0;
+ rose->vr = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_3;
+ sk->sk_state = TCP_ESTABLISHED;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ break;
+
+ case ROSE_CLEAR_REQUEST:
+ rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+ rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
+ rose->neighbour->use--;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Clear Confirmation State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ switch (frametype) {
+ case ROSE_CLEAR_REQUEST:
+ rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+ rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ rose->neighbour->use--;
+ break;
+
+ case ROSE_CLEAR_CONFIRMATION:
+ rose_disconnect(sk, 0, -1, -1);
+ rose->neighbour->use--;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
+{
+ struct rose_sock *rose = rose_sk(sk);
+ int queued = 0;
+
+ switch (frametype) {
+ case ROSE_RESET_REQUEST:
+ rose_stop_timer(sk);
+ rose_start_idletimer(sk);
+ rose_write_internal(sk, ROSE_RESET_CONFIRMATION);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->vr = 0;
+ rose->va = 0;
+ rose->vl = 0;
+ rose_requeue_frames(sk);
+ break;
+
+ case ROSE_CLEAR_REQUEST:
+ rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+ rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ rose->neighbour->use--;
+ break;
+
+ case ROSE_RR:
+ case ROSE_RNR:
+ if (!rose_validate_nr(sk, nr)) {
+ rose_write_internal(sk, ROSE_RESET_REQUEST);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->vr = 0;
+ rose->va = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_4;
+ rose_start_t2timer(sk);
+ rose_stop_idletimer(sk);
+ } else {
+ rose_frames_acked(sk, nr);
+ if (frametype == ROSE_RNR) {
+ rose->condition |= ROSE_COND_PEER_RX_BUSY;
+ } else {
+ rose->condition &= ~ROSE_COND_PEER_RX_BUSY;
+ }
+ }
+ break;
+
+ case ROSE_DATA: /* XXX */
+ rose->condition &= ~ROSE_COND_PEER_RX_BUSY;
+ if (!rose_validate_nr(sk, nr)) {
+ rose_write_internal(sk, ROSE_RESET_REQUEST);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->vr = 0;
+ rose->va = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_4;
+ rose_start_t2timer(sk);
+ rose_stop_idletimer(sk);
+ break;
+ }
+ rose_frames_acked(sk, nr);
+ if (ns == rose->vr) {
+ rose_start_idletimer(sk);
+ if (sock_queue_rcv_skb(sk, skb) == 0) {
+ rose->vr = (rose->vr + 1) % ROSE_MODULUS;
+ queued = 1;
+ } else {
+ /* Should never happen ! */
+ rose_write_internal(sk, ROSE_RESET_REQUEST);
+ rose->condition = 0x00;
+ rose->vs = 0;
+ rose->vr = 0;
+ rose->va = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_4;
+ rose_start_t2timer(sk);
+ rose_stop_idletimer(sk);
+ break;
+ }
+ if (atomic_read(&sk->sk_rmem_alloc) >
+ (sk->sk_rcvbuf >> 1))
+ rose->condition |= ROSE_COND_OWN_RX_BUSY;
+ }
+ /*
+ * If the window is full, ack the frame, else start the
+ * acknowledge hold back timer.
+ */
+ if (((rose->vl + sysctl_rose_window_size) % ROSE_MODULUS) == rose->vr) {
+ rose->condition &= ~ROSE_COND_ACK_PENDING;
+ rose_stop_timer(sk);
+ rose_enquiry_response(sk);
+ } else {
+ rose->condition |= ROSE_COND_ACK_PENDING;
+ rose_start_hbtimer(sk);
+ }
+ break;
+
+ default:
+ printk(KERN_WARNING "ROSE: unknown %02X in state 3\n", frametype);
+ break;
+ }
+
+ return queued;
+}
+
+/*
+ * State machine for state 4, Awaiting Reset Confirmation State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ switch (frametype) {
+ case ROSE_RESET_REQUEST:
+ rose_write_internal(sk, ROSE_RESET_CONFIRMATION);
+ case ROSE_RESET_CONFIRMATION:
+ rose_stop_timer(sk);
+ rose_start_idletimer(sk);
+ rose->condition = 0x00;
+ rose->va = 0;
+ rose->vr = 0;
+ rose->vs = 0;
+ rose->vl = 0;
+ rose->state = ROSE_STATE_3;
+ rose_requeue_frames(sk);
+ break;
+
+ case ROSE_CLEAR_REQUEST:
+ rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+ rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ rose->neighbour->use--;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * State machine for state 5, Awaiting Call Acceptance State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ if (frametype == ROSE_CLEAR_REQUEST) {
+ rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+ rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ rose_sk(sk)->neighbour->use--;
+ }
+
+ return 0;
+}
+
+/* Higher level upcall for a LAPB frame */
+int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+ struct rose_sock *rose = rose_sk(sk);
+ int queued = 0, frametype, ns, nr, q, d, m;
+
+ if (rose->state == ROSE_STATE_0)
+ return 0;
+
+ frametype = rose_decode(skb, &ns, &nr, &q, &d, &m);
+
+ switch (rose->state) {
+ case ROSE_STATE_1:
+ queued = rose_state1_machine(sk, skb, frametype);
+ break;
+ case ROSE_STATE_2:
+ queued = rose_state2_machine(sk, skb, frametype);
+ break;
+ case ROSE_STATE_3:
+ queued = rose_state3_machine(sk, skb, frametype, ns, nr, q, d, m);
+ break;
+ case ROSE_STATE_4:
+ queued = rose_state4_machine(sk, skb, frametype);
+ break;
+ case ROSE_STATE_5:
+ queued = rose_state5_machine(sk, skb, frametype);
+ break;
+ }
+
+ rose_kick(sk);
+
+ return queued;
+}
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
new file mode 100644
index 000000000..e873d7d9f
--- /dev/null
+++ b/net/rose/rose_link.c
@@ -0,0 +1,292 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/netfilter.h>
+#include <net/rose.h>
+
+static void rose_ftimer_expiry(unsigned long);
+static void rose_t0timer_expiry(unsigned long);
+
+static void rose_transmit_restart_confirmation(struct rose_neigh *neigh);
+static void rose_transmit_restart_request(struct rose_neigh *neigh);
+
+void rose_start_ftimer(struct rose_neigh *neigh)
+{
+ del_timer(&neigh->ftimer);
+
+ neigh->ftimer.data = (unsigned long)neigh;
+ neigh->ftimer.function = &rose_ftimer_expiry;
+ neigh->ftimer.expires =
+ jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout);
+
+ add_timer(&neigh->ftimer);
+}
+
+static void rose_start_t0timer(struct rose_neigh *neigh)
+{
+ del_timer(&neigh->t0timer);
+
+ neigh->t0timer.data = (unsigned long)neigh;
+ neigh->t0timer.function = &rose_t0timer_expiry;
+ neigh->t0timer.expires =
+ jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout);
+
+ add_timer(&neigh->t0timer);
+}
+
+void rose_stop_ftimer(struct rose_neigh *neigh)
+{
+ del_timer(&neigh->ftimer);
+}
+
+void rose_stop_t0timer(struct rose_neigh *neigh)
+{
+ del_timer(&neigh->t0timer);
+}
+
+int rose_ftimer_running(struct rose_neigh *neigh)
+{
+ return timer_pending(&neigh->ftimer);
+}
+
+static int rose_t0timer_running(struct rose_neigh *neigh)
+{
+ return timer_pending(&neigh->t0timer);
+}
+
+static void rose_ftimer_expiry(unsigned long param)
+{
+}
+
+static void rose_t0timer_expiry(unsigned long param)
+{
+ struct rose_neigh *neigh = (struct rose_neigh *)param;
+
+ rose_transmit_restart_request(neigh);
+
+ neigh->dce_mode = 0;
+
+ rose_start_t0timer(neigh);
+}
+
+/*
+ * Interface to ax25_send_frame. Changes my level 2 callsign depending
+ * on whether we have a global ROSE callsign or use the default port
+ * callsign.
+ */
+static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+ ax25_address *rose_call;
+ ax25_cb *ax25s;
+
+ if (ax25cmp(&rose_callsign, &null_ax25_address) == 0)
+ rose_call = (ax25_address *)neigh->dev->dev_addr;
+ else
+ rose_call = &rose_callsign;
+
+ ax25s = neigh->ax25;
+ neigh->ax25 = ax25_send_frame(skb, 260, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev);
+ if (ax25s)
+ ax25_cb_put(ax25s);
+
+ return neigh->ax25 != NULL;
+}
+
+/*
+ * Interface to ax25_link_up. Changes my level 2 callsign depending
+ * on whether we have a global ROSE callsign or use the default port
+ * callsign.
+ */
+static int rose_link_up(struct rose_neigh *neigh)
+{
+ ax25_address *rose_call;
+ ax25_cb *ax25s;
+
+ if (ax25cmp(&rose_callsign, &null_ax25_address) == 0)
+ rose_call = (ax25_address *)neigh->dev->dev_addr;
+ else
+ rose_call = &rose_callsign;
+
+ ax25s = neigh->ax25;
+ neigh->ax25 = ax25_find_cb(rose_call, &neigh->callsign, neigh->digipeat, neigh->dev);
+ if (ax25s)
+ ax25_cb_put(ax25s);
+
+ return neigh->ax25 != NULL;
+}
+
+/*
+ * This handles all restart and diagnostic frames.
+ */
+void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigned short frametype)
+{
+ struct sk_buff *skbn;
+
+ switch (frametype) {
+ case ROSE_RESTART_REQUEST:
+ rose_stop_t0timer(neigh);
+ neigh->restarted = 1;
+ neigh->dce_mode = (skb->data[3] == ROSE_DTE_ORIGINATED);
+ rose_transmit_restart_confirmation(neigh);
+ break;
+
+ case ROSE_RESTART_CONFIRMATION:
+ rose_stop_t0timer(neigh);
+ neigh->restarted = 1;
+ break;
+
+ case ROSE_DIAGNOSTIC:
+ pr_warn("ROSE: received diagnostic #%d - %3ph\n", skb->data[3],
+ skb->data + 4);
+ break;
+
+ default:
+ printk(KERN_WARNING "ROSE: received unknown %02X with LCI 000\n", frametype);
+ break;
+ }
+
+ if (neigh->restarted) {
+ while ((skbn = skb_dequeue(&neigh->queue)) != NULL)
+ if (!rose_send_frame(skbn, neigh))
+ kfree_skb(skbn);
+ }
+}
+
+/*
+ * This routine is called when a Restart Request is needed
+ */
+static void rose_transmit_restart_request(struct rose_neigh *neigh)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ int len;
+
+ len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3;
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN);
+
+ dptr = skb_put(skb, ROSE_MIN_LEN + 3);
+
+ *dptr++ = AX25_P_ROSE;
+ *dptr++ = ROSE_GFI;
+ *dptr++ = 0x00;
+ *dptr++ = ROSE_RESTART_REQUEST;
+ *dptr++ = ROSE_DTE_ORIGINATED;
+ *dptr++ = 0;
+
+ if (!rose_send_frame(skb, neigh))
+ kfree_skb(skb);
+}
+
+/*
+ * This routine is called when a Restart Confirmation is needed
+ */
+static void rose_transmit_restart_confirmation(struct rose_neigh *neigh)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ int len;
+
+ len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1;
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN);
+
+ dptr = skb_put(skb, ROSE_MIN_LEN + 1);
+
+ *dptr++ = AX25_P_ROSE;
+ *dptr++ = ROSE_GFI;
+ *dptr++ = 0x00;
+ *dptr++ = ROSE_RESTART_CONFIRMATION;
+
+ if (!rose_send_frame(skb, neigh))
+ kfree_skb(skb);
+}
+
+/*
+ * This routine is called when a Clear Request is needed outside of the context
+ * of a connected socket.
+ */
+void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, unsigned char cause, unsigned char diagnostic)
+{
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ int len;
+
+ len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3;
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN);
+
+ dptr = skb_put(skb, ROSE_MIN_LEN + 3);
+
+ *dptr++ = AX25_P_ROSE;
+ *dptr++ = ((lci >> 8) & 0x0F) | ROSE_GFI;
+ *dptr++ = ((lci >> 0) & 0xFF);
+ *dptr++ = ROSE_CLEAR_REQUEST;
+ *dptr++ = cause;
+ *dptr++ = diagnostic;
+
+ if (!rose_send_frame(skb, neigh))
+ kfree_skb(skb);
+}
+
+void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+ unsigned char *dptr;
+
+ if (neigh->loopback) {
+ rose_loopback_queue(skb, neigh);
+ return;
+ }
+
+ if (!rose_link_up(neigh))
+ neigh->restarted = 0;
+
+ dptr = skb_push(skb, 1);
+ *dptr++ = AX25_P_ROSE;
+
+ if (neigh->restarted) {
+ if (!rose_send_frame(skb, neigh))
+ kfree_skb(skb);
+ } else {
+ skb_queue_tail(&neigh->queue, skb);
+
+ if (!rose_t0timer_running(neigh)) {
+ rose_transmit_restart_request(neigh);
+ neigh->dce_mode = 0;
+ rose_start_t0timer(neigh);
+ }
+ }
+}
diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c
new file mode 100644
index 000000000..344456206
--- /dev/null
+++ b/net/rose/rose_loopback.c
@@ -0,0 +1,124 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <net/ax25.h>
+#include <linux/skbuff.h>
+#include <net/rose.h>
+#include <linux/init.h>
+
+static struct sk_buff_head loopback_queue;
+static struct timer_list loopback_timer;
+
+static void rose_set_loopback_timer(void);
+
+void rose_loopback_init(void)
+{
+ skb_queue_head_init(&loopback_queue);
+
+ init_timer(&loopback_timer);
+}
+
+static int rose_loopback_running(void)
+{
+ return timer_pending(&loopback_timer);
+}
+
+int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+ struct sk_buff *skbn;
+
+ skbn = skb_clone(skb, GFP_ATOMIC);
+
+ kfree_skb(skb);
+
+ if (skbn != NULL) {
+ skb_queue_tail(&loopback_queue, skbn);
+
+ if (!rose_loopback_running())
+ rose_set_loopback_timer();
+ }
+
+ return 1;
+}
+
+static void rose_loopback_timer(unsigned long);
+
+static void rose_set_loopback_timer(void)
+{
+ del_timer(&loopback_timer);
+
+ loopback_timer.data = 0;
+ loopback_timer.function = &rose_loopback_timer;
+ loopback_timer.expires = jiffies + 10;
+
+ add_timer(&loopback_timer);
+}
+
+static void rose_loopback_timer(unsigned long param)
+{
+ struct sk_buff *skb;
+ struct net_device *dev;
+ rose_address *dest;
+ struct sock *sk;
+ unsigned short frametype;
+ unsigned int lci_i, lci_o;
+
+ while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+ if (skb->len < ROSE_MIN_LEN) {
+ kfree_skb(skb);
+ continue;
+ }
+ lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+ frametype = skb->data[2];
+ if (frametype == ROSE_CALL_REQUEST &&
+ (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF ||
+ skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] !=
+ ROSE_CALL_REQ_ADDR_LEN_VAL)) {
+ kfree_skb(skb);
+ continue;
+ }
+ dest = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF);
+ lci_o = ROSE_DEFAULT_MAXVC + 1 - lci_i;
+
+ skb_reset_transport_header(skb);
+
+ sk = rose_find_socket(lci_o, rose_loopback_neigh);
+ if (sk) {
+ if (rose_process_rx_frame(sk, skb) == 0)
+ kfree_skb(skb);
+ continue;
+ }
+
+ if (frametype == ROSE_CALL_REQUEST) {
+ if ((dev = rose_dev_get(dest)) != NULL) {
+ if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0)
+ kfree_skb(skb);
+ } else {
+ kfree_skb(skb);
+ }
+ } else {
+ kfree_skb(skb);
+ }
+ }
+}
+
+void __exit rose_loopback_clear(void)
+{
+ struct sk_buff *skb;
+
+ del_timer(&loopback_timer);
+
+ while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+ skb->sk = NULL;
+ kfree_skb(skb);
+ }
+}
diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c
new file mode 100644
index 000000000..9ad98b524
--- /dev/null
+++ b/net/rose/rose_out.c
@@ -0,0 +1,125 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+/*
+ * This procedure is passed a buffer descriptor for an iframe. It builds
+ * the rest of the control part of the frame and then writes it out.
+ */
+static void rose_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ if (skb == NULL)
+ return;
+
+ skb->data[2] |= (rose->vr << 5) & 0xE0;
+ skb->data[2] |= (rose->vs << 1) & 0x0E;
+
+ rose_start_idletimer(sk);
+
+ rose_transmit_link(skb, rose->neighbour);
+}
+
+void rose_kick(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+ struct sk_buff *skb, *skbn;
+ unsigned short start, end;
+
+ if (rose->state != ROSE_STATE_3)
+ return;
+
+ if (rose->condition & ROSE_COND_PEER_RX_BUSY)
+ return;
+
+ if (!skb_peek(&sk->sk_write_queue))
+ return;
+
+ start = (skb_peek(&rose->ack_queue) == NULL) ? rose->va : rose->vs;
+ end = (rose->va + sysctl_rose_window_size) % ROSE_MODULUS;
+
+ if (start == end)
+ return;
+
+ rose->vs = start;
+
+ /*
+ * Transmit data until either we're out of data to send or
+ * the window is full.
+ */
+
+ skb = skb_dequeue(&sk->sk_write_queue);
+
+ do {
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skb_queue_head(&sk->sk_write_queue, skb);
+ break;
+ }
+
+ skb_set_owner_w(skbn, sk);
+
+ /*
+ * Transmit the frame copy.
+ */
+ rose_send_iframe(sk, skbn);
+
+ rose->vs = (rose->vs + 1) % ROSE_MODULUS;
+
+ /*
+ * Requeue the original data frame.
+ */
+ skb_queue_tail(&rose->ack_queue, skb);
+
+ } while (rose->vs != end &&
+ (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+
+ rose->vl = rose->vr;
+ rose->condition &= ~ROSE_COND_ACK_PENDING;
+
+ rose_stop_timer(sk);
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void rose_enquiry_response(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ if (rose->condition & ROSE_COND_OWN_RX_BUSY)
+ rose_write_internal(sk, ROSE_RNR);
+ else
+ rose_write_internal(sk, ROSE_RR);
+
+ rose->vl = rose->vr;
+ rose->condition &= ~ROSE_COND_ACK_PENDING;
+
+ rose_stop_timer(sk);
+}
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
new file mode 100644
index 000000000..40148932c
--- /dev/null
+++ b/net/rose/rose_route.c
@@ -0,0 +1,1366 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/arp.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/netfilter.h>
+#include <linux/init.h>
+#include <net/rose.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+static unsigned int rose_neigh_no = 1;
+
+static struct rose_node *rose_node_list;
+static DEFINE_SPINLOCK(rose_node_list_lock);
+static struct rose_neigh *rose_neigh_list;
+static DEFINE_SPINLOCK(rose_neigh_list_lock);
+static struct rose_route *rose_route_list;
+static DEFINE_SPINLOCK(rose_route_list_lock);
+
+struct rose_neigh *rose_loopback_neigh;
+
+/*
+ * Add a new route to a node, and in the process add the node and the
+ * neighbour if it is new.
+ */
+static int __must_check rose_add_node(struct rose_route_struct *rose_route,
+ struct net_device *dev)
+{
+ struct rose_node *rose_node, *rose_tmpn, *rose_tmpp;
+ struct rose_neigh *rose_neigh;
+ int i, res = 0;
+
+ spin_lock_bh(&rose_node_list_lock);
+ spin_lock_bh(&rose_neigh_list_lock);
+
+ rose_node = rose_node_list;
+ while (rose_node != NULL) {
+ if ((rose_node->mask == rose_route->mask) &&
+ (rosecmpm(&rose_route->address, &rose_node->address,
+ rose_route->mask) == 0))
+ break;
+ rose_node = rose_node->next;
+ }
+
+ if (rose_node != NULL && rose_node->loopback) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ rose_neigh = rose_neigh_list;
+ while (rose_neigh != NULL) {
+ if (ax25cmp(&rose_route->neighbour,
+ &rose_neigh->callsign) == 0 &&
+ rose_neigh->dev == dev)
+ break;
+ rose_neigh = rose_neigh->next;
+ }
+
+ if (rose_neigh == NULL) {
+ rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC);
+ if (rose_neigh == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ rose_neigh->callsign = rose_route->neighbour;
+ rose_neigh->digipeat = NULL;
+ rose_neigh->ax25 = NULL;
+ rose_neigh->dev = dev;
+ rose_neigh->count = 0;
+ rose_neigh->use = 0;
+ rose_neigh->dce_mode = 0;
+ rose_neigh->loopback = 0;
+ rose_neigh->number = rose_neigh_no++;
+ rose_neigh->restarted = 0;
+
+ skb_queue_head_init(&rose_neigh->queue);
+
+ init_timer(&rose_neigh->ftimer);
+ init_timer(&rose_neigh->t0timer);
+
+ if (rose_route->ndigis != 0) {
+ rose_neigh->digipeat =
+ kmalloc(sizeof(ax25_digi), GFP_ATOMIC);
+ if (rose_neigh->digipeat == NULL) {
+ kfree(rose_neigh);
+ res = -ENOMEM;
+ goto out;
+ }
+
+ rose_neigh->digipeat->ndigi = rose_route->ndigis;
+ rose_neigh->digipeat->lastrepeat = -1;
+
+ for (i = 0; i < rose_route->ndigis; i++) {
+ rose_neigh->digipeat->calls[i] =
+ rose_route->digipeaters[i];
+ rose_neigh->digipeat->repeated[i] = 0;
+ }
+ }
+
+ rose_neigh->next = rose_neigh_list;
+ rose_neigh_list = rose_neigh;
+ }
+
+ /*
+ * This is a new node to be inserted into the list. Find where it needs
+ * to be inserted into the list, and insert it. We want to be sure
+ * to order the list in descending order of mask size to ensure that
+ * later when we are searching this list the first match will be the
+ * best match.
+ */
+ if (rose_node == NULL) {
+ rose_tmpn = rose_node_list;
+ rose_tmpp = NULL;
+
+ while (rose_tmpn != NULL) {
+ if (rose_tmpn->mask > rose_route->mask) {
+ rose_tmpp = rose_tmpn;
+ rose_tmpn = rose_tmpn->next;
+ } else {
+ break;
+ }
+ }
+
+ /* create new node */
+ rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC);
+ if (rose_node == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ rose_node->address = rose_route->address;
+ rose_node->mask = rose_route->mask;
+ rose_node->count = 1;
+ rose_node->loopback = 0;
+ rose_node->neighbour[0] = rose_neigh;
+
+ if (rose_tmpn == NULL) {
+ if (rose_tmpp == NULL) { /* Empty list */
+ rose_node_list = rose_node;
+ rose_node->next = NULL;
+ } else {
+ rose_tmpp->next = rose_node;
+ rose_node->next = NULL;
+ }
+ } else {
+ if (rose_tmpp == NULL) { /* 1st node */
+ rose_node->next = rose_node_list;
+ rose_node_list = rose_node;
+ } else {
+ rose_tmpp->next = rose_node;
+ rose_node->next = rose_tmpn;
+ }
+ }
+ rose_neigh->count++;
+
+ goto out;
+ }
+
+ /* We have space, slot it in */
+ if (rose_node->count < 3) {
+ rose_node->neighbour[rose_node->count] = rose_neigh;
+ rose_node->count++;
+ rose_neigh->count++;
+ }
+
+out:
+ spin_unlock_bh(&rose_neigh_list_lock);
+ spin_unlock_bh(&rose_node_list_lock);
+
+ return res;
+}
+
+/*
+ * Caller is holding rose_node_list_lock.
+ */
+static void rose_remove_node(struct rose_node *rose_node)
+{
+ struct rose_node *s;
+
+ if ((s = rose_node_list) == rose_node) {
+ rose_node_list = rose_node->next;
+ kfree(rose_node);
+ return;
+ }
+
+ while (s != NULL && s->next != NULL) {
+ if (s->next == rose_node) {
+ s->next = rose_node->next;
+ kfree(rose_node);
+ return;
+ }
+
+ s = s->next;
+ }
+}
+
+/*
+ * Caller is holding rose_neigh_list_lock.
+ */
+static void rose_remove_neigh(struct rose_neigh *rose_neigh)
+{
+ struct rose_neigh *s;
+
+ rose_stop_ftimer(rose_neigh);
+ rose_stop_t0timer(rose_neigh);
+
+ skb_queue_purge(&rose_neigh->queue);
+
+ if ((s = rose_neigh_list) == rose_neigh) {
+ rose_neigh_list = rose_neigh->next;
+ if (rose_neigh->ax25)
+ ax25_cb_put(rose_neigh->ax25);
+ kfree(rose_neigh->digipeat);
+ kfree(rose_neigh);
+ return;
+ }
+
+ while (s != NULL && s->next != NULL) {
+ if (s->next == rose_neigh) {
+ s->next = rose_neigh->next;
+ if (rose_neigh->ax25)
+ ax25_cb_put(rose_neigh->ax25);
+ kfree(rose_neigh->digipeat);
+ kfree(rose_neigh);
+ return;
+ }
+
+ s = s->next;
+ }
+}
+
+/*
+ * Caller is holding rose_route_list_lock.
+ */
+static void rose_remove_route(struct rose_route *rose_route)
+{
+ struct rose_route *s;
+
+ if (rose_route->neigh1 != NULL)
+ rose_route->neigh1->use--;
+
+ if (rose_route->neigh2 != NULL)
+ rose_route->neigh2->use--;
+
+ if ((s = rose_route_list) == rose_route) {
+ rose_route_list = rose_route->next;
+ kfree(rose_route);
+ return;
+ }
+
+ while (s != NULL && s->next != NULL) {
+ if (s->next == rose_route) {
+ s->next = rose_route->next;
+ kfree(rose_route);
+ return;
+ }
+
+ s = s->next;
+ }
+}
+
+/*
+ * "Delete" a node. Strictly speaking remove a route to a node. The node
+ * is only deleted if no routes are left to it.
+ */
+static int rose_del_node(struct rose_route_struct *rose_route,
+ struct net_device *dev)
+{
+ struct rose_node *rose_node;
+ struct rose_neigh *rose_neigh;
+ int i, err = 0;
+
+ spin_lock_bh(&rose_node_list_lock);
+ spin_lock_bh(&rose_neigh_list_lock);
+
+ rose_node = rose_node_list;
+ while (rose_node != NULL) {
+ if ((rose_node->mask == rose_route->mask) &&
+ (rosecmpm(&rose_route->address, &rose_node->address,
+ rose_route->mask) == 0))
+ break;
+ rose_node = rose_node->next;
+ }
+
+ if (rose_node == NULL || rose_node->loopback) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ rose_neigh = rose_neigh_list;
+ while (rose_neigh != NULL) {
+ if (ax25cmp(&rose_route->neighbour,
+ &rose_neigh->callsign) == 0 &&
+ rose_neigh->dev == dev)
+ break;
+ rose_neigh = rose_neigh->next;
+ }
+
+ if (rose_neigh == NULL) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < rose_node->count; i++) {
+ if (rose_node->neighbour[i] == rose_neigh) {
+ rose_neigh->count--;
+
+ if (rose_neigh->count == 0 && rose_neigh->use == 0)
+ rose_remove_neigh(rose_neigh);
+
+ rose_node->count--;
+
+ if (rose_node->count == 0) {
+ rose_remove_node(rose_node);
+ } else {
+ switch (i) {
+ case 0:
+ rose_node->neighbour[0] =
+ rose_node->neighbour[1];
+ case 1:
+ rose_node->neighbour[1] =
+ rose_node->neighbour[2];
+ case 2:
+ break;
+ }
+ }
+ goto out;
+ }
+ }
+ err = -EINVAL;
+
+out:
+ spin_unlock_bh(&rose_neigh_list_lock);
+ spin_unlock_bh(&rose_node_list_lock);
+
+ return err;
+}
+
+/*
+ * Add the loopback neighbour.
+ */
+void rose_add_loopback_neigh(void)
+{
+ struct rose_neigh *sn;
+
+ rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_KERNEL);
+ if (!rose_loopback_neigh)
+ return;
+ sn = rose_loopback_neigh;
+
+ sn->callsign = null_ax25_address;
+ sn->digipeat = NULL;
+ sn->ax25 = NULL;
+ sn->dev = NULL;
+ sn->count = 0;
+ sn->use = 0;
+ sn->dce_mode = 1;
+ sn->loopback = 1;
+ sn->number = rose_neigh_no++;
+ sn->restarted = 1;
+
+ skb_queue_head_init(&sn->queue);
+
+ init_timer(&sn->ftimer);
+ init_timer(&sn->t0timer);
+
+ spin_lock_bh(&rose_neigh_list_lock);
+ sn->next = rose_neigh_list;
+ rose_neigh_list = sn;
+ spin_unlock_bh(&rose_neigh_list_lock);
+}
+
+/*
+ * Add a loopback node.
+ */
+int rose_add_loopback_node(rose_address *address)
+{
+ struct rose_node *rose_node;
+ int err = 0;
+
+ spin_lock_bh(&rose_node_list_lock);
+
+ rose_node = rose_node_list;
+ while (rose_node != NULL) {
+ if ((rose_node->mask == 10) &&
+ (rosecmpm(address, &rose_node->address, 10) == 0) &&
+ rose_node->loopback)
+ break;
+ rose_node = rose_node->next;
+ }
+
+ if (rose_node != NULL)
+ goto out;
+
+ if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rose_node->address = *address;
+ rose_node->mask = 10;
+ rose_node->count = 1;
+ rose_node->loopback = 1;
+ rose_node->neighbour[0] = rose_loopback_neigh;
+
+ /* Insert at the head of list. Address is always mask=10 */
+ rose_node->next = rose_node_list;
+ rose_node_list = rose_node;
+
+ rose_loopback_neigh->count++;
+
+out:
+ spin_unlock_bh(&rose_node_list_lock);
+
+ return err;
+}
+
+/*
+ * Delete a loopback node.
+ */
+void rose_del_loopback_node(rose_address *address)
+{
+ struct rose_node *rose_node;
+
+ spin_lock_bh(&rose_node_list_lock);
+
+ rose_node = rose_node_list;
+ while (rose_node != NULL) {
+ if ((rose_node->mask == 10) &&
+ (rosecmpm(address, &rose_node->address, 10) == 0) &&
+ rose_node->loopback)
+ break;
+ rose_node = rose_node->next;
+ }
+
+ if (rose_node == NULL)
+ goto out;
+
+ rose_remove_node(rose_node);
+
+ rose_loopback_neigh->count--;
+
+out:
+ spin_unlock_bh(&rose_node_list_lock);
+}
+
+/*
+ * A device has been removed. Remove its routes and neighbours.
+ */
+void rose_rt_device_down(struct net_device *dev)
+{
+ struct rose_neigh *s, *rose_neigh;
+ struct rose_node *t, *rose_node;
+ int i;
+
+ spin_lock_bh(&rose_node_list_lock);
+ spin_lock_bh(&rose_neigh_list_lock);
+ rose_neigh = rose_neigh_list;
+ while (rose_neigh != NULL) {
+ s = rose_neigh;
+ rose_neigh = rose_neigh->next;
+
+ if (s->dev != dev)
+ continue;
+
+ rose_node = rose_node_list;
+
+ while (rose_node != NULL) {
+ t = rose_node;
+ rose_node = rose_node->next;
+
+ for (i = 0; i < t->count; i++) {
+ if (t->neighbour[i] != s)
+ continue;
+
+ t->count--;
+
+ switch (i) {
+ case 0:
+ t->neighbour[0] = t->neighbour[1];
+ case 1:
+ t->neighbour[1] = t->neighbour[2];
+ case 2:
+ break;
+ }
+ }
+
+ if (t->count <= 0)
+ rose_remove_node(t);
+ }
+
+ rose_remove_neigh(s);
+ }
+ spin_unlock_bh(&rose_neigh_list_lock);
+ spin_unlock_bh(&rose_node_list_lock);
+}
+
+#if 0 /* Currently unused */
+/*
+ * A device has been removed. Remove its links.
+ */
+void rose_route_device_down(struct net_device *dev)
+{
+ struct rose_route *s, *rose_route;
+
+ spin_lock_bh(&rose_route_list_lock);
+ rose_route = rose_route_list;
+ while (rose_route != NULL) {
+ s = rose_route;
+ rose_route = rose_route->next;
+
+ if (s->neigh1->dev == dev || s->neigh2->dev == dev)
+ rose_remove_route(s);
+ }
+ spin_unlock_bh(&rose_route_list_lock);
+}
+#endif
+
+/*
+ * Clear all nodes and neighbours out, except for neighbours with
+ * active connections going through them.
+ * Do not clear loopback neighbour and nodes.
+ */
+static int rose_clear_routes(void)
+{
+ struct rose_neigh *s, *rose_neigh;
+ struct rose_node *t, *rose_node;
+
+ spin_lock_bh(&rose_node_list_lock);
+ spin_lock_bh(&rose_neigh_list_lock);
+
+ rose_neigh = rose_neigh_list;
+ rose_node = rose_node_list;
+
+ while (rose_node != NULL) {
+ t = rose_node;
+ rose_node = rose_node->next;
+ if (!t->loopback)
+ rose_remove_node(t);
+ }
+
+ while (rose_neigh != NULL) {
+ s = rose_neigh;
+ rose_neigh = rose_neigh->next;
+
+ if (s->use == 0 && !s->loopback) {
+ s->count = 0;
+ rose_remove_neigh(s);
+ }
+ }
+
+ spin_unlock_bh(&rose_neigh_list_lock);
+ spin_unlock_bh(&rose_node_list_lock);
+
+ return 0;
+}
+
+/*
+ * Check that the device given is a valid AX.25 interface that is "up".
+ * called with RTNL
+ */
+static struct net_device *rose_ax25_dev_find(char *devname)
+{
+ struct net_device *dev;
+
+ if ((dev = __dev_get_by_name(&init_net, devname)) == NULL)
+ return NULL;
+
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25)
+ return dev;
+
+ return NULL;
+}
+
+/*
+ * Find the first active ROSE device, usually "rose0".
+ */
+struct net_device *rose_dev_first(void)
+{
+ struct net_device *dev, *first = NULL;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE)
+ if (first == NULL || strncmp(dev->name, first->name, 3) < 0)
+ first = dev;
+ }
+ rcu_read_unlock();
+
+ return first;
+}
+
+/*
+ * Find the ROSE device for the given address.
+ */
+struct net_device *rose_dev_get(rose_address *addr)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) {
+ dev_hold(dev);
+ goto out;
+ }
+ }
+ dev = NULL;
+out:
+ rcu_read_unlock();
+ return dev;
+}
+
+static int rose_dev_exists(rose_address *addr)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0)
+ goto out;
+ }
+ dev = NULL;
+out:
+ rcu_read_unlock();
+ return dev != NULL;
+}
+
+
+
+
+struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neigh)
+{
+ struct rose_route *rose_route;
+
+ for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next)
+ if ((rose_route->neigh1 == neigh && rose_route->lci1 == lci) ||
+ (rose_route->neigh2 == neigh && rose_route->lci2 == lci))
+ return rose_route;
+
+ return NULL;
+}
+
+/*
+ * Find a neighbour or a route given a ROSE address.
+ */
+struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
+ unsigned char *diagnostic, int route_frame)
+{
+ struct rose_neigh *res = NULL;
+ struct rose_node *node;
+ int failed = 0;
+ int i;
+
+ if (!route_frame) spin_lock_bh(&rose_node_list_lock);
+ for (node = rose_node_list; node != NULL; node = node->next) {
+ if (rosecmpm(addr, &node->address, node->mask) == 0) {
+ for (i = 0; i < node->count; i++) {
+ if (node->neighbour[i]->restarted) {
+ res = node->neighbour[i];
+ goto out;
+ }
+ }
+ }
+ }
+ if (!route_frame) { /* connect request */
+ for (node = rose_node_list; node != NULL; node = node->next) {
+ if (rosecmpm(addr, &node->address, node->mask) == 0) {
+ for (i = 0; i < node->count; i++) {
+ if (!rose_ftimer_running(node->neighbour[i])) {
+ res = node->neighbour[i];
+ failed = 0;
+ goto out;
+ }
+ failed = 1;
+ }
+ }
+ }
+ }
+
+ if (failed) {
+ *cause = ROSE_OUT_OF_ORDER;
+ *diagnostic = 0;
+ } else {
+ *cause = ROSE_NOT_OBTAINABLE;
+ *diagnostic = 0;
+ }
+
+out:
+ if (!route_frame) spin_unlock_bh(&rose_node_list_lock);
+ return res;
+}
+
+/*
+ * Handle the ioctls that control the routing functions.
+ */
+int rose_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct rose_route_struct rose_route;
+ struct net_device *dev;
+ int err;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct)))
+ return -EFAULT;
+ if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL)
+ return -EINVAL;
+ if (rose_dev_exists(&rose_route.address)) /* Can't add routes to ourself */
+ return -EINVAL;
+ if (rose_route.mask > 10) /* Mask can't be more than 10 digits */
+ return -EINVAL;
+ if (rose_route.ndigis > AX25_MAX_DIGIS)
+ return -EINVAL;
+ err = rose_add_node(&rose_route, dev);
+ return err;
+
+ case SIOCDELRT:
+ if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct)))
+ return -EFAULT;
+ if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL)
+ return -EINVAL;
+ err = rose_del_node(&rose_route, dev);
+ return err;
+
+ case SIOCRSCLRRT:
+ return rose_clear_routes();
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh)
+{
+ struct rose_route *rose_route, *s;
+
+ rose_neigh->restarted = 0;
+
+ rose_stop_t0timer(rose_neigh);
+ rose_start_ftimer(rose_neigh);
+
+ skb_queue_purge(&rose_neigh->queue);
+
+ spin_lock_bh(&rose_route_list_lock);
+
+ rose_route = rose_route_list;
+
+ while (rose_route != NULL) {
+ if ((rose_route->neigh1 == rose_neigh && rose_route->neigh2 == rose_neigh) ||
+ (rose_route->neigh1 == rose_neigh && rose_route->neigh2 == NULL) ||
+ (rose_route->neigh2 == rose_neigh && rose_route->neigh1 == NULL)) {
+ s = rose_route->next;
+ rose_remove_route(rose_route);
+ rose_route = s;
+ continue;
+ }
+
+ if (rose_route->neigh1 == rose_neigh) {
+ rose_route->neigh1->use--;
+ rose_route->neigh1 = NULL;
+ rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0);
+ }
+
+ if (rose_route->neigh2 == rose_neigh) {
+ rose_route->neigh2->use--;
+ rose_route->neigh2 = NULL;
+ rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0);
+ }
+
+ rose_route = rose_route->next;
+ }
+ spin_unlock_bh(&rose_route_list_lock);
+}
+
+/*
+ * A level 2 link has timed out, therefore it appears to be a poor link,
+ * then don't use that neighbour until it is reset. Blow away all through
+ * routes and connections using this route.
+ */
+void rose_link_failed(ax25_cb *ax25, int reason)
+{
+ struct rose_neigh *rose_neigh;
+
+ spin_lock_bh(&rose_neigh_list_lock);
+ rose_neigh = rose_neigh_list;
+ while (rose_neigh != NULL) {
+ if (rose_neigh->ax25 == ax25)
+ break;
+ rose_neigh = rose_neigh->next;
+ }
+
+ if (rose_neigh != NULL) {
+ rose_neigh->ax25 = NULL;
+ ax25_cb_put(ax25);
+
+ rose_del_route_by_neigh(rose_neigh);
+ rose_kill_by_neigh(rose_neigh);
+ }
+ spin_unlock_bh(&rose_neigh_list_lock);
+}
+
+/*
+ * A device has been "downed" remove its link status. Blow away all
+ * through routes and connections that use this device.
+ */
+void rose_link_device_down(struct net_device *dev)
+{
+ struct rose_neigh *rose_neigh;
+
+ for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) {
+ if (rose_neigh->dev == dev) {
+ rose_del_route_by_neigh(rose_neigh);
+ rose_kill_by_neigh(rose_neigh);
+ }
+ }
+}
+
+/*
+ * Route a frame to an appropriate AX.25 connection.
+ */
+int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
+{
+ struct rose_neigh *rose_neigh, *new_neigh;
+ struct rose_route *rose_route;
+ struct rose_facilities_struct facilities;
+ rose_address *src_addr, *dest_addr;
+ struct sock *sk;
+ unsigned short frametype;
+ unsigned int lci, new_lci;
+ unsigned char cause, diagnostic;
+ struct net_device *dev;
+ int res = 0;
+ char buf[11];
+
+ if (skb->len < ROSE_MIN_LEN)
+ return res;
+ frametype = skb->data[2];
+ lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+ if (frametype == ROSE_CALL_REQUEST &&
+ (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF ||
+ skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] !=
+ ROSE_CALL_REQ_ADDR_LEN_VAL))
+ return res;
+ src_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF);
+ dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF);
+
+ spin_lock_bh(&rose_neigh_list_lock);
+ spin_lock_bh(&rose_route_list_lock);
+
+ rose_neigh = rose_neigh_list;
+ while (rose_neigh != NULL) {
+ if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 &&
+ ax25->ax25_dev->dev == rose_neigh->dev)
+ break;
+ rose_neigh = rose_neigh->next;
+ }
+
+ if (rose_neigh == NULL) {
+ printk("rose_route : unknown neighbour or device %s\n",
+ ax2asc(buf, &ax25->dest_addr));
+ goto out;
+ }
+
+ /*
+ * Obviously the link is working, halt the ftimer.
+ */
+ rose_stop_ftimer(rose_neigh);
+
+ /*
+ * LCI of zero is always for us, and its always a restart
+ * frame.
+ */
+ if (lci == 0) {
+ rose_link_rx_restart(skb, rose_neigh, frametype);
+ goto out;
+ }
+
+ /*
+ * Find an existing socket.
+ */
+ if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) {
+ if (frametype == ROSE_CALL_REQUEST) {
+ struct rose_sock *rose = rose_sk(sk);
+
+ /* Remove an existing unused socket */
+ rose_clear_queues(sk);
+ rose->cause = ROSE_NETWORK_CONGESTION;
+ rose->diagnostic = 0;
+ rose->neighbour->use--;
+ rose->neighbour = NULL;
+ rose->lci = 0;
+ rose->state = ROSE_STATE_0;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = 0;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+ }
+ else {
+ skb_reset_transport_header(skb);
+ res = rose_process_rx_frame(sk, skb);
+ goto out;
+ }
+ }
+
+ /*
+ * Is is a Call Request and is it for us ?
+ */
+ if (frametype == ROSE_CALL_REQUEST)
+ if ((dev = rose_dev_get(dest_addr)) != NULL) {
+ res = rose_rx_call_request(skb, dev, rose_neigh, lci);
+ dev_put(dev);
+ goto out;
+ }
+
+ if (!sysctl_rose_routing_control) {
+ rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 0);
+ goto out;
+ }
+
+ /*
+ * Route it to the next in line if we have an entry for it.
+ */
+ rose_route = rose_route_list;
+ while (rose_route != NULL) {
+ if (rose_route->lci1 == lci &&
+ rose_route->neigh1 == rose_neigh) {
+ if (frametype == ROSE_CALL_REQUEST) {
+ /* F6FBB - Remove an existing unused route */
+ rose_remove_route(rose_route);
+ break;
+ } else if (rose_route->neigh2 != NULL) {
+ skb->data[0] &= 0xF0;
+ skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F;
+ skb->data[1] = (rose_route->lci2 >> 0) & 0xFF;
+ rose_transmit_link(skb, rose_route->neigh2);
+ if (frametype == ROSE_CLEAR_CONFIRMATION)
+ rose_remove_route(rose_route);
+ res = 1;
+ goto out;
+ } else {
+ if (frametype == ROSE_CLEAR_CONFIRMATION)
+ rose_remove_route(rose_route);
+ goto out;
+ }
+ }
+ if (rose_route->lci2 == lci &&
+ rose_route->neigh2 == rose_neigh) {
+ if (frametype == ROSE_CALL_REQUEST) {
+ /* F6FBB - Remove an existing unused route */
+ rose_remove_route(rose_route);
+ break;
+ } else if (rose_route->neigh1 != NULL) {
+ skb->data[0] &= 0xF0;
+ skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F;
+ skb->data[1] = (rose_route->lci1 >> 0) & 0xFF;
+ rose_transmit_link(skb, rose_route->neigh1);
+ if (frametype == ROSE_CLEAR_CONFIRMATION)
+ rose_remove_route(rose_route);
+ res = 1;
+ goto out;
+ } else {
+ if (frametype == ROSE_CLEAR_CONFIRMATION)
+ rose_remove_route(rose_route);
+ goto out;
+ }
+ }
+ rose_route = rose_route->next;
+ }
+
+ /*
+ * We know that:
+ * 1. The frame isn't for us,
+ * 2. It isn't "owned" by any existing route.
+ */
+ if (frametype != ROSE_CALL_REQUEST) { /* XXX */
+ res = 0;
+ goto out;
+ }
+
+ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct));
+
+ if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF,
+ skb->len - ROSE_CALL_REQ_FACILITIES_OFF,
+ &facilities)) {
+ rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76);
+ goto out;
+ }
+
+ /*
+ * Check for routing loops.
+ */
+ rose_route = rose_route_list;
+ while (rose_route != NULL) {
+ if (rose_route->rand == facilities.rand &&
+ rosecmp(src_addr, &rose_route->src_addr) == 0 &&
+ ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 &&
+ ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) {
+ rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120);
+ goto out;
+ }
+ rose_route = rose_route->next;
+ }
+
+ if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic, 1)) == NULL) {
+ rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic);
+ goto out;
+ }
+
+ if ((new_lci = rose_new_lci(new_neigh)) == 0) {
+ rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71);
+ goto out;
+ }
+
+ if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) {
+ rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120);
+ goto out;
+ }
+
+ rose_route->lci1 = lci;
+ rose_route->src_addr = *src_addr;
+ rose_route->dest_addr = *dest_addr;
+ rose_route->src_call = facilities.dest_call;
+ rose_route->dest_call = facilities.source_call;
+ rose_route->rand = facilities.rand;
+ rose_route->neigh1 = rose_neigh;
+ rose_route->lci2 = new_lci;
+ rose_route->neigh2 = new_neigh;
+
+ rose_route->neigh1->use++;
+ rose_route->neigh2->use++;
+
+ rose_route->next = rose_route_list;
+ rose_route_list = rose_route;
+
+ skb->data[0] &= 0xF0;
+ skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F;
+ skb->data[1] = (rose_route->lci2 >> 0) & 0xFF;
+
+ rose_transmit_link(skb, rose_route->neigh2);
+ res = 1;
+
+out:
+ spin_unlock_bh(&rose_route_list_lock);
+ spin_unlock_bh(&rose_neigh_list_lock);
+
+ return res;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *rose_node_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rose_node_list_lock)
+{
+ struct rose_node *rose_node;
+ int i = 1;
+
+ spin_lock_bh(&rose_node_list_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (rose_node = rose_node_list; rose_node && i < *pos;
+ rose_node = rose_node->next, ++i);
+
+ return (i == *pos) ? rose_node : NULL;
+}
+
+static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+
+ return (v == SEQ_START_TOKEN) ? rose_node_list
+ : ((struct rose_node *)v)->next;
+}
+
+static void rose_node_stop(struct seq_file *seq, void *v)
+ __releases(rose_node_list_lock)
+{
+ spin_unlock_bh(&rose_node_list_lock);
+}
+
+static int rose_node_show(struct seq_file *seq, void *v)
+{
+ char rsbuf[11];
+ int i;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "address mask n neigh neigh neigh\n");
+ else {
+ const struct rose_node *rose_node = v;
+ /* if (rose_node->loopback) {
+ seq_printf(seq, "%-10s %04d 1 loopback\n",
+ rose2asc(rsbuf, &rose_node->address),
+ rose_node->mask);
+ } else { */
+ seq_printf(seq, "%-10s %04d %d",
+ rose2asc(rsbuf, &rose_node->address),
+ rose_node->mask,
+ rose_node->count);
+
+ for (i = 0; i < rose_node->count; i++)
+ seq_printf(seq, " %05d",
+ rose_node->neighbour[i]->number);
+
+ seq_puts(seq, "\n");
+ /* } */
+ }
+ return 0;
+}
+
+static const struct seq_operations rose_node_seqops = {
+ .start = rose_node_start,
+ .next = rose_node_next,
+ .stop = rose_node_stop,
+ .show = rose_node_show,
+};
+
+static int rose_nodes_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rose_node_seqops);
+}
+
+const struct file_operations rose_nodes_fops = {
+ .owner = THIS_MODULE,
+ .open = rose_nodes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *rose_neigh_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rose_neigh_list_lock)
+{
+ struct rose_neigh *rose_neigh;
+ int i = 1;
+
+ spin_lock_bh(&rose_neigh_list_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (rose_neigh = rose_neigh_list; rose_neigh && i < *pos;
+ rose_neigh = rose_neigh->next, ++i);
+
+ return (i == *pos) ? rose_neigh : NULL;
+}
+
+static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+
+ return (v == SEQ_START_TOKEN) ? rose_neigh_list
+ : ((struct rose_neigh *)v)->next;
+}
+
+static void rose_neigh_stop(struct seq_file *seq, void *v)
+ __releases(rose_neigh_list_lock)
+{
+ spin_unlock_bh(&rose_neigh_list_lock);
+}
+
+static int rose_neigh_show(struct seq_file *seq, void *v)
+{
+ char buf[11];
+ int i;
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "addr callsign dev count use mode restart t0 tf digipeaters\n");
+ else {
+ struct rose_neigh *rose_neigh = v;
+
+ /* if (!rose_neigh->loopback) { */
+ seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu",
+ rose_neigh->number,
+ (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign),
+ rose_neigh->dev ? rose_neigh->dev->name : "???",
+ rose_neigh->count,
+ rose_neigh->use,
+ (rose_neigh->dce_mode) ? "DCE" : "DTE",
+ (rose_neigh->restarted) ? "yes" : "no",
+ ax25_display_timer(&rose_neigh->t0timer) / HZ,
+ ax25_display_timer(&rose_neigh->ftimer) / HZ);
+
+ if (rose_neigh->digipeat != NULL) {
+ for (i = 0; i < rose_neigh->digipeat->ndigi; i++)
+ seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i]));
+ }
+
+ seq_puts(seq, "\n");
+ }
+ return 0;
+}
+
+
+static const struct seq_operations rose_neigh_seqops = {
+ .start = rose_neigh_start,
+ .next = rose_neigh_next,
+ .stop = rose_neigh_stop,
+ .show = rose_neigh_show,
+};
+
+static int rose_neigh_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rose_neigh_seqops);
+}
+
+const struct file_operations rose_neigh_fops = {
+ .owner = THIS_MODULE,
+ .open = rose_neigh_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+static void *rose_route_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rose_route_list_lock)
+{
+ struct rose_route *rose_route;
+ int i = 1;
+
+ spin_lock_bh(&rose_route_list_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (rose_route = rose_route_list; rose_route && i < *pos;
+ rose_route = rose_route->next, ++i);
+
+ return (i == *pos) ? rose_route : NULL;
+}
+
+static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+
+ return (v == SEQ_START_TOKEN) ? rose_route_list
+ : ((struct rose_route *)v)->next;
+}
+
+static void rose_route_stop(struct seq_file *seq, void *v)
+ __releases(rose_route_list_lock)
+{
+ spin_unlock_bh(&rose_route_list_lock);
+}
+
+static int rose_route_show(struct seq_file *seq, void *v)
+{
+ char buf[11], rsbuf[11];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "lci address callsign neigh <-> lci address callsign neigh\n");
+ else {
+ struct rose_route *rose_route = v;
+
+ if (rose_route->neigh1)
+ seq_printf(seq,
+ "%3.3X %-10s %-9s %05d ",
+ rose_route->lci1,
+ rose2asc(rsbuf, &rose_route->src_addr),
+ ax2asc(buf, &rose_route->src_call),
+ rose_route->neigh1->number);
+ else
+ seq_puts(seq,
+ "000 * * 00000 ");
+
+ if (rose_route->neigh2)
+ seq_printf(seq,
+ "%3.3X %-10s %-9s %05d\n",
+ rose_route->lci2,
+ rose2asc(rsbuf, &rose_route->dest_addr),
+ ax2asc(buf, &rose_route->dest_call),
+ rose_route->neigh2->number);
+ else
+ seq_puts(seq,
+ "000 * * 00000\n");
+ }
+ return 0;
+}
+
+static const struct seq_operations rose_route_seqops = {
+ .start = rose_route_start,
+ .next = rose_route_next,
+ .stop = rose_route_stop,
+ .show = rose_route_show,
+};
+
+static int rose_route_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rose_route_seqops);
+}
+
+const struct file_operations rose_routes_fops = {
+ .owner = THIS_MODULE,
+ .open = rose_route_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Release all memory associated with ROSE routing structures.
+ */
+void __exit rose_rt_free(void)
+{
+ struct rose_neigh *s, *rose_neigh = rose_neigh_list;
+ struct rose_node *t, *rose_node = rose_node_list;
+ struct rose_route *u, *rose_route = rose_route_list;
+
+ while (rose_neigh != NULL) {
+ s = rose_neigh;
+ rose_neigh = rose_neigh->next;
+
+ rose_remove_neigh(s);
+ }
+
+ while (rose_node != NULL) {
+ t = rose_node;
+ rose_node = rose_node->next;
+
+ rose_remove_node(t);
+ }
+
+ while (rose_route != NULL) {
+ u = rose_route;
+ rose_route = rose_route->next;
+
+ rose_remove_route(u);
+ }
+}
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
new file mode 100644
index 000000000..7ca57741b
--- /dev/null
+++ b/net/rose/rose_subr.c
@@ -0,0 +1,556 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose);
+
+/*
+ * This routine purges all of the queues of frames.
+ */
+void rose_clear_queues(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_write_queue);
+ skb_queue_purge(&rose_sk(sk)->ack_queue);
+}
+
+/*
+ * This routine purges the input queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+ */
+void rose_frames_acked(struct sock *sk, unsigned short nr)
+{
+ struct sk_buff *skb;
+ struct rose_sock *rose = rose_sk(sk);
+
+ /*
+ * Remove all the ack-ed frames from the ack queue.
+ */
+ if (rose->va != nr) {
+ while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) {
+ skb = skb_dequeue(&rose->ack_queue);
+ kfree_skb(skb);
+ rose->va = (rose->va + 1) % ROSE_MODULUS;
+ }
+ }
+}
+
+void rose_requeue_frames(struct sock *sk)
+{
+ struct sk_buff *skb, *skb_prev = NULL;
+
+ /*
+ * Requeue all the un-ack-ed frames on the output queue to be picked
+ * up by rose_kick. This arrangement handles the possibility of an
+ * empty output queue.
+ */
+ while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) {
+ if (skb_prev == NULL)
+ skb_queue_head(&sk->sk_write_queue, skb);
+ else
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
+ skb_prev = skb;
+ }
+}
+
+/*
+ * Validate that the value of nr is between va and vs. Return true or
+ * false for testing.
+ */
+int rose_validate_nr(struct sock *sk, unsigned short nr)
+{
+ struct rose_sock *rose = rose_sk(sk);
+ unsigned short vc = rose->va;
+
+ while (vc != rose->vs) {
+ if (nr == vc) return 1;
+ vc = (vc + 1) % ROSE_MODULUS;
+ }
+
+ return nr == rose->vs;
+}
+
+/*
+ * This routine is called when the packet layer internally generates a
+ * control frame.
+ */
+void rose_write_internal(struct sock *sk, int frametype)
+{
+ struct rose_sock *rose = rose_sk(sk);
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ unsigned char lci1, lci2;
+ char buffer[100];
+ int len, faclen = 0;
+
+ len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1;
+
+ switch (frametype) {
+ case ROSE_CALL_REQUEST:
+ len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN;
+ faclen = rose_create_facilities(buffer, rose);
+ len += faclen;
+ break;
+ case ROSE_CALL_ACCEPTED:
+ case ROSE_CLEAR_REQUEST:
+ case ROSE_RESET_REQUEST:
+ len += 2;
+ break;
+ }
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ /*
+ * Space for AX.25 header and PID.
+ */
+ skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1);
+
+ dptr = skb_put(skb, skb_tailroom(skb));
+
+ lci1 = (rose->lci >> 8) & 0x0F;
+ lci2 = (rose->lci >> 0) & 0xFF;
+
+ switch (frametype) {
+ case ROSE_CALL_REQUEST:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr++ = frametype;
+ *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL;
+ memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN);
+ dptr += ROSE_ADDR_LEN;
+ memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN);
+ dptr += ROSE_ADDR_LEN;
+ memcpy(dptr, buffer, faclen);
+ dptr += faclen;
+ break;
+
+ case ROSE_CALL_ACCEPTED:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr++ = frametype;
+ *dptr++ = 0x00; /* Address length */
+ *dptr++ = 0; /* Facilities length */
+ break;
+
+ case ROSE_CLEAR_REQUEST:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr++ = frametype;
+ *dptr++ = rose->cause;
+ *dptr++ = rose->diagnostic;
+ break;
+
+ case ROSE_RESET_REQUEST:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr++ = frametype;
+ *dptr++ = ROSE_DTE_ORIGINATED;
+ *dptr++ = 0;
+ break;
+
+ case ROSE_RR:
+ case ROSE_RNR:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr = frametype;
+ *dptr++ |= (rose->vr << 5) & 0xE0;
+ break;
+
+ case ROSE_CLEAR_CONFIRMATION:
+ case ROSE_RESET_CONFIRMATION:
+ *dptr++ = ROSE_GFI | lci1;
+ *dptr++ = lci2;
+ *dptr++ = frametype;
+ break;
+
+ default:
+ printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype);
+ kfree_skb(skb);
+ return;
+ }
+
+ rose_transmit_link(skb, rose->neighbour);
+}
+
+int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m)
+{
+ unsigned char *frame;
+
+ frame = skb->data;
+
+ *ns = *nr = *q = *d = *m = 0;
+
+ switch (frame[2]) {
+ case ROSE_CALL_REQUEST:
+ case ROSE_CALL_ACCEPTED:
+ case ROSE_CLEAR_REQUEST:
+ case ROSE_CLEAR_CONFIRMATION:
+ case ROSE_RESET_REQUEST:
+ case ROSE_RESET_CONFIRMATION:
+ return frame[2];
+ default:
+ break;
+ }
+
+ if ((frame[2] & 0x1F) == ROSE_RR ||
+ (frame[2] & 0x1F) == ROSE_RNR) {
+ *nr = (frame[2] >> 5) & 0x07;
+ return frame[2] & 0x1F;
+ }
+
+ if ((frame[2] & 0x01) == ROSE_DATA) {
+ *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT;
+ *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT;
+ *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT;
+ *nr = (frame[2] >> 5) & 0x07;
+ *ns = (frame[2] >> 1) & 0x07;
+ return ROSE_DATA;
+ }
+
+ return ROSE_ILLEGAL;
+}
+
+static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len)
+{
+ unsigned char *pt;
+ unsigned char l, lg, n = 0;
+ int fac_national_digis_received = 0;
+
+ do {
+ switch (*p & 0xC0) {
+ case 0x00:
+ if (len < 2)
+ return -1;
+ p += 2;
+ n += 2;
+ len -= 2;
+ break;
+
+ case 0x40:
+ if (len < 3)
+ return -1;
+ if (*p == FAC_NATIONAL_RAND)
+ facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF);
+ p += 3;
+ n += 3;
+ len -= 3;
+ break;
+
+ case 0x80:
+ if (len < 4)
+ return -1;
+ p += 4;
+ n += 4;
+ len -= 4;
+ break;
+
+ case 0xC0:
+ if (len < 2)
+ return -1;
+ l = p[1];
+ if (len < 2 + l)
+ return -1;
+ if (*p == FAC_NATIONAL_DEST_DIGI) {
+ if (!fac_national_digis_received) {
+ if (l < AX25_ADDR_LEN)
+ return -1;
+ memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN);
+ facilities->source_ndigis = 1;
+ }
+ }
+ else if (*p == FAC_NATIONAL_SRC_DIGI) {
+ if (!fac_national_digis_received) {
+ if (l < AX25_ADDR_LEN)
+ return -1;
+ memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN);
+ facilities->dest_ndigis = 1;
+ }
+ }
+ else if (*p == FAC_NATIONAL_FAIL_CALL) {
+ if (l < AX25_ADDR_LEN)
+ return -1;
+ memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN);
+ }
+ else if (*p == FAC_NATIONAL_FAIL_ADD) {
+ if (l < 1 + ROSE_ADDR_LEN)
+ return -1;
+ memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN);
+ }
+ else if (*p == FAC_NATIONAL_DIGIS) {
+ if (l % AX25_ADDR_LEN)
+ return -1;
+ fac_national_digis_received = 1;
+ facilities->source_ndigis = 0;
+ facilities->dest_ndigis = 0;
+ for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) {
+ if (pt[6] & AX25_HBIT) {
+ if (facilities->dest_ndigis >= ROSE_MAX_DIGIS)
+ return -1;
+ memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN);
+ } else {
+ if (facilities->source_ndigis >= ROSE_MAX_DIGIS)
+ return -1;
+ memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN);
+ }
+ }
+ }
+ p += l + 2;
+ n += l + 2;
+ len -= l + 2;
+ break;
+ }
+ } while (*p != 0x00 && len > 0);
+
+ return n;
+}
+
+static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len)
+{
+ unsigned char l, n = 0;
+ char callsign[11];
+
+ do {
+ switch (*p & 0xC0) {
+ case 0x00:
+ if (len < 2)
+ return -1;
+ p += 2;
+ n += 2;
+ len -= 2;
+ break;
+
+ case 0x40:
+ if (len < 3)
+ return -1;
+ p += 3;
+ n += 3;
+ len -= 3;
+ break;
+
+ case 0x80:
+ if (len < 4)
+ return -1;
+ p += 4;
+ n += 4;
+ len -= 4;
+ break;
+
+ case 0xC0:
+ if (len < 2)
+ return -1;
+ l = p[1];
+
+ /* Prevent overflows*/
+ if (l < 10 || l > 20)
+ return -1;
+
+ if (*p == FAC_CCITT_DEST_NSAP) {
+ memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN);
+ memcpy(callsign, p + 12, l - 10);
+ callsign[l - 10] = '\0';
+ asc2ax(&facilities->source_call, callsign);
+ }
+ if (*p == FAC_CCITT_SRC_NSAP) {
+ memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN);
+ memcpy(callsign, p + 12, l - 10);
+ callsign[l - 10] = '\0';
+ asc2ax(&facilities->dest_call, callsign);
+ }
+ p += l + 2;
+ n += l + 2;
+ len -= l + 2;
+ break;
+ }
+ } while (*p != 0x00 && len > 0);
+
+ return n;
+}
+
+int rose_parse_facilities(unsigned char *p, unsigned packet_len,
+ struct rose_facilities_struct *facilities)
+{
+ int facilities_len, len;
+
+ facilities_len = *p++;
+
+ if (facilities_len == 0 || (unsigned int)facilities_len > packet_len)
+ return 0;
+
+ while (facilities_len >= 3 && *p == 0x00) {
+ facilities_len--;
+ p++;
+
+ switch (*p) {
+ case FAC_NATIONAL: /* National */
+ len = rose_parse_national(p + 1, facilities, facilities_len - 1);
+ break;
+
+ case FAC_CCITT: /* CCITT */
+ len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1);
+ break;
+
+ default:
+ printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p);
+ len = 1;
+ break;
+ }
+
+ if (len < 0)
+ return 0;
+ if (WARN_ON(len >= facilities_len))
+ return 0;
+ facilities_len -= len + 1;
+ p += len + 1;
+ }
+
+ return facilities_len == 0;
+}
+
+static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
+{
+ unsigned char *p = buffer + 1;
+ char *callsign;
+ char buf[11];
+ int len, nb;
+
+ /* National Facilities */
+ if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) {
+ *p++ = 0x00;
+ *p++ = FAC_NATIONAL;
+
+ if (rose->rand != 0) {
+ *p++ = FAC_NATIONAL_RAND;
+ *p++ = (rose->rand >> 8) & 0xFF;
+ *p++ = (rose->rand >> 0) & 0xFF;
+ }
+
+ /* Sent before older facilities */
+ if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) {
+ int maxdigi = 0;
+ *p++ = FAC_NATIONAL_DIGIS;
+ *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis);
+ for (nb = 0 ; nb < rose->source_ndigis ; nb++) {
+ if (++maxdigi >= ROSE_MAX_DIGIS)
+ break;
+ memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN);
+ p[6] |= AX25_HBIT;
+ p += AX25_ADDR_LEN;
+ }
+ for (nb = 0 ; nb < rose->dest_ndigis ; nb++) {
+ if (++maxdigi >= ROSE_MAX_DIGIS)
+ break;
+ memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN);
+ p[6] &= ~AX25_HBIT;
+ p += AX25_ADDR_LEN;
+ }
+ }
+
+ /* For compatibility */
+ if (rose->source_ndigis > 0) {
+ *p++ = FAC_NATIONAL_SRC_DIGI;
+ *p++ = AX25_ADDR_LEN;
+ memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN);
+ p += AX25_ADDR_LEN;
+ }
+
+ /* For compatibility */
+ if (rose->dest_ndigis > 0) {
+ *p++ = FAC_NATIONAL_DEST_DIGI;
+ *p++ = AX25_ADDR_LEN;
+ memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN);
+ p += AX25_ADDR_LEN;
+ }
+ }
+
+ *p++ = 0x00;
+ *p++ = FAC_CCITT;
+
+ *p++ = FAC_CCITT_DEST_NSAP;
+
+ callsign = ax2asc(buf, &rose->dest_call);
+
+ *p++ = strlen(callsign) + 10;
+ *p++ = (strlen(callsign) + 9) * 2; /* ??? */
+
+ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11;
+ *p++ = ROSE_ADDR_LEN * 2;
+ memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN);
+ p += ROSE_ADDR_LEN;
+
+ memcpy(p, callsign, strlen(callsign));
+ p += strlen(callsign);
+
+ *p++ = FAC_CCITT_SRC_NSAP;
+
+ callsign = ax2asc(buf, &rose->source_call);
+
+ *p++ = strlen(callsign) + 10;
+ *p++ = (strlen(callsign) + 9) * 2; /* ??? */
+
+ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11;
+ *p++ = ROSE_ADDR_LEN * 2;
+ memcpy(p, &rose->source_addr, ROSE_ADDR_LEN);
+ p += ROSE_ADDR_LEN;
+
+ memcpy(p, callsign, strlen(callsign));
+ p += strlen(callsign);
+
+ len = p - buffer;
+ buffer[0] = len - 1;
+
+ return len;
+}
+
+void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ rose_stop_timer(sk);
+ rose_stop_idletimer(sk);
+
+ rose_clear_queues(sk);
+
+ rose->lci = 0;
+ rose->state = ROSE_STATE_0;
+
+ if (cause != -1)
+ rose->cause = cause;
+
+ if (diagnostic != -1)
+ rose->diagnostic = diagnostic;
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = reason;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+}
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
new file mode 100644
index 000000000..bc5469d6d
--- /dev/null
+++ b/net/rose/rose_timer.c
@@ -0,0 +1,216 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+static void rose_heartbeat_expiry(unsigned long);
+static void rose_timer_expiry(unsigned long);
+static void rose_idletimer_expiry(unsigned long);
+
+void rose_start_heartbeat(struct sock *sk)
+{
+ del_timer(&sk->sk_timer);
+
+ sk->sk_timer.data = (unsigned long)sk;
+ sk->sk_timer.function = &rose_heartbeat_expiry;
+ sk->sk_timer.expires = jiffies + 5 * HZ;
+
+ add_timer(&sk->sk_timer);
+}
+
+void rose_start_t1timer(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ del_timer(&rose->timer);
+
+ rose->timer.data = (unsigned long)sk;
+ rose->timer.function = &rose_timer_expiry;
+ rose->timer.expires = jiffies + rose->t1;
+
+ add_timer(&rose->timer);
+}
+
+void rose_start_t2timer(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ del_timer(&rose->timer);
+
+ rose->timer.data = (unsigned long)sk;
+ rose->timer.function = &rose_timer_expiry;
+ rose->timer.expires = jiffies + rose->t2;
+
+ add_timer(&rose->timer);
+}
+
+void rose_start_t3timer(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ del_timer(&rose->timer);
+
+ rose->timer.data = (unsigned long)sk;
+ rose->timer.function = &rose_timer_expiry;
+ rose->timer.expires = jiffies + rose->t3;
+
+ add_timer(&rose->timer);
+}
+
+void rose_start_hbtimer(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ del_timer(&rose->timer);
+
+ rose->timer.data = (unsigned long)sk;
+ rose->timer.function = &rose_timer_expiry;
+ rose->timer.expires = jiffies + rose->hb;
+
+ add_timer(&rose->timer);
+}
+
+void rose_start_idletimer(struct sock *sk)
+{
+ struct rose_sock *rose = rose_sk(sk);
+
+ del_timer(&rose->idletimer);
+
+ if (rose->idle > 0) {
+ rose->idletimer.data = (unsigned long)sk;
+ rose->idletimer.function = &rose_idletimer_expiry;
+ rose->idletimer.expires = jiffies + rose->idle;
+
+ add_timer(&rose->idletimer);
+ }
+}
+
+void rose_stop_heartbeat(struct sock *sk)
+{
+ del_timer(&sk->sk_timer);
+}
+
+void rose_stop_timer(struct sock *sk)
+{
+ del_timer(&rose_sk(sk)->timer);
+}
+
+void rose_stop_idletimer(struct sock *sk)
+{
+ del_timer(&rose_sk(sk)->idletimer);
+}
+
+static void rose_heartbeat_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct rose_sock *rose = rose_sk(sk);
+
+ bh_lock_sock(sk);
+ switch (rose->state) {
+ case ROSE_STATE_0:
+ /* Magic here: If we listen() and a new link dies before it
+ is accepted() it isn't 'dead' so doesn't get removed. */
+ if (sock_flag(sk, SOCK_DESTROY) ||
+ (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+ bh_unlock_sock(sk);
+ rose_destroy_socket(sk);
+ return;
+ }
+ break;
+
+ case ROSE_STATE_3:
+ /*
+ * Check for the state of the receive buffer.
+ */
+ if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
+ (rose->condition & ROSE_COND_OWN_RX_BUSY)) {
+ rose->condition &= ~ROSE_COND_OWN_RX_BUSY;
+ rose->condition &= ~ROSE_COND_ACK_PENDING;
+ rose->vl = rose->vr;
+ rose_write_internal(sk, ROSE_RR);
+ rose_stop_timer(sk); /* HB */
+ break;
+ }
+ break;
+ }
+
+ rose_start_heartbeat(sk);
+ bh_unlock_sock(sk);
+}
+
+static void rose_timer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+ struct rose_sock *rose = rose_sk(sk);
+
+ bh_lock_sock(sk);
+ switch (rose->state) {
+ case ROSE_STATE_1: /* T1 */
+ case ROSE_STATE_4: /* T2 */
+ rose_write_internal(sk, ROSE_CLEAR_REQUEST);
+ rose->state = ROSE_STATE_2;
+ rose_start_t3timer(sk);
+ break;
+
+ case ROSE_STATE_2: /* T3 */
+ rose->neighbour->use--;
+ rose_disconnect(sk, ETIMEDOUT, -1, -1);
+ break;
+
+ case ROSE_STATE_3: /* HB */
+ if (rose->condition & ROSE_COND_ACK_PENDING) {
+ rose->condition &= ~ROSE_COND_ACK_PENDING;
+ rose_enquiry_response(sk);
+ }
+ break;
+ }
+ bh_unlock_sock(sk);
+}
+
+static void rose_idletimer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+
+ bh_lock_sock(sk);
+ rose_clear_queues(sk);
+
+ rose_write_internal(sk, ROSE_CLEAR_REQUEST);
+ rose_sk(sk)->state = ROSE_STATE_2;
+
+ rose_start_t3timer(sk);
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = 0;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+ bh_unlock_sock(sk);
+}
diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c
new file mode 100644
index 000000000..89a927879
--- /dev/null
+++ b/net/rose/sysctl_net_rose.c
@@ -0,0 +1,129 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <net/ax25.h>
+#include <net/rose.h>
+
+static int min_timer[] = {1 * HZ};
+static int max_timer[] = {300 * HZ};
+static int min_idle[] = {0 * HZ};
+static int max_idle[] = {65535 * HZ};
+static int min_route[1], max_route[] = {1};
+static int min_ftimer[] = {60 * HZ};
+static int max_ftimer[] = {600 * HZ};
+static int min_maxvcs[] = {1}, max_maxvcs[] = {254};
+static int min_window[] = {1}, max_window[] = {7};
+
+static struct ctl_table_header *rose_table_header;
+
+static struct ctl_table rose_table[] = {
+ {
+ .procname = "restart_request_timeout",
+ .data = &sysctl_rose_restart_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer
+ },
+ {
+ .procname = "call_request_timeout",
+ .data = &sysctl_rose_call_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer
+ },
+ {
+ .procname = "reset_request_timeout",
+ .data = &sysctl_rose_reset_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer
+ },
+ {
+ .procname = "clear_request_timeout",
+ .data = &sysctl_rose_clear_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer
+ },
+ {
+ .procname = "no_activity_timeout",
+ .data = &sysctl_rose_no_activity_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_idle,
+ .extra2 = &max_idle
+ },
+ {
+ .procname = "acknowledge_hold_back_timeout",
+ .data = &sysctl_rose_ack_hold_back_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer
+ },
+ {
+ .procname = "routing_control",
+ .data = &sysctl_rose_routing_control,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_route,
+ .extra2 = &max_route
+ },
+ {
+ .procname = "link_fail_timeout",
+ .data = &sysctl_rose_link_fail_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ftimer,
+ .extra2 = &max_ftimer
+ },
+ {
+ .procname = "maximum_virtual_circuits",
+ .data = &sysctl_rose_maximum_vcs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_maxvcs,
+ .extra2 = &max_maxvcs
+ },
+ {
+ .procname = "window_size",
+ .data = &sysctl_rose_window_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_window,
+ .extra2 = &max_window
+ },
+ { }
+};
+
+void __init rose_register_sysctl(void)
+{
+ rose_table_header = register_net_sysctl(&init_net, "net/rose", rose_table);
+}
+
+void rose_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(rose_table_header);
+}
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
new file mode 100644
index 000000000..23dcef12b
--- /dev/null
+++ b/net/rxrpc/Kconfig
@@ -0,0 +1,44 @@
+#
+# RxRPC session sockets
+#
+
+config AF_RXRPC
+ tristate "RxRPC session sockets"
+ depends on INET
+ select CRYPTO
+ select KEYS
+ help
+ Say Y or M here to include support for RxRPC session sockets (just
+ the transport part, not the presentation part: (un)marshalling is
+ left to the application).
+
+ These are used for AFS kernel filesystem and userspace utilities.
+
+ This module at the moment only supports client operations and is
+ currently incomplete.
+
+ See Documentation/networking/rxrpc.txt.
+
+
+config AF_RXRPC_DEBUG
+ bool "RxRPC dynamic debugging"
+ depends on AF_RXRPC
+ help
+ Say Y here to make runtime controllable debugging messages appear.
+
+ See Documentation/networking/rxrpc.txt.
+
+
+config RXKAD
+ tristate "RxRPC Kerberos security"
+ depends on AF_RXRPC
+ select CRYPTO
+ select CRYPTO_MANAGER
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_PCBC
+ select CRYPTO_FCRYPT
+ help
+ Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC
+ through the use of the key retention service.
+
+ See Documentation/networking/rxrpc.txt.
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
new file mode 100644
index 000000000..ec126f912
--- /dev/null
+++ b/net/rxrpc/Makefile
@@ -0,0 +1,28 @@
+#
+# Makefile for Linux kernel RxRPC
+#
+
+af-rxrpc-y := \
+ af_rxrpc.o \
+ ar-accept.o \
+ ar-ack.o \
+ ar-call.o \
+ ar-connection.o \
+ ar-connevent.o \
+ ar-error.o \
+ ar-input.o \
+ ar-key.o \
+ ar-local.o \
+ ar-output.o \
+ ar-peer.o \
+ ar-recvmsg.o \
+ ar-security.o \
+ ar-skbuff.o \
+ ar-transport.o
+
+af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o
+af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o
+
+obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
+
+obj-$(CONFIG_RXKAD) += rxkad.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
new file mode 100644
index 000000000..0095b9a0b
--- /dev/null
+++ b/net/rxrpc/af_rxrpc.c
@@ -0,0 +1,898 @@
+/* AF_RXRPC implementation
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/key-type.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+MODULE_DESCRIPTION("RxRPC network protocol");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_RXRPC);
+
+unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO;
+module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "RxRPC debugging mask");
+
+static int sysctl_rxrpc_max_qlen __read_mostly = 10;
+
+static struct proto rxrpc_proto;
+static const struct proto_ops rxrpc_rpc_ops;
+
+/* local epoch for detecting local-end reset */
+__be32 rxrpc_epoch;
+
+/* current debugging ID */
+atomic_t rxrpc_debug_id;
+
+/* count of skbs currently in use */
+atomic_t rxrpc_n_skbs;
+
+struct workqueue_struct *rxrpc_workqueue;
+
+static void rxrpc_sock_destructor(struct sock *);
+
+/*
+ * see if an RxRPC socket is currently writable
+ */
+static inline int rxrpc_writable(struct sock *sk)
+{
+ return atomic_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf;
+}
+
+/*
+ * wait for write bufferage to become available
+ */
+static void rxrpc_write_space(struct sock *sk)
+{
+ _enter("%p", sk);
+ rcu_read_lock();
+ if (rxrpc_writable(sk)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * validate an RxRPC address
+ */
+static int rxrpc_validate_address(struct rxrpc_sock *rx,
+ struct sockaddr_rxrpc *srx,
+ int len)
+{
+ if (len < sizeof(struct sockaddr_rxrpc))
+ return -EINVAL;
+
+ if (srx->srx_family != AF_RXRPC)
+ return -EAFNOSUPPORT;
+
+ if (srx->transport_type != SOCK_DGRAM)
+ return -ESOCKTNOSUPPORT;
+
+ len -= offsetof(struct sockaddr_rxrpc, transport);
+ if (srx->transport_len < sizeof(sa_family_t) ||
+ srx->transport_len > len)
+ return -EINVAL;
+
+ if (srx->transport.family != rx->proto)
+ return -EAFNOSUPPORT;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ _debug("INET: %x @ %pI4",
+ ntohs(srx->transport.sin.sin_port),
+ &srx->transport.sin.sin_addr);
+ if (srx->transport_len > 8)
+ memset((void *)&srx->transport + 8, 0,
+ srx->transport_len - 8);
+ break;
+
+ case AF_INET6:
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+/*
+ * bind a local address to an RxRPC socket
+ */
+static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+ struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr;
+ struct sock *sk = sock->sk;
+ struct rxrpc_local *local;
+ struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
+ __be16 service_id;
+ int ret;
+
+ _enter("%p,%p,%d", rx, saddr, len);
+
+ ret = rxrpc_validate_address(rx, srx, len);
+ if (ret < 0)
+ goto error;
+
+ lock_sock(&rx->sk);
+
+ if (rx->sk.sk_state != RXRPC_UNCONNECTED) {
+ ret = -EINVAL;
+ goto error_unlock;
+ }
+
+ memcpy(&rx->srx, srx, sizeof(rx->srx));
+
+ /* find a local transport endpoint if we don't have one already */
+ local = rxrpc_lookup_local(&rx->srx);
+ if (IS_ERR(local)) {
+ ret = PTR_ERR(local);
+ goto error_unlock;
+ }
+
+ rx->local = local;
+ if (srx->srx_service) {
+ service_id = htons(srx->srx_service);
+ write_lock_bh(&local->services_lock);
+ list_for_each_entry(prx, &local->services, listen_link) {
+ if (prx->service_id == service_id)
+ goto service_in_use;
+ }
+
+ rx->service_id = service_id;
+ list_add_tail(&rx->listen_link, &local->services);
+ write_unlock_bh(&local->services_lock);
+
+ rx->sk.sk_state = RXRPC_SERVER_BOUND;
+ } else {
+ rx->sk.sk_state = RXRPC_CLIENT_BOUND;
+ }
+
+ release_sock(&rx->sk);
+ _leave(" = 0");
+ return 0;
+
+service_in_use:
+ ret = -EADDRINUSE;
+ write_unlock_bh(&local->services_lock);
+error_unlock:
+ release_sock(&rx->sk);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * set the number of pending calls permitted on a listening socket
+ */
+static int rxrpc_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret;
+
+ _enter("%p,%d", rx, backlog);
+
+ lock_sock(&rx->sk);
+
+ switch (rx->sk.sk_state) {
+ case RXRPC_UNCONNECTED:
+ ret = -EADDRNOTAVAIL;
+ break;
+ case RXRPC_CLIENT_BOUND:
+ case RXRPC_CLIENT_CONNECTED:
+ default:
+ ret = -EBUSY;
+ break;
+ case RXRPC_SERVER_BOUND:
+ ASSERT(rx->local != NULL);
+ sk->sk_max_ack_backlog = backlog;
+ rx->sk.sk_state = RXRPC_SERVER_LISTENING;
+ ret = 0;
+ break;
+ }
+
+ release_sock(&rx->sk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * find a transport by address
+ */
+static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock,
+ struct sockaddr *addr,
+ int addr_len, int flags,
+ gfp_t gfp)
+{
+ struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
+ struct rxrpc_transport *trans;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct rxrpc_peer *peer;
+
+ _enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
+
+ ASSERT(rx->local != NULL);
+ ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED);
+
+ if (rx->srx.transport_type != srx->transport_type)
+ return ERR_PTR(-ESOCKTNOSUPPORT);
+ if (rx->srx.transport.family != srx->transport.family)
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ /* find a remote transport endpoint from the local one */
+ peer = rxrpc_get_peer(srx, gfp);
+ if (IS_ERR(peer))
+ return ERR_CAST(peer);
+
+ /* find a transport */
+ trans = rxrpc_get_transport(rx->local, peer, gfp);
+ rxrpc_put_peer(peer);
+ _leave(" = %p", trans);
+ return trans;
+}
+
+/**
+ * rxrpc_kernel_begin_call - Allow a kernel service to begin a call
+ * @sock: The socket on which to make the call
+ * @srx: The address of the peer to contact (defaults to socket setting)
+ * @key: The security context to use (defaults to socket setting)
+ * @user_call_ID: The ID to use
+ *
+ * Allow a kernel service to begin a call on the nominated socket. This just
+ * sets up all the internal tracking structures and allocates connection and
+ * call IDs as appropriate. The call to be used is returned.
+ *
+ * The default socket destination address and security may be overridden by
+ * supplying @srx and @key.
+ */
+struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+ struct sockaddr_rxrpc *srx,
+ struct key *key,
+ unsigned long user_call_ID,
+ gfp_t gfp)
+{
+ struct rxrpc_conn_bundle *bundle;
+ struct rxrpc_transport *trans;
+ struct rxrpc_call *call;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ __be16 service_id;
+
+ _enter(",,%x,%lx", key_serial(key), user_call_ID);
+
+ lock_sock(&rx->sk);
+
+ if (srx) {
+ trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx,
+ sizeof(*srx), 0, gfp);
+ if (IS_ERR(trans)) {
+ call = ERR_CAST(trans);
+ trans = NULL;
+ goto out_notrans;
+ }
+ } else {
+ trans = rx->trans;
+ if (!trans) {
+ call = ERR_PTR(-ENOTCONN);
+ goto out_notrans;
+ }
+ atomic_inc(&trans->usage);
+ }
+
+ service_id = rx->service_id;
+ if (srx)
+ service_id = htons(srx->srx_service);
+
+ if (!key)
+ key = rx->key;
+ if (key && !key->payload.data)
+ key = NULL; /* a no-security key */
+
+ bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
+ if (IS_ERR(bundle)) {
+ call = ERR_CAST(bundle);
+ goto out;
+ }
+
+ call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true,
+ gfp);
+ rxrpc_put_bundle(trans, bundle);
+out:
+ rxrpc_put_transport(trans);
+out_notrans:
+ release_sock(&rx->sk);
+ _leave(" = %p", call);
+ return call;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_begin_call);
+
+/**
+ * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
+ * @call: The call to end
+ *
+ * Allow a kernel service to end a call it was using. The call must be
+ * complete before this is called (the call should be aborted if necessary).
+ */
+void rxrpc_kernel_end_call(struct rxrpc_call *call)
+{
+ _enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
+ rxrpc_remove_user_ID(call->socket, call);
+ rxrpc_put_call(call);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_end_call);
+
+/**
+ * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages
+ * @sock: The socket to intercept received messages on
+ * @interceptor: The function to pass the messages to
+ *
+ * Allow a kernel service to intercept messages heading for the Rx queue on an
+ * RxRPC socket. They get passed to the specified function instead.
+ * @interceptor should free the socket buffers it is given. @interceptor is
+ * called with the socket receive queue spinlock held and softirqs disabled -
+ * this ensures that the messages will be delivered in the right order.
+ */
+void rxrpc_kernel_intercept_rx_messages(struct socket *sock,
+ rxrpc_interceptor_t interceptor)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+
+ _enter("");
+ rx->interceptor = interceptor;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
+
+/*
+ * connect an RxRPC socket
+ * - this just targets it at a specific destination; no actual connection
+ * negotiation takes place
+ */
+static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
+ int addr_len, int flags)
+{
+ struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
+ struct sock *sk = sock->sk;
+ struct rxrpc_transport *trans;
+ struct rxrpc_local *local;
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+ int ret;
+
+ _enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
+
+ ret = rxrpc_validate_address(rx, srx, addr_len);
+ if (ret < 0) {
+ _leave(" = %d [bad addr]", ret);
+ return ret;
+ }
+
+ lock_sock(&rx->sk);
+
+ switch (rx->sk.sk_state) {
+ case RXRPC_UNCONNECTED:
+ /* find a local transport endpoint if we don't have one already */
+ ASSERTCMP(rx->local, ==, NULL);
+ rx->srx.srx_family = AF_RXRPC;
+ rx->srx.srx_service = 0;
+ rx->srx.transport_type = srx->transport_type;
+ rx->srx.transport_len = sizeof(sa_family_t);
+ rx->srx.transport.family = srx->transport.family;
+ local = rxrpc_lookup_local(&rx->srx);
+ if (IS_ERR(local)) {
+ release_sock(&rx->sk);
+ return PTR_ERR(local);
+ }
+ rx->local = local;
+ rx->sk.sk_state = RXRPC_CLIENT_BOUND;
+ case RXRPC_CLIENT_BOUND:
+ break;
+ case RXRPC_CLIENT_CONNECTED:
+ release_sock(&rx->sk);
+ return -EISCONN;
+ default:
+ release_sock(&rx->sk);
+ return -EBUSY; /* server sockets can't connect as well */
+ }
+
+ trans = rxrpc_name_to_transport(sock, addr, addr_len, flags,
+ GFP_KERNEL);
+ if (IS_ERR(trans)) {
+ release_sock(&rx->sk);
+ _leave(" = %ld", PTR_ERR(trans));
+ return PTR_ERR(trans);
+ }
+
+ rx->trans = trans;
+ rx->service_id = htons(srx->srx_service);
+ rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
+
+ release_sock(&rx->sk);
+ return 0;
+}
+
+/*
+ * send a message through an RxRPC socket
+ * - in a client this does a number of things:
+ * - finds/sets up a connection for the security specified (if any)
+ * - initiates a call (ID in control data)
+ * - ends the request phase of a call (if MSG_MORE is not set)
+ * - sends a call data packet
+ * - may send an abort (abort code in control data)
+ */
+static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
+{
+ struct rxrpc_transport *trans;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ int ret;
+
+ _enter(",{%d},,%zu", rx->sk.sk_state, len);
+
+ if (m->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ if (m->msg_name) {
+ ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen);
+ if (ret < 0) {
+ _leave(" = %d [bad addr]", ret);
+ return ret;
+ }
+ }
+
+ trans = NULL;
+ lock_sock(&rx->sk);
+
+ if (m->msg_name) {
+ ret = -EISCONN;
+ trans = rxrpc_name_to_transport(sock, m->msg_name,
+ m->msg_namelen, 0, GFP_KERNEL);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ } else {
+ trans = rx->trans;
+ if (trans)
+ atomic_inc(&trans->usage);
+ }
+
+ switch (rx->sk.sk_state) {
+ case RXRPC_SERVER_LISTENING:
+ if (!m->msg_name) {
+ ret = rxrpc_server_sendmsg(rx, m, len);
+ break;
+ }
+ case RXRPC_SERVER_BOUND:
+ case RXRPC_CLIENT_BOUND:
+ if (!m->msg_name) {
+ ret = -ENOTCONN;
+ break;
+ }
+ case RXRPC_CLIENT_CONNECTED:
+ ret = rxrpc_client_sendmsg(rx, trans, m, len);
+ break;
+ default:
+ ret = -ENOTCONN;
+ break;
+ }
+
+out:
+ release_sock(&rx->sk);
+ if (trans)
+ rxrpc_put_transport(trans);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * set RxRPC socket options
+ */
+static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ unsigned int min_sec_level;
+ int ret;
+
+ _enter(",%d,%d,,%d", level, optname, optlen);
+
+ lock_sock(&rx->sk);
+ ret = -EOPNOTSUPP;
+
+ if (level == SOL_RXRPC) {
+ switch (optname) {
+ case RXRPC_EXCLUSIVE_CONNECTION:
+ ret = -EINVAL;
+ if (optlen != 0)
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ goto error;
+ set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags);
+ goto success;
+
+ case RXRPC_SECURITY_KEY:
+ ret = -EINVAL;
+ if (rx->key)
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ goto error;
+ ret = rxrpc_request_key(rx, optval, optlen);
+ goto error;
+
+ case RXRPC_SECURITY_KEYRING:
+ ret = -EINVAL;
+ if (rx->key)
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ goto error;
+ ret = rxrpc_server_keyring(rx, optval, optlen);
+ goto error;
+
+ case RXRPC_MIN_SECURITY_LEVEL:
+ ret = -EINVAL;
+ if (optlen != sizeof(unsigned int))
+ goto error;
+ ret = -EISCONN;
+ if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+ goto error;
+ ret = get_user(min_sec_level,
+ (unsigned int __user *) optval);
+ if (ret < 0)
+ goto error;
+ ret = -EINVAL;
+ if (min_sec_level > RXRPC_SECURITY_MAX)
+ goto error;
+ rx->min_sec_level = min_sec_level;
+ goto success;
+
+ default:
+ break;
+ }
+ }
+
+success:
+ ret = 0;
+error:
+ release_sock(&rx->sk);
+ return ret;
+}
+
+/*
+ * permit an RxRPC socket to be polled
+ */
+static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* the socket is readable if there are any messages waiting on the Rx
+ * queue */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* the socket is writable if there is space to add new data to the
+ * socket; there is no guarantee that any particular call in progress
+ * on the socket may have space in the Tx ACK window */
+ if (rxrpc_writable(sk))
+ mask |= POLLOUT | POLLWRNORM;
+
+ return mask;
+}
+
+/*
+ * create an RxRPC socket
+ */
+static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct rxrpc_sock *rx;
+ struct sock *sk;
+
+ _enter("%p,%d", sock, protocol);
+
+ if (!net_eq(net, &init_net))
+ return -EAFNOSUPPORT;
+
+ /* we support transport protocol UDP only */
+ if (protocol != PF_INET)
+ return -EPROTONOSUPPORT;
+
+ if (sock->type != SOCK_DGRAM)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &rxrpc_rpc_ops;
+ sock->state = SS_UNCONNECTED;
+
+ sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ sock_init_data(sock, sk);
+ sk->sk_state = RXRPC_UNCONNECTED;
+ sk->sk_write_space = rxrpc_write_space;
+ sk->sk_max_ack_backlog = sysctl_rxrpc_max_qlen;
+ sk->sk_destruct = rxrpc_sock_destructor;
+
+ rx = rxrpc_sk(sk);
+ rx->proto = protocol;
+ rx->calls = RB_ROOT;
+
+ INIT_LIST_HEAD(&rx->listen_link);
+ INIT_LIST_HEAD(&rx->secureq);
+ INIT_LIST_HEAD(&rx->acceptq);
+ rwlock_init(&rx->call_lock);
+ memset(&rx->srx, 0, sizeof(rx->srx));
+
+ _leave(" = 0 [%p]", rx);
+ return 0;
+}
+
+/*
+ * RxRPC socket destructor
+ */
+static void rxrpc_sock_destructor(struct sock *sk)
+{
+ _enter("%p", sk);
+
+ rxrpc_purge_queue(&sk->sk_receive_queue);
+
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(!sk_unhashed(sk));
+ WARN_ON(sk->sk_socket);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ printk("Attempt to release alive rxrpc socket: %p\n", sk);
+ return;
+ }
+}
+
+/*
+ * release an RxRPC socket
+ */
+static int rxrpc_release_sock(struct sock *sk)
+{
+ struct rxrpc_sock *rx = rxrpc_sk(sk);
+
+ _enter("%p{%d,%d}", sk, sk->sk_state, atomic_read(&sk->sk_refcnt));
+
+ /* declare the socket closed for business */
+ sock_orphan(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ sk->sk_state = RXRPC_CLOSE;
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1);
+
+ if (!list_empty(&rx->listen_link)) {
+ write_lock_bh(&rx->local->services_lock);
+ list_del(&rx->listen_link);
+ write_unlock_bh(&rx->local->services_lock);
+ }
+
+ /* try to flush out this socket */
+ rxrpc_release_calls_on_socket(rx);
+ flush_workqueue(rxrpc_workqueue);
+ rxrpc_purge_queue(&sk->sk_receive_queue);
+
+ if (rx->conn) {
+ rxrpc_put_connection(rx->conn);
+ rx->conn = NULL;
+ }
+
+ if (rx->bundle) {
+ rxrpc_put_bundle(rx->trans, rx->bundle);
+ rx->bundle = NULL;
+ }
+ if (rx->trans) {
+ rxrpc_put_transport(rx->trans);
+ rx->trans = NULL;
+ }
+ if (rx->local) {
+ rxrpc_put_local(rx->local);
+ rx->local = NULL;
+ }
+
+ key_put(rx->key);
+ rx->key = NULL;
+ key_put(rx->securities);
+ rx->securities = NULL;
+ sock_put(sk);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * release an RxRPC BSD socket on close() or equivalent
+ */
+static int rxrpc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ _enter("%p{%p}", sock, sk);
+
+ if (!sk)
+ return 0;
+
+ sock->sk = NULL;
+
+ return rxrpc_release_sock(sk);
+}
+
+/*
+ * RxRPC network protocol
+ */
+static const struct proto_ops rxrpc_rpc_ops = {
+ .family = PF_UNIX,
+ .owner = THIS_MODULE,
+ .release = rxrpc_release,
+ .bind = rxrpc_bind,
+ .connect = rxrpc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = rxrpc_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = rxrpc_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = rxrpc_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = rxrpc_sendmsg,
+ .recvmsg = rxrpc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto rxrpc_proto = {
+ .name = "RXRPC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rxrpc_sock),
+ .max_header = sizeof(struct rxrpc_header),
+};
+
+static const struct net_proto_family rxrpc_family_ops = {
+ .family = PF_RXRPC,
+ .create = rxrpc_create,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * initialise and register the RxRPC protocol
+ */
+static int __init af_rxrpc_init(void)
+{
+ int ret = -1;
+
+ BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
+
+ rxrpc_epoch = htonl(get_seconds());
+
+ ret = -ENOMEM;
+ rxrpc_call_jar = kmem_cache_create(
+ "rxrpc_call_jar", sizeof(struct rxrpc_call), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!rxrpc_call_jar) {
+ printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n");
+ goto error_call_jar;
+ }
+
+ rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1);
+ if (!rxrpc_workqueue) {
+ printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n");
+ goto error_work_queue;
+ }
+
+ ret = proto_register(&rxrpc_proto, 1);
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot register protocol\n");
+ goto error_proto;
+ }
+
+ ret = sock_register(&rxrpc_family_ops);
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot register socket family\n");
+ goto error_sock;
+ }
+
+ ret = register_key_type(&key_type_rxrpc);
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot register client key type\n");
+ goto error_key_type;
+ }
+
+ ret = register_key_type(&key_type_rxrpc_s);
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot register server key type\n");
+ goto error_key_type_s;
+ }
+
+ ret = rxrpc_sysctl_init();
+ if (ret < 0) {
+ printk(KERN_CRIT "RxRPC: Cannot register sysctls\n");
+ goto error_sysctls;
+ }
+
+#ifdef CONFIG_PROC_FS
+ proc_create("rxrpc_calls", 0, init_net.proc_net, &rxrpc_call_seq_fops);
+ proc_create("rxrpc_conns", 0, init_net.proc_net,
+ &rxrpc_connection_seq_fops);
+#endif
+ return 0;
+
+error_sysctls:
+ unregister_key_type(&key_type_rxrpc_s);
+error_key_type_s:
+ unregister_key_type(&key_type_rxrpc);
+error_key_type:
+ sock_unregister(PF_RXRPC);
+error_sock:
+ proto_unregister(&rxrpc_proto);
+error_proto:
+ destroy_workqueue(rxrpc_workqueue);
+error_work_queue:
+ kmem_cache_destroy(rxrpc_call_jar);
+error_call_jar:
+ return ret;
+}
+
+/*
+ * unregister the RxRPC protocol
+ */
+static void __exit af_rxrpc_exit(void)
+{
+ _enter("");
+ rxrpc_sysctl_exit();
+ unregister_key_type(&key_type_rxrpc_s);
+ unregister_key_type(&key_type_rxrpc);
+ sock_unregister(PF_RXRPC);
+ proto_unregister(&rxrpc_proto);
+ rxrpc_destroy_all_calls();
+ rxrpc_destroy_all_connections();
+ rxrpc_destroy_all_transports();
+ rxrpc_destroy_all_peers();
+ rxrpc_destroy_all_locals();
+
+ ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+
+ _debug("flush scheduled work");
+ flush_workqueue(rxrpc_workqueue);
+ remove_proc_entry("rxrpc_conns", init_net.proc_net);
+ remove_proc_entry("rxrpc_calls", init_net.proc_net);
+ destroy_workqueue(rxrpc_workqueue);
+ kmem_cache_destroy(rxrpc_call_jar);
+ _leave("");
+}
+
+module_init(af_rxrpc_init);
+module_exit(af_rxrpc_exit);
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
new file mode 100644
index 000000000..6d79310fc
--- /dev/null
+++ b/net/rxrpc/ar-accept.c
@@ -0,0 +1,510 @@
+/* incoming call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * generate a connection-level abort
+ */
+static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
+ struct rxrpc_header *hdr)
+{
+ struct msghdr msg;
+ struct kvec iov[1];
+ size_t len;
+ int ret;
+
+ _enter("%d,,", local->debug_id);
+
+ msg.msg_name = &srx->transport.sin;
+ msg.msg_namelen = sizeof(srx->transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr->seq = 0;
+ hdr->type = RXRPC_PACKET_TYPE_BUSY;
+ hdr->flags = 0;
+ hdr->userStatus = 0;
+ hdr->_rsvd = 0;
+
+ iov[0].iov_base = hdr;
+ iov[0].iov_len = sizeof(*hdr);
+
+ len = iov[0].iov_len;
+
+ hdr->serial = htonl(1);
+ _proto("Tx BUSY %%%u", ntohl(hdr->serial));
+
+ ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
+ if (ret < 0) {
+ _leave(" = -EAGAIN [sendmsg failed: %d]", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * accept an incoming call that needs peer, transport and/or connection setting
+ * up
+ */
+static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
+ struct rxrpc_sock *rx,
+ struct sk_buff *skb,
+ struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_transport *trans;
+ struct rxrpc_skb_priv *sp, *nsp;
+ struct rxrpc_peer *peer;
+ struct rxrpc_call *call;
+ struct sk_buff *notification;
+ int ret;
+
+ _enter("");
+
+ sp = rxrpc_skb(skb);
+
+ /* get a notification message to send to the server app */
+ notification = alloc_skb(0, GFP_NOFS);
+ if (!notification) {
+ _debug("no memory");
+ ret = -ENOMEM;
+ goto error_nofree;
+ }
+ rxrpc_new_skb(notification);
+ notification->mark = RXRPC_SKB_MARK_NEW_CALL;
+
+ peer = rxrpc_get_peer(srx, GFP_NOIO);
+ if (IS_ERR(peer)) {
+ _debug("no peer");
+ ret = -EBUSY;
+ goto error;
+ }
+
+ trans = rxrpc_get_transport(local, peer, GFP_NOIO);
+ rxrpc_put_peer(peer);
+ if (IS_ERR(trans)) {
+ _debug("no trans");
+ ret = -EBUSY;
+ goto error;
+ }
+
+ conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO);
+ rxrpc_put_transport(trans);
+ if (IS_ERR(conn)) {
+ _debug("no conn");
+ ret = PTR_ERR(conn);
+ goto error;
+ }
+
+ call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO);
+ rxrpc_put_connection(conn);
+ if (IS_ERR(call)) {
+ _debug("no call");
+ ret = PTR_ERR(call);
+ goto error;
+ }
+
+ /* attach the call to the socket */
+ read_lock_bh(&local->services_lock);
+ if (rx->sk.sk_state == RXRPC_CLOSE)
+ goto invalid_service;
+
+ write_lock(&rx->call_lock);
+ if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) {
+ rxrpc_get_call(call);
+
+ spin_lock(&call->conn->state_lock);
+ if (sp->hdr.securityIndex > 0 &&
+ call->conn->state == RXRPC_CONN_SERVER_UNSECURED) {
+ _debug("await conn sec");
+ list_add_tail(&call->accept_link, &rx->secureq);
+ call->conn->state = RXRPC_CONN_SERVER_CHALLENGING;
+ atomic_inc(&call->conn->usage);
+ set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events);
+ rxrpc_queue_conn(call->conn);
+ } else {
+ _debug("conn ready");
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ list_add_tail(&call->accept_link, &rx->acceptq);
+ rxrpc_get_call(call);
+ nsp = rxrpc_skb(notification);
+ nsp->call = call;
+
+ ASSERTCMP(atomic_read(&call->usage), >=, 3);
+
+ _debug("notify");
+ spin_lock(&call->lock);
+ ret = rxrpc_queue_rcv_skb(call, notification, true,
+ false);
+ spin_unlock(&call->lock);
+ notification = NULL;
+ BUG_ON(ret < 0);
+ }
+ spin_unlock(&call->conn->state_lock);
+
+ _debug("queued");
+ }
+ write_unlock(&rx->call_lock);
+
+ _debug("process");
+ rxrpc_fast_process_packet(call, skb);
+
+ _debug("done");
+ read_unlock_bh(&local->services_lock);
+ rxrpc_free_skb(notification);
+ rxrpc_put_call(call);
+ _leave(" = 0");
+ return 0;
+
+invalid_service:
+ _debug("invalid");
+ read_unlock_bh(&local->services_lock);
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ rxrpc_get_call(call);
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+ rxrpc_put_call(call);
+ ret = -ECONNREFUSED;
+error:
+ rxrpc_free_skb(notification);
+error_nofree:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * accept incoming calls that need peer, transport and/or connection setting up
+ * - the packets we get are all incoming client DATA packets that have seq == 1
+ */
+void rxrpc_accept_incoming_calls(struct work_struct *work)
+{
+ struct rxrpc_local *local =
+ container_of(work, struct rxrpc_local, acceptor);
+ struct rxrpc_skb_priv *sp;
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_sock *rx;
+ struct sk_buff *skb;
+ __be16 service_id;
+ int ret;
+
+ _enter("%d", local->debug_id);
+
+ read_lock_bh(&rxrpc_local_lock);
+ if (atomic_read(&local->usage) > 0)
+ rxrpc_get_local(local);
+ else
+ local = NULL;
+ read_unlock_bh(&rxrpc_local_lock);
+ if (!local) {
+ _leave(" [local dead]");
+ return;
+ }
+
+process_next_packet:
+ skb = skb_dequeue(&local->accept_queue);
+ if (!skb) {
+ rxrpc_put_local(local);
+ _leave("\n");
+ return;
+ }
+
+ _net("incoming call skb %p", skb);
+
+ sp = rxrpc_skb(skb);
+
+ /* determine the remote address */
+ memset(&srx, 0, sizeof(srx));
+ srx.srx_family = AF_RXRPC;
+ srx.transport.family = local->srx.transport.family;
+ srx.transport_type = local->srx.transport_type;
+ switch (srx.transport.family) {
+ case AF_INET:
+ srx.transport_len = sizeof(struct sockaddr_in);
+ srx.transport.sin.sin_port = udp_hdr(skb)->source;
+ srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ break;
+ default:
+ goto busy;
+ }
+
+ /* get the socket providing the service */
+ service_id = sp->hdr.serviceId;
+ read_lock_bh(&local->services_lock);
+ list_for_each_entry(rx, &local->services, listen_link) {
+ if (rx->service_id == service_id &&
+ rx->sk.sk_state != RXRPC_CLOSE)
+ goto found_service;
+ }
+ read_unlock_bh(&local->services_lock);
+ goto invalid_service;
+
+found_service:
+ _debug("found service %hd", ntohs(rx->service_id));
+ if (sk_acceptq_is_full(&rx->sk))
+ goto backlog_full;
+ sk_acceptq_added(&rx->sk);
+ sock_hold(&rx->sk);
+ read_unlock_bh(&local->services_lock);
+
+ ret = rxrpc_accept_incoming_call(local, rx, skb, &srx);
+ if (ret < 0)
+ sk_acceptq_removed(&rx->sk);
+ sock_put(&rx->sk);
+ switch (ret) {
+ case -ECONNRESET: /* old calls are ignored */
+ case -ECONNABORTED: /* aborted calls are reaborted or ignored */
+ case 0:
+ goto process_next_packet;
+ case -ECONNREFUSED:
+ goto invalid_service;
+ case -EBUSY:
+ goto busy;
+ case -EKEYREJECTED:
+ goto security_mismatch;
+ default:
+ BUG();
+ }
+
+backlog_full:
+ read_unlock_bh(&local->services_lock);
+busy:
+ rxrpc_busy(local, &srx, &sp->hdr);
+ rxrpc_free_skb(skb);
+ goto process_next_packet;
+
+invalid_service:
+ skb->priority = RX_INVALID_OPERATION;
+ rxrpc_reject_packet(local, skb);
+ goto process_next_packet;
+
+ /* can't change connection security type mid-flow */
+security_mismatch:
+ skb->priority = RX_PROTOCOL_ERROR;
+ rxrpc_reject_packet(local, skb);
+ goto process_next_packet;
+}
+
+/*
+ * handle acceptance of a call by userspace
+ * - assign the user call ID to the call at the front of the queue
+ */
+struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+ struct rb_node *parent, **pp;
+ int ret;
+
+ _enter(",%lx", user_call_ID);
+
+ ASSERT(!irqs_disabled());
+
+ write_lock(&rx->call_lock);
+
+ ret = -ENODATA;
+ if (list_empty(&rx->acceptq))
+ goto out;
+
+ /* check the user ID isn't already in use */
+ ret = -EBADSLT;
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto out;
+ }
+
+ /* dequeue the first call and check it's still valid */
+ call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+ break;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ ret = -ECONNABORTED;
+ goto out_release;
+ case RXRPC_CALL_NETWORK_ERROR:
+ ret = call->conn->error;
+ goto out_release;
+ case RXRPC_CALL_DEAD:
+ ret = -ETIME;
+ goto out_discard;
+ default:
+ BUG();
+ }
+
+ /* formalise the acceptance */
+ call->user_call_ID = user_call_ID;
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
+ BUG();
+ if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events))
+ BUG();
+ rxrpc_queue_call(call);
+
+ rxrpc_get_call(call);
+ write_unlock_bh(&call->state_lock);
+ write_unlock(&rx->call_lock);
+ _leave(" = %p{%d}", call, call->debug_id);
+ return call;
+
+ /* if the call is already dying or dead, then we leave the socket's ref
+ * on it to be released by rxrpc_dead_call_expired() as induced by
+ * rxrpc_release_call() */
+out_release:
+ _debug("release %p", call);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+out_discard:
+ write_unlock_bh(&call->state_lock);
+ _debug("discard %p", call);
+out:
+ write_unlock(&rx->call_lock);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * handle rejectance of a call by userspace
+ * - reject the call at the front of the queue
+ */
+int rxrpc_reject_call(struct rxrpc_sock *rx)
+{
+ struct rxrpc_call *call;
+ int ret;
+
+ _enter("");
+
+ ASSERT(!irqs_disabled());
+
+ write_lock(&rx->call_lock);
+
+ ret = -ENODATA;
+ if (list_empty(&rx->acceptq))
+ goto out;
+
+ /* dequeue the first call and check it's still valid */
+ call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_SERVER_ACCEPTING:
+ call->state = RXRPC_CALL_SERVER_BUSY;
+ if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events))
+ rxrpc_queue_call(call);
+ ret = 0;
+ goto out_release;
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ ret = -ECONNABORTED;
+ goto out_release;
+ case RXRPC_CALL_NETWORK_ERROR:
+ ret = call->conn->error;
+ goto out_release;
+ case RXRPC_CALL_DEAD:
+ ret = -ETIME;
+ goto out_discard;
+ default:
+ BUG();
+ }
+
+ /* if the call is already dying or dead, then we leave the socket's ref
+ * on it to be released by rxrpc_dead_call_expired() as induced by
+ * rxrpc_release_call() */
+out_release:
+ _debug("release %p", call);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+out_discard:
+ write_unlock_bh(&call->state_lock);
+ _debug("discard %p", call);
+out:
+ write_unlock(&rx->call_lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call
+ * @sock: The socket on which the impending call is waiting
+ * @user_call_ID: The tag to attach to the call
+ *
+ * Allow a kernel service to accept an incoming call, assuming the incoming
+ * call is still valid.
+ */
+struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+
+ _enter(",%lx", user_call_ID);
+ call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID);
+ _leave(" = %p", call);
+ return call;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_accept_call);
+
+/**
+ * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
+ * @sock: The socket on which the impending call is waiting
+ *
+ * Allow a kernel service to reject an incoming call with a BUSY message,
+ * assuming the incoming call is still valid.
+ */
+int rxrpc_kernel_reject_call(struct socket *sock)
+{
+ int ret;
+
+ _enter("");
+ ret = rxrpc_reject_call(rxrpc_sk(sock->sk));
+ _leave(" = %d", ret);
+ return ret;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_reject_call);
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
new file mode 100644
index 000000000..e0547f521
--- /dev/null
+++ b/net/rxrpc/ar-ack.c
@@ -0,0 +1,1356 @@
+/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * How long to wait before scheduling ACK generation after seeing a
+ * packet with RXRPC_REQUEST_ACK set (in jiffies).
+ */
+unsigned rxrpc_requested_ack_delay = 1;
+
+/*
+ * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ *
+ * We use this when we've received new data packets. If those packets aren't
+ * all consumed within this time we will send a DELAY ACK if an ACK was not
+ * requested to let the sender know it doesn't need to resend.
+ */
+unsigned rxrpc_soft_ack_delay = 1 * HZ;
+
+/*
+ * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ *
+ * We use this when we've consumed some previously soft-ACK'd packets when
+ * further packets aren't immediately received to decide when to send an IDLE
+ * ACK let the other end know that it can free up its Tx buffer space.
+ */
+unsigned rxrpc_idle_ack_delay = 0.5 * HZ;
+
+/*
+ * Receive window size in packets. This indicates the maximum number of
+ * unconsumed received packets we're willing to retain in memory. Once this
+ * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
+ * packets.
+ */
+unsigned rxrpc_rx_window_size = 32;
+
+/*
+ * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
+ * made by gluing normal packets together that we're willing to handle.
+ */
+unsigned rxrpc_rx_mtu = 5692;
+
+/*
+ * The maximum number of fragments in a received jumbo packet that we tell the
+ * sender that we're willing to handle.
+ */
+unsigned rxrpc_rx_jumbo_max = 4;
+
+static const char *rxrpc_acks(u8 reason)
+{
+ static const char *const str[] = {
+ "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+ "IDL", "-?-"
+ };
+
+ if (reason >= ARRAY_SIZE(str))
+ reason = ARRAY_SIZE(str) - 1;
+ return str[reason];
+}
+
+static const s8 rxrpc_ack_priority[] = {
+ [0] = 0,
+ [RXRPC_ACK_DELAY] = 1,
+ [RXRPC_ACK_REQUESTED] = 2,
+ [RXRPC_ACK_IDLE] = 3,
+ [RXRPC_ACK_PING_RESPONSE] = 4,
+ [RXRPC_ACK_DUPLICATE] = 5,
+ [RXRPC_ACK_OUT_OF_SEQUENCE] = 6,
+ [RXRPC_ACK_EXCEEDS_WINDOW] = 7,
+ [RXRPC_ACK_NOSPACE] = 8,
+};
+
+/*
+ * propose an ACK be sent
+ */
+void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ __be32 serial, bool immediate)
+{
+ unsigned long expiry;
+ s8 prior = rxrpc_ack_priority[ack_reason];
+
+ ASSERTCMP(prior, >, 0);
+
+ _enter("{%d},%s,%%%x,%u",
+ call->debug_id, rxrpc_acks(ack_reason), ntohl(serial),
+ immediate);
+
+ if (prior < rxrpc_ack_priority[call->ackr_reason]) {
+ if (immediate)
+ goto cancel_timer;
+ return;
+ }
+
+ /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
+ * numbers */
+ if (prior == rxrpc_ack_priority[call->ackr_reason]) {
+ if (prior <= 4)
+ call->ackr_serial = serial;
+ if (immediate)
+ goto cancel_timer;
+ return;
+ }
+
+ call->ackr_reason = ack_reason;
+ call->ackr_serial = serial;
+
+ switch (ack_reason) {
+ case RXRPC_ACK_DELAY:
+ _debug("run delay timer");
+ expiry = rxrpc_soft_ack_delay;
+ goto run_timer;
+
+ case RXRPC_ACK_IDLE:
+ if (!immediate) {
+ _debug("run defer timer");
+ expiry = rxrpc_idle_ack_delay;
+ goto run_timer;
+ }
+ goto cancel_timer;
+
+ case RXRPC_ACK_REQUESTED:
+ expiry = rxrpc_requested_ack_delay;
+ if (!expiry)
+ goto cancel_timer;
+ if (!immediate || serial == cpu_to_be32(1)) {
+ _debug("run defer timer");
+ goto run_timer;
+ }
+
+ default:
+ _debug("immediate ACK");
+ goto cancel_timer;
+ }
+
+run_timer:
+ expiry += jiffies;
+ if (!timer_pending(&call->ack_timer) ||
+ time_after(call->ack_timer.expires, expiry))
+ mod_timer(&call->ack_timer, expiry);
+ return;
+
+cancel_timer:
+ _debug("cancel timer %%%u", ntohl(serial));
+ try_to_del_timer_sync(&call->ack_timer);
+ read_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * propose an ACK be sent, locking the call structure
+ */
+void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+ __be32 serial, bool immediate)
+{
+ s8 prior = rxrpc_ack_priority[ack_reason];
+
+ if (prior > rxrpc_ack_priority[call->ackr_reason]) {
+ spin_lock_bh(&call->lock);
+ __rxrpc_propose_ACK(call, ack_reason, serial, immediate);
+ spin_unlock_bh(&call->lock);
+ }
+}
+
+/*
+ * set the resend timer
+ */
+static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
+ unsigned long resend_at)
+{
+ read_lock_bh(&call->state_lock);
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ resend = 0;
+
+ if (resend & 1) {
+ _debug("SET RESEND");
+ set_bit(RXRPC_CALL_RESEND, &call->events);
+ }
+
+ if (resend & 2) {
+ _debug("MODIFY RESEND TIMER");
+ set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ mod_timer(&call->resend_timer, resend_at);
+ } else {
+ _debug("KILL RESEND TIMER");
+ del_timer_sync(&call->resend_timer);
+ clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * resend packets
+ */
+static void rxrpc_resend(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_header *hdr;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ bool stop;
+ int loop;
+ u8 resend;
+
+ _enter("{%d,%d,%d,%d},",
+ call->acks_hard, call->acks_unacked,
+ atomic_read(&call->sequence),
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+ stop = false;
+ resend = 0;
+ resend_at = 0;
+
+ for (loop = call->acks_tail;
+ loop != call->acks_head || stop;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ if (*p_txb & 1)
+ continue;
+
+ txb = (struct sk_buff *) *p_txb;
+ sp = rxrpc_skb(txb);
+
+ if (sp->need_resend) {
+ sp->need_resend = false;
+
+ /* each Tx packet has a new serial number */
+ sp->hdr.serial =
+ htonl(atomic_inc_return(&call->conn->serial));
+
+ hdr = (struct rxrpc_header *) txb->head;
+ hdr->serial = sp->hdr.serial;
+
+ _proto("Tx DATA %%%u { #%d }",
+ ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+ if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
+ stop = true;
+ sp->resend_at = jiffies + 3;
+ } else {
+ sp->resend_at =
+ jiffies + rxrpc_resend_timeout;
+ }
+ }
+
+ if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave("");
+}
+
+/*
+ * handle resend timer expiry
+ */
+static void rxrpc_resend_timer(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ int loop;
+ u8 resend;
+
+ _enter("%d,%d,%d",
+ call->acks_tail, call->acks_unacked, call->acks_head);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ resend = 0;
+ resend_at = 0;
+
+ for (loop = call->acks_unacked;
+ loop != call->acks_head;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ ASSERT(!(*p_txb & 1));
+
+ if (sp->need_resend) {
+ ;
+ } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave("");
+}
+
+/*
+ * process soft ACKs of our transmitted packets
+ * - these indicate packets the peer has or has not received, but hasn't yet
+ * given to the consumer, and so can still be discarded and re-requested
+ */
+static int rxrpc_process_soft_ACKs(struct rxrpc_call *call,
+ struct rxrpc_ackpacket *ack,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *txb;
+ unsigned long *p_txb, resend_at;
+ int loop;
+ u8 sacks[RXRPC_MAXACKS], resend;
+
+ _enter("{%d,%d},{%d},",
+ call->acks_hard,
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
+ ack->nAcks);
+
+ if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
+ goto protocol_error;
+
+ resend = 0;
+ resend_at = 0;
+ for (loop = 0; loop < ack->nAcks; loop++) {
+ p_txb = call->acks_window;
+ p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ switch (sacks[loop]) {
+ case RXRPC_ACK_TYPE_ACK:
+ sp->need_resend = false;
+ *p_txb |= 1;
+ break;
+ case RXRPC_ACK_TYPE_NACK:
+ sp->need_resend = true;
+ *p_txb &= ~1;
+ resend = 1;
+ break;
+ default:
+ _debug("Unsupported ACK type %d", sacks[loop]);
+ goto protocol_error;
+ }
+ }
+
+ smp_mb();
+ call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
+
+ /* anything not explicitly ACK'd is implicitly NACK'd, but may just not
+ * have been received or processed yet by the far end */
+ for (loop = call->acks_unacked;
+ loop != call->acks_head;
+ loop = (loop + 1) & (call->acks_winsz - 1)
+ ) {
+ p_txb = call->acks_window + loop;
+ smp_read_barrier_depends();
+ txb = (struct sk_buff *) (*p_txb & ~1);
+ sp = rxrpc_skb(txb);
+
+ if (*p_txb & 1) {
+ /* packet must have been discarded */
+ sp->need_resend = true;
+ *p_txb &= ~1;
+ resend |= 1;
+ } else if (sp->need_resend) {
+ ;
+ } else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+ sp->need_resend = true;
+ resend |= 1;
+ } else if (resend & 2) {
+ if (time_before(sp->resend_at, resend_at))
+ resend_at = sp->resend_at;
+ } else {
+ resend_at = sp->resend_at;
+ resend |= 2;
+ }
+ }
+
+ rxrpc_set_resend(call, resend, resend_at);
+ _leave(" = 0");
+ return 0;
+
+protocol_error:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+}
+
+/*
+ * discard hard-ACK'd packets from the Tx window
+ */
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
+{
+ unsigned long _skb;
+ int tail = call->acks_tail, old_tail;
+ int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
+
+ _enter("{%u,%u},%u", call->acks_hard, win, hard);
+
+ ASSERTCMP(hard - call->acks_hard, <=, win);
+
+ while (call->acks_hard < hard) {
+ smp_read_barrier_depends();
+ _skb = call->acks_window[tail] & ~1;
+ rxrpc_free_skb((struct sk_buff *) _skb);
+ old_tail = tail;
+ tail = (tail + 1) & (call->acks_winsz - 1);
+ call->acks_tail = tail;
+ if (call->acks_unacked == old_tail)
+ call->acks_unacked = tail;
+ call->acks_hard++;
+ }
+
+ wake_up(&call->tx_waitq);
+}
+
+/*
+ * clear the Tx window in the event of a failure
+ */
+static void rxrpc_clear_tx_window(struct rxrpc_call *call)
+{
+ rxrpc_rotate_tx_window(call, atomic_read(&call->sequence));
+}
+
+/*
+ * drain the out of sequence received packet queue into the packet Rx queue
+ */
+static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ bool terminal;
+ int ret;
+
+ _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
+
+ spin_lock_bh(&call->lock);
+
+ ret = -ECONNRESET;
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
+ goto socket_unavailable;
+
+ skb = skb_dequeue(&call->rx_oos_queue);
+ if (skb) {
+ sp = rxrpc_skb(skb);
+
+ _debug("drain OOS packet %d [%d]",
+ ntohl(sp->hdr.seq), call->rx_first_oos);
+
+ if (ntohl(sp->hdr.seq) != call->rx_first_oos) {
+ skb_queue_head(&call->rx_oos_queue, skb);
+ call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq);
+ _debug("requeue %p {%u}", skb, call->rx_first_oos);
+ } else {
+ skb->mark = RXRPC_SKB_MARK_DATA;
+ terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
+ !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
+ ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
+ BUG_ON(ret < 0);
+ _debug("drain #%u", call->rx_data_post);
+ call->rx_data_post++;
+
+ /* find out what the next packet is */
+ skb = skb_peek(&call->rx_oos_queue);
+ if (skb)
+ call->rx_first_oos =
+ ntohl(rxrpc_skb(skb)->hdr.seq);
+ else
+ call->rx_first_oos = 0;
+ _debug("peek %p {%u}", skb, call->rx_first_oos);
+ }
+ }
+
+ ret = 0;
+socket_unavailable:
+ spin_unlock_bh(&call->lock);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * insert an out of sequence packet into the buffer
+ */
+static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp, *psp;
+ struct sk_buff *p;
+ u32 seq;
+
+ sp = rxrpc_skb(skb);
+ seq = ntohl(sp->hdr.seq);
+ _enter(",,{%u}", seq);
+
+ skb->destructor = rxrpc_packet_destructor;
+ ASSERTCMP(sp->call, ==, NULL);
+ sp->call = call;
+ rxrpc_get_call(call);
+
+ /* insert into the buffer in sequence order */
+ spin_lock_bh(&call->lock);
+
+ skb_queue_walk(&call->rx_oos_queue, p) {
+ psp = rxrpc_skb(p);
+ if (ntohl(psp->hdr.seq) > seq) {
+ _debug("insert oos #%u before #%u",
+ seq, ntohl(psp->hdr.seq));
+ skb_insert(p, skb, &call->rx_oos_queue);
+ goto inserted;
+ }
+ }
+
+ _debug("append oos #%u", seq);
+ skb_queue_tail(&call->rx_oos_queue, skb);
+inserted:
+
+ /* we might now have a new front to the queue */
+ if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
+ call->rx_first_oos = seq;
+
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ call->rx_data_post == call->rx_first_oos) {
+ _debug("drain rx oos now");
+ set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+ }
+ read_unlock(&call->state_lock);
+
+ spin_unlock_bh(&call->lock);
+ _leave(" [stored #%u]", call->rx_first_oos);
+}
+
+/*
+ * clear the Tx window on final ACK reception
+ */
+static void rxrpc_zap_tx_window(struct rxrpc_call *call)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ unsigned long _skb, *acks_window;
+ u8 winsz = call->acks_winsz;
+ int tail;
+
+ acks_window = call->acks_window;
+ call->acks_window = NULL;
+
+ while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
+ tail = call->acks_tail;
+ smp_read_barrier_depends();
+ _skb = acks_window[tail] & ~1;
+ smp_mb();
+ call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
+
+ skb = (struct sk_buff *) _skb;
+ sp = rxrpc_skb(skb);
+ _debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+ rxrpc_free_skb(skb);
+ }
+
+ kfree(acks_window);
+}
+
+/*
+ * process the extra information that may be appended to an ACK packet
+ */
+static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
+ unsigned int latest, int nAcks)
+{
+ struct rxrpc_ackinfo ackinfo;
+ struct rxrpc_peer *peer;
+ unsigned int mtu;
+
+ if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) {
+ _leave(" [no ackinfo]");
+ return;
+ }
+
+ _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
+ latest,
+ ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU),
+ ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max));
+
+ mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU));
+
+ peer = call->conn->trans->peer;
+ if (mtu < peer->maxdata) {
+ spin_lock_bh(&peer->lock);
+ peer->maxdata = mtu;
+ peer->mtu = mtu + peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
+ }
+}
+
+/*
+ * process packets in the reception queue
+ */
+static int rxrpc_process_rx_queue(struct rxrpc_call *call,
+ u32 *_abort_code)
+{
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ bool post_ACK;
+ int latest;
+ u32 hard, tx;
+
+ _enter("");
+
+process_further:
+ skb = skb_dequeue(&call->rx_queue);
+ if (!skb)
+ return -EAGAIN;
+
+ _net("deferred skb %p", skb);
+
+ sp = rxrpc_skb(skb);
+
+ _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
+
+ post_ACK = false;
+
+ switch (sp->hdr.type) {
+ /* data packets that wind up here have been received out of
+ * order, need security processing or are jumbo packets */
+ case RXRPC_PACKET_TYPE_DATA:
+ _proto("OOSQ DATA %%%u { #%u }",
+ ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+
+ /* secured packets must be verified and possibly decrypted */
+ if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
+ goto protocol_error;
+
+ rxrpc_insert_oos_packet(call, skb);
+ goto process_further;
+
+ /* partial ACK to process */
+ case RXRPC_PACKET_TYPE_ACK:
+ if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
+ _debug("extraction failure");
+ goto protocol_error;
+ }
+ if (!skb_pull(skb, sizeof(ack)))
+ BUG();
+
+ latest = ntohl(sp->hdr.serial);
+ hard = ntohl(ack.firstPacket);
+ tx = atomic_read(&call->sequence);
+
+ _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ latest,
+ ntohs(ack.maxSkew),
+ hard,
+ ntohl(ack.previousPacket),
+ ntohl(ack.serial),
+ rxrpc_acks(ack.reason),
+ ack.nAcks);
+
+ rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
+
+ if (ack.reason == RXRPC_ACK_PING) {
+ _proto("Rx ACK %%%u PING Request", latest);
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
+ sp->hdr.serial, true);
+ }
+
+ /* discard any out-of-order or duplicate ACKs */
+ if (latest - call->acks_latest <= 0) {
+ _debug("discard ACK %d <= %d",
+ latest, call->acks_latest);
+ goto discard;
+ }
+ call->acks_latest = latest;
+
+ if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
+ call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
+ goto discard;
+
+ _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
+
+ if (hard > 0) {
+ if (hard - 1 > tx) {
+ _debug("hard-ACK'd packet %d not transmitted"
+ " (%d top)",
+ hard - 1, tx);
+ goto protocol_error;
+ }
+
+ if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
+ call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
+ hard > tx)
+ goto all_acked;
+
+ smp_rmb();
+ rxrpc_rotate_tx_window(call, hard - 1);
+ }
+
+ if (ack.nAcks > 0) {
+ if (hard - 1 + ack.nAcks > tx) {
+ _debug("soft-ACK'd packet %d+%d not"
+ " transmitted (%d top)",
+ hard - 1, ack.nAcks, tx);
+ goto protocol_error;
+ }
+
+ if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0)
+ goto protocol_error;
+ }
+ goto discard;
+
+ /* complete ACK to process */
+ case RXRPC_PACKET_TYPE_ACKALL:
+ goto all_acked;
+
+ /* abort and busy are handled elsewhere */
+ case RXRPC_PACKET_TYPE_BUSY:
+ case RXRPC_PACKET_TYPE_ABORT:
+ BUG();
+
+ /* connection level events - also handled elsewhere */
+ case RXRPC_PACKET_TYPE_CHALLENGE:
+ case RXRPC_PACKET_TYPE_RESPONSE:
+ case RXRPC_PACKET_TYPE_DEBUG:
+ BUG();
+ }
+
+ /* if we've had a hard ACK that covers all the packets we've sent, then
+ * that ends that phase of the operation */
+all_acked:
+ write_lock_bh(&call->state_lock);
+ _debug("ack all %d", call->state);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ break;
+ case RXRPC_CALL_SERVER_AWAIT_ACK:
+ _debug("srv complete");
+ call->state = RXRPC_CALL_COMPLETE;
+ post_ACK = true;
+ break;
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ goto protocol_error_unlock; /* can't occur yet */
+ default:
+ write_unlock_bh(&call->state_lock);
+ goto discard; /* assume packet left over from earlier phase */
+ }
+
+ write_unlock_bh(&call->state_lock);
+
+ /* if all the packets we sent are hard-ACK'd, then we can discard
+ * whatever we've got left */
+ _debug("clear Tx %d",
+ CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+ del_timer_sync(&call->resend_timer);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+
+ if (call->acks_window)
+ rxrpc_zap_tx_window(call);
+
+ if (post_ACK) {
+ /* post the final ACK message for userspace to pick up */
+ _debug("post ACK");
+ skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
+ sp->call = call;
+ rxrpc_get_call(call);
+ spin_lock_bh(&call->lock);
+ if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
+ BUG();
+ spin_unlock_bh(&call->lock);
+ goto process_further;
+ }
+
+discard:
+ rxrpc_free_skb(skb);
+ goto process_further;
+
+protocol_error_unlock:
+ write_unlock_bh(&call->state_lock);
+protocol_error:
+ rxrpc_free_skb(skb);
+ _leave(" = -EPROTO");
+ return -EPROTO;
+}
+
+/*
+ * post a message to the socket Rx queue for recvmsg() to pick up
+ */
+static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
+ bool fatal)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ int ret;
+
+ _enter("{%d,%lx},%u,%u,%d",
+ call->debug_id, call->flags, mark, error, fatal);
+
+ /* remove timers and things for fatal messages */
+ if (fatal) {
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+
+ if (mark != RXRPC_SKB_MARK_NEW_CALL &&
+ !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ _leave("[no userid]");
+ return 0;
+ }
+
+ if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
+ skb = alloc_skb(0, GFP_NOFS);
+ if (!skb)
+ return -ENOMEM;
+
+ rxrpc_new_skb(skb);
+
+ skb->mark = mark;
+
+ sp = rxrpc_skb(skb);
+ memset(sp, 0, sizeof(*sp));
+ sp->error = error;
+ sp->call = call;
+ rxrpc_get_call(call);
+
+ spin_lock_bh(&call->lock);
+ ret = rxrpc_queue_rcv_skb(call, skb, true, fatal);
+ spin_unlock_bh(&call->lock);
+ BUG_ON(ret < 0);
+ }
+
+ return 0;
+}
+
+/*
+ * handle background processing of incoming call packets and ACK / abort
+ * generation
+ */
+void rxrpc_process_call(struct work_struct *work)
+{
+ struct rxrpc_call *call =
+ container_of(work, struct rxrpc_call, processor);
+ struct rxrpc_ackpacket ack;
+ struct rxrpc_ackinfo ackinfo;
+ struct rxrpc_header hdr;
+ struct msghdr msg;
+ struct kvec iov[5];
+ unsigned long bits;
+ __be32 data, pad;
+ size_t len;
+ int genbit, loop, nbit, ioc, ret, mtu;
+ u32 abort_code = RX_PROTOCOL_ERROR;
+ u8 *acks = NULL;
+
+ //printk("\n--------------------\n");
+ _enter("{%d,%s,%lx} [%lu]",
+ call->debug_id, rxrpc_call_states[call->state], call->events,
+ (jiffies - call->creation_jif) / (HZ / 10));
+
+ if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) {
+ _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX");
+ return;
+ }
+
+ /* there's a good chance we're going to have to send a message, so set
+ * one up in advance */
+ msg.msg_name = &call->conn->trans->peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(call->conn->trans->peer->srx.transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr.epoch = call->conn->epoch;
+ hdr.cid = call->cid;
+ hdr.callNumber = call->call_id;
+ hdr.seq = 0;
+ hdr.type = RXRPC_PACKET_TYPE_ACK;
+ hdr.flags = call->conn->out_clientflag;
+ hdr.userStatus = 0;
+ hdr.securityIndex = call->conn->security_ix;
+ hdr._rsvd = 0;
+ hdr.serviceId = call->conn->service_id;
+
+ memset(iov, 0, sizeof(iov));
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+
+ /* deal with events of a final nature */
+ if (test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ rxrpc_release_call(call);
+ clear_bit(RXRPC_CALL_RELEASE, &call->events);
+ }
+
+ if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) {
+ int error;
+
+ clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_ABORT, &call->events);
+
+ error = call->conn->trans->peer->net_error;
+ _debug("post net error %d", error);
+
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR,
+ error, true) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) {
+ ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+ clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_ABORT, &call->events);
+
+ _debug("post conn abort");
+
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ call->conn->error, true) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) {
+ hdr.type = RXRPC_PACKET_TYPE_BUSY;
+ genbit = RXRPC_CALL_REJECT_BUSY;
+ goto send_message;
+ }
+
+ if (test_bit(RXRPC_CALL_ABORT, &call->events)) {
+ ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ ECONNABORTED, true) < 0)
+ goto no_mem;
+ hdr.type = RXRPC_PACKET_TYPE_ABORT;
+ data = htonl(call->abort_code);
+ iov[1].iov_base = &data;
+ iov[1].iov_len = sizeof(data);
+ genbit = RXRPC_CALL_ABORT;
+ goto send_message;
+ }
+
+ if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) {
+ genbit = RXRPC_CALL_ACK_FINAL;
+
+ ack.bufferSpace = htons(8);
+ ack.maxSkew = 0;
+ ack.serial = 0;
+ ack.reason = RXRPC_ACK_IDLE;
+ ack.nAcks = 0;
+ call->ackr_reason = 0;
+
+ spin_lock_bh(&call->lock);
+ ack.serial = call->ackr_serial;
+ ack.previousPacket = call->ackr_prev_seq;
+ ack.firstPacket = htonl(call->rx_data_eaten + 1);
+ spin_unlock_bh(&call->lock);
+
+ pad = 0;
+
+ iov[1].iov_base = &ack;
+ iov[1].iov_len = sizeof(ack);
+ iov[2].iov_base = &pad;
+ iov[2].iov_len = 3;
+ iov[3].iov_base = &ackinfo;
+ iov[3].iov_len = sizeof(ackinfo);
+ goto send_ACK;
+ }
+
+ if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) |
+ (1 << RXRPC_CALL_RCVD_ABORT))
+ ) {
+ u32 mark;
+
+ if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events))
+ mark = RXRPC_SKB_MARK_REMOTE_ABORT;
+ else
+ mark = RXRPC_SKB_MARK_BUSY;
+
+ _debug("post abort/busy");
+ rxrpc_clear_tx_window(call);
+ if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
+ goto no_mem;
+
+ clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ goto kill_ACKs;
+ }
+
+ if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) {
+ _debug("do implicit ackall");
+ rxrpc_clear_tx_window(call);
+ }
+
+ if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) {
+ write_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = RX_CALL_TIMEOUT;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ }
+ write_unlock_bh(&call->state_lock);
+
+ _debug("post timeout");
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+ ETIME, true) < 0)
+ goto no_mem;
+
+ clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+ goto kill_ACKs;
+ }
+
+ /* deal with assorted inbound messages */
+ if (!skb_queue_empty(&call->rx_queue)) {
+ switch (rxrpc_process_rx_queue(call, &abort_code)) {
+ case 0:
+ case -EAGAIN:
+ break;
+ case -ENOMEM:
+ goto no_mem;
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EPROTO:
+ rxrpc_abort_call(call, abort_code);
+ goto kill_ACKs;
+ }
+ }
+
+ /* handle resending */
+ if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ rxrpc_resend_timer(call);
+ if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events))
+ rxrpc_resend(call);
+
+ /* consider sending an ordinary ACK */
+ if (test_bit(RXRPC_CALL_ACK, &call->events)) {
+ _debug("send ACK: window: %d - %d { %lx }",
+ call->rx_data_eaten, call->ackr_win_top,
+ call->ackr_window[0]);
+
+ if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
+ /* ACK by sending reply DATA packet in this state */
+ clear_bit(RXRPC_CALL_ACK, &call->events);
+ goto maybe_reschedule;
+ }
+
+ genbit = RXRPC_CALL_ACK;
+
+ acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
+ GFP_NOFS);
+ if (!acks)
+ goto no_mem;
+
+ //hdr.flags = RXRPC_SLOW_START_OK;
+ ack.bufferSpace = htons(8);
+ ack.maxSkew = 0;
+ ack.serial = 0;
+ ack.reason = 0;
+
+ spin_lock_bh(&call->lock);
+ ack.reason = call->ackr_reason;
+ ack.serial = call->ackr_serial;
+ ack.previousPacket = call->ackr_prev_seq;
+ ack.firstPacket = htonl(call->rx_data_eaten + 1);
+
+ ack.nAcks = 0;
+ for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+ nbit = loop * BITS_PER_LONG;
+ for (bits = call->ackr_window[loop]; bits; bits >>= 1
+ ) {
+ _debug("- l=%d n=%d b=%lx", loop, nbit, bits);
+ if (bits & 1) {
+ acks[nbit] = RXRPC_ACK_TYPE_ACK;
+ ack.nAcks = nbit + 1;
+ }
+ nbit++;
+ }
+ }
+ call->ackr_reason = 0;
+ spin_unlock_bh(&call->lock);
+
+ pad = 0;
+
+ iov[1].iov_base = &ack;
+ iov[1].iov_len = sizeof(ack);
+ iov[2].iov_base = acks;
+ iov[2].iov_len = ack.nAcks;
+ iov[3].iov_base = &pad;
+ iov[3].iov_len = 3;
+ iov[4].iov_base = &ackinfo;
+ iov[4].iov_len = sizeof(ackinfo);
+
+ switch (ack.reason) {
+ case RXRPC_ACK_REQUESTED:
+ case RXRPC_ACK_DUPLICATE:
+ case RXRPC_ACK_OUT_OF_SEQUENCE:
+ case RXRPC_ACK_EXCEEDS_WINDOW:
+ case RXRPC_ACK_NOSPACE:
+ case RXRPC_ACK_PING:
+ case RXRPC_ACK_PING_RESPONSE:
+ goto send_ACK_with_skew;
+ case RXRPC_ACK_DELAY:
+ case RXRPC_ACK_IDLE:
+ goto send_ACK;
+ }
+ }
+
+ /* handle completion of security negotiations on an incoming
+ * connection */
+ if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) {
+ _debug("secured");
+ spin_lock_bh(&call->lock);
+
+ if (call->state == RXRPC_CALL_SERVER_SECURING) {
+ _debug("securing");
+ write_lock(&call->conn->lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ _debug("not released");
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ list_move_tail(&call->accept_link,
+ &call->socket->acceptq);
+ }
+ write_unlock(&call->conn->lock);
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE)
+ set_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+ read_unlock(&call->state_lock);
+ }
+
+ spin_unlock_bh(&call->lock);
+ if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events))
+ goto maybe_reschedule;
+ }
+
+ /* post a notification of an acceptable connection to the app */
+ if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) {
+ _debug("post accept");
+ if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
+ 0, false) < 0)
+ goto no_mem;
+ clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+ goto maybe_reschedule;
+ }
+
+ /* handle incoming call acceptance */
+ if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) {
+ _debug("accepted");
+ ASSERTCMP(call->rx_data_post, ==, 0);
+ call->rx_data_post = 1;
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE)
+ set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+ read_unlock_bh(&call->state_lock);
+ }
+
+ /* drain the out of sequence received packet queue into the packet Rx
+ * queue */
+ if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) {
+ while (call->rx_data_post == call->rx_first_oos)
+ if (rxrpc_drain_rx_oos_queue(call) < 0)
+ break;
+ goto maybe_reschedule;
+ }
+
+ /* other events may have been raised since we started checking */
+ goto maybe_reschedule;
+
+send_ACK_with_skew:
+ ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
+ ntohl(ack.serial));
+send_ACK:
+ mtu = call->conn->trans->peer->if_mtu;
+ mtu -= call->conn->trans->peer->hdrsize;
+ ackinfo.maxMTU = htonl(mtu);
+ ackinfo.rwind = htonl(rxrpc_rx_window_size);
+
+ /* permit the peer to send us jumbo packets if it wants to */
+ ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
+ ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+
+ hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+ _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+ ntohl(hdr.serial),
+ ntohs(ack.maxSkew),
+ ntohl(ack.firstPacket),
+ ntohl(ack.previousPacket),
+ ntohl(ack.serial),
+ rxrpc_acks(ack.reason),
+ ack.nAcks);
+
+ del_timer_sync(&call->ack_timer);
+ if (ack.nAcks > 0)
+ set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
+ goto send_message_2;
+
+send_message:
+ _debug("send message");
+
+ hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+ _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial));
+send_message_2:
+
+ len = iov[0].iov_len;
+ ioc = 1;
+ if (iov[4].iov_len) {
+ ioc = 5;
+ len += iov[4].iov_len;
+ len += iov[3].iov_len;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[3].iov_len) {
+ ioc = 4;
+ len += iov[3].iov_len;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[2].iov_len) {
+ ioc = 3;
+ len += iov[2].iov_len;
+ len += iov[1].iov_len;
+ } else if (iov[1].iov_len) {
+ ioc = 2;
+ len += iov[1].iov_len;
+ }
+
+ ret = kernel_sendmsg(call->conn->trans->local->socket,
+ &msg, iov, ioc, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ goto error;
+ }
+
+ switch (genbit) {
+ case RXRPC_CALL_ABORT:
+ clear_bit(genbit, &call->events);
+ clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ goto kill_ACKs;
+
+ case RXRPC_CALL_ACK_FINAL:
+ write_lock_bh(&call->state_lock);
+ if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
+ call->state = RXRPC_CALL_COMPLETE;
+ write_unlock_bh(&call->state_lock);
+ goto kill_ACKs;
+
+ default:
+ clear_bit(genbit, &call->events);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ _debug("start ACK timer");
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
+ call->ackr_serial, false);
+ default:
+ break;
+ }
+ goto maybe_reschedule;
+ }
+
+kill_ACKs:
+ del_timer_sync(&call->ack_timer);
+ if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events))
+ rxrpc_put_call(call);
+ clear_bit(RXRPC_CALL_ACK, &call->events);
+
+maybe_reschedule:
+ if (call->events || !skb_queue_empty(&call->rx_queue)) {
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ }
+
+ /* don't leave aborted connections on the accept queue */
+ if (call->state >= RXRPC_CALL_COMPLETE &&
+ !list_empty(&call->accept_link)) {
+ _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
+ call, call->events, call->flags,
+ ntohl(call->conn->cid));
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+ }
+
+error:
+ clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags);
+ kfree(acks);
+
+ /* because we don't want two CPUs both processing the work item for one
+ * call at the same time, we use a flag to note when it's busy; however
+ * this means there's a race between clearing the flag and setting the
+ * work pending bit and the work item being processed again */
+ if (call->events && !work_pending(&call->processor)) {
+ _debug("jumpstart %x", ntohl(call->conn->cid));
+ rxrpc_queue_call(call);
+ }
+
+ _leave("");
+ return;
+
+no_mem:
+ _debug("out of memory");
+ goto maybe_reschedule;
+}
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
new file mode 100644
index 000000000..a9e05db0f
--- /dev/null
+++ b/net/rxrpc/ar-call.c
@@ -0,0 +1,1019 @@
+/* RxRPC individual remote procedure call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <linux/hashtable.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Maximum lifetime of a call (in jiffies).
+ */
+unsigned rxrpc_max_call_lifetime = 60 * HZ;
+
+/*
+ * Time till dead call expires after last use (in jiffies).
+ */
+unsigned rxrpc_dead_call_expiry = 2 * HZ;
+
+const char *const rxrpc_call_states[] = {
+ [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
+ [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
+ [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
+ [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK",
+ [RXRPC_CALL_SERVER_SECURING] = "SvSecure",
+ [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept",
+ [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq",
+ [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq",
+ [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl",
+ [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK",
+ [RXRPC_CALL_COMPLETE] = "Complete",
+ [RXRPC_CALL_SERVER_BUSY] = "SvBusy ",
+ [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort",
+ [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort",
+ [RXRPC_CALL_NETWORK_ERROR] = "NetError",
+ [RXRPC_CALL_DEAD] = "Dead ",
+};
+
+struct kmem_cache *rxrpc_call_jar;
+LIST_HEAD(rxrpc_calls);
+DEFINE_RWLOCK(rxrpc_call_lock);
+
+static void rxrpc_destroy_call(struct work_struct *work);
+static void rxrpc_call_life_expired(unsigned long _call);
+static void rxrpc_dead_call_expired(unsigned long _call);
+static void rxrpc_ack_time_expired(unsigned long _call);
+static void rxrpc_resend_time_expired(unsigned long _call);
+
+static DEFINE_SPINLOCK(rxrpc_call_hash_lock);
+static DEFINE_HASHTABLE(rxrpc_call_hash, 10);
+
+/*
+ * Hash function for rxrpc_call_hash
+ */
+static unsigned long rxrpc_call_hashfunc(
+ u8 clientflag,
+ __be32 cid,
+ __be32 call_id,
+ __be32 epoch,
+ __be16 service_id,
+ sa_family_t proto,
+ void *localptr,
+ unsigned int addr_size,
+ const u8 *peer_addr)
+{
+ const u16 *p;
+ unsigned int i;
+ unsigned long key;
+ u32 hcid = ntohl(cid);
+
+ _enter("");
+
+ key = (unsigned long)localptr;
+ /* We just want to add up the __be32 values, so forcing the
+ * cast should be okay.
+ */
+ key += (__force u32)epoch;
+ key += (__force u16)service_id;
+ key += (__force u32)call_id;
+ key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
+ key += hcid & RXRPC_CHANNELMASK;
+ key += clientflag;
+ key += proto;
+ /* Step through the peer address in 16-bit portions for speed */
+ for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++)
+ key += *p;
+ _leave(" key = 0x%lx", key);
+ return key;
+}
+
+/*
+ * Add a call to the hashtable
+ */
+static void rxrpc_call_hash_add(struct rxrpc_call *call)
+{
+ unsigned long key;
+ unsigned int addr_size = 0;
+
+ _enter("");
+ switch (call->proto) {
+ case AF_INET:
+ addr_size = sizeof(call->peer_ip.ipv4_addr);
+ break;
+ case AF_INET6:
+ addr_size = sizeof(call->peer_ip.ipv6_addr);
+ break;
+ default:
+ break;
+ }
+ key = rxrpc_call_hashfunc(call->in_clientflag, call->cid,
+ call->call_id, call->epoch,
+ call->service_id, call->proto,
+ call->conn->trans->local, addr_size,
+ call->peer_ip.ipv6_addr);
+ /* Store the full key in the call */
+ call->hash_key = key;
+ spin_lock(&rxrpc_call_hash_lock);
+ hash_add_rcu(rxrpc_call_hash, &call->hash_node, key);
+ spin_unlock(&rxrpc_call_hash_lock);
+ _leave("");
+}
+
+/*
+ * Remove a call from the hashtable
+ */
+static void rxrpc_call_hash_del(struct rxrpc_call *call)
+{
+ _enter("");
+ spin_lock(&rxrpc_call_hash_lock);
+ hash_del_rcu(&call->hash_node);
+ spin_unlock(&rxrpc_call_hash_lock);
+ _leave("");
+}
+
+/*
+ * Find a call in the hashtable and return it, or NULL if it
+ * isn't there.
+ */
+struct rxrpc_call *rxrpc_find_call_hash(
+ u8 clientflag,
+ __be32 cid,
+ __be32 call_id,
+ __be32 epoch,
+ __be16 service_id,
+ void *localptr,
+ sa_family_t proto,
+ const u8 *peer_addr)
+{
+ unsigned long key;
+ unsigned int addr_size = 0;
+ struct rxrpc_call *call = NULL;
+ struct rxrpc_call *ret = NULL;
+
+ _enter("");
+ switch (proto) {
+ case AF_INET:
+ addr_size = sizeof(call->peer_ip.ipv4_addr);
+ break;
+ case AF_INET6:
+ addr_size = sizeof(call->peer_ip.ipv6_addr);
+ break;
+ default:
+ break;
+ }
+
+ key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch,
+ service_id, proto, localptr, addr_size,
+ peer_addr);
+ hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) {
+ if (call->hash_key == key &&
+ call->call_id == call_id &&
+ call->cid == cid &&
+ call->in_clientflag == clientflag &&
+ call->service_id == service_id &&
+ call->proto == proto &&
+ call->local == localptr &&
+ memcmp(call->peer_ip.ipv6_addr, peer_addr,
+ addr_size) == 0 &&
+ call->epoch == epoch) {
+ ret = call;
+ break;
+ }
+ }
+ _leave(" = %p", ret);
+ return ret;
+}
+
+/*
+ * allocate a new call
+ */
+static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+{
+ struct rxrpc_call *call;
+
+ call = kmem_cache_zalloc(rxrpc_call_jar, gfp);
+ if (!call)
+ return NULL;
+
+ call->acks_winsz = 16;
+ call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long),
+ gfp);
+ if (!call->acks_window) {
+ kmem_cache_free(rxrpc_call_jar, call);
+ return NULL;
+ }
+
+ setup_timer(&call->lifetimer, &rxrpc_call_life_expired,
+ (unsigned long) call);
+ setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
+ (unsigned long) call);
+ setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
+ (unsigned long) call);
+ setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
+ (unsigned long) call);
+ INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
+ INIT_WORK(&call->processor, &rxrpc_process_call);
+ INIT_LIST_HEAD(&call->accept_link);
+ skb_queue_head_init(&call->rx_queue);
+ skb_queue_head_init(&call->rx_oos_queue);
+ init_waitqueue_head(&call->tx_waitq);
+ spin_lock_init(&call->lock);
+ rwlock_init(&call->state_lock);
+ atomic_set(&call->usage, 1);
+ call->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+
+ memset(&call->sock_node, 0xed, sizeof(call->sock_node));
+
+ call->rx_data_expect = 1;
+ call->rx_data_eaten = 0;
+ call->rx_first_oos = 0;
+ call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size;
+ call->creation_jif = jiffies;
+ return call;
+}
+
+/*
+ * allocate a new client call and attempt to get a connection slot for it
+ */
+static struct rxrpc_call *rxrpc_alloc_client_call(
+ struct rxrpc_sock *rx,
+ struct rxrpc_transport *trans,
+ struct rxrpc_conn_bundle *bundle,
+ gfp_t gfp)
+{
+ struct rxrpc_call *call;
+ int ret;
+
+ _enter("");
+
+ ASSERT(rx != NULL);
+ ASSERT(trans != NULL);
+ ASSERT(bundle != NULL);
+
+ call = rxrpc_alloc_call(gfp);
+ if (!call)
+ return ERR_PTR(-ENOMEM);
+
+ sock_hold(&rx->sk);
+ call->socket = rx;
+ call->rx_data_post = 1;
+
+ ret = rxrpc_connect_call(rx, trans, bundle, call, gfp);
+ if (ret < 0) {
+ kmem_cache_free(rxrpc_call_jar, call);
+ return ERR_PTR(ret);
+ }
+
+ /* Record copies of information for hashtable lookup */
+ call->proto = rx->proto;
+ call->local = trans->local;
+ switch (call->proto) {
+ case AF_INET:
+ call->peer_ip.ipv4_addr =
+ trans->peer->srx.transport.sin.sin_addr.s_addr;
+ break;
+ case AF_INET6:
+ memcpy(call->peer_ip.ipv6_addr,
+ trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8,
+ sizeof(call->peer_ip.ipv6_addr));
+ break;
+ }
+ call->epoch = call->conn->epoch;
+ call->service_id = call->conn->service_id;
+ call->in_clientflag = call->conn->in_clientflag;
+ /* Add the new call to the hashtable */
+ rxrpc_call_hash_add(call);
+
+ spin_lock(&call->conn->trans->peer->lock);
+ list_add(&call->error_link, &call->conn->trans->peer->error_targets);
+ spin_unlock(&call->conn->trans->peer->lock);
+
+ call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
+ add_timer(&call->lifetimer);
+
+ _leave(" = %p", call);
+ return call;
+}
+
+/*
+ * set up a call for the given data
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx,
+ struct rxrpc_transport *trans,
+ struct rxrpc_conn_bundle *bundle,
+ unsigned long user_call_ID,
+ int create,
+ gfp_t gfp)
+{
+ struct rxrpc_call *call, *candidate;
+ struct rb_node *p, *parent, **pp;
+
+ _enter("%p,%d,%d,%lx,%d",
+ rx, trans ? trans->debug_id : -1, bundle ? bundle->debug_id : -1,
+ user_call_ID, create);
+
+ /* search the extant calls first for one that matches the specified
+ * user ID */
+ read_lock(&rx->call_lock);
+
+ p = rx->calls.rb_node;
+ while (p) {
+ call = rb_entry(p, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ p = p->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ p = p->rb_right;
+ else
+ goto found_extant_call;
+ }
+
+ read_unlock(&rx->call_lock);
+
+ if (!create || !trans)
+ return ERR_PTR(-EBADSLT);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_client_call(rx, trans, bundle, gfp);
+ if (IS_ERR(candidate)) {
+ _leave(" = %ld", PTR_ERR(candidate));
+ return candidate;
+ }
+
+ candidate->user_call_ID = user_call_ID;
+ __set_bit(RXRPC_CALL_HAS_USERID, &candidate->flags);
+
+ write_lock(&rx->call_lock);
+
+ pp = &rx->calls.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ pp = &(*pp)->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_second;
+ }
+
+ /* second search also failed; add the new call */
+ call = candidate;
+ candidate = NULL;
+ rxrpc_get_call(call);
+
+ rb_link_node(&call->sock_node, parent, pp);
+ rb_insert_color(&call->sock_node, &rx->calls);
+ write_unlock(&rx->call_lock);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+ _leave(" = %p [new]", call);
+ return call;
+
+ /* we found the call in the list immediately */
+found_extant_call:
+ rxrpc_get_call(call);
+ read_unlock(&rx->call_lock);
+ _leave(" = %p [extant %d]", call, atomic_read(&call->usage));
+ return call;
+
+ /* we found the call on the second time through the list */
+found_extant_second:
+ rxrpc_get_call(call);
+ write_unlock(&rx->call_lock);
+ rxrpc_put_call(candidate);
+ _leave(" = %p [second %d]", call, atomic_read(&call->usage));
+ return call;
+}
+
+/*
+ * set up an incoming call
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
+ struct rxrpc_connection *conn,
+ struct rxrpc_header *hdr,
+ gfp_t gfp)
+{
+ struct rxrpc_call *call, *candidate;
+ struct rb_node **p, *parent;
+ __be32 call_id;
+
+ _enter(",%d,,%x", conn->debug_id, gfp);
+
+ ASSERT(rx != NULL);
+
+ candidate = rxrpc_alloc_call(gfp);
+ if (!candidate)
+ return ERR_PTR(-EBUSY);
+
+ candidate->socket = rx;
+ candidate->conn = conn;
+ candidate->cid = hdr->cid;
+ candidate->call_id = hdr->callNumber;
+ candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK;
+ candidate->rx_data_post = 0;
+ candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
+ if (conn->security_ix > 0)
+ candidate->state = RXRPC_CALL_SERVER_SECURING;
+
+ write_lock_bh(&conn->lock);
+
+ /* set the channel for this call */
+ call = conn->channels[candidate->channel];
+ _debug("channel[%u] is %p", candidate->channel, call);
+ if (call && call->call_id == hdr->callNumber) {
+ /* already set; must've been a duplicate packet */
+ _debug("extant call [%d]", call->state);
+ ASSERTCMP(call->conn, ==, conn);
+
+ read_lock(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+ rxrpc_queue_call(call);
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ read_unlock(&call->state_lock);
+ goto aborted_call;
+ default:
+ rxrpc_get_call(call);
+ read_unlock(&call->state_lock);
+ goto extant_call;
+ }
+ }
+
+ if (call) {
+ /* it seems the channel is still in use from the previous call
+ * - ditch the old binding if its call is now complete */
+ _debug("CALL: %u { %s }",
+ call->debug_id, rxrpc_call_states[call->state]);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ conn->channels[call->channel] = NULL;
+ } else {
+ write_unlock_bh(&conn->lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -EBUSY");
+ return ERR_PTR(-EBUSY);
+ }
+ }
+
+ /* check the call number isn't duplicate */
+ _debug("check dup");
+ call_id = hdr->callNumber;
+ p = &conn->calls.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ call = rb_entry(parent, struct rxrpc_call, conn_node);
+
+ /* The tree is sorted in order of the __be32 value without
+ * turning it into host order.
+ */
+ if ((__force u32)call_id < (__force u32)call->call_id)
+ p = &(*p)->rb_left;
+ else if ((__force u32)call_id > (__force u32)call->call_id)
+ p = &(*p)->rb_right;
+ else
+ goto old_call;
+ }
+
+ /* make the call available */
+ _debug("new call");
+ call = candidate;
+ candidate = NULL;
+ rb_link_node(&call->conn_node, parent, p);
+ rb_insert_color(&call->conn_node, &conn->calls);
+ conn->channels[call->channel] = call;
+ sock_hold(&rx->sk);
+ atomic_inc(&conn->usage);
+ write_unlock_bh(&conn->lock);
+
+ spin_lock(&conn->trans->peer->lock);
+ list_add(&call->error_link, &conn->trans->peer->error_targets);
+ spin_unlock(&conn->trans->peer->lock);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_add_tail(&call->link, &rxrpc_calls);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ /* Record copies of information for hashtable lookup */
+ call->proto = rx->proto;
+ call->local = conn->trans->local;
+ switch (call->proto) {
+ case AF_INET:
+ call->peer_ip.ipv4_addr =
+ conn->trans->peer->srx.transport.sin.sin_addr.s_addr;
+ break;
+ case AF_INET6:
+ memcpy(call->peer_ip.ipv6_addr,
+ conn->trans->peer->srx.transport.sin6.sin6_addr.in6_u.u6_addr8,
+ sizeof(call->peer_ip.ipv6_addr));
+ break;
+ default:
+ break;
+ }
+ call->epoch = conn->epoch;
+ call->service_id = conn->service_id;
+ call->in_clientflag = conn->in_clientflag;
+ /* Add the new call to the hashtable */
+ rxrpc_call_hash_add(call);
+
+ _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+ call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
+ add_timer(&call->lifetimer);
+ _leave(" = %p {%d} [new]", call, call->debug_id);
+ return call;
+
+extant_call:
+ write_unlock_bh(&conn->lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
+ return call;
+
+aborted_call:
+ write_unlock_bh(&conn->lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -ECONNABORTED");
+ return ERR_PTR(-ECONNABORTED);
+
+old_call:
+ write_unlock_bh(&conn->lock);
+ kmem_cache_free(rxrpc_call_jar, candidate);
+ _leave(" = -ECONNRESET [old]");
+ return ERR_PTR(-ECONNRESET);
+}
+
+/*
+ * find an extant server call
+ * - called in process context with IRQs enabled
+ */
+struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *rx,
+ unsigned long user_call_ID)
+{
+ struct rxrpc_call *call;
+ struct rb_node *p;
+
+ _enter("%p,%lx", rx, user_call_ID);
+
+ /* search the extant calls for one that matches the specified user
+ * ID */
+ read_lock(&rx->call_lock);
+
+ p = rx->calls.rb_node;
+ while (p) {
+ call = rb_entry(p, struct rxrpc_call, sock_node);
+
+ if (user_call_ID < call->user_call_ID)
+ p = p->rb_left;
+ else if (user_call_ID > call->user_call_ID)
+ p = p->rb_right;
+ else
+ goto found_extant_call;
+ }
+
+ read_unlock(&rx->call_lock);
+ _leave(" = NULL");
+ return NULL;
+
+ /* we found the call in the list immediately */
+found_extant_call:
+ rxrpc_get_call(call);
+ read_unlock(&rx->call_lock);
+ _leave(" = %p [%d]", call, atomic_read(&call->usage));
+ return call;
+}
+
+/*
+ * detach a call from a socket and set up for release
+ */
+void rxrpc_release_call(struct rxrpc_call *call)
+{
+ struct rxrpc_connection *conn = call->conn;
+ struct rxrpc_sock *rx = call->socket;
+
+ _enter("{%d,%d,%d,%d}",
+ call->debug_id, atomic_read(&call->usage),
+ atomic_read(&call->ackr_not_idle),
+ call->rx_first_oos);
+
+ spin_lock_bh(&call->lock);
+ if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
+ BUG();
+ spin_unlock_bh(&call->lock);
+
+ /* dissociate from the socket
+ * - the socket's ref on the call is passed to the death timer
+ */
+ _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
+
+ write_lock_bh(&rx->call_lock);
+ if (!list_empty(&call->accept_link)) {
+ _debug("unlinking once-pending call %p { e=%lx f=%lx }",
+ call, call->events, call->flags);
+ ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ list_del_init(&call->accept_link);
+ sk_acceptq_removed(&rx->sk);
+ } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ rb_erase(&call->sock_node, &rx->calls);
+ memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
+ clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ }
+ write_unlock_bh(&rx->call_lock);
+
+ /* free up the channel for reuse */
+ spin_lock(&conn->trans->client_lock);
+ write_lock_bh(&conn->lock);
+ write_lock(&call->state_lock);
+
+ if (conn->channels[call->channel] == call)
+ conn->channels[call->channel] = NULL;
+
+ if (conn->out_clientflag && conn->bundle) {
+ conn->avail_calls++;
+ switch (conn->avail_calls) {
+ case 1:
+ list_move_tail(&conn->bundle_link,
+ &conn->bundle->avail_conns);
+ case 2 ... RXRPC_MAXCALLS - 1:
+ ASSERT(conn->channels[0] == NULL ||
+ conn->channels[1] == NULL ||
+ conn->channels[2] == NULL ||
+ conn->channels[3] == NULL);
+ break;
+ case RXRPC_MAXCALLS:
+ list_move_tail(&conn->bundle_link,
+ &conn->bundle->unused_conns);
+ ASSERT(conn->channels[0] == NULL &&
+ conn->channels[1] == NULL &&
+ conn->channels[2] == NULL &&
+ conn->channels[3] == NULL);
+ break;
+ default:
+ printk(KERN_ERR "RxRPC: conn->avail_calls=%d\n",
+ conn->avail_calls);
+ BUG();
+ }
+ }
+
+ spin_unlock(&conn->trans->client_lock);
+
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
+ _debug("+++ ABORTING STATE %d +++\n", call->state);
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = RX_CALL_DEAD;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+ write_unlock_bh(&conn->lock);
+
+ /* clean up the Rx queue */
+ if (!skb_queue_empty(&call->rx_queue) ||
+ !skb_queue_empty(&call->rx_oos_queue)) {
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+
+ _debug("purge Rx queues");
+
+ spin_lock_bh(&call->lock);
+ while ((skb = skb_dequeue(&call->rx_queue)) ||
+ (skb = skb_dequeue(&call->rx_oos_queue))) {
+ sp = rxrpc_skb(skb);
+ if (sp->call) {
+ ASSERTCMP(sp->call, ==, call);
+ rxrpc_put_call(call);
+ sp->call = NULL;
+ }
+ skb->destructor = NULL;
+ spin_unlock_bh(&call->lock);
+
+ _debug("- zap %s %%%u #%u",
+ rxrpc_pkts[sp->hdr.type],
+ ntohl(sp->hdr.serial),
+ ntohl(sp->hdr.seq));
+ rxrpc_free_skb(skb);
+ spin_lock_bh(&call->lock);
+ }
+ spin_unlock_bh(&call->lock);
+
+ ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE);
+ }
+
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ del_timer_sync(&call->lifetimer);
+ call->deadspan.expires = jiffies + rxrpc_dead_call_expiry;
+ add_timer(&call->deadspan);
+
+ _leave("");
+}
+
+/*
+ * handle a dead call being ready for reaping
+ */
+static void rxrpc_dead_call_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ write_lock_bh(&call->state_lock);
+ call->state = RXRPC_CALL_DEAD;
+ write_unlock_bh(&call->state_lock);
+ rxrpc_put_call(call);
+}
+
+/*
+ * mark a call as to be released, aborting it if it's still in progress
+ * - called with softirqs disabled
+ */
+static void rxrpc_mark_call_released(struct rxrpc_call *call)
+{
+ bool sched;
+
+ write_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD) {
+ sched = false;
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ _debug("abort call %p", call);
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = RX_CALL_DEAD;
+ if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+ sched = true;
+ }
+ if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ sched = true;
+ if (sched)
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+}
+
+/*
+ * release all the calls associated with a socket
+ */
+void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
+{
+ struct rxrpc_call *call;
+ struct rb_node *p;
+
+ _enter("%p", rx);
+
+ read_lock_bh(&rx->call_lock);
+
+ /* mark all the calls as no longer wanting incoming packets */
+ for (p = rb_first(&rx->calls); p; p = rb_next(p)) {
+ call = rb_entry(p, struct rxrpc_call, sock_node);
+ rxrpc_mark_call_released(call);
+ }
+
+ /* kill the not-yet-accepted incoming calls */
+ list_for_each_entry(call, &rx->secureq, accept_link) {
+ rxrpc_mark_call_released(call);
+ }
+
+ list_for_each_entry(call, &rx->acceptq, accept_link) {
+ rxrpc_mark_call_released(call);
+ }
+
+ read_unlock_bh(&rx->call_lock);
+ _leave("");
+}
+
+/*
+ * release a call
+ */
+void __rxrpc_put_call(struct rxrpc_call *call)
+{
+ ASSERT(call != NULL);
+
+ _enter("%p{u=%d}", call, atomic_read(&call->usage));
+
+ ASSERTCMP(atomic_read(&call->usage), >, 0);
+
+ if (atomic_dec_and_test(&call->usage)) {
+ _debug("call %d dead", call->debug_id);
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+ rxrpc_queue_work(&call->destroyer);
+ }
+ _leave("");
+}
+
+/*
+ * clean up a call
+ */
+static void rxrpc_cleanup_call(struct rxrpc_call *call)
+{
+ _net("DESTROY CALL %d", call->debug_id);
+
+ ASSERT(call->socket);
+
+ memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
+
+ del_timer_sync(&call->lifetimer);
+ del_timer_sync(&call->deadspan);
+ del_timer_sync(&call->ack_timer);
+ del_timer_sync(&call->resend_timer);
+
+ ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
+ ASSERTCMP(call->events, ==, 0);
+ if (work_pending(&call->processor)) {
+ _debug("defer destroy");
+ rxrpc_queue_work(&call->destroyer);
+ return;
+ }
+
+ if (call->conn) {
+ spin_lock(&call->conn->trans->peer->lock);
+ list_del(&call->error_link);
+ spin_unlock(&call->conn->trans->peer->lock);
+
+ write_lock_bh(&call->conn->lock);
+ rb_erase(&call->conn_node, &call->conn->calls);
+ write_unlock_bh(&call->conn->lock);
+ rxrpc_put_connection(call->conn);
+ }
+
+ /* Remove the call from the hash */
+ rxrpc_call_hash_del(call);
+
+ if (call->acks_window) {
+ _debug("kill Tx window %d",
+ CIRC_CNT(call->acks_head, call->acks_tail,
+ call->acks_winsz));
+ smp_mb();
+ while (CIRC_CNT(call->acks_head, call->acks_tail,
+ call->acks_winsz) > 0) {
+ struct rxrpc_skb_priv *sp;
+ unsigned long _skb;
+
+ _skb = call->acks_window[call->acks_tail] & ~1;
+ sp = rxrpc_skb((struct sk_buff *) _skb);
+ _debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+ rxrpc_free_skb((struct sk_buff *) _skb);
+ call->acks_tail =
+ (call->acks_tail + 1) & (call->acks_winsz - 1);
+ }
+
+ kfree(call->acks_window);
+ }
+
+ rxrpc_free_skb(call->tx_pending);
+
+ rxrpc_purge_queue(&call->rx_queue);
+ ASSERT(skb_queue_empty(&call->rx_oos_queue));
+ sock_put(&call->socket->sk);
+ kmem_cache_free(rxrpc_call_jar, call);
+}
+
+/*
+ * destroy a call
+ */
+static void rxrpc_destroy_call(struct work_struct *work)
+{
+ struct rxrpc_call *call =
+ container_of(work, struct rxrpc_call, destroyer);
+
+ _enter("%p{%d,%d,%p}",
+ call, atomic_read(&call->usage), call->channel, call->conn);
+
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+
+ write_lock_bh(&rxrpc_call_lock);
+ list_del_init(&call->link);
+ write_unlock_bh(&rxrpc_call_lock);
+
+ rxrpc_cleanup_call(call);
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the call records from a transport endpoint rather
+ * than waiting for them to time out
+ */
+void __exit rxrpc_destroy_all_calls(void)
+{
+ struct rxrpc_call *call;
+
+ _enter("");
+ write_lock_bh(&rxrpc_call_lock);
+
+ while (!list_empty(&rxrpc_calls)) {
+ call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
+ _debug("Zapping call %p", call);
+
+ list_del_init(&call->link);
+
+ switch (atomic_read(&call->usage)) {
+ case 0:
+ ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+ break;
+ case 1:
+ if (del_timer_sync(&call->deadspan) != 0 &&
+ call->state != RXRPC_CALL_DEAD)
+ rxrpc_dead_call_expired((unsigned long) call);
+ if (call->state != RXRPC_CALL_DEAD)
+ break;
+ default:
+ printk(KERN_ERR "RXRPC:"
+ " Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
+ call, atomic_read(&call->usage),
+ atomic_read(&call->ackr_not_idle),
+ rxrpc_call_states[call->state],
+ call->flags, call->events);
+ if (!skb_queue_empty(&call->rx_queue))
+ printk(KERN_ERR"RXRPC: Rx queue occupied\n");
+ if (!skb_queue_empty(&call->rx_oos_queue))
+ printk(KERN_ERR"RXRPC: OOS queue occupied\n");
+ break;
+ }
+
+ write_unlock_bh(&rxrpc_call_lock);
+ cond_resched();
+ write_lock_bh(&rxrpc_call_lock);
+ }
+
+ write_unlock_bh(&rxrpc_call_lock);
+ _leave("");
+}
+
+/*
+ * handle call lifetime being exceeded
+ */
+static void rxrpc_call_life_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ _enter("{%d}", call->debug_id);
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ set_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * handle resend timer expiry
+ * - may not take call->state_lock as this can deadlock against del_timer_sync()
+ */
+static void rxrpc_resend_time_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ rxrpc_queue_call(call);
+}
+
+/*
+ * handle ACK timer expiry
+ */
+static void rxrpc_ack_time_expired(unsigned long _call)
+{
+ struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+ _enter("{%d}", call->debug_id);
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return;
+
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
new file mode 100644
index 000000000..6631f4f1e
--- /dev/null
+++ b/net/rxrpc/ar-connection.c
@@ -0,0 +1,928 @@
+/* RxRPC virtual connection handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time till a connection expires after last use (in seconds).
+ */
+unsigned rxrpc_connection_expiry = 10 * 60;
+
+static void rxrpc_connection_reaper(struct work_struct *work);
+
+LIST_HEAD(rxrpc_connections);
+DEFINE_RWLOCK(rxrpc_connection_lock);
+static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+
+/*
+ * allocate a new client connection bundle
+ */
+static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
+{
+ struct rxrpc_conn_bundle *bundle;
+
+ _enter("");
+
+ bundle = kzalloc(sizeof(struct rxrpc_conn_bundle), gfp);
+ if (bundle) {
+ INIT_LIST_HEAD(&bundle->unused_conns);
+ INIT_LIST_HEAD(&bundle->avail_conns);
+ INIT_LIST_HEAD(&bundle->busy_conns);
+ init_waitqueue_head(&bundle->chanwait);
+ atomic_set(&bundle->usage, 1);
+ }
+
+ _leave(" = %p", bundle);
+ return bundle;
+}
+
+/*
+ * compare bundle parameters with what we're looking for
+ * - return -ve, 0 or +ve
+ */
+static inline
+int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
+ struct key *key, __be16 service_id)
+{
+ return (bundle->service_id - service_id) ?:
+ ((unsigned long) bundle->key - (unsigned long) key);
+}
+
+/*
+ * get bundle of client connections that a client socket can make use of
+ */
+struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
+ struct rxrpc_transport *trans,
+ struct key *key,
+ __be16 service_id,
+ gfp_t gfp)
+{
+ struct rxrpc_conn_bundle *bundle, *candidate;
+ struct rb_node *p, *parent, **pp;
+
+ _enter("%p{%x},%x,%hx,",
+ rx, key_serial(key), trans->debug_id, ntohs(service_id));
+
+ if (rx->trans == trans && rx->bundle) {
+ atomic_inc(&rx->bundle->usage);
+ return rx->bundle;
+ }
+
+ /* search the extant bundles first for one that matches the specified
+ * user ID */
+ spin_lock(&trans->client_lock);
+
+ p = trans->bundles.rb_node;
+ while (p) {
+ bundle = rb_entry(p, struct rxrpc_conn_bundle, node);
+
+ if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
+ p = p->rb_left;
+ else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
+ p = p->rb_right;
+ else
+ goto found_extant_bundle;
+ }
+
+ spin_unlock(&trans->client_lock);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_bundle(gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ candidate->key = key_get(key);
+ candidate->service_id = service_id;
+
+ spin_lock(&trans->client_lock);
+
+ pp = &trans->bundles.rb_node;
+ parent = NULL;
+ while (*pp) {
+ parent = *pp;
+ bundle = rb_entry(parent, struct rxrpc_conn_bundle, node);
+
+ if (rxrpc_cmp_bundle(bundle, key, service_id) < 0)
+ pp = &(*pp)->rb_left;
+ else if (rxrpc_cmp_bundle(bundle, key, service_id) > 0)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_second;
+ }
+
+ /* second search also failed; add the new bundle */
+ bundle = candidate;
+ candidate = NULL;
+
+ rb_link_node(&bundle->node, parent, pp);
+ rb_insert_color(&bundle->node, &trans->bundles);
+ spin_unlock(&trans->client_lock);
+ _net("BUNDLE new on trans %d", trans->debug_id);
+ if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+ atomic_inc(&bundle->usage);
+ rx->bundle = bundle;
+ }
+ _leave(" = %p [new]", bundle);
+ return bundle;
+
+ /* we found the bundle in the list immediately */
+found_extant_bundle:
+ atomic_inc(&bundle->usage);
+ spin_unlock(&trans->client_lock);
+ _net("BUNDLE old on trans %d", trans->debug_id);
+ if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+ atomic_inc(&bundle->usage);
+ rx->bundle = bundle;
+ }
+ _leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage));
+ return bundle;
+
+ /* we found the bundle on the second time through the list */
+found_extant_second:
+ atomic_inc(&bundle->usage);
+ spin_unlock(&trans->client_lock);
+ kfree(candidate);
+ _net("BUNDLE old2 on trans %d", trans->debug_id);
+ if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+ atomic_inc(&bundle->usage);
+ rx->bundle = bundle;
+ }
+ _leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage));
+ return bundle;
+}
+
+/*
+ * release a bundle
+ */
+void rxrpc_put_bundle(struct rxrpc_transport *trans,
+ struct rxrpc_conn_bundle *bundle)
+{
+ _enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage));
+
+ if (atomic_dec_and_lock(&bundle->usage, &trans->client_lock)) {
+ _debug("Destroy bundle");
+ rb_erase(&bundle->node, &trans->bundles);
+ spin_unlock(&trans->client_lock);
+ ASSERT(list_empty(&bundle->unused_conns));
+ ASSERT(list_empty(&bundle->avail_conns));
+ ASSERT(list_empty(&bundle->busy_conns));
+ ASSERTCMP(bundle->num_conns, ==, 0);
+ key_put(bundle->key);
+ kfree(bundle);
+ }
+
+ _leave("");
+}
+
+/*
+ * allocate a new connection
+ */
+static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+
+ _enter("");
+
+ conn = kzalloc(sizeof(struct rxrpc_connection), gfp);
+ if (conn) {
+ INIT_WORK(&conn->processor, &rxrpc_process_connection);
+ INIT_LIST_HEAD(&conn->bundle_link);
+ conn->calls = RB_ROOT;
+ skb_queue_head_init(&conn->rx_queue);
+ rwlock_init(&conn->lock);
+ spin_lock_init(&conn->state_lock);
+ atomic_set(&conn->usage, 1);
+ conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ conn->avail_calls = RXRPC_MAXCALLS;
+ conn->size_align = 4;
+ conn->header_size = sizeof(struct rxrpc_header);
+ }
+
+ _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
+ return conn;
+}
+
+/*
+ * assign a connection ID to a connection and add it to the transport's
+ * connection lookup tree
+ * - called with transport client lock held
+ */
+static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
+{
+ struct rxrpc_connection *xconn;
+ struct rb_node *parent, **p;
+ __be32 epoch;
+ u32 real_conn_id;
+
+ _enter("");
+
+ epoch = conn->epoch;
+
+ write_lock_bh(&conn->trans->conn_lock);
+
+ conn->trans->conn_idcounter += RXRPC_CID_INC;
+ if (conn->trans->conn_idcounter < RXRPC_CID_INC)
+ conn->trans->conn_idcounter = RXRPC_CID_INC;
+ real_conn_id = conn->trans->conn_idcounter;
+
+attempt_insertion:
+ parent = NULL;
+ p = &conn->trans->client_conns.rb_node;
+
+ while (*p) {
+ parent = *p;
+ xconn = rb_entry(parent, struct rxrpc_connection, node);
+
+ if (epoch < xconn->epoch)
+ p = &(*p)->rb_left;
+ else if (epoch > xconn->epoch)
+ p = &(*p)->rb_right;
+ else if (real_conn_id < xconn->real_conn_id)
+ p = &(*p)->rb_left;
+ else if (real_conn_id > xconn->real_conn_id)
+ p = &(*p)->rb_right;
+ else
+ goto id_exists;
+ }
+
+ /* we've found a suitable hole - arrange for this connection to occupy
+ * it */
+ rb_link_node(&conn->node, parent, p);
+ rb_insert_color(&conn->node, &conn->trans->client_conns);
+
+ conn->real_conn_id = real_conn_id;
+ conn->cid = htonl(real_conn_id);
+ write_unlock_bh(&conn->trans->conn_lock);
+ _leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+ return;
+
+ /* we found a connection with the proposed ID - walk the tree from that
+ * point looking for the next unused ID */
+id_exists:
+ for (;;) {
+ real_conn_id += RXRPC_CID_INC;
+ if (real_conn_id < RXRPC_CID_INC) {
+ real_conn_id = RXRPC_CID_INC;
+ conn->trans->conn_idcounter = real_conn_id;
+ goto attempt_insertion;
+ }
+
+ parent = rb_next(parent);
+ if (!parent)
+ goto attempt_insertion;
+
+ xconn = rb_entry(parent, struct rxrpc_connection, node);
+ if (epoch < xconn->epoch ||
+ real_conn_id < xconn->real_conn_id)
+ goto attempt_insertion;
+ }
+}
+
+/*
+ * add a call to a connection's call-by-ID tree
+ */
+static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
+ struct rxrpc_call *call)
+{
+ struct rxrpc_call *xcall;
+ struct rb_node *parent, **p;
+ __be32 call_id;
+
+ write_lock_bh(&conn->lock);
+
+ call_id = call->call_id;
+ p = &conn->calls.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ xcall = rb_entry(parent, struct rxrpc_call, conn_node);
+
+ if (call_id < xcall->call_id)
+ p = &(*p)->rb_left;
+ else if (call_id > xcall->call_id)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&call->conn_node, parent, p);
+ rb_insert_color(&call->conn_node, &conn->calls);
+
+ write_unlock_bh(&conn->lock);
+}
+
+/*
+ * connect a call on an exclusive connection
+ */
+static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
+ struct rxrpc_transport *trans,
+ __be16 service_id,
+ struct rxrpc_call *call,
+ gfp_t gfp)
+{
+ struct rxrpc_connection *conn;
+ int chan, ret;
+
+ _enter("");
+
+ conn = rx->conn;
+ if (!conn) {
+ /* not yet present - create a candidate for a new connection
+ * and then redo the check */
+ conn = rxrpc_alloc_connection(gfp);
+ if (!conn) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ conn->trans = trans;
+ conn->bundle = NULL;
+ conn->service_id = service_id;
+ conn->epoch = rxrpc_epoch;
+ conn->in_clientflag = 0;
+ conn->out_clientflag = RXRPC_CLIENT_INITIATED;
+ conn->cid = 0;
+ conn->state = RXRPC_CONN_CLIENT;
+ conn->avail_calls = RXRPC_MAXCALLS - 1;
+ conn->security_level = rx->min_sec_level;
+ conn->key = key_get(rx->key);
+
+ ret = rxrpc_init_client_conn_security(conn);
+ if (ret < 0) {
+ key_put(conn->key);
+ kfree(conn);
+ _leave(" = %d [key]", ret);
+ return ret;
+ }
+
+ write_lock_bh(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock_bh(&rxrpc_connection_lock);
+
+ spin_lock(&trans->client_lock);
+ atomic_inc(&trans->usage);
+
+ _net("CONNECT EXCL new %d on TRANS %d",
+ conn->debug_id, conn->trans->debug_id);
+
+ rxrpc_assign_connection_id(conn);
+ rx->conn = conn;
+ } else {
+ spin_lock(&trans->client_lock);
+ }
+
+ /* we've got a connection with a free channel and we can now attach the
+ * call to it
+ * - we're holding the transport's client lock
+ * - we're holding a reference on the connection
+ */
+ for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+ if (!conn->channels[chan])
+ goto found_channel;
+ goto no_free_channels;
+
+found_channel:
+ atomic_inc(&conn->usage);
+ conn->channels[chan] = call;
+ call->conn = conn;
+ call->channel = chan;
+ call->cid = conn->cid | htonl(chan);
+ call->call_id = htonl(++conn->call_counter);
+
+ _net("CONNECT client on conn %d chan %d as call %x",
+ conn->debug_id, chan, ntohl(call->call_id));
+
+ spin_unlock(&trans->client_lock);
+
+ rxrpc_add_call_ID_to_conn(conn, call);
+ _leave(" = 0");
+ return 0;
+
+no_free_channels:
+ spin_unlock(&trans->client_lock);
+ _leave(" = -ENOSR");
+ return -ENOSR;
+}
+
+/*
+ * find a connection for a call
+ * - called in process context with IRQs enabled
+ */
+int rxrpc_connect_call(struct rxrpc_sock *rx,
+ struct rxrpc_transport *trans,
+ struct rxrpc_conn_bundle *bundle,
+ struct rxrpc_call *call,
+ gfp_t gfp)
+{
+ struct rxrpc_connection *conn, *candidate;
+ int chan, ret;
+
+ DECLARE_WAITQUEUE(myself, current);
+
+ _enter("%p,%lx,", rx, call->user_call_ID);
+
+ if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags))
+ return rxrpc_connect_exclusive(rx, trans, bundle->service_id,
+ call, gfp);
+
+ spin_lock(&trans->client_lock);
+ for (;;) {
+ /* see if the bundle has a call slot available */
+ if (!list_empty(&bundle->avail_conns)) {
+ _debug("avail");
+ conn = list_entry(bundle->avail_conns.next,
+ struct rxrpc_connection,
+ bundle_link);
+ if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+ list_del_init(&conn->bundle_link);
+ bundle->num_conns--;
+ continue;
+ }
+ if (--conn->avail_calls == 0)
+ list_move(&conn->bundle_link,
+ &bundle->busy_conns);
+ ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
+ ASSERT(conn->channels[0] == NULL ||
+ conn->channels[1] == NULL ||
+ conn->channels[2] == NULL ||
+ conn->channels[3] == NULL);
+ atomic_inc(&conn->usage);
+ break;
+ }
+
+ if (!list_empty(&bundle->unused_conns)) {
+ _debug("unused");
+ conn = list_entry(bundle->unused_conns.next,
+ struct rxrpc_connection,
+ bundle_link);
+ if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+ list_del_init(&conn->bundle_link);
+ bundle->num_conns--;
+ continue;
+ }
+ ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS);
+ conn->avail_calls = RXRPC_MAXCALLS - 1;
+ ASSERT(conn->channels[0] == NULL &&
+ conn->channels[1] == NULL &&
+ conn->channels[2] == NULL &&
+ conn->channels[3] == NULL);
+ atomic_inc(&conn->usage);
+ list_move(&conn->bundle_link, &bundle->avail_conns);
+ break;
+ }
+
+ /* need to allocate a new connection */
+ _debug("get new conn [%d]", bundle->num_conns);
+
+ spin_unlock(&trans->client_lock);
+
+ if (signal_pending(current))
+ goto interrupted;
+
+ if (bundle->num_conns >= 20) {
+ _debug("too many conns");
+
+ if (!(gfp & __GFP_WAIT)) {
+ _leave(" = -EAGAIN");
+ return -EAGAIN;
+ }
+
+ add_wait_queue(&bundle->chanwait, &myself);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (bundle->num_conns < 20 ||
+ !list_empty(&bundle->unused_conns) ||
+ !list_empty(&bundle->avail_conns))
+ break;
+ if (signal_pending(current))
+ goto interrupted_dequeue;
+ schedule();
+ }
+ remove_wait_queue(&bundle->chanwait, &myself);
+ __set_current_state(TASK_RUNNING);
+ spin_lock(&trans->client_lock);
+ continue;
+ }
+
+ /* not yet present - create a candidate for a new connection and then
+ * redo the check */
+ candidate = rxrpc_alloc_connection(gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+ }
+
+ candidate->trans = trans;
+ candidate->bundle = bundle;
+ candidate->service_id = bundle->service_id;
+ candidate->epoch = rxrpc_epoch;
+ candidate->in_clientflag = 0;
+ candidate->out_clientflag = RXRPC_CLIENT_INITIATED;
+ candidate->cid = 0;
+ candidate->state = RXRPC_CONN_CLIENT;
+ candidate->avail_calls = RXRPC_MAXCALLS;
+ candidate->security_level = rx->min_sec_level;
+ candidate->key = key_get(bundle->key);
+
+ ret = rxrpc_init_client_conn_security(candidate);
+ if (ret < 0) {
+ key_put(candidate->key);
+ kfree(candidate);
+ _leave(" = %d [key]", ret);
+ return ret;
+ }
+
+ write_lock_bh(&rxrpc_connection_lock);
+ list_add_tail(&candidate->link, &rxrpc_connections);
+ write_unlock_bh(&rxrpc_connection_lock);
+
+ spin_lock(&trans->client_lock);
+
+ list_add(&candidate->bundle_link, &bundle->unused_conns);
+ bundle->num_conns++;
+ atomic_inc(&bundle->usage);
+ atomic_inc(&trans->usage);
+
+ _net("CONNECT new %d on TRANS %d",
+ candidate->debug_id, candidate->trans->debug_id);
+
+ rxrpc_assign_connection_id(candidate);
+ if (candidate->security)
+ candidate->security->prime_packet_security(candidate);
+
+ /* leave the candidate lurking in zombie mode attached to the
+ * bundle until we're ready for it */
+ rxrpc_put_connection(candidate);
+ candidate = NULL;
+ }
+
+ /* we've got a connection with a free channel and we can now attach the
+ * call to it
+ * - we're holding the transport's client lock
+ * - we're holding a reference on the connection
+ * - we're holding a reference on the bundle
+ */
+ for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+ if (!conn->channels[chan])
+ goto found_channel;
+ ASSERT(conn->channels[0] == NULL ||
+ conn->channels[1] == NULL ||
+ conn->channels[2] == NULL ||
+ conn->channels[3] == NULL);
+ BUG();
+
+found_channel:
+ conn->channels[chan] = call;
+ call->conn = conn;
+ call->channel = chan;
+ call->cid = conn->cid | htonl(chan);
+ call->call_id = htonl(++conn->call_counter);
+
+ _net("CONNECT client on conn %d chan %d as call %x",
+ conn->debug_id, chan, ntohl(call->call_id));
+
+ ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
+ spin_unlock(&trans->client_lock);
+
+ rxrpc_add_call_ID_to_conn(conn, call);
+
+ _leave(" = 0");
+ return 0;
+
+interrupted_dequeue:
+ remove_wait_queue(&bundle->chanwait, &myself);
+ __set_current_state(TASK_RUNNING);
+interrupted:
+ _leave(" = -ERESTARTSYS");
+ return -ERESTARTSYS;
+}
+
+/*
+ * get a record of an incoming connection
+ */
+struct rxrpc_connection *
+rxrpc_incoming_connection(struct rxrpc_transport *trans,
+ struct rxrpc_header *hdr,
+ gfp_t gfp)
+{
+ struct rxrpc_connection *conn, *candidate = NULL;
+ struct rb_node *p, **pp;
+ const char *new = "old";
+ __be32 epoch;
+ u32 conn_id;
+
+ _enter("");
+
+ ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
+
+ epoch = hdr->epoch;
+ conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+
+ /* search the connection list first */
+ read_lock_bh(&trans->conn_lock);
+
+ p = trans->server_conns.rb_node;
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, node);
+
+ _debug("maybe %x", conn->real_conn_id);
+
+ if (epoch < conn->epoch)
+ p = p->rb_left;
+ else if (epoch > conn->epoch)
+ p = p->rb_right;
+ else if (conn_id < conn->real_conn_id)
+ p = p->rb_left;
+ else if (conn_id > conn->real_conn_id)
+ p = p->rb_right;
+ else
+ goto found_extant_connection;
+ }
+ read_unlock_bh(&trans->conn_lock);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_connection(gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ candidate->trans = trans;
+ candidate->epoch = hdr->epoch;
+ candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK);
+ candidate->service_id = hdr->serviceId;
+ candidate->security_ix = hdr->securityIndex;
+ candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
+ candidate->out_clientflag = 0;
+ candidate->real_conn_id = conn_id;
+ candidate->state = RXRPC_CONN_SERVER;
+ if (candidate->service_id)
+ candidate->state = RXRPC_CONN_SERVER_UNSECURED;
+
+ write_lock_bh(&trans->conn_lock);
+
+ pp = &trans->server_conns.rb_node;
+ p = NULL;
+ while (*pp) {
+ p = *pp;
+ conn = rb_entry(p, struct rxrpc_connection, node);
+
+ if (epoch < conn->epoch)
+ pp = &(*pp)->rb_left;
+ else if (epoch > conn->epoch)
+ pp = &(*pp)->rb_right;
+ else if (conn_id < conn->real_conn_id)
+ pp = &(*pp)->rb_left;
+ else if (conn_id > conn->real_conn_id)
+ pp = &(*pp)->rb_right;
+ else
+ goto found_extant_second;
+ }
+
+ /* we can now add the new candidate to the list */
+ conn = candidate;
+ candidate = NULL;
+ rb_link_node(&conn->node, p, pp);
+ rb_insert_color(&conn->node, &trans->server_conns);
+ atomic_inc(&conn->trans->usage);
+
+ write_unlock_bh(&trans->conn_lock);
+
+ write_lock_bh(&rxrpc_connection_lock);
+ list_add_tail(&conn->link, &rxrpc_connections);
+ write_unlock_bh(&rxrpc_connection_lock);
+
+ new = "new";
+
+success:
+ _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id);
+
+ _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+ return conn;
+
+ /* we found the connection in the list immediately */
+found_extant_connection:
+ if (hdr->securityIndex != conn->security_ix) {
+ read_unlock_bh(&trans->conn_lock);
+ goto security_mismatch;
+ }
+ atomic_inc(&conn->usage);
+ read_unlock_bh(&trans->conn_lock);
+ goto success;
+
+ /* we found the connection on the second time through the list */
+found_extant_second:
+ if (hdr->securityIndex != conn->security_ix) {
+ write_unlock_bh(&trans->conn_lock);
+ goto security_mismatch;
+ }
+ atomic_inc(&conn->usage);
+ write_unlock_bh(&trans->conn_lock);
+ kfree(candidate);
+ goto success;
+
+security_mismatch:
+ kfree(candidate);
+ _leave(" = -EKEYREJECTED");
+ return ERR_PTR(-EKEYREJECTED);
+}
+
+/*
+ * find a connection based on transport and RxRPC connection ID for an incoming
+ * packet
+ */
+struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
+ struct rxrpc_header *hdr)
+{
+ struct rxrpc_connection *conn;
+ struct rb_node *p;
+ __be32 epoch;
+ u32 conn_id;
+
+ _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags);
+
+ read_lock_bh(&trans->conn_lock);
+
+ conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+ epoch = hdr->epoch;
+
+ if (hdr->flags & RXRPC_CLIENT_INITIATED)
+ p = trans->server_conns.rb_node;
+ else
+ p = trans->client_conns.rb_node;
+
+ while (p) {
+ conn = rb_entry(p, struct rxrpc_connection, node);
+
+ _debug("maybe %x", conn->real_conn_id);
+
+ if (epoch < conn->epoch)
+ p = p->rb_left;
+ else if (epoch > conn->epoch)
+ p = p->rb_right;
+ else if (conn_id < conn->real_conn_id)
+ p = p->rb_left;
+ else if (conn_id > conn->real_conn_id)
+ p = p->rb_right;
+ else
+ goto found;
+ }
+
+ read_unlock_bh(&trans->conn_lock);
+ _leave(" = NULL");
+ return NULL;
+
+found:
+ atomic_inc(&conn->usage);
+ read_unlock_bh(&trans->conn_lock);
+ _leave(" = %p", conn);
+ return conn;
+}
+
+/*
+ * release a virtual connection
+ */
+void rxrpc_put_connection(struct rxrpc_connection *conn)
+{
+ _enter("%p{u=%d,d=%d}",
+ conn, atomic_read(&conn->usage), conn->debug_id);
+
+ ASSERTCMP(atomic_read(&conn->usage), >, 0);
+
+ conn->put_time = get_seconds();
+ if (atomic_dec_and_test(&conn->usage)) {
+ _debug("zombie");
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+ }
+
+ _leave("");
+}
+
+/*
+ * destroy a virtual connection
+ */
+static void rxrpc_destroy_connection(struct rxrpc_connection *conn)
+{
+ _enter("%p{%d}", conn, atomic_read(&conn->usage));
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+
+ _net("DESTROY CONN %d", conn->debug_id);
+
+ if (conn->bundle)
+ rxrpc_put_bundle(conn->trans, conn->bundle);
+
+ ASSERT(RB_EMPTY_ROOT(&conn->calls));
+ rxrpc_purge_queue(&conn->rx_queue);
+
+ rxrpc_clear_conn_security(conn);
+ rxrpc_put_transport(conn->trans);
+ kfree(conn);
+ _leave("");
+}
+
+/*
+ * reap dead connections
+ */
+static void rxrpc_connection_reaper(struct work_struct *work)
+{
+ struct rxrpc_connection *conn, *_p;
+ unsigned long now, earliest, reap_time;
+
+ LIST_HEAD(graveyard);
+
+ _enter("");
+
+ now = get_seconds();
+ earliest = ULONG_MAX;
+
+ write_lock_bh(&rxrpc_connection_lock);
+ list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
+ _debug("reap CONN %d { u=%d,t=%ld }",
+ conn->debug_id, atomic_read(&conn->usage),
+ (long) now - (long) conn->put_time);
+
+ if (likely(atomic_read(&conn->usage) > 0))
+ continue;
+
+ spin_lock(&conn->trans->client_lock);
+ write_lock(&conn->trans->conn_lock);
+ reap_time = conn->put_time + rxrpc_connection_expiry;
+
+ if (atomic_read(&conn->usage) > 0) {
+ ;
+ } else if (reap_time <= now) {
+ list_move_tail(&conn->link, &graveyard);
+ if (conn->out_clientflag)
+ rb_erase(&conn->node,
+ &conn->trans->client_conns);
+ else
+ rb_erase(&conn->node,
+ &conn->trans->server_conns);
+ if (conn->bundle) {
+ list_del_init(&conn->bundle_link);
+ conn->bundle->num_conns--;
+ }
+
+ } else if (reap_time < earliest) {
+ earliest = reap_time;
+ }
+
+ write_unlock(&conn->trans->conn_lock);
+ spin_unlock(&conn->trans->client_lock);
+ }
+ write_unlock_bh(&rxrpc_connection_lock);
+
+ if (earliest != ULONG_MAX) {
+ _debug("reschedule reaper %ld", (long) earliest - now);
+ ASSERTCMP(earliest, >, now);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap,
+ (earliest - now) * HZ);
+ }
+
+ /* then destroy all those pulled out */
+ while (!list_empty(&graveyard)) {
+ conn = list_entry(graveyard.next, struct rxrpc_connection,
+ link);
+ list_del_init(&conn->link);
+
+ ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+ rxrpc_destroy_connection(conn);
+ }
+
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the connection records rather than waiting for them
+ * to time out
+ */
+void __exit rxrpc_destroy_all_connections(void)
+{
+ _enter("");
+
+ rxrpc_connection_expiry = 0;
+ cancel_delayed_work(&rxrpc_connection_reap);
+ rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+
+ _leave("");
+}
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
new file mode 100644
index 000000000..e7ed43a54
--- /dev/null
+++ b/net/rxrpc/ar-connevent.c
@@ -0,0 +1,405 @@
+/* connection-level event handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * pass a connection-level abort onto all calls on that connection
+ */
+static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
+ u32 abort_code)
+{
+ struct rxrpc_call *call;
+ struct rb_node *p;
+
+ _enter("{%d},%x", conn->debug_id, abort_code);
+
+ read_lock_bh(&conn->lock);
+
+ for (p = rb_first(&conn->calls); p; p = rb_next(p)) {
+ call = rb_entry(p, struct rxrpc_call, conn_node);
+ write_lock(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = state;
+ call->abort_code = abort_code;
+ if (state == RXRPC_CALL_LOCALLY_ABORTED)
+ set_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+ else
+ set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+ }
+
+ read_unlock_bh(&conn->lock);
+ _leave("");
+}
+
+/*
+ * generate a connection-level abort
+ */
+static int rxrpc_abort_connection(struct rxrpc_connection *conn,
+ u32 error, u32 abort_code)
+{
+ struct rxrpc_header hdr;
+ struct msghdr msg;
+ struct kvec iov[2];
+ __be32 word;
+ size_t len;
+ int ret;
+
+ _enter("%d,,%u,%u", conn->debug_id, error, abort_code);
+
+ /* generate a connection-level abort */
+ spin_lock_bh(&conn->state_lock);
+ if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) {
+ conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+ conn->error = error;
+ spin_unlock_bh(&conn->state_lock);
+ } else {
+ spin_unlock_bh(&conn->state_lock);
+ _leave(" = 0 [already dead]");
+ return 0;
+ }
+
+ rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
+
+ msg.msg_name = &conn->trans->peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr.epoch = conn->epoch;
+ hdr.cid = conn->cid;
+ hdr.callNumber = 0;
+ hdr.seq = 0;
+ hdr.type = RXRPC_PACKET_TYPE_ABORT;
+ hdr.flags = conn->out_clientflag;
+ hdr.userStatus = 0;
+ hdr.securityIndex = conn->security_ix;
+ hdr._rsvd = 0;
+ hdr.serviceId = conn->service_id;
+
+ word = htonl(abort_code);
+
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+ iov[1].iov_base = &word;
+ iov[1].iov_len = sizeof(word);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ hdr.serial = htonl(atomic_inc_return(&conn->serial));
+ _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code);
+
+ ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * mark a call as being on a now-secured channel
+ * - must be called with softirqs disabled
+ */
+static void rxrpc_call_is_secure(struct rxrpc_call *call)
+{
+ _enter("%p", call);
+ if (call) {
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_SECURED, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ }
+}
+
+/*
+ * connection-level Rx packet processor
+ */
+static int rxrpc_process_event(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 tmp;
+ u32 serial;
+ int loop, ret;
+
+ if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+ kleave(" = -ECONNABORTED [%u]", conn->state);
+ return -ECONNABORTED;
+ }
+
+ serial = ntohl(sp->hdr.serial);
+
+ _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial);
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0)
+ return -EPROTO;
+ _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp));
+
+ conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+ rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
+ ntohl(tmp));
+ return -ECONNABORTED;
+
+ case RXRPC_PACKET_TYPE_CHALLENGE:
+ if (conn->security)
+ return conn->security->respond_to_challenge(
+ conn, skb, _abort_code);
+ return -EPROTO;
+
+ case RXRPC_PACKET_TYPE_RESPONSE:
+ if (!conn->security)
+ return -EPROTO;
+
+ ret = conn->security->verify_response(conn, skb, _abort_code);
+ if (ret < 0)
+ return ret;
+
+ ret = conn->security->init_connection_security(conn);
+ if (ret < 0)
+ return ret;
+
+ conn->security->prime_packet_security(conn);
+ read_lock_bh(&conn->lock);
+ spin_lock(&conn->state_lock);
+
+ if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) {
+ conn->state = RXRPC_CONN_SERVER;
+ for (loop = 0; loop < RXRPC_MAXCALLS; loop++)
+ rxrpc_call_is_secure(conn->channels[loop]);
+ }
+
+ spin_unlock(&conn->state_lock);
+ read_unlock_bh(&conn->lock);
+ return 0;
+
+ default:
+ _leave(" = -EPROTO [%u]", sp->hdr.type);
+ return -EPROTO;
+ }
+}
+
+/*
+ * set up security and issue a challenge
+ */
+static void rxrpc_secure_connection(struct rxrpc_connection *conn)
+{
+ u32 abort_code;
+ int ret;
+
+ _enter("{%d}", conn->debug_id);
+
+ ASSERT(conn->security_ix != 0);
+
+ if (!conn->key) {
+ _debug("set up security");
+ ret = rxrpc_init_server_conn_security(conn);
+ switch (ret) {
+ case 0:
+ break;
+ case -ENOENT:
+ abort_code = RX_CALL_DEAD;
+ goto abort;
+ default:
+ abort_code = RXKADNOAUTH;
+ goto abort;
+ }
+ }
+
+ ASSERT(conn->security != NULL);
+
+ if (conn->security->issue_challenge(conn) < 0) {
+ abort_code = RX_CALL_DEAD;
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ _leave("");
+ return;
+
+abort:
+ _debug("abort %d, %d", ret, abort_code);
+ rxrpc_abort_connection(conn, -ret, abort_code);
+ _leave(" [aborted]");
+}
+
+/*
+ * connection-level event processor
+ */
+void rxrpc_process_connection(struct work_struct *work)
+{
+ struct rxrpc_connection *conn =
+ container_of(work, struct rxrpc_connection, processor);
+ struct sk_buff *skb;
+ u32 abort_code = RX_PROTOCOL_ERROR;
+ int ret;
+
+ _enter("{%d}", conn->debug_id);
+
+ atomic_inc(&conn->usage);
+
+ if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) {
+ rxrpc_secure_connection(conn);
+ rxrpc_put_connection(conn);
+ }
+
+ /* go through the conn-level event packets, releasing the ref on this
+ * connection that each one has when we've finished with it */
+ while ((skb = skb_dequeue(&conn->rx_queue))) {
+ ret = rxrpc_process_event(conn, skb, &abort_code);
+ switch (ret) {
+ case -EPROTO:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ goto protocol_error;
+ case -EAGAIN:
+ goto requeue_and_leave;
+ case -ECONNABORTED:
+ default:
+ rxrpc_put_connection(conn);
+ rxrpc_free_skb(skb);
+ break;
+ }
+ }
+
+out:
+ rxrpc_put_connection(conn);
+ _leave("");
+ return;
+
+requeue_and_leave:
+ skb_queue_head(&conn->rx_queue, skb);
+ goto out;
+
+protocol_error:
+ if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
+ goto requeue_and_leave;
+ rxrpc_put_connection(conn);
+ rxrpc_free_skb(skb);
+ _leave(" [EPROTO]");
+ goto out;
+}
+
+/*
+ * put a packet up for transport-level abort
+ */
+void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
+{
+ CHECK_SLAB_OKAY(&local->usage);
+
+ if (!atomic_inc_not_zero(&local->usage)) {
+ printk("resurrected on reject\n");
+ BUG();
+ }
+
+ skb_queue_tail(&local->reject_queue, skb);
+ rxrpc_queue_work(&local->rejecter);
+}
+
+/*
+ * reject packets through the local endpoint
+ */
+void rxrpc_reject_packets(struct work_struct *work)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ } sa;
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_header hdr;
+ struct rxrpc_local *local;
+ struct sk_buff *skb;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t size;
+ __be32 code;
+
+ local = container_of(work, struct rxrpc_local, rejecter);
+ rxrpc_get_local(local);
+
+ _enter("%d", local->debug_id);
+
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+ iov[1].iov_base = &code;
+ iov[1].iov_len = sizeof(code);
+ size = sizeof(hdr) + sizeof(code);
+
+ msg.msg_name = &sa;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa.sa_family = local->srx.transport.family;
+ switch (sa.sa.sa_family) {
+ case AF_INET:
+ msg.msg_namelen = sizeof(sa.sin);
+ break;
+ default:
+ msg.msg_namelen = 0;
+ break;
+ }
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.type = RXRPC_PACKET_TYPE_ABORT;
+
+ while ((skb = skb_dequeue(&local->reject_queue))) {
+ sp = rxrpc_skb(skb);
+ switch (sa.sa.sa_family) {
+ case AF_INET:
+ sa.sin.sin_port = udp_hdr(skb)->source;
+ sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ code = htonl(skb->priority);
+
+ hdr.epoch = sp->hdr.epoch;
+ hdr.cid = sp->hdr.cid;
+ hdr.callNumber = sp->hdr.callNumber;
+ hdr.serviceId = sp->hdr.serviceId;
+ hdr.flags = sp->hdr.flags;
+ hdr.flags ^= RXRPC_CLIENT_INITIATED;
+ hdr.flags &= RXRPC_CLIENT_INITIATED;
+
+ kernel_sendmsg(local->socket, &msg, iov, 2, size);
+ break;
+
+ default:
+ break;
+ }
+
+ rxrpc_free_skb(skb);
+ rxrpc_put_local(local);
+ }
+
+ rxrpc_put_local(local);
+ _leave("");
+}
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
new file mode 100644
index 000000000..0610efa83
--- /dev/null
+++ b/net/rxrpc/ar-error.c
@@ -0,0 +1,241 @@
+/* Error message handling (ICMP)
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * handle an error received on the local endpoint
+ */
+void rxrpc_UDP_error_report(struct sock *sk)
+{
+ struct sock_exterr_skb *serr;
+ struct rxrpc_transport *trans;
+ struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_peer *peer;
+ struct sk_buff *skb;
+ __be32 addr;
+ __be16 port;
+
+ _enter("%p{%d}", sk, local->debug_id);
+
+ skb = sock_dequeue_err_skb(sk);
+ if (!skb) {
+ _leave("UDP socket errqueue empty");
+ return;
+ }
+ serr = SKB_EXT_ERR(skb);
+ if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
+ _leave("UDP empty message");
+ kfree_skb(skb);
+ return;
+ }
+
+ rxrpc_new_skb(skb);
+
+ addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset);
+ port = serr->port;
+
+ _net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port));
+ _debug("Msg l:%d d:%d", skb->len, skb->data_len);
+
+ peer = rxrpc_find_peer(local, addr, port);
+ if (IS_ERR(peer)) {
+ rxrpc_free_skb(skb);
+ _leave(" [no peer]");
+ return;
+ }
+
+ trans = rxrpc_find_transport(local, peer);
+ if (!trans) {
+ rxrpc_put_peer(peer);
+ rxrpc_free_skb(skb);
+ _leave(" [no trans]");
+ return;
+ }
+
+ if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
+ serr->ee.ee_type == ICMP_DEST_UNREACH &&
+ serr->ee.ee_code == ICMP_FRAG_NEEDED
+ ) {
+ u32 mtu = serr->ee.ee_info;
+
+ _net("Rx Received ICMP Fragmentation Needed (%d)", mtu);
+
+ /* wind down the local interface MTU */
+ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
+ peer->if_mtu = mtu;
+ _net("I/F MTU %u", mtu);
+ }
+
+ if (mtu == 0) {
+ /* they didn't give us a size, estimate one */
+ mtu = peer->if_mtu;
+ if (mtu > 1500) {
+ mtu >>= 1;
+ if (mtu < 1500)
+ mtu = 1500;
+ } else {
+ mtu -= 100;
+ if (mtu < peer->hdrsize)
+ mtu = peer->hdrsize + 4;
+ }
+ }
+
+ if (mtu < peer->mtu) {
+ spin_lock_bh(&peer->lock);
+ peer->mtu = mtu;
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ spin_unlock_bh(&peer->lock);
+ _net("Net MTU %u (maxdata %u)",
+ peer->mtu, peer->maxdata);
+ }
+ }
+
+ rxrpc_put_peer(peer);
+
+ /* pass the transport ref to error_handler to release */
+ skb_queue_tail(&trans->error_queue, skb);
+ rxrpc_queue_work(&trans->error_handler);
+
+ _leave("");
+}
+
+/*
+ * deal with UDP error messages
+ */
+void rxrpc_UDP_error_handler(struct work_struct *work)
+{
+ struct sock_extended_err *ee;
+ struct sock_exterr_skb *serr;
+ struct rxrpc_transport *trans =
+ container_of(work, struct rxrpc_transport, error_handler);
+ struct sk_buff *skb;
+ int err;
+
+ _enter("");
+
+ skb = skb_dequeue(&trans->error_queue);
+ if (!skb)
+ return;
+
+ serr = SKB_EXT_ERR(skb);
+ ee = &serr->ee;
+
+ _net("Rx Error o=%d t=%d c=%d e=%d",
+ ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno);
+
+ err = ee->ee_errno;
+
+ switch (ee->ee_origin) {
+ case SO_EE_ORIGIN_ICMP:
+ switch (ee->ee_type) {
+ case ICMP_DEST_UNREACH:
+ switch (ee->ee_code) {
+ case ICMP_NET_UNREACH:
+ _net("Rx Received ICMP Network Unreachable");
+ err = ENETUNREACH;
+ break;
+ case ICMP_HOST_UNREACH:
+ _net("Rx Received ICMP Host Unreachable");
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_PORT_UNREACH:
+ _net("Rx Received ICMP Port Unreachable");
+ err = ECONNREFUSED;
+ break;
+ case ICMP_FRAG_NEEDED:
+ _net("Rx Received ICMP Fragmentation Needed (%d)",
+ ee->ee_info);
+ err = 0; /* dealt with elsewhere */
+ break;
+ case ICMP_NET_UNKNOWN:
+ _net("Rx Received ICMP Unknown Network");
+ err = ENETUNREACH;
+ break;
+ case ICMP_HOST_UNKNOWN:
+ _net("Rx Received ICMP Unknown Host");
+ err = EHOSTUNREACH;
+ break;
+ default:
+ _net("Rx Received ICMP DestUnreach code=%u",
+ ee->ee_code);
+ break;
+ }
+ break;
+
+ case ICMP_TIME_EXCEEDED:
+ _net("Rx Received ICMP TTL Exceeded");
+ break;
+
+ default:
+ _proto("Rx Received ICMP error { type=%u code=%u }",
+ ee->ee_type, ee->ee_code);
+ break;
+ }
+ break;
+
+ case SO_EE_ORIGIN_LOCAL:
+ _proto("Rx Received local error { error=%d }",
+ ee->ee_errno);
+ break;
+
+ case SO_EE_ORIGIN_NONE:
+ case SO_EE_ORIGIN_ICMP6:
+ default:
+ _proto("Rx Received error report { orig=%u }",
+ ee->ee_origin);
+ break;
+ }
+
+ /* terminate all the affected calls if there's an unrecoverable
+ * error */
+ if (err) {
+ struct rxrpc_call *call, *_n;
+
+ _debug("ISSUE ERROR %d", err);
+
+ spin_lock_bh(&trans->peer->lock);
+ trans->peer->net_error = err;
+
+ list_for_each_entry_safe(call, _n, &trans->peer->error_targets,
+ error_link) {
+ write_lock(&call->state_lock);
+ if (call->state != RXRPC_CALL_COMPLETE &&
+ call->state < RXRPC_CALL_NETWORK_ERROR) {
+ call->state = RXRPC_CALL_NETWORK_ERROR;
+ set_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock(&call->state_lock);
+ list_del_init(&call->error_link);
+ }
+
+ spin_unlock_bh(&trans->peer->lock);
+ }
+
+ if (!skb_queue_empty(&trans->error_queue))
+ rxrpc_queue_work(&trans->error_handler);
+
+ rxrpc_free_skb(skb);
+ rxrpc_put_transport(trans);
+ _leave("");
+}
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
new file mode 100644
index 000000000..4505a691d
--- /dev/null
+++ b/net/rxrpc/ar-input.c
@@ -0,0 +1,788 @@
+/* RxRPC packet reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/net_namespace.h>
+#include "ar-internal.h"
+
+const char *rxrpc_pkts[] = {
+ "?00",
+ "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
+ "?09", "?10", "?11", "?12", "VERSION", "?14", "?15"
+};
+
+/*
+ * queue a packet for recvmsg to pass to userspace
+ * - the caller must hold a lock on call->lock
+ * - must not be called with interrupts disabled (sk_filter() disables BH's)
+ * - eats the packet whether successful or not
+ * - there must be just one reference to the packet, which the caller passes to
+ * this function
+ */
+int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
+ bool force, bool terminal)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_sock *rx = call->socket;
+ struct sock *sk;
+ int ret;
+
+ _enter(",,%d,%d", force, terminal);
+
+ ASSERT(!irqs_disabled());
+
+ sp = rxrpc_skb(skb);
+ ASSERTCMP(sp->call, ==, call);
+
+ /* if we've already posted the terminal message for a call, then we
+ * don't post any more */
+ if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
+ _debug("already terminated");
+ ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
+ skb->destructor = NULL;
+ sp->call = NULL;
+ rxrpc_put_call(call);
+ rxrpc_free_skb(skb);
+ return 0;
+ }
+
+ sk = &rx->sk;
+
+ if (!force) {
+ /* cast skb->rcvbuf to unsigned... It's pointless, but
+ * reduces number of warnings when compiling with -W
+ * --ANK */
+// ret = -ENOBUFS;
+// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+// (unsigned int) sk->sk_rcvbuf)
+// goto out;
+
+ ret = sk_filter(sk, skb);
+ if (ret < 0)
+ goto out;
+ }
+
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) &&
+ !test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ call->socket->sk.sk_state != RXRPC_CLOSE) {
+ skb->destructor = rxrpc_packet_destructor;
+ skb->dev = NULL;
+ skb->sk = sk;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+ if (terminal) {
+ _debug("<<<< TERMINAL MESSAGE >>>>");
+ set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags);
+ }
+
+ /* allow interception by a kernel service */
+ if (rx->interceptor) {
+ rx->interceptor(sk, call->user_call_ID, skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ } else {
+ _net("post skb %p", skb);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ }
+ skb = NULL;
+ } else {
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+ ret = 0;
+
+out:
+ /* release the socket buffer */
+ if (skb) {
+ skb->destructor = NULL;
+ sp->call = NULL;
+ rxrpc_put_call(call);
+ rxrpc_free_skb(skb);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * process a DATA packet, posting the packet to the appropriate queue
+ * - eats the packet if successful
+ */
+static int rxrpc_fast_process_data(struct rxrpc_call *call,
+ struct sk_buff *skb, u32 seq)
+{
+ struct rxrpc_skb_priv *sp;
+ bool terminal;
+ int ret, ackbit, ack;
+
+ _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
+
+ sp = rxrpc_skb(skb);
+ ASSERTCMP(sp->call, ==, NULL);
+
+ spin_lock(&call->lock);
+
+ if (call->state > RXRPC_CALL_COMPLETE)
+ goto discard;
+
+ ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post);
+ ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
+ ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
+
+ if (seq < call->rx_data_post) {
+ _debug("dup #%u [-%u]", seq, call->rx_data_post);
+ ack = RXRPC_ACK_DUPLICATE;
+ ret = -ENOBUFS;
+ goto discard_and_ack;
+ }
+
+ /* we may already have the packet in the out of sequence queue */
+ ackbit = seq - (call->rx_data_eaten + 1);
+ ASSERTCMP(ackbit, >=, 0);
+ if (__test_and_set_bit(ackbit, call->ackr_window)) {
+ _debug("dup oos #%u [%u,%u]",
+ seq, call->rx_data_eaten, call->rx_data_post);
+ ack = RXRPC_ACK_DUPLICATE;
+ goto discard_and_ack;
+ }
+
+ if (seq >= call->ackr_win_top) {
+ _debug("exceed #%u [%u]", seq, call->ackr_win_top);
+ __clear_bit(ackbit, call->ackr_window);
+ ack = RXRPC_ACK_EXCEEDS_WINDOW;
+ goto discard_and_ack;
+ }
+
+ if (seq == call->rx_data_expect) {
+ clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
+ call->rx_data_expect++;
+ } else if (seq > call->rx_data_expect) {
+ _debug("oos #%u [%u]", seq, call->rx_data_expect);
+ call->rx_data_expect = seq + 1;
+ if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
+ ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+ goto enqueue_and_ack;
+ }
+ goto enqueue_packet;
+ }
+
+ if (seq != call->rx_data_post) {
+ _debug("ahead #%u [%u]", seq, call->rx_data_post);
+ goto enqueue_packet;
+ }
+
+ if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags))
+ goto protocol_error;
+
+ /* if the packet need security things doing to it, then it goes down
+ * the slow path */
+ if (call->conn->security)
+ goto enqueue_packet;
+
+ sp->call = call;
+ rxrpc_get_call(call);
+ terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
+ !(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
+ ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
+ if (ret < 0) {
+ if (ret == -ENOMEM || ret == -ENOBUFS) {
+ __clear_bit(ackbit, call->ackr_window);
+ ack = RXRPC_ACK_NOSPACE;
+ goto discard_and_ack;
+ }
+ goto out;
+ }
+
+ skb = NULL;
+
+ _debug("post #%u", seq);
+ ASSERTCMP(call->rx_data_post, ==, seq);
+ call->rx_data_post++;
+
+ if (sp->hdr.flags & RXRPC_LAST_PACKET)
+ set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
+
+ /* if we've reached an out of sequence packet then we need to drain
+ * that queue into the socket Rx queue now */
+ if (call->rx_data_post == call->rx_first_oos) {
+ _debug("drain rx oos now");
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ }
+
+ spin_unlock(&call->lock);
+ atomic_inc(&call->ackr_not_idle);
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, sp->hdr.serial, false);
+ _leave(" = 0 [posted]");
+ return 0;
+
+protocol_error:
+ ret = -EBADMSG;
+out:
+ spin_unlock(&call->lock);
+ _leave(" = %d", ret);
+ return ret;
+
+discard_and_ack:
+ _debug("discard and ACK packet %p", skb);
+ __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true);
+discard:
+ spin_unlock(&call->lock);
+ rxrpc_free_skb(skb);
+ _leave(" = 0 [discarded]");
+ return 0;
+
+enqueue_and_ack:
+ __rxrpc_propose_ACK(call, ack, sp->hdr.serial, true);
+enqueue_packet:
+ _net("defer skb %p", skb);
+ spin_unlock(&call->lock);
+ skb_queue_tail(&call->rx_queue, skb);
+ atomic_inc(&call->ackr_not_idle);
+ read_lock(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD)
+ rxrpc_queue_call(call);
+ read_unlock(&call->state_lock);
+ _leave(" = 0 [queued]");
+ return 0;
+}
+
+/*
+ * assume an implicit ACKALL of the transmission phase of a client socket upon
+ * reception of the first reply packet
+ */
+static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
+{
+ write_lock_bh(&call->state_lock);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+ call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ call->acks_latest = serial;
+
+ _debug("implicit ACKALL %%%u", call->acks_latest);
+ set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events);
+ write_unlock_bh(&call->state_lock);
+
+ if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
+ clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_RESEND, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ }
+ break;
+
+ default:
+ write_unlock_bh(&call->state_lock);
+ break;
+ }
+}
+
+/*
+ * post an incoming packet to the nominated call to deal with
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ __be32 _abort_code;
+ u32 serial, hi_serial, seq, abort_code;
+
+ _enter("%p,%p", call, skb);
+
+ ASSERT(!irqs_disabled());
+
+#if 0 // INJECT RX ERROR
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+ static int skip = 0;
+ if (++skip == 3) {
+ printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n");
+ skip = 0;
+ goto free_packet;
+ }
+ }
+#endif
+
+ /* track the latest serial number on this connection for ACK packet
+ * information */
+ serial = ntohl(sp->hdr.serial);
+ hi_serial = atomic_read(&call->conn->hi_serial);
+ while (serial > hi_serial)
+ hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
+ serial);
+
+ /* request ACK generation for any ACK or DATA packet that requests
+ * it */
+ if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
+ _proto("ACK Requested on %%%u", serial);
+ rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
+ }
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_ABORT:
+ _debug("abort");
+
+ if (skb_copy_bits(skb, 0, &_abort_code,
+ sizeof(_abort_code)) < 0)
+ goto protocol_error;
+
+ abort_code = ntohl(_abort_code);
+ _proto("Rx ABORT %%%u { %x }", serial, abort_code);
+
+ write_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_REMOTELY_ABORTED;
+ call->abort_code = abort_code;
+ set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ goto free_packet_unlock;
+
+ case RXRPC_PACKET_TYPE_BUSY:
+ _proto("Rx BUSY %%%u", serial);
+
+ if (call->conn->out_clientflag)
+ goto protocol_error;
+
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ call->state = RXRPC_CALL_SERVER_BUSY;
+ set_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+ rxrpc_queue_call(call);
+ case RXRPC_CALL_SERVER_BUSY:
+ goto free_packet_unlock;
+ default:
+ goto protocol_error_locked;
+ }
+
+ default:
+ _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial);
+ goto protocol_error;
+
+ case RXRPC_PACKET_TYPE_DATA:
+ seq = ntohl(sp->hdr.seq);
+
+ _proto("Rx DATA %%%u { #%u }", serial, seq);
+
+ if (seq == 0)
+ goto protocol_error;
+
+ call->ackr_prev_seq = sp->hdr.seq;
+
+ /* received data implicitly ACKs all of the request packets we
+ * sent when we're acting as a client */
+ if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
+ rxrpc_assume_implicit_ackall(call, serial);
+
+ switch (rxrpc_fast_process_data(call, skb, seq)) {
+ case 0:
+ skb = NULL;
+ goto done;
+
+ default:
+ BUG();
+
+ /* data packet received beyond the last packet */
+ case -EBADMSG:
+ goto protocol_error;
+ }
+
+ case RXRPC_PACKET_TYPE_ACKALL:
+ case RXRPC_PACKET_TYPE_ACK:
+ /* ACK processing is done in process context */
+ read_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_DEAD) {
+ skb_queue_tail(&call->rx_queue, skb);
+ rxrpc_queue_call(call);
+ skb = NULL;
+ }
+ read_unlock_bh(&call->state_lock);
+ goto free_packet;
+ }
+
+protocol_error:
+ _debug("protocol error");
+ write_lock_bh(&call->state_lock);
+protocol_error_locked:
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = RX_PROTOCOL_ERROR;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+free_packet_unlock:
+ write_unlock_bh(&call->state_lock);
+free_packet:
+ rxrpc_free_skb(skb);
+done:
+ _leave("");
+}
+
+/*
+ * split up a jumbo data packet
+ */
+static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
+ struct sk_buff *jumbo)
+{
+ struct rxrpc_jumbo_header jhdr;
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *part;
+
+ _enter(",{%u,%u}", jumbo->data_len, jumbo->len);
+
+ sp = rxrpc_skb(jumbo);
+
+ do {
+ sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
+
+ /* make a clone to represent the first subpacket in what's left
+ * of the jumbo packet */
+ part = skb_clone(jumbo, GFP_ATOMIC);
+ if (!part) {
+ /* simply ditch the tail in the event of ENOMEM */
+ pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
+ break;
+ }
+ rxrpc_new_skb(part);
+
+ pskb_trim(part, RXRPC_JUMBO_DATALEN);
+
+ if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
+ goto protocol_error;
+
+ if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
+ goto protocol_error;
+ if (!pskb_pull(jumbo, sizeof(jhdr)))
+ BUG();
+
+ sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1);
+ sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1);
+ sp->hdr.flags = jhdr.flags;
+ sp->hdr._rsvd = jhdr._rsvd;
+
+ _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1);
+
+ rxrpc_fast_process_packet(call, part);
+ part = NULL;
+
+ } while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
+
+ rxrpc_fast_process_packet(call, jumbo);
+ _leave("");
+ return;
+
+protocol_error:
+ _debug("protocol error");
+ rxrpc_free_skb(part);
+ rxrpc_free_skb(jumbo);
+ write_lock_bh(&call->state_lock);
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = RX_PROTOCOL_ERROR;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ rxrpc_queue_call(call);
+ }
+ write_unlock_bh(&call->state_lock);
+ _leave("");
+}
+
+/*
+ * post an incoming packet to the appropriate call/socket to deal with
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
+ struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp;
+
+ _enter("%p,%p", call, skb);
+
+ sp = rxrpc_skb(skb);
+
+ _debug("extant call [%d]", call->state);
+
+ read_lock(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_LOCALLY_ABORTED:
+ if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) {
+ rxrpc_queue_call(call);
+ goto free_unlock;
+ }
+ case RXRPC_CALL_REMOTELY_ABORTED:
+ case RXRPC_CALL_NETWORK_ERROR:
+ case RXRPC_CALL_DEAD:
+ goto dead_call;
+ case RXRPC_CALL_COMPLETE:
+ case RXRPC_CALL_CLIENT_FINAL_ACK:
+ /* complete server call */
+ if (call->conn->in_clientflag)
+ goto dead_call;
+ /* resend last packet of a completed call */
+ _debug("final ack again");
+ rxrpc_get_call(call);
+ set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+ rxrpc_queue_call(call);
+ goto free_unlock;
+ default:
+ break;
+ }
+
+ read_unlock(&call->state_lock);
+ rxrpc_get_call(call);
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ sp->hdr.flags & RXRPC_JUMBO_PACKET)
+ rxrpc_process_jumbo_packet(call, skb);
+ else
+ rxrpc_fast_process_packet(call, skb);
+
+ rxrpc_put_call(call);
+ goto done;
+
+dead_call:
+ if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
+ skb->priority = RX_CALL_DEAD;
+ rxrpc_reject_packet(call->conn->trans->local, skb);
+ goto unlock;
+ }
+free_unlock:
+ rxrpc_free_skb(skb);
+unlock:
+ read_unlock(&call->state_lock);
+done:
+ _leave("");
+}
+
+/*
+ * post connection-level events to the connection
+ * - this includes challenges, responses and some aborts
+ */
+static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
+ struct sk_buff *skb)
+{
+ _enter("%p,%p", conn, skb);
+
+ atomic_inc(&conn->usage);
+ skb_queue_tail(&conn->rx_queue, skb);
+ rxrpc_queue_conn(conn);
+}
+
+/*
+ * post endpoint-level events to the local endpoint
+ * - this includes debug and version messages
+ */
+static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
+ struct sk_buff *skb)
+{
+ _enter("%p,%p", local, skb);
+
+ atomic_inc(&local->usage);
+ skb_queue_tail(&local->event_queue, skb);
+ rxrpc_queue_work(&local->event_processor);
+}
+
+static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local,
+ struct sk_buff *skb,
+ struct rxrpc_skb_priv *sp)
+{
+ struct rxrpc_peer *peer;
+ struct rxrpc_transport *trans;
+ struct rxrpc_connection *conn;
+
+ peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr,
+ udp_hdr(skb)->source);
+ if (IS_ERR(peer))
+ goto cant_find_conn;
+
+ trans = rxrpc_find_transport(local, peer);
+ rxrpc_put_peer(peer);
+ if (!trans)
+ goto cant_find_conn;
+
+ conn = rxrpc_find_connection(trans, &sp->hdr);
+ rxrpc_put_transport(trans);
+ if (!conn)
+ goto cant_find_conn;
+
+ return conn;
+cant_find_conn:
+ return NULL;
+}
+
+/*
+ * handle data received on the local endpoint
+ * - may be called in interrupt context
+ */
+void rxrpc_data_ready(struct sock *sk)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_local *local;
+ struct sk_buff *skb;
+ int ret;
+
+ _enter("%p", sk);
+
+ ASSERT(!irqs_disabled());
+
+ read_lock_bh(&rxrpc_local_lock);
+ local = sk->sk_user_data;
+ if (local && atomic_read(&local->usage) > 0)
+ rxrpc_get_local(local);
+ else
+ local = NULL;
+ read_unlock_bh(&rxrpc_local_lock);
+ if (!local) {
+ _leave(" [local dead]");
+ return;
+ }
+
+ skb = skb_recv_datagram(sk, 0, 1, &ret);
+ if (!skb) {
+ rxrpc_put_local(local);
+ if (ret == -EAGAIN)
+ return;
+ _debug("UDP socket error %d", ret);
+ return;
+ }
+
+ rxrpc_new_skb(skb);
+
+ _net("recv skb %p", skb);
+
+ /* we'll probably need to checksum it (didn't call sock_recvmsg) */
+ if (skb_checksum_complete(skb)) {
+ rxrpc_free_skb(skb);
+ rxrpc_put_local(local);
+ UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0);
+ _leave(" [CSUM failed]");
+ return;
+ }
+
+ UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
+
+ /* the socket buffer we have is owned by UDP, with UDP's data all over
+ * it, but we really want our own */
+ skb_orphan(skb);
+ sp = rxrpc_skb(skb);
+ memset(sp, 0, sizeof(*sp));
+
+ _net("Rx UDP packet from %08x:%04hu",
+ ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
+
+ /* dig out the RxRPC connection details */
+ if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr,
+ sizeof(sp->hdr)) < 0)
+ goto bad_message;
+ if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr)))
+ BUG();
+
+ _net("Rx RxRPC %s ep=%x call=%x:%x",
+ sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
+ ntohl(sp->hdr.epoch),
+ ntohl(sp->hdr.cid),
+ ntohl(sp->hdr.callNumber));
+
+ if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
+ _proto("Rx Bad Packet Type %u", sp->hdr.type);
+ goto bad_message;
+ }
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) {
+ rxrpc_post_packet_to_local(local, skb);
+ goto out;
+ }
+
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
+ goto bad_message;
+
+ if (sp->hdr.callNumber == 0) {
+ /* This is a connection-level packet. These should be
+ * fairly rare, so the extra overhead of looking them up the
+ * old-fashioned way doesn't really hurt */
+ struct rxrpc_connection *conn;
+
+ conn = rxrpc_conn_from_local(local, skb, sp);
+ if (!conn)
+ goto cant_route_call;
+
+ _debug("CONN %p {%d}", conn, conn->debug_id);
+ rxrpc_post_packet_to_conn(conn, skb);
+ rxrpc_put_connection(conn);
+ } else {
+ struct rxrpc_call *call;
+ u8 in_clientflag = 0;
+
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
+ in_clientflag = RXRPC_CLIENT_INITIATED;
+ call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid,
+ sp->hdr.callNumber, sp->hdr.epoch,
+ sp->hdr.serviceId, local, AF_INET,
+ (u8 *)&ip_hdr(skb)->saddr);
+ if (call)
+ rxrpc_post_packet_to_call(call, skb);
+ else
+ goto cant_route_call;
+ }
+
+out:
+ rxrpc_put_local(local);
+ return;
+
+cant_route_call:
+ _debug("can't route call");
+ if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
+ sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+ if (sp->hdr.seq == cpu_to_be32(1)) {
+ _debug("first packet");
+ skb_queue_tail(&local->accept_queue, skb);
+ rxrpc_queue_work(&local->acceptor);
+ rxrpc_put_local(local);
+ _leave(" [incoming]");
+ return;
+ }
+ skb->priority = RX_INVALID_OPERATION;
+ } else {
+ skb->priority = RX_CALL_DEAD;
+ }
+
+ if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) {
+ _debug("reject type %d",sp->hdr.type);
+ rxrpc_reject_packet(local, skb);
+ }
+ rxrpc_put_local(local);
+ _leave(" [no call]");
+ return;
+
+bad_message:
+ skb->priority = RX_PROTOCOL_ERROR;
+ rxrpc_reject_packet(local, skb);
+ rxrpc_put_local(local);
+ _leave(" [badmsg]");
+}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
new file mode 100644
index 000000000..aef1bd294
--- /dev/null
+++ b/net/rxrpc/ar-internal.h
@@ -0,0 +1,812 @@
+/* AF_RXRPC internal definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <rxrpc/packet.h>
+
+#if 0
+#define CHECK_SLAB_OKAY(X) \
+ BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \
+ (POISON_FREE << 8 | POISON_FREE))
+#else
+#define CHECK_SLAB_OKAY(X) do {} while(0)
+#endif
+
+#define FCRYPT_BSIZE 8
+struct rxrpc_crypt {
+ union {
+ u8 x[FCRYPT_BSIZE];
+ __be32 n[2];
+ };
+} __attribute__((aligned(8)));
+
+#define rxrpc_queue_work(WS) queue_work(rxrpc_workqueue, (WS))
+#define rxrpc_queue_delayed_work(WS,D) \
+ queue_delayed_work(rxrpc_workqueue, (WS), (D))
+
+#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor)
+#define rxrpc_queue_conn(CONN) rxrpc_queue_work(&(CONN)->processor)
+
+/*
+ * sk_state for RxRPC sockets
+ */
+enum {
+ RXRPC_UNCONNECTED = 0,
+ RXRPC_CLIENT_BOUND, /* client local address bound */
+ RXRPC_CLIENT_CONNECTED, /* client is connected */
+ RXRPC_SERVER_BOUND, /* server local address bound */
+ RXRPC_SERVER_LISTENING, /* server listening for connections */
+ RXRPC_CLOSE, /* socket is being closed */
+};
+
+/*
+ * RxRPC socket definition
+ */
+struct rxrpc_sock {
+ /* WARNING: sk has to be the first member */
+ struct sock sk;
+ rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */
+ struct rxrpc_local *local; /* local endpoint */
+ struct rxrpc_transport *trans; /* transport handler */
+ struct rxrpc_conn_bundle *bundle; /* virtual connection bundle */
+ struct rxrpc_connection *conn; /* exclusive virtual connection */
+ struct list_head listen_link; /* link in the local endpoint's listen list */
+ struct list_head secureq; /* calls awaiting connection security clearance */
+ struct list_head acceptq; /* calls awaiting acceptance */
+ struct key *key; /* security for this socket */
+ struct key *securities; /* list of server security descriptors */
+ struct rb_root calls; /* outstanding calls on this socket */
+ unsigned long flags;
+#define RXRPC_SOCK_EXCLUSIVE_CONN 1 /* exclusive connection for a client socket */
+ rwlock_t call_lock; /* lock for calls */
+ u32 min_sec_level; /* minimum security level */
+#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
+ struct sockaddr_rxrpc srx; /* local address */
+ sa_family_t proto; /* protocol created with */
+ __be16 service_id; /* service ID of local/remote service */
+};
+
+#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
+
+/*
+ * RxRPC socket buffer private variables
+ * - max 48 bytes (struct sk_buff::cb)
+ */
+struct rxrpc_skb_priv {
+ struct rxrpc_call *call; /* call with which associated */
+ unsigned long resend_at; /* time in jiffies at which to resend */
+ union {
+ unsigned int offset; /* offset into buffer of next read */
+ int remain; /* amount of space remaining for next write */
+ u32 error; /* network error code */
+ bool need_resend; /* T if needs resending */
+ };
+
+ struct rxrpc_header hdr; /* RxRPC packet header from this packet */
+};
+
+#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
+
+enum rxrpc_command {
+ RXRPC_CMD_SEND_DATA, /* send data message */
+ RXRPC_CMD_SEND_ABORT, /* request abort generation */
+ RXRPC_CMD_ACCEPT, /* [server] accept incoming call */
+ RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */
+};
+
+/*
+ * RxRPC security module interface
+ */
+struct rxrpc_security {
+ struct module *owner; /* providing module */
+ struct list_head link; /* link in master list */
+ const char *name; /* name of this service */
+ u8 security_index; /* security type provided */
+
+ /* initialise a connection's security */
+ int (*init_connection_security)(struct rxrpc_connection *);
+
+ /* prime a connection's packet security */
+ void (*prime_packet_security)(struct rxrpc_connection *);
+
+ /* impose security on a packet */
+ int (*secure_packet)(const struct rxrpc_call *,
+ struct sk_buff *,
+ size_t,
+ void *);
+
+ /* verify the security on a received packet */
+ int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *,
+ u32 *);
+
+ /* issue a challenge */
+ int (*issue_challenge)(struct rxrpc_connection *);
+
+ /* respond to a challenge */
+ int (*respond_to_challenge)(struct rxrpc_connection *,
+ struct sk_buff *,
+ u32 *);
+
+ /* verify a response */
+ int (*verify_response)(struct rxrpc_connection *,
+ struct sk_buff *,
+ u32 *);
+
+ /* clear connection security */
+ void (*clear)(struct rxrpc_connection *);
+};
+
+/*
+ * RxRPC local transport endpoint definition
+ * - matched by local port, address and protocol type
+ */
+struct rxrpc_local {
+ struct socket *socket; /* my UDP socket */
+ struct work_struct destroyer; /* endpoint destroyer */
+ struct work_struct acceptor; /* incoming call processor */
+ struct work_struct rejecter; /* packet reject writer */
+ struct work_struct event_processor; /* endpoint event processor */
+ struct list_head services; /* services listening on this endpoint */
+ struct list_head link; /* link in endpoint list */
+ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */
+ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */
+ struct sk_buff_head reject_queue; /* packets awaiting rejection */
+ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */
+ spinlock_t lock; /* access lock */
+ rwlock_t services_lock; /* lock for services list */
+ atomic_t usage;
+ int debug_id; /* debug ID for printks */
+ volatile char error_rcvd; /* T if received ICMP error outstanding */
+ struct sockaddr_rxrpc srx; /* local address */
+};
+
+/*
+ * RxRPC remote transport endpoint definition
+ * - matched by remote port, address and protocol type
+ * - holds the connection ID counter for connections between the two endpoints
+ */
+struct rxrpc_peer {
+ struct work_struct destroyer; /* peer destroyer */
+ struct list_head link; /* link in master peer list */
+ struct list_head error_targets; /* targets for net error distribution */
+ spinlock_t lock; /* access lock */
+ atomic_t usage;
+ unsigned int if_mtu; /* interface MTU for this peer */
+ unsigned int mtu; /* network MTU for this peer */
+ unsigned int maxdata; /* data size (MTU - hdrsize) */
+ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
+ int debug_id; /* debug ID for printks */
+ int net_error; /* network error distributed */
+ struct sockaddr_rxrpc srx; /* remote address */
+
+ /* calculated RTT cache */
+#define RXRPC_RTT_CACHE_SIZE 32
+ suseconds_t rtt; /* current RTT estimate (in uS) */
+ unsigned int rtt_point; /* next entry at which to insert */
+ unsigned int rtt_usage; /* amount of cache actually used */
+ suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */
+};
+
+/*
+ * RxRPC point-to-point transport / connection manager definition
+ * - handles a bundle of connections between two endpoints
+ * - matched by { local, peer }
+ */
+struct rxrpc_transport {
+ struct rxrpc_local *local; /* local transport endpoint */
+ struct rxrpc_peer *peer; /* remote transport endpoint */
+ struct work_struct error_handler; /* network error distributor */
+ struct rb_root bundles; /* client connection bundles on this transport */
+ struct rb_root client_conns; /* client connections on this transport */
+ struct rb_root server_conns; /* server connections on this transport */
+ struct list_head link; /* link in master session list */
+ struct sk_buff_head error_queue; /* error packets awaiting processing */
+ time_t put_time; /* time at which to reap */
+ spinlock_t client_lock; /* client connection allocation lock */
+ rwlock_t conn_lock; /* lock for active/dead connections */
+ atomic_t usage;
+ int debug_id; /* debug ID for printks */
+ unsigned int conn_idcounter; /* connection ID counter (client) */
+};
+
+/*
+ * RxRPC client connection bundle
+ * - matched by { transport, service_id, key }
+ */
+struct rxrpc_conn_bundle {
+ struct rb_node node; /* node in transport's lookup tree */
+ struct list_head unused_conns; /* unused connections in this bundle */
+ struct list_head avail_conns; /* available connections in this bundle */
+ struct list_head busy_conns; /* busy connections in this bundle */
+ struct key *key; /* security for this bundle */
+ wait_queue_head_t chanwait; /* wait for channel to become available */
+ atomic_t usage;
+ int debug_id; /* debug ID for printks */
+ unsigned short num_conns; /* number of connections in this bundle */
+ __be16 service_id; /* service ID */
+ u8 security_ix; /* security type */
+};
+
+/*
+ * RxRPC connection definition
+ * - matched by { transport, service_id, conn_id, direction, key }
+ * - each connection can only handle four simultaneous calls
+ */
+struct rxrpc_connection {
+ struct rxrpc_transport *trans; /* transport session */
+ struct rxrpc_conn_bundle *bundle; /* connection bundle (client) */
+ struct work_struct processor; /* connection event processor */
+ struct rb_node node; /* node in transport's lookup tree */
+ struct list_head link; /* link in master connection list */
+ struct list_head bundle_link; /* link in bundle */
+ struct rb_root calls; /* calls on this connection */
+ struct sk_buff_head rx_queue; /* received conn-level packets */
+ struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */
+ struct rxrpc_security *security; /* applied security module */
+ struct key *key; /* security for this connection (client) */
+ struct key *server_key; /* security for this service */
+ struct crypto_blkcipher *cipher; /* encryption handle */
+ struct rxrpc_crypt csum_iv; /* packet checksum base */
+ unsigned long events;
+#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */
+ time_t put_time; /* time at which to reap */
+ rwlock_t lock; /* access lock */
+ spinlock_t state_lock; /* state-change lock */
+ atomic_t usage;
+ u32 real_conn_id; /* connection ID (host-endian) */
+ enum { /* current state of connection */
+ RXRPC_CONN_UNUSED, /* - connection not yet attempted */
+ RXRPC_CONN_CLIENT, /* - client connection */
+ RXRPC_CONN_SERVER_UNSECURED, /* - server unsecured connection */
+ RXRPC_CONN_SERVER_CHALLENGING, /* - server challenging for security */
+ RXRPC_CONN_SERVER, /* - server secured connection */
+ RXRPC_CONN_REMOTELY_ABORTED, /* - conn aborted by peer */
+ RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */
+ RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */
+ } state;
+ int error; /* error code for local abort */
+ int debug_id; /* debug ID for printks */
+ unsigned int call_counter; /* call ID counter */
+ atomic_t serial; /* packet serial number counter */
+ atomic_t hi_serial; /* highest serial number received */
+ u8 avail_calls; /* number of calls available */
+ u8 size_align; /* data size alignment (for security) */
+ u8 header_size; /* rxrpc + security header size */
+ u8 security_size; /* security header size */
+ u32 security_level; /* security level negotiated */
+ u32 security_nonce; /* response re-use preventer */
+
+ /* the following are all in net order */
+ __be32 epoch; /* epoch of this connection */
+ __be32 cid; /* connection ID */
+ __be16 service_id; /* service ID */
+ u8 security_ix; /* security type */
+ u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */
+ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
+};
+
+/*
+ * RxRPC call definition
+ * - matched by { connection, call_id }
+ */
+struct rxrpc_call {
+ struct rxrpc_connection *conn; /* connection carrying call */
+ struct rxrpc_sock *socket; /* socket responsible */
+ struct timer_list lifetimer; /* lifetime remaining on call */
+ struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */
+ struct timer_list ack_timer; /* ACK generation timer */
+ struct timer_list resend_timer; /* Tx resend timer */
+ struct work_struct destroyer; /* call destroyer */
+ struct work_struct processor; /* packet processor and ACK generator */
+ struct list_head link; /* link in master call list */
+ struct list_head error_link; /* link in error distribution list */
+ struct list_head accept_link; /* calls awaiting acceptance */
+ struct rb_node sock_node; /* node in socket call tree */
+ struct rb_node conn_node; /* node in connection call tree */
+ struct sk_buff_head rx_queue; /* received packets */
+ struct sk_buff_head rx_oos_queue; /* packets received out of sequence */
+ struct sk_buff *tx_pending; /* Tx socket buffer being filled */
+ wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */
+ unsigned long user_call_ID; /* user-defined call ID */
+ unsigned long creation_jif; /* time of call creation */
+ unsigned long flags;
+#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */
+#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */
+#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */
+#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */
+#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */
+#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */
+#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */
+#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */
+#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */
+ unsigned long events;
+#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */
+#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */
+#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */
+#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */
+#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */
+#define RXRPC_CALL_ACK 5 /* need to generate ACK */
+#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */
+#define RXRPC_CALL_ABORT 7 /* need to generate abort */
+#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */
+#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */
+#define RXRPC_CALL_RESEND 10 /* Tx resend required */
+#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */
+#define RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */
+#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */
+#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */
+#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" message to the app */
+#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */
+
+ spinlock_t lock;
+ rwlock_t state_lock; /* lock for state transition */
+ atomic_t usage;
+ atomic_t sequence; /* Tx data packet sequence counter */
+ u32 abort_code; /* local/remote abort code */
+ enum { /* current state of call */
+ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
+ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
+ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
+ RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */
+ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
+ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
+ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
+ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
+ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
+ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
+ RXRPC_CALL_COMPLETE, /* - call completed */
+ RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */
+ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
+ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
+ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */
+ RXRPC_CALL_DEAD, /* - call is dead */
+ } state;
+ int debug_id; /* debug ID for printks */
+ u8 channel; /* connection channel occupied by this call */
+
+ /* transmission-phase ACK management */
+ u8 acks_head; /* offset into window of first entry */
+ u8 acks_tail; /* offset into window of last entry */
+ u8 acks_winsz; /* size of un-ACK'd window */
+ u8 acks_unacked; /* lowest unacked packet in last ACK received */
+ int acks_latest; /* serial number of latest ACK received */
+ rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */
+ unsigned long *acks_window; /* sent packet window
+ * - elements are pointers with LSB set if ACK'd
+ */
+
+ /* receive-phase ACK management */
+ rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */
+ rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */
+ rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */
+ rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */
+ rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */
+ rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */
+ rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */
+ u8 ackr_reason; /* reason to ACK */
+ __be32 ackr_serial; /* serial of packet being ACK'd */
+ atomic_t ackr_not_idle; /* number of packets in Rx queue */
+
+ /* received packet records, 1 bit per record */
+#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG)
+ unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1];
+
+ struct hlist_node hash_node;
+ unsigned long hash_key; /* Full hash key */
+ u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */
+ struct rxrpc_local *local; /* Local endpoint. Used for hashing. */
+ sa_family_t proto; /* Frame protocol */
+ /* the following should all be in net order */
+ __be32 cid; /* connection ID + channel index */
+ __be32 call_id; /* call ID on connection */
+ __be32 epoch; /* epoch of this connection */
+ __be16 service_id; /* service ID */
+ union { /* Peer IP address for hashing */
+ __be32 ipv4_addr;
+ __u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */
+ } peer_ip;
+};
+
+/*
+ * locally abort an RxRPC call
+ */
+static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
+{
+ write_lock_bh(&call->state_lock);
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ call->abort_code = abort_code;
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ }
+ write_unlock_bh(&call->state_lock);
+}
+
+/*
+ * af_rxrpc.c
+ */
+extern atomic_t rxrpc_n_skbs;
+extern __be32 rxrpc_epoch;
+extern atomic_t rxrpc_debug_id;
+extern struct workqueue_struct *rxrpc_workqueue;
+
+/*
+ * ar-accept.c
+ */
+void rxrpc_accept_incoming_calls(struct work_struct *);
+struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long);
+int rxrpc_reject_call(struct rxrpc_sock *);
+
+/*
+ * ar-ack.c
+ */
+extern unsigned rxrpc_requested_ack_delay;
+extern unsigned rxrpc_soft_ack_delay;
+extern unsigned rxrpc_idle_ack_delay;
+extern unsigned rxrpc_rx_window_size;
+extern unsigned rxrpc_rx_mtu;
+extern unsigned rxrpc_rx_jumbo_max;
+
+void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+void rxrpc_process_call(struct work_struct *);
+
+/*
+ * ar-call.c
+ */
+extern unsigned rxrpc_max_call_lifetime;
+extern unsigned rxrpc_dead_call_expiry;
+extern struct kmem_cache *rxrpc_call_jar;
+extern struct list_head rxrpc_calls;
+extern rwlock_t rxrpc_call_lock;
+
+struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32,
+ __be16, void *, sa_family_t, const u8 *);
+struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
+ struct rxrpc_transport *,
+ struct rxrpc_conn_bundle *,
+ unsigned long, int, gfp_t);
+struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
+ struct rxrpc_connection *,
+ struct rxrpc_header *, gfp_t);
+struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
+void rxrpc_release_call(struct rxrpc_call *);
+void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
+void __rxrpc_put_call(struct rxrpc_call *);
+void __exit rxrpc_destroy_all_calls(void);
+
+/*
+ * ar-connection.c
+ */
+extern unsigned rxrpc_connection_expiry;
+extern struct list_head rxrpc_connections;
+extern rwlock_t rxrpc_connection_lock;
+
+struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
+ struct rxrpc_transport *,
+ struct key *, __be16, gfp_t);
+void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
+int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
+ struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
+void rxrpc_put_connection(struct rxrpc_connection *);
+void __exit rxrpc_destroy_all_connections(void);
+struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
+ struct rxrpc_header *);
+extern struct rxrpc_connection *
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *,
+ gfp_t);
+
+/*
+ * ar-connevent.c
+ */
+void rxrpc_process_connection(struct work_struct *);
+void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
+void rxrpc_reject_packets(struct work_struct *);
+
+/*
+ * ar-error.c
+ */
+void rxrpc_UDP_error_report(struct sock *);
+void rxrpc_UDP_error_handler(struct work_struct *);
+
+/*
+ * ar-input.c
+ */
+extern const char *rxrpc_pkts[];
+
+void rxrpc_data_ready(struct sock *);
+int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool);
+void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
+
+/*
+ * ar-local.c
+ */
+extern rwlock_t rxrpc_local_lock;
+
+struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *);
+void rxrpc_put_local(struct rxrpc_local *);
+void __exit rxrpc_destroy_all_locals(void);
+
+/*
+ * ar-key.c
+ */
+extern struct key_type key_type_rxrpc;
+extern struct key_type key_type_rxrpc_s;
+
+int rxrpc_request_key(struct rxrpc_sock *, char __user *, int);
+int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int);
+int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t,
+ u32);
+
+/*
+ * ar-output.c
+ */
+extern unsigned rxrpc_resend_timeout;
+
+int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
+int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *,
+ struct msghdr *, size_t);
+int rxrpc_server_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t);
+
+/*
+ * ar-peer.c
+ */
+struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t);
+void rxrpc_put_peer(struct rxrpc_peer *);
+struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *, __be32, __be16);
+void __exit rxrpc_destroy_all_peers(void);
+
+/*
+ * ar-proc.c
+ */
+extern const char *const rxrpc_call_states[];
+extern const struct file_operations rxrpc_call_seq_fops;
+extern const struct file_operations rxrpc_connection_seq_fops;
+
+/*
+ * ar-recvmsg.c
+ */
+void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
+
+/*
+ * ar-security.c
+ */
+int rxrpc_register_security(struct rxrpc_security *);
+void rxrpc_unregister_security(struct rxrpc_security *);
+int rxrpc_init_client_conn_security(struct rxrpc_connection *);
+int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *, size_t,
+ void *);
+int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *, u32 *);
+void rxrpc_clear_conn_security(struct rxrpc_connection *);
+
+/*
+ * ar-skbuff.c
+ */
+void rxrpc_packet_destructor(struct sk_buff *);
+
+/*
+ * ar-transport.c
+ */
+extern unsigned rxrpc_transport_expiry;
+
+struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
+ struct rxrpc_peer *, gfp_t);
+void rxrpc_put_transport(struct rxrpc_transport *);
+void __exit rxrpc_destroy_all_transports(void);
+struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *,
+ struct rxrpc_peer *);
+
+/*
+ * sysctl.c
+ */
+#ifdef CONFIG_SYSCTL
+extern int __init rxrpc_sysctl_init(void);
+extern void rxrpc_sysctl_exit(void);
+#else
+static inline int __init rxrpc_sysctl_init(void) { return 0; }
+static inline void rxrpc_sysctl_exit(void) {}
+#endif
+
+/*
+ * debug tracing
+ */
+extern unsigned int rxrpc_debug;
+
+#define dbgprintk(FMT,...) \
+ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
+
+#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
+#define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__)
+#define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT,...) kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...) kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__)
+#define _proto(FMT,...) kproto(FMT,##__VA_ARGS__)
+#define _net(FMT,...) knet(FMT,##__VA_ARGS__)
+
+#elif defined(CONFIG_AF_RXRPC_DEBUG)
+#define RXRPC_DEBUG_KENTER 0x01
+#define RXRPC_DEBUG_KLEAVE 0x02
+#define RXRPC_DEBUG_KDEBUG 0x04
+#define RXRPC_DEBUG_KPROTO 0x08
+#define RXRPC_DEBUG_KNET 0x10
+
+#define _enter(FMT,...) \
+do { \
+ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER)) \
+ kenter(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _leave(FMT,...) \
+do { \
+ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE)) \
+ kleave(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _debug(FMT,...) \
+do { \
+ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG)) \
+ kdebug(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _proto(FMT,...) \
+do { \
+ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \
+ kproto(FMT,##__VA_ARGS__); \
+} while (0)
+
+#define _net(FMT,...) \
+do { \
+ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \
+ knet(FMT,##__VA_ARGS__); \
+} while (0)
+
+#else
+#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
+#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__)
+#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__)
+#endif
+
+/*
+ * debug assertion checking
+ */
+#if 1 // defined(__KDEBUGALL)
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ printk(KERN_ERR "%lu " #OP " %lu is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ BUG(); \
+ } \
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "RxRPC: Assertion failed\n"); \
+ printk(KERN_ERR "%lu " #OP " %lu is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while(0)
+
+#else
+
+#define ASSERT(X) \
+do { \
+} while(0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+} while(0)
+
+#define ASSERTIF(C, X) \
+do { \
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+} while(0)
+
+#endif /* __KDEBUGALL */
+
+/*
+ * socket buffer accounting / leak finding
+ */
+static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn)
+{
+ //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
+ //atomic_inc(&rxrpc_n_skbs);
+}
+
+#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__)
+
+static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn)
+{
+ //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
+ //atomic_dec(&rxrpc_n_skbs);
+}
+
+#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__)
+
+static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn)
+{
+ if (skb) {
+ CHECK_SLAB_OKAY(&skb->users);
+ //_net("free skb %p %s [%d]",
+ // skb, fn, atomic_read(&rxrpc_n_skbs));
+ //atomic_dec(&rxrpc_n_skbs);
+ kfree_skb(skb);
+ }
+}
+
+#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__)
+
+static inline void rxrpc_purge_queue(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+ while ((skb = skb_dequeue((list))) != NULL)
+ rxrpc_free_skb(skb);
+}
+
+static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f)
+{
+ CHECK_SLAB_OKAY(&local->usage);
+ if (atomic_inc_return(&local->usage) == 1)
+ printk("resurrected (%s)\n", f);
+}
+
+#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__)
+
+#define rxrpc_get_call(CALL) \
+do { \
+ CHECK_SLAB_OKAY(&(CALL)->usage); \
+ if (atomic_inc_return(&(CALL)->usage) == 1) \
+ BUG(); \
+} while(0)
+
+#define rxrpc_put_call(CALL) \
+do { \
+ __rxrpc_put_call(CALL); \
+} while(0)
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
new file mode 100644
index 000000000..db0f39f5e
--- /dev/null
+++ b/net/rxrpc/ar-key.c
@@ -0,0 +1,1247 @@
+/* RxRPC key management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * RxRPC keys should have a description of describing their purpose:
+ * "afs@CAMBRIDGE.REDHAT.COM>
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/key-type.h>
+#include <linux/crypto.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include <keys/user-type.h>
+#include "ar-internal.h"
+
+static int rxrpc_vet_description_s(const char *);
+static int rxrpc_preparse(struct key_preparsed_payload *);
+static int rxrpc_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_free_preparse(struct key_preparsed_payload *);
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *);
+static void rxrpc_destroy(struct key *);
+static void rxrpc_destroy_s(struct key *);
+static void rxrpc_describe(const struct key *, struct seq_file *);
+static long rxrpc_read(const struct key *, char __user *, size_t);
+
+/*
+ * rxrpc defined keys take an arbitrary string as the description and an
+ * arbitrary blob of data as the payload
+ */
+struct key_type key_type_rxrpc = {
+ .name = "rxrpc",
+ .preparse = rxrpc_preparse,
+ .free_preparse = rxrpc_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .destroy = rxrpc_destroy,
+ .describe = rxrpc_describe,
+ .read = rxrpc_read,
+};
+EXPORT_SYMBOL(key_type_rxrpc);
+
+/*
+ * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the
+ * description and an 8-byte decryption key as the payload
+ */
+struct key_type key_type_rxrpc_s = {
+ .name = "rxrpc_s",
+ .vet_description = rxrpc_vet_description_s,
+ .preparse = rxrpc_preparse_s,
+ .free_preparse = rxrpc_free_preparse_s,
+ .instantiate = generic_key_instantiate,
+ .destroy = rxrpc_destroy_s,
+ .describe = rxrpc_describe,
+};
+
+/*
+ * Vet the description for an RxRPC server key
+ */
+static int rxrpc_vet_description_s(const char *desc)
+{
+ unsigned long num;
+ char *p;
+
+ num = simple_strtoul(desc, &p, 10);
+ if (*p != ':' || num > 65535)
+ return -EINVAL;
+ num = simple_strtoul(p + 1, &p, 10);
+ if (*p || num < 1 || num > 255)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * parse an RxKAD type XDR format token
+ * - the caller guarantees we have at least 4 words
+ */
+static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep,
+ size_t datalen,
+ const __be32 *xdr, unsigned int toklen)
+{
+ struct rxrpc_key_token *token, **pptoken;
+ size_t plen;
+ u32 tktlen;
+
+ _enter(",{%x,%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ toklen);
+
+ if (toklen <= 8 * 4)
+ return -EKEYREJECTED;
+ tktlen = ntohl(xdr[7]);
+ _debug("tktlen: %x", tktlen);
+ if (tktlen > AFSTOKEN_RK_TIX_MAX)
+ return -EKEYREJECTED;
+ if (toklen < 8 * 4 + tktlen)
+ return -EKEYREJECTED;
+
+ plen = sizeof(*token) + sizeof(*token->kad) + tktlen;
+ prep->quotalen = datalen + plen;
+
+ plen -= sizeof(*token);
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ return -ENOMEM;
+
+ token->kad = kzalloc(plen, GFP_KERNEL);
+ if (!token->kad) {
+ kfree(token);
+ return -ENOMEM;
+ }
+
+ token->security_index = RXRPC_SECURITY_RXKAD;
+ token->kad->ticket_len = tktlen;
+ token->kad->vice_id = ntohl(xdr[0]);
+ token->kad->kvno = ntohl(xdr[1]);
+ token->kad->start = ntohl(xdr[4]);
+ token->kad->expiry = ntohl(xdr[5]);
+ token->kad->primary_flag = ntohl(xdr[6]);
+ memcpy(&token->kad->session_key, &xdr[2], 8);
+ memcpy(&token->kad->ticket, &xdr[8], tktlen);
+
+ _debug("SCIX: %u", token->security_index);
+ _debug("TLEN: %u", token->kad->ticket_len);
+ _debug("EXPY: %x", token->kad->expiry);
+ _debug("KVNO: %u", token->kad->kvno);
+ _debug("PRIM: %u", token->kad->primary_flag);
+ _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+ token->kad->session_key[0], token->kad->session_key[1],
+ token->kad->session_key[2], token->kad->session_key[3],
+ token->kad->session_key[4], token->kad->session_key[5],
+ token->kad->session_key[6], token->kad->session_key[7]);
+ if (token->kad->ticket_len >= 8)
+ _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+ token->kad->ticket[0], token->kad->ticket[1],
+ token->kad->ticket[2], token->kad->ticket[3],
+ token->kad->ticket[4], token->kad->ticket[5],
+ token->kad->ticket[6], token->kad->ticket[7]);
+
+ /* count the number of tokens attached */
+ prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1);
+
+ /* attach the data */
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload[0];
+ *pptoken;
+ pptoken = &(*pptoken)->next)
+ continue;
+ *pptoken = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+
+ _leave(" = 0");
+ return 0;
+}
+
+static void rxrpc_free_krb5_principal(struct krb5_principal *princ)
+{
+ int loop;
+
+ if (princ->name_parts) {
+ for (loop = princ->n_name_parts - 1; loop >= 0; loop--)
+ kfree(princ->name_parts[loop]);
+ kfree(princ->name_parts);
+ }
+ kfree(princ->realm);
+}
+
+static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td)
+{
+ kfree(td->data);
+}
+
+/*
+ * free up an RxK5 token
+ */
+static void rxrpc_rxk5_free(struct rxk5_key *rxk5)
+{
+ int loop;
+
+ rxrpc_free_krb5_principal(&rxk5->client);
+ rxrpc_free_krb5_principal(&rxk5->server);
+ rxrpc_free_krb5_tagged(&rxk5->session);
+
+ if (rxk5->addresses) {
+ for (loop = rxk5->n_addresses - 1; loop >= 0; loop--)
+ rxrpc_free_krb5_tagged(&rxk5->addresses[loop]);
+ kfree(rxk5->addresses);
+ }
+ if (rxk5->authdata) {
+ for (loop = rxk5->n_authdata - 1; loop >= 0; loop--)
+ rxrpc_free_krb5_tagged(&rxk5->authdata[loop]);
+ kfree(rxk5->authdata);
+ }
+
+ kfree(rxk5->ticket);
+ kfree(rxk5->ticket2);
+ kfree(rxk5);
+}
+
+/*
+ * extract a krb5 principal
+ */
+static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, n_parts, loop, tmp;
+
+ /* there must be at least one name, and at least #names+1 length
+ * words */
+ if (toklen <= 12)
+ return -EINVAL;
+
+ _enter(",{%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen);
+
+ n_parts = ntohl(*xdr++);
+ toklen -= 4;
+ if (n_parts <= 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX)
+ return -EINVAL;
+ princ->n_name_parts = n_parts;
+
+ if (toklen <= (n_parts + 1) * 4)
+ return -EINVAL;
+
+ princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL);
+ if (!princ->name_parts)
+ return -ENOMEM;
+
+ for (loop = 0; loop < n_parts; loop++) {
+ if (toklen < 4)
+ return -EINVAL;
+ tmp = ntohl(*xdr++);
+ toklen -= 4;
+ if (tmp <= 0 || tmp > AFSTOKEN_STRING_MAX)
+ return -EINVAL;
+ if (tmp > toklen)
+ return -EINVAL;
+ princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
+ if (!princ->name_parts[loop])
+ return -ENOMEM;
+ memcpy(princ->name_parts[loop], xdr, tmp);
+ princ->name_parts[loop][tmp] = 0;
+ tmp = (tmp + 3) & ~3;
+ toklen -= tmp;
+ xdr += tmp >> 2;
+ }
+
+ if (toklen < 4)
+ return -EINVAL;
+ tmp = ntohl(*xdr++);
+ toklen -= 4;
+ if (tmp <= 0 || tmp > AFSTOKEN_K5_REALM_MAX)
+ return -EINVAL;
+ if (tmp > toklen)
+ return -EINVAL;
+ princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
+ if (!princ->realm)
+ return -ENOMEM;
+ memcpy(princ->realm, xdr, tmp);
+ princ->realm[tmp] = 0;
+ tmp = (tmp + 3) & ~3;
+ toklen -= tmp;
+ xdr += tmp >> 2;
+
+ _debug("%s/...@%s", princ->name_parts[0], princ->realm);
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract a piece of krb5 tagged data
+ */
+static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
+ size_t max_data_size,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, len;
+
+ /* there must be at least one tag and one length word */
+ if (toklen <= 8)
+ return -EINVAL;
+
+ _enter(",%zu,{%x,%x},%u",
+ max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen);
+
+ td->tag = ntohl(*xdr++);
+ len = ntohl(*xdr++);
+ toklen -= 8;
+ if (len > max_data_size)
+ return -EINVAL;
+ td->data_len = len;
+
+ if (len > 0) {
+ td->data = kmemdup(xdr, len, GFP_KERNEL);
+ if (!td->data)
+ return -ENOMEM;
+ len = (len + 3) & ~3;
+ toklen -= len;
+ xdr += len >> 2;
+ }
+
+ _debug("tag %x len %x", td->tag, td->data_len);
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract an array of tagged data
+ */
+static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td,
+ u8 *_n_elem,
+ u8 max_n_elem,
+ size_t max_elem_size,
+ const __be32 **_xdr,
+ unsigned int *_toklen)
+{
+ struct krb5_tagged_data *td;
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, n_elem, loop;
+ int ret;
+
+ /* there must be at least one count */
+ if (toklen < 4)
+ return -EINVAL;
+
+ _enter(",,%u,%zu,{%x},%u",
+ max_n_elem, max_elem_size, ntohl(xdr[0]), toklen);
+
+ n_elem = ntohl(*xdr++);
+ toklen -= 4;
+ if (n_elem > max_n_elem)
+ return -EINVAL;
+ *_n_elem = n_elem;
+ if (n_elem > 0) {
+ if (toklen <= (n_elem + 1) * 4)
+ return -EINVAL;
+
+ _debug("n_elem %d", n_elem);
+
+ td = kcalloc(n_elem, sizeof(struct krb5_tagged_data),
+ GFP_KERNEL);
+ if (!td)
+ return -ENOMEM;
+ *_td = td;
+
+ for (loop = 0; loop < n_elem; loop++) {
+ ret = rxrpc_krb5_decode_tagged_data(&td[loop],
+ max_elem_size,
+ &xdr, &toklen);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * extract a krb5 ticket
+ */
+static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
+ const __be32 **_xdr, unsigned int *_toklen)
+{
+ const __be32 *xdr = *_xdr;
+ unsigned int toklen = *_toklen, len;
+
+ /* there must be at least one length word */
+ if (toklen <= 4)
+ return -EINVAL;
+
+ _enter(",{%x},%u", ntohl(xdr[0]), toklen);
+
+ len = ntohl(*xdr++);
+ toklen -= 4;
+ if (len > AFSTOKEN_K5_TIX_MAX)
+ return -EINVAL;
+ *_tktlen = len;
+
+ _debug("ticket len %u", len);
+
+ if (len > 0) {
+ *_ticket = kmemdup(xdr, len, GFP_KERNEL);
+ if (!*_ticket)
+ return -ENOMEM;
+ len = (len + 3) & ~3;
+ toklen -= len;
+ xdr += len >> 2;
+ }
+
+ *_xdr = xdr;
+ *_toklen = toklen;
+ _leave(" = 0 [toklen=%u]", toklen);
+ return 0;
+}
+
+/*
+ * parse an RxK5 type XDR format token
+ * - the caller guarantees we have at least 4 words
+ */
+static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep,
+ size_t datalen,
+ const __be32 *xdr, unsigned int toklen)
+{
+ struct rxrpc_key_token *token, **pptoken;
+ struct rxk5_key *rxk5;
+ const __be32 *end_xdr = xdr + (toklen >> 2);
+ int ret;
+
+ _enter(",{%x,%x,%x,%x},%u",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ toklen);
+
+ /* reserve some payload space for this subkey - the length of the token
+ * is a reasonable approximation */
+ prep->quotalen = datalen + toklen;
+
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ return -ENOMEM;
+
+ rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL);
+ if (!rxk5) {
+ kfree(token);
+ return -ENOMEM;
+ }
+
+ token->security_index = RXRPC_SECURITY_RXK5;
+ token->k5 = rxk5;
+
+ /* extract the principals */
+ ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+ ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ /* extract the session key and the encoding type (the tag field ->
+ * ENCTYPE_xxx) */
+ ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ if (toklen < 4 * 8 + 2 * 4)
+ goto inval;
+ rxk5->authtime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->starttime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->endtime = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->renew_till = be64_to_cpup((const __be64 *) xdr);
+ xdr += 2;
+ rxk5->is_skey = ntohl(*xdr++);
+ rxk5->flags = ntohl(*xdr++);
+ toklen -= 4 * 8 + 2 * 4;
+
+ _debug("times: a=%llx s=%llx e=%llx rt=%llx",
+ rxk5->authtime, rxk5->starttime, rxk5->endtime,
+ rxk5->renew_till);
+ _debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags);
+
+ /* extract the permitted client addresses */
+ ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses,
+ &rxk5->n_addresses,
+ AFSTOKEN_K5_ADDRESSES_MAX,
+ AFSTOKEN_DATA_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ /* extract the tickets */
+ ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+ ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ /* extract the typed auth data */
+ ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata,
+ &rxk5->n_authdata,
+ AFSTOKEN_K5_AUTHDATA_MAX,
+ AFSTOKEN_BDATALN_MAX,
+ &xdr, &toklen);
+ if (ret < 0)
+ goto error;
+
+ ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+ if (toklen != 0)
+ goto inval;
+
+ /* attach the payload */
+ for (pptoken = (struct rxrpc_key_token **)&prep->payload[0];
+ *pptoken;
+ pptoken = &(*pptoken)->next)
+ continue;
+ *pptoken = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+
+ _leave(" = 0");
+ return 0;
+
+inval:
+ ret = -EINVAL;
+error:
+ rxrpc_rxk5_free(rxk5);
+ kfree(token);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * attempt to parse the data as the XDR format
+ * - the caller guarantees we have more than 7 words
+ */
+static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep)
+{
+ const __be32 *xdr = prep->data, *token;
+ const char *cp;
+ unsigned int len, tmp, loop, ntoken, toklen, sec_ix;
+ size_t datalen = prep->datalen;
+ int ret;
+
+ _enter(",{%x,%x,%x,%x},%zu",
+ ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+ prep->datalen);
+
+ if (datalen > AFSTOKEN_LENGTH_MAX)
+ goto not_xdr;
+
+ /* XDR is an array of __be32's */
+ if (datalen & 3)
+ goto not_xdr;
+
+ /* the flags should be 0 (the setpag bit must be handled by
+ * userspace) */
+ if (ntohl(*xdr++) != 0)
+ goto not_xdr;
+ datalen -= 4;
+
+ /* check the cell name */
+ len = ntohl(*xdr++);
+ if (len < 1 || len > AFSTOKEN_CELL_MAX)
+ goto not_xdr;
+ datalen -= 4;
+ tmp = (len + 3) & ~3;
+ if (tmp > datalen)
+ goto not_xdr;
+
+ cp = (const char *) xdr;
+ for (loop = 0; loop < len; loop++)
+ if (!isprint(cp[loop]))
+ goto not_xdr;
+ if (len < tmp)
+ for (; loop < tmp; loop++)
+ if (cp[loop])
+ goto not_xdr;
+ _debug("cellname: [%u/%u] '%*.*s'",
+ len, tmp, len, len, (const char *) xdr);
+ datalen -= tmp;
+ xdr += tmp >> 2;
+
+ /* get the token count */
+ if (datalen < 12)
+ goto not_xdr;
+ ntoken = ntohl(*xdr++);
+ datalen -= 4;
+ _debug("ntoken: %x", ntoken);
+ if (ntoken < 1 || ntoken > AFSTOKEN_MAX)
+ goto not_xdr;
+
+ /* check each token wrapper */
+ token = xdr;
+ loop = ntoken;
+ do {
+ if (datalen < 8)
+ goto not_xdr;
+ toklen = ntohl(*xdr++);
+ sec_ix = ntohl(*xdr);
+ datalen -= 4;
+ _debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
+ if (toklen < 20 || toklen > datalen)
+ goto not_xdr;
+ datalen -= (toklen + 3) & ~3;
+ xdr += (toklen + 3) >> 2;
+
+ } while (--loop > 0);
+
+ _debug("remainder: %zu", datalen);
+ if (datalen != 0)
+ goto not_xdr;
+
+ /* okay: we're going to assume it's valid XDR format
+ * - we ignore the cellname, relying on the key to be correctly named
+ */
+ do {
+ xdr = token;
+ toklen = ntohl(*xdr++);
+ token = xdr + ((toklen + 3) >> 2);
+ sec_ix = ntohl(*xdr++);
+ toklen -= 4;
+
+ _debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token);
+
+ switch (sec_ix) {
+ case RXRPC_SECURITY_RXKAD:
+ ret = rxrpc_preparse_xdr_rxkad(prep, datalen, xdr, toklen);
+ if (ret != 0)
+ goto error;
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ ret = rxrpc_preparse_xdr_rxk5(prep, datalen, xdr, toklen);
+ if (ret != 0)
+ goto error;
+ break;
+
+ default:
+ ret = -EPROTONOSUPPORT;
+ goto error;
+ }
+
+ } while (--ntoken > 0);
+
+ _leave(" = 0");
+ return 0;
+
+not_xdr:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * Preparse an rxrpc defined key.
+ *
+ * Data should be of the form:
+ * OFFSET LEN CONTENT
+ * 0 4 key interface version number
+ * 4 2 security index (type)
+ * 6 2 ticket length
+ * 8 4 key expiry time (time_t)
+ * 12 4 kvno
+ * 16 8 session key
+ * 24 [len] ticket
+ *
+ * if no data is provided, then a no-security key is made
+ */
+static int rxrpc_preparse(struct key_preparsed_payload *prep)
+{
+ const struct rxrpc_key_data_v1 *v1;
+ struct rxrpc_key_token *token, **pp;
+ size_t plen;
+ u32 kver;
+ int ret;
+
+ _enter("%zu", prep->datalen);
+
+ /* handle a no-security key */
+ if (!prep->data && prep->datalen == 0)
+ return 0;
+
+ /* determine if the XDR payload format is being used */
+ if (prep->datalen > 7 * 4) {
+ ret = rxrpc_preparse_xdr(prep);
+ if (ret != -EPROTO)
+ return ret;
+ }
+
+ /* get the key interface version number */
+ ret = -EINVAL;
+ if (prep->datalen <= 4 || !prep->data)
+ goto error;
+ memcpy(&kver, prep->data, sizeof(kver));
+ prep->data += sizeof(kver);
+ prep->datalen -= sizeof(kver);
+
+ _debug("KEY I/F VERSION: %u", kver);
+
+ ret = -EKEYREJECTED;
+ if (kver != 1)
+ goto error;
+
+ /* deal with a version 1 key */
+ ret = -EINVAL;
+ if (prep->datalen < sizeof(*v1))
+ goto error;
+
+ v1 = prep->data;
+ if (prep->datalen != sizeof(*v1) + v1->ticket_length)
+ goto error;
+
+ _debug("SCIX: %u", v1->security_index);
+ _debug("TLEN: %u", v1->ticket_length);
+ _debug("EXPY: %x", v1->expiry);
+ _debug("KVNO: %u", v1->kvno);
+ _debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+ v1->session_key[0], v1->session_key[1],
+ v1->session_key[2], v1->session_key[3],
+ v1->session_key[4], v1->session_key[5],
+ v1->session_key[6], v1->session_key[7]);
+ if (v1->ticket_length >= 8)
+ _debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+ v1->ticket[0], v1->ticket[1],
+ v1->ticket[2], v1->ticket[3],
+ v1->ticket[4], v1->ticket[5],
+ v1->ticket[6], v1->ticket[7]);
+
+ ret = -EPROTONOSUPPORT;
+ if (v1->security_index != RXRPC_SECURITY_RXKAD)
+ goto error;
+
+ plen = sizeof(*token->kad) + v1->ticket_length;
+ prep->quotalen = plen + sizeof(*token);
+
+ ret = -ENOMEM;
+ token = kzalloc(sizeof(*token), GFP_KERNEL);
+ if (!token)
+ goto error;
+ token->kad = kzalloc(plen, GFP_KERNEL);
+ if (!token->kad)
+ goto error_free;
+
+ token->security_index = RXRPC_SECURITY_RXKAD;
+ token->kad->ticket_len = v1->ticket_length;
+ token->kad->expiry = v1->expiry;
+ token->kad->kvno = v1->kvno;
+ memcpy(&token->kad->session_key, &v1->session_key, 8);
+ memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length);
+
+ /* count the number of tokens attached */
+ prep->type_data[0] = (void *)((unsigned long)prep->type_data[0] + 1);
+
+ /* attach the data */
+ pp = (struct rxrpc_key_token **)&prep->payload[0];
+ while (*pp)
+ pp = &(*pp)->next;
+ *pp = token;
+ if (token->kad->expiry < prep->expiry)
+ prep->expiry = token->kad->expiry;
+ token = NULL;
+ ret = 0;
+
+error_free:
+ kfree(token);
+error:
+ return ret;
+}
+
+/*
+ * Free token list.
+ */
+static void rxrpc_free_token_list(struct rxrpc_key_token *token)
+{
+ struct rxrpc_key_token *next;
+
+ for (; token; token = next) {
+ next = token->next;
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ kfree(token->kad);
+ break;
+ case RXRPC_SECURITY_RXK5:
+ if (token->k5)
+ rxrpc_rxk5_free(token->k5);
+ break;
+ default:
+ printk(KERN_ERR "Unknown token type %x on rxrpc key\n",
+ token->security_index);
+ BUG();
+ }
+
+ kfree(token);
+ }
+}
+
+/*
+ * Clean up preparse data.
+ */
+static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
+{
+ rxrpc_free_token_list(prep->payload[0]);
+}
+
+/*
+ * Preparse a server secret key.
+ *
+ * The data should be the 8-byte secret key.
+ */
+static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
+{
+ struct crypto_blkcipher *ci;
+
+ _enter("%zu", prep->datalen);
+
+ if (prep->datalen != 8)
+ return -EINVAL;
+
+ memcpy(&prep->type_data, prep->data, 8);
+
+ ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ci)) {
+ _leave(" = %ld", PTR_ERR(ci));
+ return PTR_ERR(ci);
+ }
+
+ if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0)
+ BUG();
+
+ prep->payload[0] = ci;
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * Clean up preparse data.
+ */
+static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
+{
+ if (prep->payload[0])
+ crypto_free_blkcipher(prep->payload[0]);
+}
+
+/*
+ * dispose of the data dangling from the corpse of a rxrpc key
+ */
+static void rxrpc_destroy(struct key *key)
+{
+ rxrpc_free_token_list(key->payload.data);
+}
+
+/*
+ * dispose of the data dangling from the corpse of a rxrpc key
+ */
+static void rxrpc_destroy_s(struct key *key)
+{
+ if (key->payload.data) {
+ crypto_free_blkcipher(key->payload.data);
+ key->payload.data = NULL;
+ }
+}
+
+/*
+ * describe the rxrpc key
+ */
+static void rxrpc_describe(const struct key *key, struct seq_file *m)
+{
+ seq_puts(m, key->description);
+}
+
+/*
+ * grab the security key for a socket
+ */
+int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
+{
+ struct key *key;
+ char *description;
+
+ _enter("");
+
+ if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ description = kmalloc(optlen + 1, GFP_KERNEL);
+ if (!description)
+ return -ENOMEM;
+
+ if (copy_from_user(description, optval, optlen)) {
+ kfree(description);
+ return -EFAULT;
+ }
+ description[optlen] = 0;
+
+ key = request_key(&key_type_rxrpc, description, NULL);
+ if (IS_ERR(key)) {
+ kfree(description);
+ _leave(" = %ld", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ rx->key = key;
+ kfree(description);
+ _leave(" = 0 [key %x]", key->serial);
+ return 0;
+}
+
+/*
+ * grab the security keyring for a server socket
+ */
+int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
+ int optlen)
+{
+ struct key *key;
+ char *description;
+
+ _enter("");
+
+ if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ description = kmalloc(optlen + 1, GFP_KERNEL);
+ if (!description)
+ return -ENOMEM;
+
+ if (copy_from_user(description, optval, optlen)) {
+ kfree(description);
+ return -EFAULT;
+ }
+ description[optlen] = 0;
+
+ key = request_key(&key_type_keyring, description, NULL);
+ if (IS_ERR(key)) {
+ kfree(description);
+ _leave(" = %ld", PTR_ERR(key));
+ return PTR_ERR(key);
+ }
+
+ rx->securities = key;
+ kfree(description);
+ _leave(" = 0 [key %x]", key->serial);
+ return 0;
+}
+
+/*
+ * generate a server data key
+ */
+int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
+ const void *session_key,
+ time_t expiry,
+ u32 kvno)
+{
+ const struct cred *cred = current_cred();
+ struct key *key;
+ int ret;
+
+ struct {
+ u32 kver;
+ struct rxrpc_key_data_v1 v1;
+ } data;
+
+ _enter("");
+
+ key = key_alloc(&key_type_rxrpc, "x",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(key)) {
+ _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
+ return -ENOMEM;
+ }
+
+ _debug("key %d", key_serial(key));
+
+ data.kver = 1;
+ data.v1.security_index = RXRPC_SECURITY_RXKAD;
+ data.v1.ticket_length = 0;
+ data.v1.expiry = expiry;
+ data.v1.kvno = 0;
+
+ memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key));
+
+ ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL);
+ if (ret < 0)
+ goto error;
+
+ conn->key = key;
+ _leave(" = 0 [%d]", key_serial(key));
+ return 0;
+
+error:
+ key_revoke(key);
+ key_put(key);
+ _leave(" = -ENOMEM [ins %d]", ret);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rxrpc_get_server_data_key);
+
+/**
+ * rxrpc_get_null_key - Generate a null RxRPC key
+ * @keyname: The name to give the key.
+ *
+ * Generate a null RxRPC key that can be used to indicate anonymous security is
+ * required for a particular domain.
+ */
+struct key *rxrpc_get_null_key(const char *keyname)
+{
+ const struct cred *cred = current_cred();
+ struct key *key;
+ int ret;
+
+ key = key_alloc(&key_type_rxrpc, keyname,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+ if (IS_ERR(key))
+ return key;
+
+ ret = key_instantiate_and_link(key, NULL, 0, NULL, NULL);
+ if (ret < 0) {
+ key_revoke(key);
+ key_put(key);
+ return ERR_PTR(ret);
+ }
+
+ return key;
+}
+EXPORT_SYMBOL(rxrpc_get_null_key);
+
+/*
+ * read the contents of an rxrpc key
+ * - this returns the result in XDR form
+ */
+static long rxrpc_read(const struct key *key,
+ char __user *buffer, size_t buflen)
+{
+ const struct rxrpc_key_token *token;
+ const struct krb5_principal *princ;
+ size_t size;
+ __be32 __user *xdr, *oldxdr;
+ u32 cnlen, toksize, ntoks, tok, zero;
+ u16 toksizes[AFSTOKEN_MAX];
+ int loop;
+
+ _enter("");
+
+ /* we don't know what form we should return non-AFS keys in */
+ if (memcmp(key->description, "afs@", 4) != 0)
+ return -EOPNOTSUPP;
+ cnlen = strlen(key->description + 4);
+
+#define RND(X) (((X) + 3) & ~3)
+
+ /* AFS keys we return in XDR form, so we need to work out the size of
+ * the XDR */
+ size = 2 * 4; /* flags, cellname len */
+ size += RND(cnlen); /* cellname */
+ size += 1 * 4; /* token count */
+
+ ntoks = 0;
+ for (token = key->payload.data; token; token = token->next) {
+ toksize = 4; /* sec index */
+
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ toksize += 8 * 4; /* viceid, kvno, key*2, begin,
+ * end, primary, tktlen */
+ toksize += RND(token->kad->ticket_len);
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ princ = &token->k5->client;
+ toksize += 4 + princ->n_name_parts * 4;
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ toksize += RND(strlen(princ->name_parts[loop]));
+ toksize += 4 + RND(strlen(princ->realm));
+
+ princ = &token->k5->server;
+ toksize += 4 + princ->n_name_parts * 4;
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ toksize += RND(strlen(princ->name_parts[loop]));
+ toksize += 4 + RND(strlen(princ->realm));
+
+ toksize += 8 + RND(token->k5->session.data_len);
+
+ toksize += 4 * 8 + 2 * 4;
+
+ toksize += 4 + token->k5->n_addresses * 8;
+ for (loop = 0; loop < token->k5->n_addresses; loop++)
+ toksize += RND(token->k5->addresses[loop].data_len);
+
+ toksize += 4 + RND(token->k5->ticket_len);
+ toksize += 4 + RND(token->k5->ticket2_len);
+
+ toksize += 4 + token->k5->n_authdata * 8;
+ for (loop = 0; loop < token->k5->n_authdata; loop++)
+ toksize += RND(token->k5->authdata[loop].data_len);
+ break;
+
+ default: /* we have a ticket we can't encode */
+ BUG();
+ continue;
+ }
+
+ _debug("token[%u]: toksize=%u", ntoks, toksize);
+ ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX);
+
+ toksizes[ntoks++] = toksize;
+ size += toksize + 4; /* each token has a length word */
+ }
+
+#undef RND
+
+ if (!buffer || buflen < size)
+ return size;
+
+ xdr = (__be32 __user *) buffer;
+ zero = 0;
+#define ENCODE(x) \
+ do { \
+ __be32 y = htonl(x); \
+ if (put_user(y, xdr++) < 0) \
+ goto fault; \
+ } while(0)
+#define ENCODE_DATA(l, s) \
+ do { \
+ u32 _l = (l); \
+ ENCODE(l); \
+ if (copy_to_user(xdr, (s), _l) != 0) \
+ goto fault; \
+ if (_l & 3 && \
+ copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \
+ goto fault; \
+ xdr += (_l + 3) >> 2; \
+ } while(0)
+#define ENCODE64(x) \
+ do { \
+ __be64 y = cpu_to_be64(x); \
+ if (copy_to_user(xdr, &y, 8) != 0) \
+ goto fault; \
+ xdr += 8 >> 2; \
+ } while(0)
+#define ENCODE_STR(s) \
+ do { \
+ const char *_s = (s); \
+ ENCODE_DATA(strlen(_s), _s); \
+ } while(0)
+
+ ENCODE(0); /* flags */
+ ENCODE_DATA(cnlen, key->description + 4); /* cellname */
+ ENCODE(ntoks);
+
+ tok = 0;
+ for (token = key->payload.data; token; token = token->next) {
+ toksize = toksizes[tok++];
+ ENCODE(toksize);
+ oldxdr = xdr;
+ ENCODE(token->security_index);
+
+ switch (token->security_index) {
+ case RXRPC_SECURITY_RXKAD:
+ ENCODE(token->kad->vice_id);
+ ENCODE(token->kad->kvno);
+ ENCODE_DATA(8, token->kad->session_key);
+ ENCODE(token->kad->start);
+ ENCODE(token->kad->expiry);
+ ENCODE(token->kad->primary_flag);
+ ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
+ break;
+
+ case RXRPC_SECURITY_RXK5:
+ princ = &token->k5->client;
+ ENCODE(princ->n_name_parts);
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ ENCODE_STR(princ->name_parts[loop]);
+ ENCODE_STR(princ->realm);
+
+ princ = &token->k5->server;
+ ENCODE(princ->n_name_parts);
+ for (loop = 0; loop < princ->n_name_parts; loop++)
+ ENCODE_STR(princ->name_parts[loop]);
+ ENCODE_STR(princ->realm);
+
+ ENCODE(token->k5->session.tag);
+ ENCODE_DATA(token->k5->session.data_len,
+ token->k5->session.data);
+
+ ENCODE64(token->k5->authtime);
+ ENCODE64(token->k5->starttime);
+ ENCODE64(token->k5->endtime);
+ ENCODE64(token->k5->renew_till);
+ ENCODE(token->k5->is_skey);
+ ENCODE(token->k5->flags);
+
+ ENCODE(token->k5->n_addresses);
+ for (loop = 0; loop < token->k5->n_addresses; loop++) {
+ ENCODE(token->k5->addresses[loop].tag);
+ ENCODE_DATA(token->k5->addresses[loop].data_len,
+ token->k5->addresses[loop].data);
+ }
+
+ ENCODE_DATA(token->k5->ticket_len, token->k5->ticket);
+ ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2);
+
+ ENCODE(token->k5->n_authdata);
+ for (loop = 0; loop < token->k5->n_authdata; loop++) {
+ ENCODE(token->k5->authdata[loop].tag);
+ ENCODE_DATA(token->k5->authdata[loop].data_len,
+ token->k5->authdata[loop].data);
+ }
+ break;
+
+ default:
+ BUG();
+ break;
+ }
+
+ ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==,
+ toksize);
+ }
+
+#undef ENCODE_STR
+#undef ENCODE_DATA
+#undef ENCODE64
+#undef ENCODE
+
+ ASSERTCMP(tok, ==, ntoks);
+ ASSERTCMP((char __user *) xdr - buffer, ==, size);
+ _leave(" = %zu", size);
+ return size;
+
+fault:
+ _leave(" = -EFAULT");
+ return -EFAULT;
+}
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
new file mode 100644
index 000000000..ca904ed54
--- /dev/null
+++ b/net/rxrpc/ar-local.c
@@ -0,0 +1,408 @@
+/* AF_RXRPC local endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <generated/utsrelease.h>
+#include "ar-internal.h"
+
+static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC";
+
+static LIST_HEAD(rxrpc_locals);
+DEFINE_RWLOCK(rxrpc_local_lock);
+static DECLARE_RWSEM(rxrpc_local_sem);
+static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq);
+
+static void rxrpc_destroy_local(struct work_struct *work);
+static void rxrpc_process_local_events(struct work_struct *work);
+
+/*
+ * allocate a new local
+ */
+static
+struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+
+ local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL);
+ if (local) {
+ INIT_WORK(&local->destroyer, &rxrpc_destroy_local);
+ INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls);
+ INIT_WORK(&local->rejecter, &rxrpc_reject_packets);
+ INIT_WORK(&local->event_processor, &rxrpc_process_local_events);
+ INIT_LIST_HEAD(&local->services);
+ INIT_LIST_HEAD(&local->link);
+ init_rwsem(&local->defrag_sem);
+ skb_queue_head_init(&local->accept_queue);
+ skb_queue_head_init(&local->reject_queue);
+ skb_queue_head_init(&local->event_queue);
+ spin_lock_init(&local->lock);
+ rwlock_init(&local->services_lock);
+ atomic_set(&local->usage, 1);
+ local->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ memcpy(&local->srx, srx, sizeof(*srx));
+ }
+
+ _leave(" = %p", local);
+ return local;
+}
+
+/*
+ * create the local socket
+ * - must be called with rxrpc_local_sem writelocked
+ */
+static int rxrpc_create_local(struct rxrpc_local *local)
+{
+ struct sock *sock;
+ int ret, opt;
+
+ _enter("%p{%d}", local, local->srx.transport_type);
+
+ /* create a socket to represent the local endpoint */
+ ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP,
+ &local->socket);
+ if (ret < 0) {
+ _leave(" = %d [socket]", ret);
+ return ret;
+ }
+
+ /* if a local address was supplied then bind it */
+ if (local->srx.transport_len > sizeof(sa_family_t)) {
+ _debug("bind");
+ ret = kernel_bind(local->socket,
+ (struct sockaddr *) &local->srx.transport,
+ local->srx.transport_len);
+ if (ret < 0) {
+ _debug("bind failed");
+ goto error;
+ }
+ }
+
+ /* we want to receive ICMP errors */
+ opt = 1;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ /* we want to set the don't fragment bit */
+ opt = IP_PMTUDISC_DO;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
+ (char *) &opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
+
+ write_lock_bh(&rxrpc_local_lock);
+ list_add(&local->link, &rxrpc_locals);
+ write_unlock_bh(&rxrpc_local_lock);
+
+ /* set the socket up */
+ sock = local->socket->sk;
+ sock->sk_user_data = local;
+ sock->sk_data_ready = rxrpc_data_ready;
+ sock->sk_error_report = rxrpc_UDP_error_report;
+ _leave(" = 0");
+ return 0;
+
+error:
+ kernel_sock_shutdown(local->socket, SHUT_RDWR);
+ local->socket->sk->sk_user_data = NULL;
+ sock_release(local->socket);
+ local->socket = NULL;
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * create a new local endpoint using the specified UDP address
+ */
+struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx)
+{
+ struct rxrpc_local *local;
+ int ret;
+
+ _enter("{%d,%u,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport.family,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+
+ down_write(&rxrpc_local_sem);
+
+ /* see if we have a suitable local local endpoint already */
+ read_lock_bh(&rxrpc_local_lock);
+
+ list_for_each_entry(local, &rxrpc_locals, link) {
+ _debug("CMP {%d,%u,%pI4+%hu}",
+ local->srx.transport_type,
+ local->srx.transport.family,
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port));
+
+ if (local->srx.transport_type != srx->transport_type ||
+ local->srx.transport.family != srx->transport.family)
+ continue;
+
+ switch (srx->transport.family) {
+ case AF_INET:
+ if (local->srx.transport.sin.sin_port !=
+ srx->transport.sin.sin_port)
+ continue;
+ if (memcmp(&local->srx.transport.sin.sin_addr,
+ &srx->transport.sin.sin_addr,
+ sizeof(struct in_addr)) != 0)
+ continue;
+ goto found_local;
+
+ default:
+ BUG();
+ }
+ }
+
+ read_unlock_bh(&rxrpc_local_lock);
+
+ /* we didn't find one, so we need to create one */
+ local = rxrpc_alloc_local(srx);
+ if (!local) {
+ up_write(&rxrpc_local_sem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = rxrpc_create_local(local);
+ if (ret < 0) {
+ up_write(&rxrpc_local_sem);
+ kfree(local);
+ _leave(" = %d", ret);
+ return ERR_PTR(ret);
+ }
+
+ up_write(&rxrpc_local_sem);
+
+ _net("LOCAL new %d {%d,%u,%pI4+%hu}",
+ local->debug_id,
+ local->srx.transport_type,
+ local->srx.transport.family,
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port));
+
+ _leave(" = %p [new]", local);
+ return local;
+
+found_local:
+ rxrpc_get_local(local);
+ read_unlock_bh(&rxrpc_local_lock);
+ up_write(&rxrpc_local_sem);
+
+ _net("LOCAL old %d {%d,%u,%pI4+%hu}",
+ local->debug_id,
+ local->srx.transport_type,
+ local->srx.transport.family,
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port));
+
+ _leave(" = %p [reuse]", local);
+ return local;
+}
+
+/*
+ * release a local endpoint
+ */
+void rxrpc_put_local(struct rxrpc_local *local)
+{
+ _enter("%p{u=%d}", local, atomic_read(&local->usage));
+
+ ASSERTCMP(atomic_read(&local->usage), >, 0);
+
+ /* to prevent a race, the decrement and the dequeue must be effectively
+ * atomic */
+ write_lock_bh(&rxrpc_local_lock);
+ if (unlikely(atomic_dec_and_test(&local->usage))) {
+ _debug("destroy local");
+ rxrpc_queue_work(&local->destroyer);
+ }
+ write_unlock_bh(&rxrpc_local_lock);
+ _leave("");
+}
+
+/*
+ * destroy a local endpoint
+ */
+static void rxrpc_destroy_local(struct work_struct *work)
+{
+ struct rxrpc_local *local =
+ container_of(work, struct rxrpc_local, destroyer);
+
+ _enter("%p{%d}", local, atomic_read(&local->usage));
+
+ down_write(&rxrpc_local_sem);
+
+ write_lock_bh(&rxrpc_local_lock);
+ if (atomic_read(&local->usage) > 0) {
+ write_unlock_bh(&rxrpc_local_lock);
+ up_read(&rxrpc_local_sem);
+ _leave(" [resurrected]");
+ return;
+ }
+
+ list_del(&local->link);
+ local->socket->sk->sk_user_data = NULL;
+ write_unlock_bh(&rxrpc_local_lock);
+
+ downgrade_write(&rxrpc_local_sem);
+
+ ASSERT(list_empty(&local->services));
+ ASSERT(!work_pending(&local->acceptor));
+ ASSERT(!work_pending(&local->rejecter));
+ ASSERT(!work_pending(&local->event_processor));
+
+ /* finish cleaning up the local descriptor */
+ rxrpc_purge_queue(&local->accept_queue);
+ rxrpc_purge_queue(&local->reject_queue);
+ rxrpc_purge_queue(&local->event_queue);
+ kernel_sock_shutdown(local->socket, SHUT_RDWR);
+ sock_release(local->socket);
+
+ up_read(&rxrpc_local_sem);
+
+ _net("DESTROY LOCAL %d", local->debug_id);
+ kfree(local);
+
+ if (list_empty(&rxrpc_locals))
+ wake_up_all(&rxrpc_local_wq);
+
+ _leave("");
+}
+
+/*
+ * preemptively destroy all local local endpoint rather than waiting for
+ * them to be destroyed
+ */
+void __exit rxrpc_destroy_all_locals(void)
+{
+ DECLARE_WAITQUEUE(myself,current);
+
+ _enter("");
+
+ /* we simply have to wait for them to go away */
+ if (!list_empty(&rxrpc_locals)) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&rxrpc_local_wq, &myself);
+
+ while (!list_empty(&rxrpc_locals)) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&rxrpc_local_wq, &myself);
+ set_current_state(TASK_RUNNING);
+ }
+
+ _leave("");
+}
+
+/*
+ * Reply to a version request
+ */
+static void rxrpc_send_version_request(struct rxrpc_local *local,
+ struct rxrpc_header *hdr,
+ struct sk_buff *skb)
+{
+ struct sockaddr_in sin;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t len;
+ int ret;
+
+ _enter("");
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = udp_hdr(skb)->source;
+ sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+
+ msg.msg_name = &sin;
+ msg.msg_namelen = sizeof(sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr->seq = 0;
+ hdr->serial = 0;
+ hdr->type = RXRPC_PACKET_TYPE_VERSION;
+ hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+ hdr->userStatus = 0;
+ hdr->_rsvd = 0;
+
+ iov[0].iov_base = hdr;
+ iov[0].iov_len = sizeof(*hdr);
+ iov[1].iov_base = (char *)rxrpc_version_string;
+ iov[1].iov_len = sizeof(rxrpc_version_string);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ _proto("Tx VERSION (reply)");
+
+ ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
+ if (ret < 0)
+ _debug("sendmsg failed: %d", ret);
+
+ _leave("");
+}
+
+/*
+ * Process event packets targetted at a local endpoint.
+ */
+static void rxrpc_process_local_events(struct work_struct *work)
+{
+ struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor);
+ struct sk_buff *skb;
+ char v;
+
+ _enter("");
+
+ atomic_inc(&local->usage);
+
+ while ((skb = skb_dequeue(&local->event_queue))) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ kdebug("{%d},{%u}", local->debug_id, sp->hdr.type);
+
+ switch (sp->hdr.type) {
+ case RXRPC_PACKET_TYPE_VERSION:
+ if (skb_copy_bits(skb, 0, &v, 1) < 0)
+ return;
+ _proto("Rx VERSION { %02x }", v);
+ if (v == 0)
+ rxrpc_send_version_request(local, &sp->hdr, skb);
+ break;
+
+ default:
+ /* Just ignore anything we don't understand */
+ break;
+ }
+
+ rxrpc_put_local(local);
+ rxrpc_free_skb(skb);
+ }
+
+ rxrpc_put_local(local);
+ _leave("");
+}
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
new file mode 100644
index 000000000..c0042807b
--- /dev/null
+++ b/net/rxrpc/ar-output.c
@@ -0,0 +1,711 @@
+/* RxRPC packet transmission
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/circ_buf.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time till packet resend (in jiffies).
+ */
+unsigned rxrpc_resend_timeout = 4 * HZ;
+
+static int rxrpc_send_data(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, size_t len);
+
+/*
+ * extract control messages from the sendmsg() control buffer
+ */
+static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ unsigned long *user_call_ID,
+ enum rxrpc_command *command,
+ u32 *abort_code,
+ bool server)
+{
+ struct cmsghdr *cmsg;
+ int len;
+
+ *command = RXRPC_CMD_SEND_DATA;
+
+ if (msg->msg_controllen == 0)
+ return -EINVAL;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ _debug("CMSG %d, %d, %d",
+ cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+ if (cmsg->cmsg_level != SOL_RXRPC)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RXRPC_USER_CALL_ID:
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
+ if (len != sizeof(u32))
+ return -EINVAL;
+ *user_call_ID = *(u32 *) CMSG_DATA(cmsg);
+ } else {
+ if (len != sizeof(unsigned long))
+ return -EINVAL;
+ *user_call_ID = *(unsigned long *)
+ CMSG_DATA(cmsg);
+ }
+ _debug("User Call ID %lx", *user_call_ID);
+ break;
+
+ case RXRPC_ABORT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_SEND_ABORT;
+ if (len != sizeof(*abort_code))
+ return -EINVAL;
+ *abort_code = *(unsigned int *) CMSG_DATA(cmsg);
+ _debug("Abort %x", *abort_code);
+ if (*abort_code == 0)
+ return -EINVAL;
+ break;
+
+ case RXRPC_ACCEPT:
+ if (*command != RXRPC_CMD_SEND_DATA)
+ return -EINVAL;
+ *command = RXRPC_CMD_ACCEPT;
+ if (len != 0)
+ return -EINVAL;
+ if (!server)
+ return -EISCONN;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * abort a call, sending an ABORT packet to the peer
+ */
+static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
+{
+ write_lock_bh(&call->state_lock);
+
+ if (call->state <= RXRPC_CALL_COMPLETE) {
+ call->state = RXRPC_CALL_LOCALLY_ABORTED;
+ call->abort_code = abort_code;
+ set_bit(RXRPC_CALL_ABORT, &call->events);
+ del_timer_sync(&call->resend_timer);
+ del_timer_sync(&call->ack_timer);
+ clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_ACK, &call->events);
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ rxrpc_queue_call(call);
+ }
+
+ write_unlock_bh(&call->state_lock);
+}
+
+/*
+ * send a message forming part of a client call through an RxRPC socket
+ * - caller holds the socket locked
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
+ struct msghdr *msg, size_t len)
+{
+ struct rxrpc_conn_bundle *bundle;
+ enum rxrpc_command cmd;
+ struct rxrpc_call *call;
+ unsigned long user_call_ID = 0;
+ struct key *key;
+ __be16 service_id;
+ u32 abort_code = 0;
+ int ret;
+
+ _enter("");
+
+ ASSERT(trans != NULL);
+
+ ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code,
+ false);
+ if (ret < 0)
+ return ret;
+
+ bundle = NULL;
+ if (trans) {
+ service_id = rx->service_id;
+ if (msg->msg_name) {
+ DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx,
+ msg->msg_name);
+ service_id = htons(srx->srx_service);
+ }
+ key = rx->key;
+ if (key && !rx->key->payload.data)
+ key = NULL;
+ bundle = rxrpc_get_bundle(rx, trans, key, service_id,
+ GFP_KERNEL);
+ if (IS_ERR(bundle))
+ return PTR_ERR(bundle);
+ }
+
+ call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID,
+ abort_code == 0, GFP_KERNEL);
+ if (trans)
+ rxrpc_put_bundle(trans, bundle);
+ if (IS_ERR(call)) {
+ _leave(" = %ld", PTR_ERR(call));
+ return PTR_ERR(call);
+ }
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ /* it's too late for this call */
+ ret = -ESHUTDOWN;
+ } else if (cmd == RXRPC_CMD_SEND_ABORT) {
+ rxrpc_send_abort(call, abort_code);
+ } else if (cmd != RXRPC_CMD_SEND_DATA) {
+ ret = -EINVAL;
+ } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
+ /* request phase complete for this client call */
+ ret = -EPROTO;
+ } else {
+ ret = rxrpc_send_data(rx, call, msg, len);
+ }
+
+ rxrpc_put_call(call);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
+ * @call: The call to send data through
+ * @msg: The data to send
+ * @len: The amount of data to send
+ *
+ * Allow a kernel service to send data on a call. The call must be in an state
+ * appropriate to sending data. No control data should be supplied in @msg,
+ * nor should an address be supplied. MSG_MORE should be flagged if there's
+ * more data to come, otherwise this data will end the transmission phase.
+ */
+int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
+ size_t len)
+{
+ int ret;
+
+ _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
+
+ ASSERTCMP(msg->msg_name, ==, NULL);
+ ASSERTCMP(msg->msg_control, ==, NULL);
+
+ lock_sock(&call->socket->sk);
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -ESHUTDOWN; /* it's too late for this call */
+ } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ ret = -EPROTO; /* request phase complete for this client call */
+ } else {
+ ret = rxrpc_send_data(call->socket, call, msg, len);
+ }
+
+ release_sock(&call->socket->sk);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_send_data);
+
+/**
+ * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
+ * @call: The call to be aborted
+ * @abort_code: The abort code to stick into the ABORT packet
+ *
+ * Allow a kernel service to abort a call, if it's still in an abortable state.
+ */
+void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code)
+{
+ _enter("{%d},%d", call->debug_id, abort_code);
+
+ lock_sock(&call->socket->sk);
+
+ _debug("CALL %d USR %lx ST %d on CONN %p",
+ call->debug_id, call->user_call_ID, call->state, call->conn);
+
+ if (call->state < RXRPC_CALL_COMPLETE)
+ rxrpc_send_abort(call, abort_code);
+
+ release_sock(&call->socket->sk);
+ _leave("");
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_abort_call);
+
+/*
+ * send a message through a server socket
+ * - caller holds the socket locked
+ */
+int rxrpc_server_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
+{
+ enum rxrpc_command cmd;
+ struct rxrpc_call *call;
+ unsigned long user_call_ID = 0;
+ u32 abort_code = 0;
+ int ret;
+
+ _enter("");
+
+ ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code,
+ true);
+ if (ret < 0)
+ return ret;
+
+ if (cmd == RXRPC_CMD_ACCEPT) {
+ call = rxrpc_accept_call(rx, user_call_ID);
+ if (IS_ERR(call))
+ return PTR_ERR(call);
+ rxrpc_put_call(call);
+ return 0;
+ }
+
+ call = rxrpc_find_server_call(rx, user_call_ID);
+ if (!call)
+ return -EBADSLT;
+ if (call->state >= RXRPC_CALL_COMPLETE) {
+ ret = -ESHUTDOWN;
+ goto out;
+ }
+
+ switch (cmd) {
+ case RXRPC_CMD_SEND_DATA:
+ if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+ call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+ /* Tx phase not yet begun for this call */
+ ret = -EPROTO;
+ break;
+ }
+
+ ret = rxrpc_send_data(rx, call, msg, len);
+ break;
+
+ case RXRPC_CMD_SEND_ABORT:
+ rxrpc_send_abort(call, abort_code);
+ break;
+ default:
+ BUG();
+ }
+
+ out:
+ rxrpc_put_call(call);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * send a packet through the transport endpoint
+ */
+int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
+{
+ struct kvec iov[1];
+ struct msghdr msg;
+ int ret, opt;
+
+ _enter(",{%d}", skb->len);
+
+ iov[0].iov_base = skb->head;
+ iov[0].iov_len = skb->len;
+
+ msg.msg_name = &trans->peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(trans->peer->srx.transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ /* send the packet with the don't fragment bit set if we currently
+ * think it's small enough */
+ if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) {
+ down_read(&trans->local->defrag_sem);
+ /* send the packet by UDP
+ * - returns -EMSGSIZE if UDP would have to fragment the packet
+ * to go out of the interface
+ * - in which case, we'll have processed the ICMP error
+ * message and update the peer record
+ */
+ ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
+ iov[0].iov_len);
+
+ up_read(&trans->local->defrag_sem);
+ if (ret == -EMSGSIZE)
+ goto send_fragmentable;
+
+ _leave(" = %d [%u]", ret, trans->peer->maxdata);
+ return ret;
+ }
+
+send_fragmentable:
+ /* attempt to send this message with fragmentation enabled */
+ _debug("send fragment");
+
+ down_write(&trans->local->defrag_sem);
+ opt = IP_PMTUDISC_DONT;
+ ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER,
+ (char *) &opt, sizeof(opt));
+ if (ret == 0) {
+ ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
+ iov[0].iov_len);
+
+ opt = IP_PMTUDISC_DO;
+ kernel_setsockopt(trans->local->socket, SOL_IP,
+ IP_MTU_DISCOVER, (char *) &opt, sizeof(opt));
+ }
+
+ up_write(&trans->local->defrag_sem);
+ _leave(" = %d [frag %u]", ret, trans->peer->maxdata);
+ return ret;
+}
+
+/*
+ * wait for space to appear in the transmit/ACK window
+ * - caller holds the socket locked
+ */
+static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ long *timeo)
+{
+ DECLARE_WAITQUEUE(myself, current);
+ int ret;
+
+ _enter(",{%d},%ld",
+ CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz),
+ *timeo);
+
+ add_wait_queue(&call->tx_waitq, &myself);
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = 0;
+ if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ call->acks_winsz) > 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(*timeo);
+ break;
+ }
+
+ release_sock(&rx->sk);
+ *timeo = schedule_timeout(*timeo);
+ lock_sock(&rx->sk);
+ }
+
+ remove_wait_queue(&call->tx_waitq, &myself);
+ set_current_state(TASK_RUNNING);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * attempt to schedule an instant Tx resend
+ */
+static inline void rxrpc_instant_resend(struct rxrpc_call *call)
+{
+ read_lock_bh(&call->state_lock);
+ if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
+ clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ rxrpc_queue_call(call);
+ }
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * queue a packet for transmission, set the resend timer and attempt
+ * to send the packet immediately
+ */
+static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
+ bool last)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ int ret;
+
+ _net("queue skb %p [%d]", skb, call->acks_head);
+
+ ASSERT(call->acks_window != NULL);
+ call->acks_window[call->acks_head] = (unsigned long) skb;
+ smp_wmb();
+ call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
+
+ if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
+ _debug("________awaiting reply/ACK__________");
+ write_lock_bh(&call->state_lock);
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_SEND_REQUEST:
+ call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ break;
+ case RXRPC_CALL_SERVER_ACK_REQUEST:
+ call->state = RXRPC_CALL_SERVER_SEND_REPLY;
+ if (!last)
+ break;
+ case RXRPC_CALL_SERVER_SEND_REPLY:
+ call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
+ break;
+ default:
+ break;
+ }
+ write_unlock_bh(&call->state_lock);
+ }
+
+ _proto("Tx DATA %%%u { #%u }",
+ ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+
+ sp->need_resend = false;
+ sp->resend_at = jiffies + rxrpc_resend_timeout;
+ if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
+ _debug("run timer");
+ call->resend_timer.expires = sp->resend_at;
+ add_timer(&call->resend_timer);
+ }
+
+ /* attempt to cancel the rx-ACK timer, deferring reply transmission if
+ * we're ACK'ing the request phase of an incoming call */
+ ret = -EAGAIN;
+ if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
+ /* the packet may be freed by rxrpc_process_call() before this
+ * returns */
+ ret = rxrpc_send_packet(call->conn->trans, skb);
+ _net("sent skb %p", skb);
+ } else {
+ _debug("failed to delete ACK timer");
+ }
+
+ if (ret < 0) {
+ _debug("need instant resend %d", ret);
+ sp->need_resend = true;
+ rxrpc_instant_resend(call);
+ }
+
+ _leave("");
+}
+
+/*
+ * send data through a socket
+ * - must be called in process context
+ * - caller holds the socket locked
+ */
+static int rxrpc_send_data(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ struct msghdr *msg, size_t len)
+{
+ struct rxrpc_skb_priv *sp;
+ struct sk_buff *skb;
+ struct sock *sk = &rx->sk;
+ long timeo;
+ bool more;
+ int ret, copied;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ /* this should be in poll */
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ return -EPIPE;
+
+ more = msg->msg_flags & MSG_MORE;
+
+ skb = call->tx_pending;
+ call->tx_pending = NULL;
+
+ copied = 0;
+ do {
+ if (!skb) {
+ size_t size, chunk, max, space;
+
+ _debug("alloc");
+
+ if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ call->acks_winsz) <= 0) {
+ ret = -EAGAIN;
+ if (msg->msg_flags & MSG_DONTWAIT)
+ goto maybe_error;
+ ret = rxrpc_wait_for_tx_window(rx, call,
+ &timeo);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
+ max = call->conn->trans->peer->maxdata;
+ max -= call->conn->security_size;
+ max &= ~(call->conn->size_align - 1UL);
+
+ chunk = max;
+ if (chunk > msg_data_left(msg) && !more)
+ chunk = msg_data_left(msg);
+
+ space = chunk + call->conn->size_align;
+ space &= ~(call->conn->size_align - 1UL);
+
+ size = space + call->conn->header_size;
+
+ _debug("SIZE: %zu/%zu/%zu", chunk, space, size);
+
+ /* create a buffer that we can retain until it's ACK'd */
+ skb = sock_alloc_send_skb(
+ sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto maybe_error;
+
+ rxrpc_new_skb(skb);
+
+ _debug("ALLOC SEND %p", skb);
+
+ ASSERTCMP(skb->mark, ==, 0);
+
+ _debug("HS: %u", call->conn->header_size);
+ skb_reserve(skb, call->conn->header_size);
+ skb->len += call->conn->header_size;
+
+ sp = rxrpc_skb(skb);
+ sp->remain = chunk;
+ if (sp->remain > skb_tailroom(skb))
+ sp->remain = skb_tailroom(skb);
+
+ _net("skb: hr %d, tr %d, hl %d, rm %d",
+ skb_headroom(skb),
+ skb_tailroom(skb),
+ skb_headlen(skb),
+ sp->remain);
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ _debug("append");
+ sp = rxrpc_skb(skb);
+
+ /* append next segment of data to the current buffer */
+ if (msg_data_left(msg) > 0) {
+ int copy = skb_tailroom(skb);
+ ASSERTCMP(copy, >, 0);
+ if (copy > msg_data_left(msg))
+ copy = msg_data_left(msg);
+ if (copy > sp->remain)
+ copy = sp->remain;
+
+ _debug("add");
+ ret = skb_add_data(skb, &msg->msg_iter, copy);
+ _debug("added");
+ if (ret < 0)
+ goto efault;
+ sp->remain -= copy;
+ skb->mark += copy;
+ copied += copy;
+ }
+
+ /* check for the far side aborting the call or a network error
+ * occurring */
+ if (call->state > RXRPC_CALL_COMPLETE)
+ goto call_aborted;
+
+ /* add the packet to the send queue if it's now full */
+ if (sp->remain <= 0 ||
+ (msg_data_left(msg) == 0 && !more)) {
+ struct rxrpc_connection *conn = call->conn;
+ uint32_t seq;
+ size_t pad;
+
+ /* pad out if we're using security */
+ if (conn->security) {
+ pad = conn->security_size + skb->mark;
+ pad = conn->size_align - pad;
+ pad &= conn->size_align - 1;
+ _debug("pad %zu", pad);
+ if (pad)
+ memset(skb_put(skb, pad), 0, pad);
+ }
+
+ seq = atomic_inc_return(&call->sequence);
+
+ sp->hdr.epoch = conn->epoch;
+ sp->hdr.cid = call->cid;
+ sp->hdr.callNumber = call->call_id;
+ sp->hdr.seq = htonl(seq);
+ sp->hdr.serial =
+ htonl(atomic_inc_return(&conn->serial));
+ sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+ sp->hdr.userStatus = 0;
+ sp->hdr.securityIndex = conn->security_ix;
+ sp->hdr._rsvd = 0;
+ sp->hdr.serviceId = conn->service_id;
+
+ sp->hdr.flags = conn->out_clientflag;
+ if (msg_data_left(msg) == 0 && !more)
+ sp->hdr.flags |= RXRPC_LAST_PACKET;
+ else if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ call->acks_winsz) > 1)
+ sp->hdr.flags |= RXRPC_MORE_PACKETS;
+ if (more && seq & 1)
+ sp->hdr.flags |= RXRPC_REQUEST_ACK;
+
+ ret = rxrpc_secure_packet(
+ call, skb, skb->mark,
+ skb->head + sizeof(struct rxrpc_header));
+ if (ret < 0)
+ goto out;
+
+ memcpy(skb->head, &sp->hdr,
+ sizeof(struct rxrpc_header));
+ rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
+ skb = NULL;
+ }
+ } while (msg_data_left(msg) > 0);
+
+success:
+ ret = copied;
+out:
+ call->tx_pending = skb;
+ _leave(" = %d", ret);
+ return ret;
+
+call_aborted:
+ rxrpc_free_skb(skb);
+ if (call->state == RXRPC_CALL_NETWORK_ERROR)
+ ret = call->conn->trans->peer->net_error;
+ else
+ ret = -ECONNABORTED;
+ _leave(" = %d", ret);
+ return ret;
+
+maybe_error:
+ if (copied)
+ goto success;
+ goto out;
+
+efault:
+ ret = -EFAULT;
+ goto out;
+}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
new file mode 100644
index 000000000..bebaa4348
--- /dev/null
+++ b/net/rxrpc/ar-peer.c
@@ -0,0 +1,303 @@
+/* RxRPC remote transport endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_peers);
+static DEFINE_RWLOCK(rxrpc_peer_lock);
+static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq);
+
+static void rxrpc_destroy_peer(struct work_struct *work);
+
+/*
+ * assess the MTU size for the network interface through which this peer is
+ * reached
+ */
+static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ peer->if_mtu = 1500;
+
+ rt = ip_route_output_ports(&init_net, &fl4, NULL,
+ peer->srx.transport.sin.sin_addr.s_addr, 0,
+ htons(7000), htons(7001),
+ IPPROTO_UDP, 0, 0);
+ if (IS_ERR(rt)) {
+ _leave(" [route err %ld]", PTR_ERR(rt));
+ return;
+ }
+
+ peer->if_mtu = dst_mtu(&rt->dst);
+ dst_release(&rt->dst);
+
+ _leave(" [if_mtu %u]", peer->if_mtu);
+}
+
+/*
+ * allocate a new peer
+ */
+static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
+ gfp_t gfp)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
+ if (peer) {
+ INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer);
+ INIT_LIST_HEAD(&peer->link);
+ INIT_LIST_HEAD(&peer->error_targets);
+ spin_lock_init(&peer->lock);
+ atomic_set(&peer->usage, 1);
+ peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
+ memcpy(&peer->srx, srx, sizeof(*srx));
+
+ rxrpc_assess_MTU_size(peer);
+ peer->mtu = peer->if_mtu;
+
+ if (srx->transport.family == AF_INET) {
+ peer->hdrsize = sizeof(struct iphdr);
+ switch (srx->transport_type) {
+ case SOCK_DGRAM:
+ peer->hdrsize += sizeof(struct udphdr);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ } else {
+ BUG();
+ }
+
+ peer->hdrsize += sizeof(struct rxrpc_header);
+ peer->maxdata = peer->mtu - peer->hdrsize;
+ }
+
+ _leave(" = %p", peer);
+ return peer;
+}
+
+/*
+ * obtain a remote transport endpoint for the specified address
+ */
+struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp)
+{
+ struct rxrpc_peer *peer, *candidate;
+ const char *new = "old";
+ int usage;
+
+ _enter("{%d,%d,%pI4+%hu}",
+ srx->transport_type,
+ srx->transport_len,
+ &srx->transport.sin.sin_addr,
+ ntohs(srx->transport.sin.sin_port));
+
+ /* search the peer list first */
+ read_lock_bh(&rxrpc_peer_lock);
+ list_for_each_entry(peer, &rxrpc_peers, link) {
+ _debug("check PEER %d { u=%d t=%d l=%d }",
+ peer->debug_id,
+ atomic_read(&peer->usage),
+ peer->srx.transport_type,
+ peer->srx.transport_len);
+
+ if (atomic_read(&peer->usage) > 0 &&
+ peer->srx.transport_type == srx->transport_type &&
+ peer->srx.transport_len == srx->transport_len &&
+ memcmp(&peer->srx.transport,
+ &srx->transport,
+ srx->transport_len) == 0)
+ goto found_extant_peer;
+ }
+ read_unlock_bh(&rxrpc_peer_lock);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_peer(srx, gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ write_lock_bh(&rxrpc_peer_lock);
+
+ list_for_each_entry(peer, &rxrpc_peers, link) {
+ if (atomic_read(&peer->usage) > 0 &&
+ peer->srx.transport_type == srx->transport_type &&
+ peer->srx.transport_len == srx->transport_len &&
+ memcmp(&peer->srx.transport,
+ &srx->transport,
+ srx->transport_len) == 0)
+ goto found_extant_second;
+ }
+
+ /* we can now add the new candidate to the list */
+ peer = candidate;
+ candidate = NULL;
+ usage = atomic_read(&peer->usage);
+
+ list_add_tail(&peer->link, &rxrpc_peers);
+ write_unlock_bh(&rxrpc_peer_lock);
+ new = "new";
+
+success:
+ _net("PEER %s %d {%d,%u,%pI4+%hu}",
+ new,
+ peer->debug_id,
+ peer->srx.transport_type,
+ peer->srx.transport.family,
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+
+ _leave(" = %p {u=%d}", peer, usage);
+ return peer;
+
+ /* we found the peer in the list immediately */
+found_extant_peer:
+ usage = atomic_inc_return(&peer->usage);
+ read_unlock_bh(&rxrpc_peer_lock);
+ goto success;
+
+ /* we found the peer on the second time through the list */
+found_extant_second:
+ usage = atomic_inc_return(&peer->usage);
+ write_unlock_bh(&rxrpc_peer_lock);
+ kfree(candidate);
+ goto success;
+}
+
+/*
+ * find the peer associated with a packet
+ */
+struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local,
+ __be32 addr, __be16 port)
+{
+ struct rxrpc_peer *peer;
+
+ _enter("");
+
+ /* search the peer list */
+ read_lock_bh(&rxrpc_peer_lock);
+
+ if (local->srx.transport.family == AF_INET &&
+ local->srx.transport_type == SOCK_DGRAM
+ ) {
+ list_for_each_entry(peer, &rxrpc_peers, link) {
+ if (atomic_read(&peer->usage) > 0 &&
+ peer->srx.transport_type == SOCK_DGRAM &&
+ peer->srx.transport.family == AF_INET &&
+ peer->srx.transport.sin.sin_port == port &&
+ peer->srx.transport.sin.sin_addr.s_addr == addr)
+ goto found_UDP_peer;
+ }
+
+ goto new_UDP_peer;
+ }
+
+ read_unlock_bh(&rxrpc_peer_lock);
+ _leave(" = -EAFNOSUPPORT");
+ return ERR_PTR(-EAFNOSUPPORT);
+
+found_UDP_peer:
+ _net("Rx UDP DGRAM from peer %d", peer->debug_id);
+ atomic_inc(&peer->usage);
+ read_unlock_bh(&rxrpc_peer_lock);
+ _leave(" = %p", peer);
+ return peer;
+
+new_UDP_peer:
+ _net("Rx UDP DGRAM from NEW peer");
+ read_unlock_bh(&rxrpc_peer_lock);
+ _leave(" = -EBUSY [new]");
+ return ERR_PTR(-EBUSY);
+}
+
+/*
+ * release a remote transport endpoint
+ */
+void rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+ _enter("%p{u=%d}", peer, atomic_read(&peer->usage));
+
+ ASSERTCMP(atomic_read(&peer->usage), >, 0);
+
+ if (likely(!atomic_dec_and_test(&peer->usage))) {
+ _leave(" [in use]");
+ return;
+ }
+
+ rxrpc_queue_work(&peer->destroyer);
+ _leave("");
+}
+
+/*
+ * destroy a remote transport endpoint
+ */
+static void rxrpc_destroy_peer(struct work_struct *work)
+{
+ struct rxrpc_peer *peer =
+ container_of(work, struct rxrpc_peer, destroyer);
+
+ _enter("%p{%d}", peer, atomic_read(&peer->usage));
+
+ write_lock_bh(&rxrpc_peer_lock);
+ list_del(&peer->link);
+ write_unlock_bh(&rxrpc_peer_lock);
+
+ _net("DESTROY PEER %d", peer->debug_id);
+ kfree(peer);
+
+ if (list_empty(&rxrpc_peers))
+ wake_up_all(&rxrpc_peer_wq);
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the peer records from a transport endpoint rather
+ * than waiting for them to time out
+ */
+void __exit rxrpc_destroy_all_peers(void)
+{
+ DECLARE_WAITQUEUE(myself,current);
+
+ _enter("");
+
+ /* we simply have to wait for them to go away */
+ if (!list_empty(&rxrpc_peers)) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&rxrpc_peer_wq, &myself);
+
+ while (!list_empty(&rxrpc_peers)) {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+
+ remove_wait_queue(&rxrpc_peer_wq, &myself);
+ set_current_state(TASK_RUNNING);
+ }
+
+ _leave("");
+}
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
new file mode 100644
index 000000000..38047f713
--- /dev/null
+++ b/net/rxrpc/ar-proc.c
@@ -0,0 +1,192 @@
+/* /proc/net/ support for AF_RXRPC
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static const char *const rxrpc_conn_states[] = {
+ [RXRPC_CONN_UNUSED] = "Unused ",
+ [RXRPC_CONN_CLIENT] = "Client ",
+ [RXRPC_CONN_SERVER_UNSECURED] = "SvUnsec ",
+ [RXRPC_CONN_SERVER_CHALLENGING] = "SvChall ",
+ [RXRPC_CONN_SERVER] = "SvSecure",
+ [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort",
+ [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort",
+ [RXRPC_CONN_NETWORK_ERROR] = "NetError",
+};
+
+/*
+ * generate a list of extant and dead calls in /proc/net/rxrpc_calls
+ */
+static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+ read_lock(&rxrpc_call_lock);
+ return seq_list_start_head(&rxrpc_calls, *_pos);
+}
+
+static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &rxrpc_calls, pos);
+}
+
+static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&rxrpc_call_lock);
+}
+
+static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_transport *trans;
+ struct rxrpc_call *call;
+ char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+ if (v == &rxrpc_calls) {
+ seq_puts(seq,
+ "Proto Local Remote "
+ " SvID ConnID CallID End Use State Abort "
+ " UserID\n");
+ return 0;
+ }
+
+ call = list_entry(v, struct rxrpc_call, link);
+ trans = call->conn->trans;
+
+ sprintf(lbuff, "%pI4:%u",
+ &trans->local->srx.transport.sin.sin_addr,
+ ntohs(trans->local->srx.transport.sin.sin_port));
+
+ sprintf(rbuff, "%pI4:%u",
+ &trans->peer->srx.transport.sin.sin_addr,
+ ntohs(trans->peer->srx.transport.sin.sin_port));
+
+ seq_printf(seq,
+ "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
+ " %-8.8s %08x %lx\n",
+ lbuff,
+ rbuff,
+ ntohs(call->conn->service_id),
+ ntohl(call->conn->cid),
+ ntohl(call->call_id),
+ call->conn->in_clientflag ? "Svc" : "Clt",
+ atomic_read(&call->usage),
+ rxrpc_call_states[call->state],
+ call->abort_code,
+ call->user_call_ID);
+
+ return 0;
+}
+
+static const struct seq_operations rxrpc_call_seq_ops = {
+ .start = rxrpc_call_seq_start,
+ .next = rxrpc_call_seq_next,
+ .stop = rxrpc_call_seq_stop,
+ .show = rxrpc_call_seq_show,
+};
+
+static int rxrpc_call_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rxrpc_call_seq_ops);
+}
+
+const struct file_operations rxrpc_call_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rxrpc_call_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * generate a list of extant virtual connections in /proc/net/rxrpc_conns
+ */
+static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+ read_lock(&rxrpc_connection_lock);
+ return seq_list_start_head(&rxrpc_connections, *_pos);
+}
+
+static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ return seq_list_next(v, &rxrpc_connections, pos);
+}
+
+static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
+{
+ read_unlock(&rxrpc_connection_lock);
+}
+
+static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_connection *conn;
+ struct rxrpc_transport *trans;
+ char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+ if (v == &rxrpc_connections) {
+ seq_puts(seq,
+ "Proto Local Remote "
+ " SvID ConnID Calls End Use State Key "
+ " Serial ISerial\n"
+ );
+ return 0;
+ }
+
+ conn = list_entry(v, struct rxrpc_connection, link);
+ trans = conn->trans;
+
+ sprintf(lbuff, "%pI4:%u",
+ &trans->local->srx.transport.sin.sin_addr,
+ ntohs(trans->local->srx.transport.sin.sin_port));
+
+ sprintf(rbuff, "%pI4:%u",
+ &trans->peer->srx.transport.sin.sin_addr,
+ ntohs(trans->peer->srx.transport.sin.sin_port));
+
+ seq_printf(seq,
+ "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u"
+ " %s %08x %08x %08x\n",
+ lbuff,
+ rbuff,
+ ntohs(conn->service_id),
+ ntohl(conn->cid),
+ conn->call_counter,
+ conn->in_clientflag ? "Svc" : "Clt",
+ atomic_read(&conn->usage),
+ rxrpc_conn_states[conn->state],
+ key_serial(conn->key),
+ atomic_read(&conn->serial),
+ atomic_read(&conn->hi_serial));
+
+ return 0;
+}
+
+static const struct seq_operations rxrpc_connection_seq_ops = {
+ .start = rxrpc_connection_seq_start,
+ .next = rxrpc_connection_seq_next,
+ .stop = rxrpc_connection_seq_stop,
+ .show = rxrpc_connection_seq_show,
+};
+
+
+static int rxrpc_connection_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &rxrpc_connection_seq_ops);
+}
+
+const struct file_operations rxrpc_connection_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = rxrpc_connection_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
new file mode 100644
index 000000000..b92beded7
--- /dev/null
+++ b/net/rxrpc/ar-recvmsg.c
@@ -0,0 +1,424 @@
+/* RxRPC recvmsg() implementation
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * removal a call's user ID from the socket tree to make the user ID available
+ * again and so that it won't be seen again in association with that call
+ */
+void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
+{
+ _debug("RELEASE CALL %d", call->debug_id);
+
+ if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+ write_lock_bh(&rx->call_lock);
+ rb_erase(&call->sock_node, &call->socket->calls);
+ clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+ write_unlock_bh(&rx->call_lock);
+ }
+
+ read_lock_bh(&call->state_lock);
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ rxrpc_queue_call(call);
+ read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * receive a message from an RxRPC socket
+ * - we need to be careful about two or more threads calling recvmsg
+ * simultaneously
+ */
+int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_call *call = NULL, *continue_call = NULL;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+ struct sk_buff *skb;
+ long timeo;
+ int copy, ret, ullen, offset, copied = 0;
+ u32 abort_code;
+
+ DEFINE_WAIT(wait);
+
+ _enter(",,,%zu,%d", len, flags);
+
+ if (flags & (MSG_OOB | MSG_TRUNC))
+ return -EOPNOTSUPP;
+
+ ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long);
+
+ timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
+ msg->msg_flags |= MSG_MORE;
+
+ lock_sock(&rx->sk);
+
+ for (;;) {
+ /* return immediately if a client socket has no outstanding
+ * calls */
+ if (RB_EMPTY_ROOT(&rx->calls)) {
+ if (copied)
+ goto out;
+ if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
+ release_sock(&rx->sk);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ return -ENODATA;
+ }
+ }
+
+ /* get the next message on the Rx queue */
+ skb = skb_peek(&rx->sk.sk_receive_queue);
+ if (!skb) {
+ /* nothing remains on the queue */
+ if (copied &&
+ (flags & MSG_PEEK || timeo == 0))
+ goto out;
+
+ /* wait for a message to turn up */
+ release_sock(&rx->sk);
+ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
+ TASK_INTERRUPTIBLE);
+ ret = sock_error(&rx->sk);
+ if (ret)
+ goto wait_error;
+
+ if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
+ if (signal_pending(current))
+ goto wait_interrupted;
+ timeo = schedule_timeout(timeo);
+ }
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ lock_sock(&rx->sk);
+ continue;
+ }
+
+ peek_next_packet:
+ sp = rxrpc_skb(skb);
+ call = sp->call;
+ ASSERT(call != NULL);
+
+ _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]);
+
+ /* make sure we wait for the state to be updated in this call */
+ spin_lock_bh(&call->lock);
+ spin_unlock_bh(&call->lock);
+
+ if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+ _debug("packet from released call");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ continue;
+ }
+
+ /* determine whether to continue last data receive */
+ if (continue_call) {
+ _debug("maybe cont");
+ if (call != continue_call ||
+ skb->mark != RXRPC_SKB_MARK_DATA) {
+ release_sock(&rx->sk);
+ rxrpc_put_call(continue_call);
+ _leave(" = %d [noncont]", copied);
+ return copied;
+ }
+ }
+
+ rxrpc_get_call(call);
+
+ /* copy the peer address and timestamp */
+ if (!continue_call) {
+ if (msg->msg_name) {
+ size_t len =
+ sizeof(call->conn->trans->peer->srx);
+ memcpy(msg->msg_name,
+ &call->conn->trans->peer->srx, len);
+ msg->msg_namelen = len;
+ }
+ sock_recv_timestamp(msg, &rx->sk, skb);
+ }
+
+ /* receive the message */
+ if (skb->mark != RXRPC_SKB_MARK_DATA)
+ goto receive_non_data_message;
+
+ _debug("recvmsg DATA #%u { %d, %d }",
+ ntohl(sp->hdr.seq), skb->len, sp->offset);
+
+ if (!continue_call) {
+ /* only set the control data once per recvmsg() */
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ ullen, &call->user_call_ID);
+ if (ret < 0)
+ goto copy_error;
+ ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+ }
+
+ ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
+ ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
+ call->rx_data_recv = ntohl(sp->hdr.seq);
+
+ ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+
+ offset = sp->offset;
+ copy = skb->len - offset;
+ if (copy > len - copied)
+ copy = len - copied;
+
+ ret = skb_copy_datagram_msg(skb, offset, msg, copy);
+
+ if (ret < 0)
+ goto copy_error;
+
+ /* handle piecemeal consumption of data packets */
+ _debug("copied %d+%d", copy, copied);
+
+ offset += copy;
+ copied += copy;
+
+ if (!(flags & MSG_PEEK))
+ sp->offset = offset;
+
+ if (sp->offset < skb->len) {
+ _debug("buffer full");
+ ASSERTCMP(copied, ==, len);
+ break;
+ }
+
+ /* we transferred the whole data packet */
+ if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+ _debug("last");
+ if (call->conn->out_clientflag) {
+ /* last byte of reply received */
+ ret = copied;
+ goto terminal_message;
+ }
+
+ /* last bit of request received */
+ if (!(flags & MSG_PEEK)) {
+ _debug("eat packet");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) !=
+ skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+ msg->msg_flags &= ~MSG_MORE;
+ break;
+ }
+
+ /* move on to the next data message */
+ _debug("next");
+ if (!continue_call)
+ continue_call = sp->call;
+ else
+ rxrpc_put_call(call);
+ call = NULL;
+
+ if (flags & MSG_PEEK) {
+ _debug("peek next");
+ skb = skb->next;
+ if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue)
+ break;
+ goto peek_next_packet;
+ }
+
+ _debug("eat packet");
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+
+ /* end of non-terminal data packet reception for the moment */
+ _debug("end rcv data");
+out:
+ release_sock(&rx->sk);
+ if (call)
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d [data]", copied);
+ return copied;
+
+ /* handle non-DATA messages such as aborts, incoming connections and
+ * final ACKs */
+receive_non_data_message:
+ _debug("non-data");
+
+ if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
+ _debug("RECV NEW CALL");
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
+ if (ret < 0)
+ goto copy_error;
+ if (!(flags & MSG_PEEK)) {
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ }
+ goto out;
+ }
+
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+ ullen, &call->user_call_ID);
+ if (ret < 0)
+ goto copy_error;
+ ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_DATA:
+ BUG();
+ case RXRPC_SKB_MARK_FINAL_ACK:
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_BUSY:
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_REMOTE_ABORT:
+ abort_code = call->abort_code;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_NET_ERROR:
+ _debug("RECV NET ERROR %d", sp->error);
+ abort_code = sp->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
+ break;
+ case RXRPC_SKB_MARK_LOCAL_ERROR:
+ _debug("RECV LOCAL ERROR %d", sp->error);
+ abort_code = sp->error;
+ ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
+ &abort_code);
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (ret < 0)
+ goto copy_error;
+
+terminal_message:
+ _debug("terminal");
+ msg->msg_flags &= ~MSG_MORE;
+ msg->msg_flags |= MSG_EOR;
+
+ if (!(flags & MSG_PEEK)) {
+ _net("free terminal skb %p", skb);
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ rxrpc_free_skb(skb);
+ rxrpc_remove_user_ID(rx, call);
+ }
+
+ release_sock(&rx->sk);
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d", ret);
+ return ret;
+
+copy_error:
+ _debug("copy error");
+ release_sock(&rx->sk);
+ rxrpc_put_call(call);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ _leave(" = %d", ret);
+ return ret;
+
+wait_interrupted:
+ ret = sock_intr_errno(timeo);
+wait_error:
+ finish_wait(sk_sleep(&rx->sk), &wait);
+ if (continue_call)
+ rxrpc_put_call(continue_call);
+ if (copied)
+ copied = ret;
+ _leave(" = %d [waitfail %d]", copied, ret);
+ return copied;
+
+}
+
+/**
+ * rxrpc_kernel_data_delivered - Record delivery of data message
+ * @skb: Message holding data
+ *
+ * Record the delivery of a data message. This permits RxRPC to keep its
+ * tracking correct. The socket buffer will be deleted.
+ */
+void rxrpc_kernel_data_delivered(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_call *call = sp->call;
+
+ ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
+ ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
+ call->rx_data_recv = ntohl(sp->hdr.seq);
+
+ ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+ rxrpc_free_skb(skb);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_data_delivered);
+
+/**
+ * rxrpc_kernel_is_data_last - Determine if data message is last one
+ * @skb: Message holding data
+ *
+ * Determine if data message is last one for the parent call.
+ */
+bool rxrpc_kernel_is_data_last(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA);
+
+ return sp->hdr.flags & RXRPC_LAST_PACKET;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_is_data_last);
+
+/**
+ * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message
+ * @skb: Message indicating an abort
+ *
+ * Get the abort code from an RxRPC abort message.
+ */
+u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT);
+
+ return sp->call->abort_code;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
+
+/**
+ * rxrpc_kernel_get_error - Get the error number from an RxRPC error message
+ * @skb: Message indicating an error
+ *
+ * Get the error number from an RxRPC error message.
+ */
+int rxrpc_kernel_get_error_number(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ return sp->error;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_error_number);
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
new file mode 100644
index 000000000..49b3cc31e
--- /dev/null
+++ b/net/rxrpc/ar-security.c
@@ -0,0 +1,264 @@
+/* RxRPC security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_security_methods);
+static DECLARE_RWSEM(rxrpc_security_sem);
+
+/*
+ * get an RxRPC security module
+ */
+static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec)
+{
+ return try_module_get(sec->owner) ? sec : NULL;
+}
+
+/*
+ * release an RxRPC security module
+ */
+static void rxrpc_security_put(struct rxrpc_security *sec)
+{
+ module_put(sec->owner);
+}
+
+/*
+ * look up an rxrpc security module
+ */
+static struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
+{
+ struct rxrpc_security *sec = NULL;
+
+ _enter("");
+
+ down_read(&rxrpc_security_sem);
+
+ list_for_each_entry(sec, &rxrpc_security_methods, link) {
+ if (sec->security_index == security_index) {
+ if (unlikely(!rxrpc_security_get(sec)))
+ break;
+ goto out;
+ }
+ }
+
+ sec = NULL;
+out:
+ up_read(&rxrpc_security_sem);
+ _leave(" = %p [%s]", sec, sec ? sec->name : "");
+ return sec;
+}
+
+/**
+ * rxrpc_register_security - register an RxRPC security handler
+ * @sec: security module
+ *
+ * register an RxRPC security handler for use by RxRPC
+ */
+int rxrpc_register_security(struct rxrpc_security *sec)
+{
+ struct rxrpc_security *psec;
+ int ret;
+
+ _enter("");
+ down_write(&rxrpc_security_sem);
+
+ ret = -EEXIST;
+ list_for_each_entry(psec, &rxrpc_security_methods, link) {
+ if (psec->security_index == sec->security_index)
+ goto out;
+ }
+
+ list_add(&sec->link, &rxrpc_security_methods);
+
+ printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n",
+ sec->security_index, sec->name);
+ ret = 0;
+
+out:
+ up_write(&rxrpc_security_sem);
+ _leave(" = %d", ret);
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(rxrpc_register_security);
+
+/**
+ * rxrpc_unregister_security - unregister an RxRPC security handler
+ * @sec: security module
+ *
+ * unregister an RxRPC security handler
+ */
+void rxrpc_unregister_security(struct rxrpc_security *sec)
+{
+
+ _enter("");
+ down_write(&rxrpc_security_sem);
+ list_del_init(&sec->link);
+ up_write(&rxrpc_security_sem);
+
+ printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n",
+ sec->security_index, sec->name);
+}
+
+EXPORT_SYMBOL_GPL(rxrpc_unregister_security);
+
+/*
+ * initialise the security on a client connection
+ */
+int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
+{
+ struct rxrpc_key_token *token;
+ struct rxrpc_security *sec;
+ struct key *key = conn->key;
+ int ret;
+
+ _enter("{%d},{%x}", conn->debug_id, key_serial(key));
+
+ if (!key)
+ return 0;
+
+ ret = key_validate(key);
+ if (ret < 0)
+ return ret;
+
+ if (!key->payload.data)
+ return -EKEYREJECTED;
+ token = key->payload.data;
+
+ sec = rxrpc_security_lookup(token->security_index);
+ if (!sec)
+ return -EKEYREJECTED;
+ conn->security = sec;
+
+ ret = conn->security->init_connection_security(conn);
+ if (ret < 0) {
+ rxrpc_security_put(conn->security);
+ conn->security = NULL;
+ return ret;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * initialise the security on a server connection
+ */
+int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
+{
+ struct rxrpc_security *sec;
+ struct rxrpc_local *local = conn->trans->local;
+ struct rxrpc_sock *rx;
+ struct key *key;
+ key_ref_t kref;
+ char kdesc[5+1+3+1];
+
+ _enter("");
+
+ sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix);
+
+ sec = rxrpc_security_lookup(conn->security_ix);
+ if (!sec) {
+ _leave(" = -ENOKEY [lookup]");
+ return -ENOKEY;
+ }
+
+ /* find the service */
+ read_lock_bh(&local->services_lock);
+ list_for_each_entry(rx, &local->services, listen_link) {
+ if (rx->service_id == conn->service_id)
+ goto found_service;
+ }
+
+ /* the service appears to have died */
+ read_unlock_bh(&local->services_lock);
+ rxrpc_security_put(sec);
+ _leave(" = -ENOENT");
+ return -ENOENT;
+
+found_service:
+ if (!rx->securities) {
+ read_unlock_bh(&local->services_lock);
+ rxrpc_security_put(sec);
+ _leave(" = -ENOKEY");
+ return -ENOKEY;
+ }
+
+ /* look through the service's keyring */
+ kref = keyring_search(make_key_ref(rx->securities, 1UL),
+ &key_type_rxrpc_s, kdesc);
+ if (IS_ERR(kref)) {
+ read_unlock_bh(&local->services_lock);
+ rxrpc_security_put(sec);
+ _leave(" = %ld [search]", PTR_ERR(kref));
+ return PTR_ERR(kref);
+ }
+
+ key = key_ref_to_ptr(kref);
+ read_unlock_bh(&local->services_lock);
+
+ conn->server_key = key;
+ conn->security = sec;
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * secure a packet prior to transmission
+ */
+int rxrpc_secure_packet(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
+{
+ if (call->conn->security)
+ return call->conn->security->secure_packet(
+ call, skb, data_size, sechdr);
+ return 0;
+}
+
+/*
+ * secure a packet prior to transmission
+ */
+int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ if (call->conn->security)
+ return call->conn->security->verify_packet(
+ call, skb, _abort_code);
+ return 0;
+}
+
+/*
+ * clear connection security
+ */
+void rxrpc_clear_conn_security(struct rxrpc_connection *conn)
+{
+ _enter("{%d}", conn->debug_id);
+
+ if (conn->security) {
+ conn->security->clear(conn);
+ rxrpc_security_put(conn->security);
+ conn->security = NULL;
+ }
+
+ key_put(conn->key);
+ key_put(conn->server_key);
+}
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
new file mode 100644
index 000000000..4cfab49e3
--- /dev/null
+++ b/net/rxrpc/ar-skbuff.c
@@ -0,0 +1,137 @@
+/* ar-skbuff.c: socket buffer destruction handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * set up for the ACK at the end of the receive phase when we discard the final
+ * receive phase data packet
+ * - called with softirqs disabled
+ */
+static void rxrpc_request_final_ACK(struct rxrpc_call *call)
+{
+ /* the call may be aborted before we have a chance to ACK it */
+ write_lock(&call->state_lock);
+
+ switch (call->state) {
+ case RXRPC_CALL_CLIENT_RECV_REPLY:
+ call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
+ _debug("request final ACK");
+
+ /* get an extra ref on the call for the final-ACK generator to
+ * release */
+ rxrpc_get_call(call);
+ set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+ if (try_to_del_timer_sync(&call->ack_timer) >= 0)
+ rxrpc_queue_call(call);
+ break;
+
+ case RXRPC_CALL_SERVER_RECV_REQUEST:
+ call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
+ default:
+ break;
+ }
+
+ write_unlock(&call->state_lock);
+}
+
+/*
+ * drop the bottom ACK off of the call ACK window and advance the window
+ */
+static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
+ struct rxrpc_skb_priv *sp)
+{
+ int loop;
+ u32 seq;
+
+ spin_lock_bh(&call->lock);
+
+ _debug("hard ACK #%u", ntohl(sp->hdr.seq));
+
+ for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+ call->ackr_window[loop] >>= 1;
+ call->ackr_window[loop] |=
+ call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
+ }
+
+ seq = ntohl(sp->hdr.seq);
+ ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
+ call->rx_data_eaten = seq;
+
+ if (call->ackr_win_top < UINT_MAX)
+ call->ackr_win_top++;
+
+ ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+ call->rx_data_post, >=, call->rx_data_recv);
+ ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+ call->rx_data_recv, >=, call->rx_data_eaten);
+
+ if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+ rxrpc_request_final_ACK(call);
+ } else if (atomic_dec_and_test(&call->ackr_not_idle) &&
+ test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
+ /* We previously soft-ACK'd some received packets that have now
+ * been consumed, so send a hard-ACK if no more packets are
+ * immediately forthcoming to allow the transmitter to free up
+ * its Tx bufferage.
+ */
+ _debug("send Rx idle ACK");
+ __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial,
+ false);
+ }
+
+ spin_unlock_bh(&call->lock);
+}
+
+/*
+ * destroy a packet that has an RxRPC control buffer
+ * - advance the hard-ACK state of the parent call (done here in case something
+ * in the kernel bypasses recvmsg() and steals the packet directly off of the
+ * socket receive queue)
+ */
+void rxrpc_packet_destructor(struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ struct rxrpc_call *call = sp->call;
+
+ _enter("%p{%p}", skb, call);
+
+ if (call) {
+ /* send the final ACK on a client call */
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
+ rxrpc_hard_ACK_data(call, sp);
+ rxrpc_put_call(call);
+ sp->call = NULL;
+ }
+
+ if (skb->sk)
+ sock_rfree(skb);
+ _leave("");
+}
+
+/**
+ * rxrpc_kernel_free_skb - Free an RxRPC socket buffer
+ * @skb: The socket buffer to be freed
+ *
+ * Let RxRPC free its own socket buffer, permitting it to maintain debug
+ * accounting.
+ */
+void rxrpc_kernel_free_skb(struct sk_buff *skb)
+{
+ rxrpc_free_skb(skb);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
new file mode 100644
index 000000000..1976dec84
--- /dev/null
+++ b/net/rxrpc/ar-transport.c
@@ -0,0 +1,283 @@
+/* RxRPC point-to-point transport session management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Time after last use at which transport record is cleaned up.
+ */
+unsigned rxrpc_transport_expiry = 3600 * 24;
+
+static void rxrpc_transport_reaper(struct work_struct *work);
+
+static LIST_HEAD(rxrpc_transports);
+static DEFINE_RWLOCK(rxrpc_transport_lock);
+static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper);
+
+/*
+ * allocate a new transport session manager
+ */
+static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
+ struct rxrpc_peer *peer,
+ gfp_t gfp)
+{
+ struct rxrpc_transport *trans;
+
+ _enter("");
+
+ trans = kzalloc(sizeof(struct rxrpc_transport), gfp);
+ if (trans) {
+ trans->local = local;
+ trans->peer = peer;
+ INIT_LIST_HEAD(&trans->link);
+ trans->bundles = RB_ROOT;
+ trans->client_conns = RB_ROOT;
+ trans->server_conns = RB_ROOT;
+ skb_queue_head_init(&trans->error_queue);
+ spin_lock_init(&trans->client_lock);
+ rwlock_init(&trans->conn_lock);
+ atomic_set(&trans->usage, 1);
+ trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
+
+ if (peer->srx.transport.family == AF_INET) {
+ switch (peer->srx.transport_type) {
+ case SOCK_DGRAM:
+ INIT_WORK(&trans->error_handler,
+ rxrpc_UDP_error_handler);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ } else {
+ BUG();
+ }
+ }
+
+ _leave(" = %p", trans);
+ return trans;
+}
+
+/*
+ * obtain a transport session for the nominated endpoints
+ */
+struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local,
+ struct rxrpc_peer *peer,
+ gfp_t gfp)
+{
+ struct rxrpc_transport *trans, *candidate;
+ const char *new = "old";
+ int usage;
+
+ _enter("{%pI4+%hu},{%pI4+%hu},",
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port),
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+
+ /* search the transport list first */
+ read_lock_bh(&rxrpc_transport_lock);
+ list_for_each_entry(trans, &rxrpc_transports, link) {
+ if (trans->local == local && trans->peer == peer)
+ goto found_extant_transport;
+ }
+ read_unlock_bh(&rxrpc_transport_lock);
+
+ /* not yet present - create a candidate for a new record and then
+ * redo the search */
+ candidate = rxrpc_alloc_transport(local, peer, gfp);
+ if (!candidate) {
+ _leave(" = -ENOMEM");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ write_lock_bh(&rxrpc_transport_lock);
+
+ list_for_each_entry(trans, &rxrpc_transports, link) {
+ if (trans->local == local && trans->peer == peer)
+ goto found_extant_second;
+ }
+
+ /* we can now add the new candidate to the list */
+ trans = candidate;
+ candidate = NULL;
+ usage = atomic_read(&trans->usage);
+
+ rxrpc_get_local(trans->local);
+ atomic_inc(&trans->peer->usage);
+ list_add_tail(&trans->link, &rxrpc_transports);
+ write_unlock_bh(&rxrpc_transport_lock);
+ new = "new";
+
+success:
+ _net("TRANSPORT %s %d local %d -> peer %d",
+ new,
+ trans->debug_id,
+ trans->local->debug_id,
+ trans->peer->debug_id);
+
+ _leave(" = %p {u=%d}", trans, usage);
+ return trans;
+
+ /* we found the transport in the list immediately */
+found_extant_transport:
+ usage = atomic_inc_return(&trans->usage);
+ read_unlock_bh(&rxrpc_transport_lock);
+ goto success;
+
+ /* we found the transport on the second time through the list */
+found_extant_second:
+ usage = atomic_inc_return(&trans->usage);
+ write_unlock_bh(&rxrpc_transport_lock);
+ kfree(candidate);
+ goto success;
+}
+
+/*
+ * find the transport connecting two endpoints
+ */
+struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local,
+ struct rxrpc_peer *peer)
+{
+ struct rxrpc_transport *trans;
+
+ _enter("{%pI4+%hu},{%pI4+%hu},",
+ &local->srx.transport.sin.sin_addr,
+ ntohs(local->srx.transport.sin.sin_port),
+ &peer->srx.transport.sin.sin_addr,
+ ntohs(peer->srx.transport.sin.sin_port));
+
+ /* search the transport list */
+ read_lock_bh(&rxrpc_transport_lock);
+
+ list_for_each_entry(trans, &rxrpc_transports, link) {
+ if (trans->local == local && trans->peer == peer)
+ goto found_extant_transport;
+ }
+
+ read_unlock_bh(&rxrpc_transport_lock);
+ _leave(" = NULL");
+ return NULL;
+
+found_extant_transport:
+ atomic_inc(&trans->usage);
+ read_unlock_bh(&rxrpc_transport_lock);
+ _leave(" = %p", trans);
+ return trans;
+}
+
+/*
+ * release a transport session
+ */
+void rxrpc_put_transport(struct rxrpc_transport *trans)
+{
+ _enter("%p{u=%d}", trans, atomic_read(&trans->usage));
+
+ ASSERTCMP(atomic_read(&trans->usage), >, 0);
+
+ trans->put_time = get_seconds();
+ if (unlikely(atomic_dec_and_test(&trans->usage))) {
+ _debug("zombie");
+ /* let the reaper determine the timeout to avoid a race with
+ * overextending the timeout if the reaper is running at the
+ * same time */
+ rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
+ }
+ _leave("");
+}
+
+/*
+ * clean up a transport session
+ */
+static void rxrpc_cleanup_transport(struct rxrpc_transport *trans)
+{
+ _net("DESTROY TRANS %d", trans->debug_id);
+
+ rxrpc_purge_queue(&trans->error_queue);
+
+ rxrpc_put_local(trans->local);
+ rxrpc_put_peer(trans->peer);
+ kfree(trans);
+}
+
+/*
+ * reap dead transports that have passed their expiry date
+ */
+static void rxrpc_transport_reaper(struct work_struct *work)
+{
+ struct rxrpc_transport *trans, *_p;
+ unsigned long now, earliest, reap_time;
+
+ LIST_HEAD(graveyard);
+
+ _enter("");
+
+ now = get_seconds();
+ earliest = ULONG_MAX;
+
+ /* extract all the transports that have been dead too long */
+ write_lock_bh(&rxrpc_transport_lock);
+ list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) {
+ _debug("reap TRANS %d { u=%d t=%ld }",
+ trans->debug_id, atomic_read(&trans->usage),
+ (long) now - (long) trans->put_time);
+
+ if (likely(atomic_read(&trans->usage) > 0))
+ continue;
+
+ reap_time = trans->put_time + rxrpc_transport_expiry;
+ if (reap_time <= now)
+ list_move_tail(&trans->link, &graveyard);
+ else if (reap_time < earliest)
+ earliest = reap_time;
+ }
+ write_unlock_bh(&rxrpc_transport_lock);
+
+ if (earliest != ULONG_MAX) {
+ _debug("reschedule reaper %ld", (long) earliest - now);
+ ASSERTCMP(earliest, >, now);
+ rxrpc_queue_delayed_work(&rxrpc_transport_reap,
+ (earliest - now) * HZ);
+ }
+
+ /* then destroy all those pulled out */
+ while (!list_empty(&graveyard)) {
+ trans = list_entry(graveyard.next, struct rxrpc_transport,
+ link);
+ list_del_init(&trans->link);
+
+ ASSERTCMP(atomic_read(&trans->usage), ==, 0);
+ rxrpc_cleanup_transport(trans);
+ }
+
+ _leave("");
+}
+
+/*
+ * preemptively destroy all the transport session records rather than waiting
+ * for them to time out
+ */
+void __exit rxrpc_destroy_all_transports(void)
+{
+ _enter("");
+
+ rxrpc_transport_expiry = 0;
+ cancel_delayed_work(&rxrpc_transport_reap);
+ rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
+
+ _leave("");
+}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
new file mode 100644
index 000000000..f226709eb
--- /dev/null
+++ b/net/rxrpc/rxkad.c
@@ -0,0 +1,1161 @@
+/* Kerberos-based RxRPC security
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#define rxrpc_debug rxkad_debug
+#include "ar-internal.h"
+
+#define RXKAD_VERSION 2
+#define MAXKRB5TICKETLEN 1024
+#define RXKAD_TKT_TYPE_KERBEROS_V5 256
+#define ANAME_SZ 40 /* size of authentication name */
+#define INST_SZ 40 /* size of principal's instance */
+#define REALM_SZ 40 /* size of principal's auth domain */
+#define SNAME_SZ 40 /* size of service name */
+
+unsigned int rxrpc_debug;
+module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "rxkad debugging mask");
+
+struct rxkad_level1_hdr {
+ __be32 data_size; /* true data size (excluding padding) */
+};
+
+struct rxkad_level2_hdr {
+ __be32 data_size; /* true data size (excluding padding) */
+ __be32 checksum; /* decrypted data checksum */
+};
+
+MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos 4)");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+/*
+ * this holds a pinned cipher so that keventd doesn't get called by the cipher
+ * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
+ * packets
+ */
+static struct crypto_blkcipher *rxkad_ci;
+static DEFINE_MUTEX(rxkad_ci_mutex);
+
+/*
+ * initialise connection security
+ */
+static int rxkad_init_connection_security(struct rxrpc_connection *conn)
+{
+ struct crypto_blkcipher *ci;
+ struct rxrpc_key_token *token;
+ int ret;
+
+ _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
+
+ token = conn->key->payload.data;
+ conn->security_ix = token->security_index;
+
+ ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ci)) {
+ _debug("no cipher");
+ ret = PTR_ERR(ci);
+ goto error;
+ }
+
+ if (crypto_blkcipher_setkey(ci, token->kad->session_key,
+ sizeof(token->kad->session_key)) < 0)
+ BUG();
+
+ switch (conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ break;
+ case RXRPC_SECURITY_AUTH:
+ conn->size_align = 8;
+ conn->security_size = sizeof(struct rxkad_level1_hdr);
+ conn->header_size += sizeof(struct rxkad_level1_hdr);
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ conn->size_align = 8;
+ conn->security_size = sizeof(struct rxkad_level2_hdr);
+ conn->header_size += sizeof(struct rxkad_level2_hdr);
+ break;
+ default:
+ ret = -EKEYREJECTED;
+ goto error;
+ }
+
+ conn->cipher = ci;
+ ret = 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * prime the encryption state with the invariant parts of a connection's
+ * description
+ */
+static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
+{
+ struct rxrpc_key_token *token;
+ struct blkcipher_desc desc;
+ struct scatterlist sg[2];
+ struct rxrpc_crypt iv;
+ struct {
+ __be32 x[4];
+ } tmpbuf __attribute__((aligned(16))); /* must all be in same page */
+
+ _enter("");
+
+ if (!conn->key)
+ return;
+
+ token = conn->key->payload.data;
+ memcpy(&iv, token->kad->session_key, sizeof(iv));
+
+ desc.tfm = conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ tmpbuf.x[0] = conn->epoch;
+ tmpbuf.x[1] = conn->cid;
+ tmpbuf.x[2] = 0;
+ tmpbuf.x[3] = htonl(conn->security_ix);
+
+ sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+ sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
+ ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]);
+
+ _leave("");
+}
+
+/*
+ * partially encrypt a packet (level 1 security)
+ */
+static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 data_size,
+ void *sechdr)
+{
+ struct rxrpc_skb_priv *sp;
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[2];
+ struct {
+ struct rxkad_level1_hdr hdr;
+ __be32 first; /* first four bytes of data and padding */
+ } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ u16 check;
+
+ sp = rxrpc_skb(skb);
+
+ _enter("");
+
+ check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ data_size |= (u32) check << 16;
+
+ tmpbuf.hdr.data_size = htonl(data_size);
+ memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
+
+ /* start the encryption afresh */
+ memset(&iv, 0, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+ sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ memcpy(sechdr, &tmpbuf, sizeof(tmpbuf));
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * wholly encrypt a packet (level 2 security)
+ */
+static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 data_size,
+ void *sechdr)
+{
+ const struct rxrpc_key_token *token;
+ struct rxkad_level2_hdr rxkhdr
+ __attribute__((aligned(8))); /* must be all on one page */
+ struct rxrpc_skb_priv *sp;
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[16];
+ struct sk_buff *trailer;
+ unsigned int len;
+ u16 check;
+ int nsg;
+
+ sp = rxrpc_skb(skb);
+
+ _enter("");
+
+ check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+
+ rxkhdr.data_size = htonl(data_size | (u32) check << 16);
+ rxkhdr.checksum = 0;
+
+ /* encrypt from the session key */
+ token = call->conn->key->payload.data;
+ memcpy(&iv, token->kad->session_key, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
+ sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr));
+ crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr));
+
+ /* we want to encrypt the skbuff in-place */
+ nsg = skb_cow_data(skb, 0, &trailer);
+ if (nsg < 0 || nsg > 16)
+ return -ENOMEM;
+
+ len = data_size + call->conn->size_align - 1;
+ len &= ~(call->conn->size_align - 1);
+
+ sg_init_table(sg, nsg);
+ skb_to_sgvec(skb, sg, 0, len);
+ crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * checksum an RxRPC packet header
+ */
+static int rxkad_secure_packet(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
+{
+ struct rxrpc_skb_priv *sp;
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[2];
+ struct {
+ __be32 x[2];
+ } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ __be32 x;
+ u32 y;
+ int ret;
+
+ sp = rxrpc_skb(skb);
+
+ _enter("{%d{%x}},{#%u},%zu,",
+ call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq),
+ data_size);
+
+ if (!call->conn->cipher)
+ return 0;
+
+ ret = key_validate(call->conn->key);
+ if (ret < 0)
+ return ret;
+
+ /* continue encrypting from where we left off */
+ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ /* calculate the security checksum */
+ x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
+ x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
+ tmpbuf.x[0] = sp->hdr.callNumber;
+ tmpbuf.x[1] = x;
+
+ sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+ sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ y = ntohl(tmpbuf.x[1]);
+ y = (y >> 16) & 0xffff;
+ if (y == 0)
+ y = 1; /* zero checksums are not permitted */
+ sp->hdr.cksum = htons(y);
+
+ switch (call->conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ ret = 0;
+ break;
+ case RXRPC_SECURITY_AUTH:
+ ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr);
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ ret = rxkad_secure_packet_encrypt(call, skb, data_size,
+ sechdr);
+ break;
+ default:
+ ret = -EPERM;
+ break;
+ }
+
+ _leave(" = %d [set %hx]", ret, y);
+ return ret;
+}
+
+/*
+ * decrypt partial encryption on a packet (level 1 security)
+ */
+static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ struct rxkad_level1_hdr sechdr;
+ struct rxrpc_skb_priv *sp;
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[16];
+ struct sk_buff *trailer;
+ u32 data_size, buf;
+ u16 check;
+ int nsg;
+
+ _enter("");
+
+ sp = rxrpc_skb(skb);
+
+ /* we want to decrypt the skbuff in-place */
+ nsg = skb_cow_data(skb, 0, &trailer);
+ if (nsg < 0 || nsg > 16)
+ goto nomem;
+
+ sg_init_table(sg, nsg);
+ skb_to_sgvec(skb, sg, 0, 8);
+
+ /* start the decryption afresh */
+ memset(&iv, 0, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8);
+
+ /* remove the decrypted packet length */
+ if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
+ goto datalen_error;
+ if (!skb_pull(skb, sizeof(sechdr)))
+ BUG();
+
+ buf = ntohl(sechdr.data_size);
+ data_size = buf & 0xffff;
+
+ check = buf >> 16;
+ check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ check &= 0xffff;
+ if (check != 0) {
+ *_abort_code = RXKADSEALEDINCON;
+ goto protocol_error;
+ }
+
+ /* shorten the packet to remove the padding */
+ if (data_size > skb->len)
+ goto datalen_error;
+ else if (data_size < skb->len)
+ skb->len = data_size;
+
+ _leave(" = 0 [dlen=%x]", data_size);
+ return 0;
+
+datalen_error:
+ *_abort_code = RXKADDATALEN;
+protocol_error:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+
+nomem:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+
+/*
+ * wholly decrypt a packet (level 2 security)
+ */
+static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ const struct rxrpc_key_token *token;
+ struct rxkad_level2_hdr sechdr;
+ struct rxrpc_skb_priv *sp;
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist _sg[4], *sg;
+ struct sk_buff *trailer;
+ u32 data_size, buf;
+ u16 check;
+ int nsg;
+
+ _enter(",{%d}", skb->len);
+
+ sp = rxrpc_skb(skb);
+
+ /* we want to decrypt the skbuff in-place */
+ nsg = skb_cow_data(skb, 0, &trailer);
+ if (nsg < 0)
+ goto nomem;
+
+ sg = _sg;
+ if (unlikely(nsg > 4)) {
+ sg = kmalloc(sizeof(*sg) * nsg, GFP_NOIO);
+ if (!sg)
+ goto nomem;
+ }
+
+ sg_init_table(sg, nsg);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ /* decrypt from the session key */
+ token = call->conn->key->payload.data;
+ memcpy(&iv, token->kad->session_key, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len);
+ if (sg != _sg)
+ kfree(sg);
+
+ /* remove the decrypted packet length */
+ if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
+ goto datalen_error;
+ if (!skb_pull(skb, sizeof(sechdr)))
+ BUG();
+
+ buf = ntohl(sechdr.data_size);
+ data_size = buf & 0xffff;
+
+ check = buf >> 16;
+ check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ check &= 0xffff;
+ if (check != 0) {
+ *_abort_code = RXKADSEALEDINCON;
+ goto protocol_error;
+ }
+
+ /* shorten the packet to remove the padding */
+ if (data_size > skb->len)
+ goto datalen_error;
+ else if (data_size < skb->len)
+ skb->len = data_size;
+
+ _leave(" = 0 [dlen=%x]", data_size);
+ return 0;
+
+datalen_error:
+ *_abort_code = RXKADDATALEN;
+protocol_error:
+ _leave(" = -EPROTO");
+ return -EPROTO;
+
+nomem:
+ _leave(" = -ENOMEM");
+ return -ENOMEM;
+}
+
+/*
+ * verify the security on a received packet
+ */
+static int rxkad_verify_packet(const struct rxrpc_call *call,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ struct blkcipher_desc desc;
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[2];
+ struct {
+ __be32 x[2];
+ } tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+ __be32 x;
+ __be16 cksum;
+ u32 y;
+ int ret;
+
+ sp = rxrpc_skb(skb);
+
+ _enter("{%d{%x}},{#%u}",
+ call->debug_id, key_serial(call->conn->key),
+ ntohl(sp->hdr.seq));
+
+ if (!call->conn->cipher)
+ return 0;
+
+ if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) {
+ *_abort_code = RXKADINCONSISTENCY;
+ _leave(" = -EPROTO [not rxkad]");
+ return -EPROTO;
+ }
+
+ /* continue encrypting from where we left off */
+ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+ desc.tfm = call->conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ /* validate the security checksum */
+ x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
+ x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
+ tmpbuf.x[0] = call->call_id;
+ tmpbuf.x[1] = x;
+
+ sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+ sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+ crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ y = ntohl(tmpbuf.x[1]);
+ y = (y >> 16) & 0xffff;
+ if (y == 0)
+ y = 1; /* zero checksums are not permitted */
+
+ cksum = htons(y);
+ if (sp->hdr.cksum != cksum) {
+ *_abort_code = RXKADSEALEDINCON;
+ _leave(" = -EPROTO [csum failed]");
+ return -EPROTO;
+ }
+
+ switch (call->conn->security_level) {
+ case RXRPC_SECURITY_PLAIN:
+ ret = 0;
+ break;
+ case RXRPC_SECURITY_AUTH:
+ ret = rxkad_verify_packet_auth(call, skb, _abort_code);
+ break;
+ case RXRPC_SECURITY_ENCRYPT:
+ ret = rxkad_verify_packet_encrypt(call, skb, _abort_code);
+ break;
+ default:
+ ret = -ENOANO;
+ break;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * issue a challenge
+ */
+static int rxkad_issue_challenge(struct rxrpc_connection *conn)
+{
+ struct rxkad_challenge challenge;
+ struct rxrpc_header hdr;
+ struct msghdr msg;
+ struct kvec iov[2];
+ size_t len;
+ int ret;
+
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+ ret = key_validate(conn->key);
+ if (ret < 0)
+ return ret;
+
+ get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce));
+
+ challenge.version = htonl(2);
+ challenge.nonce = htonl(conn->security_nonce);
+ challenge.min_level = htonl(0);
+ challenge.__padding = 0;
+
+ msg.msg_name = &conn->trans->peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr.epoch = conn->epoch;
+ hdr.cid = conn->cid;
+ hdr.callNumber = 0;
+ hdr.seq = 0;
+ hdr.type = RXRPC_PACKET_TYPE_CHALLENGE;
+ hdr.flags = conn->out_clientflag;
+ hdr.userStatus = 0;
+ hdr.securityIndex = conn->security_ix;
+ hdr._rsvd = 0;
+ hdr.serviceId = conn->service_id;
+
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+ iov[1].iov_base = &challenge;
+ iov[1].iov_len = sizeof(challenge);
+
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ hdr.serial = htonl(atomic_inc_return(&conn->serial));
+ _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial));
+
+ ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * send a Kerberos security response
+ */
+static int rxkad_send_response(struct rxrpc_connection *conn,
+ struct rxrpc_header *hdr,
+ struct rxkad_response *resp,
+ const struct rxkad_key *s2)
+{
+ struct msghdr msg;
+ struct kvec iov[3];
+ size_t len;
+ int ret;
+
+ _enter("");
+
+ msg.msg_name = &conn->trans->peer->srx.transport.sin;
+ msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin);
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ hdr->epoch = conn->epoch;
+ hdr->seq = 0;
+ hdr->type = RXRPC_PACKET_TYPE_RESPONSE;
+ hdr->flags = conn->out_clientflag;
+ hdr->userStatus = 0;
+ hdr->_rsvd = 0;
+
+ iov[0].iov_base = hdr;
+ iov[0].iov_len = sizeof(*hdr);
+ iov[1].iov_base = resp;
+ iov[1].iov_len = sizeof(*resp);
+ iov[2].iov_base = (void *) s2->ticket;
+ iov[2].iov_len = s2->ticket_len;
+
+ len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
+
+ hdr->serial = htonl(atomic_inc_return(&conn->serial));
+ _proto("Tx RESPONSE %%%u", ntohl(hdr->serial));
+
+ ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
+ if (ret < 0) {
+ _debug("sendmsg failed: %d", ret);
+ return -EAGAIN;
+ }
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * calculate the response checksum
+ */
+static void rxkad_calc_response_checksum(struct rxkad_response *response)
+{
+ u32 csum = 1000003;
+ int loop;
+ u8 *p = (u8 *) response;
+
+ for (loop = sizeof(*response); loop > 0; loop--)
+ csum = csum * 0x10204081 + *p++;
+
+ response->encrypted.checksum = htonl(csum);
+}
+
+/*
+ * load a scatterlist with a potentially split-page buffer
+ */
+static void rxkad_sg_set_buf2(struct scatterlist sg[2],
+ void *buf, size_t buflen)
+{
+ int nsg = 1;
+
+ sg_init_table(sg, 2);
+
+ sg_set_buf(&sg[0], buf, buflen);
+ if (sg[0].offset + buflen > PAGE_SIZE) {
+ /* the buffer was split over two pages */
+ sg[0].length = PAGE_SIZE - sg[0].offset;
+ sg_set_buf(&sg[1], buf + sg[0].length, buflen - sg[0].length);
+ nsg++;
+ }
+
+ sg_mark_end(&sg[nsg - 1]);
+
+ ASSERTCMP(sg[0].length + sg[1].length, ==, buflen);
+}
+
+/*
+ * encrypt the response packet
+ */
+static void rxkad_encrypt_response(struct rxrpc_connection *conn,
+ struct rxkad_response *resp,
+ const struct rxkad_key *s2)
+{
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv;
+ struct scatterlist sg[2];
+
+ /* continue encrypting from where we left off */
+ memcpy(&iv, s2->session_key, sizeof(iv));
+ desc.tfm = conn->cipher;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
+ crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+}
+
+/*
+ * respond to a challenge packet
+ */
+static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ const struct rxrpc_key_token *token;
+ struct rxkad_challenge challenge;
+ struct rxkad_response resp
+ __attribute__((aligned(8))); /* must be aligned for crypto */
+ struct rxrpc_skb_priv *sp;
+ u32 version, nonce, min_level, abort_code;
+ int ret;
+
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+ if (!conn->key) {
+ _leave(" = -EPROTO [no key]");
+ return -EPROTO;
+ }
+
+ ret = key_validate(conn->key);
+ if (ret < 0) {
+ *_abort_code = RXKADEXPIRED;
+ return ret;
+ }
+
+ abort_code = RXKADPACKETSHORT;
+ sp = rxrpc_skb(skb);
+ if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0)
+ goto protocol_error;
+
+ version = ntohl(challenge.version);
+ nonce = ntohl(challenge.nonce);
+ min_level = ntohl(challenge.min_level);
+
+ _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
+ ntohl(sp->hdr.serial), version, nonce, min_level);
+
+ abort_code = RXKADINCONSISTENCY;
+ if (version != RXKAD_VERSION)
+ goto protocol_error;
+
+ abort_code = RXKADLEVELFAIL;
+ if (conn->security_level < min_level)
+ goto protocol_error;
+
+ token = conn->key->payload.data;
+
+ /* build the response packet */
+ memset(&resp, 0, sizeof(resp));
+
+ resp.version = RXKAD_VERSION;
+ resp.encrypted.epoch = conn->epoch;
+ resp.encrypted.cid = conn->cid;
+ resp.encrypted.securityIndex = htonl(conn->security_ix);
+ resp.encrypted.call_id[0] =
+ (conn->channels[0] ? conn->channels[0]->call_id : 0);
+ resp.encrypted.call_id[1] =
+ (conn->channels[1] ? conn->channels[1]->call_id : 0);
+ resp.encrypted.call_id[2] =
+ (conn->channels[2] ? conn->channels[2]->call_id : 0);
+ resp.encrypted.call_id[3] =
+ (conn->channels[3] ? conn->channels[3]->call_id : 0);
+ resp.encrypted.inc_nonce = htonl(nonce + 1);
+ resp.encrypted.level = htonl(conn->security_level);
+ resp.kvno = htonl(token->kad->kvno);
+ resp.ticket_len = htonl(token->kad->ticket_len);
+
+ /* calculate the response checksum and then do the encryption */
+ rxkad_calc_response_checksum(&resp);
+ rxkad_encrypt_response(conn, &resp, token->kad);
+ return rxkad_send_response(conn, &sp->hdr, &resp, token->kad);
+
+protocol_error:
+ *_abort_code = abort_code;
+ _leave(" = -EPROTO [%d]", abort_code);
+ return -EPROTO;
+}
+
+/*
+ * decrypt the kerberos IV ticket in the response
+ */
+static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
+ void *ticket, size_t ticket_len,
+ struct rxrpc_crypt *_session_key,
+ time_t *_expiry,
+ u32 *_abort_code)
+{
+ struct blkcipher_desc desc;
+ struct rxrpc_crypt iv, key;
+ struct scatterlist sg[1];
+ struct in_addr addr;
+ unsigned int life;
+ time_t issue, now;
+ bool little_endian;
+ int ret;
+ u8 *p, *q, *name, *end;
+
+ _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key));
+
+ *_expiry = 0;
+
+ ret = key_validate(conn->server_key);
+ if (ret < 0) {
+ switch (ret) {
+ case -EKEYEXPIRED:
+ *_abort_code = RXKADEXPIRED;
+ goto error;
+ default:
+ *_abort_code = RXKADNOAUTH;
+ goto error;
+ }
+ }
+
+ ASSERT(conn->server_key->payload.data != NULL);
+ ASSERTCMP((unsigned long) ticket & 7UL, ==, 0);
+
+ memcpy(&iv, &conn->server_key->type_data, sizeof(iv));
+
+ desc.tfm = conn->server_key->payload.data;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ sg_init_one(&sg[0], ticket, ticket_len);
+ crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len);
+
+ p = ticket;
+ end = p + ticket_len;
+
+#define Z(size) \
+ ({ \
+ u8 *__str = p; \
+ q = memchr(p, 0, end - p); \
+ if (!q || q - p > (size)) \
+ goto bad_ticket; \
+ for (; p < q; p++) \
+ if (!isprint(*p)) \
+ goto bad_ticket; \
+ p++; \
+ __str; \
+ })
+
+ /* extract the ticket flags */
+ _debug("KIV FLAGS: %x", *p);
+ little_endian = *p & 1;
+ p++;
+
+ /* extract the authentication name */
+ name = Z(ANAME_SZ);
+ _debug("KIV ANAME: %s", name);
+
+ /* extract the principal's instance */
+ name = Z(INST_SZ);
+ _debug("KIV INST : %s", name);
+
+ /* extract the principal's authentication domain */
+ name = Z(REALM_SZ);
+ _debug("KIV REALM: %s", name);
+
+ if (end - p < 4 + 8 + 4 + 2)
+ goto bad_ticket;
+
+ /* get the IPv4 address of the entity that requested the ticket */
+ memcpy(&addr, p, sizeof(addr));
+ p += 4;
+ _debug("KIV ADDR : %pI4", &addr);
+
+ /* get the session key from the ticket */
+ memcpy(&key, p, sizeof(key));
+ p += 8;
+ _debug("KIV KEY : %08x %08x", ntohl(key.n[0]), ntohl(key.n[1]));
+ memcpy(_session_key, &key, sizeof(key));
+
+ /* get the ticket's lifetime */
+ life = *p++ * 5 * 60;
+ _debug("KIV LIFE : %u", life);
+
+ /* get the issue time of the ticket */
+ if (little_endian) {
+ __le32 stamp;
+ memcpy(&stamp, p, 4);
+ issue = le32_to_cpu(stamp);
+ } else {
+ __be32 stamp;
+ memcpy(&stamp, p, 4);
+ issue = be32_to_cpu(stamp);
+ }
+ p += 4;
+ now = get_seconds();
+ _debug("KIV ISSUE: %lx [%lx]", issue, now);
+
+ /* check the ticket is in date */
+ if (issue > now) {
+ *_abort_code = RXKADNOAUTH;
+ ret = -EKEYREJECTED;
+ goto error;
+ }
+
+ if (issue < now - life) {
+ *_abort_code = RXKADEXPIRED;
+ ret = -EKEYEXPIRED;
+ goto error;
+ }
+
+ *_expiry = issue + life;
+
+ /* get the service name */
+ name = Z(SNAME_SZ);
+ _debug("KIV SNAME: %s", name);
+
+ /* get the service instance name */
+ name = Z(INST_SZ);
+ _debug("KIV SINST: %s", name);
+
+ ret = 0;
+error:
+ _leave(" = %d", ret);
+ return ret;
+
+bad_ticket:
+ *_abort_code = RXKADBADTICKET;
+ ret = -EBADMSG;
+ goto error;
+}
+
+/*
+ * decrypt the response packet
+ */
+static void rxkad_decrypt_response(struct rxrpc_connection *conn,
+ struct rxkad_response *resp,
+ const struct rxrpc_crypt *session_key)
+{
+ struct blkcipher_desc desc;
+ struct scatterlist sg[2];
+ struct rxrpc_crypt iv;
+
+ _enter(",,%08x%08x",
+ ntohl(session_key->n[0]), ntohl(session_key->n[1]));
+
+ ASSERT(rxkad_ci != NULL);
+
+ mutex_lock(&rxkad_ci_mutex);
+ if (crypto_blkcipher_setkey(rxkad_ci, session_key->x,
+ sizeof(*session_key)) < 0)
+ BUG();
+
+ memcpy(&iv, session_key, sizeof(iv));
+ desc.tfm = rxkad_ci;
+ desc.info = iv.x;
+ desc.flags = 0;
+
+ rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
+ crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+ mutex_unlock(&rxkad_ci_mutex);
+
+ _leave("");
+}
+
+/*
+ * verify a response
+ */
+static int rxkad_verify_response(struct rxrpc_connection *conn,
+ struct sk_buff *skb,
+ u32 *_abort_code)
+{
+ struct rxkad_response response
+ __attribute__((aligned(8))); /* must be aligned for crypto */
+ struct rxrpc_skb_priv *sp;
+ struct rxrpc_crypt session_key;
+ time_t expiry;
+ void *ticket;
+ u32 abort_code, version, kvno, ticket_len, level;
+ __be32 csum;
+ int ret;
+
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
+
+ abort_code = RXKADPACKETSHORT;
+ if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0)
+ goto protocol_error;
+ if (!pskb_pull(skb, sizeof(response)))
+ BUG();
+
+ version = ntohl(response.version);
+ ticket_len = ntohl(response.ticket_len);
+ kvno = ntohl(response.kvno);
+ sp = rxrpc_skb(skb);
+ _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
+ ntohl(sp->hdr.serial), version, kvno, ticket_len);
+
+ abort_code = RXKADINCONSISTENCY;
+ if (version != RXKAD_VERSION)
+ goto protocol_error;
+
+ abort_code = RXKADTICKETLEN;
+ if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN)
+ goto protocol_error;
+
+ abort_code = RXKADUNKNOWNKEY;
+ if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5)
+ goto protocol_error;
+
+ /* extract the kerberos ticket and decrypt and decode it */
+ ticket = kmalloc(ticket_len, GFP_NOFS);
+ if (!ticket)
+ return -ENOMEM;
+
+ abort_code = RXKADPACKETSHORT;
+ if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0)
+ goto protocol_error_free;
+
+ ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
+ &expiry, &abort_code);
+ if (ret < 0) {
+ *_abort_code = abort_code;
+ kfree(ticket);
+ return ret;
+ }
+
+ /* use the session key from inside the ticket to decrypt the
+ * response */
+ rxkad_decrypt_response(conn, &response, &session_key);
+
+ abort_code = RXKADSEALEDINCON;
+ if (response.encrypted.epoch != conn->epoch)
+ goto protocol_error_free;
+ if (response.encrypted.cid != conn->cid)
+ goto protocol_error_free;
+ if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
+ goto protocol_error_free;
+ csum = response.encrypted.checksum;
+ response.encrypted.checksum = 0;
+ rxkad_calc_response_checksum(&response);
+ if (response.encrypted.checksum != csum)
+ goto protocol_error_free;
+
+ if (ntohl(response.encrypted.call_id[0]) > INT_MAX ||
+ ntohl(response.encrypted.call_id[1]) > INT_MAX ||
+ ntohl(response.encrypted.call_id[2]) > INT_MAX ||
+ ntohl(response.encrypted.call_id[3]) > INT_MAX)
+ goto protocol_error_free;
+
+ abort_code = RXKADOUTOFSEQUENCE;
+ if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1))
+ goto protocol_error_free;
+
+ abort_code = RXKADLEVELFAIL;
+ level = ntohl(response.encrypted.level);
+ if (level > RXRPC_SECURITY_ENCRYPT)
+ goto protocol_error_free;
+ conn->security_level = level;
+
+ /* create a key to hold the security data and expiration time - after
+ * this the connection security can be handled in exactly the same way
+ * as for a client connection */
+ ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
+ if (ret < 0) {
+ kfree(ticket);
+ return ret;
+ }
+
+ kfree(ticket);
+ _leave(" = 0");
+ return 0;
+
+protocol_error_free:
+ kfree(ticket);
+protocol_error:
+ *_abort_code = abort_code;
+ _leave(" = -EPROTO [%d]", abort_code);
+ return -EPROTO;
+}
+
+/*
+ * clear the connection security
+ */
+static void rxkad_clear(struct rxrpc_connection *conn)
+{
+ _enter("");
+
+ if (conn->cipher)
+ crypto_free_blkcipher(conn->cipher);
+}
+
+/*
+ * RxRPC Kerberos-based security
+ */
+static struct rxrpc_security rxkad = {
+ .owner = THIS_MODULE,
+ .name = "rxkad",
+ .security_index = RXRPC_SECURITY_RXKAD,
+ .init_connection_security = rxkad_init_connection_security,
+ .prime_packet_security = rxkad_prime_packet_security,
+ .secure_packet = rxkad_secure_packet,
+ .verify_packet = rxkad_verify_packet,
+ .issue_challenge = rxkad_issue_challenge,
+ .respond_to_challenge = rxkad_respond_to_challenge,
+ .verify_response = rxkad_verify_response,
+ .clear = rxkad_clear,
+};
+
+static __init int rxkad_init(void)
+{
+ _enter("");
+
+ /* pin the cipher we need so that the crypto layer doesn't invoke
+ * keventd to go get it */
+ rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(rxkad_ci))
+ return PTR_ERR(rxkad_ci);
+
+ return rxrpc_register_security(&rxkad);
+}
+
+module_init(rxkad_init);
+
+static __exit void rxkad_exit(void)
+{
+ _enter("");
+
+ rxrpc_unregister_security(&rxkad);
+ crypto_free_blkcipher(rxkad_ci);
+}
+
+module_exit(rxkad_exit);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
new file mode 100644
index 000000000..50a98a910
--- /dev/null
+++ b/net/rxrpc/sysctl.c
@@ -0,0 +1,146 @@
+/* sysctls for configuring RxRPC operating parameters
+ *
+ * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/sysctl.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static struct ctl_table_header *rxrpc_sysctl_reg_table;
+static const unsigned zero = 0;
+static const unsigned one = 1;
+static const unsigned four = 4;
+static const unsigned n_65535 = 65535;
+static const unsigned n_max_acks = RXRPC_MAXACKS;
+
+/*
+ * RxRPC operating parameters.
+ *
+ * See Documentation/networking/rxrpc.txt and the variable definitions for more
+ * information on the individual parameters.
+ */
+static struct ctl_table rxrpc_sysctl_table[] = {
+ /* Values measured in milliseconds */
+ {
+ .procname = "req_ack_delay",
+ .data = &rxrpc_requested_ack_delay,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ .extra1 = (void *)&zero,
+ },
+ {
+ .procname = "soft_ack_delay",
+ .data = &rxrpc_soft_ack_delay,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "idle_ack_delay",
+ .data = &rxrpc_idle_ack_delay,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "resend_timeout",
+ .data = &rxrpc_resend_timeout,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ .extra1 = (void *)&one,
+ },
+
+ /* Values measured in seconds but used in jiffies */
+ {
+ .procname = "max_call_lifetime",
+ .data = &rxrpc_max_call_lifetime,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "dead_call_expiry",
+ .data = &rxrpc_dead_call_expiry,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ .extra1 = (void *)&one,
+ },
+
+ /* Values measured in seconds */
+ {
+ .procname = "connection_expiry",
+ .data = &rxrpc_connection_expiry,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ },
+ {
+ .procname = "transport_expiry",
+ .data = &rxrpc_transport_expiry,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ },
+
+ /* Non-time values */
+ {
+ .procname = "rx_window_size",
+ .data = &rxrpc_rx_window_size,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ .extra2 = (void *)&n_max_acks,
+ },
+ {
+ .procname = "rx_mtu",
+ .data = &rxrpc_rx_mtu,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ .extra1 = (void *)&n_65535,
+ },
+ {
+ .procname = "rx_jumbo_max",
+ .data = &rxrpc_rx_jumbo_max,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&one,
+ .extra2 = (void *)&four,
+ },
+
+ { }
+};
+
+int __init rxrpc_sysctl_init(void)
+{
+ rxrpc_sysctl_reg_table = register_net_sysctl(&init_net, "net/rxrpc",
+ rxrpc_sysctl_table);
+ if (!rxrpc_sysctl_reg_table)
+ return -ENOMEM;
+ return 0;
+}
+
+void rxrpc_sysctl_exit(void)
+{
+ if (rxrpc_sysctl_reg_table)
+ unregister_net_sysctl_table(rxrpc_sysctl_reg_table);
+}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 000000000..2274e723a
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,736 @@
+#
+# Traffic control configuration.
+#
+
+menuconfig NET_SCHED
+ bool "QoS and/or fair queueing"
+ select NET_SCH_FIFO
+ ---help---
+ When the kernel has several packets to send out over a network
+ device, it has to decide which ones to send first, which ones to
+ delay, and which ones to drop. This is the job of the queueing
+ disciplines, several different algorithms for how to do this
+ "fairly" have been proposed.
+
+ If you say N here, you will get the standard packet scheduler, which
+ is a FIFO (first come, first served). If you say Y here, you will be
+ able to choose from among several alternative algorithms which can
+ then be attached to different network devices. This is useful for
+ example if some of your network devices are real time devices that
+ need a certain minimum data flow rate, or if you need to limit the
+ maximum data flow rate for traffic which matches specified criteria.
+ This code is considered to be experimental.
+
+ To administer these schedulers, you'll need the user-level utilities
+ from the package iproute2+tc at
+ <https://www.kernel.org/pub/linux/utils/net/iproute2/>. That package
+ also contains some documentation; for more, check out
+ <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
+
+ This Quality of Service (QoS) support will enable you to use
+ Differentiated Services (diffserv) and Resource Reservation Protocol
+ (RSVP) on your Linux router if you also say Y to the corresponding
+ classifiers below. Documentation and software is at
+ <http://diffserv.sourceforge.net/>.
+
+ If you say Y here and to "/proc file system" below, you will be able
+ to read status information about packet schedulers from the file
+ /proc/net/psched.
+
+ The available schedulers are listed in the following questions; you
+ can say Y to as many as you like. If unsure, say N now.
+
+if NET_SCHED
+
+comment "Queueing/Scheduling"
+
+config NET_SCH_CBQ
+ tristate "Class Based Queueing (CBQ)"
+ ---help---
+ Say Y here if you want to use the Class-Based Queueing (CBQ) packet
+ scheduling algorithm. This algorithm classifies the waiting packets
+ into a tree-like hierarchy of classes; the leaves of this tree are
+ in turn scheduled by separate algorithms.
+
+ See the top of <file:net/sched/sch_cbq.c> for more details.
+
+ CBQ is a commonly used scheduler, so if you're unsure, you should
+ say Y here. Then say Y to all the queueing algorithms below that you
+ want to use as leaf disciplines.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_cbq.
+
+config NET_SCH_HTB
+ tristate "Hierarchical Token Bucket (HTB)"
+ ---help---
+ Say Y here if you want to use the Hierarchical Token Buckets (HTB)
+ packet scheduling algorithm. See
+ <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
+ in-depth articles.
+
+ HTB is very similar to CBQ regarding its goals however is has
+ different properties and different algorithm.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_htb.
+
+config NET_SCH_HFSC
+ tristate "Hierarchical Fair Service Curve (HFSC)"
+ ---help---
+ Say Y here if you want to use the Hierarchical Fair Service Curve
+ (HFSC) packet scheduling algorithm.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_hfsc.
+
+config NET_SCH_ATM
+ tristate "ATM Virtual Circuits (ATM)"
+ depends on ATM
+ ---help---
+ Say Y here if you want to use the ATM pseudo-scheduler. This
+ provides a framework for invoking classifiers, which in turn
+ select classes of this queuing discipline. Each class maps
+ the flow(s) it is handling to a given virtual circuit.
+
+ See the top of <file:net/sched/sch_atm.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_atm.
+
+config NET_SCH_PRIO
+ tristate "Multi Band Priority Queueing (PRIO)"
+ ---help---
+ Say Y here if you want to use an n-band priority queue packet
+ scheduler.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_prio.
+
+config NET_SCH_MULTIQ
+ tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+ ---help---
+ Say Y here if you want to use an n-band queue packet scheduler
+ to support devices that have multiple hardware transmit queues.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_multiq.
+
+config NET_SCH_RED
+ tristate "Random Early Detection (RED)"
+ ---help---
+ Say Y here if you want to use the Random Early Detection (RED)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_red.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_red.
+
+config NET_SCH_SFB
+ tristate "Stochastic Fair Blue (SFB)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fair Blue (SFB)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfb.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfb.
+
+config NET_SCH_SFQ
+ tristate "Stochastic Fairness Queueing (SFQ)"
+ ---help---
+ Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
+ packet scheduling algorithm.
+
+ See the top of <file:net/sched/sch_sfq.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_sfq.
+
+config NET_SCH_TEQL
+ tristate "True Link Equalizer (TEQL)"
+ ---help---
+ Say Y here if you want to use the True Link Equalizer (TLE) packet
+ scheduling algorithm. This queueing discipline allows the combination
+ of several physical devices into one virtual device.
+
+ See the top of <file:net/sched/sch_teql.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_teql.
+
+config NET_SCH_TBF
+ tristate "Token Bucket Filter (TBF)"
+ ---help---
+ Say Y here if you want to use the Token Bucket Filter (TBF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_tbf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_tbf.
+
+config NET_SCH_GRED
+ tristate "Generic Random Early Detection (GRED)"
+ ---help---
+ Say Y here if you want to use the Generic Random Early Detection
+ (GRED) packet scheduling algorithm for some of your network devices
+ (see the top of <file:net/sched/sch_red.c> for details and
+ references about the algorithm).
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_gred.
+
+config NET_SCH_DSMARK
+ tristate "Differentiated Services marker (DSMARK)"
+ ---help---
+ Say Y if you want to schedule packets according to the
+ Differentiated Services architecture proposed in RFC 2475.
+ Technical information on this method, with pointers to associated
+ RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_dsmark.
+
+config NET_SCH_NETEM
+ tristate "Network emulator (NETEM)"
+ ---help---
+ Say Y if you want to emulate network delay, loss, and packet
+ re-ordering. This is often useful to simulate networks when
+ testing applications or protocols.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_netem.
+
+ If unsure, say N.
+
+config NET_SCH_DRR
+ tristate "Deficit Round Robin scheduler (DRR)"
+ help
+ Say Y here if you want to use the Deficit Round Robin (DRR) packet
+ scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_drr.
+
+ If unsure, say N.
+
+config NET_SCH_MQPRIO
+ tristate "Multi-queue priority scheduler (MQPRIO)"
+ help
+ Say Y here if you want to use the Multi-queue Priority scheduler.
+ This scheduler allows QOS to be offloaded on NICs that have support
+ for offloading QOS schedulers.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_mqprio.
+
+ If unsure, say N.
+
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
+config NET_SCH_QFQ
+ tristate "Quick Fair Queueing scheduler (QFQ)"
+ help
+ Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_qfq.
+
+ If unsure, say N.
+
+config NET_SCH_CODEL
+ tristate "Controlled Delay AQM (CODEL)"
+ help
+ Say Y here if you want to use the Controlled Delay (CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_codel.
+
+ If unsure, say N.
+
+config NET_SCH_FQ_CODEL
+ tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
+ help
+ Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_codel.
+
+ If unsure, say N.
+
+config NET_SCH_FQ
+ tristate "Fair Queue"
+ help
+ Say Y here if you want to use the FQ packet scheduling algorithm.
+
+ FQ does flow separation, and is able to respect pacing requirements
+ set by TCP stack into sk->sk_pacing_rate (for localy generated
+ traffic)
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq.
+
+ If unsure, say N.
+
+config NET_SCH_HHF
+ tristate "Heavy-Hitter Filter (HHF)"
+ help
+ Say Y here if you want to use the Heavy-Hitter Filter (HHF)
+ packet scheduling algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_hhf.
+
+config NET_SCH_PIE
+ tristate "Proportional Integral controller Enhanced (PIE) scheduler"
+ help
+ Say Y here if you want to use the Proportional Integral controller
+ Enhanced scheduler packet scheduling algorithm.
+ For more information, please see
+ http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_pie.
+
+ If unsure, say N.
+
+config NET_SCH_INGRESS
+ tristate "Ingress Qdisc"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here if you want to use classifiers for incoming packets.
+ If unsure, say Y.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_ingress.
+
+config NET_SCH_PLUG
+ tristate "Plug network traffic until release (PLUG)"
+ ---help---
+
+ This queuing discipline allows userspace to plug/unplug a network
+ output queue, using the netlink interface. When it receives an
+ enqueue command it inserts a plug into the outbound queue that
+ causes following packets to enqueue until a dequeue command arrives
+ over netlink, causing the plug to be removed and resuming the normal
+ packet flow.
+
+ This module also provides a generic "network output buffering"
+ functionality (aka output commit), wherein upon arrival of a dequeue
+ command, only packets up to the first plug are released for delivery.
+ The Remus HA project uses this module to enable speculative execution
+ of virtual machines by allowing the generated network output to be rolled
+ back if needed.
+
+ For more information, please refer to <http://wiki.xenproject.org/wiki/Remus>
+
+ Say Y here if you are using this kernel for Xen dom0 and
+ want to protect Xen guests with Remus.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_plug.
+
+comment "Classification"
+
+config NET_CLS
+ bool
+
+config NET_CLS_BASIC
+ tristate "Elementary classification (BASIC)"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify packets using
+ only extended matches and actions.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_basic.
+
+config NET_CLS_TCINDEX
+ tristate "Traffic-Control Index (TCINDEX)"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ traffic control indices. You will want this feature if you want
+ to implement Differentiated Services together with DSMARK.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_tcindex.
+
+config NET_CLS_ROUTE4
+ tristate "Routing decision (ROUTE)"
+ depends on INET
+ select IP_ROUTE_CLASSID
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to the route table entry they matched.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_route.
+
+config NET_CLS_FW
+ tristate "Netfilter mark (FW)"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets
+ according to netfilter/firewall marks.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_fw.
+
+config NET_CLS_U32
+ tristate "Universal 32bit comparisons w/ hashing (U32)"
+ select NET_CLS
+ ---help---
+ Say Y here to be able to classify packets using a universal
+ 32bit pieces based comparison scheme.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_u32.
+
+config CLS_U32_PERF
+ bool "Performance counters support"
+ depends on NET_CLS_U32
+ ---help---
+ Say Y here to make u32 gather additional statistics useful for
+ fine tuning u32 classifiers.
+
+config CLS_U32_MARK
+ bool "Netfilter marks support"
+ depends on NET_CLS_U32
+ ---help---
+ Say Y here to be able to use netfilter marks as u32 key.
+
+config NET_CLS_RSVP
+ tristate "IPv4 Resource Reservation Protocol (RSVP)"
+ select NET_CLS
+ ---help---
+ The Resource Reservation Protocol (RSVP) permits end systems to
+ request a minimum and maximum data flow rate for a connection; this
+ is important for real time data such as streaming sound or video.
+
+ Say Y here if you want to be able to classify outgoing packets based
+ on their RSVP requests.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_rsvp.
+
+config NET_CLS_RSVP6
+ tristate "IPv6 Resource Reservation Protocol (RSVP6)"
+ select NET_CLS
+ ---help---
+ The Resource Reservation Protocol (RSVP) permits end systems to
+ request a minimum and maximum data flow rate for a connection; this
+ is important for real time data such as streaming sound or video.
+
+ Say Y here if you want to be able to classify outgoing packets based
+ on their RSVP requests and you are using the IPv6 protocol.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_rsvp6.
+
+config NET_CLS_FLOW
+ tristate "Flow classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ a configurable combination of packet keys. This is mostly useful
+ in combination with SFQ.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_flow.
+
+config NET_CLS_CGROUP
+ tristate "Control Group Classifier"
+ select NET_CLS
+ select CGROUP_NET_CLASSID
+ depends on CGROUPS
+ ---help---
+ Say Y here if you want to classify packets based on the control
+ cgroup of their process.
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_cgroup.
+
+config NET_CLS_BPF
+ tristate "BPF-based classifier"
+ select NET_CLS
+ ---help---
+ If you say Y here, you will be able to classify packets based on
+ programmable BPF (JIT'ed) filters as an alternative to ematches.
+
+ To compile this code as a module, choose M here: the module will
+ be called cls_bpf.
+
+config NET_EMATCH
+ bool "Extended Matches"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to use extended matches on top of classifiers
+ and select the extended matches below.
+
+ Extended matches are small classification helpers not worth writing
+ a separate classifier for.
+
+ A recent version of the iproute2 package is required to use
+ extended matches.
+
+config NET_EMATCH_STACK
+ int "Stack size"
+ depends on NET_EMATCH
+ default "32"
+ ---help---
+ Size of the local stack variable used while evaluating the tree of
+ ematches. Limits the depth of the tree, i.e. the number of
+ encapsulated precedences. Every level requires 4 bytes of additional
+ stack space.
+
+config NET_EMATCH_CMP
+ tristate "Simple packet data comparison"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ simple packet data comparisons for 8, 16, and 32bit values.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_cmp.
+
+config NET_EMATCH_NBYTE
+ tristate "Multi byte comparison"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ multiple byte comparisons mainly useful for IPv6 address comparisons.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_nbyte.
+
+config NET_EMATCH_U32
+ tristate "U32 key"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets using
+ the famous u32 key in combination with logic relations.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_u32.
+
+config NET_EMATCH_META
+ tristate "Metadata"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ metadata such as load average, netfilter attributes, socket
+ attributes and routing decisions.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_meta.
+
+config NET_EMATCH_TEXT
+ tristate "Textsearch"
+ depends on NET_EMATCH
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ textsearch comparisons.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_text.
+
+config NET_EMATCH_CANID
+ tristate "CAN Identifier"
+ depends on NET_EMATCH && (CAN=y || CAN=m)
+ ---help---
+ Say Y here if you want to be able to classify CAN frames based
+ on CAN Identifier.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_canid.
+
+config NET_EMATCH_IPSET
+ tristate "IPset"
+ depends on NET_EMATCH && IP_SET
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ ipset membership.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_ipset.
+
+config NET_CLS_ACT
+ bool "Actions"
+ ---help---
+ Say Y here if you want to use traffic control actions. Actions
+ get attached to classifiers and are invoked after a successful
+ classification. They are used to overwrite the classification
+ result, instantly drop or redirect packets, etc.
+
+ A recent version of the iproute2 package is required to use
+ extended matches.
+
+config NET_ACT_POLICE
+ tristate "Traffic Policing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here if you want to do traffic policing, i.e. strict
+ bandwidth limiting. This action replaces the existing policing
+ module.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_police.
+
+config NET_ACT_GACT
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to take generic actions such as dropping and
+ accepting packets.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_gact.
+
+config GACT_PROB
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ ---help---
+ Say Y here to use the generic action randomly or deterministically.
+
+config NET_ACT_MIRRED
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow packets to be mirrored or redirected to
+ other devices.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_mirred.
+
+config NET_ACT_IPT
+ tristate "IPtables targets"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ ---help---
+ Say Y here to be able to invoke iptables targets after successful
+ classification.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ipt.
+
+config NET_ACT_NAT
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to do stateless NAT on IPv4 packets. You should use
+ netfilter for NAT unless you know what you are doing.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_nat.
+
+config NET_ACT_PEDIT
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here if you want to mangle the content of packets.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_pedit.
+
+config NET_ACT_SIMP
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to add a simple action for demonstration purposes.
+ It is meant as an example and for debugging purposes. It will
+ print a configured policy string followed by the packet count
+ to the console for every packet that passes by.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_simple.
+
+config NET_ACT_SKBEDIT
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to change skb priority or queue_mapping settings.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_skbedit.
+
+config NET_ACT_CSUM
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ ---help---
+ Say Y here to update some common checksum after some direct
+ packet alterations.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_csum.
+
+config NET_ACT_VLAN
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to push or pop vlan headers.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_vlan.
+
+config NET_ACT_BPF
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to execute BPF code on packets. The BPF code will decide
+ if the packet should be dropped or not.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_bpf.
+
+config NET_ACT_CONNMARK
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
+ Say Y here to allow retrieving of conn mark
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_connmark.
+
+config NET_CLS_IND
+ bool "Incoming device classification"
+ depends on NET_CLS_U32 || NET_CLS_FW
+ ---help---
+ Say Y here to extend the u32 and fw classifier to support
+ classification based on the incoming device. This option is
+ likely to disappear in favour of the metadata ematch.
+
+endif # NET_SCHED
+
+config NET_SCH_FIFO
+ bool
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 000000000..7ca7f4c1b
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,66 @@
+#
+# Makefile for the Linux Traffic Control Unit.
+#
+
+obj-y := sch_generic.o sch_mq.o
+
+obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
+obj-$(CONFIG_NET_CLS) += cls_api.o
+obj-$(CONFIG_NET_CLS_ACT) += act_api.o
+obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
+obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
+obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
+obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
+obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
+obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
+obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
+obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
+obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
+obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
+obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
+obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
+obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
+obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
+obj-$(CONFIG_NET_SCH_RED) += sch_red.o
+obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
+obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
+obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
+obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
+obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
+obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
+obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
+obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
+obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
+obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
+obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
+obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
+obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
+obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+
+obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
+obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
+obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
+obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
+obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
+obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
+obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
+obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
+obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
+obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
+obj-$(CONFIG_NET_EMATCH) += ematch.o
+obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
+obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
+obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
+obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
+obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o
+obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 000000000..3d43e4979
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,1094 @@
+/*
+ * net/sched/act_api.c Packet action API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Author: Jamal Hadi Salim
+ *
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/sch_generic.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+void tcf_hash_destroy(struct tc_action *a)
+{
+ struct tcf_common *p = a->priv;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+
+ spin_lock_bh(&hinfo->lock);
+ hlist_del(&p->tcfc_head);
+ spin_unlock_bh(&hinfo->lock);
+ gen_kill_estimator(&p->tcfc_bstats,
+ &p->tcfc_rate_est);
+ /*
+ * gen_estimator est_timer() might access p->tcfc_lock
+ * or bstats, wait a RCU grace period before freeing p
+ */
+ kfree_rcu(p, tcfc_rcu);
+}
+EXPORT_SYMBOL(tcf_hash_destroy);
+
+int tcf_hash_release(struct tc_action *a, int bind)
+{
+ struct tcf_common *p = a->priv;
+ int ret = 0;
+
+ if (p) {
+ if (bind)
+ p->tcfc_bindcnt--;
+ else if (p->tcfc_bindcnt > 0)
+ return -EPERM;
+
+ p->tcfc_refcnt--;
+ if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
+ if (a->ops->cleanup)
+ a->ops->cleanup(a, bind);
+ tcf_hash_destroy(a);
+ ret = 1;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(tcf_hash_release);
+
+static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
+ struct tc_action *a)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
+ struct tcf_common *p;
+ int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+ struct nlattr *nest;
+
+ spin_lock_bh(&hinfo->lock);
+
+ s_i = cb->args[0];
+
+ for (i = 0; i < (hinfo->hmask + 1); i++) {
+ head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
+
+ hlist_for_each_entry_rcu(p, head, tcfc_head) {
+ index++;
+ if (index < s_i)
+ continue;
+ a->priv = p;
+ a->order = n_i;
+
+ nest = nla_nest_start(skb, a->order);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_1(skb, a, 0, 0);
+ if (err < 0) {
+ index--;
+ nlmsg_trim(skb, nest);
+ goto done;
+ }
+ nla_nest_end(skb, nest);
+ n_i++;
+ if (n_i >= TCA_ACT_MAX_PRIO)
+ goto done;
+ }
+ }
+done:
+ spin_unlock_bh(&hinfo->lock);
+ if (n_i)
+ cb->args[0] += n_i;
+ return n_i;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ goto done;
+}
+
+static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
+ struct hlist_node *n;
+ struct tcf_common *p;
+ struct nlattr *nest;
+ int i = 0, n_i = 0;
+ int ret = -EINVAL;
+
+ nest = nla_nest_start(skb, a->order);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
+ for (i = 0; i < (hinfo->hmask + 1); i++) {
+ head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
+ hlist_for_each_entry_safe(p, n, head, tcfc_head) {
+ a->priv = p;
+ ret = tcf_hash_release(a, 0);
+ if (ret == ACT_P_DELETED) {
+ module_put(a->ops->owner);
+ n_i++;
+ } else if (ret < 0)
+ goto nla_put_failure;
+ }
+ }
+ if (nla_put_u32(skb, TCA_FCNT, n_i))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ return n_i;
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return ret;
+}
+
+static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+ int type, struct tc_action *a)
+{
+ if (type == RTM_DELACTION) {
+ return tcf_del_walker(skb, a);
+ } else if (type == RTM_GETACTION) {
+ return tcf_dump_walker(skb, cb, a);
+ } else {
+ WARN(1, "tcf_generic_walker: unknown action %d\n", type);
+ return -EINVAL;
+ }
+}
+
+static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+{
+ struct tcf_common *p = NULL;
+ struct hlist_head *head;
+
+ spin_lock_bh(&hinfo->lock);
+ head = &hinfo->htab[tcf_hash(index, hinfo->hmask)];
+ hlist_for_each_entry_rcu(p, head, tcfc_head)
+ if (p->tcfc_index == index)
+ break;
+ spin_unlock_bh(&hinfo->lock);
+
+ return p;
+}
+
+u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
+{
+ u32 val = hinfo->index;
+
+ do {
+ if (++val == 0)
+ val = 1;
+ } while (tcf_hash_lookup(val, hinfo));
+
+ hinfo->index = val;
+ return val;
+}
+EXPORT_SYMBOL(tcf_hash_new_index);
+
+int tcf_hash_search(struct tc_action *a, u32 index)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_common *p = tcf_hash_lookup(index, hinfo);
+
+ if (p) {
+ a->priv = p;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcf_hash_search);
+
+int tcf_hash_check(u32 index, struct tc_action *a, int bind)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_common *p = NULL;
+ if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
+ if (bind)
+ p->tcfc_bindcnt++;
+ p->tcfc_refcnt++;
+ a->priv = p;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcf_hash_check);
+
+void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
+{
+ struct tcf_common *pc = a->priv;
+ if (est)
+ gen_kill_estimator(&pc->tcfc_bstats,
+ &pc->tcfc_rate_est);
+ kfree_rcu(pc, tcfc_rcu);
+}
+EXPORT_SYMBOL(tcf_hash_cleanup);
+
+int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
+ int size, int bind)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+
+ if (unlikely(!p))
+ return -ENOMEM;
+ p->tcfc_refcnt = 1;
+ if (bind)
+ p->tcfc_bindcnt = 1;
+
+ spin_lock_init(&p->tcfc_lock);
+ INIT_HLIST_NODE(&p->tcfc_head);
+ p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
+ p->tcfc_tm.install = jiffies;
+ p->tcfc_tm.lastuse = jiffies;
+ if (est) {
+ int err = gen_new_estimator(&p->tcfc_bstats, NULL,
+ &p->tcfc_rate_est,
+ &p->tcfc_lock, est);
+ if (err) {
+ kfree(p);
+ return err;
+ }
+ }
+
+ a->priv = (void *) p;
+ return 0;
+}
+EXPORT_SYMBOL(tcf_hash_create);
+
+void tcf_hash_insert(struct tc_action *a)
+{
+ struct tcf_common *p = a->priv;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
+
+ spin_lock_bh(&hinfo->lock);
+ hlist_add_head(&p->tcfc_head, &hinfo->htab[h]);
+ spin_unlock_bh(&hinfo->lock);
+}
+EXPORT_SYMBOL(tcf_hash_insert);
+
+static LIST_HEAD(act_base);
+static DEFINE_RWLOCK(act_mod_lock);
+
+int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
+{
+ struct tc_action_ops *a;
+ int err;
+
+ /* Must supply act, dump and init */
+ if (!act->act || !act->dump || !act->init)
+ return -EINVAL;
+
+ /* Supply defaults */
+ if (!act->lookup)
+ act->lookup = tcf_hash_search;
+ if (!act->walk)
+ act->walk = tcf_generic_walker;
+
+ act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
+ if (!act->hinfo)
+ return -ENOMEM;
+ err = tcf_hashinfo_init(act->hinfo, mask);
+ if (err) {
+ kfree(act->hinfo);
+ return err;
+ }
+
+ write_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
+ write_unlock(&act_mod_lock);
+ tcf_hashinfo_destroy(act->hinfo);
+ kfree(act->hinfo);
+ return -EEXIST;
+ }
+ }
+ list_add_tail(&act->head, &act_base);
+ write_unlock(&act_mod_lock);
+ return 0;
+}
+EXPORT_SYMBOL(tcf_register_action);
+
+int tcf_unregister_action(struct tc_action_ops *act)
+{
+ struct tc_action_ops *a;
+ int err = -ENOENT;
+
+ write_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (a == act) {
+ list_del(&act->head);
+ tcf_hashinfo_destroy(act->hinfo);
+ kfree(act->hinfo);
+ err = 0;
+ break;
+ }
+ }
+ write_unlock(&act_mod_lock);
+ return err;
+}
+EXPORT_SYMBOL(tcf_unregister_action);
+
+/* lookup by name */
+static struct tc_action_ops *tc_lookup_action_n(char *kind)
+{
+ struct tc_action_ops *a, *res = NULL;
+
+ if (kind) {
+ read_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (strcmp(kind, a->kind) == 0) {
+ if (try_module_get(a->owner))
+ res = a;
+ break;
+ }
+ }
+ read_unlock(&act_mod_lock);
+ }
+ return res;
+}
+
+/* lookup by nlattr */
+static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
+{
+ struct tc_action_ops *a, *res = NULL;
+
+ if (kind) {
+ read_lock(&act_mod_lock);
+ list_for_each_entry(a, &act_base, head) {
+ if (nla_strcmp(kind, a->kind) == 0) {
+ if (try_module_get(a->owner))
+ res = a;
+ break;
+ }
+ }
+ read_unlock(&act_mod_lock);
+ }
+ return res;
+}
+
+int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
+ struct tcf_result *res)
+{
+ const struct tc_action *a;
+ int ret = -1;
+
+ if (skb->tc_verd & TC_NCLS) {
+ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
+ ret = TC_ACT_OK;
+ goto exec_done;
+ }
+ list_for_each_entry(a, actions, list) {
+repeat:
+ ret = a->ops->act(skb, a, res);
+ if (TC_MUNGED & skb->tc_verd) {
+ /* copied already, allow trampling */
+ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+ skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
+ }
+ if (ret == TC_ACT_REPEAT)
+ goto repeat; /* we need a ttl - JHS */
+ if (ret != TC_ACT_PIPE)
+ goto exec_done;
+ }
+exec_done:
+ return ret;
+}
+EXPORT_SYMBOL(tcf_action_exec);
+
+int tcf_action_destroy(struct list_head *actions, int bind)
+{
+ struct tc_action *a, *tmp;
+ int ret = 0;
+
+ list_for_each_entry_safe(a, tmp, actions, list) {
+ ret = tcf_hash_release(a, bind);
+ if (ret == ACT_P_DELETED)
+ module_put(a->ops->owner);
+ else if (ret < 0)
+ return ret;
+ list_del(&a->list);
+ kfree(a);
+ }
+ return ret;
+}
+
+int
+tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ return a->ops->dump(skb, a, bind, ref);
+}
+
+int
+tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ int err = -EINVAL;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ if (nla_put_string(skb, TCA_KIND, a->ops->kind))
+ goto nla_put_failure;
+ if (tcf_action_copy_stats(skb, a, 0))
+ goto nla_put_failure;
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_old(skb, a, bind, ref);
+ if (err > 0) {
+ nla_nest_end(skb, nest);
+ return err;
+ }
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+EXPORT_SYMBOL(tcf_action_dump_1);
+
+int
+tcf_action_dump(struct sk_buff *skb, struct list_head *actions, int bind, int ref)
+{
+ struct tc_action *a;
+ int err = -EINVAL;
+ struct nlattr *nest;
+
+ list_for_each_entry(a, actions, list) {
+ nest = nla_nest_start(skb, a->order);
+ if (nest == NULL)
+ goto nla_put_failure;
+ err = tcf_action_dump_1(skb, a, bind, ref);
+ if (err < 0)
+ goto errout;
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+nla_put_failure:
+ err = -EINVAL;
+errout:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
+struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla,
+ struct nlattr *est, char *name, int ovr,
+ int bind)
+{
+ struct tc_action *a;
+ struct tc_action_ops *a_o;
+ char act_name[IFNAMSIZ];
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct nlattr *kind;
+ int err;
+
+ if (name == NULL) {
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+ if (err < 0)
+ goto err_out;
+ err = -EINVAL;
+ kind = tb[TCA_ACT_KIND];
+ if (kind == NULL)
+ goto err_out;
+ if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
+ goto err_out;
+ } else {
+ err = -EINVAL;
+ if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
+ goto err_out;
+ }
+
+ a_o = tc_lookup_action_n(act_name);
+ if (a_o == NULL) {
+#ifdef CONFIG_MODULES
+ rtnl_unlock();
+ request_module("act_%s", act_name);
+ rtnl_lock();
+
+ a_o = tc_lookup_action_n(act_name);
+
+ /* We dropped the RTNL semaphore in order to
+ * perform the module load. So, even if we
+ * succeeded in loading the module we have to
+ * tell the caller to replay the request. We
+ * indicate this using -EAGAIN.
+ */
+ if (a_o != NULL) {
+ err = -EAGAIN;
+ goto err_mod;
+ }
+#endif
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ err = -ENOMEM;
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (a == NULL)
+ goto err_mod;
+
+ a->ops = a_o;
+ INIT_LIST_HEAD(&a->list);
+ /* backward compatibility for policer */
+ if (name == NULL)
+ err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+ else
+ err = a_o->init(net, nla, est, a, ovr, bind);
+ if (err < 0)
+ goto err_free;
+
+ /* module count goes up only when brand new policy is created
+ * if it exists and is only bound to in a_o->init() then
+ * ACT_P_CREATED is not returned (a zero is).
+ */
+ if (err != ACT_P_CREATED)
+ module_put(a_o->owner);
+
+ return a;
+
+err_free:
+ kfree(a);
+err_mod:
+ module_put(a_o->owner);
+err_out:
+ return ERR_PTR(err);
+}
+
+int tcf_action_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, char *name, int ovr,
+ int bind, struct list_head *actions)
+{
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
+ int err;
+ int i;
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
+ if (err < 0)
+ return err;
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ act = tcf_action_init_1(net, tb[i], est, name, ovr, bind);
+ if (IS_ERR(act)) {
+ err = PTR_ERR(act);
+ goto err;
+ }
+ act->order = i;
+ list_add_tail(&act->list, actions);
+ }
+ return 0;
+
+err:
+ tcf_action_destroy(actions, bind);
+ return err;
+}
+
+int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
+ int compat_mode)
+{
+ int err = 0;
+ struct gnet_dump d;
+ struct tcf_common *p = a->priv;
+
+ if (p == NULL)
+ goto errout;
+
+ /* compat_mode being true specifies a call that is supposed
+ * to add additional backward compatibility statistic TLVs.
+ */
+ if (compat_mode) {
+ if (a->type == TCA_OLD_COMPAT)
+ err = gnet_stats_start_copy_compat(skb, 0,
+ TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);
+ else
+ return 0;
+ } else
+ err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
+ &p->tcfc_lock, &d);
+
+ if (err < 0)
+ goto errout;
+
+ if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
+ &p->tcfc_rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, NULL,
+ &p->tcfc_qstats,
+ p->tcfc_qstats.qlen) < 0)
+ goto errout;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto errout;
+
+ return 0;
+
+errout:
+ return -1;
+}
+
+static int
+tca_get_fill(struct sk_buff *skb, struct list_head *actions, u32 portid, u32 seq,
+ u16 flags, int event, int bind, int ref)
+{
+ struct tcamsg *t;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (nest == NULL)
+ goto out_nlmsg_trim;
+
+ if (tcf_action_dump(skb, actions, bind, ref) < 0)
+ goto out_nlmsg_trim;
+
+ nla_nest_end(skb, nest);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int
+act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
+ struct list_head *actions, int event)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnl_unicast(skb, net, portid);
+}
+
+static struct tc_action *create_a(int i)
+{
+ struct tc_action *act;
+
+ act = kzalloc(sizeof(*act), GFP_KERNEL);
+ if (act == NULL) {
+ pr_debug("create_a: failed to alloc!\n");
+ return NULL;
+ }
+ act->order = i;
+ INIT_LIST_HEAD(&act->list);
+ return act;
+}
+
+static struct tc_action *
+tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
+{
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct tc_action *a;
+ int index;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+ if (err < 0)
+ goto err_out;
+
+ err = -EINVAL;
+ if (tb[TCA_ACT_INDEX] == NULL ||
+ nla_len(tb[TCA_ACT_INDEX]) < sizeof(index))
+ goto err_out;
+ index = nla_get_u32(tb[TCA_ACT_INDEX]);
+
+ err = -ENOMEM;
+ a = create_a(0);
+ if (a == NULL)
+ goto err_out;
+
+ err = -EINVAL;
+ a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
+ if (a->ops == NULL) /* could happen in batch of actions */
+ goto err_free;
+ err = -ENOENT;
+ if (a->ops->lookup(a, index) == 0)
+ goto err_mod;
+
+ module_put(a->ops->owner);
+ return a;
+
+err_mod:
+ module_put(a->ops->owner);
+err_free:
+ kfree(a);
+err_out:
+ return ERR_PTR(err);
+}
+
+static void cleanup_a(struct list_head *actions)
+{
+ struct tc_action *a, *tmp;
+
+ list_for_each_entry_safe(a, tmp, actions, list) {
+ list_del(&a->list);
+ kfree(a);
+ }
+}
+
+static int tca_action_flush(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid)
+{
+ struct sk_buff *skb;
+ unsigned char *b;
+ struct nlmsghdr *nlh;
+ struct tcamsg *t;
+ struct netlink_callback dcb;
+ struct nlattr *nest;
+ struct nlattr *tb[TCA_ACT_MAX + 1];
+ struct nlattr *kind;
+ struct tc_action a;
+ int err = -ENOMEM;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb) {
+ pr_debug("tca_action_flush: failed skb alloc\n");
+ return err;
+ }
+
+ b = skb_tail_pointer(skb);
+
+ err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+ if (err < 0)
+ goto err_out;
+
+ err = -EINVAL;
+ kind = tb[TCA_ACT_KIND];
+ memset(&a, 0, sizeof(struct tc_action));
+ INIT_LIST_HEAD(&a.list);
+ a.ops = tc_lookup_action(kind);
+ if (a.ops == NULL) /*some idjot trying to flush unknown action */
+ goto err_out;
+
+ nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+ if (!nlh)
+ goto out_module_put;
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (nest == NULL)
+ goto out_module_put;
+
+ err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
+ if (err < 0)
+ goto out_module_put;
+ if (err == 0)
+ goto noflush_out;
+
+ nla_nest_end(skb, nest);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ nlh->nlmsg_flags |= NLM_F_ROOT;
+ module_put(a.ops->owner);
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ return 0;
+
+ return err;
+
+out_module_put:
+ module_put(a.ops->owner);
+err_out:
+noflush_out:
+ kfree_skb(skb);
+ return err;
+}
+
+static int
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+ u32 portid)
+{
+ int ret;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
+ 0, 1) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ /* now do the delete */
+ ret = tcf_action_destroy(actions, 0);
+ if (ret < 0) {
+ kfree_skb(skb);
+ return ret;
+ }
+
+ ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (ret > 0)
+ return 0;
+ return ret;
+}
+
+static int
+tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+ u32 portid, int event)
+{
+ int i, ret;
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct tc_action *act;
+ LIST_HEAD(actions);
+
+ ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
+ if (tb[1] != NULL)
+ return tca_action_flush(net, tb[1], n, portid);
+ else
+ return -EINVAL;
+ }
+
+ for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+ act = tcf_action_get_1(tb[i], n, portid);
+ if (IS_ERR(act)) {
+ ret = PTR_ERR(act);
+ goto err;
+ }
+ act->order = i;
+ list_add_tail(&act->list, &actions);
+ }
+
+ if (event == RTM_GETACTION)
+ ret = act_get_notify(net, portid, n, &actions, event);
+ else { /* delete */
+ ret = tcf_del_notify(net, n, &actions, portid);
+ if (ret)
+ goto err;
+ return ret;
+ }
+err:
+ cleanup_a(&actions);
+ return ret;
+}
+
+static int
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+ u32 portid)
+{
+ struct sk_buff *skb;
+ int err = 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
+ RTM_NEWACTION, 0, 0) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ if (err > 0)
+ err = 0;
+ return err;
+}
+
+static int
+tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+ u32 portid, int ovr)
+{
+ int ret = 0;
+ LIST_HEAD(actions);
+
+ ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, &actions);
+ if (ret)
+ goto done;
+
+ /* dump then free all the actions after update; inserted policy
+ * stays intact
+ */
+ ret = tcf_add_notify(net, n, &actions, portid);
+ cleanup_a(&actions);
+done:
+ return ret;
+}
+
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_ACT_MAX + 1];
+ u32 portid = skb ? NETLINK_CB(skb).portid : 0;
+ int ret = 0, ovr = 0;
+
+ if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (tca[TCA_ACT_TAB] == NULL) {
+ pr_notice("tc_ctl_action: received NO action attribs\n");
+ return -EINVAL;
+ }
+
+ /* n->nlmsg_flags & NLM_F_CREATE */
+ switch (n->nlmsg_type) {
+ case RTM_NEWACTION:
+ /* we are going to assume all other flags
+ * imply create only if it doesn't exist
+ * Note that CREATE | EXCL implies that
+ * but since we want avoid ambiguity (eg when flags
+ * is zero) then just set this
+ */
+ if (n->nlmsg_flags & NLM_F_REPLACE)
+ ovr = 1;
+replay:
+ ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
+ if (ret == -EAGAIN)
+ goto replay;
+ break;
+ case RTM_DELACTION:
+ ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+ portid, RTM_DELACTION);
+ break;
+ case RTM_GETACTION:
+ ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+ portid, RTM_GETACTION);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+static struct nlattr *
+find_dump_kind(const struct nlmsghdr *n)
+{
+ struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
+ struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+ struct nlattr *nla[TCAA_MAX + 1];
+ struct nlattr *kind;
+
+ if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0)
+ return NULL;
+ tb1 = nla[TCA_ACT_TAB];
+ if (tb1 == NULL)
+ return NULL;
+
+ if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1),
+ NLMSG_ALIGN(nla_len(tb1)), NULL) < 0)
+ return NULL;
+
+ if (tb[1] == NULL)
+ return NULL;
+ if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]),
+ nla_len(tb[1]), NULL) < 0)
+ return NULL;
+ kind = tb2[TCA_ACT_KIND];
+
+ return kind;
+}
+
+static int
+tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct nlattr *nest;
+ struct tc_action_ops *a_o;
+ struct tc_action a;
+ int ret = 0;
+ struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
+ struct nlattr *kind = find_dump_kind(cb->nlh);
+
+ if (kind == NULL) {
+ pr_info("tc_dump_action: action bad kind\n");
+ return 0;
+ }
+
+ a_o = tc_lookup_action(kind);
+ if (a_o == NULL)
+ return 0;
+
+ memset(&a, 0, sizeof(struct tc_action));
+ a.ops = a_o;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*t), 0);
+ if (!nlh)
+ goto out_module_put;
+ t = nlmsg_data(nlh);
+ t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
+ nest = nla_nest_start(skb, TCA_ACT_TAB);
+ if (nest == NULL)
+ goto out_module_put;
+
+ ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
+ if (ret < 0)
+ goto out_module_put;
+
+ if (ret > 0) {
+ nla_nest_end(skb, nest);
+ ret = skb->len;
+ } else
+ nla_nest_cancel(skb, nest);
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ if (NETLINK_CB(cb->skb).portid && ret)
+ nlh->nlmsg_flags |= NLM_F_MULTI;
+ module_put(a_o->owner);
+ return skb->len;
+
+out_module_put:
+ module_put(a_o->owner);
+ nlmsg_trim(skb, b);
+ return skb->len;
+}
+
+static int __init tc_action_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
+ NULL);
+
+ return 0;
+}
+
+subsys_initcall(tc_action_init);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
new file mode 100644
index 000000000..dc6a2d324
--- /dev/null
+++ b/net/sched/act_bpf.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_bpf.h>
+#include <net/tc_act/tc_bpf.h>
+
+#define BPF_TAB_MASK 15
+#define ACT_BPF_NAME_LEN 256
+
+struct tcf_bpf_cfg {
+ struct bpf_prog *filter;
+ struct sock_filter *bpf_ops;
+ char *bpf_name;
+ u32 bpf_fd;
+ u16 bpf_num_ops;
+};
+
+static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
+ struct tcf_result *res)
+{
+ struct tcf_bpf *prog = act->priv;
+ int action, filter_res;
+
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return TC_ACT_UNSPEC;
+
+ spin_lock(&prog->tcf_lock);
+
+ prog->tcf_tm.lastuse = jiffies;
+ bstats_update(&prog->tcf_bstats, skb);
+
+ /* Needed here for accessing maps. */
+ rcu_read_lock();
+ filter_res = BPF_PROG_RUN(prog->filter, skb);
+ rcu_read_unlock();
+
+ /* A BPF program may overwrite the default action opcode.
+ * Similarly as in cls_bpf, if filter_res == -1 we use the
+ * default action specified from tc.
+ *
+ * In case a different well-known TC_ACT opcode has been
+ * returned, it will overwrite the default one.
+ *
+ * For everything else that is unkown, TC_ACT_UNSPEC is
+ * returned.
+ */
+ switch (filter_res) {
+ case TC_ACT_PIPE:
+ case TC_ACT_RECLASSIFY:
+ case TC_ACT_OK:
+ action = filter_res;
+ break;
+ case TC_ACT_SHOT:
+ action = filter_res;
+ prog->tcf_qstats.drops++;
+ break;
+ case TC_ACT_UNSPEC:
+ action = prog->tcf_action;
+ break;
+ default:
+ action = TC_ACT_UNSPEC;
+ break;
+ }
+
+ spin_unlock(&prog->tcf_lock);
+ return action;
+}
+
+static bool tcf_bpf_is_ebpf(const struct tcf_bpf *prog)
+{
+ return !prog->bpf_ops;
+}
+
+static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (nla_put_u16(skb, TCA_ACT_BPF_OPS_LEN, prog->bpf_num_ops))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_ACT_BPF_OPS, prog->bpf_num_ops *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ return 0;
+}
+
+static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
+ struct sk_buff *skb)
+{
+ if (nla_put_u32(skb, TCA_ACT_BPF_FD, prog->bpf_fd))
+ return -EMSGSIZE;
+
+ if (prog->bpf_name &&
+ nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
+ int bind, int ref)
+{
+ unsigned char *tp = skb_tail_pointer(skb);
+ struct tcf_bpf *prog = act->priv;
+ struct tc_act_bpf opt = {
+ .index = prog->tcf_index,
+ .refcnt = prog->tcf_refcnt - ref,
+ .bindcnt = prog->tcf_bindcnt - bind,
+ .action = prog->tcf_action,
+ };
+ struct tcf_t tm;
+ int ret;
+
+ if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (tcf_bpf_is_ebpf(prog))
+ ret = tcf_bpf_dump_ebpf_info(prog, skb);
+ else
+ ret = tcf_bpf_dump_bpf_info(prog, skb);
+ if (ret)
+ goto nla_put_failure;
+
+ tm.install = jiffies_to_clock_t(jiffies - prog->tcf_tm.install);
+ tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
+ tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
+
+ if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, tp);
+ return -1;
+}
+
+static const struct nla_policy act_bpf_policy[TCA_ACT_BPF_MAX + 1] = {
+ [TCA_ACT_BPF_PARMS] = { .len = sizeof(struct tc_act_bpf) },
+ [TCA_ACT_BPF_FD] = { .type = NLA_U32 },
+ [TCA_ACT_BPF_NAME] = { .type = NLA_NUL_STRING, .len = ACT_BPF_NAME_LEN },
+ [TCA_ACT_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_ACT_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
+{
+ struct sock_filter *bpf_ops;
+ struct sock_fprog_kern fprog_tmp;
+ struct bpf_prog *fp;
+ u16 bpf_size, bpf_num_ops;
+ int ret;
+
+ bpf_num_ops = nla_get_u16(tb[TCA_ACT_BPF_OPS_LEN]);
+ if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
+ return -EINVAL;
+
+ bpf_size = bpf_num_ops * sizeof(*bpf_ops);
+ if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
+ return -EINVAL;
+
+ bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL)
+ return -ENOMEM;
+
+ memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);
+
+ fprog_tmp.len = bpf_num_ops;
+ fprog_tmp.filter = bpf_ops;
+
+ ret = bpf_prog_create(&fp, &fprog_tmp);
+ if (ret < 0) {
+ kfree(bpf_ops);
+ return ret;
+ }
+
+ cfg->bpf_ops = bpf_ops;
+ cfg->bpf_num_ops = bpf_num_ops;
+ cfg->filter = fp;
+
+ return 0;
+}
+
+static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
+{
+ struct bpf_prog *fp;
+ char *name = NULL;
+ u32 bpf_fd;
+
+ bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]);
+
+ fp = bpf_prog_get(bpf_fd);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ if (fp->type != BPF_PROG_TYPE_SCHED_ACT) {
+ bpf_prog_put(fp);
+ return -EINVAL;
+ }
+
+ if (tb[TCA_ACT_BPF_NAME]) {
+ name = kmemdup(nla_data(tb[TCA_ACT_BPF_NAME]),
+ nla_len(tb[TCA_ACT_BPF_NAME]),
+ GFP_KERNEL);
+ if (!name) {
+ bpf_prog_put(fp);
+ return -ENOMEM;
+ }
+ }
+
+ cfg->bpf_fd = bpf_fd;
+ cfg->bpf_name = name;
+ cfg->filter = fp;
+
+ return 0;
+}
+
+static int tcf_bpf_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *act,
+ int replace, int bind)
+{
+ struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+ struct tc_act_bpf *parm;
+ struct tcf_bpf *prog;
+ struct tcf_bpf_cfg cfg;
+ bool is_bpf, is_ebpf;
+ int ret;
+
+ if (!nla)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy);
+ if (ret < 0)
+ return ret;
+
+ is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
+ is_ebpf = tb[TCA_ACT_BPF_FD];
+
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
+ !tb[TCA_ACT_BPF_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
+
+ memset(&cfg, 0, sizeof(cfg));
+
+ ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) :
+ tcf_bpf_init_from_efd(tb, &cfg);
+ if (ret < 0)
+ return ret;
+
+ if (!tcf_hash_check(parm->index, act, bind)) {
+ ret = tcf_hash_create(parm->index, est, act,
+ sizeof(*prog), bind);
+ if (ret < 0)
+ goto destroy_fp;
+
+ ret = ACT_P_CREATED;
+ } else {
+ /* Don't override defaults. */
+ if (bind)
+ goto destroy_fp;
+
+ tcf_hash_release(act, bind);
+ if (!replace) {
+ ret = -EEXIST;
+ goto destroy_fp;
+ }
+ }
+
+ prog = to_bpf(act);
+ spin_lock_bh(&prog->tcf_lock);
+
+ prog->bpf_ops = cfg.bpf_ops;
+ prog->bpf_name = cfg.bpf_name;
+
+ if (cfg.bpf_num_ops)
+ prog->bpf_num_ops = cfg.bpf_num_ops;
+ if (cfg.bpf_fd)
+ prog->bpf_fd = cfg.bpf_fd;
+
+ prog->tcf_action = parm->action;
+ prog->filter = cfg.filter;
+
+ spin_unlock_bh(&prog->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(act);
+
+ return ret;
+
+destroy_fp:
+ if (is_ebpf)
+ bpf_prog_put(cfg.filter);
+ else
+ bpf_prog_destroy(cfg.filter);
+
+ kfree(cfg.bpf_ops);
+ kfree(cfg.bpf_name);
+
+ return ret;
+}
+
+static void tcf_bpf_cleanup(struct tc_action *act, int bind)
+{
+ const struct tcf_bpf *prog = act->priv;
+
+ if (tcf_bpf_is_ebpf(prog))
+ bpf_prog_put(prog->filter);
+ else
+ bpf_prog_destroy(prog->filter);
+}
+
+static struct tc_action_ops act_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .type = TCA_ACT_BPF,
+ .owner = THIS_MODULE,
+ .act = tcf_bpf,
+ .dump = tcf_bpf_dump,
+ .cleanup = tcf_bpf_cleanup,
+ .init = tcf_bpf_init,
+};
+
+static int __init bpf_init_module(void)
+{
+ return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK);
+}
+
+static void __exit bpf_cleanup_module(void)
+{
+ tcf_unregister_action(&act_bpf_ops);
+}
+
+module_init(bpf_init_module);
+module_exit(bpf_cleanup_module);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("TC BPF based action");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
new file mode 100644
index 000000000..295d14bd6
--- /dev/null
+++ b/net/sched/act_connmark.c
@@ -0,0 +1,190 @@
+/*
+ * net/sched/act_connmark.c netfilter connmark retriever action
+ * skb mark is over-written
+ *
+ * Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_connmark.h>
+#include <net/tc_act/tc_connmark.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+#define CONNMARK_TAB_MASK 3
+
+static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_connmark_info *ca = a->priv;
+ struct nf_conn *c;
+ int proto;
+
+ spin_lock(&ca->tcf_lock);
+ ca->tcf_tm.lastuse = jiffies;
+ bstats_update(&ca->tcf_bstats, skb);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (skb->len < sizeof(struct iphdr))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (skb->len < sizeof(struct ipv6hdr))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ c = nf_ct_get(skb, &ctinfo);
+ if (c) {
+ skb->mark = c->mark;
+ /* using overlimits stats to count how many packets marked */
+ ca->tcf_qstats.overlimits++;
+ goto out;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, &tuple))
+ goto out;
+
+ thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple);
+ if (!thash)
+ goto out;
+
+ c = nf_ct_tuplehash_to_ctrack(thash);
+ /* using overlimits stats to count how many packets marked */
+ ca->tcf_qstats.overlimits++;
+ skb->mark = c->mark;
+ nf_ct_put(c);
+
+out:
+ spin_unlock(&ca->tcf_lock);
+ return ca->tcf_action;
+}
+
+static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
+ [TCA_CONNMARK_PARMS] = { .len = sizeof(struct tc_connmark) },
+};
+
+static int tcf_connmark_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_CONNMARK_MAX + 1];
+ struct tcf_connmark_info *ci;
+ struct tc_connmark *parm;
+ int ret = 0;
+
+ if (!nla)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy);
+ if (ret < 0)
+ return ret;
+
+ parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind);
+ if (ret)
+ return ret;
+
+ ci = to_connmark(a);
+ ci->tcf_action = parm->action;
+ ci->zone = parm->zone;
+
+ tcf_hash_insert(a);
+ ret = ACT_P_CREATED;
+ } else {
+ ci = to_connmark(a);
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ /* replacing action and zone */
+ ci->tcf_action = parm->action;
+ ci->zone = parm->zone;
+ }
+
+ return ret;
+}
+
+static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_connmark_info *ci = a->priv;
+
+ struct tc_connmark opt = {
+ .index = ci->tcf_index,
+ .refcnt = ci->tcf_refcnt - ref,
+ .bindcnt = ci->tcf_bindcnt - bind,
+ .action = ci->tcf_action,
+ .zone = ci->zone,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
+ if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_connmark_ops = {
+ .kind = "connmark",
+ .type = TCA_ACT_CONNMARK,
+ .owner = THIS_MODULE,
+ .act = tcf_connmark,
+ .dump = tcf_connmark_dump,
+ .init = tcf_connmark_init,
+};
+
+static int __init connmark_init_module(void)
+{
+ return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK);
+}
+
+static void __exit connmark_cleanup_module(void)
+{
+ tcf_unregister_action(&act_connmark_ops);
+}
+
+module_init(connmark_init_module);
+module_exit(connmark_cleanup_module);
+MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
+MODULE_DESCRIPTION("Connection tracking mark restoring");
+MODULE_LICENSE("GPL");
+
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 000000000..4cd5cf1ae
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,584 @@
+/*
+ * Checksum updating actions
+ *
+ * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/igmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+
+#include <net/act_api.h>
+
+#include <linux/tc_act/tc_csum.h>
+#include <net/tc_act/tc_csum.h>
+
+#define CSUM_TAB_MASK 15
+
+static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
+ [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
+};
+
+static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
+ struct tc_action *a, int ovr, int bind)
+{
+ struct nlattr *tb[TCA_CSUM_MAX + 1];
+ struct tc_csum *parm;
+ struct tcf_csum *p;
+ int ret = 0, err;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CSUM_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_CSUM_PARMS]);
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ p = to_tcf_csum(a);
+ spin_lock_bh(&p->tcf_lock);
+ p->tcf_action = parm->action;
+ p->update_flags = parm->update_flags;
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+
+ return ret;
+}
+
+/**
+ * tcf_csum_skb_nextlayer - Get next layer pointer
+ * @skb: sk_buff to use
+ * @ihl: previous summed headers length
+ * @ipl: complete packet length
+ * @jhl: next header length
+ *
+ * Check the expected next layer availability in the specified sk_buff.
+ * Return the next layer pointer if pass, NULL otherwise.
+ */
+static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl,
+ unsigned int jhl)
+{
+ int ntkoff = skb_network_offset(skb);
+ int hl = ihl + jhl;
+
+ if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
+ (skb_cloned(skb) &&
+ !skb_clone_writable(skb, hl + ntkoff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return NULL;
+ else
+ return (void *)(skb_network_header(skb) + ihl);
+}
+
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct icmphdr *icmph;
+
+ icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
+ if (icmph == NULL)
+ return 0;
+
+ icmph->checksum = 0;
+ skb->csum = csum_partial(icmph, ipl - ihl, 0);
+ icmph->checksum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct igmphdr *igmph;
+
+ igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
+ if (igmph == NULL)
+ return 0;
+
+ igmph->csum = 0;
+ skb->csum = csum_partial(igmph, ipl - ihl, 0);
+ igmph->csum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct icmp6hdr *icmp6h;
+ const struct ipv6hdr *ip6h;
+
+ icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
+ if (icmp6h == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ icmp6h->icmp6_cksum = 0;
+ skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_ICMPV6,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct tcphdr *tcph;
+ const struct iphdr *iph;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ iph = ip_hdr(skb);
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = tcp_v4_check(ipl - ihl,
+ iph->saddr, iph->daddr, skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct tcphdr *tcph;
+ const struct ipv6hdr *ip6h;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_TCP,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_udp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ const struct iphdr *iph;
+ u16 ul;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use iph->tot_len, or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ iph = ip_hdr(skb);
+ ul = ntohs(udph->len);
+
+ if (udplite || udph->check) {
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ul, iph->protocol,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_ipv6_udp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ const struct ipv6hdr *ip6h;
+ u16 ul;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ ip6h = ipv6_hdr(skb);
+ ul = ntohs(udph->len);
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
+ udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
+{
+ const struct iphdr *iph;
+ int ntkoff;
+
+ ntkoff = skb_network_offset(skb);
+
+ if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
+ goto fail;
+
+ iph = ip_hdr(skb);
+
+ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+ case IPPROTO_ICMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_IGMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
+ if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len), 0))
+ goto fail;
+ break;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len), 1))
+ goto fail;
+ break;
+ }
+
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto fail;
+
+ ip_send_check(ip_hdr(skb));
+ }
+
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
+ unsigned int ixhl, unsigned int *pl)
+{
+ int off, len, optlen;
+ unsigned char *xh = (void *)ip6xh;
+
+ off = sizeof(*ip6xh);
+ len = ixhl - off;
+
+ while (len > 1) {
+ switch (xh[off]) {
+ case IPV6_TLV_PAD1:
+ optlen = 1;
+ break;
+ case IPV6_TLV_JUMBO:
+ optlen = xh[off + 1] + 2;
+ if (optlen != 6 || len < 6 || (off & 3) != 2)
+ /* wrong jumbo option length/alignment */
+ return 0;
+ *pl = ntohl(*(__be32 *)(xh + off + 2));
+ goto done;
+ default:
+ optlen = xh[off + 1] + 2;
+ if (optlen > len)
+ /* ignore obscure options */
+ goto done;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+
+done:
+ return 1;
+}
+
+static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
+{
+ struct ipv6hdr *ip6h;
+ struct ipv6_opt_hdr *ip6xh;
+ unsigned int hl, ixhl;
+ unsigned int pl;
+ int ntkoff;
+ u8 nexthdr;
+
+ ntkoff = skb_network_offset(skb);
+
+ hl = sizeof(*ip6h);
+
+ if (!pskb_may_pull(skb, hl + ntkoff))
+ goto fail;
+
+ ip6h = ipv6_hdr(skb);
+
+ pl = ntohs(ip6h->payload_len);
+ nexthdr = ip6h->nexthdr;
+
+ do {
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ goto ignore_skb;
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
+ case NEXTHDR_DEST:
+ if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
+ goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
+ ixhl = ipv6_optlen(ip6xh);
+ if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
+ goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
+ if ((nexthdr == NEXTHDR_HOP) &&
+ !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
+ goto fail;
+ nexthdr = ip6xh->nexthdr;
+ hl += ixhl;
+ break;
+ case IPPROTO_ICMPV6:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv6_icmp(skb,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv6_tcp(skb,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv6_udp(skb, hl,
+ pl + sizeof(*ip6h), 0))
+ goto fail;
+ goto done;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv6_udp(skb, hl,
+ pl + sizeof(*ip6h), 1))
+ goto fail;
+ goto done;
+ default:
+ goto ignore_skb;
+ }
+ } while (pskb_may_pull(skb, hl + 1 + ntkoff));
+
+done:
+ignore_skb:
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum(struct sk_buff *skb,
+ const struct tc_action *a, struct tcf_result *res)
+{
+ struct tcf_csum *p = a->priv;
+ int action;
+ u32 update_flags;
+
+ spin_lock(&p->tcf_lock);
+ p->tcf_tm.lastuse = jiffies;
+ bstats_update(&p->tcf_bstats, skb);
+ action = p->tcf_action;
+ update_flags = p->update_flags;
+ spin_unlock(&p->tcf_lock);
+
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ switch (tc_skb_protocol(skb)) {
+ case cpu_to_be16(ETH_P_IP):
+ if (!tcf_csum_ipv4(skb, update_flags))
+ goto drop;
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
+ if (!tcf_csum_ipv6(skb, update_flags))
+ goto drop;
+ break;
+ }
+
+ return action;
+
+drop:
+ spin_lock(&p->tcf_lock);
+ p->tcf_qstats.drops++;
+ spin_unlock(&p->tcf_lock);
+ return TC_ACT_SHOT;
+}
+
+static int tcf_csum_dump(struct sk_buff *skb,
+ struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_csum *p = a->priv;
+ struct tc_csum opt = {
+ .update_flags = p->update_flags,
+ .index = p->tcf_index,
+ .action = p->tcf_action,
+ .refcnt = p->tcf_refcnt - ref,
+ .bindcnt = p->tcf_bindcnt - bind,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+ if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_csum_ops = {
+ .kind = "csum",
+ .type = TCA_ACT_CSUM,
+ .owner = THIS_MODULE,
+ .act = tcf_csum,
+ .dump = tcf_csum_dump,
+ .init = tcf_csum_init,
+};
+
+MODULE_DESCRIPTION("Checksum updating actions");
+MODULE_LICENSE("GPL");
+
+static int __init csum_init_module(void)
+{
+ return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
+}
+
+static void __exit csum_cleanup_module(void)
+{
+ tcf_unregister_action(&act_csum_ops);
+}
+
+module_init(csum_init_module);
+module_exit(csum_cleanup_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
new file mode 100644
index 000000000..7fffc2272
--- /dev/null
+++ b/net/sched/act_gact.c
@@ -0,0 +1,209 @@
+/*
+ * net/sched/act_gact.c Generic actions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2002-4)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_gact.h>
+#include <net/tc_act/tc_gact.h>
+
+#define GACT_TAB_MASK 15
+
+#ifdef CONFIG_GACT_PROB
+static int gact_net_rand(struct tcf_gact *gact)
+{
+ if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
+ return gact->tcf_action;
+ return gact->tcfg_paction;
+}
+
+static int gact_determ(struct tcf_gact *gact)
+{
+ if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
+ return gact->tcf_action;
+ return gact->tcfg_paction;
+}
+
+typedef int (*g_rand)(struct tcf_gact *gact);
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
+#endif /* CONFIG_GACT_PROB */
+
+static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
+ [TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) },
+ [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
+};
+
+static int tcf_gact_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_GACT_MAX + 1];
+ struct tc_gact *parm;
+ struct tcf_gact *gact;
+ int ret = 0;
+ int err;
+#ifdef CONFIG_GACT_PROB
+ struct tc_gact_p *p_parm = NULL;
+#endif
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GACT_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_GACT_PARMS]);
+
+#ifndef CONFIG_GACT_PROB
+ if (tb[TCA_GACT_PROB] != NULL)
+ return -EOPNOTSUPP;
+#else
+ if (tb[TCA_GACT_PROB]) {
+ p_parm = nla_data(tb[TCA_GACT_PROB]);
+ if (p_parm->ptype >= MAX_RAND)
+ return -EINVAL;
+ }
+#endif
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ gact = to_gact(a);
+
+ spin_lock_bh(&gact->tcf_lock);
+ gact->tcf_action = parm->action;
+#ifdef CONFIG_GACT_PROB
+ if (p_parm) {
+ gact->tcfg_paction = p_parm->paction;
+ gact->tcfg_pval = p_parm->pval;
+ gact->tcfg_ptype = p_parm->ptype;
+ }
+#endif
+ spin_unlock_bh(&gact->tcf_lock);
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+}
+
+static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_gact *gact = a->priv;
+ int action = TC_ACT_SHOT;
+
+ spin_lock(&gact->tcf_lock);
+#ifdef CONFIG_GACT_PROB
+ if (gact->tcfg_ptype)
+ action = gact_rand[gact->tcfg_ptype](gact);
+ else
+ action = gact->tcf_action;
+#else
+ action = gact->tcf_action;
+#endif
+ gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
+ gact->tcf_bstats.packets++;
+ if (action == TC_ACT_SHOT)
+ gact->tcf_qstats.drops++;
+ gact->tcf_tm.lastuse = jiffies;
+ spin_unlock(&gact->tcf_lock);
+
+ return action;
+}
+
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_gact *gact = a->priv;
+ struct tc_gact opt = {
+ .index = gact->tcf_index,
+ .refcnt = gact->tcf_refcnt - ref,
+ .bindcnt = gact->tcf_bindcnt - bind,
+ .action = gact->tcf_action,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+#ifdef CONFIG_GACT_PROB
+ if (gact->tcfg_ptype) {
+ struct tc_gact_p p_opt = {
+ .paction = gact->tcfg_paction,
+ .pval = gact->tcfg_pval,
+ .ptype = gact->tcfg_ptype,
+ };
+
+ if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
+ goto nla_put_failure;
+ }
+#endif
+ t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
+ if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_gact_ops = {
+ .kind = "gact",
+ .type = TCA_ACT_GACT,
+ .owner = THIS_MODULE,
+ .act = tcf_gact,
+ .dump = tcf_gact_dump,
+ .init = tcf_gact_init,
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Classifier actions");
+MODULE_LICENSE("GPL");
+
+static int __init gact_init_module(void)
+{
+#ifdef CONFIG_GACT_PROB
+ pr_info("GACT probability on\n");
+#else
+ pr_info("GACT probability NOT on\n");
+#endif
+ return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
+}
+
+static void __exit gact_cleanup_module(void)
+{
+ tcf_unregister_action(&act_gact_ops);
+}
+
+module_init(gact_init_module);
+module_exit(gact_cleanup_module);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
new file mode 100644
index 000000000..cbc8dd7dd
--- /dev/null
+++ b/net/sched/act_ipt.c
@@ -0,0 +1,311 @@
+/*
+ * net/sched/act_ipt.c iptables target interface
+ *
+ *TODO: Add other tables. For now we only support the ipv4 table targets
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright: Jamal Hadi Salim (2002-13)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_ipt.h>
+#include <net/tc_act/tc_ipt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+
+#define IPT_TAB_MASK 15
+
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
+{
+ struct xt_tgchk_param par;
+ struct xt_target *target;
+ int ret = 0;
+
+ target = xt_request_find_target(AF_INET, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target))
+ return PTR_ERR(target);
+
+ t->u.kernel.target = target;
+ par.table = table;
+ par.entryinfo = NULL;
+ par.target = target;
+ par.targinfo = t->data;
+ par.hook_mask = hook;
+ par.family = NFPROTO_IPV4;
+
+ ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
+ if (ret < 0) {
+ module_put(t->u.kernel.target->me);
+ return ret;
+ }
+ return 0;
+}
+
+static void ipt_destroy_target(struct xt_entry_target *t)
+{
+ struct xt_tgdtor_param par = {
+ .target = t->u.kernel.target,
+ .targinfo = t->data,
+ };
+ if (par.target->destroy != NULL)
+ par.target->destroy(&par);
+ module_put(par.target->me);
+}
+
+static void tcf_ipt_release(struct tc_action *a, int bind)
+{
+ struct tcf_ipt *ipt = to_ipt(a);
+ ipt_destroy_target(ipt->tcfi_t);
+ kfree(ipt->tcfi_tname);
+ kfree(ipt->tcfi_t);
+}
+
+static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
+ [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_IPT_HOOK] = { .type = NLA_U32 },
+ [TCA_IPT_INDEX] = { .type = NLA_U32 },
+ [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
+};
+
+static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ struct tc_action *a, int ovr, int bind)
+{
+ struct nlattr *tb[TCA_IPT_MAX + 1];
+ struct tcf_ipt *ipt;
+ struct xt_entry_target *td, *t;
+ char *tname;
+ int ret = 0, err;
+ u32 hook = 0;
+ u32 index = 0;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_IPT_HOOK] == NULL)
+ return -EINVAL;
+ if (tb[TCA_IPT_TARG] == NULL)
+ return -EINVAL;
+
+ td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+ if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
+ return -EINVAL;
+
+ if (tb[TCA_IPT_INDEX] != NULL)
+ index = nla_get_u32(tb[TCA_IPT_INDEX]);
+
+ if (!tcf_hash_check(index, a, bind) ) {
+ ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)/* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+
+ if (!ovr)
+ return -EEXIST;
+ }
+ ipt = to_ipt(a);
+
+ hook = nla_get_u32(tb[TCA_IPT_HOOK]);
+
+ err = -ENOMEM;
+ tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
+ if (unlikely(!tname))
+ goto err1;
+ if (tb[TCA_IPT_TABLE] == NULL ||
+ nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
+ strcpy(tname, "mangle");
+
+ t = kmemdup(td, td->u.target_size, GFP_KERNEL);
+ if (unlikely(!t))
+ goto err2;
+
+ err = ipt_init_target(t, tname, hook);
+ if (err < 0)
+ goto err3;
+
+ spin_lock_bh(&ipt->tcf_lock);
+ if (ret != ACT_P_CREATED) {
+ ipt_destroy_target(ipt->tcfi_t);
+ kfree(ipt->tcfi_tname);
+ kfree(ipt->tcfi_t);
+ }
+ ipt->tcfi_tname = tname;
+ ipt->tcfi_t = t;
+ ipt->tcfi_hook = hook;
+ spin_unlock_bh(&ipt->tcf_lock);
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+
+err3:
+ kfree(t);
+err2:
+ kfree(tname);
+err1:
+ if (ret == ACT_P_CREATED)
+ tcf_hash_cleanup(a, est);
+ return err;
+}
+
+static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ int ret = 0, result = 0;
+ struct tcf_ipt *ipt = a->priv;
+ struct xt_action_param par;
+
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return TC_ACT_UNSPEC;
+
+ spin_lock(&ipt->tcf_lock);
+
+ ipt->tcf_tm.lastuse = jiffies;
+ bstats_update(&ipt->tcf_bstats, skb);
+
+ /* yes, we have to worry about both in and out dev
+ * worry later - danger - this API seems to have changed
+ * from earlier kernels
+ */
+ par.in = skb->dev;
+ par.out = NULL;
+ par.hooknum = ipt->tcfi_hook;
+ par.target = ipt->tcfi_t->u.kernel.target;
+ par.targinfo = ipt->tcfi_t->data;
+ ret = par.target->target(skb, &par);
+
+ switch (ret) {
+ case NF_ACCEPT:
+ result = TC_ACT_OK;
+ break;
+ case NF_DROP:
+ result = TC_ACT_SHOT;
+ ipt->tcf_qstats.drops++;
+ break;
+ case XT_CONTINUE:
+ result = TC_ACT_PIPE;
+ break;
+ default:
+ net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
+ ret);
+ result = TC_POLICE_OK;
+ break;
+ }
+ spin_unlock(&ipt->tcf_lock);
+ return result;
+
+}
+
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ipt *ipt = a->priv;
+ struct xt_entry_target *t;
+ struct tcf_t tm;
+ struct tc_cnt c;
+
+ /* for simple targets kernel size == user size
+ * user name = target name
+ * for foolproof you need to not assume this
+ */
+
+ t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
+ if (unlikely(!t))
+ goto nla_put_failure;
+
+ c.bindcnt = ipt->tcf_bindcnt - bind;
+ c.refcnt = ipt->tcf_refcnt - ref;
+ strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
+
+ if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
+ nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
+ nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
+ nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
+ nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
+ goto nla_put_failure;
+ tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
+ tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
+ tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
+ if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
+ goto nla_put_failure;
+ kfree(t);
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ kfree(t);
+ return -1;
+}
+
+static struct tc_action_ops act_ipt_ops = {
+ .kind = "ipt",
+ .type = TCA_ACT_IPT,
+ .owner = THIS_MODULE,
+ .act = tcf_ipt,
+ .dump = tcf_ipt_dump,
+ .cleanup = tcf_ipt_release,
+ .init = tcf_ipt_init,
+};
+
+static struct tc_action_ops act_xt_ops = {
+ .kind = "xt",
+ .type = TCA_ACT_XT,
+ .owner = THIS_MODULE,
+ .act = tcf_ipt,
+ .dump = tcf_ipt_dump,
+ .cleanup = tcf_ipt_release,
+ .init = tcf_ipt_init,
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
+MODULE_DESCRIPTION("Iptables target actions");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("act_xt");
+
+static int __init ipt_init_module(void)
+{
+ int ret1, ret2;
+
+ ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+ if (ret1 < 0)
+ printk("Failed to load xt action\n");
+ ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+ if (ret2 < 0)
+ printk("Failed to load ipt action\n");
+
+ if (ret1 < 0 && ret2 < 0) {
+ return ret1;
+ } else
+ return 0;
+}
+
+static void __exit ipt_cleanup_module(void)
+{
+ tcf_unregister_action(&act_xt_ops);
+ tcf_unregister_action(&act_ipt_ops);
+}
+
+module_init(ipt_init_module);
+module_exit(ipt_cleanup_module);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
new file mode 100644
index 000000000..3f63ceac8
--- /dev/null
+++ b/net/sched/act_mirred.c
@@ -0,0 +1,267 @@
+/*
+ * net/sched/act_mirred.c packet mirroring and redirect actions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2002-4)
+ *
+ * TODO: Add ingress support (and socket redirect support)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_mirred.h>
+
+#include <linux/if_arp.h>
+
+#define MIRRED_TAB_MASK 7
+static LIST_HEAD(mirred_list);
+
+static void tcf_mirred_release(struct tc_action *a, int bind)
+{
+ struct tcf_mirred *m = to_mirred(a);
+ list_del(&m->tcfm_list);
+ if (m->tcfm_dev)
+ dev_put(m->tcfm_dev);
+}
+
+static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
+ [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
+};
+
+static int tcf_mirred_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
+{
+ struct nlattr *tb[TCA_MIRRED_MAX + 1];
+ struct tc_mirred *parm;
+ struct tcf_mirred *m;
+ struct net_device *dev;
+ int ret, ok_push = 0;
+
+ if (nla == NULL)
+ return -EINVAL;
+ ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy);
+ if (ret < 0)
+ return ret;
+ if (tb[TCA_MIRRED_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_MIRRED_PARMS]);
+ switch (parm->eaction) {
+ case TCA_EGRESS_MIRROR:
+ case TCA_EGRESS_REDIR:
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (parm->ifindex) {
+ dev = __dev_get_by_index(net, parm->ifindex);
+ if (dev == NULL)
+ return -ENODEV;
+ switch (dev->type) {
+ case ARPHRD_TUNNEL:
+ case ARPHRD_TUNNEL6:
+ case ARPHRD_SIT:
+ case ARPHRD_IPGRE:
+ case ARPHRD_VOID:
+ case ARPHRD_NONE:
+ ok_push = 0;
+ break;
+ default:
+ ok_push = 1;
+ break;
+ }
+ } else {
+ dev = NULL;
+ }
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ if (dev == NULL)
+ return -EINVAL;
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (!ovr) {
+ tcf_hash_release(a, bind);
+ return -EEXIST;
+ }
+ }
+ m = to_mirred(a);
+
+ spin_lock_bh(&m->tcf_lock);
+ m->tcf_action = parm->action;
+ m->tcfm_eaction = parm->eaction;
+ if (dev != NULL) {
+ m->tcfm_ifindex = parm->ifindex;
+ if (ret != ACT_P_CREATED)
+ dev_put(m->tcfm_dev);
+ dev_hold(dev);
+ m->tcfm_dev = dev;
+ m->tcfm_ok_push = ok_push;
+ }
+ spin_unlock_bh(&m->tcf_lock);
+ if (ret == ACT_P_CREATED) {
+ list_add(&m->tcfm_list, &mirred_list);
+ tcf_hash_insert(a);
+ }
+
+ return ret;
+}
+
+static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_mirred *m = a->priv;
+ struct net_device *dev;
+ struct sk_buff *skb2;
+ u32 at;
+ int retval, err = 1;
+
+ spin_lock(&m->tcf_lock);
+ m->tcf_tm.lastuse = jiffies;
+ bstats_update(&m->tcf_bstats, skb);
+
+ dev = m->tcfm_dev;
+ if (!dev) {
+ printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+ goto out;
+ }
+
+ if (!(dev->flags & IFF_UP)) {
+ net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
+ dev->name);
+ goto out;
+ }
+
+ at = G_TC_AT(skb->tc_verd);
+ skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
+ if (skb2 == NULL)
+ goto out;
+
+ if (!(at & AT_EGRESS)) {
+ if (m->tcfm_ok_push)
+ skb_push(skb2, skb->mac_len);
+ }
+
+ /* mirror is always swallowed */
+ if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+
+ skb2->skb_iif = skb->dev->ifindex;
+ skb2->dev = dev;
+ err = dev_queue_xmit(skb2);
+
+out:
+ if (err) {
+ m->tcf_qstats.overlimits++;
+ if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+ retval = TC_ACT_SHOT;
+ else
+ retval = m->tcf_action;
+ } else
+ retval = m->tcf_action;
+ spin_unlock(&m->tcf_lock);
+
+ return retval;
+}
+
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_mirred *m = a->priv;
+ struct tc_mirred opt = {
+ .index = m->tcf_index,
+ .action = m->tcf_action,
+ .refcnt = m->tcf_refcnt - ref,
+ .bindcnt = m->tcf_bindcnt - bind,
+ .eaction = m->tcfm_eaction,
+ .ifindex = m->tcfm_ifindex,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
+ if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int mirred_device_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct tcf_mirred *m;
+
+ if (event == NETDEV_UNREGISTER)
+ list_for_each_entry(m, &mirred_list, tcfm_list) {
+ spin_lock_bh(&m->tcf_lock);
+ if (m->tcfm_dev == dev) {
+ dev_put(dev);
+ m->tcfm_dev = NULL;
+ }
+ spin_unlock_bh(&m->tcf_lock);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mirred_device_notifier = {
+ .notifier_call = mirred_device_event,
+};
+
+static struct tc_action_ops act_mirred_ops = {
+ .kind = "mirred",
+ .type = TCA_ACT_MIRRED,
+ .owner = THIS_MODULE,
+ .act = tcf_mirred,
+ .dump = tcf_mirred_dump,
+ .cleanup = tcf_mirred_release,
+ .init = tcf_mirred_init,
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002)");
+MODULE_DESCRIPTION("Device Mirror/redirect actions");
+MODULE_LICENSE("GPL");
+
+static int __init mirred_init_module(void)
+{
+ int err = register_netdevice_notifier(&mirred_device_notifier);
+ if (err)
+ return err;
+
+ pr_info("Mirror/redirect action on\n");
+ return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
+}
+
+static void __exit mirred_cleanup_module(void)
+{
+ tcf_unregister_action(&act_mirred_ops);
+ unregister_netdevice_notifier(&mirred_device_notifier);
+}
+
+module_init(mirred_init_module);
+module_exit(mirred_cleanup_module);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
new file mode 100644
index 000000000..270a030d5
--- /dev/null
+++ b/net/sched/act_nat.c
@@ -0,0 +1,306 @@
+/*
+ * Stateless NAT actions
+ *
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tc_act/tc_nat.h>
+#include <net/act_api.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/tc_act/tc_nat.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+
+#define NAT_TAB_MASK 15
+
+static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
+ [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
+};
+
+static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
+ struct tc_action *a, int ovr, int bind)
+{
+ struct nlattr *tb[TCA_NAT_MAX + 1];
+ struct tc_nat *parm;
+ int ret = 0, err;
+ struct tcf_nat *p;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_NAT_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_NAT_PARMS]);
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+ p = to_tcf_nat(a);
+
+ spin_lock_bh(&p->tcf_lock);
+ p->old_addr = parm->old_addr;
+ p->new_addr = parm->new_addr;
+ p->mask = parm->mask;
+ p->flags = parm->flags;
+
+ p->tcf_action = parm->action;
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+
+ return ret;
+}
+
+static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_nat *p = a->priv;
+ struct iphdr *iph;
+ __be32 old_addr;
+ __be32 new_addr;
+ __be32 mask;
+ __be32 addr;
+ int egress;
+ int action;
+ int ihl;
+ int noff;
+
+ spin_lock(&p->tcf_lock);
+
+ p->tcf_tm.lastuse = jiffies;
+ old_addr = p->old_addr;
+ new_addr = p->new_addr;
+ mask = p->mask;
+ egress = p->flags & TCA_NAT_FLAG_EGRESS;
+ action = p->tcf_action;
+
+ bstats_update(&p->tcf_bstats, skb);
+
+ spin_unlock(&p->tcf_lock);
+
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ noff = skb_network_offset(skb);
+ if (!pskb_may_pull(skb, sizeof(*iph) + noff))
+ goto drop;
+
+ iph = ip_hdr(skb);
+
+ if (egress)
+ addr = iph->saddr;
+ else
+ addr = iph->daddr;
+
+ if (!((old_addr ^ addr) & mask)) {
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, sizeof(*iph) + noff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto drop;
+
+ new_addr &= mask;
+ new_addr |= addr & ~mask;
+
+ /* Rewrite IP header */
+ iph = ip_hdr(skb);
+ if (egress)
+ iph->saddr = new_addr;
+ else
+ iph->daddr = new_addr;
+
+ csum_replace4(&iph->check, addr, new_addr);
+ } else if ((iph->frag_off & htons(IP_OFFSET)) ||
+ iph->protocol != IPPROTO_ICMP) {
+ goto out;
+ }
+
+ ihl = iph->ihl * 4;
+
+ /* It would be nice to share code with stateful NAT. */
+ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+ case IPPROTO_TCP:
+ {
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
+ (skb_cloned(skb) &&
+ !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ goto drop;
+
+ tcph = (void *)(skb_network_header(skb) + ihl);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
+ break;
+ }
+ case IPPROTO_UDP:
+ {
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
+ (skb_cloned(skb) &&
+ !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ goto drop;
+
+ udph = (void *)(skb_network_header(skb) + ihl);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, 1);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+ break;
+ }
+ case IPPROTO_ICMP:
+ {
+ struct icmphdr *icmph;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED) &&
+ (icmph->type != ICMP_PARAMETERPROB))
+ break;
+
+ if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
+ noff))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+ iph = (void *)(icmph + 1);
+ if (egress)
+ addr = iph->daddr;
+ else
+ addr = iph->saddr;
+
+ if ((old_addr ^ addr) & mask)
+ break;
+
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, ihl + sizeof(*icmph) +
+ sizeof(*iph) + noff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto drop;
+
+ icmph = (void *)(skb_network_header(skb) + ihl);
+ iph = (void *)(icmph + 1);
+
+ new_addr &= mask;
+ new_addr |= addr & ~mask;
+
+ /* XXX Fix up the inner checksums. */
+ if (egress)
+ iph->daddr = new_addr;
+ else
+ iph->saddr = new_addr;
+
+ inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
+ 0);
+ break;
+ }
+ default:
+ break;
+ }
+
+out:
+ return action;
+
+drop:
+ spin_lock(&p->tcf_lock);
+ p->tcf_qstats.drops++;
+ spin_unlock(&p->tcf_lock);
+ return TC_ACT_SHOT;
+}
+
+static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_nat *p = a->priv;
+ struct tc_nat opt = {
+ .old_addr = p->old_addr,
+ .new_addr = p->new_addr,
+ .mask = p->mask,
+ .flags = p->flags,
+
+ .index = p->tcf_index,
+ .action = p->tcf_action,
+ .refcnt = p->tcf_refcnt - ref,
+ .bindcnt = p->tcf_bindcnt - bind,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+ if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_nat_ops = {
+ .kind = "nat",
+ .type = TCA_ACT_NAT,
+ .owner = THIS_MODULE,
+ .act = tcf_nat,
+ .dump = tcf_nat_dump,
+ .init = tcf_nat_init,
+};
+
+MODULE_DESCRIPTION("Stateless NAT actions");
+MODULE_LICENSE("GPL");
+
+static int __init nat_init_module(void)
+{
+ return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
+}
+
+static void __exit nat_cleanup_module(void)
+{
+ tcf_unregister_action(&act_nat_ops);
+}
+
+module_init(nat_init_module);
+module_exit(nat_cleanup_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
new file mode 100644
index 000000000..59649d588
--- /dev/null
+++ b/net/sched/act_pedit.c
@@ -0,0 +1,243 @@
+/*
+ * net/sched/act_pedit.c Generic packet editor
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2002-4)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_pedit.h>
+
+#define PEDIT_TAB_MASK 15
+
+static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
+ [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
+};
+
+static int tcf_pedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_PEDIT_MAX + 1];
+ struct tc_pedit *parm;
+ int ret = 0, err;
+ struct tcf_pedit *p;
+ struct tc_pedit_key *keys = NULL;
+ int ksize;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_PEDIT_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_PEDIT_PARMS]);
+ ksize = parm->nkeys * sizeof(struct tc_pedit_key);
+ if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
+ return -EINVAL;
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ if (!parm->nkeys)
+ return -EINVAL;
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
+ if (ret)
+ return ret;
+ p = to_pedit(a);
+ keys = kmalloc(ksize, GFP_KERNEL);
+ if (keys == NULL) {
+ tcf_hash_cleanup(a, est);
+ return -ENOMEM;
+ }
+ ret = ACT_P_CREATED;
+ } else {
+ p = to_pedit(a);
+ tcf_hash_release(a, bind);
+ if (bind)
+ return 0;
+ if (!ovr)
+ return -EEXIST;
+
+ if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
+ keys = kmalloc(ksize, GFP_KERNEL);
+ if (keys == NULL)
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock_bh(&p->tcf_lock);
+ p->tcfp_flags = parm->flags;
+ p->tcf_action = parm->action;
+ if (keys) {
+ kfree(p->tcfp_keys);
+ p->tcfp_keys = keys;
+ p->tcfp_nkeys = parm->nkeys;
+ }
+ memcpy(p->tcfp_keys, parm->keys, ksize);
+ spin_unlock_bh(&p->tcf_lock);
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+}
+
+static void tcf_pedit_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_pedit *p = a->priv;
+ struct tc_pedit_key *keys = p->tcfp_keys;
+ kfree(keys);
+}
+
+static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_pedit *p = a->priv;
+ int i, munged = 0;
+ unsigned int off;
+
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return p->tcf_action;
+
+ off = skb_network_offset(skb);
+
+ spin_lock(&p->tcf_lock);
+
+ p->tcf_tm.lastuse = jiffies;
+
+ if (p->tcfp_nkeys > 0) {
+ struct tc_pedit_key *tkey = p->tcfp_keys;
+
+ for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
+ u32 *ptr, _data;
+ int offset = tkey->off;
+
+ if (tkey->offmask) {
+ char *d, _d;
+
+ d = skb_header_pointer(skb, off + tkey->at, 1,
+ &_d);
+ if (!d)
+ goto bad;
+ offset += (*d & tkey->offmask) >> tkey->shift;
+ }
+
+ if (offset % 4) {
+ pr_info("tc filter pedit"
+ " offset must be on 32 bit boundaries\n");
+ goto bad;
+ }
+ if (offset > 0 && offset > skb->len) {
+ pr_info("tc filter pedit"
+ " offset %d can't exceed pkt length %d\n",
+ offset, skb->len);
+ goto bad;
+ }
+
+ ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+ if (!ptr)
+ goto bad;
+ /* just do it, baby */
+ *ptr = ((*ptr & tkey->mask) ^ tkey->val);
+ if (ptr == &_data)
+ skb_store_bits(skb, off + offset, ptr, 4);
+ munged++;
+ }
+
+ if (munged)
+ skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
+ goto done;
+ } else
+ WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+
+bad:
+ p->tcf_qstats.overlimits++;
+done:
+ bstats_update(&p->tcf_bstats, skb);
+ spin_unlock(&p->tcf_lock);
+ return p->tcf_action;
+}
+
+static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_pedit *p = a->priv;
+ struct tc_pedit *opt;
+ struct tcf_t t;
+ int s;
+
+ s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
+
+ /* netlink spinlocks held above us - must use ATOMIC */
+ opt = kzalloc(s, GFP_ATOMIC);
+ if (unlikely(!opt))
+ return -ENOBUFS;
+
+ memcpy(opt->keys, p->tcfp_keys,
+ p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+ opt->index = p->tcf_index;
+ opt->nkeys = p->tcfp_nkeys;
+ opt->flags = p->tcfp_flags;
+ opt->action = p->tcf_action;
+ opt->refcnt = p->tcf_refcnt - ref;
+ opt->bindcnt = p->tcf_bindcnt - bind;
+
+ if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+ if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ kfree(opt);
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ kfree(opt);
+ return -1;
+}
+
+static struct tc_action_ops act_pedit_ops = {
+ .kind = "pedit",
+ .type = TCA_ACT_PEDIT,
+ .owner = THIS_MODULE,
+ .act = tcf_pedit,
+ .dump = tcf_pedit_dump,
+ .cleanup = tcf_pedit_cleanup,
+ .init = tcf_pedit_init,
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Packet Editor actions");
+MODULE_LICENSE("GPL");
+
+static int __init pedit_init_module(void)
+{
+ return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
+}
+
+static void __exit pedit_cleanup_module(void)
+{
+ tcf_unregister_action(&act_pedit_ops);
+}
+
+module_init(pedit_init_module);
+module_exit(pedit_cleanup_module);
+
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
new file mode 100644
index 000000000..9a1c42a43
--- /dev/null
+++ b/net/sched/act_police.c
@@ -0,0 +1,372 @@
+/*
+ * net/sched/act_police.c Input police filter
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * J Hadi Salim (action changes)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+struct tcf_police {
+ struct tcf_common common;
+ int tcfp_result;
+ u32 tcfp_ewma_rate;
+ s64 tcfp_burst;
+ u32 tcfp_mtu;
+ s64 tcfp_toks;
+ s64 tcfp_ptoks;
+ s64 tcfp_mtu_ptoks;
+ s64 tcfp_t_c;
+ struct psched_ratecfg rate;
+ bool rate_present;
+ struct psched_ratecfg peak;
+ bool peak_present;
+};
+#define to_police(pc) \
+ container_of(pc, struct tcf_police, common)
+
+#define POL_TAB_MASK 15
+
+/* old policer structure from before tc actions */
+struct tc_police_compat {
+ u32 index;
+ int action;
+ u32 limit;
+ u32 burst;
+ u32 mtu;
+ struct tc_ratespec rate;
+ struct tc_ratespec peakrate;
+};
+
+/* Each policer is serialized by its individual spinlock */
+
+static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
+ int type, struct tc_action *a)
+{
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct hlist_head *head;
+ struct tcf_common *p;
+ int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+ struct nlattr *nest;
+
+ spin_lock_bh(&hinfo->lock);
+
+ s_i = cb->args[0];
+
+ for (i = 0; i < (POL_TAB_MASK + 1); i++) {
+ head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
+
+ hlist_for_each_entry_rcu(p, head, tcfc_head) {
+ index++;
+ if (index < s_i)
+ continue;
+ a->priv = p;
+ a->order = index;
+ nest = nla_nest_start(skb, a->order);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (type == RTM_DELACTION)
+ err = tcf_action_dump_1(skb, a, 0, 1);
+ else
+ err = tcf_action_dump_1(skb, a, 0, 0);
+ if (err < 0) {
+ index--;
+ nla_nest_cancel(skb, nest);
+ goto done;
+ }
+ nla_nest_end(skb, nest);
+ n_i++;
+ }
+ }
+done:
+ spin_unlock_bh(&hinfo->lock);
+ if (n_i)
+ cb->args[0] += n_i;
+ return n_i;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ goto done;
+}
+
+static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
+ [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
+ [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
+ [TCA_POLICE_AVRATE] = { .type = NLA_U32 },
+ [TCA_POLICE_RESULT] = { .type = NLA_U32 },
+};
+
+static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ unsigned int h;
+ int ret = 0, err;
+ struct nlattr *tb[TCA_POLICE_MAX + 1];
+ struct tc_police *parm;
+ struct tcf_police *police;
+ struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
+ struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ int size;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_POLICE_TBF] == NULL)
+ return -EINVAL;
+ size = nla_len(tb[TCA_POLICE_TBF]);
+ if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
+ return -EINVAL;
+ parm = nla_data(tb[TCA_POLICE_TBF]);
+
+ if (parm->index) {
+ if (tcf_hash_search(a, parm->index)) {
+ police = to_police(a->priv);
+ if (bind) {
+ police->tcf_bindcnt += 1;
+ police->tcf_refcnt += 1;
+ return 0;
+ }
+ if (ovr)
+ goto override;
+ /* not replacing */
+ return -EEXIST;
+ }
+ }
+
+ police = kzalloc(sizeof(*police), GFP_KERNEL);
+ if (police == NULL)
+ return -ENOMEM;
+ ret = ACT_P_CREATED;
+ police->tcf_refcnt = 1;
+ spin_lock_init(&police->tcf_lock);
+ if (bind)
+ police->tcf_bindcnt = 1;
+override:
+ if (parm->rate.rate) {
+ err = -ENOMEM;
+ R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
+ if (R_tab == NULL)
+ goto failure;
+
+ if (parm->peakrate.rate) {
+ P_tab = qdisc_get_rtab(&parm->peakrate,
+ tb[TCA_POLICE_PEAKRATE]);
+ if (P_tab == NULL)
+ goto failure;
+ }
+ }
+
+ spin_lock_bh(&police->tcf_lock);
+ if (est) {
+ err = gen_replace_estimator(&police->tcf_bstats, NULL,
+ &police->tcf_rate_est,
+ &police->tcf_lock, est);
+ if (err)
+ goto failure_unlock;
+ } else if (tb[TCA_POLICE_AVRATE] &&
+ (ret == ACT_P_CREATED ||
+ !gen_estimator_active(&police->tcf_bstats,
+ &police->tcf_rate_est))) {
+ err = -EINVAL;
+ goto failure_unlock;
+ }
+
+ /* No failure allowed after this point */
+ police->tcfp_mtu = parm->mtu;
+ if (police->tcfp_mtu == 0) {
+ police->tcfp_mtu = ~0;
+ if (R_tab)
+ police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+ }
+ if (R_tab) {
+ police->rate_present = true;
+ psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+ qdisc_put_rtab(R_tab);
+ } else {
+ police->rate_present = false;
+ }
+ if (P_tab) {
+ police->peak_present = true;
+ psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+ qdisc_put_rtab(P_tab);
+ } else {
+ police->peak_present = false;
+ }
+
+ if (tb[TCA_POLICE_RESULT])
+ police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+ police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+ police->tcfp_toks = police->tcfp_burst;
+ if (police->peak_present) {
+ police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
+ police->tcfp_mtu);
+ police->tcfp_ptoks = police->tcfp_mtu_ptoks;
+ }
+ police->tcf_action = parm->action;
+
+ if (tb[TCA_POLICE_AVRATE])
+ police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+
+ spin_unlock_bh(&police->tcf_lock);
+ if (ret != ACT_P_CREATED)
+ return ret;
+
+ police->tcfp_t_c = ktime_get_ns();
+ police->tcf_index = parm->index ? parm->index :
+ tcf_hash_new_index(hinfo);
+ h = tcf_hash(police->tcf_index, POL_TAB_MASK);
+ spin_lock_bh(&hinfo->lock);
+ hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
+ spin_unlock_bh(&hinfo->lock);
+
+ a->priv = police;
+ return ret;
+
+failure_unlock:
+ spin_unlock_bh(&police->tcf_lock);
+failure:
+ qdisc_put_rtab(P_tab);
+ qdisc_put_rtab(R_tab);
+ if (ret == ACT_P_CREATED)
+ kfree(police);
+ return err;
+}
+
+static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_police *police = a->priv;
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
+
+ spin_lock(&police->tcf_lock);
+
+ bstats_update(&police->tcf_bstats, skb);
+
+ if (police->tcfp_ewma_rate &&
+ police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+ }
+
+ if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
+ if (!police->rate_present) {
+ spin_unlock(&police->tcf_lock);
+ return police->tcfp_result;
+ }
+
+ now = ktime_get_ns();
+ toks = min_t(s64, now - police->tcfp_t_c,
+ police->tcfp_burst);
+ if (police->peak_present) {
+ ptoks = toks + police->tcfp_ptoks;
+ if (ptoks > police->tcfp_mtu_ptoks)
+ ptoks = police->tcfp_mtu_ptoks;
+ ptoks -= (s64) psched_l2t_ns(&police->peak,
+ qdisc_pkt_len(skb));
+ }
+ toks += police->tcfp_toks;
+ if (toks > police->tcfp_burst)
+ toks = police->tcfp_burst;
+ toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
+ if ((toks|ptoks) >= 0) {
+ police->tcfp_t_c = now;
+ police->tcfp_toks = toks;
+ police->tcfp_ptoks = ptoks;
+ spin_unlock(&police->tcf_lock);
+ return police->tcfp_result;
+ }
+ }
+
+ police->tcf_qstats.overlimits++;
+ if (police->tcf_action == TC_ACT_SHOT)
+ police->tcf_qstats.drops++;
+ spin_unlock(&police->tcf_lock);
+ return police->tcf_action;
+}
+
+static int
+tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_police *police = a->priv;
+ struct tc_police opt = {
+ .index = police->tcf_index,
+ .action = police->tcf_action,
+ .mtu = police->tcfp_mtu,
+ .burst = PSCHED_NS2TICKS(police->tcfp_burst),
+ .refcnt = police->tcf_refcnt - ref,
+ .bindcnt = police->tcf_bindcnt - bind,
+ };
+
+ if (police->rate_present)
+ psched_ratecfg_getrate(&opt.rate, &police->rate);
+ if (police->peak_present)
+ psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+ if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (police->tcfp_result &&
+ nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+ goto nla_put_failure;
+ if (police->tcfp_ewma_rate &&
+ nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+MODULE_AUTHOR("Alexey Kuznetsov");
+MODULE_DESCRIPTION("Policing actions");
+MODULE_LICENSE("GPL");
+
+static struct tc_action_ops act_police_ops = {
+ .kind = "police",
+ .type = TCA_ID_POLICE,
+ .owner = THIS_MODULE,
+ .act = tcf_act_police,
+ .dump = tcf_act_police_dump,
+ .init = tcf_act_police_locate,
+ .walk = tcf_act_police_walker
+};
+
+static int __init
+police_init_module(void)
+{
+ return tcf_register_action(&act_police_ops, POL_TAB_MASK);
+}
+
+static void __exit
+police_cleanup_module(void)
+{
+ tcf_unregister_action(&act_police_ops);
+}
+
+module_init(police_init_module);
+module_exit(police_cleanup_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
new file mode 100644
index 000000000..6a8d94886
--- /dev/null
+++ b/net/sched/act_simple.c
@@ -0,0 +1,192 @@
+/*
+ * net/sched/act_simple.c Simple example of an action
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim (2005-8)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define TCA_ACT_SIMP 22
+
+#include <linux/tc_act/tc_defact.h>
+#include <net/tc_act/tc_defact.h>
+
+#define SIMP_TAB_MASK 7
+
+#define SIMP_MAX_DATA 32
+static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_defact *d = a->priv;
+
+ spin_lock(&d->tcf_lock);
+ d->tcf_tm.lastuse = jiffies;
+ bstats_update(&d->tcf_bstats, skb);
+
+ /* print policy string followed by _ then packet count
+ * Example if this was the 3rd packet and the string was "hello"
+ * then it would look like "hello_3" (without quotes)
+ */
+ pr_info("simple: %s_%d\n",
+ (char *)d->tcfd_defdata, d->tcf_bstats.packets);
+ spin_unlock(&d->tcf_lock);
+ return d->tcf_action;
+}
+
+static void tcf_simp_release(struct tc_action *a, int bind)
+{
+ struct tcf_defact *d = to_defact(a);
+ kfree(d->tcfd_defdata);
+}
+
+static int alloc_defdata(struct tcf_defact *d, char *defdata)
+{
+ d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
+ if (unlikely(!d->tcfd_defdata))
+ return -ENOMEM;
+ strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ return 0;
+}
+
+static void reset_policy(struct tcf_defact *d, char *defdata,
+ struct tc_defact *p)
+{
+ spin_lock_bh(&d->tcf_lock);
+ d->tcf_action = p->action;
+ memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
+ strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+ spin_unlock_bh(&d->tcf_lock);
+}
+
+static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
+ [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
+ [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
+};
+
+static int tcf_simp_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_DEF_MAX + 1];
+ struct tc_defact *parm;
+ struct tcf_defact *d;
+ char *defdata;
+ int ret = 0, err;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_DEF_PARMS] == NULL)
+ return -EINVAL;
+
+ if (tb[TCA_DEF_DATA] == NULL)
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_DEF_PARMS]);
+ defdata = nla_data(tb[TCA_DEF_DATA]);
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ if (ret)
+ return ret;
+
+ d = to_defact(a);
+ ret = alloc_defdata(d, defdata);
+ if (ret < 0) {
+ tcf_hash_cleanup(a, est);
+ return ret;
+ }
+ d->tcf_action = parm->action;
+ ret = ACT_P_CREATED;
+ } else {
+ d = to_defact(a);
+
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+
+ reset_policy(d, defdata, parm);
+ }
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+}
+
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_defact *d = a->priv;
+ struct tc_defact opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
+ nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+ if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_simp_ops = {
+ .kind = "simple",
+ .type = TCA_ACT_SIMP,
+ .owner = THIS_MODULE,
+ .act = tcf_simp,
+ .dump = tcf_simp_dump,
+ .cleanup = tcf_simp_release,
+ .init = tcf_simp_init,
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2005)");
+MODULE_DESCRIPTION("Simple example action");
+MODULE_LICENSE("GPL");
+
+static int __init simp_init_module(void)
+{
+ int ret;
+ ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
+ if (!ret)
+ pr_info("Simple TC action Loaded\n");
+ return ret;
+}
+
+static void __exit simp_cleanup_module(void)
+{
+ tcf_unregister_action(&act_simp_ops);
+}
+
+module_init(simp_init_module);
+module_exit(simp_cleanup_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
new file mode 100644
index 000000000..fcfeeaf83
--- /dev/null
+++ b/net/sched/act_skbedit.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_skbedit.h>
+
+#define SKBEDIT_TAB_MASK 15
+
+static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_skbedit *d = a->priv;
+
+ spin_lock(&d->tcf_lock);
+ d->tcf_tm.lastuse = jiffies;
+ bstats_update(&d->tcf_bstats, skb);
+
+ if (d->flags & SKBEDIT_F_PRIORITY)
+ skb->priority = d->priority;
+ if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
+ skb->dev->real_num_tx_queues > d->queue_mapping)
+ skb_set_queue_mapping(skb, d->queue_mapping);
+ if (d->flags & SKBEDIT_F_MARK)
+ skb->mark = d->mark;
+
+ spin_unlock(&d->tcf_lock);
+ return d->tcf_action;
+}
+
+static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
+ [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
+ [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
+ [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
+};
+
+static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+ struct tc_skbedit *parm;
+ struct tcf_skbedit *d;
+ u32 flags = 0, *priority = NULL, *mark = NULL;
+ u16 *queue_mapping = NULL;
+ int ret = 0, err;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_SKBEDIT_PARMS] == NULL)
+ return -EINVAL;
+
+ if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
+ flags |= SKBEDIT_F_PRIORITY;
+ priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
+ }
+
+ if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+ flags |= SKBEDIT_F_QUEUE_MAPPING;
+ queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
+ }
+
+ if (tb[TCA_SKBEDIT_MARK] != NULL) {
+ flags |= SKBEDIT_F_MARK;
+ mark = nla_data(tb[TCA_SKBEDIT_MARK]);
+ }
+
+ if (!flags)
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
+ if (ret)
+ return ret;
+
+ d = to_skbedit(a);
+ ret = ACT_P_CREATED;
+ } else {
+ d = to_skbedit(a);
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ spin_lock_bh(&d->tcf_lock);
+
+ d->flags = flags;
+ if (flags & SKBEDIT_F_PRIORITY)
+ d->priority = *priority;
+ if (flags & SKBEDIT_F_QUEUE_MAPPING)
+ d->queue_mapping = *queue_mapping;
+ if (flags & SKBEDIT_F_MARK)
+ d->mark = *mark;
+
+ d->tcf_action = parm->action;
+
+ spin_unlock_bh(&d->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+}
+
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_skbedit *d = a->priv;
+ struct tc_skbedit opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_PRIORITY) &&
+ nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+ &d->priority))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+ nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+ sizeof(d->queue_mapping), &d->queue_mapping))
+ goto nla_put_failure;
+ if ((d->flags & SKBEDIT_F_MARK) &&
+ nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
+ &d->mark))
+ goto nla_put_failure;
+ t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+ if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_skbedit_ops = {
+ .kind = "skbedit",
+ .type = TCA_ACT_SKBEDIT,
+ .owner = THIS_MODULE,
+ .act = tcf_skbedit,
+ .dump = tcf_skbedit_dump,
+ .init = tcf_skbedit_init,
+};
+
+MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
+MODULE_DESCRIPTION("SKB Editing");
+MODULE_LICENSE("GPL");
+
+static int __init skbedit_init_module(void)
+{
+ return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
+}
+
+static void __exit skbedit_cleanup_module(void)
+{
+ tcf_unregister_action(&act_skbedit_ops);
+}
+
+module_init(skbedit_init_module);
+module_exit(skbedit_cleanup_module);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
new file mode 100644
index 000000000..d735ecf0b
--- /dev/null
+++ b/net/sched/act_vlan.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_vlan.h>
+
+#define VLAN_TAB_MASK 15
+
+static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_vlan *v = a->priv;
+ int action;
+ int err;
+
+ spin_lock(&v->tcf_lock);
+ v->tcf_tm.lastuse = jiffies;
+ bstats_update(&v->tcf_bstats, skb);
+ action = v->tcf_action;
+
+ switch (v->tcfv_action) {
+ case TCA_VLAN_ACT_POP:
+ err = skb_vlan_pop(skb);
+ if (err)
+ goto drop;
+ break;
+ case TCA_VLAN_ACT_PUSH:
+ err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid);
+ if (err)
+ goto drop;
+ break;
+ default:
+ BUG();
+ }
+
+ goto unlock;
+
+drop:
+ action = TC_ACT_SHOT;
+ v->tcf_qstats.drops++;
+unlock:
+ spin_unlock(&v->tcf_lock);
+ return action;
+}
+
+static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
+ [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) },
+ [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 },
+ [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 },
+};
+
+static int tcf_vlan_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct nlattr *tb[TCA_VLAN_MAX + 1];
+ struct tc_vlan *parm;
+ struct tcf_vlan *v;
+ int action;
+ __be16 push_vid = 0;
+ __be16 push_proto = 0;
+ int ret = 0;
+ int err;
+
+ if (!nla)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_VLAN_PARMS])
+ return -EINVAL;
+ parm = nla_data(tb[TCA_VLAN_PARMS]);
+ switch (parm->v_action) {
+ case TCA_VLAN_ACT_POP:
+ break;
+ case TCA_VLAN_ACT_PUSH:
+ if (!tb[TCA_VLAN_PUSH_VLAN_ID])
+ return -EINVAL;
+ push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
+ if (push_vid >= VLAN_VID_MASK)
+ return -ERANGE;
+
+ if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) {
+ push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]);
+ switch (push_proto) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ } else {
+ push_proto = htons(ETH_P_8021Q);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ action = parm->v_action;
+
+ if (!tcf_hash_check(parm->index, a, bind)) {
+ ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind);
+ if (ret)
+ return ret;
+
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind)
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ v = to_vlan(a);
+
+ spin_lock_bh(&v->tcf_lock);
+
+ v->tcfv_action = action;
+ v->tcfv_push_vid = push_vid;
+ v->tcfv_push_proto = push_proto;
+
+ v->tcf_action = parm->action;
+
+ spin_unlock_bh(&v->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(a);
+ return ret;
+}
+
+static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_vlan *v = a->priv;
+ struct tc_vlan opt = {
+ .index = v->tcf_index,
+ .refcnt = v->tcf_refcnt - ref,
+ .bindcnt = v->tcf_bindcnt - bind,
+ .action = v->tcf_action,
+ .v_action = v->tcfv_action,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ if (v->tcfv_action == TCA_VLAN_ACT_PUSH &&
+ (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) ||
+ nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto)))
+ goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
+ if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_vlan_ops = {
+ .kind = "vlan",
+ .type = TCA_ACT_VLAN,
+ .owner = THIS_MODULE,
+ .act = tcf_vlan,
+ .dump = tcf_vlan_dump,
+ .init = tcf_vlan_init,
+};
+
+static int __init vlan_init_module(void)
+{
+ return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK);
+}
+
+static void __exit vlan_cleanup_module(void)
+{
+ tcf_unregister_action(&act_vlan_ops);
+}
+
+module_init(vlan_init_module);
+module_exit(vlan_cleanup_module);
+
+MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>");
+MODULE_DESCRIPTION("vlan manipulation actions");
+MODULE_LICENSE("GPL v2");
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 000000000..a75864d93
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,634 @@
+/*
+ * net/sched/cls_api.c Packet classifier API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/* The list of all installed classifier types */
+static LIST_HEAD(tcf_proto_base);
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(cls_mod_lock);
+
+/* Find classifier type by string name */
+
+static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+{
+ const struct tcf_proto_ops *t, *res = NULL;
+
+ if (kind) {
+ read_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head) {
+ if (nla_strcmp(kind, t->kind) == 0) {
+ if (try_module_get(t->owner))
+ res = t;
+ break;
+ }
+ }
+ read_unlock(&cls_mod_lock);
+ }
+ return res;
+}
+
+/* Register(unregister) new classifier type */
+
+int register_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+ struct tcf_proto_ops *t;
+ int rc = -EEXIST;
+
+ write_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head)
+ if (!strcmp(ops->kind, t->kind))
+ goto out;
+
+ list_add_tail(&ops->head, &tcf_proto_base);
+ rc = 0;
+out:
+ write_unlock(&cls_mod_lock);
+ return rc;
+}
+EXPORT_SYMBOL(register_tcf_proto_ops);
+
+int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+ struct tcf_proto_ops *t;
+ int rc = -ENOENT;
+
+ /* Wait for outstanding call_rcu()s, if any, from a
+ * tcf_proto_ops's destroy() handler.
+ */
+ rcu_barrier();
+
+ write_lock(&cls_mod_lock);
+ list_for_each_entry(t, &tcf_proto_base, head) {
+ if (t == ops) {
+ list_del(&t->head);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&cls_mod_lock);
+ return rc;
+}
+EXPORT_SYMBOL(unregister_tcf_proto_ops);
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ unsigned long fh, int event);
+
+
+/* Select new prio value from the range, managed by kernel. */
+
+static inline u32 tcf_auto_prio(struct tcf_proto *tp)
+{
+ u32 first = TC_H_MAKE(0xC0000000U, 0U);
+
+ if (tp)
+ first = tp->prio - 1;
+
+ return first;
+}
+
+/* Add/change/delete/get a filter node */
+
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 protocol;
+ u32 prio;
+ u32 nprio;
+ u32 parent;
+ struct net_device *dev;
+ struct Qdisc *q;
+ struct tcf_proto __rcu **back;
+ struct tcf_proto __rcu **chain;
+ struct tcf_proto *tp;
+ const struct tcf_proto_ops *tp_ops;
+ const struct Qdisc_class_ops *cops;
+ unsigned long cl;
+ unsigned long fh;
+ int err;
+ int tp_created = 0;
+
+ if ((n->nlmsg_type != RTM_GETTFILTER) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ protocol = TC_H_MIN(t->tcm_info);
+ prio = TC_H_MAJ(t->tcm_info);
+ nprio = prio;
+ parent = t->tcm_parent;
+ cl = 0;
+
+ if (prio == 0) {
+ /* If no priority is given, user wants we allocated it. */
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
+ return -ENOENT;
+ prio = TC_H_MAKE(0x80000000U, 0U);
+ }
+
+ /* Find head of filter chain. */
+
+ /* Find link */
+ dev = __dev_get_by_index(net, t->tcm_ifindex);
+ if (dev == NULL)
+ return -ENODEV;
+
+ /* Find qdisc */
+ if (!parent) {
+ q = dev->qdisc;
+ parent = q->handle;
+ } else {
+ q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
+ if (q == NULL)
+ return -EINVAL;
+ }
+
+ /* Is it classful? */
+ cops = q->ops->cl_ops;
+ if (!cops)
+ return -EINVAL;
+
+ if (cops->tcf_chain == NULL)
+ return -EOPNOTSUPP;
+
+ /* Do we search for filter, attached to class? */
+ if (TC_H_MIN(parent)) {
+ cl = cops->get(q, parent);
+ if (cl == 0)
+ return -ENOENT;
+ }
+
+ /* And the last stroke */
+ chain = cops->tcf_chain(q, cl);
+ err = -EINVAL;
+ if (chain == NULL)
+ goto errout;
+
+ /* Check the chain for existence of proto-tcf with this priority */
+ for (back = chain;
+ (tp = rtnl_dereference(*back)) != NULL;
+ back = &tp->next) {
+ if (tp->prio >= prio) {
+ if (tp->prio == prio) {
+ if (!nprio ||
+ (tp->protocol != protocol && protocol))
+ goto errout;
+ } else
+ tp = NULL;
+ break;
+ }
+ }
+
+ if (tp == NULL) {
+ /* Proto-tcf does not exist, create new one */
+
+ if (tca[TCA_KIND] == NULL || !protocol)
+ goto errout;
+
+ err = -ENOENT;
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
+ goto errout;
+
+
+ /* Create new proto tcf */
+
+ err = -ENOBUFS;
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (tp == NULL)
+ goto errout;
+ err = -ENOENT;
+ tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
+ if (tp_ops == NULL) {
+#ifdef CONFIG_MODULES
+ struct nlattr *kind = tca[TCA_KIND];
+ char name[IFNAMSIZ];
+
+ if (kind != NULL &&
+ nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+ rtnl_unlock();
+ request_module("cls_%s", name);
+ rtnl_lock();
+ tp_ops = tcf_proto_lookup_ops(kind);
+ /* We dropped the RTNL semaphore in order to
+ * perform the module load. So, even if we
+ * succeeded in loading the module we have to
+ * replay the request. We indicate this using
+ * -EAGAIN.
+ */
+ if (tp_ops != NULL) {
+ module_put(tp_ops->owner);
+ err = -EAGAIN;
+ }
+ }
+#endif
+ kfree(tp);
+ goto errout;
+ }
+ tp->ops = tp_ops;
+ tp->protocol = protocol;
+ tp->prio = nprio ? :
+ TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
+ tp->q = q;
+ tp->classify = tp_ops->classify;
+ tp->classid = parent;
+
+ err = tp_ops->init(tp);
+ if (err != 0) {
+ module_put(tp_ops->owner);
+ kfree(tp);
+ goto errout;
+ }
+
+ tp_created = 1;
+
+ } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
+ goto errout;
+
+ fh = tp->ops->get(tp, t->tcm_handle);
+
+ if (fh == 0) {
+ if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
+ struct tcf_proto *next = rtnl_dereference(tp->next);
+
+ RCU_INIT_POINTER(*back, next);
+
+ tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ tcf_destroy(tp, true);
+ err = 0;
+ goto errout;
+ }
+
+ err = -ENOENT;
+ if (n->nlmsg_type != RTM_NEWTFILTER ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
+ goto errout;
+ } else {
+ switch (n->nlmsg_type) {
+ case RTM_NEWTFILTER:
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL) {
+ if (tp_created)
+ tcf_destroy(tp, true);
+ goto errout;
+ }
+ break;
+ case RTM_DELTFILTER:
+ err = tp->ops->delete(tp, fh);
+ if (err == 0) {
+ struct tcf_proto *next = rtnl_dereference(tp->next);
+
+ tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+ if (tcf_destroy(tp, false))
+ RCU_INIT_POINTER(*back, next);
+ }
+ goto errout;
+ case RTM_GETTFILTER:
+ err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ goto errout;
+ default:
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
+ n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
+ if (err == 0) {
+ if (tp_created) {
+ RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
+ rcu_assign_pointer(*back, tp);
+ }
+ tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+ } else {
+ if (tp_created)
+ tcf_destroy(tp, true);
+ }
+
+errout:
+ if (cl)
+ cops->put(q, cl);
+ if (err == -EAGAIN)
+ /* Replay the request. */
+ goto replay;
+ return err;
+}
+
+static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
+ unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
+ tcm->tcm_parent = tp->classid;
+ tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
+ if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+ goto nla_put_failure;
+ tcm->tcm_handle = fh;
+ if (RTM_DELTFILTER != event) {
+ tcm->tcm_handle = 0;
+ if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ goto nla_put_failure;
+ }
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ unsigned long fh, int event)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+struct tcf_dump_args {
+ struct tcf_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+};
+
+static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
+ struct tcf_walker *arg)
+{
+ struct tcf_dump_args *a = (void *)arg;
+ struct net *net = sock_net(a->skb->sk);
+
+ return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
+}
+
+/* called with RTNL */
+static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int t;
+ int s_t;
+ struct net_device *dev;
+ struct Qdisc *q;
+ struct tcf_proto *tp, __rcu **chain;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ unsigned long cl = 0;
+ const struct Qdisc_class_ops *cops;
+ struct tcf_dump_args arg;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return skb->len;
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return skb->len;
+
+ if (!tcm->tcm_parent)
+ q = dev->qdisc;
+ else
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ if (!q)
+ goto out;
+ cops = q->ops->cl_ops;
+ if (!cops)
+ goto errout;
+ if (cops->tcf_chain == NULL)
+ goto errout;
+ if (TC_H_MIN(tcm->tcm_parent)) {
+ cl = cops->get(q, tcm->tcm_parent);
+ if (cl == 0)
+ goto errout;
+ }
+ chain = cops->tcf_chain(q, cl);
+ if (chain == NULL)
+ goto errout;
+
+ s_t = cb->args[0];
+
+ for (tp = rtnl_dereference(*chain), t = 0;
+ tp; tp = rtnl_dereference(tp->next), t++) {
+ if (t < s_t)
+ continue;
+ if (TC_H_MAJ(tcm->tcm_info) &&
+ TC_H_MAJ(tcm->tcm_info) != tp->prio)
+ continue;
+ if (TC_H_MIN(tcm->tcm_info) &&
+ TC_H_MIN(tcm->tcm_info) != tp->protocol)
+ continue;
+ if (t > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ if (cb->args[1] == 0) {
+ if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWTFILTER) <= 0)
+ break;
+
+ cb->args[1] = 1;
+ }
+ if (tp->ops->walk == NULL)
+ continue;
+ arg.w.fn = tcf_node_dump;
+ arg.skb = skb;
+ arg.cb = cb;
+ arg.w.stop = 0;
+ arg.w.skip = cb->args[1] - 1;
+ arg.w.count = 0;
+ tp->ops->walk(tp, &arg.w);
+ cb->args[1] = arg.w.count + 1;
+ if (arg.w.stop)
+ break;
+ }
+
+ cb->args[0] = t;
+
+errout:
+ if (cl)
+ cops->put(q, cl);
+out:
+ return skb->len;
+}
+
+void tcf_exts_destroy(struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
+ INIT_LIST_HEAD(&exts->actions);
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_destroy);
+
+int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
+ struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ {
+ struct tc_action *act;
+
+ INIT_LIST_HEAD(&exts->actions);
+ if (exts->police && tb[exts->police]) {
+ act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
+ "police", ovr,
+ TCA_ACT_BIND);
+ if (IS_ERR(act))
+ return PTR_ERR(act);
+
+ act->type = exts->type = TCA_OLD_COMPAT;
+ list_add(&act->list, &exts->actions);
+ } else if (exts->action && tb[exts->action]) {
+ int err;
+ err = tcf_action_init(net, tb[exts->action], rate_tlv,
+ NULL, ovr,
+ TCA_ACT_BIND, &exts->actions);
+ if (err)
+ return err;
+ }
+ }
+#else
+ if ((exts->action && tb[exts->action]) ||
+ (exts->police && tb[exts->police]))
+ return -EOPNOTSUPP;
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL(tcf_exts_validate);
+
+void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
+ struct tcf_exts *src)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ LIST_HEAD(tmp);
+ tcf_tree_lock(tp);
+ list_splice_init(&dst->actions, &tmp);
+ list_splice(&src->actions, &dst->actions);
+ dst->type = src->type;
+ tcf_tree_unlock(tp);
+ tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_change);
+
+#define tcf_exts_first_act(ext) \
+ list_first_entry_or_null(&(exts)->actions, \
+ struct tc_action, list)
+
+int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct nlattr *nest;
+
+ if (exts->action && !list_empty(&exts->actions)) {
+ /*
+ * again for backward compatible mode - we want
+ * to work with both old and new modes of entering
+ * tc data even if iproute2 was newer - jhs
+ */
+ if (exts->type != TCA_OLD_COMPAT) {
+ nest = nla_nest_start(skb, exts->action);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ } else if (exts->police) {
+ struct tc_action *act = tcf_exts_first_act(exts);
+ nest = nla_nest_start(skb, exts->police);
+ if (nest == NULL || !act)
+ goto nla_put_failure;
+ if (tcf_action_dump_old(skb, act, 0, 0) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ }
+ }
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+#else
+ return 0;
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_dump);
+
+
+int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ struct tc_action *a = tcf_exts_first_act(exts);
+ if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0)
+ return -1;
+#endif
+ return 0;
+}
+EXPORT_SYMBOL(tcf_exts_dump_stats);
+
+static int __init tc_filter_init(void)
+{
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
+ tc_dump_tfilter, NULL);
+
+ return 0;
+}
+
+subsys_initcall(tc_filter_init);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
new file mode 100644
index 000000000..0b8c3ace6
--- /dev/null
+++ b/net/sched/cls_basic.c
@@ -0,0 +1,312 @@
+/*
+ * net/sched/cls_basic.c Basic Packet Classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct basic_head {
+ u32 hgenerator;
+ struct list_head flist;
+ struct rcu_head rcu;
+};
+
+struct basic_filter {
+ u32 handle;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_result res;
+ struct tcf_proto *tp;
+ struct list_head link;
+ struct rcu_head rcu;
+};
+
+static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ int r;
+ struct basic_head *head = rcu_dereference_bh(tp->root);
+ struct basic_filter *f;
+
+ list_for_each_entry_rcu(f, &head->flist, link) {
+ if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+ continue;
+ *res = f->res;
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+ return r;
+ }
+ return -1;
+}
+
+static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
+{
+ unsigned long l = 0UL;
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f;
+
+ if (head == NULL)
+ return 0UL;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (f->handle == handle) {
+ l = (unsigned long) f;
+ break;
+ }
+ }
+
+ return l;
+}
+
+static int basic_init(struct tcf_proto *tp)
+{
+ struct basic_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ INIT_LIST_HEAD(&head->flist);
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void basic_delete_filter(struct rcu_head *head)
+{
+ struct basic_filter *f = container_of(head, struct basic_filter, rcu);
+
+ tcf_exts_destroy(&f->exts);
+ tcf_em_tree_destroy(&f->ematches);
+ kfree(f);
+}
+
+static bool basic_destroy(struct tcf_proto *tp, bool force)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f, *n;
+
+ if (!force && !list_empty(&head->flist))
+ return false;
+
+ list_for_each_entry_safe(f, n, &head->flist, link) {
+ list_del_rcu(&f->link);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, basic_delete_filter);
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static int basic_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct basic_filter *f = (struct basic_filter *) arg;
+
+ list_del_rcu(&f->link);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, basic_delete_filter);
+ return 0;
+}
+
+static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
+ [TCA_BASIC_CLASSID] = { .type = NLA_U32 },
+ [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
+};
+
+static int basic_set_parms(struct net *net, struct tcf_proto *tp,
+ struct basic_filter *f, unsigned long base,
+ struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ int err;
+ struct tcf_exts e;
+ struct tcf_ematch_tree t;
+
+ tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
+ if (err < 0)
+ goto errout;
+
+ if (tb[TCA_BASIC_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ tcf_exts_change(tp, &f->exts, &e);
+ tcf_em_tree_change(tp, &f->ematches, &t);
+ f->tp = tp;
+
+ return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int basic_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
+{
+ int err;
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct nlattr *tb[TCA_BASIC_MAX + 1];
+ struct basic_filter *fold = (struct basic_filter *) *arg;
+ struct basic_filter *fnew;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
+ basic_policy);
+ if (err < 0)
+ return err;
+
+ if (fold != NULL) {
+ if (handle && fold->handle != handle)
+ return -EINVAL;
+ }
+
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ err = -EINVAL;
+ if (handle) {
+ fnew->handle = handle;
+ } else if (fold) {
+ fnew->handle = fold->handle;
+ } else {
+ unsigned int i = 0x80000000;
+ do {
+ if (++head->hgenerator == 0x7FFFFFFF)
+ head->hgenerator = 1;
+ } while (--i > 0 && basic_get(tp, head->hgenerator));
+
+ if (i <= 0) {
+ pr_err("Insufficient number of handles\n");
+ goto errout;
+ }
+
+ fnew->handle = head->hgenerator;
+ }
+
+ err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
+ if (err < 0)
+ goto errout;
+
+ *arg = (unsigned long)fnew;
+
+ if (fold) {
+ list_replace_rcu(&fold->link, &fnew->link);
+ tcf_unbind_filter(tp, &fold->res);
+ call_rcu(&fold->rcu, basic_delete_filter);
+ } else {
+ list_add_rcu(&fnew->link, &head->flist);
+ }
+
+ return 0;
+errout:
+ kfree(fnew);
+ return err;
+}
+
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct basic_head *head = rtnl_dereference(tp->root);
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+
+ if (arg->fn(tp, (unsigned long) f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct basic_filter *f = (struct basic_filter *) fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0 ||
+ tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_basic_ops __read_mostly = {
+ .kind = "basic",
+ .classify = basic_classify,
+ .init = basic_init,
+ .destroy = basic_destroy,
+ .get = basic_get,
+ .change = basic_change,
+ .delete = basic_delete,
+ .walk = basic_walk,
+ .dump = basic_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_basic(void)
+{
+ return register_tcf_proto_ops(&cls_basic_ops);
+}
+
+static void __exit exit_basic(void)
+{
+ unregister_tcf_proto_ops(&cls_basic_ops);
+}
+
+module_init(init_basic)
+module_exit(exit_basic)
+MODULE_LICENSE("GPL");
+
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
new file mode 100644
index 000000000..91bd9c194
--- /dev/null
+++ b/net/sched/cls_bpf.c
@@ -0,0 +1,495 @@
+/*
+ * Berkeley Packet Filter based traffic classifier
+ *
+ * Might be used to classify traffic through flexible, user-defined and
+ * possibly JIT-ed BPF filters for traffic control as an alternative to
+ * ematches.
+ *
+ * (C) 2013 Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
+MODULE_DESCRIPTION("TC BPF based classifier");
+
+#define CLS_BPF_NAME_LEN 256
+
+struct cls_bpf_head {
+ struct list_head plist;
+ u32 hgen;
+ struct rcu_head rcu;
+};
+
+struct cls_bpf_prog {
+ struct bpf_prog *filter;
+ struct list_head link;
+ struct tcf_result res;
+ struct tcf_exts exts;
+ u32 handle;
+ union {
+ u32 bpf_fd;
+ u16 bpf_num_ops;
+ };
+ struct sock_filter *bpf_ops;
+ const char *bpf_name;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
+};
+
+static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
+ [TCA_BPF_CLASSID] = { .type = NLA_U32 },
+ [TCA_BPF_FD] = { .type = NLA_U32 },
+ [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN },
+ [TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
+ [TCA_BPF_OPS] = { .type = NLA_BINARY,
+ .len = sizeof(struct sock_filter) * BPF_MAXINSNS },
+};
+
+static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
+ struct cls_bpf_prog *prog;
+ int ret = -1;
+
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return -1;
+
+ /* Needed here for accessing maps. */
+ rcu_read_lock();
+ list_for_each_entry_rcu(prog, &head->plist, link) {
+ int filter_res = BPF_PROG_RUN(prog->filter, skb);
+
+ if (filter_res == 0)
+ continue;
+
+ *res = prog->res;
+ if (filter_res != -1)
+ res->classid = filter_res;
+
+ ret = tcf_exts_exec(skb, &prog->exts, res);
+ if (ret < 0)
+ continue;
+
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
+{
+ return !prog->bpf_ops;
+}
+
+static int cls_bpf_init(struct tcf_proto *tp)
+{
+ struct cls_bpf_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ INIT_LIST_HEAD_RCU(&head->plist);
+ rcu_assign_pointer(tp->root, head);
+
+ return 0;
+}
+
+static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+{
+ tcf_exts_destroy(&prog->exts);
+
+ if (cls_bpf_is_ebpf(prog))
+ bpf_prog_put(prog->filter);
+ else
+ bpf_prog_destroy(prog->filter);
+
+ kfree(prog->bpf_name);
+ kfree(prog->bpf_ops);
+ kfree(prog);
+}
+
+static void __cls_bpf_delete_prog(struct rcu_head *rcu)
+{
+ struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
+
+ cls_bpf_delete_prog(prog->tp, prog);
+}
+
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
+
+ list_del_rcu(&prog->link);
+ tcf_unbind_filter(tp, &prog->res);
+ call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+
+ return 0;
+}
+
+static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog, *tmp;
+
+ if (!force && !list_empty(&head->plist))
+ return false;
+
+ list_for_each_entry_safe(prog, tmp, &head->plist, link) {
+ list_del_rcu(&prog->link);
+ tcf_unbind_filter(tp, &prog->res);
+ call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+ }
+
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog;
+ unsigned long ret = 0UL;
+
+ if (head == NULL)
+ return 0UL;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (prog->handle == handle) {
+ ret = (unsigned long) prog;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int cls_bpf_prog_from_ops(struct nlattr **tb,
+ struct cls_bpf_prog *prog, u32 classid)
+{
+ struct sock_filter *bpf_ops;
+ struct sock_fprog_kern fprog_tmp;
+ struct bpf_prog *fp;
+ u16 bpf_size, bpf_num_ops;
+ int ret;
+
+ bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
+ if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0)
+ return -EINVAL;
+
+ bpf_size = bpf_num_ops * sizeof(*bpf_ops);
+ if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
+ return -EINVAL;
+
+ bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ if (bpf_ops == NULL)
+ return -ENOMEM;
+
+ memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
+
+ fprog_tmp.len = bpf_num_ops;
+ fprog_tmp.filter = bpf_ops;
+
+ ret = bpf_prog_create(&fp, &fprog_tmp);
+ if (ret < 0) {
+ kfree(bpf_ops);
+ return ret;
+ }
+
+ prog->bpf_ops = bpf_ops;
+ prog->bpf_num_ops = bpf_num_ops;
+ prog->bpf_name = NULL;
+
+ prog->filter = fp;
+ prog->res.classid = classid;
+
+ return 0;
+}
+
+static int cls_bpf_prog_from_efd(struct nlattr **tb,
+ struct cls_bpf_prog *prog, u32 classid)
+{
+ struct bpf_prog *fp;
+ char *name = NULL;
+ u32 bpf_fd;
+
+ bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
+
+ fp = bpf_prog_get(bpf_fd);
+ if (IS_ERR(fp))
+ return PTR_ERR(fp);
+
+ if (fp->type != BPF_PROG_TYPE_SCHED_CLS) {
+ bpf_prog_put(fp);
+ return -EINVAL;
+ }
+
+ if (tb[TCA_BPF_NAME]) {
+ name = kmemdup(nla_data(tb[TCA_BPF_NAME]),
+ nla_len(tb[TCA_BPF_NAME]),
+ GFP_KERNEL);
+ if (!name) {
+ bpf_prog_put(fp);
+ return -ENOMEM;
+ }
+ }
+
+ prog->bpf_ops = NULL;
+ prog->bpf_fd = bpf_fd;
+ prog->bpf_name = name;
+
+ prog->filter = fp;
+ prog->res.classid = classid;
+
+ return 0;
+}
+
+static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
+ struct cls_bpf_prog *prog,
+ unsigned long base, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ struct tcf_exts exts;
+ bool is_bpf, is_ebpf;
+ u32 classid;
+ int ret;
+
+ is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS];
+ is_ebpf = tb[TCA_BPF_FD];
+
+ if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) ||
+ !tb[TCA_BPF_CLASSID])
+ return -EINVAL;
+
+ tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+ ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
+ if (ret < 0)
+ return ret;
+
+ classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
+
+ ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) :
+ cls_bpf_prog_from_efd(tb, prog, classid);
+ if (ret < 0) {
+ tcf_exts_destroy(&exts);
+ return ret;
+ }
+
+ tcf_bind_filter(tp, &prog->res, base);
+ tcf_exts_change(tp, &prog->exts, &exts);
+
+ return 0;
+}
+
+static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
+ struct cls_bpf_head *head)
+{
+ unsigned int i = 0x80000000;
+ u32 handle;
+
+ do {
+ if (++head->hgen == 0x7FFFFFFF)
+ head->hgen = 1;
+ } while (--i > 0 && cls_bpf_get(tp, head->hgen));
+
+ if (unlikely(i == 0)) {
+ pr_err("Insufficient number of handles\n");
+ handle = 0;
+ } else {
+ handle = head->hgen;
+ }
+
+ return handle;
+}
+
+static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
+ struct nlattr *tb[TCA_BPF_MAX + 1];
+ struct cls_bpf_prog *prog;
+ int ret;
+
+ if (tca[TCA_OPTIONS] == NULL)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
+ if (ret < 0)
+ return ret;
+
+ prog = kzalloc(sizeof(*prog), GFP_KERNEL);
+ if (!prog)
+ return -ENOBUFS;
+
+ tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+
+ if (oldprog) {
+ if (handle && oldprog->handle != handle) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
+
+ if (handle == 0)
+ prog->handle = cls_bpf_grab_new_handle(tp, head);
+ else
+ prog->handle = handle;
+ if (prog->handle == 0) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
+ if (ret < 0)
+ goto errout;
+
+ if (oldprog) {
+ list_replace_rcu(&prog->link, &oldprog->link);
+ tcf_unbind_filter(tp, &oldprog->res);
+ call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+ } else {
+ list_add_rcu(&prog->link, &head->plist);
+ }
+
+ *arg = (unsigned long) prog;
+ return 0;
+errout:
+ kfree(prog);
+
+ return ret;
+}
+
+static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ struct nlattr *nla;
+
+ if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops))
+ return -EMSGSIZE;
+
+ nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops *
+ sizeof(struct sock_filter));
+ if (nla == NULL)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
+
+ return 0;
+}
+
+static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd))
+ return -EMSGSIZE;
+
+ if (prog->bpf_name &&
+ nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *tm)
+{
+ struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
+ struct nlattr *nest;
+ int ret;
+
+ if (prog == NULL)
+ return skb->len;
+
+ tm->tcm_handle = prog->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
+ goto nla_put_failure;
+
+ if (cls_bpf_is_ebpf(prog))
+ ret = cls_bpf_dump_ebpf_info(prog, skb);
+ else
+ ret = cls_bpf_dump_bpf_info(prog, skb);
+ if (ret)
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct cls_bpf_prog *prog;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
+ .kind = "bpf",
+ .owner = THIS_MODULE,
+ .classify = cls_bpf_classify,
+ .init = cls_bpf_init,
+ .destroy = cls_bpf_destroy,
+ .get = cls_bpf_get,
+ .change = cls_bpf_change,
+ .delete = cls_bpf_delete,
+ .walk = cls_bpf_walk,
+ .dump = cls_bpf_dump,
+};
+
+static int __init cls_bpf_init_mod(void)
+{
+ return register_tcf_proto_ops(&cls_bpf_ops);
+}
+
+static void __exit cls_bpf_exit_mod(void)
+{
+ unregister_tcf_proto_ops(&cls_bpf_ops);
+}
+
+module_init(cls_bpf_init_mod);
+module_exit(cls_bpf_exit_mod);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
new file mode 100644
index 000000000..ea611b216
--- /dev/null
+++ b/net/sched/cls_cgroup.c
@@ -0,0 +1,233 @@
+/*
+ * net/sched/cls_cgroup.c Control Group Classifier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rcupdate.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/cls_cgroup.h>
+
+struct cls_cgroup_head {
+ u32 handle;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
+};
+
+static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
+ u32 classid;
+
+ classid = task_cls_state(current)->classid;
+
+ /*
+ * Due to the nature of the classifier it is required to ignore all
+ * packets originating from softirq context as accessing `current'
+ * would lead to false results.
+ *
+ * This test assumes that all callers of dev_queue_xmit() explicitely
+ * disable bh. Knowing this, it is possible to detect softirq based
+ * calls by looking at the number of nested bh disable calls because
+ * softirqs always disables bh.
+ */
+ if (in_serving_softirq()) {
+ /* If there is an sk_classid we'll use that. */
+ if (!skb->sk)
+ return -1;
+ classid = skb->sk->sk_classid;
+ }
+
+ if (!classid)
+ return -1;
+
+ if (!tcf_em_tree_match(skb, &head->ematches, NULL))
+ return -1;
+
+ res->classid = classid;
+ res->class = 0;
+ return tcf_exts_exec(skb, &head->exts, res);
+}
+
+static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
+{
+ return 0UL;
+}
+
+static int cls_cgroup_init(struct tcf_proto *tp)
+{
+ return 0;
+}
+
+static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
+ [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
+};
+
+static void cls_cgroup_destroy_rcu(struct rcu_head *root)
+{
+ struct cls_cgroup_head *head = container_of(root,
+ struct cls_cgroup_head,
+ rcu);
+
+ tcf_exts_destroy(&head->exts);
+ tcf_em_tree_destroy(&head->ematches);
+ kfree(head);
+}
+
+static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct nlattr *tb[TCA_CGROUP_MAX + 1];
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+ struct cls_cgroup_head *new;
+ struct tcf_ematch_tree t;
+ struct tcf_exts e;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return -EINVAL;
+
+ if (!head && !handle)
+ return -EINVAL;
+
+ if (head && handle != head->handle)
+ return -ENOENT;
+
+ new = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!new)
+ return -ENOBUFS;
+
+ tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ new->handle = handle;
+ new->tp = tp;
+ err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
+ cgroup_policy);
+ if (err < 0)
+ goto errout;
+
+ tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ goto errout;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
+ if (err < 0) {
+ tcf_exts_destroy(&e);
+ goto errout;
+ }
+
+ tcf_exts_change(tp, &new->exts, &e);
+ tcf_em_tree_change(tp, &new->ematches, &t);
+
+ rcu_assign_pointer(tp->root, new);
+ if (head)
+ call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+ return 0;
+errout:
+ kfree(new);
+ return err;
+}
+
+static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+
+ if (!force)
+ return false;
+
+ if (head) {
+ RCU_INIT_POINTER(tp->root, NULL);
+ call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+ }
+ return true;
+}
+
+static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+
+ if (arg->count < arg->skip)
+ goto skip;
+
+ if (arg->fn(tp, (unsigned long) head, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+skip:
+ arg->count++;
+}
+
+static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+ struct nlattr *nest;
+
+ t->tcm_handle = head->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &head->exts) < 0 ||
+ tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &head->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
+ .kind = "cgroup",
+ .init = cls_cgroup_init,
+ .change = cls_cgroup_change,
+ .classify = cls_cgroup_classify,
+ .destroy = cls_cgroup_destroy,
+ .get = cls_cgroup_get,
+ .delete = cls_cgroup_delete,
+ .walk = cls_cgroup_walk,
+ .dump = cls_cgroup_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_cgroup_cls(void)
+{
+ return register_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+static void __exit exit_cgroup_cls(void)
+{
+ unregister_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+module_init(init_cgroup_cls);
+module_exit(exit_cgroup_cls);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
new file mode 100644
index 000000000..a620c4e28
--- /dev/null
+++ b/net/sched/cls_flow.c
@@ -0,0 +1,694 @@
+/*
+ * net/sched/cls_flow.c Generic flow classifier
+ *
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/pkt_cls.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/flow_keys.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+struct flow_head {
+ struct list_head filters;
+ struct rcu_head rcu;
+};
+
+struct flow_filter {
+ struct list_head list;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_proto *tp;
+ struct timer_list perturb_timer;
+ u32 perturb_period;
+ u32 handle;
+
+ u32 nkeys;
+ u32 keymask;
+ u32 mode;
+ u32 mask;
+ u32 xor;
+ u32 rshift;
+ u32 addend;
+ u32 divisor;
+ u32 baseclass;
+ u32 hashrnd;
+ struct rcu_head rcu;
+};
+
+static inline u32 addr_fold(void *addr)
+{
+ unsigned long a = (unsigned long)addr;
+
+ return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
+}
+
+static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ if (flow->src)
+ return ntohl(flow->src);
+ return addr_fold(skb->sk);
+}
+
+static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ if (flow->dst)
+ return ntohl(flow->dst);
+ return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
+}
+
+static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ return flow->ip_proto;
+}
+
+static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ if (flow->ports)
+ return ntohs(flow->port16[0]);
+
+ return addr_fold(skb->sk);
+}
+
+static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ if (flow->ports)
+ return ntohs(flow->port16[1]);
+
+ return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
+}
+
+static u32 flow_get_iif(const struct sk_buff *skb)
+{
+ return skb->skb_iif;
+}
+
+static u32 flow_get_priority(const struct sk_buff *skb)
+{
+ return skb->priority;
+}
+
+static u32 flow_get_mark(const struct sk_buff *skb)
+{
+ return skb->mark;
+}
+
+static u32 flow_get_nfct(const struct sk_buff *skb)
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ return addr_fold(skb->nfct);
+#else
+ return 0;
+#endif
+}
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#define CTTUPLE(skb, member) \
+({ \
+ enum ip_conntrack_info ctinfo; \
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
+ if (ct == NULL) \
+ goto fallback; \
+ ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
+})
+#else
+#define CTTUPLE(skb, member) \
+({ \
+ goto fallback; \
+ 0; \
+})
+#endif
+
+static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ return ntohl(CTTUPLE(skb, src.u3.ip));
+ case htons(ETH_P_IPV6):
+ return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
+ }
+fallback:
+ return flow_get_src(skb, flow);
+}
+
+static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ return ntohl(CTTUPLE(skb, dst.u3.ip));
+ case htons(ETH_P_IPV6):
+ return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
+ }
+fallback:
+ return flow_get_dst(skb, flow);
+}
+
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ return ntohs(CTTUPLE(skb, src.u.all));
+fallback:
+ return flow_get_proto_src(skb, flow);
+}
+
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
+{
+ return ntohs(CTTUPLE(skb, dst.u.all));
+fallback:
+ return flow_get_proto_dst(skb, flow);
+}
+
+static u32 flow_get_rtclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (skb_dst(skb))
+ return skb_dst(skb)->tclassid;
+#endif
+ return 0;
+}
+
+static u32 flow_get_skuid(const struct sk_buff *skb)
+{
+ if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+ kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
+ return from_kuid(&init_user_ns, skuid);
+ }
+ return 0;
+}
+
+static u32 flow_get_skgid(const struct sk_buff *skb)
+{
+ if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
+ kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
+ return from_kgid(&init_user_ns, skgid);
+ }
+ return 0;
+}
+
+static u32 flow_get_vlan_tag(const struct sk_buff *skb)
+{
+ u16 uninitialized_var(tag);
+
+ if (vlan_get_tag(skb, &tag) < 0)
+ return 0;
+ return tag & VLAN_VID_MASK;
+}
+
+static u32 flow_get_rxhash(struct sk_buff *skb)
+{
+ return skb_get_hash(skb);
+}
+
+static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
+{
+ switch (key) {
+ case FLOW_KEY_SRC:
+ return flow_get_src(skb, flow);
+ case FLOW_KEY_DST:
+ return flow_get_dst(skb, flow);
+ case FLOW_KEY_PROTO:
+ return flow_get_proto(skb, flow);
+ case FLOW_KEY_PROTO_SRC:
+ return flow_get_proto_src(skb, flow);
+ case FLOW_KEY_PROTO_DST:
+ return flow_get_proto_dst(skb, flow);
+ case FLOW_KEY_IIF:
+ return flow_get_iif(skb);
+ case FLOW_KEY_PRIORITY:
+ return flow_get_priority(skb);
+ case FLOW_KEY_MARK:
+ return flow_get_mark(skb);
+ case FLOW_KEY_NFCT:
+ return flow_get_nfct(skb);
+ case FLOW_KEY_NFCT_SRC:
+ return flow_get_nfct_src(skb, flow);
+ case FLOW_KEY_NFCT_DST:
+ return flow_get_nfct_dst(skb, flow);
+ case FLOW_KEY_NFCT_PROTO_SRC:
+ return flow_get_nfct_proto_src(skb, flow);
+ case FLOW_KEY_NFCT_PROTO_DST:
+ return flow_get_nfct_proto_dst(skb, flow);
+ case FLOW_KEY_RTCLASSID:
+ return flow_get_rtclassid(skb);
+ case FLOW_KEY_SKUID:
+ return flow_get_skuid(skb);
+ case FLOW_KEY_SKGID:
+ return flow_get_skgid(skb);
+ case FLOW_KEY_VLAN_TAG:
+ return flow_get_vlan_tag(skb);
+ case FLOW_KEY_RXHASH:
+ return flow_get_rxhash(skb);
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+}
+
+#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
+ (1 << FLOW_KEY_DST) | \
+ (1 << FLOW_KEY_PROTO) | \
+ (1 << FLOW_KEY_PROTO_SRC) | \
+ (1 << FLOW_KEY_PROTO_DST) | \
+ (1 << FLOW_KEY_NFCT_SRC) | \
+ (1 << FLOW_KEY_NFCT_DST) | \
+ (1 << FLOW_KEY_NFCT_PROTO_SRC) | \
+ (1 << FLOW_KEY_NFCT_PROTO_DST))
+
+static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct flow_head *head = rcu_dereference_bh(tp->root);
+ struct flow_filter *f;
+ u32 keymask;
+ u32 classid;
+ unsigned int n, key;
+ int r;
+
+ list_for_each_entry_rcu(f, &head->filters, list) {
+ u32 keys[FLOW_KEY_MAX + 1];
+ struct flow_keys flow_keys;
+
+ if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+ continue;
+
+ keymask = f->keymask;
+ if (keymask & FLOW_KEYS_NEEDED)
+ skb_flow_dissect(skb, &flow_keys);
+
+ for (n = 0; n < f->nkeys; n++) {
+ key = ffs(keymask) - 1;
+ keymask &= ~(1 << key);
+ keys[n] = flow_key_get(skb, key, &flow_keys);
+ }
+
+ if (f->mode == FLOW_MODE_HASH)
+ classid = jhash2(keys, f->nkeys, f->hashrnd);
+ else {
+ classid = keys[0];
+ classid = (classid & f->mask) ^ f->xor;
+ classid = (classid >> f->rshift) + f->addend;
+ }
+
+ if (f->divisor)
+ classid %= f->divisor;
+
+ res->class = 0;
+ res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
+
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+ return r;
+ }
+ return -1;
+}
+
+static void flow_perturbation(unsigned long arg)
+{
+ struct flow_filter *f = (struct flow_filter *)arg;
+
+ get_random_bytes(&f->hashrnd, 4);
+ if (f->perturb_period)
+ mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
+}
+
+static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
+ [TCA_FLOW_KEYS] = { .type = NLA_U32 },
+ [TCA_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
+ [TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
+ [TCA_FLOW_ADDEND] = { .type = NLA_U32 },
+ [TCA_FLOW_MASK] = { .type = NLA_U32 },
+ [TCA_FLOW_XOR] = { .type = NLA_U32 },
+ [TCA_FLOW_DIVISOR] = { .type = NLA_U32 },
+ [TCA_FLOW_ACT] = { .type = NLA_NESTED },
+ [TCA_FLOW_POLICE] = { .type = NLA_NESTED },
+ [TCA_FLOW_EMATCHES] = { .type = NLA_NESTED },
+ [TCA_FLOW_PERTURB] = { .type = NLA_U32 },
+};
+
+static void flow_destroy_filter(struct rcu_head *head)
+{
+ struct flow_filter *f = container_of(head, struct flow_filter, rcu);
+
+ del_timer_sync(&f->perturb_timer);
+ tcf_exts_destroy(&f->exts);
+ tcf_em_tree_destroy(&f->ematches);
+ kfree(f);
+}
+
+static int flow_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *fold, *fnew;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_FLOW_MAX + 1];
+ struct tcf_exts e;
+ struct tcf_ematch_tree t;
+ unsigned int nkeys = 0;
+ unsigned int perturb_period = 0;
+ u32 baseclass = 0;
+ u32 keymask = 0;
+ u32 mode;
+ int err;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FLOW_BASECLASS]) {
+ baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
+ if (TC_H_MIN(baseclass) == 0)
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOW_KEYS]) {
+ keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
+
+ nkeys = hweight32(keymask);
+ if (nkeys == 0)
+ return -EINVAL;
+
+ if (fls(keymask) - 1 > FLOW_KEY_MAX)
+ return -EOPNOTSUPP;
+
+ if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
+ sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
+ return -EOPNOTSUPP;
+ }
+
+ tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ return err;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
+ if (err < 0)
+ goto err1;
+
+ err = -ENOBUFS;
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ goto err2;
+
+ fold = (struct flow_filter *)*arg;
+ if (fold) {
+ err = -EINVAL;
+ if (fold->handle != handle && handle)
+ goto err2;
+
+ /* Copy fold into fnew */
+ fnew->tp = fold->tp;
+ fnew->handle = fold->handle;
+ fnew->nkeys = fold->nkeys;
+ fnew->keymask = fold->keymask;
+ fnew->mode = fold->mode;
+ fnew->mask = fold->mask;
+ fnew->xor = fold->xor;
+ fnew->rshift = fold->rshift;
+ fnew->addend = fold->addend;
+ fnew->divisor = fold->divisor;
+ fnew->baseclass = fold->baseclass;
+ fnew->hashrnd = fold->hashrnd;
+
+ mode = fold->mode;
+ if (tb[TCA_FLOW_MODE])
+ mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+ if (mode != FLOW_MODE_HASH && nkeys > 1)
+ goto err2;
+
+ if (mode == FLOW_MODE_HASH)
+ perturb_period = fold->perturb_period;
+ if (tb[TCA_FLOW_PERTURB]) {
+ if (mode != FLOW_MODE_HASH)
+ goto err2;
+ perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+ }
+ } else {
+ err = -EINVAL;
+ if (!handle)
+ goto err2;
+ if (!tb[TCA_FLOW_KEYS])
+ goto err2;
+
+ mode = FLOW_MODE_MAP;
+ if (tb[TCA_FLOW_MODE])
+ mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+ if (mode != FLOW_MODE_HASH && nkeys > 1)
+ goto err2;
+
+ if (tb[TCA_FLOW_PERTURB]) {
+ if (mode != FLOW_MODE_HASH)
+ goto err2;
+ perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+ }
+
+ if (TC_H_MAJ(baseclass) == 0)
+ baseclass = TC_H_MAKE(tp->q->handle, baseclass);
+ if (TC_H_MIN(baseclass) == 0)
+ baseclass = TC_H_MAKE(baseclass, 1);
+
+ fnew->handle = handle;
+ fnew->mask = ~0U;
+ fnew->tp = tp;
+ get_random_bytes(&fnew->hashrnd, 4);
+ tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+ }
+
+ fnew->perturb_timer.function = flow_perturbation;
+ fnew->perturb_timer.data = (unsigned long)fnew;
+ init_timer_deferrable(&fnew->perturb_timer);
+
+ tcf_exts_change(tp, &fnew->exts, &e);
+ tcf_em_tree_change(tp, &fnew->ematches, &t);
+
+ netif_keep_dst(qdisc_dev(tp->q));
+
+ if (tb[TCA_FLOW_KEYS]) {
+ fnew->keymask = keymask;
+ fnew->nkeys = nkeys;
+ }
+
+ fnew->mode = mode;
+
+ if (tb[TCA_FLOW_MASK])
+ fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+ if (tb[TCA_FLOW_XOR])
+ fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+ if (tb[TCA_FLOW_RSHIFT])
+ fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+ if (tb[TCA_FLOW_ADDEND])
+ fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+
+ if (tb[TCA_FLOW_DIVISOR])
+ fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+ if (baseclass)
+ fnew->baseclass = baseclass;
+
+ fnew->perturb_period = perturb_period;
+ if (perturb_period)
+ mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
+
+ if (*arg == 0)
+ list_add_tail_rcu(&fnew->list, &head->filters);
+ else
+ list_replace_rcu(&fnew->list, &fold->list);
+
+ *arg = (unsigned long)fnew;
+
+ if (fold)
+ call_rcu(&fold->rcu, flow_destroy_filter);
+ return 0;
+
+err2:
+ tcf_em_tree_destroy(&t);
+ kfree(fnew);
+err1:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int flow_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct flow_filter *f = (struct flow_filter *)arg;
+
+ list_del_rcu(&f->list);
+ call_rcu(&f->rcu, flow_destroy_filter);
+ return 0;
+}
+
+static int flow_init(struct tcf_proto *tp)
+{
+ struct flow_head *head;
+
+ head = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+ INIT_LIST_HEAD(&head->filters);
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static bool flow_destroy(struct tcf_proto *tp, bool force)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f, *next;
+
+ if (!force && !list_empty(&head->filters))
+ return false;
+
+ list_for_each_entry_safe(f, next, &head->filters, list) {
+ list_del_rcu(&f->list);
+ call_rcu(&f->rcu, flow_destroy_filter);
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f;
+
+ list_for_each_entry(f, &head->filters, list)
+ if (f->handle == handle)
+ return (unsigned long)f;
+ return 0;
+}
+
+static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct flow_filter *f = (struct flow_filter *)fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
+ nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
+ goto nla_put_failure;
+
+ if (f->mask != ~0 || f->xor != 0) {
+ if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
+ nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
+ goto nla_put_failure;
+ }
+ if (f->rshift &&
+ nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
+ goto nla_put_failure;
+ if (f->addend &&
+ nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
+ goto nla_put_failure;
+
+ if (f->divisor &&
+ nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
+ goto nla_put_failure;
+ if (f->baseclass &&
+ nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
+ goto nla_put_failure;
+
+ if (f->perturb_period &&
+ nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+#ifdef CONFIG_NET_EMATCH
+ if (f->ematches.hdr.nmatches &&
+ tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
+ goto nla_put_failure;
+#endif
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *f;
+
+ list_for_each_entry(f, &head->filters, list) {
+ if (arg->count < arg->skip)
+ goto skip;
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static struct tcf_proto_ops cls_flow_ops __read_mostly = {
+ .kind = "flow",
+ .classify = flow_classify,
+ .init = flow_init,
+ .destroy = flow_destroy,
+ .change = flow_change,
+ .delete = flow_delete,
+ .get = flow_get,
+ .dump = flow_dump,
+ .walk = flow_walk,
+ .owner = THIS_MODULE,
+};
+
+static int __init cls_flow_init(void)
+{
+ return register_tcf_proto_ops(&cls_flow_ops);
+}
+
+static void __exit cls_flow_exit(void)
+{
+ unregister_tcf_proto_ops(&cls_flow_ops);
+}
+
+module_init(cls_flow_init);
+module_exit(cls_flow_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("TC flow classifier");
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
new file mode 100644
index 000000000..715e01e59
--- /dev/null
+++ b/net/sched/cls_fw.c
@@ -0,0 +1,438 @@
+/*
+ * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
+ * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
+ *
+ * JHS: We should remove the CONFIG_NET_CLS_IND from here
+ * eventually when the meta match extension is made available
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+#define HTSIZE 256
+
+struct fw_head {
+ u32 mask;
+ bool mask_set;
+ struct fw_filter __rcu *ht[HTSIZE];
+ struct rcu_head rcu;
+};
+
+struct fw_filter {
+ struct fw_filter __rcu *next;
+ u32 id;
+ struct tcf_result res;
+#ifdef CONFIG_NET_CLS_IND
+ int ifindex;
+#endif /* CONFIG_NET_CLS_IND */
+ struct tcf_exts exts;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
+};
+
+static u32 fw_hash(u32 handle)
+{
+ handle ^= (handle >> 16);
+ handle ^= (handle >> 8);
+ return handle % HTSIZE;
+}
+
+static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct fw_head *head = rcu_dereference_bh(tp->root);
+ struct fw_filter *f;
+ int r;
+ u32 id = skb->mark;
+
+ if (head != NULL) {
+ id &= head->mask;
+
+ for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
+ f = rcu_dereference_bh(f->next)) {
+ if (f->id == id) {
+ *res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+ if (!tcf_match_indev(skb, f->ifindex))
+ continue;
+#endif /* CONFIG_NET_CLS_IND */
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+
+ return r;
+ }
+ }
+ } else {
+ /* old method */
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id ^ tp->q->handle)))) {
+ res->classid = id;
+ res->class = 0;
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f;
+
+ if (head == NULL)
+ return 0;
+
+ f = rtnl_dereference(head->ht[fw_hash(handle)]);
+ for (; f; f = rtnl_dereference(f->next)) {
+ if (f->id == handle)
+ return (unsigned long)f;
+ }
+ return 0;
+}
+
+static int fw_init(struct tcf_proto *tp)
+{
+ struct fw_head *head;
+
+ head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ head->mask_set = false;
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void fw_delete_filter(struct rcu_head *head)
+{
+ struct fw_filter *f = container_of(head, struct fw_filter, rcu);
+
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+}
+
+static bool fw_destroy(struct tcf_proto *tp, bool force)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f;
+ int h;
+
+ if (head == NULL)
+ return true;
+
+ if (!force) {
+ for (h = 0; h < HTSIZE; h++)
+ if (rcu_access_pointer(head->ht[h]))
+ return false;
+ }
+
+ for (h = 0; h < HTSIZE; h++) {
+ while ((f = rtnl_dereference(head->ht[h])) != NULL) {
+ RCU_INIT_POINTER(head->ht[h],
+ rtnl_dereference(f->next));
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fw_delete_filter);
+ }
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static int fw_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = (struct fw_filter *)arg;
+ struct fw_filter __rcu **fp;
+ struct fw_filter *pfp;
+
+ if (head == NULL || f == NULL)
+ goto out;
+
+ fp = &head->ht[fw_hash(f->id)];
+
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == f) {
+ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fw_delete_filter);
+ return 0;
+ }
+ }
+out:
+ return -EINVAL;
+}
+
+static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
+ [TCA_FW_CLASSID] = { .type = NLA_U32 },
+ [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_FW_MASK] = { .type = NLA_U32 },
+};
+
+static int
+fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
+ struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct tcf_exts e;
+ u32 mask;
+ int err;
+
+ tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_FW_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_FW_INDEV]) {
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
+ if (ret < 0) {
+ err = ret;
+ goto errout;
+ }
+ f->ifindex = ret;
+ }
+#endif /* CONFIG_NET_CLS_IND */
+
+ err = -EINVAL;
+ if (tb[TCA_FW_MASK]) {
+ mask = nla_get_u32(tb[TCA_FW_MASK]);
+ if (mask != head->mask)
+ goto errout;
+ } else if (head->mask != 0xFFFFFFFF)
+ goto errout;
+
+ tcf_exts_change(tp, &f->exts, &e);
+
+ return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int fw_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = (struct fw_filter *) *arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_FW_MAX + 1];
+ int err;
+
+ if (!opt)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);
+ if (err < 0)
+ return err;
+
+ if (f) {
+ struct fw_filter *pfp, *fnew;
+ struct fw_filter __rcu **fp;
+
+ if (f->id != handle && handle)
+ return -EINVAL;
+
+ fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ fnew->id = f->id;
+ fnew->res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+ fnew->ifindex = f->ifindex;
+#endif /* CONFIG_NET_CLS_IND */
+ fnew->tp = f->tp;
+
+ tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
+
+ err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr);
+ if (err < 0) {
+ kfree(fnew);
+ return err;
+ }
+
+ fp = &head->ht[fw_hash(fnew->id)];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp))
+ if (pfp == f)
+ break;
+
+ RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next));
+ rcu_assign_pointer(*fp, fnew);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, fw_delete_filter);
+
+ *arg = (unsigned long)fnew;
+ return err;
+ }
+
+ if (!handle)
+ return -EINVAL;
+
+ if (!head->mask_set) {
+ head->mask = 0xFFFFFFFF;
+ if (tb[TCA_FW_MASK])
+ head->mask = nla_get_u32(tb[TCA_FW_MASK]);
+ head->mask_set = true;
+ }
+
+ f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (f == NULL)
+ return -ENOBUFS;
+
+ tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
+ f->id = handle;
+ f->tp = tp;
+
+ err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);
+ if (err < 0)
+ goto errout;
+
+ RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]);
+ rcu_assign_pointer(head->ht[fw_hash(handle)], f);
+
+ *arg = (unsigned long)f;
+ return 0;
+
+errout:
+ kfree(f);
+ return err;
+}
+
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ int h;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < HTSIZE; h++) {
+ struct fw_filter *f;
+
+ for (f = rtnl_dereference(head->ht[h]); f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct fw_head *head = rtnl_dereference(tp->root);
+ struct fw_filter *f = (struct fw_filter *)fh;
+ struct nlattr *nest;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->id;
+
+ if (!f->res.classid && !tcf_exts_is_available(&f->exts))
+ return skb->len;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
+ goto nla_put_failure;
+#ifdef CONFIG_NET_CLS_IND
+ if (f->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, f->ifindex);
+ if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+#endif /* CONFIG_NET_CLS_IND */
+ if (head->mask != 0xFFFFFFFF &&
+ nla_put_u32(skb, TCA_FW_MASK, head->mask))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_fw_ops __read_mostly = {
+ .kind = "fw",
+ .classify = fw_classify,
+ .init = fw_init,
+ .destroy = fw_destroy,
+ .get = fw_get,
+ .change = fw_change,
+ .delete = fw_delete,
+ .walk = fw_walk,
+ .dump = fw_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_fw(void)
+{
+ return register_tcf_proto_ops(&cls_fw_ops);
+}
+
+static void __exit exit_fw(void)
+{
+ unregister_tcf_proto_ops(&cls_fw_ops);
+}
+
+module_init(init_fw)
+module_exit(exit_fw)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
new file mode 100644
index 000000000..08a3b0a6f
--- /dev/null
+++ b/net/sched/cls_route.c
@@ -0,0 +1,674 @@
+/*
+ * net/sched/cls_route.c ROUTE4 classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+/*
+ * 1. For now we assume that route tags < 256.
+ * It allows to use direct table lookups, instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ * are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
+ */
+struct route4_fastmap {
+ struct route4_filter *filter;
+ u32 id;
+ int iif;
+};
+
+struct route4_head {
+ struct route4_fastmap fastmap[16];
+ struct route4_bucket __rcu *table[256 + 1];
+ struct rcu_head rcu;
+};
+
+struct route4_bucket {
+ /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
+ struct route4_filter __rcu *ht[16 + 16 + 1];
+ struct rcu_head rcu;
+};
+
+struct route4_filter {
+ struct route4_filter __rcu *next;
+ u32 id;
+ int iif;
+
+ struct tcf_result res;
+ struct tcf_exts exts;
+ u32 handle;
+ struct route4_bucket *bkt;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
+};
+
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
+
+static inline int route4_fastmap_hash(u32 id, int iif)
+{
+ return id & 0xF;
+}
+
+static DEFINE_SPINLOCK(fastmap_lock);
+static void
+route4_reset_fastmap(struct route4_head *head)
+{
+ spin_lock_bh(&fastmap_lock);
+ memset(head->fastmap, 0, sizeof(head->fastmap));
+ spin_unlock_bh(&fastmap_lock);
+}
+
+static void
+route4_set_fastmap(struct route4_head *head, u32 id, int iif,
+ struct route4_filter *f)
+{
+ int h = route4_fastmap_hash(id, iif);
+
+ /* fastmap updates must look atomic to aling id, iff, filter */
+ spin_lock_bh(&fastmap_lock);
+ head->fastmap[h].id = id;
+ head->fastmap[h].iif = iif;
+ head->fastmap[h].filter = f;
+ spin_unlock_bh(&fastmap_lock);
+}
+
+static inline int route4_hash_to(u32 id)
+{
+ return id & 0xFF;
+}
+
+static inline int route4_hash_from(u32 id)
+{
+ return (id >> 16) & 0xF;
+}
+
+static inline int route4_hash_iif(int iif)
+{
+ return 16 + ((iif >> 16) & 0xF);
+}
+
+static inline int route4_hash_wild(void)
+{
+ return 32;
+}
+
+#define ROUTE4_APPLY_RESULT() \
+{ \
+ *res = f->res; \
+ if (tcf_exts_is_available(&f->exts)) { \
+ int r = tcf_exts_exec(skb, &f->exts, res); \
+ if (r < 0) { \
+ dont_cache = 1; \
+ continue; \
+ } \
+ return r; \
+ } else if (!dont_cache) \
+ route4_set_fastmap(head, id, iif, f); \
+ return 0; \
+}
+
+static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct route4_head *head = rcu_dereference_bh(tp->root);
+ struct dst_entry *dst;
+ struct route4_bucket *b;
+ struct route4_filter *f;
+ u32 id, h;
+ int iif, dont_cache = 0;
+
+ dst = skb_dst(skb);
+ if (!dst)
+ goto failure;
+
+ id = dst->tclassid;
+ if (head == NULL)
+ goto old_method;
+
+ iif = inet_iif(skb);
+
+ h = route4_fastmap_hash(id, iif);
+
+ spin_lock(&fastmap_lock);
+ if (id == head->fastmap[h].id &&
+ iif == head->fastmap[h].iif &&
+ (f = head->fastmap[h].filter) != NULL) {
+ if (f == ROUTE4_FAILURE) {
+ spin_unlock(&fastmap_lock);
+ goto failure;
+ }
+
+ *res = f->res;
+ spin_unlock(&fastmap_lock);
+ return 0;
+ }
+ spin_unlock(&fastmap_lock);
+
+ h = route4_hash_to(id);
+
+restart:
+ b = rcu_dereference_bh(head->table[h]);
+ if (b) {
+ for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ if (f->id == id)
+ ROUTE4_APPLY_RESULT();
+
+ for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ if (f->iif == iif)
+ ROUTE4_APPLY_RESULT();
+
+ for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
+ f;
+ f = rcu_dereference_bh(f->next))
+ ROUTE4_APPLY_RESULT();
+ }
+ if (h < 256) {
+ h = 256;
+ id &= ~0xFFFF;
+ goto restart;
+ }
+
+ if (!dont_cache)
+ route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
+failure:
+ return -1;
+
+old_method:
+ if (id && (TC_H_MAJ(id) == 0 ||
+ !(TC_H_MAJ(id^tp->q->handle)))) {
+ res->classid = id;
+ res->class = 0;
+ return 0;
+ }
+ return -1;
+}
+
+static inline u32 to_hash(u32 id)
+{
+ u32 h = id & 0xFF;
+
+ if (id & 0x8000)
+ h += 256;
+ return h;
+}
+
+static inline u32 from_hash(u32 id)
+{
+ id &= 0xFFFF;
+ if (id == 0xFFFF)
+ return 32;
+ if (!(id & 0x8000)) {
+ if (id > 255)
+ return 256;
+ return id & 0xF;
+ }
+ return 16 + (id & 0xF);
+}
+
+static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_bucket *b;
+ struct route4_filter *f;
+ unsigned int h1, h2;
+
+ if (!head)
+ return 0;
+
+ h1 = to_hash(handle);
+ if (h1 > 256)
+ return 0;
+
+ h2 = from_hash(handle >> 16);
+ if (h2 > 32)
+ return 0;
+
+ b = rtnl_dereference(head->table[h1]);
+ if (b) {
+ for (f = rtnl_dereference(b->ht[h2]);
+ f;
+ f = rtnl_dereference(f->next))
+ if (f->handle == handle)
+ return (unsigned long)f;
+ }
+ return 0;
+}
+
+static int route4_init(struct tcf_proto *tp)
+{
+ struct route4_head *head;
+
+ head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
+ if (head == NULL)
+ return -ENOBUFS;
+
+ rcu_assign_pointer(tp->root, head);
+ return 0;
+}
+
+static void
+route4_delete_filter(struct rcu_head *head)
+{
+ struct route4_filter *f = container_of(head, struct route4_filter, rcu);
+
+ tcf_exts_destroy(&f->exts);
+ kfree(f);
+}
+
+static bool route4_destroy(struct tcf_proto *tp, bool force)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ int h1, h2;
+
+ if (head == NULL)
+ return true;
+
+ if (!force) {
+ for (h1 = 0; h1 <= 256; h1++) {
+ if (rcu_access_pointer(head->table[h1]))
+ return false;
+ }
+ }
+
+ for (h1 = 0; h1 <= 256; h1++) {
+ struct route4_bucket *b;
+
+ b = rtnl_dereference(head->table[h1]);
+ if (b) {
+ for (h2 = 0; h2 <= 32; h2++) {
+ struct route4_filter *f;
+
+ while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
+ struct route4_filter *next;
+
+ next = rtnl_dereference(f->next);
+ RCU_INIT_POINTER(b->ht[h2], next);
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, route4_delete_filter);
+ }
+ }
+ RCU_INIT_POINTER(head->table[h1], NULL);
+ kfree_rcu(b, rcu);
+ }
+ }
+ RCU_INIT_POINTER(tp->root, NULL);
+ kfree_rcu(head, rcu);
+ return true;
+}
+
+static int route4_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_filter *f = (struct route4_filter *)arg;
+ struct route4_filter __rcu **fp;
+ struct route4_filter *nf;
+ struct route4_bucket *b;
+ unsigned int h = 0;
+ int i;
+
+ if (!head || !f)
+ return -EINVAL;
+
+ h = f->handle;
+ b = f->bkt;
+
+ fp = &b->ht[from_hash(h >> 16)];
+ for (nf = rtnl_dereference(*fp); nf;
+ fp = &nf->next, nf = rtnl_dereference(*fp)) {
+ if (nf == f) {
+ /* unlink it */
+ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
+
+ /* Remove any fastmap lookups that might ref filter
+ * notice we unlink'd the filter so we can't get it
+ * back in the fastmap.
+ */
+ route4_reset_fastmap(head);
+
+ /* Delete it */
+ tcf_unbind_filter(tp, &f->res);
+ call_rcu(&f->rcu, route4_delete_filter);
+
+ /* Strip RTNL protected tree */
+ for (i = 0; i <= 32; i++) {
+ struct route4_filter *rt;
+
+ rt = rtnl_dereference(b->ht[i]);
+ if (rt)
+ return 0;
+ }
+
+ /* OK, session has no flows */
+ RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
+ kfree_rcu(b, rcu);
+
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
+ [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
+ [TCA_ROUTE4_TO] = { .type = NLA_U32 },
+ [TCA_ROUTE4_FROM] = { .type = NLA_U32 },
+ [TCA_ROUTE4_IIF] = { .type = NLA_U32 },
+};
+
+static int route4_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct route4_filter *f,
+ u32 handle, struct route4_head *head,
+ struct nlattr **tb, struct nlattr *est, int new,
+ bool ovr)
+{
+ int err;
+ u32 id = 0, to = 0, nhandle = 0x8000;
+ struct route4_filter *fp;
+ unsigned int h1;
+ struct route4_bucket *b;
+ struct tcf_exts e;
+
+ tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (tb[TCA_ROUTE4_TO]) {
+ if (new && handle & 0x8000)
+ goto errout;
+ to = nla_get_u32(tb[TCA_ROUTE4_TO]);
+ if (to > 0xFF)
+ goto errout;
+ nhandle = to;
+ }
+
+ if (tb[TCA_ROUTE4_FROM]) {
+ if (tb[TCA_ROUTE4_IIF])
+ goto errout;
+ id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
+ if (id > 0xFF)
+ goto errout;
+ nhandle |= id << 16;
+ } else if (tb[TCA_ROUTE4_IIF]) {
+ id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
+ if (id > 0x7FFF)
+ goto errout;
+ nhandle |= (id | 0x8000) << 16;
+ } else
+ nhandle |= 0xFFFF << 16;
+
+ if (handle && new) {
+ nhandle |= handle & 0x7F00;
+ if (nhandle != handle)
+ goto errout;
+ }
+
+ h1 = to_hash(nhandle);
+ b = rtnl_dereference(head->table[h1]);
+ if (!b) {
+ err = -ENOBUFS;
+ b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
+ if (b == NULL)
+ goto errout;
+
+ rcu_assign_pointer(head->table[h1], b);
+ } else {
+ unsigned int h2 = from_hash(nhandle >> 16);
+
+ err = -EEXIST;
+ for (fp = rtnl_dereference(b->ht[h2]);
+ fp;
+ fp = rtnl_dereference(fp->next))
+ if (fp->handle == f->handle)
+ goto errout;
+ }
+
+ if (tb[TCA_ROUTE4_TO])
+ f->id = to;
+
+ if (tb[TCA_ROUTE4_FROM])
+ f->id = to | id<<16;
+ else if (tb[TCA_ROUTE4_IIF])
+ f->iif = id;
+
+ f->handle = nhandle;
+ f->bkt = b;
+ f->tp = tp;
+
+ if (tb[TCA_ROUTE4_CLASSID]) {
+ f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ tcf_exts_change(tp, &f->exts, &e);
+
+ return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int route4_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_filter __rcu **fp;
+ struct route4_filter *fold, *f1, *pfp, *f = NULL;
+ struct route4_bucket *b;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ROUTE4_MAX + 1];
+ unsigned int h, th;
+ int err;
+ bool new = true;
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy);
+ if (err < 0)
+ return err;
+
+ fold = (struct route4_filter *)*arg;
+ if (fold && handle && fold->handle != handle)
+ return -EINVAL;
+
+ err = -ENOBUFS;
+ f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
+ if (!f)
+ goto errout;
+
+ tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ if (fold) {
+ f->id = fold->id;
+ f->iif = fold->iif;
+ f->res = fold->res;
+ f->handle = fold->handle;
+
+ f->tp = fold->tp;
+ f->bkt = fold->bkt;
+ new = false;
+ }
+
+ err = route4_set_parms(net, tp, base, f, handle, head, tb,
+ tca[TCA_RATE], new, ovr);
+ if (err < 0)
+ goto errout;
+
+ h = from_hash(f->handle >> 16);
+ fp = &f->bkt->ht[h];
+ for (pfp = rtnl_dereference(*fp);
+ (f1 = rtnl_dereference(*fp)) != NULL;
+ fp = &f1->next)
+ if (f->handle < f1->handle)
+ break;
+
+ netif_keep_dst(qdisc_dev(tp->q));
+ rcu_assign_pointer(f->next, f1);
+ rcu_assign_pointer(*fp, f);
+
+ if (fold && fold->handle && f->handle != fold->handle) {
+ th = to_hash(fold->handle);
+ h = from_hash(fold->handle >> 16);
+ b = rtnl_dereference(head->table[th]);
+ if (b) {
+ fp = &b->ht[h];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == f) {
+ *fp = f->next;
+ break;
+ }
+ }
+ }
+ }
+
+ route4_reset_fastmap(head);
+ *arg = (unsigned long)f;
+ if (fold) {
+ tcf_unbind_filter(tp, &fold->res);
+ call_rcu(&fold->rcu, route4_delete_filter);
+ }
+ return 0;
+
+errout:
+ kfree(f);
+ return err;
+}
+
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct route4_head *head = rtnl_dereference(tp->root);
+ unsigned int h, h1;
+
+ if (head == NULL)
+ arg->stop = 1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h <= 256; h++) {
+ struct route4_bucket *b = rtnl_dereference(head->table[h]);
+
+ if (b) {
+ for (h1 = 0; h1 <= 32; h1++) {
+ struct route4_filter *f;
+
+ for (f = rtnl_dereference(b->ht[h1]);
+ f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+ }
+}
+
+static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct route4_filter *f = (struct route4_filter *)fh;
+ struct nlattr *nest;
+ u32 id;
+
+ if (f == NULL)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (!(f->handle & 0x8000)) {
+ id = f->id & 0xFF;
+ if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
+ goto nla_put_failure;
+ }
+ if (f->handle & 0x80000000) {
+ if ((f->handle >> 16) != 0xFFFF &&
+ nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
+ goto nla_put_failure;
+ } else {
+ id = f->id >> 16;
+ if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
+ goto nla_put_failure;
+ }
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_route4_ops __read_mostly = {
+ .kind = "route",
+ .classify = route4_classify,
+ .init = route4_init,
+ .destroy = route4_destroy,
+ .get = route4_get,
+ .change = route4_change,
+ .delete = route4_delete,
+ .walk = route4_walk,
+ .dump = route4_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_route4(void)
+{
+ return register_tcf_proto_ops(&cls_route4_ops);
+}
+
+static void __exit exit_route4(void)
+{
+ unregister_tcf_proto_ops(&cls_route4_ops);
+}
+
+module_init(init_route4)
+module_exit(exit_route4)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
new file mode 100644
index 000000000..cbb5e0d60
--- /dev/null
+++ b/net/sched/cls_rsvp.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+#define RSVP_DST_LEN 1
+#define RSVP_ID "rsvp"
+#define RSVP_OPS cls_rsvp_ops
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
new file mode 100644
index 000000000..02fa82792
--- /dev/null
+++ b/net/sched/cls_rsvp.h
@@ -0,0 +1,732 @@
+/*
+ * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+/*
+ Comparing to general packet classification problem,
+ RSVP needs only sevaral relatively simple rules:
+
+ * (dst, protocol) are always specified,
+ so that we are able to hash them.
+ * src may be exact, or may be wildcard, so that
+ we can keep a hash table plus one wildcard entry.
+ * source port (or flow label) is important only if src is given.
+
+ IMPLEMENTATION.
+
+ We use a two level hash table: The top level is keyed by
+ destination address and protocol ID, every bucket contains a list
+ of "rsvp sessions", identified by destination address, protocol and
+ DPI(="Destination Port ID"): triple (key, mask, offset).
+
+ Every bucket has a smaller hash table keyed by source address
+ (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
+ Every bucket is again a list of "RSVP flows", selected by
+ source address and SPI(="Source Port ID" here rather than
+ "security parameter index"): triple (key, mask, offset).
+
+
+ NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
+ and all fragmented packets go to the best-effort traffic class.
+
+
+ NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
+ only one "Generalized Port Identifier". So that for classic
+ ah, esp (and udp,tcp) both *pi should coincide or one of them
+ should be wildcard.
+
+ At first sight, this redundancy is just a waste of CPU
+ resources. But DPI and SPI add the possibility to assign different
+ priorities to GPIs. Look also at note 4 about tunnels below.
+
+
+ NOTE 3. One complication is the case of tunneled packets.
+ We implement it as following: if the first lookup
+ matches a special session with "tunnelhdr" value not zero,
+ flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
+ In this case, we pull tunnelhdr bytes and restart lookup
+ with tunnel ID added to the list of keys. Simple and stupid 8)8)
+ It's enough for PIMREG and IPIP.
+
+
+ NOTE 4. Two GPIs make it possible to parse even GRE packets.
+ F.e. DPI can select ETH_P_IP (and necessary flags to make
+ tunnelhdr correct) in GRE protocol field and SPI matches
+ GRE key. Is it not nice? 8)8)
+
+
+ Well, as result, despite its simplicity, we get a pretty
+ powerful classification engine. */
+
+
+struct rsvp_head {
+ u32 tmap[256/32];
+ u32 hgenerator;
+ u8 tgenerator;
+ struct rsvp_session __rcu *ht[256];
+ struct rcu_head rcu;
+};
+
+struct rsvp_session {
+ struct rsvp_session __rcu *next;
+ __be32 dst[RSVP_DST_LEN];
+ struct tc_rsvp_gpi dpi;
+ u8 protocol;
+ u8 tunnelid;
+ /* 16 (src,sport) hash slots, and one wildcard source slot */
+ struct rsvp_filter __rcu *ht[16 + 1];
+ struct rcu_head rcu;
+};
+
+
+struct rsvp_filter {
+ struct rsvp_filter __rcu *next;
+ __be32 src[RSVP_DST_LEN];
+ struct tc_rsvp_gpi spi;
+ u8 tunnelhdr;
+
+ struct tcf_result res;
+ struct tcf_exts exts;
+
+ u32 handle;
+ struct rsvp_session *sess;
+ struct rcu_head rcu;
+};
+
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+{
+ unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
+
+ h ^= h>>16;
+ h ^= h>>8;
+ return (h ^ protocol ^ tunnelid) & 0xFF;
+}
+
+static inline unsigned int hash_src(__be32 *src)
+{
+ unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
+
+ h ^= h>>16;
+ h ^= h>>8;
+ h ^= h>>4;
+ return h & 0xF;
+}
+
+#define RSVP_APPLY_RESULT() \
+{ \
+ int r = tcf_exts_exec(skb, &f->exts, res); \
+ if (r < 0) \
+ continue; \
+ else if (r > 0) \
+ return r; \
+}
+
+static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct rsvp_head *head = rcu_dereference_bh(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter *f;
+ unsigned int h1, h2;
+ __be32 *dst, *src;
+ u8 protocol;
+ u8 tunnelid = 0;
+ u8 *xprt;
+#if RSVP_DST_LEN == 4
+ struct ipv6hdr *nhptr;
+
+ if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+ return -1;
+ nhptr = ipv6_hdr(skb);
+#else
+ struct iphdr *nhptr;
+
+ if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+ return -1;
+ nhptr = ip_hdr(skb);
+#endif
+
+restart:
+
+#if RSVP_DST_LEN == 4
+ src = &nhptr->saddr.s6_addr32[0];
+ dst = &nhptr->daddr.s6_addr32[0];
+ protocol = nhptr->nexthdr;
+ xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
+#else
+ src = &nhptr->saddr;
+ dst = &nhptr->daddr;
+ protocol = nhptr->protocol;
+ xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+ if (ip_is_fragment(nhptr))
+ return -1;
+#endif
+
+ h1 = hash_dst(dst, protocol, tunnelid);
+ h2 = hash_src(src);
+
+ for (s = rcu_dereference_bh(head->ht[h1]); s;
+ s = rcu_dereference_bh(s->next)) {
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
+ protocol == s->protocol &&
+ !(s->dpi.mask &
+ (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
+#if RSVP_DST_LEN == 4
+ dst[0] == s->dst[0] &&
+ dst[1] == s->dst[1] &&
+ dst[2] == s->dst[2] &&
+#endif
+ tunnelid == s->tunnelid) {
+
+ for (f = rcu_dereference_bh(s->ht[h2]); f;
+ f = rcu_dereference_bh(f->next)) {
+ if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+ !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
+#if RSVP_DST_LEN == 4
+ &&
+ src[0] == f->src[0] &&
+ src[1] == f->src[1] &&
+ src[2] == f->src[2]
+#endif
+ ) {
+ *res = f->res;
+ RSVP_APPLY_RESULT();
+
+matched:
+ if (f->tunnelhdr == 0)
+ return 0;
+
+ tunnelid = f->res.classid;
+ nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
+ goto restart;
+ }
+ }
+
+ /* And wildcard bucket... */
+ for (f = rcu_dereference_bh(s->ht[16]); f;
+ f = rcu_dereference_bh(f->next)) {
+ *res = f->res;
+ RSVP_APPLY_RESULT();
+ goto matched;
+ }
+ return -1;
+ }
+ }
+ return -1;
+}
+
+static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter __rcu **ins;
+ struct rsvp_filter *pins;
+ unsigned int h1 = h & 0xFF;
+ unsigned int h2 = (h >> 8) & 0xFF;
+
+ for (s = rtnl_dereference(head->ht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
+ ins = &pins->next, pins = rtnl_dereference(*ins)) {
+ if (pins->handle == h) {
+ RCU_INIT_POINTER(n->next, pins->next);
+ rcu_assign_pointer(*ins, n);
+ return;
+ }
+ }
+ }
+
+ /* Something went wrong if we are trying to replace a non-existant
+ * node. Mind as well halt instead of silently failing.
+ */
+ BUG_ON(1);
+}
+
+static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_session *s;
+ struct rsvp_filter *f;
+ unsigned int h1 = handle & 0xFF;
+ unsigned int h2 = (handle >> 8) & 0xFF;
+
+ if (h2 > 16)
+ return 0;
+
+ for (s = rtnl_dereference(head->ht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
+ if (f->handle == handle)
+ return (unsigned long)f;
+ }
+ }
+ return 0;
+}
+
+static int rsvp_init(struct tcf_proto *tp)
+{
+ struct rsvp_head *data;
+
+ data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
+ if (data) {
+ rcu_assign_pointer(tp->root, data);
+ return 0;
+ }
+ return -ENOBUFS;
+}
+
+static void
+rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
+{
+ tcf_unbind_filter(tp, &f->res);
+ tcf_exts_destroy(&f->exts);
+ kfree_rcu(f, rcu);
+}
+
+static bool rsvp_destroy(struct tcf_proto *tp, bool force)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ int h1, h2;
+
+ if (data == NULL)
+ return true;
+
+ if (!force) {
+ for (h1 = 0; h1 < 256; h1++) {
+ if (rcu_access_pointer(data->ht[h1]))
+ return false;
+ }
+ }
+
+ RCU_INIT_POINTER(tp->root, NULL);
+
+ for (h1 = 0; h1 < 256; h1++) {
+ struct rsvp_session *s;
+
+ while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
+ RCU_INIT_POINTER(data->ht[h1], s->next);
+
+ for (h2 = 0; h2 <= 16; h2++) {
+ struct rsvp_filter *f;
+
+ while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
+ rcu_assign_pointer(s->ht[h2], f->next);
+ rsvp_delete_filter(tp, f);
+ }
+ }
+ kfree_rcu(s, rcu);
+ }
+ }
+ kfree_rcu(data, rcu);
+ return true;
+}
+
+static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
+ struct rsvp_filter __rcu **fp;
+ unsigned int h = f->handle;
+ struct rsvp_session __rcu **sp;
+ struct rsvp_session *nsp, *s = f->sess;
+ int i;
+
+ fp = &s->ht[(h >> 8) & 0xFF];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
+ if (nfp == f) {
+ RCU_INIT_POINTER(*fp, f->next);
+ rsvp_delete_filter(tp, f);
+
+ /* Strip tree */
+
+ for (i = 0; i <= 16; i++)
+ if (s->ht[i])
+ return 0;
+
+ /* OK, session has no flows */
+ sp = &head->ht[h & 0xFF];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if (nsp == s) {
+ RCU_INIT_POINTER(*sp, s->next);
+ kfree_rcu(s, rcu);
+ return 0;
+ }
+ }
+
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ int i = 0xFFFF;
+
+ while (i-- > 0) {
+ u32 h;
+
+ if ((data->hgenerator += 0x10000) == 0)
+ data->hgenerator = 0x10000;
+ h = data->hgenerator|salt;
+ if (rsvp_get(tp, h) == 0)
+ return h;
+ }
+ return 0;
+}
+
+static int tunnel_bts(struct rsvp_head *data)
+{
+ int n = data->tgenerator >> 5;
+ u32 b = 1 << (data->tgenerator & 0x1F);
+
+ if (data->tmap[n] & b)
+ return 0;
+ data->tmap[n] |= b;
+ return 1;
+}
+
+static void tunnel_recycle(struct rsvp_head *data)
+{
+ struct rsvp_session __rcu **sht = data->ht;
+ u32 tmap[256/32];
+ int h1, h2;
+
+ memset(tmap, 0, sizeof(tmap));
+
+ for (h1 = 0; h1 < 256; h1++) {
+ struct rsvp_session *s;
+ for (s = rtnl_dereference(sht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (h2 = 0; h2 <= 16; h2++) {
+ struct rsvp_filter *f;
+
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
+ if (f->tunnelhdr == 0)
+ continue;
+ data->tgenerator = f->res.classid;
+ tunnel_bts(data);
+ }
+ }
+ }
+ }
+
+ memcpy(data->tmap, tmap, sizeof(tmap));
+}
+
+static u32 gen_tunnel(struct rsvp_head *data)
+{
+ int i, k;
+
+ for (k = 0; k < 2; k++) {
+ for (i = 255; i > 0; i--) {
+ if (++data->tgenerator == 0)
+ data->tgenerator = 1;
+ if (tunnel_bts(data))
+ return data->tgenerator;
+ }
+ tunnel_recycle(data);
+ }
+ return 0;
+}
+
+static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
+ [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
+ [TCA_RSVP_DST] = { .type = NLA_BINARY,
+ .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_SRC] = { .type = NLA_BINARY,
+ .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
+};
+
+static int rsvp_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base,
+ u32 handle,
+ struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ struct rsvp_filter *f, *nfp;
+ struct rsvp_filter __rcu **fp;
+ struct rsvp_session *nsp, *s;
+ struct rsvp_session __rcu **sp;
+ struct tc_rsvp_pinfo *pinfo = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_RSVP_MAX + 1];
+ struct tcf_exts e;
+ unsigned int h1, h2;
+ __be32 *dst;
+ int err;
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy);
+ if (err < 0)
+ return err;
+
+ tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
+ if (err < 0)
+ return err;
+
+ f = (struct rsvp_filter *)*arg;
+ if (f) {
+ /* Node exists: adjust only classid */
+ struct rsvp_filter *n;
+
+ if (f->handle != handle && handle)
+ goto errout2;
+
+ n = kmemdup(f, sizeof(*f), GFP_KERNEL);
+ if (!n) {
+ err = -ENOMEM;
+ goto errout2;
+ }
+
+ tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+
+ if (tb[TCA_RSVP_CLASSID]) {
+ n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+ tcf_bind_filter(tp, &n->res, base);
+ }
+
+ tcf_exts_change(tp, &n->exts, &e);
+ rsvp_replace(tp, n, handle);
+ return 0;
+ }
+
+ /* Now more serious part... */
+ err = -EINVAL;
+ if (handle)
+ goto errout2;
+ if (tb[TCA_RSVP_DST] == NULL)
+ goto errout2;
+
+ err = -ENOBUFS;
+ f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
+ if (f == NULL)
+ goto errout2;
+
+ tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
+ h2 = 16;
+ if (tb[TCA_RSVP_SRC]) {
+ memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
+ h2 = hash_src(f->src);
+ }
+ if (tb[TCA_RSVP_PINFO]) {
+ pinfo = nla_data(tb[TCA_RSVP_PINFO]);
+ f->spi = pinfo->spi;
+ f->tunnelhdr = pinfo->tunnelhdr;
+ }
+ if (tb[TCA_RSVP_CLASSID])
+ f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+
+ dst = nla_data(tb[TCA_RSVP_DST]);
+ h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
+
+ err = -ENOMEM;
+ if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
+ goto errout;
+
+ if (f->tunnelhdr) {
+ err = -EINVAL;
+ if (f->res.classid > 255)
+ goto errout;
+
+ err = -ENOMEM;
+ if (f->res.classid == 0 &&
+ (f->res.classid = gen_tunnel(data)) == 0)
+ goto errout;
+ }
+
+ for (sp = &data->ht[h1];
+ (s = rtnl_dereference(*sp)) != NULL;
+ sp = &s->next) {
+ if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+ pinfo && pinfo->protocol == s->protocol &&
+ memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
+#if RSVP_DST_LEN == 4
+ dst[0] == s->dst[0] &&
+ dst[1] == s->dst[1] &&
+ dst[2] == s->dst[2] &&
+#endif
+ pinfo->tunnelid == s->tunnelid) {
+
+insert:
+ /* OK, we found appropriate session */
+
+ fp = &s->ht[h2];
+
+ f->sess = s;
+ if (f->tunnelhdr == 0)
+ tcf_bind_filter(tp, &f->res, base);
+
+ tcf_exts_change(tp, &f->exts, &e);
+
+ fp = &s->ht[h2];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
+ __u32 mask = nfp->spi.mask & f->spi.mask;
+
+ if (mask != f->spi.mask)
+ break;
+ }
+ RCU_INIT_POINTER(f->next, nfp);
+ rcu_assign_pointer(*fp, f);
+
+ *arg = (unsigned long)f;
+ return 0;
+ }
+ }
+
+ /* No session found. Create new one. */
+
+ err = -ENOBUFS;
+ s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
+ if (s == NULL)
+ goto errout;
+ memcpy(s->dst, dst, sizeof(s->dst));
+
+ if (pinfo) {
+ s->dpi = pinfo->dpi;
+ s->protocol = pinfo->protocol;
+ s->tunnelid = pinfo->tunnelid;
+ }
+ sp = &data->ht[h1];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
+ break;
+ }
+ RCU_INIT_POINTER(s->next, nsp);
+ rcu_assign_pointer(*sp, s);
+
+ goto insert;
+
+errout:
+ kfree(f);
+errout2:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ unsigned int h, h1;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < 256; h++) {
+ struct rsvp_session *s;
+
+ for (s = rtnl_dereference(head->ht[h]); s;
+ s = rtnl_dereference(s->next)) {
+ for (h1 = 0; h1 <= 16; h1++) {
+ struct rsvp_filter *f;
+
+ for (f = rtnl_dereference(s->ht[h1]); f;
+ f = rtnl_dereference(f->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)f, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+ }
+}
+
+static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct rsvp_filter *f = (struct rsvp_filter *)fh;
+ struct rsvp_session *s;
+ struct nlattr *nest;
+ struct tc_rsvp_pinfo pinfo;
+
+ if (f == NULL)
+ return skb->len;
+ s = f->sess;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
+ goto nla_put_failure;
+ pinfo.dpi = s->dpi;
+ pinfo.spi = f->spi;
+ pinfo.protocol = s->protocol;
+ pinfo.tunnelid = s->tunnelid;
+ pinfo.tunnelhdr = f->tunnelhdr;
+ pinfo.pad = 0;
+ if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
+ goto nla_put_failure;
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
+ goto nla_put_failure;
+ if (((f->handle >> 8) & 0xFF) != 16 &&
+ nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &f->exts) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &f->exts) < 0)
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops RSVP_OPS __read_mostly = {
+ .kind = RSVP_ID,
+ .classify = rsvp_classify,
+ .init = rsvp_init,
+ .destroy = rsvp_destroy,
+ .get = rsvp_get,
+ .change = rsvp_change,
+ .delete = rsvp_delete,
+ .walk = rsvp_walk,
+ .dump = rsvp_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_rsvp(void)
+{
+ return register_tcf_proto_ops(&RSVP_OPS);
+}
+
+static void __exit exit_rsvp(void)
+{
+ unregister_tcf_proto_ops(&RSVP_OPS);
+}
+
+module_init(init_rsvp)
+module_exit(exit_rsvp)
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
new file mode 100644
index 000000000..dd08aea2a
--- /dev/null
+++ b/net/sched/cls_rsvp6.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ipv6.h>
+#include <linux/skbuff.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <net/netlink.h>
+
+#define RSVP_DST_LEN 4
+#define RSVP_ID "rsvp6"
+#define RSVP_OPS cls_rsvp6_ops
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
new file mode 100644
index 000000000..a557dbaf5
--- /dev/null
+++ b/net/sched/cls_tcindex.c
@@ -0,0 +1,578 @@
+/*
+ * net/sched/cls_tcindex.c Packet classifier for skb->tc_index
+ *
+ * Written 1998,1999 by Werner Almesberger, EPFL ICA
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+
+/*
+ * Passing parameters to the root seems to be done more awkwardly than really
+ * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
+ * verified. FIXME.
+ */
+
+#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
+#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+
+
+struct tcindex_filter_result {
+ struct tcf_exts exts;
+ struct tcf_result res;
+};
+
+struct tcindex_filter {
+ u16 key;
+ struct tcindex_filter_result result;
+ struct tcindex_filter __rcu *next;
+ struct rcu_head rcu;
+};
+
+
+struct tcindex_data {
+ struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
+ struct tcindex_filter __rcu **h; /* imperfect hash; */
+ struct tcf_proto *tp;
+ u16 mask; /* AND key with mask */
+ u32 shift; /* shift ANDed key to the right */
+ u32 hash; /* hash table size; 0 if undefined */
+ u32 alloc_hash; /* allocated size */
+ u32 fall_through; /* 0: only classify if explicit match */
+ struct rcu_head rcu;
+};
+
+static inline int
+tcindex_filter_is_set(struct tcindex_filter_result *r)
+{
+ return tcf_exts_is_predicative(&r->exts) || r->res.classid;
+}
+
+static struct tcindex_filter_result *
+tcindex_lookup(struct tcindex_data *p, u16 key)
+{
+ if (p->perfect) {
+ struct tcindex_filter_result *f = p->perfect + key;
+
+ return tcindex_filter_is_set(f) ? f : NULL;
+ } else if (p->h) {
+ struct tcindex_filter __rcu **fp;
+ struct tcindex_filter *f;
+
+ fp = &p->h[key % p->hash];
+ for (f = rcu_dereference_bh_rtnl(*fp);
+ f;
+ fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
+ if (f->key == key)
+ return &f->result;
+ }
+
+ return NULL;
+}
+
+
+static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ struct tcindex_data *p = rcu_dereference_bh(tp->root);
+ struct tcindex_filter_result *f;
+ int key = (skb->tc_index & p->mask) >> p->shift;
+
+ pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
+ skb, tp, res, p);
+
+ f = tcindex_lookup(p, key);
+ if (!f) {
+ if (!p->fall_through)
+ return -1;
+ res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
+ res->class = 0;
+ pr_debug("alg 0x%x\n", res->classid);
+ return 0;
+ }
+ *res = f->res;
+ pr_debug("map 0x%x\n", res->classid);
+
+ return tcf_exts_exec(skb, &f->exts, res);
+}
+
+
+static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r;
+
+ pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
+ if (p->perfect && handle >= p->alloc_hash)
+ return 0;
+ r = tcindex_lookup(p, handle);
+ return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL;
+}
+
+static int tcindex_init(struct tcf_proto *tp)
+{
+ struct tcindex_data *p;
+
+ pr_debug("tcindex_init(tp %p)\n", tp);
+ p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->mask = 0xffff;
+ p->hash = DEFAULT_HASH_SIZE;
+ p->fall_through = 1;
+
+ rcu_assign_pointer(tp->root, p);
+ return 0;
+}
+
+static int
+tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
+ struct tcindex_filter __rcu **walk;
+ struct tcindex_filter *f = NULL;
+
+ pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p);
+ if (p->perfect) {
+ if (!r->res.class)
+ return -ENOENT;
+ } else {
+ int i;
+
+ for (i = 0; i < p->hash; i++) {
+ walk = p->h + i;
+ for (f = rtnl_dereference(*walk); f;
+ walk = &f->next, f = rtnl_dereference(*walk)) {
+ if (&f->result == r)
+ goto found;
+ }
+ }
+ return -ENOENT;
+
+found:
+ rcu_assign_pointer(*walk, rtnl_dereference(f->next));
+ }
+ tcf_unbind_filter(tp, &r->res);
+ tcf_exts_destroy(&r->exts);
+ if (f)
+ kfree_rcu(f, rcu);
+ return 0;
+}
+
+static int tcindex_destroy_element(struct tcf_proto *tp,
+ unsigned long arg,
+ struct tcf_walker *walker)
+{
+ return tcindex_delete(tp, arg);
+}
+
+static void __tcindex_destroy(struct rcu_head *head)
+{
+ struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+}
+
+static inline int
+valid_perfect_hash(struct tcindex_data *p)
+{
+ return p->hash > (p->mask >> p->shift);
+}
+
+static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
+ [TCA_TCINDEX_HASH] = { .type = NLA_U32 },
+ [TCA_TCINDEX_MASK] = { .type = NLA_U16 },
+ [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
+ [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
+ [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
+};
+
+static void tcindex_filter_result_init(struct tcindex_filter_result *r)
+{
+ memset(r, 0, sizeof(*r));
+ tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+}
+
+static void __tcindex_partial_destroy(struct rcu_head *head)
+{
+ struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+ kfree(p->perfect);
+ kfree(p);
+}
+
+static int
+tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
+ u32 handle, struct tcindex_data *p,
+ struct tcindex_filter_result *r, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ int err, balloc = 0;
+ struct tcindex_filter_result new_filter_result, *old_r = r;
+ struct tcindex_filter_result cr;
+ struct tcindex_data *cp, *oldp;
+ struct tcindex_filter *f = NULL; /* make gcc behave */
+ struct tcf_exts e;
+
+ tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ err = -ENOMEM;
+ /* tcindex_data attributes must look atomic to classifier/lookup so
+ * allocate new tcindex data and RCU assign it onto root. Keeping
+ * perfect hash and hash pointers from old data.
+ */
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp)
+ goto errout;
+
+ cp->mask = p->mask;
+ cp->shift = p->shift;
+ cp->hash = p->hash;
+ cp->alloc_hash = p->alloc_hash;
+ cp->fall_through = p->fall_through;
+ cp->tp = tp;
+
+ if (p->perfect) {
+ int i;
+
+ cp->perfect = kmemdup(p->perfect,
+ sizeof(*r) * cp->hash, GFP_KERNEL);
+ if (!cp->perfect)
+ goto errout;
+ for (i = 0; i < cp->hash; i++)
+ tcf_exts_init(&cp->perfect[i].exts,
+ TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
+ balloc = 1;
+ }
+ cp->h = p->h;
+
+ tcindex_filter_result_init(&new_filter_result);
+ tcindex_filter_result_init(&cr);
+ if (old_r)
+ cr.res = r->res;
+
+ if (tb[TCA_TCINDEX_HASH])
+ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+
+ if (tb[TCA_TCINDEX_MASK])
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+
+ if (tb[TCA_TCINDEX_SHIFT])
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+
+ err = -EBUSY;
+
+ /* Hash already allocated, make sure that we still meet the
+ * requirements for the allocated hash.
+ */
+ if (cp->perfect) {
+ if (!valid_perfect_hash(cp) ||
+ cp->hash > cp->alloc_hash)
+ goto errout_alloc;
+ } else if (cp->h && cp->hash != cp->alloc_hash) {
+ goto errout_alloc;
+ }
+
+ err = -EINVAL;
+ if (tb[TCA_TCINDEX_FALL_THROUGH])
+ cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
+
+ if (!cp->hash) {
+ /* Hash not specified, use perfect hash if the upper limit
+ * of the hashing index is below the threshold.
+ */
+ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+ cp->hash = (cp->mask >> cp->shift) + 1;
+ else
+ cp->hash = DEFAULT_HASH_SIZE;
+ }
+
+ if (!cp->perfect && !cp->h)
+ cp->alloc_hash = cp->hash;
+
+ /* Note: this could be as restrictive as if (handle & ~(mask >> shift))
+ * but then, we'd fail handles that may become valid after some future
+ * mask change. While this is extremely unlikely to ever matter,
+ * the check below is safer (and also more backwards-compatible).
+ */
+ if (cp->perfect || valid_perfect_hash(cp))
+ if (handle >= cp->alloc_hash)
+ goto errout_alloc;
+
+
+ err = -ENOMEM;
+ if (!cp->perfect && !cp->h) {
+ if (valid_perfect_hash(cp)) {
+ int i;
+
+ cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
+ if (!cp->perfect)
+ goto errout_alloc;
+ for (i = 0; i < cp->hash; i++)
+ tcf_exts_init(&cp->perfect[i].exts,
+ TCA_TCINDEX_ACT,
+ TCA_TCINDEX_POLICE);
+ balloc = 1;
+ } else {
+ struct tcindex_filter __rcu **hash;
+
+ hash = kcalloc(cp->hash,
+ sizeof(struct tcindex_filter *),
+ GFP_KERNEL);
+
+ if (!hash)
+ goto errout_alloc;
+
+ cp->h = hash;
+ balloc = 2;
+ }
+ }
+
+ if (cp->perfect)
+ r = cp->perfect + handle;
+ else
+ r = tcindex_lookup(cp, handle) ? : &new_filter_result;
+
+ if (r == &new_filter_result) {
+ f = kzalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ goto errout_alloc;
+ f->key = handle;
+ tcindex_filter_result_init(&f->result);
+ f->next = NULL;
+ }
+
+ if (tb[TCA_TCINDEX_CLASSID]) {
+ cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
+ tcf_bind_filter(tp, &cr.res, base);
+ }
+
+ if (old_r)
+ tcf_exts_change(tp, &r->exts, &e);
+ else
+ tcf_exts_change(tp, &cr.exts, &e);
+
+ if (old_r && old_r != r)
+ tcindex_filter_result_init(old_r);
+
+ oldp = p;
+ r->res = cr.res;
+ rcu_assign_pointer(tp->root, cp);
+
+ if (r == &new_filter_result) {
+ struct tcindex_filter *nfp;
+ struct tcindex_filter __rcu **fp;
+
+ tcf_exts_change(tp, &f->result.exts, &r->exts);
+
+ fp = cp->h + (handle % cp->hash);
+ for (nfp = rtnl_dereference(*fp);
+ nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp))
+ ; /* nothing */
+
+ rcu_assign_pointer(*fp, f);
+ }
+
+ if (oldp)
+ call_rcu(&oldp->rcu, __tcindex_partial_destroy);
+ return 0;
+
+errout_alloc:
+ if (balloc == 1)
+ kfree(cp->perfect);
+ else if (balloc == 2)
+ kfree(cp->h);
+errout:
+ kfree(cp);
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static int
+tcindex_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca, unsigned long *arg, bool ovr)
+{
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_TCINDEX_MAX + 1];
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
+ int err;
+
+ pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
+ "p %p,r %p,*arg 0x%lx\n",
+ tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
+
+ if (!opt)
+ return 0;
+
+ err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy);
+ if (err < 0)
+ return err;
+
+ return tcindex_set_parms(net, tp, base, handle, p, r, tb,
+ tca[TCA_RATE], ovr);
+}
+
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter *f, *next;
+ int i;
+
+ pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
+ if (p->perfect) {
+ for (i = 0; i < p->hash; i++) {
+ if (!p->perfect[i].res.class)
+ continue;
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp,
+ (unsigned long) (p->perfect+i), walker)
+ < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+ if (!p->h)
+ return;
+ for (i = 0; i < p->hash; i++) {
+ for (f = rtnl_dereference(p->h[i]); f; f = next) {
+ next = rtnl_dereference(f->next);
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp, (unsigned long) &f->result,
+ walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+}
+
+static bool tcindex_destroy(struct tcf_proto *tp, bool force)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcf_walker walker;
+
+ if (!force)
+ return false;
+
+ pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
+ walker.count = 0;
+ walker.skip = 0;
+ walker.fn = tcindex_destroy_element;
+ tcindex_walk(tp, &walker);
+
+ RCU_INIT_POINTER(tp->root, NULL);
+ call_rcu(&p->rcu, __tcindex_destroy);
+ return true;
+}
+
+
+static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct tcindex_data *p = rtnl_dereference(tp->root);
+ struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
+ struct nlattr *nest;
+
+ pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p\n",
+ tp, fh, skb, t, p, r);
+ pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (!fh) {
+ t->tcm_handle = ~0; /* whatever ... */
+ if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
+ nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
+ nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
+ nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+ } else {
+ if (p->perfect) {
+ t->tcm_handle = r - p->perfect;
+ } else {
+ struct tcindex_filter *f;
+ struct tcindex_filter __rcu **fp;
+ int i;
+
+ t->tcm_handle = 0;
+ for (i = 0; !t->tcm_handle && i < p->hash; i++) {
+ fp = &p->h[i];
+ for (f = rtnl_dereference(*fp);
+ !t->tcm_handle && f;
+ fp = &f->next, f = rtnl_dereference(*fp)) {
+ if (&f->result == r)
+ t->tcm_handle = f->key;
+ }
+ }
+ }
+ pr_debug("handle = %d\n", t->tcm_handle);
+ if (r->res.class &&
+ nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
+ goto nla_put_failure;
+
+ if (tcf_exts_dump(skb, &r->exts) < 0)
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ if (tcf_exts_dump_stats(skb, &r->exts) < 0)
+ goto nla_put_failure;
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
+ .kind = "tcindex",
+ .classify = tcindex_classify,
+ .init = tcindex_init,
+ .destroy = tcindex_destroy,
+ .get = tcindex_get,
+ .change = tcindex_change,
+ .delete = tcindex_delete,
+ .walk = tcindex_walk,
+ .dump = tcindex_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_tcindex(void)
+{
+ return register_tcf_proto_ops(&cls_tcindex_ops);
+}
+
+static void __exit exit_tcindex(void)
+{
+ unregister_tcf_proto_ops(&cls_tcindex_ops);
+}
+
+module_init(init_tcindex)
+module_exit(exit_tcindex)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
new file mode 100644
index 000000000..cab9e9b43
--- /dev/null
+++ b/net/sched/cls_u32.c
@@ -0,0 +1,1075 @@
+/*
+ * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * The filters are packed to hash tables of key nodes
+ * with a set of 32bit key/mask pairs at every node.
+ * Nodes reference next level hash tables etc.
+ *
+ * This scheme is the best universal classifier I managed to
+ * invent; it is not super-fast, but it is not slow (provided you
+ * program it correctly), and general enough. And its relative
+ * speed grows as the number of rules becomes larger.
+ *
+ * It seems that it represents the best middle point between
+ * speed and manageability both by human and by machine.
+ *
+ * It is especially useful for link sharing combined with QoS;
+ * pure RSVP doesn't need such a general approach and can use
+ * much simpler (and faster) schemes, sort of cls_rsvp.c.
+ *
+ * JHS: We should remove the CONFIG_NET_CLS_IND from here
+ * eventually when the meta match extension is made available
+ *
+ * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/percpu.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/bitmap.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct tc_u_knode {
+ struct tc_u_knode __rcu *next;
+ u32 handle;
+ struct tc_u_hnode __rcu *ht_up;
+ struct tcf_exts exts;
+#ifdef CONFIG_NET_CLS_IND
+ int ifindex;
+#endif
+ u8 fshift;
+ struct tcf_result res;
+ struct tc_u_hnode __rcu *ht_down;
+#ifdef CONFIG_CLS_U32_PERF
+ struct tc_u32_pcnt __percpu *pf;
+#endif
+#ifdef CONFIG_CLS_U32_MARK
+ u32 val;
+ u32 mask;
+ u32 __percpu *pcpu_success;
+#endif
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
+ /* The 'sel' field MUST be the last field in structure to allow for
+ * tc_u32_keys allocated at end of structure.
+ */
+ struct tc_u32_sel sel;
+};
+
+struct tc_u_hnode {
+ struct tc_u_hnode __rcu *next;
+ u32 handle;
+ u32 prio;
+ struct tc_u_common *tp_c;
+ int refcnt;
+ unsigned int divisor;
+ struct rcu_head rcu;
+ /* The 'ht' field MUST be the last field in structure to allow for
+ * more entries allocated at end of structure.
+ */
+ struct tc_u_knode __rcu *ht[1];
+};
+
+struct tc_u_common {
+ struct tc_u_hnode __rcu *hlist;
+ struct Qdisc *q;
+ int refcnt;
+ u32 hgenerator;
+ struct rcu_head rcu;
+};
+
+static inline unsigned int u32_hash_fold(__be32 key,
+ const struct tc_u32_sel *sel,
+ u8 fshift)
+{
+ unsigned int h = ntohl(key & sel->hmask) >> fshift;
+
+ return h;
+}
+
+static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res)
+{
+ struct {
+ struct tc_u_knode *knode;
+ unsigned int off;
+ } stack[TC_U32_MAXDEPTH];
+
+ struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
+ unsigned int off = skb_network_offset(skb);
+ struct tc_u_knode *n;
+ int sdepth = 0;
+ int off2 = 0;
+ int sel = 0;
+#ifdef CONFIG_CLS_U32_PERF
+ int j;
+#endif
+ int i, r;
+
+next_ht:
+ n = rcu_dereference_bh(ht->ht[sel]);
+
+next_knode:
+ if (n) {
+ struct tc_u32_key *key = n->sel.keys;
+
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->rcnt);
+ j = 0;
+#endif
+
+#ifdef CONFIG_CLS_U32_MARK
+ if ((skb->mark & n->mask) != n->val) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ } else {
+ __this_cpu_inc(*n->pcpu_success);
+ }
+#endif
+
+ for (i = n->sel.nkeys; i > 0; i--, key++) {
+ int toff = off + key->off + (off2 & key->offmask);
+ __be32 *data, hdata;
+
+ if (skb_headroom(skb) + toff > INT_MAX)
+ goto out;
+
+ data = skb_header_pointer(skb, toff, 4, &hdata);
+ if (!data)
+ goto out;
+ if ((*data ^ key->val) & key->mask) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->kcnts[j]);
+ j++;
+#endif
+ }
+
+ ht = rcu_dereference_bh(n->ht_down);
+ if (!ht) {
+check_terminal:
+ if (n->sel.flags & TC_U32_TERMINAL) {
+
+ *res = n->res;
+#ifdef CONFIG_NET_CLS_IND
+ if (!tcf_match_indev(skb, n->ifindex)) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+ __this_cpu_inc(n->pf->rhit);
+#endif
+ r = tcf_exts_exec(skb, &n->exts, res);
+ if (r < 0) {
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
+ return r;
+ }
+ n = rcu_dereference_bh(n->next);
+ goto next_knode;
+ }
+
+ /* PUSH */
+ if (sdepth >= TC_U32_MAXDEPTH)
+ goto deadloop;
+ stack[sdepth].knode = n;
+ stack[sdepth].off = off;
+ sdepth++;
+
+ ht = rcu_dereference_bh(n->ht_down);
+ sel = 0;
+ if (ht->divisor) {
+ __be32 *data, hdata;
+
+ data = skb_header_pointer(skb, off + n->sel.hoff, 4,
+ &hdata);
+ if (!data)
+ goto out;
+ sel = ht->divisor & u32_hash_fold(*data, &n->sel,
+ n->fshift);
+ }
+ if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
+ goto next_ht;
+
+ if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
+ off2 = n->sel.off + 3;
+ if (n->sel.flags & TC_U32_VAROFFSET) {
+ __be16 *data, hdata;
+
+ data = skb_header_pointer(skb,
+ off + n->sel.offoff,
+ 2, &hdata);
+ if (!data)
+ goto out;
+ off2 += ntohs(n->sel.offmask & *data) >>
+ n->sel.offshift;
+ }
+ off2 &= ~3;
+ }
+ if (n->sel.flags & TC_U32_EAT) {
+ off += off2;
+ off2 = 0;
+ }
+
+ if (off < skb->len)
+ goto next_ht;
+ }
+
+ /* POP */
+ if (sdepth--) {
+ n = stack[sdepth].knode;
+ ht = rcu_dereference_bh(n->ht_up);
+ off = stack[sdepth].off;
+ goto check_terminal;
+ }
+out:
+ return -1;
+
+deadloop:
+ net_warn_ratelimited("cls_u32: dead loop\n");
+ return -1;
+}
+
+static struct tc_u_hnode *
+u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+{
+ struct tc_u_hnode *ht;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next))
+ if (ht->handle == handle)
+ break;
+
+ return ht;
+}
+
+static struct tc_u_knode *
+u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+{
+ unsigned int sel;
+ struct tc_u_knode *n = NULL;
+
+ sel = TC_U32_HASH(handle);
+ if (sel > ht->divisor)
+ goto out;
+
+ for (n = rtnl_dereference(ht->ht[sel]);
+ n;
+ n = rtnl_dereference(n->next))
+ if (n->handle == handle)
+ break;
+out:
+ return n;
+}
+
+
+static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
+{
+ struct tc_u_hnode *ht;
+ struct tc_u_common *tp_c = tp->data;
+
+ if (TC_U32_HTID(handle) == TC_U32_ROOT)
+ ht = rtnl_dereference(tp->root);
+ else
+ ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
+
+ if (!ht)
+ return 0;
+
+ if (TC_U32_KEY(handle) == 0)
+ return (unsigned long)ht;
+
+ return (unsigned long)u32_lookup_key(ht, handle);
+}
+
+static u32 gen_new_htid(struct tc_u_common *tp_c)
+{
+ int i = 0x800;
+
+ /* hgenerator only used inside rtnl lock it is safe to increment
+ * without read _copy_ update semantics
+ */
+ do {
+ if (++tp_c->hgenerator == 0x7FF)
+ tp_c->hgenerator = 1;
+ } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
+
+ return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
+}
+
+static int u32_init(struct tcf_proto *tp)
+{
+ struct tc_u_hnode *root_ht;
+ struct tc_u_common *tp_c;
+
+ tp_c = tp->q->u32_node;
+
+ root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+ if (root_ht == NULL)
+ return -ENOBUFS;
+
+ root_ht->divisor = 0;
+ root_ht->refcnt++;
+ root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
+ root_ht->prio = tp->prio;
+
+ if (tp_c == NULL) {
+ tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
+ if (tp_c == NULL) {
+ kfree(root_ht);
+ return -ENOBUFS;
+ }
+ tp_c->q = tp->q;
+ tp->q->u32_node = tp_c;
+ }
+
+ tp_c->refcnt++;
+ RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, root_ht);
+ root_ht->tp_c = tp_c;
+
+ rcu_assign_pointer(tp->root, root_ht);
+ tp->data = tp_c;
+ return 0;
+}
+
+static int u32_destroy_key(struct tcf_proto *tp,
+ struct tc_u_knode *n,
+ bool free_pf)
+{
+ tcf_exts_destroy(&n->exts);
+ if (n->ht_down)
+ n->ht_down->refcnt--;
+#ifdef CONFIG_CLS_U32_PERF
+ if (free_pf)
+ free_percpu(n->pf);
+#endif
+#ifdef CONFIG_CLS_U32_MARK
+ if (free_pf)
+ free_percpu(n->pcpu_success);
+#endif
+ kfree(n);
+ return 0;
+}
+
+/* u32_delete_key_rcu should be called when free'ing a copied
+ * version of a tc_u_knode obtained from u32_init_knode(). When
+ * copies are obtained from u32_init_knode() the statistics are
+ * shared between the old and new copies to allow readers to
+ * continue to update the statistics during the copy. To support
+ * this the u32_delete_key_rcu variant does not free the percpu
+ * statistics.
+ */
+static void u32_delete_key_rcu(struct rcu_head *rcu)
+{
+ struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
+
+ u32_destroy_key(key->tp, key, false);
+}
+
+/* u32_delete_key_freepf_rcu is the rcu callback variant
+ * that free's the entire structure including the statistics
+ * percpu variables. Only use this if the key is not a copy
+ * returned by u32_init_knode(). See u32_delete_key_rcu()
+ * for the variant that should be used with keys return from
+ * u32_init_knode()
+ */
+static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
+{
+ struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
+
+ u32_destroy_key(key->tp, key, true);
+}
+
+static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
+{
+ struct tc_u_knode __rcu **kp;
+ struct tc_u_knode *pkp;
+ struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
+
+ if (ht) {
+ kp = &ht->ht[TC_U32_HASH(key->handle)];
+ for (pkp = rtnl_dereference(*kp); pkp;
+ kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
+ if (pkp == key) {
+ RCU_INIT_POINTER(*kp, key->next);
+
+ tcf_unbind_filter(tp, &key->res);
+ call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
+ return 0;
+ }
+ }
+ }
+ WARN_ON(1);
+ return 0;
+}
+
+static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+{
+ struct tc_u_knode *n;
+ unsigned int h;
+
+ for (h = 0; h <= ht->divisor; h++) {
+ while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
+ RCU_INIT_POINTER(ht->ht[h],
+ rtnl_dereference(n->next));
+ tcf_unbind_filter(tp, &n->res);
+ call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
+ }
+ }
+}
+
+static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode __rcu **hn;
+ struct tc_u_hnode *phn;
+
+ WARN_ON(ht->refcnt);
+
+ u32_clear_hnode(tp, ht);
+
+ hn = &tp_c->hlist;
+ for (phn = rtnl_dereference(*hn);
+ phn;
+ hn = &phn->next, phn = rtnl_dereference(*hn)) {
+ if (phn == ht) {
+ RCU_INIT_POINTER(*hn, ht->next);
+ kfree_rcu(ht, rcu);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static bool ht_empty(struct tc_u_hnode *ht)
+{
+ unsigned int h;
+
+ for (h = 0; h <= ht->divisor; h++)
+ if (rcu_access_pointer(ht->ht[h]))
+ return false;
+
+ return true;
+}
+
+static bool u32_destroy(struct tcf_proto *tp, bool force)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
+
+ WARN_ON(root_ht == NULL);
+
+ if (!force) {
+ if (root_ht) {
+ if (root_ht->refcnt > 1)
+ return false;
+ if (root_ht->refcnt == 1) {
+ if (!ht_empty(root_ht))
+ return false;
+ }
+ }
+ }
+
+ if (root_ht && --root_ht->refcnt == 0)
+ u32_destroy_hnode(tp, root_ht);
+
+ if (--tp_c->refcnt == 0) {
+ struct tc_u_hnode *ht;
+
+ tp->q->u32_node = NULL;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
+ ht->refcnt--;
+ u32_clear_hnode(tp, ht);
+ }
+
+ while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
+ RCU_INIT_POINTER(tp_c->hlist, ht->next);
+ kfree_rcu(ht, rcu);
+ }
+
+ kfree(tp_c);
+ }
+
+ tp->data = NULL;
+ return true;
+}
+
+static int u32_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
+
+ if (ht == NULL)
+ return 0;
+
+ if (TC_U32_KEY(ht->handle))
+ return u32_delete_key(tp, (struct tc_u_knode *)ht);
+
+ if (root_ht == ht)
+ return -EINVAL;
+
+ if (ht->refcnt == 1) {
+ ht->refcnt--;
+ u32_destroy_hnode(tp, ht);
+ } else {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+#define NR_U32_NODE (1<<12)
+static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
+{
+ struct tc_u_knode *n;
+ unsigned long i;
+ unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!bitmap)
+ return handle | 0xFFF;
+
+ for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
+ n;
+ n = rtnl_dereference(n->next))
+ set_bit(TC_U32_NODE(n->handle), bitmap);
+
+ i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
+ if (i >= NR_U32_NODE)
+ i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
+
+ kfree(bitmap);
+ return handle | (i >= NR_U32_NODE ? 0xFFF : i);
+}
+
+static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
+ [TCA_U32_CLASSID] = { .type = NLA_U32 },
+ [TCA_U32_HASH] = { .type = NLA_U32 },
+ [TCA_U32_LINK] = { .type = NLA_U32 },
+ [TCA_U32_DIVISOR] = { .type = NLA_U32 },
+ [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) },
+ [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
+ [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) },
+};
+
+static int u32_set_parms(struct net *net, struct tcf_proto *tp,
+ unsigned long base, struct tc_u_hnode *ht,
+ struct tc_u_knode *n, struct nlattr **tb,
+ struct nlattr *est, bool ovr)
+{
+ int err;
+ struct tcf_exts e;
+
+ tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
+ err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (tb[TCA_U32_LINK]) {
+ u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
+ struct tc_u_hnode *ht_down = NULL, *ht_old;
+
+ if (TC_U32_KEY(handle))
+ goto errout;
+
+ if (handle) {
+ ht_down = u32_lookup_ht(ht->tp_c, handle);
+
+ if (ht_down == NULL)
+ goto errout;
+ ht_down->refcnt++;
+ }
+
+ ht_old = rtnl_dereference(n->ht_down);
+ rcu_assign_pointer(n->ht_down, ht_down);
+
+ if (ht_old)
+ ht_old->refcnt--;
+ }
+ if (tb[TCA_U32_CLASSID]) {
+ n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
+ tcf_bind_filter(tp, &n->res, base);
+ }
+
+#ifdef CONFIG_NET_CLS_IND
+ if (tb[TCA_U32_INDEV]) {
+ int ret;
+ ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
+ if (ret < 0)
+ goto errout;
+ n->ifindex = ret;
+ }
+#endif
+ tcf_exts_change(tp, &n->exts, &e);
+
+ return 0;
+errout:
+ tcf_exts_destroy(&e);
+ return err;
+}
+
+static void u32_replace_knode(struct tcf_proto *tp,
+ struct tc_u_common *tp_c,
+ struct tc_u_knode *n)
+{
+ struct tc_u_knode __rcu **ins;
+ struct tc_u_knode *pins;
+ struct tc_u_hnode *ht;
+
+ if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
+ ht = rtnl_dereference(tp->root);
+ else
+ ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));
+
+ ins = &ht->ht[TC_U32_HASH(n->handle)];
+
+ /* The node must always exist for it to be replaced if this is not the
+ * case then something went very wrong elsewhere.
+ */
+ for (pins = rtnl_dereference(*ins); ;
+ ins = &pins->next, pins = rtnl_dereference(*ins))
+ if (pins->handle == n->handle)
+ break;
+
+ RCU_INIT_POINTER(n->next, pins->next);
+ rcu_assign_pointer(*ins, n);
+}
+
+static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
+ struct tc_u_knode *n)
+{
+ struct tc_u_knode *new;
+ struct tc_u32_sel *s = &n->sel;
+
+ new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
+ GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ RCU_INIT_POINTER(new->next, n->next);
+ new->handle = n->handle;
+ RCU_INIT_POINTER(new->ht_up, n->ht_up);
+
+#ifdef CONFIG_NET_CLS_IND
+ new->ifindex = n->ifindex;
+#endif
+ new->fshift = n->fshift;
+ new->res = n->res;
+ RCU_INIT_POINTER(new->ht_down, n->ht_down);
+
+ /* bump reference count as long as we hold pointer to structure */
+ if (new->ht_down)
+ new->ht_down->refcnt++;
+
+#ifdef CONFIG_CLS_U32_PERF
+ /* Statistics may be incremented by readers during update
+ * so we must keep them in tact. When the node is later destroyed
+ * a special destroy call must be made to not free the pf memory.
+ */
+ new->pf = n->pf;
+#endif
+
+#ifdef CONFIG_CLS_U32_MARK
+ new->val = n->val;
+ new->mask = n->mask;
+ /* Similarly success statistics must be moved as pointers */
+ new->pcpu_success = n->pcpu_success;
+#endif
+ new->tp = tp;
+ memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+
+ tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE);
+
+ return new;
+}
+
+static int u32_change(struct net *net, struct sk_buff *in_skb,
+ struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct nlattr **tca,
+ unsigned long *arg, bool ovr)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ struct tc_u32_sel *s;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_U32_MAX + 1];
+ u32 htid;
+ int err;
+#ifdef CONFIG_CLS_U32_PERF
+ size_t size;
+#endif
+
+ if (opt == NULL)
+ return handle ? -EINVAL : 0;
+
+ err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
+ if (err < 0)
+ return err;
+
+ n = (struct tc_u_knode *)*arg;
+ if (n) {
+ struct tc_u_knode *new;
+
+ if (TC_U32_KEY(n->handle) == 0)
+ return -EINVAL;
+
+ new = u32_init_knode(tp, n);
+ if (!new)
+ return -ENOMEM;
+
+ err = u32_set_parms(net, tp, base,
+ rtnl_dereference(n->ht_up), new, tb,
+ tca[TCA_RATE], ovr);
+
+ if (err) {
+ u32_destroy_key(tp, new, false);
+ return err;
+ }
+
+ u32_replace_knode(tp, tp_c, new);
+ tcf_unbind_filter(tp, &n->res);
+ call_rcu(&n->rcu, u32_delete_key_rcu);
+ return 0;
+ }
+
+ if (tb[TCA_U32_DIVISOR]) {
+ unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+
+ if (--divisor > 0x100)
+ return -EINVAL;
+ if (TC_U32_KEY(handle))
+ return -EINVAL;
+ if (handle == 0) {
+ handle = gen_new_htid(tp->data);
+ if (handle == 0)
+ return -ENOMEM;
+ }
+ ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+ if (ht == NULL)
+ return -ENOBUFS;
+ ht->tp_c = tp_c;
+ ht->refcnt = 1;
+ ht->divisor = divisor;
+ ht->handle = handle;
+ ht->prio = tp->prio;
+ RCU_INIT_POINTER(ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, ht);
+ *arg = (unsigned long)ht;
+ return 0;
+ }
+
+ if (tb[TCA_U32_HASH]) {
+ htid = nla_get_u32(tb[TCA_U32_HASH]);
+ if (TC_U32_HTID(htid) == TC_U32_ROOT) {
+ ht = rtnl_dereference(tp->root);
+ htid = ht->handle;
+ } else {
+ ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
+ if (ht == NULL)
+ return -EINVAL;
+ }
+ } else {
+ ht = rtnl_dereference(tp->root);
+ htid = ht->handle;
+ }
+
+ if (ht->divisor < TC_U32_HASH(htid))
+ return -EINVAL;
+
+ if (handle) {
+ if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
+ return -EINVAL;
+ handle = htid | TC_U32_NODE(handle);
+ } else
+ handle = gen_new_kid(ht, htid);
+
+ if (tb[TCA_U32_SEL] == NULL)
+ return -EINVAL;
+
+ s = nla_data(tb[TCA_U32_SEL]);
+
+ n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
+ if (n == NULL)
+ return -ENOBUFS;
+
+#ifdef CONFIG_CLS_U32_PERF
+ size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
+ n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
+ if (!n->pf) {
+ kfree(n);
+ return -ENOBUFS;
+ }
+#endif
+
+ memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+ RCU_INIT_POINTER(n->ht_up, ht);
+ n->handle = handle;
+ n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+ tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ n->tp = tp;
+
+#ifdef CONFIG_CLS_U32_MARK
+ n->pcpu_success = alloc_percpu(u32);
+ if (!n->pcpu_success) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ if (tb[TCA_U32_MARK]) {
+ struct tc_u32_mark *mark;
+
+ mark = nla_data(tb[TCA_U32_MARK]);
+ n->val = mark->val;
+ n->mask = mark->mask;
+ }
+#endif
+
+ err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
+ if (err == 0) {
+ struct tc_u_knode __rcu **ins;
+ struct tc_u_knode *pins;
+
+ ins = &ht->ht[TC_U32_HASH(handle)];
+ for (pins = rtnl_dereference(*ins); pins;
+ ins = &pins->next, pins = rtnl_dereference(*ins))
+ if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
+ break;
+
+ RCU_INIT_POINTER(n->next, pins);
+ rcu_assign_pointer(*ins, n);
+
+ *arg = (unsigned long)n;
+ return 0;
+ }
+
+#ifdef CONFIG_CLS_U32_MARK
+ free_percpu(n->pcpu_success);
+errout:
+#endif
+
+#ifdef CONFIG_CLS_U32_PERF
+ free_percpu(n->pf);
+#endif
+ kfree(n);
+ return err;
+}
+
+static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ unsigned int h;
+
+ if (arg->stop)
+ return;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
+ if (ht->prio != tp->prio)
+ continue;
+ if (arg->count >= arg->skip) {
+ if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ }
+ arg->count++;
+ for (h = 0; h <= ht->divisor; h++) {
+ for (n = rtnl_dereference(ht->ht[h]);
+ n;
+ n = rtnl_dereference(n->next)) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(tp, (unsigned long)n, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+ }
+}
+
+static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct tc_u_knode *n = (struct tc_u_knode *)fh;
+ struct tc_u_hnode *ht_up, *ht_down;
+ struct nlattr *nest;
+
+ if (n == NULL)
+ return skb->len;
+
+ t->tcm_handle = n->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (TC_U32_KEY(n->handle) == 0) {
+ struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
+ u32 divisor = ht->divisor + 1;
+
+ if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
+ goto nla_put_failure;
+ } else {
+#ifdef CONFIG_CLS_U32_PERF
+ struct tc_u32_pcnt *gpf;
+ int cpu;
+#endif
+
+ if (nla_put(skb, TCA_U32_SEL,
+ sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+ &n->sel))
+ goto nla_put_failure;
+
+ ht_up = rtnl_dereference(n->ht_up);
+ if (ht_up) {
+ u32 htid = n->handle & 0xFFFFF000;
+ if (nla_put_u32(skb, TCA_U32_HASH, htid))
+ goto nla_put_failure;
+ }
+ if (n->res.classid &&
+ nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
+ goto nla_put_failure;
+
+ ht_down = rtnl_dereference(n->ht_down);
+ if (ht_down &&
+ nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
+ goto nla_put_failure;
+
+#ifdef CONFIG_CLS_U32_MARK
+ if ((n->val || n->mask)) {
+ struct tc_u32_mark mark = {.val = n->val,
+ .mask = n->mask,
+ .success = 0};
+ int cpum;
+
+ for_each_possible_cpu(cpum) {
+ __u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
+
+ mark.success += cnt;
+ }
+
+ if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
+ goto nla_put_failure;
+ }
+#endif
+
+ if (tcf_exts_dump(skb, &n->exts) < 0)
+ goto nla_put_failure;
+
+#ifdef CONFIG_NET_CLS_IND
+ if (n->ifindex) {
+ struct net_device *dev;
+ dev = __dev_get_by_index(net, n->ifindex);
+ if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
+ goto nla_put_failure;
+ }
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+ gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
+ n->sel.nkeys * sizeof(u64),
+ GFP_KERNEL);
+ if (!gpf)
+ goto nla_put_failure;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);
+
+ gpf->rcnt += pf->rcnt;
+ gpf->rhit += pf->rhit;
+ for (i = 0; i < n->sel.nkeys; i++)
+ gpf->kcnts[i] += pf->kcnts[i];
+ }
+
+ if (nla_put(skb, TCA_U32_PCNT,
+ sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
+ gpf)) {
+ kfree(gpf);
+ goto nla_put_failure;
+ }
+ kfree(gpf);
+#endif
+ }
+
+ nla_nest_end(skb, nest);
+
+ if (TC_U32_KEY(n->handle))
+ if (tcf_exts_dump_stats(skb, &n->exts) < 0)
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_u32_ops __read_mostly = {
+ .kind = "u32",
+ .classify = u32_classify,
+ .init = u32_init,
+ .destroy = u32_destroy,
+ .get = u32_get,
+ .change = u32_change,
+ .delete = u32_delete,
+ .walk = u32_walk,
+ .dump = u32_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_u32(void)
+{
+ pr_info("u32 classifier\n");
+#ifdef CONFIG_CLS_U32_PERF
+ pr_info(" Performance counters on\n");
+#endif
+#ifdef CONFIG_NET_CLS_IND
+ pr_info(" input device check on\n");
+#endif
+#ifdef CONFIG_NET_CLS_ACT
+ pr_info(" Actions configured\n");
+#endif
+ return register_tcf_proto_ops(&cls_u32_ops);
+}
+
+static void __exit exit_u32(void)
+{
+ unregister_tcf_proto_ops(&cls_u32_ops);
+}
+
+module_init(init_u32)
+module_exit(exit_u32)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
new file mode 100644
index 000000000..ddd883ca5
--- /dev/null
+++ b/net/sched/em_canid.c
@@ -0,0 +1,233 @@
+/*
+ * em_canid.c Ematch rule to match CAN frames according to their CAN IDs
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright: (c) 2011 Czech Technical University in Prague
+ * (c) 2011 Volkswagen Group Research
+ * Authors: Michal Sojka <sojkam1@fel.cvut.cz>
+ * Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ * Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by: Volkswagen Group Research
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+#include <linux/can.h>
+
+#define EM_CAN_RULES_MAX 500
+
+struct canid_match {
+ /* For each SFF CAN ID (11 bit) there is one record in this bitfield */
+ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
+
+ int rules_count;
+ int sff_rules_count;
+ int eff_rules_count;
+
+ /*
+ * Raw rules copied from netlink message; Used for sending
+ * information to userspace (when 'tc filter show' is invoked)
+ * AND when matching EFF frames
+ */
+ struct can_filter rules_raw[];
+};
+
+/**
+ * em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ */
+static canid_t em_canid_get_id(struct sk_buff *skb)
+{
+ /* CAN ID is stored within the data field */
+ struct can_frame *cf = (struct can_frame *)skb->data;
+
+ return cf->can_id;
+}
+
+static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
+ u32 can_mask)
+{
+ int i;
+
+ /*
+ * Limit can_mask and can_id to SFF range to
+ * protect against write after end of array
+ */
+ can_mask &= CAN_SFF_MASK;
+ can_id &= can_mask;
+
+ /* Single frame */
+ if (can_mask == CAN_SFF_MASK) {
+ set_bit(can_id, cm->match_sff);
+ return;
+ }
+
+ /* All frames */
+ if (can_mask == 0) {
+ bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
+ return;
+ }
+
+ /*
+ * Individual frame filter.
+ * Add record (set bit to 1) for each ID that
+ * conforms particular rule
+ */
+ for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
+ if ((i & can_mask) == can_id)
+ set_bit(i, cm->match_sff);
+ }
+}
+
+static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
+{
+ return (struct canid_match *)m->data;
+}
+
+static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct canid_match *cm = em_canid_priv(m);
+ canid_t can_id;
+ int match = 0;
+ int i;
+ const struct can_filter *lp;
+
+ can_id = em_canid_get_id(skb);
+
+ if (can_id & CAN_EFF_FLAG) {
+ for (i = 0, lp = cm->rules_raw;
+ i < cm->eff_rules_count; i++, lp++) {
+ if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
+ match = 1;
+ break;
+ }
+ }
+ } else { /* SFF */
+ can_id &= CAN_SFF_MASK;
+ match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
+ }
+
+ return match;
+}
+
+static int em_canid_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct can_filter *conf = data; /* Array with rules */
+ struct canid_match *cm;
+ int i;
+
+ if (!len)
+ return -EINVAL;
+
+ if (len % sizeof(struct can_filter))
+ return -EINVAL;
+
+ if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
+ return -EINVAL;
+
+ cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
+ if (!cm)
+ return -ENOMEM;
+
+ cm->rules_count = len / sizeof(struct can_filter);
+
+ /*
+ * We need two for() loops for copying rules into two contiguous
+ * areas in rules_raw to process all eff rules with a simple loop.
+ * NB: The configuration interface supports sff and eff rules.
+ * We do not support filters here that match for the same can_id
+ * provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
+ * For this (unusual case) two filters have to be specified. The
+ * SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
+ */
+
+ /* Fill rules_raw with EFF rules first */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (conf[i].can_id & CAN_EFF_FLAG) {
+ memcpy(cm->rules_raw + cm->eff_rules_count,
+ &conf[i],
+ sizeof(struct can_filter));
+
+ cm->eff_rules_count++;
+ }
+ }
+
+ /* append SFF frame rules */
+ for (i = 0; i < cm->rules_count; i++) {
+ if (!(conf[i].can_id & CAN_EFF_FLAG)) {
+ memcpy(cm->rules_raw
+ + cm->eff_rules_count
+ + cm->sff_rules_count,
+ &conf[i], sizeof(struct can_filter));
+
+ cm->sff_rules_count++;
+
+ em_canid_sff_match_add(cm,
+ conf[i].can_id, conf[i].can_mask);
+ }
+ }
+
+ m->datalen = sizeof(struct canid_match) + len;
+ m->data = (unsigned long)cm;
+ return 0;
+}
+
+static void em_canid_destroy(struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ kfree(cm);
+}
+
+static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct canid_match *cm = em_canid_priv(m);
+
+ /*
+ * When configuring this ematch 'rules_count' is set not to exceed
+ * 'rules_raw' array size
+ */
+ if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
+ &cm->rules_raw) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_canid_ops = {
+ .kind = TCF_EM_CANID,
+ .change = em_canid_change,
+ .match = em_canid_match,
+ .destroy = em_canid_destroy,
+ .dump = em_canid_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_canid_ops.link)
+};
+
+static int __init init_em_canid(void)
+{
+ return tcf_em_register(&em_canid_ops);
+}
+
+static void __exit exit_em_canid(void)
+{
+ tcf_em_unregister(&em_canid_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_canid);
+module_exit(exit_em_canid);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
new file mode 100644
index 000000000..1c8360a27
--- /dev/null
+++ b/net/sched/em_cmp.c
@@ -0,0 +1,99 @@
+/*
+ * net/sched/em_cmp.c Simple packet data comparison ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_cmp.h>
+#include <asm/unaligned.h>
+#include <net/pkt_cls.h>
+
+static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
+{
+ return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
+}
+
+static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
+ unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+ u32 val = 0;
+
+ if (!tcf_valid_offset(skb, ptr, cmp->align))
+ return 0;
+
+ switch (cmp->align) {
+ case TCF_EM_ALIGN_U8:
+ val = *ptr;
+ break;
+
+ case TCF_EM_ALIGN_U16:
+ val = get_unaligned_be16(ptr);
+
+ if (cmp_needs_transformation(cmp))
+ val = be16_to_cpu(val);
+ break;
+
+ case TCF_EM_ALIGN_U32:
+ /* Worth checking boundries? The branching seems
+ * to get worse. Visit again.
+ */
+ val = get_unaligned_be32(ptr);
+
+ if (cmp_needs_transformation(cmp))
+ val = be32_to_cpu(val);
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (cmp->mask)
+ val &= cmp->mask;
+
+ switch (cmp->opnd) {
+ case TCF_EM_OPND_EQ:
+ return val == cmp->val;
+ case TCF_EM_OPND_LT:
+ return val < cmp->val;
+ case TCF_EM_OPND_GT:
+ return val > cmp->val;
+ }
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_cmp_ops = {
+ .kind = TCF_EM_CMP,
+ .datalen = sizeof(struct tcf_em_cmp),
+ .match = em_cmp_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_cmp_ops.link)
+};
+
+static int __init init_em_cmp(void)
+{
+ return tcf_em_register(&em_cmp_ops);
+}
+
+static void __exit exit_em_cmp(void)
+{
+ tcf_em_unregister(&em_cmp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_cmp);
+module_exit(exit_em_cmp);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
new file mode 100644
index 000000000..a3d79c8bf
--- /dev/null
+++ b/net/sched/em_ipset.c
@@ -0,0 +1,135 @@
+/*
+ * net/sched/em_ipset.c ipset ematch
+ *
+ * Copyright (c) 2012 Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_set.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/pkt_cls.h>
+
+static int em_ipset_change(struct net *net, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct xt_set_info *set = data;
+ ip_set_id_t index;
+
+ if (data_len != sizeof(*set))
+ return -EINVAL;
+
+ index = ip_set_nfnl_get_byindex(net, set->index);
+ if (index == IPSET_INVALID_ID)
+ return -ENOENT;
+
+ em->datalen = sizeof(*set);
+ em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+ if (em->data)
+ return 0;
+
+ ip_set_nfnl_put(net, index);
+ return -ENOMEM;
+}
+
+static void em_ipset_destroy(struct tcf_ematch *em)
+{
+ const struct xt_set_info *set = (const void *) em->data;
+ if (set) {
+ ip_set_nfnl_put(em->net, set->index);
+ kfree((void *) em->data);
+ }
+}
+
+static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct ip_set_adt_opt opt;
+ struct xt_action_param acpar;
+ const struct xt_set_info *set = (const void *) em->data;
+ struct net_device *dev, *indev = NULL;
+ int ret, network_offset;
+
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ acpar.family = NFPROTO_IPV4;
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+ acpar.thoff = ip_hdrlen(skb);
+ break;
+ case htons(ETH_P_IPV6):
+ acpar.family = NFPROTO_IPV6;
+ if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+ /* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
+ acpar.thoff = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return 0;
+ }
+
+ acpar.hooknum = 0;
+
+ opt.family = acpar.family;
+ opt.dim = set->dim;
+ opt.flags = set->flags;
+ opt.cmdflags = 0;
+ opt.ext.timeout = ~0u;
+
+ network_offset = skb_network_offset(skb);
+ skb_pull(skb, network_offset);
+
+ dev = skb->dev;
+
+ rcu_read_lock();
+
+ if (dev && skb->skb_iif)
+ indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
+
+ acpar.in = indev ? indev : dev;
+ acpar.out = dev;
+
+ ret = ip_set_test(set->index, skb, &acpar, &opt);
+
+ rcu_read_unlock();
+
+ skb_push(skb, network_offset);
+ return ret;
+}
+
+static struct tcf_ematch_ops em_ipset_ops = {
+ .kind = TCF_EM_IPSET,
+ .change = em_ipset_change,
+ .destroy = em_ipset_destroy,
+ .match = em_ipset_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_ipset_ops.link)
+};
+
+static int __init init_em_ipset(void)
+{
+ return tcf_em_register(&em_ipset_ops);
+}
+
+static void __exit exit_em_ipset(void)
+{
+ tcf_em_unregister(&em_ipset_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("TC extended match for IP sets");
+
+module_init(init_em_ipset);
+module_exit(exit_em_ipset);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
new file mode 100644
index 000000000..b5294ce20
--- /dev/null
+++ b/net/sched/em_meta.c
@@ -0,0 +1,966 @@
+/*
+ * net/sched/em_meta.c Metadata ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * The metadata ematch compares two meta objects where each object
+ * represents either a meta value stored in the kernel or a static
+ * value provided by userspace. The objects are not provided by
+ * userspace itself but rather a definition providing the information
+ * to build them. Every object is of a certain type which must be
+ * equal to the object it is being compared to.
+ *
+ * The definition of a objects conists of the type (meta type), a
+ * identifier (meta id) and additional type specific information.
+ * The meta id is either TCF_META_TYPE_VALUE for values provided by
+ * userspace or a index to the meta operations table consisting of
+ * function pointers to type specific meta data collectors returning
+ * the value of the requested meta value.
+ *
+ * lvalue rvalue
+ * +-----------+ +-----------+
+ * | type: INT | | type: INT |
+ * def | id: DEV | | id: VALUE |
+ * | data: | | data: 3 |
+ * +-----------+ +-----------+
+ * | |
+ * ---> meta_ops[INT][DEV](...) |
+ * | |
+ * ----------- |
+ * V V
+ * +-----------+ +-----------+
+ * | type: INT | | type: INT |
+ * obj | id: DEV | | id: VALUE |
+ * | data: 2 |<--data got filled out | data: 3 |
+ * +-----------+ +-----------+
+ * | |
+ * --------------> 2 equals 3 <--------------
+ *
+ * This is a simplified schema, the complexity varies depending
+ * on the meta type. Obviously, the length of the data must also
+ * be provided for non-numeric types.
+ *
+ * Additionally, type dependent modifiers such as shift operators
+ * or mask may be applied to extend the functionaliy. As of now,
+ * the variable length type supports shifting the byte string to
+ * the right, eating up any number of octets and thus supporting
+ * wildcard interface name comparisons such as "ppp%" matching
+ * ppp0..9.
+ *
+ * NOTE: Certain meta values depend on other subsystems and are
+ * only available if that subsystem is enabled in the kernel.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/if_vlan.h>
+#include <linux/tc_ematch/tc_em_meta.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+struct meta_obj {
+ unsigned long value;
+ unsigned int len;
+};
+
+struct meta_value {
+ struct tcf_meta_val hdr;
+ unsigned long val;
+ unsigned int len;
+};
+
+struct meta_match {
+ struct meta_value lvalue;
+ struct meta_value rvalue;
+};
+
+static inline int meta_id(struct meta_value *v)
+{
+ return TCF_META_ID(v->hdr.kind);
+}
+
+static inline int meta_type(struct meta_value *v)
+{
+ return TCF_META_TYPE(v->hdr.kind);
+}
+
+#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
+ struct tcf_pkt_info *info, struct meta_value *v, \
+ struct meta_obj *dst, int *err)
+
+/**************************************************************************
+ * System status & misc
+ **************************************************************************/
+
+META_COLLECTOR(int_random)
+{
+ get_random_bytes(&dst->value, sizeof(dst->value));
+}
+
+static inline unsigned long fixed_loadavg(int load)
+{
+ int rnd_load = load + (FIXED_1/200);
+ int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
+
+ return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
+}
+
+META_COLLECTOR(int_loadavg_0)
+{
+ dst->value = fixed_loadavg(avenrun[0]);
+}
+
+META_COLLECTOR(int_loadavg_1)
+{
+ dst->value = fixed_loadavg(avenrun[1]);
+}
+
+META_COLLECTOR(int_loadavg_2)
+{
+ dst->value = fixed_loadavg(avenrun[2]);
+}
+
+/**************************************************************************
+ * Device names & indices
+ **************************************************************************/
+
+static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = dev->ifindex;
+ return 0;
+}
+
+static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = (unsigned long) dev->name;
+ dst->len = strlen(dev->name);
+ return 0;
+}
+
+META_COLLECTOR(int_dev)
+{
+ *err = int_dev(skb->dev, dst);
+}
+
+META_COLLECTOR(var_dev)
+{
+ *err = var_dev(skb->dev, dst);
+}
+
+/**************************************************************************
+ * vlan tag
+ **************************************************************************/
+
+META_COLLECTOR(int_vlan_tag)
+{
+ unsigned short tag;
+
+ tag = skb_vlan_tag_get(skb);
+ if (!tag && __vlan_get_tag(skb, &tag))
+ *err = -1;
+ else
+ dst->value = tag;
+}
+
+
+
+/**************************************************************************
+ * skb attributes
+ **************************************************************************/
+
+META_COLLECTOR(int_priority)
+{
+ dst->value = skb->priority;
+}
+
+META_COLLECTOR(int_protocol)
+{
+ /* Let userspace take care of the byte ordering */
+ dst->value = tc_skb_protocol(skb);
+}
+
+META_COLLECTOR(int_pkttype)
+{
+ dst->value = skb->pkt_type;
+}
+
+META_COLLECTOR(int_pktlen)
+{
+ dst->value = skb->len;
+}
+
+META_COLLECTOR(int_datalen)
+{
+ dst->value = skb->data_len;
+}
+
+META_COLLECTOR(int_maclen)
+{
+ dst->value = skb->mac_len;
+}
+
+META_COLLECTOR(int_rxhash)
+{
+ dst->value = skb_get_hash(skb);
+}
+
+/**************************************************************************
+ * Netfilter
+ **************************************************************************/
+
+META_COLLECTOR(int_mark)
+{
+ dst->value = skb->mark;
+}
+
+/**************************************************************************
+ * Traffic Control
+ **************************************************************************/
+
+META_COLLECTOR(int_tcindex)
+{
+ dst->value = skb->tc_index;
+}
+
+/**************************************************************************
+ * Routing
+ **************************************************************************/
+
+META_COLLECTOR(int_rtclassid)
+{
+ if (unlikely(skb_dst(skb) == NULL))
+ *err = -1;
+ else
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ dst->value = skb_dst(skb)->tclassid;
+#else
+ dst->value = 0;
+#endif
+}
+
+META_COLLECTOR(int_rtiif)
+{
+ if (unlikely(skb_rtable(skb) == NULL))
+ *err = -1;
+ else
+ dst->value = inet_iif(skb);
+}
+
+/**************************************************************************
+ * Socket Attributes
+ **************************************************************************/
+
+#define skip_nonlocal(skb) \
+ (unlikely(skb->sk == NULL))
+
+META_COLLECTOR(int_sk_family)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_family;
+}
+
+META_COLLECTOR(int_sk_state)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_state;
+}
+
+META_COLLECTOR(int_sk_reuse)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_reuse;
+}
+
+META_COLLECTOR(int_sk_bound_if)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ /* No error if bound_dev_if is 0, legal userspace check */
+ dst->value = skb->sk->sk_bound_dev_if;
+}
+
+META_COLLECTOR(var_sk_bound_if)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+
+ if (skb->sk->sk_bound_dev_if == 0) {
+ dst->value = (unsigned long) "any";
+ dst->len = 3;
+ } else {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(skb->sk),
+ skb->sk->sk_bound_dev_if);
+ *err = var_dev(dev, dst);
+ rcu_read_unlock();
+ }
+}
+
+META_COLLECTOR(int_sk_refcnt)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = atomic_read(&skb->sk->sk_refcnt);
+}
+
+META_COLLECTOR(int_sk_rcvbuf)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_rcvbuf;
+}
+
+META_COLLECTOR(int_sk_shutdown)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_shutdown;
+}
+
+META_COLLECTOR(int_sk_proto)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_protocol;
+}
+
+META_COLLECTOR(int_sk_type)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_type;
+}
+
+META_COLLECTOR(int_sk_rmem_alloc)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk_rmem_alloc_get(skb->sk);
+}
+
+META_COLLECTOR(int_sk_wmem_alloc)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = sk_wmem_alloc_get(skb->sk);
+}
+
+META_COLLECTOR(int_sk_omem_alloc)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = atomic_read(&skb->sk->sk_omem_alloc);
+}
+
+META_COLLECTOR(int_sk_rcv_qlen)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_receive_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_snd_qlen)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_write_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_wmem_queued)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_wmem_queued;
+}
+
+META_COLLECTOR(int_sk_fwd_alloc)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_forward_alloc;
+}
+
+META_COLLECTOR(int_sk_sndbuf)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_sndbuf;
+}
+
+META_COLLECTOR(int_sk_alloc)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = (__force int) skb->sk->sk_allocation;
+}
+
+META_COLLECTOR(int_sk_hash)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_hash;
+}
+
+META_COLLECTOR(int_sk_lingertime)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_lingertime / HZ;
+}
+
+META_COLLECTOR(int_sk_err_qlen)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_error_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_ack_bl)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_max_ack_bl)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_max_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_prio)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_priority;
+}
+
+META_COLLECTOR(int_sk_rcvlowat)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_rcvlowat;
+}
+
+META_COLLECTOR(int_sk_rcvtimeo)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_rcvtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sndtimeo)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_sndtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sendmsg_off)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_frag.offset;
+}
+
+META_COLLECTOR(int_sk_write_pend)
+{
+ if (skip_nonlocal(skb)) {
+ *err = -1;
+ return;
+ }
+ dst->value = skb->sk->sk_write_pending;
+}
+
+/**************************************************************************
+ * Meta value collectors assignment table
+ **************************************************************************/
+
+struct meta_ops {
+ void (*get)(struct sk_buff *, struct tcf_pkt_info *,
+ struct meta_value *, struct meta_obj *, int *);
+};
+
+#define META_ID(name) TCF_META_ID_##name
+#define META_FUNC(name) { .get = meta_##name }
+
+/* Meta value operations table listing all meta value collectors and
+ * assigns them to a type and meta id. */
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
+ [TCF_META_TYPE_VAR] = {
+ [META_ID(DEV)] = META_FUNC(var_dev),
+ [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
+ },
+ [TCF_META_TYPE_INT] = {
+ [META_ID(RANDOM)] = META_FUNC(int_random),
+ [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
+ [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
+ [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
+ [META_ID(DEV)] = META_FUNC(int_dev),
+ [META_ID(PRIORITY)] = META_FUNC(int_priority),
+ [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
+ [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
+ [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
+ [META_ID(DATALEN)] = META_FUNC(int_datalen),
+ [META_ID(MACLEN)] = META_FUNC(int_maclen),
+ [META_ID(NFMARK)] = META_FUNC(int_mark),
+ [META_ID(TCINDEX)] = META_FUNC(int_tcindex),
+ [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
+ [META_ID(RTIIF)] = META_FUNC(int_rtiif),
+ [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
+ [META_ID(SK_STATE)] = META_FUNC(int_sk_state),
+ [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
+ [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
+ [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
+ [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
+ [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
+ [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
+ [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
+ [META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
+ [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
+ [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
+ [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
+ [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
+ [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
+ [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
+ [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
+ [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
+ [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
+ [META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
+ [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
+ [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
+ [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
+ [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
+ [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
+ [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
+ [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
+ [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
+ [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
+ [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
+ [META_ID(RXHASH)] = META_FUNC(int_rxhash),
+ }
+};
+
+static inline struct meta_ops *meta_ops(struct meta_value *val)
+{
+ return &__meta_ops[meta_type(val)][meta_id(val)];
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_VAR
+ **************************************************************************/
+
+static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ int r = a->len - b->len;
+
+ if (r == 0)
+ r = memcmp((void *) a->value, (void *) b->value, a->len);
+
+ return r;
+}
+
+static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
+{
+ int len = nla_len(nla);
+
+ dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL);
+ if (dst->val == 0UL)
+ return -ENOMEM;
+ dst->len = len;
+ return 0;
+}
+
+static void meta_var_destroy(struct meta_value *v)
+{
+ kfree((void *) v->val);
+}
+
+static void meta_var_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ int shift = v->hdr.shift;
+
+ if (shift && shift < dst->len)
+ dst->len -= shift;
+}
+
+static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+ if (v->val && v->len &&
+ nla_put(skb, tlv, v->len, (void *) v->val))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_INT
+ **************************************************************************/
+
+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ /* Let gcc optimize it, the unlikely is not really based on
+ * some numbers but jump free code for mismatches seems
+ * more logical. */
+ if (unlikely(a->value == b->value))
+ return 0;
+ else if (a->value < b->value)
+ return -1;
+ else
+ return 1;
+}
+
+static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
+{
+ if (nla_len(nla) >= sizeof(unsigned long)) {
+ dst->val = *(unsigned long *) nla_data(nla);
+ dst->len = sizeof(unsigned long);
+ } else if (nla_len(nla) == sizeof(u32)) {
+ dst->val = nla_get_u32(nla);
+ dst->len = sizeof(u32);
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static void meta_int_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ if (v->hdr.shift)
+ dst->value >>= v->hdr.shift;
+
+ if (v->val)
+ dst->value &= v->val;
+}
+
+static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+ if (v->len == sizeof(unsigned long)) {
+ if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
+ goto nla_put_failure;
+ } else if (v->len == sizeof(u32)) {
+ if (nla_put_u32(skb, tlv, v->val))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+/**************************************************************************
+ * Type specific operations table
+ **************************************************************************/
+
+struct meta_type_ops {
+ void (*destroy)(struct meta_value *);
+ int (*compare)(struct meta_obj *, struct meta_obj *);
+ int (*change)(struct meta_value *, struct nlattr *);
+ void (*apply_extras)(struct meta_value *, struct meta_obj *);
+ int (*dump)(struct sk_buff *, struct meta_value *, int);
+};
+
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
+ [TCF_META_TYPE_VAR] = {
+ .destroy = meta_var_destroy,
+ .compare = meta_var_compare,
+ .change = meta_var_change,
+ .apply_extras = meta_var_apply_extras,
+ .dump = meta_var_dump
+ },
+ [TCF_META_TYPE_INT] = {
+ .compare = meta_int_compare,
+ .change = meta_int_change,
+ .apply_extras = meta_int_apply_extras,
+ .dump = meta_int_dump
+ }
+};
+
+static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
+{
+ return &__meta_type_ops[meta_type(v)];
+}
+
+/**************************************************************************
+ * Core
+ **************************************************************************/
+
+static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ int err = 0;
+
+ if (meta_id(v) == TCF_META_ID_VALUE) {
+ dst->value = v->val;
+ dst->len = v->len;
+ return 0;
+ }
+
+ meta_ops(v)->get(skb, info, v, dst, &err);
+ if (err < 0)
+ return err;
+
+ if (meta_type_ops(v)->apply_extras)
+ meta_type_ops(v)->apply_extras(v, dst);
+
+ return 0;
+}
+
+static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ int r;
+ struct meta_match *meta = (struct meta_match *) m->data;
+ struct meta_obj l_value, r_value;
+
+ if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
+ meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+ return 0;
+
+ r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+
+ switch (meta->lvalue.hdr.op) {
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
+ }
+
+ return 0;
+}
+
+static void meta_delete(struct meta_match *meta)
+{
+ if (meta) {
+ struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+
+ if (ops && ops->destroy) {
+ ops->destroy(&meta->lvalue);
+ ops->destroy(&meta->rvalue);
+ }
+ }
+
+ kfree(meta);
+}
+
+static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
+{
+ if (nla) {
+ if (nla_len(nla) == 0)
+ return -EINVAL;
+
+ return meta_type_ops(dst)->change(dst, nla);
+ }
+
+ return 0;
+}
+
+static inline int meta_is_supported(struct meta_value *val)
+{
+ return !meta_id(val) || meta_ops(val)->get;
+}
+
+static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
+ [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) },
+};
+
+static int em_meta_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ int err;
+ struct nlattr *tb[TCA_EM_META_MAX + 1];
+ struct tcf_meta_hdr *hdr;
+ struct meta_match *meta = NULL;
+
+ err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ if (tb[TCA_EM_META_HDR] == NULL)
+ goto errout;
+ hdr = nla_data(tb[TCA_EM_META_HDR]);
+
+ if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
+ TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
+ TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
+ TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
+ goto errout;
+
+ meta = kzalloc(sizeof(*meta), GFP_KERNEL);
+ if (meta == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
+ memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
+
+ if (!meta_is_supported(&meta->lvalue) ||
+ !meta_is_supported(&meta->rvalue)) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 ||
+ meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0)
+ goto errout;
+
+ m->datalen = sizeof(*meta);
+ m->data = (unsigned long) meta;
+
+ err = 0;
+errout:
+ if (err && meta)
+ meta_delete(meta);
+ return err;
+}
+
+static void em_meta_destroy(struct tcf_ematch *m)
+{
+ if (m)
+ meta_delete((struct meta_match *) m->data);
+}
+
+static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
+{
+ struct meta_match *meta = (struct meta_match *) em->data;
+ struct tcf_meta_hdr hdr;
+ struct meta_type_ops *ops;
+
+ memset(&hdr, 0, sizeof(hdr));
+ memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
+ memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
+
+ if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
+ goto nla_put_failure;
+
+ ops = meta_type_ops(&meta->lvalue);
+ if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
+ ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_meta_ops = {
+ .kind = TCF_EM_META,
+ .change = em_meta_change,
+ .match = em_meta_match,
+ .destroy = em_meta_destroy,
+ .dump = em_meta_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_meta_ops.link)
+};
+
+static int __init init_em_meta(void)
+{
+ return tcf_em_register(&em_meta_ops);
+}
+
+static void __exit exit_em_meta(void)
+{
+ tcf_em_unregister(&em_meta_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_meta);
+module_exit(exit_em_meta);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
new file mode 100644
index 000000000..df3110d69
--- /dev/null
+++ b/net/sched/em_nbyte.c
@@ -0,0 +1,80 @@
+/*
+ * net/sched/em_nbyte.c N-Byte ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_nbyte.h>
+#include <net/pkt_cls.h>
+
+struct nbyte_data {
+ struct tcf_em_nbyte hdr;
+ char pattern[0];
+};
+
+static int em_nbyte_change(struct net *net, void *data, int data_len,
+ struct tcf_ematch *em)
+{
+ struct tcf_em_nbyte *nbyte = data;
+
+ if (data_len < sizeof(*nbyte) ||
+ data_len < (sizeof(*nbyte) + nbyte->len))
+ return -EINVAL;
+
+ em->datalen = sizeof(*nbyte) + nbyte->len;
+ em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
+ if (em->data == 0UL)
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
+ unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
+
+ ptr += nbyte->hdr.off;
+
+ if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
+ return 0;
+
+ return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
+}
+
+static struct tcf_ematch_ops em_nbyte_ops = {
+ .kind = TCF_EM_NBYTE,
+ .change = em_nbyte_change,
+ .match = em_nbyte_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_nbyte_ops.link)
+};
+
+static int __init init_em_nbyte(void)
+{
+ return tcf_em_register(&em_nbyte_ops);
+}
+
+static void __exit exit_em_nbyte(void)
+{
+ tcf_em_unregister(&em_nbyte_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_nbyte);
+module_exit(exit_em_nbyte);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000..73e2ed576
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
+/*
+ * net/sched/em_text.c Textsearch ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+struct text_match {
+ u16 from_offset;
+ u16 to_offset;
+ u8 from_layer;
+ u8 to_layer;
+ struct ts_config *config;
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ int from, to;
+
+ from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+ from += tm->from_offset;
+
+ to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+ to += tm->to_offset;
+
+ return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
+}
+
+static int em_text_change(struct net *net, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct text_match *tm;
+ struct tcf_em_text *conf = data;
+ struct ts_config *ts_conf;
+ int flags = 0;
+
+ if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+ return -EINVAL;
+
+ if (conf->from_layer > conf->to_layer)
+ return -EINVAL;
+
+ if (conf->from_layer == conf->to_layer &&
+ conf->from_offset > conf->to_offset)
+ return -EINVAL;
+
+retry:
+ ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+ conf->pattern_len, GFP_KERNEL, flags);
+
+ if (flags & TS_AUTOLOAD)
+ rtnl_lock();
+
+ if (IS_ERR(ts_conf)) {
+ if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+ rtnl_unlock();
+ flags |= TS_AUTOLOAD;
+ goto retry;
+ } else
+ return PTR_ERR(ts_conf);
+ } else if (flags & TS_AUTOLOAD) {
+ textsearch_destroy(ts_conf);
+ return -EAGAIN;
+ }
+
+ tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+ if (tm == NULL) {
+ textsearch_destroy(ts_conf);
+ return -ENOBUFS;
+ }
+
+ tm->from_offset = conf->from_offset;
+ tm->to_offset = conf->to_offset;
+ tm->from_layer = conf->from_layer;
+ tm->to_layer = conf->to_layer;
+ tm->config = ts_conf;
+
+ m->datalen = sizeof(*tm);
+ m->data = (unsigned long) tm;
+
+ return 0;
+}
+
+static void em_text_destroy(struct tcf_ematch *m)
+{
+ if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
+ textsearch_destroy(EM_TEXT_PRIV(m)->config);
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ struct tcf_em_text conf;
+
+ strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+ conf.from_offset = tm->from_offset;
+ conf.to_offset = tm->to_offset;
+ conf.from_layer = tm->from_layer;
+ conf.to_layer = tm->to_layer;
+ conf.pattern_len = textsearch_get_pattern_len(tm->config);
+ conf.pad = 0;
+
+ if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
+ goto nla_put_failure;
+ if (nla_append(skb, conf.pattern_len,
+ textsearch_get_pattern(tm->config)) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_text_ops = {
+ .kind = TCF_EM_TEXT,
+ .change = em_text_change,
+ .match = em_text_match,
+ .destroy = em_text_destroy,
+ .dump = em_text_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+static int __init init_em_text(void)
+{
+ return tcf_em_register(&em_text_ops);
+}
+
+static void __exit exit_em_text(void)
+{
+ tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
new file mode 100644
index 000000000..797bdb88c
--- /dev/null
+++ b/net/sched/em_u32.c
@@ -0,0 +1,64 @@
+/*
+ * net/sched/em_u32.c U32 Ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Based on net/sched/cls_u32.c
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ struct tc_u32_key *key = (struct tc_u32_key *) em->data;
+ const unsigned char *ptr = skb_network_header(skb);
+
+ if (info) {
+ if (info->ptr)
+ ptr = info->ptr;
+ ptr += (info->nexthdr & key->offmask);
+ }
+
+ ptr += key->off;
+
+ if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
+ return 0;
+
+ return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
+}
+
+static struct tcf_ematch_ops em_u32_ops = {
+ .kind = TCF_EM_U32,
+ .datalen = sizeof(struct tc_u32_key),
+ .match = em_u32_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_u32_ops.link)
+};
+
+static int __init init_em_u32(void)
+{
+ return tcf_em_register(&em_u32_ops);
+}
+
+static void __exit exit_em_u32(void)
+{
+ tcf_em_unregister(&em_u32_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_u32);
+module_exit(exit_em_u32);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
new file mode 100644
index 000000000..fbb7ebfc5
--- /dev/null
+++ b/net/sched/ematch.c
@@ -0,0 +1,549 @@
+/*
+ * net/sched/ematch.c Extended Match API
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * An extended match (ematch) is a small classification tool not worth
+ * writing a full classifier for. Ematches can be interconnected to form
+ * a logic expression and get attached to classifiers to extend their
+ * functionatlity.
+ *
+ * The userspace part transforms the logic expressions into an array
+ * consisting of multiple sequences of interconnected ematches separated
+ * by markers. Precedence is implemented by a special ematch kind
+ * referencing a sequence beyond the marker of the current sequence
+ * causing the current position in the sequence to be pushed onto a stack
+ * to allow the current position to be overwritten by the position referenced
+ * in the special ematch. Matching continues in the new sequence until a
+ * marker is reached causing the position to be restored from the stack.
+ *
+ * Example:
+ * A AND (B1 OR B2) AND C AND D
+ *
+ * ------->-PUSH-------
+ * -->-- / -->-- \ -->--
+ * / \ / / \ \ / \
+ * +-------+-------+-------+-------+-------+--------+
+ * | A AND | B AND | C AND | D END | B1 OR | B2 END |
+ * +-------+-------+-------+-------+-------+--------+
+ * \ /
+ * --------<-POP---------
+ *
+ * where B is a virtual ematch referencing to sequence starting with B1.
+ *
+ * ==========================================================================
+ *
+ * How to write an ematch in 60 seconds
+ * ------------------------------------
+ *
+ * 1) Provide a matcher function:
+ * static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
+ * struct tcf_pkt_info *info)
+ * {
+ * struct mydata *d = (struct mydata *) m->data;
+ *
+ * if (...matching goes here...)
+ * return 1;
+ * else
+ * return 0;
+ * }
+ *
+ * 2) Fill out a struct tcf_ematch_ops:
+ * static struct tcf_ematch_ops my_ops = {
+ * .kind = unique id,
+ * .datalen = sizeof(struct mydata),
+ * .match = my_match,
+ * .owner = THIS_MODULE,
+ * };
+ *
+ * 3) Register/Unregister your ematch:
+ * static int __init init_my_ematch(void)
+ * {
+ * return tcf_em_register(&my_ops);
+ * }
+ *
+ * static void __exit exit_my_ematch(void)
+ * {
+ * tcf_em_unregister(&my_ops);
+ * }
+ *
+ * module_init(init_my_ematch);
+ * module_exit(exit_my_ematch);
+ *
+ * 4) By now you should have two more seconds left, barely enough to
+ * open up a beer to watch the compilation going.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static LIST_HEAD(ematch_ops);
+static DEFINE_RWLOCK(ematch_mod_lock);
+
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
+{
+ struct tcf_ematch_ops *e = NULL;
+
+ read_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (kind == e->kind) {
+ if (!try_module_get(e->owner))
+ e = NULL;
+ read_unlock(&ematch_mod_lock);
+ return e;
+ }
+ }
+ read_unlock(&ematch_mod_lock);
+
+ return NULL;
+}
+
+/**
+ * tcf_em_register - register an extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their presence.
+ * The given @ops must have kind set to a unique identifier and the
+ * callback match() must be implemented. All other callbacks are optional
+ * and a fallback implementation is used instead.
+ *
+ * Returns -EEXISTS if an ematch of the same kind has already registered.
+ */
+int tcf_em_register(struct tcf_ematch_ops *ops)
+{
+ int err = -EEXIST;
+ struct tcf_ematch_ops *e;
+
+ if (ops->match == NULL)
+ return -EINVAL;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link)
+ if (ops->kind == e->kind)
+ goto errout;
+
+ list_add_tail(&ops->link, &ematch_ops);
+ err = 0;
+errout:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+EXPORT_SYMBOL(tcf_em_register);
+
+/**
+ * tcf_em_unregister - unregster and extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their disappearance
+ * for examples when the module gets unloaded. The @ops parameter must be
+ * the same as the one used for registration.
+ *
+ * Returns -ENOENT if no matching ematch was found.
+ */
+void tcf_em_unregister(struct tcf_ematch_ops *ops)
+{
+ write_lock(&ematch_mod_lock);
+ list_del(&ops->link);
+ write_unlock(&ematch_mod_lock);
+}
+EXPORT_SYMBOL(tcf_em_unregister);
+
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+ int index)
+{
+ return &tree->matches[index];
+}
+
+
+static int tcf_em_validate(struct tcf_proto *tp,
+ struct tcf_ematch_tree_hdr *tree_hdr,
+ struct tcf_ematch *em, struct nlattr *nla, int idx)
+{
+ int err = -EINVAL;
+ struct tcf_ematch_hdr *em_hdr = nla_data(nla);
+ int data_len = nla_len(nla) - sizeof(*em_hdr);
+ void *data = (void *) em_hdr + sizeof(*em_hdr);
+ struct net *net = dev_net(qdisc_dev(tp->q));
+
+ if (!TCF_EM_REL_VALID(em_hdr->flags))
+ goto errout;
+
+ if (em_hdr->kind == TCF_EM_CONTAINER) {
+ /* Special ematch called "container", carries an index
+ * referencing an external ematch sequence.
+ */
+ u32 ref;
+
+ if (data_len < sizeof(ref))
+ goto errout;
+ ref = *(u32 *) data;
+
+ if (ref >= tree_hdr->nmatches)
+ goto errout;
+
+ /* We do not allow backward jumps to avoid loops and jumps
+ * to our own position are of course illegal.
+ */
+ if (ref <= idx)
+ goto errout;
+
+
+ em->data = ref;
+ } else {
+ /* Note: This lookup will increase the module refcnt
+ * of the ematch module referenced. In case of a failure,
+ * a destroy function is called by the underlying layer
+ * which automatically releases the reference again, therefore
+ * the module MUST not be given back under any circumstances
+ * here. Be aware, the destroy function assumes that the
+ * module is held if the ops field is non zero.
+ */
+ em->ops = tcf_em_lookup(em_hdr->kind);
+
+ if (em->ops == NULL) {
+ err = -ENOENT;
+#ifdef CONFIG_MODULES
+ __rtnl_unlock();
+ request_module("ematch-kind-%u", em_hdr->kind);
+ rtnl_lock();
+ em->ops = tcf_em_lookup(em_hdr->kind);
+ if (em->ops) {
+ /* We dropped the RTNL mutex in order to
+ * perform the module load. Tell the caller
+ * to replay the request.
+ */
+ module_put(em->ops->owner);
+ em->ops = NULL;
+ err = -EAGAIN;
+ }
+#endif
+ goto errout;
+ }
+
+ /* ematch module provides expected length of data, so we
+ * can do a basic sanity check.
+ */
+ if (em->ops->datalen && data_len < em->ops->datalen)
+ goto errout;
+
+ if (em->ops->change) {
+ err = em->ops->change(net, data, data_len, em);
+ if (err < 0)
+ goto errout;
+ } else if (data_len > 0) {
+ /* ematch module doesn't provide an own change
+ * procedure and expects us to allocate and copy
+ * the ematch data.
+ *
+ * TCF_EM_SIMPLE may be specified stating that the
+ * data only consists of a u32 integer and the module
+ * does not expected a memory reference but rather
+ * the value carried.
+ */
+ if (em_hdr->flags & TCF_EM_SIMPLE) {
+ if (data_len < sizeof(u32))
+ goto errout;
+ em->data = *(u32 *) data;
+ } else {
+ void *v = kmemdup(data, data_len, GFP_KERNEL);
+ if (v == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+ em->data = (unsigned long) v;
+ }
+ }
+ }
+
+ em->matchid = em_hdr->matchid;
+ em->flags = em_hdr->flags;
+ em->datalen = data_len;
+ em->net = net;
+
+ err = 0;
+errout:
+ return err;
+}
+
+static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
+ [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) },
+ [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED },
+};
+
+/**
+ * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
+ *
+ * @tp: classifier kind handle
+ * @nla: ematch tree configuration TLV
+ * @tree: destination ematch tree variable to store the resulting
+ * ematch tree.
+ *
+ * This function validates the given configuration TLV @nla and builds an
+ * ematch tree in @tree. The resulting tree must later be copied into
+ * the private classifier data using tcf_em_tree_change(). You MUST NOT
+ * provide the ematch tree variable of the private classifier data directly,
+ * the changes would not be locked properly.
+ *
+ * Returns a negative error code if the configuration TLV contains errors.
+ */
+int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
+ struct tcf_ematch_tree *tree)
+{
+ int idx, list_len, matches_len, err;
+ struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
+ struct nlattr *rt_match, *rt_hdr, *rt_list;
+ struct tcf_ematch_tree_hdr *tree_hdr;
+ struct tcf_ematch *em;
+
+ memset(tree, 0, sizeof(*tree));
+ if (!nla)
+ return 0;
+
+ err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ rt_hdr = tb[TCA_EMATCH_TREE_HDR];
+ rt_list = tb[TCA_EMATCH_TREE_LIST];
+
+ if (rt_hdr == NULL || rt_list == NULL)
+ goto errout;
+
+ tree_hdr = nla_data(rt_hdr);
+ memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
+
+ rt_match = nla_data(rt_list);
+ list_len = nla_len(rt_list);
+ matches_len = tree_hdr->nmatches * sizeof(*em);
+
+ tree->matches = kzalloc(matches_len, GFP_KERNEL);
+ if (tree->matches == NULL)
+ goto errout;
+
+ /* We do not use nla_parse_nested here because the maximum
+ * number of attributes is unknown. This saves us the allocation
+ * for a tb buffer which would serve no purpose at all.
+ *
+ * The array of rt attributes is parsed in the order as they are
+ * provided, their type must be incremental from 1 to n. Even
+ * if it does not serve any real purpose, a failure of sticking
+ * to this policy will result in parsing failure.
+ */
+ for (idx = 0; nla_ok(rt_match, list_len); idx++) {
+ err = -EINVAL;
+
+ if (rt_match->nla_type != (idx + 1))
+ goto errout_abort;
+
+ if (idx >= tree_hdr->nmatches)
+ goto errout_abort;
+
+ if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr))
+ goto errout_abort;
+
+ em = tcf_em_get_match(tree, idx);
+
+ err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
+ if (err < 0)
+ goto errout_abort;
+
+ rt_match = nla_next(rt_match, &list_len);
+ }
+
+ /* Check if the number of matches provided by userspace actually
+ * complies with the array of matches. The number was used for
+ * the validation of references and a mismatch could lead to
+ * undefined references during the matching process.
+ */
+ if (idx != tree_hdr->nmatches) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+ err = 0;
+errout:
+ return err;
+
+errout_abort:
+ tcf_em_tree_destroy(tree);
+ return err;
+}
+EXPORT_SYMBOL(tcf_em_tree_validate);
+
+/**
+ * tcf_em_tree_destroy - destroy an ematch tree
+ *
+ * @tp: classifier kind handle
+ * @tree: ematch tree to be deleted
+ *
+ * This functions destroys an ematch tree previously created by
+ * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
+ * the ematch tree is not in use before calling this function.
+ */
+void tcf_em_tree_destroy(struct tcf_ematch_tree *tree)
+{
+ int i;
+
+ if (tree->matches == NULL)
+ return;
+
+ for (i = 0; i < tree->hdr.nmatches; i++) {
+ struct tcf_ematch *em = tcf_em_get_match(tree, i);
+
+ if (em->ops) {
+ if (em->ops->destroy)
+ em->ops->destroy(em);
+ else if (!tcf_em_is_simple(em))
+ kfree((void *) em->data);
+ module_put(em->ops->owner);
+ }
+ }
+
+ tree->hdr.nmatches = 0;
+ kfree(tree->matches);
+ tree->matches = NULL;
+}
+EXPORT_SYMBOL(tcf_em_tree_destroy);
+
+/**
+ * tcf_em_tree_dump - dump ematch tree into a rtnl message
+ *
+ * @skb: skb holding the rtnl message
+ * @t: ematch tree to be dumped
+ * @tlv: TLV type to be used to encapsulate the tree
+ *
+ * This function dumps a ematch tree into a rtnl message. It is valid to
+ * call this function while the ematch tree is in use.
+ *
+ * Returns -1 if the skb tailroom is insufficient.
+ */
+int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
+{
+ int i;
+ u8 *tail;
+ struct nlattr *top_start;
+ struct nlattr *list_start;
+
+ top_start = nla_nest_start(skb, tlv);
+ if (top_start == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
+ goto nla_put_failure;
+
+ list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
+ if (list_start == NULL)
+ goto nla_put_failure;
+
+ tail = skb_tail_pointer(skb);
+ for (i = 0; i < tree->hdr.nmatches; i++) {
+ struct nlattr *match_start = (struct nlattr *)tail;
+ struct tcf_ematch *em = tcf_em_get_match(tree, i);
+ struct tcf_ematch_hdr em_hdr = {
+ .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
+ .matchid = em->matchid,
+ .flags = em->flags
+ };
+
+ if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
+ goto nla_put_failure;
+
+ if (em->ops && em->ops->dump) {
+ if (em->ops->dump(skb, em) < 0)
+ goto nla_put_failure;
+ } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
+ u32 u = em->data;
+ nla_put_nohdr(skb, sizeof(u), &u);
+ } else if (em->datalen > 0)
+ nla_put_nohdr(skb, em->datalen, (void *) em->data);
+
+ tail = skb_tail_pointer(skb);
+ match_start->nla_len = tail - (u8 *)match_start;
+ }
+
+ nla_nest_end(skb, list_start);
+ nla_nest_end(skb, top_start);
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL(tcf_em_tree_dump);
+
+static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
+ struct tcf_pkt_info *info)
+{
+ int r = em->ops->match(skb, em, info);
+
+ return tcf_em_is_inverted(em) ? !r : r;
+}
+
+/* Do not use this function directly, use tcf_em_tree_match instead */
+int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
+ struct tcf_pkt_info *info)
+{
+ int stackp = 0, match_idx = 0, res = 0;
+ struct tcf_ematch *cur_match;
+ int stack[CONFIG_NET_EMATCH_STACK];
+
+proceed:
+ while (match_idx < tree->hdr.nmatches) {
+ cur_match = tcf_em_get_match(tree, match_idx);
+
+ if (tcf_em_is_container(cur_match)) {
+ if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
+ goto stack_overflow;
+
+ stack[stackp++] = match_idx;
+ match_idx = cur_match->data;
+ goto proceed;
+ }
+
+ res = tcf_em_match(skb, cur_match, info);
+
+ if (tcf_em_early_end(cur_match, res))
+ break;
+
+ match_idx++;
+ }
+
+pop_stack:
+ if (stackp > 0) {
+ match_idx = stack[--stackp];
+ cur_match = tcf_em_get_match(tree, match_idx);
+
+ if (tcf_em_is_inverted(cur_match))
+ res = !res;
+
+ if (tcf_em_early_end(cur_match, res)) {
+ goto pop_stack;
+ } else {
+ match_idx++;
+ goto proceed;
+ }
+ }
+
+ return res;
+
+stack_overflow:
+ net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
+ return -1;
+}
+EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 000000000..1e1c89e51
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,1970 @@
+/*
+ * net/sched/sch_api.c Packet scheduler API.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/hrtimer.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new);
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, int event);
+
+/*
+
+ Short review.
+ -------------
+
+ This file consists of two interrelated parts:
+
+ 1. queueing disciplines manager frontend.
+ 2. traffic classes manager frontend.
+
+ Generally, queueing discipline ("qdisc") is a black box,
+ which is able to enqueue packets and to dequeue them (when
+ device is ready to send something) in order and at times
+ determined by algorithm hidden in it.
+
+ qdisc's are divided to two categories:
+ - "queues", which have no internal structure visible from outside.
+ - "schedulers", which split all the packets to "traffic classes",
+ using "packet classifiers" (look at cls_api.c)
+
+ In turn, classes may have child qdiscs (as rule, queues)
+ attached to them etc. etc. etc.
+
+ The goal of the routines in this file is to translate
+ information supplied by user in the form of handles
+ to more intelligible for kernel form, to make some sanity
+ checks and part of work, which is common to all qdiscs
+ and to provide rtnetlink notifications.
+
+ All real intelligent work is done inside qdisc modules.
+
+
+
+ Every discipline has two major routines: enqueue and dequeue.
+
+ ---dequeue
+
+ dequeue usually returns a skb to send. It is allowed to return NULL,
+ but it does not mean that queue is empty, it just means that
+ discipline does not want to send anything this time.
+ Queue is really empty if q->q.qlen == 0.
+ For complicated disciplines with multiple queues q->q is not
+ real packet queue, but however q->q.qlen must be valid.
+
+ ---enqueue
+
+ enqueue returns 0, if packet was enqueued successfully.
+ If packet (this one or another one) was dropped, it returns
+ not zero error code.
+ NET_XMIT_DROP - this packet dropped
+ Expected action: do not backoff, but wait until queue will clear.
+ NET_XMIT_CN - probably this packet enqueued, but another one dropped.
+ Expected action: backoff or ignore
+ NET_XMIT_POLICED - dropped by police.
+ Expected action: backoff or error to real-time apps.
+
+ Auxiliary routines:
+
+ ---peek
+
+ like dequeue but without removing a packet from the queue
+
+ ---reset
+
+ returns qdisc to initial state: purge all buffers, clear all
+ timers, counters (except for statistics) etc.
+
+ ---init
+
+ initializes newly created qdisc.
+
+ ---destroy
+
+ destroys resources allocated by init and during lifetime of qdisc.
+
+ ---change
+
+ changes qdisc parameters.
+ */
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(qdisc_mod_lock);
+
+
+/************************************************
+ * Queueing disciplines manipulation. *
+ ************************************************/
+
+
+/* The list of all installed queueing disciplines. */
+
+static struct Qdisc_ops *qdisc_base;
+
+/* Register/unregister queueing discipline */
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ int rc = -EEXIST;
+
+ write_lock(&qdisc_mod_lock);
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+ if (!strcmp(qops->id, q->id))
+ goto out;
+
+ if (qops->enqueue == NULL)
+ qops->enqueue = noop_qdisc_ops.enqueue;
+ if (qops->peek == NULL) {
+ if (qops->dequeue == NULL)
+ qops->peek = noop_qdisc_ops.peek;
+ else
+ goto out_einval;
+ }
+ if (qops->dequeue == NULL)
+ qops->dequeue = noop_qdisc_ops.dequeue;
+
+ if (qops->cl_ops) {
+ const struct Qdisc_class_ops *cops = qops->cl_ops;
+
+ if (!(cops->get && cops->put && cops->walk && cops->leaf))
+ goto out_einval;
+
+ if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
+ goto out_einval;
+ }
+
+ qops->next = NULL;
+ *qp = qops;
+ rc = 0;
+out:
+ write_unlock(&qdisc_mod_lock);
+ return rc;
+
+out_einval:
+ rc = -EINVAL;
+ goto out;
+}
+EXPORT_SYMBOL(register_qdisc);
+
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+ struct Qdisc_ops *q, **qp;
+ int err = -ENOENT;
+
+ write_lock(&qdisc_mod_lock);
+ for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+ if (q == qops)
+ break;
+ if (q) {
+ *qp = q->next;
+ q->next = NULL;
+ err = 0;
+ }
+ write_unlock(&qdisc_mod_lock);
+ return err;
+}
+EXPORT_SYMBOL(unregister_qdisc);
+
+/* Get default qdisc if not otherwise specified */
+void qdisc_get_default(char *name, size_t len)
+{
+ read_lock(&qdisc_mod_lock);
+ strlcpy(name, default_qdisc_ops->id, len);
+ read_unlock(&qdisc_mod_lock);
+}
+
+static struct Qdisc_ops *qdisc_lookup_default(const char *name)
+{
+ struct Qdisc_ops *q = NULL;
+
+ for (q = qdisc_base; q; q = q->next) {
+ if (!strcmp(name, q->id)) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+
+ return q;
+}
+
+/* Set new default qdisc to use */
+int qdisc_set_default(const char *name)
+{
+ const struct Qdisc_ops *ops;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ write_lock(&qdisc_mod_lock);
+ ops = qdisc_lookup_default(name);
+ if (!ops) {
+ /* Not found, drop lock and try to load module */
+ write_unlock(&qdisc_mod_lock);
+ request_module("sch_%s", name);
+ write_lock(&qdisc_mod_lock);
+
+ ops = qdisc_lookup_default(name);
+ }
+
+ if (ops) {
+ /* Set new default */
+ module_put(default_qdisc_ops->owner);
+ default_qdisc_ops = ops;
+ }
+ write_unlock(&qdisc_mod_lock);
+
+ return ops ? 0 : -ENOENT;
+}
+
+/* We know handle. Find qdisc among all qdisc's attached to device
+ (root qdisc, all its children, children of children etc.)
+ */
+
+static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
+{
+ struct Qdisc *q;
+
+ if (!(root->flags & TCQ_F_BUILTIN) &&
+ root->handle == handle)
+ return root;
+
+ list_for_each_entry(q, &root->list, list) {
+ if (q->handle == handle)
+ return q;
+ }
+ return NULL;
+}
+
+void qdisc_list_add(struct Qdisc *q)
+{
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ struct Qdisc *root = qdisc_dev(q)->qdisc;
+
+ WARN_ON_ONCE(root == &noop_qdisc);
+ list_add_tail(&q->list, &root->list);
+ }
+}
+EXPORT_SYMBOL(qdisc_list_add);
+
+void qdisc_list_del(struct Qdisc *q)
+{
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
+ list_del(&q->list);
+}
+EXPORT_SYMBOL(qdisc_list_del);
+
+struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
+{
+ struct Qdisc *q;
+
+ q = qdisc_match_from_root(dev->qdisc, handle);
+ if (q)
+ goto out;
+
+ if (dev_ingress_queue(dev))
+ q = qdisc_match_from_root(
+ dev_ingress_queue(dev)->qdisc_sleeping,
+ handle);
+out:
+ return q;
+}
+
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+{
+ unsigned long cl;
+ struct Qdisc *leaf;
+ const struct Qdisc_class_ops *cops = p->ops->cl_ops;
+
+ if (cops == NULL)
+ return NULL;
+ cl = cops->get(p, classid);
+
+ if (cl == 0)
+ return NULL;
+ leaf = cops->leaf(p, cl);
+ cops->put(p, cl);
+ return leaf;
+}
+
+/* Find queueing discipline by name */
+
+static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
+{
+ struct Qdisc_ops *q = NULL;
+
+ if (kind) {
+ read_lock(&qdisc_mod_lock);
+ for (q = qdisc_base; q; q = q->next) {
+ if (nla_strcmp(kind, q->id) == 0) {
+ if (!try_module_get(q->owner))
+ q = NULL;
+ break;
+ }
+ }
+ read_unlock(&qdisc_mod_lock);
+ }
+ return q;
+}
+
+/* The linklayer setting were not transferred from iproute2, in older
+ * versions, and the rate tables lookup systems have been dropped in
+ * the kernel. To keep backward compatible with older iproute2 tc
+ * utils, we detect the linklayer setting by detecting if the rate
+ * table were modified.
+ *
+ * For linklayer ATM table entries, the rate table will be aligned to
+ * 48 bytes, thus some table entries will contain the same value. The
+ * mpu (min packet unit) is also encoded into the old rate table, thus
+ * starting from the mpu, we find low and high table entries for
+ * mapping this cell. If these entries contain the same value, when
+ * the rate tables have been modified for linklayer ATM.
+ *
+ * This is done by rounding mpu to the nearest 48 bytes cell/entry,
+ * and then roundup to the next cell, calc the table entry one below,
+ * and compare.
+ */
+static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
+{
+ int low = roundup(r->mpu, 48);
+ int high = roundup(low+1, 48);
+ int cell_low = low >> r->cell_log;
+ int cell_high = (high >> r->cell_log) - 1;
+
+ /* rtab is too inaccurate at rates > 100Mbit/s */
+ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
+ pr_debug("TC linklayer: Giving up ATM detection\n");
+ return TC_LINKLAYER_ETHERNET;
+ }
+
+ if ((cell_high > cell_low) && (cell_high < 256)
+ && (rtab[cell_low] == rtab[cell_high])) {
+ pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
+ cell_low, cell_high, rtab[cell_high]);
+ return TC_LINKLAYER_ATM;
+ }
+ return TC_LINKLAYER_ETHERNET;
+}
+
+static struct qdisc_rate_table *qdisc_rtab_list;
+
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
+{
+ struct qdisc_rate_table *rtab;
+
+ if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+ nla_len(tab) != TC_RTAB_SIZE)
+ return NULL;
+
+ for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
+ if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
+ !memcmp(&rtab->data, nla_data(tab), 1024)) {
+ rtab->refcnt++;
+ return rtab;
+ }
+ }
+
+ rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
+ if (rtab) {
+ rtab->rate = *r;
+ rtab->refcnt = 1;
+ memcpy(rtab->data, nla_data(tab), 1024);
+ if (r->linklayer == TC_LINKLAYER_UNAWARE)
+ r->linklayer = __detect_linklayer(r, rtab->data);
+ rtab->next = qdisc_rtab_list;
+ qdisc_rtab_list = rtab;
+ }
+ return rtab;
+}
+EXPORT_SYMBOL(qdisc_get_rtab);
+
+void qdisc_put_rtab(struct qdisc_rate_table *tab)
+{
+ struct qdisc_rate_table *rtab, **rtabp;
+
+ if (!tab || --tab->refcnt)
+ return;
+
+ for (rtabp = &qdisc_rtab_list;
+ (rtab = *rtabp) != NULL;
+ rtabp = &rtab->next) {
+ if (rtab == tab) {
+ *rtabp = rtab->next;
+ kfree(rtab);
+ return;
+ }
+ }
+}
+EXPORT_SYMBOL(qdisc_put_rtab);
+
+static LIST_HEAD(qdisc_stab_list);
+static DEFINE_SPINLOCK(qdisc_stab_lock);
+
+static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
+ [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
+ [TCA_STAB_DATA] = { .type = NLA_BINARY },
+};
+
+static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
+{
+ struct nlattr *tb[TCA_STAB_MAX + 1];
+ struct qdisc_size_table *stab;
+ struct tc_sizespec *s;
+ unsigned int tsize = 0;
+ u16 *tab = NULL;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (!tb[TCA_STAB_BASE])
+ return ERR_PTR(-EINVAL);
+
+ s = nla_data(tb[TCA_STAB_BASE]);
+
+ if (s->tsize > 0) {
+ if (!tb[TCA_STAB_DATA])
+ return ERR_PTR(-EINVAL);
+ tab = nla_data(tb[TCA_STAB_DATA]);
+ tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
+ }
+
+ if (tsize != s->tsize || (!tab && tsize > 0))
+ return ERR_PTR(-EINVAL);
+
+ spin_lock(&qdisc_stab_lock);
+
+ list_for_each_entry(stab, &qdisc_stab_list, list) {
+ if (memcmp(&stab->szopts, s, sizeof(*s)))
+ continue;
+ if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
+ continue;
+ stab->refcnt++;
+ spin_unlock(&qdisc_stab_lock);
+ return stab;
+ }
+
+ spin_unlock(&qdisc_stab_lock);
+
+ stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ stab->refcnt = 1;
+ stab->szopts = *s;
+ if (tsize > 0)
+ memcpy(stab->data, tab, tsize * sizeof(u16));
+
+ spin_lock(&qdisc_stab_lock);
+ list_add_tail(&stab->list, &qdisc_stab_list);
+ spin_unlock(&qdisc_stab_lock);
+
+ return stab;
+}
+
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
+void qdisc_put_stab(struct qdisc_size_table *tab)
+{
+ if (!tab)
+ return;
+
+ spin_lock(&qdisc_stab_lock);
+
+ if (--tab->refcnt == 0) {
+ list_del(&tab->list);
+ call_rcu_bh(&tab->rcu, stab_kfree_rcu);
+ }
+
+ spin_unlock(&qdisc_stab_lock);
+}
+EXPORT_SYMBOL(qdisc_put_stab);
+
+static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_STAB);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
+ goto nla_put_failure;
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
+{
+ int pkt_len, slot;
+
+ pkt_len = skb->len + stab->szopts.overhead;
+ if (unlikely(!stab->szopts.tsize))
+ goto out;
+
+ slot = pkt_len + stab->szopts.cell_align;
+ if (unlikely(slot < 0))
+ slot = 0;
+
+ slot >>= stab->szopts.cell_log;
+ if (likely(slot < stab->szopts.tsize))
+ pkt_len = stab->data[slot];
+ else
+ pkt_len = stab->data[stab->szopts.tsize - 1] *
+ (slot / stab->szopts.tsize) +
+ stab->data[slot % stab->szopts.tsize];
+
+ pkt_len <<= stab->szopts.size_log;
+out:
+ if (unlikely(pkt_len < 1))
+ pkt_len = 1;
+ qdisc_skb_cb(skb)->pkt_len = pkt_len;
+}
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
+
+void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
+{
+ if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
+ qdisc->flags |= TCQ_F_WARN_NONWC;
+ }
+}
+EXPORT_SYMBOL(qdisc_warn_nonwc);
+
+static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
+{
+ struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
+ timer);
+
+ rcu_read_lock();
+ qdisc_unthrottled(wd->qdisc);
+ __netif_schedule(qdisc_root(wd->qdisc));
+ rcu_read_unlock();
+
+ return HRTIMER_NORESTART;
+}
+
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+ hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ wd->timer.function = qdisc_watchdog;
+ wd->qdisc = qdisc;
+}
+EXPORT_SYMBOL(qdisc_watchdog_init);
+
+void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
+{
+ if (test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(wd->qdisc)->state))
+ return;
+
+ if (throttle)
+ qdisc_throttled(wd->qdisc);
+
+ hrtimer_start(&wd->timer,
+ ns_to_ktime(expires),
+ HRTIMER_MODE_ABS_PINNED);
+}
+EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+
+void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
+{
+ hrtimer_cancel(&wd->timer);
+ qdisc_unthrottled(wd->qdisc);
+}
+EXPORT_SYMBOL(qdisc_watchdog_cancel);
+
+static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
+{
+ unsigned int size = n * sizeof(struct hlist_head), i;
+ struct hlist_head *h;
+
+ if (size <= PAGE_SIZE)
+ h = kmalloc(size, GFP_KERNEL);
+ else
+ h = (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL, get_order(size));
+
+ if (h != NULL) {
+ for (i = 0; i < n; i++)
+ INIT_HLIST_HEAD(&h[i]);
+ }
+ return h;
+}
+
+static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
+{
+ unsigned int size = n * sizeof(struct hlist_head);
+
+ if (size <= PAGE_SIZE)
+ kfree(h);
+ else
+ free_pages((unsigned long)h, get_order(size));
+}
+
+void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
+{
+ struct Qdisc_class_common *cl;
+ struct hlist_node *next;
+ struct hlist_head *nhash, *ohash;
+ unsigned int nsize, nmask, osize;
+ unsigned int i, h;
+
+ /* Rehash when load factor exceeds 0.75 */
+ if (clhash->hashelems * 4 <= clhash->hashsize * 3)
+ return;
+ nsize = clhash->hashsize * 2;
+ nmask = nsize - 1;
+ nhash = qdisc_class_hash_alloc(nsize);
+ if (nhash == NULL)
+ return;
+
+ ohash = clhash->hash;
+ osize = clhash->hashsize;
+
+ sch_tree_lock(sch);
+ for (i = 0; i < osize; i++) {
+ hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
+ h = qdisc_class_hash(cl->classid, nmask);
+ hlist_add_head(&cl->hnode, &nhash[h]);
+ }
+ }
+ clhash->hash = nhash;
+ clhash->hashsize = nsize;
+ clhash->hashmask = nmask;
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_free(ohash, osize);
+}
+EXPORT_SYMBOL(qdisc_class_hash_grow);
+
+int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
+{
+ unsigned int size = 4;
+
+ clhash->hash = qdisc_class_hash_alloc(size);
+ if (clhash->hash == NULL)
+ return -ENOMEM;
+ clhash->hashsize = size;
+ clhash->hashmask = size - 1;
+ clhash->hashelems = 0;
+ return 0;
+}
+EXPORT_SYMBOL(qdisc_class_hash_init);
+
+void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
+{
+ qdisc_class_hash_free(clhash->hash, clhash->hashsize);
+}
+EXPORT_SYMBOL(qdisc_class_hash_destroy);
+
+void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
+ struct Qdisc_class_common *cl)
+{
+ unsigned int h;
+
+ INIT_HLIST_NODE(&cl->hnode);
+ h = qdisc_class_hash(cl->classid, clhash->hashmask);
+ hlist_add_head(&cl->hnode, &clhash->hash[h]);
+ clhash->hashelems++;
+}
+EXPORT_SYMBOL(qdisc_class_hash_insert);
+
+void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
+ struct Qdisc_class_common *cl)
+{
+ hlist_del(&cl->hnode);
+ clhash->hashelems--;
+}
+EXPORT_SYMBOL(qdisc_class_hash_remove);
+
+/* Allocate an unique handle from space managed by kernel
+ * Possible range is [8000-FFFF]:0000 (0x8000 values)
+ */
+static u32 qdisc_alloc_handle(struct net_device *dev)
+{
+ int i = 0x8000;
+ static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
+
+ do {
+ autohandle += TC_H_MAKE(0x10000U, 0);
+ if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
+ autohandle = TC_H_MAKE(0x80000000U, 0);
+ if (!qdisc_lookup(dev, autohandle))
+ return autohandle;
+ cond_resched();
+ } while (--i > 0);
+
+ return 0;
+}
+
+void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
+{
+ const struct Qdisc_class_ops *cops;
+ unsigned long cl;
+ u32 parentid;
+ int drops;
+
+ if (n == 0)
+ return;
+ drops = max_t(int, n, 0);
+ while ((parentid = sch->parent)) {
+ if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
+ return;
+
+ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
+ if (sch == NULL) {
+ WARN_ON(parentid != TC_H_ROOT);
+ return;
+ }
+ cops = sch->ops->cl_ops;
+ if (cops->qlen_notify) {
+ cl = cops->get(sch, parentid);
+ cops->qlen_notify(sch, cl);
+ cops->put(sch, cl);
+ }
+ sch->q.qlen -= n;
+ __qdisc_qstats_drop(sch, drops);
+ }
+}
+EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
+
+static void notify_and_destroy(struct net *net, struct sk_buff *skb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new)
+{
+ if (new || old)
+ qdisc_notify(net, skb, n, clid, old, new);
+
+ if (old)
+ qdisc_destroy(old);
+}
+
+/* Graft qdisc "new" to class "classid" of qdisc "parent" or
+ * to device "dev".
+ *
+ * When appropriate send a netlink notification using 'skb'
+ * and "n".
+ *
+ * On success, destroy old qdisc.
+ */
+
+static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
+ struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
+ struct Qdisc *new, struct Qdisc *old)
+{
+ struct Qdisc *q = old;
+ struct net *net = dev_net(dev);
+ int err = 0;
+
+ if (parent == NULL) {
+ unsigned int i, num_q, ingress;
+
+ ingress = 0;
+ num_q = dev->num_tx_queues;
+ if ((q && q->flags & TCQ_F_INGRESS) ||
+ (new && new->flags & TCQ_F_INGRESS)) {
+ num_q = 1;
+ ingress = 1;
+ if (!dev_ingress_queue(dev))
+ return -ENOENT;
+ }
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ if (new && new->ops->attach)
+ goto skip;
+
+ for (i = 0; i < num_q; i++) {
+ struct netdev_queue *dev_queue = dev_ingress_queue(dev);
+
+ if (!ingress)
+ dev_queue = netdev_get_tx_queue(dev, i);
+
+ old = dev_graft_qdisc(dev_queue, new);
+ if (new && i > 0)
+ atomic_inc(&new->refcnt);
+
+ if (!ingress)
+ qdisc_destroy(old);
+ }
+
+skip:
+ if (!ingress) {
+ notify_and_destroy(net, skb, n, classid,
+ dev->qdisc, new);
+ if (new && !new->ops->attach)
+ atomic_inc(&new->refcnt);
+ dev->qdisc = new ? : &noop_qdisc;
+
+ if (new && new->ops->attach)
+ new->ops->attach(new);
+ } else {
+ notify_and_destroy(net, skb, n, classid, old, new);
+ }
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+ } else {
+ const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+
+ err = -EOPNOTSUPP;
+ if (cops && cops->graft) {
+ unsigned long cl = cops->get(parent, classid);
+ if (cl) {
+ err = cops->graft(parent, cl, new, &old);
+ cops->put(parent, cl);
+ } else
+ err = -ENOENT;
+ }
+ if (!err)
+ notify_and_destroy(net, skb, n, classid, old, new);
+ }
+ return err;
+}
+
+/* lockdep annotation is needed for ingress; egress gets it only for name */
+static struct lock_class_key qdisc_tx_lock;
+static struct lock_class_key qdisc_rx_lock;
+
+/*
+ Allocate and initialize new qdisc.
+
+ Parameters are passed via opt.
+ */
+
+static struct Qdisc *
+qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
+ struct Qdisc *p, u32 parent, u32 handle,
+ struct nlattr **tca, int *errp)
+{
+ int err;
+ struct nlattr *kind = tca[TCA_KIND];
+ struct Qdisc *sch;
+ struct Qdisc_ops *ops;
+ struct qdisc_size_table *stab;
+
+ ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_MODULES
+ if (ops == NULL && kind != NULL) {
+ char name[IFNAMSIZ];
+ if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+ /* We dropped the RTNL semaphore in order to
+ * perform the module load. So, even if we
+ * succeeded in loading the module we have to
+ * tell the caller to replay the request. We
+ * indicate this using -EAGAIN.
+ * We replay the request because the device may
+ * go away in the mean time.
+ */
+ rtnl_unlock();
+ request_module("sch_%s", name);
+ rtnl_lock();
+ ops = qdisc_lookup_ops(kind);
+ if (ops != NULL) {
+ /* We will try again qdisc_lookup_ops,
+ * so don't keep a reference.
+ */
+ module_put(ops->owner);
+ err = -EAGAIN;
+ goto err_out;
+ }
+ }
+ }
+#endif
+
+ err = -ENOENT;
+ if (ops == NULL)
+ goto err_out;
+
+ sch = qdisc_alloc(dev_queue, ops);
+ if (IS_ERR(sch)) {
+ err = PTR_ERR(sch);
+ goto err_out2;
+ }
+
+ sch->parent = parent;
+
+ if (handle == TC_H_INGRESS) {
+ sch->flags |= TCQ_F_INGRESS;
+ handle = TC_H_MAKE(TC_H_INGRESS, 0);
+ lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
+ } else {
+ if (handle == 0) {
+ handle = qdisc_alloc_handle(dev);
+ err = -ENOMEM;
+ if (handle == 0)
+ goto err_out3;
+ }
+ lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+ if (!netif_is_multiqueue(dev))
+ sch->flags |= TCQ_F_ONETXQUEUE;
+ }
+
+ sch->handle = handle;
+
+ if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
+ if (qdisc_is_percpu_stats(sch)) {
+ sch->cpu_bstats =
+ netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!sch->cpu_bstats)
+ goto err_out4;
+
+ sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+ if (!sch->cpu_qstats)
+ goto err_out4;
+ }
+
+ if (tca[TCA_STAB]) {
+ stab = qdisc_get_stab(tca[TCA_STAB]);
+ if (IS_ERR(stab)) {
+ err = PTR_ERR(stab);
+ goto err_out4;
+ }
+ rcu_assign_pointer(sch->stab, stab);
+ }
+ if (tca[TCA_RATE]) {
+ spinlock_t *root_lock;
+
+ err = -EOPNOTSUPP;
+ if (sch->flags & TCQ_F_MQROOT)
+ goto err_out4;
+
+ if ((sch->parent != TC_H_ROOT) &&
+ !(sch->flags & TCQ_F_INGRESS) &&
+ (!p || !(p->flags & TCQ_F_MQROOT)))
+ root_lock = qdisc_root_sleeping_lock(sch);
+ else
+ root_lock = qdisc_lock(sch);
+
+ err = gen_new_estimator(&sch->bstats,
+ sch->cpu_bstats,
+ &sch->rate_est,
+ root_lock,
+ tca[TCA_RATE]);
+ if (err)
+ goto err_out4;
+ }
+
+ qdisc_list_add(sch);
+
+ return sch;
+ }
+err_out3:
+ dev_put(dev);
+ kfree((char *) sch - sch->padded);
+err_out2:
+ module_put(ops->owner);
+err_out:
+ *errp = err;
+ return NULL;
+
+err_out4:
+ free_percpu(sch->cpu_bstats);
+ free_percpu(sch->cpu_qstats);
+ /*
+ * Any broken qdiscs that would require a ops->reset() here?
+ * The qdisc was never in action so it shouldn't be necessary.
+ */
+ qdisc_put_stab(rtnl_dereference(sch->stab));
+ if (ops->destroy)
+ ops->destroy(sch);
+ goto err_out3;
+}
+
+static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
+{
+ struct qdisc_size_table *ostab, *stab = NULL;
+ int err = 0;
+
+ if (tca[TCA_OPTIONS]) {
+ if (sch->ops->change == NULL)
+ return -EINVAL;
+ err = sch->ops->change(sch, tca[TCA_OPTIONS]);
+ if (err)
+ return err;
+ }
+
+ if (tca[TCA_STAB]) {
+ stab = qdisc_get_stab(tca[TCA_STAB]);
+ if (IS_ERR(stab))
+ return PTR_ERR(stab);
+ }
+
+ ostab = rtnl_dereference(sch->stab);
+ rcu_assign_pointer(sch->stab, stab);
+ qdisc_put_stab(ostab);
+
+ if (tca[TCA_RATE]) {
+ /* NB: ignores errors from replace_estimator
+ because change can't be undone. */
+ if (sch->flags & TCQ_F_MQROOT)
+ goto out;
+ gen_replace_estimator(&sch->bstats,
+ sch->cpu_bstats,
+ &sch->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ }
+out:
+ return 0;
+}
+
+struct check_loop_arg {
+ struct qdisc_walker w;
+ struct Qdisc *p;
+ int depth;
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+ struct check_loop_arg arg;
+
+ if (q->ops->cl_ops == NULL)
+ return 0;
+
+ arg.w.stop = arg.w.skip = arg.w.count = 0;
+ arg.w.fn = check_loop_fn;
+ arg.depth = depth;
+ arg.p = p;
+ q->ops->cl_ops->walk(q, &arg.w);
+ return arg.w.stop ? -ELOOP : 0;
+}
+
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+ struct Qdisc *leaf;
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct check_loop_arg *arg = (struct check_loop_arg *)w;
+
+ leaf = cops->leaf(q, cl);
+ if (leaf) {
+ if (leaf == arg->p || arg->depth > 7)
+ return -ELOOP;
+ return check_loop(leaf, arg->p, arg->depth + 1);
+ }
+ return 0;
+}
+
+/*
+ * Delete/get qdisc.
+ */
+
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ u32 clid;
+ struct Qdisc *q = NULL;
+ struct Qdisc *p = NULL;
+ int err;
+
+ if ((n->nlmsg_type != RTM_GETQDISC) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ clid = tcm->tcm_parent;
+ if (clid) {
+ if (clid != TC_H_ROOT) {
+ if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
+ return -ENOENT;
+ q = qdisc_leaf(p, clid);
+ } else if (dev_ingress_queue(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
+ }
+ } else {
+ q = dev->qdisc;
+ }
+ if (!q)
+ return -ENOENT;
+
+ if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
+ return -EINVAL;
+ } else {
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
+ return -ENOENT;
+ }
+
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+ return -EINVAL;
+
+ if (n->nlmsg_type == RTM_DELQDISC) {
+ if (!clid)
+ return -EINVAL;
+ if (q->handle == 0)
+ return -ENOENT;
+ err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
+ if (err != 0)
+ return err;
+ } else {
+ qdisc_notify(net, skb, n, clid, NULL, q);
+ }
+ return 0;
+}
+
+/*
+ * Create/change qdisc.
+ */
+
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm;
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ u32 clid;
+ struct Qdisc *q, *p;
+ int err;
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ /* Reinit, just in case something touches this. */
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ tcm = nlmsg_data(n);
+ clid = tcm->tcm_parent;
+ q = p = NULL;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+
+ if (clid) {
+ if (clid != TC_H_ROOT) {
+ if (clid != TC_H_INGRESS) {
+ p = qdisc_lookup(dev, TC_H_MAJ(clid));
+ if (!p)
+ return -ENOENT;
+ q = qdisc_leaf(p, clid);
+ } else if (dev_ingress_queue_create(dev)) {
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
+ }
+ } else {
+ q = dev->qdisc;
+ }
+
+ /* It may be default qdisc, ignore it */
+ if (q && q->handle == 0)
+ q = NULL;
+
+ if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+ if (tcm->tcm_handle) {
+ if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
+ return -EEXIST;
+ if (TC_H_MIN(tcm->tcm_handle))
+ return -EINVAL;
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ if (!q)
+ goto create_n_graft;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+ return -EINVAL;
+ if (q == p ||
+ (p && check_loop(q, p, 0)))
+ return -ELOOP;
+ atomic_inc(&q->refcnt);
+ goto graft;
+ } else {
+ if (!q)
+ goto create_n_graft;
+
+ /* This magic test requires explanation.
+ *
+ * We know, that some child q is already
+ * attached to this parent and have choice:
+ * either to change it or to create/graft new one.
+ *
+ * 1. We are allowed to create/graft only
+ * if CREATE and REPLACE flags are set.
+ *
+ * 2. If EXCL is set, requestor wanted to say,
+ * that qdisc tcm_handle is not expected
+ * to exist, so that we choose create/graft too.
+ *
+ * 3. The last case is when no flags are set.
+ * Alas, it is sort of hole in API, we
+ * cannot decide what to do unambiguously.
+ * For now we select create/graft, if
+ * user gave KIND, which does not match existing.
+ */
+ if ((n->nlmsg_flags & NLM_F_CREATE) &&
+ (n->nlmsg_flags & NLM_F_REPLACE) &&
+ ((n->nlmsg_flags & NLM_F_EXCL) ||
+ (tca[TCA_KIND] &&
+ nla_strcmp(tca[TCA_KIND], q->ops->id))))
+ goto create_n_graft;
+ }
+ }
+ } else {
+ if (!tcm->tcm_handle)
+ return -EINVAL;
+ q = qdisc_lookup(dev, tcm->tcm_handle);
+ }
+
+ /* Change qdisc parameters */
+ if (q == NULL)
+ return -ENOENT;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+ return -EINVAL;
+ err = qdisc_change(q, tca);
+ if (err == 0)
+ qdisc_notify(net, skb, n, clid, NULL, q);
+ return err;
+
+create_n_graft:
+ if (!(n->nlmsg_flags & NLM_F_CREATE))
+ return -ENOENT;
+ if (clid == TC_H_INGRESS) {
+ if (dev_ingress_queue(dev))
+ q = qdisc_create(dev, dev_ingress_queue(dev), p,
+ tcm->tcm_parent, tcm->tcm_parent,
+ tca, &err);
+ else
+ err = -ENOENT;
+ } else {
+ struct netdev_queue *dev_queue;
+
+ if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
+ dev_queue = p->ops->cl_ops->select_queue(p, tcm);
+ else if (p)
+ dev_queue = p->dev_queue;
+ else
+ dev_queue = netdev_get_tx_queue(dev, 0);
+
+ q = qdisc_create(dev, dev_queue, p,
+ tcm->tcm_parent, tcm->tcm_handle,
+ tca, &err);
+ }
+ if (q == NULL) {
+ if (err == -EAGAIN)
+ goto replay;
+ return err;
+ }
+
+graft:
+ err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
+ if (err) {
+ if (q)
+ qdisc_destroy(q);
+ return err;
+ }
+
+ return 0;
+}
+
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+ struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ struct qdisc_size_table *stab;
+ __u32 qlen;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = clid;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = atomic_read(&q->refcnt);
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (q->ops->dump && q->ops->dump(q, skb) < 0)
+ goto nla_put_failure;
+ qlen = q->q.qlen;
+
+ stab = rtnl_dereference(q->stab);
+ if (stab && qdisc_dump_stab(skb, stab) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ qdisc_root_sleeping_lock(q), &d) < 0)
+ goto nla_put_failure;
+
+ if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+ goto nla_put_failure;
+
+ if (qdisc_is_percpu_stats(q)) {
+ cpu_bstats = q->cpu_bstats;
+ cpu_qstats = q->cpu_qstats;
+ }
+
+ if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
+ gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+ gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static bool tc_qdisc_dump_ignore(struct Qdisc *q)
+{
+ return (q->flags & TCQ_F_BUILTIN) ? true : false;
+}
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid,
+ struct Qdisc *old, struct Qdisc *new)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (old && !tc_qdisc_dump_ignore(old)) {
+ if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
+ 0, RTM_DELQDISC) < 0)
+ goto err_out;
+ }
+ if (new && !tc_qdisc_dump_ignore(new)) {
+ if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
+ old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+ goto err_out;
+ }
+
+ if (skb->len)
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+
+err_out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int *q_idx_p, int s_q_idx)
+{
+ int ret = 0, q_idx = *q_idx_p;
+ struct Qdisc *q;
+
+ if (!root)
+ return 0;
+
+ q = root;
+ if (q_idx < s_q_idx) {
+ q_idx++;
+ } else {
+ if (!tc_qdisc_dump_ignore(q) &&
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ goto done;
+ q_idx++;
+ }
+ list_for_each_entry(q, &root->list, list) {
+ if (q_idx < s_q_idx) {
+ q_idx++;
+ continue;
+ }
+ if (!tc_qdisc_dump_ignore(q) &&
+ tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+ goto done;
+ q_idx++;
+ }
+
+out:
+ *q_idx_p = q_idx;
+ return ret;
+done:
+ ret = -1;
+ goto out;
+}
+
+static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int idx, q_idx;
+ int s_idx, s_q_idx;
+ struct net_device *dev;
+
+ s_idx = cb->args[0];
+ s_q_idx = q_idx = cb->args[1];
+
+ idx = 0;
+ ASSERT_RTNL();
+ for_each_netdev(net, dev) {
+ struct netdev_queue *dev_queue;
+
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ s_q_idx = 0;
+ q_idx = 0;
+
+ if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
+ goto done;
+
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+ &q_idx, s_q_idx) < 0)
+ goto done;
+
+cont:
+ idx++;
+ }
+
+done:
+ cb->args[0] = idx;
+ cb->args[1] = q_idx;
+
+ return skb->len;
+}
+
+
+
+/************************************************
+ * Traffic classes manipulation. *
+ ************************************************/
+
+
+
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tcmsg *tcm = nlmsg_data(n);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct net_device *dev;
+ struct Qdisc *q = NULL;
+ const struct Qdisc_class_ops *cops;
+ unsigned long cl = 0;
+ unsigned long new_cl;
+ u32 portid;
+ u32 clid;
+ u32 qid;
+ int err;
+
+ if ((n->nlmsg_type != RTM_GETTCLASS) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+ if (err < 0)
+ return err;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ /*
+ parent == TC_H_UNSPEC - unspecified parent.
+ parent == TC_H_ROOT - class is root, which has no parent.
+ parent == X:0 - parent is root class.
+ parent == X:Y - parent is a node in hierarchy.
+ parent == 0:Y - parent is X:Y, where X:0 is qdisc.
+
+ handle == 0:0 - generate handle from kernel pool.
+ handle == 0:Y - class is X:Y, where X:0 is qdisc.
+ handle == X:Y - clear.
+ handle == X:0 - root class.
+ */
+
+ /* Step 1. Determine qdisc handle X:0 */
+
+ portid = tcm->tcm_parent;
+ clid = tcm->tcm_handle;
+ qid = TC_H_MAJ(clid);
+
+ if (portid != TC_H_ROOT) {
+ u32 qid1 = TC_H_MAJ(portid);
+
+ if (qid && qid1) {
+ /* If both majors are known, they must be identical. */
+ if (qid != qid1)
+ return -EINVAL;
+ } else if (qid1) {
+ qid = qid1;
+ } else if (qid == 0)
+ qid = dev->qdisc->handle;
+
+ /* Now qid is genuine qdisc handle consistent
+ * both with parent and child.
+ *
+ * TC_H_MAJ(portid) still may be unspecified, complete it now.
+ */
+ if (portid)
+ portid = TC_H_MAKE(qid, portid);
+ } else {
+ if (qid == 0)
+ qid = dev->qdisc->handle;
+ }
+
+ /* OK. Locate qdisc */
+ q = qdisc_lookup(dev, qid);
+ if (!q)
+ return -ENOENT;
+
+ /* An check that it supports classes */
+ cops = q->ops->cl_ops;
+ if (cops == NULL)
+ return -EINVAL;
+
+ /* Now try to get class */
+ if (clid == 0) {
+ if (portid == TC_H_ROOT)
+ clid = qid;
+ } else
+ clid = TC_H_MAKE(qid, clid);
+
+ if (clid)
+ cl = cops->get(q, clid);
+
+ if (cl == 0) {
+ err = -ENOENT;
+ if (n->nlmsg_type != RTM_NEWTCLASS ||
+ !(n->nlmsg_flags & NLM_F_CREATE))
+ goto out;
+ } else {
+ switch (n->nlmsg_type) {
+ case RTM_NEWTCLASS:
+ err = -EEXIST;
+ if (n->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+ break;
+ case RTM_DELTCLASS:
+ err = -EOPNOTSUPP;
+ if (cops->delete)
+ err = cops->delete(q, cl);
+ if (err == 0)
+ tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
+ goto out;
+ case RTM_GETTCLASS:
+ err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
+ goto out;
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ new_cl = cl;
+ err = -EOPNOTSUPP;
+ if (cops->change)
+ err = cops->change(q, clid, portid, tca, &new_cl);
+ if (err == 0)
+ tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
+
+out:
+ if (cl)
+ cops->put(q, cl);
+
+ return err;
+}
+
+
+static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
+ unsigned long cl,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+ struct gnet_dump d;
+ const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+
+ cond_resched();
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+ tcm->tcm_parent = q->handle;
+ tcm->tcm_handle = q->handle;
+ tcm->tcm_info = 0;
+ if (nla_put_string(skb, TCA_KIND, q->ops->id))
+ goto nla_put_failure;
+ if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+ qdisc_root_sleeping_lock(q), &d) < 0)
+ goto nla_put_failure;
+
+ if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+ goto nla_put_failure;
+
+ if (gnet_stats_finish_copy(&d) < 0)
+ goto nla_put_failure;
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, int event)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+struct qdisc_dump_args {
+ struct qdisc_walker w;
+ struct sk_buff *skb;
+ struct netlink_callback *cb;
+};
+
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+{
+ struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
+
+ return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
+ a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+}
+
+static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
+ struct tcmsg *tcm, struct netlink_callback *cb,
+ int *t_p, int s_t)
+{
+ struct qdisc_dump_args arg;
+
+ if (tc_qdisc_dump_ignore(q) ||
+ *t_p < s_t || !q->ops->cl_ops ||
+ (tcm->tcm_parent &&
+ TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
+ (*t_p)++;
+ return 0;
+ }
+ if (*t_p > s_t)
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+ arg.w.fn = qdisc_class_dump;
+ arg.skb = skb;
+ arg.cb = cb;
+ arg.w.stop = 0;
+ arg.w.skip = cb->args[1];
+ arg.w.count = 0;
+ q->ops->cl_ops->walk(q, &arg.w);
+ cb->args[1] = arg.w.count;
+ if (arg.w.stop)
+ return -1;
+ (*t_p)++;
+ return 0;
+}
+
+static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
+ struct tcmsg *tcm, struct netlink_callback *cb,
+ int *t_p, int s_t)
+{
+ struct Qdisc *q;
+
+ if (!root)
+ return 0;
+
+ if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+
+ list_for_each_entry(q, &root->list, list) {
+ if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ struct netdev_queue *dev_queue;
+ struct net_device *dev;
+ int t, s_t;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return 0;
+ dev = dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return 0;
+
+ s_t = cb->args[0];
+ t = 0;
+
+ if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
+ goto done;
+
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+ &t, s_t) < 0)
+ goto done;
+
+done:
+ cb->args[0] = t;
+
+ dev_put(dev);
+ return skb->len;
+}
+
+/* Main classifier routine: scans classifier chain attached
+ * to this qdisc, (optionally) tests for protocol and asks
+ * specific classifiers.
+ */
+int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ __be16 protocol = tc_skb_protocol(skb);
+ int err;
+
+ for (; tp; tp = rcu_dereference_bh(tp->next)) {
+ if (tp->protocol != protocol &&
+ tp->protocol != htons(ETH_P_ALL))
+ continue;
+ err = tp->classify(skb, tp, res);
+
+ if (err >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
+ skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
+#endif
+ return err;
+ }
+ }
+ return -1;
+}
+EXPORT_SYMBOL(tc_classify_compat);
+
+int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ int err = 0;
+#ifdef CONFIG_NET_CLS_ACT
+ const struct tcf_proto *otp = tp;
+reclassify:
+#endif
+
+ err = tc_classify_compat(skb, tp, res);
+#ifdef CONFIG_NET_CLS_ACT
+ if (err == TC_ACT_RECLASSIFY) {
+ u32 verd = G_TC_VERD(skb->tc_verd);
+ tp = otp;
+
+ if (verd++ >= MAX_REC_LOOP) {
+ net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
+ tp->q->ops->id,
+ tp->prio & 0xffff,
+ ntohs(tp->protocol));
+ return TC_ACT_SHOT;
+ }
+ skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
+ goto reclassify;
+ }
+#endif
+ return err;
+}
+EXPORT_SYMBOL(tc_classify);
+
+bool tcf_destroy(struct tcf_proto *tp, bool force)
+{
+ if (tp->ops->destroy(tp, force)) {
+ module_put(tp->ops->owner);
+ kfree_rcu(tp, rcu);
+ return true;
+ }
+
+ return false;
+}
+
+void tcf_destroy_chain(struct tcf_proto __rcu **fl)
+{
+ struct tcf_proto *tp;
+
+ while ((tp = rtnl_dereference(*fl)) != NULL) {
+ RCU_INIT_POINTER(*fl, tp->next);
+ tcf_destroy(tp, true);
+ }
+}
+EXPORT_SYMBOL(tcf_destroy_chain);
+
+#ifdef CONFIG_PROC_FS
+static int psched_show(struct seq_file *seq, void *v)
+{
+ struct timespec ts;
+
+ hrtimer_get_res(CLOCK_MONOTONIC, &ts);
+ seq_printf(seq, "%08x %08x %08x %08x\n",
+ (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
+ 1000000,
+ (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+
+ return 0;
+}
+
+static int psched_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psched_show, NULL);
+}
+
+static const struct file_operations psched_fops = {
+ .owner = THIS_MODULE,
+ .open = psched_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __net_init psched_net_init(struct net *net)
+{
+ struct proc_dir_entry *e;
+
+ e = proc_create("psched", 0, net->proc_net, &psched_fops);
+ if (e == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+ remove_proc_entry("psched", net->proc_net);
+}
+#else
+static int __net_init psched_net_init(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+}
+#endif
+
+static struct pernet_operations psched_net_ops = {
+ .init = psched_net_init,
+ .exit = psched_net_exit,
+};
+
+static int __init pktsched_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&psched_net_ops);
+ if (err) {
+ pr_err("pktsched_init: "
+ "cannot initialize per netns operations\n");
+ return err;
+ }
+
+ register_qdisc(&pfifo_fast_ops);
+ register_qdisc(&pfifo_qdisc_ops);
+ register_qdisc(&bfifo_qdisc_ops);
+ register_qdisc(&pfifo_head_drop_qdisc_ops);
+ register_qdisc(&mq_qdisc_ops);
+
+ rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
+
+ return 0;
+}
+
+subsys_initcall(pktsched_init);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
new file mode 100644
index 000000000..e3e2cc5fd
--- /dev/null
+++ b/net/sched/sch_atm.c
@@ -0,0 +1,694 @@
+/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>
+#include <linux/rtnetlink.h>
+#include <linux/file.h> /* for fput */
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+/*
+ * The ATM queuing discipline provides a framework for invoking classifiers
+ * (aka "filters"), which in turn select classes of this queuing discipline.
+ * Each class maps the flow(s) it is handling to a given VC. Multiple classes
+ * may share the same VC.
+ *
+ * When creating a class, VCs are specified by passing the number of the open
+ * socket descriptor by which the calling process references the VC. The kernel
+ * keeps the VC open at least until all classes using it are removed.
+ *
+ * In this file, most functions are named atm_tc_* to avoid confusion with all
+ * the atm_* in net/atm. This naming convention differs from what's used in the
+ * rest of net/sched.
+ *
+ * Known bugs:
+ * - sometimes messes up the IP stack
+ * - any manipulations besides the few operations described in the README, are
+ * untested and likely to crash the system
+ * - should lock the flow while there is data in the queue (?)
+ */
+
+#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
+
+struct atm_flow_data {
+ struct Qdisc *q; /* FIFO, TBF, etc. */
+ struct tcf_proto __rcu *filter_list;
+ struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
+ void (*old_pop)(struct atm_vcc *vcc,
+ struct sk_buff *skb); /* chaining */
+ struct atm_qdisc_data *parent; /* parent qdisc */
+ struct socket *sock; /* for closing */
+ u32 classid; /* x:y type ID */
+ int ref; /* reference count */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct list_head list;
+ struct atm_flow_data *excess; /* flow for excess traffic;
+ NULL to set CLP instead */
+ int hdr_len;
+ unsigned char hdr[0]; /* header data; MUST BE LAST */
+};
+
+struct atm_qdisc_data {
+ struct atm_flow_data link; /* unclassified skbs go here */
+ struct list_head flows; /* NB: "link" is also on this
+ list */
+ struct tasklet_struct task; /* dequeue tasklet */
+};
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ list_for_each_entry(flow, &p->flows, list) {
+ if (flow->classid == classid)
+ return flow;
+ }
+ return NULL;
+}
+
+static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
+ sch, p, flow, new, old);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ if (!new)
+ new = &noop_qdisc;
+ *old = flow->q;
+ flow->q = new;
+ if (*old)
+ qdisc_reset(*old);
+ return 0;
+}
+
+static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
+ return flow ? flow->q : NULL;
+}
+
+static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid)
+{
+ struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
+ flow = lookup_flow(sch, classid);
+ if (flow)
+ flow->ref++;
+ pr_debug("atm_tc_get: flow %p\n", flow);
+ return (unsigned long)flow;
+}
+
+static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return atm_tc_get(sch, classid);
+}
+
+/*
+ * atm_tc_put handles all destructions, including the ones that are explicitly
+ * requested (atm_tc_destroy, etc.). The assumption here is that we never drop
+ * anything that still seems to be in use.
+ */
+static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ if (--flow->ref)
+ return;
+ pr_debug("atm_tc_put: destroying\n");
+ list_del_init(&flow->list);
+ pr_debug("atm_tc_put: qdisc %p\n", flow->q);
+ qdisc_destroy(flow->q);
+ tcf_destroy_chain(&flow->filter_list);
+ if (flow->sock) {
+ pr_debug("atm_tc_put: f_count %ld\n",
+ file_count(flow->sock->file));
+ flow->vcc->pop = flow->old_pop;
+ sockfd_put(flow->sock);
+ }
+ if (flow->excess)
+ atm_tc_put(sch, (unsigned long)flow->excess);
+ if (flow != &p->link)
+ kfree(flow);
+ /*
+ * If flow == &p->link, the qdisc no longer works at this point and
+ * needs to be removed. (By the caller of atm_tc_put.)
+ */
+}
+
+static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
+
+ pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
+ VCC2FLOW(vcc)->old_pop(vcc, skb);
+ tasklet_schedule(&p->task);
+}
+
+static const u8 llc_oui_ip[] = {
+ 0xaa, /* DSAP: non-ISO */
+ 0xaa, /* SSAP: non-ISO */
+ 0x03, /* Ctrl: Unnumbered Information Command PDU */
+ 0x00, /* OUI: EtherType */
+ 0x00, 0x00,
+ 0x08, 0x00
+}; /* Ethertype IP (0800) */
+
+static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
+ [TCA_ATM_FD] = { .type = NLA_U32 },
+ [TCA_ATM_EXCESS] = { .type = NLA_U32 },
+};
+
+static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
+ struct atm_flow_data *excess = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ATM_MAX + 1];
+ struct socket *sock;
+ int fd, error, hdr_len;
+ void *hdr;
+
+ pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
+ "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
+ /*
+ * The concept of parents doesn't apply for this qdisc.
+ */
+ if (parent && parent != TC_H_ROOT && parent != sch->handle)
+ return -EINVAL;
+ /*
+ * ATM classes cannot be changed. In order to change properties of the
+ * ATM connection, that socket needs to be modified directly (via the
+ * native ATM API. In order to send a flow to a different VC, the old
+ * class needs to be removed and a new one added. (This may be changed
+ * later.)
+ */
+ if (flow)
+ return -EBUSY;
+ if (opt == NULL)
+ return -EINVAL;
+
+ error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy);
+ if (error < 0)
+ return error;
+
+ if (!tb[TCA_ATM_FD])
+ return -EINVAL;
+ fd = nla_get_u32(tb[TCA_ATM_FD]);
+ pr_debug("atm_tc_change: fd %d\n", fd);
+ if (tb[TCA_ATM_HDR]) {
+ hdr_len = nla_len(tb[TCA_ATM_HDR]);
+ hdr = nla_data(tb[TCA_ATM_HDR]);
+ } else {
+ hdr_len = RFC1483LLC_LEN;
+ hdr = NULL; /* default LLC/SNAP for IP */
+ }
+ if (!tb[TCA_ATM_EXCESS])
+ excess = NULL;
+ else {
+ excess = (struct atm_flow_data *)
+ atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
+ if (!excess)
+ return -ENOENT;
+ }
+ pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
+ opt->nla_type, nla_len(opt), hdr_len);
+ sock = sockfd_lookup(fd, &error);
+ if (!sock)
+ return error; /* f_count++ */
+ pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
+ if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
+ error = -EPROTOTYPE;
+ goto err_out;
+ }
+ /* @@@ should check if the socket is really operational or we'll crash
+ on vcc->send */
+ if (classid) {
+ if (TC_H_MAJ(classid ^ sch->handle)) {
+ pr_debug("atm_tc_change: classid mismatch\n");
+ error = -EINVAL;
+ goto err_out;
+ }
+ } else {
+ int i;
+ unsigned long cl;
+
+ for (i = 1; i < 0x8000; i++) {
+ classid = TC_H_MAKE(sch->handle, 0x8000 | i);
+ cl = atm_tc_get(sch, classid);
+ if (!cl)
+ break;
+ atm_tc_put(sch, cl);
+ }
+ }
+ pr_debug("atm_tc_change: new id %x\n", classid);
+ flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
+ pr_debug("atm_tc_change: flow %p\n", flow);
+ if (!flow) {
+ error = -ENOBUFS;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(flow->filter_list, NULL);
+ flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+ if (!flow->q)
+ flow->q = &noop_qdisc;
+ pr_debug("atm_tc_change: qdisc %p\n", flow->q);
+ flow->sock = sock;
+ flow->vcc = ATM_SD(sock); /* speedup */
+ flow->vcc->user_back = flow;
+ pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
+ flow->old_pop = flow->vcc->pop;
+ flow->parent = p;
+ flow->vcc->pop = sch_atm_pop;
+ flow->classid = classid;
+ flow->ref = 1;
+ flow->excess = excess;
+ list_add(&flow->list, &p->link.list);
+ flow->hdr_len = hdr_len;
+ if (hdr)
+ memcpy(flow->hdr, hdr, hdr_len);
+ else
+ memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
+ *arg = (unsigned long)flow;
+ return 0;
+err_out:
+ if (excess)
+ atm_tc_put(sch, (unsigned long)excess);
+ sockfd_put(sock);
+ return error;
+}
+
+static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
+ return -EBUSY;
+ /*
+ * Reference count must be 2: one for "keepalive" (set at class
+ * creation), and one for the reference held when calling delete.
+ */
+ if (flow->ref < 2) {
+ pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
+ return -EINVAL;
+ }
+ if (flow->ref > 2)
+ return -EBUSY; /* catch references via excess, etc. */
+ atm_tc_put(sch, arg);
+ return 0;
+}
+
+static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+ if (walker->stop)
+ return;
+ list_for_each_entry(flow, &p->flows, list) {
+ if (walker->count >= walker->skip &&
+ walker->fn(sch, (unsigned long)flow, walker) < 0) {
+ walker->stop = 1;
+ break;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+ pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+ return flow ? &flow->filter_list : &p->link.filter_list;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+ struct tcf_result res;
+ int result;
+ int ret = NET_XMIT_POLICED;
+
+ pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+ result = TC_POLICE_OK; /* be nice to gcc */
+ flow = NULL;
+ if (TC_H_MAJ(skb->priority) != sch->handle ||
+ !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
+ struct tcf_proto *fl;
+
+ list_for_each_entry(flow, &p->flows, list) {
+ fl = rcu_dereference_bh(flow->filter_list);
+ if (fl) {
+ result = tc_classify_compat(skb, fl, &res);
+ if (result < 0)
+ continue;
+ flow = (struct atm_flow_data *)res.class;
+ if (!flow)
+ flow = lookup_flow(sch, res.classid);
+ goto done;
+ }
+ }
+ flow = NULL;
+done:
+ ;
+ }
+ if (!flow) {
+ flow = &p->link;
+ } else {
+ if (flow->vcc)
+ ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
+ /*@@@ looks good ... but it's not supposed to work :-) */
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ kfree_skb(skb);
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ kfree_skb(skb);
+ goto drop;
+ case TC_POLICE_RECLASSIFY:
+ if (flow->excess)
+ flow = flow->excess;
+ else
+ ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
+ break;
+ }
+#endif
+ }
+
+ ret = qdisc_enqueue(skb, flow->q);
+ if (ret != NET_XMIT_SUCCESS) {
+drop: __maybe_unused
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ if (flow)
+ flow->qstats.drops++;
+ }
+ return ret;
+ }
+ /*
+ * Okay, this may seem weird. We pretend we've dropped the packet if
+ * it goes via ATM. The reason for this is that the outer qdisc
+ * expects to be able to q->dequeue the packet later on if we return
+ * success at this place. Also, sch->q.qdisc needs to reflect whether
+ * there is a packet egligible for dequeuing or not. Note that the
+ * statistics of the outer qdisc are necessarily wrong because of all
+ * this. There's currently no correct solution for this.
+ */
+ if (flow == &p->link) {
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ tasklet_schedule(&p->task);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+/*
+ * Dequeue packets and send them over ATM. Note that we quite deliberately
+ * avoid checking net_device's flow control here, simply because sch_atm
+ * uses its own channels, which have nothing to do with any CLIP/LANE/or
+ * non-ATM interfaces.
+ */
+
+static void sch_atm_dequeue(unsigned long data)
+{
+ struct Qdisc *sch = (struct Qdisc *)data;
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+ struct sk_buff *skb;
+
+ pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list) {
+ if (flow == &p->link)
+ continue;
+ /*
+ * If traffic is properly shaped, this won't generate nasty
+ * little bursts. Otherwise, it may ... (but that's okay)
+ */
+ while ((skb = flow->q->ops->peek(flow->q))) {
+ if (!atm_may_send(flow->vcc, skb->truesize))
+ break;
+
+ skb = qdisc_dequeue_peeked(flow->q);
+ if (unlikely(!skb))
+ break;
+
+ qdisc_bstats_update(sch, skb);
+ bstats_update(&flow->bstats, skb);
+ pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
+ /* remove any LL header somebody else has attached */
+ skb_pull(skb, skb_network_offset(skb));
+ if (skb_headroom(skb) < flow->hdr_len) {
+ struct sk_buff *new;
+
+ new = skb_realloc_headroom(skb, flow->hdr_len);
+ dev_kfree_skb(skb);
+ if (!new)
+ continue;
+ skb = new;
+ }
+ pr_debug("sch_atm_dequeue: ip %p, data %p\n",
+ skb_network_header(skb), skb->data);
+ ATM_SKB(skb)->vcc = flow->vcc;
+ memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
+ flow->hdr_len);
+ atomic_add(skb->truesize,
+ &sk_atm(flow->vcc)->sk_wmem_alloc);
+ /* atm.atm_options are already set by atm_tc_enqueue */
+ flow->vcc->send(flow->vcc, skb);
+ }
+ }
+}
+
+static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ tasklet_schedule(&p->task);
+ skb = qdisc_dequeue_peeked(p->link.q);
+ if (skb)
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
+
+ return p->link.q->ops->peek(p->link.q);
+}
+
+static unsigned int atm_tc_drop(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+ unsigned int len;
+
+ pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list) {
+ if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
+ return len;
+ }
+ return 0;
+}
+
+static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+ INIT_LIST_HEAD(&p->flows);
+ INIT_LIST_HEAD(&p->link.list);
+ list_add(&p->link.list, &p->flows);
+ p->link.q = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, sch->handle);
+ if (!p->link.q)
+ p->link.q = &noop_qdisc;
+ pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
+ RCU_INIT_POINTER(p->link.filter_list, NULL);
+ p->link.vcc = NULL;
+ p->link.sock = NULL;
+ p->link.classid = sch->handle;
+ p->link.ref = 1;
+ tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
+ return 0;
+}
+
+static void atm_tc_reset(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow;
+
+ pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list)
+ qdisc_reset(flow->q);
+ sch->q.qlen = 0;
+}
+
+static void atm_tc_destroy(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow, *tmp;
+
+ pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
+ list_for_each_entry(flow, &p->flows, list)
+ tcf_destroy_chain(&flow->filter_list);
+
+ list_for_each_entry_safe(flow, tmp, &p->flows, list) {
+ if (flow->ref > 1)
+ pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
+ atm_tc_put(sch, (unsigned long)flow);
+ }
+ tasklet_kill(&p->task);
+}
+
+static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+ struct nlattr *nest;
+
+ pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
+ sch, p, flow, skb, tcm);
+ if (list_empty(&flow->list))
+ return -EINVAL;
+ tcm->tcm_handle = flow->classid;
+ tcm->tcm_info = flow->q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
+ goto nla_put_failure;
+ if (flow->vcc) {
+ struct sockaddr_atmpvc pvc;
+ int state;
+
+ memset(&pvc, 0, sizeof(pvc));
+ pvc.sap_family = AF_ATMPVC;
+ pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
+ pvc.sap_addr.vpi = flow->vcc->vpi;
+ pvc.sap_addr.vci = flow->vcc->vci;
+ if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
+ goto nla_put_failure;
+ state = ATM_VF2VS(flow->vcc->flags);
+ if (nla_put_u32(skb, TCA_ATM_STATE, state))
+ goto nla_put_failure;
+ }
+ if (flow->excess) {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+static int
+atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+ if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static const struct Qdisc_class_ops atm_class_ops = {
+ .graft = atm_tc_graft,
+ .leaf = atm_tc_leaf,
+ .get = atm_tc_get,
+ .put = atm_tc_put,
+ .change = atm_tc_change,
+ .delete = atm_tc_delete,
+ .walk = atm_tc_walk,
+ .tcf_chain = atm_tc_find_tcf,
+ .bind_tcf = atm_tc_bind_filter,
+ .unbind_tcf = atm_tc_put,
+ .dump = atm_tc_dump_class,
+ .dump_stats = atm_tc_dump_class_stats,
+};
+
+static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
+ .cl_ops = &atm_class_ops,
+ .id = "atm",
+ .priv_size = sizeof(struct atm_qdisc_data),
+ .enqueue = atm_tc_enqueue,
+ .dequeue = atm_tc_dequeue,
+ .peek = atm_tc_peek,
+ .drop = atm_tc_drop,
+ .init = atm_tc_init,
+ .reset = atm_tc_reset,
+ .destroy = atm_tc_destroy,
+ .dump = atm_tc_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init atm_init(void)
+{
+ return register_qdisc(&atm_qdisc_ops);
+}
+
+static void __exit atm_exit(void)
+{
+ unregister_qdisc(&atm_qdisc_ops);
+}
+
+module_init(atm_init)
+module_exit(atm_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000..094a874b4
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,53 @@
+/*
+ * net/sched/sch_blackhole.c Black hole queue
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * Note: Quantum tunneling is not supported.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ qdisc_drop(skb, sch);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
+ .id = "blackhole",
+ .priv_size = 0,
+ .enqueue = blackhole_enqueue,
+ .dequeue = blackhole_dequeue,
+ .peek = blackhole_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static int __init blackhole_module_init(void)
+{
+ return register_qdisc(&blackhole_qdisc_ops);
+}
+
+static void __exit blackhole_module_exit(void)
+{
+ unregister_qdisc(&blackhole_qdisc_ops);
+}
+
+module_init(blackhole_module_init)
+module_exit(blackhole_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000..beeb75f80
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,2062 @@
+/*
+ * net/sched/sch_cbq.c Class-Based Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+/* Class-Based Queueing (CBQ) algorithm.
+ =======================================
+
+ Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+ Management Models for Packet Networks",
+ IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+ [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+ Parameters", 1996
+
+ [4] Sally Floyd and Michael Speer, "Experimental Results
+ for Class-Based Queueing", 1998, not published.
+
+ -----------------------------------------------------------------------
+
+ Algorithm skeleton was taken from NS simulator cbq.cc.
+ If someone wants to check this code against the LBL version,
+ he should take into account that ONLY the skeleton was borrowed,
+ the implementation is different. Particularly:
+
+ --- The WRR algorithm is different. Our version looks more
+ reasonable (I hope) and works when quanta are allowed to be
+ less than MTU, which is always the case when real time classes
+ have small rates. Note, that the statement of [3] is
+ incomplete, delay may actually be estimated even if class
+ per-round allotment is less than MTU. Namely, if per-round
+ allotment is W*r_i, and r_1+...+r_k = r < 1
+
+ delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
+
+ In the worst case we have IntServ estimate with D = W*r+k*MTU
+ and C = MTU*r. The proof (if correct at all) is trivial.
+
+
+ --- It seems that cbq-2.0 is not very accurate. At least, I cannot
+ interpret some places, which look like wrong translations
+ from NS. Anyone is advised to find these differences
+ and explain to me, why I am wrong 8).
+
+ --- Linux has no EOI event, so that we cannot estimate true class
+ idle time. Workaround is to consider the next dequeue event
+ as sign that previous packet is finished. This is wrong because of
+ internal device queueing, but on a permanently loaded link it is true.
+ Moreover, combined with clock integrator, this scheme looks
+ very close to an ideal solution. */
+
+struct cbq_sched_data;
+
+
+struct cbq_class {
+ struct Qdisc_class_common common;
+ struct cbq_class *next_alive; /* next class with backlog in this priority band */
+
+/* Parameters */
+ unsigned char priority; /* class priority */
+ unsigned char priority2; /* priority to be used after overlimit */
+ unsigned char ewma_log; /* time constant for idle time calculation */
+ unsigned char ovl_strategy;
+#ifdef CONFIG_NET_CLS_ACT
+ unsigned char police;
+#endif
+
+ u32 defmap;
+
+ /* Link-sharing scheduler parameters */
+ long maxidle; /* Class parameters: see below. */
+ long offtime;
+ long minidle;
+ u32 avpkt;
+ struct qdisc_rate_table *R_tab;
+
+ /* Overlimit strategy parameters */
+ void (*overlimit)(struct cbq_class *cl);
+ psched_tdiff_t penalty;
+
+ /* General scheduler (WRR) parameters */
+ long allot;
+ long quantum; /* Allotment per WRR round */
+ long weight; /* Relative allotment: see below */
+
+ struct Qdisc *qdisc; /* Ptr to CBQ discipline */
+ struct cbq_class *split; /* Ptr to split node */
+ struct cbq_class *share; /* Ptr to LS parent in the class tree */
+ struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
+ struct cbq_class *borrow; /* NULL if class is bandwidth limited;
+ parent otherwise */
+ struct cbq_class *sibling; /* Sibling chain */
+ struct cbq_class *children; /* Pointer to children chain */
+
+ struct Qdisc *q; /* Elementary queueing discipline */
+
+
+/* Variables */
+ unsigned char cpriority; /* Effective priority */
+ unsigned char delayed;
+ unsigned char level; /* level of the class in hierarchy:
+ 0 for leaf classes, and maximal
+ level of children + 1 for nodes.
+ */
+
+ psched_time_t last; /* Last end of service */
+ psched_time_t undertime;
+ long avgidle;
+ long deficit; /* Saved deficit for WRR */
+ psched_time_t penalized;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct gnet_stats_rate_est64 rate_est;
+ struct tc_cbq_xstats xstats;
+
+ struct tcf_proto __rcu *filter_list;
+
+ int refcnt;
+ int filters;
+
+ struct cbq_class *defaults[TC_PRIO_MAX + 1];
+};
+
+struct cbq_sched_data {
+ struct Qdisc_class_hash clhash; /* Hash table of all classes */
+ int nclasses[TC_CBQ_MAXPRIO + 1];
+ unsigned int quanta[TC_CBQ_MAXPRIO + 1];
+
+ struct cbq_class link;
+
+ unsigned int activemask;
+ struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
+ with backlog */
+
+#ifdef CONFIG_NET_CLS_ACT
+ struct cbq_class *rx_class;
+#endif
+ struct cbq_class *tx_class;
+ struct cbq_class *tx_borrowed;
+ int tx_len;
+ psched_time_t now; /* Cached timestamp */
+ unsigned int pmask;
+
+ struct hrtimer delay_timer;
+ struct qdisc_watchdog watchdog; /* Watchdog timer,
+ started when CBQ has
+ backlog, but cannot
+ transmit just now */
+ psched_tdiff_t wd_expires;
+ int toplevel;
+ u32 hgenerator;
+};
+
+
+#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
+
+static inline struct cbq_class *
+cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
+{
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct cbq_class, common);
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+
+static struct cbq_class *
+cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
+{
+ struct cbq_class *cl;
+
+ for (cl = this->tparent; cl; cl = cl->tparent) {
+ struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
+
+ if (new != NULL && new != this)
+ return new;
+ }
+ return NULL;
+}
+
+#endif
+
+/* Classify packet. The procedure is pretty complicated, but
+ * it allows us to combine link sharing and priority scheduling
+ * transparently.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
+ */
+
+static struct cbq_class *
+cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *head = &q->link;
+ struct cbq_class **defmap;
+ struct cbq_class *cl = NULL;
+ u32 prio = skb->priority;
+ struct tcf_proto *fl;
+ struct tcf_result res;
+
+ /*
+ * Step 1. If skb->priority points to one of our classes, use it.
+ */
+ if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
+ (cl = cbq_class_lookup(q, prio)) != NULL)
+ return cl;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ for (;;) {
+ int result = 0;
+ defmap = head->defaults;
+
+ fl = rcu_dereference_bh(head->filter_list);
+ /*
+ * Step 2+n. Apply classifier.
+ */
+ result = tc_classify_compat(skb, fl, &res);
+ if (!fl || result < 0)
+ goto fallback;
+
+ cl = (void *)res.class;
+ if (!cl) {
+ if (TC_H_MAJ(res.classid))
+ cl = cbq_class_lookup(q, res.classid);
+ else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
+ cl = defmap[TC_PRIO_BESTEFFORT];
+
+ if (cl == NULL)
+ goto fallback;
+ }
+ if (cl->level >= head->level)
+ goto fallback;
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ case TC_ACT_RECLASSIFY:
+ return cbq_reclassify(skb, cl);
+ }
+#endif
+ if (cl->level == 0)
+ return cl;
+
+ /*
+ * Step 3+n. If classifier selected a link sharing class,
+ * apply agency specific classifier.
+ * Repeat this procdure until we hit a leaf node.
+ */
+ head = cl;
+ }
+
+fallback:
+ cl = head;
+
+ /*
+ * Step 4. No success...
+ */
+ if (TC_H_MAJ(prio) == 0 &&
+ !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
+ !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
+ return head;
+
+ return cl;
+}
+
+/*
+ * A packet has just been enqueued on the empty class.
+ * cbq_activate_class adds it to the tail of active class list
+ * of its priority band.
+ */
+
+static inline void cbq_activate_class(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ int prio = cl->cpriority;
+ struct cbq_class *cl_tail;
+
+ cl_tail = q->active[prio];
+ q->active[prio] = cl;
+
+ if (cl_tail != NULL) {
+ cl->next_alive = cl_tail->next_alive;
+ cl_tail->next_alive = cl;
+ } else {
+ cl->next_alive = cl;
+ q->activemask |= (1<<prio);
+ }
+}
+
+/*
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
+ */
+
+static void cbq_deactivate_class(struct cbq_class *this)
+{
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+ int prio = this->cpriority;
+ struct cbq_class *cl;
+ struct cbq_class *cl_prev = q->active[prio];
+
+ do {
+ cl = cl_prev->next_alive;
+ if (cl == this) {
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+
+ if (cl == q->active[prio]) {
+ q->active[prio] = cl_prev;
+ if (cl == q->active[prio]) {
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ return;
+ }
+ }
+ return;
+ }
+ } while ((cl_prev = cl) != q->active[prio]);
+}
+
+static void
+cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ int toplevel = q->toplevel;
+
+ if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
+ psched_time_t now = psched_get_time();
+
+ do {
+ if (cl->undertime < now) {
+ q->toplevel = cl->level;
+ return;
+ }
+ } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
+ }
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ int uninitialized_var(ret);
+ struct cbq_class *cl = cbq_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_ACT
+ q->rx_class = cl;
+#endif
+ if (cl == NULL) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+ }
+
+#ifdef CONFIG_NET_CLS_ACT
+ cl->q->__parent = sch;
+#endif
+ ret = qdisc_enqueue(skb, cl->q);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ cbq_mark_toplevel(q, cl);
+ if (!cl->next_alive)
+ cbq_activate_class(cl);
+ return ret;
+ }
+
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ cbq_mark_toplevel(q, cl);
+ cl->qstats.drops++;
+ }
+ return ret;
+}
+
+/* Overlimit actions */
+
+/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
+
+static void cbq_ovl_classic(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ psched_tdiff_t delay = cl->undertime - q->now;
+
+ if (!cl->delayed) {
+ delay += cl->offtime;
+
+ /*
+ * Class goes to sleep, so that it will have no
+ * chance to work avgidle. Let's forgive it 8)
+ *
+ * BTW cbq-2.0 has a crap in this
+ * place, apparently they forgot to shift it by cl->ewma_log.
+ */
+ if (cl->avgidle < 0)
+ delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+ if (cl->avgidle < cl->minidle)
+ cl->avgidle = cl->minidle;
+ if (delay <= 0)
+ delay = 1;
+ cl->undertime = q->now + delay;
+
+ cl->xstats.overactions++;
+ cl->delayed = 1;
+ }
+ if (q->wd_expires == 0 || q->wd_expires > delay)
+ q->wd_expires = delay;
+
+ /* Dirty work! We must schedule wakeups based on
+ * real available rate, rather than leaf rate,
+ * which may be tiny (even zero).
+ */
+ if (q->toplevel == TC_CBQ_MAXLEVEL) {
+ struct cbq_class *b;
+ psched_tdiff_t base_delay = q->wd_expires;
+
+ for (b = cl->borrow; b; b = b->borrow) {
+ delay = b->undertime - q->now;
+ if (delay < base_delay) {
+ if (delay <= 0)
+ delay = 1;
+ base_delay = delay;
+ }
+ }
+
+ q->wd_expires = base_delay;
+ }
+}
+
+/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
+ * they go overlimit
+ */
+
+static void cbq_ovl_rclassic(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ struct cbq_class *this = cl;
+
+ do {
+ if (cl->level > q->toplevel) {
+ cl = NULL;
+ break;
+ }
+ } while ((cl = cl->borrow) != NULL);
+
+ if (cl == NULL)
+ cl = this;
+ cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
+
+static void cbq_ovl_delay(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ psched_tdiff_t delay = cl->undertime - q->now;
+
+ if (test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(cl->qdisc)->state))
+ return;
+
+ if (!cl->delayed) {
+ psched_time_t sched = q->now;
+ ktime_t expires;
+
+ delay += cl->offtime;
+ if (cl->avgidle < 0)
+ delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+ if (cl->avgidle < cl->minidle)
+ cl->avgidle = cl->minidle;
+ cl->undertime = q->now + delay;
+
+ if (delay > 0) {
+ sched += delay + cl->penalty;
+ cl->penalized = sched;
+ cl->cpriority = TC_CBQ_MAXPRIO;
+ q->pmask |= (1<<TC_CBQ_MAXPRIO);
+
+ expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
+ if (hrtimer_try_to_cancel(&q->delay_timer) &&
+ ktime_to_ns(ktime_sub(
+ hrtimer_get_expires(&q->delay_timer),
+ expires)) > 0)
+ hrtimer_set_expires(&q->delay_timer, expires);
+ hrtimer_restart(&q->delay_timer);
+ cl->delayed = 1;
+ cl->xstats.overactions++;
+ return;
+ }
+ delay = 1;
+ }
+ if (q->wd_expires == 0 || q->wd_expires > delay)
+ q->wd_expires = delay;
+}
+
+/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
+
+static void cbq_ovl_lowprio(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+
+ cl->penalized = q->now + cl->penalty;
+
+ if (cl->cpriority != cl->priority2) {
+ cl->cpriority = cl->priority2;
+ q->pmask |= (1<<cl->cpriority);
+ cl->xstats.overactions++;
+ }
+ cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DROP: penalize class by dropping */
+
+static void cbq_ovl_drop(struct cbq_class *cl)
+{
+ if (cl->q->ops->drop)
+ if (cl->q->ops->drop(cl->q))
+ cl->qdisc->q.qlen--;
+ cl->xstats.overactions++;
+ cbq_ovl_classic(cl);
+}
+
+static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
+ psched_time_t now)
+{
+ struct cbq_class *cl;
+ struct cbq_class *cl_prev = q->active[prio];
+ psched_time_t sched = now;
+
+ if (cl_prev == NULL)
+ return 0;
+
+ do {
+ cl = cl_prev->next_alive;
+ if (now - cl->penalized > 0) {
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+ cl->cpriority = cl->priority;
+ cl->delayed = 0;
+ cbq_activate_class(cl);
+
+ if (cl == q->active[prio]) {
+ q->active[prio] = cl_prev;
+ if (cl == q->active[prio]) {
+ q->active[prio] = NULL;
+ return 0;
+ }
+ }
+
+ cl = cl_prev->next_alive;
+ } else if (sched - cl->penalized > 0)
+ sched = cl->penalized;
+ } while ((cl_prev = cl) != q->active[prio]);
+
+ return sched - now;
+}
+
+static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
+{
+ struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
+ delay_timer);
+ struct Qdisc *sch = q->watchdog.qdisc;
+ psched_time_t now;
+ psched_tdiff_t delay = 0;
+ unsigned int pmask;
+
+ now = psched_get_time();
+
+ pmask = q->pmask;
+ q->pmask = 0;
+
+ while (pmask) {
+ int prio = ffz(~pmask);
+ psched_tdiff_t tmp;
+
+ pmask &= ~(1<<prio);
+
+ tmp = cbq_undelay_prio(q, prio, now);
+ if (tmp > 0) {
+ q->pmask |= 1<<prio;
+ if (tmp < delay || delay == 0)
+ delay = tmp;
+ }
+ }
+
+ if (delay) {
+ ktime_t time;
+
+ time = ktime_set(0, 0);
+ time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
+ hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
+ }
+
+ qdisc_unthrottled(sch);
+ __netif_schedule(qdisc_root(sch));
+ return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
+{
+ struct Qdisc *sch = child->__parent;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = q->rx_class;
+
+ q->rx_class = NULL;
+
+ if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
+ int ret;
+
+ cbq_mark_toplevel(q, cl);
+
+ q->rx_class = cl;
+ cl->q->__parent = sch;
+
+ ret = qdisc_enqueue(skb, cl->q);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ if (!cl->next_alive)
+ cbq_activate_class(cl);
+ return 0;
+ }
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return 0;
+ }
+
+ qdisc_qstats_drop(sch);
+ return -1;
+}
+#endif
+
+/*
+ * It is mission critical procedure.
+ *
+ * We "regenerate" toplevel cutoff, if transmitting class
+ * has backlog and it is not regulated. It is not part of
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
+
+static inline void
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+ struct cbq_class *borrowed)
+{
+ if (cl && q->toplevel >= borrowed->level) {
+ if (cl->q->q.qlen > 1) {
+ do {
+ if (borrowed->undertime == PSCHED_PASTPERFECT) {
+ q->toplevel = borrowed->level;
+ return;
+ }
+ } while ((borrowed = borrowed->borrow) != NULL);
+ }
+#if 0
+ /* It is not necessary now. Uncommenting it
+ will save CPU cycles, but decrease fairness.
+ */
+ q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
+ }
+}
+
+static void
+cbq_update(struct cbq_sched_data *q)
+{
+ struct cbq_class *this = q->tx_class;
+ struct cbq_class *cl = this;
+ int len = q->tx_len;
+ psched_time_t now;
+
+ q->tx_class = NULL;
+ /* Time integrator. We calculate EOS time
+ * by adding expected packet transmission time.
+ */
+ now = q->now + L2T(&q->link, len);
+
+ for ( ; cl; cl = cl->share) {
+ long avgidle = cl->avgidle;
+ long idle;
+
+ cl->bstats.packets++;
+ cl->bstats.bytes += len;
+
+ /*
+ * (now - last) is total time between packet right edges.
+ * (last_pktlen/rate) is "virtual" busy time, so that
+ *
+ * idle = (now - last) - last_pktlen/rate
+ */
+
+ idle = now - cl->last;
+ if ((unsigned long)idle > 128*1024*1024) {
+ avgidle = cl->maxidle;
+ } else {
+ idle -= L2T(cl, len);
+
+ /* true_avgidle := (1-W)*true_avgidle + W*idle,
+ * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+ * cl->avgidle == true_avgidle/W,
+ * hence:
+ */
+ avgidle += idle - (avgidle>>cl->ewma_log);
+ }
+
+ if (avgidle <= 0) {
+ /* Overlimit or at-limit */
+
+ if (avgidle < cl->minidle)
+ avgidle = cl->minidle;
+
+ cl->avgidle = avgidle;
+
+ /* Calculate expected time, when this class
+ * will be allowed to send.
+ * It will occur, when:
+ * (1-W)*true_avgidle + W*delay = 0, i.e.
+ * idle = (1/W - 1)*(-true_avgidle)
+ * or
+ * idle = (1 - W)*(-cl->avgidle);
+ */
+ idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
+
+ /*
+ * That is not all.
+ * To maintain the rate allocated to the class,
+ * we add to undertime virtual clock,
+ * necessary to complete transmitted packet.
+ * (len/phys_bandwidth has been already passed
+ * to the moment of cbq_update)
+ */
+
+ idle -= L2T(&q->link, len);
+ idle += L2T(cl, len);
+
+ cl->undertime = now + idle;
+ } else {
+ /* Underlimit */
+
+ cl->undertime = PSCHED_PASTPERFECT;
+ if (avgidle > cl->maxidle)
+ cl->avgidle = cl->maxidle;
+ else
+ cl->avgidle = avgidle;
+ }
+ if ((s64)(now - cl->last) > 0)
+ cl->last = now;
+ }
+
+ cbq_update_toplevel(q, this, q->tx_borrowed);
+}
+
+static inline struct cbq_class *
+cbq_under_limit(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ struct cbq_class *this_cl = cl;
+
+ if (cl->tparent == NULL)
+ return cl;
+
+ if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
+ cl->delayed = 0;
+ return cl;
+ }
+
+ do {
+ /* It is very suspicious place. Now overlimit
+ * action is generated for not bounded classes
+ * only if link is completely congested.
+ * Though it is in agree with ancestor-only paradigm,
+ * it looks very stupid. Particularly,
+ * it means that this chunk of code will either
+ * never be called or result in strong amplification
+ * of burstiness. Dangerous, silly, and, however,
+ * no another solution exists.
+ */
+ cl = cl->borrow;
+ if (!cl) {
+ this_cl->qstats.overlimits++;
+ this_cl->overlimit(this_cl);
+ return NULL;
+ }
+ if (cl->level > q->toplevel)
+ return NULL;
+ } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
+
+ cl->delayed = 0;
+ return cl;
+}
+
+static inline struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl_tail, *cl_prev, *cl;
+ struct sk_buff *skb;
+ int deficit;
+
+ cl_tail = cl_prev = q->active[prio];
+ cl = cl_prev->next_alive;
+
+ do {
+ deficit = 0;
+
+ /* Start round */
+ do {
+ struct cbq_class *borrow = cl;
+
+ if (cl->q->q.qlen &&
+ (borrow = cbq_under_limit(cl)) == NULL)
+ goto skip_class;
+
+ if (cl->deficit <= 0) {
+ /* Class exhausted its allotment per
+ * this round. Switch to the next one.
+ */
+ deficit = 1;
+ cl->deficit += cl->quantum;
+ goto next_class;
+ }
+
+ skb = cl->q->dequeue(cl->q);
+
+ /* Class did not give us any skb :-(
+ * It could occur even if cl->q->q.qlen != 0
+ * f.e. if cl->q == "tbf"
+ */
+ if (skb == NULL)
+ goto skip_class;
+
+ cl->deficit -= qdisc_pkt_len(skb);
+ q->tx_class = cl;
+ q->tx_borrowed = borrow;
+ if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+ borrow->xstats.borrows++;
+ cl->xstats.borrows++;
+#else
+ borrow->xstats.borrows += qdisc_pkt_len(skb);
+ cl->xstats.borrows += qdisc_pkt_len(skb);
+#endif
+ }
+ q->tx_len = qdisc_pkt_len(skb);
+
+ if (cl->deficit <= 0) {
+ q->active[prio] = cl;
+ cl = cl->next_alive;
+ cl->deficit += cl->quantum;
+ }
+ return skb;
+
+skip_class:
+ if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
+ /* Class is empty or penalized.
+ * Unlink it from active chain.
+ */
+ cl_prev->next_alive = cl->next_alive;
+ cl->next_alive = NULL;
+
+ /* Did cl_tail point to it? */
+ if (cl == cl_tail) {
+ /* Repair it! */
+ cl_tail = cl_prev;
+
+ /* Was it the last class in this band? */
+ if (cl == cl_tail) {
+ /* Kill the band! */
+ q->active[prio] = NULL;
+ q->activemask &= ~(1<<prio);
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+ return NULL;
+ }
+
+ q->active[prio] = cl_tail;
+ }
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+
+ cl = cl_prev;
+ }
+
+next_class:
+ cl_prev = cl;
+ cl = cl->next_alive;
+ } while (cl_prev != cl_tail);
+ } while (deficit);
+
+ q->active[prio] = cl_prev;
+
+ return NULL;
+}
+
+static inline struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned int activemask;
+
+ activemask = q->activemask & 0xFF;
+ while (activemask) {
+ int prio = ffz(~activemask);
+ activemask &= ~(1<<prio);
+ skb = cbq_dequeue_prio(sch, prio);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ psched_time_t now;
+
+ now = psched_get_time();
+
+ if (q->tx_class)
+ cbq_update(q);
+
+ q->now = now;
+
+ for (;;) {
+ q->wd_expires = 0;
+
+ skb = cbq_dequeue_1(sch);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ qdisc_unthrottled(sch);
+ return skb;
+ }
+
+ /* All the classes are overlimit.
+ *
+ * It is possible, if:
+ *
+ * 1. Scheduler is empty.
+ * 2. Toplevel cutoff inhibited borrowing.
+ * 3. Root class is overlimit.
+ *
+ * Reset 2d and 3d conditions and retry.
+ *
+ * Note, that NS and cbq-2.0 are buggy, peeking
+ * an arbitrary class is appropriate for ancestor-only
+ * sharing, but not for toplevel algorithm.
+ *
+ * Our version is better, but slower, because it requires
+ * two passes, but it is unavoidable with top-level sharing.
+ */
+
+ if (q->toplevel == TC_CBQ_MAXLEVEL &&
+ q->link.undertime == PSCHED_PASTPERFECT)
+ break;
+
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->link.undertime = PSCHED_PASTPERFECT;
+ }
+
+ /* No packets in scheduler or nobody wants to give them to us :-(
+ * Sigh... start watchdog timer in the last case.
+ */
+
+ if (sch->q.qlen) {
+ qdisc_qstats_overlimit(sch);
+ if (q->wd_expires)
+ qdisc_watchdog_schedule(&q->watchdog,
+ now + q->wd_expires);
+ }
+ return NULL;
+}
+
+/* CBQ class maintanance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+ if (this == NULL)
+ return;
+
+ do {
+ int level = 0;
+ struct cbq_class *cl;
+
+ cl = this->children;
+ if (cl) {
+ do {
+ if (cl->level > level)
+ level = cl->level;
+ } while ((cl = cl->sibling) != this->children);
+ }
+ this->level = level + 1;
+ } while ((this = this->tparent) != NULL);
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+ struct cbq_class *cl;
+ unsigned int h;
+
+ if (q->quanta[prio] == 0)
+ return;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ /* BUGGGG... Beware! This expression suffer of
+ * arithmetic overflows!
+ */
+ if (cl->priority == prio) {
+ cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+ q->quanta[prio];
+ }
+ if (cl->quantum <= 0 ||
+ cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
+ pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+ cl->common.classid, cl->quantum);
+ cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+ }
+ }
+ }
+}
+
+static void cbq_sync_defmap(struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+ struct cbq_class *split = cl->split;
+ unsigned int h;
+ int i;
+
+ if (split == NULL)
+ return;
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
+ split->defaults[i] = NULL;
+ }
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ int level = split->level;
+
+ if (split->defaults[i])
+ continue;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ struct cbq_class *c;
+
+ hlist_for_each_entry(c, &q->clhash.hash[h],
+ common.hnode) {
+ if (c->split == split && c->level < level &&
+ c->defmap & (1<<i)) {
+ split->defaults[i] = c;
+ level = c->level;
+ }
+ }
+ }
+ }
+}
+
+static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
+{
+ struct cbq_class *split = NULL;
+
+ if (splitid == 0) {
+ split = cl->split;
+ if (!split)
+ return;
+ splitid = split->common.classid;
+ }
+
+ if (split == NULL || split->common.classid != splitid) {
+ for (split = cl->tparent; split; split = split->tparent)
+ if (split->common.classid == splitid)
+ break;
+ }
+
+ if (split == NULL)
+ return;
+
+ if (cl->split != split) {
+ cl->defmap = 0;
+ cbq_sync_defmap(cl);
+ cl->split = split;
+ cl->defmap = def & mask;
+ } else
+ cl->defmap = (cl->defmap & ~mask) | (def & mask);
+
+ cbq_sync_defmap(cl);
+}
+
+static void cbq_unlink_class(struct cbq_class *this)
+{
+ struct cbq_class *cl, **clp;
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+
+ qdisc_class_hash_remove(&q->clhash, &this->common);
+
+ if (this->tparent) {
+ clp = &this->sibling;
+ cl = *clp;
+ do {
+ if (cl == this) {
+ *clp = cl->sibling;
+ break;
+ }
+ clp = &cl->sibling;
+ } while ((cl = *clp) != this->sibling);
+
+ if (this->tparent->children == this) {
+ this->tparent->children = this->sibling;
+ if (this->sibling == this)
+ this->tparent->children = NULL;
+ }
+ } else {
+ WARN_ON(this->sibling != this);
+ }
+}
+
+static void cbq_link_class(struct cbq_class *this)
+{
+ struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+ struct cbq_class *parent = this->tparent;
+
+ this->sibling = this;
+ qdisc_class_hash_insert(&q->clhash, &this->common);
+
+ if (parent == NULL)
+ return;
+
+ if (parent->children == NULL) {
+ parent->children = this;
+ } else {
+ this->sibling = parent->children->sibling;
+ parent->children->sibling = this;
+ }
+}
+
+static unsigned int cbq_drop(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl, *cl_head;
+ int prio;
+ unsigned int len;
+
+ for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
+ cl_head = q->active[prio];
+ if (!cl_head)
+ continue;
+
+ cl = cl_head;
+ do {
+ if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
+ sch->q.qlen--;
+ if (!cl->q->q.qlen)
+ cbq_deactivate_class(cl);
+ return len;
+ }
+ } while ((cl = cl->next_alive) != cl_head);
+ }
+ return 0;
+}
+
+static void
+cbq_reset(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl;
+ int prio;
+ unsigned int h;
+
+ q->activemask = 0;
+ q->pmask = 0;
+ q->tx_class = NULL;
+ q->tx_borrowed = NULL;
+ qdisc_watchdog_cancel(&q->watchdog);
+ hrtimer_cancel(&q->delay_timer);
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->now = psched_get_time();
+
+ for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
+ q->active[prio] = NULL;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ qdisc_reset(cl->q);
+
+ cl->next_alive = NULL;
+ cl->undertime = PSCHED_PASTPERFECT;
+ cl->avgidle = cl->maxidle;
+ cl->deficit = cl->quantum;
+ cl->cpriority = cl->priority;
+ }
+ }
+ sch->q.qlen = 0;
+}
+
+
+static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
+{
+ if (lss->change & TCF_CBQ_LSS_FLAGS) {
+ cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+ cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+ }
+ if (lss->change & TCF_CBQ_LSS_EWMA)
+ cl->ewma_log = lss->ewma_log;
+ if (lss->change & TCF_CBQ_LSS_AVPKT)
+ cl->avpkt = lss->avpkt;
+ if (lss->change & TCF_CBQ_LSS_MINIDLE)
+ cl->minidle = -(long)lss->minidle;
+ if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
+ cl->maxidle = lss->maxidle;
+ cl->avgidle = lss->maxidle;
+ }
+ if (lss->change & TCF_CBQ_LSS_OFFTIME)
+ cl->offtime = lss->offtime;
+ return 0;
+}
+
+static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ q->nclasses[cl->priority]--;
+ q->quanta[cl->priority] -= cl->weight;
+ cbq_normalize_quanta(q, cl->priority);
+}
+
+static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+ q->nclasses[cl->priority]++;
+ q->quanta[cl->priority] += cl->weight;
+ cbq_normalize_quanta(q, cl->priority);
+}
+
+static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
+{
+ struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+
+ if (wrr->allot)
+ cl->allot = wrr->allot;
+ if (wrr->weight)
+ cl->weight = wrr->weight;
+ if (wrr->priority) {
+ cl->priority = wrr->priority - 1;
+ cl->cpriority = cl->priority;
+ if (cl->priority >= cl->priority2)
+ cl->priority2 = TC_CBQ_MAXPRIO - 1;
+ }
+
+ cbq_addprio(q, cl);
+ return 0;
+}
+
+static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
+{
+ switch (ovl->strategy) {
+ case TC_CBQ_OVL_CLASSIC:
+ cl->overlimit = cbq_ovl_classic;
+ break;
+ case TC_CBQ_OVL_DELAY:
+ cl->overlimit = cbq_ovl_delay;
+ break;
+ case TC_CBQ_OVL_LOWPRIO:
+ if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
+ ovl->priority2 - 1 <= cl->priority)
+ return -EINVAL;
+ cl->priority2 = ovl->priority2 - 1;
+ cl->overlimit = cbq_ovl_lowprio;
+ break;
+ case TC_CBQ_OVL_DROP:
+ cl->overlimit = cbq_ovl_drop;
+ break;
+ case TC_CBQ_OVL_RCLASSIC:
+ cl->overlimit = cbq_ovl_rclassic;
+ break;
+ default:
+ return -EINVAL;
+ }
+ cl->penalty = ovl->penalty;
+ return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
+{
+ cl->police = p->police;
+
+ if (cl->q->handle) {
+ if (p->police == TC_POLICE_RECLASSIFY)
+ cl->q->reshape_fail = cbq_reshape_fail;
+ else
+ cl->q->reshape_fail = NULL;
+ }
+ return 0;
+}
+#endif
+
+static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
+{
+ cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
+ return 0;
+}
+
+static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
+ [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
+ [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
+ [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
+ [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
+ [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
+ [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
+};
+
+static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CBQ_MAX + 1];
+ struct tc_ratespec *r;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
+ return -EINVAL;
+
+ r = nla_data(tb[TCA_CBQ_RATE]);
+
+ if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
+ return -EINVAL;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ goto put_rtab;
+
+ q->link.refcnt = 1;
+ q->link.sibling = &q->link;
+ q->link.common.classid = sch->handle;
+ q->link.qdisc = sch;
+ q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle);
+ if (!q->link.q)
+ q->link.q = &noop_qdisc;
+
+ q->link.priority = TC_CBQ_MAXPRIO - 1;
+ q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+ q->link.cpriority = TC_CBQ_MAXPRIO - 1;
+ q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
+ q->link.overlimit = cbq_ovl_classic;
+ q->link.allot = psched_mtu(qdisc_dev(sch));
+ q->link.quantum = q->link.allot;
+ q->link.weight = q->link.R_tab->rate.rate;
+
+ q->link.ewma_log = TC_CBQ_DEF_EWMA;
+ q->link.avpkt = q->link.allot/2;
+ q->link.minidle = -0x7FFFFFFF;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+ hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ q->delay_timer.function = cbq_undelay;
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->now = psched_get_time();
+
+ cbq_link_class(&q->link);
+
+ if (tb[TCA_CBQ_LSSOPT])
+ cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+ cbq_addprio(q, &q->link);
+ return 0;
+
+put_rtab:
+ qdisc_put_rtab(q->link.R_tab);
+ return err;
+}
+
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+
+ if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_lssopt opt;
+
+ opt.flags = 0;
+ if (cl->borrow == NULL)
+ opt.flags |= TCF_CBQ_LSS_BOUNDED;
+ if (cl->share == NULL)
+ opt.flags |= TCF_CBQ_LSS_ISOLATED;
+ opt.ewma_log = cl->ewma_log;
+ opt.level = cl->level;
+ opt.avpkt = cl->avpkt;
+ opt.maxidle = cl->maxidle;
+ opt.minidle = (u32)(-cl->minidle);
+ opt.offtime = cl->offtime;
+ opt.change = ~0;
+ if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_wrropt opt;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.flags = 0;
+ opt.allot = cl->allot;
+ opt.priority = cl->priority + 1;
+ opt.cpriority = cl->cpriority + 1;
+ opt.weight = cl->weight;
+ if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_ovl opt;
+
+ opt.strategy = cl->ovl_strategy;
+ opt.priority2 = cl->priority2 + 1;
+ opt.pad = 0;
+ opt.penalty = cl->penalty;
+ if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_fopt opt;
+
+ if (cl->split || cl->defmap) {
+ opt.split = cl->split ? cl->split->common.classid : 0;
+ opt.defmap = cl->defmap;
+ opt.defchange = ~0;
+ if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
+ goto nla_put_failure;
+ }
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_cbq_police opt;
+
+ if (cl->police) {
+ opt.police = cl->police;
+ opt.__res1 = 0;
+ opt.__res2 = 0;
+ if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
+ goto nla_put_failure;
+ }
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+#endif
+
+static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
+{
+ if (cbq_dump_lss(skb, cl) < 0 ||
+ cbq_dump_rate(skb, cl) < 0 ||
+ cbq_dump_wrr(skb, cl) < 0 ||
+ cbq_dump_ovl(skb, cl) < 0 ||
+#ifdef CONFIG_NET_CLS_ACT
+ cbq_dump_police(skb, cl) < 0 ||
+#endif
+ cbq_dump_fopt(skb, cl) < 0)
+ return -1;
+ return 0;
+}
+
+static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (cbq_dump_attr(skb, &q->link) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ q->link.xstats.avgidle = q->link.avgidle;
+ return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
+}
+
+static int
+cbq_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+ struct nlattr *nest;
+
+ if (cl->tparent)
+ tcm->tcm_parent = cl->tparent->common.classid;
+ else
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (cbq_dump_attr(skb, cl) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ cl->xstats.avgidle = cl->avgidle;
+ cl->xstats.undertime = 0;
+
+ if (cl->undertime != PSCHED_PASTPERFECT)
+ cl->xstats.undertime = cl->undertime - q->now;
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, cl->common.classid);
+ if (new == NULL)
+ return -ENOBUFS;
+ } else {
+#ifdef CONFIG_NET_CLS_ACT
+ if (cl->police == TC_POLICE_RECLASSIFY)
+ new->reshape_fail = cbq_reshape_fail;
+#endif
+ }
+ sch_tree_lock(sch);
+ *old = cl->q;
+ cl->q = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ return cl->q;
+}
+
+static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (cl->q->q.qlen == 0)
+ cbq_deactivate_class(cl);
+}
+
+static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = cbq_class_lookup(q, classid);
+
+ if (cl) {
+ cl->refcnt++;
+ return (unsigned long)cl;
+ }
+ return 0;
+}
+
+static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ WARN_ON(cl->filters);
+
+ tcf_destroy_chain(&cl->filter_list);
+ qdisc_destroy(cl->q);
+ qdisc_put_rtab(cl->R_tab);
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ if (cl != &q->link)
+ kfree(cl);
+}
+
+static void cbq_destroy(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct cbq_class *cl;
+ unsigned int h;
+
+#ifdef CONFIG_NET_CLS_ACT
+ q->rx_class = NULL;
+#endif
+ /*
+ * Filters must be destroyed first because we don't destroy the
+ * classes from root to leafs which means that filters can still
+ * be bound to classes which have been destroyed already. --TGR '04
+ */
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
+ tcf_destroy_chain(&cl->filter_list);
+ }
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
+ common.hnode)
+ cbq_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static void cbq_put(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (--cl->refcnt == 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
+ struct cbq_sched_data *q = qdisc_priv(sch);
+
+ spin_lock_bh(root_lock);
+ if (q->rx_class == cl)
+ q->rx_class = NULL;
+ spin_unlock_bh(root_lock);
+#endif
+
+ cbq_destroy_class(sch, cl);
+ }
+}
+
+static int
+cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
+ unsigned long *arg)
+{
+ int err;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_CBQ_MAX + 1];
+ struct cbq_class *parent;
+ struct qdisc_rate_table *rtab = NULL;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
+ if (err < 0)
+ return err;
+
+ if (cl) {
+ /* Check parent */
+ if (parentid) {
+ if (cl->tparent &&
+ cl->tparent->common.classid != parentid)
+ return -EINVAL;
+ if (!cl->tparent && parentid != TC_H_ROOT)
+ return -EINVAL;
+ }
+
+ if (tb[TCA_CBQ_RATE]) {
+ rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
+ tb[TCA_CBQ_RTAB]);
+ if (rtab == NULL)
+ return -EINVAL;
+ }
+
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ qdisc_put_rtab(rtab);
+ return err;
+ }
+ }
+
+ /* Change class parameters */
+ sch_tree_lock(sch);
+
+ if (cl->next_alive != NULL)
+ cbq_deactivate_class(cl);
+
+ if (rtab) {
+ qdisc_put_rtab(cl->R_tab);
+ cl->R_tab = rtab;
+ }
+
+ if (tb[TCA_CBQ_LSSOPT])
+ cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+ if (tb[TCA_CBQ_WRROPT]) {
+ cbq_rmprio(q, cl);
+ cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+ }
+
+ if (tb[TCA_CBQ_OVL_STRATEGY])
+ cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
+
+#ifdef CONFIG_NET_CLS_ACT
+ if (tb[TCA_CBQ_POLICE])
+ cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
+#endif
+
+ if (tb[TCA_CBQ_FOPT])
+ cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+
+ if (cl->q->q.qlen)
+ cbq_activate_class(cl);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ if (parentid == TC_H_ROOT)
+ return -EINVAL;
+
+ if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
+ tb[TCA_CBQ_LSSOPT] == NULL)
+ return -EINVAL;
+
+ rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
+ if (rtab == NULL)
+ return -EINVAL;
+
+ if (classid) {
+ err = -EINVAL;
+ if (TC_H_MAJ(classid ^ sch->handle) ||
+ cbq_class_lookup(q, classid))
+ goto failure;
+ } else {
+ int i;
+ classid = TC_H_MAKE(sch->handle, 0x8000);
+
+ for (i = 0; i < 0x8000; i++) {
+ if (++q->hgenerator >= 0x8000)
+ q->hgenerator = 1;
+ if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
+ break;
+ }
+ err = -ENOSR;
+ if (i >= 0x8000)
+ goto failure;
+ classid = classid|q->hgenerator;
+ }
+
+ parent = &q->link;
+ if (parentid) {
+ parent = cbq_class_lookup(q, parentid);
+ err = -EINVAL;
+ if (parent == NULL)
+ goto failure;
+ }
+
+ err = -ENOBUFS;
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (cl == NULL)
+ goto failure;
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
+ }
+
+ cl->R_tab = rtab;
+ rtab = NULL;
+ cl->refcnt = 1;
+ cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+ if (!cl->q)
+ cl->q = &noop_qdisc;
+ cl->common.classid = classid;
+ cl->tparent = parent;
+ cl->qdisc = sch;
+ cl->allot = parent->allot;
+ cl->quantum = cl->allot;
+ cl->weight = cl->R_tab->rate.rate;
+
+ sch_tree_lock(sch);
+ cbq_link_class(cl);
+ cl->borrow = cl->tparent;
+ if (cl->tparent != &q->link)
+ cl->share = cl->tparent;
+ cbq_adjust_levels(parent);
+ cl->minidle = -0x7FFFFFFF;
+ cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+ cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+ if (cl->ewma_log == 0)
+ cl->ewma_log = q->link.ewma_log;
+ if (cl->maxidle == 0)
+ cl->maxidle = q->link.maxidle;
+ if (cl->avpkt == 0)
+ cl->avpkt = q->link.avpkt;
+ cl->overlimit = cbq_ovl_classic;
+ if (tb[TCA_CBQ_OVL_STRATEGY])
+ cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
+#ifdef CONFIG_NET_CLS_ACT
+ if (tb[TCA_CBQ_POLICE])
+ cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
+#endif
+ if (tb[TCA_CBQ_FOPT])
+ cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+failure:
+ qdisc_put_rtab(rtab);
+ return err;
+}
+
+static int cbq_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+ unsigned int qlen;
+
+ if (cl->filters || cl->children || cl == &q->link)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ qlen = cl->q->q.qlen;
+ qdisc_reset(cl->q);
+ qdisc_tree_decrease_qlen(cl->q, qlen);
+
+ if (cl->next_alive)
+ cbq_deactivate_class(cl);
+
+ if (q->tx_borrowed == cl)
+ q->tx_borrowed = q->tx_class;
+ if (q->tx_class == cl) {
+ q->tx_class = NULL;
+ q->tx_borrowed = NULL;
+ }
+#ifdef CONFIG_NET_CLS_ACT
+ if (q->rx_class == cl)
+ q->rx_class = NULL;
+#endif
+
+ cbq_unlink_class(cl);
+ cbq_adjust_levels(cl->tparent);
+ cl->defmap = 0;
+ cbq_sync_defmap(cl);
+
+ cbq_rmprio(q, cl);
+ sch_tree_unlock(sch);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ return 0;
+}
+
+static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
+ unsigned long arg)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ if (cl == NULL)
+ cl = &q->link;
+
+ return &cl->filter_list;
+}
+
+static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *p = (struct cbq_class *)parent;
+ struct cbq_class *cl = cbq_class_lookup(q, classid);
+
+ if (cl) {
+ if (p && p->level <= cl->level)
+ return 0;
+ cl->filters++;
+ return (unsigned long)cl;
+ }
+ return 0;
+}
+
+static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbq_class *cl = (struct cbq_class *)arg;
+
+ cl->filters--;
+}
+
+static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct cbq_class *cl;
+ unsigned int h;
+
+ if (arg->stop)
+ return;
+
+ for (h = 0; h < q->clhash.hashsize; h++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops cbq_class_ops = {
+ .graft = cbq_graft,
+ .leaf = cbq_leaf,
+ .qlen_notify = cbq_qlen_notify,
+ .get = cbq_get,
+ .put = cbq_put,
+ .change = cbq_change_class,
+ .delete = cbq_delete,
+ .walk = cbq_walk,
+ .tcf_chain = cbq_find_tcf,
+ .bind_tcf = cbq_bind_filter,
+ .unbind_tcf = cbq_unbind_filter,
+ .dump = cbq_dump_class,
+ .dump_stats = cbq_dump_class_stats,
+};
+
+static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &cbq_class_ops,
+ .id = "cbq",
+ .priv_size = sizeof(struct cbq_sched_data),
+ .enqueue = cbq_enqueue,
+ .dequeue = cbq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = cbq_drop,
+ .init = cbq_init,
+ .reset = cbq_reset,
+ .destroy = cbq_destroy,
+ .change = NULL,
+ .dump = cbq_dump,
+ .dump_stats = cbq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init cbq_module_init(void)
+{
+ return register_qdisc(&cbq_qdisc_ops);
+}
+static void __exit cbq_module_exit(void)
+{
+ unregister_qdisc(&cbq_qdisc_ops);
+}
+module_init(cbq_module_init)
+module_exit(cbq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 000000000..c009eb904
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,645 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <net/flow_keys.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct red_vars vars;
+ struct tcf_proto __rcu *filter_list;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ --sch->q.qlen;
+}
+
+/* private part of skb->cb[] that a qdisc is allowed to use
+ * is limited to QDISC_CB_PRIV_LEN bytes.
+ * As a flow key might be too large, we store a part of it only.
+ */
+#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
+
+struct choke_skb_cb {
+ u16 classid;
+ u8 keys_valid;
+ u8 keys[QDISC_CB_PRIV_LEN - 3];
+};
+
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
+ return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ choke_skb_cb(skb)->classid = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+ return choke_skb_cb(skb)->classid;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ struct flow_keys temp;
+
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ if (!choke_skb_cb(skb1)->keys_valid) {
+ choke_skb_cb(skb1)->keys_valid = 1;
+ skb_flow_dissect(skb1, &temp);
+ memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
+ }
+
+ if (!choke_skb_cb(skb2)->keys_valid) {
+ choke_skb_cb(skb2)->keys_valid = 1;
+ skb_flow_dissect(skb2, &temp);
+ memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
+ }
+
+ return !memcmp(&choke_skb_cb(skb1)->keys,
+ &choke_skb_cb(skb2)->keys,
+ CHOKE_K_LEN);
+}
+
+/*
+ * Classify flow using either:
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification
+ * 3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+ struct Qdisc *sch, int *qerr)
+
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ choke_set_classid(skb, TC_H_MIN(res.classid));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ if (rcu_access_pointer(q->filter_list))
+ return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ struct choke_sched_data *q = qdisc_priv(sch);
+ const struct red_parms *p = &q->parms;
+
+ if (rcu_access_pointer(q->filter_list)) {
+ /* If using external classifiers, get result and record it. */
+ if (!choke_classify(skb, sch, &ret))
+ goto other_drop; /* Packet was eaten by filter */
+ }
+
+ choke_skb_cb(skb)->keys_valid = 0;
+ /* Compute average queue usage (see RED) */
+ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ /* Is queue small? */
+ if (q->vars.qavg <= p->qth_min)
+ q->vars.qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (q->vars.qavg > p->qth_max) {
+ q->vars.qcount = -1;
+
+ qdisc_qstats_overlimit(sch);
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++q->vars.qcount) {
+ if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
+ q->vars.qcount = 0;
+ q->vars.qR = red_random(p);
+
+ qdisc_qstats_overlimit(sch);
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ q->vars.qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ qdisc_qstats_backlog_inc(sch, skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ return qdisc_drop(skb, sch);
+
+congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->vars);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
+};
+
+
+static void choke_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+ u32 max_P;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ qdisc_qstats_backlog_dec(sch, skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->vars);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put,
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
new file mode 100644
index 000000000..7a0bdb16a
--- /dev/null
+++ b/net/sched/sch_codel.c
@@ -0,0 +1,276 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm
+ *
+ * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
+ * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
+ *
+ * Implemented on linux by :
+ * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
+ * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/prefetch.h>
+#include <net/pkt_sched.h>
+#include <net/codel.h>
+
+
+#define DEFAULT_CODEL_LIMIT 1000
+
+struct codel_sched_data {
+ struct codel_params params;
+ struct codel_vars vars;
+ struct codel_stats stats;
+ u32 drop_overlimit;
+};
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ return skb;
+}
+
+static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+
+ /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->stats.drop_count && sch->q.qlen) {
+ qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+ q->stats.drop_count = 0;
+ }
+ if (skb)
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct codel_sched_data *q;
+
+ if (likely(qdisc_qlen(sch) < sch->limit)) {
+ codel_set_enqueue_time(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+ q = qdisc_priv(sch);
+ q->drop_overlimit++;
+ return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
+ [TCA_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_CODEL_ECN] = { .type = NLA_U32 },
+};
+
+static int codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CODEL_MAX + 1];
+ unsigned int qlen;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_CODEL_TARGET]) {
+ u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
+
+ q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_INTERVAL]) {
+ u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
+
+ q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
+
+ if (tb[TCA_CODEL_ECN])
+ q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ sch->limit = DEFAULT_CODEL_LIMIT;
+
+ codel_params_init(&q->params, sch);
+ codel_vars_init(&q->vars);
+ codel_stats_init(&q->stats);
+
+ if (opt) {
+ int err = codel_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ return 0;
+}
+
+static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CODEL_TARGET,
+ codel_time_to_us(q->params.target)) ||
+ nla_put_u32(skb, TCA_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_CODEL_INTERVAL,
+ codel_time_to_us(q->params.interval)) ||
+ nla_put_u32(skb, TCA_CODEL_ECN,
+ q->params.ecn))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+}
+
+static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ const struct codel_sched_data *q = qdisc_priv(sch);
+ struct tc_codel_xstats st = {
+ .maxpacket = q->stats.maxpacket,
+ .count = q->vars.count,
+ .lastcount = q->vars.lastcount,
+ .drop_overlimit = q->drop_overlimit,
+ .ldelay = codel_time_to_us(q->vars.ldelay),
+ .dropping = q->vars.dropping,
+ .ecn_mark = q->stats.ecn_mark,
+ };
+
+ if (q->vars.dropping) {
+ codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
+
+ if (delta >= 0)
+ st.drop_next = codel_time_to_us(delta);
+ else
+ st.drop_next = -codel_time_to_us(-delta);
+ }
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void codel_reset(struct Qdisc *sch)
+{
+ struct codel_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ codel_vars_init(&q->vars);
+}
+
+static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
+ .id = "codel",
+ .priv_size = sizeof(struct codel_sched_data),
+
+ .enqueue = codel_qdisc_enqueue,
+ .dequeue = codel_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = codel_init,
+ .reset = codel_reset,
+ .change = codel_change,
+ .dump = codel_dump,
+ .dump_stats = codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init codel_module_init(void)
+{
+ return register_qdisc(&codel_qdisc_ops);
+}
+
+static void __exit codel_module_exit(void)
+{
+ unregister_qdisc(&codel_qdisc_ops);
+}
+
+module_init(codel_module_init)
+module_exit(codel_module_exit)
+
+MODULE_DESCRIPTION("Controlled Delay queue discipline");
+MODULE_AUTHOR("Dave Taht");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
new file mode 100644
index 000000000..338706092
--- /dev/null
+++ b/net/sched/sch_drr.c
@@ -0,0 +1,529 @@
+/*
+ * net/sched/sch_drr.c Deficit Round Robin scheduler
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct drr_class {
+ struct Qdisc_class_common common;
+ unsigned int refcnt;
+ unsigned int filter_cnt;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct gnet_stats_rate_est64 rate_est;
+ struct list_head alist;
+ struct Qdisc *qdisc;
+
+ u32 quantum;
+ u32 deficit;
+};
+
+struct drr_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct Qdisc_class_hash clhash;
+};
+
+static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct drr_class, common);
+}
+
+static void drr_purge_queue(struct drr_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_decrease_qlen(cl->qdisc, len);
+}
+
+static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
+ [TCA_DRR_QUANTUM] = { .type = NLA_U32 },
+};
+
+static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl = (struct drr_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_DRR_MAX + 1];
+ u32 quantum;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_DRR_QUANTUM]) {
+ quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
+ if (quantum == 0)
+ return -EINVAL;
+ } else
+ quantum = psched_mtu(qdisc_dev(sch));
+
+ if (cl != NULL) {
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+
+ sch_tree_lock(sch);
+ if (tb[TCA_DRR_QUANTUM])
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ cl->refcnt = 1;
+ cl->common.classid = classid;
+ cl->quantum = quantum;
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+ return err;
+ }
+ }
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+}
+
+static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
+{
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+}
+
+static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (cl->filter_cnt > 0)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ drr_purge_queue(cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
+{
+ struct drr_class *cl = drr_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->refcnt++;
+
+ return (unsigned long)cl;
+}
+
+static void drr_put_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (--cl->refcnt == 0)
+ drr_destroy_class(sch, cl);
+}
+
+static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+
+ return &q->filter_list;
+}
+
+static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct drr_class *cl = drr_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->filter_cnt++;
+
+ return (unsigned long)cl;
+}
+
+static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, cl->common.classid);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ sch_tree_lock(sch);
+ drr_purge_queue(cl);
+ *old = cl->qdisc;
+ cl->qdisc = new;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ return cl->qdisc;
+}
+
+static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+}
+
+static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct drr_class *cl = (struct drr_class *)arg;
+ __u32 qlen = cl->qdisc->q.qlen;
+ struct tc_drr_stats xstats;
+
+ memset(&xstats, 0, sizeof(xstats));
+ if (qlen)
+ xstats.deficit = cl->deficit;
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+ cl = drr_find_class(sch, skb->priority);
+ if (cl != NULL)
+ return cl;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct drr_class *)res.class;
+ if (cl == NULL)
+ cl = drr_find_class(sch, res.classid);
+ return cl;
+ }
+ return NULL;
+}
+
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ int err = 0;
+
+ cl = drr_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (cl->qdisc->q.qlen == 1) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *drr_dequeue(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct sk_buff *skb;
+ unsigned int len;
+
+ if (list_empty(&q->active))
+ goto out;
+ while (1) {
+ cl = list_first_entry(&q->active, struct drr_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (skb == NULL) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+
+ bstats_update(&cl->bstats, skb);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static unsigned int drr_drop(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ unsigned int len;
+
+ list_for_each_entry(cl, &q->active, alist) {
+ if (cl->qdisc->ops->drop) {
+ len = cl->qdisc->ops->drop(cl->qdisc);
+ if (len > 0) {
+ sch->q.qlen--;
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+ return len;
+ }
+ }
+ }
+ return 0;
+}
+
+static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ int err;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+ INIT_LIST_HEAD(&q->active);
+ return 0;
+}
+
+static void drr_reset_qdisc(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->qdisc->q.qlen)
+ list_del(&cl->alist);
+ qdisc_reset(cl->qdisc);
+ }
+ }
+ sch->q.qlen = 0;
+}
+
+static void drr_destroy_qdisc(struct Qdisc *sch)
+{
+ struct drr_sched *q = qdisc_priv(sch);
+ struct drr_class *cl;
+ struct hlist_node *next;
+ unsigned int i;
+
+ tcf_destroy_chain(&q->filter_list);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode)
+ drr_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops drr_class_ops = {
+ .change = drr_change_class,
+ .delete = drr_delete_class,
+ .get = drr_get_class,
+ .put = drr_put_class,
+ .tcf_chain = drr_tcf_chain,
+ .bind_tcf = drr_bind_tcf,
+ .unbind_tcf = drr_unbind_tcf,
+ .graft = drr_graft_class,
+ .leaf = drr_class_leaf,
+ .qlen_notify = drr_qlen_notify,
+ .dump = drr_dump_class,
+ .dump_stats = drr_dump_class_stats,
+ .walk = drr_walk,
+};
+
+static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
+ .cl_ops = &drr_class_ops,
+ .id = "drr",
+ .priv_size = sizeof(struct drr_sched),
+ .enqueue = drr_enqueue,
+ .dequeue = drr_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = drr_drop,
+ .init = drr_init_qdisc,
+ .reset = drr_reset_qdisc,
+ .destroy = drr_destroy_qdisc,
+ .owner = THIS_MODULE,
+};
+
+static int __init drr_init(void)
+{
+ return register_qdisc(&drr_qdisc_ops);
+}
+
+static void __exit drr_exit(void)
+{
+ unregister_qdisc(&drr_qdisc_ops);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
new file mode 100644
index 000000000..66700a611
--- /dev/null
+++ b/net/sched/sch_dsmark.c
@@ -0,0 +1,514 @@
+/* net/sched/sch_dsmark.c - Differentiated Services field marker */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <net/pkt_sched.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <asm/byteorder.h>
+
+/*
+ * classid class marking
+ * ------- ----- -------
+ * n/a 0 n/a
+ * x:0 1 use entry [0]
+ * ... ... ...
+ * x:y y>0 y+1 use entry [y]
+ * ... ... ...
+ * x:indices-1 indices use entry [indices-1]
+ * ... ... ...
+ * x:y y+1 use entry [y & (indices-1)]
+ * ... ... ...
+ * 0xffff 0x10000 use entry [indices-1]
+ */
+
+
+#define NO_DEFAULT_INDEX (1 << 16)
+
+struct dsmark_qdisc_data {
+ struct Qdisc *q;
+ struct tcf_proto __rcu *filter_list;
+ u8 *mask; /* "owns" the array */
+ u8 *value;
+ u16 indices;
+ u32 default_index; /* index range is 0...0xffff */
+ int set_tc_index;
+};
+
+static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
+{
+ return index <= p->indices && index > 0;
+}
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
+ __func__, sch, p, new, old);
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ sch_tree_lock(sch);
+ *old = p->q;
+ p->q = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ return p->q;
+}
+
+static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
+{
+ pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
+ __func__, sch, qdisc_priv(sch), classid);
+
+ return TC_H_MIN(classid) + 1;
+}
+
+static unsigned long dsmark_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return dsmark_get(sch, classid);
+}
+
+static void dsmark_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
+ [TCA_DSMARK_INDICES] = { .type = NLA_U16 },
+ [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
+ [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
+ [TCA_DSMARK_MASK] = { .type = NLA_U8 },
+ [TCA_DSMARK_VALUE] = { .type = NLA_U8 },
+};
+
+static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_DSMARK_MAX + 1];
+ int err = -EINVAL;
+ u8 mask = 0;
+
+ pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
+ __func__, sch, p, classid, parent, *arg);
+
+ if (!dsmark_valid_index(p, *arg)) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (!opt)
+ goto errout;
+
+ err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
+ if (err < 0)
+ goto errout;
+
+ if (tb[TCA_DSMARK_MASK])
+ mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
+
+ if (tb[TCA_DSMARK_VALUE])
+ p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+
+ if (tb[TCA_DSMARK_MASK])
+ p->mask[*arg - 1] = mask;
+
+ err = 0;
+
+errout:
+ return err;
+}
+
+static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ if (!dsmark_valid_index(p, arg))
+ return -EINVAL;
+
+ p->mask[arg - 1] = 0xff;
+ p->value[arg - 1] = 0;
+
+ return 0;
+}
+
+static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ int i;
+
+ pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
+ __func__, sch, p, walker);
+
+ if (walker->stop)
+ return;
+
+ for (i = 0; i < p->indices; i++) {
+ if (p->mask[i] == 0xff && !p->value[i])
+ goto ignore;
+ if (walker->count >= walker->skip) {
+ if (walker->fn(sch, i + 1, walker) < 0) {
+ walker->stop = 1;
+ break;
+ }
+ }
+ignore:
+ walker->count++;
+ }
+}
+
+static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ return &p->filter_list;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ int err;
+
+ pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
+
+ if (p->set_tc_index) {
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ if (skb_cow_head(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
+ & ~INET_ECN_MASK;
+ break;
+
+ case htons(ETH_P_IPV6):
+ if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
+ & ~INET_ECN_MASK;
+ break;
+ default:
+ skb->tc_index = 0;
+ break;
+ }
+ }
+
+ if (TC_H_MAJ(skb->priority) == sch->handle)
+ skb->tc_index = TC_H_MIN(skb->priority);
+ else {
+ struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
+ int result = tc_classify(skb, fl, &res);
+
+ pr_debug("result %d class 0x%04x\n", result, res.classid);
+
+ switch (result) {
+#ifdef CONFIG_NET_CLS_ACT
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ kfree_skb(skb);
+ return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+
+ case TC_ACT_SHOT:
+ goto drop;
+#endif
+ case TC_ACT_OK:
+ skb->tc_index = TC_H_MIN(res.classid);
+ break;
+
+ default:
+ if (p->default_index != NO_DEFAULT_INDEX)
+ skb->tc_index = p->default_index;
+ break;
+ }
+ }
+
+ err = qdisc_enqueue(skb, p->q);
+ if (err != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(err))
+ qdisc_qstats_drop(sch);
+ return err;
+ }
+
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+
+drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+ u32 index;
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ skb = p->q->ops->dequeue(p->q);
+ if (skb == NULL)
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+
+ index = skb->tc_index & (p->indices - 1);
+ pr_debug("index %d->%d\n", skb->tc_index, index);
+
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
+ p->value[index]);
+ break;
+ case htons(ETH_P_IPV6):
+ ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
+ p->value[index]);
+ break;
+ default:
+ /*
+ * Only complain if a change was actually attempted.
+ * This way, we can send non-IP traffic through dsmark
+ * and don't need yet another qdisc as a bypass.
+ */
+ if (p->mask[index] != 0xff || p->value[index])
+ pr_warn("%s: unsupported protocol %d\n",
+ __func__, ntohs(tc_skb_protocol(skb)));
+ break;
+ }
+
+ return skb;
+}
+
+static struct sk_buff *dsmark_peek(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ return p->q->ops->peek(p->q);
+}
+
+static unsigned int dsmark_drop(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ unsigned int len;
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ if (p->q->ops->drop == NULL)
+ return 0;
+
+ len = p->q->ops->drop(p->q);
+ if (len)
+ sch->q.qlen--;
+
+ return len;
+}
+
+static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *tb[TCA_DSMARK_MAX + 1];
+ int err = -EINVAL;
+ u32 default_index = NO_DEFAULT_INDEX;
+ u16 indices;
+ u8 *mask;
+
+ pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
+
+ if (!opt)
+ goto errout;
+
+ err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
+ if (err < 0)
+ goto errout;
+
+ err = -EINVAL;
+ indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
+
+ if (hweight32(indices) != 1)
+ goto errout;
+
+ if (tb[TCA_DSMARK_DEFAULT_INDEX])
+ default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
+
+ mask = kmalloc(indices * 2, GFP_KERNEL);
+ if (mask == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ p->mask = mask;
+ memset(p->mask, 0xff, indices);
+
+ p->value = p->mask + indices;
+ memset(p->value, 0, indices);
+
+ p->indices = indices;
+ p->default_index = default_index;
+ p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
+
+ p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
+ if (p->q == NULL)
+ p->q = &noop_qdisc;
+
+ pr_debug("%s: qdisc %p\n", __func__, p->q);
+
+ err = 0;
+errout:
+ return err;
+}
+
+static void dsmark_reset(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+ qdisc_reset(p->q);
+ sch->q.qlen = 0;
+}
+
+static void dsmark_destroy(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+ pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
+
+ tcf_destroy_chain(&p->filter_list);
+ qdisc_destroy(p->q);
+ kfree(p->mask);
+}
+
+static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+
+ pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
+
+ if (!dsmark_valid_index(p, cl))
+ return -EINVAL;
+
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
+ tcm->tcm_info = p->q->handle;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
+ nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
+ goto nla_put_failure;
+
+ if (p->default_index != NO_DEFAULT_INDEX &&
+ nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
+ goto nla_put_failure;
+
+ if (p->set_tc_index &&
+ nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops dsmark_class_ops = {
+ .graft = dsmark_graft,
+ .leaf = dsmark_leaf,
+ .get = dsmark_get,
+ .put = dsmark_put,
+ .change = dsmark_change,
+ .delete = dsmark_delete,
+ .walk = dsmark_walk,
+ .tcf_chain = dsmark_find_tcf,
+ .bind_tcf = dsmark_bind_filter,
+ .unbind_tcf = dsmark_put,
+ .dump = dsmark_dump_class,
+};
+
+static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &dsmark_class_ops,
+ .id = "dsmark",
+ .priv_size = sizeof(struct dsmark_qdisc_data),
+ .enqueue = dsmark_enqueue,
+ .dequeue = dsmark_dequeue,
+ .peek = dsmark_peek,
+ .drop = dsmark_drop,
+ .init = dsmark_init,
+ .reset = dsmark_reset,
+ .destroy = dsmark_destroy,
+ .change = NULL,
+ .dump = dsmark_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init dsmark_module_init(void)
+{
+ return register_qdisc(&dsmark_qdisc_ops);
+}
+
+static void __exit dsmark_module_exit(void)
+{
+ unregister_qdisc(&dsmark_qdisc_ops);
+}
+
+module_init(dsmark_module_init)
+module_exit(dsmark_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 000000000..2e2398cfc
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,180 @@
+/*
+ * net/sched/sch_fifo.c The simplest FIFO queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/* 1 band FIFO pseudo-"scheduler" */
+
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ if (likely(skb_queue_len(&sch->q) < sch->limit))
+ return qdisc_enqueue_tail(skb, sch);
+
+ /* queue full, remove one skb to fulfill the limit */
+ __qdisc_queue_drop_head(sch, &sch->q);
+ qdisc_qstats_drop(sch);
+ qdisc_enqueue_tail(skb, sch);
+
+ return NET_XMIT_CN;
+}
+
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ bool bypass;
+ bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
+
+ if (opt == NULL) {
+ u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
+
+ if (is_bfifo)
+ limit *= psched_mtu(qdisc_dev(sch));
+
+ sch->limit = limit;
+ } else {
+ struct tc_fifo_qopt *ctl = nla_data(opt);
+
+ if (nla_len(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ sch->limit = ctl->limit;
+ }
+
+ if (is_bfifo)
+ bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
+ else
+ bypass = sch->limit >= 1;
+
+ if (bypass)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_fifo_qopt opt = { .limit = sch->limit };
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
+ .id = "pfifo",
+ .priv_size = 0,
+ .enqueue = pfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .drop = qdisc_queue_drop,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(pfifo_qdisc_ops);
+
+struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
+ .id = "bfifo",
+ .priv_size = 0,
+ .enqueue = bfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .drop = qdisc_queue_drop,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+EXPORT_SYMBOL(bfifo_qdisc_ops);
+
+struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
+ .id = "pfifo_head_drop",
+ .priv_size = 0,
+ .enqueue = pfifo_tail_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .peek = qdisc_peek_head,
+ .drop = qdisc_queue_drop_head,
+ .init = fifo_init,
+ .reset = qdisc_reset_queue,
+ .change = fifo_init,
+ .dump = fifo_dump,
+ .owner = THIS_MODULE,
+};
+
+/* Pass size change message down to embedded FIFO */
+int fifo_set_limit(struct Qdisc *q, unsigned int limit)
+{
+ struct nlattr *nla;
+ int ret = -ENOMEM;
+
+ /* Hack to avoid sending change message to non-FIFO */
+ if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+ return 0;
+
+ nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
+ if (nla) {
+ nla->nla_type = RTM_NEWQDISC;
+ nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
+ ((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
+
+ ret = q->ops->change(q, nla);
+ kfree(nla);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(fifo_set_limit);
+
+struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
+ unsigned int limit)
+{
+ struct Qdisc *q;
+ int err = -ENOMEM;
+
+ q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
+ if (q) {
+ err = fifo_set_limit(q, limit);
+ if (err < 0) {
+ qdisc_destroy(q);
+ q = NULL;
+ }
+ }
+
+ return q ? : ERR_PTR(err);
+}
+EXPORT_SYMBOL(fifo_create_dflt);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 000000000..f377702d4
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,873 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Meant to be mostly used for locally generated traffic :
+ * Fast classification depends on skb->sk being set before reaching us.
+ * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ * All packets belonging to a socket are considered as a 'flow'.
+ *
+ * Flows are dynamically allocated and stored in a hash table of RB trees
+ * They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ * Burst avoidance (aka pacing) capability :
+ *
+ * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ * bunch of packets, and this packet scheduler adds delay between
+ * packets to respect rate limitation.
+ *
+ * enqueue() :
+ * - lookup one RB tree (out of 1024 or more) to find the flow.
+ * If non existent flow, create it, add it to the tree.
+ * Add skb to the per flow list of skb (fifo).
+ * - Use a special fifo for high prio packets
+ *
+ * dequeue() : serves flows in Round Robin
+ * Note : When a flow becomes empty, we do not immediately remove it from
+ * rb trees, for performance reasons (its expected to send additional packets,
+ * or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+ struct sk_buff *head; /* list of skbs for this flow : first skb */
+ union {
+ struct sk_buff *tail; /* last skb in the list */
+ unsigned long age; /* jiffies when flow was emptied, for gc */
+ };
+ struct rb_node fq_node; /* anchor in fq_root[] trees */
+ struct sock *sk;
+ int qlen; /* number of packets in flow queue */
+ int credit;
+ u32 socket_hash; /* sk_hash */
+ struct fq_flow *next; /* next pointer in RR lists, or &detached */
+
+ struct rb_node rate_node; /* anchor in q->delayed tree */
+ u64 time_next_packet;
+};
+
+struct fq_flow_head {
+ struct fq_flow *first;
+ struct fq_flow *last;
+};
+
+struct fq_sched_data {
+ struct fq_flow_head new_flows;
+
+ struct fq_flow_head old_flows;
+
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+
+ struct fq_flow internal; /* for non classified or high prio packets */
+ u32 quantum;
+ u32 initial_quantum;
+ u32 flow_refill_delay;
+ u32 flow_max_rate; /* optional max rate per flow */
+ u32 flow_plimit; /* max packets per flow */
+ u32 orphan_mask; /* mask for orphaned skb */
+ struct rb_root *fq_root;
+ u8 rate_enable;
+ u8 fq_trees_log;
+
+ u32 flows;
+ u32 inactive_flows;
+ u32 throttled_flows;
+
+ u64 stat_gc_flows;
+ u64 stat_internal_packets;
+ u64 stat_tcp_retrans;
+ u64 stat_throttled;
+ u64 stat_flows_plimit;
+ u64 stat_pkts_too_long;
+ u64 stat_allocation_errors;
+ struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+ f->next = &detached;
+ f->age = jiffies;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+ return f->next == &detached;
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+ struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct fq_flow *aux;
+
+ parent = *p;
+ aux = container_of(parent, struct fq_flow, rate_node);
+ if (f->time_next_packet >= aux->time_next_packet)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&f->rate_node, parent, p);
+ rb_insert_color(&f->rate_node, &q->delayed);
+ q->throttled_flows++;
+ q->stat_throttled++;
+
+ f->next = &throttled;
+ if (q->time_next_delayed_flow > f->time_next_packet)
+ q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+ if (head->first)
+ head->last->next = flow;
+ else
+ head->first = flow;
+ head->last = flow;
+ flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+ return fq_flow_is_detached(f) &&
+ time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+ struct rb_root *root,
+ struct sock *sk)
+{
+ struct fq_flow *f, *tofree[FQ_GC_MAX];
+ struct rb_node **p, *parent;
+ int fcnt = 0;
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk)
+ break;
+
+ if (fq_gc_candidate(f)) {
+ tofree[fcnt++] = f;
+ if (fcnt == FQ_GC_MAX)
+ break;
+ }
+
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+ while (fcnt) {
+ struct fq_flow *f = tofree[--fcnt];
+
+ rb_erase(&f->fq_node, root);
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+}
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+ struct rb_node **p, *parent;
+ struct sock *sk = skb->sk;
+ struct rb_root *root;
+ struct fq_flow *f;
+
+ /* warning: no starvation prevention... */
+ if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+ return &q->internal;
+
+ /* SYNACK messages are attached to a listener socket.
+ * 1) They are not part of a 'flow' yet
+ * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+ * especially if the listener set SO_MAX_PACING_RATE
+ * 3) We pretend they are orphaned
+ */
+ if (!sk || sk->sk_state == TCP_LISTEN) {
+ unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+
+ /* By forcing low order bit to 1, we make sure to not
+ * collide with a local flow (socket pointers are word aligned)
+ */
+ sk = (struct sock *)((hash << 1) | 1UL);
+ skb_orphan(skb);
+ }
+
+ root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+ if (q->flows >= (2U << q->fq_trees_log) &&
+ q->inactive_flows > q->flows/2)
+ fq_gc(q, root, sk);
+
+ p = &root->rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+
+ f = container_of(parent, struct fq_flow, fq_node);
+ if (f->sk == sk) {
+ /* socket might have been reallocated, so check
+ * if its sk_hash is the same.
+ * It not, we need to refill credit with
+ * initial quantum
+ */
+ if (unlikely(skb->sk &&
+ f->socket_hash != sk->sk_hash)) {
+ f->credit = q->initial_quantum;
+ f->socket_hash = sk->sk_hash;
+ f->time_next_packet = 0ULL;
+ }
+ return f;
+ }
+ if (f->sk > sk)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+
+ f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!f)) {
+ q->stat_allocation_errors++;
+ return &q->internal;
+ }
+ fq_flow_set_detached(f);
+ f->sk = sk;
+ if (skb->sk)
+ f->socket_hash = sk->sk_hash;
+ f->credit = q->initial_quantum;
+
+ rb_link_node(&f->fq_node, parent, p);
+ rb_insert_color(&f->fq_node, root);
+
+ q->flows++;
+ q->inactive_flows++;
+ return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb->next = NULL;
+ flow->qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+ return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head-> [retrans pkt 1]
+ * [retrans pkt 2]
+ * [ normal pkt 1]
+ * [ normal pkt 2]
+ * [ normal pkt 3]
+ * tail-> [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+ struct sk_buff *prev, *head = flow->head;
+
+ skb->next = NULL;
+ if (!head) {
+ flow->head = skb;
+ flow->tail = skb;
+ return;
+ }
+ if (likely(!skb_is_retransmit(skb))) {
+ flow->tail->next = skb;
+ flow->tail = skb;
+ return;
+ }
+
+ /* This skb is a tcp retransmit,
+ * find the last retrans packet in the queue
+ */
+ prev = NULL;
+ while (skb_is_retransmit(head)) {
+ prev = head;
+ head = head->next;
+ if (!head)
+ break;
+ }
+ if (!prev) { /* no rtx packet in queue, become the new head */
+ skb->next = flow->head;
+ flow->head = skb;
+ } else {
+ if (prev == flow->tail)
+ flow->tail = skb;
+ else
+ skb->next = prev->next;
+ prev->next = skb;
+ }
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_flow *f;
+
+ if (unlikely(sch->q.qlen >= sch->limit))
+ return qdisc_drop(skb, sch);
+
+ f = fq_classify(skb, q);
+ if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+ q->stat_flows_plimit++;
+ return qdisc_drop(skb, sch);
+ }
+
+ f->qlen++;
+ if (skb_is_retransmit(skb))
+ q->stat_tcp_retrans++;
+ qdisc_qstats_backlog_inc(sch, skb);
+ if (fq_flow_is_detached(f)) {
+ fq_flow_add_tail(&q->new_flows, f);
+ if (time_after(jiffies, f->age + q->flow_refill_delay))
+ f->credit = max_t(u32, f->credit, q->quantum);
+ q->inactive_flows--;
+ }
+
+ /* Note: this overwrites f->age */
+ flow_queue_add(f, skb);
+
+ if (unlikely(f == &q->internal)) {
+ q->stat_internal_packets++;
+ }
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+ struct rb_node *p;
+
+ if (q->time_next_delayed_flow > now)
+ return;
+
+ q->time_next_delayed_flow = ~0ULL;
+ while ((p = rb_first(&q->delayed)) != NULL) {
+ struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+ if (f->time_next_packet > now) {
+ q->time_next_delayed_flow = f->time_next_packet;
+ break;
+ }
+ rb_erase(p, &q->delayed);
+ q->throttled_flows--;
+ fq_flow_add_tail(&q->old_flows, f);
+ }
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_get_ns();
+ struct fq_flow_head *head;
+ struct sk_buff *skb;
+ struct fq_flow *f;
+ u32 rate;
+
+ skb = fq_dequeue_head(sch, &q->internal);
+ if (skb)
+ goto out;
+ fq_check_throttled(q, now);
+begin:
+ head = &q->new_flows;
+ if (!head->first) {
+ head = &q->old_flows;
+ if (!head->first) {
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ q->time_next_delayed_flow,
+ false);
+ return NULL;
+ }
+ }
+ f = head->first;
+
+ if (f->credit <= 0) {
+ f->credit += q->quantum;
+ head->first = f->next;
+ fq_flow_add_tail(&q->old_flows, f);
+ goto begin;
+ }
+
+ skb = f->head;
+ if (unlikely(skb && now < f->time_next_packet &&
+ !skb_is_tcp_pure_ack(skb))) {
+ head->first = f->next;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
+
+ skb = fq_dequeue_head(sch, f);
+ if (!skb) {
+ head->first = f->next;
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && q->old_flows.first) {
+ fq_flow_add_tail(&q->old_flows, f);
+ } else {
+ fq_flow_set_detached(f);
+ q->inactive_flows++;
+ }
+ goto begin;
+ }
+ prefetch(&skb->end);
+ f->credit -= qdisc_pkt_len(skb);
+
+ if (f->credit > 0 || !q->rate_enable)
+ goto out;
+
+ /* Do not pace locally generated ack packets */
+ if (skb_is_tcp_pure_ack(skb))
+ goto out;
+
+ rate = q->flow_max_rate;
+ if (skb->sk)
+ rate = min(skb->sk->sk_pacing_rate, rate);
+
+ if (rate != ~0U) {
+ u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+ u64 len = (u64)plen * NSEC_PER_SEC;
+
+ if (likely(rate))
+ do_div(len, rate);
+ /* Since socket rate can change later,
+ * clamp the delay to 1 second.
+ * Really, providers of too big packets should be fixed !
+ */
+ if (unlikely(len > NSEC_PER_SEC)) {
+ len = NSEC_PER_SEC;
+ q->stat_pkts_too_long++;
+ }
+
+ f->time_next_packet = now + len;
+ }
+out:
+ qdisc_bstats_update(sch, skb);
+ return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *root;
+ struct sk_buff *skb;
+ struct rb_node *p;
+ struct fq_flow *f;
+ unsigned int idx;
+
+ while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
+ kfree_skb(skb);
+
+ if (!q->fq_root)
+ return;
+
+ for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+ root = &q->fq_root[idx];
+ while ((p = rb_first(root)) != NULL) {
+ f = container_of(p, struct fq_flow, fq_node);
+ rb_erase(p, root);
+
+ while ((skb = fq_dequeue_head(sch, f)) != NULL)
+ kfree_skb(skb);
+
+ kmem_cache_free(fq_flow_cachep, f);
+ }
+ }
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->flows = 0;
+ q->inactive_flows = 0;
+ q->throttled_flows = 0;
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+ struct rb_root *old_array, u32 old_log,
+ struct rb_root *new_array, u32 new_log)
+{
+ struct rb_node *op, **np, *parent;
+ struct rb_root *oroot, *nroot;
+ struct fq_flow *of, *nf;
+ int fcnt = 0;
+ u32 idx;
+
+ for (idx = 0; idx < (1U << old_log); idx++) {
+ oroot = &old_array[idx];
+ while ((op = rb_first(oroot)) != NULL) {
+ rb_erase(op, oroot);
+ of = container_of(op, struct fq_flow, fq_node);
+ if (fq_gc_candidate(of)) {
+ fcnt++;
+ kmem_cache_free(fq_flow_cachep, of);
+ continue;
+ }
+ nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+ np = &nroot->rb_node;
+ parent = NULL;
+ while (*np) {
+ parent = *np;
+
+ nf = container_of(parent, struct fq_flow, fq_node);
+ BUG_ON(nf->sk == of->sk);
+
+ if (nf->sk > of->sk)
+ np = &parent->rb_right;
+ else
+ np = &parent->rb_left;
+ }
+
+ rb_link_node(&of->fq_node, parent, np);
+ rb_insert_color(&of->fq_node, nroot);
+ }
+ }
+ q->flows -= fcnt;
+ q->inactive_flows -= fcnt;
+ q->stat_gc_flows += fcnt;
+}
+
+static void *fq_alloc_node(size_t sz, int node)
+{
+ void *ptr;
+
+ ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
+ if (!ptr)
+ ptr = vmalloc_node(sz, node);
+ return ptr;
+}
+
+static void fq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static int fq_resize(struct Qdisc *sch, u32 log)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct rb_root *array;
+ void *old_fq_root;
+ u32 idx;
+
+ if (q->fq_root && log == q->fq_trees_log)
+ return 0;
+
+ /* If XPS was setup, we can allocate memory on right NUMA node */
+ array = fq_alloc_node(sizeof(struct rb_root) << log,
+ netdev_queue_numa_node_read(sch->dev_queue));
+ if (!array)
+ return -ENOMEM;
+
+ for (idx = 0; idx < (1U << log); idx++)
+ array[idx] = RB_ROOT;
+
+ sch_tree_lock(sch);
+
+ old_fq_root = q->fq_root;
+ if (old_fq_root)
+ fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
+
+ q->fq_root = array;
+ q->fq_trees_log = log;
+
+ sch_tree_unlock(sch);
+
+ fq_free(old_fq_root);
+
+ return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
+ [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
+ [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
+ [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_MAX + 1];
+ int err, drop_count = 0;
+ u32 fq_log;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ fq_log = q->fq_trees_log;
+
+ if (tb[TCA_FQ_BUCKETS_LOG]) {
+ u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+ if (nval >= 1 && nval <= ilog2(256*1024))
+ fq_log = nval;
+ else
+ err = -EINVAL;
+ }
+ if (tb[TCA_FQ_PLIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+ if (tb[TCA_FQ_FLOW_PLIMIT])
+ q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+ if (tb[TCA_FQ_QUANTUM]) {
+ u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+ if (quantum > 0)
+ q->quantum = quantum;
+ else
+ err = -EINVAL;
+ }
+
+ if (tb[TCA_FQ_INITIAL_QUANTUM])
+ q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+ if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+ pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
+ nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
+
+ if (tb[TCA_FQ_FLOW_MAX_RATE])
+ q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+ if (tb[TCA_FQ_RATE_ENABLE]) {
+ u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+ if (enable <= 1)
+ q->rate_enable = enable;
+ else
+ err = -EINVAL;
+ }
+
+ if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
+ u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
+
+ q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
+ }
+
+ if (tb[TCA_FQ_ORPHAN_MASK])
+ q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+
+ if (!err) {
+ sch_tree_unlock(sch);
+ err = fq_resize(sch, fq_log);
+ sch_tree_lock(sch);
+ }
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_dequeue(sch);
+
+ if (!skb)
+ break;
+ kfree_skb(skb);
+ drop_count++;
+ }
+ qdisc_tree_decrease_qlen(sch, drop_count);
+
+ sch_tree_unlock(sch);
+ return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+
+ fq_reset(sch);
+ fq_free(q->fq_root);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ int err;
+
+ sch->limit = 10000;
+ q->flow_plimit = 100;
+ q->quantum = 2 * psched_mtu(qdisc_dev(sch));
+ q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
+ q->flow_refill_delay = msecs_to_jiffies(40);
+ q->flow_max_rate = ~0U;
+ q->rate_enable = 1;
+ q->new_flows.first = NULL;
+ q->old_flows.first = NULL;
+ q->delayed = RB_ROOT;
+ q->fq_root = NULL;
+ q->fq_trees_log = ilog2(1024);
+ q->orphan_mask = 1024 - 1;
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt)
+ err = fq_change(sch, opt);
+ else
+ err = fq_resize(sch, q->fq_trees_log);
+
+ return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
+
+ if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+ nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+ nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
+ jiffies_to_usecs(q->flow_refill_delay)) ||
+ nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_get_ns();
+ struct tc_fq_qd_stats st = {
+ .gc_flows = q->stat_gc_flows,
+ .highprio_packets = q->stat_internal_packets,
+ .tcp_retrans = q->stat_tcp_retrans,
+ .throttled = q->stat_throttled,
+ .flows_plimit = q->stat_flows_plimit,
+ .pkts_too_long = q->stat_pkts_too_long,
+ .allocation_errors = q->stat_allocation_errors,
+ .flows = q->flows,
+ .inactive_flows = q->inactive_flows,
+ .throttled_flows = q->throttled_flows,
+ .time_next_delayed_flow = q->time_next_delayed_flow - now,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+ .id = "fq",
+ .priv_size = sizeof(struct fq_sched_data),
+
+ .enqueue = fq_enqueue,
+ .dequeue = fq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_init,
+ .reset = fq_reset,
+ .destroy = fq_destroy,
+ .change = fq_change,
+ .dump = fq_dump,
+ .dump_stats = fq_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+ int ret;
+
+ fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+ sizeof(struct fq_flow),
+ 0, 0, NULL);
+ if (!fq_flow_cachep)
+ return -ENOMEM;
+
+ ret = register_qdisc(&fq_qdisc_ops);
+ if (ret)
+ kmem_cache_destroy(fq_flow_cachep);
+ return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+ unregister_qdisc(&fq_qdisc_ops);
+ kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
new file mode 100644
index 000000000..c244c45b7
--- /dev/null
+++ b/net/sched/sch_fq_codel.c
@@ -0,0 +1,624 @@
+/*
+ * Fair Queue CoDel discipline
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/codel.h>
+
+/* Fair Queue CoDel.
+ *
+ * Principles :
+ * Packets are classified (internal classifier or external) on flows.
+ * This is a Stochastic model (as we use a hash, several flows
+ * might be hashed on same slot)
+ * Each flow has a CoDel managed queue.
+ * Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ *
+ * For a given flow, packets are not reordered (CoDel uses a FIFO)
+ * head drops only.
+ * ECN capability is on by default.
+ * Low memory footprint (64 bytes per flow)
+ */
+
+struct fq_codel_flow {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ int deficit;
+ u32 dropped; /* number of drops (or ECN marks) on this flow */
+ struct codel_vars cvars;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct fq_codel_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
+ u32 *backlogs; /* backlog table [flows_cnt] */
+ u32 flows_cnt; /* number of flows */
+ u32 perturbation; /* hash perturbation */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ struct codel_params cparams;
+ struct codel_stats cstats;
+ u32 drop_overlimit;
+ u32 new_flow_count;
+
+ struct list_head new_flows; /* list of new flows */
+ struct list_head old_flows; /* list of old flows */
+};
+
+static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
+ const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ unsigned int hash;
+
+ skb_flow_dissect(skb, &keys);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src ^ keys.ip_proto,
+ (__force u32)keys.ports, q->perturbation);
+
+ return reciprocal_scale(hash, q->flows_cnt);
+}
+
+static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_codel_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tc_classify(skb, filter, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_codel_flow *flow,
+ struct sk_buff *skb)
+{
+ if (flow->head == NULL)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int fq_codel_drop(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned int maxbacklog = 0, idx = 0, i, len;
+ struct fq_codel_flow *flow;
+
+ /* Queue is full! Find the fat flow and drop packet from it.
+ * This might sound expensive, but with 1024 flows, we scan
+ * 4KB of memory, and we dont need to handle a complex tree
+ * in fast path (packet queue/enqueue) with many cache misses.
+ */
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (q->backlogs[i] > maxbacklog) {
+ maxbacklog = q->backlogs[i];
+ idx = i;
+ }
+ }
+ flow = &q->flows[idx];
+ skb = dequeue_head(flow);
+ len = qdisc_pkt_len(skb);
+ q->backlogs[idx] -= len;
+ kfree_skb(skb);
+ sch->q.qlen--;
+ qdisc_qstats_drop(sch);
+ qdisc_qstats_backlog_dec(sch, skb);
+ flow->dropped++;
+ return idx;
+}
+
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int idx;
+ struct fq_codel_flow *flow;
+ int uninitialized_var(ret);
+
+ idx = fq_codel_classify(skb, sch, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+ }
+ idx--;
+
+ codel_set_enqueue_time(skb);
+ flow = &q->flows[idx];
+ flow_queue_add(flow, skb);
+ q->backlogs[idx] += qdisc_pkt_len(skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ if (list_empty(&flow->flowchain)) {
+ list_add_tail(&flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ flow->deficit = q->quantum;
+ flow->dropped = 0;
+ }
+ if (++sch->q.qlen <= sch->limit)
+ return NET_XMIT_SUCCESS;
+
+ q->drop_overlimit++;
+ /* Return Congestion Notification only if we dropped a packet
+ * from this flow.
+ */
+ if (fq_codel_drop(sch) == idx)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
+}
+
+/* This is the specific function called from codel_dequeue()
+ * to dequeue a packet from queue. Note: backlog is handled in
+ * codel, we dont need to reduce it here.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct fq_codel_flow *flow;
+ struct sk_buff *skb = NULL;
+
+ flow = container_of(vars, struct fq_codel_flow, cvars);
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+ sch->q.qlen--;
+ }
+ return skb;
+}
+
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct fq_codel_flow *flow;
+ struct list_head *head;
+ u32 prev_drop_count, prev_ecn_mark;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+ flow = list_first_entry(head, struct fq_codel_flow, flowchain);
+
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ prev_drop_count = q->cstats.drop_count;
+ prev_ecn_mark = q->cstats.ecn_mark;
+
+ skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
+ dequeue);
+
+ flow->dropped += q->cstats.drop_count - prev_drop_count;
+ flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if ((head == &q->new_flows) && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ flow->deficit -= qdisc_pkt_len(skb);
+ /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ * or HTB crashes. Defer it for next round.
+ */
+ if (q->cstats.drop_count && sch->q.qlen) {
+ qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ q->cstats.drop_count = 0;
+ }
+ return skb;
+}
+
+static void fq_codel_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = fq_codel_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
+ [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
+ [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
+};
+
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
+ if (err < 0)
+ return err;
+ if (tb[TCA_FQ_CODEL_FLOWS]) {
+ if (q->flows)
+ return -EINVAL;
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
+ if (!q->flows_cnt ||
+ q->flows_cnt > 65536)
+ return -EINVAL;
+ }
+ sch_tree_lock(sch);
+
+ if (tb[TCA_FQ_CODEL_TARGET]) {
+ u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
+
+ q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_INTERVAL]) {
+ u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+
+ q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
+ }
+
+ if (tb[TCA_FQ_CODEL_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+
+ if (tb[TCA_FQ_CODEL_ECN])
+ q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+
+ if (tb[TCA_FQ_CODEL_QUANTUM])
+ q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_codel_dequeue(sch);
+
+ kfree_skb(skb);
+ q->cstats.drop_count++;
+ }
+ qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ q->cstats.drop_count = 0;
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void *fq_codel_zalloc(size_t sz)
+{
+ void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vzalloc(sz);
+ return ptr;
+}
+
+static void fq_codel_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void fq_codel_destroy(struct Qdisc *sch)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ fq_codel_free(q->backlogs);
+ fq_codel_free(q->flows);
+}
+
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ sch->limit = 10*1024;
+ q->flows_cnt = 1024;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->perturbation = prandom_u32();
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ codel_params_init(&q->cparams, sch);
+ codel_stats_init(&q->cstats);
+ q->cparams.ecn = true;
+
+ if (opt) {
+ int err = fq_codel_change(sch, opt);
+ if (err)
+ return err;
+ }
+
+ if (!q->flows) {
+ q->flows = fq_codel_zalloc(q->flows_cnt *
+ sizeof(struct fq_codel_flow));
+ if (!q->flows)
+ return -ENOMEM;
+ q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
+ if (!q->backlogs) {
+ fq_codel_free(q->flows);
+ return -ENOMEM;
+ }
+ for (i = 0; i < q->flows_cnt; i++) {
+ struct fq_codel_flow *flow = q->flows + i;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ codel_vars_init(&flow->cvars);
+ }
+ }
+ if (sch->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
+ codel_time_to_us(q->cparams.target)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
+ sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
+ codel_time_to_us(q->cparams.interval)) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_ECN,
+ q->cparams.ecn) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
+ q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
+ q->flows_cnt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_codel_xstats st = {
+ .type = TCA_FQ_CODEL_XSTATS_QDISC,
+ };
+ struct list_head *pos;
+
+ st.qdisc_stats.maxpacket = q->cstats.maxpacket;
+ st.qdisc_stats.drop_overlimit = q->drop_overlimit;
+ st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
+ st.qdisc_stats.new_flow_count = q->new_flow_count;
+
+ list_for_each(pos, &q->new_flows)
+ st.qdisc_stats.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.qdisc_stats.old_flows_len++;
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static void fq_codel_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ u32 idx = cl - 1;
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_fq_codel_xstats xstats;
+
+ if (idx < q->flows_cnt) {
+ const struct fq_codel_flow *flow = &q->flows[idx];
+ const struct sk_buff *skb = flow->head;
+
+ memset(&xstats, 0, sizeof(xstats));
+ xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
+ xstats.class_stats.deficit = flow->deficit;
+ xstats.class_stats.ldelay =
+ codel_time_to_us(flow->cvars.ldelay);
+ xstats.class_stats.count = flow->cvars.count;
+ xstats.class_stats.lastcount = flow->cvars.lastcount;
+ xstats.class_stats.dropping = flow->cvars.dropping;
+ if (flow->cvars.dropping) {
+ codel_tdiff_t delta = flow->cvars.drop_next -
+ codel_get_time();
+
+ xstats.class_stats.drop_next = (delta >= 0) ?
+ codel_time_to_us(delta) :
+ -codel_time_to_us(-delta);
+ }
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ qs.backlog = q->backlogs[idx];
+ qs.drops = flow->dropped;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
+ return -1;
+ if (idx < q->flows_cnt)
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+ return 0;
+}
+
+static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct fq_codel_sched_data *q = qdisc_priv(sch);
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->flows_cnt; i++) {
+ if (list_empty(&q->flows[i].flowchain) ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops fq_codel_class_ops = {
+ .leaf = fq_codel_leaf,
+ .get = fq_codel_get,
+ .put = fq_codel_put,
+ .tcf_chain = fq_codel_find_tcf,
+ .bind_tcf = fq_codel_bind,
+ .unbind_tcf = fq_codel_put,
+ .dump = fq_codel_dump_class,
+ .dump_stats = fq_codel_dump_class_stats,
+ .walk = fq_codel_walk,
+};
+
+static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+ .cl_ops = &fq_codel_class_ops,
+ .id = "fq_codel",
+ .priv_size = sizeof(struct fq_codel_sched_data),
+ .enqueue = fq_codel_enqueue,
+ .dequeue = fq_codel_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = fq_codel_drop,
+ .init = fq_codel_init,
+ .reset = fq_codel_reset,
+ .destroy = fq_codel_destroy,
+ .change = fq_codel_change,
+ .dump = fq_codel_dump,
+ .dump_stats = fq_codel_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_codel_module_init(void)
+{
+ return register_qdisc(&fq_codel_qdisc_ops);
+}
+
+static void __exit fq_codel_module_exit(void)
+{
+ unregister_qdisc(&fq_codel_qdisc_ops);
+}
+
+module_init(fq_codel_module_init)
+module_exit(fq_codel_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 000000000..6efca3089
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,990 @@
+/*
+ * net/sched/sch_generic.c Generic packet scheduler routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Jamal Hadi Salim, <hadi@cyberus.ca> 990601
+ * - Ingress support
+ */
+
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/if_vlan.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+
+/* Qdisc to use by default */
+const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+EXPORT_SYMBOL(default_qdisc_ops);
+
+/* Main transmission queue. */
+
+/* Modifications to data participating in scheduling must be protected with
+ * qdisc_lock(qdisc) spinlock.
+ *
+ * The idea is the following:
+ * - enqueue, dequeue are serialized via qdisc root lock
+ * - ingress filtering is also serialized via qdisc root lock
+ * - updates to tree and tree walking are only done under the rtnl mutex.
+ */
+
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+ q->gso_skb = skb;
+ q->qstats.requeues++;
+ q->q.qlen++; /* it's still part of the queue */
+ __netif_schedule(q);
+
+ return 0;
+}
+
+static void try_bulk_dequeue_skb(struct Qdisc *q,
+ struct sk_buff *skb,
+ const struct netdev_queue *txq,
+ int *packets)
+{
+ int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
+
+ while (bytelimit > 0) {
+ struct sk_buff *nskb = q->dequeue(q);
+
+ if (!nskb)
+ break;
+
+ bytelimit -= nskb->len; /* covers GSO len */
+ skb->next = nskb;
+ skb = nskb;
+ (*packets)++; /* GSO counts as one pkt */
+ }
+ skb->next = NULL;
+}
+
+/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
+ * A requeued skb (via q->gso_skb) can also be a SKB list.
+ */
+static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
+ int *packets)
+{
+ struct sk_buff *skb = q->gso_skb;
+ const struct netdev_queue *txq = q->dev_queue;
+
+ *packets = 1;
+ *validate = true;
+ if (unlikely(skb)) {
+ /* check the reason of requeuing without tx lock first */
+ txq = skb_get_tx_queue(txq->dev, skb);
+ if (!netif_xmit_frozen_or_stopped(txq)) {
+ q->gso_skb = NULL;
+ q->q.qlen--;
+ } else
+ skb = NULL;
+ /* skb in gso_skb were already validated */
+ *validate = false;
+ } else {
+ if (!(q->flags & TCQ_F_ONETXQUEUE) ||
+ !netif_xmit_frozen_or_stopped(txq)) {
+ skb = q->dequeue(q);
+ if (skb && qdisc_may_bulk(q))
+ try_bulk_dequeue_skb(q, skb, txq, packets);
+ }
+ }
+ return skb;
+}
+
+static inline int handle_dev_cpu_collision(struct sk_buff *skb,
+ struct netdev_queue *dev_queue,
+ struct Qdisc *q)
+{
+ int ret;
+
+ if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
+ /*
+ * Same CPU holding the lock. It may be a transient
+ * configuration error, when hard_start_xmit() recurses. We
+ * detect it by checking xmit owner and drop the packet when
+ * deadloop is detected. Return OK to try the next skb.
+ */
+ kfree_skb_list(skb);
+ net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
+ dev_queue->dev->name);
+ ret = qdisc_qlen(q);
+ } else {
+ /*
+ * Another cpu is holding lock, requeue & delay xmits for
+ * some time.
+ */
+ __this_cpu_inc(softnet_data.cpu_collision);
+ ret = dev_requeue_skb(skb, q);
+ }
+
+ return ret;
+}
+
+/*
+ * Transmit possibly several skbs, and handle the return status as
+ * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
+ * only one CPU can execute this function.
+ *
+ * Returns to the caller:
+ * 0 - queue is empty or throttled.
+ * >0 - queue is not empty.
+ */
+int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+ struct net_device *dev, struct netdev_queue *txq,
+ spinlock_t *root_lock, bool validate)
+{
+ int ret = NETDEV_TX_BUSY;
+
+ /* And release qdisc */
+ spin_unlock(root_lock);
+
+ /* Note that we validate skb (GSO, checksum, ...) outside of locks */
+ if (validate)
+ skb = validate_xmit_skb_list(skb, dev);
+
+ if (skb) {
+ HARD_TX_LOCK(dev, txq, smp_processor_id());
+ if (!netif_xmit_frozen_or_stopped(txq))
+ skb = dev_hard_start_xmit(skb, dev, txq, &ret);
+
+ HARD_TX_UNLOCK(dev, txq);
+ }
+ spin_lock(root_lock);
+
+ if (dev_xmit_complete(ret)) {
+ /* Driver sent out skb successfully or skb was consumed */
+ ret = qdisc_qlen(q);
+ } else if (ret == NETDEV_TX_LOCKED) {
+ /* Driver try lock failed */
+ ret = handle_dev_cpu_collision(skb, txq, q);
+ } else {
+ /* Driver returned NETDEV_TX_BUSY - requeue skb */
+ if (unlikely(ret != NETDEV_TX_BUSY))
+ net_warn_ratelimited("BUG %s code %d qlen %d\n",
+ dev->name, ret, q->q.qlen);
+
+ ret = dev_requeue_skb(skb, q);
+ }
+
+ if (ret && netif_xmit_frozen_or_stopped(txq))
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * NOTE: Called under qdisc_lock(q) with locally disabled BH.
+ *
+ * __QDISC___STATE_RUNNING guarantees only one CPU can process
+ * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
+ * this queue.
+ *
+ * netif_tx_lock serializes accesses to device driver.
+ *
+ * qdisc_lock(q) and netif_tx_lock are mutually exclusive,
+ * if one is grabbed, another must be free.
+ *
+ * Note, that this procedure can be called by a watchdog timer
+ *
+ * Returns to the caller:
+ * 0 - queue is empty or throttled.
+ * >0 - queue is not empty.
+ *
+ */
+static inline int qdisc_restart(struct Qdisc *q, int *packets)
+{
+ struct netdev_queue *txq;
+ struct net_device *dev;
+ spinlock_t *root_lock;
+ struct sk_buff *skb;
+ bool validate;
+
+ /* Dequeue packet */
+ skb = dequeue_skb(q, &validate, packets);
+ if (unlikely(!skb))
+ return 0;
+
+ root_lock = qdisc_lock(q);
+ dev = qdisc_dev(q);
+ txq = skb_get_tx_queue(dev, skb);
+
+ return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
+}
+
+void __qdisc_run(struct Qdisc *q)
+{
+ int quota = weight_p;
+ int packets;
+
+ while (qdisc_restart(q, &packets)) {
+ /*
+ * Ordered by possible occurrence: Postpone processing if
+ * 1. we've exceeded packet quota
+ * 2. another process needs the CPU;
+ */
+ quota -= packets;
+ if (quota <= 0 || need_resched()) {
+ __netif_schedule(q);
+ break;
+ }
+ }
+
+ qdisc_run_end(q);
+}
+
+unsigned long dev_trans_start(struct net_device *dev)
+{
+ unsigned long val, res;
+ unsigned int i;
+
+ if (is_vlan_dev(dev))
+ dev = vlan_dev_real_dev(dev);
+ res = dev->trans_start;
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ val = netdev_get_tx_queue(dev, i)->trans_start;
+ if (val && time_after(val, res))
+ res = val;
+ }
+ dev->trans_start = res;
+
+ return res;
+}
+EXPORT_SYMBOL(dev_trans_start);
+
+static void dev_watchdog(unsigned long arg)
+{
+ struct net_device *dev = (struct net_device *)arg;
+
+ netif_tx_lock(dev);
+ if (!qdisc_tx_is_noop(dev)) {
+ if (netif_device_present(dev) &&
+ netif_running(dev) &&
+ netif_carrier_ok(dev)) {
+ int some_queue_timedout = 0;
+ unsigned int i;
+ unsigned long trans_start;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(dev, i);
+ /*
+ * old device drivers set dev->trans_start
+ */
+ trans_start = txq->trans_start ? : dev->trans_start;
+ if (netif_xmit_stopped(txq) &&
+ time_after(jiffies, (trans_start +
+ dev->watchdog_timeo))) {
+ some_queue_timedout = 1;
+ txq->trans_timeout++;
+ break;
+ }
+ }
+
+ if (some_queue_timedout) {
+ WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
+ dev->name, netdev_drivername(dev), i);
+ dev->netdev_ops->ndo_tx_timeout(dev);
+ }
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies +
+ dev->watchdog_timeo)))
+ dev_hold(dev);
+ }
+ }
+ netif_tx_unlock(dev);
+
+ dev_put(dev);
+}
+
+void __netdev_watchdog_up(struct net_device *dev)
+{
+ if (dev->netdev_ops->ndo_tx_timeout) {
+ if (dev->watchdog_timeo <= 0)
+ dev->watchdog_timeo = 5*HZ;
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies + dev->watchdog_timeo)))
+ dev_hold(dev);
+ }
+}
+
+static void dev_watchdog_up(struct net_device *dev)
+{
+ __netdev_watchdog_up(dev);
+}
+
+static void dev_watchdog_down(struct net_device *dev)
+{
+ netif_tx_lock_bh(dev);
+ if (del_timer(&dev->watchdog_timer))
+ dev_put(dev);
+ netif_tx_unlock_bh(dev);
+}
+
+/**
+ * netif_carrier_on - set carrier
+ * @dev: network device
+ *
+ * Device has detected that carrier.
+ */
+void netif_carrier_on(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ return;
+ atomic_inc(&dev->carrier_changes);
+ linkwatch_fire_event(dev);
+ if (netif_running(dev))
+ __netdev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(netif_carrier_on);
+
+/**
+ * netif_carrier_off - clear carrier
+ * @dev: network device
+ *
+ * Device has detected loss of carrier.
+ */
+void netif_carrier_off(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
+ if (dev->reg_state == NETREG_UNINITIALIZED)
+ return;
+ atomic_inc(&dev->carrier_changes);
+ linkwatch_fire_event(dev);
+ }
+}
+EXPORT_SYMBOL(netif_carrier_off);
+
+/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
+ under all circumstances. It is difficult to invent anything faster or
+ cheaper.
+ */
+
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+{
+ kfree_skb(skb);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
+{
+ return NULL;
+}
+
+struct Qdisc_ops noop_qdisc_ops __read_mostly = {
+ .id = "noop",
+ .priv_size = 0,
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .peek = noop_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static struct netdev_queue noop_netdev_queue = {
+ .qdisc = &noop_qdisc,
+ .qdisc_sleeping = &noop_qdisc,
+};
+
+struct Qdisc noop_qdisc = {
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .flags = TCQ_F_BUILTIN,
+ .ops = &noop_qdisc_ops,
+ .list = LIST_HEAD_INIT(noop_qdisc.list),
+ .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
+ .dev_queue = &noop_netdev_queue,
+ .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
+};
+EXPORT_SYMBOL(noop_qdisc);
+
+static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
+ .id = "noqueue",
+ .priv_size = 0,
+ .enqueue = noop_enqueue,
+ .dequeue = noop_dequeue,
+ .peek = noop_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static struct Qdisc noqueue_qdisc;
+static struct netdev_queue noqueue_netdev_queue = {
+ .qdisc = &noqueue_qdisc,
+ .qdisc_sleeping = &noqueue_qdisc,
+};
+
+static struct Qdisc noqueue_qdisc = {
+ .enqueue = NULL,
+ .dequeue = noop_dequeue,
+ .flags = TCQ_F_BUILTIN,
+ .ops = &noqueue_qdisc_ops,
+ .list = LIST_HEAD_INIT(noqueue_qdisc.list),
+ .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
+ .dev_queue = &noqueue_netdev_queue,
+ .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
+};
+
+
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/* 3-band FIFO queue: old style, but should be a bit faster than
+ generic prio+fifo combination.
+ */
+
+#define PFIFO_FAST_BANDS 3
+
+/*
+ * Private data for a pfifo_fast scheduler containing:
+ * - queues for the three band
+ * - bitmap indicating which of the bands contain skbs
+ */
+struct pfifo_fast_priv {
+ u32 bitmap;
+ struct sk_buff_head q[PFIFO_FAST_BANDS];
+};
+
+/*
+ * Convert a bitmap to the first band number where an skb is queued, where:
+ * bitmap=0 means there are no skbs on any band.
+ * bitmap=1 means there is an skb on band 0.
+ * bitmap=7 means there are skbs on all 3 bands, etc.
+ */
+static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
+
+static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+ int band)
+{
+ return priv->q + band;
+}
+
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+{
+ if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
+ int band = prio2band[skb->priority & TC_PRIO_MAX];
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ struct sk_buff_head *list = band2list(priv, band);
+
+ priv->bitmap |= (1 << band);
+ qdisc->q.qlen++;
+ return __qdisc_enqueue_tail(skb, qdisc, list);
+ }
+
+ return qdisc_drop(skb, qdisc);
+}
+
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ int band = bitmap2band[priv->bitmap];
+
+ if (likely(band >= 0)) {
+ struct sk_buff_head *list = band2list(priv, band);
+ struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+
+ qdisc->q.qlen--;
+ if (skb_queue_empty(list))
+ priv->bitmap &= ~(1 << band);
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
+{
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+ int band = bitmap2band[priv->bitmap];
+
+ if (band >= 0) {
+ struct sk_buff_head *list = band2list(priv, band);
+
+ return skb_peek(list);
+ }
+
+ return NULL;
+}
+
+static void pfifo_fast_reset(struct Qdisc *qdisc)
+{
+ int prio;
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+ __qdisc_reset_queue(qdisc, band2list(priv, prio));
+
+ priv->bitmap = 0;
+ qdisc->qstats.backlog = 0;
+ qdisc->q.qlen = 0;
+}
+
+static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
+{
+ struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
+
+ memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return skb->len;
+
+nla_put_failure:
+ return -1;
+}
+
+static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
+{
+ int prio;
+ struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+ __skb_queue_head_init(band2list(priv, prio));
+
+ /* Can by-pass the queue discipline */
+ qdisc->flags |= TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+struct Qdisc_ops pfifo_fast_ops __read_mostly = {
+ .id = "pfifo_fast",
+ .priv_size = sizeof(struct pfifo_fast_priv),
+ .enqueue = pfifo_fast_enqueue,
+ .dequeue = pfifo_fast_dequeue,
+ .peek = pfifo_fast_peek,
+ .init = pfifo_fast_init,
+ .reset = pfifo_fast_reset,
+ .dump = pfifo_fast_dump,
+ .owner = THIS_MODULE,
+};
+
+static struct lock_class_key qdisc_tx_busylock;
+
+struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
+ const struct Qdisc_ops *ops)
+{
+ void *p;
+ struct Qdisc *sch;
+ unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
+ int err = -ENOBUFS;
+ struct net_device *dev = dev_queue->dev;
+
+ p = kzalloc_node(size, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ /* if we got non aligned memory, ask more and do alignment ourself */
+ if (sch != p) {
+ kfree(p);
+ p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
+ netdev_queue_numa_node_read(dev_queue));
+ if (!p)
+ goto errout;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
+ }
+ INIT_LIST_HEAD(&sch->list);
+ skb_queue_head_init(&sch->q);
+
+ spin_lock_init(&sch->busylock);
+ lockdep_set_class(&sch->busylock,
+ dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
+ sch->ops = ops;
+ sch->enqueue = ops->enqueue;
+ sch->dequeue = ops->dequeue;
+ sch->dev_queue = dev_queue;
+ dev_hold(dev);
+ atomic_set(&sch->refcnt, 1);
+
+ return sch;
+errout:
+ return ERR_PTR(err);
+}
+
+struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
+ const struct Qdisc_ops *ops,
+ unsigned int parentid)
+{
+ struct Qdisc *sch;
+
+ if (!try_module_get(ops->owner))
+ goto errout;
+
+ sch = qdisc_alloc(dev_queue, ops);
+ if (IS_ERR(sch))
+ goto errout;
+ sch->parent = parentid;
+
+ if (!ops->init || ops->init(sch, NULL) == 0)
+ return sch;
+
+ qdisc_destroy(sch);
+errout:
+ return NULL;
+}
+EXPORT_SYMBOL(qdisc_create_dflt);
+
+/* Under qdisc_lock(qdisc) and BH! */
+
+void qdisc_reset(struct Qdisc *qdisc)
+{
+ const struct Qdisc_ops *ops = qdisc->ops;
+
+ if (ops->reset)
+ ops->reset(qdisc);
+
+ if (qdisc->gso_skb) {
+ kfree_skb_list(qdisc->gso_skb);
+ qdisc->gso_skb = NULL;
+ qdisc->q.qlen = 0;
+ }
+}
+EXPORT_SYMBOL(qdisc_reset);
+
+static void qdisc_rcu_free(struct rcu_head *head)
+{
+ struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
+
+ if (qdisc_is_percpu_stats(qdisc))
+ free_percpu(qdisc->cpu_bstats);
+
+ kfree((char *) qdisc - qdisc->padded);
+}
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ const struct Qdisc_ops *ops = qdisc->ops;
+
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !atomic_dec_and_test(&qdisc->refcnt))
+ return;
+
+#ifdef CONFIG_NET_SCHED
+ qdisc_list_del(qdisc);
+
+ qdisc_put_stab(rtnl_dereference(qdisc->stab));
+#endif
+ gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+ if (ops->reset)
+ ops->reset(qdisc);
+ if (ops->destroy)
+ ops->destroy(qdisc);
+
+ module_put(ops->owner);
+ dev_put(qdisc_dev(qdisc));
+
+ kfree_skb_list(qdisc->gso_skb);
+ /*
+ * gen_estimator est_timer() might access qdisc->q.lock,
+ * wait a RCU grace period before freeing qdisc.
+ */
+ call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
+}
+EXPORT_SYMBOL(qdisc_destroy);
+
+/* Attach toplevel qdisc to device queue. */
+struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
+ struct Qdisc *qdisc)
+{
+ struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+ spinlock_t *root_lock;
+
+ root_lock = qdisc_lock(oqdisc);
+ spin_lock_bh(root_lock);
+
+ /* Prune old scheduler */
+ if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+ qdisc_reset(oqdisc);
+
+ /* ... and graft new one */
+ if (qdisc == NULL)
+ qdisc = &noop_qdisc;
+ dev_queue->qdisc_sleeping = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
+
+ spin_unlock_bh(root_lock);
+
+ return oqdisc;
+}
+EXPORT_SYMBOL(dev_graft_qdisc);
+
+static void attach_one_default_qdisc(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{
+ struct Qdisc *qdisc = &noqueue_qdisc;
+
+ if (dev->tx_queue_len) {
+ qdisc = qdisc_create_dflt(dev_queue,
+ default_qdisc_ops, TC_H_ROOT);
+ if (!qdisc) {
+ netdev_info(dev, "activation failed\n");
+ return;
+ }
+ if (!netif_is_multiqueue(dev))
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
+ }
+ dev_queue->qdisc_sleeping = qdisc;
+}
+
+static void attach_default_qdiscs(struct net_device *dev)
+{
+ struct netdev_queue *txq;
+ struct Qdisc *qdisc;
+
+ txq = netdev_get_tx_queue(dev, 0);
+
+ if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
+ netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+ dev->qdisc = txq->qdisc_sleeping;
+ atomic_inc(&dev->qdisc->refcnt);
+ } else {
+ qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
+ if (qdisc) {
+ dev->qdisc = qdisc;
+ qdisc->ops->attach(qdisc);
+ }
+ }
+}
+
+static void transition_one_qdisc(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_need_watchdog)
+{
+ struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+ int *need_watchdog_p = _need_watchdog;
+
+ if (!(new_qdisc->flags & TCQ_F_BUILTIN))
+ clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
+
+ rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
+ if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
+ dev_queue->trans_start = 0;
+ *need_watchdog_p = 1;
+ }
+}
+
+void dev_activate(struct net_device *dev)
+{
+ int need_watchdog;
+
+ /* No queueing discipline is attached to device;
+ * create default one for devices, which need queueing
+ * and noqueue_qdisc for virtual interfaces
+ */
+
+ if (dev->qdisc == &noop_qdisc)
+ attach_default_qdiscs(dev);
+
+ if (!netif_carrier_ok(dev))
+ /* Delay activation until next carrier-on event */
+ return;
+
+ need_watchdog = 0;
+ netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
+ if (dev_ingress_queue(dev))
+ transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
+
+ if (need_watchdog) {
+ dev->trans_start = jiffies;
+ dev_watchdog_up(dev);
+ }
+}
+EXPORT_SYMBOL(dev_activate);
+
+static void dev_deactivate_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc_default)
+{
+ struct Qdisc *qdisc_default = _qdisc_default;
+ struct Qdisc *qdisc;
+
+ qdisc = rtnl_dereference(dev_queue->qdisc);
+ if (qdisc) {
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ if (!(qdisc->flags & TCQ_F_BUILTIN))
+ set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+
+ rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ qdisc_reset(qdisc);
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+}
+
+static bool some_qdisc_is_busy(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *dev_queue;
+ spinlock_t *root_lock;
+ struct Qdisc *q;
+ int val;
+
+ dev_queue = netdev_get_tx_queue(dev, i);
+ q = dev_queue->qdisc_sleeping;
+ root_lock = qdisc_lock(q);
+
+ spin_lock_bh(root_lock);
+
+ val = (qdisc_is_running(q) ||
+ test_bit(__QDISC_STATE_SCHED, &q->state));
+
+ spin_unlock_bh(root_lock);
+
+ if (val)
+ return true;
+ }
+ return false;
+}
+
+/**
+ * dev_deactivate_many - deactivate transmissions on several devices
+ * @head: list of devices to deactivate
+ *
+ * This function returns only when all outstanding transmissions
+ * have completed, unless all devices are in dismantle phase.
+ */
+void dev_deactivate_many(struct list_head *head)
+{
+ struct net_device *dev;
+ bool sync_needed = false;
+
+ list_for_each_entry(dev, head, close_list) {
+ netdev_for_each_tx_queue(dev, dev_deactivate_queue,
+ &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_deactivate_queue(dev, dev_ingress_queue(dev),
+ &noop_qdisc);
+
+ dev_watchdog_down(dev);
+ sync_needed |= !dev->dismantle;
+ }
+
+ /* Wait for outstanding qdisc-less dev_queue_xmit calls.
+ * This is avoided if all devices are in dismantle phase :
+ * Caller will call synchronize_net() for us
+ */
+ if (sync_needed)
+ synchronize_net();
+
+ /* Wait for outstanding qdisc_run calls. */
+ list_for_each_entry(dev, head, close_list)
+ while (some_qdisc_is_busy(dev))
+ yield();
+}
+
+void dev_deactivate(struct net_device *dev)
+{
+ LIST_HEAD(single);
+
+ list_add(&dev->close_list, &single);
+ dev_deactivate_many(&single);
+ list_del(&single);
+}
+EXPORT_SYMBOL(dev_deactivate);
+
+static void dev_init_scheduler_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc)
+{
+ struct Qdisc *qdisc = _qdisc;
+
+ rcu_assign_pointer(dev_queue->qdisc, qdisc);
+ dev_queue->qdisc_sleeping = qdisc;
+}
+
+void dev_init_scheduler(struct net_device *dev)
+{
+ dev->qdisc = &noop_qdisc;
+ netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+
+ setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
+}
+
+static void shutdown_scheduler_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_qdisc_default)
+{
+ struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+ struct Qdisc *qdisc_default = _qdisc_default;
+
+ if (qdisc) {
+ rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ dev_queue->qdisc_sleeping = qdisc_default;
+
+ qdisc_destroy(qdisc);
+ }
+}
+
+void dev_shutdown(struct net_device *dev)
+{
+ netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+ qdisc_destroy(dev->qdisc);
+ dev->qdisc = &noop_qdisc;
+
+ WARN_ON(timer_pending(&dev->watchdog_timer));
+}
+
+void psched_ratecfg_precompute(struct psched_ratecfg *r,
+ const struct tc_ratespec *conf,
+ u64 rate64)
+{
+ memset(r, 0, sizeof(*r));
+ r->overhead = conf->overhead;
+ r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
+ r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
+ r->mult = 1;
+ /*
+ * The deal here is to replace a divide by a reciprocal one
+ * in fast path (a reciprocal divide is a multiply and a shift)
+ *
+ * Normal formula would be :
+ * time_in_ns = (NSEC_PER_SEC * len) / rate_bps
+ *
+ * We compute mult/shift to use instead :
+ * time_in_ns = (len * mult) >> shift;
+ *
+ * We try to get the highest possible mult value for accuracy,
+ * but have to make sure no overflows will ever happen.
+ */
+ if (r->rate_bytes_ps > 0) {
+ u64 factor = NSEC_PER_SEC;
+
+ for (;;) {
+ r->mult = div64_u64(factor, r->rate_bytes_ps);
+ if (r->mult & (1U << 31) || factor & (1ULL << 63))
+ break;
+ factor <<= 1;
+ r->shift++;
+ }
+ }
+}
+EXPORT_SYMBOL(psched_ratecfg_precompute);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
new file mode 100644
index 000000000..634529e0c
--- /dev/null
+++ b/net/sched/sch_gred.c
@@ -0,0 +1,630 @@
+/*
+ * net/sched/sch_gred.c Generic Random Early Detection queue.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
+ *
+ * 991129: - Bug fix with grio mode
+ * - a better sing. AvgQ mode with Grio(WRED)
+ * - A finer grained VQ dequeue based on sugestion
+ * from Ren Liu
+ * - More error checks
+ *
+ * For all the glorious comments look at include/net/red.h
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/red.h>
+
+#define GRED_DEF_PRIO (MAX_DPs / 2)
+#define GRED_VQ_MASK (MAX_DPs - 1)
+
+struct gred_sched_data;
+struct gred_sched;
+
+struct gred_sched_data {
+ u32 limit; /* HARD maximal queue length */
+ u32 DP; /* the drop parameters */
+ u32 bytesin; /* bytes seen on virtualQ so far*/
+ u32 packetsin; /* packets seen on virtualQ so far*/
+ u32 backlog; /* bytes on the virtualQ */
+ u8 prio; /* the prio of this vq */
+
+ struct red_parms parms;
+ struct red_vars vars;
+ struct red_stats stats;
+};
+
+enum {
+ GRED_WRED_MODE = 1,
+ GRED_RIO_MODE,
+};
+
+struct gred_sched {
+ struct gred_sched_data *tab[MAX_DPs];
+ unsigned long flags;
+ u32 red_flags;
+ u32 DPs;
+ u32 def;
+ struct red_vars wred_set;
+};
+
+static inline int gred_wred_mode(struct gred_sched *table)
+{
+ return test_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_enable_wred_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_disable_wred_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline int gred_rio_mode(struct gred_sched *table)
+{
+ return test_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_enable_rio_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_disable_rio_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline int gred_wred_mode_check(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ int i;
+
+ /* Really ugly O(n^2) but shouldn't be necessary too frequent. */
+ for (i = 0; i < table->DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ int n;
+
+ if (q == NULL)
+ continue;
+
+ for (n = i + 1; n < table->DPs; n++)
+ if (table->tab[n] && table->tab[n]->prio == q->prio)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline unsigned int gred_backlog(struct gred_sched *table,
+ struct gred_sched_data *q,
+ struct Qdisc *sch)
+{
+ if (gred_wred_mode(table))
+ return sch->qstats.backlog;
+ else
+ return q->backlog;
+}
+
+static inline u16 tc_index_to_dp(struct sk_buff *skb)
+{
+ return skb->tc_index & GRED_VQ_MASK;
+}
+
+static inline void gred_load_wred_set(const struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ q->vars.qavg = table->wred_set.qavg;
+ q->vars.qidlestart = table->wred_set.qidlestart;
+}
+
+static inline void gred_store_wred_set(struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ table->wred_set.qavg = q->vars.qavg;
+ table->wred_set.qidlestart = q->vars.qidlestart;
+}
+
+static inline int gred_use_ecn(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_ECN;
+}
+
+static inline int gred_use_harddrop(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_HARDDROP;
+}
+
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct gred_sched_data *q = NULL;
+ struct gred_sched *t = qdisc_priv(sch);
+ unsigned long qavg = 0;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ dp = t->def;
+
+ q = t->tab[dp];
+ if (!q) {
+ /* Pass through packets not assigned to a DP
+ * if no default DP has been configured. This
+ * allows for DP flows to be left untouched.
+ */
+ if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
+ return qdisc_enqueue_tail(skb, sch);
+ else
+ goto drop;
+ }
+
+ /* fix tc_index? --could be controversial but needed for
+ requeueing */
+ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
+ }
+
+ /* sum up all the qaves of prios < ours to get the new qave */
+ if (!gred_wred_mode(t) && gred_rio_mode(t)) {
+ int i;
+
+ for (i = 0; i < t->DPs; i++) {
+ if (t->tab[i] && t->tab[i]->prio < q->prio &&
+ !red_is_idling(&t->tab[i]->vars))
+ qavg += t->tab[i]->vars.qavg;
+ }
+
+ }
+
+ q->packetsin++;
+ q->bytesin += qdisc_pkt_len(skb);
+
+ if (gred_wred_mode(t))
+ gred_load_wred_set(t, q);
+
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ gred_backlog(t, q, sch));
+
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ if (gred_wred_mode(t))
+ gred_store_wred_set(t, q);
+
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
+ }
+
+ if (gred_backlog(t, q, sch) + qdisc_pkt_len(skb) <= q->limit) {
+ q->backlog += qdisc_pkt_len(skb);
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+ q->stats.pdrop++;
+drop:
+ return qdisc_drop(skb, sch);
+
+congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ skb = qdisc_dequeue_head(sch);
+
+ if (skb) {
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
+ tc_index_to_dp(skb));
+ } else {
+ q->backlog -= qdisc_pkt_len(skb);
+
+ if (gred_wred_mode(t)) {
+ if (!sch->qstats.backlog)
+ red_start_of_idle_period(&t->wred_set);
+ } else {
+ if (!q->backlog)
+ red_start_of_idle_period(&q->vars);
+ }
+ }
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static unsigned int gred_drop(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ skb = qdisc_dequeue_tail(sch);
+ if (skb) {
+ unsigned int len = qdisc_pkt_len(skb);
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
+ tc_index_to_dp(skb));
+ } else {
+ q->backlog -= len;
+ q->stats.other++;
+
+ if (gred_wred_mode(t)) {
+ if (!sch->qstats.backlog)
+ red_start_of_idle_period(&t->wred_set);
+ } else {
+ if (!q->backlog)
+ red_start_of_idle_period(&q->vars);
+ }
+ }
+
+ qdisc_drop(skb, sch);
+ return len;
+ }
+
+ return 0;
+}
+
+static void gred_reset(struct Qdisc *sch)
+{
+ int i;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+
+ for (i = 0; i < t->DPs; i++) {
+ struct gred_sched_data *q = t->tab[i];
+
+ if (!q)
+ continue;
+
+ red_restart(&q->vars);
+ q->backlog = 0;
+ }
+}
+
+static inline void gred_destroy_vq(struct gred_sched_data *q)
+{
+ kfree(q);
+}
+
+static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_sopt *sopt;
+ int i;
+
+ if (dps == NULL)
+ return -EINVAL;
+
+ sopt = nla_data(dps);
+
+ if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
+ return -EINVAL;
+
+ sch_tree_lock(sch);
+ table->DPs = sopt->DPs;
+ table->def = sopt->def_DP;
+ table->red_flags = sopt->flags;
+
+ /*
+ * Every entry point to GRED is synchronized with the above code
+ * and the DP is checked against DPs, i.e. shadowed VQs can no
+ * longer be found so we can unlock right here.
+ */
+ sch_tree_unlock(sch);
+
+ if (sopt->grio) {
+ gred_enable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
+ } else {
+ gred_disable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ }
+
+ for (i = table->DPs; i < MAX_DPs; i++) {
+ if (table->tab[i]) {
+ pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
+ i);
+ gred_destroy_vq(table->tab[i]);
+ table->tab[i] = NULL;
+ }
+ }
+
+ return 0;
+}
+
+static inline int gred_change_vq(struct Qdisc *sch, int dp,
+ struct tc_gred_qopt *ctl, int prio,
+ u8 *stab, u32 max_P,
+ struct gred_sched_data **prealloc)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct gred_sched_data *q = table->tab[dp];
+
+ if (!q) {
+ table->tab[dp] = q = *prealloc;
+ *prealloc = NULL;
+ if (!q)
+ return -ENOMEM;
+ }
+
+ q->DP = dp;
+ q->prio = prio;
+ q->limit = ctl->limit;
+
+ if (q->backlog == 0)
+ red_end_of_idle_period(&q->vars);
+
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
+ ctl->Scell_log, stab, max_P);
+ red_set_vars(&q->vars);
+ return 0;
+}
+
+static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
+ [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
+ [TCA_GRED_STAB] = { .len = 256 },
+ [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
+ [TCA_GRED_MAX_P] = { .type = NLA_U32 },
+};
+
+static int gred_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct tc_gred_qopt *ctl;
+ struct nlattr *tb[TCA_GRED_MAX + 1];
+ int err, prio = GRED_DEF_PRIO;
+ u8 *stab;
+ u32 max_P;
+ struct gred_sched_data *prealloc;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)
+ return gred_change_table_def(sch, opt);
+
+ if (tb[TCA_GRED_PARMS] == NULL ||
+ tb[TCA_GRED_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
+
+ err = -EINVAL;
+ ctl = nla_data(tb[TCA_GRED_PARMS]);
+ stab = nla_data(tb[TCA_GRED_STAB]);
+
+ if (ctl->DP >= table->DPs)
+ goto errout;
+
+ if (gred_rio_mode(table)) {
+ if (ctl->prio == 0) {
+ int def_prio = GRED_DEF_PRIO;
+
+ if (table->tab[table->def])
+ def_prio = table->tab[table->def]->prio;
+
+ printk(KERN_DEBUG "GRED: DP %u does not have a prio "
+ "setting default to %d\n", ctl->DP, def_prio);
+
+ prio = def_prio;
+ } else
+ prio = ctl->prio;
+ }
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ sch_tree_lock(sch);
+
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
+ if (err < 0)
+ goto errout_locked;
+
+ if (gred_rio_mode(table)) {
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
+ }
+
+ err = 0;
+
+errout_locked:
+ sch_tree_unlock(sch);
+ kfree(prealloc);
+errout:
+ return err;
+}
+
+static int gred_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct nlattr *tb[TCA_GRED_MAX + 1];
+ int err;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
+ return -EINVAL;
+
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+}
+
+static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct nlattr *parms, *opts = NULL;
+ int i;
+ u32 max_p[MAX_DPs];
+ struct tc_gred_sopt sopt = {
+ .DPs = table->DPs,
+ .def_DP = table->def,
+ .grio = gred_rio_mode(table),
+ .flags = table->red_flags,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+
+ max_p[i] = q ? q->parms.max_P : 0;
+ }
+ if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
+ goto nla_put_failure;
+
+ parms = nla_nest_start(skb, TCA_GRED_PARMS);
+ if (parms == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ struct tc_gred_qopt opt;
+ unsigned long qavg;
+
+ memset(&opt, 0, sizeof(opt));
+
+ if (!q) {
+ /* hack -- fix at some point with proper message
+ This is how we indicate to tc that there is no VQ
+ at this DP */
+
+ opt.DP = MAX_DPs + i;
+ goto append_opt;
+ }
+
+ opt.limit = q->limit;
+ opt.DP = q->DP;
+ opt.backlog = gred_backlog(table, q, sch);
+ opt.prio = q->prio;
+ opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
+ opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
+ opt.Wlog = q->parms.Wlog;
+ opt.Plog = q->parms.Plog;
+ opt.Scell_log = q->parms.Scell_log;
+ opt.other = q->stats.other;
+ opt.early = q->stats.prob_drop;
+ opt.forced = q->stats.forced_drop;
+ opt.pdrop = q->stats.pdrop;
+ opt.packets = q->packetsin;
+ opt.bytesin = q->bytesin;
+
+ if (gred_wred_mode(table))
+ gred_load_wred_set(table, q);
+
+ qavg = red_calc_qavg(&q->parms, &q->vars,
+ q->vars.qavg >> q->parms.Wlog);
+ opt.qave = qavg >> q->parms.Wlog;
+
+append_opt:
+ if (nla_append(skb, sizeof(opt), &opt) < 0)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, parms);
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static void gred_destroy(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ int i;
+
+ for (i = 0; i < table->DPs; i++) {
+ if (table->tab[i])
+ gred_destroy_vq(table->tab[i]);
+ }
+}
+
+static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
+ .id = "gred",
+ .priv_size = sizeof(struct gred_sched),
+ .enqueue = gred_enqueue,
+ .dequeue = gred_dequeue,
+ .peek = qdisc_peek_head,
+ .drop = gred_drop,
+ .init = gred_init,
+ .reset = gred_reset,
+ .destroy = gred_destroy,
+ .change = gred_change,
+ .dump = gred_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init gred_module_init(void)
+{
+ return register_qdisc(&gred_qdisc_ops);
+}
+
+static void __exit gred_module_exit(void)
+{
+ unregister_qdisc(&gred_qdisc_ops);
+}
+
+module_init(gred_module_init)
+module_exit(gred_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
new file mode 100644
index 000000000..e6c7416d0
--- /dev/null
+++ b/net/sched/sch_hfsc.c
@@ -0,0 +1,1754 @@
+/*
+ * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * 2003-10-17 - Ported from altq
+ */
+/*
+ * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation is hereby granted (including for commercial or
+ * for-profit use), provided that both the copyright notice and this
+ * permission notice appear in all copies of the software, derivative
+ * works, or modified versions, and any portions thereof.
+ *
+ * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
+ * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS
+ * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Carnegie Mellon encourages (but does not require) users of this
+ * software to return any improvements or extensions that they make,
+ * and to grant Carnegie Mellon the rights to redistribute these
+ * changes without encumbrance.
+ */
+/*
+ * H-FSC is described in Proceedings of SIGCOMM'97,
+ * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
+ * Real-Time and Priority Service"
+ * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
+ *
+ * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
+ * when a class has an upperlimit, the fit-time is computed from the
+ * upperlimit service curve. the link-sharing scheduler does not schedule
+ * a class whose fit-time exceeds the current time.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <asm/div64.h>
+
+/*
+ * kernel internal service curve representation:
+ * coordinates are given by 64 bit unsigned integers.
+ * x-axis: unit is clock count.
+ * y-axis: unit is byte.
+ *
+ * The service curve parameters are converted to the internal
+ * representation. The slope values are scaled to avoid overflow.
+ * the inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ */
+
+struct internal_sc {
+ u64 sm1; /* scaled slope of the 1st segment */
+ u64 ism1; /* scaled inverse-slope of the 1st segment */
+ u64 dx; /* the x-projection of the 1st segment */
+ u64 dy; /* the y-projection of the 1st segment */
+ u64 sm2; /* scaled slope of the 2nd segment */
+ u64 ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+/* runtime service curve */
+struct runtime_sc {
+ u64 x; /* current starting position on x-axis */
+ u64 y; /* current starting position on y-axis */
+ u64 sm1; /* scaled slope of the 1st segment */
+ u64 ism1; /* scaled inverse-slope of the 1st segment */
+ u64 dx; /* the x-projection of the 1st segment */
+ u64 dy; /* the y-projection of the 1st segment */
+ u64 sm2; /* scaled slope of the 2nd segment */
+ u64 ism2; /* scaled inverse-slope of the 2nd segment */
+};
+
+enum hfsc_class_flags {
+ HFSC_RSC = 0x1,
+ HFSC_FSC = 0x2,
+ HFSC_USC = 0x4
+};
+
+struct hfsc_class {
+ struct Qdisc_class_common cl_common;
+ unsigned int refcnt; /* usage count */
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct gnet_stats_rate_est64 rate_est;
+ unsigned int level; /* class level in hierarchy */
+ struct tcf_proto __rcu *filter_list; /* filter list */
+ unsigned int filter_cnt; /* filter count */
+
+ struct hfsc_sched *sched; /* scheduler data */
+ struct hfsc_class *cl_parent; /* parent class */
+ struct list_head siblings; /* sibling classes */
+ struct list_head children; /* child classes */
+ struct Qdisc *qdisc; /* leaf qdisc */
+
+ struct rb_node el_node; /* qdisc's eligible tree member */
+ struct rb_root vt_tree; /* active children sorted by cl_vt */
+ struct rb_node vt_node; /* parent's vt_tree member */
+ struct rb_root cf_tree; /* active children sorted by cl_f */
+ struct rb_node cf_node; /* parent's cf_heap member */
+ struct list_head dlist; /* drop list member */
+
+ u64 cl_total; /* total work in bytes */
+ u64 cl_cumul; /* cumulative work in bytes done by
+ real-time criteria */
+
+ u64 cl_d; /* deadline*/
+ u64 cl_e; /* eligible time */
+ u64 cl_vt; /* virtual time */
+ u64 cl_f; /* time when this class will fit for
+ link-sharing, max(myf, cfmin) */
+ u64 cl_myf; /* my fit-time (calculated from this
+ class's own upperlimit curve) */
+ u64 cl_myfadj; /* my fit-time adjustment (to cancel
+ history dependence) */
+ u64 cl_cfmin; /* earliest children's fit-time (used
+ with cl_myf to obtain cl_f) */
+ u64 cl_cvtmin; /* minimal virtual time among the
+ children fit for link-sharing
+ (monotonic within a period) */
+ u64 cl_vtadj; /* intra-period cumulative vt
+ adjustment */
+ u64 cl_vtoff; /* inter-period cumulative vt offset */
+ u64 cl_cvtmax; /* max child's vt in the last period */
+ u64 cl_cvtoff; /* cumulative cvtmax of all periods */
+ u64 cl_pcvtoff; /* parent's cvtoff at initialization
+ time */
+
+ struct internal_sc cl_rsc; /* internal real-time service curve */
+ struct internal_sc cl_fsc; /* internal fair service curve */
+ struct internal_sc cl_usc; /* internal upperlimit service curve */
+ struct runtime_sc cl_deadline; /* deadline curve */
+ struct runtime_sc cl_eligible; /* eligible curve */
+ struct runtime_sc cl_virtual; /* virtual curve */
+ struct runtime_sc cl_ulimit; /* upperlimit curve */
+
+ unsigned long cl_flags; /* which curves are valid */
+ unsigned long cl_vtperiod; /* vt period sequence number */
+ unsigned long cl_parentperiod;/* parent's vt period sequence number*/
+ unsigned long cl_nactive; /* number of active children */
+};
+
+struct hfsc_sched {
+ u16 defcls; /* default class id */
+ struct hfsc_class root; /* root class */
+ struct Qdisc_class_hash clhash; /* class hash */
+ struct rb_root eligible; /* eligible tree */
+ struct list_head droplist; /* active leaf class list (for
+ dropping) */
+ struct qdisc_watchdog watchdog; /* watchdog timer */
+};
+
+#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+
+
+/*
+ * eligible tree holds backlogged classes being sorted by their eligible times.
+ * there is one eligible tree per hfsc instance.
+ */
+
+static void
+eltree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->sched->eligible.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, el_node);
+ if (cl->cl_e >= cl1->cl_e)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->el_node, parent, p);
+ rb_insert_color(&cl->el_node, &cl->sched->eligible);
+}
+
+static inline void
+eltree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->el_node, &cl->sched->eligible);
+}
+
+static inline void
+eltree_update(struct hfsc_class *cl)
+{
+ eltree_remove(cl);
+ eltree_insert(cl);
+}
+
+/* find the class with the minimum deadline among the eligible classes */
+static inline struct hfsc_class *
+eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
+{
+ struct hfsc_class *p, *cl = NULL;
+ struct rb_node *n;
+
+ for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) {
+ p = rb_entry(n, struct hfsc_class, el_node);
+ if (p->cl_e > cur_time)
+ break;
+ if (cl == NULL || p->cl_d < cl->cl_d)
+ cl = p;
+ }
+ return cl;
+}
+
+/* find the class with minimum eligible time among the eligible classes */
+static inline struct hfsc_class *
+eltree_get_minel(struct hfsc_sched *q)
+{
+ struct rb_node *n;
+
+ n = rb_first(&q->eligible);
+ if (n == NULL)
+ return NULL;
+ return rb_entry(n, struct hfsc_class, el_node);
+}
+
+/*
+ * vttree holds holds backlogged child classes being sorted by their virtual
+ * time. each intermediate class has one vttree.
+ */
+static void
+vttree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->cl_parent->vt_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, vt_node);
+ if (cl->cl_vt >= cl1->cl_vt)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->vt_node, parent, p);
+ rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+static inline void
+vttree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+static inline void
+vttree_update(struct hfsc_class *cl)
+{
+ vttree_remove(cl);
+ vttree_insert(cl);
+}
+
+static inline struct hfsc_class *
+vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
+{
+ struct hfsc_class *p;
+ struct rb_node *n;
+
+ for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) {
+ p = rb_entry(n, struct hfsc_class, vt_node);
+ if (p->cl_f <= cur_time)
+ return p;
+ }
+ return NULL;
+}
+
+/*
+ * get the leaf class with the minimum vt in the hierarchy
+ */
+static struct hfsc_class *
+vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
+{
+ /* if root-class's cfmin is bigger than cur_time nothing to do */
+ if (cl->cl_cfmin > cur_time)
+ return NULL;
+
+ while (cl->level > 0) {
+ cl = vttree_firstfit(cl, cur_time);
+ if (cl == NULL)
+ return NULL;
+ /*
+ * update parent's cl_cvtmin.
+ */
+ if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
+ cl->cl_parent->cl_cvtmin = cl->cl_vt;
+ }
+ return cl;
+}
+
+static void
+cftree_insert(struct hfsc_class *cl)
+{
+ struct rb_node **p = &cl->cl_parent->cf_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct hfsc_class *cl1;
+
+ while (*p != NULL) {
+ parent = *p;
+ cl1 = rb_entry(parent, struct hfsc_class, cf_node);
+ if (cl->cl_f >= cl1->cl_f)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->cf_node, parent, p);
+ rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+static inline void
+cftree_remove(struct hfsc_class *cl)
+{
+ rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+static inline void
+cftree_update(struct hfsc_class *cl)
+{
+ cftree_remove(cl);
+ cftree_insert(cl);
+}
+
+/*
+ * service curve support functions
+ *
+ * external service curve parameters
+ * m: bps
+ * d: us
+ * internal service curve parameters
+ * sm: (bytes/psched_us) << SM_SHIFT
+ * ism: (psched_us/byte) << ISM_SHIFT
+ * dx: psched_us
+ *
+ * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us.
+ *
+ * sm and ism are scaled in order to keep effective digits.
+ * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
+ * digits in decimal using the following table.
+ *
+ * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
+ * ------------+-------------------------------------------------------
+ * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3
+ *
+ * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125
+ *
+ * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18.
+ */
+#define SM_SHIFT (30 - PSCHED_SHIFT)
+#define ISM_SHIFT (8 + PSCHED_SHIFT)
+
+#define SM_MASK ((1ULL << SM_SHIFT) - 1)
+#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
+
+static inline u64
+seg_x2y(u64 x, u64 sm)
+{
+ u64 y;
+
+ /*
+ * compute
+ * y = x * sm >> SM_SHIFT
+ * but divide it for the upper and lower bits to avoid overflow
+ */
+ y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+ return y;
+}
+
+static inline u64
+seg_y2x(u64 y, u64 ism)
+{
+ u64 x;
+
+ if (y == 0)
+ x = 0;
+ else if (ism == HT_INFINITY)
+ x = HT_INFINITY;
+ else {
+ x = (y >> ISM_SHIFT) * ism
+ + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+ }
+ return x;
+}
+
+/* Convert m (bps) into sm (bytes/psched us) */
+static u64
+m2sm(u32 m)
+{
+ u64 sm;
+
+ sm = ((u64)m << SM_SHIFT);
+ sm += PSCHED_TICKS_PER_SEC - 1;
+ do_div(sm, PSCHED_TICKS_PER_SEC);
+ return sm;
+}
+
+/* convert m (bps) into ism (psched us/byte) */
+static u64
+m2ism(u32 m)
+{
+ u64 ism;
+
+ if (m == 0)
+ ism = HT_INFINITY;
+ else {
+ ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT);
+ ism += m - 1;
+ do_div(ism, m);
+ }
+ return ism;
+}
+
+/* convert d (us) into dx (psched us) */
+static u64
+d2dx(u32 d)
+{
+ u64 dx;
+
+ dx = ((u64)d * PSCHED_TICKS_PER_SEC);
+ dx += USEC_PER_SEC - 1;
+ do_div(dx, USEC_PER_SEC);
+ return dx;
+}
+
+/* convert sm (bytes/psched us) into m (bps) */
+static u32
+sm2m(u64 sm)
+{
+ u64 m;
+
+ m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT;
+ return (u32)m;
+}
+
+/* convert dx (psched us) into d (us) */
+static u32
+dx2d(u64 dx)
+{
+ u64 d;
+
+ d = dx * USEC_PER_SEC;
+ do_div(d, PSCHED_TICKS_PER_SEC);
+ return (u32)d;
+}
+
+static void
+sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
+{
+ isc->sm1 = m2sm(sc->m1);
+ isc->ism1 = m2ism(sc->m1);
+ isc->dx = d2dx(sc->d);
+ isc->dy = seg_x2y(isc->dx, isc->sm1);
+ isc->sm2 = m2sm(sc->m2);
+ isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->sm1 = isc->sm1;
+ rtsc->ism1 = isc->ism1;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ rtsc->sm2 = isc->sm2;
+ rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value
+ */
+static u64
+rtsc_y2x(struct runtime_sc *rtsc, u64 y)
+{
+ u64 x;
+
+ if (y < rtsc->y)
+ x = rtsc->x;
+ else if (y <= rtsc->y + rtsc->dy) {
+ /* x belongs to the 1st segment */
+ if (rtsc->dy == 0)
+ x = rtsc->x + rtsc->dx;
+ else
+ x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+ } else {
+ /* x belongs to the 2nd segment */
+ x = rtsc->x + rtsc->dx
+ + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+ }
+ return x;
+}
+
+static u64
+rtsc_x2y(struct runtime_sc *rtsc, u64 x)
+{
+ u64 y;
+
+ if (x <= rtsc->x)
+ y = rtsc->y;
+ else if (x <= rtsc->x + rtsc->dx)
+ /* y belongs to the 1st segment */
+ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+ else
+ /* y belongs to the 2nd segment */
+ y = rtsc->y + rtsc->dy
+ + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+ return y;
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+ u64 y1, y2, dx, dy;
+ u32 dsm;
+
+ if (isc->sm1 <= isc->sm2) {
+ /* service curve is convex */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 < y)
+ /* the current rtsc is smaller */
+ return;
+ rtsc->x = x;
+ rtsc->y = y;
+ return;
+ }
+
+ /*
+ * service curve is concave
+ * compute the two y values of the current rtsc
+ * y1: at x
+ * y2: at (x + dx)
+ */
+ y1 = rtsc_x2y(rtsc, x);
+ if (y1 <= y) {
+ /* rtsc is below isc, no change to rtsc */
+ return;
+ }
+
+ y2 = rtsc_x2y(rtsc, x + isc->dx);
+ if (y2 >= y + isc->dy) {
+ /* rtsc is above isc, replace rtsc by isc */
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = isc->dx;
+ rtsc->dy = isc->dy;
+ return;
+ }
+
+ /*
+ * the two curves intersect
+ * compute the offsets (dx, dy) using the reverse
+ * function of seg_x2y()
+ * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+ */
+ dx = (y1 - y) << SM_SHIFT;
+ dsm = isc->sm1 - isc->sm2;
+ do_div(dx, dsm);
+ /*
+ * check if (x, y1) belongs to the 1st segment of rtsc.
+ * if so, add the offset.
+ */
+ if (rtsc->x + rtsc->dx > x)
+ dx += rtsc->x + rtsc->dx - x;
+ dy = seg_x2y(dx, isc->sm1);
+
+ rtsc->x = x;
+ rtsc->y = y;
+ rtsc->dx = dx;
+ rtsc->dy = dy;
+}
+
+static void
+init_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+ u64 cur_time = psched_get_time();
+
+ /* update the deadline curve */
+ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+
+ /*
+ * update the eligible curve.
+ * for concave, it is equal to the deadline curve.
+ * for convex, it is a linear curve with slope m2.
+ */
+ cl->cl_eligible = cl->cl_deadline;
+ if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
+ cl->cl_eligible.dx = 0;
+ cl->cl_eligible.dy = 0;
+ }
+
+ /* compute e and d */
+ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+ eltree_insert(cl);
+}
+
+static void
+update_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+ eltree_update(cl);
+}
+
+static inline void
+update_d(struct hfsc_class *cl, unsigned int next_len)
+{
+ cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+}
+
+static inline void
+update_cfmin(struct hfsc_class *cl)
+{
+ struct rb_node *n = rb_first(&cl->cf_tree);
+ struct hfsc_class *p;
+
+ if (n == NULL) {
+ cl->cl_cfmin = 0;
+ return;
+ }
+ p = rb_entry(n, struct hfsc_class, cf_node);
+ cl->cl_cfmin = p->cl_f;
+}
+
+static void
+init_vf(struct hfsc_class *cl, unsigned int len)
+{
+ struct hfsc_class *max_cl;
+ struct rb_node *n;
+ u64 vt, f, cur_time;
+ int go_active;
+
+ cur_time = 0;
+ go_active = 1;
+ for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+ if (go_active && cl->cl_nactive++ == 0)
+ go_active = 1;
+ else
+ go_active = 0;
+
+ if (go_active) {
+ n = rb_last(&cl->cl_parent->vt_tree);
+ if (n != NULL) {
+ max_cl = rb_entry(n, struct hfsc_class, vt_node);
+ /*
+ * set vt to the average of the min and max
+ * classes. if the parent's period didn't
+ * change, don't decrease vt of the class.
+ */
+ vt = max_cl->cl_vt;
+ if (cl->cl_parent->cl_cvtmin != 0)
+ vt = (cl->cl_parent->cl_cvtmin + vt)/2;
+
+ if (cl->cl_parent->cl_vtperiod !=
+ cl->cl_parentperiod || vt > cl->cl_vt)
+ cl->cl_vt = vt;
+ } else {
+ /*
+ * first child for a new parent backlog period.
+ * add parent's cvtmax to cvtoff to make a new
+ * vt (vtoff + vt) larger than the vt in the
+ * last period for all children.
+ */
+ vt = cl->cl_parent->cl_cvtmax;
+ cl->cl_parent->cl_cvtoff += vt;
+ cl->cl_parent->cl_cvtmax = 0;
+ cl->cl_parent->cl_cvtmin = 0;
+ cl->cl_vt = 0;
+ }
+
+ cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
+ cl->cl_pcvtoff;
+
+ /* update the virtual curve */
+ vt = cl->cl_vt + cl->cl_vtoff;
+ rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
+ cl->cl_total);
+ if (cl->cl_virtual.x == vt) {
+ cl->cl_virtual.x -= cl->cl_vtoff;
+ cl->cl_vtoff = 0;
+ }
+ cl->cl_vtadj = 0;
+
+ cl->cl_vtperiod++; /* increment vt period */
+ cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
+ if (cl->cl_parent->cl_nactive == 0)
+ cl->cl_parentperiod++;
+ cl->cl_f = 0;
+
+ vttree_insert(cl);
+ cftree_insert(cl);
+
+ if (cl->cl_flags & HFSC_USC) {
+ /* class has upper limit curve */
+ if (cur_time == 0)
+ cur_time = psched_get_time();
+
+ /* update the ulimit curve */
+ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
+ cl->cl_total);
+ /* compute myf */
+ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
+ cl->cl_total);
+ cl->cl_myfadj = 0;
+ }
+ }
+
+ f = max(cl->cl_myf, cl->cl_cfmin);
+ if (f != cl->cl_f) {
+ cl->cl_f = f;
+ cftree_update(cl);
+ }
+ update_cfmin(cl->cl_parent);
+ }
+}
+
+static void
+update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
+{
+ u64 f; /* , myf_bound, delta; */
+ int go_passive = 0;
+
+ if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
+ go_passive = 1;
+
+ for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+ cl->cl_total += len;
+
+ if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
+ continue;
+
+ if (go_passive && --cl->cl_nactive == 0)
+ go_passive = 1;
+ else
+ go_passive = 0;
+
+ if (go_passive) {
+ /* no more active child, going passive */
+
+ /* update cvtmax of the parent class */
+ if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
+ cl->cl_parent->cl_cvtmax = cl->cl_vt;
+
+ /* remove this class from the vt tree */
+ vttree_remove(cl);
+
+ cftree_remove(cl);
+ update_cfmin(cl->cl_parent);
+
+ continue;
+ }
+
+ /*
+ * update vt and f
+ */
+ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+ - cl->cl_vtoff + cl->cl_vtadj;
+
+ /*
+ * if vt of the class is smaller than cvtmin,
+ * the class was skipped in the past due to non-fit.
+ * if so, we need to adjust vtadj.
+ */
+ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+ cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+ cl->cl_vt = cl->cl_parent->cl_cvtmin;
+ }
+
+ /* update the vt tree */
+ vttree_update(cl);
+
+ if (cl->cl_flags & HFSC_USC) {
+ cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
+ cl->cl_total);
+#if 0
+ /*
+ * This code causes classes to stay way under their
+ * limit when multiple classes are used at gigabit
+ * speed. needs investigation. -kaber
+ */
+ /*
+ * if myf lags behind by more than one clock tick
+ * from the current time, adjust myfadj to prevent
+ * a rate-limited class from going greedy.
+ * in a steady state under rate-limiting, myf
+ * fluctuates within one clock tick.
+ */
+ myf_bound = cur_time - PSCHED_JIFFIE2US(1);
+ if (cl->cl_myf < myf_bound) {
+ delta = cur_time - cl->cl_myf;
+ cl->cl_myfadj += delta;
+ cl->cl_myf += delta;
+ }
+#endif
+ }
+
+ f = max(cl->cl_myf, cl->cl_cfmin);
+ if (f != cl->cl_f) {
+ cl->cl_f = f;
+ cftree_update(cl);
+ update_cfmin(cl->cl_parent);
+ }
+ }
+}
+
+static void
+set_active(struct hfsc_class *cl, unsigned int len)
+{
+ if (cl->cl_flags & HFSC_RSC)
+ init_ed(cl, len);
+ if (cl->cl_flags & HFSC_FSC)
+ init_vf(cl, len);
+
+ list_add_tail(&cl->dlist, &cl->sched->droplist);
+}
+
+static void
+set_passive(struct hfsc_class *cl)
+{
+ if (cl->cl_flags & HFSC_RSC)
+ eltree_remove(cl);
+
+ list_del(&cl->dlist);
+
+ /*
+ * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
+ * needs to be called explicitly to remove a class from vttree.
+ */
+}
+
+static unsigned int
+qdisc_peek_len(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ unsigned int len;
+
+ skb = sch->ops->peek(sch);
+ if (skb == NULL) {
+ qdisc_warn_nonwc("qdisc_peek_len", sch);
+ return 0;
+ }
+ len = qdisc_pkt_len(skb);
+
+ return len;
+}
+
+static void
+hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_decrease_qlen(cl->qdisc, len);
+}
+
+static void
+hfsc_adjust_levels(struct hfsc_class *cl)
+{
+ struct hfsc_class *p;
+ unsigned int level;
+
+ do {
+ level = 0;
+ list_for_each_entry(p, &cl->children, siblings) {
+ if (p->level >= level)
+ level = p->level + 1;
+ }
+ cl->level = level;
+ } while ((cl = cl->cl_parent) != NULL);
+}
+
+static inline struct hfsc_class *
+hfsc_find_class(u32 classid, struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct hfsc_class, cl_common);
+}
+
+static void
+hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
+ u64 cur_time)
+{
+ sc2isc(rsc, &cl->cl_rsc);
+ rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+ cl->cl_eligible = cl->cl_deadline;
+ if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
+ cl->cl_eligible.dx = 0;
+ cl->cl_eligible.dy = 0;
+ }
+ cl->cl_flags |= HFSC_RSC;
+}
+
+static void
+hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
+{
+ sc2isc(fsc, &cl->cl_fsc);
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ cl->cl_flags |= HFSC_FSC;
+}
+
+static void
+hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
+ u64 cur_time)
+{
+ sc2isc(usc, &cl->cl_usc);
+ rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
+ cl->cl_flags |= HFSC_USC;
+}
+
+static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
+ [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) },
+ [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) },
+ [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) },
+};
+
+static int
+hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)*arg;
+ struct hfsc_class *parent = NULL;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_HFSC_MAX + 1];
+ struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
+ u64 cur_time;
+ int err;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_HFSC_RSC]) {
+ rsc = nla_data(tb[TCA_HFSC_RSC]);
+ if (rsc->m1 == 0 && rsc->m2 == 0)
+ rsc = NULL;
+ }
+
+ if (tb[TCA_HFSC_FSC]) {
+ fsc = nla_data(tb[TCA_HFSC_FSC]);
+ if (fsc->m1 == 0 && fsc->m2 == 0)
+ fsc = NULL;
+ }
+
+ if (tb[TCA_HFSC_USC]) {
+ usc = nla_data(tb[TCA_HFSC_USC]);
+ if (usc->m1 == 0 && usc->m2 == 0)
+ usc = NULL;
+ }
+
+ if (cl != NULL) {
+ if (parentid) {
+ if (cl->cl_parent &&
+ cl->cl_parent->cl_common.classid != parentid)
+ return -EINVAL;
+ if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
+ return -EINVAL;
+ }
+ cur_time = psched_get_time();
+
+ if (tca[TCA_RATE]) {
+ spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ lock,
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+
+ sch_tree_lock(sch);
+ if (rsc != NULL)
+ hfsc_change_rsc(cl, rsc, cur_time);
+ if (fsc != NULL)
+ hfsc_change_fsc(cl, fsc);
+ if (usc != NULL)
+ hfsc_change_usc(cl, usc, cur_time);
+
+ if (cl->qdisc->q.qlen != 0) {
+ if (cl->cl_flags & HFSC_RSC)
+ update_ed(cl, qdisc_peek_len(cl->qdisc));
+ if (cl->cl_flags & HFSC_FSC)
+ update_vf(cl, 0, cur_time);
+ }
+ sch_tree_unlock(sch);
+
+ return 0;
+ }
+
+ if (parentid == TC_H_ROOT)
+ return -EEXIST;
+
+ parent = &q->root;
+ if (parentid) {
+ parent = hfsc_find_class(parentid, sch);
+ if (parent == NULL)
+ return -ENOENT;
+ }
+
+ if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
+ return -EINVAL;
+ if (hfsc_find_class(classid, sch))
+ return -EEXIST;
+
+ if (rsc == NULL && fsc == NULL)
+ return -EINVAL;
+
+ cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err) {
+ kfree(cl);
+ return err;
+ }
+ }
+
+ if (rsc != NULL)
+ hfsc_change_rsc(cl, rsc, 0);
+ if (fsc != NULL)
+ hfsc_change_fsc(cl, fsc);
+ if (usc != NULL)
+ hfsc_change_usc(cl, usc, 0);
+
+ cl->cl_common.classid = classid;
+ cl->refcnt = 1;
+ cl->sched = q;
+ cl->cl_parent = parent;
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+ INIT_LIST_HEAD(&cl->children);
+ cl->vt_tree = RB_ROOT;
+ cl->cf_tree = RB_ROOT;
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
+ list_add_tail(&cl->siblings, &parent->children);
+ if (parent->level == 0)
+ hfsc_purge_queue(sch, parent);
+ hfsc_adjust_levels(parent);
+ cl->cl_pcvtoff = parent->cl_cvtoff;
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+}
+
+static void
+hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&cl->filter_list);
+ qdisc_destroy(cl->qdisc);
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ if (cl != &q->root)
+ kfree(cl);
+}
+
+static int
+hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ list_del(&cl->siblings);
+ hfsc_adjust_levels(cl->cl_parent);
+
+ hfsc_purge_queue(sch, cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct hfsc_class *
+hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *head, *cl;
+ struct tcf_result res;
+ struct tcf_proto *tcf;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
+ (cl = hfsc_find_class(skb->priority, sch)) != NULL)
+ if (cl->level == 0)
+ return cl;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ head = &q->root;
+ tcf = rcu_dereference_bh(q->root.filter_list);
+ while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct hfsc_class *)res.class;
+ if (!cl) {
+ cl = hfsc_find_class(res.classid, sch);
+ if (!cl)
+ break; /* filter selected invalid classid */
+ if (cl->level >= head->level)
+ break; /* filter may only point downwards */
+ }
+
+ if (cl->level == 0)
+ return cl; /* hit leaf class */
+
+ /* apply inner filter chain */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ head = cl;
+ }
+
+ /* classification failed, try default class */
+ cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+ if (cl == NULL || cl->level > 0)
+ return NULL;
+
+ return cl;
+}
+
+static int
+hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level > 0)
+ return -EINVAL;
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->cl_common.classid);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ sch_tree_lock(sch);
+ hfsc_purge_queue(sch, cl);
+ *old = cl->qdisc;
+ cl->qdisc = new;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *
+hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->level == 0)
+ return cl->qdisc;
+
+ return NULL;
+}
+
+static void
+hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl->qdisc->q.qlen == 0) {
+ update_vf(cl, 0, 0);
+ set_passive(cl);
+ }
+}
+
+static unsigned long
+hfsc_get_class(struct Qdisc *sch, u32 classid)
+{
+ struct hfsc_class *cl = hfsc_find_class(classid, sch);
+
+ if (cl != NULL)
+ cl->refcnt++;
+
+ return (unsigned long)cl;
+}
+
+static void
+hfsc_put_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (--cl->refcnt == 0)
+ hfsc_destroy_class(sch, cl);
+}
+
+static unsigned long
+hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ struct hfsc_class *p = (struct hfsc_class *)parent;
+ struct hfsc_class *cl = hfsc_find_class(classid, sch);
+
+ if (cl != NULL) {
+ if (p != NULL && p->level <= cl->level)
+ return 0;
+ cl->filter_cnt++;
+ }
+
+ return (unsigned long)cl;
+}
+
+static void
+hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static struct tcf_proto __rcu **
+hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+ if (cl == NULL)
+ cl = &q->root;
+
+ return &cl->filter_list;
+}
+
+static int
+hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
+{
+ struct tc_service_curve tsc;
+
+ tsc.m1 = sm2m(sc->sm1);
+ tsc.d = dx2d(sc->dx);
+ tsc.m2 = sm2m(sc->sm2);
+ if (nla_put(skb, attr, sizeof(tsc), &tsc))
+ goto nla_put_failure;
+
+ return skb->len;
+
+ nla_put_failure:
+ return -1;
+}
+
+static int
+hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
+{
+ if ((cl->cl_flags & HFSC_RSC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0))
+ goto nla_put_failure;
+
+ if ((cl->cl_flags & HFSC_FSC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0))
+ goto nla_put_failure;
+
+ if ((cl->cl_flags & HFSC_USC) &&
+ (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0))
+ goto nla_put_failure;
+
+ return skb->len;
+
+ nla_put_failure:
+ return -1;
+}
+
+static int
+hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
+ struct tcmsg *tcm)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid :
+ TC_H_ROOT;
+ tcm->tcm_handle = cl->cl_common.classid;
+ if (cl->level == 0)
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (hfsc_dump_curves(skb, cl) < 0)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+ nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct hfsc_class *cl = (struct hfsc_class *)arg;
+ struct tc_hfsc_stats xstats;
+
+ cl->qstats.backlog = cl->qdisc->qstats.backlog;
+ xstats.level = cl->level;
+ xstats.period = cl->cl_vtperiod;
+ xstats.work = cl->cl_total;
+ xstats.rtwork = cl->cl_cumul;
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+
+
+static void
+hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i],
+ cl_common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static void
+hfsc_schedule_watchdog(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ u64 next_time = 0;
+
+ cl = eltree_get_minel(q);
+ if (cl)
+ next_time = cl->cl_e;
+ if (q->root.cl_cfmin != 0) {
+ if (next_time == 0 || next_time > q->root.cl_cfmin)
+ next_time = q->root.cl_cfmin;
+ }
+ WARN_ON(next_time == 0);
+ qdisc_watchdog_schedule(&q->watchdog, next_time);
+}
+
+static int
+hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct tc_hfsc_qopt *qopt;
+ int err;
+
+ if (opt == NULL || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ q->defcls = qopt->defcls;
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+ q->eligible = RB_ROOT;
+ INIT_LIST_HEAD(&q->droplist);
+
+ q->root.cl_common.classid = sch->handle;
+ q->root.refcnt = 1;
+ q->root.sched = q;
+ q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle);
+ if (q->root.qdisc == NULL)
+ q->root.qdisc = &noop_qdisc;
+ INIT_LIST_HEAD(&q->root.children);
+ q->root.vt_tree = RB_ROOT;
+ q->root.cf_tree = RB_ROOT;
+
+ qdisc_class_hash_insert(&q->clhash, &q->root.cl_common);
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ return 0;
+}
+
+static int
+hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct tc_hfsc_qopt *qopt;
+
+ if (opt == NULL || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ sch_tree_lock(sch);
+ q->defcls = qopt->defcls;
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static void
+hfsc_reset_class(struct hfsc_class *cl)
+{
+ cl->cl_total = 0;
+ cl->cl_cumul = 0;
+ cl->cl_d = 0;
+ cl->cl_e = 0;
+ cl->cl_vt = 0;
+ cl->cl_vtadj = 0;
+ cl->cl_vtoff = 0;
+ cl->cl_cvtmin = 0;
+ cl->cl_cvtmax = 0;
+ cl->cl_cvtoff = 0;
+ cl->cl_pcvtoff = 0;
+ cl->cl_vtperiod = 0;
+ cl->cl_parentperiod = 0;
+ cl->cl_f = 0;
+ cl->cl_myf = 0;
+ cl->cl_myfadj = 0;
+ cl->cl_cfmin = 0;
+ cl->cl_nactive = 0;
+
+ cl->vt_tree = RB_ROOT;
+ cl->cf_tree = RB_ROOT;
+ qdisc_reset(cl->qdisc);
+
+ if (cl->cl_flags & HFSC_RSC)
+ rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
+ if (cl->cl_flags & HFSC_FSC)
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
+ if (cl->cl_flags & HFSC_USC)
+ rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
+}
+
+static void
+hfsc_reset_qdisc(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+ hfsc_reset_class(cl);
+ }
+ q->eligible = RB_ROOT;
+ INIT_LIST_HEAD(&q->droplist);
+ qdisc_watchdog_cancel(&q->watchdog);
+ sch->q.qlen = 0;
+}
+
+static void
+hfsc_destroy_qdisc(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+ tcf_destroy_chain(&cl->filter_list);
+ }
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ cl_common.hnode)
+ hfsc_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int
+hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_hfsc_qopt qopt;
+ struct hfsc_class *cl;
+ unsigned int i;
+
+ sch->qstats.backlog = 0;
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+ sch->qstats.backlog += cl->qdisc->qstats.backlog;
+ }
+
+ qopt.defcls = q->defcls;
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
+ return skb->len;
+
+ nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hfsc_class *cl;
+ int uninitialized_var(err);
+
+ cl = hfsc_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (cl->qdisc->q.qlen == 1)
+ set_active(cl, qdisc_pkt_len(skb));
+
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+hfsc_dequeue(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ struct sk_buff *skb;
+ u64 cur_time;
+ unsigned int next_len;
+ int realtime = 0;
+
+ if (sch->q.qlen == 0)
+ return NULL;
+
+ cur_time = psched_get_time();
+
+ /*
+ * if there are eligible classes, use real-time criteria.
+ * find the class with the minimum deadline among
+ * the eligible classes.
+ */
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl) {
+ realtime = 1;
+ } else {
+ /*
+ * use link-sharing criteria
+ * get the class with the minimum vt in the hierarchy
+ */
+ cl = vttree_get_minvt(&q->root, cur_time);
+ if (cl == NULL) {
+ qdisc_qstats_overlimit(sch);
+ hfsc_schedule_watchdog(sch);
+ return NULL;
+ }
+ }
+
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb == NULL) {
+ qdisc_warn_nonwc("HFSC", cl->qdisc);
+ return NULL;
+ }
+
+ bstats_update(&cl->bstats, skb);
+ update_vf(cl, qdisc_pkt_len(skb), cur_time);
+ if (realtime)
+ cl->cl_cumul += qdisc_pkt_len(skb);
+
+ if (cl->qdisc->q.qlen != 0) {
+ if (cl->cl_flags & HFSC_RSC) {
+ /* update ed */
+ next_len = qdisc_peek_len(cl->qdisc);
+ if (realtime)
+ update_ed(cl, next_len);
+ else
+ update_d(cl, next_len);
+ }
+ } else {
+ /* the class becomes passive */
+ set_passive(cl);
+ }
+
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static unsigned int
+hfsc_drop(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ unsigned int len;
+
+ list_for_each_entry(cl, &q->droplist, dlist) {
+ if (cl->qdisc->ops->drop != NULL &&
+ (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
+ if (cl->qdisc->q.qlen == 0) {
+ update_vf(cl, 0, 0);
+ set_passive(cl);
+ } else {
+ list_move_tail(&cl->dlist, &q->droplist);
+ }
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ sch->q.qlen--;
+ return len;
+ }
+ }
+ return 0;
+}
+
+static const struct Qdisc_class_ops hfsc_class_ops = {
+ .change = hfsc_change_class,
+ .delete = hfsc_delete_class,
+ .graft = hfsc_graft_class,
+ .leaf = hfsc_class_leaf,
+ .qlen_notify = hfsc_qlen_notify,
+ .get = hfsc_get_class,
+ .put = hfsc_put_class,
+ .bind_tcf = hfsc_bind_tcf,
+ .unbind_tcf = hfsc_unbind_tcf,
+ .tcf_chain = hfsc_tcf_chain,
+ .dump = hfsc_dump_class,
+ .dump_stats = hfsc_dump_class_stats,
+ .walk = hfsc_walk
+};
+
+static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
+ .id = "hfsc",
+ .init = hfsc_init_qdisc,
+ .change = hfsc_change_qdisc,
+ .reset = hfsc_reset_qdisc,
+ .destroy = hfsc_destroy_qdisc,
+ .dump = hfsc_dump_qdisc,
+ .enqueue = hfsc_enqueue,
+ .dequeue = hfsc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = hfsc_drop,
+ .cl_ops = &hfsc_class_ops,
+ .priv_size = sizeof(struct hfsc_sched),
+ .owner = THIS_MODULE
+};
+
+static int __init
+hfsc_init(void)
+{
+ return register_qdisc(&hfsc_qdisc_ops);
+}
+
+static void __exit
+hfsc_cleanup(void)
+{
+ unregister_qdisc(&hfsc_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(hfsc_init);
+module_exit(hfsc_cleanup);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
new file mode 100644
index 000000000..15d3aabfe
--- /dev/null
+++ b/net/sched/sch_hhf.c
@@ -0,0 +1,740 @@
+/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
+ *
+ * Copyright (C) 2013 Terry Lam <vtlam@google.com>
+ * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <net/flow_keys.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+/* Heavy-Hitter Filter (HHF)
+ *
+ * Principles :
+ * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
+ * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
+ * as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
+ * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
+ * in which the heavy-hitter bucket is served with less weight.
+ * In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
+ * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
+ * higher share of bandwidth.
+ *
+ * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
+ * following paper:
+ * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
+ * Accounting", in ACM SIGCOMM, 2002.
+ *
+ * Conceptually, a multi-stage filter comprises k independent hash functions
+ * and k counter arrays. Packets are indexed into k counter arrays by k hash
+ * functions, respectively. The counters are then increased by the packet sizes.
+ * Therefore,
+ * - For a heavy-hitter flow: *all* of its k array counters must be large.
+ * - For a non-heavy-hitter flow: some of its k array counters can be large
+ * due to hash collision with other small flows; however, with high
+ * probability, not *all* k counters are large.
+ *
+ * By the design of the multi-stage filter algorithm, the false negative rate
+ * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
+ * susceptible to false positives (non-heavy-hitters mistakenly classified as
+ * heavy-hitters).
+ * Therefore, we also implement the following optimizations to reduce false
+ * positives by avoiding unnecessary increment of the counter values:
+ * - Optimization O1: once a heavy-hitter is identified, its bytes are not
+ * accounted in the array counters. This technique is called "shielding"
+ * in Section 3.3.1 of [EV02].
+ * - Optimization O2: conservative update of counters
+ * (Section 3.3.2 of [EV02]),
+ * New counter value = max {old counter value,
+ * smallest counter value + packet bytes}
+ *
+ * Finally, we refresh the counters periodically since otherwise the counter
+ * values will keep accumulating.
+ *
+ * Once a flow is classified as heavy-hitter, we also save its per-flow state
+ * in an exact-matching flow table so that its subsequent packets can be
+ * dispatched to the heavy-hitter bucket accordingly.
+ *
+ *
+ * At a high level, this qdisc works as follows:
+ * Given a packet p:
+ * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
+ * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
+ * bucket.
+ * - Otherwise, forward p to the multi-stage filter, denoted filter F
+ * + If F decides that p belongs to a non-heavy-hitter flow, then send p
+ * to the non-heavy-hitter bucket.
+ * + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
+ * then set up a new flow entry for the flow-id of p in the table T and
+ * send p to the heavy-hitter bucket.
+ *
+ * In this implementation:
+ * - T is a fixed-size hash-table with 1024 entries. Hash collision is
+ * resolved by linked-list chaining.
+ * - F has four counter arrays, each array containing 1024 32-bit counters.
+ * That means 4 * 1024 * 32 bits = 16KB of memory.
+ * - Since each array in F contains 1024 counters, 10 bits are sufficient to
+ * index into each array.
+ * Hence, instead of having four hash functions, we chop the 32-bit
+ * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
+ * computed as XOR sum of those three chunks.
+ * - We need to clear the counter arrays periodically; however, directly
+ * memsetting 16KB of memory can lead to cache eviction and unwanted delay.
+ * So by representing each counter by a valid bit, we only need to reset
+ * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory.
+ * - The Deficit Round Robin engine is taken from fq_codel implementation
+ * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
+ * fq_codel_flow in fq_codel implementation.
+ *
+ */
+
+/* Non-configurable parameters */
+#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
+#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
+#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
+#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
+#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
+
+#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
+enum wdrr_bucket_idx {
+ WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
+ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
+};
+
+#define hhf_time_before(a, b) \
+ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
+
+/* Heavy-hitter per-flow state */
+struct hh_flow_state {
+ u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
+ u32 hit_timestamp; /* last time heavy-hitter was seen */
+ struct list_head flowchain; /* chaining under hash collision */
+};
+
+/* Weighted Deficit Round Robin (WDRR) scheduler */
+struct wdrr_bucket {
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head bucketchain;
+ int deficit;
+};
+
+struct hhf_sched_data {
+ struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
+ u32 perturbation; /* hash perturbation */
+ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
+ u32 drop_overlimit; /* number of times max qdisc packet
+ * limit was hit
+ */
+ struct list_head *hh_flows; /* table T (currently active HHs) */
+ u32 hh_flows_limit; /* max active HH allocs */
+ u32 hh_flows_overlimit; /* num of disallowed HH allocs */
+ u32 hh_flows_total_cnt; /* total admitted HHs */
+ u32 hh_flows_current_cnt; /* total current HHs */
+ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
+ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
+ * was reset
+ */
+ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
+ * of hhf_arrays
+ */
+ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
+ struct list_head new_buckets; /* list of new buckets */
+ struct list_head old_buckets; /* list of old buckets */
+
+ /* Configurable HHF parameters */
+ u32 hhf_reset_timeout; /* interval to reset counter
+ * arrays in filter F
+ * (default 40ms)
+ */
+ u32 hhf_admit_bytes; /* counter thresh to classify as
+ * HH (default 128KB).
+ * With these default values,
+ * 128KB / 40ms = 25 Mbps
+ * i.e., we expect to capture HHs
+ * sending > 25 Mbps.
+ */
+ u32 hhf_evict_timeout; /* aging threshold to evict idle
+ * HHs out of table T. This should
+ * be large enough to avoid
+ * reordering during HH eviction.
+ * (default 1s)
+ */
+ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
+ * (default 2,
+ * i.e., non-HH : HH = 2 : 1)
+ */
+};
+
+static u32 hhf_time_stamp(void)
+{
+ return jiffies;
+}
+
+static unsigned int skb_hash(const struct hhf_sched_data *q,
+ const struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ unsigned int hash;
+
+ if (skb->sk && skb->sk->sk_hash)
+ return skb->sk->sk_hash;
+
+ skb_flow_dissect(skb, &keys);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src ^ keys.ip_proto,
+ (__force u32)keys.ports, q->perturbation);
+ return hash;
+}
+
+/* Looks up a heavy-hitter flow in a chaining list of table T. */
+static struct hh_flow_state *seek_list(const u32 hash,
+ struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow, *next;
+ u32 now = hhf_time_stamp();
+
+ if (list_empty(head))
+ return NULL;
+
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now)) {
+ /* Delete expired heavy-hitters, but preserve one entry
+ * to avoid kzalloc() when next time this slot is hit.
+ */
+ if (list_is_last(&flow->flowchain, head))
+ return NULL;
+ list_del(&flow->flowchain);
+ kfree(flow);
+ q->hh_flows_current_cnt--;
+ } else if (flow->hash_id == hash) {
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
+ * entry or dynamically alloc a new entry.
+ */
+static struct hh_flow_state *alloc_new_hh(struct list_head *head,
+ struct hhf_sched_data *q)
+{
+ struct hh_flow_state *flow;
+ u32 now = hhf_time_stamp();
+
+ if (!list_empty(head)) {
+ /* Find an expired heavy-hitter flow entry. */
+ list_for_each_entry(flow, head, flowchain) {
+ u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
+
+ if (hhf_time_before(prev, now))
+ return flow;
+ }
+ }
+
+ if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
+ q->hh_flows_overlimit++;
+ return NULL;
+ }
+ /* Create new entry. */
+ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
+ if (!flow)
+ return NULL;
+
+ q->hh_flows_current_cnt++;
+ INIT_LIST_HEAD(&flow->flowchain);
+ list_add_tail(&flow->flowchain, head);
+
+ return flow;
+}
+
+/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
+ * classify heavy-hitters.
+ */
+static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ u32 tmp_hash, hash;
+ u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
+ struct hh_flow_state *flow;
+ u32 pkt_len, min_hhf_val;
+ int i;
+ u32 prev;
+ u32 now = hhf_time_stamp();
+
+ /* Reset the HHF counter arrays if this is the right time. */
+ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
+ if (hhf_time_before(prev, now)) {
+ for (i = 0; i < HHF_ARRAYS_CNT; i++)
+ bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
+ q->hhf_arrays_reset_timestamp = now;
+ }
+
+ /* Get hashed flow-id of the skb. */
+ hash = skb_hash(q, skb);
+
+ /* Check if this packet belongs to an already established HH flow. */
+ flow_pos = hash & HHF_BIT_MASK;
+ flow = seek_list(hash, &q->hh_flows[flow_pos], q);
+ if (flow) { /* found its HH flow */
+ flow->hit_timestamp = now;
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Now pass the packet through the multi-stage filter. */
+ tmp_hash = hash;
+ xorsum = 0;
+ for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
+ /* Split the skb_hash into three 10-bit chunks. */
+ filter_pos[i] = tmp_hash & HHF_BIT_MASK;
+ xorsum ^= filter_pos[i];
+ tmp_hash >>= HHF_BIT_MASK_LEN;
+ }
+ /* The last chunk is computed as XOR sum of other chunks. */
+ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
+
+ pkt_len = qdisc_pkt_len(skb);
+ min_hhf_val = ~0U;
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ u32 val;
+
+ if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
+ q->hhf_arrays[i][filter_pos[i]] = 0;
+ __set_bit(filter_pos[i], q->hhf_valid_bits[i]);
+ }
+
+ val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
+ if (min_hhf_val > val)
+ min_hhf_val = val;
+ }
+
+ /* Found a new HH iff all counter values > HH admit threshold. */
+ if (min_hhf_val > q->hhf_admit_bytes) {
+ /* Just captured a new heavy-hitter. */
+ flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
+ if (!flow) /* memory alloc problem */
+ return WDRR_BUCKET_FOR_NON_HH;
+ flow->hash_id = hash;
+ flow->hit_timestamp = now;
+ q->hh_flows_total_cnt++;
+
+ /* By returning without updating counters in q->hhf_arrays,
+ * we implicitly implement "shielding" (see Optimization O1).
+ */
+ return WDRR_BUCKET_FOR_HH;
+ }
+
+ /* Conservative update of HHF arrays (see Optimization O2). */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
+ q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
+ }
+ return WDRR_BUCKET_FOR_NON_HH;
+}
+
+/* Removes one skb from head of bucket. */
+static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
+{
+ struct sk_buff *skb = bucket->head;
+
+ bucket->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+/* Tail-adds skb to bucket. */
+static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
+{
+ if (bucket->head == NULL)
+ bucket->head = skb;
+ else
+ bucket->tail->next = skb;
+ bucket->tail = skb;
+ skb->next = NULL;
+}
+
+static unsigned int hhf_drop(struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct wdrr_bucket *bucket;
+
+ /* Always try to drop from heavy-hitters first. */
+ bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
+ if (!bucket->head)
+ bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
+
+ if (bucket->head) {
+ struct sk_buff *skb = dequeue_head(bucket);
+
+ sch->q.qlen--;
+ qdisc_qstats_drop(sch);
+ qdisc_qstats_backlog_dec(sch, skb);
+ kfree_skb(skb);
+ }
+
+ /* Return id of the bucket from which the packet was dropped. */
+ return bucket - q->buckets;
+}
+
+static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ enum wdrr_bucket_idx idx;
+ struct wdrr_bucket *bucket;
+
+ idx = hhf_classify(skb, sch);
+
+ bucket = &q->buckets[idx];
+ bucket_add(bucket, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ if (list_empty(&bucket->bucketchain)) {
+ unsigned int weight;
+
+ /* The logic of new_buckets vs. old_buckets is the same as
+ * new_flows vs. old_flows in the implementation of fq_codel,
+ * i.e., short bursts of non-HHs should have strict priority.
+ */
+ if (idx == WDRR_BUCKET_FOR_HH) {
+ /* Always move heavy-hitters to old bucket. */
+ weight = 1;
+ list_add_tail(&bucket->bucketchain, &q->old_buckets);
+ } else {
+ weight = q->hhf_non_hh_weight;
+ list_add_tail(&bucket->bucketchain, &q->new_buckets);
+ }
+ bucket->deficit = weight * q->quantum;
+ }
+ if (++sch->q.qlen <= sch->limit)
+ return NET_XMIT_SUCCESS;
+
+ q->drop_overlimit++;
+ /* Return Congestion Notification only if we dropped a packet from this
+ * bucket.
+ */
+ if (hhf_drop(sch) == idx)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this. */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct wdrr_bucket *bucket;
+ struct list_head *head;
+
+begin:
+ head = &q->new_buckets;
+ if (list_empty(head)) {
+ head = &q->old_buckets;
+ if (list_empty(head))
+ return NULL;
+ }
+ bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
+
+ if (bucket->deficit <= 0) {
+ int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
+ 1 : q->hhf_non_hh_weight;
+
+ bucket->deficit += weight * q->quantum;
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ goto begin;
+ }
+
+ if (bucket->head) {
+ skb = dequeue_head(bucket);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ }
+
+ if (!skb) {
+ /* Force a pass through old_buckets to prevent starvation. */
+ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
+ list_move_tail(&bucket->bucketchain, &q->old_buckets);
+ else
+ list_del_init(&bucket->bucketchain);
+ goto begin;
+ }
+ qdisc_bstats_update(sch, skb);
+ bucket->deficit -= qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+static void hhf_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = hhf_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+static void *hhf_zalloc(size_t sz)
+{
+ void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vzalloc(sz);
+
+ return ptr;
+}
+
+static void hhf_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void hhf_destroy(struct Qdisc *sch)
+{
+ int i;
+ struct hhf_sched_data *q = qdisc_priv(sch);
+
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ hhf_free(q->hhf_arrays[i]);
+ hhf_free(q->hhf_valid_bits[i]);
+ }
+
+ for (i = 0; i < HH_FLOWS_CNT; i++) {
+ struct hh_flow_state *flow, *next;
+ struct list_head *head = &q->hh_flows[i];
+
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(flow, next, head, flowchain) {
+ list_del(&flow->flowchain);
+ kfree(flow);
+ }
+ }
+ hhf_free(q->hh_flows);
+}
+
+static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
+ [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_QUANTUM] = { .type = NLA_U32 },
+ [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
+ [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
+ [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
+ [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
+};
+
+static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_HHF_MAX + 1];
+ unsigned int qlen;
+ int err;
+ u64 non_hh_quantum;
+ u32 new_quantum = q->quantum;
+ u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_HHF_QUANTUM])
+ new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
+
+ if (tb[TCA_HHF_NON_HH_WEIGHT])
+ new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
+
+ non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
+ if (non_hh_quantum > INT_MAX)
+ return -EINVAL;
+
+ sch_tree_lock(sch);
+
+ if (tb[TCA_HHF_BACKLOG_LIMIT])
+ sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
+
+ q->quantum = new_quantum;
+ q->hhf_non_hh_weight = new_hhf_non_hh_weight;
+
+ if (tb[TCA_HHF_HH_FLOWS_LIMIT])
+ q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
+
+ if (tb[TCA_HHF_RESET_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
+
+ q->hhf_reset_timeout = usecs_to_jiffies(us);
+ }
+
+ if (tb[TCA_HHF_ADMIT_BYTES])
+ q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
+
+ if (tb[TCA_HHF_EVICT_TIMEOUT]) {
+ u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
+
+ q->hhf_evict_timeout = usecs_to_jiffies(us);
+ }
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = hhf_dequeue(sch);
+
+ kfree_skb(skb);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ sch->limit = 1000;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->perturbation = prandom_u32();
+ INIT_LIST_HEAD(&q->new_buckets);
+ INIT_LIST_HEAD(&q->old_buckets);
+
+ /* Configurable HHF parameters */
+ q->hhf_reset_timeout = HZ / 25; /* 40 ms */
+ q->hhf_admit_bytes = 131072; /* 128 KB */
+ q->hhf_evict_timeout = HZ; /* 1 sec */
+ q->hhf_non_hh_weight = 2;
+
+ if (opt) {
+ int err = hhf_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ if (!q->hh_flows) {
+ /* Initialize heavy-hitter flow table. */
+ q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
+ sizeof(struct list_head));
+ if (!q->hh_flows)
+ return -ENOMEM;
+ for (i = 0; i < HH_FLOWS_CNT; i++)
+ INIT_LIST_HEAD(&q->hh_flows[i]);
+
+ /* Cap max active HHs at twice len of hh_flows table. */
+ q->hh_flows_limit = 2 * HH_FLOWS_CNT;
+ q->hh_flows_overlimit = 0;
+ q->hh_flows_total_cnt = 0;
+ q->hh_flows_current_cnt = 0;
+
+ /* Initialize heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
+ sizeof(u32));
+ if (!q->hhf_arrays[i]) {
+ hhf_destroy(sch);
+ return -ENOMEM;
+ }
+ }
+ q->hhf_arrays_reset_timestamp = hhf_time_stamp();
+
+ /* Initialize valid bits of heavy-hitter filter arrays. */
+ for (i = 0; i < HHF_ARRAYS_CNT; i++) {
+ q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
+ BITS_PER_BYTE);
+ if (!q->hhf_valid_bits[i]) {
+ hhf_destroy(sch);
+ return -ENOMEM;
+ }
+ }
+
+ /* Initialize Weighted DRR buckets. */
+ for (i = 0; i < WDRR_BUCKET_CNT; i++) {
+ struct wdrr_bucket *bucket = q->buckets + i;
+
+ INIT_LIST_HEAD(&bucket->bucketchain);
+ }
+ }
+
+ return 0;
+}
+
+static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
+ nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
+ jiffies_to_usecs(q->hhf_reset_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
+ nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
+ jiffies_to_usecs(q->hhf_evict_timeout)) ||
+ nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct hhf_sched_data *q = qdisc_priv(sch);
+ struct tc_hhf_xstats st = {
+ .drop_overlimit = q->drop_overlimit,
+ .hh_overlimit = q->hh_flows_overlimit,
+ .hh_tot_count = q->hh_flows_total_cnt,
+ .hh_cur_count = q->hh_flows_current_cnt,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
+ .id = "hhf",
+ .priv_size = sizeof(struct hhf_sched_data),
+
+ .enqueue = hhf_enqueue,
+ .dequeue = hhf_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = hhf_drop,
+ .init = hhf_init,
+ .reset = hhf_reset,
+ .destroy = hhf_destroy,
+ .change = hhf_change,
+ .dump = hhf_dump,
+ .dump_stats = hhf_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init hhf_module_init(void)
+{
+ return register_qdisc(&hhf_qdisc_ops);
+}
+
+static void __exit hhf_module_exit(void)
+{
+ unregister_qdisc(&hhf_qdisc_ops);
+}
+
+module_init(hhf_module_init)
+module_exit(hhf_module_exit)
+MODULE_AUTHOR("Terry Lam");
+MODULE_AUTHOR("Nandita Dukkipati");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
new file mode 100644
index 000000000..f1acb0f60
--- /dev/null
+++ b/net/sched/sch_htb.c
@@ -0,0 +1,1630 @@
+/*
+ * net/sched/sch_htb.c Hierarchical token bucket, feed tree version
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Martin Devera, <devik@cdi.cz>
+ *
+ * Credits (in time order) for older HTB versions:
+ * Stef Coene <stef.coene@docum.org>
+ * HTB support at LARTC mailing list
+ * Ondrej Kraus, <krauso@barr.cz>
+ * found missing INIT_QDISC(htb)
+ * Vladimir Smelhaus, Aamer Akhter, Bert Hubert
+ * helped a lot to locate nasty class stall bug
+ * Andi Kleen, Jamal Hadi, Bert Hubert
+ * code review and helpful comments on shaping
+ * Tomasz Wrona, <tw@eter.tym.pl>
+ * created test case so that I was able to fix nasty bug
+ * Wilfried Weissmann
+ * spotted bug in dequeue code and helped with fix
+ * Jiri Fojtasek
+ * fixed requeue routine
+ * and many others. thanks.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+/* HTB algorithm.
+ Author: devik@cdi.cz
+ ========================================================================
+ HTB is like TBF with multiple classes. It is also similar to CBQ because
+ it allows to assign priority to each class in hierarchy.
+ In fact it is another implementation of Floyd's formal sharing.
+
+ Levels:
+ Each class is assigned level. Leaf has ALWAYS level 0 and root
+ classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
+ one less than their parent.
+*/
+
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
+#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */
+
+#if HTB_VER >> 16 != TC_HTB_PROTOVER
+#error "Mismatched sch_htb.c and pkt_sch.h"
+#endif
+
+/* Module parameter and sysfs export */
+module_param (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
+static int htb_rate_est = 0; /* htb classes have a default rate estimator */
+module_param(htb_rate_est, int, 0640);
+MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
+
+/* used internaly to keep status of single class */
+enum htb_cmode {
+ HTB_CANT_SEND, /* class can't send and can't borrow */
+ HTB_MAY_BORROW, /* class can't send but may borrow */
+ HTB_CAN_SEND /* class can send */
+};
+
+struct htb_prio {
+ union {
+ struct rb_root row;
+ struct rb_root feed;
+ };
+ struct rb_node *ptr;
+ /* When class changes from state 1->2 and disconnects from
+ * parent's feed then we lost ptr value and start from the
+ * first child again. Here we store classid of the
+ * last valid ptr (used when ptr is NULL).
+ */
+ u32 last_ptr_id;
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L:
+ * To reduce false sharing, place mostly read fields at beginning,
+ * and mostly written ones at the end.
+ */
+struct htb_class {
+ struct Qdisc_class_common common;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg ceil;
+ s64 buffer, cbuffer;/* token bucket depth/rate */
+ s64 mbuffer; /* max wait time */
+ u32 prio; /* these two are used only by leaves... */
+ int quantum; /* but stored for parent-to-leaf return */
+
+ struct tcf_proto __rcu *filter_list; /* class attached filters */
+ int filter_cnt;
+ int refcnt; /* usage count of this class */
+
+ int level; /* our level (see above) */
+ unsigned int children;
+ struct htb_class *parent; /* parent class */
+
+ struct gnet_stats_rate_est64 rate_est;
+
+ /*
+ * Written often fields
+ */
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct tc_htb_xstats xstats; /* our special stats */
+
+ /* token bucket parameters */
+ s64 tokens, ctokens;/* current number of tokens */
+ s64 t_c; /* checkpoint time */
+
+ union {
+ struct htb_class_leaf {
+ struct list_head drop_list;
+ int deficit[TC_HTB_MAXDEPTH];
+ struct Qdisc *q;
+ } leaf;
+ struct htb_class_inner {
+ struct htb_prio clprio[TC_HTB_NUMPRIO];
+ } inner;
+ } un;
+ s64 pq_key;
+
+ int prio_activity; /* for which prios are we active */
+ enum htb_cmode cmode; /* current mode of the class */
+ struct rb_node pq_node; /* node for event queue */
+ struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+};
+
+struct htb_level {
+ struct rb_root wait_pq;
+ struct htb_prio hprio[TC_HTB_NUMPRIO];
+};
+
+struct htb_sched {
+ struct Qdisc_class_hash clhash;
+ int defcls; /* class where unclassified flows go to */
+ int rate2quantum; /* quant = rate / rate2quantum */
+
+ /* filters for qdisc itself */
+ struct tcf_proto __rcu *filter_list;
+
+#define HTB_WARN_TOOMANYEVENTS 0x1
+ unsigned int warned; /* only one warning */
+ int direct_qlen;
+ struct work_struct work;
+
+ /* non shaped skbs; let them go directly thru */
+ struct sk_buff_head direct_queue;
+ long direct_pkts;
+
+ struct qdisc_watchdog watchdog;
+
+ s64 now; /* cached dequeue time */
+ struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
+
+ /* time of nearest event per level (row) */
+ s64 near_ev_cache[TC_HTB_MAXDEPTH];
+
+ int row_mask[TC_HTB_MAXDEPTH];
+
+ struct htb_level hlevel[TC_HTB_MAXDEPTH];
+};
+
+/* find class in global hash table using given handle */
+static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, handle);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct htb_class, common);
+}
+
+/**
+ * htb_classify - classify a packet into class
+ *
+ * It returns NULL if the packet should be dropped or -1 if the packet
+ * should be passed directly thru. In all other cases leaf class is returned.
+ * We allow direct class selection by classid in priority. The we examine
+ * filters in qdisc and in inner nodes (if higher filter points to the inner
+ * node). If we end up with classid MAJOR:0 we enqueue the skb into special
+ * internal fifo (direct). These packets then go directly thru. If we still
+ * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
+ * then finish and return direct queue.
+ */
+#define HTB_DIRECT ((struct htb_class *)-1L)
+
+static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *tcf;
+ int result;
+
+ /* allow to select class by setting skb->priority to valid classid;
+ * note that nfmark can be used too by attaching filter fw with no
+ * rules in it
+ */
+ if (skb->priority == sch->handle)
+ return HTB_DIRECT; /* X:0 (direct flow) selected */
+ cl = htb_find(skb->priority, sch);
+ if (cl) {
+ if (cl->level == 0)
+ return cl;
+ /* Start with inner filter chain if a non-leaf class is selected */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ } else {
+ tcf = rcu_dereference_bh(q->filter_list);
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (void *)res.class;
+ if (!cl) {
+ if (res.classid == sch->handle)
+ return HTB_DIRECT; /* X:0 (direct flow) */
+ cl = htb_find(res.classid, sch);
+ if (!cl)
+ break; /* filter selected invalid classid */
+ }
+ if (!cl->level)
+ return cl; /* we hit leaf; return it */
+
+ /* we have got inner class; apply inner filter chain */
+ tcf = rcu_dereference_bh(cl->filter_list);
+ }
+ /* classification failed; try to use default class */
+ cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+ if (!cl || cl->level)
+ return HTB_DIRECT; /* bad default .. this is safe bet */
+ return cl;
+}
+
+/**
+ * htb_add_to_id_tree - adds class to the round robin list
+ *
+ * Routine adds class to the list (actually tree) sorted by classid.
+ * Make sure that class is not already on such list for given prio.
+ */
+static void htb_add_to_id_tree(struct rb_root *root,
+ struct htb_class *cl, int prio)
+{
+ struct rb_node **p = &root->rb_node, *parent = NULL;
+
+ while (*p) {
+ struct htb_class *c;
+ parent = *p;
+ c = rb_entry(parent, struct htb_class, node[prio]);
+
+ if (cl->common.classid > c->common.classid)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->node[prio], parent, p);
+ rb_insert_color(&cl->node[prio], root);
+}
+
+/**
+ * htb_add_to_wait_tree - adds class to the event queue with delay
+ *
+ * The class is added to priority event queue to indicate that class will
+ * change its mode in cl->pq_key microseconds. Make sure that class is not
+ * already in the queue.
+ */
+static void htb_add_to_wait_tree(struct htb_sched *q,
+ struct htb_class *cl, s64 delay)
+{
+ struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
+
+ cl->pq_key = q->now + delay;
+ if (cl->pq_key == q->now)
+ cl->pq_key++;
+
+ /* update the nearest event cache */
+ if (q->near_ev_cache[cl->level] > cl->pq_key)
+ q->near_ev_cache[cl->level] = cl->pq_key;
+
+ while (*p) {
+ struct htb_class *c;
+ parent = *p;
+ c = rb_entry(parent, struct htb_class, pq_node);
+ if (cl->pq_key >= c->pq_key)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&cl->pq_node, parent, p);
+ rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
+}
+
+/**
+ * htb_next_rb_node - finds next node in binary tree
+ *
+ * When we are past last key we return NULL.
+ * Average complexity is 2 steps per call.
+ */
+static inline void htb_next_rb_node(struct rb_node **n)
+{
+ *n = rb_next(*n);
+}
+
+/**
+ * htb_add_class_to_row - add class to its row
+ *
+ * The class is added to row at priorities marked in mask.
+ * It does nothing if mask == 0.
+ */
+static inline void htb_add_class_to_row(struct htb_sched *q,
+ struct htb_class *cl, int mask)
+{
+ q->row_mask[cl->level] |= mask;
+ while (mask) {
+ int prio = ffz(~mask);
+ mask &= ~(1 << prio);
+ htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
+ }
+}
+
+/* If this triggers, it is a bug in this code, but it need not be fatal */
+static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
+{
+ if (RB_EMPTY_NODE(rb)) {
+ WARN_ON(1);
+ } else {
+ rb_erase(rb, root);
+ RB_CLEAR_NODE(rb);
+ }
+}
+
+
+/**
+ * htb_remove_class_from_row - removes class from its row
+ *
+ * The class is removed from row at priorities marked in mask.
+ * It does nothing if mask == 0.
+ */
+static inline void htb_remove_class_from_row(struct htb_sched *q,
+ struct htb_class *cl, int mask)
+{
+ int m = 0;
+ struct htb_level *hlevel = &q->hlevel[cl->level];
+
+ while (mask) {
+ int prio = ffz(~mask);
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
+ mask &= ~(1 << prio);
+ if (hprio->ptr == cl->node + prio)
+ htb_next_rb_node(&hprio->ptr);
+
+ htb_safe_rb_erase(cl->node + prio, &hprio->row);
+ if (!hprio->row.rb_node)
+ m |= 1 << prio;
+ }
+ q->row_mask[cl->level] &= ~m;
+}
+
+/**
+ * htb_activate_prios - creates active classe's feed chain
+ *
+ * The class is connected to ancestors and/or appropriate rows
+ * for priorities it is participating on. cl->cmode must be new
+ * (activated) mode. It does nothing if cl->prio_activity == 0.
+ */
+static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+ struct htb_class *p = cl->parent;
+ long m, mask = cl->prio_activity;
+
+ while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+ m = mask;
+ while (m) {
+ int prio = ffz(~m);
+ m &= ~(1 << prio);
+
+ if (p->un.inner.clprio[prio].feed.rb_node)
+ /* parent already has its feed in use so that
+ * reset bit in mask as parent is already ok
+ */
+ mask &= ~(1 << prio);
+
+ htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
+ }
+ p->prio_activity |= mask;
+ cl = p;
+ p = cl->parent;
+
+ }
+ if (cl->cmode == HTB_CAN_SEND && mask)
+ htb_add_class_to_row(q, cl, mask);
+}
+
+/**
+ * htb_deactivate_prios - remove class from feed chain
+ *
+ * cl->cmode must represent old mode (before deactivation). It does
+ * nothing if cl->prio_activity == 0. Class is removed from all feed
+ * chains and rows.
+ */
+static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+ struct htb_class *p = cl->parent;
+ long m, mask = cl->prio_activity;
+
+ while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+ m = mask;
+ mask = 0;
+ while (m) {
+ int prio = ffz(~m);
+ m &= ~(1 << prio);
+
+ if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
+ /* we are removing child which is pointed to from
+ * parent feed - forget the pointer but remember
+ * classid
+ */
+ p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->un.inner.clprio[prio].ptr = NULL;
+ }
+
+ htb_safe_rb_erase(cl->node + prio,
+ &p->un.inner.clprio[prio].feed);
+
+ if (!p->un.inner.clprio[prio].feed.rb_node)
+ mask |= 1 << prio;
+ }
+
+ p->prio_activity &= ~mask;
+ cl = p;
+ p = cl->parent;
+
+ }
+ if (cl->cmode == HTB_CAN_SEND && mask)
+ htb_remove_class_from_row(q, cl, mask);
+}
+
+static inline s64 htb_lowater(const struct htb_class *cl)
+{
+ if (htb_hysteresis)
+ return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+ else
+ return 0;
+}
+static inline s64 htb_hiwater(const struct htb_class *cl)
+{
+ if (htb_hysteresis)
+ return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+ else
+ return 0;
+}
+
+
+/**
+ * htb_class_mode - computes and returns current class mode
+ *
+ * It computes cl's mode at time cl->t_c+diff and returns it. If mode
+ * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
+ * from now to time when cl will change its state.
+ * Also it is worth to note that class mode doesn't change simply
+ * at cl->{c,}tokens == 0 but there can rather be hysteresis of
+ * 0 .. -cl->{c,}buffer range. It is meant to limit number of
+ * mode transitions per time unit. The speed gain is about 1/6.
+ */
+static inline enum htb_cmode
+htb_class_mode(struct htb_class *cl, s64 *diff)
+{
+ s64 toks;
+
+ if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
+ *diff = -toks;
+ return HTB_CANT_SEND;
+ }
+
+ if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
+ return HTB_CAN_SEND;
+
+ *diff = -toks;
+ return HTB_MAY_BORROW;
+}
+
+/**
+ * htb_change_class_mode - changes classe's mode
+ *
+ * This should be the only way how to change classe's mode under normal
+ * cirsumstances. Routine will update feed lists linkage, change mode
+ * and add class to the wait event queue if appropriate. New mode should
+ * be different from old one and cl->pq_key has to be valid if changing
+ * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
+ */
+static void
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
+{
+ enum htb_cmode new_mode = htb_class_mode(cl, diff);
+
+ if (new_mode == cl->cmode)
+ return;
+
+ if (cl->prio_activity) { /* not necessary: speed optimization */
+ if (cl->cmode != HTB_CANT_SEND)
+ htb_deactivate_prios(q, cl);
+ cl->cmode = new_mode;
+ if (new_mode != HTB_CANT_SEND)
+ htb_activate_prios(q, cl);
+ } else
+ cl->cmode = new_mode;
+}
+
+/**
+ * htb_activate - inserts leaf cl into appropriate active feeds
+ *
+ * Routine learns (new) priority of leaf and activates feed chain
+ * for the prio. It can be called on already active leaf safely.
+ * It also adds leaf into droplist.
+ */
+static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
+{
+ WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+
+ if (!cl->prio_activity) {
+ cl->prio_activity = 1 << cl->prio;
+ htb_activate_prios(q, cl);
+ list_add_tail(&cl->un.leaf.drop_list,
+ q->drops + cl->prio);
+ }
+}
+
+/**
+ * htb_deactivate - remove leaf cl from active feeds
+ *
+ * Make sure that leaf is active. In the other words it can't be called
+ * with non-active leaf. It also removes class from the drop list.
+ */
+static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
+{
+ WARN_ON(!cl->prio_activity);
+
+ htb_deactivate_prios(q, cl);
+ cl->prio_activity = 0;
+ list_del_init(&cl->un.leaf.drop_list);
+}
+
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ int uninitialized_var(ret);
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = htb_classify(skb, sch, &ret);
+
+ if (cl == HTB_DIRECT) {
+ /* enqueue to helper queue */
+ if (q->direct_queue.qlen < q->direct_qlen) {
+ __skb_queue_tail(&q->direct_queue, skb);
+ q->direct_pkts++;
+ } else {
+ return qdisc_drop(skb, sch);
+ }
+#ifdef CONFIG_NET_CLS_ACT
+ } else if (!cl) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+#endif
+ } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret)) {
+ qdisc_qstats_drop(sch);
+ cl->qstats.drops++;
+ }
+ return ret;
+ } else {
+ htb_activate(q, cl);
+ }
+
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+}
+
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
+{
+ s64 toks = diff + cl->tokens;
+
+ if (toks > cl->buffer)
+ toks = cl->buffer;
+ toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
+ if (toks <= -cl->mbuffer)
+ toks = 1 - cl->mbuffer;
+
+ cl->tokens = toks;
+}
+
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
+{
+ s64 toks = diff + cl->ctokens;
+
+ if (toks > cl->cbuffer)
+ toks = cl->cbuffer;
+ toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
+ if (toks <= -cl->mbuffer)
+ toks = 1 - cl->mbuffer;
+
+ cl->ctokens = toks;
+}
+
+/**
+ * htb_charge_class - charges amount "bytes" to leaf and ancestors
+ *
+ * Routine assumes that packet "bytes" long was dequeued from leaf cl
+ * borrowing from "level". It accounts bytes to ceil leaky bucket for
+ * leaf and all ancestors and to rate bucket for ancestors at levels
+ * "level" and higher. It also handles possible change of mode resulting
+ * from the update. Note that mode can also increase here (MAY_BORROW to
+ * CAN_SEND) because we can use more precise clock that event queue here.
+ * In such case we remove class from event queue first.
+ */
+static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
+ int level, struct sk_buff *skb)
+{
+ int bytes = qdisc_pkt_len(skb);
+ enum htb_cmode old_mode;
+ s64 diff;
+
+ while (cl) {
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
+ if (cl->level >= level) {
+ if (cl->level == level)
+ cl->xstats.lends++;
+ htb_accnt_tokens(cl, bytes, diff);
+ } else {
+ cl->xstats.borrows++;
+ cl->tokens += diff; /* we moved t_c; update tokens */
+ }
+ htb_accnt_ctokens(cl, bytes, diff);
+ cl->t_c = q->now;
+
+ old_mode = cl->cmode;
+ diff = 0;
+ htb_change_class_mode(q, cl, &diff);
+ if (old_mode != cl->cmode) {
+ if (old_mode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_add_to_wait_tree(q, cl, diff);
+ }
+
+ /* update basic stats except for leaves which are already updated */
+ if (cl->level)
+ bstats_update(&cl->bstats, skb);
+
+ cl = cl->parent;
+ }
+}
+
+/**
+ * htb_do_events - make mode changes to classes at the level
+ *
+ * Scans event queue for pending events and applies them. Returns time of
+ * next pending event (0 for no event in pq, q->now for too many events).
+ * Note: Applied are events whose have cl->pq_key <= q->now.
+ */
+static s64 htb_do_events(struct htb_sched *q, const int level,
+ unsigned long start)
+{
+ /* don't run for longer than 2 jiffies; 2 is used instead of
+ * 1 to simplify things when jiffy is going to be incremented
+ * too soon
+ */
+ unsigned long stop_at = start + 2;
+ struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
+
+ while (time_before(jiffies, stop_at)) {
+ struct htb_class *cl;
+ s64 diff;
+ struct rb_node *p = rb_first(wait_pq);
+
+ if (!p)
+ return 0;
+
+ cl = rb_entry(p, struct htb_class, pq_node);
+ if (cl->pq_key > q->now)
+ return cl->pq_key;
+
+ htb_safe_rb_erase(p, wait_pq);
+ diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
+ htb_change_class_mode(q, cl, &diff);
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_add_to_wait_tree(q, cl, diff);
+ }
+
+ /* too much load - let's continue after a break for scheduling */
+ if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
+ pr_warn("htb: too many events!\n");
+ q->warned |= HTB_WARN_TOOMANYEVENTS;
+ }
+
+ return q->now;
+}
+
+/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
+ * is no such one exists.
+ */
+static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
+ u32 id)
+{
+ struct rb_node *r = NULL;
+ while (n) {
+ struct htb_class *cl =
+ rb_entry(n, struct htb_class, node[prio]);
+
+ if (id > cl->common.classid) {
+ n = n->rb_right;
+ } else if (id < cl->common.classid) {
+ r = n;
+ n = n->rb_left;
+ } else {
+ return n;
+ }
+ }
+ return r;
+}
+
+/**
+ * htb_lookup_leaf - returns next leaf class in DRR order
+ *
+ * Find leaf where current feed pointers points to.
+ */
+static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
+{
+ int i;
+ struct {
+ struct rb_node *root;
+ struct rb_node **pptr;
+ u32 *pid;
+ } stk[TC_HTB_MAXDEPTH], *sp = stk;
+
+ BUG_ON(!hprio->row.rb_node);
+ sp->root = hprio->row.rb_node;
+ sp->pptr = &hprio->ptr;
+ sp->pid = &hprio->last_ptr_id;
+
+ for (i = 0; i < 65535; i++) {
+ if (!*sp->pptr && *sp->pid) {
+ /* ptr was invalidated but id is valid - try to recover
+ * the original or next ptr
+ */
+ *sp->pptr =
+ htb_id_find_next_upper(prio, sp->root, *sp->pid);
+ }
+ *sp->pid = 0; /* ptr is valid now so that remove this hint as it
+ * can become out of date quickly
+ */
+ if (!*sp->pptr) { /* we are at right end; rewind & go up */
+ *sp->pptr = sp->root;
+ while ((*sp->pptr)->rb_left)
+ *sp->pptr = (*sp->pptr)->rb_left;
+ if (sp > stk) {
+ sp--;
+ if (!*sp->pptr) {
+ WARN_ON(1);
+ return NULL;
+ }
+ htb_next_rb_node(sp->pptr);
+ }
+ } else {
+ struct htb_class *cl;
+ struct htb_prio *clp;
+
+ cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
+ if (!cl->level)
+ return cl;
+ clp = &cl->un.inner.clprio[prio];
+ (++sp)->root = clp->feed.rb_node;
+ sp->pptr = &clp->ptr;
+ sp->pid = &clp->last_ptr_id;
+ }
+ }
+ WARN_ON(1);
+ return NULL;
+}
+
+/* dequeues packet at given priority and level; call only if
+ * you are sure that there is active class at prio/level
+ */
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
+ const int level)
+{
+ struct sk_buff *skb = NULL;
+ struct htb_class *cl, *start;
+ struct htb_level *hlevel = &q->hlevel[level];
+ struct htb_prio *hprio = &hlevel->hprio[prio];
+
+ /* look initial class up in the row */
+ start = cl = htb_lookup_leaf(hprio, prio);
+
+ do {
+next:
+ if (unlikely(!cl))
+ return NULL;
+
+ /* class can be empty - it is unlikely but can be true if leaf
+ * qdisc drops packets in enqueue routine or if someone used
+ * graft operation on the leaf since last dequeue;
+ * simply deactivate and skip such class
+ */
+ if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+ struct htb_class *next;
+ htb_deactivate(q, cl);
+
+ /* row/level might become empty */
+ if ((q->row_mask[level] & (1 << prio)) == 0)
+ return NULL;
+
+ next = htb_lookup_leaf(hprio, prio);
+
+ if (cl == start) /* fix start if we just deleted it */
+ start = next;
+ cl = next;
+ goto next;
+ }
+
+ skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ if (likely(skb != NULL))
+ break;
+
+ qdisc_warn_nonwc("htb", cl->un.leaf.q);
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ &q->hlevel[0].hprio[prio].ptr);
+ cl = htb_lookup_leaf(hprio, prio);
+
+ } while (cl != start);
+
+ if (likely(skb != NULL)) {
+ bstats_update(&cl->bstats, skb);
+ cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
+ if (cl->un.leaf.deficit[level] < 0) {
+ cl->un.leaf.deficit[level] += cl->quantum;
+ htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ &q->hlevel[0].hprio[prio].ptr);
+ }
+ /* this used to be after charge_class but this constelation
+ * gives us slightly better performance
+ */
+ if (!cl->un.leaf.q->q.qlen)
+ htb_deactivate(q, cl);
+ htb_charge_class(q, cl, level, skb);
+ }
+ return skb;
+}
+
+static struct sk_buff *htb_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct htb_sched *q = qdisc_priv(sch);
+ int level;
+ s64 next_event;
+ unsigned long start_at;
+
+ /* try to dequeue direct packets as high prio (!) to minimize cpu work */
+ skb = __skb_dequeue(&q->direct_queue);
+ if (skb != NULL) {
+ok:
+ qdisc_bstats_update(sch, skb);
+ qdisc_unthrottled(sch);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ if (!sch->q.qlen)
+ goto fin;
+ q->now = ktime_get_ns();
+ start_at = jiffies;
+
+ next_event = q->now + 5LLU * NSEC_PER_SEC;
+
+ for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
+ /* common case optimization - skip event handler quickly */
+ int m;
+ s64 event = q->near_ev_cache[level];
+
+ if (q->now >= event) {
+ event = htb_do_events(q, level, start_at);
+ if (!event)
+ event = q->now + NSEC_PER_SEC;
+ q->near_ev_cache[level] = event;
+ }
+
+ if (next_event > event)
+ next_event = event;
+
+ m = ~q->row_mask[level];
+ while (m != (int)(-1)) {
+ int prio = ffz(m);
+
+ m |= 1 << prio;
+ skb = htb_dequeue_tree(q, prio, level);
+ if (likely(skb != NULL))
+ goto ok;
+ }
+ }
+ qdisc_qstats_overlimit(sch);
+ if (likely(next_event > q->now)) {
+ if (!test_bit(__QDISC_STATE_DEACTIVATED,
+ &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
+ ktime_t time = ns_to_ktime(next_event);
+ qdisc_throttled(q->watchdog.qdisc);
+ hrtimer_start(&q->watchdog.timer, time,
+ HRTIMER_MODE_ABS_PINNED);
+ }
+ } else {
+ schedule_work(&q->work);
+ }
+fin:
+ return skb;
+}
+
+/* try to drop from each class (by prio) until one succeed */
+static unsigned int htb_drop(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
+ struct list_head *p;
+ list_for_each(p, q->drops + prio) {
+ struct htb_class *cl = list_entry(p, struct htb_class,
+ un.leaf.drop_list);
+ unsigned int len;
+ if (cl->un.leaf.q->ops->drop &&
+ (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
+ sch->q.qlen--;
+ if (!cl->un.leaf.q->q.qlen)
+ htb_deactivate(q, cl);
+ return len;
+ }
+ }
+ }
+ return 0;
+}
+
+/* reset all classes */
+/* always caled under BH & queue lock */
+static void htb_reset(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->level)
+ memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+ else {
+ if (cl->un.leaf.q)
+ qdisc_reset(cl->un.leaf.q);
+ INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+ }
+ cl->prio_activity = 0;
+ cl->cmode = HTB_CAN_SEND;
+
+ }
+ }
+ qdisc_watchdog_cancel(&q->watchdog);
+ __skb_queue_purge(&q->direct_queue);
+ sch->q.qlen = 0;
+ memset(q->hlevel, 0, sizeof(q->hlevel));
+ memset(q->row_mask, 0, sizeof(q->row_mask));
+ for (i = 0; i < TC_HTB_NUMPRIO; i++)
+ INIT_LIST_HEAD(q->drops + i);
+}
+
+static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
+ [TCA_HTB_PARMS] = { .len = sizeof(struct tc_htb_opt) },
+ [TCA_HTB_INIT] = { .len = sizeof(struct tc_htb_glob) },
+ [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
+ [TCA_HTB_RATE64] = { .type = NLA_U64 },
+ [TCA_HTB_CEIL64] = { .type = NLA_U64 },
+};
+
+static void htb_work_func(struct work_struct *work)
+{
+ struct htb_sched *q = container_of(work, struct htb_sched, work);
+ struct Qdisc *sch = q->watchdog.qdisc;
+
+ __netif_schedule(qdisc_root(sch));
+}
+
+static int htb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct tc_htb_glob *gopt;
+ int err;
+ int i;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_HTB_INIT])
+ return -EINVAL;
+
+ gopt = nla_data(tb[TCA_HTB_INIT]);
+ if (gopt->version != HTB_VER >> 16)
+ return -EINVAL;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+ for (i = 0; i < TC_HTB_NUMPRIO; i++)
+ INIT_LIST_HEAD(q->drops + i);
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+ INIT_WORK(&q->work, htb_work_func);
+ __skb_queue_head_init(&q->direct_queue);
+
+ if (tb[TCA_HTB_DIRECT_QLEN])
+ q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
+ else {
+ q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+ if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */
+ q->direct_qlen = 2;
+ }
+ if ((q->rate2quantum = gopt->rate2quantum) < 1)
+ q->rate2quantum = 1;
+ q->defcls = gopt->defcls;
+
+ return 0;
+}
+
+static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+ struct tc_htb_glob gopt;
+
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the qdisc parameters.
+ */
+
+ gopt.direct_pkts = q->direct_pkts;
+ gopt.version = HTB_VER;
+ gopt.rate2quantum = q->rate2quantum;
+ gopt.defcls = q->defcls;
+ gopt.debug = 0;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
+ nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ struct nlattr *nest;
+ struct tc_htb_opt opt;
+
+ /* Its safe to not acquire qdisc lock. As we hold RTNL,
+ * no change can happen on the class parameters.
+ */
+ tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ if (!cl->level && cl->un.leaf.q)
+ tcm->tcm_info = cl->un.leaf.q->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ memset(&opt, 0, sizeof(opt));
+
+ psched_ratecfg_getrate(&opt.rate, &cl->rate);
+ opt.buffer = PSCHED_NS2TICKS(cl->buffer);
+ psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
+ opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
+ opt.quantum = cl->quantum;
+ opt.prio = cl->prio;
+ opt.level = cl->level;
+ if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int
+htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ __u32 qlen = 0;
+
+ if (!cl->level && cl->un.leaf.q)
+ qlen = cl->un.leaf.q->q.qlen;
+ cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
+ cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (cl->level)
+ return -EINVAL;
+ if (new == NULL &&
+ (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->common.classid)) == NULL)
+ return -ENOBUFS;
+
+ sch_tree_lock(sch);
+ *old = cl->un.leaf.q;
+ cl->un.leaf.q = new;
+ if (*old != NULL) {
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ }
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+ return !cl->level ? cl->un.leaf.q : NULL;
+}
+
+static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (cl->un.leaf.q->q.qlen == 0)
+ htb_deactivate(qdisc_priv(sch), cl);
+}
+
+static unsigned long htb_get(struct Qdisc *sch, u32 classid)
+{
+ struct htb_class *cl = htb_find(classid, sch);
+ if (cl)
+ cl->refcnt++;
+ return (unsigned long)cl;
+}
+
+static inline int htb_parent_last_child(struct htb_class *cl)
+{
+ if (!cl->parent)
+ /* the root class */
+ return 0;
+ if (cl->parent->children > 1)
+ /* not the last child */
+ return 0;
+ return 1;
+}
+
+static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
+ struct Qdisc *new_q)
+{
+ struct htb_class *parent = cl->parent;
+
+ WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
+
+ if (parent->cmode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&parent->pq_node,
+ &q->hlevel[parent->level].wait_pq);
+
+ parent->level = 0;
+ memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ INIT_LIST_HEAD(&parent->un.leaf.drop_list);
+ parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ parent->tokens = parent->buffer;
+ parent->ctokens = parent->cbuffer;
+ parent->t_c = ktime_get_ns();
+ parent->cmode = HTB_CAN_SEND;
+}
+
+static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
+{
+ if (!cl->level) {
+ WARN_ON(!cl->un.leaf.q);
+ qdisc_destroy(cl->un.leaf.q);
+ }
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ tcf_destroy_chain(&cl->filter_list);
+ kfree(cl);
+}
+
+static void htb_destroy(struct Qdisc *sch)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct hlist_node *next;
+ struct htb_class *cl;
+ unsigned int i;
+
+ cancel_work_sync(&q->work);
+ qdisc_watchdog_cancel(&q->watchdog);
+ /* This line used to be after htb_destroy_class call below
+ * and surprisingly it worked in 2.4. But it must precede it
+ * because filter need its target class alive to be able to call
+ * unbind_filter on it (without Oops).
+ */
+ tcf_destroy_chain(&q->filter_list);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
+ tcf_destroy_chain(&cl->filter_list);
+ }
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode)
+ htb_destroy_class(sch, cl);
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+ __skb_queue_purge(&q->direct_queue);
+}
+
+static int htb_delete(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)arg;
+ unsigned int qlen;
+ struct Qdisc *new_q = NULL;
+ int last_child = 0;
+
+ /* TODO: why don't allow to delete subtree ? references ? does
+ * tc subsys guarantee us that in htb_destroy it holds no class
+ * refs so that we can remove children safely there ?
+ */
+ if (cl->children || cl->filter_cnt)
+ return -EBUSY;
+
+ if (!cl->level && htb_parent_last_child(cl)) {
+ new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ cl->parent->common.classid);
+ last_child = 1;
+ }
+
+ sch_tree_lock(sch);
+
+ if (!cl->level) {
+ qlen = cl->un.leaf.q->q.qlen;
+ qdisc_reset(cl->un.leaf.q);
+ qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
+ }
+
+ /* delete from hash and active; remainder in destroy_class */
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+ if (cl->parent)
+ cl->parent->children--;
+
+ if (cl->prio_activity)
+ htb_deactivate(q, cl);
+
+ if (cl->cmode != HTB_CAN_SEND)
+ htb_safe_rb_erase(&cl->pq_node,
+ &q->hlevel[cl->level].wait_pq);
+
+ if (last_child)
+ htb_parent_to_leaf(q, cl, new_q);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void htb_put(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (--cl->refcnt == 0)
+ htb_destroy_class(sch, cl);
+}
+
+static int htb_change_class(struct Qdisc *sch, u32 classid,
+ u32 parentid, struct nlattr **tca,
+ unsigned long *arg)
+{
+ int err = -EINVAL;
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)*arg, *parent;
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct tc_htb_opt *hopt;
+ u64 rate64, ceil64;
+
+ /* extract all subattrs from opt attr */
+ if (!opt)
+ goto failure;
+
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
+ if (err < 0)
+ goto failure;
+
+ err = -EINVAL;
+ if (tb[TCA_HTB_PARMS] == NULL)
+ goto failure;
+
+ parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
+
+ hopt = nla_data(tb[TCA_HTB_PARMS]);
+ if (!hopt->rate.rate || !hopt->ceil.rate)
+ goto failure;
+
+ /* Keeping backward compatible with rate_table based iproute2 tc */
+ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
+
+ if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
+
+ if (!cl) { /* new class */
+ struct Qdisc *new_q;
+ int prio;
+ struct {
+ struct nlattr nla;
+ struct gnet_estimator opt;
+ } est = {
+ .nla = {
+ .nla_len = nla_attr_size(sizeof(est.opt)),
+ .nla_type = TCA_RATE,
+ },
+ .opt = {
+ /* 4s interval, 16s averaging constant */
+ .interval = 2,
+ .ewma_log = 2,
+ },
+ };
+
+ /* check for valid classid */
+ if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
+ htb_find(classid, sch))
+ goto failure;
+
+ /* check maximal depth */
+ if (parent && parent->parent && parent->parent->level < 2) {
+ pr_err("htb: tree is too deep\n");
+ goto failure;
+ }
+ err = -ENOBUFS;
+ cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+ if (!cl)
+ goto failure;
+
+ if (htb_rate_est || tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE] ? : &est.nla);
+ if (err) {
+ kfree(cl);
+ goto failure;
+ }
+ }
+
+ cl->refcnt = 1;
+ cl->children = 0;
+ INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+ RB_CLEAR_NODE(&cl->pq_node);
+
+ for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
+ RB_CLEAR_NODE(&cl->node[prio]);
+
+ /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
+ * so that can't be used inside of sch_tree_lock
+ * -- thanks to Karlis Peisenieks
+ */
+ new_q = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid);
+ sch_tree_lock(sch);
+ if (parent && !parent->level) {
+ unsigned int qlen = parent->un.leaf.q->q.qlen;
+
+ /* turn parent into inner node */
+ qdisc_reset(parent->un.leaf.q);
+ qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
+ qdisc_destroy(parent->un.leaf.q);
+ if (parent->prio_activity)
+ htb_deactivate(q, parent);
+
+ /* remove from evt list because of level change */
+ if (parent->cmode != HTB_CAN_SEND) {
+ htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
+ parent->cmode = HTB_CAN_SEND;
+ }
+ parent->level = (parent->parent ? parent->parent->level
+ : TC_HTB_MAXDEPTH) - 1;
+ memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ }
+ /* leaf (we) needs elementary qdisc */
+ cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
+
+ cl->common.classid = classid;
+ cl->parent = parent;
+
+ /* set class to be in HTB_CAN_SEND state */
+ cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
+ cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
+ cl->mbuffer = 60ULL * NSEC_PER_SEC; /* 1min */
+ cl->t_c = ktime_get_ns();
+ cl->cmode = HTB_CAN_SEND;
+
+ /* attach to the hash list and parent's family */
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ if (parent)
+ parent->children++;
+ } else {
+ if (tca[TCA_RATE]) {
+ spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ lock,
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+ sch_tree_lock(sch);
+ }
+
+ rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
+
+ ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
+
+ psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
+ psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
+
+ /* it used to be a nasty bug here, we have to check that node
+ * is really leaf before changing cl->un.leaf !
+ */
+ if (!cl->level) {
+ u64 quantum = cl->rate.rate_bytes_ps;
+
+ do_div(quantum, q->rate2quantum);
+ cl->quantum = min_t(u64, quantum, INT_MAX);
+
+ if (!hopt->quantum && cl->quantum < 1000) {
+ pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
+ cl->common.classid);
+ cl->quantum = 1000;
+ }
+ if (!hopt->quantum && cl->quantum > 200000) {
+ pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
+ cl->common.classid);
+ cl->quantum = 200000;
+ }
+ if (hopt->quantum)
+ cl->quantum = hopt->quantum;
+ if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
+ cl->prio = TC_HTB_NUMPRIO - 1;
+ }
+
+ cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
+ cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
+
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+failure:
+ return err;
+}
+
+static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch,
+ unsigned long arg)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl = (struct htb_class *)arg;
+ struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list;
+
+ return fl;
+}
+
+static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct htb_class *cl = htb_find(classid, sch);
+
+ /*if (cl && !cl->level) return 0;
+ * The line above used to be there to prevent attaching filters to
+ * leaves. But at least tc_index filter uses this just to get class
+ * for other reasons so that we have to allow for it.
+ * ----
+ * 19.6.2002 As Werner explained it is ok - bind filter is just
+ * another way to "lock" the class - unlike "get" this lock can
+ * be broken by class during destroy IIUC.
+ */
+ if (cl)
+ cl->filter_cnt++;
+ return (unsigned long)cl;
+}
+
+static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+ struct htb_class *cl = (struct htb_class *)arg;
+
+ if (cl)
+ cl->filter_cnt--;
+}
+
+static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct htb_sched *q = qdisc_priv(sch);
+ struct htb_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops htb_class_ops = {
+ .graft = htb_graft,
+ .leaf = htb_leaf,
+ .qlen_notify = htb_qlen_notify,
+ .get = htb_get,
+ .put = htb_put,
+ .change = htb_change_class,
+ .delete = htb_delete,
+ .walk = htb_walk,
+ .tcf_chain = htb_find_tcf,
+ .bind_tcf = htb_bind_filter,
+ .unbind_tcf = htb_unbind_filter,
+ .dump = htb_dump_class,
+ .dump_stats = htb_dump_class_stats,
+};
+
+static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
+ .cl_ops = &htb_class_ops,
+ .id = "htb",
+ .priv_size = sizeof(struct htb_sched),
+ .enqueue = htb_enqueue,
+ .dequeue = htb_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = htb_drop,
+ .init = htb_init,
+ .reset = htb_reset,
+ .destroy = htb_destroy,
+ .dump = htb_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init htb_module_init(void)
+{
+ return register_qdisc(&htb_qdisc_ops);
+}
+static void __exit htb_module_exit(void)
+{
+ unregister_qdisc(&htb_qdisc_ops);
+}
+
+module_init(htb_module_init)
+module_exit(htb_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
new file mode 100644
index 000000000..4cdbfb856
--- /dev/null
+++ b/net/sched/sch_ingress.c
@@ -0,0 +1,153 @@
+/* net/sched/sch_ingress.c - Ingress qdisc
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Jamal Hadi Salim 1999
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct ingress_qdisc_data {
+ struct tcf_proto __rcu *filter_list;
+};
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
+{
+ return TC_H_MIN(classid) + 1;
+}
+
+static unsigned long ingress_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return ingress_get(sch, classid);
+}
+
+static void ingress_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+}
+
+static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct ingress_qdisc_data *p = qdisc_priv(sch);
+
+ return &p->filter_list;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct ingress_qdisc_data *p = qdisc_priv(sch);
+ struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
+ int result;
+
+ result = tc_classify(skb, fl, &res);
+
+ qdisc_bstats_update(sch, skb);
+ switch (result) {
+ case TC_ACT_SHOT:
+ result = TC_ACT_SHOT;
+ qdisc_qstats_drop(sch);
+ break;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ result = TC_ACT_STOLEN;
+ break;
+ case TC_ACT_RECLASSIFY:
+ case TC_ACT_OK:
+ skb->tc_index = TC_H_MIN(res.classid);
+ default:
+ result = TC_ACT_OK;
+ break;
+ }
+
+ return result;
+}
+
+/* ------------------------------------------------------------- */
+
+static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ net_inc_ingress_queue();
+
+ return 0;
+}
+
+static void ingress_destroy(struct Qdisc *sch)
+{
+ struct ingress_qdisc_data *p = qdisc_priv(sch);
+
+ tcf_destroy_chain(&p->filter_list);
+ net_dec_ingress_queue();
+}
+
+static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static const struct Qdisc_class_ops ingress_class_ops = {
+ .leaf = ingress_leaf,
+ .get = ingress_get,
+ .put = ingress_put,
+ .walk = ingress_walk,
+ .tcf_chain = ingress_find_tcf,
+ .bind_tcf = ingress_bind_filter,
+ .unbind_tcf = ingress_put,
+};
+
+static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
+ .cl_ops = &ingress_class_ops,
+ .id = "ingress",
+ .priv_size = sizeof(struct ingress_qdisc_data),
+ .enqueue = ingress_enqueue,
+ .init = ingress_init,
+ .destroy = ingress_destroy,
+ .dump = ingress_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init ingress_module_init(void)
+{
+ return register_qdisc(&ingress_qdisc_ops);
+}
+
+static void __exit ingress_module_exit(void)
+{
+ unregister_qdisc(&ingress_qdisc_ops);
+}
+
+module_init(ingress_module_init)
+module_exit(ingress_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
new file mode 100644
index 000000000..f3cbaecd2
--- /dev/null
+++ b/net/sched/sch_mq.c
@@ -0,0 +1,246 @@
+/*
+ * net/sched/sch_mq.c Classful multiqueue dummy scheduler
+ *
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+struct mq_sched {
+ struct Qdisc **qdiscs;
+};
+
+static void mq_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (!priv->qdiscs)
+ return;
+ for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+}
+
+static int mq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ /* pre-allocate qdiscs, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (priv->qdiscs == NULL)
+ return -ENOMEM;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ dev_queue = netdev_get_tx_queue(dev, ntx);
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(ntx + 1)));
+ if (qdisc == NULL)
+ goto err;
+ priv->qdiscs[ntx] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
+ }
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+
+err:
+ mq_destroy(sch);
+ return -ENOMEM;
+}
+
+static void mq_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mq_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc, *old;
+ unsigned int ntx;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_destroy(old);
+#ifdef CONFIG_NET_SCHED
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_list_add(qdisc);
+#endif
+
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct Qdisc *qdisc;
+ unsigned int ntx;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+ spin_lock_bh(qdisc_lock(qdisc));
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ return 0;
+}
+
+static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1;
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm)
+{
+ unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
+ struct netdev_queue *dev_queue = mq_queue_get(sch, ntx);
+
+ if (!dev_queue) {
+ struct net_device *dev = qdisc_dev(sch);
+
+ return netdev_get_tx_queue(dev, 0);
+ }
+ return dev_queue;
+}
+
+static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE;
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+ return 0;
+}
+
+static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mq_get(struct Qdisc *sch, u32 classid)
+{
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (!mq_queue_get(sch, ntx))
+ return 0;
+ return ntx;
+}
+
+static void mq_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ return 0;
+}
+
+static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx;
+
+ if (arg->stop)
+ return;
+
+ arg->count = arg->skip;
+ for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mq_class_ops = {
+ .select_queue = mq_select_queue,
+ .graft = mq_graft,
+ .leaf = mq_leaf,
+ .get = mq_get,
+ .put = mq_put,
+ .walk = mq_walk,
+ .dump = mq_dump_class,
+ .dump_stats = mq_dump_class_stats,
+};
+
+struct Qdisc_ops mq_qdisc_ops __read_mostly = {
+ .cl_ops = &mq_class_ops,
+ .id = "mq",
+ .priv_size = sizeof(struct mq_sched),
+ .init = mq_init,
+ .destroy = mq_destroy,
+ .attach = mq_attach,
+ .dump = mq_dump,
+ .owner = THIS_MODULE,
+};
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 000000000..3811a7454
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,428 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+ struct Qdisc **qdiscs;
+ int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned int ntx;
+
+ if (priv->qdiscs) {
+ for (ntx = 0;
+ ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+ ntx++)
+ qdisc_destroy(priv->qdiscs[ntx]);
+ kfree(priv->qdiscs);
+ }
+
+ if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+ dev->netdev_ops->ndo_setup_tc(dev, 0);
+ else
+ netdev_set_num_tc(dev, 0);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int i, j;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc)
+ return -EINVAL;
+ }
+
+ /* net_device does not support requested operation */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
+
+ /* if hw owned qcount and qoffset are taken from LLD so
+ * no reason to verify them here
+ */
+ if (qopt->hw)
+ return 0;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j])
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+ int i, err = -EOPNOTSUPP;
+ struct tc_mqprio_qopt *qopt = NULL;
+
+ BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+ BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ if (!opt || nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ if (mqprio_parse_opt(dev, qopt))
+ return -EINVAL;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+ GFP_KERNEL);
+ if (priv->qdiscs == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)));
+ if (qdisc == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ priv->qdiscs[i] = qdisc;
+ qdisc->flags |= TCQ_F_ONETXQUEUE;
+ }
+
+ /* If the mqprio options indicate that hardware should own
+ * the queue mapping then run ndo_setup_tc otherwise use the
+ * supplied and verified mapping
+ */
+ if (qopt->hw) {
+ priv->hw_owned = 1;
+ err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+ if (err)
+ goto err;
+ } else {
+ netdev_set_num_tc(dev, qopt->num_tc);
+ for (i = 0; i < qopt->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ qopt->count[i], qopt->offset[i]);
+ }
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+ sch->flags |= TCQ_F_MQROOT;
+ return 0;
+
+err:
+ mqprio_destroy(sch);
+ return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct Qdisc *qdisc, *old;
+ unsigned int ntx;
+
+ /* Attach underlying qdisc */
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ qdisc = priv->qdiscs[ntx];
+ old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+ if (old)
+ qdisc_destroy(old);
+ if (ntx < dev->real_num_tx_queues)
+ qdisc_list_add(qdisc);
+ }
+ kfree(priv->qdiscs);
+ priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = dev_graft_qdisc(dev_queue, new);
+
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE;
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct Qdisc *qdisc;
+ unsigned int i;
+
+ sch->q.qlen = 0;
+ memset(&sch->bstats, 0, sizeof(sch->bstats));
+ memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
+ spin_lock_bh(qdisc_lock(qdisc));
+ sch->q.qlen += qdisc->q.qlen;
+ sch->bstats.bytes += qdisc->bstats.bytes;
+ sch->bstats.packets += qdisc->bstats.packets;
+ sch->qstats.backlog += qdisc->qstats.backlog;
+ sch->qstats.drops += qdisc->qstats.drops;
+ sch->qstats.requeues += qdisc->qstats.requeues;
+ sch->qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ opt.hw = priv->hw_owned;
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+ return 0;
+ return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_info = 0;
+ } else {
+ int i;
+ struct netdev_queue *dev_queue;
+
+ dev_queue = mqprio_queue_get(sch, cl);
+ tcm->tcm_parent = 0;
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ struct netdev_tc_txq tc = dev->tc_to_txq[i];
+ int q_idx = cl - netdev_get_num_tc(dev);
+
+ if (q_idx > tc.offset &&
+ q_idx <= tc.offset + tc.count) {
+ tcm->tcm_parent =
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1));
+ break;
+ }
+ }
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+ }
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ if (cl <= netdev_get_num_tc(dev)) {
+ int i;
+ __u32 qlen = 0;
+ struct Qdisc *qdisc;
+ struct gnet_stats_queue qstats = {0};
+ struct gnet_stats_basic_packed bstats = {0};
+ struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+ /* Drop lock here it will be reclaimed before touching
+ * statistics this is required because the d->lock we
+ * hold here is the look on dev_queue->qdisc_sleeping
+ * also acquired below.
+ */
+ spin_unlock_bh(d->lock);
+
+ for (i = tc.offset; i < tc.offset + tc.count; i++) {
+ struct netdev_queue *q = netdev_get_tx_queue(dev, i);
+
+ qdisc = rtnl_dereference(q->qdisc);
+ spin_lock_bh(qdisc_lock(qdisc));
+ qlen += qdisc->q.qlen;
+ bstats.bytes += qdisc->bstats.bytes;
+ bstats.packets += qdisc->bstats.packets;
+ qstats.backlog += qdisc->qstats.backlog;
+ qstats.drops += qdisc->qstats.drops;
+ qstats.requeues += qdisc->qstats.requeues;
+ qstats.overlimits += qdisc->qstats.overlimits;
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ /* Reclaim root sleeping lock before completing stats */
+ spin_lock_bh(d->lock);
+ if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
+ return -1;
+ } else {
+ struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL,
+ &sch->qstats, sch->q.qlen) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ /* Walk hierarchy with a virtual class per tc */
+ arg->count = arg->skip;
+ for (ntx = arg->skip;
+ ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+ ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+ .graft = mqprio_graft,
+ .leaf = mqprio_leaf,
+ .get = mqprio_get,
+ .put = mqprio_put,
+ .walk = mqprio_walk,
+ .dump = mqprio_dump_class,
+ .dump_stats = mqprio_dump_class_stats,
+};
+
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+ .cl_ops = &mqprio_class_ops,
+ .id = "mqprio",
+ .priv_size = sizeof(struct mqprio_sched),
+ .init = mqprio_init,
+ .destroy = mqprio_destroy,
+ .attach = mqprio_attach,
+ .dump = mqprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+ return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+ unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 000000000..42dd21887
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct multiq_sched_data {
+ u16 bands;
+ u16 max_bands;
+ u16 curband;
+ struct tcf_proto __rcu *filter_list;
+ struct Qdisc **queues;
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ u32 band;
+ struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ err = tc_classify(skb, fl, &res);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ band = skb_get_queue_mapping(skb);
+
+ if (band >= q->bands)
+ return q->queues[0];
+
+ return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+ }
+#endif
+
+ ret = qdisc_enqueue(skb, qdisc);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+}
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ struct sk_buff *skb;
+ int band;
+
+ for (band = 0; band < q->bands; band++) {
+ /* cycle through bands to ensure fairness */
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+
+ /* Check that target subqueue is available before
+ * pulling an skb to avoid head-of-line blocking.
+ */
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ }
+ return NULL;
+
+}
+
+static struct sk_buff *multiq_peek(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned int curband = q->curband;
+ struct Qdisc *qdisc;
+ struct sk_buff *skb;
+ int band;
+
+ for (band = 0; band < q->bands; band++) {
+ /* cycle through bands to ensure fairness */
+ curband++;
+ if (curband >= q->bands)
+ curband = 0;
+
+ /* Check that target subqueue is available before
+ * pulling an skb to avoid head-of-line blocking.
+ */
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), curband))) {
+ qdisc = q->queues[curband];
+ skb = qdisc->ops->peek(qdisc);
+ if (skb)
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static unsigned int multiq_drop(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ int band;
+ unsigned int len;
+ struct Qdisc *qdisc;
+
+ for (band = q->bands - 1; band >= 0; band--) {
+ qdisc = q->queues[band];
+ if (qdisc->ops->drop) {
+ len = qdisc->ops->drop(qdisc);
+ if (len != 0) {
+ sch->q.qlen--;
+ return len;
+ }
+ }
+ }
+ return 0;
+}
+
+
+static void
+multiq_reset(struct Qdisc *sch)
+{
+ u16 band;
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ for (band = 0; band < q->bands; band++)
+ qdisc_reset(q->queues[band]);
+ sch->q.qlen = 0;
+ q->curband = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{
+ int band;
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ for (band = 0; band < q->bands; band++)
+ qdisc_destroy(q->queues[band]);
+
+ kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct tc_multiq_qopt *qopt;
+ int i;
+
+ if (!netif_is_multiqueue(qdisc_dev(sch)))
+ return -EOPNOTSUPP;
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+
+ qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+
+ sch_tree_lock(sch);
+ q->bands = qopt->bands;
+ for (i = q->bands; i < q->max_bands; i++) {
+ if (q->queues[i] != &noop_qdisc) {
+ struct Qdisc *child = q->queues[i];
+ q->queues[i] = &noop_qdisc;
+ qdisc_tree_decrease_qlen(child, child->q.qlen);
+ qdisc_destroy(child);
+ }
+ }
+
+ sch_tree_unlock(sch);
+
+ for (i = 0; i < q->bands; i++) {
+ if (q->queues[i] == &noop_qdisc) {
+ struct Qdisc *child, *old;
+ child = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle,
+ i + 1));
+ if (child) {
+ sch_tree_lock(sch);
+ old = q->queues[i];
+ q->queues[i] = child;
+
+ if (old != &noop_qdisc) {
+ qdisc_tree_decrease_qlen(old,
+ old->q.qlen);
+ qdisc_destroy(old);
+ }
+ sch_tree_unlock(sch);
+ }
+ }
+ }
+ return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ int i, err;
+
+ q->queues = NULL;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ q->max_bands = qdisc_dev(sch)->num_tx_queues;
+
+ q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
+ if (!q->queues)
+ return -ENOBUFS;
+ for (i = 0; i < q->max_bands; i++)
+ q->queues[i] = &noop_qdisc;
+
+ err = multiq_tune(sch, opt);
+
+ if (err)
+ kfree(q->queues);
+
+ return err;
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_multiq_qopt opt;
+
+ opt.bands = q->bands;
+ opt.max_bands = q->max_bands;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->queues[band];
+ q->queues[band] = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ return q->queues[band];
+}
+
+static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ unsigned long band = TC_H_MIN(classid);
+
+ if (band - 1 >= q->bands)
+ return 0;
+ return band;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return multiq_get(sch, classid);
+}
+
+
+static void multiq_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = q->queues[cl - 1]->handle;
+ return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *cl_q;
+
+ cl_q = q->queues[cl - 1];
+ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ int band;
+
+ if (arg->stop)
+ return;
+
+ for (band = 0; band < q->bands; band++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, band + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {
+ .graft = multiq_graft,
+ .leaf = multiq_leaf,
+ .get = multiq_get,
+ .put = multiq_put,
+ .walk = multiq_walk,
+ .tcf_chain = multiq_find_tcf,
+ .bind_tcf = multiq_bind,
+ .unbind_tcf = multiq_put,
+ .dump = multiq_dump_class,
+ .dump_stats = multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &multiq_class_ops,
+ .id = "multiq",
+ .priv_size = sizeof(struct multiq_sched_data),
+ .enqueue = multiq_enqueue,
+ .dequeue = multiq_dequeue,
+ .peek = multiq_peek,
+ .drop = multiq_drop,
+ .init = multiq_init,
+ .reset = multiq_reset,
+ .destroy = multiq_destroy,
+ .change = multiq_tune,
+ .dump = multiq_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{
+ return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{
+ unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
new file mode 100644
index 000000000..956ead2ca
--- /dev/null
+++ b/net/sched/sch_netem.c
@@ -0,0 +1,1116 @@
+/*
+ * net/sched/sch_netem.c Network emulator
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License.
+ *
+ * Many of the algorithms and ideas for this came from
+ * NIST Net which is not copyrighted.
+ *
+ * Authors: Stephen Hemminger <shemminger@osdl.org>
+ * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <linux/reciprocal_div.h>
+#include <linux/rbtree.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define VERSION "1.3"
+
+/* Network Emulation Queuing algorithm.
+ ====================================
+
+ Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
+ Network Emulation Tool
+ [2] Luigi Rizzo, DummyNet for FreeBSD
+
+ ----------------------------------------------------------------
+
+ This started out as a simple way to delay outgoing packets to
+ test TCP but has grown to include most of the functionality
+ of a full blown network emulator like NISTnet. It can delay
+ packets and add random jitter (and correlation). The random
+ distribution can be loaded from a table as well to provide
+ normal, Pareto, or experimental curves. Packet loss,
+ duplication, and reordering can also be emulated.
+
+ This qdisc does not do classification that can be handled in
+ layering other disciplines. It does not need to do bandwidth
+ control either since that can be handled by using token
+ bucket or other rate control.
+
+ Correlated Loss Generator models
+
+ Added generation of correlated loss according to the
+ "Gilbert-Elliot" model, a 4-state markov model.
+
+ References:
+ [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+ [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+ and intuitive loss model for packet networks and its implementation
+ in the Netem module in the Linux kernel", available in [1]
+
+ Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+ Fabio Ludovici <fabio.ludovici at yahoo.it>
+*/
+
+struct netem_sched_data {
+ /* internal t(ime)fifo qdisc uses t_root and sch->limit */
+ struct rb_root t_root;
+
+ /* optional qdisc for classful handling (NULL at netem init) */
+ struct Qdisc *qdisc;
+
+ struct qdisc_watchdog watchdog;
+
+ psched_tdiff_t latency;
+ psched_tdiff_t jitter;
+
+ u32 loss;
+ u32 ecn;
+ u32 limit;
+ u32 counter;
+ u32 gap;
+ u32 duplicate;
+ u32 reorder;
+ u32 corrupt;
+ u64 rate;
+ s32 packet_overhead;
+ u32 cell_size;
+ struct reciprocal_value cell_size_reciprocal;
+ s32 cell_overhead;
+
+ struct crndstate {
+ u32 last;
+ u32 rho;
+ } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
+
+ struct disttable {
+ u32 size;
+ s16 table[0];
+ } *delay_dist;
+
+ enum {
+ CLG_RANDOM,
+ CLG_4_STATES,
+ CLG_GILB_ELL,
+ } loss_model;
+
+ enum {
+ TX_IN_GAP_PERIOD = 1,
+ TX_IN_BURST_PERIOD,
+ LOST_IN_GAP_PERIOD,
+ LOST_IN_BURST_PERIOD,
+ } _4_state_model;
+
+ enum {
+ GOOD_STATE = 1,
+ BAD_STATE,
+ } GE_state_model;
+
+ /* Correlated Loss Generation models */
+ struct clgstate {
+ /* state of the Markov chain */
+ u8 state;
+
+ /* 4-states and Gilbert-Elliot models */
+ u32 a1; /* p13 for 4-states or p for GE */
+ u32 a2; /* p31 for 4-states or r for GE */
+ u32 a3; /* p32 for 4-states or h for GE */
+ u32 a4; /* p14 for 4-states or 1-k for GE */
+ u32 a5; /* p23 used only in 4-states */
+ } clg;
+
+};
+
+/* Time stamp put into socket buffer control block
+ * Only valid when skbs are in our internal t(ime)fifo queue.
+ *
+ * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
+ * and skb->next & skb->prev are scratch space for a qdisc,
+ * we save skb->tstamp value in skb->cb[] before destroying it.
+ */
+struct netem_skb_cb {
+ psched_time_t time_to_send;
+ ktime_t tstamp_save;
+};
+
+
+static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
+{
+ return container_of(rb, struct sk_buff, rbnode);
+}
+
+static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
+{
+ /* we assume we can use skb next/prev/tstamp as storage for rb_node */
+ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
+ return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/* init_crandom - initialize correlated random number generator
+ * Use entropy source for initial seed.
+ */
+static void init_crandom(struct crndstate *state, unsigned long rho)
+{
+ state->rho = rho;
+ state->last = prandom_u32();
+}
+
+/* get_crandom - correlated random number generator
+ * Next number depends on last value.
+ * rho is scaled to avoid floating point.
+ */
+static u32 get_crandom(struct crndstate *state)
+{
+ u64 value, rho;
+ unsigned long answer;
+
+ if (state->rho == 0) /* no correlation */
+ return prandom_u32();
+
+ value = prandom_u32();
+ rho = (u64)state->rho + 1;
+ answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
+ state->last = answer;
+ return answer;
+}
+
+/* loss_4state - 4-state model loss generator
+ * Generates losses according to the 4-state Markov chain adopted in
+ * the GI (General and Intuitive) loss model.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+ u32 rnd = prandom_u32();
+
+ /*
+ * Makes a comparison between rnd and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state and if the next packet has to be transmitted or lost.
+ * The four states correspond to:
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
+ * LOST_IN_BURST_PERIOD => isolated losses within a gap period
+ * LOST_IN_GAP_PERIOD => lost packets within a burst period
+ * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
+ */
+ switch (clg->state) {
+ case TX_IN_GAP_PERIOD:
+ if (rnd < clg->a4) {
+ clg->state = LOST_IN_BURST_PERIOD;
+ return true;
+ } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else if (clg->a1 + clg->a4 < rnd) {
+ clg->state = TX_IN_GAP_PERIOD;
+ }
+
+ break;
+ case TX_IN_BURST_PERIOD:
+ if (rnd < clg->a5) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ } else {
+ clg->state = TX_IN_BURST_PERIOD;
+ }
+
+ break;
+ case LOST_IN_GAP_PERIOD:
+ if (rnd < clg->a3)
+ clg->state = TX_IN_BURST_PERIOD;
+ else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ clg->state = TX_IN_GAP_PERIOD;
+ } else if (clg->a2 + clg->a3 < rnd) {
+ clg->state = LOST_IN_GAP_PERIOD;
+ return true;
+ }
+ break;
+ case LOST_IN_BURST_PERIOD:
+ clg->state = TX_IN_GAP_PERIOD;
+ break;
+ }
+
+ return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert)
+ *
+ * Makes a comparison between random number and the transition
+ * probabilities outgoing from the current state, then decides the
+ * next state. A second random number is extracted and the comparison
+ * with the loss probability of the current state decides if the next
+ * packet will be transmitted or lost.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+ struct clgstate *clg = &q->clg;
+
+ switch (clg->state) {
+ case GOOD_STATE:
+ if (prandom_u32() < clg->a1)
+ clg->state = BAD_STATE;
+ if (prandom_u32() < clg->a4)
+ return true;
+ break;
+ case BAD_STATE:
+ if (prandom_u32() < clg->a2)
+ clg->state = GOOD_STATE;
+ if (prandom_u32() > clg->a3)
+ return true;
+ }
+
+ return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* Random packet drop 0 => none, ~0 => all */
+ return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+ case CLG_4_STATES:
+ /* 4state loss model algorithm (used also for GI model)
+ * Extracts a value from the markov 4 state loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_4state(q);
+
+ case CLG_GILB_ELL:
+ /* Gilbert-Elliot loss model algorithm
+ * Extracts a value from the Gilbert-Elliot loss generator,
+ * if it is 1 drops a packet and if needed writes the event in
+ * the kernel logs
+ */
+ return loss_gilb_ell(q);
+ }
+
+ return false; /* not reached */
+}
+
+
+/* tabledist - return a pseudo-randomly distributed value with mean mu and
+ * std deviation sigma. Uses table lookup to approximate the desired
+ * distribution, and a uniformly-distributed pseudo-random source.
+ */
+static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
+ struct crndstate *state,
+ const struct disttable *dist)
+{
+ psched_tdiff_t x;
+ long t;
+ u32 rnd;
+
+ if (sigma == 0)
+ return mu;
+
+ rnd = get_crandom(state);
+
+ /* default uniform distribution */
+ if (dist == NULL)
+ return (rnd % (2*sigma)) - sigma + mu;
+
+ t = dist->table[rnd % dist->size];
+ x = (sigma % NETEM_DIST_SCALE) * t;
+ if (x >= 0)
+ x += NETEM_DIST_SCALE/2;
+ else
+ x -= NETEM_DIST_SCALE/2;
+
+ return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
+}
+
+static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
+{
+ u64 ticks;
+
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
+
+ if (len > cells * q->cell_size) /* extra cell needed for remainder */
+ cells++;
+ len = cells * (q->cell_size + q->cell_overhead);
+ }
+
+ ticks = (u64)len * NSEC_PER_SEC;
+
+ do_div(ticks, q->rate);
+ return PSCHED_NS2TICKS(ticks);
+}
+
+static void tfifo_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ while ((p = rb_first(&q->t_root))) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ skb->next = NULL;
+ skb->prev = NULL;
+ kfree_skb(skb);
+ }
+}
+
+static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+ struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = netem_rb_to_skb(parent);
+ if (tnext >= netem_skb_cb(skb)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->t_root);
+ sch->q.qlen++;
+}
+
+/*
+ * Insert one skb into qdisc.
+ * Note: parent depends on return value to account for queue length.
+ * NET_XMIT_DROP: queue length didn't change.
+ * NET_XMIT_SUCCESS: one skb was queued.
+ */
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ /* We don't fill cb now as skb_unshare() may invalidate it */
+ struct netem_skb_cb *cb;
+ struct sk_buff *skb2;
+ int count = 1;
+
+ /* Random duplication */
+ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+ ++count;
+
+ /* Drop packet? */
+ if (loss_event(q)) {
+ if (q->ecn && INET_ECN_set_ce(skb))
+ qdisc_qstats_drop(sch); /* mark packet */
+ else
+ --count;
+ }
+ if (count == 0) {
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ }
+
+ /* If a delay is expected, orphan the skb. (orphaning usually takes
+ * place at TX completion time, so _before_ the link transit delay)
+ */
+ if (q->latency || q->jitter)
+ skb_orphan_partial(skb);
+
+ /*
+ * If we need to duplicate packet, then re-insert at top of the
+ * qdisc tree, since parent queuer expects that only one
+ * skb will be queued.
+ */
+ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+ struct Qdisc *rootq = qdisc_root(sch);
+ u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+ q->duplicate = 0;
+
+ qdisc_enqueue_root(skb2, rootq);
+ q->duplicate = dupsave;
+ }
+
+ /*
+ * Randomized packet corruption.
+ * Make copy if needed since we are modifying
+ * If packet is going to be hardware checksummed, then
+ * do it now in software before we mangle it.
+ */
+ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+ if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb)))
+ return qdisc_drop(skb, sch);
+
+ skb->data[prandom_u32() % skb_headlen(skb)] ^=
+ 1<<(prandom_u32() % 8);
+ }
+
+ if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+ return qdisc_reshape_fail(skb, sch);
+
+ qdisc_qstats_backlog_inc(sch, skb);
+
+ cb = netem_skb_cb(skb);
+ if (q->gap == 0 || /* not doing reordering */
+ q->counter < q->gap - 1 || /* inside last reordering gap */
+ q->reorder < get_crandom(&q->reorder_cor)) {
+ psched_time_t now;
+ psched_tdiff_t delay;
+
+ delay = tabledist(q->latency, q->jitter,
+ &q->delay_cor, q->delay_dist);
+
+ now = psched_get_time();
+
+ if (q->rate) {
+ struct sk_buff *last;
+
+ if (!skb_queue_empty(&sch->q))
+ last = skb_peek_tail(&sch->q);
+ else
+ last = netem_rb_to_skb(rb_last(&q->t_root));
+ if (last) {
+ /*
+ * Last packet in queue is reference point (now),
+ * calculate this time bonus and subtract
+ * from delay.
+ */
+ delay -= netem_skb_cb(last)->time_to_send - now;
+ delay = max_t(psched_tdiff_t, 0, delay);
+ now = netem_skb_cb(last)->time_to_send;
+ }
+
+ delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
+ }
+
+ cb->time_to_send = now + delay;
+ cb->tstamp_save = skb->tstamp;
+ ++q->counter;
+ tfifo_enqueue(skb, sch);
+ } else {
+ /*
+ * Do re-ordering by putting one out of N packets at the front
+ * of the queue.
+ */
+ cb->time_to_send = psched_get_time();
+ q->counter = 0;
+
+ __skb_queue_head(&sch->q, skb);
+ sch->qstats.requeues++;
+ }
+
+ return NET_XMIT_SUCCESS;
+}
+
+static unsigned int netem_drop(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+
+ if (!len) {
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (p) {
+ struct sk_buff *skb = netem_rb_to_skb(p);
+
+ rb_erase(p, &q->t_root);
+ sch->q.qlen--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ qdisc_qstats_backlog_dec(sch, skb);
+ kfree_skb(skb);
+ }
+ }
+ if (!len && q->qdisc && q->qdisc->ops->drop)
+ len = q->qdisc->ops->drop(q->qdisc);
+ if (len)
+ qdisc_qstats_drop(sch);
+
+ return len;
+}
+
+static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ struct rb_node *p;
+
+ if (qdisc_is_throttled(sch))
+ return NULL;
+
+tfifo_dequeue:
+ skb = __skb_dequeue(&sch->q);
+ if (skb) {
+ qdisc_qstats_backlog_dec(sch, skb);
+deliver:
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+ p = rb_first(&q->t_root);
+ if (p) {
+ psched_time_t time_to_send;
+
+ skb = netem_rb_to_skb(p);
+
+ /* if more time remaining? */
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+ if (time_to_send <= psched_get_time()) {
+ rb_erase(p, &q->t_root);
+
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->tstamp = netem_skb_cb(skb)->tstamp_save;
+
+#ifdef CONFIG_NET_CLS_ACT
+ /*
+ * If it's at ingress let's pretend the delay is
+ * from the network (tstamp will be updated).
+ */
+ if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+ skb->tstamp.tv64 = 0;
+#endif
+
+ if (q->qdisc) {
+ int err = qdisc_enqueue(skb, q->qdisc);
+
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ qdisc_qstats_drop(sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ }
+ }
+ goto tfifo_dequeue;
+ }
+ goto deliver;
+ }
+
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+ qdisc_watchdog_schedule(&q->watchdog, time_to_send);
+ }
+
+ if (q->qdisc) {
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ goto deliver;
+ }
+ return NULL;
+}
+
+static void netem_reset(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
+ tfifo_reset(sch);
+ if (q->qdisc)
+ qdisc_reset(q->qdisc);
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static void dist_free(struct disttable *d)
+{
+ kvfree(d);
+}
+
+/*
+ * Distribution data is a variable size payload containing
+ * signed 16 bit values.
+ */
+static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ size_t n = nla_len(attr)/sizeof(__s16);
+ const __s16 *data = nla_data(attr);
+ spinlock_t *root_lock;
+ struct disttable *d;
+ int i;
+ size_t s;
+
+ if (n > NETEM_DIST_MAX)
+ return -EINVAL;
+
+ s = sizeof(struct disttable) + n * sizeof(s16);
+ d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
+ if (!d)
+ d = vmalloc(s);
+ if (!d)
+ return -ENOMEM;
+
+ d->size = n;
+ for (i = 0; i < n; i++)
+ d->table[i] = data[i];
+
+ root_lock = qdisc_root_sleeping_lock(sch);
+
+ spin_lock_bh(root_lock);
+ swap(q->delay_dist, d);
+ spin_unlock_bh(root_lock);
+
+ dist_free(d);
+ return 0;
+}
+
+static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_corr *c = nla_data(attr);
+
+ init_crandom(&q->delay_cor, c->delay_corr);
+ init_crandom(&q->loss_cor, c->loss_corr);
+ init_crandom(&q->dup_cor, c->dup_corr);
+}
+
+static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_reorder *r = nla_data(attr);
+
+ q->reorder = r->probability;
+ init_crandom(&q->reorder_cor, r->correlation);
+}
+
+static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_corrupt *r = nla_data(attr);
+
+ q->corrupt = r->probability;
+ init_crandom(&q->corrupt_cor, r->correlation);
+}
+
+static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct tc_netem_rate *r = nla_data(attr);
+
+ q->rate = r->rate;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ q->cell_overhead = r->cell_overhead;
+ if (q->cell_size)
+ q->cell_size_reciprocal = reciprocal_value(q->cell_size);
+ else
+ q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
+}
+
+static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
+{
+ const struct nlattr *la;
+ int rem;
+
+ nla_for_each_nested(la, attr, rem) {
+ u16 type = nla_type(la);
+
+ switch (type) {
+ case NETEM_LOSS_GI: {
+ const struct tc_netem_gimodel *gi = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
+ pr_info("netem: incorrect gi model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_4_STATES;
+
+ q->clg.state = TX_IN_GAP_PERIOD;
+ q->clg.a1 = gi->p13;
+ q->clg.a2 = gi->p31;
+ q->clg.a3 = gi->p32;
+ q->clg.a4 = gi->p14;
+ q->clg.a5 = gi->p23;
+ break;
+ }
+
+ case NETEM_LOSS_GE: {
+ const struct tc_netem_gemodel *ge = nla_data(la);
+
+ if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
+ pr_info("netem: incorrect ge model size\n");
+ return -EINVAL;
+ }
+
+ q->loss_model = CLG_GILB_ELL;
+ q->clg.state = GOOD_STATE;
+ q->clg.a1 = ge->p;
+ q->clg.a2 = ge->r;
+ q->clg.a3 = ge->h;
+ q->clg.a4 = ge->k1;
+ break;
+ }
+
+ default:
+ pr_info("netem: unknown loss type %u\n", type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
+ [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
+ [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
+ [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
+ [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
+ [TCA_NETEM_ECN] = { .type = NLA_U32 },
+ [TCA_NETEM_RATE64] = { .type = NLA_U64 },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy, int len)
+{
+ int nested_len = nla_len(nla) - NLA_ALIGN(len);
+
+ if (nested_len < 0) {
+ pr_info("netem: invalid attributes len %d\n", nested_len);
+ return -EINVAL;
+ }
+
+ if (nested_len >= nla_attr_size(0))
+ return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+ nested_len, policy);
+
+ memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+ return 0;
+}
+
+/* Parse netlink message to set options */
+static int netem_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_NETEM_MAX + 1];
+ struct tc_netem_qopt *qopt;
+ struct clgstate old_clg;
+ int old_loss_model = CLG_RANDOM;
+ int ret;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ qopt = nla_data(opt);
+ ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
+ if (ret < 0)
+ return ret;
+
+ /* backup q->clg and q->loss_model */
+ old_clg = q->clg;
+ old_loss_model = q->loss_model;
+
+ if (tb[TCA_NETEM_LOSS]) {
+ ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
+ if (ret) {
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ } else {
+ q->loss_model = CLG_RANDOM;
+ }
+
+ if (tb[TCA_NETEM_DELAY_DIST]) {
+ ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
+ if (ret) {
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
+ }
+ }
+
+ sch->limit = qopt->limit;
+
+ q->latency = qopt->latency;
+ q->jitter = qopt->jitter;
+ q->limit = qopt->limit;
+ q->gap = qopt->gap;
+ q->counter = 0;
+ q->loss = qopt->loss;
+ q->duplicate = qopt->duplicate;
+
+ /* for compatibility with earlier versions.
+ * if gap is set, need to assume 100% probability
+ */
+ if (q->gap)
+ q->reorder = ~0;
+
+ if (tb[TCA_NETEM_CORR])
+ get_correlation(q, tb[TCA_NETEM_CORR]);
+
+ if (tb[TCA_NETEM_REORDER])
+ get_reorder(q, tb[TCA_NETEM_REORDER]);
+
+ if (tb[TCA_NETEM_CORRUPT])
+ get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
+
+ if (tb[TCA_NETEM_RATE])
+ get_rate(q, tb[TCA_NETEM_RATE]);
+
+ if (tb[TCA_NETEM_RATE64])
+ q->rate = max_t(u64, q->rate,
+ nla_get_u64(tb[TCA_NETEM_RATE64]));
+
+ if (tb[TCA_NETEM_ECN])
+ q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
+
+ return ret;
+}
+
+static int netem_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ int ret;
+
+ if (!opt)
+ return -EINVAL;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ q->loss_model = CLG_RANDOM;
+ ret = netem_change(sch, opt);
+ if (ret)
+ pr_info("netem: change failed\n");
+ return ret;
+}
+
+static void netem_destroy(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ if (q->qdisc)
+ qdisc_destroy(q->qdisc);
+ dist_free(q->delay_dist);
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+ struct sk_buff *skb)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ switch (q->loss_model) {
+ case CLG_RANDOM:
+ /* legacy loss model */
+ nla_nest_cancel(skb, nest);
+ return 0; /* no data */
+
+ case CLG_4_STATES: {
+ struct tc_netem_gimodel gi = {
+ .p13 = q->clg.a1,
+ .p31 = q->clg.a2,
+ .p32 = q->clg.a3,
+ .p14 = q->clg.a4,
+ .p23 = q->clg.a5,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
+ goto nla_put_failure;
+ break;
+ }
+ case CLG_GILB_ELL: {
+ struct tc_netem_gemodel ge = {
+ .p = q->clg.a1,
+ .r = q->clg.a2,
+ .h = q->clg.a3,
+ .k1 = q->clg.a4,
+ };
+
+ if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
+ goto nla_put_failure;
+ break;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ const struct netem_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
+ struct tc_netem_qopt qopt;
+ struct tc_netem_corr cor;
+ struct tc_netem_reorder reorder;
+ struct tc_netem_corrupt corrupt;
+ struct tc_netem_rate rate;
+
+ qopt.latency = q->latency;
+ qopt.jitter = q->jitter;
+ qopt.limit = q->limit;
+ qopt.loss = q->loss;
+ qopt.gap = q->gap;
+ qopt.duplicate = q->duplicate;
+ if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
+ goto nla_put_failure;
+
+ cor.delay_corr = q->delay_cor.rho;
+ cor.loss_corr = q->loss_cor.rho;
+ cor.dup_corr = q->dup_cor.rho;
+ if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
+ goto nla_put_failure;
+
+ reorder.probability = q->reorder;
+ reorder.correlation = q->reorder_cor.rho;
+ if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
+ goto nla_put_failure;
+
+ corrupt.probability = q->corrupt;
+ corrupt.correlation = q->corrupt_cor.rho;
+ if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
+ goto nla_put_failure;
+
+ if (q->rate >= (1ULL << 32)) {
+ if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+ goto nla_put_failure;
+ rate.rate = ~0U;
+ } else {
+ rate.rate = q->rate;
+ }
+ rate.packet_overhead = q->packet_overhead;
+ rate.cell_size = q->cell_size;
+ rate.cell_overhead = q->cell_overhead;
+ if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
+ goto nla_put_failure;
+
+ if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
+ goto nla_put_failure;
+
+ if (dump_loss_model(q, skb) != 0)
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nla);
+
+nla_put_failure:
+ nlmsg_trim(skb, nla);
+ return -1;
+}
+
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ if (*old) {
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ }
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void netem_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+ .graft = netem_graft,
+ .leaf = netem_leaf,
+ .get = netem_get,
+ .put = netem_put,
+ .walk = netem_walk,
+ .dump = netem_dump_class,
+};
+
+static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
+ .id = "netem",
+ .cl_ops = &netem_class_ops,
+ .priv_size = sizeof(struct netem_sched_data),
+ .enqueue = netem_enqueue,
+ .dequeue = netem_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = netem_drop,
+ .init = netem_init,
+ .reset = netem_reset,
+ .destroy = netem_destroy,
+ .change = netem_change,
+ .dump = netem_dump,
+ .owner = THIS_MODULE,
+};
+
+
+static int __init netem_module_init(void)
+{
+ pr_info("netem: version " VERSION "\n");
+ return register_qdisc(&netem_qdisc_ops);
+}
+static void __exit netem_module_exit(void)
+{
+ unregister_qdisc(&netem_qdisc_ops);
+}
+module_init(netem_module_init)
+module_exit(netem_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
new file mode 100644
index 000000000..b783a446d
--- /dev/null
+++ b/net/sched/sch_pie.c
@@ -0,0 +1,566 @@
+/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Author: Vijay Subramanian <vijaynsu@cisco.com>
+ * Author: Mythili Prabhu <mysuryan@cisco.com>
+ *
+ * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
+ * University of Oslo, Norway.
+ *
+ * References:
+ * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
+ * IEEE Conference on High Performance Switching and Routing 2013 :
+ * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+#define QUEUE_THRESHOLD 10000
+#define DQCOUNT_INVALID -1
+#define MAX_PROB 0xffffffff
+#define PIE_SCALE 8
+
+/* parameters used */
+struct pie_params {
+ psched_time_t target; /* user specified target delay in pschedtime */
+ u32 tupdate; /* timer frequency (in jiffies) */
+ u32 limit; /* number of packets that can be enqueued */
+ u32 alpha; /* alpha and beta are between 0 and 32 */
+ u32 beta; /* and are used for shift relative to 1 */
+ bool ecn; /* true if ecn is enabled */
+ bool bytemode; /* to scale drop early prob based on pkt size */
+};
+
+/* variables used */
+struct pie_vars {
+ u32 prob; /* probability but scaled by u32 limit. */
+ psched_time_t burst_time;
+ psched_time_t qdelay;
+ psched_time_t qdelay_old;
+ u64 dq_count; /* measured in bytes */
+ psched_time_t dq_tstamp; /* drain rate */
+ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
+ u32 qlen_old; /* in bytes */
+};
+
+/* statistics gathering */
+struct pie_stats {
+ u32 packets_in; /* total number of packets enqueued */
+ u32 dropped; /* packets dropped due to pie_action */
+ u32 overlimit; /* dropped due to lack of space in queue */
+ u32 maxq; /* maximum queue size */
+ u32 ecn_mark; /* packets marked with ECN */
+};
+
+/* private data for the Qdisc */
+struct pie_sched_data {
+ struct pie_params params;
+ struct pie_vars vars;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static void pie_params_init(struct pie_params *params)
+{
+ params->alpha = 2;
+ params->beta = 20;
+ params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
+ params->limit = 1000; /* default of 1000 packets */
+ params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
+ params->ecn = false;
+ params->bytemode = false;
+}
+
+static void pie_vars_init(struct pie_vars *vars)
+{
+ vars->dq_count = DQCOUNT_INVALID;
+ vars->avg_dq_rate = 0;
+ /* default of 100 ms in pschedtime */
+ vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+}
+
+static bool drop_early(struct Qdisc *sch, u32 packet_size)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 rnd;
+ u32 local_prob = q->vars.prob;
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+
+ /* If there is still burst allowance left skip random early drop */
+ if (q->vars.burst_time > 0)
+ return false;
+
+ /* If current delay is less than half of target, and
+ * if drop prob is low already, disable early_drop
+ */
+ if ((q->vars.qdelay < q->params.target / 2)
+ && (q->vars.prob < MAX_PROB / 5))
+ return false;
+
+ /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ * similar to min_th in RED
+ */
+ if (sch->qstats.backlog < 2 * mtu)
+ return false;
+
+ /* If bytemode is turned on, use packet size to compute new
+ * probablity. Smaller packets will have lower drop prob in this case
+ */
+ if (q->params.bytemode && packet_size <= mtu)
+ local_prob = (local_prob / mtu) * packet_size;
+ else
+ local_prob = q->vars.prob;
+
+ rnd = prandom_u32();
+ if (rnd < local_prob)
+ return true;
+
+ return false;
+}
+
+static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ bool enqueue = false;
+
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ }
+
+ if (!drop_early(sch, skb->len)) {
+ enqueue = true;
+ } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than 10%, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+
+ /* we can enqueue the packet */
+ if (enqueue) {
+ q->stats.packets_in++;
+ if (qdisc_qlen(sch) > q->stats.maxq)
+ q->stats.maxq = qdisc_qlen(sch);
+
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+out:
+ q->stats.dropped++;
+ return qdisc_drop(skb, sch);
+}
+
+static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+};
+
+static int pie_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_PIE_MAX + 1];
+ unsigned int qlen;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_PIE_TUPDATE])
+ q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+
+ if (tb[TCA_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
+
+ q->params.limit = limit;
+ sch->limit = limit;
+ }
+
+ if (tb[TCA_PIE_ALPHA])
+ q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
+
+ if (tb[TCA_PIE_BETA])
+ q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
+
+ if (tb[TCA_PIE_ECN])
+ q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
+
+ if (tb[TCA_PIE_BYTEMODE])
+ q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+
+ /* Drop excess packets if new limit is lower */
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = __skb_dequeue(&sch->q);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+{
+
+ struct pie_sched_data *q = qdisc_priv(sch);
+ int qlen = sch->qstats.backlog; /* current queue size in bytes */
+
+ /* If current queue is about 10 packets or more and dq_count is unset
+ * we have enough packets to calculate the drain rate. Save
+ * current time as dq_tstamp and start measurement cycle.
+ */
+ if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
+ q->vars.dq_tstamp = psched_get_time();
+ q->vars.dq_count = 0;
+ }
+
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ * the dq_count to -1 as we don't have enough packets to calculate the
+ * drain rate anymore The following if block is entered only when we
+ * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
+ * and we calculate the drain rate for the threshold here. dq_count is
+ * in bytes, time difference in psched_time, hence rate is in
+ * bytes/psched_time.
+ */
+ if (q->vars.dq_count != DQCOUNT_INVALID) {
+ q->vars.dq_count += skb->len;
+
+ if (q->vars.dq_count >= QUEUE_THRESHOLD) {
+ psched_time_t now = psched_get_time();
+ u32 dtime = now - q->vars.dq_tstamp;
+ u32 count = q->vars.dq_count << PIE_SCALE;
+
+ if (dtime == 0)
+ return;
+
+ count = count / dtime;
+
+ if (q->vars.avg_dq_rate == 0)
+ q->vars.avg_dq_rate = count;
+ else
+ q->vars.avg_dq_rate =
+ (q->vars.avg_dq_rate -
+ (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+
+ /* If the queue has receded below the threshold, we hold
+ * on to the last drain rate calculated, else we reset
+ * dq_count to 0 to re-enter the if block when the next
+ * packet is dequeued
+ */
+ if (qlen < QUEUE_THRESHOLD)
+ q->vars.dq_count = DQCOUNT_INVALID;
+ else {
+ q->vars.dq_count = 0;
+ q->vars.dq_tstamp = psched_get_time();
+ }
+
+ if (q->vars.burst_time > 0) {
+ if (q->vars.burst_time > dtime)
+ q->vars.burst_time -= dtime;
+ else
+ q->vars.burst_time = 0;
+ }
+ }
+ }
+}
+
+static void calculate_probability(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ u32 qlen = sch->qstats.backlog; /* queue size in bytes */
+ psched_time_t qdelay = 0; /* in pschedtime */
+ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ s32 delta = 0; /* determines the change in probability */
+ u32 oldprob;
+ u32 alpha, beta;
+ bool update_prob = true;
+
+ q->vars.qdelay_old = q->vars.qdelay;
+
+ if (q->vars.avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+ else
+ qdelay = 0;
+
+ /* If qdelay is zero and qlen is not, it means qlen is very small, less
+ * than dequeue_rate, so we do not update probabilty in this round
+ */
+ if (qdelay == 0 && qlen != 0)
+ update_prob = false;
+
+ /* In the algorithm, alpha and beta are between 0 and 2 with typical
+ * value for alpha as 0.125. In this implementation, we use values 0-32
+ * passed from user space to represent this. Also, alpha and beta have
+ * unit of HZ and need to be scaled before they can used to update
+ * probability. alpha/beta are updated locally below by 1) scaling them
+ * appropriately 2) scaling down by 16 to come to 0-2 range.
+ * Please see paper for details.
+ *
+ * We scale alpha and beta differently depending on whether we are in
+ * light, medium or high dropping mode.
+ */
+ if (q->vars.prob < MAX_PROB / 100) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
+ } else if (q->vars.prob < MAX_PROB / 10) {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
+ } else {
+ alpha =
+ (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta =
+ (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ }
+
+ /* alpha and beta should be between 0 and 32, in multiples of 1/16 */
+ delta += alpha * ((qdelay - q->params.target));
+ delta += beta * ((qdelay - qdelay_old));
+
+ oldprob = q->vars.prob;
+
+ /* to ensure we increase probability in steps of no more than 2% */
+ if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+ q->vars.prob >= MAX_PROB / 10)
+ delta = (MAX_PROB / 100) * 2;
+
+ /* Non-linear drop:
+ * Tune drop probability to increase quickly for high delays(>= 250ms)
+ * 250ms is derived through experiments and provides error protection
+ */
+
+ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
+ delta += MAX_PROB / (100 / 2);
+
+ q->vars.prob += delta;
+
+ if (delta > 0) {
+ /* prevent overflow */
+ if (q->vars.prob < oldprob) {
+ q->vars.prob = MAX_PROB;
+ /* Prevent normalization error. If probability is at
+ * maximum value already, we normalize it here, and
+ * skip the check to do a non-linear drop in the next
+ * section.
+ */
+ update_prob = false;
+ }
+ } else {
+ /* prevent underflow */
+ if (q->vars.prob > oldprob)
+ q->vars.prob = 0;
+ }
+
+ /* Non-linear drop in probability: Reduce drop probability quickly if
+ * delay is 0 for 2 consecutive Tupdate periods.
+ */
+
+ if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
+ q->vars.prob = (q->vars.prob * 98) / 100;
+
+ q->vars.qdelay = qdelay;
+ q->vars.qlen_old = qlen;
+
+ /* We restart the measurement cycle if the following conditions are met
+ * 1. If the delay has been low for 2 consecutive Tupdate periods
+ * 2. Calculated drop probability is zero
+ * 3. We have atleast one estimate for the avg_dq_rate ie.,
+ * is a non-zero value
+ */
+ if ((q->vars.qdelay < q->params.target / 2) &&
+ (q->vars.qdelay_old < q->params.target / 2) &&
+ (q->vars.prob == 0) &&
+ (q->vars.avg_dq_rate > 0))
+ pie_vars_init(&q->vars);
+}
+
+static void pie_timer(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct pie_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ calculate_probability(sch);
+
+ /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
+ if (q->params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
+ spin_unlock(root_lock);
+
+}
+
+static int pie_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+
+ pie_params_init(&q->params);
+ pie_vars_init(&q->vars);
+ sch->limit = q->params.limit;
+
+ setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
+
+ if (opt) {
+ int err = pie_change(sch, opt);
+
+ if (err)
+ return err;
+ }
+
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+ return 0;
+}
+
+static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_PIE_TARGET,
+ ((u32) PSCHED_TICKS2NS(q->params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+ nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
+ nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
+ nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -1;
+
+}
+
+static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ struct tc_pie_xstats st = {
+ .prob = q->vars.prob,
+ .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+ NSEC_PER_USEC,
+ /* unscale and return dq_rate in bytes per sec */
+ .avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .maxq = q->stats.maxq,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ skb = __qdisc_dequeue_head(sch, &sch->q);
+
+ if (!skb)
+ return NULL;
+
+ pie_process_dequeue(sch, skb);
+ return skb;
+}
+
+static void pie_reset(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ qdisc_reset_queue(sch);
+ pie_vars_init(&q->vars);
+}
+
+static void pie_destroy(struct Qdisc *sch)
+{
+ struct pie_sched_data *q = qdisc_priv(sch);
+ q->params.tupdate = 0;
+ del_timer_sync(&q->adapt_timer);
+}
+
+static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
+ .id = "pie",
+ .priv_size = sizeof(struct pie_sched_data),
+ .enqueue = pie_qdisc_enqueue,
+ .dequeue = pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = pie_init,
+ .destroy = pie_destroy,
+ .reset = pie_reset,
+ .change = pie_change,
+ .dump = pie_dump,
+ .dump_stats = pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init pie_module_init(void)
+{
+ return register_qdisc(&pie_qdisc_ops);
+}
+
+static void __exit pie_module_exit(void)
+{
+ unregister_qdisc(&pie_qdisc_ops);
+}
+
+module_init(pie_module_init);
+module_exit(pie_module_exit);
+
+MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
+MODULE_AUTHOR("Vijay Subramanian");
+MODULE_AUTHOR("Mythili Prabhu");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 000000000..89f8fcf73
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,233 @@
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * There are two ways to use this qdisc:
+ * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
+ * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
+ *
+ * 2. For network output buffering (a.k.a output commit) functionality.
+ * Output commit property is commonly used by applications using checkpoint
+ * based fault-tolerance to ensure that the checkpoint from which a system
+ * is being restored is consistent w.r.t outside world.
+ *
+ * Consider for e.g. Remus - a Virtual Machine checkpointing system,
+ * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
+ * asynchronously to the backup host, while the VM continues executing the
+ * next epoch speculatively.
+ *
+ * The following is a typical sequence of output buffer operations:
+ * 1.At epoch i, start_buffer(i)
+ * 2. At end of epoch i (i.e. after 50ms):
+ * 2.1 Stop VM and take checkpoint(i).
+ * 2.2 start_buffer(i+1) and Resume VM
+ * 3. While speculatively executing epoch(i+1), asynchronously replicate
+ * checkpoint(i) to backup host.
+ * 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
+ * Thus, this Qdisc would receive the following sequence of commands:
+ * TCQ_PLUG_BUFFER (epoch i)
+ * .. TCQ_PLUG_BUFFER (epoch i+1)
+ * ....TCQ_PLUG_RELEASE_ONE (epoch i)
+ * ......TCQ_PLUG_BUFFER (epoch i+2)
+ * ........
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/*
+ * State of the queue, when used for network output buffering:
+ *
+ * plug(i+1) plug(i) head
+ * ------------------+--------------------+---------------->
+ * | |
+ * | |
+ * pkts_current_epoch| pkts_last_epoch |pkts_to_release
+ * ----------------->|<--------+--------->|+--------------->
+ * v v
+ *
+ */
+
+struct plug_sched_data {
+ /* If true, the dequeue function releases all packets
+ * from head to end of the queue. The queue turns into
+ * a pass-through queue for newly arriving packets.
+ */
+ bool unplug_indefinite;
+
+ /* Queue Limit in bytes */
+ u32 limit;
+
+ /* Number of packets (output) from the current speculatively
+ * executing epoch.
+ */
+ u32 pkts_current_epoch;
+
+ /* Number of packets corresponding to the recently finished
+ * epoch. These will be released when we receive a
+ * TCQ_PLUG_RELEASE_ONE command. This command is typically
+ * issued after committing a checkpoint at the target.
+ */
+ u32 pkts_last_epoch;
+
+ /*
+ * Number of packets from the head of the queue, that can
+ * be released (committed checkpoint).
+ */
+ u32 pkts_to_release;
+};
+
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
+ if (!q->unplug_indefinite)
+ q->pkts_current_epoch++;
+ return qdisc_enqueue_tail(skb, sch);
+ }
+
+ return qdisc_reshape_fail(skb, sch);
+}
+
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ if (qdisc_is_throttled(sch))
+ return NULL;
+
+ if (!q->unplug_indefinite) {
+ if (!q->pkts_to_release) {
+ /* No more packets to dequeue. Block the queue
+ * and wait for the next release command.
+ */
+ qdisc_throttled(sch);
+ return NULL;
+ }
+ q->pkts_to_release--;
+ }
+
+ return qdisc_dequeue_head(sch);
+}
+
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+
+ q->pkts_current_epoch = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_to_release = 0;
+ q->unplug_indefinite = false;
+
+ if (opt == NULL) {
+ /* We will set a default limit of 100 pkts (~150kB)
+ * in case tx_queue_len is not available. The
+ * default value is completely arbitrary.
+ */
+ u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
+ q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
+ } else {
+ struct tc_plug_qopt *ctl = nla_data(opt);
+
+ if (nla_len(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ q->limit = ctl->limit;
+ }
+
+ qdisc_throttled(sch);
+ return 0;
+}
+
+/* Receives 4 types of messages:
+ * TCQ_PLUG_BUFFER: Inset a plug into the queue and
+ * buffer any incoming packets
+ * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
+ * to beginning of the next plug.
+ * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
+ * Stop buffering packets until the next TCQ_PLUG_BUFFER
+ * command is received (just act as a pass-thru queue).
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct plug_sched_data *q = qdisc_priv(sch);
+ struct tc_plug_qopt *msg;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ msg = nla_data(opt);
+ if (nla_len(opt) < sizeof(*msg))
+ return -EINVAL;
+
+ switch (msg->action) {
+ case TCQ_PLUG_BUFFER:
+ /* Save size of the current buffer */
+ q->pkts_last_epoch = q->pkts_current_epoch;
+ q->pkts_current_epoch = 0;
+ if (q->unplug_indefinite)
+ qdisc_throttled(sch);
+ q->unplug_indefinite = false;
+ break;
+ case TCQ_PLUG_RELEASE_ONE:
+ /* Add packets from the last complete buffer to the
+ * packets to be released set.
+ */
+ q->pkts_to_release += q->pkts_last_epoch;
+ q->pkts_last_epoch = 0;
+ qdisc_unthrottled(sch);
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_RELEASE_INDEFINITE:
+ q->unplug_indefinite = true;
+ q->pkts_to_release = 0;
+ q->pkts_last_epoch = 0;
+ q->pkts_current_epoch = 0;
+ qdisc_unthrottled(sch);
+ netif_schedule_queue(sch->dev_queue);
+ break;
+ case TCQ_PLUG_LIMIT:
+ /* Limit is supplied in bytes */
+ q->limit = msg->limit;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+ .id = "plug",
+ .priv_size = sizeof(struct plug_sched_data),
+ .enqueue = plug_enqueue,
+ .dequeue = plug_dequeue,
+ .peek = qdisc_peek_head,
+ .init = plug_init,
+ .change = plug_change,
+ .owner = THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+ return register_qdisc(&plug_qdisc_ops);
+}
+
+static void __exit plug_module_exit(void)
+{
+ unregister_qdisc(&plug_qdisc_ops);
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 000000000..8e5cd34aa
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,408 @@
+/*
+ * net/sched/sch_prio.c Simple 3-band priority "scheduler".
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
+ * Init -- EINVAL when opt undefined
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct prio_sched_data {
+ int bands;
+ struct tcf_proto __rcu *filter_list;
+ u8 prio2band[TC_PRIO_MAX+1];
+ struct Qdisc *queues[TCQ_PRIO_BANDS];
+};
+
+
+static struct Qdisc *
+prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tc_classify(skb, fl, &res);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return q->queues[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->bands)
+ return q->queues[q->prio2band[0]];
+
+ return q->queues[band];
+}
+
+static int
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = prio_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+ }
+#endif
+
+ ret = qdisc_enqueue(skb, qdisc);
+ if (ret == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+}
+
+static struct sk_buff *prio_peek(struct Qdisc *sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc->ops->peek(qdisc);
+ if (skb)
+ return skb;
+ }
+ return NULL;
+}
+
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static unsigned int prio_drop(struct Qdisc *sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+ unsigned int len;
+ struct Qdisc *qdisc;
+
+ for (prio = q->bands-1; prio >= 0; prio--) {
+ qdisc = q->queues[prio];
+ if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+ sch->q.qlen--;
+ return len;
+ }
+ }
+ return 0;
+}
+
+
+static void
+prio_reset(struct Qdisc *sch)
+{
+ int prio;
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ for (prio = 0; prio < q->bands; prio++)
+ qdisc_reset(q->queues[prio]);
+ sch->q.qlen = 0;
+}
+
+static void
+prio_destroy(struct Qdisc *sch)
+{
+ int prio;
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ for (prio = 0; prio < q->bands; prio++)
+ qdisc_destroy(q->queues[prio]);
+}
+
+static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct tc_prio_qopt *qopt;
+ int i;
+
+ if (nla_len(opt) < sizeof(*qopt))
+ return -EINVAL;
+ qopt = nla_data(opt);
+
+ if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+ return -EINVAL;
+
+ for (i = 0; i <= TC_PRIO_MAX; i++) {
+ if (qopt->priomap[i] >= qopt->bands)
+ return -EINVAL;
+ }
+
+ sch_tree_lock(sch);
+ q->bands = qopt->bands;
+ memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+
+ for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
+ struct Qdisc *child = q->queues[i];
+ q->queues[i] = &noop_qdisc;
+ if (child != &noop_qdisc) {
+ qdisc_tree_decrease_qlen(child, child->q.qlen);
+ qdisc_destroy(child);
+ }
+ }
+ sch_tree_unlock(sch);
+
+ for (i = 0; i < q->bands; i++) {
+ if (q->queues[i] == &noop_qdisc) {
+ struct Qdisc *child, *old;
+
+ child = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1));
+ if (child) {
+ sch_tree_lock(sch);
+ old = q->queues[i];
+ q->queues[i] = child;
+
+ if (old != &noop_qdisc) {
+ qdisc_tree_decrease_qlen(old,
+ old->q.qlen);
+ qdisc_destroy(old);
+ }
+ sch_tree_unlock(sch);
+ }
+ }
+ }
+ return 0;
+}
+
+static int prio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ for (i = 0; i < TCQ_PRIO_BANDS; i++)
+ q->queues[i] = &noop_qdisc;
+
+ if (opt == NULL) {
+ return -EINVAL;
+ } else {
+ int err;
+
+ if ((err = prio_tune(sch, opt)) != 0)
+ return err;
+ }
+ return 0;
+}
+
+static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_prio_qopt opt;
+
+ opt.bands = q->bands;
+ memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->queues[band];
+ q->queues[band] = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *
+prio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ return q->queues[band];
+}
+
+static unsigned long prio_get(struct Qdisc *sch, u32 classid)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ unsigned long band = TC_H_MIN(classid);
+
+ if (band - 1 >= q->bands)
+ return 0;
+ return band;
+}
+
+static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+ return prio_get(sch, classid);
+}
+
+
+static void prio_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
+ struct tcmsg *tcm)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = q->queues[cl-1]->handle;
+ return 0;
+}
+
+static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *cl_q;
+
+ cl_q = q->queues[cl - 1];
+ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ if (arg->stop)
+ return;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, prio + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops prio_class_ops = {
+ .graft = prio_graft,
+ .leaf = prio_leaf,
+ .get = prio_get,
+ .put = prio_put,
+ .walk = prio_walk,
+ .tcf_chain = prio_find_tcf,
+ .bind_tcf = prio_bind,
+ .unbind_tcf = prio_put,
+ .dump = prio_dump_class,
+ .dump_stats = prio_dump_class_stats,
+};
+
+static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &prio_class_ops,
+ .id = "prio",
+ .priv_size = sizeof(struct prio_sched_data),
+ .enqueue = prio_enqueue,
+ .dequeue = prio_dequeue,
+ .peek = prio_peek,
+ .drop = prio_drop,
+ .init = prio_init,
+ .reset = prio_reset,
+ .destroy = prio_destroy,
+ .change = prio_tune,
+ .dump = prio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init prio_module_init(void)
+{
+ return register_qdisc(&prio_qdisc_ops);
+}
+
+static void __exit prio_module_exit(void)
+{
+ unregister_qdisc(&prio_qdisc_ops);
+}
+
+module_init(prio_module_init)
+module_exit(prio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
new file mode 100644
index 000000000..3ec7e88a4
--- /dev/null
+++ b/net/sched/sch_qfq.c
@@ -0,0 +1,1587 @@
+/*
+ * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
+ *
+ * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ * Copyright (c) 2012 Paolo Valente.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+
+/* Quick Fair Queueing Plus
+ ========================
+
+ Sources:
+
+ [1] Paolo Valente,
+ "Reducing the Execution Time of Fair-Queueing Schedulers."
+ http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
+
+ Sources for QFQ:
+
+ [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+ Packet Scheduling with Tight Bandwidth Distribution Guarantees."
+
+ See also:
+ http://retis.sssup.it/~fabio/linux/qfq/
+ */
+
+/*
+
+ QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
+ classes. Each aggregate is timestamped with a virtual start time S
+ and a virtual finish time F, and scheduled according to its
+ timestamps. S and F are computed as a function of a system virtual
+ time function V. The classes within each aggregate are instead
+ scheduled with DRR.
+
+ To speed up operations, QFQ+ divides also aggregates into a limited
+ number of groups. Which group a class belongs to depends on the
+ ratio between the maximum packet length for the class and the weight
+ of the class. Groups have their own S and F. In the end, QFQ+
+ schedules groups, then aggregates within groups, then classes within
+ aggregates. See [1] and [2] for a full description.
+
+ Virtual time computations.
+
+ S, F and V are all computed in fixed point arithmetic with
+ FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+ The max group index corresponds to Lmax/w_min, where
+ Lmax=1<<MTU_SHIFT, w_min = 1 .
+ From this, and knowing how many groups (MAX_INDEX) we want,
+ we can derive the shift corresponding to each group.
+
+ Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+ instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+ so we can do F = S + len * inv_w * wsum.
+ We use W_TOT in the formulas so we can easily move between
+ static and adaptive weight sum.
+
+ The per-scheduler-instance data contain all the data structures
+ for the scheduler: bitmaps and bucket lists.
+
+ */
+
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group.
+ */
+#define QFQ_MAX_SLOTS 32
+
+/*
+ * Shifts used for aggregate<->group mapping. We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
+ * group with the smallest index that can support the L_i / r_i configured
+ * for the classes in the aggregate.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ */
+#define QFQ_MAX_INDEX 24
+#define QFQ_MAX_WSHIFT 10
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
+#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
+#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */
+
+#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
+
+/*
+ * Possible group states. These values are used as indexes for the bitmaps
+ * array of struct qfq_queue.
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+
+struct qfq_aggregate;
+
+struct qfq_class {
+ struct Qdisc_class_common common;
+
+ unsigned int refcnt;
+ unsigned int filter_cnt;
+
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+ struct gnet_stats_rate_est64 rate_est;
+ struct Qdisc *qdisc;
+ struct list_head alist; /* Link for active-classes list. */
+ struct qfq_aggregate *agg; /* Parent aggregate. */
+ int deficit; /* DRR deficit counter. */
+};
+
+struct qfq_aggregate {
+ struct hlist_node next; /* Link for the slot list. */
+ u64 S, F; /* flow timestamps (exact) */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ u32 class_weight; /* Weight of each class in this aggregate. */
+ /* Max pkt size for the classes in this aggregate, DRR quantum. */
+ int lmax;
+
+ u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
+ u32 budgetmax; /* Max budget for this aggregate. */
+ u32 initial_budget, budget; /* Initial and current budget. */
+
+ int num_classes; /* Number of classes in this aggr. */
+ struct list_head active; /* DRR queue of active classes. */
+
+ struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
+};
+
+struct qfq_group {
+ u64 S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ unsigned long full_slots; /* non-empty slots */
+
+ /* Array of RR lists of active aggregates. */
+ struct hlist_head slots[QFQ_MAX_SLOTS];
+};
+
+struct qfq_sched {
+ struct tcf_proto __rcu *filter_list;
+ struct Qdisc_class_hash clhash;
+
+ u64 oldV, V; /* Precise virtual times. */
+ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */
+ u32 num_active_agg; /* Num. of active aggregates */
+ u32 wsum; /* weight sum */
+ u32 iwsum; /* inverse weight sum */
+
+ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+ u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
+
+ u32 max_agg_classes; /* Max number of classes per aggr. */
+ struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
+};
+
+/*
+ * Possible reasons why the timestamps of an aggregate are updated
+ * enqueue: the aggregate switches from idle to active and must scheduled
+ * for service
+ * requeue: the aggregate finishes its budget, so it stops being served and
+ * must be rescheduled for service
+ */
+enum update_reason {enqueue, requeue};
+
+static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct Qdisc_class_common *clc;
+
+ clc = qdisc_class_find(&q->clhash, classid);
+ if (clc == NULL)
+ return NULL;
+ return container_of(clc, struct qfq_class, common);
+}
+
+static void qfq_purge_queue(struct qfq_class *cl)
+{
+ unsigned int len = cl->qdisc->q.qlen;
+
+ qdisc_reset(cl->qdisc);
+ qdisc_tree_decrease_qlen(cl->qdisc, len);
+}
+
+static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
+ [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
+ [TCA_QFQ_LMAX] = { .type = NLA_U32 },
+};
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
+{
+ u64 slot_size = (u64)maxlen * inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = slot_size >> min_slot_shift;
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; /* basically a log_2 */
+ index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
+
+ if (index < 0)
+ index = 0;
+out:
+ pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
+ (unsigned long) ONE_FP/inv_w, maxlen, index);
+
+ return index;
+}
+
+static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *);
+static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *,
+ enum update_reason);
+
+static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ u32 lmax, u32 weight)
+{
+ INIT_LIST_HEAD(&agg->active);
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ agg->lmax = lmax;
+ agg->class_weight = weight;
+}
+
+static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q,
+ u32 lmax, u32 weight)
+{
+ struct qfq_aggregate *agg;
+
+ hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
+ if (agg->lmax == lmax && agg->class_weight == weight)
+ return agg;
+
+ return NULL;
+}
+
+
+/* Update aggregate as a function of the new number of classes. */
+static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ int new_num_classes)
+{
+ u32 new_agg_weight;
+
+ if (new_num_classes == q->max_agg_classes)
+ hlist_del_init(&agg->nonfull_next);
+
+ if (agg->num_classes > new_num_classes &&
+ new_num_classes == q->max_agg_classes - 1) /* agg no more full */
+ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
+
+ /* The next assignment may let
+ * agg->initial_budget > agg->budgetmax
+ * hold, we will take it into account in charge_actual_service().
+ */
+ agg->budgetmax = new_num_classes * agg->lmax;
+ new_agg_weight = agg->class_weight * new_num_classes;
+ agg->inv_w = ONE_FP/new_agg_weight;
+
+ if (agg->grp == NULL) {
+ int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
+ q->min_slot_shift);
+ agg->grp = &q->groups[i];
+ }
+
+ q->wsum +=
+ (int) agg->class_weight * (new_num_classes - agg->num_classes);
+ q->iwsum = ONE_FP / q->wsum;
+
+ agg->num_classes = new_num_classes;
+}
+
+/* Add class to aggregate. */
+static void qfq_add_to_agg(struct qfq_sched *q,
+ struct qfq_aggregate *agg,
+ struct qfq_class *cl)
+{
+ cl->agg = agg;
+
+ qfq_update_agg(q, agg, agg->num_classes+1);
+ if (cl->qdisc->q.qlen > 0) { /* adding an active class */
+ list_add_tail(&cl->alist, &agg->active);
+ if (list_first_entry(&agg->active, struct qfq_class, alist) ==
+ cl && q->in_serv_agg != agg) /* agg was inactive */
+ qfq_activate_agg(q, agg, enqueue); /* schedule agg */
+ }
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *);
+
+static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ if (!hlist_unhashed(&agg->nonfull_next))
+ hlist_del_init(&agg->nonfull_next);
+ q->wsum -= agg->class_weight;
+ if (q->wsum != 0)
+ q->iwsum = ONE_FP / q->wsum;
+
+ if (q->in_serv_agg == agg)
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ kfree(agg);
+}
+
+/* Deschedule class from within its parent aggregate. */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+
+ list_del(&cl->alist); /* remove from RR queue of the aggregate */
+ if (list_empty(&agg->active)) /* agg is now inactive */
+ qfq_deactivate_agg(q, agg);
+}
+
+/* Remove class from its parent aggregate. */
+static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ struct qfq_aggregate *agg = cl->agg;
+
+ cl->agg = NULL;
+ if (agg->num_classes == 1) { /* agg being emptied, destroy it */
+ qfq_destroy_agg(q, agg);
+ return;
+ }
+ qfq_update_agg(q, agg, agg->num_classes-1);
+}
+
+/* Deschedule class and remove it from its parent aggregate. */
+static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl)
+{
+ if (cl->qdisc->q.qlen > 0) /* class is active */
+ qfq_deactivate_class(q, cl);
+
+ qfq_rm_from_agg(q, cl);
+}
+
+/* Move class to a new aggregate, matching the new class weight and/or lmax */
+static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight,
+ u32 lmax)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
+
+ if (new_agg == NULL) { /* create new aggregate */
+ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
+ if (new_agg == NULL)
+ return -ENOBUFS;
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ qfq_deact_rm_from_agg(q, cl);
+ qfq_add_to_agg(q, new_agg, cl);
+
+ return 0;
+}
+
+static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)*arg;
+ bool existing = false;
+ struct nlattr *tb[TCA_QFQ_MAX + 1];
+ struct qfq_aggregate *new_agg = NULL;
+ u32 weight, lmax, inv_w;
+ int err;
+ int delta_w;
+
+ if (tca[TCA_OPTIONS] == NULL) {
+ pr_notice("qfq: no options\n");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_QFQ_WEIGHT]) {
+ weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
+ if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
+ pr_notice("qfq: invalid weight %u\n", weight);
+ return -EINVAL;
+ }
+ } else
+ weight = 1;
+
+ if (tb[TCA_QFQ_LMAX]) {
+ lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
+ if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) {
+ pr_notice("qfq: invalid max length %u\n", lmax);
+ return -EINVAL;
+ }
+ } else
+ lmax = psched_mtu(qdisc_dev(sch));
+
+ inv_w = ONE_FP / weight;
+ weight = ONE_FP / inv_w;
+
+ if (cl != NULL &&
+ lmax == cl->agg->lmax &&
+ weight == cl->agg->class_weight)
+ return 0; /* nothing to change */
+
+ delta_w = weight - (cl ? cl->agg->class_weight : 0);
+
+ if (q->wsum + delta_w > QFQ_MAX_WSUM) {
+ pr_notice("qfq: total weight out of range (%d + %u)\n",
+ delta_w, q->wsum);
+ return -EINVAL;
+ }
+
+ if (cl != NULL) { /* modify existing class */
+ if (tca[TCA_RATE]) {
+ err = gen_replace_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err)
+ return err;
+ }
+ existing = true;
+ goto set_change_agg;
+ }
+
+ /* create and init new class */
+ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
+ if (cl == NULL)
+ return -ENOBUFS;
+
+ cl->refcnt = 1;
+ cl->common.classid = classid;
+ cl->deficit = lmax;
+
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, classid);
+ if (cl->qdisc == NULL)
+ cl->qdisc = &noop_qdisc;
+
+ if (tca[TCA_RATE]) {
+ err = gen_new_estimator(&cl->bstats, NULL,
+ &cl->rate_est,
+ qdisc_root_sleeping_lock(sch),
+ tca[TCA_RATE]);
+ if (err)
+ goto destroy_class;
+ }
+
+ sch_tree_lock(sch);
+ qdisc_class_hash_insert(&q->clhash, &cl->common);
+ sch_tree_unlock(sch);
+
+ qdisc_class_hash_grow(sch, &q->clhash);
+
+set_change_agg:
+ sch_tree_lock(sch);
+ new_agg = qfq_find_agg(q, lmax, weight);
+ if (new_agg == NULL) { /* create new aggregate */
+ sch_tree_unlock(sch);
+ new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
+ if (new_agg == NULL) {
+ err = -ENOBUFS;
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ goto destroy_class;
+ }
+ sch_tree_lock(sch);
+ qfq_init_agg(q, new_agg, lmax, weight);
+ }
+ if (existing)
+ qfq_deact_rm_from_agg(q, cl);
+ qfq_add_to_agg(q, new_agg, cl);
+ sch_tree_unlock(sch);
+
+ *arg = (unsigned long)cl;
+ return 0;
+
+destroy_class:
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+ return err;
+}
+
+static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ qfq_rm_from_agg(q, cl);
+ gen_kill_estimator(&cl->bstats, &cl->rate_est);
+ qdisc_destroy(cl->qdisc);
+ kfree(cl);
+}
+
+static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (cl->filter_cnt > 0)
+ return -EBUSY;
+
+ sch_tree_lock(sch);
+
+ qfq_purge_queue(cl);
+ qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
+{
+ struct qfq_class *cl = qfq_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->refcnt++;
+
+ return (unsigned long)cl;
+}
+
+static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (--cl->refcnt == 0)
+ qfq_destroy_class(sch, cl);
+}
+
+static struct tcf_proto __rcu **qfq_tcf_chain(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+
+ return &q->filter_list;
+}
+
+static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ struct qfq_class *cl = qfq_find_class(sch, classid);
+
+ if (cl != NULL)
+ cl->filter_cnt++;
+
+ return (unsigned long)cl;
+}
+
+static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ cl->filter_cnt--;
+}
+
+static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (new == NULL) {
+ new = qdisc_create_dflt(sch->dev_queue,
+ &pfifo_qdisc_ops, cl->common.classid);
+ if (new == NULL)
+ new = &noop_qdisc;
+ }
+
+ sch_tree_lock(sch);
+ qfq_purge_queue(cl);
+ *old = cl->qdisc;
+ cl->qdisc = new;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ return cl->qdisc;
+}
+
+static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = cl->common.classid;
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) ||
+ nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
+ goto nla_put_failure;
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct qfq_class *cl = (struct qfq_class *)arg;
+ struct tc_qfq_stats xstats;
+
+ memset(&xstats, 0, sizeof(xstats));
+
+ xstats.weight = cl->agg->class_weight;
+ xstats.lmax = cl->agg->lmax;
+
+ if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
+ gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+ gnet_stats_copy_queue(d, NULL,
+ &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
+ return -1;
+
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+ pr_debug("qfq_classify: found %d\n", skb->priority);
+ cl = qfq_find_class(sch, skb->priority);
+ if (cl != NULL)
+ return cl;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_QUEUED:
+ case TC_ACT_STOLEN:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ cl = (struct qfq_class *)res.class;
+ if (cl == NULL)
+ cl = qfq_find_class(sch, res.classid);
+ return cl;
+ }
+
+ return NULL;
+}
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(u64 a, u64 b)
+{
+ return (s64)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline u64 qfq_round_down(u64 ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = __ffs(bitmap);
+ return &q->groups[index];
+}
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
+ int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_F))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= q->min_slot_shift;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static void qfq_make_eligible(struct qfq_sched *q)
+{
+ unsigned long vslot = q->V >> q->min_slot_shift;
+ unsigned long old_vslot = q->oldV >> q->min_slot_shift;
+
+ if (vslot != old_vslot) {
+ unsigned long mask;
+ int last_flip_pos = fls(vslot ^ old_vslot);
+
+ if (last_flip_pos > 31) /* higher than the number of groups */
+ mask = ~0UL; /* make all groups eligible */
+ else
+ mask = (1UL << last_flip_pos) - 1;
+
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
+ *
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
+ *
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
+ *
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
+ *
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
+ *
+ * Instead of delaying the activation of the new aggregate, which is
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
+ */
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
+ u64 roundedS)
+{
+ u64 slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i; /* slot index in the bucket list */
+
+ if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
+ u64 deltaS = roundedS - grp->S -
+ ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
+ agg->S -= deltaS;
+ agg->F -= deltaS;
+ slot = QFQ_MAX_SLOTS - 2;
+ }
+
+ i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ hlist_add_head(&agg->next, &grp->slots[i]);
+ __set_bit(slot, &grp->full_slots);
+}
+
+/* Maybe introduce hlist_first_entry?? */
+static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp)
+{
+ return hlist_entry(grp->slots[grp->front].first,
+ struct qfq_aggregate, next);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static void qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_aggregate *agg = qfq_slot_head(grp);
+
+ BUG_ON(!agg);
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[grp->front]))
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first aggregate in the first non-empty bucket of the
+ * group. As a side effect, adjusts the bucket list so the first
+ * non-empty bucket is at position 0 in full_slots.
+ */
+static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp)
+{
+ unsigned int i;
+
+ pr_debug("qfq slot_scan: grp %u full %#lx\n",
+ grp->index, grp->full_slots);
+
+ if (grp->full_slots == 0)
+ return NULL;
+
+ i = __ffs(grp->full_slots); /* zero based */
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return qfq_slot_head(grp);
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+static void qfq_update_eligible(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ unsigned long ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q);
+ }
+}
+
+/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
+static void agg_dequeue(struct qfq_aggregate *agg,
+ struct qfq_class *cl, unsigned int len)
+{
+ qdisc_dequeue_peeked(cl->qdisc);
+
+ cl->deficit -= (int) len;
+
+ if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
+ list_del(&cl->alist);
+ else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+ cl->deficit += agg->lmax;
+ list_move_tail(&cl->alist, &agg->active);
+ }
+}
+
+static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg,
+ struct qfq_class **cl,
+ unsigned int *len)
+{
+ struct sk_buff *skb;
+
+ *cl = list_first_entry(&agg->active, struct qfq_class, alist);
+ skb = (*cl)->qdisc->ops->peek((*cl)->qdisc);
+ if (skb == NULL)
+ WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
+ else
+ *len = qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+/* Update F according to the actual service received by the aggregate. */
+static inline void charge_actual_service(struct qfq_aggregate *agg)
+{
+ /* Compute the service received by the aggregate, taking into
+ * account that, after decreasing the number of classes in
+ * agg, it may happen that
+ * agg->initial_budget - agg->budget > agg->bugdetmax
+ */
+ u32 service_received = min(agg->budgetmax,
+ agg->initial_budget - agg->budget);
+
+ agg->F = agg->S + (u64)service_received * agg->inv_w;
+}
+
+/* Assign a reasonable start time for a new aggregate in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in EB (see [2]). So, if we have groups in ER,
+ * set S to the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ unsigned long mask;
+ u64 limit, roundedF;
+ int slot_shift = agg->grp->slot_shift;
+
+ roundedF = qfq_round_down(agg->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+ if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], agg->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ if (qfq_gt(limit, next->F))
+ agg->S = next->F;
+ else /* preserve timestamp correctness */
+ agg->S = limit;
+ return;
+ }
+ }
+ agg->S = q->V;
+ } else /* timestamp is not stale */
+ agg->S = agg->F;
+}
+
+/* Update the timestamps of agg before scheduling/rescheduling it for
+ * service. In particular, assign to agg->F its maximum possible
+ * value, i.e., the virtual finish time with which the aggregate
+ * should be labeled if it used all its budget once in service.
+ */
+static inline void
+qfq_update_agg_ts(struct qfq_sched *q,
+ struct qfq_aggregate *agg, enum update_reason reason)
+{
+ if (reason != requeue)
+ qfq_update_start(q, agg);
+ else /* just charge agg for the service received */
+ agg->S = agg->F;
+
+ agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
+}
+
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
+
+static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
+ struct qfq_class *cl;
+ struct sk_buff *skb = NULL;
+ /* next-packet len, 0 means no more active classes in in-service agg */
+ unsigned int len = 0;
+
+ if (in_serv_agg == NULL)
+ return NULL;
+
+ if (!list_empty(&in_serv_agg->active))
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+
+ /*
+ * If there are no active classes in the in-service aggregate,
+ * or if the aggregate has not enough budget to serve its next
+ * class, then choose the next aggregate to serve.
+ */
+ if (len == 0 || in_serv_agg->budget < len) {
+ charge_actual_service(in_serv_agg);
+
+ /* recharge the budget of the aggregate */
+ in_serv_agg->initial_budget = in_serv_agg->budget =
+ in_serv_agg->budgetmax;
+
+ if (!list_empty(&in_serv_agg->active)) {
+ /*
+ * Still active: reschedule for
+ * service. Possible optimization: if no other
+ * aggregate is active, then there is no point
+ * in rescheduling this aggregate, and we can
+ * just keep it as the in-service one. This
+ * should be however a corner case, and to
+ * handle it, we would need to maintain an
+ * extra num_active_aggs field.
+ */
+ qfq_update_agg_ts(q, in_serv_agg, requeue);
+ qfq_schedule_agg(q, in_serv_agg);
+ } else if (sch->q.qlen == 0) { /* no aggregate to serve */
+ q->in_serv_agg = NULL;
+ return NULL;
+ }
+
+ /*
+ * If we get here, there are other aggregates queued:
+ * choose the new aggregate to serve.
+ */
+ in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
+ skb = qfq_peek_skb(in_serv_agg, &cl, &len);
+ }
+ if (!skb)
+ return NULL;
+
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+
+ agg_dequeue(in_serv_agg, cl, len);
+ /* If lmax is lowered, through qfq_change_class, for a class
+ * owning pending packets with larger size than the new value
+ * of lmax, then the following condition may hold.
+ */
+ if (unlikely(in_serv_agg->budget < len))
+ in_serv_agg->budget = 0;
+ else
+ in_serv_agg->budget -= len;
+
+ q->V += (u64)len * q->iwsum;
+ pr_debug("qfq dequeue: len %u F %lld now %lld\n",
+ len, (unsigned long long) in_serv_agg->F,
+ (unsigned long long) q->V);
+
+ return skb;
+}
+
+static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q)
+{
+ struct qfq_group *grp;
+ struct qfq_aggregate *agg, *new_front_agg;
+ u64 old_F;
+
+ qfq_update_eligible(q);
+ q->oldV = q->V;
+
+ if (!q->bitmaps[ER])
+ return NULL;
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+ old_F = grp->F;
+
+ agg = qfq_slot_head(grp);
+
+ /* agg starts to be served, remove it from schedule */
+ qfq_front_slot_remove(grp);
+
+ new_front_agg = qfq_slot_scan(grp);
+
+ if (new_front_agg == NULL) /* group is now inactive, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ else {
+ u64 roundedS = qfq_round_down(new_front_agg->S,
+ grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ return agg;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+
+ qfq_unblock_groups(q, grp->index, old_F);
+
+ return agg;
+}
+
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct qfq_aggregate *agg;
+ int err = 0;
+
+ cl = qfq_classify(skb, sch, &err);
+ if (cl == NULL) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return err;
+ }
+ pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
+
+ if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
+ pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
+ cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
+ err = qfq_change_agg(sch, cl, cl->agg->class_weight,
+ qdisc_pkt_len(skb));
+ if (err)
+ return err;
+ }
+
+ err = qdisc_enqueue(skb, cl->qdisc);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ pr_debug("qfq_enqueue: enqueue failed %d\n", err);
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ bstats_update(&cl->bstats, skb);
+ ++sch->q.qlen;
+
+ agg = cl->agg;
+ /* if the queue was not empty, then done here */
+ if (cl->qdisc->q.qlen != 1) {
+ if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
+ list_first_entry(&agg->active, struct qfq_class, alist)
+ == cl && cl->deficit < qdisc_pkt_len(skb))
+ list_move_tail(&cl->alist, &agg->active);
+
+ return err;
+ }
+
+ /* schedule class for service within the aggregate */
+ cl->deficit = agg->lmax;
+ list_add_tail(&cl->alist, &agg->active);
+
+ if (list_first_entry(&agg->active, struct qfq_class, alist) != cl ||
+ q->in_serv_agg == agg)
+ return err; /* non-empty or in service, nothing else to do */
+
+ qfq_activate_agg(q, agg, enqueue);
+
+ return err;
+}
+
+/*
+ * Schedule aggregate according to its timestamps.
+ */
+static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ u64 roundedS;
+ int s;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+
+ /*
+ * Insert agg in the correct bucket.
+ * If agg->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, agg->S))
+ goto skip_update;
+
+ /* create a slot for this agg->S */
+ qfq_slot_rotate(grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
+ q->in_serv_agg == NULL)
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+
+ pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
+ s, q->bitmaps[s],
+ (unsigned long long) agg->S,
+ (unsigned long long) agg->F,
+ (unsigned long long) q->V);
+
+skip_update:
+ qfq_slot_insert(grp, agg, roundedS);
+}
+
+
+/* Update agg ts and schedule agg for service */
+static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
+ enum update_reason reason)
+{
+ agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
+
+ qfq_update_agg_ts(q, agg, reason);
+ if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
+ q->in_serv_agg = agg; /* start serving this aggregate */
+ /* update V: to be in service, agg must be eligible */
+ q->oldV = q->V = agg->S;
+ } else if (agg != q->in_serv_agg)
+ qfq_schedule_agg(q, agg);
+}
+
+static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_aggregate *agg)
+{
+ unsigned int i, offset;
+ u64 roundedS;
+
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+ hlist_del(&agg->next);
+ if (hlist_empty(&grp->slots[i]))
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * Called to forcibly deschedule an aggregate. If the aggregate is
+ * not in the front bucket, or if the latter has other aggregates in
+ * the front bucket, we can simply remove the aggregate with no other
+ * side effects.
+ * Otherwise we must propagate the event up.
+ */
+static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
+{
+ struct qfq_group *grp = agg->grp;
+ unsigned long mask;
+ u64 roundedS;
+ int s;
+
+ if (agg == q->in_serv_agg) {
+ charge_actual_service(agg);
+ q->in_serv_agg = qfq_choose_next_agg(q);
+ return;
+ }
+
+ agg->F = agg->S;
+ qfq_slot_remove(q, grp, agg);
+
+ if (!grp->full_slots) {
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (hlist_empty(&grp->slots[grp->front])) {
+ agg = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(agg->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+}
+
+static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl = (struct qfq_class *)arg;
+
+ if (cl->qdisc->q.qlen == 0)
+ qfq_deactivate_class(q, cl);
+}
+
+static unsigned int qfq_drop_from_slot(struct qfq_sched *q,
+ struct hlist_head *slot)
+{
+ struct qfq_aggregate *agg;
+ struct qfq_class *cl;
+ unsigned int len;
+
+ hlist_for_each_entry(agg, slot, next) {
+ list_for_each_entry(cl, &agg->active, alist) {
+
+ if (!cl->qdisc->ops->drop)
+ continue;
+
+ len = cl->qdisc->ops->drop(cl->qdisc);
+ if (len > 0) {
+ if (cl->qdisc->q.qlen == 0)
+ qfq_deactivate_class(q, cl);
+
+ return len;
+ }
+ }
+ }
+ return 0;
+}
+
+static unsigned int qfq_drop(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_group *grp;
+ unsigned int i, j, len;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+ len = qfq_drop_from_slot(q, &grp->slots[j]);
+ if (len > 0) {
+ sch->q.qlen--;
+ return len;
+ }
+ }
+
+ }
+
+ return 0;
+}
+
+static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_group *grp;
+ int i, j, err;
+ u32 max_cl_shift, maxbudg_shift, max_classes;
+
+ err = qdisc_class_hash_init(&q->clhash);
+ if (err < 0)
+ return err;
+
+ if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
+ max_classes = QFQ_MAX_AGG_CLASSES;
+ else
+ max_classes = qdisc_dev(sch)->tx_queue_len + 1;
+ /* max_cl_shift = floor(log_2(max_classes)) */
+ max_cl_shift = __fls(max_classes);
+ q->max_agg_classes = 1<<max_cl_shift;
+
+ /* maxbudg_shift = log2(max_len * max_classes_per_agg) */
+ maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
+ q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = q->min_slot_shift + i;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++)
+ INIT_HLIST_HEAD(&grp->slots[j]);
+ }
+
+ INIT_HLIST_HEAD(&q->nonfull_aggs);
+
+ return 0;
+}
+
+static void qfq_reset_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ unsigned int i;
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ if (cl->qdisc->q.qlen > 0)
+ qfq_deactivate_class(q, cl);
+
+ qdisc_reset(cl->qdisc);
+ }
+ }
+ sch->q.qlen = 0;
+}
+
+static void qfq_destroy_qdisc(struct Qdisc *sch)
+{
+ struct qfq_sched *q = qdisc_priv(sch);
+ struct qfq_class *cl;
+ struct hlist_node *next;
+ unsigned int i;
+
+ tcf_destroy_chain(&q->filter_list);
+
+ for (i = 0; i < q->clhash.hashsize; i++) {
+ hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
+ common.hnode) {
+ qfq_destroy_class(sch, cl);
+ }
+ }
+ qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops qfq_class_ops = {
+ .change = qfq_change_class,
+ .delete = qfq_delete_class,
+ .get = qfq_get_class,
+ .put = qfq_put_class,
+ .tcf_chain = qfq_tcf_chain,
+ .bind_tcf = qfq_bind_tcf,
+ .unbind_tcf = qfq_unbind_tcf,
+ .graft = qfq_graft_class,
+ .leaf = qfq_class_leaf,
+ .qlen_notify = qfq_qlen_notify,
+ .dump = qfq_dump_class,
+ .dump_stats = qfq_dump_class_stats,
+ .walk = qfq_walk,
+};
+
+static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
+ .cl_ops = &qfq_class_ops,
+ .id = "qfq",
+ .priv_size = sizeof(struct qfq_sched),
+ .enqueue = qfq_enqueue,
+ .dequeue = qfq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = qfq_drop,
+ .init = qfq_init_qdisc,
+ .reset = qfq_reset_qdisc,
+ .destroy = qfq_destroy_qdisc,
+ .owner = THIS_MODULE,
+};
+
+static int __init qfq_init(void)
+{
+ return register_qdisc(&qfq_qdisc_ops);
+}
+
+static void __exit qfq_exit(void)
+{
+ unregister_qdisc(&qfq_qdisc_ops);
+}
+
+module_init(qfq_init);
+module_exit(qfq_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 000000000..6c0534cc7
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,391 @@
+/*
+ * net/sched/sch_red.c Random Early Detection queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * J Hadi Salim 980914: computation fixes
+ * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
+ * J Hadi Salim 980816: ECN support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+
+/* Parameters, settable by user:
+ -----------------------------
+
+ limit - bytes (must be > qth_max + burst)
+
+ Hard limit on queue length, should be chosen >qth_max
+ to allow packet bursts. This parameter does not
+ affect the algorithms behaviour and can be chosen
+ arbitrarily high (well, less than ram size)
+ Really, this limit will never be reached
+ if RED works correctly.
+ */
+
+struct red_sched_data {
+ u32 limit; /* HARD maximal queue length */
+ unsigned char flags;
+ struct timer_list adapt_timer;
+ struct red_parms parms;
+ struct red_vars vars;
+ struct red_stats stats;
+ struct Qdisc *qdisc;
+};
+
+static inline int red_use_ecn(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+static inline int red_use_harddrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ int ret;
+
+ q->vars.qavg = red_calc_qavg(&q->parms,
+ &q->vars,
+ child->qstats.backlog);
+
+ if (red_is_idling(&q->vars))
+ red_end_of_idle_period(&q->vars);
+
+ switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ break;
+ }
+
+ ret = qdisc_enqueue(skb, child);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ sch->q.qlen++;
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.pdrop++;
+ qdisc_qstats_drop(sch);
+ }
+ return ret;
+
+congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ skb = child->dequeue(child);
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ } else {
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+ }
+ return skb;
+}
+
+static struct sk_buff *red_peek(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+static unsigned int red_drop(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ unsigned int len;
+
+ if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
+ q->stats.other++;
+ qdisc_qstats_drop(sch);
+ sch->q.qlen--;
+ return len;
+ }
+
+ if (!red_is_idling(&q->vars))
+ red_start_of_idle_period(&q->vars);
+
+ return 0;
+}
+
+static void red_reset(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ red_restart(&q->vars);
+}
+
+static void red_destroy(struct Qdisc *sch)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ del_timer_sync(&q->adapt_timer);
+ qdisc_destroy(q->qdisc);
+}
+
+static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_RED_STAB] = { .len = RED_STAB_SIZE },
+ [TCA_RED_MAX_P] = { .type = NLA_U32 },
+};
+
+static int red_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ struct tc_red_qopt *ctl;
+ struct Qdisc *child = NULL;
+ int err;
+ u32 max_P;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_RED_PARMS] == NULL ||
+ tb[TCA_RED_STAB] == NULL)
+ return -EINVAL;
+
+ max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
+
+ ctl = nla_data(tb[TCA_RED_PARMS]);
+
+ if (ctl->limit > 0) {
+ child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+ }
+
+ sch_tree_lock(sch);
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+ if (child) {
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+ }
+
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_RED_STAB]),
+ max_P);
+ red_set_vars(&q->vars);
+
+ del_timer(&q->adapt_timer);
+ if (ctl->flags & TC_RED_ADAPTATIVE)
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+
+ if (!q->qdisc->q.qlen)
+ red_start_of_idle_period(&q->vars);
+
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static inline void red_adaptative_timer(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct red_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ red_adaptative_algo(&q->parms, &q->vars);
+ mod_timer(&q->adapt_timer, jiffies + HZ/2);
+ spin_unlock(root_lock);
+}
+
+static int red_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
+ return red_change(sch, opt);
+}
+
+static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct tc_red_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int red_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+ return 0;
+}
+
+static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct red_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long red_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void red_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops red_class_ops = {
+ .graft = red_graft,
+ .leaf = red_leaf,
+ .get = red_get,
+ .put = red_put,
+ .walk = red_walk,
+ .dump = red_dump_class,
+};
+
+static struct Qdisc_ops red_qdisc_ops __read_mostly = {
+ .id = "red",
+ .priv_size = sizeof(struct red_sched_data),
+ .cl_ops = &red_class_ops,
+ .enqueue = red_enqueue,
+ .dequeue = red_dequeue,
+ .peek = red_peek,
+ .drop = red_drop,
+ .init = red_init,
+ .reset = red_reset,
+ .destroy = red_destroy,
+ .change = red_change,
+ .dump = red_dump,
+ .dump_stats = red_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init red_module_init(void)
+{
+ return register_qdisc(&red_qdisc_ops);
+}
+
+static void __exit red_module_exit(void)
+{
+ unregister_qdisc(&red_qdisc_ops);
+}
+
+module_init(red_module_init)
+module_exit(red_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 000000000..5819dd826
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,728 @@
+/*
+ * net/sched/sch_sfb.c Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/flow_keys.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+ u16 qlen; /* length of virtual queue */
+ u16 p_mark; /* marking probability */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+ u32 perturbation; /* jhash perturbation */
+ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+
+struct sfb_sched_data {
+ struct Qdisc *qdisc;
+ struct tcf_proto __rcu *filter_list;
+ unsigned long rehash_interval;
+ unsigned long warmup_time; /* double buffering warmup time in jiffies */
+ u32 max;
+ u32 bin_size; /* maximum queue length per bin */
+ u32 increment; /* d1 */
+ u32 decrement; /* d2 */
+ u32 limit; /* HARD maximal queue length */
+ u32 penalty_rate;
+ u32 penalty_burst;
+ u32 tokens_avail;
+ unsigned long rehash_time;
+ unsigned long token_time;
+
+ u8 slot; /* current active bins (0 or 1) */
+ bool double_buffering;
+ struct sfb_bins bins[2];
+
+ struct {
+ u32 earlydrop;
+ u32 penaltydrop;
+ u32 bucketdrop;
+ u32 queuedrop;
+ u32 childdrop; /* drops in child qdisc */
+ u32 marked; /* ECN mark */
+ } stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+ u32 hashes[2];
+};
+
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
+ return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+ return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+ u32 res = p1 + p2;
+
+ return min_t(u32, res, SFB_MAX_PROB);
+}
+
+static u32 prob_minus(u32 p1, u32 p2)
+{
+ return p1 > p2 ? p1 - p2 : 0;
+}
+
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen < 0xFFFF)
+ b[hash].qlen++;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ increment_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+ struct sfb_sched_data *q)
+{
+ int i;
+ struct sfb_bucket *b = &q->bins[slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b[hash].qlen > 0)
+ b[hash].qlen--;
+ b += SFB_NUMBUCKETS; /* next level */
+ }
+}
+
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ u32 sfbhash;
+
+ sfbhash = sfb_hash(skb, 0);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 0, q);
+
+ sfbhash = sfb_hash(skb, 1);
+ if (sfbhash)
+ decrement_one_qlen(sfbhash, 1, q);
+}
+
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+ b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+ memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * compute max qlen, max p_mark, and avg p_mark
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+ int i;
+ u32 qlen = 0, prob = 0, totalpm = 0;
+ const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
+
+ for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
+ if (qlen < b->qlen)
+ qlen = b->qlen;
+ totalpm += b->p_mark;
+ if (prob < b->p_mark)
+ prob = b->p_mark;
+ b++;
+ }
+ *prob_r = prob;
+ *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+ return qlen;
+}
+
+
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+ q->bins[slot].perturbation = prandom_u32();
+}
+
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+ sfb_init_perturbation(q->slot, q);
+ q->slot ^= 1;
+ q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+ if (q->penalty_rate == 0 || q->penalty_burst == 0)
+ return true;
+
+ if (q->tokens_avail < 1) {
+ unsigned long age = min(10UL * HZ, jiffies - q->token_time);
+
+ q->tokens_avail = (age * q->penalty_rate) / HZ;
+ if (q->tokens_avail > q->penalty_burst)
+ q->tokens_avail = q->penalty_burst;
+ q->token_time = jiffies;
+ if (q->tokens_avail < 1)
+ return true;
+ }
+
+ q->tokens_avail--;
+ return false;
+}
+
+static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
+ int *qerr, u32 *salt)
+{
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, fl, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ *salt = TC_H_MIN(res.classid);
+ return true;
+ }
+ return false;
+}
+
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct tcf_proto *fl;
+ int i;
+ u32 p_min = ~0;
+ u32 minqlen = ~0;
+ u32 r, slot, salt, sfbhash;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ struct flow_keys keys;
+
+ if (unlikely(sch->q.qlen >= q->limit)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.queuedrop++;
+ goto drop;
+ }
+
+ if (q->rehash_interval > 0) {
+ unsigned long limit = q->rehash_time + q->rehash_interval;
+
+ if (unlikely(time_after(jiffies, limit))) {
+ sfb_swap_slot(q);
+ q->rehash_time = jiffies;
+ } else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+ time_after(jiffies, limit - q->warmup_time))) {
+ q->double_buffering = true;
+ }
+ }
+
+ fl = rcu_dereference_bh(q->filter_list);
+ if (fl) {
+ /* If using external classifiers, get result and record it. */
+ if (!sfb_classify(skb, fl, &ret, &salt))
+ goto other_drop;
+ keys.src = salt;
+ keys.dst = 0;
+ keys.ports = 0;
+ } else {
+ skb_flow_dissect(skb, &keys);
+ }
+
+ slot = q->slot;
+
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ if (minqlen > b->qlen)
+ minqlen = b->qlen;
+ if (p_min > b->p_mark)
+ p_min = b->p_mark;
+ }
+
+ slot ^= 1;
+ sfb_skb_cb(skb)->hashes[slot] = 0;
+
+ if (unlikely(minqlen >= q->max)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.bucketdrop++;
+ goto drop;
+ }
+
+ if (unlikely(p_min >= SFB_MAX_PROB)) {
+ /* Inelastic flow */
+ if (q->double_buffering) {
+ sfbhash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports,
+ q->bins[slot].perturbation);
+ if (!sfbhash)
+ sfbhash = 1;
+ sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+ for (i = 0; i < SFB_LEVELS; i++) {
+ u32 hash = sfbhash & SFB_BUCKET_MASK;
+ struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+ sfbhash >>= SFB_BUCKET_SHIFT;
+ if (b->qlen == 0)
+ decrement_prob(b, q);
+ else if (b->qlen >= q->bin_size)
+ increment_prob(b, q);
+ }
+ }
+ if (sfb_rate_limit(skb, q)) {
+ qdisc_qstats_overlimit(sch);
+ q->stats.penaltydrop++;
+ goto drop;
+ }
+ goto enqueue;
+ }
+
+ r = prandom_u32() & SFB_MAX_PROB;
+
+ if (unlikely(r < p_min)) {
+ if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+ /* If we're marking that many packets, then either
+ * this flow is unresponsive, or we're badly congested.
+ * In either case, we want to start dropping packets.
+ */
+ if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.marked++;
+ } else {
+ q->stats.earlydrop++;
+ goto drop;
+ }
+ }
+
+enqueue:
+ ret = qdisc_enqueue(skb, child);
+ if (likely(ret == NET_XMIT_SUCCESS)) {
+ sch->q.qlen++;
+ increment_qlen(skb, q);
+ } else if (net_xmit_drop_count(ret)) {
+ q->stats.childdrop++;
+ qdisc_qstats_drop(sch);
+ }
+ return ret;
+
+drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+ struct sk_buff *skb;
+
+ skb = child->dequeue(q->qdisc);
+
+ if (skb) {
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ decrement_qlen(skb, q);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+
+static void sfb_reset(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+}
+
+static void sfb_destroy(struct Qdisc *sch)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ qdisc_destroy(q->qdisc);
+}
+
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+ [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
+};
+
+static const struct tc_sfb_qopt sfb_default_ops = {
+ .rehash_interval = 600 * MSEC_PER_SEC,
+ .warmup_time = 60 * MSEC_PER_SEC,
+ .limit = 0,
+ .max = 25,
+ .bin_size = 20,
+ .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+ .decrement = (SFB_MAX_PROB + 3000) / 6000,
+ .penalty_rate = 10,
+ .penalty_burst = 20,
+};
+
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ struct nlattr *tb[TCA_SFB_MAX + 1];
+ const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+ u32 limit;
+ int err;
+
+ if (opt) {
+ err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+ if (err < 0)
+ return -EINVAL;
+
+ if (tb[TCA_SFB_PARMS] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_SFB_PARMS]);
+ }
+
+ limit = ctl->limit;
+ if (limit == 0)
+ limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+
+ child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+
+ sch_tree_lock(sch);
+
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+
+ q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+ q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+ q->rehash_time = jiffies;
+ q->limit = limit;
+ q->increment = ctl->increment;
+ q->decrement = ctl->decrement;
+ q->max = ctl->max;
+ q->bin_size = ctl->bin_size;
+ q->penalty_rate = ctl->penalty_rate;
+ q->penalty_burst = ctl->penalty_burst;
+ q->tokens_avail = ctl->penalty_burst;
+ q->token_time = jiffies;
+
+ q->slot = 0;
+ q->double_buffering = false;
+ sfb_zero_all_buckets(q);
+ sfb_init_perturbation(0, q);
+ sfb_init_perturbation(1, q);
+
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ q->qdisc = &noop_qdisc;
+ return sfb_change(sch, opt);
+}
+
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct tc_sfb_qopt opt = {
+ .rehash_interval = jiffies_to_msecs(q->rehash_interval),
+ .warmup_time = jiffies_to_msecs(q->warmup_time),
+ .limit = q->limit,
+ .max = q->max,
+ .bin_size = q->bin_size,
+ .increment = q->increment,
+ .decrement = q->decrement,
+ .penalty_rate = q->penalty_rate,
+ .penalty_burst = q->penalty_burst,
+ };
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tc_sfb_xstats st = {
+ .earlydrop = q->stats.earlydrop,
+ .penaltydrop = q->stats.penaltydrop,
+ .bucketdrop = q->stats.bucketdrop,
+ .queuedrop = q->stats.queuedrop,
+ .childdrop = q->stats.childdrop,
+ .marked = q->stats.marked,
+ };
+
+ st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ return -ENOSYS;
+}
+
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg)
+{
+ return -ENOSYS;
+}
+
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+ return -ENOSYS;
+}
+
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct sfb_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+
+static const struct Qdisc_class_ops sfb_class_ops = {
+ .graft = sfb_graft,
+ .leaf = sfb_leaf,
+ .get = sfb_get,
+ .put = sfb_put,
+ .change = sfb_change_class,
+ .delete = sfb_delete,
+ .walk = sfb_walk,
+ .tcf_chain = sfb_find_tcf,
+ .bind_tcf = sfb_bind,
+ .unbind_tcf = sfb_put,
+ .dump = sfb_dump_class,
+};
+
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+ .id = "sfb",
+ .priv_size = sizeof(struct sfb_sched_data),
+ .cl_ops = &sfb_class_ops,
+ .enqueue = sfb_enqueue,
+ .dequeue = sfb_dequeue,
+ .peek = sfb_peek,
+ .init = sfb_init,
+ .reset = sfb_reset,
+ .destroy = sfb_destroy,
+ .change = sfb_change,
+ .dump = sfb_dump,
+ .dump_stats = sfb_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfb_module_init(void)
+{
+ return register_qdisc(&sfb_qdisc_ops);
+}
+
+static void __exit sfb_module_exit(void)
+{
+ unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 000000000..b877140be
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,939 @@
+/*
+ * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/red.h>
+
+
+/* Stochastic Fairness Queuing algorithm.
+ =======================================
+
+ Source:
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
+
+ Paul E. McKenney "Stochastic Fairness Queuing",
+ "Interworking: Research and Experience", v.2, 1991, p.113-131.
+
+
+ See also:
+ M. Shreedhar and George Varghese "Efficient Fair
+ Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
+
+
+ This is not the thing that is usually called (W)FQ nowadays.
+ It does not use any timestamp mechanism, but instead
+ processes queues in round-robin order.
+
+ ADVANTAGE:
+
+ - It is very cheap. Both CPU and memory requirements are minimal.
+
+ DRAWBACKS:
+
+ - "Stochastic" -> It is not 100% fair.
+ When hash collisions occur, several flows are considered as one.
+
+ - "Round-robin" -> It introduces larger delays than virtual clock
+ based schemes, and should not be used for isolating interactive
+ traffic from non-interactive. It means, that this scheduler
+ should be used as leaf of CBQ or P3, which put interactive traffic
+ to higher priority band.
+
+ We still need true WFQ for top level CSZ, but using WFQ
+ for the best effort traffic is absolutely pointless:
+ SFQ is superior for this purpose.
+
+ IMPLEMENTATION:
+ This implementation limits :
+ - maximal queue length per flow to 127 packets.
+ - max mtu to 2^18-1;
+ - max 65408 flows,
+ - number of hash buckets to 65536.
+
+ It is easy to increase these values, but not in flight. */
+
+#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
+#define SFQ_DEFAULT_FLOWS 128
+#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
+#define SFQ_EMPTY_SLOT 0xffff
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
+/* We use 16 bits to store allot, and want to handle packets up to 64K
+ * Scale allot by 8 (1<<3) so that no overflow occurs.
+ */
+#define SFQ_ALLOT_SHIFT 3
+#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
+
+/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
+typedef u16 sfq_index;
+
+/*
+ * We dont use pointers to save space.
+ * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
+ * are 'pointers' to dep[] array
+ */
+struct sfq_head {
+ sfq_index next;
+ sfq_index prev;
+};
+
+struct sfq_slot {
+ struct sk_buff *skblist_next;
+ struct sk_buff *skblist_prev;
+ sfq_index qlen; /* number of skbs in skblist */
+ sfq_index next; /* next slot in sfq RR chain */
+ struct sfq_head dep; /* anchor in dep[] chains */
+ unsigned short hash; /* hash value (index in ht[]) */
+ short allot; /* credit for this slot */
+
+ unsigned int backlog;
+ struct red_vars vars;
+};
+
+struct sfq_sched_data {
+/* frequently used fields */
+ int limit; /* limit of total number of packets in this qdisc */
+ unsigned int divisor; /* number of slots in hash table */
+ u8 headdrop;
+ u8 maxdepth; /* limit of packets per flow */
+
+ u32 perturbation;
+ u8 cur_depth; /* depth of longest slot */
+ u8 flags;
+ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
+ struct tcf_proto __rcu *filter_list;
+ sfq_index *ht; /* Hash table ('divisor' slots) */
+ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
+
+ struct red_parms *red_parms;
+ struct tc_sfqred_stats stats;
+ struct sfq_slot *tail; /* current slot in round */
+
+ struct sfq_head dep[SFQ_MAX_DEPTH + 1];
+ /* Linked lists of slots, indexed by depth
+ * dep[0] : list of unused flows
+ * dep[1] : list of flows with 1 packet
+ * dep[X] : list of flows with X packets
+ */
+
+ unsigned int maxflows; /* number of flows in flows array */
+ int perturb_period;
+ unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
+ struct timer_list perturb_timer;
+};
+
+/*
+ * sfq_head are either in a sfq_slot or in dep[] array
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
+{
+ if (val < SFQ_MAX_FLOWS)
+ return &q->slots[val].dep;
+ return &q->dep[val - SFQ_MAX_FLOWS];
+}
+
+/*
+ * In order to be able to quickly rehash our queue when timer changes
+ * q->perturbation, we store flow_keys in skb->cb[]
+ */
+struct sfq_skb_cb {
+ struct flow_keys keys;
+};
+
+static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
+ return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static unsigned int sfq_hash(const struct sfq_sched_data *q,
+ const struct sk_buff *skb)
+{
+ const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
+ unsigned int hash;
+
+ hash = jhash_3words((__force u32)keys->dst,
+ (__force u32)keys->src ^ keys->ip_proto,
+ (__force u32)keys->ports, q->perturbation);
+ return hash & (q->divisor - 1);
+}
+
+static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->divisor)
+ return TC_H_MIN(skb->priority);
+
+ fl = rcu_dereference_bh(q->filter_list);
+ if (!fl) {
+ skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
+ return sfq_hash(q, skb) + 1;
+ }
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tc_classify(skb, fl, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->divisor)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/*
+ * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
+ */
+static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ struct sfq_slot *slot = &q->slots[x];
+ int qlen = slot->qlen;
+
+ p = qlen + SFQ_MAX_FLOWS;
+ n = q->dep[qlen].next;
+
+ slot->dep.next = n;
+ slot->dep.prev = p;
+
+ q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
+ sfq_dep_head(q, n)->prev = x;
+}
+
+#define sfq_unlink(q, x, n, p) \
+ do { \
+ n = q->slots[x].dep.next; \
+ p = q->slots[x].dep.prev; \
+ sfq_dep_head(q, p)->next = n; \
+ sfq_dep_head(q, n)->prev = p; \
+ } while (0)
+
+
+static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ sfq_unlink(q, x, n, p);
+
+ d = q->slots[x].qlen--;
+ if (n == p && q->cur_depth == d)
+ q->cur_depth--;
+ sfq_link(q, x);
+}
+
+static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
+{
+ sfq_index p, n;
+ int d;
+
+ sfq_unlink(q, x, n, p);
+
+ d = ++q->slots[x].qlen;
+ if (q->cur_depth < d)
+ q->cur_depth = d;
+ sfq_link(q, x);
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_prev;
+
+ slot->skblist_prev = skb->prev;
+ skb->prev->next = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+ struct sk_buff *skb = slot->skblist_next;
+
+ slot->skblist_next = skb->next;
+ skb->next->prev = (struct sk_buff *)slot;
+ skb->next = skb->prev = NULL;
+ return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+ memset(slot, 0, sizeof(*slot));
+ slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+ skb->prev = slot->skblist_prev;
+ skb->next = (struct sk_buff *)slot;
+ slot->skblist_prev->next = skb;
+ slot->skblist_prev = skb;
+}
+
+static unsigned int sfq_drop(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ sfq_index x, d = q->cur_depth;
+ struct sk_buff *skb;
+ unsigned int len;
+ struct sfq_slot *slot;
+
+ /* Queue is full! Find the longest slot and drop tail packet from it */
+ if (d > 1) {
+ x = q->dep[d].next;
+ slot = &q->slots[x];
+drop:
+ skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
+ len = qdisc_pkt_len(skb);
+ slot->backlog -= len;
+ sfq_dec(q, x);
+ kfree_skb(skb);
+ sch->q.qlen--;
+ qdisc_qstats_drop(sch);
+ qdisc_qstats_backlog_dec(sch, skb);
+ return len;
+ }
+
+ if (d == 1) {
+ /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
+ x = q->tail->next;
+ slot = &q->slots[x];
+ q->tail->next = slot->next;
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ goto drop;
+ }
+
+ return 0;
+}
+
+/* Is ECN parameter configured */
+static int sfq_prob_mark(const struct sfq_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max threshold just be marked */
+static int sfq_hard_mark(const struct sfq_sched_data *q)
+{
+ return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
+}
+
+static int sfq_headdrop(const struct sfq_sched_data *q)
+{
+ return q->headdrop;
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned int hash;
+ sfq_index x, qlen;
+ struct sfq_slot *slot;
+ int uninitialized_var(ret);
+ struct sk_buff *head;
+ int delta;
+
+ hash = sfq_classify(skb, sch, &ret);
+ if (hash == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ return ret;
+ }
+ hash--;
+
+ x = q->ht[hash];
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS)
+ return qdisc_drop(skb, sch);
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ slot->backlog = 0; /* should already be 0 anyway... */
+ red_set_vars(&slot->vars);
+ goto enqueue;
+ }
+ if (q->red_parms) {
+ slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ switch (red_action(q->red_parms,
+ &slot->vars,
+ slot->vars.qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (sfq_prob_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.prob_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ break;
+ }
+ }
+ q->stats.prob_drop++;
+ goto congestion_drop;
+
+ case RED_HARD_MARK:
+ qdisc_qstats_overlimit(sch);
+ if (sfq_hard_mark(q)) {
+ /* We know we have at least one packet in queue */
+ if (sfq_headdrop(q) &&
+ INET_ECN_set_ce(slot->skblist_next)) {
+ q->stats.forced_mark_head++;
+ break;
+ }
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ break;
+ }
+ }
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ }
+
+ if (slot->qlen >= q->maxdepth) {
+congestion_drop:
+ if (!sfq_headdrop(q))
+ return qdisc_drop(skb, sch);
+
+ /* We know we have at least one packet in queue */
+ head = slot_dequeue_head(slot);
+ delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
+ sch->qstats.backlog -= delta;
+ slot->backlog -= delta;
+ qdisc_drop(head, sch);
+
+ slot_queue_add(slot, skb);
+ return NET_XMIT_CN;
+ }
+
+enqueue:
+ qdisc_qstats_backlog_inc(sch, skb);
+ slot->backlog += qdisc_pkt_len(skb);
+ slot_queue_add(slot, skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ /* We put this flow at the end of our flow list.
+ * This might sound unfair for a new flow to wait after old ones,
+ * but we could endup servicing new flows only, and freeze old ones.
+ */
+ q->tail = slot;
+ /* We could use a bigger initial quantum for new flows */
+ slot->allot = q->scaled_quantum;
+ }
+ if (++sch->q.qlen <= q->limit)
+ return NET_XMIT_SUCCESS;
+
+ qlen = slot->qlen;
+ sfq_drop(sch);
+ /* Return Congestion Notification only if we dropped a packet
+ * from this flow.
+ */
+ if (qlen != slot->qlen)
+ return NET_XMIT_CN;
+
+ /* As we dropped a packet, better let upper stack know this */
+ qdisc_tree_decrease_qlen(sch, 1);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+sfq_dequeue(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ sfq_index a, next_a;
+ struct sfq_slot *slot;
+
+ /* No active slots */
+ if (q->tail == NULL)
+ return NULL;
+
+next_slot:
+ a = q->tail->next;
+ slot = &q->slots[a];
+ if (slot->allot <= 0) {
+ q->tail = slot;
+ slot->allot += q->scaled_quantum;
+ goto next_slot;
+ }
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, a);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ slot->backlog -= qdisc_pkt_len(skb);
+ /* Is the slot empty? */
+ if (slot->qlen == 0) {
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ next_a = slot->next;
+ if (a == next_a) {
+ q->tail = NULL; /* no more active slots */
+ return skb;
+ }
+ q->tail->next = next_a;
+ } else {
+ slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
+ }
+ return skb;
+}
+
+static void
+sfq_reset(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+
+ while ((skb = sfq_dequeue(sch)) != NULL)
+ kfree_skb(skb);
+}
+
+/*
+ * When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ */
+static void sfq_rehash(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ int i;
+ struct sfq_slot *slot;
+ struct sk_buff_head list;
+ int dropped = 0;
+
+ __skb_queue_head_init(&list);
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot = &q->slots[i];
+ if (!slot->qlen)
+ continue;
+ while (slot->qlen) {
+ skb = slot_dequeue_head(slot);
+ sfq_dec(q, i);
+ __skb_queue_tail(&list, skb);
+ }
+ slot->backlog = 0;
+ red_set_vars(&slot->vars);
+ q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+ }
+ q->tail = NULL;
+
+ while ((skb = __skb_dequeue(&list)) != NULL) {
+ unsigned int hash = sfq_hash(q, skb);
+ sfq_index x = q->ht[hash];
+
+ slot = &q->slots[x];
+ if (x == SFQ_EMPTY_SLOT) {
+ x = q->dep[0].next; /* get a free slot */
+ if (x >= SFQ_MAX_FLOWS) {
+drop:
+ qdisc_qstats_backlog_dec(sch, skb);
+ kfree_skb(skb);
+ dropped++;
+ continue;
+ }
+ q->ht[hash] = x;
+ slot = &q->slots[x];
+ slot->hash = hash;
+ }
+ if (slot->qlen >= q->maxdepth)
+ goto drop;
+ slot_queue_add(slot, skb);
+ if (q->red_parms)
+ slot->vars.qavg = red_calc_qavg(q->red_parms,
+ &slot->vars,
+ slot->backlog);
+ slot->backlog += qdisc_pkt_len(skb);
+ sfq_inc(q, x);
+ if (slot->qlen == 1) { /* The flow is new */
+ if (q->tail == NULL) { /* It is the first flow */
+ slot->next = x;
+ } else {
+ slot->next = q->tail->next;
+ q->tail->next = x;
+ }
+ q->tail = slot;
+ slot->allot = q->scaled_quantum;
+ }
+ }
+ sch->q.qlen -= dropped;
+ qdisc_tree_decrease_qlen(sch, dropped);
+}
+
+static void sfq_perturbation(unsigned long arg)
+{
+ struct Qdisc *sch = (struct Qdisc *)arg;
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+
+ spin_lock(root_lock);
+ q->perturbation = prandom_u32();
+ if (!q->filter_list && q->tail)
+ sfq_rehash(sch);
+ spin_unlock(root_lock);
+
+ if (q->perturb_period)
+ mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+}
+
+static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct tc_sfq_qopt *ctl = nla_data(opt);
+ struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
+ unsigned int qlen;
+ struct red_parms *p = NULL;
+
+ if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+ return -EINVAL;
+ if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
+ ctl_v1 = nla_data(opt);
+ if (ctl->divisor &&
+ (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+ return -EINVAL;
+ if (ctl_v1 && ctl_v1->qth_min) {
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ sch_tree_lock(sch);
+ if (ctl->quantum) {
+ q->quantum = ctl->quantum;
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ }
+ q->perturb_period = ctl->perturb_period * HZ;
+ if (ctl->flows)
+ q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ if (ctl->divisor) {
+ q->divisor = ctl->divisor;
+ q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ }
+ if (ctl_v1) {
+ if (ctl_v1->depth)
+ q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ if (p) {
+ swap(q->red_parms, p);
+ red_set_parms(q->red_parms,
+ ctl_v1->qth_min, ctl_v1->qth_max,
+ ctl_v1->Wlog,
+ ctl_v1->Plog, ctl_v1->Scell_log,
+ NULL,
+ ctl_v1->max_P);
+ }
+ q->flags = ctl_v1->flags;
+ q->headdrop = ctl_v1->headdrop;
+ }
+ if (ctl->limit) {
+ q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
+ q->maxflows = min_t(u32, q->maxflows, q->limit);
+ }
+
+ qlen = sch->q.qlen;
+ while (sch->q.qlen > q->limit)
+ sfq_drop(sch);
+ qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+ del_timer(&q->perturb_timer);
+ if (q->perturb_period) {
+ mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+ q->perturbation = prandom_u32();
+ }
+ sch_tree_unlock(sch);
+ kfree(p);
+ return 0;
+}
+
+static void *sfq_alloc(size_t sz)
+{
+ void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+
+ if (!ptr)
+ ptr = vmalloc(sz);
+ return ptr;
+}
+
+static void sfq_free(void *addr)
+{
+ kvfree(addr);
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ q->perturb_period = 0;
+ del_timer_sync(&q->perturb_timer);
+ sfq_free(q->ht);
+ sfq_free(q->slots);
+ kfree(q->red_parms);
+}
+
+static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ q->perturb_timer.function = sfq_perturbation;
+ q->perturb_timer.data = (unsigned long)sch;
+ init_timer_deferrable(&q->perturb_timer);
+
+ for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
+ q->dep[i].next = i + SFQ_MAX_FLOWS;
+ q->dep[i].prev = i + SFQ_MAX_FLOWS;
+ }
+
+ q->limit = SFQ_MAX_DEPTH;
+ q->maxdepth = SFQ_MAX_DEPTH;
+ q->cur_depth = 0;
+ q->tail = NULL;
+ q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
+ q->maxflows = SFQ_DEFAULT_FLOWS;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+ q->perturb_period = 0;
+ q->perturbation = prandom_u32();
+
+ if (opt) {
+ int err = sfq_change(sch, opt);
+ if (err)
+ return err;
+ }
+
+ q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
+ q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
+ if (!q->ht || !q->slots) {
+ sfq_destroy(sch);
+ return -ENOMEM;
+ }
+ for (i = 0; i < q->divisor; i++)
+ q->ht[i] = SFQ_EMPTY_SLOT;
+
+ for (i = 0; i < q->maxflows; i++) {
+ slot_queue_init(&q->slots[i]);
+ sfq_link(q, i);
+ }
+ if (q->limit >= 1)
+ sch->flags |= TCQ_F_CAN_BYPASS;
+ else
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_sfq_qopt_v1 opt;
+ struct red_parms *p = q->red_parms;
+
+ memset(&opt, 0, sizeof(opt));
+ opt.v0.quantum = q->quantum;
+ opt.v0.perturb_period = q->perturb_period / HZ;
+ opt.v0.limit = q->limit;
+ opt.v0.divisor = q->divisor;
+ opt.v0.flows = q->maxflows;
+ opt.depth = q->maxdepth;
+ opt.headdrop = q->headdrop;
+
+ if (p) {
+ opt.qth_min = p->qth_min >> p->Wlog;
+ opt.qth_max = p->qth_max >> p->Wlog;
+ opt.Wlog = p->Wlog;
+ opt.Plog = p->Plog;
+ opt.Scell_log = p->Scell_log;
+ opt.max_P = p->max_P;
+ }
+ memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
+ opt.flags = q->flags;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ /* we cannot bypass queue discipline anymore */
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+ return 0;
+}
+
+static void sfq_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ sfq_index idx = q->ht[cl - 1];
+ struct gnet_stats_queue qs = { 0 };
+ struct tc_sfq_xstats xstats = { 0 };
+
+ if (idx != SFQ_EMPTY_SLOT) {
+ const struct sfq_slot *slot = &q->slots[idx];
+
+ xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+ qs.qlen = slot->qlen;
+ qs.backlog = slot->backlog;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ return -1;
+ return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->divisor; i++) {
+ if (q->ht[i] == SFQ_EMPTY_SLOT ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops sfq_class_ops = {
+ .leaf = sfq_leaf,
+ .get = sfq_get,
+ .put = sfq_put,
+ .tcf_chain = sfq_find_tcf,
+ .bind_tcf = sfq_bind,
+ .unbind_tcf = sfq_put,
+ .dump = sfq_dump_class,
+ .dump_stats = sfq_dump_class_stats,
+ .walk = sfq_walk,
+};
+
+static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
+ .cl_ops = &sfq_class_ops,
+ .id = "sfq",
+ .priv_size = sizeof(struct sfq_sched_data),
+ .enqueue = sfq_enqueue,
+ .dequeue = sfq_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = sfq_drop,
+ .init = sfq_init,
+ .reset = sfq_reset,
+ .destroy = sfq_destroy,
+ .change = NULL,
+ .dump = sfq_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init sfq_module_init(void)
+{
+ return register_qdisc(&sfq_qdisc_ops);
+}
+static void __exit sfq_module_exit(void)
+{
+ unregister_qdisc(&sfq_qdisc_ops);
+}
+module_init(sfq_module_init)
+module_exit(sfq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 000000000..a4afde14e
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,579 @@
+/*
+ * net/sched/sch_tbf.c Token Bucket Filter queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
+ * original idea by Martin Devera
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+
+
+/* Simple Token Bucket Filter.
+ =======================================
+
+ SOURCE.
+ -------
+
+ None.
+
+ Description.
+ ------------
+
+ A data flow obeys TBF with rate R and depth B, if for any
+ time interval t_i...t_f the number of transmitted bits
+ does not exceed B + R*(t_f-t_i).
+
+ Packetized version of this definition:
+ The sequence of packets of sizes s_i served at moments t_i
+ obeys TBF, if for any i<=k:
+
+ s_i+....+s_k <= B + R*(t_k - t_i)
+
+ Algorithm.
+ ----------
+
+ Let N(t_i) be B/R initially and N(t) grow continuously with time as:
+
+ N(t+delta) = min{B/R, N(t) + delta}
+
+ If the first packet in queue has length S, it may be
+ transmitted only at the time t_* when S/R <= N(t_*),
+ and in this case N(t) jumps:
+
+ N(t_* + 0) = N(t_* - 0) - S/R.
+
+
+
+ Actually, QoS requires two TBF to be applied to a data stream.
+ One of them controls steady state burst size, another
+ one with rate P (peak rate) and depth M (equal to link MTU)
+ limits bursts at a smaller time scale.
+
+ It is easy to see that P>R, and B>M. If P is infinity, this double
+ TBF is equivalent to a single one.
+
+ When TBF works in reshaping mode, latency is estimated as:
+
+ lat = max ((L-B)/R, (L-M)/P)
+
+
+ NOTES.
+ ------
+
+ If TBF throttles, it starts a watchdog timer, which will wake it up
+ when it is ready to transmit.
+ Note that the minimal timer resolution is 1/HZ.
+ If no new packets arrive during this period,
+ or if the device is not awaken by EOI for some previous packet,
+ TBF can stop its activity for 1/HZ.
+
+
+ This means, that with depth B, the maximal rate is
+
+ R_crit = B*HZ
+
+ F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
+
+ Note that the peak rate TBF is much more tough: with MTU 1500
+ P_crit = 150Kbytes/sec. So, if you need greater peak
+ rates, use alpha with HZ=1000 :-)
+
+ With classful TBF, limit is just kept for backwards compatibility.
+ It is passed to the default bfifo qdisc - if the inner qdisc is
+ changed the limit is not effective anymore.
+*/
+
+struct tbf_sched_data {
+/* Parameters */
+ u32 limit; /* Maximal length of backlog: bytes */
+ u32 max_size;
+ s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
+ s64 mtu;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
+
+/* Variables */
+ s64 tokens; /* Current number of B tokens */
+ s64 ptokens; /* Current number of P tokens */
+ s64 t_c; /* Time check-point */
+ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
+ struct qdisc_watchdog watchdog; /* Watchdog timer */
+};
+
+
+/* Time to Length, convert time in ns to length in bytes
+ * to determinate how many bytes can be sent in given time.
+ */
+static u64 psched_ns_t2l(const struct psched_ratecfg *r,
+ u64 time_in_ns)
+{
+ /* The formula is :
+ * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
+ */
+ u64 len = time_in_ns * r->rate_bytes_ps;
+
+ do_div(len, NSEC_PER_SEC);
+
+ if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
+ do_div(len, 53);
+ len = len * 48;
+ }
+
+ if (len > r->overhead)
+ len -= r->overhead;
+ else
+ len = 0;
+
+ return len;
+}
+
+/*
+ * Return length of individual segments of a gso packet,
+ * including all headers (MAC, IP, TCP/UDP)
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/* GSO packet is too big, segment it so that tbf can transmit
+ * each segment in time
+ */
+static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ int ret, nb;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_reshape_fail(skb, sch);
+
+ nb = 0;
+ while (segs) {
+ nskb = segs->next;
+ segs->next = NULL;
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ ret = qdisc_enqueue(segs, q->qdisc);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ nb++;
+ }
+ segs = nskb;
+ }
+ sch->q.qlen += nb;
+ if (nb > 1)
+ qdisc_tree_decrease_qlen(sch, 1 - nb);
+ consume_skb(skb);
+ return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ int ret;
+
+ if (qdisc_pkt_len(skb) > q->max_size) {
+ if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
+ return tbf_segment(skb, sch);
+ return qdisc_reshape_fail(skb, sch);
+ }
+ ret = qdisc_enqueue(skb, q->qdisc);
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ return ret;
+ }
+
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+}
+
+static unsigned int tbf_drop(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ unsigned int len = 0;
+
+ if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+ sch->q.qlen--;
+ qdisc_qstats_drop(sch);
+ }
+ return len;
+}
+
+static bool tbf_peak_present(const struct tbf_sched_data *q)
+{
+ return q->peak.rate_bytes_ps;
+}
+
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ skb = q->qdisc->ops->peek(q->qdisc);
+
+ if (skb) {
+ s64 now;
+ s64 toks;
+ s64 ptoks = 0;
+ unsigned int len = qdisc_pkt_len(skb);
+
+ now = ktime_get_ns();
+ toks = min_t(s64, now - q->t_c, q->buffer);
+
+ if (tbf_peak_present(q)) {
+ ptoks = toks + q->ptokens;
+ if (ptoks > q->mtu)
+ ptoks = q->mtu;
+ ptoks -= (s64) psched_l2t_ns(&q->peak, len);
+ }
+ toks += q->tokens;
+ if (toks > q->buffer)
+ toks = q->buffer;
+ toks -= (s64) psched_l2t_ns(&q->rate, len);
+
+ if ((toks|ptoks) >= 0) {
+ skb = qdisc_dequeue_peeked(q->qdisc);
+ if (unlikely(!skb))
+ return NULL;
+
+ q->t_c = now;
+ q->tokens = toks;
+ q->ptokens = ptoks;
+ sch->q.qlen--;
+ qdisc_unthrottled(sch);
+ qdisc_bstats_update(sch, skb);
+ return skb;
+ }
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ now + max_t(long, -toks, -ptoks),
+ true);
+
+ /* Maybe we have a shorter packet in the queue,
+ which can be sent now. It sounds cool,
+ but, however, this is wrong in principle.
+ We MUST NOT reorder packets under these circumstances.
+
+ Really, if we split the flow into independent
+ subflows, it would be a very good solution.
+ This is the main idea of all FQ algorithms
+ (cf. CSZ, HPFQ, HFSC)
+ */
+
+ qdisc_qstats_overlimit(sch);
+ }
+ return NULL;
+}
+
+static void tbf_reset(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_reset(q->qdisc);
+ sch->q.qlen = 0;
+ q->t_c = ktime_get_ns();
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
+ [TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
+ [TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+ [TCA_TBF_RATE64] = { .type = NLA_U64 },
+ [TCA_TBF_PRATE64] = { .type = NLA_U64 },
+ [TCA_TBF_BURST] = { .type = NLA_U32 },
+ [TCA_TBF_PBURST] = { .type = NLA_U32 },
+};
+
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ int err;
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_TBF_MAX + 1];
+ struct tc_tbf_qopt *qopt;
+ struct Qdisc *child = NULL;
+ struct psched_ratecfg rate;
+ struct psched_ratecfg peak;
+ u64 max_size;
+ s64 buffer, mtu;
+ u64 rate64 = 0, prate64 = 0;
+
+ err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (tb[TCA_TBF_PARMS] == NULL)
+ goto done;
+
+ qopt = nla_data(tb[TCA_TBF_PARMS]);
+ if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
+ tb[TCA_TBF_RTAB]));
+
+ if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
+ qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
+ tb[TCA_TBF_PTAB]));
+
+ buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
+ mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
+
+ if (tb[TCA_TBF_RATE64])
+ rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
+ psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
+
+ if (tb[TCA_TBF_BURST]) {
+ max_size = nla_get_u32(tb[TCA_TBF_BURST]);
+ buffer = psched_l2t_ns(&rate, max_size);
+ } else {
+ max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
+ }
+
+ if (qopt->peakrate.rate) {
+ if (tb[TCA_TBF_PRATE64])
+ prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
+ psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
+ if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
+ pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
+ peak.rate_bytes_ps, rate.rate_bytes_ps);
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (tb[TCA_TBF_PBURST]) {
+ u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
+ max_size = min_t(u32, max_size, pburst);
+ mtu = psched_l2t_ns(&peak, pburst);
+ } else {
+ max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
+ }
+ } else {
+ memset(&peak, 0, sizeof(peak));
+ }
+
+ if (max_size < psched_mtu(qdisc_dev(sch)))
+ pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
+ max_size, qdisc_dev(sch)->name,
+ psched_mtu(qdisc_dev(sch)));
+
+ if (!max_size) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (q->qdisc != &noop_qdisc) {
+ err = fifo_set_limit(q->qdisc, qopt->limit);
+ if (err)
+ goto done;
+ } else if (qopt->limit > 0) {
+ child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto done;
+ }
+ }
+
+ sch_tree_lock(sch);
+ if (child) {
+ qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_destroy(q->qdisc);
+ q->qdisc = child;
+ }
+ q->limit = qopt->limit;
+ if (tb[TCA_TBF_PBURST])
+ q->mtu = mtu;
+ else
+ q->mtu = PSCHED_TICKS2NS(qopt->mtu);
+ q->max_size = max_size;
+ if (tb[TCA_TBF_BURST])
+ q->buffer = buffer;
+ else
+ q->buffer = PSCHED_TICKS2NS(qopt->buffer);
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
+
+ memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
+ memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
+
+ sch_tree_unlock(sch);
+ err = 0;
+done:
+ return err;
+}
+
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ q->t_c = ktime_get_ns();
+ qdisc_watchdog_init(&q->watchdog, sch);
+ q->qdisc = &noop_qdisc;
+
+ return tbf_change(sch, opt);
+}
+
+static void tbf_destroy(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ qdisc_destroy(q->qdisc);
+}
+
+static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct nlattr *nest;
+ struct tc_tbf_qopt opt;
+
+ sch->qstats.backlog = q->qdisc->qstats.backlog;
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ opt.limit = q->limit;
+ psched_ratecfg_getrate(&opt.rate, &q->rate);
+ if (tbf_peak_present(q))
+ psched_ratecfg_getrate(&opt.peakrate, &q->peak);
+ else
+ memset(&opt.peakrate, 0, sizeof(opt.peakrate));
+ opt.mtu = PSCHED_NS2TICKS(q->mtu);
+ opt.buffer = PSCHED_NS2TICKS(q->buffer);
+ if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+ if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+ goto nla_put_failure;
+ if (tbf_peak_present(q) &&
+ q->peak.rate_bytes_ps >= (1ULL << 32) &&
+ nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->qdisc;
+ q->qdisc = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ return q->qdisc;
+}
+
+static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void tbf_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip)
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops tbf_class_ops = {
+ .graft = tbf_graft,
+ .leaf = tbf_leaf,
+ .get = tbf_get,
+ .put = tbf_put,
+ .walk = tbf_walk,
+ .dump = tbf_dump_class,
+};
+
+static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
+ .next = NULL,
+ .cl_ops = &tbf_class_ops,
+ .id = "tbf",
+ .priv_size = sizeof(struct tbf_sched_data),
+ .enqueue = tbf_enqueue,
+ .dequeue = tbf_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .drop = tbf_drop,
+ .init = tbf_init,
+ .reset = tbf_reset,
+ .destroy = tbf_destroy,
+ .change = tbf_change,
+ .dump = tbf_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init tbf_module_init(void)
+{
+ return register_qdisc(&tbf_qdisc_ops);
+}
+
+static void __exit tbf_module_exit(void)
+{
+ unregister_qdisc(&tbf_qdisc_ops);
+}
+module_init(tbf_module_init)
+module_exit(tbf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
new file mode 100644
index 000000000..e02687185
--- /dev/null
+++ b/net/sched/sch_teql.c
@@ -0,0 +1,528 @@
+/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/moduleparam.h>
+#include <net/dst.h>
+#include <net/neighbour.h>
+#include <net/pkt_sched.h>
+
+/*
+ How to setup it.
+ ----------------
+
+ After loading this module you will find a new device teqlN
+ and new qdisc with the same name. To join a slave to the equalizer
+ you should just set this qdisc on a device f.e.
+
+ # tc qdisc add dev eth0 root teql0
+ # tc qdisc add dev eth1 root teql0
+
+ That's all. Full PnP 8)
+
+ Applicability.
+ --------------
+
+ 1. Slave devices MUST be active devices, i.e., they must raise the tbusy
+ signal and generate EOI events. If you want to equalize virtual devices
+ like tunnels, use a normal eql device.
+ 2. This device puts no limitations on physical slave characteristics
+ f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
+ Certainly, large difference in link speeds will make the resulting
+ eqalized link unusable, because of huge packet reordering.
+ I estimate an upper useful difference as ~10 times.
+ 3. If the slave requires address resolution, only protocols using
+ neighbour cache (IPv4/IPv6) will work over the equalized link.
+ Other protocols are still allowed to use the slave device directly,
+ which will not break load balancing, though native slave
+ traffic will have the highest priority. */
+
+struct teql_master {
+ struct Qdisc_ops qops;
+ struct net_device *dev;
+ struct Qdisc *slaves;
+ struct list_head master_list;
+ unsigned long tx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_errors;
+ unsigned long tx_dropped;
+};
+
+struct teql_sched_data {
+ struct Qdisc *next;
+ struct teql_master *m;
+ struct sk_buff_head q;
+};
+
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
+
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
+
+/* "teql*" qdisc routines */
+
+static int
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct teql_sched_data *q = qdisc_priv(sch);
+
+ if (q->q.qlen < dev->tx_queue_len) {
+ __skb_queue_tail(&q->q, skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ return qdisc_drop(skb, sch);
+}
+
+static struct sk_buff *
+teql_dequeue(struct Qdisc *sch)
+{
+ struct teql_sched_data *dat = qdisc_priv(sch);
+ struct netdev_queue *dat_queue;
+ struct sk_buff *skb;
+ struct Qdisc *q;
+
+ skb = __skb_dequeue(&dat->q);
+ dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
+ q = rcu_dereference_bh(dat_queue->qdisc);
+
+ if (skb == NULL) {
+ struct net_device *m = qdisc_dev(q);
+ if (m) {
+ dat->m->slaves = sch;
+ netif_wake_queue(m);
+ }
+ } else {
+ qdisc_bstats_update(sch, skb);
+ }
+ sch->q.qlen = dat->q.qlen + q->q.qlen;
+ return skb;
+}
+
+static struct sk_buff *
+teql_peek(struct Qdisc *sch)
+{
+ /* teql is meant to be used as root qdisc */
+ return NULL;
+}
+
+static void
+teql_reset(struct Qdisc *sch)
+{
+ struct teql_sched_data *dat = qdisc_priv(sch);
+
+ skb_queue_purge(&dat->q);
+ sch->q.qlen = 0;
+}
+
+static void
+teql_destroy(struct Qdisc *sch)
+{
+ struct Qdisc *q, *prev;
+ struct teql_sched_data *dat = qdisc_priv(sch);
+ struct teql_master *master = dat->m;
+
+ prev = master->slaves;
+ if (prev) {
+ do {
+ q = NEXT_SLAVE(prev);
+ if (q == sch) {
+ NEXT_SLAVE(prev) = NEXT_SLAVE(q);
+ if (q == master->slaves) {
+ master->slaves = NEXT_SLAVE(q);
+ if (q == master->slaves) {
+ struct netdev_queue *txq;
+ spinlock_t *root_lock;
+
+ txq = netdev_get_tx_queue(master->dev, 0);
+ master->slaves = NULL;
+
+ root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
+ spin_lock_bh(root_lock);
+ qdisc_reset(rtnl_dereference(txq->qdisc));
+ spin_unlock_bh(root_lock);
+ }
+ }
+ skb_queue_purge(&dat->q);
+ break;
+ }
+
+ } while ((prev = q) != master->slaves);
+ }
+}
+
+static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct teql_master *m = (struct teql_master *)sch->ops;
+ struct teql_sched_data *q = qdisc_priv(sch);
+
+ if (dev->hard_header_len > m->dev->hard_header_len)
+ return -EINVAL;
+
+ if (m->dev == dev)
+ return -ELOOP;
+
+ q->m = m;
+
+ skb_queue_head_init(&q->q);
+
+ if (m->slaves) {
+ if (m->dev->flags & IFF_UP) {
+ if ((m->dev->flags & IFF_POINTOPOINT &&
+ !(dev->flags & IFF_POINTOPOINT)) ||
+ (m->dev->flags & IFF_BROADCAST &&
+ !(dev->flags & IFF_BROADCAST)) ||
+ (m->dev->flags & IFF_MULTICAST &&
+ !(dev->flags & IFF_MULTICAST)) ||
+ dev->mtu < m->dev->mtu)
+ return -EINVAL;
+ } else {
+ if (!(dev->flags&IFF_POINTOPOINT))
+ m->dev->flags &= ~IFF_POINTOPOINT;
+ if (!(dev->flags&IFF_BROADCAST))
+ m->dev->flags &= ~IFF_BROADCAST;
+ if (!(dev->flags&IFF_MULTICAST))
+ m->dev->flags &= ~IFF_MULTICAST;
+ if (dev->mtu < m->dev->mtu)
+ m->dev->mtu = dev->mtu;
+ }
+ q->next = NEXT_SLAVE(m->slaves);
+ NEXT_SLAVE(m->slaves) = sch;
+ } else {
+ q->next = sch;
+ m->slaves = sch;
+ m->dev->mtu = dev->mtu;
+ m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
+ }
+ return 0;
+}
+
+
+static int
+__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
+ struct net_device *dev, struct netdev_queue *txq,
+ struct dst_entry *dst)
+{
+ struct neighbour *n;
+ int err = 0;
+
+ n = dst_neigh_lookup_skb(dst, skb);
+ if (!n)
+ return -ENOENT;
+
+ if (dst->dev != dev) {
+ struct neighbour *mn;
+
+ mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
+ neigh_release(n);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ n = mn;
+ }
+
+ if (neigh_event_send(n, skb_res) == 0) {
+ int err;
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, n, dev);
+ err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)),
+ haddr, NULL, skb->len);
+
+ if (err < 0)
+ err = -EINVAL;
+ } else {
+ err = (skb_res == NULL) ? -EAGAIN : 1;
+ }
+ neigh_release(n);
+ return err;
+}
+
+static inline int teql_resolve(struct sk_buff *skb,
+ struct sk_buff *skb_res,
+ struct net_device *dev,
+ struct netdev_queue *txq)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ int res;
+
+ if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
+ return -ENODEV;
+
+ if (!dev->header_ops || !dst)
+ return 0;
+
+ rcu_read_lock();
+ res = __teql_resolve(skb, skb_res, dev, txq, dst);
+ rcu_read_unlock();
+
+ return res;
+}
+
+static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct teql_master *master = netdev_priv(dev);
+ struct Qdisc *start, *q;
+ int busy;
+ int nores;
+ int subq = skb_get_queue_mapping(skb);
+ struct sk_buff *skb_res = NULL;
+
+ start = master->slaves;
+
+restart:
+ nores = 0;
+ busy = 0;
+
+ q = start;
+ if (!q)
+ goto drop;
+
+ do {
+ struct net_device *slave = qdisc_dev(q);
+ struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
+
+ if (slave_txq->qdisc_sleeping != q)
+ continue;
+ if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
+ !netif_running(slave)) {
+ busy = 1;
+ continue;
+ }
+
+ switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
+ case 0:
+ if (__netif_tx_trylock(slave_txq)) {
+ unsigned int length = qdisc_pkt_len(skb);
+
+ if (!netif_xmit_frozen_or_stopped(slave_txq) &&
+ netdev_start_xmit(skb, slave, slave_txq, false) ==
+ NETDEV_TX_OK) {
+ __netif_tx_unlock(slave_txq);
+ master->slaves = NEXT_SLAVE(q);
+ netif_wake_queue(dev);
+ master->tx_packets++;
+ master->tx_bytes += length;
+ return NETDEV_TX_OK;
+ }
+ __netif_tx_unlock(slave_txq);
+ }
+ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
+ busy = 1;
+ break;
+ case 1:
+ master->slaves = NEXT_SLAVE(q);
+ return NETDEV_TX_OK;
+ default:
+ nores = 1;
+ break;
+ }
+ __skb_pull(skb, skb_network_offset(skb));
+ } while ((q = NEXT_SLAVE(q)) != start);
+
+ if (nores && skb_res == NULL) {
+ skb_res = skb;
+ goto restart;
+ }
+
+ if (busy) {
+ netif_stop_queue(dev);
+ return NETDEV_TX_BUSY;
+ }
+ master->tx_errors++;
+
+drop:
+ master->tx_dropped++;
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int teql_master_open(struct net_device *dev)
+{
+ struct Qdisc *q;
+ struct teql_master *m = netdev_priv(dev);
+ int mtu = 0xFFFE;
+ unsigned int flags = IFF_NOARP | IFF_MULTICAST;
+
+ if (m->slaves == NULL)
+ return -EUNATCH;
+
+ flags = FMASK;
+
+ q = m->slaves;
+ do {
+ struct net_device *slave = qdisc_dev(q);
+
+ if (slave == NULL)
+ return -EUNATCH;
+
+ if (slave->mtu < mtu)
+ mtu = slave->mtu;
+ if (slave->hard_header_len > LL_MAX_HEADER)
+ return -EINVAL;
+
+ /* If all the slaves are BROADCAST, master is BROADCAST
+ If all the slaves are PtP, master is PtP
+ Otherwise, master is NBMA.
+ */
+ if (!(slave->flags&IFF_POINTOPOINT))
+ flags &= ~IFF_POINTOPOINT;
+ if (!(slave->flags&IFF_BROADCAST))
+ flags &= ~IFF_BROADCAST;
+ if (!(slave->flags&IFF_MULTICAST))
+ flags &= ~IFF_MULTICAST;
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
+
+ m->dev->mtu = mtu;
+ m->dev->flags = (m->dev->flags&~FMASK) | flags;
+ netif_start_queue(m->dev);
+ return 0;
+}
+
+static int teql_master_close(struct net_device *dev)
+{
+ netif_stop_queue(dev);
+ return 0;
+}
+
+static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct teql_master *m = netdev_priv(dev);
+
+ stats->tx_packets = m->tx_packets;
+ stats->tx_bytes = m->tx_bytes;
+ stats->tx_errors = m->tx_errors;
+ stats->tx_dropped = m->tx_dropped;
+ return stats;
+}
+
+static int teql_master_mtu(struct net_device *dev, int new_mtu)
+{
+ struct teql_master *m = netdev_priv(dev);
+ struct Qdisc *q;
+
+ if (new_mtu < 68)
+ return -EINVAL;
+
+ q = m->slaves;
+ if (q) {
+ do {
+ if (new_mtu > qdisc_dev(q)->mtu)
+ return -EINVAL;
+ } while ((q = NEXT_SLAVE(q)) != m->slaves);
+ }
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static const struct net_device_ops teql_netdev_ops = {
+ .ndo_open = teql_master_open,
+ .ndo_stop = teql_master_close,
+ .ndo_start_xmit = teql_master_xmit,
+ .ndo_get_stats64 = teql_master_stats64,
+ .ndo_change_mtu = teql_master_mtu,
+};
+
+static __init void teql_master_setup(struct net_device *dev)
+{
+ struct teql_master *master = netdev_priv(dev);
+ struct Qdisc_ops *ops = &master->qops;
+
+ master->dev = dev;
+ ops->priv_size = sizeof(struct teql_sched_data);
+
+ ops->enqueue = teql_enqueue;
+ ops->dequeue = teql_dequeue;
+ ops->peek = teql_peek;
+ ops->init = teql_qdisc_init;
+ ops->reset = teql_reset;
+ ops->destroy = teql_destroy;
+ ops->owner = THIS_MODULE;
+
+ dev->netdev_ops = &teql_netdev_ops;
+ dev->type = ARPHRD_VOID;
+ dev->mtu = 1500;
+ dev->tx_queue_len = 100;
+ dev->flags = IFF_NOARP;
+ dev->hard_header_len = LL_MAX_HEADER;
+ netif_keep_dst(dev);
+}
+
+static LIST_HEAD(master_dev_list);
+static int max_equalizers = 1;
+module_param(max_equalizers, int, 0);
+MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
+
+static int __init teql_init(void)
+{
+ int i;
+ int err = -ENODEV;
+
+ for (i = 0; i < max_equalizers; i++) {
+ struct net_device *dev;
+ struct teql_master *master;
+
+ dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
+ NET_NAME_UNKNOWN, teql_master_setup);
+ if (!dev) {
+ err = -ENOMEM;
+ break;
+ }
+
+ if ((err = register_netdev(dev))) {
+ free_netdev(dev);
+ break;
+ }
+
+ master = netdev_priv(dev);
+
+ strlcpy(master->qops.id, dev->name, IFNAMSIZ);
+ err = register_qdisc(&master->qops);
+
+ if (err) {
+ unregister_netdev(dev);
+ free_netdev(dev);
+ break;
+ }
+
+ list_add_tail(&master->master_list, &master_dev_list);
+ }
+ return i ? 0 : err;
+}
+
+static void __exit teql_exit(void)
+{
+ struct teql_master *master, *nxt;
+
+ list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
+
+ list_del(&master->master_list);
+
+ unregister_qdisc(&master->qops);
+ unregister_netdev(master->dev);
+ free_netdev(master->dev);
+ }
+}
+
+module_init(teql_init);
+module_exit(teql_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
new file mode 100644
index 000000000..71c1a598d
--- /dev/null
+++ b/net/sctp/Kconfig
@@ -0,0 +1,103 @@
+#
+# SCTP configuration
+#
+
+menuconfig IP_SCTP
+ tristate "The SCTP Protocol"
+ depends on INET
+ depends on IPV6 || IPV6=n
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_SHA1
+ select LIBCRC32C
+ ---help---
+ Stream Control Transmission Protocol
+
+ From RFC 2960 <http://www.ietf.org/rfc/rfc2960.txt>.
+
+ "SCTP is a reliable transport protocol operating on top of a
+ connectionless packet network such as IP. It offers the following
+ services to its users:
+
+ -- acknowledged error-free non-duplicated transfer of user data,
+ -- data fragmentation to conform to discovered path MTU size,
+ -- sequenced delivery of user messages within multiple streams,
+ with an option for order-of-arrival delivery of individual user
+ messages,
+ -- optional bundling of multiple user messages into a single SCTP
+ packet, and
+ -- network-level fault tolerance through supporting of multi-
+ homing at either or both ends of an association."
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called sctp. Debug messages are handeled by the
+ kernel's dynamic debugging framework.
+
+ If in doubt, say N.
+
+if IP_SCTP
+
+config NET_SCTPPROBE
+ tristate "SCTP: Association probing"
+ depends on PROC_FS && KPROBES
+ ---help---
+ This module allows for capturing the changes to SCTP association
+ state in response to incoming packets. It is used for debugging
+ SCTP congestion control algorithms. If you don't understand
+ what was just said, you don't need it: say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called sctp_probe.
+
+config SCTP_DBG_OBJCNT
+ bool "SCTP: Debug object counts"
+ depends on PROC_FS
+ help
+ If you say Y, this will enable debugging support for counting the
+ type of objects that are currently allocated. This is useful for
+ identifying memory leaks. This debug information can be viewed by
+ 'cat /proc/net/sctp/sctp_dbg_objcnt'
+
+ If unsure, say N
+choice
+ prompt "Default SCTP cookie HMAC encoding"
+ default SCTP_DEFAULT_COOKIE_HMAC_MD5
+ help
+ This option sets the default sctp cookie hmac algorithm
+ when in doubt select 'md5'
+
+config SCTP_DEFAULT_COOKIE_HMAC_MD5
+ bool "Enable optional MD5 hmac cookie generation"
+ help
+ Enable optional MD5 hmac based SCTP cookie generation
+ select SCTP_COOKIE_HMAC_MD5
+
+config SCTP_DEFAULT_COOKIE_HMAC_SHA1
+ bool "Enable optional SHA1 hmac cookie generation"
+ help
+ Enable optional SHA1 hmac based SCTP cookie generation
+ select SCTP_COOKIE_HMAC_SHA1
+
+config SCTP_DEFAULT_COOKIE_HMAC_NONE
+ bool "Use no hmac alg in SCTP cookie generation"
+ help
+ Use no hmac algorithm in SCTP cookie generation
+
+endchoice
+
+config SCTP_COOKIE_HMAC_MD5
+ bool "Enable optional MD5 hmac cookie generation"
+ help
+ Enable optional MD5 hmac based SCTP cookie generation
+ select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5
+ select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5
+
+config SCTP_COOKIE_HMAC_SHA1
+ bool "Enable optional SHA1 hmac cookie generation"
+ help
+ Enable optional SHA1 hmac based SCTP cookie generation
+ select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
+ select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
+
+
+endif # IP_SCTP
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
new file mode 100644
index 000000000..3b4ffb021
--- /dev/null
+++ b/net/sctp/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for SCTP support code.
+#
+
+obj-$(CONFIG_IP_SCTP) += sctp.o
+obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
+
+sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
+ protocol.o endpointola.o associola.o \
+ transport.o chunk.o sm_make_chunk.o ulpevent.o \
+ inqueue.o outqueue.o ulpqueue.o \
+ tsnmap.o bind_addr.o socket.o primitive.o \
+ output.o input.o debug.o ssnmap.o auth.o
+
+sctp_probe-y := probe.o
+
+sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
+sctp-$(CONFIG_PROC_FS) += proc.o
+sctp-$(CONFIG_SYSCTL) += sysctl.o
+
+sctp-$(subst m,y,$(CONFIG_IPV6)) += ipv6.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
new file mode 100644
index 000000000..197c3f59e
--- /dev/null
+++ b/net/sctp/associola.c
@@ -0,0 +1,1691 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This module provides the abstraction for an SCTP association.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Ryan Layer <rmlayer@us.ibm.com>
+ * Kevin Gao <kevin.gao@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal functions. */
+static void sctp_select_active_and_retran_path(struct sctp_association *asoc);
+static void sctp_assoc_bh_rcv(struct work_struct *work);
+static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc);
+static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc);
+
+/* 1st Level Abstractions. */
+
+/* Initialize a new association from provided memory. */
+static struct sctp_association *sctp_association_init(struct sctp_association *asoc,
+ const struct sctp_endpoint *ep,
+ const struct sock *sk,
+ sctp_scope_t scope,
+ gfp_t gfp)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ int i;
+ sctp_paramhdr_t *p;
+ int err;
+
+ /* Retrieve the SCTP per socket area. */
+ sp = sctp_sk((struct sock *)sk);
+
+ /* Discarding const is appropriate here. */
+ asoc->ep = (struct sctp_endpoint *)ep;
+ asoc->base.sk = (struct sock *)sk;
+
+ sctp_endpoint_hold(asoc->ep);
+ sock_hold(asoc->base.sk);
+
+ /* Initialize the common base substructure. */
+ asoc->base.type = SCTP_EP_TYPE_ASSOCIATION;
+
+ /* Initialize the object handling fields. */
+ atomic_set(&asoc->base.refcnt, 1);
+
+ /* Initialize the bind addr area. */
+ sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);
+
+ asoc->state = SCTP_STATE_CLOSED;
+ asoc->cookie_life = ms_to_ktime(sp->assocparams.sasoc_cookie_life);
+ asoc->user_frag = sp->user_frag;
+
+ /* Set the association max_retrans and RTO values from the
+ * socket values.
+ */
+ asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+ asoc->pf_retrans = net->sctp.pf_retrans;
+
+ asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
+ asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
+ asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
+
+ /* Initialize the association's heartbeat interval based on the
+ * sock configured value.
+ */
+ asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+
+ /* Initialize path max retrans value. */
+ asoc->pathmaxrxt = sp->pathmaxrxt;
+
+ /* Initialize default path MTU. */
+ asoc->pathmtu = sp->pathmtu;
+
+ /* Set association default SACK delay */
+ asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
+ asoc->sackfreq = sp->sackfreq;
+
+ /* Set the association default flags controlling
+ * Heartbeat, SACK delay, and Path MTU Discovery.
+ */
+ asoc->param_flags = sp->param_flags;
+
+ /* Initialize the maximum number of new data packets that can be sent
+ * in a burst.
+ */
+ asoc->max_burst = sp->max_burst;
+
+ /* initialize association timers */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;
+
+ /* sctpimpguide Section 2.12.2
+ * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
+ * recommended value of 5 times 'RTO.Max'.
+ */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
+ = 5 * asoc->rto_max;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
+
+ /* Initializes the timers */
+ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
+ setup_timer(&asoc->timers[i], sctp_timer_events[i],
+ (unsigned long)asoc);
+
+ /* Pull default initialization values from the sock options.
+ * Note: This assumes that the values have already been
+ * validated in the sock.
+ */
+ asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams;
+ asoc->c.sinit_num_ostreams = sp->initmsg.sinit_num_ostreams;
+ asoc->max_init_attempts = sp->initmsg.sinit_max_attempts;
+
+ asoc->max_init_timeo =
+ msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo);
+
+ /* Set the local window size for receive.
+ * This is also the rcvbuf space per association.
+ * RFC 6 - A SCTP receiver MUST be able to receive a minimum of
+ * 1500 bytes in one SCTP packet.
+ */
+ if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
+ asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
+ else
+ asoc->rwnd = sk->sk_rcvbuf/2;
+
+ asoc->a_rwnd = asoc->rwnd;
+
+ /* Use my own max window until I learn something better. */
+ asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW;
+
+ /* Initialize the receive memory counter */
+ atomic_set(&asoc->rmem_alloc, 0);
+
+ init_waitqueue_head(&asoc->wait);
+
+ asoc->c.my_vtag = sctp_generate_tag(ep);
+ asoc->c.my_port = ep->base.bind_addr.port;
+
+ asoc->c.initial_tsn = sctp_generate_tsn(ep);
+
+ asoc->next_tsn = asoc->c.initial_tsn;
+
+ asoc->ctsn_ack_point = asoc->next_tsn - 1;
+ asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
+ asoc->highest_sacked = asoc->ctsn_ack_point;
+ asoc->last_cwr_tsn = asoc->ctsn_ack_point;
+
+ /* ADDIP Section 4.1 Asconf Chunk Procedures
+ *
+ * When an endpoint has an ASCONF signaled change to be sent to the
+ * remote endpoint it should do the following:
+ * ...
+ * A2) a serial number should be assigned to the chunk. The serial
+ * number SHOULD be a monotonically increasing number. The serial
+ * numbers SHOULD be initialized at the start of the
+ * association to the same value as the initial TSN.
+ */
+ asoc->addip_serial = asoc->c.initial_tsn;
+
+ INIT_LIST_HEAD(&asoc->addip_chunk_list);
+ INIT_LIST_HEAD(&asoc->asconf_ack_list);
+
+ /* Make an empty list of remote transport addresses. */
+ INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * After the reception of the first data chunk in an
+ * association the endpoint must immediately respond with a
+ * sack to acknowledge the data chunk. Subsequent
+ * acknowledgements should be done as described in Section
+ * 6.2.
+ *
+ * [We implement this by telling a new association that it
+ * already received one packet.]
+ */
+ asoc->peer.sack_needed = 1;
+ asoc->peer.sack_generation = 1;
+
+ /* Assume that the peer will tell us if he recognizes ASCONF
+ * as part of INIT exchange.
+ * The sctp_addip_noauth option is there for backward compatibility
+ * and will revert old behavior.
+ */
+ if (net->sctp.addip_noauth)
+ asoc->peer.asconf_capable = 1;
+
+ /* Create an input queue. */
+ sctp_inq_init(&asoc->base.inqueue);
+ sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);
+
+ /* Create an output queue. */
+ sctp_outq_init(asoc, &asoc->outqueue);
+
+ if (!sctp_ulpq_init(&asoc->ulpq, asoc))
+ goto fail_init;
+
+ /* Assume that peer would support both address types unless we are
+ * told otherwise.
+ */
+ asoc->peer.ipv4_address = 1;
+ if (asoc->base.sk->sk_family == PF_INET6)
+ asoc->peer.ipv6_address = 1;
+ INIT_LIST_HEAD(&asoc->asocs);
+
+ asoc->default_stream = sp->default_stream;
+ asoc->default_ppid = sp->default_ppid;
+ asoc->default_flags = sp->default_flags;
+ asoc->default_context = sp->default_context;
+ asoc->default_timetolive = sp->default_timetolive;
+ asoc->default_rcv_context = sp->default_rcv_context;
+
+ /* AUTH related initializations */
+ INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
+ err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp);
+ if (err)
+ goto fail_init;
+
+ asoc->active_key_id = ep->active_key_id;
+
+ /* Save the hmacs and chunks list into this association */
+ if (ep->auth_hmacs_list)
+ memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list,
+ ntohs(ep->auth_hmacs_list->param_hdr.length));
+ if (ep->auth_chunk_list)
+ memcpy(asoc->c.auth_chunks, ep->auth_chunk_list,
+ ntohs(ep->auth_chunk_list->param_hdr.length));
+
+ /* Get the AUTH random number for this association */
+ p = (sctp_paramhdr_t *)asoc->c.auth_random;
+ p->type = SCTP_PARAM_RANDOM;
+ p->length = htons(sizeof(sctp_paramhdr_t) + SCTP_AUTH_RANDOM_LENGTH);
+ get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH);
+
+ return asoc;
+
+fail_init:
+ sock_put(asoc->base.sk);
+ sctp_endpoint_put(asoc->ep);
+ return NULL;
+}
+
+/* Allocate and initialize a new association */
+struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
+ const struct sock *sk,
+ sctp_scope_t scope,
+ gfp_t gfp)
+{
+ struct sctp_association *asoc;
+
+ asoc = kzalloc(sizeof(*asoc), gfp);
+ if (!asoc)
+ goto fail;
+
+ if (!sctp_association_init(asoc, ep, sk, scope, gfp))
+ goto fail_init;
+
+ SCTP_DBG_OBJCNT_INC(assoc);
+
+ pr_debug("Created asoc %p\n", asoc);
+
+ return asoc;
+
+fail_init:
+ kfree(asoc);
+fail:
+ return NULL;
+}
+
+/* Free this association if possible. There may still be users, so
+ * the actual deallocation may be delayed.
+ */
+void sctp_association_free(struct sctp_association *asoc)
+{
+ struct sock *sk = asoc->base.sk;
+ struct sctp_transport *transport;
+ struct list_head *pos, *temp;
+ int i;
+
+ /* Only real associations count against the endpoint, so
+ * don't bother for if this is a temporary association.
+ */
+ if (!list_empty(&asoc->asocs)) {
+ list_del(&asoc->asocs);
+
+ /* Decrement the backlog value for a TCP-style listening
+ * socket.
+ */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+ sk->sk_ack_backlog--;
+ }
+
+ /* Mark as dead, so other users can know this structure is
+ * going away.
+ */
+ asoc->base.dead = true;
+
+ /* Dispose of any data lying around in the outqueue. */
+ sctp_outq_free(&asoc->outqueue);
+
+ /* Dispose of any pending messages for the upper layer. */
+ sctp_ulpq_free(&asoc->ulpq);
+
+ /* Dispose of any pending chunks on the inqueue. */
+ sctp_inq_free(&asoc->base.inqueue);
+
+ sctp_tsnmap_free(&asoc->peer.tsn_map);
+
+ /* Free ssnmap storage. */
+ sctp_ssnmap_free(asoc->ssnmap);
+
+ /* Clean up the bound address list. */
+ sctp_bind_addr_free(&asoc->base.bind_addr);
+
+ /* Do we need to go through all of our timers and
+ * delete them? To be safe we will try to delete all, but we
+ * should be able to go through and make a guess based
+ * on our state.
+ */
+ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
+ if (del_timer(&asoc->timers[i]))
+ sctp_association_put(asoc);
+ }
+
+ /* Free peer's cached cookie. */
+ kfree(asoc->peer.cookie);
+ kfree(asoc->peer.peer_random);
+ kfree(asoc->peer.peer_chunks);
+ kfree(asoc->peer.peer_hmacs);
+
+ /* Release the transport structures. */
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ transport = list_entry(pos, struct sctp_transport, transports);
+ list_del_rcu(pos);
+ sctp_transport_free(transport);
+ }
+
+ asoc->peer.transport_count = 0;
+
+ sctp_asconf_queue_teardown(asoc);
+
+ /* Free pending address space being deleted */
+ kfree(asoc->asconf_addr_del_pending);
+
+ /* AUTH - Free the endpoint shared keys */
+ sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);
+
+ /* AUTH - Free the association shared key */
+ sctp_auth_key_put(asoc->asoc_shared_key);
+
+ sctp_association_put(asoc);
+}
+
+/* Cleanup and free up an association. */
+static void sctp_association_destroy(struct sctp_association *asoc)
+{
+ if (unlikely(!asoc->base.dead)) {
+ WARN(1, "Attempt to destroy undead association %p!\n", asoc);
+ return;
+ }
+
+ sctp_endpoint_put(asoc->ep);
+ sock_put(asoc->base.sk);
+
+ if (asoc->assoc_id != 0) {
+ spin_lock_bh(&sctp_assocs_id_lock);
+ idr_remove(&sctp_assocs_id, asoc->assoc_id);
+ spin_unlock_bh(&sctp_assocs_id_lock);
+ }
+
+ WARN_ON(atomic_read(&asoc->rmem_alloc));
+
+ kfree(asoc);
+ SCTP_DBG_OBJCNT_DEC(assoc);
+}
+
+/* Change the primary destination address for the peer. */
+void sctp_assoc_set_primary(struct sctp_association *asoc,
+ struct sctp_transport *transport)
+{
+ int changeover = 0;
+
+ /* it's a changeover only if we already have a primary path
+ * that we are changing
+ */
+ if (asoc->peer.primary_path != NULL &&
+ asoc->peer.primary_path != transport)
+ changeover = 1 ;
+
+ asoc->peer.primary_path = transport;
+
+ /* Set a default msg_name for events. */
+ memcpy(&asoc->peer.primary_addr, &transport->ipaddr,
+ sizeof(union sctp_addr));
+
+ /* If the primary path is changing, assume that the
+ * user wants to use this new path.
+ */
+ if ((transport->state == SCTP_ACTIVE) ||
+ (transport->state == SCTP_UNKNOWN))
+ asoc->peer.active_path = transport;
+
+ /*
+ * SFR-CACC algorithm:
+ * Upon the receipt of a request to change the primary
+ * destination address, on the data structure for the new
+ * primary destination, the sender MUST do the following:
+ *
+ * 1) If CHANGEOVER_ACTIVE is set, then there was a switch
+ * to this destination address earlier. The sender MUST set
+ * CYCLING_CHANGEOVER to indicate that this switch is a
+ * double switch to the same destination address.
+ *
+ * Really, only bother is we have data queued or outstanding on
+ * the association.
+ */
+ if (!asoc->outqueue.outstanding_bytes && !asoc->outqueue.out_qlen)
+ return;
+
+ if (transport->cacc.changeover_active)
+ transport->cacc.cycling_changeover = changeover;
+
+ /* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
+ * a changeover has occurred.
+ */
+ transport->cacc.changeover_active = changeover;
+
+ /* 3) The sender MUST store the next TSN to be sent in
+ * next_tsn_at_change.
+ */
+ transport->cacc.next_tsn_at_change = asoc->next_tsn;
+}
+
+/* Remove a transport from an association. */
+void sctp_assoc_rm_peer(struct sctp_association *asoc,
+ struct sctp_transport *peer)
+{
+ struct list_head *pos;
+ struct sctp_transport *transport;
+
+ pr_debug("%s: association:%p addr:%pISpc\n",
+ __func__, asoc, &peer->ipaddr.sa);
+
+ /* If we are to remove the current retran_path, update it
+ * to the next peer before removing this peer from the list.
+ */
+ if (asoc->peer.retran_path == peer)
+ sctp_assoc_update_retran_path(asoc);
+
+ /* Remove this peer from the list. */
+ list_del_rcu(&peer->transports);
+
+ /* Get the first transport of asoc. */
+ pos = asoc->peer.transport_addr_list.next;
+ transport = list_entry(pos, struct sctp_transport, transports);
+
+ /* Update any entries that match the peer to be deleted. */
+ if (asoc->peer.primary_path == peer)
+ sctp_assoc_set_primary(asoc, transport);
+ if (asoc->peer.active_path == peer)
+ asoc->peer.active_path = transport;
+ if (asoc->peer.retran_path == peer)
+ asoc->peer.retran_path = transport;
+ if (asoc->peer.last_data_from == peer)
+ asoc->peer.last_data_from = transport;
+
+ /* If we remove the transport an INIT was last sent to, set it to
+ * NULL. Combined with the update of the retran path above, this
+ * will cause the next INIT to be sent to the next available
+ * transport, maintaining the cycle.
+ */
+ if (asoc->init_last_sent_to == peer)
+ asoc->init_last_sent_to = NULL;
+
+ /* If we remove the transport an SHUTDOWN was last sent to, set it
+ * to NULL. Combined with the update of the retran path above, this
+ * will cause the next SHUTDOWN to be sent to the next available
+ * transport, maintaining the cycle.
+ */
+ if (asoc->shutdown_last_sent_to == peer)
+ asoc->shutdown_last_sent_to = NULL;
+
+ /* If we remove the transport an ASCONF was last sent to, set it to
+ * NULL.
+ */
+ if (asoc->addip_last_asconf &&
+ asoc->addip_last_asconf->transport == peer)
+ asoc->addip_last_asconf->transport = NULL;
+
+ /* If we have something on the transmitted list, we have to
+ * save it off. The best place is the active path.
+ */
+ if (!list_empty(&peer->transmitted)) {
+ struct sctp_transport *active = asoc->peer.active_path;
+ struct sctp_chunk *ch;
+
+ /* Reset the transport of each chunk on this list */
+ list_for_each_entry(ch, &peer->transmitted,
+ transmitted_list) {
+ ch->transport = NULL;
+ ch->rtt_in_progress = 0;
+ }
+
+ list_splice_tail_init(&peer->transmitted,
+ &active->transmitted);
+
+ /* Start a T3 timer here in case it wasn't running so
+ * that these migrated packets have a chance to get
+ * retransmitted.
+ */
+ if (!timer_pending(&active->T3_rtx_timer))
+ if (!mod_timer(&active->T3_rtx_timer,
+ jiffies + active->rto))
+ sctp_transport_hold(active);
+ }
+
+ asoc->peer.transport_count--;
+
+ sctp_transport_free(peer);
+}
+
+/* Add a transport address to an association. */
+struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
+ const union sctp_addr *addr,
+ const gfp_t gfp,
+ const int peer_state)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ struct sctp_transport *peer;
+ struct sctp_sock *sp;
+ unsigned short port;
+
+ sp = sctp_sk(asoc->base.sk);
+
+ /* AF_INET and AF_INET6 share common port field. */
+ port = ntohs(addr->v4.sin_port);
+
+ pr_debug("%s: association:%p addr:%pISpc state:%d\n", __func__,
+ asoc, &addr->sa, peer_state);
+
+ /* Set the port if it has not been set yet. */
+ if (0 == asoc->peer.port)
+ asoc->peer.port = port;
+
+ /* Check to see if this is a duplicate. */
+ peer = sctp_assoc_lookup_paddr(asoc, addr);
+ if (peer) {
+ /* An UNKNOWN state is only set on transports added by
+ * user in sctp_connectx() call. Such transports should be
+ * considered CONFIRMED per RFC 4960, Section 5.4.
+ */
+ if (peer->state == SCTP_UNKNOWN) {
+ peer->state = SCTP_ACTIVE;
+ }
+ return peer;
+ }
+
+ peer = sctp_transport_new(net, addr, gfp);
+ if (!peer)
+ return NULL;
+
+ sctp_transport_set_owner(peer, asoc);
+
+ /* Initialize the peer's heartbeat interval based on the
+ * association configured value.
+ */
+ peer->hbinterval = asoc->hbinterval;
+
+ /* Set the path max_retrans. */
+ peer->pathmaxrxt = asoc->pathmaxrxt;
+
+ /* And the partial failure retrans threshold */
+ peer->pf_retrans = asoc->pf_retrans;
+
+ /* Initialize the peer's SACK delay timeout based on the
+ * association configured value.
+ */
+ peer->sackdelay = asoc->sackdelay;
+ peer->sackfreq = asoc->sackfreq;
+
+ /* Enable/disable heartbeat, SACK delay, and path MTU discovery
+ * based on association setting.
+ */
+ peer->param_flags = asoc->param_flags;
+
+ sctp_transport_route(peer, NULL, sp);
+
+ /* Initialize the pmtu of the transport. */
+ if (peer->param_flags & SPP_PMTUD_DISABLE) {
+ if (asoc->pathmtu)
+ peer->pathmtu = asoc->pathmtu;
+ else
+ peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+ }
+
+ /* If this is the first transport addr on this association,
+ * initialize the association PMTU to the peer's PMTU.
+ * If not and the current association PMTU is higher than the new
+ * peer's PMTU, reset the association PMTU to the new peer's PMTU.
+ */
+ if (asoc->pathmtu)
+ asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
+ else
+ asoc->pathmtu = peer->pathmtu;
+
+ pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc,
+ asoc->pathmtu);
+
+ peer->pmtu_pending = 0;
+
+ asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+
+ /* The asoc->peer.port might not be meaningful yet, but
+ * initialize the packet structure anyway.
+ */
+ sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port,
+ asoc->peer.port);
+
+ /* 7.2.1 Slow-Start
+ *
+ * o The initial cwnd before DATA transmission or after a sufficiently
+ * long idle period MUST be set to
+ * min(4*MTU, max(2*MTU, 4380 bytes))
+ *
+ * o The initial value of ssthresh MAY be arbitrarily high
+ * (for example, implementations MAY use the size of the
+ * receiver advertised window).
+ */
+ peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
+
+ /* At this point, we may not have the receiver's advertised window,
+ * so initialize ssthresh to the default value and it will be set
+ * later when we process the INIT.
+ */
+ peer->ssthresh = SCTP_DEFAULT_MAXWINDOW;
+
+ peer->partial_bytes_acked = 0;
+ peer->flight_size = 0;
+ peer->burst_limited = 0;
+
+ /* Set the transport's RTO.initial value */
+ peer->rto = asoc->rto_initial;
+ sctp_max_rto(asoc, peer);
+
+ /* Set the peer's active state. */
+ peer->state = peer_state;
+
+ /* Attach the remote transport to our asoc. */
+ list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
+ asoc->peer.transport_count++;
+
+ /* If we do not yet have a primary path, set one. */
+ if (!asoc->peer.primary_path) {
+ sctp_assoc_set_primary(asoc, peer);
+ asoc->peer.retran_path = peer;
+ }
+
+ if (asoc->peer.active_path == asoc->peer.retran_path &&
+ peer->state != SCTP_UNCONFIRMED) {
+ asoc->peer.retran_path = peer;
+ }
+
+ return peer;
+}
+
+/* Delete a transport address from an association. */
+void sctp_assoc_del_peer(struct sctp_association *asoc,
+ const union sctp_addr *addr)
+{
+ struct list_head *pos;
+ struct list_head *temp;
+ struct sctp_transport *transport;
+
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ transport = list_entry(pos, struct sctp_transport, transports);
+ if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
+ /* Do book keeping for removing the peer and free it. */
+ sctp_assoc_rm_peer(asoc, transport);
+ break;
+ }
+ }
+}
+
+/* Lookup a transport by address. */
+struct sctp_transport *sctp_assoc_lookup_paddr(
+ const struct sctp_association *asoc,
+ const union sctp_addr *address)
+{
+ struct sctp_transport *t;
+
+ /* Cycle through all transports searching for a peer address. */
+
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ if (sctp_cmp_addr_exact(address, &t->ipaddr))
+ return t;
+ }
+
+ return NULL;
+}
+
+/* Remove all transports except a give one */
+void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc,
+ struct sctp_transport *primary)
+{
+ struct sctp_transport *temp;
+ struct sctp_transport *t;
+
+ list_for_each_entry_safe(t, temp, &asoc->peer.transport_addr_list,
+ transports) {
+ /* if the current transport is not the primary one, delete it */
+ if (t != primary)
+ sctp_assoc_rm_peer(asoc, t);
+ }
+}
+
+/* Engage in transport control operations.
+ * Mark the transport up or down and send a notification to the user.
+ * Select and update the new active and retran paths.
+ */
+void sctp_assoc_control_transport(struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ sctp_transport_cmd_t command,
+ sctp_sn_error_t error)
+{
+ struct sctp_ulpevent *event;
+ struct sockaddr_storage addr;
+ int spc_state = 0;
+ bool ulp_notify = true;
+
+ /* Record the transition on the transport. */
+ switch (command) {
+ case SCTP_TRANSPORT_UP:
+ /* If we are moving from UNCONFIRMED state due
+ * to heartbeat success, report the SCTP_ADDR_CONFIRMED
+ * state to the user, otherwise report SCTP_ADDR_AVAILABLE.
+ */
+ if (SCTP_UNCONFIRMED == transport->state &&
+ SCTP_HEARTBEAT_SUCCESS == error)
+ spc_state = SCTP_ADDR_CONFIRMED;
+ else
+ spc_state = SCTP_ADDR_AVAILABLE;
+ /* Don't inform ULP about transition from PF to
+ * active state and set cwnd to 1 MTU, see SCTP
+ * Quick failover draft section 5.1, point 5
+ */
+ if (transport->state == SCTP_PF) {
+ ulp_notify = false;
+ transport->cwnd = asoc->pathmtu;
+ }
+ transport->state = SCTP_ACTIVE;
+ break;
+
+ case SCTP_TRANSPORT_DOWN:
+ /* If the transport was never confirmed, do not transition it
+ * to inactive state. Also, release the cached route since
+ * there may be a better route next time.
+ */
+ if (transport->state != SCTP_UNCONFIRMED)
+ transport->state = SCTP_INACTIVE;
+ else {
+ dst_release(transport->dst);
+ transport->dst = NULL;
+ ulp_notify = false;
+ }
+
+ spc_state = SCTP_ADDR_UNREACHABLE;
+ break;
+
+ case SCTP_TRANSPORT_PF:
+ transport->state = SCTP_PF;
+ ulp_notify = false;
+ break;
+
+ default:
+ return;
+ }
+
+ /* Generate and send a SCTP_PEER_ADDR_CHANGE notification
+ * to the user.
+ */
+ if (ulp_notify) {
+ memset(&addr, 0, sizeof(struct sockaddr_storage));
+ memcpy(&addr, &transport->ipaddr,
+ transport->af_specific->sockaddr_len);
+
+ event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+ 0, spc_state, error, GFP_ATOMIC);
+ if (event)
+ sctp_ulpq_tail_event(&asoc->ulpq, event);
+ }
+
+ /* Select new active and retran paths. */
+ sctp_select_active_and_retran_path(asoc);
+}
+
+/* Hold a reference to an association. */
+void sctp_association_hold(struct sctp_association *asoc)
+{
+ atomic_inc(&asoc->base.refcnt);
+}
+
+/* Release a reference to an association and cleanup
+ * if there are no more references.
+ */
+void sctp_association_put(struct sctp_association *asoc)
+{
+ if (atomic_dec_and_test(&asoc->base.refcnt))
+ sctp_association_destroy(asoc);
+}
+
+/* Allocate the next TSN, Transmission Sequence Number, for the given
+ * association.
+ */
+__u32 sctp_association_get_next_tsn(struct sctp_association *asoc)
+{
+ /* From Section 1.6 Serial Number Arithmetic:
+ * Transmission Sequence Numbers wrap around when they reach
+ * 2**32 - 1. That is, the next TSN a DATA chunk MUST use
+ * after transmitting TSN = 2*32 - 1 is TSN = 0.
+ */
+ __u32 retval = asoc->next_tsn;
+ asoc->next_tsn++;
+ asoc->unack_data++;
+
+ return retval;
+}
+
+/* Compare two addresses to see if they match. Wildcard addresses
+ * only match themselves.
+ */
+int sctp_cmp_addr_exact(const union sctp_addr *ss1,
+ const union sctp_addr *ss2)
+{
+ struct sctp_af *af;
+
+ af = sctp_get_af_specific(ss1->sa.sa_family);
+ if (unlikely(!af))
+ return 0;
+
+ return af->cmp_addr(ss1, ss2);
+}
+
+/* Return an ecne chunk to get prepended to a packet.
+ * Note: We are sly and return a shared, prealloced chunk. FIXME:
+ * No we don't, but we could/should.
+ */
+struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc)
+{
+ if (!asoc->need_ecne)
+ return NULL;
+
+ /* Send ECNE if needed.
+ * Not being able to allocate a chunk here is not deadly.
+ */
+ return sctp_make_ecne(asoc, asoc->last_ecne_tsn);
+}
+
+/*
+ * Find which transport this TSN was sent on.
+ */
+struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc,
+ __u32 tsn)
+{
+ struct sctp_transport *active;
+ struct sctp_transport *match;
+ struct sctp_transport *transport;
+ struct sctp_chunk *chunk;
+ __be32 key = htonl(tsn);
+
+ match = NULL;
+
+ /*
+ * FIXME: In general, find a more efficient data structure for
+ * searching.
+ */
+
+ /*
+ * The general strategy is to search each transport's transmitted
+ * list. Return which transport this TSN lives on.
+ *
+ * Let's be hopeful and check the active_path first.
+ * Another optimization would be to know if there is only one
+ * outbound path and not have to look for the TSN at all.
+ *
+ */
+
+ active = asoc->peer.active_path;
+
+ list_for_each_entry(chunk, &active->transmitted,
+ transmitted_list) {
+
+ if (key == chunk->subh.data_hdr->tsn) {
+ match = active;
+ goto out;
+ }
+ }
+
+ /* If not found, go search all the other transports. */
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+
+ if (transport == active)
+ continue;
+ list_for_each_entry(chunk, &transport->transmitted,
+ transmitted_list) {
+ if (key == chunk->subh.data_hdr->tsn) {
+ match = transport;
+ goto out;
+ }
+ }
+ }
+out:
+ return match;
+}
+
+/* Is this the association we are looking for? */
+struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
+ struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr)
+{
+ struct sctp_transport *transport;
+
+ if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
+ (htons(asoc->peer.port) == paddr->v4.sin_port) &&
+ net_eq(sock_net(asoc->base.sk), net)) {
+ transport = sctp_assoc_lookup_paddr(asoc, paddr);
+ if (!transport)
+ goto out;
+
+ if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
+ sctp_sk(asoc->base.sk)))
+ goto out;
+ }
+ transport = NULL;
+
+out:
+ return transport;
+}
+
+/* Do delayed input processing. This is scheduled by sctp_rcv(). */
+static void sctp_assoc_bh_rcv(struct work_struct *work)
+{
+ struct sctp_association *asoc =
+ container_of(work, struct sctp_association,
+ base.inqueue.immediate);
+ struct net *net = sock_net(asoc->base.sk);
+ struct sctp_endpoint *ep;
+ struct sctp_chunk *chunk;
+ struct sctp_inq *inqueue;
+ int state;
+ sctp_subtype_t subtype;
+ int error = 0;
+
+ /* The association should be held so we should be safe. */
+ ep = asoc->ep;
+
+ inqueue = &asoc->base.inqueue;
+ sctp_association_hold(asoc);
+ while (NULL != (chunk = sctp_inq_pop(inqueue))) {
+ state = asoc->state;
+ subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
+
+ /* SCTP-AUTH, Section 6.3:
+ * The receiver has a list of chunk types which it expects
+ * to be received only after an AUTH-chunk. This list has
+ * been sent to the peer during the association setup. It
+ * MUST silently discard these chunks if they are not placed
+ * after an AUTH chunk in the packet.
+ */
+ if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
+ continue;
+
+ /* Remember where the last DATA chunk came from so we
+ * know where to send the SACK.
+ */
+ if (sctp_chunk_is_data(chunk))
+ asoc->peer.last_data_from = chunk->transport;
+ else {
+ SCTP_INC_STATS(net, SCTP_MIB_INCTRLCHUNKS);
+ asoc->stats.ictrlchunks++;
+ if (chunk->chunk_hdr->type == SCTP_CID_SACK)
+ asoc->stats.isacks++;
+ }
+
+ if (chunk->transport)
+ chunk->transport->last_time_heard = ktime_get();
+
+ /* Run through the state machine. */
+ error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype,
+ state, ep, asoc, chunk, GFP_ATOMIC);
+
+ /* Check to see if the association is freed in response to
+ * the incoming chunk. If so, get out of the while loop.
+ */
+ if (asoc->base.dead)
+ break;
+
+ /* If there is an error on chunk, discard this packet. */
+ if (error && chunk)
+ chunk->pdiscard = 1;
+ }
+ sctp_association_put(asoc);
+}
+
+/* This routine moves an association from its old sk to a new sk. */
+void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk)
+{
+ struct sctp_sock *newsp = sctp_sk(newsk);
+ struct sock *oldsk = assoc->base.sk;
+
+ /* Delete the association from the old endpoint's list of
+ * associations.
+ */
+ list_del_init(&assoc->asocs);
+
+ /* Decrement the backlog value for a TCP-style socket. */
+ if (sctp_style(oldsk, TCP))
+ oldsk->sk_ack_backlog--;
+
+ /* Release references to the old endpoint and the sock. */
+ sctp_endpoint_put(assoc->ep);
+ sock_put(assoc->base.sk);
+
+ /* Get a reference to the new endpoint. */
+ assoc->ep = newsp->ep;
+ sctp_endpoint_hold(assoc->ep);
+
+ /* Get a reference to the new sock. */
+ assoc->base.sk = newsk;
+ sock_hold(assoc->base.sk);
+
+ /* Add the association to the new endpoint's list of associations. */
+ sctp_endpoint_add_asoc(newsp->ep, assoc);
+}
+
+/* Update an association (possibly from unexpected COOKIE-ECHO processing). */
+void sctp_assoc_update(struct sctp_association *asoc,
+ struct sctp_association *new)
+{
+ struct sctp_transport *trans;
+ struct list_head *pos, *temp;
+
+ /* Copy in new parameters of peer. */
+ asoc->c = new->c;
+ asoc->peer.rwnd = new->peer.rwnd;
+ asoc->peer.sack_needed = new->peer.sack_needed;
+ asoc->peer.auth_capable = new->peer.auth_capable;
+ asoc->peer.i = new->peer.i;
+ sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+ asoc->peer.i.initial_tsn, GFP_ATOMIC);
+
+ /* Remove any peer addresses not present in the new association. */
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ trans = list_entry(pos, struct sctp_transport, transports);
+ if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) {
+ sctp_assoc_rm_peer(asoc, trans);
+ continue;
+ }
+
+ if (asoc->state >= SCTP_STATE_ESTABLISHED)
+ sctp_transport_reset(trans);
+ }
+
+ /* If the case is A (association restart), use
+ * initial_tsn as next_tsn. If the case is B, use
+ * current next_tsn in case data sent to peer
+ * has been discarded and needs retransmission.
+ */
+ if (asoc->state >= SCTP_STATE_ESTABLISHED) {
+ asoc->next_tsn = new->next_tsn;
+ asoc->ctsn_ack_point = new->ctsn_ack_point;
+ asoc->adv_peer_ack_point = new->adv_peer_ack_point;
+
+ /* Reinitialize SSN for both local streams
+ * and peer's streams.
+ */
+ sctp_ssnmap_clear(asoc->ssnmap);
+
+ /* Flush the ULP reassembly and ordered queue.
+ * Any data there will now be stale and will
+ * cause problems.
+ */
+ sctp_ulpq_flush(&asoc->ulpq);
+
+ /* reset the overall association error count so
+ * that the restarted association doesn't get torn
+ * down on the next retransmission timer.
+ */
+ asoc->overall_error_count = 0;
+
+ } else {
+ /* Add any peer addresses from the new association. */
+ list_for_each_entry(trans, &new->peer.transport_addr_list,
+ transports) {
+ if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
+ sctp_assoc_add_peer(asoc, &trans->ipaddr,
+ GFP_ATOMIC, trans->state);
+ }
+
+ asoc->ctsn_ack_point = asoc->next_tsn - 1;
+ asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
+ if (!asoc->ssnmap) {
+ /* Move the ssnmap. */
+ asoc->ssnmap = new->ssnmap;
+ new->ssnmap = NULL;
+ }
+
+ if (!asoc->assoc_id) {
+ /* get a new association id since we don't have one
+ * yet.
+ */
+ sctp_assoc_set_id(asoc, GFP_ATOMIC);
+ }
+ }
+
+ /* SCTP-AUTH: Save the peer parameters from the new associations
+ * and also move the association shared keys over
+ */
+ kfree(asoc->peer.peer_random);
+ asoc->peer.peer_random = new->peer.peer_random;
+ new->peer.peer_random = NULL;
+
+ kfree(asoc->peer.peer_chunks);
+ asoc->peer.peer_chunks = new->peer.peer_chunks;
+ new->peer.peer_chunks = NULL;
+
+ kfree(asoc->peer.peer_hmacs);
+ asoc->peer.peer_hmacs = new->peer.peer_hmacs;
+ new->peer.peer_hmacs = NULL;
+
+ sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC);
+}
+
+/* Update the retran path for sending a retransmitted packet.
+ * See also RFC4960, 6.4. Multi-Homed SCTP Endpoints:
+ *
+ * When there is outbound data to send and the primary path
+ * becomes inactive (e.g., due to failures), or where the
+ * SCTP user explicitly requests to send data to an
+ * inactive destination transport address, before reporting
+ * an error to its ULP, the SCTP endpoint should try to send
+ * the data to an alternate active destination transport
+ * address if one exists.
+ *
+ * When retransmitting data that timed out, if the endpoint
+ * is multihomed, it should consider each source-destination
+ * address pair in its retransmission selection policy.
+ * When retransmitting timed-out data, the endpoint should
+ * attempt to pick the most divergent source-destination
+ * pair from the original source-destination pair to which
+ * the packet was transmitted.
+ *
+ * Note: Rules for picking the most divergent source-destination
+ * pair are an implementation decision and are not specified
+ * within this document.
+ *
+ * Our basic strategy is to round-robin transports in priorities
+ * according to sctp_state_prio_map[] e.g., if no such
+ * transport with state SCTP_ACTIVE exists, round-robin through
+ * SCTP_UNKNOWN, etc. You get the picture.
+ */
+static const u8 sctp_trans_state_to_prio_map[] = {
+ [SCTP_ACTIVE] = 3, /* best case */
+ [SCTP_UNKNOWN] = 2,
+ [SCTP_PF] = 1,
+ [SCTP_INACTIVE] = 0, /* worst case */
+};
+
+static u8 sctp_trans_score(const struct sctp_transport *trans)
+{
+ return sctp_trans_state_to_prio_map[trans->state];
+}
+
+static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1,
+ struct sctp_transport *trans2)
+{
+ if (trans1->error_count > trans2->error_count) {
+ return trans2;
+ } else if (trans1->error_count == trans2->error_count &&
+ ktime_after(trans2->last_time_heard,
+ trans1->last_time_heard)) {
+ return trans2;
+ } else {
+ return trans1;
+ }
+}
+
+static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr,
+ struct sctp_transport *best)
+{
+ u8 score_curr, score_best;
+
+ if (best == NULL || curr == best)
+ return curr;
+
+ score_curr = sctp_trans_score(curr);
+ score_best = sctp_trans_score(best);
+
+ /* First, try a score-based selection if both transport states
+ * differ. If we're in a tie, lets try to make a more clever
+ * decision here based on error counts and last time heard.
+ */
+ if (score_curr > score_best)
+ return curr;
+ else if (score_curr == score_best)
+ return sctp_trans_elect_tie(curr, best);
+ else
+ return best;
+}
+
+void sctp_assoc_update_retran_path(struct sctp_association *asoc)
+{
+ struct sctp_transport *trans = asoc->peer.retran_path;
+ struct sctp_transport *trans_next = NULL;
+
+ /* We're done as we only have the one and only path. */
+ if (asoc->peer.transport_count == 1)
+ return;
+ /* If active_path and retran_path are the same and active,
+ * then this is the only active path. Use it.
+ */
+ if (asoc->peer.active_path == asoc->peer.retran_path &&
+ asoc->peer.active_path->state == SCTP_ACTIVE)
+ return;
+
+ /* Iterate from retran_path's successor back to retran_path. */
+ for (trans = list_next_entry(trans, transports); 1;
+ trans = list_next_entry(trans, transports)) {
+ /* Manually skip the head element. */
+ if (&trans->transports == &asoc->peer.transport_addr_list)
+ continue;
+ if (trans->state == SCTP_UNCONFIRMED)
+ continue;
+ trans_next = sctp_trans_elect_best(trans, trans_next);
+ /* Active is good enough for immediate return. */
+ if (trans_next->state == SCTP_ACTIVE)
+ break;
+ /* We've reached the end, time to update path. */
+ if (trans == asoc->peer.retran_path)
+ break;
+ }
+
+ asoc->peer.retran_path = trans_next;
+
+ pr_debug("%s: association:%p updated new path to addr:%pISpc\n",
+ __func__, asoc, &asoc->peer.retran_path->ipaddr.sa);
+}
+
+static void sctp_select_active_and_retran_path(struct sctp_association *asoc)
+{
+ struct sctp_transport *trans, *trans_pri = NULL, *trans_sec = NULL;
+ struct sctp_transport *trans_pf = NULL;
+
+ /* Look for the two most recently used active transports. */
+ list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+ transports) {
+ /* Skip uninteresting transports. */
+ if (trans->state == SCTP_INACTIVE ||
+ trans->state == SCTP_UNCONFIRMED)
+ continue;
+ /* Keep track of the best PF transport from our
+ * list in case we don't find an active one.
+ */
+ if (trans->state == SCTP_PF) {
+ trans_pf = sctp_trans_elect_best(trans, trans_pf);
+ continue;
+ }
+ /* For active transports, pick the most recent ones. */
+ if (trans_pri == NULL ||
+ ktime_after(trans->last_time_heard,
+ trans_pri->last_time_heard)) {
+ trans_sec = trans_pri;
+ trans_pri = trans;
+ } else if (trans_sec == NULL ||
+ ktime_after(trans->last_time_heard,
+ trans_sec->last_time_heard)) {
+ trans_sec = trans;
+ }
+ }
+
+ /* RFC 2960 6.4 Multi-Homed SCTP Endpoints
+ *
+ * By default, an endpoint should always transmit to the primary
+ * path, unless the SCTP user explicitly specifies the
+ * destination transport address (and possibly source transport
+ * address) to use. [If the primary is active but not most recent,
+ * bump the most recently used transport.]
+ */
+ if ((asoc->peer.primary_path->state == SCTP_ACTIVE ||
+ asoc->peer.primary_path->state == SCTP_UNKNOWN) &&
+ asoc->peer.primary_path != trans_pri) {
+ trans_sec = trans_pri;
+ trans_pri = asoc->peer.primary_path;
+ }
+
+ /* We did not find anything useful for a possible retransmission
+ * path; either primary path that we found is the the same as
+ * the current one, or we didn't generally find an active one.
+ */
+ if (trans_sec == NULL)
+ trans_sec = trans_pri;
+
+ /* If we failed to find a usable transport, just camp on the
+ * active or pick a PF iff it's the better choice.
+ */
+ if (trans_pri == NULL) {
+ trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf);
+ trans_sec = trans_pri;
+ }
+
+ /* Set the active and retran transports. */
+ asoc->peer.active_path = trans_pri;
+ asoc->peer.retran_path = trans_sec;
+}
+
+struct sctp_transport *
+sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
+ struct sctp_transport *last_sent_to)
+{
+ /* If this is the first time packet is sent, use the active path,
+ * else use the retran path. If the last packet was sent over the
+ * retran path, update the retran path and use it.
+ */
+ if (last_sent_to == NULL) {
+ return asoc->peer.active_path;
+ } else {
+ if (last_sent_to == asoc->peer.retran_path)
+ sctp_assoc_update_retran_path(asoc);
+
+ return asoc->peer.retran_path;
+ }
+}
+
+/* Update the association's pmtu and frag_point by going through all the
+ * transports. This routine is called when a transport's PMTU has changed.
+ */
+void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+ __u32 pmtu = 0;
+
+ if (!asoc)
+ return;
+
+ /* Get the lowest pmtu of all the transports. */
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ if (t->pmtu_pending && t->dst) {
+ sctp_transport_update_pmtu(sk, t, dst_mtu(t->dst));
+ t->pmtu_pending = 0;
+ }
+ if (!pmtu || (t->pathmtu < pmtu))
+ pmtu = t->pathmtu;
+ }
+
+ if (pmtu) {
+ asoc->pathmtu = pmtu;
+ asoc->frag_point = sctp_frag_point(asoc, pmtu);
+ }
+
+ pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
+ asoc->pathmtu, asoc->frag_point);
+}
+
+/* Should we send a SACK to update our peer? */
+static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ switch (asoc->state) {
+ case SCTP_STATE_ESTABLISHED:
+ case SCTP_STATE_SHUTDOWN_PENDING:
+ case SCTP_STATE_SHUTDOWN_RECEIVED:
+ case SCTP_STATE_SHUTDOWN_SENT:
+ if ((asoc->rwnd > asoc->a_rwnd) &&
+ ((asoc->rwnd - asoc->a_rwnd) >= max_t(__u32,
+ (asoc->base.sk->sk_rcvbuf >> net->sctp.rwnd_upd_shift),
+ asoc->pathmtu)))
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false;
+}
+
+/* Increase asoc's rwnd by len and send any window update SACK if needed. */
+void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
+{
+ struct sctp_chunk *sack;
+ struct timer_list *timer;
+
+ if (asoc->rwnd_over) {
+ if (asoc->rwnd_over >= len) {
+ asoc->rwnd_over -= len;
+ } else {
+ asoc->rwnd += (len - asoc->rwnd_over);
+ asoc->rwnd_over = 0;
+ }
+ } else {
+ asoc->rwnd += len;
+ }
+
+ /* If we had window pressure, start recovering it
+ * once our rwnd had reached the accumulated pressure
+ * threshold. The idea is to recover slowly, but up
+ * to the initial advertised window.
+ */
+ if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+ int change = min(asoc->pathmtu, asoc->rwnd_press);
+ asoc->rwnd += change;
+ asoc->rwnd_press -= change;
+ }
+
+ pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n",
+ __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
+ asoc->a_rwnd);
+
+ /* Send a window update SACK if the rwnd has increased by at least the
+ * minimum of the association's PMTU and half of the receive buffer.
+ * The algorithm used is similar to the one described in
+ * Section 4.2.3.3 of RFC 1122.
+ */
+ if (sctp_peer_needs_update(asoc)) {
+ asoc->a_rwnd = asoc->rwnd;
+
+ pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
+ "a_rwnd:%u\n", __func__, asoc, asoc->rwnd,
+ asoc->a_rwnd);
+
+ sack = sctp_make_sack(asoc);
+ if (!sack)
+ return;
+
+ asoc->peer.sack_needed = 0;
+
+ sctp_outq_tail(&asoc->outqueue, sack);
+
+ /* Stop the SACK timer. */
+ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
+ if (del_timer(timer))
+ sctp_association_put(asoc);
+ }
+}
+
+/* Decrease asoc's rwnd by len. */
+void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
+{
+ int rx_count;
+ int over = 0;
+
+ if (unlikely(!asoc->rwnd || asoc->rwnd_over))
+ pr_debug("%s: association:%p has asoc->rwnd:%u, "
+ "asoc->rwnd_over:%u!\n", __func__, asoc,
+ asoc->rwnd, asoc->rwnd_over);
+
+ if (asoc->ep->rcvbuf_policy)
+ rx_count = atomic_read(&asoc->rmem_alloc);
+ else
+ rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+
+ /* If we've reached or overflowed our receive buffer, announce
+ * a 0 rwnd if rwnd would still be positive. Store the
+ * the potential pressure overflow so that the window can be restored
+ * back to original value.
+ */
+ if (rx_count >= asoc->base.sk->sk_rcvbuf)
+ over = 1;
+
+ if (asoc->rwnd >= len) {
+ asoc->rwnd -= len;
+ if (over) {
+ asoc->rwnd_press += asoc->rwnd;
+ asoc->rwnd = 0;
+ }
+ } else {
+ asoc->rwnd_over = len - asoc->rwnd;
+ asoc->rwnd = 0;
+ }
+
+ pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n",
+ __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
+ asoc->rwnd_press);
+}
+
+/* Build the bind address list for the association based on info from the
+ * local endpoint and the remote peer.
+ */
+int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
+ sctp_scope_t scope, gfp_t gfp)
+{
+ int flags;
+
+ /* Use scoping rules to determine the subset of addresses from
+ * the endpoint.
+ */
+ flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0;
+ if (asoc->peer.ipv4_address)
+ flags |= SCTP_ADDR4_PEERSUPP;
+ if (asoc->peer.ipv6_address)
+ flags |= SCTP_ADDR6_PEERSUPP;
+
+ return sctp_bind_addr_copy(sock_net(asoc->base.sk),
+ &asoc->base.bind_addr,
+ &asoc->ep->base.bind_addr,
+ scope, gfp, flags);
+}
+
+/* Build the association's bind address list from the cookie. */
+int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
+ struct sctp_cookie *cookie,
+ gfp_t gfp)
+{
+ int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
+ int var_size3 = cookie->raw_addr_list_len;
+ __u8 *raw = (__u8 *)cookie->peer_init + var_size2;
+
+ return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3,
+ asoc->ep->base.bind_addr.port, gfp);
+}
+
+/* Lookup laddr in the bind address list of an association. */
+int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
+ const union sctp_addr *laddr)
+{
+ int found = 0;
+
+ if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) &&
+ sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
+ sctp_sk(asoc->base.sk)))
+ found = 1;
+
+ return found;
+}
+
+/* Set an association id for a given association */
+int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
+{
+ bool preload = !!(gfp & __GFP_WAIT);
+ int ret;
+
+ /* If the id is already assigned, keep it. */
+ if (asoc->assoc_id)
+ return 0;
+
+ if (preload)
+ idr_preload(gfp);
+ spin_lock_bh(&sctp_assocs_id_lock);
+ /* 0 is not a valid assoc_id, must be >= 1 */
+ ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, 1, 0, GFP_NOWAIT);
+ spin_unlock_bh(&sctp_assocs_id_lock);
+ if (preload)
+ idr_preload_end();
+ if (ret < 0)
+ return ret;
+
+ asoc->assoc_id = (sctp_assoc_t)ret;
+ return 0;
+}
+
+/* Free the ASCONF queue */
+static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc)
+{
+ struct sctp_chunk *asconf;
+ struct sctp_chunk *tmp;
+
+ list_for_each_entry_safe(asconf, tmp, &asoc->addip_chunk_list, list) {
+ list_del_init(&asconf->list);
+ sctp_chunk_free(asconf);
+ }
+}
+
+/* Free asconf_ack cache */
+static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc)
+{
+ struct sctp_chunk *ack;
+ struct sctp_chunk *tmp;
+
+ list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list,
+ transmitted_list) {
+ list_del_init(&ack->transmitted_list);
+ sctp_chunk_free(ack);
+ }
+}
+
+/* Clean up the ASCONF_ACK queue */
+void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc)
+{
+ struct sctp_chunk *ack;
+ struct sctp_chunk *tmp;
+
+ /* We can remove all the entries from the queue up to
+ * the "Peer-Sequence-Number".
+ */
+ list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list,
+ transmitted_list) {
+ if (ack->subh.addip_hdr->serial ==
+ htonl(asoc->peer.addip_serial))
+ break;
+
+ list_del_init(&ack->transmitted_list);
+ sctp_chunk_free(ack);
+ }
+}
+
+/* Find the ASCONF_ACK whose serial number matches ASCONF */
+struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
+ const struct sctp_association *asoc,
+ __be32 serial)
+{
+ struct sctp_chunk *ack;
+
+ /* Walk through the list of cached ASCONF-ACKs and find the
+ * ack chunk whose serial number matches that of the request.
+ */
+ list_for_each_entry(ack, &asoc->asconf_ack_list, transmitted_list) {
+ if (sctp_chunk_pending(ack))
+ continue;
+ if (ack->subh.addip_hdr->serial == serial) {
+ sctp_chunk_hold(ack);
+ return ack;
+ }
+ }
+
+ return NULL;
+}
+
+void sctp_asconf_queue_teardown(struct sctp_association *asoc)
+{
+ /* Free any cached ASCONF_ACK chunk. */
+ sctp_assoc_free_asconf_acks(asoc);
+
+ /* Free the ASCONF queue. */
+ sctp_assoc_free_asconf_queue(asoc);
+
+ /* Free any cached ASCONF chunk. */
+ if (asoc->addip_last_asconf)
+ sctp_chunk_free(asoc->addip_last_asconf);
+}
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
new file mode 100644
index 000000000..4f15b7d73
--- /dev/null
+++ b/net/sctp/auth.c
@@ -0,0 +1,953 @@
+/* SCTP kernel implementation
+ * (C) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Vlad Yasevich <vladislav.yasevich@hp.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/auth.h>
+
+static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
+ {
+ /* id 0 is reserved. as all 0 */
+ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0,
+ },
+ {
+ .hmac_id = SCTP_AUTH_HMAC_ID_SHA1,
+ .hmac_name = "hmac(sha1)",
+ .hmac_len = SCTP_SHA1_SIG_SIZE,
+ },
+ {
+ /* id 2 is reserved as well */
+ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
+ },
+#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE)
+ {
+ .hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
+ .hmac_name = "hmac(sha256)",
+ .hmac_len = SCTP_SHA256_SIG_SIZE,
+ }
+#endif
+};
+
+
+void sctp_auth_key_put(struct sctp_auth_bytes *key)
+{
+ if (!key)
+ return;
+
+ if (atomic_dec_and_test(&key->refcnt)) {
+ kzfree(key);
+ SCTP_DBG_OBJCNT_DEC(keys);
+ }
+}
+
+/* Create a new key structure of a given length */
+static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp)
+{
+ struct sctp_auth_bytes *key;
+
+ /* Verify that we are not going to overflow INT_MAX */
+ if (key_len > (INT_MAX - sizeof(struct sctp_auth_bytes)))
+ return NULL;
+
+ /* Allocate the shared key */
+ key = kmalloc(sizeof(struct sctp_auth_bytes) + key_len, gfp);
+ if (!key)
+ return NULL;
+
+ key->len = key_len;
+ atomic_set(&key->refcnt, 1);
+ SCTP_DBG_OBJCNT_INC(keys);
+
+ return key;
+}
+
+/* Create a new shared key container with a give key id */
+struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp)
+{
+ struct sctp_shared_key *new;
+
+ /* Allocate the shared key container */
+ new = kzalloc(sizeof(struct sctp_shared_key), gfp);
+ if (!new)
+ return NULL;
+
+ INIT_LIST_HEAD(&new->key_list);
+ new->key_id = key_id;
+
+ return new;
+}
+
+/* Free the shared key structure */
+static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key)
+{
+ BUG_ON(!list_empty(&sh_key->key_list));
+ sctp_auth_key_put(sh_key->key);
+ sh_key->key = NULL;
+ kfree(sh_key);
+}
+
+/* Destroy the entire key list. This is done during the
+ * associon and endpoint free process.
+ */
+void sctp_auth_destroy_keys(struct list_head *keys)
+{
+ struct sctp_shared_key *ep_key;
+ struct sctp_shared_key *tmp;
+
+ if (list_empty(keys))
+ return;
+
+ key_for_each_safe(ep_key, tmp, keys) {
+ list_del_init(&ep_key->key_list);
+ sctp_auth_shkey_free(ep_key);
+ }
+}
+
+/* Compare two byte vectors as numbers. Return values
+ * are:
+ * 0 - vectors are equal
+ * < 0 - vector 1 is smaller than vector2
+ * > 0 - vector 1 is greater than vector2
+ *
+ * Algorithm is:
+ * This is performed by selecting the numerically smaller key vector...
+ * If the key vectors are equal as numbers but differ in length ...
+ * the shorter vector is considered smaller
+ *
+ * Examples (with small values):
+ * 000123456789 > 123456789 (first number is longer)
+ * 000123456789 < 234567891 (second number is larger numerically)
+ * 123456789 > 2345678 (first number is both larger & longer)
+ */
+static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1,
+ struct sctp_auth_bytes *vector2)
+{
+ int diff;
+ int i;
+ const __u8 *longer;
+
+ diff = vector1->len - vector2->len;
+ if (diff) {
+ longer = (diff > 0) ? vector1->data : vector2->data;
+
+ /* Check to see if the longer number is
+ * lead-zero padded. If it is not, it
+ * is automatically larger numerically.
+ */
+ for (i = 0; i < abs(diff); i++) {
+ if (longer[i] != 0)
+ return diff;
+ }
+ }
+
+ /* lengths are the same, compare numbers */
+ return memcmp(vector1->data, vector2->data, vector1->len);
+}
+
+/*
+ * Create a key vector as described in SCTP-AUTH, Section 6.1
+ * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO
+ * parameter sent by each endpoint are concatenated as byte vectors.
+ * These parameters include the parameter type, parameter length, and
+ * the parameter value, but padding is omitted; all padding MUST be
+ * removed from this concatenation before proceeding with further
+ * computation of keys. Parameters which were not sent are simply
+ * omitted from the concatenation process. The resulting two vectors
+ * are called the two key vectors.
+ */
+static struct sctp_auth_bytes *sctp_auth_make_key_vector(
+ sctp_random_param_t *random,
+ sctp_chunks_param_t *chunks,
+ sctp_hmac_algo_param_t *hmacs,
+ gfp_t gfp)
+{
+ struct sctp_auth_bytes *new;
+ __u32 len;
+ __u32 offset = 0;
+ __u16 random_len, hmacs_len, chunks_len = 0;
+
+ random_len = ntohs(random->param_hdr.length);
+ hmacs_len = ntohs(hmacs->param_hdr.length);
+ if (chunks)
+ chunks_len = ntohs(chunks->param_hdr.length);
+
+ len = random_len + hmacs_len + chunks_len;
+
+ new = sctp_auth_create_key(len, gfp);
+ if (!new)
+ return NULL;
+
+ memcpy(new->data, random, random_len);
+ offset += random_len;
+
+ if (chunks) {
+ memcpy(new->data + offset, chunks, chunks_len);
+ offset += chunks_len;
+ }
+
+ memcpy(new->data + offset, hmacs, hmacs_len);
+
+ return new;
+}
+
+
+/* Make a key vector based on our local parameters */
+static struct sctp_auth_bytes *sctp_auth_make_local_vector(
+ const struct sctp_association *asoc,
+ gfp_t gfp)
+{
+ return sctp_auth_make_key_vector(
+ (sctp_random_param_t *)asoc->c.auth_random,
+ (sctp_chunks_param_t *)asoc->c.auth_chunks,
+ (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs,
+ gfp);
+}
+
+/* Make a key vector based on peer's parameters */
+static struct sctp_auth_bytes *sctp_auth_make_peer_vector(
+ const struct sctp_association *asoc,
+ gfp_t gfp)
+{
+ return sctp_auth_make_key_vector(asoc->peer.peer_random,
+ asoc->peer.peer_chunks,
+ asoc->peer.peer_hmacs,
+ gfp);
+}
+
+
+/* Set the value of the association shared key base on the parameters
+ * given. The algorithm is:
+ * From the endpoint pair shared keys and the key vectors the
+ * association shared keys are computed. This is performed by selecting
+ * the numerically smaller key vector and concatenating it to the
+ * endpoint pair shared key, and then concatenating the numerically
+ * larger key vector to that. The result of the concatenation is the
+ * association shared key.
+ */
+static struct sctp_auth_bytes *sctp_auth_asoc_set_secret(
+ struct sctp_shared_key *ep_key,
+ struct sctp_auth_bytes *first_vector,
+ struct sctp_auth_bytes *last_vector,
+ gfp_t gfp)
+{
+ struct sctp_auth_bytes *secret;
+ __u32 offset = 0;
+ __u32 auth_len;
+
+ auth_len = first_vector->len + last_vector->len;
+ if (ep_key->key)
+ auth_len += ep_key->key->len;
+
+ secret = sctp_auth_create_key(auth_len, gfp);
+ if (!secret)
+ return NULL;
+
+ if (ep_key->key) {
+ memcpy(secret->data, ep_key->key->data, ep_key->key->len);
+ offset += ep_key->key->len;
+ }
+
+ memcpy(secret->data + offset, first_vector->data, first_vector->len);
+ offset += first_vector->len;
+
+ memcpy(secret->data + offset, last_vector->data, last_vector->len);
+
+ return secret;
+}
+
+/* Create an association shared key. Follow the algorithm
+ * described in SCTP-AUTH, Section 6.1
+ */
+static struct sctp_auth_bytes *sctp_auth_asoc_create_secret(
+ const struct sctp_association *asoc,
+ struct sctp_shared_key *ep_key,
+ gfp_t gfp)
+{
+ struct sctp_auth_bytes *local_key_vector;
+ struct sctp_auth_bytes *peer_key_vector;
+ struct sctp_auth_bytes *first_vector,
+ *last_vector;
+ struct sctp_auth_bytes *secret = NULL;
+ int cmp;
+
+
+ /* Now we need to build the key vectors
+ * SCTP-AUTH , Section 6.1
+ * The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO
+ * parameter sent by each endpoint are concatenated as byte vectors.
+ * These parameters include the parameter type, parameter length, and
+ * the parameter value, but padding is omitted; all padding MUST be
+ * removed from this concatenation before proceeding with further
+ * computation of keys. Parameters which were not sent are simply
+ * omitted from the concatenation process. The resulting two vectors
+ * are called the two key vectors.
+ */
+
+ local_key_vector = sctp_auth_make_local_vector(asoc, gfp);
+ peer_key_vector = sctp_auth_make_peer_vector(asoc, gfp);
+
+ if (!peer_key_vector || !local_key_vector)
+ goto out;
+
+ /* Figure out the order in which the key_vectors will be
+ * added to the endpoint shared key.
+ * SCTP-AUTH, Section 6.1:
+ * This is performed by selecting the numerically smaller key
+ * vector and concatenating it to the endpoint pair shared
+ * key, and then concatenating the numerically larger key
+ * vector to that. If the key vectors are equal as numbers
+ * but differ in length, then the concatenation order is the
+ * endpoint shared key, followed by the shorter key vector,
+ * followed by the longer key vector. Otherwise, the key
+ * vectors are identical, and may be concatenated to the
+ * endpoint pair key in any order.
+ */
+ cmp = sctp_auth_compare_vectors(local_key_vector,
+ peer_key_vector);
+ if (cmp < 0) {
+ first_vector = local_key_vector;
+ last_vector = peer_key_vector;
+ } else {
+ first_vector = peer_key_vector;
+ last_vector = local_key_vector;
+ }
+
+ secret = sctp_auth_asoc_set_secret(ep_key, first_vector, last_vector,
+ gfp);
+out:
+ sctp_auth_key_put(local_key_vector);
+ sctp_auth_key_put(peer_key_vector);
+
+ return secret;
+}
+
+/*
+ * Populate the association overlay list with the list
+ * from the endpoint.
+ */
+int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ gfp_t gfp)
+{
+ struct sctp_shared_key *sh_key;
+ struct sctp_shared_key *new;
+
+ BUG_ON(!list_empty(&asoc->endpoint_shared_keys));
+
+ key_for_each(sh_key, &ep->endpoint_shared_keys) {
+ new = sctp_auth_shkey_create(sh_key->key_id, gfp);
+ if (!new)
+ goto nomem;
+
+ new->key = sh_key->key;
+ sctp_auth_key_hold(new->key);
+ list_add(&new->key_list, &asoc->endpoint_shared_keys);
+ }
+
+ return 0;
+
+nomem:
+ sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);
+ return -ENOMEM;
+}
+
+
+/* Public interface to create the association shared key.
+ * See code above for the algorithm.
+ */
+int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
+{
+ struct sctp_auth_bytes *secret;
+ struct sctp_shared_key *ep_key;
+ struct sctp_chunk *chunk;
+
+ /* If we don't support AUTH, or peer is not capable
+ * we don't need to do anything.
+ */
+ if (!asoc->ep->auth_enable || !asoc->peer.auth_capable)
+ return 0;
+
+ /* If the key_id is non-zero and we couldn't find an
+ * endpoint pair shared key, we can't compute the
+ * secret.
+ * For key_id 0, endpoint pair shared key is a NULL key.
+ */
+ ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id);
+ BUG_ON(!ep_key);
+
+ secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
+ if (!secret)
+ return -ENOMEM;
+
+ sctp_auth_key_put(asoc->asoc_shared_key);
+ asoc->asoc_shared_key = secret;
+
+ /* Update send queue in case any chunk already in there now
+ * needs authenticating
+ */
+ list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) {
+ if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc))
+ chunk->auth = 1;
+ }
+
+ return 0;
+}
+
+
+/* Find the endpoint pair shared key based on the key_id */
+struct sctp_shared_key *sctp_auth_get_shkey(
+ const struct sctp_association *asoc,
+ __u16 key_id)
+{
+ struct sctp_shared_key *key;
+
+ /* First search associations set of endpoint pair shared keys */
+ key_for_each(key, &asoc->endpoint_shared_keys) {
+ if (key->key_id == key_id)
+ return key;
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize all the possible digest transforms that we can use. Right now
+ * now, the supported digests are SHA1 and SHA256. We do this here once
+ * because of the restrictiong that transforms may only be allocated in
+ * user context. This forces us to pre-allocated all possible transforms
+ * at the endpoint init time.
+ */
+int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
+{
+ struct crypto_hash *tfm = NULL;
+ __u16 id;
+
+ /* If AUTH extension is disabled, we are done */
+ if (!ep->auth_enable) {
+ ep->auth_hmacs = NULL;
+ return 0;
+ }
+
+ /* If the transforms are already allocated, we are done */
+ if (ep->auth_hmacs)
+ return 0;
+
+ /* Allocated the array of pointers to transorms */
+ ep->auth_hmacs = kzalloc(
+ sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS,
+ gfp);
+ if (!ep->auth_hmacs)
+ return -ENOMEM;
+
+ for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
+
+ /* See is we support the id. Supported IDs have name and
+ * length fields set, so that we can allocated and use
+ * them. We can safely just check for name, for without the
+ * name, we can't allocate the TFM.
+ */
+ if (!sctp_hmac_list[id].hmac_name)
+ continue;
+
+ /* If this TFM has been allocated, we are all set */
+ if (ep->auth_hmacs[id])
+ continue;
+
+ /* Allocate the ID */
+ tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ goto out_err;
+
+ ep->auth_hmacs[id] = tfm;
+ }
+
+ return 0;
+
+out_err:
+ /* Clean up any successful allocations */
+ sctp_auth_destroy_hmacs(ep->auth_hmacs);
+ return -ENOMEM;
+}
+
+/* Destroy the hmac tfm array */
+void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
+{
+ int i;
+
+ if (!auth_hmacs)
+ return;
+
+ for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) {
+ if (auth_hmacs[i])
+ crypto_free_hash(auth_hmacs[i]);
+ }
+ kfree(auth_hmacs);
+}
+
+
+struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
+{
+ return &sctp_hmac_list[hmac_id];
+}
+
+/* Get an hmac description information that we can use to build
+ * the AUTH chunk
+ */
+struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
+{
+ struct sctp_hmac_algo_param *hmacs;
+ __u16 n_elt;
+ __u16 id = 0;
+ int i;
+
+ /* If we have a default entry, use it */
+ if (asoc->default_hmac_id)
+ return &sctp_hmac_list[asoc->default_hmac_id];
+
+ /* Since we do not have a default entry, find the first entry
+ * we support and return that. Do not cache that id.
+ */
+ hmacs = asoc->peer.peer_hmacs;
+ if (!hmacs)
+ return NULL;
+
+ n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t)) >> 1;
+ for (i = 0; i < n_elt; i++) {
+ id = ntohs(hmacs->hmac_ids[i]);
+
+ /* Check the id is in the supported range. And
+ * see if we support the id. Supported IDs have name and
+ * length fields set, so that we can allocate and use
+ * them. We can safely just check for name, for without the
+ * name, we can't allocate the TFM.
+ */
+ if (id > SCTP_AUTH_HMAC_ID_MAX ||
+ !sctp_hmac_list[id].hmac_name) {
+ id = 0;
+ continue;
+ }
+
+ break;
+ }
+
+ if (id == 0)
+ return NULL;
+
+ return &sctp_hmac_list[id];
+}
+
+static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
+{
+ int found = 0;
+ int i;
+
+ for (i = 0; i < n_elts; i++) {
+ if (hmac_id == hmacs[i]) {
+ found = 1;
+ break;
+ }
+ }
+
+ return found;
+}
+
+/* See if the HMAC_ID is one that we claim as supported */
+int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
+ __be16 hmac_id)
+{
+ struct sctp_hmac_algo_param *hmacs;
+ __u16 n_elt;
+
+ if (!asoc)
+ return 0;
+
+ hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs;
+ n_elt = (ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t)) >> 1;
+
+ return __sctp_auth_find_hmacid(hmacs->hmac_ids, n_elt, hmac_id);
+}
+
+
+/* Cache the default HMAC id. This to follow this text from SCTP-AUTH:
+ * Section 6.1:
+ * The receiver of a HMAC-ALGO parameter SHOULD use the first listed
+ * algorithm it supports.
+ */
+void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
+ struct sctp_hmac_algo_param *hmacs)
+{
+ struct sctp_endpoint *ep;
+ __u16 id;
+ int i;
+ int n_params;
+
+ /* if the default id is already set, use it */
+ if (asoc->default_hmac_id)
+ return;
+
+ n_params = (ntohs(hmacs->param_hdr.length)
+ - sizeof(sctp_paramhdr_t)) >> 1;
+ ep = asoc->ep;
+ for (i = 0; i < n_params; i++) {
+ id = ntohs(hmacs->hmac_ids[i]);
+
+ /* Check the id is in the supported range */
+ if (id > SCTP_AUTH_HMAC_ID_MAX)
+ continue;
+
+ /* If this TFM has been allocated, use this id */
+ if (ep->auth_hmacs[id]) {
+ asoc->default_hmac_id = id;
+ break;
+ }
+ }
+}
+
+
+/* Check to see if the given chunk is supposed to be authenticated */
+static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param)
+{
+ unsigned short len;
+ int found = 0;
+ int i;
+
+ if (!param || param->param_hdr.length == 0)
+ return 0;
+
+ len = ntohs(param->param_hdr.length) - sizeof(sctp_paramhdr_t);
+
+ /* SCTP-AUTH, Section 3.2
+ * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH
+ * chunks MUST NOT be listed in the CHUNKS parameter. However, if
+ * a CHUNKS parameter is received then the types for INIT, INIT-ACK,
+ * SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored.
+ */
+ for (i = 0; !found && i < len; i++) {
+ switch (param->chunks[i]) {
+ case SCTP_CID_INIT:
+ case SCTP_CID_INIT_ACK:
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ case SCTP_CID_AUTH:
+ break;
+
+ default:
+ if (param->chunks[i] == chunk)
+ found = 1;
+ break;
+ }
+ }
+
+ return found;
+}
+
+/* Check if peer requested that this chunk is authenticated */
+int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc)
+{
+ if (!asoc)
+ return 0;
+
+ if (!asoc->ep->auth_enable || !asoc->peer.auth_capable)
+ return 0;
+
+ return __sctp_auth_cid(chunk, asoc->peer.peer_chunks);
+}
+
+/* Check if we requested that peer authenticate this chunk. */
+int sctp_auth_recv_cid(sctp_cid_t chunk, const struct sctp_association *asoc)
+{
+ if (!asoc)
+ return 0;
+
+ if (!asoc->ep->auth_enable)
+ return 0;
+
+ return __sctp_auth_cid(chunk,
+ (struct sctp_chunks_param *)asoc->c.auth_chunks);
+}
+
+/* SCTP-AUTH: Section 6.2:
+ * The sender MUST calculate the MAC as described in RFC2104 [2] using
+ * the hash function H as described by the MAC Identifier and the shared
+ * association key K based on the endpoint pair shared key described by
+ * the shared key identifier. The 'data' used for the computation of
+ * the AUTH-chunk is given by the AUTH chunk with its HMAC field set to
+ * zero (as shown in Figure 6) followed by all chunks that are placed
+ * after the AUTH chunk in the SCTP packet.
+ */
+void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
+ struct sk_buff *skb,
+ struct sctp_auth_chunk *auth,
+ gfp_t gfp)
+{
+ struct scatterlist sg;
+ struct hash_desc desc;
+ struct sctp_auth_bytes *asoc_key;
+ __u16 key_id, hmac_id;
+ __u8 *digest;
+ unsigned char *end;
+ int free_key = 0;
+
+ /* Extract the info we need:
+ * - hmac id
+ * - key id
+ */
+ key_id = ntohs(auth->auth_hdr.shkey_id);
+ hmac_id = ntohs(auth->auth_hdr.hmac_id);
+
+ if (key_id == asoc->active_key_id)
+ asoc_key = asoc->asoc_shared_key;
+ else {
+ struct sctp_shared_key *ep_key;
+
+ ep_key = sctp_auth_get_shkey(asoc, key_id);
+ if (!ep_key)
+ return;
+
+ asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
+ if (!asoc_key)
+ return;
+
+ free_key = 1;
+ }
+
+ /* set up scatter list */
+ end = skb_tail_pointer(skb);
+ sg_init_one(&sg, auth, end - (unsigned char *)auth);
+
+ desc.tfm = asoc->ep->auth_hmacs[hmac_id];
+ desc.flags = 0;
+
+ digest = auth->auth_hdr.hmac;
+ if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len))
+ goto free;
+
+ crypto_hash_digest(&desc, &sg, sg.length, digest);
+
+free:
+ if (free_key)
+ sctp_auth_key_put(asoc_key);
+}
+
+/* API Helpers */
+
+/* Add a chunk to the endpoint authenticated chunk list */
+int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id)
+{
+ struct sctp_chunks_param *p = ep->auth_chunk_list;
+ __u16 nchunks;
+ __u16 param_len;
+
+ /* If this chunk is already specified, we are done */
+ if (__sctp_auth_cid(chunk_id, p))
+ return 0;
+
+ /* Check if we can add this chunk to the array */
+ param_len = ntohs(p->param_hdr.length);
+ nchunks = param_len - sizeof(sctp_paramhdr_t);
+ if (nchunks == SCTP_NUM_CHUNK_TYPES)
+ return -EINVAL;
+
+ p->chunks[nchunks] = chunk_id;
+ p->param_hdr.length = htons(param_len + 1);
+ return 0;
+}
+
+/* Add hmac identifires to the endpoint list of supported hmac ids */
+int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
+ struct sctp_hmacalgo *hmacs)
+{
+ int has_sha1 = 0;
+ __u16 id;
+ int i;
+
+ /* Scan the list looking for unsupported id. Also make sure that
+ * SHA1 is specified.
+ */
+ for (i = 0; i < hmacs->shmac_num_idents; i++) {
+ id = hmacs->shmac_idents[i];
+
+ if (id > SCTP_AUTH_HMAC_ID_MAX)
+ return -EOPNOTSUPP;
+
+ if (SCTP_AUTH_HMAC_ID_SHA1 == id)
+ has_sha1 = 1;
+
+ if (!sctp_hmac_list[id].hmac_name)
+ return -EOPNOTSUPP;
+ }
+
+ if (!has_sha1)
+ return -EINVAL;
+
+ memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0],
+ hmacs->shmac_num_idents * sizeof(__u16));
+ ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) +
+ hmacs->shmac_num_idents * sizeof(__u16));
+ return 0;
+}
+
+/* Set a new shared key on either endpoint or association. If the
+ * the key with a same ID already exists, replace the key (remove the
+ * old key and add a new one).
+ */
+int sctp_auth_set_key(struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ struct sctp_authkey *auth_key)
+{
+ struct sctp_shared_key *cur_key = NULL;
+ struct sctp_auth_bytes *key;
+ struct list_head *sh_keys;
+ int replace = 0;
+
+ /* Try to find the given key id to see if
+ * we are doing a replace, or adding a new key
+ */
+ if (asoc)
+ sh_keys = &asoc->endpoint_shared_keys;
+ else
+ sh_keys = &ep->endpoint_shared_keys;
+
+ key_for_each(cur_key, sh_keys) {
+ if (cur_key->key_id == auth_key->sca_keynumber) {
+ replace = 1;
+ break;
+ }
+ }
+
+ /* If we are not replacing a key id, we need to allocate
+ * a shared key.
+ */
+ if (!replace) {
+ cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber,
+ GFP_KERNEL);
+ if (!cur_key)
+ return -ENOMEM;
+ }
+
+ /* Create a new key data based on the info passed in */
+ key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
+ if (!key)
+ goto nomem;
+
+ memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength);
+
+ /* If we are replacing, remove the old keys data from the
+ * key id. If we are adding new key id, add it to the
+ * list.
+ */
+ if (replace)
+ sctp_auth_key_put(cur_key->key);
+ else
+ list_add(&cur_key->key_list, sh_keys);
+
+ cur_key->key = key;
+ return 0;
+nomem:
+ if (!replace)
+ sctp_auth_shkey_free(cur_key);
+
+ return -ENOMEM;
+}
+
+int sctp_auth_set_active_key(struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ __u16 key_id)
+{
+ struct sctp_shared_key *key;
+ struct list_head *sh_keys;
+ int found = 0;
+
+ /* The key identifier MUST correst to an existing key */
+ if (asoc)
+ sh_keys = &asoc->endpoint_shared_keys;
+ else
+ sh_keys = &ep->endpoint_shared_keys;
+
+ key_for_each(key, sh_keys) {
+ if (key->key_id == key_id) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return -EINVAL;
+
+ if (asoc) {
+ asoc->active_key_id = key_id;
+ sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL);
+ } else
+ ep->active_key_id = key_id;
+
+ return 0;
+}
+
+int sctp_auth_del_key_id(struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ __u16 key_id)
+{
+ struct sctp_shared_key *key;
+ struct list_head *sh_keys;
+ int found = 0;
+
+ /* The key identifier MUST NOT be the current active key
+ * The key identifier MUST correst to an existing key
+ */
+ if (asoc) {
+ if (asoc->active_key_id == key_id)
+ return -EINVAL;
+
+ sh_keys = &asoc->endpoint_shared_keys;
+ } else {
+ if (ep->active_key_id == key_id)
+ return -EINVAL;
+
+ sh_keys = &ep->endpoint_shared_keys;
+ }
+
+ key_for_each(key, sh_keys) {
+ if (key->key_id == key_id) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return -EINVAL;
+
+ /* Delete the shared key */
+ list_del_init(&key->key_list);
+ sctp_auth_shkey_free(key);
+
+ return 0;
+}
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
new file mode 100644
index 000000000..871cdf956
--- /dev/null
+++ b/net/sctp/bind_addr.c
@@ -0,0 +1,552 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2003
+ * Copyright (c) Cisco 1999,2000
+ * Copyright (c) Motorola 1999,2000,2001
+ * Copyright (c) La Monte H.P. Yarroll 2001
+ *
+ * This file is part of the SCTP kernel implementation.
+ *
+ * A collection class to handle the storage of transport addresses.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers. */
+static int sctp_copy_one_addr(struct net *, struct sctp_bind_addr *,
+ union sctp_addr *, sctp_scope_t scope, gfp_t gfp,
+ int flags);
+static void sctp_bind_addr_clean(struct sctp_bind_addr *);
+
+/* First Level Abstractions. */
+
+/* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses
+ * in 'src' which have a broader scope than 'scope'.
+ */
+int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest,
+ const struct sctp_bind_addr *src,
+ sctp_scope_t scope, gfp_t gfp,
+ int flags)
+{
+ struct sctp_sockaddr_entry *addr;
+ int error = 0;
+
+ /* All addresses share the same port. */
+ dest->port = src->port;
+
+ /* Extract the addresses which are relevant for this scope. */
+ list_for_each_entry(addr, &src->address_list, list) {
+ error = sctp_copy_one_addr(net, dest, &addr->a, scope,
+ gfp, flags);
+ if (error < 0)
+ goto out;
+ }
+
+ /* If there are no addresses matching the scope and
+ * this is global scope, try to get a link scope address, with
+ * the assumption that we must be sitting behind a NAT.
+ */
+ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) {
+ list_for_each_entry(addr, &src->address_list, list) {
+ error = sctp_copy_one_addr(net, dest, &addr->a,
+ SCTP_SCOPE_LINK, gfp,
+ flags);
+ if (error < 0)
+ goto out;
+ }
+ }
+
+out:
+ if (error)
+ sctp_bind_addr_clean(dest);
+
+ return error;
+}
+
+/* Exactly duplicate the address lists. This is necessary when doing
+ * peer-offs and accepts. We don't want to put all the current system
+ * addresses into the endpoint. That's useless. But we do want duplicat
+ * the list of bound addresses that the older endpoint used.
+ */
+int sctp_bind_addr_dup(struct sctp_bind_addr *dest,
+ const struct sctp_bind_addr *src,
+ gfp_t gfp)
+{
+ struct sctp_sockaddr_entry *addr;
+ int error = 0;
+
+ /* All addresses share the same port. */
+ dest->port = src->port;
+
+ list_for_each_entry(addr, &src->address_list, list) {
+ error = sctp_add_bind_addr(dest, &addr->a, 1, gfp);
+ if (error < 0)
+ break;
+ }
+
+ return error;
+}
+
+/* Initialize the SCTP_bind_addr structure for either an endpoint or
+ * an association.
+ */
+void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port)
+{
+ INIT_LIST_HEAD(&bp->address_list);
+ bp->port = port;
+}
+
+/* Dispose of the address list. */
+static void sctp_bind_addr_clean(struct sctp_bind_addr *bp)
+{
+ struct sctp_sockaddr_entry *addr, *temp;
+
+ /* Empty the bind address list. */
+ list_for_each_entry_safe(addr, temp, &bp->address_list, list) {
+ list_del_rcu(&addr->list);
+ kfree_rcu(addr, rcu);
+ SCTP_DBG_OBJCNT_DEC(addr);
+ }
+}
+
+/* Dispose of an SCTP_bind_addr structure */
+void sctp_bind_addr_free(struct sctp_bind_addr *bp)
+{
+ /* Empty the bind address list. */
+ sctp_bind_addr_clean(bp);
+}
+
+/* Add an address to the bind address list in the SCTP_bind_addr structure. */
+int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
+ __u8 addr_state, gfp_t gfp)
+{
+ struct sctp_sockaddr_entry *addr;
+
+ /* Add the address to the bind address list. */
+ addr = kzalloc(sizeof(*addr), gfp);
+ if (!addr)
+ return -ENOMEM;
+
+ memcpy(&addr->a, new, sizeof(*new));
+
+ /* Fix up the port if it has not yet been set.
+ * Both v4 and v6 have the port at the same offset.
+ */
+ if (!addr->a.v4.sin_port)
+ addr->a.v4.sin_port = htons(bp->port);
+
+ addr->state = addr_state;
+ addr->valid = 1;
+
+ INIT_LIST_HEAD(&addr->list);
+
+ /* We always hold a socket lock when calling this function,
+ * and that acts as a writer synchronizing lock.
+ */
+ list_add_tail_rcu(&addr->list, &bp->address_list);
+ SCTP_DBG_OBJCNT_INC(addr);
+
+ return 0;
+}
+
+/* Delete an address from the bind address list in the SCTP_bind_addr
+ * structure.
+ */
+int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
+{
+ struct sctp_sockaddr_entry *addr, *temp;
+ int found = 0;
+
+ /* We hold the socket lock when calling this function,
+ * and that acts as a writer synchronizing lock.
+ */
+ list_for_each_entry_safe(addr, temp, &bp->address_list, list) {
+ if (sctp_cmp_addr_exact(&addr->a, del_addr)) {
+ /* Found the exact match. */
+ found = 1;
+ addr->valid = 0;
+ list_del_rcu(&addr->list);
+ break;
+ }
+ }
+
+ if (found) {
+ kfree_rcu(addr, rcu);
+ SCTP_DBG_OBJCNT_DEC(addr);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/* Create a network byte-order representation of all the addresses
+ * formated as SCTP parameters.
+ *
+ * The second argument is the return value for the length.
+ */
+union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
+ int *addrs_len,
+ gfp_t gfp)
+{
+ union sctp_params addrparms;
+ union sctp_params retval;
+ int addrparms_len;
+ union sctp_addr_param rawaddr;
+ int len;
+ struct sctp_sockaddr_entry *addr;
+ struct list_head *pos;
+ struct sctp_af *af;
+
+ addrparms_len = 0;
+ len = 0;
+
+ /* Allocate enough memory at once. */
+ list_for_each(pos, &bp->address_list) {
+ len += sizeof(union sctp_addr_param);
+ }
+
+ /* Don't even bother embedding an address if there
+ * is only one.
+ */
+ if (len == sizeof(union sctp_addr_param)) {
+ retval.v = NULL;
+ goto end_raw;
+ }
+
+ retval.v = kmalloc(len, gfp);
+ if (!retval.v)
+ goto end_raw;
+
+ addrparms = retval;
+
+ list_for_each_entry(addr, &bp->address_list, list) {
+ af = sctp_get_af_specific(addr->a.v4.sin_family);
+ len = af->to_addr_param(&addr->a, &rawaddr);
+ memcpy(addrparms.v, &rawaddr, len);
+ addrparms.v += len;
+ addrparms_len += len;
+ }
+
+end_raw:
+ *addrs_len = addrparms_len;
+ return retval;
+}
+
+/*
+ * Create an address list out of the raw address list format (IPv4 and IPv6
+ * address parameters).
+ */
+int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
+ int addrs_len, __u16 port, gfp_t gfp)
+{
+ union sctp_addr_param *rawaddr;
+ struct sctp_paramhdr *param;
+ union sctp_addr addr;
+ int retval = 0;
+ int len;
+ struct sctp_af *af;
+
+ /* Convert the raw address to standard address format */
+ while (addrs_len) {
+ param = (struct sctp_paramhdr *)raw_addr_list;
+ rawaddr = (union sctp_addr_param *)raw_addr_list;
+
+ af = sctp_get_af_specific(param_type2af(param->type));
+ if (unlikely(!af)) {
+ retval = -EINVAL;
+ sctp_bind_addr_clean(bp);
+ break;
+ }
+
+ af->from_addr_param(&addr, rawaddr, htons(port), 0);
+ retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp);
+ if (retval) {
+ /* Can't finish building the list, clean up. */
+ sctp_bind_addr_clean(bp);
+ break;
+ }
+
+ len = ntohs(param->length);
+ addrs_len -= len;
+ raw_addr_list += len;
+ }
+
+ return retval;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Does this contain a specified address? Allow wildcarding. */
+int sctp_bind_addr_match(struct sctp_bind_addr *bp,
+ const union sctp_addr *addr,
+ struct sctp_sock *opt)
+{
+ struct sctp_sockaddr_entry *laddr;
+ int match = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid)
+ continue;
+ if (opt->pf->cmp_addr(&laddr->a, addr, opt)) {
+ match = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return match;
+}
+
+/* Does the address 'addr' conflict with any addresses in
+ * the bp.
+ */
+int sctp_bind_addr_conflict(struct sctp_bind_addr *bp,
+ const union sctp_addr *addr,
+ struct sctp_sock *bp_sp,
+ struct sctp_sock *addr_sp)
+{
+ struct sctp_sockaddr_entry *laddr;
+ int conflict = 0;
+ struct sctp_sock *sp;
+
+ /* Pick the IPv6 socket as the basis of comparison
+ * since it's usually a superset of the IPv4.
+ * If there is no IPv6 socket, then default to bind_addr.
+ */
+ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6)
+ sp = bp_sp;
+ else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6)
+ sp = addr_sp;
+ else
+ sp = bp_sp;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid)
+ continue;
+
+ conflict = sp->pf->cmp_addr(&laddr->a, addr, sp);
+ if (conflict)
+ break;
+ }
+ rcu_read_unlock();
+
+ return conflict;
+}
+
+/* Get the state of the entry in the bind_addr_list */
+int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
+ const union sctp_addr *addr)
+{
+ struct sctp_sockaddr_entry *laddr;
+ struct sctp_af *af;
+ int state = -1;
+
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ if (unlikely(!af))
+ return state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid)
+ continue;
+ if (af->cmp_addr(&laddr->a, addr)) {
+ state = laddr->state;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return state;
+}
+
+/* Find the first address in the bind address list that is not present in
+ * the addrs packed array.
+ */
+union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
+ const union sctp_addr *addrs,
+ int addrcnt,
+ struct sctp_sock *opt)
+{
+ struct sctp_sockaddr_entry *laddr;
+ union sctp_addr *addr;
+ void *addr_buf;
+ struct sctp_af *af;
+ int i;
+
+ /* This is only called sctp_send_asconf_del_ip() and we hold
+ * the socket lock in that code patch, so that address list
+ * can't change.
+ */
+ list_for_each_entry(laddr, &bp->address_list, list) {
+ addr_buf = (union sctp_addr *)addrs;
+ for (i = 0; i < addrcnt; i++) {
+ addr = addr_buf;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ if (!af)
+ break;
+
+ if (opt->pf->cmp_addr(&laddr->a, addr, opt))
+ break;
+
+ addr_buf += af->sockaddr_len;
+ }
+ if (i == addrcnt)
+ return &laddr->a;
+ }
+
+ return NULL;
+}
+
+/* Copy out addresses from the global local address list. */
+static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest,
+ union sctp_addr *addr,
+ sctp_scope_t scope, gfp_t gfp,
+ int flags)
+{
+ int error = 0;
+
+ if (sctp_is_any(NULL, addr)) {
+ error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags);
+ } else if (sctp_in_scope(net, addr, scope)) {
+ /* Now that the address is in scope, check to see if
+ * the address type is supported by local sock as
+ * well as the remote peer.
+ */
+ if ((((AF_INET == addr->sa.sa_family) &&
+ (flags & SCTP_ADDR4_PEERSUPP))) ||
+ (((AF_INET6 == addr->sa.sa_family) &&
+ (flags & SCTP_ADDR6_ALLOWED) &&
+ (flags & SCTP_ADDR6_PEERSUPP))))
+ error = sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC,
+ gfp);
+ }
+
+ return error;
+}
+
+/* Is this a wildcard address? */
+int sctp_is_any(struct sock *sk, const union sctp_addr *addr)
+{
+ unsigned short fam = 0;
+ struct sctp_af *af;
+
+ /* Try to get the right address family */
+ if (addr->sa.sa_family != AF_UNSPEC)
+ fam = addr->sa.sa_family;
+ else if (sk)
+ fam = sk->sk_family;
+
+ af = sctp_get_af_specific(fam);
+ if (!af)
+ return 0;
+
+ return af->is_any(addr);
+}
+
+/* Is 'addr' valid for 'scope'? */
+int sctp_in_scope(struct net *net, const union sctp_addr *addr, sctp_scope_t scope)
+{
+ sctp_scope_t addr_scope = sctp_scope(addr);
+
+ /* The unusable SCTP addresses will not be considered with
+ * any defined scopes.
+ */
+ if (SCTP_SCOPE_UNUSABLE == addr_scope)
+ return 0;
+ /*
+ * For INIT and INIT-ACK address list, let L be the level of
+ * of requested destination address, sender and receiver
+ * SHOULD include all of its addresses with level greater
+ * than or equal to L.
+ *
+ * Address scoping can be selectively controlled via sysctl
+ * option
+ */
+ switch (net->sctp.scope_policy) {
+ case SCTP_SCOPE_POLICY_DISABLE:
+ return 1;
+ case SCTP_SCOPE_POLICY_ENABLE:
+ if (addr_scope <= scope)
+ return 1;
+ break;
+ case SCTP_SCOPE_POLICY_PRIVATE:
+ if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope)
+ return 1;
+ break;
+ case SCTP_SCOPE_POLICY_LINK:
+ if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope)
+ return 1;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+int sctp_is_ep_boundall(struct sock *sk)
+{
+ struct sctp_bind_addr *bp;
+ struct sctp_sockaddr_entry *addr;
+
+ bp = &sctp_sk(sk)->ep->base.bind_addr;
+ if (sctp_list_single_entry(&bp->address_list)) {
+ addr = list_entry(bp->address_list.next,
+ struct sctp_sockaddr_entry, list);
+ if (sctp_is_any(sk, &addr->a))
+ return 1;
+ }
+ return 0;
+}
+
+/********************************************************************
+ * 3rd Level Abstractions
+ ********************************************************************/
+
+/* What is the scope of 'addr'? */
+sctp_scope_t sctp_scope(const union sctp_addr *addr)
+{
+ struct sctp_af *af;
+
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ if (!af)
+ return SCTP_SCOPE_UNUSABLE;
+
+ return af->scope((union sctp_addr *)addr);
+}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
new file mode 100644
index 000000000..a3380917f
--- /dev/null
+++ b/net/sctp/chunk.c
@@ -0,0 +1,365 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2003, 2004
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This file contains the code relating the chunk abstraction.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* This file is mostly in anticipation of future work, but initially
+ * populate with fragment tracking for an outbound message.
+ */
+
+/* Initialize datamsg from memory. */
+static void sctp_datamsg_init(struct sctp_datamsg *msg)
+{
+ atomic_set(&msg->refcnt, 1);
+ msg->send_failed = 0;
+ msg->send_error = 0;
+ msg->can_abandon = 0;
+ msg->can_delay = 1;
+ msg->expires_at = 0;
+ INIT_LIST_HEAD(&msg->chunks);
+}
+
+/* Allocate and initialize datamsg. */
+static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
+{
+ struct sctp_datamsg *msg;
+ msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
+ if (msg) {
+ sctp_datamsg_init(msg);
+ SCTP_DBG_OBJCNT_INC(datamsg);
+ }
+ return msg;
+}
+
+void sctp_datamsg_free(struct sctp_datamsg *msg)
+{
+ struct sctp_chunk *chunk;
+
+ /* This doesn't have to be a _safe vairant because
+ * sctp_chunk_free() only drops the refs.
+ */
+ list_for_each_entry(chunk, &msg->chunks, frag_list)
+ sctp_chunk_free(chunk);
+
+ sctp_datamsg_put(msg);
+}
+
+/* Final destructruction of datamsg memory. */
+static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
+{
+ struct list_head *pos, *temp;
+ struct sctp_chunk *chunk;
+ struct sctp_sock *sp;
+ struct sctp_ulpevent *ev;
+ struct sctp_association *asoc = NULL;
+ int error = 0, notify;
+
+ /* If we failed, we may need to notify. */
+ notify = msg->send_failed ? -1 : 0;
+
+ /* Release all references. */
+ list_for_each_safe(pos, temp, &msg->chunks) {
+ list_del_init(pos);
+ chunk = list_entry(pos, struct sctp_chunk, frag_list);
+ /* Check whether we _really_ need to notify. */
+ if (notify < 0) {
+ asoc = chunk->asoc;
+ if (msg->send_error)
+ error = msg->send_error;
+ else
+ error = asoc->outqueue.error;
+
+ sp = sctp_sk(asoc->base.sk);
+ notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
+ &sp->subscribe);
+ }
+
+ /* Generate a SEND FAILED event only if enabled. */
+ if (notify > 0) {
+ int sent;
+ if (chunk->has_tsn)
+ sent = SCTP_DATA_SENT;
+ else
+ sent = SCTP_DATA_UNSENT;
+
+ ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
+ error, GFP_ATOMIC);
+ if (ev)
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+ }
+
+ sctp_chunk_put(chunk);
+ }
+
+ SCTP_DBG_OBJCNT_DEC(datamsg);
+ kfree(msg);
+}
+
+/* Hold a reference. */
+static void sctp_datamsg_hold(struct sctp_datamsg *msg)
+{
+ atomic_inc(&msg->refcnt);
+}
+
+/* Release a reference. */
+void sctp_datamsg_put(struct sctp_datamsg *msg)
+{
+ if (atomic_dec_and_test(&msg->refcnt))
+ sctp_datamsg_destroy(msg);
+}
+
+/* Assign a chunk to this datamsg. */
+static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk)
+{
+ sctp_datamsg_hold(msg);
+ chunk->msg = msg;
+}
+
+
+/* A data chunk can have a maximum payload of (2^16 - 20). Break
+ * down any such message into smaller chunks. Opportunistically, fragment
+ * the chunks down to the current MTU constraints. We may get refragmented
+ * later if the PMTU changes, but it is _much better_ to fragment immediately
+ * with a reasonable guess than always doing our fragmentation on the
+ * soft-interrupt.
+ */
+struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
+ struct sctp_sndrcvinfo *sinfo,
+ struct iov_iter *from)
+{
+ int max, whole, i, offset, over, err;
+ int len, first_len;
+ int max_data;
+ struct sctp_chunk *chunk;
+ struct sctp_datamsg *msg;
+ struct list_head *pos, *temp;
+ size_t msg_len = iov_iter_count(from);
+ __u8 frag;
+
+ msg = sctp_datamsg_new(GFP_KERNEL);
+ if (!msg)
+ return ERR_PTR(-ENOMEM);
+
+ /* Note: Calculate this outside of the loop, so that all fragments
+ * have the same expiration.
+ */
+ if (sinfo->sinfo_timetolive) {
+ /* sinfo_timetolive is in milliseconds */
+ msg->expires_at = jiffies +
+ msecs_to_jiffies(sinfo->sinfo_timetolive);
+ msg->can_abandon = 1;
+
+ pr_debug("%s: msg:%p expires_at:%ld jiffies:%ld\n", __func__,
+ msg, msg->expires_at, jiffies);
+ }
+
+ /* This is the biggest possible DATA chunk that can fit into
+ * the packet
+ */
+ max_data = (asoc->pathmtu -
+ sctp_sk(asoc->base.sk)->pf->af->net_header_len -
+ sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
+
+ max = asoc->frag_point;
+ /* If the the peer requested that we authenticate DATA chunks
+ * we need to account for bundling of the AUTH chunks along with
+ * DATA.
+ */
+ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) {
+ struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+
+ if (hmac_desc)
+ max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
+ hmac_desc->hmac_len);
+ }
+
+ /* Now, check if we need to reduce our max */
+ if (max > max_data)
+ max = max_data;
+
+ whole = 0;
+ first_len = max;
+
+ /* Check to see if we have a pending SACK and try to let it be bundled
+ * with this message. Do this if we don't have any data queued already.
+ * To check that, look at out_qlen and retransmit list.
+ * NOTE: we will not reduce to account for SACK, if the message would
+ * not have been fragmented.
+ */
+ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
+ asoc->outqueue.out_qlen == 0 &&
+ list_empty(&asoc->outqueue.retransmit) &&
+ msg_len > max)
+ max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
+
+ /* Encourage Cookie-ECHO bundling. */
+ if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
+ max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
+
+ /* Now that we adjusted completely, reset first_len */
+ if (first_len > max_data)
+ first_len = max_data;
+
+ /* Account for a different sized first fragment */
+ if (msg_len >= first_len) {
+ msg_len -= first_len;
+ whole = 1;
+ msg->can_delay = 0;
+ }
+
+ /* How many full sized? How many bytes leftover? */
+ whole += msg_len / max;
+ over = msg_len % max;
+ offset = 0;
+
+ if ((whole > 1) || (whole && over))
+ SCTP_INC_STATS_USER(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS);
+
+ /* Create chunks for all the full sized DATA chunks. */
+ for (i = 0, len = first_len; i < whole; i++) {
+ frag = SCTP_DATA_MIDDLE_FRAG;
+
+ if (0 == i)
+ frag |= SCTP_DATA_FIRST_FRAG;
+
+ if ((i == (whole - 1)) && !over) {
+ frag |= SCTP_DATA_LAST_FRAG;
+
+ /* The application requests to set the I-bit of the
+ * last DATA chunk of a user message when providing
+ * the user message to the SCTP implementation.
+ */
+ if ((sinfo->sinfo_flags & SCTP_EOF) ||
+ (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
+ frag |= SCTP_DATA_SACK_IMM;
+ }
+
+ chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0);
+
+ if (!chunk) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ err = sctp_user_addto_chunk(chunk, len, from);
+ if (err < 0)
+ goto errout_chunk_free;
+
+ /* Put the chunk->skb back into the form expected by send. */
+ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
+ - (__u8 *)chunk->skb->data);
+
+ sctp_datamsg_assign(msg, chunk);
+ list_add_tail(&chunk->frag_list, &msg->chunks);
+
+ /* The first chunk, the first chunk was likely short
+ * to allow bundling, so reset to full size.
+ */
+ if (0 == i)
+ len = max;
+ }
+
+ /* .. now the leftover bytes. */
+ if (over) {
+ if (!whole)
+ frag = SCTP_DATA_NOT_FRAG;
+ else
+ frag = SCTP_DATA_LAST_FRAG;
+
+ if ((sinfo->sinfo_flags & SCTP_EOF) ||
+ (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
+ frag |= SCTP_DATA_SACK_IMM;
+
+ chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0);
+
+ if (!chunk) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ err = sctp_user_addto_chunk(chunk, over, from);
+
+ /* Put the chunk->skb back into the form expected by send. */
+ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
+ - (__u8 *)chunk->skb->data);
+ if (err < 0)
+ goto errout_chunk_free;
+
+ sctp_datamsg_assign(msg, chunk);
+ list_add_tail(&chunk->frag_list, &msg->chunks);
+ }
+
+ return msg;
+
+errout_chunk_free:
+ sctp_chunk_free(chunk);
+
+errout:
+ list_for_each_safe(pos, temp, &msg->chunks) {
+ list_del_init(pos);
+ chunk = list_entry(pos, struct sctp_chunk, frag_list);
+ sctp_chunk_free(chunk);
+ }
+ sctp_datamsg_put(msg);
+ return ERR_PTR(err);
+}
+
+/* Check whether this message has expired. */
+int sctp_chunk_abandoned(struct sctp_chunk *chunk)
+{
+ struct sctp_datamsg *msg = chunk->msg;
+
+ if (!msg->can_abandon)
+ return 0;
+
+ if (time_after(jiffies, msg->expires_at))
+ return 1;
+
+ return 0;
+}
+
+/* This chunk (and consequently entire message) has failed in its sending. */
+void sctp_chunk_fail(struct sctp_chunk *chunk, int error)
+{
+ chunk->msg->send_failed = 1;
+ chunk->msg->send_error = error;
+}
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
new file mode 100644
index 000000000..95d7b15da
--- /dev/null
+++ b/net/sctp/debug.c
@@ -0,0 +1,172 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This file converts numerical ID value to alphabetical names for SCTP
+ * terms such as chunk type, parameter time, event type, etc.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#include <net/sctp/sctp.h>
+
+/* These are printable forms of Chunk ID's from section 3.1. */
+static const char *const sctp_cid_tbl[SCTP_NUM_BASE_CHUNK_TYPES] = {
+ "DATA",
+ "INIT",
+ "INIT_ACK",
+ "SACK",
+ "HEARTBEAT",
+ "HEARTBEAT_ACK",
+ "ABORT",
+ "SHUTDOWN",
+ "SHUTDOWN_ACK",
+ "ERROR",
+ "COOKIE_ECHO",
+ "COOKIE_ACK",
+ "ECN_ECNE",
+ "ECN_CWR",
+ "SHUTDOWN_COMPLETE",
+};
+
+/* Lookup "chunk type" debug name. */
+const char *sctp_cname(const sctp_subtype_t cid)
+{
+ if (cid.chunk <= SCTP_CID_BASE_MAX)
+ return sctp_cid_tbl[cid.chunk];
+
+ switch (cid.chunk) {
+ case SCTP_CID_ASCONF:
+ return "ASCONF";
+
+ case SCTP_CID_ASCONF_ACK:
+ return "ASCONF_ACK";
+
+ case SCTP_CID_FWD_TSN:
+ return "FWD_TSN";
+
+ case SCTP_CID_AUTH:
+ return "AUTH";
+
+ default:
+ break;
+ }
+
+ return "unknown chunk";
+}
+
+/* These are printable forms of the states. */
+const char *const sctp_state_tbl[SCTP_STATE_NUM_STATES] = {
+ "STATE_CLOSED",
+ "STATE_COOKIE_WAIT",
+ "STATE_COOKIE_ECHOED",
+ "STATE_ESTABLISHED",
+ "STATE_SHUTDOWN_PENDING",
+ "STATE_SHUTDOWN_SENT",
+ "STATE_SHUTDOWN_RECEIVED",
+ "STATE_SHUTDOWN_ACK_SENT",
+};
+
+/* Events that could change the state of an association. */
+const char *const sctp_evttype_tbl[] = {
+ "EVENT_T_unknown",
+ "EVENT_T_CHUNK",
+ "EVENT_T_TIMEOUT",
+ "EVENT_T_OTHER",
+ "EVENT_T_PRIMITIVE"
+};
+
+/* Return value of a state function */
+const char *const sctp_status_tbl[] = {
+ "DISPOSITION_DISCARD",
+ "DISPOSITION_CONSUME",
+ "DISPOSITION_NOMEM",
+ "DISPOSITION_DELETE_TCB",
+ "DISPOSITION_ABORT",
+ "DISPOSITION_VIOLATION",
+ "DISPOSITION_NOT_IMPL",
+ "DISPOSITION_ERROR",
+ "DISPOSITION_BUG"
+};
+
+/* Printable forms of primitives */
+static const char *const sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = {
+ "PRIMITIVE_ASSOCIATE",
+ "PRIMITIVE_SHUTDOWN",
+ "PRIMITIVE_ABORT",
+ "PRIMITIVE_SEND",
+ "PRIMITIVE_REQUESTHEARTBEAT",
+ "PRIMITIVE_ASCONF",
+};
+
+/* Lookup primitive debug name. */
+const char *sctp_pname(const sctp_subtype_t id)
+{
+ if (id.primitive <= SCTP_EVENT_PRIMITIVE_MAX)
+ return sctp_primitive_tbl[id.primitive];
+ return "unknown_primitive";
+}
+
+static const char *const sctp_other_tbl[] = {
+ "NO_PENDING_TSN",
+ "ICMP_PROTO_UNREACH",
+};
+
+/* Lookup "other" debug name. */
+const char *sctp_oname(const sctp_subtype_t id)
+{
+ if (id.other <= SCTP_EVENT_OTHER_MAX)
+ return sctp_other_tbl[id.other];
+ return "unknown 'other' event";
+}
+
+static const char *const sctp_timer_tbl[] = {
+ "TIMEOUT_NONE",
+ "TIMEOUT_T1_COOKIE",
+ "TIMEOUT_T1_INIT",
+ "TIMEOUT_T2_SHUTDOWN",
+ "TIMEOUT_T3_RTX",
+ "TIMEOUT_T4_RTO",
+ "TIMEOUT_T5_SHUTDOWN_GUARD",
+ "TIMEOUT_HEARTBEAT",
+ "TIMEOUT_SACK",
+ "TIMEOUT_AUTOCLOSE",
+};
+
+/* Lookup timer debug name. */
+const char *sctp_tname(const sctp_subtype_t id)
+{
+ if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX)
+ return sctp_timer_tbl[id.timeout];
+ return "unknown_timer";
+}
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
new file mode 100644
index 000000000..9da76ba4d
--- /dev/null
+++ b/net/sctp/endpointola.c
@@ -0,0 +1,501 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 International Business Machines, Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This abstraction represents an SCTP endpoint.
+ *
+ * The SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * The SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@austin.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Dajiang Zhang <dajiang.zhang@nokia.com>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/random.h> /* get_random_bytes() */
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers. */
+static void sctp_endpoint_bh_rcv(struct work_struct *work);
+
+/*
+ * Initialize the base fields of the endpoint structure.
+ */
+static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
+ struct sock *sk,
+ gfp_t gfp)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_hmac_algo_param *auth_hmacs = NULL;
+ struct sctp_chunks_param *auth_chunks = NULL;
+ struct sctp_shared_key *null_key;
+ int err;
+
+ ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
+ if (!ep->digest)
+ return NULL;
+
+ ep->auth_enable = net->sctp.auth_enable;
+ if (ep->auth_enable) {
+ /* Allocate space for HMACS and CHUNKS authentication
+ * variables. There are arrays that we encode directly
+ * into parameters to make the rest of the operations easier.
+ */
+ auth_hmacs = kzalloc(sizeof(sctp_hmac_algo_param_t) +
+ sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp);
+ if (!auth_hmacs)
+ goto nomem;
+
+ auth_chunks = kzalloc(sizeof(sctp_chunks_param_t) +
+ SCTP_NUM_CHUNK_TYPES, gfp);
+ if (!auth_chunks)
+ goto nomem;
+
+ /* Initialize the HMACS parameter.
+ * SCTP-AUTH: Section 3.3
+ * Every endpoint supporting SCTP chunk authentication MUST
+ * support the HMAC based on the SHA-1 algorithm.
+ */
+ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO;
+ auth_hmacs->param_hdr.length =
+ htons(sizeof(sctp_paramhdr_t) + 2);
+ auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1);
+
+ /* Initialize the CHUNKS parameter */
+ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS;
+ auth_chunks->param_hdr.length = htons(sizeof(sctp_paramhdr_t));
+
+ /* If the Add-IP functionality is enabled, we must
+ * authenticate, ASCONF and ASCONF-ACK chunks
+ */
+ if (net->sctp.addip_enable) {
+ auth_chunks->chunks[0] = SCTP_CID_ASCONF;
+ auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
+ auth_chunks->param_hdr.length =
+ htons(sizeof(sctp_paramhdr_t) + 2);
+ }
+ }
+
+ /* Initialize the base structure. */
+ /* What type of endpoint are we? */
+ ep->base.type = SCTP_EP_TYPE_SOCKET;
+
+ /* Initialize the basic object fields. */
+ atomic_set(&ep->base.refcnt, 1);
+ ep->base.dead = false;
+
+ /* Create an input queue. */
+ sctp_inq_init(&ep->base.inqueue);
+
+ /* Set its top-half handler */
+ sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv);
+
+ /* Initialize the bind addr area */
+ sctp_bind_addr_init(&ep->base.bind_addr, 0);
+
+ /* Remember who we are attached to. */
+ ep->base.sk = sk;
+ sock_hold(ep->base.sk);
+
+ /* Create the lists of associations. */
+ INIT_LIST_HEAD(&ep->asocs);
+
+ /* Use SCTP specific send buffer space queues. */
+ ep->sndbuf_policy = net->sctp.sndbuf_policy;
+
+ sk->sk_data_ready = sctp_data_ready;
+ sk->sk_write_space = sctp_write_space;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Get the receive buffer policy for this endpoint */
+ ep->rcvbuf_policy = net->sctp.rcvbuf_policy;
+
+ /* Initialize the secret key used with cookie. */
+ get_random_bytes(ep->secret_key, sizeof(ep->secret_key));
+
+ /* SCTP-AUTH extensions*/
+ INIT_LIST_HEAD(&ep->endpoint_shared_keys);
+ null_key = sctp_auth_shkey_create(0, gfp);
+ if (!null_key)
+ goto nomem;
+
+ list_add(&null_key->key_list, &ep->endpoint_shared_keys);
+
+ /* Allocate and initialize transorms arrays for supported HMACs. */
+ err = sctp_auth_init_hmacs(ep, gfp);
+ if (err)
+ goto nomem_hmacs;
+
+ /* Add the null key to the endpoint shared keys list and
+ * set the hmcas and chunks pointers.
+ */
+ ep->auth_hmacs_list = auth_hmacs;
+ ep->auth_chunk_list = auth_chunks;
+
+ return ep;
+
+nomem_hmacs:
+ sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
+nomem:
+ /* Free all allocations */
+ kfree(auth_hmacs);
+ kfree(auth_chunks);
+ kfree(ep->digest);
+ return NULL;
+
+}
+
+/* Create a sctp_endpoint with all that boring stuff initialized.
+ * Returns NULL if there isn't enough memory.
+ */
+struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp)
+{
+ struct sctp_endpoint *ep;
+
+ /* Build a local endpoint. */
+ ep = kzalloc(sizeof(*ep), gfp);
+ if (!ep)
+ goto fail;
+
+ if (!sctp_endpoint_init(ep, sk, gfp))
+ goto fail_init;
+
+ SCTP_DBG_OBJCNT_INC(ep);
+ return ep;
+
+fail_init:
+ kfree(ep);
+fail:
+ return NULL;
+}
+
+/* Add an association to an endpoint. */
+void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
+ struct sctp_association *asoc)
+{
+ struct sock *sk = ep->base.sk;
+
+ /* If this is a temporary association, don't bother
+ * since we'll be removing it shortly and don't
+ * want anyone to find it anyway.
+ */
+ if (asoc->temp)
+ return;
+
+ /* Now just add it to our list of asocs */
+ list_add_tail(&asoc->asocs, &ep->asocs);
+
+ /* Increment the backlog value for a TCP-style listening socket. */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+ sk->sk_ack_backlog++;
+}
+
+/* Free the endpoint structure. Delay cleanup until
+ * all users have released their reference count on this structure.
+ */
+void sctp_endpoint_free(struct sctp_endpoint *ep)
+{
+ ep->base.dead = true;
+
+ ep->base.sk->sk_state = SCTP_SS_CLOSED;
+
+ /* Unlink this endpoint, so we can't find it again! */
+ sctp_unhash_endpoint(ep);
+
+ sctp_endpoint_put(ep);
+}
+
+/* Final destructor for endpoint. */
+static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
+{
+ struct sock *sk;
+
+ if (unlikely(!ep->base.dead)) {
+ WARN(1, "Attempt to destroy undead endpoint %p!\n", ep);
+ return;
+ }
+
+ /* Free the digest buffer */
+ kfree(ep->digest);
+
+ /* SCTP-AUTH: Free up AUTH releated data such as shared keys
+ * chunks and hmacs arrays that were allocated
+ */
+ sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
+ kfree(ep->auth_hmacs_list);
+ kfree(ep->auth_chunk_list);
+
+ /* AUTH - Free any allocated HMAC transform containers */
+ sctp_auth_destroy_hmacs(ep->auth_hmacs);
+
+ /* Cleanup. */
+ sctp_inq_free(&ep->base.inqueue);
+ sctp_bind_addr_free(&ep->base.bind_addr);
+
+ memset(ep->secret_key, 0, sizeof(ep->secret_key));
+
+ /* Give up our hold on the sock. */
+ sk = ep->base.sk;
+ if (sk != NULL) {
+ /* Remove and free the port */
+ if (sctp_sk(sk)->bind_hash)
+ sctp_put_port(sk);
+
+ sock_put(sk);
+ }
+
+ kfree(ep);
+ SCTP_DBG_OBJCNT_DEC(ep);
+}
+
+/* Hold a reference to an endpoint. */
+void sctp_endpoint_hold(struct sctp_endpoint *ep)
+{
+ atomic_inc(&ep->base.refcnt);
+}
+
+/* Release a reference to an endpoint and clean up if there are
+ * no more references.
+ */
+void sctp_endpoint_put(struct sctp_endpoint *ep)
+{
+ if (atomic_dec_and_test(&ep->base.refcnt))
+ sctp_endpoint_destroy(ep);
+}
+
+/* Is this the endpoint we are looking for? */
+struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
+ struct net *net,
+ const union sctp_addr *laddr)
+{
+ struct sctp_endpoint *retval = NULL;
+
+ if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) &&
+ net_eq(sock_net(ep->base.sk), net)) {
+ if (sctp_bind_addr_match(&ep->base.bind_addr, laddr,
+ sctp_sk(ep->base.sk)))
+ retval = ep;
+ }
+
+ return retval;
+}
+
+/* Find the association that goes with this chunk.
+ * We do a linear search of the associations for this endpoint.
+ * We return the matching transport address too.
+ */
+static struct sctp_association *__sctp_endpoint_lookup_assoc(
+ const struct sctp_endpoint *ep,
+ const union sctp_addr *paddr,
+ struct sctp_transport **transport)
+{
+ struct sctp_association *asoc = NULL;
+ struct sctp_association *tmp;
+ struct sctp_transport *t = NULL;
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ int hash;
+ int rport;
+
+ *transport = NULL;
+
+ /* If the local port is not set, there can't be any associations
+ * on this endpoint.
+ */
+ if (!ep->base.bind_addr.port)
+ goto out;
+
+ rport = ntohs(paddr->v4.sin_port);
+
+ hash = sctp_assoc_hashfn(sock_net(ep->base.sk), ep->base.bind_addr.port,
+ rport);
+ head = &sctp_assoc_hashtable[hash];
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ tmp = sctp_assoc(epb);
+ if (tmp->ep != ep || rport != tmp->peer.port)
+ continue;
+
+ t = sctp_assoc_lookup_paddr(tmp, paddr);
+ if (t) {
+ asoc = tmp;
+ *transport = t;
+ break;
+ }
+ }
+ read_unlock(&head->lock);
+out:
+ return asoc;
+}
+
+/* Lookup association on an endpoint based on a peer address. BH-safe. */
+struct sctp_association *sctp_endpoint_lookup_assoc(
+ const struct sctp_endpoint *ep,
+ const union sctp_addr *paddr,
+ struct sctp_transport **transport)
+{
+ struct sctp_association *asoc;
+
+ local_bh_disable();
+ asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport);
+ local_bh_enable();
+
+ return asoc;
+}
+
+/* Look for any peeled off association from the endpoint that matches the
+ * given peer address.
+ */
+int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
+ const union sctp_addr *paddr)
+{
+ struct sctp_sockaddr_entry *addr;
+ struct sctp_bind_addr *bp;
+ struct net *net = sock_net(ep->base.sk);
+
+ bp = &ep->base.bind_addr;
+ /* This function is called with the socket lock held,
+ * so the address_list can not change.
+ */
+ list_for_each_entry(addr, &bp->address_list, list) {
+ if (sctp_has_association(net, &addr->a, paddr))
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Do delayed input processing. This is scheduled by sctp_rcv().
+ * This may be called on BH or task time.
+ */
+static void sctp_endpoint_bh_rcv(struct work_struct *work)
+{
+ struct sctp_endpoint *ep =
+ container_of(work, struct sctp_endpoint,
+ base.inqueue.immediate);
+ struct sctp_association *asoc;
+ struct sock *sk;
+ struct net *net;
+ struct sctp_transport *transport;
+ struct sctp_chunk *chunk;
+ struct sctp_inq *inqueue;
+ sctp_subtype_t subtype;
+ sctp_state_t state;
+ int error = 0;
+ int first_time = 1; /* is this the first time through the loop */
+
+ if (ep->base.dead)
+ return;
+
+ asoc = NULL;
+ inqueue = &ep->base.inqueue;
+ sk = ep->base.sk;
+ net = sock_net(sk);
+
+ while (NULL != (chunk = sctp_inq_pop(inqueue))) {
+ subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
+
+ /* If the first chunk in the packet is AUTH, do special
+ * processing specified in Section 6.3 of SCTP-AUTH spec
+ */
+ if (first_time && (subtype.chunk == SCTP_CID_AUTH)) {
+ struct sctp_chunkhdr *next_hdr;
+
+ next_hdr = sctp_inq_peek(inqueue);
+ if (!next_hdr)
+ goto normal;
+
+ /* If the next chunk is COOKIE-ECHO, skip the AUTH
+ * chunk while saving a pointer to it so we can do
+ * Authentication later (during cookie-echo
+ * processing).
+ */
+ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
+ chunk->auth_chunk = skb_clone(chunk->skb,
+ GFP_ATOMIC);
+ chunk->auth = 1;
+ continue;
+ }
+ }
+normal:
+ /* We might have grown an association since last we
+ * looked, so try again.
+ *
+ * This happens when we've just processed our
+ * COOKIE-ECHO chunk.
+ */
+ if (NULL == chunk->asoc) {
+ asoc = sctp_endpoint_lookup_assoc(ep,
+ sctp_source(chunk),
+ &transport);
+ chunk->asoc = asoc;
+ chunk->transport = transport;
+ }
+
+ state = asoc ? asoc->state : SCTP_STATE_CLOSED;
+ if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
+ continue;
+
+ /* Remember where the last DATA chunk came from so we
+ * know where to send the SACK.
+ */
+ if (asoc && sctp_chunk_is_data(chunk))
+ asoc->peer.last_data_from = chunk->transport;
+ else {
+ SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS);
+ if (asoc)
+ asoc->stats.ictrlchunks++;
+ }
+
+ if (chunk->transport)
+ chunk->transport->last_time_heard = ktime_get();
+
+ error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state,
+ ep, asoc, chunk, GFP_ATOMIC);
+
+ if (error && chunk)
+ chunk->pdiscard = 1;
+
+ /* Check to see if the endpoint is freed in response to
+ * the incoming chunk. If so, get out of the while loop.
+ */
+ if (!sctp_sk(sk)->ep)
+ break;
+
+ if (first_time)
+ first_time = 0;
+ }
+}
diff --git a/net/sctp/input.c b/net/sctp/input.c
new file mode 100644
index 000000000..b6493b3f1
--- /dev/null
+++ b/net/sctp/input.c
@@ -0,0 +1,1141 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 International Business Machines, Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions handle all input from the IP layer into SCTP.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/list.h> /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/time.h> /* For struct timeval */
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/checksum.h>
+#include <net/net_namespace.h>
+
+/* Forward declarations for internal helpers. */
+static int sctp_rcv_ootb(struct sk_buff *);
+static struct sctp_association *__sctp_rcv_lookup(struct net *net,
+ struct sk_buff *skb,
+ const union sctp_addr *paddr,
+ const union sctp_addr *laddr,
+ struct sctp_transport **transportp);
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
+ const union sctp_addr *laddr);
+static struct sctp_association *__sctp_lookup_association(
+ struct net *net,
+ const union sctp_addr *local,
+ const union sctp_addr *peer,
+ struct sctp_transport **pt);
+
+static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb);
+
+
+/* Calculate the SCTP checksum of an SCTP packet. */
+static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
+{
+ struct sctphdr *sh = sctp_hdr(skb);
+ __le32 cmp = sh->checksum;
+ __le32 val = sctp_compute_cksum(skb, 0);
+
+ if (val != cmp) {
+ /* CRC failure, dump it. */
+ SCTP_INC_STATS_BH(net, SCTP_MIB_CHECKSUMERRORS);
+ return -1;
+ }
+ return 0;
+}
+
+struct sctp_input_cb {
+ union {
+ struct inet_skb_parm h4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
+ struct sctp_chunk *chunk;
+};
+#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
+
+/*
+ * This is the routine which IP calls when receiving an SCTP packet.
+ */
+int sctp_rcv(struct sk_buff *skb)
+{
+ struct sock *sk;
+ struct sctp_association *asoc;
+ struct sctp_endpoint *ep = NULL;
+ struct sctp_ep_common *rcvr;
+ struct sctp_transport *transport = NULL;
+ struct sctp_chunk *chunk;
+ struct sctphdr *sh;
+ union sctp_addr src;
+ union sctp_addr dest;
+ int family;
+ struct sctp_af *af;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->pkt_type != PACKET_HOST)
+ goto discard_it;
+
+ SCTP_INC_STATS_BH(net, SCTP_MIB_INSCTPPACKS);
+
+ if (skb_linearize(skb))
+ goto discard_it;
+
+ sh = sctp_hdr(skb);
+
+ /* Pull up the IP and SCTP headers. */
+ __skb_pull(skb, skb_transport_offset(skb));
+ if (skb->len < sizeof(struct sctphdr))
+ goto discard_it;
+
+ skb->csum_valid = 0; /* Previous value not applicable */
+ if (skb_csum_unnecessary(skb))
+ __skb_decr_checksum_unnecessary(skb);
+ else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0)
+ goto discard_it;
+ skb->csum_valid = 1;
+
+ skb_pull(skb, sizeof(struct sctphdr));
+
+ /* Make sure we at least have chunk headers worth of data left. */
+ if (skb->len < sizeof(struct sctp_chunkhdr))
+ goto discard_it;
+
+ family = ipver2af(ip_hdr(skb)->version);
+ af = sctp_get_af_specific(family);
+ if (unlikely(!af))
+ goto discard_it;
+
+ /* Initialize local addresses for lookups. */
+ af->from_skb(&src, skb, 1);
+ af->from_skb(&dest, skb, 0);
+
+ /* If the packet is to or from a non-unicast address,
+ * silently discard the packet.
+ *
+ * This is not clearly defined in the RFC except in section
+ * 8.4 - OOTB handling. However, based on the book "Stream Control
+ * Transmission Protocol" 2.1, "It is important to note that the
+ * IP address of an SCTP transport address must be a routable
+ * unicast address. In other words, IP multicast addresses and
+ * IP broadcast addresses cannot be used in an SCTP transport
+ * address."
+ */
+ if (!af->addr_valid(&src, NULL, skb) ||
+ !af->addr_valid(&dest, NULL, skb))
+ goto discard_it;
+
+ asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);
+
+ if (!asoc)
+ ep = __sctp_rcv_lookup_endpoint(net, &dest);
+
+ /* Retrieve the common input handling substructure. */
+ rcvr = asoc ? &asoc->base : &ep->base;
+ sk = rcvr->sk;
+
+ /*
+ * If a frame arrives on an interface and the receiving socket is
+ * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
+ */
+ if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) {
+ if (asoc) {
+ sctp_association_put(asoc);
+ asoc = NULL;
+ } else {
+ sctp_endpoint_put(ep);
+ ep = NULL;
+ }
+ sk = net->sctp.ctl_sock;
+ ep = sctp_sk(sk)->ep;
+ sctp_endpoint_hold(ep);
+ rcvr = &ep->base;
+ }
+
+ /*
+ * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
+ * An SCTP packet is called an "out of the blue" (OOTB)
+ * packet if it is correctly formed, i.e., passed the
+ * receiver's checksum check, but the receiver is not
+ * able to identify the association to which this
+ * packet belongs.
+ */
+ if (!asoc) {
+ if (sctp_rcv_ootb(skb)) {
+ SCTP_INC_STATS_BH(net, SCTP_MIB_OUTOFBLUES);
+ goto discard_release;
+ }
+ }
+
+ if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
+ goto discard_release;
+ nf_reset(skb);
+
+ if (sk_filter(sk, skb))
+ goto discard_release;
+
+ /* Create an SCTP packet structure. */
+ chunk = sctp_chunkify(skb, asoc, sk);
+ if (!chunk)
+ goto discard_release;
+ SCTP_INPUT_CB(skb)->chunk = chunk;
+
+ /* Remember what endpoint is to handle this packet. */
+ chunk->rcvr = rcvr;
+
+ /* Remember the SCTP header. */
+ chunk->sctp_hdr = sh;
+
+ /* Set the source and destination addresses of the incoming chunk. */
+ sctp_init_addrs(chunk, &src, &dest);
+
+ /* Remember where we came from. */
+ chunk->transport = transport;
+
+ /* Acquire access to the sock lock. Note: We are safe from other
+ * bottom halves on this lock, but a user may be in the lock too,
+ * so check if it is busy.
+ */
+ bh_lock_sock(sk);
+
+ if (sk != rcvr->sk) {
+ /* Our cached sk is different from the rcvr->sk. This is
+ * because migrate()/accept() may have moved the association
+ * to a new socket and released all the sockets. So now we
+ * are holding a lock on the old socket while the user may
+ * be doing something with the new socket. Switch our veiw
+ * of the current sk.
+ */
+ bh_unlock_sock(sk);
+ sk = rcvr->sk;
+ bh_lock_sock(sk);
+ }
+
+ if (sock_owned_by_user(sk)) {
+ if (sctp_add_backlog(sk, skb)) {
+ bh_unlock_sock(sk);
+ sctp_chunk_free(chunk);
+ skb = NULL; /* sctp_chunk_free already freed the skb */
+ goto discard_release;
+ }
+ SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_BACKLOG);
+ } else {
+ SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_SOFTIRQ);
+ sctp_inq_push(&chunk->rcvr->inqueue, chunk);
+ }
+
+ bh_unlock_sock(sk);
+
+ /* Release the asoc/ep ref we took in the lookup calls. */
+ if (asoc)
+ sctp_association_put(asoc);
+ else
+ sctp_endpoint_put(ep);
+
+ return 0;
+
+discard_it:
+ SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_DISCARDS);
+ kfree_skb(skb);
+ return 0;
+
+discard_release:
+ /* Release the asoc/ep ref we took in the lookup calls. */
+ if (asoc)
+ sctp_association_put(asoc);
+ else
+ sctp_endpoint_put(ep);
+
+ goto discard_it;
+}
+
+/* Process the backlog queue of the socket. Every skb on
+ * the backlog holds a ref on an association or endpoint.
+ * We hold this ref throughout the state machine to make
+ * sure that the structure we need is still around.
+ */
+int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+ struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
+ struct sctp_ep_common *rcvr = NULL;
+ int backloged = 0;
+
+ rcvr = chunk->rcvr;
+
+ /* If the rcvr is dead then the association or endpoint
+ * has been deleted and we can safely drop the chunk
+ * and refs that we are holding.
+ */
+ if (rcvr->dead) {
+ sctp_chunk_free(chunk);
+ goto done;
+ }
+
+ if (unlikely(rcvr->sk != sk)) {
+ /* In this case, the association moved from one socket to
+ * another. We are currently sitting on the backlog of the
+ * old socket, so we need to move.
+ * However, since we are here in the process context we
+ * need to take make sure that the user doesn't own
+ * the new socket when we process the packet.
+ * If the new socket is user-owned, queue the chunk to the
+ * backlog of the new socket without dropping any refs.
+ * Otherwise, we can safely push the chunk on the inqueue.
+ */
+
+ sk = rcvr->sk;
+ bh_lock_sock(sk);
+
+ if (sock_owned_by_user(sk)) {
+ if (sk_add_backlog(sk, skb, sk->sk_rcvbuf))
+ sctp_chunk_free(chunk);
+ else
+ backloged = 1;
+ } else
+ sctp_inq_push(inqueue, chunk);
+
+ bh_unlock_sock(sk);
+
+ /* If the chunk was backloged again, don't drop refs */
+ if (backloged)
+ return 0;
+ } else {
+ sctp_inq_push(inqueue, chunk);
+ }
+
+done:
+ /* Release the refs we took in sctp_add_backlog */
+ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
+ sctp_association_put(sctp_assoc(rcvr));
+ else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
+ sctp_endpoint_put(sctp_ep(rcvr));
+ else
+ BUG();
+
+ return 0;
+}
+
+static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
+{
+ struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+ struct sctp_ep_common *rcvr = chunk->rcvr;
+ int ret;
+
+ ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf);
+ if (!ret) {
+ /* Hold the assoc/ep while hanging on the backlog queue.
+ * This way, we know structures we need will not disappear
+ * from us
+ */
+ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
+ sctp_association_hold(sctp_assoc(rcvr));
+ else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
+ sctp_endpoint_hold(sctp_ep(rcvr));
+ else
+ BUG();
+ }
+ return ret;
+
+}
+
+/* Handle icmp frag needed error. */
+void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
+ struct sctp_transport *t, __u32 pmtu)
+{
+ if (!t || (t->pathmtu <= pmtu))
+ return;
+
+ if (sock_owned_by_user(sk)) {
+ asoc->pmtu_pending = 1;
+ t->pmtu_pending = 1;
+ return;
+ }
+
+ if (t->param_flags & SPP_PMTUD_ENABLE) {
+ /* Update transports view of the MTU */
+ sctp_transport_update_pmtu(sk, t, pmtu);
+
+ /* Update association pmtu. */
+ sctp_assoc_sync_pmtu(sk, asoc);
+ }
+
+ /* Retransmit with the new pmtu setting.
+ * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
+ * Needed will never be sent, but if a message was sent before
+ * PMTU discovery was disabled that was larger than the PMTU, it
+ * would not be fragmented, so it must be re-transmitted fragmented.
+ */
+ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
+}
+
+void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst;
+
+ if (!t)
+ return;
+ dst = sctp_transport_dst_check(t);
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+}
+
+/*
+ * SCTP Implementer's Guide, 2.37 ICMP handling procedures
+ *
+ * ICMP8) If the ICMP code is a "Unrecognized next header type encountered"
+ * or a "Protocol Unreachable" treat this message as an abort
+ * with the T bit set.
+ *
+ * This function sends an event to the state machine, which will abort the
+ * association.
+ *
+ */
+void sctp_icmp_proto_unreachable(struct sock *sk,
+ struct sctp_association *asoc,
+ struct sctp_transport *t)
+{
+ if (sock_owned_by_user(sk)) {
+ if (timer_pending(&t->proto_unreach_timer))
+ return;
+ else {
+ if (!mod_timer(&t->proto_unreach_timer,
+ jiffies + (HZ/20)))
+ sctp_association_hold(asoc);
+ }
+ } else {
+ struct net *net = sock_net(sk);
+
+ pr_debug("%s: unrecognized next header type "
+ "encountered!\n", __func__);
+
+ if (del_timer(&t->proto_unreach_timer))
+ sctp_association_put(asoc);
+
+ sctp_do_sm(net, SCTP_EVENT_T_OTHER,
+ SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+ asoc->state, asoc->ep, asoc, t,
+ GFP_ATOMIC);
+ }
+}
+
+/* Common lookup code for icmp/icmpv6 error handler. */
+struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
+ struct sctphdr *sctphdr,
+ struct sctp_association **app,
+ struct sctp_transport **tpp)
+{
+ union sctp_addr saddr;
+ union sctp_addr daddr;
+ struct sctp_af *af;
+ struct sock *sk = NULL;
+ struct sctp_association *asoc;
+ struct sctp_transport *transport = NULL;
+ struct sctp_init_chunk *chunkhdr;
+ __u32 vtag = ntohl(sctphdr->vtag);
+ int len = skb->len - ((void *)sctphdr - (void *)skb->data);
+
+ *app = NULL; *tpp = NULL;
+
+ af = sctp_get_af_specific(family);
+ if (unlikely(!af)) {
+ return NULL;
+ }
+
+ /* Initialize local addresses for lookups. */
+ af->from_skb(&saddr, skb, 1);
+ af->from_skb(&daddr, skb, 0);
+
+ /* Look for an association that matches the incoming ICMP error
+ * packet.
+ */
+ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport);
+ if (!asoc)
+ return NULL;
+
+ sk = asoc->base.sk;
+
+ /* RFC 4960, Appendix C. ICMP Handling
+ *
+ * ICMP6) An implementation MUST validate that the Verification Tag
+ * contained in the ICMP message matches the Verification Tag of
+ * the peer. If the Verification Tag is not 0 and does NOT
+ * match, discard the ICMP message. If it is 0 and the ICMP
+ * message contains enough bytes to verify that the chunk type is
+ * an INIT chunk and that the Initiate Tag matches the tag of the
+ * peer, continue with ICMP7. If the ICMP message is too short
+ * or the chunk type or the Initiate Tag does not match, silently
+ * discard the packet.
+ */
+ if (vtag == 0) {
+ chunkhdr = (void *)sctphdr + sizeof(struct sctphdr);
+ if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t)
+ + sizeof(__be32) ||
+ chunkhdr->chunk_hdr.type != SCTP_CID_INIT ||
+ ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) {
+ goto out;
+ }
+ } else if (vtag != asoc->c.peer_vtag) {
+ goto out;
+ }
+
+ bh_lock_sock(sk);
+
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+ *app = asoc;
+ *tpp = transport;
+ return sk;
+
+out:
+ sctp_association_put(asoc);
+ return NULL;
+}
+
+/* Common cleanup code for icmp/icmpv6 error handler. */
+void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
+{
+ bh_unlock_sock(sk);
+ sctp_association_put(asoc);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition. If err < 0 then the socket should
+ * be closed and the error returned to the user. If err > 0
+ * it's just the icmp type << 8 | icmp code. After adjustment
+ * header points to the first 8 bytes of the sctp header. We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+void sctp_v4_err(struct sk_buff *skb, __u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ const int ihlen = iph->ihl * 4;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
+ struct sock *sk;
+ struct sctp_association *asoc = NULL;
+ struct sctp_transport *transport;
+ struct inet_sock *inet;
+ __u16 saveip, savesctp;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ /* Fix up skb to look at the embedded net header. */
+ saveip = skb->network_header;
+ savesctp = skb->transport_header;
+ skb_reset_network_header(skb);
+ skb_set_transport_header(skb, ihlen);
+ sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport);
+ /* Put back, the original values. */
+ skb->network_header = saveip;
+ skb->transport_header = savesctp;
+ if (!sk) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ return;
+ }
+ /* Warning: The sock lock is held. Remember to call
+ * sctp_err_finish!
+ */
+
+ switch (type) {
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out_unlock;
+
+ /* PMTU discovery (RFC1191) */
+ if (ICMP_FRAG_NEEDED == code) {
+ sctp_icmp_frag_needed(sk, asoc, transport, info);
+ goto out_unlock;
+ } else {
+ if (ICMP_PROT_UNREACH == code) {
+ sctp_icmp_proto_unreachable(sk, asoc,
+ transport);
+ goto out_unlock;
+ }
+ }
+ err = icmp_err_convert[code].errno;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ /* Ignore any time exceeded errors due to fragment reassembly
+ * timeouts.
+ */
+ if (ICMP_EXC_FRAGTIME == code)
+ goto out_unlock;
+
+ err = EHOSTUNREACH;
+ break;
+ case ICMP_REDIRECT:
+ sctp_icmp_redirect(sk, transport, skb);
+ /* Fall through to out_unlock. */
+ default:
+ goto out_unlock;
+ }
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+ sk->sk_err_soft = err;
+ }
+
+out_unlock:
+ sctp_err_finish(sk, asoc);
+}
+
+/*
+ * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
+ *
+ * This function scans all the chunks in the OOTB packet to determine if
+ * the packet should be discarded right away. If a response might be needed
+ * for this packet, or, if further processing is possible, the packet will
+ * be queued to a proper inqueue for the next phase of handling.
+ *
+ * Output:
+ * Return 0 - If further processing is needed.
+ * Return 1 - If the packet can be discarded right away.
+ */
+static int sctp_rcv_ootb(struct sk_buff *skb)
+{
+ sctp_chunkhdr_t *ch;
+ __u8 *ch_end;
+
+ ch = (sctp_chunkhdr_t *) skb->data;
+
+ /* Scan through all the chunks in the packet. */
+ do {
+ /* Break out if chunk length is less then minimal. */
+ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+ break;
+
+ ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ if (ch_end > skb_tail_pointer(skb))
+ break;
+
+ /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the
+ * receiver MUST silently discard the OOTB packet and take no
+ * further action.
+ */
+ if (SCTP_CID_ABORT == ch->type)
+ goto discard;
+
+ /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE
+ * chunk, the receiver should silently discard the packet
+ * and take no further action.
+ */
+ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type)
+ goto discard;
+
+ /* RFC 4460, 2.11.2
+ * This will discard packets with INIT chunk bundled as
+ * subsequent chunks in the packet. When INIT is first,
+ * the normal INIT processing will discard the chunk.
+ */
+ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data)
+ goto discard;
+
+ ch = (sctp_chunkhdr_t *) ch_end;
+ } while (ch_end < skb_tail_pointer(skb));
+
+ return 0;
+
+discard:
+ return 1;
+}
+
+/* Insert endpoint into the hash table. */
+static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
+{
+ struct net *net = sock_net(ep->base.sk);
+ struct sctp_ep_common *epb;
+ struct sctp_hashbucket *head;
+
+ epb = &ep->base;
+
+ epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
+ head = &sctp_ep_hashtable[epb->hashent];
+
+ write_lock(&head->lock);
+ hlist_add_head(&epb->node, &head->chain);
+ write_unlock(&head->lock);
+}
+
+/* Add an endpoint to the hash. Local BH-safe. */
+void sctp_hash_endpoint(struct sctp_endpoint *ep)
+{
+ local_bh_disable();
+ __sctp_hash_endpoint(ep);
+ local_bh_enable();
+}
+
+/* Remove endpoint from the hash table. */
+static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
+{
+ struct net *net = sock_net(ep->base.sk);
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+
+ epb = &ep->base;
+
+ epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
+
+ head = &sctp_ep_hashtable[epb->hashent];
+
+ write_lock(&head->lock);
+ hlist_del_init(&epb->node);
+ write_unlock(&head->lock);
+}
+
+/* Remove endpoint from the hash. Local BH-safe. */
+void sctp_unhash_endpoint(struct sctp_endpoint *ep)
+{
+ local_bh_disable();
+ __sctp_unhash_endpoint(ep);
+ local_bh_enable();
+}
+
+/* Look up an endpoint. */
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
+ const union sctp_addr *laddr)
+{
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ struct sctp_endpoint *ep;
+ int hash;
+
+ hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
+ head = &sctp_ep_hashtable[hash];
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ ep = sctp_ep(epb);
+ if (sctp_endpoint_is_match(ep, net, laddr))
+ goto hit;
+ }
+
+ ep = sctp_sk(net->sctp.ctl_sock)->ep;
+
+hit:
+ sctp_endpoint_hold(ep);
+ read_unlock(&head->lock);
+ return ep;
+}
+
+/* Insert association into the hash table. */
+static void __sctp_hash_established(struct sctp_association *asoc)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ struct sctp_ep_common *epb;
+ struct sctp_hashbucket *head;
+
+ epb = &asoc->base;
+
+ /* Calculate which chain this entry will belong to. */
+ epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port,
+ asoc->peer.port);
+
+ head = &sctp_assoc_hashtable[epb->hashent];
+
+ write_lock(&head->lock);
+ hlist_add_head(&epb->node, &head->chain);
+ write_unlock(&head->lock);
+}
+
+/* Add an association to the hash. Local BH-safe. */
+void sctp_hash_established(struct sctp_association *asoc)
+{
+ if (asoc->temp)
+ return;
+
+ local_bh_disable();
+ __sctp_hash_established(asoc);
+ local_bh_enable();
+}
+
+/* Remove association from the hash table. */
+static void __sctp_unhash_established(struct sctp_association *asoc)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+
+ epb = &asoc->base;
+
+ epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port,
+ asoc->peer.port);
+
+ head = &sctp_assoc_hashtable[epb->hashent];
+
+ write_lock(&head->lock);
+ hlist_del_init(&epb->node);
+ write_unlock(&head->lock);
+}
+
+/* Remove association from the hash table. Local BH-safe. */
+void sctp_unhash_established(struct sctp_association *asoc)
+{
+ if (asoc->temp)
+ return;
+
+ local_bh_disable();
+ __sctp_unhash_established(asoc);
+ local_bh_enable();
+}
+
+/* Look up an association. */
+static struct sctp_association *__sctp_lookup_association(
+ struct net *net,
+ const union sctp_addr *local,
+ const union sctp_addr *peer,
+ struct sctp_transport **pt)
+{
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ struct sctp_association *asoc;
+ struct sctp_transport *transport;
+ int hash;
+
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+ hash = sctp_assoc_hashfn(net, ntohs(local->v4.sin_port),
+ ntohs(peer->v4.sin_port));
+ head = &sctp_assoc_hashtable[hash];
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ asoc = sctp_assoc(epb);
+ transport = sctp_assoc_is_match(asoc, net, local, peer);
+ if (transport)
+ goto hit;
+ }
+
+ read_unlock(&head->lock);
+
+ return NULL;
+
+hit:
+ *pt = transport;
+ sctp_association_hold(asoc);
+ read_unlock(&head->lock);
+ return asoc;
+}
+
+/* Look up an association. BH-safe. */
+static
+struct sctp_association *sctp_lookup_association(struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr,
+ struct sctp_transport **transportp)
+{
+ struct sctp_association *asoc;
+
+ local_bh_disable();
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+ local_bh_enable();
+
+ return asoc;
+}
+
+/* Is there an association matching the given local and peer addresses? */
+int sctp_has_association(struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr)
+{
+ struct sctp_association *asoc;
+ struct sctp_transport *transport;
+
+ if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) {
+ sctp_association_put(asoc);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * SCTP Implementors Guide, 2.18 Handling of address
+ * parameters within the INIT or INIT-ACK.
+ *
+ * D) When searching for a matching TCB upon reception of an INIT
+ * or INIT-ACK chunk the receiver SHOULD use not only the
+ * source address of the packet (containing the INIT or
+ * INIT-ACK) but the receiver SHOULD also use all valid
+ * address parameters contained within the chunk.
+ *
+ * 2.18.3 Solution description
+ *
+ * This new text clearly specifies to an implementor the need
+ * to look within the INIT or INIT-ACK. Any implementation that
+ * does not do this, may not be able to establish associations
+ * in certain circumstances.
+ *
+ */
+static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
+ struct sk_buff *skb,
+ const union sctp_addr *laddr, struct sctp_transport **transportp)
+{
+ struct sctp_association *asoc;
+ union sctp_addr addr;
+ union sctp_addr *paddr = &addr;
+ struct sctphdr *sh = sctp_hdr(skb);
+ union sctp_params params;
+ sctp_init_chunk_t *init;
+ struct sctp_transport *transport;
+ struct sctp_af *af;
+
+ /*
+ * This code will NOT touch anything inside the chunk--it is
+ * strictly READ-ONLY.
+ *
+ * RFC 2960 3 SCTP packet Format
+ *
+ * Multiple chunks can be bundled into one SCTP packet up to
+ * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN
+ * COMPLETE chunks. These chunks MUST NOT be bundled with any
+ * other chunk in a packet. See Section 6.10 for more details
+ * on chunk bundling.
+ */
+
+ /* Find the start of the TLVs and the end of the chunk. This is
+ * the region we search for address parameters.
+ */
+ init = (sctp_init_chunk_t *)skb->data;
+
+ /* Walk the parameters looking for embedded addresses. */
+ sctp_walk_params(params, init, init_hdr.params) {
+
+ /* Note: Ignoring hostname addresses. */
+ af = sctp_get_af_specific(param_type2af(params.p->type));
+ if (!af)
+ continue;
+
+ af->from_addr_param(paddr, params.addr, sh->source, 0);
+
+ asoc = __sctp_lookup_association(net, laddr, paddr, &transport);
+ if (asoc)
+ return asoc;
+ }
+
+ return NULL;
+}
+
+/* ADD-IP, Section 5.2
+ * When an endpoint receives an ASCONF Chunk from the remote peer
+ * special procedures may be needed to identify the association the
+ * ASCONF Chunk is associated with. To properly find the association
+ * the following procedures SHOULD be followed:
+ *
+ * D2) If the association is not found, use the address found in the
+ * Address Parameter TLV combined with the port number found in the
+ * SCTP common header. If found proceed to rule D4.
+ *
+ * D2-ext) If more than one ASCONF Chunks are packed together, use the
+ * address found in the ASCONF Address Parameter TLV of each of the
+ * subsequent ASCONF Chunks. If found, proceed to rule D4.
+ */
+static struct sctp_association *__sctp_rcv_asconf_lookup(
+ struct net *net,
+ sctp_chunkhdr_t *ch,
+ const union sctp_addr *laddr,
+ __be16 peer_port,
+ struct sctp_transport **transportp)
+{
+ sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch;
+ struct sctp_af *af;
+ union sctp_addr_param *param;
+ union sctp_addr paddr;
+
+ /* Skip over the ADDIP header and find the Address parameter */
+ param = (union sctp_addr_param *)(asconf + 1);
+
+ af = sctp_get_af_specific(param_type2af(param->p.type));
+ if (unlikely(!af))
+ return NULL;
+
+ af->from_addr_param(&paddr, param, peer_port, 0);
+
+ return __sctp_lookup_association(net, laddr, &paddr, transportp);
+}
+
+
+/* SCTP-AUTH, Section 6.3:
+* If the receiver does not find a STCB for a packet containing an AUTH
+* chunk as the first chunk and not a COOKIE-ECHO chunk as the second
+* chunk, it MUST use the chunks after the AUTH chunk to look up an existing
+* association.
+*
+* This means that any chunks that can help us identify the association need
+* to be looked at to find this association.
+*/
+static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
+ struct sk_buff *skb,
+ const union sctp_addr *laddr,
+ struct sctp_transport **transportp)
+{
+ struct sctp_association *asoc = NULL;
+ sctp_chunkhdr_t *ch;
+ int have_auth = 0;
+ unsigned int chunk_num = 1;
+ __u8 *ch_end;
+
+ /* Walk through the chunks looking for AUTH or ASCONF chunks
+ * to help us find the association.
+ */
+ ch = (sctp_chunkhdr_t *) skb->data;
+ do {
+ /* Break out if chunk length is less then minimal. */
+ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+ break;
+
+ ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ if (ch_end > skb_tail_pointer(skb))
+ break;
+
+ switch (ch->type) {
+ case SCTP_CID_AUTH:
+ have_auth = chunk_num;
+ break;
+
+ case SCTP_CID_COOKIE_ECHO:
+ /* If a packet arrives containing an AUTH chunk as
+ * a first chunk, a COOKIE-ECHO chunk as the second
+ * chunk, and possibly more chunks after them, and
+ * the receiver does not have an STCB for that
+ * packet, then authentication is based on
+ * the contents of the COOKIE- ECHO chunk.
+ */
+ if (have_auth == 1 && chunk_num == 2)
+ return NULL;
+ break;
+
+ case SCTP_CID_ASCONF:
+ if (have_auth || net->sctp.addip_noauth)
+ asoc = __sctp_rcv_asconf_lookup(
+ net, ch, laddr,
+ sctp_hdr(skb)->source,
+ transportp);
+ default:
+ break;
+ }
+
+ if (asoc)
+ break;
+
+ ch = (sctp_chunkhdr_t *) ch_end;
+ chunk_num++;
+ } while (ch_end < skb_tail_pointer(skb));
+
+ return asoc;
+}
+
+/*
+ * There are circumstances when we need to look inside the SCTP packet
+ * for information to help us find the association. Examples
+ * include looking inside of INIT/INIT-ACK chunks or after the AUTH
+ * chunks.
+ */
+static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
+ struct sk_buff *skb,
+ const union sctp_addr *laddr,
+ struct sctp_transport **transportp)
+{
+ sctp_chunkhdr_t *ch;
+
+ ch = (sctp_chunkhdr_t *) skb->data;
+
+ /* The code below will attempt to walk the chunk and extract
+ * parameter information. Before we do that, we need to verify
+ * that the chunk length doesn't cause overflow. Otherwise, we'll
+ * walk off the end.
+ */
+ if (WORD_ROUND(ntohs(ch->length)) > skb->len)
+ return NULL;
+
+ /* If this is INIT/INIT-ACK look inside the chunk too. */
+ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK)
+ return __sctp_rcv_init_lookup(net, skb, laddr, transportp);
+
+ return __sctp_rcv_walk_lookup(net, skb, laddr, transportp);
+}
+
+/* Lookup an association for an inbound skb. */
+static struct sctp_association *__sctp_rcv_lookup(struct net *net,
+ struct sk_buff *skb,
+ const union sctp_addr *paddr,
+ const union sctp_addr *laddr,
+ struct sctp_transport **transportp)
+{
+ struct sctp_association *asoc;
+
+ asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
+
+ /* Further lookup for INIT/INIT-ACK packets.
+ * SCTP Implementors Guide, 2.18 Handling of address
+ * parameters within the INIT or INIT-ACK.
+ */
+ if (!asoc)
+ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp);
+
+ return asoc;
+}
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
new file mode 100644
index 000000000..7e8a16c77
--- /dev/null
+++ b/net/sctp/inqueue.c
@@ -0,0 +1,214 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2002 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions are the methods for accessing the SCTP inqueue.
+ *
+ * An SCTP inqueue is a queue into which you push SCTP packets
+ * (which might be bundles or fragments of chunks) and out of which you
+ * pop SCTP whole chunks.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+/* Initialize an SCTP inqueue. */
+void sctp_inq_init(struct sctp_inq *queue)
+{
+ INIT_LIST_HEAD(&queue->in_chunk_list);
+ queue->in_progress = NULL;
+
+ /* Create a task for delivering data. */
+ INIT_WORK(&queue->immediate, NULL);
+}
+
+/* Release the memory associated with an SCTP inqueue. */
+void sctp_inq_free(struct sctp_inq *queue)
+{
+ struct sctp_chunk *chunk, *tmp;
+
+ /* Empty the queue. */
+ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
+ list_del_init(&chunk->list);
+ sctp_chunk_free(chunk);
+ }
+
+ /* If there is a packet which is currently being worked on,
+ * free it as well.
+ */
+ if (queue->in_progress) {
+ sctp_chunk_free(queue->in_progress);
+ queue->in_progress = NULL;
+ }
+}
+
+/* Put a new packet in an SCTP inqueue.
+ * We assume that packet->sctp_hdr is set and in host byte order.
+ */
+void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk)
+{
+ /* Directly call the packet handling routine. */
+ if (chunk->rcvr->dead) {
+ sctp_chunk_free(chunk);
+ return;
+ }
+
+ /* We are now calling this either from the soft interrupt
+ * or from the backlog processing.
+ * Eventually, we should clean up inqueue to not rely
+ * on the BH related data structures.
+ */
+ list_add_tail(&chunk->list, &q->in_chunk_list);
+ if (chunk->asoc)
+ chunk->asoc->stats.ipackets++;
+ q->immediate.func(&q->immediate);
+}
+
+/* Peek at the next chunk on the inqeue. */
+struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue)
+{
+ struct sctp_chunk *chunk;
+ sctp_chunkhdr_t *ch = NULL;
+
+ chunk = queue->in_progress;
+ /* If there is no more chunks in this packet, say so */
+ if (chunk->singleton ||
+ chunk->end_of_packet ||
+ chunk->pdiscard)
+ return NULL;
+
+ ch = (sctp_chunkhdr_t *)chunk->chunk_end;
+
+ return ch;
+}
+
+
+/* Extract a chunk from an SCTP inqueue.
+ *
+ * WARNING: If you need to put the chunk on another queue, you need to
+ * make a shallow copy (clone) of it.
+ */
+struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
+{
+ struct sctp_chunk *chunk;
+ sctp_chunkhdr_t *ch = NULL;
+
+ /* The assumption is that we are safe to process the chunks
+ * at this time.
+ */
+
+ if ((chunk = queue->in_progress)) {
+ /* There is a packet that we have been working on.
+ * Any post processing work to do before we move on?
+ */
+ if (chunk->singleton ||
+ chunk->end_of_packet ||
+ chunk->pdiscard) {
+ sctp_chunk_free(chunk);
+ chunk = queue->in_progress = NULL;
+ } else {
+ /* Nothing to do. Next chunk in the packet, please. */
+ ch = (sctp_chunkhdr_t *) chunk->chunk_end;
+ /* Force chunk->skb->data to chunk->chunk_end. */
+ skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data);
+ /* We are guaranteed to pull a SCTP header. */
+ }
+ }
+
+ /* Do we need to take the next packet out of the queue to process? */
+ if (!chunk) {
+ struct list_head *entry;
+
+ /* Is the queue empty? */
+ if (list_empty(&queue->in_chunk_list))
+ return NULL;
+
+ entry = queue->in_chunk_list.next;
+ chunk = queue->in_progress =
+ list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
+
+ /* This is the first chunk in the packet. */
+ chunk->singleton = 1;
+ ch = (sctp_chunkhdr_t *) chunk->skb->data;
+ chunk->data_accepted = 0;
+ }
+
+ chunk->chunk_hdr = ch;
+ chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ /* In the unlikely case of an IP reassembly, the skb could be
+ * non-linear. If so, update chunk_end so that it doesn't go past
+ * the skb->tail.
+ */
+ if (unlikely(skb_is_nonlinear(chunk->skb))) {
+ if (chunk->chunk_end > skb_tail_pointer(chunk->skb))
+ chunk->chunk_end = skb_tail_pointer(chunk->skb);
+ }
+ skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
+ chunk->subh.v = NULL; /* Subheader is no longer valid. */
+
+ if (chunk->chunk_end + sizeof(sctp_chunkhdr_t) <
+ skb_tail_pointer(chunk->skb)) {
+ /* This is not a singleton */
+ chunk->singleton = 0;
+ } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) {
+ /* Discard inside state machine. */
+ chunk->pdiscard = 1;
+ chunk->chunk_end = skb_tail_pointer(chunk->skb);
+ } else {
+ /* We are at the end of the packet, so mark the chunk
+ * in case we need to send a SACK.
+ */
+ chunk->end_of_packet = 1;
+ }
+
+ pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n",
+ chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+ ntohs(chunk->chunk_hdr->length), chunk->skb->len);
+
+ return chunk;
+}
+
+/* Set a top-half handler.
+ *
+ * Originally, we the top-half handler was scheduled as a BH. We now
+ * call the handler directly in sctp_inq_push() at a time that
+ * we know we are lock safe.
+ * The intent is that this routine will pull stuff out of the
+ * inqueue and process it.
+ */
+void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback)
+{
+ INIT_WORK(&q->immediate, callback);
+}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
new file mode 100644
index 000000000..0e4198ee2
--- /dev/null
+++ b/net/sctp/ipv6.c
@@ -0,0 +1,1076 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2002, 2004
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ * Copyright (c) 2002-2003 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * SCTP over IPv6.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Le Yanqun <yanqun.le@nokia.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ *
+ * Based on:
+ * linux/net/ipv6/tcp_ipv6.c
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/ipsec.h>
+#include <linux/slab.h>
+
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/inet_common.h>
+#include <net/inet_ecn.h>
+#include <net/sctp/sctp.h>
+
+#include <asm/uaccess.h>
+
+static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
+ union sctp_addr *s2);
+static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
+ __be16 port);
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2);
+
+/* Event handler for inet6 address addition/deletion events.
+ * The sctp_local_addr_list needs to be protocted by a spin lock since
+ * multiple notifiers (say IPv4 and IPv6) may be running at the same
+ * time and thus corrupt the list.
+ * The reader side is protected with RCU.
+ */
+static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
+ void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sctp_sockaddr_entry *addr = NULL;
+ struct sctp_sockaddr_entry *temp;
+ struct net *net = dev_net(ifa->idev->dev);
+ int found = 0;
+
+ switch (ev) {
+ case NETDEV_UP:
+ addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
+ if (addr) {
+ addr->a.v6.sin6_family = AF_INET6;
+ addr->a.v6.sin6_port = 0;
+ addr->a.v6.sin6_addr = ifa->addr;
+ addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex;
+ addr->valid = 1;
+ spin_lock_bh(&net->sctp.local_addr_lock);
+ list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list);
+ sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW);
+ spin_unlock_bh(&net->sctp.local_addr_lock);
+ }
+ break;
+ case NETDEV_DOWN:
+ spin_lock_bh(&net->sctp.local_addr_lock);
+ list_for_each_entry_safe(addr, temp,
+ &net->sctp.local_addr_list, list) {
+ if (addr->a.sa.sa_family == AF_INET6 &&
+ ipv6_addr_equal(&addr->a.v6.sin6_addr,
+ &ifa->addr)) {
+ sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL);
+ found = 1;
+ addr->valid = 0;
+ list_del_rcu(&addr->list);
+ break;
+ }
+ }
+ spin_unlock_bh(&net->sctp.local_addr_lock);
+ if (found)
+ kfree_rcu(addr, rcu);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block sctp_inet6addr_notifier = {
+ .notifier_call = sctp_inet6addr_event,
+};
+
+/* ICMP error handler. */
+static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct inet6_dev *idev;
+ struct sock *sk;
+ struct sctp_association *asoc;
+ struct sctp_transport *transport;
+ struct ipv6_pinfo *np;
+ __u16 saveip, savesctp;
+ int err;
+ struct net *net = dev_net(skb->dev);
+
+ idev = in6_dev_get(skb->dev);
+
+ /* Fix up skb to look at the embedded net header. */
+ saveip = skb->network_header;
+ savesctp = skb->transport_header;
+ skb_reset_network_header(skb);
+ skb_set_transport_header(skb, offset);
+ sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport);
+ /* Put back, the original pointers. */
+ skb->network_header = saveip;
+ skb->transport_header = savesctp;
+ if (!sk) {
+ ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_INERRORS);
+ goto out;
+ }
+
+ /* Warning: The sock lock is held. Remember to call
+ * sctp_err_finish!
+ */
+
+ switch (type) {
+ case ICMPV6_PKT_TOOBIG:
+ if (ip6_sk_accept_pmtu(sk))
+ sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info));
+ goto out_unlock;
+ case ICMPV6_PARAMPROB:
+ if (ICMPV6_UNK_NEXTHDR == code) {
+ sctp_icmp_proto_unreachable(sk, asoc, transport);
+ goto out_unlock;
+ }
+ break;
+ case NDISC_REDIRECT:
+ sctp_icmp_redirect(sk, transport, skb);
+ goto out_unlock;
+ default:
+ break;
+ }
+
+ np = inet6_sk(sk);
+ icmpv6_err_convert(type, code, &err);
+ if (!sock_owned_by_user(sk) && np->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else { /* Only an error on timeout */
+ sk->sk_err_soft = err;
+ }
+
+out_unlock:
+ sctp_err_finish(sk, asoc);
+out:
+ if (likely(idev != NULL))
+ in6_dev_put(idev);
+}
+
+static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
+{
+ struct sock *sk = skb->sk;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct flowi6 *fl6 = &transport->fl.u.ip6;
+
+ pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
+ skb->len, &fl6->saddr, &fl6->daddr);
+
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+
+ if (!(transport->param_flags & SPP_PMTUD_ENABLE))
+ skb->ignore_df = 1;
+
+ SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
+
+ return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+}
+
+/* Returns the dst cache entry for the given source and destination ip
+ * addresses.
+ */
+static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+ struct flowi *fl, struct sock *sk)
+{
+ struct sctp_association *asoc = t->asoc;
+ struct dst_entry *dst = NULL;
+ struct flowi6 *fl6 = &fl->u.ip6;
+ struct sctp_bind_addr *bp;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sctp_sockaddr_entry *laddr;
+ union sctp_addr *baddr = NULL;
+ union sctp_addr *daddr = &t->ipaddr;
+ union sctp_addr dst_saddr;
+ struct in6_addr *final_p, final;
+ __u8 matchlen = 0;
+ __u8 bmatchlen;
+ sctp_scope_t scope;
+
+ memset(fl6, 0, sizeof(struct flowi6));
+ fl6->daddr = daddr->v6.sin6_addr;
+ fl6->fl6_dport = daddr->v6.sin6_port;
+ fl6->flowi6_proto = IPPROTO_SCTP;
+ if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ fl6->flowi6_oif = daddr->v6.sin6_scope_id;
+
+ pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
+
+ if (asoc)
+ fl6->fl6_sport = htons(asoc->base.bind_addr.port);
+
+ if (saddr) {
+ fl6->saddr = saddr->v6.sin6_addr;
+ fl6->fl6_sport = saddr->v6.sin6_port;
+
+ pr_debug("src=%pI6 - ", &fl6->saddr);
+ }
+
+ final_p = fl6_update_dst(fl6, np->opt, &final);
+ dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ if (!asoc || saddr)
+ goto out;
+
+ bp = &asoc->base.bind_addr;
+ scope = sctp_scope(daddr);
+ /* ip6_dst_lookup has filled in the fl6->saddr for us. Check
+ * to see if we can use it.
+ */
+ if (!IS_ERR(dst)) {
+ /* Walk through the bind address list and look for a bind
+ * address that matches the source address of the returned dst.
+ */
+ sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port));
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid || laddr->state == SCTP_ADDR_DEL ||
+ (laddr->state != SCTP_ADDR_SRC &&
+ !asoc->src_out_of_asoc_ok))
+ continue;
+
+ /* Do not compare against v4 addrs */
+ if ((laddr->a.sa.sa_family == AF_INET6) &&
+ (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) {
+ rcu_read_unlock();
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ /* None of the bound addresses match the source address of the
+ * dst. So release it.
+ */
+ dst_release(dst);
+ dst = NULL;
+ }
+
+ /* Walk through the bind address list and try to get the
+ * best source address for a given destination.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid)
+ continue;
+ if ((laddr->state == SCTP_ADDR_SRC) &&
+ (laddr->a.sa.sa_family == AF_INET6) &&
+ (scope <= sctp_scope(&laddr->a))) {
+ bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+ if (!baddr || (matchlen < bmatchlen)) {
+ baddr = &laddr->a;
+ matchlen = bmatchlen;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ if (baddr) {
+ fl6->saddr = baddr->v6.sin6_addr;
+ fl6->fl6_sport = baddr->v6.sin6_port;
+ final_p = fl6_update_dst(fl6, np->opt, &final);
+ dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ }
+
+out:
+ if (!IS_ERR_OR_NULL(dst)) {
+ struct rt6_info *rt;
+
+ rt = (struct rt6_info *)dst;
+ t->dst = dst;
+ t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ pr_debug("rt6_dst:%pI6 rt6_src:%pI6\n", &rt->rt6i_dst.addr,
+ &fl6->saddr);
+ } else {
+ t->dst = NULL;
+
+ pr_debug("no route\n");
+ }
+}
+
+/* Returns the number of consecutive initial bits that match in the 2 ipv6
+ * addresses.
+ */
+static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
+ union sctp_addr *s2)
+{
+ return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr);
+}
+
+/* Fills in the source address(saddr) based on the destination address(daddr)
+ * and asoc's bind address list.
+ */
+static void sctp_v6_get_saddr(struct sctp_sock *sk,
+ struct sctp_transport *t,
+ struct flowi *fl)
+{
+ struct flowi6 *fl6 = &fl->u.ip6;
+ union sctp_addr *saddr = &t->saddr;
+
+ pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst);
+
+ if (t->dst) {
+ saddr->v6.sin6_family = AF_INET6;
+ saddr->v6.sin6_addr = fl6->saddr;
+ }
+}
+
+/* Make a copy of all potential local addresses. */
+static void sctp_v6_copy_addrlist(struct list_head *addrlist,
+ struct net_device *dev)
+{
+ struct inet6_dev *in6_dev;
+ struct inet6_ifaddr *ifp;
+ struct sctp_sockaddr_entry *addr;
+
+ rcu_read_lock();
+ if ((in6_dev = __in6_dev_get(dev)) == NULL) {
+ rcu_read_unlock();
+ return;
+ }
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ /* Add the address to the local list. */
+ addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+ if (addr) {
+ addr->a.v6.sin6_family = AF_INET6;
+ addr->a.v6.sin6_port = 0;
+ addr->a.v6.sin6_addr = ifp->addr;
+ addr->a.v6.sin6_scope_id = dev->ifindex;
+ addr->valid = 1;
+ INIT_LIST_HEAD(&addr->list);
+ list_add_tail(&addr->list, addrlist);
+ }
+ }
+
+ read_unlock_bh(&in6_dev->lock);
+ rcu_read_unlock();
+}
+
+/* Initialize a sockaddr_storage from in incoming skb. */
+static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
+ int is_saddr)
+{
+ __be16 *port;
+ struct sctphdr *sh;
+
+ port = &addr->v6.sin6_port;
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_flowinfo = 0; /* FIXME */
+ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
+
+ sh = sctp_hdr(skb);
+ if (is_saddr) {
+ *port = sh->source;
+ addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
+ } else {
+ *port = sh->dest;
+ addr->v6.sin6_addr = ipv6_hdr(skb)->daddr;
+ }
+}
+
+/* Initialize an sctp_addr from a socket. */
+static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk)
+{
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_port = 0;
+ addr->v6.sin6_addr = sk->sk_v6_rcv_saddr;
+}
+
+/* Initialize sk->sk_rcv_saddr from sctp_addr. */
+static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
+{
+ if (addr->sa.sa_family == AF_INET) {
+ sk->sk_v6_rcv_saddr.s6_addr32[0] = 0;
+ sk->sk_v6_rcv_saddr.s6_addr32[1] = 0;
+ sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff);
+ sk->sk_v6_rcv_saddr.s6_addr32[3] =
+ addr->v4.sin_addr.s_addr;
+ } else {
+ sk->sk_v6_rcv_saddr = addr->v6.sin6_addr;
+ }
+}
+
+/* Initialize sk->sk_daddr from sctp_addr. */
+static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
+{
+ if (addr->sa.sa_family == AF_INET) {
+ sk->sk_v6_daddr.s6_addr32[0] = 0;
+ sk->sk_v6_daddr.s6_addr32[1] = 0;
+ sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff);
+ sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
+ } else {
+ sk->sk_v6_daddr = addr->v6.sin6_addr;
+ }
+}
+
+/* Initialize a sctp_addr from an address parameter. */
+static void sctp_v6_from_addr_param(union sctp_addr *addr,
+ union sctp_addr_param *param,
+ __be16 port, int iif)
+{
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_port = port;
+ addr->v6.sin6_flowinfo = 0; /* BUG */
+ addr->v6.sin6_addr = param->v6.addr;
+ addr->v6.sin6_scope_id = iif;
+}
+
+/* Initialize an address parameter from a sctp_addr and return the length
+ * of the address parameter.
+ */
+static int sctp_v6_to_addr_param(const union sctp_addr *addr,
+ union sctp_addr_param *param)
+{
+ int length = sizeof(sctp_ipv6addr_param_t);
+
+ param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS;
+ param->v6.param_hdr.length = htons(length);
+ param->v6.addr = addr->v6.sin6_addr;
+
+ return length;
+}
+
+/* Initialize a sctp_addr from struct in6_addr. */
+static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
+ __be16 port)
+{
+ addr->sa.sa_family = AF_INET6;
+ addr->v6.sin6_port = port;
+ addr->v6.sin6_addr = *saddr;
+}
+
+/* Compare addresses exactly.
+ * v4-mapped-v6 is also in consideration.
+ */
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
+{
+ if (addr1->sa.sa_family != addr2->sa.sa_family) {
+ if (addr1->sa.sa_family == AF_INET &&
+ addr2->sa.sa_family == AF_INET6 &&
+ ipv6_addr_v4mapped(&addr2->v6.sin6_addr)) {
+ if (addr2->v6.sin6_port == addr1->v4.sin_port &&
+ addr2->v6.sin6_addr.s6_addr32[3] ==
+ addr1->v4.sin_addr.s_addr)
+ return 1;
+ }
+ if (addr2->sa.sa_family == AF_INET &&
+ addr1->sa.sa_family == AF_INET6 &&
+ ipv6_addr_v4mapped(&addr1->v6.sin6_addr)) {
+ if (addr1->v6.sin6_port == addr2->v4.sin_port &&
+ addr1->v6.sin6_addr.s6_addr32[3] ==
+ addr2->v4.sin_addr.s_addr)
+ return 1;
+ }
+ return 0;
+ }
+ if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
+ return 0;
+ /* If this is a linklocal address, compare the scope_id. */
+ if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+ if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
+ (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/* Initialize addr struct to INADDR_ANY. */
+static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port)
+{
+ memset(addr, 0x00, sizeof(union sctp_addr));
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_port = port;
+}
+
+/* Is this a wildcard address? */
+static int sctp_v6_is_any(const union sctp_addr *addr)
+{
+ return ipv6_addr_any(&addr->v6.sin6_addr);
+}
+
+/* Should this be available for binding? */
+static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
+{
+ int type;
+ const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr;
+
+ type = ipv6_addr_type(in6);
+ if (IPV6_ADDR_ANY == type)
+ return 1;
+ if (type == IPV6_ADDR_MAPPED) {
+ if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
+ return 0;
+ sctp_v6_map_v4(addr);
+ return sctp_get_af_specific(AF_INET)->available(addr, sp);
+ }
+ if (!(type & IPV6_ADDR_UNICAST))
+ return 0;
+
+ return ipv6_chk_addr(sock_net(&sp->inet.sk), in6, NULL, 0);
+}
+
+/* This function checks if the address is a valid address to be used for
+ * SCTP.
+ *
+ * Output:
+ * Return 0 - If the address is a non-unicast or an illegal address.
+ * Return 1 - If the address is a unicast.
+ */
+static int sctp_v6_addr_valid(union sctp_addr *addr,
+ struct sctp_sock *sp,
+ const struct sk_buff *skb)
+{
+ int ret = ipv6_addr_type(&addr->v6.sin6_addr);
+
+ /* Support v4-mapped-v6 address. */
+ if (ret == IPV6_ADDR_MAPPED) {
+ /* Note: This routine is used in input, so v4-mapped-v6
+ * are disallowed here when there is no sctp_sock.
+ */
+ if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
+ return 0;
+ sctp_v6_map_v4(addr);
+ return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
+ }
+
+ /* Is this a non-unicast address */
+ if (!(ret & IPV6_ADDR_UNICAST))
+ return 0;
+
+ return 1;
+}
+
+/* What is the scope of 'addr'? */
+static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
+{
+ int v6scope;
+ sctp_scope_t retval;
+
+ /* The IPv6 scope is really a set of bit fields.
+ * See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope.
+ */
+
+ v6scope = ipv6_addr_scope(&addr->v6.sin6_addr);
+ switch (v6scope) {
+ case IFA_HOST:
+ retval = SCTP_SCOPE_LOOPBACK;
+ break;
+ case IFA_LINK:
+ retval = SCTP_SCOPE_LINK;
+ break;
+ case IFA_SITE:
+ retval = SCTP_SCOPE_PRIVATE;
+ break;
+ default:
+ retval = SCTP_SCOPE_GLOBAL;
+ break;
+ }
+
+ return retval;
+}
+
+/* Create and initialize a new sk for the socket to be returned by accept(). */
+static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
+ struct sctp_association *asoc)
+{
+ struct sock *newsk;
+ struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+ struct sctp6_sock *newsctp6sk;
+
+ newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot);
+ if (!newsk)
+ goto out;
+
+ sock_init_data(NULL, newsk);
+
+ sctp_copy_sock(newsk, sk, asoc);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ newsctp6sk = (struct sctp6_sock *)newsk;
+ inet_sk(newsk)->pinet6 = &newsctp6sk->inet6;
+
+ sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped;
+
+ newnp = inet6_sk(newsk);
+
+ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+ /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
+ * and getpeername().
+ */
+ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk);
+
+ newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+
+ sk_refcnt_debug_inc(newsk);
+
+ if (newsk->sk_prot->init(newsk)) {
+ sk_common_release(newsk);
+ newsk = NULL;
+ }
+
+out:
+ return newsk;
+}
+
+/* Format a sockaddr for return to user space. This makes sure the return is
+ * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option.
+ */
+static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
+{
+ if (sp->v4mapped) {
+ if (addr->sa.sa_family == AF_INET)
+ sctp_v4_map_v6(addr);
+ } else {
+ if (addr->sa.sa_family == AF_INET6 &&
+ ipv6_addr_v4mapped(&addr->v6.sin6_addr))
+ sctp_v6_map_v4(addr);
+ }
+
+ if (addr->sa.sa_family == AF_INET)
+ return sizeof(struct sockaddr_in);
+ return sizeof(struct sockaddr_in6);
+}
+
+/* Where did this skb come from? */
+static int sctp_v6_skb_iif(const struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
+ return opt->iif;
+}
+
+/* Was this packet marked by Explicit Congestion Notification? */
+static int sctp_v6_is_ce(const struct sk_buff *skb)
+{
+ return *((__u32 *)(ipv6_hdr(skb))) & htonl(1 << 20);
+}
+
+/* Dump the v6 addr to the seq file. */
+static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
+{
+ seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr);
+}
+
+static void sctp_v6_ecn_capable(struct sock *sk)
+{
+ inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
+}
+
+/* Initialize a PF_INET msgname from a ulpevent. */
+static void sctp_inet6_event_msgname(struct sctp_ulpevent *event,
+ char *msgname, int *addrlen)
+{
+ union sctp_addr *addr;
+ struct sctp_association *asoc;
+ union sctp_addr *paddr;
+
+ if (!msgname)
+ return;
+
+ addr = (union sctp_addr *)msgname;
+ asoc = event->asoc;
+ paddr = &asoc->peer.primary_addr;
+
+ if (paddr->sa.sa_family == AF_INET) {
+ addr->v4.sin_family = AF_INET;
+ addr->v4.sin_port = htons(asoc->peer.port);
+ addr->v4.sin_addr = paddr->v4.sin_addr;
+ } else {
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_flowinfo = 0;
+ if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id;
+ else
+ addr->v6.sin6_scope_id = 0;
+ addr->v6.sin6_port = htons(asoc->peer.port);
+ addr->v6.sin6_addr = paddr->v6.sin6_addr;
+ }
+
+ *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr);
+}
+
+/* Initialize a msg_name from an inbound skb. */
+static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname,
+ int *addr_len)
+{
+ union sctp_addr *addr;
+ struct sctphdr *sh;
+
+ if (!msgname)
+ return;
+
+ addr = (union sctp_addr *)msgname;
+ sh = sctp_hdr(skb);
+
+ if (ip_hdr(skb)->version == 4) {
+ addr->v4.sin_family = AF_INET;
+ addr->v4.sin_port = sh->source;
+ addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr;
+ } else {
+ addr->v6.sin6_family = AF_INET6;
+ addr->v6.sin6_flowinfo = 0;
+ addr->v6.sin6_port = sh->source;
+ addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
+ if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+ struct sctp_ulpevent *ev = sctp_skb2event(skb);
+ addr->v6.sin6_scope_id = ev->iif;
+ }
+ }
+
+ *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr);
+}
+
+/* Do we support this AF? */
+static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp)
+{
+ switch (family) {
+ case AF_INET6:
+ return 1;
+ /* v4-mapped-v6 addresses */
+ case AF_INET:
+ if (!__ipv6_only_sock(sctp_opt2sk(sp)))
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* Address matching with wildcards allowed. This extra level
+ * of indirection lets us choose whether a PF_INET6 should
+ * disallow any v4 addresses if we so choose.
+ */
+static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2,
+ struct sctp_sock *opt)
+{
+ struct sctp_af *af1, *af2;
+ struct sock *sk = sctp_opt2sk(opt);
+
+ af1 = sctp_get_af_specific(addr1->sa.sa_family);
+ af2 = sctp_get_af_specific(addr2->sa.sa_family);
+
+ if (!af1 || !af2)
+ return 0;
+
+ /* If the socket is IPv6 only, v4 addrs will not match */
+ if (__ipv6_only_sock(sk) && af1 != af2)
+ return 0;
+
+ /* Today, wildcard AF_INET/AF_INET6. */
+ if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
+ return 1;
+
+ if (addr1->sa.sa_family != addr2->sa.sa_family)
+ return 0;
+
+ return af1->cmp_addr(addr1, addr2);
+}
+
+/* Verify that the provided sockaddr looks bindable. Common verification,
+ * has already been taken care of.
+ */
+static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+ struct sctp_af *af;
+
+ /* ASSERT: address family has already been verified. */
+ if (addr->sa.sa_family != AF_INET6)
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ else {
+ int type = ipv6_addr_type(&addr->v6.sin6_addr);
+ struct net_device *dev;
+
+ if (type & IPV6_ADDR_LINKLOCAL) {
+ struct net *net;
+ if (!addr->v6.sin6_scope_id)
+ return 0;
+ net = sock_net(&opt->inet.sk);
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id);
+ if (!dev ||
+ !ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ }
+
+ af = opt->pf->af;
+ }
+ return af->available(addr, opt);
+}
+
+/* Verify that the provided sockaddr looks sendable. Common verification,
+ * has already been taken care of.
+ */
+static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+ struct sctp_af *af = NULL;
+
+ /* ASSERT: address family has already been verified. */
+ if (addr->sa.sa_family != AF_INET6)
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ else {
+ int type = ipv6_addr_type(&addr->v6.sin6_addr);
+ struct net_device *dev;
+
+ if (type & IPV6_ADDR_LINKLOCAL) {
+ if (!addr->v6.sin6_scope_id)
+ return 0;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk),
+ addr->v6.sin6_scope_id);
+ rcu_read_unlock();
+ if (!dev)
+ return 0;
+ }
+ af = opt->pf->af;
+ }
+
+ return af != NULL;
+}
+
+/* Fill in Supported Address Type information for INIT and INIT-ACK
+ * chunks. Note: In the future, we may want to look at sock options
+ * to determine whether a PF_INET6 socket really wants to have IPV4
+ * addresses.
+ * Returns number of addresses supported.
+ */
+static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
+ __be16 *types)
+{
+ types[0] = SCTP_PARAM_IPV6_ADDRESS;
+ if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) {
+ types[1] = SCTP_PARAM_IPV4_ADDRESS;
+ return 2;
+ }
+ return 1;
+}
+
+/* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */
+static int sctp_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ int rc;
+
+ rc = inet6_getname(sock, uaddr, uaddr_len, peer);
+
+ if (rc != 0)
+ return rc;
+
+ *uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk),
+ (union sctp_addr *)uaddr);
+
+ return rc;
+}
+
+static const struct proto_ops inet6_seqpacket_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = sctp_getname,
+ .poll = sctp_poll,
+ .ioctl = inet6_ioctl,
+ .listen = sctp_inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw sctpv6_seqpacket_protosw = {
+ .type = SOCK_SEQPACKET,
+ .protocol = IPPROTO_SCTP,
+ .prot = &sctpv6_prot,
+ .ops = &inet6_seqpacket_ops,
+ .flags = SCTP_PROTOSW_FLAG
+};
+static struct inet_protosw sctpv6_stream_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_SCTP,
+ .prot = &sctpv6_prot,
+ .ops = &inet6_seqpacket_ops,
+ .flags = SCTP_PROTOSW_FLAG,
+};
+
+static int sctp6_rcv(struct sk_buff *skb)
+{
+ return sctp_rcv(skb) ? -1 : 0;
+}
+
+static const struct inet6_protocol sctpv6_protocol = {
+ .handler = sctp6_rcv,
+ .err_handler = sctp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static struct sctp_af sctp_af_inet6 = {
+ .sa_family = AF_INET6,
+ .sctp_xmit = sctp_v6_xmit,
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .get_dst = sctp_v6_get_dst,
+ .get_saddr = sctp_v6_get_saddr,
+ .copy_addrlist = sctp_v6_copy_addrlist,
+ .from_skb = sctp_v6_from_skb,
+ .from_sk = sctp_v6_from_sk,
+ .from_addr_param = sctp_v6_from_addr_param,
+ .to_addr_param = sctp_v6_to_addr_param,
+ .cmp_addr = sctp_v6_cmp_addr,
+ .scope = sctp_v6_scope,
+ .addr_valid = sctp_v6_addr_valid,
+ .inaddr_any = sctp_v6_inaddr_any,
+ .is_any = sctp_v6_is_any,
+ .available = sctp_v6_available,
+ .skb_iif = sctp_v6_skb_iif,
+ .is_ce = sctp_v6_is_ce,
+ .seq_dump_addr = sctp_v6_seq_dump_addr,
+ .ecn_capable = sctp_v6_ecn_capable,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+static struct sctp_pf sctp_pf_inet6 = {
+ .event_msgname = sctp_inet6_event_msgname,
+ .skb_msgname = sctp_inet6_skb_msgname,
+ .af_supported = sctp_inet6_af_supported,
+ .cmp_addr = sctp_inet6_cmp_addr,
+ .bind_verify = sctp_inet6_bind_verify,
+ .send_verify = sctp_inet6_send_verify,
+ .supported_addrs = sctp_inet6_supported_addrs,
+ .create_accept_sk = sctp_v6_create_accept_sk,
+ .addr_to_user = sctp_v6_addr_to_user,
+ .to_sk_saddr = sctp_v6_to_sk_saddr,
+ .to_sk_daddr = sctp_v6_to_sk_daddr,
+ .af = &sctp_af_inet6,
+};
+
+/* Initialize IPv6 support and register with socket layer. */
+void sctp_v6_pf_init(void)
+{
+ /* Register the SCTP specific PF_INET6 functions. */
+ sctp_register_pf(&sctp_pf_inet6, PF_INET6);
+
+ /* Register the SCTP specific AF_INET6 functions. */
+ sctp_register_af(&sctp_af_inet6);
+}
+
+void sctp_v6_pf_exit(void)
+{
+ list_del(&sctp_af_inet6.list);
+}
+
+/* Initialize IPv6 support and register with socket layer. */
+int sctp_v6_protosw_init(void)
+{
+ int rc;
+
+ rc = proto_register(&sctpv6_prot, 1);
+ if (rc)
+ return rc;
+
+ /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */
+ inet6_register_protosw(&sctpv6_seqpacket_protosw);
+ inet6_register_protosw(&sctpv6_stream_protosw);
+
+ return 0;
+}
+
+void sctp_v6_protosw_exit(void)
+{
+ inet6_unregister_protosw(&sctpv6_seqpacket_protosw);
+ inet6_unregister_protosw(&sctpv6_stream_protosw);
+ proto_unregister(&sctpv6_prot);
+}
+
+
+/* Register with inet6 layer. */
+int sctp_v6_add_protocol(void)
+{
+ /* Register notifier for inet6 address additions/deletions. */
+ register_inet6addr_notifier(&sctp_inet6addr_notifier);
+
+ if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0)
+ return -EAGAIN;
+
+ return 0;
+}
+
+/* Unregister with inet6 layer. */
+void sctp_v6_del_protocol(void)
+{
+ inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP);
+ unregister_inet6addr_notifier(&sctp_inet6addr_notifier);
+}
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
new file mode 100644
index 000000000..40e7fac96
--- /dev/null
+++ b/net/sctp/objcnt.c
@@ -0,0 +1,142 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Support for memory object debugging. This allows one to monitor the
+ * object allocations/deallocations for types instrumented for this
+ * via the proc fs.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Jon Grimm <jgrimm@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <net/sctp/sctp.h>
+
+/*
+ * Global counters to count raw object allocation counts.
+ * To add new counters, choose a unique suffix for the variable
+ * name as the helper macros key off this suffix to make
+ * life easier for the programmer.
+ */
+
+SCTP_DBG_OBJCNT(sock);
+SCTP_DBG_OBJCNT(ep);
+SCTP_DBG_OBJCNT(transport);
+SCTP_DBG_OBJCNT(assoc);
+SCTP_DBG_OBJCNT(bind_addr);
+SCTP_DBG_OBJCNT(bind_bucket);
+SCTP_DBG_OBJCNT(chunk);
+SCTP_DBG_OBJCNT(addr);
+SCTP_DBG_OBJCNT(ssnmap);
+SCTP_DBG_OBJCNT(datamsg);
+SCTP_DBG_OBJCNT(keys);
+
+/* An array to make it easy to pretty print the debug information
+ * to the proc fs.
+ */
+static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
+ SCTP_DBG_OBJCNT_ENTRY(sock),
+ SCTP_DBG_OBJCNT_ENTRY(ep),
+ SCTP_DBG_OBJCNT_ENTRY(assoc),
+ SCTP_DBG_OBJCNT_ENTRY(transport),
+ SCTP_DBG_OBJCNT_ENTRY(chunk),
+ SCTP_DBG_OBJCNT_ENTRY(bind_addr),
+ SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
+ SCTP_DBG_OBJCNT_ENTRY(addr),
+ SCTP_DBG_OBJCNT_ENTRY(ssnmap),
+ SCTP_DBG_OBJCNT_ENTRY(datamsg),
+ SCTP_DBG_OBJCNT_ENTRY(keys),
+};
+
+/* Callback from procfs to read out objcount information.
+ * Walk through the entries in the sctp_dbg_objcnt array, dumping
+ * the raw object counts for each monitored type.
+ */
+static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ i = (int)*(loff_t *)v;
+ seq_setwidth(seq, 127);
+ seq_printf(seq, "%s: %d", sctp_dbg_objcnt[i].label,
+ atomic_read(sctp_dbg_objcnt[i].counter));
+ seq_pad(seq, '\n');
+ return 0;
+}
+
+static void *sctp_objcnt_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos;
+}
+
+static void sctp_objcnt_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void *sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos;
+}
+
+static const struct seq_operations sctp_objcnt_seq_ops = {
+ .start = sctp_objcnt_seq_start,
+ .next = sctp_objcnt_seq_next,
+ .stop = sctp_objcnt_seq_stop,
+ .show = sctp_objcnt_seq_show,
+};
+
+static int sctp_objcnt_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &sctp_objcnt_seq_ops);
+}
+
+static const struct file_operations sctp_objcnt_ops = {
+ .open = sctp_objcnt_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* Initialize the objcount in the proc filesystem. */
+void sctp_dbg_objcnt_init(struct net *net)
+{
+ struct proc_dir_entry *ent;
+
+ ent = proc_create("sctp_dbg_objcnt", 0,
+ net->sctp.proc_net_sctp, &sctp_objcnt_ops);
+ if (!ent)
+ pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
+}
+
+/* Cleanup the objcount entry in the proc filesystem. */
+void sctp_dbg_objcnt_exit(struct net *net)
+{
+ remove_proc_entry("sctp_dbg_objcnt", net->sctp.proc_net_sctp);
+}
+
+
diff --git a/net/sctp/output.c b/net/sctp/output.c
new file mode 100644
index 000000000..abe7c2db2
--- /dev/null
+++ b/net/sctp/output.c
@@ -0,0 +1,788 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions handle output processing.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@austin.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/net_namespace.h>
+
+#include <linux/socket.h> /* for sa_family_t */
+#include <net/sock.h>
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/checksum.h>
+
+/* Forward declarations for private helpers. */
+static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
+ struct sctp_chunk *chunk);
+static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
+ struct sctp_chunk *chunk);
+static void sctp_packet_append_data(struct sctp_packet *packet,
+ struct sctp_chunk *chunk);
+static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
+ struct sctp_chunk *chunk,
+ u16 chunk_len);
+
+static void sctp_packet_reset(struct sctp_packet *packet)
+{
+ packet->size = packet->overhead;
+ packet->has_cookie_echo = 0;
+ packet->has_sack = 0;
+ packet->has_data = 0;
+ packet->has_auth = 0;
+ packet->ipfragok = 0;
+ packet->auth = NULL;
+}
+
+/* Config a packet.
+ * This appears to be a followup set of initializations.
+ */
+struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
+ __u32 vtag, int ecn_capable)
+{
+ struct sctp_chunk *chunk = NULL;
+
+ pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
+
+ packet->vtag = vtag;
+
+ if (ecn_capable && sctp_packet_empty(packet)) {
+ chunk = sctp_get_ecne_prepend(packet->transport->asoc);
+
+ /* If there a is a prepend chunk stick it on the list before
+ * any other chunks get appended.
+ */
+ if (chunk)
+ sctp_packet_append_chunk(packet, chunk);
+ }
+
+ return packet;
+}
+
+/* Initialize the packet structure. */
+struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
+ struct sctp_transport *transport,
+ __u16 sport, __u16 dport)
+{
+ struct sctp_association *asoc = transport->asoc;
+ size_t overhead;
+
+ pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport);
+
+ packet->transport = transport;
+ packet->source_port = sport;
+ packet->destination_port = dport;
+ INIT_LIST_HEAD(&packet->chunk_list);
+ if (asoc) {
+ struct sctp_sock *sp = sctp_sk(asoc->base.sk);
+ overhead = sp->pf->af->net_header_len;
+ } else {
+ overhead = sizeof(struct ipv6hdr);
+ }
+ overhead += sizeof(struct sctphdr);
+ packet->overhead = overhead;
+ sctp_packet_reset(packet);
+ packet->vtag = 0;
+
+ return packet;
+}
+
+/* Free a packet. */
+void sctp_packet_free(struct sctp_packet *packet)
+{
+ struct sctp_chunk *chunk, *tmp;
+
+ pr_debug("%s: packet:%p\n", __func__, packet);
+
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
+ sctp_chunk_free(chunk);
+ }
+}
+
+/* This routine tries to append the chunk to the offered packet. If adding
+ * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk
+ * is not present in the packet, it transmits the input packet.
+ * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long
+ * as it can fit in the packet, but any more data that does not fit in this
+ * packet can be sent only after receiving the COOKIE_ACK.
+ */
+sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
+ struct sctp_chunk *chunk,
+ int one_packet)
+{
+ sctp_xmit_t retval;
+ int error = 0;
+
+ pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk);
+
+ switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
+ case SCTP_XMIT_PMTU_FULL:
+ if (!packet->has_cookie_echo) {
+ error = sctp_packet_transmit(packet);
+ if (error < 0)
+ chunk->skb->sk->sk_err = -error;
+
+ /* If we have an empty packet, then we can NOT ever
+ * return PMTU_FULL.
+ */
+ if (!one_packet)
+ retval = sctp_packet_append_chunk(packet,
+ chunk);
+ }
+ break;
+
+ case SCTP_XMIT_RWND_FULL:
+ case SCTP_XMIT_OK:
+ case SCTP_XMIT_DELAY:
+ break;
+ }
+
+ return retval;
+}
+
+/* Try to bundle an auth chunk into the packet. */
+static sctp_xmit_t sctp_packet_bundle_auth(struct sctp_packet *pkt,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_association *asoc = pkt->transport->asoc;
+ struct sctp_chunk *auth;
+ sctp_xmit_t retval = SCTP_XMIT_OK;
+
+ /* if we don't have an association, we can't do authentication */
+ if (!asoc)
+ return retval;
+
+ /* See if this is an auth chunk we are bundling or if
+ * auth is already bundled.
+ */
+ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth)
+ return retval;
+
+ /* if the peer did not request this chunk to be authenticated,
+ * don't do it
+ */
+ if (!chunk->auth)
+ return retval;
+
+ auth = sctp_make_auth(asoc);
+ if (!auth)
+ return retval;
+
+ retval = __sctp_packet_append_chunk(pkt, auth);
+
+ if (retval != SCTP_XMIT_OK)
+ sctp_chunk_free(auth);
+
+ return retval;
+}
+
+/* Try to bundle a SACK with the packet. */
+static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt,
+ struct sctp_chunk *chunk)
+{
+ sctp_xmit_t retval = SCTP_XMIT_OK;
+
+ /* If sending DATA and haven't aleady bundled a SACK, try to
+ * bundle one in to the packet.
+ */
+ if (sctp_chunk_is_data(chunk) && !pkt->has_sack &&
+ !pkt->has_cookie_echo) {
+ struct sctp_association *asoc;
+ struct timer_list *timer;
+ asoc = pkt->transport->asoc;
+ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
+
+ /* If the SACK timer is running, we have a pending SACK */
+ if (timer_pending(timer)) {
+ struct sctp_chunk *sack;
+
+ if (pkt->transport->sack_generation !=
+ pkt->transport->asoc->peer.sack_generation)
+ return retval;
+
+ asoc->a_rwnd = asoc->rwnd;
+ sack = sctp_make_sack(asoc);
+ if (sack) {
+ retval = __sctp_packet_append_chunk(pkt, sack);
+ if (retval != SCTP_XMIT_OK) {
+ sctp_chunk_free(sack);
+ goto out;
+ }
+ asoc->peer.sack_needed = 0;
+ if (del_timer(timer))
+ sctp_association_put(asoc);
+ }
+ }
+ }
+out:
+ return retval;
+}
+
+
+/* Append a chunk to the offered packet reporting back any inability to do
+ * so.
+ */
+static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet,
+ struct sctp_chunk *chunk)
+{
+ sctp_xmit_t retval = SCTP_XMIT_OK;
+ __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length));
+
+ /* Check to see if this chunk will fit into the packet */
+ retval = sctp_packet_will_fit(packet, chunk, chunk_len);
+ if (retval != SCTP_XMIT_OK)
+ goto finish;
+
+ /* We believe that this chunk is OK to add to the packet */
+ switch (chunk->chunk_hdr->type) {
+ case SCTP_CID_DATA:
+ /* Account for the data being in the packet */
+ sctp_packet_append_data(packet, chunk);
+ /* Disallow SACK bundling after DATA. */
+ packet->has_sack = 1;
+ /* Disallow AUTH bundling after DATA */
+ packet->has_auth = 1;
+ /* Let it be knows that packet has DATA in it */
+ packet->has_data = 1;
+ /* timestamp the chunk for rtx purposes */
+ chunk->sent_at = jiffies;
+ break;
+ case SCTP_CID_COOKIE_ECHO:
+ packet->has_cookie_echo = 1;
+ break;
+
+ case SCTP_CID_SACK:
+ packet->has_sack = 1;
+ if (chunk->asoc)
+ chunk->asoc->stats.osacks++;
+ break;
+
+ case SCTP_CID_AUTH:
+ packet->has_auth = 1;
+ packet->auth = chunk;
+ break;
+ }
+
+ /* It is OK to send this chunk. */
+ list_add_tail(&chunk->list, &packet->chunk_list);
+ packet->size += chunk_len;
+ chunk->transport = packet->transport;
+finish:
+ return retval;
+}
+
+/* Append a chunk to the offered packet reporting back any inability to do
+ * so.
+ */
+sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
+ struct sctp_chunk *chunk)
+{
+ sctp_xmit_t retval = SCTP_XMIT_OK;
+
+ pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk);
+
+ /* Data chunks are special. Before seeing what else we can
+ * bundle into this packet, check to see if we are allowed to
+ * send this DATA.
+ */
+ if (sctp_chunk_is_data(chunk)) {
+ retval = sctp_packet_can_append_data(packet, chunk);
+ if (retval != SCTP_XMIT_OK)
+ goto finish;
+ }
+
+ /* Try to bundle AUTH chunk */
+ retval = sctp_packet_bundle_auth(packet, chunk);
+ if (retval != SCTP_XMIT_OK)
+ goto finish;
+
+ /* Try to bundle SACK chunk */
+ retval = sctp_packet_bundle_sack(packet, chunk);
+ if (retval != SCTP_XMIT_OK)
+ goto finish;
+
+ retval = __sctp_packet_append_chunk(packet, chunk);
+
+finish:
+ return retval;
+}
+
+static void sctp_packet_release_owner(struct sk_buff *skb)
+{
+ sk_free(skb->sk);
+}
+
+static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sctp_packet_release_owner;
+
+ /*
+ * The data chunks have already been accounted for in sctp_sendmsg(),
+ * therefore only reserve a single byte to keep socket around until
+ * the packet has been transmitted.
+ */
+ atomic_inc(&sk->sk_wmem_alloc);
+}
+
+/* All packets are sent to the network through this function from
+ * sctp_outq_tail().
+ *
+ * The return value is a normal kernel error return value.
+ */
+int sctp_packet_transmit(struct sctp_packet *packet)
+{
+ struct sctp_transport *tp = packet->transport;
+ struct sctp_association *asoc = tp->asoc;
+ struct sctphdr *sh;
+ struct sk_buff *nskb;
+ struct sctp_chunk *chunk, *tmp;
+ struct sock *sk;
+ int err = 0;
+ int padding; /* How much padding do we need? */
+ __u8 has_data = 0;
+ struct dst_entry *dst;
+ unsigned char *auth = NULL; /* pointer to auth in skb data */
+
+ pr_debug("%s: packet:%p\n", __func__, packet);
+
+ /* Do NOT generate a chunkless packet. */
+ if (list_empty(&packet->chunk_list))
+ return err;
+
+ /* Set up convenience variables... */
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+ sk = chunk->skb->sk;
+
+ /* Allocate the new skb. */
+ nskb = alloc_skb(packet->size + MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ goto nomem;
+
+ /* Make sure the outbound skb has enough header room reserved. */
+ skb_reserve(nskb, packet->overhead + MAX_HEADER);
+
+ /* Set the owning socket so that we know where to get the
+ * destination IP address.
+ */
+ sctp_packet_set_owner_w(nskb, sk);
+
+ if (!sctp_transport_dst_check(tp)) {
+ sctp_transport_route(tp, NULL, sctp_sk(sk));
+ if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
+ sctp_assoc_sync_pmtu(sk, asoc);
+ }
+ }
+ dst = dst_clone(tp->dst);
+ if (!dst)
+ goto no_route;
+ skb_dst_set(nskb, dst);
+
+ /* Build the SCTP header. */
+ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
+ skb_reset_transport_header(nskb);
+ sh->source = htons(packet->source_port);
+ sh->dest = htons(packet->destination_port);
+
+ /* From 6.8 Adler-32 Checksum Calculation:
+ * After the packet is constructed (containing the SCTP common
+ * header and one or more control or DATA chunks), the
+ * transmitter shall:
+ *
+ * 1) Fill in the proper Verification Tag in the SCTP common
+ * header and initialize the checksum field to 0's.
+ */
+ sh->vtag = htonl(packet->vtag);
+ sh->checksum = 0;
+
+ /**
+ * 6.10 Bundling
+ *
+ * An endpoint bundles chunks by simply including multiple
+ * chunks in one outbound SCTP packet. ...
+ */
+
+ /**
+ * 3.2 Chunk Field Descriptions
+ *
+ * The total length of a chunk (including Type, Length and
+ * Value fields) MUST be a multiple of 4 bytes. If the length
+ * of the chunk is not a multiple of 4 bytes, the sender MUST
+ * pad the chunk with all zero bytes and this padding is not
+ * included in the chunk length field. The sender should
+ * never pad with more than 3 bytes.
+ *
+ * [This whole comment explains WORD_ROUND() below.]
+ */
+
+ pr_debug("***sctp_transmit_packet***\n");
+
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
+ if (sctp_chunk_is_data(chunk)) {
+ /* 6.3.1 C4) When data is in flight and when allowed
+ * by rule C5, a new RTT measurement MUST be made each
+ * round trip. Furthermore, new RTT measurements
+ * SHOULD be made no more than once per round-trip
+ * for a given destination transport address.
+ */
+
+ if (!chunk->resent && !tp->rto_pending) {
+ chunk->rtt_in_progress = 1;
+ tp->rto_pending = 1;
+ }
+
+ has_data = 1;
+ }
+
+ padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+ if (padding)
+ memset(skb_put(chunk->skb, padding), 0, padding);
+
+ /* if this is the auth chunk that we are adding,
+ * store pointer where it will be added and put
+ * the auth into the packet.
+ */
+ if (chunk == packet->auth)
+ auth = skb_tail_pointer(nskb);
+
+ memcpy(skb_put(nskb, chunk->skb->len),
+ chunk->skb->data, chunk->skb->len);
+
+ pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, "
+ "rtt_in_progress:%d\n", chunk,
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+ chunk->has_tsn ? "TSN" : "No TSN",
+ chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
+ ntohs(chunk->chunk_hdr->length), chunk->skb->len,
+ chunk->rtt_in_progress);
+
+ /*
+ * If this is a control chunk, this is our last
+ * reference. Free data chunks after they've been
+ * acknowledged or have failed.
+ */
+ if (!sctp_chunk_is_data(chunk))
+ sctp_chunk_free(chunk);
+ }
+
+ /* SCTP-AUTH, Section 6.2
+ * The sender MUST calculate the MAC as described in RFC2104 [2]
+ * using the hash function H as described by the MAC Identifier and
+ * the shared association key K based on the endpoint pair shared key
+ * described by the shared key identifier. The 'data' used for the
+ * computation of the AUTH-chunk is given by the AUTH chunk with its
+ * HMAC field set to zero (as shown in Figure 6) followed by all
+ * chunks that are placed after the AUTH chunk in the SCTP packet.
+ */
+ if (auth)
+ sctp_auth_calculate_hmac(asoc, nskb,
+ (struct sctp_auth_chunk *)auth,
+ GFP_ATOMIC);
+
+ /* 2) Calculate the Adler-32 checksum of the whole packet,
+ * including the SCTP common header and all the
+ * chunks.
+ *
+ * Note: Adler-32 is no longer applicable, as has been replaced
+ * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
+ */
+ if (!sctp_checksum_disable) {
+ if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
+ (dst_xfrm(dst) != NULL) || packet->ipfragok) {
+ sh->checksum = sctp_compute_cksum(nskb, 0);
+ } else {
+ /* no need to seed pseudo checksum for SCTP */
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = skb_transport_header(nskb) - nskb->head;
+ nskb->csum_offset = offsetof(struct sctphdr, checksum);
+ }
+ }
+
+ /* IP layer ECN support
+ * From RFC 2481
+ * "The ECN-Capable Transport (ECT) bit would be set by the
+ * data sender to indicate that the end-points of the
+ * transport protocol are ECN-capable."
+ *
+ * Now setting the ECT bit all the time, as it should not cause
+ * any problems protocol-wise even if our peer ignores it.
+ *
+ * Note: The works for IPv6 layer checks this bit too later
+ * in transmission. See IP6_ECN_flow_xmit().
+ */
+ tp->af_specific->ecn_capable(nskb->sk);
+
+ /* Set up the IP options. */
+ /* BUG: not implemented
+ * For v4 this all lives somewhere in sk->sk_opt...
+ */
+
+ /* Dump that on IP! */
+ if (asoc) {
+ asoc->stats.opackets++;
+ if (asoc->peer.last_sent_to != tp)
+ /* Considering the multiple CPU scenario, this is a
+ * "correcter" place for last_sent_to. --xguo
+ */
+ asoc->peer.last_sent_to = tp;
+ }
+
+ if (has_data) {
+ struct timer_list *timer;
+ unsigned long timeout;
+
+ /* Restart the AUTOCLOSE timer when sending data. */
+ if (sctp_state(asoc, ESTABLISHED) &&
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
+ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+ timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+
+ if (!mod_timer(timer, jiffies + timeout))
+ sctp_association_hold(asoc);
+ }
+ }
+
+ pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len);
+
+ nskb->ignore_df = packet->ipfragok;
+ tp->af_specific->sctp_xmit(nskb, tp);
+
+out:
+ sctp_packet_reset(packet);
+ return err;
+no_route:
+ kfree_skb(nskb);
+
+ if (asoc)
+ IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES);
+
+ /* FIXME: Returning the 'err' will effect all the associations
+ * associated with a socket, although only one of the paths of the
+ * association is unreachable.
+ * The real failure of a transport or association can be passed on
+ * to the user via notifications. So setting this error may not be
+ * required.
+ */
+ /* err = -EHOSTUNREACH; */
+err:
+ /* Control chunks are unreliable so just drop them. DATA chunks
+ * will get resent or dropped later.
+ */
+
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
+ if (!sctp_chunk_is_data(chunk))
+ sctp_chunk_free(chunk);
+ }
+ goto out;
+nomem:
+ err = -ENOMEM;
+ goto err;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* This private function check to see if a chunk can be added */
+static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
+ struct sctp_chunk *chunk)
+{
+ size_t datasize, rwnd, inflight, flight_size;
+ struct sctp_transport *transport = packet->transport;
+ struct sctp_association *asoc = transport->asoc;
+ struct sctp_outq *q = &asoc->outqueue;
+
+ /* RFC 2960 6.1 Transmission of DATA Chunks
+ *
+ * A) At any given time, the data sender MUST NOT transmit new data to
+ * any destination transport address if its peer's rwnd indicates
+ * that the peer has no buffer space (i.e. rwnd is 0, see Section
+ * 6.2.1). However, regardless of the value of rwnd (including if it
+ * is 0), the data sender can always have one DATA chunk in flight to
+ * the receiver if allowed by cwnd (see rule B below). This rule
+ * allows the sender to probe for a change in rwnd that the sender
+ * missed due to the SACK having been lost in transit from the data
+ * receiver to the data sender.
+ */
+
+ rwnd = asoc->peer.rwnd;
+ inflight = q->outstanding_bytes;
+ flight_size = transport->flight_size;
+
+ datasize = sctp_data_size(chunk);
+
+ if (datasize > rwnd && inflight > 0)
+ /* We have (at least) one data chunk in flight,
+ * so we can't fall back to rule 6.1 B).
+ */
+ return SCTP_XMIT_RWND_FULL;
+
+ /* RFC 2960 6.1 Transmission of DATA Chunks
+ *
+ * B) At any given time, the sender MUST NOT transmit new data
+ * to a given transport address if it has cwnd or more bytes
+ * of data outstanding to that transport address.
+ */
+ /* RFC 7.2.4 & the Implementers Guide 2.8.
+ *
+ * 3) ...
+ * When a Fast Retransmit is being performed the sender SHOULD
+ * ignore the value of cwnd and SHOULD NOT delay retransmission.
+ */
+ if (chunk->fast_retransmit != SCTP_NEED_FRTX &&
+ flight_size >= transport->cwnd)
+ return SCTP_XMIT_RWND_FULL;
+
+ /* Nagle's algorithm to solve small-packet problem:
+ * Inhibit the sending of new chunks when new outgoing data arrives
+ * if any previously transmitted data on the connection remains
+ * unacknowledged.
+ */
+
+ if (sctp_sk(asoc->base.sk)->nodelay)
+ /* Nagle disabled */
+ return SCTP_XMIT_OK;
+
+ if (!sctp_packet_empty(packet))
+ /* Append to packet */
+ return SCTP_XMIT_OK;
+
+ if (inflight == 0)
+ /* Nothing unacked */
+ return SCTP_XMIT_OK;
+
+ if (!sctp_state(asoc, ESTABLISHED))
+ return SCTP_XMIT_OK;
+
+ /* Check whether this chunk and all the rest of pending data will fit
+ * or delay in hopes of bundling a full sized packet.
+ */
+ if (chunk->skb->len + q->out_qlen >= transport->pathmtu - packet->overhead)
+ /* Enough data queued to fill a packet */
+ return SCTP_XMIT_OK;
+
+ /* Don't delay large message writes that may have been fragmented */
+ if (!chunk->msg->can_delay)
+ return SCTP_XMIT_OK;
+
+ /* Defer until all data acked or packet full */
+ return SCTP_XMIT_DELAY;
+}
+
+/* This private function does management things when adding DATA chunk */
+static void sctp_packet_append_data(struct sctp_packet *packet,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_transport *transport = packet->transport;
+ size_t datasize = sctp_data_size(chunk);
+ struct sctp_association *asoc = transport->asoc;
+ u32 rwnd = asoc->peer.rwnd;
+
+ /* Keep track of how many bytes are in flight over this transport. */
+ transport->flight_size += datasize;
+
+ /* Keep track of how many bytes are in flight to the receiver. */
+ asoc->outqueue.outstanding_bytes += datasize;
+
+ /* Update our view of the receiver's rwnd. */
+ if (datasize < rwnd)
+ rwnd -= datasize;
+ else
+ rwnd = 0;
+
+ asoc->peer.rwnd = rwnd;
+ /* Has been accepted for transmission. */
+ if (!asoc->peer.prsctp_capable)
+ chunk->msg->can_abandon = 0;
+ sctp_chunk_assign_tsn(chunk);
+ sctp_chunk_assign_ssn(chunk);
+}
+
+static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
+ struct sctp_chunk *chunk,
+ u16 chunk_len)
+{
+ size_t psize;
+ size_t pmtu;
+ int too_big;
+ sctp_xmit_t retval = SCTP_XMIT_OK;
+
+ psize = packet->size;
+ pmtu = ((packet->transport->asoc) ?
+ (packet->transport->asoc->pathmtu) :
+ (packet->transport->pathmtu));
+
+ too_big = (psize + chunk_len > pmtu);
+
+ /* Decide if we need to fragment or resubmit later. */
+ if (too_big) {
+ /* It's OK to fragmet at IP level if any one of the following
+ * is true:
+ * 1. The packet is empty (meaning this chunk is greater
+ * the MTU)
+ * 2. The chunk we are adding is a control chunk
+ * 3. The packet doesn't have any data in it yet and data
+ * requires authentication.
+ */
+ if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) ||
+ (!packet->has_data && chunk->auth)) {
+ /* We no longer do re-fragmentation.
+ * Just fragment at the IP layer, if we
+ * actually hit this condition
+ */
+ packet->ipfragok = 1;
+ } else {
+ retval = SCTP_XMIT_PMTU_FULL;
+ }
+ }
+
+ return retval;
+}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
new file mode 100644
index 000000000..7e8f0a117
--- /dev/null
+++ b/net/sctp/outqueue.c
@@ -0,0 +1,1787 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement the sctp_outq class. The outqueue handles
+ * bundling and queueing of outgoing SCTP chunks.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Perry Melange <pmelange@null.cc.uic.edu>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/list.h> /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <net/sock.h> /* For skb_set_owner_w */
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Declare internal functions here. */
+static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
+static void sctp_check_transmitted(struct sctp_outq *q,
+ struct list_head *transmitted_queue,
+ struct sctp_transport *transport,
+ union sctp_addr *saddr,
+ struct sctp_sackhdr *sack,
+ __u32 *highest_new_tsn);
+
+static void sctp_mark_missing(struct sctp_outq *q,
+ struct list_head *transmitted_queue,
+ struct sctp_transport *transport,
+ __u32 highest_new_tsn,
+ int count_of_newacks);
+
+static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
+
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout);
+
+/* Add data to the front of the queue. */
+static inline void sctp_outq_head_data(struct sctp_outq *q,
+ struct sctp_chunk *ch)
+{
+ list_add(&ch->list, &q->out_chunk_list);
+ q->out_qlen += ch->skb->len;
+}
+
+/* Take data from the front of the queue. */
+static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
+{
+ struct sctp_chunk *ch = NULL;
+
+ if (!list_empty(&q->out_chunk_list)) {
+ struct list_head *entry = q->out_chunk_list.next;
+
+ ch = list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
+ q->out_qlen -= ch->skb->len;
+ }
+ return ch;
+}
+/* Add data chunk to the end of the queue. */
+static inline void sctp_outq_tail_data(struct sctp_outq *q,
+ struct sctp_chunk *ch)
+{
+ list_add_tail(&ch->list, &q->out_chunk_list);
+ q->out_qlen += ch->skb->len;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * D) If count_of_newacks is greater than or equal to 2
+ * and t was not sent to the current primary then the
+ * sender MUST NOT increment missing report count for t.
+ */
+static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary,
+ struct sctp_transport *transport,
+ int count_of_newacks)
+{
+ if (count_of_newacks >= 2 && transport != primary)
+ return 1;
+ return 0;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * F) If count_of_newacks is less than 2, let d be the
+ * destination to which t was sent. If cacc_saw_newack
+ * is 0 for destination d, then the sender MUST NOT
+ * increment missing report count for t.
+ */
+static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport,
+ int count_of_newacks)
+{
+ if (count_of_newacks < 2 &&
+ (transport && !transport->cacc.cacc_saw_newack))
+ return 1;
+ return 0;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD
+ * execute steps C, D, F.
+ *
+ * C has been implemented in sctp_outq_sack
+ */
+static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary,
+ struct sctp_transport *transport,
+ int count_of_newacks)
+{
+ if (!primary->cacc.cycling_changeover) {
+ if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks))
+ return 1;
+ if (sctp_cacc_skip_3_1_f(transport, count_of_newacks))
+ return 1;
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less
+ * than next_tsn_at_change of the current primary, then
+ * the sender MUST NOT increment missing report count
+ * for t.
+ */
+static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn)
+{
+ if (primary->cacc.cycling_changeover &&
+ TSN_lt(tsn, primary->cacc.next_tsn_at_change))
+ return 1;
+ return 0;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3) If the missing report count for TSN t is to be
+ * incremented according to [RFC2960] and
+ * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set,
+ * then the sender MUST further execute steps 3.1 and
+ * 3.2 to determine if the missing report count for
+ * TSN t SHOULD NOT be incremented.
+ *
+ * 3.3) If 3.1 and 3.2 do not dictate that the missing
+ * report count for t should not be incremented, then
+ * the sender SHOULD increment missing report count for
+ * t (according to [RFC2960] and [SCTP_STEWART_2002]).
+ */
+static inline int sctp_cacc_skip(struct sctp_transport *primary,
+ struct sctp_transport *transport,
+ int count_of_newacks,
+ __u32 tsn)
+{
+ if (primary->cacc.changeover_active &&
+ (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) ||
+ sctp_cacc_skip_3_2(primary, tsn)))
+ return 1;
+ return 0;
+}
+
+/* Initialize an existing sctp_outq. This does the boring stuff.
+ * You still need to define handlers if you really want to DO
+ * something with this structure...
+ */
+void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
+{
+ memset(q, 0, sizeof(struct sctp_outq));
+
+ q->asoc = asoc;
+ INIT_LIST_HEAD(&q->out_chunk_list);
+ INIT_LIST_HEAD(&q->control_chunk_list);
+ INIT_LIST_HEAD(&q->retransmit);
+ INIT_LIST_HEAD(&q->sacked);
+ INIT_LIST_HEAD(&q->abandoned);
+}
+
+/* Free the outqueue structure and any related pending chunks.
+ */
+static void __sctp_outq_teardown(struct sctp_outq *q)
+{
+ struct sctp_transport *transport;
+ struct list_head *lchunk, *temp;
+ struct sctp_chunk *chunk, *tmp;
+
+ /* Throw away unacknowledged chunks. */
+ list_for_each_entry(transport, &q->asoc->peer.transport_addr_list,
+ transports) {
+ while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) {
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ /* Mark as part of a failed message. */
+ sctp_chunk_fail(chunk, q->error);
+ sctp_chunk_free(chunk);
+ }
+ }
+
+ /* Throw away chunks that have been gap ACKed. */
+ list_for_each_safe(lchunk, temp, &q->sacked) {
+ list_del_init(lchunk);
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ sctp_chunk_fail(chunk, q->error);
+ sctp_chunk_free(chunk);
+ }
+
+ /* Throw away any chunks in the retransmit queue. */
+ list_for_each_safe(lchunk, temp, &q->retransmit) {
+ list_del_init(lchunk);
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ sctp_chunk_fail(chunk, q->error);
+ sctp_chunk_free(chunk);
+ }
+
+ /* Throw away any chunks that are in the abandoned queue. */
+ list_for_each_safe(lchunk, temp, &q->abandoned) {
+ list_del_init(lchunk);
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ sctp_chunk_fail(chunk, q->error);
+ sctp_chunk_free(chunk);
+ }
+
+ /* Throw away any leftover data chunks. */
+ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+
+ /* Mark as send failure. */
+ sctp_chunk_fail(chunk, q->error);
+ sctp_chunk_free(chunk);
+ }
+
+ /* Throw away any leftover control chunks. */
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ list_del_init(&chunk->list);
+ sctp_chunk_free(chunk);
+ }
+}
+
+void sctp_outq_teardown(struct sctp_outq *q)
+{
+ __sctp_outq_teardown(q);
+ sctp_outq_init(q->asoc, q);
+}
+
+/* Free the outqueue structure and any related pending chunks. */
+void sctp_outq_free(struct sctp_outq *q)
+{
+ /* Throw away leftover chunks. */
+ __sctp_outq_teardown(q);
+}
+
+/* Put a new chunk in an sctp_outq. */
+int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
+{
+ struct net *net = sock_net(q->asoc->base.sk);
+ int error = 0;
+
+ pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
+ chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk");
+
+ /* If it is data, queue it up, otherwise, send it
+ * immediately.
+ */
+ if (sctp_chunk_is_data(chunk)) {
+ /* Is it OK to queue data chunks? */
+ /* From 9. Termination of Association
+ *
+ * When either endpoint performs a shutdown, the
+ * association on each peer will stop accepting new
+ * data from its user and only deliver data in queue
+ * at the time of sending or receiving the SHUTDOWN
+ * chunk.
+ */
+ switch (q->asoc->state) {
+ case SCTP_STATE_CLOSED:
+ case SCTP_STATE_SHUTDOWN_PENDING:
+ case SCTP_STATE_SHUTDOWN_SENT:
+ case SCTP_STATE_SHUTDOWN_RECEIVED:
+ case SCTP_STATE_SHUTDOWN_ACK_SENT:
+ /* Cannot send after transport endpoint shutdown */
+ error = -ESHUTDOWN;
+ break;
+
+ default:
+ pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
+ __func__, q, chunk, chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk");
+
+ sctp_outq_tail_data(q, chunk);
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
+ else
+ SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
+ break;
+ }
+ } else {
+ list_add_tail(&chunk->list, &q->control_chunk_list);
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ }
+
+ if (error < 0)
+ return error;
+
+ if (!q->cork)
+ error = sctp_outq_flush(q, 0);
+
+ return error;
+}
+
+/* Insert a chunk into the sorted list based on the TSNs. The retransmit list
+ * and the abandoned list are in ascending order.
+ */
+static void sctp_insert_list(struct list_head *head, struct list_head *new)
+{
+ struct list_head *pos;
+ struct sctp_chunk *nchunk, *lchunk;
+ __u32 ntsn, ltsn;
+ int done = 0;
+
+ nchunk = list_entry(new, struct sctp_chunk, transmitted_list);
+ ntsn = ntohl(nchunk->subh.data_hdr->tsn);
+
+ list_for_each(pos, head) {
+ lchunk = list_entry(pos, struct sctp_chunk, transmitted_list);
+ ltsn = ntohl(lchunk->subh.data_hdr->tsn);
+ if (TSN_lt(ntsn, ltsn)) {
+ list_add(new, pos->prev);
+ done = 1;
+ break;
+ }
+ }
+ if (!done)
+ list_add_tail(new, head);
+}
+
+/* Mark all the eligible packets on a transport for retransmission. */
+void sctp_retransmit_mark(struct sctp_outq *q,
+ struct sctp_transport *transport,
+ __u8 reason)
+{
+ struct list_head *lchunk, *ltemp;
+ struct sctp_chunk *chunk;
+
+ /* Walk through the specified transmitted queue. */
+ list_for_each_safe(lchunk, ltemp, &transport->transmitted) {
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+
+ /* If the chunk is abandoned, move it to abandoned list. */
+ if (sctp_chunk_abandoned(chunk)) {
+ list_del_init(lchunk);
+ sctp_insert_list(&q->abandoned, lchunk);
+
+ /* If this chunk has not been previousely acked,
+ * stop considering it 'outstanding'. Our peer
+ * will most likely never see it since it will
+ * not be retransmitted
+ */
+ if (!chunk->tsn_gap_acked) {
+ if (chunk->transport)
+ chunk->transport->flight_size -=
+ sctp_data_size(chunk);
+ q->outstanding_bytes -= sctp_data_size(chunk);
+ q->asoc->peer.rwnd += sctp_data_size(chunk);
+ }
+ continue;
+ }
+
+ /* If we are doing retransmission due to a timeout or pmtu
+ * discovery, only the chunks that are not yet acked should
+ * be added to the retransmit queue.
+ */
+ if ((reason == SCTP_RTXR_FAST_RTX &&
+ (chunk->fast_retransmit == SCTP_NEED_FRTX)) ||
+ (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) {
+ /* RFC 2960 6.2.1 Processing a Received SACK
+ *
+ * C) Any time a DATA chunk is marked for
+ * retransmission (via either T3-rtx timer expiration
+ * (Section 6.3.3) or via fast retransmit
+ * (Section 7.2.4)), add the data size of those
+ * chunks to the rwnd.
+ */
+ q->asoc->peer.rwnd += sctp_data_size(chunk);
+ q->outstanding_bytes -= sctp_data_size(chunk);
+ if (chunk->transport)
+ transport->flight_size -= sctp_data_size(chunk);
+
+ /* sctpimpguide-05 Section 2.8.2
+ * M5) If a T3-rtx timer expires, the
+ * 'TSN.Missing.Report' of all affected TSNs is set
+ * to 0.
+ */
+ chunk->tsn_missing_report = 0;
+
+ /* If a chunk that is being used for RTT measurement
+ * has to be retransmitted, we cannot use this chunk
+ * anymore for RTT measurements. Reset rto_pending so
+ * that a new RTT measurement is started when a new
+ * data chunk is sent.
+ */
+ if (chunk->rtt_in_progress) {
+ chunk->rtt_in_progress = 0;
+ transport->rto_pending = 0;
+ }
+
+ chunk->resent = 1;
+
+ /* Move the chunk to the retransmit queue. The chunks
+ * on the retransmit queue are always kept in order.
+ */
+ list_del_init(lchunk);
+ sctp_insert_list(&q->retransmit, lchunk);
+ }
+ }
+
+ pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, "
+ "flight_size:%d, pba:%d\n", __func__, transport, reason,
+ transport->cwnd, transport->ssthresh, transport->flight_size,
+ transport->partial_bytes_acked);
+}
+
+/* Mark all the eligible packets on a transport for retransmission and force
+ * one packet out.
+ */
+void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
+ sctp_retransmit_reason_t reason)
+{
+ struct net *net = sock_net(q->asoc->base.sk);
+ int error = 0;
+
+ switch (reason) {
+ case SCTP_RTXR_T3_RTX:
+ SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS);
+ sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX);
+ /* Update the retran path if the T3-rtx timer has expired for
+ * the current retran path.
+ */
+ if (transport == transport->asoc->peer.retran_path)
+ sctp_assoc_update_retran_path(transport->asoc);
+ transport->asoc->rtx_data_chunks +=
+ transport->asoc->unack_data;
+ break;
+ case SCTP_RTXR_FAST_RTX:
+ SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS);
+ sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
+ q->fast_rtx = 1;
+ break;
+ case SCTP_RTXR_PMTUD:
+ SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS);
+ break;
+ case SCTP_RTXR_T1_RTX:
+ SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS);
+ transport->asoc->init_retries++;
+ break;
+ default:
+ BUG();
+ }
+
+ sctp_retransmit_mark(q, transport, reason);
+
+ /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination,
+ * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by
+ * following the procedures outlined in C1 - C5.
+ */
+ if (reason == SCTP_RTXR_T3_RTX)
+ sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+
+ /* Flush the queues only on timeout, since fast_rtx is only
+ * triggered during sack processing and the queue
+ * will be flushed at the end.
+ */
+ if (reason != SCTP_RTXR_FAST_RTX)
+ error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+
+ if (error)
+ q->asoc->base.sk->sk_err = -error;
+}
+
+/*
+ * Transmit DATA chunks on the retransmit queue. Upon return from
+ * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
+ * need to be transmitted by the caller.
+ * We assume that pkt->transport has already been set.
+ *
+ * The return value is a normal kernel error return value.
+ */
+static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
+ int rtx_timeout, int *start_timer)
+{
+ struct list_head *lqueue;
+ struct sctp_transport *transport = pkt->transport;
+ sctp_xmit_t status;
+ struct sctp_chunk *chunk, *chunk1;
+ int fast_rtx;
+ int error = 0;
+ int timer = 0;
+ int done = 0;
+
+ lqueue = &q->retransmit;
+ fast_rtx = q->fast_rtx;
+
+ /* This loop handles time-out retransmissions, fast retransmissions,
+ * and retransmissions due to opening of whindow.
+ *
+ * RFC 2960 6.3.3 Handle T3-rtx Expiration
+ *
+ * E3) Determine how many of the earliest (i.e., lowest TSN)
+ * outstanding DATA chunks for the address for which the
+ * T3-rtx has expired will fit into a single packet, subject
+ * to the MTU constraint for the path corresponding to the
+ * destination transport address to which the retransmission
+ * is being sent (this may be different from the address for
+ * which the timer expires [see Section 6.4]). Call this value
+ * K. Bundle and retransmit those K DATA chunks in a single
+ * packet to the destination endpoint.
+ *
+ * [Just to be painfully clear, if we are retransmitting
+ * because a timeout just happened, we should send only ONE
+ * packet of retransmitted data.]
+ *
+ * For fast retransmissions we also send only ONE packet. However,
+ * if we are just flushing the queue due to open window, we'll
+ * try to send as much as possible.
+ */
+ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) {
+ /* If the chunk is abandoned, move it to abandoned list. */
+ if (sctp_chunk_abandoned(chunk)) {
+ list_del_init(&chunk->transmitted_list);
+ sctp_insert_list(&q->abandoned,
+ &chunk->transmitted_list);
+ continue;
+ }
+
+ /* Make sure that Gap Acked TSNs are not retransmitted. A
+ * simple approach is just to move such TSNs out of the
+ * way and into a 'transmitted' queue and skip to the
+ * next chunk.
+ */
+ if (chunk->tsn_gap_acked) {
+ list_move_tail(&chunk->transmitted_list,
+ &transport->transmitted);
+ continue;
+ }
+
+ /* If we are doing fast retransmit, ignore non-fast_rtransmit
+ * chunks
+ */
+ if (fast_rtx && !chunk->fast_retransmit)
+ continue;
+
+redo:
+ /* Attempt to append this chunk to the packet. */
+ status = sctp_packet_append_chunk(pkt, chunk);
+
+ switch (status) {
+ case SCTP_XMIT_PMTU_FULL:
+ if (!pkt->has_data && !pkt->has_cookie_echo) {
+ /* If this packet did not contain DATA then
+ * retransmission did not happen, so do it
+ * again. We'll ignore the error here since
+ * control chunks are already freed so there
+ * is nothing we can do.
+ */
+ sctp_packet_transmit(pkt);
+ goto redo;
+ }
+
+ /* Send this packet. */
+ error = sctp_packet_transmit(pkt);
+
+ /* If we are retransmitting, we should only
+ * send a single packet.
+ * Otherwise, try appending this chunk again.
+ */
+ if (rtx_timeout || fast_rtx)
+ done = 1;
+ else
+ goto redo;
+
+ /* Bundle next chunk in the next round. */
+ break;
+
+ case SCTP_XMIT_RWND_FULL:
+ /* Send this packet. */
+ error = sctp_packet_transmit(pkt);
+
+ /* Stop sending DATA as there is no more room
+ * at the receiver.
+ */
+ done = 1;
+ break;
+
+ case SCTP_XMIT_DELAY:
+ /* Send this packet. */
+ error = sctp_packet_transmit(pkt);
+
+ /* Stop sending DATA because of nagle delay. */
+ done = 1;
+ break;
+
+ default:
+ /* The append was successful, so add this chunk to
+ * the transmitted list.
+ */
+ list_move_tail(&chunk->transmitted_list,
+ &transport->transmitted);
+
+ /* Mark the chunk as ineligible for fast retransmit
+ * after it is retransmitted.
+ */
+ if (chunk->fast_retransmit == SCTP_NEED_FRTX)
+ chunk->fast_retransmit = SCTP_DONT_FRTX;
+
+ q->asoc->stats.rtxchunks++;
+ break;
+ }
+
+ /* Set the timer if there were no errors */
+ if (!error && !timer)
+ timer = 1;
+
+ if (done)
+ break;
+ }
+
+ /* If we are here due to a retransmit timeout or a fast
+ * retransmit and if there are any chunks left in the retransmit
+ * queue that could not fit in the PMTU sized packet, they need
+ * to be marked as ineligible for a subsequent fast retransmit.
+ */
+ if (rtx_timeout || fast_rtx) {
+ list_for_each_entry(chunk1, lqueue, transmitted_list) {
+ if (chunk1->fast_retransmit == SCTP_NEED_FRTX)
+ chunk1->fast_retransmit = SCTP_DONT_FRTX;
+ }
+ }
+
+ *start_timer = timer;
+
+ /* Clear fast retransmit hint */
+ if (fast_rtx)
+ q->fast_rtx = 0;
+
+ return error;
+}
+
+/* Cork the outqueue so queued chunks are really queued. */
+int sctp_outq_uncork(struct sctp_outq *q)
+{
+ if (q->cork)
+ q->cork = 0;
+
+ return sctp_outq_flush(q, 0);
+}
+
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made. Today we use the sock lock to protect
+ * this function.
+ */
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
+{
+ struct sctp_packet *packet;
+ struct sctp_packet singleton;
+ struct sctp_association *asoc = q->asoc;
+ __u16 sport = asoc->base.bind_addr.port;
+ __u16 dport = asoc->peer.port;
+ __u32 vtag = asoc->peer.i.init_tag;
+ struct sctp_transport *transport = NULL;
+ struct sctp_transport *new_transport;
+ struct sctp_chunk *chunk, *tmp;
+ sctp_xmit_t status;
+ int error = 0;
+ int start_timer = 0;
+ int one_packet = 0;
+
+ /* These transports have chunks to send. */
+ struct list_head transport_list;
+ struct list_head *ltransport;
+
+ INIT_LIST_HEAD(&transport_list);
+ packet = NULL;
+
+ /*
+ * 6.10 Bundling
+ * ...
+ * When bundling control chunks with DATA chunks, an
+ * endpoint MUST place control chunks first in the outbound
+ * SCTP packet. The transmitter MUST transmit DATA chunks
+ * within a SCTP packet in increasing order of TSN.
+ * ...
+ */
+
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ /* RFC 5061, 5.3
+ * F1) This means that until such time as the ASCONF
+ * containing the add is acknowledged, the sender MUST
+ * NOT use the new IP address as a source for ANY SCTP
+ * packet except on carrying an ASCONF Chunk.
+ */
+ if (asoc->src_out_of_asoc_ok &&
+ chunk->chunk_hdr->type != SCTP_CID_ASCONF)
+ continue;
+
+ list_del_init(&chunk->list);
+
+ /* Pick the right transport to use. */
+ new_transport = chunk->transport;
+
+ if (!new_transport) {
+ /*
+ * If we have a prior transport pointer, see if
+ * the destination address of the chunk
+ * matches the destination address of the
+ * current transport. If not a match, then
+ * try to look up the transport with a given
+ * destination address. We do this because
+ * after processing ASCONFs, we may have new
+ * transports created.
+ */
+ if (transport &&
+ sctp_cmp_addr_exact(&chunk->dest,
+ &transport->ipaddr))
+ new_transport = transport;
+ else
+ new_transport = sctp_assoc_lookup_paddr(asoc,
+ &chunk->dest);
+
+ /* if we still don't have a new transport, then
+ * use the current active path.
+ */
+ if (!new_transport)
+ new_transport = asoc->peer.active_path;
+ } else if ((new_transport->state == SCTP_INACTIVE) ||
+ (new_transport->state == SCTP_UNCONFIRMED) ||
+ (new_transport->state == SCTP_PF)) {
+ /* If the chunk is Heartbeat or Heartbeat Ack,
+ * send it to chunk->transport, even if it's
+ * inactive.
+ *
+ * 3.3.6 Heartbeat Acknowledgement:
+ * ...
+ * A HEARTBEAT ACK is always sent to the source IP
+ * address of the IP datagram containing the
+ * HEARTBEAT chunk to which this ack is responding.
+ * ...
+ *
+ * ASCONF_ACKs also must be sent to the source.
+ */
+ if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT &&
+ chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK &&
+ chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK)
+ new_transport = asoc->peer.active_path;
+ }
+
+ /* Are we switching transports?
+ * Take care of transport locks.
+ */
+ if (new_transport != transport) {
+ transport = new_transport;
+ if (list_empty(&transport->send_ready)) {
+ list_add_tail(&transport->send_ready,
+ &transport_list);
+ }
+ packet = &transport->packet;
+ sctp_packet_config(packet, vtag,
+ asoc->peer.ecn_capable);
+ }
+
+ switch (chunk->chunk_hdr->type) {
+ /*
+ * 6.10 Bundling
+ * ...
+ * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
+ * COMPLETE with any other chunks. [Send them immediately.]
+ */
+ case SCTP_CID_INIT:
+ case SCTP_CID_INIT_ACK:
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ sctp_packet_init(&singleton, transport, sport, dport);
+ sctp_packet_config(&singleton, vtag, 0);
+ sctp_packet_append_chunk(&singleton, chunk);
+ error = sctp_packet_transmit(&singleton);
+ if (error < 0)
+ return error;
+ break;
+
+ case SCTP_CID_ABORT:
+ if (sctp_test_T_bit(chunk)) {
+ packet->vtag = asoc->c.my_vtag;
+ }
+ /* The following chunks are "response" chunks, i.e.
+ * they are generated in response to something we
+ * received. If we are sending these, then we can
+ * send only 1 packet containing these chunks.
+ */
+ case SCTP_CID_HEARTBEAT_ACK:
+ case SCTP_CID_SHUTDOWN_ACK:
+ case SCTP_CID_COOKIE_ACK:
+ case SCTP_CID_COOKIE_ECHO:
+ case SCTP_CID_ERROR:
+ case SCTP_CID_ECN_CWR:
+ case SCTP_CID_ASCONF_ACK:
+ one_packet = 1;
+ /* Fall through */
+
+ case SCTP_CID_SACK:
+ case SCTP_CID_HEARTBEAT:
+ case SCTP_CID_SHUTDOWN:
+ case SCTP_CID_ECN_ECNE:
+ case SCTP_CID_ASCONF:
+ case SCTP_CID_FWD_TSN:
+ status = sctp_packet_transmit_chunk(packet, chunk,
+ one_packet);
+ if (status != SCTP_XMIT_OK) {
+ /* put the chunk back */
+ list_add(&chunk->list, &q->control_chunk_list);
+ } else {
+ asoc->stats.octrlchunks++;
+ /* PR-SCTP C5) If a FORWARD TSN is sent, the
+ * sender MUST assure that at least one T3-rtx
+ * timer is running.
+ */
+ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN)
+ sctp_transport_reset_timers(transport);
+ }
+ break;
+
+ default:
+ /* We built a chunk with an illegal type! */
+ BUG();
+ }
+ }
+
+ if (q->asoc->src_out_of_asoc_ok)
+ goto sctp_flush_out;
+
+ /* Is it OK to send data chunks? */
+ switch (asoc->state) {
+ case SCTP_STATE_COOKIE_ECHOED:
+ /* Only allow bundling when this packet has a COOKIE-ECHO
+ * chunk.
+ */
+ if (!packet || !packet->has_cookie_echo)
+ break;
+
+ /* fallthru */
+ case SCTP_STATE_ESTABLISHED:
+ case SCTP_STATE_SHUTDOWN_PENDING:
+ case SCTP_STATE_SHUTDOWN_RECEIVED:
+ /*
+ * RFC 2960 6.1 Transmission of DATA Chunks
+ *
+ * C) When the time comes for the sender to transmit,
+ * before sending new DATA chunks, the sender MUST
+ * first transmit any outstanding DATA chunks which
+ * are marked for retransmission (limited by the
+ * current cwnd).
+ */
+ if (!list_empty(&q->retransmit)) {
+ if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
+ goto sctp_flush_out;
+ if (transport == asoc->peer.retran_path)
+ goto retran;
+
+ /* Switch transports & prepare the packet. */
+
+ transport = asoc->peer.retran_path;
+
+ if (list_empty(&transport->send_ready)) {
+ list_add_tail(&transport->send_ready,
+ &transport_list);
+ }
+
+ packet = &transport->packet;
+ sctp_packet_config(packet, vtag,
+ asoc->peer.ecn_capable);
+ retran:
+ error = sctp_outq_flush_rtx(q, packet,
+ rtx_timeout, &start_timer);
+
+ if (start_timer)
+ sctp_transport_reset_timers(transport);
+
+ /* This can happen on COOKIE-ECHO resend. Only
+ * one chunk can get bundled with a COOKIE-ECHO.
+ */
+ if (packet->has_cookie_echo)
+ goto sctp_flush_out;
+
+ /* Don't send new data if there is still data
+ * waiting to retransmit.
+ */
+ if (!list_empty(&q->retransmit))
+ goto sctp_flush_out;
+ }
+
+ /* Apply Max.Burst limitation to the current transport in
+ * case it will be used for new data. We are going to
+ * rest it before we return, but we want to apply the limit
+ * to the currently queued data.
+ */
+ if (transport)
+ sctp_transport_burst_limited(transport);
+
+ /* Finally, transmit new packets. */
+ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+ /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
+ * stream identifier.
+ */
+ if (chunk->sinfo.sinfo_stream >=
+ asoc->c.sinit_num_ostreams) {
+
+ /* Mark as failed send. */
+ sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
+ sctp_chunk_free(chunk);
+ continue;
+ }
+
+ /* Has this chunk expired? */
+ if (sctp_chunk_abandoned(chunk)) {
+ sctp_chunk_fail(chunk, 0);
+ sctp_chunk_free(chunk);
+ continue;
+ }
+
+ /* If there is a specified transport, use it.
+ * Otherwise, we want to use the active path.
+ */
+ new_transport = chunk->transport;
+ if (!new_transport ||
+ ((new_transport->state == SCTP_INACTIVE) ||
+ (new_transport->state == SCTP_UNCONFIRMED) ||
+ (new_transport->state == SCTP_PF)))
+ new_transport = asoc->peer.active_path;
+ if (new_transport->state == SCTP_UNCONFIRMED)
+ continue;
+
+ /* Change packets if necessary. */
+ if (new_transport != transport) {
+ transport = new_transport;
+
+ /* Schedule to have this transport's
+ * packet flushed.
+ */
+ if (list_empty(&transport->send_ready)) {
+ list_add_tail(&transport->send_ready,
+ &transport_list);
+ }
+
+ packet = &transport->packet;
+ sctp_packet_config(packet, vtag,
+ asoc->peer.ecn_capable);
+ /* We've switched transports, so apply the
+ * Burst limit to the new transport.
+ */
+ sctp_transport_burst_limited(transport);
+ }
+
+ pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
+ "skb->users:%d\n",
+ __func__, q, chunk, chunk && chunk->chunk_hdr ?
+ sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+ "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
+ chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
+ atomic_read(&chunk->skb->users) : -1);
+
+ /* Add the chunk to the packet. */
+ status = sctp_packet_transmit_chunk(packet, chunk, 0);
+
+ switch (status) {
+ case SCTP_XMIT_PMTU_FULL:
+ case SCTP_XMIT_RWND_FULL:
+ case SCTP_XMIT_DELAY:
+ /* We could not append this chunk, so put
+ * the chunk back on the output queue.
+ */
+ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
+ __func__, ntohl(chunk->subh.data_hdr->tsn),
+ status);
+
+ sctp_outq_head_data(q, chunk);
+ goto sctp_flush_out;
+
+ case SCTP_XMIT_OK:
+ /* The sender is in the SHUTDOWN-PENDING state,
+ * The sender MAY set the I-bit in the DATA
+ * chunk header.
+ */
+ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+ chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+ asoc->stats.ouodchunks++;
+ else
+ asoc->stats.oodchunks++;
+
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* BUG: We assume that the sctp_packet_transmit()
+ * call below will succeed all the time and add the
+ * chunk to the transmitted list and restart the
+ * timers.
+ * It is possible that the call can fail under OOM
+ * conditions.
+ *
+ * Is this really a problem? Won't this behave
+ * like a lost TSN?
+ */
+ list_add_tail(&chunk->transmitted_list,
+ &transport->transmitted);
+
+ sctp_transport_reset_timers(transport);
+
+ /* Only let one DATA chunk get bundled with a
+ * COOKIE-ECHO chunk.
+ */
+ if (packet->has_cookie_echo)
+ goto sctp_flush_out;
+ }
+ break;
+
+ default:
+ /* Do nothing. */
+ break;
+ }
+
+sctp_flush_out:
+
+ /* Before returning, examine all the transports touched in
+ * this call. Right now, we bluntly force clear all the
+ * transports. Things might change after we implement Nagle.
+ * But such an examination is still required.
+ *
+ * --xguo
+ */
+ while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) {
+ struct sctp_transport *t = list_entry(ltransport,
+ struct sctp_transport,
+ send_ready);
+ packet = &t->packet;
+ if (!sctp_packet_empty(packet))
+ error = sctp_packet_transmit(packet);
+
+ /* Clear the burst limited state, if any */
+ sctp_transport_burst_reset(t);
+ }
+
+ return error;
+}
+
+/* Update unack_data based on the incoming SACK chunk */
+static void sctp_sack_update_unack_data(struct sctp_association *assoc,
+ struct sctp_sackhdr *sack)
+{
+ sctp_sack_variable_t *frags;
+ __u16 unack_data;
+ int i;
+
+ unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1;
+
+ frags = sack->variable;
+ for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) {
+ unack_data -= ((ntohs(frags[i].gab.end) -
+ ntohs(frags[i].gab.start) + 1));
+ }
+
+ assoc->unack_data = unack_data;
+}
+
+/* This is where we REALLY process a SACK.
+ *
+ * Process the SACK against the outqueue. Mostly, this just frees
+ * things off the transmitted queue.
+ */
+int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
+{
+ struct sctp_association *asoc = q->asoc;
+ struct sctp_sackhdr *sack = chunk->subh.sack_hdr;
+ struct sctp_transport *transport;
+ struct sctp_chunk *tchunk = NULL;
+ struct list_head *lchunk, *transport_list, *temp;
+ sctp_sack_variable_t *frags = sack->variable;
+ __u32 sack_ctsn, ctsn, tsn;
+ __u32 highest_tsn, highest_new_tsn;
+ __u32 sack_a_rwnd;
+ unsigned int outstanding;
+ struct sctp_transport *primary = asoc->peer.primary_path;
+ int count_of_newacks = 0;
+ int gap_ack_blocks;
+ u8 accum_moved = 0;
+
+ /* Grab the association's destination address list. */
+ transport_list = &asoc->peer.transport_addr_list;
+
+ sack_ctsn = ntohl(sack->cum_tsn_ack);
+ gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
+ asoc->stats.gapcnt += gap_ack_blocks;
+ /*
+ * SFR-CACC algorithm:
+ * On receipt of a SACK the sender SHOULD execute the
+ * following statements.
+ *
+ * 1) If the cumulative ack in the SACK passes next tsn_at_change
+ * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be
+ * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for
+ * all destinations.
+ * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE
+ * is set the receiver of the SACK MUST take the following actions:
+ *
+ * A) Initialize the cacc_saw_newack to 0 for all destination
+ * addresses.
+ *
+ * Only bother if changeover_active is set. Otherwise, this is
+ * totally suboptimal to do on every SACK.
+ */
+ if (primary->cacc.changeover_active) {
+ u8 clear_cycling = 0;
+
+ if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) {
+ primary->cacc.changeover_active = 0;
+ clear_cycling = 1;
+ }
+
+ if (clear_cycling || gap_ack_blocks) {
+ list_for_each_entry(transport, transport_list,
+ transports) {
+ if (clear_cycling)
+ transport->cacc.cycling_changeover = 0;
+ if (gap_ack_blocks)
+ transport->cacc.cacc_saw_newack = 0;
+ }
+ }
+ }
+
+ /* Get the highest TSN in the sack. */
+ highest_tsn = sack_ctsn;
+ if (gap_ack_blocks)
+ highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
+
+ if (TSN_lt(asoc->highest_sacked, highest_tsn))
+ asoc->highest_sacked = highest_tsn;
+
+ highest_new_tsn = sack_ctsn;
+
+ /* Run through the retransmit queue. Credit bytes received
+ * and free those chunks that we can.
+ */
+ sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn);
+
+ /* Run through the transmitted queue.
+ * Credit bytes received and free those chunks which we can.
+ *
+ * This is a MASSIVE candidate for optimization.
+ */
+ list_for_each_entry(transport, transport_list, transports) {
+ sctp_check_transmitted(q, &transport->transmitted,
+ transport, &chunk->source, sack,
+ &highest_new_tsn);
+ /*
+ * SFR-CACC algorithm:
+ * C) Let count_of_newacks be the number of
+ * destinations for which cacc_saw_newack is set.
+ */
+ if (transport->cacc.cacc_saw_newack)
+ count_of_newacks++;
+ }
+
+ /* Move the Cumulative TSN Ack Point if appropriate. */
+ if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) {
+ asoc->ctsn_ack_point = sack_ctsn;
+ accum_moved = 1;
+ }
+
+ if (gap_ack_blocks) {
+
+ if (asoc->fast_recovery && accum_moved)
+ highest_new_tsn = highest_tsn;
+
+ list_for_each_entry(transport, transport_list, transports)
+ sctp_mark_missing(q, &transport->transmitted, transport,
+ highest_new_tsn, count_of_newacks);
+ }
+
+ /* Update unack_data field in the assoc. */
+ sctp_sack_update_unack_data(asoc, sack);
+
+ ctsn = asoc->ctsn_ack_point;
+
+ /* Throw away stuff rotting on the sack queue. */
+ list_for_each_safe(lchunk, temp, &q->sacked) {
+ tchunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ tsn = ntohl(tchunk->subh.data_hdr->tsn);
+ if (TSN_lte(tsn, ctsn)) {
+ list_del_init(&tchunk->transmitted_list);
+ sctp_chunk_free(tchunk);
+ }
+ }
+
+ /* ii) Set rwnd equal to the newly received a_rwnd minus the
+ * number of bytes still outstanding after processing the
+ * Cumulative TSN Ack and the Gap Ack Blocks.
+ */
+
+ sack_a_rwnd = ntohl(sack->a_rwnd);
+ outstanding = q->outstanding_bytes;
+
+ if (outstanding < sack_a_rwnd)
+ sack_a_rwnd -= outstanding;
+ else
+ sack_a_rwnd = 0;
+
+ asoc->peer.rwnd = sack_a_rwnd;
+
+ sctp_generate_fwdtsn(q, sack_ctsn);
+
+ pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn);
+ pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, "
+ "advertised peer ack point:0x%x\n", __func__, asoc, ctsn,
+ asoc->adv_peer_ack_point);
+
+ return sctp_outq_is_empty(q);
+}
+
+/* Is the outqueue empty?
+ * The queue is empty when we have not pending data, no in-flight data
+ * and nothing pending retransmissions.
+ */
+int sctp_outq_is_empty(const struct sctp_outq *q)
+{
+ return q->out_qlen == 0 && q->outstanding_bytes == 0 &&
+ list_empty(&q->retransmit);
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Go through a transport's transmitted list or the association's retransmit
+ * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked.
+ * The retransmit list will not have an associated transport.
+ *
+ * I added coherent debug information output. --xguo
+ *
+ * Instead of printing 'sacked' or 'kept' for each TSN on the
+ * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5.
+ * KEPT TSN6-TSN7, etc.
+ */
+static void sctp_check_transmitted(struct sctp_outq *q,
+ struct list_head *transmitted_queue,
+ struct sctp_transport *transport,
+ union sctp_addr *saddr,
+ struct sctp_sackhdr *sack,
+ __u32 *highest_new_tsn_in_sack)
+{
+ struct list_head *lchunk;
+ struct sctp_chunk *tchunk;
+ struct list_head tlist;
+ __u32 tsn;
+ __u32 sack_ctsn;
+ __u32 rtt;
+ __u8 restart_timer = 0;
+ int bytes_acked = 0;
+ int migrate_bytes = 0;
+ bool forward_progress = false;
+
+ sack_ctsn = ntohl(sack->cum_tsn_ack);
+
+ INIT_LIST_HEAD(&tlist);
+
+ /* The while loop will skip empty transmitted queues. */
+ while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) {
+ tchunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+
+ if (sctp_chunk_abandoned(tchunk)) {
+ /* Move the chunk to abandoned list. */
+ sctp_insert_list(&q->abandoned, lchunk);
+
+ /* If this chunk has not been acked, stop
+ * considering it as 'outstanding'.
+ */
+ if (!tchunk->tsn_gap_acked) {
+ if (tchunk->transport)
+ tchunk->transport->flight_size -=
+ sctp_data_size(tchunk);
+ q->outstanding_bytes -= sctp_data_size(tchunk);
+ }
+ continue;
+ }
+
+ tsn = ntohl(tchunk->subh.data_hdr->tsn);
+ if (sctp_acked(sack, tsn)) {
+ /* If this queue is the retransmit queue, the
+ * retransmit timer has already reclaimed
+ * the outstanding bytes for this chunk, so only
+ * count bytes associated with a transport.
+ */
+ if (transport) {
+ /* If this chunk is being used for RTT
+ * measurement, calculate the RTT and update
+ * the RTO using this value.
+ *
+ * 6.3.1 C5) Karn's algorithm: RTT measurements
+ * MUST NOT be made using packets that were
+ * retransmitted (and thus for which it is
+ * ambiguous whether the reply was for the
+ * first instance of the packet or a later
+ * instance).
+ */
+ if (!tchunk->tsn_gap_acked &&
+ !tchunk->resent &&
+ tchunk->rtt_in_progress) {
+ tchunk->rtt_in_progress = 0;
+ rtt = jiffies - tchunk->sent_at;
+ sctp_transport_update_rto(transport,
+ rtt);
+ }
+ }
+
+ /* If the chunk hasn't been marked as ACKED,
+ * mark it and account bytes_acked if the
+ * chunk had a valid transport (it will not
+ * have a transport if ASCONF had deleted it
+ * while DATA was outstanding).
+ */
+ if (!tchunk->tsn_gap_acked) {
+ tchunk->tsn_gap_acked = 1;
+ if (TSN_lt(*highest_new_tsn_in_sack, tsn))
+ *highest_new_tsn_in_sack = tsn;
+ bytes_acked += sctp_data_size(tchunk);
+ if (!tchunk->transport)
+ migrate_bytes += sctp_data_size(tchunk);
+ forward_progress = true;
+ }
+
+ if (TSN_lte(tsn, sack_ctsn)) {
+ /* RFC 2960 6.3.2 Retransmission Timer Rules
+ *
+ * R3) Whenever a SACK is received
+ * that acknowledges the DATA chunk
+ * with the earliest outstanding TSN
+ * for that address, restart T3-rtx
+ * timer for that address with its
+ * current RTO.
+ */
+ restart_timer = 1;
+ forward_progress = true;
+
+ if (!tchunk->tsn_gap_acked) {
+ /*
+ * SFR-CACC algorithm:
+ * 2) If the SACK contains gap acks
+ * and the flag CHANGEOVER_ACTIVE is
+ * set the receiver of the SACK MUST
+ * take the following action:
+ *
+ * B) For each TSN t being acked that
+ * has not been acked in any SACK so
+ * far, set cacc_saw_newack to 1 for
+ * the destination that the TSN was
+ * sent to.
+ */
+ if (transport &&
+ sack->num_gap_ack_blocks &&
+ q->asoc->peer.primary_path->cacc.
+ changeover_active)
+ transport->cacc.cacc_saw_newack
+ = 1;
+ }
+
+ list_add_tail(&tchunk->transmitted_list,
+ &q->sacked);
+ } else {
+ /* RFC2960 7.2.4, sctpimpguide-05 2.8.2
+ * M2) Each time a SACK arrives reporting
+ * 'Stray DATA chunk(s)' record the highest TSN
+ * reported as newly acknowledged, call this
+ * value 'HighestTSNinSack'. A newly
+ * acknowledged DATA chunk is one not
+ * previously acknowledged in a SACK.
+ *
+ * When the SCTP sender of data receives a SACK
+ * chunk that acknowledges, for the first time,
+ * the receipt of a DATA chunk, all the still
+ * unacknowledged DATA chunks whose TSN is
+ * older than that newly acknowledged DATA
+ * chunk, are qualified as 'Stray DATA chunks'.
+ */
+ list_add_tail(lchunk, &tlist);
+ }
+ } else {
+ if (tchunk->tsn_gap_acked) {
+ pr_debug("%s: receiver reneged on data TSN:0x%x\n",
+ __func__, tsn);
+
+ tchunk->tsn_gap_acked = 0;
+
+ if (tchunk->transport)
+ bytes_acked -= sctp_data_size(tchunk);
+
+ /* RFC 2960 6.3.2 Retransmission Timer Rules
+ *
+ * R4) Whenever a SACK is received missing a
+ * TSN that was previously acknowledged via a
+ * Gap Ack Block, start T3-rtx for the
+ * destination address to which the DATA
+ * chunk was originally
+ * transmitted if it is not already running.
+ */
+ restart_timer = 1;
+ }
+
+ list_add_tail(lchunk, &tlist);
+ }
+ }
+
+ if (transport) {
+ if (bytes_acked) {
+ struct sctp_association *asoc = transport->asoc;
+
+ /* We may have counted DATA that was migrated
+ * to this transport due to DEL-IP operation.
+ * Subtract those bytes, since the were never
+ * send on this transport and shouldn't be
+ * credited to this transport.
+ */
+ bytes_acked -= migrate_bytes;
+
+ /* 8.2. When an outstanding TSN is acknowledged,
+ * the endpoint shall clear the error counter of
+ * the destination transport address to which the
+ * DATA chunk was last sent.
+ * The association's overall error counter is
+ * also cleared.
+ */
+ transport->error_count = 0;
+ transport->asoc->overall_error_count = 0;
+ forward_progress = true;
+
+ /*
+ * While in SHUTDOWN PENDING, we may have started
+ * the T5 shutdown guard timer after reaching the
+ * retransmission limit. Stop that timer as soon
+ * as the receiver acknowledged any data.
+ */
+ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING &&
+ del_timer(&asoc->timers
+ [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]))
+ sctp_association_put(asoc);
+
+ /* Mark the destination transport address as
+ * active if it is not so marked.
+ */
+ if ((transport->state == SCTP_INACTIVE ||
+ transport->state == SCTP_UNCONFIRMED) &&
+ sctp_cmp_addr_exact(&transport->ipaddr, saddr)) {
+ sctp_assoc_control_transport(
+ transport->asoc,
+ transport,
+ SCTP_TRANSPORT_UP,
+ SCTP_RECEIVED_SACK);
+ }
+
+ sctp_transport_raise_cwnd(transport, sack_ctsn,
+ bytes_acked);
+
+ transport->flight_size -= bytes_acked;
+ if (transport->flight_size == 0)
+ transport->partial_bytes_acked = 0;
+ q->outstanding_bytes -= bytes_acked + migrate_bytes;
+ } else {
+ /* RFC 2960 6.1, sctpimpguide-06 2.15.2
+ * When a sender is doing zero window probing, it
+ * should not timeout the association if it continues
+ * to receive new packets from the receiver. The
+ * reason is that the receiver MAY keep its window
+ * closed for an indefinite time.
+ * A sender is doing zero window probing when the
+ * receiver's advertised window is zero, and there is
+ * only one data chunk in flight to the receiver.
+ *
+ * Allow the association to timeout while in SHUTDOWN
+ * PENDING or SHUTDOWN RECEIVED in case the receiver
+ * stays in zero window mode forever.
+ */
+ if (!q->asoc->peer.rwnd &&
+ !list_empty(&tlist) &&
+ (sack_ctsn+2 == q->asoc->next_tsn) &&
+ q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) {
+ pr_debug("%s: sack received for zero window "
+ "probe:%u\n", __func__, sack_ctsn);
+
+ q->asoc->overall_error_count = 0;
+ transport->error_count = 0;
+ }
+ }
+
+ /* RFC 2960 6.3.2 Retransmission Timer Rules
+ *
+ * R2) Whenever all outstanding data sent to an address have
+ * been acknowledged, turn off the T3-rtx timer of that
+ * address.
+ */
+ if (!transport->flight_size) {
+ if (del_timer(&transport->T3_rtx_timer))
+ sctp_transport_put(transport);
+ } else if (restart_timer) {
+ if (!mod_timer(&transport->T3_rtx_timer,
+ jiffies + transport->rto))
+ sctp_transport_hold(transport);
+ }
+
+ if (forward_progress) {
+ if (transport->dst)
+ dst_confirm(transport->dst);
+ }
+ }
+
+ list_splice(&tlist, transmitted_queue);
+}
+
+/* Mark chunks as missing and consequently may get retransmitted. */
+static void sctp_mark_missing(struct sctp_outq *q,
+ struct list_head *transmitted_queue,
+ struct sctp_transport *transport,
+ __u32 highest_new_tsn_in_sack,
+ int count_of_newacks)
+{
+ struct sctp_chunk *chunk;
+ __u32 tsn;
+ char do_fast_retransmit = 0;
+ struct sctp_association *asoc = q->asoc;
+ struct sctp_transport *primary = asoc->peer.primary_path;
+
+ list_for_each_entry(chunk, transmitted_queue, transmitted_list) {
+
+ tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+ /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all
+ * 'Unacknowledged TSN's', if the TSN number of an
+ * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack'
+ * value, increment the 'TSN.Missing.Report' count on that
+ * chunk if it has NOT been fast retransmitted or marked for
+ * fast retransmit already.
+ */
+ if (chunk->fast_retransmit == SCTP_CAN_FRTX &&
+ !chunk->tsn_gap_acked &&
+ TSN_lt(tsn, highest_new_tsn_in_sack)) {
+
+ /* SFR-CACC may require us to skip marking
+ * this chunk as missing.
+ */
+ if (!transport || !sctp_cacc_skip(primary,
+ chunk->transport,
+ count_of_newacks, tsn)) {
+ chunk->tsn_missing_report++;
+
+ pr_debug("%s: tsn:0x%x missing counter:%d\n",
+ __func__, tsn, chunk->tsn_missing_report);
+ }
+ }
+ /*
+ * M4) If any DATA chunk is found to have a
+ * 'TSN.Missing.Report'
+ * value larger than or equal to 3, mark that chunk for
+ * retransmission and start the fast retransmit procedure.
+ */
+
+ if (chunk->tsn_missing_report >= 3) {
+ chunk->fast_retransmit = SCTP_NEED_FRTX;
+ do_fast_retransmit = 1;
+ }
+ }
+
+ if (transport) {
+ if (do_fast_retransmit)
+ sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX);
+
+ pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, "
+ "flight_size:%d, pba:%d\n", __func__, transport,
+ transport->cwnd, transport->ssthresh,
+ transport->flight_size, transport->partial_bytes_acked);
+ }
+}
+
+/* Is the given TSN acked by this packet? */
+static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
+{
+ int i;
+ sctp_sack_variable_t *frags;
+ __u16 gap;
+ __u32 ctsn = ntohl(sack->cum_tsn_ack);
+
+ if (TSN_lte(tsn, ctsn))
+ goto pass;
+
+ /* 3.3.4 Selective Acknowledgement (SACK) (3):
+ *
+ * Gap Ack Blocks:
+ * These fields contain the Gap Ack Blocks. They are repeated
+ * for each Gap Ack Block up to the number of Gap Ack Blocks
+ * defined in the Number of Gap Ack Blocks field. All DATA
+ * chunks with TSNs greater than or equal to (Cumulative TSN
+ * Ack + Gap Ack Block Start) and less than or equal to
+ * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack
+ * Block are assumed to have been received correctly.
+ */
+
+ frags = sack->variable;
+ gap = tsn - ctsn;
+ for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) {
+ if (TSN_lte(ntohs(frags[i].gab.start), gap) &&
+ TSN_lte(gap, ntohs(frags[i].gab.end)))
+ goto pass;
+ }
+
+ return 0;
+pass:
+ return 1;
+}
+
+static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
+ int nskips, __be16 stream)
+{
+ int i;
+
+ for (i = 0; i < nskips; i++) {
+ if (skiplist[i].stream == stream)
+ return i;
+ }
+ return i;
+}
+
+/* Create and add a fwdtsn chunk to the outq's control queue if needed. */
+static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
+{
+ struct sctp_association *asoc = q->asoc;
+ struct sctp_chunk *ftsn_chunk = NULL;
+ struct sctp_fwdtsn_skip ftsn_skip_arr[10];
+ int nskips = 0;
+ int skip_pos = 0;
+ __u32 tsn;
+ struct sctp_chunk *chunk;
+ struct list_head *lchunk, *temp;
+
+ if (!asoc->peer.prsctp_capable)
+ return;
+
+ /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
+ * received SACK.
+ *
+ * If (Advanced.Peer.Ack.Point < SackCumAck), then update
+ * Advanced.Peer.Ack.Point to be equal to SackCumAck.
+ */
+ if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
+ asoc->adv_peer_ack_point = ctsn;
+
+ /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point"
+ * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as
+ * the chunk next in the out-queue space is marked as "abandoned" as
+ * shown in the following example:
+ *
+ * Assuming that a SACK arrived with the Cumulative TSN ACK 102
+ * and the Advanced.Peer.Ack.Point is updated to this value:
+ *
+ * out-queue at the end of ==> out-queue after Adv.Ack.Point
+ * normal SACK processing local advancement
+ * ... ...
+ * Adv.Ack.Pt-> 102 acked 102 acked
+ * 103 abandoned 103 abandoned
+ * 104 abandoned Adv.Ack.P-> 104 abandoned
+ * 105 105
+ * 106 acked 106 acked
+ * ... ...
+ *
+ * In this example, the data sender successfully advanced the
+ * "Advanced.Peer.Ack.Point" from 102 to 104 locally.
+ */
+ list_for_each_safe(lchunk, temp, &q->abandoned) {
+ chunk = list_entry(lchunk, struct sctp_chunk,
+ transmitted_list);
+ tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+ /* Remove any chunks in the abandoned queue that are acked by
+ * the ctsn.
+ */
+ if (TSN_lte(tsn, ctsn)) {
+ list_del_init(lchunk);
+ sctp_chunk_free(chunk);
+ } else {
+ if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) {
+ asoc->adv_peer_ack_point = tsn;
+ if (chunk->chunk_hdr->flags &
+ SCTP_DATA_UNORDERED)
+ continue;
+ skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0],
+ nskips,
+ chunk->subh.data_hdr->stream);
+ ftsn_skip_arr[skip_pos].stream =
+ chunk->subh.data_hdr->stream;
+ ftsn_skip_arr[skip_pos].ssn =
+ chunk->subh.data_hdr->ssn;
+ if (skip_pos == nskips)
+ nskips++;
+ if (nskips == 10)
+ break;
+ } else
+ break;
+ }
+ }
+
+ /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point"
+ * is greater than the Cumulative TSN ACK carried in the received
+ * SACK, the data sender MUST send the data receiver a FORWARD TSN
+ * chunk containing the latest value of the
+ * "Advanced.Peer.Ack.Point".
+ *
+ * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD
+ * list each stream and sequence number in the forwarded TSN. This
+ * information will enable the receiver to easily find any
+ * stranded TSN's waiting on stream reorder queues. Each stream
+ * SHOULD only be reported once; this means that if multiple
+ * abandoned messages occur in the same stream then only the
+ * highest abandoned stream sequence number is reported. If the
+ * total size of the FORWARD TSN does NOT fit in a single MTU then
+ * the sender of the FORWARD TSN SHOULD lower the
+ * Advanced.Peer.Ack.Point to the last TSN that will fit in a
+ * single MTU.
+ */
+ if (asoc->adv_peer_ack_point > ctsn)
+ ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point,
+ nskips, &ftsn_skip_arr[0]);
+
+ if (ftsn_chunk) {
+ list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
+ SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+ }
+}
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
new file mode 100644
index 000000000..ab8d9f96a
--- /dev/null
+++ b/net/sctp/primitive.c
@@ -0,0 +1,213 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement the SCTP primitive functions from Section 10.
+ *
+ * Note that the descriptions from the specification are USER level
+ * functions--this file is the functions which populate the struct proto
+ * for SCTP which is the BOTTOM of the sockets interface.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Narasimha Budihal <narasimha@refcode.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Kevin Gao <kevin.gao@intel.com>
+ */
+
+#include <linux/types.h>
+#include <linux/list.h> /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/time.h> /* For struct timeval */
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+#define DECLARE_PRIMITIVE(name) \
+/* This is called in the code as sctp_primitive_ ## name. */ \
+int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
+ void *arg) { \
+ int error = 0; \
+ sctp_event_t event_type; sctp_subtype_t subtype; \
+ sctp_state_t state; \
+ struct sctp_endpoint *ep; \
+ \
+ event_type = SCTP_EVENT_T_PRIMITIVE; \
+ subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \
+ state = asoc ? asoc->state : SCTP_STATE_CLOSED; \
+ ep = asoc ? asoc->ep : NULL; \
+ \
+ error = sctp_do_sm(net, event_type, subtype, state, ep, asoc, \
+ arg, GFP_KERNEL); \
+ return error; \
+}
+
+/* 10.1 ULP-to-SCTP
+ * B) Associate
+ *
+ * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
+ * outbound stream count)
+ * -> association id [,destination transport addr list] [,outbound stream
+ * count]
+ *
+ * This primitive allows the upper layer to initiate an association to a
+ * specific peer endpoint.
+ *
+ * This version assumes that asoc is fully populated with the initial
+ * parameters. We then return a traditional kernel indicator of
+ * success or failure.
+ */
+
+/* This is called in the code as sctp_primitive_ASSOCIATE. */
+
+DECLARE_PRIMITIVE(ASSOCIATE)
+
+/* 10.1 ULP-to-SCTP
+ * C) Shutdown
+ *
+ * Format: SHUTDOWN(association id)
+ * -> result
+ *
+ * Gracefully closes an association. Any locally queued user data
+ * will be delivered to the peer. The association will be terminated only
+ * after the peer acknowledges all the SCTP packets sent. A success code
+ * will be returned on successful termination of the association. If
+ * attempting to terminate the association results in a failure, an error
+ * code shall be returned.
+ */
+
+DECLARE_PRIMITIVE(SHUTDOWN);
+
+/* 10.1 ULP-to-SCTP
+ * C) Abort
+ *
+ * Format: Abort(association id [, cause code])
+ * -> result
+ *
+ * Ungracefully closes an association. Any locally queued user data
+ * will be discarded and an ABORT chunk is sent to the peer. A success
+ * code will be returned on successful abortion of the association. If
+ * attempting to abort the association results in a failure, an error
+ * code shall be returned.
+ */
+
+DECLARE_PRIMITIVE(ABORT);
+
+/* 10.1 ULP-to-SCTP
+ * E) Send
+ *
+ * Format: SEND(association id, buffer address, byte count [,context]
+ * [,stream id] [,life time] [,destination transport address]
+ * [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
+ * -> result
+ *
+ * This is the main method to send user data via SCTP.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o buffer address - the location where the user message to be
+ * transmitted is stored;
+ *
+ * o byte count - The size of the user data in number of bytes;
+ *
+ * Optional attributes:
+ *
+ * o context - an optional 32 bit integer that will be carried in the
+ * sending failure notification to the ULP if the transportation of
+ * this User Message fails.
+ *
+ * o stream id - to indicate which stream to send the data on. If not
+ * specified, stream 0 will be used.
+ *
+ * o life time - specifies the life time of the user data. The user data
+ * will not be sent by SCTP after the life time expires. This
+ * parameter can be used to avoid efforts to transmit stale
+ * user messages. SCTP notifies the ULP if the data cannot be
+ * initiated to transport (i.e. sent to the destination via SCTP's
+ * send primitive) within the life time variable. However, the
+ * user data will be transmitted if SCTP has attempted to transmit a
+ * chunk before the life time expired.
+ *
+ * o destination transport address - specified as one of the destination
+ * transport addresses of the peer endpoint to which this packet
+ * should be sent. Whenever possible, SCTP should use this destination
+ * transport address for sending the packets, instead of the current
+ * primary path.
+ *
+ * o unorder flag - this flag, if present, indicates that the user
+ * would like the data delivered in an unordered fashion to the peer
+ * (i.e., the U flag is set to 1 on all DATA chunks carrying this
+ * message).
+ *
+ * o no-bundle flag - instructs SCTP not to bundle this user data with
+ * other outbound DATA chunks. SCTP MAY still bundle even when
+ * this flag is present, when faced with network congestion.
+ *
+ * o payload protocol-id - A 32 bit unsigned integer that is to be
+ * passed to the peer indicating the type of payload protocol data
+ * being transmitted. This value is passed as opaque data by SCTP.
+ */
+
+DECLARE_PRIMITIVE(SEND);
+
+/* 10.1 ULP-to-SCTP
+ * J) Request Heartbeat
+ *
+ * Format: REQUESTHEARTBEAT(association id, destination transport address)
+ *
+ * -> result
+ *
+ * Instructs the local endpoint to perform a HeartBeat on the specified
+ * destination transport address of the given association. The returned
+ * result should indicate whether the transmission of the HEARTBEAT
+ * chunk to the destination address is successful.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o destination transport address - the transport address of the
+ * association on which a heartbeat should be issued.
+ */
+
+DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
+
+/* ADDIP
+* 3.1.1 Address Configuration Change Chunk (ASCONF)
+*
+* This chunk is used to communicate to the remote endpoint one of the
+* configuration change requests that MUST be acknowledged. The
+* information carried in the ASCONF Chunk uses the form of a
+* Type-Length-Value (TLV), as described in "3.2.1 Optional/
+* Variable-length Parameter Format" in RFC2960 [5], forall variable
+* parameters.
+*/
+
+DECLARE_PRIMITIVE(ASCONF);
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
new file mode 100644
index 000000000..5e68b94ee
--- /dev/null
+++ b/net/sctp/probe.c
@@ -0,0 +1,243 @@
+/*
+ * sctp_probe - Observe the SCTP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for SCTP from Stephen Hemminger's code
+ * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+MODULE_SOFTDEP("pre: sctp");
+MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
+MODULE_DESCRIPTION("SCTP snooper");
+MODULE_LICENSE("GPL");
+
+static int port __read_mostly = 0;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static unsigned int fwmark __read_mostly = 0;
+MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
+module_param(fwmark, uint, 0);
+
+static int bufsize __read_mostly = 64 * 1024;
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+static int full __read_mostly = 1;
+MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
+module_param(full, int, 0);
+
+static const char procname[] = "sctpprobe";
+
+static struct {
+ struct kfifo fifo;
+ spinlock_t lock;
+ wait_queue_head_t wait;
+ struct timespec tstart;
+} sctpw;
+
+static __printf(1, 2) void printl(const char *fmt, ...)
+{
+ va_list args;
+ int len;
+ char tbuf[256];
+
+ va_start(args, fmt);
+ len = vscnprintf(tbuf, sizeof(tbuf), fmt, args);
+ va_end(args);
+
+ kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
+ wake_up(&sctpw.wait);
+}
+
+static int sctpprobe_open(struct inode *inode, struct file *file)
+{
+ kfifo_reset(&sctpw.fifo);
+ getnstimeofday(&sctpw.tstart);
+
+ return 0;
+}
+
+static ssize_t sctpprobe_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int error = 0, cnt = 0;
+ unsigned char *tbuf;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ tbuf = vmalloc(len);
+ if (!tbuf)
+ return -ENOMEM;
+
+ error = wait_event_interruptible(sctpw.wait,
+ kfifo_len(&sctpw.fifo) != 0);
+ if (error)
+ goto out_free;
+
+ cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
+ error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
+
+out_free:
+ vfree(tbuf);
+
+ return error ? error : cnt;
+}
+
+static const struct file_operations sctpprobe_fops = {
+ .owner = THIS_MODULE,
+ .open = sctpprobe_open,
+ .read = sctpprobe_read,
+ .llseek = noop_llseek,
+};
+
+static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sk_buff *skb = chunk->skb;
+ struct sctp_transport *sp;
+ static __u32 lcwnd = 0;
+ struct timespec now;
+
+ sp = asoc->peer.primary_path;
+
+ if (((port == 0 && fwmark == 0) ||
+ asoc->peer.port == port ||
+ ep->base.bind_addr.port == port ||
+ (fwmark > 0 && skb->mark == fwmark)) &&
+ (full || sp->cwnd != lcwnd)) {
+ lcwnd = sp->cwnd;
+
+ getnstimeofday(&now);
+ now = timespec_sub(now, sctpw.tstart);
+
+ printl("%lu.%06lu ", (unsigned long) now.tv_sec,
+ (unsigned long) now.tv_nsec / NSEC_PER_USEC);
+
+ printl("%p %5d %5d %5d %8d %5d ", asoc,
+ ep->base.bind_addr.port, asoc->peer.port,
+ asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
+
+ list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+ transports) {
+ if (sp == asoc->peer.primary_path)
+ printl("*");
+
+ printl("%pISc %2u %8u %8u %8u %8u %8u ",
+ &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
+ sp->flight_size, sp->partial_bytes_acked,
+ sp->pathmtu);
+ }
+ printl("\n");
+ }
+
+ jprobe_return();
+ return 0;
+}
+
+static struct jprobe sctp_recv_probe = {
+ .kp = {
+ .symbol_name = "sctp_sf_eat_sack_6_2",
+ },
+ .entry = jsctp_sf_eat_sack,
+};
+
+static __init int sctp_setup_jprobe(void)
+{
+ int ret = register_jprobe(&sctp_recv_probe);
+
+ if (ret) {
+ if (request_module("sctp"))
+ goto out;
+ ret = register_jprobe(&sctp_recv_probe);
+ }
+
+out:
+ return ret;
+}
+
+static __init int sctpprobe_init(void)
+{
+ int ret = -ENOMEM;
+
+ /* Warning: if the function signature of sctp_sf_eat_sack_6_2,
+ * has been changed, you also have to change the signature of
+ * jsctp_sf_eat_sack, otherwise you end up right here!
+ */
+ BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2,
+ jsctp_sf_eat_sack) == 0);
+
+ init_waitqueue_head(&sctpw.wait);
+ spin_lock_init(&sctpw.lock);
+ if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
+ return ret;
+
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net,
+ &sctpprobe_fops))
+ goto free_kfifo;
+
+ ret = sctp_setup_jprobe();
+ if (ret)
+ goto remove_proc;
+
+ pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
+ port, fwmark, bufsize);
+ return 0;
+
+remove_proc:
+ remove_proc_entry(procname, init_net.proc_net);
+free_kfifo:
+ kfifo_free(&sctpw.fifo);
+ return ret;
+}
+
+static __exit void sctpprobe_exit(void)
+{
+ kfifo_free(&sctpw.fifo);
+ remove_proc_entry(procname, init_net.proc_net);
+ unregister_jprobe(&sctp_recv_probe);
+}
+
+module_init(sctpprobe_init);
+module_exit(sctpprobe_exit);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
new file mode 100644
index 000000000..0697eda5a
--- /dev/null
+++ b/net/sctp/proc.c
@@ -0,0 +1,555 @@
+/* SCTP kernel implementation
+ * Copyright (c) 2003 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <net/sctp/sctp.h>
+#include <net/ip.h> /* for snmp_fold_field */
+
+static const struct snmp_mib sctp_snmp_list[] = {
+ SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB),
+ SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS),
+ SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS),
+ SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS),
+ SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS),
+ SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES),
+ SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS),
+ SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS),
+ SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS),
+ SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS),
+ SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS),
+ SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS),
+ SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS),
+ SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS),
+ SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
+ SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
+ SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
+ SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS),
+ SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS),
+ SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS),
+ SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS),
+ SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS),
+ SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS),
+ SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ),
+ SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG),
+ SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
+ SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS),
+ SNMP_MIB_SENTINEL
+};
+
+/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
+static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq->private;
+ int i;
+
+ for (i = 0; sctp_snmp_list[i].name != NULL; i++)
+ seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
+ snmp_fold_field(net->sctp.sctp_statistics,
+ sctp_snmp_list[i].entry));
+
+ return 0;
+}
+
+/* Initialize the seq file operations for 'snmp' object. */
+static int sctp_snmp_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, sctp_snmp_seq_show);
+}
+
+static const struct file_operations sctp_snmp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = sctp_snmp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+/* Set up the proc fs entry for 'snmp' object. */
+int __net_init sctp_snmp_proc_init(struct net *net)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
+ &sctp_snmp_seq_fops);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Cleanup the proc fs entry for 'snmp' object. */
+void sctp_snmp_proc_exit(struct net *net)
+{
+ remove_proc_entry("snmp", net->sctp.proc_net_sctp);
+}
+
+/* Dump local addresses of an association/endpoint. */
+static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
+{
+ struct sctp_association *asoc;
+ struct sctp_sockaddr_entry *laddr;
+ struct sctp_transport *peer;
+ union sctp_addr *addr, *primary = NULL;
+ struct sctp_af *af;
+
+ if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
+ asoc = sctp_assoc(epb);
+
+ peer = asoc->peer.primary_path;
+ if (unlikely(peer == NULL)) {
+ WARN(1, "Association %p with NULL primary path!\n", asoc);
+ return;
+ }
+
+ primary = &peer->saddr;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &epb->bind_addr.address_list, list) {
+ if (!laddr->valid)
+ continue;
+
+ addr = &laddr->a;
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ if (primary && af->cmp_addr(addr, primary)) {
+ seq_printf(seq, "*");
+ }
+ af->seq_dump_addr(seq, addr);
+ }
+ rcu_read_unlock();
+}
+
+/* Dump remote addresses of an association. */
+static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc)
+{
+ struct sctp_transport *transport;
+ union sctp_addr *addr, *primary;
+ struct sctp_af *af;
+
+ primary = &assoc->peer.primary_addr;
+ rcu_read_lock();
+ list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list,
+ transports) {
+ addr = &transport->ipaddr;
+ if (transport->dead)
+ continue;
+
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ if (af->cmp_addr(addr, primary)) {
+ seq_printf(seq, "*");
+ }
+ af->seq_dump_addr(seq, addr);
+ }
+ rcu_read_unlock();
+}
+
+static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= sctp_ep_hashsize)
+ return NULL;
+
+ if (*pos < 0)
+ *pos = 0;
+
+ if (*pos == 0)
+ seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n");
+
+ return (void *)pos;
+}
+
+static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+
+static void *sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ if (++*pos >= sctp_ep_hashsize)
+ return NULL;
+
+ return pos;
+}
+
+
+/* Display sctp endpoints (/proc/net/sctp/eps). */
+static int sctp_eps_seq_show(struct seq_file *seq, void *v)
+{
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ struct sctp_endpoint *ep;
+ struct sock *sk;
+ int hash = *(loff_t *)v;
+
+ if (hash >= sctp_ep_hashsize)
+ return -ENOMEM;
+
+ head = &sctp_ep_hashtable[hash];
+ local_bh_disable();
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ ep = sctp_ep(epb);
+ sk = epb->sk;
+ if (!net_eq(sock_net(sk), seq_file_net(seq)))
+ continue;
+ seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk,
+ sctp_sk(sk)->type, sk->sk_state, hash,
+ epb->bind_addr.port,
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk));
+
+ sctp_seq_dump_local_addrs(seq, epb);
+ seq_printf(seq, "\n");
+ }
+ read_unlock(&head->lock);
+ local_bh_enable();
+
+ return 0;
+}
+
+static const struct seq_operations sctp_eps_ops = {
+ .start = sctp_eps_seq_start,
+ .next = sctp_eps_seq_next,
+ .stop = sctp_eps_seq_stop,
+ .show = sctp_eps_seq_show,
+};
+
+
+/* Initialize the seq file operations for 'eps' object. */
+static int sctp_eps_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &sctp_eps_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations sctp_eps_seq_fops = {
+ .open = sctp_eps_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/* Set up the proc fs entry for 'eps' object. */
+int __net_init sctp_eps_proc_init(struct net *net)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
+ &sctp_eps_seq_fops);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Cleanup the proc fs entry for 'eps' object. */
+void sctp_eps_proc_exit(struct net *net)
+{
+ remove_proc_entry("eps", net->sctp.proc_net_sctp);
+}
+
+
+static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= sctp_assoc_hashsize)
+ return NULL;
+
+ if (*pos < 0)
+ *pos = 0;
+
+ if (*pos == 0)
+ seq_printf(seq, " ASSOC SOCK STY SST ST HBKT "
+ "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
+ "RPORT LADDRS <-> RADDRS "
+ "HBINT INS OUTS MAXRT T1X T2X RTXC "
+ "wmema wmemq sndbuf rcvbuf\n");
+
+ return (void *)pos;
+}
+
+static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+
+static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ if (++*pos >= sctp_assoc_hashsize)
+ return NULL;
+
+ return pos;
+}
+
+/* Display sctp associations (/proc/net/sctp/assocs). */
+static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
+{
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ struct sctp_association *assoc;
+ struct sock *sk;
+ int hash = *(loff_t *)v;
+
+ if (hash >= sctp_assoc_hashsize)
+ return -ENOMEM;
+
+ head = &sctp_assoc_hashtable[hash];
+ local_bh_disable();
+ read_lock(&head->lock);
+ sctp_for_each_hentry(epb, &head->chain) {
+ assoc = sctp_assoc(epb);
+ sk = epb->sk;
+ if (!net_eq(sock_net(sk), seq_file_net(seq)))
+ continue;
+ seq_printf(seq,
+ "%8pK %8pK %-3d %-3d %-2d %-4d "
+ "%4d %8d %8d %7u %5lu %-5d %5d ",
+ assoc, sk, sctp_sk(sk)->type, sk->sk_state,
+ assoc->state, hash,
+ assoc->assoc_id,
+ assoc->sndbuf_used,
+ atomic_read(&assoc->rmem_alloc),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk),
+ epb->bind_addr.port,
+ assoc->peer.port);
+ seq_printf(seq, " ");
+ sctp_seq_dump_local_addrs(seq, epb);
+ seq_printf(seq, "<-> ");
+ sctp_seq_dump_remote_addrs(seq, assoc);
+ seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
+ "%8d %8d %8d %8d",
+ assoc->hbinterval, assoc->c.sinit_max_instreams,
+ assoc->c.sinit_num_ostreams, assoc->max_retrans,
+ assoc->init_retries, assoc->shutdown_retries,
+ assoc->rtx_data_chunks,
+ atomic_read(&sk->sk_wmem_alloc),
+ sk->sk_wmem_queued,
+ sk->sk_sndbuf,
+ sk->sk_rcvbuf);
+ seq_printf(seq, "\n");
+ }
+ read_unlock(&head->lock);
+ local_bh_enable();
+
+ return 0;
+}
+
+static const struct seq_operations sctp_assoc_ops = {
+ .start = sctp_assocs_seq_start,
+ .next = sctp_assocs_seq_next,
+ .stop = sctp_assocs_seq_stop,
+ .show = sctp_assocs_seq_show,
+};
+
+/* Initialize the seq file operations for 'assocs' object. */
+static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &sctp_assoc_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations sctp_assocs_seq_fops = {
+ .open = sctp_assocs_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/* Set up the proc fs entry for 'assocs' object. */
+int __net_init sctp_assocs_proc_init(struct net *net)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
+ &sctp_assocs_seq_fops);
+ if (!p)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Cleanup the proc fs entry for 'assocs' object. */
+void sctp_assocs_proc_exit(struct net *net)
+{
+ remove_proc_entry("assocs", net->sctp.proc_net_sctp);
+}
+
+static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= sctp_assoc_hashsize)
+ return NULL;
+
+ if (*pos < 0)
+ *pos = 0;
+
+ if (*pos == 0)
+ seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
+ "REM_ADDR_RTX START STATE\n");
+
+ return (void *)pos;
+}
+
+static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ if (++*pos >= sctp_assoc_hashsize)
+ return NULL;
+
+ return pos;
+}
+
+static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
+{
+ struct sctp_hashbucket *head;
+ struct sctp_ep_common *epb;
+ struct sctp_association *assoc;
+ struct sctp_transport *tsp;
+ int hash = *(loff_t *)v;
+
+ if (hash >= sctp_assoc_hashsize)
+ return -ENOMEM;
+
+ head = &sctp_assoc_hashtable[hash];
+ local_bh_disable();
+ read_lock(&head->lock);
+ rcu_read_lock();
+ sctp_for_each_hentry(epb, &head->chain) {
+ if (!net_eq(sock_net(epb->sk), seq_file_net(seq)))
+ continue;
+ assoc = sctp_assoc(epb);
+ list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
+ transports) {
+ if (tsp->dead)
+ continue;
+
+ /*
+ * The remote address (ADDR)
+ */
+ tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr);
+ seq_printf(seq, " ");
+
+ /*
+ * The association ID (ASSOC_ID)
+ */
+ seq_printf(seq, "%d ", tsp->asoc->assoc_id);
+
+ /*
+ * If the Heartbeat is active (HB_ACT)
+ * Note: 1 = Active, 0 = Inactive
+ */
+ seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer));
+
+ /*
+ * Retransmit time out (RTO)
+ */
+ seq_printf(seq, "%lu ", tsp->rto);
+
+ /*
+ * Maximum path retransmit count (PATH_MAX_RTX)
+ */
+ seq_printf(seq, "%d ", tsp->pathmaxrxt);
+
+ /*
+ * remote address retransmit count (REM_ADDR_RTX)
+ * Note: We don't have a way to tally this at the moment
+ * so lets just leave it as zero for the moment
+ */
+ seq_puts(seq, "0 ");
+
+ /*
+ * remote address start time (START). This is also not
+ * currently implemented, but we can record it with a
+ * jiffies marker in a subsequent patch
+ */
+ seq_puts(seq, "0 ");
+
+ /*
+ * The current state of this destination. I.e.
+ * SCTP_ACTIVE, SCTP_INACTIVE, ...
+ */
+ seq_printf(seq, "%d", tsp->state);
+
+ seq_printf(seq, "\n");
+ }
+ }
+
+ rcu_read_unlock();
+ read_unlock(&head->lock);
+ local_bh_enable();
+
+ return 0;
+
+}
+
+static const struct seq_operations sctp_remaddr_ops = {
+ .start = sctp_remaddr_seq_start,
+ .next = sctp_remaddr_seq_next,
+ .stop = sctp_remaddr_seq_stop,
+ .show = sctp_remaddr_seq_show,
+};
+
+/* Cleanup the proc fs entry for 'remaddr' object. */
+void sctp_remaddr_proc_exit(struct net *net)
+{
+ remove_proc_entry("remaddr", net->sctp.proc_net_sctp);
+}
+
+static int sctp_remaddr_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &sctp_remaddr_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations sctp_remaddr_seq_fops = {
+ .open = sctp_remaddr_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+int __net_init sctp_remaddr_proc_init(struct net *net)
+{
+ struct proc_dir_entry *p;
+
+ p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
+ &sctp_remaddr_seq_fops);
+ if (!p)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
new file mode 100644
index 000000000..53b7acde9
--- /dev/null
+++ b/net/sctp/protocol.c
@@ -0,0 +1,1550 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Initialization/cleanup for SCTP protocol support.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/route.h>
+#include <net/sctp/sctp.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_ecn.h>
+
+/* Global data structures. */
+struct sctp_globals sctp_globals __read_mostly;
+
+struct idr sctp_assocs_id;
+DEFINE_SPINLOCK(sctp_assocs_id_lock);
+
+static struct sctp_pf *sctp_pf_inet6_specific;
+static struct sctp_pf *sctp_pf_inet_specific;
+static struct sctp_af *sctp_af_v4_specific;
+static struct sctp_af *sctp_af_v6_specific;
+
+struct kmem_cache *sctp_chunk_cachep __read_mostly;
+struct kmem_cache *sctp_bucket_cachep __read_mostly;
+
+long sysctl_sctp_mem[3];
+int sysctl_sctp_rmem[3];
+int sysctl_sctp_wmem[3];
+
+/* Set up the proc fs entry for the SCTP protocol. */
+static int __net_init sctp_proc_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net);
+ if (!net->sctp.proc_net_sctp)
+ goto out_proc_net_sctp;
+ if (sctp_snmp_proc_init(net))
+ goto out_snmp_proc_init;
+ if (sctp_eps_proc_init(net))
+ goto out_eps_proc_init;
+ if (sctp_assocs_proc_init(net))
+ goto out_assocs_proc_init;
+ if (sctp_remaddr_proc_init(net))
+ goto out_remaddr_proc_init;
+
+ return 0;
+
+out_remaddr_proc_init:
+ sctp_assocs_proc_exit(net);
+out_assocs_proc_init:
+ sctp_eps_proc_exit(net);
+out_eps_proc_init:
+ sctp_snmp_proc_exit(net);
+out_snmp_proc_init:
+ remove_proc_entry("sctp", net->proc_net);
+ net->sctp.proc_net_sctp = NULL;
+out_proc_net_sctp:
+ return -ENOMEM;
+#endif /* CONFIG_PROC_FS */
+ return 0;
+}
+
+/* Clean up the proc fs entry for the SCTP protocol.
+ * Note: Do not make this __exit as it is used in the init error
+ * path.
+ */
+static void sctp_proc_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ sctp_snmp_proc_exit(net);
+ sctp_eps_proc_exit(net);
+ sctp_assocs_proc_exit(net);
+ sctp_remaddr_proc_exit(net);
+
+ remove_proc_entry("sctp", net->proc_net);
+ net->sctp.proc_net_sctp = NULL;
+#endif
+}
+
+/* Private helper to extract ipv4 address and stash them in
+ * the protocol structure.
+ */
+static void sctp_v4_copy_addrlist(struct list_head *addrlist,
+ struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ struct sctp_sockaddr_entry *addr;
+
+ rcu_read_lock();
+ if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ /* Add the address to the local list. */
+ addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+ if (addr) {
+ addr->a.v4.sin_family = AF_INET;
+ addr->a.v4.sin_port = 0;
+ addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
+ addr->valid = 1;
+ INIT_LIST_HEAD(&addr->list);
+ list_add_tail(&addr->list, addrlist);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+/* Extract our IP addresses from the system and stash them in the
+ * protocol structure.
+ */
+static void sctp_get_local_addr_list(struct net *net)
+{
+ struct net_device *dev;
+ struct list_head *pos;
+ struct sctp_af *af;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ list_for_each(pos, &sctp_address_families) {
+ af = list_entry(pos, struct sctp_af, list);
+ af->copy_addrlist(&net->sctp.local_addr_list, dev);
+ }
+ }
+ rcu_read_unlock();
+}
+
+/* Free the existing local addresses. */
+static void sctp_free_local_addr_list(struct net *net)
+{
+ struct sctp_sockaddr_entry *addr;
+ struct list_head *pos, *temp;
+
+ list_for_each_safe(pos, temp, &net->sctp.local_addr_list) {
+ addr = list_entry(pos, struct sctp_sockaddr_entry, list);
+ list_del(pos);
+ kfree(addr);
+ }
+}
+
+/* Copy the local addresses which are valid for 'scope' into 'bp'. */
+int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
+ sctp_scope_t scope, gfp_t gfp, int copy_flags)
+{
+ struct sctp_sockaddr_entry *addr;
+ int error = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
+ if (!addr->valid)
+ continue;
+ if (sctp_in_scope(net, &addr->a, scope)) {
+ /* Now that the address is in scope, check to see if
+ * the address type is really supported by the local
+ * sock as well as the remote peer.
+ */
+ if ((((AF_INET == addr->a.sa.sa_family) &&
+ (copy_flags & SCTP_ADDR4_PEERSUPP))) ||
+ (((AF_INET6 == addr->a.sa.sa_family) &&
+ (copy_flags & SCTP_ADDR6_ALLOWED) &&
+ (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
+ error = sctp_add_bind_addr(bp, &addr->a,
+ SCTP_ADDR_SRC, GFP_ATOMIC);
+ if (error)
+ goto end_copy;
+ }
+ }
+ }
+
+end_copy:
+ rcu_read_unlock();
+ return error;
+}
+
+/* Initialize a sctp_addr from in incoming skb. */
+static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
+ int is_saddr)
+{
+ void *from;
+ __be16 *port;
+ struct sctphdr *sh;
+
+ port = &addr->v4.sin_port;
+ addr->v4.sin_family = AF_INET;
+
+ sh = sctp_hdr(skb);
+ if (is_saddr) {
+ *port = sh->source;
+ from = &ip_hdr(skb)->saddr;
+ } else {
+ *port = sh->dest;
+ from = &ip_hdr(skb)->daddr;
+ }
+ memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr));
+}
+
+/* Initialize an sctp_addr from a socket. */
+static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk)
+{
+ addr->v4.sin_family = AF_INET;
+ addr->v4.sin_port = 0;
+ addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr;
+}
+
+/* Initialize sk->sk_rcv_saddr from sctp_addr. */
+static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
+{
+ inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize sk->sk_daddr from sctp_addr. */
+static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
+{
+ inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize a sctp_addr from an address parameter. */
+static void sctp_v4_from_addr_param(union sctp_addr *addr,
+ union sctp_addr_param *param,
+ __be16 port, int iif)
+{
+ addr->v4.sin_family = AF_INET;
+ addr->v4.sin_port = port;
+ addr->v4.sin_addr.s_addr = param->v4.addr.s_addr;
+}
+
+/* Initialize an address parameter from a sctp_addr and return the length
+ * of the address parameter.
+ */
+static int sctp_v4_to_addr_param(const union sctp_addr *addr,
+ union sctp_addr_param *param)
+{
+ int length = sizeof(sctp_ipv4addr_param_t);
+
+ param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS;
+ param->v4.param_hdr.length = htons(length);
+ param->v4.addr.s_addr = addr->v4.sin_addr.s_addr;
+
+ return length;
+}
+
+/* Initialize a sctp_addr from a dst_entry. */
+static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4,
+ __be16 port)
+{
+ saddr->v4.sin_family = AF_INET;
+ saddr->v4.sin_port = port;
+ saddr->v4.sin_addr.s_addr = fl4->saddr;
+}
+
+/* Compare two addresses exactly. */
+static int sctp_v4_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2)
+{
+ if (addr1->sa.sa_family != addr2->sa.sa_family)
+ return 0;
+ if (addr1->v4.sin_port != addr2->v4.sin_port)
+ return 0;
+ if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr)
+ return 0;
+
+ return 1;
+}
+
+/* Initialize addr struct to INADDR_ANY. */
+static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port)
+{
+ addr->v4.sin_family = AF_INET;
+ addr->v4.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr->v4.sin_port = port;
+}
+
+/* Is this a wildcard address? */
+static int sctp_v4_is_any(const union sctp_addr *addr)
+{
+ return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr;
+}
+
+/* This function checks if the address is a valid address to be used for
+ * SCTP binding.
+ *
+ * Output:
+ * Return 0 - If the address is a non-unicast or an illegal address.
+ * Return 1 - If the address is a unicast.
+ */
+static int sctp_v4_addr_valid(union sctp_addr *addr,
+ struct sctp_sock *sp,
+ const struct sk_buff *skb)
+{
+ /* IPv4 addresses not allowed */
+ if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
+ return 0;
+
+ /* Is this a non-unicast address or a unusable SCTP address? */
+ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr))
+ return 0;
+
+ /* Is this a broadcast address? */
+ if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST)
+ return 0;
+
+ return 1;
+}
+
+/* Should this be available for binding? */
+static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
+{
+ struct net *net = sock_net(&sp->inet.sk);
+ int ret = inet_addr_type(net, addr->v4.sin_addr.s_addr);
+
+
+ if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
+ ret != RTN_LOCAL &&
+ !sp->inet.freebind &&
+ !net->ipv4.sysctl_ip_nonlocal_bind)
+ return 0;
+
+ if (ipv6_only_sock(sctp_opt2sk(sp)))
+ return 0;
+
+ return 1;
+}
+
+/* Checking the loopback, private and other address scopes as defined in
+ * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4
+ * scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>.
+ *
+ * Level 0 - unusable SCTP addresses
+ * Level 1 - loopback address
+ * Level 2 - link-local addresses
+ * Level 3 - private addresses.
+ * Level 4 - global addresses
+ * For INIT and INIT-ACK address list, let L be the level of
+ * of requested destination address, sender and receiver
+ * SHOULD include all of its addresses with level greater
+ * than or equal to L.
+ *
+ * IPv4 scoping can be controlled through sysctl option
+ * net.sctp.addr_scope_policy
+ */
+static sctp_scope_t sctp_v4_scope(union sctp_addr *addr)
+{
+ sctp_scope_t retval;
+
+ /* Check for unusable SCTP addresses. */
+ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) {
+ retval = SCTP_SCOPE_UNUSABLE;
+ } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) {
+ retval = SCTP_SCOPE_LOOPBACK;
+ } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) {
+ retval = SCTP_SCOPE_LINK;
+ } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
+ ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
+ ipv4_is_private_192(addr->v4.sin_addr.s_addr)) {
+ retval = SCTP_SCOPE_PRIVATE;
+ } else {
+ retval = SCTP_SCOPE_GLOBAL;
+ }
+
+ return retval;
+}
+
+/* Returns a valid dst cache entry for the given source and destination ip
+ * addresses. If an association is passed, trys to get a dst entry with a
+ * source address that matches an address in the bind address list.
+ */
+static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+ struct flowi *fl, struct sock *sk)
+{
+ struct sctp_association *asoc = t->asoc;
+ struct rtable *rt;
+ struct flowi4 *fl4 = &fl->u.ip4;
+ struct sctp_bind_addr *bp;
+ struct sctp_sockaddr_entry *laddr;
+ struct dst_entry *dst = NULL;
+ union sctp_addr *daddr = &t->ipaddr;
+ union sctp_addr dst_saddr;
+
+ memset(fl4, 0x0, sizeof(struct flowi4));
+ fl4->daddr = daddr->v4.sin_addr.s_addr;
+ fl4->fl4_dport = daddr->v4.sin_port;
+ fl4->flowi4_proto = IPPROTO_SCTP;
+ if (asoc) {
+ fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+ fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
+ fl4->fl4_sport = htons(asoc->base.bind_addr.port);
+ }
+ if (saddr) {
+ fl4->saddr = saddr->v4.sin_addr.s_addr;
+ fl4->fl4_sport = saddr->v4.sin_port;
+ }
+
+ pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr,
+ &fl4->saddr);
+
+ rt = ip_route_output_key(sock_net(sk), fl4);
+ if (!IS_ERR(rt))
+ dst = &rt->dst;
+
+ /* If there is no association or if a source address is passed, no
+ * more validation is required.
+ */
+ if (!asoc || saddr)
+ goto out;
+
+ bp = &asoc->base.bind_addr;
+
+ if (dst) {
+ /* Walk through the bind address list and look for a bind
+ * address that matches the source address of the returned dst.
+ */
+ sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port));
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) ||
+ (laddr->state != SCTP_ADDR_SRC &&
+ !asoc->src_out_of_asoc_ok))
+ continue;
+ if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+
+ /* None of the bound addresses match the source address of the
+ * dst. So release it.
+ */
+ dst_release(dst);
+ dst = NULL;
+ }
+
+ /* Walk through the bind address list and try to get a dst that
+ * matches a bind address as the source address.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+ if (!laddr->valid)
+ continue;
+ if ((laddr->state == SCTP_ADDR_SRC) &&
+ (AF_INET == laddr->a.sa.sa_family)) {
+ fl4->fl4_sport = laddr->a.v4.sin_port;
+ flowi4_update_output(fl4,
+ asoc->base.sk->sk_bound_dev_if,
+ RT_CONN_FLAGS(asoc->base.sk),
+ daddr->v4.sin_addr.s_addr,
+ laddr->a.v4.sin_addr.s_addr);
+
+ rt = ip_route_output_key(sock_net(sk), fl4);
+ if (!IS_ERR(rt)) {
+ dst = &rt->dst;
+ goto out_unlock;
+ }
+ }
+ }
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ t->dst = dst;
+ if (dst)
+ pr_debug("rt_dst:%pI4, rt_src:%pI4\n",
+ &fl4->daddr, &fl4->saddr);
+ else
+ pr_debug("no route\n");
+}
+
+/* For v4, the source address is cached in the route entry(dst). So no need
+ * to cache it separately and hence this is an empty routine.
+ */
+static void sctp_v4_get_saddr(struct sctp_sock *sk,
+ struct sctp_transport *t,
+ struct flowi *fl)
+{
+ union sctp_addr *saddr = &t->saddr;
+ struct rtable *rt = (struct rtable *)t->dst;
+
+ if (rt) {
+ saddr->v4.sin_family = AF_INET;
+ saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr;
+ }
+}
+
+/* What interface did this skb arrive on? */
+static int sctp_v4_skb_iif(const struct sk_buff *skb)
+{
+ return inet_iif(skb);
+}
+
+/* Was this packet marked by Explicit Congestion Notification? */
+static int sctp_v4_is_ce(const struct sk_buff *skb)
+{
+ return INET_ECN_is_ce(ip_hdr(skb)->tos);
+}
+
+/* Create and initialize a new sk for the socket returned by accept(). */
+static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
+ struct sctp_association *asoc)
+{
+ struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
+ sk->sk_prot);
+ struct inet_sock *newinet;
+
+ if (!newsk)
+ goto out;
+
+ sock_init_data(NULL, newsk);
+
+ sctp_copy_sock(newsk, sk, asoc);
+ sock_reset_flag(newsk, SOCK_ZAPPED);
+
+ newinet = inet_sk(newsk);
+
+ newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
+
+ sk_refcnt_debug_inc(newsk);
+
+ if (newsk->sk_prot->init(newsk)) {
+ sk_common_release(newsk);
+ newsk = NULL;
+ }
+
+out:
+ return newsk;
+}
+
+static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
+{
+ /* No address mapping for V4 sockets */
+ return sizeof(struct sockaddr_in);
+}
+
+/* Dump the v4 addr to the seq file. */
+static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
+{
+ seq_printf(seq, "%pI4 ", &addr->v4.sin_addr);
+}
+
+static void sctp_v4_ecn_capable(struct sock *sk)
+{
+ INET_ECN_xmit(sk);
+}
+
+static void sctp_addr_wq_timeout_handler(unsigned long arg)
+{
+ struct net *net = (struct net *)arg;
+ struct sctp_sockaddr_entry *addrw, *temp;
+ struct sctp_sock *sp;
+
+ spin_lock_bh(&net->sctp.addr_wq_lock);
+
+ list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
+ pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
+ "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
+ addrw->state, addrw);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* Now we send an ASCONF for each association */
+ /* Note. we currently don't handle link local IPv6 addressees */
+ if (addrw->a.sa.sa_family == AF_INET6) {
+ struct in6_addr *in6;
+
+ if (ipv6_addr_type(&addrw->a.v6.sin6_addr) &
+ IPV6_ADDR_LINKLOCAL)
+ goto free_next;
+
+ in6 = (struct in6_addr *)&addrw->a.v6.sin6_addr;
+ if (ipv6_chk_addr(net, in6, NULL, 0) == 0 &&
+ addrw->state == SCTP_ADDR_NEW) {
+ unsigned long timeo_val;
+
+ pr_debug("%s: this is on DAD, trying %d sec "
+ "later\n", __func__,
+ SCTP_ADDRESS_TICK_DELAY);
+
+ timeo_val = jiffies;
+ timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+ mod_timer(&net->sctp.addr_wq_timer, timeo_val);
+ break;
+ }
+ }
+#endif
+ list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
+ struct sock *sk;
+
+ sk = sctp_opt2sk(sp);
+ /* ignore bound-specific endpoints */
+ if (!sctp_is_ep_boundall(sk))
+ continue;
+ bh_lock_sock(sk);
+ if (sctp_asconf_mgmt(sp, addrw) < 0)
+ pr_debug("%s: sctp_asconf_mgmt failed\n", __func__);
+ bh_unlock_sock(sk);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+free_next:
+#endif
+ list_del(&addrw->list);
+ kfree(addrw);
+ }
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
+static void sctp_free_addr_wq(struct net *net)
+{
+ struct sctp_sockaddr_entry *addrw;
+ struct sctp_sockaddr_entry *temp;
+
+ spin_lock_bh(&net->sctp.addr_wq_lock);
+ del_timer(&net->sctp.addr_wq_timer);
+ list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
+ list_del(&addrw->list);
+ kfree(addrw);
+ }
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
+/* lookup the entry for the same address in the addr_waitq
+ * sctp_addr_wq MUST be locked
+ */
+static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
+ struct sctp_sockaddr_entry *addr)
+{
+ struct sctp_sockaddr_entry *addrw;
+
+ list_for_each_entry(addrw, &net->sctp.addr_waitq, list) {
+ if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
+ continue;
+ if (addrw->a.sa.sa_family == AF_INET) {
+ if (addrw->a.v4.sin_addr.s_addr ==
+ addr->a.v4.sin_addr.s_addr)
+ return addrw;
+ } else if (addrw->a.sa.sa_family == AF_INET6) {
+ if (ipv6_addr_equal(&addrw->a.v6.sin6_addr,
+ &addr->a.v6.sin6_addr))
+ return addrw;
+ }
+ }
+ return NULL;
+}
+
+void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cmd)
+{
+ struct sctp_sockaddr_entry *addrw;
+ unsigned long timeo_val;
+
+ /* first, we check if an opposite message already exist in the queue.
+ * If we found such message, it is removed.
+ * This operation is a bit stupid, but the DHCP client attaches the
+ * new address after a couple of addition and deletion of that address
+ */
+
+ spin_lock_bh(&net->sctp.addr_wq_lock);
+ /* Offsets existing events in addr_wq */
+ addrw = sctp_addr_wq_lookup(net, addr);
+ if (addrw) {
+ if (addrw->state != cmd) {
+ pr_debug("%s: offsets existing entry for %d, addr:%pISc "
+ "in wq:%p\n", __func__, addrw->state, &addrw->a.sa,
+ &net->sctp.addr_waitq);
+
+ list_del(&addrw->list);
+ kfree(addrw);
+ }
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+ return;
+ }
+
+ /* OK, we have to add the new address to the wait queue */
+ addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
+ if (addrw == NULL) {
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+ return;
+ }
+ addrw->state = cmd;
+ list_add_tail(&addrw->list, &net->sctp.addr_waitq);
+
+ pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
+ __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq);
+
+ if (!timer_pending(&net->sctp.addr_wq_timer)) {
+ timeo_val = jiffies;
+ timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
+ mod_timer(&net->sctp.addr_wq_timer, timeo_val);
+ }
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+}
+
+/* Event handler for inet address addition/deletion events.
+ * The sctp_local_addr_list needs to be protocted by a spin lock since
+ * multiple notifiers (say IPv4 and IPv6) may be running at the same
+ * time and thus corrupt the list.
+ * The reader side is protected with RCU.
+ */
+static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
+ void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sctp_sockaddr_entry *addr = NULL;
+ struct sctp_sockaddr_entry *temp;
+ struct net *net = dev_net(ifa->ifa_dev->dev);
+ int found = 0;
+
+ switch (ev) {
+ case NETDEV_UP:
+ addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
+ if (addr) {
+ addr->a.v4.sin_family = AF_INET;
+ addr->a.v4.sin_port = 0;
+ addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
+ addr->valid = 1;
+ spin_lock_bh(&net->sctp.local_addr_lock);
+ list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list);
+ sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW);
+ spin_unlock_bh(&net->sctp.local_addr_lock);
+ }
+ break;
+ case NETDEV_DOWN:
+ spin_lock_bh(&net->sctp.local_addr_lock);
+ list_for_each_entry_safe(addr, temp,
+ &net->sctp.local_addr_list, list) {
+ if (addr->a.sa.sa_family == AF_INET &&
+ addr->a.v4.sin_addr.s_addr ==
+ ifa->ifa_local) {
+ sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL);
+ found = 1;
+ addr->valid = 0;
+ list_del_rcu(&addr->list);
+ break;
+ }
+ }
+ spin_unlock_bh(&net->sctp.local_addr_lock);
+ if (found)
+ kfree_rcu(addr, rcu);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Initialize the control inode/socket with a control endpoint data
+ * structure. This endpoint is reserved exclusively for the OOTB processing.
+ */
+static int sctp_ctl_sock_init(struct net *net)
+{
+ int err;
+ sa_family_t family = PF_INET;
+
+ if (sctp_get_pf_specific(PF_INET6))
+ family = PF_INET6;
+
+ err = inet_ctl_sock_create(&net->sctp.ctl_sock, family,
+ SOCK_SEQPACKET, IPPROTO_SCTP, net);
+
+ /* If IPv6 socket could not be created, try the IPv4 socket */
+ if (err < 0 && family == PF_INET6)
+ err = inet_ctl_sock_create(&net->sctp.ctl_sock, AF_INET,
+ SOCK_SEQPACKET, IPPROTO_SCTP,
+ net);
+
+ if (err < 0) {
+ pr_err("Failed to create the SCTP control socket\n");
+ return err;
+ }
+ return 0;
+}
+
+/* Register address family specific functions. */
+int sctp_register_af(struct sctp_af *af)
+{
+ switch (af->sa_family) {
+ case AF_INET:
+ if (sctp_af_v4_specific)
+ return 0;
+ sctp_af_v4_specific = af;
+ break;
+ case AF_INET6:
+ if (sctp_af_v6_specific)
+ return 0;
+ sctp_af_v6_specific = af;
+ break;
+ default:
+ return 0;
+ }
+
+ INIT_LIST_HEAD(&af->list);
+ list_add_tail(&af->list, &sctp_address_families);
+ return 1;
+}
+
+/* Get the table of functions for manipulating a particular address
+ * family.
+ */
+struct sctp_af *sctp_get_af_specific(sa_family_t family)
+{
+ switch (family) {
+ case AF_INET:
+ return sctp_af_v4_specific;
+ case AF_INET6:
+ return sctp_af_v6_specific;
+ default:
+ return NULL;
+ }
+}
+
+/* Common code to initialize a AF_INET msg_name. */
+static void sctp_inet_msgname(char *msgname, int *addr_len)
+{
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)msgname;
+ *addr_len = sizeof(struct sockaddr_in);
+ sin->sin_family = AF_INET;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+}
+
+/* Copy the primary address of the peer primary address as the msg_name. */
+static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname,
+ int *addr_len)
+{
+ struct sockaddr_in *sin, *sinfrom;
+
+ if (msgname) {
+ struct sctp_association *asoc;
+
+ asoc = event->asoc;
+ sctp_inet_msgname(msgname, addr_len);
+ sin = (struct sockaddr_in *)msgname;
+ sinfrom = &asoc->peer.primary_addr.v4;
+ sin->sin_port = htons(asoc->peer.port);
+ sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr;
+ }
+}
+
+/* Initialize and copy out a msgname from an inbound skb. */
+static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
+{
+ if (msgname) {
+ struct sctphdr *sh = sctp_hdr(skb);
+ struct sockaddr_in *sin = (struct sockaddr_in *)msgname;
+
+ sctp_inet_msgname(msgname, len);
+ sin->sin_port = sh->source;
+ sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+ }
+}
+
+/* Do we support this AF? */
+static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
+{
+ /* PF_INET only supports AF_INET addresses. */
+ return AF_INET == family;
+}
+
+/* Address matching with wildcards allowed. */
+static int sctp_inet_cmp_addr(const union sctp_addr *addr1,
+ const union sctp_addr *addr2,
+ struct sctp_sock *opt)
+{
+ /* PF_INET only supports AF_INET addresses. */
+ if (addr1->sa.sa_family != addr2->sa.sa_family)
+ return 0;
+ if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr ||
+ htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr)
+ return 1;
+ if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr)
+ return 1;
+
+ return 0;
+}
+
+/* Verify that provided sockaddr looks bindable. Common verification has
+ * already been taken care of.
+ */
+static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+ return sctp_v4_available(addr, opt);
+}
+
+/* Verify that sockaddr looks sendable. Common verification has already
+ * been taken care of.
+ */
+static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+ return 1;
+}
+
+/* Fill in Supported Address Type information for INIT and INIT-ACK
+ * chunks. Returns number of addresses supported.
+ */
+static int sctp_inet_supported_addrs(const struct sctp_sock *opt,
+ __be16 *types)
+{
+ types[0] = SCTP_PARAM_IPV4_ADDRESS;
+ return 1;
+}
+
+/* Wrapper routine that calls the ip transmit routine. */
+static inline int sctp_v4_xmit(struct sk_buff *skb,
+ struct sctp_transport *transport)
+{
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
+ skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+
+ inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
+ IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
+
+ SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
+
+ return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+}
+
+static struct sctp_af sctp_af_inet;
+
+static struct sctp_pf sctp_pf_inet = {
+ .event_msgname = sctp_inet_event_msgname,
+ .skb_msgname = sctp_inet_skb_msgname,
+ .af_supported = sctp_inet_af_supported,
+ .cmp_addr = sctp_inet_cmp_addr,
+ .bind_verify = sctp_inet_bind_verify,
+ .send_verify = sctp_inet_send_verify,
+ .supported_addrs = sctp_inet_supported_addrs,
+ .create_accept_sk = sctp_v4_create_accept_sk,
+ .addr_to_user = sctp_v4_addr_to_user,
+ .to_sk_saddr = sctp_v4_to_sk_saddr,
+ .to_sk_daddr = sctp_v4_to_sk_daddr,
+ .af = &sctp_af_inet
+};
+
+/* Notifier for inetaddr addition/deletion events. */
+static struct notifier_block sctp_inetaddr_notifier = {
+ .notifier_call = sctp_inetaddr_event,
+};
+
+/* Socket operations. */
+static const struct proto_ops inet_seqpacket_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release, /* Needs to be wrapped... */
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname, /* Semantics are different. */
+ .poll = sctp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sctp_inet_listen,
+ .shutdown = inet_shutdown, /* Looks harmless. */
+ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* Registration with AF_INET family. */
+static struct inet_protosw sctp_seqpacket_protosw = {
+ .type = SOCK_SEQPACKET,
+ .protocol = IPPROTO_SCTP,
+ .prot = &sctp_prot,
+ .ops = &inet_seqpacket_ops,
+ .flags = SCTP_PROTOSW_FLAG
+};
+static struct inet_protosw sctp_stream_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_SCTP,
+ .prot = &sctp_prot,
+ .ops = &inet_seqpacket_ops,
+ .flags = SCTP_PROTOSW_FLAG
+};
+
+/* Register with IP layer. */
+static const struct net_protocol sctp_protocol = {
+ .handler = sctp_rcv,
+ .err_handler = sctp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
+};
+
+/* IPv4 address related functions. */
+static struct sctp_af sctp_af_inet = {
+ .sa_family = AF_INET,
+ .sctp_xmit = sctp_v4_xmit,
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .get_dst = sctp_v4_get_dst,
+ .get_saddr = sctp_v4_get_saddr,
+ .copy_addrlist = sctp_v4_copy_addrlist,
+ .from_skb = sctp_v4_from_skb,
+ .from_sk = sctp_v4_from_sk,
+ .from_addr_param = sctp_v4_from_addr_param,
+ .to_addr_param = sctp_v4_to_addr_param,
+ .cmp_addr = sctp_v4_cmp_addr,
+ .addr_valid = sctp_v4_addr_valid,
+ .inaddr_any = sctp_v4_inaddr_any,
+ .is_any = sctp_v4_is_any,
+ .available = sctp_v4_available,
+ .scope = sctp_v4_scope,
+ .skb_iif = sctp_v4_skb_iif,
+ .is_ce = sctp_v4_is_ce,
+ .seq_dump_addr = sctp_v4_seq_dump_addr,
+ .ecn_capable = sctp_v4_ecn_capable,
+ .net_header_len = sizeof(struct iphdr),
+ .sockaddr_len = sizeof(struct sockaddr_in),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+struct sctp_pf *sctp_get_pf_specific(sa_family_t family)
+{
+ switch (family) {
+ case PF_INET:
+ return sctp_pf_inet_specific;
+ case PF_INET6:
+ return sctp_pf_inet6_specific;
+ default:
+ return NULL;
+ }
+}
+
+/* Register the PF specific function table. */
+int sctp_register_pf(struct sctp_pf *pf, sa_family_t family)
+{
+ switch (family) {
+ case PF_INET:
+ if (sctp_pf_inet_specific)
+ return 0;
+ sctp_pf_inet_specific = pf;
+ break;
+ case PF_INET6:
+ if (sctp_pf_inet6_specific)
+ return 0;
+ sctp_pf_inet6_specific = pf;
+ break;
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+static inline int init_sctp_mibs(struct net *net)
+{
+ net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib);
+ if (!net->sctp.sctp_statistics)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void cleanup_sctp_mibs(struct net *net)
+{
+ free_percpu(net->sctp.sctp_statistics);
+}
+
+static void sctp_v4_pf_init(void)
+{
+ /* Initialize the SCTP specific PF functions. */
+ sctp_register_pf(&sctp_pf_inet, PF_INET);
+ sctp_register_af(&sctp_af_inet);
+}
+
+static void sctp_v4_pf_exit(void)
+{
+ list_del(&sctp_af_inet.list);
+}
+
+static int sctp_v4_protosw_init(void)
+{
+ int rc;
+
+ rc = proto_register(&sctp_prot, 1);
+ if (rc)
+ return rc;
+
+ /* Register SCTP(UDP and TCP style) with socket layer. */
+ inet_register_protosw(&sctp_seqpacket_protosw);
+ inet_register_protosw(&sctp_stream_protosw);
+
+ return 0;
+}
+
+static void sctp_v4_protosw_exit(void)
+{
+ inet_unregister_protosw(&sctp_stream_protosw);
+ inet_unregister_protosw(&sctp_seqpacket_protosw);
+ proto_unregister(&sctp_prot);
+}
+
+static int sctp_v4_add_protocol(void)
+{
+ /* Register notifier for inet address additions/deletions. */
+ register_inetaddr_notifier(&sctp_inetaddr_notifier);
+
+ /* Register SCTP with inet layer. */
+ if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static void sctp_v4_del_protocol(void)
+{
+ inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);
+ unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
+}
+
+static int __net_init sctp_net_init(struct net *net)
+{
+ int status;
+
+ /*
+ * 14. Suggested SCTP Protocol Parameter Values
+ */
+ /* The following protocol parameters are RECOMMENDED: */
+ /* RTO.Initial - 3 seconds */
+ net->sctp.rto_initial = SCTP_RTO_INITIAL;
+ /* RTO.Min - 1 second */
+ net->sctp.rto_min = SCTP_RTO_MIN;
+ /* RTO.Max - 60 seconds */
+ net->sctp.rto_max = SCTP_RTO_MAX;
+ /* RTO.Alpha - 1/8 */
+ net->sctp.rto_alpha = SCTP_RTO_ALPHA;
+ /* RTO.Beta - 1/4 */
+ net->sctp.rto_beta = SCTP_RTO_BETA;
+
+ /* Valid.Cookie.Life - 60 seconds */
+ net->sctp.valid_cookie_life = SCTP_DEFAULT_COOKIE_LIFE;
+
+ /* Whether Cookie Preservative is enabled(1) or not(0) */
+ net->sctp.cookie_preserve_enable = 1;
+
+ /* Default sctp sockets to use md5 as their hmac alg */
+#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
+ net->sctp.sctp_hmac_alg = "md5";
+#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1)
+ net->sctp.sctp_hmac_alg = "sha1";
+#else
+ net->sctp.sctp_hmac_alg = NULL;
+#endif
+
+ /* Max.Burst - 4 */
+ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST;
+
+ /* Association.Max.Retrans - 10 attempts
+ * Path.Max.Retrans - 5 attempts (per destination address)
+ * Max.Init.Retransmits - 8 attempts
+ */
+ net->sctp.max_retrans_association = 10;
+ net->sctp.max_retrans_path = 5;
+ net->sctp.max_retrans_init = 8;
+
+ /* Sendbuffer growth - do per-socket accounting */
+ net->sctp.sndbuf_policy = 0;
+
+ /* Rcvbuffer growth - do per-socket accounting */
+ net->sctp.rcvbuf_policy = 0;
+
+ /* HB.interval - 30 seconds */
+ net->sctp.hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
+
+ /* delayed SACK timeout */
+ net->sctp.sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
+
+ /* Disable ADDIP by default. */
+ net->sctp.addip_enable = 0;
+ net->sctp.addip_noauth = 0;
+ net->sctp.default_auto_asconf = 0;
+
+ /* Enable PR-SCTP by default. */
+ net->sctp.prsctp_enable = 1;
+
+ /* Disable AUTH by default. */
+ net->sctp.auth_enable = 0;
+
+ /* Set SCOPE policy to enabled */
+ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;
+
+ /* Set the default rwnd update threshold */
+ net->sctp.rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT;
+
+ /* Initialize maximum autoclose timeout. */
+ net->sctp.max_autoclose = INT_MAX / HZ;
+
+ status = sctp_sysctl_net_register(net);
+ if (status)
+ goto err_sysctl_register;
+
+ /* Allocate and initialise sctp mibs. */
+ status = init_sctp_mibs(net);
+ if (status)
+ goto err_init_mibs;
+
+ /* Initialize proc fs directory. */
+ status = sctp_proc_init(net);
+ if (status)
+ goto err_init_proc;
+
+ sctp_dbg_objcnt_init(net);
+
+ /* Initialize the control inode/socket for handling OOTB packets. */
+ if ((status = sctp_ctl_sock_init(net))) {
+ pr_err("Failed to initialize the SCTP control sock\n");
+ goto err_ctl_sock_init;
+ }
+
+ /* Initialize the local address list. */
+ INIT_LIST_HEAD(&net->sctp.local_addr_list);
+ spin_lock_init(&net->sctp.local_addr_lock);
+ sctp_get_local_addr_list(net);
+
+ /* Initialize the address event list */
+ INIT_LIST_HEAD(&net->sctp.addr_waitq);
+ INIT_LIST_HEAD(&net->sctp.auto_asconf_splist);
+ spin_lock_init(&net->sctp.addr_wq_lock);
+ net->sctp.addr_wq_timer.expires = 0;
+ setup_timer(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler,
+ (unsigned long)net);
+
+ return 0;
+
+err_ctl_sock_init:
+ sctp_dbg_objcnt_exit(net);
+ sctp_proc_exit(net);
+err_init_proc:
+ cleanup_sctp_mibs(net);
+err_init_mibs:
+ sctp_sysctl_net_unregister(net);
+err_sysctl_register:
+ return status;
+}
+
+static void __net_exit sctp_net_exit(struct net *net)
+{
+ /* Free the local address list */
+ sctp_free_addr_wq(net);
+ sctp_free_local_addr_list(net);
+
+ /* Free the control endpoint. */
+ inet_ctl_sock_destroy(net->sctp.ctl_sock);
+
+ sctp_dbg_objcnt_exit(net);
+
+ sctp_proc_exit(net);
+ cleanup_sctp_mibs(net);
+ sctp_sysctl_net_unregister(net);
+}
+
+static struct pernet_operations sctp_net_ops = {
+ .init = sctp_net_init,
+ .exit = sctp_net_exit,
+};
+
+/* Initialize the universe into something sensible. */
+static __init int sctp_init(void)
+{
+ int i;
+ int status = -EINVAL;
+ unsigned long goal;
+ unsigned long limit;
+ int max_share;
+ int order;
+
+ sock_skb_cb_check_size(sizeof(struct sctp_ulpevent));
+
+ /* Allocate bind_bucket and chunk caches. */
+ status = -ENOBUFS;
+ sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket",
+ sizeof(struct sctp_bind_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!sctp_bucket_cachep)
+ goto out;
+
+ sctp_chunk_cachep = kmem_cache_create("sctp_chunk",
+ sizeof(struct sctp_chunk),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!sctp_chunk_cachep)
+ goto err_chunk_cachep;
+
+ status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
+ if (status)
+ goto err_percpu_counter_init;
+
+ /* Implementation specific variables. */
+
+ /* Initialize default stream count setup information. */
+ sctp_max_instreams = SCTP_DEFAULT_INSTREAMS;
+ sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS;
+
+ /* Initialize handle used for association ids. */
+ idr_init(&sctp_assocs_id);
+
+ limit = nr_free_buffer_pages() / 8;
+ limit = max(limit, 128UL);
+ sysctl_sctp_mem[0] = limit / 4 * 3;
+ sysctl_sctp_mem[1] = limit;
+ sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2;
+
+ /* Set per-socket limits to no more than 1/128 the pressure threshold*/
+ limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7);
+ max_share = min(4UL*1024*1024, limit);
+
+ sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */
+ sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1);
+ sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);
+
+ sysctl_sctp_wmem[0] = SK_MEM_QUANTUM;
+ sysctl_sctp_wmem[1] = 16*1024;
+ sysctl_sctp_wmem[2] = max(64*1024, max_share);
+
+ /* Size and allocate the association hash table.
+ * The methodology is similar to that of the tcp hash tables.
+ */
+ if (totalram_pages >= (128 * 1024))
+ goal = totalram_pages >> (22 - PAGE_SHIFT);
+ else
+ goal = totalram_pages >> (24 - PAGE_SHIFT);
+
+ for (order = 0; (1UL << order) < goal; order++)
+ ;
+
+ do {
+ sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sctp_hashbucket);
+ if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0)
+ continue;
+ sctp_assoc_hashtable = (struct sctp_hashbucket *)
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
+ } while (!sctp_assoc_hashtable && --order > 0);
+ if (!sctp_assoc_hashtable) {
+ pr_err("Failed association hash alloc\n");
+ status = -ENOMEM;
+ goto err_ahash_alloc;
+ }
+ for (i = 0; i < sctp_assoc_hashsize; i++) {
+ rwlock_init(&sctp_assoc_hashtable[i].lock);
+ INIT_HLIST_HEAD(&sctp_assoc_hashtable[i].chain);
+ }
+
+ /* Allocate and initialize the endpoint hash table. */
+ sctp_ep_hashsize = 64;
+ sctp_ep_hashtable =
+ kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL);
+ if (!sctp_ep_hashtable) {
+ pr_err("Failed endpoint_hash alloc\n");
+ status = -ENOMEM;
+ goto err_ehash_alloc;
+ }
+ for (i = 0; i < sctp_ep_hashsize; i++) {
+ rwlock_init(&sctp_ep_hashtable[i].lock);
+ INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
+ }
+
+ /* Allocate and initialize the SCTP port hash table. */
+ do {
+ sctp_port_hashsize = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sctp_bind_hashbucket);
+ if ((sctp_port_hashsize > (64 * 1024)) && order > 0)
+ continue;
+ sctp_port_hashtable = (struct sctp_bind_hashbucket *)
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
+ } while (!sctp_port_hashtable && --order > 0);
+ if (!sctp_port_hashtable) {
+ pr_err("Failed bind hash alloc\n");
+ status = -ENOMEM;
+ goto err_bhash_alloc;
+ }
+ for (i = 0; i < sctp_port_hashsize; i++) {
+ spin_lock_init(&sctp_port_hashtable[i].lock);
+ INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
+ }
+
+ pr_info("Hash tables configured (established %d bind %d)\n",
+ sctp_assoc_hashsize, sctp_port_hashsize);
+
+ sctp_sysctl_register();
+
+ INIT_LIST_HEAD(&sctp_address_families);
+ sctp_v4_pf_init();
+ sctp_v6_pf_init();
+
+ status = sctp_v4_protosw_init();
+
+ if (status)
+ goto err_protosw_init;
+
+ status = sctp_v6_protosw_init();
+ if (status)
+ goto err_v6_protosw_init;
+
+ status = register_pernet_subsys(&sctp_net_ops);
+ if (status)
+ goto err_register_pernet_subsys;
+
+ status = sctp_v4_add_protocol();
+ if (status)
+ goto err_add_protocol;
+
+ /* Register SCTP with inet6 layer. */
+ status = sctp_v6_add_protocol();
+ if (status)
+ goto err_v6_add_protocol;
+
+out:
+ return status;
+err_v6_add_protocol:
+ sctp_v4_del_protocol();
+err_add_protocol:
+ unregister_pernet_subsys(&sctp_net_ops);
+err_register_pernet_subsys:
+ sctp_v6_protosw_exit();
+err_v6_protosw_init:
+ sctp_v4_protosw_exit();
+err_protosw_init:
+ sctp_v4_pf_exit();
+ sctp_v6_pf_exit();
+ sctp_sysctl_unregister();
+ free_pages((unsigned long)sctp_port_hashtable,
+ get_order(sctp_port_hashsize *
+ sizeof(struct sctp_bind_hashbucket)));
+err_bhash_alloc:
+ kfree(sctp_ep_hashtable);
+err_ehash_alloc:
+ free_pages((unsigned long)sctp_assoc_hashtable,
+ get_order(sctp_assoc_hashsize *
+ sizeof(struct sctp_hashbucket)));
+err_ahash_alloc:
+ percpu_counter_destroy(&sctp_sockets_allocated);
+err_percpu_counter_init:
+ kmem_cache_destroy(sctp_chunk_cachep);
+err_chunk_cachep:
+ kmem_cache_destroy(sctp_bucket_cachep);
+ goto out;
+}
+
+/* Exit handler for the SCTP protocol. */
+static __exit void sctp_exit(void)
+{
+ /* BUG. This should probably do something useful like clean
+ * up all the remaining associations and all that memory.
+ */
+
+ /* Unregister with inet6/inet layers. */
+ sctp_v6_del_protocol();
+ sctp_v4_del_protocol();
+
+ unregister_pernet_subsys(&sctp_net_ops);
+
+ /* Free protosw registrations */
+ sctp_v6_protosw_exit();
+ sctp_v4_protosw_exit();
+
+ /* Unregister with socket layer. */
+ sctp_v6_pf_exit();
+ sctp_v4_pf_exit();
+
+ sctp_sysctl_unregister();
+
+ free_pages((unsigned long)sctp_assoc_hashtable,
+ get_order(sctp_assoc_hashsize *
+ sizeof(struct sctp_hashbucket)));
+ kfree(sctp_ep_hashtable);
+ free_pages((unsigned long)sctp_port_hashtable,
+ get_order(sctp_port_hashsize *
+ sizeof(struct sctp_bind_hashbucket)));
+
+ percpu_counter_destroy(&sctp_sockets_allocated);
+
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+ kmem_cache_destroy(sctp_chunk_cachep);
+ kmem_cache_destroy(sctp_bucket_cachep);
+}
+
+module_init(sctp_init);
+module_exit(sctp_exit);
+
+/*
+ * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly.
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
+MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
+MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
+module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
+MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
+MODULE_LICENSE("GPL");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
new file mode 100644
index 000000000..06320c8c1
--- /dev/null
+++ b/net/sctp/sm_make_chunk.c
@@ -0,0 +1,3501 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions work with the state functions in sctp_sm_statefuns.c
+ * to implement the state operations. These functions implement the
+ * steps which require modifying existing data structures.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * C. Robin <chris@hundredacre.ac.uk>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Dajiang Zhang <dajiang.zhang@nokia.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Kevin Gao <kevin.gao@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+
+#include <linux/skbuff.h>
+#include <linux/random.h> /* for get_random_bytes */
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
+ __u8 type, __u8 flags, int paylen);
+static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
+ __u8 flags, int paylen);
+static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
+ __u8 type, __u8 flags, int paylen);
+static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *init_chunk,
+ int *cookie_len,
+ const __u8 *raw_addrs, int addrs_len);
+static int sctp_process_param(struct sctp_association *asoc,
+ union sctp_params param,
+ const union sctp_addr *peer_addr,
+ gfp_t gfp);
+static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
+ const void *data);
+static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
+ const void *data);
+
+/* Control chunk destructor */
+static void sctp_control_release_owner(struct sk_buff *skb)
+{
+ /*TODO: do memory release */
+}
+
+static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
+{
+ struct sctp_association *asoc = chunk->asoc;
+ struct sk_buff *skb = chunk->skb;
+
+ /* TODO: properly account for control chunks.
+ * To do it right we'll need:
+ * 1) endpoint if association isn't known.
+ * 2) proper memory accounting.
+ *
+ * For now don't do anything for now.
+ */
+ skb->sk = asoc ? asoc->base.sk : NULL;
+ skb->destructor = sctp_control_release_owner;
+}
+
+/* What was the inbound interface for this chunk? */
+int sctp_chunk_iif(const struct sctp_chunk *chunk)
+{
+ struct sctp_af *af;
+ int iif = 0;
+
+ af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version));
+ if (af)
+ iif = af->skb_iif(chunk->skb);
+
+ return iif;
+}
+
+/* RFC 2960 3.3.2 Initiation (INIT) (1)
+ *
+ * Note 2: The ECN capable field is reserved for future use of
+ * Explicit Congestion Notification.
+ */
+static const struct sctp_paramhdr ecap_param = {
+ SCTP_PARAM_ECN_CAPABLE,
+ cpu_to_be16(sizeof(struct sctp_paramhdr)),
+};
+static const struct sctp_paramhdr prsctp_param = {
+ SCTP_PARAM_FWD_TSN_SUPPORT,
+ cpu_to_be16(sizeof(struct sctp_paramhdr)),
+};
+
+/* A helper to initialize an op error inside a
+ * provided chunk, as most cause codes will be embedded inside an
+ * abort chunk.
+ */
+void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+ size_t paylen)
+{
+ sctp_errhdr_t err;
+ __u16 len;
+
+ /* Cause code constants are now defined in network order. */
+ err.cause = cause_code;
+ len = sizeof(sctp_errhdr_t) + paylen;
+ err.length = htons(len);
+ chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err);
+}
+
+/* A helper to initialize an op error inside a
+ * provided chunk, as most cause codes will be embedded inside an
+ * abort chunk. Differs from sctp_init_cause in that it won't oops
+ * if there isn't enough space in the op error chunk
+ */
+static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
+ size_t paylen)
+{
+ sctp_errhdr_t err;
+ __u16 len;
+
+ /* Cause code constants are now defined in network order. */
+ err.cause = cause_code;
+ len = sizeof(sctp_errhdr_t) + paylen;
+ err.length = htons(len);
+
+ if (skb_tailroom(chunk->skb) < len)
+ return -ENOSPC;
+ chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk,
+ sizeof(sctp_errhdr_t),
+ &err);
+ return 0;
+}
+/* 3.3.2 Initiation (INIT) (1)
+ *
+ * This chunk is used to initiate a SCTP association between two
+ * endpoints. The format of the INIT chunk is shown below:
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 1 | Chunk Flags | Chunk Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Initiate Tag |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Advertised Receiver Window Credit (a_rwnd) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Number of Outbound Streams | Number of Inbound Streams |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Initial TSN |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / Optional/Variable-Length Parameters /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *
+ * The INIT chunk contains the following parameters. Unless otherwise
+ * noted, each parameter MUST only be included once in the INIT chunk.
+ *
+ * Fixed Parameters Status
+ * ----------------------------------------------
+ * Initiate Tag Mandatory
+ * Advertised Receiver Window Credit Mandatory
+ * Number of Outbound Streams Mandatory
+ * Number of Inbound Streams Mandatory
+ * Initial TSN Mandatory
+ *
+ * Variable Parameters Status Type Value
+ * -------------------------------------------------------------
+ * IPv4 Address (Note 1) Optional 5
+ * IPv6 Address (Note 1) Optional 6
+ * Cookie Preservative Optional 9
+ * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000)
+ * Host Name Address (Note 3) Optional 11
+ * Supported Address Types (Note 4) Optional 12
+ */
+struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
+ const struct sctp_bind_addr *bp,
+ gfp_t gfp, int vparam_len)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ struct sctp_endpoint *ep = asoc->ep;
+ sctp_inithdr_t init;
+ union sctp_params addrs;
+ size_t chunksize;
+ struct sctp_chunk *retval = NULL;
+ int num_types, addrs_len = 0;
+ struct sctp_sock *sp;
+ sctp_supported_addrs_param_t sat;
+ __be16 types[2];
+ sctp_adaptation_ind_param_t aiparam;
+ sctp_supported_ext_param_t ext_param;
+ int num_ext = 0;
+ __u8 extensions[3];
+ sctp_paramhdr_t *auth_chunks = NULL,
+ *auth_hmacs = NULL;
+
+ /* RFC 2960 3.3.2 Initiation (INIT) (1)
+ *
+ * Note 1: The INIT chunks can contain multiple addresses that
+ * can be IPv4 and/or IPv6 in any combination.
+ */
+ retval = NULL;
+
+ /* Convert the provided bind address list to raw format. */
+ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp);
+
+ init.init_tag = htonl(asoc->c.my_vtag);
+ init.a_rwnd = htonl(asoc->rwnd);
+ init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams);
+ init.num_inbound_streams = htons(asoc->c.sinit_max_instreams);
+ init.initial_tsn = htonl(asoc->c.initial_tsn);
+
+ /* How many address types are needed? */
+ sp = sctp_sk(asoc->base.sk);
+ num_types = sp->pf->supported_addrs(sp, types);
+
+ chunksize = sizeof(init) + addrs_len;
+ chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
+ chunksize += sizeof(ecap_param);
+
+ if (net->sctp.prsctp_enable)
+ chunksize += sizeof(prsctp_param);
+
+ /* ADDIP: Section 4.2.7:
+ * An implementation supporting this extension [ADDIP] MUST list
+ * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and
+ * INIT-ACK parameters.
+ */
+ if (net->sctp.addip_enable) {
+ extensions[num_ext] = SCTP_CID_ASCONF;
+ extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
+ num_ext += 2;
+ }
+
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
+
+ chunksize += vparam_len;
+
+ /* Account for AUTH related parameters */
+ if (ep->auth_enable) {
+ /* Add random parameter length*/
+ chunksize += sizeof(asoc->c.auth_random);
+
+ /* Add HMACS parameter length if any were defined */
+ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
+ if (auth_hmacs->length)
+ chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ else
+ auth_hmacs = NULL;
+
+ /* Add CHUNKS parameter length */
+ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
+ if (auth_chunks->length)
+ chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ else
+ auth_chunks = NULL;
+
+ extensions[num_ext] = SCTP_CID_AUTH;
+ num_ext += 1;
+ }
+
+ /* If we have any extensions to report, account for that */
+ if (num_ext)
+ chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
+
+ /* RFC 2960 3.3.2 Initiation (INIT) (1)
+ *
+ * Note 3: An INIT chunk MUST NOT contain more than one Host
+ * Name address parameter. Moreover, the sender of the INIT
+ * MUST NOT combine any other address types with the Host Name
+ * address in the INIT. The receiver of INIT MUST ignore any
+ * other address types if the Host Name address parameter is
+ * present in the received INIT chunk.
+ *
+ * PLEASE DO NOT FIXME [This version does not support Host Name.]
+ */
+
+ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize);
+ if (!retval)
+ goto nodata;
+
+ retval->subh.init_hdr =
+ sctp_addto_chunk(retval, sizeof(init), &init);
+ retval->param_hdr.v =
+ sctp_addto_chunk(retval, addrs_len, addrs.v);
+
+ /* RFC 2960 3.3.2 Initiation (INIT) (1)
+ *
+ * Note 4: This parameter, when present, specifies all the
+ * address types the sending endpoint can support. The absence
+ * of this parameter indicates that the sending endpoint can
+ * support any address type.
+ */
+ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES;
+ sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types));
+ sctp_addto_chunk(retval, sizeof(sat), &sat);
+ sctp_addto_chunk(retval, num_types * sizeof(__u16), &types);
+
+ sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
+
+ /* Add the supported extensions parameter. Be nice and add this
+ * fist before addiding the parameters for the extensions themselves
+ */
+ if (num_ext) {
+ ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
+ ext_param.param_hdr.length =
+ htons(sizeof(sctp_supported_ext_param_t) + num_ext);
+ sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t),
+ &ext_param);
+ sctp_addto_param(retval, num_ext, extensions);
+ }
+
+ if (net->sctp.prsctp_enable)
+ sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
+
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
+
+ /* Add SCTP-AUTH chunks to the parameter list */
+ if (ep->auth_enable) {
+ sctp_addto_chunk(retval, sizeof(asoc->c.auth_random),
+ asoc->c.auth_random);
+ if (auth_hmacs)
+ sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
+ auth_hmacs);
+ if (auth_chunks)
+ sctp_addto_chunk(retval, ntohs(auth_chunks->length),
+ auth_chunks);
+ }
+nodata:
+ kfree(addrs.v);
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ gfp_t gfp, int unkparam_len)
+{
+ sctp_inithdr_t initack;
+ struct sctp_chunk *retval;
+ union sctp_params addrs;
+ struct sctp_sock *sp;
+ int addrs_len;
+ sctp_cookie_param_t *cookie;
+ int cookie_len;
+ size_t chunksize;
+ sctp_adaptation_ind_param_t aiparam;
+ sctp_supported_ext_param_t ext_param;
+ int num_ext = 0;
+ __u8 extensions[3];
+ sctp_paramhdr_t *auth_chunks = NULL,
+ *auth_hmacs = NULL,
+ *auth_random = NULL;
+
+ retval = NULL;
+
+ /* Note: there may be no addresses to embed. */
+ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp);
+
+ initack.init_tag = htonl(asoc->c.my_vtag);
+ initack.a_rwnd = htonl(asoc->rwnd);
+ initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams);
+ initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams);
+ initack.initial_tsn = htonl(asoc->c.initial_tsn);
+
+ /* FIXME: We really ought to build the cookie right
+ * into the packet instead of allocating more fresh memory.
+ */
+ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len,
+ addrs.v, addrs_len);
+ if (!cookie)
+ goto nomem_cookie;
+
+ /* Calculate the total size of allocation, include the reserved
+ * space for reporting unknown parameters if it is specified.
+ */
+ sp = sctp_sk(asoc->base.sk);
+ chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;
+
+ /* Tell peer that we'll do ECN only if peer advertised such cap. */
+ if (asoc->peer.ecn_capable)
+ chunksize += sizeof(ecap_param);
+
+ if (asoc->peer.prsctp_capable)
+ chunksize += sizeof(prsctp_param);
+
+ if (asoc->peer.asconf_capable) {
+ extensions[num_ext] = SCTP_CID_ASCONF;
+ extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
+ num_ext += 2;
+ }
+
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
+
+ if (asoc->peer.auth_capable) {
+ auth_random = (sctp_paramhdr_t *)asoc->c.auth_random;
+ chunksize += ntohs(auth_random->length);
+
+ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
+ if (auth_hmacs->length)
+ chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+ else
+ auth_hmacs = NULL;
+
+ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
+ if (auth_chunks->length)
+ chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+ else
+ auth_chunks = NULL;
+
+ extensions[num_ext] = SCTP_CID_AUTH;
+ num_ext += 1;
+ }
+
+ if (num_ext)
+ chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
+ num_ext);
+
+ /* Now allocate and fill out the chunk. */
+ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+ if (!retval)
+ goto nomem_chunk;
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [INIT ACK back to where the INIT came from.]
+ */
+ retval->transport = chunk->transport;
+
+ retval->subh.init_hdr =
+ sctp_addto_chunk(retval, sizeof(initack), &initack);
+ retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v);
+ sctp_addto_chunk(retval, cookie_len, cookie);
+ if (asoc->peer.ecn_capable)
+ sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
+ if (num_ext) {
+ ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
+ ext_param.param_hdr.length =
+ htons(sizeof(sctp_supported_ext_param_t) + num_ext);
+ sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t),
+ &ext_param);
+ sctp_addto_param(retval, num_ext, extensions);
+ }
+ if (asoc->peer.prsctp_capable)
+ sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
+
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
+
+ if (asoc->peer.auth_capable) {
+ sctp_addto_chunk(retval, ntohs(auth_random->length),
+ auth_random);
+ if (auth_hmacs)
+ sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
+ auth_hmacs);
+ if (auth_chunks)
+ sctp_addto_chunk(retval, ntohs(auth_chunks->length),
+ auth_chunks);
+ }
+
+ /* We need to remove the const qualifier at this point. */
+ retval->asoc = (struct sctp_association *) asoc;
+
+nomem_chunk:
+ kfree(cookie);
+nomem_cookie:
+ kfree(addrs.v);
+ return retval;
+}
+
+/* 3.3.11 Cookie Echo (COOKIE ECHO) (10):
+ *
+ * This chunk is used only during the initialization of an association.
+ * It is sent by the initiator of an association to its peer to complete
+ * the initialization process. This chunk MUST precede any DATA chunk
+ * sent within the association, but MAY be bundled with one or more DATA
+ * chunks in the same packet.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 10 |Chunk Flags | Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * / Cookie /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Chunk Flags: 8 bit
+ *
+ * Set to zero on transmit and ignored on receipt.
+ *
+ * Length: 16 bits (unsigned integer)
+ *
+ * Set to the size of the chunk in bytes, including the 4 bytes of
+ * the chunk header and the size of the Cookie.
+ *
+ * Cookie: variable size
+ *
+ * This field must contain the exact cookie received in the
+ * State Cookie parameter from the previous INIT ACK.
+ *
+ * An implementation SHOULD make the cookie as small as possible
+ * to insure interoperability.
+ */
+struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+ void *cookie;
+ int cookie_len;
+
+ cookie = asoc->peer.cookie;
+ cookie_len = asoc->peer.cookie_len;
+
+ /* Build a cookie echo chunk. */
+ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+ if (!retval)
+ goto nodata;
+ retval->subh.cookie_hdr =
+ sctp_addto_chunk(retval, cookie_len, cookie);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [COOKIE ECHO back to where the INIT ACK came from.]
+ */
+ if (chunk)
+ retval->transport = chunk->transport;
+
+nodata:
+ return retval;
+}
+
+/* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11):
+ *
+ * This chunk is used only during the initialization of an
+ * association. It is used to acknowledge the receipt of a COOKIE
+ * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent
+ * within the association, but MAY be bundled with one or more DATA
+ * chunks or SACK chunk in the same SCTP packet.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 11 |Chunk Flags | Length = 4 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Chunk Flags: 8 bits
+ *
+ * Set to zero on transmit and ignored on receipt.
+ */
+struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [COOKIE ACK back to where the COOKIE ECHO came from.]
+ */
+ if (retval && chunk)
+ retval->transport = chunk->transport;
+
+ return retval;
+}
+
+/*
+ * Appendix A: Explicit Congestion Notification:
+ * CWR:
+ *
+ * RFC 2481 details a specific bit for a sender to send in the header of
+ * its next outbound TCP segment to indicate to its peer that it has
+ * reduced its congestion window. This is termed the CWR bit. For
+ * SCTP the same indication is made by including the CWR chunk.
+ * This chunk contains one data element, i.e. the TSN number that
+ * was sent in the ECNE chunk. This element represents the lowest
+ * TSN number in the datagram that was originally marked with the
+ * CE bit.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Lowest TSN Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Note: The CWR is considered a Control chunk.
+ */
+struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
+ const __u32 lowest_tsn,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+ sctp_cwrhdr_t cwr;
+
+ cwr.lowest_tsn = htonl(lowest_tsn);
+ retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
+ sizeof(sctp_cwrhdr_t));
+
+ if (!retval)
+ goto nodata;
+
+ retval->subh.ecn_cwr_hdr =
+ sctp_addto_chunk(retval, sizeof(cwr), &cwr);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [Report a reduced congestion window back to where the ECNE
+ * came from.]
+ */
+ if (chunk)
+ retval->transport = chunk->transport;
+
+nodata:
+ return retval;
+}
+
+/* Make an ECNE chunk. This is a congestion experienced report. */
+struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
+ const __u32 lowest_tsn)
+{
+ struct sctp_chunk *retval;
+ sctp_ecnehdr_t ecne;
+
+ ecne.lowest_tsn = htonl(lowest_tsn);
+ retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
+ sizeof(sctp_ecnehdr_t));
+ if (!retval)
+ goto nodata;
+ retval->subh.ecne_hdr =
+ sctp_addto_chunk(retval, sizeof(ecne), &ecne);
+
+nodata:
+ return retval;
+}
+
+/* Make a DATA chunk for the given association from the provided
+ * parameters. However, do not populate the data payload.
+ */
+struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+ const struct sctp_sndrcvinfo *sinfo,
+ int data_len, __u8 flags, __u16 ssn)
+{
+ struct sctp_chunk *retval;
+ struct sctp_datahdr dp;
+ int chunk_len;
+
+ /* We assign the TSN as LATE as possible, not here when
+ * creating the chunk.
+ */
+ dp.tsn = 0;
+ dp.stream = htons(sinfo->sinfo_stream);
+ dp.ppid = sinfo->sinfo_ppid;
+
+ /* Set the flags for an unordered send. */
+ if (sinfo->sinfo_flags & SCTP_UNORDERED) {
+ flags |= SCTP_DATA_UNORDERED;
+ dp.ssn = 0;
+ } else
+ dp.ssn = htons(ssn);
+
+ chunk_len = sizeof(dp) + data_len;
+ retval = sctp_make_data(asoc, flags, chunk_len);
+ if (!retval)
+ goto nodata;
+
+ retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
+ memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+
+nodata:
+ return retval;
+}
+
+/* Create a selective ackowledgement (SACK) for the given
+ * association. This reports on which TSN's we've seen to date,
+ * including duplicates and gaps.
+ */
+struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+{
+ struct sctp_chunk *retval;
+ struct sctp_sackhdr sack;
+ int len;
+ __u32 ctsn;
+ __u16 num_gabs, num_dup_tsns;
+ struct sctp_association *aptr = (struct sctp_association *)asoc;
+ struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+ struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
+ struct sctp_transport *trans;
+
+ memset(gabs, 0, sizeof(gabs));
+ ctsn = sctp_tsnmap_get_ctsn(map);
+
+ pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn);
+
+ /* How much room is needed in the chunk? */
+ num_gabs = sctp_tsnmap_num_gabs(map, gabs);
+ num_dup_tsns = sctp_tsnmap_num_dups(map);
+
+ /* Initialize the SACK header. */
+ sack.cum_tsn_ack = htonl(ctsn);
+ sack.a_rwnd = htonl(asoc->a_rwnd);
+ sack.num_gap_ack_blocks = htons(num_gabs);
+ sack.num_dup_tsns = htons(num_dup_tsns);
+
+ len = sizeof(sack)
+ + sizeof(struct sctp_gap_ack_block) * num_gabs
+ + sizeof(__u32) * num_dup_tsns;
+
+ /* Create the chunk. */
+ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len);
+ if (!retval)
+ goto nodata;
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, etc.) to the same destination transport
+ * address from which it received the DATA or control chunk to
+ * which it is replying. This rule should also be followed if
+ * the endpoint is bundling DATA chunks together with the
+ * reply chunk.
+ *
+ * However, when acknowledging multiple DATA chunks received
+ * in packets from different source addresses in a single
+ * SACK, the SACK chunk may be transmitted to one of the
+ * destination transport addresses from which the DATA or
+ * control chunks being acknowledged were received.
+ *
+ * [BUG: We do not implement the following paragraph.
+ * Perhaps we should remember the last transport we used for a
+ * SACK and avoid that (if possible) if we have seen any
+ * duplicates. --piggy]
+ *
+ * When a receiver of a duplicate DATA chunk sends a SACK to a
+ * multi- homed endpoint it MAY be beneficial to vary the
+ * destination address and not use the source address of the
+ * DATA chunk. The reason being that receiving a duplicate
+ * from a multi-homed endpoint might indicate that the return
+ * path (as specified in the source address of the DATA chunk)
+ * for the SACK is broken.
+ *
+ * [Send to the address from which we last received a DATA chunk.]
+ */
+ retval->transport = asoc->peer.last_data_from;
+
+ retval->subh.sack_hdr =
+ sctp_addto_chunk(retval, sizeof(sack), &sack);
+
+ /* Add the gap ack block information. */
+ if (num_gabs)
+ sctp_addto_chunk(retval, sizeof(__u32) * num_gabs,
+ gabs);
+
+ /* Add the duplicate TSN information. */
+ if (num_dup_tsns) {
+ aptr->stats.idupchunks += num_dup_tsns;
+ sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
+ sctp_tsnmap_get_dups(map));
+ }
+ /* Once we have a sack generated, check to see what our sack
+ * generation is, if its 0, reset the transports to 0, and reset
+ * the association generation to 1
+ *
+ * The idea is that zero is never used as a valid generation for the
+ * association so no transport will match after a wrap event like this,
+ * Until the next sack
+ */
+ if (++aptr->peer.sack_generation == 0) {
+ list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+ transports)
+ trans->sack_generation = 0;
+ aptr->peer.sack_generation = 1;
+ }
+nodata:
+ return retval;
+}
+
+/* Make a SHUTDOWN chunk. */
+struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+ sctp_shutdownhdr_t shut;
+ __u32 ctsn;
+
+ ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+ shut.cum_tsn_ack = htonl(ctsn);
+
+ retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
+ sizeof(sctp_shutdownhdr_t));
+ if (!retval)
+ goto nodata;
+
+ retval->subh.shutdown_hdr =
+ sctp_addto_chunk(retval, sizeof(shut), &shut);
+
+ if (chunk)
+ retval->transport = chunk->transport;
+nodata:
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [ACK back to where the SHUTDOWN came from.]
+ */
+ if (retval && chunk)
+ retval->transport = chunk->transport;
+
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_shutdown_complete(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+ __u8 flags = 0;
+
+ /* Set the T-bit if we have no association (vtag will be
+ * reflected)
+ */
+ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T;
+
+ retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK
+ * came from.]
+ */
+ if (retval && chunk)
+ retval->transport = chunk->transport;
+
+ return retval;
+}
+
+/* Create an ABORT. Note that we set the T bit if we have no
+ * association, except when responding to an INIT (sctpimpguide 2.41).
+ */
+struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ const size_t hint)
+{
+ struct sctp_chunk *retval;
+ __u8 flags = 0;
+
+ /* Set the T-bit if we have no association and 'chunk' is not
+ * an INIT (vtag will be reflected).
+ */
+ if (!asoc) {
+ if (chunk && chunk->chunk_hdr &&
+ chunk->chunk_hdr->type == SCTP_CID_INIT)
+ flags = 0;
+ else
+ flags = SCTP_CHUNK_FLAG_T;
+ }
+
+ retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [ABORT back to where the offender came from.]
+ */
+ if (retval && chunk)
+ retval->transport = chunk->transport;
+
+ return retval;
+}
+
+/* Helper to create ABORT with a NO_USER_DATA error. */
+struct sctp_chunk *sctp_make_abort_no_data(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk, __u32 tsn)
+{
+ struct sctp_chunk *retval;
+ __be32 payload;
+
+ retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t)
+ + sizeof(tsn));
+
+ if (!retval)
+ goto no_mem;
+
+ /* Put the tsn back into network byte order. */
+ payload = htonl(tsn);
+ sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload));
+ sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [ABORT back to where the offender came from.]
+ */
+ if (chunk)
+ retval->transport = chunk->transport;
+
+no_mem:
+ return retval;
+}
+
+/* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */
+struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
+ struct msghdr *msg,
+ size_t paylen)
+{
+ struct sctp_chunk *retval;
+ void *payload = NULL;
+ int err;
+
+ retval = sctp_make_abort(asoc, NULL, sizeof(sctp_errhdr_t) + paylen);
+ if (!retval)
+ goto err_chunk;
+
+ if (paylen) {
+ /* Put the msg_iov together into payload. */
+ payload = kmalloc(paylen, GFP_KERNEL);
+ if (!payload)
+ goto err_payload;
+
+ err = memcpy_from_msg(payload, msg, paylen);
+ if (err < 0)
+ goto err_copy;
+ }
+
+ sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen);
+ sctp_addto_chunk(retval, paylen, payload);
+
+ if (paylen)
+ kfree(payload);
+
+ return retval;
+
+err_copy:
+ kfree(payload);
+err_payload:
+ sctp_chunk_free(retval);
+ retval = NULL;
+err_chunk:
+ return retval;
+}
+
+/* Append bytes to the end of a parameter. Will panic if chunk is not big
+ * enough.
+ */
+static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
+ const void *data)
+{
+ void *target;
+ int chunklen = ntohs(chunk->chunk_hdr->length);
+
+ target = skb_put(chunk->skb, len);
+
+ if (data)
+ memcpy(target, data, len);
+ else
+ memset(target, 0, len);
+
+ /* Adjust the chunk length field. */
+ chunk->chunk_hdr->length = htons(chunklen + len);
+ chunk->chunk_end = skb_tail_pointer(chunk->skb);
+
+ return target;
+}
+
+/* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */
+struct sctp_chunk *sctp_make_abort_violation(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ const __u8 *payload,
+ const size_t paylen)
+{
+ struct sctp_chunk *retval;
+ struct sctp_paramhdr phdr;
+
+ retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + paylen
+ + sizeof(sctp_paramhdr_t));
+ if (!retval)
+ goto end;
+
+ sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen
+ + sizeof(sctp_paramhdr_t));
+
+ phdr.type = htons(chunk->chunk_hdr->type);
+ phdr.length = chunk->chunk_hdr->length;
+ sctp_addto_chunk(retval, paylen, payload);
+ sctp_addto_param(retval, sizeof(sctp_paramhdr_t), &phdr);
+
+end:
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_violation_paramlen(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ struct sctp_paramhdr *param)
+{
+ struct sctp_chunk *retval;
+ static const char error[] = "The following parameter had invalid length:";
+ size_t payload_len = sizeof(error) + sizeof(sctp_errhdr_t) +
+ sizeof(sctp_paramhdr_t);
+
+ retval = sctp_make_abort(asoc, chunk, payload_len);
+ if (!retval)
+ goto nodata;
+
+ sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION,
+ sizeof(error) + sizeof(sctp_paramhdr_t));
+ sctp_addto_chunk(retval, sizeof(error), error);
+ sctp_addto_param(retval, sizeof(sctp_paramhdr_t), param);
+
+nodata:
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_violation_max_retrans(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *retval;
+ static const char error[] = "Association exceeded its max_retans count";
+ size_t payload_len = sizeof(error) + sizeof(sctp_errhdr_t);
+
+ retval = sctp_make_abort(asoc, chunk, payload_len);
+ if (!retval)
+ goto nodata;
+
+ sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error));
+ sctp_addto_chunk(retval, sizeof(error), error);
+
+nodata:
+ return retval;
+}
+
+/* Make a HEARTBEAT chunk. */
+struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
+ const struct sctp_transport *transport)
+{
+ struct sctp_chunk *retval;
+ sctp_sender_hb_info_t hbinfo;
+
+ retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
+
+ if (!retval)
+ goto nodata;
+
+ hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO;
+ hbinfo.param_hdr.length = htons(sizeof(sctp_sender_hb_info_t));
+ hbinfo.daddr = transport->ipaddr;
+ hbinfo.sent_at = jiffies;
+ hbinfo.hb_nonce = transport->hb_nonce;
+
+ /* Cast away the 'const', as this is just telling the chunk
+ * what transport it belongs to.
+ */
+ retval->transport = (struct sctp_transport *) transport;
+ retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo),
+ &hbinfo);
+
+nodata:
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ const void *payload, const size_t paylen)
+{
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+ if (!retval)
+ goto nodata;
+
+ retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload);
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, * etc.) to the same destination transport
+ * address from which it * received the DATA or control chunk
+ * to which it is replying.
+ *
+ * [HBACK back to where the HEARTBEAT came from.]
+ */
+ if (chunk)
+ retval->transport = chunk->transport;
+
+nodata:
+ return retval;
+}
+
+/* Create an Operation Error chunk with the specified space reserved.
+ * This routine can be used for containing multiple causes in the chunk.
+ */
+static struct sctp_chunk *sctp_make_op_error_space(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ size_t size)
+{
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
+ sizeof(sctp_errhdr_t) + size);
+ if (!retval)
+ goto nodata;
+
+ /* RFC 2960 6.4 Multi-homed SCTP Endpoints
+ *
+ * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+ * HEARTBEAT ACK, etc.) to the same destination transport
+ * address from which it received the DATA or control chunk
+ * to which it is replying.
+ *
+ */
+ if (chunk)
+ retval->transport = chunk->transport;
+
+nodata:
+ return retval;
+}
+
+/* Create an Operation Error chunk of a fixed size,
+ * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
+ * This is a helper function to allocate an error chunk for
+ * for those invalid parameter codes in which we may not want
+ * to report all the errors, if the incoming chunk is large
+ */
+static inline struct sctp_chunk *sctp_make_op_error_fixed(
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ size_t size = asoc ? asoc->pathmtu : 0;
+
+ if (!size)
+ size = SCTP_DEFAULT_MAXSEGMENT;
+
+ return sctp_make_op_error_space(asoc, chunk, size);
+}
+
+/* Create an Operation Error chunk. */
+struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ __be16 cause_code, const void *payload,
+ size_t paylen, size_t reserve_tail)
+{
+ struct sctp_chunk *retval;
+
+ retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail);
+ if (!retval)
+ goto nodata;
+
+ sctp_init_cause(retval, cause_code, paylen + reserve_tail);
+ sctp_addto_chunk(retval, paylen, payload);
+ if (reserve_tail)
+ sctp_addto_param(retval, reserve_tail, NULL);
+
+nodata:
+ return retval;
+}
+
+struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
+{
+ struct sctp_chunk *retval;
+ struct sctp_hmac *hmac_desc;
+ struct sctp_authhdr auth_hdr;
+ __u8 *hmac;
+
+ /* Get the first hmac that the peer told us to use */
+ hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+ if (unlikely(!hmac_desc))
+ return NULL;
+
+ retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
+ hmac_desc->hmac_len + sizeof(sctp_authhdr_t));
+ if (!retval)
+ return NULL;
+
+ auth_hdr.hmac_id = htons(hmac_desc->hmac_id);
+ auth_hdr.shkey_id = htons(asoc->active_key_id);
+
+ retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(sctp_authhdr_t),
+ &auth_hdr);
+
+ hmac = skb_put(retval->skb, hmac_desc->hmac_len);
+ memset(hmac, 0, hmac_desc->hmac_len);
+
+ /* Adjust the chunk header to include the empty MAC */
+ retval->chunk_hdr->length =
+ htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len);
+ retval->chunk_end = skb_tail_pointer(retval->skb);
+
+ return retval;
+}
+
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Turn an skb into a chunk.
+ * FIXME: Eventually move the structure directly inside the skb->cb[].
+ *
+ * sctpimpguide-05.txt Section 2.8.2
+ * M1) Each time a new DATA chunk is transmitted
+ * set the 'TSN.Missing.Report' count for that TSN to 0. The
+ * 'TSN.Missing.Report' count will be used to determine missing chunks
+ * and when to fast retransmit.
+ *
+ */
+struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
+ const struct sctp_association *asoc,
+ struct sock *sk)
+{
+ struct sctp_chunk *retval;
+
+ retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC);
+
+ if (!retval)
+ goto nodata;
+ if (!sk)
+ pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb);
+
+ INIT_LIST_HEAD(&retval->list);
+ retval->skb = skb;
+ retval->asoc = (struct sctp_association *)asoc;
+ retval->singleton = 1;
+
+ retval->fast_retransmit = SCTP_CAN_FRTX;
+
+ /* Polish the bead hole. */
+ INIT_LIST_HEAD(&retval->transmitted_list);
+ INIT_LIST_HEAD(&retval->frag_list);
+ SCTP_DBG_OBJCNT_INC(chunk);
+ atomic_set(&retval->refcnt, 1);
+
+nodata:
+ return retval;
+}
+
+/* Set chunk->source and dest based on the IP header in chunk->skb. */
+void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src,
+ union sctp_addr *dest)
+{
+ memcpy(&chunk->source, src, sizeof(union sctp_addr));
+ memcpy(&chunk->dest, dest, sizeof(union sctp_addr));
+}
+
+/* Extract the source address from a chunk. */
+const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
+{
+ /* If we have a known transport, use that. */
+ if (chunk->transport) {
+ return &chunk->transport->ipaddr;
+ } else {
+ /* Otherwise, extract it from the IP header. */
+ return &chunk->source;
+ }
+}
+
+/* Create a new chunk, setting the type and flags headers from the
+ * arguments, reserving enough space for a 'paylen' byte payload.
+ */
+static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
+ __u8 type, __u8 flags, int paylen)
+{
+ struct sctp_chunk *retval;
+ sctp_chunkhdr_t *chunk_hdr;
+ struct sk_buff *skb;
+ struct sock *sk;
+
+ /* No need to allocate LL here, as this is only a chunk. */
+ skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen),
+ GFP_ATOMIC);
+ if (!skb)
+ goto nodata;
+
+ /* Make room for the chunk header. */
+ chunk_hdr = (sctp_chunkhdr_t *)skb_put(skb, sizeof(sctp_chunkhdr_t));
+ chunk_hdr->type = type;
+ chunk_hdr->flags = flags;
+ chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t));
+
+ sk = asoc ? asoc->base.sk : NULL;
+ retval = sctp_chunkify(skb, asoc, sk);
+ if (!retval) {
+ kfree_skb(skb);
+ goto nodata;
+ }
+
+ retval->chunk_hdr = chunk_hdr;
+ retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(struct sctp_chunkhdr);
+
+ /* Determine if the chunk needs to be authenticated */
+ if (sctp_auth_send_cid(type, asoc))
+ retval->auth = 1;
+
+ return retval;
+nodata:
+ return NULL;
+}
+
+static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
+ __u8 flags, int paylen)
+{
+ return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen);
+}
+
+static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
+ __u8 type, __u8 flags, int paylen)
+{
+ struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen);
+
+ if (chunk)
+ sctp_control_set_owner_w(chunk);
+
+ return chunk;
+}
+
+/* Release the memory occupied by a chunk. */
+static void sctp_chunk_destroy(struct sctp_chunk *chunk)
+{
+ BUG_ON(!list_empty(&chunk->list));
+ list_del_init(&chunk->transmitted_list);
+
+ consume_skb(chunk->skb);
+ consume_skb(chunk->auth_chunk);
+
+ SCTP_DBG_OBJCNT_DEC(chunk);
+ kmem_cache_free(sctp_chunk_cachep, chunk);
+}
+
+/* Possibly, free the chunk. */
+void sctp_chunk_free(struct sctp_chunk *chunk)
+{
+ /* Release our reference on the message tracker. */
+ if (chunk->msg)
+ sctp_datamsg_put(chunk->msg);
+
+ sctp_chunk_put(chunk);
+}
+
+/* Grab a reference to the chunk. */
+void sctp_chunk_hold(struct sctp_chunk *ch)
+{
+ atomic_inc(&ch->refcnt);
+}
+
+/* Release a reference to the chunk. */
+void sctp_chunk_put(struct sctp_chunk *ch)
+{
+ if (atomic_dec_and_test(&ch->refcnt))
+ sctp_chunk_destroy(ch);
+}
+
+/* Append bytes to the end of a chunk. Will panic if chunk is not big
+ * enough.
+ */
+void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
+{
+ void *target;
+ void *padding;
+ int chunklen = ntohs(chunk->chunk_hdr->length);
+ int padlen = WORD_ROUND(chunklen) - chunklen;
+
+ padding = skb_put(chunk->skb, padlen);
+ target = skb_put(chunk->skb, len);
+
+ memset(padding, 0, padlen);
+ memcpy(target, data, len);
+
+ /* Adjust the chunk length field. */
+ chunk->chunk_hdr->length = htons(chunklen + padlen + len);
+ chunk->chunk_end = skb_tail_pointer(chunk->skb);
+
+ return target;
+}
+
+/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
+ * space in the chunk
+ */
+static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
+ int len, const void *data)
+{
+ if (skb_tailroom(chunk->skb) >= len)
+ return sctp_addto_chunk(chunk, len, data);
+ else
+ return NULL;
+}
+
+/* Append bytes from user space to the end of a chunk. Will panic if
+ * chunk is not big enough.
+ * Returns a kernel err value.
+ */
+int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len,
+ struct iov_iter *from)
+{
+ void *target;
+ ssize_t copied;
+
+ /* Make room in chunk for data. */
+ target = skb_put(chunk->skb, len);
+
+ /* Copy data (whole iovec) into chunk */
+ copied = copy_from_iter(target, len, from);
+ if (copied != len)
+ return -EFAULT;
+
+ /* Adjust the chunk length field. */
+ chunk->chunk_hdr->length =
+ htons(ntohs(chunk->chunk_hdr->length) + len);
+ chunk->chunk_end = skb_tail_pointer(chunk->skb);
+
+ return 0;
+}
+
+/* Helper function to assign a TSN if needed. This assumes that both
+ * the data_hdr and association have already been assigned.
+ */
+void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
+{
+ struct sctp_datamsg *msg;
+ struct sctp_chunk *lchunk;
+ struct sctp_stream *stream;
+ __u16 ssn;
+ __u16 sid;
+
+ if (chunk->has_ssn)
+ return;
+
+ /* All fragments will be on the same stream */
+ sid = ntohs(chunk->subh.data_hdr->stream);
+ stream = &chunk->asoc->ssnmap->out;
+
+ /* Now assign the sequence number to the entire message.
+ * All fragments must have the same stream sequence number.
+ */
+ msg = chunk->msg;
+ list_for_each_entry(lchunk, &msg->chunks, frag_list) {
+ if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+ ssn = 0;
+ } else {
+ if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
+ ssn = sctp_ssn_next(stream, sid);
+ else
+ ssn = sctp_ssn_peek(stream, sid);
+ }
+
+ lchunk->subh.data_hdr->ssn = htons(ssn);
+ lchunk->has_ssn = 1;
+ }
+}
+
+/* Helper function to assign a TSN if needed. This assumes that both
+ * the data_hdr and association have already been assigned.
+ */
+void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
+{
+ if (!chunk->has_tsn) {
+ /* This is the last possible instant to
+ * assign a TSN.
+ */
+ chunk->subh.data_hdr->tsn =
+ htonl(sctp_association_get_next_tsn(chunk->asoc));
+ chunk->has_tsn = 1;
+ }
+}
+
+/* Create a CLOSED association to use with an incoming packet. */
+struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
+ struct sctp_chunk *chunk,
+ gfp_t gfp)
+{
+ struct sctp_association *asoc;
+ struct sk_buff *skb;
+ sctp_scope_t scope;
+ struct sctp_af *af;
+
+ /* Create the bare association. */
+ scope = sctp_scope(sctp_source(chunk));
+ asoc = sctp_association_new(ep, ep->base.sk, scope, gfp);
+ if (!asoc)
+ goto nodata;
+ asoc->temp = 1;
+ skb = chunk->skb;
+ /* Create an entry for the source address of the packet. */
+ af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
+ if (unlikely(!af))
+ goto fail;
+ af->from_skb(&asoc->c.peer_addr, skb, 1);
+nodata:
+ return asoc;
+
+fail:
+ sctp_association_free(asoc);
+ return NULL;
+}
+
+/* Build a cookie representing asoc.
+ * This INCLUDES the param header needed to put the cookie in the INIT ACK.
+ */
+static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *init_chunk,
+ int *cookie_len,
+ const __u8 *raw_addrs, int addrs_len)
+{
+ sctp_cookie_param_t *retval;
+ struct sctp_signed_cookie *cookie;
+ struct scatterlist sg;
+ int headersize, bodysize;
+
+ /* Header size is static data prior to the actual cookie, including
+ * any padding.
+ */
+ headersize = sizeof(sctp_paramhdr_t) +
+ (sizeof(struct sctp_signed_cookie) -
+ sizeof(struct sctp_cookie));
+ bodysize = sizeof(struct sctp_cookie)
+ + ntohs(init_chunk->chunk_hdr->length) + addrs_len;
+
+ /* Pad out the cookie to a multiple to make the signature
+ * functions simpler to write.
+ */
+ if (bodysize % SCTP_COOKIE_MULTIPLE)
+ bodysize += SCTP_COOKIE_MULTIPLE
+ - (bodysize % SCTP_COOKIE_MULTIPLE);
+ *cookie_len = headersize + bodysize;
+
+ /* Clear this memory since we are sending this data structure
+ * out on the network.
+ */
+ retval = kzalloc(*cookie_len, GFP_ATOMIC);
+ if (!retval)
+ goto nodata;
+
+ cookie = (struct sctp_signed_cookie *) retval->body;
+
+ /* Set up the parameter header. */
+ retval->p.type = SCTP_PARAM_STATE_COOKIE;
+ retval->p.length = htons(*cookie_len);
+
+ /* Copy the cookie part of the association itself. */
+ cookie->c = asoc->c;
+ /* Save the raw address list length in the cookie. */
+ cookie->c.raw_addr_list_len = addrs_len;
+
+ /* Remember PR-SCTP capability. */
+ cookie->c.prsctp_capable = asoc->peer.prsctp_capable;
+
+ /* Save adaptation indication in the cookie. */
+ cookie->c.adaptation_ind = asoc->peer.adaptation_ind;
+
+ /* Set an expiration time for the cookie. */
+ cookie->c.expiration = ktime_add(asoc->cookie_life,
+ ktime_get());
+
+ /* Copy the peer's init packet. */
+ memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
+ ntohs(init_chunk->chunk_hdr->length));
+
+ /* Copy the raw local address list of the association. */
+ memcpy((__u8 *)&cookie->c.peer_init[0] +
+ ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
+
+ if (sctp_sk(ep->base.sk)->hmac) {
+ struct hash_desc desc;
+
+ /* Sign the message. */
+ sg_init_one(&sg, &cookie->c, bodysize);
+ desc.tfm = sctp_sk(ep->base.sk)->hmac;
+ desc.flags = 0;
+
+ if (crypto_hash_setkey(desc.tfm, ep->secret_key,
+ sizeof(ep->secret_key)) ||
+ crypto_hash_digest(&desc, &sg, bodysize, cookie->signature))
+ goto free_cookie;
+ }
+
+ return retval;
+
+free_cookie:
+ kfree(retval);
+nodata:
+ *cookie_len = 0;
+ return NULL;
+}
+
+/* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */
+struct sctp_association *sctp_unpack_cookie(
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk, gfp_t gfp,
+ int *error, struct sctp_chunk **errp)
+{
+ struct sctp_association *retval = NULL;
+ struct sctp_signed_cookie *cookie;
+ struct sctp_cookie *bear_cookie;
+ int headersize, bodysize, fixed_size;
+ __u8 *digest = ep->digest;
+ struct scatterlist sg;
+ unsigned int len;
+ sctp_scope_t scope;
+ struct sk_buff *skb = chunk->skb;
+ ktime_t kt;
+ struct hash_desc desc;
+
+ /* Header size is static data prior to the actual cookie, including
+ * any padding.
+ */
+ headersize = sizeof(sctp_chunkhdr_t) +
+ (sizeof(struct sctp_signed_cookie) -
+ sizeof(struct sctp_cookie));
+ bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
+ fixed_size = headersize + sizeof(struct sctp_cookie);
+
+ /* Verify that the chunk looks like it even has a cookie.
+ * There must be enough room for our cookie and our peer's
+ * INIT chunk.
+ */
+ len = ntohs(chunk->chunk_hdr->length);
+ if (len < fixed_size + sizeof(struct sctp_chunkhdr))
+ goto malformed;
+
+ /* Verify that the cookie has been padded out. */
+ if (bodysize % SCTP_COOKIE_MULTIPLE)
+ goto malformed;
+
+ /* Process the cookie. */
+ cookie = chunk->subh.cookie_hdr;
+ bear_cookie = &cookie->c;
+
+ if (!sctp_sk(ep->base.sk)->hmac)
+ goto no_hmac;
+
+ /* Check the signature. */
+ sg_init_one(&sg, bear_cookie, bodysize);
+ desc.tfm = sctp_sk(ep->base.sk)->hmac;
+ desc.flags = 0;
+
+ memset(digest, 0x00, SCTP_SIGNATURE_SIZE);
+ if (crypto_hash_setkey(desc.tfm, ep->secret_key,
+ sizeof(ep->secret_key)) ||
+ crypto_hash_digest(&desc, &sg, bodysize, digest)) {
+ *error = -SCTP_IERROR_NOMEM;
+ goto fail;
+ }
+
+ if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
+ *error = -SCTP_IERROR_BAD_SIG;
+ goto fail;
+ }
+
+no_hmac:
+ /* IG Section 2.35.2:
+ * 3) Compare the port numbers and the verification tag contained
+ * within the COOKIE ECHO chunk to the actual port numbers and the
+ * verification tag within the SCTP common header of the received
+ * packet. If these values do not match the packet MUST be silently
+ * discarded,
+ */
+ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) {
+ *error = -SCTP_IERROR_BAD_TAG;
+ goto fail;
+ }
+
+ if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port ||
+ ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) {
+ *error = -SCTP_IERROR_BAD_PORTS;
+ goto fail;
+ }
+
+ /* Check to see if the cookie is stale. If there is already
+ * an association, there is no need to check cookie's expiration
+ * for init collision case of lost COOKIE ACK.
+ * If skb has been timestamped, then use the stamp, otherwise
+ * use current time. This introduces a small possibility that
+ * that a cookie may be considered expired, but his would only slow
+ * down the new association establishment instead of every packet.
+ */
+ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
+ kt = skb_get_ktime(skb);
+ else
+ kt = ktime_get();
+
+ if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
+ /*
+ * Section 3.3.10.3 Stale Cookie Error (3)
+ *
+ * Cause of error
+ * ---------------
+ * Stale Cookie Error: Indicates the receipt of a valid State
+ * Cookie that has expired.
+ */
+ len = ntohs(chunk->chunk_hdr->length);
+ *errp = sctp_make_op_error_space(asoc, chunk, len);
+ if (*errp) {
+ suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
+ __be32 n = htonl(usecs);
+
+ sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
+ sizeof(n));
+ sctp_addto_chunk(*errp, sizeof(n), &n);
+ *error = -SCTP_IERROR_STALE_COOKIE;
+ } else
+ *error = -SCTP_IERROR_NOMEM;
+
+ goto fail;
+ }
+
+ /* Make a new base association. */
+ scope = sctp_scope(sctp_source(chunk));
+ retval = sctp_association_new(ep, ep->base.sk, scope, gfp);
+ if (!retval) {
+ *error = -SCTP_IERROR_NOMEM;
+ goto fail;
+ }
+
+ /* Set up our peer's port number. */
+ retval->peer.port = ntohs(chunk->sctp_hdr->source);
+
+ /* Populate the association from the cookie. */
+ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie));
+
+ if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie,
+ GFP_ATOMIC) < 0) {
+ *error = -SCTP_IERROR_NOMEM;
+ goto fail;
+ }
+
+ /* Also, add the destination address. */
+ if (list_empty(&retval->base.bind_addr.address_list)) {
+ sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest,
+ SCTP_ADDR_SRC, GFP_ATOMIC);
+ }
+
+ retval->next_tsn = retval->c.initial_tsn;
+ retval->ctsn_ack_point = retval->next_tsn - 1;
+ retval->addip_serial = retval->c.initial_tsn;
+ retval->adv_peer_ack_point = retval->ctsn_ack_point;
+ retval->peer.prsctp_capable = retval->c.prsctp_capable;
+ retval->peer.adaptation_ind = retval->c.adaptation_ind;
+
+ /* The INIT stuff will be done by the side effects. */
+ return retval;
+
+fail:
+ if (retval)
+ sctp_association_free(retval);
+
+ return NULL;
+
+malformed:
+ /* Yikes! The packet is either corrupt or deliberately
+ * malformed.
+ */
+ *error = -SCTP_IERROR_MALFORMED;
+ goto fail;
+}
+
+/********************************************************************
+ * 3rd Level Abstractions
+ ********************************************************************/
+
+struct __sctp_missing {
+ __be32 num_missing;
+ __be16 type;
+} __packed;
+
+/*
+ * Report a missing mandatory parameter.
+ */
+static int sctp_process_missing_param(const struct sctp_association *asoc,
+ sctp_param_t paramtype,
+ struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ struct __sctp_missing report;
+ __u16 len;
+
+ len = WORD_ROUND(sizeof(report));
+
+ /* Make an ERROR chunk, preparing enough room for
+ * returning multiple unknown parameters.
+ */
+ if (!*errp)
+ *errp = sctp_make_op_error_space(asoc, chunk, len);
+
+ if (*errp) {
+ report.num_missing = htonl(1);
+ report.type = paramtype;
+ sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM,
+ sizeof(report));
+ sctp_addto_chunk(*errp, sizeof(report), &report);
+ }
+
+ /* Stop processing this chunk. */
+ return 0;
+}
+
+/* Report an Invalid Mandatory Parameter. */
+static int sctp_process_inv_mandatory(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ /* Invalid Mandatory Parameter Error has no payload. */
+
+ if (!*errp)
+ *errp = sctp_make_op_error_space(asoc, chunk, 0);
+
+ if (*errp)
+ sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0);
+
+ /* Stop processing this chunk. */
+ return 0;
+}
+
+static int sctp_process_inv_paramlength(const struct sctp_association *asoc,
+ struct sctp_paramhdr *param,
+ const struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ /* This is a fatal error. Any accumulated non-fatal errors are
+ * not reported.
+ */
+ if (*errp)
+ sctp_chunk_free(*errp);
+
+ /* Create an error chunk and fill it in with our payload. */
+ *errp = sctp_make_violation_paramlen(asoc, chunk, param);
+
+ return 0;
+}
+
+
+/* Do not attempt to handle the HOST_NAME parm. However, do
+ * send back an indicator to the peer.
+ */
+static int sctp_process_hn_param(const struct sctp_association *asoc,
+ union sctp_params param,
+ struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ __u16 len = ntohs(param.p->length);
+
+ /* Processing of the HOST_NAME parameter will generate an
+ * ABORT. If we've accumulated any non-fatal errors, they
+ * would be unrecognized parameters and we should not include
+ * them in the ABORT.
+ */
+ if (*errp)
+ sctp_chunk_free(*errp);
+
+ *errp = sctp_make_op_error_space(asoc, chunk, len);
+
+ if (*errp) {
+ sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
+ sctp_addto_chunk(*errp, len, param.v);
+ }
+
+ /* Stop processing this chunk. */
+ return 0;
+}
+
+static int sctp_verify_ext_param(struct net *net, union sctp_params param)
+{
+ __u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+ int have_auth = 0;
+ int have_asconf = 0;
+ int i;
+
+ for (i = 0; i < num_ext; i++) {
+ switch (param.ext->chunks[i]) {
+ case SCTP_CID_AUTH:
+ have_auth = 1;
+ break;
+ case SCTP_CID_ASCONF:
+ case SCTP_CID_ASCONF_ACK:
+ have_asconf = 1;
+ break;
+ }
+ }
+
+ /* ADD-IP Security: The draft requires us to ABORT or ignore the
+ * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this
+ * only if ADD-IP is turned on and we are not backward-compatible
+ * mode.
+ */
+ if (net->sctp.addip_noauth)
+ return 1;
+
+ if (net->sctp.addip_enable && !have_auth && have_asconf)
+ return 0;
+
+ return 1;
+}
+
+static void sctp_process_ext_param(struct sctp_association *asoc,
+ union sctp_params param)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ __u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+ int i;
+
+ for (i = 0; i < num_ext; i++) {
+ switch (param.ext->chunks[i]) {
+ case SCTP_CID_FWD_TSN:
+ if (net->sctp.prsctp_enable && !asoc->peer.prsctp_capable)
+ asoc->peer.prsctp_capable = 1;
+ break;
+ case SCTP_CID_AUTH:
+ /* if the peer reports AUTH, assume that he
+ * supports AUTH.
+ */
+ if (asoc->ep->auth_enable)
+ asoc->peer.auth_capable = 1;
+ break;
+ case SCTP_CID_ASCONF:
+ case SCTP_CID_ASCONF_ACK:
+ if (net->sctp.addip_enable)
+ asoc->peer.asconf_capable = 1;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+/* RFC 3.2.1 & the Implementers Guide 2.2.
+ *
+ * The Parameter Types are encoded such that the
+ * highest-order two bits specify the action that must be
+ * taken if the processing endpoint does not recognize the
+ * Parameter Type.
+ *
+ * 00 - Stop processing this parameter; do not process any further
+ * parameters within this chunk
+ *
+ * 01 - Stop processing this parameter, do not process any further
+ * parameters within this chunk, and report the unrecognized
+ * parameter in an 'Unrecognized Parameter' ERROR chunk.
+ *
+ * 10 - Skip this parameter and continue processing.
+ *
+ * 11 - Skip this parameter and continue processing but
+ * report the unrecognized parameter in an
+ * 'Unrecognized Parameter' ERROR chunk.
+ *
+ * Return value:
+ * SCTP_IERROR_NO_ERROR - continue with the chunk
+ * SCTP_IERROR_ERROR - stop and report an error.
+ * SCTP_IERROR_NOMEME - out of memory.
+ */
+static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
+ union sctp_params param,
+ struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ int retval = SCTP_IERROR_NO_ERROR;
+
+ switch (param.p->type & SCTP_PARAM_ACTION_MASK) {
+ case SCTP_PARAM_ACTION_DISCARD:
+ retval = SCTP_IERROR_ERROR;
+ break;
+ case SCTP_PARAM_ACTION_SKIP:
+ break;
+ case SCTP_PARAM_ACTION_DISCARD_ERR:
+ retval = SCTP_IERROR_ERROR;
+ /* Fall through */
+ case SCTP_PARAM_ACTION_SKIP_ERR:
+ /* Make an ERROR chunk, preparing enough room for
+ * returning multiple unknown parameters.
+ */
+ if (NULL == *errp)
+ *errp = sctp_make_op_error_fixed(asoc, chunk);
+
+ if (*errp) {
+ if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ WORD_ROUND(ntohs(param.p->length))))
+ sctp_addto_chunk_fixed(*errp,
+ WORD_ROUND(ntohs(param.p->length)),
+ param.v);
+ } else {
+ /* If there is no memory for generating the ERROR
+ * report as specified, an ABORT will be triggered
+ * to the peer and the association won't be
+ * established.
+ */
+ retval = SCTP_IERROR_NOMEM;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return retval;
+}
+
+/* Verify variable length parameters
+ * Return values:
+ * SCTP_IERROR_ABORT - trigger an ABORT
+ * SCTP_IERROR_NOMEM - out of memory (abort)
+ * SCTP_IERROR_ERROR - stop processing, trigger an ERROR
+ * SCTP_IERROR_NO_ERROR - continue with the chunk
+ */
+static sctp_ierror_t sctp_verify_param(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ union sctp_params param,
+ sctp_cid_t cid,
+ struct sctp_chunk *chunk,
+ struct sctp_chunk **err_chunk)
+{
+ struct sctp_hmac_algo_param *hmacs;
+ int retval = SCTP_IERROR_NO_ERROR;
+ __u16 n_elt, id = 0;
+ int i;
+
+ /* FIXME - This routine is not looking at each parameter per the
+ * chunk type, i.e., unrecognized parameters should be further
+ * identified based on the chunk id.
+ */
+
+ switch (param.p->type) {
+ case SCTP_PARAM_IPV4_ADDRESS:
+ case SCTP_PARAM_IPV6_ADDRESS:
+ case SCTP_PARAM_COOKIE_PRESERVATIVE:
+ case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
+ case SCTP_PARAM_STATE_COOKIE:
+ case SCTP_PARAM_HEARTBEAT_INFO:
+ case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
+ case SCTP_PARAM_ECN_CAPABLE:
+ case SCTP_PARAM_ADAPTATION_LAYER_IND:
+ break;
+
+ case SCTP_PARAM_SUPPORTED_EXT:
+ if (!sctp_verify_ext_param(net, param))
+ return SCTP_IERROR_ABORT;
+ break;
+
+ case SCTP_PARAM_SET_PRIMARY:
+ if (net->sctp.addip_enable)
+ break;
+ goto fallthrough;
+
+ case SCTP_PARAM_HOST_NAME_ADDRESS:
+ /* Tell the peer, we won't support this param. */
+ sctp_process_hn_param(asoc, param, chunk, err_chunk);
+ retval = SCTP_IERROR_ABORT;
+ break;
+
+ case SCTP_PARAM_FWD_TSN_SUPPORT:
+ if (net->sctp.prsctp_enable)
+ break;
+ goto fallthrough;
+
+ case SCTP_PARAM_RANDOM:
+ if (!ep->auth_enable)
+ goto fallthrough;
+
+ /* SCTP-AUTH: Secion 6.1
+ * If the random number is not 32 byte long the association
+ * MUST be aborted. The ABORT chunk SHOULD contain the error
+ * cause 'Protocol Violation'.
+ */
+ if (SCTP_AUTH_RANDOM_LENGTH !=
+ ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) {
+ sctp_process_inv_paramlength(asoc, param.p,
+ chunk, err_chunk);
+ retval = SCTP_IERROR_ABORT;
+ }
+ break;
+
+ case SCTP_PARAM_CHUNKS:
+ if (!ep->auth_enable)
+ goto fallthrough;
+
+ /* SCTP-AUTH: Section 3.2
+ * The CHUNKS parameter MUST be included once in the INIT or
+ * INIT-ACK chunk if the sender wants to receive authenticated
+ * chunks. Its maximum length is 260 bytes.
+ */
+ if (260 < ntohs(param.p->length)) {
+ sctp_process_inv_paramlength(asoc, param.p,
+ chunk, err_chunk);
+ retval = SCTP_IERROR_ABORT;
+ }
+ break;
+
+ case SCTP_PARAM_HMAC_ALGO:
+ if (!ep->auth_enable)
+ goto fallthrough;
+
+ hmacs = (struct sctp_hmac_algo_param *)param.p;
+ n_elt = (ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) >> 1;
+
+ /* SCTP-AUTH: Section 6.1
+ * The HMAC algorithm based on SHA-1 MUST be supported and
+ * included in the HMAC-ALGO parameter.
+ */
+ for (i = 0; i < n_elt; i++) {
+ id = ntohs(hmacs->hmac_ids[i]);
+
+ if (id == SCTP_AUTH_HMAC_ID_SHA1)
+ break;
+ }
+
+ if (id != SCTP_AUTH_HMAC_ID_SHA1) {
+ sctp_process_inv_paramlength(asoc, param.p, chunk,
+ err_chunk);
+ retval = SCTP_IERROR_ABORT;
+ }
+ break;
+fallthrough:
+ default:
+ pr_debug("%s: unrecognized param:%d for chunk:%d\n",
+ __func__, ntohs(param.p->type), cid);
+
+ retval = sctp_process_unk_param(asoc, param, chunk, err_chunk);
+ break;
+ }
+ return retval;
+}
+
+/* Verify the INIT packet before we process it. */
+int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc, sctp_cid_t cid,
+ sctp_init_chunk_t *peer_init, struct sctp_chunk *chunk,
+ struct sctp_chunk **errp)
+{
+ union sctp_params param;
+ bool has_cookie = false;
+ int result;
+
+ /* Check for missing mandatory parameters. Note: Initial TSN is
+ * also mandatory, but is not checked here since the valid range
+ * is 0..2**32-1. RFC4960, section 3.3.3.
+ */
+ if (peer_init->init_hdr.num_outbound_streams == 0 ||
+ peer_init->init_hdr.num_inbound_streams == 0 ||
+ peer_init->init_hdr.init_tag == 0 ||
+ ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
+ return sctp_process_inv_mandatory(asoc, chunk, errp);
+
+ sctp_walk_params(param, peer_init, init_hdr.params) {
+ if (param.p->type == SCTP_PARAM_STATE_COOKIE)
+ has_cookie = true;
+ }
+
+ /* There is a possibility that a parameter length was bad and
+ * in that case we would have stoped walking the parameters.
+ * The current param.p would point at the bad one.
+ * Current consensus on the mailing list is to generate a PROTOCOL
+ * VIOLATION error. We build the ERROR chunk here and let the normal
+ * error handling code build and send the packet.
+ */
+ if (param.v != (void *)chunk->chunk_end)
+ return sctp_process_inv_paramlength(asoc, param.p, chunk, errp);
+
+ /* The only missing mandatory param possible today is
+ * the state cookie for an INIT-ACK chunk.
+ */
+ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie)
+ return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE,
+ chunk, errp);
+
+ /* Verify all the variable length parameters */
+ sctp_walk_params(param, peer_init, init_hdr.params) {
+ result = sctp_verify_param(net, ep, asoc, param, cid,
+ chunk, errp);
+ switch (result) {
+ case SCTP_IERROR_ABORT:
+ case SCTP_IERROR_NOMEM:
+ return 0;
+ case SCTP_IERROR_ERROR:
+ return 1;
+ case SCTP_IERROR_NO_ERROR:
+ default:
+ break;
+ }
+
+ } /* for (loop through all parameters) */
+
+ return 1;
+}
+
+/* Unpack the parameters in an INIT packet into an association.
+ * Returns 0 on failure, else success.
+ * FIXME: This is an association method.
+ */
+int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
+ const union sctp_addr *peer_addr,
+ sctp_init_chunk_t *peer_init, gfp_t gfp)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ union sctp_params param;
+ struct sctp_transport *transport;
+ struct list_head *pos, *temp;
+ struct sctp_af *af;
+ union sctp_addr addr;
+ char *cookie;
+ int src_match = 0;
+
+ /* We must include the address that the INIT packet came from.
+ * This is the only address that matters for an INIT packet.
+ * When processing a COOKIE ECHO, we retrieve the from address
+ * of the INIT from the cookie.
+ */
+
+ /* This implementation defaults to making the first transport
+ * added as the primary transport. The source address seems to
+ * be a a better choice than any of the embedded addresses.
+ */
+ if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
+ goto nomem;
+
+ if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr))
+ src_match = 1;
+
+ /* Process the initialization parameters. */
+ sctp_walk_params(param, peer_init, init_hdr.params) {
+ if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
+ param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
+ af = sctp_get_af_specific(param_type2af(param.p->type));
+ af->from_addr_param(&addr, param.addr,
+ chunk->sctp_hdr->source, 0);
+ if (sctp_cmp_addr_exact(sctp_source(chunk), &addr))
+ src_match = 1;
+ }
+
+ if (!sctp_process_param(asoc, param, peer_addr, gfp))
+ goto clean_up;
+ }
+
+ /* source address of chunk may not match any valid address */
+ if (!src_match)
+ goto clean_up;
+
+ /* AUTH: After processing the parameters, make sure that we
+ * have all the required info to potentially do authentications.
+ */
+ if (asoc->peer.auth_capable && (!asoc->peer.peer_random ||
+ !asoc->peer.peer_hmacs))
+ asoc->peer.auth_capable = 0;
+
+ /* In a non-backward compatible mode, if the peer claims
+ * support for ADD-IP but not AUTH, the ADD-IP spec states
+ * that we MUST ABORT the association. Section 6. The section
+ * also give us an option to silently ignore the packet, which
+ * is what we'll do here.
+ */
+ if (!net->sctp.addip_noauth &&
+ (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
+ asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP |
+ SCTP_PARAM_DEL_IP |
+ SCTP_PARAM_SET_PRIMARY);
+ asoc->peer.asconf_capable = 0;
+ goto clean_up;
+ }
+
+ /* Walk list of transports, removing transports in the UNKNOWN state. */
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ transport = list_entry(pos, struct sctp_transport, transports);
+ if (transport->state == SCTP_UNKNOWN) {
+ sctp_assoc_rm_peer(asoc, transport);
+ }
+ }
+
+ /* The fixed INIT headers are always in network byte
+ * order.
+ */
+ asoc->peer.i.init_tag =
+ ntohl(peer_init->init_hdr.init_tag);
+ asoc->peer.i.a_rwnd =
+ ntohl(peer_init->init_hdr.a_rwnd);
+ asoc->peer.i.num_outbound_streams =
+ ntohs(peer_init->init_hdr.num_outbound_streams);
+ asoc->peer.i.num_inbound_streams =
+ ntohs(peer_init->init_hdr.num_inbound_streams);
+ asoc->peer.i.initial_tsn =
+ ntohl(peer_init->init_hdr.initial_tsn);
+
+ /* Apply the upper bounds for output streams based on peer's
+ * number of inbound streams.
+ */
+ if (asoc->c.sinit_num_ostreams >
+ ntohs(peer_init->init_hdr.num_inbound_streams)) {
+ asoc->c.sinit_num_ostreams =
+ ntohs(peer_init->init_hdr.num_inbound_streams);
+ }
+
+ if (asoc->c.sinit_max_instreams >
+ ntohs(peer_init->init_hdr.num_outbound_streams)) {
+ asoc->c.sinit_max_instreams =
+ ntohs(peer_init->init_hdr.num_outbound_streams);
+ }
+
+ /* Copy Initiation tag from INIT to VT_peer in cookie. */
+ asoc->c.peer_vtag = asoc->peer.i.init_tag;
+
+ /* Peer Rwnd : Current calculated value of the peer's rwnd. */
+ asoc->peer.rwnd = asoc->peer.i.a_rwnd;
+
+ /* Copy cookie in case we need to resend COOKIE-ECHO. */
+ cookie = asoc->peer.cookie;
+ if (cookie) {
+ asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp);
+ if (!asoc->peer.cookie)
+ goto clean_up;
+ }
+
+ /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
+ * high (for example, implementations MAY use the size of the receiver
+ * advertised window).
+ */
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+ transport->ssthresh = asoc->peer.i.a_rwnd;
+ }
+
+ /* Set up the TSN tracking pieces. */
+ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+ asoc->peer.i.initial_tsn, gfp))
+ goto clean_up;
+
+ /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
+ *
+ * The stream sequence number in all the streams shall start
+ * from 0 when the association is established. Also, when the
+ * stream sequence number reaches the value 65535 the next
+ * stream sequence number shall be set to 0.
+ */
+
+ /* Allocate storage for the negotiated streams if it is not a temporary
+ * association.
+ */
+ if (!asoc->temp) {
+ int error;
+
+ asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
+ asoc->c.sinit_num_ostreams, gfp);
+ if (!asoc->ssnmap)
+ goto clean_up;
+
+ error = sctp_assoc_set_id(asoc, gfp);
+ if (error)
+ goto clean_up;
+ }
+
+ /* ADDIP Section 4.1 ASCONF Chunk Procedures
+ *
+ * When an endpoint has an ASCONF signaled change to be sent to the
+ * remote endpoint it should do the following:
+ * ...
+ * A2) A serial number should be assigned to the Chunk. The serial
+ * number should be a monotonically increasing number. All serial
+ * numbers are defined to be initialized at the start of the
+ * association to the same value as the Initial TSN.
+ */
+ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1;
+ return 1;
+
+clean_up:
+ /* Release the transport structures. */
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ transport = list_entry(pos, struct sctp_transport, transports);
+ if (transport->state != SCTP_ACTIVE)
+ sctp_assoc_rm_peer(asoc, transport);
+ }
+
+nomem:
+ return 0;
+}
+
+
+/* Update asoc with the option described in param.
+ *
+ * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT
+ *
+ * asoc is the association to update.
+ * param is the variable length parameter to use for update.
+ * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO.
+ * If the current packet is an INIT we want to minimize the amount of
+ * work we do. In particular, we should not build transport
+ * structures for the addresses.
+ */
+static int sctp_process_param(struct sctp_association *asoc,
+ union sctp_params param,
+ const union sctp_addr *peer_addr,
+ gfp_t gfp)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ union sctp_addr addr;
+ int i;
+ __u16 sat;
+ int retval = 1;
+ sctp_scope_t scope;
+ time_t stale;
+ struct sctp_af *af;
+ union sctp_addr_param *addr_param;
+ struct sctp_transport *t;
+ struct sctp_endpoint *ep = asoc->ep;
+
+ /* We maintain all INIT parameters in network byte order all the
+ * time. This allows us to not worry about whether the parameters
+ * came from a fresh INIT, and INIT ACK, or were stored in a cookie.
+ */
+ switch (param.p->type) {
+ case SCTP_PARAM_IPV6_ADDRESS:
+ if (PF_INET6 != asoc->base.sk->sk_family)
+ break;
+ goto do_addr_param;
+
+ case SCTP_PARAM_IPV4_ADDRESS:
+ /* v4 addresses are not allowed on v6-only socket */
+ if (ipv6_only_sock(asoc->base.sk))
+ break;
+do_addr_param:
+ af = sctp_get_af_specific(param_type2af(param.p->type));
+ af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0);
+ scope = sctp_scope(peer_addr);
+ if (sctp_in_scope(net, &addr, scope))
+ if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED))
+ return 0;
+ break;
+
+ case SCTP_PARAM_COOKIE_PRESERVATIVE:
+ if (!net->sctp.cookie_preserve_enable)
+ break;
+
+ stale = ntohl(param.life->lifespan_increment);
+
+ /* Suggested Cookie Life span increment's unit is msec,
+ * (1/1000sec).
+ */
+ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
+ break;
+
+ case SCTP_PARAM_HOST_NAME_ADDRESS:
+ pr_debug("%s: unimplemented SCTP_HOST_NAME_ADDRESS\n", __func__);
+ break;
+
+ case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
+ /* Turn off the default values first so we'll know which
+ * ones are really set by the peer.
+ */
+ asoc->peer.ipv4_address = 0;
+ asoc->peer.ipv6_address = 0;
+
+ /* Assume that peer supports the address family
+ * by which it sends a packet.
+ */
+ if (peer_addr->sa.sa_family == AF_INET6)
+ asoc->peer.ipv6_address = 1;
+ else if (peer_addr->sa.sa_family == AF_INET)
+ asoc->peer.ipv4_address = 1;
+
+ /* Cycle through address types; avoid divide by 0. */
+ sat = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+ if (sat)
+ sat /= sizeof(__u16);
+
+ for (i = 0; i < sat; ++i) {
+ switch (param.sat->types[i]) {
+ case SCTP_PARAM_IPV4_ADDRESS:
+ asoc->peer.ipv4_address = 1;
+ break;
+
+ case SCTP_PARAM_IPV6_ADDRESS:
+ if (PF_INET6 == asoc->base.sk->sk_family)
+ asoc->peer.ipv6_address = 1;
+ break;
+
+ case SCTP_PARAM_HOST_NAME_ADDRESS:
+ asoc->peer.hostname_address = 1;
+ break;
+
+ default: /* Just ignore anything else. */
+ break;
+ }
+ }
+ break;
+
+ case SCTP_PARAM_STATE_COOKIE:
+ asoc->peer.cookie_len =
+ ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+ asoc->peer.cookie = param.cookie->body;
+ break;
+
+ case SCTP_PARAM_HEARTBEAT_INFO:
+ /* Would be odd to receive, but it causes no problems. */
+ break;
+
+ case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
+ /* Rejected during verify stage. */
+ break;
+
+ case SCTP_PARAM_ECN_CAPABLE:
+ asoc->peer.ecn_capable = 1;
+ break;
+
+ case SCTP_PARAM_ADAPTATION_LAYER_IND:
+ asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
+ break;
+
+ case SCTP_PARAM_SET_PRIMARY:
+ if (!net->sctp.addip_enable)
+ goto fall_through;
+
+ addr_param = param.v + sizeof(sctp_addip_param_t);
+
+ af = sctp_get_af_specific(param_type2af(addr_param->p.type));
+ if (af == NULL)
+ break;
+
+ af->from_addr_param(&addr, addr_param,
+ htons(asoc->peer.port), 0);
+
+ /* if the address is invalid, we can't process it.
+ * XXX: see spec for what to do.
+ */
+ if (!af->addr_valid(&addr, NULL, NULL))
+ break;
+
+ t = sctp_assoc_lookup_paddr(asoc, &addr);
+ if (!t)
+ break;
+
+ sctp_assoc_set_primary(asoc, t);
+ break;
+
+ case SCTP_PARAM_SUPPORTED_EXT:
+ sctp_process_ext_param(asoc, param);
+ break;
+
+ case SCTP_PARAM_FWD_TSN_SUPPORT:
+ if (net->sctp.prsctp_enable) {
+ asoc->peer.prsctp_capable = 1;
+ break;
+ }
+ /* Fall Through */
+ goto fall_through;
+
+ case SCTP_PARAM_RANDOM:
+ if (!ep->auth_enable)
+ goto fall_through;
+
+ /* Save peer's random parameter */
+ asoc->peer.peer_random = kmemdup(param.p,
+ ntohs(param.p->length), gfp);
+ if (!asoc->peer.peer_random) {
+ retval = 0;
+ break;
+ }
+ break;
+
+ case SCTP_PARAM_HMAC_ALGO:
+ if (!ep->auth_enable)
+ goto fall_through;
+
+ /* Save peer's HMAC list */
+ asoc->peer.peer_hmacs = kmemdup(param.p,
+ ntohs(param.p->length), gfp);
+ if (!asoc->peer.peer_hmacs) {
+ retval = 0;
+ break;
+ }
+
+ /* Set the default HMAC the peer requested*/
+ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo);
+ break;
+
+ case SCTP_PARAM_CHUNKS:
+ if (!ep->auth_enable)
+ goto fall_through;
+
+ asoc->peer.peer_chunks = kmemdup(param.p,
+ ntohs(param.p->length), gfp);
+ if (!asoc->peer.peer_chunks)
+ retval = 0;
+ break;
+fall_through:
+ default:
+ /* Any unrecognized parameters should have been caught
+ * and handled by sctp_verify_param() which should be
+ * called prior to this routine. Simply log the error
+ * here.
+ */
+ pr_debug("%s: ignoring param:%d for association:%p.\n",
+ __func__, ntohs(param.p->type), asoc);
+ break;
+ }
+
+ return retval;
+}
+
+/* Select a new verification tag. */
+__u32 sctp_generate_tag(const struct sctp_endpoint *ep)
+{
+ /* I believe that this random number generator complies with RFC1750.
+ * A tag of 0 is reserved for special cases (e.g. INIT).
+ */
+ __u32 x;
+
+ do {
+ get_random_bytes(&x, sizeof(__u32));
+ } while (x == 0);
+
+ return x;
+}
+
+/* Select an initial TSN to send during startup. */
+__u32 sctp_generate_tsn(const struct sctp_endpoint *ep)
+{
+ __u32 retval;
+
+ get_random_bytes(&retval, sizeof(__u32));
+ return retval;
+}
+
+/*
+ * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 0xC1 | Chunk Flags | Chunk Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Serial Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Address Parameter |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF Parameter #1 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / .... /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF Parameter #N |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Address Parameter and other parameter will not be wrapped in this function
+ */
+static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
+ union sctp_addr *addr,
+ int vparam_len)
+{
+ sctp_addiphdr_t asconf;
+ struct sctp_chunk *retval;
+ int length = sizeof(asconf) + vparam_len;
+ union sctp_addr_param addrparam;
+ int addrlen;
+ struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);
+
+ addrlen = af->to_addr_param(addr, &addrparam);
+ if (!addrlen)
+ return NULL;
+ length += addrlen;
+
+ /* Create the chunk. */
+ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length);
+ if (!retval)
+ return NULL;
+
+ asconf.serial = htonl(asoc->addip_serial++);
+
+ retval->subh.addip_hdr =
+ sctp_addto_chunk(retval, sizeof(asconf), &asconf);
+ retval->param_hdr.v =
+ sctp_addto_chunk(retval, addrlen, &addrparam);
+
+ return retval;
+}
+
+/* ADDIP
+ * 3.2.1 Add IP Address
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 0xC001 | Length = Variable |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF-Request Correlation ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Address Parameter |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * 3.2.2 Delete IP Address
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 0xC002 | Length = Variable |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF-Request Correlation ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Address Parameter |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ */
+struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
+ union sctp_addr *laddr,
+ struct sockaddr *addrs,
+ int addrcnt,
+ __be16 flags)
+{
+ sctp_addip_param_t param;
+ struct sctp_chunk *retval;
+ union sctp_addr_param addr_param;
+ union sctp_addr *addr;
+ void *addr_buf;
+ struct sctp_af *af;
+ int paramlen = sizeof(param);
+ int addr_param_len = 0;
+ int totallen = 0;
+ int i;
+ int del_pickup = 0;
+
+ /* Get total length of all the address parameters. */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ addr = addr_buf;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ addr_param_len = af->to_addr_param(addr, &addr_param);
+
+ totallen += paramlen;
+ totallen += addr_param_len;
+
+ addr_buf += af->sockaddr_len;
+ if (asoc->asconf_addr_del_pending && !del_pickup) {
+ /* reuse the parameter length from the same scope one */
+ totallen += paramlen;
+ totallen += addr_param_len;
+ del_pickup = 1;
+
+ pr_debug("%s: picked same-scope del_pending addr, "
+ "totallen for all addresses is %d\n",
+ __func__, totallen);
+ }
+ }
+
+ /* Create an asconf chunk with the required length. */
+ retval = sctp_make_asconf(asoc, laddr, totallen);
+ if (!retval)
+ return NULL;
+
+ /* Add the address parameters to the asconf chunk. */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ addr = addr_buf;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ addr_param_len = af->to_addr_param(addr, &addr_param);
+ param.param_hdr.type = flags;
+ param.param_hdr.length = htons(paramlen + addr_param_len);
+ param.crr_id = i;
+
+ sctp_addto_chunk(retval, paramlen, &param);
+ sctp_addto_chunk(retval, addr_param_len, &addr_param);
+
+ addr_buf += af->sockaddr_len;
+ }
+ if (flags == SCTP_PARAM_ADD_IP && del_pickup) {
+ addr = asoc->asconf_addr_del_pending;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ addr_param_len = af->to_addr_param(addr, &addr_param);
+ param.param_hdr.type = SCTP_PARAM_DEL_IP;
+ param.param_hdr.length = htons(paramlen + addr_param_len);
+ param.crr_id = i;
+
+ sctp_addto_chunk(retval, paramlen, &param);
+ sctp_addto_chunk(retval, addr_param_len, &addr_param);
+ }
+ return retval;
+}
+
+/* ADDIP
+ * 3.2.4 Set Primary IP Address
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type =0xC004 | Length = Variable |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF-Request Correlation ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Address Parameter |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Create an ASCONF chunk with Set Primary IP address parameter.
+ */
+struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc,
+ union sctp_addr *addr)
+{
+ sctp_addip_param_t param;
+ struct sctp_chunk *retval;
+ int len = sizeof(param);
+ union sctp_addr_param addrparam;
+ int addrlen;
+ struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);
+
+ addrlen = af->to_addr_param(addr, &addrparam);
+ if (!addrlen)
+ return NULL;
+ len += addrlen;
+
+ /* Create the chunk and make asconf header. */
+ retval = sctp_make_asconf(asoc, addr, len);
+ if (!retval)
+ return NULL;
+
+ param.param_hdr.type = SCTP_PARAM_SET_PRIMARY;
+ param.param_hdr.length = htons(len);
+ param.crr_id = 0;
+
+ sctp_addto_chunk(retval, sizeof(param), &param);
+ sctp_addto_chunk(retval, addrlen, &addrparam);
+
+ return retval;
+}
+
+/* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK)
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Type = 0x80 | Chunk Flags | Chunk Length |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Serial Number |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF Parameter Response#1 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / .... /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | ASCONF Parameter Response#N |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Create an ASCONF_ACK chunk with enough space for the parameter responses.
+ */
+static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc,
+ __u32 serial, int vparam_len)
+{
+ sctp_addiphdr_t asconf;
+ struct sctp_chunk *retval;
+ int length = sizeof(asconf) + vparam_len;
+
+ /* Create the chunk. */
+ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length);
+ if (!retval)
+ return NULL;
+
+ asconf.serial = htonl(serial);
+
+ retval->subh.addip_hdr =
+ sctp_addto_chunk(retval, sizeof(asconf), &asconf);
+
+ return retval;
+}
+
+/* Add response parameters to an ASCONF_ACK chunk. */
+static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id,
+ __be16 err_code, sctp_addip_param_t *asconf_param)
+{
+ sctp_addip_param_t ack_param;
+ sctp_errhdr_t err_param;
+ int asconf_param_len = 0;
+ int err_param_len = 0;
+ __be16 response_type;
+
+ if (SCTP_ERROR_NO_ERROR == err_code) {
+ response_type = SCTP_PARAM_SUCCESS_REPORT;
+ } else {
+ response_type = SCTP_PARAM_ERR_CAUSE;
+ err_param_len = sizeof(err_param);
+ if (asconf_param)
+ asconf_param_len =
+ ntohs(asconf_param->param_hdr.length);
+ }
+
+ /* Add Success Indication or Error Cause Indication parameter. */
+ ack_param.param_hdr.type = response_type;
+ ack_param.param_hdr.length = htons(sizeof(ack_param) +
+ err_param_len +
+ asconf_param_len);
+ ack_param.crr_id = crr_id;
+ sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param);
+
+ if (SCTP_ERROR_NO_ERROR == err_code)
+ return;
+
+ /* Add Error Cause parameter. */
+ err_param.cause = err_code;
+ err_param.length = htons(err_param_len + asconf_param_len);
+ sctp_addto_chunk(chunk, err_param_len, &err_param);
+
+ /* Add the failed TLV copied from ASCONF chunk. */
+ if (asconf_param)
+ sctp_addto_chunk(chunk, asconf_param_len, asconf_param);
+}
+
+/* Process a asconf parameter. */
+static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
+ struct sctp_chunk *asconf,
+ sctp_addip_param_t *asconf_param)
+{
+ struct sctp_transport *peer;
+ struct sctp_af *af;
+ union sctp_addr addr;
+ union sctp_addr_param *addr_param;
+
+ addr_param = (void *)asconf_param + sizeof(sctp_addip_param_t);
+
+ if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP &&
+ asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP &&
+ asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY)
+ return SCTP_ERROR_UNKNOWN_PARAM;
+
+ switch (addr_param->p.type) {
+ case SCTP_PARAM_IPV6_ADDRESS:
+ if (!asoc->peer.ipv6_address)
+ return SCTP_ERROR_DNS_FAILED;
+ break;
+ case SCTP_PARAM_IPV4_ADDRESS:
+ if (!asoc->peer.ipv4_address)
+ return SCTP_ERROR_DNS_FAILED;
+ break;
+ default:
+ return SCTP_ERROR_DNS_FAILED;
+ }
+
+ af = sctp_get_af_specific(param_type2af(addr_param->p.type));
+ if (unlikely(!af))
+ return SCTP_ERROR_DNS_FAILED;
+
+ af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0);
+
+ /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast
+ * or multicast address.
+ * (note: wildcard is permitted and requires special handling so
+ * make sure we check for that)
+ */
+ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb))
+ return SCTP_ERROR_DNS_FAILED;
+
+ switch (asconf_param->param_hdr.type) {
+ case SCTP_PARAM_ADD_IP:
+ /* Section 4.2.1:
+ * If the address 0.0.0.0 or ::0 is provided, the source
+ * address of the packet MUST be added.
+ */
+ if (af->is_any(&addr))
+ memcpy(&addr, &asconf->source, sizeof(addr));
+
+ /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address
+ * request and does not have the local resources to add this
+ * new address to the association, it MUST return an Error
+ * Cause TLV set to the new error code 'Operation Refused
+ * Due to Resource Shortage'.
+ */
+
+ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED);
+ if (!peer)
+ return SCTP_ERROR_RSRC_LOW;
+
+ /* Start the heartbeat timer. */
+ if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer)))
+ sctp_transport_hold(peer);
+ asoc->new_transport = peer;
+ break;
+ case SCTP_PARAM_DEL_IP:
+ /* ADDIP 4.3 D7) If a request is received to delete the
+ * last remaining IP address of a peer endpoint, the receiver
+ * MUST send an Error Cause TLV with the error cause set to the
+ * new error code 'Request to Delete Last Remaining IP Address'.
+ */
+ if (asoc->peer.transport_count == 1)
+ return SCTP_ERROR_DEL_LAST_IP;
+
+ /* ADDIP 4.3 D8) If a request is received to delete an IP
+ * address which is also the source address of the IP packet
+ * which contained the ASCONF chunk, the receiver MUST reject
+ * this request. To reject the request the receiver MUST send
+ * an Error Cause TLV set to the new error code 'Request to
+ * Delete Source IP Address'
+ */
+ if (sctp_cmp_addr_exact(&asconf->source, &addr))
+ return SCTP_ERROR_DEL_SRC_IP;
+
+ /* Section 4.2.2
+ * If the address 0.0.0.0 or ::0 is provided, all
+ * addresses of the peer except the source address of the
+ * packet MUST be deleted.
+ */
+ if (af->is_any(&addr)) {
+ sctp_assoc_set_primary(asoc, asconf->transport);
+ sctp_assoc_del_nonprimary_peers(asoc,
+ asconf->transport);
+ } else
+ sctp_assoc_del_peer(asoc, &addr);
+ break;
+ case SCTP_PARAM_SET_PRIMARY:
+ /* ADDIP Section 4.2.4
+ * If the address 0.0.0.0 or ::0 is provided, the receiver
+ * MAY mark the source address of the packet as its
+ * primary.
+ */
+ if (af->is_any(&addr))
+ memcpy(&addr.v4, sctp_source(asconf), sizeof(addr));
+
+ peer = sctp_assoc_lookup_paddr(asoc, &addr);
+ if (!peer)
+ return SCTP_ERROR_DNS_FAILED;
+
+ sctp_assoc_set_primary(asoc, peer);
+ break;
+ }
+
+ return SCTP_ERROR_NO_ERROR;
+}
+
+/* Verify the ASCONF packet before we process it. */
+bool sctp_verify_asconf(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk, bool addr_param_needed,
+ struct sctp_paramhdr **errp)
+{
+ sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) chunk->chunk_hdr;
+ union sctp_params param;
+ bool addr_param_seen = false;
+
+ sctp_walk_params(param, addip, addip_hdr.params) {
+ size_t length = ntohs(param.p->length);
+
+ *errp = param.p;
+ switch (param.p->type) {
+ case SCTP_PARAM_ERR_CAUSE:
+ break;
+ case SCTP_PARAM_IPV4_ADDRESS:
+ if (length != sizeof(sctp_ipv4addr_param_t))
+ return false;
+ addr_param_seen = true;
+ break;
+ case SCTP_PARAM_IPV6_ADDRESS:
+ if (length != sizeof(sctp_ipv6addr_param_t))
+ return false;
+ addr_param_seen = true;
+ break;
+ case SCTP_PARAM_ADD_IP:
+ case SCTP_PARAM_DEL_IP:
+ case SCTP_PARAM_SET_PRIMARY:
+ /* In ASCONF chunks, these need to be first. */
+ if (addr_param_needed && !addr_param_seen)
+ return false;
+ length = ntohs(param.addip->param_hdr.length);
+ if (length < sizeof(sctp_addip_param_t) +
+ sizeof(sctp_paramhdr_t))
+ return false;
+ break;
+ case SCTP_PARAM_SUCCESS_REPORT:
+ case SCTP_PARAM_ADAPTATION_LAYER_IND:
+ if (length != sizeof(sctp_addip_param_t))
+ return false;
+ break;
+ default:
+ /* This is unkown to us, reject! */
+ return false;
+ }
+ }
+
+ /* Remaining sanity checks. */
+ if (addr_param_needed && !addr_param_seen)
+ return false;
+ if (!addr_param_needed && addr_param_seen)
+ return false;
+ if (param.v != chunk->chunk_end)
+ return false;
+
+ return true;
+}
+
+/* Process an incoming ASCONF chunk with the next expected serial no. and
+ * return an ASCONF_ACK chunk to be sent in response.
+ */
+struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
+ struct sctp_chunk *asconf)
+{
+ sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) asconf->chunk_hdr;
+ bool all_param_pass = true;
+ union sctp_params param;
+ sctp_addiphdr_t *hdr;
+ union sctp_addr_param *addr_param;
+ sctp_addip_param_t *asconf_param;
+ struct sctp_chunk *asconf_ack;
+ __be16 err_code;
+ int length = 0;
+ int chunk_len;
+ __u32 serial;
+
+ chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(sctp_chunkhdr_t);
+ hdr = (sctp_addiphdr_t *)asconf->skb->data;
+ serial = ntohl(hdr->serial);
+
+ /* Skip the addiphdr and store a pointer to address parameter. */
+ length = sizeof(sctp_addiphdr_t);
+ addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
+ chunk_len -= length;
+
+ /* Skip the address parameter and store a pointer to the first
+ * asconf parameter.
+ */
+ length = ntohs(addr_param->p.length);
+ asconf_param = (void *)addr_param + length;
+ chunk_len -= length;
+
+ /* create an ASCONF_ACK chunk.
+ * Based on the definitions of parameters, we know that the size of
+ * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF
+ * parameters.
+ */
+ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4);
+ if (!asconf_ack)
+ goto done;
+
+ /* Process the TLVs contained within the ASCONF chunk. */
+ sctp_walk_params(param, addip, addip_hdr.params) {
+ /* Skip preceeding address parameters. */
+ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
+ param.p->type == SCTP_PARAM_IPV6_ADDRESS)
+ continue;
+
+ err_code = sctp_process_asconf_param(asoc, asconf,
+ param.addip);
+ /* ADDIP 4.1 A7)
+ * If an error response is received for a TLV parameter,
+ * all TLVs with no response before the failed TLV are
+ * considered successful if not reported. All TLVs after
+ * the failed response are considered unsuccessful unless
+ * a specific success indication is present for the parameter.
+ */
+ if (err_code != SCTP_ERROR_NO_ERROR)
+ all_param_pass = false;
+ if (!all_param_pass)
+ sctp_add_asconf_response(asconf_ack, param.addip->crr_id,
+ err_code, param.addip);
+
+ /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add
+ * an IP address sends an 'Out of Resource' in its response, it
+ * MUST also fail any subsequent add or delete requests bundled
+ * in the ASCONF.
+ */
+ if (err_code == SCTP_ERROR_RSRC_LOW)
+ goto done;
+ }
+done:
+ asoc->peer.addip_serial++;
+
+ /* If we are sending a new ASCONF_ACK hold a reference to it in assoc
+ * after freeing the reference to old asconf ack if any.
+ */
+ if (asconf_ack) {
+ sctp_chunk_hold(asconf_ack);
+ list_add_tail(&asconf_ack->transmitted_list,
+ &asoc->asconf_ack_list);
+ }
+
+ return asconf_ack;
+}
+
+/* Process a asconf parameter that is successfully acked. */
+static void sctp_asconf_param_success(struct sctp_association *asoc,
+ sctp_addip_param_t *asconf_param)
+{
+ struct sctp_af *af;
+ union sctp_addr addr;
+ struct sctp_bind_addr *bp = &asoc->base.bind_addr;
+ union sctp_addr_param *addr_param;
+ struct sctp_transport *transport;
+ struct sctp_sockaddr_entry *saddr;
+
+ addr_param = (void *)asconf_param + sizeof(sctp_addip_param_t);
+
+ /* We have checked the packet before, so we do not check again. */
+ af = sctp_get_af_specific(param_type2af(addr_param->p.type));
+ af->from_addr_param(&addr, addr_param, htons(bp->port), 0);
+
+ switch (asconf_param->param_hdr.type) {
+ case SCTP_PARAM_ADD_IP:
+ /* This is always done in BH context with a socket lock
+ * held, so the list can not change.
+ */
+ local_bh_disable();
+ list_for_each_entry(saddr, &bp->address_list, list) {
+ if (sctp_cmp_addr_exact(&saddr->a, &addr))
+ saddr->state = SCTP_ADDR_SRC;
+ }
+ local_bh_enable();
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+ dst_release(transport->dst);
+ transport->dst = NULL;
+ }
+ break;
+ case SCTP_PARAM_DEL_IP:
+ local_bh_disable();
+ sctp_del_bind_addr(bp, &addr);
+ if (asoc->asconf_addr_del_pending != NULL &&
+ sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) {
+ kfree(asoc->asconf_addr_del_pending);
+ asoc->asconf_addr_del_pending = NULL;
+ }
+ local_bh_enable();
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+ dst_release(transport->dst);
+ transport->dst = NULL;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+/* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk
+ * for the given asconf parameter. If there is no response for this parameter,
+ * return the error code based on the third argument 'no_err'.
+ * ADDIP 4.1
+ * A7) If an error response is received for a TLV parameter, all TLVs with no
+ * response before the failed TLV are considered successful if not reported.
+ * All TLVs after the failed response are considered unsuccessful unless a
+ * specific success indication is present for the parameter.
+ */
+static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack,
+ sctp_addip_param_t *asconf_param,
+ int no_err)
+{
+ sctp_addip_param_t *asconf_ack_param;
+ sctp_errhdr_t *err_param;
+ int length;
+ int asconf_ack_len;
+ __be16 err_code;
+
+ if (no_err)
+ err_code = SCTP_ERROR_NO_ERROR;
+ else
+ err_code = SCTP_ERROR_REQ_REFUSED;
+
+ asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t);
+
+ /* Skip the addiphdr from the asconf_ack chunk and store a pointer to
+ * the first asconf_ack parameter.
+ */
+ length = sizeof(sctp_addiphdr_t);
+ asconf_ack_param = (sctp_addip_param_t *)(asconf_ack->skb->data +
+ length);
+ asconf_ack_len -= length;
+
+ while (asconf_ack_len > 0) {
+ if (asconf_ack_param->crr_id == asconf_param->crr_id) {
+ switch (asconf_ack_param->param_hdr.type) {
+ case SCTP_PARAM_SUCCESS_REPORT:
+ return SCTP_ERROR_NO_ERROR;
+ case SCTP_PARAM_ERR_CAUSE:
+ length = sizeof(sctp_addip_param_t);
+ err_param = (void *)asconf_ack_param + length;
+ asconf_ack_len -= length;
+ if (asconf_ack_len > 0)
+ return err_param->cause;
+ else
+ return SCTP_ERROR_INV_PARAM;
+ break;
+ default:
+ return SCTP_ERROR_INV_PARAM;
+ }
+ }
+
+ length = ntohs(asconf_ack_param->param_hdr.length);
+ asconf_ack_param = (void *)asconf_ack_param + length;
+ asconf_ack_len -= length;
+ }
+
+ return err_code;
+}
+
+/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. */
+int sctp_process_asconf_ack(struct sctp_association *asoc,
+ struct sctp_chunk *asconf_ack)
+{
+ struct sctp_chunk *asconf = asoc->addip_last_asconf;
+ union sctp_addr_param *addr_param;
+ sctp_addip_param_t *asconf_param;
+ int length = 0;
+ int asconf_len = asconf->skb->len;
+ int all_param_pass = 0;
+ int no_err = 1;
+ int retval = 0;
+ __be16 err_code = SCTP_ERROR_NO_ERROR;
+
+ /* Skip the chunkhdr and addiphdr from the last asconf sent and store
+ * a pointer to address parameter.
+ */
+ length = sizeof(sctp_addip_chunk_t);
+ addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
+ asconf_len -= length;
+
+ /* Skip the address parameter in the last asconf sent and store a
+ * pointer to the first asconf parameter.
+ */
+ length = ntohs(addr_param->p.length);
+ asconf_param = (void *)addr_param + length;
+ asconf_len -= length;
+
+ /* ADDIP 4.1
+ * A8) If there is no response(s) to specific TLV parameter(s), and no
+ * failures are indicated, then all request(s) are considered
+ * successful.
+ */
+ if (asconf_ack->skb->len == sizeof(sctp_addiphdr_t))
+ all_param_pass = 1;
+
+ /* Process the TLVs contained in the last sent ASCONF chunk. */
+ while (asconf_len > 0) {
+ if (all_param_pass)
+ err_code = SCTP_ERROR_NO_ERROR;
+ else {
+ err_code = sctp_get_asconf_response(asconf_ack,
+ asconf_param,
+ no_err);
+ if (no_err && (SCTP_ERROR_NO_ERROR != err_code))
+ no_err = 0;
+ }
+
+ switch (err_code) {
+ case SCTP_ERROR_NO_ERROR:
+ sctp_asconf_param_success(asoc, asconf_param);
+ break;
+
+ case SCTP_ERROR_RSRC_LOW:
+ retval = 1;
+ break;
+
+ case SCTP_ERROR_UNKNOWN_PARAM:
+ /* Disable sending this type of asconf parameter in
+ * future.
+ */
+ asoc->peer.addip_disabled_mask |=
+ asconf_param->param_hdr.type;
+ break;
+
+ case SCTP_ERROR_REQ_REFUSED:
+ case SCTP_ERROR_DEL_LAST_IP:
+ case SCTP_ERROR_DEL_SRC_IP:
+ default:
+ break;
+ }
+
+ /* Skip the processed asconf parameter and move to the next
+ * one.
+ */
+ length = ntohs(asconf_param->param_hdr.length);
+ asconf_param = (void *)asconf_param + length;
+ asconf_len -= length;
+ }
+
+ if (no_err && asoc->src_out_of_asoc_ok) {
+ asoc->src_out_of_asoc_ok = 0;
+ sctp_transport_immediate_rtx(asoc->peer.primary_path);
+ }
+
+ /* Free the cached last sent asconf chunk. */
+ list_del_init(&asconf->transmitted_list);
+ sctp_chunk_free(asconf);
+ asoc->addip_last_asconf = NULL;
+
+ return retval;
+}
+
+/* Make a FWD TSN chunk. */
+struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
+ __u32 new_cum_tsn, size_t nstreams,
+ struct sctp_fwdtsn_skip *skiplist)
+{
+ struct sctp_chunk *retval = NULL;
+ struct sctp_fwdtsn_hdr ftsn_hdr;
+ struct sctp_fwdtsn_skip skip;
+ size_t hint;
+ int i;
+
+ hint = (nstreams + 1) * sizeof(__u32);
+
+ retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint);
+
+ if (!retval)
+ return NULL;
+
+ ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
+ retval->subh.fwdtsn_hdr =
+ sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);
+
+ for (i = 0; i < nstreams; i++) {
+ skip.stream = skiplist[i].stream;
+ skip.ssn = skiplist[i].ssn;
+ sctp_addto_chunk(retval, sizeof(skip), &skip);
+ }
+
+ return retval;
+}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
new file mode 100644
index 000000000..fef2acdf4
--- /dev/null
+++ b/net/sctp/sm_sideeffect.c
@@ -0,0 +1,1751 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions work with the state functions in sctp_sm_statefuns.c
+ * to implement that state operations. These functions implement the
+ * steps which require modifying existing data structures.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@austin.ibm.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Dajiang Zhang <dajiang.zhang@nokia.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static int sctp_cmd_interpreter(sctp_event_t event_type,
+ sctp_subtype_t subtype,
+ sctp_state_t state,
+ struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ void *event_arg,
+ sctp_disposition_t status,
+ sctp_cmd_seq_t *commands,
+ gfp_t gfp);
+static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
+ sctp_state_t state,
+ struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ void *event_arg,
+ sctp_disposition_t status,
+ sctp_cmd_seq_t *commands,
+ gfp_t gfp);
+
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+ struct sctp_transport *t);
+/********************************************************************
+ * Helper functions
+ ********************************************************************/
+
+/* A helper function for delayed processing of INET ECN CE bit. */
+static void sctp_do_ecn_ce_work(struct sctp_association *asoc,
+ __u32 lowest_tsn)
+{
+ /* Save the TSN away for comparison when we receive CWR */
+
+ asoc->last_ecne_tsn = lowest_tsn;
+ asoc->need_ecne = 1;
+}
+
+/* Helper function for delayed processing of SCTP ECNE chunk. */
+/* RFC 2960 Appendix A
+ *
+ * RFC 2481 details a specific bit for a sender to send in
+ * the header of its next outbound TCP segment to indicate to
+ * its peer that it has reduced its congestion window. This
+ * is termed the CWR bit. For SCTP the same indication is made
+ * by including the CWR chunk. This chunk contains one data
+ * element, i.e. the TSN number that was sent in the ECNE chunk.
+ * This element represents the lowest TSN number in the datagram
+ * that was originally marked with the CE bit.
+ */
+static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc,
+ __u32 lowest_tsn,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_chunk *repl;
+
+ /* Our previously transmitted packet ran into some congestion
+ * so we should take action by reducing cwnd and ssthresh
+ * and then ACK our peer that we we've done so by
+ * sending a CWR.
+ */
+
+ /* First, try to determine if we want to actually lower
+ * our cwnd variables. Only lower them if the ECNE looks more
+ * recent than the last response.
+ */
+ if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) {
+ struct sctp_transport *transport;
+
+ /* Find which transport's congestion variables
+ * need to be adjusted.
+ */
+ transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn);
+
+ /* Update the congestion variables. */
+ if (transport)
+ sctp_transport_lower_cwnd(transport,
+ SCTP_LOWER_CWND_ECNE);
+ asoc->last_cwr_tsn = lowest_tsn;
+ }
+
+ /* Always try to quiet the other end. In case of lost CWR,
+ * resend last_cwr_tsn.
+ */
+ repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk);
+
+ /* If we run out of memory, it will look like a lost CWR. We'll
+ * get back in sync eventually.
+ */
+ return repl;
+}
+
+/* Helper function to do delayed processing of ECN CWR chunk. */
+static void sctp_do_ecn_cwr_work(struct sctp_association *asoc,
+ __u32 lowest_tsn)
+{
+ /* Turn off ECNE getting auto-prepended to every outgoing
+ * packet
+ */
+ asoc->need_ecne = 0;
+}
+
+/* Generate SACK if necessary. We call this at the end of a packet. */
+static int sctp_gen_sack(struct sctp_association *asoc, int force,
+ sctp_cmd_seq_t *commands)
+{
+ __u32 ctsn, max_tsn_seen;
+ struct sctp_chunk *sack;
+ struct sctp_transport *trans = asoc->peer.last_data_from;
+ int error = 0;
+
+ if (force ||
+ (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
+ (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
+ asoc->peer.sack_needed = 1;
+
+ ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+ max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
+
+ /* From 12.2 Parameters necessary per association (i.e. the TCB):
+ *
+ * Ack State : This flag indicates if the next received packet
+ * : is to be responded to with a SACK. ...
+ * : When DATA chunks are out of order, SACK's
+ * : are not delayed (see Section 6).
+ *
+ * [This is actually not mentioned in Section 6, but we
+ * implement it here anyway. --piggy]
+ */
+ if (max_tsn_seen != ctsn)
+ asoc->peer.sack_needed = 1;
+
+ /* From 6.2 Acknowledgement on Reception of DATA Chunks:
+ *
+ * Section 4.2 of [RFC2581] SHOULD be followed. Specifically,
+ * an acknowledgement SHOULD be generated for at least every
+ * second packet (not every second DATA chunk) received, and
+ * SHOULD be generated within 200 ms of the arrival of any
+ * unacknowledged DATA chunk. ...
+ */
+ if (!asoc->peer.sack_needed) {
+ asoc->peer.sack_cnt++;
+
+ /* Set the SACK delay timeout based on the
+ * SACK delay for the last transport
+ * data was received from, or the default
+ * for the association.
+ */
+ if (trans) {
+ /* We will need a SACK for the next packet. */
+ if (asoc->peer.sack_cnt >= trans->sackfreq - 1)
+ asoc->peer.sack_needed = 1;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ trans->sackdelay;
+ } else {
+ /* We will need a SACK for the next packet. */
+ if (asoc->peer.sack_cnt >= asoc->sackfreq - 1)
+ asoc->peer.sack_needed = 1;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ asoc->sackdelay;
+ }
+
+ /* Restart the SACK timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+ } else {
+ asoc->a_rwnd = asoc->rwnd;
+ sack = sctp_make_sack(asoc);
+ if (!sack)
+ goto nomem;
+
+ asoc->peer.sack_needed = 0;
+ asoc->peer.sack_cnt = 0;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack));
+
+ /* Stop the SACK timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+ }
+
+ return error;
+nomem:
+ error = -ENOMEM;
+ return error;
+}
+
+/* When the T3-RTX timer expires, it calls this function to create the
+ * relevant state machine event.
+ */
+void sctp_generate_t3_rtx_event(unsigned long peer)
+{
+ int error;
+ struct sctp_transport *transport = (struct sctp_transport *) peer;
+ struct sctp_association *asoc = transport->asoc;
+ struct net *net = sock_net(asoc->base.sk);
+
+ /* Check whether a task is in the sock. */
+
+ bh_lock_sock(asoc->base.sk);
+ if (sock_owned_by_user(asoc->base.sk)) {
+ pr_debug("%s: sock is busy\n", __func__);
+
+ /* Try again later. */
+ if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20)))
+ sctp_transport_hold(transport);
+ goto out_unlock;
+ }
+
+ /* Is this transport really dead and just waiting around for
+ * the timer to let go of the reference?
+ */
+ if (transport->dead)
+ goto out_unlock;
+
+ /* Run through the state machine. */
+ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+ SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX),
+ asoc->state,
+ asoc->ep, asoc,
+ transport, GFP_ATOMIC);
+
+ if (error)
+ asoc->base.sk->sk_err = -error;
+
+out_unlock:
+ bh_unlock_sock(asoc->base.sk);
+ sctp_transport_put(transport);
+}
+
+/* This is a sa interface for producing timeout events. It works
+ * for timeouts which use the association as their parameter.
+ */
+static void sctp_generate_timeout_event(struct sctp_association *asoc,
+ sctp_event_timeout_t timeout_type)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ int error = 0;
+
+ bh_lock_sock(asoc->base.sk);
+ if (sock_owned_by_user(asoc->base.sk)) {
+ pr_debug("%s: sock is busy: timer %d\n", __func__,
+ timeout_type);
+
+ /* Try again later. */
+ if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20)))
+ sctp_association_hold(asoc);
+ goto out_unlock;
+ }
+
+ /* Is this association really dead and just waiting around for
+ * the timer to let go of the reference?
+ */
+ if (asoc->base.dead)
+ goto out_unlock;
+
+ /* Run through the state machine. */
+ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+ SCTP_ST_TIMEOUT(timeout_type),
+ asoc->state, asoc->ep, asoc,
+ (void *)timeout_type, GFP_ATOMIC);
+
+ if (error)
+ asoc->base.sk->sk_err = -error;
+
+out_unlock:
+ bh_unlock_sock(asoc->base.sk);
+ sctp_association_put(asoc);
+}
+
+static void sctp_generate_t1_cookie_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE);
+}
+
+static void sctp_generate_t1_init_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT);
+}
+
+static void sctp_generate_t2_shutdown_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN);
+}
+
+static void sctp_generate_t4_rto_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO);
+}
+
+static void sctp_generate_t5_shutdown_guard_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *)data;
+ sctp_generate_timeout_event(asoc,
+ SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD);
+
+} /* sctp_generate_t5_shutdown_guard_event() */
+
+static void sctp_generate_autoclose_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE);
+}
+
+/* Generate a heart beat event. If the sock is busy, reschedule. Make
+ * sure that the transport is still valid.
+ */
+void sctp_generate_heartbeat_event(unsigned long data)
+{
+ int error = 0;
+ struct sctp_transport *transport = (struct sctp_transport *) data;
+ struct sctp_association *asoc = transport->asoc;
+ struct net *net = sock_net(asoc->base.sk);
+
+ bh_lock_sock(asoc->base.sk);
+ if (sock_owned_by_user(asoc->base.sk)) {
+ pr_debug("%s: sock is busy\n", __func__);
+
+ /* Try again later. */
+ if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20)))
+ sctp_transport_hold(transport);
+ goto out_unlock;
+ }
+
+ /* Is this structure just waiting around for us to actually
+ * get destroyed?
+ */
+ if (transport->dead)
+ goto out_unlock;
+
+ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
+ SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
+ asoc->state, asoc->ep, asoc,
+ transport, GFP_ATOMIC);
+
+ if (error)
+ asoc->base.sk->sk_err = -error;
+
+out_unlock:
+ bh_unlock_sock(asoc->base.sk);
+ sctp_transport_put(transport);
+}
+
+/* Handle the timeout of the ICMP protocol unreachable timer. Trigger
+ * the correct state machine transition that will close the association.
+ */
+void sctp_generate_proto_unreach_event(unsigned long data)
+{
+ struct sctp_transport *transport = (struct sctp_transport *) data;
+ struct sctp_association *asoc = transport->asoc;
+ struct net *net = sock_net(asoc->base.sk);
+
+ bh_lock_sock(asoc->base.sk);
+ if (sock_owned_by_user(asoc->base.sk)) {
+ pr_debug("%s: sock is busy\n", __func__);
+
+ /* Try again later. */
+ if (!mod_timer(&transport->proto_unreach_timer,
+ jiffies + (HZ/20)))
+ sctp_association_hold(asoc);
+ goto out_unlock;
+ }
+
+ /* Is this structure just waiting around for us to actually
+ * get destroyed?
+ */
+ if (asoc->base.dead)
+ goto out_unlock;
+
+ sctp_do_sm(net, SCTP_EVENT_T_OTHER,
+ SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+ asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);
+
+out_unlock:
+ bh_unlock_sock(asoc->base.sk);
+ sctp_association_put(asoc);
+}
+
+
+/* Inject a SACK Timeout event into the state machine. */
+static void sctp_generate_sack_event(unsigned long data)
+{
+ struct sctp_association *asoc = (struct sctp_association *) data;
+ sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK);
+}
+
+sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
+ NULL,
+ sctp_generate_t1_cookie_event,
+ sctp_generate_t1_init_event,
+ sctp_generate_t2_shutdown_event,
+ NULL,
+ sctp_generate_t4_rto_event,
+ sctp_generate_t5_shutdown_guard_event,
+ NULL,
+ sctp_generate_sack_event,
+ sctp_generate_autoclose_event,
+};
+
+
+/* RFC 2960 8.2 Path Failure Detection
+ *
+ * When its peer endpoint is multi-homed, an endpoint should keep a
+ * error counter for each of the destination transport addresses of the
+ * peer endpoint.
+ *
+ * Each time the T3-rtx timer expires on any address, or when a
+ * HEARTBEAT sent to an idle address is not acknowledged within a RTO,
+ * the error counter of that destination address will be incremented.
+ * When the value in the error counter exceeds the protocol parameter
+ * 'Path.Max.Retrans' of that destination address, the endpoint should
+ * mark the destination transport address as inactive, and a
+ * notification SHOULD be sent to the upper layer.
+ *
+ */
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ int is_hb)
+{
+ /* The check for association's overall error counter exceeding the
+ * threshold is done in the state function.
+ */
+ /* We are here due to a timer expiration. If the timer was
+ * not a HEARTBEAT, then normal error tracking is done.
+ * If the timer was a heartbeat, we only increment error counts
+ * when we already have an outstanding HEARTBEAT that has not
+ * been acknowledged.
+ * Additionally, some tranport states inhibit error increments.
+ */
+ if (!is_hb) {
+ asoc->overall_error_count++;
+ if (transport->state != SCTP_INACTIVE)
+ transport->error_count++;
+ } else if (transport->hb_sent) {
+ if (transport->state != SCTP_UNCONFIRMED)
+ asoc->overall_error_count++;
+ if (transport->state != SCTP_INACTIVE)
+ transport->error_count++;
+ }
+
+ /* If the transport error count is greater than the pf_retrans
+ * threshold, and less than pathmaxrtx, and if the current state
+ * is SCTP_ACTIVE, then mark this transport as Partially Failed,
+ * see SCTP Quick Failover Draft, section 5.1
+ */
+ if ((transport->state == SCTP_ACTIVE) &&
+ (asoc->pf_retrans < transport->pathmaxrxt) &&
+ (transport->error_count > asoc->pf_retrans)) {
+
+ sctp_assoc_control_transport(asoc, transport,
+ SCTP_TRANSPORT_PF,
+ 0);
+
+ /* Update the hb timer to resend a heartbeat every rto */
+ sctp_cmd_hb_timer_update(commands, transport);
+ }
+
+ if (transport->state != SCTP_INACTIVE &&
+ (transport->error_count > transport->pathmaxrxt)) {
+ pr_debug("%s: association:%p transport addr:%pISpc failed\n",
+ __func__, asoc, &transport->ipaddr.sa);
+
+ sctp_assoc_control_transport(asoc, transport,
+ SCTP_TRANSPORT_DOWN,
+ SCTP_FAILED_THRESHOLD);
+ }
+
+ /* E2) For the destination address for which the timer
+ * expires, set RTO <- RTO * 2 ("back off the timer"). The
+ * maximum value discussed in rule C7 above (RTO.max) may be
+ * used to provide an upper bound to this doubling operation.
+ *
+ * Special Case: the first HB doesn't trigger exponential backoff.
+ * The first unacknowledged HB triggers it. We do this with a flag
+ * that indicates that we have an outstanding HB.
+ */
+ if (!is_hb || transport->hb_sent) {
+ transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
+ sctp_max_rto(asoc, transport);
+ }
+}
+
+/* Worker routine to handle INIT command failure. */
+static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc,
+ unsigned int error)
+{
+ struct sctp_ulpevent *event;
+
+ event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC,
+ (__u16)error, 0, 0, NULL,
+ GFP_ATOMIC);
+
+ if (event)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(event));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+
+ /* SEND_FAILED sent later when cleaning up the association. */
+ asoc->outqueue.error = error;
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+}
+
+/* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */
+static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc,
+ sctp_event_t event_type,
+ sctp_subtype_t subtype,
+ struct sctp_chunk *chunk,
+ unsigned int error)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_chunk *abort;
+ /* Cancel any partial delivery in progress. */
+ sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+
+ if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
+ event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
+ (__u16)error, 0, 0, chunk,
+ GFP_ATOMIC);
+ else
+ event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
+ (__u16)error, 0, 0, NULL,
+ GFP_ATOMIC);
+ if (event)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(event));
+
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ abort = sctp_make_violation_max_retrans(asoc, chunk);
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(abort));
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+
+ /* SEND_FAILED sent later when cleaning up the association. */
+ asoc->outqueue.error = error;
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+}
+
+/* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT
+ * inside the cookie. In reality, this is only used for INIT-ACK processing
+ * since all other cases use "temporary" associations and can do all
+ * their work in statefuns directly.
+ */
+static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_init_chunk_t *peer_init,
+ gfp_t gfp)
+{
+ int error;
+
+ /* We only process the init as a sideeffect in a single
+ * case. This is when we process the INIT-ACK. If we
+ * fail during INIT processing (due to malloc problems),
+ * just return the error and stop processing the stack.
+ */
+ if (!sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp))
+ error = -ENOMEM;
+ else
+ error = 0;
+
+ return error;
+}
+
+/* Helper function to break out starting up of heartbeat timers. */
+static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+
+ /* Start a heartbeat timer for each transport on the association.
+ * hold a reference on the transport to make sure none of
+ * the needed data structures go away.
+ */
+ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
+
+ if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
+ sctp_transport_hold(t);
+ }
+}
+
+static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+
+ /* Stop all heartbeat timers. */
+
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ if (del_timer(&t->hb_timer))
+ sctp_transport_put(t);
+ }
+}
+
+/* Helper function to stop any pending T3-RTX timers */
+static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ if (del_timer(&t->T3_rtx_timer))
+ sctp_transport_put(t);
+ }
+}
+
+
+/* Helper function to update the heartbeat timer. */
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+ struct sctp_transport *t)
+{
+ /* Update the heartbeat timer. */
+ if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
+ sctp_transport_hold(t);
+}
+
+/* Helper function to handle the reception of an HEARTBEAT ACK. */
+static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ struct sctp_transport *t,
+ struct sctp_chunk *chunk)
+{
+ sctp_sender_hb_info_t *hbinfo;
+ int was_unconfirmed = 0;
+
+ /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the
+ * HEARTBEAT should clear the error counter of the destination
+ * transport address to which the HEARTBEAT was sent.
+ */
+ t->error_count = 0;
+
+ /*
+ * Although RFC4960 specifies that the overall error count must
+ * be cleared when a HEARTBEAT ACK is received, we make an
+ * exception while in SHUTDOWN PENDING. If the peer keeps its
+ * window shut forever, we may never be able to transmit our
+ * outstanding data and rely on the retransmission limit be reached
+ * to shutdown the association.
+ */
+ if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING)
+ t->asoc->overall_error_count = 0;
+
+ /* Clear the hb_sent flag to signal that we had a good
+ * acknowledgement.
+ */
+ t->hb_sent = 0;
+
+ /* Mark the destination transport address as active if it is not so
+ * marked.
+ */
+ if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) {
+ was_unconfirmed = 1;
+ sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+ SCTP_HEARTBEAT_SUCCESS);
+ }
+
+ if (t->state == SCTP_PF)
+ sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+ SCTP_HEARTBEAT_SUCCESS);
+
+ /* HB-ACK was received for a the proper HB. Consider this
+ * forward progress.
+ */
+ if (t->dst)
+ dst_confirm(t->dst);
+
+ /* The receiver of the HEARTBEAT ACK should also perform an
+ * RTT measurement for that destination transport address
+ * using the time value carried in the HEARTBEAT ACK chunk.
+ * If the transport's rto_pending variable has been cleared,
+ * it was most likely due to a retransmit. However, we want
+ * to re-enable it to properly update the rto.
+ */
+ if (t->rto_pending == 0)
+ t->rto_pending = 1;
+
+ hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data;
+ sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at));
+
+ /* Update the heartbeat timer. */
+ if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
+ sctp_transport_hold(t);
+
+ if (was_unconfirmed && asoc->peer.transport_count == 1)
+ sctp_transport_immediate_rtx(t);
+}
+
+
+/* Helper function to process the process SACK command. */
+static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ int err = 0;
+
+ if (sctp_outq_sack(&asoc->outqueue, chunk)) {
+ struct net *net = sock_net(asoc->base.sk);
+
+ /* There are no more TSNs awaiting SACK. */
+ err = sctp_do_sm(net, SCTP_EVENT_T_OTHER,
+ SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN),
+ asoc->state, asoc->ep, asoc, NULL,
+ GFP_ATOMIC);
+ }
+
+ return err;
+}
+
+/* Helper function to set the timeout value for T2-SHUTDOWN timer and to set
+ * the transport for a shutdown chunk.
+ */
+static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_transport *t;
+
+ if (chunk->transport)
+ t = chunk->transport;
+ else {
+ t = sctp_assoc_choose_alter_transport(asoc,
+ asoc->shutdown_last_sent_to);
+ chunk->transport = t;
+ }
+ asoc->shutdown_last_sent_to = t;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto;
+}
+
+/* Helper function to change the state of an association. */
+static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ sctp_state_t state)
+{
+ struct sock *sk = asoc->base.sk;
+
+ asoc->state = state;
+
+ pr_debug("%s: asoc:%p[%s]\n", __func__, asoc, sctp_state_tbl[state]);
+
+ if (sctp_style(sk, TCP)) {
+ /* Change the sk->sk_state of a TCP-style socket that has
+ * successfully completed a connect() call.
+ */
+ if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
+ sk->sk_state = SCTP_SS_ESTABLISHED;
+
+ /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */
+ if (sctp_state(asoc, SHUTDOWN_RECEIVED) &&
+ sctp_sstate(sk, ESTABLISHED))
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ }
+
+ if (sctp_state(asoc, COOKIE_WAIT)) {
+ /* Reset init timeouts since they may have been
+ * increased due to timer expirations.
+ */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+ asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
+ asoc->rto_initial;
+ }
+
+ if (sctp_state(asoc, ESTABLISHED) ||
+ sctp_state(asoc, CLOSED) ||
+ sctp_state(asoc, SHUTDOWN_RECEIVED)) {
+ /* Wake up any processes waiting in the asoc's wait queue in
+ * sctp_wait_for_connect() or sctp_wait_for_sndbuf().
+ */
+ if (waitqueue_active(&asoc->wait))
+ wake_up_interruptible(&asoc->wait);
+
+ /* Wake up any processes waiting in the sk's sleep queue of
+ * a TCP-style or UDP-style peeled-off socket in
+ * sctp_wait_for_accept() or sctp_wait_for_packet().
+ * For a UDP-style socket, the waiters are woken up by the
+ * notifications.
+ */
+ if (!sctp_style(sk, UDP))
+ sk->sk_state_change(sk);
+ }
+}
+
+/* Helper function to delete an association. */
+static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc)
+{
+ struct sock *sk = asoc->base.sk;
+
+ /* If it is a non-temporary association belonging to a TCP-style
+ * listening socket that is not closed, do not free it so that accept()
+ * can pick it up later.
+ */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) &&
+ (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK))
+ return;
+
+ sctp_unhash_established(asoc);
+ sctp_association_free(asoc);
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF Chunk Procedures
+ * A4) Start a T-4 RTO timer, using the RTO value of the selected
+ * destination address (we use active path instead of primary path just
+ * because primary path may be inactive.
+ */
+static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_transport *t;
+
+ t = sctp_assoc_choose_alter_transport(asoc, chunk->transport);
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto;
+ chunk->transport = t;
+}
+
+/* Process an incoming Operation Error Chunk. */
+static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
+ struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_errhdr *err_hdr;
+ struct sctp_ulpevent *ev;
+
+ while (chunk->chunk_end > chunk->skb->data) {
+ err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
+
+ ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
+ GFP_ATOMIC);
+ if (!ev)
+ return;
+
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+
+ switch (err_hdr->cause) {
+ case SCTP_ERROR_UNKNOWN_CHUNK:
+ {
+ sctp_chunkhdr_t *unk_chunk_hdr;
+
+ unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
+ switch (unk_chunk_hdr->type) {
+ /* ADDIP 4.1 A9) If the peer responds to an ASCONF with
+ * an ERROR chunk reporting that it did not recognized
+ * the ASCONF chunk type, the sender of the ASCONF MUST
+ * NOT send any further ASCONF chunks and MUST stop its
+ * T-4 timer.
+ */
+ case SCTP_CID_ASCONF:
+ if (asoc->peer.asconf_capable == 0)
+ break;
+
+ asoc->peer.asconf_capable = 0;
+ sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/* Process variable FWDTSN chunk information. */
+static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_fwdtsn_skip *skip;
+ /* Walk through all the skipped SSNs */
+ sctp_walk_fwdtsn(skip, chunk) {
+ sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
+ }
+}
+
+/* Helper function to remove the association non-primary peer
+ * transports.
+ */
+static void sctp_cmd_del_non_primary(struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+ struct list_head *pos;
+ struct list_head *temp;
+
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ t = list_entry(pos, struct sctp_transport, transports);
+ if (!sctp_cmp_addr_exact(&t->ipaddr,
+ &asoc->peer.primary_addr)) {
+ sctp_assoc_del_peer(asoc, &t->ipaddr);
+ }
+ }
+}
+
+/* Helper function to set sk_err on a 1-1 style socket. */
+static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error)
+{
+ struct sock *sk = asoc->base.sk;
+
+ if (!sctp_style(sk, UDP))
+ sk->sk_err = error;
+}
+
+/* Helper function to generate an association change event */
+static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc,
+ u8 state)
+{
+ struct sctp_ulpevent *ev;
+
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0,
+ asoc->c.sinit_num_ostreams,
+ asoc->c.sinit_max_instreams,
+ NULL, GFP_ATOMIC);
+ if (ev)
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+}
+
+/* Helper function to generate an adaptation indication event */
+static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands,
+ struct sctp_association *asoc)
+{
+ struct sctp_ulpevent *ev;
+
+ ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
+
+ if (ev)
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+}
+
+
+static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
+ sctp_event_timeout_t timer,
+ char *name)
+{
+ struct sctp_transport *t;
+
+ t = asoc->init_last_sent_to;
+ asoc->init_err_counter++;
+
+ if (t->init_sent_count > (asoc->init_cycle + 1)) {
+ asoc->timeouts[timer] *= 2;
+ if (asoc->timeouts[timer] > asoc->max_init_timeo) {
+ asoc->timeouts[timer] = asoc->max_init_timeo;
+ }
+ asoc->init_cycle++;
+
+ pr_debug("%s: T1[%s] timeout adjustment init_err_counter:%d"
+ " cycle:%d timeout:%ld\n", __func__, name,
+ asoc->init_err_counter, asoc->init_cycle,
+ asoc->timeouts[timer]);
+ }
+
+}
+
+/* Send the whole message, chunk by chunk, to the outqueue.
+ * This way the whole message is queued up and bundling if
+ * encouraged for small fragments.
+ */
+static int sctp_cmd_send_msg(struct sctp_association *asoc,
+ struct sctp_datamsg *msg)
+{
+ struct sctp_chunk *chunk;
+ int error = 0;
+
+ list_for_each_entry(chunk, &msg->chunks, frag_list) {
+ error = sctp_outq_tail(&asoc->outqueue, chunk);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+
+/* Sent the next ASCONF packet currently stored in the association.
+ * This happens after the ASCONF_ACK was succeffully processed.
+ */
+static void sctp_cmd_send_asconf(struct sctp_association *asoc)
+{
+ struct net *net = sock_net(asoc->base.sk);
+
+ /* Send the next asconf chunk from the addip chunk
+ * queue.
+ */
+ if (!list_empty(&asoc->addip_chunk_list)) {
+ struct list_head *entry = asoc->addip_chunk_list.next;
+ struct sctp_chunk *asconf = list_entry(entry,
+ struct sctp_chunk, list);
+ list_del_init(entry);
+
+ /* Hold the chunk until an ASCONF_ACK is received. */
+ sctp_chunk_hold(asconf);
+ if (sctp_primitive_ASCONF(net, asoc, asconf))
+ sctp_chunk_free(asconf);
+ else
+ asoc->addip_last_asconf = asconf;
+ }
+}
+
+
+/* These three macros allow us to pull the debugging code out of the
+ * main flow of sctp_do_sm() to keep attention focused on the real
+ * functionality there.
+ */
+#define debug_pre_sfn() \
+ pr_debug("%s[pre-fn]: ep:%p, %s, %s, asoc:%p[%s], %s\n", __func__, \
+ ep, sctp_evttype_tbl[event_type], (*debug_fn)(subtype), \
+ asoc, sctp_state_tbl[state], state_fn->name)
+
+#define debug_post_sfn() \
+ pr_debug("%s[post-fn]: asoc:%p, status:%s\n", __func__, asoc, \
+ sctp_status_tbl[status])
+
+#define debug_post_sfx() \
+ pr_debug("%s[post-sfx]: error:%d, asoc:%p[%s]\n", __func__, error, \
+ asoc, sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \
+ sctp_assoc2id(asoc))) ? asoc->state : SCTP_STATE_CLOSED])
+
+/*
+ * This is the master state machine processing function.
+ *
+ * If you want to understand all of lksctp, this is a
+ * good place to start.
+ */
+int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype,
+ sctp_state_t state,
+ struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ void *event_arg,
+ gfp_t gfp)
+{
+ sctp_cmd_seq_t commands;
+ const sctp_sm_table_entry_t *state_fn;
+ sctp_disposition_t status;
+ int error = 0;
+ typedef const char *(printfn_t)(sctp_subtype_t);
+ static printfn_t *table[] = {
+ NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname,
+ };
+ printfn_t *debug_fn __attribute__ ((unused)) = table[event_type];
+
+ /* Look up the state function, run it, and then process the
+ * side effects. These three steps are the heart of lksctp.
+ */
+ state_fn = sctp_sm_lookup_event(net, event_type, state, subtype);
+
+ sctp_init_cmd_seq(&commands);
+
+ debug_pre_sfn();
+ status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands);
+ debug_post_sfn();
+
+ error = sctp_side_effects(event_type, subtype, state,
+ ep, asoc, event_arg, status,
+ &commands, gfp);
+ debug_post_sfx();
+
+ return error;
+}
+
+/*****************************************************************
+ * This the master state function side effect processing function.
+ *****************************************************************/
+static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
+ sctp_state_t state,
+ struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ void *event_arg,
+ sctp_disposition_t status,
+ sctp_cmd_seq_t *commands,
+ gfp_t gfp)
+{
+ int error;
+
+ /* FIXME - Most of the dispositions left today would be categorized
+ * as "exceptional" dispositions. For those dispositions, it
+ * may not be proper to run through any of the commands at all.
+ * For example, the command interpreter might be run only with
+ * disposition SCTP_DISPOSITION_CONSUME.
+ */
+ if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state,
+ ep, asoc,
+ event_arg, status,
+ commands, gfp)))
+ goto bail;
+
+ switch (status) {
+ case SCTP_DISPOSITION_DISCARD:
+ pr_debug("%s: ignored sctp protocol event - state:%d, "
+ "event_type:%d, event_id:%d\n", __func__, state,
+ event_type, subtype.chunk);
+ break;
+
+ case SCTP_DISPOSITION_NOMEM:
+ /* We ran out of memory, so we need to discard this
+ * packet.
+ */
+ /* BUG--we should now recover some memory, probably by
+ * reneging...
+ */
+ error = -ENOMEM;
+ break;
+
+ case SCTP_DISPOSITION_DELETE_TCB:
+ /* This should now be a command. */
+ break;
+
+ case SCTP_DISPOSITION_CONSUME:
+ case SCTP_DISPOSITION_ABORT:
+ /*
+ * We should no longer have much work to do here as the
+ * real work has been done as explicit commands above.
+ */
+ break;
+
+ case SCTP_DISPOSITION_VIOLATION:
+ net_err_ratelimited("protocol violation state %d chunkid %d\n",
+ state, subtype.chunk);
+ break;
+
+ case SCTP_DISPOSITION_NOT_IMPL:
+ pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
+ state, event_type, subtype.chunk);
+ break;
+
+ case SCTP_DISPOSITION_BUG:
+ pr_err("bug in state %d, event_type %d, event_id %d\n",
+ state, event_type, subtype.chunk);
+ BUG();
+ break;
+
+ default:
+ pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
+ status, state, event_type, subtype.chunk);
+ BUG();
+ break;
+ }
+
+bail:
+ return error;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* This is the side-effect interpreter. */
+static int sctp_cmd_interpreter(sctp_event_t event_type,
+ sctp_subtype_t subtype,
+ sctp_state_t state,
+ struct sctp_endpoint *ep,
+ struct sctp_association *asoc,
+ void *event_arg,
+ sctp_disposition_t status,
+ sctp_cmd_seq_t *commands,
+ gfp_t gfp)
+{
+ int error = 0;
+ int force;
+ sctp_cmd_t *cmd;
+ struct sctp_chunk *new_obj;
+ struct sctp_chunk *chunk = NULL;
+ struct sctp_packet *packet;
+ struct timer_list *timer;
+ unsigned long timeout;
+ struct sctp_transport *t;
+ struct sctp_sackhdr sackh;
+ int local_cork = 0;
+
+ if (SCTP_EVENT_T_TIMEOUT != event_type)
+ chunk = event_arg;
+
+ /* Note: This whole file is a huge candidate for rework.
+ * For example, each command could either have its own handler, so
+ * the loop would look like:
+ * while (cmds)
+ * cmd->handle(x, y, z)
+ * --jgrimm
+ */
+ while (NULL != (cmd = sctp_next_cmd(commands))) {
+ switch (cmd->verb) {
+ case SCTP_CMD_NOP:
+ /* Do nothing. */
+ break;
+
+ case SCTP_CMD_NEW_ASOC:
+ /* Register a new association. */
+ if (local_cork) {
+ sctp_outq_uncork(&asoc->outqueue);
+ local_cork = 0;
+ }
+
+ /* Register with the endpoint. */
+ asoc = cmd->obj.asoc;
+ BUG_ON(asoc->peer.primary_path == NULL);
+ sctp_endpoint_add_asoc(ep, asoc);
+ sctp_hash_established(asoc);
+ break;
+
+ case SCTP_CMD_UPDATE_ASSOC:
+ sctp_assoc_update(asoc, cmd->obj.asoc);
+ break;
+
+ case SCTP_CMD_PURGE_OUTQUEUE:
+ sctp_outq_teardown(&asoc->outqueue);
+ break;
+
+ case SCTP_CMD_DELETE_TCB:
+ if (local_cork) {
+ sctp_outq_uncork(&asoc->outqueue);
+ local_cork = 0;
+ }
+ /* Delete the current association. */
+ sctp_cmd_delete_tcb(commands, asoc);
+ asoc = NULL;
+ break;
+
+ case SCTP_CMD_NEW_STATE:
+ /* Enter a new state. */
+ sctp_cmd_new_state(commands, asoc, cmd->obj.state);
+ break;
+
+ case SCTP_CMD_REPORT_TSN:
+ /* Record the arrival of a TSN. */
+ error = sctp_tsnmap_mark(&asoc->peer.tsn_map,
+ cmd->obj.u32, NULL);
+ break;
+
+ case SCTP_CMD_REPORT_FWDTSN:
+ /* Move the Cumulattive TSN Ack ahead. */
+ sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32);
+
+ /* purge the fragmentation queue */
+ sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32);
+
+ /* Abort any in progress partial delivery. */
+ sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+ break;
+
+ case SCTP_CMD_PROCESS_FWDTSN:
+ sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.chunk);
+ break;
+
+ case SCTP_CMD_GEN_SACK:
+ /* Generate a Selective ACK.
+ * The argument tells us whether to just count
+ * the packet and MAYBE generate a SACK, or
+ * force a SACK out.
+ */
+ force = cmd->obj.i32;
+ error = sctp_gen_sack(asoc, force, commands);
+ break;
+
+ case SCTP_CMD_PROCESS_SACK:
+ /* Process an inbound SACK. */
+ error = sctp_cmd_process_sack(commands, asoc,
+ cmd->obj.chunk);
+ break;
+
+ case SCTP_CMD_GEN_INIT_ACK:
+ /* Generate an INIT ACK chunk. */
+ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,
+ 0);
+ if (!new_obj)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(new_obj));
+ break;
+
+ case SCTP_CMD_PEER_INIT:
+ /* Process a unified INIT from the peer.
+ * Note: Only used during INIT-ACK processing. If
+ * there is an error just return to the outter
+ * layer which will bail.
+ */
+ error = sctp_cmd_process_init(commands, asoc, chunk,
+ cmd->obj.init, gfp);
+ break;
+
+ case SCTP_CMD_GEN_COOKIE_ECHO:
+ /* Generate a COOKIE ECHO chunk. */
+ new_obj = sctp_make_cookie_echo(asoc, chunk);
+ if (!new_obj) {
+ if (cmd->obj.chunk)
+ sctp_chunk_free(cmd->obj.chunk);
+ goto nomem;
+ }
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(new_obj));
+
+ /* If there is an ERROR chunk to be sent along with
+ * the COOKIE_ECHO, send it, too.
+ */
+ if (cmd->obj.chunk)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(cmd->obj.chunk));
+
+ if (new_obj->transport) {
+ new_obj->transport->init_sent_count++;
+ asoc->init_last_sent_to = new_obj->transport;
+ }
+
+ /* FIXME - Eventually come up with a cleaner way to
+ * enabling COOKIE-ECHO + DATA bundling during
+ * multihoming stale cookie scenarios, the following
+ * command plays with asoc->peer.retran_path to
+ * avoid the problem of sending the COOKIE-ECHO and
+ * DATA in different paths, which could result
+ * in the association being ABORTed if the DATA chunk
+ * is processed first by the server. Checking the
+ * init error counter simply causes this command
+ * to be executed only during failed attempts of
+ * association establishment.
+ */
+ if ((asoc->peer.retran_path !=
+ asoc->peer.primary_path) &&
+ (asoc->init_err_counter > 0)) {
+ sctp_add_cmd_sf(commands,
+ SCTP_CMD_FORCE_PRIM_RETRAN,
+ SCTP_NULL());
+ }
+
+ break;
+
+ case SCTP_CMD_GEN_SHUTDOWN:
+ /* Generate SHUTDOWN when in SHUTDOWN_SENT state.
+ * Reset error counts.
+ */
+ asoc->overall_error_count = 0;
+
+ /* Generate a SHUTDOWN chunk. */
+ new_obj = sctp_make_shutdown(asoc, chunk);
+ if (!new_obj)
+ goto nomem;
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(new_obj));
+ break;
+
+ case SCTP_CMD_CHUNK_ULP:
+ /* Send a chunk to the sockets layer. */
+ pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n",
+ __func__, cmd->obj.chunk, &asoc->ulpq);
+
+ sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.chunk,
+ GFP_ATOMIC);
+ break;
+
+ case SCTP_CMD_EVENT_ULP:
+ /* Send a notification to the sockets layer. */
+ pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n",
+ __func__, cmd->obj.ulpevent, &asoc->ulpq);
+
+ sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ulpevent);
+ break;
+
+ case SCTP_CMD_REPLY:
+ /* If an caller has not already corked, do cork. */
+ if (!asoc->outqueue.cork) {
+ sctp_outq_cork(&asoc->outqueue);
+ local_cork = 1;
+ }
+ /* Send a chunk to our peer. */
+ error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk);
+ break;
+
+ case SCTP_CMD_SEND_PKT:
+ /* Send a full packet to our peer. */
+ packet = cmd->obj.packet;
+ sctp_packet_transmit(packet);
+ sctp_ootb_pkt_free(packet);
+ break;
+
+ case SCTP_CMD_T1_RETRAN:
+ /* Mark a transport for retransmission. */
+ sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
+ SCTP_RTXR_T1_RTX);
+ break;
+
+ case SCTP_CMD_RETRAN:
+ /* Mark a transport for retransmission. */
+ sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
+ SCTP_RTXR_T3_RTX);
+ break;
+
+ case SCTP_CMD_ECN_CE:
+ /* Do delayed CE processing. */
+ sctp_do_ecn_ce_work(asoc, cmd->obj.u32);
+ break;
+
+ case SCTP_CMD_ECN_ECNE:
+ /* Do delayed ECNE processing. */
+ new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32,
+ chunk);
+ if (new_obj)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(new_obj));
+ break;
+
+ case SCTP_CMD_ECN_CWR:
+ /* Do delayed CWR processing. */
+ sctp_do_ecn_cwr_work(asoc, cmd->obj.u32);
+ break;
+
+ case SCTP_CMD_SETUP_T2:
+ sctp_cmd_setup_t2(commands, asoc, cmd->obj.chunk);
+ break;
+
+ case SCTP_CMD_TIMER_START_ONCE:
+ timer = &asoc->timers[cmd->obj.to];
+
+ if (timer_pending(timer))
+ break;
+ /* fall through */
+
+ case SCTP_CMD_TIMER_START:
+ timer = &asoc->timers[cmd->obj.to];
+ timeout = asoc->timeouts[cmd->obj.to];
+ BUG_ON(!timeout);
+
+ timer->expires = jiffies + timeout;
+ sctp_association_hold(asoc);
+ add_timer(timer);
+ break;
+
+ case SCTP_CMD_TIMER_RESTART:
+ timer = &asoc->timers[cmd->obj.to];
+ timeout = asoc->timeouts[cmd->obj.to];
+ if (!mod_timer(timer, jiffies + timeout))
+ sctp_association_hold(asoc);
+ break;
+
+ case SCTP_CMD_TIMER_STOP:
+ timer = &asoc->timers[cmd->obj.to];
+ if (del_timer(timer))
+ sctp_association_put(asoc);
+ break;
+
+ case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
+ chunk = cmd->obj.chunk;
+ t = sctp_assoc_choose_alter_transport(asoc,
+ asoc->init_last_sent_to);
+ asoc->init_last_sent_to = t;
+ chunk->transport = t;
+ t->init_sent_count++;
+ /* Set the new transport as primary */
+ sctp_assoc_set_primary(asoc, t);
+ break;
+
+ case SCTP_CMD_INIT_RESTART:
+ /* Do the needed accounting and updates
+ * associated with restarting an initialization
+ * timer. Only multiply the timeout by two if
+ * all transports have been tried at the current
+ * timeout.
+ */
+ sctp_cmd_t1_timer_update(asoc,
+ SCTP_EVENT_TIMEOUT_T1_INIT,
+ "INIT");
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ break;
+
+ case SCTP_CMD_COOKIEECHO_RESTART:
+ /* Do the needed accounting and updates
+ * associated with restarting an initialization
+ * timer. Only multiply the timeout by two if
+ * all transports have been tried at the current
+ * timeout.
+ */
+ sctp_cmd_t1_timer_update(asoc,
+ SCTP_EVENT_TIMEOUT_T1_COOKIE,
+ "COOKIE");
+
+ /* If we've sent any data bundled with
+ * COOKIE-ECHO we need to resend.
+ */
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ sctp_retransmit_mark(&asoc->outqueue, t,
+ SCTP_RTXR_T1_RTX);
+ }
+
+ sctp_add_cmd_sf(commands,
+ SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+ break;
+
+ case SCTP_CMD_INIT_FAILED:
+ sctp_cmd_init_failed(commands, asoc, cmd->obj.err);
+ break;
+
+ case SCTP_CMD_ASSOC_FAILED:
+ sctp_cmd_assoc_failed(commands, asoc, event_type,
+ subtype, chunk, cmd->obj.err);
+ break;
+
+ case SCTP_CMD_INIT_COUNTER_INC:
+ asoc->init_err_counter++;
+ break;
+
+ case SCTP_CMD_INIT_COUNTER_RESET:
+ asoc->init_err_counter = 0;
+ asoc->init_cycle = 0;
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ t->init_sent_count = 0;
+ }
+ break;
+
+ case SCTP_CMD_REPORT_DUP:
+ sctp_tsnmap_mark_dup(&asoc->peer.tsn_map,
+ cmd->obj.u32);
+ break;
+
+ case SCTP_CMD_REPORT_BAD_TAG:
+ pr_debug("%s: vtag mismatch!\n", __func__);
+ break;
+
+ case SCTP_CMD_STRIKE:
+ /* Mark one strike against a transport. */
+ sctp_do_8_2_transport_strike(commands, asoc,
+ cmd->obj.transport, 0);
+ break;
+
+ case SCTP_CMD_TRANSPORT_IDLE:
+ t = cmd->obj.transport;
+ sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
+ break;
+
+ case SCTP_CMD_TRANSPORT_HB_SENT:
+ t = cmd->obj.transport;
+ sctp_do_8_2_transport_strike(commands, asoc,
+ t, 1);
+ t->hb_sent = 1;
+ break;
+
+ case SCTP_CMD_TRANSPORT_ON:
+ t = cmd->obj.transport;
+ sctp_cmd_transport_on(commands, asoc, t, chunk);
+ break;
+
+ case SCTP_CMD_HB_TIMERS_START:
+ sctp_cmd_hb_timers_start(commands, asoc);
+ break;
+
+ case SCTP_CMD_HB_TIMER_UPDATE:
+ t = cmd->obj.transport;
+ sctp_cmd_hb_timer_update(commands, t);
+ break;
+
+ case SCTP_CMD_HB_TIMERS_STOP:
+ sctp_cmd_hb_timers_stop(commands, asoc);
+ break;
+
+ case SCTP_CMD_REPORT_ERROR:
+ error = cmd->obj.error;
+ break;
+
+ case SCTP_CMD_PROCESS_CTSN:
+ /* Dummy up a SACK for processing. */
+ sackh.cum_tsn_ack = cmd->obj.be32;
+ sackh.a_rwnd = asoc->peer.rwnd +
+ asoc->outqueue.outstanding_bytes;
+ sackh.num_gap_ack_blocks = 0;
+ sackh.num_dup_tsns = 0;
+ chunk->subh.sack_hdr = &sackh;
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK,
+ SCTP_CHUNK(chunk));
+ break;
+
+ case SCTP_CMD_DISCARD_PACKET:
+ /* We need to discard the whole packet.
+ * Uncork the queue since there might be
+ * responses pending
+ */
+ chunk->pdiscard = 1;
+ if (asoc) {
+ sctp_outq_uncork(&asoc->outqueue);
+ local_cork = 0;
+ }
+ break;
+
+ case SCTP_CMD_RTO_PENDING:
+ t = cmd->obj.transport;
+ t->rto_pending = 1;
+ break;
+
+ case SCTP_CMD_PART_DELIVER:
+ sctp_ulpq_partial_delivery(&asoc->ulpq, GFP_ATOMIC);
+ break;
+
+ case SCTP_CMD_RENEGE:
+ sctp_ulpq_renege(&asoc->ulpq, cmd->obj.chunk,
+ GFP_ATOMIC);
+ break;
+
+ case SCTP_CMD_SETUP_T4:
+ sctp_cmd_setup_t4(commands, asoc, cmd->obj.chunk);
+ break;
+
+ case SCTP_CMD_PROCESS_OPERR:
+ sctp_cmd_process_operr(commands, asoc, chunk);
+ break;
+ case SCTP_CMD_CLEAR_INIT_TAG:
+ asoc->peer.i.init_tag = 0;
+ break;
+ case SCTP_CMD_DEL_NON_PRIMARY:
+ sctp_cmd_del_non_primary(asoc);
+ break;
+ case SCTP_CMD_T3_RTX_TIMERS_STOP:
+ sctp_cmd_t3_rtx_timers_stop(commands, asoc);
+ break;
+ case SCTP_CMD_FORCE_PRIM_RETRAN:
+ t = asoc->peer.retran_path;
+ asoc->peer.retran_path = asoc->peer.primary_path;
+ error = sctp_outq_uncork(&asoc->outqueue);
+ local_cork = 0;
+ asoc->peer.retran_path = t;
+ break;
+ case SCTP_CMD_SET_SK_ERR:
+ sctp_cmd_set_sk_err(asoc, cmd->obj.error);
+ break;
+ case SCTP_CMD_ASSOC_CHANGE:
+ sctp_cmd_assoc_change(commands, asoc,
+ cmd->obj.u8);
+ break;
+ case SCTP_CMD_ADAPTATION_IND:
+ sctp_cmd_adaptation_ind(commands, asoc);
+ break;
+
+ case SCTP_CMD_ASSOC_SHKEY:
+ error = sctp_auth_asoc_init_active_key(asoc,
+ GFP_ATOMIC);
+ break;
+ case SCTP_CMD_UPDATE_INITTAG:
+ asoc->peer.i.init_tag = cmd->obj.u32;
+ break;
+ case SCTP_CMD_SEND_MSG:
+ if (!asoc->outqueue.cork) {
+ sctp_outq_cork(&asoc->outqueue);
+ local_cork = 1;
+ }
+ error = sctp_cmd_send_msg(asoc, cmd->obj.msg);
+ break;
+ case SCTP_CMD_SEND_NEXT_ASCONF:
+ sctp_cmd_send_asconf(asoc);
+ break;
+ case SCTP_CMD_PURGE_ASCONF_QUEUE:
+ sctp_asconf_queue_teardown(asoc);
+ break;
+
+ case SCTP_CMD_SET_ASOC:
+ asoc = cmd->obj.asoc;
+ break;
+
+ default:
+ pr_warn("Impossible command: %u\n",
+ cmd->verb);
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+out:
+ /* If this is in response to a received chunk, wait until
+ * we are done with the packet to open the queue so that we don't
+ * send multiple packets in response to a single request.
+ */
+ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
+ if (chunk->end_of_packet || chunk->singleton)
+ error = sctp_outq_uncork(&asoc->outqueue);
+ } else if (local_cork)
+ error = sctp_outq_uncork(&asoc->outqueue);
+ return error;
+nomem:
+ error = -ENOMEM;
+ goto out;
+}
+
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
new file mode 100644
index 000000000..3ee27b770
--- /dev/null
+++ b/net/sctp/sm_statefuns.c
@@ -0,0 +1,6301 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 Intel Corp.
+ * Copyright (c) 2002 Nokia Corp.
+ *
+ * This is part of the SCTP Linux Kernel Implementation.
+ *
+ * These are the state functions for the state machine.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Mathew Kotowsky <kotowsky@sctp.org>
+ * Sridhar Samudrala <samudrala@us.ibm.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Dajiang Zhang <dajiang.zhang@nokia.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Ryan Layer <rmlayer@us.ibm.com>
+ * Kevin Gao <kevin.gao@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/inet_ecn.h>
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/structs.h>
+
+static struct sctp_packet *sctp_abort_pkt_new(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ const void *payload,
+ size_t paylen);
+static int sctp_eat_data(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands);
+static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk);
+static void sctp_send_stale_cookie_err(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_chunk *err_chunk);
+static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
+
+static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net,
+ sctp_cmd_seq_t *commands,
+ __be16 error, int sk_err,
+ const struct sctp_association *asoc,
+ struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_abort_violation(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ void *arg,
+ sctp_cmd_seq_t *commands,
+ const __u8 *payload,
+ const size_t paylen);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_paramlen(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, void *ext,
+ sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_ctsn(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_chunk(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+
+static sctp_ierror_t sctp_sf_authenticate(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ struct sctp_chunk *chunk);
+
+static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
+
+/* Small helper function that checks if the chunk length
+ * is of the appropriate length. The 'required_length' argument
+ * is set to be the size of a specific chunk we are testing.
+ * Return Values: 1 = Valid length
+ * 0 = Invalid length
+ *
+ */
+static inline int
+sctp_chunk_length_valid(struct sctp_chunk *chunk,
+ __u16 required_length)
+{
+ __u16 chunk_length = ntohs(chunk->chunk_hdr->length);
+
+ /* Previously already marked? */
+ if (unlikely(chunk->pdiscard))
+ return 0;
+ if (unlikely(chunk_length < required_length))
+ return 0;
+
+ return 1;
+}
+
+/**********************************************************
+ * These are the state functions for handling chunk events.
+ **********************************************************/
+
+/*
+ * Process the final SHUTDOWN COMPLETE.
+ *
+ * Section: 4 (C) (diagram), 9.2
+ * Upon reception of the SHUTDOWN COMPLETE chunk the endpoint will verify
+ * that it is in SHUTDOWN-ACK-SENT state, if it is not the chunk should be
+ * discarded. If the endpoint is in the SHUTDOWN-ACK-SENT state the endpoint
+ * should stop the T2-shutdown timer and remove all knowledge of the
+ * association (and thus the association enters the CLOSED state).
+ *
+ * Verification Tag: 8.5.1(C), sctpimpguide 2.41.
+ * C) Rules for packet carrying SHUTDOWN COMPLETE:
+ * ...
+ * - The receiver of a SHUTDOWN COMPLETE shall accept the packet
+ * if the Verification Tag field of the packet matches its own tag and
+ * the T bit is not set
+ * OR
+ * it is set to its peer's tag and the T bit is set in the Chunk
+ * Flags.
+ * Otherwise, the receiver MUST silently discard the packet
+ * and take no further action. An endpoint MUST ignore the
+ * SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_4_C(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_ulpevent *ev;
+
+ if (!sctp_vtag_verify_either(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* RFC 2960 6.10 Bundling
+ *
+ * An endpoint MUST NOT bundle INIT, INIT ACK or
+ * SHUTDOWN COMPLETE with any other chunks.
+ */
+ if (!chunk->singleton)
+ return sctp_sf_violation_chunk(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SHUTDOWN_COMPLETE chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* RFC 2960 10.2 SCTP-to-ULP
+ *
+ * H) SHUTDOWN COMPLETE notification
+ *
+ * When SCTP completes the shutdown procedures (section 9.2) this
+ * notification is passed to the upper layer.
+ */
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
+ 0, 0, 0, NULL, GFP_ATOMIC);
+ if (ev)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ev));
+
+ /* Upon reception of the SHUTDOWN COMPLETE chunk the endpoint
+ * will verify that it is in SHUTDOWN-ACK-SENT state, if it is
+ * not the chunk should be discarded. If the endpoint is in
+ * the SHUTDOWN-ACK-SENT state the endpoint should stop the
+ * T2-shutdown timer and remove all knowledge of the
+ * association (and thus the association enters the CLOSED
+ * state).
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+
+ SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+ return SCTP_DISPOSITION_DELETE_TCB;
+}
+
+/*
+ * Respond to a normal INIT chunk.
+ * We are the side that is being asked for an association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, B
+ * B) "Z" shall respond immediately with an INIT ACK chunk. The
+ * destination IP address of the INIT ACK MUST be set to the source
+ * IP address of the INIT to which this INIT ACK is responding. In
+ * the response, besides filling in other parameters, "Z" must set the
+ * Verification Tag field to Tag_A, and also provide its own
+ * Verification Tag (Tag_Z) in the Initiate Tag field.
+ *
+ * Verification Tag: Must be 0.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *repl;
+ struct sctp_association *new_asoc;
+ struct sctp_chunk *err_chunk;
+ struct sctp_packet *packet;
+ sctp_unrecognized_param_t *unk_param;
+ int len;
+
+ /* 6.10 Bundling
+ * An endpoint MUST NOT bundle INIT, INIT ACK or
+ * SHUTDOWN COMPLETE with any other chunks.
+ *
+ * IG Section 2.11.2
+ * Furthermore, we require that the receiver of an INIT chunk MUST
+ * enforce these rules by silently discarding an arriving packet
+ * with an INIT chunk that is bundled with other chunks.
+ */
+ if (!chunk->singleton)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* If the packet is an OOTB packet which is temporarily on the
+ * control endpoint, respond with an ABORT.
+ */
+ if (ep == sctp_sk(net->sctp.ctl_sock)->ep) {
+ SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+ }
+
+ /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
+ * Tag.
+ */
+ if (chunk->sctp_hdr->vtag != 0)
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the INIT chunk has a valid length.
+ * Normally, this would cause an ABORT with a Protocol Violation
+ * error, but since we don't have an association, we'll
+ * just discard the packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* If the INIT is coming toward a closing socket, we'll send back
+ * and ABORT. Essentially, this catches the race of INIT being
+ * backloged to the socket at the same time as the user isses close().
+ * Since the socket and all its associations are going away, we
+ * can treat this OOTB
+ */
+ if (sctp_sstate(ep->base.sk, CLOSING))
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+ /* Verify the INIT chunk before processing it. */
+ err_chunk = NULL;
+ if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
+ (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+ &err_chunk)) {
+ /* This chunk contains fatal error. It is to be discarded.
+ * Send an ABORT, with causes if there is any.
+ */
+ if (err_chunk) {
+ packet = sctp_abort_pkt_new(net, ep, asoc, arg,
+ (__u8 *)(err_chunk->chunk_hdr) +
+ sizeof(sctp_chunkhdr_t),
+ ntohs(err_chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t));
+
+ sctp_chunk_free(err_chunk);
+
+ if (packet) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ return SCTP_DISPOSITION_CONSUME;
+ } else {
+ return SCTP_DISPOSITION_NOMEM;
+ }
+ } else {
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg,
+ commands);
+ }
+ }
+
+ /* Grab the INIT header. */
+ chunk->subh.init_hdr = (sctp_inithdr_t *)chunk->skb->data;
+
+ /* Tag the variable length parameters. */
+ chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+ new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
+ if (!new_asoc)
+ goto nomem;
+
+ if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
+ sctp_scope(sctp_source(chunk)),
+ GFP_ATOMIC) < 0)
+ goto nomem_init;
+
+ /* The call, sctp_process_init(), can fail on memory allocation. */
+ if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
+ (sctp_init_chunk_t *)chunk->chunk_hdr,
+ GFP_ATOMIC))
+ goto nomem_init;
+
+ /* B) "Z" shall respond immediately with an INIT ACK chunk. */
+
+ /* If there are errors need to be reported for unknown parameters,
+ * make sure to reserve enough room in the INIT ACK for them.
+ */
+ len = 0;
+ if (err_chunk)
+ len = ntohs(err_chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t);
+
+ repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
+ if (!repl)
+ goto nomem_init;
+
+ /* If there are errors need to be reported for unknown parameters,
+ * include them in the outgoing INIT ACK as "Unrecognized parameter"
+ * parameter.
+ */
+ if (err_chunk) {
+ /* Get the "Unrecognized parameter" parameter(s) out of the
+ * ERROR chunk generated by sctp_verify_init(). Since the
+ * error cause code for "unknown parameter" and the
+ * "Unrecognized parameter" type is the same, we can
+ * construct the parameters in INIT ACK by copying the
+ * ERROR causes over.
+ */
+ unk_param = (sctp_unrecognized_param_t *)
+ ((__u8 *)(err_chunk->chunk_hdr) +
+ sizeof(sctp_chunkhdr_t));
+ /* Replace the cause code with the "Unrecognized parameter"
+ * parameter type.
+ */
+ sctp_addto_chunk(repl, len, unk_param);
+ sctp_chunk_free(err_chunk);
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+ /*
+ * Note: After sending out INIT ACK with the State Cookie parameter,
+ * "Z" MUST NOT allocate any resources, nor keep any states for the
+ * new association. Otherwise, "Z" will be vulnerable to resource
+ * attacks.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+ return SCTP_DISPOSITION_DELETE_TCB;
+
+nomem_init:
+ sctp_association_free(new_asoc);
+nomem:
+ if (err_chunk)
+ sctp_chunk_free(err_chunk);
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Respond to a normal INIT ACK chunk.
+ * We are the side that is initiating the association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, C
+ * C) Upon reception of the INIT ACK from "Z", "A" shall stop the T1-init
+ * timer and leave COOKIE-WAIT state. "A" shall then send the State
+ * Cookie received in the INIT ACK chunk in a COOKIE ECHO chunk, start
+ * the T1-cookie timer, and enter the COOKIE-ECHOED state.
+ *
+ * Note: The COOKIE ECHO chunk can be bundled with any pending outbound
+ * DATA chunks, but it MUST be the first chunk in the packet and
+ * until the COOKIE ACK is returned the sender MUST NOT send any
+ * other packets to the peer.
+ *
+ * Verification Tag: 3.3.3
+ * If the value of the Initiate Tag in a received INIT ACK chunk is
+ * found to be 0, the receiver MUST treat it as an error and close the
+ * association by transmitting an ABORT.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_init_chunk_t *initchunk;
+ struct sctp_chunk *err_chunk;
+ struct sctp_packet *packet;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* 6.10 Bundling
+ * An endpoint MUST NOT bundle INIT, INIT ACK or
+ * SHUTDOWN COMPLETE with any other chunks.
+ */
+ if (!chunk->singleton)
+ return sctp_sf_violation_chunk(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the INIT-ACK chunk has a valid length */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+ /* Grab the INIT header. */
+ chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data;
+
+ /* Verify the INIT chunk before processing it. */
+ err_chunk = NULL;
+ if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
+ (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+ &err_chunk)) {
+
+ sctp_error_t error = SCTP_ERROR_NO_RESOURCE;
+
+ /* This chunk contains fatal error. It is to be discarded.
+ * Send an ABORT, with causes. If there are no causes,
+ * then there wasn't enough memory. Just terminate
+ * the association.
+ */
+ if (err_chunk) {
+ packet = sctp_abort_pkt_new(net, ep, asoc, arg,
+ (__u8 *)(err_chunk->chunk_hdr) +
+ sizeof(sctp_chunkhdr_t),
+ ntohs(err_chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t));
+
+ sctp_chunk_free(err_chunk);
+
+ if (packet) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ error = SCTP_ERROR_INV_PARAM;
+ }
+ }
+
+ /* SCTP-AUTH, Section 6.3:
+ * It should be noted that if the receiver wants to tear
+ * down an association in an authenticated way only, the
+ * handling of malformed packets should not result in
+ * tearing down the association.
+ *
+ * This means that if we only want to abort associations
+ * in an authenticated way (i.e AUTH+ABORT), then we
+ * can't destroy this association just because the packet
+ * was malformed.
+ */
+ if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED,
+ asoc, chunk->transport);
+ }
+
+ /* Tag the variable length parameters. Note that we never
+ * convert the parameters in an INIT chunk.
+ */
+ chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+ initchunk = (sctp_init_chunk_t *) chunk->chunk_hdr;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
+ SCTP_PEER_INIT(initchunk));
+
+ /* Reset init error count upon receipt of INIT-ACK. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
+ /* 5.1 C) "A" shall stop the T1-init timer and leave
+ * COOKIE-WAIT state. "A" shall then ... start the T1-cookie
+ * timer, and enter the COOKIE-ECHOED state.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_COOKIE_ECHOED));
+
+ /* SCTP-AUTH: genereate the assocition shared keys so that
+ * we can potentially signe the COOKIE-ECHO.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL());
+
+ /* 5.1 C) "A" shall then send the State Cookie received in the
+ * INIT ACK chunk in a COOKIE ECHO chunk, ...
+ */
+ /* If there is any errors to report, send the ERROR chunk generated
+ * for unknown parameters as well.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_COOKIE_ECHO,
+ SCTP_CHUNK(err_chunk));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Respond to a normal COOKIE ECHO chunk.
+ * We are the side that is being asked for an association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, D
+ * D) Upon reception of the COOKIE ECHO chunk, Endpoint "Z" will reply
+ * with a COOKIE ACK chunk after building a TCB and moving to
+ * the ESTABLISHED state. A COOKIE ACK chunk may be bundled with
+ * any pending DATA chunks (and/or SACK chunks), but the COOKIE ACK
+ * chunk MUST be the first chunk in the packet.
+ *
+ * IMPLEMENTATION NOTE: An implementation may choose to send the
+ * Communication Up notification to the SCTP user upon reception
+ * of a valid COOKIE ECHO chunk.
+ *
+ * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
+ * D) Rules for packet carrying a COOKIE ECHO
+ *
+ * - When sending a COOKIE ECHO, the endpoint MUST use the value of the
+ * Initial Tag received in the INIT ACK.
+ *
+ * - The receiver of a COOKIE ECHO follows the procedures in Section 5.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_association *new_asoc;
+ sctp_init_chunk_t *peer_init;
+ struct sctp_chunk *repl;
+ struct sctp_ulpevent *ev, *ai_ev = NULL;
+ int error = 0;
+ struct sctp_chunk *err_chk_p;
+ struct sock *sk;
+
+ /* If the packet is an OOTB packet which is temporarily on the
+ * control endpoint, respond with an ABORT.
+ */
+ if (ep == sctp_sk(net->sctp.ctl_sock)->ep) {
+ SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+ }
+
+ /* Make sure that the COOKIE_ECHO chunk has a valid length.
+ * In this case, we check that we have enough for at least a
+ * chunk header. More detailed verification is done
+ * in sctp_unpack_cookie().
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* If the endpoint is not listening or if the number of associations
+ * on the TCP-style socket exceed the max backlog, respond with an
+ * ABORT.
+ */
+ sk = ep->base.sk;
+ if (!sctp_sstate(sk, LISTENING) ||
+ (sctp_style(sk, TCP) && sk_acceptq_is_full(sk)))
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+ /* "Decode" the chunk. We have no optional parameters so we
+ * are in good shape.
+ */
+ chunk->subh.cookie_hdr =
+ (struct sctp_signed_cookie *)chunk->skb->data;
+ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t)))
+ goto nomem;
+
+ /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint
+ * "Z" will reply with a COOKIE ACK chunk after building a TCB
+ * and moving to the ESTABLISHED state.
+ */
+ new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
+ &err_chk_p);
+
+ /* FIXME:
+ * If the re-build failed, what is the proper error path
+ * from here?
+ *
+ * [We should abort the association. --piggy]
+ */
+ if (!new_asoc) {
+ /* FIXME: Several errors are possible. A bad cookie should
+ * be silently discarded, but think about logging it too.
+ */
+ switch (error) {
+ case -SCTP_IERROR_NOMEM:
+ goto nomem;
+
+ case -SCTP_IERROR_STALE_COOKIE:
+ sctp_send_stale_cookie_err(net, ep, asoc, chunk, commands,
+ err_chk_p);
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ case -SCTP_IERROR_BAD_SIG:
+ default:
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+ }
+
+
+ /* Delay state machine commands until later.
+ *
+ * Re-build the bind address for the association is done in
+ * the sctp_unpack_cookie() already.
+ */
+ /* This is a brand-new association, so these are not yet side
+ * effects--it is safe to run them here.
+ */
+ peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+
+ if (!sctp_process_init(new_asoc, chunk,
+ &chunk->subh.cookie_hdr->c.peer_addr,
+ peer_init, GFP_ATOMIC))
+ goto nomem_init;
+
+ /* SCTP-AUTH: Now that we've populate required fields in
+ * sctp_process_init, set up the assocaition shared keys as
+ * necessary so that we can potentially authenticate the ACK
+ */
+ error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC);
+ if (error)
+ goto nomem_init;
+
+ /* SCTP-AUTH: auth_chunk pointer is only set when the cookie-echo
+ * is supposed to be authenticated and we have to do delayed
+ * authentication. We've just recreated the association using
+ * the information in the cookie and now it's much easier to
+ * do the authentication.
+ */
+ if (chunk->auth_chunk) {
+ struct sctp_chunk auth;
+ sctp_ierror_t ret;
+
+ /* Make sure that we and the peer are AUTH capable */
+ if (!net->sctp.auth_enable || !new_asoc->peer.auth_capable) {
+ sctp_association_free(new_asoc);
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* set-up our fake chunk so that we can process it */
+ auth.skb = chunk->auth_chunk;
+ auth.asoc = chunk->asoc;
+ auth.sctp_hdr = chunk->sctp_hdr;
+ auth.chunk_hdr = (sctp_chunkhdr_t *)skb_push(chunk->auth_chunk,
+ sizeof(sctp_chunkhdr_t));
+ skb_pull(chunk->auth_chunk, sizeof(sctp_chunkhdr_t));
+ auth.transport = chunk->transport;
+
+ ret = sctp_sf_authenticate(net, ep, new_asoc, type, &auth);
+ if (ret != SCTP_IERROR_NO_ERROR) {
+ sctp_association_free(new_asoc);
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+ }
+
+ repl = sctp_make_cookie_ack(new_asoc, chunk);
+ if (!repl)
+ goto nomem_init;
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * D) IMPLEMENTATION NOTE: An implementation may choose to
+ * send the Communication Up notification to the SCTP user
+ * upon reception of a valid COOKIE ECHO chunk.
+ */
+ ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0,
+ new_asoc->c.sinit_num_ostreams,
+ new_asoc->c.sinit_max_instreams,
+ NULL, GFP_ATOMIC);
+ if (!ev)
+ goto nomem_ev;
+
+ /* Sockets API Draft Section 5.3.1.6
+ * When a peer sends a Adaptation Layer Indication parameter , SCTP
+ * delivers this notification to inform the application that of the
+ * peers requested adaptation layer.
+ */
+ if (new_asoc->peer.adaptation_ind) {
+ ai_ev = sctp_ulpevent_make_adaptation_indication(new_asoc,
+ GFP_ATOMIC);
+ if (!ai_ev)
+ goto nomem_aiev;
+ }
+
+ /* Add all the state machine commands now since we've created
+ * everything. This way we don't introduce memory corruptions
+ * during side-effect processing and correclty count established
+ * associations.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+ SCTP_INC_STATS(net, SCTP_MIB_PASSIVEESTABS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+
+ if (new_asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+ /* This will send the COOKIE ACK */
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+ /* Queue the ASSOC_CHANGE event */
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+ /* Send up the Adaptation Layer Indication event */
+ if (ai_ev)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ai_ev));
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem_aiev:
+ sctp_ulpevent_free(ev);
+nomem_ev:
+ sctp_chunk_free(repl);
+nomem_init:
+ sctp_association_free(new_asoc);
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Respond to a normal COOKIE ACK chunk.
+ * We are the side that is being asked for an association.
+ *
+ * RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * E) Upon reception of the COOKIE ACK, endpoint "A" will move from the
+ * COOKIE-ECHOED state to the ESTABLISHED state, stopping the T1-cookie
+ * timer. It may also notify its ULP about the successful
+ * establishment of the association with a Communication Up
+ * notification (see Section 10).
+ *
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_ulpevent *ev;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Verify that the chunk length for the COOKIE-ACK is OK.
+ * If we don't do this, any bundled chunks may be junked.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Reset init error count upon receipt of COOKIE-ACK,
+ * to avoid problems with the managemement of this
+ * counter in stale cookie situations when a transition back
+ * from the COOKIE-ECHOED state to the COOKIE-WAIT
+ * state is performed.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * E) Upon reception of the COOKIE ACK, endpoint "A" will move
+ * from the COOKIE-ECHOED state to the ESTABLISHED state,
+ * stopping the T1-cookie timer.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+ SCTP_INC_STATS(net, SCTP_MIB_ACTIVEESTABS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+ /* It may also notify its ULP about the successful
+ * establishment of the association with a Communication Up
+ * notification (see Section 10).
+ */
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP,
+ 0, asoc->c.sinit_num_ostreams,
+ asoc->c.sinit_max_instreams,
+ NULL, GFP_ATOMIC);
+
+ if (!ev)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+ /* Sockets API Draft Section 5.3.1.6
+ * When a peer sends a Adaptation Layer Indication parameter , SCTP
+ * delivers this notification to inform the application that of the
+ * peers requested adaptation layer.
+ */
+ if (asoc->peer.adaptation_ind) {
+ ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
+ if (!ev)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ev));
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Generate and sendout a heartbeat packet. */
+static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_transport *transport = (struct sctp_transport *) arg;
+ struct sctp_chunk *reply;
+
+ /* Send a heartbeat to our peer. */
+ reply = sctp_make_heartbeat(asoc, transport);
+ if (!reply)
+ return SCTP_DISPOSITION_NOMEM;
+
+ /* Set rto_pending indicating that an RTT measurement
+ * is started with this heartbeat chunk.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_RTO_PENDING,
+ SCTP_TRANSPORT(transport));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Generate a HEARTBEAT packet on the given transport. */
+sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_transport *transport = (struct sctp_transport *) arg;
+
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ /* Section 3.3.5.
+ * The Sender-specific Heartbeat Info field should normally include
+ * information about the sender's current time when this HEARTBEAT
+ * chunk is sent and the destination transport address to which this
+ * HEARTBEAT is sent (see Section 8.3).
+ */
+
+ if (transport->param_flags & SPP_HB_ENABLE) {
+ if (SCTP_DISPOSITION_NOMEM ==
+ sctp_sf_heartbeat(ep, asoc, type, arg,
+ commands))
+ return SCTP_DISPOSITION_NOMEM;
+
+ /* Set transport error counter and association error counter
+ * when sending heartbeat.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
+ SCTP_TRANSPORT(transport));
+ }
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
+ SCTP_TRANSPORT(transport));
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
+ SCTP_TRANSPORT(transport));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an heartbeat request.
+ *
+ * Section: 8.3 Path Heartbeat
+ * The receiver of the HEARTBEAT should immediately respond with a
+ * HEARTBEAT ACK that contains the Heartbeat Information field copied
+ * from the received HEARTBEAT chunk.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * When receiving an SCTP packet, the endpoint MUST ensure that the
+ * value in the Verification Tag field of the received SCTP packet
+ * matches its own Tag. If the received Verification Tag value does not
+ * match the receiver's own tag value, the receiver shall silently
+ * discard the packet and shall not process it any further except for
+ * those cases listed in Section 8.5.1 below.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_beat_8_3(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_paramhdr_t *param_hdr;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *reply;
+ size_t paylen = 0;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the HEARTBEAT chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* 8.3 The receiver of the HEARTBEAT should immediately
+ * respond with a HEARTBEAT ACK that contains the Heartbeat
+ * Information field copied from the received HEARTBEAT chunk.
+ */
+ chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data;
+ param_hdr = (sctp_paramhdr_t *) chunk->subh.hb_hdr;
+ paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t);
+
+ if (ntohs(param_hdr->length) > paylen)
+ return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+ param_hdr, commands);
+
+ if (!pskb_pull(chunk->skb, paylen))
+ goto nomem;
+
+ reply = sctp_make_heartbeat_ack(asoc, chunk, param_hdr, paylen);
+ if (!reply)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process the returning HEARTBEAT ACK.
+ *
+ * Section: 8.3 Path Heartbeat
+ * Upon the receipt of the HEARTBEAT ACK, the sender of the HEARTBEAT
+ * should clear the error counter of the destination transport
+ * address to which the HEARTBEAT was sent, and mark the destination
+ * transport address as active if it is not so marked. The endpoint may
+ * optionally report to the upper layer when an inactive destination
+ * address is marked as active due to the reception of the latest
+ * HEARTBEAT ACK. The receiver of the HEARTBEAT ACK must also
+ * clear the association overall error count as well (as defined
+ * in section 8.1).
+ *
+ * The receiver of the HEARTBEAT ACK should also perform an RTT
+ * measurement for that destination transport address using the time
+ * value carried in the HEARTBEAT ACK chunk.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ union sctp_addr from_addr;
+ struct sctp_transport *link;
+ sctp_sender_hb_info_t *hbinfo;
+ unsigned long max_interval;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the HEARTBEAT-ACK chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t) +
+ sizeof(sctp_sender_hb_info_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data;
+ /* Make sure that the length of the parameter is what we expect */
+ if (ntohs(hbinfo->param_hdr.length) !=
+ sizeof(sctp_sender_hb_info_t)) {
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ from_addr = hbinfo->daddr;
+ link = sctp_assoc_lookup_paddr(asoc, &from_addr);
+
+ /* This should never happen, but lets log it if so. */
+ if (unlikely(!link)) {
+ if (from_addr.sa.sa_family == AF_INET6) {
+ net_warn_ratelimited("%s association %p could not find address %pI6\n",
+ __func__,
+ asoc,
+ &from_addr.v6.sin6_addr);
+ } else {
+ net_warn_ratelimited("%s association %p could not find address %pI4\n",
+ __func__,
+ asoc,
+ &from_addr.v4.sin_addr.s_addr);
+ }
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* Validate the 64-bit random nonce. */
+ if (hbinfo->hb_nonce != link->hb_nonce)
+ return SCTP_DISPOSITION_DISCARD;
+
+ max_interval = link->hbinterval + link->rto;
+
+ /* Check if the timestamp looks valid. */
+ if (time_after(hbinfo->sent_at, jiffies) ||
+ time_after(jiffies, hbinfo->sent_at + max_interval)) {
+ pr_debug("%s: HEARTBEAT ACK with invalid timestamp received "
+ "for transport:%p\n", __func__, link);
+
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of
+ * the HEARTBEAT should clear the error counter of the
+ * destination transport address to which the HEARTBEAT was
+ * sent and mark the destination transport address as active if
+ * it is not so marked.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_ON, SCTP_TRANSPORT(link));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Helper function to send out an abort for the restart
+ * condition.
+ */
+static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa,
+ struct sctp_chunk *init,
+ sctp_cmd_seq_t *commands)
+{
+ int len;
+ struct sctp_packet *pkt;
+ union sctp_addr_param *addrparm;
+ struct sctp_errhdr *errhdr;
+ struct sctp_endpoint *ep;
+ char buffer[sizeof(struct sctp_errhdr)+sizeof(union sctp_addr_param)];
+ struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family);
+
+ /* Build the error on the stack. We are way to malloc crazy
+ * throughout the code today.
+ */
+ errhdr = (struct sctp_errhdr *)buffer;
+ addrparm = (union sctp_addr_param *)errhdr->variable;
+
+ /* Copy into a parm format. */
+ len = af->to_addr_param(ssa, addrparm);
+ len += sizeof(sctp_errhdr_t);
+
+ errhdr->cause = SCTP_ERROR_RESTART;
+ errhdr->length = htons(len);
+
+ /* Assign to the control socket. */
+ ep = sctp_sk(net->sctp.ctl_sock)->ep;
+
+ /* Association is NULL since this may be a restart attack and we
+ * want to send back the attacker's vtag.
+ */
+ pkt = sctp_abort_pkt_new(net, ep, NULL, init, errhdr, len);
+
+ if (!pkt)
+ goto out;
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ /* Discard the rest of the inbound packet. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+
+out:
+ /* Even if there is no memory, treat as a failure so
+ * the packet will get dropped.
+ */
+ return 0;
+}
+
+static bool list_has_sctp_addr(const struct list_head *list,
+ union sctp_addr *ipaddr)
+{
+ struct sctp_transport *addr;
+
+ list_for_each_entry(addr, list, transports) {
+ if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
+ return true;
+ }
+
+ return false;
+}
+/* A restart is occurring, check to make sure no new addresses
+ * are being added as we may be under a takeover attack.
+ */
+static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *init,
+ sctp_cmd_seq_t *commands)
+{
+ struct net *net = sock_net(new_asoc->base.sk);
+ struct sctp_transport *new_addr;
+ int ret = 1;
+
+ /* Implementor's Guide - Section 5.2.2
+ * ...
+ * Before responding the endpoint MUST check to see if the
+ * unexpected INIT adds new addresses to the association. If new
+ * addresses are added to the association, the endpoint MUST respond
+ * with an ABORT..
+ */
+
+ /* Search through all current addresses and make sure
+ * we aren't adding any new ones.
+ */
+ list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list,
+ transports) {
+ if (!list_has_sctp_addr(&asoc->peer.transport_addr_list,
+ &new_addr->ipaddr)) {
+ sctp_sf_send_restart_abort(net, &new_addr->ipaddr, init,
+ commands);
+ ret = 0;
+ break;
+ }
+ }
+
+ /* Return success if all addresses were found. */
+ return ret;
+}
+
+/* Populate the verification/tie tags based on overlapping INIT
+ * scenario.
+ *
+ * Note: Do not use in CLOSED or SHUTDOWN-ACK-SENT state.
+ */
+static void sctp_tietags_populate(struct sctp_association *new_asoc,
+ const struct sctp_association *asoc)
+{
+ switch (asoc->state) {
+
+ /* 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State */
+
+ case SCTP_STATE_COOKIE_WAIT:
+ new_asoc->c.my_vtag = asoc->c.my_vtag;
+ new_asoc->c.my_ttag = asoc->c.my_vtag;
+ new_asoc->c.peer_ttag = 0;
+ break;
+
+ case SCTP_STATE_COOKIE_ECHOED:
+ new_asoc->c.my_vtag = asoc->c.my_vtag;
+ new_asoc->c.my_ttag = asoc->c.my_vtag;
+ new_asoc->c.peer_ttag = asoc->c.peer_vtag;
+ break;
+
+ /* 5.2.2 Unexpected INIT in States Other than CLOSED, COOKIE-ECHOED,
+ * COOKIE-WAIT and SHUTDOWN-ACK-SENT
+ */
+ default:
+ new_asoc->c.my_ttag = asoc->c.my_vtag;
+ new_asoc->c.peer_ttag = asoc->c.peer_vtag;
+ break;
+ }
+
+ /* Other parameters for the endpoint SHOULD be copied from the
+ * existing parameters of the association (e.g. number of
+ * outbound streams) into the INIT ACK and cookie.
+ */
+ new_asoc->rwnd = asoc->rwnd;
+ new_asoc->c.sinit_num_ostreams = asoc->c.sinit_num_ostreams;
+ new_asoc->c.sinit_max_instreams = asoc->c.sinit_max_instreams;
+ new_asoc->c.initial_tsn = asoc->c.initial_tsn;
+}
+
+/*
+ * Compare vtag/tietag values to determine unexpected COOKIE-ECHO
+ * handling action.
+ *
+ * RFC 2960 5.2.4 Handle a COOKIE ECHO when a TCB exists.
+ *
+ * Returns value representing action to be taken. These action values
+ * correspond to Action/Description values in RFC 2960, Table 2.
+ */
+static char sctp_tietags_compare(struct sctp_association *new_asoc,
+ const struct sctp_association *asoc)
+{
+ /* In this case, the peer may have restarted. */
+ if ((asoc->c.my_vtag != new_asoc->c.my_vtag) &&
+ (asoc->c.peer_vtag != new_asoc->c.peer_vtag) &&
+ (asoc->c.my_vtag == new_asoc->c.my_ttag) &&
+ (asoc->c.peer_vtag == new_asoc->c.peer_ttag))
+ return 'A';
+
+ /* Collision case B. */
+ if ((asoc->c.my_vtag == new_asoc->c.my_vtag) &&
+ ((asoc->c.peer_vtag != new_asoc->c.peer_vtag) ||
+ (0 == asoc->c.peer_vtag))) {
+ return 'B';
+ }
+
+ /* Collision case D. */
+ if ((asoc->c.my_vtag == new_asoc->c.my_vtag) &&
+ (asoc->c.peer_vtag == new_asoc->c.peer_vtag))
+ return 'D';
+
+ /* Collision case C. */
+ if ((asoc->c.my_vtag != new_asoc->c.my_vtag) &&
+ (asoc->c.peer_vtag == new_asoc->c.peer_vtag) &&
+ (0 == new_asoc->c.my_ttag) &&
+ (0 == new_asoc->c.peer_ttag))
+ return 'C';
+
+ /* No match to any of the special cases; discard this packet. */
+ return 'E';
+}
+
+/* Common helper routine for both duplicate and simulataneous INIT
+ * chunk handling.
+ */
+static sctp_disposition_t sctp_sf_do_unexpected_init(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, sctp_cmd_seq_t *commands)
+{
+ sctp_disposition_t retval;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *repl;
+ struct sctp_association *new_asoc;
+ struct sctp_chunk *err_chunk;
+ struct sctp_packet *packet;
+ sctp_unrecognized_param_t *unk_param;
+ int len;
+
+ /* 6.10 Bundling
+ * An endpoint MUST NOT bundle INIT, INIT ACK or
+ * SHUTDOWN COMPLETE with any other chunks.
+ *
+ * IG Section 2.11.2
+ * Furthermore, we require that the receiver of an INIT chunk MUST
+ * enforce these rules by silently discarding an arriving packet
+ * with an INIT chunk that is bundled with other chunks.
+ */
+ if (!chunk->singleton)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
+ * Tag.
+ */
+ if (chunk->sctp_hdr->vtag != 0)
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the INIT chunk has a valid length.
+ * In this case, we generate a protocol violation since we have
+ * an association established.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+ /* Grab the INIT header. */
+ chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data;
+
+ /* Tag the variable length parameters. */
+ chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+ /* Verify the INIT chunk before processing it. */
+ err_chunk = NULL;
+ if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
+ (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+ &err_chunk)) {
+ /* This chunk contains fatal error. It is to be discarded.
+ * Send an ABORT, with causes if there is any.
+ */
+ if (err_chunk) {
+ packet = sctp_abort_pkt_new(net, ep, asoc, arg,
+ (__u8 *)(err_chunk->chunk_hdr) +
+ sizeof(sctp_chunkhdr_t),
+ ntohs(err_chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t));
+
+ if (packet) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ retval = SCTP_DISPOSITION_CONSUME;
+ } else {
+ retval = SCTP_DISPOSITION_NOMEM;
+ }
+ goto cleanup;
+ } else {
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg,
+ commands);
+ }
+ }
+
+ /*
+ * Other parameters for the endpoint SHOULD be copied from the
+ * existing parameters of the association (e.g. number of
+ * outbound streams) into the INIT ACK and cookie.
+ * FIXME: We are copying parameters from the endpoint not the
+ * association.
+ */
+ new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
+ if (!new_asoc)
+ goto nomem;
+
+ if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
+ sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0)
+ goto nomem;
+
+ /* In the outbound INIT ACK the endpoint MUST copy its current
+ * Verification Tag and Peers Verification tag into a reserved
+ * place (local tie-tag and per tie-tag) within the state cookie.
+ */
+ if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
+ (sctp_init_chunk_t *)chunk->chunk_hdr,
+ GFP_ATOMIC))
+ goto nomem;
+
+ /* Make sure no new addresses are being added during the
+ * restart. Do not do this check for COOKIE-WAIT state,
+ * since there are no peer addresses to check against.
+ * Upon return an ABORT will have been sent if needed.
+ */
+ if (!sctp_state(asoc, COOKIE_WAIT)) {
+ if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk,
+ commands)) {
+ retval = SCTP_DISPOSITION_CONSUME;
+ goto nomem_retval;
+ }
+ }
+
+ sctp_tietags_populate(new_asoc, asoc);
+
+ /* B) "Z" shall respond immediately with an INIT ACK chunk. */
+
+ /* If there are errors need to be reported for unknown parameters,
+ * make sure to reserve enough room in the INIT ACK for them.
+ */
+ len = 0;
+ if (err_chunk) {
+ len = ntohs(err_chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t);
+ }
+
+ repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
+ if (!repl)
+ goto nomem;
+
+ /* If there are errors need to be reported for unknown parameters,
+ * include them in the outgoing INIT ACK as "Unrecognized parameter"
+ * parameter.
+ */
+ if (err_chunk) {
+ /* Get the "Unrecognized parameter" parameter(s) out of the
+ * ERROR chunk generated by sctp_verify_init(). Since the
+ * error cause code for "unknown parameter" and the
+ * "Unrecognized parameter" type is the same, we can
+ * construct the parameters in INIT ACK by copying the
+ * ERROR causes over.
+ */
+ unk_param = (sctp_unrecognized_param_t *)
+ ((__u8 *)(err_chunk->chunk_hdr) +
+ sizeof(sctp_chunkhdr_t));
+ /* Replace the cause code with the "Unrecognized parameter"
+ * parameter type.
+ */
+ sctp_addto_chunk(repl, len, unk_param);
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+ /*
+ * Note: After sending out INIT ACK with the State Cookie parameter,
+ * "Z" MUST NOT allocate any resources for this new association.
+ * Otherwise, "Z" will be vulnerable to resource attacks.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+ retval = SCTP_DISPOSITION_CONSUME;
+
+ return retval;
+
+nomem:
+ retval = SCTP_DISPOSITION_NOMEM;
+nomem_retval:
+ if (new_asoc)
+ sctp_association_free(new_asoc);
+cleanup:
+ if (err_chunk)
+ sctp_chunk_free(err_chunk);
+ return retval;
+}
+
+/*
+ * Handle simultaneous INIT.
+ * This means we started an INIT and then we got an INIT request from
+ * our peer.
+ *
+ * Section: 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State (Item B)
+ * This usually indicates an initialization collision, i.e., each
+ * endpoint is attempting, at about the same time, to establish an
+ * association with the other endpoint.
+ *
+ * Upon receipt of an INIT in the COOKIE-WAIT or COOKIE-ECHOED state, an
+ * endpoint MUST respond with an INIT ACK using the same parameters it
+ * sent in its original INIT chunk (including its Verification Tag,
+ * unchanged). These original parameters are combined with those from the
+ * newly received INIT chunk. The endpoint shall also generate a State
+ * Cookie with the INIT ACK. The endpoint uses the parameters sent in its
+ * INIT to calculate the State Cookie.
+ *
+ * After that, the endpoint MUST NOT change its state, the T1-init
+ * timer shall be left running and the corresponding TCB MUST NOT be
+ * destroyed. The normal procedures for handling State Cookies when
+ * a TCB exists will resolve the duplicate INITs to a single association.
+ *
+ * For an endpoint that is in the COOKIE-ECHOED state it MUST populate
+ * its Tie-Tags with the Tag information of itself and its peer (see
+ * section 5.2.2 for a description of the Tie-Tags).
+ *
+ * Verification Tag: Not explicit, but an INIT can not have a valid
+ * verification tag, so we skip the check.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* Call helper to do the real work for both simulataneous and
+ * duplicate INIT chunk handling.
+ */
+ return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle duplicated INIT messages. These are usually delayed
+ * restransmissions.
+ *
+ * Section: 5.2.2 Unexpected INIT in States Other than CLOSED,
+ * COOKIE-ECHOED and COOKIE-WAIT
+ *
+ * Unless otherwise stated, upon reception of an unexpected INIT for
+ * this association, the endpoint shall generate an INIT ACK with a
+ * State Cookie. In the outbound INIT ACK the endpoint MUST copy its
+ * current Verification Tag and peer's Verification Tag into a reserved
+ * place within the state cookie. We shall refer to these locations as
+ * the Peer's-Tie-Tag and the Local-Tie-Tag. The outbound SCTP packet
+ * containing this INIT ACK MUST carry a Verification Tag value equal to
+ * the Initiation Tag found in the unexpected INIT. And the INIT ACK
+ * MUST contain a new Initiation Tag (randomly generated see Section
+ * 5.3.1). Other parameters for the endpoint SHOULD be copied from the
+ * existing parameters of the association (e.g. number of outbound
+ * streams) into the INIT ACK and cookie.
+ *
+ * After sending out the INIT ACK, the endpoint shall take no further
+ * actions, i.e., the existing association, including its current state,
+ * and the corresponding TCB MUST NOT be changed.
+ *
+ * Note: Only when a TCB exists and the association is not in a COOKIE-
+ * WAIT state are the Tie-Tags populated. For a normal association INIT
+ * (i.e. the endpoint is in a COOKIE-WAIT state), the Tie-Tags MUST be
+ * set to 0 (indicating that no previous TCB existed). The INIT ACK and
+ * State Cookie are populated as specified in section 5.2.1.
+ *
+ * Verification Tag: Not specified, but an INIT has no way of knowing
+ * what the verification tag could be, so we ignore it.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* Call helper to do the real work for both simulataneous and
+ * duplicate INIT chunk handling.
+ */
+ return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands);
+}
+
+
+/*
+ * Unexpected INIT-ACK handler.
+ *
+ * Section 5.2.3
+ * If an INIT ACK received by an endpoint in any state other than the
+ * COOKIE-WAIT state, the endpoint should discard the INIT ACK chunk.
+ * An unexpected INIT ACK usually indicates the processing of an old or
+ * duplicated INIT chunk.
+*/
+sctp_disposition_t sctp_sf_do_5_2_3_initack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, sctp_cmd_seq_t *commands)
+{
+ /* Per the above section, we'll discard the chunk if we have an
+ * endpoint. If this is an OOTB INIT-ACK, treat it as such.
+ */
+ if (ep == sctp_sk(net->sctp.ctl_sock)->ep)
+ return sctp_sf_ootb(net, ep, asoc, type, arg, commands);
+ else
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+}
+
+/* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A')
+ *
+ * Section 5.2.4
+ * A) In this case, the peer may have restarted.
+ */
+static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_association *new_asoc)
+{
+ sctp_init_chunk_t *peer_init;
+ struct sctp_ulpevent *ev;
+ struct sctp_chunk *repl;
+ struct sctp_chunk *err;
+ sctp_disposition_t disposition;
+
+ /* new_asoc is a brand-new association, so these are not yet
+ * side effects--it is safe to run them here.
+ */
+ peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+
+ if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
+ GFP_ATOMIC))
+ goto nomem;
+
+ /* Make sure no new addresses are being added during the
+ * restart. Though this is a pretty complicated attack
+ * since you'd have to get inside the cookie.
+ */
+ if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands)) {
+ return SCTP_DISPOSITION_CONSUME;
+ }
+
+ /* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes
+ * the peer has restarted (Action A), it MUST NOT setup a new
+ * association but instead resend the SHUTDOWN ACK and send an ERROR
+ * chunk with a "Cookie Received while Shutting Down" error cause to
+ * its peer.
+ */
+ if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) {
+ disposition = sctp_sf_do_9_2_reshutack(net, ep, asoc,
+ SCTP_ST_CHUNK(chunk->chunk_hdr->type),
+ chunk, commands);
+ if (SCTP_DISPOSITION_NOMEM == disposition)
+ goto nomem;
+
+ err = sctp_make_op_error(asoc, chunk,
+ SCTP_ERROR_COOKIE_IN_SHUTDOWN,
+ NULL, 0, 0);
+ if (err)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err));
+
+ return SCTP_DISPOSITION_CONSUME;
+ }
+
+ /* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked
+ * data. Consider the optional choice of resending of this data.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+ sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL());
+
+ /* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue
+ * and ASCONF-ACK cache.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());
+
+ repl = sctp_make_cookie_ack(new_asoc, chunk);
+ if (!repl)
+ goto nomem;
+
+ /* Report association restart to upper layer. */
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0,
+ new_asoc->c.sinit_num_ostreams,
+ new_asoc->c.sinit_max_instreams,
+ NULL, GFP_ATOMIC);
+ if (!ev)
+ goto nomem_ev;
+
+ /* Update the content of current association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+ if (sctp_state(asoc, SHUTDOWN_PENDING) &&
+ (sctp_sstate(asoc->base.sk, CLOSING) ||
+ sock_flag(asoc->base.sk, SOCK_DEAD))) {
+ /* if were currently in SHUTDOWN_PENDING, but the socket
+ * has been closed by user, don't transition to ESTABLISHED.
+ * Instead trigger SHUTDOWN bundled with COOKIE_ACK.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ return sctp_sf_do_9_2_start_shutdown(net, ep, asoc,
+ SCTP_ST_CHUNK(0), NULL,
+ commands);
+ } else {
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ }
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem_ev:
+ sctp_chunk_free(repl);
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'B')
+ *
+ * Section 5.2.4
+ * B) In this case, both sides may be attempting to start an association
+ * at about the same time but the peer endpoint started its INIT
+ * after responding to the local endpoint's INIT
+ */
+/* This case represents an initialization collision. */
+static sctp_disposition_t sctp_sf_do_dupcook_b(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_association *new_asoc)
+{
+ sctp_init_chunk_t *peer_init;
+ struct sctp_chunk *repl;
+
+ /* new_asoc is a brand-new association, so these are not yet
+ * side effects--it is safe to run them here.
+ */
+ peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+ if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
+ GFP_ATOMIC))
+ goto nomem;
+
+ /* Update the content of current association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+
+ repl = sctp_make_cookie_ack(new_asoc, chunk);
+ if (!repl)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * D) IMPLEMENTATION NOTE: An implementation may choose to
+ * send the Communication Up notification to the SCTP user
+ * upon reception of a valid COOKIE ECHO chunk.
+ *
+ * Sadly, this needs to be implemented as a side-effect, because
+ * we are not guaranteed to have set the association id of the real
+ * association and so these notifications need to be delayed until
+ * the association id is allocated.
+ */
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_CHANGE, SCTP_U8(SCTP_COMM_UP));
+
+ /* Sockets API Draft Section 5.3.1.6
+ * When a peer sends a Adaptation Layer Indication parameter , SCTP
+ * delivers this notification to inform the application that of the
+ * peers requested adaptation layer.
+ *
+ * This also needs to be done as a side effect for the same reason as
+ * above.
+ */
+ if (asoc->peer.adaptation_ind)
+ sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL());
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'C')
+ *
+ * Section 5.2.4
+ * C) In this case, the local endpoint's cookie has arrived late.
+ * Before it arrived, the local endpoint sent an INIT and received an
+ * INIT-ACK and finally sent a COOKIE ECHO with the peer's same tag
+ * but a new tag of its own.
+ */
+/* This case represents an initialization collision. */
+static sctp_disposition_t sctp_sf_do_dupcook_c(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_association *new_asoc)
+{
+ /* The cookie should be silently discarded.
+ * The endpoint SHOULD NOT change states and should leave
+ * any timers running.
+ */
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/* Unexpected COOKIE-ECHO handler lost chunk (Table 2, action 'D')
+ *
+ * Section 5.2.4
+ *
+ * D) When both local and remote tags match the endpoint should always
+ * enter the ESTABLISHED state, if it has not already done so.
+ */
+/* This case represents an initialization collision. */
+static sctp_disposition_t sctp_sf_do_dupcook_d(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_association *new_asoc)
+{
+ struct sctp_ulpevent *ev = NULL, *ai_ev = NULL;
+ struct sctp_chunk *repl;
+
+ /* Clarification from Implementor's Guide:
+ * D) When both local and remote tags match the endpoint should
+ * enter the ESTABLISHED state, if it is in the COOKIE-ECHOED state.
+ * It should stop any cookie timer that may be running and send
+ * a COOKIE ACK.
+ */
+
+ /* Don't accidentally move back into established state. */
+ if (asoc->state < SCTP_STATE_ESTABLISHED) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_ESTABLISHED));
+ SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START,
+ SCTP_NULL());
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * D) IMPLEMENTATION NOTE: An implementation may choose
+ * to send the Communication Up notification to the
+ * SCTP user upon reception of a valid COOKIE
+ * ECHO chunk.
+ */
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0,
+ SCTP_COMM_UP, 0,
+ asoc->c.sinit_num_ostreams,
+ asoc->c.sinit_max_instreams,
+ NULL, GFP_ATOMIC);
+ if (!ev)
+ goto nomem;
+
+ /* Sockets API Draft Section 5.3.1.6
+ * When a peer sends a Adaptation Layer Indication parameter,
+ * SCTP delivers this notification to inform the application
+ * that of the peers requested adaptation layer.
+ */
+ if (asoc->peer.adaptation_ind) {
+ ai_ev = sctp_ulpevent_make_adaptation_indication(asoc,
+ GFP_ATOMIC);
+ if (!ai_ev)
+ goto nomem;
+
+ }
+ }
+
+ repl = sctp_make_cookie_ack(new_asoc, chunk);
+ if (!repl)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+ if (ev)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ev));
+ if (ai_ev)
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ai_ev));
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ if (ai_ev)
+ sctp_ulpevent_free(ai_ev);
+ if (ev)
+ sctp_ulpevent_free(ev);
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Handle a duplicate COOKIE-ECHO. This usually means a cookie-carrying
+ * chunk was retransmitted and then delayed in the network.
+ *
+ * Section: 5.2.4 Handle a COOKIE ECHO when a TCB exists
+ *
+ * Verification Tag: None. Do cookie validation.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_disposition_t retval;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_association *new_asoc;
+ int error = 0;
+ char action;
+ struct sctp_chunk *err_chk_p;
+
+ /* Make sure that the chunk has a valid length from the protocol
+ * perspective. In this case check to make sure we have at least
+ * enough for the chunk header. Cookie length verification is
+ * done later.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* "Decode" the chunk. We have no optional parameters so we
+ * are in good shape.
+ */
+ chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data;
+ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t)))
+ goto nomem;
+
+ /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie
+ * of a duplicate COOKIE ECHO match the Verification Tags of the
+ * current association, consider the State Cookie valid even if
+ * the lifespan is exceeded.
+ */
+ new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
+ &err_chk_p);
+
+ /* FIXME:
+ * If the re-build failed, what is the proper error path
+ * from here?
+ *
+ * [We should abort the association. --piggy]
+ */
+ if (!new_asoc) {
+ /* FIXME: Several errors are possible. A bad cookie should
+ * be silently discarded, but think about logging it too.
+ */
+ switch (error) {
+ case -SCTP_IERROR_NOMEM:
+ goto nomem;
+
+ case -SCTP_IERROR_STALE_COOKIE:
+ sctp_send_stale_cookie_err(net, ep, asoc, chunk, commands,
+ err_chk_p);
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ case -SCTP_IERROR_BAD_SIG:
+ default:
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+ }
+
+ /* Compare the tie_tag in cookie with the verification tag of
+ * current association.
+ */
+ action = sctp_tietags_compare(new_asoc, asoc);
+
+ switch (action) {
+ case 'A': /* Association restart. */
+ retval = sctp_sf_do_dupcook_a(net, ep, asoc, chunk, commands,
+ new_asoc);
+ break;
+
+ case 'B': /* Collision case B. */
+ retval = sctp_sf_do_dupcook_b(net, ep, asoc, chunk, commands,
+ new_asoc);
+ break;
+
+ case 'C': /* Collision case C. */
+ retval = sctp_sf_do_dupcook_c(net, ep, asoc, chunk, commands,
+ new_asoc);
+ break;
+
+ case 'D': /* Collision case D. */
+ retval = sctp_sf_do_dupcook_d(net, ep, asoc, chunk, commands,
+ new_asoc);
+ break;
+
+ default: /* Discard packet for all others. */
+ retval = sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ break;
+ }
+
+ /* Delete the tempory new association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc));
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+ /* Restore association pointer to provide SCTP command interpeter
+ * with a valid context in case it needs to manipulate
+ * the queues */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC,
+ SCTP_ASOC((struct sctp_association *)asoc));
+
+ return retval;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process an ABORT. (SHUTDOWN-PENDING state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_pending_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ if (!sctp_vtag_verify_either(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ABORT chunk has a valid length.
+ * Since this is an ABORT chunk, we have to discard it
+ * because of the following text:
+ * RFC 2960, Section 3.3.7
+ * If an endpoint receives an ABORT with a format error or for an
+ * association that doesn't exist, it MUST silently discard it.
+ * Because the length is "invalid", we can't really discard just
+ * as we do not know its true length. So, to be safe, discard the
+ * packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* ADD-IP: Special case for ABORT chunks
+ * F4) One special consideration is that ABORT Chunks arriving
+ * destined to the IP address being deleted MUST be
+ * ignored (see Section 5.3.1 for further details).
+ */
+ if (SCTP_ADDR_DEL ==
+ sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+
+ return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process an ABORT. (SHUTDOWN-SENT state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ if (!sctp_vtag_verify_either(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ABORT chunk has a valid length.
+ * Since this is an ABORT chunk, we have to discard it
+ * because of the following text:
+ * RFC 2960, Section 3.3.7
+ * If an endpoint receives an ABORT with a format error or for an
+ * association that doesn't exist, it MUST silently discard it.
+ * Because the length is "invalid", we can't really discard just
+ * as we do not know its true length. So, to be safe, discard the
+ * packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* ADD-IP: Special case for ABORT chunks
+ * F4) One special consideration is that ABORT Chunks arriving
+ * destined to the IP address being deleted MUST be
+ * ignored (see Section 5.3.1 for further details).
+ */
+ if (SCTP_ADDR_DEL ==
+ sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+
+ /* Stop the T2-shutdown timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ /* Stop the T5-shutdown guard timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process an ABORT. (SHUTDOWN-ACK-SENT state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_ack_sent_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* The same T2 timer, so we should be able to use
+ * common function with the SHUTDOWN-SENT state.
+ */
+ return sctp_sf_shutdown_sent_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle an Error received in COOKIE_ECHOED state.
+ *
+ * Only handle the error type of stale COOKIE Error, the other errors will
+ * be ignored.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_errhdr_t *err;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ERROR chunk has a valid length.
+ * The parameter walking depends on this as well.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Process the error here */
+ /* FUTURE FIXME: When PR-SCTP related and other optional
+ * parms are emitted, this will have to change to handle multiple
+ * errors.
+ */
+ sctp_walk_errors(err, chunk->chunk_hdr) {
+ if (SCTP_ERROR_STALE_COOKIE == err->cause)
+ return sctp_sf_do_5_2_6_stale(net, ep, asoc, type,
+ arg, commands);
+ }
+
+ /* It is possible to have malformed error causes, and that
+ * will cause us to end the walk early. However, since
+ * we are discarding the packet, there should be no adverse
+ * affects.
+ */
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle a Stale COOKIE Error
+ *
+ * Section: 5.2.6 Handle Stale COOKIE Error
+ * If the association is in the COOKIE-ECHOED state, the endpoint may elect
+ * one of the following three alternatives.
+ * ...
+ * 3) Send a new INIT chunk to the endpoint, adding a Cookie
+ * Preservative parameter requesting an extension to the lifetime of
+ * the State Cookie. When calculating the time extension, an
+ * implementation SHOULD use the RTT information measured based on the
+ * previous COOKIE ECHO / ERROR exchange, and should add no more
+ * than 1 second beyond the measured RTT, due to long State Cookie
+ * lifetimes making the endpoint more subject to a replay attack.
+ *
+ * Verification Tag: Not explicit, but safe to ignore.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ time_t stale;
+ sctp_cookie_preserve_param_t bht;
+ sctp_errhdr_t *err;
+ struct sctp_chunk *reply;
+ struct sctp_bind_addr *bp;
+ int attempts = asoc->init_err_counter + 1;
+
+ if (attempts > asoc->max_init_attempts) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(SCTP_ERROR_STALE_COOKIE));
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ err = (sctp_errhdr_t *)(chunk->skb->data);
+
+ /* When calculating the time extension, an implementation
+ * SHOULD use the RTT information measured based on the
+ * previous COOKIE ECHO / ERROR exchange, and should add no
+ * more than 1 second beyond the measured RTT, due to long
+ * State Cookie lifetimes making the endpoint more subject to
+ * a replay attack.
+ * Measure of Staleness's unit is usec. (1/1000000 sec)
+ * Suggested Cookie Life-span Increment's unit is msec.
+ * (1/1000 sec)
+ * In general, if you use the suggested cookie life, the value
+ * found in the field of measure of staleness should be doubled
+ * to give ample time to retransmit the new cookie and thus
+ * yield a higher probability of success on the reattempt.
+ */
+ stale = ntohl(*(__be32 *)((u8 *)err + sizeof(sctp_errhdr_t)));
+ stale = (stale * 2) / 1000;
+
+ bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE;
+ bht.param_hdr.length = htons(sizeof(bht));
+ bht.lifespan_increment = htonl(stale);
+
+ /* Build that new INIT chunk. */
+ bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+ reply = sctp_make_init(asoc, bp, GFP_ATOMIC, sizeof(bht));
+ if (!reply)
+ goto nomem;
+
+ sctp_addto_chunk(reply, sizeof(bht), &bht);
+
+ /* Clear peer's init_tag cached in assoc as we are sending a new INIT */
+ sctp_add_cmd_sf(commands, SCTP_CMD_CLEAR_INIT_TAG, SCTP_NULL());
+
+ /* Stop pending T3-rtx and heartbeat timers */
+ sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+ /* Delete non-primary peer ip addresses since we are transitioning
+ * back to the COOKIE-WAIT state
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL());
+
+ /* If we've sent any data bundled with COOKIE-ECHO we will need to
+ * resend
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN,
+ SCTP_TRANSPORT(asoc->peer.primary_path));
+
+ /* Cast away the const modifier, as we want to just
+ * rerun it through as a sideffect.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_COOKIE_WAIT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process an ABORT.
+ *
+ * Section: 9.1
+ * After checking the Verification Tag, the receiving endpoint shall
+ * remove the association from its record, and shall report the
+ * termination to its upper layer.
+ *
+ * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
+ * B) Rules for packet carrying ABORT:
+ *
+ * - The endpoint shall always fill in the Verification Tag field of the
+ * outbound packet with the destination endpoint's tag value if it
+ * is known.
+ *
+ * - If the ABORT is sent in response to an OOTB packet, the endpoint
+ * MUST follow the procedure described in Section 8.4.
+ *
+ * - The receiver MUST accept the packet if the Verification Tag
+ * matches either its own tag, OR the tag of its peer. Otherwise, the
+ * receiver MUST silently discard the packet and take no further
+ * action.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ if (!sctp_vtag_verify_either(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ABORT chunk has a valid length.
+ * Since this is an ABORT chunk, we have to discard it
+ * because of the following text:
+ * RFC 2960, Section 3.3.7
+ * If an endpoint receives an ABORT with a format error or for an
+ * association that doesn't exist, it MUST silently discard it.
+ * Because the length is "invalid", we can't really discard just
+ * as we do not know its true length. So, to be safe, discard the
+ * packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* ADD-IP: Special case for ABORT chunks
+ * F4) One special consideration is that ABORT Chunks arriving
+ * destined to the IP address being deleted MUST be
+ * ignored (see Section 5.3.1 for further details).
+ */
+ if (SCTP_ADDR_DEL ==
+ sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+
+ return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
+}
+
+static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ unsigned int len;
+ __be16 error = SCTP_ERROR_NO_ERROR;
+
+ /* See if we have an error cause code in the chunk. */
+ len = ntohs(chunk->chunk_hdr->length);
+ if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
+
+ sctp_errhdr_t *err;
+ sctp_walk_errors(err, chunk->chunk_hdr);
+ if ((void *)err != (void *)chunk->chunk_end)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
+ /* ASSOC_FAILED will DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, SCTP_PERR(error));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+ return SCTP_DISPOSITION_ABORT;
+}
+
+/*
+ * Process an ABORT. (COOKIE-WAIT state)
+ *
+ * See sctp_sf_do_9_1_abort() above.
+ */
+sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ unsigned int len;
+ __be16 error = SCTP_ERROR_NO_ERROR;
+
+ if (!sctp_vtag_verify_either(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ABORT chunk has a valid length.
+ * Since this is an ABORT chunk, we have to discard it
+ * because of the following text:
+ * RFC 2960, Section 3.3.7
+ * If an endpoint receives an ABORT with a format error or for an
+ * association that doesn't exist, it MUST silently discard it.
+ * Because the length is "invalid", we can't really discard just
+ * as we do not know its true length. So, to be safe, discard the
+ * packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* See if we have an error cause code in the chunk. */
+ len = ntohs(chunk->chunk_hdr->length);
+ if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
+ error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
+
+ return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED, asoc,
+ chunk->transport);
+}
+
+/*
+ * Process an incoming ICMP as an ABORT. (COOKIE-WAIT state)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ return sctp_stop_t1_and_abort(net, commands, SCTP_ERROR_NO_ERROR,
+ ENOPROTOOPT, asoc,
+ (struct sctp_transport *)arg);
+}
+
+/*
+ * Process an ABORT. (COOKIE-ECHOED state)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* There is a single T1 timer, so we should be able to use
+ * common function with the COOKIE-WAIT state.
+ */
+ return sctp_sf_cookie_wait_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Stop T1 timer and abort association with "INIT failed".
+ *
+ * This is common code called by several sctp_sf_*_abort() functions above.
+ */
+static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net,
+ sctp_cmd_seq_t *commands,
+ __be16 error, int sk_err,
+ const struct sctp_association *asoc,
+ struct sctp_transport *transport)
+{
+ pr_debug("%s: ABORT received (INIT)\n", __func__);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(sk_err));
+ /* CMD_INIT_FAILED will DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(error));
+
+ return SCTP_DISPOSITION_ABORT;
+}
+
+/*
+ * sctp_sf_do_9_2_shut
+ *
+ * Section: 9.2
+ * Upon the reception of the SHUTDOWN, the peer endpoint shall
+ * - enter the SHUTDOWN-RECEIVED state,
+ *
+ * - stop accepting new data from its SCTP user
+ *
+ * - verify, by checking the Cumulative TSN Ack field of the chunk,
+ * that all its outstanding DATA chunks have been received by the
+ * SHUTDOWN sender.
+ *
+ * Once an endpoint as reached the SHUTDOWN-RECEIVED state it MUST NOT
+ * send a SHUTDOWN in response to a ULP request. And should discard
+ * subsequent SHUTDOWN chunks.
+ *
+ * If there are still outstanding DATA chunks left, the SHUTDOWN
+ * receiver shall continue to follow normal data transmission
+ * procedures defined in Section 6 until all outstanding DATA chunks
+ * are acknowledged; however, the SHUTDOWN receiver MUST NOT accept
+ * new data from its SCTP user.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_shutdownhdr_t *sdh;
+ sctp_disposition_t disposition;
+ struct sctp_ulpevent *ev;
+ __u32 ctsn;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SHUTDOWN chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk,
+ sizeof(struct sctp_shutdown_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Convert the elaborate header. */
+ sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
+ skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t));
+ chunk->subh.shutdown_hdr = sdh;
+ ctsn = ntohl(sdh->cum_tsn_ack);
+
+ if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
+ pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__, ctsn,
+ asoc->ctsn_ack_point);
+
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* If Cumulative TSN Ack beyond the max tsn currently
+ * send, terminating the association and respond to the
+ * sender with an ABORT.
+ */
+ if (!TSN_lt(ctsn, asoc->next_tsn))
+ return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);
+
+ /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ * When a peer sends a SHUTDOWN, SCTP delivers this notification to
+ * inform the application that it should cease sending data.
+ */
+ ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC);
+ if (!ev) {
+ disposition = SCTP_DISPOSITION_NOMEM;
+ goto out;
+ }
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+ /* Upon the reception of the SHUTDOWN, the peer endpoint shall
+ * - enter the SHUTDOWN-RECEIVED state,
+ * - stop accepting new data from its SCTP user
+ *
+ * [This is implicit in the new state.]
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_SHUTDOWN_RECEIVED));
+ disposition = SCTP_DISPOSITION_CONSUME;
+
+ if (sctp_outq_is_empty(&asoc->outqueue)) {
+ disposition = sctp_sf_do_9_2_shutdown_ack(net, ep, asoc, type,
+ arg, commands);
+ }
+
+ if (SCTP_DISPOSITION_NOMEM == disposition)
+ goto out;
+
+ /* - verify, by checking the Cumulative TSN Ack field of the
+ * chunk, that all its outstanding DATA chunks have been
+ * received by the SHUTDOWN sender.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
+ SCTP_BE32(chunk->subh.shutdown_hdr->cum_tsn_ack));
+
+out:
+ return disposition;
+}
+
+/*
+ * sctp_sf_do_9_2_shut_ctsn
+ *
+ * Once an endpoint has reached the SHUTDOWN-RECEIVED state,
+ * it MUST NOT send a SHUTDOWN in response to a ULP request.
+ * The Cumulative TSN Ack of the received SHUTDOWN chunk
+ * MUST be processed.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_shutdownhdr_t *sdh;
+ __u32 ctsn;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SHUTDOWN chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk,
+ sizeof(struct sctp_shutdown_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
+ ctsn = ntohl(sdh->cum_tsn_ack);
+
+ if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
+ pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__, ctsn,
+ asoc->ctsn_ack_point);
+
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* If Cumulative TSN Ack beyond the max tsn currently
+ * send, terminating the association and respond to the
+ * sender with an ABORT.
+ */
+ if (!TSN_lt(ctsn, asoc->next_tsn))
+ return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);
+
+ /* verify, by checking the Cumulative TSN Ack field of the
+ * chunk, that all its outstanding DATA chunks have been
+ * received by the SHUTDOWN sender.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
+ SCTP_BE32(sdh->cum_tsn_ack));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* RFC 2960 9.2
+ * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk
+ * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination
+ * transport addresses (either in the IP addresses or in the INIT chunk)
+ * that belong to this association, it should discard the INIT chunk and
+ * retransmit the SHUTDOWN ACK chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_2_reshutack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = (struct sctp_chunk *) arg;
+ struct sctp_chunk *reply;
+
+ /* Make sure that the chunk has a valid length */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Since we are not going to really process this INIT, there
+ * is no point in verifying chunk boundries. Just generate
+ * the SHUTDOWN ACK.
+ */
+ reply = sctp_make_shutdown_ack(asoc, chunk);
+ if (NULL == reply)
+ goto nomem;
+
+ /* Set the transport for the SHUTDOWN ACK chunk and the timeout for
+ * the T2-SHUTDOWN timer.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+ /* and restart the T2-shutdown timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+ return SCTP_DISPOSITION_CONSUME;
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * sctp_sf_do_ecn_cwr
+ *
+ * Section: Appendix A: Explicit Congestion Notification
+ *
+ * CWR:
+ *
+ * RFC 2481 details a specific bit for a sender to send in the header of
+ * its next outbound TCP segment to indicate to its peer that it has
+ * reduced its congestion window. This is termed the CWR bit. For
+ * SCTP the same indication is made by including the CWR chunk.
+ * This chunk contains one data element, i.e. the TSN number that
+ * was sent in the ECNE chunk. This element represents the lowest
+ * TSN number in the datagram that was originally marked with the
+ * CE bit.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_cwrhdr_t *cwr;
+ struct sctp_chunk *chunk = arg;
+ u32 lowest_tsn;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ cwr = (sctp_cwrhdr_t *) chunk->skb->data;
+ skb_pull(chunk->skb, sizeof(sctp_cwrhdr_t));
+
+ lowest_tsn = ntohl(cwr->lowest_tsn);
+
+ /* Does this CWR ack the last sent congestion notification? */
+ if (TSN_lte(asoc->last_ecne_tsn, lowest_tsn)) {
+ /* Stop sending ECNE. */
+ sctp_add_cmd_sf(commands,
+ SCTP_CMD_ECN_CWR,
+ SCTP_U32(lowest_tsn));
+ }
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_do_ecne
+ *
+ * Section: Appendix A: Explicit Congestion Notification
+ *
+ * ECN-Echo
+ *
+ * RFC 2481 details a specific bit for a receiver to send back in its
+ * TCP acknowledgements to notify the sender of the Congestion
+ * Experienced (CE) bit having arrived from the network. For SCTP this
+ * same indication is made by including the ECNE chunk. This chunk
+ * contains one data element, i.e. the lowest TSN associated with the IP
+ * datagram marked with the CE bit.....
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_ecne(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_ecnehdr_t *ecne;
+ struct sctp_chunk *chunk = arg;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ ecne = (sctp_ecnehdr_t *) chunk->skb->data;
+ skb_pull(chunk->skb, sizeof(sctp_ecnehdr_t));
+
+ /* If this is a newer ECNE than the last CWR packet we sent out */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE,
+ SCTP_U32(ntohl(ecne->lowest_tsn)));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Section: 6.2 Acknowledgement on Reception of DATA Chunks
+ *
+ * The SCTP endpoint MUST always acknowledge the reception of each valid
+ * DATA chunk.
+ *
+ * The guidelines on delayed acknowledgement algorithm specified in
+ * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an
+ * acknowledgement SHOULD be generated for at least every second packet
+ * (not every second DATA chunk) received, and SHOULD be generated within
+ * 200 ms of the arrival of any unacknowledged DATA chunk. In some
+ * situations it may be beneficial for an SCTP transmitter to be more
+ * conservative than the algorithms detailed in this document allow.
+ * However, an SCTP transmitter MUST NOT be more aggressive than the
+ * following algorithms allow.
+ *
+ * A SCTP receiver MUST NOT generate more than one SACK for every
+ * incoming packet, other than to update the offered window as the
+ * receiving application consumes new data.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_arg_t force = SCTP_NOFORCE();
+ int error;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ error = sctp_eat_data(asoc, chunk, commands);
+ switch (error) {
+ case SCTP_IERROR_NO_ERROR:
+ break;
+ case SCTP_IERROR_HIGH_TSN:
+ case SCTP_IERROR_BAD_STREAM:
+ SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
+ goto discard_noforce;
+ case SCTP_IERROR_DUP_TSN:
+ case SCTP_IERROR_IGNORE_TSN:
+ SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
+ goto discard_force;
+ case SCTP_IERROR_NO_DATA:
+ goto consume;
+ case SCTP_IERROR_PROTO_VIOLATION:
+ return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
+ (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
+ default:
+ BUG();
+ }
+
+ if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM)
+ force = SCTP_FORCE();
+
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+ }
+
+ /* If this is the last chunk in a packet, we need to count it
+ * toward sack generation. Note that we need to SACK every
+ * OTHER packet containing data chunks, EVEN IF WE DISCARD
+ * THEM. We elect to NOT generate SACK's if the chunk fails
+ * the verification tag test.
+ *
+ * RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
+ *
+ * The SCTP endpoint MUST always acknowledge the reception of
+ * each valid DATA chunk.
+ *
+ * The guidelines on delayed acknowledgement algorithm
+ * specified in Section 4.2 of [RFC2581] SHOULD be followed.
+ * Specifically, an acknowledgement SHOULD be generated for at
+ * least every second packet (not every second DATA chunk)
+ * received, and SHOULD be generated within 200 ms of the
+ * arrival of any unacknowledged DATA chunk. In some
+ * situations it may be beneficial for an SCTP transmitter to
+ * be more conservative than the algorithms detailed in this
+ * document allow. However, an SCTP transmitter MUST NOT be
+ * more aggressive than the following algorithms allow.
+ */
+ if (chunk->end_of_packet)
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force);
+
+ return SCTP_DISPOSITION_CONSUME;
+
+discard_force:
+ /* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
+ *
+ * When a packet arrives with duplicate DATA chunk(s) and with
+ * no new DATA chunk(s), the endpoint MUST immediately send a
+ * SACK with no delay. If a packet arrives with duplicate
+ * DATA chunk(s) bundled with new DATA chunks, the endpoint
+ * MAY immediately send a SACK. Normally receipt of duplicate
+ * DATA chunks will occur when the original SACK chunk was lost
+ * and the peer's RTO has expired. The duplicate TSN number(s)
+ * SHOULD be reported in the SACK as duplicate.
+ */
+ /* In our case, we split the MAY SACK advice up whether or not
+ * the last chunk is a duplicate.'
+ */
+ if (chunk->end_of_packet)
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+ return SCTP_DISPOSITION_DISCARD;
+
+discard_noforce:
+ if (chunk->end_of_packet)
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force);
+
+ return SCTP_DISPOSITION_DISCARD;
+consume:
+ return SCTP_DISPOSITION_CONSUME;
+
+}
+
+/*
+ * sctp_sf_eat_data_fast_4_4
+ *
+ * Section: 4 (4)
+ * (4) In SHUTDOWN-SENT state the endpoint MUST acknowledge any received
+ * DATA chunks without delay.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ int error;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ error = sctp_eat_data(asoc, chunk, commands);
+ switch (error) {
+ case SCTP_IERROR_NO_ERROR:
+ case SCTP_IERROR_HIGH_TSN:
+ case SCTP_IERROR_DUP_TSN:
+ case SCTP_IERROR_IGNORE_TSN:
+ case SCTP_IERROR_BAD_STREAM:
+ break;
+ case SCTP_IERROR_NO_DATA:
+ goto consume;
+ case SCTP_IERROR_PROTO_VIOLATION:
+ return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
+ (u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
+ default:
+ BUG();
+ }
+
+ /* Go a head and force a SACK, since we are shutting down. */
+
+ /* Implementor's Guide.
+ *
+ * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately
+ * respond to each received packet containing one or more DATA chunk(s)
+ * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer
+ */
+ if (chunk->end_of_packet) {
+ /* We must delay the chunk creation since the cumulative
+ * TSN has not been updated yet.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+ }
+
+consume:
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Section: 6.2 Processing a Received SACK
+ * D) Any time a SACK arrives, the endpoint performs the following:
+ *
+ * i) If Cumulative TSN Ack is less than the Cumulative TSN Ack Point,
+ * then drop the SACK. Since Cumulative TSN Ack is monotonically
+ * increasing, a SACK whose Cumulative TSN Ack is less than the
+ * Cumulative TSN Ack Point indicates an out-of-order SACK.
+ *
+ * ii) Set rwnd equal to the newly received a_rwnd minus the number
+ * of bytes still outstanding after processing the Cumulative TSN Ack
+ * and the Gap Ack Blocks.
+ *
+ * iii) If the SACK is missing a TSN that was previously
+ * acknowledged via a Gap Ack Block (e.g., the data receiver
+ * reneged on the data), then mark the corresponding DATA chunk
+ * as available for retransmit: Mark it as missing for fast
+ * retransmit as described in Section 7.2.4 and if no retransmit
+ * timer is running for the destination address to which the DATA
+ * chunk was originally transmitted, then T3-rtx is started for
+ * that destination address.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_sackhdr_t *sackh;
+ __u32 ctsn;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SACK chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Pull the SACK chunk from the data buffer */
+ sackh = sctp_sm_pull_sack(chunk);
+ /* Was this a bogus SACK? */
+ if (!sackh)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ chunk->subh.sack_hdr = sackh;
+ ctsn = ntohl(sackh->cum_tsn_ack);
+
+ /* i) If Cumulative TSN Ack is less than the Cumulative TSN
+ * Ack Point, then drop the SACK. Since Cumulative TSN
+ * Ack is monotonically increasing, a SACK whose
+ * Cumulative TSN Ack is less than the Cumulative TSN Ack
+ * Point indicates an out-of-order SACK.
+ */
+ if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
+ pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__, ctsn,
+ asoc->ctsn_ack_point);
+
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* If Cumulative TSN Ack beyond the max tsn currently
+ * send, terminating the association and respond to the
+ * sender with an ABORT.
+ */
+ if (!TSN_lt(ctsn, asoc->next_tsn))
+ return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);
+
+ /* Return this SACK for further processing. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk));
+
+ /* Note: We do the rest of the work on the PROCESS_SACK
+ * sideeffect.
+ */
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Generate an ABORT in response to a packet.
+ *
+ * Section: 8.4 Handle "Out of the blue" Packets, sctpimpguide 2.41
+ *
+ * 8) The receiver should respond to the sender of the OOTB packet with
+ * an ABORT. When sending the ABORT, the receiver of the OOTB packet
+ * MUST fill in the Verification Tag field of the outbound packet
+ * with the value found in the Verification Tag field of the OOTB
+ * packet and set the T-bit in the Chunk Flags to indicate that the
+ * Verification Tag is reflected. After sending this ABORT, the
+ * receiver of the OOTB packet shall discard the OOTB packet and take
+ * no further action.
+ *
+ * Verification Tag:
+ *
+ * The return value is the disposition of the chunk.
+*/
+static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_packet *packet = NULL;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *abort;
+
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+
+ if (packet) {
+ /* Make an ABORT. The T bit will be set if the asoc
+ * is NULL.
+ */
+ abort = sctp_make_abort(asoc, chunk, 0);
+ if (!abort) {
+ sctp_ootb_pkt_free(packet);
+ return SCTP_DISPOSITION_NOMEM;
+ }
+
+ /* Reflect vtag if T-Bit is set */
+ if (sctp_test_T_bit(abort))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+ /* Set the skb to the belonging sock for accounting. */
+ abort->skb->sk = ep->base.sk;
+
+ sctp_packet_append_chunk(packet, abort);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ return SCTP_DISPOSITION_CONSUME;
+ }
+
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Received an ERROR chunk from peer. Generate SCTP_REMOTE_ERROR
+ * event as ULP notification for each cause included in the chunk.
+ *
+ * API 5.3.1.3 - SCTP_REMOTE_ERROR
+ *
+ * The return value is the disposition of the chunk.
+*/
+sctp_disposition_t sctp_sf_operr_notify(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ sctp_errhdr_t *err;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ERROR chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+ sctp_walk_errors(err, chunk->chunk_hdr);
+ if ((void *)err != (void *)chunk->chunk_end)
+ return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+ (void *)err, commands);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
+ SCTP_CHUNK(chunk));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an inbound SHUTDOWN ACK.
+ *
+ * From Section 9.2:
+ * Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall
+ * stop the T2-shutdown timer, send a SHUTDOWN COMPLETE chunk to its
+ * peer, and remove all record of the association.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_final(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *reply;
+ struct sctp_ulpevent *ev;
+
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+ /* 10.2 H) SHUTDOWN COMPLETE notification
+ *
+ * When SCTP completes the shutdown procedures (section 9.2) this
+ * notification is passed to the upper layer.
+ */
+ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
+ 0, 0, 0, NULL, GFP_ATOMIC);
+ if (!ev)
+ goto nomem;
+
+ /* ...send a SHUTDOWN COMPLETE chunk to its peer, */
+ reply = sctp_make_shutdown_complete(asoc, chunk);
+ if (!reply)
+ goto nomem_chunk;
+
+ /* Do all the commands now (after allocation), so that we
+ * have consistent state if memory allocation failes
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+ /* Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall
+ * stop the T2-shutdown timer,
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+ SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+ /* ...and remove all record of the association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+ return SCTP_DISPOSITION_DELETE_TCB;
+
+nomem_chunk:
+ sctp_ulpevent_free(ev);
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * RFC 2960, 8.4 - Handle "Out of the blue" Packets, sctpimpguide 2.41.
+ *
+ * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
+ * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
+ * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
+ * packet must fill in the Verification Tag field of the outbound
+ * packet with the Verification Tag received in the SHUTDOWN ACK and
+ * set the T-bit in the Chunk Flags to indicate that the Verification
+ * Tag is reflected.
+ *
+ * 8) The receiver should respond to the sender of the OOTB packet with
+ * an ABORT. When sending the ABORT, the receiver of the OOTB packet
+ * MUST fill in the Verification Tag field of the outbound packet
+ * with the value found in the Verification Tag field of the OOTB
+ * packet and set the T-bit in the Chunk Flags to indicate that the
+ * Verification Tag is reflected. After sending this ABORT, the
+ * receiver of the OOTB packet shall discard the OOTB packet and take
+ * no further action.
+ */
+sctp_disposition_t sctp_sf_ootb(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sk_buff *skb = chunk->skb;
+ sctp_chunkhdr_t *ch;
+ sctp_errhdr_t *err;
+ __u8 *ch_end;
+ int ootb_shut_ack = 0;
+ int ootb_cookie_ack = 0;
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
+
+ ch = (sctp_chunkhdr_t *) chunk->chunk_hdr;
+ do {
+ /* Report violation if the chunk is less then minimal */
+ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Now that we know we at least have a chunk header,
+ * do things that are type appropriate.
+ */
+ if (SCTP_CID_SHUTDOWN_ACK == ch->type)
+ ootb_shut_ack = 1;
+
+ /* RFC 2960, Section 3.3.7
+ * Moreover, under any circumstances, an endpoint that
+ * receives an ABORT MUST NOT respond to that ABORT by
+ * sending an ABORT of its own.
+ */
+ if (SCTP_CID_ABORT == ch->type)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* RFC 8.4, 7) If the packet contains a "Stale cookie" ERROR
+ * or a COOKIE ACK the SCTP Packet should be silently
+ * discarded.
+ */
+
+ if (SCTP_CID_COOKIE_ACK == ch->type)
+ ootb_cookie_ack = 1;
+
+ if (SCTP_CID_ERROR == ch->type) {
+ sctp_walk_errors(err, ch) {
+ if (SCTP_ERROR_STALE_COOKIE == err->cause) {
+ ootb_cookie_ack = 1;
+ break;
+ }
+ }
+ }
+
+ /* Report violation if chunk len overflows */
+ ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+ if (ch_end > skb_tail_pointer(skb))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ ch = (sctp_chunkhdr_t *) ch_end;
+ } while (ch_end < skb_tail_pointer(skb));
+
+ if (ootb_shut_ack)
+ return sctp_sf_shut_8_4_5(net, ep, asoc, type, arg, commands);
+ else if (ootb_cookie_ack)
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ else
+ return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle an "Out of the blue" SHUTDOWN ACK.
+ *
+ * Section: 8.4 5, sctpimpguide 2.41.
+ *
+ * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
+ * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
+ * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
+ * packet must fill in the Verification Tag field of the outbound
+ * packet with the Verification Tag received in the SHUTDOWN ACK and
+ * set the T-bit in the Chunk Flags to indicate that the Verification
+ * Tag is reflected.
+ *
+ * Inputs
+ * (endpoint, asoc, type, arg, commands)
+ *
+ * Outputs
+ * (sctp_disposition_t)
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_packet *packet = NULL;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *shut;
+
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+
+ if (packet) {
+ /* Make an SHUTDOWN_COMPLETE.
+ * The T bit will be set if the asoc is NULL.
+ */
+ shut = sctp_make_shutdown_complete(asoc, chunk);
+ if (!shut) {
+ sctp_ootb_pkt_free(packet);
+ return SCTP_DISPOSITION_NOMEM;
+ }
+
+ /* Reflect vtag if T-Bit is set */
+ if (sctp_test_T_bit(shut))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+ /* Set the skb to the belonging sock for accounting. */
+ shut->skb->sk = ep->base.sk;
+
+ sctp_packet_append_chunk(packet, shut);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ /* If the chunk length is invalid, we don't want to process
+ * the reset of the packet.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* We need to discard the rest of the packet to prevent
+ * potential bomming attacks from additional bundled chunks.
+ * This is documented in SCTP Threats ID.
+ */
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Handle SHUTDOWN ACK in COOKIE_ECHOED or COOKIE_WAIT state.
+ *
+ * Verification Tag: 8.5.1 E) Rules for packet carrying a SHUTDOWN ACK
+ * If the receiver is in COOKIE-ECHOED or COOKIE-WAIT state the
+ * procedures in section 8.4 SHOULD be followed, in other words it
+ * should be treated as an Out Of The Blue packet.
+ * [This means that we do NOT check the Verification Tag on these
+ * chunks. --piggy ]
+ *
+ */
+sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ /* Although we do have an association in this case, it corresponds
+ * to a restarted association. So the packet is treated as an OOTB
+ * packet and the state function that handles OOTB SHUTDOWN_ACK is
+ * called with a NULL association.
+ */
+ SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
+
+ return sctp_sf_shut_8_4_5(net, ep, NULL, type, arg, commands);
+}
+
+/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. */
+sctp_disposition_t sctp_sf_do_asconf(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *asconf_ack = NULL;
+ struct sctp_paramhdr *err_param = NULL;
+ sctp_addiphdr_t *hdr;
+ __u32 serial;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* ADD-IP: Section 4.1.1
+ * This chunk MUST be sent in an authenticated way by using
+ * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
+ * is received unauthenticated it MUST be silently discarded as
+ * described in [I-D.ietf-tsvwg-sctp-auth].
+ */
+ if (!net->sctp.addip_noauth && !chunk->auth)
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ASCONF ADDIP chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ hdr = (sctp_addiphdr_t *)chunk->skb->data;
+ serial = ntohl(hdr->serial);
+
+ /* Verify the ASCONF chunk before processing it. */
+ if (!sctp_verify_asconf(asoc, chunk, true, &err_param))
+ return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+ (void *)err_param, commands);
+
+ /* ADDIP 5.2 E1) Compare the value of the serial number to the value
+ * the endpoint stored in a new association variable
+ * 'Peer-Serial-Number'.
+ */
+ if (serial == asoc->peer.addip_serial + 1) {
+ /* If this is the first instance of ASCONF in the packet,
+ * we can clean our old ASCONF-ACKs.
+ */
+ if (!chunk->has_asconf)
+ sctp_assoc_clean_asconf_ack_cache(asoc);
+
+ /* ADDIP 5.2 E4) When the Sequence Number matches the next one
+ * expected, process the ASCONF as described below and after
+ * processing the ASCONF Chunk, append an ASCONF-ACK Chunk to
+ * the response packet and cache a copy of it (in the event it
+ * later needs to be retransmitted).
+ *
+ * Essentially, do V1-V5.
+ */
+ asconf_ack = sctp_process_asconf((struct sctp_association *)
+ asoc, chunk);
+ if (!asconf_ack)
+ return SCTP_DISPOSITION_NOMEM;
+ } else if (serial < asoc->peer.addip_serial + 1) {
+ /* ADDIP 5.2 E2)
+ * If the value found in the Sequence Number is less than the
+ * ('Peer- Sequence-Number' + 1), simply skip to the next
+ * ASCONF, and include in the outbound response packet
+ * any previously cached ASCONF-ACK response that was
+ * sent and saved that matches the Sequence Number of the
+ * ASCONF. Note: It is possible that no cached ASCONF-ACK
+ * Chunk exists. This will occur when an older ASCONF
+ * arrives out of order. In such a case, the receiver
+ * should skip the ASCONF Chunk and not include ASCONF-ACK
+ * Chunk for that chunk.
+ */
+ asconf_ack = sctp_assoc_lookup_asconf_ack(asoc, hdr->serial);
+ if (!asconf_ack)
+ return SCTP_DISPOSITION_DISCARD;
+
+ /* Reset the transport so that we select the correct one
+ * this time around. This is to make sure that we don't
+ * accidentally use a stale transport that's been removed.
+ */
+ asconf_ack->transport = NULL;
+ } else {
+ /* ADDIP 5.2 E5) Otherwise, the ASCONF Chunk is discarded since
+ * it must be either a stale packet or from an attacker.
+ */
+ return SCTP_DISPOSITION_DISCARD;
+ }
+
+ /* ADDIP 5.2 E6) The destination address of the SCTP packet
+ * containing the ASCONF-ACK Chunks MUST be the source address of
+ * the SCTP packet that held the ASCONF Chunks.
+ *
+ * To do this properly, we'll set the destination address of the chunk
+ * and at the transmit time, will try look up the transport to use.
+ * Since ASCONFs may be bundled, the correct transport may not be
+ * created until we process the entire packet, thus this workaround.
+ */
+ asconf_ack->dest = chunk->source;
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack));
+ if (asoc->new_transport) {
+ sctp_sf_heartbeat(ep, asoc, type, asoc->new_transport, commands);
+ ((struct sctp_association *)asoc)->new_transport = NULL;
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * ADDIP Section 4.3 General rules for address manipulation
+ * When building TLV parameters for the ASCONF Chunk that will add or
+ * delete IP addresses the D0 to D13 rules should be applied:
+ */
+sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type, void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *asconf_ack = arg;
+ struct sctp_chunk *last_asconf = asoc->addip_last_asconf;
+ struct sctp_chunk *abort;
+ struct sctp_paramhdr *err_param = NULL;
+ sctp_addiphdr_t *addip_hdr;
+ __u32 sent_serial, rcvd_serial;
+
+ if (!sctp_vtag_verify(asconf_ack, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* ADD-IP, Section 4.1.2:
+ * This chunk MUST be sent in an authenticated way by using
+ * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
+ * is received unauthenticated it MUST be silently discarded as
+ * described in [I-D.ietf-tsvwg-sctp-auth].
+ */
+ if (!net->sctp.addip_noauth && !asconf_ack->auth)
+ return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the ADDIP chunk has a valid length. */
+ if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ addip_hdr = (sctp_addiphdr_t *)asconf_ack->skb->data;
+ rcvd_serial = ntohl(addip_hdr->serial);
+
+ /* Verify the ASCONF-ACK chunk before processing it. */
+ if (!sctp_verify_asconf(asoc, asconf_ack, false, &err_param))
+ return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
+ (void *)err_param, commands);
+
+ if (last_asconf) {
+ addip_hdr = (sctp_addiphdr_t *)last_asconf->subh.addip_hdr;
+ sent_serial = ntohl(addip_hdr->serial);
+ } else {
+ sent_serial = asoc->addip_serial - 1;
+ }
+
+ /* D0) If an endpoint receives an ASCONF-ACK that is greater than or
+ * equal to the next serial number to be used but no ASCONF chunk is
+ * outstanding the endpoint MUST ABORT the association. Note that a
+ * sequence number is greater than if it is no more than 2^^31-1
+ * larger than the current sequence number (using serial arithmetic).
+ */
+ if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) &&
+ !(asoc->addip_last_asconf)) {
+ abort = sctp_make_abort(asoc, asconf_ack,
+ sizeof(sctp_errhdr_t));
+ if (abort) {
+ sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, 0);
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(abort));
+ }
+ /* We are going to ABORT, so we might as well stop
+ * processing the rest of the chunks in the packet.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_ABORT;
+ }
+
+ if ((rcvd_serial == sent_serial) && asoc->addip_last_asconf) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+
+ if (!sctp_process_asconf_ack((struct sctp_association *)asoc,
+ asconf_ack)) {
+ /* Successfully processed ASCONF_ACK. We can
+ * release the next asconf if we have one.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF,
+ SCTP_NULL());
+ return SCTP_DISPOSITION_CONSUME;
+ }
+
+ abort = sctp_make_abort(asoc, asconf_ack,
+ sizeof(sctp_errhdr_t));
+ if (abort) {
+ sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(abort));
+ }
+ /* We are going to ABORT, so we might as well stop
+ * processing the rest of the chunks in the packet.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_ABORT;
+ }
+
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/*
+ * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
+ *
+ * When a FORWARD TSN chunk arrives, the data receiver MUST first update
+ * its cumulative TSN point to the value carried in the FORWARD TSN
+ * chunk, and then MUST further advance its cumulative TSN point locally
+ * if possible.
+ * After the above processing, the data receiver MUST stop reporting any
+ * missing TSNs earlier than or equal to the new cumulative TSN point.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_fwdtsn_hdr *fwdtsn_hdr;
+ struct sctp_fwdtsn_skip *skip;
+ __u16 len;
+ __u32 tsn;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* Make sure that the FORWARD_TSN chunk has valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
+ chunk->subh.fwdtsn_hdr = fwdtsn_hdr;
+ len = ntohs(chunk->chunk_hdr->length);
+ len -= sizeof(struct sctp_chunkhdr);
+ skb_pull(chunk->skb, len);
+
+ tsn = ntohl(fwdtsn_hdr->new_cum_tsn);
+ pr_debug("%s: TSN 0x%x\n", __func__, tsn);
+
+ /* The TSN is too high--silently discard the chunk and count on it
+ * getting retransmitted later.
+ */
+ if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
+ goto discard_noforce;
+
+ /* Silently discard the chunk if stream-id is not valid */
+ sctp_walk_fwdtsn(skip, chunk) {
+ if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+ goto discard_noforce;
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
+ if (len > sizeof(struct sctp_fwdtsn_hdr))
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
+ SCTP_CHUNK(chunk));
+
+ /* Count this as receiving DATA. */
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+ }
+
+ /* FIXME: For now send a SACK, but DATA processing may
+ * send another.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
+
+ return SCTP_DISPOSITION_CONSUME;
+
+discard_noforce:
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_fwdtsn_hdr *fwdtsn_hdr;
+ struct sctp_fwdtsn_skip *skip;
+ __u16 len;
+ __u32 tsn;
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* Make sure that the FORWARD_TSN chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
+ chunk->subh.fwdtsn_hdr = fwdtsn_hdr;
+ len = ntohs(chunk->chunk_hdr->length);
+ len -= sizeof(struct sctp_chunkhdr);
+ skb_pull(chunk->skb, len);
+
+ tsn = ntohl(fwdtsn_hdr->new_cum_tsn);
+ pr_debug("%s: TSN 0x%x\n", __func__, tsn);
+
+ /* The TSN is too high--silently discard the chunk and count on it
+ * getting retransmitted later.
+ */
+ if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
+ goto gen_shutdown;
+
+ /* Silently discard the chunk if stream-id is not valid */
+ sctp_walk_fwdtsn(skip, chunk) {
+ if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+ goto gen_shutdown;
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
+ if (len > sizeof(struct sctp_fwdtsn_hdr))
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
+ SCTP_CHUNK(chunk));
+
+ /* Go a head and force a SACK, since we are shutting down. */
+gen_shutdown:
+ /* Implementor's Guide.
+ *
+ * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately
+ * respond to each received packet containing one or more DATA chunk(s)
+ * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * SCTP-AUTH Section 6.3 Receiving authenticated chukns
+ *
+ * The receiver MUST use the HMAC algorithm indicated in the HMAC
+ * Identifier field. If this algorithm was not specified by the
+ * receiver in the HMAC-ALGO parameter in the INIT or INIT-ACK chunk
+ * during association setup, the AUTH chunk and all chunks after it MUST
+ * be discarded and an ERROR chunk SHOULD be sent with the error cause
+ * defined in Section 4.1.
+ *
+ * If an endpoint with no shared key receives a Shared Key Identifier
+ * other than 0, it MUST silently discard all authenticated chunks. If
+ * the endpoint has at least one endpoint pair shared key for the peer,
+ * it MUST use the key specified by the Shared Key Identifier if a
+ * key has been configured for that Shared Key Identifier. If no
+ * endpoint pair shared key has been configured for that Shared Key
+ * Identifier, all authenticated chunks MUST be silently discarded.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_ierror_t sctp_sf_authenticate(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ struct sctp_chunk *chunk)
+{
+ struct sctp_authhdr *auth_hdr;
+ struct sctp_hmac *hmac;
+ unsigned int sig_len;
+ __u16 key_id;
+ __u8 *save_digest;
+ __u8 *digest;
+
+ /* Pull in the auth header, so we can do some more verification */
+ auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
+ chunk->subh.auth_hdr = auth_hdr;
+ skb_pull(chunk->skb, sizeof(struct sctp_authhdr));
+
+ /* Make sure that we support the HMAC algorithm from the auth
+ * chunk.
+ */
+ if (!sctp_auth_asoc_verify_hmac_id(asoc, auth_hdr->hmac_id))
+ return SCTP_IERROR_AUTH_BAD_HMAC;
+
+ /* Make sure that the provided shared key identifier has been
+ * configured
+ */
+ key_id = ntohs(auth_hdr->shkey_id);
+ if (key_id != asoc->active_key_id && !sctp_auth_get_shkey(asoc, key_id))
+ return SCTP_IERROR_AUTH_BAD_KEYID;
+
+
+ /* Make sure that the length of the signature matches what
+ * we expect.
+ */
+ sig_len = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_auth_chunk_t);
+ hmac = sctp_auth_get_hmac(ntohs(auth_hdr->hmac_id));
+ if (sig_len != hmac->hmac_len)
+ return SCTP_IERROR_PROTO_VIOLATION;
+
+ /* Now that we've done validation checks, we can compute and
+ * verify the hmac. The steps involved are:
+ * 1. Save the digest from the chunk.
+ * 2. Zero out the digest in the chunk.
+ * 3. Compute the new digest
+ * 4. Compare saved and new digests.
+ */
+ digest = auth_hdr->hmac;
+ skb_pull(chunk->skb, sig_len);
+
+ save_digest = kmemdup(digest, sig_len, GFP_ATOMIC);
+ if (!save_digest)
+ goto nomem;
+
+ memset(digest, 0, sig_len);
+
+ sctp_auth_calculate_hmac(asoc, chunk->skb,
+ (struct sctp_auth_chunk *)chunk->chunk_hdr,
+ GFP_ATOMIC);
+
+ /* Discard the packet if the digests do not match */
+ if (memcmp(save_digest, digest, sig_len)) {
+ kfree(save_digest);
+ return SCTP_IERROR_BAD_SIG;
+ }
+
+ kfree(save_digest);
+ chunk->auth = 1;
+
+ return SCTP_IERROR_NO_ERROR;
+nomem:
+ return SCTP_IERROR_NOMEM;
+}
+
+sctp_disposition_t sctp_sf_eat_auth(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_authhdr *auth_hdr;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *err_chunk;
+ sctp_ierror_t error;
+
+ /* Make sure that the peer has AUTH capable */
+ if (!asoc->peer.auth_capable)
+ return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);
+
+ if (!sctp_vtag_verify(chunk, asoc)) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+ SCTP_NULL());
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
+
+ /* Make sure that the AUTH chunk has valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_auth_chunk)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
+ error = sctp_sf_authenticate(net, ep, asoc, type, chunk);
+ switch (error) {
+ case SCTP_IERROR_AUTH_BAD_HMAC:
+ /* Generate the ERROR chunk and discard the rest
+ * of the packet
+ */
+ err_chunk = sctp_make_op_error(asoc, chunk,
+ SCTP_ERROR_UNSUP_HMAC,
+ &auth_hdr->hmac_id,
+ sizeof(__u16), 0);
+ if (err_chunk) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err_chunk));
+ }
+ /* Fall Through */
+ case SCTP_IERROR_AUTH_BAD_KEYID:
+ case SCTP_IERROR_BAD_SIG:
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ case SCTP_IERROR_PROTO_VIOLATION:
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ case SCTP_IERROR_NOMEM:
+ return SCTP_DISPOSITION_NOMEM;
+
+ default: /* Prevent gcc warnings */
+ break;
+ }
+
+ if (asoc->active_key_id != ntohs(auth_hdr->shkey_id)) {
+ struct sctp_ulpevent *ev;
+
+ ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id),
+ SCTP_AUTH_NEWKEY, GFP_ATOMIC);
+
+ if (!ev)
+ return -ENOMEM;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+ SCTP_ULPEVENT(ev));
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an unknown chunk.
+ *
+ * Section: 3.2. Also, 2.1 in the implementor's guide.
+ *
+ * Chunk Types are encoded such that the highest-order two bits specify
+ * the action that must be taken if the processing endpoint does not
+ * recognize the Chunk Type.
+ *
+ * 00 - Stop processing this SCTP packet and discard it, do not process
+ * any further chunks within it.
+ *
+ * 01 - Stop processing this SCTP packet and discard it, do not process
+ * any further chunks within it, and report the unrecognized
+ * chunk in an 'Unrecognized Chunk Type'.
+ *
+ * 10 - Skip this chunk and continue processing.
+ *
+ * 11 - Skip this chunk and continue processing, but report in an ERROR
+ * Chunk using the 'Unrecognized Chunk Type' cause of error.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_unk_chunk(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *unk_chunk = arg;
+ struct sctp_chunk *err_chunk;
+ sctp_chunkhdr_t *hdr;
+
+ pr_debug("%s: processing unknown chunk id:%d\n", __func__, type.chunk);
+
+ if (!sctp_vtag_verify(unk_chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the chunk has a valid length.
+ * Since we don't know the chunk type, we use a general
+ * chunkhdr structure to make a comparison.
+ */
+ if (!sctp_chunk_length_valid(unk_chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ switch (type.chunk & SCTP_CID_ACTION_MASK) {
+ case SCTP_CID_ACTION_DISCARD:
+ /* Discard the packet. */
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ case SCTP_CID_ACTION_DISCARD_ERR:
+ /* Generate an ERROR chunk as response. */
+ hdr = unk_chunk->chunk_hdr;
+ err_chunk = sctp_make_op_error(asoc, unk_chunk,
+ SCTP_ERROR_UNKNOWN_CHUNK, hdr,
+ WORD_ROUND(ntohs(hdr->length)),
+ 0);
+ if (err_chunk) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err_chunk));
+ }
+
+ /* Discard the packet. */
+ sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ return SCTP_DISPOSITION_CONSUME;
+ case SCTP_CID_ACTION_SKIP:
+ /* Skip the chunk. */
+ return SCTP_DISPOSITION_DISCARD;
+ case SCTP_CID_ACTION_SKIP_ERR:
+ /* Generate an ERROR chunk as response. */
+ hdr = unk_chunk->chunk_hdr;
+ err_chunk = sctp_make_op_error(asoc, unk_chunk,
+ SCTP_ERROR_UNKNOWN_CHUNK, hdr,
+ WORD_ROUND(ntohs(hdr->length)),
+ 0);
+ if (err_chunk) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err_chunk));
+ }
+ /* Skip the chunk. */
+ return SCTP_DISPOSITION_CONSUME;
+ default:
+ break;
+ }
+
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/*
+ * Discard the chunk.
+ *
+ * Section: 0.2, 5.2.3, 5.2.5, 5.2.6, 6.0, 8.4.6, 8.5.1c, 9.2
+ * [Too numerous to mention...]
+ * Verification Tag: No verification needed.
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_discard_chunk(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ /* Make sure that the chunk has a valid length.
+ * Since we don't know the chunk type, we use a general
+ * chunkhdr structure to make a comparison.
+ */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ pr_debug("%s: chunk:%d is discarded\n", __func__, type.chunk);
+
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/*
+ * Discard the whole packet.
+ *
+ * Section: 8.4 2)
+ *
+ * 2) If the OOTB packet contains an ABORT chunk, the receiver MUST
+ * silently discard the OOTB packet and take no further action.
+ *
+ * Verification Tag: No verification necessary
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_pdiscard(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+
+/*
+ * The other end is violating protocol.
+ *
+ * Section: Not specified
+ * Verification Tag: Not specified
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * We simply tag the chunk as a violation. The state machine will log
+ * the violation and continue.
+ */
+sctp_disposition_t sctp_sf_violation(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ /* Make sure that the chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+
+ return SCTP_DISPOSITION_VIOLATION;
+}
+
+/*
+ * Common function to handle a protocol violation.
+ */
+static sctp_disposition_t sctp_sf_abort_violation(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ void *arg,
+ sctp_cmd_seq_t *commands,
+ const __u8 *payload,
+ const size_t paylen)
+{
+ struct sctp_packet *packet = NULL;
+ struct sctp_chunk *chunk = arg;
+ struct sctp_chunk *abort = NULL;
+
+ /* SCTP-AUTH, Section 6.3:
+ * It should be noted that if the receiver wants to tear
+ * down an association in an authenticated way only, the
+ * handling of malformed packets should not result in
+ * tearing down the association.
+ *
+ * This means that if we only want to abort associations
+ * in an authenticated way (i.e AUTH+ABORT), then we
+ * can't destroy this association just because the packet
+ * was malformed.
+ */
+ if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
+ goto discard;
+
+ /* Make the abort chunk. */
+ abort = sctp_make_abort_violation(asoc, chunk, payload, paylen);
+ if (!abort)
+ goto nomem;
+
+ if (asoc) {
+ /* Treat INIT-ACK as a special case during COOKIE-WAIT. */
+ if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK &&
+ !asoc->peer.i.init_tag) {
+ sctp_initack_chunk_t *initack;
+
+ initack = (sctp_initack_chunk_t *)chunk->chunk_hdr;
+ if (!sctp_chunk_length_valid(chunk,
+ sizeof(sctp_initack_chunk_t)))
+ abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T;
+ else {
+ unsigned int inittag;
+
+ inittag = ntohl(initack->init_hdr.init_tag);
+ sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_INITTAG,
+ SCTP_U32(inittag));
+ }
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ if (asoc->state <= SCTP_STATE_COOKIE_ECHOED) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNREFUSED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+ } else {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ }
+ } else {
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+
+ if (!packet)
+ goto nomem_pkt;
+
+ if (sctp_test_T_bit(abort))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+ abort->skb->sk = ep->base.sk;
+
+ sctp_packet_append_chunk(packet, abort);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ }
+
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+
+discard:
+ sctp_sf_pdiscard(net, ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
+ return SCTP_DISPOSITION_ABORT;
+
+nomem_pkt:
+ sctp_chunk_free(abort);
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Handle a protocol violation when the chunk length is invalid.
+ * "Invalid" length is identified as smaller than the minimal length a
+ * given chunk can be. For example, a SACK chunk has invalid length
+ * if its length is set to be smaller than the size of sctp_sack_chunk_t.
+ *
+ * We inform the other end by sending an ABORT with a Protocol Violation
+ * error code.
+ *
+ * Section: Not specified
+ * Verification Tag: Nothing to do
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (reply_msg, msg_up, counters)
+ *
+ * Generate an ABORT chunk and terminate the association.
+ */
+static sctp_disposition_t sctp_sf_violation_chunklen(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ static const char err_str[] = "The following chunk had invalid length:";
+
+ return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str,
+ sizeof(err_str));
+}
+
+/*
+ * Handle a protocol violation when the parameter length is invalid.
+ * If the length is smaller than the minimum length of a given parameter,
+ * or accumulated length in multi parameters exceeds the end of the chunk,
+ * the length is considered as invalid.
+ */
+static sctp_disposition_t sctp_sf_violation_paramlen(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, void *ext,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+ struct sctp_paramhdr *param = ext;
+ struct sctp_chunk *abort = NULL;
+
+ if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
+ goto discard;
+
+ /* Make the abort chunk. */
+ abort = sctp_make_violation_paramlen(asoc, chunk, param);
+ if (!abort)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+
+discard:
+ sctp_sf_pdiscard(net, ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
+ return SCTP_DISPOSITION_ABORT;
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Handle a protocol violation when the peer trying to advance the
+ * cumulative tsn ack to a point beyond the max tsn currently sent.
+ *
+ * We inform the other end by sending an ABORT with a Protocol Violation
+ * error code.
+ */
+static sctp_disposition_t sctp_sf_violation_ctsn(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ static const char err_str[] = "The cumulative tsn ack beyond the max tsn currently sent:";
+
+ return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str,
+ sizeof(err_str));
+}
+
+/* Handle protocol violation of an invalid chunk bundling. For example,
+ * when we have an association and we receive bundled INIT-ACK, or
+ * SHUDOWN-COMPLETE, our peer is clearly violationg the "MUST NOT bundle"
+ * statement from the specs. Additionally, there might be an attacker
+ * on the path and we may not want to continue this communication.
+ */
+static sctp_disposition_t sctp_sf_violation_chunk(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ static const char err_str[] = "The following chunk violates protocol:";
+
+ if (!asoc)
+ return sctp_sf_violation(net, ep, asoc, type, arg, commands);
+
+ return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str,
+ sizeof(err_str));
+}
+/***************************************************************************
+ * These are the state functions for handling primitive (Section 10) events.
+ ***************************************************************************/
+/*
+ * sctp_sf_do_prm_asoc
+ *
+ * Section: 10.1 ULP-to-SCTP
+ * B) Associate
+ *
+ * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
+ * outbound stream count)
+ * -> association id [,destination transport addr list] [,outbound stream
+ * count]
+ *
+ * This primitive allows the upper layer to initiate an association to a
+ * specific peer endpoint.
+ *
+ * The peer endpoint shall be specified by one of the transport addresses
+ * which defines the endpoint (see Section 1.4). If the local SCTP
+ * instance has not been initialized, the ASSOCIATE is considered an
+ * error.
+ * [This is not relevant for the kernel implementation since we do all
+ * initialization at boot time. It we hadn't initialized we wouldn't
+ * get anywhere near this code.]
+ *
+ * An association id, which is a local handle to the SCTP association,
+ * will be returned on successful establishment of the association. If
+ * SCTP is not able to open an SCTP association with the peer endpoint,
+ * an error is returned.
+ * [In the kernel implementation, the struct sctp_association needs to
+ * be created BEFORE causing this primitive to run.]
+ *
+ * Other association parameters may be returned, including the
+ * complete destination transport addresses of the peer as well as the
+ * outbound stream count of the local endpoint. One of the transport
+ * address from the returned destination addresses will be selected by
+ * the local endpoint as default primary path for sending SCTP packets
+ * to this peer. The returned "destination transport addr list" can
+ * be used by the ULP to change the default primary path or to force
+ * sending a packet to a specific transport address. [All of this
+ * stuff happens when the INIT ACK arrives. This is a NON-BLOCKING
+ * function.]
+ *
+ * Mandatory attributes:
+ *
+ * o local SCTP instance name - obtained from the INITIALIZE operation.
+ * [This is the argument asoc.]
+ * o destination transport addr - specified as one of the transport
+ * addresses of the peer endpoint with which the association is to be
+ * established.
+ * [This is asoc->peer.active_path.]
+ * o outbound stream count - the number of outbound streams the ULP
+ * would like to open towards this peer endpoint.
+ * [BUG: This is not currently implemented.]
+ * Optional attributes:
+ *
+ * None.
+ *
+ * The return value is a disposition.
+ */
+sctp_disposition_t sctp_sf_do_prm_asoc(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *repl;
+ struct sctp_association *my_asoc;
+
+ /* The comment below says that we enter COOKIE-WAIT AFTER
+ * sending the INIT, but that doesn't actually work in our
+ * implementation...
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_COOKIE_WAIT));
+
+ /* RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * A) "A" first sends an INIT chunk to "Z". In the INIT, "A"
+ * must provide its Verification Tag (Tag_A) in the Initiate
+ * Tag field. Tag_A SHOULD be a random number in the range of
+ * 1 to 4294967295 (see 5.3.1 for Tag value selection). ...
+ */
+
+ repl = sctp_make_init(asoc, &asoc->base.bind_addr, GFP_ATOMIC, 0);
+ if (!repl)
+ goto nomem;
+
+ /* Choose transport for INIT. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+ SCTP_CHUNK(repl));
+
+ /* Cast away the const modifier, as we want to just
+ * rerun it through as a sideffect.
+ */
+ my_asoc = (struct sctp_association *)asoc;
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(my_asoc));
+
+ /* After sending the INIT, "A" starts the T1-init timer and
+ * enters the COOKIE-WAIT state.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process the SEND primitive.
+ *
+ * Section: 10.1 ULP-to-SCTP
+ * E) Send
+ *
+ * Format: SEND(association id, buffer address, byte count [,context]
+ * [,stream id] [,life time] [,destination transport address]
+ * [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
+ * -> result
+ *
+ * This is the main method to send user data via SCTP.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o buffer address - the location where the user message to be
+ * transmitted is stored;
+ *
+ * o byte count - The size of the user data in number of bytes;
+ *
+ * Optional attributes:
+ *
+ * o context - an optional 32 bit integer that will be carried in the
+ * sending failure notification to the ULP if the transportation of
+ * this User Message fails.
+ *
+ * o stream id - to indicate which stream to send the data on. If not
+ * specified, stream 0 will be used.
+ *
+ * o life time - specifies the life time of the user data. The user data
+ * will not be sent by SCTP after the life time expires. This
+ * parameter can be used to avoid efforts to transmit stale
+ * user messages. SCTP notifies the ULP if the data cannot be
+ * initiated to transport (i.e. sent to the destination via SCTP's
+ * send primitive) within the life time variable. However, the
+ * user data will be transmitted if SCTP has attempted to transmit a
+ * chunk before the life time expired.
+ *
+ * o destination transport address - specified as one of the destination
+ * transport addresses of the peer endpoint to which this packet
+ * should be sent. Whenever possible, SCTP should use this destination
+ * transport address for sending the packets, instead of the current
+ * primary path.
+ *
+ * o unorder flag - this flag, if present, indicates that the user
+ * would like the data delivered in an unordered fashion to the peer
+ * (i.e., the U flag is set to 1 on all DATA chunks carrying this
+ * message).
+ *
+ * o no-bundle flag - instructs SCTP not to bundle this user data with
+ * other outbound DATA chunks. SCTP MAY still bundle even when
+ * this flag is present, when faced with network congestion.
+ *
+ * o payload protocol-id - A 32 bit unsigned integer that is to be
+ * passed to the peer indicating the type of payload protocol data
+ * being transmitted. This value is passed as opaque data by SCTP.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_prm_send(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_datamsg *msg = arg;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_MSG, SCTP_DATAMSG(msg));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process the SHUTDOWN primitive.
+ *
+ * Section: 10.1:
+ * C) Shutdown
+ *
+ * Format: SHUTDOWN(association id)
+ * -> result
+ *
+ * Gracefully closes an association. Any locally queued user data
+ * will be delivered to the peer. The association will be terminated only
+ * after the peer acknowledges all the SCTP packets sent. A success code
+ * will be returned on successful termination of the association. If
+ * attempting to terminate the association results in a failure, an error
+ * code shall be returned.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * Optional attributes:
+ *
+ * None.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_prm_shutdown(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ int disposition;
+
+ /* From 9.2 Shutdown of an Association
+ * Upon receipt of the SHUTDOWN primitive from its upper
+ * layer, the endpoint enters SHUTDOWN-PENDING state and
+ * remains there until all outstanding data has been
+ * acknowledged by its peer. The endpoint accepts no new data
+ * from its upper layer, but retransmits data to the far end
+ * if necessary to fill gaps.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
+
+ disposition = SCTP_DISPOSITION_CONSUME;
+ if (sctp_outq_is_empty(&asoc->outqueue)) {
+ disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type,
+ arg, commands);
+ }
+ return disposition;
+}
+
+/*
+ * Process the ABORT primitive.
+ *
+ * Section: 10.1:
+ * C) Abort
+ *
+ * Format: Abort(association id [, cause code])
+ * -> result
+ *
+ * Ungracefully closes an association. Any locally queued user data
+ * will be discarded and an ABORT chunk is sent to the peer. A success code
+ * will be returned on successful abortion of the association. If
+ * attempting to abort the association results in a failure, an error
+ * code shall be returned.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * Optional attributes:
+ *
+ * o cause code - reason of the abort to be passed to the peer
+ *
+ * None.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_1_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* From 9.1 Abort of an Association
+ * Upon receipt of the ABORT primitive from its upper
+ * layer, the endpoint enters CLOSED state and
+ * discard all outstanding data has been
+ * acknowledged by its peer. The endpoint accepts no new data
+ * from its upper layer, but retransmits data to the far end
+ * if necessary to fill gaps.
+ */
+ struct sctp_chunk *abort = arg;
+ sctp_disposition_t retval;
+
+ retval = SCTP_DISPOSITION_CONSUME;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+
+ /* Even if we can't send the ABORT due to low memory delete the
+ * TCB. This is a departure from our typical NOMEM handling.
+ */
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ /* Delete the established association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_USER_ABORT));
+
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+ return retval;
+}
+
+/* We tried an illegal operation on an association which is closed. */
+sctp_disposition_t sctp_sf_error_closed(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-EINVAL));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* We tried an illegal operation on an association which is shutting
+ * down.
+ */
+sctp_disposition_t sctp_sf_error_shutdown(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR,
+ SCTP_ERROR(-ESHUTDOWN));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_cookie_wait_prm_shutdown
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue, but is the route through the
+ * state table when someone issues a shutdown while in COOKIE_WAIT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+
+ SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+ return SCTP_DISPOSITION_DELETE_TCB;
+}
+
+/*
+ * sctp_cookie_echoed_prm_shutdown
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explcitly address this issue, but is the route through the
+ * state table when someone issues a shutdown while in COOKIE_ECHOED state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg, sctp_cmd_seq_t *commands)
+{
+ /* There is a single T1 timer, so we should be able to use
+ * common function with the COOKIE-WAIT state.
+ */
+ return sctp_sf_cookie_wait_prm_shutdown(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_cookie_wait_prm_abort
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue, but is the route through the
+ * state table when someone issues an abort while in COOKIE_WAIT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *abort = arg;
+ sctp_disposition_t retval;
+
+ /* Stop T1-init timer */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ retval = SCTP_DISPOSITION_CONSUME;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_CLOSED));
+
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+
+ /* Even if we can't send the ABORT due to low memory delete the
+ * TCB. This is a departure from our typical NOMEM handling.
+ */
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNREFUSED));
+ /* Delete the established association. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(SCTP_ERROR_USER_ABORT));
+
+ return retval;
+}
+
+/*
+ * sctp_sf_cookie_echoed_prm_abort
+ *
+ * Section: 4 Note: 3
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explcitly address this issue, but is the route through the
+ * state table when someone issues an abort while in COOKIE_ECHOED state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* There is a single T1 timer, so we should be able to use
+ * common function with the COOKIE-WAIT state.
+ */
+ return sctp_sf_cookie_wait_prm_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_shutdown_pending_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue, but is the route through the
+ * state table when someone issues an abort while in SHUTDOWN-PENDING state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_pending_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* Stop the T5-shutdown guard timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ return sctp_sf_do_9_1_prm_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_shutdown_sent_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue, but is the route through the
+ * state table when someone issues an abort while in SHUTDOWN-SENT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_sent_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* Stop the T2-shutdown timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ /* Stop the T5-shutdown guard timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ return sctp_sf_do_9_1_prm_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_cookie_echoed_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explcitly address this issue, but is the route through the
+ * state table when someone issues an abort while in COOKIE_ECHOED state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ /* The same T2 timer, so we should be able to use
+ * common function with the SHUTDOWN-SENT state.
+ */
+ return sctp_sf_shutdown_sent_prm_abort(net, ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process the REQUESTHEARTBEAT primitive
+ *
+ * 10.1 ULP-to-SCTP
+ * J) Request Heartbeat
+ *
+ * Format: REQUESTHEARTBEAT(association id, destination transport address)
+ *
+ * -> result
+ *
+ * Instructs the local endpoint to perform a HeartBeat on the specified
+ * destination transport address of the given association. The returned
+ * result should indicate whether the transmission of the HEARTBEAT
+ * chunk to the destination address is successful.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o destination transport address - the transport address of the
+ * association on which a heartbeat should be issued.
+ */
+sctp_disposition_t sctp_sf_do_prm_requestheartbeat(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type,
+ (struct sctp_transport *)arg, commands))
+ return SCTP_DISPOSITION_NOMEM;
+
+ /*
+ * RFC 2960 (bis), section 8.3
+ *
+ * D) Request an on-demand HEARTBEAT on a specific destination
+ * transport address of a given association.
+ *
+ * The endpoint should increment the respective error counter of
+ * the destination transport address each time a HEARTBEAT is sent
+ * to that address and not acknowledged within one RTO.
+ *
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
+ SCTP_TRANSPORT(arg));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF Chunk Procedures
+ * When an endpoint has an ASCONF signaled change to be sent to the
+ * remote endpoint it should do A1 to A9
+ */
+sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = arg;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk));
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Ignore the primitive event
+ *
+ * The return value is the disposition of the primitive.
+ */
+sctp_disposition_t sctp_sf_ignore_primitive(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ pr_debug("%s: primitive type:%d is ignored\n", __func__,
+ type.primitive);
+
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/***************************************************************************
+ * These are the state functions for the OTHER events.
+ ***************************************************************************/
+
+/*
+ * When the SCTP stack has no more user data to send or retransmit, this
+ * notification is given to the user. Also, at the time when a user app
+ * subscribes to this event, if there is no data to be sent or
+ * retransmit, the stack will immediately send up this notification.
+ */
+sctp_disposition_t sctp_sf_do_no_pending_tsn(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_ulpevent *event;
+
+ event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_ATOMIC);
+ if (!event)
+ return SCTP_DISPOSITION_NOMEM;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Start the shutdown negotiation.
+ *
+ * From Section 9.2:
+ * Once all its outstanding data has been acknowledged, the endpoint
+ * shall send a SHUTDOWN chunk to its peer including in the Cumulative
+ * TSN Ack field the last sequential TSN it has received from the peer.
+ * It shall then start the T2-shutdown timer and enter the SHUTDOWN-SENT
+ * state. If the timer expires, the endpoint must re-send the SHUTDOWN
+ * with the updated last sequential TSN received from its peer.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_start_shutdown(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *reply;
+
+ /* Once all its outstanding data has been acknowledged, the
+ * endpoint shall send a SHUTDOWN chunk to its peer including
+ * in the Cumulative TSN Ack field the last sequential TSN it
+ * has received from the peer.
+ */
+ reply = sctp_make_shutdown(asoc, NULL);
+ if (!reply)
+ goto nomem;
+
+ /* Set the transport for the SHUTDOWN chunk and the timeout for the
+ * T2-shutdown timer.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+ /* It shall then start the T2-shutdown timer */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ /* RFC 4960 Section 9.2
+ * The sender of the SHUTDOWN MAY also start an overall guard timer
+ * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+ /* and enter the SHUTDOWN-SENT state. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_SHUTDOWN_SENT));
+
+ /* sctp-implguide 2.10 Issues with Heartbeating and failover
+ *
+ * HEARTBEAT ... is discontinued after sending either SHUTDOWN
+ * or SHUTDOWN-ACK.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Generate a SHUTDOWN ACK now that everything is SACK'd.
+ *
+ * From Section 9.2:
+ *
+ * If it has no more outstanding DATA chunks, the SHUTDOWN receiver
+ * shall send a SHUTDOWN ACK and start a T2-shutdown timer of its own,
+ * entering the SHUTDOWN-ACK-SENT state. If the timer expires, the
+ * endpoint must re-send the SHUTDOWN ACK.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shutdown_ack(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = (struct sctp_chunk *) arg;
+ struct sctp_chunk *reply;
+
+ /* There are 2 ways of getting here:
+ * 1) called in response to a SHUTDOWN chunk
+ * 2) called when SCTP_EVENT_NO_PENDING_TSN event is issued.
+ *
+ * For the case (2), the arg parameter is set to NULL. We need
+ * to check that we have a chunk before accessing it's fields.
+ */
+ if (chunk) {
+ if (!sctp_vtag_verify(chunk, asoc))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
+ /* Make sure that the SHUTDOWN chunk has a valid length. */
+ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk_t)))
+ return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
+ commands);
+ }
+
+ /* If it has no more outstanding DATA chunks, the SHUTDOWN receiver
+ * shall send a SHUTDOWN ACK ...
+ */
+ reply = sctp_make_shutdown_ack(asoc, chunk);
+ if (!reply)
+ goto nomem;
+
+ /* Set the transport for the SHUTDOWN ACK chunk and the timeout for
+ * the T2-shutdown timer.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+ /* and start/restart a T2-shutdown timer of its own, */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+ /* Enter the SHUTDOWN-ACK-SENT state. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_SHUTDOWN_ACK_SENT));
+
+ /* sctp-implguide 2.10 Issues with Heartbeating and failover
+ *
+ * HEARTBEAT ... is discontinued after sending either SHUTDOWN
+ * or SHUTDOWN-ACK.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Ignore the event defined as other
+ *
+ * The return value is the disposition of the event.
+ */
+sctp_disposition_t sctp_sf_ignore_other(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ pr_debug("%s: the event other type:%d is ignored\n",
+ __func__, type.other);
+
+ return SCTP_DISPOSITION_DISCARD;
+}
+
+/************************************************************
+ * These are the state functions for handling timeout events.
+ ************************************************************/
+
+/*
+ * RTX Timeout
+ *
+ * Section: 6.3.3 Handle T3-rtx Expiration
+ *
+ * Whenever the retransmission timer T3-rtx expires for a destination
+ * address, do the following:
+ * [See below]
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_transport *transport = arg;
+
+ SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS);
+
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
+ /*
+ * We are here likely because the receiver had its rwnd
+ * closed for a while and we have not been able to
+ * transmit the locally queued data within the maximum
+ * retransmission attempts limit. Start the T5
+ * shutdown guard timer to give the receiver one last
+ * chance and some additional time to recover before
+ * aborting.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START_ONCE,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+ } else {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+ }
+
+ /* E1) For the destination address for which the timer
+ * expires, adjust its ssthresh with rules defined in Section
+ * 7.2.3 and set the cwnd <- MTU.
+ */
+
+ /* E2) For the destination address for which the timer
+ * expires, set RTO <- RTO * 2 ("back off the timer"). The
+ * maximum value discussed in rule C7 above (RTO.max) may be
+ * used to provide an upper bound to this doubling operation.
+ */
+
+ /* E3) Determine how many of the earliest (i.e., lowest TSN)
+ * outstanding DATA chunks for the address for which the
+ * T3-rtx has expired will fit into a single packet, subject
+ * to the MTU constraint for the path corresponding to the
+ * destination transport address to which the retransmission
+ * is being sent (this may be different from the address for
+ * which the timer expires [see Section 6.4]). Call this
+ * value K. Bundle and retransmit those K DATA chunks in a
+ * single packet to the destination endpoint.
+ *
+ * Note: Any DATA chunks that were sent to the address for
+ * which the T3-rtx timer expired but did not fit in one MTU
+ * (rule E3 above), should be marked for retransmission and
+ * sent as soon as cwnd allows (normally when a SACK arrives).
+ */
+
+ /* Do some failure management (Section 8.2). */
+ sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
+
+ /* NB: Rules E4 and F1 are implicit in R1. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, SCTP_TRANSPORT(transport));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Generate delayed SACK on timeout
+ *
+ * Section: 6.2 Acknowledgement on Reception of DATA Chunks
+ *
+ * The guidelines on delayed acknowledgement algorithm specified in
+ * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an
+ * acknowledgement SHOULD be generated for at least every second packet
+ * (not every second DATA chunk) received, and SHOULD be generated
+ * within 200 ms of the arrival of any unacknowledged DATA chunk. In
+ * some situations it may be beneficial for an SCTP transmitter to be
+ * more conservative than the algorithms detailed in this document
+ * allow. However, an SCTP transmitter MUST NOT be more aggressive than
+ * the following algorithms allow.
+ */
+sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ SCTP_INC_STATS(net, SCTP_MIB_DELAY_SACK_EXPIREDS);
+ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_init_timer_expire
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * RFC 2960 Section 4 Notes
+ * 2) If the T1-init timer expires, the endpoint MUST retransmit INIT
+ * and re-start the T1-init timer without changing state. This MUST
+ * be repeated up to 'Max.Init.Retransmits' times. After that, the
+ * endpoint MUST abort the initialization process and report the
+ * error to SCTP user.
+ *
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *repl = NULL;
+ struct sctp_bind_addr *bp;
+ int attempts = asoc->init_err_counter + 1;
+
+ pr_debug("%s: timer T1 expired (INIT)\n", __func__);
+
+ SCTP_INC_STATS(net, SCTP_MIB_T1_INIT_EXPIREDS);
+
+ if (attempts <= asoc->max_init_attempts) {
+ bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+ repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
+ if (!repl)
+ return SCTP_DISPOSITION_NOMEM;
+
+ /* Choose transport for INIT. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+ SCTP_CHUNK(repl));
+
+ /* Issue a sideeffect to do the needed accounting. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ } else {
+ pr_debug("%s: giving up on INIT, attempts:%d "
+ "max_init_attempts:%d\n", __func__, attempts,
+ asoc->max_init_attempts);
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_cookie_timer_expire
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * RFC 2960 Section 4 Notes
+ * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
+ * COOKIE ECHO and re-start the T1-cookie timer without changing
+ * state. This MUST be repeated up to 'Max.Init.Retransmits' times.
+ * After that, the endpoint MUST abort the initialization process and
+ * report the error to SCTP user.
+ *
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *repl = NULL;
+ int attempts = asoc->init_err_counter + 1;
+
+ pr_debug("%s: timer T1 expired (COOKIE-ECHO)\n", __func__);
+
+ SCTP_INC_STATS(net, SCTP_MIB_T1_COOKIE_EXPIREDS);
+
+ if (attempts <= asoc->max_init_attempts) {
+ repl = sctp_make_cookie_echo(asoc, NULL);
+ if (!repl)
+ return SCTP_DISPOSITION_NOMEM;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+ SCTP_CHUNK(repl));
+ /* Issue a sideeffect to do the needed accounting. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ } else {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
+ * with the updated last sequential TSN received from its peer.
+ *
+ * An endpoint should limit the number of retransmissions of the
+ * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'.
+ * If this threshold is exceeded the endpoint should destroy the TCB and
+ * MUST report the peer endpoint unreachable to the upper layer (and
+ * thus the association enters the CLOSED state). The reception of any
+ * packet from its peer (i.e. as the peer sends all of its queued DATA
+ * chunks) should clear the endpoint's retransmission count and restart
+ * the T2-Shutdown timer, giving its peer ample opportunity to transmit
+ * all of its queued DATA chunks that have not yet been sent.
+ */
+sctp_disposition_t sctp_sf_t2_timer_expire(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *reply = NULL;
+
+ pr_debug("%s: timer T2 expired\n", __func__);
+
+ SCTP_INC_STATS(net, SCTP_MIB_T2_SHUTDOWN_EXPIREDS);
+
+ ((struct sctp_association *)asoc)->shutdown_retries++;
+
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ /* Note: CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ switch (asoc->state) {
+ case SCTP_STATE_SHUTDOWN_SENT:
+ reply = sctp_make_shutdown(asoc, NULL);
+ break;
+
+ case SCTP_STATE_SHUTDOWN_ACK_SENT:
+ reply = sctp_make_shutdown_ack(asoc, NULL);
+ break;
+
+ default:
+ BUG();
+ break;
+ }
+
+ if (!reply)
+ goto nomem;
+
+ /* Do some failure management (Section 8.2).
+ * If we remove the transport an SHUTDOWN was last sent to, don't
+ * do failure management.
+ */
+ if (asoc->shutdown_last_sent_to)
+ sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
+ SCTP_TRANSPORT(asoc->shutdown_last_sent_to));
+
+ /* Set the transport for the SHUTDOWN/ACK chunk and the timeout for
+ * the T2-shutdown timer.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+ /* Restart the T2-shutdown timer. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+ return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF CHunk Procedures
+ * If the T4 RTO timer expires the endpoint should do B1 to B5
+ */
+sctp_disposition_t sctp_sf_t4_timer_expire(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *chunk = asoc->addip_last_asconf;
+ struct sctp_transport *transport = chunk->transport;
+
+ SCTP_INC_STATS(net, SCTP_MIB_T4_RTO_EXPIREDS);
+
+ /* ADDIP 4.1 B1) Increment the error counters and perform path failure
+ * detection on the appropriate destination address as defined in
+ * RFC2960 [5] section 8.1 and 8.2.
+ */
+ if (transport)
+ sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
+ SCTP_TRANSPORT(transport));
+
+ /* Reconfig T4 timer and transport. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk));
+
+ /* ADDIP 4.1 B2) Increment the association error counters and perform
+ * endpoint failure detection on the association as defined in
+ * RFC2960 [5] section 8.1 and 8.2.
+ * association error counter is incremented in SCTP_CMD_STRIKE.
+ */
+ if (asoc->overall_error_count >= asoc->max_retrans) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_DISPOSITION_ABORT;
+ }
+
+ /* ADDIP 4.1 B3) Back-off the destination address RTO value to which
+ * the ASCONF chunk was sent by doubling the RTO timer value.
+ * This is done in SCTP_CMD_STRIKE.
+ */
+
+ /* ADDIP 4.1 B4) Re-transmit the ASCONF Chunk last sent and if possible
+ * choose an alternate destination address (please refer to RFC2960
+ * [5] section 6.4.1). An endpoint MUST NOT add new parameters to this
+ * chunk, it MUST be the same (including its serial number) as the last
+ * ASCONF sent.
+ */
+ sctp_chunk_hold(asoc->addip_last_asconf);
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(asoc->addip_last_asconf));
+
+ /* ADDIP 4.1 B5) Restart the T-4 RTO timer. Note that if a different
+ * destination is selected, then the RTO used will be that of the new
+ * destination address.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/* sctpimpguide-05 Section 2.12.2
+ * The sender of the SHUTDOWN MAY also start an overall guard timer
+ * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
+ * At the expiration of this timer the sender SHOULD abort the association
+ * by sending an ABORT chunk.
+ */
+sctp_disposition_t sctp_sf_t5_timer_expire(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *reply = NULL;
+
+ pr_debug("%s: timer T5 expired\n", __func__);
+
+ SCTP_INC_STATS(net, SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS);
+
+ reply = sctp_make_abort(asoc, NULL, 0);
+ if (!reply)
+ goto nomem;
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ETIMEDOUT));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_ERROR));
+
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+
+ return SCTP_DISPOSITION_DELETE_TCB;
+nomem:
+ return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Handle expiration of AUTOCLOSE timer. When the autoclose timer expires,
+ * the association is automatically closed by starting the shutdown process.
+ * The work that needs to be done is same as when SHUTDOWN is initiated by
+ * the user. So this routine looks same as sctp_sf_do_9_2_prm_shutdown().
+ */
+sctp_disposition_t sctp_sf_autoclose_timer_expire(
+ struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ int disposition;
+
+ SCTP_INC_STATS(net, SCTP_MIB_AUTOCLOSE_EXPIREDS);
+
+ /* From 9.2 Shutdown of an Association
+ * Upon receipt of the SHUTDOWN primitive from its upper
+ * layer, the endpoint enters SHUTDOWN-PENDING state and
+ * remains there until all outstanding data has been
+ * acknowledged by its peer. The endpoint accepts no new data
+ * from its upper layer, but retransmits data to the far end
+ * if necessary to fill gaps.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+ SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
+
+ disposition = SCTP_DISPOSITION_CONSUME;
+ if (sctp_outq_is_empty(&asoc->outqueue)) {
+ disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type,
+ arg, commands);
+ }
+ return disposition;
+}
+
+/*****************************************************************************
+ * These are sa state functions which could apply to all types of events.
+ ****************************************************************************/
+
+/*
+ * This table entry is not implemented.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_not_impl(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ return SCTP_DISPOSITION_NOT_IMPL;
+}
+
+/*
+ * This table entry represents a bug.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_bug(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ return SCTP_DISPOSITION_BUG;
+}
+
+/*
+ * This table entry represents the firing of a timer in the wrong state.
+ * Since timer deletion cannot be guaranteed a timer 'may' end up firing
+ * when the association is in the wrong state. This event should
+ * be ignored, so as to prevent any rearming of the timer.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_timer_ignore(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ pr_debug("%s: timer %d ignored\n", __func__, type.chunk);
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Pull the SACK chunk based on the SACK header. */
+static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk)
+{
+ struct sctp_sackhdr *sack;
+ unsigned int len;
+ __u16 num_blocks;
+ __u16 num_dup_tsns;
+
+ /* Protect ourselves from reading too far into
+ * the skb from a bogus sender.
+ */
+ sack = (struct sctp_sackhdr *) chunk->skb->data;
+
+ num_blocks = ntohs(sack->num_gap_ack_blocks);
+ num_dup_tsns = ntohs(sack->num_dup_tsns);
+ len = sizeof(struct sctp_sackhdr);
+ len += (num_blocks + num_dup_tsns) * sizeof(__u32);
+ if (len > chunk->skb->len)
+ return NULL;
+
+ skb_pull(chunk->skb, len);
+
+ return sack;
+}
+
+/* Create an ABORT packet to be sent as a response, with the specified
+ * error causes.
+ */
+static struct sctp_packet *sctp_abort_pkt_new(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ const void *payload,
+ size_t paylen)
+{
+ struct sctp_packet *packet;
+ struct sctp_chunk *abort;
+
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+
+ if (packet) {
+ /* Make an ABORT.
+ * The T bit will be set if the asoc is NULL.
+ */
+ abort = sctp_make_abort(asoc, chunk, paylen);
+ if (!abort) {
+ sctp_ootb_pkt_free(packet);
+ return NULL;
+ }
+
+ /* Reflect vtag if T-Bit is set */
+ if (sctp_test_T_bit(abort))
+ packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+ /* Add specified error causes, i.e., payload, to the
+ * end of the chunk.
+ */
+ sctp_addto_chunk(abort, paylen, payload);
+
+ /* Set the skb to the belonging sock for accounting. */
+ abort->skb->sk = ep->base.sk;
+
+ sctp_packet_append_chunk(packet, abort);
+
+ }
+
+ return packet;
+}
+
+/* Allocate a packet for responding in the OOTB conditions. */
+static struct sctp_packet *sctp_ootb_pkt_new(struct net *net,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk)
+{
+ struct sctp_packet *packet;
+ struct sctp_transport *transport;
+ __u16 sport;
+ __u16 dport;
+ __u32 vtag;
+
+ /* Get the source and destination port from the inbound packet. */
+ sport = ntohs(chunk->sctp_hdr->dest);
+ dport = ntohs(chunk->sctp_hdr->source);
+
+ /* The V-tag is going to be the same as the inbound packet if no
+ * association exists, otherwise, use the peer's vtag.
+ */
+ if (asoc) {
+ /* Special case the INIT-ACK as there is no peer's vtag
+ * yet.
+ */
+ switch (chunk->chunk_hdr->type) {
+ case SCTP_CID_INIT_ACK:
+ {
+ sctp_initack_chunk_t *initack;
+
+ initack = (sctp_initack_chunk_t *)chunk->chunk_hdr;
+ vtag = ntohl(initack->init_hdr.init_tag);
+ break;
+ }
+ default:
+ vtag = asoc->peer.i.init_tag;
+ break;
+ }
+ } else {
+ /* Special case the INIT and stale COOKIE_ECHO as there is no
+ * vtag yet.
+ */
+ switch (chunk->chunk_hdr->type) {
+ case SCTP_CID_INIT:
+ {
+ sctp_init_chunk_t *init;
+
+ init = (sctp_init_chunk_t *)chunk->chunk_hdr;
+ vtag = ntohl(init->init_hdr.init_tag);
+ break;
+ }
+ default:
+ vtag = ntohl(chunk->sctp_hdr->vtag);
+ break;
+ }
+ }
+
+ /* Make a transport for the bucket, Eliza... */
+ transport = sctp_transport_new(net, sctp_source(chunk), GFP_ATOMIC);
+ if (!transport)
+ goto nomem;
+
+ /* Cache a route for the transport with the chunk's destination as
+ * the source address.
+ */
+ sctp_transport_route(transport, (union sctp_addr *)&chunk->dest,
+ sctp_sk(net->sctp.ctl_sock));
+
+ packet = sctp_packet_init(&transport->packet, transport, sport, dport);
+ packet = sctp_packet_config(packet, vtag, 0);
+
+ return packet;
+
+nomem:
+ return NULL;
+}
+
+/* Free the packet allocated earlier for responding in the OOTB condition. */
+void sctp_ootb_pkt_free(struct sctp_packet *packet)
+{
+ sctp_transport_free(packet->transport);
+}
+
+/* Send a stale cookie error when a invalid COOKIE ECHO chunk is found */
+static void sctp_send_stale_cookie_err(struct net *net,
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands,
+ struct sctp_chunk *err_chunk)
+{
+ struct sctp_packet *packet;
+
+ if (err_chunk) {
+ packet = sctp_ootb_pkt_new(net, asoc, chunk);
+ if (packet) {
+ struct sctp_signed_cookie *cookie;
+
+ /* Override the OOTB vtag from the cookie. */
+ cookie = chunk->subh.cookie_hdr;
+ packet->vtag = cookie->c.peer_vtag;
+
+ /* Set the skb to the belonging sock for accounting. */
+ err_chunk->skb->sk = ep->base.sk;
+ sctp_packet_append_chunk(packet, err_chunk);
+ sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+ SCTP_PACKET(packet));
+ SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+ } else
+ sctp_chunk_free (err_chunk);
+ }
+}
+
+
+/* Process a data chunk */
+static int sctp_eat_data(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ sctp_cmd_seq_t *commands)
+{
+ sctp_datahdr_t *data_hdr;
+ struct sctp_chunk *err;
+ size_t datalen;
+ sctp_verb_t deliver;
+ int tmp;
+ __u32 tsn;
+ struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+ struct sock *sk = asoc->base.sk;
+ struct net *net = sock_net(sk);
+ u16 ssn;
+ u16 sid;
+ u8 ordered = 0;
+
+ data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
+ skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
+
+ tsn = ntohl(data_hdr->tsn);
+ pr_debug("%s: TSN 0x%x\n", __func__, tsn);
+
+ /* ASSERT: Now skb->data is really the user data. */
+
+ /* Process ECN based congestion.
+ *
+ * Since the chunk structure is reused for all chunks within
+ * a packet, we use ecn_ce_done to track if we've already
+ * done CE processing for this packet.
+ *
+ * We need to do ECN processing even if we plan to discard the
+ * chunk later.
+ */
+
+ if (!chunk->ecn_ce_done) {
+ struct sctp_af *af;
+ chunk->ecn_ce_done = 1;
+
+ af = sctp_get_af_specific(
+ ipver2af(ip_hdr(chunk->skb)->version));
+
+ if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) {
+ /* Do real work as sideffect. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
+ SCTP_U32(tsn));
+ }
+ }
+
+ tmp = sctp_tsnmap_check(&asoc->peer.tsn_map, tsn);
+ if (tmp < 0) {
+ /* The TSN is too high--silently discard the chunk and
+ * count on it getting retransmitted later.
+ */
+ if (chunk->asoc)
+ chunk->asoc->stats.outofseqtsns++;
+ return SCTP_IERROR_HIGH_TSN;
+ } else if (tmp > 0) {
+ /* This is a duplicate. Record it. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_DUP, SCTP_U32(tsn));
+ return SCTP_IERROR_DUP_TSN;
+ }
+
+ /* This is a new TSN. */
+
+ /* Discard if there is no room in the receive window.
+ * Actually, allow a little bit of overflow (up to a MTU).
+ */
+ datalen = ntohs(chunk->chunk_hdr->length);
+ datalen -= sizeof(sctp_data_chunk_t);
+
+ deliver = SCTP_CMD_CHUNK_ULP;
+
+ /* Think about partial delivery. */
+ if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) {
+
+ /* Even if we don't accept this chunk there is
+ * memory pressure.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_PART_DELIVER, SCTP_NULL());
+ }
+
+ /* Spill over rwnd a little bit. Note: While allowed, this spill over
+ * seems a bit troublesome in that frag_point varies based on
+ * PMTU. In cases, such as loopback, this might be a rather
+ * large spill over.
+ */
+ if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
+ (datalen > asoc->rwnd + asoc->frag_point))) {
+
+ /* If this is the next TSN, consider reneging to make
+ * room. Note: Playing nice with a confused sender. A
+ * malicious sender can still eat up all our buffer
+ * space and in the future we may want to detect and
+ * do more drastic reneging.
+ */
+ if (sctp_tsnmap_has_gap(map) &&
+ (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
+ pr_debug("%s: reneging for tsn:%u\n", __func__, tsn);
+ deliver = SCTP_CMD_RENEGE;
+ } else {
+ pr_debug("%s: discard tsn:%u len:%zu, rwnd:%d\n",
+ __func__, tsn, datalen, asoc->rwnd);
+
+ return SCTP_IERROR_IGNORE_TSN;
+ }
+ }
+
+ /*
+ * Also try to renege to limit our memory usage in the event that
+ * we are under memory pressure
+ * If we can't renege, don't worry about it, the sk_rmem_schedule
+ * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our
+ * memory usage too much
+ */
+ if (*sk->sk_prot_creator->memory_pressure) {
+ if (sctp_tsnmap_has_gap(map) &&
+ (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
+ pr_debug("%s: under pressure, reneging for tsn:%u\n",
+ __func__, tsn);
+ deliver = SCTP_CMD_RENEGE;
+ }
+ }
+
+ /*
+ * Section 3.3.10.9 No User Data (9)
+ *
+ * Cause of error
+ * ---------------
+ * No User Data: This error cause is returned to the originator of a
+ * DATA chunk if a received DATA chunk has no user data.
+ */
+ if (unlikely(0 == datalen)) {
+ err = sctp_make_abort_no_data(asoc, chunk, tsn);
+ if (err) {
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err));
+ }
+ /* We are going to ABORT, so we might as well stop
+ * processing the rest of the chunks in the packet.
+ */
+ sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+ sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+ SCTP_ERROR(ECONNABORTED));
+ sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+ SCTP_PERR(SCTP_ERROR_NO_DATA));
+ SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
+ SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
+ return SCTP_IERROR_NO_DATA;
+ }
+
+ chunk->data_accepted = 1;
+
+ /* Note: Some chunks may get overcounted (if we drop) or overcounted
+ * if we renege and the chunk arrives again.
+ */
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+ SCTP_INC_STATS(net, SCTP_MIB_INUNORDERCHUNKS);
+ if (chunk->asoc)
+ chunk->asoc->stats.iuodchunks++;
+ } else {
+ SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS);
+ if (chunk->asoc)
+ chunk->asoc->stats.iodchunks++;
+ ordered = 1;
+ }
+
+ /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
+ *
+ * If an endpoint receive a DATA chunk with an invalid stream
+ * identifier, it shall acknowledge the reception of the DATA chunk
+ * following the normal procedure, immediately send an ERROR chunk
+ * with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
+ * and discard the DATA chunk.
+ */
+ sid = ntohs(data_hdr->stream);
+ if (sid >= asoc->c.sinit_max_instreams) {
+ /* Mark tsn as received even though we drop it */
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
+
+ err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM,
+ &data_hdr->stream,
+ sizeof(data_hdr->stream),
+ sizeof(u16));
+ if (err)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+ SCTP_CHUNK(err));
+ return SCTP_IERROR_BAD_STREAM;
+ }
+
+ /* Check to see if the SSN is possible for this TSN.
+ * The biggest gap we can record is 4K wide. Since SSNs wrap
+ * at an unsigned short, there is no way that an SSN can
+ * wrap and for a valid TSN. We can simply check if the current
+ * SSN is smaller then the next expected one. If it is, it wrapped
+ * and is invalid.
+ */
+ ssn = ntohs(data_hdr->ssn);
+ if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
+ return SCTP_IERROR_PROTO_VIOLATION;
+ }
+
+ /* Send the data up to the user. Note: Schedule the
+ * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
+ * chunk needs the updated rwnd.
+ */
+ sctp_add_cmd_sf(commands, deliver, SCTP_CHUNK(chunk));
+
+ return SCTP_IERROR_NO_ERROR;
+}
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
new file mode 100644
index 000000000..a987d54b3
--- /dev/null
+++ b/net/sctp/sm_statetable.c
@@ -0,0 +1,933 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These are the state tables for the SCTP state machine.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static const sctp_sm_table_entry_t
+primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES];
+static const sctp_sm_table_entry_t
+other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES];
+static const sctp_sm_table_entry_t
+timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES];
+
+static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
+ sctp_cid_t cid,
+ sctp_state_t state);
+
+
+static const sctp_sm_table_entry_t bug = {
+ .fn = sctp_sf_bug,
+ .name = "sctp_sf_bug"
+};
+
+#define DO_LOOKUP(_max, _type, _table) \
+({ \
+ const sctp_sm_table_entry_t *rtn; \
+ \
+ if ((event_subtype._type > (_max))) { \
+ pr_warn("table %p possible attack: event %d exceeds max %d\n", \
+ _table, event_subtype._type, _max); \
+ rtn = &bug; \
+ } else \
+ rtn = &_table[event_subtype._type][(int)state]; \
+ \
+ rtn; \
+})
+
+const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net,
+ sctp_event_t event_type,
+ sctp_state_t state,
+ sctp_subtype_t event_subtype)
+{
+ switch (event_type) {
+ case SCTP_EVENT_T_CHUNK:
+ return sctp_chunk_event_lookup(net, event_subtype.chunk, state);
+ case SCTP_EVENT_T_TIMEOUT:
+ return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
+ timeout_event_table);
+ case SCTP_EVENT_T_OTHER:
+ return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
+ other_event_table);
+ case SCTP_EVENT_T_PRIMITIVE:
+ return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
+ primitive_event_table);
+ default:
+ /* Yikes! We got an illegal event type. */
+ return &bug;
+ }
+}
+
+#define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func}
+
+#define TYPE_SCTP_DATA { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_DATA */
+
+#define TYPE_SCTP_INIT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \
+} /* TYPE_SCTP_INIT */
+
+#define TYPE_SCTP_INIT_ACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_INIT_ACK */
+
+#define TYPE_SCTP_SACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_SACK */
+
+#define TYPE_SCTP_HEARTBEAT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ /* This should not happen, but we are nice. */ \
+ TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+} /* TYPE_SCTP_HEARTBEAT */
+
+#define TYPE_SCTP_HEARTBEAT_ACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_violation), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_HEARTBEAT_ACK */
+
+#define TYPE_SCTP_ABORT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_pdiscard), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \
+} /* TYPE_SCTP_ABORT */
+
+#define TYPE_SCTP_SHUTDOWN { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_SHUTDOWN */
+
+#define TYPE_SCTP_SHUTDOWN_ACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_violation), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_violation), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_violation), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
+} /* TYPE_SCTP_SHUTDOWN_ACK */
+
+#define TYPE_SCTP_ERROR { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ERROR */
+
+#define TYPE_SCTP_COOKIE_ECHO { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+} /* TYPE_SCTP_COOKIE_ECHO */
+
+#define TYPE_SCTP_COOKIE_ACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_COOKIE_ACK */
+
+#define TYPE_SCTP_ECN_ECNE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ECN_ECNE */
+
+#define TYPE_SCTP_ECN_CWR { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ECN_CWR */
+
+#define TYPE_SCTP_SHUTDOWN_COMPLETE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_4_C), \
+} /* TYPE_SCTP_SHUTDOWN_COMPLETE */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ *
+ * For base protocol (RFC 2960).
+ */
+static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_DATA,
+ TYPE_SCTP_INIT,
+ TYPE_SCTP_INIT_ACK,
+ TYPE_SCTP_SACK,
+ TYPE_SCTP_HEARTBEAT,
+ TYPE_SCTP_HEARTBEAT_ACK,
+ TYPE_SCTP_ABORT,
+ TYPE_SCTP_SHUTDOWN,
+ TYPE_SCTP_SHUTDOWN_ACK,
+ TYPE_SCTP_ERROR,
+ TYPE_SCTP_COOKIE_ECHO,
+ TYPE_SCTP_COOKIE_ACK,
+ TYPE_SCTP_ECN_ECNE,
+ TYPE_SCTP_ECN_CWR,
+ TYPE_SCTP_SHUTDOWN_COMPLETE,
+}; /* state_fn_t chunk_event_table[][] */
+
+#define TYPE_SCTP_ASCONF { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ASCONF */
+
+#define TYPE_SCTP_ASCONF_ACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ASCONF_ACK */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_ASCONF,
+ TYPE_SCTP_ASCONF_ACK,
+}; /*state_fn_t addip_chunk_event_table[][] */
+
+#define TYPE_SCTP_FWD_TSN { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_FWD_TSN */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_FWD_TSN,
+}; /*state_fn_t prsctp_chunk_event_table[][] */
+
+#define TYPE_SCTP_AUTH { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ootb), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+} /* TYPE_SCTP_AUTH */
+
+/* The primary index for this table is the chunk type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_AUTH,
+}; /*state_fn_t auth_chunk_event_table[][] */
+
+static const sctp_sm_table_entry_t
+chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
+ /* SCTP_STATE_CLOSED */
+ TYPE_SCTP_FUNC(sctp_sf_ootb),
+ /* SCTP_STATE_COOKIE_WAIT */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_COOKIE_ECHOED */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_ESTABLISHED */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_SHUTDOWN_PENDING */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_SHUTDOWN_SENT */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */
+ TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+}; /* chunk unknown */
+
+
+#define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+} /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */
+
+#define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+} /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */
+
+#define TYPE_SCTP_PRIMITIVE_ABORT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \
+} /* TYPE_SCTP_PRIMITIVE_ABORT */
+
+#define TYPE_SCTP_PRIMITIVE_SEND { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_SEND */
+
+#define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
+} /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */
+
+#define TYPE_SCTP_PRIMITIVE_ASCONF { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_ASCONF */
+
+/* The primary index for this table is the primitive type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_PRIMITIVE_ASSOCIATE,
+ TYPE_SCTP_PRIMITIVE_SHUTDOWN,
+ TYPE_SCTP_PRIMITIVE_ABORT,
+ TYPE_SCTP_PRIMITIVE_SEND,
+ TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
+ TYPE_SCTP_PRIMITIVE_ASCONF,
+};
+
+#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+}
+
+#define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+}
+
+static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_OTHER_NO_PENDING_TSN,
+ TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH,
+};
+
+#define TYPE_SCTP_EVENT_TIMEOUT_NONE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_bug), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_SACK { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+#define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \
+ /* SCTP_STATE_CLOSED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_ESTABLISHED */ \
+ TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \
+ /* SCTP_STATE_SHUTDOWN_PENDING */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+}
+
+static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
+ TYPE_SCTP_EVENT_TIMEOUT_NONE,
+ TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
+ TYPE_SCTP_EVENT_TIMEOUT_T1_INIT,
+ TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN,
+ TYPE_SCTP_EVENT_TIMEOUT_T3_RTX,
+ TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
+ TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
+ TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
+ TYPE_SCTP_EVENT_TIMEOUT_SACK,
+ TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
+};
+
+static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net,
+ sctp_cid_t cid,
+ sctp_state_t state)
+{
+ if (state > SCTP_STATE_MAX)
+ return &bug;
+
+ if (cid <= SCTP_CID_BASE_MAX)
+ return &chunk_event_table[cid][state];
+
+ if (net->sctp.prsctp_enable) {
+ if (cid == SCTP_CID_FWD_TSN)
+ return &prsctp_chunk_event_table[0][state];
+ }
+
+ if (net->sctp.addip_enable) {
+ if (cid == SCTP_CID_ASCONF)
+ return &addip_chunk_event_table[0][state];
+
+ if (cid == SCTP_CID_ASCONF_ACK)
+ return &addip_chunk_event_table[1][state];
+ }
+
+ if (net->sctp.auth_enable) {
+ if (cid == SCTP_CID_AUTH)
+ return &auth_chunk_event_table[0][state];
+ }
+
+ return &chunk_event_table_unknown[state];
+}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
new file mode 100644
index 000000000..5f6c4e613
--- /dev/null
+++ b/net/sctp/socket.c
@@ -0,0 +1,7419 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 Intel Corp.
+ * Copyright (c) 2001-2002 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions interface with the sockets layer to implement the
+ * SCTP Extensions for the Sockets API.
+ *
+ * Note that the descriptions from the specification are USER level
+ * functions--this file is the functions which populate the struct proto
+ * for SCTP which is the BOTTOM of the sockets interface.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Narasimha Budihal <narsi@refcode.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Daisy Chang <daisyc@us.ibm.com>
+ * Sridhar Samudrala <samudrala@us.ibm.com>
+ * Inaky Perez-Gonzalez <inaky.gonzalez@intel.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Ryan Layer <rmlayer@us.ibm.com>
+ * Anup Pemmaiah <pemmaiah@cc.usu.edu>
+ * Kevin Gao <kevin.gao@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/ip.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/crypto.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/busy_poll.h>
+
+#include <linux/socket.h> /* for sa_family_t */
+#include <linux/export.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helper functions. */
+static int sctp_writeable(struct sock *sk);
+static void sctp_wfree(struct sk_buff *skb);
+static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p,
+ size_t msg_len);
+static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
+static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
+static int sctp_wait_for_accept(struct sock *sk, long timeo);
+static void sctp_wait_for_close(struct sock *sk, long timeo);
+static void sctp_destruct_sock(struct sock *sk);
+static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
+ union sctp_addr *addr, int len);
+static int sctp_bindx_add(struct sock *, struct sockaddr *, int);
+static int sctp_bindx_rem(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf(struct sctp_association *asoc,
+ struct sctp_chunk *chunk);
+static int sctp_do_bind(struct sock *, union sctp_addr *, int);
+static int sctp_autobind(struct sock *sk);
+static void sctp_sock_migrate(struct sock *, struct sock *,
+ struct sctp_association *, sctp_socket_type_t);
+
+static int sctp_memory_pressure;
+static atomic_long_t sctp_memory_allocated;
+struct percpu_counter sctp_sockets_allocated;
+
+static void sctp_enter_memory_pressure(struct sock *sk)
+{
+ sctp_memory_pressure = 1;
+}
+
+
+/* Get the sndbuf space available at the time on the association. */
+static inline int sctp_wspace(struct sctp_association *asoc)
+{
+ int amt;
+
+ if (asoc->ep->sndbuf_policy)
+ amt = asoc->sndbuf_used;
+ else
+ amt = sk_wmem_alloc_get(asoc->base.sk);
+
+ if (amt >= asoc->base.sk->sk_sndbuf) {
+ if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ amt = 0;
+ else {
+ amt = sk_stream_wspace(asoc->base.sk);
+ if (amt < 0)
+ amt = 0;
+ }
+ } else {
+ amt = asoc->base.sk->sk_sndbuf - amt;
+ }
+ return amt;
+}
+
+/* Increment the used sndbuf space count of the corresponding association by
+ * the size of the outgoing data chunk.
+ * Also, set the skb destructor for sndbuf accounting later.
+ *
+ * Since it is always 1-1 between chunk and skb, and also a new skb is always
+ * allocated for chunk bundling in sctp_packet_transmit(), we can use the
+ * destructor in the data chunk skb for the purpose of the sndbuf space
+ * tracking.
+ */
+static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
+{
+ struct sctp_association *asoc = chunk->asoc;
+ struct sock *sk = asoc->base.sk;
+
+ /* The sndbuf space is tracked per association. */
+ sctp_association_hold(asoc);
+
+ skb_set_owner_w(chunk->skb, sk);
+
+ chunk->skb->destructor = sctp_wfree;
+ /* Save the chunk pointer in skb for sctp_wfree to use later. */
+ skb_shinfo(chunk->skb)->destructor_arg = chunk;
+
+ asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
+ sizeof(struct sk_buff) +
+ sizeof(struct sctp_chunk);
+
+ atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
+ sk->sk_wmem_queued += chunk->skb->truesize;
+ sk_mem_charge(sk, chunk->skb->truesize);
+}
+
+/* Verify that this is a valid address. */
+static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
+ int len)
+{
+ struct sctp_af *af;
+
+ /* Verify basic sockaddr. */
+ af = sctp_sockaddr_af(sctp_sk(sk), addr, len);
+ if (!af)
+ return -EINVAL;
+
+ /* Is this a valid SCTP address? */
+ if (!af->addr_valid(addr, sctp_sk(sk), NULL))
+ return -EINVAL;
+
+ if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Look up the association by its id. If this is not a UDP-style
+ * socket, the ID field is always ignored.
+ */
+struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
+{
+ struct sctp_association *asoc = NULL;
+
+ /* If this is not a UDP-style socket, assoc id should be ignored. */
+ if (!sctp_style(sk, UDP)) {
+ /* Return NULL if the socket state is not ESTABLISHED. It
+ * could be a TCP-style listening socket or a socket which
+ * hasn't yet called connect() to establish an association.
+ */
+ if (!sctp_sstate(sk, ESTABLISHED))
+ return NULL;
+
+ /* Get the first and the only association from the list. */
+ if (!list_empty(&sctp_sk(sk)->ep->asocs))
+ asoc = list_entry(sctp_sk(sk)->ep->asocs.next,
+ struct sctp_association, asocs);
+ return asoc;
+ }
+
+ /* Otherwise this is a UDP-style socket. */
+ if (!id || (id == (sctp_assoc_t)-1))
+ return NULL;
+
+ spin_lock_bh(&sctp_assocs_id_lock);
+ asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id);
+ spin_unlock_bh(&sctp_assocs_id_lock);
+
+ if (!asoc || (asoc->base.sk != sk) || asoc->base.dead)
+ return NULL;
+
+ return asoc;
+}
+
+/* Look up the transport from an address and an assoc id. If both address and
+ * id are specified, the associations matching the address and the id should be
+ * the same.
+ */
+static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
+ struct sockaddr_storage *addr,
+ sctp_assoc_t id)
+{
+ struct sctp_association *addr_asoc = NULL, *id_asoc = NULL;
+ struct sctp_transport *transport;
+ union sctp_addr *laddr = (union sctp_addr *)addr;
+
+ addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
+ laddr,
+ &transport);
+
+ if (!addr_asoc)
+ return NULL;
+
+ id_asoc = sctp_id2assoc(sk, id);
+ if (id_asoc && (id_asoc != addr_asoc))
+ return NULL;
+
+ sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
+ (union sctp_addr *)addr);
+
+ return transport;
+}
+
+/* API 3.1.2 bind() - UDP Style Syntax
+ * The syntax of bind() is,
+ *
+ * ret = bind(int sd, struct sockaddr *addr, int addrlen);
+ *
+ * sd - the socket descriptor returned by socket().
+ * addr - the address structure (struct sockaddr_in or struct
+ * sockaddr_in6 [RFC 2553]),
+ * addr_len - the size of the address structure.
+ */
+static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+ int retval = 0;
+
+ lock_sock(sk);
+
+ pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk,
+ addr, addr_len);
+
+ /* Disallow binding twice. */
+ if (!sctp_sk(sk)->ep->base.bind_addr.port)
+ retval = sctp_do_bind(sk, (union sctp_addr *)addr,
+ addr_len);
+ else
+ retval = -EINVAL;
+
+ release_sock(sk);
+
+ return retval;
+}
+
+static long sctp_get_port_local(struct sock *, union sctp_addr *);
+
+/* Verify this is a valid sockaddr. */
+static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
+ union sctp_addr *addr, int len)
+{
+ struct sctp_af *af;
+
+ /* Check minimum size. */
+ if (len < sizeof (struct sockaddr))
+ return NULL;
+
+ /* V4 mapped address are really of AF_INET family */
+ if (addr->sa.sa_family == AF_INET6 &&
+ ipv6_addr_v4mapped(&addr->v6.sin6_addr)) {
+ if (!opt->pf->af_supported(AF_INET, opt))
+ return NULL;
+ } else {
+ /* Does this PF support this AF? */
+ if (!opt->pf->af_supported(addr->sa.sa_family, opt))
+ return NULL;
+ }
+
+ /* If we get this far, af is valid. */
+ af = sctp_get_af_specific(addr->sa.sa_family);
+
+ if (len < af->sockaddr_len)
+ return NULL;
+
+ return af;
+}
+
+/* Bind a local address either to an endpoint or to an association. */
+static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_endpoint *ep = sp->ep;
+ struct sctp_bind_addr *bp = &ep->base.bind_addr;
+ struct sctp_af *af;
+ unsigned short snum;
+ int ret = 0;
+
+ /* Common sockaddr verification. */
+ af = sctp_sockaddr_af(sp, addr, len);
+ if (!af) {
+ pr_debug("%s: sk:%p, newaddr:%p, len:%d EINVAL\n",
+ __func__, sk, addr, len);
+ return -EINVAL;
+ }
+
+ snum = ntohs(addr->v4.sin_port);
+
+ pr_debug("%s: sk:%p, new addr:%pISc, port:%d, new port:%d, len:%d\n",
+ __func__, sk, &addr->sa, bp->port, snum, len);
+
+ /* PF specific bind() address verification. */
+ if (!sp->pf->bind_verify(sp, addr))
+ return -EADDRNOTAVAIL;
+
+ /* We must either be unbound, or bind to the same port.
+ * It's OK to allow 0 ports if we are already bound.
+ * We'll just inhert an already bound port in this case
+ */
+ if (bp->port) {
+ if (!snum)
+ snum = bp->port;
+ else if (snum != bp->port) {
+ pr_debug("%s: new port %d doesn't match existing port "
+ "%d\n", __func__, snum, bp->port);
+ return -EINVAL;
+ }
+ }
+
+ if (snum && snum < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ /* See if the address matches any of the addresses we may have
+ * already bound before checking against other endpoints.
+ */
+ if (sctp_bind_addr_match(bp, addr, sp))
+ return -EINVAL;
+
+ /* Make sure we are allowed to bind here.
+ * The function sctp_get_port_local() does duplicate address
+ * detection.
+ */
+ addr->v4.sin_port = htons(snum);
+ if ((ret = sctp_get_port_local(sk, addr))) {
+ return -EADDRINUSE;
+ }
+
+ /* Refresh ephemeral port. */
+ if (!bp->port)
+ bp->port = inet_sk(sk)->inet_num;
+
+ /* Add the address to the bind address list.
+ * Use GFP_ATOMIC since BHs will be disabled.
+ */
+ ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC);
+
+ /* Copy back into socket for getsockname() use. */
+ if (!ret) {
+ inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
+ sp->pf->to_sk_saddr(addr, sk);
+ }
+
+ return ret;
+}
+
+ /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
+ *
+ * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged
+ * at any one time. If a sender, after sending an ASCONF chunk, decides
+ * it needs to transfer another ASCONF Chunk, it MUST wait until the
+ * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a
+ * subsequent ASCONF. Note this restriction binds each side, so at any
+ * time two ASCONF may be in-transit on any given association (one sent
+ * from each endpoint).
+ */
+static int sctp_send_asconf(struct sctp_association *asoc,
+ struct sctp_chunk *chunk)
+{
+ struct net *net = sock_net(asoc->base.sk);
+ int retval = 0;
+
+ /* If there is an outstanding ASCONF chunk, queue it for later
+ * transmission.
+ */
+ if (asoc->addip_last_asconf) {
+ list_add_tail(&chunk->list, &asoc->addip_chunk_list);
+ goto out;
+ }
+
+ /* Hold the chunk until an ASCONF_ACK is received. */
+ sctp_chunk_hold(chunk);
+ retval = sctp_primitive_ASCONF(net, asoc, chunk);
+ if (retval)
+ sctp_chunk_free(chunk);
+ else
+ asoc->addip_last_asconf = chunk;
+
+out:
+ return retval;
+}
+
+/* Add a list of addresses as bind addresses to local endpoint or
+ * association.
+ *
+ * Basically run through each address specified in the addrs/addrcnt
+ * array/length pair, determine if it is IPv6 or IPv4 and call
+ * sctp_do_bind() on it.
+ *
+ * If any of them fails, then the operation will be reversed and the
+ * ones that were added will be removed.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt)
+{
+ int cnt;
+ int retval = 0;
+ void *addr_buf;
+ struct sockaddr *sa_addr;
+ struct sctp_af *af;
+
+ pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk,
+ addrs, addrcnt);
+
+ addr_buf = addrs;
+ for (cnt = 0; cnt < addrcnt; cnt++) {
+ /* The list may contain either IPv4 or IPv6 address;
+ * determine the address length for walking thru the list.
+ */
+ sa_addr = addr_buf;
+ af = sctp_get_af_specific(sa_addr->sa_family);
+ if (!af) {
+ retval = -EINVAL;
+ goto err_bindx_add;
+ }
+
+ retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr,
+ af->sockaddr_len);
+
+ addr_buf += af->sockaddr_len;
+
+err_bindx_add:
+ if (retval < 0) {
+ /* Failed. Cleanup the ones that have been added */
+ if (cnt > 0)
+ sctp_bindx_rem(sk, addrs, cnt);
+ return retval;
+ }
+ }
+
+ return retval;
+}
+
+/* Send an ASCONF chunk with Add IP address parameters to all the peers of the
+ * associations that are part of the endpoint indicating that a list of local
+ * addresses are added to the endpoint.
+ *
+ * If any of the addresses is already in the bind address list of the
+ * association, we do not send the chunk for that association. But it will not
+ * affect other associations.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_send_asconf_add_ip(struct sock *sk,
+ struct sockaddr *addrs,
+ int addrcnt)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc;
+ struct sctp_bind_addr *bp;
+ struct sctp_chunk *chunk;
+ struct sctp_sockaddr_entry *laddr;
+ union sctp_addr *addr;
+ union sctp_addr saveaddr;
+ void *addr_buf;
+ struct sctp_af *af;
+ struct list_head *p;
+ int i;
+ int retval = 0;
+
+ if (!net->sctp.addip_enable)
+ return retval;
+
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
+ __func__, sk, addrs, addrcnt);
+
+ list_for_each_entry(asoc, &ep->asocs, asocs) {
+ if (!asoc->peer.asconf_capable)
+ continue;
+
+ if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP)
+ continue;
+
+ if (!sctp_state(asoc, ESTABLISHED))
+ continue;
+
+ /* Check if any address in the packed array of addresses is
+ * in the bind address list of the association. If so,
+ * do not send the asconf chunk to its peer, but continue with
+ * other associations.
+ */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ addr = addr_buf;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ if (!af) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (sctp_assoc_lookup_laddr(asoc, addr))
+ break;
+
+ addr_buf += af->sockaddr_len;
+ }
+ if (i < addrcnt)
+ continue;
+
+ /* Use the first valid address in bind addr list of
+ * association as Address Parameter of ASCONF CHUNK.
+ */
+ bp = &asoc->base.bind_addr;
+ p = bp->address_list.next;
+ laddr = list_entry(p, struct sctp_sockaddr_entry, list);
+ chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs,
+ addrcnt, SCTP_PARAM_ADD_IP);
+ if (!chunk) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ /* Add the new addresses to the bind address list with
+ * use_as_src set to 0.
+ */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ addr = addr_buf;
+ af = sctp_get_af_specific(addr->v4.sin_family);
+ memcpy(&saveaddr, addr, af->sockaddr_len);
+ retval = sctp_add_bind_addr(bp, &saveaddr,
+ SCTP_ADDR_NEW, GFP_ATOMIC);
+ addr_buf += af->sockaddr_len;
+ }
+ if (asoc->src_out_of_asoc_ok) {
+ struct sctp_transport *trans;
+
+ list_for_each_entry(trans,
+ &asoc->peer.transport_addr_list, transports) {
+ /* Clear the source and route cache */
+ dst_release(trans->dst);
+ trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
+ 2*asoc->pathmtu, 4380));
+ trans->ssthresh = asoc->peer.i.a_rwnd;
+ trans->rto = asoc->rto_initial;
+ sctp_max_rto(asoc, trans);
+ trans->rtt = trans->srtt = trans->rttvar = 0;
+ sctp_transport_route(trans, NULL,
+ sctp_sk(asoc->base.sk));
+ }
+ }
+ retval = sctp_send_asconf(asoc, chunk);
+ }
+
+out:
+ return retval;
+}
+
+/* Remove a list of addresses from bind addresses list. Do not remove the
+ * last address.
+ *
+ * Basically run through each address specified in the addrs/addrcnt
+ * array/length pair, determine if it is IPv6 or IPv4 and call
+ * sctp_del_bind() on it.
+ *
+ * If any of them fails, then the operation will be reversed and the
+ * ones that were removed will be added back.
+ *
+ * At least one address has to be left; if only one address is
+ * available, the operation will return -EBUSY.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_endpoint *ep = sp->ep;
+ int cnt;
+ struct sctp_bind_addr *bp = &ep->base.bind_addr;
+ int retval = 0;
+ void *addr_buf;
+ union sctp_addr *sa_addr;
+ struct sctp_af *af;
+
+ pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
+ __func__, sk, addrs, addrcnt);
+
+ addr_buf = addrs;
+ for (cnt = 0; cnt < addrcnt; cnt++) {
+ /* If the bind address list is empty or if there is only one
+ * bind address, there is nothing more to be removed (we need
+ * at least one address here).
+ */
+ if (list_empty(&bp->address_list) ||
+ (sctp_list_single_entry(&bp->address_list))) {
+ retval = -EBUSY;
+ goto err_bindx_rem;
+ }
+
+ sa_addr = addr_buf;
+ af = sctp_get_af_specific(sa_addr->sa.sa_family);
+ if (!af) {
+ retval = -EINVAL;
+ goto err_bindx_rem;
+ }
+
+ if (!af->addr_valid(sa_addr, sp, NULL)) {
+ retval = -EADDRNOTAVAIL;
+ goto err_bindx_rem;
+ }
+
+ if (sa_addr->v4.sin_port &&
+ sa_addr->v4.sin_port != htons(bp->port)) {
+ retval = -EINVAL;
+ goto err_bindx_rem;
+ }
+
+ if (!sa_addr->v4.sin_port)
+ sa_addr->v4.sin_port = htons(bp->port);
+
+ /* FIXME - There is probably a need to check if sk->sk_saddr and
+ * sk->sk_rcv_addr are currently set to one of the addresses to
+ * be removed. This is something which needs to be looked into
+ * when we are fixing the outstanding issues with multi-homing
+ * socket routing and failover schemes. Refer to comments in
+ * sctp_do_bind(). -daisy
+ */
+ retval = sctp_del_bind_addr(bp, sa_addr);
+
+ addr_buf += af->sockaddr_len;
+err_bindx_rem:
+ if (retval < 0) {
+ /* Failed. Add the ones that has been removed back */
+ if (cnt > 0)
+ sctp_bindx_add(sk, addrs, cnt);
+ return retval;
+ }
+ }
+
+ return retval;
+}
+
+/* Send an ASCONF chunk with Delete IP address parameters to all the peers of
+ * the associations that are part of the endpoint indicating that a list of
+ * local addresses are removed from the endpoint.
+ *
+ * If any of the addresses is already in the bind address list of the
+ * association, we do not send the chunk for that association. But it will not
+ * affect other associations.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_send_asconf_del_ip(struct sock *sk,
+ struct sockaddr *addrs,
+ int addrcnt)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc;
+ struct sctp_transport *transport;
+ struct sctp_bind_addr *bp;
+ struct sctp_chunk *chunk;
+ union sctp_addr *laddr;
+ void *addr_buf;
+ struct sctp_af *af;
+ struct sctp_sockaddr_entry *saddr;
+ int i;
+ int retval = 0;
+ int stored = 0;
+
+ chunk = NULL;
+ if (!net->sctp.addip_enable)
+ return retval;
+
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
+ __func__, sk, addrs, addrcnt);
+
+ list_for_each_entry(asoc, &ep->asocs, asocs) {
+
+ if (!asoc->peer.asconf_capable)
+ continue;
+
+ if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP)
+ continue;
+
+ if (!sctp_state(asoc, ESTABLISHED))
+ continue;
+
+ /* Check if any address in the packed array of addresses is
+ * not present in the bind address list of the association.
+ * If so, do not send the asconf chunk to its peer, but
+ * continue with other associations.
+ */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ laddr = addr_buf;
+ af = sctp_get_af_specific(laddr->v4.sin_family);
+ if (!af) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (!sctp_assoc_lookup_laddr(asoc, laddr))
+ break;
+
+ addr_buf += af->sockaddr_len;
+ }
+ if (i < addrcnt)
+ continue;
+
+ /* Find one address in the association's bind address list
+ * that is not in the packed array of addresses. This is to
+ * make sure that we do not delete all the addresses in the
+ * association.
+ */
+ bp = &asoc->base.bind_addr;
+ laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
+ addrcnt, sp);
+ if ((laddr == NULL) && (addrcnt == 1)) {
+ if (asoc->asconf_addr_del_pending)
+ continue;
+ asoc->asconf_addr_del_pending =
+ kzalloc(sizeof(union sctp_addr), GFP_ATOMIC);
+ if (asoc->asconf_addr_del_pending == NULL) {
+ retval = -ENOMEM;
+ goto out;
+ }
+ asoc->asconf_addr_del_pending->sa.sa_family =
+ addrs->sa_family;
+ asoc->asconf_addr_del_pending->v4.sin_port =
+ htons(bp->port);
+ if (addrs->sa_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)addrs;
+ asoc->asconf_addr_del_pending->v4.sin_addr.s_addr = sin->sin_addr.s_addr;
+ } else if (addrs->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)addrs;
+ asoc->asconf_addr_del_pending->v6.sin6_addr = sin6->sin6_addr;
+ }
+
+ pr_debug("%s: keep the last address asoc:%p %pISc at %p\n",
+ __func__, asoc, &asoc->asconf_addr_del_pending->sa,
+ asoc->asconf_addr_del_pending);
+
+ asoc->src_out_of_asoc_ok = 1;
+ stored = 1;
+ goto skip_mkasconf;
+ }
+
+ if (laddr == NULL)
+ return -EINVAL;
+
+ /* We do not need RCU protection throughout this loop
+ * because this is done under a socket lock from the
+ * setsockopt call.
+ */
+ chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt,
+ SCTP_PARAM_DEL_IP);
+ if (!chunk) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+skip_mkasconf:
+ /* Reset use_as_src flag for the addresses in the bind address
+ * list that are to be deleted.
+ */
+ addr_buf = addrs;
+ for (i = 0; i < addrcnt; i++) {
+ laddr = addr_buf;
+ af = sctp_get_af_specific(laddr->v4.sin_family);
+ list_for_each_entry(saddr, &bp->address_list, list) {
+ if (sctp_cmp_addr_exact(&saddr->a, laddr))
+ saddr->state = SCTP_ADDR_DEL;
+ }
+ addr_buf += af->sockaddr_len;
+ }
+
+ /* Update the route and saddr entries for all the transports
+ * as some of the addresses in the bind address list are
+ * about to be deleted and cannot be used as source addresses.
+ */
+ list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+ transports) {
+ dst_release(transport->dst);
+ sctp_transport_route(transport, NULL,
+ sctp_sk(asoc->base.sk));
+ }
+
+ if (stored)
+ /* We don't need to transmit ASCONF */
+ continue;
+ retval = sctp_send_asconf(asoc, chunk);
+ }
+out:
+ return retval;
+}
+
+/* set addr events to assocs in the endpoint. ep and addr_wq must be locked */
+int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
+{
+ struct sock *sk = sctp_opt2sk(sp);
+ union sctp_addr *addr;
+ struct sctp_af *af;
+
+ /* It is safe to write port space in caller. */
+ addr = &addrw->a;
+ addr->v4.sin_port = htons(sp->ep->base.bind_addr.port);
+ af = sctp_get_af_specific(addr->sa.sa_family);
+ if (!af)
+ return -EINVAL;
+ if (sctp_verify_addr(sk, addr, af->sockaddr_len))
+ return -EINVAL;
+
+ if (addrw->state == SCTP_ADDR_NEW)
+ return sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
+ else
+ return sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
+}
+
+/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
+ *
+ * API 8.1
+ * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt,
+ * int flags);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distinguish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns
+ * -1, and sets errno to the appropriate error code.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_bindx() will fail, setting errno to EINVAL.
+ *
+ * The flags parameter is formed from the bitwise OR of zero or more of
+ * the following currently defined flags:
+ *
+ * SCTP_BINDX_ADD_ADDR
+ *
+ * SCTP_BINDX_REM_ADDR
+ *
+ * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the
+ * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given
+ * addresses from the association. The two flags are mutually exclusive;
+ * if both are given, sctp_bindx() will fail with EINVAL. A caller may
+ * not remove all addresses from an association; sctp_bindx() will
+ * reject such an attempt with EINVAL.
+ *
+ * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate
+ * additional addresses with an endpoint after calling bind(). Or use
+ * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening
+ * socket is associated with so that no new association accepted will be
+ * associated with those addresses. If the endpoint supports dynamic
+ * address a SCTP_BINDX_REM_ADDR or SCTP_BINDX_ADD_ADDR may cause a
+ * endpoint to send the appropriate message to the peer to change the
+ * peers address lists.
+ *
+ * Adding and removing addresses from a connected association is
+ * optional functionality. Implementations that do not support this
+ * functionality should return EOPNOTSUPP.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
+ * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
+ * from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk The sk of the socket
+ * addrs The pointer to the addresses in user land
+ * addrssize Size of the addrs buffer
+ * op Operation to perform (add or remove, see the flags of
+ * sctp_bindx)
+ *
+ * Returns 0 if ok, <0 errno code on error.
+ */
+static int sctp_setsockopt_bindx(struct sock *sk,
+ struct sockaddr __user *addrs,
+ int addrs_size, int op)
+{
+ struct sockaddr *kaddrs;
+ int err;
+ int addrcnt = 0;
+ int walk_size = 0;
+ struct sockaddr *sa_addr;
+ void *addr_buf;
+ struct sctp_af *af;
+
+ pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n",
+ __func__, sk, addrs, addrs_size, op);
+
+ if (unlikely(addrs_size <= 0))
+ return -EINVAL;
+
+ /* Check the user passed a healthy pointer. */
+ if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+ return -EFAULT;
+
+ /* Alloc space for the address array in kernel memory. */
+ kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ if (unlikely(!kaddrs))
+ return -ENOMEM;
+
+ if (__copy_from_user(kaddrs, addrs, addrs_size)) {
+ kfree(kaddrs);
+ return -EFAULT;
+ }
+
+ /* Walk through the addrs buffer and count the number of addresses. */
+ addr_buf = kaddrs;
+ while (walk_size < addrs_size) {
+ if (walk_size + sizeof(sa_family_t) > addrs_size) {
+ kfree(kaddrs);
+ return -EINVAL;
+ }
+
+ sa_addr = addr_buf;
+ af = sctp_get_af_specific(sa_addr->sa_family);
+
+ /* If the address family is not supported or if this address
+ * causes the address buffer to overflow return EINVAL.
+ */
+ if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
+ kfree(kaddrs);
+ return -EINVAL;
+ }
+ addrcnt++;
+ addr_buf += af->sockaddr_len;
+ walk_size += af->sockaddr_len;
+ }
+
+ /* Do the work. */
+ switch (op) {
+ case SCTP_BINDX_ADD_ADDR:
+ err = sctp_bindx_add(sk, kaddrs, addrcnt);
+ if (err)
+ goto out;
+ err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt);
+ break;
+
+ case SCTP_BINDX_REM_ADDR:
+ err = sctp_bindx_rem(sk, kaddrs, addrcnt);
+ if (err)
+ goto out;
+ err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt);
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+out:
+ kfree(kaddrs);
+
+ return err;
+}
+
+/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
+ *
+ * Common routine for handling connect() and sctp_connectx().
+ * Connect will come in with just a single address.
+ */
+static int __sctp_connect(struct sock *sk,
+ struct sockaddr *kaddrs,
+ int addrs_size,
+ sctp_assoc_t *assoc_id)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc = NULL;
+ struct sctp_association *asoc2;
+ struct sctp_transport *transport;
+ union sctp_addr to;
+ sctp_scope_t scope;
+ long timeo;
+ int err = 0;
+ int addrcnt = 0;
+ int walk_size = 0;
+ union sctp_addr *sa_addr = NULL;
+ void *addr_buf;
+ unsigned short port;
+ unsigned int f_flags = 0;
+
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ /* connect() cannot be done on a socket that is already in ESTABLISHED
+ * state - UDP-style peeled off socket or a TCP-style socket that
+ * is already connected.
+ * It cannot be done even on a TCP-style listening socket.
+ */
+ if (sctp_sstate(sk, ESTABLISHED) ||
+ (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
+ err = -EISCONN;
+ goto out_free;
+ }
+
+ /* Walk through the addrs buffer and count the number of addresses. */
+ addr_buf = kaddrs;
+ while (walk_size < addrs_size) {
+ struct sctp_af *af;
+
+ if (walk_size + sizeof(sa_family_t) > addrs_size) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ sa_addr = addr_buf;
+ af = sctp_get_af_specific(sa_addr->sa.sa_family);
+
+ /* If the address family is not supported or if this address
+ * causes the address buffer to overflow return EINVAL.
+ */
+ if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ port = ntohs(sa_addr->v4.sin_port);
+
+ /* Save current address so we can work with it */
+ memcpy(&to, sa_addr, af->sockaddr_len);
+
+ err = sctp_verify_addr(sk, &to, af->sockaddr_len);
+ if (err)
+ goto out_free;
+
+ /* Make sure the destination port is correctly set
+ * in all addresses.
+ */
+ if (asoc && asoc->peer.port && asoc->peer.port != port) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ /* Check if there already is a matching association on the
+ * endpoint (other than the one created here).
+ */
+ asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+ if (asoc2 && asoc2 != asoc) {
+ if (asoc2->state >= SCTP_STATE_ESTABLISHED)
+ err = -EISCONN;
+ else
+ err = -EALREADY;
+ goto out_free;
+ }
+
+ /* If we could not find a matching association on the endpoint,
+ * make sure that there is no peeled-off association matching
+ * the peer address even on another socket.
+ */
+ if (sctp_endpoint_is_peeled_off(ep, &to)) {
+ err = -EADDRNOTAVAIL;
+ goto out_free;
+ }
+
+ if (!asoc) {
+ /* If a bind() or sctp_bindx() is not called prior to
+ * an sctp_connectx() call, the system picks an
+ * ephemeral port and will choose an address set
+ * equivalent to binding with a wildcard address.
+ */
+ if (!ep->base.bind_addr.port) {
+ if (sctp_autobind(sk)) {
+ err = -EAGAIN;
+ goto out_free;
+ }
+ } else {
+ /*
+ * If an unprivileged user inherits a 1-many
+ * style socket with open associations on a
+ * privileged port, it MAY be permitted to
+ * accept new associations, but it SHOULD NOT
+ * be permitted to open new associations.
+ */
+ if (ep->base.bind_addr.port < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
+ err = -EACCES;
+ goto out_free;
+ }
+ }
+
+ scope = sctp_scope(&to);
+ asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+ if (!asoc) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ err = sctp_assoc_set_bind_addr_from_ep(asoc, scope,
+ GFP_KERNEL);
+ if (err < 0) {
+ goto out_free;
+ }
+
+ }
+
+ /* Prime the peer's transport structures. */
+ transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
+ SCTP_UNKNOWN);
+ if (!transport) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ addrcnt++;
+ addr_buf += af->sockaddr_len;
+ walk_size += af->sockaddr_len;
+ }
+
+ /* In case the user of sctp_connectx() wants an association
+ * id back, assign one now.
+ */
+ if (assoc_id) {
+ err = sctp_assoc_set_id(asoc, GFP_KERNEL);
+ if (err < 0)
+ goto out_free;
+ }
+
+ err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
+ if (err < 0) {
+ goto out_free;
+ }
+
+ /* Initialize sk's dport and daddr for getpeername() */
+ inet_sk(sk)->inet_dport = htons(asoc->peer.port);
+ sp->pf->to_sk_daddr(sa_addr, sk);
+ sk->sk_err = 0;
+
+ /* in-kernel sockets don't generally have a file allocated to them
+ * if all they do is call sock_create_kern().
+ */
+ if (sk->sk_socket->file)
+ f_flags = sk->sk_socket->file->f_flags;
+
+ timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
+
+ err = sctp_wait_for_connect(asoc, &timeo);
+ if ((err == 0 || err == -EINPROGRESS) && assoc_id)
+ *assoc_id = asoc->assoc_id;
+
+ /* Don't free association on exit. */
+ asoc = NULL;
+
+out_free:
+ pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n",
+ __func__, asoc, kaddrs, err);
+
+ if (asoc) {
+ /* sctp_primitive_ASSOCIATE may have added this association
+ * To the hash table, try to unhash it, just in case, its a noop
+ * if it wasn't hashed so we're safe
+ */
+ sctp_unhash_established(asoc);
+ sctp_association_free(asoc);
+ }
+ return err;
+}
+
+/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
+ *
+ * API 8.9
+ * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt,
+ * sctp_assoc_t *asoc);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distengish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_connectx() returns 0. It also sets the assoc_id to
+ * the association id of the new association. On failure, sctp_connectx()
+ * returns -1, and sets errno to the appropriate error code. The assoc_id
+ * is not touched by the kernel.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_connectx() will fail, setting errno to EINVAL.
+ *
+ * An application can use sctp_connectx to initiate an association with
+ * an endpoint that is multi-homed. Much like sctp_bindx() this call
+ * allows a caller to specify multiple addresses at which a peer can be
+ * reached. The way the SCTP stack uses the list of addresses to set up
+ * the association is implementation dependent. This function only
+ * specifies that the stack will try to make use of all the addresses in
+ * the list when needed.
+ *
+ * Note that the list of addresses passed in is only used for setting up
+ * the association. It does not necessarily equal the set of addresses
+ * the peer uses for the resulting association. If the caller wants to
+ * find out the set of peer addresses, it must use sctp_getpaddrs() to
+ * retrieve them after the association has been set up.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking either sctp_connectx(). This is used for tunneling
+ * the sctp_connectx() request through sctp_setsockopt() from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk The sk of the socket
+ * addrs The pointer to the addresses in user land
+ * addrssize Size of the addrs buffer
+ *
+ * Returns >=0 if ok, <0 errno code on error.
+ */
+static int __sctp_setsockopt_connectx(struct sock *sk,
+ struct sockaddr __user *addrs,
+ int addrs_size,
+ sctp_assoc_t *assoc_id)
+{
+ int err = 0;
+ struct sockaddr *kaddrs;
+
+ pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
+ __func__, sk, addrs, addrs_size);
+
+ if (unlikely(addrs_size <= 0))
+ return -EINVAL;
+
+ /* Check the user passed a healthy pointer. */
+ if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+ return -EFAULT;
+
+ /* Alloc space for the address array in kernel memory. */
+ kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ if (unlikely(!kaddrs))
+ return -ENOMEM;
+
+ if (__copy_from_user(kaddrs, addrs, addrs_size)) {
+ err = -EFAULT;
+ } else {
+ err = __sctp_connect(sk, kaddrs, addrs_size, assoc_id);
+ }
+
+ kfree(kaddrs);
+
+ return err;
+}
+
+/*
+ * This is an older interface. It's kept for backward compatibility
+ * to the option that doesn't provide association id.
+ */
+static int sctp_setsockopt_connectx_old(struct sock *sk,
+ struct sockaddr __user *addrs,
+ int addrs_size)
+{
+ return __sctp_setsockopt_connectx(sk, addrs, addrs_size, NULL);
+}
+
+/*
+ * New interface for the API. The since the API is done with a socket
+ * option, to make it simple we feed back the association id is as a return
+ * indication to the call. Error is always negative and association id is
+ * always positive.
+ */
+static int sctp_setsockopt_connectx(struct sock *sk,
+ struct sockaddr __user *addrs,
+ int addrs_size)
+{
+ sctp_assoc_t assoc_id = 0;
+ int err = 0;
+
+ err = __sctp_setsockopt_connectx(sk, addrs, addrs_size, &assoc_id);
+
+ if (err)
+ return err;
+ else
+ return assoc_id;
+}
+
+/*
+ * New (hopefully final) interface for the API.
+ * We use the sctp_getaddrs_old structure so that use-space library
+ * can avoid any unnecessary allocations. The only different part
+ * is that we store the actual length of the address buffer into the
+ * addrs_num structure member. That way we can re-use the existing
+ * code.
+ */
+#ifdef CONFIG_COMPAT
+struct compat_sctp_getaddrs_old {
+ sctp_assoc_t assoc_id;
+ s32 addr_num;
+ compat_uptr_t addrs; /* struct sockaddr * */
+};
+#endif
+
+static int sctp_getsockopt_connectx3(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_getaddrs_old param;
+ sctp_assoc_t assoc_id = 0;
+ int err = 0;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sctp_getaddrs_old param32;
+
+ if (len < sizeof(param32))
+ return -EINVAL;
+ if (copy_from_user(&param32, optval, sizeof(param32)))
+ return -EFAULT;
+
+ param.assoc_id = param32.assoc_id;
+ param.addr_num = param32.addr_num;
+ param.addrs = compat_ptr(param32.addrs);
+ } else
+#endif
+ {
+ if (len < sizeof(param))
+ return -EINVAL;
+ if (copy_from_user(&param, optval, sizeof(param)))
+ return -EFAULT;
+ }
+
+ err = __sctp_setsockopt_connectx(sk, (struct sockaddr __user *)
+ param.addrs, param.addr_num,
+ &assoc_id);
+ if (err == 0 || err == -EINPROGRESS) {
+ if (copy_to_user(optval, &assoc_id, sizeof(assoc_id)))
+ return -EFAULT;
+ if (put_user(sizeof(assoc_id), optlen))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+/* API 3.1.4 close() - UDP Style Syntax
+ * Applications use close() to perform graceful shutdown (as described in
+ * Section 10.1 of [SCTP]) on ALL the associations currently represented
+ * by a UDP-style socket.
+ *
+ * The syntax is
+ *
+ * ret = close(int sd);
+ *
+ * sd - the socket descriptor of the associations to be closed.
+ *
+ * To gracefully shutdown a specific association represented by the
+ * UDP-style socket, an application should use the sendmsg() call,
+ * passing no user data, but including the appropriate flag in the
+ * ancillary data (see Section xxxx).
+ *
+ * If sd in the close() call is a branched-off socket representing only
+ * one association, the shutdown is performed on that association only.
+ *
+ * 4.1.6 close() - TCP Style Syntax
+ *
+ * Applications use close() to gracefully close down an association.
+ *
+ * The syntax is:
+ *
+ * int close(int sd);
+ *
+ * sd - the socket descriptor of the association to be closed.
+ *
+ * After an application calls close() on a socket descriptor, no further
+ * socket operations will succeed on that descriptor.
+ *
+ * API 7.1.4 SO_LINGER
+ *
+ * An application using the TCP-style socket can use this option to
+ * perform the SCTP ABORT primitive. The linger option structure is:
+ *
+ * struct linger {
+ * int l_onoff; // option on/off
+ * int l_linger; // linger time
+ * };
+ *
+ * To enable the option, set l_onoff to 1. If the l_linger value is set
+ * to 0, calling close() is the same as the ABORT primitive. If the
+ * value is set to a negative value, the setsockopt() call will return
+ * an error. If the value is set to a positive value linger_time, the
+ * close() can be blocked for at most linger_time ms. If the graceful
+ * shutdown phase does not finish during this period, close() will
+ * return but the graceful shutdown phase continues in the system.
+ */
+static void sctp_close(struct sock *sk, long timeout)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc;
+ struct list_head *pos, *temp;
+ unsigned int data_was_unread;
+
+ pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout);
+
+ lock_sock(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sk->sk_state = SCTP_SS_CLOSING;
+
+ ep = sctp_sk(sk)->ep;
+
+ /* Clean up any skbs sitting on the receive queue. */
+ data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue);
+ data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby);
+
+ /* Walk all associations on an endpoint. */
+ list_for_each_safe(pos, temp, &ep->asocs) {
+ asoc = list_entry(pos, struct sctp_association, asocs);
+
+ if (sctp_style(sk, TCP)) {
+ /* A closed association can still be in the list if
+ * it belongs to a TCP-style listening socket that is
+ * not yet accepted. If so, free it. If not, send an
+ * ABORT or SHUTDOWN based on the linger options.
+ */
+ if (sctp_state(asoc, CLOSED)) {
+ sctp_unhash_established(asoc);
+ sctp_association_free(asoc);
+ continue;
+ }
+ }
+
+ if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
+ !skb_queue_empty(&asoc->ulpq.reasm) ||
+ (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
+ struct sctp_chunk *chunk;
+
+ chunk = sctp_make_abort_user(asoc, NULL, 0);
+ if (chunk)
+ sctp_primitive_ABORT(net, asoc, chunk);
+ } else
+ sctp_primitive_SHUTDOWN(net, asoc, NULL);
+ }
+
+ /* On a TCP-style socket, block for at most linger_time if set. */
+ if (sctp_style(sk, TCP) && timeout)
+ sctp_wait_for_close(sk, timeout);
+
+ /* This will run the backlog queue. */
+ release_sock(sk);
+
+ /* Supposedly, no process has access to the socket, but
+ * the net layers still may.
+ * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
+ * held and that should be grabbed before socket lock.
+ */
+ spin_lock_bh(&net->sctp.addr_wq_lock);
+ bh_lock_sock(sk);
+
+ /* Hold the sock, since sk_common_release() will put sock_put()
+ * and we have just a little more cleanup.
+ */
+ sock_hold(sk);
+ sk_common_release(sk);
+
+ bh_unlock_sock(sk);
+ spin_unlock_bh(&net->sctp.addr_wq_lock);
+
+ sock_put(sk);
+
+ SCTP_DBG_OBJCNT_DEC(sock);
+}
+
+/* Handle EPIPE error. */
+static int sctp_error(struct sock *sk, int flags, int err)
+{
+ if (err == -EPIPE)
+ err = sock_error(sk) ? : -EPIPE;
+ if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ return err;
+}
+
+/* API 3.1.3 sendmsg() - UDP Style Syntax
+ *
+ * An application uses sendmsg() and recvmsg() calls to transmit data to
+ * and receive data from its peer.
+ *
+ * ssize_t sendmsg(int socket, const struct msghdr *message,
+ * int flags);
+ *
+ * socket - the socket descriptor of the endpoint.
+ * message - pointer to the msghdr structure which contains a single
+ * user message and possibly some ancillary data.
+ *
+ * See Section 5 for complete description of the data
+ * structures.
+ *
+ * flags - flags sent or received with the user message, see Section
+ * 5 for complete description of the flags.
+ *
+ * Note: This function could use a rewrite especially when explicit
+ * connect support comes in.
+ */
+/* BUG: We do not implement the equivalent of sk_stream_wait_memory(). */
+
+static int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *);
+
+static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sctp_association *new_asoc = NULL, *asoc = NULL;
+ struct sctp_transport *transport, *chunk_tp;
+ struct sctp_chunk *chunk;
+ union sctp_addr to;
+ struct sockaddr *msg_name = NULL;
+ struct sctp_sndrcvinfo default_sinfo;
+ struct sctp_sndrcvinfo *sinfo;
+ struct sctp_initmsg *sinit;
+ sctp_assoc_t associd = 0;
+ sctp_cmsgs_t cmsgs = { NULL };
+ sctp_scope_t scope;
+ bool fill_sinfo_ttl = false, wait_connect = false;
+ struct sctp_datamsg *datamsg;
+ int msg_flags = msg->msg_flags;
+ __u16 sinfo_flags = 0;
+ long timeo;
+ int err;
+
+ err = 0;
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ pr_debug("%s: sk:%p, msg:%p, msg_len:%zu ep:%p\n", __func__, sk,
+ msg, msg_len, ep);
+
+ /* We cannot send a message over a TCP-style listening socket. */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
+ err = -EPIPE;
+ goto out_nounlock;
+ }
+
+ /* Parse out the SCTP CMSGs. */
+ err = sctp_msghdr_parse(msg, &cmsgs);
+ if (err) {
+ pr_debug("%s: msghdr parse err:%x\n", __func__, err);
+ goto out_nounlock;
+ }
+
+ /* Fetch the destination address for this packet. This
+ * address only selects the association--it is not necessarily
+ * the address we will send to.
+ * For a peeled-off socket, msg_name is ignored.
+ */
+ if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) {
+ int msg_namelen = msg->msg_namelen;
+
+ err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name,
+ msg_namelen);
+ if (err)
+ return err;
+
+ if (msg_namelen > sizeof(to))
+ msg_namelen = sizeof(to);
+ memcpy(&to, msg->msg_name, msg_namelen);
+ msg_name = msg->msg_name;
+ }
+
+ sinit = cmsgs.init;
+ if (cmsgs.sinfo != NULL) {
+ memset(&default_sinfo, 0, sizeof(default_sinfo));
+ default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid;
+ default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags;
+ default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid;
+ default_sinfo.sinfo_context = cmsgs.sinfo->snd_context;
+ default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id;
+
+ sinfo = &default_sinfo;
+ fill_sinfo_ttl = true;
+ } else {
+ sinfo = cmsgs.srinfo;
+ }
+ /* Did the user specify SNDINFO/SNDRCVINFO? */
+ if (sinfo) {
+ sinfo_flags = sinfo->sinfo_flags;
+ associd = sinfo->sinfo_assoc_id;
+ }
+
+ pr_debug("%s: msg_len:%zu, sinfo_flags:0x%x\n", __func__,
+ msg_len, sinfo_flags);
+
+ /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */
+ if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) {
+ err = -EINVAL;
+ goto out_nounlock;
+ }
+
+ /* If SCTP_EOF is set, no data can be sent. Disallow sending zero
+ * length messages when SCTP_EOF|SCTP_ABORT is not set.
+ * If SCTP_ABORT is set, the message length could be non zero with
+ * the msg_iov set to the user abort reason.
+ */
+ if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) ||
+ (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) {
+ err = -EINVAL;
+ goto out_nounlock;
+ }
+
+ /* If SCTP_ADDR_OVER is set, there must be an address
+ * specified in msg_name.
+ */
+ if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) {
+ err = -EINVAL;
+ goto out_nounlock;
+ }
+
+ transport = NULL;
+
+ pr_debug("%s: about to look up association\n", __func__);
+
+ lock_sock(sk);
+
+ /* If a msg_name has been specified, assume this is to be used. */
+ if (msg_name) {
+ /* Look for a matching association on the endpoint. */
+ asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+ if (!asoc) {
+ /* If we could not find a matching association on the
+ * endpoint, make sure that it is not a TCP-style
+ * socket that already has an association or there is
+ * no peeled-off association on another socket.
+ */
+ if ((sctp_style(sk, TCP) &&
+ sctp_sstate(sk, ESTABLISHED)) ||
+ sctp_endpoint_is_peeled_off(ep, &to)) {
+ err = -EADDRNOTAVAIL;
+ goto out_unlock;
+ }
+ }
+ } else {
+ asoc = sctp_id2assoc(sk, associd);
+ if (!asoc) {
+ err = -EPIPE;
+ goto out_unlock;
+ }
+ }
+
+ if (asoc) {
+ pr_debug("%s: just looked up association:%p\n", __func__, asoc);
+
+ /* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED
+ * socket that has an association in CLOSED state. This can
+ * happen when an accepted socket has an association that is
+ * already CLOSED.
+ */
+ if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) {
+ err = -EPIPE;
+ goto out_unlock;
+ }
+
+ if (sinfo_flags & SCTP_EOF) {
+ pr_debug("%s: shutting down association:%p\n",
+ __func__, asoc);
+
+ sctp_primitive_SHUTDOWN(net, asoc, NULL);
+ err = 0;
+ goto out_unlock;
+ }
+ if (sinfo_flags & SCTP_ABORT) {
+
+ chunk = sctp_make_abort_user(asoc, msg, msg_len);
+ if (!chunk) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ pr_debug("%s: aborting association:%p\n",
+ __func__, asoc);
+
+ sctp_primitive_ABORT(net, asoc, chunk);
+ err = 0;
+ goto out_unlock;
+ }
+ }
+
+ /* Do we need to create the association? */
+ if (!asoc) {
+ pr_debug("%s: there is no association yet\n", __func__);
+
+ if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Check for invalid stream against the stream counts,
+ * either the default or the user specified stream counts.
+ */
+ if (sinfo) {
+ if (!sinit || !sinit->sinit_num_ostreams) {
+ /* Check against the defaults. */
+ if (sinfo->sinfo_stream >=
+ sp->initmsg.sinit_num_ostreams) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ } else {
+ /* Check against the requested. */
+ if (sinfo->sinfo_stream >=
+ sinit->sinit_num_ostreams) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ }
+ }
+
+ /*
+ * API 3.1.2 bind() - UDP Style Syntax
+ * If a bind() or sctp_bindx() is not called prior to a
+ * sendmsg() call that initiates a new association, the
+ * system picks an ephemeral port and will choose an address
+ * set equivalent to binding with a wildcard address.
+ */
+ if (!ep->base.bind_addr.port) {
+ if (sctp_autobind(sk)) {
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+ } else {
+ /*
+ * If an unprivileged user inherits a one-to-many
+ * style socket with open associations on a privileged
+ * port, it MAY be permitted to accept new associations,
+ * but it SHOULD NOT be permitted to open new
+ * associations.
+ */
+ if (ep->base.bind_addr.port < PROT_SOCK &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
+ err = -EACCES;
+ goto out_unlock;
+ }
+ }
+
+ scope = sctp_scope(&to);
+ new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+ if (!new_asoc) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ asoc = new_asoc;
+ err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+ if (err < 0) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ /* If the SCTP_INIT ancillary data is specified, set all
+ * the association init values accordingly.
+ */
+ if (sinit) {
+ if (sinit->sinit_num_ostreams) {
+ asoc->c.sinit_num_ostreams =
+ sinit->sinit_num_ostreams;
+ }
+ if (sinit->sinit_max_instreams) {
+ asoc->c.sinit_max_instreams =
+ sinit->sinit_max_instreams;
+ }
+ if (sinit->sinit_max_attempts) {
+ asoc->max_init_attempts
+ = sinit->sinit_max_attempts;
+ }
+ if (sinit->sinit_max_init_timeo) {
+ asoc->max_init_timeo =
+ msecs_to_jiffies(sinit->sinit_max_init_timeo);
+ }
+ }
+
+ /* Prime the peer's transport structures. */
+ transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
+ if (!transport) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ }
+
+ /* ASSERT: we have a valid association at this point. */
+ pr_debug("%s: we have a valid association\n", __func__);
+
+ if (!sinfo) {
+ /* If the user didn't specify SNDINFO/SNDRCVINFO, make up
+ * one with some defaults.
+ */
+ memset(&default_sinfo, 0, sizeof(default_sinfo));
+ default_sinfo.sinfo_stream = asoc->default_stream;
+ default_sinfo.sinfo_flags = asoc->default_flags;
+ default_sinfo.sinfo_ppid = asoc->default_ppid;
+ default_sinfo.sinfo_context = asoc->default_context;
+ default_sinfo.sinfo_timetolive = asoc->default_timetolive;
+ default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc);
+
+ sinfo = &default_sinfo;
+ } else if (fill_sinfo_ttl) {
+ /* In case SNDINFO was specified, we still need to fill
+ * it with a default ttl from the assoc here.
+ */
+ sinfo->sinfo_timetolive = asoc->default_timetolive;
+ }
+
+ /* API 7.1.7, the sndbuf size per association bounds the
+ * maximum size of data that can be sent in a single send call.
+ */
+ if (msg_len > sk->sk_sndbuf) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+
+ if (asoc->pmtu_pending)
+ sctp_assoc_pending_pmtu(sk, asoc);
+
+ /* If fragmentation is disabled and the message length exceeds the
+ * association fragmentation point, return EMSGSIZE. The I-D
+ * does not specify what this error is, but this looks like
+ * a great fit.
+ */
+ if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) {
+ err = -EMSGSIZE;
+ goto out_free;
+ }
+
+ /* Check for invalid stream. */
+ if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ if (!sctp_wspace(asoc)) {
+ err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+ if (err)
+ goto out_free;
+ }
+
+ /* If an address is passed with the sendto/sendmsg call, it is used
+ * to override the primary destination address in the TCP model, or
+ * when SCTP_ADDR_OVER flag is set in the UDP model.
+ */
+ if ((sctp_style(sk, TCP) && msg_name) ||
+ (sinfo_flags & SCTP_ADDR_OVER)) {
+ chunk_tp = sctp_assoc_lookup_paddr(asoc, &to);
+ if (!chunk_tp) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ } else
+ chunk_tp = NULL;
+
+ /* Auto-connect, if we aren't connected already. */
+ if (sctp_state(asoc, CLOSED)) {
+ err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
+ if (err < 0)
+ goto out_free;
+
+ wait_connect = true;
+ pr_debug("%s: we associated primitively\n", __func__);
+ }
+
+ /* Break the message into multiple chunks of maximum size. */
+ datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
+ if (IS_ERR(datamsg)) {
+ err = PTR_ERR(datamsg);
+ goto out_free;
+ }
+
+ /* Now send the (possibly) fragmented message. */
+ list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
+ sctp_chunk_hold(chunk);
+
+ /* Do accounting for the write space. */
+ sctp_set_owner_w(chunk);
+
+ chunk->transport = chunk_tp;
+ }
+
+ /* Send it to the lower layers. Note: all chunks
+ * must either fail or succeed. The lower layer
+ * works that way today. Keep it that way or this
+ * breaks.
+ */
+ err = sctp_primitive_SEND(net, asoc, datamsg);
+ /* Did the lower layer accept the chunk? */
+ if (err) {
+ sctp_datamsg_free(datamsg);
+ goto out_free;
+ }
+
+ pr_debug("%s: we sent primitively\n", __func__);
+
+ sctp_datamsg_put(datamsg);
+ err = msg_len;
+
+ if (unlikely(wait_connect)) {
+ timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT);
+ sctp_wait_for_connect(asoc, &timeo);
+ }
+
+ /* If we are already past ASSOCIATE, the lower
+ * layers are responsible for association cleanup.
+ */
+ goto out_unlock;
+
+out_free:
+ if (new_asoc) {
+ sctp_unhash_established(asoc);
+ sctp_association_free(asoc);
+ }
+out_unlock:
+ release_sock(sk);
+
+out_nounlock:
+ return sctp_error(sk, msg_flags, err);
+
+#if 0
+do_sock_err:
+ if (msg_len)
+ err = msg_len;
+ else
+ err = sock_error(sk);
+ goto out;
+
+do_interrupted:
+ if (msg_len)
+ err = msg_len;
+ goto out;
+#endif /* 0 */
+}
+
+/* This is an extended version of skb_pull() that removes the data from the
+ * start of a skb even when data is spread across the list of skb's in the
+ * frag_list. len specifies the total amount of data that needs to be removed.
+ * when 'len' bytes could be removed from the skb, it returns 0.
+ * If 'len' exceeds the total skb length, it returns the no. of bytes that
+ * could not be removed.
+ */
+static int sctp_skb_pull(struct sk_buff *skb, int len)
+{
+ struct sk_buff *list;
+ int skb_len = skb_headlen(skb);
+ int rlen;
+
+ if (len <= skb_len) {
+ __skb_pull(skb, len);
+ return 0;
+ }
+ len -= skb_len;
+ __skb_pull(skb, skb_len);
+
+ skb_walk_frags(skb, list) {
+ rlen = sctp_skb_pull(list, len);
+ skb->len -= (len-rlen);
+ skb->data_len -= (len-rlen);
+
+ if (!rlen)
+ return 0;
+
+ len = rlen;
+ }
+
+ return len;
+}
+
+/* API 3.1.3 recvmsg() - UDP Style Syntax
+ *
+ * ssize_t recvmsg(int socket, struct msghdr *message,
+ * int flags);
+ *
+ * socket - the socket descriptor of the endpoint.
+ * message - pointer to the msghdr structure which contains a single
+ * user message and possibly some ancillary data.
+ *
+ * See Section 5 for complete description of the data
+ * structures.
+ *
+ * flags - flags sent or received with the user message, see Section
+ * 5 for complete description of the flags.
+ */
+static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
+{
+ struct sctp_ulpevent *event = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sk_buff *skb;
+ int copied;
+ int err = 0;
+ int skb_len;
+
+ pr_debug("%s: sk:%p, msghdr:%p, len:%zd, noblock:%d, flags:0x%x, "
+ "addr_len:%p)\n", __func__, sk, msg, len, noblock, flags,
+ addr_len);
+
+ lock_sock(sk);
+
+ if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ skb = sctp_skb_recv_datagram(sk, flags, noblock, &err);
+ if (!skb)
+ goto out;
+
+ /* Get the total length of the skb including any skb's in the
+ * frag_list.
+ */
+ skb_len = skb->len;
+
+ copied = skb_len;
+ if (copied > len)
+ copied = len;
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+
+ event = sctp_skb2event(skb);
+
+ if (err)
+ goto out_free;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+ if (sctp_ulpevent_is_notification(event)) {
+ msg->msg_flags |= MSG_NOTIFICATION;
+ sp->pf->event_msgname(event, msg->msg_name, addr_len);
+ } else {
+ sp->pf->skb_msgname(skb, msg->msg_name, addr_len);
+ }
+
+ /* Check if we allow SCTP_NXTINFO. */
+ if (sp->recvnxtinfo)
+ sctp_ulpevent_read_nxtinfo(event, msg, sk);
+ /* Check if we allow SCTP_RCVINFO. */
+ if (sp->recvrcvinfo)
+ sctp_ulpevent_read_rcvinfo(event, msg);
+ /* Check if we allow SCTP_SNDRCVINFO. */
+ if (sp->subscribe.sctp_data_io_event)
+ sctp_ulpevent_read_sndrcvinfo(event, msg);
+
+#if 0
+ /* FIXME: we should be calling IP/IPv6 layers. */
+ if (sk->sk_protinfo.af_inet.cmsg_flags)
+ ip_cmsg_recv(msg, skb);
+#endif
+
+ err = copied;
+
+ /* If skb's length exceeds the user's buffer, update the skb and
+ * push it back to the receive_queue so that the next call to
+ * recvmsg() will return the remaining data. Don't set MSG_EOR.
+ */
+ if (skb_len > copied) {
+ msg->msg_flags &= ~MSG_EOR;
+ if (flags & MSG_PEEK)
+ goto out_free;
+ sctp_skb_pull(skb, copied);
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ /* When only partial message is copied to the user, increase
+ * rwnd by that amount. If all the data in the skb is read,
+ * rwnd is updated when the event is freed.
+ */
+ if (!sctp_ulpevent_is_notification(event))
+ sctp_assoc_rwnd_increase(event->asoc, copied);
+ goto out;
+ } else if ((event->msg_flags & MSG_NOTIFICATION) ||
+ (event->msg_flags & MSG_EOR))
+ msg->msg_flags |= MSG_EOR;
+ else
+ msg->msg_flags &= ~MSG_EOR;
+
+out_free:
+ if (flags & MSG_PEEK) {
+ /* Release the skb reference acquired after peeking the skb in
+ * sctp_skb_recv_datagram().
+ */
+ kfree_skb(skb);
+ } else {
+ /* Free the event which includes releasing the reference to
+ * the owner of the skb, freeing the skb and updating the
+ * rwnd.
+ */
+ sctp_ulpevent_free(event);
+ }
+out:
+ release_sock(sk);
+ return err;
+}
+
+/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
+ *
+ * This option is a on/off flag. If enabled no SCTP message
+ * fragmentation will be performed. Instead if a message being sent
+ * exceeds the current PMTU size, the message will NOT be sent and
+ * instead a error will be indicated to the user.
+ */
+static int sctp_setsockopt_disable_fragments(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->disable_fragments = (val == 0) ? 0 : 1;
+
+ return 0;
+}
+
+static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_association *asoc;
+ struct sctp_ulpevent *event;
+
+ if (optlen > sizeof(struct sctp_event_subscribe))
+ return -EINVAL;
+ if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
+ return -EFAULT;
+
+ if (sctp_sk(sk)->subscribe.sctp_data_io_event)
+ pr_warn_ratelimited(DEPRECATED "%s (pid %d) "
+ "Requested SCTP_SNDRCVINFO event.\n"
+ "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n",
+ current->comm, task_pid_nr(current));
+
+ /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
+ * if there is no data to be sent or retransmit, the stack will
+ * immediately send up this notification.
+ */
+ if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT,
+ &sctp_sk(sk)->subscribe)) {
+ asoc = sctp_id2assoc(sk, 0);
+
+ if (asoc && sctp_outq_is_empty(&asoc->outqueue)) {
+ event = sctp_ulpevent_make_sender_dry_event(asoc,
+ GFP_ATOMIC);
+ if (!event)
+ return -ENOMEM;
+
+ sctp_ulpq_tail_event(&asoc->ulpq, event);
+ }
+ }
+
+ return 0;
+}
+
+/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
+ *
+ * This socket option is applicable to the UDP-style socket only. When
+ * set it will cause associations that are idle for more than the
+ * specified number of seconds to automatically close. An association
+ * being idle is defined an association that has NOT sent or received
+ * user data. The special value of '0' indicates that no automatic
+ * close of any associations should be performed. The option expects an
+ * integer defining the number of seconds of idle time before an
+ * association is closed.
+ */
+static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct net *net = sock_net(sk);
+
+ /* Applicable to UDP-style socket only */
+ if (sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ if (copy_from_user(&sp->autoclose, optval, optlen))
+ return -EFAULT;
+
+ if (sp->autoclose > net->sctp.max_autoclose)
+ sp->autoclose = net->sctp.max_autoclose;
+
+ return 0;
+}
+
+/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
+ *
+ * Applications can enable or disable heartbeats for any peer address of
+ * an association, modify an address's heartbeat interval, force a
+ * heartbeat to be sent immediately, and adjust the address's maximum
+ * number of retransmissions sent before an address is considered
+ * unreachable. The following structure is used to access and modify an
+ * address's parameters:
+ *
+ * struct sctp_paddrparams {
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
+ * spp_address - This specifies which address is of interest.
+ * spp_hbinterval - This contains the value of the heartbeat interval,
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmaxrxt - This contains the maximum number of
+ * retransmissions before this address shall be
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_HB_TIME_IS_ZERO - Specify's that the time for
+ * heartbeat delayis to be set to the value of 0
+ * milliseconds.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
+ */
+static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+ struct sctp_transport *trans,
+ struct sctp_association *asoc,
+ struct sctp_sock *sp,
+ int hb_change,
+ int pmtud_change,
+ int sackdelay_change)
+{
+ int error;
+
+ if (params->spp_flags & SPP_HB_DEMAND && trans) {
+ struct net *net = sock_net(trans->asoc->base.sk);
+
+ error = sctp_primitive_REQUESTHEARTBEAT(net, trans->asoc, trans);
+ if (error)
+ return error;
+ }
+
+ /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of
+ * this field is ignored. Note also that a value of zero indicates
+ * the current setting should be left unchanged.
+ */
+ if (params->spp_flags & SPP_HB_ENABLE) {
+
+ /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is
+ * set. This lets us use 0 value when this flag
+ * is set.
+ */
+ if (params->spp_flags & SPP_HB_TIME_IS_ZERO)
+ params->spp_hbinterval = 0;
+
+ if (params->spp_hbinterval ||
+ (params->spp_flags & SPP_HB_TIME_IS_ZERO)) {
+ if (trans) {
+ trans->hbinterval =
+ msecs_to_jiffies(params->spp_hbinterval);
+ } else if (asoc) {
+ asoc->hbinterval =
+ msecs_to_jiffies(params->spp_hbinterval);
+ } else {
+ sp->hbinterval = params->spp_hbinterval;
+ }
+ }
+ }
+
+ if (hb_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_HB) | hb_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_HB) | hb_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_HB) | hb_change;
+ }
+ }
+
+ /* When Path MTU discovery is disabled the value specified here will
+ * be the "fixed" path mtu (i.e. the value of the spp_flags field must
+ * include the flag SPP_PMTUD_DISABLE for this field to have any
+ * effect).
+ */
+ if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
+ if (trans) {
+ trans->pathmtu = params->spp_pathmtu;
+ sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+ } else if (asoc) {
+ asoc->pathmtu = params->spp_pathmtu;
+ sctp_frag_point(asoc, params->spp_pathmtu);
+ } else {
+ sp->pathmtu = params->spp_pathmtu;
+ }
+ }
+
+ if (pmtud_change) {
+ if (trans) {
+ int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
+ (params->spp_flags & SPP_PMTUD_ENABLE);
+ trans->param_flags =
+ (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
+ if (update) {
+ sctp_transport_pmtu(trans, sctp_opt2sk(sp));
+ sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
+ }
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
+ }
+ }
+
+ /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the
+ * value of this field is ignored. Note also that a value of zero
+ * indicates the current setting should be left unchanged.
+ */
+ if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) {
+ if (trans) {
+ trans->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params->spp_sackdelay);
+ } else {
+ sp->sackdelay = params->spp_sackdelay;
+ }
+ }
+
+ if (sackdelay_change) {
+ if (trans) {
+ trans->param_flags =
+ (trans->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else if (asoc) {
+ asoc->param_flags =
+ (asoc->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ } else {
+ sp->param_flags =
+ (sp->param_flags & ~SPP_SACKDELAY) |
+ sackdelay_change;
+ }
+ }
+
+ /* Note that a value of zero indicates the current setting should be
+ left unchanged.
+ */
+ if (params->spp_pathmaxrxt) {
+ if (trans) {
+ trans->pathmaxrxt = params->spp_pathmaxrxt;
+ } else if (asoc) {
+ asoc->pathmaxrxt = params->spp_pathmaxrxt;
+ } else {
+ sp->pathmaxrxt = params->spp_pathmaxrxt;
+ }
+ }
+
+ return 0;
+}
+
+static int sctp_setsockopt_peer_addr_params(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+ int error;
+ int hb_change, pmtud_change, sackdelay_change;
+
+ if (optlen != sizeof(struct sctp_paddrparams))
+ return -EINVAL;
+
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ /* Validate flags and value parameters. */
+ hb_change = params.spp_flags & SPP_HB;
+ pmtud_change = params.spp_flags & SPP_PMTUD;
+ sackdelay_change = params.spp_flags & SPP_SACKDELAY;
+
+ if (hb_change == SPP_HB ||
+ pmtud_change == SPP_PMTUD ||
+ sackdelay_change == SPP_SACKDELAY ||
+ params.spp_sackdelay > 500 ||
+ (params.spp_pathmtu &&
+ params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
+ return -EINVAL;
+
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans)
+ return -EINVAL;
+ }
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* Heartbeat demand can only be sent on a transport or
+ * association, but not a socket.
+ */
+ if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
+ return -EINVAL;
+
+ /* Process parameters. */
+ error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+
+ if (error)
+ return error;
+
+ /* If changes are for association, also apply parameters to each
+ * transport.
+ */
+ if (!trans && asoc) {
+ list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+ transports) {
+ sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+ hb_change, pmtud_change,
+ sackdelay_change);
+ }
+ }
+
+ return 0;
+}
+
+static inline __u32 sctp_spp_sackdelay_enable(__u32 param_flags)
+{
+ return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_ENABLE;
+}
+
+static inline __u32 sctp_spp_sackdelay_disable(__u32 param_flags)
+{
+ return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_DISABLE;
+}
+
+/*
+ * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK)
+ *
+ * This option will effect the way delayed acks are performed. This
+ * option allows you to get or set the delayed ack time, in
+ * milliseconds. It also allows changing the delayed ack frequency.
+ * Changing the frequency to 1 disables the delayed sack algorithm. If
+ * the assoc_id is 0, then this sets or gets the endpoints default
+ * values. If the assoc_id field is non-zero, then the set or get
+ * effects the specified association for the one to many model (the
+ * assoc_id field is ignored by the one to one model). Note that if
+ * sack_delay or sack_freq are 0 when setting this option, then the
+ * current values will remain unchanged.
+ *
+ * struct sctp_sack_info {
+ * sctp_assoc_t sack_assoc_id;
+ * uint32_t sack_delay;
+ * uint32_t sack_freq;
+ * };
+ *
+ * sack_assoc_id - This parameter, indicates which association the user
+ * is performing an action upon. Note that if this field's value is
+ * zero then the endpoints default value is changed (effecting future
+ * associations only).
+ *
+ * sack_delay - This parameter contains the number of milliseconds that
+ * the user is requesting the delayed ACK timer be set to. Note that
+ * this value is defined in the standard to be between 200 and 500
+ * milliseconds.
+ *
+ * sack_freq - This parameter contains the number of packets that must
+ * be received before a sack is sent without waiting for the delay
+ * timer to expire. The default value for this is 2, setting this
+ * value to 1 will disable the delayed sack algorithm.
+ */
+
+static int sctp_setsockopt_delayed_ack(struct sock *sk,
+ char __user *optval, unsigned int optlen)
+{
+ struct sctp_sack_info params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen == sizeof(struct sctp_sack_info)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ if (params.sack_delay == 0 && params.sack_freq == 0)
+ return 0;
+ } else if (optlen == sizeof(struct sctp_assoc_value)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of struct sctp_assoc_value in delayed_ack socket option.\n"
+ "Use struct sctp_sack_info instead\n",
+ current->comm, task_pid_nr(current));
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ if (params.sack_delay == 0)
+ params.sack_freq = 1;
+ else
+ params.sack_freq = 0;
+ } else
+ return -EINVAL;
+
+ /* Validate value parameter. */
+ if (params.sack_delay > 500)
+ return -EINVAL;
+
+ /* Get association, if sack_assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.sack_assoc_id);
+ if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (params.sack_delay) {
+ if (asoc) {
+ asoc->sackdelay =
+ msecs_to_jiffies(params.sack_delay);
+ asoc->param_flags =
+ sctp_spp_sackdelay_enable(asoc->param_flags);
+ } else {
+ sp->sackdelay = params.sack_delay;
+ sp->param_flags =
+ sctp_spp_sackdelay_enable(sp->param_flags);
+ }
+ }
+
+ if (params.sack_freq == 1) {
+ if (asoc) {
+ asoc->param_flags =
+ sctp_spp_sackdelay_disable(asoc->param_flags);
+ } else {
+ sp->param_flags =
+ sctp_spp_sackdelay_disable(sp->param_flags);
+ }
+ } else if (params.sack_freq > 1) {
+ if (asoc) {
+ asoc->sackfreq = params.sack_freq;
+ asoc->param_flags =
+ sctp_spp_sackdelay_enable(asoc->param_flags);
+ } else {
+ sp->sackfreq = params.sack_freq;
+ sp->param_flags =
+ sctp_spp_sackdelay_enable(sp->param_flags);
+ }
+ }
+
+ /* If change is for association, also apply to each transport. */
+ if (asoc) {
+ list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+ transports) {
+ if (params.sack_delay) {
+ trans->sackdelay =
+ msecs_to_jiffies(params.sack_delay);
+ trans->param_flags =
+ sctp_spp_sackdelay_enable(trans->param_flags);
+ }
+ if (params.sack_freq == 1) {
+ trans->param_flags =
+ sctp_spp_sackdelay_disable(trans->param_flags);
+ } else if (params.sack_freq > 1) {
+ trans->sackfreq = params.sack_freq;
+ trans->param_flags =
+ sctp_spp_sackdelay_enable(trans->param_flags);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
+ *
+ * Applications can specify protocol parameters for the default association
+ * initialization. The option name argument to setsockopt() and getsockopt()
+ * is SCTP_INITMSG.
+ *
+ * Setting initialization parameters is effective only on an unconnected
+ * socket (for UDP-style sockets only future associations are effected
+ * by the change). With TCP-style sockets, this option is inherited by
+ * sockets derived from a listener socket.
+ */
+static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+ struct sctp_initmsg sinit;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen != sizeof(struct sctp_initmsg))
+ return -EINVAL;
+ if (copy_from_user(&sinit, optval, optlen))
+ return -EFAULT;
+
+ if (sinit.sinit_num_ostreams)
+ sp->initmsg.sinit_num_ostreams = sinit.sinit_num_ostreams;
+ if (sinit.sinit_max_instreams)
+ sp->initmsg.sinit_max_instreams = sinit.sinit_max_instreams;
+ if (sinit.sinit_max_attempts)
+ sp->initmsg.sinit_max_attempts = sinit.sinit_max_attempts;
+ if (sinit.sinit_max_init_timeo)
+ sp->initmsg.sinit_max_init_timeo = sinit.sinit_max_init_timeo;
+
+ return 0;
+}
+
+/*
+ * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
+ *
+ * Applications that wish to use the sendto() system call may wish to
+ * specify a default set of parameters that would normally be supplied
+ * through the inclusion of ancillary data. This socket option allows
+ * such an application to set the default sctp_sndrcvinfo structure.
+ * The application that wishes to use this socket option simply passes
+ * in to this call the sctp_sndrcvinfo structure defined in Section
+ * 5.2.2) The input parameters accepted by this call include
+ * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context,
+ * sinfo_timetolive. The user must provide the sinfo_assoc_id field in
+ * to this call if the caller is using the UDP model.
+ */
+static int sctp_setsockopt_default_send_param(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ struct sctp_sndrcvinfo info;
+
+ if (optlen != sizeof(info))
+ return -EINVAL;
+ if (copy_from_user(&info, optval, optlen))
+ return -EFAULT;
+ if (info.sinfo_flags &
+ ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_ABORT | SCTP_EOF))
+ return -EINVAL;
+
+ asoc = sctp_id2assoc(sk, info.sinfo_assoc_id);
+ if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+ if (asoc) {
+ asoc->default_stream = info.sinfo_stream;
+ asoc->default_flags = info.sinfo_flags;
+ asoc->default_ppid = info.sinfo_ppid;
+ asoc->default_context = info.sinfo_context;
+ asoc->default_timetolive = info.sinfo_timetolive;
+ } else {
+ sp->default_stream = info.sinfo_stream;
+ sp->default_flags = info.sinfo_flags;
+ sp->default_ppid = info.sinfo_ppid;
+ sp->default_context = info.sinfo_context;
+ sp->default_timetolive = info.sinfo_timetolive;
+ }
+
+ return 0;
+}
+
+/* RFC6458, Section 8.1.31. Set/get Default Send Parameters
+ * (SCTP_DEFAULT_SNDINFO)
+ */
+static int sctp_setsockopt_default_sndinfo(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ struct sctp_sndinfo info;
+
+ if (optlen != sizeof(info))
+ return -EINVAL;
+ if (copy_from_user(&info, optval, optlen))
+ return -EFAULT;
+ if (info.snd_flags &
+ ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_ABORT | SCTP_EOF))
+ return -EINVAL;
+
+ asoc = sctp_id2assoc(sk, info.snd_assoc_id);
+ if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+ if (asoc) {
+ asoc->default_stream = info.snd_sid;
+ asoc->default_flags = info.snd_flags;
+ asoc->default_ppid = info.snd_ppid;
+ asoc->default_context = info.snd_context;
+ } else {
+ sp->default_stream = info.snd_sid;
+ sp->default_flags = info.snd_flags;
+ sp->default_ppid = info.snd_ppid;
+ sp->default_context = info.snd_context;
+ }
+
+ return 0;
+}
+
+/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
+ *
+ * Requests that the local SCTP stack use the enclosed peer address as
+ * the association primary. The enclosed address must be one of the
+ * association peer's addresses.
+ */
+static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_prim prim;
+ struct sctp_transport *trans;
+
+ if (optlen != sizeof(struct sctp_prim))
+ return -EINVAL;
+
+ if (copy_from_user(&prim, optval, sizeof(struct sctp_prim)))
+ return -EFAULT;
+
+ trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id);
+ if (!trans)
+ return -EINVAL;
+
+ sctp_assoc_set_primary(trans->asoc, trans);
+
+ return 0;
+}
+
+/*
+ * 7.1.5 SCTP_NODELAY
+ *
+ * Turn on/off any Nagle-like algorithm. This means that packets are
+ * generally sent as soon as possible and no unnecessary delays are
+ * introduced, at the cost of more packets in the network. Expects an
+ * integer boolean flag.
+ */
+static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1;
+ return 0;
+}
+
+/*
+ *
+ * 7.1.1 SCTP_RTOINFO
+ *
+ * The protocol parameters used to initialize and bound retransmission
+ * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
+ * and modify these parameters.
+ * All parameters are time values, in milliseconds. A value of 0, when
+ * modifying the parameters, indicates that the current value should not
+ * be changed.
+ *
+ */
+static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+ struct sctp_rtoinfo rtoinfo;
+ struct sctp_association *asoc;
+ unsigned long rto_min, rto_max;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen != sizeof (struct sctp_rtoinfo))
+ return -EINVAL;
+
+ if (copy_from_user(&rtoinfo, optval, optlen))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id);
+
+ /* Set the values to the specific association */
+ if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ rto_max = rtoinfo.srto_max;
+ rto_min = rtoinfo.srto_min;
+
+ if (rto_max)
+ rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max;
+ else
+ rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
+
+ if (rto_min)
+ rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min;
+ else
+ rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min;
+
+ if (rto_min > rto_max)
+ return -EINVAL;
+
+ if (asoc) {
+ if (rtoinfo.srto_initial != 0)
+ asoc->rto_initial =
+ msecs_to_jiffies(rtoinfo.srto_initial);
+ asoc->rto_max = rto_max;
+ asoc->rto_min = rto_min;
+ } else {
+ /* If there is no association or the association-id = 0
+ * set the values to the endpoint.
+ */
+ if (rtoinfo.srto_initial != 0)
+ sp->rtoinfo.srto_initial = rtoinfo.srto_initial;
+ sp->rtoinfo.srto_max = rto_max;
+ sp->rtoinfo.srto_min = rto_min;
+ }
+
+ return 0;
+}
+
+/*
+ *
+ * 7.1.2 SCTP_ASSOCINFO
+ *
+ * This option is used to tune the maximum retransmission attempts
+ * of the association.
+ * Returns an error if the new association retransmission value is
+ * greater than the sum of the retransmission value of the peer.
+ * See [SCTP] for more information.
+ *
+ */
+static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+
+ struct sctp_assocparams assocparams;
+ struct sctp_association *asoc;
+
+ if (optlen != sizeof(struct sctp_assocparams))
+ return -EINVAL;
+ if (copy_from_user(&assocparams, optval, optlen))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id);
+
+ if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* Set the values to the specific association */
+ if (asoc) {
+ if (assocparams.sasoc_asocmaxrxt != 0) {
+ __u32 path_sum = 0;
+ int paths = 0;
+ struct sctp_transport *peer_addr;
+
+ list_for_each_entry(peer_addr, &asoc->peer.transport_addr_list,
+ transports) {
+ path_sum += peer_addr->pathmaxrxt;
+ paths++;
+ }
+
+ /* Only validate asocmaxrxt if we have more than
+ * one path/transport. We do this because path
+ * retransmissions are only counted when we have more
+ * then one path.
+ */
+ if (paths > 1 &&
+ assocparams.sasoc_asocmaxrxt > path_sum)
+ return -EINVAL;
+
+ asoc->max_retrans = assocparams.sasoc_asocmaxrxt;
+ }
+
+ if (assocparams.sasoc_cookie_life != 0)
+ asoc->cookie_life = ms_to_ktime(assocparams.sasoc_cookie_life);
+ } else {
+ /* Set the values to the endpoint */
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (assocparams.sasoc_asocmaxrxt != 0)
+ sp->assocparams.sasoc_asocmaxrxt =
+ assocparams.sasoc_asocmaxrxt;
+ if (assocparams.sasoc_cookie_life != 0)
+ sp->assocparams.sasoc_cookie_life =
+ assocparams.sasoc_cookie_life;
+ }
+ return 0;
+}
+
+/*
+ * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
+ *
+ * This socket option is a boolean flag which turns on or off mapped V4
+ * addresses. If this option is turned on and the socket is type
+ * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
+ * If this option is turned off, then no mapping will be done of V4
+ * addresses and a user will receive both PF_INET6 and PF_INET type
+ * addresses on the socket.
+ */
+static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+ int val;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+ if (val)
+ sp->v4mapped = 1;
+ else
+ sp->v4mapped = 0;
+
+ return 0;
+}
+
+/*
+ * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
+ * This option will get or set the maximum size to put in any outgoing
+ * SCTP DATA chunk. If a message is larger than this size it will be
+ * fragmented by SCTP into the specified size. Note that the underlying
+ * SCTP implementation may fragment into smaller sized chunks when the
+ * PMTU of the underlying association is smaller than the value set by
+ * the user. The default value for this option is '0' which indicates
+ * the user is NOT limiting fragmentation and only the PMTU will effect
+ * SCTP's choice of DATA chunk size. Note also that values set larger
+ * than the maximum size of an IP datagram will effectively let SCTP
+ * control fragmentation (i.e. the same as setting this option to 0).
+ *
+ * The following structure is used to access and modify this parameter:
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id: This parameter is ignored for one-to-one style sockets.
+ * For one-to-many style sockets this parameter indicates which
+ * association the user is performing an action upon. Note that if
+ * this field's value is zero then the endpoints default value is
+ * changed (effecting future associations only).
+ * assoc_value: This parameter specifies the maximum size in bytes.
+ */
+static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ struct sctp_sock *sp = sctp_sk(sk);
+ int val;
+
+ if (optlen == sizeof(int)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of int in maxseg socket option.\n"
+ "Use struct sctp_assoc_value instead\n",
+ current->comm, task_pid_nr(current));
+ if (copy_from_user(&val, optval, optlen))
+ return -EFAULT;
+ params.assoc_id = 0;
+ } else if (optlen == sizeof(struct sctp_assoc_value)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ val = params.assoc_value;
+ } else
+ return -EINVAL;
+
+ if ((val != 0) && ((val < 8) || (val > SCTP_MAX_CHUNK_LEN)))
+ return -EINVAL;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc) {
+ if (val == 0) {
+ val = asoc->pathmtu;
+ val -= sp->pf->af->net_header_len;
+ val -= sizeof(struct sctphdr) +
+ sizeof(struct sctp_data_chunk);
+ }
+ asoc->user_frag = val;
+ asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+ } else {
+ sp->user_frag = val;
+ }
+
+ return 0;
+}
+
+
+/*
+ * 7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR)
+ *
+ * Requests that the peer mark the enclosed address as the association
+ * primary. The enclosed address must be one of the association's
+ * locally bound addresses. The following structure is used to make a
+ * set primary request:
+ */
+static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+ struct sctp_association *asoc = NULL;
+ struct sctp_setpeerprim prim;
+ struct sctp_chunk *chunk;
+ struct sctp_af *af;
+ int err;
+
+ sp = sctp_sk(sk);
+
+ if (!net->sctp.addip_enable)
+ return -EPERM;
+
+ if (optlen != sizeof(struct sctp_setpeerprim))
+ return -EINVAL;
+
+ if (copy_from_user(&prim, optval, optlen))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, prim.sspp_assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ if (!asoc->peer.asconf_capable)
+ return -EPERM;
+
+ if (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY)
+ return -EPERM;
+
+ if (!sctp_state(asoc, ESTABLISHED))
+ return -ENOTCONN;
+
+ af = sctp_get_af_specific(prim.sspp_addr.ss_family);
+ if (!af)
+ return -EINVAL;
+
+ if (!af->addr_valid((union sctp_addr *)&prim.sspp_addr, sp, NULL))
+ return -EADDRNOTAVAIL;
+
+ if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr))
+ return -EADDRNOTAVAIL;
+
+ /* Create an ASCONF chunk with SET_PRIMARY parameter */
+ chunk = sctp_make_asconf_set_prim(asoc,
+ (union sctp_addr *)&prim.sspp_addr);
+ if (!chunk)
+ return -ENOMEM;
+
+ err = sctp_send_asconf(asoc, chunk);
+
+ pr_debug("%s: we set peer primary addr primitively\n", __func__);
+
+ return err;
+}
+
+static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_setadaptation adaptation;
+
+ if (optlen != sizeof(struct sctp_setadaptation))
+ return -EINVAL;
+ if (copy_from_user(&adaptation, optval, optlen))
+ return -EFAULT;
+
+ sctp_sk(sk)->adaptation_ind = adaptation.ssb_adaptation_ind;
+
+ return 0;
+}
+
+/*
+ * 7.1.29. Set or Get the default context (SCTP_CONTEXT)
+ *
+ * The context field in the sctp_sndrcvinfo structure is normally only
+ * used when a failed message is retrieved holding the value that was
+ * sent down on the actual send call. This option allows the setting of
+ * a default context on an association basis that will be received on
+ * reading messages from the peer. This is especially helpful in the
+ * one-2-many model for an application to keep some reference to an
+ * internal state machine that is processing messages on the
+ * association. Note that the setting of this value only effects
+ * received messages from the peer and does not effect the value that is
+ * saved with outbound messages.
+ */
+static int sctp_setsockopt_context(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_sock *sp;
+ struct sctp_association *asoc;
+
+ if (optlen != sizeof(struct sctp_assoc_value))
+ return -EINVAL;
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+
+ sp = sctp_sk(sk);
+
+ if (params.assoc_id != 0) {
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+ asoc->default_rcv_context = params.assoc_value;
+ } else {
+ sp->default_rcv_context = params.assoc_value;
+ }
+
+ return 0;
+}
+
+/*
+ * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
+ *
+ * This options will at a minimum specify if the implementation is doing
+ * fragmented interleave. Fragmented interleave, for a one to many
+ * socket, is when subsequent calls to receive a message may return
+ * parts of messages from different associations. Some implementations
+ * may allow you to turn this value on or off. If so, when turned off,
+ * no fragment interleave will occur (which will cause a head of line
+ * blocking amongst multiple associations sharing the same one to many
+ * socket). When this option is turned on, then each receive call may
+ * come from a different association (thus the user must receive data
+ * with the extended calls (e.g. sctp_recvmsg) to keep track of which
+ * association each receive belongs to.
+ *
+ * This option takes a boolean value. A non-zero value indicates that
+ * fragmented interleave is on. A value of zero indicates that
+ * fragmented interleave is off.
+ *
+ * Note that it is important that an implementation that allows this
+ * option to be turned on, have it off by default. Otherwise an unaware
+ * application using the one to many model may become confused and act
+ * incorrectly.
+ */
+static int sctp_setsockopt_fragment_interleave(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->frag_interleave = (val == 0) ? 0 : 1;
+
+ return 0;
+}
+
+/*
+ * 8.1.21. Set or Get the SCTP Partial Delivery Point
+ * (SCTP_PARTIAL_DELIVERY_POINT)
+ *
+ * This option will set or get the SCTP partial delivery point. This
+ * point is the size of a message where the partial delivery API will be
+ * invoked to help free up rwnd space for the peer. Setting this to a
+ * lower value will cause partial deliveries to happen more often. The
+ * calls argument is an integer that sets or gets the partial delivery
+ * point. Note also that the call will fail if the user attempts to set
+ * this value larger than the socket receive buffer size.
+ *
+ * Note that any single message having a length smaller than or equal to
+ * the SCTP partial delivery point will be delivered in one single read
+ * call as long as the user provided buffer is large enough to hold the
+ * message.
+ */
+static int sctp_setsockopt_partial_delivery_point(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ u32 val;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ /* Note: We double the receive buffer from what the user sets
+ * it to be, also initial rwnd is based on rcvbuf/2.
+ */
+ if (val > (sk->sk_rcvbuf >> 1))
+ return -EINVAL;
+
+ sctp_sk(sk)->pd_point = val;
+
+ return 0; /* is this the right error code? */
+}
+
+/*
+ * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST)
+ *
+ * This option will allow a user to change the maximum burst of packets
+ * that can be emitted by this association. Note that the default value
+ * is 4, and some implementations may restrict this setting so that it
+ * can only be lowered.
+ *
+ * NOTE: This text doesn't seem right. Do this on a socket basis with
+ * future associations inheriting the socket value.
+ */
+static int sctp_setsockopt_maxburst(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_sock *sp;
+ struct sctp_association *asoc;
+ int val;
+ int assoc_id = 0;
+
+ if (optlen == sizeof(int)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of int in max_burst socket option deprecated.\n"
+ "Use struct sctp_assoc_value instead\n",
+ current->comm, task_pid_nr(current));
+ if (copy_from_user(&val, optval, optlen))
+ return -EFAULT;
+ } else if (optlen == sizeof(struct sctp_assoc_value)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ val = params.assoc_value;
+ assoc_id = params.assoc_id;
+ } else
+ return -EINVAL;
+
+ sp = sctp_sk(sk);
+
+ if (assoc_id != 0) {
+ asoc = sctp_id2assoc(sk, assoc_id);
+ if (!asoc)
+ return -EINVAL;
+ asoc->max_burst = val;
+ } else
+ sp->max_burst = val;
+
+ return 0;
+}
+
+/*
+ * 7.1.18. Add a chunk that must be authenticated (SCTP_AUTH_CHUNK)
+ *
+ * This set option adds a chunk type that the user is requesting to be
+ * received only in an authenticated way. Changes to the list of chunks
+ * will only effect future associations on the socket.
+ */
+static int sctp_setsockopt_auth_chunk(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authchunk val;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (optlen != sizeof(struct sctp_authchunk))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, optlen))
+ return -EFAULT;
+
+ switch (val.sauth_chunk) {
+ case SCTP_CID_INIT:
+ case SCTP_CID_INIT_ACK:
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ case SCTP_CID_AUTH:
+ return -EINVAL;
+ }
+
+ /* add this chunk id to the endpoint */
+ return sctp_auth_ep_add_chunkid(ep, val.sauth_chunk);
+}
+
+/*
+ * 7.1.19. Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT)
+ *
+ * This option gets or sets the list of HMAC algorithms that the local
+ * endpoint requires the peer to use.
+ */
+static int sctp_setsockopt_hmac_ident(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_hmacalgo *hmacs;
+ u32 idents;
+ int err;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (optlen < sizeof(struct sctp_hmacalgo))
+ return -EINVAL;
+
+ hmacs = memdup_user(optval, optlen);
+ if (IS_ERR(hmacs))
+ return PTR_ERR(hmacs);
+
+ idents = hmacs->shmac_num_idents;
+ if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS ||
+ (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = sctp_auth_ep_set_hmacs(ep, hmacs);
+out:
+ kfree(hmacs);
+ return err;
+}
+
+/*
+ * 7.1.20. Set a shared key (SCTP_AUTH_KEY)
+ *
+ * This option will set a shared secret key which is used to build an
+ * association shared key.
+ */
+static int sctp_setsockopt_auth_key(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authkey *authkey;
+ struct sctp_association *asoc;
+ int ret;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (optlen <= sizeof(struct sctp_authkey))
+ return -EINVAL;
+
+ authkey = memdup_user(optval, optlen);
+ if (IS_ERR(authkey))
+ return PTR_ERR(authkey);
+
+ if (authkey->sca_keylength > optlen - sizeof(struct sctp_authkey)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, authkey->sca_assoc_id);
+ if (!asoc && authkey->sca_assoc_id && sctp_style(sk, UDP)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sctp_auth_set_key(ep, asoc, authkey);
+out:
+ kzfree(authkey);
+ return ret;
+}
+
+/*
+ * 7.1.21. Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY)
+ *
+ * This option will get or set the active shared key to be used to build
+ * the association shared key.
+ */
+static int sctp_setsockopt_active_key(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authkeyid val;
+ struct sctp_association *asoc;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (optlen != sizeof(struct sctp_authkeyid))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, optlen))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, val.scact_assoc_id);
+ if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ return sctp_auth_set_active_key(ep, asoc, val.scact_keynumber);
+}
+
+/*
+ * 7.1.22. Delete a shared key (SCTP_AUTH_DELETE_KEY)
+ *
+ * This set option will delete a shared secret key from use.
+ */
+static int sctp_setsockopt_del_key(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authkeyid val;
+ struct sctp_association *asoc;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (optlen != sizeof(struct sctp_authkeyid))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, optlen))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, val.scact_assoc_id);
+ if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ return sctp_auth_del_key_id(ep, asoc, val.scact_keynumber);
+
+}
+
+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ *
+ * This option will enable or disable the use of the automatic generation of
+ * ASCONF chunks to add and delete addresses to an existing association. Note
+ * that this option has two caveats namely: a) it only affects sockets that
+ * are bound to all addresses available to the SCTP stack, and b) the system
+ * administrator may have an overriding control that turns the ASCONF feature
+ * off no matter what setting the socket option may have.
+ * This option expects an integer boolean flag, where a non-zero value turns on
+ * the option, and a zero value turns off the option.
+ * Note. In this implementation, socket operation overrides default parameter
+ * being set by sysctl as well as FreeBSD implementation
+ */
+static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+ if (!sctp_is_ep_boundall(sk) && val)
+ return -EINVAL;
+ if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
+ return 0;
+
+ spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
+ if (val == 0 && sp->do_auto_asconf) {
+ list_del(&sp->auto_asconf_list);
+ sp->do_auto_asconf = 0;
+ } else if (val && !sp->do_auto_asconf) {
+ list_add_tail(&sp->auto_asconf_list,
+ &sock_net(sk)->sctp.auto_asconf_splist);
+ sp->do_auto_asconf = 1;
+ }
+ spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
+ return 0;
+}
+
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association. See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_paddrthlds val;
+ struct sctp_transport *trans;
+ struct sctp_association *asoc;
+
+ if (optlen < sizeof(struct sctp_paddrthlds))
+ return -EINVAL;
+ if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+ sizeof(struct sctp_paddrthlds)))
+ return -EFAULT;
+
+
+ if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+ asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+ if (!asoc)
+ return -ENOENT;
+ list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+ transports) {
+ if (val.spt_pathmaxrxt)
+ trans->pathmaxrxt = val.spt_pathmaxrxt;
+ trans->pf_retrans = val.spt_pathpfthld;
+ }
+
+ if (val.spt_pathmaxrxt)
+ asoc->pathmaxrxt = val.spt_pathmaxrxt;
+ asoc->pf_retrans = val.spt_pathpfthld;
+ } else {
+ trans = sctp_addr_id2transport(sk, &val.spt_address,
+ val.spt_assoc_id);
+ if (!trans)
+ return -ENOENT;
+
+ if (val.spt_pathmaxrxt)
+ trans->pathmaxrxt = val.spt_pathmaxrxt;
+ trans->pf_retrans = val.spt_pathpfthld;
+ }
+
+ return 0;
+}
+
+static int sctp_setsockopt_recvrcvinfo(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->recvrcvinfo = (val == 0) ? 0 : 1;
+
+ return 0;
+}
+
+static int sctp_setsockopt_recvnxtinfo(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(val, (int __user *) optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->recvnxtinfo = (val == 0) ? 0 : 1;
+
+ return 0;
+}
+
+/* API 6.2 setsockopt(), getsockopt()
+ *
+ * Applications use setsockopt() and getsockopt() to set or retrieve
+ * socket options. Socket options are used to change the default
+ * behavior of sockets calls. They are described in Section 7.
+ *
+ * The syntax is:
+ *
+ * ret = getsockopt(int sd, int level, int optname, void __user *optval,
+ * int __user *optlen);
+ * ret = setsockopt(int sd, int level, int optname, const void __user *optval,
+ * int optlen);
+ *
+ * sd - the socket descript.
+ * level - set to IPPROTO_SCTP for all SCTP options.
+ * optname - the option name.
+ * optval - the buffer to store the value of the option.
+ * optlen - the size of the buffer.
+ */
+static int sctp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int retval = 0;
+
+ pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname);
+
+ /* I can hardly begin to describe how wrong this is. This is
+ * so broken as to be worse than useless. The API draft
+ * REALLY is NOT helpful here... I am not convinced that the
+ * semantics of setsockopt() with a level OTHER THAN SOL_SCTP
+ * are at all well-founded.
+ */
+ if (level != SOL_SCTP) {
+ struct sctp_af *af = sctp_sk(sk)->pf->af;
+ retval = af->setsockopt(sk, level, optname, optval, optlen);
+ goto out_nounlock;
+ }
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SCTP_SOCKOPT_BINDX_ADD:
+ /* 'optlen' is the size of the addresses buffer. */
+ retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval,
+ optlen, SCTP_BINDX_ADD_ADDR);
+ break;
+
+ case SCTP_SOCKOPT_BINDX_REM:
+ /* 'optlen' is the size of the addresses buffer. */
+ retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval,
+ optlen, SCTP_BINDX_REM_ADDR);
+ break;
+
+ case SCTP_SOCKOPT_CONNECTX_OLD:
+ /* 'optlen' is the size of the addresses buffer. */
+ retval = sctp_setsockopt_connectx_old(sk,
+ (struct sockaddr __user *)optval,
+ optlen);
+ break;
+
+ case SCTP_SOCKOPT_CONNECTX:
+ /* 'optlen' is the size of the addresses buffer. */
+ retval = sctp_setsockopt_connectx(sk,
+ (struct sockaddr __user *)optval,
+ optlen);
+ break;
+
+ case SCTP_DISABLE_FRAGMENTS:
+ retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
+ break;
+
+ case SCTP_EVENTS:
+ retval = sctp_setsockopt_events(sk, optval, optlen);
+ break;
+
+ case SCTP_AUTOCLOSE:
+ retval = sctp_setsockopt_autoclose(sk, optval, optlen);
+ break;
+
+ case SCTP_PEER_ADDR_PARAMS:
+ retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
+ break;
+
+ case SCTP_DELAYED_SACK:
+ retval = sctp_setsockopt_delayed_ack(sk, optval, optlen);
+ break;
+ case SCTP_PARTIAL_DELIVERY_POINT:
+ retval = sctp_setsockopt_partial_delivery_point(sk, optval, optlen);
+ break;
+
+ case SCTP_INITMSG:
+ retval = sctp_setsockopt_initmsg(sk, optval, optlen);
+ break;
+ case SCTP_DEFAULT_SEND_PARAM:
+ retval = sctp_setsockopt_default_send_param(sk, optval,
+ optlen);
+ break;
+ case SCTP_DEFAULT_SNDINFO:
+ retval = sctp_setsockopt_default_sndinfo(sk, optval, optlen);
+ break;
+ case SCTP_PRIMARY_ADDR:
+ retval = sctp_setsockopt_primary_addr(sk, optval, optlen);
+ break;
+ case SCTP_SET_PEER_PRIMARY_ADDR:
+ retval = sctp_setsockopt_peer_primary_addr(sk, optval, optlen);
+ break;
+ case SCTP_NODELAY:
+ retval = sctp_setsockopt_nodelay(sk, optval, optlen);
+ break;
+ case SCTP_RTOINFO:
+ retval = sctp_setsockopt_rtoinfo(sk, optval, optlen);
+ break;
+ case SCTP_ASSOCINFO:
+ retval = sctp_setsockopt_associnfo(sk, optval, optlen);
+ break;
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
+ retval = sctp_setsockopt_mappedv4(sk, optval, optlen);
+ break;
+ case SCTP_MAXSEG:
+ retval = sctp_setsockopt_maxseg(sk, optval, optlen);
+ break;
+ case SCTP_ADAPTATION_LAYER:
+ retval = sctp_setsockopt_adaptation_layer(sk, optval, optlen);
+ break;
+ case SCTP_CONTEXT:
+ retval = sctp_setsockopt_context(sk, optval, optlen);
+ break;
+ case SCTP_FRAGMENT_INTERLEAVE:
+ retval = sctp_setsockopt_fragment_interleave(sk, optval, optlen);
+ break;
+ case SCTP_MAX_BURST:
+ retval = sctp_setsockopt_maxburst(sk, optval, optlen);
+ break;
+ case SCTP_AUTH_CHUNK:
+ retval = sctp_setsockopt_auth_chunk(sk, optval, optlen);
+ break;
+ case SCTP_HMAC_IDENT:
+ retval = sctp_setsockopt_hmac_ident(sk, optval, optlen);
+ break;
+ case SCTP_AUTH_KEY:
+ retval = sctp_setsockopt_auth_key(sk, optval, optlen);
+ break;
+ case SCTP_AUTH_ACTIVE_KEY:
+ retval = sctp_setsockopt_active_key(sk, optval, optlen);
+ break;
+ case SCTP_AUTH_DELETE_KEY:
+ retval = sctp_setsockopt_del_key(sk, optval, optlen);
+ break;
+ case SCTP_AUTO_ASCONF:
+ retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
+ break;
+ case SCTP_PEER_ADDR_THLDS:
+ retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+ break;
+ case SCTP_RECVRCVINFO:
+ retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen);
+ break;
+ case SCTP_RECVNXTINFO:
+ retval = sctp_setsockopt_recvnxtinfo(sk, optval, optlen);
+ break;
+ default:
+ retval = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+
+out_nounlock:
+ return retval;
+}
+
+/* API 3.1.6 connect() - UDP Style Syntax
+ *
+ * An application may use the connect() call in the UDP model to initiate an
+ * association without sending data.
+ *
+ * The syntax is:
+ *
+ * ret = connect(int sd, const struct sockaddr *nam, socklen_t len);
+ *
+ * sd: the socket descriptor to have a new association added to.
+ *
+ * nam: the address structure (either struct sockaddr_in or struct
+ * sockaddr_in6 defined in RFC2553 [7]).
+ *
+ * len: the size of the address.
+ */
+static int sctp_connect(struct sock *sk, struct sockaddr *addr,
+ int addr_len)
+{
+ int err = 0;
+ struct sctp_af *af;
+
+ lock_sock(sk);
+
+ pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk,
+ addr, addr_len);
+
+ /* Validate addr_len before calling common connect/connectx routine. */
+ af = sctp_get_af_specific(addr->sa_family);
+ if (!af || addr_len < af->sockaddr_len) {
+ err = -EINVAL;
+ } else {
+ /* Pass correct addr len to common routine (so it knows there
+ * is only one address being passed.
+ */
+ err = __sctp_connect(sk, addr, af->sockaddr_len, NULL);
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+/* FIXME: Write comments. */
+static int sctp_disconnect(struct sock *sk, int flags)
+{
+ return -EOPNOTSUPP; /* STUB */
+}
+
+/* 4.1.4 accept() - TCP Style Syntax
+ *
+ * Applications use accept() call to remove an established SCTP
+ * association from the accept queue of the endpoint. A new socket
+ * descriptor will be returned from accept() to represent the newly
+ * formed association.
+ */
+static struct sock *sctp_accept(struct sock *sk, int flags, int *err)
+{
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sock *newsk = NULL;
+ struct sctp_association *asoc;
+ long timeo;
+ int error = 0;
+
+ lock_sock(sk);
+
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ if (!sctp_style(sk, TCP)) {
+ error = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!sctp_sstate(sk, LISTENING)) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ error = sctp_wait_for_accept(sk, timeo);
+ if (error)
+ goto out;
+
+ /* We treat the list of associations on the endpoint as the accept
+ * queue and pick the first association on the list.
+ */
+ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
+
+ newsk = sp->pf->create_accept_sk(sk, asoc);
+ if (!newsk) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ /* Populate the fields of the newsk from the oldsk and migrate the
+ * asoc to the newsk.
+ */
+ sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP);
+
+out:
+ release_sock(sk);
+ *err = error;
+ return newsk;
+}
+
+/* The SCTP ioctl handler. */
+static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ int rc = -ENOTCONN;
+
+ lock_sock(sk);
+
+ /*
+ * SEQPACKET-style sockets in LISTENING state are valid, for
+ * SCTP, so only discard TCP-style sockets in LISTENING state.
+ */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+ goto out;
+
+ switch (cmd) {
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ unsigned int amount = 0;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount of this packet since
+ * that is all that will be read.
+ */
+ amount = skb->len;
+ }
+ rc = put_user(amount, (int __user *)arg);
+ break;
+ }
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* This is the function which gets called during socket creation to
+ * initialized the SCTP-specific portion of the sock.
+ * The sock structure should already be zero-filled memory.
+ */
+static int sctp_init_sock(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_sock *sp;
+
+ pr_debug("%s: sk:%p\n", __func__, sk);
+
+ sp = sctp_sk(sk);
+
+ /* Initialize the SCTP per socket area. */
+ switch (sk->sk_type) {
+ case SOCK_SEQPACKET:
+ sp->type = SCTP_SOCKET_UDP;
+ break;
+ case SOCK_STREAM:
+ sp->type = SCTP_SOCKET_TCP;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ /* Initialize default send parameters. These parameters can be
+ * modified with the SCTP_DEFAULT_SEND_PARAM socket option.
+ */
+ sp->default_stream = 0;
+ sp->default_ppid = 0;
+ sp->default_flags = 0;
+ sp->default_context = 0;
+ sp->default_timetolive = 0;
+
+ sp->default_rcv_context = 0;
+ sp->max_burst = net->sctp.max_burst;
+
+ sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg;
+
+ /* Initialize default setup parameters. These parameters
+ * can be modified with the SCTP_INITMSG socket option or
+ * overridden by the SCTP_INIT CMSG.
+ */
+ sp->initmsg.sinit_num_ostreams = sctp_max_outstreams;
+ sp->initmsg.sinit_max_instreams = sctp_max_instreams;
+ sp->initmsg.sinit_max_attempts = net->sctp.max_retrans_init;
+ sp->initmsg.sinit_max_init_timeo = net->sctp.rto_max;
+
+ /* Initialize default RTO related parameters. These parameters can
+ * be modified for with the SCTP_RTOINFO socket option.
+ */
+ sp->rtoinfo.srto_initial = net->sctp.rto_initial;
+ sp->rtoinfo.srto_max = net->sctp.rto_max;
+ sp->rtoinfo.srto_min = net->sctp.rto_min;
+
+ /* Initialize default association related parameters. These parameters
+ * can be modified with the SCTP_ASSOCINFO socket option.
+ */
+ sp->assocparams.sasoc_asocmaxrxt = net->sctp.max_retrans_association;
+ sp->assocparams.sasoc_number_peer_destinations = 0;
+ sp->assocparams.sasoc_peer_rwnd = 0;
+ sp->assocparams.sasoc_local_rwnd = 0;
+ sp->assocparams.sasoc_cookie_life = net->sctp.valid_cookie_life;
+
+ /* Initialize default event subscriptions. By default, all the
+ * options are off.
+ */
+ memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe));
+
+ /* Default Peer Address Parameters. These defaults can
+ * be modified via SCTP_PEER_ADDR_PARAMS
+ */
+ sp->hbinterval = net->sctp.hb_interval;
+ sp->pathmaxrxt = net->sctp.max_retrans_path;
+ sp->pathmtu = 0; /* allow default discovery */
+ sp->sackdelay = net->sctp.sack_timeout;
+ sp->sackfreq = 2;
+ sp->param_flags = SPP_HB_ENABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
+
+ /* If enabled no SCTP message fragmentation will be performed.
+ * Configure through SCTP_DISABLE_FRAGMENTS socket option.
+ */
+ sp->disable_fragments = 0;
+
+ /* Enable Nagle algorithm by default. */
+ sp->nodelay = 0;
+
+ sp->recvrcvinfo = 0;
+ sp->recvnxtinfo = 0;
+
+ /* Enable by default. */
+ sp->v4mapped = 1;
+
+ /* Auto-close idle associations after the configured
+ * number of seconds. A value of 0 disables this
+ * feature. Configure through the SCTP_AUTOCLOSE socket option,
+ * for UDP-style sockets only.
+ */
+ sp->autoclose = 0;
+
+ /* User specified fragmentation limit. */
+ sp->user_frag = 0;
+
+ sp->adaptation_ind = 0;
+
+ sp->pf = sctp_get_pf_specific(sk->sk_family);
+
+ /* Control variables for partial data delivery. */
+ atomic_set(&sp->pd_mode, 0);
+ skb_queue_head_init(&sp->pd_lobby);
+ sp->frag_interleave = 0;
+
+ /* Create a per socket endpoint structure. Even if we
+ * change the data structure relationships, this may still
+ * be useful for storing pre-connect address information.
+ */
+ sp->ep = sctp_endpoint_new(sk, GFP_KERNEL);
+ if (!sp->ep)
+ return -ENOMEM;
+
+ sp->hmac = NULL;
+
+ sk->sk_destruct = sctp_destruct_sock;
+
+ SCTP_DBG_OBJCNT_INC(sock);
+
+ local_bh_disable();
+ percpu_counter_inc(&sctp_sockets_allocated);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
+
+ /* Nothing can fail after this block, otherwise
+ * sctp_destroy_sock() will be called without addr_wq_lock held
+ */
+ if (net->sctp.default_auto_asconf) {
+ spin_lock(&sock_net(sk)->sctp.addr_wq_lock);
+ list_add_tail(&sp->auto_asconf_list,
+ &net->sctp.auto_asconf_splist);
+ sp->do_auto_asconf = 1;
+ spin_unlock(&sock_net(sk)->sctp.addr_wq_lock);
+ } else {
+ sp->do_auto_asconf = 0;
+ }
+
+ local_bh_enable();
+
+ return 0;
+}
+
+/* Cleanup any SCTP per socket resources. Must be called with
+ * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true
+ */
+static void sctp_destroy_sock(struct sock *sk)
+{
+ struct sctp_sock *sp;
+
+ pr_debug("%s: sk:%p\n", __func__, sk);
+
+ /* Release our hold on the endpoint. */
+ sp = sctp_sk(sk);
+ /* This could happen during socket init, thus we bail out
+ * early, since the rest of the below is not setup either.
+ */
+ if (sp->ep == NULL)
+ return;
+
+ if (sp->do_auto_asconf) {
+ sp->do_auto_asconf = 0;
+ list_del(&sp->auto_asconf_list);
+ }
+ sctp_endpoint_free(sp->ep);
+ local_bh_disable();
+ percpu_counter_dec(&sctp_sockets_allocated);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ local_bh_enable();
+}
+
+/* Triggered when there are no references on the socket anymore */
+static void sctp_destruct_sock(struct sock *sk)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ /* Free up the HMAC transform. */
+ crypto_free_hash(sp->hmac);
+
+ inet_sock_destruct(sk);
+}
+
+/* API 4.1.7 shutdown() - TCP Style Syntax
+ * int shutdown(int socket, int how);
+ *
+ * sd - the socket descriptor of the association to be closed.
+ * how - Specifies the type of shutdown. The values are
+ * as follows:
+ * SHUT_RD
+ * Disables further receive operations. No SCTP
+ * protocol action is taken.
+ * SHUT_WR
+ * Disables further send operations, and initiates
+ * the SCTP shutdown sequence.
+ * SHUT_RDWR
+ * Disables further send and receive operations
+ * and initiates the SCTP shutdown sequence.
+ */
+static void sctp_shutdown(struct sock *sk, int how)
+{
+ struct net *net = sock_net(sk);
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc;
+
+ if (!sctp_style(sk, TCP))
+ return;
+
+ if (how & SEND_SHUTDOWN) {
+ ep = sctp_sk(sk)->ep;
+ if (!list_empty(&ep->asocs)) {
+ asoc = list_entry(ep->asocs.next,
+ struct sctp_association, asocs);
+ sctp_primitive_SHUTDOWN(net, asoc, NULL);
+ }
+ }
+}
+
+/* 7.2.1 Association Status (SCTP_STATUS)
+
+ * Applications can retrieve current status information about an
+ * association, including association state, peer receiver window size,
+ * number of unacked data chunks, and number of data chunks pending
+ * receipt. This information is read-only.
+ */
+static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_status status;
+ struct sctp_association *asoc = NULL;
+ struct sctp_transport *transport;
+ sctp_assoc_t associd;
+ int retval = 0;
+
+ if (len < sizeof(status)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(status);
+ if (copy_from_user(&status, optval, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ associd = status.sstat_assoc_id;
+ asoc = sctp_id2assoc(sk, associd);
+ if (!asoc) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ transport = asoc->peer.primary_path;
+
+ status.sstat_assoc_id = sctp_assoc2id(asoc);
+ status.sstat_state = sctp_assoc_to_state(asoc);
+ status.sstat_rwnd = asoc->peer.rwnd;
+ status.sstat_unackdata = asoc->unack_data;
+
+ status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+ status.sstat_instrms = asoc->c.sinit_max_instreams;
+ status.sstat_outstrms = asoc->c.sinit_num_ostreams;
+ status.sstat_fragmentation_point = asoc->frag_point;
+ status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
+ memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
+ transport->af_specific->sockaddr_len);
+ /* Map ipv4 address into v4-mapped-on-v6 address. */
+ sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
+ (union sctp_addr *)&status.sstat_primary.spinfo_address);
+ status.sstat_primary.spinfo_state = transport->state;
+ status.sstat_primary.spinfo_cwnd = transport->cwnd;
+ status.sstat_primary.spinfo_srtt = transport->srtt;
+ status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
+ status.sstat_primary.spinfo_mtu = transport->pathmtu;
+
+ if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
+ status.sstat_primary.spinfo_state = SCTP_ACTIVE;
+
+ if (put_user(len, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ pr_debug("%s: len:%d, state:%d, rwnd:%d, assoc_id:%d\n",
+ __func__, len, status.sstat_state, status.sstat_rwnd,
+ status.sstat_assoc_id);
+
+ if (copy_to_user(optval, &status, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+out:
+ return retval;
+}
+
+
+/* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO)
+ *
+ * Applications can retrieve information about a specific peer address
+ * of an association, including its reachability state, congestion
+ * window, and retransmission timer values. This information is
+ * read-only.
+ */
+static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_paddrinfo pinfo;
+ struct sctp_transport *transport;
+ int retval = 0;
+
+ if (len < sizeof(pinfo)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(pinfo);
+ if (copy_from_user(&pinfo, optval, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address,
+ pinfo.spinfo_assoc_id);
+ if (!transport)
+ return -EINVAL;
+
+ pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
+ pinfo.spinfo_state = transport->state;
+ pinfo.spinfo_cwnd = transport->cwnd;
+ pinfo.spinfo_srtt = transport->srtt;
+ pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
+ pinfo.spinfo_mtu = transport->pathmtu;
+
+ if (pinfo.spinfo_state == SCTP_UNKNOWN)
+ pinfo.spinfo_state = SCTP_ACTIVE;
+
+ if (put_user(len, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, &pinfo, len)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+out:
+ return retval;
+}
+
+/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
+ *
+ * This option is a on/off flag. If enabled no SCTP message
+ * fragmentation will be performed. Instead if a message being sent
+ * exceeds the current PMTU size, the message will NOT be sent and
+ * instead a error will be indicated to the user.
+ */
+static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ int val;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ val = (sctp_sk(sk)->disable_fragments == 1);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+/* 7.1.15 Set notification and ancillary events (SCTP_EVENTS)
+ *
+ * This socket option is used to specify various notifications and
+ * ancillary data the user wishes to receive.
+ */
+static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
+ int __user *optlen)
+{
+ if (len <= 0)
+ return -EINVAL;
+ if (len > sizeof(struct sctp_event_subscribe))
+ len = sizeof(struct sctp_event_subscribe);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len))
+ return -EFAULT;
+ return 0;
+}
+
+/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
+ *
+ * This socket option is applicable to the UDP-style socket only. When
+ * set it will cause associations that are idle for more than the
+ * specified number of seconds to automatically close. An association
+ * being idle is defined an association that has NOT sent or received
+ * user data. The special value of '0' indicates that no automatic
+ * close of any associations should be performed. The option expects an
+ * integer defining the number of seconds of idle time before an
+ * association is closed.
+ */
+static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+ /* Applicable to UDP-style socket only */
+ if (sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int)))
+ return -EFAULT;
+ return 0;
+}
+
+/* Helper routine to branch off an association to a new socket. */
+int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
+{
+ struct sctp_association *asoc = sctp_id2assoc(sk, id);
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct socket *sock;
+ int err = 0;
+
+ if (!asoc)
+ return -EINVAL;
+
+ /* An association cannot be branched off from an already peeled-off
+ * socket, nor is this supported for tcp style sockets.
+ */
+ if (!sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* Create a new socket. */
+ err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
+ if (err < 0)
+ return err;
+
+ sctp_copy_sock(sock->sk, sk, asoc);
+
+ /* Make peeled-off sockets more like 1-1 accepted sockets.
+ * Set the daddr and initialize id to something more random
+ */
+ sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sk);
+
+ /* Populate the fields of the newsk from the oldsk and migrate the
+ * asoc to the newsk.
+ */
+ sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH);
+
+ *sockp = sock;
+
+ return err;
+}
+EXPORT_SYMBOL(sctp_do_peeloff);
+
+static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+ sctp_peeloff_arg_t peeloff;
+ struct socket *newsock;
+ struct file *newfile;
+ int retval = 0;
+
+ if (len < sizeof(sctp_peeloff_arg_t))
+ return -EINVAL;
+ len = sizeof(sctp_peeloff_arg_t);
+ if (copy_from_user(&peeloff, optval, len))
+ return -EFAULT;
+
+ retval = sctp_do_peeloff(sk, peeloff.associd, &newsock);
+ if (retval < 0)
+ goto out;
+
+ /* Map the socket to an unused fd that can be returned to the user. */
+ retval = get_unused_fd_flags(0);
+ if (retval < 0) {
+ sock_release(newsock);
+ goto out;
+ }
+
+ newfile = sock_alloc_file(newsock, 0, NULL);
+ if (unlikely(IS_ERR(newfile))) {
+ put_unused_fd(retval);
+ sock_release(newsock);
+ return PTR_ERR(newfile);
+ }
+
+ pr_debug("%s: sk:%p, newsk:%p, sd:%d\n", __func__, sk, newsock->sk,
+ retval);
+
+ /* Return the fd mapped to the new socket. */
+ if (put_user(len, optlen)) {
+ fput(newfile);
+ put_unused_fd(retval);
+ return -EFAULT;
+ }
+ peeloff.sd = retval;
+ if (copy_to_user(optval, &peeloff, len)) {
+ fput(newfile);
+ put_unused_fd(retval);
+ return -EFAULT;
+ }
+ fd_install(retval, newfile);
+out:
+ return retval;
+}
+
+/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
+ *
+ * Applications can enable or disable heartbeats for any peer address of
+ * an association, modify an address's heartbeat interval, force a
+ * heartbeat to be sent immediately, and adjust the address's maximum
+ * number of retransmissions sent before an address is considered
+ * unreachable. The following structure is used to access and modify an
+ * address's parameters:
+ *
+ * struct sctp_paddrparams {
+ * sctp_assoc_t spp_assoc_id;
+ * struct sockaddr_storage spp_address;
+ * uint32_t spp_hbinterval;
+ * uint16_t spp_pathmaxrxt;
+ * uint32_t spp_pathmtu;
+ * uint32_t spp_sackdelay;
+ * uint32_t spp_flags;
+ * };
+ *
+ * spp_assoc_id - (one-to-many style socket) This is filled in the
+ * application, and identifies the association for
+ * this query.
+ * spp_address - This specifies which address is of interest.
+ * spp_hbinterval - This contains the value of the heartbeat interval,
+ * in milliseconds. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmaxrxt - This contains the maximum number of
+ * retransmissions before this address shall be
+ * considered unreachable. If a value of zero
+ * is present in this field then no changes are to
+ * be made to this parameter.
+ * spp_pathmtu - When Path MTU discovery is disabled the value
+ * specified here will be the "fixed" path mtu.
+ * Note that if the spp_address field is empty
+ * then all associations on this address will
+ * have this fixed path mtu set upon them.
+ *
+ * spp_sackdelay - When delayed sack is enabled, this value specifies
+ * the number of milliseconds that sacks will be delayed
+ * for. This value will apply to all addresses of an
+ * association if the spp_address field is empty. Note
+ * also, that if delayed sack is enabled and this
+ * value is set to 0, no change is made to the last
+ * recorded delayed sack timer value.
+ *
+ * spp_flags - These flags are used to control various features
+ * on an association. The flag field may contain
+ * zero or more of the following options.
+ *
+ * SPP_HB_ENABLE - Enable heartbeats on the
+ * specified address. Note that if the address
+ * field is empty all addresses for the association
+ * have heartbeats enabled upon them.
+ *
+ * SPP_HB_DISABLE - Disable heartbeats on the
+ * speicifed address. Note that if the address
+ * field is empty all addresses for the association
+ * will have their heartbeats disabled. Note also
+ * that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ * mutually exclusive, only one of these two should
+ * be specified. Enabling both fields will have
+ * undetermined results.
+ *
+ * SPP_HB_DEMAND - Request a user initiated heartbeat
+ * to be made immediately.
+ *
+ * SPP_PMTUD_ENABLE - This field will enable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected.
+ *
+ * SPP_PMTUD_DISABLE - This field will disable PMTU
+ * discovery upon the specified address. Note that
+ * if the address feild is empty then all addresses
+ * on the association are effected. Not also that
+ * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ * exclusive. Enabling both will have undetermined
+ * results.
+ *
+ * SPP_SACKDELAY_ENABLE - Setting this flag turns
+ * on delayed sack. The time specified in spp_sackdelay
+ * is used to specify the sack delay for this address. Note
+ * that if spp_address is empty then all addresses will
+ * enable delayed sack and take on the sack delay
+ * value specified in spp_sackdelay.
+ * SPP_SACKDELAY_DISABLE - Setting this flag turns
+ * off delayed sack. If the spp_address field is blank then
+ * delayed sack is disabled for the entire association. Note
+ * also that this field is mutually exclusive to
+ * SPP_SACKDELAY_ENABLE, setting both will have undefined
+ * results.
+ */
+static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_paddrparams params;
+ struct sctp_transport *trans = NULL;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len < sizeof(struct sctp_paddrparams))
+ return -EINVAL;
+ len = sizeof(struct sctp_paddrparams);
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+
+ /* If an address other than INADDR_ANY is specified, and
+ * no transport is found, then the request is invalid.
+ */
+ if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) {
+ trans = sctp_addr_id2transport(sk, &params.spp_address,
+ params.spp_assoc_id);
+ if (!trans) {
+ pr_debug("%s: failed no transport\n", __func__);
+ return -EINVAL;
+ }
+ }
+
+ /* Get association, if assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+ if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
+ pr_debug("%s: failed no association\n", __func__);
+ return -EINVAL;
+ }
+
+ if (trans) {
+ /* Fetch transport values. */
+ params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
+ params.spp_pathmtu = trans->pathmtu;
+ params.spp_pathmaxrxt = trans->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = trans->param_flags;
+ } else if (asoc) {
+ /* Fetch association values. */
+ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
+ params.spp_pathmtu = asoc->pathmtu;
+ params.spp_pathmaxrxt = asoc->pathmaxrxt;
+ params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay);
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = asoc->param_flags;
+ } else {
+ /* Fetch socket values. */
+ params.spp_hbinterval = sp->hbinterval;
+ params.spp_pathmtu = sp->pathmtu;
+ params.spp_sackdelay = sp->sackdelay;
+ params.spp_pathmaxrxt = sp->pathmaxrxt;
+
+ /*draft-11 doesn't say what to return in spp_flags*/
+ params.spp_flags = sp->param_flags;
+ }
+
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK)
+ *
+ * This option will effect the way delayed acks are performed. This
+ * option allows you to get or set the delayed ack time, in
+ * milliseconds. It also allows changing the delayed ack frequency.
+ * Changing the frequency to 1 disables the delayed sack algorithm. If
+ * the assoc_id is 0, then this sets or gets the endpoints default
+ * values. If the assoc_id field is non-zero, then the set or get
+ * effects the specified association for the one to many model (the
+ * assoc_id field is ignored by the one to one model). Note that if
+ * sack_delay or sack_freq are 0 when setting this option, then the
+ * current values will remain unchanged.
+ *
+ * struct sctp_sack_info {
+ * sctp_assoc_t sack_assoc_id;
+ * uint32_t sack_delay;
+ * uint32_t sack_freq;
+ * };
+ *
+ * sack_assoc_id - This parameter, indicates which association the user
+ * is performing an action upon. Note that if this field's value is
+ * zero then the endpoints default value is changed (effecting future
+ * associations only).
+ *
+ * sack_delay - This parameter contains the number of milliseconds that
+ * the user is requesting the delayed ACK timer be set to. Note that
+ * this value is defined in the standard to be between 200 and 500
+ * milliseconds.
+ *
+ * sack_freq - This parameter contains the number of packets that must
+ * be received before a sack is sent without waiting for the delay
+ * timer to expire. The default value for this is 2, setting this
+ * value to 1 will disable the delayed sack algorithm.
+ */
+static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_sack_info params;
+ struct sctp_association *asoc = NULL;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len >= sizeof(struct sctp_sack_info)) {
+ len = sizeof(struct sctp_sack_info);
+
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+ } else if (len == sizeof(struct sctp_assoc_value)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of struct sctp_assoc_value in delayed_ack socket option.\n"
+ "Use struct sctp_sack_info instead\n",
+ current->comm, task_pid_nr(current));
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+ } else
+ return -EINVAL;
+
+ /* Get association, if sack_assoc_id != 0 and the socket is a one
+ * to many style socket, and an association was not found, then
+ * the id was invalid.
+ */
+ asoc = sctp_id2assoc(sk, params.sack_assoc_id);
+ if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc) {
+ /* Fetch association values. */
+ if (asoc->param_flags & SPP_SACKDELAY_ENABLE) {
+ params.sack_delay = jiffies_to_msecs(
+ asoc->sackdelay);
+ params.sack_freq = asoc->sackfreq;
+
+ } else {
+ params.sack_delay = 0;
+ params.sack_freq = 1;
+ }
+ } else {
+ /* Fetch socket values. */
+ if (sp->param_flags & SPP_SACKDELAY_ENABLE) {
+ params.sack_delay = sp->sackdelay;
+ params.sack_freq = sp->sackfreq;
+ } else {
+ params.sack_delay = 0;
+ params.sack_freq = 1;
+ }
+ }
+
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
+ *
+ * Applications can specify protocol parameters for the default association
+ * initialization. The option name argument to setsockopt() and getsockopt()
+ * is SCTP_INITMSG.
+ *
+ * Setting initialization parameters is effective only on an unconnected
+ * socket (for UDP-style sockets only future associations are effected
+ * by the change). With TCP-style sockets, this option is inherited by
+ * sockets derived from a listener socket.
+ */
+static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+ if (len < sizeof(struct sctp_initmsg))
+ return -EINVAL;
+ len = sizeof(struct sctp_initmsg);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len))
+ return -EFAULT;
+ return 0;
+}
+
+
+static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_association *asoc;
+ int cnt = 0;
+ struct sctp_getaddrs getaddrs;
+ struct sctp_transport *from;
+ void __user *to;
+ union sctp_addr temp;
+ struct sctp_sock *sp = sctp_sk(sk);
+ int addrlen;
+ size_t space_left;
+ int bytes_copied;
+
+ if (len < sizeof(struct sctp_getaddrs))
+ return -EINVAL;
+
+ if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+ return -EFAULT;
+
+ /* For UDP-style sockets, id specifies the association to query. */
+ asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ to = optval + offsetof(struct sctp_getaddrs, addrs);
+ space_left = len - offsetof(struct sctp_getaddrs, addrs);
+
+ list_for_each_entry(from, &asoc->peer.transport_addr_list,
+ transports) {
+ memcpy(&temp, &from->ipaddr, sizeof(temp));
+ addrlen = sctp_get_pf_specific(sk->sk_family)
+ ->addr_to_user(sp, &temp);
+ if (space_left < addrlen)
+ return -ENOMEM;
+ if (copy_to_user(to, &temp, addrlen))
+ return -EFAULT;
+ to += addrlen;
+ cnt++;
+ space_left -= addrlen;
+ }
+
+ if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num))
+ return -EFAULT;
+ bytes_copied = ((char __user *)to) - optval;
+ if (put_user(bytes_copied, optlen))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
+ size_t space_left, int *bytes_copied)
+{
+ struct sctp_sockaddr_entry *addr;
+ union sctp_addr temp;
+ int cnt = 0;
+ int addrlen;
+ struct net *net = sock_net(sk);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) {
+ if (!addr->valid)
+ continue;
+
+ if ((PF_INET == sk->sk_family) &&
+ (AF_INET6 == addr->a.sa.sa_family))
+ continue;
+ if ((PF_INET6 == sk->sk_family) &&
+ inet_v6_ipv6only(sk) &&
+ (AF_INET == addr->a.sa.sa_family))
+ continue;
+ memcpy(&temp, &addr->a, sizeof(temp));
+ if (!temp.v4.sin_port)
+ temp.v4.sin_port = htons(port);
+
+ addrlen = sctp_get_pf_specific(sk->sk_family)
+ ->addr_to_user(sctp_sk(sk), &temp);
+
+ if (space_left < addrlen) {
+ cnt = -ENOMEM;
+ break;
+ }
+ memcpy(to, &temp, addrlen);
+
+ to += addrlen;
+ cnt++;
+ space_left -= addrlen;
+ *bytes_copied += addrlen;
+ }
+ rcu_read_unlock();
+
+ return cnt;
+}
+
+
+static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_bind_addr *bp;
+ struct sctp_association *asoc;
+ int cnt = 0;
+ struct sctp_getaddrs getaddrs;
+ struct sctp_sockaddr_entry *addr;
+ void __user *to;
+ union sctp_addr temp;
+ struct sctp_sock *sp = sctp_sk(sk);
+ int addrlen;
+ int err = 0;
+ size_t space_left;
+ int bytes_copied = 0;
+ void *addrs;
+ void *buf;
+
+ if (len < sizeof(struct sctp_getaddrs))
+ return -EINVAL;
+
+ if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+ return -EFAULT;
+
+ /*
+ * For UDP-style sockets, id specifies the association to query.
+ * If the id field is set to the value '0' then the locally bound
+ * addresses are returned without regard to any particular
+ * association.
+ */
+ if (0 == getaddrs.assoc_id) {
+ bp = &sctp_sk(sk)->ep->base.bind_addr;
+ } else {
+ asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+ bp = &asoc->base.bind_addr;
+ }
+
+ to = optval + offsetof(struct sctp_getaddrs, addrs);
+ space_left = len - offsetof(struct sctp_getaddrs, addrs);
+
+ addrs = kmalloc(space_left, GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
+ * addresses from the global local address list.
+ */
+ if (sctp_list_single_entry(&bp->address_list)) {
+ addr = list_entry(bp->address_list.next,
+ struct sctp_sockaddr_entry, list);
+ if (sctp_is_any(sk, &addr->a)) {
+ cnt = sctp_copy_laddrs(sk, bp->port, addrs,
+ space_left, &bytes_copied);
+ if (cnt < 0) {
+ err = cnt;
+ goto out;
+ }
+ goto copy_getaddrs;
+ }
+ }
+
+ buf = addrs;
+ /* Protection on the bound address list is not needed since
+ * in the socket option context we hold a socket lock and
+ * thus the bound address list can't change.
+ */
+ list_for_each_entry(addr, &bp->address_list, list) {
+ memcpy(&temp, &addr->a, sizeof(temp));
+ addrlen = sctp_get_pf_specific(sk->sk_family)
+ ->addr_to_user(sp, &temp);
+ if (space_left < addrlen) {
+ err = -ENOMEM; /*fixme: right error?*/
+ goto out;
+ }
+ memcpy(buf, &temp, addrlen);
+ buf += addrlen;
+ bytes_copied += addrlen;
+ cnt++;
+ space_left -= addrlen;
+ }
+
+copy_getaddrs:
+ if (copy_to_user(to, addrs, bytes_copied)) {
+ err = -EFAULT;
+ goto out;
+ }
+ if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) {
+ err = -EFAULT;
+ goto out;
+ }
+ if (put_user(bytes_copied, optlen))
+ err = -EFAULT;
+out:
+ kfree(addrs);
+ return err;
+}
+
+/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
+ *
+ * Requests that the local SCTP stack use the enclosed peer address as
+ * the association primary. The enclosed address must be one of the
+ * association peer's addresses.
+ */
+static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_prim prim;
+ struct sctp_association *asoc;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len < sizeof(struct sctp_prim))
+ return -EINVAL;
+
+ len = sizeof(struct sctp_prim);
+
+ if (copy_from_user(&prim, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, prim.ssp_assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ if (!asoc->peer.primary_path)
+ return -ENOTCONN;
+
+ memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr,
+ asoc->peer.primary_path->af_specific->sockaddr_len);
+
+ sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp,
+ (union sctp_addr *)&prim.ssp_addr);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &prim, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.11 Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER)
+ *
+ * Requests that the local endpoint set the specified Adaptation Layer
+ * Indication parameter for all future INIT and INIT-ACK exchanges.
+ */
+static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_setadaptation adaptation;
+
+ if (len < sizeof(struct sctp_setadaptation))
+ return -EINVAL;
+
+ len = sizeof(struct sctp_setadaptation);
+
+ adaptation.ssb_adaptation_ind = sctp_sk(sk)->adaptation_ind;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &adaptation, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ *
+ * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
+ *
+ * Applications that wish to use the sendto() system call may wish to
+ * specify a default set of parameters that would normally be supplied
+ * through the inclusion of ancillary data. This socket option allows
+ * such an application to set the default sctp_sndrcvinfo structure.
+
+
+ * The application that wishes to use this socket option simply passes
+ * in to this call the sctp_sndrcvinfo structure defined in Section
+ * 5.2.2) The input parameters accepted by this call include
+ * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context,
+ * sinfo_timetolive. The user must provide the sinfo_assoc_id field in
+ * to this call if the caller is using the UDP model.
+ *
+ * For getsockopt, it get the default sctp_sndrcvinfo structure.
+ */
+static int sctp_getsockopt_default_send_param(struct sock *sk,
+ int len, char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ struct sctp_sndrcvinfo info;
+
+ if (len < sizeof(info))
+ return -EINVAL;
+
+ len = sizeof(info);
+
+ if (copy_from_user(&info, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, info.sinfo_assoc_id);
+ if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+ if (asoc) {
+ info.sinfo_stream = asoc->default_stream;
+ info.sinfo_flags = asoc->default_flags;
+ info.sinfo_ppid = asoc->default_ppid;
+ info.sinfo_context = asoc->default_context;
+ info.sinfo_timetolive = asoc->default_timetolive;
+ } else {
+ info.sinfo_stream = sp->default_stream;
+ info.sinfo_flags = sp->default_flags;
+ info.sinfo_ppid = sp->default_ppid;
+ info.sinfo_context = sp->default_context;
+ info.sinfo_timetolive = sp->default_timetolive;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* RFC6458, Section 8.1.31. Set/get Default Send Parameters
+ * (SCTP_DEFAULT_SNDINFO)
+ */
+static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ struct sctp_sndinfo info;
+
+ if (len < sizeof(info))
+ return -EINVAL;
+
+ len = sizeof(info);
+
+ if (copy_from_user(&info, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, info.snd_assoc_id);
+ if (!asoc && info.snd_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+ if (asoc) {
+ info.snd_sid = asoc->default_stream;
+ info.snd_flags = asoc->default_flags;
+ info.snd_ppid = asoc->default_ppid;
+ info.snd_context = asoc->default_context;
+ } else {
+ info.snd_sid = sp->default_stream;
+ info.snd_flags = sp->default_flags;
+ info.snd_ppid = sp->default_ppid;
+ info.snd_context = sp->default_context;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &info, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ *
+ * 7.1.5 SCTP_NODELAY
+ *
+ * Turn on/off any Nagle-like algorithm. This means that packets are
+ * generally sent as soon as possible and no unnecessary delays are
+ * introduced, at the cost of more packets in the network. Expects an
+ * integer boolean flag.
+ */
+
+static int sctp_getsockopt_nodelay(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ int val;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ val = (sctp_sk(sk)->nodelay == 1);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ *
+ * 7.1.1 SCTP_RTOINFO
+ *
+ * The protocol parameters used to initialize and bound retransmission
+ * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
+ * and modify these parameters.
+ * All parameters are time values, in milliseconds. A value of 0, when
+ * modifying the parameters, indicates that the current value should not
+ * be changed.
+ *
+ */
+static int sctp_getsockopt_rtoinfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen) {
+ struct sctp_rtoinfo rtoinfo;
+ struct sctp_association *asoc;
+
+ if (len < sizeof (struct sctp_rtoinfo))
+ return -EINVAL;
+
+ len = sizeof(struct sctp_rtoinfo);
+
+ if (copy_from_user(&rtoinfo, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id);
+
+ if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* Values corresponding to the specific association. */
+ if (asoc) {
+ rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial);
+ rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max);
+ rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min);
+ } else {
+ /* Values corresponding to the endpoint. */
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ rtoinfo.srto_initial = sp->rtoinfo.srto_initial;
+ rtoinfo.srto_max = sp->rtoinfo.srto_max;
+ rtoinfo.srto_min = sp->rtoinfo.srto_min;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &rtoinfo, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ *
+ * 7.1.2 SCTP_ASSOCINFO
+ *
+ * This option is used to tune the maximum retransmission attempts
+ * of the association.
+ * Returns an error if the new association retransmission value is
+ * greater than the sum of the retransmission value of the peer.
+ * See [SCTP] for more information.
+ *
+ */
+static int sctp_getsockopt_associnfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+
+ struct sctp_assocparams assocparams;
+ struct sctp_association *asoc;
+ struct list_head *pos;
+ int cnt = 0;
+
+ if (len < sizeof (struct sctp_assocparams))
+ return -EINVAL;
+
+ len = sizeof(struct sctp_assocparams);
+
+ if (copy_from_user(&assocparams, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id);
+
+ if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ /* Values correspoinding to the specific association */
+ if (asoc) {
+ assocparams.sasoc_asocmaxrxt = asoc->max_retrans;
+ assocparams.sasoc_peer_rwnd = asoc->peer.rwnd;
+ assocparams.sasoc_local_rwnd = asoc->a_rwnd;
+ assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life);
+
+ list_for_each(pos, &asoc->peer.transport_addr_list) {
+ cnt++;
+ }
+
+ assocparams.sasoc_number_peer_destinations = cnt;
+ } else {
+ /* Values corresponding to the endpoint */
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ assocparams.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt;
+ assocparams.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd;
+ assocparams.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd;
+ assocparams.sasoc_cookie_life =
+ sp->assocparams.sasoc_cookie_life;
+ assocparams.sasoc_number_peer_destinations =
+ sp->assocparams.
+ sasoc_number_peer_destinations;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &assocparams, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
+ *
+ * This socket option is a boolean flag which turns on or off mapped V4
+ * addresses. If this option is turned on and the socket is type
+ * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
+ * If this option is turned off, then no mapping will be done of V4
+ * addresses and a user will receive both PF_INET6 and PF_INET type
+ * addresses on the socket.
+ */
+static int sctp_getsockopt_mappedv4(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ int val;
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ val = sp->v4mapped;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.29. Set or Get the default context (SCTP_CONTEXT)
+ * (chapter and verse is quoted at sctp_setsockopt_context())
+ */
+static int sctp_getsockopt_context(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_sock *sp;
+ struct sctp_association *asoc;
+
+ if (len < sizeof(struct sctp_assoc_value))
+ return -EINVAL;
+
+ len = sizeof(struct sctp_assoc_value);
+
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+
+ sp = sctp_sk(sk);
+
+ if (params.assoc_id != 0) {
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+ params.assoc_value = asoc->default_rcv_context;
+ } else {
+ params.assoc_value = sp->default_rcv_context;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
+ * This option will get or set the maximum size to put in any outgoing
+ * SCTP DATA chunk. If a message is larger than this size it will be
+ * fragmented by SCTP into the specified size. Note that the underlying
+ * SCTP implementation may fragment into smaller sized chunks when the
+ * PMTU of the underlying association is smaller than the value set by
+ * the user. The default value for this option is '0' which indicates
+ * the user is NOT limiting fragmentation and only the PMTU will effect
+ * SCTP's choice of DATA chunk size. Note also that values set larger
+ * than the maximum size of an IP datagram will effectively let SCTP
+ * control fragmentation (i.e. the same as setting this option to 0).
+ *
+ * The following structure is used to access and modify this parameter:
+ *
+ * struct sctp_assoc_value {
+ * sctp_assoc_t assoc_id;
+ * uint32_t assoc_value;
+ * };
+ *
+ * assoc_id: This parameter is ignored for one-to-one style sockets.
+ * For one-to-many style sockets this parameter indicates which
+ * association the user is performing an action upon. Note that if
+ * this field's value is zero then the endpoints default value is
+ * changed (effecting future associations only).
+ * assoc_value: This parameter specifies the maximum size in bytes.
+ */
+static int sctp_getsockopt_maxseg(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+
+ if (len == sizeof(int)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of int in maxseg socket option.\n"
+ "Use struct sctp_assoc_value instead\n",
+ current->comm, task_pid_nr(current));
+ params.assoc_id = 0;
+ } else if (len >= sizeof(struct sctp_assoc_value)) {
+ len = sizeof(struct sctp_assoc_value);
+ if (copy_from_user(&params, optval, sizeof(params)))
+ return -EFAULT;
+ } else
+ return -EINVAL;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc)
+ params.assoc_value = asoc->frag_point;
+ else
+ params.assoc_value = sctp_sk(sk)->user_frag;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (len == sizeof(int)) {
+ if (copy_to_user(optval, &params.assoc_value, len))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
+ * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave())
+ */
+static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ int val;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+
+ val = sctp_sk(sk)->frag_interleave;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.25. Set or Get the sctp partial delivery point
+ * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point())
+ */
+static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ u32 val;
+
+ if (len < sizeof(u32))
+ return -EINVAL;
+
+ len = sizeof(u32);
+
+ val = sctp_sk(sk)->pd_point;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST)
+ * (chapter and verse is quoted at sctp_setsockopt_maxburst())
+ */
+static int sctp_getsockopt_maxburst(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_sock *sp;
+ struct sctp_association *asoc;
+
+ if (len == sizeof(int)) {
+ pr_warn_ratelimited(DEPRECATED
+ "%s (pid %d) "
+ "Use of int in max_burst socket option.\n"
+ "Use struct sctp_assoc_value instead\n",
+ current->comm, task_pid_nr(current));
+ params.assoc_id = 0;
+ } else if (len >= sizeof(struct sctp_assoc_value)) {
+ len = sizeof(struct sctp_assoc_value);
+ if (copy_from_user(&params, optval, len))
+ return -EFAULT;
+ } else
+ return -EINVAL;
+
+ sp = sctp_sk(sk);
+
+ if (params.assoc_id != 0) {
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc)
+ return -EINVAL;
+ params.assoc_value = asoc->max_burst;
+ } else
+ params.assoc_value = sp->max_burst;
+
+ if (len == sizeof(int)) {
+ if (copy_to_user(optval, &params.assoc_value, len))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(optval, &params, len))
+ return -EFAULT;
+ }
+
+ return 0;
+
+}
+
+static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_hmacalgo __user *p = (void __user *)optval;
+ struct sctp_hmac_algo_param *hmacs;
+ __u16 data_len = 0;
+ u32 num_idents;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ hmacs = ep->auth_hmacs_list;
+ data_len = ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t);
+
+ if (len < sizeof(struct sctp_hmacalgo) + data_len)
+ return -EINVAL;
+
+ len = sizeof(struct sctp_hmacalgo) + data_len;
+ num_idents = data_len / sizeof(u16);
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (put_user(num_idents, &p->shmac_num_idents))
+ return -EFAULT;
+ if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len))
+ return -EFAULT;
+ return 0;
+}
+
+static int sctp_getsockopt_active_key(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authkeyid val;
+ struct sctp_association *asoc;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (len < sizeof(struct sctp_authkeyid))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid)))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, val.scact_assoc_id);
+ if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc)
+ val.scact_keynumber = asoc->active_key_id;
+ else
+ val.scact_keynumber = ep->active_key_id;
+
+ len = sizeof(struct sctp_authkeyid);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authchunks __user *p = (void __user *)optval;
+ struct sctp_authchunks val;
+ struct sctp_association *asoc;
+ struct sctp_chunks_param *ch;
+ u32 num_chunks = 0;
+ char __user *to;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (len < sizeof(struct sctp_authchunks))
+ return -EINVAL;
+
+ if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks)))
+ return -EFAULT;
+
+ to = p->gauth_chunks;
+ asoc = sctp_id2assoc(sk, val.gauth_assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ ch = asoc->peer.peer_chunks;
+ if (!ch)
+ goto num;
+
+ /* See if the user provided enough room for all the data */
+ num_chunks = ntohs(ch->param_hdr.length) - sizeof(sctp_paramhdr_t);
+ if (len < num_chunks)
+ return -EINVAL;
+
+ if (copy_to_user(to, ch->chunks, num_chunks))
+ return -EFAULT;
+num:
+ len = sizeof(struct sctp_authchunks) + num_chunks;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (put_user(num_chunks, &p->gauth_number_of_chunks))
+ return -EFAULT;
+ return 0;
+}
+
+static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ struct sctp_authchunks __user *p = (void __user *)optval;
+ struct sctp_authchunks val;
+ struct sctp_association *asoc;
+ struct sctp_chunks_param *ch;
+ u32 num_chunks = 0;
+ char __user *to;
+
+ if (!ep->auth_enable)
+ return -EACCES;
+
+ if (len < sizeof(struct sctp_authchunks))
+ return -EINVAL;
+
+ if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks)))
+ return -EFAULT;
+
+ to = p->gauth_chunks;
+ asoc = sctp_id2assoc(sk, val.gauth_assoc_id);
+ if (!asoc && val.gauth_assoc_id && sctp_style(sk, UDP))
+ return -EINVAL;
+
+ if (asoc)
+ ch = (struct sctp_chunks_param *)asoc->c.auth_chunks;
+ else
+ ch = ep->auth_chunk_list;
+
+ if (!ch)
+ goto num;
+
+ num_chunks = ntohs(ch->param_hdr.length) - sizeof(sctp_paramhdr_t);
+ if (len < sizeof(struct sctp_authchunks) + num_chunks)
+ return -EINVAL;
+
+ if (copy_to_user(to, ch->chunks, num_chunks))
+ return -EFAULT;
+num:
+ len = sizeof(struct sctp_authchunks) + num_chunks;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (put_user(num_chunks, &p->gauth_number_of_chunks))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 8.2.5. Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER)
+ * This option gets the current number of associations that are attached
+ * to a one-to-many style socket. The option value is an uint32_t.
+ */
+static int sctp_getsockopt_assoc_number(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ u32 val = 0;
+
+ if (sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+
+ if (len < sizeof(u32))
+ return -EINVAL;
+
+ len = sizeof(u32);
+
+ list_for_each_entry(asoc, &(sp->ep->asocs), asocs) {
+ val++;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * 8.1.23 SCTP_AUTO_ASCONF
+ * See the corresponding setsockopt entry as description
+ */
+static int sctp_getsockopt_auto_asconf(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ int val = 0;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ if (sctp_sk(sk)->do_auto_asconf && sctp_is_ep_boundall(sk))
+ val = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * 8.2.6. Get the Current Identifiers of Associations
+ * (SCTP_GET_ASSOC_ID_LIST)
+ *
+ * This option gets the current list of SCTP association identifiers of
+ * the SCTP associations handled by a one-to-many style socket.
+ */
+static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_association *asoc;
+ struct sctp_assoc_ids *ids;
+ u32 num = 0;
+
+ if (sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+
+ if (len < sizeof(struct sctp_assoc_ids))
+ return -EINVAL;
+
+ list_for_each_entry(asoc, &(sp->ep->asocs), asocs) {
+ num++;
+ }
+
+ if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num)
+ return -EINVAL;
+
+ len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num;
+
+ ids = kmalloc(len, GFP_KERNEL);
+ if (unlikely(!ids))
+ return -ENOMEM;
+
+ ids->gaids_number_of_ids = num;
+ num = 0;
+ list_for_each_entry(asoc, &(sp->ep->asocs), asocs) {
+ ids->gaids_assoc_id[num++] = asoc->assoc_id;
+ }
+
+ if (put_user(len, optlen) || copy_to_user(optval, ids, len)) {
+ kfree(ids);
+ return -EFAULT;
+ }
+
+ kfree(ids);
+ return 0;
+}
+
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold for one or all
+ * transports in an association. See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+ char __user *optval,
+ int len,
+ int __user *optlen)
+{
+ struct sctp_paddrthlds val;
+ struct sctp_transport *trans;
+ struct sctp_association *asoc;
+
+ if (len < sizeof(struct sctp_paddrthlds))
+ return -EINVAL;
+ len = sizeof(struct sctp_paddrthlds);
+ if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+ return -EFAULT;
+
+ if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+ asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+ if (!asoc)
+ return -ENOENT;
+
+ val.spt_pathpfthld = asoc->pf_retrans;
+ val.spt_pathmaxrxt = asoc->pathmaxrxt;
+ } else {
+ trans = sctp_addr_id2transport(sk, &val.spt_address,
+ val.spt_assoc_id);
+ if (!trans)
+ return -ENOENT;
+
+ val.spt_pathmaxrxt = trans->pathmaxrxt;
+ val.spt_pathpfthld = trans->pf_retrans;
+ }
+
+ if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * SCTP_GET_ASSOC_STATS
+ *
+ * This option retrieves local per endpoint statistics. It is modeled
+ * after OpenSolaris' implementation
+ */
+static int sctp_getsockopt_assoc_stats(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_stats sas;
+ struct sctp_association *asoc = NULL;
+
+ /* User must provide at least the assoc id */
+ if (len < sizeof(sctp_assoc_t))
+ return -EINVAL;
+
+ /* Allow the struct to grow and fill in as much as possible */
+ len = min_t(size_t, len, sizeof(sas));
+
+ if (copy_from_user(&sas, optval, len))
+ return -EFAULT;
+
+ asoc = sctp_id2assoc(sk, sas.sas_assoc_id);
+ if (!asoc)
+ return -EINVAL;
+
+ sas.sas_rtxchunks = asoc->stats.rtxchunks;
+ sas.sas_gapcnt = asoc->stats.gapcnt;
+ sas.sas_outofseqtsns = asoc->stats.outofseqtsns;
+ sas.sas_osacks = asoc->stats.osacks;
+ sas.sas_isacks = asoc->stats.isacks;
+ sas.sas_octrlchunks = asoc->stats.octrlchunks;
+ sas.sas_ictrlchunks = asoc->stats.ictrlchunks;
+ sas.sas_oodchunks = asoc->stats.oodchunks;
+ sas.sas_iodchunks = asoc->stats.iodchunks;
+ sas.sas_ouodchunks = asoc->stats.ouodchunks;
+ sas.sas_iuodchunks = asoc->stats.iuodchunks;
+ sas.sas_idupchunks = asoc->stats.idupchunks;
+ sas.sas_opackets = asoc->stats.opackets;
+ sas.sas_ipackets = asoc->stats.ipackets;
+
+ /* New high max rto observed, will return 0 if not a single
+ * RTO update took place. obs_rto_ipaddr will be bogus
+ * in such a case
+ */
+ sas.sas_maxrto = asoc->stats.max_obs_rto;
+ memcpy(&sas.sas_obs_rto_ipaddr, &asoc->stats.obs_rto_ipaddr,
+ sizeof(struct sockaddr_storage));
+
+ /* Mark beginning of a new observation period */
+ asoc->stats.max_obs_rto = asoc->rto_min;
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ pr_debug("%s: len:%d, assoc_id:%d\n", __func__, len, sas.sas_assoc_id);
+
+ if (copy_to_user(optval, &sas, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ int val = 0;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ if (sctp_sk(sk)->recvrcvinfo)
+ val = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ int val = 0;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ if (sctp_sk(sk)->recvnxtinfo)
+ val = 1;
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int sctp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ int retval = 0;
+ int len;
+
+ pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname);
+
+ /* I can hardly begin to describe how wrong this is. This is
+ * so broken as to be worse than useless. The API draft
+ * REALLY is NOT helpful here... I am not convinced that the
+ * semantics of getsockopt() with a level OTHER THAN SOL_SCTP
+ * are at all well-founded.
+ */
+ if (level != SOL_SCTP) {
+ struct sctp_af *af = sctp_sk(sk)->pf->af;
+
+ retval = af->getsockopt(sk, level, optname, optval, optlen);
+ return retval;
+ }
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SCTP_STATUS:
+ retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen);
+ break;
+ case SCTP_DISABLE_FRAGMENTS:
+ retval = sctp_getsockopt_disable_fragments(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_EVENTS:
+ retval = sctp_getsockopt_events(sk, len, optval, optlen);
+ break;
+ case SCTP_AUTOCLOSE:
+ retval = sctp_getsockopt_autoclose(sk, len, optval, optlen);
+ break;
+ case SCTP_SOCKOPT_PEELOFF:
+ retval = sctp_getsockopt_peeloff(sk, len, optval, optlen);
+ break;
+ case SCTP_PEER_ADDR_PARAMS:
+ retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_DELAYED_SACK:
+ retval = sctp_getsockopt_delayed_ack(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_INITMSG:
+ retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
+ break;
+ case SCTP_GET_PEER_ADDRS:
+ retval = sctp_getsockopt_peer_addrs(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_GET_LOCAL_ADDRS:
+ retval = sctp_getsockopt_local_addrs(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_SOCKOPT_CONNECTX3:
+ retval = sctp_getsockopt_connectx3(sk, len, optval, optlen);
+ break;
+ case SCTP_DEFAULT_SEND_PARAM:
+ retval = sctp_getsockopt_default_send_param(sk, len,
+ optval, optlen);
+ break;
+ case SCTP_DEFAULT_SNDINFO:
+ retval = sctp_getsockopt_default_sndinfo(sk, len,
+ optval, optlen);
+ break;
+ case SCTP_PRIMARY_ADDR:
+ retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen);
+ break;
+ case SCTP_NODELAY:
+ retval = sctp_getsockopt_nodelay(sk, len, optval, optlen);
+ break;
+ case SCTP_RTOINFO:
+ retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen);
+ break;
+ case SCTP_ASSOCINFO:
+ retval = sctp_getsockopt_associnfo(sk, len, optval, optlen);
+ break;
+ case SCTP_I_WANT_MAPPED_V4_ADDR:
+ retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen);
+ break;
+ case SCTP_MAXSEG:
+ retval = sctp_getsockopt_maxseg(sk, len, optval, optlen);
+ break;
+ case SCTP_GET_PEER_ADDR_INFO:
+ retval = sctp_getsockopt_peer_addr_info(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_ADAPTATION_LAYER:
+ retval = sctp_getsockopt_adaptation_layer(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_CONTEXT:
+ retval = sctp_getsockopt_context(sk, len, optval, optlen);
+ break;
+ case SCTP_FRAGMENT_INTERLEAVE:
+ retval = sctp_getsockopt_fragment_interleave(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_PARTIAL_DELIVERY_POINT:
+ retval = sctp_getsockopt_partial_delivery_point(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_MAX_BURST:
+ retval = sctp_getsockopt_maxburst(sk, len, optval, optlen);
+ break;
+ case SCTP_AUTH_KEY:
+ case SCTP_AUTH_CHUNK:
+ case SCTP_AUTH_DELETE_KEY:
+ retval = -EOPNOTSUPP;
+ break;
+ case SCTP_HMAC_IDENT:
+ retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen);
+ break;
+ case SCTP_AUTH_ACTIVE_KEY:
+ retval = sctp_getsockopt_active_key(sk, len, optval, optlen);
+ break;
+ case SCTP_PEER_AUTH_CHUNKS:
+ retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_LOCAL_AUTH_CHUNKS:
+ retval = sctp_getsockopt_local_auth_chunks(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_GET_ASSOC_NUMBER:
+ retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen);
+ break;
+ case SCTP_GET_ASSOC_ID_LIST:
+ retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen);
+ break;
+ case SCTP_AUTO_ASCONF:
+ retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
+ break;
+ case SCTP_PEER_ADDR_THLDS:
+ retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+ break;
+ case SCTP_GET_ASSOC_STATS:
+ retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen);
+ break;
+ case SCTP_RECVRCVINFO:
+ retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen);
+ break;
+ case SCTP_RECVNXTINFO:
+ retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen);
+ break;
+ default:
+ retval = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return retval;
+}
+
+static void sctp_hash(struct sock *sk)
+{
+ /* STUB */
+}
+
+static void sctp_unhash(struct sock *sk)
+{
+ /* STUB */
+}
+
+/* Check if port is acceptable. Possibly find first available port.
+ *
+ * The port hash table (contained in the 'global' SCTP protocol storage
+ * returned by struct sctp_protocol *sctp_get_protocol()). The hash
+ * table is an array of 4096 lists (sctp_bind_hashbucket). Each
+ * list (the list number is the port number hashed out, so as you
+ * would expect from a hash function, all the ports in a given list have
+ * such a number that hashes out to the same list number; you were
+ * expecting that, right?); so each list has a set of ports, with a
+ * link to the socket (struct sock) that uses it, the port number and
+ * a fastreuse flag (FIXME: NPI ipg).
+ */
+static struct sctp_bind_bucket *sctp_bucket_create(
+ struct sctp_bind_hashbucket *head, struct net *, unsigned short snum);
+
+static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
+{
+ struct sctp_bind_hashbucket *head; /* hash list */
+ struct sctp_bind_bucket *pp;
+ unsigned short snum;
+ int ret;
+
+ snum = ntohs(addr->v4.sin_port);
+
+ pr_debug("%s: begins, snum:%d\n", __func__, snum);
+
+ local_bh_disable();
+
+ if (snum == 0) {
+ /* Search for an available port. */
+ int low, high, remaining, index;
+ unsigned int rover;
+ struct net *net = sock_net(sk);
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ rover = prandom_u32() % remaining + low;
+
+ do {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ if (inet_is_local_reserved_port(net, rover))
+ continue;
+ index = sctp_phashfn(sock_net(sk), rover);
+ head = &sctp_port_hashtable[index];
+ spin_lock(&head->lock);
+ sctp_for_each_hentry(pp, &head->chain)
+ if ((pp->port == rover) &&
+ net_eq(sock_net(sk), pp->net))
+ goto next;
+ break;
+ next:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+
+ /* Exhausted local port range during search? */
+ ret = 1;
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. HEAD (the port
+ * hash table list entry) is non-NULL and we hold it's
+ * mutex.
+ */
+ snum = rover;
+ } else {
+ /* We are given an specific port number; we verify
+ * that it is not being used. If it is used, we will
+ * exahust the search in the hash list corresponding
+ * to the port number (snum) - we detect that with the
+ * port iterator, pp being NULL.
+ */
+ head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)];
+ spin_lock(&head->lock);
+ sctp_for_each_hentry(pp, &head->chain) {
+ if ((pp->port == snum) && net_eq(pp->net, sock_net(sk)))
+ goto pp_found;
+ }
+ }
+ pp = NULL;
+ goto pp_not_found;
+pp_found:
+ if (!hlist_empty(&pp->owner)) {
+ /* We had a port hash table hit - there is an
+ * available port (pp != NULL) and it is being
+ * used by other socket (pp->owner not empty); that other
+ * socket is going to be sk2.
+ */
+ int reuse = sk->sk_reuse;
+ struct sock *sk2;
+
+ pr_debug("%s: found a possible match\n", __func__);
+
+ if (pp->fastreuse && sk->sk_reuse &&
+ sk->sk_state != SCTP_SS_LISTENING)
+ goto success;
+
+ /* Run through the list of sockets bound to the port
+ * (pp->port) [via the pointers bind_next and
+ * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one,
+ * we get the endpoint they describe and run through
+ * the endpoint's list of IP (v4 or v6) addresses,
+ * comparing each of the addresses with the address of
+ * the socket sk. If we find a match, then that means
+ * that this port/socket (sk) combination are already
+ * in an endpoint.
+ */
+ sk_for_each_bound(sk2, &pp->owner) {
+ struct sctp_endpoint *ep2;
+ ep2 = sctp_sk(sk2)->ep;
+
+ if (sk == sk2 ||
+ (reuse && sk2->sk_reuse &&
+ sk2->sk_state != SCTP_SS_LISTENING))
+ continue;
+
+ if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr,
+ sctp_sk(sk2), sctp_sk(sk))) {
+ ret = (long)sk2;
+ goto fail_unlock;
+ }
+ }
+
+ pr_debug("%s: found a match\n", __func__);
+ }
+pp_not_found:
+ /* If there was a hash table miss, create a new port. */
+ ret = 1;
+ if (!pp && !(pp = sctp_bucket_create(head, sock_net(sk), snum)))
+ goto fail_unlock;
+
+ /* In either case (hit or miss), make sure fastreuse is 1 only
+ * if sk->sk_reuse is too (that is, if the caller requested
+ * SO_REUSEADDR on this socket -sk-).
+ */
+ if (hlist_empty(&pp->owner)) {
+ if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+ pp->fastreuse = 1;
+ else
+ pp->fastreuse = 0;
+ } else if (pp->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+ pp->fastreuse = 0;
+
+ /* We are set, so fill up all the data in the hash table
+ * entry, tie the socket list information with the rest of the
+ * sockets FIXME: Blurry, NPI (ipg).
+ */
+success:
+ if (!sctp_sk(sk)->bind_hash) {
+ inet_sk(sk)->inet_num = snum;
+ sk_add_bind_node(sk, &pp->owner);
+ sctp_sk(sk)->bind_hash = pp;
+ }
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+
+fail:
+ local_bh_enable();
+ return ret;
+}
+
+/* Assign a 'snum' port to the socket. If snum == 0, an ephemeral
+ * port is requested.
+ */
+static int sctp_get_port(struct sock *sk, unsigned short snum)
+{
+ union sctp_addr addr;
+ struct sctp_af *af = sctp_sk(sk)->pf->af;
+
+ /* Set up a dummy address struct from the sk. */
+ af->from_sk(&addr, sk);
+ addr.v4.sin_port = htons(snum);
+
+ /* Note: sk->sk_num gets filled in if ephemeral port request. */
+ return !!sctp_get_port_local(sk, &addr);
+}
+
+/*
+ * Move a socket to LISTENING state.
+ */
+static int sctp_listen_start(struct sock *sk, int backlog)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_endpoint *ep = sp->ep;
+ struct crypto_hash *tfm = NULL;
+ char alg[32];
+
+ /* Allocate HMAC for generating cookie. */
+ if (!sp->hmac && sp->sctp_hmac_alg) {
+ sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg);
+ tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ net_info_ratelimited("failed to load transform for %s: %ld\n",
+ sp->sctp_hmac_alg, PTR_ERR(tfm));
+ return -ENOSYS;
+ }
+ sctp_sk(sk)->hmac = tfm;
+ }
+
+ /*
+ * If a bind() or sctp_bindx() is not called prior to a listen()
+ * call that allows new associations to be accepted, the system
+ * picks an ephemeral port and will choose an address set equivalent
+ * to binding with a wildcard address.
+ *
+ * This is not currently spelled out in the SCTP sockets
+ * extensions draft, but follows the practice as seen in TCP
+ * sockets.
+ *
+ */
+ sk->sk_state = SCTP_SS_LISTENING;
+ if (!ep->base.bind_addr.port) {
+ if (sctp_autobind(sk))
+ return -EAGAIN;
+ } else {
+ if (sctp_get_port(sk, inet_sk(sk)->inet_num)) {
+ sk->sk_state = SCTP_SS_CLOSED;
+ return -EADDRINUSE;
+ }
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sctp_hash_endpoint(ep);
+ return 0;
+}
+
+/*
+ * 4.1.3 / 5.1.3 listen()
+ *
+ * By default, new associations are not accepted for UDP style sockets.
+ * An application uses listen() to mark a socket as being able to
+ * accept new associations.
+ *
+ * On TCP style sockets, applications use listen() to ready the SCTP
+ * endpoint for accepting inbound associations.
+ *
+ * On both types of endpoints a backlog of '0' disables listening.
+ *
+ * Move a socket to LISTENING state.
+ */
+int sctp_inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+ int err = -EINVAL;
+
+ if (unlikely(backlog < 0))
+ return err;
+
+ lock_sock(sk);
+
+ /* Peeled-off sockets are not allowed to listen(). */
+ if (sctp_style(sk, UDP_HIGH_BANDWIDTH))
+ goto out;
+
+ if (sock->state != SS_UNCONNECTED)
+ goto out;
+
+ /* If backlog is zero, disable listening. */
+ if (!backlog) {
+ if (sctp_sstate(sk, CLOSED))
+ goto out;
+
+ err = 0;
+ sctp_unhash_endpoint(ep);
+ sk->sk_state = SCTP_SS_CLOSED;
+ if (sk->sk_reuse)
+ sctp_sk(sk)->bind_hash->fastreuse = 1;
+ goto out;
+ }
+
+ /* If we are already listening, just update the backlog */
+ if (sctp_sstate(sk, LISTENING))
+ sk->sk_max_ack_backlog = backlog;
+ else {
+ err = sctp_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
+
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+}
+
+/*
+ * This function is done by modeling the current datagram_poll() and the
+ * tcp_poll(). Note that, based on these implementations, we don't
+ * lock the socket in this function, even though it seems that,
+ * ideally, locking or some other mechanisms can be used to ensure
+ * the integrity of the counters (sndbuf and wmem_alloc) used
+ * in this place. We assume that we don't need locks either until proven
+ * otherwise.
+ *
+ * Another thing to note is that we include the Async I/O support
+ * here, again, by modeling the current TCP/UDP code. We don't have
+ * a good way to test with it yet.
+ */
+unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct sctp_sock *sp = sctp_sk(sk);
+ unsigned int mask;
+
+ poll_wait(file, sk_sleep(sk), wait);
+
+ /* A TCP-style listening socket becomes readable when the accept queue
+ * is not empty.
+ */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+ return (!list_empty(&sp->ep->asocs)) ?
+ (POLLIN | POLLRDNORM) : 0;
+
+ mask = 0;
+
+ /* Is there any exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ /* Is it readable? Reconsider this code with TCP-style support. */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* The association is either gone or not ready. */
+ if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED))
+ return mask;
+
+ /* Is it writable? */
+ if (sctp_writeable(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ /*
+ * Since the socket is not locked, the buffer
+ * might be made available after the writeable check and
+ * before the bit is set. This could cause a lost I/O
+ * signal. tcp_poll() has a race breaker for this race
+ * condition. Based on their implementation, we put
+ * in the following code to cover it as well.
+ */
+ if (sctp_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ return mask;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+static struct sctp_bind_bucket *sctp_bucket_create(
+ struct sctp_bind_hashbucket *head, struct net *net, unsigned short snum)
+{
+ struct sctp_bind_bucket *pp;
+
+ pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC);
+ if (pp) {
+ SCTP_DBG_OBJCNT_INC(bind_bucket);
+ pp->port = snum;
+ pp->fastreuse = 0;
+ INIT_HLIST_HEAD(&pp->owner);
+ pp->net = net;
+ hlist_add_head(&pp->node, &head->chain);
+ }
+ return pp;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+static void sctp_bucket_destroy(struct sctp_bind_bucket *pp)
+{
+ if (pp && hlist_empty(&pp->owner)) {
+ __hlist_del(&pp->node);
+ kmem_cache_free(sctp_bucket_cachep, pp);
+ SCTP_DBG_OBJCNT_DEC(bind_bucket);
+ }
+}
+
+/* Release this socket's reference to a local port. */
+static inline void __sctp_put_port(struct sock *sk)
+{
+ struct sctp_bind_hashbucket *head =
+ &sctp_port_hashtable[sctp_phashfn(sock_net(sk),
+ inet_sk(sk)->inet_num)];
+ struct sctp_bind_bucket *pp;
+
+ spin_lock(&head->lock);
+ pp = sctp_sk(sk)->bind_hash;
+ __sk_del_bind_node(sk);
+ sctp_sk(sk)->bind_hash = NULL;
+ inet_sk(sk)->inet_num = 0;
+ sctp_bucket_destroy(pp);
+ spin_unlock(&head->lock);
+}
+
+void sctp_put_port(struct sock *sk)
+{
+ local_bh_disable();
+ __sctp_put_port(sk);
+ local_bh_enable();
+}
+
+/*
+ * The system picks an ephemeral port and choose an address set equivalent
+ * to binding with a wildcard address.
+ * One of those addresses will be the primary address for the association.
+ * This automatically enables the multihoming capability of SCTP.
+ */
+static int sctp_autobind(struct sock *sk)
+{
+ union sctp_addr autoaddr;
+ struct sctp_af *af;
+ __be16 port;
+
+ /* Initialize a local sockaddr structure to INADDR_ANY. */
+ af = sctp_sk(sk)->pf->af;
+
+ port = htons(inet_sk(sk)->inet_num);
+ af->inaddr_any(&autoaddr, port);
+
+ return sctp_do_bind(sk, &autoaddr, af->sockaddr_len);
+}
+
+/* Parse out IPPROTO_SCTP CMSG headers. Perform only minimal validation.
+ *
+ * From RFC 2292
+ * 4.2 The cmsghdr Structure *
+ *
+ * When ancillary data is sent or received, any number of ancillary data
+ * objects can be specified by the msg_control and msg_controllen members of
+ * the msghdr structure, because each object is preceded by
+ * a cmsghdr structure defining the object's length (the cmsg_len member).
+ * Historically Berkeley-derived implementations have passed only one object
+ * at a time, but this API allows multiple objects to be
+ * passed in a single call to sendmsg() or recvmsg(). The following example
+ * shows two ancillary data objects in a control buffer.
+ *
+ * |<--------------------------- msg_controllen -------------------------->|
+ * | |
+ *
+ * |<----- ancillary data object ----->|<----- ancillary data object ----->|
+ *
+ * |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->|
+ * | | |
+ *
+ * |<---------- cmsg_len ---------->| |<--------- cmsg_len ----------->| |
+ *
+ * |<--------- CMSG_LEN() --------->| |<-------- CMSG_LEN() ---------->| |
+ * | | | | |
+ *
+ * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
+ * |cmsg_|cmsg_|cmsg_|XX| |XX|cmsg_|cmsg_|cmsg_|XX| |XX|
+ *
+ * |len |level|type |XX|cmsg_data[]|XX|len |level|type |XX|cmsg_data[]|XX|
+ *
+ * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
+ * ^
+ * |
+ *
+ * msg_control
+ * points here
+ */
+static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
+{
+ struct cmsghdr *cmsg;
+ struct msghdr *my_msg = (struct msghdr *)msg;
+
+ for_each_cmsghdr(cmsg, my_msg) {
+ if (!CMSG_OK(my_msg, cmsg))
+ return -EINVAL;
+
+ /* Should we parse this header or ignore? */
+ if (cmsg->cmsg_level != IPPROTO_SCTP)
+ continue;
+
+ /* Strictly check lengths following example in SCM code. */
+ switch (cmsg->cmsg_type) {
+ case SCTP_INIT:
+ /* SCTP Socket API Extension
+ * 5.3.1 SCTP Initiation Structure (SCTP_INIT)
+ *
+ * This cmsghdr structure provides information for
+ * initializing new SCTP associations with sendmsg().
+ * The SCTP_INITMSG socket option uses this same data
+ * structure. This structure is not used for
+ * recvmsg().
+ *
+ * cmsg_level cmsg_type cmsg_data[]
+ * ------------ ------------ ----------------------
+ * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg
+ */
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg)))
+ return -EINVAL;
+
+ cmsgs->init = CMSG_DATA(cmsg);
+ break;
+
+ case SCTP_SNDRCV:
+ /* SCTP Socket API Extension
+ * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV)
+ *
+ * This cmsghdr structure specifies SCTP options for
+ * sendmsg() and describes SCTP header information
+ * about a received message through recvmsg().
+ *
+ * cmsg_level cmsg_type cmsg_data[]
+ * ------------ ------------ ----------------------
+ * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo
+ */
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo)))
+ return -EINVAL;
+
+ cmsgs->srinfo = CMSG_DATA(cmsg);
+
+ if (cmsgs->srinfo->sinfo_flags &
+ ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_ABORT | SCTP_EOF))
+ return -EINVAL;
+ break;
+
+ case SCTP_SNDINFO:
+ /* SCTP Socket API Extension
+ * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO)
+ *
+ * This cmsghdr structure specifies SCTP options for
+ * sendmsg(). This structure and SCTP_RCVINFO replaces
+ * SCTP_SNDRCV which has been deprecated.
+ *
+ * cmsg_level cmsg_type cmsg_data[]
+ * ------------ ------------ ---------------------
+ * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo
+ */
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo)))
+ return -EINVAL;
+
+ cmsgs->sinfo = CMSG_DATA(cmsg);
+
+ if (cmsgs->sinfo->snd_flags &
+ ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_ABORT | SCTP_EOF))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Wait for a packet..
+ * Note: This function is the same function as in core/datagram.c
+ * with a few modifications to make lksctp work.
+ */
+static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+{
+ int error;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ /* Socket errors? */
+ error = sock_error(sk);
+ if (error)
+ goto out;
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ goto ready;
+
+ /* Socket shut down? */
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out;
+
+ /* Sequenced packets can come disconnected. If so we report the
+ * problem.
+ */
+ error = -ENOTCONN;
+
+ /* Is there a good reason to think that we may receive some data? */
+ if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING))
+ goto out;
+
+ /* Handle signals. */
+ if (signal_pending(current))
+ goto interrupted;
+
+ /* Let another process have a go. Since we are going to sleep
+ * anyway. Note: This may cause odd behaviors if the message
+ * does not fit in the user's buffer, but this seems to be the
+ * only way to honor MSG_DONTWAIT realistically.
+ */
+ release_sock(sk);
+ *timeo_p = schedule_timeout(*timeo_p);
+ lock_sock(sk);
+
+ready:
+ finish_wait(sk_sleep(sk), &wait);
+ return 0;
+
+interrupted:
+ error = sock_intr_errno(*timeo_p);
+
+out:
+ finish_wait(sk_sleep(sk), &wait);
+ *err = error;
+ return error;
+}
+
+/* Receive a datagram.
+ * Note: This is pretty much the same routine as in core/datagram.c
+ * with a few changes to make lksctp work.
+ */
+struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
+ int noblock, int *err)
+{
+ int error;
+ struct sk_buff *skb;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, noblock);
+
+ pr_debug("%s: timeo:%ld, max:%ld\n", __func__, timeo,
+ MAX_SCHEDULE_TIMEOUT);
+
+ do {
+ /* Again only user level code calls this function,
+ * so nothing interrupt level
+ * will suddenly eat the receive_queue.
+ *
+ * Look at current nfs client by the way...
+ * However, this function was correct in any case. 8)
+ */
+ if (flags & MSG_PEEK) {
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ atomic_inc(&skb->users);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ } else {
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ }
+
+ if (skb)
+ return skb;
+
+ /* Caller is allowed not to check sk->sk_err before calling. */
+ error = sock_error(sk);
+ if (error)
+ goto no_packet;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, noblock))
+ continue;
+
+ /* User doesn't want to wait. */
+ error = -EAGAIN;
+ if (!timeo)
+ goto no_packet;
+ } while (sctp_wait_for_packet(sk, err, &timeo) == 0);
+
+ return NULL;
+
+no_packet:
+ *err = error;
+ return NULL;
+}
+
+/* If sndbuf has changed, wake up per association sndbuf waiters. */
+static void __sctp_write_space(struct sctp_association *asoc)
+{
+ struct sock *sk = asoc->base.sk;
+ struct socket *sock = sk->sk_socket;
+
+ if ((sctp_wspace(asoc) > 0) && sock) {
+ if (waitqueue_active(&asoc->wait))
+ wake_up_interruptible(&asoc->wait);
+
+ if (sctp_writeable(sk)) {
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
+
+ /* Note that we try to include the Async I/O support
+ * here by modeling from the current TCP/UDP code.
+ * We have not tested with it yet.
+ */
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(sock,
+ SOCK_WAKE_SPACE, POLL_OUT);
+ }
+ }
+}
+
+static void sctp_wake_up_waiters(struct sock *sk,
+ struct sctp_association *asoc)
+{
+ struct sctp_association *tmp = asoc;
+
+ /* We do accounting for the sndbuf space per association,
+ * so we only need to wake our own association.
+ */
+ if (asoc->ep->sndbuf_policy)
+ return __sctp_write_space(asoc);
+
+ /* If association goes down and is just flushing its
+ * outq, then just normally notify others.
+ */
+ if (asoc->base.dead)
+ return sctp_write_space(sk);
+
+ /* Accounting for the sndbuf space is per socket, so we
+ * need to wake up others, try to be fair and in case of
+ * other associations, let them have a go first instead
+ * of just doing a sctp_write_space() call.
+ *
+ * Note that we reach sctp_wake_up_waiters() only when
+ * associations free up queued chunks, thus we are under
+ * lock and the list of associations on a socket is
+ * guaranteed not to change.
+ */
+ for (tmp = list_next_entry(tmp, asocs); 1;
+ tmp = list_next_entry(tmp, asocs)) {
+ /* Manually skip the head element. */
+ if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs))
+ continue;
+ /* Wake up association. */
+ __sctp_write_space(tmp);
+ /* We've reached the end. */
+ if (tmp == asoc)
+ break;
+ }
+}
+
+/* Do accounting for the sndbuf space.
+ * Decrement the used sndbuf space of the corresponding association by the
+ * data size which was just transmitted(freed).
+ */
+static void sctp_wfree(struct sk_buff *skb)
+{
+ struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;
+ struct sctp_association *asoc = chunk->asoc;
+ struct sock *sk = asoc->base.sk;
+
+ asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
+ sizeof(struct sk_buff) +
+ sizeof(struct sctp_chunk);
+
+ atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
+
+ /*
+ * This undoes what is done via sctp_set_owner_w and sk_mem_charge
+ */
+ sk->sk_wmem_queued -= skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
+
+ sock_wfree(skb);
+ sctp_wake_up_waiters(sk, asoc);
+
+ sctp_association_put(asoc);
+}
+
+/* Do accounting for the receive space on the socket.
+ * Accounting for the association is done in ulpevent.c
+ * We set this as a destructor for the cloned data skbs so that
+ * accounting is done at the correct time.
+ */
+void sctp_sock_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct sctp_ulpevent *event = sctp_skb2event(skb);
+
+ atomic_sub(event->rmem_len, &sk->sk_rmem_alloc);
+
+ /*
+ * Mimic the behavior of sock_rfree
+ */
+ sk_mem_uncharge(sk, event->rmem_len);
+}
+
+
+/* Helper function to wait for space in the sndbuf. */
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
+ size_t msg_len)
+{
+ struct sock *sk = asoc->base.sk;
+ int err = 0;
+ long current_timeo = *timeo_p;
+ DEFINE_WAIT(wait);
+
+ pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
+ *timeo_p, msg_len);
+
+ /* Increment the association's refcnt. */
+ sctp_association_hold(asoc);
+
+ /* Wait on the association specific sndbuf space. */
+ for (;;) {
+ prepare_to_wait_exclusive(&asoc->wait, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!*timeo_p)
+ goto do_nonblock;
+ if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING ||
+ asoc->base.dead)
+ goto do_error;
+ if (signal_pending(current))
+ goto do_interrupted;
+ if (msg_len <= sctp_wspace(asoc))
+ break;
+
+ /* Let another process have a go. Since we are going
+ * to sleep anyway.
+ */
+ release_sock(sk);
+ current_timeo = schedule_timeout(current_timeo);
+ BUG_ON(sk != asoc->base.sk);
+ lock_sock(sk);
+
+ *timeo_p = current_timeo;
+ }
+
+out:
+ finish_wait(&asoc->wait, &wait);
+
+ /* Release the association's refcnt. */
+ sctp_association_put(asoc);
+
+ return err;
+
+do_error:
+ err = -EPIPE;
+ goto out;
+
+do_interrupted:
+ err = sock_intr_errno(*timeo_p);
+ goto out;
+
+do_nonblock:
+ err = -EAGAIN;
+ goto out;
+}
+
+void sctp_data_ready(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+ POLLRDNORM | POLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+/* If socket sndbuf has changed, wake up all per association waiters. */
+void sctp_write_space(struct sock *sk)
+{
+ struct sctp_association *asoc;
+
+ /* Wake up the tasks in each wait queue. */
+ list_for_each_entry(asoc, &((sctp_sk(sk))->ep->asocs), asocs) {
+ __sctp_write_space(asoc);
+ }
+}
+
+/* Is there any sndbuf space available on the socket?
+ *
+ * Note that sk_wmem_alloc is the sum of the send buffers on all of the
+ * associations on the same socket. For a UDP-style socket with
+ * multiple associations, it is possible for it to be "unwriteable"
+ * prematurely. I assume that this is acceptable because
+ * a premature "unwriteable" is better than an accidental "writeable" which
+ * would cause an unwanted block under certain circumstances. For the 1-1
+ * UDP-style sockets or TCP-style sockets, this code should work.
+ * - Daisy
+ */
+static int sctp_writeable(struct sock *sk)
+{
+ int amt = 0;
+
+ amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amt < 0)
+ amt = 0;
+ return amt;
+}
+
+/* Wait for an association to go into ESTABLISHED state. If timeout is 0,
+ * returns immediately with EINPROGRESS.
+ */
+static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p)
+{
+ struct sock *sk = asoc->base.sk;
+ int err = 0;
+ long current_timeo = *timeo_p;
+ DEFINE_WAIT(wait);
+
+ pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p);
+
+ /* Increment the association's refcnt. */
+ sctp_association_hold(asoc);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&asoc->wait, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!*timeo_p)
+ goto do_nonblock;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING ||
+ asoc->base.dead)
+ goto do_error;
+ if (signal_pending(current))
+ goto do_interrupted;
+
+ if (sctp_state(asoc, ESTABLISHED))
+ break;
+
+ /* Let another process have a go. Since we are going
+ * to sleep anyway.
+ */
+ release_sock(sk);
+ current_timeo = schedule_timeout(current_timeo);
+ lock_sock(sk);
+
+ *timeo_p = current_timeo;
+ }
+
+out:
+ finish_wait(&asoc->wait, &wait);
+
+ /* Release the association's refcnt. */
+ sctp_association_put(asoc);
+
+ return err;
+
+do_error:
+ if (asoc->init_err_counter + 1 > asoc->max_init_attempts)
+ err = -ETIMEDOUT;
+ else
+ err = -ECONNREFUSED;
+ goto out;
+
+do_interrupted:
+ err = sock_intr_errno(*timeo_p);
+ goto out;
+
+do_nonblock:
+ err = -EINPROGRESS;
+ goto out;
+}
+
+static int sctp_wait_for_accept(struct sock *sk, long timeo)
+{
+ struct sctp_endpoint *ep;
+ int err = 0;
+ DEFINE_WAIT(wait);
+
+ ep = sctp_sk(sk)->ep;
+
+
+ for (;;) {
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+
+ if (list_empty(&ep->asocs)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ }
+
+ err = -EINVAL;
+ if (!sctp_sstate(sk, LISTENING))
+ break;
+
+ err = 0;
+ if (!list_empty(&ep->asocs))
+ break;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+
+ finish_wait(sk_sleep(sk), &wait);
+
+ return err;
+}
+
+static void sctp_wait_for_close(struct sock *sk, long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (list_empty(&sctp_sk(sk)->ep->asocs))
+ break;
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+}
+
+static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk)
+{
+ struct sk_buff *frag;
+
+ if (!skb->data_len)
+ goto done;
+
+ /* Don't forget the fragments. */
+ skb_walk_frags(skb, frag)
+ sctp_skb_set_owner_r_frag(frag, sk);
+
+done:
+ sctp_skb_set_owner_r(skb, sk);
+}
+
+void sctp_copy_sock(struct sock *newsk, struct sock *sk,
+ struct sctp_association *asoc)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *newinet;
+
+ newsk->sk_type = sk->sk_type;
+ newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
+ newsk->sk_flags = sk->sk_flags;
+ newsk->sk_no_check_tx = sk->sk_no_check_tx;
+ newsk->sk_no_check_rx = sk->sk_no_check_rx;
+ newsk->sk_reuse = sk->sk_reuse;
+
+ newsk->sk_shutdown = sk->sk_shutdown;
+ newsk->sk_destruct = sctp_destruct_sock;
+ newsk->sk_family = sk->sk_family;
+ newsk->sk_protocol = IPPROTO_SCTP;
+ newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ newsk->sk_sndbuf = sk->sk_sndbuf;
+ newsk->sk_rcvbuf = sk->sk_rcvbuf;
+ newsk->sk_lingertime = sk->sk_lingertime;
+ newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
+ newsk->sk_sndtimeo = sk->sk_sndtimeo;
+
+ newinet = inet_sk(newsk);
+
+ /* Initialize sk's sport, dport, rcv_saddr and daddr for
+ * getsockname() and getpeername()
+ */
+ newinet->inet_sport = inet->inet_sport;
+ newinet->inet_saddr = inet->inet_saddr;
+ newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
+ newinet->inet_dport = htons(asoc->peer.port);
+ newinet->pmtudisc = inet->pmtudisc;
+ newinet->inet_id = asoc->next_tsn ^ jiffies;
+
+ newinet->uc_ttl = inet->uc_ttl;
+ newinet->mc_loop = 1;
+ newinet->mc_ttl = 1;
+ newinet->mc_index = 0;
+ newinet->mc_list = NULL;
+}
+
+static inline void sctp_copy_descendant(struct sock *sk_to,
+ const struct sock *sk_from)
+{
+ int ancestor_size = sizeof(struct inet_sock) +
+ sizeof(struct sctp_sock) -
+ offsetof(struct sctp_sock, auto_asconf_list);
+
+ if (sk_from->sk_family == PF_INET6)
+ ancestor_size += sizeof(struct ipv6_pinfo);
+
+ __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
+/* Populate the fields of the newsk from the oldsk and migrate the assoc
+ * and its messages to the newsk.
+ */
+static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
+ struct sctp_association *assoc,
+ sctp_socket_type_t type)
+{
+ struct sctp_sock *oldsp = sctp_sk(oldsk);
+ struct sctp_sock *newsp = sctp_sk(newsk);
+ struct sctp_bind_bucket *pp; /* hash list port iterator */
+ struct sctp_endpoint *newep = newsp->ep;
+ struct sk_buff *skb, *tmp;
+ struct sctp_ulpevent *event;
+ struct sctp_bind_hashbucket *head;
+
+ /* Migrate socket buffer sizes and all the socket level options to the
+ * new socket.
+ */
+ newsk->sk_sndbuf = oldsk->sk_sndbuf;
+ newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
+ /* Brute force copy old sctp opt. */
+ sctp_copy_descendant(newsk, oldsk);
+
+ /* Restore the ep value that was overwritten with the above structure
+ * copy.
+ */
+ newsp->ep = newep;
+ newsp->hmac = NULL;
+
+ /* Hook this new socket in to the bind_hash list. */
+ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
+ inet_sk(oldsk)->inet_num)];
+ local_bh_disable();
+ spin_lock(&head->lock);
+ pp = sctp_sk(oldsk)->bind_hash;
+ sk_add_bind_node(newsk, &pp->owner);
+ sctp_sk(newsk)->bind_hash = pp;
+ inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
+ spin_unlock(&head->lock);
+ local_bh_enable();
+
+ /* Copy the bind_addr list from the original endpoint to the new
+ * endpoint so that we can handle restarts properly
+ */
+ sctp_bind_addr_dup(&newsp->ep->base.bind_addr,
+ &oldsp->ep->base.bind_addr, GFP_KERNEL);
+
+ /* Move any messages in the old socket's receive queue that are for the
+ * peeled off association to the new socket's receive queue.
+ */
+ sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
+ event = sctp_skb2event(skb);
+ if (event->asoc == assoc) {
+ __skb_unlink(skb, &oldsk->sk_receive_queue);
+ __skb_queue_tail(&newsk->sk_receive_queue, skb);
+ sctp_skb_set_owner_r_frag(skb, newsk);
+ }
+ }
+
+ /* Clean up any messages pending delivery due to partial
+ * delivery. Three cases:
+ * 1) No partial deliver; no work.
+ * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby.
+ * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue.
+ */
+ skb_queue_head_init(&newsp->pd_lobby);
+ atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode);
+
+ if (atomic_read(&sctp_sk(oldsk)->pd_mode)) {
+ struct sk_buff_head *queue;
+
+ /* Decide which queue to move pd_lobby skbs to. */
+ if (assoc->ulpq.pd_mode) {
+ queue = &newsp->pd_lobby;
+ } else
+ queue = &newsk->sk_receive_queue;
+
+ /* Walk through the pd_lobby, looking for skbs that
+ * need moved to the new socket.
+ */
+ sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
+ event = sctp_skb2event(skb);
+ if (event->asoc == assoc) {
+ __skb_unlink(skb, &oldsp->pd_lobby);
+ __skb_queue_tail(queue, skb);
+ sctp_skb_set_owner_r_frag(skb, newsk);
+ }
+ }
+
+ /* Clear up any skbs waiting for the partial
+ * delivery to finish.
+ */
+ if (assoc->ulpq.pd_mode)
+ sctp_clear_pd(oldsk, NULL);
+
+ }
+
+ sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp)
+ sctp_skb_set_owner_r_frag(skb, newsk);
+
+ sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp)
+ sctp_skb_set_owner_r_frag(skb, newsk);
+
+ /* Set the type of socket to indicate that it is peeled off from the
+ * original UDP-style socket or created with the accept() call on a
+ * TCP-style socket..
+ */
+ newsp->type = type;
+
+ /* Mark the new socket "in-use" by the user so that any packets
+ * that may arrive on the association after we've moved it are
+ * queued to the backlog. This prevents a potential race between
+ * backlog processing on the old socket and new-packet processing
+ * on the new socket.
+ *
+ * The caller has just allocated newsk so we can guarantee that other
+ * paths won't try to lock it and then oldsk.
+ */
+ lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
+ sctp_assoc_migrate(assoc, newsk);
+
+ /* If the association on the newsk is already closed before accept()
+ * is called, set RCV_SHUTDOWN flag.
+ */
+ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP))
+ newsk->sk_shutdown |= RCV_SHUTDOWN;
+
+ newsk->sk_state = SCTP_SS_ESTABLISHED;
+ release_sock(newsk);
+}
+
+
+/* This proto struct describes the ULP interface for SCTP. */
+struct proto sctp_prot = {
+ .name = "SCTP",
+ .owner = THIS_MODULE,
+ .close = sctp_close,
+ .connect = sctp_connect,
+ .disconnect = sctp_disconnect,
+ .accept = sctp_accept,
+ .ioctl = sctp_ioctl,
+ .init = sctp_init_sock,
+ .destroy = sctp_destroy_sock,
+ .shutdown = sctp_shutdown,
+ .setsockopt = sctp_setsockopt,
+ .getsockopt = sctp_getsockopt,
+ .sendmsg = sctp_sendmsg,
+ .recvmsg = sctp_recvmsg,
+ .bind = sctp_bind,
+ .backlog_rcv = sctp_backlog_rcv,
+ .hash = sctp_hash,
+ .unhash = sctp_unhash,
+ .get_port = sctp_get_port,
+ .obj_size = sizeof(struct sctp_sock),
+ .sysctl_mem = sysctl_sctp_mem,
+ .sysctl_rmem = sysctl_sctp_rmem,
+ .sysctl_wmem = sysctl_sctp_wmem,
+ .memory_pressure = &sctp_memory_pressure,
+ .enter_memory_pressure = sctp_enter_memory_pressure,
+ .memory_allocated = &sctp_memory_allocated,
+ .sockets_allocated = &sctp_sockets_allocated,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+struct proto sctpv6_prot = {
+ .name = "SCTPv6",
+ .owner = THIS_MODULE,
+ .close = sctp_close,
+ .connect = sctp_connect,
+ .disconnect = sctp_disconnect,
+ .accept = sctp_accept,
+ .ioctl = sctp_ioctl,
+ .init = sctp_init_sock,
+ .destroy = sctp_destroy_sock,
+ .shutdown = sctp_shutdown,
+ .setsockopt = sctp_setsockopt,
+ .getsockopt = sctp_getsockopt,
+ .sendmsg = sctp_sendmsg,
+ .recvmsg = sctp_recvmsg,
+ .bind = sctp_bind,
+ .backlog_rcv = sctp_backlog_rcv,
+ .hash = sctp_hash,
+ .unhash = sctp_unhash,
+ .get_port = sctp_get_port,
+ .obj_size = sizeof(struct sctp6_sock),
+ .sysctl_mem = sysctl_sctp_mem,
+ .sysctl_rmem = sysctl_sctp_rmem,
+ .sysctl_wmem = sysctl_sctp_wmem,
+ .memory_pressure = &sctp_memory_pressure,
+ .enter_memory_pressure = sctp_enter_memory_pressure,
+ .memory_allocated = &sctp_memory_allocated,
+ .sockets_allocated = &sctp_sockets_allocated,
+};
+#endif /* IS_ENABLED(CONFIG_IPV6) */
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
new file mode 100644
index 000000000..b9c8521c1
--- /dev/null
+++ b/net/sctp/ssnmap.c
@@ -0,0 +1,125 @@
+/* SCTP kernel implementation
+ * Copyright (c) 2003 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp SSN tracker.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Jon Grimm <jgrimm@us.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
+ __u16 out);
+
+/* Storage size needed for map includes 2 headers and then the
+ * specific needs of in or out streams.
+ */
+static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
+{
+ return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
+}
+
+
+/* Create a new sctp_ssnmap.
+ * Allocate room to store at least 'len' contiguous TSNs.
+ */
+struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
+ gfp_t gfp)
+{
+ struct sctp_ssnmap *retval;
+ int size;
+
+ size = sctp_ssnmap_size(in, out);
+ if (size <= KMALLOC_MAX_SIZE)
+ retval = kmalloc(size, gfp);
+ else
+ retval = (struct sctp_ssnmap *)
+ __get_free_pages(gfp, get_order(size));
+ if (!retval)
+ goto fail;
+
+ if (!sctp_ssnmap_init(retval, in, out))
+ goto fail_map;
+
+ SCTP_DBG_OBJCNT_INC(ssnmap);
+
+ return retval;
+
+fail_map:
+ if (size <= KMALLOC_MAX_SIZE)
+ kfree(retval);
+ else
+ free_pages((unsigned long)retval, get_order(size));
+fail:
+ return NULL;
+}
+
+
+/* Initialize a block of memory as a ssnmap. */
+static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
+ __u16 out)
+{
+ memset(map, 0x00, sctp_ssnmap_size(in, out));
+
+ /* Start 'in' stream just after the map header. */
+ map->in.ssn = (__u16 *)&map[1];
+ map->in.len = in;
+
+ /* Start 'out' stream just after 'in'. */
+ map->out.ssn = &map->in.ssn[in];
+ map->out.len = out;
+
+ return map;
+}
+
+/* Clear out the ssnmap streams. */
+void sctp_ssnmap_clear(struct sctp_ssnmap *map)
+{
+ size_t size;
+
+ size = (map->in.len + map->out.len) * sizeof(__u16);
+ memset(map->in.ssn, 0x00, size);
+}
+
+/* Dispose of a ssnmap. */
+void sctp_ssnmap_free(struct sctp_ssnmap *map)
+{
+ int size;
+
+ if (unlikely(!map))
+ return;
+
+ size = sctp_ssnmap_size(map->in.len, map->out.len);
+ if (size <= KMALLOC_MAX_SIZE)
+ kfree(map);
+ else
+ free_pages((unsigned long)map, get_order(size));
+
+ SCTP_DBG_OBJCNT_DEC(ssnmap);
+}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
new file mode 100644
index 000000000..26d50c565
--- /dev/null
+++ b/net/sctp/sysctl.c
@@ -0,0 +1,501 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2002, 2004
+ * Copyright (c) 2002 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Sysctl related interfaces for SCTP.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Mingqin Liu <liuming@us.ibm.com>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Ryan Layer <rmlayer@us.ibm.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <linux/sysctl.h>
+
+static int zero = 0;
+static int one = 1;
+static int timer_max = 86400000; /* ms in one day */
+static int int_max = INT_MAX;
+static int sack_timer_min = 1;
+static int sack_timer_max = 500;
+static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
+static int rwnd_scale_max = 16;
+static int rto_alpha_min = 0;
+static int rto_beta_min = 0;
+static int rto_alpha_max = 1000;
+static int rto_beta_max = 1000;
+
+static unsigned long max_autoclose_min = 0;
+static unsigned long max_autoclose_max =
+ (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX)
+ ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;
+
+static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
+static struct ctl_table sctp_table[] = {
+ {
+ .procname = "sctp_mem",
+ .data = &sysctl_sctp_mem,
+ .maxlen = sizeof(sysctl_sctp_mem),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax
+ },
+ {
+ .procname = "sctp_rmem",
+ .data = &sysctl_sctp_rmem,
+ .maxlen = sizeof(sysctl_sctp_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sctp_wmem",
+ .data = &sysctl_sctp_wmem,
+ .maxlen = sizeof(sysctl_sctp_wmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+
+ { /* sentinel */ }
+};
+
+static struct ctl_table sctp_net_table[] = {
+ {
+ .procname = "rto_initial",
+ .data = &init_net.sctp.rto_initial,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &timer_max
+ },
+ {
+ .procname = "rto_min",
+ .data = &init_net.sctp.rto_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_rto_min,
+ .extra1 = &one,
+ .extra2 = &init_net.sctp.rto_max
+ },
+ {
+ .procname = "rto_max",
+ .data = &init_net.sctp.rto_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_rto_max,
+ .extra1 = &init_net.sctp.rto_min,
+ .extra2 = &timer_max
+ },
+ {
+ .procname = "rto_alpha_exp_divisor",
+ .data = &init_net.sctp.rto_alpha,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_alpha_beta,
+ .extra1 = &rto_alpha_min,
+ .extra2 = &rto_alpha_max,
+ },
+ {
+ .procname = "rto_beta_exp_divisor",
+ .data = &init_net.sctp.rto_beta,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_alpha_beta,
+ .extra1 = &rto_beta_min,
+ .extra2 = &rto_beta_max,
+ },
+ {
+ .procname = "max_burst",
+ .data = &init_net.sctp.max_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max
+ },
+ {
+ .procname = "cookie_preserve_enable",
+ .data = &init_net.sctp.cookie_preserve_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cookie_hmac_alg",
+ .data = &init_net.sctp.sctp_hmac_alg,
+ .maxlen = 8,
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_hmac_alg,
+ },
+ {
+ .procname = "valid_cookie_life",
+ .data = &init_net.sctp.valid_cookie_life,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &timer_max
+ },
+ {
+ .procname = "sack_timeout",
+ .data = &init_net.sctp.sack_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &sack_timer_min,
+ .extra2 = &sack_timer_max,
+ },
+ {
+ .procname = "hb_interval",
+ .data = &init_net.sctp.hb_interval,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &timer_max
+ },
+ {
+ .procname = "association_max_retrans",
+ .data = &init_net.sctp.max_retrans_association,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &int_max
+ },
+ {
+ .procname = "path_max_retrans",
+ .data = &init_net.sctp.max_retrans_path,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &int_max
+ },
+ {
+ .procname = "max_init_retransmits",
+ .data = &init_net.sctp.max_retrans_init,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &int_max
+ },
+ {
+ .procname = "pf_retrans",
+ .data = &init_net.sctp.pf_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max
+ },
+ {
+ .procname = "sndbuf_policy",
+ .data = &init_net.sctp.sndbuf_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "rcvbuf_policy",
+ .data = &init_net.sctp.rcvbuf_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "default_auto_asconf",
+ .data = &init_net.sctp.default_auto_asconf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "addip_enable",
+ .data = &init_net.sctp.addip_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "addip_noauth_enable",
+ .data = &init_net.sctp.addip_noauth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "prsctp_enable",
+ .data = &init_net.sctp.prsctp_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "auth_enable",
+ .data = &init_net.sctp.auth_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_sctp_do_auth,
+ },
+ {
+ .procname = "addr_scope_policy",
+ .data = &init_net.sctp.scope_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &addr_scope_max,
+ },
+ {
+ .procname = "rwnd_update_shift",
+ .data = &init_net.sctp.rwnd_upd_shift,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &rwnd_scale_max,
+ },
+ {
+ .procname = "max_autoclose",
+ .data = &init_net.sctp.max_autoclose,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &max_autoclose_min,
+ .extra2 = &max_autoclose_max,
+ },
+
+ { /* sentinel */ }
+};
+
+static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct ctl_table tbl;
+ bool changed = false;
+ char *none = "none";
+ char tmp[8];
+ int ret;
+
+ memset(&tbl, 0, sizeof(struct ctl_table));
+
+ if (write) {
+ tbl.data = tmp;
+ tbl.maxlen = sizeof(tmp);
+ } else {
+ tbl.data = net->sctp.sctp_hmac_alg ? : none;
+ tbl.maxlen = strlen(tbl.data);
+ }
+
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+#ifdef CONFIG_CRYPTO_MD5
+ if (!strncmp(tmp, "md5", 3)) {
+ net->sctp.sctp_hmac_alg = "md5";
+ changed = true;
+ }
+#endif
+#ifdef CONFIG_CRYPTO_SHA1
+ if (!strncmp(tmp, "sha1", 4)) {
+ net->sctp.sctp_hmac_alg = "sha1";
+ changed = true;
+ }
+#endif
+ if (!strncmp(tmp, "none", 4)) {
+ net->sctp.sctp_hmac_alg = NULL;
+ changed = true;
+ }
+ if (!changed)
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ unsigned int min = *(unsigned int *) ctl->extra1;
+ unsigned int max = *(unsigned int *) ctl->extra2;
+ struct ctl_table tbl;
+ int ret, new_value;
+
+ memset(&tbl, 0, sizeof(struct ctl_table));
+ tbl.maxlen = sizeof(unsigned int);
+
+ if (write)
+ tbl.data = &new_value;
+ else
+ tbl.data = &net->sctp.rto_min;
+
+ ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ if (new_value > max || new_value < min)
+ return -EINVAL;
+
+ net->sctp.rto_min = new_value;
+ }
+
+ return ret;
+}
+
+static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ unsigned int min = *(unsigned int *) ctl->extra1;
+ unsigned int max = *(unsigned int *) ctl->extra2;
+ struct ctl_table tbl;
+ int ret, new_value;
+
+ memset(&tbl, 0, sizeof(struct ctl_table));
+ tbl.maxlen = sizeof(unsigned int);
+
+ if (write)
+ tbl.data = &new_value;
+ else
+ tbl.data = &net->sctp.rto_max;
+
+ ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ if (new_value > max || new_value < min)
+ return -EINVAL;
+
+ net->sctp.rto_max = new_value;
+ }
+
+ return ret;
+}
+
+static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ if (write)
+ pr_warn_once("Changing rto_alpha or rto_beta may lead to "
+ "suboptimal rtt/srtt estimations!\n");
+
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
+static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct ctl_table tbl;
+ int new_value, ret;
+
+ memset(&tbl, 0, sizeof(struct ctl_table));
+ tbl.maxlen = sizeof(unsigned int);
+
+ if (write)
+ tbl.data = &new_value;
+ else
+ tbl.data = &net->sctp.auth_enable;
+
+ ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ struct sock *sk = net->sctp.ctl_sock;
+
+ net->sctp.auth_enable = new_value;
+ /* Update the value in the control socket */
+ lock_sock(sk);
+ sctp_sk(sk)->ep->auth_enable = new_value;
+ release_sock(sk);
+ }
+
+ return ret;
+}
+
+int sctp_sysctl_net_register(struct net *net)
+{
+ struct ctl_table *table;
+ int i;
+
+ table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ for (i = 0; table[i].data; i++)
+ table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp;
+
+ net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table);
+ if (net->sctp.sysctl_header == NULL) {
+ kfree(table);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void sctp_sysctl_net_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->sctp.sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->sctp.sysctl_header);
+ kfree(table);
+}
+
+static struct ctl_table_header *sctp_sysctl_header;
+
+/* Sysctl registration. */
+void sctp_sysctl_register(void)
+{
+ sctp_sysctl_header = register_net_sysctl(&init_net, "net/sctp", sctp_table);
+}
+
+/* Sysctl deregistration. */
+void sctp_sysctl_unregister(void)
+{
+ unregister_net_sysctl_table(sctp_sysctl_header);
+}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
new file mode 100644
index 000000000..a0a431824
--- /dev/null
+++ b/net/sctp/transport.c
@@ -0,0 +1,656 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 International Business Machines Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This module provides the abstraction for an SCTP tranport representing
+ * a remote transport address. For local transport addresses, we just use
+ * union sctp_addr.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Xingang Guo <xingang.guo@intel.com>
+ * Hui Huang <hui.huang@nokia.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* 1st Level Abstractions. */
+
+/* Initialize a new transport from provided memory. */
+static struct sctp_transport *sctp_transport_init(struct net *net,
+ struct sctp_transport *peer,
+ const union sctp_addr *addr,
+ gfp_t gfp)
+{
+ /* Copy in the address. */
+ peer->ipaddr = *addr;
+ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
+ memset(&peer->saddr, 0, sizeof(union sctp_addr));
+
+ peer->sack_generation = 0;
+
+ /* From 6.3.1 RTO Calculation:
+ *
+ * C1) Until an RTT measurement has been made for a packet sent to the
+ * given destination transport address, set RTO to the protocol
+ * parameter 'RTO.Initial'.
+ */
+ peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
+
+ peer->last_time_heard = ktime_get();
+ peer->last_time_ecne_reduced = jiffies;
+
+ peer->param_flags = SPP_HB_DISABLE |
+ SPP_PMTUD_ENABLE |
+ SPP_SACKDELAY_ENABLE;
+
+ /* Initialize the default path max_retrans. */
+ peer->pathmaxrxt = net->sctp.max_retrans_path;
+ peer->pf_retrans = net->sctp.pf_retrans;
+
+ INIT_LIST_HEAD(&peer->transmitted);
+ INIT_LIST_HEAD(&peer->send_ready);
+ INIT_LIST_HEAD(&peer->transports);
+
+ setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
+ (unsigned long)peer);
+ setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
+ (unsigned long)peer);
+ setup_timer(&peer->proto_unreach_timer,
+ sctp_generate_proto_unreach_event, (unsigned long)peer);
+
+ /* Initialize the 64-bit random nonce sent with heartbeat. */
+ get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce));
+
+ atomic_set(&peer->refcnt, 1);
+
+ return peer;
+}
+
+/* Allocate and initialize a new transport. */
+struct sctp_transport *sctp_transport_new(struct net *net,
+ const union sctp_addr *addr,
+ gfp_t gfp)
+{
+ struct sctp_transport *transport;
+
+ transport = kzalloc(sizeof(*transport), gfp);
+ if (!transport)
+ goto fail;
+
+ if (!sctp_transport_init(net, transport, addr, gfp))
+ goto fail_init;
+
+ SCTP_DBG_OBJCNT_INC(transport);
+
+ return transport;
+
+fail_init:
+ kfree(transport);
+
+fail:
+ return NULL;
+}
+
+/* This transport is no longer needed. Free up if possible, or
+ * delay until it last reference count.
+ */
+void sctp_transport_free(struct sctp_transport *transport)
+{
+ transport->dead = 1;
+
+ /* Try to delete the heartbeat timer. */
+ if (del_timer(&transport->hb_timer))
+ sctp_transport_put(transport);
+
+ /* Delete the T3_rtx timer if it's active.
+ * There is no point in not doing this now and letting
+ * structure hang around in memory since we know
+ * the tranport is going away.
+ */
+ if (del_timer(&transport->T3_rtx_timer))
+ sctp_transport_put(transport);
+
+ /* Delete the ICMP proto unreachable timer if it's active. */
+ if (del_timer(&transport->proto_unreach_timer))
+ sctp_association_put(transport->asoc);
+
+ sctp_transport_put(transport);
+}
+
+static void sctp_transport_destroy_rcu(struct rcu_head *head)
+{
+ struct sctp_transport *transport;
+
+ transport = container_of(head, struct sctp_transport, rcu);
+
+ dst_release(transport->dst);
+ kfree(transport);
+ SCTP_DBG_OBJCNT_DEC(transport);
+}
+
+/* Destroy the transport data structure.
+ * Assumes there are no more users of this structure.
+ */
+static void sctp_transport_destroy(struct sctp_transport *transport)
+{
+ if (unlikely(!transport->dead)) {
+ WARN(1, "Attempt to destroy undead transport %p!\n", transport);
+ return;
+ }
+
+ sctp_packet_free(&transport->packet);
+
+ if (transport->asoc)
+ sctp_association_put(transport->asoc);
+
+ call_rcu(&transport->rcu, sctp_transport_destroy_rcu);
+}
+
+/* Start T3_rtx timer if it is not already running and update the heartbeat
+ * timer. This routine is called every time a DATA chunk is sent.
+ */
+void sctp_transport_reset_timers(struct sctp_transport *transport)
+{
+ /* RFC 2960 6.3.2 Retransmission Timer Rules
+ *
+ * R1) Every time a DATA chunk is sent to any address(including a
+ * retransmission), if the T3-rtx timer of that address is not running
+ * start it running so that it will expire after the RTO of that
+ * address.
+ */
+
+ if (!timer_pending(&transport->T3_rtx_timer))
+ if (!mod_timer(&transport->T3_rtx_timer,
+ jiffies + transport->rto))
+ sctp_transport_hold(transport);
+
+ /* When a data chunk is sent, reset the heartbeat interval. */
+ if (!mod_timer(&transport->hb_timer,
+ sctp_transport_timeout(transport)))
+ sctp_transport_hold(transport);
+}
+
+/* This transport has been assigned to an association.
+ * Initialize fields from the association or from the sock itself.
+ * Register the reference count in the association.
+ */
+void sctp_transport_set_owner(struct sctp_transport *transport,
+ struct sctp_association *asoc)
+{
+ transport->asoc = asoc;
+ sctp_association_hold(asoc);
+}
+
+/* Initialize the pmtu of a transport. */
+void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
+{
+ /* If we don't have a fresh route, look one up */
+ if (!transport->dst || transport->dst->obsolete) {
+ dst_release(transport->dst);
+ transport->af_specific->get_dst(transport, &transport->saddr,
+ &transport->fl, sk);
+ }
+
+ if (transport->dst) {
+ transport->pathmtu = dst_mtu(transport->dst);
+ } else
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+}
+
+void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu)
+{
+ struct dst_entry *dst;
+
+ if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
+ pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
+ __func__, pmtu,
+ SCTP_DEFAULT_MINSEGMENT);
+ /* Use default minimum segment size and disable
+ * pmtu discovery on this transport.
+ */
+ t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
+ } else {
+ t->pathmtu = pmtu;
+ }
+
+ dst = sctp_transport_dst_check(t);
+ if (!dst)
+ t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
+
+ if (dst) {
+ dst->ops->update_pmtu(dst, sk, NULL, pmtu);
+
+ dst = sctp_transport_dst_check(t);
+ if (!dst)
+ t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
+ }
+}
+
+/* Caches the dst entry and source address for a transport's destination
+ * address.
+ */
+void sctp_transport_route(struct sctp_transport *transport,
+ union sctp_addr *saddr, struct sctp_sock *opt)
+{
+ struct sctp_association *asoc = transport->asoc;
+ struct sctp_af *af = transport->af_specific;
+
+ af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
+
+ if (saddr)
+ memcpy(&transport->saddr, saddr, sizeof(union sctp_addr));
+ else
+ af->get_saddr(opt, transport, &transport->fl);
+
+ if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
+ return;
+ }
+ if (transport->dst) {
+ transport->pathmtu = dst_mtu(transport->dst);
+
+ /* Initialize sk->sk_rcv_saddr, if the transport is the
+ * association's active path for getsockname().
+ */
+ if (asoc && (!asoc->peer.primary_path ||
+ (transport == asoc->peer.active_path)))
+ opt->pf->to_sk_saddr(&transport->saddr,
+ asoc->base.sk);
+ } else
+ transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+}
+
+/* Hold a reference to a transport. */
+void sctp_transport_hold(struct sctp_transport *transport)
+{
+ atomic_inc(&transport->refcnt);
+}
+
+/* Release a reference to a transport and clean up
+ * if there are no more references.
+ */
+void sctp_transport_put(struct sctp_transport *transport)
+{
+ if (atomic_dec_and_test(&transport->refcnt))
+ sctp_transport_destroy(transport);
+}
+
+/* Update transport's RTO based on the newly calculated RTT. */
+void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
+{
+ if (unlikely(!tp->rto_pending))
+ /* We should not be doing any RTO updates unless rto_pending is set. */
+ pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp);
+
+ if (tp->rttvar || tp->srtt) {
+ struct net *net = sock_net(tp->asoc->base.sk);
+ /* 6.3.1 C3) When a new RTT measurement R' is made, set
+ * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'|
+ * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R'
+ */
+
+ /* Note: The above algorithm has been rewritten to
+ * express rto_beta and rto_alpha as inverse powers
+ * of two.
+ * For example, assuming the default value of RTO.Alpha of
+ * 1/8, rto_alpha would be expressed as 3.
+ */
+ tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta)
+ + (((__u32)abs64((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
+ tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha)
+ + (rtt >> net->sctp.rto_alpha);
+ } else {
+ /* 6.3.1 C2) When the first RTT measurement R is made, set
+ * SRTT <- R, RTTVAR <- R/2.
+ */
+ tp->srtt = rtt;
+ tp->rttvar = rtt >> 1;
+ }
+
+ /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then
+ * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY.
+ */
+ if (tp->rttvar == 0)
+ tp->rttvar = SCTP_CLOCK_GRANULARITY;
+
+ /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */
+ tp->rto = tp->srtt + (tp->rttvar << 2);
+
+ /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min
+ * seconds then it is rounded up to RTO.Min seconds.
+ */
+ if (tp->rto < tp->asoc->rto_min)
+ tp->rto = tp->asoc->rto_min;
+
+ /* 6.3.1 C7) A maximum value may be placed on RTO provided it is
+ * at least RTO.max seconds.
+ */
+ if (tp->rto > tp->asoc->rto_max)
+ tp->rto = tp->asoc->rto_max;
+
+ sctp_max_rto(tp->asoc, tp);
+ tp->rtt = rtt;
+
+ /* Reset rto_pending so that a new RTT measurement is started when a
+ * new data chunk is sent.
+ */
+ tp->rto_pending = 0;
+
+ pr_debug("%s: transport:%p, rtt:%d, srtt:%d rttvar:%d, rto:%ld\n",
+ __func__, tp, rtt, tp->srtt, tp->rttvar, tp->rto);
+}
+
+/* This routine updates the transport's cwnd and partial_bytes_acked
+ * parameters based on the bytes acked in the received SACK.
+ */
+void sctp_transport_raise_cwnd(struct sctp_transport *transport,
+ __u32 sack_ctsn, __u32 bytes_acked)
+{
+ struct sctp_association *asoc = transport->asoc;
+ __u32 cwnd, ssthresh, flight_size, pba, pmtu;
+
+ cwnd = transport->cwnd;
+ flight_size = transport->flight_size;
+
+ /* See if we need to exit Fast Recovery first */
+ if (asoc->fast_recovery &&
+ TSN_lte(asoc->fast_recovery_exit, sack_ctsn))
+ asoc->fast_recovery = 0;
+
+ /* The appropriate cwnd increase algorithm is performed if, and only
+ * if the cumulative TSN whould advanced and the congestion window is
+ * being fully utilized.
+ */
+ if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) ||
+ (flight_size < cwnd))
+ return;
+
+ ssthresh = transport->ssthresh;
+ pba = transport->partial_bytes_acked;
+ pmtu = transport->asoc->pathmtu;
+
+ if (cwnd <= ssthresh) {
+ /* RFC 4960 7.2.1
+ * o When cwnd is less than or equal to ssthresh, an SCTP
+ * endpoint MUST use the slow-start algorithm to increase
+ * cwnd only if the current congestion window is being fully
+ * utilized, an incoming SACK advances the Cumulative TSN
+ * Ack Point, and the data sender is not in Fast Recovery.
+ * Only when these three conditions are met can the cwnd be
+ * increased; otherwise, the cwnd MUST not be increased.
+ * If these conditions are met, then cwnd MUST be increased
+ * by, at most, the lesser of 1) the total size of the
+ * previously outstanding DATA chunk(s) acknowledged, and
+ * 2) the destination's path MTU. This upper bound protects
+ * against the ACK-Splitting attack outlined in [SAVAGE99].
+ */
+ if (asoc->fast_recovery)
+ return;
+
+ if (bytes_acked > pmtu)
+ cwnd += pmtu;
+ else
+ cwnd += bytes_acked;
+
+ pr_debug("%s: slow start: transport:%p, bytes_acked:%d, "
+ "cwnd:%d, ssthresh:%d, flight_size:%d, pba:%d\n",
+ __func__, transport, bytes_acked, cwnd, ssthresh,
+ flight_size, pba);
+ } else {
+ /* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh,
+ * upon each SACK arrival that advances the Cumulative TSN Ack
+ * Point, increase partial_bytes_acked by the total number of
+ * bytes of all new chunks acknowledged in that SACK including
+ * chunks acknowledged by the new Cumulative TSN Ack and by
+ * Gap Ack Blocks.
+ *
+ * When partial_bytes_acked is equal to or greater than cwnd
+ * and before the arrival of the SACK the sender had cwnd or
+ * more bytes of data outstanding (i.e., before arrival of the
+ * SACK, flightsize was greater than or equal to cwnd),
+ * increase cwnd by MTU, and reset partial_bytes_acked to
+ * (partial_bytes_acked - cwnd).
+ */
+ pba += bytes_acked;
+ if (pba >= cwnd) {
+ cwnd += pmtu;
+ pba = ((cwnd < pba) ? (pba - cwnd) : 0);
+ }
+
+ pr_debug("%s: congestion avoidance: transport:%p, "
+ "bytes_acked:%d, cwnd:%d, ssthresh:%d, "
+ "flight_size:%d, pba:%d\n", __func__,
+ transport, bytes_acked, cwnd, ssthresh,
+ flight_size, pba);
+ }
+
+ transport->cwnd = cwnd;
+ transport->partial_bytes_acked = pba;
+}
+
+/* This routine is used to lower the transport's cwnd when congestion is
+ * detected.
+ */
+void sctp_transport_lower_cwnd(struct sctp_transport *transport,
+ sctp_lower_cwnd_t reason)
+{
+ struct sctp_association *asoc = transport->asoc;
+
+ switch (reason) {
+ case SCTP_LOWER_CWND_T3_RTX:
+ /* RFC 2960 Section 7.2.3, sctpimpguide
+ * When the T3-rtx timer expires on an address, SCTP should
+ * perform slow start by:
+ * ssthresh = max(cwnd/2, 4*MTU)
+ * cwnd = 1*MTU
+ * partial_bytes_acked = 0
+ */
+ transport->ssthresh = max(transport->cwnd/2,
+ 4*asoc->pathmtu);
+ transport->cwnd = asoc->pathmtu;
+
+ /* T3-rtx also clears fast recovery */
+ asoc->fast_recovery = 0;
+ break;
+
+ case SCTP_LOWER_CWND_FAST_RTX:
+ /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the
+ * destination address(es) to which the missing DATA chunks
+ * were last sent, according to the formula described in
+ * Section 7.2.3.
+ *
+ * RFC 2960 7.2.3, sctpimpguide Upon detection of packet
+ * losses from SACK (see Section 7.2.4), An endpoint
+ * should do the following:
+ * ssthresh = max(cwnd/2, 4*MTU)
+ * cwnd = ssthresh
+ * partial_bytes_acked = 0
+ */
+ if (asoc->fast_recovery)
+ return;
+
+ /* Mark Fast recovery */
+ asoc->fast_recovery = 1;
+ asoc->fast_recovery_exit = asoc->next_tsn - 1;
+
+ transport->ssthresh = max(transport->cwnd/2,
+ 4*asoc->pathmtu);
+ transport->cwnd = transport->ssthresh;
+ break;
+
+ case SCTP_LOWER_CWND_ECNE:
+ /* RFC 2481 Section 6.1.2.
+ * If the sender receives an ECN-Echo ACK packet
+ * then the sender knows that congestion was encountered in the
+ * network on the path from the sender to the receiver. The
+ * indication of congestion should be treated just as a
+ * congestion loss in non-ECN Capable TCP. That is, the TCP
+ * source halves the congestion window "cwnd" and reduces the
+ * slow start threshold "ssthresh".
+ * A critical condition is that TCP does not react to
+ * congestion indications more than once every window of
+ * data (or more loosely more than once every round-trip time).
+ */
+ if (time_after(jiffies, transport->last_time_ecne_reduced +
+ transport->rtt)) {
+ transport->ssthresh = max(transport->cwnd/2,
+ 4*asoc->pathmtu);
+ transport->cwnd = transport->ssthresh;
+ transport->last_time_ecne_reduced = jiffies;
+ }
+ break;
+
+ case SCTP_LOWER_CWND_INACTIVE:
+ /* RFC 2960 Section 7.2.1, sctpimpguide
+ * When the endpoint does not transmit data on a given
+ * transport address, the cwnd of the transport address
+ * should be adjusted to max(cwnd/2, 4*MTU) per RTO.
+ * NOTE: Although the draft recommends that this check needs
+ * to be done every RTO interval, we do it every hearbeat
+ * interval.
+ */
+ transport->cwnd = max(transport->cwnd/2,
+ 4*asoc->pathmtu);
+ break;
+ }
+
+ transport->partial_bytes_acked = 0;
+
+ pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d\n",
+ __func__, transport, reason, transport->cwnd,
+ transport->ssthresh);
+}
+
+/* Apply Max.Burst limit to the congestion window:
+ * sctpimpguide-05 2.14.2
+ * D) When the time comes for the sender to
+ * transmit new DATA chunks, the protocol parameter Max.Burst MUST
+ * first be applied to limit how many new DATA chunks may be sent.
+ * The limit is applied by adjusting cwnd as follows:
+ * if ((flightsize+ Max.Burst * MTU) < cwnd)
+ * cwnd = flightsize + Max.Burst * MTU
+ */
+
+void sctp_transport_burst_limited(struct sctp_transport *t)
+{
+ struct sctp_association *asoc = t->asoc;
+ u32 old_cwnd = t->cwnd;
+ u32 max_burst_bytes;
+
+ if (t->burst_limited || asoc->max_burst == 0)
+ return;
+
+ max_burst_bytes = t->flight_size + (asoc->max_burst * asoc->pathmtu);
+ if (max_burst_bytes < old_cwnd) {
+ t->cwnd = max_burst_bytes;
+ t->burst_limited = old_cwnd;
+ }
+}
+
+/* Restore the old cwnd congestion window, after the burst had it's
+ * desired effect.
+ */
+void sctp_transport_burst_reset(struct sctp_transport *t)
+{
+ if (t->burst_limited) {
+ t->cwnd = t->burst_limited;
+ t->burst_limited = 0;
+ }
+}
+
+/* What is the next timeout value for this transport? */
+unsigned long sctp_transport_timeout(struct sctp_transport *trans)
+{
+ /* RTO + timer slack +/- 50% of RTO */
+ unsigned long timeout = (trans->rto >> 1) + prandom_u32_max(trans->rto);
+
+ if (trans->state != SCTP_UNCONFIRMED &&
+ trans->state != SCTP_PF)
+ timeout += trans->hbinterval;
+
+ return timeout + jiffies;
+}
+
+/* Reset transport variables to their initial values */
+void sctp_transport_reset(struct sctp_transport *t)
+{
+ struct sctp_association *asoc = t->asoc;
+
+ /* RFC 2960 (bis), Section 5.2.4
+ * All the congestion control parameters (e.g., cwnd, ssthresh)
+ * related to this peer MUST be reset to their initial values
+ * (see Section 6.2.1)
+ */
+ t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
+ t->burst_limited = 0;
+ t->ssthresh = asoc->peer.i.a_rwnd;
+ t->rto = asoc->rto_initial;
+ sctp_max_rto(asoc, t);
+ t->rtt = 0;
+ t->srtt = 0;
+ t->rttvar = 0;
+
+ /* Reset these additional varibles so that we have a clean
+ * slate.
+ */
+ t->partial_bytes_acked = 0;
+ t->flight_size = 0;
+ t->error_count = 0;
+ t->rto_pending = 0;
+ t->hb_sent = 0;
+
+ /* Initialize the state information for SFR-CACC */
+ t->cacc.changeover_active = 0;
+ t->cacc.cycling_changeover = 0;
+ t->cacc.next_tsn_at_change = 0;
+ t->cacc.cacc_saw_newack = 0;
+}
+
+/* Schedule retransmission on the given transport */
+void sctp_transport_immediate_rtx(struct sctp_transport *t)
+{
+ /* Stop pending T3_rtx_timer */
+ if (del_timer(&t->T3_rtx_timer))
+ sctp_transport_put(t);
+
+ sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX);
+ if (!timer_pending(&t->T3_rtx_timer)) {
+ if (!mod_timer(&t->T3_rtx_timer, jiffies + t->rto))
+ sctp_transport_hold(t);
+ }
+}
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
new file mode 100644
index 000000000..7635f9f23
--- /dev/null
+++ b/net/sctp/tsnmap.c
@@ -0,0 +1,379 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp tsn mapping array.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * Karl Knutson <karl@athena.chicago.il.us>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/bitmap.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static void sctp_tsnmap_update(struct sctp_tsnmap *map);
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+ __u16 len, __u16 *start, __u16 *end);
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size);
+
+/* Initialize a block of memory as a tsnmap. */
+struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len,
+ __u32 initial_tsn, gfp_t gfp)
+{
+ if (!map->tsn_map) {
+ map->tsn_map = kzalloc(len>>3, gfp);
+ if (map->tsn_map == NULL)
+ return NULL;
+
+ map->len = len;
+ } else {
+ bitmap_zero(map->tsn_map, map->len);
+ }
+
+ /* Keep track of TSNs represented by tsn_map. */
+ map->base_tsn = initial_tsn;
+ map->cumulative_tsn_ack_point = initial_tsn - 1;
+ map->max_tsn_seen = map->cumulative_tsn_ack_point;
+ map->num_dup_tsns = 0;
+
+ return map;
+}
+
+void sctp_tsnmap_free(struct sctp_tsnmap *map)
+{
+ map->len = 0;
+ kfree(map->tsn_map);
+}
+
+/* Test the tracking state of this TSN.
+ * Returns:
+ * 0 if the TSN has not yet been seen
+ * >0 if the TSN has been seen (duplicate)
+ * <0 if the TSN is invalid (too large to track)
+ */
+int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn)
+{
+ u32 gap;
+
+ /* Check to see if this is an old TSN */
+ if (TSN_lte(tsn, map->cumulative_tsn_ack_point))
+ return 1;
+
+ /* Verify that we can hold this TSN and that it will not
+ * overlfow our map
+ */
+ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
+ return -1;
+
+ /* Calculate the index into the mapping arrays. */
+ gap = tsn - map->base_tsn;
+
+ /* Check to see if TSN has already been recorded. */
+ if (gap < map->len && test_bit(gap, map->tsn_map))
+ return 1;
+ else
+ return 0;
+}
+
+
+/* Mark this TSN as seen. */
+int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn,
+ struct sctp_transport *trans)
+{
+ u16 gap;
+
+ if (TSN_lt(tsn, map->base_tsn))
+ return 0;
+
+ gap = tsn - map->base_tsn;
+
+ if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1))
+ return -ENOMEM;
+
+ if (!sctp_tsnmap_has_gap(map) && gap == 0) {
+ /* In this case the map has no gaps and the tsn we are
+ * recording is the next expected tsn. We don't touch
+ * the map but simply bump the values.
+ */
+ map->max_tsn_seen++;
+ map->cumulative_tsn_ack_point++;
+ if (trans)
+ trans->sack_generation =
+ trans->asoc->peer.sack_generation;
+ map->base_tsn++;
+ } else {
+ /* Either we already have a gap, or about to record a gap, so
+ * have work to do.
+ *
+ * Bump the max.
+ */
+ if (TSN_lt(map->max_tsn_seen, tsn))
+ map->max_tsn_seen = tsn;
+
+ /* Mark the TSN as received. */
+ set_bit(gap, map->tsn_map);
+
+ /* Go fixup any internal TSN mapping variables including
+ * cumulative_tsn_ack_point.
+ */
+ sctp_tsnmap_update(map);
+ }
+
+ return 0;
+}
+
+
+/* Initialize a Gap Ack Block iterator from memory being provided. */
+static void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map,
+ struct sctp_tsnmap_iter *iter)
+{
+ /* Only start looking one past the Cumulative TSN Ack Point. */
+ iter->start = map->cumulative_tsn_ack_point + 1;
+}
+
+/* Get the next Gap Ack Blocks. Returns 0 if there was not another block
+ * to get.
+ */
+static int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map,
+ struct sctp_tsnmap_iter *iter,
+ __u16 *start, __u16 *end)
+{
+ int ended = 0;
+ __u16 start_ = 0, end_ = 0, offset;
+
+ /* If there are no more gap acks possible, get out fast. */
+ if (TSN_lte(map->max_tsn_seen, iter->start))
+ return 0;
+
+ offset = iter->start - map->base_tsn;
+ sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len,
+ &start_, &end_);
+
+ /* The Gap Ack Block happens to end at the end of the map. */
+ if (start_ && !end_)
+ end_ = map->len - 1;
+
+ /* If we found a Gap Ack Block, return the start and end and
+ * bump the iterator forward.
+ */
+ if (end_) {
+ /* Fix up the start and end based on the
+ * Cumulative TSN Ack which is always 1 behind base.
+ */
+ *start = start_ + 1;
+ *end = end_ + 1;
+
+ /* Move the iterator forward. */
+ iter->start = map->cumulative_tsn_ack_point + *end + 1;
+ ended = 1;
+ }
+
+ return ended;
+}
+
+/* Mark this and any lower TSN as seen. */
+void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn)
+{
+ u32 gap;
+
+ if (TSN_lt(tsn, map->base_tsn))
+ return;
+ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
+ return;
+
+ /* Bump the max. */
+ if (TSN_lt(map->max_tsn_seen, tsn))
+ map->max_tsn_seen = tsn;
+
+ gap = tsn - map->base_tsn + 1;
+
+ map->base_tsn += gap;
+ map->cumulative_tsn_ack_point += gap;
+ if (gap >= map->len) {
+ /* If our gap is larger then the map size, just
+ * zero out the map.
+ */
+ bitmap_zero(map->tsn_map, map->len);
+ } else {
+ /* If the gap is smaller than the map size,
+ * shift the map by 'gap' bits and update further.
+ */
+ bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len);
+ sctp_tsnmap_update(map);
+ }
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* This private helper function updates the tsnmap buffers and
+ * the Cumulative TSN Ack Point.
+ */
+static void sctp_tsnmap_update(struct sctp_tsnmap *map)
+{
+ u16 len;
+ unsigned long zero_bit;
+
+
+ len = map->max_tsn_seen - map->cumulative_tsn_ack_point;
+ zero_bit = find_first_zero_bit(map->tsn_map, len);
+ if (!zero_bit)
+ return; /* The first 0-bit is bit 0. nothing to do */
+
+ map->base_tsn += zero_bit;
+ map->cumulative_tsn_ack_point += zero_bit;
+
+ bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len);
+}
+
+/* How many data chunks are we missing from our peer?
+ */
+__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map)
+{
+ __u32 cum_tsn = map->cumulative_tsn_ack_point;
+ __u32 max_tsn = map->max_tsn_seen;
+ __u32 base_tsn = map->base_tsn;
+ __u16 pending_data;
+ u32 gap;
+
+ pending_data = max_tsn - cum_tsn;
+ gap = max_tsn - base_tsn;
+
+ if (gap == 0 || gap >= map->len)
+ goto out;
+
+ pending_data -= bitmap_weight(map->tsn_map, gap + 1);
+out:
+ return pending_data;
+}
+
+/* This is a private helper for finding Gap Ack Blocks. It searches a
+ * single array for the start and end of a Gap Ack Block.
+ *
+ * The flags "started" and "ended" tell is if we found the beginning
+ * or (respectively) the end of a Gap Ack Block.
+ */
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+ __u16 len, __u16 *start, __u16 *end)
+{
+ int i = off;
+
+ /* Look through the entire array, but break out
+ * early if we have found the end of the Gap Ack Block.
+ */
+
+ /* Also, stop looking past the maximum TSN seen. */
+
+ /* Look for the start. */
+ i = find_next_bit(map, len, off);
+ if (i < len)
+ *start = i;
+
+ /* Look for the end. */
+ if (*start) {
+ /* We have found the start, let's find the
+ * end. If we find the end, break out.
+ */
+ i = find_next_zero_bit(map, len, i);
+ if (i < len)
+ *end = i - 1;
+ }
+}
+
+/* Renege that we have seen a TSN. */
+void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn)
+{
+ u32 gap;
+
+ if (TSN_lt(tsn, map->base_tsn))
+ return;
+ /* Assert: TSN is in range. */
+ if (!TSN_lt(tsn, map->base_tsn + map->len))
+ return;
+
+ gap = tsn - map->base_tsn;
+
+ /* Pretend we never saw the TSN. */
+ clear_bit(gap, map->tsn_map);
+}
+
+/* How many gap ack blocks do we have recorded? */
+__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
+ struct sctp_gap_ack_block *gabs)
+{
+ struct sctp_tsnmap_iter iter;
+ int ngaps = 0;
+
+ /* Refresh the gap ack information. */
+ if (sctp_tsnmap_has_gap(map)) {
+ __u16 start = 0, end = 0;
+ sctp_tsnmap_iter_init(map, &iter);
+ while (sctp_tsnmap_next_gap_ack(map, &iter,
+ &start,
+ &end)) {
+
+ gabs[ngaps].start = htons(start);
+ gabs[ngaps].end = htons(end);
+ ngaps++;
+ if (ngaps >= SCTP_MAX_GABS)
+ break;
+ }
+ }
+ return ngaps;
+}
+
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size)
+{
+ unsigned long *new;
+ unsigned long inc;
+ u16 len;
+
+ if (size > SCTP_TSN_MAP_SIZE)
+ return 0;
+
+ inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT;
+ len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE);
+
+ new = kzalloc(len>>3, GFP_ATOMIC);
+ if (!new)
+ return 0;
+
+ bitmap_copy(new, map->tsn_map,
+ map->max_tsn_seen - map->cumulative_tsn_ack_point);
+ kfree(map->tsn_map);
+ map->tsn_map = new;
+ map->len = len;
+
+ return 1;
+}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
new file mode 100644
index 000000000..d1e38308f
--- /dev/null
+++ b/net/sctp/ulpevent.c
@@ -0,0 +1,1065 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * These functions manipulate an sctp event. The struct ulpevent is used
+ * to carry notifications and data to the ULP (sockets).
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Ardelle Fan <ardelle.fan@intel.com>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
+ struct sctp_association *asoc);
+static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
+
+
+/* Initialize an ULP event from an given skb. */
+static void sctp_ulpevent_init(struct sctp_ulpevent *event,
+ int msg_flags,
+ unsigned int len)
+{
+ memset(event, 0, sizeof(struct sctp_ulpevent));
+ event->msg_flags = msg_flags;
+ event->rmem_len = len;
+}
+
+/* Create a new sctp_ulpevent. */
+static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
+ gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(size, gfp);
+ if (!skb)
+ goto fail;
+
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_init(event, msg_flags, skb->truesize);
+
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* Is this a MSG_NOTIFICATION? */
+int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event)
+{
+ return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION);
+}
+
+/* Hold the association in case the msg_name needs read out of
+ * the association.
+ */
+static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
+ const struct sctp_association *asoc)
+{
+ struct sk_buff *skb;
+
+ /* Cast away the const, as we are just wanting to
+ * bump the reference count.
+ */
+ sctp_association_hold((struct sctp_association *)asoc);
+ skb = sctp_event2skb(event);
+ event->asoc = (struct sctp_association *)asoc;
+ atomic_add(event->rmem_len, &event->asoc->rmem_alloc);
+ sctp_skb_set_owner_r(skb, asoc->base.sk);
+}
+
+/* A simple destructor to give up the reference to the association. */
+static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
+{
+ struct sctp_association *asoc = event->asoc;
+
+ atomic_sub(event->rmem_len, &asoc->rmem_alloc);
+ sctp_association_put(asoc);
+}
+
+/* Create and initialize an SCTP_ASSOC_CHANGE event.
+ *
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * Communication notifications inform the ULP that an SCTP association
+ * has either begun or ended. The identifier for a new association is
+ * provided by this notification.
+ *
+ * Note: There is no field checking here. If a field is unused it will be
+ * zero'd out.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
+ const struct sctp_association *asoc,
+ __u16 flags, __u16 state, __u16 error, __u16 outbound,
+ __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_assoc_change *sac;
+ struct sk_buff *skb;
+
+ /* If the lower layer passed in the chunk, it will be
+ * an ABORT, so we need to include it in the sac_info.
+ */
+ if (chunk) {
+ /* Copy the chunk data to a new skb and reserve enough
+ * head room to use as notification.
+ */
+ skb = skb_copy_expand(chunk->skb,
+ sizeof(struct sctp_assoc_change), 0, gfp);
+
+ if (!skb)
+ goto fail;
+
+ /* Embed the event fields inside the cloned skb. */
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+ /* Include the notification structure */
+ sac = (struct sctp_assoc_change *)
+ skb_push(skb, sizeof(struct sctp_assoc_change));
+
+ /* Trim the buffer to the right length. */
+ skb_trim(skb, sizeof(struct sctp_assoc_change) +
+ ntohs(chunk->chunk_hdr->length) -
+ sizeof(sctp_chunkhdr_t));
+ } else {
+ event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ sac = (struct sctp_assoc_change *) skb_put(skb,
+ sizeof(struct sctp_assoc_change));
+ }
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_type:
+ * It should be SCTP_ASSOC_CHANGE.
+ */
+ sac->sac_type = SCTP_ASSOC_CHANGE;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_state: 32 bits (signed integer)
+ * This field holds one of a number of values that communicate the
+ * event that happened to the association.
+ */
+ sac->sac_state = state;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_flags: 16 bits (unsigned integer)
+ * Currently unused.
+ */
+ sac->sac_flags = 0;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_length: sizeof (__u32)
+ * This field is the total length of the notification data, including
+ * the notification header.
+ */
+ sac->sac_length = skb->len;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_error: 32 bits (signed integer)
+ *
+ * If the state was reached due to a error condition (e.g.
+ * COMMUNICATION_LOST) any relevant error information is available in
+ * this field. This corresponds to the protocol error codes defined in
+ * [SCTP].
+ */
+ sac->sac_error = error;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_outbound_streams: 16 bits (unsigned integer)
+ * sac_inbound_streams: 16 bits (unsigned integer)
+ *
+ * The maximum number of streams allowed in each direction are
+ * available in sac_outbound_streams and sac_inbound streams.
+ */
+ sac->sac_outbound_streams = outbound;
+ sac->sac_inbound_streams = inbound;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * sac_assoc_id: sizeof (sctp_assoc_t)
+ *
+ * The association id field, holds the identifier for the association.
+ * All notifications for a given association have the same association
+ * identifier. For TCP style socket, this field is ignored.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ sac->sac_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* Create and initialize an SCTP_PEER_ADDR_CHANGE event.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * When a destination address on a multi-homed peer encounters a change
+ * an interface details event is sent.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
+ const struct sctp_association *asoc,
+ const struct sockaddr_storage *aaddr,
+ int flags, int state, int error, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_paddr_change *spc;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ spc = (struct sctp_paddr_change *)
+ skb_put(skb, sizeof(struct sctp_paddr_change));
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_type:
+ *
+ * It should be SCTP_PEER_ADDR_CHANGE.
+ */
+ spc->spc_type = SCTP_PEER_ADDR_CHANGE;
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_length: sizeof (__u32)
+ *
+ * This field is the total length of the notification data, including
+ * the notification header.
+ */
+ spc->spc_length = sizeof(struct sctp_paddr_change);
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_flags: 16 bits (unsigned integer)
+ * Currently unused.
+ */
+ spc->spc_flags = 0;
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_state: 32 bits (signed integer)
+ *
+ * This field holds one of a number of values that communicate the
+ * event that happened to the address.
+ */
+ spc->spc_state = state;
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_error: 32 bits (signed integer)
+ *
+ * If the state was reached due to any error condition (e.g.
+ * ADDRESS_UNREACHABLE) any relevant error information is available in
+ * this field.
+ */
+ spc->spc_error = error;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * spc_assoc_id: sizeof (sctp_assoc_t)
+ *
+ * The association id field, holds the identifier for the association.
+ * All notifications for a given association have the same association
+ * identifier. For TCP style socket, this field is ignored.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ spc->spc_assoc_id = sctp_assoc2id(asoc);
+
+ /* Sockets API Extensions for SCTP
+ * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * spc_aaddr: sizeof (struct sockaddr_storage)
+ *
+ * The affected address field, holds the remote peer's address that is
+ * encountering the change of state.
+ */
+ memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage));
+
+ /* Map ipv4 address into v4-mapped-on-v6 address. */
+ sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_to_user(
+ sctp_sk(asoc->base.sk),
+ (union sctp_addr *)&spc->spc_aaddr);
+
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* Create and initialize an SCTP_REMOTE_ERROR notification.
+ *
+ * Note: This assumes that the chunk->skb->data already points to the
+ * operation error payload.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.3 SCTP_REMOTE_ERROR
+ *
+ * A remote peer may send an Operational Error message to its peer.
+ * This message indicates a variety of error conditions on an
+ * association. The entire error TLV as it appears on the wire is
+ * included in a SCTP_REMOTE_ERROR event. Please refer to the SCTP
+ * specification [SCTP] and any extensions for a list of possible
+ * error formats.
+ */
+struct sctp_ulpevent *
+sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
+ struct sctp_chunk *chunk, __u16 flags,
+ gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_remote_error *sre;
+ struct sk_buff *skb;
+ sctp_errhdr_t *ch;
+ __be16 cause;
+ int elen;
+
+ ch = (sctp_errhdr_t *)(chunk->skb->data);
+ cause = ch->cause;
+ elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
+
+ /* Pull off the ERROR header. */
+ skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
+
+ /* Copy the skb to a new skb with room for us to prepend
+ * notification with.
+ */
+ skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp);
+
+ /* Pull off the rest of the cause TLV from the chunk. */
+ skb_pull(chunk->skb, elen);
+ if (!skb)
+ goto fail;
+
+ /* Embed the event fields inside the cloned skb. */
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+ sre = (struct sctp_remote_error *) skb_push(skb, sizeof(*sre));
+
+ /* Trim the buffer to the right length. */
+ skb_trim(skb, sizeof(*sre) + elen);
+
+ /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */
+ memset(sre, 0, sizeof(*sre));
+ sre->sre_type = SCTP_REMOTE_ERROR;
+ sre->sre_flags = 0;
+ sre->sre_length = skb->len;
+ sre->sre_error = cause;
+ sctp_ulpevent_set_owner(event, asoc);
+ sre->sre_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+fail:
+ return NULL;
+}
+
+/* Create and initialize a SCTP_SEND_FAILED notification.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.4 SCTP_SEND_FAILED
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
+ const struct sctp_association *asoc, struct sctp_chunk *chunk,
+ __u16 flags, __u32 error, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_send_failed *ssf;
+ struct sk_buff *skb;
+
+ /* Pull off any padding. */
+ int len = ntohs(chunk->chunk_hdr->length);
+
+ /* Make skb with more room so we can prepend notification. */
+ skb = skb_copy_expand(chunk->skb,
+ sizeof(struct sctp_send_failed), /* headroom */
+ 0, /* tailroom */
+ gfp);
+ if (!skb)
+ goto fail;
+
+ /* Pull off the common chunk header and DATA header. */
+ skb_pull(skb, sizeof(struct sctp_data_chunk));
+ len -= sizeof(struct sctp_data_chunk);
+
+ /* Embed the event fields inside the cloned skb. */
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+ ssf = (struct sctp_send_failed *)
+ skb_push(skb, sizeof(struct sctp_send_failed));
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_type:
+ * It should be SCTP_SEND_FAILED.
+ */
+ ssf->ssf_type = SCTP_SEND_FAILED;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_flags: 16 bits (unsigned integer)
+ * The flag value will take one of the following values
+ *
+ * SCTP_DATA_UNSENT - Indicates that the data was never put on
+ * the wire.
+ *
+ * SCTP_DATA_SENT - Indicates that the data was put on the wire.
+ * Note that this does not necessarily mean that the
+ * data was (or was not) successfully delivered.
+ */
+ ssf->ssf_flags = flags;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_length: sizeof (__u32)
+ * This field is the total length of the notification data, including
+ * the notification header.
+ */
+ ssf->ssf_length = sizeof(struct sctp_send_failed) + len;
+ skb_trim(skb, ssf->ssf_length);
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_error: 16 bits (unsigned integer)
+ * This value represents the reason why the send failed, and if set,
+ * will be a SCTP protocol error code as defined in [SCTP] section
+ * 3.3.10.
+ */
+ ssf->ssf_error = error;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_info: sizeof (struct sctp_sndrcvinfo)
+ * The original send information associated with the undelivered
+ * message.
+ */
+ memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo));
+
+ /* Per TSVWG discussion with Randy. Allow the application to
+ * reassemble a fragmented message.
+ */
+ ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.4 SCTP_SEND_FAILED
+ *
+ * ssf_assoc_id: sizeof (sctp_assoc_t)
+ * The association id field, sf_assoc_id, holds the identifier for the
+ * association. All notifications for a given association have the
+ * same association identifier. For TCP style socket, this field is
+ * ignored.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ ssf->ssf_assoc_id = sctp_assoc2id(asoc);
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* Create and initialize a SCTP_SHUTDOWN_EVENT notification.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
+ const struct sctp_association *asoc,
+ __u16 flags, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_shutdown_event *sse;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ sse = (struct sctp_shutdown_event *)
+ skb_put(skb, sizeof(struct sctp_shutdown_event));
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ *
+ * sse_type
+ * It should be SCTP_SHUTDOWN_EVENT
+ */
+ sse->sse_type = SCTP_SHUTDOWN_EVENT;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ *
+ * sse_flags: 16 bits (unsigned integer)
+ * Currently unused.
+ */
+ sse->sse_flags = 0;
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ *
+ * sse_length: sizeof (__u32)
+ * This field is the total length of the notification data, including
+ * the notification header.
+ */
+ sse->sse_length = sizeof(struct sctp_shutdown_event);
+
+ /* Socket Extensions for SCTP
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ *
+ * sse_assoc_id: sizeof (sctp_assoc_t)
+ * The association id field, holds the identifier for the association.
+ * All notifications for a given association have the same association
+ * identifier. For TCP style socket, this field is ignored.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ sse->sse_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* Create and initialize a SCTP_ADAPTATION_INDICATION notification.
+ *
+ * Socket Extensions for SCTP
+ * 5.3.1.6 SCTP_ADAPTATION_INDICATION
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
+ const struct sctp_association *asoc, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_adaptation_event *sai;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_adaptation_event),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ sai = (struct sctp_adaptation_event *)
+ skb_put(skb, sizeof(struct sctp_adaptation_event));
+
+ sai->sai_type = SCTP_ADAPTATION_INDICATION;
+ sai->sai_flags = 0;
+ sai->sai_length = sizeof(struct sctp_adaptation_event);
+ sai->sai_adaptation_ind = asoc->peer.adaptation_ind;
+ sctp_ulpevent_set_owner(event, asoc);
+ sai->sai_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+
+fail:
+ return NULL;
+}
+
+/* A message has been received. Package this message as a notification
+ * to pass it to the upper layers. Go ahead and calculate the sndrcvinfo
+ * even if filtered out later.
+ *
+ * Socket Extensions for SCTP
+ * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
+ struct sctp_chunk *chunk,
+ gfp_t gfp)
+{
+ struct sctp_ulpevent *event = NULL;
+ struct sk_buff *skb;
+ size_t padding, len;
+ int rx_count;
+
+ /*
+ * check to see if we need to make space for this
+ * new skb, expand the rcvbuffer if needed, or drop
+ * the frame
+ */
+ if (asoc->ep->rcvbuf_policy)
+ rx_count = atomic_read(&asoc->rmem_alloc);
+ else
+ rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+
+ if (rx_count >= asoc->base.sk->sk_rcvbuf) {
+
+ if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+ (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
+ chunk->skb->truesize)))
+ goto fail;
+ }
+
+ /* Clone the original skb, sharing the data. */
+ skb = skb_clone(chunk->skb, gfp);
+ if (!skb)
+ goto fail;
+
+ /* Now that all memory allocations for this chunk succeeded, we
+ * can mark it as received so the tsn_map is updated correctly.
+ */
+ if (sctp_tsnmap_mark(&asoc->peer.tsn_map,
+ ntohl(chunk->subh.data_hdr->tsn),
+ chunk->transport))
+ goto fail_mark;
+
+ /* First calculate the padding, so we don't inadvertently
+ * pass up the wrong length to the user.
+ *
+ * RFC 2960 - Section 3.2 Chunk Field Descriptions
+ *
+ * The total length of a chunk(including Type, Length and Value fields)
+ * MUST be a multiple of 4 bytes. If the length of the chunk is not a
+ * multiple of 4 bytes, the sender MUST pad the chunk with all zero
+ * bytes and this padding is not included in the chunk length field.
+ * The sender should never pad with more than 3 bytes. The receiver
+ * MUST ignore the padding bytes.
+ */
+ len = ntohs(chunk->chunk_hdr->length);
+ padding = WORD_ROUND(len) - len;
+
+ /* Fixup cloned skb with just this chunks data. */
+ skb_trim(skb, chunk->chunk_end - padding - skb->data);
+
+ /* Embed the event fields inside the cloned skb. */
+ event = sctp_skb2event(skb);
+
+ /* Initialize event with flags 0 and correct length
+ * Since this is a clone of the original skb, only account for
+ * the data of this chunk as other chunks will be accounted separately.
+ */
+ sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff));
+
+ sctp_ulpevent_receive_data(event, asoc);
+
+ event->stream = ntohs(chunk->subh.data_hdr->stream);
+ event->ssn = ntohs(chunk->subh.data_hdr->ssn);
+ event->ppid = chunk->subh.data_hdr->ppid;
+ if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+ event->flags |= SCTP_UNORDERED;
+ event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+ }
+ event->tsn = ntohl(chunk->subh.data_hdr->tsn);
+ event->msg_flags |= chunk->chunk_hdr->flags;
+ event->iif = sctp_chunk_iif(chunk);
+
+ return event;
+
+fail_mark:
+ kfree_skb(skb);
+fail:
+ return NULL;
+}
+
+/* Create a partial delivery related event.
+ *
+ * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT
+ *
+ * When a receiver is engaged in a partial delivery of a
+ * message this notification will be used to indicate
+ * various events.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
+ const struct sctp_association *asoc, __u32 indication,
+ gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_pdapi_event *pd;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ pd = (struct sctp_pdapi_event *)
+ skb_put(skb, sizeof(struct sctp_pdapi_event));
+
+ /* pdapi_type
+ * It should be SCTP_PARTIAL_DELIVERY_EVENT
+ *
+ * pdapi_flags: 16 bits (unsigned integer)
+ * Currently unused.
+ */
+ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
+ pd->pdapi_flags = 0;
+
+ /* pdapi_length: 32 bits (unsigned integer)
+ *
+ * This field is the total length of the notification data, including
+ * the notification header. It will generally be sizeof (struct
+ * sctp_pdapi_event).
+ */
+ pd->pdapi_length = sizeof(struct sctp_pdapi_event);
+
+ /* pdapi_indication: 32 bits (unsigned integer)
+ *
+ * This field holds the indication being sent to the application.
+ */
+ pd->pdapi_indication = indication;
+
+ /* pdapi_assoc_id: sizeof (sctp_assoc_t)
+ *
+ * The association id field, holds the identifier for the association.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ pd->pdapi_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+fail:
+ return NULL;
+}
+
+struct sctp_ulpevent *sctp_ulpevent_make_authkey(
+ const struct sctp_association *asoc, __u16 key_id,
+ __u32 indication, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_authkey_event *ak;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_authkey_event),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ goto fail;
+
+ skb = sctp_event2skb(event);
+ ak = (struct sctp_authkey_event *)
+ skb_put(skb, sizeof(struct sctp_authkey_event));
+
+ ak->auth_type = SCTP_AUTHENTICATION_EVENT;
+ ak->auth_flags = 0;
+ ak->auth_length = sizeof(struct sctp_authkey_event);
+
+ ak->auth_keynumber = key_id;
+ ak->auth_altkeynumber = 0;
+ ak->auth_indication = indication;
+
+ /*
+ * The association id field, holds the identifier for the association.
+ */
+ sctp_ulpevent_set_owner(event, asoc);
+ ak->auth_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+fail:
+ return NULL;
+}
+
+/*
+ * Socket Extensions for SCTP
+ * 6.3.10. SCTP_SENDER_DRY_EVENT
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
+ const struct sctp_association *asoc, gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_sender_dry_event *sdry;
+ struct sk_buff *skb;
+
+ event = sctp_ulpevent_new(sizeof(struct sctp_sender_dry_event),
+ MSG_NOTIFICATION, gfp);
+ if (!event)
+ return NULL;
+
+ skb = sctp_event2skb(event);
+ sdry = (struct sctp_sender_dry_event *)
+ skb_put(skb, sizeof(struct sctp_sender_dry_event));
+
+ sdry->sender_dry_type = SCTP_SENDER_DRY_EVENT;
+ sdry->sender_dry_flags = 0;
+ sdry->sender_dry_length = sizeof(struct sctp_sender_dry_event);
+ sctp_ulpevent_set_owner(event, asoc);
+ sdry->sender_dry_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+}
+
+/* Return the notification type, assuming this is a notification
+ * event.
+ */
+__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
+{
+ union sctp_notification *notification;
+ struct sk_buff *skb;
+
+ skb = sctp_event2skb(event);
+ notification = (union sctp_notification *) skb->data;
+ return notification->sn_header.sn_type;
+}
+
+/* RFC6458, Section 5.3.2. SCTP Header Information Structure
+ * (SCTP_SNDRCV, DEPRECATED)
+ */
+void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
+ struct msghdr *msghdr)
+{
+ struct sctp_sndrcvinfo sinfo;
+
+ if (sctp_ulpevent_is_notification(event))
+ return;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ sinfo.sinfo_stream = event->stream;
+ sinfo.sinfo_ssn = event->ssn;
+ sinfo.sinfo_ppid = event->ppid;
+ sinfo.sinfo_flags = event->flags;
+ sinfo.sinfo_tsn = event->tsn;
+ sinfo.sinfo_cumtsn = event->cumtsn;
+ sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc);
+ /* Context value that is set via SCTP_CONTEXT socket option. */
+ sinfo.sinfo_context = event->asoc->default_rcv_context;
+ /* These fields are not used while receiving. */
+ sinfo.sinfo_timetolive = 0;
+
+ put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV,
+ sizeof(sinfo), &sinfo);
+}
+
+/* RFC6458, Section 5.3.5 SCTP Receive Information Structure
+ * (SCTP_SNDRCV)
+ */
+void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
+ struct msghdr *msghdr)
+{
+ struct sctp_rcvinfo rinfo;
+
+ if (sctp_ulpevent_is_notification(event))
+ return;
+
+ memset(&rinfo, 0, sizeof(struct sctp_rcvinfo));
+ rinfo.rcv_sid = event->stream;
+ rinfo.rcv_ssn = event->ssn;
+ rinfo.rcv_ppid = event->ppid;
+ rinfo.rcv_flags = event->flags;
+ rinfo.rcv_tsn = event->tsn;
+ rinfo.rcv_cumtsn = event->cumtsn;
+ rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc);
+ rinfo.rcv_context = event->asoc->default_rcv_context;
+
+ put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO,
+ sizeof(rinfo), &rinfo);
+}
+
+/* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure
+ * (SCTP_NXTINFO)
+ */
+static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
+ struct msghdr *msghdr,
+ const struct sk_buff *skb)
+{
+ struct sctp_nxtinfo nxtinfo;
+
+ memset(&nxtinfo, 0, sizeof(nxtinfo));
+ nxtinfo.nxt_sid = event->stream;
+ nxtinfo.nxt_ppid = event->ppid;
+ nxtinfo.nxt_flags = event->flags;
+ if (sctp_ulpevent_is_notification(event))
+ nxtinfo.nxt_flags |= SCTP_NOTIFICATION;
+ nxtinfo.nxt_length = skb->len;
+ nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc);
+
+ put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO,
+ sizeof(nxtinfo), &nxtinfo);
+}
+
+void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
+ struct msghdr *msghdr,
+ struct sock *sk)
+{
+ struct sk_buff *skb;
+ int err;
+
+ skb = sctp_skb_recv_datagram(sk, MSG_PEEK, 1, &err);
+ if (skb != NULL) {
+ __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb),
+ msghdr, skb);
+ /* Just release refcount here. */
+ kfree_skb(skb);
+ }
+}
+
+/* Do accounting for bytes received and hold a reference to the association
+ * for each skb.
+ */
+static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
+ struct sctp_association *asoc)
+{
+ struct sk_buff *skb, *frag;
+
+ skb = sctp_event2skb(event);
+ /* Set the owner and charge rwnd for bytes received. */
+ sctp_ulpevent_set_owner(event, asoc);
+ sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));
+
+ if (!skb->data_len)
+ return;
+
+ /* Note: Not clearing the entire event struct as this is just a
+ * fragment of the real event. However, we still need to do rwnd
+ * accounting.
+ * In general, the skb passed from IP can have only 1 level of
+ * fragments. But we allow multiple levels of fragments.
+ */
+ skb_walk_frags(skb, frag)
+ sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc);
+}
+
+/* Do accounting for bytes just read by user and release the references to
+ * the association.
+ */
+static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
+{
+ struct sk_buff *skb, *frag;
+ unsigned int len;
+
+ /* Current stack structures assume that the rcv buffer is
+ * per socket. For UDP style sockets this is not true as
+ * multiple associations may be on a single UDP-style socket.
+ * Use the local private area of the skb to track the owning
+ * association.
+ */
+
+ skb = sctp_event2skb(event);
+ len = skb->len;
+
+ if (!skb->data_len)
+ goto done;
+
+ /* Don't forget the fragments. */
+ skb_walk_frags(skb, frag) {
+ /* NOTE: skb_shinfos are recursive. Although IP returns
+ * skb's with only 1 level of fragments, SCTP reassembly can
+ * increase the levels.
+ */
+ sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
+ }
+
+done:
+ sctp_assoc_rwnd_increase(event->asoc, len);
+ sctp_ulpevent_release_owner(event);
+}
+
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
+{
+ struct sk_buff *skb, *frag;
+
+ skb = sctp_event2skb(event);
+
+ if (!skb->data_len)
+ goto done;
+
+ /* Don't forget the fragments. */
+ skb_walk_frags(skb, frag) {
+ /* NOTE: skb_shinfos are recursive. Although IP returns
+ * skb's with only 1 level of fragments, SCTP reassembly can
+ * increase the levels.
+ */
+ sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
+ }
+
+done:
+ sctp_ulpevent_release_owner(event);
+}
+
+/* Free a ulpevent that has an owner. It includes releasing the reference
+ * to the owner, updating the rwnd in case of a DATA event and freeing the
+ * skb.
+ */
+void sctp_ulpevent_free(struct sctp_ulpevent *event)
+{
+ if (sctp_ulpevent_is_notification(event))
+ sctp_ulpevent_release_owner(event);
+ else
+ sctp_ulpevent_release_data(event);
+
+ kfree_skb(sctp_event2skb(event));
+}
+
+/* Purge the skb lists holding ulpevents. */
+unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+ unsigned int data_unread = 0;
+
+ while ((skb = skb_dequeue(list)) != NULL) {
+ struct sctp_ulpevent *event = sctp_skb2event(skb);
+
+ if (!sctp_ulpevent_is_notification(event))
+ data_unread += skb->len;
+
+ sctp_ulpevent_free(event);
+ }
+
+ return data_unread;
+}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
new file mode 100644
index 000000000..ce469d648
--- /dev/null
+++ b/net/sctp/ulpqueue.c
@@ -0,0 +1,1144 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This abstraction carries sctp events to the ULP (sockets).
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ * lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ * Jon Grimm <jgrimm@us.ibm.com>
+ * La Monte H.P. Yarroll <piggy@acm.org>
+ * Sridhar Samudrala <sri@us.ibm.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/busy_poll.h>
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers. */
+static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *);
+static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *,
+ struct sctp_ulpevent *);
+static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq);
+
+/* 1st Level Abstractions */
+
+/* Initialize a ULP queue from a block of memory. */
+struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
+ struct sctp_association *asoc)
+{
+ memset(ulpq, 0, sizeof(struct sctp_ulpq));
+
+ ulpq->asoc = asoc;
+ skb_queue_head_init(&ulpq->reasm);
+ skb_queue_head_init(&ulpq->lobby);
+ ulpq->pd_mode = 0;
+
+ return ulpq;
+}
+
+
+/* Flush the reassembly and ordering queues. */
+void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
+{
+ struct sk_buff *skb;
+ struct sctp_ulpevent *event;
+
+ while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL) {
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_free(event);
+ }
+
+ while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL) {
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_free(event);
+ }
+
+}
+
+/* Dispose of a ulpqueue. */
+void sctp_ulpq_free(struct sctp_ulpq *ulpq)
+{
+ sctp_ulpq_flush(ulpq);
+}
+
+/* Process an incoming DATA chunk. */
+int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+ gfp_t gfp)
+{
+ struct sk_buff_head temp;
+ struct sctp_ulpevent *event;
+ int event_eor = 0;
+
+ /* Create an event from the incoming chunk. */
+ event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
+ if (!event)
+ return -ENOMEM;
+
+ /* Do reassembly if needed. */
+ event = sctp_ulpq_reasm(ulpq, event);
+
+ /* Do ordering if needed. */
+ if ((event) && (event->msg_flags & MSG_EOR)) {
+ /* Create a temporary list to collect chunks on. */
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+
+ event = sctp_ulpq_order(ulpq, event);
+ }
+
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
+ if (event) {
+ event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
+ sctp_ulpq_tail_event(ulpq, event);
+ }
+
+ return event_eor;
+}
+
+/* Add a new event for propagation to the ULP. */
+/* Clear the partial delivery mode for this socket. Note: This
+ * assumes that no association is currently in partial delivery mode.
+ */
+int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+
+ if (atomic_dec_and_test(&sp->pd_mode)) {
+ /* This means there are no other associations in PD, so
+ * we can go ahead and clear out the lobby in one shot
+ */
+ if (!skb_queue_empty(&sp->pd_lobby)) {
+ struct list_head *list;
+ sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue);
+ list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
+ INIT_LIST_HEAD(list);
+ return 1;
+ }
+ } else {
+ /* There are other associations in PD, so we only need to
+ * pull stuff out of the lobby that belongs to the
+ * associations that is exiting PD (all of its notifications
+ * are posted here).
+ */
+ if (!skb_queue_empty(&sp->pd_lobby) && asoc) {
+ struct sk_buff *skb, *tmp;
+ struct sctp_ulpevent *event;
+
+ sctp_skb_for_each(skb, &sp->pd_lobby, tmp) {
+ event = sctp_skb2event(skb);
+ if (event->asoc == asoc) {
+ __skb_unlink(skb, &sp->pd_lobby);
+ __skb_queue_tail(&sk->sk_receive_queue,
+ skb);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* Set the pd_mode on the socket and ulpq */
+static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq)
+{
+ struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk);
+
+ atomic_inc(&sp->pd_mode);
+ ulpq->pd_mode = 1;
+}
+
+/* Clear the pd_mode and restart any pending messages waiting for delivery. */
+static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
+{
+ ulpq->pd_mode = 0;
+ sctp_ulpq_reasm_drain(ulpq);
+ return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc);
+}
+
+/* If the SKB of 'event' is on a list, it is the first such member
+ * of that list.
+ */
+int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+{
+ struct sock *sk = ulpq->asoc->base.sk;
+ struct sk_buff_head *queue, *skb_list;
+ struct sk_buff *skb = sctp_event2skb(event);
+ int clear_pd = 0;
+
+ skb_list = (struct sk_buff_head *) skb->prev;
+
+ /* If the socket is just going to throw this away, do not
+ * even try to deliver it.
+ */
+ if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
+ goto out_free;
+
+ if (!sctp_ulpevent_is_notification(event)) {
+ sk_mark_napi_id(sk, skb);
+ sk_incoming_cpu_update(sk);
+ }
+ /* Check if the user wishes to receive this event. */
+ if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
+ goto out_free;
+
+ /* If we are in partial delivery mode, post to the lobby until
+ * partial delivery is cleared, unless, of course _this_ is
+ * the association the cause of the partial delivery.
+ */
+
+ if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) {
+ queue = &sk->sk_receive_queue;
+ } else {
+ if (ulpq->pd_mode) {
+ /* If the association is in partial delivery, we
+ * need to finish delivering the partially processed
+ * packet before passing any other data. This is
+ * because we don't truly support stream interleaving.
+ */
+ if ((event->msg_flags & MSG_NOTIFICATION) ||
+ (SCTP_DATA_NOT_FRAG ==
+ (event->msg_flags & SCTP_DATA_FRAG_MASK)))
+ queue = &sctp_sk(sk)->pd_lobby;
+ else {
+ clear_pd = event->msg_flags & MSG_EOR;
+ queue = &sk->sk_receive_queue;
+ }
+ } else {
+ /*
+ * If fragment interleave is enabled, we
+ * can queue this to the receive queue instead
+ * of the lobby.
+ */
+ if (sctp_sk(sk)->frag_interleave)
+ queue = &sk->sk_receive_queue;
+ else
+ queue = &sctp_sk(sk)->pd_lobby;
+ }
+ }
+
+ /* If we are harvesting multiple skbs they will be
+ * collected on a list.
+ */
+ if (skb_list)
+ sctp_skb_list_tail(skb_list, queue);
+ else
+ __skb_queue_tail(queue, skb);
+
+ /* Did we just complete partial delivery and need to get
+ * rolling again? Move pending data to the receive
+ * queue.
+ */
+ if (clear_pd)
+ sctp_ulpq_clear_pd(ulpq);
+
+ if (queue == &sk->sk_receive_queue)
+ sk->sk_data_ready(sk);
+ return 1;
+
+out_free:
+ if (skb_list)
+ sctp_queue_purge_ulpevents(skb_list);
+ else
+ sctp_ulpevent_free(event);
+
+ return 0;
+}
+
+/* 2nd Level Abstractions */
+
+/* Helper function to store chunks that need to be reassembled. */
+static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff *pos;
+ struct sctp_ulpevent *cevent;
+ __u32 tsn, ctsn;
+
+ tsn = event->tsn;
+
+ /* See if it belongs at the end. */
+ pos = skb_peek_tail(&ulpq->reasm);
+ if (!pos) {
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ return;
+ }
+
+ /* Short circuit just dropping it at the end. */
+ cevent = sctp_skb2event(pos);
+ ctsn = cevent->tsn;
+ if (TSN_lt(ctsn, tsn)) {
+ __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
+ return;
+ }
+
+ /* Find the right place in this list. We store them by TSN. */
+ skb_queue_walk(&ulpq->reasm, pos) {
+ cevent = sctp_skb2event(pos);
+ ctsn = cevent->tsn;
+
+ if (TSN_lt(tsn, ctsn))
+ break;
+ }
+
+ /* Insert before pos. */
+ __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));
+
+}
+
+/* Helper function to return an event corresponding to the reassembled
+ * datagram.
+ * This routine creates a re-assembled skb given the first and last skb's
+ * as stored in the reassembly queue. The skb's may be non-linear if the sctp
+ * payload was fragmented on the way and ip had to reassemble them.
+ * We add the rest of skb's to the first skb's fraglist.
+ */
+static struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
+ struct sk_buff_head *queue, struct sk_buff *f_frag,
+ struct sk_buff *l_frag)
+{
+ struct sk_buff *pos;
+ struct sk_buff *new = NULL;
+ struct sctp_ulpevent *event;
+ struct sk_buff *pnext, *last;
+ struct sk_buff *list = skb_shinfo(f_frag)->frag_list;
+
+ /* Store the pointer to the 2nd skb */
+ if (f_frag == l_frag)
+ pos = NULL;
+ else
+ pos = f_frag->next;
+
+ /* Get the last skb in the f_frag's frag_list if present. */
+ for (last = list; list; last = list, list = list->next)
+ ;
+
+ /* Add the list of remaining fragments to the first fragments
+ * frag_list.
+ */
+ if (last)
+ last->next = pos;
+ else {
+ if (skb_cloned(f_frag)) {
+ /* This is a cloned skb, we can't just modify
+ * the frag_list. We need a new skb to do that.
+ * Instead of calling skb_unshare(), we'll do it
+ * ourselves since we need to delay the free.
+ */
+ new = skb_copy(f_frag, GFP_ATOMIC);
+ if (!new)
+ return NULL; /* try again later */
+
+ sctp_skb_set_owner_r(new, f_frag->sk);
+
+ skb_shinfo(new)->frag_list = pos;
+ } else
+ skb_shinfo(f_frag)->frag_list = pos;
+ }
+
+ /* Remove the first fragment from the reassembly queue. */
+ __skb_unlink(f_frag, queue);
+
+ /* if we did unshare, then free the old skb and re-assign */
+ if (new) {
+ kfree_skb(f_frag);
+ f_frag = new;
+ }
+
+ while (pos) {
+
+ pnext = pos->next;
+
+ /* Update the len and data_len fields of the first fragment. */
+ f_frag->len += pos->len;
+ f_frag->data_len += pos->len;
+
+ /* Remove the fragment from the reassembly queue. */
+ __skb_unlink(pos, queue);
+
+ /* Break if we have reached the last fragment. */
+ if (pos == l_frag)
+ break;
+ pos->next = pnext;
+ pos = pnext;
+ }
+
+ event = sctp_skb2event(f_frag);
+ SCTP_INC_STATS(net, SCTP_MIB_REASMUSRMSGS);
+
+ return event;
+}
+
+
+/* Helper function to check if an incoming chunk has filled up the last
+ * missing fragment in a SCTP datagram and return the corresponding event.
+ */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq)
+{
+ struct sk_buff *pos;
+ struct sctp_ulpevent *cevent;
+ struct sk_buff *first_frag = NULL;
+ __u32 ctsn, next_tsn;
+ struct sctp_ulpevent *retval = NULL;
+ struct sk_buff *pd_first = NULL;
+ struct sk_buff *pd_last = NULL;
+ size_t pd_len = 0;
+ struct sctp_association *asoc;
+ u32 pd_point;
+
+ /* Initialized to 0 just to avoid compiler warning message. Will
+ * never be used with this value. It is referenced only after it
+ * is set when we find the first fragment of a message.
+ */
+ next_tsn = 0;
+
+ /* The chunks are held in the reasm queue sorted by TSN.
+ * Walk through the queue sequentially and look for a sequence of
+ * fragmented chunks that complete a datagram.
+ * 'first_frag' and next_tsn are reset when we find a chunk which
+ * is the first fragment of a datagram. Once these 2 fields are set
+ * we expect to find the remaining middle fragments and the last
+ * fragment in order. If not, first_frag is reset to NULL and we
+ * start the next pass when we find another first fragment.
+ *
+ * There is a potential to do partial delivery if user sets
+ * SCTP_PARTIAL_DELIVERY_POINT option. Lets count some things here
+ * to see if can do PD.
+ */
+ skb_queue_walk(&ulpq->reasm, pos) {
+ cevent = sctp_skb2event(pos);
+ ctsn = cevent->tsn;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ /* If this "FIRST_FRAG" is the first
+ * element in the queue, then count it towards
+ * possible PD.
+ */
+ if (pos == ulpq->reasm.next) {
+ pd_first = pos;
+ pd_last = pos;
+ pd_len = pos->len;
+ } else {
+ pd_first = NULL;
+ pd_last = NULL;
+ pd_len = 0;
+ }
+
+ first_frag = pos;
+ next_tsn = ctsn + 1;
+ break;
+
+ case SCTP_DATA_MIDDLE_FRAG:
+ if ((first_frag) && (ctsn == next_tsn)) {
+ next_tsn++;
+ if (pd_first) {
+ pd_last = pos;
+ pd_len += pos->len;
+ }
+ } else
+ first_frag = NULL;
+ break;
+
+ case SCTP_DATA_LAST_FRAG:
+ if (first_frag && (ctsn == next_tsn))
+ goto found;
+ else
+ first_frag = NULL;
+ break;
+ }
+ }
+
+ asoc = ulpq->asoc;
+ if (pd_first) {
+ /* Make sure we can enter partial deliver.
+ * We can trigger partial delivery only if framgent
+ * interleave is set, or the socket is not already
+ * in partial delivery.
+ */
+ if (!sctp_sk(asoc->base.sk)->frag_interleave &&
+ atomic_read(&sctp_sk(asoc->base.sk)->pd_mode))
+ goto done;
+
+ cevent = sctp_skb2event(pd_first);
+ pd_point = sctp_sk(asoc->base.sk)->pd_point;
+ if (pd_point && pd_point <= pd_len) {
+ retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ &ulpq->reasm,
+ pd_first,
+ pd_last);
+ if (retval)
+ sctp_ulpq_set_pd(ulpq);
+ }
+ }
+done:
+ return retval;
+found:
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm, first_frag, pos);
+ if (retval)
+ retval->msg_flags |= MSG_EOR;
+ goto done;
+}
+
+/* Retrieve the next set of fragments of a partial message. */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq)
+{
+ struct sk_buff *pos, *last_frag, *first_frag;
+ struct sctp_ulpevent *cevent;
+ __u32 ctsn, next_tsn;
+ int is_last;
+ struct sctp_ulpevent *retval;
+
+ /* The chunks are held in the reasm queue sorted by TSN.
+ * Walk through the queue sequentially and look for the first
+ * sequence of fragmented chunks.
+ */
+
+ if (skb_queue_empty(&ulpq->reasm))
+ return NULL;
+
+ last_frag = first_frag = NULL;
+ retval = NULL;
+ next_tsn = 0;
+ is_last = 0;
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ cevent = sctp_skb2event(pos);
+ ctsn = cevent->tsn;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (!first_frag)
+ return NULL;
+ goto done;
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag) {
+ first_frag = pos;
+ next_tsn = ctsn + 1;
+ last_frag = pos;
+ } else if (next_tsn == ctsn) {
+ next_tsn++;
+ last_frag = pos;
+ } else
+ goto done;
+ break;
+ case SCTP_DATA_LAST_FRAG:
+ if (!first_frag)
+ first_frag = pos;
+ else if (ctsn != next_tsn)
+ goto done;
+ last_frag = pos;
+ is_last = 1;
+ goto done;
+ default:
+ return NULL;
+ }
+ }
+
+ /* We have the reassembled event. There is no need to look
+ * further.
+ */
+done:
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm, first_frag, last_frag);
+ if (retval && is_last)
+ retval->msg_flags |= MSG_EOR;
+
+ return retval;
+}
+
+
+/* Helper function to reassemble chunks. Hold chunks on the reasm queue that
+ * need reassembling.
+ */
+static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sctp_ulpevent *retval = NULL;
+
+ /* Check if this is part of a fragmented message. */
+ if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) {
+ event->msg_flags |= MSG_EOR;
+ return event;
+ }
+
+ sctp_ulpq_store_reasm(ulpq, event);
+ if (!ulpq->pd_mode)
+ retval = sctp_ulpq_retrieve_reassembled(ulpq);
+ else {
+ __u32 ctsn, ctsnap;
+
+ /* Do not even bother unless this is the next tsn to
+ * be delivered.
+ */
+ ctsn = event->tsn;
+ ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map);
+ if (TSN_lte(ctsn, ctsnap))
+ retval = sctp_ulpq_retrieve_partial(ulpq);
+ }
+
+ return retval;
+}
+
+/* Retrieve the first part (sequential fragments) for partial delivery. */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq)
+{
+ struct sk_buff *pos, *last_frag, *first_frag;
+ struct sctp_ulpevent *cevent;
+ __u32 ctsn, next_tsn;
+ struct sctp_ulpevent *retval;
+
+ /* The chunks are held in the reasm queue sorted by TSN.
+ * Walk through the queue sequentially and look for a sequence of
+ * fragmented chunks that start a datagram.
+ */
+
+ if (skb_queue_empty(&ulpq->reasm))
+ return NULL;
+
+ last_frag = first_frag = NULL;
+ retval = NULL;
+ next_tsn = 0;
+
+ skb_queue_walk(&ulpq->reasm, pos) {
+ cevent = sctp_skb2event(pos);
+ ctsn = cevent->tsn;
+
+ switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+ case SCTP_DATA_FIRST_FRAG:
+ if (!first_frag) {
+ first_frag = pos;
+ next_tsn = ctsn + 1;
+ last_frag = pos;
+ } else
+ goto done;
+ break;
+
+ case SCTP_DATA_MIDDLE_FRAG:
+ if (!first_frag)
+ return NULL;
+ if (ctsn == next_tsn) {
+ next_tsn++;
+ last_frag = pos;
+ } else
+ goto done;
+ break;
+
+ case SCTP_DATA_LAST_FRAG:
+ if (!first_frag)
+ return NULL;
+ else
+ goto done;
+ break;
+
+ default:
+ return NULL;
+ }
+ }
+
+ /* We have the reassembled event. There is no need to look
+ * further.
+ */
+done:
+ retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ &ulpq->reasm, first_frag, last_frag);
+ return retval;
+}
+
+/*
+ * Flush out stale fragments from the reassembly queue when processing
+ * a Forward TSN.
+ *
+ * RFC 3758, Section 3.6
+ *
+ * After receiving and processing a FORWARD TSN, the data receiver MUST
+ * take cautions in updating its re-assembly queue. The receiver MUST
+ * remove any partially reassembled message, which is still missing one
+ * or more TSNs earlier than or equal to the new cumulative TSN point.
+ * In the event that the receiver has invoked the partial delivery API,
+ * a notification SHOULD also be generated to inform the upper layer API
+ * that the message being partially delivered will NOT be completed.
+ */
+void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn)
+{
+ struct sk_buff *pos, *tmp;
+ struct sctp_ulpevent *event;
+ __u32 tsn;
+
+ if (skb_queue_empty(&ulpq->reasm))
+ return;
+
+ skb_queue_walk_safe(&ulpq->reasm, pos, tmp) {
+ event = sctp_skb2event(pos);
+ tsn = event->tsn;
+
+ /* Since the entire message must be abandoned by the
+ * sender (item A3 in Section 3.5, RFC 3758), we can
+ * free all fragments on the list that are less then
+ * or equal to ctsn_point
+ */
+ if (TSN_lte(tsn, fwd_tsn)) {
+ __skb_unlink(pos, &ulpq->reasm);
+ sctp_ulpevent_free(event);
+ } else
+ break;
+ }
+}
+
+/*
+ * Drain the reassembly queue. If we just cleared parted delivery, it
+ * is possible that the reassembly queue will contain already reassembled
+ * messages. Retrieve any such messages and give them to the user.
+ */
+static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
+{
+ struct sctp_ulpevent *event = NULL;
+ struct sk_buff_head temp;
+
+ if (skb_queue_empty(&ulpq->reasm))
+ return;
+
+ while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) {
+ /* Do ordering if needed. */
+ if ((event) && (event->msg_flags & MSG_EOR)) {
+ skb_queue_head_init(&temp);
+ __skb_queue_tail(&temp, sctp_event2skb(event));
+
+ event = sctp_ulpq_order(ulpq, event);
+ }
+
+ /* Send event to the ULP. 'event' is the
+ * sctp_ulpevent for very first SKB on the temp' list.
+ */
+ if (event)
+ sctp_ulpq_tail_event(ulpq, event);
+ }
+}
+
+
+/* Helper function to gather skbs that have possibly become
+ * ordered by an an incoming chunk.
+ */
+static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff_head *event_list;
+ struct sk_buff *pos, *tmp;
+ struct sctp_ulpevent *cevent;
+ struct sctp_stream *in;
+ __u16 sid, csid, cssn;
+
+ sid = event->stream;
+ in = &ulpq->asoc->ssnmap->in;
+
+ event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
+
+ /* We are holding the chunks by stream, by SSN. */
+ sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
+ cevent = (struct sctp_ulpevent *) pos->cb;
+ csid = cevent->stream;
+ cssn = cevent->ssn;
+
+ /* Have we gone too far? */
+ if (csid > sid)
+ break;
+
+ /* Have we not gone far enough? */
+ if (csid < sid)
+ continue;
+
+ if (cssn != sctp_ssn_peek(in, sid))
+ break;
+
+ /* Found it, so mark in the ssnmap. */
+ sctp_ssn_next(in, sid);
+
+ __skb_unlink(pos, &ulpq->lobby);
+
+ /* Attach all gathered skbs to the event. */
+ __skb_queue_tail(event_list, pos);
+ }
+}
+
+/* Helper function to store chunks needing ordering. */
+static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ struct sk_buff *pos;
+ struct sctp_ulpevent *cevent;
+ __u16 sid, csid;
+ __u16 ssn, cssn;
+
+ pos = skb_peek_tail(&ulpq->lobby);
+ if (!pos) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ sid = event->stream;
+ ssn = event->ssn;
+
+ cevent = (struct sctp_ulpevent *) pos->cb;
+ csid = cevent->stream;
+ cssn = cevent->ssn;
+ if (sid > csid) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ if ((sid == csid) && SSN_lt(cssn, ssn)) {
+ __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
+ return;
+ }
+
+ /* Find the right place in this list. We store them by
+ * stream ID and then by SSN.
+ */
+ skb_queue_walk(&ulpq->lobby, pos) {
+ cevent = (struct sctp_ulpevent *) pos->cb;
+ csid = cevent->stream;
+ cssn = cevent->ssn;
+
+ if (csid > sid)
+ break;
+ if (csid == sid && SSN_lt(ssn, cssn))
+ break;
+ }
+
+
+ /* Insert before pos. */
+ __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event));
+}
+
+static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
+ struct sctp_ulpevent *event)
+{
+ __u16 sid, ssn;
+ struct sctp_stream *in;
+
+ /* Check if this message needs ordering. */
+ if (SCTP_DATA_UNORDERED & event->msg_flags)
+ return event;
+
+ /* Note: The stream ID must be verified before this routine. */
+ sid = event->stream;
+ ssn = event->ssn;
+ in = &ulpq->asoc->ssnmap->in;
+
+ /* Is this the expected SSN for this stream ID? */
+ if (ssn != sctp_ssn_peek(in, sid)) {
+ /* We've received something out of order, so find where it
+ * needs to be placed. We order by stream and then by SSN.
+ */
+ sctp_ulpq_store_ordered(ulpq, event);
+ return NULL;
+ }
+
+ /* Mark that the next chunk has been found. */
+ sctp_ssn_next(in, sid);
+
+ /* Go find any other chunks that were waiting for
+ * ordering.
+ */
+ sctp_ulpq_retrieve_ordered(ulpq, event);
+
+ return event;
+}
+
+/* Helper function to gather skbs that have possibly become
+ * ordered by forward tsn skipping their dependencies.
+ */
+static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+ struct sk_buff *pos, *tmp;
+ struct sctp_ulpevent *cevent;
+ struct sctp_ulpevent *event;
+ struct sctp_stream *in;
+ struct sk_buff_head temp;
+ struct sk_buff_head *lobby = &ulpq->lobby;
+ __u16 csid, cssn;
+
+ in = &ulpq->asoc->ssnmap->in;
+
+ /* We are holding the chunks by stream, by SSN. */
+ skb_queue_head_init(&temp);
+ event = NULL;
+ sctp_skb_for_each(pos, lobby, tmp) {
+ cevent = (struct sctp_ulpevent *) pos->cb;
+ csid = cevent->stream;
+ cssn = cevent->ssn;
+
+ /* Have we gone too far? */
+ if (csid > sid)
+ break;
+
+ /* Have we not gone far enough? */
+ if (csid < sid)
+ continue;
+
+ /* see if this ssn has been marked by skipping */
+ if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
+ break;
+
+ __skb_unlink(pos, lobby);
+ if (!event)
+ /* Create a temporary list to collect chunks on. */
+ event = sctp_skb2event(pos);
+
+ /* Attach all gathered skbs to the event. */
+ __skb_queue_tail(&temp, pos);
+ }
+
+ /* If we didn't reap any data, see if the next expected SSN
+ * is next on the queue and if so, use that.
+ */
+ if (event == NULL && pos != (struct sk_buff *)lobby) {
+ cevent = (struct sctp_ulpevent *) pos->cb;
+ csid = cevent->stream;
+ cssn = cevent->ssn;
+
+ if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
+ sctp_ssn_next(in, csid);
+ __skb_unlink(pos, lobby);
+ __skb_queue_tail(&temp, pos);
+ event = sctp_skb2event(pos);
+ }
+ }
+
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
+ if (event) {
+ /* see if we have more ordered that we can deliver */
+ sctp_ulpq_retrieve_ordered(ulpq, event);
+ sctp_ulpq_tail_event(ulpq, event);
+ }
+}
+
+/* Skip over an SSN. This is used during the processing of
+ * Forwared TSN chunk to skip over the abandoned ordered data
+ */
+void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
+{
+ struct sctp_stream *in;
+
+ /* Note: The stream ID must be verified before this routine. */
+ in = &ulpq->asoc->ssnmap->in;
+
+ /* Is this an old SSN? If so ignore. */
+ if (SSN_lt(ssn, sctp_ssn_peek(in, sid)))
+ return;
+
+ /* Mark that we are no longer expecting this SSN or lower. */
+ sctp_ssn_skip(in, sid, ssn);
+
+ /* Go find any other chunks that were waiting for
+ * ordering and deliver them if needed.
+ */
+ sctp_ulpq_reap_ordered(ulpq, sid);
+}
+
+static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
+ struct sk_buff_head *list, __u16 needed)
+{
+ __u16 freed = 0;
+ __u32 tsn, last_tsn;
+ struct sk_buff *skb, *flist, *last;
+ struct sctp_ulpevent *event;
+ struct sctp_tsnmap *tsnmap;
+
+ tsnmap = &ulpq->asoc->peer.tsn_map;
+
+ while ((skb = skb_peek_tail(list)) != NULL) {
+ event = sctp_skb2event(skb);
+ tsn = event->tsn;
+
+ /* Don't renege below the Cumulative TSN ACK Point. */
+ if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap)))
+ break;
+
+ /* Events in ordering queue may have multiple fragments
+ * corresponding to additional TSNs. Sum the total
+ * freed space; find the last TSN.
+ */
+ freed += skb_headlen(skb);
+ flist = skb_shinfo(skb)->frag_list;
+ for (last = flist; flist; flist = flist->next) {
+ last = flist;
+ freed += skb_headlen(last);
+ }
+ if (last)
+ last_tsn = sctp_skb2event(last)->tsn;
+ else
+ last_tsn = tsn;
+
+ /* Unlink the event, then renege all applicable TSNs. */
+ __skb_unlink(skb, list);
+ sctp_ulpevent_free(event);
+ while (TSN_lte(tsn, last_tsn)) {
+ sctp_tsnmap_renege(tsnmap, tsn);
+ tsn++;
+ }
+ if (freed >= needed)
+ return freed;
+ }
+
+ return freed;
+}
+
+/* Renege 'needed' bytes from the ordering queue. */
+static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed)
+{
+ return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
+}
+
+/* Renege 'needed' bytes from the reassembly queue. */
+static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
+{
+ return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed);
+}
+
+/* Partial deliver the first message as there is pressure on rwnd. */
+void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
+ gfp_t gfp)
+{
+ struct sctp_ulpevent *event;
+ struct sctp_association *asoc;
+ struct sctp_sock *sp;
+ __u32 ctsn;
+ struct sk_buff *skb;
+
+ asoc = ulpq->asoc;
+ sp = sctp_sk(asoc->base.sk);
+
+ /* If the association is already in Partial Delivery mode
+ * we have nothing to do.
+ */
+ if (ulpq->pd_mode)
+ return;
+
+ /* Data must be at or below the Cumulative TSN ACK Point to
+ * start partial delivery.
+ */
+ skb = skb_peek(&asoc->ulpq.reasm);
+ if (skb != NULL) {
+ ctsn = sctp_skb2event(skb)->tsn;
+ if (!TSN_lte(ctsn, sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)))
+ return;
+ }
+
+ /* If the user enabled fragment interleave socket option,
+ * multiple associations can enter partial delivery.
+ * Otherwise, we can only enter partial delivery if the
+ * socket is not in partial deliver mode.
+ */
+ if (sp->frag_interleave || atomic_read(&sp->pd_mode) == 0) {
+ /* Is partial delivery possible? */
+ event = sctp_ulpq_retrieve_first(ulpq);
+ /* Send event to the ULP. */
+ if (event) {
+ sctp_ulpq_tail_event(ulpq, event);
+ sctp_ulpq_set_pd(ulpq);
+ return;
+ }
+ }
+}
+
+/* Renege some packets to make room for an incoming chunk. */
+void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+ gfp_t gfp)
+{
+ struct sctp_association *asoc;
+ __u16 needed, freed;
+
+ asoc = ulpq->asoc;
+
+ if (chunk) {
+ needed = ntohs(chunk->chunk_hdr->length);
+ needed -= sizeof(sctp_data_chunk_t);
+ } else
+ needed = SCTP_DEFAULT_MAXWINDOW;
+
+ freed = 0;
+
+ if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
+ freed = sctp_ulpq_renege_order(ulpq, needed);
+ if (freed < needed) {
+ freed += sctp_ulpq_renege_frags(ulpq, needed - freed);
+ }
+ }
+ /* If able to free enough room, accept this chunk. */
+ if (chunk && (freed >= needed)) {
+ int retval;
+ retval = sctp_ulpq_tail_data(ulpq, chunk, gfp);
+ /*
+ * Enter partial delivery if chunk has not been
+ * delivered; otherwise, drain the reassembly queue.
+ */
+ if (retval <= 0)
+ sctp_ulpq_partial_delivery(ulpq, gfp);
+ else if (retval == 1)
+ sctp_ulpq_reasm_drain(ulpq);
+ }
+
+ sk_mem_reclaim(asoc->base.sk);
+}
+
+
+
+/* Notify the application if an association is aborted and in
+ * partial delivery mode. Send up any pending received messages.
+ */
+void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+ struct sctp_ulpevent *ev = NULL;
+ struct sock *sk;
+
+ if (!ulpq->pd_mode)
+ return;
+
+ sk = ulpq->asoc->base.sk;
+ if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+ &sctp_sk(sk)->subscribe))
+ ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
+ SCTP_PARTIAL_DELIVERY_ABORTED,
+ gfp);
+ if (ev)
+ __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));
+
+ /* If there is data waiting, send it up the socket now. */
+ if (sctp_ulpq_clear_pd(ulpq) || ev)
+ sk->sk_data_ready(sk);
+}
diff --git a/net/socket.c b/net/socket.c
new file mode 100644
index 000000000..884e32997
--- /dev/null
+++ b/net/socket.c
@@ -0,0 +1,3304 @@
+/*
+ * NET An implementation of the SOCKET network access protocol.
+ *
+ * Version: @(#)socket.c 1.1.93 18/02/95
+ *
+ * Authors: Orest Zborowski, <obz@Kodak.COM>
+ * Ross Biro
+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ * Anonymous : NOTSOCK/BADF cleanup. Error fix in
+ * shutdown()
+ * Alan Cox : verify_area() fixes
+ * Alan Cox : Removed DDI
+ * Jonathan Kamens : SOCK_DGRAM reconnect bug
+ * Alan Cox : Moved a load of checks to the very
+ * top level.
+ * Alan Cox : Move address structures to/from user
+ * mode above the protocol layers.
+ * Rob Janssen : Allow 0 length sends.
+ * Alan Cox : Asynchronous I/O support (cribbed from the
+ * tty drivers).
+ * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style)
+ * Jeff Uphoff : Made max number of sockets command-line
+ * configurable.
+ * Matti Aarnio : Made the number of sockets dynamic,
+ * to be allocated when needed, and mr.
+ * Uphoff's max is used as max to be
+ * allowed to allocate.
+ * Linus : Argh. removed all the socket allocation
+ * altogether: it's in the inode now.
+ * Alan Cox : Made sock_alloc()/sock_release() public
+ * for NetROM and future kernel nfsd type
+ * stuff.
+ * Alan Cox : sendmsg/recvmsg basics.
+ * Tom Dyas : Export net symbols.
+ * Marcin Dalecki : Fixed problems with CONFIG_NET="n".
+ * Alan Cox : Added thread locking to sys_* calls
+ * for sockets. May have errors at the
+ * moment.
+ * Kevin Buhr : Fixed the dumb errors in the above.
+ * Andi Kleen : Some small cleanups, optimizations,
+ * and fixed a copy_from_user() bug.
+ * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0)
+ * Tigran Aivazian : Made listen(2) backlog sanity checks
+ * protocol-independent
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * This module is effectively the top level interface to the BSD socket
+ * paradigm.
+ *
+ * Based upon Swansea University Computer Society NET3.039
+ */
+
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/thread_info.h>
+#include <linux/rcupdate.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+#include <linux/if_bridge.h>
+#include <linux/if_frad.h>
+#include <linux/if_vlan.h>
+#include <linux/ptp_classify.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/kmod.h>
+#include <linux/audit.h>
+#include <linux/wireless.h>
+#include <linux/nsproxy.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#include <net/compat.h>
+#include <net/wext.h>
+#include <net/cls_cgroup.h>
+
+#include <net/sock.h>
+#include <linux/netfilter.h>
+
+#include <linux/if_tun.h>
+#include <linux/ipv6_route.h>
+#include <linux/route.h>
+#include <linux/sockios.h>
+#include <linux/atalk.h>
+#include <net/busy_poll.h>
+#include <linux/errqueue.h>
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+unsigned int sysctl_net_busy_read __read_mostly;
+unsigned int sysctl_net_busy_poll __read_mostly;
+#endif
+
+static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
+static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
+static int sock_mmap(struct file *file, struct vm_area_struct *vma);
+
+static int sock_close(struct inode *inode, struct file *file);
+static unsigned int sock_poll(struct file *file,
+ struct poll_table_struct *wait);
+static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long compat_sock_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg);
+#endif
+static int sock_fasync(int fd, struct file *filp, int on);
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more);
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags);
+
+/*
+ * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
+ * in the operation structures but are done directly via the socketcall() multiplexor.
+ */
+
+static const struct file_operations socket_file_ops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read_iter = sock_read_iter,
+ .write_iter = sock_write_iter,
+ .poll = sock_poll,
+ .unlocked_ioctl = sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_sock_ioctl,
+#endif
+ .mmap = sock_mmap,
+ .release = sock_close,
+ .fasync = sock_fasync,
+ .sendpage = sock_sendpage,
+ .splice_write = generic_splice_sendpage,
+ .splice_read = sock_splice_read,
+};
+
+/*
+ * The protocol list. Each protocol is registered in here.
+ */
+
+static DEFINE_SPINLOCK(net_family_lock);
+static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
+
+/*
+ * Statistics counters of the socket lists
+ */
+
+static DEFINE_PER_CPU(int, sockets_in_use);
+
+/*
+ * Support routines.
+ * Move socket addresses back and forth across the kernel/user
+ * divide and look after the messy bits.
+ */
+
+/**
+ * move_addr_to_kernel - copy a socket address into kernel space
+ * @uaddr: Address in user space
+ * @kaddr: Address in kernel space
+ * @ulen: Length in user space
+ *
+ * The address is copied into kernel space. If the provided address is
+ * too long an error code of -EINVAL is returned. If the copy gives
+ * invalid addresses -EFAULT is returned. On a success 0 is returned.
+ */
+
+int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
+{
+ if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
+ return -EINVAL;
+ if (ulen == 0)
+ return 0;
+ if (copy_from_user(kaddr, uaddr, ulen))
+ return -EFAULT;
+ return audit_sockaddr(ulen, kaddr);
+}
+
+/**
+ * move_addr_to_user - copy an address to user space
+ * @kaddr: kernel space address
+ * @klen: length of address in kernel
+ * @uaddr: user space address
+ * @ulen: pointer to user length field
+ *
+ * The value pointed to by ulen on entry is the buffer length available.
+ * This is overwritten with the buffer space used. -EINVAL is returned
+ * if an overlong buffer is specified or a negative buffer size. -EFAULT
+ * is returned if either the buffer or the length field are not
+ * accessible.
+ * After copying the data up to the limit the user specifies, the true
+ * length of the data is written over the length limit the user
+ * specified. Zero is returned for a success.
+ */
+
+static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
+ void __user *uaddr, int __user *ulen)
+{
+ int err;
+ int len;
+
+ BUG_ON(klen > sizeof(struct sockaddr_storage));
+ err = get_user(len, ulen);
+ if (err)
+ return err;
+ if (len > klen)
+ len = klen;
+ if (len < 0)
+ return -EINVAL;
+ if (len) {
+ if (audit_sockaddr(klen, kaddr))
+ return -ENOMEM;
+ if (copy_to_user(uaddr, kaddr, len))
+ return -EFAULT;
+ }
+ /*
+ * "fromlen shall refer to the value before truncation.."
+ * 1003.1g
+ */
+ return __put_user(klen, ulen);
+}
+
+static struct kmem_cache *sock_inode_cachep __read_mostly;
+
+static struct inode *sock_alloc_inode(struct super_block *sb)
+{
+ struct socket_alloc *ei;
+ struct socket_wq *wq;
+
+ ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
+ kmem_cache_free(sock_inode_cachep, ei);
+ return NULL;
+ }
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ RCU_INIT_POINTER(ei->socket.wq, wq);
+
+ ei->socket.state = SS_UNCONNECTED;
+ ei->socket.flags = 0;
+ ei->socket.ops = NULL;
+ ei->socket.sk = NULL;
+ ei->socket.file = NULL;
+
+ return &ei->vfs_inode;
+}
+
+static void sock_destroy_inode(struct inode *inode)
+{
+ struct socket_alloc *ei;
+ struct socket_wq *wq;
+
+ ei = container_of(inode, struct socket_alloc, vfs_inode);
+ wq = rcu_dereference_protected(ei->socket.wq, 1);
+ kfree_rcu(wq, rcu);
+ kmem_cache_free(sock_inode_cachep, ei);
+}
+
+static void init_once(void *foo)
+{
+ struct socket_alloc *ei = (struct socket_alloc *)foo;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+static int init_inodecache(void)
+{
+ sock_inode_cachep = kmem_cache_create("sock_inode_cache",
+ sizeof(struct socket_alloc),
+ 0,
+ (SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (sock_inode_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+static const struct super_operations sockfs_ops = {
+ .alloc_inode = sock_alloc_inode,
+ .destroy_inode = sock_destroy_inode,
+ .statfs = simple_statfs,
+};
+
+/*
+ * sockfs_dname() is called from d_path().
+ */
+static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+ return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
+ d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations sockfs_dentry_operations = {
+ .d_dname = sockfs_dname,
+};
+
+static struct dentry *sockfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_pseudo(fs_type, "socket:", &sockfs_ops,
+ &sockfs_dentry_operations, SOCKFS_MAGIC);
+}
+
+static struct vfsmount *sock_mnt __read_mostly;
+
+static struct file_system_type sock_fs_type = {
+ .name = "sockfs",
+ .mount = sockfs_mount,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * Obtains the first available file descriptor and sets it up for use.
+ *
+ * These functions create file structures and maps them to fd space
+ * of the current process. On success it returns file descriptor
+ * and file struct implicitly stored in sock->file.
+ * Note that another thread may close file descriptor before we return
+ * from this function. We use the fact that now we do not refer
+ * to socket after mapping. If one day we will need it, this
+ * function will increment ref. count on file by 1.
+ *
+ * In any case returned fd MAY BE not valid!
+ * This race condition is unavoidable
+ * with shared fd spaces, we cannot solve it inside kernel,
+ * but we take care of internal coherence yet.
+ */
+
+struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
+{
+ struct qstr name = { .name = "" };
+ struct path path;
+ struct file *file;
+
+ if (dname) {
+ name.name = dname;
+ name.len = strlen(name.name);
+ } else if (sock->sk) {
+ name.name = sock->sk->sk_prot_creator->name;
+ name.len = strlen(name.name);
+ }
+ path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
+ if (unlikely(!path.dentry))
+ return ERR_PTR(-ENOMEM);
+ path.mnt = mntget(sock_mnt);
+
+ d_instantiate(path.dentry, SOCK_INODE(sock));
+
+ file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
+ &socket_file_ops);
+ if (unlikely(IS_ERR(file))) {
+ /* drop dentry, keep inode */
+ ihold(d_inode(path.dentry));
+ path_put(&path);
+ return file;
+ }
+
+ sock->file = file;
+ file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+ file->private_data = sock;
+ return file;
+}
+EXPORT_SYMBOL(sock_alloc_file);
+
+static int sock_map_fd(struct socket *sock, int flags)
+{
+ struct file *newfile;
+ int fd = get_unused_fd_flags(flags);
+ if (unlikely(fd < 0))
+ return fd;
+
+ newfile = sock_alloc_file(sock, flags, NULL);
+ if (likely(!IS_ERR(newfile))) {
+ fd_install(fd, newfile);
+ return fd;
+ }
+
+ put_unused_fd(fd);
+ return PTR_ERR(newfile);
+}
+
+struct socket *sock_from_file(struct file *file, int *err)
+{
+ if (file->f_op == &socket_file_ops)
+ return file->private_data; /* set in sock_map_fd */
+
+ *err = -ENOTSOCK;
+ return NULL;
+}
+EXPORT_SYMBOL(sock_from_file);
+
+/**
+ * sockfd_lookup - Go from a file number to its socket slot
+ * @fd: file handle
+ * @err: pointer to an error code return
+ *
+ * The file handle passed in is locked and the socket it is bound
+ * too is returned. If an error occurs the err pointer is overwritten
+ * with a negative errno code and NULL is returned. The function checks
+ * for both invalid handles and passing a handle which is not a socket.
+ *
+ * On a success the socket object pointer is returned.
+ */
+
+struct socket *sockfd_lookup(int fd, int *err)
+{
+ struct file *file;
+ struct socket *sock;
+
+ file = fget(fd);
+ if (!file) {
+ *err = -EBADF;
+ return NULL;
+ }
+
+ sock = sock_from_file(file, err);
+ if (!sock)
+ fput(file);
+ return sock;
+}
+EXPORT_SYMBOL(sockfd_lookup);
+
+static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
+{
+ struct fd f = fdget(fd);
+ struct socket *sock;
+
+ *err = -EBADF;
+ if (f.file) {
+ sock = sock_from_file(f.file, err);
+ if (likely(sock)) {
+ *fput_needed = f.flags;
+ return sock;
+ }
+ fdput(f);
+ }
+ return NULL;
+}
+
+#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
+#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
+#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
+static ssize_t sockfs_getxattr(struct dentry *dentry,
+ const char *name, void *value, size_t size)
+{
+ const char *proto_name;
+ size_t proto_size;
+ int error;
+
+ error = -ENODATA;
+ if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) {
+ proto_name = dentry->d_name.name;
+ proto_size = strlen(proto_name);
+
+ if (value) {
+ error = -ERANGE;
+ if (proto_size + 1 > size)
+ goto out;
+
+ strncpy(value, proto_name, proto_size + 1);
+ }
+ error = proto_size + 1;
+ }
+
+out:
+ return error;
+}
+
+static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
+ size_t size)
+{
+ ssize_t len;
+ ssize_t used = 0;
+
+ len = security_inode_listsecurity(d_inode(dentry), buffer, size);
+ if (len < 0)
+ return len;
+ used += len;
+ if (buffer) {
+ if (size < used)
+ return -ERANGE;
+ buffer += len;
+ }
+
+ len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
+ used += len;
+ if (buffer) {
+ if (size < used)
+ return -ERANGE;
+ memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len);
+ buffer += len;
+ }
+
+ return used;
+}
+
+static const struct inode_operations sockfs_inode_ops = {
+ .getxattr = sockfs_getxattr,
+ .listxattr = sockfs_listxattr,
+};
+
+/**
+ * sock_alloc - allocate a socket
+ *
+ * Allocate a new inode and socket object. The two are bound together
+ * and initialised. The socket is then returned. If we are out of inodes
+ * NULL is returned.
+ */
+
+static struct socket *sock_alloc(void)
+{
+ struct inode *inode;
+ struct socket *sock;
+
+ inode = new_inode_pseudo(sock_mnt->mnt_sb);
+ if (!inode)
+ return NULL;
+
+ sock = SOCKET_I(inode);
+
+ kmemcheck_annotate_bitfield(sock, type);
+ inode->i_ino = get_next_ino();
+ inode->i_mode = S_IFSOCK | S_IRWXUGO;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_op = &sockfs_inode_ops;
+
+ this_cpu_add(sockets_in_use, 1);
+ return sock;
+}
+
+/**
+ * sock_release - close a socket
+ * @sock: socket to close
+ *
+ * The socket is released from the protocol stack if it has a release
+ * callback, and the inode is then released if the socket is bound to
+ * an inode not a file.
+ */
+
+void sock_release(struct socket *sock)
+{
+ if (sock->ops) {
+ struct module *owner = sock->ops->owner;
+
+ sock->ops->release(sock);
+ sock->ops = NULL;
+ module_put(owner);
+ }
+
+ if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
+ pr_err("%s: fasync list not empty!\n", __func__);
+
+ if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags))
+ return;
+
+ this_cpu_sub(sockets_in_use, 1);
+ if (!sock->file) {
+ iput(SOCK_INODE(sock));
+ return;
+ }
+ sock->file = NULL;
+}
+EXPORT_SYMBOL(sock_release);
+
+void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags)
+{
+ u8 flags = *tx_flags;
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
+ flags |= SKBTX_HW_TSTAMP;
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
+ flags |= SKBTX_SW_TSTAMP;
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)
+ flags |= SKBTX_SCHED_TSTAMP;
+
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)
+ flags |= SKBTX_ACK_TSTAMP;
+
+ *tx_flags = flags;
+}
+EXPORT_SYMBOL(__sock_tx_timestamp);
+
+static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
+{
+ int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
+ BUG_ON(ret == -EIOCBQUEUED);
+ return ret;
+}
+
+int sock_sendmsg(struct socket *sock, struct msghdr *msg)
+{
+ int err = security_socket_sendmsg(sock, msg,
+ msg_data_left(msg));
+
+ return err ?: sock_sendmsg_nosec(sock, msg);
+}
+EXPORT_SYMBOL(sock_sendmsg);
+
+int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size)
+{
+ iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size);
+ return sock_sendmsg(sock, msg);
+}
+EXPORT_SYMBOL(kernel_sendmsg);
+
+/*
+ * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
+ */
+void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+ struct scm_timestamping tss;
+ int empty = 1;
+ struct skb_shared_hwtstamps *shhwtstamps =
+ skb_hwtstamps(skb);
+
+ /* Race occurred between timestamp enabling and packet
+ receiving. Fill in the current time for now. */
+ if (need_software_tstamp && skb->tstamp.tv64 == 0)
+ __net_timestamp(skb);
+
+ if (need_software_tstamp) {
+ if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+ struct timeval tv;
+ skb_get_timestamp(skb, &tv);
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+ sizeof(tv), &tv);
+ } else {
+ struct timespec ts;
+ skb_get_timestampns(skb, &ts);
+ put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+ sizeof(ts), &ts);
+ }
+ }
+
+ memset(&tss, 0, sizeof(tss));
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+ ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
+ empty = 0;
+ if (shhwtstamps &&
+ (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
+ empty = 0;
+ if (!empty)
+ put_cmsg(msg, SOL_SOCKET,
+ SCM_TIMESTAMPING, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
+
+void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
+{
+ int ack;
+
+ if (!sock_flag(sk, SOCK_WIFI_STATUS))
+ return;
+ if (!skb->wifi_acked_valid)
+ return;
+
+ ack = skb->wifi_acked;
+
+ put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
+}
+EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
+
+static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount)
+ put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
+ sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
+}
+
+void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
+{
+ sock_recv_timestamp(msg, sk, skb);
+ sock_recv_drops(msg, sk, skb);
+}
+EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
+
+static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ return sock->ops->recvmsg(sock, msg, size, flags);
+}
+
+int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ int err = security_socket_recvmsg(sock, msg, size, flags);
+
+ return err ?: sock_recvmsg_nosec(sock, msg, size, flags);
+}
+EXPORT_SYMBOL(sock_recvmsg);
+
+/**
+ * kernel_recvmsg - Receive a message from a socket (kernel space)
+ * @sock: The socket to receive the message from
+ * @msg: Received message
+ * @vec: Input s/g array for message data
+ * @num: Size of input s/g array
+ * @size: Number of bytes to read
+ * @flags: Message flags (MSG_DONTWAIT, etc...)
+ *
+ * On return the msg structure contains the scatter/gather array passed in the
+ * vec argument. The array is modified so that it consists of the unfilled
+ * portion of the original array.
+ *
+ * The returned value is the total number of bytes received, or an error.
+ */
+int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size, int flags)
+{
+ mm_segment_t oldfs = get_fs();
+ int result;
+
+ iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);
+ set_fs(KERNEL_DS);
+ result = sock_recvmsg(sock, msg, size, flags);
+ set_fs(oldfs);
+ return result;
+}
+EXPORT_SYMBOL(kernel_recvmsg);
+
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more)
+{
+ struct socket *sock;
+ int flags;
+
+ sock = file->private_data;
+
+ flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ /* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
+ flags |= more;
+
+ return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct socket *sock = file->private_data;
+
+ if (unlikely(!sock->ops->splice_read))
+ return -EINVAL;
+
+ return sock->ops->splice_read(sock, ppos, pipe, len, flags);
+}
+
+static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ struct socket *sock = file->private_data;
+ struct msghdr msg = {.msg_iter = *to,
+ .msg_iocb = iocb};
+ ssize_t res;
+
+ if (file->f_flags & O_NONBLOCK)
+ msg.msg_flags = MSG_DONTWAIT;
+
+ if (iocb->ki_pos != 0)
+ return -ESPIPE;
+
+ if (!iov_iter_count(to)) /* Match SYS5 behaviour */
+ return 0;
+
+ res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags);
+ *to = msg.msg_iter;
+ return res;
+}
+
+static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct socket *sock = file->private_data;
+ struct msghdr msg = {.msg_iter = *from,
+ .msg_iocb = iocb};
+ ssize_t res;
+
+ if (iocb->ki_pos != 0)
+ return -ESPIPE;
+
+ if (file->f_flags & O_NONBLOCK)
+ msg.msg_flags = MSG_DONTWAIT;
+
+ if (sock->type == SOCK_SEQPACKET)
+ msg.msg_flags |= MSG_EOR;
+
+ res = sock_sendmsg(sock, &msg);
+ *from = msg.msg_iter;
+ return res;
+}
+
+/*
+ * Atomic setting of ioctl hooks to avoid race
+ * with module unload.
+ */
+
+static DEFINE_MUTEX(br_ioctl_mutex);
+static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
+
+void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
+{
+ mutex_lock(&br_ioctl_mutex);
+ br_ioctl_hook = hook;
+ mutex_unlock(&br_ioctl_mutex);
+}
+EXPORT_SYMBOL(brioctl_set);
+
+static DEFINE_MUTEX(vlan_ioctl_mutex);
+static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
+
+void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
+{
+ mutex_lock(&vlan_ioctl_mutex);
+ vlan_ioctl_hook = hook;
+ mutex_unlock(&vlan_ioctl_mutex);
+}
+EXPORT_SYMBOL(vlan_ioctl_set);
+
+static DEFINE_MUTEX(dlci_ioctl_mutex);
+static int (*dlci_ioctl_hook) (unsigned int, void __user *);
+
+void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
+{
+ mutex_lock(&dlci_ioctl_mutex);
+ dlci_ioctl_hook = hook;
+ mutex_unlock(&dlci_ioctl_mutex);
+}
+EXPORT_SYMBOL(dlci_ioctl_set);
+
+static long sock_do_ioctl(struct net *net, struct socket *sock,
+ unsigned int cmd, unsigned long arg)
+{
+ int err;
+ void __user *argp = (void __user *)arg;
+
+ err = sock->ops->ioctl(sock, cmd, arg);
+
+ /*
+ * If this ioctl is unknown try to hand it down
+ * to the NIC driver.
+ */
+ if (err == -ENOIOCTLCMD)
+ err = dev_ioctl(net, cmd, argp);
+
+ return err;
+}
+
+/*
+ * With an ioctl, arg may well be a user mode pointer, but we don't know
+ * what to do with it - that's up to the protocol still.
+ */
+
+static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ struct socket *sock;
+ struct sock *sk;
+ void __user *argp = (void __user *)arg;
+ int pid, err;
+ struct net *net;
+
+ sock = file->private_data;
+ sk = sock->sk;
+ net = sock_net(sk);
+ if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
+ err = dev_ioctl(net, cmd, argp);
+ } else
+#ifdef CONFIG_WEXT_CORE
+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+ err = dev_ioctl(net, cmd, argp);
+ } else
+#endif
+ switch (cmd) {
+ case FIOSETOWN:
+ case SIOCSPGRP:
+ err = -EFAULT;
+ if (get_user(pid, (int __user *)argp))
+ break;
+ f_setown(sock->file, pid, 1);
+ err = 0;
+ break;
+ case FIOGETOWN:
+ case SIOCGPGRP:
+ err = put_user(f_getown(sock->file),
+ (int __user *)argp);
+ break;
+ case SIOCGIFBR:
+ case SIOCSIFBR:
+ case SIOCBRADDBR:
+ case SIOCBRDELBR:
+ err = -ENOPKG;
+ if (!br_ioctl_hook)
+ request_module("bridge");
+
+ mutex_lock(&br_ioctl_mutex);
+ if (br_ioctl_hook)
+ err = br_ioctl_hook(net, cmd, argp);
+ mutex_unlock(&br_ioctl_mutex);
+ break;
+ case SIOCGIFVLAN:
+ case SIOCSIFVLAN:
+ err = -ENOPKG;
+ if (!vlan_ioctl_hook)
+ request_module("8021q");
+
+ mutex_lock(&vlan_ioctl_mutex);
+ if (vlan_ioctl_hook)
+ err = vlan_ioctl_hook(net, argp);
+ mutex_unlock(&vlan_ioctl_mutex);
+ break;
+ case SIOCADDDLCI:
+ case SIOCDELDLCI:
+ err = -ENOPKG;
+ if (!dlci_ioctl_hook)
+ request_module("dlci");
+
+ mutex_lock(&dlci_ioctl_mutex);
+ if (dlci_ioctl_hook)
+ err = dlci_ioctl_hook(cmd, argp);
+ mutex_unlock(&dlci_ioctl_mutex);
+ break;
+ default:
+ err = sock_do_ioctl(net, sock, cmd, arg);
+ break;
+ }
+ return err;
+}
+
+int sock_create_lite(int family, int type, int protocol, struct socket **res)
+{
+ int err;
+ struct socket *sock = NULL;
+
+ err = security_socket_create(family, type, protocol, 1);
+ if (err)
+ goto out;
+
+ sock = sock_alloc();
+ if (!sock) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sock->type = type;
+ err = security_socket_post_create(sock, family, type, protocol, 1);
+ if (err)
+ goto out_release;
+
+out:
+ *res = sock;
+ return err;
+out_release:
+ sock_release(sock);
+ sock = NULL;
+ goto out;
+}
+EXPORT_SYMBOL(sock_create_lite);
+
+/* No kernel lock held - perfect */
+static unsigned int sock_poll(struct file *file, poll_table *wait)
+{
+ unsigned int busy_flag = 0;
+ struct socket *sock;
+
+ /*
+ * We can't return errors to poll, so it's either yes or no.
+ */
+ sock = file->private_data;
+
+ if (sk_can_busy_loop(sock->sk)) {
+ /* this socket can poll_ll so tell the system call */
+ busy_flag = POLL_BUSY_LOOP;
+
+ /* once, only if requested by syscall */
+ if (wait && (wait->_key & POLL_BUSY_LOOP))
+ sk_busy_loop(sock->sk, 1);
+ }
+
+ return busy_flag | sock->ops->poll(file, sock, wait);
+}
+
+static int sock_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct socket *sock = file->private_data;
+
+ return sock->ops->mmap(file, sock, vma);
+}
+
+static int sock_close(struct inode *inode, struct file *filp)
+{
+ sock_release(SOCKET_I(inode));
+ return 0;
+}
+
+/*
+ * Update the socket async list
+ *
+ * Fasync_list locking strategy.
+ *
+ * 1. fasync_list is modified only under process context socket lock
+ * i.e. under semaphore.
+ * 2. fasync_list is used under read_lock(&sk->sk_callback_lock)
+ * or under socket lock
+ */
+
+static int sock_fasync(int fd, struct file *filp, int on)
+{
+ struct socket *sock = filp->private_data;
+ struct sock *sk = sock->sk;
+ struct socket_wq *wq;
+
+ if (sk == NULL)
+ return -EINVAL;
+
+ lock_sock(sk);
+ wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+ fasync_helper(fd, filp, on, &wq->fasync_list);
+
+ if (!wq->fasync_list)
+ sock_reset_flag(sk, SOCK_FASYNC);
+ else
+ sock_set_flag(sk, SOCK_FASYNC);
+
+ release_sock(sk);
+ return 0;
+}
+
+/* This function may be called only under socket lock or callback_lock or rcu_lock */
+
+int sock_wake_async(struct socket *sock, int how, int band)
+{
+ struct socket_wq *wq;
+
+ if (!sock)
+ return -1;
+ rcu_read_lock();
+ wq = rcu_dereference(sock->wq);
+ if (!wq || !wq->fasync_list) {
+ rcu_read_unlock();
+ return -1;
+ }
+ switch (how) {
+ case SOCK_WAKE_WAITD:
+ if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
+ break;
+ goto call_kill;
+ case SOCK_WAKE_SPACE:
+ if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
+ break;
+ /* fall through */
+ case SOCK_WAKE_IO:
+call_kill:
+ kill_fasync(&wq->fasync_list, SIGIO, band);
+ break;
+ case SOCK_WAKE_URG:
+ kill_fasync(&wq->fasync_list, SIGURG, band);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(sock_wake_async);
+
+int __sock_create(struct net *net, int family, int type, int protocol,
+ struct socket **res, int kern)
+{
+ int err;
+ struct socket *sock;
+ const struct net_proto_family *pf;
+
+ /*
+ * Check protocol is in range
+ */
+ if (family < 0 || family >= NPROTO)
+ return -EAFNOSUPPORT;
+ if (type < 0 || type >= SOCK_MAX)
+ return -EINVAL;
+
+ /* Compatibility.
+
+ This uglymoron is moved from INET layer to here to avoid
+ deadlock in module load.
+ */
+ if (family == PF_INET && type == SOCK_PACKET) {
+ static int warned;
+ if (!warned) {
+ warned = 1;
+ pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
+ current->comm);
+ }
+ family = PF_PACKET;
+ }
+
+ err = security_socket_create(family, type, protocol, kern);
+ if (err)
+ return err;
+
+ /*
+ * Allocate the socket and allow the family to set things up. if
+ * the protocol is 0, the family is instructed to select an appropriate
+ * default.
+ */
+ sock = sock_alloc();
+ if (!sock) {
+ net_warn_ratelimited("socket: no more sockets\n");
+ return -ENFILE; /* Not exactly a match, but its the
+ closest posix thing */
+ }
+
+ sock->type = type;
+
+#ifdef CONFIG_MODULES
+ /* Attempt to load a protocol module if the find failed.
+ *
+ * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
+ * requested real, full-featured networking support upon configuration.
+ * Otherwise module support will break!
+ */
+ if (rcu_access_pointer(net_families[family]) == NULL)
+ request_module("net-pf-%d", family);
+#endif
+
+ rcu_read_lock();
+ pf = rcu_dereference(net_families[family]);
+ err = -EAFNOSUPPORT;
+ if (!pf)
+ goto out_release;
+
+ /*
+ * We will call the ->create function, that possibly is in a loadable
+ * module, so we have to bump that loadable module refcnt first.
+ */
+ if (!try_module_get(pf->owner))
+ goto out_release;
+
+ /* Now protected by module ref count */
+ rcu_read_unlock();
+
+ err = pf->create(net, sock, protocol, kern);
+ if (err < 0)
+ goto out_module_put;
+
+ /*
+ * Now to bump the refcnt of the [loadable] module that owns this
+ * socket at sock_release time we decrement its refcnt.
+ */
+ if (!try_module_get(sock->ops->owner))
+ goto out_module_busy;
+
+ /*
+ * Now that we're done with the ->create function, the [loadable]
+ * module can have its refcnt decremented
+ */
+ module_put(pf->owner);
+ err = security_socket_post_create(sock, family, type, protocol, kern);
+ if (err)
+ goto out_sock_release;
+ *res = sock;
+
+ return 0;
+
+out_module_busy:
+ err = -EAFNOSUPPORT;
+out_module_put:
+ sock->ops = NULL;
+ module_put(pf->owner);
+out_sock_release:
+ sock_release(sock);
+ return err;
+
+out_release:
+ rcu_read_unlock();
+ goto out_sock_release;
+}
+EXPORT_SYMBOL(__sock_create);
+
+int sock_create(int family, int type, int protocol, struct socket **res)
+{
+ return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
+}
+EXPORT_SYMBOL(sock_create);
+
+int sock_create_kern(int family, int type, int protocol, struct socket **res)
+{
+ return __sock_create(&init_net, family, type, protocol, res, 1);
+}
+EXPORT_SYMBOL(sock_create_kern);
+
+SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
+{
+ int retval;
+ struct socket *sock;
+ int flags;
+
+ /* Check the SOCK_* constants for consistency. */
+ BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
+ BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
+ BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
+
+ flags = type & ~SOCK_TYPE_MASK;
+ if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ type &= SOCK_TYPE_MASK;
+
+ if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+ flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+ retval = sock_create(family, type, protocol, &sock);
+ if (retval < 0)
+ goto out;
+
+ retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
+ if (retval < 0)
+ goto out_release;
+
+out:
+ /* It may be already another descriptor 8) Not kernel problem. */
+ return retval;
+
+out_release:
+ sock_release(sock);
+ return retval;
+}
+
+/*
+ * Create a pair of connected sockets.
+ */
+
+SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
+ int __user *, usockvec)
+{
+ struct socket *sock1, *sock2;
+ int fd1, fd2, err;
+ struct file *newfile1, *newfile2;
+ int flags;
+
+ flags = type & ~SOCK_TYPE_MASK;
+ if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+ type &= SOCK_TYPE_MASK;
+
+ if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+ flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+ /*
+ * Obtain the first socket and check if the underlying protocol
+ * supports the socketpair call.
+ */
+
+ err = sock_create(family, type, protocol, &sock1);
+ if (err < 0)
+ goto out;
+
+ err = sock_create(family, type, protocol, &sock2);
+ if (err < 0)
+ goto out_release_1;
+
+ err = sock1->ops->socketpair(sock1, sock2);
+ if (err < 0)
+ goto out_release_both;
+
+ fd1 = get_unused_fd_flags(flags);
+ if (unlikely(fd1 < 0)) {
+ err = fd1;
+ goto out_release_both;
+ }
+
+ fd2 = get_unused_fd_flags(flags);
+ if (unlikely(fd2 < 0)) {
+ err = fd2;
+ goto out_put_unused_1;
+ }
+
+ newfile1 = sock_alloc_file(sock1, flags, NULL);
+ if (unlikely(IS_ERR(newfile1))) {
+ err = PTR_ERR(newfile1);
+ goto out_put_unused_both;
+ }
+
+ newfile2 = sock_alloc_file(sock2, flags, NULL);
+ if (IS_ERR(newfile2)) {
+ err = PTR_ERR(newfile2);
+ goto out_fput_1;
+ }
+
+ err = put_user(fd1, &usockvec[0]);
+ if (err)
+ goto out_fput_both;
+
+ err = put_user(fd2, &usockvec[1]);
+ if (err)
+ goto out_fput_both;
+
+ audit_fd_pair(fd1, fd2);
+
+ fd_install(fd1, newfile1);
+ fd_install(fd2, newfile2);
+ /* fd1 and fd2 may be already another descriptors.
+ * Not kernel problem.
+ */
+
+ return 0;
+
+out_fput_both:
+ fput(newfile2);
+ fput(newfile1);
+ put_unused_fd(fd2);
+ put_unused_fd(fd1);
+ goto out;
+
+out_fput_1:
+ fput(newfile1);
+ put_unused_fd(fd2);
+ put_unused_fd(fd1);
+ sock_release(sock2);
+ goto out;
+
+out_put_unused_both:
+ put_unused_fd(fd2);
+out_put_unused_1:
+ put_unused_fd(fd1);
+out_release_both:
+ sock_release(sock2);
+out_release_1:
+ sock_release(sock1);
+out:
+ return err;
+}
+
+/*
+ * Bind a name to a socket. Nothing much to do here since it's
+ * the protocol's responsibility to handle the local address.
+ *
+ * We move the socket address to kernel space before we call
+ * the protocol layer (having also checked the address is ok).
+ */
+
+SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
+{
+ struct socket *sock;
+ struct sockaddr_storage address;
+ int err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock) {
+ err = move_addr_to_kernel(umyaddr, addrlen, &address);
+ if (err >= 0) {
+ err = security_socket_bind(sock,
+ (struct sockaddr *)&address,
+ addrlen);
+ if (!err)
+ err = sock->ops->bind(sock,
+ (struct sockaddr *)
+ &address, addrlen);
+ }
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/*
+ * Perform a listen. Basically, we allow the protocol to do anything
+ * necessary for a listen, and if that works, we mark the socket as
+ * ready for listening.
+ */
+
+SYSCALL_DEFINE2(listen, int, fd, int, backlog)
+{
+ struct socket *sock;
+ int err, fput_needed;
+ int somaxconn;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock) {
+ somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
+ if ((unsigned int)backlog > somaxconn)
+ backlog = somaxconn;
+
+ err = security_socket_listen(sock, backlog);
+ if (!err)
+ err = sock->ops->listen(sock, backlog);
+
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/*
+ * For accept, we attempt to create a new socket, set up the link
+ * with the client, wake up the client, then return the new
+ * connected fd. We collect the address of the connector in kernel
+ * space and move it to user at the very end. This is unclean because
+ * we open the socket then return an error.
+ *
+ * 1003.1g adds the ability to recvmsg() to query connection pending
+ * status to recvmsg. We need to add that support in a way thats
+ * clean when we restucture accept also.
+ */
+
+SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
+ int __user *, upeer_addrlen, int, flags)
+{
+ struct socket *sock, *newsock;
+ struct file *newfile;
+ int err, len, newfd, fput_needed;
+ struct sockaddr_storage address;
+
+ if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return -EINVAL;
+
+ if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+ flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ err = -ENFILE;
+ newsock = sock_alloc();
+ if (!newsock)
+ goto out_put;
+
+ newsock->type = sock->type;
+ newsock->ops = sock->ops;
+
+ /*
+ * We don't need try_module_get here, as the listening socket (sock)
+ * has the protocol module (sock->ops->owner) held.
+ */
+ __module_get(newsock->ops->owner);
+
+ newfd = get_unused_fd_flags(flags);
+ if (unlikely(newfd < 0)) {
+ err = newfd;
+ sock_release(newsock);
+ goto out_put;
+ }
+ newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
+ if (unlikely(IS_ERR(newfile))) {
+ err = PTR_ERR(newfile);
+ put_unused_fd(newfd);
+ sock_release(newsock);
+ goto out_put;
+ }
+
+ err = security_socket_accept(sock, newsock);
+ if (err)
+ goto out_fd;
+
+ err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+ if (err < 0)
+ goto out_fd;
+
+ if (upeer_sockaddr) {
+ if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
+ &len, 2) < 0) {
+ err = -ECONNABORTED;
+ goto out_fd;
+ }
+ err = move_addr_to_user(&address,
+ len, upeer_sockaddr, upeer_addrlen);
+ if (err < 0)
+ goto out_fd;
+ }
+
+ /* File flags are not inherited via accept() unlike another OSes. */
+
+ fd_install(newfd, newfile);
+ err = newfd;
+
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+out_fd:
+ fput(newfile);
+ put_unused_fd(newfd);
+ goto out_put;
+}
+
+SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
+ int __user *, upeer_addrlen)
+{
+ return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
+}
+
+/*
+ * Attempt to connect to a socket with the server address. The address
+ * is in user space so we verify it is OK and move it to kernel space.
+ *
+ * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
+ * break bindings
+ *
+ * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
+ * other SEQPACKET protocols that take time to connect() as it doesn't
+ * include the -EINPROGRESS status for such sockets.
+ */
+
+SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
+ int, addrlen)
+{
+ struct socket *sock;
+ struct sockaddr_storage address;
+ int err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+ err = move_addr_to_kernel(uservaddr, addrlen, &address);
+ if (err < 0)
+ goto out_put;
+
+ err =
+ security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
+ if (err)
+ goto out_put;
+
+ err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
+ sock->file->f_flags);
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Get the local address ('name') of a socket object. Move the obtained
+ * name to user space.
+ */
+
+SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
+ int __user *, usockaddr_len)
+{
+ struct socket *sock;
+ struct sockaddr_storage address;
+ int len, err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ err = security_socket_getsockname(sock);
+ if (err)
+ goto out_put;
+
+ err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
+ if (err)
+ goto out_put;
+ err = move_addr_to_user(&address, len, usockaddr, usockaddr_len);
+
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Get the remote address ('name') of a socket object. Move the obtained
+ * name to user space.
+ */
+
+SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
+ int __user *, usockaddr_len)
+{
+ struct socket *sock;
+ struct sockaddr_storage address;
+ int len, err, fput_needed;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock != NULL) {
+ err = security_socket_getpeername(sock);
+ if (err) {
+ fput_light(sock->file, fput_needed);
+ return err;
+ }
+
+ err =
+ sock->ops->getname(sock, (struct sockaddr *)&address, &len,
+ 1);
+ if (!err)
+ err = move_addr_to_user(&address, len, usockaddr,
+ usockaddr_len);
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/*
+ * Send a datagram to a given address. We move the address into kernel
+ * space and check the user space data area is readable before invoking
+ * the protocol.
+ */
+
+SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
+ unsigned int, flags, struct sockaddr __user *, addr,
+ int, addr_len)
+{
+ struct socket *sock;
+ struct sockaddr_storage address;
+ int err;
+ struct msghdr msg;
+ struct iovec iov;
+ int fput_needed;
+
+ err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter);
+ if (unlikely(err))
+ return err;
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+ if (addr) {
+ err = move_addr_to_kernel(addr, addr_len, &address);
+ if (err < 0)
+ goto out_put;
+ msg.msg_name = (struct sockaddr *)&address;
+ msg.msg_namelen = addr_len;
+ }
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ msg.msg_flags = flags;
+ err = sock_sendmsg(sock, &msg);
+
+out_put:
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Send a datagram down a socket.
+ */
+
+SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
+ unsigned int, flags)
+{
+ return sys_sendto(fd, buff, len, flags, NULL, 0);
+}
+
+/*
+ * Receive a frame from the socket and optionally record the address of the
+ * sender. We verify the buffers are writable and if needed move the
+ * sender address from kernel to user space.
+ */
+
+SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
+ unsigned int, flags, struct sockaddr __user *, addr,
+ int __user *, addr_len)
+{
+ struct socket *sock;
+ struct iovec iov;
+ struct msghdr msg;
+ struct sockaddr_storage address;
+ int err, err2;
+ int fput_needed;
+
+ err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter);
+ if (unlikely(err))
+ return err;
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ /* Save some cycles and don't copy the address if not needed */
+ msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
+ /* We assume all kernel code knows the size of sockaddr_storage */
+ msg.msg_namelen = 0;
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
+
+ if (err >= 0 && addr != NULL) {
+ err2 = move_addr_to_user(&address,
+ msg.msg_namelen, addr, addr_len);
+ if (err2 < 0)
+ err = err2;
+ }
+
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+/*
+ * Receive a datagram from a socket.
+ */
+
+SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
+ unsigned int, flags)
+{
+ return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
+}
+
+/*
+ * Set a socket option. Because we don't know the option lengths we have
+ * to pass the user mode parameter for the protocols to sort out.
+ */
+
+SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
+ char __user *, optval, int, optlen)
+{
+ int err, fput_needed;
+ struct socket *sock;
+
+ if (optlen < 0)
+ return -EINVAL;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock != NULL) {
+ err = security_socket_setsockopt(sock, level, optname);
+ if (err)
+ goto out_put;
+
+ if (level == SOL_SOCKET)
+ err =
+ sock_setsockopt(sock, level, optname, optval,
+ optlen);
+ else
+ err =
+ sock->ops->setsockopt(sock, level, optname, optval,
+ optlen);
+out_put:
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/*
+ * Get a socket option. Because we don't know the option lengths we have
+ * to pass a user mode parameter for the protocols to sort out.
+ */
+
+SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
+ char __user *, optval, int __user *, optlen)
+{
+ int err, fput_needed;
+ struct socket *sock;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock != NULL) {
+ err = security_socket_getsockopt(sock, level, optname);
+ if (err)
+ goto out_put;
+
+ if (level == SOL_SOCKET)
+ err =
+ sock_getsockopt(sock, level, optname, optval,
+ optlen);
+ else
+ err =
+ sock->ops->getsockopt(sock, level, optname, optval,
+ optlen);
+out_put:
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/*
+ * Shutdown a socket.
+ */
+
+SYSCALL_DEFINE2(shutdown, int, fd, int, how)
+{
+ int err, fput_needed;
+ struct socket *sock;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (sock != NULL) {
+ err = security_socket_shutdown(sock, how);
+ if (!err)
+ err = sock->ops->shutdown(sock, how);
+ fput_light(sock->file, fput_needed);
+ }
+ return err;
+}
+
+/* A couple of helpful macros for getting the address of the 32/64 bit
+ * fields which are the same type (int / unsigned) on our platforms.
+ */
+#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
+#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen)
+#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags)
+
+struct used_address {
+ struct sockaddr_storage name;
+ unsigned int name_len;
+};
+
+static int copy_msghdr_from_user(struct msghdr *kmsg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ struct sockaddr __user *uaddr;
+ struct iovec __user *uiov;
+ size_t nr_segs;
+ ssize_t err;
+
+ if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
+ __get_user(uaddr, &umsg->msg_name) ||
+ __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
+ __get_user(uiov, &umsg->msg_iov) ||
+ __get_user(nr_segs, &umsg->msg_iovlen) ||
+ __get_user(kmsg->msg_control, &umsg->msg_control) ||
+ __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
+ __get_user(kmsg->msg_flags, &umsg->msg_flags))
+ return -EFAULT;
+
+ if (!uaddr)
+ kmsg->msg_namelen = 0;
+
+ if (kmsg->msg_namelen < 0)
+ return -EINVAL;
+
+ if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
+ kmsg->msg_namelen = sizeof(struct sockaddr_storage);
+
+ if (save_addr)
+ *save_addr = uaddr;
+
+ if (uaddr && kmsg->msg_namelen) {
+ if (!save_addr) {
+ err = move_addr_to_kernel(uaddr, kmsg->msg_namelen,
+ kmsg->msg_name);
+ if (err < 0)
+ return err;
+ }
+ } else {
+ kmsg->msg_name = NULL;
+ kmsg->msg_namelen = 0;
+ }
+
+ if (nr_segs > UIO_MAXIOV)
+ return -EMSGSIZE;
+
+ kmsg->msg_iocb = NULL;
+
+ return import_iovec(save_addr ? READ : WRITE, uiov, nr_segs,
+ UIO_FASTIOV, iov, &kmsg->msg_iter);
+}
+
+static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned int flags,
+ struct used_address *used_address)
+{
+ struct compat_msghdr __user *msg_compat =
+ (struct compat_msghdr __user *)msg;
+ struct sockaddr_storage address;
+ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+ unsigned char ctl[sizeof(struct cmsghdr) + 20]
+ __attribute__ ((aligned(sizeof(__kernel_size_t))));
+ /* 20 is size of ipv6_pktinfo */
+ unsigned char *ctl_buf = ctl;
+ int ctl_len;
+ ssize_t err;
+
+ msg_sys->msg_name = &address;
+
+ if (MSG_CMSG_COMPAT & flags)
+ err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
+ else
+ err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
+ if (err < 0)
+ return err;
+
+ err = -ENOBUFS;
+
+ if (msg_sys->msg_controllen > INT_MAX)
+ goto out_freeiov;
+ ctl_len = msg_sys->msg_controllen;
+ if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
+ err =
+ cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
+ sizeof(ctl));
+ if (err)
+ goto out_freeiov;
+ ctl_buf = msg_sys->msg_control;
+ ctl_len = msg_sys->msg_controllen;
+ } else if (ctl_len) {
+ if (ctl_len > sizeof(ctl)) {
+ ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
+ if (ctl_buf == NULL)
+ goto out_freeiov;
+ }
+ err = -EFAULT;
+ /*
+ * Careful! Before this, msg_sys->msg_control contains a user pointer.
+ * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
+ * checking falls down on this.
+ */
+ if (copy_from_user(ctl_buf,
+ (void __user __force *)msg_sys->msg_control,
+ ctl_len))
+ goto out_freectl;
+ msg_sys->msg_control = ctl_buf;
+ }
+ msg_sys->msg_flags = flags;
+
+ if (sock->file->f_flags & O_NONBLOCK)
+ msg_sys->msg_flags |= MSG_DONTWAIT;
+ /*
+ * If this is sendmmsg() and current destination address is same as
+ * previously succeeded address, omit asking LSM's decision.
+ * used_address->name_len is initialized to UINT_MAX so that the first
+ * destination address never matches.
+ */
+ if (used_address && msg_sys->msg_name &&
+ used_address->name_len == msg_sys->msg_namelen &&
+ !memcmp(&used_address->name, msg_sys->msg_name,
+ used_address->name_len)) {
+ err = sock_sendmsg_nosec(sock, msg_sys);
+ goto out_freectl;
+ }
+ err = sock_sendmsg(sock, msg_sys);
+ /*
+ * If this is sendmmsg() and sending to current destination address was
+ * successful, remember it.
+ */
+ if (used_address && err >= 0) {
+ used_address->name_len = msg_sys->msg_namelen;
+ if (msg_sys->msg_name)
+ memcpy(&used_address->name, msg_sys->msg_name,
+ used_address->name_len);
+ }
+
+out_freectl:
+ if (ctl_buf != ctl)
+ sock_kfree_s(sock->sk, ctl_buf, ctl_len);
+out_freeiov:
+ kfree(iov);
+ return err;
+}
+
+/*
+ * BSD sendmsg interface
+ */
+
+long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
+{
+ int fput_needed, err;
+ struct msghdr msg_sys;
+ struct socket *sock;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
+{
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+ return __sys_sendmsg(fd, msg, flags);
+}
+
+/*
+ * Linux sendmmsg interface
+ */
+
+int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+ unsigned int flags)
+{
+ int fput_needed, err, datagrams;
+ struct socket *sock;
+ struct mmsghdr __user *entry;
+ struct compat_mmsghdr __user *compat_entry;
+ struct msghdr msg_sys;
+ struct used_address used_address;
+
+ if (vlen > UIO_MAXIOV)
+ vlen = UIO_MAXIOV;
+
+ datagrams = 0;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ return err;
+
+ used_address.name_len = UINT_MAX;
+ entry = mmsg;
+ compat_entry = (struct compat_mmsghdr __user *)mmsg;
+ err = 0;
+
+ while (datagrams < vlen) {
+ if (MSG_CMSG_COMPAT & flags) {
+ err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
+ &msg_sys, flags, &used_address);
+ if (err < 0)
+ break;
+ err = __put_user(err, &compat_entry->msg_len);
+ ++compat_entry;
+ } else {
+ err = ___sys_sendmsg(sock,
+ (struct user_msghdr __user *)entry,
+ &msg_sys, flags, &used_address);
+ if (err < 0)
+ break;
+ err = put_user(err, &entry->msg_len);
+ ++entry;
+ }
+
+ if (err)
+ break;
+ ++datagrams;
+ }
+
+ fput_light(sock->file, fput_needed);
+
+ /* We only return an error if no datagrams were able to be sent */
+ if (datagrams != 0)
+ return datagrams;
+
+ return err;
+}
+
+SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags)
+{
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+ return __sys_sendmmsg(fd, mmsg, vlen, flags);
+}
+
+static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned int flags, int nosec)
+{
+ struct compat_msghdr __user *msg_compat =
+ (struct compat_msghdr __user *)msg;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ unsigned long cmsg_ptr;
+ int total_len, len;
+ ssize_t err;
+
+ /* kernel mode address */
+ struct sockaddr_storage addr;
+
+ /* user mode address pointers */
+ struct sockaddr __user *uaddr;
+ int __user *uaddr_len = COMPAT_NAMELEN(msg);
+
+ msg_sys->msg_name = &addr;
+
+ if (MSG_CMSG_COMPAT & flags)
+ err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
+ else
+ err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
+ if (err < 0)
+ return err;
+ total_len = iov_iter_count(&msg_sys->msg_iter);
+
+ cmsg_ptr = (unsigned long)msg_sys->msg_control;
+ msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+
+ /* We assume all kernel code knows the size of sockaddr_storage */
+ msg_sys->msg_namelen = 0;
+
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+ total_len, flags);
+ if (err < 0)
+ goto out_freeiov;
+ len = err;
+
+ if (uaddr != NULL) {
+ err = move_addr_to_user(&addr,
+ msg_sys->msg_namelen, uaddr,
+ uaddr_len);
+ if (err < 0)
+ goto out_freeiov;
+ }
+ err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
+ COMPAT_FLAGS(msg));
+ if (err)
+ goto out_freeiov;
+ if (MSG_CMSG_COMPAT & flags)
+ err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
+ &msg_compat->msg_controllen);
+ else
+ err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
+ &msg->msg_controllen);
+ if (err)
+ goto out_freeiov;
+ err = len;
+
+out_freeiov:
+ kfree(iov);
+ return err;
+}
+
+/*
+ * BSD recvmsg interface
+ */
+
+long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
+{
+ int fput_needed, err;
+ struct msghdr msg_sys;
+ struct socket *sock;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ goto out;
+
+ err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
+ fput_light(sock->file, fput_needed);
+out:
+ return err;
+}
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
+ unsigned int, flags)
+{
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+ return __sys_recvmsg(fd, msg, flags);
+}
+
+/*
+ * Linux recvmmsg interface
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout)
+{
+ int fput_needed, err, datagrams;
+ struct socket *sock;
+ struct mmsghdr __user *entry;
+ struct compat_mmsghdr __user *compat_entry;
+ struct msghdr msg_sys;
+ struct timespec end_time;
+
+ if (timeout &&
+ poll_select_set_timeout(&end_time, timeout->tv_sec,
+ timeout->tv_nsec))
+ return -EINVAL;
+
+ datagrams = 0;
+
+ sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ if (!sock)
+ return err;
+
+ err = sock_error(sock->sk);
+ if (err)
+ goto out_put;
+
+ entry = mmsg;
+ compat_entry = (struct compat_mmsghdr __user *)mmsg;
+
+ while (datagrams < vlen) {
+ /*
+ * No need to ask LSM for more than the first datagram.
+ */
+ if (MSG_CMSG_COMPAT & flags) {
+ err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
+ &msg_sys, flags & ~MSG_WAITFORONE,
+ datagrams);
+ if (err < 0)
+ break;
+ err = __put_user(err, &compat_entry->msg_len);
+ ++compat_entry;
+ } else {
+ err = ___sys_recvmsg(sock,
+ (struct user_msghdr __user *)entry,
+ &msg_sys, flags & ~MSG_WAITFORONE,
+ datagrams);
+ if (err < 0)
+ break;
+ err = put_user(err, &entry->msg_len);
+ ++entry;
+ }
+
+ if (err)
+ break;
+ ++datagrams;
+
+ /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
+ if (flags & MSG_WAITFORONE)
+ flags |= MSG_DONTWAIT;
+
+ if (timeout) {
+ ktime_get_ts(timeout);
+ *timeout = timespec_sub(end_time, *timeout);
+ if (timeout->tv_sec < 0) {
+ timeout->tv_sec = timeout->tv_nsec = 0;
+ break;
+ }
+
+ /* Timeout, return less than vlen datagrams */
+ if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+ break;
+ }
+
+ /* Out of band data, return right away */
+ if (msg_sys.msg_flags & MSG_OOB)
+ break;
+ }
+
+out_put:
+ fput_light(sock->file, fput_needed);
+
+ if (err == 0)
+ return datagrams;
+
+ if (datagrams != 0) {
+ /*
+ * We may return less entries than requested (vlen) if the
+ * sock is non block and there aren't enough datagrams...
+ */
+ if (err != -EAGAIN) {
+ /*
+ * ... or if recvmsg returns an error after we
+ * received some datagrams, where we record the
+ * error to return on the next call or if the
+ * app asks about it using getsockopt(SO_ERROR).
+ */
+ sock->sk->sk_err = -err;
+ }
+
+ return datagrams;
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct timespec __user *, timeout)
+{
+ int datagrams;
+ struct timespec timeout_sys;
+
+ if (flags & MSG_CMSG_COMPAT)
+ return -EINVAL;
+
+ if (!timeout)
+ return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+ if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+ return -EFAULT;
+
+ datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+
+ if (datagrams > 0 &&
+ copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+ datagrams = -EFAULT;
+
+ return datagrams;
+}
+
+#ifdef __ARCH_WANT_SYS_SOCKETCALL
+/* Argument list sizes for sys_socketcall */
+#define AL(x) ((x) * sizeof(unsigned long))
+static const unsigned char nargs[21] = {
+ AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+ AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+ AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+ AL(4), AL(5), AL(4)
+};
+
+#undef AL
+
+/*
+ * System call vectors.
+ *
+ * Argument checking cleaned up. Saved 20% in size.
+ * This function doesn't need to set the kernel lock because
+ * it is set by the callees.
+ */
+
+SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
+{
+ unsigned long a[AUDITSC_ARGS];
+ unsigned long a0, a1;
+ int err;
+ unsigned int len;
+
+ if (call < 1 || call > SYS_SENDMMSG)
+ return -EINVAL;
+
+ len = nargs[call];
+ if (len > sizeof(a))
+ return -EINVAL;
+
+ /* copy_from_user should be SMP safe. */
+ if (copy_from_user(a, args, len))
+ return -EFAULT;
+
+ err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
+ if (err)
+ return err;
+
+ a0 = a[0];
+ a1 = a[1];
+
+ switch (call) {
+ case SYS_SOCKET:
+ err = sys_socket(a0, a1, a[2]);
+ break;
+ case SYS_BIND:
+ err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
+ break;
+ case SYS_CONNECT:
+ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
+ break;
+ case SYS_LISTEN:
+ err = sys_listen(a0, a1);
+ break;
+ case SYS_ACCEPT:
+ err = sys_accept4(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2], 0);
+ break;
+ case SYS_GETSOCKNAME:
+ err =
+ sys_getsockname(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2]);
+ break;
+ case SYS_GETPEERNAME:
+ err =
+ sys_getpeername(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2]);
+ break;
+ case SYS_SOCKETPAIR:
+ err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
+ break;
+ case SYS_SEND:
+ err = sys_send(a0, (void __user *)a1, a[2], a[3]);
+ break;
+ case SYS_SENDTO:
+ err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
+ (struct sockaddr __user *)a[4], a[5]);
+ break;
+ case SYS_RECV:
+ err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
+ break;
+ case SYS_RECVFROM:
+ err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
+ (struct sockaddr __user *)a[4],
+ (int __user *)a[5]);
+ break;
+ case SYS_SHUTDOWN:
+ err = sys_shutdown(a0, a1);
+ break;
+ case SYS_SETSOCKOPT:
+ err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
+ break;
+ case SYS_GETSOCKOPT:
+ err =
+ sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
+ (int __user *)a[4]);
+ break;
+ case SYS_SENDMSG:
+ err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]);
+ break;
+ case SYS_SENDMMSG:
+ err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
+ break;
+ case SYS_RECVMSG:
+ err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]);
+ break;
+ case SYS_RECVMMSG:
+ err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
+ (struct timespec __user *)a[4]);
+ break;
+ case SYS_ACCEPT4:
+ err = sys_accept4(a0, (struct sockaddr __user *)a1,
+ (int __user *)a[2], a[3]);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+#endif /* __ARCH_WANT_SYS_SOCKETCALL */
+
+/**
+ * sock_register - add a socket protocol handler
+ * @ops: description of protocol
+ *
+ * This function is called by a protocol handler that wants to
+ * advertise its address family, and have it linked into the
+ * socket interface. The value ops->family corresponds to the
+ * socket system call protocol family.
+ */
+int sock_register(const struct net_proto_family *ops)
+{
+ int err;
+
+ if (ops->family >= NPROTO) {
+ pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
+ return -ENOBUFS;
+ }
+
+ spin_lock(&net_family_lock);
+ if (rcu_dereference_protected(net_families[ops->family],
+ lockdep_is_held(&net_family_lock)))
+ err = -EEXIST;
+ else {
+ rcu_assign_pointer(net_families[ops->family], ops);
+ err = 0;
+ }
+ spin_unlock(&net_family_lock);
+
+ pr_info("NET: Registered protocol family %d\n", ops->family);
+ return err;
+}
+EXPORT_SYMBOL(sock_register);
+
+/**
+ * sock_unregister - remove a protocol handler
+ * @family: protocol family to remove
+ *
+ * This function is called by a protocol handler that wants to
+ * remove its address family, and have it unlinked from the
+ * new socket creation.
+ *
+ * If protocol handler is a module, then it can use module reference
+ * counts to protect against new references. If protocol handler is not
+ * a module then it needs to provide its own protection in
+ * the ops->create routine.
+ */
+void sock_unregister(int family)
+{
+ BUG_ON(family < 0 || family >= NPROTO);
+
+ spin_lock(&net_family_lock);
+ RCU_INIT_POINTER(net_families[family], NULL);
+ spin_unlock(&net_family_lock);
+
+ synchronize_rcu();
+
+ pr_info("NET: Unregistered protocol family %d\n", family);
+}
+EXPORT_SYMBOL(sock_unregister);
+
+static int __init sock_init(void)
+{
+ int err;
+ /*
+ * Initialize the network sysctl infrastructure.
+ */
+ err = net_sysctl_init();
+ if (err)
+ goto out;
+
+ /*
+ * Initialize skbuff SLAB cache
+ */
+ skb_init();
+
+ /*
+ * Initialize the protocols module.
+ */
+
+ init_inodecache();
+
+ err = register_filesystem(&sock_fs_type);
+ if (err)
+ goto out_fs;
+ sock_mnt = kern_mount(&sock_fs_type);
+ if (IS_ERR(sock_mnt)) {
+ err = PTR_ERR(sock_mnt);
+ goto out_mount;
+ }
+
+ /* The real protocol initialization is performed in later initcalls.
+ */
+
+#ifdef CONFIG_NETFILTER
+ err = netfilter_init();
+ if (err)
+ goto out;
+#endif
+
+ ptp_classifier_init();
+
+out:
+ return err;
+
+out_mount:
+ unregister_filesystem(&sock_fs_type);
+out_fs:
+ goto out;
+}
+
+core_initcall(sock_init); /* early initcall */
+
+#ifdef CONFIG_PROC_FS
+void socket_seq_show(struct seq_file *seq)
+{
+ int cpu;
+ int counter = 0;
+
+ for_each_possible_cpu(cpu)
+ counter += per_cpu(sockets_in_use, cpu);
+
+ /* It can be negative, by the way. 8) */
+ if (counter < 0)
+ counter = 0;
+
+ seq_printf(seq, "sockets: used %d\n", counter);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_COMPAT
+static int do_siocgstamp(struct net *net, struct socket *sock,
+ unsigned int cmd, void __user *up)
+{
+ mm_segment_t old_fs = get_fs();
+ struct timeval ktv;
+ int err;
+
+ set_fs(KERNEL_DS);
+ err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
+ set_fs(old_fs);
+ if (!err)
+ err = compat_put_timeval(&ktv, up);
+
+ return err;
+}
+
+static int do_siocgstampns(struct net *net, struct socket *sock,
+ unsigned int cmd, void __user *up)
+{
+ mm_segment_t old_fs = get_fs();
+ struct timespec kts;
+ int err;
+
+ set_fs(KERNEL_DS);
+ err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
+ set_fs(old_fs);
+ if (!err)
+ err = compat_put_timespec(&kts, up);
+
+ return err;
+}
+
+static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32)
+{
+ struct ifreq __user *uifr;
+ int err;
+
+ uifr = compat_alloc_user_space(sizeof(struct ifreq));
+ if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+
+ err = dev_ioctl(net, SIOCGIFNAME, uifr);
+ if (err)
+ return err;
+
+ if (copy_in_user(uifr32, uifr, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
+{
+ struct compat_ifconf ifc32;
+ struct ifconf ifc;
+ struct ifconf __user *uifc;
+ struct compat_ifreq __user *ifr32;
+ struct ifreq __user *ifr;
+ unsigned int i, j;
+ int err;
+
+ if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
+ return -EFAULT;
+
+ memset(&ifc, 0, sizeof(ifc));
+ if (ifc32.ifcbuf == 0) {
+ ifc32.ifc_len = 0;
+ ifc.ifc_len = 0;
+ ifc.ifc_req = NULL;
+ uifc = compat_alloc_user_space(sizeof(struct ifconf));
+ } else {
+ size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) *
+ sizeof(struct ifreq);
+ uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
+ ifc.ifc_len = len;
+ ifr = ifc.ifc_req = (void __user *)(uifc + 1);
+ ifr32 = compat_ptr(ifc32.ifcbuf);
+ for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) {
+ if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+ ifr++;
+ ifr32++;
+ }
+ }
+ if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ err = dev_ioctl(net, SIOCGIFCONF, uifc);
+ if (err)
+ return err;
+
+ if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ ifr = ifc.ifc_req;
+ ifr32 = compat_ptr(ifc32.ifcbuf);
+ for (i = 0, j = 0;
+ i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
+ i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) {
+ if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+ ifr32++;
+ ifr++;
+ }
+
+ if (ifc32.ifcbuf == 0) {
+ /* Translate from 64-bit structure multiple to
+ * a 32-bit one.
+ */
+ i = ifc.ifc_len;
+ i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq));
+ ifc32.ifc_len = i;
+ } else {
+ ifc32.ifc_len = i;
+ }
+ if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
+{
+ struct compat_ethtool_rxnfc __user *compat_rxnfc;
+ bool convert_in = false, convert_out = false;
+ size_t buf_size = ALIGN(sizeof(struct ifreq), 8);
+ struct ethtool_rxnfc __user *rxnfc;
+ struct ifreq __user *ifr;
+ u32 rule_cnt = 0, actual_rule_cnt;
+ u32 ethcmd;
+ u32 data;
+ int ret;
+
+ if (get_user(data, &ifr32->ifr_ifru.ifru_data))
+ return -EFAULT;
+
+ compat_rxnfc = compat_ptr(data);
+
+ if (get_user(ethcmd, &compat_rxnfc->cmd))
+ return -EFAULT;
+
+ /* Most ethtool structures are defined without padding.
+ * Unfortunately struct ethtool_rxnfc is an exception.
+ */
+ switch (ethcmd) {
+ default:
+ break;
+ case ETHTOOL_GRXCLSRLALL:
+ /* Buffer size is variable */
+ if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
+ return -EFAULT;
+ if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
+ return -ENOMEM;
+ buf_size += rule_cnt * sizeof(u32);
+ /* fall through */
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_SRXCLSRLINS:
+ convert_out = true;
+ /* fall through */
+ case ETHTOOL_SRXCLSRLDEL:
+ buf_size += sizeof(struct ethtool_rxnfc);
+ convert_in = true;
+ break;
+ }
+
+ ifr = compat_alloc_user_space(buf_size);
+ rxnfc = (void __user *)ifr + ALIGN(sizeof(struct ifreq), 8);
+
+ if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
+ return -EFAULT;
+
+ if (put_user(convert_in ? rxnfc : compat_ptr(data),
+ &ifr->ifr_ifru.ifru_data))
+ return -EFAULT;
+
+ if (convert_in) {
+ /* We expect there to be holes between fs.m_ext and
+ * fs.ring_cookie and at the end of fs, but nowhere else.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+ sizeof(compat_rxnfc->fs.m_ext) !=
+ offsetof(struct ethtool_rxnfc, fs.m_ext) +
+ sizeof(rxnfc->fs.m_ext));
+ BUILD_BUG_ON(
+ offsetof(struct compat_ethtool_rxnfc, fs.location) -
+ offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+ offsetof(struct ethtool_rxnfc, fs.location) -
+ offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+ if (copy_in_user(rxnfc, compat_rxnfc,
+ (void __user *)(&rxnfc->fs.m_ext + 1) -
+ (void __user *)rxnfc) ||
+ copy_in_user(&rxnfc->fs.ring_cookie,
+ &compat_rxnfc->fs.ring_cookie,
+ (void __user *)(&rxnfc->fs.location + 1) -
+ (void __user *)&rxnfc->fs.ring_cookie) ||
+ copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt,
+ sizeof(rxnfc->rule_cnt)))
+ return -EFAULT;
+ }
+
+ ret = dev_ioctl(net, SIOCETHTOOL, ifr);
+ if (ret)
+ return ret;
+
+ if (convert_out) {
+ if (copy_in_user(compat_rxnfc, rxnfc,
+ (const void __user *)(&rxnfc->fs.m_ext + 1) -
+ (const void __user *)rxnfc) ||
+ copy_in_user(&compat_rxnfc->fs.ring_cookie,
+ &rxnfc->fs.ring_cookie,
+ (const void __user *)(&rxnfc->fs.location + 1) -
+ (const void __user *)&rxnfc->fs.ring_cookie) ||
+ copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
+ sizeof(rxnfc->rule_cnt)))
+ return -EFAULT;
+
+ if (ethcmd == ETHTOOL_GRXCLSRLALL) {
+ /* As an optimisation, we only copy the actual
+ * number of rules that the underlying
+ * function returned. Since Mallory might
+ * change the rule count in user memory, we
+ * check that it is less than the rule count
+ * originally given (as the user buffer size),
+ * which has been range-checked.
+ */
+ if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
+ return -EFAULT;
+ if (actual_rule_cnt < rule_cnt)
+ rule_cnt = actual_rule_cnt;
+ if (copy_in_user(&compat_rxnfc->rule_locs[0],
+ &rxnfc->rule_locs[0],
+ rule_cnt * sizeof(u32)))
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
+{
+ void __user *uptr;
+ compat_uptr_t uptr32;
+ struct ifreq __user *uifr;
+
+ uifr = compat_alloc_user_space(sizeof(*uifr));
+ if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+
+ if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
+ return -EFAULT;
+
+ uptr = compat_ptr(uptr32);
+
+ if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc))
+ return -EFAULT;
+
+ return dev_ioctl(net, SIOCWANDEV, uifr);
+}
+
+static int bond_ioctl(struct net *net, unsigned int cmd,
+ struct compat_ifreq __user *ifr32)
+{
+ struct ifreq kifr;
+ mm_segment_t old_fs;
+ int err;
+
+ switch (cmd) {
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq)))
+ return -EFAULT;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = dev_ioctl(net, cmd,
+ (struct ifreq __user __force *) &kifr);
+ set_fs(old_fs);
+
+ return err;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+/* Handle ioctls that use ifreq::ifr_data and just need struct ifreq converted */
+static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
+ struct compat_ifreq __user *u_ifreq32)
+{
+ struct ifreq __user *u_ifreq64;
+ char tmp_buf[IFNAMSIZ];
+ void __user *data64;
+ u32 data32;
+
+ if (copy_from_user(&tmp_buf[0], &(u_ifreq32->ifr_ifrn.ifrn_name[0]),
+ IFNAMSIZ))
+ return -EFAULT;
+ if (get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
+ return -EFAULT;
+ data64 = compat_ptr(data32);
+
+ u_ifreq64 = compat_alloc_user_space(sizeof(*u_ifreq64));
+
+ if (copy_to_user(&u_ifreq64->ifr_ifrn.ifrn_name[0], &tmp_buf[0],
+ IFNAMSIZ))
+ return -EFAULT;
+ if (put_user(data64, &u_ifreq64->ifr_ifru.ifru_data))
+ return -EFAULT;
+
+ return dev_ioctl(net, cmd, u_ifreq64);
+}
+
+static int dev_ifsioc(struct net *net, struct socket *sock,
+ unsigned int cmd, struct compat_ifreq __user *uifr32)
+{
+ struct ifreq __user *uifr;
+ int err;
+
+ uifr = compat_alloc_user_space(sizeof(*uifr));
+ if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
+ return -EFAULT;
+
+ err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
+
+ if (!err) {
+ switch (cmd) {
+ case SIOCGIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCGIFMEM:
+ case SIOCGIFHWADDR:
+ case SIOCGIFINDEX:
+ case SIOCGIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCGIFNETMASK:
+ case SIOCGIFPFLAGS:
+ case SIOCGIFTXQLEN:
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
+ err = -EFAULT;
+ break;
+ }
+ }
+ return err;
+}
+
+static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
+ struct compat_ifreq __user *uifr32)
+{
+ struct ifreq ifr;
+ struct compat_ifmap __user *uifmap32;
+ mm_segment_t old_fs;
+ int err;
+
+ uifmap32 = &uifr32->ifr_ifru.ifru_map;
+ err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
+ err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
+ err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
+ err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
+ err |= get_user(ifr.ifr_map.irq, &uifmap32->irq);
+ err |= get_user(ifr.ifr_map.dma, &uifmap32->dma);
+ err |= get_user(ifr.ifr_map.port, &uifmap32->port);
+ if (err)
+ return -EFAULT;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = dev_ioctl(net, cmd, (void __user __force *)&ifr);
+ set_fs(old_fs);
+
+ if (cmd == SIOCGIFMAP && !err) {
+ err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
+ err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
+ err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
+ err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
+ err |= put_user(ifr.ifr_map.irq, &uifmap32->irq);
+ err |= put_user(ifr.ifr_map.dma, &uifmap32->dma);
+ err |= put_user(ifr.ifr_map.port, &uifmap32->port);
+ if (err)
+ err = -EFAULT;
+ }
+ return err;
+}
+
+struct rtentry32 {
+ u32 rt_pad1;
+ struct sockaddr rt_dst; /* target address */
+ struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */
+ struct sockaddr rt_genmask; /* target network mask (IP) */
+ unsigned short rt_flags;
+ short rt_pad2;
+ u32 rt_pad3;
+ unsigned char rt_tos;
+ unsigned char rt_class;
+ short rt_pad4;
+ short rt_metric; /* +1 for binary compatibility! */
+ /* char * */ u32 rt_dev; /* forcing the device at add */
+ u32 rt_mtu; /* per route MTU/Window */
+ u32 rt_window; /* Window clamping */
+ unsigned short rt_irtt; /* Initial RTT */
+};
+
+struct in6_rtmsg32 {
+ struct in6_addr rtmsg_dst;
+ struct in6_addr rtmsg_src;
+ struct in6_addr rtmsg_gateway;
+ u32 rtmsg_type;
+ u16 rtmsg_dst_len;
+ u16 rtmsg_src_len;
+ u32 rtmsg_metric;
+ u32 rtmsg_info;
+ u32 rtmsg_flags;
+ s32 rtmsg_ifindex;
+};
+
+static int routing_ioctl(struct net *net, struct socket *sock,
+ unsigned int cmd, void __user *argp)
+{
+ int ret;
+ void *r = NULL;
+ struct in6_rtmsg r6;
+ struct rtentry r4;
+ char devname[16];
+ u32 rtdev;
+ mm_segment_t old_fs = get_fs();
+
+ if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */
+ struct in6_rtmsg32 __user *ur6 = argp;
+ ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
+ 3 * sizeof(struct in6_addr));
+ ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
+ ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
+ ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
+ ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
+ ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
+ ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
+ ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
+
+ r = (void *) &r6;
+ } else { /* ipv4 */
+ struct rtentry32 __user *ur4 = argp;
+ ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
+ 3 * sizeof(struct sockaddr));
+ ret |= get_user(r4.rt_flags, &(ur4->rt_flags));
+ ret |= get_user(r4.rt_metric, &(ur4->rt_metric));
+ ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu));
+ ret |= get_user(r4.rt_window, &(ur4->rt_window));
+ ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt));
+ ret |= get_user(rtdev, &(ur4->rt_dev));
+ if (rtdev) {
+ ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
+ r4.rt_dev = (char __user __force *)devname;
+ devname[15] = 0;
+ } else
+ r4.rt_dev = NULL;
+
+ r = (void *) &r4;
+ }
+
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ set_fs(KERNEL_DS);
+ ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
+ set_fs(old_fs);
+
+out:
+ return ret;
+}
+
+/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
+ * for some operations; this forces use of the newer bridge-utils that
+ * use compatible ioctls
+ */
+static int old_bridge_ioctl(compat_ulong_t __user *argp)
+{
+ compat_ulong_t tmp;
+
+ if (get_user(tmp, argp))
+ return -EFAULT;
+ if (tmp == BRCTL_GET_VERSION)
+ return BRCTL_VERSION + 1;
+ return -EINVAL;
+}
+
+static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
+ unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+
+ if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
+ return compat_ifr_data_ioctl(net, cmd, argp);
+
+ switch (cmd) {
+ case SIOCSIFBR:
+ case SIOCGIFBR:
+ return old_bridge_ioctl(argp);
+ case SIOCGIFNAME:
+ return dev_ifname32(net, argp);
+ case SIOCGIFCONF:
+ return dev_ifconf(net, argp);
+ case SIOCETHTOOL:
+ return ethtool_ioctl(net, argp);
+ case SIOCWANDEV:
+ return compat_siocwandev(net, argp);
+ case SIOCGIFMAP:
+ case SIOCSIFMAP:
+ return compat_sioc_ifmap(net, cmd, argp);
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDCHANGEACTIVE:
+ return bond_ioctl(net, cmd, argp);
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return routing_ioctl(net, sock, cmd, argp);
+ case SIOCGSTAMP:
+ return do_siocgstamp(net, sock, cmd, argp);
+ case SIOCGSTAMPNS:
+ return do_siocgstampns(net, sock, cmd, argp);
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ case SIOCSHWTSTAMP:
+ case SIOCGHWTSTAMP:
+ return compat_ifr_data_ioctl(net, cmd, argp);
+
+ case FIOSETOWN:
+ case SIOCSPGRP:
+ case FIOGETOWN:
+ case SIOCGPGRP:
+ case SIOCBRADDBR:
+ case SIOCBRDELBR:
+ case SIOCGIFVLAN:
+ case SIOCSIFVLAN:
+ case SIOCADDDLCI:
+ case SIOCDELDLCI:
+ return sock_ioctl(file, cmd, arg);
+
+ case SIOCGIFFLAGS:
+ case SIOCSIFFLAGS:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ case SIOCGIFMTU:
+ case SIOCSIFMTU:
+ case SIOCGIFMEM:
+ case SIOCSIFMEM:
+ case SIOCGIFHWADDR:
+ case SIOCSIFHWADDR:
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ case SIOCGIFINDEX:
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCSIFHWBROADCAST:
+ case SIOCDIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCGIFTXQLEN:
+ case SIOCSIFTXQLEN:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ case SIOCSIFNAME:
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSMIIREG:
+ return dev_ifsioc(net, sock, cmd, argp);
+
+ case SIOCSARP:
+ case SIOCGARP:
+ case SIOCDARP:
+ case SIOCATMARK:
+ return sock_do_ioctl(net, sock, cmd, arg);
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static long compat_sock_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct socket *sock = file->private_data;
+ int ret = -ENOIOCTLCMD;
+ struct sock *sk;
+ struct net *net;
+
+ sk = sock->sk;
+ net = sock_net(sk);
+
+ if (sock->ops->compat_ioctl)
+ ret = sock->ops->compat_ioctl(sock, cmd, arg);
+
+ if (ret == -ENOIOCTLCMD &&
+ (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST))
+ ret = compat_wext_handle_ioctl(net, cmd, arg);
+
+ if (ret == -ENOIOCTLCMD)
+ ret = compat_sock_ioctl_trans(file, sock, cmd, arg);
+
+ return ret;
+}
+#endif
+
+int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
+{
+ return sock->ops->bind(sock, addr, addrlen);
+}
+EXPORT_SYMBOL(kernel_bind);
+
+int kernel_listen(struct socket *sock, int backlog)
+{
+ return sock->ops->listen(sock, backlog);
+}
+EXPORT_SYMBOL(kernel_listen);
+
+int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+
+ err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+ newsock);
+ if (err < 0)
+ goto done;
+
+ err = sock->ops->accept(sock, *newsock, flags);
+ if (err < 0) {
+ sock_release(*newsock);
+ *newsock = NULL;
+ goto done;
+ }
+
+ (*newsock)->ops = sock->ops;
+ __module_get((*newsock)->ops->owner);
+
+done:
+ return err;
+}
+EXPORT_SYMBOL(kernel_accept);
+
+int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
+ int flags)
+{
+ return sock->ops->connect(sock, addr, addrlen, flags);
+}
+EXPORT_SYMBOL(kernel_connect);
+
+int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
+ int *addrlen)
+{
+ return sock->ops->getname(sock, addr, addrlen, 0);
+}
+EXPORT_SYMBOL(kernel_getsockname);
+
+int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
+ int *addrlen)
+{
+ return sock->ops->getname(sock, addr, addrlen, 1);
+}
+EXPORT_SYMBOL(kernel_getpeername);
+
+int kernel_getsockopt(struct socket *sock, int level, int optname,
+ char *optval, int *optlen)
+{
+ mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
+ int __user *uoptlen;
+ int err;
+
+ uoptval = (char __user __force *) optval;
+ uoptlen = (int __user __force *) optlen;
+
+ set_fs(KERNEL_DS);
+ if (level == SOL_SOCKET)
+ err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
+ else
+ err = sock->ops->getsockopt(sock, level, optname, uoptval,
+ uoptlen);
+ set_fs(oldfs);
+ return err;
+}
+EXPORT_SYMBOL(kernel_getsockopt);
+
+int kernel_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, unsigned int optlen)
+{
+ mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
+ int err;
+
+ uoptval = (char __user __force *) optval;
+
+ set_fs(KERNEL_DS);
+ if (level == SOL_SOCKET)
+ err = sock_setsockopt(sock, level, optname, uoptval, optlen);
+ else
+ err = sock->ops->setsockopt(sock, level, optname, uoptval,
+ optlen);
+ set_fs(oldfs);
+ return err;
+}
+EXPORT_SYMBOL(kernel_setsockopt);
+
+int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+{
+ if (sock->ops->sendpage)
+ return sock->ops->sendpage(sock, page, offset, size, flags);
+
+ return sock_no_sendpage(sock, page, offset, size, flags);
+}
+EXPORT_SYMBOL(kernel_sendpage);
+
+int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
+{
+ mm_segment_t oldfs = get_fs();
+ int err;
+
+ set_fs(KERNEL_DS);
+ err = sock->ops->ioctl(sock, cmd, arg);
+ set_fs(oldfs);
+
+ return err;
+}
+EXPORT_SYMBOL(kernel_sock_ioctl);
+
+int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
+{
+ return sock->ops->shutdown(sock, how);
+}
+EXPORT_SYMBOL(kernel_sock_shutdown);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
new file mode 100644
index 000000000..9068e72aa
--- /dev/null
+++ b/net/sunrpc/Kconfig
@@ -0,0 +1,75 @@
+config SUNRPC
+ tristate
+ depends on MULTIUSER
+
+config SUNRPC_GSS
+ tristate
+ select OID_REGISTRY
+ depends on MULTIUSER
+
+config SUNRPC_BACKCHANNEL
+ bool
+ depends on SUNRPC
+
+config SUNRPC_SWAP
+ bool
+ depends on SUNRPC
+
+config RPCSEC_GSS_KRB5
+ tristate "Secure RPC: Kerberos V mechanism"
+ depends on SUNRPC && CRYPTO
+ depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS
+ depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES
+ depends on CRYPTO_ARC4
+ default y
+ select SUNRPC_GSS
+ help
+ Choose Y here to enable Secure RPC using the Kerberos version 5
+ GSS-API mechanism (RFC 1964).
+
+ Secure RPC calls with Kerberos require an auxiliary user-space
+ daemon which may be found in the Linux nfs-utils package
+ available from http://linux-nfs.org/. In addition, user-space
+ Kerberos support should be installed.
+
+ If unsure, say Y.
+
+config SUNRPC_DEBUG
+ bool "RPC: Enable dprintk debugging"
+ depends on SUNRPC && SYSCTL
+ select DEBUG_FS
+ help
+ This option enables a sysctl-based debugging interface
+ that is be used by the 'rpcdebug' utility to turn on or off
+ logging of different aspects of the kernel RPC activity.
+
+ Disabling this option will make your kernel slightly smaller,
+ but makes troubleshooting NFS issues significantly harder.
+
+ If unsure, say Y.
+
+config SUNRPC_XPRT_RDMA_CLIENT
+ tristate "RPC over RDMA Client Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS client to support an RDMA-enabled
+ transport.
+
+ To compile RPC client RDMA transport support as a module,
+ choose M here: the module will be called xprtrdma.
+
+ If unsure, say N.
+
+config SUNRPC_XPRT_RDMA_SERVER
+ tristate "RPC over RDMA Server Support"
+ depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default SUNRPC && INFINIBAND
+ help
+ This option allows the NFS server to support an RDMA-enabled
+ transport.
+
+ To compile RPC server RDMA transport support as a module,
+ choose M here: the module will be called svcrdma.
+
+ If unsure, say N.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
new file mode 100644
index 000000000..15e6f6c23
--- /dev/null
+++ b/net/sunrpc/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for Linux kernel SUN RPC
+#
+
+
+obj-$(CONFIG_SUNRPC) += sunrpc.o
+obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
+
+obj-y += xprtrdma/
+
+sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
+ auth.o auth_null.o auth_unix.o auth_generic.o \
+ svc.o svcsock.o svcauth.o svcauth_unix.o \
+ addr.o rpcb_clnt.o timer.o xdr.o \
+ sunrpc_syms.o cache.o rpc_pipe.o \
+ svc_xprt.o
+sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_PROC_FS) += stats.o
+sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
new file mode 100644
index 000000000..2e0a6f92e
--- /dev/null
+++ b/net/sunrpc/addr.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright 2009, Oracle. All rights reserved.
+ *
+ * Convert socket addresses to presentation addresses and universal
+ * addresses, and vice versa.
+ *
+ * Universal addresses are introduced by RFC 1833 and further refined by
+ * recent RFCs describing NFSv4. The universal address format is part
+ * of the external (network) interface provided by rpcbind version 3
+ * and 4, and by NFSv4. Such an address is a string containing a
+ * presentation format IP address followed by a port number in
+ * "hibyte.lobyte" format.
+ *
+ * IPv6 addresses can also include a scope ID, typically denoted by
+ * a '%' followed by a device name or a non-negative integer. Refer to
+ * RFC 4291, Section 2.2 for details on IPv6 presentation formats.
+ */
+
+#include <net/ipv6.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+ char *buf, const int buflen)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr *addr = &sin6->sin6_addr;
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded ANY address
+ */
+ if (ipv6_addr_any(addr))
+ return snprintf(buf, buflen, "::");
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded loopback address
+ */
+ if (ipv6_addr_loopback(addr))
+ return snprintf(buf, buflen, "::1");
+
+ /*
+ * RFC 4291, Section 2.2.3
+ *
+ * Special presentation address format for mapped v4
+ * addresses.
+ */
+ if (ipv6_addr_v4mapped(addr))
+ return snprintf(buf, buflen, "::ffff:%pI4",
+ &addr->s6_addr32[3]);
+
+ /*
+ * RFC 4291, Section 2.2.1
+ */
+ return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+static size_t rpc_ntop6(const struct sockaddr *sap,
+ char *buf, const size_t buflen)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ char scopebuf[IPV6_SCOPE_ID_LEN];
+ size_t len;
+ int rc;
+
+ len = rpc_ntop6_noscopeid(sap, buf, buflen);
+ if (unlikely(len == 0))
+ return len;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return len;
+ if (sin6->sin6_scope_id == 0)
+ return len;
+
+ rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u",
+ IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id);
+ if (unlikely((size_t)rc > sizeof(scopebuf)))
+ return 0;
+
+ len += rc;
+ if (unlikely(len > buflen))
+ return 0;
+
+ strcat(buf, scopebuf);
+ return len;
+}
+
+#else /* !IS_ENABLED(CONFIG_IPV6) */
+
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+ char *buf, const int buflen)
+{
+ return 0;
+}
+
+static size_t rpc_ntop6(const struct sockaddr *sap,
+ char *buf, const size_t buflen)
+{
+ return 0;
+}
+
+#endif /* !IS_ENABLED(CONFIG_IPV6) */
+
+static int rpc_ntop4(const struct sockaddr *sap,
+ char *buf, const size_t buflen)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+ return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+/**
+ * rpc_ntop - construct a presentation address in @buf
+ * @sap: socket address
+ * @buf: construction area
+ * @buflen: size of @buf, in bytes
+ *
+ * Plants a %NUL-terminated string in @buf and returns the length
+ * of the string, excluding the %NUL. Otherwise zero is returned.
+ */
+size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+ switch (sap->sa_family) {
+ case AF_INET:
+ return rpc_ntop4(sap, buf, buflen);
+ case AF_INET6:
+ return rpc_ntop6(sap, buf, buflen);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_ntop);
+
+static size_t rpc_pton4(const char *buf, const size_t buflen,
+ struct sockaddr *sap, const size_t salen)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+
+ if (buflen > INET_ADDRSTRLEN || salen < sizeof(struct sockaddr_in))
+ return 0;
+
+ memset(sap, 0, sizeof(struct sockaddr_in));
+
+ if (in4_pton(buf, buflen, addr, '\0', NULL) == 0)
+ return 0;
+
+ sin->sin_family = AF_INET;
+ return sizeof(struct sockaddr_in);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int rpc_parse_scope_id(struct net *net, const char *buf,
+ const size_t buflen, const char *delim,
+ struct sockaddr_in6 *sin6)
+{
+ char *p;
+ size_t len;
+
+ if ((buf + buflen) == delim)
+ return 1;
+
+ if (*delim != IPV6_SCOPE_DELIMITER)
+ return 0;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
+
+ len = (buf + buflen) - delim - 1;
+ p = kstrndup(delim + 1, len, GFP_KERNEL);
+ if (p) {
+ u32 scope_id = 0;
+ struct net_device *dev;
+
+ dev = dev_get_by_name(net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ if (kstrtou32(p, 10, &scope_id) == 0) {
+ kfree(p);
+ return 0;
+ }
+ }
+
+ kfree(p);
+
+ sin6->sin6_scope_id = scope_id;
+ return 1;
+ }
+
+ return 0;
+}
+
+static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen,
+ struct sockaddr *sap, const size_t salen)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+ const char *delim;
+
+ if (buflen > (INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN) ||
+ salen < sizeof(struct sockaddr_in6))
+ return 0;
+
+ memset(sap, 0, sizeof(struct sockaddr_in6));
+
+ if (in6_pton(buf, buflen, addr, IPV6_SCOPE_DELIMITER, &delim) == 0)
+ return 0;
+
+ if (!rpc_parse_scope_id(net, buf, buflen, delim, sin6))
+ return 0;
+
+ sin6->sin6_family = AF_INET6;
+ return sizeof(struct sockaddr_in6);
+}
+#else
+static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen,
+ struct sockaddr *sap, const size_t salen)
+{
+ return 0;
+}
+#endif
+
+/**
+ * rpc_pton - Construct a sockaddr in @sap
+ * @net: applicable network namespace
+ * @buf: C string containing presentation format IP address
+ * @buflen: length of presentation address in bytes
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer in bytes
+ *
+ * Returns the size of the socket address if successful; otherwise
+ * zero is returned.
+ *
+ * Plants a socket address in @sap and returns the size of the
+ * socket address, if successful. Returns zero if an error
+ * occurred.
+ */
+size_t rpc_pton(struct net *net, const char *buf, const size_t buflen,
+ struct sockaddr *sap, const size_t salen)
+{
+ unsigned int i;
+
+ for (i = 0; i < buflen; i++)
+ if (buf[i] == ':')
+ return rpc_pton6(net, buf, buflen, sap, salen);
+ return rpc_pton4(buf, buflen, sap, salen);
+}
+EXPORT_SYMBOL_GPL(rpc_pton);
+
+/**
+ * rpc_sockaddr2uaddr - Construct a universal address string from @sap.
+ * @sap: socket address
+ * @gfp_flags: allocation mode
+ *
+ * Returns a %NUL-terminated string in dynamically allocated memory;
+ * otherwise NULL is returned if an error occurred. Caller must
+ * free the returned string.
+ */
+char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
+{
+ char portbuf[RPCBIND_MAXUADDRPLEN];
+ char addrbuf[RPCBIND_MAXUADDRLEN];
+ unsigned short port;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return NULL;
+ port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ break;
+ case AF_INET6:
+ if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return NULL;
+ port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+ break;
+ default:
+ return NULL;
+ }
+
+ if (snprintf(portbuf, sizeof(portbuf),
+ ".%u.%u", port >> 8, port & 0xff) > (int)sizeof(portbuf))
+ return NULL;
+
+ if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) > sizeof(addrbuf))
+ return NULL;
+
+ return kstrdup(addrbuf, gfp_flags);
+}
+
+/**
+ * rpc_uaddr2sockaddr - convert a universal address to a socket address.
+ * @net: applicable network namespace
+ * @uaddr: C string containing universal address to convert
+ * @uaddr_len: length of universal address string
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer
+ *
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
+ * rpc_pton() require proper string termination to be successful.
+ *
+ * Returns the size of the socket address if successful; otherwise
+ * zero is returned.
+ */
+size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
+ const size_t uaddr_len, struct sockaddr *sap,
+ const size_t salen)
+{
+ char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
+ u8 portlo, porthi;
+ unsigned short port;
+
+ if (uaddr_len > RPCBIND_MAXUADDRLEN)
+ return 0;
+
+ memcpy(buf, uaddr, uaddr_len);
+
+ buf[uaddr_len] = '\0';
+ c = strrchr(buf, '.');
+ if (unlikely(c == NULL))
+ return 0;
+ if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
+ return 0;
+
+ *c = '\0';
+ c = strrchr(buf, '.');
+ if (unlikely(c == NULL))
+ return 0;
+ if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
+ return 0;
+
+ port = (unsigned short)((porthi << 8) | portlo);
+
+ *c = '\0';
+ if (rpc_pton(net, buf, strlen(buf), sap, salen) == 0)
+ return 0;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)sap)->sin_port = htons(port);
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ ((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
+ return sizeof(struct sockaddr_in6);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_uaddr2sockaddr);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
new file mode 100644
index 000000000..47f38be41
--- /dev/null
+++ b/net/sunrpc/auth.c
@@ -0,0 +1,891 @@
+/*
+ * linux/net/sunrpc/auth.c
+ *
+ * Generic RPC client authentication API.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/spinlock.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+#define RPC_CREDCACHE_DEFAULT_HASHBITS (4)
+struct rpc_cred_cache {
+ struct hlist_head *hashtable;
+ unsigned int hashbits;
+ spinlock_t lock;
+};
+
+static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
+
+static DEFINE_SPINLOCK(rpc_authflavor_lock);
+static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
+ &authnull_ops, /* AUTH_NULL */
+ &authunix_ops, /* AUTH_UNIX */
+ NULL, /* others can be loadable modules */
+};
+
+static LIST_HEAD(cred_unused);
+static unsigned long number_cred_unused;
+
+#define MAX_HASHTABLE_BITS (14)
+static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
+{
+ unsigned long num;
+ unsigned int nbits;
+ int ret;
+
+ if (!val)
+ goto out_inval;
+ ret = kstrtoul(val, 0, &num);
+ if (ret == -EINVAL)
+ goto out_inval;
+ nbits = fls(num);
+ if (num > (1U << nbits))
+ nbits++;
+ if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
+ goto out_inval;
+ *(unsigned int *)kp->arg = nbits;
+ return 0;
+out_inval:
+ return -EINVAL;
+}
+
+static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
+{
+ unsigned int nbits;
+
+ nbits = *(unsigned int *)kp->arg;
+ return sprintf(buffer, "%u", 1U << nbits);
+}
+
+#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
+
+static struct kernel_param_ops param_ops_hashtbl_sz = {
+ .set = param_set_hashtbl_sz,
+ .get = param_get_hashtbl_sz,
+};
+
+module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
+MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
+static u32
+pseudoflavor_to_flavor(u32 flavor) {
+ if (flavor > RPC_AUTH_MAXFLAVOR)
+ return RPC_AUTH_GSS;
+ return flavor;
+}
+
+int
+rpcauth_register(const struct rpc_authops *ops)
+{
+ rpc_authflavor_t flavor;
+ int ret = -EPERM;
+
+ if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
+ return -EINVAL;
+ spin_lock(&rpc_authflavor_lock);
+ if (auth_flavors[flavor] == NULL) {
+ auth_flavors[flavor] = ops;
+ ret = 0;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpcauth_register);
+
+int
+rpcauth_unregister(const struct rpc_authops *ops)
+{
+ rpc_authflavor_t flavor;
+ int ret = -EPERM;
+
+ if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR)
+ return -EINVAL;
+ spin_lock(&rpc_authflavor_lock);
+ if (auth_flavors[flavor] == ops) {
+ auth_flavors[flavor] = NULL;
+ ret = 0;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpcauth_unregister);
+
+/**
+ * rpcauth_get_pseudoflavor - check if security flavor is supported
+ * @flavor: a security flavor
+ * @info: a GSS mech OID, quality of protection, and service value
+ *
+ * Verifies that an appropriate kernel module is available or already loaded.
+ * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is
+ * not supported locally.
+ */
+rpc_authflavor_t
+rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info)
+{
+ const struct rpc_authops *ops;
+ rpc_authflavor_t pseudoflavor;
+
+ ops = auth_flavors[flavor];
+ if (ops == NULL)
+ request_module("rpc-auth-%u", flavor);
+ spin_lock(&rpc_authflavor_lock);
+ ops = auth_flavors[flavor];
+ if (ops == NULL || !try_module_get(ops->owner)) {
+ spin_unlock(&rpc_authflavor_lock);
+ return RPC_AUTH_MAXFLAVOR;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ pseudoflavor = flavor;
+ if (ops->info2flavor != NULL)
+ pseudoflavor = ops->info2flavor(info);
+
+ module_put(ops->owner);
+ return pseudoflavor;
+}
+EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor);
+
+/**
+ * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor
+ * @pseudoflavor: GSS pseudoflavor to match
+ * @info: rpcsec_gss_info structure to fill in
+ *
+ * Returns zero and fills in "info" if pseudoflavor matches a
+ * supported mechanism.
+ */
+int
+rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info)
+{
+ rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor);
+ const struct rpc_authops *ops;
+ int result;
+
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ return -EINVAL;
+
+ ops = auth_flavors[flavor];
+ if (ops == NULL)
+ request_module("rpc-auth-%u", flavor);
+ spin_lock(&rpc_authflavor_lock);
+ ops = auth_flavors[flavor];
+ if (ops == NULL || !try_module_get(ops->owner)) {
+ spin_unlock(&rpc_authflavor_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ result = -ENOENT;
+ if (ops->flavor2info != NULL)
+ result = ops->flavor2info(pseudoflavor, info);
+
+ module_put(ops->owner);
+ return result;
+}
+EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
+
+/**
+ * rpcauth_list_flavors - discover registered flavors and pseudoflavors
+ * @array: array to fill in
+ * @size: size of "array"
+ *
+ * Returns the number of array items filled in, or a negative errno.
+ *
+ * The returned array is not sorted by any policy. Callers should not
+ * rely on the order of the items in the returned array.
+ */
+int
+rpcauth_list_flavors(rpc_authflavor_t *array, int size)
+{
+ rpc_authflavor_t flavor;
+ int result = 0;
+
+ spin_lock(&rpc_authflavor_lock);
+ for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
+ const struct rpc_authops *ops = auth_flavors[flavor];
+ rpc_authflavor_t pseudos[4];
+ int i, len;
+
+ if (result >= size) {
+ result = -ENOMEM;
+ break;
+ }
+
+ if (ops == NULL)
+ continue;
+ if (ops->list_pseudoflavors == NULL) {
+ array[result++] = ops->au_flavor;
+ continue;
+ }
+ len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos));
+ if (len < 0) {
+ result = len;
+ break;
+ }
+ for (i = 0; i < len; i++) {
+ if (result >= size) {
+ result = -ENOMEM;
+ break;
+ }
+ array[result++] = pseudos[i];
+ }
+ }
+ spin_unlock(&rpc_authflavor_lock);
+
+ dprintk("RPC: %s returns %d\n", __func__, result);
+ return result;
+}
+EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
+
+struct rpc_auth *
+rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ struct rpc_auth *auth;
+ const struct rpc_authops *ops;
+ u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor);
+
+ auth = ERR_PTR(-EINVAL);
+ if (flavor >= RPC_AUTH_MAXFLAVOR)
+ goto out;
+
+ if ((ops = auth_flavors[flavor]) == NULL)
+ request_module("rpc-auth-%u", flavor);
+ spin_lock(&rpc_authflavor_lock);
+ ops = auth_flavors[flavor];
+ if (ops == NULL || !try_module_get(ops->owner)) {
+ spin_unlock(&rpc_authflavor_lock);
+ goto out;
+ }
+ spin_unlock(&rpc_authflavor_lock);
+ auth = ops->create(args, clnt);
+ module_put(ops->owner);
+ if (IS_ERR(auth))
+ return auth;
+ if (clnt->cl_auth)
+ rpcauth_release(clnt->cl_auth);
+ clnt->cl_auth = auth;
+
+out:
+ return auth;
+}
+EXPORT_SYMBOL_GPL(rpcauth_create);
+
+void
+rpcauth_release(struct rpc_auth *auth)
+{
+ if (!atomic_dec_and_test(&auth->au_count))
+ return;
+ auth->au_ops->destroy(auth);
+}
+
+static DEFINE_SPINLOCK(rpc_credcache_lock);
+
+static void
+rpcauth_unhash_cred_locked(struct rpc_cred *cred)
+{
+ hlist_del_rcu(&cred->cr_hash);
+ smp_mb__before_atomic();
+ clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+}
+
+static int
+rpcauth_unhash_cred(struct rpc_cred *cred)
+{
+ spinlock_t *cache_lock;
+ int ret;
+
+ cache_lock = &cred->cr_auth->au_credcache->lock;
+ spin_lock(cache_lock);
+ ret = atomic_read(&cred->cr_count) == 0;
+ if (ret)
+ rpcauth_unhash_cred_locked(cred);
+ spin_unlock(cache_lock);
+ return ret;
+}
+
+/*
+ * Initialize RPC credential cache
+ */
+int
+rpcauth_init_credcache(struct rpc_auth *auth)
+{
+ struct rpc_cred_cache *new;
+ unsigned int hashsize;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ goto out_nocache;
+ new->hashbits = auth_hashbits;
+ hashsize = 1U << new->hashbits;
+ new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL);
+ if (!new->hashtable)
+ goto out_nohashtbl;
+ spin_lock_init(&new->lock);
+ auth->au_credcache = new;
+ return 0;
+out_nohashtbl:
+ kfree(new);
+out_nocache:
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
+
+/*
+ * Setup a credential key lifetime timeout notification
+ */
+int
+rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+ if (!cred->cr_auth->au_ops->key_timeout)
+ return 0;
+ return cred->cr_auth->au_ops->key_timeout(auth, cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
+
+bool
+rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+{
+ if (!cred->cr_ops->crkey_to_expire)
+ return false;
+ return cred->cr_ops->crkey_to_expire(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
+
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+ if (!cred->cr_ops->crstringify_acceptor)
+ return NULL;
+ return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
+/*
+ * Destroy a list of credentials
+ */
+static inline
+void rpcauth_destroy_credlist(struct list_head *head)
+{
+ struct rpc_cred *cred;
+
+ while (!list_empty(head)) {
+ cred = list_entry(head->next, struct rpc_cred, cr_lru);
+ list_del_init(&cred->cr_lru);
+ put_rpccred(cred);
+ }
+}
+
+/*
+ * Clear the RPC credential cache, and delete those credentials
+ * that are not referenced.
+ */
+void
+rpcauth_clear_credcache(struct rpc_cred_cache *cache)
+{
+ LIST_HEAD(free);
+ struct hlist_head *head;
+ struct rpc_cred *cred;
+ unsigned int hashsize = 1U << cache->hashbits;
+ int i;
+
+ spin_lock(&rpc_credcache_lock);
+ spin_lock(&cache->lock);
+ for (i = 0; i < hashsize; i++) {
+ head = &cache->hashtable[i];
+ while (!hlist_empty(head)) {
+ cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
+ get_rpccred(cred);
+ if (!list_empty(&cred->cr_lru)) {
+ list_del(&cred->cr_lru);
+ number_cred_unused--;
+ }
+ list_add_tail(&cred->cr_lru, &free);
+ rpcauth_unhash_cred_locked(cred);
+ }
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&rpc_credcache_lock);
+ rpcauth_destroy_credlist(&free);
+}
+
+/*
+ * Destroy the RPC credential cache
+ */
+void
+rpcauth_destroy_credcache(struct rpc_auth *auth)
+{
+ struct rpc_cred_cache *cache = auth->au_credcache;
+
+ if (cache) {
+ auth->au_credcache = NULL;
+ rpcauth_clear_credcache(cache);
+ kfree(cache->hashtable);
+ kfree(cache);
+ }
+}
+EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
+
+
+#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ)
+
+/*
+ * Remove stale credentials. Avoid sleeping inside the loop.
+ */
+static long
+rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
+{
+ spinlock_t *cache_lock;
+ struct rpc_cred *cred, *next;
+ unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
+ long freed = 0;
+
+ list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
+
+ if (nr_to_scan-- == 0)
+ break;
+ /*
+ * Enforce a 60 second garbage collection moratorium
+ * Note that the cred_unused list must be time-ordered.
+ */
+ if (time_in_range(cred->cr_expire, expired, jiffies) &&
+ test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
+ break;
+
+ list_del_init(&cred->cr_lru);
+ number_cred_unused--;
+ freed++;
+ if (atomic_read(&cred->cr_count) != 0)
+ continue;
+
+ cache_lock = &cred->cr_auth->au_credcache->lock;
+ spin_lock(cache_lock);
+ if (atomic_read(&cred->cr_count) == 0) {
+ get_rpccred(cred);
+ list_add_tail(&cred->cr_lru, free);
+ rpcauth_unhash_cred_locked(cred);
+ }
+ spin_unlock(cache_lock);
+ }
+ return freed;
+}
+
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+ LIST_HEAD(free);
+ unsigned long freed;
+
+ spin_lock(&rpc_credcache_lock);
+ freed = rpcauth_prune_expired(&free, nr_to_scan);
+ spin_unlock(&rpc_credcache_lock);
+ rpcauth_destroy_credlist(&free);
+
+ return freed;
+}
+
+/*
+ * Run memory cache shrinker.
+ */
+static unsigned long
+rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+
+{
+ if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+ return SHRINK_STOP;
+
+ /* nothing left, don't come back */
+ if (list_empty(&cred_unused))
+ return SHRINK_STOP;
+
+ return rpcauth_cache_do_shrink(sc->nr_to_scan);
+}
+
+static unsigned long
+rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+
+{
+ return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+}
+
+static void
+rpcauth_cache_enforce_limit(void)
+{
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (number_cred_unused <= auth_max_cred_cachesize)
+ return;
+ diff = number_cred_unused - auth_max_cred_cachesize;
+ nr_to_scan = 100;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ rpcauth_cache_do_shrink(nr_to_scan);
+}
+
+/*
+ * Look up a process' credentials in the authentication cache
+ */
+struct rpc_cred *
+rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
+ int flags)
+{
+ LIST_HEAD(free);
+ struct rpc_cred_cache *cache = auth->au_credcache;
+ struct rpc_cred *cred = NULL,
+ *entry, *new;
+ unsigned int nr;
+
+ nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
+ if (!entry->cr_ops->crmatch(acred, entry, flags))
+ continue;
+ if (flags & RPCAUTH_LOOKUP_RCU) {
+ if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+ !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+ cred = entry;
+ break;
+ }
+ spin_lock(&cache->lock);
+ if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
+ spin_unlock(&cache->lock);
+ continue;
+ }
+ cred = get_rpccred(entry);
+ spin_unlock(&cache->lock);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (cred != NULL)
+ goto found;
+
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return ERR_PTR(-ECHILD);
+
+ new = auth->au_ops->crcreate(auth, acred, flags);
+ if (IS_ERR(new)) {
+ cred = new;
+ goto out;
+ }
+
+ spin_lock(&cache->lock);
+ hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) {
+ if (!entry->cr_ops->crmatch(acred, entry, flags))
+ continue;
+ cred = get_rpccred(entry);
+ break;
+ }
+ if (cred == NULL) {
+ cred = new;
+ set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+ hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
+ } else
+ list_add_tail(&new->cr_lru, &free);
+ spin_unlock(&cache->lock);
+ rpcauth_cache_enforce_limit();
+found:
+ if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
+ cred->cr_ops->cr_init != NULL &&
+ !(flags & RPCAUTH_LOOKUP_NEW)) {
+ int res = cred->cr_ops->cr_init(auth, cred);
+ if (res < 0) {
+ put_rpccred(cred);
+ cred = ERR_PTR(res);
+ }
+ }
+ rpcauth_destroy_credlist(&free);
+out:
+ return cred;
+}
+EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache);
+
+struct rpc_cred *
+rpcauth_lookupcred(struct rpc_auth *auth, int flags)
+{
+ struct auth_cred acred;
+ struct rpc_cred *ret;
+ const struct cred *cred = current_cred();
+
+ dprintk("RPC: looking up %s cred\n",
+ auth->au_ops->au_name);
+
+ memset(&acred, 0, sizeof(acred));
+ acred.uid = cred->fsuid;
+ acred.gid = cred->fsgid;
+ acred.group_info = cred->group_info;
+ ret = auth->au_ops->lookup_cred(auth, &acred, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
+
+void
+rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
+ struct rpc_auth *auth, const struct rpc_credops *ops)
+{
+ INIT_HLIST_NODE(&cred->cr_hash);
+ INIT_LIST_HEAD(&cred->cr_lru);
+ atomic_set(&cred->cr_count, 1);
+ cred->cr_auth = auth;
+ cred->cr_ops = ops;
+ cred->cr_expire = jiffies;
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ cred->cr_magic = RPCAUTH_CRED_MAGIC;
+#endif
+ cred->cr_uid = acred->uid;
+}
+EXPORT_SYMBOL_GPL(rpcauth_init_cred);
+
+struct rpc_cred *
+rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
+{
+ dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
+ cred->cr_auth->au_ops->au_name, cred);
+ return get_rpccred(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
+
+static struct rpc_cred *
+rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
+{
+ struct rpc_auth *auth = task->tk_client->cl_auth;
+ struct auth_cred acred = {
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ };
+
+ dprintk("RPC: %5u looking up %s cred\n",
+ task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
+ return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
+}
+
+static struct rpc_cred *
+rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
+{
+ struct rpc_auth *auth = task->tk_client->cl_auth;
+
+ dprintk("RPC: %5u looking up %s cred\n",
+ task->tk_pid, auth->au_ops->au_name);
+ return rpcauth_lookupcred(auth, lookupflags);
+}
+
+static int
+rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_cred *new;
+ int lookupflags = 0;
+
+ if (flags & RPC_TASK_ASYNC)
+ lookupflags |= RPCAUTH_LOOKUP_NEW;
+ if (cred != NULL)
+ new = cred->cr_ops->crbind(task, cred, lookupflags);
+ else if (flags & RPC_TASK_ROOTCREDS)
+ new = rpcauth_bind_root_cred(task, lookupflags);
+ else
+ new = rpcauth_bind_new_cred(task, lookupflags);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ req->rq_cred = new;
+ return 0;
+}
+
+void
+put_rpccred(struct rpc_cred *cred)
+{
+ /* Fast path for unhashed credentials */
+ if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
+ if (atomic_dec_and_test(&cred->cr_count))
+ cred->cr_ops->crdestroy(cred);
+ return;
+ }
+
+ if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
+ return;
+ if (!list_empty(&cred->cr_lru)) {
+ number_cred_unused--;
+ list_del_init(&cred->cr_lru);
+ }
+ if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
+ if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
+ cred->cr_expire = jiffies;
+ list_add_tail(&cred->cr_lru, &cred_unused);
+ number_cred_unused++;
+ goto out_nodestroy;
+ }
+ if (!rpcauth_unhash_cred(cred)) {
+ /* We were hashed and someone looked us up... */
+ goto out_nodestroy;
+ }
+ }
+ spin_unlock(&rpc_credcache_lock);
+ cred->cr_ops->crdestroy(cred);
+ return;
+out_nodestroy:
+ spin_unlock(&rpc_credcache_lock);
+}
+EXPORT_SYMBOL_GPL(put_rpccred);
+
+__be32 *
+rpcauth_marshcred(struct rpc_task *task, __be32 *p)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ dprintk("RPC: %5u marshaling %s cred %p\n",
+ task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+ return cred->cr_ops->crmarshal(task, p);
+}
+
+__be32 *
+rpcauth_checkverf(struct rpc_task *task, __be32 *p)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ dprintk("RPC: %5u validating %s cred %p\n",
+ task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+ return cred->cr_ops->crvalidate(task, p);
+}
+
+static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
+ __be32 *data, void *obj)
+{
+ struct xdr_stream xdr;
+
+ xdr_init_encode(&xdr, &rqstp->rq_snd_buf, data);
+ encode(rqstp, &xdr, obj);
+}
+
+int
+rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp,
+ __be32 *data, void *obj)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
+ task->tk_pid, cred->cr_ops->cr_name, cred);
+ if (cred->cr_ops->crwrap_req)
+ return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
+ /* By default, we encode the arguments normally. */
+ rpcauth_wrap_req_encode(encode, rqstp, data, obj);
+ return 0;
+}
+
+static int
+rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
+ __be32 *data, void *obj)
+{
+ struct xdr_stream xdr;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, data);
+ return decode(rqstp, &xdr, obj);
+}
+
+int
+rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
+ __be32 *data, void *obj)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
+ task->tk_pid, cred->cr_ops->cr_name, cred);
+ if (cred->cr_ops->crunwrap_resp)
+ return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
+ data, obj);
+ /* By default, we decode the arguments normally. */
+ return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
+}
+
+int
+rpcauth_refreshcred(struct rpc_task *task)
+{
+ struct rpc_cred *cred;
+ int err;
+
+ cred = task->tk_rqstp->rq_cred;
+ if (cred == NULL) {
+ err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags);
+ if (err < 0)
+ goto out;
+ cred = task->tk_rqstp->rq_cred;
+ }
+ dprintk("RPC: %5u refreshing %s cred %p\n",
+ task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+ err = cred->cr_ops->crrefresh(task);
+out:
+ if (err < 0)
+ task->tk_status = err;
+ return err;
+}
+
+void
+rpcauth_invalcred(struct rpc_task *task)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ dprintk("RPC: %5u invalidating %s cred %p\n",
+ task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+ if (cred)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+}
+
+int
+rpcauth_uptodatecred(struct rpc_task *task)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+ return cred == NULL ||
+ test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
+}
+
+static struct shrinker rpc_cred_shrinker = {
+ .count_objects = rpcauth_cache_shrink_count,
+ .scan_objects = rpcauth_cache_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+int __init rpcauth_init_module(void)
+{
+ int err;
+
+ err = rpc_init_authunix();
+ if (err < 0)
+ goto out1;
+ err = rpc_init_generic_auth();
+ if (err < 0)
+ goto out2;
+ register_shrinker(&rpc_cred_shrinker);
+ return 0;
+out2:
+ rpc_destroy_authunix();
+out1:
+ return err;
+}
+
+void rpcauth_remove_module(void)
+{
+ rpc_destroy_authunix();
+ rpc_destroy_generic_auth();
+ unregister_shrinker(&rpc_cred_shrinker);
+}
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
new file mode 100644
index 000000000..41248b182
--- /dev/null
+++ b/net/sunrpc/auth_generic.c
@@ -0,0 +1,290 @@
+/*
+ * Generic RPC credential
+ *
+ * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/sched.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+#define RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID
+#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID
+
+struct generic_cred {
+ struct rpc_cred gc_base;
+ struct auth_cred acred;
+};
+
+static struct rpc_auth generic_auth;
+static const struct rpc_credops generic_credops;
+
+/*
+ * Public call interface
+ */
+struct rpc_cred *rpc_lookup_cred(void)
+{
+ return rpcauth_lookupcred(&generic_auth, 0);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+ return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
+/*
+ * Public call interface for looking up machine creds.
+ */
+struct rpc_cred *rpc_lookup_machine_cred(const char *service_name)
+{
+ struct auth_cred acred = {
+ .uid = RPC_MACHINE_CRED_USERID,
+ .gid = RPC_MACHINE_CRED_GROUPID,
+ .principal = service_name,
+ .machine_cred = 1,
+ };
+
+ dprintk("RPC: looking up machine cred for service %s\n",
+ service_name);
+ return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
+
+static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
+ struct rpc_cred *cred, int lookupflags)
+{
+ struct rpc_auth *auth = task->tk_client->cl_auth;
+ struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
+
+ return auth->au_ops->lookup_cred(auth, acred, lookupflags);
+}
+
+/*
+ * Lookup generic creds for current process
+ */
+static struct rpc_cred *
+generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+}
+
+static struct rpc_cred *
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ struct generic_cred *gcred;
+
+ gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+ if (gcred == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops);
+ gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+
+ gcred->acred.uid = acred->uid;
+ gcred->acred.gid = acred->gid;
+ gcred->acred.group_info = acred->group_info;
+ gcred->acred.ac_flags = 0;
+ if (gcred->acred.group_info != NULL)
+ get_group_info(gcred->acred.group_info);
+ gcred->acred.machine_cred = acred->machine_cred;
+ gcred->acred.principal = acred->principal;
+
+ dprintk("RPC: allocated %s cred %p for uid %d gid %d\n",
+ gcred->acred.machine_cred ? "machine" : "generic",
+ gcred,
+ from_kuid(&init_user_ns, acred->uid),
+ from_kgid(&init_user_ns, acred->gid));
+ return &gcred->gc_base;
+}
+
+static void
+generic_free_cred(struct rpc_cred *cred)
+{
+ struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+
+ dprintk("RPC: generic_free_cred %p\n", gcred);
+ if (gcred->acred.group_info != NULL)
+ put_group_info(gcred->acred.group_info);
+ kfree(gcred);
+}
+
+static void
+generic_free_cred_callback(struct rcu_head *head)
+{
+ struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu);
+ generic_free_cred(cred);
+}
+
+static void
+generic_destroy_cred(struct rpc_cred *cred)
+{
+ call_rcu(&cred->cr_rcu, generic_free_cred_callback);
+}
+
+static int
+machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags)
+{
+ if (!gcred->acred.machine_cred ||
+ gcred->acred.principal != acred->principal ||
+ !uid_eq(gcred->acred.uid, acred->uid) ||
+ !gid_eq(gcred->acred.gid, acred->gid))
+ return 0;
+ return 1;
+}
+
+/*
+ * Match credentials against current process creds.
+ */
+static int
+generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
+{
+ struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+ int i;
+
+ if (acred->machine_cred)
+ return machine_cred_match(acred, gcred, flags);
+
+ if (!uid_eq(gcred->acred.uid, acred->uid) ||
+ !gid_eq(gcred->acred.gid, acred->gid) ||
+ gcred->acred.machine_cred != 0)
+ goto out_nomatch;
+
+ /* Optimisation in the case where pointers are identical... */
+ if (gcred->acred.group_info == acred->group_info)
+ goto out_match;
+
+ /* Slow path... */
+ if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
+ goto out_nomatch;
+ for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
+ if (!gid_eq(GROUP_AT(gcred->acred.group_info, i),
+ GROUP_AT(acred->group_info, i)))
+ goto out_nomatch;
+ }
+out_match:
+ return 1;
+out_nomatch:
+ return 0;
+}
+
+int __init rpc_init_generic_auth(void)
+{
+ return rpcauth_init_credcache(&generic_auth);
+}
+
+void rpc_destroy_generic_auth(void)
+{
+ rpcauth_destroy_credcache(&generic_auth);
+}
+
+/*
+ * Test the the current time (now) against the underlying credential key expiry
+ * minus a timeout and setup notification.
+ *
+ * The normal case:
+ * If 'now' is before the key expiry minus RPC_KEY_EXPIRE_TIMEO, set
+ * the RPC_CRED_NOTIFY_TIMEOUT flag to setup the underlying credential
+ * rpc_credops crmatch routine to notify this generic cred when it's key
+ * expiration is within RPC_KEY_EXPIRE_TIMEO, and return 0.
+ *
+ * The error case:
+ * If the underlying cred lookup fails, return -EACCES.
+ *
+ * The 'almost' error case:
+ * If 'now' is within key expiry minus RPC_KEY_EXPIRE_TIMEO, but not within
+ * key expiry minus RPC_KEY_EXPIRE_FAIL, set the RPC_CRED_EXPIRE_SOON bit
+ * on the acred ac_flags and return 0.
+ */
+static int
+generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+ struct auth_cred *acred = &container_of(cred, struct generic_cred,
+ gc_base)->acred;
+ struct rpc_cred *tcred;
+ int ret = 0;
+
+
+ /* Fast track for non crkey_timeout (no key) underlying credentials */
+ if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+ return 0;
+
+ /* Fast track for the normal case */
+ if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags))
+ return 0;
+
+ /* lookup_cred either returns a valid referenced rpc_cred, or PTR_ERR */
+ tcred = auth->au_ops->lookup_cred(auth, acred, 0);
+ if (IS_ERR(tcred))
+ return -EACCES;
+
+ if (!tcred->cr_ops->crkey_timeout) {
+ set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
+ ret = 0;
+ goto out_put;
+ }
+
+ /* Test for the almost error case */
+ ret = tcred->cr_ops->crkey_timeout(tcred);
+ if (ret != 0) {
+ set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ ret = 0;
+ } else {
+ /* In case underlying cred key has been reset */
+ if (test_and_clear_bit(RPC_CRED_KEY_EXPIRE_SOON,
+ &acred->ac_flags))
+ dprintk("RPC: UID %d Credential key reset\n",
+ from_kuid(&init_user_ns, tcred->cr_uid));
+ /* set up fasttrack for the normal case */
+ set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
+ }
+
+out_put:
+ put_rpccred(tcred);
+ return ret;
+}
+
+static const struct rpc_authops generic_auth_ops = {
+ .owner = THIS_MODULE,
+ .au_name = "Generic",
+ .lookup_cred = generic_lookup_cred,
+ .crcreate = generic_create_cred,
+ .key_timeout = generic_key_timeout,
+};
+
+static struct rpc_auth generic_auth = {
+ .au_ops = &generic_auth_ops,
+ .au_count = ATOMIC_INIT(0),
+};
+
+static bool generic_key_to_expire(struct rpc_cred *cred)
+{
+ struct auth_cred *acred = &container_of(cred, struct generic_cred,
+ gc_base)->acred;
+ bool ret;
+
+ get_rpccred(cred);
+ ret = test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ put_rpccred(cred);
+
+ return ret;
+}
+
+static const struct rpc_credops generic_credops = {
+ .cr_name = "Generic cred",
+ .crdestroy = generic_destroy_cred,
+ .crbind = generic_bind_cred,
+ .crmatch = generic_match,
+ .crkey_to_expire = generic_key_to_expire,
+};
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
new file mode 100644
index 000000000..14e9e53e6
--- /dev/null
+++ b/net/sunrpc/auth_gss/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Linux kernel rpcsec_gss implementation
+#
+
+obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
+
+auth_rpcgss-y := auth_gss.o gss_generic_token.o \
+ gss_mech_switch.o svcauth_gss.o \
+ gss_rpc_upcall.o gss_rpc_xdr.o
+
+obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
+
+rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
+ gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
new file mode 100644
index 000000000..dace13d76
--- /dev/null
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -0,0 +1,2097 @@
+/*
+ * linux/net/sunrpc/auth_gss/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/gss_api.h>
+#include <asm/uaccess.h>
+#include <linux/hashtable.h>
+
+#include "../netns.h"
+
+static const struct rpc_authops authgss_ops;
+
+static const struct rpc_credops gss_credops;
+static const struct rpc_credops gss_nullops;
+
+#define GSS_RETRY_EXPIRED 5
+static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
+
+#define GSS_KEY_EXPIRE_TIMEO 240
+static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO;
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+#define GSS_CRED_SLACK (RPC_MAX_AUTH_SIZE * 2)
+/* length of a krb5 verifier (48), plus data added before arguments when
+ * using integrity (two 4-byte integers): */
+#define GSS_VERF_SLACK 100
+
+static DEFINE_HASHTABLE(gss_auth_hash_table, 4);
+static DEFINE_SPINLOCK(gss_auth_hash_lock);
+
+struct gss_pipe {
+ struct rpc_pipe_dir_object pdo;
+ struct rpc_pipe *pipe;
+ struct rpc_clnt *clnt;
+ const char *name;
+ struct kref kref;
+};
+
+struct gss_auth {
+ struct kref kref;
+ struct hlist_node hash;
+ struct rpc_auth rpc_auth;
+ struct gss_api_mech *mech;
+ enum rpc_gss_svc service;
+ struct rpc_clnt *client;
+ struct net *net;
+ /*
+ * There are two upcall pipes; dentry[1], named "gssd", is used
+ * for the new text-based upcall; dentry[0] is named after the
+ * mechanism (for example, "krb5") and exists for
+ * backwards-compatibility with older gssd's.
+ */
+ struct gss_pipe *gss_pipe[2];
+ const char *target_name;
+};
+
+/* pipe_version >= 0 if and only if someone has a pipe open. */
+static DEFINE_SPINLOCK(pipe_version_lock);
+static struct rpc_wait_queue pipe_version_rpc_waitqueue;
+static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue);
+static void gss_put_auth(struct gss_auth *gss_auth);
+
+static void gss_free_ctx(struct gss_cl_ctx *);
+static const struct rpc_pipe_ops gss_upcall_ops_v0;
+static const struct rpc_pipe_ops gss_upcall_ops_v1;
+
+static inline struct gss_cl_ctx *
+gss_get_ctx(struct gss_cl_ctx *ctx)
+{
+ atomic_inc(&ctx->count);
+ return ctx;
+}
+
+static inline void
+gss_put_ctx(struct gss_cl_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->count))
+ gss_free_ctx(ctx);
+}
+
+/* gss_cred_set_ctx:
+ * called by gss_upcall_callback and gss_create_upcall in order
+ * to set the gss context. The actual exchange of an old context
+ * and a new one is protected by the pipe->lock.
+ */
+static void
+gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
+{
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+
+ if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+ return;
+ gss_get_ctx(ctx);
+ rcu_assign_pointer(gss_cred->gc_ctx, ctx);
+ set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ smp_mb__before_atomic();
+ clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
+}
+
+static const void *
+simple_get_bytes(const void *p, const void *end, void *res, size_t len)
+{
+ const void *q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ memcpy(res, p, len);
+ return q;
+}
+
+static inline const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+{
+ const void *q;
+ unsigned int len;
+
+ p = simple_get_bytes(p, end, &len, sizeof(len));
+ if (IS_ERR(p))
+ return p;
+ q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ dest->data = kmemdup(p, len, GFP_NOFS);
+ if (unlikely(dest->data == NULL))
+ return ERR_PTR(-ENOMEM);
+ dest->len = len;
+ return q;
+}
+
+static struct gss_cl_ctx *
+gss_cred_get_ctx(struct rpc_cred *cred)
+{
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx = NULL;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ gss_get_ctx(ctx);
+ rcu_read_unlock();
+ return ctx;
+}
+
+static struct gss_cl_ctx *
+gss_alloc_context(void)
+{
+ struct gss_cl_ctx *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+ if (ctx != NULL) {
+ ctx->gc_proc = RPC_GSS_PROC_DATA;
+ ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */
+ spin_lock_init(&ctx->gc_seq_lock);
+ atomic_set(&ctx->count,1);
+ }
+ return ctx;
+}
+
+#define GSSD_MIN_TIMEOUT (60 * 60)
+static const void *
+gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm)
+{
+ const void *q;
+ unsigned int seclen;
+ unsigned int timeout;
+ unsigned long now = jiffies;
+ u32 window_size;
+ int ret;
+
+ /* First unsigned int gives the remaining lifetime in seconds of the
+ * credential - e.g. the remaining TGT lifetime for Kerberos or
+ * the -t value passed to GSSD.
+ */
+ p = simple_get_bytes(p, end, &timeout, sizeof(timeout));
+ if (IS_ERR(p))
+ goto err;
+ if (timeout == 0)
+ timeout = GSSD_MIN_TIMEOUT;
+ ctx->gc_expiry = now + ((unsigned long)timeout * HZ);
+ /* Sequence number window. Determines the maximum number of
+ * simultaneous requests
+ */
+ p = simple_get_bytes(p, end, &window_size, sizeof(window_size));
+ if (IS_ERR(p))
+ goto err;
+ ctx->gc_win = window_size;
+ /* gssd signals an error by passing ctx->gc_win = 0: */
+ if (ctx->gc_win == 0) {
+ /*
+ * in which case, p points to an error code. Anything other
+ * than -EKEYEXPIRED gets converted to -EACCES.
+ */
+ p = simple_get_bytes(p, end, &ret, sizeof(ret));
+ if (!IS_ERR(p))
+ p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) :
+ ERR_PTR(-EACCES);
+ goto err;
+ }
+ /* copy the opaque wire context */
+ p = simple_get_netobj(p, end, &ctx->gc_wire_ctx);
+ if (IS_ERR(p))
+ goto err;
+ /* import the opaque security context */
+ p = simple_get_bytes(p, end, &seclen, sizeof(seclen));
+ if (IS_ERR(p))
+ goto err;
+ q = (const void *)((const char *)p + seclen);
+ if (unlikely(q > end || q < p)) {
+ p = ERR_PTR(-EFAULT);
+ goto err;
+ }
+ ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
+ if (ret < 0) {
+ p = ERR_PTR(ret);
+ goto err;
+ }
+
+ /* is there any trailing data? */
+ if (q == end) {
+ p = q;
+ goto done;
+ }
+
+ /* pull in acceptor name (if there is one) */
+ p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+ if (IS_ERR(p))
+ goto err;
+done:
+ dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+ __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+ ctx->gc_acceptor.data);
+ return p;
+err:
+ dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
+ return p;
+}
+
+#define UPCALL_BUF_LEN 128
+
+struct gss_upcall_msg {
+ atomic_t count;
+ kuid_t uid;
+ struct rpc_pipe_msg msg;
+ struct list_head list;
+ struct gss_auth *auth;
+ struct rpc_pipe *pipe;
+ struct rpc_wait_queue rpc_waitqueue;
+ wait_queue_head_t waitqueue;
+ struct gss_cl_ctx *ctx;
+ char databuf[UPCALL_BUF_LEN];
+};
+
+static int get_pipe_version(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ int ret;
+
+ spin_lock(&pipe_version_lock);
+ if (sn->pipe_version >= 0) {
+ atomic_inc(&sn->pipe_users);
+ ret = sn->pipe_version;
+ } else
+ ret = -EAGAIN;
+ spin_unlock(&pipe_version_lock);
+ return ret;
+}
+
+static void put_pipe_version(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) {
+ sn->pipe_version = -1;
+ spin_unlock(&pipe_version_lock);
+ }
+}
+
+static void
+gss_release_msg(struct gss_upcall_msg *gss_msg)
+{
+ struct net *net = gss_msg->auth->net;
+ if (!atomic_dec_and_test(&gss_msg->count))
+ return;
+ put_pipe_version(net);
+ BUG_ON(!list_empty(&gss_msg->list));
+ if (gss_msg->ctx != NULL)
+ gss_put_ctx(gss_msg->ctx);
+ rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue);
+ gss_put_auth(gss_msg->auth);
+ kfree(gss_msg);
+}
+
+static struct gss_upcall_msg *
+__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid)
+{
+ struct gss_upcall_msg *pos;
+ list_for_each_entry(pos, &pipe->in_downcall, list) {
+ if (!uid_eq(pos->uid, uid))
+ continue;
+ atomic_inc(&pos->count);
+ dprintk("RPC: %s found msg %p\n", __func__, pos);
+ return pos;
+ }
+ dprintk("RPC: %s found nothing\n", __func__);
+ return NULL;
+}
+
+/* Try to add an upcall to the pipefs queue.
+ * If an upcall owned by our uid already exists, then we return a reference
+ * to that upcall instead of adding the new upcall.
+ */
+static inline struct gss_upcall_msg *
+gss_add_msg(struct gss_upcall_msg *gss_msg)
+{
+ struct rpc_pipe *pipe = gss_msg->pipe;
+ struct gss_upcall_msg *old;
+
+ spin_lock(&pipe->lock);
+ old = __gss_find_upcall(pipe, gss_msg->uid);
+ if (old == NULL) {
+ atomic_inc(&gss_msg->count);
+ list_add(&gss_msg->list, &pipe->in_downcall);
+ } else
+ gss_msg = old;
+ spin_unlock(&pipe->lock);
+ return gss_msg;
+}
+
+static void
+__gss_unhash_msg(struct gss_upcall_msg *gss_msg)
+{
+ list_del_init(&gss_msg->list);
+ rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
+ wake_up_all(&gss_msg->waitqueue);
+ atomic_dec(&gss_msg->count);
+}
+
+static void
+gss_unhash_msg(struct gss_upcall_msg *gss_msg)
+{
+ struct rpc_pipe *pipe = gss_msg->pipe;
+
+ if (list_empty(&gss_msg->list))
+ return;
+ spin_lock(&pipe->lock);
+ if (!list_empty(&gss_msg->list))
+ __gss_unhash_msg(gss_msg);
+ spin_unlock(&pipe->lock);
+}
+
+static void
+gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg)
+{
+ switch (gss_msg->msg.errno) {
+ case 0:
+ if (gss_msg->ctx == NULL)
+ break;
+ clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags);
+ gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx);
+ break;
+ case -EKEYEXPIRED:
+ set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags);
+ }
+ gss_cred->gc_upcall_timestamp = jiffies;
+ gss_cred->gc_upcall = NULL;
+ rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
+}
+
+static void
+gss_upcall_callback(struct rpc_task *task)
+{
+ struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
+ struct gss_cred, gc_base);
+ struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
+ struct rpc_pipe *pipe = gss_msg->pipe;
+
+ spin_lock(&pipe->lock);
+ gss_handle_downcall_result(gss_cred, gss_msg);
+ spin_unlock(&pipe->lock);
+ task->tk_status = gss_msg->msg.errno;
+ gss_release_msg(gss_msg);
+}
+
+static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
+{
+ uid_t uid = from_kuid(&init_user_ns, gss_msg->uid);
+ memcpy(gss_msg->databuf, &uid, sizeof(uid));
+ gss_msg->msg.data = gss_msg->databuf;
+ gss_msg->msg.len = sizeof(uid);
+
+ BUILD_BUG_ON(sizeof(uid) > sizeof(gss_msg->databuf));
+}
+
+static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
+ const char *service_name,
+ const char *target_name)
+{
+ struct gss_api_mech *mech = gss_msg->auth->mech;
+ char *p = gss_msg->databuf;
+ size_t buflen = sizeof(gss_msg->databuf);
+ int len;
+
+ len = scnprintf(p, buflen, "mech=%s uid=%d ", mech->gm_name,
+ from_kuid(&init_user_ns, gss_msg->uid));
+ buflen -= len;
+ p += len;
+ gss_msg->msg.len = len;
+ if (target_name) {
+ len = scnprintf(p, buflen, "target=%s ", target_name);
+ buflen -= len;
+ p += len;
+ gss_msg->msg.len += len;
+ }
+ if (service_name != NULL) {
+ len = scnprintf(p, buflen, "service=%s ", service_name);
+ buflen -= len;
+ p += len;
+ gss_msg->msg.len += len;
+ }
+ if (mech->gm_upcall_enctypes) {
+ len = scnprintf(p, buflen, "enctypes=%s ",
+ mech->gm_upcall_enctypes);
+ buflen -= len;
+ p += len;
+ gss_msg->msg.len += len;
+ }
+ len = scnprintf(p, buflen, "\n");
+ if (len == 0)
+ goto out_overflow;
+ gss_msg->msg.len += len;
+
+ gss_msg->msg.data = gss_msg->databuf;
+ return 0;
+out_overflow:
+ WARN_ON_ONCE(1);
+ return -ENOMEM;
+}
+
+static struct gss_upcall_msg *
+gss_alloc_msg(struct gss_auth *gss_auth,
+ kuid_t uid, const char *service_name)
+{
+ struct gss_upcall_msg *gss_msg;
+ int vers;
+ int err = -ENOMEM;
+
+ gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
+ if (gss_msg == NULL)
+ goto err;
+ vers = get_pipe_version(gss_auth->net);
+ err = vers;
+ if (err < 0)
+ goto err_free_msg;
+ gss_msg->pipe = gss_auth->gss_pipe[vers]->pipe;
+ INIT_LIST_HEAD(&gss_msg->list);
+ rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
+ init_waitqueue_head(&gss_msg->waitqueue);
+ atomic_set(&gss_msg->count, 1);
+ gss_msg->uid = uid;
+ gss_msg->auth = gss_auth;
+ switch (vers) {
+ case 0:
+ gss_encode_v0_msg(gss_msg);
+ break;
+ default:
+ err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
+ if (err)
+ goto err_put_pipe_version;
+ };
+ kref_get(&gss_auth->kref);
+ return gss_msg;
+err_put_pipe_version:
+ put_pipe_version(gss_auth->net);
+err_free_msg:
+ kfree(gss_msg);
+err:
+ return ERR_PTR(err);
+}
+
+static struct gss_upcall_msg *
+gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred)
+{
+ struct gss_cred *gss_cred = container_of(cred,
+ struct gss_cred, gc_base);
+ struct gss_upcall_msg *gss_new, *gss_msg;
+ kuid_t uid = cred->cr_uid;
+
+ gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal);
+ if (IS_ERR(gss_new))
+ return gss_new;
+ gss_msg = gss_add_msg(gss_new);
+ if (gss_msg == gss_new) {
+ int res = rpc_queue_upcall(gss_new->pipe, &gss_new->msg);
+ if (res) {
+ gss_unhash_msg(gss_new);
+ gss_msg = ERR_PTR(res);
+ }
+ } else
+ gss_release_msg(gss_new);
+ return gss_msg;
+}
+
+static void warn_gssd(void)
+{
+ dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n");
+}
+
+static inline int
+gss_refresh_upcall(struct rpc_task *task)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ struct gss_auth *gss_auth = container_of(cred->cr_auth,
+ struct gss_auth, rpc_auth);
+ struct gss_cred *gss_cred = container_of(cred,
+ struct gss_cred, gc_base);
+ struct gss_upcall_msg *gss_msg;
+ struct rpc_pipe *pipe;
+ int err = 0;
+
+ dprintk("RPC: %5u %s for uid %u\n",
+ task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid));
+ gss_msg = gss_setup_upcall(gss_auth, cred);
+ if (PTR_ERR(gss_msg) == -EAGAIN) {
+ /* XXX: warning on the first, under the assumption we
+ * shouldn't normally hit this case on a refresh. */
+ warn_gssd();
+ task->tk_timeout = 15*HZ;
+ rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL);
+ return -EAGAIN;
+ }
+ if (IS_ERR(gss_msg)) {
+ err = PTR_ERR(gss_msg);
+ goto out;
+ }
+ pipe = gss_msg->pipe;
+ spin_lock(&pipe->lock);
+ if (gss_cred->gc_upcall != NULL)
+ rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
+ else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
+ task->tk_timeout = 0;
+ gss_cred->gc_upcall = gss_msg;
+ /* gss_upcall_callback will release the reference to gss_upcall_msg */
+ atomic_inc(&gss_msg->count);
+ rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback);
+ } else {
+ gss_handle_downcall_result(gss_cred, gss_msg);
+ err = gss_msg->msg.errno;
+ }
+ spin_unlock(&pipe->lock);
+ gss_release_msg(gss_msg);
+out:
+ dprintk("RPC: %5u %s for uid %u result %d\n",
+ task->tk_pid, __func__,
+ from_kuid(&init_user_ns, cred->cr_uid), err);
+ return err;
+}
+
+static inline int
+gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
+{
+ struct net *net = gss_auth->net;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe *pipe;
+ struct rpc_cred *cred = &gss_cred->gc_base;
+ struct gss_upcall_msg *gss_msg;
+ DEFINE_WAIT(wait);
+ int err;
+
+ dprintk("RPC: %s for uid %u\n",
+ __func__, from_kuid(&init_user_ns, cred->cr_uid));
+retry:
+ err = 0;
+ /* if gssd is down, just skip upcalling altogether */
+ if (!gssd_running(net)) {
+ warn_gssd();
+ return -EACCES;
+ }
+ gss_msg = gss_setup_upcall(gss_auth, cred);
+ if (PTR_ERR(gss_msg) == -EAGAIN) {
+ err = wait_event_interruptible_timeout(pipe_version_waitqueue,
+ sn->pipe_version >= 0, 15 * HZ);
+ if (sn->pipe_version < 0) {
+ warn_gssd();
+ err = -EACCES;
+ }
+ if (err < 0)
+ goto out;
+ goto retry;
+ }
+ if (IS_ERR(gss_msg)) {
+ err = PTR_ERR(gss_msg);
+ goto out;
+ }
+ pipe = gss_msg->pipe;
+ for (;;) {
+ prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE);
+ spin_lock(&pipe->lock);
+ if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
+ break;
+ }
+ spin_unlock(&pipe->lock);
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ goto out_intr;
+ }
+ schedule();
+ }
+ if (gss_msg->ctx)
+ gss_cred_set_ctx(cred, gss_msg->ctx);
+ else
+ err = gss_msg->msg.errno;
+ spin_unlock(&pipe->lock);
+out_intr:
+ finish_wait(&gss_msg->waitqueue, &wait);
+ gss_release_msg(gss_msg);
+out:
+ dprintk("RPC: %s for uid %u result %d\n",
+ __func__, from_kuid(&init_user_ns, cred->cr_uid), err);
+ return err;
+}
+
+#define MSG_BUF_MAXSIZE 1024
+
+static ssize_t
+gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+ const void *p, *end;
+ void *buf;
+ struct gss_upcall_msg *gss_msg;
+ struct rpc_pipe *pipe = RPC_I(file_inode(filp))->pipe;
+ struct gss_cl_ctx *ctx;
+ uid_t id;
+ kuid_t uid;
+ ssize_t err = -EFBIG;
+
+ if (mlen > MSG_BUF_MAXSIZE)
+ goto out;
+ err = -ENOMEM;
+ buf = kmalloc(mlen, GFP_NOFS);
+ if (!buf)
+ goto out;
+
+ err = -EFAULT;
+ if (copy_from_user(buf, src, mlen))
+ goto err;
+
+ end = (const void *)((char *)buf + mlen);
+ p = simple_get_bytes(buf, end, &id, sizeof(id));
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ goto err;
+ }
+
+ uid = make_kuid(&init_user_ns, id);
+ if (!uid_valid(uid)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = -ENOMEM;
+ ctx = gss_alloc_context();
+ if (ctx == NULL)
+ goto err;
+
+ err = -ENOENT;
+ /* Find a matching upcall */
+ spin_lock(&pipe->lock);
+ gss_msg = __gss_find_upcall(pipe, uid);
+ if (gss_msg == NULL) {
+ spin_unlock(&pipe->lock);
+ goto err_put_ctx;
+ }
+ list_del_init(&gss_msg->list);
+ spin_unlock(&pipe->lock);
+
+ p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ switch (err) {
+ case -EACCES:
+ case -EKEYEXPIRED:
+ gss_msg->msg.errno = err;
+ err = mlen;
+ break;
+ case -EFAULT:
+ case -ENOMEM:
+ case -EINVAL:
+ case -ENOSYS:
+ gss_msg->msg.errno = -EAGAIN;
+ break;
+ default:
+ printk(KERN_CRIT "%s: bad return from "
+ "gss_fill_context: %zd\n", __func__, err);
+ BUG();
+ }
+ goto err_release_msg;
+ }
+ gss_msg->ctx = gss_get_ctx(ctx);
+ err = mlen;
+
+err_release_msg:
+ spin_lock(&pipe->lock);
+ __gss_unhash_msg(gss_msg);
+ spin_unlock(&pipe->lock);
+ gss_release_msg(gss_msg);
+err_put_ctx:
+ gss_put_ctx(ctx);
+err:
+ kfree(buf);
+out:
+ dprintk("RPC: %s returning %Zd\n", __func__, err);
+ return err;
+}
+
+static int gss_pipe_open(struct inode *inode, int new_version)
+{
+ struct net *net = inode->i_sb->s_fs_info;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ int ret = 0;
+
+ spin_lock(&pipe_version_lock);
+ if (sn->pipe_version < 0) {
+ /* First open of any gss pipe determines the version: */
+ sn->pipe_version = new_version;
+ rpc_wake_up(&pipe_version_rpc_waitqueue);
+ wake_up(&pipe_version_waitqueue);
+ } else if (sn->pipe_version != new_version) {
+ /* Trying to open a pipe of a different version */
+ ret = -EBUSY;
+ goto out;
+ }
+ atomic_inc(&sn->pipe_users);
+out:
+ spin_unlock(&pipe_version_lock);
+ return ret;
+
+}
+
+static int gss_pipe_open_v0(struct inode *inode)
+{
+ return gss_pipe_open(inode, 0);
+}
+
+static int gss_pipe_open_v1(struct inode *inode)
+{
+ return gss_pipe_open(inode, 1);
+}
+
+static void
+gss_pipe_release(struct inode *inode)
+{
+ struct net *net = inode->i_sb->s_fs_info;
+ struct rpc_pipe *pipe = RPC_I(inode)->pipe;
+ struct gss_upcall_msg *gss_msg;
+
+restart:
+ spin_lock(&pipe->lock);
+ list_for_each_entry(gss_msg, &pipe->in_downcall, list) {
+
+ if (!list_empty(&gss_msg->msg.list))
+ continue;
+ gss_msg->msg.errno = -EPIPE;
+ atomic_inc(&gss_msg->count);
+ __gss_unhash_msg(gss_msg);
+ spin_unlock(&pipe->lock);
+ gss_release_msg(gss_msg);
+ goto restart;
+ }
+ spin_unlock(&pipe->lock);
+
+ put_pipe_version(net);
+}
+
+static void
+gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg);
+
+ if (msg->errno < 0) {
+ dprintk("RPC: %s releasing msg %p\n",
+ __func__, gss_msg);
+ atomic_inc(&gss_msg->count);
+ gss_unhash_msg(gss_msg);
+ if (msg->errno == -ETIMEDOUT)
+ warn_gssd();
+ gss_release_msg(gss_msg);
+ }
+}
+
+static void gss_pipe_dentry_destroy(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
+{
+ struct gss_pipe *gss_pipe = pdo->pdo_data;
+ struct rpc_pipe *pipe = gss_pipe->pipe;
+
+ if (pipe->dentry != NULL) {
+ rpc_unlink(pipe->dentry);
+ pipe->dentry = NULL;
+ }
+}
+
+static int gss_pipe_dentry_create(struct dentry *dir,
+ struct rpc_pipe_dir_object *pdo)
+{
+ struct gss_pipe *p = pdo->pdo_data;
+ struct dentry *dentry;
+
+ dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ p->pipe->dentry = dentry;
+ return 0;
+}
+
+static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = {
+ .create = gss_pipe_dentry_create,
+ .destroy = gss_pipe_dentry_destroy,
+};
+
+static struct gss_pipe *gss_pipe_alloc(struct rpc_clnt *clnt,
+ const char *name,
+ const struct rpc_pipe_ops *upcall_ops)
+{
+ struct gss_pipe *p;
+ int err = -ENOMEM;
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (p == NULL)
+ goto err;
+ p->pipe = rpc_mkpipe_data(upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+ if (IS_ERR(p->pipe)) {
+ err = PTR_ERR(p->pipe);
+ goto err_free_gss_pipe;
+ }
+ p->name = name;
+ p->clnt = clnt;
+ kref_init(&p->kref);
+ rpc_init_pipe_dir_object(&p->pdo,
+ &gss_pipe_dir_object_ops,
+ p);
+ return p;
+err_free_gss_pipe:
+ kfree(p);
+err:
+ return ERR_PTR(err);
+}
+
+struct gss_alloc_pdo {
+ struct rpc_clnt *clnt;
+ const char *name;
+ const struct rpc_pipe_ops *upcall_ops;
+};
+
+static int gss_pipe_match_pdo(struct rpc_pipe_dir_object *pdo, void *data)
+{
+ struct gss_pipe *gss_pipe;
+ struct gss_alloc_pdo *args = data;
+
+ if (pdo->pdo_ops != &gss_pipe_dir_object_ops)
+ return 0;
+ gss_pipe = container_of(pdo, struct gss_pipe, pdo);
+ if (strcmp(gss_pipe->name, args->name) != 0)
+ return 0;
+ if (!kref_get_unless_zero(&gss_pipe->kref))
+ return 0;
+ return 1;
+}
+
+static struct rpc_pipe_dir_object *gss_pipe_alloc_pdo(void *data)
+{
+ struct gss_pipe *gss_pipe;
+ struct gss_alloc_pdo *args = data;
+
+ gss_pipe = gss_pipe_alloc(args->clnt, args->name, args->upcall_ops);
+ if (!IS_ERR(gss_pipe))
+ return &gss_pipe->pdo;
+ return NULL;
+}
+
+static struct gss_pipe *gss_pipe_get(struct rpc_clnt *clnt,
+ const char *name,
+ const struct rpc_pipe_ops *upcall_ops)
+{
+ struct net *net = rpc_net_ns(clnt);
+ struct rpc_pipe_dir_object *pdo;
+ struct gss_alloc_pdo args = {
+ .clnt = clnt,
+ .name = name,
+ .upcall_ops = upcall_ops,
+ };
+
+ pdo = rpc_find_or_alloc_pipe_dir_object(net,
+ &clnt->cl_pipedir_objects,
+ gss_pipe_match_pdo,
+ gss_pipe_alloc_pdo,
+ &args);
+ if (pdo != NULL)
+ return container_of(pdo, struct gss_pipe, pdo);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void __gss_pipe_free(struct gss_pipe *p)
+{
+ struct rpc_clnt *clnt = p->clnt;
+ struct net *net = rpc_net_ns(clnt);
+
+ rpc_remove_pipe_dir_object(net,
+ &clnt->cl_pipedir_objects,
+ &p->pdo);
+ rpc_destroy_pipe_data(p->pipe);
+ kfree(p);
+}
+
+static void __gss_pipe_release(struct kref *kref)
+{
+ struct gss_pipe *p = container_of(kref, struct gss_pipe, kref);
+
+ __gss_pipe_free(p);
+}
+
+static void gss_pipe_free(struct gss_pipe *p)
+{
+ if (p != NULL)
+ kref_put(&p->kref, __gss_pipe_release);
+}
+
+/*
+ * NOTE: we have the opportunity to use different
+ * parameters based on the input flavor (which must be a pseudoflavor)
+ */
+static struct gss_auth *
+gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ rpc_authflavor_t flavor = args->pseudoflavor;
+ struct gss_auth *gss_auth;
+ struct gss_pipe *gss_pipe;
+ struct rpc_auth * auth;
+ int err = -ENOMEM; /* XXX? */
+
+ dprintk("RPC: creating GSS authenticator for client %p\n", clnt);
+
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(err);
+ if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
+ goto out_dec;
+ INIT_HLIST_NODE(&gss_auth->hash);
+ gss_auth->target_name = NULL;
+ if (args->target_name) {
+ gss_auth->target_name = kstrdup(args->target_name, GFP_KERNEL);
+ if (gss_auth->target_name == NULL)
+ goto err_free;
+ }
+ gss_auth->client = clnt;
+ gss_auth->net = get_net(rpc_net_ns(clnt));
+ err = -EINVAL;
+ gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
+ if (!gss_auth->mech) {
+ dprintk("RPC: Pseudoflavor %d not found!\n", flavor);
+ goto err_put_net;
+ }
+ gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
+ if (gss_auth->service == 0)
+ goto err_put_mech;
+ if (!gssd_running(gss_auth->net))
+ goto err_put_mech;
+ auth = &gss_auth->rpc_auth;
+ auth->au_cslack = GSS_CRED_SLACK >> 2;
+ auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_ops = &authgss_ops;
+ auth->au_flavor = flavor;
+ atomic_set(&auth->au_count, 1);
+ kref_init(&gss_auth->kref);
+
+ err = rpcauth_init_credcache(auth);
+ if (err)
+ goto err_put_mech;
+ /*
+ * Note: if we created the old pipe first, then someone who
+ * examined the directory at the right moment might conclude
+ * that we supported only the old pipe. So we instead create
+ * the new pipe first.
+ */
+ gss_pipe = gss_pipe_get(clnt, "gssd", &gss_upcall_ops_v1);
+ if (IS_ERR(gss_pipe)) {
+ err = PTR_ERR(gss_pipe);
+ goto err_destroy_credcache;
+ }
+ gss_auth->gss_pipe[1] = gss_pipe;
+
+ gss_pipe = gss_pipe_get(clnt, gss_auth->mech->gm_name,
+ &gss_upcall_ops_v0);
+ if (IS_ERR(gss_pipe)) {
+ err = PTR_ERR(gss_pipe);
+ goto err_destroy_pipe_1;
+ }
+ gss_auth->gss_pipe[0] = gss_pipe;
+
+ return gss_auth;
+err_destroy_pipe_1:
+ gss_pipe_free(gss_auth->gss_pipe[1]);
+err_destroy_credcache:
+ rpcauth_destroy_credcache(auth);
+err_put_mech:
+ gss_mech_put(gss_auth->mech);
+err_put_net:
+ put_net(gss_auth->net);
+err_free:
+ kfree(gss_auth->target_name);
+ kfree(gss_auth);
+out_dec:
+ module_put(THIS_MODULE);
+ return ERR_PTR(err);
+}
+
+static void
+gss_free(struct gss_auth *gss_auth)
+{
+ gss_pipe_free(gss_auth->gss_pipe[0]);
+ gss_pipe_free(gss_auth->gss_pipe[1]);
+ gss_mech_put(gss_auth->mech);
+ put_net(gss_auth->net);
+ kfree(gss_auth->target_name);
+
+ kfree(gss_auth);
+ module_put(THIS_MODULE);
+}
+
+static void
+gss_free_callback(struct kref *kref)
+{
+ struct gss_auth *gss_auth = container_of(kref, struct gss_auth, kref);
+
+ gss_free(gss_auth);
+}
+
+static void
+gss_put_auth(struct gss_auth *gss_auth)
+{
+ kref_put(&gss_auth->kref, gss_free_callback);
+}
+
+static void
+gss_destroy(struct rpc_auth *auth)
+{
+ struct gss_auth *gss_auth = container_of(auth,
+ struct gss_auth, rpc_auth);
+
+ dprintk("RPC: destroying GSS authenticator %p flavor %d\n",
+ auth, auth->au_flavor);
+
+ if (hash_hashed(&gss_auth->hash)) {
+ spin_lock(&gss_auth_hash_lock);
+ hash_del(&gss_auth->hash);
+ spin_unlock(&gss_auth_hash_lock);
+ }
+
+ gss_pipe_free(gss_auth->gss_pipe[0]);
+ gss_auth->gss_pipe[0] = NULL;
+ gss_pipe_free(gss_auth->gss_pipe[1]);
+ gss_auth->gss_pipe[1] = NULL;
+ rpcauth_destroy_credcache(auth);
+
+ gss_put_auth(gss_auth);
+}
+
+/*
+ * Auths may be shared between rpc clients that were cloned from a
+ * common client with the same xprt, if they also share the flavor and
+ * target_name.
+ *
+ * The auth is looked up from the oldest parent sharing the same
+ * cl_xprt, and the auth itself references only that common parent
+ * (which is guaranteed to last as long as any of its descendants).
+ */
+static struct gss_auth *
+gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
+ struct rpc_clnt *clnt,
+ struct gss_auth *new)
+{
+ struct gss_auth *gss_auth;
+ unsigned long hashval = (unsigned long)clnt;
+
+ spin_lock(&gss_auth_hash_lock);
+ hash_for_each_possible(gss_auth_hash_table,
+ gss_auth,
+ hash,
+ hashval) {
+ if (gss_auth->client != clnt)
+ continue;
+ if (gss_auth->rpc_auth.au_flavor != args->pseudoflavor)
+ continue;
+ if (gss_auth->target_name != args->target_name) {
+ if (gss_auth->target_name == NULL)
+ continue;
+ if (args->target_name == NULL)
+ continue;
+ if (strcmp(gss_auth->target_name, args->target_name))
+ continue;
+ }
+ if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count))
+ continue;
+ goto out;
+ }
+ if (new)
+ hash_add(gss_auth_hash_table, &new->hash, hashval);
+ gss_auth = new;
+out:
+ spin_unlock(&gss_auth_hash_lock);
+ return gss_auth;
+}
+
+static struct gss_auth *
+gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ struct gss_auth *gss_auth;
+ struct gss_auth *new;
+
+ gss_auth = gss_auth_find_or_add_hashed(args, clnt, NULL);
+ if (gss_auth != NULL)
+ goto out;
+ new = gss_create_new(args, clnt);
+ if (IS_ERR(new))
+ return new;
+ gss_auth = gss_auth_find_or_add_hashed(args, clnt, new);
+ if (gss_auth != new)
+ gss_destroy(&new->rpc_auth);
+out:
+ return gss_auth;
+}
+
+static struct rpc_auth *
+gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ struct gss_auth *gss_auth;
+ struct rpc_xprt *xprt = rcu_access_pointer(clnt->cl_xprt);
+
+ while (clnt != clnt->cl_parent) {
+ struct rpc_clnt *parent = clnt->cl_parent;
+ /* Find the original parent for this transport */
+ if (rcu_access_pointer(parent->cl_xprt) != xprt)
+ break;
+ clnt = parent;
+ }
+
+ gss_auth = gss_create_hashed(args, clnt);
+ if (IS_ERR(gss_auth))
+ return ERR_CAST(gss_auth);
+ return &gss_auth->rpc_auth;
+}
+
+/*
+ * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call
+ * to the server with the GSS control procedure field set to
+ * RPC_GSS_PROC_DESTROY. This should normally cause the server to release
+ * all RPCSEC_GSS state associated with that context.
+ */
+static int
+gss_destroying_context(struct rpc_cred *cred)
+{
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
+ struct rpc_task *task;
+
+ if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+ return 0;
+
+ ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+ cred->cr_ops = &gss_nullops;
+
+ /* Take a reference to ensure the cred will be destroyed either
+ * by the RPC call or by the put_rpccred() below */
+ get_rpccred(cred);
+
+ task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT);
+ if (!IS_ERR(task))
+ rpc_put_task(task);
+
+ put_rpccred(cred);
+ return 1;
+}
+
+/* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure
+ * to create a new cred or context, so they check that things have been
+ * allocated before freeing them. */
+static void
+gss_do_free_ctx(struct gss_cl_ctx *ctx)
+{
+ dprintk("RPC: %s\n", __func__);
+
+ gss_delete_sec_context(&ctx->gc_gss_ctx);
+ kfree(ctx->gc_wire_ctx.data);
+ kfree(ctx->gc_acceptor.data);
+ kfree(ctx);
+}
+
+static void
+gss_free_ctx_callback(struct rcu_head *head)
+{
+ struct gss_cl_ctx *ctx = container_of(head, struct gss_cl_ctx, gc_rcu);
+ gss_do_free_ctx(ctx);
+}
+
+static void
+gss_free_ctx(struct gss_cl_ctx *ctx)
+{
+ call_rcu(&ctx->gc_rcu, gss_free_ctx_callback);
+}
+
+static void
+gss_free_cred(struct gss_cred *gss_cred)
+{
+ dprintk("RPC: %s cred=%p\n", __func__, gss_cred);
+ kfree(gss_cred);
+}
+
+static void
+gss_free_cred_callback(struct rcu_head *head)
+{
+ struct gss_cred *gss_cred = container_of(head, struct gss_cred, gc_base.cr_rcu);
+ gss_free_cred(gss_cred);
+}
+
+static void
+gss_destroy_nullcred(struct rpc_cred *cred)
+{
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
+
+ RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
+ call_rcu(&cred->cr_rcu, gss_free_cred_callback);
+ if (ctx)
+ gss_put_ctx(ctx);
+ gss_put_auth(gss_auth);
+}
+
+static void
+gss_destroy_cred(struct rpc_cred *cred)
+{
+
+ if (gss_destroying_context(cred))
+ return;
+ gss_destroy_nullcred(cred);
+}
+
+/*
+ * Lookup RPCSEC_GSS cred for the current process
+ */
+static struct rpc_cred *
+gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ return rpcauth_lookup_credcache(auth, acred, flags);
+}
+
+static struct rpc_cred *
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+ struct gss_cred *cred = NULL;
+ int err = -ENOMEM;
+
+ dprintk("RPC: %s for uid %d, flavor %d\n",
+ __func__, from_kuid(&init_user_ns, acred->uid),
+ auth->au_flavor);
+
+ if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
+ goto out_err;
+
+ rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops);
+ /*
+ * Note: in order to force a call to call_refresh(), we deliberately
+ * fail to flag the credential as RPCAUTH_CRED_UPTODATE.
+ */
+ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
+ cred->gc_service = gss_auth->service;
+ cred->gc_principal = NULL;
+ if (acred->machine_cred)
+ cred->gc_principal = acred->principal;
+ kref_get(&gss_auth->kref);
+ return &cred->gc_base;
+
+out_err:
+ dprintk("RPC: %s failed with error %d\n", __func__, err);
+ return ERR_PTR(err);
+}
+
+static int
+gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+ struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+ struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base);
+ int err;
+
+ do {
+ err = gss_create_upcall(gss_auth, gss_cred);
+ } while (err == -EAGAIN);
+ return err;
+}
+
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+ char *string = NULL;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
+ unsigned int len;
+ struct xdr_netobj *acceptor;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx)
+ goto out;
+
+ len = ctx->gc_acceptor.len;
+ rcu_read_unlock();
+
+ /* no point if there's no string */
+ if (!len)
+ return NULL;
+realloc:
+ string = kmalloc(len + 1, GFP_KERNEL);
+ if (!string)
+ return NULL;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+
+ /* did the ctx disappear or was it replaced by one with no acceptor? */
+ if (!ctx || !ctx->gc_acceptor.len) {
+ kfree(string);
+ string = NULL;
+ goto out;
+ }
+
+ acceptor = &ctx->gc_acceptor;
+
+ /*
+ * Did we find a new acceptor that's longer than the original? Allocate
+ * a longer buffer and try again.
+ */
+ if (len < acceptor->len) {
+ len = acceptor->len;
+ rcu_read_unlock();
+ kfree(string);
+ goto realloc;
+ }
+
+ memcpy(string, acceptor->data, acceptor->len);
+ string[acceptor->len] = '\0';
+out:
+ rcu_read_unlock();
+ return string;
+}
+
+/*
+ * Returns -EACCES if GSS context is NULL or will expire within the
+ * timeout (miliseconds)
+ */
+static int
+gss_key_timeout(struct rpc_cred *rc)
+{
+ struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
+ unsigned long now = jiffies;
+ unsigned long expire;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+ rcu_read_unlock();
+ if (!ctx || time_after(now, expire))
+ return -EACCES;
+ return 0;
+}
+
+static int
+gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
+{
+ struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
+ int ret;
+
+ if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
+ goto out;
+ /* Don't match with creds that have expired. */
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
+ return 0;
+out:
+ if (acred->principal != NULL) {
+ if (gss_cred->gc_principal == NULL)
+ return 0;
+ ret = strcmp(acred->principal, gss_cred->gc_principal) == 0;
+ goto check_expire;
+ }
+ if (gss_cred->gc_principal != NULL)
+ return 0;
+ ret = uid_eq(rc->cr_uid, acred->uid);
+
+check_expire:
+ if (ret == 0)
+ return ret;
+
+ /* Notify acred users of GSS context expiration timeout */
+ if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags) &&
+ (gss_key_timeout(rc) != 0)) {
+ /* test will now be done from generic cred */
+ test_and_clear_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
+ /* tell NFS layer that key will expire soon */
+ set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags);
+ }
+ return ret;
+}
+
+/*
+* Marshal credentials.
+* Maybe we should keep a cached credential for performance reasons.
+*/
+static __be32 *
+gss_marshal(struct rpc_task *task, __be32 *p)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_cred *cred = req->rq_cred;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
+ gc_base);
+ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ __be32 *cred_len;
+ u32 maj_stat = 0;
+ struct xdr_netobj mic;
+ struct kvec iov;
+ struct xdr_buf verf_buf;
+
+ dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+
+ *p++ = htonl(RPC_AUTH_GSS);
+ cred_len = p++;
+
+ spin_lock(&ctx->gc_seq_lock);
+ req->rq_seqno = ctx->gc_seq++;
+ spin_unlock(&ctx->gc_seq_lock);
+
+ *p++ = htonl((u32) RPC_GSS_VERSION);
+ *p++ = htonl((u32) ctx->gc_proc);
+ *p++ = htonl((u32) req->rq_seqno);
+ *p++ = htonl((u32) gss_cred->gc_service);
+ p = xdr_encode_netobj(p, &ctx->gc_wire_ctx);
+ *cred_len = htonl((p - (cred_len + 1)) << 2);
+
+ /* We compute the checksum for the verifier over the xdr-encoded bytes
+ * starting with the xid and ending at the end of the credential: */
+ iov.iov_base = xprt_skip_transport_header(req->rq_xprt,
+ req->rq_snd_buf.head[0].iov_base);
+ iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
+ xdr_buf_from_iov(&iov, &verf_buf);
+
+ /* set verifier flavor*/
+ *p++ = htonl(RPC_AUTH_GSS);
+
+ mic.data = (u8 *)(p + 1);
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ } else if (maj_stat != 0) {
+ printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
+ goto out_put_ctx;
+ }
+ p = xdr_encode_opaque(p, NULL, mic.len);
+ gss_put_ctx(ctx);
+ return p;
+out_put_ctx:
+ gss_put_ctx(ctx);
+ return NULL;
+}
+
+static int gss_renew_cred(struct rpc_task *task)
+{
+ struct rpc_cred *oldcred = task->tk_rqstp->rq_cred;
+ struct gss_cred *gss_cred = container_of(oldcred,
+ struct gss_cred,
+ gc_base);
+ struct rpc_auth *auth = oldcred->cr_auth;
+ struct auth_cred acred = {
+ .uid = oldcred->cr_uid,
+ .principal = gss_cred->gc_principal,
+ .machine_cred = (gss_cred->gc_principal != NULL ? 1 : 0),
+ };
+ struct rpc_cred *new;
+
+ new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ task->tk_rqstp->rq_cred = new;
+ put_rpccred(oldcred);
+ return 0;
+}
+
+static int gss_cred_is_negative_entry(struct rpc_cred *cred)
+{
+ if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) {
+ unsigned long now = jiffies;
+ unsigned long begin, expire;
+ struct gss_cred *gss_cred;
+
+ gss_cred = container_of(cred, struct gss_cred, gc_base);
+ begin = gss_cred->gc_upcall_timestamp;
+ expire = begin + gss_expired_cred_retry_delay * HZ;
+
+ if (time_in_range_open(now, begin, expire))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+* Refresh credentials. XXX - finish
+*/
+static int
+gss_refresh(struct rpc_task *task)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ int ret = 0;
+
+ if (gss_cred_is_negative_entry(cred))
+ return -EKEYEXPIRED;
+
+ if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
+ !test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) {
+ ret = gss_renew_cred(task);
+ if (ret < 0)
+ goto out;
+ cred = task->tk_rqstp->rq_cred;
+ }
+
+ if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+ ret = gss_refresh_upcall(task);
+out:
+ return ret;
+}
+
+/* Dummy refresh routine: used only when destroying the context */
+static int
+gss_refresh_null(struct rpc_task *task)
+{
+ return 0;
+}
+
+static __be32 *
+gss_validate(struct rpc_task *task, __be32 *p)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ __be32 seq;
+ struct kvec iov;
+ struct xdr_buf verf_buf;
+ struct xdr_netobj mic;
+ u32 flav,len;
+ u32 maj_stat;
+ __be32 *ret = ERR_PTR(-EIO);
+
+ dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+
+ flav = ntohl(*p++);
+ if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE)
+ goto out_bad;
+ if (flav != RPC_AUTH_GSS)
+ goto out_bad;
+ seq = htonl(task->tk_rqstp->rq_seqno);
+ iov.iov_base = &seq;
+ iov.iov_len = sizeof(seq);
+ xdr_buf_from_iov(&iov, &verf_buf);
+ mic.data = (u8 *)p;
+ mic.len = len;
+
+ ret = ERR_PTR(-EACCES);
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ if (maj_stat) {
+ dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
+ task->tk_pid, __func__, maj_stat);
+ goto out_bad;
+ }
+ /* We leave it to unwrap to calculate au_rslack. For now we just
+ * calculate the length of the verifier: */
+ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+ gss_put_ctx(ctx);
+ dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
+ task->tk_pid, __func__);
+ return p + XDR_QUADLEN(len);
+out_bad:
+ gss_put_ctx(ctx);
+ dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__,
+ PTR_ERR(ret));
+ return ret;
+}
+
+static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
+ __be32 *p, void *obj)
+{
+ struct xdr_stream xdr;
+
+ xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p);
+ encode(rqstp, &xdr, obj);
+}
+
+static inline int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ kxdreproc_t encode, struct rpc_rqst *rqstp,
+ __be32 *p, void *obj)
+{
+ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ struct xdr_buf integ_buf;
+ __be32 *integ_len = NULL;
+ struct xdr_netobj mic;
+ u32 offset;
+ __be32 *q;
+ struct kvec *iov;
+ u32 maj_stat = 0;
+ int status = -EIO;
+
+ integ_len = p++;
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+ *p++ = htonl(rqstp->rq_seqno);
+
+ gss_wrap_req_encode(encode, rqstp, p, obj);
+
+ if (xdr_buf_subsegment(snd_buf, &integ_buf,
+ offset, snd_buf->len - offset))
+ return status;
+ *integ_len = htonl(integ_buf.len);
+
+ /* guess whether we're in the head or the tail: */
+ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+ iov = snd_buf->tail;
+ else
+ iov = snd_buf->head;
+ p = iov->iov_base + iov->iov_len;
+ mic.data = (u8 *)(p + 1);
+
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+ status = -EIO; /* XXX? */
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ else if (maj_stat)
+ return status;
+ q = xdr_encode_opaque(p, NULL, mic.len);
+
+ offset = (u8 *)q - (u8 *)p;
+ iov->iov_len += offset;
+ snd_buf->len += offset;
+ return 0;
+}
+
+static void
+priv_release_snd_buf(struct rpc_rqst *rqstp)
+{
+ int i;
+
+ for (i=0; i < rqstp->rq_enc_pages_num; i++)
+ __free_page(rqstp->rq_enc_pages[i]);
+ kfree(rqstp->rq_enc_pages);
+}
+
+static int
+alloc_enc_pages(struct rpc_rqst *rqstp)
+{
+ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ int first, last, i;
+
+ if (snd_buf->page_len == 0) {
+ rqstp->rq_enc_pages_num = 0;
+ return 0;
+ }
+
+ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
+ rqstp->rq_enc_pages_num = last - first + 1 + 1;
+ rqstp->rq_enc_pages
+ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
+ GFP_NOFS);
+ if (!rqstp->rq_enc_pages)
+ goto out;
+ for (i=0; i < rqstp->rq_enc_pages_num; i++) {
+ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
+ if (rqstp->rq_enc_pages[i] == NULL)
+ goto out_free;
+ }
+ rqstp->rq_release_snd_buf = priv_release_snd_buf;
+ return 0;
+out_free:
+ rqstp->rq_enc_pages_num = i;
+ priv_release_snd_buf(rqstp);
+out:
+ return -EAGAIN;
+}
+
+static inline int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ kxdreproc_t encode, struct rpc_rqst *rqstp,
+ __be32 *p, void *obj)
+{
+ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ u32 offset;
+ u32 maj_stat;
+ int status;
+ __be32 *opaque_len;
+ struct page **inpages;
+ int first;
+ int pad;
+ struct kvec *iov;
+ char *tmp;
+
+ opaque_len = p++;
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+ *p++ = htonl(rqstp->rq_seqno);
+
+ gss_wrap_req_encode(encode, rqstp, p, obj);
+
+ status = alloc_enc_pages(rqstp);
+ if (status)
+ return status;
+ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+ inpages = snd_buf->pages + first;
+ snd_buf->pages = rqstp->rq_enc_pages;
+ snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
+ /*
+ * Give the tail its own page, in case we need extra space in the
+ * head when wrapping:
+ *
+ * call_allocate() allocates twice the slack space required
+ * by the authentication flavor to rq_callsize.
+ * For GSS, slack is GSS_CRED_SLACK.
+ */
+ if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
+ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
+ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
+ snd_buf->tail[0].iov_base = tmp;
+ }
+ maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
+ /* slack space should prevent this ever happening: */
+ BUG_ON(snd_buf->len > snd_buf->buflen);
+ status = -EIO;
+ /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
+ * done anyway, so it's safe to put the request on the wire: */
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ else if (maj_stat)
+ return status;
+
+ *opaque_len = htonl(snd_buf->len - offset);
+ /* guess whether we're in the head or the tail: */
+ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+ iov = snd_buf->tail;
+ else
+ iov = snd_buf->head;
+ p = iov->iov_base + iov->iov_len;
+ pad = 3 - ((snd_buf->len - offset - 1) & 3);
+ memset(p, 0, pad);
+ iov->iov_len += pad;
+ snd_buf->len += pad;
+
+ return 0;
+}
+
+static int
+gss_wrap_req(struct rpc_task *task,
+ kxdreproc_t encode, void *rqstp, __be32 *p, void *obj)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
+ gc_base);
+ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ int status = -EIO;
+
+ dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
+ if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
+ /* The spec seems a little ambiguous here, but I think that not
+ * wrapping context destruction requests makes the most sense.
+ */
+ gss_wrap_req_encode(encode, rqstp, p, obj);
+ status = 0;
+ goto out;
+ }
+ switch (gss_cred->gc_service) {
+ case RPC_GSS_SVC_NONE:
+ gss_wrap_req_encode(encode, rqstp, p, obj);
+ status = 0;
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+ status = gss_wrap_req_integ(cred, ctx, encode, rqstp, p, obj);
+ break;
+ case RPC_GSS_SVC_PRIVACY:
+ status = gss_wrap_req_priv(cred, ctx, encode, rqstp, p, obj);
+ break;
+ }
+out:
+ gss_put_ctx(ctx);
+ dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status);
+ return status;
+}
+
+static inline int
+gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_rqst *rqstp, __be32 **p)
+{
+ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+ struct xdr_buf integ_buf;
+ struct xdr_netobj mic;
+ u32 data_offset, mic_offset;
+ u32 integ_len;
+ u32 maj_stat;
+ int status = -EIO;
+
+ integ_len = ntohl(*(*p)++);
+ if (integ_len & 3)
+ return status;
+ data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+ mic_offset = integ_len + data_offset;
+ if (mic_offset > rcv_buf->len)
+ return status;
+ if (ntohl(*(*p)++) != rqstp->rq_seqno)
+ return status;
+
+ if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
+ mic_offset - data_offset))
+ return status;
+
+ if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
+ return status;
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ if (maj_stat != GSS_S_COMPLETE)
+ return status;
+ return 0;
+}
+
+static inline int
+gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_rqst *rqstp, __be32 **p)
+{
+ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+ u32 offset;
+ u32 opaque_len;
+ u32 maj_stat;
+ int status = -EIO;
+
+ opaque_len = ntohl(*(*p)++);
+ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+ if (offset + opaque_len > rcv_buf->len)
+ return status;
+ /* remove padding: */
+ rcv_buf->len = offset + opaque_len;
+
+ maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
+ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+ clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+ if (maj_stat != GSS_S_COMPLETE)
+ return status;
+ if (ntohl(*(*p)++) != rqstp->rq_seqno)
+ return status;
+
+ return 0;
+}
+
+static int
+gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
+ __be32 *p, void *obj)
+{
+ struct xdr_stream xdr;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ return decode(rqstp, &xdr, obj);
+}
+
+static int
+gss_unwrap_resp(struct rpc_task *task,
+ kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
+{
+ struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
+ gc_base);
+ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+ __be32 *savedp = p;
+ struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head;
+ int savedlen = head->iov_len;
+ int status = -EIO;
+
+ if (ctx->gc_proc != RPC_GSS_PROC_DATA)
+ goto out_decode;
+ switch (gss_cred->gc_service) {
+ case RPC_GSS_SVC_NONE:
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+ status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p);
+ if (status)
+ goto out;
+ break;
+ case RPC_GSS_SVC_PRIVACY:
+ status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
+ if (status)
+ goto out;
+ break;
+ }
+ /* take into account extra slack for integrity and privacy cases: */
+ cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
+ + (savedlen - head->iov_len);
+out_decode:
+ status = gss_unwrap_req_decode(decode, rqstp, p, obj);
+out:
+ gss_put_ctx(ctx);
+ dprintk("RPC: %5u %s returning %d\n",
+ task->tk_pid, __func__, status);
+ return status;
+}
+
+static const struct rpc_authops authgss_ops = {
+ .owner = THIS_MODULE,
+ .au_flavor = RPC_AUTH_GSS,
+ .au_name = "RPCSEC_GSS",
+ .create = gss_create,
+ .destroy = gss_destroy,
+ .lookup_cred = gss_lookup_cred,
+ .crcreate = gss_create_cred,
+ .list_pseudoflavors = gss_mech_list_pseudoflavors,
+ .info2flavor = gss_mech_info2flavor,
+ .flavor2info = gss_mech_flavor2info,
+};
+
+static const struct rpc_credops gss_credops = {
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_cred,
+ .cr_init = gss_cred_init,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crkey_timeout = gss_key_timeout,
+ .crstringify_acceptor = gss_stringify_acceptor,
+};
+
+static const struct rpc_credops gss_nullops = {
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_nullcred,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh_null,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crstringify_acceptor = gss_stringify_acceptor,
+};
+
+static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = gss_pipe_downcall,
+ .destroy_msg = gss_pipe_destroy_msg,
+ .open_pipe = gss_pipe_open_v0,
+ .release_pipe = gss_pipe_release,
+};
+
+static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = gss_pipe_downcall,
+ .destroy_msg = gss_pipe_destroy_msg,
+ .open_pipe = gss_pipe_open_v1,
+ .release_pipe = gss_pipe_release,
+};
+
+static __net_init int rpcsec_gss_init_net(struct net *net)
+{
+ return gss_svc_init_net(net);
+}
+
+static __net_exit void rpcsec_gss_exit_net(struct net *net)
+{
+ gss_svc_shutdown_net(net);
+}
+
+static struct pernet_operations rpcsec_gss_net_ops = {
+ .init = rpcsec_gss_init_net,
+ .exit = rpcsec_gss_exit_net,
+};
+
+/*
+ * Initialize RPCSEC_GSS module
+ */
+static int __init init_rpcsec_gss(void)
+{
+ int err = 0;
+
+ err = rpcauth_register(&authgss_ops);
+ if (err)
+ goto out;
+ err = gss_svc_init();
+ if (err)
+ goto out_unregister;
+ err = register_pernet_subsys(&rpcsec_gss_net_ops);
+ if (err)
+ goto out_svc_exit;
+ rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version");
+ return 0;
+out_svc_exit:
+ gss_svc_shutdown();
+out_unregister:
+ rpcauth_unregister(&authgss_ops);
+out:
+ return err;
+}
+
+static void __exit exit_rpcsec_gss(void)
+{
+ unregister_pernet_subsys(&rpcsec_gss_net_ops);
+ gss_svc_shutdown();
+ rpcauth_unregister(&authgss_ops);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+MODULE_ALIAS("rpc-auth-6");
+MODULE_LICENSE("GPL");
+module_param_named(expired_cred_retry_delay,
+ gss_expired_cred_retry_delay,
+ uint, 0644);
+MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until "
+ "the RPC engine retries an expired credential");
+
+module_param_named(key_expire_timeo,
+ gss_key_expire_timeo,
+ uint, 0644);
+MODULE_PARM_DESC(key_expire_timeo, "Time (in seconds) at the end of a "
+ "credential keys lifetime where the NFS layer cleans up "
+ "prior to key expiration");
+
+module_init(init_rpcsec_gss)
+module_exit(exit_rpcsec_gss)
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
new file mode 100644
index 000000000..254defe44
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -0,0 +1,234 @@
+/*
+ * linux/net/sunrpc/gss_generic_token.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_asn1.h>
+
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len) \
+ memcpy((ptr), (char *) (str), (len)); \
+ (ptr) += (len);
+
+/* XXXX this code currently makes the assumption that a mech oid will
+ never be longer than 127 bytes. This assumption is not inherent in
+ the interfaces, so the code can be fixed if the OSI namespace
+ balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60 tag for APPLICATION 0, SEQUENCE
+ (constructed, definite-length)
+ <length> possible multiple bytes, need to parse/generate
+ 0x06 tag for OBJECT IDENTIFIER
+ <moid_length> compile-time constant string (assume 1 byte)
+ <moid_bytes> compile-time constant string
+ <inner_bytes> the ANY containing the application token
+ bytes 0,1 are the token type
+ bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type. The token
+"body" consists of everything else.
+
+*/
+
+static int
+der_length_size( int length)
+{
+ if (length < (1<<7))
+ return 1;
+ else if (length < (1<<8))
+ return 2;
+#if (SIZEOF_INT == 2)
+ else
+ return 3;
+#else
+ else if (length < (1<<16))
+ return 3;
+ else if (length < (1<<24))
+ return 4;
+ else
+ return 5;
+#endif
+}
+
+static void
+der_write_length(unsigned char **buf, int length)
+{
+ if (length < (1<<7)) {
+ *(*buf)++ = (unsigned char) length;
+ } else {
+ *(*buf)++ = (unsigned char) (der_length_size(length)+127);
+#if (SIZEOF_INT > 2)
+ if (length >= (1<<24))
+ *(*buf)++ = (unsigned char) (length>>24);
+ if (length >= (1<<16))
+ *(*buf)++ = (unsigned char) ((length>>16)&0xff);
+#endif
+ if (length >= (1<<8))
+ *(*buf)++ = (unsigned char) ((length>>8)&0xff);
+ *(*buf)++ = (unsigned char) (length&0xff);
+ }
+}
+
+/* returns decoded length, or < 0 on failure. Advances buf and
+ decrements bufsize */
+
+static int
+der_read_length(unsigned char **buf, int *bufsize)
+{
+ unsigned char sf;
+ int ret;
+
+ if (*bufsize < 1)
+ return -1;
+ sf = *(*buf)++;
+ (*bufsize)--;
+ if (sf & 0x80) {
+ if ((sf &= 0x7f) > ((*bufsize)-1))
+ return -1;
+ if (sf > SIZEOF_INT)
+ return -1;
+ ret = 0;
+ for (; sf; sf--) {
+ ret = (ret<<8) + (*(*buf)++);
+ (*bufsize)--;
+ }
+ } else {
+ ret = sf;
+ }
+
+ return ret;
+}
+
+/* returns the length of a token, given the mech oid and the body size */
+
+int
+g_token_size(struct xdr_netobj *mech, unsigned int body_size)
+{
+ /* set body_size to sequence contents size */
+ body_size += 2 + (int) mech->len; /* NEED overflow check */
+ return 1 + der_length_size(body_size) + body_size;
+}
+
+EXPORT_SYMBOL_GPL(g_token_size);
+
+/* fills in a buffer with the token header. The buffer is assumed to
+ be the right size. buf is advanced past the token header */
+
+void
+g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf)
+{
+ *(*buf)++ = 0x60;
+ der_write_length(buf, 2 + mech->len + body_size);
+ *(*buf)++ = 0x06;
+ *(*buf)++ = (unsigned char) mech->len;
+ TWRITE_STR(*buf, mech->data, ((int) mech->len));
+}
+
+EXPORT_SYMBOL_GPL(g_make_token_header);
+
+/*
+ * Given a buffer containing a token, reads and verifies the token,
+ * leaving buf advanced past the token header, and setting body_size
+ * to the number of remaining bytes. Returns 0 on success,
+ * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
+ * mechanism in the token does not match the mech argument. buf and
+ * *body_size are left unmodified on error.
+ */
+u32
+g_verify_token_header(struct xdr_netobj *mech, int *body_size,
+ unsigned char **buf_in, int toksize)
+{
+ unsigned char *buf = *buf_in;
+ int seqsize;
+ struct xdr_netobj toid;
+ int ret = 0;
+
+ if ((toksize-=1) < 0)
+ return G_BAD_TOK_HEADER;
+ if (*buf++ != 0x60)
+ return G_BAD_TOK_HEADER;
+
+ if ((seqsize = der_read_length(&buf, &toksize)) < 0)
+ return G_BAD_TOK_HEADER;
+
+ if (seqsize != toksize)
+ return G_BAD_TOK_HEADER;
+
+ if ((toksize-=1) < 0)
+ return G_BAD_TOK_HEADER;
+ if (*buf++ != 0x06)
+ return G_BAD_TOK_HEADER;
+
+ if ((toksize-=1) < 0)
+ return G_BAD_TOK_HEADER;
+ toid.len = *buf++;
+
+ if ((toksize-=toid.len) < 0)
+ return G_BAD_TOK_HEADER;
+ toid.data = buf;
+ buf+=toid.len;
+
+ if (! g_OID_equal(&toid, mech))
+ ret = G_WRONG_MECH;
+
+ /* G_WRONG_MECH is not returned immediately because it's more important
+ to return G_BAD_TOK_HEADER if the token header is in fact bad */
+
+ if ((toksize-=2) < 0)
+ return G_BAD_TOK_HEADER;
+
+ if (ret)
+ return ret;
+
+ if (!ret) {
+ *buf_in = buf;
+ *body_size = toksize;
+ }
+
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(g_verify_token_header);
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
new file mode 100644
index 000000000..b5408e8a3
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -0,0 +1,988 @@
+/*
+ * linux/net/sunrpc/gss_krb5_crypto.c
+ *
+ * Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+u32
+krb5_encrypt(
+ struct crypto_blkcipher *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length)
+{
+ u32 ret = -EINVAL;
+ struct scatterlist sg[1];
+ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
+ struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+
+ if (length % crypto_blkcipher_blocksize(tfm) != 0)
+ goto out;
+
+ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n",
+ crypto_blkcipher_ivsize(tfm));
+ goto out;
+ }
+
+ if (iv)
+ memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm));
+
+ memcpy(out, in, length);
+ sg_init_one(sg, out, length);
+
+ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, length);
+out:
+ dprintk("RPC: krb5_encrypt returns %d\n", ret);
+ return ret;
+}
+
+u32
+krb5_decrypt(
+ struct crypto_blkcipher *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length)
+{
+ u32 ret = -EINVAL;
+ struct scatterlist sg[1];
+ u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
+ struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+
+ if (length % crypto_blkcipher_blocksize(tfm) != 0)
+ goto out;
+
+ if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n",
+ crypto_blkcipher_ivsize(tfm));
+ goto out;
+ }
+ if (iv)
+ memcpy(local_iv,iv, crypto_blkcipher_ivsize(tfm));
+
+ memcpy(out, in, length);
+ sg_init_one(sg, out, length);
+
+ ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, length);
+out:
+ dprintk("RPC: gss_k5decrypt returns %d\n",ret);
+ return ret;
+}
+
+static int
+checksummer(struct scatterlist *sg, void *data)
+{
+ struct hash_desc *desc = data;
+
+ return crypto_hash_update(desc, sg, sg->length);
+}
+
+static int
+arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4])
+{
+ unsigned int ms_usage;
+
+ switch (usage) {
+ case KG_USAGE_SIGN:
+ ms_usage = 15;
+ break;
+ case KG_USAGE_SEAL:
+ ms_usage = 13;
+ break;
+ default:
+ return -EINVAL;
+ }
+ salt[0] = (ms_usage >> 0) & 0xff;
+ salt[1] = (ms_usage >> 8) & 0xff;
+ salt[2] = (ms_usage >> 16) & 0xff;
+ salt[3] = (ms_usage >> 24) & 0xff;
+
+ return 0;
+}
+
+static u32
+make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
+ struct xdr_buf *body, int body_offset, u8 *cksumkey,
+ unsigned int usage, struct xdr_netobj *cksumout)
+{
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ int err;
+ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ u8 rc4salt[4];
+ struct crypto_hash *md5;
+ struct crypto_hash *hmac_md5;
+
+ if (cksumkey == NULL)
+ return GSS_S_FAILURE;
+
+ if (cksumout->len < kctx->gk5e->cksumlength) {
+ dprintk("%s: checksum buffer length, %u, too small for %s\n",
+ __func__, cksumout->len, kctx->gk5e->name);
+ return GSS_S_FAILURE;
+ }
+
+ if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
+ dprintk("%s: invalid usage value %u\n", __func__, usage);
+ return GSS_S_FAILURE;
+ }
+
+ md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(md5))
+ return GSS_S_FAILURE;
+
+ hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hmac_md5)) {
+ crypto_free_hash(md5);
+ return GSS_S_FAILURE;
+ }
+
+ desc.tfm = md5;
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out;
+ sg_init_one(sg, rc4salt, 4);
+ err = crypto_hash_update(&desc, sg, 4);
+ if (err)
+ goto out;
+
+ sg_init_one(sg, header, hdrlen);
+ err = crypto_hash_update(&desc, sg, hdrlen);
+ if (err)
+ goto out;
+ err = xdr_process_buf(body, body_offset, body->len - body_offset,
+ checksummer, &desc);
+ if (err)
+ goto out;
+ err = crypto_hash_final(&desc, checksumdata);
+ if (err)
+ goto out;
+
+ desc.tfm = hmac_md5;
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out;
+ err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
+ if (err)
+ goto out;
+
+ sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5));
+ err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5),
+ checksumdata);
+ if (err)
+ goto out;
+
+ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+ cksumout->len = kctx->gk5e->cksumlength;
+out:
+ crypto_free_hash(md5);
+ crypto_free_hash(hmac_md5);
+ return err ? GSS_S_FAILURE : 0;
+}
+
+/*
+ * checksum the plaintext data and hdrlen bytes of the token header
+ * The checksum is performed over the first 8 bytes of the
+ * gss token header and then over the data body
+ */
+u32
+make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
+ struct xdr_buf *body, int body_offset, u8 *cksumkey,
+ unsigned int usage, struct xdr_netobj *cksumout)
+{
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ int err;
+ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ unsigned int checksumlen;
+
+ if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
+ return make_checksum_hmac_md5(kctx, header, hdrlen,
+ body, body_offset,
+ cksumkey, usage, cksumout);
+
+ if (cksumout->len < kctx->gk5e->cksumlength) {
+ dprintk("%s: checksum buffer length, %u, too small for %s\n",
+ __func__, cksumout->len, kctx->gk5e->name);
+ return GSS_S_FAILURE;
+ }
+
+ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(desc.tfm))
+ return GSS_S_FAILURE;
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ checksumlen = crypto_hash_digestsize(desc.tfm);
+
+ if (cksumkey != NULL) {
+ err = crypto_hash_setkey(desc.tfm, cksumkey,
+ kctx->gk5e->keylength);
+ if (err)
+ goto out;
+ }
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out;
+ sg_init_one(sg, header, hdrlen);
+ err = crypto_hash_update(&desc, sg, hdrlen);
+ if (err)
+ goto out;
+ err = xdr_process_buf(body, body_offset, body->len - body_offset,
+ checksummer, &desc);
+ if (err)
+ goto out;
+ err = crypto_hash_final(&desc, checksumdata);
+ if (err)
+ goto out;
+
+ switch (kctx->gk5e->ctype) {
+ case CKSUMTYPE_RSA_MD5:
+ err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata,
+ checksumdata, checksumlen);
+ if (err)
+ goto out;
+ memcpy(cksumout->data,
+ checksumdata + checksumlen - kctx->gk5e->cksumlength,
+ kctx->gk5e->cksumlength);
+ break;
+ case CKSUMTYPE_HMAC_SHA1_DES3:
+ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ cksumout->len = kctx->gk5e->cksumlength;
+out:
+ crypto_free_hash(desc.tfm);
+ return err ? GSS_S_FAILURE : 0;
+}
+
+/*
+ * checksum the plaintext data and hdrlen bytes of the token header
+ * Per rfc4121, sec. 4.2.4, the checksum is performed over the data
+ * body then over the first 16 octets of the MIC token
+ * Inclusion of the header data in the calculation of the
+ * checksum is optional.
+ */
+u32
+make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
+ struct xdr_buf *body, int body_offset, u8 *cksumkey,
+ unsigned int usage, struct xdr_netobj *cksumout)
+{
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ int err;
+ u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ unsigned int checksumlen;
+
+ if (kctx->gk5e->keyed_cksum == 0) {
+ dprintk("%s: expected keyed hash for %s\n",
+ __func__, kctx->gk5e->name);
+ return GSS_S_FAILURE;
+ }
+ if (cksumkey == NULL) {
+ dprintk("%s: no key supplied for %s\n",
+ __func__, kctx->gk5e->name);
+ return GSS_S_FAILURE;
+ }
+
+ desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(desc.tfm))
+ return GSS_S_FAILURE;
+ checksumlen = crypto_hash_digestsize(desc.tfm);
+ desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength);
+ if (err)
+ goto out;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out;
+ err = xdr_process_buf(body, body_offset, body->len - body_offset,
+ checksummer, &desc);
+ if (err)
+ goto out;
+ if (header != NULL) {
+ sg_init_one(sg, header, hdrlen);
+ err = crypto_hash_update(&desc, sg, hdrlen);
+ if (err)
+ goto out;
+ }
+ err = crypto_hash_final(&desc, checksumdata);
+ if (err)
+ goto out;
+
+ cksumout->len = kctx->gk5e->cksumlength;
+
+ switch (kctx->gk5e->ctype) {
+ case CKSUMTYPE_HMAC_SHA1_96_AES128:
+ case CKSUMTYPE_HMAC_SHA1_96_AES256:
+ /* note that this truncates the hash */
+ memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+ break;
+ default:
+ BUG();
+ break;
+ }
+out:
+ crypto_free_hash(desc.tfm);
+ return err ? GSS_S_FAILURE : 0;
+}
+
+struct encryptor_desc {
+ u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
+ struct blkcipher_desc desc;
+ int pos;
+ struct xdr_buf *outbuf;
+ struct page **pages;
+ struct scatterlist infrags[4];
+ struct scatterlist outfrags[4];
+ int fragno;
+ int fraglen;
+};
+
+static int
+encryptor(struct scatterlist *sg, void *data)
+{
+ struct encryptor_desc *desc = data;
+ struct xdr_buf *outbuf = desc->outbuf;
+ struct page *in_page;
+ int thislen = desc->fraglen + sg->length;
+ int fraglen, ret;
+ int page_pos;
+
+ /* Worst case is 4 fragments: head, end of page 1, start
+ * of page 2, tail. Anything more is a bug. */
+ BUG_ON(desc->fragno > 3);
+
+ page_pos = desc->pos - outbuf->head[0].iov_len;
+ if (page_pos >= 0 && page_pos < outbuf->page_len) {
+ /* pages are not in place: */
+ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
+ in_page = desc->pages[i];
+ } else {
+ in_page = sg_page(sg);
+ }
+ sg_set_page(&desc->infrags[desc->fragno], in_page, sg->length,
+ sg->offset);
+ sg_set_page(&desc->outfrags[desc->fragno], sg_page(sg), sg->length,
+ sg->offset);
+ desc->fragno++;
+ desc->fraglen += sg->length;
+ desc->pos += sg->length;
+
+ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+ thislen -= fraglen;
+
+ if (thislen == 0)
+ return 0;
+
+ sg_mark_end(&desc->infrags[desc->fragno - 1]);
+ sg_mark_end(&desc->outfrags[desc->fragno - 1]);
+
+ ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags,
+ desc->infrags, thislen);
+ if (ret)
+ return ret;
+
+ sg_init_table(desc->infrags, 4);
+ sg_init_table(desc->outfrags, 4);
+
+ if (fraglen) {
+ sg_set_page(&desc->outfrags[0], sg_page(sg), fraglen,
+ sg->offset + sg->length - fraglen);
+ desc->infrags[0] = desc->outfrags[0];
+ sg_assign_page(&desc->infrags[0], in_page);
+ desc->fragno = 1;
+ desc->fraglen = fraglen;
+ } else {
+ desc->fragno = 0;
+ desc->fraglen = 0;
+ }
+ return 0;
+}
+
+int
+gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+ int offset, struct page **pages)
+{
+ int ret;
+ struct encryptor_desc desc;
+
+ BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+
+ memset(desc.iv, 0, sizeof(desc.iv));
+ desc.desc.tfm = tfm;
+ desc.desc.info = desc.iv;
+ desc.desc.flags = 0;
+ desc.pos = offset;
+ desc.outbuf = buf;
+ desc.pages = pages;
+ desc.fragno = 0;
+ desc.fraglen = 0;
+
+ sg_init_table(desc.infrags, 4);
+ sg_init_table(desc.outfrags, 4);
+
+ ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc);
+ return ret;
+}
+
+struct decryptor_desc {
+ u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
+ struct blkcipher_desc desc;
+ struct scatterlist frags[4];
+ int fragno;
+ int fraglen;
+};
+
+static int
+decryptor(struct scatterlist *sg, void *data)
+{
+ struct decryptor_desc *desc = data;
+ int thislen = desc->fraglen + sg->length;
+ int fraglen, ret;
+
+ /* Worst case is 4 fragments: head, end of page 1, start
+ * of page 2, tail. Anything more is a bug. */
+ BUG_ON(desc->fragno > 3);
+ sg_set_page(&desc->frags[desc->fragno], sg_page(sg), sg->length,
+ sg->offset);
+ desc->fragno++;
+ desc->fraglen += sg->length;
+
+ fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+ thislen -= fraglen;
+
+ if (thislen == 0)
+ return 0;
+
+ sg_mark_end(&desc->frags[desc->fragno - 1]);
+
+ ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags,
+ desc->frags, thislen);
+ if (ret)
+ return ret;
+
+ sg_init_table(desc->frags, 4);
+
+ if (fraglen) {
+ sg_set_page(&desc->frags[0], sg_page(sg), fraglen,
+ sg->offset + sg->length - fraglen);
+ desc->fragno = 1;
+ desc->fraglen = fraglen;
+ } else {
+ desc->fragno = 0;
+ desc->fraglen = 0;
+ }
+ return 0;
+}
+
+int
+gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+ int offset)
+{
+ struct decryptor_desc desc;
+
+ /* XXXJBF: */
+ BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+
+ memset(desc.iv, 0, sizeof(desc.iv));
+ desc.desc.tfm = tfm;
+ desc.desc.info = desc.iv;
+ desc.desc.flags = 0;
+ desc.fragno = 0;
+ desc.fraglen = 0;
+
+ sg_init_table(desc.frags, 4);
+
+ return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
+}
+
+/*
+ * This function makes the assumption that it was ultimately called
+ * from gss_wrap().
+ *
+ * The client auth_gss code moves any existing tail data into a
+ * separate page before calling gss_wrap.
+ * The server svcauth_gss code ensures that both the head and the
+ * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap.
+ *
+ * Even with that guarantee, this function may be called more than
+ * once in the processing of gss_wrap(). The best we can do is
+ * verify at compile-time (see GSS_KRB5_SLACK_CHECK) that the
+ * largest expected shift will fit within RPC_MAX_AUTH_SIZE.
+ * At run-time we can verify that a single invocation of this
+ * function doesn't attempt to use more the RPC_MAX_AUTH_SIZE.
+ */
+
+int
+xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen)
+{
+ u8 *p;
+
+ if (shiftlen == 0)
+ return 0;
+
+ BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE);
+ BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE);
+
+ p = buf->head[0].iov_base + base;
+
+ memmove(p + shiftlen, p, buf->head[0].iov_len - base);
+
+ buf->head[0].iov_len += shiftlen;
+ buf->len += shiftlen;
+
+ return 0;
+}
+
+static u32
+gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
+ u32 offset, u8 *iv, struct page **pages, int encrypt)
+{
+ u32 ret;
+ struct scatterlist sg[1];
+ struct blkcipher_desc desc = { .tfm = cipher, .info = iv };
+ u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];
+ struct page **save_pages;
+ u32 len = buf->len - offset;
+
+ if (len > ARRAY_SIZE(data)) {
+ WARN_ON(0);
+ return -ENOMEM;
+ }
+
+ /*
+ * For encryption, we want to read from the cleartext
+ * page cache pages, and write the encrypted data to
+ * the supplied xdr_buf pages.
+ */
+ save_pages = buf->pages;
+ if (encrypt)
+ buf->pages = pages;
+
+ ret = read_bytes_from_xdr_buf(buf, offset, data, len);
+ buf->pages = save_pages;
+ if (ret)
+ goto out;
+
+ sg_init_one(sg, data, len);
+
+ if (encrypt)
+ ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+ else
+ ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len);
+
+ if (ret)
+ goto out;
+
+ ret = write_bytes_to_xdr_buf(buf, offset, data, len);
+
+out:
+ return ret;
+}
+
+u32
+gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
+ struct xdr_buf *buf, struct page **pages)
+{
+ u32 err;
+ struct xdr_netobj hmac;
+ u8 *cksumkey;
+ u8 *ecptr;
+ struct crypto_blkcipher *cipher, *aux_cipher;
+ int blocksize;
+ struct page **save_pages;
+ int nblocks, nbytes;
+ struct encryptor_desc desc;
+ u32 cbcbytes;
+ unsigned int usage;
+
+ if (kctx->initiate) {
+ cipher = kctx->initiator_enc;
+ aux_cipher = kctx->initiator_enc_aux;
+ cksumkey = kctx->initiator_integ;
+ usage = KG_USAGE_INITIATOR_SEAL;
+ } else {
+ cipher = kctx->acceptor_enc;
+ aux_cipher = kctx->acceptor_enc_aux;
+ cksumkey = kctx->acceptor_integ;
+ usage = KG_USAGE_ACCEPTOR_SEAL;
+ }
+ blocksize = crypto_blkcipher_blocksize(cipher);
+
+ /* hide the gss token header and insert the confounder */
+ offset += GSS_KRB5_TOK_HDR_LEN;
+ if (xdr_extend_head(buf, offset, kctx->gk5e->conflen))
+ return GSS_S_FAILURE;
+ gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen);
+ offset -= GSS_KRB5_TOK_HDR_LEN;
+
+ if (buf->tail[0].iov_base != NULL) {
+ ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len;
+ } else {
+ buf->tail[0].iov_base = buf->head[0].iov_base
+ + buf->head[0].iov_len;
+ buf->tail[0].iov_len = 0;
+ ecptr = buf->tail[0].iov_base;
+ }
+
+ /* copy plaintext gss token header after filler (if any) */
+ memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
+ buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
+ buf->len += GSS_KRB5_TOK_HDR_LEN;
+
+ /* Do the HMAC */
+ hmac.len = GSS_KRB5_MAX_CKSUM_LEN;
+ hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+
+ /*
+ * When we are called, pages points to the real page cache
+ * data -- which we can't go and encrypt! buf->pages points
+ * to scratch pages which we are going to send off to the
+ * client/server. Swap in the plaintext pages to calculate
+ * the hmac.
+ */
+ save_pages = buf->pages;
+ buf->pages = pages;
+
+ err = make_checksum_v2(kctx, NULL, 0, buf,
+ offset + GSS_KRB5_TOK_HDR_LEN,
+ cksumkey, usage, &hmac);
+ buf->pages = save_pages;
+ if (err)
+ return GSS_S_FAILURE;
+
+ nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN;
+ nblocks = (nbytes + blocksize - 1) / blocksize;
+ cbcbytes = 0;
+ if (nblocks > 2)
+ cbcbytes = (nblocks - 2) * blocksize;
+
+ memset(desc.iv, 0, sizeof(desc.iv));
+
+ if (cbcbytes) {
+ desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
+ desc.fragno = 0;
+ desc.fraglen = 0;
+ desc.pages = pages;
+ desc.outbuf = buf;
+ desc.desc.info = desc.iv;
+ desc.desc.flags = 0;
+ desc.desc.tfm = aux_cipher;
+
+ sg_init_table(desc.infrags, 4);
+ sg_init_table(desc.outfrags, 4);
+
+ err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN,
+ cbcbytes, encryptor, &desc);
+ if (err)
+ goto out_err;
+ }
+
+ /* Make sure IV carries forward from any CBC results. */
+ err = gss_krb5_cts_crypt(cipher, buf,
+ offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes,
+ desc.iv, pages, 1);
+ if (err) {
+ err = GSS_S_FAILURE;
+ goto out_err;
+ }
+
+ /* Now update buf to account for HMAC */
+ buf->tail[0].iov_len += kctx->gk5e->cksumlength;
+ buf->len += kctx->gk5e->cksumlength;
+
+out_err:
+ if (err)
+ err = GSS_S_FAILURE;
+ return err;
+}
+
+u32
+gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
+ u32 *headskip, u32 *tailskip)
+{
+ struct xdr_buf subbuf;
+ u32 ret = 0;
+ u8 *cksum_key;
+ struct crypto_blkcipher *cipher, *aux_cipher;
+ struct xdr_netobj our_hmac_obj;
+ u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
+ u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
+ int nblocks, blocksize, cbcbytes;
+ struct decryptor_desc desc;
+ unsigned int usage;
+
+ if (kctx->initiate) {
+ cipher = kctx->acceptor_enc;
+ aux_cipher = kctx->acceptor_enc_aux;
+ cksum_key = kctx->acceptor_integ;
+ usage = KG_USAGE_ACCEPTOR_SEAL;
+ } else {
+ cipher = kctx->initiator_enc;
+ aux_cipher = kctx->initiator_enc_aux;
+ cksum_key = kctx->initiator_integ;
+ usage = KG_USAGE_INITIATOR_SEAL;
+ }
+ blocksize = crypto_blkcipher_blocksize(cipher);
+
+
+ /* create a segment skipping the header and leaving out the checksum */
+ xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN,
+ (buf->len - offset - GSS_KRB5_TOK_HDR_LEN -
+ kctx->gk5e->cksumlength));
+
+ nblocks = (subbuf.len + blocksize - 1) / blocksize;
+
+ cbcbytes = 0;
+ if (nblocks > 2)
+ cbcbytes = (nblocks - 2) * blocksize;
+
+ memset(desc.iv, 0, sizeof(desc.iv));
+
+ if (cbcbytes) {
+ desc.fragno = 0;
+ desc.fraglen = 0;
+ desc.desc.info = desc.iv;
+ desc.desc.flags = 0;
+ desc.desc.tfm = aux_cipher;
+
+ sg_init_table(desc.frags, 4);
+
+ ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc);
+ if (ret)
+ goto out_err;
+ }
+
+ /* Make sure IV carries forward from any CBC results. */
+ ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0);
+ if (ret)
+ goto out_err;
+
+
+ /* Calculate our hmac over the plaintext data */
+ our_hmac_obj.len = sizeof(our_hmac);
+ our_hmac_obj.data = our_hmac;
+
+ ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0,
+ cksum_key, usage, &our_hmac_obj);
+ if (ret)
+ goto out_err;
+
+ /* Get the packet's hmac value */
+ ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength,
+ pkt_hmac, kctx->gk5e->cksumlength);
+ if (ret)
+ goto out_err;
+
+ if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) {
+ ret = GSS_S_BAD_SIG;
+ goto out_err;
+ }
+ *headskip = kctx->gk5e->conflen;
+ *tailskip = kctx->gk5e->cksumlength;
+out_err:
+ if (ret && ret != GSS_S_BAD_SIG)
+ ret = GSS_S_FAILURE;
+ return ret;
+}
+
+/*
+ * Compute Kseq given the initial session key and the checksum.
+ * Set the key of the given cipher.
+ */
+int
+krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+ unsigned char *cksum)
+{
+ struct crypto_hash *hmac;
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ u8 Kseq[GSS_KRB5_MAX_KEYLEN];
+ u32 zeroconstant = 0;
+ int err;
+
+ dprintk("%s: entered\n", __func__);
+
+ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hmac)) {
+ dprintk("%s: error %ld, allocating hash '%s'\n",
+ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
+ return PTR_ERR(hmac);
+ }
+
+ desc.tfm = hmac;
+ desc.flags = 0;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out_err;
+
+ /* Compute intermediate Kseq from session key */
+ err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, &zeroconstant, 4);
+
+ err = crypto_hash_digest(&desc, sg, 4, Kseq);
+ if (err)
+ goto out_err;
+
+ /* Compute final Kseq from the checksum and intermediate Kseq */
+ err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ sg_set_buf(sg, cksum, 8);
+
+ err = crypto_hash_digest(&desc, sg, 8, Kseq);
+ if (err)
+ goto out_err;
+
+ err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ err = 0;
+
+out_err:
+ crypto_free_hash(hmac);
+ dprintk("%s: returning %d\n", __func__, err);
+ return err;
+}
+
+/*
+ * Compute Kcrypt given the initial session key and the plaintext seqnum.
+ * Set the key of cipher kctx->enc.
+ */
+int
+krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+ s32 seqnum)
+{
+ struct crypto_hash *hmac;
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
+ u8 zeroconstant[4] = {0};
+ u8 seqnumarray[4];
+ int err, i;
+
+ dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
+
+ hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hmac)) {
+ dprintk("%s: error %ld, allocating hash '%s'\n",
+ __func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
+ return PTR_ERR(hmac);
+ }
+
+ desc.tfm = hmac;
+ desc.flags = 0;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out_err;
+
+ /* Compute intermediate Kcrypt from session key */
+ for (i = 0; i < kctx->gk5e->keylength; i++)
+ Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
+
+ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, zeroconstant, 4);
+
+ err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+ if (err)
+ goto out_err;
+
+ /* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
+ err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff);
+ seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff);
+ seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
+ seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
+
+ sg_set_buf(sg, seqnumarray, 4);
+
+ err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+ if (err)
+ goto out_err;
+
+ err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
+ if (err)
+ goto out_err;
+
+ err = 0;
+
+out_err:
+ crypto_free_hash(hmac);
+ dprintk("%s: returning %d\n", __func__, err);
+ return err;
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
new file mode 100644
index 000000000..234fa8d0f
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -0,0 +1,327 @@
+/*
+ * COPYRIGHT (c) 2008
+ * The Regents of the University of Michigan
+ * ALL RIGHTS RESERVED
+ *
+ * Permission is granted to use, copy, create derivative works
+ * and redistribute this software and such derivative works
+ * for any purpose, so long as the name of The University of
+ * Michigan is not used in any advertising or publicity
+ * pertaining to the use of distribution of this software
+ * without specific, written prior authorization. If the
+ * above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any
+ * portion of this software, then the disclaimer below must
+ * also be included.
+ *
+ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION
+ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY
+ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF
+ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE
+ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR
+ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN
+ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGES.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/lcm.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+/*
+ * This is the n-fold function as described in rfc3961, sec 5.1
+ * Taken from MIT Kerberos and modified.
+ */
+
+static void krb5_nfold(u32 inbits, const u8 *in,
+ u32 outbits, u8 *out)
+{
+ unsigned long ulcm;
+ int byte, i, msbit;
+
+ /* the code below is more readable if I make these bytes
+ instead of bits */
+
+ inbits >>= 3;
+ outbits >>= 3;
+
+ /* first compute lcm(n,k) */
+ ulcm = lcm(inbits, outbits);
+
+ /* now do the real work */
+
+ memset(out, 0, outbits);
+ byte = 0;
+
+ /* this will end up cycling through k lcm(k,n)/k times, which
+ is correct */
+ for (i = ulcm-1; i >= 0; i--) {
+ /* compute the msbit in k which gets added into this byte */
+ msbit = (
+ /* first, start with the msbit in the first,
+ * unrotated byte */
+ ((inbits << 3) - 1)
+ /* then, for each byte, shift to the right
+ * for each repetition */
+ + (((inbits << 3) + 13) * (i/inbits))
+ /* last, pick out the correct byte within
+ * that shifted repetition */
+ + ((inbits - (i % inbits)) << 3)
+ ) % (inbits << 3);
+
+ /* pull out the byte value itself */
+ byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)|
+ (in[((inbits) - (msbit >> 3)) % inbits]))
+ >> ((msbit & 7) + 1)) & 0xff;
+
+ /* do the addition */
+ byte += out[i % outbits];
+ out[i % outbits] = byte & 0xff;
+
+ /* keep around the carry bit, if any */
+ byte >>= 8;
+
+ }
+
+ /* if there's a carry bit left over, add it back in */
+ if (byte) {
+ for (i = outbits - 1; i >= 0; i--) {
+ /* do the addition */
+ byte += out[i];
+ out[i] = byte & 0xff;
+
+ /* keep around the carry bit, if any */
+ byte >>= 8;
+ }
+ }
+}
+
+/*
+ * This is the DK (derive_key) function as described in rfc3961, sec 5.1
+ * Taken from MIT Kerberos and modified.
+ */
+
+u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
+ const struct xdr_netobj *inkey,
+ struct xdr_netobj *outkey,
+ const struct xdr_netobj *in_constant,
+ gfp_t gfp_mask)
+{
+ size_t blocksize, keybytes, keylength, n;
+ unsigned char *inblockdata, *outblockdata, *rawkey;
+ struct xdr_netobj inblock, outblock;
+ struct crypto_blkcipher *cipher;
+ u32 ret = EINVAL;
+
+ blocksize = gk5e->blocksize;
+ keybytes = gk5e->keybytes;
+ keylength = gk5e->keylength;
+
+ if ((inkey->len != keylength) || (outkey->len != keylength))
+ goto err_return;
+
+ cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cipher))
+ goto err_return;
+ if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len))
+ goto err_return;
+
+ /* allocate and set up buffers */
+
+ ret = ENOMEM;
+ inblockdata = kmalloc(blocksize, gfp_mask);
+ if (inblockdata == NULL)
+ goto err_free_cipher;
+
+ outblockdata = kmalloc(blocksize, gfp_mask);
+ if (outblockdata == NULL)
+ goto err_free_in;
+
+ rawkey = kmalloc(keybytes, gfp_mask);
+ if (rawkey == NULL)
+ goto err_free_out;
+
+ inblock.data = (char *) inblockdata;
+ inblock.len = blocksize;
+
+ outblock.data = (char *) outblockdata;
+ outblock.len = blocksize;
+
+ /* initialize the input block */
+
+ if (in_constant->len == inblock.len) {
+ memcpy(inblock.data, in_constant->data, inblock.len);
+ } else {
+ krb5_nfold(in_constant->len * 8, in_constant->data,
+ inblock.len * 8, inblock.data);
+ }
+
+ /* loop encrypting the blocks until enough key bytes are generated */
+
+ n = 0;
+ while (n < keybytes) {
+ (*(gk5e->encrypt))(cipher, NULL, inblock.data,
+ outblock.data, inblock.len);
+
+ if ((keybytes - n) <= outblock.len) {
+ memcpy(rawkey + n, outblock.data, (keybytes - n));
+ break;
+ }
+
+ memcpy(rawkey + n, outblock.data, outblock.len);
+ memcpy(inblock.data, outblock.data, outblock.len);
+ n += outblock.len;
+ }
+
+ /* postprocess the key */
+
+ inblock.data = (char *) rawkey;
+ inblock.len = keybytes;
+
+ BUG_ON(gk5e->mk_key == NULL);
+ ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey);
+ if (ret) {
+ dprintk("%s: got %d from mk_key function for '%s'\n",
+ __func__, ret, gk5e->encrypt_name);
+ goto err_free_raw;
+ }
+
+ /* clean memory, free resources and exit */
+
+ ret = 0;
+
+err_free_raw:
+ memset(rawkey, 0, keybytes);
+ kfree(rawkey);
+err_free_out:
+ memset(outblockdata, 0, blocksize);
+ kfree(outblockdata);
+err_free_in:
+ memset(inblockdata, 0, blocksize);
+ kfree(inblockdata);
+err_free_cipher:
+ crypto_free_blkcipher(cipher);
+err_return:
+ return ret;
+}
+
+#define smask(step) ((1<<step)-1)
+#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step)))
+#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1)
+
+static void mit_des_fixup_key_parity(u8 key[8])
+{
+ int i;
+ for (i = 0; i < 8; i++) {
+ key[i] &= 0xfe;
+ key[i] |= 1^parity_char(key[i]);
+ }
+}
+
+/*
+ * This is the des3 key derivation postprocess function
+ */
+u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e,
+ struct xdr_netobj *randombits,
+ struct xdr_netobj *key)
+{
+ int i;
+ u32 ret = EINVAL;
+
+ if (key->len != 24) {
+ dprintk("%s: key->len is %d\n", __func__, key->len);
+ goto err_out;
+ }
+ if (randombits->len != 21) {
+ dprintk("%s: randombits->len is %d\n",
+ __func__, randombits->len);
+ goto err_out;
+ }
+
+ /* take the seven bytes, move them around into the top 7 bits of the
+ 8 key bytes, then compute the parity bits. Do this three times. */
+
+ for (i = 0; i < 3; i++) {
+ memcpy(key->data + i*8, randombits->data + i*7, 7);
+ key->data[i*8+7] = (((key->data[i*8]&1)<<1) |
+ ((key->data[i*8+1]&1)<<2) |
+ ((key->data[i*8+2]&1)<<3) |
+ ((key->data[i*8+3]&1)<<4) |
+ ((key->data[i*8+4]&1)<<5) |
+ ((key->data[i*8+5]&1)<<6) |
+ ((key->data[i*8+6]&1)<<7));
+
+ mit_des_fixup_key_parity(key->data + i*8);
+ }
+ ret = 0;
+err_out:
+ return ret;
+}
+
+/*
+ * This is the aes key derivation postprocess function
+ */
+u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
+ struct xdr_netobj *randombits,
+ struct xdr_netobj *key)
+{
+ u32 ret = EINVAL;
+
+ if (key->len != 16 && key->len != 32) {
+ dprintk("%s: key->len is %d\n", __func__, key->len);
+ goto err_out;
+ }
+ if (randombits->len != 16 && randombits->len != 32) {
+ dprintk("%s: randombits->len is %d\n",
+ __func__, randombits->len);
+ goto err_out;
+ }
+ if (randombits->len != key->len) {
+ dprintk("%s: randombits->len is %d, key->len is %d\n",
+ __func__, randombits->len, key->len);
+ goto err_out;
+ }
+ memcpy(key->data, randombits->data, key->len);
+ ret = 0;
+err_out:
+ return ret;
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
new file mode 100644
index 000000000..28db442a0
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -0,0 +1,788 @@
+/*
+ * linux/net/sunrpc/gss_krb5_mech.c
+ *
+ * Copyright (c) 2001-2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * J. Bruce Fields <bfields@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/crypto.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static struct gss_api_mech gss_kerberos_mech; /* forward declaration */
+
+static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
+ /*
+ * DES (All DES enctypes are mapped to the same gss functionality)
+ */
+ {
+ .etype = ENCTYPE_DES_CBC_RAW,
+ .ctype = CKSUMTYPE_RSA_MD5,
+ .name = "des-cbc-crc",
+ .encrypt_name = "cbc(des)",
+ .cksum_name = "md5",
+ .encrypt = krb5_encrypt,
+ .decrypt = krb5_decrypt,
+ .mk_key = NULL,
+ .signalg = SGN_ALG_DES_MAC_MD5,
+ .sealalg = SEAL_ALG_DES,
+ .keybytes = 7,
+ .keylength = 8,
+ .blocksize = 8,
+ .conflen = 8,
+ .cksumlength = 8,
+ .keyed_cksum = 0,
+ },
+ /*
+ * RC4-HMAC
+ */
+ {
+ .etype = ENCTYPE_ARCFOUR_HMAC,
+ .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR,
+ .name = "rc4-hmac",
+ .encrypt_name = "ecb(arc4)",
+ .cksum_name = "hmac(md5)",
+ .encrypt = krb5_encrypt,
+ .decrypt = krb5_decrypt,
+ .mk_key = NULL,
+ .signalg = SGN_ALG_HMAC_MD5,
+ .sealalg = SEAL_ALG_MICROSOFT_RC4,
+ .keybytes = 16,
+ .keylength = 16,
+ .blocksize = 1,
+ .conflen = 8,
+ .cksumlength = 8,
+ .keyed_cksum = 1,
+ },
+ /*
+ * 3DES
+ */
+ {
+ .etype = ENCTYPE_DES3_CBC_RAW,
+ .ctype = CKSUMTYPE_HMAC_SHA1_DES3,
+ .name = "des3-hmac-sha1",
+ .encrypt_name = "cbc(des3_ede)",
+ .cksum_name = "hmac(sha1)",
+ .encrypt = krb5_encrypt,
+ .decrypt = krb5_decrypt,
+ .mk_key = gss_krb5_des3_make_key,
+ .signalg = SGN_ALG_HMAC_SHA1_DES3_KD,
+ .sealalg = SEAL_ALG_DES3KD,
+ .keybytes = 21,
+ .keylength = 24,
+ .blocksize = 8,
+ .conflen = 8,
+ .cksumlength = 20,
+ .keyed_cksum = 1,
+ },
+ /*
+ * AES128
+ */
+ {
+ .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96,
+ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128,
+ .name = "aes128-cts",
+ .encrypt_name = "cts(cbc(aes))",
+ .cksum_name = "hmac(sha1)",
+ .encrypt = krb5_encrypt,
+ .decrypt = krb5_decrypt,
+ .mk_key = gss_krb5_aes_make_key,
+ .encrypt_v2 = gss_krb5_aes_encrypt,
+ .decrypt_v2 = gss_krb5_aes_decrypt,
+ .signalg = -1,
+ .sealalg = -1,
+ .keybytes = 16,
+ .keylength = 16,
+ .blocksize = 16,
+ .conflen = 16,
+ .cksumlength = 12,
+ .keyed_cksum = 1,
+ },
+ /*
+ * AES256
+ */
+ {
+ .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96,
+ .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256,
+ .name = "aes256-cts",
+ .encrypt_name = "cts(cbc(aes))",
+ .cksum_name = "hmac(sha1)",
+ .encrypt = krb5_encrypt,
+ .decrypt = krb5_decrypt,
+ .mk_key = gss_krb5_aes_make_key,
+ .encrypt_v2 = gss_krb5_aes_encrypt,
+ .decrypt_v2 = gss_krb5_aes_decrypt,
+ .signalg = -1,
+ .sealalg = -1,
+ .keybytes = 32,
+ .keylength = 32,
+ .blocksize = 16,
+ .conflen = 16,
+ .cksumlength = 12,
+ .keyed_cksum = 1,
+ },
+};
+
+static const int num_supported_enctypes =
+ ARRAY_SIZE(supported_gss_krb5_enctypes);
+
+static int
+supported_gss_krb5_enctype(int etype)
+{
+ int i;
+ for (i = 0; i < num_supported_enctypes; i++)
+ if (supported_gss_krb5_enctypes[i].etype == etype)
+ return 1;
+ return 0;
+}
+
+static const struct gss_krb5_enctype *
+get_gss_krb5_enctype(int etype)
+{
+ int i;
+ for (i = 0; i < num_supported_enctypes; i++)
+ if (supported_gss_krb5_enctypes[i].etype == etype)
+ return &supported_gss_krb5_enctypes[i];
+ return NULL;
+}
+
+static const void *
+simple_get_bytes(const void *p, const void *end, void *res, int len)
+{
+ const void *q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ memcpy(res, p, len);
+ return q;
+}
+
+static const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
+{
+ const void *q;
+ unsigned int len;
+
+ p = simple_get_bytes(p, end, &len, sizeof(len));
+ if (IS_ERR(p))
+ return p;
+ q = (const void *)((const char *)p + len);
+ if (unlikely(q > end || q < p))
+ return ERR_PTR(-EFAULT);
+ res->data = kmemdup(p, len, GFP_NOFS);
+ if (unlikely(res->data == NULL))
+ return ERR_PTR(-ENOMEM);
+ res->len = len;
+ return q;
+}
+
+static inline const void *
+get_key(const void *p, const void *end,
+ struct krb5_ctx *ctx, struct crypto_blkcipher **res)
+{
+ struct xdr_netobj key;
+ int alg;
+
+ p = simple_get_bytes(p, end, &alg, sizeof(alg));
+ if (IS_ERR(p))
+ goto out_err;
+
+ switch (alg) {
+ case ENCTYPE_DES_CBC_CRC:
+ case ENCTYPE_DES_CBC_MD4:
+ case ENCTYPE_DES_CBC_MD5:
+ /* Map all these key types to ENCTYPE_DES_CBC_RAW */
+ alg = ENCTYPE_DES_CBC_RAW;
+ break;
+ }
+
+ if (!supported_gss_krb5_enctype(alg)) {
+ printk(KERN_WARNING "gss_kerberos_mech: unsupported "
+ "encryption key algorithm %d\n", alg);
+ p = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+ p = simple_get_netobj(p, end, &key);
+ if (IS_ERR(p))
+ goto out_err;
+
+ *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*res)) {
+ printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
+ "crypto algorithm %s\n", ctx->gk5e->encrypt_name);
+ *res = NULL;
+ goto out_err_free_key;
+ }
+ if (crypto_blkcipher_setkey(*res, key.data, key.len)) {
+ printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
+ "crypto algorithm %s\n", ctx->gk5e->encrypt_name);
+ goto out_err_free_tfm;
+ }
+
+ kfree(key.data);
+ return p;
+
+out_err_free_tfm:
+ crypto_free_blkcipher(*res);
+out_err_free_key:
+ kfree(key.data);
+ p = ERR_PTR(-EINVAL);
+out_err:
+ return p;
+}
+
+static int
+gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
+{
+ int tmp;
+
+ p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate));
+ if (IS_ERR(p))
+ goto out_err;
+
+ /* Old format supports only DES! Any other enctype uses new format */
+ ctx->enctype = ENCTYPE_DES_CBC_RAW;
+
+ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
+ if (ctx->gk5e == NULL) {
+ p = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+
+ /* The downcall format was designed before we completely understood
+ * the uses of the context fields; so it includes some stuff we
+ * just give some minimal sanity-checking, and some we ignore
+ * completely (like the next twenty bytes): */
+ if (unlikely(p + 20 > end || p + 20 < p)) {
+ p = ERR_PTR(-EFAULT);
+ goto out_err;
+ }
+ p += 20;
+ p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
+ if (IS_ERR(p))
+ goto out_err;
+ if (tmp != SGN_ALG_DES_MAC_MD5) {
+ p = ERR_PTR(-ENOSYS);
+ goto out_err;
+ }
+ p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
+ if (IS_ERR(p))
+ goto out_err;
+ if (tmp != SEAL_ALG_DES) {
+ p = ERR_PTR(-ENOSYS);
+ goto out_err;
+ }
+ p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ if (IS_ERR(p))
+ goto out_err;
+ p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send));
+ if (IS_ERR(p))
+ goto out_err;
+ p = simple_get_netobj(p, end, &ctx->mech_used);
+ if (IS_ERR(p))
+ goto out_err;
+ p = get_key(p, end, ctx, &ctx->enc);
+ if (IS_ERR(p))
+ goto out_err_free_mech;
+ p = get_key(p, end, ctx, &ctx->seq);
+ if (IS_ERR(p))
+ goto out_err_free_key1;
+ if (p != end) {
+ p = ERR_PTR(-EFAULT);
+ goto out_err_free_key2;
+ }
+
+ return 0;
+
+out_err_free_key2:
+ crypto_free_blkcipher(ctx->seq);
+out_err_free_key1:
+ crypto_free_blkcipher(ctx->enc);
+out_err_free_mech:
+ kfree(ctx->mech_used.data);
+out_err:
+ return PTR_ERR(p);
+}
+
+static struct crypto_blkcipher *
+context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
+{
+ struct crypto_blkcipher *cp;
+
+ cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cp)) {
+ dprintk("gss_kerberos_mech: unable to initialize "
+ "crypto algorithm %s\n", cname);
+ return NULL;
+ }
+ if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) {
+ dprintk("gss_kerberos_mech: error setting key for "
+ "crypto algorithm %s\n", cname);
+ crypto_free_blkcipher(cp);
+ return NULL;
+ }
+ return cp;
+}
+
+static inline void
+set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed)
+{
+ cdata[0] = (usage>>24)&0xff;
+ cdata[1] = (usage>>16)&0xff;
+ cdata[2] = (usage>>8)&0xff;
+ cdata[3] = usage&0xff;
+ cdata[4] = seed;
+}
+
+static int
+context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask)
+{
+ struct xdr_netobj c, keyin, keyout;
+ u8 cdata[GSS_KRB5_K5CLENGTH];
+ u32 err;
+
+ c.len = GSS_KRB5_K5CLENGTH;
+ c.data = cdata;
+
+ keyin.data = ctx->Ksess;
+ keyin.len = ctx->gk5e->keylength;
+ keyout.len = ctx->gk5e->keylength;
+
+ /* seq uses the raw key */
+ ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name,
+ ctx->Ksess);
+ if (ctx->seq == NULL)
+ goto out_err;
+
+ ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name,
+ ctx->Ksess);
+ if (ctx->enc == NULL)
+ goto out_free_seq;
+
+ /* derive cksum */
+ set_cdata(cdata, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM);
+ keyout.data = ctx->cksum;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving cksum key\n",
+ __func__, err);
+ goto out_free_enc;
+ }
+
+ return 0;
+
+out_free_enc:
+ crypto_free_blkcipher(ctx->enc);
+out_free_seq:
+ crypto_free_blkcipher(ctx->seq);
+out_err:
+ return -EINVAL;
+}
+
+/*
+ * Note that RC4 depends on deriving keys using the sequence
+ * number or the checksum of a token. Therefore, the final keys
+ * cannot be calculated until the token is being constructed!
+ */
+static int
+context_derive_keys_rc4(struct krb5_ctx *ctx)
+{
+ struct crypto_hash *hmac;
+ char sigkeyconstant[] = "signaturekey";
+ int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
+ struct hash_desc desc;
+ struct scatterlist sg[1];
+ int err;
+
+ dprintk("RPC: %s: entered\n", __func__);
+ /*
+ * derive cksum (aka Ksign) key
+ */
+ hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hmac)) {
+ dprintk("%s: error %ld allocating hash '%s'\n",
+ __func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
+ err = PTR_ERR(hmac);
+ goto out_err;
+ }
+
+ err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
+ if (err)
+ goto out_err_free_hmac;
+
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, sigkeyconstant, slen);
+
+ desc.tfm = hmac;
+ desc.flags = 0;
+
+ err = crypto_hash_init(&desc);
+ if (err)
+ goto out_err_free_hmac;
+
+ err = crypto_hash_digest(&desc, sg, slen, ctx->cksum);
+ if (err)
+ goto out_err_free_hmac;
+ /*
+ * allocate hash, and blkciphers for data and seqnum encryption
+ */
+ ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ctx->enc)) {
+ err = PTR_ERR(ctx->enc);
+ goto out_err_free_hmac;
+ }
+
+ ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(ctx->seq)) {
+ crypto_free_blkcipher(ctx->enc);
+ err = PTR_ERR(ctx->seq);
+ goto out_err_free_hmac;
+ }
+
+ dprintk("RPC: %s: returning success\n", __func__);
+
+ err = 0;
+
+out_err_free_hmac:
+ crypto_free_hash(hmac);
+out_err:
+ dprintk("RPC: %s: returning %d\n", __func__, err);
+ return err;
+}
+
+static int
+context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
+{
+ struct xdr_netobj c, keyin, keyout;
+ u8 cdata[GSS_KRB5_K5CLENGTH];
+ u32 err;
+
+ c.len = GSS_KRB5_K5CLENGTH;
+ c.data = cdata;
+
+ keyin.data = ctx->Ksess;
+ keyin.len = ctx->gk5e->keylength;
+ keyout.len = ctx->gk5e->keylength;
+
+ /* initiator seal encryption */
+ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_ENCRYPTION);
+ keyout.data = ctx->initiator_seal;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving initiator_seal key\n",
+ __func__, err);
+ goto out_err;
+ }
+ ctx->initiator_enc = context_v2_alloc_cipher(ctx,
+ ctx->gk5e->encrypt_name,
+ ctx->initiator_seal);
+ if (ctx->initiator_enc == NULL)
+ goto out_err;
+
+ /* acceptor seal encryption */
+ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_ENCRYPTION);
+ keyout.data = ctx->acceptor_seal;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving acceptor_seal key\n",
+ __func__, err);
+ goto out_free_initiator_enc;
+ }
+ ctx->acceptor_enc = context_v2_alloc_cipher(ctx,
+ ctx->gk5e->encrypt_name,
+ ctx->acceptor_seal);
+ if (ctx->acceptor_enc == NULL)
+ goto out_free_initiator_enc;
+
+ /* initiator sign checksum */
+ set_cdata(cdata, KG_USAGE_INITIATOR_SIGN, KEY_USAGE_SEED_CHECKSUM);
+ keyout.data = ctx->initiator_sign;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving initiator_sign key\n",
+ __func__, err);
+ goto out_free_acceptor_enc;
+ }
+
+ /* acceptor sign checksum */
+ set_cdata(cdata, KG_USAGE_ACCEPTOR_SIGN, KEY_USAGE_SEED_CHECKSUM);
+ keyout.data = ctx->acceptor_sign;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving acceptor_sign key\n",
+ __func__, err);
+ goto out_free_acceptor_enc;
+ }
+
+ /* initiator seal integrity */
+ set_cdata(cdata, KG_USAGE_INITIATOR_SEAL, KEY_USAGE_SEED_INTEGRITY);
+ keyout.data = ctx->initiator_integ;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving initiator_integ key\n",
+ __func__, err);
+ goto out_free_acceptor_enc;
+ }
+
+ /* acceptor seal integrity */
+ set_cdata(cdata, KG_USAGE_ACCEPTOR_SEAL, KEY_USAGE_SEED_INTEGRITY);
+ keyout.data = ctx->acceptor_integ;
+ err = krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+ if (err) {
+ dprintk("%s: Error %d deriving acceptor_integ key\n",
+ __func__, err);
+ goto out_free_acceptor_enc;
+ }
+
+ switch (ctx->enctype) {
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ ctx->initiator_enc_aux =
+ context_v2_alloc_cipher(ctx, "cbc(aes)",
+ ctx->initiator_seal);
+ if (ctx->initiator_enc_aux == NULL)
+ goto out_free_acceptor_enc;
+ ctx->acceptor_enc_aux =
+ context_v2_alloc_cipher(ctx, "cbc(aes)",
+ ctx->acceptor_seal);
+ if (ctx->acceptor_enc_aux == NULL) {
+ crypto_free_blkcipher(ctx->initiator_enc_aux);
+ goto out_free_acceptor_enc;
+ }
+ }
+
+ return 0;
+
+out_free_acceptor_enc:
+ crypto_free_blkcipher(ctx->acceptor_enc);
+out_free_initiator_enc:
+ crypto_free_blkcipher(ctx->initiator_enc);
+out_err:
+ return -EINVAL;
+}
+
+static int
+gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
+ gfp_t gfp_mask)
+{
+ int keylen;
+
+ p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
+ if (IS_ERR(p))
+ goto out_err;
+ ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR;
+
+ p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ if (IS_ERR(p))
+ goto out_err;
+ p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64));
+ if (IS_ERR(p))
+ goto out_err;
+ /* set seq_send for use by "older" enctypes */
+ ctx->seq_send = ctx->seq_send64;
+ if (ctx->seq_send64 != ctx->seq_send) {
+ dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
+ (unsigned long)ctx->seq_send64, ctx->seq_send);
+ p = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+ p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
+ if (IS_ERR(p))
+ goto out_err;
+ /* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */
+ if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1)
+ ctx->enctype = ENCTYPE_DES3_CBC_RAW;
+ ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
+ if (ctx->gk5e == NULL) {
+ dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n",
+ ctx->enctype);
+ p = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+ keylen = ctx->gk5e->keylength;
+
+ p = simple_get_bytes(p, end, ctx->Ksess, keylen);
+ if (IS_ERR(p))
+ goto out_err;
+
+ if (p != end) {
+ p = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+
+ ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data,
+ gss_kerberos_mech.gm_oid.len, gfp_mask);
+ if (unlikely(ctx->mech_used.data == NULL)) {
+ p = ERR_PTR(-ENOMEM);
+ goto out_err;
+ }
+ ctx->mech_used.len = gss_kerberos_mech.gm_oid.len;
+
+ switch (ctx->enctype) {
+ case ENCTYPE_DES3_CBC_RAW:
+ return context_derive_keys_des3(ctx, gfp_mask);
+ case ENCTYPE_ARCFOUR_HMAC:
+ return context_derive_keys_rc4(ctx);
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ return context_derive_keys_new(ctx, gfp_mask);
+ default:
+ return -EINVAL;
+ }
+
+out_err:
+ return PTR_ERR(p);
+}
+
+static int
+gss_import_sec_context_kerberos(const void *p, size_t len,
+ struct gss_ctx *ctx_id,
+ time_t *endtime,
+ gfp_t gfp_mask)
+{
+ const void *end = (const void *)((const char *)p + len);
+ struct krb5_ctx *ctx;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), gfp_mask);
+ if (ctx == NULL)
+ return -ENOMEM;
+
+ if (len == 85)
+ ret = gss_import_v1_context(p, end, ctx);
+ else
+ ret = gss_import_v2_context(p, end, ctx, gfp_mask);
+
+ if (ret == 0) {
+ ctx_id->internal_ctx_id = ctx;
+ if (endtime)
+ *endtime = ctx->endtime;
+ } else
+ kfree(ctx);
+
+ dprintk("RPC: %s: returning %d\n", __func__, ret);
+ return ret;
+}
+
+static void
+gss_delete_sec_context_kerberos(void *internal_ctx) {
+ struct krb5_ctx *kctx = internal_ctx;
+
+ crypto_free_blkcipher(kctx->seq);
+ crypto_free_blkcipher(kctx->enc);
+ crypto_free_blkcipher(kctx->acceptor_enc);
+ crypto_free_blkcipher(kctx->initiator_enc);
+ crypto_free_blkcipher(kctx->acceptor_enc_aux);
+ crypto_free_blkcipher(kctx->initiator_enc_aux);
+ kfree(kctx->mech_used.data);
+ kfree(kctx);
+}
+
+static const struct gss_api_ops gss_kerberos_ops = {
+ .gss_import_sec_context = gss_import_sec_context_kerberos,
+ .gss_get_mic = gss_get_mic_kerberos,
+ .gss_verify_mic = gss_verify_mic_kerberos,
+ .gss_wrap = gss_wrap_kerberos,
+ .gss_unwrap = gss_unwrap_kerberos,
+ .gss_delete_sec_context = gss_delete_sec_context_kerberos,
+};
+
+static struct pf_desc gss_kerberos_pfs[] = {
+ [0] = {
+ .pseudoflavor = RPC_AUTH_GSS_KRB5,
+ .qop = GSS_C_QOP_DEFAULT,
+ .service = RPC_GSS_SVC_NONE,
+ .name = "krb5",
+ },
+ [1] = {
+ .pseudoflavor = RPC_AUTH_GSS_KRB5I,
+ .qop = GSS_C_QOP_DEFAULT,
+ .service = RPC_GSS_SVC_INTEGRITY,
+ .name = "krb5i",
+ },
+ [2] = {
+ .pseudoflavor = RPC_AUTH_GSS_KRB5P,
+ .qop = GSS_C_QOP_DEFAULT,
+ .service = RPC_GSS_SVC_PRIVACY,
+ .name = "krb5p",
+ },
+};
+
+MODULE_ALIAS("rpc-auth-gss-krb5");
+MODULE_ALIAS("rpc-auth-gss-krb5i");
+MODULE_ALIAS("rpc-auth-gss-krb5p");
+MODULE_ALIAS("rpc-auth-gss-390003");
+MODULE_ALIAS("rpc-auth-gss-390004");
+MODULE_ALIAS("rpc-auth-gss-390005");
+MODULE_ALIAS("rpc-auth-gss-1.2.840.113554.1.2.2");
+
+static struct gss_api_mech gss_kerberos_mech = {
+ .gm_name = "krb5",
+ .gm_owner = THIS_MODULE,
+ .gm_oid = { 9, "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" },
+ .gm_ops = &gss_kerberos_ops,
+ .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs),
+ .gm_pfs = gss_kerberos_pfs,
+ .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES,
+};
+
+static int __init init_kerberos_module(void)
+{
+ int status;
+
+ status = gss_mech_register(&gss_kerberos_mech);
+ if (status)
+ printk("Failed to register kerberos gss mechanism!\n");
+ return status;
+}
+
+static void __exit cleanup_kerberos_module(void)
+{
+ gss_mech_unregister(&gss_kerberos_mech);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_kerberos_module);
+module_exit(cleanup_kerberos_module);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
new file mode 100644
index 000000000..1d74d653e
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -0,0 +1,229 @@
+/*
+ * linux/net/sunrpc/gss_krb5_seal.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c
+ *
+ * Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * J. Bruce Fields <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+DEFINE_SPINLOCK(krb5_seq_lock);
+
+static void *
+setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
+{
+ u16 *ptr;
+ void *krb5_hdr;
+ int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
+
+ token->len = g_token_size(&ctx->mech_used, body_size);
+
+ ptr = (u16 *)token->data;
+ g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
+
+ /* ptr now at start of header described in rfc 1964, section 1.2.1: */
+ krb5_hdr = ptr;
+ *ptr++ = KG_TOK_MIC_MSG;
+ /*
+ * signalg is stored as if it were converted from LE to host endian, even
+ * though it's an opaque pair of bytes according to the RFC.
+ */
+ *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
+ *ptr++ = SEAL_ALG_NONE;
+ *ptr = 0xffff;
+
+ return krb5_hdr;
+}
+
+static void *
+setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
+{
+ u16 *ptr;
+ void *krb5_hdr;
+ u8 *p, flags = 0x00;
+
+ if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
+ flags |= 0x01;
+ if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+ flags |= 0x04;
+
+ /* Per rfc 4121, sec 4.2.6.1, there is no header,
+ * just start the token */
+ krb5_hdr = ptr = (u16 *)token->data;
+
+ *ptr++ = KG2_TOK_MIC;
+ p = (u8 *)ptr;
+ *p++ = flags;
+ *p++ = 0xff;
+ ptr = (u16 *)p;
+ *ptr++ = 0xffff;
+ *ptr = 0xffff;
+
+ token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
+ return krb5_hdr;
+}
+
+static u32
+gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
+ struct xdr_netobj *token)
+{
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
+ .data = cksumdata};
+ void *ptr;
+ s32 now;
+ u32 seq_send;
+ u8 *cksumkey;
+
+ dprintk("RPC: %s\n", __func__);
+ BUG_ON(ctx == NULL);
+
+ now = get_seconds();
+
+ ptr = setup_token(ctx, token);
+
+ if (ctx->gk5e->keyed_cksum)
+ cksumkey = ctx->cksum;
+ else
+ cksumkey = NULL;
+
+ if (make_checksum(ctx, ptr, 8, text, 0, cksumkey,
+ KG_USAGE_SIGN, &md5cksum))
+ return GSS_S_FAILURE;
+
+ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
+
+ spin_lock(&krb5_seq_lock);
+ seq_send = ctx->seq_send++;
+ spin_unlock(&krb5_seq_lock);
+
+ if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff,
+ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8))
+ return GSS_S_FAILURE;
+
+ return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+static u32
+gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
+ struct xdr_netobj *token)
+{
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj cksumobj = { .len = sizeof(cksumdata),
+ .data = cksumdata};
+ void *krb5_hdr;
+ s32 now;
+ u64 seq_send;
+ u8 *cksumkey;
+ unsigned int cksum_usage;
+
+ dprintk("RPC: %s\n", __func__);
+
+ krb5_hdr = setup_token_v2(ctx, token);
+
+ /* Set up the sequence number. Now 64-bits in clear
+ * text and w/o direction indicator */
+ spin_lock(&krb5_seq_lock);
+ seq_send = ctx->seq_send64++;
+ spin_unlock(&krb5_seq_lock);
+ *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+
+ if (ctx->initiate) {
+ cksumkey = ctx->initiator_sign;
+ cksum_usage = KG_USAGE_INITIATOR_SIGN;
+ } else {
+ cksumkey = ctx->acceptor_sign;
+ cksum_usage = KG_USAGE_ACCEPTOR_SIGN;
+ }
+
+ if (make_checksum_v2(ctx, krb5_hdr, GSS_KRB5_TOK_HDR_LEN,
+ text, 0, cksumkey, cksum_usage, &cksumobj))
+ return GSS_S_FAILURE;
+
+ memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len);
+
+ now = get_seconds();
+
+ return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+u32
+gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
+ struct xdr_netobj *token)
+{
+ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
+
+ switch (ctx->enctype) {
+ default:
+ BUG();
+ case ENCTYPE_DES_CBC_RAW:
+ case ENCTYPE_DES3_CBC_RAW:
+ case ENCTYPE_ARCFOUR_HMAC:
+ return gss_get_mic_v1(ctx, text, token);
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ return gss_get_mic_v2(ctx, text, token);
+ }
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
new file mode 100644
index 000000000..20d55c793
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -0,0 +1,166 @@
+/*
+ * linux/net/sunrpc/gss_krb5_seqnum.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/crypto.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static s32
+krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
+ unsigned char *cksum, unsigned char *buf)
+{
+ struct crypto_blkcipher *cipher;
+ unsigned char plain[8];
+ s32 code;
+
+ dprintk("RPC: %s:\n", __func__);
+ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cipher))
+ return PTR_ERR(cipher);
+
+ plain[0] = (unsigned char) ((seqnum >> 24) & 0xff);
+ plain[1] = (unsigned char) ((seqnum >> 16) & 0xff);
+ plain[2] = (unsigned char) ((seqnum >> 8) & 0xff);
+ plain[3] = (unsigned char) ((seqnum >> 0) & 0xff);
+ plain[4] = direction;
+ plain[5] = direction;
+ plain[6] = direction;
+ plain[7] = direction;
+
+ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
+ if (code)
+ goto out;
+
+ code = krb5_encrypt(cipher, cksum, plain, buf, 8);
+out:
+ crypto_free_blkcipher(cipher);
+ return code;
+}
+s32
+krb5_make_seq_num(struct krb5_ctx *kctx,
+ struct crypto_blkcipher *key,
+ int direction,
+ u32 seqnum,
+ unsigned char *cksum, unsigned char *buf)
+{
+ unsigned char plain[8];
+
+ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
+ return krb5_make_rc4_seq_num(kctx, direction, seqnum,
+ cksum, buf);
+
+ plain[0] = (unsigned char) (seqnum & 0xff);
+ plain[1] = (unsigned char) ((seqnum >> 8) & 0xff);
+ plain[2] = (unsigned char) ((seqnum >> 16) & 0xff);
+ plain[3] = (unsigned char) ((seqnum >> 24) & 0xff);
+
+ plain[4] = direction;
+ plain[5] = direction;
+ plain[6] = direction;
+ plain[7] = direction;
+
+ return krb5_encrypt(key, cksum, plain, buf, 8);
+}
+
+static s32
+krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
+ unsigned char *buf, int *direction, s32 *seqnum)
+{
+ struct crypto_blkcipher *cipher;
+ unsigned char plain[8];
+ s32 code;
+
+ dprintk("RPC: %s:\n", __func__);
+ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cipher))
+ return PTR_ERR(cipher);
+
+ code = krb5_rc4_setup_seq_key(kctx, cipher, cksum);
+ if (code)
+ goto out;
+
+ code = krb5_decrypt(cipher, cksum, buf, plain, 8);
+ if (code)
+ goto out;
+
+ if ((plain[4] != plain[5]) || (plain[4] != plain[6])
+ || (plain[4] != plain[7])) {
+ code = (s32)KG_BAD_SEQ;
+ goto out;
+ }
+
+ *direction = plain[4];
+
+ *seqnum = ((plain[0] << 24) | (plain[1] << 16) |
+ (plain[2] << 8) | (plain[3]));
+out:
+ crypto_free_blkcipher(cipher);
+ return code;
+}
+
+s32
+krb5_get_seq_num(struct krb5_ctx *kctx,
+ unsigned char *cksum,
+ unsigned char *buf,
+ int *direction, u32 *seqnum)
+{
+ s32 code;
+ unsigned char plain[8];
+ struct crypto_blkcipher *key = kctx->seq;
+
+ dprintk("RPC: krb5_get_seq_num:\n");
+
+ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
+ return krb5_get_rc4_seq_num(kctx, cksum, buf,
+ direction, seqnum);
+
+ if ((code = krb5_decrypt(key, cksum, buf, plain, 8)))
+ return code;
+
+ if ((plain[4] != plain[5]) || (plain[4] != plain[6]) ||
+ (plain[4] != plain[7]))
+ return (s32)KG_BAD_SEQ;
+
+ *direction = plain[4];
+
+ *seqnum = ((plain[0]) |
+ (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
+
+ return 0;
+}
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
new file mode 100644
index 000000000..dcf9515d9
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -0,0 +1,226 @@
+/*
+ * linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c
+ *
+ * Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/crypto.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+
+/* read_token is a mic token, and message_buffer is the data that the mic was
+ * supposedly taken over. */
+
+static u32
+gss_verify_mic_v1(struct krb5_ctx *ctx,
+ struct xdr_buf *message_buffer, struct xdr_netobj *read_token)
+{
+ int signalg;
+ int sealalg;
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
+ .data = cksumdata};
+ s32 now;
+ int direction;
+ u32 seqnum;
+ unsigned char *ptr = (unsigned char *)read_token->data;
+ int bodysize;
+ u8 *cksumkey;
+
+ dprintk("RPC: krb5_read_token\n");
+
+ if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+ read_token->len))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) ||
+ (ptr[1] != (KG_TOK_MIC_MSG & 0xff)))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ /* XXX sanity-check bodysize?? */
+
+ signalg = ptr[2] + (ptr[3] << 8);
+ if (signalg != ctx->gk5e->signalg)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ sealalg = ptr[4] + (ptr[5] << 8);
+ if (sealalg != SEAL_ALG_NONE)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if (ctx->gk5e->keyed_cksum)
+ cksumkey = ctx->cksum;
+ else
+ cksumkey = NULL;
+
+ if (make_checksum(ctx, ptr, 8, message_buffer, 0,
+ cksumkey, KG_USAGE_SIGN, &md5cksum))
+ return GSS_S_FAILURE;
+
+ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ctx->gk5e->cksumlength))
+ return GSS_S_BAD_SIG;
+
+ /* it got through unscathed. Make sure the context is unexpired */
+
+ now = get_seconds();
+
+ if (now > ctx->endtime)
+ return GSS_S_CONTEXT_EXPIRED;
+
+ /* do sequencing checks */
+
+ if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8,
+ &direction, &seqnum))
+ return GSS_S_FAILURE;
+
+ if ((ctx->initiate && direction != 0xff) ||
+ (!ctx->initiate && direction != 0))
+ return GSS_S_BAD_SIG;
+
+ return GSS_S_COMPLETE;
+}
+
+static u32
+gss_verify_mic_v2(struct krb5_ctx *ctx,
+ struct xdr_buf *message_buffer, struct xdr_netobj *read_token)
+{
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj cksumobj = {.len = sizeof(cksumdata),
+ .data = cksumdata};
+ s32 now;
+ u8 *ptr = read_token->data;
+ u8 *cksumkey;
+ u8 flags;
+ int i;
+ unsigned int cksum_usage;
+
+ dprintk("RPC: %s\n", __func__);
+
+ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ flags = ptr[2];
+ if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) ||
+ (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)))
+ return GSS_S_BAD_SIG;
+
+ if (flags & KG2_TOKEN_FLAG_SEALED) {
+ dprintk("%s: token has unexpected sealed flag\n", __func__);
+ return GSS_S_FAILURE;
+ }
+
+ for (i = 3; i < 8; i++)
+ if (ptr[i] != 0xff)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if (ctx->initiate) {
+ cksumkey = ctx->acceptor_sign;
+ cksum_usage = KG_USAGE_ACCEPTOR_SIGN;
+ } else {
+ cksumkey = ctx->initiator_sign;
+ cksum_usage = KG_USAGE_INITIATOR_SIGN;
+ }
+
+ if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0,
+ cksumkey, cksum_usage, &cksumobj))
+ return GSS_S_FAILURE;
+
+ if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ctx->gk5e->cksumlength))
+ return GSS_S_BAD_SIG;
+
+ /* it got through unscathed. Make sure the context is unexpired */
+ now = get_seconds();
+ if (now > ctx->endtime)
+ return GSS_S_CONTEXT_EXPIRED;
+
+ /*
+ * NOTE: the sequence number at ptr + 8 is skipped, rpcsec_gss
+ * doesn't want it checked; see page 6 of rfc 2203.
+ */
+
+ return GSS_S_COMPLETE;
+}
+
+u32
+gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
+ struct xdr_buf *message_buffer,
+ struct xdr_netobj *read_token)
+{
+ struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
+
+ switch (ctx->enctype) {
+ default:
+ BUG();
+ case ENCTYPE_DES_CBC_RAW:
+ case ENCTYPE_DES3_CBC_RAW:
+ case ENCTYPE_ARCFOUR_HMAC:
+ return gss_verify_mic_v1(ctx, message_buffer, read_token);
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ return gss_verify_mic_v2(ctx, message_buffer, read_token);
+ }
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
new file mode 100644
index 000000000..ca7e92a32
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -0,0 +1,626 @@
+/*
+ * COPYRIGHT (c) 2008
+ * The Regents of the University of Michigan
+ * ALL RIGHTS RESERVED
+ *
+ * Permission is granted to use, copy, create derivative works
+ * and redistribute this software and such derivative works
+ * for any purpose, so long as the name of The University of
+ * Michigan is not used in any advertising or publicity
+ * pertaining to the use of distribution of this software
+ * without specific, written prior authorization. If the
+ * above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any
+ * portion of this software, then the disclaimer below must
+ * also be included.
+ *
+ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION
+ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY
+ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF
+ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE
+ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR
+ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN
+ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGES.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/random.h>
+#include <linux/pagemap.h>
+#include <linux/crypto.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static inline int
+gss_krb5_padding(int blocksize, int length)
+{
+ return blocksize - (length % blocksize);
+}
+
+static inline void
+gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
+{
+ int padding = gss_krb5_padding(blocksize, buf->len - offset);
+ char *p;
+ struct kvec *iov;
+
+ if (buf->page_len || buf->tail[0].iov_len)
+ iov = &buf->tail[0];
+ else
+ iov = &buf->head[0];
+ p = iov->iov_base + iov->iov_len;
+ iov->iov_len += padding;
+ buf->len += padding;
+ memset(p, padding, padding);
+}
+
+static inline int
+gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
+{
+ u8 *ptr;
+ u8 pad;
+ size_t len = buf->len;
+
+ if (len <= buf->head[0].iov_len) {
+ pad = *(u8 *)(buf->head[0].iov_base + len - 1);
+ if (pad > buf->head[0].iov_len)
+ return -EINVAL;
+ buf->head[0].iov_len -= pad;
+ goto out;
+ } else
+ len -= buf->head[0].iov_len;
+ if (len <= buf->page_len) {
+ unsigned int last = (buf->page_base + len - 1)
+ >>PAGE_CACHE_SHIFT;
+ unsigned int offset = (buf->page_base + len - 1)
+ & (PAGE_CACHE_SIZE - 1);
+ ptr = kmap_atomic(buf->pages[last]);
+ pad = *(ptr + offset);
+ kunmap_atomic(ptr);
+ goto out;
+ } else
+ len -= buf->page_len;
+ BUG_ON(len > buf->tail[0].iov_len);
+ pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
+out:
+ /* XXX: NOTE: we do not adjust the page lengths--they represent
+ * a range of data in the real filesystem page cache, and we need
+ * to know that range so the xdr code can properly place read data.
+ * However adjusting the head length, as we do above, is harmless.
+ * In the case of a request that fits into a single page, the server
+ * also uses length and head length together to determine the original
+ * start of the request to copy the request for deferal; so it's
+ * easier on the server if we adjust head and tail length in tandem.
+ * It's not really a problem that we don't fool with the page and
+ * tail lengths, though--at worst badly formed xdr might lead the
+ * server to attempt to parse the padding.
+ * XXX: Document all these weird requirements for gss mechanism
+ * wrap/unwrap functions. */
+ if (pad > blocksize)
+ return -EINVAL;
+ if (buf->len > pad)
+ buf->len -= pad;
+ else
+ return -EINVAL;
+ return 0;
+}
+
+void
+gss_krb5_make_confounder(char *p, u32 conflen)
+{
+ static u64 i = 0;
+ u64 *q = (u64 *)p;
+
+ /* rfc1964 claims this should be "random". But all that's really
+ * necessary is that it be unique. And not even that is necessary in
+ * our case since our "gssapi" implementation exists only to support
+ * rpcsec_gss, so we know that the only buffers we will ever encrypt
+ * already begin with a unique sequence number. Just to hedge my bets
+ * I'll make a half-hearted attempt at something unique, but ensuring
+ * uniqueness would mean worrying about atomicity and rollover, and I
+ * don't care enough. */
+
+ /* initialize to random value */
+ if (i == 0) {
+ i = prandom_u32();
+ i = (i << 32) | prandom_u32();
+ }
+
+ switch (conflen) {
+ case 16:
+ *q++ = i++;
+ /* fall through */
+ case 8:
+ *q++ = i++;
+ break;
+ default:
+ BUG();
+ }
+}
+
+/* Assumptions: the head and tail of inbuf are ours to play with.
+ * The pages, however, may be real pages in the page cache and we replace
+ * them with scratch pages from **pages before writing to them. */
+/* XXX: obviously the above should be documentation of wrap interface,
+ * and shouldn't be in this kerberos-specific file. */
+
+/* XXX factor out common code with seal/unseal. */
+
+static u32
+gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
+ struct xdr_buf *buf, struct page **pages)
+{
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
+ .data = cksumdata};
+ int blocksize = 0, plainlen;
+ unsigned char *ptr, *msg_start;
+ s32 now;
+ int headlen;
+ struct page **tmp_pages;
+ u32 seq_send;
+ u8 *cksumkey;
+ u32 conflen = kctx->gk5e->conflen;
+
+ dprintk("RPC: %s\n", __func__);
+
+ now = get_seconds();
+
+ blocksize = crypto_blkcipher_blocksize(kctx->enc);
+ gss_krb5_add_padding(buf, offset, blocksize);
+ BUG_ON((buf->len - offset) % blocksize);
+ plainlen = conflen + buf->len - offset;
+
+ headlen = g_token_size(&kctx->mech_used,
+ GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) -
+ (buf->len - offset);
+
+ ptr = buf->head[0].iov_base + offset;
+ /* shift data to make room for header. */
+ xdr_extend_head(buf, offset, headlen);
+
+ /* XXX Would be cleverer to encrypt while copying. */
+ BUG_ON((buf->len - offset - headlen) % blocksize);
+
+ g_make_token_header(&kctx->mech_used,
+ GSS_KRB5_TOK_HDR_LEN +
+ kctx->gk5e->cksumlength + plainlen, &ptr);
+
+
+ /* ptr now at header described in rfc 1964, section 1.2.1: */
+ ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+ ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
+
+ msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
+
+ /*
+ * signalg and sealalg are stored as if they were converted from LE
+ * to host endian, even though they're opaque pairs of bytes according
+ * to the RFC.
+ */
+ *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+ *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+ ptr[6] = 0xff;
+ ptr[7] = 0xff;
+
+ gss_krb5_make_confounder(msg_start, conflen);
+
+ if (kctx->gk5e->keyed_cksum)
+ cksumkey = kctx->cksum;
+ else
+ cksumkey = NULL;
+
+ /* XXXJBF: UGH!: */
+ tmp_pages = buf->pages;
+ buf->pages = pages;
+ if (make_checksum(kctx, ptr, 8, buf, offset + headlen - conflen,
+ cksumkey, KG_USAGE_SEAL, &md5cksum))
+ return GSS_S_FAILURE;
+ buf->pages = tmp_pages;
+
+ memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
+
+ spin_lock(&krb5_seq_lock);
+ seq_send = kctx->seq_send++;
+ spin_unlock(&krb5_seq_lock);
+
+ /* XXX would probably be more efficient to compute checksum
+ * and encrypt at the same time: */
+ if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff,
+ seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
+ return GSS_S_FAILURE;
+
+ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
+ struct crypto_blkcipher *cipher;
+ int err;
+ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cipher))
+ return GSS_S_FAILURE;
+
+ krb5_rc4_setup_enc_key(kctx, cipher, seq_send);
+
+ err = gss_encrypt_xdr_buf(cipher, buf,
+ offset + headlen - conflen, pages);
+ crypto_free_blkcipher(cipher);
+ if (err)
+ return GSS_S_FAILURE;
+ } else {
+ if (gss_encrypt_xdr_buf(kctx->enc, buf,
+ offset + headlen - conflen, pages))
+ return GSS_S_FAILURE;
+ }
+
+ return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+static u32
+gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+{
+ int signalg;
+ int sealalg;
+ char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+ struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
+ .data = cksumdata};
+ s32 now;
+ int direction;
+ s32 seqnum;
+ unsigned char *ptr;
+ int bodysize;
+ void *data_start, *orig_start;
+ int data_len;
+ int blocksize;
+ u32 conflen = kctx->gk5e->conflen;
+ int crypt_offset;
+ u8 *cksumkey;
+
+ dprintk("RPC: gss_unwrap_kerberos\n");
+
+ ptr = (u8 *)buf->head[0].iov_base + offset;
+ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
+ buf->len - offset))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+ (ptr[1] != (KG_TOK_WRAP_MSG & 0xff)))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ /* XXX sanity-check bodysize?? */
+
+ /* get the sign and seal algorithms */
+
+ signalg = ptr[2] + (ptr[3] << 8);
+ if (signalg != kctx->gk5e->signalg)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ sealalg = ptr[4] + (ptr[5] << 8);
+ if (sealalg != kctx->gk5e->sealalg)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ /*
+ * Data starts after token header and checksum. ptr points
+ * to the beginning of the token header
+ */
+ crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) -
+ (unsigned char *)buf->head[0].iov_base;
+
+ /*
+ * Need plaintext seqnum to derive encryption key for arcfour-hmac
+ */
+ if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
+ ptr + 8, &direction, &seqnum))
+ return GSS_S_BAD_SIG;
+
+ if ((kctx->initiate && direction != 0xff) ||
+ (!kctx->initiate && direction != 0))
+ return GSS_S_BAD_SIG;
+
+ if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
+ struct crypto_blkcipher *cipher;
+ int err;
+
+ cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cipher))
+ return GSS_S_FAILURE;
+
+ krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
+
+ err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
+ crypto_free_blkcipher(cipher);
+ if (err)
+ return GSS_S_DEFECTIVE_TOKEN;
+ } else {
+ if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
+ return GSS_S_DEFECTIVE_TOKEN;
+ }
+
+ if (kctx->gk5e->keyed_cksum)
+ cksumkey = kctx->cksum;
+ else
+ cksumkey = NULL;
+
+ if (make_checksum(kctx, ptr, 8, buf, crypt_offset,
+ cksumkey, KG_USAGE_SEAL, &md5cksum))
+ return GSS_S_FAILURE;
+
+ if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+ kctx->gk5e->cksumlength))
+ return GSS_S_BAD_SIG;
+
+ /* it got through unscathed. Make sure the context is unexpired */
+
+ now = get_seconds();
+
+ if (now > kctx->endtime)
+ return GSS_S_CONTEXT_EXPIRED;
+
+ /* do sequencing checks */
+
+ /* Copy the data back to the right position. XXX: Would probably be
+ * better to copy and encrypt at the same time. */
+
+ blocksize = crypto_blkcipher_blocksize(kctx->enc);
+ data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
+ conflen;
+ orig_start = buf->head[0].iov_base + offset;
+ data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
+ memmove(orig_start, data_start, data_len);
+ buf->head[0].iov_len -= (data_start - orig_start);
+ buf->len -= (data_start - orig_start);
+
+ if (gss_krb5_remove_padding(buf, blocksize))
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ return GSS_S_COMPLETE;
+}
+
+/*
+ * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need
+ * to do more than that, we shift repeatedly. Kevin Coffman reports
+ * seeing 28 bytes as the value used by Microsoft clients and servers
+ * with AES, so this constant is chosen to allow handling 28 in one pass
+ * without using too much stack space.
+ *
+ * If that proves to a problem perhaps we could use a more clever
+ * algorithm.
+ */
+#define LOCAL_BUF_LEN 32u
+
+static void rotate_buf_a_little(struct xdr_buf *buf, unsigned int shift)
+{
+ char head[LOCAL_BUF_LEN];
+ char tmp[LOCAL_BUF_LEN];
+ unsigned int this_len, i;
+
+ BUG_ON(shift > LOCAL_BUF_LEN);
+
+ read_bytes_from_xdr_buf(buf, 0, head, shift);
+ for (i = 0; i + shift < buf->len; i += LOCAL_BUF_LEN) {
+ this_len = min(LOCAL_BUF_LEN, buf->len - (i + shift));
+ read_bytes_from_xdr_buf(buf, i+shift, tmp, this_len);
+ write_bytes_to_xdr_buf(buf, i, tmp, this_len);
+ }
+ write_bytes_to_xdr_buf(buf, buf->len - shift, head, shift);
+}
+
+static void _rotate_left(struct xdr_buf *buf, unsigned int shift)
+{
+ int shifted = 0;
+ int this_shift;
+
+ shift %= buf->len;
+ while (shifted < shift) {
+ this_shift = min(shift - shifted, LOCAL_BUF_LEN);
+ rotate_buf_a_little(buf, this_shift);
+ shifted += this_shift;
+ }
+}
+
+static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift)
+{
+ struct xdr_buf subbuf;
+
+ xdr_buf_subsegment(buf, &subbuf, base, buf->len - base);
+ _rotate_left(&subbuf, shift);
+}
+
+static u32
+gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
+ struct xdr_buf *buf, struct page **pages)
+{
+ int blocksize;
+ u8 *ptr, *plainhdr;
+ s32 now;
+ u8 flags = 0x00;
+ __be16 *be16ptr;
+ __be64 *be64ptr;
+ u32 err;
+
+ dprintk("RPC: %s\n", __func__);
+
+ if (kctx->gk5e->encrypt_v2 == NULL)
+ return GSS_S_FAILURE;
+
+ /* make room for gss token header */
+ if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN))
+ return GSS_S_FAILURE;
+
+ /* construct gss token header */
+ ptr = plainhdr = buf->head[0].iov_base + offset;
+ *ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff);
+ *ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff);
+
+ if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
+ flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR;
+ if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0)
+ flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY;
+ /* We always do confidentiality in wrap tokens */
+ flags |= KG2_TOKEN_FLAG_SEALED;
+
+ *ptr++ = flags;
+ *ptr++ = 0xff;
+ be16ptr = (__be16 *)ptr;
+
+ blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
+ *be16ptr++ = 0;
+ /* "inner" token header always uses 0 for RRC */
+ *be16ptr++ = 0;
+
+ be64ptr = (__be64 *)be16ptr;
+ spin_lock(&krb5_seq_lock);
+ *be64ptr = cpu_to_be64(kctx->seq_send64++);
+ spin_unlock(&krb5_seq_lock);
+
+ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
+ if (err)
+ return err;
+
+ now = get_seconds();
+ return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+static u32
+gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+{
+ s32 now;
+ u8 *ptr;
+ u8 flags = 0x00;
+ u16 ec, rrc;
+ int err;
+ u32 headskip, tailskip;
+ u8 decrypted_hdr[GSS_KRB5_TOK_HDR_LEN];
+ unsigned int movelen;
+
+
+ dprintk("RPC: %s\n", __func__);
+
+ if (kctx->gk5e->decrypt_v2 == NULL)
+ return GSS_S_FAILURE;
+
+ ptr = buf->head[0].iov_base + offset;
+
+ if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ flags = ptr[2];
+ if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) ||
+ (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)))
+ return GSS_S_BAD_SIG;
+
+ if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) {
+ dprintk("%s: token missing expected sealed flag\n", __func__);
+ return GSS_S_DEFECTIVE_TOKEN;
+ }
+
+ if (ptr[3] != 0xff)
+ return GSS_S_DEFECTIVE_TOKEN;
+
+ ec = be16_to_cpup((__be16 *)(ptr + 4));
+ rrc = be16_to_cpup((__be16 *)(ptr + 6));
+
+ /*
+ * NOTE: the sequence number at ptr + 8 is skipped, rpcsec_gss
+ * doesn't want it checked; see page 6 of rfc 2203.
+ */
+
+ if (rrc != 0)
+ rotate_left(offset + 16, buf, rrc);
+
+ err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
+ &headskip, &tailskip);
+ if (err)
+ return GSS_S_FAILURE;
+
+ /*
+ * Retrieve the decrypted gss token header and verify
+ * it against the original
+ */
+ err = read_bytes_from_xdr_buf(buf,
+ buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip,
+ decrypted_hdr, GSS_KRB5_TOK_HDR_LEN);
+ if (err) {
+ dprintk("%s: error %u getting decrypted_hdr\n", __func__, err);
+ return GSS_S_FAILURE;
+ }
+ if (memcmp(ptr, decrypted_hdr, 6)
+ || memcmp(ptr + 8, decrypted_hdr + 8, 8)) {
+ dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__);
+ return GSS_S_FAILURE;
+ }
+
+ /* do sequencing checks */
+
+ /* it got through unscathed. Make sure the context is unexpired */
+ now = get_seconds();
+ if (now > kctx->endtime)
+ return GSS_S_CONTEXT_EXPIRED;
+
+ /*
+ * Move the head data back to the right position in xdr_buf.
+ * We ignore any "ec" data since it might be in the head or
+ * the tail, and we really don't need to deal with it.
+ * Note that buf->head[0].iov_len may indicate the available
+ * head buffer space rather than that actually occupied.
+ */
+ movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len);
+ movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip;
+ BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
+ buf->head[0].iov_len);
+ memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);
+ buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+ buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+
+ /* Trim off the trailing "extra count" and checksum blob */
+ xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip);
+ return GSS_S_COMPLETE;
+}
+
+u32
+gss_wrap_kerberos(struct gss_ctx *gctx, int offset,
+ struct xdr_buf *buf, struct page **pages)
+{
+ struct krb5_ctx *kctx = gctx->internal_ctx_id;
+
+ switch (kctx->enctype) {
+ default:
+ BUG();
+ case ENCTYPE_DES_CBC_RAW:
+ case ENCTYPE_DES3_CBC_RAW:
+ case ENCTYPE_ARCFOUR_HMAC:
+ return gss_wrap_kerberos_v1(kctx, offset, buf, pages);
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ return gss_wrap_kerberos_v2(kctx, offset, buf, pages);
+ }
+}
+
+u32
+gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
+{
+ struct krb5_ctx *kctx = gctx->internal_ctx_id;
+
+ switch (kctx->enctype) {
+ default:
+ BUG();
+ case ENCTYPE_DES_CBC_RAW:
+ case ENCTYPE_DES3_CBC_RAW:
+ case ENCTYPE_ARCFOUR_HMAC:
+ return gss_unwrap_kerberos_v1(kctx, offset, buf);
+ case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+ case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+ return gss_unwrap_kerberos_v2(kctx, offset, buf);
+ }
+}
+
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
new file mode 100644
index 000000000..7063d856a
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -0,0 +1,481 @@
+/*
+ * linux/net/sunrpc/gss_mech_switch.c
+ *
+ * Copyright (c) 2001 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * J. Bruce Fields <bfields@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/oid_registry.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_asn1.h>
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/clnt.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static LIST_HEAD(registered_mechs);
+static DEFINE_SPINLOCK(registered_mechs_lock);
+
+static void
+gss_mech_free(struct gss_api_mech *gm)
+{
+ struct pf_desc *pf;
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ pf = &gm->gm_pfs[i];
+ kfree(pf->auth_domain_name);
+ pf->auth_domain_name = NULL;
+ }
+}
+
+static inline char *
+make_auth_domain_name(char *name)
+{
+ static char *prefix = "gss/";
+ char *new;
+
+ new = kmalloc(strlen(name) + strlen(prefix) + 1, GFP_KERNEL);
+ if (new) {
+ strcpy(new, prefix);
+ strcat(new, name);
+ }
+ return new;
+}
+
+static int
+gss_mech_svc_setup(struct gss_api_mech *gm)
+{
+ struct pf_desc *pf;
+ int i, status;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ pf = &gm->gm_pfs[i];
+ pf->auth_domain_name = make_auth_domain_name(pf->name);
+ status = -ENOMEM;
+ if (pf->auth_domain_name == NULL)
+ goto out;
+ status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor,
+ pf->auth_domain_name);
+ if (status)
+ goto out;
+ }
+ return 0;
+out:
+ gss_mech_free(gm);
+ return status;
+}
+
+/**
+ * gss_mech_register - register a GSS mechanism
+ * @gm: GSS mechanism handle
+ *
+ * Returns zero if successful, or a negative errno.
+ */
+int gss_mech_register(struct gss_api_mech *gm)
+{
+ int status;
+
+ status = gss_mech_svc_setup(gm);
+ if (status)
+ return status;
+ spin_lock(&registered_mechs_lock);
+ list_add(&gm->gm_list, &registered_mechs);
+ spin_unlock(&registered_mechs_lock);
+ dprintk("RPC: registered gss mechanism %s\n", gm->gm_name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gss_mech_register);
+
+/**
+ * gss_mech_unregister - release a GSS mechanism
+ * @gm: GSS mechanism handle
+ *
+ */
+void gss_mech_unregister(struct gss_api_mech *gm)
+{
+ spin_lock(&registered_mechs_lock);
+ list_del(&gm->gm_list);
+ spin_unlock(&registered_mechs_lock);
+ dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name);
+ gss_mech_free(gm);
+}
+EXPORT_SYMBOL_GPL(gss_mech_unregister);
+
+struct gss_api_mech *gss_mech_get(struct gss_api_mech *gm)
+{
+ __module_get(gm->gm_owner);
+ return gm;
+}
+EXPORT_SYMBOL(gss_mech_get);
+
+static struct gss_api_mech *
+_gss_mech_get_by_name(const char *name)
+{
+ struct gss_api_mech *pos, *gm = NULL;
+
+ spin_lock(&registered_mechs_lock);
+ list_for_each_entry(pos, &registered_mechs, gm_list) {
+ if (0 == strcmp(name, pos->gm_name)) {
+ if (try_module_get(pos->gm_owner))
+ gm = pos;
+ break;
+ }
+ }
+ spin_unlock(&registered_mechs_lock);
+ return gm;
+
+}
+
+struct gss_api_mech * gss_mech_get_by_name(const char *name)
+{
+ struct gss_api_mech *gm = NULL;
+
+ gm = _gss_mech_get_by_name(name);
+ if (!gm) {
+ request_module("rpc-auth-gss-%s", name);
+ gm = _gss_mech_get_by_name(name);
+ }
+ return gm;
+}
+
+struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
+{
+ struct gss_api_mech *pos, *gm = NULL;
+ char buf[32];
+
+ if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0)
+ return NULL;
+ dprintk("RPC: %s(%s)\n", __func__, buf);
+ request_module("rpc-auth-gss-%s", buf);
+
+ spin_lock(&registered_mechs_lock);
+ list_for_each_entry(pos, &registered_mechs, gm_list) {
+ if (obj->len == pos->gm_oid.len) {
+ if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) {
+ if (try_module_get(pos->gm_owner))
+ gm = pos;
+ break;
+ }
+ }
+ }
+ spin_unlock(&registered_mechs_lock);
+ return gm;
+}
+
+static inline int
+mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return 1;
+ }
+ return 0;
+}
+
+static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
+{
+ struct gss_api_mech *gm = NULL, *pos;
+
+ spin_lock(&registered_mechs_lock);
+ list_for_each_entry(pos, &registered_mechs, gm_list) {
+ if (!mech_supports_pseudoflavor(pos, pseudoflavor))
+ continue;
+ if (try_module_get(pos->gm_owner))
+ gm = pos;
+ break;
+ }
+ spin_unlock(&registered_mechs_lock);
+ return gm;
+}
+
+struct gss_api_mech *
+gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
+{
+ struct gss_api_mech *gm;
+
+ gm = _gss_mech_get_by_pseudoflavor(pseudoflavor);
+
+ if (!gm) {
+ request_module("rpc-auth-gss-%u", pseudoflavor);
+ gm = _gss_mech_get_by_pseudoflavor(pseudoflavor);
+ }
+ return gm;
+}
+
+/**
+ * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
+ * @array: array to fill in
+ * @size: size of "array"
+ *
+ * Returns the number of array items filled in, or a negative errno.
+ *
+ * The returned array is not sorted by any policy. Callers should not
+ * rely on the order of the items in the returned array.
+ */
+int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
+{
+ struct gss_api_mech *pos = NULL;
+ int j, i = 0;
+
+ spin_lock(&registered_mechs_lock);
+ list_for_each_entry(pos, &registered_mechs, gm_list) {
+ for (j = 0; j < pos->gm_pf_num; j++) {
+ if (i >= size) {
+ spin_unlock(&registered_mechs_lock);
+ return -ENOMEM;
+ }
+ array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
+ }
+ }
+ spin_unlock(&registered_mechs_lock);
+ return i;
+}
+
+/**
+ * gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor
+ * @gm: GSS mechanism handle
+ * @qop: GSS quality-of-protection value
+ * @service: GSS service value
+ *
+ * Returns a matching security flavor, or RPC_AUTH_MAXFLAVOR if none is found.
+ */
+rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 qop,
+ u32 service)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].qop == qop &&
+ gm->gm_pfs[i].service == service) {
+ return gm->gm_pfs[i].pseudoflavor;
+ }
+ }
+ return RPC_AUTH_MAXFLAVOR;
+}
+
+/**
+ * gss_mech_info2flavor - look up a pseudoflavor given a GSS tuple
+ * @info: a GSS mech OID, quality of protection, and service value
+ *
+ * Returns a matching pseudoflavor, or RPC_AUTH_MAXFLAVOR if the tuple is
+ * not supported.
+ */
+rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info)
+{
+ rpc_authflavor_t pseudoflavor;
+ struct gss_api_mech *gm;
+
+ gm = gss_mech_get_by_OID(&info->oid);
+ if (gm == NULL)
+ return RPC_AUTH_MAXFLAVOR;
+
+ pseudoflavor = gss_svc_to_pseudoflavor(gm, info->qop, info->service);
+
+ gss_mech_put(gm);
+ return pseudoflavor;
+}
+
+/**
+ * gss_mech_flavor2info - look up a GSS tuple for a given pseudoflavor
+ * @pseudoflavor: GSS pseudoflavor to match
+ * @info: rpcsec_gss_info structure to fill in
+ *
+ * Returns zero and fills in "info" if pseudoflavor matches a
+ * supported mechanism. Otherwise a negative errno is returned.
+ */
+int gss_mech_flavor2info(rpc_authflavor_t pseudoflavor,
+ struct rpcsec_gss_info *info)
+{
+ struct gss_api_mech *gm;
+ int i;
+
+ gm = gss_mech_get_by_pseudoflavor(pseudoflavor);
+ if (gm == NULL)
+ return -ENOENT;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) {
+ memcpy(info->oid.data, gm->gm_oid.data, gm->gm_oid.len);
+ info->oid.len = gm->gm_oid.len;
+ info->qop = gm->gm_pfs[i].qop;
+ info->service = gm->gm_pfs[i].service;
+ gss_mech_put(gm);
+ return 0;
+ }
+ }
+
+ gss_mech_put(gm);
+ return -ENOENT;
+}
+
+u32
+gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return gm->gm_pfs[i].service;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+
+char *
+gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].service == service)
+ return gm->gm_pfs[i].auth_domain_name;
+ }
+ return NULL;
+}
+
+void
+gss_mech_put(struct gss_api_mech * gm)
+{
+ if (gm)
+ module_put(gm->gm_owner);
+}
+EXPORT_SYMBOL(gss_mech_put);
+
+/* The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in. */
+int
+gss_import_sec_context(const void *input_token, size_t bufsize,
+ struct gss_api_mech *mech,
+ struct gss_ctx **ctx_id,
+ time_t *endtime,
+ gfp_t gfp_mask)
+{
+ if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
+ return -ENOMEM;
+ (*ctx_id)->mech_type = gss_mech_get(mech);
+
+ return mech->gm_ops->gss_import_sec_context(input_token, bufsize,
+ *ctx_id, endtime, gfp_mask);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+
+u32
+gss_get_mic(struct gss_ctx *context_handle,
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token)
+{
+ return context_handle->mech_type->gm_ops
+ ->gss_get_mic(context_handle,
+ message,
+ mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+
+u32
+gss_verify_mic(struct gss_ctx *context_handle,
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token)
+{
+ return context_handle->mech_type->gm_ops
+ ->gss_verify_mic(context_handle,
+ message,
+ mic_token);
+}
+
+/*
+ * This function is called from both the client and server code.
+ * Each makes guarantees about how much "slack" space is available
+ * for the underlying function in "buf"'s head and tail while
+ * performing the wrap.
+ *
+ * The client and server code allocate RPC_MAX_AUTH_SIZE extra
+ * space in both the head and tail which is available for use by
+ * the wrap function.
+ *
+ * Underlying functions should verify they do not use more than
+ * RPC_MAX_AUTH_SIZE of extra space in either the head or tail
+ * when performing the wrap.
+ */
+u32
+gss_wrap(struct gss_ctx *ctx_id,
+ int offset,
+ struct xdr_buf *buf,
+ struct page **inpages)
+{
+ return ctx_id->mech_type->gm_ops
+ ->gss_wrap(ctx_id, offset, buf, inpages);
+}
+
+u32
+gss_unwrap(struct gss_ctx *ctx_id,
+ int offset,
+ struct xdr_buf *buf)
+{
+ return ctx_id->mech_type->gm_ops
+ ->gss_unwrap(ctx_id, offset, buf);
+}
+
+
+/* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+
+u32
+gss_delete_sec_context(struct gss_ctx **context_handle)
+{
+ dprintk("RPC: gss_delete_sec_context deleting %p\n",
+ *context_handle);
+
+ if (!*context_handle)
+ return GSS_S_NO_CONTEXT;
+ if ((*context_handle)->internal_ctx_id)
+ (*context_handle)->mech_type->gm_ops
+ ->gss_delete_sec_context((*context_handle)
+ ->internal_ctx_id);
+ gss_mech_put((*context_handle)->mech_type);
+ kfree(*context_handle);
+ *context_handle=NULL;
+ return GSS_S_COMPLETE;
+}
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
new file mode 100644
index 000000000..59eeed43e
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -0,0 +1,384 @@
+/*
+ * linux/net/sunrpc/gss_rpc_upcall.c
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/un.h>
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_upcall.h"
+
+#define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock"
+
+#define GSSPROXY_PROGRAM (400112u)
+#define GSSPROXY_VERS_1 (1u)
+
+/*
+ * Encoding/Decoding functions
+ */
+
+enum {
+ GSSX_NULL = 0, /* Unused */
+ GSSX_INDICATE_MECHS = 1,
+ GSSX_GET_CALL_CONTEXT = 2,
+ GSSX_IMPORT_AND_CANON_NAME = 3,
+ GSSX_EXPORT_CRED = 4,
+ GSSX_IMPORT_CRED = 5,
+ GSSX_ACQUIRE_CRED = 6,
+ GSSX_STORE_CRED = 7,
+ GSSX_INIT_SEC_CONTEXT = 8,
+ GSSX_ACCEPT_SEC_CONTEXT = 9,
+ GSSX_RELEASE_HANDLE = 10,
+ GSSX_GET_MIC = 11,
+ GSSX_VERIFY = 12,
+ GSSX_WRAP = 13,
+ GSSX_UNWRAP = 14,
+ GSSX_WRAP_SIZE_LIMIT = 15,
+};
+
+#define PROC(proc, name) \
+[GSSX_##proc] = { \
+ .p_proc = GSSX_##proc, \
+ .p_encode = (kxdreproc_t)gssx_enc_##name, \
+ .p_decode = (kxdrdproc_t)gssx_dec_##name, \
+ .p_arglen = GSSX_ARG_##name##_sz, \
+ .p_replen = GSSX_RES_##name##_sz, \
+ .p_statidx = GSSX_##proc, \
+ .p_name = #proc, \
+}
+
+static struct rpc_procinfo gssp_procedures[] = {
+ PROC(INDICATE_MECHS, indicate_mechs),
+ PROC(GET_CALL_CONTEXT, get_call_context),
+ PROC(IMPORT_AND_CANON_NAME, import_and_canon_name),
+ PROC(EXPORT_CRED, export_cred),
+ PROC(IMPORT_CRED, import_cred),
+ PROC(ACQUIRE_CRED, acquire_cred),
+ PROC(STORE_CRED, store_cred),
+ PROC(INIT_SEC_CONTEXT, init_sec_context),
+ PROC(ACCEPT_SEC_CONTEXT, accept_sec_context),
+ PROC(RELEASE_HANDLE, release_handle),
+ PROC(GET_MIC, get_mic),
+ PROC(VERIFY, verify),
+ PROC(WRAP, wrap),
+ PROC(UNWRAP, unwrap),
+ PROC(WRAP_SIZE_LIMIT, wrap_size_limit),
+};
+
+
+
+/*
+ * Common transport functions
+ */
+
+static const struct rpc_program gssp_program;
+
+static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt)
+{
+ static const struct sockaddr_un gssp_localaddr = {
+ .sun_family = AF_LOCAL,
+ .sun_path = GSSPROXY_SOCK_PATHNAME,
+ };
+ struct rpc_create_args args = {
+ .net = net,
+ .protocol = XPRT_TRANSPORT_LOCAL,
+ .address = (struct sockaddr *)&gssp_localaddr,
+ .addrsize = sizeof(gssp_localaddr),
+ .servername = "localhost",
+ .program = &gssp_program,
+ .version = GSSPROXY_VERS_1,
+ .authflavor = RPC_AUTH_NULL,
+ /*
+ * Note we want connection to be done in the caller's
+ * filesystem namespace. We therefore turn off the idle
+ * timeout, which would result in reconnections being
+ * done without the correct namespace:
+ */
+ .flags = RPC_CLNT_CREATE_NOPING |
+ RPC_CLNT_CREATE_NO_IDLE_TIMEOUT
+ };
+ struct rpc_clnt *clnt;
+ int result = 0;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("RPC: failed to create AF_LOCAL gssproxy "
+ "client (errno %ld).\n", PTR_ERR(clnt));
+ result = PTR_ERR(clnt);
+ *_clnt = NULL;
+ goto out;
+ }
+
+ dprintk("RPC: created new gssp local client (gssp_local_clnt: "
+ "%p)\n", clnt);
+ *_clnt = clnt;
+
+out:
+ return result;
+}
+
+void init_gssp_clnt(struct sunrpc_net *sn)
+{
+ mutex_init(&sn->gssp_lock);
+ sn->gssp_clnt = NULL;
+}
+
+int set_gssp_clnt(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt;
+ int ret;
+
+ mutex_lock(&sn->gssp_lock);
+ ret = gssp_rpc_create(net, &clnt);
+ if (!ret) {
+ if (sn->gssp_clnt)
+ rpc_shutdown_client(sn->gssp_clnt);
+ sn->gssp_clnt = clnt;
+ }
+ mutex_unlock(&sn->gssp_lock);
+ return ret;
+}
+
+void clear_gssp_clnt(struct sunrpc_net *sn)
+{
+ mutex_lock(&sn->gssp_lock);
+ if (sn->gssp_clnt) {
+ rpc_shutdown_client(sn->gssp_clnt);
+ sn->gssp_clnt = NULL;
+ }
+ mutex_unlock(&sn->gssp_lock);
+}
+
+static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn)
+{
+ struct rpc_clnt *clnt;
+
+ mutex_lock(&sn->gssp_lock);
+ clnt = sn->gssp_clnt;
+ if (clnt)
+ atomic_inc(&clnt->cl_count);
+ mutex_unlock(&sn->gssp_lock);
+ return clnt;
+}
+
+static int gssp_call(struct net *net, struct rpc_message *msg)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt;
+ int status;
+
+ clnt = get_gssp_clnt(sn);
+ if (!clnt)
+ return -EIO;
+ status = rpc_call_sync(clnt, msg, 0);
+ if (status < 0) {
+ dprintk("gssp: rpc_call returned error %d\n", -status);
+ switch (status) {
+ case -EPROTONOSUPPORT:
+ status = -EINVAL;
+ break;
+ case -ECONNREFUSED:
+ case -ETIMEDOUT:
+ case -ENOTCONN:
+ status = -EAGAIN;
+ break;
+ case -ERESTARTSYS:
+ if (signalled ())
+ status = -EINTR;
+ break;
+ default:
+ break;
+ }
+ }
+ rpc_release_client(clnt);
+ return status;
+}
+
+static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
+{
+ int i;
+
+ for (i = 0; i < arg->npages && arg->pages[i]; i++)
+ __free_page(arg->pages[i]);
+
+ kfree(arg->pages);
+}
+
+static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
+{
+ arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
+ arg->pages = kzalloc(arg->npages * sizeof(struct page *), GFP_KERNEL);
+ /*
+ * XXX: actual pages are allocated by xdr layer in
+ * xdr_partial_copy_from_skb.
+ */
+ if (!arg->pages)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Public functions
+ */
+
+/* numbers somewhat arbitrary but large enough for current needs */
+#define GSSX_MAX_OUT_HANDLE 128
+#define GSSX_MAX_SRC_PRINC 256
+#define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \
+ GSSX_max_oid_sz + \
+ GSSX_max_princ_sz + \
+ sizeof(struct svc_cred))
+
+int gssp_accept_sec_context_upcall(struct net *net,
+ struct gssp_upcall_data *data)
+{
+ struct gssx_ctx ctxh = {
+ .state = data->in_handle
+ };
+ struct gssx_arg_accept_sec_context arg = {
+ .input_token = data->in_token,
+ };
+ struct gssx_ctx rctxh = {
+ /*
+ * pass in the max length we expect for each of these
+ * buffers but let the xdr code kmalloc them:
+ */
+ .exported_context_token.len = GSSX_max_output_handle_sz,
+ .mech.len = GSS_OID_MAX_LEN,
+ .src_name.display_name.len = GSSX_max_princ_sz
+ };
+ struct gssx_res_accept_sec_context res = {
+ .context_handle = &rctxh,
+ .output_token = &data->out_token
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = NULL, /* FIXME ? */
+ };
+ struct xdr_netobj client_name = { 0 , NULL };
+ int ret;
+
+ if (data->in_handle.len != 0)
+ arg.context_handle = &ctxh;
+ res.output_token->len = GSSX_max_output_token_sz;
+
+ ret = gssp_alloc_receive_pages(&arg);
+ if (ret)
+ return ret;
+
+ /* use nfs/ for targ_name ? */
+
+ ret = gssp_call(net, &msg);
+
+ gssp_free_receive_pages(&arg);
+
+ /* we need to fetch all data even in case of error so
+ * that we can free special strctures is they have been allocated */
+ data->major_status = res.status.major_status;
+ data->minor_status = res.status.minor_status;
+ if (res.context_handle) {
+ data->out_handle = rctxh.exported_context_token;
+ data->mech_oid.len = rctxh.mech.len;
+ if (rctxh.mech.data)
+ memcpy(data->mech_oid.data, rctxh.mech.data,
+ data->mech_oid.len);
+ client_name = rctxh.src_name.display_name;
+ }
+
+ if (res.options.count == 1) {
+ gssx_buffer *value = &res.options.data[0].value;
+ /* Currently we only decode CREDS_VALUE, if we add
+ * anything else we'll have to loop and match on the
+ * option name */
+ if (value->len == 1) {
+ /* steal group info from struct svc_cred */
+ data->creds = *(struct svc_cred *)value->data;
+ data->found_creds = 1;
+ }
+ /* whether we use it or not, free data */
+ kfree(value->data);
+ }
+
+ if (res.options.count != 0) {
+ kfree(res.options.data);
+ }
+
+ /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */
+ if (data->found_creds && client_name.data != NULL) {
+ char *c;
+
+ data->creds.cr_principal = kstrndup(client_name.data,
+ client_name.len, GFP_KERNEL);
+ if (data->creds.cr_principal) {
+ /* terminate and remove realm part */
+ c = strchr(data->creds.cr_principal, '@');
+ if (c) {
+ *c = '\0';
+
+ /* change service-hostname delimiter */
+ c = strchr(data->creds.cr_principal, '/');
+ if (c) *c = '@';
+ }
+ if (!c) {
+ /* not a service principal */
+ kfree(data->creds.cr_principal);
+ data->creds.cr_principal = NULL;
+ }
+ }
+ }
+ kfree(client_name.data);
+
+ return ret;
+}
+
+void gssp_free_upcall_data(struct gssp_upcall_data *data)
+{
+ kfree(data->in_handle.data);
+ kfree(data->out_handle.data);
+ kfree(data->out_token.data);
+ free_svc_cred(&data->creds);
+}
+
+/*
+ * Initialization stuff
+ */
+
+static const struct rpc_version gssp_version1 = {
+ .number = GSSPROXY_VERS_1,
+ .nrprocs = ARRAY_SIZE(gssp_procedures),
+ .procs = gssp_procedures,
+};
+
+static const struct rpc_version *gssp_version[] = {
+ NULL,
+ &gssp_version1,
+};
+
+static struct rpc_stat gssp_stats;
+
+static const struct rpc_program gssp_program = {
+ .name = "gssproxy",
+ .number = GSSPROXY_PROGRAM,
+ .nrvers = ARRAY_SIZE(gssp_version),
+ .version = gssp_version,
+ .stats = &gssp_stats,
+};
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.h b/net/sunrpc/auth_gss/gss_rpc_upcall.h
new file mode 100644
index 000000000..1e542aded
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.h
@@ -0,0 +1,48 @@
+/*
+ * linux/net/sunrpc/gss_rpc_upcall.h
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _GSS_RPC_UPCALL_H
+#define _GSS_RPC_UPCALL_H
+
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/auth_gss.h>
+#include "gss_rpc_xdr.h"
+#include "../netns.h"
+
+struct gssp_upcall_data {
+ struct xdr_netobj in_handle;
+ struct gssp_in_token in_token;
+ struct xdr_netobj out_handle;
+ struct xdr_netobj out_token;
+ struct rpcsec_gss_oid mech_oid;
+ struct svc_cred creds;
+ int found_creds;
+ int major_status;
+ int minor_status;
+};
+
+int gssp_accept_sec_context_upcall(struct net *net,
+ struct gssp_upcall_data *data);
+void gssp_free_upcall_data(struct gssp_upcall_data *data);
+
+void init_gssp_clnt(struct sunrpc_net *);
+int set_gssp_clnt(struct net *);
+void clear_gssp_clnt(struct sunrpc_net *);
+#endif /* _GSS_RPC_UPCALL_H */
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
new file mode 100644
index 000000000..eeeba5ade
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -0,0 +1,848 @@
+/*
+ * GSS Proxy upcall module
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/sunrpc/svcauth.h>
+#include "gss_rpc_xdr.h"
+
+static int gssx_enc_bool(struct xdr_stream *xdr, int v)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ *p = v ? xdr_one : xdr_zero;
+ return 0;
+}
+
+static int gssx_dec_bool(struct xdr_stream *xdr, u32 *v)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ *v = be32_to_cpu(*p);
+ return 0;
+}
+
+static int gssx_enc_buffer(struct xdr_stream *xdr,
+ gssx_buffer *buf)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(u32) + buf->len);
+ if (!p)
+ return -ENOSPC;
+ xdr_encode_opaque(p, buf->data, buf->len);
+ return 0;
+}
+
+static int gssx_enc_in_token(struct xdr_stream *xdr,
+ struct gssp_in_token *in)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = cpu_to_be32(in->page_len);
+
+ /* all we need to do is to write pages */
+ xdr_write_pages(xdr, in->pages, in->page_base, in->page_len);
+
+ return 0;
+}
+
+
+static int gssx_dec_buffer(struct xdr_stream *xdr,
+ gssx_buffer *buf)
+{
+ u32 length;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, length);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ if (buf->len == 0) {
+ /* we intentionally are not interested in this buffer */
+ return 0;
+ }
+ if (length > buf->len)
+ return -ENOSPC;
+
+ if (!buf->data) {
+ buf->data = kmemdup(p, length, GFP_KERNEL);
+ if (!buf->data)
+ return -ENOMEM;
+ } else {
+ memcpy(buf->data, p, length);
+ }
+ buf->len = length;
+ return 0;
+}
+
+static int gssx_enc_option(struct xdr_stream *xdr,
+ struct gssx_option *opt)
+{
+ int err;
+
+ err = gssx_enc_buffer(xdr, &opt->option);
+ if (err)
+ return err;
+ err = gssx_enc_buffer(xdr, &opt->value);
+ return err;
+}
+
+static int gssx_dec_option(struct xdr_stream *xdr,
+ struct gssx_option *opt)
+{
+ int err;
+
+ err = gssx_dec_buffer(xdr, &opt->option);
+ if (err)
+ return err;
+ err = gssx_dec_buffer(xdr, &opt->value);
+ return err;
+}
+
+static int dummy_enc_opt_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ __be32 *p;
+
+ if (oa->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int dummy_dec_opt_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ struct gssx_option dummy;
+ u32 count, i;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ memset(&dummy, 0, sizeof(dummy));
+ for (i = 0; i < count; i++) {
+ gssx_dec_option(xdr, &dummy);
+ }
+
+ oa->count = 0;
+ oa->data = NULL;
+ return 0;
+}
+
+static int get_host_u32(struct xdr_stream *xdr, u32 *res)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (!p)
+ return -EINVAL;
+ /* Contents of linux creds are all host-endian: */
+ memcpy(res, p, sizeof(u32));
+ return 0;
+}
+
+static int gssx_dec_linux_creds(struct xdr_stream *xdr,
+ struct svc_cred *creds)
+{
+ u32 length;
+ __be32 *p;
+ u32 tmp;
+ u32 N;
+ int i, err;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+
+ if (length > (3 + NGROUPS_MAX) * sizeof(u32))
+ return -ENOSPC;
+
+ /* uid */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ creds->cr_uid = make_kuid(&init_user_ns, tmp);
+
+ /* gid */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ creds->cr_gid = make_kgid(&init_user_ns, tmp);
+
+ /* number of additional gid's */
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ return err;
+ N = tmp;
+ if ((3 + N) * sizeof(u32) != length)
+ return -EINVAL;
+ creds->cr_group_info = groups_alloc(N);
+ if (creds->cr_group_info == NULL)
+ return -ENOMEM;
+
+ /* gid's */
+ for (i = 0; i < N; i++) {
+ kgid_t kgid;
+ err = get_host_u32(xdr, &tmp);
+ if (err)
+ goto out_free_groups;
+ err = -EINVAL;
+ kgid = make_kgid(&init_user_ns, tmp);
+ if (!gid_valid(kgid))
+ goto out_free_groups;
+ GROUP_AT(creds->cr_group_info, i) = kgid;
+ }
+
+ return 0;
+out_free_groups:
+ groups_free(creds->cr_group_info);
+ return err;
+}
+
+static int gssx_dec_option_array(struct xdr_stream *xdr,
+ struct gssx_option_array *oa)
+{
+ struct svc_cred *creds;
+ u32 count, i;
+ __be32 *p;
+ int err;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ if (!count)
+ return 0;
+
+ /* we recognize only 1 currently: CREDS_VALUE */
+ oa->count = 1;
+
+ oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
+ if (!oa->data)
+ return -ENOMEM;
+
+ creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+ if (!creds) {
+ kfree(oa->data);
+ return -ENOMEM;
+ }
+
+ oa->data[0].option.data = CREDS_VALUE;
+ oa->data[0].option.len = sizeof(CREDS_VALUE);
+ oa->data[0].value.data = (void *)creds;
+ oa->data[0].value.len = 0;
+
+ for (i = 0; i < count; i++) {
+ gssx_buffer dummy = { 0, NULL };
+ u32 length;
+
+ /* option buffer */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ length = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, length);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+
+ if (length == sizeof(CREDS_VALUE) &&
+ memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) {
+ /* We have creds here. parse them */
+ err = gssx_dec_linux_creds(xdr, creds);
+ if (err)
+ return err;
+ oa->data[0].value.len = 1; /* presence */
+ } else {
+ /* consume uninteresting buffer */
+ err = gssx_dec_buffer(xdr, &dummy);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int gssx_dec_status(struct xdr_stream *xdr,
+ struct gssx_status *status)
+{
+ __be32 *p;
+ int err;
+
+ /* status->major_status */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &status->major_status);
+
+ /* status->mech */
+ err = gssx_dec_buffer(xdr, &status->mech);
+ if (err)
+ return err;
+
+ /* status->minor_status */
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &status->minor_status);
+
+ /* status->major_status_string */
+ err = gssx_dec_buffer(xdr, &status->major_status_string);
+ if (err)
+ return err;
+
+ /* status->minor_status_string */
+ err = gssx_dec_buffer(xdr, &status->minor_status_string);
+ if (err)
+ return err;
+
+ /* status->server_ctx */
+ err = gssx_dec_buffer(xdr, &status->server_ctx);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* status->options */
+ err = dummy_dec_opt_array(xdr, &status->options);
+
+ return err;
+}
+
+static int gssx_enc_call_ctx(struct xdr_stream *xdr,
+ struct gssx_call_ctx *ctx)
+{
+ struct gssx_option opt;
+ __be32 *p;
+ int err;
+
+ /* ctx->locale */
+ err = gssx_enc_buffer(xdr, &ctx->locale);
+ if (err)
+ return err;
+
+ /* ctx->server_ctx */
+ err = gssx_enc_buffer(xdr, &ctx->server_ctx);
+ if (err)
+ return err;
+
+ /* we always want to ask for lucid contexts */
+ /* ctx->options */
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(2);
+
+ /* we want a lucid_v1 context */
+ opt.option.data = LUCID_OPTION;
+ opt.option.len = sizeof(LUCID_OPTION);
+ opt.value.data = LUCID_VALUE;
+ opt.value.len = sizeof(LUCID_VALUE);
+ err = gssx_enc_option(xdr, &opt);
+
+ /* ..and user creds */
+ opt.option.data = CREDS_OPTION;
+ opt.option.len = sizeof(CREDS_OPTION);
+ opt.value.data = CREDS_VALUE;
+ opt.value.len = sizeof(CREDS_VALUE);
+ err = gssx_enc_option(xdr, &opt);
+
+ return err;
+}
+
+static int gssx_dec_name_attr(struct xdr_stream *xdr,
+ struct gssx_name_attr *attr)
+{
+ int err;
+
+ /* attr->attr */
+ err = gssx_dec_buffer(xdr, &attr->attr);
+ if (err)
+ return err;
+
+ /* attr->value */
+ err = gssx_dec_buffer(xdr, &attr->value);
+ if (err)
+ return err;
+
+ /* attr->extensions */
+ err = dummy_dec_opt_array(xdr, &attr->extensions);
+
+ return err;
+}
+
+static int dummy_enc_nameattr_array(struct xdr_stream *xdr,
+ struct gssx_name_attr_array *naa)
+{
+ __be32 *p;
+
+ if (naa->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int dummy_dec_nameattr_array(struct xdr_stream *xdr,
+ struct gssx_name_attr_array *naa)
+{
+ struct gssx_name_attr dummy = { .attr = {.len = 0} };
+ u32 count, i;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ count = be32_to_cpup(p++);
+ for (i = 0; i < count; i++) {
+ gssx_dec_name_attr(xdr, &dummy);
+ }
+
+ naa->count = 0;
+ naa->data = NULL;
+ return 0;
+}
+
+static struct xdr_netobj zero_netobj = {};
+
+static struct gssx_name_attr_array zero_name_attr_array = {};
+
+static struct gssx_option_array zero_option_array = {};
+
+static int gssx_enc_name(struct xdr_stream *xdr,
+ struct gssx_name *name)
+{
+ int err;
+
+ /* name->display_name */
+ err = gssx_enc_buffer(xdr, &name->display_name);
+ if (err)
+ return err;
+
+ /* name->name_type */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_name */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_composite_name */
+ err = gssx_enc_buffer(xdr, &zero_netobj);
+ if (err)
+ return err;
+
+ /* leave name_attributes empty for now, will add once we have any
+ * to pass up at all */
+ /* name->name_attributes */
+ err = dummy_enc_nameattr_array(xdr, &zero_name_attr_array);
+ if (err)
+ return err;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* name->extensions */
+ err = dummy_enc_opt_array(xdr, &zero_option_array);
+
+ return err;
+}
+
+
+static int gssx_dec_name(struct xdr_stream *xdr,
+ struct gssx_name *name)
+{
+ struct xdr_netobj dummy_netobj = { .len = 0 };
+ struct gssx_name_attr_array dummy_name_attr_array = { .count = 0 };
+ struct gssx_option_array dummy_option_array = { .count = 0 };
+ int err;
+
+ /* name->display_name */
+ err = gssx_dec_buffer(xdr, &name->display_name);
+ if (err)
+ return err;
+
+ /* name->name_type */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_name */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* name->exported_composite_name */
+ err = gssx_dec_buffer(xdr, &dummy_netobj);
+ if (err)
+ return err;
+
+ /* we assume we have no attributes for now, so simply consume them */
+ /* name->name_attributes */
+ err = dummy_dec_nameattr_array(xdr, &dummy_name_attr_array);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* name->extensions */
+ err = dummy_dec_opt_array(xdr, &dummy_option_array);
+
+ return err;
+}
+
+static int dummy_enc_credel_array(struct xdr_stream *xdr,
+ struct gssx_cred_element_array *cea)
+{
+ __be32 *p;
+
+ if (cea->count != 0)
+ return -EINVAL;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return -ENOSPC;
+ *p = 0;
+
+ return 0;
+}
+
+static int gssx_enc_cred(struct xdr_stream *xdr,
+ struct gssx_cred *cred)
+{
+ int err;
+
+ /* cred->desired_name */
+ err = gssx_enc_name(xdr, &cred->desired_name);
+ if (err)
+ return err;
+
+ /* cred->elements */
+ err = dummy_enc_credel_array(xdr, &cred->elements);
+ if (err)
+ return err;
+
+ /* cred->cred_handle_reference */
+ err = gssx_enc_buffer(xdr, &cred->cred_handle_reference);
+ if (err)
+ return err;
+
+ /* cred->needs_release */
+ err = gssx_enc_bool(xdr, cred->needs_release);
+
+ return err;
+}
+
+static int gssx_enc_ctx(struct xdr_stream *xdr,
+ struct gssx_ctx *ctx)
+{
+ __be32 *p;
+ int err;
+
+ /* ctx->exported_context_token */
+ err = gssx_enc_buffer(xdr, &ctx->exported_context_token);
+ if (err)
+ return err;
+
+ /* ctx->state */
+ err = gssx_enc_buffer(xdr, &ctx->state);
+ if (err)
+ return err;
+
+ /* ctx->need_release */
+ err = gssx_enc_bool(xdr, ctx->need_release);
+ if (err)
+ return err;
+
+ /* ctx->mech */
+ err = gssx_enc_buffer(xdr, &ctx->mech);
+ if (err)
+ return err;
+
+ /* ctx->src_name */
+ err = gssx_enc_name(xdr, &ctx->src_name);
+ if (err)
+ return err;
+
+ /* ctx->targ_name */
+ err = gssx_enc_name(xdr, &ctx->targ_name);
+ if (err)
+ return err;
+
+ /* ctx->lifetime */
+ p = xdr_reserve_space(xdr, 8+8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, ctx->lifetime);
+
+ /* ctx->ctx_flags */
+ p = xdr_encode_hyper(p, ctx->ctx_flags);
+
+ /* ctx->locally_initiated */
+ err = gssx_enc_bool(xdr, ctx->locally_initiated);
+ if (err)
+ return err;
+
+ /* ctx->open */
+ err = gssx_enc_bool(xdr, ctx->open);
+ if (err)
+ return err;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* ctx->options */
+ err = dummy_enc_opt_array(xdr, &ctx->options);
+
+ return err;
+}
+
+static int gssx_dec_ctx(struct xdr_stream *xdr,
+ struct gssx_ctx *ctx)
+{
+ __be32 *p;
+ int err;
+
+ /* ctx->exported_context_token */
+ err = gssx_dec_buffer(xdr, &ctx->exported_context_token);
+ if (err)
+ return err;
+
+ /* ctx->state */
+ err = gssx_dec_buffer(xdr, &ctx->state);
+ if (err)
+ return err;
+
+ /* ctx->need_release */
+ err = gssx_dec_bool(xdr, &ctx->need_release);
+ if (err)
+ return err;
+
+ /* ctx->mech */
+ err = gssx_dec_buffer(xdr, &ctx->mech);
+ if (err)
+ return err;
+
+ /* ctx->src_name */
+ err = gssx_dec_name(xdr, &ctx->src_name);
+ if (err)
+ return err;
+
+ /* ctx->targ_name */
+ err = gssx_dec_name(xdr, &ctx->targ_name);
+ if (err)
+ return err;
+
+ /* ctx->lifetime */
+ p = xdr_inline_decode(xdr, 8+8);
+ if (unlikely(p == NULL))
+ return -ENOSPC;
+ p = xdr_decode_hyper(p, &ctx->lifetime);
+
+ /* ctx->ctx_flags */
+ p = xdr_decode_hyper(p, &ctx->ctx_flags);
+
+ /* ctx->locally_initiated */
+ err = gssx_dec_bool(xdr, &ctx->locally_initiated);
+ if (err)
+ return err;
+
+ /* ctx->open */
+ err = gssx_dec_bool(xdr, &ctx->open);
+ if (err)
+ return err;
+
+ /* we assume we have no options for now, so simply consume them */
+ /* ctx->options */
+ err = dummy_dec_opt_array(xdr, &ctx->options);
+
+ return err;
+}
+
+static int gssx_enc_cb(struct xdr_stream *xdr, struct gssx_cb *cb)
+{
+ __be32 *p;
+ int err;
+
+ /* cb->initiator_addrtype */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, cb->initiator_addrtype);
+
+ /* cb->initiator_address */
+ err = gssx_enc_buffer(xdr, &cb->initiator_address);
+ if (err)
+ return err;
+
+ /* cb->acceptor_addrtype */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return -ENOSPC;
+ p = xdr_encode_hyper(p, cb->acceptor_addrtype);
+
+ /* cb->acceptor_address */
+ err = gssx_enc_buffer(xdr, &cb->acceptor_address);
+ if (err)
+ return err;
+
+ /* cb->application_data */
+ err = gssx_enc_buffer(xdr, &cb->application_data);
+
+ return err;
+}
+
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct gssx_arg_accept_sec_context *arg)
+{
+ int err;
+
+ err = gssx_enc_call_ctx(xdr, &arg->call_ctx);
+ if (err)
+ goto done;
+
+ /* arg->context_handle */
+ if (arg->context_handle)
+ err = gssx_enc_ctx(xdr, arg->context_handle);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ /* arg->cred_handle */
+ if (arg->cred_handle)
+ err = gssx_enc_cred(xdr, arg->cred_handle);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ /* arg->input_token */
+ err = gssx_enc_in_token(xdr, &arg->input_token);
+ if (err)
+ goto done;
+
+ /* arg->input_cb */
+ if (arg->input_cb)
+ err = gssx_enc_cb(xdr, arg->input_cb);
+ else
+ err = gssx_enc_bool(xdr, 0);
+ if (err)
+ goto done;
+
+ err = gssx_enc_bool(xdr, arg->ret_deleg_cred);
+ if (err)
+ goto done;
+
+ /* leave options empty for now, will add once we have any options
+ * to pass up at all */
+ /* arg->options */
+ err = dummy_enc_opt_array(xdr, &arg->options);
+
+ xdr_inline_pages(&req->rq_rcv_buf,
+ PAGE_SIZE/2 /* pretty arbitrary */,
+ arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
+done:
+ if (err)
+ dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err);
+}
+
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct gssx_res_accept_sec_context *res)
+{
+ u32 value_follows;
+ int err;
+ struct page *scratch;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (!scratch)
+ return -ENOMEM;
+ xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
+
+ /* res->status */
+ err = gssx_dec_status(xdr, &res->status);
+ if (err)
+ goto out_free;
+
+ /* res->context_handle */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ goto out_free;
+ if (value_follows) {
+ err = gssx_dec_ctx(xdr, res->context_handle);
+ if (err)
+ goto out_free;
+ } else {
+ res->context_handle = NULL;
+ }
+
+ /* res->output_token */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ goto out_free;
+ if (value_follows) {
+ err = gssx_dec_buffer(xdr, res->output_token);
+ if (err)
+ goto out_free;
+ } else {
+ res->output_token = NULL;
+ }
+
+ /* res->delegated_cred_handle */
+ err = gssx_dec_bool(xdr, &value_follows);
+ if (err)
+ goto out_free;
+ if (value_follows) {
+ /* we do not support upcall servers sending this data. */
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ /* res->options */
+ err = gssx_dec_option_array(xdr, &res->options);
+
+out_free:
+ __free_page(scratch);
+ return err;
+}
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h
new file mode 100644
index 000000000..9d88c6239
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h
@@ -0,0 +1,267 @@
+/*
+ * GSS Proxy upcall module
+ *
+ * Copyright (C) 2012 Simo Sorce <simo@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_GSS_RPC_XDR_H
+#define _LINUX_GSS_RPC_XDR_H
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+#define LUCID_OPTION "exported_context_type"
+#define LUCID_VALUE "linux_lucid_v1"
+#define CREDS_OPTION "exported_creds_type"
+#define CREDS_VALUE "linux_creds_v1"
+
+typedef struct xdr_netobj gssx_buffer;
+typedef struct xdr_netobj utf8string;
+typedef struct xdr_netobj gssx_OID;
+
+enum gssx_cred_usage {
+ GSSX_C_INITIATE = 1,
+ GSSX_C_ACCEPT = 2,
+ GSSX_C_BOTH = 3,
+};
+
+struct gssx_option {
+ gssx_buffer option;
+ gssx_buffer value;
+};
+
+struct gssx_option_array {
+ u32 count;
+ struct gssx_option *data;
+};
+
+struct gssx_status {
+ u64 major_status;
+ gssx_OID mech;
+ u64 minor_status;
+ utf8string major_status_string;
+ utf8string minor_status_string;
+ gssx_buffer server_ctx;
+ struct gssx_option_array options;
+};
+
+struct gssx_call_ctx {
+ utf8string locale;
+ gssx_buffer server_ctx;
+ struct gssx_option_array options;
+};
+
+struct gssx_name_attr {
+ gssx_buffer attr;
+ gssx_buffer value;
+ struct gssx_option_array extensions;
+};
+
+struct gssx_name_attr_array {
+ u32 count;
+ struct gssx_name_attr *data;
+};
+
+struct gssx_name {
+ gssx_buffer display_name;
+};
+typedef struct gssx_name gssx_name;
+
+struct gssx_cred_element {
+ gssx_name MN;
+ gssx_OID mech;
+ u32 cred_usage;
+ u64 initiator_time_rec;
+ u64 acceptor_time_rec;
+ struct gssx_option_array options;
+};
+
+struct gssx_cred_element_array {
+ u32 count;
+ struct gssx_cred_element *data;
+};
+
+struct gssx_cred {
+ gssx_name desired_name;
+ struct gssx_cred_element_array elements;
+ gssx_buffer cred_handle_reference;
+ u32 needs_release;
+};
+
+struct gssx_ctx {
+ gssx_buffer exported_context_token;
+ gssx_buffer state;
+ u32 need_release;
+ gssx_OID mech;
+ gssx_name src_name;
+ gssx_name targ_name;
+ u64 lifetime;
+ u64 ctx_flags;
+ u32 locally_initiated;
+ u32 open;
+ struct gssx_option_array options;
+};
+
+struct gssx_cb {
+ u64 initiator_addrtype;
+ gssx_buffer initiator_address;
+ u64 acceptor_addrtype;
+ gssx_buffer acceptor_address;
+ gssx_buffer application_data;
+};
+
+
+/* This structure is not defined in the protocol.
+ * It is used in the kernel to carry around a big buffer
+ * as a set of pages */
+struct gssp_in_token {
+ struct page **pages; /* Array of contiguous pages */
+ unsigned int page_base; /* Start of page data */
+ unsigned int page_len; /* Length of page data */
+};
+
+struct gssx_arg_accept_sec_context {
+ struct gssx_call_ctx call_ctx;
+ struct gssx_ctx *context_handle;
+ struct gssx_cred *cred_handle;
+ struct gssp_in_token input_token;
+ struct gssx_cb *input_cb;
+ u32 ret_deleg_cred;
+ struct gssx_option_array options;
+ struct page **pages;
+ unsigned int npages;
+};
+
+struct gssx_res_accept_sec_context {
+ struct gssx_status status;
+ struct gssx_ctx *context_handle;
+ gssx_buffer *output_token;
+ /* struct gssx_cred *delegated_cred_handle; not used in kernel */
+ struct gssx_option_array options;
+};
+
+
+
+#define gssx_enc_indicate_mechs NULL
+#define gssx_dec_indicate_mechs NULL
+#define gssx_enc_get_call_context NULL
+#define gssx_dec_get_call_context NULL
+#define gssx_enc_import_and_canon_name NULL
+#define gssx_dec_import_and_canon_name NULL
+#define gssx_enc_export_cred NULL
+#define gssx_dec_export_cred NULL
+#define gssx_enc_import_cred NULL
+#define gssx_dec_import_cred NULL
+#define gssx_enc_acquire_cred NULL
+#define gssx_dec_acquire_cred NULL
+#define gssx_enc_store_cred NULL
+#define gssx_dec_store_cred NULL
+#define gssx_enc_init_sec_context NULL
+#define gssx_dec_init_sec_context NULL
+void gssx_enc_accept_sec_context(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct gssx_arg_accept_sec_context *args);
+int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct gssx_res_accept_sec_context *res);
+#define gssx_enc_release_handle NULL
+#define gssx_dec_release_handle NULL
+#define gssx_enc_get_mic NULL
+#define gssx_dec_get_mic NULL
+#define gssx_enc_verify NULL
+#define gssx_dec_verify NULL
+#define gssx_enc_wrap NULL
+#define gssx_dec_wrap NULL
+#define gssx_enc_unwrap NULL
+#define gssx_dec_unwrap NULL
+#define gssx_enc_wrap_size_limit NULL
+#define gssx_dec_wrap_size_limit NULL
+
+/* non implemented calls are set to 0 size */
+#define GSSX_ARG_indicate_mechs_sz 0
+#define GSSX_RES_indicate_mechs_sz 0
+#define GSSX_ARG_get_call_context_sz 0
+#define GSSX_RES_get_call_context_sz 0
+#define GSSX_ARG_import_and_canon_name_sz 0
+#define GSSX_RES_import_and_canon_name_sz 0
+#define GSSX_ARG_export_cred_sz 0
+#define GSSX_RES_export_cred_sz 0
+#define GSSX_ARG_import_cred_sz 0
+#define GSSX_RES_import_cred_sz 0
+#define GSSX_ARG_acquire_cred_sz 0
+#define GSSX_RES_acquire_cred_sz 0
+#define GSSX_ARG_store_cred_sz 0
+#define GSSX_RES_store_cred_sz 0
+#define GSSX_ARG_init_sec_context_sz 0
+#define GSSX_RES_init_sec_context_sz 0
+
+#define GSSX_default_in_call_ctx_sz (4 + 4 + 4 + \
+ 8 + sizeof(LUCID_OPTION) + sizeof(LUCID_VALUE) + \
+ 8 + sizeof(CREDS_OPTION) + sizeof(CREDS_VALUE))
+#define GSSX_default_in_ctx_hndl_sz (4 + 4+8 + 4 + 4 + 6*4 + 6*4 + 8 + 8 + \
+ 4 + 4 + 4)
+#define GSSX_default_in_cred_sz 4 /* we send in no cred_handle */
+#define GSSX_default_in_token_sz 4 /* does *not* include token data */
+#define GSSX_default_in_cb_sz 4 /* we do not use channel bindings */
+#define GSSX_ARG_accept_sec_context_sz (GSSX_default_in_call_ctx_sz + \
+ GSSX_default_in_ctx_hndl_sz + \
+ GSSX_default_in_cred_sz + \
+ GSSX_default_in_token_sz + \
+ GSSX_default_in_cb_sz + \
+ 4 /* no deleg creds boolean */ + \
+ 4) /* empty options */
+
+/* somewhat arbitrary numbers but large enough (we ignore some of the data
+ * sent down, but it is part of the protocol so we need enough space to take
+ * it in) */
+#define GSSX_default_status_sz 8 + 24 + 8 + 256 + 256 + 16 + 4
+#define GSSX_max_output_handle_sz 128
+#define GSSX_max_oid_sz 16
+#define GSSX_max_princ_sz 256
+#define GSSX_default_ctx_sz (GSSX_max_output_handle_sz + \
+ 16 + 4 + GSSX_max_oid_sz + \
+ 2 * GSSX_max_princ_sz + \
+ 8 + 8 + 4 + 4 + 4)
+#define GSSX_max_output_token_sz 1024
+/* grouplist not included; we allocate separate pages for that: */
+#define GSSX_max_creds_sz (4 + 4 + 4 /* + NGROUPS_MAX*4 */)
+#define GSSX_RES_accept_sec_context_sz (GSSX_default_status_sz + \
+ GSSX_default_ctx_sz + \
+ GSSX_max_output_token_sz + \
+ 4 + GSSX_max_creds_sz)
+
+#define GSSX_ARG_release_handle_sz 0
+#define GSSX_RES_release_handle_sz 0
+#define GSSX_ARG_get_mic_sz 0
+#define GSSX_RES_get_mic_sz 0
+#define GSSX_ARG_verify_sz 0
+#define GSSX_RES_verify_sz 0
+#define GSSX_ARG_wrap_sz 0
+#define GSSX_RES_wrap_sz 0
+#define GSSX_ARG_unwrap_sz 0
+#define GSSX_RES_unwrap_sz 0
+#define GSSX_ARG_wrap_size_limit_sz 0
+#define GSSX_RES_wrap_size_limit_sz 0
+
+
+
+#endif /* _LINUX_GSS_RPC_XDR_H */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
new file mode 100644
index 000000000..1095be9c8
--- /dev/null
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -0,0 +1,1862 @@
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ * 1/ context creation
+ * 2/ data exchange
+ * 3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ * In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ * GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel. The window size if currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ * uid/gidlist - for determining access rights
+ * mechanism type
+ * mechanism specific information, such as a key
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/user_namespace.h>
+
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/cache.h>
+#include "gss_rpc_upcall.h"
+
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
+ * into replies.
+ *
+ * Key is context handle (\x if empty) and gss_token.
+ * Content is major_status minor_status (integers) context_handle, reply_token.
+ *
+ */
+
+static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b)
+{
+ return a->len == b->len && 0 == memcmp(a->data, b->data, a->len);
+}
+
+#define RSI_HASHBITS 6
+#define RSI_HASHMAX (1<<RSI_HASHBITS)
+
+struct rsi {
+ struct cache_head h;
+ struct xdr_netobj in_handle, in_token;
+ struct xdr_netobj out_handle, out_token;
+ int major_status, minor_status;
+};
+
+static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item);
+
+static void rsi_free(struct rsi *rsii)
+{
+ kfree(rsii->in_handle.data);
+ kfree(rsii->in_token.data);
+ kfree(rsii->out_handle.data);
+ kfree(rsii->out_token.data);
+}
+
+static void rsi_put(struct kref *ref)
+{
+ struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+ rsi_free(rsii);
+ kfree(rsii);
+}
+
+static inline int rsi_hash(struct rsi *item)
+{
+ return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS)
+ ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS);
+}
+
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+ struct rsi *item = container_of(a, struct rsi, h);
+ struct rsi *tmp = container_of(b, struct rsi, h);
+ return netobj_equal(&item->in_handle, &tmp->in_handle) &&
+ netobj_equal(&item->in_token, &tmp->in_token);
+}
+
+static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len)
+{
+ dst->len = len;
+ dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL);
+ if (len && !dst->data)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src)
+{
+ return dup_to_netobj(dst, src->data, src->len);
+}
+
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct rsi *new = container_of(cnew, struct rsi, h);
+ struct rsi *item = container_of(citem, struct rsi, h);
+
+ new->out_handle.data = NULL;
+ new->out_handle.len = 0;
+ new->out_token.data = NULL;
+ new->out_token.len = 0;
+ new->in_handle.len = item->in_handle.len;
+ item->in_handle.len = 0;
+ new->in_token.len = item->in_token.len;
+ item->in_token.len = 0;
+ new->in_handle.data = item->in_handle.data;
+ item->in_handle.data = NULL;
+ new->in_token.data = item->in_token.data;
+ item->in_token.data = NULL;
+}
+
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct rsi *new = container_of(cnew, struct rsi, h);
+ struct rsi *item = container_of(citem, struct rsi, h);
+
+ BUG_ON(new->out_handle.data || new->out_token.data);
+ new->out_handle.len = item->out_handle.len;
+ item->out_handle.len = 0;
+ new->out_token.len = item->out_token.len;
+ item->out_token.len = 0;
+ new->out_handle.data = item->out_handle.data;
+ item->out_handle.data = NULL;
+ new->out_token.data = item->out_token.data;
+ item->out_token.data = NULL;
+
+ new->major_status = item->major_status;
+ new->minor_status = item->minor_status;
+}
+
+static struct cache_head *rsi_alloc(void)
+{
+ struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
+ if (rsii)
+ return &rsii->h;
+ else
+ return NULL;
+}
+
+static void rsi_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ struct rsi *rsii = container_of(h, struct rsi, h);
+
+ qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len);
+ qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len);
+ (*bpp)[-1] = '\n';
+}
+
+static int rsi_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* context token expiry major minor context token */
+ char *buf = mesg;
+ char *ep;
+ int len;
+ struct rsi rsii, *rsip = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+ memset(&rsii, 0, sizeof(rsii));
+ /* handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.in_handle, buf, len))
+ goto out;
+
+ /* token */
+ len = qword_get(&mesg, buf, mlen);
+ status = -EINVAL;
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.in_token, buf, len))
+ goto out;
+
+ rsip = rsi_lookup(cd, &rsii);
+ if (!rsip)
+ goto out;
+
+ rsii.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+ if (expiry == 0)
+ goto out;
+
+ /* major/minor */
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+ rsii.major_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+ rsii.minor_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+
+ /* out_handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.out_handle, buf, len))
+ goto out;
+
+ /* out_token */
+ len = qword_get(&mesg, buf, mlen);
+ status = -EINVAL;
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.out_token, buf, len))
+ goto out;
+ rsii.h.expiry_time = expiry;
+ rsip = rsi_update(cd, &rsii, rsip);
+ status = 0;
+out:
+ rsi_free(&rsii);
+ if (rsip)
+ cache_put(&rsip->h, cd);
+ else
+ status = -ENOMEM;
+ return status;
+}
+
+static struct cache_detail rsi_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = RSI_HASHMAX,
+ .name = "auth.rpcsec.init",
+ .cache_put = rsi_put,
+ .cache_request = rsi_request,
+ .cache_parse = rsi_parse,
+ .match = rsi_match,
+ .init = rsi_init,
+ .update = update_rsi,
+ .alloc = rsi_alloc,
+};
+
+static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item)
+{
+ struct cache_head *ch;
+ int hash = rsi_hash(item);
+
+ ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ if (ch)
+ return container_of(ch, struct rsi, h);
+ else
+ return NULL;
+}
+
+static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old)
+{
+ struct cache_head *ch;
+ int hash = rsi_hash(new);
+
+ ch = sunrpc_cache_update(cd, &new->h,
+ &old->h, hash);
+ if (ch)
+ return container_of(ch, struct rsi, h);
+ else
+ return NULL;
+}
+
+
+/*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+ * The key is a context handle. The content is:
+ * uid, gidlist, mechanism, service-set, mech-specific-data
+ */
+
+#define RSC_HASHBITS 10
+#define RSC_HASHMAX (1<<RSC_HASHBITS)
+
+#define GSS_SEQ_WIN 128
+
+struct gss_svc_seq_data {
+ /* highest seq number seen so far: */
+ int sd_max;
+ /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
+ * sd_win is nonzero iff sequence number i has been seen already: */
+ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
+ spinlock_t sd_lock;
+};
+
+struct rsc {
+ struct cache_head h;
+ struct xdr_netobj handle;
+ struct svc_cred cred;
+ struct gss_svc_seq_data seqdata;
+ struct gss_ctx *mechctx;
+};
+
+static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item);
+
+static void rsc_free(struct rsc *rsci)
+{
+ kfree(rsci->handle.data);
+ if (rsci->mechctx)
+ gss_delete_sec_context(&rsci->mechctx);
+ free_svc_cred(&rsci->cred);
+}
+
+static void rsc_put(struct kref *ref)
+{
+ struct rsc *rsci = container_of(ref, struct rsc, h.ref);
+
+ rsc_free(rsci);
+ kfree(rsci);
+}
+
+static inline int
+rsc_hash(struct rsc *rsci)
+{
+ return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS);
+}
+
+static int
+rsc_match(struct cache_head *a, struct cache_head *b)
+{
+ struct rsc *new = container_of(a, struct rsc, h);
+ struct rsc *tmp = container_of(b, struct rsc, h);
+
+ return netobj_equal(&new->handle, &tmp->handle);
+}
+
+static void
+rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+ struct rsc *new = container_of(cnew, struct rsc, h);
+ struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+ new->handle.len = tmp->handle.len;
+ tmp->handle.len = 0;
+ new->handle.data = tmp->handle.data;
+ tmp->handle.data = NULL;
+ new->mechctx = NULL;
+ init_svc_cred(&new->cred);
+}
+
+static void
+update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+ struct rsc *new = container_of(cnew, struct rsc, h);
+ struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+ new->mechctx = tmp->mechctx;
+ tmp->mechctx = NULL;
+ memset(&new->seqdata, 0, sizeof(new->seqdata));
+ spin_lock_init(&new->seqdata.sd_lock);
+ new->cred = tmp->cred;
+ init_svc_cred(&tmp->cred);
+}
+
+static struct cache_head *
+rsc_alloc(void)
+{
+ struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
+ if (rsci)
+ return &rsci->h;
+ else
+ return NULL;
+}
+
+static int rsc_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+ char *buf = mesg;
+ int id;
+ int len, rv;
+ struct rsc rsci, *rscp = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+ struct gss_api_mech *gm = NULL;
+
+ memset(&rsci, 0, sizeof(rsci));
+ /* context handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsci.handle, buf, len))
+ goto out;
+
+ rsci.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+ if (expiry == 0)
+ goto out;
+
+ rscp = rsc_lookup(cd, &rsci);
+ if (!rscp)
+ goto out;
+
+ /* uid, or NEGATIVE */
+ rv = get_int(&mesg, &id);
+ if (rv == -EINVAL)
+ goto out;
+ if (rv == -ENOENT)
+ set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+ else {
+ int N, i;
+
+ /*
+ * NOTE: we skip uid_valid()/gid_valid() checks here:
+ * instead, * -1 id's are later mapped to the
+ * (export-specific) anonymous id by nfsd_setuser.
+ *
+ * (But supplementary gid's get no such special
+ * treatment so are checked for validity here.)
+ */
+ /* uid */
+ rsci.cred.cr_uid = make_kuid(&init_user_ns, id);
+
+ /* gid */
+ if (get_int(&mesg, &id))
+ goto out;
+ rsci.cred.cr_gid = make_kgid(&init_user_ns, id);
+
+ /* number of additional gid's */
+ if (get_int(&mesg, &N))
+ goto out;
+ if (N < 0 || N > NGROUPS_MAX)
+ goto out;
+ status = -ENOMEM;
+ rsci.cred.cr_group_info = groups_alloc(N);
+ if (rsci.cred.cr_group_info == NULL)
+ goto out;
+
+ /* gid's */
+ status = -EINVAL;
+ for (i=0; i<N; i++) {
+ kgid_t kgid;
+ if (get_int(&mesg, &id))
+ goto out;
+ kgid = make_kgid(&init_user_ns, id);
+ if (!gid_valid(kgid))
+ goto out;
+ GROUP_AT(rsci.cred.cr_group_info, i) = kgid;
+ }
+
+ /* mech name */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf);
+ status = -EOPNOTSUPP;
+ if (!gm)
+ goto out;
+
+ status = -EINVAL;
+ /* mech-specific data: */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = gss_import_sec_context(buf, len, gm, &rsci.mechctx,
+ NULL, GFP_KERNEL);
+ if (status)
+ goto out;
+
+ /* get client name */
+ len = qword_get(&mesg, buf, mlen);
+ if (len > 0) {
+ rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
+ if (!rsci.cred.cr_principal) {
+ status = -ENOMEM;
+ goto out;
+ }
+ }
+
+ }
+ rsci.h.expiry_time = expiry;
+ rscp = rsc_update(cd, &rsci, rscp);
+ status = 0;
+out:
+ rsc_free(&rsci);
+ if (rscp)
+ cache_put(&rscp->h, cd);
+ else
+ status = -ENOMEM;
+ return status;
+}
+
+static struct cache_detail rsc_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = RSC_HASHMAX,
+ .name = "auth.rpcsec.context",
+ .cache_put = rsc_put,
+ .cache_parse = rsc_parse,
+ .match = rsc_match,
+ .init = rsc_init,
+ .update = update_rsc,
+ .alloc = rsc_alloc,
+};
+
+static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item)
+{
+ struct cache_head *ch;
+ int hash = rsc_hash(item);
+
+ ch = sunrpc_cache_lookup(cd, &item->h, hash);
+ if (ch)
+ return container_of(ch, struct rsc, h);
+ else
+ return NULL;
+}
+
+static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old)
+{
+ struct cache_head *ch;
+ int hash = rsc_hash(new);
+
+ ch = sunrpc_cache_update(cd, &new->h,
+ &old->h, hash);
+ if (ch)
+ return container_of(ch, struct rsc, h);
+ else
+ return NULL;
+}
+
+
+static struct rsc *
+gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle)
+{
+ struct rsc rsci;
+ struct rsc *found;
+
+ memset(&rsci, 0, sizeof(rsci));
+ if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+ return NULL;
+ found = rsc_lookup(cd, &rsci);
+ rsc_free(&rsci);
+ if (!found)
+ return NULL;
+ if (cache_check(cd, &found->h, NULL))
+ return NULL;
+ return found;
+}
+
+/* Implements sequence number algorithm as specified in RFC 2203. */
+static int
+gss_check_seq_num(struct rsc *rsci, int seq_num)
+{
+ struct gss_svc_seq_data *sd = &rsci->seqdata;
+
+ spin_lock(&sd->sd_lock);
+ if (seq_num > sd->sd_max) {
+ if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
+ memset(sd->sd_win,0,sizeof(sd->sd_win));
+ sd->sd_max = seq_num;
+ } else while (sd->sd_max < seq_num) {
+ sd->sd_max++;
+ __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win);
+ }
+ __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
+ goto ok;
+ } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) {
+ goto drop;
+ }
+ /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */
+ if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
+ goto drop;
+ok:
+ spin_unlock(&sd->sd_lock);
+ return 1;
+drop:
+ spin_unlock(&sd->sd_lock);
+ return 0;
+}
+
+static inline u32 round_up_to_quad(u32 i)
+{
+ return (i + 3 ) & ~3;
+}
+
+static inline int
+svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o)
+{
+ int l;
+
+ if (argv->iov_len < 4)
+ return -1;
+ o->len = svc_getnl(argv);
+ l = round_up_to_quad(o->len);
+ if (argv->iov_len < l)
+ return -1;
+ o->data = argv->iov_base;
+ argv->iov_base += l;
+ argv->iov_len -= l;
+ return 0;
+}
+
+static inline int
+svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o)
+{
+ u8 *p;
+
+ if (resv->iov_len + 4 > PAGE_SIZE)
+ return -1;
+ svc_putnl(resv, o->len);
+ p = resv->iov_base + resv->iov_len;
+ resv->iov_len += round_up_to_quad(o->len);
+ if (resv->iov_len > PAGE_SIZE)
+ return -1;
+ memcpy(p, o->data, o->len);
+ memset(p + o->len, 0, round_up_to_quad(o->len) - o->len);
+ return 0;
+}
+
+/*
+ * Verify the checksum on the header and return SVC_OK on success.
+ * Otherwise, return SVC_DROP (in the case of a bad sequence number)
+ * or return SVC_DENIED and indicate error in authp.
+ */
+static int
+gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
+ __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+ struct gss_ctx *ctx_id = rsci->mechctx;
+ struct xdr_buf rpchdr;
+ struct xdr_netobj checksum;
+ u32 flavor = 0;
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec iov;
+
+ /* data to compute the checksum over: */
+ iov.iov_base = rpcstart;
+ iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart;
+ xdr_buf_from_iov(&iov, &rpchdr);
+
+ *authp = rpc_autherr_badverf;
+ if (argv->iov_len < 4)
+ return SVC_DENIED;
+ flavor = svc_getnl(argv);
+ if (flavor != RPC_AUTH_GSS)
+ return SVC_DENIED;
+ if (svc_safe_getnetobj(argv, &checksum))
+ return SVC_DENIED;
+
+ if (rqstp->rq_deferred) /* skip verification of revisited request */
+ return SVC_OK;
+ if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) {
+ *authp = rpcsec_gsserr_credproblem;
+ return SVC_DENIED;
+ }
+
+ if (gc->gc_seq > MAXSEQ) {
+ dprintk("RPC: svcauth_gss: discarding request with "
+ "large sequence number %d\n", gc->gc_seq);
+ *authp = rpcsec_gsserr_ctxproblem;
+ return SVC_DENIED;
+ }
+ if (!gss_check_seq_num(rsci, gc->gc_seq)) {
+ dprintk("RPC: svcauth_gss: discarding request with "
+ "old sequence number %d\n", gc->gc_seq);
+ return SVC_DROP;
+ }
+ return SVC_OK;
+}
+
+static int
+gss_write_null_verf(struct svc_rqst *rqstp)
+{
+ __be32 *p;
+
+ svc_putnl(rqstp->rq_res.head, RPC_AUTH_NULL);
+ p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
+ /* don't really need to check if head->iov_len > PAGE_SIZE ... */
+ *p++ = 0;
+ if (!xdr_ressize_check(rqstp, p))
+ return -1;
+ return 0;
+}
+
+static int
+gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
+{
+ __be32 xdr_seq;
+ u32 maj_stat;
+ struct xdr_buf verf_data;
+ struct xdr_netobj mic;
+ __be32 *p;
+ struct kvec iov;
+
+ svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS);
+ xdr_seq = htonl(seq);
+
+ iov.iov_base = &xdr_seq;
+ iov.iov_len = sizeof(xdr_seq);
+ xdr_buf_from_iov(&iov, &verf_data);
+ p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
+ mic.data = (u8 *)(p + 1);
+ maj_stat = gss_get_mic(ctx_id, &verf_data, &mic);
+ if (maj_stat != GSS_S_COMPLETE)
+ return -1;
+ *p++ = htonl(mic.len);
+ memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len);
+ p += XDR_QUADLEN(mic.len);
+ if (!xdr_ressize_check(rqstp, p))
+ return -1;
+ return 0;
+}
+
+struct gss_domain {
+ struct auth_domain h;
+ u32 pseudoflavor;
+};
+
+static struct auth_domain *
+find_gss_auth_domain(struct gss_ctx *ctx, u32 svc)
+{
+ char *name;
+
+ name = gss_service_to_auth_domain_name(ctx->mech_type, svc);
+ if (!name)
+ return NULL;
+ return auth_domain_find(name);
+}
+
+static struct auth_ops svcauthops_gss;
+
+u32 svcauth_gss_flavor(struct auth_domain *dom)
+{
+ struct gss_domain *gd = container_of(dom, struct gss_domain, h);
+
+ return gd->pseudoflavor;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_gss_flavor);
+
+int
+svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
+{
+ struct gss_domain *new;
+ struct auth_domain *test;
+ int stat = -ENOMEM;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ goto out;
+ kref_init(&new->h.ref);
+ new->h.name = kstrdup(name, GFP_KERNEL);
+ if (!new->h.name)
+ goto out_free_dom;
+ new->h.flavour = &svcauthops_gss;
+ new->pseudoflavor = pseudoflavor;
+
+ stat = 0;
+ test = auth_domain_lookup(name, &new->h);
+ if (test != &new->h) { /* Duplicate registration */
+ auth_domain_put(test);
+ kfree(new->h.name);
+ goto out_free_dom;
+ }
+ return 0;
+
+out_free_dom:
+ kfree(new);
+out:
+ return stat;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor);
+
+static inline int
+read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+{
+ __be32 raw;
+ int status;
+
+ status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj));
+ if (status)
+ return status;
+ *obj = ntohl(raw);
+ return 0;
+}
+
+/* It would be nice if this bit of code could be shared with the client.
+ * Obstacles:
+ * The client shouldn't malloc(), would have to pass in own memory.
+ * The server uses base of head iovec as read pointer, while the
+ * client uses separate pointer. */
+static int
+unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+ int stat = -EINVAL;
+ u32 integ_len, maj_stat;
+ struct xdr_netobj mic;
+ struct xdr_buf integ_buf;
+
+ /* Did we already verify the signature on the original pass through? */
+ if (rqstp->rq_deferred)
+ return 0;
+
+ integ_len = svc_getnl(&buf->head[0]);
+ if (integ_len & 3)
+ return stat;
+ if (integ_len > buf->len)
+ return stat;
+ if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+ BUG();
+ /* copy out mic... */
+ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
+ BUG();
+ if (mic.len > RPC_MAX_AUTH_SIZE)
+ return stat;
+ mic.data = kmalloc(mic.len, GFP_KERNEL);
+ if (!mic.data)
+ return stat;
+ if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
+ goto out;
+ maj_stat = gss_verify_mic(ctx, &integ_buf, &mic);
+ if (maj_stat != GSS_S_COMPLETE)
+ goto out;
+ if (svc_getnl(&buf->head[0]) != seq)
+ goto out;
+ /* trim off the mic at the end before returning */
+ xdr_buf_trim(buf, mic.len + 4);
+ stat = 0;
+out:
+ kfree(mic.data);
+ return stat;
+}
+
+static inline int
+total_buf_len(struct xdr_buf *buf)
+{
+ return buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len;
+}
+
+static void
+fix_priv_head(struct xdr_buf *buf, int pad)
+{
+ if (buf->page_len == 0) {
+ /* We need to adjust head and buf->len in tandem in this
+ * case to make svc_defer() work--it finds the original
+ * buffer start using buf->len - buf->head[0].iov_len. */
+ buf->head[0].iov_len -= pad;
+ }
+}
+
+static int
+unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+ u32 priv_len, maj_stat;
+ int pad, saved_len, remaining_len, offset;
+
+ clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+
+ priv_len = svc_getnl(&buf->head[0]);
+ if (rqstp->rq_deferred) {
+ /* Already decrypted last time through! The sequence number
+ * check at out_seq is unnecessary but harmless: */
+ goto out_seq;
+ }
+ /* buf->len is the number of bytes from the original start of the
+ * request to the end, where head[0].iov_len is just the bytes
+ * not yet read from the head, so these two values are different: */
+ remaining_len = total_buf_len(buf);
+ if (priv_len > remaining_len)
+ return -EINVAL;
+ pad = remaining_len - priv_len;
+ buf->len -= pad;
+ fix_priv_head(buf, pad);
+
+ /* Maybe it would be better to give gss_unwrap a length parameter: */
+ saved_len = buf->len;
+ buf->len = priv_len;
+ maj_stat = gss_unwrap(ctx, 0, buf);
+ pad = priv_len - buf->len;
+ buf->len = saved_len;
+ buf->len -= pad;
+ /* The upper layers assume the buffer is aligned on 4-byte boundaries.
+ * In the krb5p case, at least, the data ends up offset, so we need to
+ * move it around. */
+ /* XXX: This is very inefficient. It would be better to either do
+ * this while we encrypt, or maybe in the receive code, if we can peak
+ * ahead and work out the service and mechanism there. */
+ offset = buf->head[0].iov_len % 4;
+ if (offset) {
+ buf->buflen = RPCSVC_MAXPAYLOAD;
+ xdr_shift_buf(buf, offset);
+ fix_priv_head(buf, pad);
+ }
+ if (maj_stat != GSS_S_COMPLETE)
+ return -EINVAL;
+out_seq:
+ if (svc_getnl(&buf->head[0]) != seq)
+ return -EINVAL;
+ return 0;
+}
+
+struct gss_svc_data {
+ /* decoded gss client cred: */
+ struct rpc_gss_wire_cred clcred;
+ /* save a pointer to the beginning of the encoded verifier,
+ * for use in encryption/checksumming in svcauth_gss_release: */
+ __be32 *verf_start;
+ struct rsc *rsci;
+};
+
+static int
+svcauth_gss_set_client(struct svc_rqst *rqstp)
+{
+ struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+ struct rsc *rsci = svcdata->rsci;
+ struct rpc_gss_wire_cred *gc = &svcdata->clcred;
+ int stat;
+
+ /*
+ * A gss export can be specified either by:
+ * export *(sec=krb5,rw)
+ * or by
+ * export gss/krb5(rw)
+ * The latter is deprecated; but for backwards compatibility reasons
+ * the nfsd code will still fall back on trying it if the former
+ * doesn't work; so we try to make both available to nfsd, below.
+ */
+ rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc);
+ if (rqstp->rq_gssclient == NULL)
+ return SVC_DENIED;
+ stat = svcauth_unix_set_client(rqstp);
+ if (stat == SVC_DROP || stat == SVC_CLOSE)
+ return stat;
+ return SVC_OK;
+}
+
+static inline int
+gss_write_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp,
+ struct xdr_netobj *out_handle, int *major_status)
+{
+ struct rsc *rsci;
+ int rc;
+
+ if (*major_status != GSS_S_COMPLETE)
+ return gss_write_null_verf(rqstp);
+ rsci = gss_svc_searchbyctx(cd, out_handle);
+ if (rsci == NULL) {
+ *major_status = GSS_S_NO_CONTEXT;
+ return gss_write_null_verf(rqstp);
+ }
+ rc = gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN);
+ cache_put(&rsci->h, cd);
+ return rc;
+}
+
+static inline int
+gss_read_common_verf(struct rpc_gss_wire_cred *gc,
+ struct kvec *argv, __be32 *authp,
+ struct xdr_netobj *in_handle)
+{
+ /* Read the verifier; should be NULL: */
+ *authp = rpc_autherr_badverf;
+ if (argv->iov_len < 2 * 4)
+ return SVC_DENIED;
+ if (svc_getnl(argv) != RPC_AUTH_NULL)
+ return SVC_DENIED;
+ if (svc_getnl(argv) != 0)
+ return SVC_DENIED;
+ /* Martial context handle and token for upcall: */
+ *authp = rpc_autherr_badcred;
+ if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0)
+ return SVC_DENIED;
+ if (dup_netobj(in_handle, &gc->gc_ctx))
+ return SVC_CLOSE;
+ *authp = rpc_autherr_badverf;
+
+ return 0;
+}
+
+static inline int
+gss_read_verf(struct rpc_gss_wire_cred *gc,
+ struct kvec *argv, __be32 *authp,
+ struct xdr_netobj *in_handle,
+ struct xdr_netobj *in_token)
+{
+ struct xdr_netobj tmpobj;
+ int res;
+
+ res = gss_read_common_verf(gc, argv, authp, in_handle);
+ if (res)
+ return res;
+
+ if (svc_safe_getnetobj(argv, &tmpobj)) {
+ kfree(in_handle->data);
+ return SVC_DENIED;
+ }
+ if (dup_netobj(in_token, &tmpobj)) {
+ kfree(in_handle->data);
+ return SVC_CLOSE;
+ }
+
+ return 0;
+}
+
+/* Ok this is really heavily depending on a set of semantics in
+ * how rqstp is set up by svc_recv and pages laid down by the
+ * server when reading a request. We are basically guaranteed that
+ * the token lays all down linearly across a set of pages, starting
+ * at iov_base in rq_arg.head[0] which happens to be the first of a
+ * set of pages stored in rq_pages[].
+ * rq_arg.head[0].iov_base will provide us the page_base to pass
+ * to the upcall.
+ */
+static inline int
+gss_read_proxy_verf(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp,
+ struct xdr_netobj *in_handle,
+ struct gssp_in_token *in_token)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ u32 inlen;
+ int res;
+
+ res = gss_read_common_verf(gc, argv, authp, in_handle);
+ if (res)
+ return res;
+
+ inlen = svc_getnl(argv);
+ if (inlen > (argv->iov_len + rqstp->rq_arg.page_len))
+ return SVC_DENIED;
+
+ in_token->pages = rqstp->rq_pages;
+ in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK;
+ in_token->page_len = inlen;
+
+ return 0;
+}
+
+static inline int
+gss_write_resv(struct kvec *resv, size_t size_limit,
+ struct xdr_netobj *out_handle, struct xdr_netobj *out_token,
+ int major_status, int minor_status)
+{
+ if (resv->iov_len + 4 > size_limit)
+ return -1;
+ svc_putnl(resv, RPC_SUCCESS);
+ if (svc_safe_putnetobj(resv, out_handle))
+ return -1;
+ if (resv->iov_len + 3 * 4 > size_limit)
+ return -1;
+ svc_putnl(resv, major_status);
+ svc_putnl(resv, minor_status);
+ svc_putnl(resv, GSS_SEQ_WIN);
+ if (svc_safe_putnetobj(resv, out_token))
+ return -1;
+ return 0;
+}
+
+/*
+ * Having read the cred already and found we're in the context
+ * initiation case, read the verifier and initiate (or check the results
+ * of) upcalls to userspace for help with context initiation. If
+ * the upcall results are available, write the verifier and result.
+ * Otherwise, drop the request pending an answer to the upcall.
+ */
+static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct rsi *rsip, rsikey;
+ int ret;
+ struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+
+ memset(&rsikey, 0, sizeof(rsikey));
+ ret = gss_read_verf(gc, argv, authp,
+ &rsikey.in_handle, &rsikey.in_token);
+ if (ret)
+ return ret;
+
+ /* Perform upcall, or find upcall result: */
+ rsip = rsi_lookup(sn->rsi_cache, &rsikey);
+ rsi_free(&rsikey);
+ if (!rsip)
+ return SVC_CLOSE;
+ if (cache_check(sn->rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0)
+ /* No upcall result: */
+ return SVC_CLOSE;
+
+ ret = SVC_CLOSE;
+ /* Got an answer to the upcall; use it: */
+ if (gss_write_init_verf(sn->rsc_cache, rqstp,
+ &rsip->out_handle, &rsip->major_status))
+ goto out;
+ if (gss_write_resv(resv, PAGE_SIZE,
+ &rsip->out_handle, &rsip->out_token,
+ rsip->major_status, rsip->minor_status))
+ goto out;
+
+ ret = SVC_COMPLETE;
+out:
+ cache_put(&rsip->h, sn->rsi_cache);
+ return ret;
+}
+
+static int gss_proxy_save_rsc(struct cache_detail *cd,
+ struct gssp_upcall_data *ud,
+ uint64_t *handle)
+{
+ struct rsc rsci, *rscp = NULL;
+ static atomic64_t ctxhctr;
+ long long ctxh;
+ struct gss_api_mech *gm = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+ memset(&rsci, 0, sizeof(rsci));
+ /* context handle */
+ status = -ENOMEM;
+ /* the handle needs to be just a unique id,
+ * use a static counter */
+ ctxh = atomic64_inc_return(&ctxhctr);
+
+ /* make a copy for the caller */
+ *handle = ctxh;
+
+ /* make a copy for the rsc cache */
+ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t)))
+ goto out;
+ rscp = rsc_lookup(cd, &rsci);
+ if (!rscp)
+ goto out;
+
+ /* creds */
+ if (!ud->found_creds) {
+ /* userspace seem buggy, we should always get at least a
+ * mapping to nobody */
+ dprintk("RPC: No creds found!\n");
+ goto out;
+ } else {
+
+ /* steal creds */
+ rsci.cred = ud->creds;
+ memset(&ud->creds, 0, sizeof(struct svc_cred));
+
+ status = -EOPNOTSUPP;
+ /* get mech handle from OID */
+ gm = gss_mech_get_by_OID(&ud->mech_oid);
+ if (!gm)
+ goto out;
+ rsci.cred.cr_gss_mech = gm;
+
+ status = -EINVAL;
+ /* mech-specific data: */
+ status = gss_import_sec_context(ud->out_handle.data,
+ ud->out_handle.len,
+ gm, &rsci.mechctx,
+ &expiry, GFP_KERNEL);
+ if (status)
+ goto out;
+ }
+
+ rsci.h.expiry_time = expiry;
+ rscp = rsc_update(cd, &rsci, rscp);
+ status = 0;
+out:
+ rsc_free(&rsci);
+ if (rscp)
+ cache_put(&rscp->h, cd);
+ else
+ status = -ENOMEM;
+ return status;
+}
+
+static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct xdr_netobj cli_handle;
+ struct gssp_upcall_data ud;
+ uint64_t handle;
+ int status;
+ int ret;
+ struct net *net = rqstp->rq_xprt->xpt_net;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ memset(&ud, 0, sizeof(ud));
+ ret = gss_read_proxy_verf(rqstp, gc, authp,
+ &ud.in_handle, &ud.in_token);
+ if (ret)
+ return ret;
+
+ ret = SVC_CLOSE;
+
+ /* Perform synchronous upcall to gss-proxy */
+ status = gssp_accept_sec_context_upcall(net, &ud);
+ if (status)
+ goto out;
+
+ dprintk("RPC: svcauth_gss: gss major status = %d\n",
+ ud.major_status);
+
+ switch (ud.major_status) {
+ case GSS_S_CONTINUE_NEEDED:
+ cli_handle = ud.out_handle;
+ break;
+ case GSS_S_COMPLETE:
+ status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
+ if (status)
+ goto out;
+ cli_handle.data = (u8 *)&handle;
+ cli_handle.len = sizeof(handle);
+ break;
+ default:
+ ret = SVC_CLOSE;
+ goto out;
+ }
+
+ /* Got an answer to the upcall; use it: */
+ if (gss_write_init_verf(sn->rsc_cache, rqstp,
+ &cli_handle, &ud.major_status))
+ goto out;
+ if (gss_write_resv(resv, PAGE_SIZE,
+ &cli_handle, &ud.out_token,
+ ud.major_status, ud.minor_status))
+ goto out;
+
+ ret = SVC_COMPLETE;
+out:
+ gssp_free_upcall_data(&ud);
+ return ret;
+}
+
+/*
+ * Try to set the sn->use_gss_proxy variable to a new value. We only allow
+ * it to be changed if it's currently undefined (-1). If it's any other value
+ * then return -EBUSY unless the type wouldn't have changed anyway.
+ */
+static int set_gss_proxy(struct net *net, int type)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ int ret;
+
+ WARN_ON_ONCE(type != 0 && type != 1);
+ ret = cmpxchg(&sn->use_gss_proxy, -1, type);
+ if (ret != -1 && ret != type)
+ return -EBUSY;
+ return 0;
+}
+
+static bool use_gss_proxy(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ /* If use_gss_proxy is still undefined, then try to disable it */
+ if (sn->use_gss_proxy == -1)
+ set_gss_proxy(net, 0);
+ return sn->use_gss_proxy;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static ssize_t write_gssp(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct net *net = PDE_DATA(file_inode(file));
+ char tbuf[20];
+ unsigned long i;
+ int res;
+
+ if (*ppos || count > sizeof(tbuf)-1)
+ return -EINVAL;
+ if (copy_from_user(tbuf, buf, count))
+ return -EFAULT;
+
+ tbuf[count] = 0;
+ res = kstrtoul(tbuf, 0, &i);
+ if (res)
+ return res;
+ if (i != 1)
+ return -EINVAL;
+ res = set_gssp_clnt(net);
+ if (res)
+ return res;
+ res = set_gss_proxy(net, 1);
+ if (res)
+ return res;
+ return count;
+}
+
+static ssize_t read_gssp(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct net *net = PDE_DATA(file_inode(file));
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ unsigned long p = *ppos;
+ char tbuf[10];
+ size_t len;
+
+ snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy);
+ len = strlen(tbuf);
+ if (p >= len)
+ return 0;
+ len -= p;
+ if (len > count)
+ len = count;
+ if (copy_to_user(buf, (void *)(tbuf+p), len))
+ return -EFAULT;
+ *ppos += len;
+ return len;
+}
+
+static const struct file_operations use_gss_proxy_ops = {
+ .open = nonseekable_open,
+ .write = write_gssp,
+ .read = read_gssp,
+};
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct proc_dir_entry **p = &sn->use_gssp_proc;
+
+ sn->use_gss_proxy = -1;
+ *p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR,
+ sn->proc_net_rpc,
+ &use_gss_proxy_ops, net);
+ if (!*p)
+ return -ENOMEM;
+ init_gssp_clnt(sn);
+ return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ if (sn->use_gssp_proc) {
+ remove_proc_entry("use-gss-proxy", sn->proc_net_rpc);
+ clear_gssp_clnt(sn);
+ }
+}
+#else /* CONFIG_PROC_FS */
+
+static int create_use_gss_proxy_proc_entry(struct net *net)
+{
+ return 0;
+}
+
+static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Accept an rpcsec packet.
+ * If context establishment, punt to user space
+ * If data exchange, verify/decrypt
+ * If context destruction, handle here
+ * In the context establishment and destruction case we encode
+ * response here and return SVC_COMPLETE.
+ */
+static int
+svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ u32 crlen;
+ struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc;
+ struct rsc *rsci = NULL;
+ __be32 *rpcstart;
+ __be32 *reject_stat = resv->iov_base + resv->iov_len;
+ int ret;
+ struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+
+ dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",
+ argv->iov_len);
+
+ *authp = rpc_autherr_badcred;
+ if (!svcdata)
+ svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
+ if (!svcdata)
+ goto auth_err;
+ rqstp->rq_auth_data = svcdata;
+ svcdata->verf_start = NULL;
+ svcdata->rsci = NULL;
+ gc = &svcdata->clcred;
+
+ /* start of rpc packet is 7 u32's back from here:
+ * xid direction rpcversion prog vers proc flavour
+ */
+ rpcstart = argv->iov_base;
+ rpcstart -= 7;
+
+ /* credential is:
+ * version(==1), proc(0,1,2,3), seq, service (1,2,3), handle
+ * at least 5 u32s, and is preceded by length, so that makes 6.
+ */
+
+ if (argv->iov_len < 5 * 4)
+ goto auth_err;
+ crlen = svc_getnl(argv);
+ if (svc_getnl(argv) != RPC_GSS_VERSION)
+ goto auth_err;
+ gc->gc_proc = svc_getnl(argv);
+ gc->gc_seq = svc_getnl(argv);
+ gc->gc_svc = svc_getnl(argv);
+ if (svc_safe_getnetobj(argv, &gc->gc_ctx))
+ goto auth_err;
+ if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4)
+ goto auth_err;
+
+ if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0))
+ goto auth_err;
+
+ *authp = rpc_autherr_badverf;
+ switch (gc->gc_proc) {
+ case RPC_GSS_PROC_INIT:
+ case RPC_GSS_PROC_CONTINUE_INIT:
+ if (use_gss_proxy(SVC_NET(rqstp)))
+ return svcauth_gss_proxy_init(rqstp, gc, authp);
+ else
+ return svcauth_gss_legacy_init(rqstp, gc, authp);
+ case RPC_GSS_PROC_DATA:
+ case RPC_GSS_PROC_DESTROY:
+ /* Look up the context, and check the verifier: */
+ *authp = rpcsec_gsserr_credproblem;
+ rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx);
+ if (!rsci)
+ goto auth_err;
+ switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) {
+ case SVC_OK:
+ break;
+ case SVC_DENIED:
+ goto auth_err;
+ case SVC_DROP:
+ goto drop;
+ }
+ break;
+ default:
+ *authp = rpc_autherr_rejectedcred;
+ goto auth_err;
+ }
+
+ /* now act upon the command: */
+ switch (gc->gc_proc) {
+ case RPC_GSS_PROC_DESTROY:
+ if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+ goto auth_err;
+ rsci->h.expiry_time = get_seconds();
+ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ if (resv->iov_len + 4 > PAGE_SIZE)
+ goto drop;
+ svc_putnl(resv, RPC_SUCCESS);
+ goto complete;
+ case RPC_GSS_PROC_DATA:
+ *authp = rpcsec_gsserr_ctxproblem;
+ svcdata->verf_start = resv->iov_base + resv->iov_len;
+ if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+ goto auth_err;
+ rqstp->rq_cred = rsci->cred;
+ get_group_info(rsci->cred.cr_group_info);
+ *authp = rpc_autherr_badcred;
+ switch (gc->gc_svc) {
+ case RPC_GSS_SVC_NONE:
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+ /* placeholders for length and seq. number: */
+ svc_putnl(resv, 0);
+ svc_putnl(resv, 0);
+ if (unwrap_integ_data(rqstp, &rqstp->rq_arg,
+ gc->gc_seq, rsci->mechctx))
+ goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE;
+ break;
+ case RPC_GSS_SVC_PRIVACY:
+ /* placeholders for length and seq. number: */
+ svc_putnl(resv, 0);
+ svc_putnl(resv, 0);
+ if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
+ gc->gc_seq, rsci->mechctx))
+ goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2;
+ break;
+ default:
+ goto auth_err;
+ }
+ svcdata->rsci = rsci;
+ cache_get(&rsci->h);
+ rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor(
+ rsci->mechctx->mech_type,
+ GSS_C_QOP_DEFAULT,
+ gc->gc_svc);
+ ret = SVC_OK;
+ goto out;
+ }
+garbage_args:
+ ret = SVC_GARBAGE;
+ goto out;
+auth_err:
+ /* Restore write pointer to its original value: */
+ xdr_ressize_check(rqstp, reject_stat);
+ ret = SVC_DENIED;
+ goto out;
+complete:
+ ret = SVC_COMPLETE;
+ goto out;
+drop:
+ ret = SVC_DROP;
+out:
+ if (rsci)
+ cache_put(&rsci->h, sn->rsc_cache);
+ return ret;
+}
+
+static __be32 *
+svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd)
+{
+ __be32 *p;
+ u32 verf_len;
+
+ p = gsd->verf_start;
+ gsd->verf_start = NULL;
+
+ /* If the reply stat is nonzero, don't wrap: */
+ if (*(p-1) != rpc_success)
+ return NULL;
+ /* Skip the verifier: */
+ p += 1;
+ verf_len = ntohl(*p++);
+ p += XDR_QUADLEN(verf_len);
+ /* move accept_stat to right place: */
+ memcpy(p, p + 2, 4);
+ /* Also don't wrap if the accept stat is nonzero: */
+ if (*p != rpc_success) {
+ resbuf->head[0].iov_len -= 2 * 4;
+ return NULL;
+ }
+ p++;
+ return p;
+}
+
+static inline int
+svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
+{
+ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+ struct xdr_buf *resbuf = &rqstp->rq_res;
+ struct xdr_buf integ_buf;
+ struct xdr_netobj mic;
+ struct kvec *resv;
+ __be32 *p;
+ int integ_offset, integ_len;
+ int stat = -EINVAL;
+
+ p = svcauth_gss_prepare_to_wrap(resbuf, gsd);
+ if (p == NULL)
+ goto out;
+ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+ integ_len = resbuf->len - integ_offset;
+ BUG_ON(integ_len % 4);
+ *p++ = htonl(integ_len);
+ *p++ = htonl(gc->gc_seq);
+ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len))
+ BUG();
+ if (resbuf->tail[0].iov_base == NULL) {
+ if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+ goto out_err;
+ resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+ + resbuf->head[0].iov_len;
+ resbuf->tail[0].iov_len = 0;
+ }
+ resv = &resbuf->tail[0];
+ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+ if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic))
+ goto out_err;
+ svc_putnl(resv, mic.len);
+ memset(mic.data + mic.len, 0,
+ round_up_to_quad(mic.len) - mic.len);
+ resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+ /* not strictly required: */
+ resbuf->len += XDR_QUADLEN(mic.len) << 2;
+ BUG_ON(resv->iov_len > PAGE_SIZE);
+out:
+ stat = 0;
+out_err:
+ return stat;
+}
+
+static inline int
+svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
+{
+ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+ struct xdr_buf *resbuf = &rqstp->rq_res;
+ struct page **inpages = NULL;
+ __be32 *p, *len;
+ int offset;
+ int pad;
+
+ p = svcauth_gss_prepare_to_wrap(resbuf, gsd);
+ if (p == NULL)
+ return 0;
+ len = p++;
+ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base;
+ *p++ = htonl(gc->gc_seq);
+ inpages = resbuf->pages;
+ /* XXX: Would be better to write some xdr helper functions for
+ * nfs{2,3,4}xdr.c that place the data right, instead of copying: */
+
+ /*
+ * If there is currently tail data, make sure there is
+ * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in
+ * the page, and move the current tail data such that
+ * there is RPC_MAX_AUTH_SIZE slack space available in
+ * both the head and tail.
+ */
+ if (resbuf->tail[0].iov_base) {
+ BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base
+ + PAGE_SIZE);
+ BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base);
+ if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len
+ + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+ return -ENOMEM;
+ memmove(resbuf->tail[0].iov_base + RPC_MAX_AUTH_SIZE,
+ resbuf->tail[0].iov_base,
+ resbuf->tail[0].iov_len);
+ resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE;
+ }
+ /*
+ * If there is no current tail data, make sure there is
+ * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the
+ * allotted page, and set up tail information such that there
+ * is RPC_MAX_AUTH_SIZE slack space available in both the
+ * head and tail.
+ */
+ if (resbuf->tail[0].iov_base == NULL) {
+ if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+ return -ENOMEM;
+ resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+ + resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE;
+ resbuf->tail[0].iov_len = 0;
+ }
+ if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages))
+ return -ENOMEM;
+ *len = htonl(resbuf->len - offset);
+ pad = 3 - ((resbuf->len - offset - 1)&3);
+ p = (__be32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len);
+ memset(p, 0, pad);
+ resbuf->tail[0].iov_len += pad;
+ resbuf->len += pad;
+ return 0;
+}
+
+static int
+svcauth_gss_release(struct svc_rqst *rqstp)
+{
+ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+ struct xdr_buf *resbuf = &rqstp->rq_res;
+ int stat = -EINVAL;
+ struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id);
+
+ if (gc->gc_proc != RPC_GSS_PROC_DATA)
+ goto out;
+ /* Release can be called twice, but we only wrap once. */
+ if (gsd->verf_start == NULL)
+ goto out;
+ /* normally not set till svc_send, but we need it here: */
+ /* XXX: what for? Do we mess it up the moment we call svc_putu32
+ * or whatever? */
+ resbuf->len = total_buf_len(resbuf);
+ switch (gc->gc_svc) {
+ case RPC_GSS_SVC_NONE:
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+ stat = svcauth_gss_wrap_resp_integ(rqstp);
+ if (stat)
+ goto out_err;
+ break;
+ case RPC_GSS_SVC_PRIVACY:
+ stat = svcauth_gss_wrap_resp_priv(rqstp);
+ if (stat)
+ goto out_err;
+ break;
+ /*
+ * For any other gc_svc value, svcauth_gss_accept() already set
+ * the auth_error appropriately; just fall through:
+ */
+ }
+
+out:
+ stat = 0;
+out_err:
+ if (rqstp->rq_client)
+ auth_domain_put(rqstp->rq_client);
+ rqstp->rq_client = NULL;
+ if (rqstp->rq_gssclient)
+ auth_domain_put(rqstp->rq_gssclient);
+ rqstp->rq_gssclient = NULL;
+ if (rqstp->rq_cred.cr_group_info)
+ put_group_info(rqstp->rq_cred.cr_group_info);
+ rqstp->rq_cred.cr_group_info = NULL;
+ if (gsd->rsci)
+ cache_put(&gsd->rsci->h, sn->rsc_cache);
+ gsd->rsci = NULL;
+
+ return stat;
+}
+
+static void
+svcauth_gss_domain_release(struct auth_domain *dom)
+{
+ struct gss_domain *gd = container_of(dom, struct gss_domain, h);
+
+ kfree(dom->name);
+ kfree(gd);
+}
+
+static struct auth_ops svcauthops_gss = {
+ .name = "rpcsec_gss",
+ .owner = THIS_MODULE,
+ .flavour = RPC_AUTH_GSS,
+ .accept = svcauth_gss_accept,
+ .release = svcauth_gss_release,
+ .domain_release = svcauth_gss_domain_release,
+ .set_client = svcauth_gss_set_client,
+};
+
+static int rsi_cache_create_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd;
+ int err;
+
+ cd = cache_create_net(&rsi_cache_template, net);
+ if (IS_ERR(cd))
+ return PTR_ERR(cd);
+ err = cache_register_net(cd, net);
+ if (err) {
+ cache_destroy_net(cd, net);
+ return err;
+ }
+ sn->rsi_cache = cd;
+ return 0;
+}
+
+static void rsi_cache_destroy_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd = sn->rsi_cache;
+
+ sn->rsi_cache = NULL;
+ cache_purge(cd);
+ cache_unregister_net(cd, net);
+ cache_destroy_net(cd, net);
+}
+
+static int rsc_cache_create_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd;
+ int err;
+
+ cd = cache_create_net(&rsc_cache_template, net);
+ if (IS_ERR(cd))
+ return PTR_ERR(cd);
+ err = cache_register_net(cd, net);
+ if (err) {
+ cache_destroy_net(cd, net);
+ return err;
+ }
+ sn->rsc_cache = cd;
+ return 0;
+}
+
+static void rsc_cache_destroy_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd = sn->rsc_cache;
+
+ sn->rsc_cache = NULL;
+ cache_purge(cd);
+ cache_unregister_net(cd, net);
+ cache_destroy_net(cd, net);
+}
+
+int
+gss_svc_init_net(struct net *net)
+{
+ int rv;
+
+ rv = rsc_cache_create_net(net);
+ if (rv)
+ return rv;
+ rv = rsi_cache_create_net(net);
+ if (rv)
+ goto out1;
+ rv = create_use_gss_proxy_proc_entry(net);
+ if (rv)
+ goto out2;
+ return 0;
+out2:
+ destroy_use_gss_proxy_proc_entry(net);
+out1:
+ rsc_cache_destroy_net(net);
+ return rv;
+}
+
+void
+gss_svc_shutdown_net(struct net *net)
+{
+ destroy_use_gss_proxy_proc_entry(net);
+ rsi_cache_destroy_net(net);
+ rsc_cache_destroy_net(net);
+}
+
+int
+gss_svc_init(void)
+{
+ return svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
+}
+
+void
+gss_svc_shutdown(void)
+{
+ svc_auth_unregister(RPC_AUTH_GSS);
+}
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
new file mode 100644
index 000000000..c2a2b584a
--- /dev/null
+++ b/net/sunrpc/auth_null.c
@@ -0,0 +1,144 @@
+/*
+ * linux/net/sunrpc/auth_null.c
+ *
+ * AUTH_NULL authentication. Really :-)
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static struct rpc_auth null_auth;
+static struct rpc_cred null_cred;
+
+static struct rpc_auth *
+nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ atomic_inc(&null_auth.au_count);
+ return &null_auth;
+}
+
+static void
+nul_destroy(struct rpc_auth *auth)
+{
+}
+
+/*
+ * Lookup NULL creds for current process
+ */
+static struct rpc_cred *
+nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return &null_cred;
+ return get_rpccred(&null_cred);
+}
+
+/*
+ * Destroy cred handle.
+ */
+static void
+nul_destroy_cred(struct rpc_cred *cred)
+{
+}
+
+/*
+ * Match cred handle against current process
+ */
+static int
+nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags)
+{
+ return 1;
+}
+
+/*
+ * Marshal credential.
+ */
+static __be32 *
+nul_marshal(struct rpc_task *task, __be32 *p)
+{
+ *p++ = htonl(RPC_AUTH_NULL);
+ *p++ = 0;
+ *p++ = htonl(RPC_AUTH_NULL);
+ *p++ = 0;
+
+ return p;
+}
+
+/*
+ * Refresh credential. This is a no-op for AUTH_NULL
+ */
+static int
+nul_refresh(struct rpc_task *task)
+{
+ set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
+ return 0;
+}
+
+static __be32 *
+nul_validate(struct rpc_task *task, __be32 *p)
+{
+ rpc_authflavor_t flavor;
+ u32 size;
+
+ flavor = ntohl(*p++);
+ if (flavor != RPC_AUTH_NULL) {
+ printk("RPC: bad verf flavor: %u\n", flavor);
+ return ERR_PTR(-EIO);
+ }
+
+ size = ntohl(*p++);
+ if (size != 0) {
+ printk("RPC: bad verf size: %u\n", size);
+ return ERR_PTR(-EIO);
+ }
+
+ return p;
+}
+
+const struct rpc_authops authnull_ops = {
+ .owner = THIS_MODULE,
+ .au_flavor = RPC_AUTH_NULL,
+ .au_name = "NULL",
+ .create = nul_create,
+ .destroy = nul_destroy,
+ .lookup_cred = nul_lookup_cred,
+};
+
+static
+struct rpc_auth null_auth = {
+ .au_cslack = 4,
+ .au_rslack = 2,
+ .au_ops = &authnull_ops,
+ .au_flavor = RPC_AUTH_NULL,
+ .au_count = ATOMIC_INIT(0),
+};
+
+static
+const struct rpc_credops null_credops = {
+ .cr_name = "AUTH_NULL",
+ .crdestroy = nul_destroy_cred,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = nul_match,
+ .crmarshal = nul_marshal,
+ .crrefresh = nul_refresh,
+ .crvalidate = nul_validate,
+};
+
+static
+struct rpc_cred null_cred = {
+ .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru),
+ .cr_auth = &null_auth,
+ .cr_ops = &null_credops,
+ .cr_count = ATOMIC_INIT(1),
+ .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ .cr_magic = RPCAUTH_CRED_MAGIC,
+#endif
+};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
new file mode 100644
index 000000000..4feda2d0a
--- /dev/null
+++ b/net/sunrpc/auth_unix.c
@@ -0,0 +1,247 @@
+/*
+ * linux/net/sunrpc/auth_unix.c
+ *
+ * UNIX-style authentication; no AUTH_SHORT support
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/user_namespace.h>
+
+#define NFS_NGROUPS 16
+
+struct unx_cred {
+ struct rpc_cred uc_base;
+ kgid_t uc_gid;
+ kgid_t uc_gids[NFS_NGROUPS];
+};
+#define uc_uid uc_base.cr_uid
+
+#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2))
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_AUTH
+#endif
+
+static struct rpc_auth unix_auth;
+static const struct rpc_credops unix_credops;
+
+static struct rpc_auth *
+unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+{
+ dprintk("RPC: creating UNIX authenticator for client %p\n",
+ clnt);
+ atomic_inc(&unix_auth.au_count);
+ return &unix_auth;
+}
+
+static void
+unx_destroy(struct rpc_auth *auth)
+{
+ dprintk("RPC: destroying UNIX authenticator %p\n", auth);
+ rpcauth_clear_credcache(auth->au_credcache);
+}
+
+/*
+ * Lookup AUTH_UNIX creds for current process
+ */
+static struct rpc_cred *
+unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ return rpcauth_lookup_credcache(auth, acred, flags);
+}
+
+static struct rpc_cred *
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+ struct unx_cred *cred;
+ unsigned int groups = 0;
+ unsigned int i;
+
+ dprintk("RPC: allocating UNIX cred for uid %d gid %d\n",
+ from_kuid(&init_user_ns, acred->uid),
+ from_kgid(&init_user_ns, acred->gid));
+
+ if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS)))
+ return ERR_PTR(-ENOMEM);
+
+ rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops);
+ cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+
+ if (acred->group_info != NULL)
+ groups = acred->group_info->ngroups;
+ if (groups > NFS_NGROUPS)
+ groups = NFS_NGROUPS;
+
+ cred->uc_gid = acred->gid;
+ for (i = 0; i < groups; i++)
+ cred->uc_gids[i] = GROUP_AT(acred->group_info, i);
+ if (i < NFS_NGROUPS)
+ cred->uc_gids[i] = INVALID_GID;
+
+ return &cred->uc_base;
+}
+
+static void
+unx_free_cred(struct unx_cred *unx_cred)
+{
+ dprintk("RPC: unx_free_cred %p\n", unx_cred);
+ kfree(unx_cred);
+}
+
+static void
+unx_free_cred_callback(struct rcu_head *head)
+{
+ struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu);
+ unx_free_cred(unx_cred);
+}
+
+static void
+unx_destroy_cred(struct rpc_cred *cred)
+{
+ call_rcu(&cred->cr_rcu, unx_free_cred_callback);
+}
+
+/*
+ * Match credentials against current process creds.
+ * The root_override argument takes care of cases where the caller may
+ * request root creds (e.g. for NFS swapping).
+ */
+static int
+unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
+{
+ struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base);
+ unsigned int groups = 0;
+ unsigned int i;
+
+
+ if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid))
+ return 0;
+
+ if (acred->group_info != NULL)
+ groups = acred->group_info->ngroups;
+ if (groups > NFS_NGROUPS)
+ groups = NFS_NGROUPS;
+ for (i = 0; i < groups ; i++)
+ if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i)))
+ return 0;
+ if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
+ return 0;
+ return 1;
+}
+
+/*
+ * Marshal credentials.
+ * Maybe we should keep a cached credential for performance reasons.
+ */
+static __be32 *
+unx_marshal(struct rpc_task *task, __be32 *p)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base);
+ __be32 *base, *hold;
+ int i;
+
+ *p++ = htonl(RPC_AUTH_UNIX);
+ base = p++;
+ *p++ = htonl(jiffies/HZ);
+
+ /*
+ * Copy the UTS nodename captured when the client was created.
+ */
+ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+
+ *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
+ *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
+ hold = p++;
+ for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+ *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
+ *hold = htonl(p - hold - 1); /* gid array length */
+ *base = htonl((p - base - 1) << 2); /* cred length */
+
+ *p++ = htonl(RPC_AUTH_NULL);
+ *p++ = htonl(0);
+
+ return p;
+}
+
+/*
+ * Refresh credentials. This is a no-op for AUTH_UNIX
+ */
+static int
+unx_refresh(struct rpc_task *task)
+{
+ set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
+ return 0;
+}
+
+static __be32 *
+unx_validate(struct rpc_task *task, __be32 *p)
+{
+ rpc_authflavor_t flavor;
+ u32 size;
+
+ flavor = ntohl(*p++);
+ if (flavor != RPC_AUTH_NULL &&
+ flavor != RPC_AUTH_UNIX &&
+ flavor != RPC_AUTH_SHORT) {
+ printk("RPC: bad verf flavor: %u\n", flavor);
+ return ERR_PTR(-EIO);
+ }
+
+ size = ntohl(*p++);
+ if (size > RPC_MAX_AUTH_SIZE) {
+ printk("RPC: giant verf size: %u\n", size);
+ return ERR_PTR(-EIO);
+ }
+ task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
+ p += (size >> 2);
+
+ return p;
+}
+
+int __init rpc_init_authunix(void)
+{
+ return rpcauth_init_credcache(&unix_auth);
+}
+
+void rpc_destroy_authunix(void)
+{
+ rpcauth_destroy_credcache(&unix_auth);
+}
+
+const struct rpc_authops authunix_ops = {
+ .owner = THIS_MODULE,
+ .au_flavor = RPC_AUTH_UNIX,
+ .au_name = "UNIX",
+ .create = unx_create,
+ .destroy = unx_destroy,
+ .lookup_cred = unx_lookup_cred,
+ .crcreate = unx_create_cred,
+};
+
+static
+struct rpc_auth unix_auth = {
+ .au_cslack = UNX_WRITESLACK,
+ .au_rslack = 2, /* assume AUTH_NULL verf */
+ .au_ops = &authunix_ops,
+ .au_flavor = RPC_AUTH_UNIX,
+ .au_count = ATOMIC_INIT(0),
+};
+
+static
+const struct rpc_credops unix_credops = {
+ .cr_name = "AUTH_UNIX",
+ .crdestroy = unx_destroy_cred,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = unx_match,
+ .crmarshal = unx_marshal,
+ .crrefresh = unx_refresh,
+ .crvalidate = unx_validate,
+};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
new file mode 100644
index 000000000..28504dfd3
--- /dev/null
+++ b/net/sunrpc/backchannel_rqst.c
@@ -0,0 +1,325 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc. All Rights Reserved.
+(c) 2009 NetApp. All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/export.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+#define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+/*
+ * Helper routines that track the number of preallocation elements
+ * on the transport.
+ */
+static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
+{
+ return xprt->bc_alloc_count > 0;
+}
+
+static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+ xprt->bc_alloc_count += n;
+}
+
+static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+ return xprt->bc_alloc_count -= n;
+}
+
+/*
+ * Free the preallocated rpc_rqst structure and the memory
+ * buffers hanging off of it.
+ */
+static void xprt_free_allocation(struct rpc_rqst *req)
+{
+ struct xdr_buf *xbufp;
+
+ dprintk("RPC: free allocations for req= %p\n", req);
+ WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+ xbufp = &req->rq_rcv_buf;
+ free_page((unsigned long)xbufp->head[0].iov_base);
+ xbufp = &req->rq_snd_buf;
+ free_page((unsigned long)xbufp->head[0].iov_base);
+ kfree(req);
+}
+
+/*
+ * Preallocate up to min_reqs structures and related buffers for use
+ * by the backchannel. This function can be called multiple times
+ * when creating new sessions that use the same rpc_xprt. The
+ * preallocated buffers are added to the pool of resources used by
+ * the rpc_xprt. Anyone of these resources may be used used by an
+ * incoming callback request. It's up to the higher levels in the
+ * stack to enforce that the maximum number of session slots is not
+ * being exceeded.
+ *
+ * Some callback arguments can be large. For example, a pNFS server
+ * using multiple deviceids. The list can be unbound, but the client
+ * has the ability to tell the server the maximum size of the callback
+ * requests. Each deviceID is 16 bytes, so allocate one page
+ * for the arguments to have enough room to receive a number of these
+ * deviceIDs. The NFS client indicates to the pNFS server that its
+ * callback requests can be up to 4096 bytes in size.
+ */
+int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
+ struct page *page_rcv = NULL, *page_snd = NULL;
+ struct xdr_buf *xbufp = NULL;
+ struct rpc_rqst *req, *tmp;
+ struct list_head tmp_list;
+ int i;
+
+ dprintk("RPC: setup backchannel transport\n");
+
+ /*
+ * We use a temporary list to keep track of the preallocated
+ * buffers. Once we're done building the list we splice it
+ * into the backchannel preallocation list off of the rpc_xprt
+ * struct. This helps minimize the amount of time the list
+ * lock is held on the rpc_xprt struct. It also makes cleanup
+ * easier in case of memory allocation errors.
+ */
+ INIT_LIST_HEAD(&tmp_list);
+ for (i = 0; i < min_reqs; i++) {
+ /* Pre-allocate one backchannel rpc_rqst */
+ req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+ if (req == NULL) {
+ printk(KERN_ERR "Failed to create bc rpc_rqst\n");
+ goto out_free;
+ }
+
+ /* Add the allocated buffer to the tmp list */
+ dprintk("RPC: adding req= %p\n", req);
+ list_add(&req->rq_bc_pa_list, &tmp_list);
+
+ req->rq_xprt = xprt;
+ INIT_LIST_HEAD(&req->rq_list);
+ INIT_LIST_HEAD(&req->rq_bc_list);
+
+ /* Preallocate one XDR receive buffer */
+ page_rcv = alloc_page(GFP_KERNEL);
+ if (page_rcv == NULL) {
+ printk(KERN_ERR "Failed to create bc receive xbuf\n");
+ goto out_free;
+ }
+ xbufp = &req->rq_rcv_buf;
+ xbufp->head[0].iov_base = page_address(page_rcv);
+ xbufp->head[0].iov_len = PAGE_SIZE;
+ xbufp->tail[0].iov_base = NULL;
+ xbufp->tail[0].iov_len = 0;
+ xbufp->page_len = 0;
+ xbufp->len = PAGE_SIZE;
+ xbufp->buflen = PAGE_SIZE;
+
+ /* Preallocate one XDR send buffer */
+ page_snd = alloc_page(GFP_KERNEL);
+ if (page_snd == NULL) {
+ printk(KERN_ERR "Failed to create bc snd xbuf\n");
+ goto out_free;
+ }
+
+ xbufp = &req->rq_snd_buf;
+ xbufp->head[0].iov_base = page_address(page_snd);
+ xbufp->head[0].iov_len = 0;
+ xbufp->tail[0].iov_base = NULL;
+ xbufp->tail[0].iov_len = 0;
+ xbufp->page_len = 0;
+ xbufp->len = 0;
+ xbufp->buflen = PAGE_SIZE;
+ }
+
+ /*
+ * Add the temporary list to the backchannel preallocation list
+ */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_splice(&tmp_list, &xprt->bc_pa_list);
+ xprt_inc_alloc_count(xprt, min_reqs);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ dprintk("RPC: setup backchannel transport done\n");
+ return 0;
+
+out_free:
+ /*
+ * Memory allocation failed, free the temporary list
+ */
+ list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+ list_del(&req->rq_bc_pa_list);
+ xprt_free_allocation(req);
+ }
+
+ dprintk("RPC: setup backchannel transport failed\n");
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
+
+/**
+ * xprt_destroy_backchannel - Destroys the backchannel preallocated structures.
+ * @xprt: the transport holding the preallocated strucures
+ * @max_reqs the maximum number of preallocated structures to destroy
+ *
+ * Since these structures may have been allocated by multiple calls
+ * to xprt_setup_backchannel, we only destroy up to the maximum number
+ * of reqs specified by the caller.
+ */
+void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
+ struct rpc_rqst *req = NULL, *tmp = NULL;
+
+ dprintk("RPC: destroy backchannel transport\n");
+
+ if (max_reqs == 0)
+ goto out;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ xprt_dec_alloc_count(xprt, max_reqs);
+ list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+ dprintk("RPC: req=%p\n", req);
+ list_del(&req->rq_bc_pa_list);
+ xprt_free_allocation(req);
+ if (--max_reqs == 0)
+ break;
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+out:
+ dprintk("RPC: backchannel list empty= %s\n",
+ list_empty(&xprt->bc_pa_list) ? "true" : "false");
+}
+EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
+
+static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rpc_rqst *req = NULL;
+
+ dprintk("RPC: allocate a backchannel request\n");
+ if (list_empty(&xprt->bc_pa_list))
+ goto not_found;
+
+ req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+ rq_bc_pa_list);
+ req->rq_reply_bytes_recvd = 0;
+ req->rq_bytes_sent = 0;
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ sizeof(req->rq_private_buf));
+ req->rq_xid = xid;
+ req->rq_connect_cookie = xprt->connect_cookie;
+not_found:
+ dprintk("RPC: backchannel req=%p\n", req);
+ return req;
+}
+
+/*
+ * Return the preallocated rpc_rqst structure and XDR buffers
+ * associated with this rpc_task.
+ */
+void xprt_free_bc_request(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ dprintk("RPC: free backchannel req=%p\n", req);
+
+ req->rq_connect_cookie = xprt->connect_cookie - 1;
+ smp_mb__before_atomic();
+ WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+ clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+ smp_mb__after_atomic();
+
+ if (!xprt_need_to_requeue(xprt)) {
+ /*
+ * The last remaining session was destroyed while this
+ * entry was in use. Free the entry and don't attempt
+ * to add back to the list because there is no need to
+ * have anymore preallocated entries.
+ */
+ dprintk("RPC: Last session removed req=%p\n", req);
+ xprt_free_allocation(req);
+ return;
+ }
+
+ /*
+ * Return it to the list of preallocations so that it
+ * may be reused by a new callback request.
+ */
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/*
+ * One or more rpc_rqst structure have been preallocated during the
+ * backchannel setup. Buffer space for the send and private XDR buffers
+ * has been preallocated as well. Use xprt_alloc_bc_request to allocate
+ * to this request. Use xprt_free_bc_request to return it.
+ *
+ * We know that we're called in soft interrupt context, grab the spin_lock
+ * since there is no need to grab the bottom half spin_lock.
+ *
+ * Return an available rpc_rqst, otherwise NULL if non are available.
+ */
+struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rpc_rqst *req;
+
+ spin_lock(&xprt->bc_pa_lock);
+ list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
+ if (req->rq_connect_cookie != xprt->connect_cookie)
+ continue;
+ if (req->rq_xid == xid)
+ goto found;
+ }
+ req = xprt_alloc_bc_request(xprt, xid);
+found:
+ spin_unlock(&xprt->bc_pa_lock);
+ return req;
+}
+
+/*
+ * Add callback request to callback list. The callback
+ * service sleeps on the sv_cb_waitq waiting for new
+ * requests. Wake it up after adding enqueing the
+ * request.
+ */
+void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct svc_serv *bc_serv = xprt->bc_serv;
+
+ spin_lock(&xprt->bc_pa_lock);
+ list_del(&req->rq_bc_pa_list);
+ spin_unlock(&xprt->bc_pa_lock);
+
+ req->rq_private_buf.len = copied;
+ set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+
+ dprintk("RPC: add callback request to list\n");
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
+ wake_up(&bc_serv->sv_cb_waitq);
+ spin_unlock(&bc_serv->sv_cb_lock);
+}
+
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
new file mode 100644
index 000000000..15c7a8a1c
--- /dev/null
+++ b/net/sunrpc/bc_svc.c
@@ -0,0 +1,63 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc. All Rights Reserved.
+(c) 2009 NetApp. All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * The NFSv4.1 callback service helper routines.
+ * They implement the transport level processing required to send the
+ * reply over an existing open connection previously established by the client.
+ */
+
+#include <linux/module.h>
+
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCDSP
+
+/* Empty callback ops */
+static const struct rpc_call_ops nfs41_callback_ops = {
+};
+
+
+/*
+ * Send the callback reply
+ */
+int bc_send(struct rpc_rqst *req)
+{
+ struct rpc_task *task;
+ int ret;
+
+ dprintk("RPC: bc_send req= %p\n", req);
+ task = rpc_run_bc_task(req, &nfs41_callback_ops);
+ if (IS_ERR(task))
+ ret = PTR_ERR(task);
+ else {
+ WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+ ret = task->tk_status;
+ rpc_put_task(task);
+ }
+ dprintk("RPC: bc_send ret= %d\n", ret);
+ return ret;
+}
+
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
new file mode 100644
index 000000000..2928afffb
--- /dev/null
+++ b/net/sunrpc/cache.c
@@ -0,0 +1,1826 @@
+/*
+ * net/sunrpc/cache.c
+ *
+ * Generic code for various authentication-related caches
+ * used by sunrpc clients and servers.
+ *
+ * Copyright (C) 2002 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * Released under terms in GPL version 2. See COPYING.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/string_helpers.h>
+#include <asm/uaccess.h>
+#include <linux/poll.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <asm/ioctls.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "netns.h"
+
+#define RPCDBG_FACILITY RPCDBG_CACHE
+
+static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
+static void cache_revisit_request(struct cache_head *item);
+
+static void cache_init(struct cache_head *h)
+{
+ time_t now = seconds_since_boot();
+ h->next = NULL;
+ h->flags = 0;
+ kref_init(&h->ref);
+ h->expiry_time = now + CACHE_NEW_EXPIRY;
+ h->last_refresh = now;
+}
+
+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+ struct cache_head *key, int hash)
+{
+ struct cache_head **head, **hp;
+ struct cache_head *new = NULL, *freeme = NULL;
+
+ head = &detail->hash_table[hash];
+
+ read_lock(&detail->hash_lock);
+
+ for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+ struct cache_head *tmp = *hp;
+ if (detail->match(tmp, key)) {
+ if (cache_is_expired(detail, tmp))
+ /* This entry is expired, we will discard it. */
+ break;
+ cache_get(tmp);
+ read_unlock(&detail->hash_lock);
+ return tmp;
+ }
+ }
+ read_unlock(&detail->hash_lock);
+ /* Didn't find anything, insert an empty entry */
+
+ new = detail->alloc();
+ if (!new)
+ return NULL;
+ /* must fully initialise 'new', else
+ * we might get lose if we need to
+ * cache_put it soon.
+ */
+ cache_init(new);
+ detail->init(new, key);
+
+ write_lock(&detail->hash_lock);
+
+ /* check if entry appeared while we slept */
+ for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+ struct cache_head *tmp = *hp;
+ if (detail->match(tmp, key)) {
+ if (cache_is_expired(detail, tmp)) {
+ *hp = tmp->next;
+ tmp->next = NULL;
+ detail->entries --;
+ freeme = tmp;
+ break;
+ }
+ cache_get(tmp);
+ write_unlock(&detail->hash_lock);
+ cache_put(new, detail);
+ return tmp;
+ }
+ }
+ new->next = *head;
+ *head = new;
+ detail->entries++;
+ cache_get(new);
+ write_unlock(&detail->hash_lock);
+
+ if (freeme)
+ cache_put(freeme, detail);
+ return new;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
+
+
+static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
+
+static void cache_fresh_locked(struct cache_head *head, time_t expiry)
+{
+ head->expiry_time = expiry;
+ head->last_refresh = seconds_since_boot();
+ smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */
+ set_bit(CACHE_VALID, &head->flags);
+}
+
+static void cache_fresh_unlocked(struct cache_head *head,
+ struct cache_detail *detail)
+{
+ if (test_and_clear_bit(CACHE_PENDING, &head->flags)) {
+ cache_revisit_request(head);
+ cache_dequeue(detail, head);
+ }
+}
+
+struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
+ struct cache_head *new, struct cache_head *old, int hash)
+{
+ /* The 'old' entry is to be replaced by 'new'.
+ * If 'old' is not VALID, we update it directly,
+ * otherwise we need to replace it
+ */
+ struct cache_head **head;
+ struct cache_head *tmp;
+
+ if (!test_bit(CACHE_VALID, &old->flags)) {
+ write_lock(&detail->hash_lock);
+ if (!test_bit(CACHE_VALID, &old->flags)) {
+ if (test_bit(CACHE_NEGATIVE, &new->flags))
+ set_bit(CACHE_NEGATIVE, &old->flags);
+ else
+ detail->update(old, new);
+ cache_fresh_locked(old, new->expiry_time);
+ write_unlock(&detail->hash_lock);
+ cache_fresh_unlocked(old, detail);
+ return old;
+ }
+ write_unlock(&detail->hash_lock);
+ }
+ /* We need to insert a new entry */
+ tmp = detail->alloc();
+ if (!tmp) {
+ cache_put(old, detail);
+ return NULL;
+ }
+ cache_init(tmp);
+ detail->init(tmp, old);
+ head = &detail->hash_table[hash];
+
+ write_lock(&detail->hash_lock);
+ if (test_bit(CACHE_NEGATIVE, &new->flags))
+ set_bit(CACHE_NEGATIVE, &tmp->flags);
+ else
+ detail->update(tmp, new);
+ tmp->next = *head;
+ *head = tmp;
+ detail->entries++;
+ cache_get(tmp);
+ cache_fresh_locked(tmp, new->expiry_time);
+ cache_fresh_locked(old, 0);
+ write_unlock(&detail->hash_lock);
+ cache_fresh_unlocked(tmp, detail);
+ cache_fresh_unlocked(old, detail);
+ cache_put(old, detail);
+ return tmp;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_update);
+
+static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ if (cd->cache_upcall)
+ return cd->cache_upcall(cd, h);
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
+static inline int cache_is_valid(struct cache_head *h)
+{
+ if (!test_bit(CACHE_VALID, &h->flags))
+ return -EAGAIN;
+ else {
+ /* entry is valid */
+ if (test_bit(CACHE_NEGATIVE, &h->flags))
+ return -ENOENT;
+ else {
+ /*
+ * In combination with write barrier in
+ * sunrpc_cache_update, ensures that anyone
+ * using the cache entry after this sees the
+ * updated contents:
+ */
+ smp_rmb();
+ return 0;
+ }
+ }
+}
+
+static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h)
+{
+ int rv;
+
+ write_lock(&detail->hash_lock);
+ rv = cache_is_valid(h);
+ if (rv == -EAGAIN) {
+ set_bit(CACHE_NEGATIVE, &h->flags);
+ cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+ rv = -ENOENT;
+ }
+ write_unlock(&detail->hash_lock);
+ cache_fresh_unlocked(h, detail);
+ return rv;
+}
+
+/*
+ * This is the generic cache management routine for all
+ * the authentication caches.
+ * It checks the currency of a cache item and will (later)
+ * initiate an upcall to fill it if needed.
+ *
+ *
+ * Returns 0 if the cache_head can be used, or cache_puts it and returns
+ * -EAGAIN if upcall is pending and request has been queued
+ * -ETIMEDOUT if upcall failed or request could not be queue or
+ * upcall completed but item is still invalid (implying that
+ * the cache item has been replaced with a newer one).
+ * -ENOENT if cache entry was negative
+ */
+int cache_check(struct cache_detail *detail,
+ struct cache_head *h, struct cache_req *rqstp)
+{
+ int rv;
+ long refresh_age, age;
+
+ /* First decide return status as best we can */
+ rv = cache_is_valid(h);
+
+ /* now see if we want to start an upcall */
+ refresh_age = (h->expiry_time - h->last_refresh);
+ age = seconds_since_boot() - h->last_refresh;
+
+ if (rqstp == NULL) {
+ if (rv == -EAGAIN)
+ rv = -ENOENT;
+ } else if (rv == -EAGAIN ||
+ (h->expiry_time != 0 && age > refresh_age/2)) {
+ dprintk("RPC: Want update, refage=%ld, age=%ld\n",
+ refresh_age, age);
+ if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
+ switch (cache_make_upcall(detail, h)) {
+ case -EINVAL:
+ rv = try_to_negate_entry(detail, h);
+ break;
+ case -EAGAIN:
+ cache_fresh_unlocked(h, detail);
+ break;
+ }
+ }
+ }
+
+ if (rv == -EAGAIN) {
+ if (!cache_defer_req(rqstp, h)) {
+ /*
+ * Request was not deferred; handle it as best
+ * we can ourselves:
+ */
+ rv = cache_is_valid(h);
+ if (rv == -EAGAIN)
+ rv = -ETIMEDOUT;
+ }
+ }
+ if (rv)
+ cache_put(h, detail);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(cache_check);
+
+/*
+ * caches need to be periodically cleaned.
+ * For this we maintain a list of cache_detail and
+ * a current pointer into that list and into the table
+ * for that entry.
+ *
+ * Each time cache_clean is called it finds the next non-empty entry
+ * in the current table and walks the list in that entry
+ * looking for entries that can be removed.
+ *
+ * An entry gets removed if:
+ * - The expiry is before current time
+ * - The last_refresh time is before the flush_time for that cache
+ *
+ * later we might drop old entries with non-NEVER expiry if that table
+ * is getting 'full' for some definition of 'full'
+ *
+ * The question of "how often to scan a table" is an interesting one
+ * and is answered in part by the use of the "nextcheck" field in the
+ * cache_detail.
+ * When a scan of a table begins, the nextcheck field is set to a time
+ * that is well into the future.
+ * While scanning, if an expiry time is found that is earlier than the
+ * current nextcheck time, nextcheck is set to that expiry time.
+ * If the flush_time is ever set to a time earlier than the nextcheck
+ * time, the nextcheck time is then set to that flush_time.
+ *
+ * A table is then only scanned if the current time is at least
+ * the nextcheck time.
+ *
+ */
+
+static LIST_HEAD(cache_list);
+static DEFINE_SPINLOCK(cache_list_lock);
+static struct cache_detail *current_detail;
+static int current_index;
+
+static void do_cache_clean(struct work_struct *work);
+static struct delayed_work cache_cleaner;
+
+void sunrpc_init_cache_detail(struct cache_detail *cd)
+{
+ rwlock_init(&cd->hash_lock);
+ INIT_LIST_HEAD(&cd->queue);
+ spin_lock(&cache_list_lock);
+ cd->nextcheck = 0;
+ cd->entries = 0;
+ atomic_set(&cd->readers, 0);
+ cd->last_close = 0;
+ cd->last_warn = -1;
+ list_add(&cd->others, &cache_list);
+ spin_unlock(&cache_list_lock);
+
+ /* start the cleaning process */
+ schedule_delayed_work(&cache_cleaner, 0);
+}
+EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
+
+void sunrpc_destroy_cache_detail(struct cache_detail *cd)
+{
+ cache_purge(cd);
+ spin_lock(&cache_list_lock);
+ write_lock(&cd->hash_lock);
+ if (cd->entries || atomic_read(&cd->inuse)) {
+ write_unlock(&cd->hash_lock);
+ spin_unlock(&cache_list_lock);
+ goto out;
+ }
+ if (current_detail == cd)
+ current_detail = NULL;
+ list_del_init(&cd->others);
+ write_unlock(&cd->hash_lock);
+ spin_unlock(&cache_list_lock);
+ if (list_empty(&cache_list)) {
+ /* module must be being unloaded so its safe to kill the worker */
+ cancel_delayed_work_sync(&cache_cleaner);
+ }
+ return;
+out:
+ printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
+}
+EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
+
+/* clean cache tries to find something to clean
+ * and cleans it.
+ * It returns 1 if it cleaned something,
+ * 0 if it didn't find anything this time
+ * -1 if it fell off the end of the list.
+ */
+static int cache_clean(void)
+{
+ int rv = 0;
+ struct list_head *next;
+
+ spin_lock(&cache_list_lock);
+
+ /* find a suitable table if we don't already have one */
+ while (current_detail == NULL ||
+ current_index >= current_detail->hash_size) {
+ if (current_detail)
+ next = current_detail->others.next;
+ else
+ next = cache_list.next;
+ if (next == &cache_list) {
+ current_detail = NULL;
+ spin_unlock(&cache_list_lock);
+ return -1;
+ }
+ current_detail = list_entry(next, struct cache_detail, others);
+ if (current_detail->nextcheck > seconds_since_boot())
+ current_index = current_detail->hash_size;
+ else {
+ current_index = 0;
+ current_detail->nextcheck = seconds_since_boot()+30*60;
+ }
+ }
+
+ /* find a non-empty bucket in the table */
+ while (current_detail &&
+ current_index < current_detail->hash_size &&
+ current_detail->hash_table[current_index] == NULL)
+ current_index++;
+
+ /* find a cleanable entry in the bucket and clean it, or set to next bucket */
+
+ if (current_detail && current_index < current_detail->hash_size) {
+ struct cache_head *ch, **cp;
+ struct cache_detail *d;
+
+ write_lock(&current_detail->hash_lock);
+
+ /* Ok, now to clean this strand */
+
+ cp = & current_detail->hash_table[current_index];
+ for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) {
+ if (current_detail->nextcheck > ch->expiry_time)
+ current_detail->nextcheck = ch->expiry_time+1;
+ if (!cache_is_expired(current_detail, ch))
+ continue;
+
+ *cp = ch->next;
+ ch->next = NULL;
+ current_detail->entries--;
+ rv = 1;
+ break;
+ }
+
+ write_unlock(&current_detail->hash_lock);
+ d = current_detail;
+ if (!ch)
+ current_index ++;
+ spin_unlock(&cache_list_lock);
+ if (ch) {
+ set_bit(CACHE_CLEANED, &ch->flags);
+ cache_fresh_unlocked(ch, d);
+ cache_put(ch, d);
+ }
+ } else
+ spin_unlock(&cache_list_lock);
+
+ return rv;
+}
+
+/*
+ * We want to regularly clean the cache, so we need to schedule some work ...
+ */
+static void do_cache_clean(struct work_struct *work)
+{
+ int delay = 5;
+ if (cache_clean() == -1)
+ delay = round_jiffies_relative(30*HZ);
+
+ if (list_empty(&cache_list))
+ delay = 0;
+
+ if (delay)
+ schedule_delayed_work(&cache_cleaner, delay);
+}
+
+
+/*
+ * Clean all caches promptly. This just calls cache_clean
+ * repeatedly until we are sure that every cache has had a chance to
+ * be fully cleaned
+ */
+void cache_flush(void)
+{
+ while (cache_clean() != -1)
+ cond_resched();
+ while (cache_clean() != -1)
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(cache_flush);
+
+void cache_purge(struct cache_detail *detail)
+{
+ detail->flush_time = LONG_MAX;
+ detail->nextcheck = seconds_since_boot();
+ cache_flush();
+ detail->flush_time = 1;
+}
+EXPORT_SYMBOL_GPL(cache_purge);
+
+
+/*
+ * Deferral and Revisiting of Requests.
+ *
+ * If a cache lookup finds a pending entry, we
+ * need to defer the request and revisit it later.
+ * All deferred requests are stored in a hash table,
+ * indexed by "struct cache_head *".
+ * As it may be wasteful to store a whole request
+ * structure, we allow the request to provide a
+ * deferred form, which must contain a
+ * 'struct cache_deferred_req'
+ * This cache_deferred_req contains a method to allow
+ * it to be revisited when cache info is available
+ */
+
+#define DFR_HASHSIZE (PAGE_SIZE/sizeof(struct list_head))
+#define DFR_HASH(item) ((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE)
+
+#define DFR_MAX 300 /* ??? */
+
+static DEFINE_SPINLOCK(cache_defer_lock);
+static LIST_HEAD(cache_defer_list);
+static struct hlist_head cache_defer_hash[DFR_HASHSIZE];
+static int cache_defer_cnt;
+
+static void __unhash_deferred_req(struct cache_deferred_req *dreq)
+{
+ hlist_del_init(&dreq->hash);
+ if (!list_empty(&dreq->recent)) {
+ list_del_init(&dreq->recent);
+ cache_defer_cnt--;
+ }
+}
+
+static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item)
+{
+ int hash = DFR_HASH(item);
+
+ INIT_LIST_HEAD(&dreq->recent);
+ hlist_add_head(&dreq->hash, &cache_defer_hash[hash]);
+}
+
+static void setup_deferral(struct cache_deferred_req *dreq,
+ struct cache_head *item,
+ int count_me)
+{
+
+ dreq->item = item;
+
+ spin_lock(&cache_defer_lock);
+
+ __hash_deferred_req(dreq, item);
+
+ if (count_me) {
+ cache_defer_cnt++;
+ list_add(&dreq->recent, &cache_defer_list);
+ }
+
+ spin_unlock(&cache_defer_lock);
+
+}
+
+struct thread_deferred_req {
+ struct cache_deferred_req handle;
+ struct completion completion;
+};
+
+static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many)
+{
+ struct thread_deferred_req *dr =
+ container_of(dreq, struct thread_deferred_req, handle);
+ complete(&dr->completion);
+}
+
+static void cache_wait_req(struct cache_req *req, struct cache_head *item)
+{
+ struct thread_deferred_req sleeper;
+ struct cache_deferred_req *dreq = &sleeper.handle;
+
+ sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion);
+ dreq->revisit = cache_restart_thread;
+
+ setup_deferral(dreq, item, 0);
+
+ if (!test_bit(CACHE_PENDING, &item->flags) ||
+ wait_for_completion_interruptible_timeout(
+ &sleeper.completion, req->thread_wait) <= 0) {
+ /* The completion wasn't completed, so we need
+ * to clean up
+ */
+ spin_lock(&cache_defer_lock);
+ if (!hlist_unhashed(&sleeper.handle.hash)) {
+ __unhash_deferred_req(&sleeper.handle);
+ spin_unlock(&cache_defer_lock);
+ } else {
+ /* cache_revisit_request already removed
+ * this from the hash table, but hasn't
+ * called ->revisit yet. It will very soon
+ * and we need to wait for it.
+ */
+ spin_unlock(&cache_defer_lock);
+ wait_for_completion(&sleeper.completion);
+ }
+ }
+}
+
+static void cache_limit_defers(void)
+{
+ /* Make sure we haven't exceed the limit of allowed deferred
+ * requests.
+ */
+ struct cache_deferred_req *discard = NULL;
+
+ if (cache_defer_cnt <= DFR_MAX)
+ return;
+
+ spin_lock(&cache_defer_lock);
+
+ /* Consider removing either the first or the last */
+ if (cache_defer_cnt > DFR_MAX) {
+ if (prandom_u32() & 1)
+ discard = list_entry(cache_defer_list.next,
+ struct cache_deferred_req, recent);
+ else
+ discard = list_entry(cache_defer_list.prev,
+ struct cache_deferred_req, recent);
+ __unhash_deferred_req(discard);
+ }
+ spin_unlock(&cache_defer_lock);
+ if (discard)
+ discard->revisit(discard, 1);
+}
+
+/* Return true if and only if a deferred request is queued. */
+static bool cache_defer_req(struct cache_req *req, struct cache_head *item)
+{
+ struct cache_deferred_req *dreq;
+
+ if (req->thread_wait) {
+ cache_wait_req(req, item);
+ if (!test_bit(CACHE_PENDING, &item->flags))
+ return false;
+ }
+ dreq = req->defer(req);
+ if (dreq == NULL)
+ return false;
+ setup_deferral(dreq, item, 1);
+ if (!test_bit(CACHE_PENDING, &item->flags))
+ /* Bit could have been cleared before we managed to
+ * set up the deferral, so need to revisit just in case
+ */
+ cache_revisit_request(item);
+
+ cache_limit_defers();
+ return true;
+}
+
+static void cache_revisit_request(struct cache_head *item)
+{
+ struct cache_deferred_req *dreq;
+ struct list_head pending;
+ struct hlist_node *tmp;
+ int hash = DFR_HASH(item);
+
+ INIT_LIST_HEAD(&pending);
+ spin_lock(&cache_defer_lock);
+
+ hlist_for_each_entry_safe(dreq, tmp, &cache_defer_hash[hash], hash)
+ if (dreq->item == item) {
+ __unhash_deferred_req(dreq);
+ list_add(&dreq->recent, &pending);
+ }
+
+ spin_unlock(&cache_defer_lock);
+
+ while (!list_empty(&pending)) {
+ dreq = list_entry(pending.next, struct cache_deferred_req, recent);
+ list_del_init(&dreq->recent);
+ dreq->revisit(dreq, 0);
+ }
+}
+
+void cache_clean_deferred(void *owner)
+{
+ struct cache_deferred_req *dreq, *tmp;
+ struct list_head pending;
+
+
+ INIT_LIST_HEAD(&pending);
+ spin_lock(&cache_defer_lock);
+
+ list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) {
+ if (dreq->owner == owner) {
+ __unhash_deferred_req(dreq);
+ list_add(&dreq->recent, &pending);
+ }
+ }
+ spin_unlock(&cache_defer_lock);
+
+ while (!list_empty(&pending)) {
+ dreq = list_entry(pending.next, struct cache_deferred_req, recent);
+ list_del_init(&dreq->recent);
+ dreq->revisit(dreq, 1);
+ }
+}
+
+/*
+ * communicate with user-space
+ *
+ * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
+ * On read, you get a full request, or block.
+ * On write, an update request is processed.
+ * Poll works if anything to read, and always allows write.
+ *
+ * Implemented by linked list of requests. Each open file has
+ * a ->private that also exists in this list. New requests are added
+ * to the end and may wakeup and preceding readers.
+ * New readers are added to the head. If, on read, an item is found with
+ * CACHE_UPCALLING clear, we free it from the list.
+ *
+ */
+
+static DEFINE_SPINLOCK(queue_lock);
+static DEFINE_MUTEX(queue_io_mutex);
+
+struct cache_queue {
+ struct list_head list;
+ int reader; /* if 0, then request */
+};
+struct cache_request {
+ struct cache_queue q;
+ struct cache_head *item;
+ char * buf;
+ int len;
+ int readers;
+};
+struct cache_reader {
+ struct cache_queue q;
+ int offset; /* if non-0, we have a refcnt on next request */
+};
+
+static int cache_request(struct cache_detail *detail,
+ struct cache_request *crq)
+{
+ char *bp = crq->buf;
+ int len = PAGE_SIZE;
+
+ detail->cache_request(detail, crq->item, &bp, &len);
+ if (len < 0)
+ return -EAGAIN;
+ return PAGE_SIZE - len;
+}
+
+static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
+ loff_t *ppos, struct cache_detail *cd)
+{
+ struct cache_reader *rp = filp->private_data;
+ struct cache_request *rq;
+ struct inode *inode = file_inode(filp);
+ int err;
+
+ if (count == 0)
+ return 0;
+
+ mutex_lock(&inode->i_mutex); /* protect against multiple concurrent
+ * readers on this file */
+ again:
+ spin_lock(&queue_lock);
+ /* need to find next request */
+ while (rp->q.list.next != &cd->queue &&
+ list_entry(rp->q.list.next, struct cache_queue, list)
+ ->reader) {
+ struct list_head *next = rp->q.list.next;
+ list_move(&rp->q.list, next);
+ }
+ if (rp->q.list.next == &cd->queue) {
+ spin_unlock(&queue_lock);
+ mutex_unlock(&inode->i_mutex);
+ WARN_ON_ONCE(rp->offset);
+ return 0;
+ }
+ rq = container_of(rp->q.list.next, struct cache_request, q.list);
+ WARN_ON_ONCE(rq->q.reader);
+ if (rp->offset == 0)
+ rq->readers++;
+ spin_unlock(&queue_lock);
+
+ if (rq->len == 0) {
+ err = cache_request(cd, rq);
+ if (err < 0)
+ goto out;
+ rq->len = err;
+ }
+
+ if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) {
+ err = -EAGAIN;
+ spin_lock(&queue_lock);
+ list_move(&rp->q.list, &rq->q.list);
+ spin_unlock(&queue_lock);
+ } else {
+ if (rp->offset + count > rq->len)
+ count = rq->len - rp->offset;
+ err = -EFAULT;
+ if (copy_to_user(buf, rq->buf + rp->offset, count))
+ goto out;
+ rp->offset += count;
+ if (rp->offset >= rq->len) {
+ rp->offset = 0;
+ spin_lock(&queue_lock);
+ list_move(&rp->q.list, &rq->q.list);
+ spin_unlock(&queue_lock);
+ }
+ err = 0;
+ }
+ out:
+ if (rp->offset == 0) {
+ /* need to release rq */
+ spin_lock(&queue_lock);
+ rq->readers--;
+ if (rq->readers == 0 &&
+ !test_bit(CACHE_PENDING, &rq->item->flags)) {
+ list_del(&rq->q.list);
+ spin_unlock(&queue_lock);
+ cache_put(rq->item, cd);
+ kfree(rq->buf);
+ kfree(rq);
+ } else
+ spin_unlock(&queue_lock);
+ }
+ if (err == -EAGAIN)
+ goto again;
+ mutex_unlock(&inode->i_mutex);
+ return err ? err : count;
+}
+
+static ssize_t cache_do_downcall(char *kaddr, const char __user *buf,
+ size_t count, struct cache_detail *cd)
+{
+ ssize_t ret;
+
+ if (count == 0)
+ return -EINVAL;
+ if (copy_from_user(kaddr, buf, count))
+ return -EFAULT;
+ kaddr[count] = '\0';
+ ret = cd->cache_parse(cd, kaddr, count);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+static ssize_t cache_slow_downcall(const char __user *buf,
+ size_t count, struct cache_detail *cd)
+{
+ static char write_buf[8192]; /* protected by queue_io_mutex */
+ ssize_t ret = -EINVAL;
+
+ if (count >= sizeof(write_buf))
+ goto out;
+ mutex_lock(&queue_io_mutex);
+ ret = cache_do_downcall(write_buf, buf, count, cd);
+ mutex_unlock(&queue_io_mutex);
+out:
+ return ret;
+}
+
+static ssize_t cache_downcall(struct address_space *mapping,
+ const char __user *buf,
+ size_t count, struct cache_detail *cd)
+{
+ struct page *page;
+ char *kaddr;
+ ssize_t ret = -ENOMEM;
+
+ if (count >= PAGE_CACHE_SIZE)
+ goto out_slow;
+
+ page = find_or_create_page(mapping, 0, GFP_KERNEL);
+ if (!page)
+ goto out_slow;
+
+ kaddr = kmap(page);
+ ret = cache_do_downcall(kaddr, buf, count, cd);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ return ret;
+out_slow:
+ return cache_slow_downcall(buf, count, cd);
+}
+
+static ssize_t cache_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *ppos,
+ struct cache_detail *cd)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = file_inode(filp);
+ ssize_t ret = -EINVAL;
+
+ if (!cd->cache_parse)
+ goto out;
+
+ mutex_lock(&inode->i_mutex);
+ ret = cache_downcall(mapping, buf, count, cd);
+ mutex_unlock(&inode->i_mutex);
+out:
+ return ret;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(queue_wait);
+
+static unsigned int cache_poll(struct file *filp, poll_table *wait,
+ struct cache_detail *cd)
+{
+ unsigned int mask;
+ struct cache_reader *rp = filp->private_data;
+ struct cache_queue *cq;
+
+ poll_wait(filp, &queue_wait, wait);
+
+ /* alway allow write */
+ mask = POLLOUT | POLLWRNORM;
+
+ if (!rp)
+ return mask;
+
+ spin_lock(&queue_lock);
+
+ for (cq= &rp->q; &cq->list != &cd->queue;
+ cq = list_entry(cq->list.next, struct cache_queue, list))
+ if (!cq->reader) {
+ mask |= POLLIN | POLLRDNORM;
+ break;
+ }
+ spin_unlock(&queue_lock);
+ return mask;
+}
+
+static int cache_ioctl(struct inode *ino, struct file *filp,
+ unsigned int cmd, unsigned long arg,
+ struct cache_detail *cd)
+{
+ int len = 0;
+ struct cache_reader *rp = filp->private_data;
+ struct cache_queue *cq;
+
+ if (cmd != FIONREAD || !rp)
+ return -EINVAL;
+
+ spin_lock(&queue_lock);
+
+ /* only find the length remaining in current request,
+ * or the length of the next request
+ */
+ for (cq= &rp->q; &cq->list != &cd->queue;
+ cq = list_entry(cq->list.next, struct cache_queue, list))
+ if (!cq->reader) {
+ struct cache_request *cr =
+ container_of(cq, struct cache_request, q);
+ len = cr->len - rp->offset;
+ break;
+ }
+ spin_unlock(&queue_lock);
+
+ return put_user(len, (int __user *)arg);
+}
+
+static int cache_open(struct inode *inode, struct file *filp,
+ struct cache_detail *cd)
+{
+ struct cache_reader *rp = NULL;
+
+ if (!cd || !try_module_get(cd->owner))
+ return -EACCES;
+ nonseekable_open(inode, filp);
+ if (filp->f_mode & FMODE_READ) {
+ rp = kmalloc(sizeof(*rp), GFP_KERNEL);
+ if (!rp) {
+ module_put(cd->owner);
+ return -ENOMEM;
+ }
+ rp->offset = 0;
+ rp->q.reader = 1;
+ atomic_inc(&cd->readers);
+ spin_lock(&queue_lock);
+ list_add(&rp->q.list, &cd->queue);
+ spin_unlock(&queue_lock);
+ }
+ filp->private_data = rp;
+ return 0;
+}
+
+static int cache_release(struct inode *inode, struct file *filp,
+ struct cache_detail *cd)
+{
+ struct cache_reader *rp = filp->private_data;
+
+ if (rp) {
+ spin_lock(&queue_lock);
+ if (rp->offset) {
+ struct cache_queue *cq;
+ for (cq= &rp->q; &cq->list != &cd->queue;
+ cq = list_entry(cq->list.next, struct cache_queue, list))
+ if (!cq->reader) {
+ container_of(cq, struct cache_request, q)
+ ->readers--;
+ break;
+ }
+ rp->offset = 0;
+ }
+ list_del(&rp->q.list);
+ spin_unlock(&queue_lock);
+
+ filp->private_data = NULL;
+ kfree(rp);
+
+ cd->last_close = seconds_since_boot();
+ atomic_dec(&cd->readers);
+ }
+ module_put(cd->owner);
+ return 0;
+}
+
+
+
+static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch)
+{
+ struct cache_queue *cq, *tmp;
+ struct cache_request *cr;
+ struct list_head dequeued;
+
+ INIT_LIST_HEAD(&dequeued);
+ spin_lock(&queue_lock);
+ list_for_each_entry_safe(cq, tmp, &detail->queue, list)
+ if (!cq->reader) {
+ cr = container_of(cq, struct cache_request, q);
+ if (cr->item != ch)
+ continue;
+ if (test_bit(CACHE_PENDING, &ch->flags))
+ /* Lost a race and it is pending again */
+ break;
+ if (cr->readers != 0)
+ continue;
+ list_move(&cr->q.list, &dequeued);
+ }
+ spin_unlock(&queue_lock);
+ while (!list_empty(&dequeued)) {
+ cr = list_entry(dequeued.next, struct cache_request, q.list);
+ list_del(&cr->q.list);
+ cache_put(cr->item, detail);
+ kfree(cr->buf);
+ kfree(cr);
+ }
+}
+
+/*
+ * Support routines for text-based upcalls.
+ * Fields are separated by spaces.
+ * Fields are either mangled to quote space tab newline slosh with slosh
+ * or a hexified with a leading \x
+ * Record is terminated with newline.
+ *
+ */
+
+void qword_add(char **bpp, int *lp, char *str)
+{
+ char *bp = *bpp;
+ int len = *lp;
+ int ret;
+
+ if (len < 0) return;
+
+ ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t");
+ if (ret >= len) {
+ bp += len;
+ len = -1;
+ } else {
+ bp += ret;
+ len -= ret;
+ *bp++ = ' ';
+ len--;
+ }
+ *bpp = bp;
+ *lp = len;
+}
+EXPORT_SYMBOL_GPL(qword_add);
+
+void qword_addhex(char **bpp, int *lp, char *buf, int blen)
+{
+ char *bp = *bpp;
+ int len = *lp;
+
+ if (len < 0) return;
+
+ if (len > 2) {
+ *bp++ = '\\';
+ *bp++ = 'x';
+ len -= 2;
+ while (blen && len >= 2) {
+ bp = hex_byte_pack(bp, *buf++);
+ len -= 2;
+ blen--;
+ }
+ }
+ if (blen || len<1) len = -1;
+ else {
+ *bp++ = ' ';
+ len--;
+ }
+ *bpp = bp;
+ *lp = len;
+}
+EXPORT_SYMBOL_GPL(qword_addhex);
+
+static void warn_no_listener(struct cache_detail *detail)
+{
+ if (detail->last_warn != detail->last_close) {
+ detail->last_warn = detail->last_close;
+ if (detail->warn_no_listener)
+ detail->warn_no_listener(detail, detail->last_close != 0);
+ }
+}
+
+static bool cache_listeners_exist(struct cache_detail *detail)
+{
+ if (atomic_read(&detail->readers))
+ return true;
+ if (detail->last_close == 0)
+ /* This cache was never opened */
+ return false;
+ if (detail->last_close < seconds_since_boot() - 30)
+ /*
+ * We allow for the possibility that someone might
+ * restart a userspace daemon without restarting the
+ * server; but after 30 seconds, we give up.
+ */
+ return false;
+ return true;
+}
+
+/*
+ * register an upcall request to user-space and queue it up for read() by the
+ * upcall daemon.
+ *
+ * Each request is at most one page long.
+ */
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+{
+
+ char *buf;
+ struct cache_request *crq;
+ int ret = 0;
+
+ if (!detail->cache_request)
+ return -EINVAL;
+
+ if (!cache_listeners_exist(detail)) {
+ warn_no_listener(detail);
+ return -EINVAL;
+ }
+ if (test_bit(CACHE_CLEANED, &h->flags))
+ /* Too late to make an upcall */
+ return -EAGAIN;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -EAGAIN;
+
+ crq = kmalloc(sizeof (*crq), GFP_KERNEL);
+ if (!crq) {
+ kfree(buf);
+ return -EAGAIN;
+ }
+
+ crq->q.reader = 0;
+ crq->item = cache_get(h);
+ crq->buf = buf;
+ crq->len = 0;
+ crq->readers = 0;
+ spin_lock(&queue_lock);
+ if (test_bit(CACHE_PENDING, &h->flags))
+ list_add_tail(&crq->q.list, &detail->queue);
+ else
+ /* Lost a race, no longer PENDING, so don't enqueue */
+ ret = -EAGAIN;
+ spin_unlock(&queue_lock);
+ wake_up(&queue_wait);
+ if (ret == -EAGAIN) {
+ kfree(buf);
+ kfree(crq);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
+
+/*
+ * parse a message from user-space and pass it
+ * to an appropriate cache
+ * Messages are, like requests, separated into fields by
+ * spaces and dequotes as \xHEXSTRING or embedded \nnn octal
+ *
+ * Message is
+ * reply cachename expiry key ... content....
+ *
+ * key and content are both parsed by cache
+ */
+
+int qword_get(char **bpp, char *dest, int bufsize)
+{
+ /* return bytes copied, or -1 on error */
+ char *bp = *bpp;
+ int len = 0;
+
+ while (*bp == ' ') bp++;
+
+ if (bp[0] == '\\' && bp[1] == 'x') {
+ /* HEX STRING */
+ bp += 2;
+ while (len < bufsize) {
+ int h, l;
+
+ h = hex_to_bin(bp[0]);
+ if (h < 0)
+ break;
+
+ l = hex_to_bin(bp[1]);
+ if (l < 0)
+ break;
+
+ *dest++ = (h << 4) | l;
+ bp += 2;
+ len++;
+ }
+ } else {
+ /* text with \nnn octal quoting */
+ while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) {
+ if (*bp == '\\' &&
+ isodigit(bp[1]) && (bp[1] <= '3') &&
+ isodigit(bp[2]) &&
+ isodigit(bp[3])) {
+ int byte = (*++bp -'0');
+ bp++;
+ byte = (byte << 3) | (*bp++ - '0');
+ byte = (byte << 3) | (*bp++ - '0');
+ *dest++ = byte;
+ len++;
+ } else {
+ *dest++ = *bp++;
+ len++;
+ }
+ }
+ }
+
+ if (*bp != ' ' && *bp != '\n' && *bp != '\0')
+ return -1;
+ while (*bp == ' ') bp++;
+ *bpp = bp;
+ *dest = '\0';
+ return len;
+}
+EXPORT_SYMBOL_GPL(qword_get);
+
+
+/*
+ * support /proc/sunrpc/cache/$CACHENAME/content
+ * as a seqfile.
+ * We call ->cache_show passing NULL for the item to
+ * get a header, then pass each real item in the cache
+ */
+
+struct handle {
+ struct cache_detail *cd;
+};
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(cd->hash_lock)
+{
+ loff_t n = *pos;
+ unsigned int hash, entry;
+ struct cache_head *ch;
+ struct cache_detail *cd = ((struct handle*)m->private)->cd;
+
+
+ read_lock(&cd->hash_lock);
+ if (!n--)
+ return SEQ_START_TOKEN;
+ hash = n >> 32;
+ entry = n & ((1LL<<32) - 1);
+
+ for (ch=cd->hash_table[hash]; ch; ch=ch->next)
+ if (!entry--)
+ return ch;
+ n &= ~((1LL<<32) - 1);
+ do {
+ hash++;
+ n += 1LL<<32;
+ } while(hash < cd->hash_size &&
+ cd->hash_table[hash]==NULL);
+ if (hash >= cd->hash_size)
+ return NULL;
+ *pos = n+1;
+ return cd->hash_table[hash];
+}
+
+static void *c_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct cache_head *ch = p;
+ int hash = (*pos >> 32);
+ struct cache_detail *cd = ((struct handle*)m->private)->cd;
+
+ if (p == SEQ_START_TOKEN)
+ hash = 0;
+ else if (ch->next == NULL) {
+ hash++;
+ *pos += 1LL<<32;
+ } else {
+ ++*pos;
+ return ch->next;
+ }
+ *pos &= ~((1LL<<32) - 1);
+ while (hash < cd->hash_size &&
+ cd->hash_table[hash] == NULL) {
+ hash++;
+ *pos += 1LL<<32;
+ }
+ if (hash >= cd->hash_size)
+ return NULL;
+ ++*pos;
+ return cd->hash_table[hash];
+}
+
+static void c_stop(struct seq_file *m, void *p)
+ __releases(cd->hash_lock)
+{
+ struct cache_detail *cd = ((struct handle*)m->private)->cd;
+ read_unlock(&cd->hash_lock);
+}
+
+static int c_show(struct seq_file *m, void *p)
+{
+ struct cache_head *cp = p;
+ struct cache_detail *cd = ((struct handle*)m->private)->cd;
+
+ if (p == SEQ_START_TOKEN)
+ return cd->cache_show(m, cd, NULL);
+
+ ifdebug(CACHE)
+ seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
+ convert_to_wallclock(cp->expiry_time),
+ atomic_read(&cp->ref.refcount), cp->flags);
+ cache_get(cp);
+ if (cache_check(cd, cp, NULL))
+ /* cache_check does a cache_put on failure */
+ seq_printf(m, "# ");
+ else {
+ if (cache_is_expired(cd, cp))
+ seq_printf(m, "# ");
+ cache_put(cp, cd);
+ }
+
+ return cd->cache_show(m, cd, cp);
+}
+
+static const struct seq_operations cache_content_op = {
+ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+ .show = c_show,
+};
+
+static int content_open(struct inode *inode, struct file *file,
+ struct cache_detail *cd)
+{
+ struct handle *han;
+
+ if (!cd || !try_module_get(cd->owner))
+ return -EACCES;
+ han = __seq_open_private(file, &cache_content_op, sizeof(*han));
+ if (han == NULL) {
+ module_put(cd->owner);
+ return -ENOMEM;
+ }
+
+ han->cd = cd;
+ return 0;
+}
+
+static int content_release(struct inode *inode, struct file *file,
+ struct cache_detail *cd)
+{
+ int ret = seq_release_private(inode, file);
+ module_put(cd->owner);
+ return ret;
+}
+
+static int open_flush(struct inode *inode, struct file *file,
+ struct cache_detail *cd)
+{
+ if (!cd || !try_module_get(cd->owner))
+ return -EACCES;
+ return nonseekable_open(inode, file);
+}
+
+static int release_flush(struct inode *inode, struct file *file,
+ struct cache_detail *cd)
+{
+ module_put(cd->owner);
+ return 0;
+}
+
+static ssize_t read_flush(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ struct cache_detail *cd)
+{
+ char tbuf[22];
+ unsigned long p = *ppos;
+ size_t len;
+
+ snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time));
+ len = strlen(tbuf);
+ if (p >= len)
+ return 0;
+ len -= p;
+ if (len > count)
+ len = count;
+ if (copy_to_user(buf, (void*)(tbuf+p), len))
+ return -EFAULT;
+ *ppos += len;
+ return len;
+}
+
+static ssize_t write_flush(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos,
+ struct cache_detail *cd)
+{
+ char tbuf[20];
+ char *bp, *ep;
+
+ if (*ppos || count > sizeof(tbuf)-1)
+ return -EINVAL;
+ if (copy_from_user(tbuf, buf, count))
+ return -EFAULT;
+ tbuf[count] = 0;
+ simple_strtoul(tbuf, &ep, 0);
+ if (*ep && *ep != '\n')
+ return -EINVAL;
+
+ bp = tbuf;
+ cd->flush_time = get_expiry(&bp);
+ cd->nextcheck = seconds_since_boot();
+ cache_flush();
+
+ *ppos += count;
+ return count;
+}
+
+static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
+
+ return cache_read(filp, buf, count, ppos, cd);
+}
+
+static ssize_t cache_write_procfs(struct file *filp, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
+
+ return cache_write(filp, buf, count, ppos, cd);
+}
+
+static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
+{
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
+
+ return cache_poll(filp, wait, cd);
+}
+
+static long cache_ioctl_procfs(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return cache_ioctl(inode, filp, cmd, arg, cd);
+}
+
+static int cache_open_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return cache_open(inode, filp, cd);
+}
+
+static int cache_release_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return cache_release(inode, filp, cd);
+}
+
+static const struct file_operations cache_file_operations_procfs = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = cache_read_procfs,
+ .write = cache_write_procfs,
+ .poll = cache_poll_procfs,
+ .unlocked_ioctl = cache_ioctl_procfs, /* for FIONREAD */
+ .open = cache_open_procfs,
+ .release = cache_release_procfs,
+};
+
+static int content_open_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return content_open(inode, filp, cd);
+}
+
+static int content_release_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return content_release(inode, filp, cd);
+}
+
+static const struct file_operations content_file_operations_procfs = {
+ .open = content_open_procfs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = content_release_procfs,
+};
+
+static int open_flush_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return open_flush(inode, filp, cd);
+}
+
+static int release_flush_procfs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = PDE_DATA(inode);
+
+ return release_flush(inode, filp, cd);
+}
+
+static ssize_t read_flush_procfs(struct file *filp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
+
+ return read_flush(filp, buf, count, ppos, cd);
+}
+
+static ssize_t write_flush_procfs(struct file *filp,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = PDE_DATA(file_inode(filp));
+
+ return write_flush(filp, buf, count, ppos, cd);
+}
+
+static const struct file_operations cache_flush_operations_procfs = {
+ .open = open_flush_procfs,
+ .read = read_flush_procfs,
+ .write = write_flush_procfs,
+ .release = release_flush_procfs,
+ .llseek = no_llseek,
+};
+
+static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+ struct sunrpc_net *sn;
+
+ if (cd->u.procfs.proc_ent == NULL)
+ return;
+ if (cd->u.procfs.flush_ent)
+ remove_proc_entry("flush", cd->u.procfs.proc_ent);
+ if (cd->u.procfs.channel_ent)
+ remove_proc_entry("channel", cd->u.procfs.proc_ent);
+ if (cd->u.procfs.content_ent)
+ remove_proc_entry("content", cd->u.procfs.proc_ent);
+ cd->u.procfs.proc_ent = NULL;
+ sn = net_generic(net, sunrpc_net_id);
+ remove_proc_entry(cd->name, sn->proc_net_rpc);
+}
+
+#ifdef CONFIG_PROC_FS
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+ struct proc_dir_entry *p;
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
+ if (cd->u.procfs.proc_ent == NULL)
+ goto out_nomem;
+ cd->u.procfs.channel_ent = NULL;
+ cd->u.procfs.content_ent = NULL;
+
+ p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
+ cd->u.procfs.proc_ent,
+ &cache_flush_operations_procfs, cd);
+ cd->u.procfs.flush_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+
+ if (cd->cache_request || cd->cache_parse) {
+ p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
+ cd->u.procfs.proc_ent,
+ &cache_file_operations_procfs, cd);
+ cd->u.procfs.channel_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+ }
+ if (cd->cache_show) {
+ p = proc_create_data("content", S_IFREG|S_IRUSR,
+ cd->u.procfs.proc_ent,
+ &content_file_operations_procfs, cd);
+ cd->u.procfs.content_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+ }
+ return 0;
+out_nomem:
+ remove_cache_proc_entries(cd, net);
+ return -ENOMEM;
+}
+#else /* CONFIG_PROC_FS */
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+ return 0;
+}
+#endif
+
+void __init cache_initialize(void)
+{
+ INIT_DEFERRABLE_WORK(&cache_cleaner, do_cache_clean);
+}
+
+int cache_register_net(struct cache_detail *cd, struct net *net)
+{
+ int ret;
+
+ sunrpc_init_cache_detail(cd);
+ ret = create_cache_proc_entries(cd, net);
+ if (ret)
+ sunrpc_destroy_cache_detail(cd);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cache_register_net);
+
+void cache_unregister_net(struct cache_detail *cd, struct net *net)
+{
+ remove_cache_proc_entries(cd, net);
+ sunrpc_destroy_cache_detail(cd);
+}
+EXPORT_SYMBOL_GPL(cache_unregister_net);
+
+struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net)
+{
+ struct cache_detail *cd;
+
+ cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL);
+ if (cd == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *),
+ GFP_KERNEL);
+ if (cd->hash_table == NULL) {
+ kfree(cd);
+ return ERR_PTR(-ENOMEM);
+ }
+ cd->net = net;
+ return cd;
+}
+EXPORT_SYMBOL_GPL(cache_create_net);
+
+void cache_destroy_net(struct cache_detail *cd, struct net *net)
+{
+ kfree(cd->hash_table);
+ kfree(cd);
+}
+EXPORT_SYMBOL_GPL(cache_destroy_net);
+
+static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
+
+ return cache_read(filp, buf, count, ppos, cd);
+}
+
+static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
+
+ return cache_write(filp, buf, count, ppos, cd);
+}
+
+static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait)
+{
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
+
+ return cache_poll(filp, wait, cd);
+}
+
+static long cache_ioctl_pipefs(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return cache_ioctl(inode, filp, cmd, arg, cd);
+}
+
+static int cache_open_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return cache_open(inode, filp, cd);
+}
+
+static int cache_release_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return cache_release(inode, filp, cd);
+}
+
+const struct file_operations cache_file_operations_pipefs = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = cache_read_pipefs,
+ .write = cache_write_pipefs,
+ .poll = cache_poll_pipefs,
+ .unlocked_ioctl = cache_ioctl_pipefs, /* for FIONREAD */
+ .open = cache_open_pipefs,
+ .release = cache_release_pipefs,
+};
+
+static int content_open_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return content_open(inode, filp, cd);
+}
+
+static int content_release_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return content_release(inode, filp, cd);
+}
+
+const struct file_operations content_file_operations_pipefs = {
+ .open = content_open_pipefs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = content_release_pipefs,
+};
+
+static int open_flush_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return open_flush(inode, filp, cd);
+}
+
+static int release_flush_pipefs(struct inode *inode, struct file *filp)
+{
+ struct cache_detail *cd = RPC_I(inode)->private;
+
+ return release_flush(inode, filp, cd);
+}
+
+static ssize_t read_flush_pipefs(struct file *filp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
+
+ return read_flush(filp, buf, count, ppos, cd);
+}
+
+static ssize_t write_flush_pipefs(struct file *filp,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct cache_detail *cd = RPC_I(file_inode(filp))->private;
+
+ return write_flush(filp, buf, count, ppos, cd);
+}
+
+const struct file_operations cache_flush_operations_pipefs = {
+ .open = open_flush_pipefs,
+ .read = read_flush_pipefs,
+ .write = write_flush_pipefs,
+ .release = release_flush_pipefs,
+ .llseek = no_llseek,
+};
+
+int sunrpc_cache_register_pipefs(struct dentry *parent,
+ const char *name, umode_t umode,
+ struct cache_detail *cd)
+{
+ struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ cd->u.pipefs.dir = dir;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
+
+void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
+{
+ rpc_remove_cache_dir(cd->u.pipefs.dir);
+ cd->u.pipefs.dir = NULL;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
+
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
new file mode 100644
index 000000000..e6ce15173
--- /dev/null
+++ b/net/sunrpc/clnt.c
@@ -0,0 +1,2478 @@
+/*
+ * linux/net/sunrpc/clnt.c
+ *
+ * This file contains the high-level RPC interface.
+ * It is modeled as a finite state machine to support both synchronous
+ * and asynchronous requests.
+ *
+ * - RPC header generation and argument serialization.
+ * - Credential refresh.
+ * - TCP connect handling.
+ * - Retry of operation when it is suspected the operation failed because
+ * of uid squashing on the server, or when the credentials were stale
+ * and need to be refreshed, or when a packet was damaged in transit.
+ * This may be have to be moved to the VFS layer.
+ *
+ * Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com>
+ * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/utsname.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/un.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
+#include <trace/events/sunrpc.h>
+
+#include "sunrpc.h"
+#include "netns.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_CALL
+#endif
+
+#define dprint_status(t) \
+ dprintk("RPC: %5u %s (status %d)\n", t->tk_pid, \
+ __func__, t->tk_status)
+
+/*
+ * All RPC clients are linked into this list
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(destroy_wait);
+
+
+static void call_start(struct rpc_task *task);
+static void call_reserve(struct rpc_task *task);
+static void call_reserveresult(struct rpc_task *task);
+static void call_allocate(struct rpc_task *task);
+static void call_decode(struct rpc_task *task);
+static void call_bind(struct rpc_task *task);
+static void call_bind_status(struct rpc_task *task);
+static void call_transmit(struct rpc_task *task);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void call_bc_transmit(struct rpc_task *task);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+static void call_status(struct rpc_task *task);
+static void call_transmit_status(struct rpc_task *task);
+static void call_refresh(struct rpc_task *task);
+static void call_refreshresult(struct rpc_task *task);
+static void call_timeout(struct rpc_task *task);
+static void call_connect(struct rpc_task *task);
+static void call_connect_status(struct rpc_task *task);
+
+static __be32 *rpc_encode_header(struct rpc_task *task);
+static __be32 *rpc_verify_header(struct rpc_task *task);
+static int rpc_ping(struct rpc_clnt *clnt);
+
+static void rpc_register_client(struct rpc_clnt *clnt)
+{
+ struct net *net = rpc_net_ns(clnt);
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ spin_lock(&sn->rpc_client_lock);
+ list_add(&clnt->cl_clients, &sn->all_clients);
+ spin_unlock(&sn->rpc_client_lock);
+}
+
+static void rpc_unregister_client(struct rpc_clnt *clnt)
+{
+ struct net *net = rpc_net_ns(clnt);
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ spin_lock(&sn->rpc_client_lock);
+ list_del(&clnt->cl_clients);
+ spin_unlock(&sn->rpc_client_lock);
+}
+
+static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
+{
+ rpc_remove_client_dir(clnt);
+}
+
+static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
+{
+ struct net *net = rpc_net_ns(clnt);
+ struct super_block *pipefs_sb;
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ __rpc_clnt_remove_pipedir(clnt);
+ rpc_put_sb_net(net);
+ }
+}
+
+static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
+ struct rpc_clnt *clnt)
+{
+ static uint32_t clntid;
+ const char *dir_name = clnt->cl_program->pipe_dir_name;
+ char name[15];
+ struct dentry *dir, *dentry;
+
+ dir = rpc_d_lookup_sb(sb, dir_name);
+ if (dir == NULL) {
+ pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name);
+ return dir;
+ }
+ for (;;) {
+ snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
+ name[sizeof(name) - 1] = '\0';
+ dentry = rpc_create_client_dir(dir, name, clnt);
+ if (!IS_ERR(dentry))
+ break;
+ if (dentry == ERR_PTR(-EEXIST))
+ continue;
+ printk(KERN_INFO "RPC: Couldn't create pipefs entry"
+ " %s/%s, error %ld\n",
+ dir_name, name, PTR_ERR(dentry));
+ break;
+ }
+ dput(dir);
+ return dentry;
+}
+
+static int
+rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt)
+{
+ struct dentry *dentry;
+
+ if (clnt->cl_program->pipe_dir_name != NULL) {
+ dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ return 0;
+}
+
+static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
+{
+ if (clnt->cl_program->pipe_dir_name == NULL)
+ return 1;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ if (clnt->cl_pipedir_objects.pdh_dentry != NULL)
+ return 1;
+ if (atomic_read(&clnt->cl_count) == 0)
+ return 1;
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ if (clnt->cl_pipedir_objects.pdh_dentry == NULL)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
+ struct super_block *sb)
+{
+ struct dentry *dentry;
+ int err = 0;
+
+ switch (event) {
+ case RPC_PIPEFS_MOUNT:
+ dentry = rpc_setup_pipedir_sb(sb, clnt);
+ if (!dentry)
+ return -ENOENT;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ break;
+ case RPC_PIPEFS_UMOUNT:
+ __rpc_clnt_remove_pipedir(clnt);
+ break;
+ default:
+ printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
+ return -ENOTSUPP;
+ }
+ return err;
+}
+
+static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
+ struct super_block *sb)
+{
+ int error = 0;
+
+ for (;; clnt = clnt->cl_parent) {
+ if (!rpc_clnt_skip_event(clnt, event))
+ error = __rpc_clnt_handle_event(clnt, event, sb);
+ if (error || clnt == clnt->cl_parent)
+ break;
+ }
+ return error;
+}
+
+static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt;
+
+ spin_lock(&sn->rpc_client_lock);
+ list_for_each_entry(clnt, &sn->all_clients, cl_clients) {
+ if (rpc_clnt_skip_event(clnt, event))
+ continue;
+ spin_unlock(&sn->rpc_client_lock);
+ return clnt;
+ }
+ spin_unlock(&sn->rpc_client_lock);
+ return NULL;
+}
+
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct super_block *sb = ptr;
+ struct rpc_clnt *clnt;
+ int error = 0;
+
+ while ((clnt = rpc_get_client_for_event(sb->s_fs_info, event))) {
+ error = __rpc_pipefs_event(clnt, event, sb);
+ if (error)
+ break;
+ }
+ return error;
+}
+
+static struct notifier_block rpc_clients_block = {
+ .notifier_call = rpc_pipefs_event,
+ .priority = SUNRPC_PIPEFS_RPC_PRIO,
+};
+
+int rpc_clients_notifier_register(void)
+{
+ return rpc_pipefs_notifier_register(&rpc_clients_block);
+}
+
+void rpc_clients_notifier_unregister(void)
+{
+ return rpc_pipefs_notifier_unregister(&rpc_clients_block);
+}
+
+static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ const struct rpc_timeout *timeout)
+{
+ struct rpc_xprt *old;
+
+ spin_lock(&clnt->cl_lock);
+ old = rcu_dereference_protected(clnt->cl_xprt,
+ lockdep_is_held(&clnt->cl_lock));
+
+ if (!xprt_bound(xprt))
+ clnt->cl_autobind = 1;
+
+ clnt->cl_timeout = timeout;
+ rcu_assign_pointer(clnt->cl_xprt, xprt);
+ spin_unlock(&clnt->cl_lock);
+
+ return old;
+}
+
+static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename)
+{
+ clnt->cl_nodelen = strlcpy(clnt->cl_nodename,
+ nodename, sizeof(clnt->cl_nodename));
+}
+
+static int rpc_client_register(struct rpc_clnt *clnt,
+ rpc_authflavor_t pseudoflavor,
+ const char *client_name)
+{
+ struct rpc_auth_create_args auth_args = {
+ .pseudoflavor = pseudoflavor,
+ .target_name = client_name,
+ };
+ struct rpc_auth *auth;
+ struct net *net = rpc_net_ns(clnt);
+ struct super_block *pipefs_sb;
+ int err;
+
+ rpc_clnt_debugfs_register(clnt);
+
+ pipefs_sb = rpc_get_sb_net(net);
+ if (pipefs_sb) {
+ err = rpc_setup_pipedir(pipefs_sb, clnt);
+ if (err)
+ goto out;
+ }
+
+ rpc_register_client(clnt);
+ if (pipefs_sb)
+ rpc_put_sb_net(net);
+
+ auth = rpcauth_create(&auth_args, clnt);
+ if (IS_ERR(auth)) {
+ dprintk("RPC: Couldn't create auth handle (flavor %u)\n",
+ pseudoflavor);
+ err = PTR_ERR(auth);
+ goto err_auth;
+ }
+ return 0;
+err_auth:
+ pipefs_sb = rpc_get_sb_net(net);
+ rpc_unregister_client(clnt);
+ __rpc_clnt_remove_pipedir(clnt);
+out:
+ if (pipefs_sb)
+ rpc_put_sb_net(net);
+ rpc_clnt_debugfs_unregister(clnt);
+ return err;
+}
+
+static DEFINE_IDA(rpc_clids);
+
+static int rpc_alloc_clid(struct rpc_clnt *clnt)
+{
+ int clid;
+
+ clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL);
+ if (clid < 0)
+ return clid;
+ clnt->cl_clid = clid;
+ return 0;
+}
+
+static void rpc_free_clid(struct rpc_clnt *clnt)
+{
+ ida_simple_remove(&rpc_clids, clnt->cl_clid);
+}
+
+static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
+ struct rpc_xprt *xprt,
+ struct rpc_clnt *parent)
+{
+ const struct rpc_program *program = args->program;
+ const struct rpc_version *version;
+ struct rpc_clnt *clnt = NULL;
+ const struct rpc_timeout *timeout;
+ const char *nodename = args->nodename;
+ int err;
+
+ /* sanity check the name before trying to print it */
+ dprintk("RPC: creating %s client for %s (xprt %p)\n",
+ program->name, args->servername, xprt);
+
+ err = rpciod_up();
+ if (err)
+ goto out_no_rpciod;
+
+ err = -EINVAL;
+ if (args->version >= program->nrvers)
+ goto out_err;
+ version = program->version[args->version];
+ if (version == NULL)
+ goto out_err;
+
+ err = -ENOMEM;
+ clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
+ if (!clnt)
+ goto out_err;
+ clnt->cl_parent = parent ? : clnt;
+
+ err = rpc_alloc_clid(clnt);
+ if (err)
+ goto out_no_clid;
+
+ clnt->cl_procinfo = version->procs;
+ clnt->cl_maxproc = version->nrprocs;
+ clnt->cl_prog = args->prognumber ? : program->number;
+ clnt->cl_vers = version->number;
+ clnt->cl_stats = program->stats;
+ clnt->cl_metrics = rpc_alloc_iostats(clnt);
+ rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects);
+ err = -ENOMEM;
+ if (clnt->cl_metrics == NULL)
+ goto out_no_stats;
+ clnt->cl_program = program;
+ INIT_LIST_HEAD(&clnt->cl_tasks);
+ spin_lock_init(&clnt->cl_lock);
+
+ timeout = xprt->timeout;
+ if (args->timeout != NULL) {
+ memcpy(&clnt->cl_timeout_default, args->timeout,
+ sizeof(clnt->cl_timeout_default));
+ timeout = &clnt->cl_timeout_default;
+ }
+
+ rpc_clnt_set_transport(clnt, xprt, timeout);
+
+ clnt->cl_rtt = &clnt->cl_rtt_default;
+ rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
+
+ atomic_set(&clnt->cl_count, 1);
+
+ if (nodename == NULL)
+ nodename = utsname()->nodename;
+ /* save the nodename */
+ rpc_clnt_set_nodename(clnt, nodename);
+
+ err = rpc_client_register(clnt, args->authflavor, args->client_name);
+ if (err)
+ goto out_no_path;
+ if (parent)
+ atomic_inc(&parent->cl_count);
+ return clnt;
+
+out_no_path:
+ rpc_free_iostats(clnt->cl_metrics);
+out_no_stats:
+ rpc_free_clid(clnt);
+out_no_clid:
+ kfree(clnt);
+out_err:
+ rpciod_down();
+out_no_rpciod:
+ xprt_put(xprt);
+ return ERR_PTR(err);
+}
+
+struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
+ struct rpc_xprt *xprt)
+{
+ struct rpc_clnt *clnt = NULL;
+
+ clnt = rpc_new_client(args, xprt, NULL);
+ if (IS_ERR(clnt))
+ return clnt;
+
+ if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
+ int err = rpc_ping(clnt);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ return ERR_PTR(err);
+ }
+ }
+
+ clnt->cl_softrtry = 1;
+ if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+ clnt->cl_softrtry = 0;
+
+ if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
+ clnt->cl_autobind = 1;
+ if (args->flags & RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT)
+ clnt->cl_noretranstimeo = 1;
+ if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
+ clnt->cl_discrtry = 1;
+ if (!(args->flags & RPC_CLNT_CREATE_QUIET))
+ clnt->cl_chatty = 1;
+
+ return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_create_xprt);
+
+/**
+ * rpc_create - create an RPC client and transport with one call
+ * @args: rpc_clnt create argument structure
+ *
+ * Creates and initializes an RPC transport and an RPC client.
+ *
+ * It can ping the server in order to determine if it is up, and to see if
+ * it supports this program and version. RPC_CLNT_CREATE_NOPING disables
+ * this behavior so asynchronous tasks can also use rpc_create.
+ */
+struct rpc_clnt *rpc_create(struct rpc_create_args *args)
+{
+ struct rpc_xprt *xprt;
+ struct xprt_create xprtargs = {
+ .net = args->net,
+ .ident = args->protocol,
+ .srcaddr = args->saddress,
+ .dstaddr = args->address,
+ .addrlen = args->addrsize,
+ .servername = args->servername,
+ .bc_xprt = args->bc_xprt,
+ };
+ char servername[48];
+
+ if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS)
+ xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS;
+ if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT)
+ xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT;
+ /*
+ * If the caller chooses not to specify a hostname, whip
+ * up a string representation of the passed-in address.
+ */
+ if (xprtargs.servername == NULL) {
+ struct sockaddr_un *sun =
+ (struct sockaddr_un *)args->address;
+ struct sockaddr_in *sin =
+ (struct sockaddr_in *)args->address;
+ struct sockaddr_in6 *sin6 =
+ (struct sockaddr_in6 *)args->address;
+
+ servername[0] = '\0';
+ switch (args->address->sa_family) {
+ case AF_LOCAL:
+ snprintf(servername, sizeof(servername), "%s",
+ sun->sun_path);
+ break;
+ case AF_INET:
+ snprintf(servername, sizeof(servername), "%pI4",
+ &sin->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ snprintf(servername, sizeof(servername), "%pI6",
+ &sin6->sin6_addr);
+ break;
+ default:
+ /* caller wants default server name, but
+ * address family isn't recognized. */
+ return ERR_PTR(-EINVAL);
+ }
+ xprtargs.servername = servername;
+ }
+
+ xprt = xprt_create_transport(&xprtargs);
+ if (IS_ERR(xprt))
+ return (struct rpc_clnt *)xprt;
+
+ /*
+ * By default, kernel RPC client connects from a reserved port.
+ * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters,
+ * but it is always enabled for rpciod, which handles the connect
+ * operation.
+ */
+ xprt->resvport = 1;
+ if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
+ xprt->resvport = 0;
+
+ return rpc_create_xprt(args, xprt);
+}
+EXPORT_SYMBOL_GPL(rpc_create);
+
+/*
+ * This function clones the RPC client structure. It allows us to share the
+ * same transport while varying parameters such as the authentication
+ * flavour.
+ */
+static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
+ struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ struct rpc_clnt *new;
+ int err;
+
+ err = -ENOMEM;
+ rcu_read_lock();
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ if (xprt == NULL)
+ goto out_err;
+ args->servername = xprt->servername;
+ args->nodename = clnt->cl_nodename;
+
+ new = rpc_new_client(args, xprt, clnt);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ goto out_err;
+ }
+
+ /* Turn off autobind on clones */
+ new->cl_autobind = 0;
+ new->cl_softrtry = clnt->cl_softrtry;
+ new->cl_noretranstimeo = clnt->cl_noretranstimeo;
+ new->cl_discrtry = clnt->cl_discrtry;
+ new->cl_chatty = clnt->cl_chatty;
+ return new;
+
+out_err:
+ dprintk("RPC: %s: returned error %d\n", __func__, err);
+ return ERR_PTR(err);
+}
+
+/**
+ * rpc_clone_client - Clone an RPC client structure
+ *
+ * @clnt: RPC client whose parameters are copied
+ *
+ * Returns a fresh RPC client or an ERR_PTR.
+ */
+struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)
+{
+ struct rpc_create_args args = {
+ .program = clnt->cl_program,
+ .prognumber = clnt->cl_prog,
+ .version = clnt->cl_vers,
+ .authflavor = clnt->cl_auth->au_flavor,
+ };
+ return __rpc_clone_client(&args, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_clone_client);
+
+/**
+ * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth
+ *
+ * @clnt: RPC client whose parameters are copied
+ * @flavor: security flavor for new client
+ *
+ * Returns a fresh RPC client or an ERR_PTR.
+ */
+struct rpc_clnt *
+rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+{
+ struct rpc_create_args args = {
+ .program = clnt->cl_program,
+ .prognumber = clnt->cl_prog,
+ .version = clnt->cl_vers,
+ .authflavor = flavor,
+ };
+ return __rpc_clone_client(&args, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth);
+
+/**
+ * rpc_switch_client_transport: switch the RPC transport on the fly
+ * @clnt: pointer to a struct rpc_clnt
+ * @args: pointer to the new transport arguments
+ * @timeout: pointer to the new timeout parameters
+ *
+ * This function allows the caller to switch the RPC transport for the
+ * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS
+ * server, for instance. It assumes that the caller has ensured that
+ * there are no active RPC tasks by using some form of locking.
+ *
+ * Returns zero if "clnt" is now using the new xprt. Otherwise a
+ * negative errno is returned, and "clnt" continues to use the old
+ * xprt.
+ */
+int rpc_switch_client_transport(struct rpc_clnt *clnt,
+ struct xprt_create *args,
+ const struct rpc_timeout *timeout)
+{
+ const struct rpc_timeout *old_timeo;
+ rpc_authflavor_t pseudoflavor;
+ struct rpc_xprt *xprt, *old;
+ struct rpc_clnt *parent;
+ int err;
+
+ xprt = xprt_create_transport(args);
+ if (IS_ERR(xprt)) {
+ dprintk("RPC: failed to create new xprt for clnt %p\n",
+ clnt);
+ return PTR_ERR(xprt);
+ }
+
+ pseudoflavor = clnt->cl_auth->au_flavor;
+
+ old_timeo = clnt->cl_timeout;
+ old = rpc_clnt_set_transport(clnt, xprt, timeout);
+
+ rpc_unregister_client(clnt);
+ __rpc_clnt_remove_pipedir(clnt);
+ rpc_clnt_debugfs_unregister(clnt);
+
+ /*
+ * A new transport was created. "clnt" therefore
+ * becomes the root of a new cl_parent tree. clnt's
+ * children, if it has any, still point to the old xprt.
+ */
+ parent = clnt->cl_parent;
+ clnt->cl_parent = clnt;
+
+ /*
+ * The old rpc_auth cache cannot be re-used. GSS
+ * contexts in particular are between a single
+ * client and server.
+ */
+ err = rpc_client_register(clnt, pseudoflavor, NULL);
+ if (err)
+ goto out_revert;
+
+ synchronize_rcu();
+ if (parent != clnt)
+ rpc_release_client(parent);
+ xprt_put(old);
+ dprintk("RPC: replaced xprt for clnt %p\n", clnt);
+ return 0;
+
+out_revert:
+ rpc_clnt_set_transport(clnt, old, old_timeo);
+ clnt->cl_parent = parent;
+ rpc_client_register(clnt, pseudoflavor, NULL);
+ xprt_put(xprt);
+ dprintk("RPC: failed to switch xprt for clnt %p\n", clnt);
+ return err;
+}
+EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
+
+/*
+ * Kill all tasks for the given client.
+ * XXX: kill their descendants as well?
+ */
+void rpc_killall_tasks(struct rpc_clnt *clnt)
+{
+ struct rpc_task *rovr;
+
+
+ if (list_empty(&clnt->cl_tasks))
+ return;
+ dprintk("RPC: killing all tasks for client %p\n", clnt);
+ /*
+ * Spin lock all_tasks to prevent changes...
+ */
+ spin_lock(&clnt->cl_lock);
+ list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
+ if (!RPC_IS_ACTIVATED(rovr))
+ continue;
+ if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
+ rovr->tk_flags |= RPC_TASK_KILLED;
+ rpc_exit(rovr, -EIO);
+ if (RPC_IS_QUEUED(rovr))
+ rpc_wake_up_queued_task(rovr->tk_waitqueue,
+ rovr);
+ }
+ }
+ spin_unlock(&clnt->cl_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_killall_tasks);
+
+/*
+ * Properly shut down an RPC client, terminating all outstanding
+ * requests.
+ */
+void rpc_shutdown_client(struct rpc_clnt *clnt)
+{
+ might_sleep();
+
+ dprintk_rcu("RPC: shutting down %s client for %s\n",
+ clnt->cl_program->name,
+ rcu_dereference(clnt->cl_xprt)->servername);
+
+ while (!list_empty(&clnt->cl_tasks)) {
+ rpc_killall_tasks(clnt);
+ wait_event_timeout(destroy_wait,
+ list_empty(&clnt->cl_tasks), 1*HZ);
+ }
+
+ rpc_release_client(clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_shutdown_client);
+
+/*
+ * Free an RPC client
+ */
+static struct rpc_clnt *
+rpc_free_client(struct rpc_clnt *clnt)
+{
+ struct rpc_clnt *parent = NULL;
+
+ dprintk_rcu("RPC: destroying %s client for %s\n",
+ clnt->cl_program->name,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ if (clnt->cl_parent != clnt)
+ parent = clnt->cl_parent;
+ rpc_clnt_debugfs_unregister(clnt);
+ rpc_clnt_remove_pipedir(clnt);
+ rpc_unregister_client(clnt);
+ rpc_free_iostats(clnt->cl_metrics);
+ clnt->cl_metrics = NULL;
+ xprt_put(rcu_dereference_raw(clnt->cl_xprt));
+ rpciod_down();
+ rpc_free_clid(clnt);
+ kfree(clnt);
+ return parent;
+}
+
+/*
+ * Free an RPC client
+ */
+static struct rpc_clnt *
+rpc_free_auth(struct rpc_clnt *clnt)
+{
+ if (clnt->cl_auth == NULL)
+ return rpc_free_client(clnt);
+
+ /*
+ * Note: RPCSEC_GSS may need to send NULL RPC calls in order to
+ * release remaining GSS contexts. This mechanism ensures
+ * that it can do so safely.
+ */
+ atomic_inc(&clnt->cl_count);
+ rpcauth_release(clnt->cl_auth);
+ clnt->cl_auth = NULL;
+ if (atomic_dec_and_test(&clnt->cl_count))
+ return rpc_free_client(clnt);
+ return NULL;
+}
+
+/*
+ * Release reference to the RPC client
+ */
+void
+rpc_release_client(struct rpc_clnt *clnt)
+{
+ dprintk("RPC: rpc_release_client(%p)\n", clnt);
+
+ do {
+ if (list_empty(&clnt->cl_tasks))
+ wake_up(&destroy_wait);
+ if (!atomic_dec_and_test(&clnt->cl_count))
+ break;
+ clnt = rpc_free_auth(clnt);
+ } while (clnt != NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_release_client);
+
+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old: old rpc_client
+ * @program: rpc program to set
+ * @vers: rpc program version
+ *
+ * Clones the rpc client and sets up a new RPC program. This is mainly
+ * of use for enabling different RPC programs to share the same transport.
+ * The Sun NFSv2/v3 ACL protocol can do this.
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+ const struct rpc_program *program,
+ u32 vers)
+{
+ struct rpc_create_args args = {
+ .program = program,
+ .prognumber = program->number,
+ .version = vers,
+ .authflavor = old->cl_auth->au_flavor,
+ };
+ struct rpc_clnt *clnt;
+ int err;
+
+ clnt = __rpc_clone_client(&args, old);
+ if (IS_ERR(clnt))
+ goto out;
+ err = rpc_ping(clnt);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ clnt = ERR_PTR(err);
+ }
+out:
+ return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_bind_new_program);
+
+void rpc_task_release_client(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+
+ if (clnt != NULL) {
+ /* Remove from client task list */
+ spin_lock(&clnt->cl_lock);
+ list_del(&task->tk_task);
+ spin_unlock(&clnt->cl_lock);
+ task->tk_client = NULL;
+
+ rpc_release_client(clnt);
+ }
+}
+
+static
+void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+ if (clnt != NULL) {
+ rpc_task_release_client(task);
+ task->tk_client = clnt;
+ atomic_inc(&clnt->cl_count);
+ if (clnt->cl_softrtry)
+ task->tk_flags |= RPC_TASK_SOFT;
+ if (clnt->cl_noretranstimeo)
+ task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
+ if (sk_memalloc_socks()) {
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ if (xprt->swapper)
+ task->tk_flags |= RPC_TASK_SWAPPER;
+ rcu_read_unlock();
+ }
+ /* Add to the client's list of all tasks */
+ spin_lock(&clnt->cl_lock);
+ list_add_tail(&task->tk_task, &clnt->cl_tasks);
+ spin_unlock(&clnt->cl_lock);
+ }
+}
+
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+ rpc_task_release_client(task);
+ rpc_task_set_client(task, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_task_reset_client);
+
+
+static void
+rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
+{
+ if (msg != NULL) {
+ task->tk_msg.rpc_proc = msg->rpc_proc;
+ task->tk_msg.rpc_argp = msg->rpc_argp;
+ task->tk_msg.rpc_resp = msg->rpc_resp;
+ if (msg->rpc_cred != NULL)
+ task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred);
+ }
+}
+
+/*
+ * Default callback for async RPC calls
+ */
+static void
+rpc_default_callback(struct rpc_task *task, void *data)
+{
+}
+
+static const struct rpc_call_ops rpc_default_ops = {
+ .rpc_call_done = rpc_default_callback,
+};
+
+/**
+ * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
+ * @task_setup_data: pointer to task initialisation data
+ */
+struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
+{
+ struct rpc_task *task;
+
+ task = rpc_new_task(task_setup_data);
+ if (IS_ERR(task))
+ goto out;
+
+ rpc_task_set_client(task, task_setup_data->rpc_client);
+ rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
+
+ if (task->tk_action == NULL)
+ rpc_call_start(task);
+
+ atomic_inc(&task->tk_count);
+ rpc_execute(task);
+out:
+ return task;
+}
+EXPORT_SYMBOL_GPL(rpc_run_task);
+
+/**
+ * rpc_call_sync - Perform a synchronous RPC call
+ * @clnt: pointer to RPC client
+ * @msg: RPC call parameters
+ * @flags: RPC call flags
+ */
+int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags)
+{
+ struct rpc_task *task;
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = msg,
+ .callback_ops = &rpc_default_ops,
+ .flags = flags,
+ };
+ int status;
+
+ WARN_ON_ONCE(flags & RPC_TASK_ASYNC);
+ if (flags & RPC_TASK_ASYNC) {
+ rpc_release_calldata(task_setup_data.callback_ops,
+ task_setup_data.callback_data);
+ return -EINVAL;
+ }
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ status = task->tk_status;
+ rpc_put_task(task);
+ return status;
+}
+EXPORT_SYMBOL_GPL(rpc_call_sync);
+
+/**
+ * rpc_call_async - Perform an asynchronous RPC call
+ * @clnt: pointer to RPC client
+ * @msg: RPC call parameters
+ * @flags: RPC call flags
+ * @tk_ops: RPC call ops
+ * @data: user call data
+ */
+int
+rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags,
+ const struct rpc_call_ops *tk_ops, void *data)
+{
+ struct rpc_task *task;
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = msg,
+ .callback_ops = tk_ops,
+ .callback_data = data,
+ .flags = flags|RPC_TASK_ASYNC,
+ };
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_call_async);
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/**
+ * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
+ * rpc_execute against it
+ * @req: RPC request
+ * @tk_ops: RPC call ops
+ */
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+ const struct rpc_call_ops *tk_ops)
+{
+ struct rpc_task *task;
+ struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct rpc_task_setup task_setup_data = {
+ .callback_ops = tk_ops,
+ };
+
+ dprintk("RPC: rpc_run_bc_task req= %p\n", req);
+ /*
+ * Create an rpc_task to send the data
+ */
+ task = rpc_new_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ xprt_free_bc_request(req);
+ goto out;
+ }
+ task->tk_rqstp = req;
+
+ /*
+ * Set up the xdr_buf length.
+ * This also indicates that the buffer is XDR encoded already.
+ */
+ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
+ xbufp->tail[0].iov_len;
+
+ task->tk_action = call_bc_transmit;
+ atomic_inc(&task->tk_count);
+ WARN_ON_ONCE(atomic_read(&task->tk_count) != 2);
+ rpc_execute(task);
+
+out:
+ dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
+ return task;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+void
+rpc_call_start(struct rpc_task *task)
+{
+ task->tk_action = call_start;
+}
+EXPORT_SYMBOL_GPL(rpc_call_start);
+
+/**
+ * rpc_peeraddr - extract remote peer address from clnt's xprt
+ * @clnt: RPC client structure
+ * @buf: target buffer
+ * @bufsize: length of target buffer
+ *
+ * Returns the number of bytes that are actually in the stored address.
+ */
+size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)
+{
+ size_t bytes;
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+
+ bytes = xprt->addrlen;
+ if (bytes > bufsize)
+ bytes = bufsize;
+ memcpy(buf, &xprt->addr, bytes);
+ rcu_read_unlock();
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(rpc_peeraddr);
+
+/**
+ * rpc_peeraddr2str - return remote peer address in printable format
+ * @clnt: RPC client structure
+ * @format: address format
+ *
+ * NB: the lifetime of the memory referenced by the returned pointer is
+ * the same as the rpc_xprt itself. As long as the caller uses this
+ * pointer, it must hold the RCU read lock.
+ */
+const char *rpc_peeraddr2str(struct rpc_clnt *clnt,
+ enum rpc_display_format_t format)
+{
+ struct rpc_xprt *xprt;
+
+ xprt = rcu_dereference(clnt->cl_xprt);
+
+ if (xprt->address_strings[format] != NULL)
+ return xprt->address_strings[format];
+ else
+ return "unprintable";
+}
+EXPORT_SYMBOL_GPL(rpc_peeraddr2str);
+
+static const struct sockaddr_in rpc_inaddr_loopback = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+};
+
+static const struct sockaddr_in6 rpc_in6addr_loopback = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+};
+
+/*
+ * Try a getsockname() on a connected datagram socket. Using a
+ * connected datagram socket prevents leaving a socket in TIME_WAIT.
+ * This conserves the ephemeral port number space.
+ *
+ * Returns zero and fills in "buf" if successful; otherwise, a
+ * negative errno is returned.
+ */
+static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
+ struct sockaddr *buf, int buflen)
+{
+ struct socket *sock;
+ int err;
+
+ err = __sock_create(net, sap->sa_family,
+ SOCK_DGRAM, IPPROTO_UDP, &sock, 1);
+ if (err < 0) {
+ dprintk("RPC: can't create UDP socket (%d)\n", err);
+ goto out;
+ }
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ err = kernel_bind(sock,
+ (struct sockaddr *)&rpc_inaddr_loopback,
+ sizeof(rpc_inaddr_loopback));
+ break;
+ case AF_INET6:
+ err = kernel_bind(sock,
+ (struct sockaddr *)&rpc_in6addr_loopback,
+ sizeof(rpc_in6addr_loopback));
+ break;
+ default:
+ err = -EAFNOSUPPORT;
+ goto out;
+ }
+ if (err < 0) {
+ dprintk("RPC: can't bind UDP socket (%d)\n", err);
+ goto out_release;
+ }
+
+ err = kernel_connect(sock, sap, salen, 0);
+ if (err < 0) {
+ dprintk("RPC: can't connect UDP socket (%d)\n", err);
+ goto out_release;
+ }
+
+ err = kernel_getsockname(sock, buf, &buflen);
+ if (err < 0) {
+ dprintk("RPC: getsockname failed (%d)\n", err);
+ goto out_release;
+ }
+
+ err = 0;
+ if (buf->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)buf;
+ sin6->sin6_scope_id = 0;
+ }
+ dprintk("RPC: %s succeeded\n", __func__);
+
+out_release:
+ sock_release(sock);
+out:
+ return err;
+}
+
+/*
+ * Scraping a connected socket failed, so we don't have a useable
+ * local address. Fallback: generate an address that will prevent
+ * the server from calling us back.
+ *
+ * Returns zero and fills in "buf" if successful; otherwise, a
+ * negative errno is returned.
+ */
+static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen)
+{
+ switch (family) {
+ case AF_INET:
+ if (buflen < sizeof(rpc_inaddr_loopback))
+ return -EINVAL;
+ memcpy(buf, &rpc_inaddr_loopback,
+ sizeof(rpc_inaddr_loopback));
+ break;
+ case AF_INET6:
+ if (buflen < sizeof(rpc_in6addr_loopback))
+ return -EINVAL;
+ memcpy(buf, &rpc_in6addr_loopback,
+ sizeof(rpc_in6addr_loopback));
+ default:
+ dprintk("RPC: %s: address family not supported\n",
+ __func__);
+ return -EAFNOSUPPORT;
+ }
+ dprintk("RPC: %s: succeeded\n", __func__);
+ return 0;
+}
+
+/**
+ * rpc_localaddr - discover local endpoint address for an RPC client
+ * @clnt: RPC client structure
+ * @buf: target buffer
+ * @buflen: size of target buffer, in bytes
+ *
+ * Returns zero and fills in "buf" and "buflen" if successful;
+ * otherwise, a negative errno is returned.
+ *
+ * This works even if the underlying transport is not currently connected,
+ * or if the upper layer never previously provided a source address.
+ *
+ * The result of this function call is transient: multiple calls in
+ * succession may give different results, depending on how local
+ * networking configuration changes over time.
+ */
+int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen)
+{
+ struct sockaddr_storage address;
+ struct sockaddr *sap = (struct sockaddr *)&address;
+ struct rpc_xprt *xprt;
+ struct net *net;
+ size_t salen;
+ int err;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ salen = xprt->addrlen;
+ memcpy(sap, &xprt->addr, salen);
+ net = get_net(xprt->xprt_net);
+ rcu_read_unlock();
+
+ rpc_set_port(sap, 0);
+ err = rpc_sockname(net, sap, salen, buf, buflen);
+ put_net(net);
+ if (err != 0)
+ /* Couldn't discover local address, return ANYADDR */
+ return rpc_anyaddr(sap->sa_family, buf, buflen);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_localaddr);
+
+void
+rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize)
+{
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ if (xprt->ops->set_buffer_size)
+ xprt->ops->set_buffer_size(xprt, sndsize, rcvsize);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_setbufsize);
+
+/**
+ * rpc_protocol - Get transport protocol number for an RPC client
+ * @clnt: RPC client to query
+ *
+ */
+int rpc_protocol(struct rpc_clnt *clnt)
+{
+ int protocol;
+
+ rcu_read_lock();
+ protocol = rcu_dereference(clnt->cl_xprt)->prot;
+ rcu_read_unlock();
+ return protocol;
+}
+EXPORT_SYMBOL_GPL(rpc_protocol);
+
+/**
+ * rpc_net_ns - Get the network namespace for this RPC client
+ * @clnt: RPC client to query
+ *
+ */
+struct net *rpc_net_ns(struct rpc_clnt *clnt)
+{
+ struct net *ret;
+
+ rcu_read_lock();
+ ret = rcu_dereference(clnt->cl_xprt)->xprt_net;
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_net_ns);
+
+/**
+ * rpc_max_payload - Get maximum payload size for a transport, in bytes
+ * @clnt: RPC client to query
+ *
+ * For stream transports, this is one RPC record fragment (see RFC
+ * 1831), as we don't support multi-record requests yet. For datagram
+ * transports, this is the size of an IP packet minus the IP, UDP, and
+ * RPC header sizes.
+ */
+size_t rpc_max_payload(struct rpc_clnt *clnt)
+{
+ size_t ret;
+
+ rcu_read_lock();
+ ret = rcu_dereference(clnt->cl_xprt)->max_payload;
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_max_payload);
+
+/**
+ * rpc_get_timeout - Get timeout for transport in units of HZ
+ * @clnt: RPC client to query
+ */
+unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
+{
+ unsigned long ret;
+
+ rcu_read_lock();
+ ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_get_timeout);
+
+/**
+ * rpc_force_rebind - force transport to check that remote port is unchanged
+ * @clnt: client to rebind
+ *
+ */
+void rpc_force_rebind(struct rpc_clnt *clnt)
+{
+ if (clnt->cl_autobind) {
+ rcu_read_lock();
+ xprt_clear_bound(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_force_rebind);
+
+/*
+ * Restart an (async) RPC call from the call_prepare state.
+ * Usually called from within the exit handler.
+ */
+int
+rpc_restart_call_prepare(struct rpc_task *task)
+{
+ if (RPC_ASSASSINATED(task))
+ return 0;
+ task->tk_action = call_start;
+ task->tk_status = 0;
+ if (task->tk_ops->rpc_call_prepare != NULL)
+ task->tk_action = rpc_prepare_task;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
+
+/*
+ * Restart an (async) RPC call. Usually called from within the
+ * exit handler.
+ */
+int
+rpc_restart_call(struct rpc_task *task)
+{
+ if (RPC_ASSASSINATED(task))
+ return 0;
+ task->tk_action = call_start;
+ task->tk_status = 0;
+ return 1;
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call);
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+const char
+*rpc_proc_name(const struct rpc_task *task)
+{
+ const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+
+ if (proc) {
+ if (proc->p_name)
+ return proc->p_name;
+ else
+ return "NULL";
+ } else
+ return "no proc";
+}
+#endif
+
+/*
+ * 0. Initial state
+ *
+ * Other FSM states can be visited zero or more times, but
+ * this state is visited exactly once for each RPC.
+ */
+static void
+call_start(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+
+ dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
+ clnt->cl_program->name, clnt->cl_vers,
+ rpc_proc_name(task),
+ (RPC_IS_ASYNC(task) ? "async" : "sync"));
+
+ /* Increment call count */
+ task->tk_msg.rpc_proc->p_count++;
+ clnt->cl_stats->rpccnt++;
+ task->tk_action = call_reserve;
+}
+
+/*
+ * 1. Reserve an RPC call slot
+ */
+static void
+call_reserve(struct rpc_task *task)
+{
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_reserveresult;
+ xprt_reserve(task);
+}
+
+static void call_retry_reserve(struct rpc_task *task);
+
+/*
+ * 1b. Grok the result of xprt_reserve()
+ */
+static void
+call_reserveresult(struct rpc_task *task)
+{
+ int status = task->tk_status;
+
+ dprint_status(task);
+
+ /*
+ * After a call to xprt_reserve(), we must have either
+ * a request slot or else an error status.
+ */
+ task->tk_status = 0;
+ if (status >= 0) {
+ if (task->tk_rqstp) {
+ task->tk_action = call_refresh;
+ return;
+ }
+
+ printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
+ __func__, status);
+ rpc_exit(task, -EIO);
+ return;
+ }
+
+ /*
+ * Even though there was an error, we may have acquired
+ * a request slot somehow. Make sure not to leak it.
+ */
+ if (task->tk_rqstp) {
+ printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
+ __func__, status);
+ xprt_release(task);
+ }
+
+ switch (status) {
+ case -ENOMEM:
+ rpc_delay(task, HZ >> 2);
+ case -EAGAIN: /* woken up; retry */
+ task->tk_action = call_retry_reserve;
+ return;
+ case -EIO: /* probably a shutdown */
+ break;
+ default:
+ printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
+ __func__, status);
+ break;
+ }
+ rpc_exit(task, status);
+}
+
+/*
+ * 1c. Retry reserving an RPC call slot
+ */
+static void
+call_retry_reserve(struct rpc_task *task)
+{
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_reserveresult;
+ xprt_retry_reserve(task);
+}
+
+/*
+ * 2. Bind and/or refresh the credentials
+ */
+static void
+call_refresh(struct rpc_task *task)
+{
+ dprint_status(task);
+
+ task->tk_action = call_refreshresult;
+ task->tk_status = 0;
+ task->tk_client->cl_stats->rpcauthrefresh++;
+ rpcauth_refreshcred(task);
+}
+
+/*
+ * 2a. Process the results of a credential refresh
+ */
+static void
+call_refreshresult(struct rpc_task *task)
+{
+ int status = task->tk_status;
+
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_refresh;
+ switch (status) {
+ case 0:
+ if (rpcauth_uptodatecred(task)) {
+ task->tk_action = call_allocate;
+ return;
+ }
+ /* Use rate-limiting and a max number of retries if refresh
+ * had status 0 but failed to update the cred.
+ */
+ case -ETIMEDOUT:
+ rpc_delay(task, 3*HZ);
+ case -EAGAIN:
+ status = -EACCES;
+ case -EKEYEXPIRED:
+ if (!task->tk_cred_retry)
+ break;
+ task->tk_cred_retry--;
+ dprintk("RPC: %5u %s: retry refresh creds\n",
+ task->tk_pid, __func__);
+ return;
+ }
+ dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
+ task->tk_pid, __func__, status);
+ rpc_exit(task, status);
+}
+
+/*
+ * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc.
+ * (Note: buffer memory is freed in xprt_release).
+ */
+static void
+call_allocate(struct rpc_task *task)
+{
+ unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_bind;
+
+ if (req->rq_buffer)
+ return;
+
+ if (proc->p_proc != 0) {
+ BUG_ON(proc->p_arglen == 0);
+ if (proc->p_decode != NULL)
+ BUG_ON(proc->p_replen == 0);
+ }
+
+ /*
+ * Calculate the size (in quads) of the RPC call
+ * and reply headers, and convert both values
+ * to byte sizes.
+ */
+ req->rq_callsize = RPC_CALLHDRSIZE + (slack << 1) + proc->p_arglen;
+ req->rq_callsize <<= 2;
+ req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
+ req->rq_rcvsize <<= 2;
+
+ req->rq_buffer = xprt->ops->buf_alloc(task,
+ req->rq_callsize + req->rq_rcvsize);
+ if (req->rq_buffer != NULL)
+ return;
+
+ dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
+
+ if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) {
+ task->tk_action = call_allocate;
+ rpc_delay(task, HZ>>4);
+ return;
+ }
+
+ rpc_exit(task, -ERESTARTSYS);
+}
+
+static inline int
+rpc_task_need_encode(struct rpc_task *task)
+{
+ return task->tk_rqstp->rq_snd_buf.len == 0;
+}
+
+static inline void
+rpc_task_force_reencode(struct rpc_task *task)
+{
+ task->tk_rqstp->rq_snd_buf.len = 0;
+ task->tk_rqstp->rq_bytes_sent = 0;
+}
+
+static inline void
+rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
+{
+ buf->head[0].iov_base = start;
+ buf->head[0].iov_len = len;
+ buf->tail[0].iov_len = 0;
+ buf->page_len = 0;
+ buf->flags = 0;
+ buf->len = 0;
+ buf->buflen = len;
+}
+
+/*
+ * 3. Encode arguments of an RPC call
+ */
+static void
+rpc_xdr_encode(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ kxdreproc_t encode;
+ __be32 *p;
+
+ dprint_status(task);
+
+ rpc_xdr_buf_init(&req->rq_snd_buf,
+ req->rq_buffer,
+ req->rq_callsize);
+ rpc_xdr_buf_init(&req->rq_rcv_buf,
+ (char *)req->rq_buffer + req->rq_callsize,
+ req->rq_rcvsize);
+
+ p = rpc_encode_header(task);
+ if (p == NULL) {
+ printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
+ rpc_exit(task, -EIO);
+ return;
+ }
+
+ encode = task->tk_msg.rpc_proc->p_encode;
+ if (encode == NULL)
+ return;
+
+ task->tk_status = rpcauth_wrap_req(task, encode, req, p,
+ task->tk_msg.rpc_argp);
+}
+
+/*
+ * 4. Get the server port number if not yet set
+ */
+static void
+call_bind(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ dprint_status(task);
+
+ task->tk_action = call_connect;
+ if (!xprt_bound(xprt)) {
+ task->tk_action = call_bind_status;
+ task->tk_timeout = xprt->bind_timeout;
+ xprt->ops->rpcbind(task);
+ }
+}
+
+/*
+ * 4a. Sort out bind result
+ */
+static void
+call_bind_status(struct rpc_task *task)
+{
+ int status = -EIO;
+
+ if (task->tk_status >= 0) {
+ dprint_status(task);
+ task->tk_status = 0;
+ task->tk_action = call_connect;
+ return;
+ }
+
+ trace_rpc_bind_status(task);
+ switch (task->tk_status) {
+ case -ENOMEM:
+ dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
+ rpc_delay(task, HZ >> 2);
+ goto retry_timeout;
+ case -EACCES:
+ dprintk("RPC: %5u remote rpcbind: RPC program/version "
+ "unavailable\n", task->tk_pid);
+ /* fail immediately if this is an RPC ping */
+ if (task->tk_msg.rpc_proc->p_proc == 0) {
+ status = -EOPNOTSUPP;
+ break;
+ }
+ if (task->tk_rebind_retry == 0)
+ break;
+ task->tk_rebind_retry--;
+ rpc_delay(task, 3*HZ);
+ goto retry_timeout;
+ case -ETIMEDOUT:
+ dprintk("RPC: %5u rpcbind request timed out\n",
+ task->tk_pid);
+ goto retry_timeout;
+ case -EPFNOSUPPORT:
+ /* server doesn't support any rpcbind version we know of */
+ dprintk("RPC: %5u unrecognized remote rpcbind service\n",
+ task->tk_pid);
+ break;
+ case -EPROTONOSUPPORT:
+ dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
+ task->tk_pid);
+ goto retry_timeout;
+ case -ECONNREFUSED: /* connection problems */
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENOTCONN:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -ENOBUFS:
+ case -EPIPE:
+ dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
+ task->tk_pid, task->tk_status);
+ if (!RPC_IS_SOFTCONN(task)) {
+ rpc_delay(task, 5*HZ);
+ goto retry_timeout;
+ }
+ status = task->tk_status;
+ break;
+ default:
+ dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
+ task->tk_pid, -task->tk_status);
+ }
+
+ rpc_exit(task, status);
+ return;
+
+retry_timeout:
+ task->tk_status = 0;
+ task->tk_action = call_timeout;
+}
+
+/*
+ * 4b. Connect to the RPC server
+ */
+static void
+call_connect(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ dprintk("RPC: %5u call_connect xprt %p %s connected\n",
+ task->tk_pid, xprt,
+ (xprt_connected(xprt) ? "is" : "is not"));
+
+ task->tk_action = call_transmit;
+ if (!xprt_connected(xprt)) {
+ task->tk_action = call_connect_status;
+ if (task->tk_status < 0)
+ return;
+ if (task->tk_flags & RPC_TASK_NOCONNECT) {
+ rpc_exit(task, -ENOTCONN);
+ return;
+ }
+ xprt_connect(task);
+ }
+}
+
+/*
+ * 4c. Sort out connect result
+ */
+static void
+call_connect_status(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ int status = task->tk_status;
+
+ dprint_status(task);
+
+ trace_rpc_connect_status(task, status);
+ task->tk_status = 0;
+ switch (status) {
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ if (RPC_IS_SOFTCONN(task))
+ break;
+ /* retry with existing socket, after a delay */
+ rpc_delay(task, 3*HZ);
+ case -EAGAIN:
+ /* Check for timeouts before looping back to call_bind */
+ case -ETIMEDOUT:
+ task->tk_action = call_timeout;
+ return;
+ case 0:
+ clnt->cl_stats->netreconn++;
+ task->tk_action = call_transmit;
+ return;
+ }
+ rpc_exit(task, status);
+}
+
+/*
+ * 5. Transmit the RPC request, and wait for reply
+ */
+static void
+call_transmit(struct rpc_task *task)
+{
+ int is_retrans = RPC_WAS_SENT(task);
+
+ dprint_status(task);
+
+ task->tk_action = call_status;
+ if (task->tk_status < 0)
+ return;
+ if (!xprt_prepare_transmit(task))
+ return;
+ task->tk_action = call_transmit_status;
+ /* Encode here so that rpcsec_gss can use correct sequence number. */
+ if (rpc_task_need_encode(task)) {
+ rpc_xdr_encode(task);
+ /* Did the encode result in an error condition? */
+ if (task->tk_status != 0) {
+ /* Was the error nonfatal? */
+ if (task->tk_status == -EAGAIN)
+ rpc_delay(task, HZ >> 4);
+ else
+ rpc_exit(task, task->tk_status);
+ return;
+ }
+ }
+ xprt_transmit(task);
+ if (task->tk_status < 0)
+ return;
+ if (is_retrans)
+ task->tk_client->cl_stats->rpcretrans++;
+ /*
+ * On success, ensure that we call xprt_end_transmit() before sleeping
+ * in order to allow access to the socket to other RPC requests.
+ */
+ call_transmit_status(task);
+ if (rpc_reply_expected(task))
+ return;
+ task->tk_action = rpc_exit_task;
+ rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task);
+}
+
+/*
+ * 5a. Handle cleanup after a transmission
+ */
+static void
+call_transmit_status(struct rpc_task *task)
+{
+ task->tk_action = call_status;
+
+ /*
+ * Common case: success. Force the compiler to put this
+ * test first.
+ */
+ if (task->tk_status == 0) {
+ xprt_end_transmit(task);
+ rpc_task_force_reencode(task);
+ return;
+ }
+
+ switch (task->tk_status) {
+ case -EAGAIN:
+ break;
+ default:
+ dprint_status(task);
+ xprt_end_transmit(task);
+ rpc_task_force_reencode(task);
+ break;
+ /*
+ * Special cases: if we've been waiting on the
+ * socket's write_space() callback, or if the
+ * socket just returned a connection error,
+ * then hold onto the transport lock.
+ */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EPERM:
+ if (RPC_IS_SOFTCONN(task)) {
+ xprt_end_transmit(task);
+ rpc_exit(task, task->tk_status);
+ break;
+ }
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -EADDRINUSE:
+ case -ENOTCONN:
+ case -ENOBUFS:
+ case -EPIPE:
+ rpc_task_force_reencode(task);
+ }
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/*
+ * 5b. Send the backchannel RPC reply. On error, drop the reply. In
+ * addition, disconnect on connectivity errors.
+ */
+static void
+call_bc_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (!xprt_prepare_transmit(task)) {
+ /*
+ * Could not reserve the transport. Try again after the
+ * transport is released.
+ */
+ task->tk_status = 0;
+ task->tk_action = call_bc_transmit;
+ return;
+ }
+
+ task->tk_action = rpc_exit_task;
+ if (task->tk_status < 0) {
+ printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+ "error: %d\n", task->tk_status);
+ return;
+ }
+
+ xprt_transmit(task);
+ xprt_end_transmit(task);
+ dprint_status(task);
+ switch (task->tk_status) {
+ case 0:
+ /* Success */
+ break;
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -ETIMEDOUT:
+ /*
+ * Problem reaching the server. Disconnect and let the
+ * forechannel reestablish the connection. The server will
+ * have to retransmit the backchannel request and we'll
+ * reprocess it. Since these ops are idempotent, there's no
+ * need to cache our reply at this time.
+ */
+ printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+ "error: %d\n", task->tk_status);
+ xprt_conditional_disconnect(req->rq_xprt,
+ req->rq_connect_cookie);
+ break;
+ default:
+ /*
+ * We were unable to reply and will have to drop the
+ * request. The server should reconnect and retransmit.
+ */
+ WARN_ON_ONCE(task->tk_status == -EAGAIN);
+ printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+ "error: %d\n", task->tk_status);
+ break;
+ }
+ rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+/*
+ * 6. Sort out the RPC call status
+ */
+static void
+call_status(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_rqst *req = task->tk_rqstp;
+ int status;
+
+ if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
+ task->tk_status = req->rq_reply_bytes_recvd;
+
+ dprint_status(task);
+
+ status = task->tk_status;
+ if (status >= 0) {
+ task->tk_action = call_decode;
+ return;
+ }
+
+ trace_rpc_call_status(task);
+ task->tk_status = 0;
+ switch(status) {
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EPERM:
+ if (RPC_IS_SOFTCONN(task)) {
+ rpc_exit(task, status);
+ break;
+ }
+ /*
+ * Delay any retries for 3 seconds, then handle as if it
+ * were a timeout.
+ */
+ rpc_delay(task, 3*HZ);
+ case -ETIMEDOUT:
+ task->tk_action = call_timeout;
+ if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+ && task->tk_client->cl_discrtry)
+ xprt_conditional_disconnect(req->rq_xprt,
+ req->rq_connect_cookie);
+ break;
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ rpc_force_rebind(clnt);
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ rpc_delay(task, 3*HZ);
+ case -EPIPE:
+ case -ENOTCONN:
+ task->tk_action = call_bind;
+ break;
+ case -EAGAIN:
+ task->tk_action = call_transmit;
+ break;
+ case -EIO:
+ /* shutdown or soft timeout */
+ rpc_exit(task, status);
+ break;
+ default:
+ if (clnt->cl_chatty)
+ printk("%s: RPC call returned error %d\n",
+ clnt->cl_program->name, -status);
+ rpc_exit(task, status);
+ }
+}
+
+/*
+ * 6a. Handle RPC timeout
+ * We do not release the request slot, so we keep using the
+ * same XID for all retransmits.
+ */
+static void
+call_timeout(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+
+ if (xprt_adjust_timeout(task->tk_rqstp) == 0) {
+ dprintk("RPC: %5u call_timeout (minor)\n", task->tk_pid);
+ goto retry;
+ }
+
+ dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
+ task->tk_timeouts++;
+
+ if (RPC_IS_SOFTCONN(task)) {
+ rpc_exit(task, -ETIMEDOUT);
+ return;
+ }
+ if (RPC_IS_SOFT(task)) {
+ if (clnt->cl_chatty) {
+ rcu_read_lock();
+ printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
+ clnt->cl_program->name,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ rcu_read_unlock();
+ }
+ if (task->tk_flags & RPC_TASK_TIMEOUT)
+ rpc_exit(task, -ETIMEDOUT);
+ else
+ rpc_exit(task, -EIO);
+ return;
+ }
+
+ if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
+ task->tk_flags |= RPC_CALL_MAJORSEEN;
+ if (clnt->cl_chatty) {
+ rcu_read_lock();
+ printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
+ clnt->cl_program->name,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ rcu_read_unlock();
+ }
+ }
+ rpc_force_rebind(clnt);
+ /*
+ * Did our request time out due to an RPCSEC_GSS out-of-sequence
+ * event? RFC2203 requires the server to drop all such requests.
+ */
+ rpcauth_invalcred(task);
+
+retry:
+ task->tk_action = call_bind;
+ task->tk_status = 0;
+}
+
+/*
+ * 7. Decode the RPC reply
+ */
+static void
+call_decode(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_rqst *req = task->tk_rqstp;
+ kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode;
+ __be32 *p;
+
+ dprint_status(task);
+
+ if (task->tk_flags & RPC_CALL_MAJORSEEN) {
+ if (clnt->cl_chatty) {
+ rcu_read_lock();
+ printk(KERN_NOTICE "%s: server %s OK\n",
+ clnt->cl_program->name,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ rcu_read_unlock();
+ }
+ task->tk_flags &= ~RPC_CALL_MAJORSEEN;
+ }
+
+ /*
+ * Ensure that we see all writes made by xprt_complete_rqst()
+ * before it changed req->rq_reply_bytes_recvd.
+ */
+ smp_rmb();
+ req->rq_rcv_buf.len = req->rq_private_buf.len;
+
+ /* Check that the softirq receive buffer is valid */
+ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
+ sizeof(req->rq_rcv_buf)) != 0);
+
+ if (req->rq_rcv_buf.len < 12) {
+ if (!RPC_IS_SOFT(task)) {
+ task->tk_action = call_bind;
+ goto out_retry;
+ }
+ dprintk("RPC: %s: too small RPC reply size (%d bytes)\n",
+ clnt->cl_program->name, task->tk_status);
+ task->tk_action = call_timeout;
+ goto out_retry;
+ }
+
+ p = rpc_verify_header(task);
+ if (IS_ERR(p)) {
+ if (p == ERR_PTR(-EAGAIN))
+ goto out_retry;
+ return;
+ }
+
+ task->tk_action = rpc_exit_task;
+
+ if (decode) {
+ task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
+ task->tk_msg.rpc_resp);
+ }
+ dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
+ task->tk_status);
+ return;
+out_retry:
+ task->tk_status = 0;
+ /* Note: rpc_verify_header() may have freed the RPC slot */
+ if (task->tk_rqstp == req) {
+ req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
+ if (task->tk_client->cl_discrtry)
+ xprt_conditional_disconnect(req->rq_xprt,
+ req->rq_connect_cookie);
+ }
+}
+
+static __be32 *
+rpc_encode_header(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_rqst *req = task->tk_rqstp;
+ __be32 *p = req->rq_svec[0].iov_base;
+
+ /* FIXME: check buffer size? */
+
+ p = xprt_skip_transport_header(req->rq_xprt, p);
+ *p++ = req->rq_xid; /* XID */
+ *p++ = htonl(RPC_CALL); /* CALL */
+ *p++ = htonl(RPC_VERSION); /* RPC version */
+ *p++ = htonl(clnt->cl_prog); /* program number */
+ *p++ = htonl(clnt->cl_vers); /* program version */
+ *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */
+ p = rpcauth_marshcred(task, p);
+ req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
+ return p;
+}
+
+static __be32 *
+rpc_verify_header(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt = task->tk_client;
+ struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0];
+ int len = task->tk_rqstp->rq_rcv_buf.len >> 2;
+ __be32 *p = iov->iov_base;
+ u32 n;
+ int error = -EACCES;
+
+ if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) {
+ /* RFC-1014 says that the representation of XDR data must be a
+ * multiple of four bytes
+ * - if it isn't pointer subtraction in the NFS client may give
+ * undefined results
+ */
+ dprintk("RPC: %5u %s: XDR representation not a multiple of"
+ " 4 bytes: 0x%x\n", task->tk_pid, __func__,
+ task->tk_rqstp->rq_rcv_buf.len);
+ error = -EIO;
+ goto out_err;
+ }
+ if ((len -= 3) < 0)
+ goto out_overflow;
+
+ p += 1; /* skip XID */
+ if ((n = ntohl(*p++)) != RPC_REPLY) {
+ dprintk("RPC: %5u %s: not an RPC reply: %x\n",
+ task->tk_pid, __func__, n);
+ error = -EIO;
+ goto out_garbage;
+ }
+
+ if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) {
+ if (--len < 0)
+ goto out_overflow;
+ switch ((n = ntohl(*p++))) {
+ case RPC_AUTH_ERROR:
+ break;
+ case RPC_MISMATCH:
+ dprintk("RPC: %5u %s: RPC call version mismatch!\n",
+ task->tk_pid, __func__);
+ error = -EPROTONOSUPPORT;
+ goto out_err;
+ default:
+ dprintk("RPC: %5u %s: RPC call rejected, "
+ "unknown error: %x\n",
+ task->tk_pid, __func__, n);
+ error = -EIO;
+ goto out_err;
+ }
+ if (--len < 0)
+ goto out_overflow;
+ switch ((n = ntohl(*p++))) {
+ case RPC_AUTH_REJECTEDCRED:
+ case RPC_AUTH_REJECTEDVERF:
+ case RPCSEC_GSS_CREDPROBLEM:
+ case RPCSEC_GSS_CTXPROBLEM:
+ if (!task->tk_cred_retry)
+ break;
+ task->tk_cred_retry--;
+ dprintk("RPC: %5u %s: retry stale creds\n",
+ task->tk_pid, __func__);
+ rpcauth_invalcred(task);
+ /* Ensure we obtain a new XID! */
+ xprt_release(task);
+ task->tk_action = call_reserve;
+ goto out_retry;
+ case RPC_AUTH_BADCRED:
+ case RPC_AUTH_BADVERF:
+ /* possibly garbled cred/verf? */
+ if (!task->tk_garb_retry)
+ break;
+ task->tk_garb_retry--;
+ dprintk("RPC: %5u %s: retry garbled creds\n",
+ task->tk_pid, __func__);
+ task->tk_action = call_bind;
+ goto out_retry;
+ case RPC_AUTH_TOOWEAK:
+ rcu_read_lock();
+ printk(KERN_NOTICE "RPC: server %s requires stronger "
+ "authentication.\n",
+ rcu_dereference(clnt->cl_xprt)->servername);
+ rcu_read_unlock();
+ break;
+ default:
+ dprintk("RPC: %5u %s: unknown auth error: %x\n",
+ task->tk_pid, __func__, n);
+ error = -EIO;
+ }
+ dprintk("RPC: %5u %s: call rejected %d\n",
+ task->tk_pid, __func__, n);
+ goto out_err;
+ }
+ p = rpcauth_checkverf(task, p);
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ dprintk("RPC: %5u %s: auth check failed with %d\n",
+ task->tk_pid, __func__, error);
+ goto out_garbage; /* bad verifier, retry */
+ }
+ len = p - (__be32 *)iov->iov_base - 1;
+ if (len < 0)
+ goto out_overflow;
+ switch ((n = ntohl(*p++))) {
+ case RPC_SUCCESS:
+ return p;
+ case RPC_PROG_UNAVAIL:
+ dprintk_rcu("RPC: %5u %s: program %u is unsupported "
+ "by server %s\n", task->tk_pid, __func__,
+ (unsigned int)clnt->cl_prog,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ error = -EPFNOSUPPORT;
+ goto out_err;
+ case RPC_PROG_MISMATCH:
+ dprintk_rcu("RPC: %5u %s: program %u, version %u unsupported "
+ "by server %s\n", task->tk_pid, __func__,
+ (unsigned int)clnt->cl_prog,
+ (unsigned int)clnt->cl_vers,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ error = -EPROTONOSUPPORT;
+ goto out_err;
+ case RPC_PROC_UNAVAIL:
+ dprintk_rcu("RPC: %5u %s: proc %s unsupported by program %u, "
+ "version %u on server %s\n",
+ task->tk_pid, __func__,
+ rpc_proc_name(task),
+ clnt->cl_prog, clnt->cl_vers,
+ rcu_dereference(clnt->cl_xprt)->servername);
+ error = -EOPNOTSUPP;
+ goto out_err;
+ case RPC_GARBAGE_ARGS:
+ dprintk("RPC: %5u %s: server saw garbage\n",
+ task->tk_pid, __func__);
+ break; /* retry */
+ default:
+ dprintk("RPC: %5u %s: server accept status: %x\n",
+ task->tk_pid, __func__, n);
+ /* Also retry */
+ }
+
+out_garbage:
+ clnt->cl_stats->rpcgarbage++;
+ if (task->tk_garb_retry) {
+ task->tk_garb_retry--;
+ dprintk("RPC: %5u %s: retrying\n",
+ task->tk_pid, __func__);
+ task->tk_action = call_bind;
+out_retry:
+ return ERR_PTR(-EAGAIN);
+ }
+out_err:
+ rpc_exit(task, error);
+ dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid,
+ __func__, error);
+ return ERR_PTR(error);
+out_overflow:
+ dprintk("RPC: %5u %s: server reply was truncated.\n", task->tk_pid,
+ __func__);
+ goto out_garbage;
+}
+
+static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj)
+{
+}
+
+static int rpcproc_decode_null(void *rqstp, struct xdr_stream *xdr, void *obj)
+{
+ return 0;
+}
+
+static struct rpc_procinfo rpcproc_null = {
+ .p_encode = rpcproc_encode_null,
+ .p_decode = rpcproc_decode_null,
+};
+
+static int rpc_ping(struct rpc_clnt *clnt)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &rpcproc_null,
+ };
+ int err;
+ msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+ err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN);
+ put_rpccred(msg.rpc_cred);
+ return err;
+}
+
+struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &rpcproc_null,
+ .rpc_cred = cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_message = &msg,
+ .callback_ops = &rpc_default_ops,
+ .flags = flags,
+ };
+ return rpc_run_task(&task_setup_data);
+}
+EXPORT_SYMBOL_GPL(rpc_call_null);
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+static void rpc_show_header(void)
+{
+ printk(KERN_INFO "-pid- flgs status -client- --rqstp- "
+ "-timeout ---ops--\n");
+}
+
+static void rpc_show_task(const struct rpc_clnt *clnt,
+ const struct rpc_task *task)
+{
+ const char *rpc_waitq = "none";
+
+ if (RPC_IS_QUEUED(task))
+ rpc_waitq = rpc_qname(task->tk_waitqueue);
+
+ printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n",
+ task->tk_pid, task->tk_flags, task->tk_status,
+ clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops,
+ clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
+ task->tk_action, rpc_waitq);
+}
+
+void rpc_show_tasks(struct net *net)
+{
+ struct rpc_clnt *clnt;
+ struct rpc_task *task;
+ int header = 0;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ spin_lock(&sn->rpc_client_lock);
+ list_for_each_entry(clnt, &sn->all_clients, cl_clients) {
+ spin_lock(&clnt->cl_lock);
+ list_for_each_entry(task, &clnt->cl_tasks, tk_task) {
+ if (!header) {
+ rpc_show_header();
+ header++;
+ }
+ rpc_show_task(clnt, task);
+ }
+ spin_unlock(&clnt->cl_lock);
+ }
+ spin_unlock(&sn->rpc_client_lock);
+}
+#endif
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
new file mode 100644
index 000000000..82962f7e6
--- /dev/null
+++ b/net/sunrpc/debugfs.c
@@ -0,0 +1,298 @@
+/**
+ * debugfs interface for sunrpc
+ *
+ * (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include "netns.h"
+
+static struct dentry *topdir;
+static struct dentry *rpc_clnt_dir;
+static struct dentry *rpc_xprt_dir;
+
+struct rpc_clnt_iter {
+ struct rpc_clnt *clnt;
+ loff_t pos;
+};
+
+static int
+tasks_show(struct seq_file *f, void *v)
+{
+ u32 xid = 0;
+ struct rpc_task *task = v;
+ struct rpc_clnt *clnt = task->tk_client;
+ const char *rpc_waitq = "none";
+
+ if (RPC_IS_QUEUED(task))
+ rpc_waitq = rpc_qname(task->tk_waitqueue);
+
+ if (task->tk_rqstp)
+ xid = be32_to_cpu(task->tk_rqstp->rq_xid);
+
+ seq_printf(f, "%5u %04x %6d 0x%x 0x%x %8ld %ps %sv%u %s a:%ps q:%s\n",
+ task->tk_pid, task->tk_flags, task->tk_status,
+ clnt->cl_clid, xid, task->tk_timeout, task->tk_ops,
+ clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task),
+ task->tk_action, rpc_waitq);
+ return 0;
+}
+
+static void *
+tasks_start(struct seq_file *f, loff_t *ppos)
+ __acquires(&clnt->cl_lock)
+{
+ struct rpc_clnt_iter *iter = f->private;
+ loff_t pos = *ppos;
+ struct rpc_clnt *clnt = iter->clnt;
+ struct rpc_task *task;
+
+ iter->pos = pos + 1;
+ spin_lock(&clnt->cl_lock);
+ list_for_each_entry(task, &clnt->cl_tasks, tk_task)
+ if (pos-- == 0)
+ return task;
+ return NULL;
+}
+
+static void *
+tasks_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ struct rpc_clnt_iter *iter = f->private;
+ struct rpc_clnt *clnt = iter->clnt;
+ struct rpc_task *task = v;
+ struct list_head *next = task->tk_task.next;
+
+ ++iter->pos;
+ ++*pos;
+
+ /* If there's another task on list, return it */
+ if (next == &clnt->cl_tasks)
+ return NULL;
+ return list_entry(next, struct rpc_task, tk_task);
+}
+
+static void
+tasks_stop(struct seq_file *f, void *v)
+ __releases(&clnt->cl_lock)
+{
+ struct rpc_clnt_iter *iter = f->private;
+ struct rpc_clnt *clnt = iter->clnt;
+
+ spin_unlock(&clnt->cl_lock);
+}
+
+static const struct seq_operations tasks_seq_operations = {
+ .start = tasks_start,
+ .next = tasks_next,
+ .stop = tasks_stop,
+ .show = tasks_show,
+};
+
+static int tasks_open(struct inode *inode, struct file *filp)
+{
+ int ret = seq_open_private(filp, &tasks_seq_operations,
+ sizeof(struct rpc_clnt_iter));
+
+ if (!ret) {
+ struct seq_file *seq = filp->private_data;
+ struct rpc_clnt_iter *iter = seq->private;
+
+ iter->clnt = inode->i_private;
+
+ if (!atomic_inc_not_zero(&iter->clnt->cl_count)) {
+ seq_release_private(inode, filp);
+ ret = -EINVAL;
+ }
+ }
+
+ return ret;
+}
+
+static int
+tasks_release(struct inode *inode, struct file *filp)
+{
+ struct seq_file *seq = filp->private_data;
+ struct rpc_clnt_iter *iter = seq->private;
+
+ rpc_release_client(iter->clnt);
+ return seq_release_private(inode, filp);
+}
+
+static const struct file_operations tasks_fops = {
+ .owner = THIS_MODULE,
+ .open = tasks_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tasks_release,
+};
+
+void
+rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
+{
+ int len;
+ char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
+ struct rpc_xprt *xprt;
+
+ /* Already registered? */
+ if (clnt->cl_debugfs || !rpc_clnt_dir)
+ return;
+
+ len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
+ if (len >= sizeof(name))
+ return;
+
+ /* make the per-client dir */
+ clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir);
+ if (!clnt->cl_debugfs)
+ return;
+
+ /* make tasks file */
+ if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
+ clnt, &tasks_fops))
+ goto out_err;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ /* no "debugfs" dentry? Don't bother with the symlink. */
+ if (!xprt->debugfs) {
+ rcu_read_unlock();
+ return;
+ }
+ len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
+ xprt->debugfs->d_name.name);
+ rcu_read_unlock();
+
+ if (len >= sizeof(name))
+ goto out_err;
+
+ if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name))
+ goto out_err;
+
+ return;
+out_err:
+ debugfs_remove_recursive(clnt->cl_debugfs);
+ clnt->cl_debugfs = NULL;
+}
+
+void
+rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt)
+{
+ debugfs_remove_recursive(clnt->cl_debugfs);
+ clnt->cl_debugfs = NULL;
+}
+
+static int
+xprt_info_show(struct seq_file *f, void *v)
+{
+ struct rpc_xprt *xprt = f->private;
+
+ seq_printf(f, "netid: %s\n", xprt->address_strings[RPC_DISPLAY_NETID]);
+ seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
+ seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]);
+ seq_printf(f, "state: 0x%lx\n", xprt->state);
+ return 0;
+}
+
+static int
+xprt_info_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+ struct rpc_xprt *xprt = inode->i_private;
+
+ ret = single_open(filp, xprt_info_show, xprt);
+
+ if (!ret) {
+ if (!xprt_get(xprt)) {
+ single_release(inode, filp);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
+static int
+xprt_info_release(struct inode *inode, struct file *filp)
+{
+ struct rpc_xprt *xprt = inode->i_private;
+
+ xprt_put(xprt);
+ return single_release(inode, filp);
+}
+
+static const struct file_operations xprt_info_fops = {
+ .owner = THIS_MODULE,
+ .open = xprt_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = xprt_info_release,
+};
+
+void
+rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
+{
+ int len, id;
+ static atomic_t cur_id;
+ char name[9]; /* 8 hex digits + NULL term */
+
+ if (!rpc_xprt_dir)
+ return;
+
+ id = (unsigned int)atomic_inc_return(&cur_id);
+
+ len = snprintf(name, sizeof(name), "%x", id);
+ if (len >= sizeof(name))
+ return;
+
+ /* make the per-client dir */
+ xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir);
+ if (!xprt->debugfs)
+ return;
+
+ /* make tasks file */
+ if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs,
+ xprt, &xprt_info_fops)) {
+ debugfs_remove_recursive(xprt->debugfs);
+ xprt->debugfs = NULL;
+ }
+}
+
+void
+rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
+{
+ debugfs_remove_recursive(xprt->debugfs);
+ xprt->debugfs = NULL;
+}
+
+void __exit
+sunrpc_debugfs_exit(void)
+{
+ debugfs_remove_recursive(topdir);
+ topdir = NULL;
+ rpc_clnt_dir = NULL;
+ rpc_xprt_dir = NULL;
+}
+
+void __init
+sunrpc_debugfs_init(void)
+{
+ topdir = debugfs_create_dir("sunrpc", NULL);
+ if (!topdir)
+ return;
+
+ rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
+ if (!rpc_clnt_dir)
+ goto out_remove;
+
+ rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir);
+ if (!rpc_xprt_dir)
+ goto out_remove;
+
+ return;
+out_remove:
+ debugfs_remove_recursive(topdir);
+ topdir = NULL;
+ rpc_clnt_dir = NULL;
+}
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
new file mode 100644
index 000000000..df5826876
--- /dev/null
+++ b/net/sunrpc/netns.h
@@ -0,0 +1,42 @@
+#ifndef __SUNRPC_NETNS_H__
+#define __SUNRPC_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct cache_detail;
+
+struct sunrpc_net {
+ struct proc_dir_entry *proc_net_rpc;
+ struct cache_detail *ip_map_cache;
+ struct cache_detail *unix_gid_cache;
+ struct cache_detail *rsc_cache;
+ struct cache_detail *rsi_cache;
+
+ struct super_block *pipefs_sb;
+ struct rpc_pipe *gssd_dummy;
+ struct mutex pipefs_sb_lock;
+
+ struct list_head all_clients;
+ spinlock_t rpc_client_lock;
+
+ struct rpc_clnt *rpcb_local_clnt;
+ struct rpc_clnt *rpcb_local_clnt4;
+ spinlock_t rpcb_clnt_lock;
+ unsigned int rpcb_users;
+ unsigned int rpcb_is_af_local : 1;
+
+ struct mutex gssp_lock;
+ struct rpc_clnt *gssp_clnt;
+ int use_gss_proxy;
+ int pipe_version;
+ atomic_t pipe_users;
+ struct proc_dir_entry *use_gssp_proc;
+};
+
+extern int sunrpc_net_id;
+
+int ip_map_cache_create(struct net *);
+void ip_map_cache_destroy(struct net *);
+
+#endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
new file mode 100644
index 000000000..d81186d34
--- /dev/null
+++ b/net/sunrpc/rpc_pipe.c
@@ -0,0 +1,1527 @@
+/*
+ * net/sunrpc/rpc_pipe.c
+ *
+ * Userland/kernel interface for rpcauth_gss.
+ * Code shamelessly plagiarized from fs/nfsd/nfsctl.c
+ * and fs/sysfs/inode.c
+ *
+ * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/utsname.h>
+
+#include <asm/ioctls.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/seq_file.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/nsproxy.h>
+#include <linux/notifier.h>
+
+#include "netns.h"
+#include "sunrpc.h"
+
+#define RPCDBG_FACILITY RPCDBG_DEBUG
+
+#define NET_NAME(net) ((net == &init_net) ? " (init_net)" : "")
+
+static struct file_system_type rpc_pipe_fs_type;
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops;
+
+static struct kmem_cache *rpc_inode_cachep __read_mostly;
+
+#define RPC_UPCALL_TIMEOUT (30*HZ)
+
+static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list);
+
+int rpc_pipefs_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register);
+
+void rpc_pipefs_notifier_unregister(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&rpc_pipefs_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_unregister);
+
+static void rpc_purge_list(wait_queue_head_t *waitq, struct list_head *head,
+ void (*destroy_msg)(struct rpc_pipe_msg *), int err)
+{
+ struct rpc_pipe_msg *msg;
+
+ if (list_empty(head))
+ return;
+ do {
+ msg = list_entry(head->next, struct rpc_pipe_msg, list);
+ list_del_init(&msg->list);
+ msg->errno = err;
+ destroy_msg(msg);
+ } while (!list_empty(head));
+
+ if (waitq)
+ wake_up(waitq);
+}
+
+static void
+rpc_timeout_upcall_queue(struct work_struct *work)
+{
+ LIST_HEAD(free_list);
+ struct rpc_pipe *pipe =
+ container_of(work, struct rpc_pipe, queue_timeout.work);
+ void (*destroy_msg)(struct rpc_pipe_msg *);
+ struct dentry *dentry;
+
+ spin_lock(&pipe->lock);
+ destroy_msg = pipe->ops->destroy_msg;
+ if (pipe->nreaders == 0) {
+ list_splice_init(&pipe->pipe, &free_list);
+ pipe->pipelen = 0;
+ }
+ dentry = dget(pipe->dentry);
+ spin_unlock(&pipe->lock);
+ rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL,
+ &free_list, destroy_msg, -ETIMEDOUT);
+ dput(dentry);
+}
+
+ssize_t rpc_pipe_generic_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+ char __user *dst, size_t buflen)
+{
+ char *data = (char *)msg->data + msg->copied;
+ size_t mlen = min(msg->len - msg->copied, buflen);
+ unsigned long left;
+
+ left = copy_to_user(dst, data, mlen);
+ if (left == mlen) {
+ msg->errno = -EFAULT;
+ return -EFAULT;
+ }
+
+ mlen -= left;
+ msg->copied += mlen;
+ msg->errno = 0;
+ return mlen;
+}
+EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall);
+
+/**
+ * rpc_queue_upcall - queue an upcall message to userspace
+ * @pipe: upcall pipe on which to queue given message
+ * @msg: message to queue
+ *
+ * Call with an @inode created by rpc_mkpipe() to queue an upcall.
+ * A userspace process may then later read the upcall by performing a
+ * read on an open file for this inode. It is up to the caller to
+ * initialize the fields of @msg (other than @msg->list) appropriately.
+ */
+int
+rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)
+{
+ int res = -EPIPE;
+ struct dentry *dentry;
+
+ spin_lock(&pipe->lock);
+ if (pipe->nreaders) {
+ list_add_tail(&msg->list, &pipe->pipe);
+ pipe->pipelen += msg->len;
+ res = 0;
+ } else if (pipe->flags & RPC_PIPE_WAIT_FOR_OPEN) {
+ if (list_empty(&pipe->pipe))
+ queue_delayed_work(rpciod_workqueue,
+ &pipe->queue_timeout,
+ RPC_UPCALL_TIMEOUT);
+ list_add_tail(&msg->list, &pipe->pipe);
+ pipe->pipelen += msg->len;
+ res = 0;
+ }
+ dentry = dget(pipe->dentry);
+ spin_unlock(&pipe->lock);
+ if (dentry) {
+ wake_up(&RPC_I(d_inode(dentry))->waitq);
+ dput(dentry);
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(rpc_queue_upcall);
+
+static inline void
+rpc_inode_setowner(struct inode *inode, void *private)
+{
+ RPC_I(inode)->private = private;
+}
+
+static void
+rpc_close_pipes(struct inode *inode)
+{
+ struct rpc_pipe *pipe = RPC_I(inode)->pipe;
+ int need_release;
+ LIST_HEAD(free_list);
+
+ mutex_lock(&inode->i_mutex);
+ spin_lock(&pipe->lock);
+ need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
+ pipe->nreaders = 0;
+ list_splice_init(&pipe->in_upcall, &free_list);
+ list_splice_init(&pipe->pipe, &free_list);
+ pipe->pipelen = 0;
+ pipe->dentry = NULL;
+ spin_unlock(&pipe->lock);
+ rpc_purge_list(&RPC_I(inode)->waitq, &free_list, pipe->ops->destroy_msg, -EPIPE);
+ pipe->nwriters = 0;
+ if (need_release && pipe->ops->release_pipe)
+ pipe->ops->release_pipe(inode);
+ cancel_delayed_work_sync(&pipe->queue_timeout);
+ rpc_inode_setowner(inode, NULL);
+ RPC_I(inode)->pipe = NULL;
+ mutex_unlock(&inode->i_mutex);
+}
+
+static struct inode *
+rpc_alloc_inode(struct super_block *sb)
+{
+ struct rpc_inode *rpci;
+ rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+ if (!rpci)
+ return NULL;
+ return &rpci->vfs_inode;
+}
+
+static void
+rpc_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(rpc_inode_cachep, RPC_I(inode));
+}
+
+static void
+rpc_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, rpc_i_callback);
+}
+
+static int
+rpc_pipe_open(struct inode *inode, struct file *filp)
+{
+ struct rpc_pipe *pipe;
+ int first_open;
+ int res = -ENXIO;
+
+ mutex_lock(&inode->i_mutex);
+ pipe = RPC_I(inode)->pipe;
+ if (pipe == NULL)
+ goto out;
+ first_open = pipe->nreaders == 0 && pipe->nwriters == 0;
+ if (first_open && pipe->ops->open_pipe) {
+ res = pipe->ops->open_pipe(inode);
+ if (res)
+ goto out;
+ }
+ if (filp->f_mode & FMODE_READ)
+ pipe->nreaders++;
+ if (filp->f_mode & FMODE_WRITE)
+ pipe->nwriters++;
+ res = 0;
+out:
+ mutex_unlock(&inode->i_mutex);
+ return res;
+}
+
+static int
+rpc_pipe_release(struct inode *inode, struct file *filp)
+{
+ struct rpc_pipe *pipe;
+ struct rpc_pipe_msg *msg;
+ int last_close;
+
+ mutex_lock(&inode->i_mutex);
+ pipe = RPC_I(inode)->pipe;
+ if (pipe == NULL)
+ goto out;
+ msg = filp->private_data;
+ if (msg != NULL) {
+ spin_lock(&pipe->lock);
+ msg->errno = -EAGAIN;
+ list_del_init(&msg->list);
+ spin_unlock(&pipe->lock);
+ pipe->ops->destroy_msg(msg);
+ }
+ if (filp->f_mode & FMODE_WRITE)
+ pipe->nwriters --;
+ if (filp->f_mode & FMODE_READ) {
+ pipe->nreaders --;
+ if (pipe->nreaders == 0) {
+ LIST_HEAD(free_list);
+ spin_lock(&pipe->lock);
+ list_splice_init(&pipe->pipe, &free_list);
+ pipe->pipelen = 0;
+ spin_unlock(&pipe->lock);
+ rpc_purge_list(&RPC_I(inode)->waitq, &free_list,
+ pipe->ops->destroy_msg, -EAGAIN);
+ }
+ }
+ last_close = pipe->nwriters == 0 && pipe->nreaders == 0;
+ if (last_close && pipe->ops->release_pipe)
+ pipe->ops->release_pipe(inode);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+}
+
+static ssize_t
+rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
+{
+ struct inode *inode = file_inode(filp);
+ struct rpc_pipe *pipe;
+ struct rpc_pipe_msg *msg;
+ int res = 0;
+
+ mutex_lock(&inode->i_mutex);
+ pipe = RPC_I(inode)->pipe;
+ if (pipe == NULL) {
+ res = -EPIPE;
+ goto out_unlock;
+ }
+ msg = filp->private_data;
+ if (msg == NULL) {
+ spin_lock(&pipe->lock);
+ if (!list_empty(&pipe->pipe)) {
+ msg = list_entry(pipe->pipe.next,
+ struct rpc_pipe_msg,
+ list);
+ list_move(&msg->list, &pipe->in_upcall);
+ pipe->pipelen -= msg->len;
+ filp->private_data = msg;
+ msg->copied = 0;
+ }
+ spin_unlock(&pipe->lock);
+ if (msg == NULL)
+ goto out_unlock;
+ }
+ /* NOTE: it is up to the callback to update msg->copied */
+ res = pipe->ops->upcall(filp, msg, buf, len);
+ if (res < 0 || msg->len == msg->copied) {
+ filp->private_data = NULL;
+ spin_lock(&pipe->lock);
+ list_del_init(&msg->list);
+ spin_unlock(&pipe->lock);
+ pipe->ops->destroy_msg(msg);
+ }
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ return res;
+}
+
+static ssize_t
+rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset)
+{
+ struct inode *inode = file_inode(filp);
+ int res;
+
+ mutex_lock(&inode->i_mutex);
+ res = -EPIPE;
+ if (RPC_I(inode)->pipe != NULL)
+ res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len);
+ mutex_unlock(&inode->i_mutex);
+ return res;
+}
+
+static unsigned int
+rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct inode *inode = file_inode(filp);
+ struct rpc_inode *rpci = RPC_I(inode);
+ unsigned int mask = POLLOUT | POLLWRNORM;
+
+ poll_wait(filp, &rpci->waitq, wait);
+
+ mutex_lock(&inode->i_mutex);
+ if (rpci->pipe == NULL)
+ mask |= POLLERR | POLLHUP;
+ else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
+ mask |= POLLIN | POLLRDNORM;
+ mutex_unlock(&inode->i_mutex);
+ return mask;
+}
+
+static long
+rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct rpc_pipe *pipe;
+ int len;
+
+ switch (cmd) {
+ case FIONREAD:
+ mutex_lock(&inode->i_mutex);
+ pipe = RPC_I(inode)->pipe;
+ if (pipe == NULL) {
+ mutex_unlock(&inode->i_mutex);
+ return -EPIPE;
+ }
+ spin_lock(&pipe->lock);
+ len = pipe->pipelen;
+ if (filp->private_data) {
+ struct rpc_pipe_msg *msg;
+ msg = filp->private_data;
+ len += msg->len - msg->copied;
+ }
+ spin_unlock(&pipe->lock);
+ mutex_unlock(&inode->i_mutex);
+ return put_user(len, (int __user *)arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations rpc_pipe_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = rpc_pipe_read,
+ .write = rpc_pipe_write,
+ .poll = rpc_pipe_poll,
+ .unlocked_ioctl = rpc_pipe_ioctl,
+ .open = rpc_pipe_open,
+ .release = rpc_pipe_release,
+};
+
+static int
+rpc_show_info(struct seq_file *m, void *v)
+{
+ struct rpc_clnt *clnt = m->private;
+
+ rcu_read_lock();
+ seq_printf(m, "RPC server: %s\n",
+ rcu_dereference(clnt->cl_xprt)->servername);
+ seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_program->name,
+ clnt->cl_prog, clnt->cl_vers);
+ seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
+ seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO));
+ seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT));
+ rcu_read_unlock();
+ return 0;
+}
+
+static int
+rpc_info_open(struct inode *inode, struct file *file)
+{
+ struct rpc_clnt *clnt = NULL;
+ int ret = single_open(file, rpc_show_info, NULL);
+
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+
+ spin_lock(&file->f_path.dentry->d_lock);
+ if (!d_unhashed(file->f_path.dentry))
+ clnt = RPC_I(inode)->private;
+ if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
+ spin_unlock(&file->f_path.dentry->d_lock);
+ m->private = clnt;
+ } else {
+ spin_unlock(&file->f_path.dentry->d_lock);
+ single_release(inode, file);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
+static int
+rpc_info_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct rpc_clnt *clnt = (struct rpc_clnt *)m->private;
+
+ if (clnt)
+ rpc_release_client(clnt);
+ return single_release(inode, file);
+}
+
+static const struct file_operations rpc_info_operations = {
+ .owner = THIS_MODULE,
+ .open = rpc_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = rpc_info_release,
+};
+
+
+/*
+ * Description of fs contents.
+ */
+struct rpc_filelist {
+ const char *name;
+ const struct file_operations *i_fop;
+ umode_t mode;
+};
+
+static struct inode *
+rpc_get_inode(struct super_block *sb, umode_t mode)
+{
+ struct inode *inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_fop = &simple_dir_operations;
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
+ default:
+ break;
+ }
+ return inode;
+}
+
+static int __rpc_create_common(struct inode *dir, struct dentry *dentry,
+ umode_t mode,
+ const struct file_operations *i_fop,
+ void *private)
+{
+ struct inode *inode;
+
+ d_drop(dentry);
+ inode = rpc_get_inode(dir->i_sb, mode);
+ if (!inode)
+ goto out_err;
+ inode->i_ino = iunique(dir->i_sb, 100);
+ if (i_fop)
+ inode->i_fop = i_fop;
+ if (private)
+ rpc_inode_setowner(inode, private);
+ d_add(dentry, inode);
+ return 0;
+out_err:
+ printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n",
+ __FILE__, __func__, dentry);
+ dput(dentry);
+ return -ENOMEM;
+}
+
+static int __rpc_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode,
+ const struct file_operations *i_fop,
+ void *private)
+{
+ int err;
+
+ err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private);
+ if (err)
+ return err;
+ fsnotify_create(dir, dentry);
+ return 0;
+}
+
+static int __rpc_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode,
+ const struct file_operations *i_fop,
+ void *private)
+{
+ int err;
+
+ err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private);
+ if (err)
+ return err;
+ inc_nlink(dir);
+ fsnotify_mkdir(dir, dentry);
+ return 0;
+}
+
+static void
+init_pipe(struct rpc_pipe *pipe)
+{
+ pipe->nreaders = 0;
+ pipe->nwriters = 0;
+ INIT_LIST_HEAD(&pipe->in_upcall);
+ INIT_LIST_HEAD(&pipe->in_downcall);
+ INIT_LIST_HEAD(&pipe->pipe);
+ pipe->pipelen = 0;
+ INIT_DELAYED_WORK(&pipe->queue_timeout,
+ rpc_timeout_upcall_queue);
+ pipe->ops = NULL;
+ spin_lock_init(&pipe->lock);
+ pipe->dentry = NULL;
+}
+
+void rpc_destroy_pipe_data(struct rpc_pipe *pipe)
+{
+ kfree(pipe);
+}
+EXPORT_SYMBOL_GPL(rpc_destroy_pipe_data);
+
+struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags)
+{
+ struct rpc_pipe *pipe;
+
+ pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL);
+ if (!pipe)
+ return ERR_PTR(-ENOMEM);
+ init_pipe(pipe);
+ pipe->ops = ops;
+ pipe->flags = flags;
+ return pipe;
+}
+EXPORT_SYMBOL_GPL(rpc_mkpipe_data);
+
+static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
+ umode_t mode,
+ const struct file_operations *i_fop,
+ void *private,
+ struct rpc_pipe *pipe)
+{
+ struct rpc_inode *rpci;
+ int err;
+
+ err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
+ if (err)
+ return err;
+ rpci = RPC_I(d_inode(dentry));
+ rpci->private = private;
+ rpci->pipe = pipe;
+ fsnotify_create(dir, dentry);
+ return 0;
+}
+
+static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int ret;
+
+ dget(dentry);
+ ret = simple_rmdir(dir, dentry);
+ d_delete(dentry);
+ dput(dentry);
+ return ret;
+}
+
+int rpc_rmdir(struct dentry *dentry)
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int error;
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ error = __rpc_rmdir(dir, dentry);
+ mutex_unlock(&dir->i_mutex);
+ dput(parent);
+ return error;
+}
+EXPORT_SYMBOL_GPL(rpc_rmdir);
+
+static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int ret;
+
+ dget(dentry);
+ ret = simple_unlink(dir, dentry);
+ d_delete(dentry);
+ dput(dentry);
+ return ret;
+}
+
+static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ rpc_close_pipes(inode);
+ return __rpc_unlink(dir, dentry);
+}
+
+static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
+ const char *name)
+{
+ struct qstr q = QSTR_INIT(name, strlen(name));
+ struct dentry *dentry = d_hash_and_lookup(parent, &q);
+ if (!dentry) {
+ dentry = d_alloc(parent, &q);
+ if (!dentry)
+ return ERR_PTR(-ENOMEM);
+ }
+ if (d_really_is_negative(dentry))
+ return dentry;
+ dput(dentry);
+ return ERR_PTR(-EEXIST);
+}
+
+/*
+ * FIXME: This probably has races.
+ */
+static void __rpc_depopulate(struct dentry *parent,
+ const struct rpc_filelist *files,
+ int start, int eof)
+{
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ struct qstr name;
+ int i;
+
+ for (i = start; i < eof; i++) {
+ name.name = files[i].name;
+ name.len = strlen(files[i].name);
+ dentry = d_hash_and_lookup(parent, &name);
+
+ if (dentry == NULL)
+ continue;
+ if (d_really_is_negative(dentry))
+ goto next;
+ switch (d_inode(dentry)->i_mode & S_IFMT) {
+ default:
+ BUG();
+ case S_IFREG:
+ __rpc_unlink(dir, dentry);
+ break;
+ case S_IFDIR:
+ __rpc_rmdir(dir, dentry);
+ }
+next:
+ dput(dentry);
+ }
+}
+
+static void rpc_depopulate(struct dentry *parent,
+ const struct rpc_filelist *files,
+ int start, int eof)
+{
+ struct inode *dir = d_inode(parent);
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
+ __rpc_depopulate(parent, files, start, eof);
+ mutex_unlock(&dir->i_mutex);
+}
+
+static int rpc_populate(struct dentry *parent,
+ const struct rpc_filelist *files,
+ int start, int eof,
+ void *private)
+{
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry;
+ int i, err;
+
+ mutex_lock(&dir->i_mutex);
+ for (i = start; i < eof; i++) {
+ dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_bad;
+ switch (files[i].mode & S_IFMT) {
+ default:
+ BUG();
+ case S_IFREG:
+ err = __rpc_create(dir, dentry,
+ files[i].mode,
+ files[i].i_fop,
+ private);
+ break;
+ case S_IFDIR:
+ err = __rpc_mkdir(dir, dentry,
+ files[i].mode,
+ NULL,
+ private);
+ }
+ if (err != 0)
+ goto out_bad;
+ }
+ mutex_unlock(&dir->i_mutex);
+ return 0;
+out_bad:
+ __rpc_depopulate(parent, files, start, eof);
+ mutex_unlock(&dir->i_mutex);
+ printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
+ __FILE__, __func__, parent);
+ return err;
+}
+
+static struct dentry *rpc_mkdir_populate(struct dentry *parent,
+ const char *name, umode_t mode, void *private,
+ int (*populate)(struct dentry *, void *), void *args_populate)
+{
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+ int error;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = __rpc_lookup_create_exclusive(parent, name);
+ if (IS_ERR(dentry))
+ goto out;
+ error = __rpc_mkdir(dir, dentry, mode, NULL, private);
+ if (error != 0)
+ goto out_err;
+ if (populate != NULL) {
+ error = populate(dentry, args_populate);
+ if (error)
+ goto err_rmdir;
+ }
+out:
+ mutex_unlock(&dir->i_mutex);
+ return dentry;
+err_rmdir:
+ __rpc_rmdir(dir, dentry);
+out_err:
+ dentry = ERR_PTR(error);
+ goto out;
+}
+
+static int rpc_rmdir_depopulate(struct dentry *dentry,
+ void (*depopulate)(struct dentry *))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int error;
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ if (depopulate != NULL)
+ depopulate(dentry);
+ error = __rpc_rmdir(dir, dentry);
+ mutex_unlock(&dir->i_mutex);
+ dput(parent);
+ return error;
+}
+
+/**
+ * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication
+ * @parent: dentry of directory to create new "pipe" in
+ * @name: name of pipe
+ * @private: private data to associate with the pipe, for the caller's use
+ * @pipe: &rpc_pipe containing input parameters
+ *
+ * Data is made available for userspace to read by calls to
+ * rpc_queue_upcall(). The actual reads will result in calls to
+ * @ops->upcall, which will be called with the file pointer,
+ * message, and userspace buffer to copy to.
+ *
+ * Writes can come at any time, and do not necessarily have to be
+ * responses to upcalls. They will result in calls to @msg->downcall.
+ *
+ * The @private argument passed here will be available to all these methods
+ * from the file pointer, via RPC_I(file_inode(file))->private.
+ */
+struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
+ void *private, struct rpc_pipe *pipe)
+{
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+ umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
+ int err;
+
+ if (pipe->ops->upcall == NULL)
+ umode &= ~S_IRUGO;
+ if (pipe->ops->downcall == NULL)
+ umode &= ~S_IWUGO;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = __rpc_lookup_create_exclusive(parent, name);
+ if (IS_ERR(dentry))
+ goto out;
+ err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops,
+ private, pipe);
+ if (err)
+ goto out_err;
+out:
+ mutex_unlock(&dir->i_mutex);
+ return dentry;
+out_err:
+ dentry = ERR_PTR(err);
+ printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n",
+ __FILE__, __func__, parent, name,
+ err);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry);
+
+/**
+ * rpc_unlink - remove a pipe
+ * @dentry: dentry for the pipe, as returned from rpc_mkpipe
+ *
+ * After this call, lookups will no longer find the pipe, and any
+ * attempts to read or write using preexisting opens of the pipe will
+ * return -EPIPE.
+ */
+int
+rpc_unlink(struct dentry *dentry)
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int error = 0;
+
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ error = __rpc_rmpipe(dir, dentry);
+ mutex_unlock(&dir->i_mutex);
+ dput(parent);
+ return error;
+}
+EXPORT_SYMBOL_GPL(rpc_unlink);
+
+/**
+ * rpc_init_pipe_dir_head - initialise a struct rpc_pipe_dir_head
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ */
+void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh)
+{
+ INIT_LIST_HEAD(&pdh->pdh_entries);
+ pdh->pdh_dentry = NULL;
+}
+EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_head);
+
+/**
+ * rpc_init_pipe_dir_object - initialise a struct rpc_pipe_dir_object
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ * @pdo_ops: pointer to const struct rpc_pipe_dir_object_ops
+ * @pdo_data: pointer to caller-defined data
+ */
+void rpc_init_pipe_dir_object(struct rpc_pipe_dir_object *pdo,
+ const struct rpc_pipe_dir_object_ops *pdo_ops,
+ void *pdo_data)
+{
+ INIT_LIST_HEAD(&pdo->pdo_head);
+ pdo->pdo_ops = pdo_ops;
+ pdo->pdo_data = pdo_data;
+}
+EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_object);
+
+static int
+rpc_add_pipe_dir_object_locked(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ int ret = 0;
+
+ if (pdh->pdh_dentry)
+ ret = pdo->pdo_ops->create(pdh->pdh_dentry, pdo);
+ if (ret == 0)
+ list_add_tail(&pdo->pdo_head, &pdh->pdh_entries);
+ return ret;
+}
+
+static void
+rpc_remove_pipe_dir_object_locked(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ if (pdh->pdh_dentry)
+ pdo->pdo_ops->destroy(pdh->pdh_dentry, pdo);
+ list_del_init(&pdo->pdo_head);
+}
+
+/**
+ * rpc_add_pipe_dir_object - associate a rpc_pipe_dir_object to a directory
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ *
+ */
+int
+rpc_add_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ int ret = 0;
+
+ if (list_empty(&pdo->pdo_head)) {
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ ret = rpc_add_pipe_dir_object_locked(net, pdh, pdo);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_add_pipe_dir_object);
+
+/**
+ * rpc_remove_pipe_dir_object - remove a rpc_pipe_dir_object from a directory
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @pdo: pointer to struct rpc_pipe_dir_object
+ *
+ */
+void
+rpc_remove_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ struct rpc_pipe_dir_object *pdo)
+{
+ if (!list_empty(&pdo->pdo_head)) {
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ rpc_remove_pipe_dir_object_locked(net, pdh, pdo);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_remove_pipe_dir_object);
+
+/**
+ * rpc_find_or_alloc_pipe_dir_object
+ * @net: pointer to struct net
+ * @pdh: pointer to struct rpc_pipe_dir_head
+ * @match: match struct rpc_pipe_dir_object to data
+ * @alloc: allocate a new struct rpc_pipe_dir_object
+ * @data: user defined data for match() and alloc()
+ *
+ */
+struct rpc_pipe_dir_object *
+rpc_find_or_alloc_pipe_dir_object(struct net *net,
+ struct rpc_pipe_dir_head *pdh,
+ int (*match)(struct rpc_pipe_dir_object *, void *),
+ struct rpc_pipe_dir_object *(*alloc)(void *),
+ void *data)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe_dir_object *pdo;
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head) {
+ if (!match(pdo, data))
+ continue;
+ goto out;
+ }
+ pdo = alloc(data);
+ if (!pdo)
+ goto out;
+ rpc_add_pipe_dir_object_locked(net, pdh, pdo);
+out:
+ mutex_unlock(&sn->pipefs_sb_lock);
+ return pdo;
+}
+EXPORT_SYMBOL_GPL(rpc_find_or_alloc_pipe_dir_object);
+
+static void
+rpc_create_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
+{
+ struct rpc_pipe_dir_object *pdo;
+ struct dentry *dir = pdh->pdh_dentry;
+
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head)
+ pdo->pdo_ops->create(dir, pdo);
+}
+
+static void
+rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
+{
+ struct rpc_pipe_dir_object *pdo;
+ struct dentry *dir = pdh->pdh_dentry;
+
+ list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head)
+ pdo->pdo_ops->destroy(dir, pdo);
+}
+
+enum {
+ RPCAUTH_info,
+ RPCAUTH_EOF
+};
+
+static const struct rpc_filelist authfiles[] = {
+ [RPCAUTH_info] = {
+ .name = "info",
+ .i_fop = &rpc_info_operations,
+ .mode = S_IFREG | S_IRUSR,
+ },
+};
+
+static int rpc_clntdir_populate(struct dentry *dentry, void *private)
+{
+ return rpc_populate(dentry,
+ authfiles, RPCAUTH_info, RPCAUTH_EOF,
+ private);
+}
+
+static void rpc_clntdir_depopulate(struct dentry *dentry)
+{
+ rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF);
+}
+
+/**
+ * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
+ * @dentry: the parent of new directory
+ * @name: the name of new directory
+ * @rpc_client: rpc client to associate with this directory
+ *
+ * This creates a directory at the given @path associated with
+ * @rpc_clnt, which will contain a file named "info" with some basic
+ * information about the client, together with any "pipes" that may
+ * later be created using rpc_mkpipe().
+ */
+struct dentry *rpc_create_client_dir(struct dentry *dentry,
+ const char *name,
+ struct rpc_clnt *rpc_client)
+{
+ struct dentry *ret;
+
+ ret = rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
+ rpc_clntdir_populate, rpc_client);
+ if (!IS_ERR(ret)) {
+ rpc_client->cl_pipedir_objects.pdh_dentry = ret;
+ rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ }
+ return ret;
+}
+
+/**
+ * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
+ * @rpc_client: rpc_client for the pipe
+ */
+int rpc_remove_client_dir(struct rpc_clnt *rpc_client)
+{
+ struct dentry *dentry = rpc_client->cl_pipedir_objects.pdh_dentry;
+
+ if (dentry == NULL)
+ return 0;
+ rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
+ rpc_client->cl_pipedir_objects.pdh_dentry = NULL;
+ return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate);
+}
+
+static const struct rpc_filelist cache_pipefs_files[3] = {
+ [0] = {
+ .name = "channel",
+ .i_fop = &cache_file_operations_pipefs,
+ .mode = S_IFREG|S_IRUSR|S_IWUSR,
+ },
+ [1] = {
+ .name = "content",
+ .i_fop = &content_file_operations_pipefs,
+ .mode = S_IFREG|S_IRUSR,
+ },
+ [2] = {
+ .name = "flush",
+ .i_fop = &cache_flush_operations_pipefs,
+ .mode = S_IFREG|S_IRUSR|S_IWUSR,
+ },
+};
+
+static int rpc_cachedir_populate(struct dentry *dentry, void *private)
+{
+ return rpc_populate(dentry,
+ cache_pipefs_files, 0, 3,
+ private);
+}
+
+static void rpc_cachedir_depopulate(struct dentry *dentry)
+{
+ rpc_depopulate(dentry, cache_pipefs_files, 0, 3);
+}
+
+struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name,
+ umode_t umode, struct cache_detail *cd)
+{
+ return rpc_mkdir_populate(parent, name, umode, NULL,
+ rpc_cachedir_populate, cd);
+}
+
+void rpc_remove_cache_dir(struct dentry *dentry)
+{
+ rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate);
+}
+
+/*
+ * populate the filesystem
+ */
+static const struct super_operations s_ops = {
+ .alloc_inode = rpc_alloc_inode,
+ .destroy_inode = rpc_destroy_inode,
+ .statfs = simple_statfs,
+};
+
+#define RPCAUTH_GSSMAGIC 0x67596969
+
+/*
+ * We have a single directory with 1 node in it.
+ */
+enum {
+ RPCAUTH_lockd,
+ RPCAUTH_mount,
+ RPCAUTH_nfs,
+ RPCAUTH_portmap,
+ RPCAUTH_statd,
+ RPCAUTH_nfsd4_cb,
+ RPCAUTH_cache,
+ RPCAUTH_nfsd,
+ RPCAUTH_gssd,
+ RPCAUTH_RootEOF
+};
+
+static const struct rpc_filelist files[] = {
+ [RPCAUTH_lockd] = {
+ .name = "lockd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_mount] = {
+ .name = "mount",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_nfs] = {
+ .name = "nfs",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_portmap] = {
+ .name = "portmap",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_statd] = {
+ .name = "statd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_nfsd4_cb] = {
+ .name = "nfsd4_cb",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_cache] = {
+ .name = "cache",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_nfsd] = {
+ .name = "nfsd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+ [RPCAUTH_gssd] = {
+ .name = "gssd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+};
+
+/*
+ * This call can be used only in RPC pipefs mount notification hooks.
+ */
+struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
+ const unsigned char *dir_name)
+{
+ struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name));
+ return d_hash_and_lookup(sb->s_root, &dir);
+}
+EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
+
+int rpc_pipefs_init_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0);
+ if (IS_ERR(sn->gssd_dummy))
+ return PTR_ERR(sn->gssd_dummy);
+
+ mutex_init(&sn->pipefs_sb_lock);
+ sn->pipe_version = -1;
+ return 0;
+}
+
+void rpc_pipefs_exit_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ rpc_destroy_pipe_data(sn->gssd_dummy);
+}
+
+/*
+ * This call will be used for per network namespace operations calls.
+ * Note: Function will be returned with pipefs_sb_lock taken if superblock was
+ * found. This lock have to be released by rpc_put_sb_net() when all operations
+ * will be completed.
+ */
+struct super_block *rpc_get_sb_net(const struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ if (sn->pipefs_sb)
+ return sn->pipefs_sb;
+ mutex_unlock(&sn->pipefs_sb_lock);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(rpc_get_sb_net);
+
+void rpc_put_sb_net(const struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ WARN_ON(sn->pipefs_sb == NULL);
+ mutex_unlock(&sn->pipefs_sb_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_put_sb_net);
+
+static const struct rpc_filelist gssd_dummy_clnt_dir[] = {
+ [0] = {
+ .name = "clntXX",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+};
+
+static ssize_t
+dummy_downcall(struct file *filp, const char __user *src, size_t len)
+{
+ return -EINVAL;
+}
+
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = dummy_downcall,
+};
+
+/*
+ * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect
+ * that it will ever use this info to handle an upcall, but rpc.gssd expects
+ * that this file will be there and have a certain format.
+ */
+static int
+rpc_show_dummy_info(struct seq_file *m, void *v)
+{
+ seq_printf(m, "RPC server: %s\n", utsname()->nodename);
+ seq_printf(m, "service: foo (1) version 0\n");
+ seq_printf(m, "address: 127.0.0.1\n");
+ seq_printf(m, "protocol: tcp\n");
+ seq_printf(m, "port: 0\n");
+ return 0;
+}
+
+static int
+rpc_dummy_info_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rpc_show_dummy_info, NULL);
+}
+
+static const struct file_operations rpc_dummy_info_operations = {
+ .owner = THIS_MODULE,
+ .open = rpc_dummy_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct rpc_filelist gssd_dummy_info_file[] = {
+ [0] = {
+ .name = "info",
+ .i_fop = &rpc_dummy_info_operations,
+ .mode = S_IFREG | S_IRUSR,
+ },
+};
+
+/**
+ * rpc_gssd_dummy_populate - create a dummy gssd pipe
+ * @root: root of the rpc_pipefs filesystem
+ * @pipe_data: pipe data created when netns is initialized
+ *
+ * Create a dummy set of directories and a pipe that gssd can hold open to
+ * indicate that it is up and running.
+ */
+static struct dentry *
+rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
+{
+ int ret = 0;
+ struct dentry *gssd_dentry;
+ struct dentry *clnt_dentry = NULL;
+ struct dentry *pipe_dentry = NULL;
+ struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name,
+ strlen(files[RPCAUTH_gssd].name));
+
+ /* We should never get this far if "gssd" doesn't exist */
+ gssd_dentry = d_hash_and_lookup(root, &q);
+ if (!gssd_dentry)
+ return ERR_PTR(-ENOENT);
+
+ ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL);
+ if (ret) {
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ q.name = gssd_dummy_clnt_dir[0].name;
+ q.len = strlen(gssd_dummy_clnt_dir[0].name);
+ clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
+ if (!clnt_dentry) {
+ pipe_dentry = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL);
+ if (ret) {
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
+ if (IS_ERR(pipe_dentry)) {
+ __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ }
+out:
+ dput(clnt_dentry);
+ dput(gssd_dentry);
+ return pipe_dentry;
+}
+
+static void
+rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
+{
+ struct dentry *clnt_dir = pipe_dentry->d_parent;
+ struct dentry *gssd_dir = clnt_dir->d_parent;
+
+ __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
+ __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
+ dput(pipe_dentry);
+}
+
+static int
+rpc_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct inode *inode;
+ struct dentry *root, *gssd_dentry;
+ struct net *net = data;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ int err;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = RPCAUTH_GSSMAGIC;
+ sb->s_op = &s_ops;
+ sb->s_d_op = &simple_dentry_operations;
+ sb->s_time_gran = 1;
+
+ inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
+ sb->s_root = root = d_make_root(inode);
+ if (!root)
+ return -ENOMEM;
+ if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
+ return -ENOMEM;
+
+ gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy);
+ if (IS_ERR(gssd_dentry)) {
+ __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+ return PTR_ERR(gssd_dentry);
+ }
+
+ dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n",
+ net, NET_NAME(net));
+ mutex_lock(&sn->pipefs_sb_lock);
+ sn->pipefs_sb = sb;
+ err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
+ RPC_PIPEFS_MOUNT,
+ sb);
+ if (err)
+ goto err_depopulate;
+ sb->s_fs_info = get_net(net);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ return 0;
+
+err_depopulate:
+ rpc_gssd_dummy_depopulate(gssd_dentry);
+ blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
+ RPC_PIPEFS_UMOUNT,
+ sb);
+ sn->pipefs_sb = NULL;
+ __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ return err;
+}
+
+bool
+gssd_running(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe *pipe = sn->gssd_dummy;
+
+ return pipe->nreaders || pipe->nwriters;
+}
+EXPORT_SYMBOL_GPL(gssd_running);
+
+static struct dentry *
+rpc_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super);
+}
+
+static void rpc_kill_sb(struct super_block *sb)
+{
+ struct net *net = sb->s_fs_info;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ mutex_lock(&sn->pipefs_sb_lock);
+ if (sn->pipefs_sb != sb) {
+ mutex_unlock(&sn->pipefs_sb_lock);
+ goto out;
+ }
+ sn->pipefs_sb = NULL;
+ dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n",
+ net, NET_NAME(net));
+ blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
+ RPC_PIPEFS_UMOUNT,
+ sb);
+ mutex_unlock(&sn->pipefs_sb_lock);
+ put_net(net);
+out:
+ kill_litter_super(sb);
+}
+
+static struct file_system_type rpc_pipe_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "rpc_pipefs",
+ .mount = rpc_mount,
+ .kill_sb = rpc_kill_sb,
+};
+MODULE_ALIAS_FS("rpc_pipefs");
+MODULE_ALIAS("rpc_pipefs");
+
+static void
+init_once(void *foo)
+{
+ struct rpc_inode *rpci = (struct rpc_inode *) foo;
+
+ inode_init_once(&rpci->vfs_inode);
+ rpci->private = NULL;
+ rpci->pipe = NULL;
+ init_waitqueue_head(&rpci->waitq);
+}
+
+int register_rpc_pipefs(void)
+{
+ int err;
+
+ rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
+ sizeof(struct rpc_inode),
+ 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once);
+ if (!rpc_inode_cachep)
+ return -ENOMEM;
+ err = rpc_clients_notifier_register();
+ if (err)
+ goto err_notifier;
+ err = register_filesystem(&rpc_pipe_fs_type);
+ if (err)
+ goto err_register;
+ return 0;
+
+err_register:
+ rpc_clients_notifier_unregister();
+err_notifier:
+ kmem_cache_destroy(rpc_inode_cachep);
+ return err;
+}
+
+void unregister_rpc_pipefs(void)
+{
+ rpc_clients_notifier_unregister();
+ kmem_cache_destroy(rpc_inode_cachep);
+ unregister_filesystem(&rpc_pipe_fs_type);
+}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
new file mode 100644
index 000000000..cf5770d8f
--- /dev/null
+++ b/net/sunrpc/rpcb_clnt.c
@@ -0,0 +1,1151 @@
+/*
+ * In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind
+ * protocol
+ *
+ * Based on RFC 1833: "Binding Protocols for ONC RPC Version 2" and
+ * RFC 3530: "Network File System (NFS) version 4 Protocol"
+ *
+ * Original: Gilles Quillard, Bull Open Source, 2005 <gilles.quillard@bull.net>
+ * Updated: Chuck Lever, Oracle Corporation, 2007 <chuck.lever@oracle.com>
+ *
+ * Descended from net/sunrpc/pmap_clnt.c,
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#include "netns.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_BIND
+#endif
+
+#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock"
+
+#define RPCBIND_PROGRAM (100000u)
+#define RPCBIND_PORT (111u)
+
+#define RPCBVERS_2 (2u)
+#define RPCBVERS_3 (3u)
+#define RPCBVERS_4 (4u)
+
+enum {
+ RPCBPROC_NULL,
+ RPCBPROC_SET,
+ RPCBPROC_UNSET,
+ RPCBPROC_GETPORT,
+ RPCBPROC_GETADDR = 3, /* alias for GETPORT */
+ RPCBPROC_DUMP,
+ RPCBPROC_CALLIT,
+ RPCBPROC_BCAST = 5, /* alias for CALLIT */
+ RPCBPROC_GETTIME,
+ RPCBPROC_UADDR2TADDR,
+ RPCBPROC_TADDR2UADDR,
+ RPCBPROC_GETVERSADDR,
+ RPCBPROC_INDIRECT,
+ RPCBPROC_GETADDRLIST,
+ RPCBPROC_GETSTAT,
+};
+
+/*
+ * r_owner
+ *
+ * The "owner" is allowed to unset a service in the rpcbind database.
+ *
+ * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a
+ * UID which it maps to a local user name via a password lookup.
+ * In all other cases it is ignored.
+ *
+ * For SET/UNSET requests, user space provides a value, even for
+ * network requests, and GETADDR uses an empty string. We follow
+ * those precedents here.
+ */
+#define RPCB_OWNER_STRING "0"
+#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING)
+
+/*
+ * XDR data type sizes
+ */
+#define RPCB_program_sz (1)
+#define RPCB_version_sz (1)
+#define RPCB_protocol_sz (1)
+#define RPCB_port_sz (1)
+#define RPCB_boolean_sz (1)
+
+#define RPCB_netid_sz (1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
+#define RPCB_addr_sz (1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
+#define RPCB_ownerstring_sz (1 + XDR_QUADLEN(RPCB_MAXOWNERLEN))
+
+/*
+ * XDR argument and result sizes
+ */
+#define RPCB_mappingargs_sz (RPCB_program_sz + RPCB_version_sz + \
+ RPCB_protocol_sz + RPCB_port_sz)
+#define RPCB_getaddrargs_sz (RPCB_program_sz + RPCB_version_sz + \
+ RPCB_netid_sz + RPCB_addr_sz + \
+ RPCB_ownerstring_sz)
+
+#define RPCB_getportres_sz RPCB_port_sz
+#define RPCB_setres_sz RPCB_boolean_sz
+
+/*
+ * Note that RFC 1833 does not put any size restrictions on the
+ * address string returned by the remote rpcbind database.
+ */
+#define RPCB_getaddrres_sz RPCB_addr_sz
+
+static void rpcb_getport_done(struct rpc_task *, void *);
+static void rpcb_map_release(void *data);
+static const struct rpc_program rpcb_program;
+
+struct rpcbind_args {
+ struct rpc_xprt * r_xprt;
+
+ u32 r_prog;
+ u32 r_vers;
+ u32 r_prot;
+ unsigned short r_port;
+ const char * r_netid;
+ const char * r_addr;
+ const char * r_owner;
+
+ int r_status;
+};
+
+static struct rpc_procinfo rpcb_procedures2[];
+static struct rpc_procinfo rpcb_procedures3[];
+static struct rpc_procinfo rpcb_procedures4[];
+
+struct rpcb_info {
+ u32 rpc_vers;
+ struct rpc_procinfo * rpc_proc;
+};
+
+static const struct rpcb_info rpcb_next_version[];
+static const struct rpcb_info rpcb_next_version6[];
+
+static const struct rpc_call_ops rpcb_getport_ops = {
+ .rpc_call_done = rpcb_getport_done,
+ .rpc_release = rpcb_map_release,
+};
+
+static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status)
+{
+ xprt_clear_binding(xprt);
+ rpc_wake_up_status(&xprt->binding, status);
+}
+
+static void rpcb_map_release(void *data)
+{
+ struct rpcbind_args *map = data;
+
+ rpcb_wake_rpcbind_waiters(map->r_xprt, map->r_status);
+ xprt_put(map->r_xprt);
+ kfree(map->r_addr);
+ kfree(map);
+}
+
+static int rpcb_get_local(struct net *net)
+{
+ int cnt;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ spin_lock(&sn->rpcb_clnt_lock);
+ if (sn->rpcb_users)
+ sn->rpcb_users++;
+ cnt = sn->rpcb_users;
+ spin_unlock(&sn->rpcb_clnt_lock);
+
+ return cnt;
+}
+
+void rpcb_put_local(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_clnt *clnt = sn->rpcb_local_clnt;
+ struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4;
+ int shutdown = 0;
+
+ spin_lock(&sn->rpcb_clnt_lock);
+ if (sn->rpcb_users) {
+ if (--sn->rpcb_users == 0) {
+ sn->rpcb_local_clnt = NULL;
+ sn->rpcb_local_clnt4 = NULL;
+ }
+ shutdown = !sn->rpcb_users;
+ }
+ spin_unlock(&sn->rpcb_clnt_lock);
+
+ if (shutdown) {
+ /*
+ * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister
+ */
+ if (clnt4)
+ rpc_shutdown_client(clnt4);
+ if (clnt)
+ rpc_shutdown_client(clnt);
+ }
+}
+
+static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
+ struct rpc_clnt *clnt4,
+ bool is_af_local)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ /* Protected by rpcb_create_local_mutex */
+ sn->rpcb_local_clnt = clnt;
+ sn->rpcb_local_clnt4 = clnt4;
+ sn->rpcb_is_af_local = is_af_local ? 1 : 0;
+ smp_wmb();
+ sn->rpcb_users = 1;
+ dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
+ "%p, rpcb_local_clnt4: %p) for net %p%s\n",
+ sn->rpcb_local_clnt, sn->rpcb_local_clnt4,
+ net, (net == &init_net) ? " (init_net)" : "");
+}
+
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+static int rpcb_create_local_unix(struct net *net)
+{
+ static const struct sockaddr_un rpcb_localaddr_rpcbind = {
+ .sun_family = AF_LOCAL,
+ .sun_path = RPCBIND_SOCK_PATHNAME,
+ };
+ struct rpc_create_args args = {
+ .net = net,
+ .protocol = XPRT_TRANSPORT_LOCAL,
+ .address = (struct sockaddr *)&rpcb_localaddr_rpcbind,
+ .addrsize = sizeof(rpcb_localaddr_rpcbind),
+ .servername = "localhost",
+ .program = &rpcb_program,
+ .version = RPCBVERS_2,
+ .authflavor = RPC_AUTH_NULL,
+ /*
+ * We turn off the idle timeout to prevent the kernel
+ * from automatically disconnecting the socket.
+ * Otherwise, we'd have to cache the mount namespace
+ * of the caller and somehow pass that to the socket
+ * reconnect code.
+ */
+ .flags = RPC_CLNT_CREATE_NO_IDLE_TIMEOUT,
+ };
+ struct rpc_clnt *clnt, *clnt4;
+ int result = 0;
+
+ /*
+ * Because we requested an RPC PING at transport creation time,
+ * this works only if the user space portmapper is rpcbind, and
+ * it's listening on AF_LOCAL on the named socket.
+ */
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("RPC: failed to create AF_LOCAL rpcbind "
+ "client (errno %ld).\n", PTR_ERR(clnt));
+ result = PTR_ERR(clnt);
+ goto out;
+ }
+
+ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+ if (IS_ERR(clnt4)) {
+ dprintk("RPC: failed to bind second program to "
+ "rpcbind v4 client (errno %ld).\n",
+ PTR_ERR(clnt4));
+ clnt4 = NULL;
+ }
+
+ rpcb_set_local(net, clnt, clnt4, true);
+
+out:
+ return result;
+}
+
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+static int rpcb_create_local_net(struct net *net)
+{
+ static const struct sockaddr_in rpcb_inaddr_loopback = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+ .sin_port = htons(RPCBIND_PORT),
+ };
+ struct rpc_create_args args = {
+ .net = net,
+ .protocol = XPRT_TRANSPORT_TCP,
+ .address = (struct sockaddr *)&rpcb_inaddr_loopback,
+ .addrsize = sizeof(rpcb_inaddr_loopback),
+ .servername = "localhost",
+ .program = &rpcb_program,
+ .version = RPCBVERS_2,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = RPC_CLNT_CREATE_NOPING,
+ };
+ struct rpc_clnt *clnt, *clnt4;
+ int result = 0;
+
+ clnt = rpc_create(&args);
+ if (IS_ERR(clnt)) {
+ dprintk("RPC: failed to create local rpcbind "
+ "client (errno %ld).\n", PTR_ERR(clnt));
+ result = PTR_ERR(clnt);
+ goto out;
+ }
+
+ /*
+ * This results in an RPC ping. On systems running portmapper,
+ * the v4 ping will fail. Proceed anyway, but disallow rpcb
+ * v4 upcalls.
+ */
+ clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+ if (IS_ERR(clnt4)) {
+ dprintk("RPC: failed to bind second program to "
+ "rpcbind v4 client (errno %ld).\n",
+ PTR_ERR(clnt4));
+ clnt4 = NULL;
+ }
+
+ rpcb_set_local(net, clnt, clnt4, false);
+
+out:
+ return result;
+}
+
+/*
+ * Returns zero on success, otherwise a negative errno value
+ * is returned.
+ */
+int rpcb_create_local(struct net *net)
+{
+ static DEFINE_MUTEX(rpcb_create_local_mutex);
+ int result = 0;
+
+ if (rpcb_get_local(net))
+ return result;
+
+ mutex_lock(&rpcb_create_local_mutex);
+ if (rpcb_get_local(net))
+ goto out;
+
+ if (rpcb_create_local_unix(net) != 0)
+ result = rpcb_create_local_net(net);
+
+out:
+ mutex_unlock(&rpcb_create_local_mutex);
+ return result;
+}
+
+static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename,
+ const char *hostname,
+ struct sockaddr *srvaddr, size_t salen,
+ int proto, u32 version)
+{
+ struct rpc_create_args args = {
+ .net = net,
+ .protocol = proto,
+ .address = srvaddr,
+ .addrsize = salen,
+ .servername = hostname,
+ .nodename = nodename,
+ .program = &rpcb_program,
+ .version = version,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = (RPC_CLNT_CREATE_NOPING |
+ RPC_CLNT_CREATE_NONPRIVPORT),
+ };
+
+ switch (srvaddr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
+ break;
+ default:
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ return rpc_create(&args);
+}
+
+static int rpcb_register_call(struct sunrpc_net *sn, struct rpc_clnt *clnt, struct rpc_message *msg, bool is_set)
+{
+ int flags = RPC_TASK_NOCONNECT;
+ int error, result = 0;
+
+ if (is_set || !sn->rpcb_is_af_local)
+ flags = RPC_TASK_SOFTCONN;
+ msg->rpc_resp = &result;
+
+ error = rpc_call_sync(clnt, msg, flags);
+ if (error < 0) {
+ dprintk("RPC: failed to contact local rpcbind "
+ "server (errno %d).\n", -error);
+ return error;
+ }
+
+ if (!result)
+ return -EACCES;
+ return 0;
+}
+
+/**
+ * rpcb_register - set or unset a port registration with the local rpcbind svc
+ * @net: target network namespace
+ * @prog: RPC program number to bind
+ * @vers: RPC version number to bind
+ * @prot: transport protocol to register
+ * @port: port value to register
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
+ *
+ * RPC services invoke this function to advertise their contact
+ * information via the system's rpcbind daemon. RPC services
+ * invoke this function once for each [program, version, transport]
+ * tuple they wish to advertise.
+ *
+ * Callers may also unregister RPC services that are no longer
+ * available by setting the passed-in port to zero. This removes
+ * all registered transports for [program, version] from the local
+ * rpcbind database.
+ *
+ * This function uses rpcbind protocol version 2 to contact the
+ * local rpcbind daemon.
+ *
+ * Registration works over both AF_INET and AF_INET6, and services
+ * registered via this function are advertised as available for any
+ * address. If the local rpcbind daemon is listening on AF_INET6,
+ * services registered via this function will be advertised on
+ * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
+ * addresses).
+ */
+int rpcb_register(struct net *net, u32 prog, u32 vers, int prot, unsigned short port)
+{
+ struct rpcbind_args map = {
+ .r_prog = prog,
+ .r_vers = vers,
+ .r_prot = prot,
+ .r_port = port,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = &map,
+ };
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ bool is_set = false;
+
+ dprintk("RPC: %sregistering (%u, %u, %d, %u) with local "
+ "rpcbind\n", (port ? "" : "un"),
+ prog, vers, prot, port);
+
+ msg.rpc_proc = &rpcb_procedures2[RPCBPROC_UNSET];
+ if (port != 0) {
+ msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
+ is_set = true;
+ }
+
+ return rpcb_register_call(sn, sn->rpcb_local_clnt, &msg, is_set);
+}
+
+/*
+ * Fill in AF_INET family-specific arguments to register
+ */
+static int rpcb_register_inet4(struct sunrpc_net *sn,
+ const struct sockaddr *sap,
+ struct rpc_message *msg)
+{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
+ struct rpcbind_args *map = msg->rpc_argp;
+ unsigned short port = ntohs(sin->sin_port);
+ bool is_set = false;
+ int result;
+
+ map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
+
+ dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
+ "local rpcbind\n", (port ? "" : "un"),
+ map->r_prog, map->r_vers,
+ map->r_addr, map->r_netid);
+
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+ if (port != 0) {
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
+ is_set = true;
+ }
+
+ result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set);
+ kfree(map->r_addr);
+ return result;
+}
+
+/*
+ * Fill in AF_INET6 family-specific arguments to register
+ */
+static int rpcb_register_inet6(struct sunrpc_net *sn,
+ const struct sockaddr *sap,
+ struct rpc_message *msg)
+{
+ const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
+ struct rpcbind_args *map = msg->rpc_argp;
+ unsigned short port = ntohs(sin6->sin6_port);
+ bool is_set = false;
+ int result;
+
+ map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
+
+ dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
+ "local rpcbind\n", (port ? "" : "un"),
+ map->r_prog, map->r_vers,
+ map->r_addr, map->r_netid);
+
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+ if (port != 0) {
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
+ is_set = true;
+ }
+
+ result = rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, is_set);
+ kfree(map->r_addr);
+ return result;
+}
+
+static int rpcb_unregister_all_protofamilies(struct sunrpc_net *sn,
+ struct rpc_message *msg)
+{
+ struct rpcbind_args *map = msg->rpc_argp;
+
+ dprintk("RPC: unregistering [%u, %u, '%s'] with "
+ "local rpcbind\n",
+ map->r_prog, map->r_vers, map->r_netid);
+
+ map->r_addr = "";
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+
+ return rpcb_register_call(sn, sn->rpcb_local_clnt4, msg, false);
+}
+
+/**
+ * rpcb_v4_register - set or unset a port registration with the local rpcbind
+ * @net: target network namespace
+ * @program: RPC program number of service to (un)register
+ * @version: RPC version number of service to (un)register
+ * @address: address family, IP address, and port to (un)register
+ * @netid: netid of transport protocol to (un)register
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success. Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
+ *
+ * RPC services invoke this function to advertise their contact
+ * information via the system's rpcbind daemon. RPC services
+ * invoke this function once for each [program, version, address,
+ * netid] tuple they wish to advertise.
+ *
+ * Callers may also unregister RPC services that are registered at a
+ * specific address by setting the port number in @address to zero.
+ * They may unregister all registered protocol families at once for
+ * a service by passing a NULL @address argument. If @netid is ""
+ * then all netids for [program, version, address] are unregistered.
+ *
+ * This function uses rpcbind protocol version 4 to contact the
+ * local rpcbind daemon. The local rpcbind daemon must support
+ * version 4 of the rpcbind protocol in order for these functions
+ * to register a service successfully.
+ *
+ * Supported netids include "udp" and "tcp" for UDP and TCP over
+ * IPv4, and "udp6" and "tcp6" for UDP and TCP over IPv6,
+ * respectively.
+ *
+ * The contents of @address determine the address family and the
+ * port to be registered. The usual practice is to pass INADDR_ANY
+ * as the raw address, but specifying a non-zero address is also
+ * supported by this API if the caller wishes to advertise an RPC
+ * service on a specific network interface.
+ *
+ * Note that passing in INADDR_ANY does not create the same service
+ * registration as IN6ADDR_ANY. The former advertises an RPC
+ * service on any IPv4 address, but not on IPv6. The latter
+ * advertises the service on all IPv4 and IPv6 addresses.
+ */
+int rpcb_v4_register(struct net *net, const u32 program, const u32 version,
+ const struct sockaddr *address, const char *netid)
+{
+ struct rpcbind_args map = {
+ .r_prog = program,
+ .r_vers = version,
+ .r_netid = netid,
+ .r_owner = RPCB_OWNER_STRING,
+ };
+ struct rpc_message msg = {
+ .rpc_argp = &map,
+ };
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ if (sn->rpcb_local_clnt4 == NULL)
+ return -EPROTONOSUPPORT;
+
+ if (address == NULL)
+ return rpcb_unregister_all_protofamilies(sn, &msg);
+
+ switch (address->sa_family) {
+ case AF_INET:
+ return rpcb_register_inet4(sn, address, &msg);
+ case AF_INET6:
+ return rpcb_register_inet6(sn, address, &msg);
+ }
+
+ return -EAFNOSUPPORT;
+}
+
+static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc)
+{
+ struct rpc_message msg = {
+ .rpc_proc = proc,
+ .rpc_argp = map,
+ .rpc_resp = map,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = rpcb_clnt,
+ .rpc_message = &msg,
+ .callback_ops = &rpcb_getport_ops,
+ .callback_data = map,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN,
+ };
+
+ return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * In the case where rpc clients have been cloned, we want to make
+ * sure that we use the program number/version etc of the actual
+ * owner of the xprt. To do so, we walk back up the tree of parents
+ * to find whoever created the transport and/or whoever has the
+ * autobind flag set.
+ */
+static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
+{
+ struct rpc_clnt *parent = clnt->cl_parent;
+ struct rpc_xprt *xprt = rcu_dereference(clnt->cl_xprt);
+
+ while (parent != clnt) {
+ if (rcu_dereference(parent->cl_xprt) != xprt)
+ break;
+ if (clnt->cl_autobind)
+ break;
+ clnt = parent;
+ parent = parent->cl_parent;
+ }
+ return clnt;
+}
+
+/**
+ * rpcb_getport_async - obtain the port for a given RPC service on a given host
+ * @task: task that is waiting for portmapper request
+ *
+ * This one can be called for an ongoing RPC request, and can be used in
+ * an async (rpciod) context.
+ */
+void rpcb_getport_async(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt;
+ struct rpc_procinfo *proc;
+ u32 bind_version;
+ struct rpc_xprt *xprt;
+ struct rpc_clnt *rpcb_clnt;
+ struct rpcbind_args *map;
+ struct rpc_task *child;
+ struct sockaddr_storage addr;
+ struct sockaddr *sap = (struct sockaddr *)&addr;
+ size_t salen;
+ int status;
+
+ rcu_read_lock();
+ do {
+ clnt = rpcb_find_transport_owner(task->tk_client);
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ } while (xprt == NULL);
+ rcu_read_unlock();
+
+ dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
+ task->tk_pid, __func__,
+ xprt->servername, clnt->cl_prog, clnt->cl_vers, xprt->prot);
+
+ /* Put self on the wait queue to ensure we get notified if
+ * some other task is already attempting to bind the port */
+ rpc_sleep_on(&xprt->binding, task, NULL);
+
+ if (xprt_test_and_set_binding(xprt)) {
+ dprintk("RPC: %5u %s: waiting for another binder\n",
+ task->tk_pid, __func__);
+ xprt_put(xprt);
+ return;
+ }
+
+ /* Someone else may have bound if we slept */
+ if (xprt_bound(xprt)) {
+ status = 0;
+ dprintk("RPC: %5u %s: already bound\n",
+ task->tk_pid, __func__);
+ goto bailout_nofree;
+ }
+
+ /* Parent transport's destination address */
+ salen = rpc_peeraddr(clnt, sap, sizeof(addr));
+
+ /* Don't ever use rpcbind v2 for AF_INET6 requests */
+ switch (sap->sa_family) {
+ case AF_INET:
+ proc = rpcb_next_version[xprt->bind_index].rpc_proc;
+ bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
+ break;
+ case AF_INET6:
+ proc = rpcb_next_version6[xprt->bind_index].rpc_proc;
+ bind_version = rpcb_next_version6[xprt->bind_index].rpc_vers;
+ break;
+ default:
+ status = -EAFNOSUPPORT;
+ dprintk("RPC: %5u %s: bad address family\n",
+ task->tk_pid, __func__);
+ goto bailout_nofree;
+ }
+ if (proc == NULL) {
+ xprt->bind_index = 0;
+ status = -EPFNOSUPPORT;
+ dprintk("RPC: %5u %s: no more getport versions available\n",
+ task->tk_pid, __func__);
+ goto bailout_nofree;
+ }
+
+ dprintk("RPC: %5u %s: trying rpcbind version %u\n",
+ task->tk_pid, __func__, bind_version);
+
+ rpcb_clnt = rpcb_create(xprt->xprt_net,
+ clnt->cl_nodename,
+ xprt->servername, sap, salen,
+ xprt->prot, bind_version);
+ if (IS_ERR(rpcb_clnt)) {
+ status = PTR_ERR(rpcb_clnt);
+ dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+ task->tk_pid, __func__, PTR_ERR(rpcb_clnt));
+ goto bailout_nofree;
+ }
+
+ map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
+ if (!map) {
+ status = -ENOMEM;
+ dprintk("RPC: %5u %s: no memory available\n",
+ task->tk_pid, __func__);
+ goto bailout_release_client;
+ }
+ map->r_prog = clnt->cl_prog;
+ map->r_vers = clnt->cl_vers;
+ map->r_prot = xprt->prot;
+ map->r_port = 0;
+ map->r_xprt = xprt;
+ map->r_status = -EIO;
+
+ switch (bind_version) {
+ case RPCBVERS_4:
+ case RPCBVERS_3:
+ map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
+ map->r_addr = rpc_sockaddr2uaddr(sap, GFP_ATOMIC);
+ map->r_owner = "";
+ break;
+ case RPCBVERS_2:
+ map->r_addr = NULL;
+ break;
+ default:
+ BUG();
+ }
+
+ child = rpcb_call_async(rpcb_clnt, map, proc);
+ rpc_release_client(rpcb_clnt);
+ if (IS_ERR(child)) {
+ /* rpcb_map_release() has freed the arguments */
+ dprintk("RPC: %5u %s: rpc_run_task failed\n",
+ task->tk_pid, __func__);
+ return;
+ }
+
+ xprt->stat.bind_count++;
+ rpc_put_task(child);
+ return;
+
+bailout_release_client:
+ rpc_release_client(rpcb_clnt);
+bailout_nofree:
+ rpcb_wake_rpcbind_waiters(xprt, status);
+ task->tk_status = status;
+ xprt_put(xprt);
+}
+EXPORT_SYMBOL_GPL(rpcb_getport_async);
+
+/*
+ * Rpcbind child task calls this callback via tk_exit.
+ */
+static void rpcb_getport_done(struct rpc_task *child, void *data)
+{
+ struct rpcbind_args *map = data;
+ struct rpc_xprt *xprt = map->r_xprt;
+ int status = child->tk_status;
+
+ /* Garbage reply: retry with a lesser rpcbind version */
+ if (status == -EIO)
+ status = -EPROTONOSUPPORT;
+
+ /* rpcbind server doesn't support this rpcbind protocol version */
+ if (status == -EPROTONOSUPPORT)
+ xprt->bind_index++;
+
+ if (status < 0) {
+ /* rpcbind server not available on remote host? */
+ xprt->ops->set_port(xprt, 0);
+ } else if (map->r_port == 0) {
+ /* Requested RPC service wasn't registered on remote host */
+ xprt->ops->set_port(xprt, 0);
+ status = -EACCES;
+ } else {
+ /* Succeeded */
+ xprt->ops->set_port(xprt, map->r_port);
+ xprt_set_bound(xprt);
+ status = 0;
+ }
+
+ dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n",
+ child->tk_pid, status, map->r_port);
+
+ map->r_status = status;
+}
+
+/*
+ * XDR functions for rpcbind
+ */
+
+static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const struct rpcbind_args *rpcb)
+{
+ __be32 *p;
+
+ dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n",
+ req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name,
+ rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
+
+ p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2);
+ *p++ = cpu_to_be32(rpcb->r_prog);
+ *p++ = cpu_to_be32(rpcb->r_vers);
+ *p++ = cpu_to_be32(rpcb->r_prot);
+ *p = cpu_to_be32(rpcb->r_port);
+}
+
+static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr,
+ struct rpcbind_args *rpcb)
+{
+ unsigned long port;
+ __be32 *p;
+
+ rpcb->r_port = 0;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ port = be32_to_cpup(p);
+ dprintk("RPC: %5u PMAP_%s result: %lu\n", req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name, port);
+ if (unlikely(port > USHRT_MAX))
+ return -EIO;
+
+ rpcb->r_port = port;
+ return 0;
+}
+
+static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr,
+ unsigned int *boolp)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ return -EIO;
+
+ *boolp = 0;
+ if (*p != xdr_zero)
+ *boolp = 1;
+
+ dprintk("RPC: %5u RPCB_%s call %s\n",
+ req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name,
+ (*boolp ? "succeeded" : "failed"));
+ return 0;
+}
+
+static void encode_rpcb_string(struct xdr_stream *xdr, const char *string,
+ const u32 maxstrlen)
+{
+ __be32 *p;
+ u32 len;
+
+ len = strlen(string);
+ WARN_ON_ONCE(len > maxstrlen);
+ if (len > maxstrlen)
+ /* truncate and hope for the best */
+ len = maxstrlen;
+ p = xdr_reserve_space(xdr, 4 + len);
+ xdr_encode_opaque(p, string, len);
+}
+
+static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ const struct rpcbind_args *rpcb)
+{
+ __be32 *p;
+
+ dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n",
+ req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name,
+ rpcb->r_prog, rpcb->r_vers,
+ rpcb->r_netid, rpcb->r_addr);
+
+ p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2);
+ *p++ = cpu_to_be32(rpcb->r_prog);
+ *p = cpu_to_be32(rpcb->r_vers);
+
+ encode_rpcb_string(xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN);
+ encode_rpcb_string(xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN);
+ encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN);
+}
+
+static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
+ struct rpcbind_args *rpcb)
+{
+ struct sockaddr_storage address;
+ struct sockaddr *sap = (struct sockaddr *)&address;
+ __be32 *p;
+ u32 len;
+
+ rpcb->r_port = 0;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(p == NULL))
+ goto out_fail;
+ len = be32_to_cpup(p);
+
+ /*
+ * If the returned universal address is a null string,
+ * the requested RPC service was not registered.
+ */
+ if (len == 0) {
+ dprintk("RPC: %5u RPCB reply: program not registered\n",
+ req->rq_task->tk_pid);
+ return 0;
+ }
+
+ if (unlikely(len > RPCBIND_MAXUADDRLEN))
+ goto out_fail;
+
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(p == NULL))
+ goto out_fail;
+ dprintk("RPC: %5u RPCB_%s reply: %s\n", req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name, (char *)p);
+
+ if (rpc_uaddr2sockaddr(req->rq_xprt->xprt_net, (char *)p, len,
+ sap, sizeof(address)) == 0)
+ goto out_fail;
+ rpcb->r_port = rpc_get_port(sap);
+
+ return 0;
+
+out_fail:
+ dprintk("RPC: %5u malformed RPCB_%s reply\n",
+ req->rq_task->tk_pid,
+ req->rq_task->tk_msg.rpc_proc->p_name);
+ return -EIO;
+}
+
+/*
+ * Not all rpcbind procedures described in RFC 1833 are implemented
+ * since the Linux kernel RPC code requires only these.
+ */
+
+static struct rpc_procinfo rpcb_procedures2[] = {
+ [RPCBPROC_SET] = {
+ .p_proc = RPCBPROC_SET,
+ .p_encode = (kxdreproc_t)rpcb_enc_mapping,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_mappingargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_SET,
+ .p_timer = 0,
+ .p_name = "SET",
+ },
+ [RPCBPROC_UNSET] = {
+ .p_proc = RPCBPROC_UNSET,
+ .p_encode = (kxdreproc_t)rpcb_enc_mapping,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_mappingargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_UNSET,
+ .p_timer = 0,
+ .p_name = "UNSET",
+ },
+ [RPCBPROC_GETPORT] = {
+ .p_proc = RPCBPROC_GETPORT,
+ .p_encode = (kxdreproc_t)rpcb_enc_mapping,
+ .p_decode = (kxdrdproc_t)rpcb_dec_getport,
+ .p_arglen = RPCB_mappingargs_sz,
+ .p_replen = RPCB_getportres_sz,
+ .p_statidx = RPCBPROC_GETPORT,
+ .p_timer = 0,
+ .p_name = "GETPORT",
+ },
+};
+
+static struct rpc_procinfo rpcb_procedures3[] = {
+ [RPCBPROC_SET] = {
+ .p_proc = RPCBPROC_SET,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_SET,
+ .p_timer = 0,
+ .p_name = "SET",
+ },
+ [RPCBPROC_UNSET] = {
+ .p_proc = RPCBPROC_UNSET,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_UNSET,
+ .p_timer = 0,
+ .p_name = "UNSET",
+ },
+ [RPCBPROC_GETADDR] = {
+ .p_proc = RPCBPROC_GETADDR,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_getaddr,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_getaddrres_sz,
+ .p_statidx = RPCBPROC_GETADDR,
+ .p_timer = 0,
+ .p_name = "GETADDR",
+ },
+};
+
+static struct rpc_procinfo rpcb_procedures4[] = {
+ [RPCBPROC_SET] = {
+ .p_proc = RPCBPROC_SET,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_SET,
+ .p_timer = 0,
+ .p_name = "SET",
+ },
+ [RPCBPROC_UNSET] = {
+ .p_proc = RPCBPROC_UNSET,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_set,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_setres_sz,
+ .p_statidx = RPCBPROC_UNSET,
+ .p_timer = 0,
+ .p_name = "UNSET",
+ },
+ [RPCBPROC_GETADDR] = {
+ .p_proc = RPCBPROC_GETADDR,
+ .p_encode = (kxdreproc_t)rpcb_enc_getaddr,
+ .p_decode = (kxdrdproc_t)rpcb_dec_getaddr,
+ .p_arglen = RPCB_getaddrargs_sz,
+ .p_replen = RPCB_getaddrres_sz,
+ .p_statidx = RPCBPROC_GETADDR,
+ .p_timer = 0,
+ .p_name = "GETADDR",
+ },
+};
+
+static const struct rpcb_info rpcb_next_version[] = {
+ {
+ .rpc_vers = RPCBVERS_2,
+ .rpc_proc = &rpcb_procedures2[RPCBPROC_GETPORT],
+ },
+ {
+ .rpc_proc = NULL,
+ },
+};
+
+static const struct rpcb_info rpcb_next_version6[] = {
+ {
+ .rpc_vers = RPCBVERS_4,
+ .rpc_proc = &rpcb_procedures4[RPCBPROC_GETADDR],
+ },
+ {
+ .rpc_vers = RPCBVERS_3,
+ .rpc_proc = &rpcb_procedures3[RPCBPROC_GETADDR],
+ },
+ {
+ .rpc_proc = NULL,
+ },
+};
+
+static const struct rpc_version rpcb_version2 = {
+ .number = RPCBVERS_2,
+ .nrprocs = ARRAY_SIZE(rpcb_procedures2),
+ .procs = rpcb_procedures2
+};
+
+static const struct rpc_version rpcb_version3 = {
+ .number = RPCBVERS_3,
+ .nrprocs = ARRAY_SIZE(rpcb_procedures3),
+ .procs = rpcb_procedures3
+};
+
+static const struct rpc_version rpcb_version4 = {
+ .number = RPCBVERS_4,
+ .nrprocs = ARRAY_SIZE(rpcb_procedures4),
+ .procs = rpcb_procedures4
+};
+
+static const struct rpc_version *rpcb_version[] = {
+ NULL,
+ NULL,
+ &rpcb_version2,
+ &rpcb_version3,
+ &rpcb_version4
+};
+
+static struct rpc_stat rpcb_stats;
+
+static const struct rpc_program rpcb_program = {
+ .name = "rpcbind",
+ .number = RPCBIND_PROGRAM,
+ .nrvers = ARRAY_SIZE(rpcb_version),
+ .version = rpcb_version,
+ .stats = &rpcb_stats,
+};
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
new file mode 100644
index 000000000..337ca851a
--- /dev/null
+++ b/net/sunrpc/sched.c
@@ -0,0 +1,1141 @@
+/*
+ * linux/net/sunrpc/sched.c
+ *
+ * Scheduling for synchronous and asynchronous RPC requests.
+ *
+ * Copyright (C) 1996 Olaf Kirch, <okir@monad.swb.de>
+ *
+ * TCP NFS related read + write fixes
+ * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#include "sunrpc.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+#define RPCDBG_FACILITY RPCDBG_SCHED
+#endif
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sunrpc.h>
+
+/*
+ * RPC slabs and memory pools
+ */
+#define RPC_BUFFER_MAXSIZE (2048)
+#define RPC_BUFFER_POOLSIZE (8)
+#define RPC_TASK_POOLSIZE (8)
+static struct kmem_cache *rpc_task_slabp __read_mostly;
+static struct kmem_cache *rpc_buffer_slabp __read_mostly;
+static mempool_t *rpc_task_mempool __read_mostly;
+static mempool_t *rpc_buffer_mempool __read_mostly;
+
+static void rpc_async_schedule(struct work_struct *);
+static void rpc_release_task(struct rpc_task *task);
+static void __rpc_queue_timer_fn(unsigned long ptr);
+
+/*
+ * RPC tasks sit here while waiting for conditions to improve.
+ */
+static struct rpc_wait_queue delay_queue;
+
+/*
+ * rpciod-related stuff
+ */
+struct workqueue_struct *rpciod_workqueue;
+
+/*
+ * Disable the timer for a given RPC task. Should be called with
+ * queue->lock and bh_disabled in order to avoid races within
+ * rpc_run_timer().
+ */
+static void
+__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ if (task->tk_timeout == 0)
+ return;
+ dprintk("RPC: %5u disabling timer\n", task->tk_pid);
+ task->tk_timeout = 0;
+ list_del(&task->u.tk_wait.timer_list);
+ if (list_empty(&queue->timer_list.list))
+ del_timer(&queue->timer_list.timer);
+}
+
+static void
+rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
+{
+ queue->timer_list.expires = expires;
+ mod_timer(&queue->timer_list.timer, expires);
+}
+
+/*
+ * Set up a timer for the current task.
+ */
+static void
+__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ if (!task->tk_timeout)
+ return;
+
+ dprintk("RPC: %5u setting alarm for %u ms\n",
+ task->tk_pid, jiffies_to_msecs(task->tk_timeout));
+
+ task->u.tk_wait.expires = jiffies + task->tk_timeout;
+ if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
+ rpc_set_queue_timer(queue, task->u.tk_wait.expires);
+ list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
+}
+
+static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue)
+{
+ struct list_head *q = &queue->tasks[queue->priority];
+ struct rpc_task *task;
+
+ if (!list_empty(q)) {
+ task = list_first_entry(q, struct rpc_task, u.tk_wait.list);
+ if (task->tk_owner == queue->owner)
+ list_move_tail(&task->u.tk_wait.list, q);
+ }
+}
+
+static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
+{
+ if (queue->priority != priority) {
+ /* Fairness: rotate the list when changing priority */
+ rpc_rotate_queue_owner(queue);
+ queue->priority = priority;
+ }
+}
+
+static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
+{
+ queue->owner = pid;
+ queue->nr = RPC_BATCH_COUNT;
+}
+
+static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
+{
+ rpc_set_waitqueue_priority(queue, queue->maxpriority);
+ rpc_set_waitqueue_owner(queue, 0);
+}
+
+/*
+ * Add new request to a priority queue.
+ */
+static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ struct list_head *q;
+ struct rpc_task *t;
+
+ INIT_LIST_HEAD(&task->u.tk_wait.links);
+ if (unlikely(queue_priority > queue->maxpriority))
+ queue_priority = queue->maxpriority;
+ if (queue_priority > queue->priority)
+ rpc_set_waitqueue_priority(queue, queue_priority);
+ q = &queue->tasks[queue_priority];
+ list_for_each_entry(t, q, u.tk_wait.list) {
+ if (t->tk_owner == task->tk_owner) {
+ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+ return;
+ }
+ }
+ list_add_tail(&task->u.tk_wait.list, q);
+}
+
+/*
+ * Add new request to wait queue.
+ *
+ * Swapper tasks always get inserted at the head of the queue.
+ * This should avoid many nasty memory deadlocks and hopefully
+ * improve overall performance.
+ * Everyone else gets appended to the queue to ensure proper FIFO behavior.
+ */
+static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ WARN_ON_ONCE(RPC_IS_QUEUED(task));
+ if (RPC_IS_QUEUED(task))
+ return;
+
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_add_wait_queue_priority(queue, task, queue_priority);
+ else if (RPC_IS_SWAPPER(task))
+ list_add(&task->u.tk_wait.list, &queue->tasks[0]);
+ else
+ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
+ task->tk_waitqueue = queue;
+ queue->qlen++;
+ /* barrier matches the read in rpc_wake_up_task_queue_locked() */
+ smp_wmb();
+ rpc_set_queued(task);
+
+ dprintk("RPC: %5u added to queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+}
+
+/*
+ * Remove request from a priority queue.
+ */
+static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
+{
+ struct rpc_task *t;
+
+ if (!list_empty(&task->u.tk_wait.links)) {
+ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
+ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
+ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
+ }
+}
+
+/*
+ * Remove request from queue.
+ * Note: must be called with spin lock held.
+ */
+static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ __rpc_disable_timer(queue, task);
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_remove_wait_queue_priority(task);
+ list_del(&task->u.tk_wait.list);
+ queue->qlen--;
+ dprintk("RPC: %5u removed from queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+}
+
+static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
+{
+ int i;
+
+ spin_lock_init(&queue->lock);
+ for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)
+ INIT_LIST_HEAD(&queue->tasks[i]);
+ queue->maxpriority = nr_queues - 1;
+ rpc_reset_waitqueue_priority(queue);
+ queue->qlen = 0;
+ setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue);
+ INIT_LIST_HEAD(&queue->timer_list.list);
+ rpc_assign_waitqueue_name(queue, qname);
+}
+
+void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname)
+{
+ __rpc_init_priority_wait_queue(queue, qname, RPC_NR_PRIORITY);
+}
+EXPORT_SYMBOL_GPL(rpc_init_priority_wait_queue);
+
+void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname)
+{
+ __rpc_init_priority_wait_queue(queue, qname, 1);
+}
+EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
+
+void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
+{
+ del_timer_sync(&queue->timer_list.timer);
+}
+EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
+
+static int rpc_wait_bit_killable(struct wait_bit_key *key)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ freezable_schedule_unsafe();
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS)
+static void rpc_task_set_debuginfo(struct rpc_task *task)
+{
+ static atomic_t rpc_pid;
+
+ task->tk_pid = atomic_inc_return(&rpc_pid);
+}
+#else
+static inline void rpc_task_set_debuginfo(struct rpc_task *task)
+{
+}
+#endif
+
+static void rpc_set_active(struct rpc_task *task)
+{
+ trace_rpc_task_begin(task->tk_client, task, NULL);
+
+ rpc_task_set_debuginfo(task);
+ set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
+}
+
+/*
+ * Mark an RPC call as having completed by clearing the 'active' bit
+ * and then waking up all tasks that were sleeping.
+ */
+static int rpc_complete_task(struct rpc_task *task)
+{
+ void *m = &task->tk_runstate;
+ wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE);
+ struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE);
+ unsigned long flags;
+ int ret;
+
+ trace_rpc_task_complete(task->tk_client, task, NULL);
+
+ spin_lock_irqsave(&wq->lock, flags);
+ clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
+ ret = atomic_dec_and_test(&task->tk_count);
+ if (waitqueue_active(wq))
+ __wake_up_locked_key(wq, TASK_NORMAL, &k);
+ spin_unlock_irqrestore(&wq->lock, flags);
+ return ret;
+}
+
+/*
+ * Allow callers to wait for completion of an RPC call
+ *
+ * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit()
+ * to enforce taking of the wq->lock and hence avoid races with
+ * rpc_complete_task().
+ */
+int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
+{
+ if (action == NULL)
+ action = rpc_wait_bit_killable;
+ return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
+ action, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
+
+/*
+ * Make an RPC task runnable.
+ *
+ * Note: If the task is ASYNC, and is being made runnable after sitting on an
+ * rpc_wait_queue, this must be called with the queue spinlock held to protect
+ * the wait queue operation.
+ * Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(),
+ * which is needed to ensure that __rpc_execute() doesn't loop (due to the
+ * lockless RPC_IS_QUEUED() test) before we've had a chance to test
+ * the RPC_TASK_RUNNING flag.
+ */
+static void rpc_make_runnable(struct rpc_task *task)
+{
+ bool need_wakeup = !rpc_test_and_set_running(task);
+
+ rpc_clear_queued(task);
+ if (!need_wakeup)
+ return;
+ if (RPC_IS_ASYNC(task)) {
+ INIT_WORK(&task->u.tk_work, rpc_async_schedule);
+ queue_work(rpciod_workqueue, &task->u.tk_work);
+ } else
+ wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
+}
+
+/*
+ * Prepare for sleeping on a wait queue.
+ * By always appending tasks to the list we ensure FIFO behavior.
+ * NB: An RPC task will only receive interrupt-driven events as long
+ * as it's on a wait queue.
+ */
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+ struct rpc_task *task,
+ rpc_action action,
+ unsigned char queue_priority)
+{
+ dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
+ task->tk_pid, rpc_qname(q), jiffies);
+
+ trace_rpc_task_sleep(task->tk_client, task, q);
+
+ __rpc_add_wait_queue(q, task, queue_priority);
+
+ WARN_ON_ONCE(task->tk_callback != NULL);
+ task->tk_callback = action;
+ __rpc_add_timer(q, task);
+}
+
+void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action)
+{
+ /* We shouldn't ever put an inactive task to sleep */
+ WARN_ON_ONCE(!RPC_IS_ACTIVATED(task));
+ if (!RPC_IS_ACTIVATED(task)) {
+ task->tk_status = -EIO;
+ rpc_put_task_async(task);
+ return;
+ }
+
+ /*
+ * Protect the queue operations.
+ */
+ spin_lock_bh(&q->lock);
+ __rpc_sleep_on_priority(q, task, action, task->tk_priority);
+ spin_unlock_bh(&q->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_sleep_on);
+
+void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, int priority)
+{
+ /* We shouldn't ever put an inactive task to sleep */
+ WARN_ON_ONCE(!RPC_IS_ACTIVATED(task));
+ if (!RPC_IS_ACTIVATED(task)) {
+ task->tk_status = -EIO;
+ rpc_put_task_async(task);
+ return;
+ }
+
+ /*
+ * Protect the queue operations.
+ */
+ spin_lock_bh(&q->lock);
+ __rpc_sleep_on_priority(q, task, action, priority - RPC_PRIORITY_LOW);
+ spin_unlock_bh(&q->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
+
+/**
+ * __rpc_do_wake_up_task - wake up a single rpc_task
+ * @queue: wait queue
+ * @task: task to be woken up
+ *
+ * Caller must hold queue->lock, and have cleared the task queued flag.
+ */
+static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
+ task->tk_pid, jiffies);
+
+ /* Has the task been executed yet? If not, we cannot wake it up! */
+ if (!RPC_IS_ACTIVATED(task)) {
+ printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
+ return;
+ }
+
+ trace_rpc_task_wakeup(task->tk_client, task, queue);
+
+ __rpc_remove_wait_queue(queue, task);
+
+ rpc_make_runnable(task);
+
+ dprintk("RPC: __rpc_wake_up_task done\n");
+}
+
+/*
+ * Wake up a queued task while the queue lock is being held
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ if (RPC_IS_QUEUED(task)) {
+ smp_rmb();
+ if (task->tk_waitqueue == queue)
+ __rpc_do_wake_up_task(queue, task);
+ }
+}
+
+/*
+ * Wake up a task on a specific queue
+ */
+void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ spin_lock_bh(&queue->lock);
+ rpc_wake_up_task_queue_locked(queue, task);
+ spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
+
+/*
+ * Wake up the next task on a priority queue.
+ */
+static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *queue)
+{
+ struct list_head *q;
+ struct rpc_task *task;
+
+ /*
+ * Service a batch of tasks from a single owner.
+ */
+ q = &queue->tasks[queue->priority];
+ if (!list_empty(q)) {
+ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ if (queue->owner == task->tk_owner) {
+ if (--queue->nr)
+ goto out;
+ list_move_tail(&task->u.tk_wait.list, q);
+ }
+ /*
+ * Check if we need to switch queues.
+ */
+ goto new_owner;
+ }
+
+ /*
+ * Service the next queue.
+ */
+ do {
+ if (q == &queue->tasks[0])
+ q = &queue->tasks[queue->maxpriority];
+ else
+ q = q - 1;
+ if (!list_empty(q)) {
+ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ goto new_queue;
+ }
+ } while (q != &queue->tasks[queue->priority]);
+
+ rpc_reset_waitqueue_priority(queue);
+ return NULL;
+
+new_queue:
+ rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
+new_owner:
+ rpc_set_waitqueue_owner(queue, task->tk_owner);
+out:
+ return task;
+}
+
+static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
+{
+ if (RPC_IS_PRIORITY(queue))
+ return __rpc_find_next_queued_priority(queue);
+ if (!list_empty(&queue->tasks[0]))
+ return list_first_entry(&queue->tasks[0], struct rpc_task, u.tk_wait.list);
+ return NULL;
+}
+
+/*
+ * Wake up the first task on the wait queue.
+ */
+struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+ bool (*func)(struct rpc_task *, void *), void *data)
+{
+ struct rpc_task *task = NULL;
+
+ dprintk("RPC: wake_up_first(%p \"%s\")\n",
+ queue, rpc_qname(queue));
+ spin_lock_bh(&queue->lock);
+ task = __rpc_find_next_queued(queue);
+ if (task != NULL) {
+ if (func(task, data))
+ rpc_wake_up_task_queue_locked(queue, task);
+ else
+ task = NULL;
+ }
+ spin_unlock_bh(&queue->lock);
+
+ return task;
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_first);
+
+static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
+{
+ return true;
+}
+
+/*
+ * Wake up the next task on the wait queue.
+*/
+struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue)
+{
+ return rpc_wake_up_first(queue, rpc_wake_up_next_func, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_next);
+
+/**
+ * rpc_wake_up - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ * Grabs queue->lock
+ */
+void rpc_wake_up(struct rpc_wait_queue *queue)
+{
+ struct list_head *head;
+
+ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+ struct rpc_task *task;
+ task = list_first_entry(head,
+ struct rpc_task,
+ u.tk_wait.list);
+ rpc_wake_up_task_queue_locked(queue, task);
+ }
+ if (head == &queue->tasks[0])
+ break;
+ head--;
+ }
+ spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up);
+
+/**
+ * rpc_wake_up_status - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ *
+ * Grabs queue->lock
+ */
+void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
+{
+ struct list_head *head;
+
+ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+ struct rpc_task *task;
+ task = list_first_entry(head,
+ struct rpc_task,
+ u.tk_wait.list);
+ task->tk_status = status;
+ rpc_wake_up_task_queue_locked(queue, task);
+ }
+ if (head == &queue->tasks[0])
+ break;
+ head--;
+ }
+ spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_status);
+
+static void __rpc_queue_timer_fn(unsigned long ptr)
+{
+ struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
+ struct rpc_task *task, *n;
+ unsigned long expires, now, timeo;
+
+ spin_lock(&queue->lock);
+ expires = now = jiffies;
+ list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
+ timeo = task->u.tk_wait.expires;
+ if (time_after_eq(now, timeo)) {
+ dprintk("RPC: %5u timeout\n", task->tk_pid);
+ task->tk_status = -ETIMEDOUT;
+ rpc_wake_up_task_queue_locked(queue, task);
+ continue;
+ }
+ if (expires == now || time_after(expires, timeo))
+ expires = timeo;
+ }
+ if (!list_empty(&queue->timer_list.list))
+ rpc_set_queue_timer(queue, expires);
+ spin_unlock(&queue->lock);
+}
+
+static void __rpc_atrun(struct rpc_task *task)
+{
+ if (task->tk_status == -ETIMEDOUT)
+ task->tk_status = 0;
+}
+
+/*
+ * Run a task at a later time
+ */
+void rpc_delay(struct rpc_task *task, unsigned long delay)
+{
+ task->tk_timeout = delay;
+ rpc_sleep_on(&delay_queue, task, __rpc_atrun);
+}
+EXPORT_SYMBOL_GPL(rpc_delay);
+
+/*
+ * Helper to call task->tk_ops->rpc_call_prepare
+ */
+void rpc_prepare_task(struct rpc_task *task)
+{
+ task->tk_ops->rpc_call_prepare(task, task->tk_calldata);
+}
+
+static void
+rpc_init_task_statistics(struct rpc_task *task)
+{
+ /* Initialize retry counters */
+ task->tk_garb_retry = 2;
+ task->tk_cred_retry = 2;
+ task->tk_rebind_retry = 2;
+
+ /* starting timestamp */
+ task->tk_start = ktime_get();
+}
+
+static void
+rpc_reset_task_statistics(struct rpc_task *task)
+{
+ task->tk_timeouts = 0;
+ task->tk_flags &= ~(RPC_CALL_MAJORSEEN|RPC_TASK_KILLED|RPC_TASK_SENT);
+
+ rpc_init_task_statistics(task);
+}
+
+/*
+ * Helper that calls task->tk_ops->rpc_call_done if it exists
+ */
+void rpc_exit_task(struct rpc_task *task)
+{
+ task->tk_action = NULL;
+ if (task->tk_ops->rpc_call_done != NULL) {
+ task->tk_ops->rpc_call_done(task, task->tk_calldata);
+ if (task->tk_action != NULL) {
+ WARN_ON(RPC_ASSASSINATED(task));
+ /* Always release the RPC slot and buffer memory */
+ xprt_release(task);
+ rpc_reset_task_statistics(task);
+ }
+ }
+}
+
+void rpc_exit(struct rpc_task *task, int status)
+{
+ task->tk_status = status;
+ task->tk_action = rpc_exit_task;
+ if (RPC_IS_QUEUED(task))
+ rpc_wake_up_queued_task(task->tk_waitqueue, task);
+}
+EXPORT_SYMBOL_GPL(rpc_exit);
+
+void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
+{
+ if (ops->rpc_release != NULL)
+ ops->rpc_release(calldata);
+}
+
+/*
+ * This is the RPC `scheduler' (or rather, the finite state machine).
+ */
+static void __rpc_execute(struct rpc_task *task)
+{
+ struct rpc_wait_queue *queue;
+ int task_is_async = RPC_IS_ASYNC(task);
+ int status = 0;
+
+ dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
+ task->tk_pid, task->tk_flags);
+
+ WARN_ON_ONCE(RPC_IS_QUEUED(task));
+ if (RPC_IS_QUEUED(task))
+ return;
+
+ for (;;) {
+ void (*do_action)(struct rpc_task *);
+
+ /*
+ * Execute any pending callback first.
+ */
+ do_action = task->tk_callback;
+ task->tk_callback = NULL;
+ if (do_action == NULL) {
+ /*
+ * Perform the next FSM step.
+ * tk_action may be NULL if the task has been killed.
+ * In particular, note that rpc_killall_tasks may
+ * do this at any time, so beware when dereferencing.
+ */
+ do_action = task->tk_action;
+ if (do_action == NULL)
+ break;
+ }
+ trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
+ do_action(task);
+
+ /*
+ * Lockless check for whether task is sleeping or not.
+ */
+ if (!RPC_IS_QUEUED(task))
+ continue;
+ /*
+ * The queue->lock protects against races with
+ * rpc_make_runnable().
+ *
+ * Note that once we clear RPC_TASK_RUNNING on an asynchronous
+ * rpc_task, rpc_make_runnable() can assign it to a
+ * different workqueue. We therefore cannot assume that the
+ * rpc_task pointer may still be dereferenced.
+ */
+ queue = task->tk_waitqueue;
+ spin_lock_bh(&queue->lock);
+ if (!RPC_IS_QUEUED(task)) {
+ spin_unlock_bh(&queue->lock);
+ continue;
+ }
+ rpc_clear_running(task);
+ spin_unlock_bh(&queue->lock);
+ if (task_is_async)
+ return;
+
+ /* sync task: sleep here */
+ dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+ status = out_of_line_wait_on_bit(&task->tk_runstate,
+ RPC_TASK_QUEUED, rpc_wait_bit_killable,
+ TASK_KILLABLE);
+ if (status == -ERESTARTSYS) {
+ /*
+ * When a sync task receives a signal, it exits with
+ * -ERESTARTSYS. In order to catch any callbacks that
+ * clean up after sleeping on some queue, we don't
+ * break the loop here, but go around once more.
+ */
+ dprintk("RPC: %5u got signal\n", task->tk_pid);
+ task->tk_flags |= RPC_TASK_KILLED;
+ rpc_exit(task, -ERESTARTSYS);
+ }
+ dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
+ }
+
+ dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
+ task->tk_status);
+ /* Release all resources associated with the task */
+ rpc_release_task(task);
+}
+
+/*
+ * User-visible entry point to the scheduler.
+ *
+ * This may be called recursively if e.g. an async NFS task updates
+ * the attributes and finds that dirty pages must be flushed.
+ * NOTE: Upon exit of this function the task is guaranteed to be
+ * released. In particular note that tk_release() will have
+ * been called, so your task memory may have been freed.
+ */
+void rpc_execute(struct rpc_task *task)
+{
+ bool is_async = RPC_IS_ASYNC(task);
+
+ rpc_set_active(task);
+ rpc_make_runnable(task);
+ if (!is_async)
+ __rpc_execute(task);
+}
+
+static void rpc_async_schedule(struct work_struct *work)
+{
+ __rpc_execute(container_of(work, struct rpc_task, u.tk_work));
+}
+
+/**
+ * rpc_malloc - allocate an RPC buffer
+ * @task: RPC task that will use this buffer
+ * @size: requested byte size
+ *
+ * To prevent rpciod from hanging, this allocator never sleeps,
+ * returning NULL and suppressing warning if the request cannot be serviced
+ * immediately.
+ * The caller can arrange to sleep in a way that is safe for rpciod.
+ *
+ * Most requests are 'small' (under 2KiB) and can be serviced from a
+ * mempool, ensuring that NFS reads and writes can always proceed,
+ * and that there is good locality of reference for these buffers.
+ *
+ * In order to avoid memory starvation triggering more writebacks of
+ * NFS requests, we avoid using GFP_KERNEL.
+ */
+void *rpc_malloc(struct rpc_task *task, size_t size)
+{
+ struct rpc_buffer *buf;
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
+
+ if (RPC_IS_SWAPPER(task))
+ gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+
+ size += sizeof(struct rpc_buffer);
+ if (size <= RPC_BUFFER_MAXSIZE)
+ buf = mempool_alloc(rpc_buffer_mempool, gfp);
+ else
+ buf = kmalloc(size, gfp);
+
+ if (!buf)
+ return NULL;
+
+ buf->len = size;
+ dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
+ task->tk_pid, size, buf);
+ return &buf->data;
+}
+EXPORT_SYMBOL_GPL(rpc_malloc);
+
+/**
+ * rpc_free - free buffer allocated via rpc_malloc
+ * @buffer: buffer to free
+ *
+ */
+void rpc_free(void *buffer)
+{
+ size_t size;
+ struct rpc_buffer *buf;
+
+ if (!buffer)
+ return;
+
+ buf = container_of(buffer, struct rpc_buffer, data);
+ size = buf->len;
+
+ dprintk("RPC: freeing buffer of size %zu at %p\n",
+ size, buf);
+
+ if (size <= RPC_BUFFER_MAXSIZE)
+ mempool_free(buf, rpc_buffer_mempool);
+ else
+ kfree(buf);
+}
+EXPORT_SYMBOL_GPL(rpc_free);
+
+/*
+ * Creation and deletion of RPC task structures
+ */
+static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
+{
+ memset(task, 0, sizeof(*task));
+ atomic_set(&task->tk_count, 1);
+ task->tk_flags = task_setup_data->flags;
+ task->tk_ops = task_setup_data->callback_ops;
+ task->tk_calldata = task_setup_data->callback_data;
+ INIT_LIST_HEAD(&task->tk_task);
+
+ task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
+ task->tk_owner = current->tgid;
+
+ /* Initialize workqueue for async tasks */
+ task->tk_workqueue = task_setup_data->workqueue;
+
+ if (task->tk_ops->rpc_call_prepare != NULL)
+ task->tk_action = rpc_prepare_task;
+
+ rpc_init_task_statistics(task);
+
+ dprintk("RPC: new task initialized, procpid %u\n",
+ task_pid_nr(current));
+}
+
+static struct rpc_task *
+rpc_alloc_task(void)
+{
+ return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
+}
+
+/*
+ * Create a new task for the specified client.
+ */
+struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
+{
+ struct rpc_task *task = setup_data->task;
+ unsigned short flags = 0;
+
+ if (task == NULL) {
+ task = rpc_alloc_task();
+ if (task == NULL) {
+ rpc_release_calldata(setup_data->callback_ops,
+ setup_data->callback_data);
+ return ERR_PTR(-ENOMEM);
+ }
+ flags = RPC_TASK_DYNAMIC;
+ }
+
+ rpc_init_task(task, setup_data);
+ task->tk_flags |= flags;
+ dprintk("RPC: allocated task %p\n", task);
+ return task;
+}
+
+/*
+ * rpc_free_task - release rpc task and perform cleanups
+ *
+ * Note that we free up the rpc_task _after_ rpc_release_calldata()
+ * in order to work around a workqueue dependency issue.
+ *
+ * Tejun Heo states:
+ * "Workqueue currently considers two work items to be the same if they're
+ * on the same address and won't execute them concurrently - ie. it
+ * makes a work item which is queued again while being executed wait
+ * for the previous execution to complete.
+ *
+ * If a work function frees the work item, and then waits for an event
+ * which should be performed by another work item and *that* work item
+ * recycles the freed work item, it can create a false dependency loop.
+ * There really is no reliable way to detect this short of verifying
+ * every memory free."
+ *
+ */
+static void rpc_free_task(struct rpc_task *task)
+{
+ unsigned short tk_flags = task->tk_flags;
+
+ rpc_release_calldata(task->tk_ops, task->tk_calldata);
+
+ if (tk_flags & RPC_TASK_DYNAMIC) {
+ dprintk("RPC: %5u freeing task\n", task->tk_pid);
+ mempool_free(task, rpc_task_mempool);
+ }
+}
+
+static void rpc_async_release(struct work_struct *work)
+{
+ rpc_free_task(container_of(work, struct rpc_task, u.tk_work));
+}
+
+static void rpc_release_resources_task(struct rpc_task *task)
+{
+ xprt_release(task);
+ if (task->tk_msg.rpc_cred) {
+ put_rpccred(task->tk_msg.rpc_cred);
+ task->tk_msg.rpc_cred = NULL;
+ }
+ rpc_task_release_client(task);
+}
+
+static void rpc_final_put_task(struct rpc_task *task,
+ struct workqueue_struct *q)
+{
+ if (q != NULL) {
+ INIT_WORK(&task->u.tk_work, rpc_async_release);
+ queue_work(q, &task->u.tk_work);
+ } else
+ rpc_free_task(task);
+}
+
+static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q)
+{
+ if (atomic_dec_and_test(&task->tk_count)) {
+ rpc_release_resources_task(task);
+ rpc_final_put_task(task, q);
+ }
+}
+
+void rpc_put_task(struct rpc_task *task)
+{
+ rpc_do_put_task(task, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_put_task);
+
+void rpc_put_task_async(struct rpc_task *task)
+{
+ rpc_do_put_task(task, task->tk_workqueue);
+}
+EXPORT_SYMBOL_GPL(rpc_put_task_async);
+
+static void rpc_release_task(struct rpc_task *task)
+{
+ dprintk("RPC: %5u release task\n", task->tk_pid);
+
+ WARN_ON_ONCE(RPC_IS_QUEUED(task));
+
+ rpc_release_resources_task(task);
+
+ /*
+ * Note: at this point we have been removed from rpc_clnt->cl_tasks,
+ * so it should be safe to use task->tk_count as a test for whether
+ * or not any other processes still hold references to our rpc_task.
+ */
+ if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) {
+ /* Wake up anyone who may be waiting for task completion */
+ if (!rpc_complete_task(task))
+ return;
+ } else {
+ if (!atomic_dec_and_test(&task->tk_count))
+ return;
+ }
+ rpc_final_put_task(task, task->tk_workqueue);
+}
+
+int rpciod_up(void)
+{
+ return try_module_get(THIS_MODULE) ? 0 : -EINVAL;
+}
+
+void rpciod_down(void)
+{
+ module_put(THIS_MODULE);
+}
+
+/*
+ * Start up the rpciod workqueue.
+ */
+static int rpciod_start(void)
+{
+ struct workqueue_struct *wq;
+
+ /*
+ * Create the rpciod thread and wait for it to start.
+ */
+ dprintk("RPC: creating workqueue rpciod\n");
+ /* Note: highpri because network receive is latency sensitive */
+ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ rpciod_workqueue = wq;
+ return rpciod_workqueue != NULL;
+}
+
+static void rpciod_stop(void)
+{
+ struct workqueue_struct *wq = NULL;
+
+ if (rpciod_workqueue == NULL)
+ return;
+ dprintk("RPC: destroying workqueue rpciod\n");
+
+ wq = rpciod_workqueue;
+ rpciod_workqueue = NULL;
+ destroy_workqueue(wq);
+}
+
+void
+rpc_destroy_mempool(void)
+{
+ rpciod_stop();
+ if (rpc_buffer_mempool)
+ mempool_destroy(rpc_buffer_mempool);
+ if (rpc_task_mempool)
+ mempool_destroy(rpc_task_mempool);
+ if (rpc_task_slabp)
+ kmem_cache_destroy(rpc_task_slabp);
+ if (rpc_buffer_slabp)
+ kmem_cache_destroy(rpc_buffer_slabp);
+ rpc_destroy_wait_queue(&delay_queue);
+}
+
+int
+rpc_init_mempool(void)
+{
+ /*
+ * The following is not strictly a mempool initialisation,
+ * but there is no harm in doing it here
+ */
+ rpc_init_wait_queue(&delay_queue, "delayq");
+ if (!rpciod_start())
+ goto err_nomem;
+
+ rpc_task_slabp = kmem_cache_create("rpc_tasks",
+ sizeof(struct rpc_task),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!rpc_task_slabp)
+ goto err_nomem;
+ rpc_buffer_slabp = kmem_cache_create("rpc_buffers",
+ RPC_BUFFER_MAXSIZE,
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!rpc_buffer_slabp)
+ goto err_nomem;
+ rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE,
+ rpc_task_slabp);
+ if (!rpc_task_mempool)
+ goto err_nomem;
+ rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE,
+ rpc_buffer_slabp);
+ if (!rpc_buffer_mempool)
+ goto err_nomem;
+ return 0;
+err_nomem:
+ rpc_destroy_mempool();
+ return -ENOMEM;
+}
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
new file mode 100644
index 000000000..2df87f78e
--- /dev/null
+++ b/net/sunrpc/socklib.c
@@ -0,0 +1,187 @@
+/*
+ * linux/net/sunrpc/socklib.c
+ *
+ * Common socket helper routines for RPC client and server
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/pagemap.h>
+#include <linux/udp.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/export.h>
+
+
+/**
+ * xdr_skb_read_bits - copy some data bits from skb to internal buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * Possibly called several times to iterate over an sk_buff and copy
+ * data out of it.
+ */
+size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+{
+ if (len > desc->count)
+ len = desc->count;
+ if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len)))
+ return 0;
+ desc->count -= len;
+ desc->offset += len;
+ return len;
+}
+EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
+
+/**
+ * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * Same as skb_read_bits, but calculate a checksum at the same time.
+ */
+static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+{
+ unsigned int pos;
+ __wsum csum2;
+
+ if (len > desc->count)
+ len = desc->count;
+ pos = desc->offset;
+ csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0);
+ desc->csum = csum_block_add(desc->csum, csum2, pos);
+ desc->count -= len;
+ desc->offset += len;
+ return len;
+}
+
+/**
+ * xdr_partial_copy_from_skb - copy data out of an skb
+ * @xdr: target XDR buffer
+ * @base: starting offset
+ * @desc: sk_buff copy helper
+ * @copy_actor: virtual method for copying data
+ *
+ */
+ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
+{
+ struct page **ppage = xdr->pages;
+ unsigned int len, pglen = xdr->page_len;
+ ssize_t copied = 0;
+ size_t ret;
+
+ len = xdr->head[0].iov_len;
+ if (base < len) {
+ len -= base;
+ ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+ copied += ret;
+ if (ret != len || !desc->count)
+ goto out;
+ base = 0;
+ } else
+ base -= len;
+
+ if (unlikely(pglen == 0))
+ goto copy_tail;
+ if (unlikely(base >= pglen)) {
+ base -= pglen;
+ goto copy_tail;
+ }
+ if (base || xdr->page_base) {
+ pglen -= base;
+ base += xdr->page_base;
+ ppage += base >> PAGE_CACHE_SHIFT;
+ base &= ~PAGE_CACHE_MASK;
+ }
+ do {
+ char *kaddr;
+
+ /* ACL likes to be lazy in allocating pages - ACLs
+ * are small by default but can get huge. */
+ if (unlikely(*ppage == NULL)) {
+ *ppage = alloc_page(GFP_ATOMIC);
+ if (unlikely(*ppage == NULL)) {
+ if (copied == 0)
+ copied = -ENOMEM;
+ goto out;
+ }
+ }
+
+ len = PAGE_CACHE_SIZE;
+ kaddr = kmap_atomic(*ppage);
+ if (base) {
+ len -= base;
+ if (pglen < len)
+ len = pglen;
+ ret = copy_actor(desc, kaddr + base, len);
+ base = 0;
+ } else {
+ if (pglen < len)
+ len = pglen;
+ ret = copy_actor(desc, kaddr, len);
+ }
+ flush_dcache_page(*ppage);
+ kunmap_atomic(kaddr);
+ copied += ret;
+ if (ret != len || !desc->count)
+ goto out;
+ ppage++;
+ } while ((pglen -= len) != 0);
+copy_tail:
+ len = xdr->tail[0].iov_len;
+ if (base < len)
+ copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+ return copied;
+}
+EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
+
+/**
+ * csum_partial_copy_to_xdr - checksum and copy data
+ * @xdr: target XDR buffer
+ * @skb: source skb
+ *
+ * We have set things up such that we perform the checksum of the UDP
+ * packet in parallel with the copies into the RPC client iovec. -DaveM
+ */
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+ struct xdr_skb_reader desc;
+
+ desc.skb = skb;
+ desc.offset = sizeof(struct udphdr);
+ desc.count = skb->len - desc.offset;
+
+ if (skb_csum_unnecessary(skb))
+ goto no_checksum;
+
+ desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0)
+ return -1;
+ if (desc.offset != skb->len) {
+ __wsum csum2;
+ csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
+ desc.csum = csum_block_add(desc.csum, csum2, desc.offset);
+ }
+ if (desc.count)
+ return -1;
+ if (csum_fold(desc.csum))
+ return -1;
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev);
+ return 0;
+no_checksum:
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
+ return -1;
+ if (desc.count)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
new file mode 100644
index 000000000..2ecb99431
--- /dev/null
+++ b/net/sunrpc/stats.c
@@ -0,0 +1,305 @@
+/*
+ * linux/net/sunrpc/stats.c
+ *
+ * procfs-based user access to generic RPC statistics. The stats files
+ * reside in /proc/net/rpc.
+ *
+ * The read routines assume that the buffer passed in is just big enough.
+ * If you implement an RPC service that has its own stats routine which
+ * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE
+ * limit.
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/rcupdate.h>
+
+#include "netns.h"
+
+#define RPCDBG_FACILITY RPCDBG_MISC
+
+/*
+ * Get RPC client stats
+ */
+static int rpc_proc_show(struct seq_file *seq, void *v) {
+ const struct rpc_stat *statp = seq->private;
+ const struct rpc_program *prog = statp->program;
+ unsigned int i, j;
+
+ seq_printf(seq,
+ "net %u %u %u %u\n",
+ statp->netcnt,
+ statp->netudpcnt,
+ statp->nettcpcnt,
+ statp->nettcpconn);
+ seq_printf(seq,
+ "rpc %u %u %u\n",
+ statp->rpccnt,
+ statp->rpcretrans,
+ statp->rpcauthrefresh);
+
+ for (i = 0; i < prog->nrvers; i++) {
+ const struct rpc_version *vers = prog->version[i];
+ if (!vers)
+ continue;
+ seq_printf(seq, "proc%u %u",
+ vers->number, vers->nrprocs);
+ for (j = 0; j < vers->nrprocs; j++)
+ seq_printf(seq, " %u",
+ vers->procs[j].p_count);
+ seq_putc(seq, '\n');
+ }
+ return 0;
+}
+
+static int rpc_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rpc_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations rpc_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rpc_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Get RPC server stats
+ */
+void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
+ const struct svc_program *prog = statp->program;
+ const struct svc_procedure *proc;
+ const struct svc_version *vers;
+ unsigned int i, j;
+
+ seq_printf(seq,
+ "net %u %u %u %u\n",
+ statp->netcnt,
+ statp->netudpcnt,
+ statp->nettcpcnt,
+ statp->nettcpconn);
+ seq_printf(seq,
+ "rpc %u %u %u %u %u\n",
+ statp->rpccnt,
+ statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt,
+ statp->rpcbadfmt,
+ statp->rpcbadauth,
+ statp->rpcbadclnt);
+
+ for (i = 0; i < prog->pg_nvers; i++) {
+ if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc))
+ continue;
+ seq_printf(seq, "proc%d %u", i, vers->vs_nproc);
+ for (j = 0; j < vers->vs_nproc; j++, proc++)
+ seq_printf(seq, " %u", proc->pc_count);
+ seq_putc(seq, '\n');
+ }
+}
+EXPORT_SYMBOL_GPL(svc_seq_show);
+
+/**
+ * rpc_alloc_iostats - allocate an rpc_iostats structure
+ * @clnt: RPC program, version, and xprt
+ *
+ */
+struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
+{
+ struct rpc_iostats *stats;
+ int i;
+
+ stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL);
+ if (stats) {
+ for (i = 0; i < clnt->cl_maxproc; i++)
+ spin_lock_init(&stats[i].om_lock);
+ }
+ return stats;
+}
+EXPORT_SYMBOL_GPL(rpc_alloc_iostats);
+
+/**
+ * rpc_free_iostats - release an rpc_iostats structure
+ * @stats: doomed rpc_iostats structure
+ *
+ */
+void rpc_free_iostats(struct rpc_iostats *stats)
+{
+ kfree(stats);
+}
+EXPORT_SYMBOL_GPL(rpc_free_iostats);
+
+/**
+ * rpc_count_iostats_metrics - tally up per-task stats
+ * @task: completed rpc_task
+ * @op_metrics: stat structure for OP that will accumulate stats from @task
+ */
+void rpc_count_iostats_metrics(const struct rpc_task *task,
+ struct rpc_iostats *op_metrics)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ ktime_t delta, now;
+
+ if (!op_metrics || !req)
+ return;
+
+ now = ktime_get();
+ spin_lock(&op_metrics->om_lock);
+
+ op_metrics->om_ops++;
+ op_metrics->om_ntrans += req->rq_ntrans;
+ op_metrics->om_timeouts += task->tk_timeouts;
+
+ op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent;
+ op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd;
+
+ delta = ktime_sub(req->rq_xtime, task->tk_start);
+ op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta);
+
+ op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt);
+
+ delta = ktime_sub(now, task->tk_start);
+ op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta);
+
+ spin_unlock(&op_metrics->om_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics);
+
+/**
+ * rpc_count_iostats - tally up per-task stats
+ * @task: completed rpc_task
+ * @stats: array of stat structures
+ *
+ * Uses the statidx from @task
+ */
+void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats)
+{
+ rpc_count_iostats_metrics(task,
+ &stats[task->tk_msg.rpc_proc->p_statidx]);
+}
+EXPORT_SYMBOL_GPL(rpc_count_iostats);
+
+static void _print_name(struct seq_file *seq, unsigned int op,
+ struct rpc_procinfo *procs)
+{
+ if (procs[op].p_name)
+ seq_printf(seq, "\t%12s: ", procs[op].p_name);
+ else if (op == 0)
+ seq_printf(seq, "\t NULL: ");
+ else
+ seq_printf(seq, "\t%12u: ", op);
+}
+
+void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
+{
+ struct rpc_iostats *stats = clnt->cl_metrics;
+ struct rpc_xprt *xprt;
+ unsigned int op, maxproc = clnt->cl_maxproc;
+
+ if (!stats)
+ return;
+
+ seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS);
+ seq_printf(seq, "p/v: %u/%u (%s)\n",
+ clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name);
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ if (xprt)
+ xprt->ops->print_stats(xprt, seq);
+ rcu_read_unlock();
+
+ seq_printf(seq, "\tper-op statistics\n");
+ for (op = 0; op < maxproc; op++) {
+ struct rpc_iostats *metrics = &stats[op];
+ _print_name(seq, op, clnt->cl_procinfo);
+ seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+ metrics->om_ops,
+ metrics->om_ntrans,
+ metrics->om_timeouts,
+ metrics->om_bytes_sent,
+ metrics->om_bytes_recv,
+ ktime_to_ms(metrics->om_queue),
+ ktime_to_ms(metrics->om_rtt),
+ ktime_to_ms(metrics->om_execute));
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_print_iostats);
+
+/*
+ * Register/unregister RPC proc files
+ */
+static inline struct proc_dir_entry *
+do_register(struct net *net, const char *name, void *data,
+ const struct file_operations *fops)
+{
+ struct sunrpc_net *sn;
+
+ dprintk("RPC: registering /proc/net/rpc/%s\n", name);
+ sn = net_generic(net, sunrpc_net_id);
+ return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
+}
+
+struct proc_dir_entry *
+rpc_proc_register(struct net *net, struct rpc_stat *statp)
+{
+ return do_register(net, statp->program->name, statp, &rpc_proc_fops);
+}
+EXPORT_SYMBOL_GPL(rpc_proc_register);
+
+void
+rpc_proc_unregister(struct net *net, const char *name)
+{
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ remove_proc_entry(name, sn->proc_net_rpc);
+}
+EXPORT_SYMBOL_GPL(rpc_proc_unregister);
+
+struct proc_dir_entry *
+svc_proc_register(struct net *net, struct svc_stat *statp, const struct file_operations *fops)
+{
+ return do_register(net, statp->program->pg_name, statp, fops);
+}
+EXPORT_SYMBOL_GPL(svc_proc_register);
+
+void
+svc_proc_unregister(struct net *net, const char *name)
+{
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ remove_proc_entry(name, sn->proc_net_rpc);
+}
+EXPORT_SYMBOL_GPL(svc_proc_unregister);
+
+int rpc_proc_init(struct net *net)
+{
+ struct sunrpc_net *sn;
+
+ dprintk("RPC: registering /proc/net/rpc\n");
+ sn = net_generic(net, sunrpc_net_id);
+ sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
+ if (sn->proc_net_rpc == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rpc_proc_exit(struct net *net)
+{
+ dprintk("RPC: unregistering /proc/net/rpc\n");
+ remove_proc_entry("rpc", net->proc_net);
+}
+
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
new file mode 100644
index 000000000..f2b7cb540
--- /dev/null
+++ b/net/sunrpc/sunrpc.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+
+(c) 2008 NetApp. All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * Functions and macros used internally by RPC
+ */
+
+#ifndef _NET_SUNRPC_SUNRPC_H
+#define _NET_SUNRPC_SUNRPC_H
+
+#include <linux/net.h>
+
+/*
+ * Header for dynamically allocated rpc buffers.
+ */
+struct rpc_buffer {
+ size_t len;
+ char data[];
+};
+
+static inline int rpc_reply_expected(struct rpc_task *task)
+{
+ return (task->tk_msg.rpc_proc != NULL) &&
+ (task->tk_msg.rpc_proc->p_decode != NULL);
+}
+
+static inline int sock_is_loopback(struct sock *sk)
+{
+ struct dst_entry *dst;
+ int loopback = 0;
+ rcu_read_lock();
+ dst = rcu_dereference(sk->sk_dst_cache);
+ if (dst && dst->dev &&
+ (dst->dev->features & NETIF_F_LOOPBACK))
+ loopback = 1;
+ rcu_read_unlock();
+ return loopback;
+}
+
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+ struct page *headpage, unsigned long headoffset,
+ struct page *tailpage, unsigned long tailoffset);
+
+int rpc_clients_notifier_register(void);
+void rpc_clients_notifier_unregister(void);
+#endif /* _NET_SUNRPC_SUNRPC_H */
+
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
new file mode 100644
index 000000000..ee5d3d253
--- /dev/null
+++ b/net/sunrpc/sunrpc_syms.c
@@ -0,0 +1,136 @@
+/*
+ * linux/net/sunrpc/sunrpc_syms.c
+ *
+ * Symbols exported by the sunrpc module.
+ *
+ * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#include "netns.h"
+
+int sunrpc_net_id;
+EXPORT_SYMBOL_GPL(sunrpc_net_id);
+
+static __net_init int sunrpc_init_net(struct net *net)
+{
+ int err;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ err = rpc_proc_init(net);
+ if (err)
+ goto err_proc;
+
+ err = ip_map_cache_create(net);
+ if (err)
+ goto err_ipmap;
+
+ err = unix_gid_cache_create(net);
+ if (err)
+ goto err_unixgid;
+
+ err = rpc_pipefs_init_net(net);
+ if (err)
+ goto err_pipefs;
+
+ INIT_LIST_HEAD(&sn->all_clients);
+ spin_lock_init(&sn->rpc_client_lock);
+ spin_lock_init(&sn->rpcb_clnt_lock);
+ return 0;
+
+err_pipefs:
+ unix_gid_cache_destroy(net);
+err_unixgid:
+ ip_map_cache_destroy(net);
+err_ipmap:
+ rpc_proc_exit(net);
+err_proc:
+ return err;
+}
+
+static __net_exit void sunrpc_exit_net(struct net *net)
+{
+ rpc_pipefs_exit_net(net);
+ unix_gid_cache_destroy(net);
+ ip_map_cache_destroy(net);
+ rpc_proc_exit(net);
+}
+
+static struct pernet_operations sunrpc_net_ops = {
+ .init = sunrpc_init_net,
+ .exit = sunrpc_exit_net,
+ .id = &sunrpc_net_id,
+ .size = sizeof(struct sunrpc_net),
+};
+
+static int __init
+init_sunrpc(void)
+{
+ int err = rpc_init_mempool();
+ if (err)
+ goto out;
+ err = rpcauth_init_module();
+ if (err)
+ goto out2;
+
+ cache_initialize();
+
+ err = register_pernet_subsys(&sunrpc_net_ops);
+ if (err)
+ goto out3;
+
+ err = register_rpc_pipefs();
+ if (err)
+ goto out4;
+
+ sunrpc_debugfs_init();
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ rpc_register_sysctl();
+#endif
+ svc_init_xprt_sock(); /* svc sock transport */
+ init_socket_xprt(); /* clnt sock transport */
+ return 0;
+
+out4:
+ unregister_pernet_subsys(&sunrpc_net_ops);
+out3:
+ rpcauth_remove_module();
+out2:
+ rpc_destroy_mempool();
+out:
+ return err;
+}
+
+static void __exit
+cleanup_sunrpc(void)
+{
+ rpcauth_remove_module();
+ cleanup_socket_xprt();
+ svc_cleanup_xprt_sock();
+ sunrpc_debugfs_exit();
+ unregister_rpc_pipefs();
+ rpc_destroy_mempool();
+ unregister_pernet_subsys(&sunrpc_net_ops);
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ rpc_unregister_sysctl();
+#endif
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+MODULE_LICENSE("GPL");
+fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */
+module_exit(cleanup_sunrpc);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
new file mode 100644
index 000000000..78974e4d9
--- /dev/null
+++ b/net/sunrpc/svc.c
@@ -0,0 +1,1405 @@
+/*
+ * linux/net/sunrpc/svc.c
+ *
+ * High-level RPC service routines
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple threads pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <trace/events/sunrpc.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCDSP
+
+static void svc_unregister(const struct svc_serv *serv, struct net *net);
+
+#define svc_serv_is_pooled(serv) ((serv)->sv_function)
+
+/*
+ * Mode for mapping cpus to pools.
+ */
+enum {
+ SVC_POOL_AUTO = -1, /* choose one of the others */
+ SVC_POOL_GLOBAL, /* no mapping, just a single global pool
+ * (legacy & UP mode) */
+ SVC_POOL_PERCPU, /* one pool per cpu */
+ SVC_POOL_PERNODE /* one pool per numa node */
+};
+#define SVC_POOL_DEFAULT SVC_POOL_GLOBAL
+
+/*
+ * Structure for mapping cpus to pools and vice versa.
+ * Setup once during sunrpc initialisation.
+ */
+static struct svc_pool_map {
+ int count; /* How many svc_servs use us */
+ int mode; /* Note: int not enum to avoid
+ * warnings about "enumeration value
+ * not handled in switch" */
+ unsigned int npools;
+ unsigned int *pool_to; /* maps pool id to cpu or node */
+ unsigned int *to_pool; /* maps cpu or node to pool id */
+} svc_pool_map = {
+ .count = 0,
+ .mode = SVC_POOL_DEFAULT
+};
+static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
+
+static int
+param_set_pool_mode(const char *val, struct kernel_param *kp)
+{
+ int *ip = (int *)kp->arg;
+ struct svc_pool_map *m = &svc_pool_map;
+ int err;
+
+ mutex_lock(&svc_pool_map_mutex);
+
+ err = -EBUSY;
+ if (m->count)
+ goto out;
+
+ err = 0;
+ if (!strncmp(val, "auto", 4))
+ *ip = SVC_POOL_AUTO;
+ else if (!strncmp(val, "global", 6))
+ *ip = SVC_POOL_GLOBAL;
+ else if (!strncmp(val, "percpu", 6))
+ *ip = SVC_POOL_PERCPU;
+ else if (!strncmp(val, "pernode", 7))
+ *ip = SVC_POOL_PERNODE;
+ else
+ err = -EINVAL;
+
+out:
+ mutex_unlock(&svc_pool_map_mutex);
+ return err;
+}
+
+static int
+param_get_pool_mode(char *buf, struct kernel_param *kp)
+{
+ int *ip = (int *)kp->arg;
+
+ switch (*ip)
+ {
+ case SVC_POOL_AUTO:
+ return strlcpy(buf, "auto", 20);
+ case SVC_POOL_GLOBAL:
+ return strlcpy(buf, "global", 20);
+ case SVC_POOL_PERCPU:
+ return strlcpy(buf, "percpu", 20);
+ case SVC_POOL_PERNODE:
+ return strlcpy(buf, "pernode", 20);
+ default:
+ return sprintf(buf, "%d", *ip);
+ }
+}
+
+module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
+ &svc_pool_map.mode, 0644);
+
+/*
+ * Detect best pool mapping mode heuristically,
+ * according to the machine's topology.
+ */
+static int
+svc_pool_map_choose_mode(void)
+{
+ unsigned int node;
+
+ if (nr_online_nodes > 1) {
+ /*
+ * Actually have multiple NUMA nodes,
+ * so split pools on NUMA node boundaries
+ */
+ return SVC_POOL_PERNODE;
+ }
+
+ node = first_online_node;
+ if (nr_cpus_node(node) > 2) {
+ /*
+ * Non-trivial SMP, or CONFIG_NUMA on
+ * non-NUMA hardware, e.g. with a generic
+ * x86_64 kernel on Xeons. In this case we
+ * want to divide the pools on cpu boundaries.
+ */
+ return SVC_POOL_PERCPU;
+ }
+
+ /* default: one global pool */
+ return SVC_POOL_GLOBAL;
+}
+
+/*
+ * Allocate the to_pool[] and pool_to[] arrays.
+ * Returns 0 on success or an errno.
+ */
+static int
+svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
+{
+ m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+ if (!m->to_pool)
+ goto fail;
+ m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
+ if (!m->pool_to)
+ goto fail_free;
+
+ return 0;
+
+fail_free:
+ kfree(m->to_pool);
+ m->to_pool = NULL;
+fail:
+ return -ENOMEM;
+}
+
+/*
+ * Initialise the pool map for SVC_POOL_PERCPU mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_percpu(struct svc_pool_map *m)
+{
+ unsigned int maxpools = nr_cpu_ids;
+ unsigned int pidx = 0;
+ unsigned int cpu;
+ int err;
+
+ err = svc_pool_map_alloc_arrays(m, maxpools);
+ if (err)
+ return err;
+
+ for_each_online_cpu(cpu) {
+ BUG_ON(pidx >= maxpools);
+ m->to_pool[cpu] = pidx;
+ m->pool_to[pidx] = cpu;
+ pidx++;
+ }
+ /* cpus brought online later all get mapped to pool0, sorry */
+
+ return pidx;
+};
+
+
+/*
+ * Initialise the pool map for SVC_POOL_PERNODE mode.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_pernode(struct svc_pool_map *m)
+{
+ unsigned int maxpools = nr_node_ids;
+ unsigned int pidx = 0;
+ unsigned int node;
+ int err;
+
+ err = svc_pool_map_alloc_arrays(m, maxpools);
+ if (err)
+ return err;
+
+ for_each_node_with_cpus(node) {
+ /* some architectures (e.g. SN2) have cpuless nodes */
+ BUG_ON(pidx > maxpools);
+ m->to_pool[node] = pidx;
+ m->pool_to[pidx] = node;
+ pidx++;
+ }
+ /* nodes brought online later all get mapped to pool0, sorry */
+
+ return pidx;
+}
+
+
+/*
+ * Add a reference to the global map of cpus to pools (and
+ * vice versa). Initialise the map if we're the first user.
+ * Returns the number of pools.
+ */
+static unsigned int
+svc_pool_map_get(void)
+{
+ struct svc_pool_map *m = &svc_pool_map;
+ int npools = -1;
+
+ mutex_lock(&svc_pool_map_mutex);
+
+ if (m->count++) {
+ mutex_unlock(&svc_pool_map_mutex);
+ return m->npools;
+ }
+
+ if (m->mode == SVC_POOL_AUTO)
+ m->mode = svc_pool_map_choose_mode();
+
+ switch (m->mode) {
+ case SVC_POOL_PERCPU:
+ npools = svc_pool_map_init_percpu(m);
+ break;
+ case SVC_POOL_PERNODE:
+ npools = svc_pool_map_init_pernode(m);
+ break;
+ }
+
+ if (npools < 0) {
+ /* default, or memory allocation failure */
+ npools = 1;
+ m->mode = SVC_POOL_GLOBAL;
+ }
+ m->npools = npools;
+
+ mutex_unlock(&svc_pool_map_mutex);
+ return m->npools;
+}
+
+
+/*
+ * Drop a reference to the global map of cpus to pools.
+ * When the last reference is dropped, the map data is
+ * freed; this allows the sysadmin to change the pool
+ * mode using the pool_mode module option without
+ * rebooting or re-loading sunrpc.ko.
+ */
+static void
+svc_pool_map_put(void)
+{
+ struct svc_pool_map *m = &svc_pool_map;
+
+ mutex_lock(&svc_pool_map_mutex);
+
+ if (!--m->count) {
+ kfree(m->to_pool);
+ m->to_pool = NULL;
+ kfree(m->pool_to);
+ m->pool_to = NULL;
+ m->npools = 0;
+ }
+
+ mutex_unlock(&svc_pool_map_mutex);
+}
+
+
+static int svc_pool_map_get_node(unsigned int pidx)
+{
+ const struct svc_pool_map *m = &svc_pool_map;
+
+ if (m->count) {
+ if (m->mode == SVC_POOL_PERCPU)
+ return cpu_to_node(m->pool_to[pidx]);
+ if (m->mode == SVC_POOL_PERNODE)
+ return m->pool_to[pidx];
+ }
+ return NUMA_NO_NODE;
+}
+/*
+ * Set the given thread's cpus_allowed mask so that it
+ * will only run on cpus in the given pool.
+ */
+static inline void
+svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
+{
+ struct svc_pool_map *m = &svc_pool_map;
+ unsigned int node = m->pool_to[pidx];
+
+ /*
+ * The caller checks for sv_nrpools > 1, which
+ * implies that we've been initialized.
+ */
+ WARN_ON_ONCE(m->count == 0);
+ if (m->count == 0)
+ return;
+
+ switch (m->mode) {
+ case SVC_POOL_PERCPU:
+ {
+ set_cpus_allowed_ptr(task, cpumask_of(node));
+ break;
+ }
+ case SVC_POOL_PERNODE:
+ {
+ set_cpus_allowed_ptr(task, cpumask_of_node(node));
+ break;
+ }
+ }
+}
+
+/*
+ * Use the mapping mode to choose a pool for a given CPU.
+ * Used when enqueueing an incoming RPC. Always returns
+ * a non-NULL pool pointer.
+ */
+struct svc_pool *
+svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+ struct svc_pool_map *m = &svc_pool_map;
+ unsigned int pidx = 0;
+
+ /*
+ * An uninitialised map happens in a pure client when
+ * lockd is brought up, so silently treat it the
+ * same as SVC_POOL_GLOBAL.
+ */
+ if (svc_serv_is_pooled(serv)) {
+ switch (m->mode) {
+ case SVC_POOL_PERCPU:
+ pidx = m->to_pool[cpu];
+ break;
+ case SVC_POOL_PERNODE:
+ pidx = m->to_pool[cpu_to_node(cpu)];
+ break;
+ }
+ }
+ return &serv->sv_pools[pidx % serv->sv_nrpools];
+}
+
+int svc_rpcb_setup(struct svc_serv *serv, struct net *net)
+{
+ int err;
+
+ err = rpcb_create_local(net);
+ if (err)
+ return err;
+
+ /* Remove any stale portmap registrations */
+ svc_unregister(serv, net);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(svc_rpcb_setup);
+
+void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net)
+{
+ svc_unregister(serv, net);
+ rpcb_put_local(net);
+}
+EXPORT_SYMBOL_GPL(svc_rpcb_cleanup);
+
+static int svc_uses_rpcbind(struct svc_serv *serv)
+{
+ struct svc_program *progp;
+ unsigned int i;
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (i = 0; i < progp->pg_nvers; i++) {
+ if (progp->pg_vers[i] == NULL)
+ continue;
+ if (progp->pg_vers[i]->vs_hidden == 0)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+int svc_bind(struct svc_serv *serv, struct net *net)
+{
+ if (!svc_uses_rpcbind(serv))
+ return 0;
+ return svc_rpcb_setup(serv, net);
+}
+EXPORT_SYMBOL_GPL(svc_bind);
+
+/*
+ * Create an RPC service
+ */
+static struct svc_serv *
+__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
+ void (*shutdown)(struct svc_serv *serv, struct net *net))
+{
+ struct svc_serv *serv;
+ unsigned int vers;
+ unsigned int xdrsize;
+ unsigned int i;
+
+ if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
+ return NULL;
+ serv->sv_name = prog->pg_name;
+ serv->sv_program = prog;
+ serv->sv_nrthreads = 1;
+ serv->sv_stats = prog->pg_stats;
+ if (bufsize > RPCSVC_MAXPAYLOAD)
+ bufsize = RPCSVC_MAXPAYLOAD;
+ serv->sv_max_payload = bufsize? bufsize : 4096;
+ serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
+ serv->sv_shutdown = shutdown;
+ xdrsize = 0;
+ while (prog) {
+ prog->pg_lovers = prog->pg_nvers-1;
+ for (vers=0; vers<prog->pg_nvers ; vers++)
+ if (prog->pg_vers[vers]) {
+ prog->pg_hivers = vers;
+ if (prog->pg_lovers > vers)
+ prog->pg_lovers = vers;
+ if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+ xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+ }
+ prog = prog->pg_next;
+ }
+ serv->sv_xdrsize = xdrsize;
+ INIT_LIST_HEAD(&serv->sv_tempsocks);
+ INIT_LIST_HEAD(&serv->sv_permsocks);
+ init_timer(&serv->sv_temptimer);
+ spin_lock_init(&serv->sv_lock);
+
+ serv->sv_nrpools = npools;
+ serv->sv_pools =
+ kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
+ GFP_KERNEL);
+ if (!serv->sv_pools) {
+ kfree(serv);
+ return NULL;
+ }
+
+ for (i = 0; i < serv->sv_nrpools; i++) {
+ struct svc_pool *pool = &serv->sv_pools[i];
+
+ dprintk("svc: initialising pool %u for %s\n",
+ i, serv->sv_name);
+
+ pool->sp_id = i;
+ INIT_LIST_HEAD(&pool->sp_sockets);
+ INIT_LIST_HEAD(&pool->sp_all_threads);
+ spin_lock_init(&pool->sp_lock);
+ }
+
+ return serv;
+}
+
+struct svc_serv *
+svc_create(struct svc_program *prog, unsigned int bufsize,
+ void (*shutdown)(struct svc_serv *serv, struct net *net))
+{
+ return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+}
+EXPORT_SYMBOL_GPL(svc_create);
+
+struct svc_serv *
+svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
+ void (*shutdown)(struct svc_serv *serv, struct net *net),
+ svc_thread_fn func, struct module *mod)
+{
+ struct svc_serv *serv;
+ unsigned int npools = svc_pool_map_get();
+
+ serv = __svc_create(prog, bufsize, npools, shutdown);
+ if (!serv)
+ goto out_err;
+
+ serv->sv_function = func;
+ serv->sv_module = mod;
+ return serv;
+out_err:
+ svc_pool_map_put();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(svc_create_pooled);
+
+void svc_shutdown_net(struct svc_serv *serv, struct net *net)
+{
+ svc_close_net(serv, net);
+
+ if (serv->sv_shutdown)
+ serv->sv_shutdown(serv, net);
+}
+EXPORT_SYMBOL_GPL(svc_shutdown_net);
+
+/*
+ * Destroy an RPC service. Should be called with appropriate locking to
+ * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
+ */
+void
+svc_destroy(struct svc_serv *serv)
+{
+ dprintk("svc: svc_destroy(%s, %d)\n",
+ serv->sv_program->pg_name,
+ serv->sv_nrthreads);
+
+ if (serv->sv_nrthreads) {
+ if (--(serv->sv_nrthreads) != 0) {
+ svc_sock_update_bufs(serv);
+ return;
+ }
+ } else
+ printk("svc_destroy: no threads for serv=%p!\n", serv);
+
+ del_timer_sync(&serv->sv_temptimer);
+
+ /*
+ * The last user is gone and thus all sockets have to be destroyed to
+ * the point. Check this.
+ */
+ BUG_ON(!list_empty(&serv->sv_permsocks));
+ BUG_ON(!list_empty(&serv->sv_tempsocks));
+
+ cache_clean_deferred(serv);
+
+ if (svc_serv_is_pooled(serv))
+ svc_pool_map_put();
+
+ kfree(serv->sv_pools);
+ kfree(serv);
+}
+EXPORT_SYMBOL_GPL(svc_destroy);
+
+/*
+ * Allocate an RPC server's buffer space.
+ * We allocate pages and place them in rq_argpages.
+ */
+static int
+svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
+{
+ unsigned int pages, arghi;
+
+ /* bc_xprt uses fore channel allocated buffers */
+ if (svc_is_backchannel(rqstp))
+ return 1;
+
+ pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply.
+ * We assume one is at most one page
+ */
+ arghi = 0;
+ WARN_ON_ONCE(pages > RPCSVC_MAXPAGES);
+ if (pages > RPCSVC_MAXPAGES)
+ pages = RPCSVC_MAXPAGES;
+ while (pages) {
+ struct page *p = alloc_pages_node(node, GFP_KERNEL, 0);
+ if (!p)
+ break;
+ rqstp->rq_pages[arghi++] = p;
+ pages--;
+ }
+ return pages == 0;
+}
+
+/*
+ * Release an RPC server buffer
+ */
+static void
+svc_release_buffer(struct svc_rqst *rqstp)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++)
+ if (rqstp->rq_pages[i])
+ put_page(rqstp->rq_pages[i]);
+}
+
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+{
+ struct svc_rqst *rqstp;
+
+ rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node);
+ if (!rqstp)
+ goto out_enomem;
+
+ serv->sv_nrthreads++;
+ __set_bit(RQ_BUSY, &rqstp->rq_flags);
+ spin_lock_init(&rqstp->rq_lock);
+ rqstp->rq_server = serv;
+ rqstp->rq_pool = pool;
+ spin_lock_bh(&pool->sp_lock);
+ pool->sp_nrthreads++;
+ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
+ spin_unlock_bh(&pool->sp_lock);
+
+ rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
+ if (!rqstp->rq_argp)
+ goto out_thread;
+
+ rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
+ if (!rqstp->rq_resp)
+ goto out_thread;
+
+ if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node))
+ goto out_thread;
+
+ return rqstp;
+out_thread:
+ svc_exit_thread(rqstp);
+out_enomem:
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(svc_prepare_thread);
+
+/*
+ * Choose a pool in which to create a new thread, for svc_set_num_threads
+ */
+static inline struct svc_pool *
+choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+{
+ if (pool != NULL)
+ return pool;
+
+ return &serv->sv_pools[(*state)++ % serv->sv_nrpools];
+}
+
+/*
+ * Choose a thread to kill, for svc_set_num_threads
+ */
+static inline struct task_struct *
+choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+{
+ unsigned int i;
+ struct task_struct *task = NULL;
+
+ if (pool != NULL) {
+ spin_lock_bh(&pool->sp_lock);
+ } else {
+ /* choose a pool in round-robin fashion */
+ for (i = 0; i < serv->sv_nrpools; i++) {
+ pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
+ spin_lock_bh(&pool->sp_lock);
+ if (!list_empty(&pool->sp_all_threads))
+ goto found_pool;
+ spin_unlock_bh(&pool->sp_lock);
+ }
+ return NULL;
+ }
+
+found_pool:
+ if (!list_empty(&pool->sp_all_threads)) {
+ struct svc_rqst *rqstp;
+
+ /*
+ * Remove from the pool->sp_all_threads list
+ * so we don't try to kill it again.
+ */
+ rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
+ set_bit(RQ_VICTIM, &rqstp->rq_flags);
+ list_del_rcu(&rqstp->rq_all);
+ task = rqstp->rq_task;
+ }
+ spin_unlock_bh(&pool->sp_lock);
+
+ return task;
+}
+
+/*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number. If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools. Caller must ensure that mutual exclusion between this and
+ * server startup or shutdown.
+ *
+ * Destroying threads relies on the service threads filling in
+ * rqstp->rq_task, which only the nfs ones do. Assumes the serv
+ * has been created using svc_create_pooled().
+ *
+ * Based on code that used to be in nfsd_svc() but tweaked
+ * to be pool-aware.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct svc_rqst *rqstp;
+ struct task_struct *task;
+ struct svc_pool *chosen_pool;
+ int error = 0;
+ unsigned int state = serv->sv_nrthreads-1;
+ int node;
+
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
+ }
+
+ /* create new threads */
+ while (nrservs > 0) {
+ nrservs--;
+ chosen_pool = choose_pool(serv, pool, &state);
+
+ node = svc_pool_map_get_node(chosen_pool->sp_id);
+ rqstp = svc_prepare_thread(serv, chosen_pool, node);
+ if (IS_ERR(rqstp)) {
+ error = PTR_ERR(rqstp);
+ break;
+ }
+
+ __module_get(serv->sv_module);
+ task = kthread_create_on_node(serv->sv_function, rqstp,
+ node, "%s", serv->sv_name);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ module_put(serv->sv_module);
+ svc_exit_thread(rqstp);
+ break;
+ }
+
+ rqstp->rq_task = task;
+ if (serv->sv_nrpools > 1)
+ svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+
+ svc_sock_update_bufs(serv);
+ wake_up_process(task);
+ }
+ /* destroy old threads */
+ while (nrservs < 0 &&
+ (task = choose_victim(serv, pool, &state)) != NULL) {
+ send_sig(SIGINT, task, 1);
+ nrservs++;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(svc_set_num_threads);
+
+/*
+ * Called from a server thread as it's exiting. Caller must hold the "service
+ * mutex" for the service.
+ */
+void
+svc_exit_thread(struct svc_rqst *rqstp)
+{
+ struct svc_serv *serv = rqstp->rq_server;
+ struct svc_pool *pool = rqstp->rq_pool;
+
+ svc_release_buffer(rqstp);
+ kfree(rqstp->rq_resp);
+ kfree(rqstp->rq_argp);
+ kfree(rqstp->rq_auth_data);
+
+ spin_lock_bh(&pool->sp_lock);
+ pool->sp_nrthreads--;
+ if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags))
+ list_del_rcu(&rqstp->rq_all);
+ spin_unlock_bh(&pool->sp_lock);
+
+ kfree_rcu(rqstp, rq_rcu_head);
+
+ /* Release the server */
+ if (serv)
+ svc_destroy(serv);
+}
+EXPORT_SYMBOL_GPL(svc_exit_thread);
+
+/*
+ * Register an "inet" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register4(struct net *net, const u32 program,
+ const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ const struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+ const char *netid;
+ int error;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ error = rpcb_v4_register(net, program, version,
+ (const struct sockaddr *)&sin, netid);
+
+ /*
+ * User space didn't support rpcbind v4, so retry this
+ * registration request with the legacy rpcbind v2 protocol.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = rpcb_register(net, program, version, protocol, port);
+
+ return error;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * Register an "inet6" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request.
+ *
+ * No netconfig infrastructure is available in the kernel, so
+ * we map IP_ protocol numbers to netids by hand.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register6(struct net *net, const u32 program,
+ const u32 version,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ const struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+ const char *netid;
+ int error;
+
+ switch (protocol) {
+ case IPPROTO_UDP:
+ netid = RPCBIND_NETID_UDP6;
+ break;
+ case IPPROTO_TCP:
+ netid = RPCBIND_NETID_TCP6;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ error = rpcb_v4_register(net, program, version,
+ (const struct sockaddr *)&sin6, netid);
+
+ /*
+ * User space didn't support rpcbind version 4, so we won't
+ * use a PF_INET6 listener.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = -EAFNOSUPPORT;
+
+ return error;
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/*
+ * Register a kernel RPC service via rpcbind version 4.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(struct net *net, const char *progname,
+ const u32 program, const u32 version,
+ const int family,
+ const unsigned short protocol,
+ const unsigned short port)
+{
+ int error = -EAFNOSUPPORT;
+
+ switch (family) {
+ case PF_INET:
+ error = __svc_rpcb_register4(net, program, version,
+ protocol, port);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6:
+ error = __svc_rpcb_register6(net, program, version,
+ protocol, port);
+#endif
+ }
+
+ return error;
+}
+
+/**
+ * svc_register - register an RPC service with the local portmapper
+ * @serv: svc_serv struct for the service to register
+ * @net: net namespace for the service to register
+ * @family: protocol family of service's listener socket
+ * @proto: transport protocol number to advertise
+ * @port: port to advertise
+ *
+ * Service is registered for any address in the passed-in protocol family
+ */
+int svc_register(const struct svc_serv *serv, struct net *net,
+ const int family, const unsigned short proto,
+ const unsigned short port)
+{
+ struct svc_program *progp;
+ struct svc_version *vers;
+ unsigned int i;
+ int error = 0;
+
+ WARN_ON_ONCE(proto == 0 && port == 0);
+ if (proto == 0 && port == 0)
+ return -EINVAL;
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (i = 0; i < progp->pg_nvers; i++) {
+ vers = progp->pg_vers[i];
+ if (vers == NULL)
+ continue;
+
+ dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
+ progp->pg_name,
+ i,
+ proto == IPPROTO_UDP? "udp" : "tcp",
+ port,
+ family,
+ vers->vs_hidden ?
+ " (but not telling portmap)" : "");
+
+ if (vers->vs_hidden)
+ continue;
+
+ error = __svc_register(net, progp->pg_name, progp->pg_prog,
+ i, family, proto, port);
+
+ if (vers->vs_rpcb_optnl) {
+ error = 0;
+ continue;
+ }
+
+ if (error < 0) {
+ printk(KERN_WARNING "svc: failed to register "
+ "%sv%u RPC service (errno %d).\n",
+ progp->pg_name, i, -error);
+ break;
+ }
+ }
+ }
+
+ return error;
+}
+
+/*
+ * If user space is running rpcbind, it should take the v4 UNSET
+ * and clear everything for this [program, version]. If user space
+ * is running portmap, it will reject the v4 UNSET, but won't have
+ * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient
+ * in this case to clear all existing entries for [program, version].
+ */
+static void __svc_unregister(struct net *net, const u32 program, const u32 version,
+ const char *progname)
+{
+ int error;
+
+ error = rpcb_v4_register(net, program, version, NULL, "");
+
+ /*
+ * User space didn't support rpcbind v4, so retry this
+ * request with the legacy rpcbind v2 protocol.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = rpcb_register(net, program, version, 0, 0);
+
+ dprintk("svc: %s(%sv%u), error %d\n",
+ __func__, progname, version, error);
+}
+
+/*
+ * All netids, bind addresses and ports registered for [program, version]
+ * are removed from the local rpcbind database (if the service is not
+ * hidden) to make way for a new instance of the service.
+ *
+ * The result of unregistration is reported via dprintk for those who want
+ * verification of the result, but is otherwise not important.
+ */
+static void svc_unregister(const struct svc_serv *serv, struct net *net)
+{
+ struct svc_program *progp;
+ unsigned long flags;
+ unsigned int i;
+
+ clear_thread_flag(TIF_SIGPENDING);
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+ for (i = 0; i < progp->pg_nvers; i++) {
+ if (progp->pg_vers[i] == NULL)
+ continue;
+ if (progp->pg_vers[i]->vs_hidden)
+ continue;
+
+ dprintk("svc: attempting to unregister %sv%u\n",
+ progp->pg_name, i);
+ __svc_unregister(net, progp->pg_prog, i, progp->pg_name);
+ }
+ }
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/*
+ * dprintk the given error with the address of the client that caused it.
+ */
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+static __printf(2, 3)
+void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ char buf[RPC_MAX_ADDRBUFLEN];
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ dprintk("svc: %s: %pV", svc_print_addr(rqstp, buf, sizeof(buf)), &vaf);
+
+ va_end(args);
+}
+#else
+static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
+#endif
+
+/*
+ * Common routine for processing the RPC request.
+ */
+static int
+svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+{
+ struct svc_program *progp;
+ struct svc_version *versp = NULL; /* compiler food */
+ struct svc_procedure *procp = NULL;
+ struct svc_serv *serv = rqstp->rq_server;
+ kxdrproc_t xdr;
+ __be32 *statp;
+ u32 prog, vers, proc;
+ __be32 auth_stat, rpc_stat;
+ int auth_res;
+ __be32 *reply_statp;
+
+ rpc_stat = rpc_success;
+
+ if (argv->iov_len < 6*4)
+ goto err_short_len;
+
+ /* Will be turned off only in gss privacy case: */
+ set_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ /* Will be turned off only when NFSv4 Sessions are used */
+ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ clear_bit(RQ_DROPME, &rqstp->rq_flags);
+
+ /* Setup reply header */
+ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+
+ svc_putu32(resv, rqstp->rq_xid);
+
+ vers = svc_getnl(argv);
+
+ /* First words of reply: */
+ svc_putnl(resv, 1); /* REPLY */
+
+ if (vers != 2) /* RPC version number */
+ goto err_bad_rpc;
+
+ /* Save position in case we later decide to reject: */
+ reply_statp = resv->iov_base + resv->iov_len;
+
+ svc_putnl(resv, 0); /* ACCEPT */
+
+ rqstp->rq_prog = prog = svc_getnl(argv); /* program number */
+ rqstp->rq_vers = vers = svc_getnl(argv); /* version number */
+ rqstp->rq_proc = proc = svc_getnl(argv); /* procedure number */
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next)
+ if (prog == progp->pg_prog)
+ break;
+
+ /*
+ * Decode auth data, and add verifier to reply buffer.
+ * We do this before anything else in order to get a decent
+ * auth verifier.
+ */
+ auth_res = svc_authenticate(rqstp, &auth_stat);
+ /* Also give the program a chance to reject this call: */
+ if (auth_res == SVC_OK && progp) {
+ auth_stat = rpc_autherr_badcred;
+ auth_res = progp->pg_authenticate(rqstp);
+ }
+ switch (auth_res) {
+ case SVC_OK:
+ break;
+ case SVC_GARBAGE:
+ goto err_garbage;
+ case SVC_SYSERR:
+ rpc_stat = rpc_system_err;
+ goto err_bad;
+ case SVC_DENIED:
+ goto err_bad_auth;
+ case SVC_CLOSE:
+ if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+ svc_close_xprt(rqstp->rq_xprt);
+ case SVC_DROP:
+ goto dropit;
+ case SVC_COMPLETE:
+ goto sendit;
+ }
+
+ if (progp == NULL)
+ goto err_bad_prog;
+
+ if (vers >= progp->pg_nvers ||
+ !(versp = progp->pg_vers[vers]))
+ goto err_bad_vers;
+
+ procp = versp->vs_proc + proc;
+ if (proc >= versp->vs_nproc || !procp->pc_func)
+ goto err_bad_proc;
+ rqstp->rq_procinfo = procp;
+
+ /* Syntactic check complete */
+ serv->sv_stats->rpccnt++;
+
+ /* Build the reply header. */
+ statp = resv->iov_base +resv->iov_len;
+ svc_putnl(resv, RPC_SUCCESS);
+
+ /* Bump per-procedure stats counter */
+ procp->pc_count++;
+
+ /* Initialize storage for argp and resp */
+ memset(rqstp->rq_argp, 0, procp->pc_argsize);
+ memset(rqstp->rq_resp, 0, procp->pc_ressize);
+
+ /* un-reserve some of the out-queue now that we have a
+ * better idea of reply size
+ */
+ if (procp->pc_xdrressize)
+ svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
+
+ /* Call the function that processes the request. */
+ if (!versp->vs_dispatch) {
+ /* Decode arguments */
+ xdr = procp->pc_decode;
+ if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp))
+ goto err_garbage;
+
+ *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
+
+ /* Encode reply */
+ if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto dropit;
+ }
+ if (*statp == rpc_success &&
+ (xdr = procp->pc_encode) &&
+ !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
+ dprintk("svc: failed to encode reply\n");
+ /* serv->sv_stats->rpcsystemerr++; */
+ *statp = rpc_system_err;
+ }
+ } else {
+ dprintk("svc: calling dispatcher\n");
+ if (!versp->vs_dispatch(rqstp, statp)) {
+ /* Release reply info */
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto dropit;
+ }
+ }
+
+ /* Check RPC status result */
+ if (*statp != rpc_success)
+ resv->iov_len = ((void*)statp) - resv->iov_base + 4;
+
+ /* Release reply info */
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+
+ if (procp->pc_encode == NULL)
+ goto dropit;
+
+ sendit:
+ if (svc_authorise(rqstp))
+ goto dropit;
+ return 1; /* Caller can now send it */
+
+ dropit:
+ svc_authorise(rqstp); /* doesn't hurt to call this twice */
+ dprintk("svc: svc_process dropit\n");
+ return 0;
+
+err_short_len:
+ svc_printk(rqstp, "short len %Zd, dropping request\n",
+ argv->iov_len);
+
+ goto dropit; /* drop request */
+
+err_bad_rpc:
+ serv->sv_stats->rpcbadfmt++;
+ svc_putnl(resv, 1); /* REJECT */
+ svc_putnl(resv, 0); /* RPC_MISMATCH */
+ svc_putnl(resv, 2); /* Only RPCv2 supported */
+ svc_putnl(resv, 2);
+ goto sendit;
+
+err_bad_auth:
+ dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
+ serv->sv_stats->rpcbadauth++;
+ /* Restore write pointer to location of accept status: */
+ xdr_ressize_check(rqstp, reply_statp);
+ svc_putnl(resv, 1); /* REJECT */
+ svc_putnl(resv, 1); /* AUTH_ERROR */
+ svc_putnl(resv, ntohl(auth_stat)); /* status */
+ goto sendit;
+
+err_bad_prog:
+ dprintk("svc: unknown program %d\n", prog);
+ serv->sv_stats->rpcbadfmt++;
+ svc_putnl(resv, RPC_PROG_UNAVAIL);
+ goto sendit;
+
+err_bad_vers:
+ svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
+ vers, prog, progp->pg_name);
+
+ serv->sv_stats->rpcbadfmt++;
+ svc_putnl(resv, RPC_PROG_MISMATCH);
+ svc_putnl(resv, progp->pg_lovers);
+ svc_putnl(resv, progp->pg_hivers);
+ goto sendit;
+
+err_bad_proc:
+ svc_printk(rqstp, "unknown procedure (%d)\n", proc);
+
+ serv->sv_stats->rpcbadfmt++;
+ svc_putnl(resv, RPC_PROC_UNAVAIL);
+ goto sendit;
+
+err_garbage:
+ svc_printk(rqstp, "failed to decode args\n");
+
+ rpc_stat = rpc_garbage_args;
+err_bad:
+ serv->sv_stats->rpcbadfmt++;
+ svc_putnl(resv, ntohl(rpc_stat));
+ goto sendit;
+}
+EXPORT_SYMBOL_GPL(svc_process);
+
+/*
+ * Process the RPC request.
+ */
+int
+svc_process(struct svc_rqst *rqstp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct svc_serv *serv = rqstp->rq_server;
+ u32 dir;
+
+ /*
+ * Setup response xdr_buf.
+ * Initially it has just one page
+ */
+ rqstp->rq_next_page = &rqstp->rq_respages[1];
+ resv->iov_base = page_address(rqstp->rq_respages[0]);
+ resv->iov_len = 0;
+ rqstp->rq_res.pages = rqstp->rq_respages + 1;
+ rqstp->rq_res.len = 0;
+ rqstp->rq_res.page_base = 0;
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.buflen = PAGE_SIZE;
+ rqstp->rq_res.tail[0].iov_base = NULL;
+ rqstp->rq_res.tail[0].iov_len = 0;
+
+ dir = svc_getnl(argv);
+ if (dir != 0) {
+ /* direction != CALL */
+ svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
+ serv->sv_stats->rpcbadfmt++;
+ goto out_drop;
+ }
+
+ /* Returns 1 for send, 0 for drop */
+ if (likely(svc_process_common(rqstp, argv, resv))) {
+ int ret = svc_send(rqstp);
+
+ trace_svc_process(rqstp, ret);
+ return ret;
+ }
+out_drop:
+ trace_svc_process(rqstp, 0);
+ svc_drop(rqstp);
+ return 0;
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/*
+ * Process a backchannel RPC request that arrived over an existing
+ * outbound connection
+ */
+int
+bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
+ struct svc_rqst *rqstp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+
+ /* Build the svc_rqst used by the common processing routine */
+ rqstp->rq_xprt = serv->sv_bc_xprt;
+ rqstp->rq_xid = req->rq_xid;
+ rqstp->rq_prot = req->rq_xprt->prot;
+ rqstp->rq_server = serv;
+
+ rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
+ memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+ memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
+ memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+
+ /* reset result send buffer "put" position */
+ resv->iov_len = 0;
+
+ if (rqstp->rq_prot != IPPROTO_TCP) {
+ printk(KERN_ERR "No support for Non-TCP transports!\n");
+ BUG();
+ }
+
+ /*
+ * Skip the next two words because they've already been
+ * processed in the trasport
+ */
+ svc_getu32(argv); /* XID */
+ svc_getnl(argv); /* CALLDIR */
+
+ /* Returns 1 for send, 0 for drop */
+ if (svc_process_common(rqstp, argv, resv)) {
+ memcpy(&req->rq_snd_buf, &rqstp->rq_res,
+ sizeof(req->rq_snd_buf));
+ return bc_send(req);
+ } else {
+ /* drop request */
+ xprt_free_bc_request(req);
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(bc_svc_process);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+/*
+ * Return (transport-specific) limit on the rpc payload.
+ */
+u32 svc_max_payload(const struct svc_rqst *rqstp)
+{
+ u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
+
+ if (rqstp->rq_server->sv_max_payload < max)
+ max = rqstp->rq_server->sv_max_payload;
+ return max;
+}
+EXPORT_SYMBOL_GPL(svc_max_payload);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 000000000..163ac45c3
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1371 @@
+/*
+ * linux/net/sunrpc/svc_xprt.c
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/module.h>
+#include <trace/events/sunrpc.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+static void svc_age_temp_xprts(unsigned long closure);
+static void svc_delete_xprt(struct svc_xprt *xprt);
+static void svc_xprt_do_enqueue(struct svc_xprt *xprt);
+
+/* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ * http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+static int svc_conn_age_period = 6*60;
+
+/* List of registered transport classes */
+static DEFINE_SPINLOCK(svc_xprt_class_lock);
+static LIST_HEAD(svc_xprt_class_list);
+
+/* SMP locking strategy:
+ *
+ * svc_pool->sp_lock protects most of the fields of that pool.
+ * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
+ * when both need to be taken (rare), svc_serv->sv_lock is first.
+ * The "service mutex" protects svc_serv->sv_nrthread.
+ * svc_sock->sk_lock protects the svc_sock->sk_deferred list
+ * and the ->sk_info_authunix cache.
+ *
+ * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
+ * enqueued multiply. During normal transport processing this bit
+ * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
+ * Providers should not manipulate this bit directly.
+ *
+ * Some flags can be set to certain values at any time
+ * providing that certain rules are followed:
+ *
+ * XPT_CONN, XPT_DATA:
+ * - Can be set or cleared at any time.
+ * - After a set, svc_xprt_enqueue must be called to enqueue
+ * the transport for processing.
+ * - After a clear, the transport must be read/accepted.
+ * If this succeeds, it must be set again.
+ * XPT_CLOSE:
+ * - Can set at any time. It is never cleared.
+ * XPT_DEAD:
+ * - Can only be set while XPT_BUSY is held which ensures
+ * that no other thread will be using the transport or will
+ * try to set XPT_DEAD.
+ */
+int svc_reg_xprt_class(struct svc_xprt_class *xcl)
+{
+ struct svc_xprt_class *cl;
+ int res = -EEXIST;
+
+ dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
+
+ INIT_LIST_HEAD(&xcl->xcl_list);
+ spin_lock(&svc_xprt_class_lock);
+ /* Make sure there isn't already a class with the same name */
+ list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) {
+ if (strcmp(xcl->xcl_name, cl->xcl_name) == 0)
+ goto out;
+ }
+ list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
+ res = 0;
+out:
+ spin_unlock(&svc_xprt_class_lock);
+ return res;
+}
+EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
+
+void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
+{
+ dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
+ spin_lock(&svc_xprt_class_lock);
+ list_del_init(&xcl->xcl_list);
+ spin_unlock(&svc_xprt_class_lock);
+}
+EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
+
+/*
+ * Format the transport list for printing
+ */
+int svc_print_xprts(char *buf, int maxlen)
+{
+ struct svc_xprt_class *xcl;
+ char tmpstr[80];
+ int len = 0;
+ buf[0] = '\0';
+
+ spin_lock(&svc_xprt_class_lock);
+ list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+ int slen;
+
+ sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
+ slen = strlen(tmpstr);
+ if (len + slen > maxlen)
+ break;
+ len += slen;
+ strcat(buf, tmpstr);
+ }
+ spin_unlock(&svc_xprt_class_lock);
+
+ return len;
+}
+
+static void svc_xprt_free(struct kref *kref)
+{
+ struct svc_xprt *xprt =
+ container_of(kref, struct svc_xprt, xpt_ref);
+ struct module *owner = xprt->xpt_class->xcl_owner;
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
+ svcauth_unix_info_release(xprt);
+ put_net(xprt->xpt_net);
+ /* See comment on corresponding get in xs_setup_bc_tcp(): */
+ if (xprt->xpt_bc_xprt)
+ xprt_put(xprt->xpt_bc_xprt);
+ xprt->xpt_ops->xpo_free(xprt);
+ module_put(owner);
+}
+
+void svc_xprt_put(struct svc_xprt *xprt)
+{
+ kref_put(&xprt->xpt_ref, svc_xprt_free);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_put);
+
+/*
+ * Called by transport drivers to initialize the transport independent
+ * portion of the transport instance.
+ */
+void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
+ struct svc_xprt *xprt, struct svc_serv *serv)
+{
+ memset(xprt, 0, sizeof(*xprt));
+ xprt->xpt_class = xcl;
+ xprt->xpt_ops = xcl->xcl_ops;
+ kref_init(&xprt->xpt_ref);
+ xprt->xpt_server = serv;
+ INIT_LIST_HEAD(&xprt->xpt_list);
+ INIT_LIST_HEAD(&xprt->xpt_ready);
+ INIT_LIST_HEAD(&xprt->xpt_deferred);
+ INIT_LIST_HEAD(&xprt->xpt_users);
+ mutex_init(&xprt->xpt_mutex);
+ spin_lock_init(&xprt->xpt_lock);
+ set_bit(XPT_BUSY, &xprt->xpt_flags);
+ rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
+ xprt->xpt_net = get_net(net);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_init);
+
+static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+ struct svc_serv *serv,
+ struct net *net,
+ const int family,
+ const unsigned short port,
+ int flags)
+{
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ .sin_port = htons(port),
+ };
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ .sin6_port = htons(port),
+ };
+#endif
+ struct sockaddr *sap;
+ size_t len;
+
+ switch (family) {
+ case PF_INET:
+ sap = (struct sockaddr *)&sin;
+ len = sizeof(sin);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6:
+ sap = (struct sockaddr *)&sin6;
+ len = sizeof(sin6);
+ break;
+#endif
+ default:
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+
+ return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+}
+
+/*
+ * svc_xprt_received conditionally queues the transport for processing
+ * by another thread. The caller must hold the XPT_BUSY bit and must
+ * not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ */
+static void svc_xprt_received(struct svc_xprt *xprt)
+{
+ if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) {
+ WARN_ONCE(1, "xprt=0x%p already busy!", xprt);
+ return;
+ }
+
+ /* As soon as we clear busy, the xprt could be closed and
+ * 'put', so we need a reference to call svc_xprt_do_enqueue with:
+ */
+ svc_xprt_get(xprt);
+ smp_mb__before_atomic();
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+ svc_xprt_do_enqueue(xprt);
+ svc_xprt_put(xprt);
+}
+
+void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
+{
+ clear_bit(XPT_TEMP, &new->xpt_flags);
+ spin_lock_bh(&serv->sv_lock);
+ list_add(&new->xpt_list, &serv->sv_permsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ svc_xprt_received(new);
+}
+
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags)
+{
+ struct svc_xprt_class *xcl;
+
+ dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+ spin_lock(&svc_xprt_class_lock);
+ list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+ struct svc_xprt *newxprt;
+ unsigned short newport;
+
+ if (strcmp(xprt_name, xcl->xcl_name))
+ continue;
+
+ if (!try_module_get(xcl->xcl_owner))
+ goto err;
+
+ spin_unlock(&svc_xprt_class_lock);
+ newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags);
+ if (IS_ERR(newxprt)) {
+ module_put(xcl->xcl_owner);
+ return PTR_ERR(newxprt);
+ }
+ svc_add_new_perm_xprt(serv, newxprt);
+ newport = svc_xprt_local_port(newxprt);
+ return newport;
+ }
+ err:
+ spin_unlock(&svc_xprt_class_lock);
+ dprintk("svc: transport %s not found\n", xprt_name);
+
+ /* This errno is exposed to user space. Provide a reasonable
+ * perror msg for a bad transport. */
+ return -EPROTONOSUPPORT;
+}
+EXPORT_SYMBOL_GPL(svc_create_xprt);
+
+/*
+ * Copy the local and remote xprt addresses to the rqstp structure
+ */
+void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+ memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
+ rqstp->rq_addrlen = xprt->xpt_remotelen;
+
+ /*
+ * Destination address in request is needed for binding the
+ * source address in RPC replies/callbacks later.
+ */
+ memcpy(&rqstp->rq_daddr, &xprt->xpt_local, xprt->xpt_locallen);
+ rqstp->rq_daddrlen = xprt->xpt_locallen;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
+
+/**
+ * svc_print_addr - Format rq_addr field for printing
+ * @rqstp: svc_rqst struct containing address to print
+ * @buf: target buffer for formatted address
+ * @len: length of target buffer
+ *
+ */
+char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
+{
+ return __svc_print_addr(svc_addr(rqstp), buf, len);
+}
+EXPORT_SYMBOL_GPL(svc_print_addr);
+
+static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
+{
+ if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
+ return true;
+ if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
+ return xprt->xpt_ops->xpo_has_wspace(xprt);
+ return false;
+}
+
+static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
+{
+ struct svc_pool *pool;
+ struct svc_rqst *rqstp = NULL;
+ int cpu;
+ bool queued = false;
+
+ if (!svc_xprt_has_something_to_do(xprt))
+ goto out;
+
+ /* Mark transport as busy. It will remain in this state until
+ * the provider calls svc_xprt_received. We update XPT_BUSY
+ * atomically because it also guards against trying to enqueue
+ * the transport twice.
+ */
+ if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
+ /* Don't enqueue transport while already enqueued */
+ dprintk("svc: transport %p busy, not enqueued\n", xprt);
+ goto out;
+ }
+
+ cpu = get_cpu();
+ pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+
+ atomic_long_inc(&pool->sp_stats.packets);
+
+redo_search:
+ /* find a thread for this xprt */
+ rcu_read_lock();
+ list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
+ /* Do a lockless check first */
+ if (test_bit(RQ_BUSY, &rqstp->rq_flags))
+ continue;
+
+ /*
+ * Once the xprt has been queued, it can only be dequeued by
+ * the task that intends to service it. All we can do at that
+ * point is to try to wake this thread back up so that it can
+ * do so.
+ */
+ if (!queued) {
+ spin_lock_bh(&rqstp->rq_lock);
+ if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) {
+ /* already busy, move on... */
+ spin_unlock_bh(&rqstp->rq_lock);
+ continue;
+ }
+
+ /* this one will do */
+ rqstp->rq_xprt = xprt;
+ svc_xprt_get(xprt);
+ spin_unlock_bh(&rqstp->rq_lock);
+ }
+ rcu_read_unlock();
+
+ atomic_long_inc(&pool->sp_stats.threads_woken);
+ wake_up_process(rqstp->rq_task);
+ put_cpu();
+ goto out;
+ }
+ rcu_read_unlock();
+
+ /*
+ * We didn't find an idle thread to use, so we need to queue the xprt.
+ * Do so and then search again. If we find one, we can't hook this one
+ * up to it directly but we can wake the thread up in the hopes that it
+ * will pick it up once it searches for a xprt to service.
+ */
+ if (!queued) {
+ queued = true;
+ dprintk("svc: transport %p put into queue\n", xprt);
+ spin_lock_bh(&pool->sp_lock);
+ list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+ pool->sp_stats.sockets_queued++;
+ spin_unlock_bh(&pool->sp_lock);
+ goto redo_search;
+ }
+ rqstp = NULL;
+ put_cpu();
+out:
+ trace_svc_xprt_do_enqueue(xprt, rqstp);
+}
+
+/*
+ * Queue up a transport with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+ if (test_bit(XPT_BUSY, &xprt->xpt_flags))
+ return;
+ svc_xprt_do_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+/*
+ * Dequeue the first transport, if there is one.
+ */
+static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
+{
+ struct svc_xprt *xprt = NULL;
+
+ if (list_empty(&pool->sp_sockets))
+ goto out;
+
+ spin_lock_bh(&pool->sp_lock);
+ if (likely(!list_empty(&pool->sp_sockets))) {
+ xprt = list_first_entry(&pool->sp_sockets,
+ struct svc_xprt, xpt_ready);
+ list_del_init(&xprt->xpt_ready);
+ svc_xprt_get(xprt);
+
+ dprintk("svc: transport %p dequeued, inuse=%d\n",
+ xprt, atomic_read(&xprt->xpt_ref.refcount));
+ }
+ spin_unlock_bh(&pool->sp_lock);
+out:
+ trace_svc_xprt_dequeue(xprt);
+ return xprt;
+}
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp: The request in question
+ * @space: new max space to reserve
+ *
+ * Each request reserves some space on the output queue of the transport
+ * to make sure the reply fits. This function reduces that reserved
+ * space to be the amount of space used already, plus @space.
+ *
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+ space += rqstp->rq_res.head[0].iov_len;
+
+ if (space < rqstp->rq_reserved) {
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
+ rqstp->rq_reserved = space;
+
+ if (xprt->xpt_ops->xpo_adjust_wspace)
+ xprt->xpt_ops->xpo_adjust_wspace(xprt);
+ svc_xprt_enqueue(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(svc_reserve);
+
+static void svc_xprt_release(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
+ rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+ kfree(rqstp->rq_deferred);
+ rqstp->rq_deferred = NULL;
+
+ svc_free_res_pages(rqstp);
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.page_base = 0;
+
+ /* Reset response buffer and release
+ * the reservation.
+ * But first, check that enough space was reserved
+ * for the reply, otherwise we have a bug!
+ */
+ if ((rqstp->rq_res.len) > rqstp->rq_reserved)
+ printk(KERN_ERR "RPC request reserved %d but used %d\n",
+ rqstp->rq_reserved,
+ rqstp->rq_res.len);
+
+ rqstp->rq_res.head[0].iov_len = 0;
+ svc_reserve(rqstp, 0);
+ rqstp->rq_xprt = NULL;
+
+ svc_xprt_put(xprt);
+}
+
+/*
+ * Some svc_serv's will have occasional work to do, even when a xprt is not
+ * waiting to be serviced. This function is there to "kick" a task in one of
+ * those services so that it can wake up and do that work. Note that we only
+ * bother with pool 0 as we don't need to wake up more than one thread for
+ * this purpose.
+ */
+void svc_wake_up(struct svc_serv *serv)
+{
+ struct svc_rqst *rqstp;
+ struct svc_pool *pool;
+
+ pool = &serv->sv_pools[0];
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) {
+ /* skip any that aren't queued */
+ if (test_bit(RQ_BUSY, &rqstp->rq_flags))
+ continue;
+ rcu_read_unlock();
+ dprintk("svc: daemon %p woken up.\n", rqstp);
+ wake_up_process(rqstp->rq_task);
+ trace_svc_wake_up(rqstp->rq_task->pid);
+ return;
+ }
+ rcu_read_unlock();
+
+ /* No free entries available */
+ set_bit(SP_TASK_PENDING, &pool->sp_flags);
+ smp_wmb();
+ trace_svc_wake_up(0);
+}
+EXPORT_SYMBOL_GPL(svc_wake_up);
+
+int svc_port_is_privileged(struct sockaddr *sin)
+{
+ switch (sin->sa_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)sin)->sin_port)
+ < PROT_SOCK;
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
+ < PROT_SOCK;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Make sure that we don't have too many active connections. If we have,
+ * something must be dropped. It's not clear what will happen if we allow
+ * "too many" connections, but when dealing with network-facing software,
+ * we have to code defensively. Here we do that by imposing hard limits.
+ *
+ * There's no point in trying to do random drop here for DoS
+ * prevention. The NFS clients does 1 reconnect in 15 seconds. An
+ * attacker can easily beat that.
+ *
+ * The only somewhat efficient mechanism would be if drop old
+ * connections from the same IP first. But right now we don't even
+ * record the client IP in svc_sock.
+ *
+ * single-threaded services that expect a lot of clients will probably
+ * need to set sv_maxconn to override the default value which is based
+ * on the number of threads
+ */
+static void svc_check_conn_limits(struct svc_serv *serv)
+{
+ unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn :
+ (serv->sv_nrthreads+3) * 20;
+
+ if (serv->sv_tmpcnt > limit) {
+ struct svc_xprt *xprt = NULL;
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ /* Try to help the admin */
+ net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n",
+ serv->sv_name, serv->sv_maxconn ?
+ "max number of connections" :
+ "number of threads");
+ /*
+ * Always select the oldest connection. It's not fair,
+ * but so is life
+ */
+ xprt = list_entry(serv->sv_tempsocks.prev,
+ struct svc_xprt,
+ xpt_list);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_get(xprt);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ if (xprt) {
+ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+ }
+ }
+}
+
+static int svc_alloc_arg(struct svc_rqst *rqstp)
+{
+ struct svc_serv *serv = rqstp->rq_server;
+ struct xdr_buf *arg;
+ int pages;
+ int i;
+
+ /* now allocate needed pages. If we get a failure, sleep briefly */
+ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+ WARN_ON_ONCE(pages >= RPCSVC_MAXPAGES);
+ if (pages >= RPCSVC_MAXPAGES)
+ /* use as many pages as possible */
+ pages = RPCSVC_MAXPAGES - 1;
+ for (i = 0; i < pages ; i++)
+ while (rqstp->rq_pages[i] == NULL) {
+ struct page *p = alloc_page(GFP_KERNEL);
+ if (!p) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signalled() || kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+ schedule_timeout(msecs_to_jiffies(500));
+ }
+ rqstp->rq_pages[i] = p;
+ }
+ rqstp->rq_page_end = &rqstp->rq_pages[i];
+ rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+
+ /* Make arg->head point to first page and arg->pages point to rest */
+ arg = &rqstp->rq_arg;
+ arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+ arg->head[0].iov_len = PAGE_SIZE;
+ arg->pages = rqstp->rq_pages + 1;
+ arg->page_base = 0;
+ /* save at least one page for response */
+ arg->page_len = (pages-2)*PAGE_SIZE;
+ arg->len = (pages-1)*PAGE_SIZE;
+ arg->tail[0].iov_len = 0;
+ return 0;
+}
+
+static bool
+rqst_should_sleep(struct svc_rqst *rqstp)
+{
+ struct svc_pool *pool = rqstp->rq_pool;
+
+ /* did someone call svc_wake_up? */
+ if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags))
+ return false;
+
+ /* was a socket queued? */
+ if (!list_empty(&pool->sp_sockets))
+ return false;
+
+ /* are we shutting down? */
+ if (signalled() || kthread_should_stop())
+ return false;
+
+ /* are we freezing? */
+ if (freezing(current))
+ return false;
+
+ return true;
+}
+
+static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
+{
+ struct svc_xprt *xprt;
+ struct svc_pool *pool = rqstp->rq_pool;
+ long time_left = 0;
+
+ /* rq_xprt should be clear on entry */
+ WARN_ON_ONCE(rqstp->rq_xprt);
+
+ /* Normally we will wait up to 5 seconds for any required
+ * cache information to be provided.
+ */
+ rqstp->rq_chandle.thread_wait = 5*HZ;
+
+ xprt = svc_xprt_dequeue(pool);
+ if (xprt) {
+ rqstp->rq_xprt = xprt;
+
+ /* As there is a shortage of threads and this request
+ * had to be queued, don't allow the thread to wait so
+ * long for cache updates.
+ */
+ rqstp->rq_chandle.thread_wait = 1*HZ;
+ clear_bit(SP_TASK_PENDING, &pool->sp_flags);
+ return xprt;
+ }
+
+ /*
+ * We have to be able to interrupt this wait
+ * to bring down the daemons ...
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ clear_bit(RQ_BUSY, &rqstp->rq_flags);
+ smp_mb();
+
+ if (likely(rqst_should_sleep(rqstp)))
+ time_left = schedule_timeout(timeout);
+ else
+ __set_current_state(TASK_RUNNING);
+
+ try_to_freeze();
+
+ spin_lock_bh(&rqstp->rq_lock);
+ set_bit(RQ_BUSY, &rqstp->rq_flags);
+ spin_unlock_bh(&rqstp->rq_lock);
+
+ xprt = rqstp->rq_xprt;
+ if (xprt != NULL)
+ return xprt;
+
+ if (!time_left)
+ atomic_long_inc(&pool->sp_stats.threads_timedout);
+
+ if (signalled() || kthread_should_stop())
+ return ERR_PTR(-EINTR);
+ return ERR_PTR(-EAGAIN);
+}
+
+static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
+{
+ spin_lock_bh(&serv->sv_lock);
+ set_bit(XPT_TEMP, &newxpt->xpt_flags);
+ list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
+ serv->sv_tmpcnt++;
+ if (serv->sv_temptimer.function == NULL) {
+ /* setup timer to age temp transports */
+ setup_timer(&serv->sv_temptimer, svc_age_temp_xprts,
+ (unsigned long)serv);
+ mod_timer(&serv->sv_temptimer,
+ jiffies + svc_conn_age_period * HZ);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+ svc_xprt_received(newxpt);
+}
+
+static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+ struct svc_serv *serv = rqstp->rq_server;
+ int len = 0;
+
+ if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
+ dprintk("svc_recv: found XPT_CLOSE\n");
+ svc_delete_xprt(xprt);
+ /* Leave XPT_BUSY set on the dead xprt: */
+ goto out;
+ }
+ if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+ struct svc_xprt *newxpt;
+ /*
+ * We know this module_get will succeed because the
+ * listener holds a reference too
+ */
+ __module_get(xprt->xpt_class->xcl_owner);
+ svc_check_conn_limits(xprt->xpt_server);
+ newxpt = xprt->xpt_ops->xpo_accept(xprt);
+ if (newxpt)
+ svc_add_new_temp_xprt(serv, newxpt);
+ else
+ module_put(xprt->xpt_class->xcl_owner);
+ } else {
+ /* XPT_DATA|XPT_DEFERRED case: */
+ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
+ rqstp, rqstp->rq_pool->sp_id, xprt,
+ atomic_read(&xprt->xpt_ref.refcount));
+ rqstp->rq_deferred = svc_deferred_dequeue(xprt);
+ if (rqstp->rq_deferred)
+ len = svc_deferred_recv(rqstp);
+ else
+ len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+ dprintk("svc: got len=%d\n", len);
+ rqstp->rq_reserved = serv->sv_max_mesg;
+ atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+ }
+ /* clear XPT_BUSY: */
+ svc_xprt_received(xprt);
+out:
+ trace_svc_handle_xprt(xprt, len);
+ return len;
+}
+
+/*
+ * Receive the next request on any transport. This code is carefully
+ * organised not to touch any cachelines in the shared svc_serv
+ * structure, only cachelines in the local svc_pool.
+ */
+int svc_recv(struct svc_rqst *rqstp, long timeout)
+{
+ struct svc_xprt *xprt = NULL;
+ struct svc_serv *serv = rqstp->rq_server;
+ int len, err;
+
+ dprintk("svc: server %p waiting for data (to = %ld)\n",
+ rqstp, timeout);
+
+ if (rqstp->rq_xprt)
+ printk(KERN_ERR
+ "svc_recv: service %p, transport not NULL!\n",
+ rqstp);
+
+ err = svc_alloc_arg(rqstp);
+ if (err)
+ goto out;
+
+ try_to_freeze();
+ cond_resched();
+ err = -EINTR;
+ if (signalled() || kthread_should_stop())
+ goto out;
+
+ xprt = svc_get_next_xprt(rqstp, timeout);
+ if (IS_ERR(xprt)) {
+ err = PTR_ERR(xprt);
+ goto out;
+ }
+
+ len = svc_handle_xprt(rqstp, xprt);
+
+ /* No data, incomplete (TCP) read, or accept() */
+ err = -EAGAIN;
+ if (len <= 0)
+ goto out_release;
+
+ clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+ if (xprt->xpt_ops->xpo_secure_port(rqstp))
+ set_bit(RQ_SECURE, &rqstp->rq_flags);
+ else
+ clear_bit(RQ_SECURE, &rqstp->rq_flags);
+ rqstp->rq_chandle.defer = svc_defer;
+ rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]);
+
+ if (serv->sv_stats)
+ serv->sv_stats->netcnt++;
+ trace_svc_recv(rqstp, len);
+ return len;
+out_release:
+ rqstp->rq_res.len = 0;
+ svc_xprt_release(rqstp);
+out:
+ trace_svc_recv(rqstp, err);
+ return err;
+}
+EXPORT_SYMBOL_GPL(svc_recv);
+
+/*
+ * Drop request
+ */
+void svc_drop(struct svc_rqst *rqstp)
+{
+ dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
+ svc_xprt_release(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_drop);
+
+/*
+ * Return reply to client.
+ */
+int svc_send(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt;
+ int len = -EFAULT;
+ struct xdr_buf *xb;
+
+ xprt = rqstp->rq_xprt;
+ if (!xprt)
+ goto out;
+
+ /* release the receive skb before sending the reply */
+ rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+ /* calculate over-all length */
+ xb = &rqstp->rq_res;
+ xb->len = xb->head[0].iov_len +
+ xb->page_len +
+ xb->tail[0].iov_len;
+
+ /* Grab mutex to serialize outgoing data. */
+ mutex_lock(&xprt->xpt_mutex);
+ if (test_bit(XPT_DEAD, &xprt->xpt_flags)
+ || test_bit(XPT_CLOSE, &xprt->xpt_flags))
+ len = -ENOTCONN;
+ else
+ len = xprt->xpt_ops->xpo_sendto(rqstp);
+ mutex_unlock(&xprt->xpt_mutex);
+ rpc_wake_up(&xprt->xpt_bc_pending);
+ svc_xprt_release(rqstp);
+
+ if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
+ len = 0;
+out:
+ trace_svc_send(rqstp, len);
+ return len;
+}
+
+/*
+ * Timer function to close old temporary transports, using
+ * a mark-and-sweep algorithm.
+ */
+static void svc_age_temp_xprts(unsigned long closure)
+{
+ struct svc_serv *serv = (struct svc_serv *)closure;
+ struct svc_xprt *xprt;
+ struct list_head *le, *next;
+
+ dprintk("svc_age_temp_xprts\n");
+
+ if (!spin_trylock_bh(&serv->sv_lock)) {
+ /* busy, try again 1 sec later */
+ dprintk("svc_age_temp_xprts: busy\n");
+ mod_timer(&serv->sv_temptimer, jiffies + HZ);
+ return;
+ }
+
+ list_for_each_safe(le, next, &serv->sv_tempsocks) {
+ xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+ /* First time through, just mark it OLD. Second time
+ * through, close it. */
+ if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
+ continue;
+ if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+ test_bit(XPT_BUSY, &xprt->xpt_flags))
+ continue;
+ list_del_init(le);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ dprintk("queuing xprt %p for closing\n", xprt);
+
+ /* a thread will dequeue and close it soon */
+ svc_xprt_enqueue(xprt);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
+}
+
+static void call_xpt_users(struct svc_xprt *xprt)
+{
+ struct svc_xpt_user *u;
+
+ spin_lock(&xprt->xpt_lock);
+ while (!list_empty(&xprt->xpt_users)) {
+ u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list);
+ list_del(&u->list);
+ u->callback(u);
+ }
+ spin_unlock(&xprt->xpt_lock);
+}
+
+/*
+ * Remove a dead transport
+ */
+static void svc_delete_xprt(struct svc_xprt *xprt)
+{
+ struct svc_serv *serv = xprt->xpt_server;
+ struct svc_deferred_req *dr;
+
+ /* Only do this once */
+ if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
+ BUG();
+
+ dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+ xprt->xpt_ops->xpo_detach(xprt);
+
+ spin_lock_bh(&serv->sv_lock);
+ list_del_init(&xprt->xpt_list);
+ WARN_ON_ONCE(!list_empty(&xprt->xpt_ready));
+ if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+ serv->sv_tmpcnt--;
+ spin_unlock_bh(&serv->sv_lock);
+
+ while ((dr = svc_deferred_dequeue(xprt)) != NULL)
+ kfree(dr);
+
+ call_xpt_users(xprt);
+ svc_xprt_put(xprt);
+}
+
+void svc_close_xprt(struct svc_xprt *xprt)
+{
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+ /* someone else will have to effect the close */
+ return;
+ /*
+ * We expect svc_close_xprt() to work even when no threads are
+ * running (e.g., while configuring the server before starting
+ * any threads), so if the transport isn't busy, we delete
+ * it ourself:
+ */
+ svc_delete_xprt(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_close_xprt);
+
+static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
+{
+ struct svc_xprt *xprt;
+ int ret = 0;
+
+ spin_lock(&serv->sv_lock);
+ list_for_each_entry(xprt, xprt_list, xpt_list) {
+ if (xprt->xpt_net != net)
+ continue;
+ ret++;
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ }
+ spin_unlock(&serv->sv_lock);
+ return ret;
+}
+
+static struct svc_xprt *svc_dequeue_net(struct svc_serv *serv, struct net *net)
+{
+ struct svc_pool *pool;
+ struct svc_xprt *xprt;
+ struct svc_xprt *tmp;
+ int i;
+
+ for (i = 0; i < serv->sv_nrpools; i++) {
+ pool = &serv->sv_pools[i];
+
+ spin_lock_bh(&pool->sp_lock);
+ list_for_each_entry_safe(xprt, tmp, &pool->sp_sockets, xpt_ready) {
+ if (xprt->xpt_net != net)
+ continue;
+ list_del_init(&xprt->xpt_ready);
+ spin_unlock_bh(&pool->sp_lock);
+ return xprt;
+ }
+ spin_unlock_bh(&pool->sp_lock);
+ }
+ return NULL;
+}
+
+static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
+{
+ struct svc_xprt *xprt;
+
+ while ((xprt = svc_dequeue_net(serv, net))) {
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_delete_xprt(xprt);
+ }
+}
+
+/*
+ * Server threads may still be running (especially in the case where the
+ * service is still running in other network namespaces).
+ *
+ * So we shut down sockets the same way we would on a running server, by
+ * setting XPT_CLOSE, enqueuing, and letting a thread pick it up to do
+ * the close. In the case there are no such other threads,
+ * threads running, svc_clean_up_xprts() does a simple version of a
+ * server's main event loop, and in the case where there are other
+ * threads, we may need to wait a little while and then check again to
+ * see if they're done.
+ */
+void svc_close_net(struct svc_serv *serv, struct net *net)
+{
+ int delay = 0;
+
+ while (svc_close_list(serv, &serv->sv_permsocks, net) +
+ svc_close_list(serv, &serv->sv_tempsocks, net)) {
+
+ svc_clean_up_xprts(serv, net);
+ msleep(delay++);
+ }
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+ struct svc_deferred_req *dr =
+ container_of(dreq, struct svc_deferred_req, handle);
+ struct svc_xprt *xprt = dr->xprt;
+
+ spin_lock(&xprt->xpt_lock);
+ set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+ if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) {
+ spin_unlock(&xprt->xpt_lock);
+ dprintk("revisit canceled\n");
+ svc_xprt_put(xprt);
+ kfree(dr);
+ return;
+ }
+ dprintk("revisit queued\n");
+ dr->xprt = NULL;
+ list_add(&dr->handle.recent, &xprt->xpt_deferred);
+ spin_unlock(&xprt->xpt_lock);
+ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+}
+
+/*
+ * Save the request off for later processing. The request buffer looks
+ * like this:
+ *
+ * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
+ *
+ * This code can only handle requests that consist of an xprt-header
+ * and rpc-header.
+ */
+static struct cache_deferred_req *svc_defer(struct cache_req *req)
+{
+ struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+ struct svc_deferred_req *dr;
+
+ if (rqstp->rq_arg.page_len || !test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags))
+ return NULL; /* if more than a page, give up FIXME */
+ if (rqstp->rq_deferred) {
+ dr = rqstp->rq_deferred;
+ rqstp->rq_deferred = NULL;
+ } else {
+ size_t skip;
+ size_t size;
+ /* FIXME maybe discard if size too large */
+ size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
+ dr = kmalloc(size, GFP_KERNEL);
+ if (dr == NULL)
+ return NULL;
+
+ dr->handle.owner = rqstp->rq_server;
+ dr->prot = rqstp->rq_prot;
+ memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
+ dr->addrlen = rqstp->rq_addrlen;
+ dr->daddr = rqstp->rq_daddr;
+ dr->argslen = rqstp->rq_arg.len >> 2;
+ dr->xprt_hlen = rqstp->rq_xprt_hlen;
+
+ /* back up head to the start of the buffer and copy */
+ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+ memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
+ dr->argslen << 2);
+ }
+ svc_xprt_get(rqstp->rq_xprt);
+ dr->xprt = rqstp->rq_xprt;
+ set_bit(RQ_DROPME, &rqstp->rq_flags);
+
+ dr->handle.revisit = svc_revisit;
+ return &dr->handle;
+}
+
+/*
+ * recv data from a deferred request into an active one
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ /* setup iov_base past transport header */
+ rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
+ /* The iov_len does not include the transport header bytes */
+ rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
+ rqstp->rq_arg.page_len = 0;
+ /* The rq_arg.len includes the transport header bytes */
+ rqstp->rq_arg.len = dr->argslen<<2;
+ rqstp->rq_prot = dr->prot;
+ memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
+ rqstp->rq_addrlen = dr->addrlen;
+ /* Save off transport header len in case we get deferred again */
+ rqstp->rq_xprt_hlen = dr->xprt_hlen;
+ rqstp->rq_daddr = dr->daddr;
+ rqstp->rq_respages = rqstp->rq_pages;
+ return (dr->argslen<<2) - dr->xprt_hlen;
+}
+
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
+{
+ struct svc_deferred_req *dr = NULL;
+
+ if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
+ return NULL;
+ spin_lock(&xprt->xpt_lock);
+ if (!list_empty(&xprt->xpt_deferred)) {
+ dr = list_entry(xprt->xpt_deferred.next,
+ struct svc_deferred_req,
+ handle.recent);
+ list_del_init(&dr->handle.recent);
+ } else
+ clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
+ spin_unlock(&xprt->xpt_lock);
+ return dr;
+}
+
+/**
+ * svc_find_xprt - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @net: owner net pointer
+ * @af: Address family of transport's local address
+ * @port: transport's IP port number
+ *
+ * Return the transport instance pointer for the endpoint accepting
+ * connections/peer traffic from the specified transport class,
+ * address family and port.
+ *
+ * Specifying 0 for the address family or port is effectively a
+ * wild-card, and will result in matching the first transport in the
+ * service's list that has a matching class name.
+ */
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+ struct net *net, const sa_family_t af,
+ const unsigned short port)
+{
+ struct svc_xprt *xprt;
+ struct svc_xprt *found = NULL;
+
+ /* Sanity check the args */
+ if (serv == NULL || xcl_name == NULL)
+ return found;
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+ if (xprt->xpt_net != net)
+ continue;
+ if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
+ continue;
+ if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
+ continue;
+ if (port != 0 && port != svc_xprt_local_port(xprt))
+ continue;
+ found = xprt;
+ svc_xprt_get(xprt);
+ break;
+ }
+ spin_unlock_bh(&serv->sv_lock);
+ return found;
+}
+EXPORT_SYMBOL_GPL(svc_find_xprt);
+
+static int svc_one_xprt_name(const struct svc_xprt *xprt,
+ char *pos, int remaining)
+{
+ int len;
+
+ len = snprintf(pos, remaining, "%s %u\n",
+ xprt->xpt_class->xcl_name,
+ svc_xprt_local_port(xprt));
+ if (len >= remaining)
+ return -ENAMETOOLONG;
+ return len;
+}
+
+/**
+ * svc_xprt_names - format a buffer with a list of transport names
+ * @serv: pointer to an RPC service
+ * @buf: pointer to a buffer to be filled in
+ * @buflen: length of buffer to be filled in
+ *
+ * Fills in @buf with a string containing a list of transport names,
+ * each name terminated with '\n'.
+ *
+ * Returns positive length of the filled-in string on success; otherwise
+ * a negative errno value is returned if an error occurs.
+ */
+int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen)
+{
+ struct svc_xprt *xprt;
+ int len, totlen;
+ char *pos;
+
+ /* Sanity check args */
+ if (!serv)
+ return 0;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ pos = buf;
+ totlen = 0;
+ list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+ len = svc_one_xprt_name(xprt, pos, buflen - totlen);
+ if (len < 0) {
+ *buf = '\0';
+ totlen = len;
+ }
+ if (len <= 0)
+ break;
+
+ pos += len;
+ totlen += len;
+ }
+
+ spin_unlock_bh(&serv->sv_lock);
+ return totlen;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_names);
+
+
+/*----------------------------------------------------------------------------*/
+
+static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos)
+{
+ unsigned int pidx = (unsigned int)*pos;
+ struct svc_serv *serv = m->private;
+
+ dprintk("svc_pool_stats_start, *pidx=%u\n", pidx);
+
+ if (!pidx)
+ return SEQ_START_TOKEN;
+ return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]);
+}
+
+static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct svc_pool *pool = p;
+ struct svc_serv *serv = m->private;
+
+ dprintk("svc_pool_stats_next, *pos=%llu\n", *pos);
+
+ if (p == SEQ_START_TOKEN) {
+ pool = &serv->sv_pools[0];
+ } else {
+ unsigned int pidx = (pool - &serv->sv_pools[0]);
+ if (pidx < serv->sv_nrpools-1)
+ pool = &serv->sv_pools[pidx+1];
+ else
+ pool = NULL;
+ }
+ ++*pos;
+ return pool;
+}
+
+static void svc_pool_stats_stop(struct seq_file *m, void *p)
+{
+}
+
+static int svc_pool_stats_show(struct seq_file *m, void *p)
+{
+ struct svc_pool *pool = p;
+
+ if (p == SEQ_START_TOKEN) {
+ seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken threads-timedout\n");
+ return 0;
+ }
+
+ seq_printf(m, "%u %lu %lu %lu %lu\n",
+ pool->sp_id,
+ (unsigned long)atomic_long_read(&pool->sp_stats.packets),
+ pool->sp_stats.sockets_queued,
+ (unsigned long)atomic_long_read(&pool->sp_stats.threads_woken),
+ (unsigned long)atomic_long_read(&pool->sp_stats.threads_timedout));
+
+ return 0;
+}
+
+static const struct seq_operations svc_pool_stats_seq_ops = {
+ .start = svc_pool_stats_start,
+ .next = svc_pool_stats_next,
+ .stop = svc_pool_stats_stop,
+ .show = svc_pool_stats_show,
+};
+
+int svc_pool_stats_open(struct svc_serv *serv, struct file *file)
+{
+ int err;
+
+ err = seq_open(file, &svc_pool_stats_seq_ops);
+ if (!err)
+ ((struct seq_file *) file->private_data)->private = serv;
+ return err;
+}
+EXPORT_SYMBOL(svc_pool_stats_open);
+
+/*----------------------------------------------------------------------------*/
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
new file mode 100644
index 000000000..79c0f3459
--- /dev/null
+++ b/net/sunrpc/svcauth.c
@@ -0,0 +1,166 @@
+/*
+ * linux/net/sunrpc/svcauth.c
+ *
+ * The generic interface for RPC authentication on the server side.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * CHANGES
+ * 19-Apr-2000 Chris Evans - Security fix
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/err.h>
+#include <linux/hash.h>
+
+#define RPCDBG_FACILITY RPCDBG_AUTH
+
+
+/*
+ * Table of authenticators
+ */
+extern struct auth_ops svcauth_null;
+extern struct auth_ops svcauth_unix;
+
+static DEFINE_SPINLOCK(authtab_lock);
+static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = {
+ [0] = &svcauth_null,
+ [1] = &svcauth_unix,
+};
+
+int
+svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
+{
+ rpc_authflavor_t flavor;
+ struct auth_ops *aops;
+
+ *authp = rpc_auth_ok;
+
+ flavor = svc_getnl(&rqstp->rq_arg.head[0]);
+
+ dprintk("svc: svc_authenticate (%d)\n", flavor);
+
+ spin_lock(&authtab_lock);
+ if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) ||
+ !try_module_get(aops->owner)) {
+ spin_unlock(&authtab_lock);
+ *authp = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+ spin_unlock(&authtab_lock);
+
+ rqstp->rq_auth_slack = 0;
+
+ rqstp->rq_authop = aops;
+ return aops->accept(rqstp, authp);
+}
+EXPORT_SYMBOL_GPL(svc_authenticate);
+
+int svc_set_client(struct svc_rqst *rqstp)
+{
+ return rqstp->rq_authop->set_client(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_set_client);
+
+/* A request, which was authenticated, has now executed.
+ * Time to finalise the credentials and verifier
+ * and release and resources
+ */
+int svc_authorise(struct svc_rqst *rqstp)
+{
+ struct auth_ops *aops = rqstp->rq_authop;
+ int rv = 0;
+
+ rqstp->rq_authop = NULL;
+
+ if (aops) {
+ rv = aops->release(rqstp);
+ module_put(aops->owner);
+ }
+ return rv;
+}
+
+int
+svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
+{
+ int rv = -EINVAL;
+ spin_lock(&authtab_lock);
+ if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) {
+ authtab[flavor] = aops;
+ rv = 0;
+ }
+ spin_unlock(&authtab_lock);
+ return rv;
+}
+EXPORT_SYMBOL_GPL(svc_auth_register);
+
+void
+svc_auth_unregister(rpc_authflavor_t flavor)
+{
+ spin_lock(&authtab_lock);
+ if (flavor < RPC_AUTH_MAXFLAVOR)
+ authtab[flavor] = NULL;
+ spin_unlock(&authtab_lock);
+}
+EXPORT_SYMBOL_GPL(svc_auth_unregister);
+
+/**************************************************
+ * 'auth_domains' are stored in a hash table indexed by name.
+ * When the last reference to an 'auth_domain' is dropped,
+ * the object is unhashed and freed.
+ * If auth_domain_lookup fails to find an entry, it will return
+ * it's second argument 'new'. If this is non-null, it will
+ * have been atomically linked into the table.
+ */
+
+#define DN_HASHBITS 6
+#define DN_HASHMAX (1<<DN_HASHBITS)
+
+static struct hlist_head auth_domain_table[DN_HASHMAX];
+static spinlock_t auth_domain_lock =
+ __SPIN_LOCK_UNLOCKED(auth_domain_lock);
+
+void auth_domain_put(struct auth_domain *dom)
+{
+ if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
+ hlist_del(&dom->hash);
+ dom->flavour->domain_release(dom);
+ spin_unlock(&auth_domain_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(auth_domain_put);
+
+struct auth_domain *
+auth_domain_lookup(char *name, struct auth_domain *new)
+{
+ struct auth_domain *hp;
+ struct hlist_head *head;
+
+ head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+ spin_lock(&auth_domain_lock);
+
+ hlist_for_each_entry(hp, head, hash) {
+ if (strcmp(hp->name, name)==0) {
+ kref_get(&hp->ref);
+ spin_unlock(&auth_domain_lock);
+ return hp;
+ }
+ }
+ if (new)
+ hlist_add_head(&new->hash, head);
+ spin_unlock(&auth_domain_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(auth_domain_lookup);
+
+struct auth_domain *auth_domain_find(char *name)
+{
+ return auth_domain_lookup(name, NULL);
+}
+EXPORT_SYMBOL_GPL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
new file mode 100644
index 000000000..621ca7b4a
--- /dev/null
+++ b/net/sunrpc/svcauth_unix.c
@@ -0,0 +1,914 @@
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <linux/kernel.h>
+#include <linux/user_namespace.h>
+#define RPCDBG_FACILITY RPCDBG_AUTH
+
+
+#include "netns.h"
+
+/*
+ * AUTHUNIX and AUTHNULL credentials are both handled here.
+ * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
+ * are always nobody (-2). i.e. we do the same IP address checks for
+ * AUTHNULL as for AUTHUNIX, and that is done here.
+ */
+
+
+struct unix_domain {
+ struct auth_domain h;
+ /* other stuff later */
+};
+
+extern struct auth_ops svcauth_null;
+extern struct auth_ops svcauth_unix;
+
+static void svcauth_unix_domain_release(struct auth_domain *dom)
+{
+ struct unix_domain *ud = container_of(dom, struct unix_domain, h);
+
+ kfree(dom->name);
+ kfree(ud);
+}
+
+struct auth_domain *unix_domain_find(char *name)
+{
+ struct auth_domain *rv;
+ struct unix_domain *new = NULL;
+
+ rv = auth_domain_lookup(name, NULL);
+ while(1) {
+ if (rv) {
+ if (new && rv != &new->h)
+ svcauth_unix_domain_release(&new->h);
+
+ if (rv->flavour != &svcauth_unix) {
+ auth_domain_put(rv);
+ return NULL;
+ }
+ return rv;
+ }
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (new == NULL)
+ return NULL;
+ kref_init(&new->h.ref);
+ new->h.name = kstrdup(name, GFP_KERNEL);
+ if (new->h.name == NULL) {
+ kfree(new);
+ return NULL;
+ }
+ new->h.flavour = &svcauth_unix;
+ rv = auth_domain_lookup(name, &new->h);
+ }
+}
+EXPORT_SYMBOL_GPL(unix_domain_find);
+
+
+/**************************************************
+ * cache for IP address to unix_domain
+ * as needed by AUTH_UNIX
+ */
+#define IP_HASHBITS 8
+#define IP_HASHMAX (1<<IP_HASHBITS)
+
+struct ip_map {
+ struct cache_head h;
+ char m_class[8]; /* e.g. "nfsd" */
+ struct in6_addr m_addr;
+ struct unix_domain *m_client;
+};
+
+static void ip_map_put(struct kref *kref)
+{
+ struct cache_head *item = container_of(kref, struct cache_head, ref);
+ struct ip_map *im = container_of(item, struct ip_map,h);
+
+ if (test_bit(CACHE_VALID, &item->flags) &&
+ !test_bit(CACHE_NEGATIVE, &item->flags))
+ auth_domain_put(&im->m_client->h);
+ kfree(im);
+}
+
+static inline int hash_ip6(const struct in6_addr *ip)
+{
+ return hash_32(ipv6_addr_hash(ip), IP_HASHBITS);
+}
+static int ip_map_match(struct cache_head *corig, struct cache_head *cnew)
+{
+ struct ip_map *orig = container_of(corig, struct ip_map, h);
+ struct ip_map *new = container_of(cnew, struct ip_map, h);
+ return strcmp(orig->m_class, new->m_class) == 0 &&
+ ipv6_addr_equal(&orig->m_addr, &new->m_addr);
+}
+static void ip_map_init(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct ip_map *new = container_of(cnew, struct ip_map, h);
+ struct ip_map *item = container_of(citem, struct ip_map, h);
+
+ strcpy(new->m_class, item->m_class);
+ new->m_addr = item->m_addr;
+}
+static void update(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct ip_map *new = container_of(cnew, struct ip_map, h);
+ struct ip_map *item = container_of(citem, struct ip_map, h);
+
+ kref_get(&item->m_client->h.ref);
+ new->m_client = item->m_client;
+}
+static struct cache_head *ip_map_alloc(void)
+{
+ struct ip_map *i = kmalloc(sizeof(*i), GFP_KERNEL);
+ if (i)
+ return &i->h;
+ else
+ return NULL;
+}
+
+static void ip_map_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ char text_addr[40];
+ struct ip_map *im = container_of(h, struct ip_map, h);
+
+ if (ipv6_addr_v4mapped(&(im->m_addr))) {
+ snprintf(text_addr, 20, "%pI4", &im->m_addr.s6_addr32[3]);
+ } else {
+ snprintf(text_addr, 40, "%pI6", &im->m_addr);
+ }
+ qword_add(bpp, blen, im->m_class);
+ qword_add(bpp, blen, text_addr);
+ (*bpp)[-1] = '\n';
+}
+
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
+
+static int ip_map_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* class ipaddress [domainname] */
+ /* should be safe just to use the start of the input buffer
+ * for scratch: */
+ char *buf = mesg;
+ int len;
+ char class[8];
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in s4;
+ struct sockaddr_in6 s6;
+ } address;
+ struct sockaddr_in6 sin6;
+ int err;
+
+ struct ip_map *ipmp;
+ struct auth_domain *dom;
+ time_t expiry;
+
+ if (mesg[mlen-1] != '\n')
+ return -EINVAL;
+ mesg[mlen-1] = 0;
+
+ /* class */
+ len = qword_get(&mesg, class, sizeof(class));
+ if (len <= 0) return -EINVAL;
+
+ /* ip address */
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0) return -EINVAL;
+
+ if (rpc_pton(cd->net, buf, len, &address.sa, sizeof(address)) == 0)
+ return -EINVAL;
+ switch (address.sa.sa_family) {
+ case AF_INET:
+ /* Form a mapped IPv4 address in sin6 */
+ sin6.sin6_family = AF_INET6;
+ ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr,
+ &sin6.sin6_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ memcpy(&sin6, &address.s6, sizeof(sin6));
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ expiry = get_expiry(&mesg);
+ if (expiry ==0)
+ return -EINVAL;
+
+ /* domainname, or empty for NEGATIVE */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) return -EINVAL;
+
+ if (len) {
+ dom = unix_domain_find(buf);
+ if (dom == NULL)
+ return -ENOENT;
+ } else
+ dom = NULL;
+
+ /* IPv6 scope IDs are ignored for now */
+ ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr);
+ if (ipmp) {
+ err = __ip_map_update(cd, ipmp,
+ container_of(dom, struct unix_domain, h),
+ expiry);
+ } else
+ err = -ENOMEM;
+
+ if (dom)
+ auth_domain_put(dom);
+
+ cache_flush();
+ return err;
+}
+
+static int ip_map_show(struct seq_file *m,
+ struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct ip_map *im;
+ struct in6_addr addr;
+ char *dom = "-no-domain-";
+
+ if (h == NULL) {
+ seq_puts(m, "#class IP domain\n");
+ return 0;
+ }
+ im = container_of(h, struct ip_map, h);
+ /* class addr domain */
+ addr = im->m_addr;
+
+ if (test_bit(CACHE_VALID, &h->flags) &&
+ !test_bit(CACHE_NEGATIVE, &h->flags))
+ dom = im->m_client->h.name;
+
+ if (ipv6_addr_v4mapped(&addr)) {
+ seq_printf(m, "%s %pI4 %s\n",
+ im->m_class, &addr.s6_addr32[3], dom);
+ } else {
+ seq_printf(m, "%s %pI6 %s\n", im->m_class, &addr, dom);
+ }
+ return 0;
+}
+
+
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
+ struct in6_addr *addr)
+{
+ struct ip_map ip;
+ struct cache_head *ch;
+
+ strcpy(ip.m_class, class);
+ ip.m_addr = *addr;
+ ch = sunrpc_cache_lookup(cd, &ip.h,
+ hash_str(class, IP_HASHBITS) ^
+ hash_ip6(addr));
+
+ if (ch)
+ return container_of(ch, struct ip_map, h);
+ else
+ return NULL;
+}
+
+static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
+ struct in6_addr *addr)
+{
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ return __ip_map_lookup(sn->ip_map_cache, class, addr);
+}
+
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
+ struct unix_domain *udom, time_t expiry)
+{
+ struct ip_map ip;
+ struct cache_head *ch;
+
+ ip.m_client = udom;
+ ip.h.flags = 0;
+ if (!udom)
+ set_bit(CACHE_NEGATIVE, &ip.h.flags);
+ ip.h.expiry_time = expiry;
+ ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
+ hash_str(ipm->m_class, IP_HASHBITS) ^
+ hash_ip6(&ipm->m_addr));
+ if (!ch)
+ return -ENOMEM;
+ cache_put(ch, cd);
+ return 0;
+}
+
+static inline int ip_map_update(struct net *net, struct ip_map *ipm,
+ struct unix_domain *udom, time_t expiry)
+{
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
+}
+
+void svcauth_unix_purge(struct net *net)
+{
+ struct sunrpc_net *sn;
+
+ sn = net_generic(net, sunrpc_net_id);
+ cache_purge(sn->ip_map_cache);
+}
+EXPORT_SYMBOL_GPL(svcauth_unix_purge);
+
+static inline struct ip_map *
+ip_map_cached_get(struct svc_xprt *xprt)
+{
+ struct ip_map *ipm = NULL;
+ struct sunrpc_net *sn;
+
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+ spin_lock(&xprt->xpt_lock);
+ ipm = xprt->xpt_auth_cache;
+ if (ipm != NULL) {
+ sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+ if (cache_is_expired(sn->ip_map_cache, &ipm->h)) {
+ /*
+ * The entry has been invalidated since it was
+ * remembered, e.g. by a second mount from the
+ * same IP address.
+ */
+ xprt->xpt_auth_cache = NULL;
+ spin_unlock(&xprt->xpt_lock);
+ cache_put(&ipm->h, sn->ip_map_cache);
+ return NULL;
+ }
+ cache_get(&ipm->h);
+ }
+ spin_unlock(&xprt->xpt_lock);
+ }
+ return ipm;
+}
+
+static inline void
+ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm)
+{
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+ spin_lock(&xprt->xpt_lock);
+ if (xprt->xpt_auth_cache == NULL) {
+ /* newly cached, keep the reference */
+ xprt->xpt_auth_cache = ipm;
+ ipm = NULL;
+ }
+ spin_unlock(&xprt->xpt_lock);
+ }
+ if (ipm) {
+ struct sunrpc_net *sn;
+
+ sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+ cache_put(&ipm->h, sn->ip_map_cache);
+ }
+}
+
+void
+svcauth_unix_info_release(struct svc_xprt *xpt)
+{
+ struct ip_map *ipm;
+
+ ipm = xpt->xpt_auth_cache;
+ if (ipm != NULL) {
+ struct sunrpc_net *sn;
+
+ sn = net_generic(xpt->xpt_net, sunrpc_net_id);
+ cache_put(&ipm->h, sn->ip_map_cache);
+ }
+}
+
+/****************************************************************************
+ * auth.unix.gid cache
+ * simple cache to map a UID to a list of GIDs
+ * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ */
+#define GID_HASHBITS 8
+#define GID_HASHMAX (1<<GID_HASHBITS)
+
+struct unix_gid {
+ struct cache_head h;
+ kuid_t uid;
+ struct group_info *gi;
+};
+
+static int unix_gid_hash(kuid_t uid)
+{
+ return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS);
+}
+
+static void unix_gid_put(struct kref *kref)
+{
+ struct cache_head *item = container_of(kref, struct cache_head, ref);
+ struct unix_gid *ug = container_of(item, struct unix_gid, h);
+ if (test_bit(CACHE_VALID, &item->flags) &&
+ !test_bit(CACHE_NEGATIVE, &item->flags))
+ put_group_info(ug->gi);
+ kfree(ug);
+}
+
+static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
+{
+ struct unix_gid *orig = container_of(corig, struct unix_gid, h);
+ struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+ return uid_eq(orig->uid, new->uid);
+}
+static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+ struct unix_gid *item = container_of(citem, struct unix_gid, h);
+ new->uid = item->uid;
+}
+static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem)
+{
+ struct unix_gid *new = container_of(cnew, struct unix_gid, h);
+ struct unix_gid *item = container_of(citem, struct unix_gid, h);
+
+ get_group_info(item->gi);
+ new->gi = item->gi;
+}
+static struct cache_head *unix_gid_alloc(void)
+{
+ struct unix_gid *g = kmalloc(sizeof(*g), GFP_KERNEL);
+ if (g)
+ return &g->h;
+ else
+ return NULL;
+}
+
+static void unix_gid_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ char tuid[20];
+ struct unix_gid *ug = container_of(h, struct unix_gid, h);
+
+ snprintf(tuid, 20, "%u", from_kuid(&init_user_ns, ug->uid));
+ qword_add(bpp, blen, tuid);
+ (*bpp)[-1] = '\n';
+}
+
+static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid);
+
+static int unix_gid_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* uid expiry Ngid gid0 gid1 ... gidN-1 */
+ int id;
+ kuid_t uid;
+ int gids;
+ int rv;
+ int i;
+ int err;
+ time_t expiry;
+ struct unix_gid ug, *ugp;
+
+ if (mesg[mlen - 1] != '\n')
+ return -EINVAL;
+ mesg[mlen-1] = 0;
+
+ rv = get_int(&mesg, &id);
+ if (rv)
+ return -EINVAL;
+ uid = make_kuid(&init_user_ns, id);
+ ug.uid = uid;
+
+ expiry = get_expiry(&mesg);
+ if (expiry == 0)
+ return -EINVAL;
+
+ rv = get_int(&mesg, &gids);
+ if (rv || gids < 0 || gids > 8192)
+ return -EINVAL;
+
+ ug.gi = groups_alloc(gids);
+ if (!ug.gi)
+ return -ENOMEM;
+
+ for (i = 0 ; i < gids ; i++) {
+ int gid;
+ kgid_t kgid;
+ rv = get_int(&mesg, &gid);
+ err = -EINVAL;
+ if (rv)
+ goto out;
+ kgid = make_kgid(&init_user_ns, gid);
+ if (!gid_valid(kgid))
+ goto out;
+ GROUP_AT(ug.gi, i) = kgid;
+ }
+
+ ugp = unix_gid_lookup(cd, uid);
+ if (ugp) {
+ struct cache_head *ch;
+ ug.h.flags = 0;
+ ug.h.expiry_time = expiry;
+ ch = sunrpc_cache_update(cd,
+ &ug.h, &ugp->h,
+ unix_gid_hash(uid));
+ if (!ch)
+ err = -ENOMEM;
+ else {
+ err = 0;
+ cache_put(ch, cd);
+ }
+ } else
+ err = -ENOMEM;
+ out:
+ if (ug.gi)
+ put_group_info(ug.gi);
+ return err;
+}
+
+static int unix_gid_show(struct seq_file *m,
+ struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct user_namespace *user_ns = &init_user_ns;
+ struct unix_gid *ug;
+ int i;
+ int glen;
+
+ if (h == NULL) {
+ seq_puts(m, "#uid cnt: gids...\n");
+ return 0;
+ }
+ ug = container_of(h, struct unix_gid, h);
+ if (test_bit(CACHE_VALID, &h->flags) &&
+ !test_bit(CACHE_NEGATIVE, &h->flags))
+ glen = ug->gi->ngroups;
+ else
+ glen = 0;
+
+ seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen);
+ for (i = 0; i < glen; i++)
+ seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i)));
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static struct cache_detail unix_gid_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = GID_HASHMAX,
+ .name = "auth.unix.gid",
+ .cache_put = unix_gid_put,
+ .cache_request = unix_gid_request,
+ .cache_parse = unix_gid_parse,
+ .cache_show = unix_gid_show,
+ .match = unix_gid_match,
+ .init = unix_gid_init,
+ .update = unix_gid_update,
+ .alloc = unix_gid_alloc,
+};
+
+int unix_gid_cache_create(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd;
+ int err;
+
+ cd = cache_create_net(&unix_gid_cache_template, net);
+ if (IS_ERR(cd))
+ return PTR_ERR(cd);
+ err = cache_register_net(cd, net);
+ if (err) {
+ cache_destroy_net(cd, net);
+ return err;
+ }
+ sn->unix_gid_cache = cd;
+ return 0;
+}
+
+void unix_gid_cache_destroy(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd = sn->unix_gid_cache;
+
+ sn->unix_gid_cache = NULL;
+ cache_purge(cd);
+ cache_unregister_net(cd, net);
+ cache_destroy_net(cd, net);
+}
+
+static struct unix_gid *unix_gid_lookup(struct cache_detail *cd, kuid_t uid)
+{
+ struct unix_gid ug;
+ struct cache_head *ch;
+
+ ug.uid = uid;
+ ch = sunrpc_cache_lookup(cd, &ug.h, unix_gid_hash(uid));
+ if (ch)
+ return container_of(ch, struct unix_gid, h);
+ else
+ return NULL;
+}
+
+static struct group_info *unix_gid_find(kuid_t uid, struct svc_rqst *rqstp)
+{
+ struct unix_gid *ug;
+ struct group_info *gi;
+ int ret;
+ struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net,
+ sunrpc_net_id);
+
+ ug = unix_gid_lookup(sn->unix_gid_cache, uid);
+ if (!ug)
+ return ERR_PTR(-EAGAIN);
+ ret = cache_check(sn->unix_gid_cache, &ug->h, &rqstp->rq_chandle);
+ switch (ret) {
+ case -ENOENT:
+ return ERR_PTR(-ENOENT);
+ case -ETIMEDOUT:
+ return ERR_PTR(-ESHUTDOWN);
+ case 0:
+ gi = get_group_info(ug->gi);
+ cache_put(&ug->h, sn->unix_gid_cache);
+ return gi;
+ default:
+ return ERR_PTR(-EAGAIN);
+ }
+}
+
+int
+svcauth_unix_set_client(struct svc_rqst *rqstp)
+{
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6, sin6_storage;
+ struct ip_map *ipm;
+ struct group_info *gi;
+ struct svc_cred *cred = &rqstp->rq_cred;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct net *net = xprt->xpt_net;
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ switch (rqstp->rq_addr.ss_family) {
+ case AF_INET:
+ sin = svc_addr_in(rqstp);
+ sin6 = &sin6_storage;
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &sin6->sin6_addr);
+ break;
+ case AF_INET6:
+ sin6 = svc_addr_in6(rqstp);
+ break;
+ default:
+ BUG();
+ }
+
+ rqstp->rq_client = NULL;
+ if (rqstp->rq_proc == 0)
+ return SVC_OK;
+
+ ipm = ip_map_cached_get(xprt);
+ if (ipm == NULL)
+ ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
+ &sin6->sin6_addr);
+
+ if (ipm == NULL)
+ return SVC_DENIED;
+
+ switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+ default:
+ BUG();
+ case -ETIMEDOUT:
+ return SVC_CLOSE;
+ case -EAGAIN:
+ return SVC_DROP;
+ case -ENOENT:
+ return SVC_DENIED;
+ case 0:
+ rqstp->rq_client = &ipm->m_client->h;
+ kref_get(&rqstp->rq_client->ref);
+ ip_map_cached_put(xprt, ipm);
+ break;
+ }
+
+ gi = unix_gid_find(cred->cr_uid, rqstp);
+ switch (PTR_ERR(gi)) {
+ case -EAGAIN:
+ return SVC_DROP;
+ case -ESHUTDOWN:
+ return SVC_CLOSE;
+ case -ENOENT:
+ break;
+ default:
+ put_group_info(cred->cr_group_info);
+ cred->cr_group_info = gi;
+ }
+ return SVC_OK;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
+
+static int
+svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct svc_cred *cred = &rqstp->rq_cred;
+
+ cred->cr_group_info = NULL;
+ cred->cr_principal = NULL;
+ rqstp->rq_client = NULL;
+
+ if (argv->iov_len < 3*4)
+ return SVC_GARBAGE;
+
+ if (svc_getu32(argv) != 0) {
+ dprintk("svc: bad null cred\n");
+ *authp = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+ if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+ dprintk("svc: bad null verf\n");
+ *authp = rpc_autherr_badverf;
+ return SVC_DENIED;
+ }
+
+ /* Signal that mapping to nobody uid/gid is required */
+ cred->cr_uid = INVALID_UID;
+ cred->cr_gid = INVALID_GID;
+ cred->cr_group_info = groups_alloc(0);
+ if (cred->cr_group_info == NULL)
+ return SVC_CLOSE; /* kmalloc failure - client must retry */
+
+ /* Put NULL verifier */
+ svc_putnl(resv, RPC_AUTH_NULL);
+ svc_putnl(resv, 0);
+
+ rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL;
+ return SVC_OK;
+}
+
+static int
+svcauth_null_release(struct svc_rqst *rqstp)
+{
+ if (rqstp->rq_client)
+ auth_domain_put(rqstp->rq_client);
+ rqstp->rq_client = NULL;
+ if (rqstp->rq_cred.cr_group_info)
+ put_group_info(rqstp->rq_cred.cr_group_info);
+ rqstp->rq_cred.cr_group_info = NULL;
+
+ return 0; /* don't drop */
+}
+
+
+struct auth_ops svcauth_null = {
+ .name = "null",
+ .owner = THIS_MODULE,
+ .flavour = RPC_AUTH_NULL,
+ .accept = svcauth_null_accept,
+ .release = svcauth_null_release,
+ .set_client = svcauth_unix_set_client,
+};
+
+
+static int
+svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct svc_cred *cred = &rqstp->rq_cred;
+ u32 slen, i;
+ int len = argv->iov_len;
+
+ cred->cr_group_info = NULL;
+ cred->cr_principal = NULL;
+ rqstp->rq_client = NULL;
+
+ if ((len -= 3*4) < 0)
+ return SVC_GARBAGE;
+
+ svc_getu32(argv); /* length */
+ svc_getu32(argv); /* time stamp */
+ slen = XDR_QUADLEN(svc_getnl(argv)); /* machname length */
+ if (slen > 64 || (len -= (slen + 3)*4) < 0)
+ goto badcred;
+ argv->iov_base = (void*)((__be32*)argv->iov_base + slen); /* skip machname */
+ argv->iov_len -= slen*4;
+ /*
+ * Note: we skip uid_valid()/gid_valid() checks here for
+ * backwards compatibility with clients that use -1 id's.
+ * Instead, -1 uid or gid is later mapped to the
+ * (export-specific) anonymous id by nfsd_setuser.
+ * Supplementary gid's will be left alone.
+ */
+ cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
+ cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
+ slen = svc_getnl(argv); /* gids length */
+ if (slen > 16 || (len -= (slen + 2)*4) < 0)
+ goto badcred;
+ cred->cr_group_info = groups_alloc(slen);
+ if (cred->cr_group_info == NULL)
+ return SVC_CLOSE;
+ for (i = 0; i < slen; i++) {
+ kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv));
+ GROUP_AT(cred->cr_group_info, i) = kgid;
+ }
+ if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+ *authp = rpc_autherr_badverf;
+ return SVC_DENIED;
+ }
+
+ /* Put NULL verifier */
+ svc_putnl(resv, RPC_AUTH_NULL);
+ svc_putnl(resv, 0);
+
+ rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX;
+ return SVC_OK;
+
+badcred:
+ *authp = rpc_autherr_badcred;
+ return SVC_DENIED;
+}
+
+static int
+svcauth_unix_release(struct svc_rqst *rqstp)
+{
+ /* Verifier (such as it is) is already in place.
+ */
+ if (rqstp->rq_client)
+ auth_domain_put(rqstp->rq_client);
+ rqstp->rq_client = NULL;
+ if (rqstp->rq_cred.cr_group_info)
+ put_group_info(rqstp->rq_cred.cr_group_info);
+ rqstp->rq_cred.cr_group_info = NULL;
+
+ return 0;
+}
+
+
+struct auth_ops svcauth_unix = {
+ .name = "unix",
+ .owner = THIS_MODULE,
+ .flavour = RPC_AUTH_UNIX,
+ .accept = svcauth_unix_accept,
+ .release = svcauth_unix_release,
+ .domain_release = svcauth_unix_domain_release,
+ .set_client = svcauth_unix_set_client,
+};
+
+static struct cache_detail ip_map_cache_template = {
+ .owner = THIS_MODULE,
+ .hash_size = IP_HASHMAX,
+ .name = "auth.unix.ip",
+ .cache_put = ip_map_put,
+ .cache_request = ip_map_request,
+ .cache_parse = ip_map_parse,
+ .cache_show = ip_map_show,
+ .match = ip_map_match,
+ .init = ip_map_init,
+ .update = update,
+ .alloc = ip_map_alloc,
+};
+
+int ip_map_cache_create(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd;
+ int err;
+
+ cd = cache_create_net(&ip_map_cache_template, net);
+ if (IS_ERR(cd))
+ return PTR_ERR(cd);
+ err = cache_register_net(cd, net);
+ if (err) {
+ cache_destroy_net(cd, net);
+ return err;
+ }
+ sn->ip_map_cache = cd;
+ return 0;
+}
+
+void ip_map_cache_destroy(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct cache_detail *cd = sn->ip_map_cache;
+
+ sn->ip_map_cache = NULL;
+ cache_purge(cd);
+ cache_unregister_net(cd, net);
+ cache_destroy_net(cd, net);
+}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
new file mode 100644
index 000000000..0c8120229
--- /dev/null
+++ b/net/sunrpc/svcsock.c
@@ -0,0 +1,1672 @@
+/*
+ * linux/net/sunrpc/svcsock.c
+ *
+ * These are the RPC server socket internals.
+ *
+ * The server scheduling algorithm does not always distribute the load
+ * evenly when servicing a single client. May need to modify the
+ * svc_xprt_enqueue procedure...
+ *
+ * TCP support is largely untested and may be a little slow. The problem
+ * is that we currently do two separate recvfrom's, one for the 4-byte
+ * record length, and the second for the actual record. This could possibly
+ * be improved by always reading a minimum size of around 100 bytes and
+ * tucking any superfluous bytes away in a temporary store. Still, that
+ * leaves write requests out in the rain. An alternative may be to peek at
+ * the first skb in the queue, and if it matches the next TCP sequence
+ * number, to extract the record marker. Yuck.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <trace/events/skb.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/xprt.h>
+
+#include "sunrpc.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+
+static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
+ int flags);
+static void svc_udp_data_ready(struct sock *);
+static int svc_udp_recvfrom(struct svc_rqst *);
+static int svc_udp_sendto(struct svc_rqst *);
+static void svc_sock_detach(struct svc_xprt *);
+static void svc_tcp_sock_detach(struct svc_xprt *);
+static void svc_sock_free(struct svc_xprt *);
+
+static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
+ struct net *, struct sockaddr *,
+ int, int);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
+ struct net *, struct sockaddr *,
+ int, int);
+static void svc_bc_sock_free(struct svc_xprt *xprt);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key svc_key[2];
+static struct lock_class_key svc_slock_key[2];
+
+static void svc_reclassify_socket(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ WARN_ON_ONCE(sock_owned_by_user(sk));
+ if (sock_owned_by_user(sk))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
+ &svc_slock_key[0],
+ "sk_xprt.xpt_lock-AF_INET-NFSD",
+ &svc_key[0]);
+ break;
+
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
+ &svc_slock_key[1],
+ "sk_xprt.xpt_lock-AF_INET6-NFSD",
+ &svc_key[1]);
+ break;
+
+ default:
+ BUG();
+ }
+}
+#else
+static void svc_reclassify_socket(struct socket *sock)
+{
+}
+#endif
+
+/*
+ * Release an skbuff after use
+ */
+static void svc_release_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+
+ if (skb) {
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ rqstp->rq_xprt_ctxt = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ skb_free_datagram_locked(svsk->sk_sk, skb);
+ }
+}
+
+union svc_pktinfo_u {
+ struct in_pktinfo pkti;
+ struct in6_pktinfo pkti6;
+};
+#define SVC_PKTINFO_SPACE \
+ CMSG_SPACE(sizeof(union svc_pktinfo_u))
+
+static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
+{
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ switch (svsk->sk_sk->sk_family) {
+ case AF_INET: {
+ struct in_pktinfo *pki = CMSG_DATA(cmh);
+
+ cmh->cmsg_level = SOL_IP;
+ cmh->cmsg_type = IP_PKTINFO;
+ pki->ipi_ifindex = 0;
+ pki->ipi_spec_dst.s_addr =
+ svc_daddr_in(rqstp)->sin_addr.s_addr;
+ cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
+ }
+ break;
+
+ case AF_INET6: {
+ struct in6_pktinfo *pki = CMSG_DATA(cmh);
+ struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);
+
+ cmh->cmsg_level = SOL_IPV6;
+ cmh->cmsg_type = IPV6_PKTINFO;
+ pki->ipi6_ifindex = daddr->sin6_scope_id;
+ pki->ipi6_addr = daddr->sin6_addr;
+ cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
+ }
+ break;
+ }
+}
+
+/*
+ * send routine intended to be shared by the fore- and back-channel
+ */
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+ struct page *headpage, unsigned long headoffset,
+ struct page *tailpage, unsigned long tailoffset)
+{
+ int result;
+ int size;
+ struct page **ppage = xdr->pages;
+ size_t base = xdr->page_base;
+ unsigned int pglen = xdr->page_len;
+ unsigned int flags = MSG_MORE;
+ int slen;
+ int len = 0;
+
+ slen = xdr->len;
+
+ /* send head */
+ if (slen == xdr->head[0].iov_len)
+ flags = 0;
+ len = kernel_sendpage(sock, headpage, headoffset,
+ xdr->head[0].iov_len, flags);
+ if (len != xdr->head[0].iov_len)
+ goto out;
+ slen -= xdr->head[0].iov_len;
+ if (slen == 0)
+ goto out;
+
+ /* send page data */
+ size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
+ while (pglen > 0) {
+ if (slen == size)
+ flags = 0;
+ result = kernel_sendpage(sock, *ppage, base, size, flags);
+ if (result > 0)
+ len += result;
+ if (result != size)
+ goto out;
+ slen -= size;
+ pglen -= size;
+ size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
+ base = 0;
+ ppage++;
+ }
+
+ /* send tail */
+ if (xdr->tail[0].iov_len) {
+ result = kernel_sendpage(sock, tailpage, tailoffset,
+ xdr->tail[0].iov_len, 0);
+ if (result > 0)
+ len += result;
+ }
+
+out:
+ return len;
+}
+
+
+/*
+ * Generic sendto routine
+ */
+static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+{
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct socket *sock = svsk->sk_sock;
+ union {
+ struct cmsghdr hdr;
+ long all[SVC_PKTINFO_SPACE / sizeof(long)];
+ } buffer;
+ struct cmsghdr *cmh = &buffer.hdr;
+ int len = 0;
+ unsigned long tailoff;
+ unsigned long headoff;
+ RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+ if (rqstp->rq_prot == IPPROTO_UDP) {
+ struct msghdr msg = {
+ .msg_name = &rqstp->rq_addr,
+ .msg_namelen = rqstp->rq_addrlen,
+ .msg_control = cmh,
+ .msg_controllen = sizeof(buffer),
+ .msg_flags = MSG_MORE,
+ };
+
+ svc_set_cmsg_data(rqstp, cmh);
+
+ if (sock_sendmsg(sock, &msg) < 0)
+ goto out;
+ }
+
+ tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
+ headoff = 0;
+ len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
+ rqstp->rq_respages[0], tailoff);
+
+out:
+ dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+ svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
+ xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
+
+ return len;
+}
+
+/*
+ * Report socket names for nfsdfs
+ */
+static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
+{
+ const struct sock *sk = svsk->sk_sk;
+ const char *proto_name = sk->sk_protocol == IPPROTO_UDP ?
+ "udp" : "tcp";
+ int len;
+
+ switch (sk->sk_family) {
+ case PF_INET:
+ len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
+ proto_name,
+ &inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case PF_INET6:
+ len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
+ proto_name,
+ &sk->sk_v6_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ break;
+#endif
+ default:
+ len = snprintf(buf, remaining, "*unknown-%d*\n",
+ sk->sk_family);
+ }
+
+ if (len >= remaining) {
+ *buf = '\0';
+ return -ENAMETOOLONG;
+ }
+ return len;
+}
+
+/*
+ * Generic recvfrom routine.
+ */
+static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+ int buflen)
+{
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT,
+ };
+ int len;
+
+ rqstp->rq_xprt_hlen = 0;
+
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
+ msg.msg_flags);
+ /* If we read a full record, then assume there may be more
+ * data to read (stream based sockets only!)
+ */
+ if (len == buflen)
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+
+ dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ svsk, iov[0].iov_base, iov[0].iov_len, len);
+ return len;
+}
+
+static int svc_partial_recvfrom(struct svc_rqst *rqstp,
+ struct kvec *iov, int nr,
+ int buflen, unsigned int base)
+{
+ size_t save_iovlen;
+ void *save_iovbase;
+ unsigned int i;
+ int ret;
+
+ if (base == 0)
+ return svc_recvfrom(rqstp, iov, nr, buflen);
+
+ for (i = 0; i < nr; i++) {
+ if (iov[i].iov_len > base)
+ break;
+ base -= iov[i].iov_len;
+ }
+ save_iovlen = iov[i].iov_len;
+ save_iovbase = iov[i].iov_base;
+ iov[i].iov_len -= base;
+ iov[i].iov_base += base;
+ ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
+ iov[i].iov_len = save_iovlen;
+ iov[i].iov_base = save_iovbase;
+ return ret;
+}
+
+/*
+ * Set socket snd and rcv buffer lengths
+ */
+static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
+ unsigned int rcv)
+{
+#if 0
+ mm_segment_t oldfs;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+ (char*)&snd, sizeof(snd));
+ sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+ (char*)&rcv, sizeof(rcv));
+#else
+ /* sock_setsockopt limits use to sysctl_?mem_max,
+ * which isn't acceptable. Until that is made conditional
+ * on not having CAP_SYS_RESOURCE or similar, we go direct...
+ * DaveM said I could!
+ */
+ lock_sock(sock->sk);
+ sock->sk->sk_sndbuf = snd * 2;
+ sock->sk->sk_rcvbuf = rcv * 2;
+ sock->sk->sk_write_space(sock->sk);
+ release_sock(sock->sk);
+#endif
+}
+
+static int svc_sock_secure_port(struct svc_rqst *rqstp)
+{
+ return svc_port_is_privileged(svc_addr(rqstp));
+}
+
+/*
+ * INET callback when data has been received on the socket.
+ */
+static void svc_udp_data_ready(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ if (svsk) {
+ dprintk("svc: socket %p(inet %p), busy=%d\n",
+ svsk, sk,
+ test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
+}
+
+/*
+ * INET callback when space is newly available on the socket.
+ */
+static void svc_write_space(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ if (svsk) {
+ dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
+ svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
+
+ if (wq && waitqueue_active(wq)) {
+ dprintk("RPC svc_write_space: someone sleeping on %p\n",
+ svsk);
+ wake_up_interruptible(wq);
+ }
+}
+
+static int svc_tcp_has_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ int required;
+
+ if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
+ return 1;
+ required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
+ if (sk_stream_wspace(svsk->sk_sk) >= required ||
+ (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
+ atomic_read(&xprt->xpt_reserved) == 0))
+ return 1;
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 0;
+}
+
+static void svc_tcp_write_space(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
+ struct socket *sock = sk->sk_socket;
+
+ if (!sk_stream_is_writeable(sk) || !sock)
+ return;
+ if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ svc_write_space(sk);
+}
+
+static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+
+ if (svc_tcp_has_wspace(xprt))
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+}
+
+/*
+ * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
+ */
+static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
+ struct cmsghdr *cmh)
+{
+ struct in_pktinfo *pki = CMSG_DATA(cmh);
+ struct sockaddr_in *daddr = svc_daddr_in(rqstp);
+
+ if (cmh->cmsg_type != IP_PKTINFO)
+ return 0;
+
+ daddr->sin_family = AF_INET;
+ daddr->sin_addr.s_addr = pki->ipi_spec_dst.s_addr;
+ return 1;
+}
+
+/*
+ * See net/ipv6/datagram.c : ip6_datagram_recv_ctl
+ */
+static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
+ struct cmsghdr *cmh)
+{
+ struct in6_pktinfo *pki = CMSG_DATA(cmh);
+ struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);
+
+ if (cmh->cmsg_type != IPV6_PKTINFO)
+ return 0;
+
+ daddr->sin6_family = AF_INET6;
+ daddr->sin6_addr = pki->ipi6_addr;
+ daddr->sin6_scope_id = pki->ipi6_ifindex;
+ return 1;
+}
+
+/*
+ * Copy the UDP datagram's destination address to the rqstp structure.
+ * The 'destination' address in this case is the address to which the
+ * peer sent the datagram, i.e. our local address. For multihomed
+ * hosts, this can change from msg to msg. Note that only the IP
+ * address changes, the port number should remain the same.
+ */
+static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
+ struct cmsghdr *cmh)
+{
+ switch (cmh->cmsg_level) {
+ case SOL_IP:
+ return svc_udp_get_dest_address4(rqstp, cmh);
+ case SOL_IPV6:
+ return svc_udp_get_dest_address6(rqstp, cmh);
+ }
+
+ return 0;
+}
+
+/*
+ * Receive a datagram from a UDP socket.
+ */
+static int svc_udp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ struct sk_buff *skb;
+ union {
+ struct cmsghdr hdr;
+ long all[SVC_PKTINFO_SPACE / sizeof(long)];
+ } buffer;
+ struct cmsghdr *cmh = &buffer.hdr;
+ struct msghdr msg = {
+ .msg_name = svc_addr(rqstp),
+ .msg_control = cmh,
+ .msg_controllen = sizeof(buffer),
+ .msg_flags = MSG_DONTWAIT,
+ };
+ size_t len;
+ int err;
+
+ if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
+ /* udp sockets need large rcvbuf as all pending
+ * requests are still in that buffer. sndbuf must
+ * also be large enough that there is enough space
+ * for one reply per thread. We count all threads
+ * rather than threads in a particular pool, which
+ * provides an upper bound on the number of threads
+ * which will access the socket.
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ (serv->sv_nrthreads+3) * serv->sv_max_mesg,
+ (serv->sv_nrthreads+3) * serv->sv_max_mesg);
+
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ skb = NULL;
+ err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
+ 0, 0, MSG_PEEK | MSG_DONTWAIT);
+ if (err >= 0)
+ skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+
+ if (skb == NULL) {
+ if (err != -EAGAIN) {
+ /* possibly an icmp error */
+ dprintk("svc: recvfrom returned error %d\n", -err);
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ }
+ return 0;
+ }
+ len = svc_addr_len(svc_addr(rqstp));
+ rqstp->rq_addrlen = len;
+ if (skb->tstamp.tv64 == 0) {
+ skb->tstamp = ktime_get_real();
+ /* Don't enable netstamp, sunrpc doesn't
+ need that much accuracy */
+ }
+ svsk->sk_sk->sk_stamp = skb->tstamp;
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
+
+ len = skb->len - sizeof(struct udphdr);
+ rqstp->rq_arg.len = len;
+
+ rqstp->rq_prot = IPPROTO_UDP;
+
+ if (!svc_udp_get_dest_address(rqstp, cmh)) {
+ net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
+ cmh->cmsg_level, cmh->cmsg_type);
+ goto out_free;
+ }
+ rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
+
+ if (skb_is_nonlinear(skb)) {
+ /* we have to copy */
+ local_bh_disable();
+ if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
+ local_bh_enable();
+ /* checksum error */
+ goto out_free;
+ }
+ local_bh_enable();
+ skb_free_datagram_locked(svsk->sk_sk, skb);
+ } else {
+ /* we can use it in-place */
+ rqstp->rq_arg.head[0].iov_base = skb->data +
+ sizeof(struct udphdr);
+ rqstp->rq_arg.head[0].iov_len = len;
+ if (skb_checksum_complete(skb))
+ goto out_free;
+ rqstp->rq_xprt_ctxt = skb;
+ }
+
+ rqstp->rq_arg.page_base = 0;
+ if (len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = len;
+ rqstp->rq_arg.page_len = 0;
+ rqstp->rq_respages = rqstp->rq_pages+1;
+ } else {
+ rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+ rqstp->rq_respages = rqstp->rq_pages + 1 +
+ DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
+ }
+ rqstp->rq_next_page = rqstp->rq_respages+1;
+
+ if (serv->sv_stats)
+ serv->sv_stats->netudpcnt++;
+
+ return len;
+out_free:
+ trace_kfree_skb(skb, svc_udp_recvfrom);
+ skb_free_datagram_locked(svsk->sk_sk, skb);
+ return 0;
+}
+
+static int
+svc_udp_sendto(struct svc_rqst *rqstp)
+{
+ int error;
+
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+ if (error == -ECONNREFUSED)
+ /* ICMP error on earlier request. */
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+
+ return error;
+}
+
+static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+static int svc_udp_has_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = xprt->xpt_server;
+ unsigned long required;
+
+ /*
+ * Set the SOCK_NOSPACE flag before checking the available
+ * sock space.
+ */
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
+ if (required*2 > sock_wspace(svsk->sk_sk))
+ return 0;
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 1;
+}
+
+static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
+{
+ BUG();
+ return NULL;
+}
+
+static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
+}
+
+static struct svc_xprt_ops svc_udp_ops = {
+ .xpo_create = svc_udp_create,
+ .xpo_recvfrom = svc_udp_recvfrom,
+ .xpo_sendto = svc_udp_sendto,
+ .xpo_release_rqst = svc_release_skb,
+ .xpo_detach = svc_sock_detach,
+ .xpo_free = svc_sock_free,
+ .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
+ .xpo_has_wspace = svc_udp_has_wspace,
+ .xpo_accept = svc_udp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
+};
+
+static struct svc_xprt_class svc_udp_class = {
+ .xcl_name = "udp",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_udp_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
+ .xcl_ident = XPRT_TRANSPORT_UDP,
+};
+
+static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
+{
+ int err, level, optname, one = 1;
+
+ svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
+ &svsk->sk_xprt, serv);
+ clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+ svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
+ svsk->sk_sk->sk_write_space = svc_write_space;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_udp_recvfrom will re-adjust if necessary
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
+
+ /* data might have come in before data_ready set up */
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+
+ /* make sure we get destination address info */
+ switch (svsk->sk_sk->sk_family) {
+ case AF_INET:
+ level = SOL_IP;
+ optname = IP_PKTINFO;
+ break;
+ case AF_INET6:
+ level = SOL_IPV6;
+ optname = IPV6_RECVPKTINFO;
+ break;
+ default:
+ BUG();
+ }
+ err = kernel_setsockopt(svsk->sk_sock, level, optname,
+ (char *)&one, sizeof(one));
+ dprintk("svc: kernel_setsockopt returned %d\n", err);
+}
+
+/*
+ * A data_ready event on a listening socket means there's a connection
+ * pending. Do not use state_change as a substitute for it.
+ */
+static void svc_tcp_listen_data_ready(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq;
+
+ dprintk("svc: socket %p TCP (listen) state change %d\n",
+ sk, sk->sk_state);
+
+ /*
+ * This callback may called twice when a new connection
+ * is established as a child socket inherits everything
+ * from a parent LISTEN socket.
+ * 1) data_ready method of the parent socket will be called
+ * when one of child sockets become ESTABLISHED.
+ * 2) data_ready method of the child socket may be called
+ * when it receives data before the socket is accepted.
+ * In case of 2, we should ignore it silently.
+ */
+ if (sk->sk_state == TCP_LISTEN) {
+ if (svsk) {
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ } else
+ printk("svc: socket %p: no user data\n", sk);
+ }
+
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
+}
+
+/*
+ * A state change on a connected socket means it's dying or dead.
+ */
+static void svc_tcp_state_change(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
+ sk, sk->sk_state, sk->sk_user_data);
+
+ if (!svsk)
+ printk("svc: socket %p: no user data\n", sk);
+ else {
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible_all(wq);
+}
+
+static void svc_tcp_data_ready(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+ wait_queue_head_t *wq = sk_sleep(sk);
+
+ dprintk("svc: socket %p TCP data ready (svsk %p)\n",
+ sk, sk->sk_user_data);
+ if (svsk) {
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
+}
+
+/*
+ * Accept a TCP connection
+ */
+static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct sockaddr_storage addr;
+ struct sockaddr *sin = (struct sockaddr *) &addr;
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ struct socket *sock = svsk->sk_sock;
+ struct socket *newsock;
+ struct svc_sock *newsvsk;
+ int err, slen;
+ RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+ dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ if (!sock)
+ return NULL;
+
+ clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ err = kernel_accept(sock, &newsock, O_NONBLOCK);
+ if (err < 0) {
+ if (err == -ENOMEM)
+ printk(KERN_WARNING "%s: no more sockets!\n",
+ serv->sv_name);
+ else if (err != -EAGAIN)
+ net_warn_ratelimited("%s: accept failed (err %d)!\n",
+ serv->sv_name, -err);
+ return NULL;
+ }
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+
+ err = kernel_getpeername(newsock, sin, &slen);
+ if (err < 0) {
+ net_warn_ratelimited("%s: peername failed (err %d)!\n",
+ serv->sv_name, -err);
+ goto failed; /* aborted connection or whatever */
+ }
+
+ /* Ideally, we would want to reject connections from unauthorized
+ * hosts here, but when we get encryption, the IP of the host won't
+ * tell us anything. For now just warn about unpriv connections.
+ */
+ if (!svc_port_is_privileged(sin)) {
+ dprintk("%s: connect from unprivileged port: %s\n",
+ serv->sv_name,
+ __svc_print_addr(sin, buf, sizeof(buf)));
+ }
+ dprintk("%s: connect from %s\n", serv->sv_name,
+ __svc_print_addr(sin, buf, sizeof(buf)));
+
+ /* make sure that a write doesn't block forever when
+ * low on memory
+ */
+ newsock->sk->sk_sndtimeo = HZ*30;
+
+ newsvsk = svc_setup_socket(serv, newsock,
+ (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY));
+ if (IS_ERR(newsvsk))
+ goto failed;
+ svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
+ err = kernel_getsockname(newsock, sin, &slen);
+ if (unlikely(err < 0)) {
+ dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
+ slen = offsetof(struct sockaddr, sa_data);
+ }
+ svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+
+ if (sock_is_loopback(newsock->sk))
+ set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
+ else
+ clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpconn++;
+
+ return &newsvsk->sk_xprt;
+
+failed:
+ sock_release(newsock);
+ return NULL;
+}
+
+static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+ unsigned int i, len, npages;
+
+ if (svsk->sk_datalen == 0)
+ return 0;
+ len = svsk->sk_datalen;
+ npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ if (rqstp->rq_pages[i] != NULL)
+ put_page(rqstp->rq_pages[i]);
+ BUG_ON(svsk->sk_pages[i] == NULL);
+ rqstp->rq_pages[i] = svsk->sk_pages[i];
+ svsk->sk_pages[i] = NULL;
+ }
+ rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
+ return len;
+}
+
+static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+ unsigned int i, len, npages;
+
+ if (svsk->sk_datalen == 0)
+ return;
+ len = svsk->sk_datalen;
+ npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ svsk->sk_pages[i] = rqstp->rq_pages[i];
+ rqstp->rq_pages[i] = NULL;
+ }
+}
+
+static void svc_tcp_clear_pages(struct svc_sock *svsk)
+{
+ unsigned int i, len, npages;
+
+ if (svsk->sk_datalen == 0)
+ goto out;
+ len = svsk->sk_datalen;
+ npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
+ if (svsk->sk_pages[i] == NULL) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ put_page(svsk->sk_pages[i]);
+ svsk->sk_pages[i] = NULL;
+ }
+out:
+ svsk->sk_tcplen = 0;
+ svsk->sk_datalen = 0;
+}
+
+/*
+ * Receive fragment record header.
+ * If we haven't gotten the record length yet, get the next four bytes.
+ */
+static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ unsigned int want;
+ int len;
+
+ if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
+ struct kvec iov;
+
+ want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
+ iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+ iov.iov_len = want;
+ if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
+ goto error;
+ svsk->sk_tcplen += len;
+
+ if (len < want) {
+ dprintk("svc: short recvfrom while reading record "
+ "length (%d of %d)\n", len, want);
+ return -EAGAIN;
+ }
+
+ dprintk("svc: TCP record, %d bytes\n", svc_sock_reclen(svsk));
+ if (svc_sock_reclen(svsk) + svsk->sk_datalen >
+ serv->sv_max_mesg) {
+ net_notice_ratelimited("RPC: fragment too large: %d\n",
+ svc_sock_reclen(svsk));
+ goto err_delete;
+ }
+ }
+
+ return svc_sock_reclen(svsk);
+error:
+ dprintk("RPC: TCP recv_record got %d\n", len);
+ return len;
+err_delete:
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ return -EAGAIN;
+}
+
+static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+ struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
+ struct rpc_rqst *req = NULL;
+ struct kvec *src, *dst;
+ __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+ __be32 xid;
+ __be32 calldir;
+
+ xid = *p++;
+ calldir = *p;
+
+ if (!bc_xprt)
+ return -EAGAIN;
+ spin_lock_bh(&bc_xprt->transport_lock);
+ req = xprt_lookup_rqst(bc_xprt, xid);
+ if (!req)
+ goto unlock_notfound;
+
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+ /*
+ * XXX!: cheating for now! Only copying HEAD.
+ * But we know this is good enough for now (in fact, for any
+ * callback reply in the forseeable future).
+ */
+ dst = &req->rq_private_buf.head[0];
+ src = &rqstp->rq_arg.head[0];
+ if (dst->iov_len < src->iov_len)
+ goto unlock_eagain; /* whatever; just giving up. */
+ memcpy(dst->iov_base, src->iov_base, src->iov_len);
+ xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
+ rqstp->rq_arg.len = 0;
+ spin_unlock_bh(&bc_xprt->transport_lock);
+ return 0;
+unlock_notfound:
+ printk(KERN_NOTICE
+ "%s: Got unrecognized reply: "
+ "calldir 0x%x xpt_bc_xprt %p xid %08x\n",
+ __func__, ntohl(calldir),
+ bc_xprt, ntohl(xid));
+unlock_eagain:
+ spin_unlock_bh(&bc_xprt->transport_lock);
+ return -EAGAIN;
+}
+
+static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
+{
+ int i = 0;
+ int t = 0;
+
+ while (t < len) {
+ vec[i].iov_base = page_address(pages[i]);
+ vec[i].iov_len = PAGE_SIZE;
+ i++;
+ t += PAGE_SIZE;
+ }
+ return i;
+}
+
+static void svc_tcp_fragment_received(struct svc_sock *svsk)
+{
+ /* If we have more data, signal svc_xprt_enqueue() to try again */
+ dprintk("svc: TCP %s record (%d bytes)\n",
+ svc_sock_final_rec(svsk) ? "final" : "nonfinal",
+ svc_sock_reclen(svsk));
+ svsk->sk_tcplen = 0;
+ svsk->sk_reclen = 0;
+}
+
+/*
+ * Receive data from a TCP socket.
+ */
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ int len;
+ struct kvec *vec;
+ unsigned int want, base;
+ __be32 *p;
+ __be32 calldir;
+ int pnum;
+
+ dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
+ svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
+
+ len = svc_tcp_recv_record(svsk, rqstp);
+ if (len < 0)
+ goto error;
+
+ base = svc_tcp_restore_pages(svsk, rqstp);
+ want = svc_sock_reclen(svsk) - (svsk->sk_tcplen - sizeof(rpc_fraghdr));
+
+ vec = rqstp->rq_vec;
+
+ pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
+ svsk->sk_datalen + want);
+
+ rqstp->rq_respages = &rqstp->rq_pages[pnum];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Now receive data */
+ len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+ if (len >= 0) {
+ svsk->sk_tcplen += len;
+ svsk->sk_datalen += len;
+ }
+ if (len != want || !svc_sock_final_rec(svsk)) {
+ svc_tcp_save_pages(svsk, rqstp);
+ if (len < 0 && len != -EAGAIN)
+ goto err_delete;
+ if (len == want)
+ svc_tcp_fragment_received(svsk);
+ else
+ dprintk("svc: incomplete TCP record (%d of %d)\n",
+ (int)(svsk->sk_tcplen - sizeof(rpc_fraghdr)),
+ svc_sock_reclen(svsk));
+ goto err_noclose;
+ }
+
+ if (svsk->sk_datalen < 8) {
+ svsk->sk_datalen = 0;
+ goto err_delete; /* client is nuts. */
+ }
+
+ rqstp->rq_arg.len = svsk->sk_datalen;
+ rqstp->rq_arg.page_base = 0;
+ if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
+ rqstp->rq_arg.page_len = 0;
+ } else
+ rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+
+ rqstp->rq_xprt_ctxt = NULL;
+ rqstp->rq_prot = IPPROTO_TCP;
+ if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags))
+ set_bit(RQ_LOCAL, &rqstp->rq_flags);
+ else
+ clear_bit(RQ_LOCAL, &rqstp->rq_flags);
+
+ p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+ calldir = p[1];
+ if (calldir)
+ len = receive_cb_reply(svsk, rqstp);
+
+ /* Reset TCP read info */
+ svsk->sk_datalen = 0;
+ svc_tcp_fragment_received(svsk);
+
+ if (len < 0)
+ goto error;
+
+ svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpcnt++;
+
+ return rqstp->rq_arg.len;
+
+error:
+ if (len != -EAGAIN)
+ goto err_delete;
+ dprintk("RPC: TCP recvfrom got EAGAIN\n");
+ return 0;
+err_delete:
+ printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+ svsk->sk_xprt.xpt_server->sv_name, -len);
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+err_noclose:
+ return 0; /* record not complete */
+}
+
+/*
+ * Send out data on TCP socket.
+ */
+static int svc_tcp_sendto(struct svc_rqst *rqstp)
+{
+ struct xdr_buf *xbufp = &rqstp->rq_res;
+ int sent;
+ __be32 reclen;
+
+ /* Set up the first element of the reply kvec.
+ * Any other kvecs that may be in use have been taken
+ * care of by the server implementation itself.
+ */
+ reclen = htonl(0x80000000|((xbufp->len ) - 4));
+ memcpy(xbufp->head[0].iov_base, &reclen, 4);
+
+ sent = svc_sendto(rqstp, &rqstp->rq_res);
+ if (sent != xbufp->len) {
+ printk(KERN_NOTICE
+ "rpc-srv/tcp: %s: %s %d when sending %d bytes "
+ "- shutting down socket\n",
+ rqstp->rq_xprt->xpt_server->sv_name,
+ (sent<0)?"got error":"sent only",
+ sent, xbufp->len);
+ set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
+ svc_xprt_enqueue(rqstp->rq_xprt);
+ sent = -EAGAIN;
+ }
+ return sent;
+}
+
+/*
+ * Setup response header. TCP has a 4B record length field.
+ */
+static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+ struct kvec *resv = &rqstp->rq_res.head[0];
+
+ /* tcp needs a space for the record length... */
+ svc_putnl(resv, 0);
+}
+
+static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
+ struct net *, struct sockaddr *,
+ int, int);
+static void svc_bc_sock_free(struct svc_xprt *xprt);
+
+static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
+}
+
+static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt)
+{
+}
+
+static struct svc_xprt_ops svc_tcp_bc_ops = {
+ .xpo_create = svc_bc_tcp_create,
+ .xpo_detach = svc_bc_tcp_sock_detach,
+ .xpo_free = svc_bc_sock_free,
+ .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+ .xpo_secure_port = svc_sock_secure_port,
+};
+
+static struct svc_xprt_class svc_tcp_bc_class = {
+ .xcl_name = "tcp-bc",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_tcp_bc_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+static void svc_init_bc_xprt_sock(void)
+{
+ svc_reg_xprt_class(&svc_tcp_bc_class);
+}
+
+static void svc_cleanup_bc_xprt_sock(void)
+{
+ svc_unreg_xprt_class(&svc_tcp_bc_class);
+}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+static void svc_init_bc_xprt_sock(void)
+{
+}
+
+static void svc_cleanup_bc_xprt_sock(void)
+{
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+static struct svc_xprt_ops svc_tcp_ops = {
+ .xpo_create = svc_tcp_create,
+ .xpo_recvfrom = svc_tcp_recvfrom,
+ .xpo_sendto = svc_tcp_sendto,
+ .xpo_release_rqst = svc_release_skb,
+ .xpo_detach = svc_tcp_sock_detach,
+ .xpo_free = svc_sock_free,
+ .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+ .xpo_has_wspace = svc_tcp_has_wspace,
+ .xpo_accept = svc_tcp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
+ .xpo_adjust_wspace = svc_tcp_adjust_wspace,
+};
+
+static struct svc_xprt_class svc_tcp_class = {
+ .xcl_name = "tcp",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_tcp_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+ .xcl_ident = XPRT_TRANSPORT_TCP,
+};
+
+void svc_init_xprt_sock(void)
+{
+ svc_reg_xprt_class(&svc_tcp_class);
+ svc_reg_xprt_class(&svc_udp_class);
+ svc_init_bc_xprt_sock();
+}
+
+void svc_cleanup_xprt_sock(void)
+{
+ svc_unreg_xprt_class(&svc_tcp_class);
+ svc_unreg_xprt_class(&svc_udp_class);
+ svc_cleanup_bc_xprt_sock();
+}
+
+static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
+{
+ struct sock *sk = svsk->sk_sk;
+
+ svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
+ &svsk->sk_xprt, serv);
+ set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+ if (sk->sk_state == TCP_LISTEN) {
+ dprintk("setting up TCP socket for listening\n");
+ set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
+ sk->sk_data_ready = svc_tcp_listen_data_ready;
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ } else {
+ dprintk("setting up TCP socket for reading\n");
+ sk->sk_state_change = svc_tcp_state_change;
+ sk->sk_data_ready = svc_tcp_data_ready;
+ sk->sk_write_space = svc_tcp_write_space;
+
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+ svsk->sk_datalen = 0;
+ memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
+
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ }
+}
+
+void svc_sock_update_bufs(struct svc_serv *serv)
+{
+ /*
+ * The number of server threads has changed. Update
+ * rcvbuf and sndbuf accordingly on all sockets
+ */
+ struct svc_sock *svsk;
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list)
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+ spin_unlock_bh(&serv->sv_lock);
+}
+EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
+
+/*
+ * Initialize socket for RPC use and create svc_sock struct
+ * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
+ */
+static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
+ struct socket *sock,
+ int flags)
+{
+ struct svc_sock *svsk;
+ struct sock *inet;
+ int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+ int err = 0;
+
+ dprintk("svc: svc_setup_socket %p\n", sock);
+ svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+ if (!svsk)
+ return ERR_PTR(-ENOMEM);
+
+ inet = sock->sk;
+
+ /* Register socket with portmapper */
+ if (pmap_register)
+ err = svc_register(serv, sock_net(sock->sk), inet->sk_family,
+ inet->sk_protocol,
+ ntohs(inet_sk(inet)->inet_sport));
+
+ if (err < 0) {
+ kfree(svsk);
+ return ERR_PTR(err);
+ }
+
+ inet->sk_user_data = svsk;
+ svsk->sk_sock = sock;
+ svsk->sk_sk = inet;
+ svsk->sk_ostate = inet->sk_state_change;
+ svsk->sk_odata = inet->sk_data_ready;
+ svsk->sk_owspace = inet->sk_write_space;
+
+ /* Initialize the socket */
+ if (sock->type == SOCK_DGRAM)
+ svc_udp_init(svsk, serv);
+ else {
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ */
+ svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
+ 4 * serv->sv_max_mesg);
+ svc_tcp_init(svsk, serv);
+ }
+
+ dprintk("svc: svc_setup_socket created %p (inet %p)\n",
+ svsk, svsk->sk_sk);
+
+ return svsk;
+}
+
+bool svc_alien_sock(struct net *net, int fd)
+{
+ int err;
+ struct socket *sock = sockfd_lookup(fd, &err);
+ bool ret = false;
+
+ if (!sock)
+ goto out;
+ if (sock_net(sock->sk) != net)
+ ret = true;
+ sockfd_put(sock);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(svc_alien_sock);
+
+/**
+ * svc_addsock - add a listener socket to an RPC service
+ * @serv: pointer to RPC service to which to add a new listener
+ * @fd: file descriptor of the new listener
+ * @name_return: pointer to buffer to fill in with name of listener
+ * @len: size of the buffer
+ *
+ * Fills in socket name and returns positive length of name if successful.
+ * Name is terminated with '\n'. On error, returns a negative errno
+ * value.
+ */
+int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
+ const size_t len)
+{
+ int err = 0;
+ struct socket *so = sockfd_lookup(fd, &err);
+ struct svc_sock *svsk = NULL;
+ struct sockaddr_storage addr;
+ struct sockaddr *sin = (struct sockaddr *)&addr;
+ int salen;
+
+ if (!so)
+ return err;
+ err = -EAFNOSUPPORT;
+ if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
+ goto out;
+ err = -EPROTONOSUPPORT;
+ if (so->sk->sk_protocol != IPPROTO_TCP &&
+ so->sk->sk_protocol != IPPROTO_UDP)
+ goto out;
+ err = -EISCONN;
+ if (so->state > SS_UNCONNECTED)
+ goto out;
+ err = -ENOENT;
+ if (!try_module_get(THIS_MODULE))
+ goto out;
+ svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS);
+ if (IS_ERR(svsk)) {
+ module_put(THIS_MODULE);
+ err = PTR_ERR(svsk);
+ goto out;
+ }
+ if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+ svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+ svc_add_new_perm_xprt(serv, &svsk->sk_xprt);
+ return svc_one_sock_name(svsk, name_return, len);
+out:
+ sockfd_put(so);
+ return err;
+}
+EXPORT_SYMBOL_GPL(svc_addsock);
+
+/*
+ * Create socket for RPC service.
+ */
+static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+ int protocol,
+ struct net *net,
+ struct sockaddr *sin, int len,
+ int flags)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ int error;
+ int type;
+ struct sockaddr_storage addr;
+ struct sockaddr *newsin = (struct sockaddr *)&addr;
+ int newlen;
+ int family;
+ int val;
+ RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+ dprintk("svc: svc_create_socket(%s, %d, %s)\n",
+ serv->sv_program->pg_name, protocol,
+ __svc_print_addr(sin, buf, sizeof(buf)));
+
+ if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
+ printk(KERN_WARNING "svc: only UDP and TCP "
+ "sockets supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+ switch (sin->sa_family) {
+ case AF_INET6:
+ family = PF_INET6;
+ break;
+ case AF_INET:
+ family = PF_INET;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ error = __sock_create(net, family, type, protocol, &sock, 1);
+ if (error < 0)
+ return ERR_PTR(error);
+
+ svc_reclassify_socket(sock);
+
+ /*
+ * If this is an PF_INET6 listener, we want to avoid
+ * getting requests from IPv4 remotes. Those should
+ * be shunted to a PF_INET listener via rpcbind.
+ */
+ val = 1;
+ if (family == PF_INET6)
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+ (char *)&val, sizeof(val));
+
+ if (type == SOCK_STREAM)
+ sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */
+ error = kernel_bind(sock, sin, len);
+ if (error < 0)
+ goto bummer;
+
+ newlen = len;
+ error = kernel_getsockname(sock, newsin, &newlen);
+ if (error < 0)
+ goto bummer;
+
+ if (protocol == IPPROTO_TCP) {
+ if ((error = kernel_listen(sock, 64)) < 0)
+ goto bummer;
+ }
+
+ svsk = svc_setup_socket(serv, sock, flags);
+ if (IS_ERR(svsk)) {
+ error = PTR_ERR(svsk);
+ goto bummer;
+ }
+ svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
+ return (struct svc_xprt *)svsk;
+bummer:
+ dprintk("svc: svc_create_socket error = %d\n", -error);
+ sock_release(sock);
+ return ERR_PTR(error);
+}
+
+/*
+ * Detach the svc_sock from the socket so that no
+ * more callbacks occur.
+ */
+static void svc_sock_detach(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct sock *sk = svsk->sk_sk;
+ wait_queue_head_t *wq;
+
+ dprintk("svc: svc_sock_detach(%p)\n", svsk);
+
+ /* put back the old socket callbacks */
+ sk->sk_state_change = svsk->sk_ostate;
+ sk->sk_data_ready = svsk->sk_odata;
+ sk->sk_write_space = svsk->sk_owspace;
+
+ wq = sk_sleep(sk);
+ if (wq && waitqueue_active(wq))
+ wake_up_interruptible(wq);
+}
+
+/*
+ * Disconnect the socket, and reset the callbacks
+ */
+static void svc_tcp_sock_detach(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+
+ dprintk("svc: svc_tcp_sock_detach(%p)\n", svsk);
+
+ svc_sock_detach(xprt);
+
+ if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+ svc_tcp_clear_pages(svsk);
+ kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR);
+ }
+}
+
+/*
+ * Free the svc_sock's socket resources and the svc_sock itself.
+ */
+static void svc_sock_free(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ dprintk("svc: svc_sock_free(%p)\n", svsk);
+
+ if (svsk->sk_sock->file)
+ sockfd_put(svsk->sk_sock);
+ else
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/*
+ * Create a back channel svc_xprt which shares the fore channel socket.
+ */
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
+ int protocol,
+ struct net *net,
+ struct sockaddr *sin, int len,
+ int flags)
+{
+ struct svc_sock *svsk;
+ struct svc_xprt *xprt;
+
+ if (protocol != IPPROTO_TCP) {
+ printk(KERN_WARNING "svc: only TCP sockets"
+ " supported on shared back channel\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+ if (!svsk)
+ return ERR_PTR(-ENOMEM);
+
+ xprt = &svsk->sk_xprt;
+ svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv);
+
+ serv->sv_bc_xprt = xprt;
+
+ return xprt;
+}
+
+/*
+ * Free a back channel svc_sock.
+ */
+static void svc_bc_sock_free(struct svc_xprt *xprt)
+{
+ if (xprt)
+ kfree(container_of(xprt, struct svc_sock, sk_xprt));
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
new file mode 100644
index 000000000..887f0183b
--- /dev/null
+++ b/net/sunrpc/sysctl.c
@@ -0,0 +1,185 @@
+/*
+ * linux/net/sunrpc/sysctl.c
+ *
+ * Sysctl interface to sunrpc module.
+ *
+ * I would prefer to register the sunrpc table below sys/net, but that's
+ * impossible at the moment.
+ */
+
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "netns.h"
+
+/*
+ * Declare the debug flags here
+ */
+unsigned int rpc_debug;
+EXPORT_SYMBOL_GPL(rpc_debug);
+
+unsigned int nfs_debug;
+EXPORT_SYMBOL_GPL(nfs_debug);
+
+unsigned int nfsd_debug;
+EXPORT_SYMBOL_GPL(nfsd_debug);
+
+unsigned int nlm_debug;
+EXPORT_SYMBOL_GPL(nlm_debug);
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+
+static struct ctl_table_header *sunrpc_table_header;
+static struct ctl_table sunrpc_table[];
+
+void
+rpc_register_sysctl(void)
+{
+ if (!sunrpc_table_header)
+ sunrpc_table_header = register_sysctl_table(sunrpc_table);
+}
+
+void
+rpc_unregister_sysctl(void)
+{
+ if (sunrpc_table_header) {
+ unregister_sysctl_table(sunrpc_table_header);
+ sunrpc_table_header = NULL;
+ }
+}
+
+static int proc_do_xprt(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char tmpbuf[256];
+ size_t len;
+
+ if ((*ppos && !write) || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
+ return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len);
+}
+
+static int
+proc_dodebug(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char tmpbuf[20], c, *s;
+ char __user *p;
+ unsigned int value;
+ size_t left, len;
+
+ if ((*ppos && !write) || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ left = *lenp;
+
+ if (write) {
+ if (!access_ok(VERIFY_READ, buffer, left))
+ return -EFAULT;
+ p = buffer;
+ while (left && __get_user(c, p) >= 0 && isspace(c))
+ left--, p++;
+ if (!left)
+ goto done;
+
+ if (left > sizeof(tmpbuf) - 1)
+ return -EINVAL;
+ if (copy_from_user(tmpbuf, p, left))
+ return -EFAULT;
+ tmpbuf[left] = '\0';
+
+ for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--)
+ value = 10 * value + (*s - '0');
+ if (*s && !isspace(*s))
+ return -EINVAL;
+ while (left && isspace(*s))
+ left--, s++;
+ *(unsigned int *) table->data = value;
+ /* Display the RPC tasks on writing to rpc_debug */
+ if (strcmp(table->procname, "rpc_debug") == 0)
+ rpc_show_tasks(&init_net);
+ } else {
+ if (!access_ok(VERIFY_WRITE, buffer, left))
+ return -EFAULT;
+ len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data);
+ if (len > left)
+ len = left;
+ if (__copy_to_user(buffer, tmpbuf, len))
+ return -EFAULT;
+ if ((left -= len) > 0) {
+ if (put_user('\n', (char __user *)buffer + len))
+ return -EFAULT;
+ left--;
+ }
+ }
+
+done:
+ *lenp -= left;
+ *ppos += *lenp;
+ return 0;
+}
+
+
+static struct ctl_table debug_table[] = {
+ {
+ .procname = "rpc_debug",
+ .data = &rpc_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dodebug
+ },
+ {
+ .procname = "nfs_debug",
+ .data = &nfs_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dodebug
+ },
+ {
+ .procname = "nfsd_debug",
+ .data = &nfsd_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dodebug
+ },
+ {
+ .procname = "nlm_debug",
+ .data = &nlm_debug,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dodebug
+ },
+ {
+ .procname = "transports",
+ .maxlen = 256,
+ .mode = 0444,
+ .proc_handler = proc_do_xprt,
+ },
+ { }
+};
+
+static struct ctl_table sunrpc_table[] = {
+ {
+ .procname = "sunrpc",
+ .mode = 0555,
+ .child = debug_table
+ },
+ { }
+};
+
+#endif
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
new file mode 100644
index 000000000..08881d0c9
--- /dev/null
+++ b/net/sunrpc/timer.c
@@ -0,0 +1,122 @@
+/*
+ * linux/net/sunrpc/timer.c
+ *
+ * Estimate RPC request round trip time.
+ *
+ * Based on packet round-trip and variance estimator algorithms described
+ * in appendix A of "Congestion Avoidance and Control" by Van Jacobson
+ * and Michael J. Karels (ACM Computer Communication Review; Proceedings
+ * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988).
+ *
+ * This RTT estimator is used only for RPC over datagram protocols.
+ *
+ * Copyright (C) 2002 Trond Myklebust <trond.myklebust@fys.uio.no>
+ */
+
+#include <asm/param.h>
+
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#define RPC_RTO_MAX (60*HZ)
+#define RPC_RTO_INIT (HZ/5)
+#define RPC_RTO_MIN (HZ/10)
+
+/**
+ * rpc_init_rtt - Initialize an RPC RTT estimator context
+ * @rt: context to initialize
+ * @timeo: initial timeout value, in jiffies
+ *
+ */
+void rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo)
+{
+ unsigned long init = 0;
+ unsigned int i;
+
+ rt->timeo = timeo;
+
+ if (timeo > RPC_RTO_INIT)
+ init = (timeo - RPC_RTO_INIT) << 3;
+ for (i = 0; i < 5; i++) {
+ rt->srtt[i] = init;
+ rt->sdrtt[i] = RPC_RTO_INIT;
+ rt->ntimeouts[i] = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_init_rtt);
+
+/**
+ * rpc_update_rtt - Update an RPC RTT estimator context
+ * @rt: context to update
+ * @timer: timer array index (request type)
+ * @m: recent actual RTT, in jiffies
+ *
+ * NB: When computing the smoothed RTT and standard deviation,
+ * be careful not to produce negative intermediate results.
+ */
+void rpc_update_rtt(struct rpc_rtt *rt, unsigned int timer, long m)
+{
+ long *srtt, *sdrtt;
+
+ if (timer-- == 0)
+ return;
+
+ /* jiffies wrapped; ignore this one */
+ if (m < 0)
+ return;
+
+ if (m == 0)
+ m = 1L;
+
+ srtt = (long *)&rt->srtt[timer];
+ m -= *srtt >> 3;
+ *srtt += m;
+
+ if (m < 0)
+ m = -m;
+
+ sdrtt = (long *)&rt->sdrtt[timer];
+ m -= *sdrtt >> 2;
+ *sdrtt += m;
+
+ /* Set lower bound on the variance */
+ if (*sdrtt < RPC_RTO_MIN)
+ *sdrtt = RPC_RTO_MIN;
+}
+EXPORT_SYMBOL_GPL(rpc_update_rtt);
+
+/**
+ * rpc_calc_rto - Provide an estimated timeout value
+ * @rt: context to use for calculation
+ * @timer: timer array index (request type)
+ *
+ * Estimate RTO for an NFS RPC sent via an unreliable datagram. Use
+ * the mean and mean deviation of RTT for the appropriate type of RPC
+ * for frequently issued RPCs, and a fixed default for the others.
+ *
+ * The justification for doing "other" this way is that these RPCs
+ * happen so infrequently that timer estimation would probably be
+ * stale. Also, since many of these RPCs are non-idempotent, a
+ * conservative timeout is desired.
+ *
+ * getattr, lookup,
+ * read, write, commit - A+4D
+ * other - timeo
+ */
+unsigned long rpc_calc_rto(struct rpc_rtt *rt, unsigned int timer)
+{
+ unsigned long res;
+
+ if (timer-- == 0)
+ return rt->timeo;
+
+ res = ((rt->srtt[timer] + 7) >> 3) + rt->sdrtt[timer];
+ if (res > RPC_RTO_MAX)
+ res = RPC_RTO_MAX;
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(rpc_calc_rto);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
new file mode 100644
index 000000000..4439ac4c1
--- /dev/null
+++ b/net/sunrpc/xdr.c
@@ -0,0 +1,1515 @@
+/*
+ * linux/net/sunrpc/xdr.c
+ *
+ * Generic XDR support.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+#include <linux/errno.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/msg_prot.h>
+
+/*
+ * XDR functions for basic NFS types
+ */
+__be32 *
+xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
+{
+ unsigned int quadlen = XDR_QUADLEN(obj->len);
+
+ p[quadlen] = 0; /* zero trailing bytes */
+ *p++ = cpu_to_be32(obj->len);
+ memcpy(p, obj->data, obj->len);
+ return p + XDR_QUADLEN(obj->len);
+}
+EXPORT_SYMBOL_GPL(xdr_encode_netobj);
+
+__be32 *
+xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
+{
+ unsigned int len;
+
+ if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ)
+ return NULL;
+ obj->len = len;
+ obj->data = (u8 *) p;
+ return p + XDR_QUADLEN(len);
+}
+EXPORT_SYMBOL_GPL(xdr_decode_netobj);
+
+/**
+ * xdr_encode_opaque_fixed - Encode fixed length opaque data
+ * @p: pointer to current position in XDR buffer.
+ * @ptr: pointer to data to encode (or NULL)
+ * @nbytes: size of data.
+ *
+ * Copy the array of data of length nbytes at ptr to the XDR buffer
+ * at position p, then align to the next 32-bit boundary by padding
+ * with zero bytes (see RFC1832).
+ * Note: if ptr is NULL, only the padding is performed.
+ *
+ * Returns the updated current XDR buffer position
+ *
+ */
+__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes)
+{
+ if (likely(nbytes != 0)) {
+ unsigned int quadlen = XDR_QUADLEN(nbytes);
+ unsigned int padding = (quadlen << 2) - nbytes;
+
+ if (ptr != NULL)
+ memcpy(p, ptr, nbytes);
+ if (padding != 0)
+ memset((char *)p + nbytes, 0, padding);
+ p += quadlen;
+ }
+ return p;
+}
+EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed);
+
+/**
+ * xdr_encode_opaque - Encode variable length opaque data
+ * @p: pointer to current position in XDR buffer.
+ * @ptr: pointer to data to encode (or NULL)
+ * @nbytes: size of data.
+ *
+ * Returns the updated current XDR buffer position
+ */
+__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes)
+{
+ *p++ = cpu_to_be32(nbytes);
+ return xdr_encode_opaque_fixed(p, ptr, nbytes);
+}
+EXPORT_SYMBOL_GPL(xdr_encode_opaque);
+
+__be32 *
+xdr_encode_string(__be32 *p, const char *string)
+{
+ return xdr_encode_array(p, string, strlen(string));
+}
+EXPORT_SYMBOL_GPL(xdr_encode_string);
+
+__be32 *
+xdr_decode_string_inplace(__be32 *p, char **sp,
+ unsigned int *lenp, unsigned int maxlen)
+{
+ u32 len;
+
+ len = be32_to_cpu(*p++);
+ if (len > maxlen)
+ return NULL;
+ *lenp = len;
+ *sp = (char *) p;
+ return p + XDR_QUADLEN(len);
+}
+EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
+
+/**
+ * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
+ * @buf: XDR buffer where string resides
+ * @len: length of string, in bytes
+ *
+ */
+void
+xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+{
+ char *kaddr;
+
+ kaddr = kmap_atomic(buf->pages[0]);
+ kaddr[buf->page_base + len] = '\0';
+ kunmap_atomic(kaddr);
+}
+EXPORT_SYMBOL_GPL(xdr_terminate_string);
+
+void
+xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
+ struct page **pages, unsigned int base, unsigned int len)
+{
+ struct kvec *head = xdr->head;
+ struct kvec *tail = xdr->tail;
+ char *buf = (char *)head->iov_base;
+ unsigned int buflen = head->iov_len;
+
+ head->iov_len = offset;
+
+ xdr->pages = pages;
+ xdr->page_base = base;
+ xdr->page_len = len;
+
+ tail->iov_base = buf + offset;
+ tail->iov_len = buflen - offset;
+
+ xdr->buflen += len;
+}
+EXPORT_SYMBOL_GPL(xdr_inline_pages);
+
+/*
+ * Helper routines for doing 'memmove' like operations on a struct xdr_buf
+ */
+
+/**
+ * _shift_data_right_pages
+ * @pages: vector of pages containing both the source and dest memory area.
+ * @pgto_base: page vector address of destination
+ * @pgfrom_base: page vector address of source
+ * @len: number of bytes to copy
+ *
+ * Note: the addresses pgto_base and pgfrom_base are both calculated in
+ * the same way:
+ * if a memory area starts at byte 'base' in page 'pages[i]',
+ * then its address is given as (i << PAGE_CACHE_SHIFT) + base
+ * Also note: pgfrom_base must be < pgto_base, but the memory areas
+ * they point to may overlap.
+ */
+static void
+_shift_data_right_pages(struct page **pages, size_t pgto_base,
+ size_t pgfrom_base, size_t len)
+{
+ struct page **pgfrom, **pgto;
+ char *vfrom, *vto;
+ size_t copy;
+
+ BUG_ON(pgto_base <= pgfrom_base);
+
+ pgto_base += len;
+ pgfrom_base += len;
+
+ pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT);
+ pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT);
+
+ pgto_base &= ~PAGE_CACHE_MASK;
+ pgfrom_base &= ~PAGE_CACHE_MASK;
+
+ do {
+ /* Are any pointers crossing a page boundary? */
+ if (pgto_base == 0) {
+ pgto_base = PAGE_CACHE_SIZE;
+ pgto--;
+ }
+ if (pgfrom_base == 0) {
+ pgfrom_base = PAGE_CACHE_SIZE;
+ pgfrom--;
+ }
+
+ copy = len;
+ if (copy > pgto_base)
+ copy = pgto_base;
+ if (copy > pgfrom_base)
+ copy = pgfrom_base;
+ pgto_base -= copy;
+ pgfrom_base -= copy;
+
+ vto = kmap_atomic(*pgto);
+ if (*pgto != *pgfrom) {
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(vto + pgto_base, vfrom + pgfrom_base, copy);
+ kunmap_atomic(vfrom);
+ } else
+ memmove(vto + pgto_base, vto + pgfrom_base, copy);
+ flush_dcache_page(*pgto);
+ kunmap_atomic(vto);
+
+ } while ((len -= copy) != 0);
+}
+
+/**
+ * _copy_to_pages
+ * @pages: array of pages
+ * @pgbase: page vector address of destination
+ * @p: pointer to source data
+ * @len: length
+ *
+ * Copies data from an arbitrary memory location into an array of pages
+ * The copy is assumed to be non-overlapping.
+ */
+static void
+_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
+{
+ struct page **pgto;
+ char *vto;
+ size_t copy;
+
+ pgto = pages + (pgbase >> PAGE_CACHE_SHIFT);
+ pgbase &= ~PAGE_CACHE_MASK;
+
+ for (;;) {
+ copy = PAGE_CACHE_SIZE - pgbase;
+ if (copy > len)
+ copy = len;
+
+ vto = kmap_atomic(*pgto);
+ memcpy(vto + pgbase, p, copy);
+ kunmap_atomic(vto);
+
+ len -= copy;
+ if (len == 0)
+ break;
+
+ pgbase += copy;
+ if (pgbase == PAGE_CACHE_SIZE) {
+ flush_dcache_page(*pgto);
+ pgbase = 0;
+ pgto++;
+ }
+ p += copy;
+ }
+ flush_dcache_page(*pgto);
+}
+
+/**
+ * _copy_from_pages
+ * @p: pointer to destination
+ * @pages: array of pages
+ * @pgbase: offset of source data
+ * @len: length
+ *
+ * Copies data into an arbitrary memory location from an array of pages
+ * The copy is assumed to be non-overlapping.
+ */
+void
+_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
+{
+ struct page **pgfrom;
+ char *vfrom;
+ size_t copy;
+
+ pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT);
+ pgbase &= ~PAGE_CACHE_MASK;
+
+ do {
+ copy = PAGE_CACHE_SIZE - pgbase;
+ if (copy > len)
+ copy = len;
+
+ vfrom = kmap_atomic(*pgfrom);
+ memcpy(p, vfrom + pgbase, copy);
+ kunmap_atomic(vfrom);
+
+ pgbase += copy;
+ if (pgbase == PAGE_CACHE_SIZE) {
+ pgbase = 0;
+ pgfrom++;
+ }
+ p += copy;
+
+ } while ((len -= copy) != 0);
+}
+EXPORT_SYMBOL_GPL(_copy_from_pages);
+
+/**
+ * xdr_shrink_bufhead
+ * @buf: xdr_buf
+ * @len: bytes to remove from buf->head[0]
+ *
+ * Shrinks XDR buffer's header kvec buf->head[0] by
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the inlined pages and/or the tail.
+ */
+static void
+xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+{
+ struct kvec *head, *tail;
+ size_t copy, offs;
+ unsigned int pglen = buf->page_len;
+
+ tail = buf->tail;
+ head = buf->head;
+
+ WARN_ON_ONCE(len > head->iov_len);
+ if (len > head->iov_len)
+ len = head->iov_len;
+
+ /* Shift the tail first */
+ if (tail->iov_len != 0) {
+ if (tail->iov_len > len) {
+ copy = tail->iov_len - len;
+ memmove((char *)tail->iov_base + len,
+ tail->iov_base, copy);
+ }
+ /* Copy from the inlined pages into the tail */
+ copy = len;
+ if (copy > pglen)
+ copy = pglen;
+ offs = len - copy;
+ if (offs >= tail->iov_len)
+ copy = 0;
+ else if (copy > tail->iov_len - offs)
+ copy = tail->iov_len - offs;
+ if (copy != 0)
+ _copy_from_pages((char *)tail->iov_base + offs,
+ buf->pages,
+ buf->page_base + pglen + offs - len,
+ copy);
+ /* Do we also need to copy data from the head into the tail ? */
+ if (len > pglen) {
+ offs = copy = len - pglen;
+ if (copy > tail->iov_len)
+ copy = tail->iov_len;
+ memcpy(tail->iov_base,
+ (char *)head->iov_base +
+ head->iov_len - offs,
+ copy);
+ }
+ }
+ /* Now handle pages */
+ if (pglen != 0) {
+ if (pglen > len)
+ _shift_data_right_pages(buf->pages,
+ buf->page_base + len,
+ buf->page_base,
+ pglen - len);
+ copy = len;
+ if (len > pglen)
+ copy = pglen;
+ _copy_to_pages(buf->pages, buf->page_base,
+ (char *)head->iov_base + head->iov_len - len,
+ copy);
+ }
+ head->iov_len -= len;
+ buf->buflen -= len;
+ /* Have we truncated the message? */
+ if (buf->len > buf->buflen)
+ buf->len = buf->buflen;
+}
+
+/**
+ * xdr_shrink_pagelen
+ * @buf: xdr_buf
+ * @len: bytes to remove from buf->pages
+ *
+ * Shrinks XDR buffer's page array buf->pages by
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the tail.
+ */
+static void
+xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+{
+ struct kvec *tail;
+ size_t copy;
+ unsigned int pglen = buf->page_len;
+ unsigned int tailbuf_len;
+
+ tail = buf->tail;
+ BUG_ON (len > pglen);
+
+ tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
+
+ /* Shift the tail first */
+ if (tailbuf_len != 0) {
+ unsigned int free_space = tailbuf_len - tail->iov_len;
+
+ if (len < free_space)
+ free_space = len;
+ tail->iov_len += free_space;
+
+ copy = len;
+ if (tail->iov_len > len) {
+ char *p = (char *)tail->iov_base + len;
+ memmove(p, tail->iov_base, tail->iov_len - len);
+ } else
+ copy = tail->iov_len;
+ /* Copy from the inlined pages into the tail */
+ _copy_from_pages((char *)tail->iov_base,
+ buf->pages, buf->page_base + pglen - len,
+ copy);
+ }
+ buf->page_len -= len;
+ buf->buflen -= len;
+ /* Have we truncated the message? */
+ if (buf->len > buf->buflen)
+ buf->len = buf->buflen;
+}
+
+void
+xdr_shift_buf(struct xdr_buf *buf, size_t len)
+{
+ xdr_shrink_bufhead(buf, len);
+}
+EXPORT_SYMBOL_GPL(xdr_shift_buf);
+
+/**
+ * xdr_stream_pos - Return the current offset from the start of the xdr_stream
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
+{
+ return (unsigned int)(XDR_QUADLEN(xdr->buf->len) - xdr->nwords) << 2;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_pos);
+
+/**
+ * xdr_init_encode - Initialize a struct xdr_stream for sending data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer in which to encode data
+ * @p: current pointer inside XDR buffer
+ *
+ * Note: at the moment the RPC client only passes the length of our
+ * scratch buffer in the xdr_buf's header kvec. Previously this
+ * meant we needed to call xdr_adjust_iovec() after encoding the
+ * data. With the new scheme, the xdr_stream manages the details
+ * of the buffer length, and takes care of adjusting the kvec
+ * length for us.
+ */
+void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+{
+ struct kvec *iov = buf->head;
+ int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+
+ xdr_set_scratch_buffer(xdr, NULL, 0);
+ BUG_ON(scratch_len < 0);
+ xdr->buf = buf;
+ xdr->iov = iov;
+ xdr->p = (__be32 *)((char *)iov->iov_base + iov->iov_len);
+ xdr->end = (__be32 *)((char *)iov->iov_base + scratch_len);
+ BUG_ON(iov->iov_len > scratch_len);
+
+ if (p != xdr->p && p != NULL) {
+ size_t len;
+
+ BUG_ON(p < xdr->p || p > xdr->end);
+ len = (char *)p - (char *)xdr->p;
+ xdr->p = p;
+ buf->len += len;
+ iov->iov_len += len;
+ }
+}
+EXPORT_SYMBOL_GPL(xdr_init_encode);
+
+/**
+ * xdr_commit_encode - Ensure all data is written to buffer
+ * @xdr: pointer to xdr_stream
+ *
+ * We handle encoding across page boundaries by giving the caller a
+ * temporary location to write to, then later copying the data into
+ * place; xdr_commit_encode does that copying.
+ *
+ * Normally the caller doesn't need to call this directly, as the
+ * following xdr_reserve_space will do it. But an explicit call may be
+ * required at the end of encoding, or any other time when the xdr_buf
+ * data might be read.
+ */
+void xdr_commit_encode(struct xdr_stream *xdr)
+{
+ int shift = xdr->scratch.iov_len;
+ void *page;
+
+ if (shift == 0)
+ return;
+ page = page_address(*xdr->page_ptr);
+ memcpy(xdr->scratch.iov_base, page, shift);
+ memmove(page, page + shift, (void *)xdr->p - page);
+ xdr->scratch.iov_len = 0;
+}
+EXPORT_SYMBOL_GPL(xdr_commit_encode);
+
+static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
+ size_t nbytes)
+{
+ static __be32 *p;
+ int space_left;
+ int frag1bytes, frag2bytes;
+
+ if (nbytes > PAGE_SIZE)
+ return NULL; /* Bigger buffers require special handling */
+ if (xdr->buf->len + nbytes > xdr->buf->buflen)
+ return NULL; /* Sorry, we're totally out of space */
+ frag1bytes = (xdr->end - xdr->p) << 2;
+ frag2bytes = nbytes - frag1bytes;
+ if (xdr->iov)
+ xdr->iov->iov_len += frag1bytes;
+ else
+ xdr->buf->page_len += frag1bytes;
+ xdr->page_ptr++;
+ xdr->iov = NULL;
+ /*
+ * If the last encode didn't end exactly on a page boundary, the
+ * next one will straddle boundaries. Encode into the next
+ * page, then copy it back later in xdr_commit_encode. We use
+ * the "scratch" iov to track any temporarily unused fragment of
+ * space at the end of the previous buffer:
+ */
+ xdr->scratch.iov_base = xdr->p;
+ xdr->scratch.iov_len = frag1bytes;
+ p = page_address(*xdr->page_ptr);
+ /*
+ * Note this is where the next encode will start after we've
+ * shifted this one back:
+ */
+ xdr->p = (void *)p + frag2bytes;
+ space_left = xdr->buf->buflen - xdr->buf->len;
+ xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE);
+ xdr->buf->page_len += frag2bytes;
+ xdr->buf->len += nbytes;
+ return p;
+}
+
+/**
+ * xdr_reserve_space - Reserve buffer space for sending
+ * @xdr: pointer to xdr_stream
+ * @nbytes: number of bytes to reserve
+ *
+ * Checks that we have enough buffer space to encode 'nbytes' more
+ * bytes of data. If so, update the total xdr_buf length, and
+ * adjust the length of the current kvec.
+ */
+__be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p = xdr->p;
+ __be32 *q;
+
+ xdr_commit_encode(xdr);
+ /* align nbytes on the next 32-bit boundary */
+ nbytes += 3;
+ nbytes &= ~3;
+ q = p + (nbytes >> 2);
+ if (unlikely(q > xdr->end || q < p))
+ return xdr_get_next_encode_buffer(xdr, nbytes);
+ xdr->p = q;
+ if (xdr->iov)
+ xdr->iov->iov_len += nbytes;
+ else
+ xdr->buf->page_len += nbytes;
+ xdr->buf->len += nbytes;
+ return p;
+}
+EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+/**
+ * xdr_truncate_encode - truncate an encode buffer
+ * @xdr: pointer to xdr_stream
+ * @len: new length of buffer
+ *
+ * Truncates the xdr stream, so that xdr->buf->len == len,
+ * and xdr->p points at offset len from the start of the buffer, and
+ * head, tail, and page lengths are adjusted to correspond.
+ *
+ * If this means moving xdr->p to a different buffer, we assume that
+ * that the end pointer should be set to the end of the current page,
+ * except in the case of the head buffer when we assume the head
+ * buffer's current length represents the end of the available buffer.
+ *
+ * This is *not* safe to use on a buffer that already has inlined page
+ * cache pages (as in a zero-copy server read reply), except for the
+ * simple case of truncating from one position in the tail to another.
+ *
+ */
+void xdr_truncate_encode(struct xdr_stream *xdr, size_t len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ int fraglen;
+ int new;
+
+ if (len > buf->len) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ xdr_commit_encode(xdr);
+
+ fraglen = min_t(int, buf->len - len, tail->iov_len);
+ tail->iov_len -= fraglen;
+ buf->len -= fraglen;
+ if (tail->iov_len) {
+ xdr->p = tail->iov_base + tail->iov_len;
+ WARN_ON_ONCE(!xdr->end);
+ WARN_ON_ONCE(!xdr->iov);
+ return;
+ }
+ WARN_ON_ONCE(fraglen);
+ fraglen = min_t(int, buf->len - len, buf->page_len);
+ buf->page_len -= fraglen;
+ buf->len -= fraglen;
+
+ new = buf->page_base + buf->page_len;
+
+ xdr->page_ptr = buf->pages + (new >> PAGE_SHIFT);
+
+ if (buf->page_len) {
+ xdr->p = page_address(*xdr->page_ptr);
+ xdr->end = (void *)xdr->p + PAGE_SIZE;
+ xdr->p = (void *)xdr->p + (new % PAGE_SIZE);
+ WARN_ON_ONCE(xdr->iov);
+ return;
+ }
+ if (fraglen) {
+ xdr->end = head->iov_base + head->iov_len;
+ xdr->page_ptr--;
+ }
+ /* (otherwise assume xdr->end is already set) */
+ head->iov_len = len;
+ buf->len = len;
+ xdr->p = head->iov_base + head->iov_len;
+ xdr->iov = buf->head;
+}
+EXPORT_SYMBOL(xdr_truncate_encode);
+
+/**
+ * xdr_restrict_buflen - decrease available buffer space
+ * @xdr: pointer to xdr_stream
+ * @newbuflen: new maximum number of bytes available
+ *
+ * Adjust our idea of how much space is available in the buffer.
+ * If we've already used too much space in the buffer, returns -1.
+ * If the available space is already smaller than newbuflen, returns 0
+ * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen
+ * and ensures xdr->end is set at most offset newbuflen from the start
+ * of the buffer.
+ */
+int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen)
+{
+ struct xdr_buf *buf = xdr->buf;
+ int left_in_this_buf = (void *)xdr->end - (void *)xdr->p;
+ int end_offset = buf->len + left_in_this_buf;
+
+ if (newbuflen < 0 || newbuflen < buf->len)
+ return -1;
+ if (newbuflen > buf->buflen)
+ return 0;
+ if (newbuflen < end_offset)
+ xdr->end = (void *)xdr->end + newbuflen - end_offset;
+ buf->buflen = newbuflen;
+ return 0;
+}
+EXPORT_SYMBOL(xdr_restrict_buflen);
+
+/**
+ * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+ * @xdr: pointer to xdr_stream
+ * @pages: list of pages
+ * @base: offset of first byte
+ * @len: length of data in bytes
+ *
+ */
+void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base,
+ unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *iov = buf->tail;
+ buf->pages = pages;
+ buf->page_base = base;
+ buf->page_len = len;
+
+ iov->iov_base = (char *)xdr->p;
+ iov->iov_len = 0;
+ xdr->iov = iov;
+
+ if (len & 3) {
+ unsigned int pad = 4 - (len & 3);
+
+ BUG_ON(xdr->p >= xdr->end);
+ iov->iov_base = (char *)xdr->p + (len & 3);
+ iov->iov_len += pad;
+ len += pad;
+ *xdr->p++ = 0;
+ }
+ buf->buflen += len;
+ buf->len += len;
+}
+EXPORT_SYMBOL_GPL(xdr_write_pages);
+
+static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+ unsigned int len)
+{
+ if (len > iov->iov_len)
+ len = iov->iov_len;
+ xdr->p = (__be32*)iov->iov_base;
+ xdr->end = (__be32*)(iov->iov_base + len);
+ xdr->iov = iov;
+ xdr->page_ptr = NULL;
+}
+
+static int xdr_set_page_base(struct xdr_stream *xdr,
+ unsigned int base, unsigned int len)
+{
+ unsigned int pgnr;
+ unsigned int maxlen;
+ unsigned int pgoff;
+ unsigned int pgend;
+ void *kaddr;
+
+ maxlen = xdr->buf->page_len;
+ if (base >= maxlen)
+ return -EINVAL;
+ maxlen -= base;
+ if (len > maxlen)
+ len = maxlen;
+
+ base += xdr->buf->page_base;
+
+ pgnr = base >> PAGE_SHIFT;
+ xdr->page_ptr = &xdr->buf->pages[pgnr];
+ kaddr = page_address(*xdr->page_ptr);
+
+ pgoff = base & ~PAGE_MASK;
+ xdr->p = (__be32*)(kaddr + pgoff);
+
+ pgend = pgoff + len;
+ if (pgend > PAGE_SIZE)
+ pgend = PAGE_SIZE;
+ xdr->end = (__be32*)(kaddr + pgend);
+ xdr->iov = NULL;
+ return 0;
+}
+
+static void xdr_set_next_page(struct xdr_stream *xdr)
+{
+ unsigned int newbase;
+
+ newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
+ newbase -= xdr->buf->page_base;
+
+ if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+}
+
+static bool xdr_set_next_buffer(struct xdr_stream *xdr)
+{
+ if (xdr->page_ptr != NULL)
+ xdr_set_next_page(xdr);
+ else if (xdr->iov == xdr->buf->head) {
+ if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
+ xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+ }
+ return xdr->p != xdr->end;
+}
+
+/**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer from which to decode data
+ * @p: current pointer inside XDR buffer
+ */
+void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+{
+ xdr->buf = buf;
+ xdr->scratch.iov_base = NULL;
+ xdr->scratch.iov_len = 0;
+ xdr->nwords = XDR_QUADLEN(buf->len);
+ if (buf->head[0].iov_len != 0)
+ xdr_set_iov(xdr, buf->head, buf->len);
+ else if (buf->page_len != 0)
+ xdr_set_page_base(xdr, 0, buf->len);
+ if (p != NULL && p > xdr->p && xdr->end >= p) {
+ xdr->nwords -= p - xdr->p;
+ xdr->p = p;
+ }
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode);
+
+/**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer from which to decode data
+ * @pages: list of pages to decode into
+ * @len: length in bytes of buffer in pages
+ */
+void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+ struct page **pages, unsigned int len)
+{
+ memset(buf, 0, sizeof(*buf));
+ buf->pages = pages;
+ buf->page_len = len;
+ buf->buflen = len;
+ buf->len = len;
+ xdr_init_decode(xdr, buf, NULL);
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
+
+static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+{
+ unsigned int nwords = XDR_QUADLEN(nbytes);
+ __be32 *p = xdr->p;
+ __be32 *q = p + nwords;
+
+ if (unlikely(nwords > xdr->nwords || q > xdr->end || q < p))
+ return NULL;
+ xdr->p = q;
+ xdr->nwords -= nwords;
+ return p;
+}
+
+/**
+ * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to an empty buffer
+ * @buflen: size of 'buf'
+ *
+ * The scratch buffer is used when decoding from an array of pages.
+ * If an xdr_inline_decode() call spans across page boundaries, then
+ * we copy the data into the scratch buffer in order to allow linear
+ * access.
+ */
+void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
+{
+ xdr->scratch.iov_base = buf;
+ xdr->scratch.iov_len = buflen;
+}
+EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
+
+static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p;
+ void *cpdest = xdr->scratch.iov_base;
+ size_t cplen = (char *)xdr->end - (char *)xdr->p;
+
+ if (nbytes > xdr->scratch.iov_len)
+ return NULL;
+ memcpy(cpdest, xdr->p, cplen);
+ cpdest += cplen;
+ nbytes -= cplen;
+ if (!xdr_set_next_buffer(xdr))
+ return NULL;
+ p = __xdr_inline_decode(xdr, nbytes);
+ if (p == NULL)
+ return NULL;
+ memcpy(cpdest, p, nbytes);
+ return xdr->scratch.iov_base;
+}
+
+/**
+ * xdr_inline_decode - Retrieve XDR data to decode
+ * @xdr: pointer to xdr_stream struct
+ * @nbytes: number of bytes of data to decode
+ *
+ * Check if the input buffer is long enough to enable us to decode
+ * 'nbytes' more bytes of data starting at the current position.
+ * If so return the current pointer, then update the current
+ * pointer position.
+ */
+__be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p;
+
+ if (nbytes == 0)
+ return xdr->p;
+ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return NULL;
+ p = __xdr_inline_decode(xdr, nbytes);
+ if (p != NULL)
+ return p;
+ return xdr_copy_to_scratch(xdr, nbytes);
+}
+EXPORT_SYMBOL_GPL(xdr_inline_decode);
+
+static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *iov;
+ unsigned int nwords = XDR_QUADLEN(len);
+ unsigned int cur = xdr_stream_pos(xdr);
+
+ if (xdr->nwords == 0)
+ return 0;
+ /* Realign pages to current pointer position */
+ iov = buf->head;
+ if (iov->iov_len > cur) {
+ xdr_shrink_bufhead(buf, iov->iov_len - cur);
+ xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ }
+
+ if (nwords > xdr->nwords) {
+ nwords = xdr->nwords;
+ len = nwords << 2;
+ }
+ if (buf->page_len <= len)
+ len = buf->page_len;
+ else if (nwords < xdr->nwords) {
+ /* Truncate page data and move it into the tail */
+ xdr_shrink_pagelen(buf, buf->page_len - len);
+ xdr->nwords = XDR_QUADLEN(buf->len - cur);
+ }
+ return len;
+}
+
+/**
+ * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[].
+ *
+ * Returns the number of XDR encoded bytes now contained in the pages
+ */
+unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *iov;
+ unsigned int nwords;
+ unsigned int end;
+ unsigned int padding;
+
+ len = xdr_align_pages(xdr, len);
+ if (len == 0)
+ return 0;
+ nwords = XDR_QUADLEN(len);
+ padding = (nwords << 2) - len;
+ xdr->iov = iov = buf->tail;
+ /* Compute remaining message length. */
+ end = ((xdr->nwords - nwords) << 2) + padding;
+ if (end > iov->iov_len)
+ end = iov->iov_len;
+
+ /*
+ * Position current pointer at beginning of tail, and
+ * set remaining message length.
+ */
+ xdr->p = (__be32 *)((char *)iov->iov_base + padding);
+ xdr->end = (__be32 *)((char *)iov->iov_base + end);
+ xdr->page_ptr = NULL;
+ xdr->nwords = XDR_QUADLEN(end - padding);
+ return len;
+}
+EXPORT_SYMBOL_GPL(xdr_read_pages);
+
+/**
+ * xdr_enter_page - decode data from the XDR page
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[]. The current pointer is then
+ * repositioned at the beginning of the first XDR page.
+ */
+void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+{
+ len = xdr_align_pages(xdr, len);
+ /*
+ * Position current pointer at beginning of tail, and
+ * set remaining message length.
+ */
+ if (len != 0)
+ xdr_set_page_base(xdr, 0, len);
+}
+EXPORT_SYMBOL_GPL(xdr_enter_page);
+
+static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
+
+void
+xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+{
+ buf->head[0] = *iov;
+ buf->tail[0] = empty_iov;
+ buf->page_len = 0;
+ buf->buflen = buf->len = iov->iov_len;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
+
+/**
+ * xdr_buf_subsegment - set subbuf to a portion of buf
+ * @buf: an xdr buffer
+ * @subbuf: the result buffer
+ * @base: beginning of range in bytes
+ * @len: length of range in bytes
+ *
+ * sets @subbuf to an xdr buffer representing the portion of @buf of
+ * length @len starting at offset @base.
+ *
+ * @buf and @subbuf may be pointers to the same struct xdr_buf.
+ *
+ * Returns -1 if base of length are out of bounds.
+ */
+int
+xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
+ unsigned int base, unsigned int len)
+{
+ subbuf->buflen = subbuf->len = len;
+ if (base < buf->head[0].iov_len) {
+ subbuf->head[0].iov_base = buf->head[0].iov_base + base;
+ subbuf->head[0].iov_len = min_t(unsigned int, len,
+ buf->head[0].iov_len - base);
+ len -= subbuf->head[0].iov_len;
+ base = 0;
+ } else {
+ base -= buf->head[0].iov_len;
+ subbuf->head[0].iov_len = 0;
+ }
+
+ if (base < buf->page_len) {
+ subbuf->page_len = min(buf->page_len - base, len);
+ base += buf->page_base;
+ subbuf->page_base = base & ~PAGE_CACHE_MASK;
+ subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT];
+ len -= subbuf->page_len;
+ base = 0;
+ } else {
+ base -= buf->page_len;
+ subbuf->page_len = 0;
+ }
+
+ if (base < buf->tail[0].iov_len) {
+ subbuf->tail[0].iov_base = buf->tail[0].iov_base + base;
+ subbuf->tail[0].iov_len = min_t(unsigned int, len,
+ buf->tail[0].iov_len - base);
+ len -= subbuf->tail[0].iov_len;
+ base = 0;
+ } else {
+ base -= buf->tail[0].iov_len;
+ subbuf->tail[0].iov_len = 0;
+ }
+
+ if (base || len)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
+
+/**
+ * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
+ * @buf: buf to be trimmed
+ * @len: number of bytes to reduce "buf" by
+ *
+ * Trim an xdr_buf by the given number of bytes by fixing up the lengths. Note
+ * that it's possible that we'll trim less than that amount if the xdr_buf is
+ * too small, or if (for instance) it's all in the head and the parser has
+ * already read too far into it.
+ */
+void xdr_buf_trim(struct xdr_buf *buf, unsigned int len)
+{
+ size_t cur;
+ unsigned int trim = len;
+
+ if (buf->tail[0].iov_len) {
+ cur = min_t(size_t, buf->tail[0].iov_len, trim);
+ buf->tail[0].iov_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->page_len) {
+ cur = min_t(unsigned int, buf->page_len, trim);
+ buf->page_len -= cur;
+ trim -= cur;
+ if (!trim)
+ goto fix_len;
+ }
+
+ if (buf->head[0].iov_len) {
+ cur = min_t(size_t, buf->head[0].iov_len, trim);
+ buf->head[0].iov_len -= cur;
+ trim -= cur;
+ }
+fix_len:
+ buf->len -= (len - trim);
+}
+EXPORT_SYMBOL_GPL(xdr_buf_trim);
+
+static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+{
+ unsigned int this_len;
+
+ this_len = min_t(unsigned int, len, subbuf->head[0].iov_len);
+ memcpy(obj, subbuf->head[0].iov_base, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min_t(unsigned int, len, subbuf->page_len);
+ if (this_len)
+ _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
+ memcpy(obj, subbuf->tail[0].iov_base, this_len);
+}
+
+/* obj is assumed to point to allocated memory of size at least len: */
+int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+{
+ struct xdr_buf subbuf;
+ int status;
+
+ status = xdr_buf_subsegment(buf, &subbuf, base, len);
+ if (status != 0)
+ return status;
+ __read_bytes_from_xdr_buf(&subbuf, obj, len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
+
+static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+{
+ unsigned int this_len;
+
+ this_len = min_t(unsigned int, len, subbuf->head[0].iov_len);
+ memcpy(subbuf->head[0].iov_base, obj, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min_t(unsigned int, len, subbuf->page_len);
+ if (this_len)
+ _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
+ memcpy(subbuf->tail[0].iov_base, obj, this_len);
+}
+
+/* obj is assumed to point to allocated memory of size at least len: */
+int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+{
+ struct xdr_buf subbuf;
+ int status;
+
+ status = xdr_buf_subsegment(buf, &subbuf, base, len);
+ if (status != 0)
+ return status;
+ __write_bytes_to_xdr_buf(&subbuf, obj, len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
+
+int
+xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+{
+ __be32 raw;
+ int status;
+
+ status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj));
+ if (status)
+ return status;
+ *obj = be32_to_cpu(raw);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdr_decode_word);
+
+int
+xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+{
+ __be32 raw = cpu_to_be32(obj);
+
+ return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+EXPORT_SYMBOL_GPL(xdr_encode_word);
+
+/* If the netobj starting offset bytes from the start of xdr_buf is contained
+ * entirely in the head or the tail, set object to point to it; otherwise
+ * try to find space for it at the end of the tail, copy it there, and
+ * set obj to point to it. */
+int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
+{
+ struct xdr_buf subbuf;
+
+ if (xdr_decode_word(buf, offset, &obj->len))
+ return -EFAULT;
+ if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
+ return -EFAULT;
+
+ /* Is the obj contained entirely in the head? */
+ obj->data = subbuf.head[0].iov_base;
+ if (subbuf.head[0].iov_len == obj->len)
+ return 0;
+ /* ..or is the obj contained entirely in the tail? */
+ obj->data = subbuf.tail[0].iov_base;
+ if (subbuf.tail[0].iov_len == obj->len)
+ return 0;
+
+ /* use end of tail as storage for obj:
+ * (We don't copy to the beginning because then we'd have
+ * to worry about doing a potentially overlapping copy.
+ * This assumes the object is at most half the length of the
+ * tail.) */
+ if (obj->len > buf->buflen - buf->len)
+ return -ENOMEM;
+ if (buf->tail[0].iov_len != 0)
+ obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+ else
+ obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
+ __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc, int encode)
+{
+ char *elem = NULL, *c;
+ unsigned int copied = 0, todo, avail_here;
+ struct page **ppages = NULL;
+ int err;
+
+ if (encode) {
+ if (xdr_encode_word(buf, base, desc->array_len) != 0)
+ return -EINVAL;
+ } else {
+ if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+ desc->array_len > desc->array_maxlen ||
+ (unsigned long) base + 4 + desc->array_len *
+ desc->elem_size > buf->len)
+ return -EINVAL;
+ }
+ base += 4;
+
+ if (!desc->xcode)
+ return 0;
+
+ todo = desc->array_len * desc->elem_size;
+
+ /* process head */
+ if (todo && base < buf->head->iov_len) {
+ c = buf->head->iov_base + base;
+ avail_here = min_t(unsigned int, todo,
+ buf->head->iov_len - base);
+ todo -= avail_here;
+
+ while (avail_here >= desc->elem_size) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ avail_here -= desc->elem_size;
+ }
+ if (avail_here) {
+ if (!elem) {
+ elem = kmalloc(desc->elem_size, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ memcpy(c, elem, avail_here);
+ } else
+ memcpy(elem, c, avail_here);
+ copied = avail_here;
+ }
+ base = buf->head->iov_len; /* align to start of pages */
+ }
+
+ /* process pages array */
+ base -= buf->head->iov_len;
+ if (todo && base < buf->page_len) {
+ unsigned int avail_page;
+
+ avail_here = min(todo, buf->page_len - base);
+ todo -= avail_here;
+
+ base += buf->page_base;
+ ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+ base &= ~PAGE_CACHE_MASK;
+ avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+ avail_here);
+ c = kmap(*ppages) + base;
+
+ while (avail_here) {
+ avail_here -= avail_page;
+ if (copied || avail_page < desc->elem_size) {
+ unsigned int l = min(avail_page,
+ desc->elem_size - copied);
+ if (!elem) {
+ elem = kmalloc(desc->elem_size,
+ GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ if (!copied) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ memcpy(c, elem + copied, l);
+ copied += l;
+ if (copied == desc->elem_size)
+ copied = 0;
+ } else {
+ memcpy(elem + copied, c, l);
+ copied += l;
+ if (copied == desc->elem_size) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ copied = 0;
+ }
+ }
+ avail_page -= l;
+ c += l;
+ }
+ while (avail_page >= desc->elem_size) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ avail_page -= desc->elem_size;
+ }
+ if (avail_page) {
+ unsigned int l = min(avail_page,
+ desc->elem_size - copied);
+ if (!elem) {
+ elem = kmalloc(desc->elem_size,
+ GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ if (!copied) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ memcpy(c, elem + copied, l);
+ copied += l;
+ if (copied == desc->elem_size)
+ copied = 0;
+ } else {
+ memcpy(elem + copied, c, l);
+ copied += l;
+ if (copied == desc->elem_size) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ copied = 0;
+ }
+ }
+ }
+ if (avail_here) {
+ kunmap(*ppages);
+ ppages++;
+ c = kmap(*ppages);
+ }
+
+ avail_page = min(avail_here,
+ (unsigned int) PAGE_CACHE_SIZE);
+ }
+ base = buf->page_len; /* align to start of tail */
+ }
+
+ /* process tail */
+ base -= buf->page_len;
+ if (todo) {
+ c = buf->tail->iov_base + base;
+ if (copied) {
+ unsigned int l = desc->elem_size - copied;
+
+ if (encode)
+ memcpy(c, elem + copied, l);
+ else {
+ memcpy(elem + copied, c, l);
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ todo -= l;
+ c += l;
+ }
+ while (todo) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ todo -= desc->elem_size;
+ }
+ }
+ err = 0;
+
+out:
+ kfree(elem);
+ if (ppages)
+ kunmap(*ppages);
+ return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
+{
+ if (base >= buf->len)
+ return -EINVAL;
+
+ return xdr_xcode_array2(buf, base, desc, 0);
+}
+EXPORT_SYMBOL_GPL(xdr_decode_array2);
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
+{
+ if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+ buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+ return -EINVAL;
+
+ return xdr_xcode_array2(buf, base, desc, 1);
+}
+EXPORT_SYMBOL_GPL(xdr_encode_array2);
+
+int
+xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
+ int (*actor)(struct scatterlist *, void *), void *data)
+{
+ int i, ret = 0;
+ unsigned int page_len, thislen, page_offset;
+ struct scatterlist sg[1];
+
+ sg_init_table(sg, 1);
+
+ if (offset >= buf->head[0].iov_len) {
+ offset -= buf->head[0].iov_len;
+ } else {
+ thislen = buf->head[0].iov_len - offset;
+ if (thislen > len)
+ thislen = len;
+ sg_set_buf(sg, buf->head[0].iov_base + offset, thislen);
+ ret = actor(sg, data);
+ if (ret)
+ goto out;
+ offset = 0;
+ len -= thislen;
+ }
+ if (len == 0)
+ goto out;
+
+ if (offset >= buf->page_len) {
+ offset -= buf->page_len;
+ } else {
+ page_len = buf->page_len - offset;
+ if (page_len > len)
+ page_len = len;
+ len -= page_len;
+ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
+ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
+ thislen = PAGE_CACHE_SIZE - page_offset;
+ do {
+ if (thislen > page_len)
+ thislen = page_len;
+ sg_set_page(sg, buf->pages[i], thislen, page_offset);
+ ret = actor(sg, data);
+ if (ret)
+ goto out;
+ page_len -= thislen;
+ i++;
+ page_offset = 0;
+ thislen = PAGE_CACHE_SIZE;
+ } while (page_len != 0);
+ offset = 0;
+ }
+ if (len == 0)
+ goto out;
+ if (offset < buf->tail[0].iov_len) {
+ thislen = buf->tail[0].iov_len - offset;
+ if (thislen > len)
+ thislen = len;
+ sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen);
+ ret = actor(sg, data);
+ len -= thislen;
+ }
+ if (len != 0)
+ ret = -EINVAL;
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_process_buf);
+
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
new file mode 100644
index 000000000..1d4fe24af
--- /dev/null
+++ b/net/sunrpc/xprt.c
@@ -0,0 +1,1416 @@
+/*
+ * linux/net/sunrpc/xprt.c
+ *
+ * This is a generic RPC call interface supporting congestion avoidance,
+ * and asynchronous calls.
+ *
+ * The interface works like this:
+ *
+ * - When a process places a call, it allocates a request slot if
+ * one is available. Otherwise, it sleeps on the backlog queue
+ * (xprt_reserve).
+ * - Next, the caller puts together the RPC message, stuffs it into
+ * the request struct, and calls xprt_transmit().
+ * - xprt_transmit sends the message and installs the caller on the
+ * transport's wait list. At the same time, if a reply is expected,
+ * it installs a timer that is run after the packet's timeout has
+ * expired.
+ * - When a packet arrives, the data_ready handler walks the list of
+ * pending requests for that transport. If a matching XID is found, the
+ * caller is woken up, and the timer removed.
+ * - When no reply arrives within the timeout interval, the timer is
+ * fired by the kernel and runs xprt_timer(). It either adjusts the
+ * timeout values (minor timeout) or wakes up the caller with a status
+ * of -ETIMEDOUT.
+ * - When the caller receives a notification from RPC that a reply arrived,
+ * it should release the RPC slot, and process the reply.
+ * If the call timed out, it may choose to retry the operation by
+ * adjusting the initial timeout value, and simply calling rpc_call
+ * again.
+ *
+ * Support for async RPC is done through a set of RPC-specific scheduling
+ * primitives that `transparently' work for processes as well as async
+ * tasks that rely on callbacks.
+ *
+ * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de>
+ *
+ * Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/ktime.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include <trace/events/sunrpc.h>
+
+#include "sunrpc.h"
+
+/*
+ * Local variables
+ */
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_XPRT
+#endif
+
+/*
+ * Local functions
+ */
+static void xprt_init(struct rpc_xprt *xprt, struct net *net);
+static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
+static void xprt_connect_status(struct rpc_task *task);
+static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
+static void xprt_destroy(struct rpc_xprt *xprt);
+
+static DEFINE_SPINLOCK(xprt_list_lock);
+static LIST_HEAD(xprt_list);
+
+/**
+ * xprt_register_transport - register a transport implementation
+ * @transport: transport to register
+ *
+ * If a transport implementation is loaded as a kernel module, it can
+ * call this interface to make itself known to the RPC client.
+ *
+ * Returns:
+ * 0: transport successfully registered
+ * -EEXIST: transport already registered
+ * -EINVAL: transport module being unloaded
+ */
+int xprt_register_transport(struct xprt_class *transport)
+{
+ struct xprt_class *t;
+ int result;
+
+ result = -EEXIST;
+ spin_lock(&xprt_list_lock);
+ list_for_each_entry(t, &xprt_list, list) {
+ /* don't register the same transport class twice */
+ if (t->ident == transport->ident)
+ goto out;
+ }
+
+ list_add_tail(&transport->list, &xprt_list);
+ printk(KERN_INFO "RPC: Registered %s transport module.\n",
+ transport->name);
+ result = 0;
+
+out:
+ spin_unlock(&xprt_list_lock);
+ return result;
+}
+EXPORT_SYMBOL_GPL(xprt_register_transport);
+
+/**
+ * xprt_unregister_transport - unregister a transport implementation
+ * @transport: transport to unregister
+ *
+ * Returns:
+ * 0: transport successfully unregistered
+ * -ENOENT: transport never registered
+ */
+int xprt_unregister_transport(struct xprt_class *transport)
+{
+ struct xprt_class *t;
+ int result;
+
+ result = 0;
+ spin_lock(&xprt_list_lock);
+ list_for_each_entry(t, &xprt_list, list) {
+ if (t == transport) {
+ printk(KERN_INFO
+ "RPC: Unregistered %s transport module.\n",
+ transport->name);
+ list_del_init(&transport->list);
+ goto out;
+ }
+ }
+ result = -ENOENT;
+
+out:
+ spin_unlock(&xprt_list_lock);
+ return result;
+}
+EXPORT_SYMBOL_GPL(xprt_unregister_transport);
+
+/**
+ * xprt_load_transport - load a transport implementation
+ * @transport_name: transport to load
+ *
+ * Returns:
+ * 0: transport successfully loaded
+ * -ENOENT: transport module not available
+ */
+int xprt_load_transport(const char *transport_name)
+{
+ struct xprt_class *t;
+ int result;
+
+ result = 0;
+ spin_lock(&xprt_list_lock);
+ list_for_each_entry(t, &xprt_list, list) {
+ if (strcmp(t->name, transport_name) == 0) {
+ spin_unlock(&xprt_list_lock);
+ goto out;
+ }
+ }
+ spin_unlock(&xprt_list_lock);
+ result = request_module("xprt%s", transport_name);
+out:
+ return result;
+}
+EXPORT_SYMBOL_GPL(xprt_load_transport);
+
+/**
+ * xprt_reserve_xprt - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ * @xprt: pointer to the target transport
+ *
+ * This prevents mixing the payload of separate requests, and prevents
+ * transport connects from colliding with writes. No congestion control
+ * is provided.
+ */
+int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ int priority;
+
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
+ if (task == xprt->snd_task)
+ return 1;
+ goto out_sleep;
+ }
+ xprt->snd_task = task;
+ if (req != NULL)
+ req->rq_ntrans++;
+
+ return 1;
+
+out_sleep:
+ dprintk("RPC: %5u failed to lock transport %p\n",
+ task->tk_pid, xprt);
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ if (req == NULL)
+ priority = RPC_PRIORITY_LOW;
+ else if (!req->rq_ntrans)
+ priority = RPC_PRIORITY_NORMAL;
+ else
+ priority = RPC_PRIORITY_HIGH;
+ rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
+
+static void xprt_clear_locked(struct rpc_xprt *xprt)
+{
+ xprt->snd_task = NULL;
+ if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
+ smp_mb__before_atomic();
+ clear_bit(XPRT_LOCKED, &xprt->state);
+ smp_mb__after_atomic();
+ } else
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
+}
+
+/*
+ * xprt_reserve_xprt_cong - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ *
+ * Same as xprt_reserve_xprt, but Van Jacobson congestion control is
+ * integrated into the decision of whether a request is allowed to be
+ * woken up and given access to the transport.
+ */
+int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ int priority;
+
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
+ if (task == xprt->snd_task)
+ return 1;
+ goto out_sleep;
+ }
+ if (req == NULL) {
+ xprt->snd_task = task;
+ return 1;
+ }
+ if (__xprt_get_cong(xprt, task)) {
+ xprt->snd_task = task;
+ req->rq_ntrans++;
+ return 1;
+ }
+ xprt_clear_locked(xprt);
+out_sleep:
+ dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ if (req == NULL)
+ priority = RPC_PRIORITY_LOW;
+ else if (!req->rq_ntrans)
+ priority = RPC_PRIORITY_NORMAL;
+ else
+ priority = RPC_PRIORITY_HIGH;
+ rpc_sleep_on_priority(&xprt->sending, task, NULL, priority);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
+
+static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ int retval;
+
+ spin_lock_bh(&xprt->transport_lock);
+ retval = xprt->ops->reserve_xprt(xprt, task);
+ spin_unlock_bh(&xprt->transport_lock);
+ return retval;
+}
+
+static bool __xprt_lock_write_func(struct rpc_task *task, void *data)
+{
+ struct rpc_xprt *xprt = data;
+ struct rpc_rqst *req;
+
+ req = task->tk_rqstp;
+ xprt->snd_task = task;
+ if (req)
+ req->rq_ntrans++;
+ return true;
+}
+
+static void __xprt_lock_write_next(struct rpc_xprt *xprt)
+{
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+ return;
+
+ if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+ return;
+ xprt_clear_locked(xprt);
+}
+
+static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data)
+{
+ struct rpc_xprt *xprt = data;
+ struct rpc_rqst *req;
+
+ req = task->tk_rqstp;
+ if (req == NULL) {
+ xprt->snd_task = task;
+ return true;
+ }
+ if (__xprt_get_cong(xprt, task)) {
+ xprt->snd_task = task;
+ req->rq_ntrans++;
+ return true;
+ }
+ return false;
+}
+
+static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
+{
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+ return;
+ if (RPCXPRT_CONGESTED(xprt))
+ goto out_unlock;
+ if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+ return;
+out_unlock:
+ xprt_clear_locked(xprt);
+}
+
+static void xprt_task_clear_bytes_sent(struct rpc_task *task)
+{
+ if (task != NULL) {
+ struct rpc_rqst *req = task->tk_rqstp;
+ if (req != NULL)
+ req->rq_bytes_sent = 0;
+ }
+}
+
+/**
+ * xprt_release_xprt - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL. No congestion control is provided.
+ */
+void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ if (xprt->snd_task == task) {
+ xprt_task_clear_bytes_sent(task);
+ xprt_clear_locked(xprt);
+ __xprt_lock_write_next(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(xprt_release_xprt);
+
+/**
+ * xprt_release_xprt_cong - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL. Another task is awoken to use the
+ * transport if the transport's congestion window allows it.
+ */
+void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ if (xprt->snd_task == task) {
+ xprt_task_clear_bytes_sent(task);
+ xprt_clear_locked(xprt);
+ __xprt_lock_write_next_cong(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
+
+static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ spin_lock_bh(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * Van Jacobson congestion avoidance. Check if the congestion window
+ * overflowed. Put the task to sleep if this is the case.
+ */
+static int
+__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (req->rq_cong)
+ return 1;
+ dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
+ task->tk_pid, xprt->cong, xprt->cwnd);
+ if (RPCXPRT_CONGESTED(xprt))
+ return 0;
+ req->rq_cong = 1;
+ xprt->cong += RPC_CWNDSCALE;
+ return 1;
+}
+
+/*
+ * Adjust the congestion window, and wake up the next task
+ * that has been sleeping due to congestion
+ */
+static void
+__xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (!req->rq_cong)
+ return;
+ req->rq_cong = 0;
+ xprt->cong -= RPC_CWNDSCALE;
+ __xprt_lock_write_next_cong(xprt);
+}
+
+/**
+ * xprt_release_rqst_cong - housekeeping when request is complete
+ * @task: RPC request that recently completed
+ *
+ * Useful for transports that require congestion control.
+ */
+void xprt_release_rqst_cong(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ __xprt_put_cong(req->rq_xprt, req);
+}
+EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
+
+/**
+ * xprt_adjust_cwnd - adjust transport congestion window
+ * @xprt: pointer to xprt
+ * @task: recently completed RPC request used to adjust window
+ * @result: result code of completed RPC request
+ *
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ * - a reply is received and
+ * - a full number of requests are outstanding and
+ * - the congestion window hasn't been updated recently.
+ */
+void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ unsigned long cwnd = xprt->cwnd;
+
+ if (result >= 0 && cwnd <= xprt->cong) {
+ /* The (cwnd >> 1) term makes sure
+ * the result gets rounded properly. */
+ cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
+ if (cwnd > RPC_MAXCWND(xprt))
+ cwnd = RPC_MAXCWND(xprt);
+ __xprt_lock_write_next_cong(xprt);
+ } else if (result == -ETIMEDOUT) {
+ cwnd >>= 1;
+ if (cwnd < RPC_CWNDSCALE)
+ cwnd = RPC_CWNDSCALE;
+ }
+ dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n",
+ xprt->cong, xprt->cwnd, cwnd);
+ xprt->cwnd = cwnd;
+ __xprt_put_cong(xprt, req);
+}
+EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
+
+/**
+ * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
+ * @xprt: transport with waiting tasks
+ * @status: result code to plant in each task before waking it
+ *
+ */
+void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
+{
+ if (status < 0)
+ rpc_wake_up_status(&xprt->pending, status);
+ else
+ rpc_wake_up(&xprt->pending);
+}
+EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
+
+/**
+ * xprt_wait_for_buffer_space - wait for transport output buffer to clear
+ * @task: task to be put to sleep
+ * @action: function pointer to be executed after wait
+ *
+ * Note that we only set the timer for the case of RPC_IS_SOFT(), since
+ * we don't in general want to force a socket disconnection due to
+ * an incomplete RPC call transmission.
+ */
+void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0;
+ rpc_sleep_on(&xprt->pending, task, action);
+}
+EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
+
+/**
+ * xprt_write_space - wake the task waiting for transport output buffer space
+ * @xprt: transport with waiting tasks
+ *
+ * Can be called in a soft IRQ context, so xprt_write_space never sleeps.
+ */
+void xprt_write_space(struct rpc_xprt *xprt)
+{
+ spin_lock_bh(&xprt->transport_lock);
+ if (xprt->snd_task) {
+ dprintk("RPC: write space: waking waiting task on "
+ "xprt %p\n", xprt);
+ rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_write_space);
+
+/**
+ * xprt_set_retrans_timeout_def - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Set a request's retransmit timeout based on the transport's
+ * default timeout parameters. Used by transports that don't adjust
+ * the retransmit timeout based on round-trip time estimation.
+ */
+void xprt_set_retrans_timeout_def(struct rpc_task *task)
+{
+ task->tk_timeout = task->tk_rqstp->rq_timeout;
+}
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
+
+/**
+ * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Set a request's retransmit timeout using the RTT estimator.
+ */
+void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
+{
+ int timer = task->tk_msg.rpc_proc->p_timer;
+ struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_rtt *rtt = clnt->cl_rtt;
+ struct rpc_rqst *req = task->tk_rqstp;
+ unsigned long max_timeout = clnt->cl_timeout->to_maxval;
+
+ task->tk_timeout = rpc_calc_rto(rtt, timer);
+ task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
+ if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
+ task->tk_timeout = max_timeout;
+}
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
+
+static void xprt_reset_majortimeo(struct rpc_rqst *req)
+{
+ const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+
+ req->rq_majortimeo = req->rq_timeout;
+ if (to->to_exponential)
+ req->rq_majortimeo <<= to->to_retries;
+ else
+ req->rq_majortimeo += to->to_increment * to->to_retries;
+ if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0)
+ req->rq_majortimeo = to->to_maxval;
+ req->rq_majortimeo += jiffies;
+}
+
+/**
+ * xprt_adjust_timeout - adjust timeout values for next retransmit
+ * @req: RPC request containing parameters to use for the adjustment
+ *
+ */
+int xprt_adjust_timeout(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+ const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+ int status = 0;
+
+ if (time_before(jiffies, req->rq_majortimeo)) {
+ if (to->to_exponential)
+ req->rq_timeout <<= 1;
+ else
+ req->rq_timeout += to->to_increment;
+ if (to->to_maxval && req->rq_timeout >= to->to_maxval)
+ req->rq_timeout = to->to_maxval;
+ req->rq_retries++;
+ } else {
+ req->rq_timeout = to->to_initval;
+ req->rq_retries = 0;
+ xprt_reset_majortimeo(req);
+ /* Reset the RTT counters == "slow start" */
+ spin_lock_bh(&xprt->transport_lock);
+ rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
+ spin_unlock_bh(&xprt->transport_lock);
+ status = -ETIMEDOUT;
+ }
+
+ if (req->rq_timeout == 0) {
+ printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n");
+ req->rq_timeout = 5 * HZ;
+ }
+ return status;
+}
+
+static void xprt_autoclose(struct work_struct *work)
+{
+ struct rpc_xprt *xprt =
+ container_of(work, struct rpc_xprt, task_cleanup);
+
+ xprt->ops->close(xprt);
+ clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ xprt_release_write(xprt, NULL);
+}
+
+/**
+ * xprt_disconnect_done - mark a transport as disconnected
+ * @xprt: transport to flag for disconnect
+ *
+ */
+void xprt_disconnect_done(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: disconnected transport %p\n", xprt);
+ spin_lock_bh(&xprt->transport_lock);
+ xprt_clear_connected(xprt);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_disconnect_done);
+
+/**
+ * xprt_force_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ *
+ */
+void xprt_force_disconnect(struct rpc_xprt *xprt)
+{
+ /* Don't race with the test_bit() in xprt_clear_locked() */
+ spin_lock_bh(&xprt->transport_lock);
+ set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ /* Try to schedule an autoclose RPC call */
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+/**
+ * xprt_conditional_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ * @cookie: 'connection cookie'
+ *
+ * This attempts to break the connection if and only if 'cookie' matches
+ * the current transport 'connection cookie'. It ensures that we don't
+ * try to break the connection more than once when we need to retransmit
+ * a batch of RPC requests.
+ *
+ */
+void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
+{
+ /* Don't race with the test_bit() in xprt_clear_locked() */
+ spin_lock_bh(&xprt->transport_lock);
+ if (cookie != xprt->connect_cookie)
+ goto out;
+ if (test_bit(XPRT_CLOSING, &xprt->state) || !xprt_connected(xprt))
+ goto out;
+ set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ /* Try to schedule an autoclose RPC call */
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
+out:
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void
+xprt_init_autodisconnect(unsigned long data)
+{
+ struct rpc_xprt *xprt = (struct rpc_xprt *)data;
+
+ spin_lock(&xprt->transport_lock);
+ if (!list_empty(&xprt->recv))
+ goto out_abort;
+ if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+ goto out_abort;
+ spin_unlock(&xprt->transport_lock);
+ queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ return;
+out_abort:
+ spin_unlock(&xprt->transport_lock);
+}
+
+bool xprt_lock_connect(struct rpc_xprt *xprt,
+ struct rpc_task *task,
+ void *cookie)
+{
+ bool ret = false;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (!test_bit(XPRT_LOCKED, &xprt->state))
+ goto out;
+ if (xprt->snd_task != task)
+ goto out;
+ xprt_task_clear_bytes_sent(task);
+ xprt->snd_task = cookie;
+ ret = true;
+out:
+ spin_unlock_bh(&xprt->transport_lock);
+ return ret;
+}
+
+void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
+{
+ spin_lock_bh(&xprt->transport_lock);
+ if (xprt->snd_task != cookie)
+ goto out;
+ if (!test_bit(XPRT_LOCKED, &xprt->state))
+ goto out;
+ xprt->snd_task =NULL;
+ xprt->ops->release_xprt(xprt, NULL);
+out:
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+/**
+ * xprt_connect - schedule a transport connect operation
+ * @task: RPC task that is requesting the connect
+ *
+ */
+void xprt_connect(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
+ xprt, (xprt_connected(xprt) ? "is" : "is not"));
+
+ if (!xprt_bound(xprt)) {
+ task->tk_status = -EAGAIN;
+ return;
+ }
+ if (!xprt_lock_write(xprt, task))
+ return;
+
+ if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state))
+ xprt->ops->close(xprt);
+
+ if (!xprt_connected(xprt)) {
+ task->tk_rqstp->rq_bytes_sent = 0;
+ task->tk_timeout = task->tk_rqstp->rq_timeout;
+ rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
+
+ if (test_bit(XPRT_CLOSING, &xprt->state))
+ return;
+ if (xprt_test_and_set_connecting(xprt))
+ return;
+ xprt->stat.connect_start = jiffies;
+ xprt->ops->connect(xprt, task);
+ }
+ xprt_release_write(xprt, task);
+}
+
+static void xprt_connect_status(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+
+ if (task->tk_status == 0) {
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
+ dprintk("RPC: %5u xprt_connect_status: connection established\n",
+ task->tk_pid);
+ return;
+ }
+
+ switch (task->tk_status) {
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
+ case -EPIPE:
+ case -EAGAIN:
+ dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
+ break;
+ case -ETIMEDOUT:
+ dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
+ "out\n", task->tk_pid);
+ break;
+ default:
+ dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
+ "server %s\n", task->tk_pid, -task->tk_status,
+ xprt->servername);
+ task->tk_status = -EIO;
+ }
+}
+
+/**
+ * xprt_lookup_rqst - find an RPC request corresponding to an XID
+ * @xprt: transport on which the original request was transmitted
+ * @xid: RPC XID of incoming reply
+ *
+ */
+struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
+{
+ struct rpc_rqst *entry;
+
+ list_for_each_entry(entry, &xprt->recv, rq_list)
+ if (entry->rq_xid == xid) {
+ trace_xprt_lookup_rqst(xprt, xid, 0);
+ return entry;
+ }
+
+ dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n",
+ ntohl(xid));
+ trace_xprt_lookup_rqst(xprt, xid, -ENOENT);
+ xprt->stat.bad_xids++;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
+
+static void xprt_update_rtt(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_rtt *rtt = task->tk_client->cl_rtt;
+ unsigned int timer = task->tk_msg.rpc_proc->p_timer;
+ long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt));
+
+ if (timer) {
+ if (req->rq_ntrans == 1)
+ rpc_update_rtt(rtt, timer, m);
+ rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
+ }
+}
+
+/**
+ * xprt_complete_rqst - called when reply processing is complete
+ * @task: RPC request that recently completed
+ * @copied: actual number of bytes received from the transport
+ *
+ * Caller holds transport lock.
+ */
+void xprt_complete_rqst(struct rpc_task *task, int copied)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
+ task->tk_pid, ntohl(req->rq_xid), copied);
+ trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
+
+ xprt->stat.recvs++;
+ req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
+ if (xprt->ops->timer != NULL)
+ xprt_update_rtt(task);
+
+ list_del_init(&req->rq_list);
+ req->rq_private_buf.len = copied;
+ /* Ensure all writes are done before we update */
+ /* req->rq_reply_bytes_recvd */
+ smp_wmb();
+ req->rq_reply_bytes_recvd = copied;
+ rpc_wake_up_queued_task(&xprt->pending, task);
+}
+EXPORT_SYMBOL_GPL(xprt_complete_rqst);
+
+static void xprt_timer(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (task->tk_status != -ETIMEDOUT)
+ return;
+ dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (!req->rq_reply_bytes_recvd) {
+ if (xprt->ops->timer)
+ xprt->ops->timer(xprt, task);
+ } else
+ task->tk_status = 0;
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static inline int xprt_has_timer(struct rpc_xprt *xprt)
+{
+ return xprt->idle_timeout != 0;
+}
+
+/**
+ * xprt_prepare_transmit - reserve the transport before sending a request
+ * @task: RPC task about to send a request
+ *
+ */
+bool xprt_prepare_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ bool ret = false;
+
+ dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (!req->rq_bytes_sent) {
+ if (req->rq_reply_bytes_recvd) {
+ task->tk_status = req->rq_reply_bytes_recvd;
+ goto out_unlock;
+ }
+ if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
+ && xprt_connected(xprt)
+ && req->rq_connect_cookie == xprt->connect_cookie) {
+ xprt->ops->set_retrans_timeout(task);
+ rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ goto out_unlock;
+ }
+ }
+ if (!xprt->ops->reserve_xprt(xprt, task)) {
+ task->tk_status = -EAGAIN;
+ goto out_unlock;
+ }
+ ret = true;
+out_unlock:
+ spin_unlock_bh(&xprt->transport_lock);
+ return ret;
+}
+
+void xprt_end_transmit(struct rpc_task *task)
+{
+ xprt_release_write(task->tk_rqstp->rq_xprt, task);
+}
+
+/**
+ * xprt_transmit - send an RPC request on a transport
+ * @task: controlling RPC task
+ *
+ * We have to copy the iovec because sendmsg fiddles with its contents.
+ */
+void xprt_transmit(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ int status, numreqs;
+
+ dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
+
+ if (!req->rq_reply_bytes_recvd) {
+ if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
+ /*
+ * Add to the list only if we're expecting a reply
+ */
+ spin_lock_bh(&xprt->transport_lock);
+ /* Update the softirq receive buffer */
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+ sizeof(req->rq_private_buf));
+ /* Add request to the receive list */
+ list_add_tail(&req->rq_list, &xprt->recv);
+ spin_unlock_bh(&xprt->transport_lock);
+ xprt_reset_majortimeo(req);
+ /* Turn off autodisconnect */
+ del_singleshot_timer_sync(&xprt->timer);
+ }
+ } else if (!req->rq_bytes_sent)
+ return;
+
+ req->rq_xtime = ktime_get();
+ status = xprt->ops->send_request(task);
+ trace_xprt_transmit(xprt, req->rq_xid, status);
+ if (status != 0) {
+ task->tk_status = status;
+ return;
+ }
+
+ dprintk("RPC: %5u xmit complete\n", task->tk_pid);
+ task->tk_flags |= RPC_TASK_SENT;
+ spin_lock_bh(&xprt->transport_lock);
+
+ xprt->ops->set_retrans_timeout(task);
+
+ numreqs = atomic_read(&xprt->num_reqs);
+ if (numreqs > xprt->stat.max_slots)
+ xprt->stat.max_slots = numreqs;
+ xprt->stat.sends++;
+ xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
+ xprt->stat.bklog_u += xprt->backlog.qlen;
+ xprt->stat.sending_u += xprt->sending.qlen;
+ xprt->stat.pending_u += xprt->pending.qlen;
+
+ /* Don't race with disconnect */
+ if (!xprt_connected(xprt))
+ task->tk_status = -ENOTCONN;
+ else {
+ /*
+ * Sleep on the pending queue since
+ * we're expecting a reply.
+ */
+ if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task))
+ rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ req->rq_connect_cookie = xprt->connect_cookie;
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ set_bit(XPRT_CONGESTED, &xprt->state);
+ rpc_sleep_on(&xprt->backlog, task, NULL);
+}
+
+static void xprt_wake_up_backlog(struct rpc_xprt *xprt)
+{
+ if (rpc_wake_up_next(&xprt->backlog) == NULL)
+ clear_bit(XPRT_CONGESTED, &xprt->state);
+}
+
+static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ bool ret = false;
+
+ if (!test_bit(XPRT_CONGESTED, &xprt->state))
+ goto out;
+ spin_lock(&xprt->reserve_lock);
+ if (test_bit(XPRT_CONGESTED, &xprt->state)) {
+ rpc_sleep_on(&xprt->backlog, task, NULL);
+ ret = true;
+ }
+ spin_unlock(&xprt->reserve_lock);
+out:
+ return ret;
+}
+
+static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt, gfp_t gfp_flags)
+{
+ struct rpc_rqst *req = ERR_PTR(-EAGAIN);
+
+ if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs))
+ goto out;
+ req = kzalloc(sizeof(struct rpc_rqst), gfp_flags);
+ if (req != NULL)
+ goto out;
+ atomic_dec(&xprt->num_reqs);
+ req = ERR_PTR(-ENOMEM);
+out:
+ return req;
+}
+
+static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) {
+ kfree(req);
+ return true;
+ }
+ return false;
+}
+
+void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *req;
+
+ spin_lock(&xprt->reserve_lock);
+ if (!list_empty(&xprt->free)) {
+ req = list_entry(xprt->free.next, struct rpc_rqst, rq_list);
+ list_del(&req->rq_list);
+ goto out_init_req;
+ }
+ req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT|__GFP_NOWARN);
+ if (!IS_ERR(req))
+ goto out_init_req;
+ switch (PTR_ERR(req)) {
+ case -ENOMEM:
+ dprintk("RPC: dynamic allocation of request slot "
+ "failed! Retrying\n");
+ task->tk_status = -ENOMEM;
+ break;
+ case -EAGAIN:
+ xprt_add_backlog(xprt, task);
+ dprintk("RPC: waiting for request slot\n");
+ default:
+ task->tk_status = -EAGAIN;
+ }
+ spin_unlock(&xprt->reserve_lock);
+ return;
+out_init_req:
+ task->tk_status = 0;
+ task->tk_rqstp = req;
+ xprt_request_init(task, xprt);
+ spin_unlock(&xprt->reserve_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_alloc_slot);
+
+void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ /* Note: grabbing the xprt_lock_write() ensures that we throttle
+ * new slot allocation if the transport is congested (i.e. when
+ * reconnecting a stream transport or when out of socket write
+ * buffer space).
+ */
+ if (xprt_lock_write(xprt, task)) {
+ xprt_alloc_slot(xprt, task);
+ xprt_release_write(xprt, task);
+ }
+}
+EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot);
+
+static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+ spin_lock(&xprt->reserve_lock);
+ if (!xprt_dynamic_free_slot(xprt, req)) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
+ list_add(&req->rq_list, &xprt->free);
+ }
+ xprt_wake_up_backlog(xprt);
+ spin_unlock(&xprt->reserve_lock);
+}
+
+static void xprt_free_all_slots(struct rpc_xprt *xprt)
+{
+ struct rpc_rqst *req;
+ while (!list_empty(&xprt->free)) {
+ req = list_first_entry(&xprt->free, struct rpc_rqst, rq_list);
+ list_del(&req->rq_list);
+ kfree(req);
+ }
+}
+
+struct rpc_xprt *xprt_alloc(struct net *net, size_t size,
+ unsigned int num_prealloc,
+ unsigned int max_alloc)
+{
+ struct rpc_xprt *xprt;
+ struct rpc_rqst *req;
+ int i;
+
+ xprt = kzalloc(size, GFP_KERNEL);
+ if (xprt == NULL)
+ goto out;
+
+ xprt_init(xprt, net);
+
+ for (i = 0; i < num_prealloc; i++) {
+ req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+ if (!req)
+ goto out_free;
+ list_add(&req->rq_list, &xprt->free);
+ }
+ if (max_alloc > num_prealloc)
+ xprt->max_reqs = max_alloc;
+ else
+ xprt->max_reqs = num_prealloc;
+ xprt->min_reqs = num_prealloc;
+ atomic_set(&xprt->num_reqs, num_prealloc);
+
+ return xprt;
+
+out_free:
+ xprt_free(xprt);
+out:
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xprt_alloc);
+
+void xprt_free(struct rpc_xprt *xprt)
+{
+ put_net(xprt->xprt_net);
+ xprt_free_all_slots(xprt);
+ kfree(xprt);
+}
+EXPORT_SYMBOL_GPL(xprt_free);
+
+/**
+ * xprt_reserve - allocate an RPC request slot
+ * @task: RPC task requesting a slot allocation
+ *
+ * If the transport is marked as being congested, or if no more
+ * slots are available, place the task on the transport's
+ * backlog queue.
+ */
+void xprt_reserve(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt;
+
+ task->tk_status = 0;
+ if (task->tk_rqstp != NULL)
+ return;
+
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ rcu_read_lock();
+ xprt = rcu_dereference(task->tk_client->cl_xprt);
+ if (!xprt_throttle_congested(xprt, task))
+ xprt->ops->alloc_slot(xprt, task);
+ rcu_read_unlock();
+}
+
+/**
+ * xprt_retry_reserve - allocate an RPC request slot
+ * @task: RPC task requesting a slot allocation
+ *
+ * If no more slots are available, place the task on the transport's
+ * backlog queue.
+ * Note that the only difference with xprt_reserve is that we now
+ * ignore the value of the XPRT_CONGESTED flag.
+ */
+void xprt_retry_reserve(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt;
+
+ task->tk_status = 0;
+ if (task->tk_rqstp != NULL)
+ return;
+
+ task->tk_timeout = 0;
+ task->tk_status = -EAGAIN;
+ rcu_read_lock();
+ xprt = rcu_dereference(task->tk_client->cl_xprt);
+ xprt->ops->alloc_slot(xprt, task);
+ rcu_read_unlock();
+}
+
+static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
+{
+ return (__force __be32)xprt->xid++;
+}
+
+static inline void xprt_init_xid(struct rpc_xprt *xprt)
+{
+ xprt->xid = prandom_u32();
+}
+
+static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ INIT_LIST_HEAD(&req->rq_list);
+ req->rq_timeout = task->tk_client->cl_timeout->to_initval;
+ req->rq_task = task;
+ req->rq_xprt = xprt;
+ req->rq_buffer = NULL;
+ req->rq_xid = xprt_alloc_xid(xprt);
+ req->rq_connect_cookie = xprt->connect_cookie - 1;
+ req->rq_bytes_sent = 0;
+ req->rq_snd_buf.len = 0;
+ req->rq_snd_buf.buflen = 0;
+ req->rq_rcv_buf.len = 0;
+ req->rq_rcv_buf.buflen = 0;
+ req->rq_release_snd_buf = NULL;
+ xprt_reset_majortimeo(req);
+ dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
+ req, ntohl(req->rq_xid));
+}
+
+/**
+ * xprt_release - release an RPC request slot
+ * @task: task which is finished with the slot
+ *
+ */
+void xprt_release(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt;
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ if (req == NULL) {
+ if (task->tk_client) {
+ rcu_read_lock();
+ xprt = rcu_dereference(task->tk_client->cl_xprt);
+ if (xprt->snd_task == task)
+ xprt_release_write(xprt, task);
+ rcu_read_unlock();
+ }
+ return;
+ }
+
+ xprt = req->rq_xprt;
+ if (task->tk_ops->rpc_count_stats != NULL)
+ task->tk_ops->rpc_count_stats(task, task->tk_calldata);
+ else if (task->tk_client)
+ rpc_count_iostats(task, task->tk_client->cl_metrics);
+ spin_lock_bh(&xprt->transport_lock);
+ xprt->ops->release_xprt(xprt, task);
+ if (xprt->ops->release_request)
+ xprt->ops->release_request(task);
+ if (!list_empty(&req->rq_list))
+ list_del(&req->rq_list);
+ xprt->last_used = jiffies;
+ if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+ mod_timer(&xprt->timer,
+ xprt->last_used + xprt->idle_timeout);
+ spin_unlock_bh(&xprt->transport_lock);
+ if (req->rq_buffer)
+ xprt->ops->buf_free(req->rq_buffer);
+ if (req->rq_cred != NULL)
+ put_rpccred(req->rq_cred);
+ task->tk_rqstp = NULL;
+ if (req->rq_release_snd_buf)
+ req->rq_release_snd_buf(req);
+
+ dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+ if (likely(!bc_prealloc(req)))
+ xprt_free_slot(xprt, req);
+ else
+ xprt_free_bc_request(req);
+}
+
+static void xprt_init(struct rpc_xprt *xprt, struct net *net)
+{
+ atomic_set(&xprt->count, 1);
+
+ spin_lock_init(&xprt->transport_lock);
+ spin_lock_init(&xprt->reserve_lock);
+
+ INIT_LIST_HEAD(&xprt->free);
+ INIT_LIST_HEAD(&xprt->recv);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ spin_lock_init(&xprt->bc_pa_lock);
+ INIT_LIST_HEAD(&xprt->bc_pa_list);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+ xprt->last_used = jiffies;
+ xprt->cwnd = RPC_INITCWND;
+ xprt->bind_index = 0;
+
+ rpc_init_wait_queue(&xprt->binding, "xprt_binding");
+ rpc_init_wait_queue(&xprt->pending, "xprt_pending");
+ rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending");
+ rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
+
+ xprt_init_xid(xprt);
+
+ xprt->xprt_net = get_net(net);
+}
+
+/**
+ * xprt_create_transport - create an RPC transport
+ * @args: rpc transport creation arguments
+ *
+ */
+struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
+{
+ struct rpc_xprt *xprt;
+ struct xprt_class *t;
+
+ spin_lock(&xprt_list_lock);
+ list_for_each_entry(t, &xprt_list, list) {
+ if (t->ident == args->ident) {
+ spin_unlock(&xprt_list_lock);
+ goto found;
+ }
+ }
+ spin_unlock(&xprt_list_lock);
+ dprintk("RPC: transport (%d) not supported\n", args->ident);
+ return ERR_PTR(-EIO);
+
+found:
+ xprt = t->setup(args);
+ if (IS_ERR(xprt)) {
+ dprintk("RPC: xprt_create_transport: failed, %ld\n",
+ -PTR_ERR(xprt));
+ goto out;
+ }
+ if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
+ xprt->idle_timeout = 0;
+ INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
+ if (xprt_has_timer(xprt))
+ setup_timer(&xprt->timer, xprt_init_autodisconnect,
+ (unsigned long)xprt);
+ else
+ init_timer(&xprt->timer);
+
+ if (strlen(args->servername) > RPC_MAXNETNAMELEN) {
+ xprt_destroy(xprt);
+ return ERR_PTR(-EINVAL);
+ }
+ xprt->servername = kstrdup(args->servername, GFP_KERNEL);
+ if (xprt->servername == NULL) {
+ xprt_destroy(xprt);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ rpc_xprt_debugfs_register(xprt);
+
+ dprintk("RPC: created transport %p with %u slots\n", xprt,
+ xprt->max_reqs);
+out:
+ return xprt;
+}
+
+/**
+ * xprt_destroy - destroy an RPC transport, killing off all requests.
+ * @xprt: transport to destroy
+ *
+ */
+static void xprt_destroy(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: destroying transport %p\n", xprt);
+ del_timer_sync(&xprt->timer);
+
+ rpc_xprt_debugfs_unregister(xprt);
+ rpc_destroy_wait_queue(&xprt->binding);
+ rpc_destroy_wait_queue(&xprt->pending);
+ rpc_destroy_wait_queue(&xprt->sending);
+ rpc_destroy_wait_queue(&xprt->backlog);
+ cancel_work_sync(&xprt->task_cleanup);
+ kfree(xprt->servername);
+ /*
+ * Tear down transport state and free the rpc_xprt
+ */
+ xprt->ops->destroy(xprt);
+}
+
+/**
+ * xprt_put - release a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ */
+void xprt_put(struct rpc_xprt *xprt)
+{
+ if (atomic_dec_and_test(&xprt->count))
+ xprt_destroy(xprt);
+}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000..579f72bbc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+ fmr_ops.o frwr_ops.o physical_ops.o
+
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
+
+svcrdma-y := svc_rdma.o svc_rdma_transport.o \
+ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000..302d4ebf6
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Memory Regions (FMR).
+ * Referred to sometimes as MTHCAFMR mode.
+ *
+ * FMR uses synchronous memory registration and deregistration.
+ * FMR registration is known to be fast, but FMR deregistration
+ * can take tens of usecs to complete.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+/* Maximum scatter/gather per FMR */
+#define RPCRDMA_MAX_FMR_SGES (64)
+
+static int
+fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ return 0;
+}
+
+/* FMR mode conveys up to 64 pages of payload per chunk segment.
+ */
+static size_t
+fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+}
+
+static int
+fmr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr))
+ goto out_fmr_err;
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+ return 0;
+
+out_fmr_err:
+ rc = PTR_ERR(r->r.fmr);
+ dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
+ kfree(r);
+ return rc;
+}
+
+/* Use the ib_map_phys_fmr() verb to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_device *device = ia->ri_id->device;
+ enum dma_data_direction direction = rpcrdma_data_dir(writing);
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+ int len, pageoff, i, rc;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (nsegs > RPCRDMA_MAX_FMR_SGES)
+ nsegs = RPCRDMA_MAX_FMR_SGES;
+ for (i = 0; i < nsegs;) {
+ rpcrdma_map_one(device, seg, direction);
+ physaddrs[i] = seg->mr_dma;
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+
+ rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+ if (rc)
+ goto out_maperr;
+
+ seg1->mr_rkey = mw->r.fmr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ return i;
+
+out_maperr:
+ dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ __func__, len, (unsigned long long)seg1->mr_dma,
+ pageoff, i, rc);
+ while (i--)
+ rpcrdma_unmap_one(device, --seg);
+ return rc;
+}
+
+/* Use the ib_unmap_fmr() verb to prevent further remote
+ * access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct ib_device *device;
+ int rc, nsegs = seg->mr_nsegs;
+ LIST_HEAD(l);
+
+ list_add(&seg1->rl_mw->r.fmr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ read_lock(&ia->ri_qplock);
+ device = ia->ri_id->device;
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+ read_unlock(&ia->ri_qplock);
+ if (rc)
+ goto out_err;
+ return nsegs;
+
+out_err:
+ dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
+ return nsegs;
+}
+
+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+fmr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_mw *r;
+ LIST_HEAD(list);
+ int rc;
+
+ list_for_each_entry(r, &buf->rb_all, mw_all)
+ list_add(&r->r.fmr->list, &list);
+
+ rc = ib_unmap_fmr(&list);
+ if (rc)
+ dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
+ __func__, rc);
+}
+
+static void
+fmr_op_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
+ __func__, rc);
+ kfree(r);
+ }
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+ .ro_map = fmr_op_map,
+ .ro_unmap = fmr_op_unmap,
+ .ro_open = fmr_op_open,
+ .ro_maxpages = fmr_op_maxpages,
+ .ro_init = fmr_op_init,
+ .ro_reset = fmr_op_reset,
+ .ro_destroy = fmr_op_destroy,
+ .ro_displayname = "fmr",
+};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000..dff0481db
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ *
+ * FRWR features ordered asynchronous registration and deregistration
+ * of arbitrarily sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+static int
+__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
+ unsigned int depth)
+{
+ struct rpcrdma_frmr *f = &r->r.frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+ f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+ if (IS_ERR(f->fr_pgl))
+ goto out_list_err;
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = PTR_ERR(f->fr_pgl);
+ dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
+ __func__, rc);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
+}
+
+static void
+__frwr_release(struct rpcrdma_mw *r)
+{
+ int rc;
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr status %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+}
+
+static int
+frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ int depth, delta;
+
+ ia->ri_max_frmr_depth =
+ min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ devattr->max_fast_reg_page_list_len);
+ dprintk("RPC: %s: device's max FR page list len = %u\n",
+ __func__, ia->ri_max_frmr_depth);
+
+ /* Add room for frmr register and invalidate WRs.
+ * 1. FRMR reg WR for head
+ * 2. FRMR invalidate WR for head
+ * 3. N FRMR reg WRs for pagelist
+ * 4. N FRMR invalidate WRs for pagelist
+ * 5. FRMR reg WR for tail
+ * 6. FRMR invalidate WR for tail
+ * 7. The RDMA_SEND WR
+ */
+ depth = 7;
+
+ /* Calculate N if the device max FRMR depth is smaller than
+ * RPCRDMA_MAX_DATA_SEGS.
+ */
+ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+ do {
+ depth += 2; /* FRMR reg + invalidate */
+ delta -= ia->ri_max_frmr_depth;
+ } while (delta > 0);
+ }
+
+ ep->rep_attr.cap.max_send_wr *= depth;
+ if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
+ cdata->max_requests = devattr->max_qp_wr / depth;
+ if (!cdata->max_requests)
+ return -EINVAL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+ depth;
+ }
+
+ return 0;
+}
+
+/* FRWR mode conveys a list of pages per chunk segment. The
+ * maximum length of that list is the FRWR page list depth.
+ */
+static size_t
+frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+}
+
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+ struct rpcrdma_mw *r;
+
+ if (likely(wc->status == IB_WC_SUCCESS))
+ return;
+
+ /* WARNING: Only wr_id and status are reliable at this point */
+ r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ dprintk("RPC: %s: frmr %p (stale), status %d\n",
+ __func__, r, wc->status);
+ r->r.frmr.fr_state = FRMR_IS_STALE;
+}
+
+static int
+frwr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ int i;
+
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
+
+ while (i--) {
+ struct rpcrdma_mw *r;
+ int rc;
+
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ rc = __frwr_init(r, pd, device, depth);
+ if (rc) {
+ kfree(r);
+ return rc;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ r->mw_sendcompletion = frwr_sendcompletion;
+ }
+
+ return 0;
+}
+
+/* Post a FAST_REG Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_device *device = ia->ri_id->device;
+ enum dma_data_direction direction = rpcrdma_data_dir(writing);
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_mw *mw = seg1->rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
+ struct ib_mr *mr = frmr->fr_mr;
+ struct ib_send_wr fastreg_wr, *bad_wr;
+ u8 key;
+ int len, pageoff;
+ int i, rc;
+ int seg_len;
+ u64 pa;
+ int page_no;
+
+ pageoff = offset_in_page(seg1->mr_offset);
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+ if (nsegs > ia->ri_max_frmr_depth)
+ nsegs = ia->ri_max_frmr_depth;
+ for (page_no = i = 0; i < nsegs;) {
+ rpcrdma_map_one(device, seg, direction);
+ pa = seg->mr_dma;
+ for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+ frmr->fr_pgl->page_list[page_no++] = pa;
+ pa += PAGE_SIZE;
+ }
+ len += seg->mr_len;
+ ++seg;
+ ++i;
+ /* Check for holes */
+ if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+ offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+ break;
+ }
+ dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
+ __func__, mw, i, len);
+
+ frmr->fr_state = FRMR_IS_VALID;
+
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.wr_id = (unsigned long)(void *)mw;
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
+ fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.page_list_len = page_no;
+ fastreg_wr.wr.fast_reg.length = len;
+ fastreg_wr.wr.fast_reg.access_flags = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
+ key = (u8)(mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(mr, ++key);
+ fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+ rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+ if (rc)
+ goto out_senderr;
+
+ seg1->mr_rkey = mr->rkey;
+ seg1->mr_base = seg1->mr_dma + pageoff;
+ seg1->mr_nsegs = i;
+ seg1->mr_len = len;
+ return i;
+
+out_senderr:
+ dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ ib_update_fast_reg_key(mr, --key);
+ frmr->fr_state = FRMR_IS_INVALID;
+ while (i--)
+ rpcrdma_unmap_one(device, --seg);
+ return rc;
+}
+
+/* Post a LOCAL_INV Work Request to prevent further remote access
+ * via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_mr_seg *seg1 = seg;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc, nsegs = seg->mr_nsegs;
+ struct ib_device *device;
+
+ seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+ memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+ invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ read_lock(&ia->ri_qplock);
+ device = ia->ri_id->device;
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ read_unlock(&ia->ri_qplock);
+ if (rc)
+ goto out_err;
+ return nsegs;
+
+out_err:
+ /* Force rpcrdma_buffer_get() to retry */
+ seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+ dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
+ return nsegs;
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each. FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+frwr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+ struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+ struct rpcrdma_mw *r;
+ int rc;
+
+ list_for_each_entry(r, &buf->rb_all, mw_all) {
+ if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+ continue;
+
+ __frwr_release(r);
+ rc = __frwr_init(r, pd, device, depth);
+ if (rc) {
+ dprintk("RPC: %s: mw %p left %s\n",
+ __func__, r,
+ (r->r.frmr.fr_state == FRMR_IS_STALE ?
+ "stale" : "valid"));
+ continue;
+ }
+
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ }
+}
+
+static void
+frwr_op_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ __frwr_release(r);
+ kfree(r);
+ }
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+ .ro_map = frwr_op_map,
+ .ro_unmap = frwr_op_unmap,
+ .ro_open = frwr_op_open,
+ .ro_maxpages = frwr_op_maxpages,
+ .ro_init = frwr_op_init,
+ .ro_reset = frwr_op_reset,
+ .ro_destroy = frwr_op_destroy,
+ .ro_displayname = "frwr",
+};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000..ba518af16
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* No-op chunk preparation. All client memory is pre-registered.
+ * Sometimes referred to as ALLPHYSICAL mode.
+ *
+ * Physical registration is simple because all client memory is
+ * pre-registered and never deregistered. This mode is good for
+ * adapter bring up, but is considered not safe: the server is
+ * trusted not to abuse its access to client memory not involved
+ * in RDMA I/O.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+static int
+physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ return 0;
+}
+
+/* PHYSICAL memory registration conveys one page per chunk segment.
+ */
+static size_t
+physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+ return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+ rpcrdma_max_segments(r_xprt));
+}
+
+static int
+physical_op_init(struct rpcrdma_xprt *r_xprt)
+{
+ return 0;
+}
+
+/* The client's physical memory is already exposed for
+ * remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ rpcrdma_map_one(ia->ri_id->device, seg,
+ rpcrdma_data_dir(writing));
+ seg->mr_rkey = ia->ri_bind_mem->rkey;
+ seg->mr_base = seg->mr_dma;
+ seg->mr_nsegs = 1;
+ return 1;
+}
+
+/* Unmap a memory region, but leave it registered.
+ */
+static int
+physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ read_lock(&ia->ri_qplock);
+ rpcrdma_unmap_one(ia->ri_id->device, seg);
+ read_unlock(&ia->ri_qplock);
+
+ return 1;
+}
+
+static void
+physical_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+}
+
+static void
+physical_op_destroy(struct rpcrdma_buffer *buf)
+{
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+ .ro_map = physical_op_map,
+ .ro_unmap = physical_op_unmap,
+ .ro_open = physical_op_open,
+ .ro_maxpages = physical_op_maxpages,
+ .ro_init = physical_op_init,
+ .ro_reset = physical_op_reset,
+ .ro_destroy = physical_op_destroy,
+ .ro_displayname = "physical",
+};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000..2c53ea9e1
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,889 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include "xprt_rdma.h"
+
+#include <linux/highmem.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+static const char transfertypes[][12] = {
+ "pure inline", /* no chunks */
+ " read chunk", /* some argument via rdma read */
+ "*read chunk", /* entire request via rdma read */
+ "write chunk", /* some result via rdma write */
+ "reply chunk" /* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Returns positive number of segments converted, or a negative errno.
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+ int len, n = 0, p;
+ int page_base;
+ struct page **ppages;
+
+ if (pos == 0 && xdrbuf->head[0].iov_len) {
+ seg[n].mr_page = NULL;
+ seg[n].mr_offset = xdrbuf->head[0].iov_base;
+ seg[n].mr_len = xdrbuf->head[0].iov_len;
+ ++n;
+ }
+
+ len = xdrbuf->page_len;
+ ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
+ page_base = xdrbuf->page_base & ~PAGE_MASK;
+ p = 0;
+ while (len && n < nsegs) {
+ if (!ppages[p]) {
+ /* alloc the pagelist for receiving buffer */
+ ppages[p] = alloc_page(GFP_ATOMIC);
+ if (!ppages[p])
+ return -ENOMEM;
+ }
+ seg[n].mr_page = ppages[p];
+ seg[n].mr_offset = (void *)(unsigned long) page_base;
+ seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+ if (seg[n].mr_len > PAGE_SIZE)
+ return -EIO;
+ len -= seg[n].mr_len;
+ ++n;
+ ++p;
+ page_base = 0; /* page offset only applies to first page */
+ }
+
+ /* Message overflows the seg array */
+ if (len && n == nsegs)
+ return -EIO;
+
+ if (xdrbuf->tail[0].iov_len) {
+ /* the rpcrdma protocol allows us to omit any trailing
+ * xdr pad bytes, saving the server an RDMA operation. */
+ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+ return n;
+ if (n == nsegs)
+ /* Tail remains, but we're out of segments */
+ return -EIO;
+ seg[n].mr_page = NULL;
+ seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+ seg[n].mr_len = xdrbuf->tail[0].iov_len;
+ ++n;
+ }
+
+ return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ * Assume check against THRESHOLD has been done, and chunks are required.
+ * Assume only encoding one list entry for read|write chunks. The NFSv3
+ * protocol is simple enough to allow this as it only has a single "bulk
+ * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ * Read chunklist (a linked list):
+ * N elements, position P (same P for all chunks of same arg!):
+ * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ * Write chunklist (a list of (one) counted array):
+ * N elements:
+ * 1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ * Reply chunk (a counted array):
+ * N elements:
+ * 1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+static ssize_t
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ int n, nsegs, nchunks = 0;
+ unsigned int pos;
+ struct rpcrdma_mr_seg *seg = req->rl_segments;
+ struct rpcrdma_read_chunk *cur_rchunk = NULL;
+ struct rpcrdma_write_array *warray = NULL;
+ struct rpcrdma_write_chunk *cur_wchunk = NULL;
+ __be32 *iptr = headerp->rm_body.rm_chunks;
+ int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
+
+ if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+ /* a read chunk - server will RDMA Read our memory */
+ cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+ } else {
+ /* a write or reply chunk - server will RDMA Write our memory */
+ *iptr++ = xdr_zero; /* encode a NULL read chunk list */
+ if (type == rpcrdma_replych)
+ *iptr++ = xdr_zero; /* a NULL write chunk list */
+ warray = (struct rpcrdma_write_array *) iptr;
+ cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+ }
+
+ if (type == rpcrdma_replych || type == rpcrdma_areadch)
+ pos = 0;
+ else
+ pos = target->head[0].iov_len;
+
+ nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+ if (nsegs < 0)
+ return nsegs;
+
+ map = r_xprt->rx_ia.ri_ops->ro_map;
+ do {
+ n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
+ if (n <= 0)
+ goto out;
+ if (cur_rchunk) { /* read */
+ cur_rchunk->rc_discrim = xdr_one;
+ /* all read chunks have the same "position" */
+ cur_rchunk->rc_position = cpu_to_be32(pos);
+ cur_rchunk->rc_target.rs_handle =
+ cpu_to_be32(seg->mr_rkey);
+ cur_rchunk->rc_target.rs_length =
+ cpu_to_be32(seg->mr_len);
+ xdr_encode_hyper(
+ (__be32 *)&cur_rchunk->rc_target.rs_offset,
+ seg->mr_base);
+ dprintk("RPC: %s: read chunk "
+ "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, pos, n < nsegs ? "more" : "last");
+ cur_rchunk++;
+ r_xprt->rx_stats.read_chunk_count++;
+ } else { /* write/reply */
+ cur_wchunk->wc_target.rs_handle =
+ cpu_to_be32(seg->mr_rkey);
+ cur_wchunk->wc_target.rs_length =
+ cpu_to_be32(seg->mr_len);
+ xdr_encode_hyper(
+ (__be32 *)&cur_wchunk->wc_target.rs_offset,
+ seg->mr_base);
+ dprintk("RPC: %s: %s chunk "
+ "elem %d@0x%llx:0x%x (%s)\n", __func__,
+ (type == rpcrdma_replych) ? "reply" : "write",
+ seg->mr_len, (unsigned long long)seg->mr_base,
+ seg->mr_rkey, n < nsegs ? "more" : "last");
+ cur_wchunk++;
+ if (type == rpcrdma_replych)
+ r_xprt->rx_stats.reply_chunk_count++;
+ else
+ r_xprt->rx_stats.write_chunk_count++;
+ r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+ }
+ nchunks++;
+ seg += n;
+ nsegs -= n;
+ } while (nsegs);
+
+ /* success. all failures return above */
+ req->rl_nchunks = nchunks;
+
+ /*
+ * finish off header. If write, marshal discrim and nchunks.
+ */
+ if (cur_rchunk) {
+ iptr = (__be32 *) cur_rchunk;
+ *iptr++ = xdr_zero; /* finish the read chunk list */
+ *iptr++ = xdr_zero; /* encode a NULL write chunk list */
+ *iptr++ = xdr_zero; /* encode a NULL reply chunk */
+ } else {
+ warray->wc_discrim = xdr_one;
+ warray->wc_nchunks = cpu_to_be32(nchunks);
+ iptr = (__be32 *) cur_wchunk;
+ if (type == rpcrdma_writech) {
+ *iptr++ = xdr_zero; /* finish the write chunk list */
+ *iptr++ = xdr_zero; /* encode a NULL reply chunk */
+ }
+ }
+
+ /*
+ * Return header size.
+ */
+ return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+ if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+ return n;
+
+ for (pos = 0; nchunks--;)
+ pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+ &req->rl_segments[pos]);
+ return n;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+ int i, npages, curlen;
+ int copy_len;
+ unsigned char *srcp, *destp;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+ int page_base;
+ struct page **ppages;
+
+ destp = rqst->rq_svec[0].iov_base;
+ curlen = rqst->rq_svec[0].iov_len;
+ destp += curlen;
+ /*
+ * Do optional padding where it makes sense. Alignment of write
+ * payload can help the server, if our setting is accurate.
+ */
+ pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+ if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+ pad = 0; /* don't pad this request */
+
+ dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
+ __func__, pad, destp, rqst->rq_slen, curlen);
+
+ copy_len = rqst->rq_snd_buf.page_len;
+
+ if (rqst->rq_snd_buf.tail[0].iov_len) {
+ curlen = rqst->rq_snd_buf.tail[0].iov_len;
+ if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+ memmove(destp + copy_len,
+ rqst->rq_snd_buf.tail[0].iov_base, curlen);
+ r_xprt->rx_stats.pullup_copy_count += curlen;
+ }
+ dprintk("RPC: %s: tail destp 0x%p len %d\n",
+ __func__, destp + copy_len, curlen);
+ rqst->rq_svec[0].iov_len += curlen;
+ }
+ r_xprt->rx_stats.pullup_copy_count += copy_len;
+
+ page_base = rqst->rq_snd_buf.page_base;
+ ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
+ page_base &= ~PAGE_MASK;
+ npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
+ for (i = 0; copy_len && i < npages; i++) {
+ curlen = PAGE_SIZE - page_base;
+ if (curlen > copy_len)
+ curlen = copy_len;
+ dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
+ __func__, i, destp, copy_len, curlen);
+ srcp = kmap_atomic(ppages[i]);
+ memcpy(destp, srcp+page_base, curlen);
+ kunmap_atomic(srcp);
+ rqst->rq_svec[0].iov_len += curlen;
+ destp += curlen;
+ copy_len -= curlen;
+ page_base = 0;
+ }
+ /* header now contains entire send message */
+ return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ * [0] -- RPC RDMA header, which uses memory from the *start* of the
+ * preregistered buffer that already holds the RPC data in
+ * its middle.
+ * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ * [2] -- optional padding.
+ * [3] -- if padded, header only in [1] and data here.
+ *
+ * Returns zero on success, otherwise a negative errno.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ char *base;
+ size_t rpclen, padlen;
+ ssize_t hdrlen;
+ enum rpcrdma_chunktype rtype, wtype;
+ struct rpcrdma_msg *headerp;
+
+ /*
+ * rpclen gets amount of data in first buffer, which is the
+ * pre-registered buffer.
+ */
+ base = rqst->rq_svec[0].iov_base;
+ rpclen = rqst->rq_svec[0].iov_len;
+
+ headerp = rdmab_to_msg(req->rl_rdmabuf);
+ /* don't byte-swap XID, it's already done in request */
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
+ headerp->rm_type = rdma_msg;
+
+ /*
+ * Chunks needed for results?
+ *
+ * o If the expected result is under the inline threshold, all ops
+ * return as inline (but see later).
+ * o Large non-read ops return as a single reply chunk.
+ * o Large read ops return data as write chunk(s), header as inline.
+ *
+ * Note: the NFS code sending down multiple result segments implies
+ * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+ */
+
+ /*
+ * This code can handle read chunks, write chunks OR reply
+ * chunks -- only one type. If the request is too big to fit
+ * inline, then we will choose read chunks. If the request is
+ * a READ, then use write chunks to separate the file data
+ * into pages; otherwise use reply chunks.
+ */
+ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+ wtype = rpcrdma_noch;
+ else if (rqst->rq_rcv_buf.page_len == 0)
+ wtype = rpcrdma_replych;
+ else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ wtype = rpcrdma_writech;
+ else
+ wtype = rpcrdma_replych;
+
+ /*
+ * Chunks needed for arguments?
+ *
+ * o If the total request is under the inline threshold, all ops
+ * are sent as inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
+ * o Large write ops transmit data as read chunk(s), header as
+ * inline.
+ *
+ * Note: the NFS code sending down multiple argument segments
+ * implies the op is a write.
+ * TBD check NFSv4 setacl
+ */
+ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ rtype = rpcrdma_noch;
+ else if (rqst->rq_snd_buf.page_len == 0)
+ rtype = rpcrdma_areadch;
+ else
+ rtype = rpcrdma_readch;
+
+ /* The following simplification is not true forever */
+ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+ wtype = rpcrdma_noch;
+ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
+ __func__);
+ return -EIO;
+ }
+
+ hdrlen = RPCRDMA_HDRLEN_MIN;
+ padlen = 0;
+
+ /*
+ * Pull up any extra send data into the preregistered buffer.
+ * When padding is in use and applies to the transfer, insert
+ * it and change the message type.
+ */
+ if (rtype == rpcrdma_noch) {
+
+ padlen = rpcrdma_inline_pullup(rqst,
+ RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+ if (padlen) {
+ headerp->rm_type = rdma_msgp;
+ headerp->rm_body.rm_padded.rm_align =
+ cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
+ headerp->rm_body.rm_padded.rm_thresh =
+ cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
+ headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+ headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+ headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+ hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+ if (wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: invalid chunk list\n",
+ __func__);
+ return -EIO;
+ }
+ } else {
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+ /* new length after pullup */
+ rpclen = rqst->rq_svec[0].iov_len;
+ /*
+ * Currently we try to not actually use read inline.
+ * Reply chunks have the desirable property that
+ * they land, packed, directly in the target buffers
+ * without headers, so they require no fixup. The
+ * additional RDMA Write op sends the same amount
+ * of data, streams on-the-wire and adds no overhead
+ * on receive. Therefore, we request a reply chunk
+ * for non-writes wherever feasible and efficient.
+ */
+ if (wtype == rpcrdma_noch)
+ wtype = rpcrdma_replych;
+ }
+ }
+
+ if (rtype != rpcrdma_noch) {
+ hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+ headerp, rtype);
+ wtype = rtype; /* simplify dprintk */
+
+ } else if (wtype != rpcrdma_noch) {
+ hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+ headerp, wtype);
+ }
+ if (hdrlen < 0)
+ return hdrlen;
+
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ " headerp 0x%p base 0x%p lkey 0x%x\n",
+ __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ headerp, base, rdmab_lkey(req->rl_rdmabuf));
+
+ /*
+ * initialize send_iov's - normally only two: rdma chunk header and
+ * single preregistered RPC header buffer, but if padding is present,
+ * then use a preregistered (and zeroed) pad buffer between the RPC
+ * header and any write data. In all non-rdma cases, any following
+ * data has been copied into the RPC header buffer.
+ */
+ req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
+ req->rl_send_iov[0].length = hdrlen;
+ req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+
+ req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
+ req->rl_send_iov[1].length = rpclen;
+ req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
+
+ req->rl_niovs = 2;
+
+ if (padlen) {
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+ req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
+ req->rl_send_iov[2].length = padlen;
+ req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
+
+ req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+ req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+ req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
+
+ req->rl_niovs = 4;
+ }
+
+ return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+{
+ unsigned int i, total_len;
+ struct rpcrdma_write_chunk *cur_wchunk;
+ char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
+
+ i = be32_to_cpu(**iptrp);
+ if (i > max)
+ return -1;
+ cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+ total_len = 0;
+ while (i--) {
+ struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+ ifdebug(FACILITY) {
+ u64 off;
+ xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
+ dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
+ __func__,
+ be32_to_cpu(seg->rs_length),
+ (unsigned long long)off,
+ be32_to_cpu(seg->rs_handle));
+ }
+ total_len += be32_to_cpu(seg->rs_length);
+ ++cur_wchunk;
+ }
+ /* check and adjust for properly terminated write chunk */
+ if (wrchunk) {
+ __be32 *w = (__be32 *) cur_wchunk;
+ if (*w++ != xdr_zero)
+ return -1;
+ cur_wchunk = (struct rpcrdma_write_chunk *) w;
+ }
+ if ((char *)cur_wchunk > base + rep->rr_len)
+ return -1;
+
+ *iptrp = (__be32 *) cur_wchunk;
+ return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+{
+ int i, npages, curlen, olen;
+ char *destp;
+ struct page **ppages;
+ int page_base;
+
+ curlen = rqst->rq_rcv_buf.head[0].iov_len;
+ if (curlen > copy_len) { /* write chunk header fixup */
+ curlen = copy_len;
+ rqst->rq_rcv_buf.head[0].iov_len = curlen;
+ }
+
+ dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
+ __func__, srcp, copy_len, curlen);
+
+ /* Shift pointer for first receive segment only */
+ rqst->rq_rcv_buf.head[0].iov_base = srcp;
+ srcp += curlen;
+ copy_len -= curlen;
+
+ olen = copy_len;
+ i = 0;
+ rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+ page_base = rqst->rq_rcv_buf.page_base;
+ ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
+ page_base &= ~PAGE_MASK;
+
+ if (copy_len && rqst->rq_rcv_buf.page_len) {
+ npages = PAGE_ALIGN(page_base +
+ rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+ for (; i < npages; i++) {
+ curlen = PAGE_SIZE - page_base;
+ if (curlen > copy_len)
+ curlen = copy_len;
+ dprintk("RPC: %s: page %d"
+ " srcp 0x%p len %d curlen %d\n",
+ __func__, i, srcp, copy_len, curlen);
+ destp = kmap_atomic(ppages[i]);
+ memcpy(destp + page_base, srcp, curlen);
+ flush_dcache_page(ppages[i]);
+ kunmap_atomic(destp);
+ srcp += curlen;
+ copy_len -= curlen;
+ if (copy_len == 0)
+ break;
+ page_base = 0;
+ }
+ }
+
+ if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+ curlen = copy_len;
+ if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+ curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+ if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+ memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+ dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
+ __func__, srcp, copy_len, curlen);
+ rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+ copy_len -= curlen; ++i;
+ } else
+ rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+ if (pad) {
+ /* implicit padding on terminal chunk */
+ unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+ while (pad--)
+ p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+ }
+
+ if (copy_len)
+ dprintk("RPC: %s: %d bytes in"
+ " %d extra segments (%d lost)\n",
+ __func__, olen, i, copy_len);
+
+ /* TBD avoid a warning from call_decode() */
+ rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ep, struct rpcrdma_xprt, rx_ep);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (++xprt->connect_cookie == 0) /* maintain a reserved value */
+ ++xprt->connect_cookie;
+ if (ep->rep_connected > 0) {
+ if (!xprt_test_and_set_connected(xprt))
+ xprt_wake_pending_tasks(xprt, 0);
+ } else {
+ if (xprt_test_and_clear_connected(xprt))
+ xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ }
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+ struct rpcrdma_msg *headerp;
+ struct rpcrdma_req *req;
+ struct rpc_rqst *rqst;
+ struct rpc_xprt *xprt = rep->rr_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ __be32 *iptr;
+ int rdmalen, status;
+ unsigned long cwnd;
+ u32 credits;
+
+ /* Check status. If bad, signal disconnect and return rep to pool */
+ if (rep->rr_len == ~0U) {
+ rpcrdma_recv_buffer_put(rep);
+ if (r_xprt->rx_ep.rep_connected == 1) {
+ r_xprt->rx_ep.rep_connected = -EIO;
+ rpcrdma_conn_func(&r_xprt->rx_ep);
+ }
+ return;
+ }
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
+ dprintk("RPC: %s: short/invalid reply\n", __func__);
+ goto repost;
+ }
+ headerp = rdmab_to_msg(rep->rr_rdmabuf);
+ if (headerp->rm_vers != rpcrdma_version) {
+ dprintk("RPC: %s: invalid version %d\n",
+ __func__, be32_to_cpu(headerp->rm_vers));
+ goto repost;
+ }
+
+ /* Get XID and try for a match. */
+ spin_lock(&xprt->transport_lock);
+ rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+ if (rqst == NULL) {
+ spin_unlock(&xprt->transport_lock);
+ dprintk("RPC: %s: reply 0x%p failed "
+ "to match any request xid 0x%08x len %d\n",
+ __func__, rep, be32_to_cpu(headerp->rm_xid),
+ rep->rr_len);
+repost:
+ r_xprt->rx_stats.bad_reply_count++;
+ rep->rr_func = rpcrdma_reply_handler;
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ rpcrdma_recv_buffer_put(rep);
+
+ return;
+ }
+
+ /* get request object */
+ req = rpcr_to_rdmar(rqst);
+ if (req->rl_reply) {
+ spin_unlock(&xprt->transport_lock);
+ dprintk("RPC: %s: duplicate reply 0x%p to RPC "
+ "request 0x%p: xid 0x%08x\n", __func__, rep, req,
+ be32_to_cpu(headerp->rm_xid));
+ goto repost;
+ }
+
+ dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
+ " RPC request 0x%p xid 0x%08x\n",
+ __func__, rep, req, rqst,
+ be32_to_cpu(headerp->rm_xid));
+
+ /* from here on, the reply is no longer an orphan */
+ req->rl_reply = rep;
+ xprt->reestablish_timeout = 0;
+
+ /* check for expected message types */
+ /* The order of some of these tests is important. */
+ switch (headerp->rm_type) {
+ case rdma_msg:
+ /* never expect read chunks */
+ /* never expect reply chunks (two ways to check) */
+ /* never expect write chunks without having offered RDMA */
+ if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+ (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+ headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+ (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+ req->rl_nchunks == 0))
+ goto badheader;
+ if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+ /* count any expected write chunks in read reply */
+ /* start at write chunk array count */
+ iptr = &headerp->rm_body.rm_chunks[2];
+ rdmalen = rpcrdma_count_chunks(rep,
+ req->rl_nchunks, 1, &iptr);
+ /* check for validity, and no reply chunk after */
+ if (rdmalen < 0 || *iptr++ != xdr_zero)
+ goto badheader;
+ rep->rr_len -=
+ ((unsigned char *)iptr - (unsigned char *)headerp);
+ status = rep->rr_len + rdmalen;
+ r_xprt->rx_stats.total_rdma_reply += rdmalen;
+ /* special case - last chunk may omit padding */
+ if (rdmalen &= 3) {
+ rdmalen = 4 - rdmalen;
+ status += rdmalen;
+ }
+ } else {
+ /* else ordinary inline */
+ rdmalen = 0;
+ iptr = (__be32 *)((unsigned char *)headerp +
+ RPCRDMA_HDRLEN_MIN);
+ rep->rr_len -= RPCRDMA_HDRLEN_MIN;
+ status = rep->rr_len;
+ }
+ /* Fix up the rpc results for upper layer */
+ rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+ break;
+
+ case rdma_nomsg:
+ /* never expect read or write chunks, always reply chunks */
+ if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+ headerp->rm_body.rm_chunks[1] != xdr_zero ||
+ headerp->rm_body.rm_chunks[2] != xdr_one ||
+ req->rl_nchunks == 0)
+ goto badheader;
+ iptr = (__be32 *)((unsigned char *)headerp +
+ RPCRDMA_HDRLEN_MIN);
+ rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+ if (rdmalen < 0)
+ goto badheader;
+ r_xprt->rx_stats.total_rdma_reply += rdmalen;
+ /* Reply chunk buffer already is the reply vector - no fixup. */
+ status = rdmalen;
+ break;
+
+badheader:
+ default:
+ dprintk("%s: invalid rpcrdma reply header (type %d):"
+ " chunks[012] == %d %d %d"
+ " expected chunks <= %d\n",
+ __func__, be32_to_cpu(headerp->rm_type),
+ headerp->rm_body.rm_chunks[0],
+ headerp->rm_body.rm_chunks[1],
+ headerp->rm_body.rm_chunks[2],
+ req->rl_nchunks);
+ status = -EIO;
+ r_xprt->rx_stats.bad_reply_count++;
+ break;
+ }
+
+ credits = be32_to_cpu(headerp->rm_credit);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > r_xprt->rx_buf.rb_max_requests)
+ credits = r_xprt->rx_buf.rb_max_requests;
+
+ cwnd = xprt->cwnd;
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(rqst->rq_task);
+
+ dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+ __func__, xprt, rqst, status);
+ xprt_complete_rqst(rqst->rq_task, status);
+ spin_unlock(&xprt->transport_lock);
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000..c1b627026
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* RPC/RDMA parameters */
+unsigned int svcrdma_ord = RPCRDMA_ORD;
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 4096;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
+static unsigned int min_max_inline = 4096;
+static unsigned int max_max_inline = 65536;
+
+atomic_t rdma_stat_recv;
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct workqueue_struct *svc_rdma_wq;
+
+/*
+ * This function implements reading and resetting an atomic_t stat
+ * variable through read/write to a proc file. Any write to the file
+ * resets the associated statistic to zero. Any read returns it's
+ * current value.
+ */
+static int read_reset_stat(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ atomic_t *stat = (atomic_t *)table->data;
+
+ if (!stat)
+ return -EINVAL;
+
+ if (write)
+ atomic_set(stat, 0);
+ else {
+ char str_buf[32];
+ char *data;
+ int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
+ if (len >= 32)
+ return -EFAULT;
+ len = strlen(str_buf);
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+ data = &str_buf[*ppos];
+ len -= *ppos;
+ if (len > *lenp)
+ len = *lenp;
+ if (len && copy_to_user(buffer, str_buf, len))
+ return -EFAULT;
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static struct ctl_table_header *svcrdma_table_header;
+static struct ctl_table svcrdma_parm_table[] = {
+ {
+ .procname = "max_requests",
+ .data = &svcrdma_max_requests,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_requests,
+ .extra2 = &max_max_requests
+ },
+ {
+ .procname = "max_req_size",
+ .data = &svcrdma_max_req_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_max_inline,
+ .extra2 = &max_max_inline
+ },
+ {
+ .procname = "max_outbound_read_requests",
+ .data = &svcrdma_ord,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ord,
+ .extra2 = &max_ord,
+ },
+
+ {
+ .procname = "rdma_stat_read",
+ .data = &rdma_stat_read,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_recv",
+ .data = &rdma_stat_recv,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_write",
+ .data = &rdma_stat_write,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_starve",
+ .data = &rdma_stat_sq_starve,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_starve",
+ .data = &rdma_stat_rq_starve,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_poll",
+ .data = &rdma_stat_rq_poll,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_prod",
+ .data = &rdma_stat_rq_prod,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_poll",
+ .data = &rdma_stat_sq_poll,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_prod",
+ .data = &rdma_stat_sq_prod,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = read_reset_stat,
+ },
+ { },
+};
+
+static struct ctl_table svcrdma_table[] = {
+ {
+ .procname = "svc_rdma",
+ .mode = 0555,
+ .child = svcrdma_parm_table
+ },
+ { },
+};
+
+static struct ctl_table svcrdma_root_table[] = {
+ {
+ .procname = "sunrpc",
+ .mode = 0555,
+ .child = svcrdma_table
+ },
+ { },
+};
+
+void svc_rdma_cleanup(void)
+{
+ dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+ destroy_workqueue(svc_rdma_wq);
+ if (svcrdma_table_header) {
+ unregister_sysctl_table(svcrdma_table_header);
+ svcrdma_table_header = NULL;
+ }
+ svc_unreg_xprt_class(&svc_rdma_class);
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ kmem_cache_destroy(svc_rdma_ctxt_cachep);
+}
+
+int svc_rdma_init(void)
+{
+ dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+ dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
+ dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
+ dprintk("\tsq_depth : %d\n",
+ svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+ dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
+
+ svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
+ if (!svc_rdma_wq)
+ return -ENOMEM;
+
+ if (!svcrdma_table_header)
+ svcrdma_table_header =
+ register_sysctl_table(svcrdma_root_table);
+
+ /* Create the temporary map cache */
+ svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+ sizeof(struct svc_rdma_req_map),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_map_cachep) {
+ printk(KERN_INFO "Could not allocate map cache.\n");
+ goto err0;
+ }
+
+ /* Create the temporary context cache */
+ svc_rdma_ctxt_cachep =
+ kmem_cache_create("svc_rdma_ctxt_cache",
+ sizeof(struct svc_rdma_op_ctxt),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!svc_rdma_ctxt_cachep) {
+ printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+ goto err1;
+ }
+
+ /* Register RDMA with the SVC transport switch */
+ svc_reg_xprt_class(&svc_rdma_class);
+ return 0;
+ err1:
+ kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+ unregister_sysctl_table(svcrdma_table_header);
+ destroy_workqueue(svc_rdma_wq);
+ return -ENOMEM;
+}
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("SVC RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+module_init(svc_rdma_init);
+module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 000000000..b681855cf
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/*
+ * Decodes a read chunk list. The expected format is as follows:
+ * descrim : xdr_one
+ * position : u32 offset into XDR stream
+ * handle : u32 RKEY
+ * . . .
+ * end-of-list: xdr_zero
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+ struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+
+ while (ch->rc_discrim != xdr_zero) {
+ if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
+ (unsigned long)vaend) {
+ dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+ return NULL;
+ }
+ ch++;
+ }
+ return (u32 *)&ch->rc_position;
+}
+
+/*
+ * Decodes a write chunk list. The expected format is as follows:
+ * descrim : xdr_one
+ * nchunks : <count>
+ * handle : u32 RKEY ---+
+ * length : u32 <len of segment> |
+ * offset : remove va + <count>
+ * . . . |
+ * ---+
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+ unsigned long start, end;
+ int nchunks;
+
+ struct rpcrdma_write_array *ary =
+ (struct rpcrdma_write_array *)va;
+
+ /* Check for not write-array */
+ if (ary->wc_discrim == xdr_zero)
+ return (u32 *)&ary->wc_nchunks;
+
+ if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+ (unsigned long)vaend) {
+ dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+ return NULL;
+ }
+ nchunks = ntohl(ary->wc_nchunks);
+
+ start = (unsigned long)&ary->wc_array[0];
+ end = (unsigned long)vaend;
+ if (nchunks < 0 ||
+ nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+ (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
+ dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+ ary, nchunks, vaend);
+ return NULL;
+ }
+ /*
+ * rs_length is the 2nd 4B field in wc_target and taking its
+ * address skips the list terminator
+ */
+ return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
+}
+
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+ unsigned long start, end;
+ int nchunks;
+ struct rpcrdma_write_array *ary =
+ (struct rpcrdma_write_array *)va;
+
+ /* Check for no reply-array */
+ if (ary->wc_discrim == xdr_zero)
+ return (u32 *)&ary->wc_nchunks;
+
+ if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+ (unsigned long)vaend) {
+ dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+ return NULL;
+ }
+ nchunks = ntohl(ary->wc_nchunks);
+
+ start = (unsigned long)&ary->wc_array[0];
+ end = (unsigned long)vaend;
+ if (nchunks < 0 ||
+ nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+ (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
+ dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+ ary, nchunks, vaend);
+ return NULL;
+ }
+ return (u32 *)&ary->wc_array[nchunks];
+}
+
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+ struct svc_rqst *rqstp)
+{
+ struct rpcrdma_msg *rmsgp = NULL;
+ u32 *va;
+ u32 *vaend;
+ u32 hdr_len;
+
+ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+ /* Verify that there's enough bytes for header + something */
+ if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+ dprintk("svcrdma: header too short = %d\n",
+ rqstp->rq_arg.len);
+ return -EINVAL;
+ }
+
+ /* Decode the header */
+ rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+ rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+ rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+ rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+ if (rmsgp->rm_vers != RPCRDMA_VERSION)
+ return -ENOSYS;
+
+ /* Pull in the extra for the padded case and bump our pointer */
+ if (rmsgp->rm_type == RDMA_MSGP) {
+ int hdrlen;
+ rmsgp->rm_body.rm_padded.rm_align =
+ ntohl(rmsgp->rm_body.rm_padded.rm_align);
+ rmsgp->rm_body.rm_padded.rm_thresh =
+ ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+ va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+ rqstp->rq_arg.head[0].iov_base = va;
+ hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+ rqstp->rq_arg.head[0].iov_len -= hdrlen;
+ if (hdrlen > rqstp->rq_arg.len)
+ return -EINVAL;
+ return hdrlen;
+ }
+
+ /* The chunk list may contain either a read chunk list or a write
+ * chunk list and a reply chunk list.
+ */
+ va = &rmsgp->rm_body.rm_chunks[0];
+ vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+ va = decode_read_list(va, vaend);
+ if (!va)
+ return -EINVAL;
+ va = decode_write_list(va, vaend);
+ if (!va)
+ return -EINVAL;
+ va = decode_reply_array(va, vaend);
+ if (!va)
+ return -EINVAL;
+
+ rqstp->rq_arg.head[0].iov_base = va;
+ hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+ rqstp->rq_arg.head[0].iov_len -= hdr_len;
+
+ *rdma_req = rmsgp;
+ return hdr_len;
+}
+
+int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
+{
+ struct rpcrdma_msg *rmsgp = NULL;
+ struct rpcrdma_read_chunk *ch;
+ struct rpcrdma_write_array *ary;
+ u32 *va;
+ u32 hdrlen;
+
+ dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
+ rqstp);
+ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+ /* Pull in the extra for the padded case and bump our pointer */
+ if (rmsgp->rm_type == RDMA_MSGP) {
+ va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+ rqstp->rq_arg.head[0].iov_base = va;
+ hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+ rqstp->rq_arg.head[0].iov_len -= hdrlen;
+ return hdrlen;
+ }
+
+ /*
+ * Skip all chunks to find RPC msg. These were previously processed
+ */
+ va = &rmsgp->rm_body.rm_chunks[0];
+
+ /* Skip read-list */
+ for (ch = (struct rpcrdma_read_chunk *)va;
+ ch->rc_discrim != xdr_zero; ch++);
+ va = (u32 *)&ch->rc_position;
+
+ /* Skip write-list */
+ ary = (struct rpcrdma_write_array *)va;
+ if (ary->wc_discrim == xdr_zero)
+ va = (u32 *)&ary->wc_nchunks;
+ else
+ /*
+ * rs_length is the 2nd 4B field in wc_target and taking its
+ * address skips the list terminator
+ */
+ va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
+
+ /* Skip reply-array */
+ ary = (struct rpcrdma_write_array *)va;
+ if (ary->wc_discrim == xdr_zero)
+ va = (u32 *)&ary->wc_nchunks;
+ else
+ va = (u32 *)&ary->wc_array[ary->wc_nchunks];
+
+ rqstp->rq_arg.head[0].iov_base = va;
+ hdrlen = (unsigned long)va - (unsigned long)rmsgp;
+ rqstp->rq_arg.head[0].iov_len -= hdrlen;
+
+ return hdrlen;
+}
+
+int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rmsgp,
+ enum rpcrdma_errcode err, u32 *va)
+{
+ u32 *startp = va;
+
+ *va++ = htonl(rmsgp->rm_xid);
+ *va++ = htonl(rmsgp->rm_vers);
+ *va++ = htonl(xprt->sc_max_requests);
+ *va++ = htonl(RDMA_ERROR);
+ *va++ = htonl(err);
+ if (err == ERR_VERS) {
+ *va++ = htonl(RPCRDMA_VERSION);
+ *va++ = htonl(RPCRDMA_VERSION);
+ }
+
+ return (int)((unsigned long)va - (unsigned long)startp);
+}
+
+int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_write_array *wr_ary;
+
+ /* There is no read-list in a reply */
+
+ /* skip write list */
+ wr_ary = (struct rpcrdma_write_array *)
+ &rmsgp->rm_body.rm_chunks[1];
+ if (wr_ary->wc_discrim)
+ wr_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+ wc_target.rs_length;
+ else
+ wr_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_nchunks;
+
+ /* skip reply array */
+ if (wr_ary->wc_discrim)
+ wr_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+ else
+ wr_ary = (struct rpcrdma_write_array *)
+ &wr_ary->wc_nchunks;
+
+ return (unsigned long) wr_ary - (unsigned long) rmsgp;
+}
+
+void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
+{
+ struct rpcrdma_write_array *ary;
+
+ /* no read-list */
+ rmsgp->rm_body.rm_chunks[0] = xdr_zero;
+
+ /* write-array discrim */
+ ary = (struct rpcrdma_write_array *)
+ &rmsgp->rm_body.rm_chunks[1];
+ ary->wc_discrim = xdr_one;
+ ary->wc_nchunks = htonl(chunks);
+
+ /* write-list terminator */
+ ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
+
+ /* reply-array discriminator */
+ ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
+}
+
+void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
+ int chunks)
+{
+ ary->wc_discrim = xdr_one;
+ ary->wc_nchunks = htonl(chunks);
+}
+
+void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
+ int chunk_no,
+ __be32 rs_handle,
+ __be64 rs_offset,
+ u32 write_len)
+{
+ struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
+ seg->rs_handle = rs_handle;
+ seg->rs_offset = rs_offset;
+ seg->rs_length = htonl(write_len);
+}
+
+void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_msg *rdma_resp,
+ enum rpcrdma_proc rdma_type)
+{
+ rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
+ rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
+ rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
+ rdma_resp->rm_type = htonl(rdma_type);
+
+ /* Encode <nul> chunks lists */
+ rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
+ rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
+ rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000..f9f13a32d
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/*
+ * Replace the pages in the rq_argpages array with the pages from the SGE in
+ * the RDMA_RECV completion. The SGL should contain full pages up until the
+ * last one.
+ */
+static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt,
+ u32 byte_count)
+{
+ struct rpcrdma_msg *rmsgp;
+ struct page *page;
+ u32 bc;
+ int sge_no;
+
+ /* Swap the page in the SGE with the page in argpages */
+ page = ctxt->pages[0];
+ put_page(rqstp->rq_pages[0]);
+ rqstp->rq_pages[0] = page;
+
+ /* Set up the XDR head */
+ rqstp->rq_arg.head[0].iov_base = page_address(page);
+ rqstp->rq_arg.head[0].iov_len =
+ min_t(size_t, byte_count, ctxt->sge[0].length);
+ rqstp->rq_arg.len = byte_count;
+ rqstp->rq_arg.buflen = byte_count;
+
+ /* Compute bytes past head in the SGL */
+ bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+
+ /* If data remains, store it in the pagelist */
+ rqstp->rq_arg.page_len = bc;
+ rqstp->rq_arg.page_base = 0;
+
+ /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+ if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+ rqstp->rq_arg.pages = &rqstp->rq_pages[0];
+ else
+ rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+
+ sge_no = 1;
+ while (bc && sge_no < ctxt->count) {
+ page = ctxt->pages[sge_no];
+ put_page(rqstp->rq_pages[sge_no]);
+ rqstp->rq_pages[sge_no] = page;
+ bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+ rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+ sge_no++;
+ }
+ rqstp->rq_respages = &rqstp->rq_pages[sge_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* If not all pages were used from the SGL, free the remaining ones */
+ bc = sge_no;
+ while (sge_no < ctxt->count) {
+ page = ctxt->pages[sge_no++];
+ put_page(page);
+ }
+ ctxt->count = bc;
+
+ /* Set up tail */
+ rqstp->rq_arg.tail[0].iov_base = NULL;
+ rqstp->rq_arg.tail[0].iov_len = 0;
+}
+
+static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+{
+ if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
+ RDMA_TRANSPORT_IWARP)
+ return 1;
+ else
+ return min_t(int, sge_count, xprt->sc_max_sge);
+}
+
+/* Issue an RDMA_READ using the local lkey to map the data sink */
+int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ bool last)
+{
+ struct ib_send_wr read_wr;
+ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+ int ret, read, pno;
+ u32 pg_off = *page_offset;
+ u32 pg_no = *page_no;
+
+ ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->read_hdr = head;
+ pages_needed =
+ min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+ for (pno = 0; pno < pages_needed; pno++) {
+ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+ head->arg.page_len += len;
+ head->arg.len += len;
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ ctxt->sge[pno].addr =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ head->arg.pages[pg_no], pg_off,
+ PAGE_SIZE - pg_off,
+ DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+ ctxt->sge[pno].addr);
+ if (ret)
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+
+ /* The lkey here is either a local dma lkey or a dma_mr lkey */
+ ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[pno].length = len;
+ ctxt->count++;
+
+ /* adjust offset and wrap to next page if needed */
+ pg_off += len;
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ pg_no++;
+ }
+ rs_length -= len;
+ }
+
+ if (last && rs_length == 0)
+ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+ memset(&read_wr, 0, sizeof(read_wr));
+ read_wr.wr_id = (unsigned long)ctxt;
+ read_wr.opcode = IB_WR_RDMA_READ;
+ ctxt->wr_op = read_wr.opcode;
+ read_wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.wr.rdma.rkey = rs_handle;
+ read_wr.wr.rdma.remote_addr = rs_offset;
+ read_wr.sg_list = ctxt->sge;
+ read_wr.num_sge = pages_needed;
+
+ ret = svc_rdma_send(xprt, &read_wr);
+ if (ret) {
+ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ goto err;
+ }
+
+ /* return current location in page array */
+ *page_no = pg_no;
+ *page_offset = pg_off;
+ ret = read;
+ atomic_inc(&rdma_stat_read);
+ return ret;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ return ret;
+}
+
+/* Issue an RDMA_READ using an FRMR to map the data sink */
+int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ int *page_no,
+ u32 *page_offset,
+ u32 rs_handle,
+ u32 rs_length,
+ u64 rs_offset,
+ bool last)
+{
+ struct ib_send_wr read_wr;
+ struct ib_send_wr inv_wr;
+ struct ib_send_wr fastreg_wr;
+ u8 key;
+ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+ struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
+ int ret, read, pno;
+ u32 pg_off = *page_offset;
+ u32 pg_no = *page_no;
+
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+
+ ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->frmr = frmr;
+ pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
+ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+ frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
+ frmr->direction = DMA_FROM_DEVICE;
+ frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+ frmr->map_len = pages_needed << PAGE_SHIFT;
+ frmr->page_list_len = pages_needed;
+
+ for (pno = 0; pno < pages_needed; pno++) {
+ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+ head->arg.page_len += len;
+ head->arg.len += len;
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ frmr->page_list->page_list[pno] =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ head->arg.pages[pg_no], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE);
+ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+ frmr->page_list->page_list[pno]);
+ if (ret)
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+
+ /* adjust offset and wrap to next page if needed */
+ pg_off += len;
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ pg_no++;
+ }
+ rs_length -= len;
+ }
+
+ if (last && rs_length == 0)
+ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ else
+ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+ /* Bump the key */
+ key = (u8)(frmr->mr->lkey & 0x000000FF);
+ ib_update_fast_reg_key(frmr->mr, ++key);
+
+ ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+ ctxt->sge[0].lkey = frmr->mr->lkey;
+ ctxt->sge[0].length = read;
+ ctxt->count = 1;
+ ctxt->read_hdr = head;
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.send_flags = IB_SEND_SIGNALED;
+ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+ fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = frmr->map_len;
+ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+ fastreg_wr.next = &read_wr;
+
+ /* Prepare RDMA_READ */
+ memset(&read_wr, 0, sizeof(read_wr));
+ read_wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.wr.rdma.rkey = rs_handle;
+ read_wr.wr.rdma.remote_addr = rs_offset;
+ read_wr.sg_list = ctxt->sge;
+ read_wr.num_sge = 1;
+ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
+ read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ read_wr.wr_id = (unsigned long)ctxt;
+ read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+ } else {
+ read_wr.opcode = IB_WR_RDMA_READ;
+ read_wr.next = &inv_wr;
+ /* Prepare invalidate */
+ memset(&inv_wr, 0, sizeof(inv_wr));
+ inv_wr.wr_id = (unsigned long)ctxt;
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+ inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
+ inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
+ }
+ ctxt->wr_op = read_wr.opcode;
+
+ /* Post the chain */
+ ret = svc_rdma_send(xprt, &fastreg_wr);
+ if (ret) {
+ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ goto err;
+ }
+
+ /* return current location in page array */
+ *page_no = pg_no;
+ *page_offset = pg_off;
+ ret = read;
+ atomic_inc(&rdma_stat_read);
+ return ret;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ svc_rdma_put_frmr(xprt, frmr);
+ return ret;
+}
+
+static unsigned int
+rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch)
+{
+ unsigned int count;
+
+ for (count = 0; ch->rc_discrim != xdr_zero; ch++)
+ count++;
+ return count;
+}
+
+/* If there was additional inline content, append it to the end of arg.pages.
+ * Tail copy has to be done after the reader function has determined how many
+ * pages are needed for RDMA READ.
+ */
+static int
+rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head,
+ u32 position, u32 byte_count, u32 page_offset, int page_no)
+{
+ char *srcp, *destp;
+ int ret;
+
+ ret = 0;
+ srcp = head->arg.head[0].iov_base + position;
+ byte_count = head->arg.head[0].iov_len - position;
+ if (byte_count > PAGE_SIZE) {
+ dprintk("svcrdma: large tail unsupported\n");
+ return 0;
+ }
+
+ /* Fit as much of the tail on the current page as possible */
+ if (page_offset != PAGE_SIZE) {
+ destp = page_address(rqstp->rq_arg.pages[page_no]);
+ destp += page_offset;
+ while (byte_count--) {
+ *destp++ = *srcp++;
+ page_offset++;
+ if (page_offset == PAGE_SIZE && byte_count)
+ goto more;
+ }
+ goto done;
+ }
+
+more:
+ /* Fit the rest on the next page */
+ page_no++;
+ destp = page_address(rqstp->rq_arg.pages[page_no]);
+ while (byte_count--)
+ *destp++ = *srcp++;
+
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+done:
+ byte_count = head->arg.head[0].iov_len - position;
+ head->arg.page_len += byte_count;
+ head->arg.len += byte_count;
+ head->arg.buflen += byte_count;
+ return 1;
+}
+
+static int rdma_read_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rmsgp,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
+{
+ int page_no, ret;
+ struct rpcrdma_read_chunk *ch;
+ u32 handle, page_offset, byte_count;
+ u32 position;
+ u64 rs_offset;
+ bool last;
+
+ /* If no read list is present, return 0 */
+ ch = svc_rdma_get_read_chunk(rmsgp);
+ if (!ch)
+ return 0;
+
+ if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES)
+ return -EINVAL;
+
+ /* The request is completed when the RDMA_READs complete. The
+ * head context keeps all the pages that comprise the
+ * request.
+ */
+ head->arg.head[0] = rqstp->rq_arg.head[0];
+ head->arg.tail[0] = rqstp->rq_arg.tail[0];
+ head->hdr_count = head->count;
+ head->arg.page_base = 0;
+ head->arg.page_len = 0;
+ head->arg.len = rqstp->rq_arg.len;
+ head->arg.buflen = rqstp->rq_arg.buflen;
+
+ ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ position = be32_to_cpu(ch->rc_position);
+
+ /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
+ if (position == 0) {
+ head->arg.pages = &head->pages[0];
+ page_offset = head->byte_len;
+ } else {
+ head->arg.pages = &head->pages[head->count];
+ page_offset = 0;
+ }
+
+ ret = 0;
+ page_no = 0;
+ for (; ch->rc_discrim != xdr_zero; ch++) {
+ if (be32_to_cpu(ch->rc_position) != position)
+ goto err;
+
+ handle = be32_to_cpu(ch->rc_target.rs_handle),
+ byte_count = be32_to_cpu(ch->rc_target.rs_length);
+ xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
+ &rs_offset);
+
+ while (byte_count > 0) {
+ last = (ch + 1)->rc_discrim == xdr_zero;
+ ret = xprt->sc_reader(xprt, rqstp, head,
+ &page_no, &page_offset,
+ handle, byte_count,
+ rs_offset, last);
+ if (ret < 0)
+ goto err;
+ byte_count -= ret;
+ rs_offset += ret;
+ head->arg.buflen += ret;
+ }
+ }
+
+ /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */
+ if (page_offset & 3) {
+ u32 pad = 4 - (page_offset & 3);
+
+ head->arg.page_len += pad;
+ head->arg.len += pad;
+ head->arg.buflen += pad;
+ page_offset += pad;
+ }
+
+ ret = 1;
+ if (position && position < head->arg.head[0].iov_len)
+ ret = rdma_copy_tail(rqstp, head, position,
+ byte_count, page_offset, page_no);
+ head->arg.head[0].iov_len = position;
+ head->position = position;
+
+ err:
+ /* Detach arg pages. svc_recv will replenish them */
+ for (page_no = 0;
+ &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
+ rqstp->rq_pages[page_no] = NULL;
+
+ return ret;
+}
+
+static int rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head)
+{
+ int page_no;
+ int ret;
+
+ /* Copy RPC pages */
+ for (page_no = 0; page_no < head->count; page_no++) {
+ put_page(rqstp->rq_pages[page_no]);
+ rqstp->rq_pages[page_no] = head->pages[page_no];
+ }
+
+ /* Adjustments made for RDMA_NOMSG type requests */
+ if (head->position == 0) {
+ if (head->arg.len <= head->sge[0].length) {
+ head->arg.head[0].iov_len = head->arg.len -
+ head->byte_len;
+ head->arg.page_len = 0;
+ } else {
+ head->arg.head[0].iov_len = head->sge[0].length -
+ head->byte_len;
+ head->arg.page_len = head->arg.len -
+ head->sge[0].length;
+ }
+ }
+
+ /* Point rq_arg.pages past header */
+ rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
+ rqstp->rq_arg.page_len = head->arg.page_len;
+ rqstp->rq_arg.page_base = head->arg.page_base;
+
+ /* rq_respages starts after the last arg page */
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Rebuild rq_arg head and tail. */
+ rqstp->rq_arg.head[0] = head->arg.head[0];
+ rqstp->rq_arg.tail[0] = head->arg.tail[0];
+ rqstp->rq_arg.len = head->arg.len;
+ rqstp->rq_arg.buflen = head->arg.buflen;
+
+ /* Free the context */
+ svc_rdma_put_context(head, 0);
+
+ /* XXX: What should this be? */
+ rqstp->rq_prot = IPPROTO_MAX;
+ svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
+
+ ret = rqstp->rq_arg.head[0].iov_len
+ + rqstp->rq_arg.page_len
+ + rqstp->rq_arg.tail[0].iov_len;
+ dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n",
+ ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
+ rqstp->rq_arg.head[0].iov_len);
+
+ return ret;
+}
+
+/*
+ * Set up the rqstp thread context to point to the RQ buffer. If
+ * necessary, pull additional data from the client with an RDMA_READ
+ * request.
+ */
+int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svcxprt_rdma *rdma_xprt =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+ struct rpcrdma_msg *rmsgp;
+ int ret = 0;
+ int len;
+
+ dprintk("svcrdma: rqstp=%p\n", rqstp);
+
+ spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+ if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
+ ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ return rdma_read_complete(rqstp, ctxt);
+ } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+ ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ } else {
+ atomic_inc(&rdma_stat_rq_starve);
+ clear_bit(XPT_DATA, &xprt->xpt_flags);
+ ctxt = NULL;
+ }
+ spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ if (!ctxt) {
+ /* This is the EAGAIN path. The svc_recv routine will
+ * return -EAGAIN, the nfsd thread will go to call into
+ * svc_recv again and we shouldn't be on the active
+ * transport list
+ */
+ if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+ goto close_out;
+
+ goto out;
+ }
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+ ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ atomic_inc(&rdma_stat_recv);
+
+ /* Build up the XDR from the receive buffers. */
+ rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+
+ /* Decode the RDMA header. */
+ len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
+ rqstp->rq_xprt_hlen = len;
+
+ /* If the request is invalid, reply with an error */
+ if (len < 0) {
+ if (len == -ENOSYS)
+ svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+ goto close_out;
+ }
+
+ /* Read read-list data. */
+ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
+ if (ret > 0) {
+ /* read-list posted, defer until data received from client. */
+ goto defer;
+ } else if (ret < 0) {
+ /* Post of read-list failed, free context. */
+ svc_rdma_put_context(ctxt, 1);
+ return 0;
+ }
+
+ ret = rqstp->rq_arg.head[0].iov_len
+ + rqstp->rq_arg.page_len
+ + rqstp->rq_arg.tail[0].iov_len;
+ svc_rdma_put_context(ctxt, 0);
+ out:
+ dprintk("svcrdma: ret=%d, rq_arg.len=%u, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n",
+ ret, rqstp->rq_arg.len,
+ rqstp->rq_arg.head[0].iov_base,
+ rqstp->rq_arg.head[0].iov_len);
+ rqstp->rq_prot = IPPROTO_MAX;
+ svc_xprt_copy_addrs(rqstp, xprt);
+ return ret;
+
+ close_out:
+ if (ctxt)
+ svc_rdma_put_context(ctxt, 1);
+ dprintk("svcrdma: transport %p is closing\n", xprt);
+ /*
+ * Set the close bit and enqueue it. svc_recv will see the
+ * close bit and call svc_xprt_delete
+ */
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+defer:
+ return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000..7de33d1af
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+static int map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
+{
+ int sge_no;
+ u32 sge_bytes;
+ u32 page_bytes;
+ u32 page_off;
+ int page_no;
+
+ if (xdr->len !=
+ (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
+ pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+ return -EIO;
+ }
+
+ /* Skip the first sge, this is for the RPCRDMA header */
+ sge_no = 1;
+
+ /* Head SGE */
+ vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+ sge_no++;
+
+ /* pages SGE */
+ page_no = 0;
+ page_bytes = xdr->page_len;
+ page_off = xdr->page_base;
+ while (page_bytes) {
+ vec->sge[sge_no].iov_base =
+ page_address(xdr->pages[page_no]) + page_off;
+ sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+ page_bytes -= sge_bytes;
+ vec->sge[sge_no].iov_len = sge_bytes;
+
+ sge_no++;
+ page_no++;
+ page_off = 0; /* reset for next time through loop */
+ }
+
+ /* Tail SGE */
+ if (xdr->tail[0].iov_len) {
+ vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+ vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+ sge_no++;
+ }
+
+ dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+ "page_base %u page_len %u head_len %zu tail_len %zu\n",
+ sge_no, page_no, xdr->page_base, xdr->page_len,
+ xdr->head[0].iov_len, xdr->tail[0].iov_len);
+
+ vec->count = sge_no;
+ return 0;
+}
+
+static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ u32 xdr_off, size_t len, int dir)
+{
+ struct page *page;
+ dma_addr_t dma_addr;
+ if (xdr_off < xdr->head[0].iov_len) {
+ /* This offset is in the head */
+ xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+ page = virt_to_page(xdr->head[0].iov_base);
+ } else {
+ xdr_off -= xdr->head[0].iov_len;
+ if (xdr_off < xdr->page_len) {
+ /* This offset is in the page list */
+ xdr_off += xdr->page_base;
+ page = xdr->pages[xdr_off >> PAGE_SHIFT];
+ xdr_off &= ~PAGE_MASK;
+ } else {
+ /* This offset is in the tail */
+ xdr_off -= xdr->page_len;
+ xdr_off += (unsigned long)
+ xdr->tail[0].iov_base & ~PAGE_MASK;
+ page = virt_to_page(xdr->tail[0].iov_base);
+ }
+ }
+ dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
+ min_t(size_t, PAGE_SIZE, len), dir);
+ return dma_addr;
+}
+
+/* Assumptions:
+ * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
+ */
+static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ u32 rmr, u64 to,
+ u32 xdr_off, int write_len,
+ struct svc_rdma_req_map *vec)
+{
+ struct ib_send_wr write_wr;
+ struct ib_sge *sge;
+ int xdr_sge_no;
+ int sge_no;
+ int sge_bytes;
+ int sge_off;
+ int bc;
+ struct svc_rdma_op_ctxt *ctxt;
+
+ if (vec->count > RPCSVC_MAXPAGES) {
+ pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+ return -EIO;
+ }
+
+ dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
+ "write_len=%d, vec->sge=%p, vec->count=%lu\n",
+ rmr, (unsigned long long)to, xdr_off,
+ write_len, vec->sge, vec->count);
+
+ ctxt = svc_rdma_get_context(xprt);
+ ctxt->direction = DMA_TO_DEVICE;
+ sge = ctxt->sge;
+
+ /* Find the SGE associated with xdr_off */
+ for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
+ xdr_sge_no++) {
+ if (vec->sge[xdr_sge_no].iov_len > bc)
+ break;
+ bc -= vec->sge[xdr_sge_no].iov_len;
+ }
+
+ sge_off = bc;
+ bc = write_len;
+ sge_no = 0;
+
+ /* Copy the remaining SGE */
+ while (bc != 0) {
+ sge_bytes = min_t(size_t,
+ bc, vec->sge[xdr_sge_no].iov_len-sge_off);
+ sge[sge_no].length = sge_bytes;
+ sge[sge_no].addr =
+ dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+ sge_bytes, DMA_TO_DEVICE);
+ xdr_off += sge_bytes;
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+ sge[sge_no].addr))
+ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+ sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->count++;
+ sge_off = 0;
+ sge_no++;
+ xdr_sge_no++;
+ if (xdr_sge_no > vec->count) {
+ pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
+ goto err;
+ }
+ bc -= sge_bytes;
+ if (sge_no == xprt->sc_max_sge)
+ break;
+ }
+
+ /* Prepare WRITE WR */
+ memset(&write_wr, 0, sizeof write_wr);
+ ctxt->wr_op = IB_WR_RDMA_WRITE;
+ write_wr.wr_id = (unsigned long)ctxt;
+ write_wr.sg_list = &sge[0];
+ write_wr.num_sge = sge_no;
+ write_wr.opcode = IB_WR_RDMA_WRITE;
+ write_wr.send_flags = IB_SEND_SIGNALED;
+ write_wr.wr.rdma.rkey = rmr;
+ write_wr.wr.rdma.remote_addr = to;
+
+ /* Post It */
+ atomic_inc(&rdma_stat_write);
+ if (svc_rdma_send(xprt, &write_wr))
+ goto err;
+ return write_len - bc;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+ /* Fatal error, close transport */
+ return -EIO;
+}
+
+static int send_write_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_req_map *vec)
+{
+ u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+ int write_len;
+ u32 xdr_off;
+ int chunk_off;
+ int chunk_no;
+ struct rpcrdma_write_array *arg_ary;
+ struct rpcrdma_write_array *res_ary;
+ int ret;
+
+ arg_ary = svc_rdma_get_write_array(rdma_argp);
+ if (!arg_ary)
+ return 0;
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[1];
+
+ /* Write chunks start at the pagelist */
+ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+ xfer_len && chunk_no < arg_ary->wc_nchunks;
+ chunk_no++) {
+ struct rpcrdma_segment *arg_ch;
+ u64 rs_offset;
+
+ arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
+ write_len = min(xfer_len, ntohl(arg_ch->rs_length));
+
+ /* Prepare the response chunk given the length actually
+ * written */
+ xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
+ svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+ arg_ch->rs_handle,
+ arg_ch->rs_offset,
+ write_len);
+ chunk_off = 0;
+ while (write_len) {
+ ret = send_write(xprt, rqstp,
+ ntohl(arg_ch->rs_handle),
+ rs_offset + chunk_off,
+ xdr_off,
+ write_len,
+ vec);
+ if (ret <= 0) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+ chunk_off += ret;
+ xdr_off += ret;
+ xfer_len -= ret;
+ write_len -= ret;
+ }
+ }
+ /* Update the req with the number of chunks actually used */
+ svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
+
+ return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+}
+
+static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_req_map *vec)
+{
+ u32 xfer_len = rqstp->rq_res.len;
+ int write_len;
+ u32 xdr_off;
+ int chunk_no;
+ int chunk_off;
+ int nchunks;
+ struct rpcrdma_segment *ch;
+ struct rpcrdma_write_array *arg_ary;
+ struct rpcrdma_write_array *res_ary;
+ int ret;
+
+ arg_ary = svc_rdma_get_reply_array(rdma_argp);
+ if (!arg_ary)
+ return 0;
+ /* XXX: need to fix when reply lists occur with read-list and or
+ * write-list */
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[2];
+
+ /* xdr offset starts at RPC message */
+ nchunks = ntohl(arg_ary->wc_nchunks);
+ for (xdr_off = 0, chunk_no = 0;
+ xfer_len && chunk_no < nchunks;
+ chunk_no++) {
+ u64 rs_offset;
+ ch = &arg_ary->wc_array[chunk_no].wc_target;
+ write_len = min(xfer_len, htonl(ch->rs_length));
+
+ /* Prepare the reply chunk given the length actually
+ * written */
+ xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
+ svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+ ch->rs_handle, ch->rs_offset,
+ write_len);
+ chunk_off = 0;
+ while (write_len) {
+ ret = send_write(xprt, rqstp,
+ ntohl(ch->rs_handle),
+ rs_offset + chunk_off,
+ xdr_off,
+ write_len,
+ vec);
+ if (ret <= 0) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+ chunk_off += ret;
+ xdr_off += ret;
+ xfer_len -= ret;
+ write_len -= ret;
+ }
+ }
+ /* Update the req with the number of chunks actually used */
+ svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+
+ return rqstp->rq_res.len;
+}
+
+/* This function prepares the portion of the RPCRDMA message to be
+ * sent in the RDMA_SEND. This function is called after data sent via
+ * RDMA has already been transmitted. There are three cases:
+ * - The RPCRDMA header, RPC header, and payload are all sent in a
+ * single RDMA_SEND. This is the "inline" case.
+ * - The RPCRDMA header and some portion of the RPC header and data
+ * are sent via this RDMA_SEND and another portion of the data is
+ * sent via RDMA.
+ * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
+ * header and data are all transmitted via RDMA.
+ * In all three cases, this function prepares the RPCRDMA header in
+ * sge[0], the 'type' parameter indicates the type to place in the
+ * RPCRDMA header, and the 'byte_count' field indicates how much of
+ * the XDR to include in this RDMA_SEND. NB: The offset of the payload
+ * to send is zero in the XDR.
+ */
+static int send_reply(struct svcxprt_rdma *rdma,
+ struct svc_rqst *rqstp,
+ struct page *page,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct svc_rdma_req_map *vec,
+ int byte_count)
+{
+ struct ib_send_wr send_wr;
+ int sge_no;
+ int sge_bytes;
+ int page_no;
+ int pages;
+ int ret;
+
+ /* Post a recv buffer to handle another request. */
+ ret = svc_rdma_post_recv(rdma);
+ if (ret) {
+ printk(KERN_INFO
+ "svcrdma: could not post a receive buffer, err=%d."
+ "Closing transport %p.\n", ret, rdma);
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 0);
+ return -ENOTCONN;
+ }
+
+ /* Prepare the context */
+ ctxt->pages[0] = page;
+ ctxt->count = 1;
+
+ /* Prepare the SGE for the RPCRDMA Header */
+ ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+ ctxt->sge[0].addr =
+ ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
+ ctxt->sge[0].length, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+
+ ctxt->direction = DMA_TO_DEVICE;
+
+ /* Map the payload indicated by 'byte_count' */
+ for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+ int xdr_off = 0;
+ sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
+ byte_count -= sge_bytes;
+ ctxt->sge[sge_no].addr =
+ dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+ sge_bytes, DMA_TO_DEVICE);
+ xdr_off += sge_bytes;
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+ ctxt->sge[sge_no].addr))
+ goto err;
+ atomic_inc(&rdma->sc_dma_used);
+ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[sge_no].length = sge_bytes;
+ }
+ if (byte_count != 0) {
+ pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+ goto err;
+ }
+
+ /* Save all respages in the ctxt and remove them from the
+ * respages array. They are our pages until the I/O
+ * completes.
+ */
+ pages = rqstp->rq_next_page - rqstp->rq_respages;
+ for (page_no = 0; page_no < pages; page_no++) {
+ ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
+ ctxt->count++;
+ rqstp->rq_respages[page_no] = NULL;
+ /*
+ * If there are more pages than SGE, terminate SGE
+ * list so that svc_rdma_unmap_dma doesn't attempt to
+ * unmap garbage.
+ */
+ if (page_no+1 >= sge_no)
+ ctxt->sge[page_no+1].length = 0;
+ }
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ if (sge_no > rdma->sc_max_sge) {
+ pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ goto err;
+ }
+ memset(&send_wr, 0, sizeof send_wr);
+ ctxt->wr_op = IB_WR_SEND;
+ send_wr.wr_id = (unsigned long)ctxt;
+ send_wr.sg_list = ctxt->sge;
+ send_wr.num_sge = sge_no;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret)
+ goto err;
+
+ return 0;
+
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ return -EIO;
+}
+
+void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+int svc_rdma_sendto(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ struct rpcrdma_msg *rdma_argp;
+ struct rpcrdma_msg *rdma_resp;
+ struct rpcrdma_write_array *reply_ary;
+ enum rpcrdma_proc reply_type;
+ int ret;
+ int inline_bytes;
+ struct page *res_page;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
+
+ dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+
+ /* Get the RDMA request header. The receive logic always
+ * places this at the start of page 0.
+ */
+ rdma_argp = page_address(rqstp->rq_pages[0]);
+
+ /* Build an req vec for the XDR */
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->direction = DMA_TO_DEVICE;
+ vec = svc_rdma_get_req_map();
+ ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ if (ret)
+ goto err0;
+ inline_bytes = rqstp->rq_res.len;
+
+ /* Create the RDMA response header */
+ res_page = svc_rdma_get_page();
+ rdma_resp = page_address(res_page);
+ reply_ary = svc_rdma_get_reply_array(rdma_argp);
+ if (reply_ary)
+ reply_type = RDMA_NOMSG;
+ else
+ reply_type = RDMA_MSG;
+ svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
+ rdma_resp, reply_type);
+
+ /* Send any write-chunk data and build resp write-list */
+ ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
+ rqstp, vec);
+ if (ret < 0) {
+ printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
+ ret);
+ goto err1;
+ }
+ inline_bytes -= ret;
+
+ /* Send any reply-list data and update resp reply-list */
+ ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
+ rqstp, vec);
+ if (ret < 0) {
+ printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
+ ret);
+ goto err1;
+ }
+ inline_bytes -= ret;
+
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+ inline_bytes);
+ svc_rdma_put_req_map(vec);
+ dprintk("svcrdma: send_reply returns %d\n", ret);
+ return ret;
+
+ err1:
+ put_page(res_page);
+ err0:
+ svc_rdma_put_req_map(vec);
+ svc_rdma_put_context(ctxt, 0);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000..f609c1c2d
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/export.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags);
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
+static void svc_rdma_release_rqst(struct svc_rqst *);
+static void dto_tasklet_func(unsigned long data);
+static void svc_rdma_detach(struct svc_xprt *xprt);
+static void svc_rdma_free(struct svc_xprt *xprt);
+static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static int svc_rdma_secure_port(struct svc_rqst *);
+static void rq_cq_reap(struct svcxprt_rdma *xprt);
+static void sq_cq_reap(struct svcxprt_rdma *xprt);
+
+static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
+static DEFINE_SPINLOCK(dto_lock);
+static LIST_HEAD(dto_xprt_q);
+
+static struct svc_xprt_ops svc_rdma_ops = {
+ .xpo_create = svc_rdma_create,
+ .xpo_recvfrom = svc_rdma_recvfrom,
+ .xpo_sendto = svc_rdma_sendto,
+ .xpo_release_rqst = svc_rdma_release_rqst,
+ .xpo_detach = svc_rdma_detach,
+ .xpo_free = svc_rdma_free,
+ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+ .xpo_has_wspace = svc_rdma_has_wspace,
+ .xpo_accept = svc_rdma_accept,
+ .xpo_secure_port = svc_rdma_secure_port,
+};
+
+struct svc_xprt_class svc_rdma_class = {
+ .xcl_name = "rdma",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_rdma_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
+ .xcl_ident = XPRT_TRANSPORT_RDMA,
+};
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+
+ while (1) {
+ ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+ if (ctxt)
+ break;
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ }
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt->count = 0;
+ ctxt->frmr = NULL;
+ atomic_inc(&xprt->sc_ctxt_used);
+ return ctxt;
+}
+
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+{
+ struct svcxprt_rdma *xprt = ctxt->xprt;
+ int i;
+ for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+ /*
+ * Unmap the DMA addr in the SGE if the lkey matches
+ * the sc_dma_lkey, otherwise, ignore it since it is
+ * an FRMR lkey and will be unmapped later when the
+ * last WR that uses it completes.
+ */
+ if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_page(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr,
+ ctxt->sge[i].length,
+ ctxt->direction);
+ }
+ }
+}
+
+void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
+{
+ struct svcxprt_rdma *xprt;
+ int i;
+
+ xprt = ctxt->xprt;
+ if (free_pages)
+ for (i = 0; i < ctxt->count; i++)
+ put_page(ctxt->pages[i]);
+
+ kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
+ atomic_dec(&xprt->sc_ctxt_used);
+}
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+ struct svc_rdma_req_map *map;
+ while (1) {
+ map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
+ if (map)
+ break;
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ }
+ map->count = 0;
+ return map;
+}
+
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+ kmem_cache_free(svc_rdma_map_cachep, map);
+}
+
+/* ib_cq event handler */
+static void cq_event_handler(struct ib_event *event, void *context)
+{
+ struct svc_xprt *xprt = context;
+ dprintk("svcrdma: received CQ event id=%d, context=%p\n",
+ event->event, context);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+}
+
+/* QP event handler */
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+ struct svc_xprt *xprt = context;
+
+ switch (event->event) {
+ /* These are considered benign events */
+ case IB_EVENT_PATH_MIG:
+ case IB_EVENT_COMM_EST:
+ case IB_EVENT_SQ_DRAINED:
+ case IB_EVENT_QP_LAST_WQE_REACHED:
+ dprintk("svcrdma: QP event %d received for QP=%p\n",
+ event->event, event->element.qp);
+ break;
+ /* These are considered fatal events */
+ case IB_EVENT_PATH_MIG_ERR:
+ case IB_EVENT_QP_FATAL:
+ case IB_EVENT_QP_REQ_ERR:
+ case IB_EVENT_QP_ACCESS_ERR:
+ case IB_EVENT_DEVICE_FATAL:
+ default:
+ dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+ "closing transport\n",
+ event->event, event->element.qp);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ break;
+ }
+}
+
+/*
+ * Data Transfer Operation Tasklet
+ *
+ * Walks a list of transports with I/O pending, removing entries as
+ * they are added to the server's I/O pending list. Two bits indicate
+ * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
+ * spinlock that serializes access to the transport list with the RQ
+ * and SQ interrupt handlers.
+ */
+static void dto_tasklet_func(unsigned long data)
+{
+ struct svcxprt_rdma *xprt;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dto_lock, flags);
+ while (!list_empty(&dto_xprt_q)) {
+ xprt = list_entry(dto_xprt_q.next,
+ struct svcxprt_rdma, sc_dto_q);
+ list_del_init(&xprt->sc_dto_q);
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ rq_cq_reap(xprt);
+ sq_cq_reap(xprt);
+
+ svc_xprt_put(&xprt->sc_xprt);
+ spin_lock_irqsave(&dto_lock, flags);
+ }
+ spin_unlock_irqrestore(&dto_lock, flags);
+}
+
+/*
+ * Receive Queue Completion Handler
+ *
+ * Since an RQ completion handler is called on interrupt context, we
+ * need to defer the handling of the I/O to a tasklet
+ */
+static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct svcxprt_rdma *xprt = cq_context;
+ unsigned long flags;
+
+ /* Guard against unconditional flush call for destroyed QP */
+ if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+ return;
+
+ /*
+ * Set the bit regardless of whether or not it's on the list
+ * because it may be on the list already due to an SQ
+ * completion.
+ */
+ set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
+
+ /*
+ * If this transport is not already on the DTO transport queue,
+ * add it
+ */
+ spin_lock_irqsave(&dto_lock, flags);
+ if (list_empty(&xprt->sc_dto_q)) {
+ svc_xprt_get(&xprt->sc_xprt);
+ list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+ }
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ /* Tasklet does all the work to avoid irqsave locks. */
+ tasklet_schedule(&dto_tasklet);
+}
+
+/*
+ * rq_cq_reap - Process the RQ CQ.
+ *
+ * Take all completing WC off the CQE and enqueue the associated DTO
+ * context on the dto_q for the transport.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void rq_cq_reap(struct svcxprt_rdma *xprt)
+{
+ int ret;
+ struct ib_wc wc;
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+
+ if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
+ return;
+
+ ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+ atomic_inc(&rdma_stat_rq_poll);
+
+ while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
+ ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+ ctxt->wc_status = wc.status;
+ ctxt->byte_len = wc.byte_len;
+ svc_rdma_unmap_dma(ctxt);
+ if (wc.status != IB_WC_SUCCESS) {
+ /* Close the transport */
+ dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 1);
+ svc_xprt_put(&xprt->sc_xprt);
+ continue;
+ }
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_put(&xprt->sc_xprt);
+ }
+
+ if (ctxt)
+ atomic_inc(&rdma_stat_rq_prod);
+
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ /*
+ * If data arrived before established event,
+ * don't enqueue. This defers RPC I/O until the
+ * RDMA connection is complete.
+ */
+ if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+ svc_xprt_enqueue(&xprt->sc_xprt);
+}
+
+/*
+ * Process a completion context
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+ struct svc_rdma_op_ctxt *ctxt)
+{
+ svc_rdma_unmap_dma(ctxt);
+
+ switch (ctxt->wr_op) {
+ case IB_WR_SEND:
+ if (ctxt->frmr)
+ pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
+ svc_rdma_put_context(ctxt, 1);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (ctxt->frmr)
+ pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+ if (read_hdr) {
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ } else {
+ pr_err("svcrdma: ctxt->read_hdr == NULL\n");
+ }
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ }
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ default:
+ printk(KERN_ERR "svcrdma: unexpected completion type, "
+ "opcode=%d\n",
+ ctxt->wr_op);
+ break;
+ }
+}
+
+/*
+ * Send Queue Completion Handler - potentially called on interrupt context.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void sq_cq_reap(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+ struct ib_wc wc_a[6];
+ struct ib_wc *wc;
+ struct ib_cq *cq = xprt->sc_sq_cq;
+ int ret;
+
+ memset(wc_a, 0, sizeof(wc_a));
+
+ if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+ return;
+
+ ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+ atomic_inc(&rdma_stat_sq_poll);
+ while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
+ int i;
+
+ for (i = 0; i < ret; i++) {
+ wc = &wc_a[i];
+ if (wc->status != IB_WC_SUCCESS) {
+ dprintk("svcrdma: sq wc err status %d\n",
+ wc->status);
+
+ /* Close the transport */
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ }
+
+ /* Decrement used SQ WR count */
+ atomic_dec(&xprt->sc_sq_count);
+ wake_up(&xprt->sc_send_wait);
+
+ ctxt = (struct svc_rdma_op_ctxt *)
+ (unsigned long)wc->wr_id;
+ if (ctxt)
+ process_context(xprt, ctxt);
+
+ svc_xprt_put(&xprt->sc_xprt);
+ }
+ }
+
+ if (ctxt)
+ atomic_inc(&rdma_stat_sq_prod);
+}
+
+static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct svcxprt_rdma *xprt = cq_context;
+ unsigned long flags;
+
+ /* Guard against unconditional flush call for destroyed QP */
+ if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+ return;
+
+ /*
+ * Set the bit regardless of whether or not it's on the list
+ * because it may be on the list already due to an RQ
+ * completion.
+ */
+ set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
+
+ /*
+ * If this transport is not already on the DTO transport queue,
+ * add it
+ */
+ spin_lock_irqsave(&dto_lock, flags);
+ if (list_empty(&xprt->sc_dto_q)) {
+ svc_xprt_get(&xprt->sc_xprt);
+ list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+ }
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ /* Tasklet does all the work to avoid irqsave locks. */
+ tasklet_schedule(&dto_tasklet);
+}
+
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
+ int listener)
+{
+ struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+
+ if (!cma_xprt)
+ return NULL;
+ svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
+ INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+ init_waitqueue_head(&cma_xprt->sc_send_wait);
+
+ spin_lock_init(&cma_xprt->sc_lock);
+ spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+ spin_lock_init(&cma_xprt->sc_frmr_q_lock);
+
+ cma_xprt->sc_ord = svcrdma_ord;
+
+ cma_xprt->sc_max_req_size = svcrdma_max_req_size;
+ cma_xprt->sc_max_requests = svcrdma_max_requests;
+ cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
+ atomic_set(&cma_xprt->sc_sq_count, 0);
+ atomic_set(&cma_xprt->sc_ctxt_used, 0);
+
+ if (listener)
+ set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+
+ return cma_xprt;
+}
+
+struct page *svc_rdma_get_page(void)
+{
+ struct page *page;
+
+ while ((page = alloc_page(GFP_KERNEL)) == NULL) {
+ /* If we can't get memory, wait a bit and try again */
+ printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
+ schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+ }
+ return page;
+}
+
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+{
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct page *page;
+ dma_addr_t pa;
+ int sge_no;
+ int buflen;
+ int ret;
+
+ ctxt = svc_rdma_get_context(xprt);
+ buflen = 0;
+ ctxt->direction = DMA_FROM_DEVICE;
+ for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
+ if (sge_no >= xprt->sc_max_sge) {
+ pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ goto err_put_ctxt;
+ }
+ page = svc_rdma_get_page();
+ ctxt->pages[sge_no] = page;
+ pa = ib_dma_map_page(xprt->sc_cm_id->device,
+ page, 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
+ goto err_put_ctxt;
+ atomic_inc(&xprt->sc_dma_used);
+ ctxt->sge[sge_no].addr = pa;
+ ctxt->sge[sge_no].length = PAGE_SIZE;
+ ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->count = sge_no + 1;
+ buflen += PAGE_SIZE;
+ }
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &ctxt->sge[0];
+ recv_wr.num_sge = ctxt->count;
+ recv_wr.wr_id = (u64)(unsigned long)ctxt;
+
+ svc_xprt_get(&xprt->sc_xprt);
+ ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+ if (ret) {
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ svc_xprt_put(&xprt->sc_xprt);
+ }
+ return ret;
+
+ err_put_ctxt:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ return -ENOMEM;
+}
+
+/*
+ * This function handles the CONNECT_REQUEST event on a listening
+ * endpoint. It is passed the cma_id for the _new_ connection. The context in
+ * this cma_id is inherited from the listening cma_id and is the svc_xprt
+ * structure for the listening endpoint.
+ *
+ * This function creates a new xprt for the new connection and enqueues it on
+ * the accept queue for the listent xprt. When the listen thread is kicked, it
+ * will call the recvfrom method on the listen xprt which will accept the new
+ * connection.
+ */
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+{
+ struct svcxprt_rdma *listen_xprt = new_cma_id->context;
+ struct svcxprt_rdma *newxprt;
+ struct sockaddr *sa;
+
+ /* Create a new transport */
+ newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
+ if (!newxprt) {
+ dprintk("svcrdma: failed to create new transport\n");
+ return;
+ }
+ newxprt->sc_cm_id = new_cma_id;
+ new_cma_id->context = newxprt;
+ dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
+ newxprt, newxprt->sc_cm_id, listen_xprt);
+
+ /* Save client advertised inbound read limit for use later in accept. */
+ newxprt->sc_ord = client_ird;
+
+ /* Set the local and remote addresses in the transport */
+ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+ svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+ sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+ svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
+ /*
+ * Enqueue the new transport on the accept queue of the listening
+ * transport
+ */
+ spin_lock_bh(&listen_xprt->sc_lock);
+ list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
+ spin_unlock_bh(&listen_xprt->sc_lock);
+
+ set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
+ svc_xprt_enqueue(&listen_xprt->sc_xprt);
+}
+
+/*
+ * Handles events generated on the listening endpoint. These events will be
+ * either be incoming connect requests or adapter removal events.
+ */
+static int rdma_listen_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
+{
+ struct svcxprt_rdma *xprt = cma_id->context;
+ int ret = 0;
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
+ "event=%d\n", cma_id, cma_id->context, event->event);
+ handle_connect_req(cma_id,
+ event->param.conn.initiator_depth);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /* Accept complete */
+ dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
+ "cm_id=%p\n", xprt, cma_id);
+ break;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
+ xprt, cma_id);
+ if (xprt)
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ break;
+
+ default:
+ dprintk("svcrdma: Unexpected event on listening endpoint %p, "
+ "event=%d\n", cma_id, event->event);
+ break;
+ }
+
+ return ret;
+}
+
+static int rdma_cma_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
+{
+ struct svc_xprt *xprt = cma_id->context;
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ switch (event->event) {
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /* Accept complete */
+ svc_xprt_get(xprt);
+ dprintk("svcrdma: Connection completed on DTO xprt=%p, "
+ "cm_id=%p\n", xprt, cma_id);
+ clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+ svc_xprt_enqueue(xprt);
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
+ xprt, cma_id);
+ if (xprt) {
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+ }
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
+ "event=%d\n", cma_id, xprt, event->event);
+ if (xprt) {
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ }
+ break;
+ default:
+ dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
+ "event=%d\n", cma_id, event->event);
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Create a listening RDMA service endpoint.
+ */
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ struct rdma_cm_id *listen_id;
+ struct svcxprt_rdma *cma_xprt;
+ int ret;
+
+ dprintk("svcrdma: Creating RDMA socket\n");
+ if (sa->sa_family != AF_INET) {
+ dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+ return ERR_PTR(-EAFNOSUPPORT);
+ }
+ cma_xprt = rdma_create_xprt(serv, 1);
+ if (!cma_xprt)
+ return ERR_PTR(-ENOMEM);
+
+ listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(listen_id)) {
+ ret = PTR_ERR(listen_id);
+ dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
+ goto err0;
+ }
+
+ ret = rdma_bind_addr(listen_id, sa);
+ if (ret) {
+ dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+ goto err1;
+ }
+ cma_xprt->sc_cm_id = listen_id;
+
+ ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
+ if (ret) {
+ dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+ goto err1;
+ }
+
+ /*
+ * We need to use the address from the cm_id in case the
+ * caller specified 0 for the port number.
+ */
+ sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
+ svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
+
+ return &cma_xprt->sc_xprt;
+
+ err1:
+ rdma_destroy_id(listen_id);
+ err0:
+ kfree(cma_xprt);
+ return ERR_PTR(ret);
+}
+
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *pl;
+ struct svc_rdma_fastreg_mr *frmr;
+
+ frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
+ if (!frmr)
+ goto err;
+
+ mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+ if (IS_ERR(mr))
+ goto err_free_frmr;
+
+ pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+ RPCSVC_MAXPAGES);
+ if (IS_ERR(pl))
+ goto err_free_mr;
+
+ frmr->mr = mr;
+ frmr->page_list = pl;
+ INIT_LIST_HEAD(&frmr->frmr_list);
+ return frmr;
+
+ err_free_mr:
+ ib_dereg_mr(mr);
+ err_free_frmr:
+ kfree(frmr);
+ err:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_fastreg_mr *frmr;
+
+ while (!list_empty(&xprt->sc_frmr_q)) {
+ frmr = list_entry(xprt->sc_frmr_q.next,
+ struct svc_rdma_fastreg_mr, frmr_list);
+ list_del_init(&frmr->frmr_list);
+ ib_dereg_mr(frmr->mr);
+ ib_free_fast_reg_page_list(frmr->page_list);
+ kfree(frmr);
+ }
+}
+
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_fastreg_mr *frmr = NULL;
+
+ spin_lock_bh(&rdma->sc_frmr_q_lock);
+ if (!list_empty(&rdma->sc_frmr_q)) {
+ frmr = list_entry(rdma->sc_frmr_q.next,
+ struct svc_rdma_fastreg_mr, frmr_list);
+ list_del_init(&frmr->frmr_list);
+ frmr->map_len = 0;
+ frmr->page_list_len = 0;
+ }
+ spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ if (frmr)
+ return frmr;
+
+ return rdma_alloc_frmr(rdma);
+}
+
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ int page_no;
+ for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+ dma_addr_t addr = frmr->page_list->page_list[page_no];
+ if (ib_dma_mapping_error(frmr->mr->device, addr))
+ continue;
+ atomic_dec(&xprt->sc_dma_used);
+ ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
+ frmr->direction);
+ }
+}
+
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ if (frmr) {
+ frmr_unmap_dma(rdma, frmr);
+ spin_lock_bh(&rdma->sc_frmr_q_lock);
+ WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
+ list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+ spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ }
+}
+
+/*
+ * This is the xpo_recvfrom function for listening endpoints. Its
+ * purpose is to accept incoming connections. The CMA callback handler
+ * has already created a new transport and attached it to the new CMA
+ * ID.
+ *
+ * There is a queue of pending connections hung on the listening
+ * transport. This queue contains the new svc_xprt structure. This
+ * function takes svc_xprt structures off the accept_q and completes
+ * the connection.
+ */
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *listen_rdma;
+ struct svcxprt_rdma *newxprt = NULL;
+ struct rdma_conn_param conn_param;
+ struct ib_qp_init_attr qp_attr;
+ struct ib_device_attr devattr;
+ int uninitialized_var(dma_mr_acc);
+ int need_dma_mr;
+ int ret;
+ int i;
+
+ listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ clear_bit(XPT_CONN, &xprt->xpt_flags);
+ /* Get the next entry off the accept list */
+ spin_lock_bh(&listen_rdma->sc_lock);
+ if (!list_empty(&listen_rdma->sc_accept_q)) {
+ newxprt = list_entry(listen_rdma->sc_accept_q.next,
+ struct svcxprt_rdma, sc_accept_q);
+ list_del_init(&newxprt->sc_accept_q);
+ }
+ if (!list_empty(&listen_rdma->sc_accept_q))
+ set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
+ spin_unlock_bh(&listen_rdma->sc_lock);
+ if (!newxprt)
+ return NULL;
+
+ dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
+ newxprt, newxprt->sc_cm_id);
+
+ ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
+ if (ret) {
+ dprintk("svcrdma: could not query device attributes on "
+ "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
+ goto errout;
+ }
+
+ /* Qualify the transport resource defaults with the
+ * capabilities of this particular device */
+ newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+ (size_t)RPCSVC_MAXPAGES);
+ newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
+ (size_t)svcrdma_max_requests);
+ newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+
+ /*
+ * Limit ORD based on client limit, local device limit, and
+ * configured svcrdma limit.
+ */
+ newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
+
+ newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+ if (IS_ERR(newxprt->sc_pd)) {
+ dprintk("svcrdma: error creating PD for connect request\n");
+ goto errout;
+ }
+ newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ sq_comp_handler,
+ cq_event_handler,
+ newxprt,
+ newxprt->sc_sq_depth,
+ 0);
+ if (IS_ERR(newxprt->sc_sq_cq)) {
+ dprintk("svcrdma: error creating SQ CQ for connect request\n");
+ goto errout;
+ }
+ newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ rq_comp_handler,
+ cq_event_handler,
+ newxprt,
+ newxprt->sc_max_requests,
+ 0);
+ if (IS_ERR(newxprt->sc_rq_cq)) {
+ dprintk("svcrdma: error creating RQ CQ for connect request\n");
+ goto errout;
+ }
+
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.event_handler = qp_event_handler;
+ qp_attr.qp_context = &newxprt->sc_xprt;
+ qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
+ qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+ qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
+ qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
+ qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_attr.qp_type = IB_QPT_RC;
+ qp_attr.send_cq = newxprt->sc_sq_cq;
+ qp_attr.recv_cq = newxprt->sc_rq_cq;
+ dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
+ " cm_id->device=%p, sc_pd->device=%p\n"
+ " cap.max_send_wr = %d\n"
+ " cap.max_recv_wr = %d\n"
+ " cap.max_send_sge = %d\n"
+ " cap.max_recv_sge = %d\n",
+ newxprt->sc_cm_id, newxprt->sc_pd,
+ newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+ qp_attr.cap.max_send_wr,
+ qp_attr.cap.max_recv_wr,
+ qp_attr.cap.max_send_sge,
+ qp_attr.cap.max_recv_sge);
+
+ ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
+ if (ret) {
+ dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+ goto errout;
+ }
+ newxprt->sc_qp = newxprt->sc_cm_id->qp;
+
+ /*
+ * Use the most secure set of MR resources based on the
+ * transport type and available memory management features in
+ * the device. Here's the table implemented below:
+ *
+ * Fast Global DMA Remote WR
+ * Reg LKEY MR Access
+ * Sup'd Sup'd Needed Needed
+ *
+ * IWARP N N Y Y
+ * N Y Y Y
+ * Y N Y N
+ * Y Y N -
+ *
+ * IB N N Y N
+ * N Y N -
+ * Y N Y N
+ * Y Y N -
+ *
+ * NB: iWARP requires remote write access for the data sink
+ * of an RDMA_READ. IB does not.
+ */
+ newxprt->sc_reader = rdma_read_chunk_lcl;
+ if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ newxprt->sc_frmr_pg_list_len =
+ devattr.max_fast_reg_page_list_len;
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+ newxprt->sc_reader = rdma_read_chunk_frmr;
+ }
+
+ /*
+ * Determine if a DMA MR is required and if so, what privs are required
+ */
+ switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+ case RDMA_TRANSPORT_IWARP:
+ newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+ need_dma_mr = 1;
+ dma_mr_acc =
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+ } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ case RDMA_TRANSPORT_IB:
+ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else if (!(devattr.device_cap_flags &
+ IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+ need_dma_mr = 0;
+ break;
+ default:
+ goto errout;
+ }
+
+ /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+ if (need_dma_mr) {
+ /* Register all of physical memory */
+ newxprt->sc_phys_mr =
+ ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+ if (IS_ERR(newxprt->sc_phys_mr)) {
+ dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
+ ret);
+ goto errout;
+ }
+ newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+ } else
+ newxprt->sc_dma_lkey =
+ newxprt->sc_cm_id->device->local_dma_lkey;
+
+ /* Post receive buffers */
+ for (i = 0; i < newxprt->sc_max_requests; i++) {
+ ret = svc_rdma_post_recv(newxprt);
+ if (ret) {
+ dprintk("svcrdma: failure posting receive buffers\n");
+ goto errout;
+ }
+ }
+
+ /* Swap out the handler */
+ newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+
+ /*
+ * Arm the CQs for the SQ and RQ before accepting so we can't
+ * miss the first message
+ */
+ ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+ ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+
+ /* Accept Connection */
+ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.responder_resources = 0;
+ conn_param.initiator_depth = newxprt->sc_ord;
+ ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
+ if (ret) {
+ dprintk("svcrdma: failed to accept new connection, ret=%d\n",
+ ret);
+ goto errout;
+ }
+
+ dprintk("svcrdma: new connection %p accepted with the following "
+ "attributes:\n"
+ " local_ip : %pI4\n"
+ " local_port : %d\n"
+ " remote_ip : %pI4\n"
+ " remote_port : %d\n"
+ " max_sge : %d\n"
+ " sq_depth : %d\n"
+ " max_requests : %d\n"
+ " ord : %d\n",
+ newxprt,
+ &((struct sockaddr_in *)&newxprt->sc_cm_id->
+ route.addr.src_addr)->sin_addr.s_addr,
+ ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+ route.addr.src_addr)->sin_port),
+ &((struct sockaddr_in *)&newxprt->sc_cm_id->
+ route.addr.dst_addr)->sin_addr.s_addr,
+ ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+ route.addr.dst_addr)->sin_port),
+ newxprt->sc_max_sge,
+ newxprt->sc_sq_depth,
+ newxprt->sc_max_requests,
+ newxprt->sc_ord);
+
+ return &newxprt->sc_xprt;
+
+ errout:
+ dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
+ /* Take a reference in case the DTO handler runs */
+ svc_xprt_get(&newxprt->sc_xprt);
+ if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
+ ib_destroy_qp(newxprt->sc_qp);
+ rdma_destroy_id(newxprt->sc_cm_id);
+ /* This call to put will destroy the transport */
+ svc_xprt_put(&newxprt->sc_xprt);
+ return NULL;
+}
+
+static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+{
+}
+
+/*
+ * When connected, an svc_xprt has at least two references:
+ *
+ * - A reference held by the cm_id between the ESTABLISHED and
+ * DISCONNECTED events. If the remote peer disconnected first, this
+ * reference could be gone.
+ *
+ * - A reference held by the svc_recv code that called this function
+ * as part of close processing.
+ *
+ * At a minimum one references should still be held.
+ */
+static void svc_rdma_detach(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ dprintk("svc: svc_rdma_detach(%p)\n", xprt);
+
+ /* Disconnect and flush posted WQE */
+ rdma_disconnect(rdma->sc_cm_id);
+}
+
+static void __svc_rdma_free(struct work_struct *work)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(work, struct svcxprt_rdma, sc_work);
+ dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+
+ /* We should only be called from kref_put */
+ if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+ pr_err("svcrdma: sc_xprt still in use? (%d)\n",
+ atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
+
+ /*
+ * Destroy queued, but not processed read completions. Note
+ * that this cleanup has to be done before destroying the
+ * cm_id because the device ptr is needed to unmap the dma in
+ * svc_rdma_put_context.
+ */
+ while (!list_empty(&rdma->sc_read_complete_q)) {
+ struct svc_rdma_op_ctxt *ctxt;
+ ctxt = list_entry(rdma->sc_read_complete_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ svc_rdma_put_context(ctxt, 1);
+ }
+
+ /* Destroy queued, but not processed recv completions */
+ while (!list_empty(&rdma->sc_rq_dto_q)) {
+ struct svc_rdma_op_ctxt *ctxt;
+ ctxt = list_entry(rdma->sc_rq_dto_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ svc_rdma_put_context(ctxt, 1);
+ }
+
+ /* Warn if we leaked a resource or under-referenced */
+ if (atomic_read(&rdma->sc_ctxt_used) != 0)
+ pr_err("svcrdma: ctxt still in use? (%d)\n",
+ atomic_read(&rdma->sc_ctxt_used));
+ if (atomic_read(&rdma->sc_dma_used) != 0)
+ pr_err("svcrdma: dma still in use? (%d)\n",
+ atomic_read(&rdma->sc_dma_used));
+
+ /* De-allocate fastreg mr */
+ rdma_dealloc_frmr_q(rdma);
+
+ /* Destroy the QP if present (not a listener) */
+ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+ ib_destroy_qp(rdma->sc_qp);
+
+ if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
+ ib_destroy_cq(rdma->sc_sq_cq);
+
+ if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
+ ib_destroy_cq(rdma->sc_rq_cq);
+
+ if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
+ ib_dereg_mr(rdma->sc_phys_mr);
+
+ if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
+ ib_dealloc_pd(rdma->sc_pd);
+
+ /* Destroy the CM ID */
+ rdma_destroy_id(rdma->sc_cm_id);
+
+ kfree(rdma);
+}
+
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+ queue_work(svc_rdma_wq, &rdma->sc_work);
+}
+
+static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+ /*
+ * If there are already waiters on the SQ,
+ * return false.
+ */
+ if (waitqueue_active(&rdma->sc_send_wait))
+ return 0;
+
+ /* Otherwise return true. */
+ return 1;
+}
+
+static int svc_rdma_secure_port(struct svc_rqst *rqstp)
+{
+ return 1;
+}
+
+/*
+ * Attempt to register the kvec representing the RPC memory with the
+ * device.
+ *
+ * Returns:
+ * NULL : The device does not support fastreg or there were no more
+ * fastreg mr.
+ * frmr : The kvec register request was successfully posted.
+ * <0 : An error was encountered attempting to register the kvec.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+ struct svc_rdma_fastreg_mr *frmr)
+{
+ struct ib_send_wr fastreg_wr;
+ u8 key;
+
+ /* Bump the key */
+ key = (u8)(frmr->mr->lkey & 0x000000FF);
+ ib_update_fast_reg_key(frmr->mr, ++key);
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof fastreg_wr);
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.send_flags = IB_SEND_SIGNALED;
+ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+ fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = frmr->map_len;
+ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+ return svc_rdma_send(xprt, &fastreg_wr);
+}
+
+int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
+{
+ struct ib_send_wr *bad_wr, *n_wr;
+ int wr_count;
+ int i;
+ int ret;
+
+ if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+ return -ENOTCONN;
+
+ wr_count = 1;
+ for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+ wr_count++;
+
+ /* If the SQ is full, wait until an SQ entry is available */
+ while (1) {
+ spin_lock_bh(&xprt->sc_lock);
+ if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
+ spin_unlock_bh(&xprt->sc_lock);
+ atomic_inc(&rdma_stat_sq_starve);
+
+ /* See if we can opportunistically reap SQ WR to make room */
+ sq_cq_reap(xprt);
+
+ /* Wait until SQ WR available if SQ still full */
+ wait_event(xprt->sc_send_wait,
+ atomic_read(&xprt->sc_sq_count) <
+ xprt->sc_sq_depth);
+ if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+ return -ENOTCONN;
+ continue;
+ }
+ /* Take a transport ref for each WR posted */
+ for (i = 0; i < wr_count; i++)
+ svc_xprt_get(&xprt->sc_xprt);
+
+ /* Bump used SQ WR count and post */
+ atomic_add(wr_count, &xprt->sc_sq_count);
+ ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
+ if (ret) {
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ atomic_sub(wr_count, &xprt->sc_sq_count);
+ for (i = 0; i < wr_count; i ++)
+ svc_xprt_put(&xprt->sc_xprt);
+ dprintk("svcrdma: failed to post SQ WR rc=%d, "
+ "sc_sq_count=%d, sc_sq_depth=%d\n",
+ ret, atomic_read(&xprt->sc_sq_count),
+ xprt->sc_sq_depth);
+ }
+ spin_unlock_bh(&xprt->sc_lock);
+ if (ret)
+ wake_up(&xprt->sc_send_wait);
+ break;
+ }
+ return ret;
+}
+
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+ enum rpcrdma_errcode err)
+{
+ struct ib_send_wr err_wr;
+ struct page *p;
+ struct svc_rdma_op_ctxt *ctxt;
+ u32 *va;
+ int length;
+ int ret;
+
+ p = svc_rdma_get_page();
+ va = page_address(p);
+
+ /* XDR encode error */
+ length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
+
+ ctxt = svc_rdma_get_context(xprt);
+ ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->count = 1;
+ ctxt->pages[0] = p;
+
+ /* Prepare SGE for local address */
+ ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
+ p, 0, length, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
+ put_page(p);
+ svc_rdma_put_context(ctxt, 1);
+ return;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+ ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[0].length = length;
+
+ /* Prepare SEND WR */
+ memset(&err_wr, 0, sizeof err_wr);
+ ctxt->wr_op = IB_WR_SEND;
+ err_wr.wr_id = (unsigned long)ctxt;
+ err_wr.sg_list = ctxt->sge;
+ err_wr.num_sge = 1;
+ err_wr.opcode = IB_WR_SEND;
+ err_wr.send_flags = IB_SEND_SIGNALED;
+
+ /* Post It */
+ ret = svc_rdma_send(xprt, &err_wr);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
+}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000..54f23b1be
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/sunrpc/addr.h>
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * tunables
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_inline_write_padding;
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+ int xprt_rdma_pad_optimize = 1;
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int zero;
+static unsigned int max_padding = PAGE_SIZE;
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static struct ctl_table xr_tunables_table[] = {
+ {
+ .procname = "rdma_slot_table_entries",
+ .data = &xprt_rdma_slot_table_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_slot_table_size,
+ .extra2 = &max_slot_table_size
+ },
+ {
+ .procname = "rdma_max_inline_read",
+ .data = &xprt_rdma_max_inline_read,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "rdma_max_inline_write",
+ .data = &xprt_rdma_max_inline_write,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "rdma_inline_write_padding",
+ .data = &xprt_rdma_inline_write_padding,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &max_padding,
+ },
+ {
+ .procname = "rdma_memreg_strategy",
+ .data = &xprt_rdma_memreg_strategy,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_memreg,
+ .extra2 = &max_memreg,
+ },
+ {
+ .procname = "rdma_pad_optimize",
+ .data = &xprt_rdma_pad_optimize,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { },
+};
+
+static struct ctl_table sunrpc_table[] = {
+ {
+ .procname = "sunrpc",
+ .mode = 0555,
+ .child = xr_tunables_table
+ },
+ { },
+};
+
+#endif
+
+#define RPCRDMA_BIND_TO (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
+
+static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
+
+static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ char buf[20];
+
+ snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+ xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+ xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
+}
+
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ char buf[40];
+
+ snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+ xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+ xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
+}
+
+static void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+{
+ struct sockaddr *sap = (struct sockaddr *)
+ &rpcx_to_rdmad(xprt).addr;
+ char buf[128];
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ xprt_rdma_format_addresses4(xprt, sap);
+ break;
+ case AF_INET6:
+ xprt_rdma_format_addresses6(xprt, sap);
+ break;
+ default:
+ pr_err("rpcrdma: Unrecognized address family\n");
+ return;
+ }
+
+ (void)rpc_ntop(sap, buf, sizeof(buf));
+ xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+ snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
+ xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+ snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
+ xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+
+ xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+}
+
+static void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+ unsigned int i;
+
+ for (i = 0; i < RPC_DISPLAY_MAX; i++)
+ switch (i) {
+ case RPC_DISPLAY_PROTO:
+ case RPC_DISPLAY_NETID:
+ continue;
+ default:
+ kfree(xprt->address_strings[i]);
+ }
+}
+
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
+ rx_connect_worker.work);
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ int rc = 0;
+
+ xprt_clear_connected(xprt);
+
+ dprintk("RPC: %s: %sconnect\n", __func__,
+ r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
+ rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ if (rc)
+ xprt_wake_pending_tasks(xprt, rc);
+
+ dprintk("RPC: %s: exit\n", __func__);
+ xprt_clear_connecting(xprt);
+}
+
+/*
+ * xprt_rdma_destroy
+ *
+ * Destroy the xprt.
+ * Free all memory associated with the object, including its own.
+ * NOTE: none of the *destroy methods free memory for their top-level
+ * objects, even though they may have allocated it (they do free
+ * private memory). It's up to the caller to handle it. In this
+ * case (RDMA transport), all structure memory is inlined with the
+ * struct rpcrdma_xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ dprintk("RPC: %s: called\n", __func__);
+
+ cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
+
+ xprt_clear_connected(xprt);
+
+ rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+ rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ia_close(&r_xprt->rx_ia);
+
+ xprt_rdma_free_addresses(xprt);
+
+ xprt_free(xprt);
+
+ dprintk("RPC: %s: returning\n", __func__);
+
+ module_put(THIS_MODULE);
+}
+
+static const struct rpc_timeout xprt_rdma_default_timeout = {
+ .to_initval = 60 * HZ,
+ .to_maxval = 60 * HZ,
+};
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+ struct rpcrdma_create_data_internal cdata;
+ struct rpc_xprt *xprt;
+ struct rpcrdma_xprt *new_xprt;
+ struct rpcrdma_ep *new_ep;
+ struct sockaddr_in *sin;
+ int rc;
+
+ if (args->addrlen > sizeof(xprt->addr)) {
+ dprintk("RPC: %s: address too large\n", __func__);
+ return ERR_PTR(-EBADF);
+ }
+
+ xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
+ xprt_rdma_slot_table_entries,
+ xprt_rdma_slot_table_entries);
+ if (xprt == NULL) {
+ dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
+ __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* 60 second timeout, no retries */
+ xprt->timeout = &xprt_rdma_default_timeout;
+ xprt->bind_timeout = RPCRDMA_BIND_TO;
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+ xprt->resvport = 0; /* privileged port not needed */
+ xprt->tsh_size = 0; /* RPC-RDMA handles framing */
+ xprt->ops = &xprt_rdma_procs;
+
+ /*
+ * Set up RDMA-specific connect data.
+ */
+
+ /* Put server RDMA address in local cdata */
+ memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+
+ /* Ensure xprt->addr holds valid server TCP (not RDMA)
+ * address, for any side protocols which peek at it */
+ xprt->prot = IPPROTO_TCP;
+ xprt->addrlen = args->addrlen;
+ memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+
+ sin = (struct sockaddr_in *)&cdata.addr;
+ if (ntohs(sin->sin_port) != 0)
+ xprt_set_bound(xprt);
+
+ dprintk("RPC: %s: %pI4:%u\n",
+ __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
+
+ /* Set max requests */
+ cdata.max_requests = xprt->max_reqs;
+
+ /* Set some length limits */
+ cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
+ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
+
+ cdata.inline_wsize = xprt_rdma_max_inline_write;
+ if (cdata.inline_wsize > cdata.wsize)
+ cdata.inline_wsize = cdata.wsize;
+
+ cdata.inline_rsize = xprt_rdma_max_inline_read;
+ if (cdata.inline_rsize > cdata.rsize)
+ cdata.inline_rsize = cdata.rsize;
+
+ cdata.padding = xprt_rdma_inline_write_padding;
+
+ /*
+ * Create new transport instance, which includes initialized
+ * o ia
+ * o endpoint
+ * o buffers
+ */
+
+ new_xprt = rpcx_to_rdmax(xprt);
+
+ rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
+ xprt_rdma_memreg_strategy);
+ if (rc)
+ goto out1;
+
+ /*
+ * initialize and create ep
+ */
+ new_xprt->rx_data = cdata;
+ new_ep = &new_xprt->rx_ep;
+ new_ep->rep_remote_addr = cdata.addr;
+
+ rc = rpcrdma_ep_create(&new_xprt->rx_ep,
+ &new_xprt->rx_ia, &new_xprt->rx_data);
+ if (rc)
+ goto out2;
+
+ /*
+ * Allocate pre-registered send and receive buffers for headers and
+ * any inline data. Also specify any padding which will be provided
+ * from a preregistered zero buffer.
+ */
+ rc = rpcrdma_buffer_create(new_xprt);
+ if (rc)
+ goto out3;
+
+ /*
+ * Register a callback for connection events. This is necessary because
+ * connection loss notification is async. We also catch connection loss
+ * when reaping receives.
+ */
+ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
+ xprt_rdma_connect_worker);
+
+ xprt_rdma_format_addresses(xprt);
+ xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+ if (xprt->max_payload == 0)
+ goto out4;
+ xprt->max_payload <<= PAGE_SHIFT;
+ dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
+ __func__, xprt->max_payload);
+
+ if (!try_module_get(THIS_MODULE))
+ goto out4;
+
+ return xprt;
+
+out4:
+ xprt_rdma_free_addresses(xprt);
+ rc = -EINVAL;
+out3:
+ rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+out2:
+ rpcrdma_ia_close(&new_xprt->rx_ia);
+out1:
+ xprt_free(xprt);
+ return ERR_PTR(rc);
+}
+
+/*
+ * Close a connection, during shutdown or timeout/reconnect
+ */
+static void
+xprt_rdma_close(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ dprintk("RPC: %s: closing\n", __func__);
+ if (r_xprt->rx_ep.rep_connected > 0)
+ xprt->reestablish_timeout = 0;
+ xprt_disconnect_done(xprt);
+ rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+}
+
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+ struct sockaddr_in *sap;
+
+ sap = (struct sockaddr_in *)&xprt->addr;
+ sap->sin_port = htons(port);
+ sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
+ sap->sin_port = htons(port);
+ dprintk("RPC: %s: %u\n", __func__, port);
+}
+
+static void
+xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ if (r_xprt->rx_ep.rep_connected != 0) {
+ /* Reconnect */
+ schedule_delayed_work(&r_xprt->rx_connect_worker,
+ xprt->reestablish_timeout);
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
+ else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ } else {
+ schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
+ if (!RPC_IS_ASYNC(task))
+ flush_delayed_work(&r_xprt->rx_connect_worker);
+ }
+}
+
+/*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+ * sequence.
+ *
+ * The RPC layer allocates both send and receive buffers in the same call
+ * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
+ * We may register rq_rcv_buf when using reply chunks.
+ */
+static void *
+xprt_rdma_allocate(struct rpc_task *task, size_t size)
+{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_req *req;
+ size_t min_size;
+ gfp_t flags;
+
+ req = rpcrdma_buffer_get(&r_xprt->rx_buf);
+ if (req == NULL)
+ return NULL;
+
+ flags = GFP_NOIO | __GFP_NOWARN;
+ if (RPC_IS_SWAPPER(task))
+ flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+
+ if (req->rl_rdmabuf == NULL)
+ goto out_rdmabuf;
+ if (req->rl_sendbuf == NULL)
+ goto out_sendbuf;
+ if (size > req->rl_sendbuf->rg_size)
+ goto out_sendbuf;
+
+out:
+ dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
+ req->rl_connect_cookie = 0; /* our reserved value */
+ return req->rl_sendbuf->rg_base;
+
+out_rdmabuf:
+ min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
+ rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
+ if (IS_ERR(rb))
+ goto out_fail;
+ req->rl_rdmabuf = rb;
+
+out_sendbuf:
+ /* XDR encoding and RPC/RDMA marshaling of this request has not
+ * yet occurred. Thus a lower bound is needed to prevent buffer
+ * overrun during marshaling.
+ *
+ * RPC/RDMA marshaling may choose to send payload bearing ops
+ * inline, if the result is smaller than the inline threshold.
+ * The value of the "size" argument accounts for header
+ * requirements but not for the payload in these cases.
+ *
+ * Likewise, allocate enough space to receive a reply up to the
+ * size of the inline threshold.
+ *
+ * It's unlikely that both the send header and the received
+ * reply will be large, but slush is provided here to allow
+ * flexibility when marshaling.
+ */
+ min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
+ min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
+ if (size < min_size)
+ size = min_size;
+
+ rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
+ if (IS_ERR(rb))
+ goto out_fail;
+ rb->rg_owner = req;
+
+ r_xprt->rx_stats.hardway_register_count += size;
+ rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
+ req->rl_sendbuf = rb;
+ goto out;
+
+out_fail:
+ rpcrdma_buffer_put(req);
+ r_xprt->rx_stats.failed_marshal_count++;
+ return NULL;
+}
+
+/*
+ * This function returns all RDMA resources to the pool.
+ */
+static void
+xprt_rdma_free(void *buffer)
+{
+ struct rpcrdma_req *req;
+ struct rpcrdma_xprt *r_xprt;
+ struct rpcrdma_regbuf *rb;
+ int i;
+
+ if (buffer == NULL)
+ return;
+
+ rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
+ req = rb->rg_owner;
+ r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
+
+ dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
+
+ for (i = 0; req->rl_nchunks;) {
+ --req->rl_nchunks;
+ i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+ &req->rl_segments[i]);
+ }
+
+ rpcrdma_buffer_put(req);
+}
+
+/*
+ * send_request invokes the meat of RPC RDMA. It must do the following:
+ * 1. Marshal the RPC request into an RPC RDMA request, which means
+ * putting a header in front of data, and creating IOVs for RDMA
+ * from those in the request.
+ * 2. In marshaling, detect opportunities for RDMA, and use them.
+ * 3. Post a recv message to set up asynch completion, then send
+ * the request (rpcrdma_ep_post).
+ * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ */
+
+static int
+xprt_rdma_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ int rc = 0;
+
+ rc = rpcrdma_marshal_req(rqst);
+ if (rc < 0)
+ goto failed_marshal;
+
+ if (req->rl_reply == NULL) /* e.g. reconnection */
+ rpcrdma_recv_buffer_get(req);
+
+ if (req->rl_reply) {
+ req->rl_reply->rr_func = rpcrdma_reply_handler;
+ /* this need only be done once, but... */
+ req->rl_reply->rr_xprt = xprt;
+ }
+
+ /* Must suppress retransmit to maintain credits */
+ if (req->rl_connect_cookie == xprt->connect_cookie)
+ goto drop_connection;
+ req->rl_connect_cookie = xprt->connect_cookie;
+
+ if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ goto drop_connection;
+
+ rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
+ rqst->rq_bytes_sent = 0;
+ return 0;
+
+failed_marshal:
+ r_xprt->rx_stats.failed_marshal_count++;
+ dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
+ __func__, rc);
+ if (rc == -EIO)
+ return -EIO;
+drop_connection:
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN; /* implies disconnect */
+}
+
+static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ long idle_time = 0;
+
+ if (xprt_connected(xprt))
+ idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+ seq_printf(seq,
+ "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
+ "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
+
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u,
+
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count);
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+static struct rpc_xprt_ops xprt_rdma_procs = {
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
+ .alloc_slot = xprt_alloc_slot,
+ .release_request = xprt_release_rqst_cong, /* ditto */
+ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
+ .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
+ .set_port = xprt_rdma_set_port,
+ .connect = xprt_rdma_connect,
+ .buf_alloc = xprt_rdma_allocate,
+ .buf_free = xprt_rdma_free,
+ .send_request = xprt_rdma_send_request,
+ .close = xprt_rdma_close,
+ .destroy = xprt_rdma_destroy,
+ .print_stats = xprt_rdma_print_stats
+};
+
+static struct xprt_class xprt_rdma = {
+ .list = LIST_HEAD_INIT(xprt_rdma.list),
+ .name = "rdma",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_RDMA,
+ .setup = xprt_setup_rdma,
+};
+
+static void __exit xprt_rdma_cleanup(void)
+{
+ int rc;
+
+ dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ if (sunrpc_table_header) {
+ unregister_sysctl_table(sunrpc_table_header);
+ sunrpc_table_header = NULL;
+ }
+#endif
+ rc = xprt_unregister_transport(&xprt_rdma);
+ if (rc)
+ dprintk("RPC: %s: xprt_unregister returned %i\n",
+ __func__, rc);
+}
+
+static int __init xprt_rdma_init(void)
+{
+ int rc;
+
+ rc = xprt_register_transport(&xprt_rdma);
+
+ if (rc)
+ return rc;
+
+ dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
+
+ dprintk("Defaults:\n");
+ dprintk("\tSlots %d\n"
+ "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+ xprt_rdma_slot_table_entries,
+ xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+ dprintk("\tPadding %d\n\tMemreg %d\n",
+ xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ if (!sunrpc_table_header)
+ sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+ return 0;
+}
+
+module_init(xprt_rdma_init);
+module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000..4870d272e
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,1672 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ * o adapters
+ * o endpoints
+ * o connections
+ * o buffer memory
+ */
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include <linux/sunrpc/addr.h>
+#include <asm/bitops.h>
+
+#include "xprt_rdma.h"
+
+/*
+ * Globals/Macros
+ */
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+/*
+ * internal functions
+ */
+
+/*
+ * handle replies in tasklet context, using a single, global list
+ * rdma tasklet function -- just turn around and call the func
+ * for all replies on the list
+ */
+
+static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
+static LIST_HEAD(rpcrdma_tasklets_g);
+
+static void
+rpcrdma_run_tasklet(unsigned long data)
+{
+ struct rpcrdma_rep *rep;
+ void (*func)(struct rpcrdma_rep *);
+ unsigned long flags;
+
+ data = data;
+ spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+ while (!list_empty(&rpcrdma_tasklets_g)) {
+ rep = list_entry(rpcrdma_tasklets_g.next,
+ struct rpcrdma_rep, rr_list);
+ list_del(&rep->rr_list);
+ func = rep->rr_func;
+ rep->rr_func = NULL;
+ spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+
+ if (func)
+ func(rep);
+ else
+ rpcrdma_recv_buffer_put(rep);
+
+ spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+ }
+ spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+}
+
+static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
+
+static const char * const async_event[] = {
+ "CQ error",
+ "QP fatal error",
+ "QP request error",
+ "QP access error",
+ "communication established",
+ "send queue drained",
+ "path migration successful",
+ "path mig error",
+ "device fatal error",
+ "port active",
+ "port error",
+ "LID change",
+ "P_key change",
+ "SM change",
+ "SRQ error",
+ "SRQ limit reached",
+ "last WQE reached",
+ "client reregister",
+ "GID change",
+};
+
+#define ASYNC_MSG(status) \
+ ((status) < ARRAY_SIZE(async_event) ? \
+ async_event[(status)] : "unknown async error")
+
+static void
+rpcrdma_schedule_tasklet(struct list_head *sched_list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+ list_splice_tail(sched_list, &rpcrdma_tasklets_g);
+ spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+ tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+static void
+rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+ struct rpcrdma_ep *ep = context;
+
+ pr_err("RPC: %s: %s on device %s ep %p\n",
+ __func__, ASYNC_MSG(event->event),
+ event->device->name, context);
+ if (ep->rep_connected == 1) {
+ ep->rep_connected = -EIO;
+ rpcrdma_conn_func(ep);
+ wake_up_all(&ep->rep_connect_wait);
+ }
+}
+
+static void
+rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+{
+ struct rpcrdma_ep *ep = context;
+
+ pr_err("RPC: %s: %s on device %s ep %p\n",
+ __func__, ASYNC_MSG(event->event),
+ event->device->name, context);
+ if (ep->rep_connected == 1) {
+ ep->rep_connected = -EIO;
+ rpcrdma_conn_func(ep);
+ wake_up_all(&ep->rep_connect_wait);
+ }
+}
+
+static const char * const wc_status[] = {
+ "success",
+ "local length error",
+ "local QP operation error",
+ "local EE context operation error",
+ "local protection error",
+ "WR flushed",
+ "memory management operation error",
+ "bad response error",
+ "local access error",
+ "remote invalid request error",
+ "remote access error",
+ "remote operation error",
+ "transport retry counter exceeded",
+ "RNR retry counter exceeded",
+ "local RDD violation error",
+ "remove invalid RD request",
+ "operation aborted",
+ "invalid EE context number",
+ "invalid EE context state",
+ "fatal error",
+ "response timeout error",
+ "general error",
+};
+
+#define COMPLETION_MSG(status) \
+ ((status) < ARRAY_SIZE(wc_status) ? \
+ wc_status[(status)] : "unexpected completion error")
+
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+{
+ /* WARNING: Only wr_id and status are reliable at this point */
+ if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("RPC: %s: SEND: %s\n",
+ __func__, COMPLETION_MSG(wc->status));
+ } else {
+ struct rpcrdma_mw *r;
+
+ r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+ r->mw_sendcompletion(wc);
+ }
+}
+
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+ struct ib_wc *wcs;
+ int budget, count, rc;
+
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_send_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ return rc;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_sendcq_process_wc(wcs++);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ return 0;
+}
+
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+ rc = rpcrdma_sendcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rpcrdma_sendcq_poll(cq, ep);
+}
+
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
+{
+ struct rpcrdma_rep *rep =
+ (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+
+ /* WARNING: Only wr_id and status are reliable at this point */
+ if (wc->status != IB_WC_SUCCESS)
+ goto out_fail;
+
+ /* status == SUCCESS means all fields in wc are trustworthy */
+ if (wc->opcode != IB_WC_RECV)
+ return;
+
+ dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
+ __func__, rep, wc->byte_len);
+
+ rep->rr_len = wc->byte_len;
+ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+ rdmab_addr(rep->rr_rdmabuf),
+ rep->rr_len, DMA_FROM_DEVICE);
+ prefetch(rdmab_to_msg(rep->rr_rdmabuf));
+
+out_schedule:
+ list_add_tail(&rep->rr_list, sched_list);
+ return;
+out_fail:
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("RPC: %s: rep %p: %s\n",
+ __func__, rep, COMPLETION_MSG(wc->status));
+ rep->rr_len = ~0U;
+ goto out_schedule;
+}
+
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+ struct list_head sched_list;
+ struct ib_wc *wcs;
+ int budget, count, rc;
+
+ INIT_LIST_HEAD(&sched_list);
+ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_recv_wcs;
+
+ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+ if (rc <= 0)
+ goto out_schedule;
+
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_recvcq_process_wc(wcs++, &sched_list);
+ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ rc = 0;
+
+out_schedule:
+ rpcrdma_schedule_tasklet(&sched_list);
+ return rc;
+}
+
+/*
+ * Handle receive completions.
+ *
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+ * It is the responsibility of the scheduled tasklet to return
+ * recv buffers to the pool. NOTE: this affects synchronization of
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+ */
+static void
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+ rc = rpcrdma_recvcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rc = ib_req_notify_cq(cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc == 0)
+ return;
+ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+ rpcrdma_recvcq_poll(cq, ep);
+}
+
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+ struct ib_wc wc;
+ LIST_HEAD(sched_list);
+
+ while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
+ rpcrdma_recvcq_process_wc(&wc, &sched_list);
+ if (!list_empty(&sched_list))
+ rpcrdma_schedule_tasklet(&sched_list);
+ while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
+ rpcrdma_sendcq_process_wc(&wc);
+}
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+static const char * const conn[] = {
+ "address resolved",
+ "address error",
+ "route resolved",
+ "route error",
+ "connect request",
+ "connect response",
+ "connect error",
+ "unreachable",
+ "rejected",
+ "established",
+ "disconnected",
+ "device removal",
+ "multicast join",
+ "multicast error",
+ "address change",
+ "timewait exit",
+};
+
+#define CONNECTION_MSG(status) \
+ ((status) < ARRAY_SIZE(conn) ? \
+ conn[(status)] : "unrecognized connection error")
+#endif
+
+static int
+rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+ struct rpcrdma_xprt *xprt = id->context;
+ struct rpcrdma_ia *ia = &xprt->rx_ia;
+ struct rpcrdma_ep *ep = &xprt->rx_ep;
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
+#endif
+ struct ib_qp_attr *attr = &ia->ri_qp_attr;
+ struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
+ int connstate = 0;
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ ia->ri_async_rc = 0;
+ complete(&ia->ri_done);
+ break;
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ ia->ri_async_rc = -EHOSTUNREACH;
+ dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
+ __func__, ep);
+ complete(&ia->ri_done);
+ break;
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ ia->ri_async_rc = -ENETUNREACH;
+ dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
+ __func__, ep);
+ complete(&ia->ri_done);
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ connstate = 1;
+ ib_query_qp(ia->ri_id->qp, attr,
+ IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+ iattr);
+ dprintk("RPC: %s: %d responder resources"
+ " (%d initiator)\n",
+ __func__, attr->max_dest_rd_atomic,
+ attr->max_rd_atomic);
+ goto connected;
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ connstate = -ENOTCONN;
+ goto connected;
+ case RDMA_CM_EVENT_UNREACHABLE:
+ connstate = -ENETDOWN;
+ goto connected;
+ case RDMA_CM_EVENT_REJECTED:
+ connstate = -ECONNREFUSED;
+ goto connected;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ connstate = -ECONNABORTED;
+ goto connected;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ connstate = -ENODEV;
+connected:
+ dprintk("RPC: %s: %sconnected\n",
+ __func__, connstate > 0 ? "" : "dis");
+ ep->rep_connected = connstate;
+ rpcrdma_conn_func(ep);
+ wake_up_all(&ep->rep_connect_wait);
+ /*FALLTHROUGH*/
+ default:
+ dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
+ __func__, sap, rpc_get_port(sap), ep,
+ CONNECTION_MSG(event->event));
+ break;
+ }
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ if (connstate == 1) {
+ int ird = attr->max_dest_rd_atomic;
+ int tird = ep->rep_remote_cma.responder_resources;
+
+ pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
+ sap, rpc_get_port(sap),
+ ia->ri_id->device->name,
+ ia->ri_ops->ro_displayname,
+ xprt->rx_buf.rb_max_requests,
+ ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+ } else if (connstate < 0) {
+ pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
+ sap, rpc_get_port(sap), connstate);
+ }
+#endif
+
+ return 0;
+}
+
+static struct rdma_cm_id *
+rpcrdma_create_id(struct rpcrdma_xprt *xprt,
+ struct rpcrdma_ia *ia, struct sockaddr *addr)
+{
+ struct rdma_cm_id *id;
+ int rc;
+
+ init_completion(&ia->ri_done);
+
+ id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(id)) {
+ rc = PTR_ERR(id);
+ dprintk("RPC: %s: rdma_create_id() failed %i\n",
+ __func__, rc);
+ return id;
+ }
+
+ ia->ri_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+ if (rc) {
+ dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
+ __func__, rc);
+ goto out;
+ }
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = ia->ri_async_rc;
+ if (rc)
+ goto out;
+
+ ia->ri_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+ if (rc) {
+ dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
+ __func__, rc);
+ goto out;
+ }
+ wait_for_completion_interruptible_timeout(&ia->ri_done,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+ rc = ia->ri_async_rc;
+ if (rc)
+ goto out;
+
+ return id;
+
+out:
+ rdma_destroy_id(id);
+ return ERR_PTR(rc);
+}
+
+/*
+ * Drain any cq, prior to teardown.
+ */
+static void
+rpcrdma_clean_cq(struct ib_cq *cq)
+{
+ struct ib_wc wc;
+ int count = 0;
+
+ while (1 == ib_poll_cq(cq, 1, &wc))
+ ++count;
+
+ if (count)
+ dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
+ __func__, count, wc.opcode);
+}
+
+/*
+ * Exported functions.
+ */
+
+/*
+ * Open and initialize an Interface Adapter.
+ * o initializes fields of struct rpcrdma_ia, including
+ * interface and provider attributes and protection zone.
+ */
+int
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+{
+ int rc, mem_priv;
+ struct rpcrdma_ia *ia = &xprt->rx_ia;
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+
+ ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+ if (IS_ERR(ia->ri_id)) {
+ rc = PTR_ERR(ia->ri_id);
+ goto out1;
+ }
+
+ ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+ if (IS_ERR(ia->ri_pd)) {
+ rc = PTR_ERR(ia->ri_pd);
+ dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ rc = ib_query_device(ia->ri_id->device, devattr);
+ if (rc) {
+ dprintk("RPC: %s: ib_query_device failed %d\n",
+ __func__, rc);
+ goto out3;
+ }
+
+ if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+ ia->ri_have_dma_lkey = 1;
+ ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+ }
+
+ if (memreg == RPCRDMA_FRMR) {
+ /* Requires both frmr reg and local dma lkey */
+ if (((devattr->device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
+ (devattr->max_fast_reg_page_list_len == 0)) {
+ dprintk("RPC: %s: FRMR registration "
+ "not supported by HCA\n", __func__);
+ memreg = RPCRDMA_MTHCAFMR;
+ }
+ }
+ if (memreg == RPCRDMA_MTHCAFMR) {
+ if (!ia->ri_id->device->alloc_fmr) {
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "not supported by HCA\n", __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+ }
+ }
+
+ /*
+ * Optionally obtain an underlying physical identity mapping in
+ * order to do a memory window-based bind. This base registration
+ * is protected from remote access - that is enabled only by binding
+ * for the specific bytes targeted during each RPC operation, and
+ * revoked after the corresponding completion similar to a storage
+ * adapter.
+ */
+ switch (memreg) {
+ case RPCRDMA_FRMR:
+ ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+ break;
+ case RPCRDMA_ALLPHYSICAL:
+ ia->ri_ops = &rpcrdma_physical_memreg_ops;
+ mem_priv = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ;
+ goto register_setup;
+ case RPCRDMA_MTHCAFMR:
+ ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+ if (ia->ri_have_dma_lkey)
+ break;
+ mem_priv = IB_ACCESS_LOCAL_WRITE;
+ register_setup:
+ ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+ if (IS_ERR(ia->ri_bind_mem)) {
+ printk(KERN_ALERT "%s: ib_get_dma_mr for "
+ "phys register failed with %lX\n",
+ __func__, PTR_ERR(ia->ri_bind_mem));
+ rc = -ENOMEM;
+ goto out3;
+ }
+ break;
+ default:
+ printk(KERN_ERR "RPC: Unsupported memory "
+ "registration mode: %d\n", memreg);
+ rc = -ENOMEM;
+ goto out3;
+ }
+ dprintk("RPC: %s: memory registration strategy is '%s'\n",
+ __func__, ia->ri_ops->ro_displayname);
+
+ /* Else will do memory reg/dereg for each chunk */
+ ia->ri_memreg_strategy = memreg;
+
+ rwlock_init(&ia->ri_qplock);
+ return 0;
+
+out3:
+ ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
+out2:
+ rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
+out1:
+ return rc;
+}
+
+/*
+ * Clean up/close an IA.
+ * o if event handles and PD have been initialized, free them.
+ * o close the IA
+ */
+void
+rpcrdma_ia_close(struct rpcrdma_ia *ia)
+{
+ int rc;
+
+ dprintk("RPC: %s: entering\n", __func__);
+ if (ia->ri_bind_mem != NULL) {
+ rc = ib_dereg_mr(ia->ri_bind_mem);
+ dprintk("RPC: %s: ib_dereg_mr returned %i\n",
+ __func__, rc);
+ }
+ if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+ if (ia->ri_id->qp)
+ rdma_destroy_qp(ia->ri_id);
+ rdma_destroy_id(ia->ri_id);
+ ia->ri_id = NULL;
+ }
+ if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
+ rc = ib_dealloc_pd(ia->ri_pd);
+ dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
+ __func__, rc);
+ }
+}
+
+/*
+ * Create unconnected endpoint.
+ */
+int
+rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata)
+{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ struct ib_cq *sendcq, *recvcq;
+ int rc, err;
+
+ /* check provider's send/recv wr limits */
+ if (cdata->max_requests > devattr->max_qp_wr)
+ cdata->max_requests = devattr->max_qp_wr;
+
+ ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+ ep->rep_attr.qp_context = ep;
+ ep->rep_attr.srq = NULL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+ rc = ia->ri_ops->ro_open(ia, ep, cdata);
+ if (rc)
+ return rc;
+ ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_recv_sge = 1;
+ ep->rep_attr.cap.max_inline_data = 0;
+ ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->rep_attr.qp_type = IB_QPT_RC;
+ ep->rep_attr.port_num = ~0;
+
+ if (cdata->padding) {
+ ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
+ GFP_KERNEL);
+ if (IS_ERR(ep->rep_padbuf))
+ return PTR_ERR(ep->rep_padbuf);
+ } else
+ ep->rep_padbuf = NULL;
+
+ dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
+ "iovs: send %d recv %d\n",
+ __func__,
+ ep->rep_attr.cap.max_send_wr,
+ ep->rep_attr.cap.max_recv_wr,
+ ep->rep_attr.cap.max_send_sge,
+ ep->rep_attr.cap.max_recv_sge);
+
+ /* set trigger for requesting send completion */
+ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
+ if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
+ ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
+ else if (ep->rep_cqinit <= 2)
+ ep->rep_cqinit = 0;
+ INIT_CQCOUNT(ep);
+ init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_send_wr + 1, 0);
+ if (IS_ERR(sendcq)) {
+ rc = PTR_ERR(sendcq);
+ dprintk("RPC: %s: failed to create send CQ: %i\n",
+ __func__, rc);
+ goto out1;
+ }
+
+ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_recv_wr + 1, 0);
+ if (IS_ERR(recvcq)) {
+ rc = PTR_ERR(recvcq);
+ dprintk("RPC: %s: failed to create recv CQ: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ ib_destroy_cq(recvcq);
+ goto out2;
+ }
+
+ ep->rep_attr.send_cq = sendcq;
+ ep->rep_attr.recv_cq = recvcq;
+
+ /* Initialize cma parameters */
+
+ /* RPC/RDMA does not use private data */
+ ep->rep_remote_cma.private_data = NULL;
+ ep->rep_remote_cma.private_data_len = 0;
+
+ /* Client offers RDMA Read but does not initiate */
+ ep->rep_remote_cma.initiator_depth = 0;
+ if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ ep->rep_remote_cma.responder_resources = 32;
+ else
+ ep->rep_remote_cma.responder_resources =
+ devattr->max_qp_rd_atom;
+
+ ep->rep_remote_cma.retry_count = 7;
+ ep->rep_remote_cma.flow_control = 0;
+ ep->rep_remote_cma.rnr_retry_count = 0;
+
+ return 0;
+
+out2:
+ err = ib_destroy_cq(sendcq);
+ if (err)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, err);
+out1:
+ rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+ return rc;
+}
+
+/*
+ * rpcrdma_ep_destroy
+ *
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+ */
+void
+rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ int rc;
+
+ dprintk("RPC: %s: entering, connected is %d\n",
+ __func__, ep->rep_connected);
+
+ cancel_delayed_work_sync(&ep->rep_connect_worker);
+
+ if (ia->ri_id->qp) {
+ rpcrdma_ep_disconnect(ep, ia);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
+ }
+
+ rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+
+ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+ rc = ib_destroy_cq(ep->rep_attr.recv_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
+
+ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rc = ib_destroy_cq(ep->rep_attr.send_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
+}
+
+/*
+ * Connect unconnected endpoint.
+ */
+int
+rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ struct rdma_cm_id *id, *old;
+ int rc = 0;
+ int retry_count = 0;
+
+ if (ep->rep_connected != 0) {
+ struct rpcrdma_xprt *xprt;
+retry:
+ dprintk("RPC: %s: reconnecting...\n", __func__);
+
+ rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_flush_cqs(ep);
+
+ xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ ia->ri_ops->ro_reset(xprt);
+
+ id = rpcrdma_create_id(xprt, ia,
+ (struct sockaddr *)&xprt->rx_data.addr);
+ if (IS_ERR(id)) {
+ rc = -EHOSTUNREACH;
+ goto out;
+ }
+ /* TEMP TEMP TEMP - fail if new device:
+ * Deregister/remarshal *all* requests!
+ * Close and recreate adapter, pd, etc!
+ * Re-determine all attributes still sane!
+ * More stuff I haven't thought of!
+ * Rrrgh!
+ */
+ if (ia->ri_id->device != id->device) {
+ printk("RPC: %s: can't reconnect on "
+ "different device!\n", __func__);
+ rdma_destroy_id(id);
+ rc = -ENETUNREACH;
+ goto out;
+ }
+ /* END TEMP */
+ rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ rdma_destroy_id(id);
+ rc = -ENETUNREACH;
+ goto out;
+ }
+
+ write_lock(&ia->ri_qplock);
+ old = ia->ri_id;
+ ia->ri_id = id;
+ write_unlock(&ia->ri_qplock);
+
+ rdma_destroy_qp(old);
+ rdma_destroy_id(old);
+ } else {
+ dprintk("RPC: %s: connecting...\n", __func__);
+ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (rc) {
+ dprintk("RPC: %s: rdma_create_qp failed %i\n",
+ __func__, rc);
+ /* do not update ep->rep_connected */
+ return -ENETUNREACH;
+ }
+ }
+
+ ep->rep_connected = 0;
+
+ rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ if (rc) {
+ dprintk("RPC: %s: rdma_connect() failed with %i\n",
+ __func__, rc);
+ goto out;
+ }
+
+ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
+
+ /*
+ * Check state. A non-peer reject indicates no listener
+ * (ECONNREFUSED), which may be a transient state. All
+ * others indicate a transport condition which has already
+ * undergone a best-effort.
+ */
+ if (ep->rep_connected == -ECONNREFUSED &&
+ ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+ dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
+ goto retry;
+ }
+ if (ep->rep_connected <= 0) {
+ /* Sometimes, the only way to reliably connect to remote
+ * CMs is to use same nonzero values for ORD and IRD. */
+ if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+ (ep->rep_remote_cma.responder_resources == 0 ||
+ ep->rep_remote_cma.initiator_depth !=
+ ep->rep_remote_cma.responder_resources)) {
+ if (ep->rep_remote_cma.responder_resources == 0)
+ ep->rep_remote_cma.responder_resources = 1;
+ ep->rep_remote_cma.initiator_depth =
+ ep->rep_remote_cma.responder_resources;
+ goto retry;
+ }
+ rc = ep->rep_connected;
+ } else {
+ dprintk("RPC: %s: connected\n", __func__);
+ }
+
+out:
+ if (rc)
+ ep->rep_connected = rc;
+ return rc;
+}
+
+/*
+ * rpcrdma_ep_disconnect
+ *
+ * This is separate from destroy to facilitate the ability
+ * to reconnect without recreating the endpoint.
+ *
+ * This call is not reentrant, and must not be made in parallel
+ * on the same endpoint.
+ */
+void
+rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ int rc;
+
+ rpcrdma_flush_cqs(ep);
+ rc = rdma_disconnect(ia->ri_id);
+ if (!rc) {
+ /* returns without wait if not connected */
+ wait_event_interruptible(ep->rep_connect_wait,
+ ep->rep_connected != 1);
+ dprintk("RPC: %s: after wait, %sconnected\n", __func__,
+ (ep->rep_connected == 1) ? "still " : "dis");
+ } else {
+ dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
+ ep->rep_connected = rc;
+ }
+}
+
+static struct rpcrdma_req *
+rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_req *req;
+
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ req->rl_buffer = &r_xprt->rx_buf;
+ return req;
+}
+
+static struct rpcrdma_rep *
+rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_rep *rep;
+ int rc;
+
+ rc = -ENOMEM;
+ rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+ if (rep == NULL)
+ goto out;
+
+ rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
+ GFP_KERNEL);
+ if (IS_ERR(rep->rr_rdmabuf)) {
+ rc = PTR_ERR(rep->rr_rdmabuf);
+ goto out_free;
+ }
+
+ rep->rr_buffer = &r_xprt->rx_buf;
+ return rep;
+
+out_free:
+ kfree(rep);
+out:
+ return ERR_PTR(rc);
+}
+
+int
+rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ char *p;
+ size_t len;
+ int i, rc;
+
+ buf->rb_max_requests = cdata->max_requests;
+ spin_lock_init(&buf->rb_lock);
+
+ /* Need to allocate:
+ * 1. arrays for send and recv pointers
+ * 2. arrays of struct rpcrdma_req to fill in pointers
+ * 3. array of struct rpcrdma_rep for replies
+ * Send/recv buffers in req/rep need to be registered
+ */
+ len = buf->rb_max_requests *
+ (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
+
+ p = kzalloc(len, GFP_KERNEL);
+ if (p == NULL) {
+ dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
+ __func__, len);
+ rc = -ENOMEM;
+ goto out;
+ }
+ buf->rb_pool = p; /* for freeing it later */
+
+ buf->rb_send_bufs = (struct rpcrdma_req **) p;
+ p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
+ buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
+ p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+
+ rc = ia->ri_ops->ro_init(r_xprt);
+ if (rc)
+ goto out;
+
+ for (i = 0; i < buf->rb_max_requests; i++) {
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+ req = rpcrdma_create_req(r_xprt);
+ if (IS_ERR(req)) {
+ dprintk("RPC: %s: request buffer %d alloc"
+ " failed\n", __func__, i);
+ rc = PTR_ERR(req);
+ goto out;
+ }
+ buf->rb_send_bufs[i] = req;
+
+ rep = rpcrdma_create_rep(r_xprt);
+ if (IS_ERR(rep)) {
+ dprintk("RPC: %s: reply buffer %d alloc failed\n",
+ __func__, i);
+ rc = PTR_ERR(rep);
+ goto out;
+ }
+ buf->rb_recv_bufs[i] = rep;
+ }
+
+ return 0;
+out:
+ rpcrdma_buffer_destroy(buf);
+ return rc;
+}
+
+static void
+rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+{
+ if (!rep)
+ return;
+
+ rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+ kfree(rep);
+}
+
+static void
+rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+ if (!req)
+ return;
+
+ rpcrdma_free_regbuf(ia, req->rl_sendbuf);
+ rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+ kfree(req);
+}
+
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ int i;
+
+ /* clean up in reverse order from create
+ * 1. recv mr memory (mr free, then kfree)
+ * 2. send mr memory (mr free, then kfree)
+ * 3. MWs
+ */
+ dprintk("RPC: %s: entering\n", __func__);
+
+ for (i = 0; i < buf->rb_max_requests; i++) {
+ if (buf->rb_recv_bufs)
+ rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
+ if (buf->rb_send_bufs)
+ rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
+ }
+
+ ia->ri_ops->ro_destroy(buf);
+
+ kfree(buf->rb_pool);
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+ if (*mw) {
+ list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+ *mw = NULL;
+ }
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_segments;
+ struct rpcrdma_mr_seg *seg1 = seg;
+ int i;
+
+ for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+ rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
+ rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ buf->rb_send_bufs[--buf->rb_send_index] = req;
+ req->rl_niovs = 0;
+ if (req->rl_reply) {
+ buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+ req->rl_reply->rr_func = NULL;
+ req->rl_reply = NULL;
+ }
+}
+
+/* rpcrdma_unmap_one() was already done during deregistration.
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ia, struct rpcrdma_xprt, rx_ia);
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc;
+
+ dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
+
+ /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+ memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+ invalidate_wr.wr_id = (unsigned long)(void *)r;
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
+ __func__, r, r->r.frmr.fr_mr->rkey);
+
+ read_lock(&ia->ri_qplock);
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ read_unlock(&ia->ri_qplock);
+ if (rc) {
+ /* Force rpcrdma_buffer_get() to retry */
+ r->r.frmr.fr_state = FRMR_IS_STALE;
+ dprintk("RPC: %s: ib_post_send failed, %i\n",
+ __func__, rc);
+ }
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+ struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct list_head *pos;
+ struct rpcrdma_mw *r;
+ unsigned long flags;
+
+ list_for_each(pos, stale) {
+ r = list_entry(pos, struct rpcrdma_mw, mw_list);
+ rpcrdma_retry_local_inv(r, ia);
+ }
+
+ spin_lock_irqsave(&buf->rb_lock, flags);
+ list_splice_tail(stale, &buf->rb_mws);
+ spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+ struct list_head *stale)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+ list_add(&r->mw_list, stale);
+ continue;
+ }
+ req->rl_segments[i].rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
+ }
+
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ req->rl_segments[i].rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
+ }
+
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
+}
+
+/*
+ * Get a set of request/reply buffers.
+ *
+ * Reply buffer (if needed) is attached to send buffer upon return.
+ * Rule:
+ * rb_send_index and rb_recv_index MUST always be pointing to the
+ * *next* available buffer (non-NULL). They are incremented after
+ * removing buffers, and decremented *before* returning them.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+ struct list_head stale;
+ struct rpcrdma_req *req;
+ unsigned long flags;
+
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ if (buffers->rb_send_index == buffers->rb_max_requests) {
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ dprintk("RPC: %s: out of request buffers\n", __func__);
+ return ((struct rpcrdma_req *)NULL);
+ }
+
+ req = buffers->rb_send_bufs[buffers->rb_send_index];
+ if (buffers->rb_send_index < buffers->rb_recv_index) {
+ dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
+ __func__,
+ buffers->rb_recv_index - buffers->rb_send_index);
+ req->rl_reply = NULL;
+ } else {
+ req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+ buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+ }
+ buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+
+ INIT_LIST_HEAD(&stale);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ req = rpcrdma_buffer_get_fmrs(req, buffers);
+ break;
+ default:
+ break;
+ }
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ if (!list_empty(&stale))
+ rpcrdma_retry_flushed_linv(&stale, buffers);
+ return req;
+}
+
+/*
+ * Put request/reply buffers back into pool.
+ * Pre-decrement counter/array index.
+ */
+void
+rpcrdma_buffer_put(struct rpcrdma_req *req)
+{
+ struct rpcrdma_buffer *buffers = req->rl_buffer;
+ struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+ unsigned long flags;
+
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ rpcrdma_buffer_put_sendbuf(req, buffers);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ case RPCRDMA_MTHCAFMR:
+ rpcrdma_buffer_put_mrs(req, buffers);
+ break;
+ default:
+ break;
+ }
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Recover reply buffers from pool.
+ * This happens when recovering from error conditions.
+ * Post-increment counter/array index.
+ */
+void
+rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+{
+ struct rpcrdma_buffer *buffers = req->rl_buffer;
+ unsigned long flags;
+
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ if (buffers->rb_recv_index < buffers->rb_max_requests) {
+ req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+ buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+ }
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Put reply buffers back into pool when not attached to
+ * request. This happens in error conditions.
+ */
+void
+rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+ struct rpcrdma_buffer *buffers = rep->rr_buffer;
+ unsigned long flags;
+
+ rep->rr_func = NULL;
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
+ spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Wrappers for internal-use kmalloc memory registration, used by buffer code.
+ */
+
+void
+rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
+{
+ dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
+ seg->mr_offset,
+ (unsigned long long)seg->mr_dma, seg->mr_dmalen);
+}
+
+static int
+rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
+ struct ib_mr **mrp, struct ib_sge *iov)
+{
+ struct ib_phys_buf ipb;
+ struct ib_mr *mr;
+ int rc;
+
+ /*
+ * All memory passed here was kmalloc'ed, therefore phys-contiguous.
+ */
+ iov->addr = ib_dma_map_single(ia->ri_id->device,
+ va, len, DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+ return -ENOMEM;
+
+ iov->length = len;
+
+ if (ia->ri_have_dma_lkey) {
+ *mrp = NULL;
+ iov->lkey = ia->ri_dma_lkey;
+ return 0;
+ } else if (ia->ri_bind_mem != NULL) {
+ *mrp = NULL;
+ iov->lkey = ia->ri_bind_mem->lkey;
+ return 0;
+ }
+
+ ipb.addr = iov->addr;
+ ipb.size = iov->length;
+ mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
+ IB_ACCESS_LOCAL_WRITE, &iov->addr);
+
+ dprintk("RPC: %s: phys convert: 0x%llx "
+ "registered 0x%llx length %d\n",
+ __func__, (unsigned long long)ipb.addr,
+ (unsigned long long)iov->addr, len);
+
+ if (IS_ERR(mr)) {
+ *mrp = NULL;
+ rc = PTR_ERR(mr);
+ dprintk("RPC: %s: failed with %i\n", __func__, rc);
+ } else {
+ *mrp = mr;
+ iov->lkey = mr->lkey;
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int
+rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
+ struct ib_mr *mr, struct ib_sge *iov)
+{
+ int rc;
+
+ ib_dma_unmap_single(ia->ri_id->device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+
+ if (NULL == mr)
+ return 0;
+
+ rc = ib_dereg_mr(mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
+ return rc;
+}
+
+/**
+ * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
+ * @ia: controlling rpcrdma_ia
+ * @size: size of buffer to be allocated, in bytes
+ * @flags: GFP flags
+ *
+ * Returns pointer to private header of an area of internally
+ * registered memory, or an ERR_PTR. The registered buffer follows
+ * the end of the private header.
+ *
+ * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
+ * receiving the payload of RDMA RECV operations. regbufs are not
+ * used for RDMA READ/WRITE operations, thus are registered only for
+ * LOCAL access.
+ */
+struct rpcrdma_regbuf *
+rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+{
+ struct rpcrdma_regbuf *rb;
+ int rc;
+
+ rc = -ENOMEM;
+ rb = kmalloc(sizeof(*rb) + size, flags);
+ if (rb == NULL)
+ goto out;
+
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
+ rc = rpcrdma_register_internal(ia, rb->rg_base, size,
+ &rb->rg_mr, &rb->rg_iov);
+ if (rc)
+ goto out_free;
+
+ return rb;
+
+out_free:
+ kfree(rb);
+out:
+ return ERR_PTR(rc);
+}
+
+/**
+ * rpcrdma_free_regbuf - deregister and free registered buffer
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be deregistered and freed
+ */
+void
+rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+ if (rb) {
+ rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
+ kfree(rb);
+ }
+}
+
+/*
+ * Prepost any receive buffer, then post send.
+ *
+ * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ */
+int
+rpcrdma_ep_post(struct rpcrdma_ia *ia,
+ struct rpcrdma_ep *ep,
+ struct rpcrdma_req *req)
+{
+ struct ib_send_wr send_wr, *send_wr_fail;
+ struct rpcrdma_rep *rep = req->rl_reply;
+ int rc;
+
+ if (rep) {
+ rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (rc)
+ goto out;
+ req->rl_reply = NULL;
+ }
+
+ send_wr.next = NULL;
+ send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
+ send_wr.sg_list = req->rl_send_iov;
+ send_wr.num_sge = req->rl_niovs;
+ send_wr.opcode = IB_WR_SEND;
+ if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
+ ib_dma_sync_single_for_device(ia->ri_id->device,
+ req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
+ DMA_TO_DEVICE);
+ ib_dma_sync_single_for_device(ia->ri_id->device,
+ req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
+ DMA_TO_DEVICE);
+ ib_dma_sync_single_for_device(ia->ri_id->device,
+ req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
+ DMA_TO_DEVICE);
+
+ if (DECR_CQCOUNT(ep) > 0)
+ send_wr.send_flags = 0;
+ else { /* Provider must take a send completion every now and then */
+ INIT_CQCOUNT(ep);
+ send_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+ if (rc)
+ dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
+ rc);
+out:
+ return rc;
+}
+
+/*
+ * (Re)post a receive buffer.
+ */
+int
+rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+ struct rpcrdma_ep *ep,
+ struct rpcrdma_rep *rep)
+{
+ struct ib_recv_wr recv_wr, *recv_wr_fail;
+ int rc;
+
+ recv_wr.next = NULL;
+ recv_wr.wr_id = (u64) (unsigned long) rep;
+ recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+ recv_wr.num_sge = 1;
+
+ ib_dma_sync_single_for_cpu(ia->ri_id->device,
+ rdmab_addr(rep->rr_rdmabuf),
+ rdmab_length(rep->rr_rdmabuf),
+ DMA_BIDIRECTIONAL);
+
+ rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+
+ if (rc)
+ dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
+ rc);
+ return rc;
+}
+
+/* How many chunk list items fit within our inline buffers?
+ */
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ int bytes, segments;
+
+ bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+ bytes -= RPCRDMA_HDRLEN_MIN;
+ if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+ pr_warn("RPC: %s: inline threshold too small\n",
+ __func__);
+ return 0;
+ }
+
+ segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+ dprintk("RPC: %s: max chunk list size = %d segments\n",
+ __func__, segments);
+ return segments;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000..78e0b8bea
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> /* wait_queue_head_t, etc */
+#include <linux/spinlock.h> /* spinlock_t, etc */
+#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/workqueue.h> /* struct work_struct */
+
+#include <rdma/rdma_cm.h> /* RDMA connection api */
+#include <rdma/ib_verbs.h> /* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> /* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */
+#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */
+
+#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+ const struct rpcrdma_memreg_ops *ri_ops;
+ rwlock_t ri_qplock;
+ struct rdma_cm_id *ri_id;
+ struct ib_pd *ri_pd;
+ struct ib_mr *ri_bind_mem;
+ u32 ri_dma_lkey;
+ int ri_have_dma_lkey;
+ struct completion ri_done;
+ int ri_async_rc;
+ enum rpcrdma_memreg ri_memreg_strategy;
+ unsigned int ri_max_frmr_depth;
+ struct ib_device_attr ri_devattr;
+ struct ib_qp_attr ri_qp_attr;
+ struct ib_qp_init_attr ri_qp_init_attr;
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+#define RPCRDMA_WC_BUDGET (128)
+#define RPCRDMA_POLLSIZE (16)
+
+struct rpcrdma_ep {
+ atomic_t rep_cqcount;
+ int rep_cqinit;
+ int rep_connected;
+ struct ib_qp_init_attr rep_attr;
+ wait_queue_head_t rep_connect_wait;
+ struct rpcrdma_regbuf *rep_padbuf;
+ struct rdma_conn_param rep_remote_cma;
+ struct sockaddr_storage rep_remote_addr;
+ struct delayed_work rep_connect_worker;
+ struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
+ struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
+};
+
+/*
+ * Force a signaled SEND Work Request every so often,
+ * in case the provider needs to do some housekeeping.
+ */
+#define RPCRDMA_MAX_UNSIGNALED_SENDS (32)
+
+#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION (0ULL)
+
+/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
+ *
+ * The below structure appears at the front of a large region of kmalloc'd
+ * memory, which always starts on a good alignment boundary.
+ */
+
+struct rpcrdma_regbuf {
+ size_t rg_size;
+ struct rpcrdma_req *rg_owner;
+ struct ib_mr *rg_mr;
+ struct ib_sge rg_iov;
+ __be32 rg_base[0] __attribute__ ((aligned(256)));
+};
+
+static inline u64
+rdmab_addr(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_iov.addr;
+}
+
+static inline u32
+rdmab_length(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_iov.length;
+}
+
+static inline u32
+rdmab_lkey(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_iov.lkey;
+}
+
+static inline struct rpcrdma_msg *
+rdmab_to_msg(struct rpcrdma_regbuf *rb)
+{
+ return (struct rpcrdma_msg *)rb->rg_base;
+}
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required to recv
+ * and complete a reply, asychronously. It needs several pieces of
+ * state:
+ * o recv buffer (posted to provider)
+ * o ib_sge (also donated to provider)
+ * o status of reply (length, success or not)
+ * o bookkeeping state to get run by tasklet (list, etc)
+ *
+ * These are allocated during initialization, per-transport instance;
+ * however, the tasklet execution list itself is global, as it should
+ * always be pretty short.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ */
+
+/* temporary static scatter/gather max */
+#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+
+struct rpcrdma_buffer;
+
+struct rpcrdma_rep {
+ unsigned int rr_len;
+ struct rpcrdma_buffer *rr_buffer;
+ struct rpc_xprt *rr_xprt;
+ void (*rr_func)(struct rpcrdma_rep *);
+ struct list_head rr_list;
+ struct rpcrdma_regbuf *rr_rdmabuf;
+};
+
+/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (ie, not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+ FRMR_IS_INVALID, /* ready to be used */
+ FRMR_IS_VALID, /* in use */
+ FRMR_IS_STALE, /* failed completion */
+};
+
+struct rpcrdma_frmr {
+ struct ib_fast_reg_page_list *fr_pgl;
+ struct ib_mr *fr_mr;
+ enum rpcrdma_frmr_state fr_state;
+};
+
+struct rpcrdma_mw {
+ union {
+ struct ib_fmr *fmr;
+ struct rpcrdma_frmr frmr;
+ } r;
+ void (*mw_sendcompletion)(struct ib_wc *);
+ struct list_head mw_list;
+ struct list_head mw_all;
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ *
+ * NOTES:
+ * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
+ * marshal. The number needed varies depending on the iov lists that
+ * are passed to us, the memory registration mode we are in, and if
+ * physical addressing is used, the layout.
+ */
+
+struct rpcrdma_mr_seg { /* chunk descriptors */
+ struct rpcrdma_mw *rl_mw; /* registered MR */
+ u64 mr_base; /* registration result */
+ u32 mr_rkey; /* registration result */
+ u32 mr_len; /* length of chunk or segment */
+ int mr_nsegs; /* number of segments in chunk or 0 */
+ enum dma_data_direction mr_dir; /* segment mapping direction */
+ dma_addr_t mr_dma; /* segment mapping address */
+ size_t mr_dmalen; /* segment mapping length */
+ struct page *mr_page; /* owning page, if any */
+ char *mr_offset; /* kva if no page, else offset */
+};
+
+struct rpcrdma_req {
+ unsigned int rl_niovs; /* 0, 2 or 4 */
+ unsigned int rl_nchunks; /* non-zero if chunks */
+ unsigned int rl_connect_cookie; /* retry detection */
+ struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
+ struct ib_sge rl_send_iov[4]; /* for active requests */
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+};
+
+static inline struct rpcrdma_req *
+rpcr_to_rdmar(struct rpc_rqst *rqst)
+{
+ void *buffer = rqst->rq_buffer;
+ struct rpcrdma_regbuf *rb;
+
+ rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
+ return rb->rg_owner;
+}
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+ spinlock_t rb_lock; /* protects indexes */
+ u32 rb_max_requests;/* client max requests */
+ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
+ struct list_head rb_all;
+ int rb_send_index;
+ struct rpcrdma_req **rb_send_bufs;
+ int rb_recv_index;
+ struct rpcrdma_rep **rb_recv_bufs;
+ char *rb_pool;
+};
+#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
+
+/*
+ * Internal structure for transport instance creation. This
+ * exists primarily for modularity.
+ *
+ * This data should be set with mount options
+ */
+struct rpcrdma_create_data_internal {
+ struct sockaddr_storage addr; /* RDMA server address */
+ unsigned int max_requests; /* max requests (slots) in flight */
+ unsigned int rsize; /* mount rsize - max read hdr+data */
+ unsigned int wsize; /* mount wsize - max write hdr+data */
+ unsigned int inline_rsize; /* max non-rdma read data payload */
+ unsigned int inline_wsize; /* max non-rdma write data payload */
+ unsigned int padding; /* non-rdma write header padding */
+};
+
+#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
+ (rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
+
+#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
+ (rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
+
+#define RPCRDMA_INLINE_PAD_VALUE(rq)\
+ rpcx_to_rdmad(rq->rq_xprt).padding
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+ unsigned long read_chunk_count;
+ unsigned long write_chunk_count;
+ unsigned long reply_chunk_count;
+
+ unsigned long long total_rdma_request;
+ unsigned long long total_rdma_reply;
+
+ unsigned long long pullup_copy_count;
+ unsigned long long fixup_copy_count;
+ unsigned long hardway_register_count;
+ unsigned long failed_marshal_count;
+ unsigned long bad_reply_count;
+};
+
+/*
+ * Per-registration mode operations
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_memreg_ops {
+ int (*ro_map)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *, int, bool);
+ int (*ro_unmap)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *);
+ int (*ro_open)(struct rpcrdma_ia *,
+ struct rpcrdma_ep *,
+ struct rpcrdma_create_data_internal *);
+ size_t (*ro_maxpages)(struct rpcrdma_xprt *);
+ int (*ro_init)(struct rpcrdma_xprt *);
+ void (*ro_reset)(struct rpcrdma_xprt *);
+ void (*ro_destroy)(struct rpcrdma_buffer *);
+ const char *ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+ struct rpc_xprt rx_xprt;
+ struct rpcrdma_ia rx_ia;
+ struct rpcrdma_ep rx_ep;
+ struct rpcrdma_buffer rx_buf;
+ struct rpcrdma_create_data_internal rx_data;
+ struct delayed_work rx_connect_worker;
+ struct rpcrdma_stats rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
+#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
+
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
+/*
+ * Interface Adapter calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+void rpcrdma_ia_close(struct rpcrdma_ia *);
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *);
+void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
+ struct rpcrdma_req *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
+ struct rpcrdma_rep *);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+int rpcrdma_buffer_create(struct rpcrdma_xprt *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
+ size_t, gfp_t);
+void rpcrdma_free_regbuf(struct rpcrdma_ia *,
+ struct rpcrdma_regbuf *);
+
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+ return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+ enum dma_data_direction direction)
+{
+ seg->mr_dir = direction;
+ seg->mr_dmalen = seg->mr_len;
+
+ if (seg->mr_page)
+ seg->mr_dma = ib_dma_map_page(device,
+ seg->mr_page, offset_in_page(seg->mr_offset),
+ seg->mr_dmalen, seg->mr_dir);
+ else
+ seg->mr_dma = ib_dma_map_single(device,
+ seg->mr_offset,
+ seg->mr_dmalen, seg->mr_dir);
+
+ if (ib_dma_mapping_error(device, seg->mr_dma))
+ rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+ if (seg->mr_page)
+ ib_dma_unmap_page(device,
+ seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+ else
+ ib_dma_unmap_single(device,
+ seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+/*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
+void rpcrdma_connect_worker(struct work_struct *);
+void rpcrdma_conn_func(struct rpcrdma_ep *);
+void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+int rpcrdma_marshal_req(struct rpc_rqst *);
+
+/* Temporary NFS request map cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_map_cachep;
+/* WR context cache. Created in svc_rdma.c */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+/* Workqueue created in svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
+
+#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
+#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
+#else
+#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
+#endif
+
+#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
new file mode 100644
index 000000000..66891e32c
--- /dev/null
+++ b/net/sunrpc/xprtsock.c
@@ -0,0 +1,3034 @@
+/*
+ * linux/net/sunrpc/xprtsock.c
+ *
+ * Client-side transport implementation for sockets.
+ *
+ * TCP callback races fixes (C) 1998 Red Hat
+ * TCP send fixes (C) 1998 Red Hat
+ * TCP NFS related read + write fixes
+ * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
+ *
+ * Rewrite of larges part of the code in order to stabilize TCP stuff.
+ * Fix behaviour when socket buffer is full.
+ * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
+ *
+ * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005.
+ * <gilles.quillard@bull.net>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/pagemap.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/un.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/file.h>
+#ifdef CONFIG_SUNRPC_BACKCHANNEL
+#include <linux/sunrpc/bc_xprt.h>
+#endif
+
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+#include <trace/events/sunrpc.h>
+
+#include "sunrpc.h"
+
+static void xs_close(struct rpc_xprt *xprt);
+
+/*
+ * xprtsock tunables
+ */
+static unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+static unsigned int xprt_tcp_slot_table_entries = RPC_MIN_SLOT_TABLE;
+static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE;
+
+static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
+static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+
+#define XS_TCP_LINGER_TO (15U * HZ)
+static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
+
+/*
+ * We can register our own files under /proc/sys/sunrpc by
+ * calling register_sysctl_table() again. The files in that
+ * directory become the union of all files registered there.
+ *
+ * We simply need to make sure that we don't collide with
+ * someone else's file names!
+ */
+
+static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
+static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT;
+static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT;
+static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+/*
+ * FIXME: changing the UDP slot table size should also resize the UDP
+ * socket buffers for existing UDP transports
+ */
+static struct ctl_table xs_tunables_table[] = {
+ {
+ .procname = "udp_slot_table_entries",
+ .data = &xprt_udp_slot_table_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_slot_table_size,
+ .extra2 = &max_slot_table_size
+ },
+ {
+ .procname = "tcp_slot_table_entries",
+ .data = &xprt_tcp_slot_table_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_slot_table_size,
+ .extra2 = &max_slot_table_size
+ },
+ {
+ .procname = "tcp_max_slot_table_entries",
+ .data = &xprt_max_tcp_slot_table_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_slot_table_size,
+ .extra2 = &max_tcp_slot_table_limit
+ },
+ {
+ .procname = "min_resvport",
+ .data = &xprt_min_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xprt_min_resvport_limit,
+ .extra2 = &xprt_max_resvport_limit
+ },
+ {
+ .procname = "max_resvport",
+ .data = &xprt_max_resvport,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xprt_min_resvport_limit,
+ .extra2 = &xprt_max_resvport_limit
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &xs_tcp_fin_timeout,
+ .maxlen = sizeof(xs_tcp_fin_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { },
+};
+
+static struct ctl_table sunrpc_table[] = {
+ {
+ .procname = "sunrpc",
+ .mode = 0555,
+ .child = xs_tunables_table
+ },
+ { },
+};
+
+#endif
+
+/*
+ * Wait duration for a reply from the RPC portmapper.
+ */
+#define XS_BIND_TO (60U * HZ)
+
+/*
+ * Delay if a UDP socket connect error occurs. This is most likely some
+ * kind of resource problem on the local host.
+ */
+#define XS_UDP_REEST_TO (2U * HZ)
+
+/*
+ * The reestablish timeout allows clients to delay for a bit before attempting
+ * to reconnect to a server that just dropped our connection.
+ *
+ * We implement an exponential backoff when trying to reestablish a TCP
+ * transport connection with the server. Some servers like to drop a TCP
+ * connection when they are overworked, so we start with a short timeout and
+ * increase over time if the server is down or not responding.
+ */
+#define XS_TCP_INIT_REEST_TO (3U * HZ)
+#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ)
+
+/*
+ * TCP idle timeout; client drops the transport socket if it is idle
+ * for this long. Note that we also timeout UDP sockets to prevent
+ * holding port numbers when there is no RPC traffic.
+ */
+#define XS_IDLE_DISC_TO (5U * 60 * HZ)
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# undef RPC_DEBUG_DATA
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#ifdef RPC_DEBUG_DATA
+static void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+ u8 *buf = (u8 *) packet;
+ int j;
+
+ dprintk("RPC: %s\n", msg);
+ for (j = 0; j < count && j < 128; j += 4) {
+ if (!(j & 31)) {
+ if (j)
+ dprintk("\n");
+ dprintk("0x%04x ", j);
+ }
+ dprintk("%02x%02x%02x%02x ",
+ buf[j], buf[j+1], buf[j+2], buf[j+3]);
+ }
+ dprintk("\n");
+}
+#else
+static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+ /* NOP */
+}
+#endif
+
+static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
+{
+ return (struct rpc_xprt *) sk->sk_user_data;
+}
+
+static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
+{
+ return (struct sockaddr *) &xprt->addr;
+}
+
+static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt)
+{
+ return (struct sockaddr_un *) &xprt->addr;
+}
+
+static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
+{
+ return (struct sockaddr_in *) &xprt->addr;
+}
+
+static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
+{
+ return (struct sockaddr_in6 *) &xprt->addr;
+}
+
+static void xs_format_common_peer_addresses(struct rpc_xprt *xprt)
+{
+ struct sockaddr *sap = xs_addr(xprt);
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ struct sockaddr_un *sun;
+ char buf[128];
+
+ switch (sap->sa_family) {
+ case AF_LOCAL:
+ sun = xs_addr_un(xprt);
+ strlcpy(buf, sun->sun_path, sizeof(buf));
+ xprt->address_strings[RPC_DISPLAY_ADDR] =
+ kstrdup(buf, GFP_KERNEL);
+ break;
+ case AF_INET:
+ (void)rpc_ntop(sap, buf, sizeof(buf));
+ xprt->address_strings[RPC_DISPLAY_ADDR] =
+ kstrdup(buf, GFP_KERNEL);
+ sin = xs_addr_in(xprt);
+ snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+ break;
+ case AF_INET6:
+ (void)rpc_ntop(sap, buf, sizeof(buf));
+ xprt->address_strings[RPC_DISPLAY_ADDR] =
+ kstrdup(buf, GFP_KERNEL);
+ sin6 = xs_addr_in6(xprt);
+ snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+ break;
+ default:
+ BUG();
+ }
+
+ xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+}
+
+static void xs_format_common_peer_ports(struct rpc_xprt *xprt)
+{
+ struct sockaddr *sap = xs_addr(xprt);
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
+ xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+ snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
+ xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+}
+
+static void xs_format_peer_addresses(struct rpc_xprt *xprt,
+ const char *protocol,
+ const char *netid)
+{
+ xprt->address_strings[RPC_DISPLAY_PROTO] = protocol;
+ xprt->address_strings[RPC_DISPLAY_NETID] = netid;
+ xs_format_common_peer_addresses(xprt);
+ xs_format_common_peer_ports(xprt);
+}
+
+static void xs_update_peer_port(struct rpc_xprt *xprt)
+{
+ kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
+ kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+
+ xs_format_common_peer_ports(xprt);
+}
+
+static void xs_free_peer_addresses(struct rpc_xprt *xprt)
+{
+ unsigned int i;
+
+ for (i = 0; i < RPC_DISPLAY_MAX; i++)
+ switch (i) {
+ case RPC_DISPLAY_PROTO:
+ case RPC_DISPLAY_NETID:
+ continue;
+ default:
+ kfree(xprt->address_strings[i]);
+ }
+}
+
+#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
+{
+ struct msghdr msg = {
+ .msg_name = addr,
+ .msg_namelen = addrlen,
+ .msg_flags = XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0),
+ };
+ struct kvec iov = {
+ .iov_base = vec->iov_base + base,
+ .iov_len = vec->iov_len - base,
+ };
+
+ if (iov.iov_len != 0)
+ return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
+ return kernel_sendmsg(sock, &msg, NULL, 0, 0);
+}
+
+static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p)
+{
+ ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags);
+ struct page **ppage;
+ unsigned int remainder;
+ int err;
+
+ remainder = xdr->page_len - base;
+ base += xdr->page_base;
+ ppage = xdr->pages + (base >> PAGE_SHIFT);
+ base &= ~PAGE_MASK;
+ do_sendpage = sock->ops->sendpage;
+ if (!zerocopy)
+ do_sendpage = sock_no_sendpage;
+ for(;;) {
+ unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
+ int flags = XS_SENDMSG_FLAGS;
+
+ remainder -= len;
+ if (remainder != 0 || more)
+ flags |= MSG_MORE;
+ err = do_sendpage(sock, *ppage, base, len, flags);
+ if (remainder == 0 || err != len)
+ break;
+ *sent_p += err;
+ ppage++;
+ base = 0;
+ }
+ if (err > 0) {
+ *sent_p += err;
+ err = 0;
+ }
+ return err;
+}
+
+/**
+ * xs_sendpages - write pages directly to a socket
+ * @sock: socket to send on
+ * @addr: UDP only -- address of destination
+ * @addrlen: UDP only -- length of destination address
+ * @xdr: buffer containing this request
+ * @base: starting position in the buffer
+ * @zerocopy: true if it is safe to use sendpage()
+ * @sent_p: return the total number of bytes successfully queued for sending
+ *
+ */
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p)
+{
+ unsigned int remainder = xdr->len - base;
+ int err = 0;
+ int sent = 0;
+
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+ if (base != 0) {
+ addr = NULL;
+ addrlen = 0;
+ }
+
+ if (base < xdr->head[0].iov_len || addr != NULL) {
+ unsigned int len = xdr->head[0].iov_len - base;
+ remainder -= len;
+ err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else
+ base -= xdr->head[0].iov_len;
+
+ if (base < xdr->page_len) {
+ unsigned int len = xdr->page_len - base;
+ remainder -= len;
+ err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent);
+ *sent_p += sent;
+ if (remainder == 0 || sent != len)
+ goto out;
+ base = 0;
+ } else
+ base -= xdr->page_len;
+
+ if (base >= xdr->tail[0].iov_len)
+ return 0;
+ err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
+out:
+ if (err > 0) {
+ *sent_p += err;
+ err = 0;
+ }
+ return err;
+}
+
+static void xs_nospace_callback(struct rpc_task *task)
+{
+ struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
+
+ transport->inet->sk_write_pending--;
+ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+}
+
+/**
+ * xs_nospace - place task on wait queue if transmit was incomplete
+ * @task: task to put to sleep
+ *
+ */
+static int xs_nospace(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct sock *sk = transport->inet;
+ int ret = -EAGAIN;
+
+ dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
+ task->tk_pid, req->rq_slen - req->rq_bytes_sent,
+ req->rq_slen);
+
+ /* Protect against races with write_space */
+ spin_lock_bh(&xprt->transport_lock);
+
+ /* Don't race with disconnect */
+ if (xprt_connected(xprt)) {
+ if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
+ /*
+ * Notify TCP that we're limited by the application
+ * window size
+ */
+ set_bit(SOCK_NOSPACE, &transport->sock->flags);
+ sk->sk_write_pending++;
+ /* ...and wait for more buffer space */
+ xprt_wait_for_buffer_space(task, xs_nospace_callback);
+ }
+ } else {
+ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ ret = -ENOTCONN;
+ }
+
+ spin_unlock_bh(&xprt->transport_lock);
+
+ /* Race breaker in case memory is freed before above code is called */
+ sk->sk_write_space(sk);
+ return ret;
+}
+
+/*
+ * Construct a stream transport record marker in @buf.
+ */
+static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
+{
+ u32 reclen = buf->len - sizeof(rpc_fraghdr);
+ rpc_fraghdr *base = buf->head[0].iov_base;
+ *base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);
+}
+
+/**
+ * xs_local_send_request - write an RPC request to an AF_LOCAL socket
+ * @task: RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ * 0: The request has been sent
+ * EAGAIN: The socket was blocked, please call again later to
+ * complete the request
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * other: Some other error occured, the request was not sent
+ */
+static int xs_local_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+ struct xdr_buf *xdr = &req->rq_snd_buf;
+ int status;
+ int sent = 0;
+
+ xs_encode_stream_record_marker(&req->rq_snd_buf);
+
+ xs_pktdump("packet data:",
+ req->rq_svec->iov_base, req->rq_svec->iov_len);
+
+ status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent,
+ true, &sent);
+ dprintk("RPC: %s(%u) = %d\n",
+ __func__, xdr->len - req->rq_bytes_sent, status);
+ if (likely(sent > 0) || status == 0) {
+ req->rq_bytes_sent += sent;
+ req->rq_xmit_bytes_sent += sent;
+ if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+ req->rq_bytes_sent = 0;
+ return 0;
+ }
+ status = -EAGAIN;
+ }
+
+ switch (status) {
+ case -ENOBUFS:
+ case -EAGAIN:
+ status = xs_nospace(task);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
+ case -EPIPE:
+ xs_close(xprt);
+ status = -ENOTCONN;
+ }
+
+ return status;
+}
+
+/**
+ * xs_udp_send_request - write an RPC request to a UDP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ * 0: The request has been sent
+ * EAGAIN: The socket was blocked, please call again later to
+ * complete the request
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * other: Some other error occurred, the request was not sent
+ */
+static int xs_udp_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct xdr_buf *xdr = &req->rq_snd_buf;
+ int sent = 0;
+ int status;
+
+ xs_pktdump("packet data:",
+ req->rq_svec->iov_base,
+ req->rq_svec->iov_len);
+
+ if (!xprt_bound(xprt))
+ return -ENOTCONN;
+ status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
+ xdr, req->rq_bytes_sent, true, &sent);
+
+ dprintk("RPC: xs_udp_send_request(%u) = %d\n",
+ xdr->len - req->rq_bytes_sent, status);
+
+ /* firewall is blocking us, don't return -EAGAIN or we end up looping */
+ if (status == -EPERM)
+ goto process_status;
+
+ if (sent > 0 || status == 0) {
+ req->rq_xmit_bytes_sent += sent;
+ if (sent >= req->rq_slen)
+ return 0;
+ /* Still some bytes left; set up for a retry later. */
+ status = -EAGAIN;
+ }
+
+process_status:
+ switch (status) {
+ case -ENOTSOCK:
+ status = -ENOTCONN;
+ /* Should we call xs_close() here? */
+ break;
+ case -EAGAIN:
+ status = xs_nospace(task);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
+ case -ENETUNREACH:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -ECONNREFUSED:
+ case -EPERM:
+ /* When the server has died, an ICMP port unreachable message
+ * prompts ECONNREFUSED. */
+ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ }
+
+ return status;
+}
+
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_RDWR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct socket *sock = transport->sock;
+
+ if (sock != NULL) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ trace_rpc_socket_shutdown(xprt, sock);
+ }
+}
+
+/**
+ * xs_tcp_send_request - write an RPC request to a TCP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ * 0: The request has been sent
+ * EAGAIN: The socket was blocked, please call again later to
+ * complete the request
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * other: Some other error occurred, the request was not sent
+ *
+ * XXX: In the case of soft timeouts, should we eventually give up
+ * if sendmsg is not able to make progress?
+ */
+static int xs_tcp_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct xdr_buf *xdr = &req->rq_snd_buf;
+ bool zerocopy = true;
+ int status;
+ int sent;
+
+ xs_encode_stream_record_marker(&req->rq_snd_buf);
+
+ xs_pktdump("packet data:",
+ req->rq_svec->iov_base,
+ req->rq_svec->iov_len);
+ /* Don't use zero copy if this is a resend. If the RPC call
+ * completes while the socket holds a reference to the pages,
+ * then we may end up resending corrupted data.
+ */
+ if (task->tk_flags & RPC_TASK_SENT)
+ zerocopy = false;
+
+ /* Continue transmitting the packet/record. We must be careful
+ * to cope with writespace callbacks arriving _after_ we have
+ * called sendmsg(). */
+ while (1) {
+ sent = 0;
+ status = xs_sendpages(transport->sock, NULL, 0, xdr,
+ req->rq_bytes_sent, zerocopy, &sent);
+
+ dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
+ xdr->len - req->rq_bytes_sent, status);
+
+ if (unlikely(sent == 0 && status < 0))
+ break;
+
+ /* If we've sent the entire packet, immediately
+ * reset the count of bytes sent. */
+ req->rq_bytes_sent += sent;
+ req->rq_xmit_bytes_sent += sent;
+ if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+ req->rq_bytes_sent = 0;
+ return 0;
+ }
+
+ if (sent != 0)
+ continue;
+ status = -EAGAIN;
+ break;
+ }
+
+ switch (status) {
+ case -ENOTSOCK:
+ status = -ENOTCONN;
+ /* Should we call xs_close() here? */
+ break;
+ case -ENOBUFS:
+ case -EAGAIN:
+ status = xs_nospace(task);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ case -ENOTCONN:
+ case -EADDRINUSE:
+ case -EPIPE:
+ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ }
+
+ return status;
+}
+
+/**
+ * xs_tcp_release_xprt - clean up after a tcp transmission
+ * @xprt: transport
+ * @task: rpc task
+ *
+ * This cleans up if an error causes us to abort the transmission of a request.
+ * In this case, the socket may need to be reset in order to avoid confusing
+ * the server.
+ */
+static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct rpc_rqst *req;
+
+ if (task != xprt->snd_task)
+ return;
+ if (task == NULL)
+ goto out_release;
+ req = task->tk_rqstp;
+ if (req == NULL)
+ goto out_release;
+ if (req->rq_bytes_sent == 0)
+ goto out_release;
+ if (req->rq_bytes_sent == req->rq_snd_buf.len)
+ goto out_release;
+ set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+out_release:
+ xprt_release_xprt(xprt, task);
+}
+
+static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
+{
+ transport->old_data_ready = sk->sk_data_ready;
+ transport->old_state_change = sk->sk_state_change;
+ transport->old_write_space = sk->sk_write_space;
+ transport->old_error_report = sk->sk_error_report;
+}
+
+static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk)
+{
+ sk->sk_data_ready = transport->old_data_ready;
+ sk->sk_state_change = transport->old_state_change;
+ sk->sk_write_space = transport->old_write_space;
+ sk->sk_error_report = transport->old_error_report;
+}
+
+static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
+{
+ smp_mb__before_atomic();
+ clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ clear_bit(XPRT_CLOSING, &xprt->state);
+ smp_mb__after_atomic();
+}
+
+static void xs_sock_mark_closed(struct rpc_xprt *xprt)
+{
+ xs_sock_reset_connection_flags(xprt);
+ /* Mark transport as closed and wake up all pending tasks */
+ xprt_disconnect_done(xprt);
+}
+
+/**
+ * xs_error_report - callback to handle TCP socket state errors
+ * @sk: socket
+ *
+ * Note: we don't call sock_error() since there may be a rpc_task
+ * using the socket, and so we don't want to clear sk->sk_err.
+ */
+static void xs_error_report(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+ int err;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+
+ err = -sk->sk_err;
+ if (err == 0)
+ goto out;
+ /* Is this a reset event? */
+ if (sk->sk_state == TCP_CLOSE)
+ xs_sock_mark_closed(xprt);
+ dprintk("RPC: xs_error_report client %p, error=%d...\n",
+ xprt, -err);
+ trace_rpc_socket_error(xprt, sk->sk_socket, err);
+ xprt_wake_pending_tasks(xprt, err);
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void xs_reset_transport(struct sock_xprt *transport)
+{
+ struct socket *sock = transport->sock;
+ struct sock *sk = transport->inet;
+ struct rpc_xprt *xprt = &transport->xprt;
+
+ if (sk == NULL)
+ return;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ transport->inet = NULL;
+ transport->sock = NULL;
+
+ sk->sk_user_data = NULL;
+
+ xs_restore_old_callbacks(transport, sk);
+ write_unlock_bh(&sk->sk_callback_lock);
+ xs_sock_reset_connection_flags(xprt);
+
+ trace_rpc_socket_close(xprt, sock);
+ sock_release(sock);
+}
+
+/**
+ * xs_close - close a socket
+ * @xprt: transport
+ *
+ * This is used when all requests are complete; ie, no DRC state remains
+ * on the server we want to save.
+ *
+ * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with
+ * xs_reset_transport() zeroing the socket from underneath a writer.
+ */
+static void xs_close(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ dprintk("RPC: xs_close xprt %p\n", xprt);
+
+ xs_reset_transport(transport);
+ xprt->reestablish_timeout = 0;
+
+ xprt_disconnect_done(xprt);
+}
+
+static void xs_xprt_free(struct rpc_xprt *xprt)
+{
+ xs_free_peer_addresses(xprt);
+ xprt_free(xprt);
+}
+
+/**
+ * xs_destroy - prepare to shutdown a transport
+ * @xprt: doomed transport
+ *
+ */
+static void xs_destroy(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: xs_destroy xprt %p\n", xprt);
+
+ xs_close(xprt);
+ xs_xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
+static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+ struct xdr_skb_reader desc = {
+ .skb = skb,
+ .offset = sizeof(rpc_fraghdr),
+ .count = skb->len - sizeof(rpc_fraghdr),
+ };
+
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
+ return -1;
+ if (desc.count)
+ return -1;
+ return 0;
+}
+
+/**
+ * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ * Currently this assumes we can read the whole reply in a single gulp.
+ */
+static void xs_local_data_ready(struct sock *sk)
+{
+ struct rpc_task *task;
+ struct rpc_xprt *xprt;
+ struct rpc_rqst *rovr;
+ struct sk_buff *skb;
+ int err, repsize, copied;
+ u32 _xid;
+ __be32 *xp;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ dprintk("RPC: %s...\n", __func__);
+ xprt = xprt_from_sock(sk);
+ if (xprt == NULL)
+ goto out;
+
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ if (skb == NULL)
+ goto out;
+
+ repsize = skb->len - sizeof(rpc_fraghdr);
+ if (repsize < 4) {
+ dprintk("RPC: impossible RPC reply size %d\n", repsize);
+ goto dropit;
+ }
+
+ /* Copy the XID from the skb... */
+ xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
+ if (xp == NULL)
+ goto dropit;
+
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ rovr = xprt_lookup_rqst(xprt, *xp);
+ if (!rovr)
+ goto out_unlock;
+ task = rovr->rq_task;
+
+ copied = rovr->rq_private_buf.buflen;
+ if (copied > repsize)
+ copied = repsize;
+
+ if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
+ dprintk("RPC: sk_buff copy failed\n");
+ goto out_unlock;
+ }
+
+ xprt_complete_rqst(task, copied);
+
+ out_unlock:
+ spin_unlock(&xprt->transport_lock);
+ dropit:
+ skb_free_datagram(sk, skb);
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_udp_data_ready - "data ready" callback for UDP sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ */
+static void xs_udp_data_ready(struct sock *sk)
+{
+ struct rpc_task *task;
+ struct rpc_xprt *xprt;
+ struct rpc_rqst *rovr;
+ struct sk_buff *skb;
+ int err, repsize, copied;
+ u32 _xid;
+ __be32 *xp;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ dprintk("RPC: xs_udp_data_ready...\n");
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+
+ if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
+ goto out;
+
+ repsize = skb->len - sizeof(struct udphdr);
+ if (repsize < 4) {
+ dprintk("RPC: impossible RPC reply size %d!\n", repsize);
+ goto dropit;
+ }
+
+ /* Copy the XID from the skb... */
+ xp = skb_header_pointer(skb, sizeof(struct udphdr),
+ sizeof(_xid), &_xid);
+ if (xp == NULL)
+ goto dropit;
+
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ rovr = xprt_lookup_rqst(xprt, *xp);
+ if (!rovr)
+ goto out_unlock;
+ task = rovr->rq_task;
+
+ if ((copied = rovr->rq_private_buf.buflen) > repsize)
+ copied = repsize;
+
+ /* Suck it into the iovec, verify checksum if not done by hw. */
+ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
+ UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS);
+ goto out_unlock;
+ }
+
+ UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS);
+
+ xprt_adjust_cwnd(xprt, task, copied);
+ xprt_complete_rqst(task, copied);
+
+ out_unlock:
+ spin_unlock(&xprt->transport_lock);
+ dropit:
+ skb_free_datagram(sk, skb);
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Helper function to force a TCP close if the server is sending
+ * junk and/or it has put us in CLOSE_WAIT
+ */
+static void xs_tcp_force_close(struct rpc_xprt *xprt)
+{
+ xprt_force_disconnect(xprt);
+}
+
+static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ size_t len, used;
+ char *p;
+
+ p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
+ len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
+ used = xdr_skb_read_bits(desc, p, len);
+ transport->tcp_offset += used;
+ if (used != len)
+ return;
+
+ transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
+ if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
+ transport->tcp_flags |= TCP_RCV_LAST_FRAG;
+ else
+ transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
+ transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
+
+ transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
+ transport->tcp_offset = 0;
+
+ /* Sanity check of the record length */
+ if (unlikely(transport->tcp_reclen < 8)) {
+ dprintk("RPC: invalid TCP record fragment length\n");
+ xs_tcp_force_close(xprt);
+ return;
+ }
+ dprintk("RPC: reading TCP record fragment of length %d\n",
+ transport->tcp_reclen);
+}
+
+static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
+{
+ if (transport->tcp_offset == transport->tcp_reclen) {
+ transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
+ transport->tcp_offset = 0;
+ if (transport->tcp_flags & TCP_RCV_LAST_FRAG) {
+ transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+ transport->tcp_flags |= TCP_RCV_COPY_XID;
+ transport->tcp_copied = 0;
+ }
+ }
+}
+
+static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
+{
+ size_t len, used;
+ char *p;
+
+ len = sizeof(transport->tcp_xid) - transport->tcp_offset;
+ dprintk("RPC: reading XID (%Zu bytes)\n", len);
+ p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
+ used = xdr_skb_read_bits(desc, p, len);
+ transport->tcp_offset += used;
+ if (used != len)
+ return;
+ transport->tcp_flags &= ~TCP_RCV_COPY_XID;
+ transport->tcp_flags |= TCP_RCV_READ_CALLDIR;
+ transport->tcp_copied = 4;
+ dprintk("RPC: reading %s XID %08x\n",
+ (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for"
+ : "request with",
+ ntohl(transport->tcp_xid));
+ xs_tcp_check_fraghdr(transport);
+}
+
+static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
+ struct xdr_skb_reader *desc)
+{
+ size_t len, used;
+ u32 offset;
+ char *p;
+
+ /*
+ * We want transport->tcp_offset to be 8 at the end of this routine
+ * (4 bytes for the xid and 4 bytes for the call/reply flag).
+ * When this function is called for the first time,
+ * transport->tcp_offset is 4 (after having already read the xid).
+ */
+ offset = transport->tcp_offset - sizeof(transport->tcp_xid);
+ len = sizeof(transport->tcp_calldir) - offset;
+ dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len);
+ p = ((char *) &transport->tcp_calldir) + offset;
+ used = xdr_skb_read_bits(desc, p, len);
+ transport->tcp_offset += used;
+ if (used != len)
+ return;
+ transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR;
+ /*
+ * We don't yet have the XDR buffer, so we will write the calldir
+ * out after we get the buffer from the 'struct rpc_rqst'
+ */
+ switch (ntohl(transport->tcp_calldir)) {
+ case RPC_REPLY:
+ transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
+ transport->tcp_flags |= TCP_RCV_COPY_DATA;
+ transport->tcp_flags |= TCP_RPC_REPLY;
+ break;
+ case RPC_CALL:
+ transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
+ transport->tcp_flags |= TCP_RCV_COPY_DATA;
+ transport->tcp_flags &= ~TCP_RPC_REPLY;
+ break;
+ default:
+ dprintk("RPC: invalid request message type\n");
+ xs_tcp_force_close(&transport->xprt);
+ }
+ xs_tcp_check_fraghdr(transport);
+}
+
+static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc,
+ struct rpc_rqst *req)
+{
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+ struct xdr_buf *rcvbuf;
+ size_t len;
+ ssize_t r;
+
+ rcvbuf = &req->rq_private_buf;
+
+ if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) {
+ /*
+ * Save the RPC direction in the XDR buffer
+ */
+ memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied,
+ &transport->tcp_calldir,
+ sizeof(transport->tcp_calldir));
+ transport->tcp_copied += sizeof(transport->tcp_calldir);
+ transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR;
+ }
+
+ len = desc->count;
+ if (len > transport->tcp_reclen - transport->tcp_offset) {
+ struct xdr_skb_reader my_desc;
+
+ len = transport->tcp_reclen - transport->tcp_offset;
+ memcpy(&my_desc, desc, sizeof(my_desc));
+ my_desc.count = len;
+ r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+ &my_desc, xdr_skb_read_bits);
+ desc->count -= r;
+ desc->offset += r;
+ } else
+ r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+ desc, xdr_skb_read_bits);
+
+ if (r > 0) {
+ transport->tcp_copied += r;
+ transport->tcp_offset += r;
+ }
+ if (r != len) {
+ /* Error when copying to the receive buffer,
+ * usually because we weren't able to allocate
+ * additional buffer pages. All we can do now
+ * is turn off TCP_RCV_COPY_DATA, so the request
+ * will not receive any additional updates,
+ * and time out.
+ * Any remaining data from this record will
+ * be discarded.
+ */
+ transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+ dprintk("RPC: XID %08x truncated request\n",
+ ntohl(transport->tcp_xid));
+ dprintk("RPC: xprt = %p, tcp_copied = %lu, "
+ "tcp_offset = %u, tcp_reclen = %u\n",
+ xprt, transport->tcp_copied,
+ transport->tcp_offset, transport->tcp_reclen);
+ return;
+ }
+
+ dprintk("RPC: XID %08x read %Zd bytes\n",
+ ntohl(transport->tcp_xid), r);
+ dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
+ "tcp_reclen = %u\n", xprt, transport->tcp_copied,
+ transport->tcp_offset, transport->tcp_reclen);
+
+ if (transport->tcp_copied == req->rq_private_buf.buflen)
+ transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+ else if (transport->tcp_offset == transport->tcp_reclen) {
+ if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
+ transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+ }
+}
+
+/*
+ * Finds the request corresponding to the RPC xid and invokes the common
+ * tcp read code to read the data.
+ */
+static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc)
+{
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+ struct rpc_rqst *req;
+
+ dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
+
+ /* Find and lock the request corresponding to this xid */
+ spin_lock(&xprt->transport_lock);
+ req = xprt_lookup_rqst(xprt, transport->tcp_xid);
+ if (!req) {
+ dprintk("RPC: XID %08x request not found!\n",
+ ntohl(transport->tcp_xid));
+ spin_unlock(&xprt->transport_lock);
+ return -1;
+ }
+
+ xs_tcp_read_common(xprt, desc, req);
+
+ if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+ xprt_complete_rqst(req->rq_task, transport->tcp_copied);
+
+ spin_unlock(&xprt->transport_lock);
+ return 0;
+}
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/*
+ * Obtains an rpc_rqst previously allocated and invokes the common
+ * tcp read code to read the data. The result is placed in the callback
+ * queue.
+ * If we're unable to obtain the rpc_rqst we schedule the closing of the
+ * connection and return -1.
+ */
+static int xs_tcp_read_callback(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc)
+{
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+ struct rpc_rqst *req;
+
+ /* Look up and lock the request corresponding to the given XID */
+ spin_lock(&xprt->transport_lock);
+ req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
+ if (req == NULL) {
+ spin_unlock(&xprt->transport_lock);
+ printk(KERN_WARNING "Callback slot table overflowed\n");
+ xprt_force_disconnect(xprt);
+ return -1;
+ }
+
+ dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
+ xs_tcp_read_common(xprt, desc, req);
+
+ if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+ xprt_complete_bc_request(req, transport->tcp_copied);
+ spin_unlock(&xprt->transport_lock);
+
+ return 0;
+}
+
+static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc)
+{
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+
+ return (transport->tcp_flags & TCP_RPC_REPLY) ?
+ xs_tcp_read_reply(xprt, desc) :
+ xs_tcp_read_callback(xprt, desc);
+}
+#else
+static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc)
+{
+ return xs_tcp_read_reply(xprt, desc);
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+/*
+ * Read data off the transport. This can be either an RPC_CALL or an
+ * RPC_REPLY. Relay the processing to helper functions.
+ */
+static void xs_tcp_read_data(struct rpc_xprt *xprt,
+ struct xdr_skb_reader *desc)
+{
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+
+ if (_xs_tcp_read_data(xprt, desc) == 0)
+ xs_tcp_check_fraghdr(transport);
+ else {
+ /*
+ * The transport_lock protects the request handling.
+ * There's no need to hold it to update the tcp_flags.
+ */
+ transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+ }
+}
+
+static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
+{
+ size_t len;
+
+ len = transport->tcp_reclen - transport->tcp_offset;
+ if (len > desc->count)
+ len = desc->count;
+ desc->count -= len;
+ desc->offset += len;
+ transport->tcp_offset += len;
+ dprintk("RPC: discarded %Zu bytes\n", len);
+ xs_tcp_check_fraghdr(transport);
+}
+
+static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
+{
+ struct rpc_xprt *xprt = rd_desc->arg.data;
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct xdr_skb_reader desc = {
+ .skb = skb,
+ .offset = offset,
+ .count = len,
+ };
+
+ dprintk("RPC: xs_tcp_data_recv started\n");
+ do {
+ trace_xs_tcp_data_recv(transport);
+ /* Read in a new fragment marker if necessary */
+ /* Can we ever really expect to get completely empty fragments? */
+ if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
+ xs_tcp_read_fraghdr(xprt, &desc);
+ continue;
+ }
+ /* Read in the xid if necessary */
+ if (transport->tcp_flags & TCP_RCV_COPY_XID) {
+ xs_tcp_read_xid(transport, &desc);
+ continue;
+ }
+ /* Read in the call/reply flag */
+ if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) {
+ xs_tcp_read_calldir(transport, &desc);
+ continue;
+ }
+ /* Read in the request data */
+ if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
+ xs_tcp_read_data(xprt, &desc);
+ continue;
+ }
+ /* Skip over any trailing bytes on short reads */
+ xs_tcp_read_discard(transport, &desc);
+ } while (desc.count);
+ trace_xs_tcp_data_recv(transport);
+ dprintk("RPC: xs_tcp_data_recv done\n");
+ return len - desc.count;
+}
+
+/**
+ * xs_tcp_data_ready - "data ready" callback for TCP sockets
+ * @sk: socket with data to read
+ * @bytes: how much data to read
+ *
+ */
+static void xs_tcp_data_ready(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+ read_descriptor_t rd_desc;
+ int read;
+ unsigned long total = 0;
+
+ dprintk("RPC: xs_tcp_data_ready...\n");
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk))) {
+ read = 0;
+ goto out;
+ }
+ /* Any data means we had a useful conversation, so
+ * the we don't need to delay the next reconnect
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
+
+ /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+ rd_desc.arg.data = xprt;
+ do {
+ rd_desc.count = 65536;
+ read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+ if (read > 0)
+ total += read;
+ } while (read > 0);
+out:
+ trace_xs_tcp_data_ready(xprt, read, total);
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_tcp_state_change - callback to handle TCP socket state changes
+ * @sk: socket whose state has changed
+ *
+ */
+static void xs_tcp_state_change(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+ dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
+ dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n",
+ sk->sk_state, xprt_connected(xprt),
+ sock_flag(sk, SOCK_DEAD),
+ sock_flag(sk, SOCK_ZAPPED),
+ sk->sk_shutdown);
+
+ trace_rpc_socket_state_change(xprt, sk->sk_socket);
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ spin_lock(&xprt->transport_lock);
+ if (!xprt_test_and_set_connected(xprt)) {
+ struct sock_xprt *transport = container_of(xprt,
+ struct sock_xprt, xprt);
+
+ /* Reset TCP record info */
+ transport->tcp_offset = 0;
+ transport->tcp_reclen = 0;
+ transport->tcp_copied = 0;
+ transport->tcp_flags =
+ TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
+ xprt->connect_cookie++;
+
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
+ }
+ spin_unlock(&xprt->transport_lock);
+ break;
+ case TCP_FIN_WAIT1:
+ /* The client initiated a shutdown of the socket */
+ xprt->connect_cookie++;
+ xprt->reestablish_timeout = 0;
+ set_bit(XPRT_CLOSING, &xprt->state);
+ smp_mb__before_atomic();
+ clear_bit(XPRT_CONNECTED, &xprt->state);
+ clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ smp_mb__after_atomic();
+ break;
+ case TCP_CLOSE_WAIT:
+ /* The server initiated a shutdown of the socket */
+ xprt->connect_cookie++;
+ clear_bit(XPRT_CONNECTED, &xprt->state);
+ xs_tcp_force_close(xprt);
+ case TCP_CLOSING:
+ /*
+ * If the server closed down the connection, make sure that
+ * we back off before reconnecting
+ */
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ break;
+ case TCP_LAST_ACK:
+ set_bit(XPRT_CLOSING, &xprt->state);
+ smp_mb__before_atomic();
+ clear_bit(XPRT_CONNECTED, &xprt->state);
+ smp_mb__after_atomic();
+ break;
+ case TCP_CLOSE:
+ xs_sock_mark_closed(xprt);
+ }
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void xs_write_space(struct sock *sk)
+{
+ struct socket *sock;
+ struct rpc_xprt *xprt;
+
+ if (unlikely(!(sock = sk->sk_socket)))
+ return;
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+
+ if (unlikely(!(xprt = xprt_from_sock(sk))))
+ return;
+ if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
+ return;
+
+ xprt_write_space(xprt);
+}
+
+/**
+ * xs_udp_write_space - callback invoked when socket buffer space
+ * becomes available
+ * @sk: socket whose state has changed
+ *
+ * Called when more output buffer space is available for this socket.
+ * We try not to wake our writers until they can make "significant"
+ * progress, otherwise we'll waste resources thrashing kernel_sendmsg
+ * with a bunch of small requests.
+ */
+static void xs_udp_write_space(struct sock *sk)
+{
+ read_lock_bh(&sk->sk_callback_lock);
+
+ /* from net/core/sock.c:sock_def_write_space */
+ if (sock_writeable(sk))
+ xs_write_space(sk);
+
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_tcp_write_space - callback invoked when socket buffer space
+ * becomes available
+ * @sk: socket whose state has changed
+ *
+ * Called when more output buffer space is available for this socket.
+ * We try not to wake our writers until they can make "significant"
+ * progress, otherwise we'll waste resources thrashing kernel_sendmsg
+ * with a bunch of small requests.
+ */
+static void xs_tcp_write_space(struct sock *sk)
+{
+ read_lock_bh(&sk->sk_callback_lock);
+
+ /* from net/core/stream.c:sk_stream_write_space */
+ if (sk_stream_is_writeable(sk))
+ xs_write_space(sk);
+
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct sock *sk = transport->inet;
+
+ if (transport->rcvsize) {
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2;
+ }
+ if (transport->sndsize) {
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2;
+ sk->sk_write_space(sk);
+ }
+}
+
+/**
+ * xs_udp_set_buffer_size - set send and receive limits
+ * @xprt: generic transport
+ * @sndsize: requested size of send buffer, in bytes
+ * @rcvsize: requested size of receive buffer, in bytes
+ *
+ * Set socket send and receive buffer size limits.
+ */
+static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ transport->sndsize = 0;
+ if (sndsize)
+ transport->sndsize = sndsize + 1024;
+ transport->rcvsize = 0;
+ if (rcvsize)
+ transport->rcvsize = rcvsize + 1024;
+
+ xs_udp_do_set_buffer_size(xprt);
+}
+
+/**
+ * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport
+ * @task: task that timed out
+ *
+ * Adjust the congestion window after a retransmit timeout has occurred.
+ */
+static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
+}
+
+static unsigned short xs_get_random_port(void)
+{
+ unsigned short range = xprt_max_resvport - xprt_min_resvport;
+ unsigned short rand = (unsigned short) prandom_u32() % range;
+ return rand + xprt_min_resvport;
+}
+
+/**
+ * xs_set_reuseaddr_port - set the socket's port and address reuse options
+ * @sock: socket
+ *
+ * Note that this function has to be called on all sockets that share the
+ * same port, and it must be called before binding.
+ */
+static void xs_sock_set_reuseport(struct socket *sock)
+{
+ int opt = 1;
+
+ kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
+ (char *)&opt, sizeof(opt));
+}
+
+static unsigned short xs_sock_getport(struct socket *sock)
+{
+ struct sockaddr_storage buf;
+ int buflen;
+ unsigned short port = 0;
+
+ if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0)
+ goto out;
+ switch (buf.ss_family) {
+ case AF_INET6:
+ port = ntohs(((struct sockaddr_in6 *)&buf)->sin6_port);
+ break;
+ case AF_INET:
+ port = ntohs(((struct sockaddr_in *)&buf)->sin_port);
+ }
+out:
+ return port;
+}
+
+/**
+ * xs_set_port - reset the port number in the remote endpoint address
+ * @xprt: generic transport
+ * @port: new port number
+ *
+ */
+static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+ dprintk("RPC: setting port for xprt %p to %u\n", xprt, port);
+
+ rpc_set_port(xs_addr(xprt), port);
+ xs_update_peer_port(xprt);
+}
+
+static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
+{
+ if (transport->srcport == 0)
+ transport->srcport = xs_sock_getport(sock);
+}
+
+static unsigned short xs_get_srcport(struct sock_xprt *transport)
+{
+ unsigned short port = transport->srcport;
+
+ if (port == 0 && transport->xprt.resvport)
+ port = xs_get_random_port();
+ return port;
+}
+
+static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
+{
+ if (transport->srcport != 0)
+ transport->srcport = 0;
+ if (!transport->xprt.resvport)
+ return 0;
+ if (port <= xprt_min_resvport || port > xprt_max_resvport)
+ return xprt_max_resvport;
+ return --port;
+}
+static int xs_bind(struct sock_xprt *transport, struct socket *sock)
+{
+ struct sockaddr_storage myaddr;
+ int err, nloop = 0;
+ unsigned short port = xs_get_srcport(transport);
+ unsigned short last;
+
+ /*
+ * If we are asking for any ephemeral port (i.e. port == 0 &&
+ * transport->xprt.resvport == 0), don't bind. Let the local
+ * port selection happen implicitly when the socket is used
+ * (for example at connect time).
+ *
+ * This ensures that we can continue to establish TCP
+ * connections even when all local ephemeral ports are already
+ * a part of some TCP connection. This makes no difference
+ * for UDP sockets, but also doens't harm them.
+ *
+ * If we're asking for any reserved port (i.e. port == 0 &&
+ * transport->xprt.resvport == 1) xs_get_srcport above will
+ * ensure that port is non-zero and we will bind as needed.
+ */
+ if (port == 0)
+ return 0;
+
+ memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
+ do {
+ rpc_set_port((struct sockaddr *)&myaddr, port);
+ err = kernel_bind(sock, (struct sockaddr *)&myaddr,
+ transport->xprt.addrlen);
+ if (err == 0) {
+ transport->srcport = port;
+ break;
+ }
+ last = port;
+ port = xs_next_srcport(transport, port);
+ if (port > last)
+ nloop++;
+ } while (err == -EADDRINUSE && nloop != 2);
+
+ if (myaddr.ss_family == AF_INET)
+ dprintk("RPC: %s %pI4:%u: %s (%d)\n", __func__,
+ &((struct sockaddr_in *)&myaddr)->sin_addr,
+ port, err ? "failed" : "ok", err);
+ else
+ dprintk("RPC: %s %pI6:%u: %s (%d)\n", __func__,
+ &((struct sockaddr_in6 *)&myaddr)->sin6_addr,
+ port, err ? "failed" : "ok", err);
+ return err;
+}
+
+/*
+ * We don't support autobind on AF_LOCAL sockets
+ */
+static void xs_local_rpcbind(struct rpc_task *task)
+{
+ rcu_read_lock();
+ xprt_set_bound(rcu_dereference(task->tk_client->cl_xprt));
+ rcu_read_unlock();
+}
+
+static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key xs_key[2];
+static struct lock_class_key xs_slock_key[2];
+
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC",
+ &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]);
+}
+
+static inline void xs_reclassify_socket4(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC",
+ &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]);
+}
+
+static inline void xs_reclassify_socket6(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
+ &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
+}
+
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+ WARN_ON_ONCE(sock_owned_by_user(sock->sk));
+ if (sock_owned_by_user(sock->sk))
+ return;
+
+ switch (family) {
+ case AF_LOCAL:
+ xs_reclassify_socketu(sock);
+ break;
+ case AF_INET:
+ xs_reclassify_socket4(sock);
+ break;
+ case AF_INET6:
+ xs_reclassify_socket6(sock);
+ break;
+ }
+}
+#else
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket6(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+}
+#endif
+
+static void xs_dummy_setup_socket(struct work_struct *work)
+{
+}
+
+static struct socket *xs_create_sock(struct rpc_xprt *xprt,
+ struct sock_xprt *transport, int family, int type,
+ int protocol, bool reuseport)
+{
+ struct socket *sock;
+ int err;
+
+ err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1);
+ if (err < 0) {
+ dprintk("RPC: can't create %d transport socket (%d).\n",
+ protocol, -err);
+ goto out;
+ }
+ xs_reclassify_socket(family, sock);
+
+ if (reuseport)
+ xs_sock_set_reuseport(sock);
+
+ err = xs_bind(transport, sock);
+ if (err) {
+ sock_release(sock);
+ goto out;
+ }
+
+ return sock;
+out:
+ return ERR_PTR(err);
+}
+
+static int xs_local_finish_connecting(struct rpc_xprt *xprt,
+ struct socket *sock)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+
+ if (!transport->inet) {
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ xs_save_old_callbacks(transport, sk);
+
+ sk->sk_user_data = xprt;
+ sk->sk_data_ready = xs_local_data_ready;
+ sk->sk_write_space = xs_udp_write_space;
+ sk->sk_error_report = xs_error_report;
+ sk->sk_allocation = GFP_ATOMIC;
+
+ xprt_clear_connected(xprt);
+
+ /* Reset to new socket */
+ transport->sock = sock;
+ transport->inet = sk;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+
+ /* Tell the socket layer to start connecting... */
+ xprt->stat.connect_count++;
+ xprt->stat.connect_start = jiffies;
+ return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
+}
+
+/**
+ * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
+ */
+static int xs_local_setup_socket(struct sock_xprt *transport)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct socket *sock;
+ int status = -EIO;
+
+ status = __sock_create(xprt->xprt_net, AF_LOCAL,
+ SOCK_STREAM, 0, &sock, 1);
+ if (status < 0) {
+ dprintk("RPC: can't create AF_LOCAL "
+ "transport socket (%d).\n", -status);
+ goto out;
+ }
+ xs_reclassify_socketu(sock);
+
+ dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n",
+ xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ status = xs_local_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, status);
+ switch (status) {
+ case 0:
+ dprintk("RPC: xprt %p connected to %s\n",
+ xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ xprt_set_connected(xprt);
+ case -ENOBUFS:
+ break;
+ case -ENOENT:
+ dprintk("RPC: xprt %p: socket %s does not exist\n",
+ xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ break;
+ case -ECONNREFUSED:
+ dprintk("RPC: xprt %p: connection refused for %s\n",
+ xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+ break;
+ default:
+ printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n",
+ __func__, -status,
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+ }
+
+out:
+ xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
+ return status;
+}
+
+static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ int ret;
+
+ if (RPC_IS_ASYNC(task)) {
+ /*
+ * We want the AF_LOCAL connect to be resolved in the
+ * filesystem namespace of the process making the rpc
+ * call. Thus we connect synchronously.
+ *
+ * If we want to support asynchronous AF_LOCAL calls,
+ * we'll need to figure out how to pass a namespace to
+ * connect.
+ */
+ rpc_exit(task, -ENOTCONN);
+ return;
+ }
+ ret = xs_local_setup_socket(transport);
+ if (ret && !RPC_IS_SOFTCONN(task))
+ msleep_interruptible(15000);
+}
+
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+
+ if (xprt->swapper)
+ sk_set_memalloc(transport->inet);
+}
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+ xprt);
+ int err = 0;
+
+ if (enable) {
+ xprt->swapper++;
+ xs_set_memalloc(xprt);
+ } else if (xprt->swapper) {
+ xprt->swapper--;
+ sk_clear_memalloc(transport->inet);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+}
+#endif
+
+static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ if (!transport->inet) {
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ xs_save_old_callbacks(transport, sk);
+
+ sk->sk_user_data = xprt;
+ sk->sk_data_ready = xs_udp_data_ready;
+ sk->sk_write_space = xs_udp_write_space;
+ sk->sk_allocation = GFP_ATOMIC;
+
+ xprt_set_connected(xprt);
+
+ /* Reset to new socket */
+ transport->sock = sock;
+ transport->inet = sk;
+
+ xs_set_memalloc(xprt);
+
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ xs_udp_do_set_buffer_size(xprt);
+}
+
+static void xs_udp_setup_socket(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, connect_worker.work);
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct socket *sock = transport->sock;
+ int status = -EIO;
+
+ sock = xs_create_sock(xprt, transport,
+ xs_addr(xprt)->sa_family, SOCK_DGRAM,
+ IPPROTO_UDP, false);
+ if (IS_ERR(sock))
+ goto out;
+
+ dprintk("RPC: worker connecting xprt %p via %s to "
+ "%s (port %s)\n", xprt,
+ xprt->address_strings[RPC_DISPLAY_PROTO],
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
+
+ xs_udp_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, 0);
+ status = 0;
+out:
+ xprt_unlock_connect(xprt, transport);
+ xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
+}
+
+static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ int ret = -ENOTCONN;
+
+ if (!transport->inet) {
+ struct sock *sk = sock->sk;
+ unsigned int keepidle = xprt->timeout->to_initval / HZ;
+ unsigned int keepcnt = xprt->timeout->to_retries + 1;
+ unsigned int opt_on = 1;
+
+ /* TCP Keepalive options */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&opt_on, sizeof(opt_on));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ xs_save_old_callbacks(transport, sk);
+
+ sk->sk_user_data = xprt;
+ sk->sk_data_ready = xs_tcp_data_ready;
+ sk->sk_state_change = xs_tcp_state_change;
+ sk->sk_write_space = xs_tcp_write_space;
+ sk->sk_error_report = xs_error_report;
+ sk->sk_allocation = GFP_ATOMIC;
+
+ /* socket options */
+ sock_reset_flag(sk, SOCK_LINGER);
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+ xprt_clear_connected(xprt);
+
+ /* Reset to new socket */
+ transport->sock = sock;
+ transport->inet = sk;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+
+ if (!xprt_bound(xprt))
+ goto out;
+
+ xs_set_memalloc(xprt);
+
+ /* Tell the socket layer to start connecting... */
+ xprt->stat.connect_count++;
+ xprt->stat.connect_start = jiffies;
+ ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+ switch (ret) {
+ case 0:
+ xs_set_srcport(transport, sock);
+ case -EINPROGRESS:
+ /* SYN_SENT! */
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ }
+out:
+ return ret;
+}
+
+/**
+ * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_setup_socket(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, connect_worker.work);
+ struct socket *sock = transport->sock;
+ struct rpc_xprt *xprt = &transport->xprt;
+ int status = -EIO;
+
+ if (!sock) {
+ sock = xs_create_sock(xprt, transport,
+ xs_addr(xprt)->sa_family, SOCK_STREAM,
+ IPPROTO_TCP, true);
+ if (IS_ERR(sock)) {
+ status = PTR_ERR(sock);
+ goto out;
+ }
+ }
+
+ dprintk("RPC: worker connecting xprt %p via %s to "
+ "%s (port %s)\n", xprt,
+ xprt->address_strings[RPC_DISPLAY_PROTO],
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
+
+ status = xs_tcp_finish_connecting(xprt, sock);
+ trace_rpc_socket_connect(xprt, sock, status);
+ dprintk("RPC: %p connect status %d connected %d sock state %d\n",
+ xprt, -status, xprt_connected(xprt),
+ sock->sk->sk_state);
+ switch (status) {
+ default:
+ printk("%s: connect returned unhandled error %d\n",
+ __func__, status);
+ case -EADDRNOTAVAIL:
+ /* We're probably in TIME_WAIT. Get rid of existing socket,
+ * and retry
+ */
+ xs_tcp_force_close(xprt);
+ break;
+ case 0:
+ case -EINPROGRESS:
+ case -EALREADY:
+ xprt_unlock_connect(xprt, transport);
+ xprt_clear_connecting(xprt);
+ return;
+ case -EINVAL:
+ /* Happens, for instance, if the user specified a link
+ * local IPv6 address without a scope-id.
+ */
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ /* retry with existing socket, after a delay */
+ xs_tcp_force_close(xprt);
+ goto out;
+ }
+ status = -EAGAIN;
+out:
+ xprt_unlock_connect(xprt, transport);
+ xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
+}
+
+/**
+ * xs_connect - connect a socket to a remote endpoint
+ * @xprt: pointer to transport structure
+ * @task: address of RPC task that manages state of connect request
+ *
+ * TCP: If the remote end dropped the connection, delay reconnecting.
+ *
+ * UDP socket connects are synchronous, but we use a work queue anyway
+ * to guarantee that even unprivileged user processes can set up a
+ * socket on a privileged port.
+ *
+ * If a UDP socket connect fails, the delay behavior here prevents
+ * retry floods (hard mounts).
+ */
+static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
+
+ /* Start by resetting any existing state */
+ xs_reset_transport(transport);
+
+ if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
+ dprintk("RPC: xs_connect delayed xprt %p for %lu "
+ "seconds\n",
+ xprt, xprt->reestablish_timeout / HZ);
+ queue_delayed_work(rpciod_workqueue,
+ &transport->connect_worker,
+ xprt->reestablish_timeout);
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
+ xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
+ } else {
+ dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
+ queue_delayed_work(rpciod_workqueue,
+ &transport->connect_worker, 0);
+ }
+}
+
+/**
+ * xs_local_print_stats - display AF_LOCAL socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+ long idle_time = 0;
+
+ if (xprt_connected(xprt))
+ idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+ seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu "
+ "%llu %llu %lu %llu %llu\n",
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u,
+ xprt->stat.max_slots,
+ xprt->stat.sending_u,
+ xprt->stat.pending_u);
+}
+
+/**
+ * xs_udp_print_stats - display UDP socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %llu %llu "
+ "%lu %llu %llu\n",
+ transport->srcport,
+ xprt->stat.bind_count,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u,
+ xprt->stat.max_slots,
+ xprt->stat.sending_u,
+ xprt->stat.pending_u);
+}
+
+/**
+ * xs_tcp_print_stats - display TCP socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ long idle_time = 0;
+
+ if (xprt_connected(xprt))
+ idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+ seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu "
+ "%llu %llu %lu %llu %llu\n",
+ transport->srcport,
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u,
+ xprt->stat.max_slots,
+ xprt->stat.sending_u,
+ xprt->stat.pending_u);
+}
+
+/*
+ * Allocate a bunch of pages for a scratch buffer for the rpc code. The reason
+ * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
+ * to use the server side send routines.
+ */
+static void *bc_malloc(struct rpc_task *task, size_t size)
+{
+ struct page *page;
+ struct rpc_buffer *buf;
+
+ WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
+ if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
+ return NULL;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return NULL;
+
+ buf = page_address(page);
+ buf->len = PAGE_SIZE;
+
+ return buf->data;
+}
+
+/*
+ * Free the space allocated in the bc_alloc routine
+ */
+static void bc_free(void *buffer)
+{
+ struct rpc_buffer *buf;
+
+ if (!buffer)
+ return;
+
+ buf = container_of(buffer, struct rpc_buffer, data);
+ free_page((unsigned long)buf);
+}
+
+/*
+ * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
+ * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
+ */
+static int bc_sendto(struct rpc_rqst *req)
+{
+ int len;
+ struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct rpc_xprt *xprt = req->rq_xprt;
+ struct sock_xprt *transport =
+ container_of(xprt, struct sock_xprt, xprt);
+ struct socket *sock = transport->sock;
+ unsigned long headoff;
+ unsigned long tailoff;
+
+ xs_encode_stream_record_marker(xbufp);
+
+ tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
+ headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
+ len = svc_send_common(sock, xbufp,
+ virt_to_page(xbufp->head[0].iov_base), headoff,
+ xbufp->tail[0].iov_base, tailoff);
+
+ if (len != xbufp->len) {
+ printk(KERN_NOTICE "Error sending entire callback!\n");
+ len = -EAGAIN;
+ }
+
+ return len;
+}
+
+/*
+ * The send routine. Borrows from svc_send
+ */
+static int bc_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct svc_xprt *xprt;
+ u32 len;
+
+ dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
+ /*
+ * Get the server socket associated with this callback xprt
+ */
+ xprt = req->rq_xprt->bc_xprt;
+
+ /*
+ * Grab the mutex to serialize data as the connection is shared
+ * with the fore channel
+ */
+ if (!mutex_trylock(&xprt->xpt_mutex)) {
+ rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
+ if (!mutex_trylock(&xprt->xpt_mutex))
+ return -EAGAIN;
+ rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
+ }
+ if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+ len = -ENOTCONN;
+ else
+ len = bc_sendto(req);
+ mutex_unlock(&xprt->xpt_mutex);
+
+ if (len > 0)
+ len = 0;
+
+ return len;
+}
+
+/*
+ * The close routine. Since this is client initiated, we do nothing
+ */
+
+static void bc_close(struct rpc_xprt *xprt)
+{
+}
+
+/*
+ * The xprt destroy routine. Again, because this connection is client
+ * initiated, we do nothing
+ */
+
+static void bc_destroy(struct rpc_xprt *xprt)
+{
+ dprintk("RPC: bc_destroy xprt %p\n", xprt);
+
+ xs_xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
+static struct rpc_xprt_ops xs_local_ops = {
+ .reserve_xprt = xprt_reserve_xprt,
+ .release_xprt = xs_tcp_release_xprt,
+ .alloc_slot = xprt_alloc_slot,
+ .rpcbind = xs_local_rpcbind,
+ .set_port = xs_local_set_port,
+ .connect = xs_local_connect,
+ .buf_alloc = rpc_malloc,
+ .buf_free = rpc_free,
+ .send_request = xs_local_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xs_close,
+ .destroy = xs_destroy,
+ .print_stats = xs_local_print_stats,
+};
+
+static struct rpc_xprt_ops xs_udp_ops = {
+ .set_buffer_size = xs_udp_set_buffer_size,
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .alloc_slot = xprt_alloc_slot,
+ .rpcbind = rpcb_getport_async,
+ .set_port = xs_set_port,
+ .connect = xs_connect,
+ .buf_alloc = rpc_malloc,
+ .buf_free = rpc_free,
+ .send_request = xs_udp_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_rtt,
+ .timer = xs_udp_timer,
+ .release_request = xprt_release_rqst_cong,
+ .close = xs_close,
+ .destroy = xs_destroy,
+ .print_stats = xs_udp_print_stats,
+};
+
+static struct rpc_xprt_ops xs_tcp_ops = {
+ .reserve_xprt = xprt_reserve_xprt,
+ .release_xprt = xs_tcp_release_xprt,
+ .alloc_slot = xprt_lock_and_alloc_slot,
+ .rpcbind = rpcb_getport_async,
+ .set_port = xs_set_port,
+ .connect = xs_connect,
+ .buf_alloc = rpc_malloc,
+ .buf_free = rpc_free,
+ .send_request = xs_tcp_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xs_tcp_shutdown,
+ .destroy = xs_destroy,
+ .print_stats = xs_tcp_print_stats,
+};
+
+/*
+ * The rpc_xprt_ops for the server backchannel
+ */
+
+static struct rpc_xprt_ops bc_tcp_ops = {
+ .reserve_xprt = xprt_reserve_xprt,
+ .release_xprt = xprt_release_xprt,
+ .alloc_slot = xprt_alloc_slot,
+ .buf_alloc = bc_malloc,
+ .buf_free = bc_free,
+ .send_request = bc_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = bc_close,
+ .destroy = bc_destroy,
+ .print_stats = xs_tcp_print_stats,
+};
+
+static int xs_init_anyaddr(const int family, struct sockaddr *sap)
+{
+ static const struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ };
+ static const struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_ANY_INIT,
+ };
+
+ switch (family) {
+ case AF_LOCAL:
+ break;
+ case AF_INET:
+ memcpy(sap, &sin, sizeof(sin));
+ break;
+ case AF_INET6:
+ memcpy(sap, &sin6, sizeof(sin6));
+ break;
+ default:
+ dprintk("RPC: %s: Bad address family\n", __func__);
+ return -EAFNOSUPPORT;
+ }
+ return 0;
+}
+
+static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
+ unsigned int slot_table_size,
+ unsigned int max_slot_table_size)
+{
+ struct rpc_xprt *xprt;
+ struct sock_xprt *new;
+
+ if (args->addrlen > sizeof(xprt->addr)) {
+ dprintk("RPC: xs_setup_xprt: address too large\n");
+ return ERR_PTR(-EBADF);
+ }
+
+ xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size,
+ max_slot_table_size);
+ if (xprt == NULL) {
+ dprintk("RPC: xs_setup_xprt: couldn't allocate "
+ "rpc_xprt\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ new = container_of(xprt, struct sock_xprt, xprt);
+ memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+ xprt->addrlen = args->addrlen;
+ if (args->srcaddr)
+ memcpy(&new->srcaddr, args->srcaddr, args->addrlen);
+ else {
+ int err;
+ err = xs_init_anyaddr(args->dstaddr->sa_family,
+ (struct sockaddr *)&new->srcaddr);
+ if (err != 0) {
+ xprt_free(xprt);
+ return ERR_PTR(err);
+ }
+ }
+
+ return xprt;
+}
+
+static const struct rpc_timeout xs_local_default_timeout = {
+ .to_initval = 10 * HZ,
+ .to_maxval = 10 * HZ,
+ .to_retries = 2,
+};
+
+/**
+ * xs_setup_local - Set up transport to use an AF_LOCAL socket
+ * @args: rpc transport creation arguments
+ *
+ * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP
+ */
+static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
+{
+ struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr;
+ struct sock_xprt *transport;
+ struct rpc_xprt *xprt;
+ struct rpc_xprt *ret;
+
+ xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
+ xprt_max_tcp_slot_table_entries);
+ if (IS_ERR(xprt))
+ return xprt;
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
+ xprt->prot = 0;
+ xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+ xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+ xprt->bind_timeout = XS_BIND_TO;
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+ xprt->ops = &xs_local_ops;
+ xprt->timeout = &xs_local_default_timeout;
+
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_dummy_setup_socket);
+
+ switch (sun->sun_family) {
+ case AF_LOCAL:
+ if (sun->sun_path[0] != '/') {
+ dprintk("RPC: bad AF_LOCAL address: %s\n",
+ sun->sun_path);
+ ret = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+ xprt_set_bound(xprt);
+ xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
+ ret = ERR_PTR(xs_local_setup_socket(transport));
+ if (ret)
+ goto out_err;
+ break;
+ default:
+ ret = ERR_PTR(-EAFNOSUPPORT);
+ goto out_err;
+ }
+
+ dprintk("RPC: set up xprt to %s via AF_LOCAL\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+ if (try_module_get(THIS_MODULE))
+ return xprt;
+ ret = ERR_PTR(-EINVAL);
+out_err:
+ xs_xprt_free(xprt);
+ return ret;
+}
+
+static const struct rpc_timeout xs_udp_default_timeout = {
+ .to_initval = 5 * HZ,
+ .to_maxval = 30 * HZ,
+ .to_increment = 5 * HZ,
+ .to_retries = 5,
+};
+
+/**
+ * xs_setup_udp - Set up transport to use a UDP socket
+ * @args: rpc transport creation arguments
+ *
+ */
+static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
+{
+ struct sockaddr *addr = args->dstaddr;
+ struct rpc_xprt *xprt;
+ struct sock_xprt *transport;
+ struct rpc_xprt *ret;
+
+ xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries,
+ xprt_udp_slot_table_entries);
+ if (IS_ERR(xprt))
+ return xprt;
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
+ xprt->prot = IPPROTO_UDP;
+ xprt->tsh_size = 0;
+ /* XXX: header size can vary due to auth type, IPv6, etc. */
+ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
+
+ xprt->bind_timeout = XS_BIND_TO;
+ xprt->reestablish_timeout = XS_UDP_REEST_TO;
+ xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+ xprt->ops = &xs_udp_ops;
+
+ xprt->timeout = &xs_udp_default_timeout;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+ xprt_set_bound(xprt);
+
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_udp_setup_socket);
+ xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
+ break;
+ case AF_INET6:
+ if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+ xprt_set_bound(xprt);
+
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_udp_setup_socket);
+ xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
+ break;
+ default:
+ ret = ERR_PTR(-EAFNOSUPPORT);
+ goto out_err;
+ }
+
+ if (xprt_bound(xprt))
+ dprintk("RPC: set up xprt to %s (port %s) via %s\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT],
+ xprt->address_strings[RPC_DISPLAY_PROTO]);
+ else
+ dprintk("RPC: set up xprt to %s (autobind) via %s\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+ if (try_module_get(THIS_MODULE))
+ return xprt;
+ ret = ERR_PTR(-EINVAL);
+out_err:
+ xs_xprt_free(xprt);
+ return ret;
+}
+
+static const struct rpc_timeout xs_tcp_default_timeout = {
+ .to_initval = 60 * HZ,
+ .to_maxval = 60 * HZ,
+ .to_retries = 2,
+};
+
+/**
+ * xs_setup_tcp - Set up transport to use a TCP socket
+ * @args: rpc transport creation arguments
+ *
+ */
+static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
+{
+ struct sockaddr *addr = args->dstaddr;
+ struct rpc_xprt *xprt;
+ struct sock_xprt *transport;
+ struct rpc_xprt *ret;
+ unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries;
+
+ if (args->flags & XPRT_CREATE_INFINITE_SLOTS)
+ max_slot_table_size = RPC_MAX_SLOT_TABLE_LIMIT;
+
+ xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
+ max_slot_table_size);
+ if (IS_ERR(xprt))
+ return xprt;
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
+ xprt->prot = IPPROTO_TCP;
+ xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+ xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+ xprt->bind_timeout = XS_BIND_TO;
+ xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+ xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+ xprt->ops = &xs_tcp_ops;
+ xprt->timeout = &xs_tcp_default_timeout;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+ xprt_set_bound(xprt);
+
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_tcp_setup_socket);
+ xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
+ break;
+ case AF_INET6:
+ if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+ xprt_set_bound(xprt);
+
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_tcp_setup_socket);
+ xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
+ break;
+ default:
+ ret = ERR_PTR(-EAFNOSUPPORT);
+ goto out_err;
+ }
+
+ if (xprt_bound(xprt))
+ dprintk("RPC: set up xprt to %s (port %s) via %s\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT],
+ xprt->address_strings[RPC_DISPLAY_PROTO]);
+ else
+ dprintk("RPC: set up xprt to %s (autobind) via %s\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+ if (try_module_get(THIS_MODULE))
+ return xprt;
+ ret = ERR_PTR(-EINVAL);
+out_err:
+ xs_xprt_free(xprt);
+ return ret;
+}
+
+/**
+ * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket
+ * @args: rpc transport creation arguments
+ *
+ */
+static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
+{
+ struct sockaddr *addr = args->dstaddr;
+ struct rpc_xprt *xprt;
+ struct sock_xprt *transport;
+ struct svc_sock *bc_sock;
+ struct rpc_xprt *ret;
+
+ xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
+ xprt_tcp_slot_table_entries);
+ if (IS_ERR(xprt))
+ return xprt;
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
+ xprt->prot = IPPROTO_TCP;
+ xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+ xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+ xprt->timeout = &xs_tcp_default_timeout;
+
+ /* backchannel */
+ xprt_set_bound(xprt);
+ xprt->bind_timeout = 0;
+ xprt->reestablish_timeout = 0;
+ xprt->idle_timeout = 0;
+
+ xprt->ops = &bc_tcp_ops;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ xs_format_peer_addresses(xprt, "tcp",
+ RPCBIND_NETID_TCP);
+ break;
+ case AF_INET6:
+ xs_format_peer_addresses(xprt, "tcp",
+ RPCBIND_NETID_TCP6);
+ break;
+ default:
+ ret = ERR_PTR(-EAFNOSUPPORT);
+ goto out_err;
+ }
+
+ dprintk("RPC: set up xprt to %s (port %s) via %s\n",
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT],
+ xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+ /*
+ * Once we've associated a backchannel xprt with a connection,
+ * we want to keep it around as long as the connection lasts,
+ * in case we need to start using it for a backchannel again;
+ * this reference won't be dropped until bc_xprt is destroyed.
+ */
+ xprt_get(xprt);
+ args->bc_xprt->xpt_bc_xprt = xprt;
+ xprt->bc_xprt = args->bc_xprt;
+ bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt);
+ transport->sock = bc_sock->sk_sock;
+ transport->inet = bc_sock->sk_sk;
+
+ /*
+ * Since we don't want connections for the backchannel, we set
+ * the xprt status to connected
+ */
+ xprt_set_connected(xprt);
+
+ if (try_module_get(THIS_MODULE))
+ return xprt;
+
+ args->bc_xprt->xpt_bc_xprt = NULL;
+ xprt_put(xprt);
+ ret = ERR_PTR(-EINVAL);
+out_err:
+ xs_xprt_free(xprt);
+ return ret;
+}
+
+static struct xprt_class xs_local_transport = {
+ .list = LIST_HEAD_INIT(xs_local_transport.list),
+ .name = "named UNIX socket",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_LOCAL,
+ .setup = xs_setup_local,
+};
+
+static struct xprt_class xs_udp_transport = {
+ .list = LIST_HEAD_INIT(xs_udp_transport.list),
+ .name = "udp",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_UDP,
+ .setup = xs_setup_udp,
+};
+
+static struct xprt_class xs_tcp_transport = {
+ .list = LIST_HEAD_INIT(xs_tcp_transport.list),
+ .name = "tcp",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_TCP,
+ .setup = xs_setup_tcp,
+};
+
+static struct xprt_class xs_bc_tcp_transport = {
+ .list = LIST_HEAD_INIT(xs_bc_tcp_transport.list),
+ .name = "tcp NFSv4.1 backchannel",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_BC_TCP,
+ .setup = xs_setup_bc_tcp,
+};
+
+/**
+ * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
+ *
+ */
+int init_socket_xprt(void)
+{
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ if (!sunrpc_table_header)
+ sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+
+ xprt_register_transport(&xs_local_transport);
+ xprt_register_transport(&xs_udp_transport);
+ xprt_register_transport(&xs_tcp_transport);
+ xprt_register_transport(&xs_bc_tcp_transport);
+
+ return 0;
+}
+
+/**
+ * cleanup_socket_xprt - remove xprtsock's sysctls, unregister
+ *
+ */
+void cleanup_socket_xprt(void)
+{
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ if (sunrpc_table_header) {
+ unregister_sysctl_table(sunrpc_table_header);
+ sunrpc_table_header = NULL;
+ }
+#endif
+
+ xprt_unregister_transport(&xs_local_transport);
+ xprt_unregister_transport(&xs_udp_transport);
+ xprt_unregister_transport(&xs_tcp_transport);
+ xprt_unregister_transport(&xs_bc_tcp_transport);
+}
+
+static int param_set_uint_minmax(const char *val,
+ const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned int num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtouint(val, 0, &num);
+ if (ret == -EINVAL || num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp,
+ RPC_MIN_RESVPORT,
+ RPC_MAX_RESVPORT);
+}
+
+static struct kernel_param_ops param_ops_portnr = {
+ .set = param_set_portnr,
+ .get = param_get_uint,
+};
+
+#define param_check_portnr(name, p) \
+ __param_check(name, p, unsigned int);
+
+module_param_named(min_resvport, xprt_min_resvport, portnr, 0644);
+module_param_named(max_resvport, xprt_max_resvport, portnr, 0644);
+
+static int param_set_slot_table_size(const char *val,
+ const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp,
+ RPC_MIN_SLOT_TABLE,
+ RPC_MAX_SLOT_TABLE);
+}
+
+static struct kernel_param_ops param_ops_slot_table_size = {
+ .set = param_set_slot_table_size,
+ .get = param_get_uint,
+};
+
+#define param_check_slot_table_size(name, p) \
+ __param_check(name, p, unsigned int);
+
+static int param_set_max_slot_table_size(const char *val,
+ const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp,
+ RPC_MIN_SLOT_TABLE,
+ RPC_MAX_SLOT_TABLE_LIMIT);
+}
+
+static struct kernel_param_ops param_ops_max_slot_table_size = {
+ .set = param_set_max_slot_table_size,
+ .get = param_get_uint,
+};
+
+#define param_check_max_slot_table_size(name, p) \
+ __param_check(name, p, unsigned int);
+
+module_param_named(tcp_slot_table_entries, xprt_tcp_slot_table_entries,
+ slot_table_size, 0644);
+module_param_named(tcp_max_slot_table_entries, xprt_max_tcp_slot_table_entries,
+ max_slot_table_size, 0644);
+module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries,
+ slot_table_size, 0644);
+
diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig
new file mode 100644
index 000000000..86a47e17c
--- /dev/null
+++ b/net/switchdev/Kconfig
@@ -0,0 +1,13 @@
+#
+# Configuration for Switch device support
+#
+
+config NET_SWITCHDEV
+ bool "Switch (and switch-ish) device support (EXPERIMENTAL)"
+ depends on INET
+ ---help---
+ This module provides glue between core networking code and device
+ drivers in order to support hardware switch chips in very generic
+ meaning of the word "switch". This include devices supporting L2/L3 but
+ also various flow offloading chips, including switches embedded into
+ SR-IOV NICs.
diff --git a/net/switchdev/Makefile b/net/switchdev/Makefile
new file mode 100644
index 000000000..5ed63ed32
--- /dev/null
+++ b/net/switchdev/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Switch device API
+#
+
+obj-$(CONFIG_NET_SWITCHDEV) += switchdev.o
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
new file mode 100644
index 000000000..055453d48
--- /dev/null
+++ b/net/switchdev/switchdev.c
@@ -0,0 +1,404 @@
+/*
+ * net/switchdev/switchdev.c - Switch device API
+ * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us>
+ * Copyright (c) 2014-2015 Scott Feldman <sfeldma@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <net/ip_fib.h>
+#include <net/switchdev.h>
+
+/**
+ * netdev_switch_parent_id_get - Get ID of a switch
+ * @dev: port device
+ * @psid: switch ID
+ *
+ * Get ID of a switch this port is part of.
+ */
+int netdev_switch_parent_id_get(struct net_device *dev,
+ struct netdev_phys_item_id *psid)
+{
+ const struct swdev_ops *ops = dev->swdev_ops;
+
+ if (!ops || !ops->swdev_parent_id_get)
+ return -EOPNOTSUPP;
+ return ops->swdev_parent_id_get(dev, psid);
+}
+EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get);
+
+/**
+ * netdev_switch_port_stp_update - Notify switch device port of STP
+ * state change
+ * @dev: port device
+ * @state: port STP state
+ *
+ * Notify switch device port of bridge port STP state change.
+ */
+int netdev_switch_port_stp_update(struct net_device *dev, u8 state)
+{
+ const struct swdev_ops *ops = dev->swdev_ops;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err = -EOPNOTSUPP;
+
+ if (ops && ops->swdev_port_stp_update)
+ return ops->swdev_port_stp_update(dev, state);
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = netdev_switch_port_stp_update(lower_dev, state);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update);
+
+static DEFINE_MUTEX(netdev_switch_mutex);
+static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain);
+
+/**
+ * register_netdev_switch_notifier - Register notifier
+ * @nb: notifier_block
+ *
+ * Register switch device notifier. This should be used by code
+ * which needs to monitor events happening in particular device.
+ * Return values are same as for atomic_notifier_chain_register().
+ */
+int register_netdev_switch_notifier(struct notifier_block *nb)
+{
+ int err;
+
+ mutex_lock(&netdev_switch_mutex);
+ err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb);
+ mutex_unlock(&netdev_switch_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_netdev_switch_notifier);
+
+/**
+ * unregister_netdev_switch_notifier - Unregister notifier
+ * @nb: notifier_block
+ *
+ * Unregister switch device notifier.
+ * Return values are same as for atomic_notifier_chain_unregister().
+ */
+int unregister_netdev_switch_notifier(struct notifier_block *nb)
+{
+ int err;
+
+ mutex_lock(&netdev_switch_mutex);
+ err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb);
+ mutex_unlock(&netdev_switch_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier);
+
+/**
+ * call_netdev_switch_notifiers - Call notifiers
+ * @val: value passed unmodified to notifier function
+ * @dev: port device
+ * @info: notifier information data
+ *
+ * Call all network notifier blocks. This should be called by driver
+ * when it needs to propagate hardware event.
+ * Return values are same as for atomic_notifier_call_chain().
+ */
+int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev,
+ struct netdev_switch_notifier_info *info)
+{
+ int err;
+
+ info->dev = dev;
+ mutex_lock(&netdev_switch_mutex);
+ err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info);
+ mutex_unlock(&netdev_switch_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers);
+
+/**
+ * netdev_switch_port_bridge_setlink - Notify switch device port of bridge
+ * port attributes
+ *
+ * @dev: port device
+ * @nlh: netlink msg with bridge port attributes
+ * @flags: bridge setlink flags
+ *
+ * Notify switch device port of bridge port attributes
+ */
+int netdev_switch_port_bridge_setlink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
+ return 0;
+
+ if (!ops->ndo_bridge_setlink)
+ return -EOPNOTSUPP;
+
+ return ops->ndo_bridge_setlink(dev, nlh, flags);
+}
+EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink);
+
+/**
+ * netdev_switch_port_bridge_dellink - Notify switch device port of bridge
+ * port attribute delete
+ *
+ * @dev: port device
+ * @nlh: netlink msg with bridge port attributes
+ * @flags: bridge setlink flags
+ *
+ * Notify switch device port of bridge port attribute delete
+ */
+int netdev_switch_port_bridge_dellink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
+ return 0;
+
+ if (!ops->ndo_bridge_dellink)
+ return -EOPNOTSUPP;
+
+ return ops->ndo_bridge_dellink(dev, nlh, flags);
+}
+EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink);
+
+/**
+ * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink
+ * op for master devices
+ *
+ * @dev: port device
+ * @nlh: netlink msg with bridge port attributes
+ * @flags: bridge setlink flags
+ *
+ * Notify master device slaves of bridge port attributes
+ */
+int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int ret = 0, err = 0;
+
+ if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
+ return ret;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags);
+ if (err && err != -EOPNOTSUPP)
+ ret = err;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink);
+
+/**
+ * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink
+ * op for master devices
+ *
+ * @dev: port device
+ * @nlh: netlink msg with bridge port attributes
+ * @flags: bridge dellink flags
+ *
+ * Notify master device slaves of bridge port attribute deletes
+ */
+int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
+ struct nlmsghdr *nlh, u16 flags)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int ret = 0, err = 0;
+
+ if (!(dev->features & NETIF_F_HW_SWITCH_OFFLOAD))
+ return ret;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags);
+ if (err && err != -EOPNOTSUPP)
+ ret = err;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink);
+
+static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
+{
+ const struct swdev_ops *ops = dev->swdev_ops;
+ struct net_device *lower_dev;
+ struct net_device *port_dev;
+ struct list_head *iter;
+
+ /* Recusively search down until we find a sw port dev.
+ * (A sw port dev supports swdev_parent_id_get).
+ */
+
+ if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD &&
+ ops && ops->swdev_parent_id_get)
+ return dev;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ port_dev = netdev_switch_get_lowest_dev(lower_dev);
+ if (port_dev)
+ return port_dev;
+ }
+
+ return NULL;
+}
+
+static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
+{
+ struct netdev_phys_item_id psid;
+ struct netdev_phys_item_id prev_psid;
+ struct net_device *dev = NULL;
+ int nhsel;
+
+ /* For this route, all nexthop devs must be on the same switch. */
+
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (!nh->nh_dev)
+ return NULL;
+
+ dev = netdev_switch_get_lowest_dev(nh->nh_dev);
+ if (!dev)
+ return NULL;
+
+ if (netdev_switch_parent_id_get(dev, &psid))
+ return NULL;
+
+ if (nhsel > 0) {
+ if (prev_psid.id_len != psid.id_len)
+ return NULL;
+ if (memcmp(prev_psid.id, psid.id, psid.id_len))
+ return NULL;
+ }
+
+ prev_psid = psid;
+ }
+
+ return dev;
+}
+
+/**
+ * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch
+ *
+ * @dst: route's IPv4 destination address
+ * @dst_len: destination address length (prefix length)
+ * @fi: route FIB info structure
+ * @tos: route TOS
+ * @type: route type
+ * @nlflags: netlink flags passed in (NLM_F_*)
+ * @tb_id: route table ID
+ *
+ * Add IPv4 route entry to switch device.
+ */
+int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 nlflags, u32 tb_id)
+{
+ struct net_device *dev;
+ const struct swdev_ops *ops;
+ int err = 0;
+
+ /* Don't offload route if using custom ip rules or if
+ * IPv4 FIB offloading has been disabled completely.
+ */
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ if (fi->fib_net->ipv4.fib_has_custom_rules)
+ return 0;
+#endif
+
+ if (fi->fib_net->ipv4.fib_offload_disabled)
+ return 0;
+
+ dev = netdev_switch_get_dev_by_nhs(fi);
+ if (!dev)
+ return 0;
+ ops = dev->swdev_ops;
+
+ if (ops->swdev_fib_ipv4_add) {
+ err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len,
+ fi, tos, type, nlflags,
+ tb_id);
+ if (!err)
+ fi->fib_flags |= RTNH_F_OFFLOAD;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add);
+
+/**
+ * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch
+ *
+ * @dst: route's IPv4 destination address
+ * @dst_len: destination address length (prefix length)
+ * @fi: route FIB info structure
+ * @tos: route TOS
+ * @type: route type
+ * @tb_id: route table ID
+ *
+ * Delete IPv4 route entry from switch device.
+ */
+int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id)
+{
+ struct net_device *dev;
+ const struct swdev_ops *ops;
+ int err = 0;
+
+ if (!(fi->fib_flags & RTNH_F_OFFLOAD))
+ return 0;
+
+ dev = netdev_switch_get_dev_by_nhs(fi);
+ if (!dev)
+ return 0;
+ ops = dev->swdev_ops;
+
+ if (ops->swdev_fib_ipv4_del) {
+ err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
+ fi, tos, type, tb_id);
+ if (!err)
+ fi->fib_flags &= ~RTNH_F_OFFLOAD;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del);
+
+/**
+ * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation
+ *
+ * @fi: route FIB info structure
+ */
+void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
+{
+ /* There was a problem installing this route to the offload
+ * device. For now, until we come up with more refined
+ * policy handling, abruptly end IPv4 fib offloading for
+ * for entire net by flushing offload device(s) of all
+ * IPv4 routes, and mark IPv4 fib offloading broken from
+ * this point forward.
+ */
+
+ fib_flush_external(fi->fib_net);
+ fi->fib_net->ipv4.fib_offload_disabled = true;
+}
+EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort);
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
new file mode 100644
index 000000000..e7000be32
--- /dev/null
+++ b/net/sysctl_net.c
@@ -0,0 +1,114 @@
+/* -*- linux-c -*-
+ * sysctl_net.c: sysctl interface to net subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net directories for each protocol family. [MS]
+ *
+ * Revision 1.2 1996/05/08 20:24:40 shaver
+ * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and
+ * NET_IPV4_IP_FORWARD.
+ *
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/sysctl.h>
+#include <linux/nsproxy.h>
+
+#include <net/sock.h>
+
+#ifdef CONFIG_INET
+#include <net/ip.h>
+#endif
+
+#ifdef CONFIG_NET
+#include <linux/if_ether.h>
+#endif
+
+static struct ctl_table_set *
+net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ return &namespaces->net_ns->sysctls;
+}
+
+static int is_seen(struct ctl_table_set *set)
+{
+ return &current->nsproxy->net_ns->sysctls == set;
+}
+
+/* Return standard mode bits for table entry. */
+static int net_ctl_permissions(struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ struct net *net = container_of(head->set, struct net, sysctls);
+ kuid_t root_uid = make_kuid(net->user_ns, 0);
+ kgid_t root_gid = make_kgid(net->user_ns, 0);
+
+ /* Allow network administrator to have same access as root. */
+ if (ns_capable(net->user_ns, CAP_NET_ADMIN) ||
+ uid_eq(root_uid, current_euid())) {
+ int mode = (table->mode >> 6) & 7;
+ return (mode << 6) | (mode << 3) | mode;
+ }
+ /* Allow netns root group to have the same access as the root group */
+ if (in_egroup_p(root_gid)) {
+ int mode = (table->mode >> 3) & 7;
+ return (mode << 3) | mode;
+ }
+ return table->mode;
+}
+
+static struct ctl_table_root net_sysctl_root = {
+ .lookup = net_ctl_header_lookup,
+ .permissions = net_ctl_permissions,
+};
+
+static int __net_init sysctl_net_init(struct net *net)
+{
+ setup_sysctl_set(&net->sysctls, &net_sysctl_root, is_seen);
+ return 0;
+}
+
+static void __net_exit sysctl_net_exit(struct net *net)
+{
+ retire_sysctl_set(&net->sysctls);
+}
+
+static struct pernet_operations sysctl_pernet_ops = {
+ .init = sysctl_net_init,
+ .exit = sysctl_net_exit,
+};
+
+static struct ctl_table_header *net_header;
+__init int net_sysctl_init(void)
+{
+ static struct ctl_table empty[1];
+ int ret = -ENOMEM;
+ /* Avoid limitations in the sysctl implementation by
+ * registering "/proc/sys/net" as an empty directory not in a
+ * network namespace.
+ */
+ net_header = register_sysctl("net", empty);
+ if (!net_header)
+ goto out;
+ ret = register_pernet_subsys(&sysctl_pernet_ops);
+ if (ret)
+ goto out;
+ register_sysctl_root(&net_sysctl_root);
+out:
+ return ret;
+}
+
+struct ctl_table_header *register_net_sysctl(struct net *net,
+ const char *path, struct ctl_table *table)
+{
+ return __register_sysctl_table(&net->sysctls, path, table);
+}
+EXPORT_SYMBOL_GPL(register_net_sysctl);
+
+void unregister_net_sysctl_table(struct ctl_table_header *header)
+{
+ unregister_sysctl_table(header);
+}
+EXPORT_SYMBOL_GPL(unregister_net_sysctl_table);
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
new file mode 100644
index 000000000..c25a3a149
--- /dev/null
+++ b/net/tipc/Kconfig
@@ -0,0 +1,36 @@
+#
+# TIPC configuration
+#
+
+menuconfig TIPC
+ tristate "The TIPC Protocol"
+ depends on INET
+ ---help---
+ The Transparent Inter Process Communication (TIPC) protocol is
+ specially designed for intra cluster communication. This protocol
+ originates from Ericsson where it has been used in carrier grade
+ cluster applications for many years.
+
+ For more information about TIPC, see http://tipc.sourceforge.net.
+
+ This protocol support is also available as a module ( = code which
+ can be inserted in and removed from the running kernel whenever you
+ want). The module will be called tipc. If you want to compile it
+ as a module, say M here and read <file:Documentation/kbuild/modules.txt>.
+
+ If in doubt, say N.
+
+config TIPC_MEDIA_IB
+ bool "InfiniBand media type support"
+ depends on TIPC && INFINIBAND_IPOIB
+ help
+ Saying Y here will enable support for running TIPC on
+ IP-over-InfiniBand devices.
+config TIPC_MEDIA_UDP
+ bool "IP/UDP media type support"
+ depends on TIPC
+ select NET_UDP_TUNNEL
+ help
+ Saying Y here will enable support for running TIPC over IP/UDP
+ bool
+ default y
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
new file mode 100644
index 000000000..57e460be4
--- /dev/null
+++ b/net/tipc/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux TIPC layer
+#
+
+obj-$(CONFIG_TIPC) := tipc.o
+
+tipc-y += addr.o bcast.o bearer.o \
+ core.o link.o discover.o msg.o \
+ name_distr.o subscr.o name_table.o net.o \
+ netlink.o netlink_compat.o node.o socket.o eth_media.o \
+ server.o socket.o
+
+tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o
+tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o
+tipc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
new file mode 100644
index 000000000..ba7daa864
--- /dev/null
+++ b/net/tipc/addr.c
@@ -0,0 +1,153 @@
+/*
+ * net/tipc/addr.c: TIPC address utility routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include "addr.h"
+#include "core.h"
+
+u32 tipc_own_addr(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return tn->own_addr;
+}
+
+/**
+ * in_own_cluster - test for cluster inclusion; <0.0.0> always matches
+ */
+int in_own_cluster(struct net *net, u32 addr)
+{
+ return in_own_cluster_exact(net, addr) || !addr;
+}
+
+int in_own_cluster_exact(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return !((addr ^ tn->own_addr) >> 12);
+}
+
+/**
+ * in_own_node - test for node inclusion; <0.0.0> always matches
+ */
+int in_own_node(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return (addr == tn->own_addr) || !addr;
+}
+
+/**
+ * addr_domain - convert 2-bit scope value to equivalent message lookup domain
+ *
+ * Needed when address of a named message must be looked up a second time
+ * after a network hop.
+ */
+u32 addr_domain(struct net *net, u32 sc)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ if (likely(sc == TIPC_NODE_SCOPE))
+ return tn->own_addr;
+ if (sc == TIPC_CLUSTER_SCOPE)
+ return tipc_cluster_mask(tn->own_addr);
+ return tipc_zone_mask(tn->own_addr);
+}
+
+/**
+ * tipc_addr_domain_valid - validates a network domain address
+ *
+ * Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>,
+ * where Z, C, and N are non-zero.
+ *
+ * Returns 1 if domain address is valid, otherwise 0
+ */
+int tipc_addr_domain_valid(u32 addr)
+{
+ u32 n = tipc_node(addr);
+ u32 c = tipc_cluster(addr);
+ u32 z = tipc_zone(addr);
+
+ if (n && (!z || !c))
+ return 0;
+ if (c && !z)
+ return 0;
+ return 1;
+}
+
+/**
+ * tipc_addr_node_valid - validates a proposed network address for this node
+ *
+ * Accepts <Z.C.N>, where Z, C, and N are non-zero.
+ *
+ * Returns 1 if address can be used, otherwise 0
+ */
+int tipc_addr_node_valid(u32 addr)
+{
+ return tipc_addr_domain_valid(addr) && tipc_node(addr);
+}
+
+int tipc_in_scope(u32 domain, u32 addr)
+{
+ if (!domain || (domain == addr))
+ return 1;
+ if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */
+ return 1;
+ if (domain == tipc_zone_mask(addr)) /* domain <Z.0.0> */
+ return 1;
+ return 0;
+}
+
+/**
+ * tipc_addr_scope - convert message lookup domain to a 2-bit scope value
+ */
+int tipc_addr_scope(u32 domain)
+{
+ if (likely(!domain))
+ return TIPC_ZONE_SCOPE;
+ if (tipc_node(domain))
+ return TIPC_NODE_SCOPE;
+ if (tipc_cluster(domain))
+ return TIPC_CLUSTER_SCOPE;
+ return TIPC_ZONE_SCOPE;
+}
+
+char *tipc_addr_string_fill(char *string, u32 addr)
+{
+ snprintf(string, 16, "<%u.%u.%u>",
+ tipc_zone(addr), tipc_cluster(addr), tipc_node(addr));
+ return string;
+}
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
new file mode 100644
index 000000000..7ba6d5c8a
--- /dev/null
+++ b/net/tipc/addr.h
@@ -0,0 +1,68 @@
+/*
+ * net/tipc/addr.h: Include file for TIPC address utility routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_ADDR_H
+#define _TIPC_ADDR_H
+
+#include <linux/types.h>
+#include <linux/tipc.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define TIPC_ZONE_MASK 0xff000000u
+#define TIPC_CLUSTER_MASK 0xfffff000u
+
+static inline u32 tipc_zone_mask(u32 addr)
+{
+ return addr & TIPC_ZONE_MASK;
+}
+
+static inline u32 tipc_cluster_mask(u32 addr)
+{
+ return addr & TIPC_CLUSTER_MASK;
+}
+
+u32 tipc_own_addr(struct net *net);
+int in_own_cluster(struct net *net, u32 addr);
+int in_own_cluster_exact(struct net *net, u32 addr);
+int in_own_node(struct net *net, u32 addr);
+u32 addr_domain(struct net *net, u32 sc);
+int tipc_addr_domain_valid(u32);
+int tipc_addr_node_valid(u32 addr);
+int tipc_in_scope(u32 domain, u32 addr);
+int tipc_addr_scope(u32 domain);
+char *tipc_addr_string_fill(char *string, u32 addr);
+#endif
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
new file mode 100644
index 000000000..c5cbdcb1f
--- /dev/null
+++ b/net/tipc/bcast.c
@@ -0,0 +1,986 @@
+/*
+ * net/tipc/bcast.c: TIPC broadcast code
+ *
+ * Copyright (c) 2004-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004, Intel Corporation.
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "socket.h"
+#include "msg.h"
+#include "bcast.h"
+#include "name_distr.h"
+#include "core.h"
+
+#define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */
+#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */
+
+const char tipc_bclink_name[] = "broadcast-link";
+
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+ struct tipc_node_map *nm_b,
+ struct tipc_node_map *nm_diff);
+static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node);
+static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node);
+
+static void tipc_bclink_lock(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ spin_lock_bh(&tn->bclink->lock);
+}
+
+static void tipc_bclink_unlock(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ spin_unlock_bh(&tn->bclink->lock);
+}
+
+void tipc_bclink_input(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ tipc_sk_mcast_rcv(net, &tn->bclink->arrvq, &tn->bclink->inputq);
+}
+
+uint tipc_bclink_get_mtu(void)
+{
+ return MAX_PKT_DEFAULT_MCAST;
+}
+
+static u32 bcbuf_acks(struct sk_buff *buf)
+{
+ return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle;
+}
+
+static void bcbuf_set_acks(struct sk_buff *buf, u32 acks)
+{
+ TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks;
+}
+
+static void bcbuf_decr_acks(struct sk_buff *buf)
+{
+ bcbuf_set_acks(buf, bcbuf_acks(buf) - 1);
+}
+
+void tipc_bclink_add_node(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ tipc_bclink_lock(net);
+ tipc_nmap_add(&tn->bclink->bcast_nodes, addr);
+ tipc_bclink_unlock(net);
+}
+
+void tipc_bclink_remove_node(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ tipc_bclink_lock(net);
+ tipc_nmap_remove(&tn->bclink->bcast_nodes, addr);
+ tipc_bclink_unlock(net);
+}
+
+static void bclink_set_last_sent(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+ struct sk_buff *skb = skb_peek(&bcl->backlogq);
+
+ if (skb)
+ bcl->fsm_msg_cnt = mod(buf_seqno(skb) - 1);
+ else
+ bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1);
+}
+
+u32 tipc_bclink_get_last_sent(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return tn->bcl->fsm_msg_cnt;
+}
+
+static void bclink_update_last_sent(struct tipc_node *node, u32 seqno)
+{
+ node->bclink.last_sent = less_eq(node->bclink.last_sent, seqno) ?
+ seqno : node->bclink.last_sent;
+}
+
+/**
+ * tipc_bclink_retransmit_to - get most recent node to request retransmission
+ *
+ * Called with bclink_lock locked
+ */
+struct tipc_node *tipc_bclink_retransmit_to(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return tn->bclink->retransmit_to;
+}
+
+/**
+ * bclink_retransmit_pkt - retransmit broadcast packets
+ * @after: sequence number of last packet to *not* retransmit
+ * @to: sequence number of last packet to retransmit
+ *
+ * Called with bclink_lock locked
+ */
+static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
+{
+ struct sk_buff *skb;
+ struct tipc_link *bcl = tn->bcl;
+
+ skb_queue_walk(&bcl->transmq, skb) {
+ if (more(buf_seqno(skb), after)) {
+ tipc_link_retransmit(bcl, skb, mod(to - after));
+ break;
+ }
+ }
+}
+
+/**
+ * tipc_bclink_wakeup_users - wake up pending users
+ *
+ * Called with no locks taken
+ */
+void tipc_bclink_wakeup_users(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ tipc_sk_rcv(net, &tn->bclink->link.wakeupq);
+}
+
+/**
+ * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets
+ * @n_ptr: node that sent acknowledgement info
+ * @acked: broadcast sequence # that has been acknowledged
+ *
+ * Node is locked, bclink_lock unlocked.
+ */
+void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
+{
+ struct sk_buff *skb, *tmp;
+ unsigned int released = 0;
+ struct net *net = n_ptr->net;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ if (unlikely(!n_ptr->bclink.recv_permitted))
+ return;
+
+ tipc_bclink_lock(net);
+
+ /* Bail out if tx queue is empty (no clean up is required) */
+ skb = skb_peek(&tn->bcl->transmq);
+ if (!skb)
+ goto exit;
+
+ /* Determine which messages need to be acknowledged */
+ if (acked == INVALID_LINK_SEQ) {
+ /*
+ * Contact with specified node has been lost, so need to
+ * acknowledge sent messages only (if other nodes still exist)
+ * or both sent and unsent messages (otherwise)
+ */
+ if (tn->bclink->bcast_nodes.count)
+ acked = tn->bcl->fsm_msg_cnt;
+ else
+ acked = tn->bcl->next_out_no;
+ } else {
+ /*
+ * Bail out if specified sequence number does not correspond
+ * to a message that has been sent and not yet acknowledged
+ */
+ if (less(acked, buf_seqno(skb)) ||
+ less(tn->bcl->fsm_msg_cnt, acked) ||
+ less_eq(acked, n_ptr->bclink.acked))
+ goto exit;
+ }
+
+ /* Skip over packets that node has previously acknowledged */
+ skb_queue_walk(&tn->bcl->transmq, skb) {
+ if (more(buf_seqno(skb), n_ptr->bclink.acked))
+ break;
+ }
+
+ /* Update packets that node is now acknowledging */
+ skb_queue_walk_from_safe(&tn->bcl->transmq, skb, tmp) {
+ if (more(buf_seqno(skb), acked))
+ break;
+ bcbuf_decr_acks(skb);
+ bclink_set_last_sent(net);
+ if (bcbuf_acks(skb) == 0) {
+ __skb_unlink(skb, &tn->bcl->transmq);
+ kfree_skb(skb);
+ released = 1;
+ }
+ }
+ n_ptr->bclink.acked = acked;
+
+ /* Try resolving broadcast link congestion, if necessary */
+ if (unlikely(skb_peek(&tn->bcl->backlogq))) {
+ tipc_link_push_packets(tn->bcl);
+ bclink_set_last_sent(net);
+ }
+ if (unlikely(released && !skb_queue_empty(&tn->bcl->wakeupq)))
+ n_ptr->action_flags |= TIPC_WAKEUP_BCAST_USERS;
+exit:
+ tipc_bclink_unlock(net);
+}
+
+/**
+ * tipc_bclink_update_link_state - update broadcast link state
+ *
+ * RCU and node lock set
+ */
+void tipc_bclink_update_link_state(struct tipc_node *n_ptr,
+ u32 last_sent)
+{
+ struct sk_buff *buf;
+ struct net *net = n_ptr->net;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ /* Ignore "stale" link state info */
+ if (less_eq(last_sent, n_ptr->bclink.last_in))
+ return;
+
+ /* Update link synchronization state; quit if in sync */
+ bclink_update_last_sent(n_ptr, last_sent);
+
+ if (n_ptr->bclink.last_sent == n_ptr->bclink.last_in)
+ return;
+
+ /* Update out-of-sync state; quit if loss is still unconfirmed */
+ if ((++n_ptr->bclink.oos_state) == 1) {
+ if (n_ptr->bclink.deferred_size < (TIPC_MIN_LINK_WIN / 2))
+ return;
+ n_ptr->bclink.oos_state++;
+ }
+
+ /* Don't NACK if one has been recently sent (or seen) */
+ if (n_ptr->bclink.oos_state & 0x1)
+ return;
+
+ /* Send NACK */
+ buf = tipc_buf_acquire(INT_H_SIZE);
+ if (buf) {
+ struct tipc_msg *msg = buf_msg(buf);
+ struct sk_buff *skb = skb_peek(&n_ptr->bclink.deferdq);
+ u32 to = skb ? buf_seqno(skb) - 1 : n_ptr->bclink.last_sent;
+
+ tipc_msg_init(tn->own_addr, msg, BCAST_PROTOCOL, STATE_MSG,
+ INT_H_SIZE, n_ptr->addr);
+ msg_set_non_seq(msg, 1);
+ msg_set_mc_netid(msg, tn->net_id);
+ msg_set_bcast_ack(msg, n_ptr->bclink.last_in);
+ msg_set_bcgap_after(msg, n_ptr->bclink.last_in);
+ msg_set_bcgap_to(msg, to);
+
+ tipc_bclink_lock(net);
+ tipc_bearer_send(net, MAX_BEARERS, buf, NULL);
+ tn->bcl->stats.sent_nacks++;
+ tipc_bclink_unlock(net);
+ kfree_skb(buf);
+
+ n_ptr->bclink.oos_state++;
+ }
+}
+
+/**
+ * bclink_peek_nack - monitor retransmission requests sent by other nodes
+ *
+ * Delay any upcoming NACK by this node if another node has already
+ * requested the first message this node is going to ask for.
+ */
+static void bclink_peek_nack(struct net *net, struct tipc_msg *msg)
+{
+ struct tipc_node *n_ptr = tipc_node_find(net, msg_destnode(msg));
+
+ if (unlikely(!n_ptr))
+ return;
+
+ tipc_node_lock(n_ptr);
+ if (n_ptr->bclink.recv_permitted &&
+ (n_ptr->bclink.last_in != n_ptr->bclink.last_sent) &&
+ (n_ptr->bclink.last_in == msg_bcgap_after(msg)))
+ n_ptr->bclink.oos_state = 2;
+ tipc_node_unlock(n_ptr);
+ tipc_node_put(n_ptr);
+}
+
+/* tipc_bclink_xmit - deliver buffer chain to all nodes in cluster
+ * and to identified node local sockets
+ * @net: the applicable net namespace
+ * @list: chain of buffers containing message
+ * Consumes the buffer chain, except when returning -ELINKCONG
+ * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ */
+int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+ struct tipc_bclink *bclink = tn->bclink;
+ int rc = 0;
+ int bc = 0;
+ struct sk_buff *skb;
+ struct sk_buff_head arrvq;
+ struct sk_buff_head inputq;
+
+ /* Prepare clone of message for local node */
+ skb = tipc_msg_reassemble(list);
+ if (unlikely(!skb)) {
+ __skb_queue_purge(list);
+ return -EHOSTUNREACH;
+ }
+ /* Broadcast to all nodes */
+ if (likely(bclink)) {
+ tipc_bclink_lock(net);
+ if (likely(bclink->bcast_nodes.count)) {
+ rc = __tipc_link_xmit(net, bcl, list);
+ if (likely(!rc)) {
+ u32 len = skb_queue_len(&bcl->transmq);
+
+ bclink_set_last_sent(net);
+ bcl->stats.queue_sz_counts++;
+ bcl->stats.accu_queue_sz += len;
+ }
+ bc = 1;
+ }
+ tipc_bclink_unlock(net);
+ }
+
+ if (unlikely(!bc))
+ __skb_queue_purge(list);
+
+ if (unlikely(rc)) {
+ kfree_skb(skb);
+ return rc;
+ }
+ /* Deliver message clone */
+ __skb_queue_head_init(&arrvq);
+ skb_queue_head_init(&inputq);
+ __skb_queue_tail(&arrvq, skb);
+ tipc_sk_mcast_rcv(net, &arrvq, &inputq);
+ return rc;
+}
+
+/**
+ * bclink_accept_pkt - accept an incoming, in-sequence broadcast packet
+ *
+ * Called with both sending node's lock and bclink_lock taken.
+ */
+static void bclink_accept_pkt(struct tipc_node *node, u32 seqno)
+{
+ struct tipc_net *tn = net_generic(node->net, tipc_net_id);
+
+ bclink_update_last_sent(node, seqno);
+ node->bclink.last_in = seqno;
+ node->bclink.oos_state = 0;
+ tn->bcl->stats.recv_info++;
+
+ /*
+ * Unicast an ACK periodically, ensuring that
+ * all nodes in the cluster don't ACK at the same time
+ */
+ if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) {
+ tipc_link_proto_xmit(node->active_links[node->addr & 1],
+ STATE_MSG, 0, 0, 0, 0);
+ tn->bcl->stats.sent_acks++;
+ }
+}
+
+/**
+ * tipc_bclink_rcv - receive a broadcast packet, and deliver upwards
+ *
+ * RCU is locked, no other locks set
+ */
+void tipc_bclink_rcv(struct net *net, struct sk_buff *buf)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+ struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_node *node;
+ u32 next_in;
+ u32 seqno;
+ int deferred = 0;
+ int pos = 0;
+ struct sk_buff *iskb;
+ struct sk_buff_head *arrvq, *inputq;
+
+ /* Screen out unwanted broadcast messages */
+ if (msg_mc_netid(msg) != tn->net_id)
+ goto exit;
+
+ node = tipc_node_find(net, msg_prevnode(msg));
+ if (unlikely(!node))
+ goto exit;
+
+ tipc_node_lock(node);
+ if (unlikely(!node->bclink.recv_permitted))
+ goto unlock;
+
+ /* Handle broadcast protocol message */
+ if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) {
+ if (msg_type(msg) != STATE_MSG)
+ goto unlock;
+ if (msg_destnode(msg) == tn->own_addr) {
+ tipc_bclink_acknowledge(node, msg_bcast_ack(msg));
+ tipc_bclink_lock(net);
+ bcl->stats.recv_nacks++;
+ tn->bclink->retransmit_to = node;
+ bclink_retransmit_pkt(tn, msg_bcgap_after(msg),
+ msg_bcgap_to(msg));
+ tipc_bclink_unlock(net);
+ tipc_node_unlock(node);
+ } else {
+ tipc_node_unlock(node);
+ bclink_peek_nack(net, msg);
+ }
+ tipc_node_put(node);
+ goto exit;
+ }
+
+ /* Handle in-sequence broadcast message */
+ seqno = msg_seqno(msg);
+ next_in = mod(node->bclink.last_in + 1);
+ arrvq = &tn->bclink->arrvq;
+ inputq = &tn->bclink->inputq;
+
+ if (likely(seqno == next_in)) {
+receive:
+ /* Deliver message to destination */
+ if (likely(msg_isdata(msg))) {
+ tipc_bclink_lock(net);
+ bclink_accept_pkt(node, seqno);
+ spin_lock_bh(&inputq->lock);
+ __skb_queue_tail(arrvq, buf);
+ spin_unlock_bh(&inputq->lock);
+ node->action_flags |= TIPC_BCAST_MSG_EVT;
+ tipc_bclink_unlock(net);
+ tipc_node_unlock(node);
+ } else if (msg_user(msg) == MSG_BUNDLER) {
+ tipc_bclink_lock(net);
+ bclink_accept_pkt(node, seqno);
+ bcl->stats.recv_bundles++;
+ bcl->stats.recv_bundled += msg_msgcnt(msg);
+ pos = 0;
+ while (tipc_msg_extract(buf, &iskb, &pos)) {
+ spin_lock_bh(&inputq->lock);
+ __skb_queue_tail(arrvq, iskb);
+ spin_unlock_bh(&inputq->lock);
+ }
+ node->action_flags |= TIPC_BCAST_MSG_EVT;
+ tipc_bclink_unlock(net);
+ tipc_node_unlock(node);
+ } else if (msg_user(msg) == MSG_FRAGMENTER) {
+ tipc_bclink_lock(net);
+ bclink_accept_pkt(node, seqno);
+ tipc_buf_append(&node->bclink.reasm_buf, &buf);
+ if (unlikely(!buf && !node->bclink.reasm_buf)) {
+ tipc_bclink_unlock(net);
+ goto unlock;
+ }
+ bcl->stats.recv_fragments++;
+ if (buf) {
+ bcl->stats.recv_fragmented++;
+ msg = buf_msg(buf);
+ tipc_bclink_unlock(net);
+ goto receive;
+ }
+ tipc_bclink_unlock(net);
+ tipc_node_unlock(node);
+ } else {
+ tipc_bclink_lock(net);
+ bclink_accept_pkt(node, seqno);
+ tipc_bclink_unlock(net);
+ tipc_node_unlock(node);
+ kfree_skb(buf);
+ }
+ buf = NULL;
+
+ /* Determine new synchronization state */
+ tipc_node_lock(node);
+ if (unlikely(!tipc_node_is_up(node)))
+ goto unlock;
+
+ if (node->bclink.last_in == node->bclink.last_sent)
+ goto unlock;
+
+ if (skb_queue_empty(&node->bclink.deferdq)) {
+ node->bclink.oos_state = 1;
+ goto unlock;
+ }
+
+ msg = buf_msg(skb_peek(&node->bclink.deferdq));
+ seqno = msg_seqno(msg);
+ next_in = mod(next_in + 1);
+ if (seqno != next_in)
+ goto unlock;
+
+ /* Take in-sequence message from deferred queue & deliver it */
+ buf = __skb_dequeue(&node->bclink.deferdq);
+ goto receive;
+ }
+
+ /* Handle out-of-sequence broadcast message */
+ if (less(next_in, seqno)) {
+ deferred = tipc_link_defer_pkt(&node->bclink.deferdq,
+ buf);
+ bclink_update_last_sent(node, seqno);
+ buf = NULL;
+ }
+
+ tipc_bclink_lock(net);
+
+ if (deferred)
+ bcl->stats.deferred_recv++;
+ else
+ bcl->stats.duplicates++;
+
+ tipc_bclink_unlock(net);
+
+unlock:
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+exit:
+ kfree_skb(buf);
+}
+
+u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
+{
+ return (n_ptr->bclink.recv_permitted &&
+ (tipc_bclink_get_last_sent(n_ptr->net) != n_ptr->bclink.acked));
+}
+
+
+/**
+ * tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer
+ *
+ * Send packet over as many bearers as necessary to reach all nodes
+ * that have joined the broadcast link.
+ *
+ * Returns 0 (packet sent successfully) under all circumstances,
+ * since the broadcast link's pseudo-bearer never blocks
+ */
+static int tipc_bcbearer_send(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *unused1,
+ struct tipc_media_addr *unused2)
+{
+ int bp_index;
+ struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bcbearer *bcbearer = tn->bcbearer;
+ struct tipc_bclink *bclink = tn->bclink;
+
+ /* Prepare broadcast link message for reliable transmission,
+ * if first time trying to send it;
+ * preparation is skipped for broadcast link protocol messages
+ * since they are sent in an unreliable manner and don't need it
+ */
+ if (likely(!msg_non_seq(buf_msg(buf)))) {
+ bcbuf_set_acks(buf, bclink->bcast_nodes.count);
+ msg_set_non_seq(msg, 1);
+ msg_set_mc_netid(msg, tn->net_id);
+ tn->bcl->stats.sent_info++;
+ if (WARN_ON(!bclink->bcast_nodes.count)) {
+ dump_stack();
+ return 0;
+ }
+ }
+
+ /* Send buffer over bearers until all targets reached */
+ bcbearer->remains = bclink->bcast_nodes;
+
+ for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
+ struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
+ struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
+ struct tipc_bearer *bp[2] = {p, s};
+ struct tipc_bearer *b = bp[msg_link_selector(msg)];
+ struct sk_buff *tbuf;
+
+ if (!p)
+ break; /* No more bearers to try */
+ if (!b)
+ b = p;
+ tipc_nmap_diff(&bcbearer->remains, &b->nodes,
+ &bcbearer->remains_new);
+ if (bcbearer->remains_new.count == bcbearer->remains.count)
+ continue; /* Nothing added by bearer pair */
+
+ if (bp_index == 0) {
+ /* Use original buffer for first bearer */
+ tipc_bearer_send(net, b->identity, buf, &b->bcast_addr);
+ } else {
+ /* Avoid concurrent buffer access */
+ tbuf = pskb_copy_for_clone(buf, GFP_ATOMIC);
+ if (!tbuf)
+ break;
+ tipc_bearer_send(net, b->identity, tbuf,
+ &b->bcast_addr);
+ kfree_skb(tbuf); /* Bearer keeps a clone */
+ }
+ if (bcbearer->remains_new.count == 0)
+ break; /* All targets reached */
+
+ bcbearer->remains = bcbearer->remains_new;
+ }
+
+ return 0;
+}
+
+/**
+ * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer
+ */
+void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr,
+ u32 node, bool action)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bcbearer *bcbearer = tn->bcbearer;
+ struct tipc_bcbearer_pair *bp_temp = bcbearer->bpairs_temp;
+ struct tipc_bcbearer_pair *bp_curr;
+ struct tipc_bearer *b;
+ int b_index;
+ int pri;
+
+ tipc_bclink_lock(net);
+
+ if (action)
+ tipc_nmap_add(nm_ptr, node);
+ else
+ tipc_nmap_remove(nm_ptr, node);
+
+ /* Group bearers by priority (can assume max of two per priority) */
+ memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
+
+ rcu_read_lock();
+ for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
+ b = rcu_dereference_rtnl(tn->bearer_list[b_index]);
+ if (!b || !b->nodes.count)
+ continue;
+
+ if (!bp_temp[b->priority].primary)
+ bp_temp[b->priority].primary = b;
+ else
+ bp_temp[b->priority].secondary = b;
+ }
+ rcu_read_unlock();
+
+ /* Create array of bearer pairs for broadcasting */
+ bp_curr = bcbearer->bpairs;
+ memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
+
+ for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {
+
+ if (!bp_temp[pri].primary)
+ continue;
+
+ bp_curr->primary = bp_temp[pri].primary;
+
+ if (bp_temp[pri].secondary) {
+ if (tipc_nmap_equal(&bp_temp[pri].primary->nodes,
+ &bp_temp[pri].secondary->nodes)) {
+ bp_curr->secondary = bp_temp[pri].secondary;
+ } else {
+ bp_curr++;
+ bp_curr->primary = bp_temp[pri].secondary;
+ }
+ }
+
+ bp_curr++;
+ }
+
+ tipc_bclink_unlock(net);
+}
+
+static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
+ struct tipc_stats *stats)
+{
+ int i;
+ struct nlattr *nest;
+
+ struct nla_map {
+ __u32 key;
+ __u32 val;
+ };
+
+ struct nla_map map[] = {
+ {TIPC_NLA_STATS_RX_INFO, stats->recv_info},
+ {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
+ {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
+ {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
+ {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
+ {TIPC_NLA_STATS_TX_INFO, stats->sent_info},
+ {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
+ {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
+ {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
+ {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled},
+ {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks},
+ {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv},
+ {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks},
+ {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks},
+ {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted},
+ {TIPC_NLA_STATS_DUPLICATES, stats->duplicates},
+ {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs},
+ {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz},
+ {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ?
+ (stats->accu_queue_sz / stats->queue_sz_counts) : 0}
+ };
+
+ nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for (i = 0; i < ARRAY_SIZE(map); i++)
+ if (nla_put_u32(skb, map[i].key, map[i].val))
+ goto msg_full;
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+msg_full:
+ nla_nest_cancel(skb, nest);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
+{
+ int err;
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *prop;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+
+ if (!bcl)
+ return 0;
+
+ tipc_bclink_lock(net);
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_LINK_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
+ if (!attrs)
+ goto msg_full;
+
+ /* The broadcast link is always up */
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
+ goto attr_msg_full;
+
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST))
+ goto attr_msg_full;
+ if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no))
+ goto attr_msg_full;
+
+ prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
+ if (!prop)
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window))
+ goto prop_msg_full;
+ nla_nest_end(msg->skb, prop);
+
+ err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats);
+ if (err)
+ goto attr_msg_full;
+
+ tipc_bclink_unlock(net);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+prop_msg_full:
+ nla_nest_cancel(msg->skb, prop);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ tipc_bclink_unlock(net);
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_bclink_reset_stats(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+
+ if (!bcl)
+ return -ENOPROTOOPT;
+
+ tipc_bclink_lock(net);
+ memset(&bcl->stats, 0, sizeof(bcl->stats));
+ tipc_bclink_unlock(net);
+ return 0;
+}
+
+int tipc_bclink_set_queue_limits(struct net *net, u32 limit)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+
+ if (!bcl)
+ return -ENOPROTOOPT;
+ if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN))
+ return -EINVAL;
+
+ tipc_bclink_lock(net);
+ tipc_link_set_queue_limits(bcl, limit);
+ tipc_bclink_unlock(net);
+ return 0;
+}
+
+int tipc_bclink_init(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bcbearer *bcbearer;
+ struct tipc_bclink *bclink;
+ struct tipc_link *bcl;
+
+ bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC);
+ if (!bcbearer)
+ return -ENOMEM;
+
+ bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC);
+ if (!bclink) {
+ kfree(bcbearer);
+ return -ENOMEM;
+ }
+
+ bcl = &bclink->link;
+ bcbearer->bearer.media = &bcbearer->media;
+ bcbearer->media.send_msg = tipc_bcbearer_send;
+ sprintf(bcbearer->media.name, "tipc-broadcast");
+
+ spin_lock_init(&bclink->lock);
+ __skb_queue_head_init(&bcl->transmq);
+ __skb_queue_head_init(&bcl->backlogq);
+ __skb_queue_head_init(&bcl->deferdq);
+ skb_queue_head_init(&bcl->wakeupq);
+ bcl->next_out_no = 1;
+ spin_lock_init(&bclink->node.lock);
+ __skb_queue_head_init(&bclink->arrvq);
+ skb_queue_head_init(&bclink->inputq);
+ bcl->owner = &bclink->node;
+ bcl->owner->net = net;
+ bcl->mtu = MAX_PKT_DEFAULT_MCAST;
+ tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
+ bcl->bearer_id = MAX_BEARERS;
+ rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer);
+ bcl->state = WORKING_WORKING;
+ bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg;
+ msg_set_prevnode(bcl->pmsg, tn->own_addr);
+ strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
+ tn->bcbearer = bcbearer;
+ tn->bclink = bclink;
+ tn->bcl = bcl;
+ return 0;
+}
+
+void tipc_bclink_stop(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ tipc_bclink_lock(net);
+ tipc_link_purge_queues(tn->bcl);
+ tipc_bclink_unlock(net);
+
+ RCU_INIT_POINTER(tn->bearer_list[BCBEARER], NULL);
+ synchronize_net();
+ kfree(tn->bcbearer);
+ kfree(tn->bclink);
+}
+
+/**
+ * tipc_nmap_add - add a node to a node map
+ */
+static void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node)
+{
+ int n = tipc_node(node);
+ int w = n / WSIZE;
+ u32 mask = (1 << (n % WSIZE));
+
+ if ((nm_ptr->map[w] & mask) == 0) {
+ nm_ptr->count++;
+ nm_ptr->map[w] |= mask;
+ }
+}
+
+/**
+ * tipc_nmap_remove - remove a node from a node map
+ */
+static void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
+{
+ int n = tipc_node(node);
+ int w = n / WSIZE;
+ u32 mask = (1 << (n % WSIZE));
+
+ if ((nm_ptr->map[w] & mask) != 0) {
+ nm_ptr->map[w] &= ~mask;
+ nm_ptr->count--;
+ }
+}
+
+/**
+ * tipc_nmap_diff - find differences between node maps
+ * @nm_a: input node map A
+ * @nm_b: input node map B
+ * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
+ */
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+ struct tipc_node_map *nm_b,
+ struct tipc_node_map *nm_diff)
+{
+ int stop = ARRAY_SIZE(nm_a->map);
+ int w;
+ int b;
+ u32 map;
+
+ memset(nm_diff, 0, sizeof(*nm_diff));
+ for (w = 0; w < stop; w++) {
+ map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]);
+ nm_diff->map[w] = map;
+ if (map != 0) {
+ for (b = 0 ; b < WSIZE; b++) {
+ if (map & (1 << b))
+ nm_diff->count++;
+ }
+ }
+ }
+}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
new file mode 100644
index 000000000..4bdc12277
--- /dev/null
+++ b/net/tipc/bcast.h
@@ -0,0 +1,136 @@
+/*
+ * net/tipc/bcast.h: Include file for TIPC broadcast code
+ *
+ * Copyright (c) 2003-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BCAST_H
+#define _TIPC_BCAST_H
+
+#include <linux/tipc_config.h>
+#include "link.h"
+#include "node.h"
+
+/**
+ * struct tipc_bcbearer_pair - a pair of bearers used by broadcast link
+ * @primary: pointer to primary bearer
+ * @secondary: pointer to secondary bearer
+ *
+ * Bearers must have same priority and same set of reachable destinations
+ * to be paired.
+ */
+
+struct tipc_bcbearer_pair {
+ struct tipc_bearer *primary;
+ struct tipc_bearer *secondary;
+};
+
+#define BCBEARER MAX_BEARERS
+
+/**
+ * struct tipc_bcbearer - bearer used by broadcast link
+ * @bearer: (non-standard) broadcast bearer structure
+ * @media: (non-standard) broadcast media structure
+ * @bpairs: array of bearer pairs
+ * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort()
+ * @remains: temporary node map used by tipc_bcbearer_send()
+ * @remains_new: temporary node map used tipc_bcbearer_send()
+ *
+ * Note: The fields labelled "temporary" are incorporated into the bearer
+ * to avoid consuming potentially limited stack space through the use of
+ * large local variables within multicast routines. Concurrent access is
+ * prevented through use of the spinlock "bclink_lock".
+ */
+struct tipc_bcbearer {
+ struct tipc_bearer bearer;
+ struct tipc_media media;
+ struct tipc_bcbearer_pair bpairs[MAX_BEARERS];
+ struct tipc_bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];
+ struct tipc_node_map remains;
+ struct tipc_node_map remains_new;
+};
+
+/**
+ * struct tipc_bclink - link used for broadcast messages
+ * @lock: spinlock governing access to structure
+ * @link: (non-standard) broadcast link structure
+ * @node: (non-standard) node structure representing b'cast link's peer node
+ * @bcast_nodes: map of broadcast-capable nodes
+ * @retransmit_to: node that most recently requested a retransmit
+ *
+ * Handles sequence numbering, fragmentation, bundling, etc.
+ */
+struct tipc_bclink {
+ spinlock_t lock;
+ struct tipc_link link;
+ struct tipc_node node;
+ struct sk_buff_head arrvq;
+ struct sk_buff_head inputq;
+ struct tipc_node_map bcast_nodes;
+ struct tipc_node *retransmit_to;
+};
+
+struct tipc_node;
+extern const char tipc_bclink_name[];
+
+/**
+ * tipc_nmap_equal - test for equality of node maps
+ */
+static inline int tipc_nmap_equal(struct tipc_node_map *nm_a,
+ struct tipc_node_map *nm_b)
+{
+ return !memcmp(nm_a, nm_b, sizeof(*nm_a));
+}
+
+int tipc_bclink_init(struct net *net);
+void tipc_bclink_stop(struct net *net);
+void tipc_bclink_add_node(struct net *net, u32 addr);
+void tipc_bclink_remove_node(struct net *net, u32 addr);
+struct tipc_node *tipc_bclink_retransmit_to(struct net *tn);
+void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked);
+void tipc_bclink_rcv(struct net *net, struct sk_buff *buf);
+u32 tipc_bclink_get_last_sent(struct net *net);
+u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr);
+void tipc_bclink_update_link_state(struct tipc_node *node,
+ u32 last_sent);
+int tipc_bclink_reset_stats(struct net *net);
+int tipc_bclink_set_queue_limits(struct net *net, u32 limit);
+void tipc_bcbearer_sort(struct net *net, struct tipc_node_map *nm_ptr,
+ u32 node, bool action);
+uint tipc_bclink_get_mtu(void);
+int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list);
+void tipc_bclink_wakeup_users(struct net *net);
+int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
+void tipc_bclink_input(struct net *net);
+
+#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
new file mode 100644
index 000000000..70e3dacbf
--- /dev/null
+++ b/net/tipc/bearer.c
@@ -0,0 +1,1025 @@
+/*
+ * net/tipc/bearer.c: TIPC bearer code
+ *
+ * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 2004-2006, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/sock.h>
+#include "core.h"
+#include "bearer.h"
+#include "link.h"
+#include "discover.h"
+#include "bcast.h"
+
+#define MAX_ADDR_STR 60
+
+static struct tipc_media * const media_info_array[] = {
+ &eth_media_info,
+#ifdef CONFIG_TIPC_MEDIA_IB
+ &ib_media_info,
+#endif
+#ifdef CONFIG_TIPC_MEDIA_UDP
+ &udp_media_info,
+#endif
+ NULL
+};
+
+static const struct nla_policy
+tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = {
+ [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_BEARER_NAME] = {
+ .type = NLA_STRING,
+ .len = TIPC_MAX_BEARER_NAME
+ },
+ [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 }
+};
+
+static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
+ [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING },
+ [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
+};
+
+static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr,
+ bool shutting_down);
+
+/**
+ * tipc_media_find - locates specified media object by name
+ */
+struct tipc_media *tipc_media_find(const char *name)
+{
+ u32 i;
+
+ for (i = 0; media_info_array[i] != NULL; i++) {
+ if (!strcmp(media_info_array[i]->name, name))
+ break;
+ }
+ return media_info_array[i];
+}
+
+/**
+ * media_find_id - locates specified media object by type identifier
+ */
+static struct tipc_media *media_find_id(u8 type)
+{
+ u32 i;
+
+ for (i = 0; media_info_array[i] != NULL; i++) {
+ if (media_info_array[i]->type_id == type)
+ break;
+ }
+ return media_info_array[i];
+}
+
+/**
+ * tipc_media_addr_printf - record media address in print buffer
+ */
+void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
+{
+ char addr_str[MAX_ADDR_STR];
+ struct tipc_media *m_ptr;
+ int ret;
+
+ m_ptr = media_find_id(a->media_id);
+
+ if (m_ptr && !m_ptr->addr2str(a, addr_str, sizeof(addr_str)))
+ ret = scnprintf(buf, len, "%s(%s)", m_ptr->name, addr_str);
+ else {
+ u32 i;
+
+ ret = scnprintf(buf, len, "UNKNOWN(%u)", a->media_id);
+ for (i = 0; i < sizeof(a->value); i++)
+ ret += scnprintf(buf - ret, len + ret,
+ "-%02x", a->value[i]);
+ }
+}
+
+/**
+ * bearer_name_validate - validate & (optionally) deconstruct bearer name
+ * @name: ptr to bearer name string
+ * @name_parts: ptr to area for bearer name components (or NULL if not needed)
+ *
+ * Returns 1 if bearer name is valid, otherwise 0.
+ */
+static int bearer_name_validate(const char *name,
+ struct tipc_bearer_names *name_parts)
+{
+ char name_copy[TIPC_MAX_BEARER_NAME];
+ char *media_name;
+ char *if_name;
+ u32 media_len;
+ u32 if_len;
+
+ /* copy bearer name & ensure length is OK */
+ name_copy[TIPC_MAX_BEARER_NAME - 1] = 0;
+ /* need above in case non-Posix strncpy() doesn't pad with nulls */
+ strncpy(name_copy, name, TIPC_MAX_BEARER_NAME);
+ if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0)
+ return 0;
+
+ /* ensure all component parts of bearer name are present */
+ media_name = name_copy;
+ if_name = strchr(media_name, ':');
+ if (if_name == NULL)
+ return 0;
+ *(if_name++) = 0;
+ media_len = if_name - media_name;
+ if_len = strlen(if_name) + 1;
+
+ /* validate component parts of bearer name */
+ if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) ||
+ (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME))
+ return 0;
+
+ /* return bearer name components, if necessary */
+ if (name_parts) {
+ strcpy(name_parts->media_name, media_name);
+ strcpy(name_parts->if_name, if_name);
+ }
+ return 1;
+}
+
+/**
+ * tipc_bearer_find - locates bearer object with matching bearer name
+ */
+struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+ u32 i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ b_ptr = rtnl_dereference(tn->bearer_list[i]);
+ if (b_ptr && (!strcmp(b_ptr->name, name)))
+ return b_ptr;
+ }
+ return NULL;
+}
+
+void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+
+ rcu_read_lock();
+ b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (b_ptr) {
+ tipc_bcbearer_sort(net, &b_ptr->nodes, dest, true);
+ tipc_disc_add_dest(b_ptr->link_req);
+ }
+ rcu_read_unlock();
+}
+
+void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+
+ rcu_read_lock();
+ b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (b_ptr) {
+ tipc_bcbearer_sort(net, &b_ptr->nodes, dest, false);
+ tipc_disc_remove_dest(b_ptr->link_req);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * tipc_enable_bearer - enable bearer with the given name
+ */
+static int tipc_enable_bearer(struct net *net, const char *name,
+ u32 disc_domain, u32 priority,
+ struct nlattr *attr[])
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+ struct tipc_media *m_ptr;
+ struct tipc_bearer_names b_names;
+ char addr_string[16];
+ u32 bearer_id;
+ u32 with_this_prio;
+ u32 i;
+ int res = -EINVAL;
+
+ if (!tn->own_addr) {
+ pr_warn("Bearer <%s> rejected, not supported in standalone mode\n",
+ name);
+ return -ENOPROTOOPT;
+ }
+ if (!bearer_name_validate(name, &b_names)) {
+ pr_warn("Bearer <%s> rejected, illegal name\n", name);
+ return -EINVAL;
+ }
+ if (tipc_addr_domain_valid(disc_domain) &&
+ (disc_domain != tn->own_addr)) {
+ if (tipc_in_scope(disc_domain, tn->own_addr)) {
+ disc_domain = tn->own_addr & TIPC_CLUSTER_MASK;
+ res = 0; /* accept any node in own cluster */
+ } else if (in_own_cluster_exact(net, disc_domain))
+ res = 0; /* accept specified node in own cluster */
+ }
+ if (res) {
+ pr_warn("Bearer <%s> rejected, illegal discovery domain\n",
+ name);
+ return -EINVAL;
+ }
+ if ((priority > TIPC_MAX_LINK_PRI) &&
+ (priority != TIPC_MEDIA_LINK_PRI)) {
+ pr_warn("Bearer <%s> rejected, illegal priority\n", name);
+ return -EINVAL;
+ }
+
+ m_ptr = tipc_media_find(b_names.media_name);
+ if (!m_ptr) {
+ pr_warn("Bearer <%s> rejected, media <%s> not registered\n",
+ name, b_names.media_name);
+ return -EINVAL;
+ }
+
+ if (priority == TIPC_MEDIA_LINK_PRI)
+ priority = m_ptr->priority;
+
+restart:
+ bearer_id = MAX_BEARERS;
+ with_this_prio = 1;
+ for (i = MAX_BEARERS; i-- != 0; ) {
+ b_ptr = rtnl_dereference(tn->bearer_list[i]);
+ if (!b_ptr) {
+ bearer_id = i;
+ continue;
+ }
+ if (!strcmp(name, b_ptr->name)) {
+ pr_warn("Bearer <%s> rejected, already enabled\n",
+ name);
+ return -EINVAL;
+ }
+ if ((b_ptr->priority == priority) &&
+ (++with_this_prio > 2)) {
+ if (priority-- == 0) {
+ pr_warn("Bearer <%s> rejected, duplicate priority\n",
+ name);
+ return -EINVAL;
+ }
+ pr_warn("Bearer <%s> priority adjustment required %u->%u\n",
+ name, priority + 1, priority);
+ goto restart;
+ }
+ }
+ if (bearer_id >= MAX_BEARERS) {
+ pr_warn("Bearer <%s> rejected, bearer limit reached (%u)\n",
+ name, MAX_BEARERS);
+ return -EINVAL;
+ }
+
+ b_ptr = kzalloc(sizeof(*b_ptr), GFP_ATOMIC);
+ if (!b_ptr)
+ return -ENOMEM;
+
+ strcpy(b_ptr->name, name);
+ b_ptr->media = m_ptr;
+ res = m_ptr->enable_media(net, b_ptr, attr);
+ if (res) {
+ pr_warn("Bearer <%s> rejected, enable failure (%d)\n",
+ name, -res);
+ return -EINVAL;
+ }
+
+ b_ptr->identity = bearer_id;
+ b_ptr->tolerance = m_ptr->tolerance;
+ b_ptr->window = m_ptr->window;
+ b_ptr->domain = disc_domain;
+ b_ptr->net_plane = bearer_id + 'A';
+ b_ptr->priority = priority;
+
+ res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr);
+ if (res) {
+ bearer_disable(net, b_ptr, false);
+ pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
+ name);
+ return -EINVAL;
+ }
+
+ rcu_assign_pointer(tn->bearer_list[bearer_id], b_ptr);
+
+ pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
+ name,
+ tipc_addr_string_fill(addr_string, disc_domain), priority);
+ return res;
+}
+
+/**
+ * tipc_reset_bearer - Reset all links established over this bearer
+ */
+static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
+{
+ pr_info("Resetting bearer <%s>\n", b_ptr->name);
+ tipc_link_reset_list(net, b_ptr->identity);
+ tipc_disc_reset(net, b_ptr);
+ return 0;
+}
+
+/**
+ * bearer_disable
+ *
+ * Note: This routine assumes caller holds RTNL lock.
+ */
+static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr,
+ bool shutting_down)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ u32 i;
+
+ pr_info("Disabling bearer <%s>\n", b_ptr->name);
+ b_ptr->media->disable_media(b_ptr);
+
+ tipc_link_delete_list(net, b_ptr->identity, shutting_down);
+ if (b_ptr->link_req)
+ tipc_disc_delete(b_ptr->link_req);
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if (b_ptr == rtnl_dereference(tn->bearer_list[i])) {
+ RCU_INIT_POINTER(tn->bearer_list[i], NULL);
+ break;
+ }
+ }
+ kfree_rcu(b_ptr, rcu);
+}
+
+int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
+ struct nlattr *attr[])
+{
+ struct net_device *dev;
+ char *driver_name = strchr((const char *)b->name, ':') + 1;
+
+ /* Find device with specified name */
+ dev = dev_get_by_name(net, driver_name);
+ if (!dev)
+ return -ENODEV;
+
+ /* Associate TIPC bearer with L2 bearer */
+ rcu_assign_pointer(b->media_ptr, dev);
+ memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
+ memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
+ b->bcast_addr.media_id = b->media->type_id;
+ b->bcast_addr.broadcast = 1;
+ b->mtu = dev->mtu;
+ b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr);
+ rcu_assign_pointer(dev->tipc_ptr, b);
+ return 0;
+}
+
+/* tipc_disable_l2_media - detach TIPC bearer from an L2 interface
+ *
+ * Mark L2 bearer as inactive so that incoming buffers are thrown away,
+ * then get worker thread to complete bearer cleanup. (Can't do cleanup
+ * here because cleanup code needs to sleep and caller holds spinlocks.)
+ */
+void tipc_disable_l2_media(struct tipc_bearer *b)
+{
+ struct net_device *dev;
+
+ dev = (struct net_device *)rtnl_dereference(b->media_ptr);
+ RCU_INIT_POINTER(b->media_ptr, NULL);
+ RCU_INIT_POINTER(dev->tipc_ptr, NULL);
+ synchronize_net();
+ dev_put(dev);
+}
+
+/**
+ * tipc_l2_send_msg - send a TIPC packet out over an L2 interface
+ * @buf: the packet to be sent
+ * @b_ptr: the bearer through which the packet is to be sent
+ * @dest: peer destination address
+ */
+int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *b, struct tipc_media_addr *dest)
+{
+ struct sk_buff *clone;
+ struct net_device *dev;
+ int delta;
+
+ dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr);
+ if (!dev)
+ return 0;
+
+ clone = skb_clone(buf, GFP_ATOMIC);
+ if (!clone)
+ return 0;
+
+ delta = dev->hard_header_len - skb_headroom(buf);
+ if ((delta > 0) &&
+ pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ kfree_skb(clone);
+ return 0;
+ }
+
+ skb_reset_network_header(clone);
+ clone->dev = dev;
+ clone->protocol = htons(ETH_P_TIPC);
+ dev_hard_header(clone, dev, ETH_P_TIPC, dest->value,
+ dev->dev_addr, clone->len);
+ dev_queue_xmit(clone);
+ return 0;
+}
+
+/* tipc_bearer_send- sends buffer to destination over bearer
+ *
+ * IMPORTANT:
+ * The media send routine must not alter the buffer being passed in
+ * as it may be needed for later retransmission!
+ */
+void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
+ struct tipc_media_addr *dest)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+
+ rcu_read_lock();
+ b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (likely(b_ptr))
+ b_ptr->media->send_msg(net, buf, b_ptr, dest);
+ rcu_read_unlock();
+}
+
+/**
+ * tipc_l2_rcv_msg - handle incoming TIPC message from an interface
+ * @buf: the received packet
+ * @dev: the net device that the packet was received on
+ * @pt: the packet_type structure which was used to register this handler
+ * @orig_dev: the original receive net device in case the device is a bond
+ *
+ * Accept only packets explicitly sent to this node, or broadcast packets;
+ * ignores packets sent using interface multicast, and traffic sent to other
+ * nodes (which can happen if interface is running in promiscuous mode).
+ */
+static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct tipc_bearer *b_ptr;
+
+ rcu_read_lock();
+ b_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
+ if (likely(b_ptr)) {
+ if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
+ buf->next = NULL;
+ tipc_rcv(dev_net(dev), buf, b_ptr);
+ rcu_read_unlock();
+ return NET_RX_SUCCESS;
+ }
+ }
+ rcu_read_unlock();
+
+ kfree_skb(buf);
+ return NET_RX_DROP;
+}
+
+/**
+ * tipc_l2_device_event - handle device events from network device
+ * @nb: the context of the notification
+ * @evt: the type of event
+ * @ptr: the net device that the event was on
+ *
+ * This function is called by the Ethernet driver in case of link
+ * change event.
+ */
+static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct tipc_bearer *b_ptr;
+
+ b_ptr = rtnl_dereference(dev->tipc_ptr);
+ if (!b_ptr)
+ return NOTIFY_DONE;
+
+ b_ptr->mtu = dev->mtu;
+
+ switch (evt) {
+ case NETDEV_CHANGE:
+ if (netif_carrier_ok(dev))
+ break;
+ case NETDEV_DOWN:
+ case NETDEV_CHANGEMTU:
+ tipc_reset_bearer(net, b_ptr);
+ break;
+ case NETDEV_CHANGEADDR:
+ b_ptr->media->raw2addr(b_ptr, &b_ptr->addr,
+ (char *)dev->dev_addr);
+ tipc_reset_bearer(net, b_ptr);
+ break;
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGENAME:
+ bearer_disable(dev_net(dev), b_ptr, false);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct packet_type tipc_packet_type __read_mostly = {
+ .type = htons(ETH_P_TIPC),
+ .func = tipc_l2_rcv_msg,
+};
+
+static struct notifier_block notifier = {
+ .notifier_call = tipc_l2_device_event,
+ .priority = 0,
+};
+
+int tipc_bearer_setup(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&notifier);
+ if (err)
+ return err;
+ dev_add_pack(&tipc_packet_type);
+ return 0;
+}
+
+void tipc_bearer_cleanup(void)
+{
+ unregister_netdevice_notifier(&notifier);
+ dev_remove_pack(&tipc_packet_type);
+}
+
+void tipc_bearer_stop(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+ u32 i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ b_ptr = rtnl_dereference(tn->bearer_list[i]);
+ if (b_ptr) {
+ bearer_disable(net, b_ptr, true);
+ tn->bearer_list[i] = NULL;
+ }
+ }
+}
+
+/* Caller should hold rtnl_lock to protect the bearer */
+static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
+ struct tipc_bearer *bearer, int nlflags)
+{
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *prop;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ nlflags, TIPC_NL_BEARER_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_BEARER);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_string(msg->skb, TIPC_NLA_BEARER_NAME, bearer->name))
+ goto attr_msg_full;
+
+ prop = nla_nest_start(msg->skb, TIPC_NLA_BEARER_PROP);
+ if (!prop)
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, bearer->priority))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
+ goto prop_msg_full;
+
+ nla_nest_end(msg->skb, prop);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+prop_msg_full:
+ nla_nest_cancel(msg->skb, prop);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ int i = cb->args[0];
+ struct tipc_bearer *bearer;
+ struct tipc_nl_msg msg;
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ if (i == MAX_BEARERS)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rtnl_lock();
+ for (i = 0; i < MAX_BEARERS; i++) {
+ bearer = rtnl_dereference(tn->bearer_list[i]);
+ if (!bearer)
+ continue;
+
+ err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI);
+ if (err)
+ break;
+ }
+ rtnl_unlock();
+
+ cb->args[0] = i;
+ return skb->len;
+}
+
+int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct sk_buff *rep;
+ struct tipc_bearer *bearer;
+ struct tipc_nl_msg msg;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+ info->attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+ name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+ rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!rep)
+ return -ENOMEM;
+
+ msg.skb = rep;
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ rtnl_lock();
+ bearer = tipc_bearer_find(net, name);
+ if (!bearer) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = __tipc_nl_add_bearer(&msg, bearer, 0);
+ if (err)
+ goto err_out;
+ rtnl_unlock();
+
+ return genlmsg_reply(rep, info);
+err_out:
+ rtnl_unlock();
+ nlmsg_free(rep);
+
+ return err;
+}
+
+int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct tipc_bearer *bearer;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+
+ if (!info->attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+ info->attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+
+ name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+ rtnl_lock();
+ bearer = tipc_bearer_find(net, name);
+ if (!bearer) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ bearer_disable(net, bearer, false);
+ rtnl_unlock();
+
+ return 0;
+}
+
+int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *bearer;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ u32 domain;
+ u32 prio;
+
+ prio = TIPC_MEDIA_LINK_PRI;
+ domain = tn->own_addr & TIPC_CLUSTER_MASK;
+
+ if (!info->attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+ info->attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+
+ bearer = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+ if (attrs[TIPC_NLA_BEARER_DOMAIN])
+ domain = nla_get_u32(attrs[TIPC_NLA_BEARER_DOMAIN]);
+
+ if (attrs[TIPC_NLA_BEARER_PROP]) {
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
+
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP],
+ props);
+ if (err)
+ return err;
+
+ if (props[TIPC_NLA_PROP_PRIO])
+ prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ }
+
+ rtnl_lock();
+ err = tipc_enable_bearer(net, bearer, domain, prio, attrs);
+ if (err) {
+ rtnl_unlock();
+ return err;
+ }
+ rtnl_unlock();
+
+ return 0;
+}
+
+int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct tipc_bearer *b;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+ struct net *net = genl_info_net(info);
+
+ if (!info->attrs[TIPC_NLA_BEARER])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX,
+ info->attrs[TIPC_NLA_BEARER],
+ tipc_nl_bearer_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_BEARER_NAME])
+ return -EINVAL;
+ name = nla_data(attrs[TIPC_NLA_BEARER_NAME]);
+
+ rtnl_lock();
+ b = tipc_bearer_find(net, name);
+ if (!b) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ if (attrs[TIPC_NLA_BEARER_PROP]) {
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
+
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP],
+ props);
+ if (err) {
+ rtnl_unlock();
+ return err;
+ }
+
+ if (props[TIPC_NLA_PROP_TOL])
+ b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+ if (props[TIPC_NLA_PROP_PRIO])
+ b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ if (props[TIPC_NLA_PROP_WIN])
+ b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ }
+ rtnl_unlock();
+
+ return 0;
+}
+
+static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
+ struct tipc_media *media, int nlflags)
+{
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *prop;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ nlflags, TIPC_NL_MEDIA_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_MEDIA);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_string(msg->skb, TIPC_NLA_MEDIA_NAME, media->name))
+ goto attr_msg_full;
+
+ prop = nla_nest_start(msg->skb, TIPC_NLA_MEDIA_PROP);
+ if (!prop)
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, media->priority))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
+ goto prop_msg_full;
+
+ nla_nest_end(msg->skb, prop);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+prop_msg_full:
+ nla_nest_cancel(msg->skb, prop);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ int i = cb->args[0];
+ struct tipc_nl_msg msg;
+
+ if (i == MAX_MEDIA)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rtnl_lock();
+ for (; media_info_array[i] != NULL; i++) {
+ err = __tipc_nl_add_media(&msg, media_info_array[i],
+ NLM_F_MULTI);
+ if (err)
+ break;
+ }
+ rtnl_unlock();
+
+ cb->args[0] = i;
+ return skb->len;
+}
+
+int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct tipc_nl_msg msg;
+ struct tipc_media *media;
+ struct sk_buff *rep;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+
+ if (!info->attrs[TIPC_NLA_MEDIA])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX,
+ info->attrs[TIPC_NLA_MEDIA],
+ tipc_nl_media_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_MEDIA_NAME])
+ return -EINVAL;
+ name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]);
+
+ rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!rep)
+ return -ENOMEM;
+
+ msg.skb = rep;
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ rtnl_lock();
+ media = tipc_media_find(name);
+ if (!media) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = __tipc_nl_add_media(&msg, media, 0);
+ if (err)
+ goto err_out;
+ rtnl_unlock();
+
+ return genlmsg_reply(rep, info);
+err_out:
+ rtnl_unlock();
+ nlmsg_free(rep);
+
+ return err;
+}
+
+int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *name;
+ struct tipc_media *m;
+ struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
+
+ if (!info->attrs[TIPC_NLA_MEDIA])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX,
+ info->attrs[TIPC_NLA_MEDIA],
+ tipc_nl_media_policy);
+
+ if (!attrs[TIPC_NLA_MEDIA_NAME])
+ return -EINVAL;
+ name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]);
+
+ rtnl_lock();
+ m = tipc_media_find(name);
+ if (!m) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ if (attrs[TIPC_NLA_MEDIA_PROP]) {
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
+
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_MEDIA_PROP],
+ props);
+ if (err) {
+ rtnl_unlock();
+ return err;
+ }
+
+ if (props[TIPC_NLA_PROP_TOL])
+ m->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+ if (props[TIPC_NLA_PROP_PRIO])
+ m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ if (props[TIPC_NLA_PROP_WIN])
+ m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ }
+ rtnl_unlock();
+
+ return 0;
+}
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
new file mode 100644
index 000000000..5cad243ee
--- /dev/null
+++ b/net/tipc/bearer.h
@@ -0,0 +1,221 @@
+/*
+ * net/tipc/bearer.h: Include file for TIPC bearer code
+ *
+ * Copyright (c) 1996-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BEARER_H
+#define _TIPC_BEARER_H
+
+#include "netlink.h"
+#include <net/genetlink.h>
+
+#define MAX_BEARERS 2
+#define MAX_MEDIA 3
+#define MAX_NODES 4096
+#define WSIZE 32
+
+/* Identifiers associated with TIPC message header media address info
+ * - address info field is 32 bytes long
+ * - the field's actual content and length is defined per media
+ * - remaining unused bytes in the field are set to zero
+ */
+#define TIPC_MEDIA_INFO_SIZE 32
+#define TIPC_MEDIA_TYPE_OFFSET 3
+#define TIPC_MEDIA_ADDR_OFFSET 4
+
+/*
+ * Identifiers of supported TIPC media types
+ */
+#define TIPC_MEDIA_TYPE_ETH 1
+#define TIPC_MEDIA_TYPE_IB 2
+#define TIPC_MEDIA_TYPE_UDP 3
+
+/**
+ * struct tipc_node_map - set of node identifiers
+ * @count: # of nodes in set
+ * @map: bitmap of node identifiers that are in the set
+ */
+struct tipc_node_map {
+ u32 count;
+ u32 map[MAX_NODES / WSIZE];
+};
+
+/**
+ * struct tipc_media_addr - destination address used by TIPC bearers
+ * @value: address info (format defined by media)
+ * @media_id: TIPC media type identifier
+ * @broadcast: non-zero if address is a broadcast address
+ */
+struct tipc_media_addr {
+ u8 value[TIPC_MEDIA_INFO_SIZE];
+ u8 media_id;
+ u8 broadcast;
+};
+
+struct tipc_bearer;
+
+/**
+ * struct tipc_media - Media specific info exposed to generic bearer layer
+ * @send_msg: routine which handles buffer transmission
+ * @enable_media: routine which enables a media
+ * @disable_media: routine which disables a media
+ * @addr2str: convert media address format to string
+ * @addr2msg: convert from media addr format to discovery msg addr format
+ * @msg2addr: convert from discovery msg addr format to media addr format
+ * @raw2addr: convert from raw addr format to media addr format
+ * @priority: default link (and bearer) priority
+ * @tolerance: default time (in ms) before declaring link failure
+ * @window: default window (in packets) before declaring link congestion
+ * @type_id: TIPC media identifier
+ * @hwaddr_len: TIPC media address len
+ * @name: media name
+ */
+struct tipc_media {
+ int (*send_msg)(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *b_ptr,
+ struct tipc_media_addr *dest);
+ int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr,
+ struct nlattr *attr[]);
+ void (*disable_media)(struct tipc_bearer *b_ptr);
+ int (*addr2str)(struct tipc_media_addr *addr,
+ char *strbuf,
+ int bufsz);
+ int (*addr2msg)(char *msg, struct tipc_media_addr *addr);
+ int (*msg2addr)(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *msg);
+ int (*raw2addr)(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *raw);
+ u32 priority;
+ u32 tolerance;
+ u32 window;
+ u32 type_id;
+ u32 hwaddr_len;
+ char name[TIPC_MAX_MEDIA_NAME];
+};
+
+/**
+ * struct tipc_bearer - Generic TIPC bearer structure
+ * @media_ptr: pointer to additional media-specific information about bearer
+ * @mtu: max packet size bearer can support
+ * @addr: media-specific address associated with bearer
+ * @name: bearer name (format = media:interface)
+ * @media: ptr to media structure associated with bearer
+ * @bcast_addr: media address used in broadcasting
+ * @rcu: rcu struct for tipc_bearer
+ * @priority: default link priority for bearer
+ * @window: default window size for bearer
+ * @tolerance: default link tolerance for bearer
+ * @domain: network domain to which links can be established
+ * @identity: array index of this bearer within TIPC bearer array
+ * @link_req: ptr to (optional) structure making periodic link setup requests
+ * @net_plane: network plane ('A' through 'H') currently associated with bearer
+ * @nodes: indicates which nodes in cluster can be reached through bearer
+ *
+ * Note: media-specific code is responsible for initialization of the fields
+ * indicated below when a bearer is enabled; TIPC's generic bearer code takes
+ * care of initializing all other fields.
+ */
+struct tipc_bearer {
+ void __rcu *media_ptr; /* initalized by media */
+ u32 mtu; /* initalized by media */
+ struct tipc_media_addr addr; /* initalized by media */
+ char name[TIPC_MAX_BEARER_NAME];
+ struct tipc_media *media;
+ struct tipc_media_addr bcast_addr;
+ struct rcu_head rcu;
+ u32 priority;
+ u32 window;
+ u32 tolerance;
+ u32 domain;
+ u32 identity;
+ struct tipc_link_req *link_req;
+ char net_plane;
+ struct tipc_node_map nodes;
+};
+
+struct tipc_bearer_names {
+ char media_name[TIPC_MAX_MEDIA_NAME];
+ char if_name[TIPC_MAX_IF_NAME];
+};
+
+/*
+ * TIPC routines available to supported media types
+ */
+
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr);
+
+/*
+ * Routines made available to TIPC by supported media types
+ */
+extern struct tipc_media eth_media_info;
+
+#ifdef CONFIG_TIPC_MEDIA_IB
+extern struct tipc_media ib_media_info;
+#endif
+#ifdef CONFIG_TIPC_MEDIA_UDP
+extern struct tipc_media udp_media_info;
+#endif
+
+int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
+
+int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info);
+
+int tipc_media_set_priority(const char *name, u32 new_value);
+int tipc_media_set_window(const char *name, u32 new_value);
+void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a);
+int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
+ struct nlattr *attrs[]);
+void tipc_disable_l2_media(struct tipc_bearer *b);
+int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *b, struct tipc_media_addr *dest);
+
+void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest);
+void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest);
+struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
+struct tipc_media *tipc_media_find(const char *name);
+int tipc_bearer_setup(void);
+void tipc_bearer_cleanup(void);
+void tipc_bearer_stop(struct net *net);
+void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf,
+ struct tipc_media_addr *dest);
+
+#endif /* _TIPC_BEARER_H */
diff --git a/net/tipc/core.c b/net/tipc/core.c
new file mode 100644
index 000000000..be1c9fa60
--- /dev/null
+++ b/net/tipc/core.c
@@ -0,0 +1,169 @@
+/*
+ * net/tipc/core.c: TIPC module code
+ *
+ * Copyright (c) 2003-2006, 2013, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "core.h"
+#include "name_table.h"
+#include "subscr.h"
+#include "bearer.h"
+#include "net.h"
+#include "socket.h"
+
+#include <linux/module.h>
+
+/* configurable TIPC parameters */
+int tipc_net_id __read_mostly;
+int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */
+
+static int __net_init tipc_init_net(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ int err;
+
+ tn->net_id = 4711;
+ tn->own_addr = 0;
+ get_random_bytes(&tn->random, sizeof(int));
+ INIT_LIST_HEAD(&tn->node_list);
+ spin_lock_init(&tn->node_list_lock);
+
+ err = tipc_sk_rht_init(net);
+ if (err)
+ goto out_sk_rht;
+
+ err = tipc_nametbl_init(net);
+ if (err)
+ goto out_nametbl;
+
+ err = tipc_subscr_start(net);
+ if (err)
+ goto out_subscr;
+ return 0;
+
+out_subscr:
+ tipc_nametbl_stop(net);
+out_nametbl:
+ tipc_sk_rht_destroy(net);
+out_sk_rht:
+ return err;
+}
+
+static void __net_exit tipc_exit_net(struct net *net)
+{
+ tipc_subscr_stop(net);
+ tipc_net_stop(net);
+ tipc_nametbl_stop(net);
+ tipc_sk_rht_destroy(net);
+}
+
+static struct pernet_operations tipc_net_ops = {
+ .init = tipc_init_net,
+ .exit = tipc_exit_net,
+ .id = &tipc_net_id,
+ .size = sizeof(struct tipc_net),
+};
+
+static int __init tipc_init(void)
+{
+ int err;
+
+ pr_info("Activated (version " TIPC_MOD_VER ")\n");
+
+ sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 <<
+ TIPC_LOW_IMPORTANCE;
+ sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 <<
+ TIPC_CRITICAL_IMPORTANCE;
+ sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT;
+
+ err = tipc_netlink_start();
+ if (err)
+ goto out_netlink;
+
+ err = tipc_netlink_compat_start();
+ if (err)
+ goto out_netlink_compat;
+
+ err = tipc_socket_init();
+ if (err)
+ goto out_socket;
+
+ err = tipc_register_sysctl();
+ if (err)
+ goto out_sysctl;
+
+ err = register_pernet_subsys(&tipc_net_ops);
+ if (err)
+ goto out_pernet;
+
+ err = tipc_bearer_setup();
+ if (err)
+ goto out_bearer;
+
+ pr_info("Started in single node mode\n");
+ return 0;
+out_bearer:
+ unregister_pernet_subsys(&tipc_net_ops);
+out_pernet:
+ tipc_unregister_sysctl();
+out_sysctl:
+ tipc_socket_stop();
+out_socket:
+ tipc_netlink_compat_stop();
+out_netlink_compat:
+ tipc_netlink_stop();
+out_netlink:
+ pr_err("Unable to start in single node mode\n");
+ return err;
+}
+
+static void __exit tipc_exit(void)
+{
+ tipc_bearer_cleanup();
+ unregister_pernet_subsys(&tipc_net_ops);
+ tipc_netlink_stop();
+ tipc_netlink_compat_stop();
+ tipc_socket_stop();
+ tipc_unregister_sysctl();
+
+ pr_info("Deactivated\n");
+}
+
+module_init(tipc_init);
+module_exit(tipc_exit);
+
+MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(TIPC_MOD_VER);
diff --git a/net/tipc/core.h b/net/tipc/core.h
new file mode 100644
index 000000000..3dc68c7a9
--- /dev/null
+++ b/net/tipc/core.h
@@ -0,0 +1,116 @@
+/*
+ * net/tipc/core.h: Include file for TIPC global declarations
+ *
+ * Copyright (c) 2005-2006, 2013 Ericsson AB
+ * Copyright (c) 2005-2007, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CORE_H
+#define _TIPC_CORE_H
+
+#include <linux/tipc.h>
+#include <linux/tipc_config.h>
+#include <linux/tipc_netlink.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/atomic.h>
+#include <asm/hardirq.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <linux/etherdevice.h>
+#include <net/netns/generic.h>
+#include <linux/rhashtable.h>
+
+#include "node.h"
+#include "bearer.h"
+#include "bcast.h"
+#include "netlink.h"
+#include "link.h"
+#include "node.h"
+#include "msg.h"
+
+#define TIPC_MOD_VER "2.0.0"
+
+extern int tipc_net_id __read_mostly;
+extern int sysctl_tipc_rmem[3] __read_mostly;
+extern int sysctl_tipc_named_timeout __read_mostly;
+
+struct tipc_net {
+ u32 own_addr;
+ int net_id;
+ int random;
+
+ /* Node table and node list */
+ spinlock_t node_list_lock;
+ struct hlist_head node_htable[NODE_HTABLE_SIZE];
+ struct list_head node_list;
+ u32 num_nodes;
+ u32 num_links;
+
+ /* Bearer list */
+ struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1];
+
+ /* Broadcast link */
+ struct tipc_bcbearer *bcbearer;
+ struct tipc_bclink *bclink;
+ struct tipc_link *bcl;
+
+ /* Socket hash table */
+ struct rhashtable sk_rht;
+
+ /* Name table */
+ spinlock_t nametbl_lock;
+ struct name_table *nametbl;
+
+ /* Topology subscription server */
+ struct tipc_server *topsrv;
+ atomic_t subscription_count;
+};
+
+#ifdef CONFIG_SYSCTL
+int tipc_register_sysctl(void);
+void tipc_unregister_sysctl(void);
+#else
+#define tipc_register_sysctl() 0
+#define tipc_unregister_sysctl()
+#endif
+#endif
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
new file mode 100644
index 000000000..967e292f5
--- /dev/null
+++ b/net/tipc/discover.c
@@ -0,0 +1,418 @@
+/*
+ * net/tipc/discover.c
+ *
+ * Copyright (c) 2003-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "discover.h"
+
+/* min delay during bearer start up */
+#define TIPC_LINK_REQ_INIT msecs_to_jiffies(125)
+/* max delay if bearer has no links */
+#define TIPC_LINK_REQ_FAST msecs_to_jiffies(1000)
+/* max delay if bearer has links */
+#define TIPC_LINK_REQ_SLOW msecs_to_jiffies(60000)
+/* indicates no timer in use */
+#define TIPC_LINK_REQ_INACTIVE 0xffffffff
+
+/**
+ * struct tipc_link_req - information about an ongoing link setup request
+ * @bearer_id: identity of bearer issuing requests
+ * @net: network namespace instance
+ * @dest: destination address for request messages
+ * @domain: network domain to which links can be established
+ * @num_nodes: number of nodes currently discovered (i.e. with an active link)
+ * @lock: spinlock for controlling access to requests
+ * @buf: request message to be (repeatedly) sent
+ * @timer: timer governing period between requests
+ * @timer_intv: current interval between requests (in ms)
+ */
+struct tipc_link_req {
+ u32 bearer_id;
+ struct tipc_media_addr dest;
+ struct net *net;
+ u32 domain;
+ int num_nodes;
+ spinlock_t lock;
+ struct sk_buff *buf;
+ struct timer_list timer;
+ unsigned long timer_intv;
+};
+
+/**
+ * tipc_disc_init_msg - initialize a link setup message
+ * @net: the applicable net namespace
+ * @type: message type (request or response)
+ * @b_ptr: ptr to bearer issuing message
+ */
+static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type,
+ struct tipc_bearer *b_ptr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_msg *msg;
+ u32 dest_domain = b_ptr->domain;
+
+ msg = buf_msg(buf);
+ tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type,
+ MAX_H_SIZE, dest_domain);
+ msg_set_non_seq(msg, 1);
+ msg_set_node_sig(msg, tn->random);
+ msg_set_node_capabilities(msg, 0);
+ msg_set_dest_domain(msg, dest_domain);
+ msg_set_bc_netid(msg, tn->net_id);
+ b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr);
+}
+
+/**
+ * disc_dupl_alert - issue node address duplication alert
+ * @b_ptr: pointer to bearer detecting duplication
+ * @node_addr: duplicated node address
+ * @media_addr: media address advertised by duplicated node
+ */
+static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
+ struct tipc_media_addr *media_addr)
+{
+ char node_addr_str[16];
+ char media_addr_str[64];
+
+ tipc_addr_string_fill(node_addr_str, node_addr);
+ tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str),
+ media_addr);
+ pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str,
+ media_addr_str, b_ptr->name);
+}
+
+/**
+ * tipc_disc_rcv - handle incoming discovery message (request or response)
+ * @net: the applicable net namespace
+ * @buf: buffer containing message
+ * @bearer: bearer that message arrived on
+ */
+void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *bearer)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *node;
+ struct tipc_link *link;
+ struct tipc_media_addr maddr;
+ struct sk_buff *rbuf;
+ struct tipc_msg *msg = buf_msg(buf);
+ u32 ddom = msg_dest_domain(msg);
+ u32 onode = msg_prevnode(msg);
+ u32 net_id = msg_bc_netid(msg);
+ u32 mtyp = msg_type(msg);
+ u32 signature = msg_node_sig(msg);
+ u16 caps = msg_node_capabilities(msg);
+ bool addr_match = false;
+ bool sign_match = false;
+ bool link_up = false;
+ bool accept_addr = false;
+ bool accept_sign = false;
+ bool respond = false;
+
+ bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg));
+ kfree_skb(buf);
+
+ /* Ensure message from node is valid and communication is permitted */
+ if (net_id != tn->net_id)
+ return;
+ if (maddr.broadcast)
+ return;
+ if (!tipc_addr_domain_valid(ddom))
+ return;
+ if (!tipc_addr_node_valid(onode))
+ return;
+
+ if (in_own_node(net, onode)) {
+ if (memcmp(&maddr, &bearer->addr, sizeof(maddr)))
+ disc_dupl_alert(bearer, tn->own_addr, &maddr);
+ return;
+ }
+ if (!tipc_in_scope(ddom, tn->own_addr))
+ return;
+ if (!tipc_in_scope(bearer->domain, onode))
+ return;
+
+ node = tipc_node_create(net, onode);
+ if (!node)
+ return;
+ tipc_node_lock(node);
+ node->capabilities = caps;
+ link = node->links[bearer->identity];
+
+ /* Prepare to validate requesting node's signature and media address */
+ sign_match = (signature == node->signature);
+ addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr));
+ link_up = link && tipc_link_is_up(link);
+
+
+ /* These three flags give us eight permutations: */
+
+ if (sign_match && addr_match && link_up) {
+ /* All is fine. Do nothing. */
+ } else if (sign_match && addr_match && !link_up) {
+ /* Respond. The link will come up in due time */
+ respond = true;
+ } else if (sign_match && !addr_match && link_up) {
+ /* Peer has changed i/f address without rebooting.
+ * If so, the link will reset soon, and the next
+ * discovery will be accepted. So we can ignore it.
+ * It may also be an cloned or malicious peer having
+ * chosen the same node address and signature as an
+ * existing one.
+ * Ignore requests until the link goes down, if ever.
+ */
+ disc_dupl_alert(bearer, onode, &maddr);
+ } else if (sign_match && !addr_match && !link_up) {
+ /* Peer link has changed i/f address without rebooting.
+ * It may also be a cloned or malicious peer; we can't
+ * distinguish between the two.
+ * The signature is correct, so we must accept.
+ */
+ accept_addr = true;
+ respond = true;
+ } else if (!sign_match && addr_match && link_up) {
+ /* Peer node rebooted. Two possibilities:
+ * - Delayed re-discovery; this link endpoint has already
+ * reset and re-established contact with the peer, before
+ * receiving a discovery message from that node.
+ * (The peer happened to receive one from this node first).
+ * - The peer came back so fast that our side has not
+ * discovered it yet. Probing from this side will soon
+ * reset the link, since there can be no working link
+ * endpoint at the peer end, and the link will re-establish.
+ * Accept the signature, since it comes from a known peer.
+ */
+ accept_sign = true;
+ } else if (!sign_match && addr_match && !link_up) {
+ /* The peer node has rebooted.
+ * Accept signature, since it is a known peer.
+ */
+ accept_sign = true;
+ respond = true;
+ } else if (!sign_match && !addr_match && link_up) {
+ /* Peer rebooted with new address, or a new/duplicate peer.
+ * Ignore until the link goes down, if ever.
+ */
+ disc_dupl_alert(bearer, onode, &maddr);
+ } else if (!sign_match && !addr_match && !link_up) {
+ /* Peer rebooted with new address, or it is a new peer.
+ * Accept signature and address.
+ */
+ accept_sign = true;
+ accept_addr = true;
+ respond = true;
+ }
+
+ if (accept_sign)
+ node->signature = signature;
+
+ if (accept_addr) {
+ if (!link)
+ link = tipc_link_create(node, bearer, &maddr);
+ if (link) {
+ memcpy(&link->media_addr, &maddr, sizeof(maddr));
+ tipc_link_reset(link);
+ } else {
+ respond = false;
+ }
+ }
+
+ /* Send response, if necessary */
+ if (respond && (mtyp == DSC_REQ_MSG)) {
+ rbuf = tipc_buf_acquire(MAX_H_SIZE);
+ if (rbuf) {
+ tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer);
+ tipc_bearer_send(net, bearer->identity, rbuf, &maddr);
+ kfree_skb(rbuf);
+ }
+ }
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+}
+
+/**
+ * disc_update - update frequency of periodic link setup requests
+ * @req: ptr to link request structure
+ *
+ * Reinitiates discovery process if discovery object has no associated nodes
+ * and is either not currently searching or is searching at a slow rate
+ */
+static void disc_update(struct tipc_link_req *req)
+{
+ if (!req->num_nodes) {
+ if ((req->timer_intv == TIPC_LINK_REQ_INACTIVE) ||
+ (req->timer_intv > TIPC_LINK_REQ_FAST)) {
+ req->timer_intv = TIPC_LINK_REQ_INIT;
+ mod_timer(&req->timer, jiffies + req->timer_intv);
+ }
+ }
+}
+
+/**
+ * tipc_disc_add_dest - increment set of discovered nodes
+ * @req: ptr to link request structure
+ */
+void tipc_disc_add_dest(struct tipc_link_req *req)
+{
+ spin_lock_bh(&req->lock);
+ req->num_nodes++;
+ spin_unlock_bh(&req->lock);
+}
+
+/**
+ * tipc_disc_remove_dest - decrement set of discovered nodes
+ * @req: ptr to link request structure
+ */
+void tipc_disc_remove_dest(struct tipc_link_req *req)
+{
+ spin_lock_bh(&req->lock);
+ req->num_nodes--;
+ disc_update(req);
+ spin_unlock_bh(&req->lock);
+}
+
+/**
+ * disc_timeout - send a periodic link setup request
+ * @data: ptr to link request structure
+ *
+ * Called whenever a link setup request timer associated with a bearer expires.
+ */
+static void disc_timeout(unsigned long data)
+{
+ struct tipc_link_req *req = (struct tipc_link_req *)data;
+ int max_delay;
+
+ spin_lock_bh(&req->lock);
+
+ /* Stop searching if only desired node has been found */
+ if (tipc_node(req->domain) && req->num_nodes) {
+ req->timer_intv = TIPC_LINK_REQ_INACTIVE;
+ goto exit;
+ }
+
+ /*
+ * Send discovery message, then update discovery timer
+ *
+ * Keep doubling time between requests until limit is reached;
+ * hold at fast polling rate if don't have any associated nodes,
+ * otherwise hold at slow polling rate
+ */
+ tipc_bearer_send(req->net, req->bearer_id, req->buf, &req->dest);
+
+
+ req->timer_intv *= 2;
+ if (req->num_nodes)
+ max_delay = TIPC_LINK_REQ_SLOW;
+ else
+ max_delay = TIPC_LINK_REQ_FAST;
+ if (req->timer_intv > max_delay)
+ req->timer_intv = max_delay;
+
+ mod_timer(&req->timer, jiffies + req->timer_intv);
+exit:
+ spin_unlock_bh(&req->lock);
+}
+
+/**
+ * tipc_disc_create - create object to send periodic link setup requests
+ * @net: the applicable net namespace
+ * @b_ptr: ptr to bearer issuing requests
+ * @dest: destination address for request messages
+ * @dest_domain: network domain to which links can be established
+ *
+ * Returns 0 if successful, otherwise -errno.
+ */
+int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
+ struct tipc_media_addr *dest)
+{
+ struct tipc_link_req *req;
+
+ req = kmalloc(sizeof(*req), GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
+ req->buf = tipc_buf_acquire(MAX_H_SIZE);
+ if (!req->buf) {
+ kfree(req);
+ return -ENOMEM;
+ }
+
+ tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr);
+ memcpy(&req->dest, dest, sizeof(*dest));
+ req->net = net;
+ req->bearer_id = b_ptr->identity;
+ req->domain = b_ptr->domain;
+ req->num_nodes = 0;
+ req->timer_intv = TIPC_LINK_REQ_INIT;
+ spin_lock_init(&req->lock);
+ setup_timer(&req->timer, disc_timeout, (unsigned long)req);
+ mod_timer(&req->timer, jiffies + req->timer_intv);
+ b_ptr->link_req = req;
+ tipc_bearer_send(net, req->bearer_id, req->buf, &req->dest);
+ return 0;
+}
+
+/**
+ * tipc_disc_delete - destroy object sending periodic link setup requests
+ * @req: ptr to link request structure
+ */
+void tipc_disc_delete(struct tipc_link_req *req)
+{
+ del_timer_sync(&req->timer);
+ kfree_skb(req->buf);
+ kfree(req);
+}
+
+/**
+ * tipc_disc_reset - reset object to send periodic link setup requests
+ * @net: the applicable net namespace
+ * @b_ptr: ptr to bearer issuing requests
+ * @dest_domain: network domain to which links can be established
+ */
+void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr)
+{
+ struct tipc_link_req *req = b_ptr->link_req;
+
+ spin_lock_bh(&req->lock);
+ tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr);
+ req->net = net;
+ req->bearer_id = b_ptr->identity;
+ req->domain = b_ptr->domain;
+ req->num_nodes = 0;
+ req->timer_intv = TIPC_LINK_REQ_INIT;
+ mod_timer(&req->timer, jiffies + req->timer_intv);
+ tipc_bearer_send(net, req->bearer_id, req->buf, &req->dest);
+ spin_unlock_bh(&req->lock);
+}
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
new file mode 100644
index 000000000..c9b12770c
--- /dev/null
+++ b/net/tipc/discover.h
@@ -0,0 +1,51 @@
+/*
+ * net/tipc/discover.h
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_DISCOVER_H
+#define _TIPC_DISCOVER_H
+
+struct tipc_link_req;
+
+int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
+ struct tipc_media_addr *dest);
+void tipc_disc_delete(struct tipc_link_req *req);
+void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr);
+void tipc_disc_add_dest(struct tipc_link_req *req);
+void tipc_disc_remove_dest(struct tipc_link_req *req);
+void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
+ struct tipc_bearer *b_ptr);
+
+#endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
new file mode 100644
index 000000000..f69a2fde9
--- /dev/null
+++ b/net/tipc/eth_media.c
@@ -0,0 +1,99 @@
+/*
+ * net/tipc/eth_media.c: Ethernet bearer support for TIPC
+ *
+ * Copyright (c) 2001-2007, 2013-2014, Ericsson AB
+ * Copyright (c) 2005-2008, 2011-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "bearer.h"
+
+/* Convert Ethernet address (media address format) to string */
+static int tipc_eth_addr2str(struct tipc_media_addr *addr,
+ char *strbuf, int bufsz)
+{
+ if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */
+ return 1;
+
+ sprintf(strbuf, "%pM", addr->value);
+ return 0;
+}
+
+/* Convert from media address format to discovery message addr format */
+static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr)
+{
+ memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
+ msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH;
+ memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN);
+ return 0;
+}
+
+/* Convert raw mac address format to media addr format */
+static int tipc_eth_raw2addr(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *msg)
+{
+ char bcast_mac[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+ memset(addr, 0, sizeof(*addr));
+ ether_addr_copy(addr->value, msg);
+ addr->media_id = TIPC_MEDIA_TYPE_ETH;
+ addr->broadcast = !memcmp(addr->value, bcast_mac, ETH_ALEN);
+ return 0;
+}
+
+/* Convert discovery msg addr format to Ethernet media addr format */
+static int tipc_eth_msg2addr(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *msg)
+{
+ /* Skip past preamble: */
+ msg += TIPC_MEDIA_ADDR_OFFSET;
+ return tipc_eth_raw2addr(b, addr, msg);
+}
+
+/* Ethernet media registration info */
+struct tipc_media eth_media_info = {
+ .send_msg = tipc_l2_send_msg,
+ .enable_media = tipc_enable_l2_media,
+ .disable_media = tipc_disable_l2_media,
+ .addr2str = tipc_eth_addr2str,
+ .addr2msg = tipc_eth_addr2msg,
+ .msg2addr = tipc_eth_msg2addr,
+ .raw2addr = tipc_eth_raw2addr,
+ .priority = TIPC_DEF_LINK_PRI,
+ .tolerance = TIPC_DEF_LINK_TOL,
+ .window = TIPC_DEF_LINK_WIN,
+ .type_id = TIPC_MEDIA_TYPE_ETH,
+ .hwaddr_len = ETH_ALEN,
+ .name = "eth"
+};
diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c
new file mode 100644
index 000000000..e8c16718e
--- /dev/null
+++ b/net/tipc/ib_media.c
@@ -0,0 +1,101 @@
+/*
+ * net/tipc/ib_media.c: Infiniband bearer support for TIPC
+ *
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * Based on eth_media.c, which carries the following copyright notice:
+ *
+ * Copyright (c) 2001-2007, Ericsson AB
+ * Copyright (c) 2005-2008, 2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/if_infiniband.h>
+#include "core.h"
+#include "bearer.h"
+
+/* convert InfiniBand address (media address format) media address to string */
+static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf,
+ int str_size)
+{
+ if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */
+ return 1;
+
+ sprintf(str_buf, "%20phC", a->value);
+
+ return 0;
+}
+
+/* Convert from media address format to discovery message addr format */
+static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr)
+{
+ memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
+ memcpy(msg, addr->value, INFINIBAND_ALEN);
+ return 0;
+}
+
+/* Convert raw InfiniBand address format to media addr format */
+static int tipc_ib_raw2addr(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *msg)
+{
+ memset(addr, 0, sizeof(*addr));
+ memcpy(addr->value, msg, INFINIBAND_ALEN);
+ addr->media_id = TIPC_MEDIA_TYPE_IB;
+ addr->broadcast = !memcmp(msg, b->bcast_addr.value,
+ INFINIBAND_ALEN);
+ return 0;
+}
+
+/* Convert discovery msg addr format to InfiniBand media addr format */
+static int tipc_ib_msg2addr(struct tipc_bearer *b,
+ struct tipc_media_addr *addr,
+ char *msg)
+{
+ return tipc_ib_raw2addr(b, addr, msg);
+}
+
+/* InfiniBand media registration info */
+struct tipc_media ib_media_info = {
+ .send_msg = tipc_l2_send_msg,
+ .enable_media = tipc_enable_l2_media,
+ .disable_media = tipc_disable_l2_media,
+ .addr2str = tipc_ib_addr2str,
+ .addr2msg = tipc_ib_addr2msg,
+ .msg2addr = tipc_ib_msg2addr,
+ .raw2addr = tipc_ib_raw2addr,
+ .priority = TIPC_DEF_LINK_PRI,
+ .tolerance = TIPC_DEF_LINK_TOL,
+ .window = TIPC_DEF_LINK_WIN,
+ .type_id = TIPC_MEDIA_TYPE_IB,
+ .hwaddr_len = INFINIBAND_ALEN,
+ .name = "ib"
+};
diff --git a/net/tipc/link.c b/net/tipc/link.c
new file mode 100644
index 000000000..43a515dc9
--- /dev/null
+++ b/net/tipc/link.c
@@ -0,0 +1,2272 @@
+/*
+ * net/tipc/link.c: TIPC link code
+ *
+ * Copyright (c) 1996-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 2004-2007, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "subscr.h"
+#include "link.h"
+#include "bcast.h"
+#include "socket.h"
+#include "name_distr.h"
+#include "discover.h"
+#include "netlink.h"
+
+#include <linux/pkt_sched.h>
+
+/*
+ * Error message prefixes
+ */
+static const char *link_co_err = "Link changeover error, ";
+static const char *link_rst_msg = "Resetting link ";
+static const char *link_unk_evt = "Unknown link event ";
+
+static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
+ [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_LINK_NAME] = {
+ .type = NLA_STRING,
+ .len = TIPC_MAX_LINK_NAME
+ },
+ [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_RX] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_TX] = { .type = NLA_U32 }
+};
+
+/* Properties valid for media, bearar and link */
+static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
+ [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
+};
+
+/*
+ * Out-of-range value for link session numbers
+ */
+#define INVALID_SESSION 0x10000
+
+/*
+ * Link state events:
+ */
+#define STARTING_EVT 856384768 /* link processing trigger */
+#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */
+#define TIMEOUT_EVT 560817u /* link timer expired */
+
+/*
+ * State value stored in 'failover_pkts'
+ */
+#define FIRST_FAILOVER 0xffffu
+
+static void link_handle_out_of_seq_msg(struct tipc_link *link,
+ struct sk_buff *skb);
+static void tipc_link_proto_rcv(struct tipc_link *link,
+ struct sk_buff *skb);
+static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol);
+static void link_state_event(struct tipc_link *l_ptr, u32 event);
+static void link_reset_statistics(struct tipc_link *l_ptr);
+static void link_print(struct tipc_link *l_ptr, const char *str);
+static void tipc_link_sync_xmit(struct tipc_link *l);
+static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf);
+static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb);
+static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb);
+static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb);
+/*
+ * Simple link routines
+ */
+static unsigned int align(unsigned int i)
+{
+ return (i + 3) & ~3u;
+}
+
+static void tipc_link_release(struct kref *kref)
+{
+ kfree(container_of(kref, struct tipc_link, ref));
+}
+
+static void tipc_link_get(struct tipc_link *l_ptr)
+{
+ kref_get(&l_ptr->ref);
+}
+
+static void tipc_link_put(struct tipc_link *l_ptr)
+{
+ kref_put(&l_ptr->ref, tipc_link_release);
+}
+
+static struct tipc_link *tipc_parallel_link(struct tipc_link *l)
+{
+ if (l->owner->active_links[0] != l)
+ return l->owner->active_links[0];
+ return l->owner->active_links[1];
+}
+
+/*
+ * Simple non-static link routines (i.e. referenced outside this file)
+ */
+int tipc_link_is_up(struct tipc_link *l_ptr)
+{
+ if (!l_ptr)
+ return 0;
+ return link_working_working(l_ptr) || link_working_unknown(l_ptr);
+}
+
+int tipc_link_is_active(struct tipc_link *l_ptr)
+{
+ return (l_ptr->owner->active_links[0] == l_ptr) ||
+ (l_ptr->owner->active_links[1] == l_ptr);
+}
+
+/**
+ * link_timeout - handle expiration of link timer
+ * @l_ptr: pointer to link
+ */
+static void link_timeout(unsigned long data)
+{
+ struct tipc_link *l_ptr = (struct tipc_link *)data;
+ struct sk_buff *skb;
+
+ tipc_node_lock(l_ptr->owner);
+
+ /* update counters used in statistical profiling of send traffic */
+ l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq);
+ l_ptr->stats.queue_sz_counts++;
+
+ skb = skb_peek(&l_ptr->transmq);
+ if (skb) {
+ struct tipc_msg *msg = buf_msg(skb);
+ u32 length = msg_size(msg);
+
+ if ((msg_user(msg) == MSG_FRAGMENTER) &&
+ (msg_type(msg) == FIRST_FRAGMENT)) {
+ length = msg_size(msg_get_wrapped(msg));
+ }
+ if (length) {
+ l_ptr->stats.msg_lengths_total += length;
+ l_ptr->stats.msg_length_counts++;
+ if (length <= 64)
+ l_ptr->stats.msg_length_profile[0]++;
+ else if (length <= 256)
+ l_ptr->stats.msg_length_profile[1]++;
+ else if (length <= 1024)
+ l_ptr->stats.msg_length_profile[2]++;
+ else if (length <= 4096)
+ l_ptr->stats.msg_length_profile[3]++;
+ else if (length <= 16384)
+ l_ptr->stats.msg_length_profile[4]++;
+ else if (length <= 32768)
+ l_ptr->stats.msg_length_profile[5]++;
+ else
+ l_ptr->stats.msg_length_profile[6]++;
+ }
+ }
+
+ /* do all other link processing performed on a periodic basis */
+ link_state_event(l_ptr, TIMEOUT_EVT);
+
+ if (skb_queue_len(&l_ptr->backlogq))
+ tipc_link_push_packets(l_ptr);
+
+ tipc_node_unlock(l_ptr->owner);
+ tipc_link_put(l_ptr);
+}
+
+static void link_set_timer(struct tipc_link *link, unsigned long time)
+{
+ if (!mod_timer(&link->timer, jiffies + time))
+ tipc_link_get(link);
+}
+
+/**
+ * tipc_link_create - create a new link
+ * @n_ptr: pointer to associated node
+ * @b_ptr: pointer to associated bearer
+ * @media_addr: media address to use when sending messages over link
+ *
+ * Returns pointer to link.
+ */
+struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
+ struct tipc_bearer *b_ptr,
+ const struct tipc_media_addr *media_addr)
+{
+ struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
+ struct tipc_link *l_ptr;
+ struct tipc_msg *msg;
+ char *if_name;
+ char addr_string[16];
+ u32 peer = n_ptr->addr;
+
+ if (n_ptr->link_cnt >= MAX_BEARERS) {
+ tipc_addr_string_fill(addr_string, n_ptr->addr);
+ pr_err("Attempt to establish %uth link to %s. Max %u allowed.\n",
+ n_ptr->link_cnt, addr_string, MAX_BEARERS);
+ return NULL;
+ }
+
+ if (n_ptr->links[b_ptr->identity]) {
+ tipc_addr_string_fill(addr_string, n_ptr->addr);
+ pr_err("Attempt to establish second link on <%s> to %s\n",
+ b_ptr->name, addr_string);
+ return NULL;
+ }
+
+ l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC);
+ if (!l_ptr) {
+ pr_warn("Link creation failed, no memory\n");
+ return NULL;
+ }
+ kref_init(&l_ptr->ref);
+ l_ptr->addr = peer;
+ if_name = strchr(b_ptr->name, ':') + 1;
+ sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
+ tipc_zone(tn->own_addr), tipc_cluster(tn->own_addr),
+ tipc_node(tn->own_addr),
+ if_name,
+ tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+ /* note: peer i/f name is updated by reset/activate message */
+ memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
+ l_ptr->owner = n_ptr;
+ l_ptr->checkpoint = 1;
+ l_ptr->peer_session = INVALID_SESSION;
+ l_ptr->bearer_id = b_ptr->identity;
+ link_set_supervision_props(l_ptr, b_ptr->tolerance);
+ l_ptr->state = RESET_UNKNOWN;
+
+ l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
+ msg = l_ptr->pmsg;
+ tipc_msg_init(tn->own_addr, msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE,
+ l_ptr->addr);
+ msg_set_size(msg, sizeof(l_ptr->proto_msg));
+ msg_set_session(msg, (tn->random & 0xffff));
+ msg_set_bearer_id(msg, b_ptr->identity);
+ strcpy((char *)msg_data(msg), if_name);
+ l_ptr->net_plane = b_ptr->net_plane;
+ l_ptr->advertised_mtu = b_ptr->mtu;
+ l_ptr->mtu = l_ptr->advertised_mtu;
+ l_ptr->priority = b_ptr->priority;
+ tipc_link_set_queue_limits(l_ptr, b_ptr->window);
+ l_ptr->next_out_no = 1;
+ __skb_queue_head_init(&l_ptr->transmq);
+ __skb_queue_head_init(&l_ptr->backlogq);
+ __skb_queue_head_init(&l_ptr->deferdq);
+ skb_queue_head_init(&l_ptr->wakeupq);
+ skb_queue_head_init(&l_ptr->inputq);
+ skb_queue_head_init(&l_ptr->namedq);
+ link_reset_statistics(l_ptr);
+ tipc_node_attach_link(n_ptr, l_ptr);
+ setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr);
+ link_state_event(l_ptr, STARTING_EVT);
+
+ return l_ptr;
+}
+
+/**
+ * tipc_link_delete - Delete a link
+ * @l: link to be deleted
+ */
+void tipc_link_delete(struct tipc_link *l)
+{
+ tipc_link_reset(l);
+ if (del_timer(&l->timer))
+ tipc_link_put(l);
+ l->flags |= LINK_STOPPED;
+ /* Delete link now, or when timer is finished: */
+ tipc_link_reset_fragments(l);
+ tipc_node_detach_link(l->owner, l);
+ tipc_link_put(l);
+}
+
+void tipc_link_delete_list(struct net *net, unsigned int bearer_id,
+ bool shutting_down)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *link;
+ struct tipc_node *node;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(node, &tn->node_list, list) {
+ tipc_node_lock(node);
+ link = node->links[bearer_id];
+ if (link)
+ tipc_link_delete(link);
+ tipc_node_unlock(node);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * link_schedule_user - schedule a message sender for wakeup after congestion
+ * @link: congested link
+ * @list: message that was attempted sent
+ * Create pseudo msg to send back to user when congestion abates
+ * Only consumes message if there is an error
+ */
+static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
+{
+ struct tipc_msg *msg = buf_msg(skb_peek(list));
+ int imp = msg_importance(msg);
+ u32 oport = msg_origport(msg);
+ u32 addr = link_own_addr(link);
+ struct sk_buff *skb;
+
+ /* This really cannot happen... */
+ if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
+ pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
+ tipc_link_reset(link);
+ goto err;
+ }
+ /* Non-blocking sender: */
+ if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
+ return -ELINKCONG;
+
+ /* Create and schedule wakeup pseudo message */
+ skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
+ addr, addr, oport, 0, 0);
+ if (!skb)
+ goto err;
+ TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
+ TIPC_SKB_CB(skb)->chain_imp = imp;
+ skb_queue_tail(&link->wakeupq, skb);
+ link->stats.link_congs++;
+ return -ELINKCONG;
+err:
+ __skb_queue_purge(list);
+ return -ENOBUFS;
+}
+
+/**
+ * link_prepare_wakeup - prepare users for wakeup after congestion
+ * @link: congested link
+ * Move a number of waiting users, as permitted by available space in
+ * the send queue, from link wait queue to node wait queue for wakeup
+ */
+void link_prepare_wakeup(struct tipc_link *l)
+{
+ int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
+ int imp, lim;
+ struct sk_buff *skb, *tmp;
+
+ skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
+ imp = TIPC_SKB_CB(skb)->chain_imp;
+ lim = l->window + l->backlog[imp].limit;
+ pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
+ if ((pnd[imp] + l->backlog[imp].len) >= lim)
+ break;
+ skb_unlink(skb, &l->wakeupq);
+ skb_queue_tail(&l->inputq, skb);
+ l->owner->inputq = &l->inputq;
+ l->owner->action_flags |= TIPC_MSG_EVT;
+ }
+}
+
+/**
+ * tipc_link_reset_fragments - purge link's inbound message fragments queue
+ * @l_ptr: pointer to link
+ */
+void tipc_link_reset_fragments(struct tipc_link *l_ptr)
+{
+ kfree_skb(l_ptr->reasm_buf);
+ l_ptr->reasm_buf = NULL;
+}
+
+static void tipc_link_purge_backlog(struct tipc_link *l)
+{
+ __skb_queue_purge(&l->backlogq);
+ l->backlog[TIPC_LOW_IMPORTANCE].len = 0;
+ l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0;
+ l->backlog[TIPC_HIGH_IMPORTANCE].len = 0;
+ l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0;
+ l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0;
+}
+
+/**
+ * tipc_link_purge_queues - purge all pkt queues associated with link
+ * @l_ptr: pointer to link
+ */
+void tipc_link_purge_queues(struct tipc_link *l_ptr)
+{
+ __skb_queue_purge(&l_ptr->deferdq);
+ __skb_queue_purge(&l_ptr->transmq);
+ tipc_link_purge_backlog(l_ptr);
+ tipc_link_reset_fragments(l_ptr);
+}
+
+void tipc_link_reset(struct tipc_link *l_ptr)
+{
+ u32 prev_state = l_ptr->state;
+ int was_active_link = tipc_link_is_active(l_ptr);
+ struct tipc_node *owner = l_ptr->owner;
+ struct tipc_link *pl = tipc_parallel_link(l_ptr);
+
+ msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff));
+
+ /* Link is down, accept any session */
+ l_ptr->peer_session = INVALID_SESSION;
+
+ /* Prepare for renewed mtu size negotiation */
+ l_ptr->mtu = l_ptr->advertised_mtu;
+
+ l_ptr->state = RESET_UNKNOWN;
+
+ if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
+ return;
+
+ tipc_node_link_down(l_ptr->owner, l_ptr);
+ tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr);
+
+ if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) {
+ l_ptr->flags |= LINK_FAILINGOVER;
+ l_ptr->failover_checkpt = l_ptr->next_in_no;
+ pl->failover_pkts = FIRST_FAILOVER;
+ pl->failover_checkpt = l_ptr->next_in_no;
+ pl->failover_skb = l_ptr->reasm_buf;
+ } else {
+ kfree_skb(l_ptr->reasm_buf);
+ }
+ /* Clean up all queues, except inputq: */
+ __skb_queue_purge(&l_ptr->transmq);
+ __skb_queue_purge(&l_ptr->deferdq);
+ if (!owner->inputq)
+ owner->inputq = &l_ptr->inputq;
+ skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq);
+ if (!skb_queue_empty(owner->inputq))
+ owner->action_flags |= TIPC_MSG_EVT;
+ tipc_link_purge_backlog(l_ptr);
+ l_ptr->reasm_buf = NULL;
+ l_ptr->rcv_unacked = 0;
+ l_ptr->checkpoint = 1;
+ l_ptr->next_out_no = 1;
+ l_ptr->fsm_msg_cnt = 0;
+ l_ptr->stale_count = 0;
+ link_reset_statistics(l_ptr);
+}
+
+void tipc_link_reset_list(struct net *net, unsigned int bearer_id)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *l_ptr;
+ struct tipc_node *n_ptr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
+ tipc_node_lock(n_ptr);
+ l_ptr = n_ptr->links[bearer_id];
+ if (l_ptr)
+ tipc_link_reset(l_ptr);
+ tipc_node_unlock(n_ptr);
+ }
+ rcu_read_unlock();
+}
+
+static void link_activate(struct tipc_link *link)
+{
+ struct tipc_node *node = link->owner;
+
+ link->next_in_no = 1;
+ link->stats.recv_info = 1;
+ tipc_node_link_up(node, link);
+ tipc_bearer_add_dest(node->net, link->bearer_id, link->addr);
+}
+
+/**
+ * link_state_event - link finite state machine
+ * @l_ptr: pointer to link
+ * @event: state machine event to process
+ */
+static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
+{
+ struct tipc_link *other;
+ unsigned long cont_intv = l_ptr->cont_intv;
+
+ if (l_ptr->flags & LINK_STOPPED)
+ return;
+
+ if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT))
+ return; /* Not yet. */
+
+ if (l_ptr->flags & LINK_FAILINGOVER) {
+ if (event == TIMEOUT_EVT)
+ link_set_timer(l_ptr, cont_intv);
+ return;
+ }
+
+ switch (l_ptr->state) {
+ case WORKING_WORKING:
+ switch (event) {
+ case TRAFFIC_MSG_EVT:
+ case ACTIVATE_MSG:
+ break;
+ case TIMEOUT_EVT:
+ if (l_ptr->next_in_no != l_ptr->checkpoint) {
+ l_ptr->checkpoint = l_ptr->next_in_no;
+ if (tipc_bclink_acks_missing(l_ptr->owner)) {
+ tipc_link_proto_xmit(l_ptr, STATE_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ }
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ }
+ l_ptr->state = WORKING_UNKNOWN;
+ l_ptr->fsm_msg_cnt = 0;
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv / 4);
+ break;
+ case RESET_MSG:
+ pr_debug("%s<%s>, requested by peer\n",
+ link_rst_msg, l_ptr->name);
+ tipc_link_reset(l_ptr);
+ l_ptr->state = RESET_RESET;
+ l_ptr->fsm_msg_cnt = 0;
+ tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ default:
+ pr_debug("%s%u in WW state\n", link_unk_evt, event);
+ }
+ break;
+ case WORKING_UNKNOWN:
+ switch (event) {
+ case TRAFFIC_MSG_EVT:
+ case ACTIVATE_MSG:
+ l_ptr->state = WORKING_WORKING;
+ l_ptr->fsm_msg_cnt = 0;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case RESET_MSG:
+ pr_debug("%s<%s>, requested by peer while probing\n",
+ link_rst_msg, l_ptr->name);
+ tipc_link_reset(l_ptr);
+ l_ptr->state = RESET_RESET;
+ l_ptr->fsm_msg_cnt = 0;
+ tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case TIMEOUT_EVT:
+ if (l_ptr->next_in_no != l_ptr->checkpoint) {
+ l_ptr->state = WORKING_WORKING;
+ l_ptr->fsm_msg_cnt = 0;
+ l_ptr->checkpoint = l_ptr->next_in_no;
+ if (tipc_bclink_acks_missing(l_ptr->owner)) {
+ tipc_link_proto_xmit(l_ptr, STATE_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ }
+ link_set_timer(l_ptr, cont_intv);
+ } else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) {
+ tipc_link_proto_xmit(l_ptr, STATE_MSG,
+ 1, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv / 4);
+ } else { /* Link has failed */
+ pr_debug("%s<%s>, peer not responding\n",
+ link_rst_msg, l_ptr->name);
+ tipc_link_reset(l_ptr);
+ l_ptr->state = RESET_UNKNOWN;
+ l_ptr->fsm_msg_cnt = 0;
+ tipc_link_proto_xmit(l_ptr, RESET_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ }
+ break;
+ default:
+ pr_err("%s%u in WU state\n", link_unk_evt, event);
+ }
+ break;
+ case RESET_UNKNOWN:
+ switch (event) {
+ case TRAFFIC_MSG_EVT:
+ break;
+ case ACTIVATE_MSG:
+ other = l_ptr->owner->active_links[0];
+ if (other && link_working_unknown(other))
+ break;
+ l_ptr->state = WORKING_WORKING;
+ l_ptr->fsm_msg_cnt = 0;
+ link_activate(l_ptr);
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ if (l_ptr->owner->working_links == 1)
+ tipc_link_sync_xmit(l_ptr);
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case RESET_MSG:
+ l_ptr->state = RESET_RESET;
+ l_ptr->fsm_msg_cnt = 0;
+ tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
+ 1, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case STARTING_EVT:
+ l_ptr->flags |= LINK_STARTED;
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case TIMEOUT_EVT:
+ tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ default:
+ pr_err("%s%u in RU state\n", link_unk_evt, event);
+ }
+ break;
+ case RESET_RESET:
+ switch (event) {
+ case TRAFFIC_MSG_EVT:
+ case ACTIVATE_MSG:
+ other = l_ptr->owner->active_links[0];
+ if (other && link_working_unknown(other))
+ break;
+ l_ptr->state = WORKING_WORKING;
+ l_ptr->fsm_msg_cnt = 0;
+ link_activate(l_ptr);
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ if (l_ptr->owner->working_links == 1)
+ tipc_link_sync_xmit(l_ptr);
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ case RESET_MSG:
+ break;
+ case TIMEOUT_EVT:
+ tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG,
+ 0, 0, 0, 0);
+ l_ptr->fsm_msg_cnt++;
+ link_set_timer(l_ptr, cont_intv);
+ break;
+ default:
+ pr_err("%s%u in RR state\n", link_unk_evt, event);
+ }
+ break;
+ default:
+ pr_err("Unknown link state %u/%u\n", l_ptr->state, event);
+ }
+}
+
+/**
+ * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked
+ * @link: link to use
+ * @list: chain of buffers containing message
+ *
+ * Consumes the buffer chain, except when returning -ELINKCONG,
+ * since the caller then may want to make more send attempts.
+ * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
+ * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
+ */
+int __tipc_link_xmit(struct net *net, struct tipc_link *link,
+ struct sk_buff_head *list)
+{
+ struct tipc_msg *msg = buf_msg(skb_peek(list));
+ unsigned int maxwin = link->window;
+ unsigned int imp = msg_importance(msg);
+ uint mtu = link->mtu;
+ uint ack = mod(link->next_in_no - 1);
+ uint seqno = link->next_out_no;
+ uint bc_last_in = link->owner->bclink.last_in;
+ struct tipc_media_addr *addr = &link->media_addr;
+ struct sk_buff_head *transmq = &link->transmq;
+ struct sk_buff_head *backlogq = &link->backlogq;
+ struct sk_buff *skb, *tmp;
+
+ /* Match backlog limit against msg importance: */
+ if (unlikely(link->backlog[imp].len >= link->backlog[imp].limit))
+ return link_schedule_user(link, list);
+
+ if (unlikely(msg_size(msg) > mtu)) {
+ __skb_queue_purge(list);
+ return -EMSGSIZE;
+ }
+ /* Prepare each packet for sending, and add to relevant queue: */
+ skb_queue_walk_safe(list, skb, tmp) {
+ __skb_unlink(skb, list);
+ msg = buf_msg(skb);
+ msg_set_seqno(msg, seqno);
+ msg_set_ack(msg, ack);
+ msg_set_bcast_ack(msg, bc_last_in);
+
+ if (likely(skb_queue_len(transmq) < maxwin)) {
+ __skb_queue_tail(transmq, skb);
+ tipc_bearer_send(net, link->bearer_id, skb, addr);
+ link->rcv_unacked = 0;
+ seqno++;
+ continue;
+ }
+ if (tipc_msg_bundle(skb_peek_tail(backlogq), skb, mtu)) {
+ link->stats.sent_bundled++;
+ continue;
+ }
+ if (tipc_msg_make_bundle(&skb, mtu, link->addr)) {
+ link->stats.sent_bundled++;
+ link->stats.sent_bundles++;
+ imp = msg_importance(buf_msg(skb));
+ }
+ __skb_queue_tail(backlogq, skb);
+ link->backlog[imp].len++;
+ seqno++;
+ }
+ link->next_out_no = seqno;
+ return 0;
+}
+
+static void skb2list(struct sk_buff *skb, struct sk_buff_head *list)
+{
+ skb_queue_head_init(list);
+ __skb_queue_tail(list, skb);
+}
+
+static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb)
+{
+ struct sk_buff_head head;
+
+ skb2list(skb, &head);
+ return __tipc_link_xmit(link->owner->net, link, &head);
+}
+
+/* tipc_link_xmit_skb(): send single buffer to destination
+ * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE
+ * messages, which will not be rejected
+ * The only exception is datagram messages rerouted after secondary
+ * lookup, which are rare and safe to dispose of anyway.
+ * TODO: Return real return value, and let callers use
+ * tipc_wait_for_sendpkt() where applicable
+ */
+int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
+ u32 selector)
+{
+ struct sk_buff_head head;
+ int rc;
+
+ skb2list(skb, &head);
+ rc = tipc_link_xmit(net, &head, dnode, selector);
+ if (rc == -ELINKCONG)
+ kfree_skb(skb);
+ return 0;
+}
+
+/**
+ * tipc_link_xmit() is the general link level function for message sending
+ * @net: the applicable net namespace
+ * @list: chain of buffers containing message
+ * @dsz: amount of user data to be sent
+ * @dnode: address of destination node
+ * @selector: a number used for deterministic link selection
+ * Consumes the buffer chain, except when returning -ELINKCONG
+ * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ */
+int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
+ u32 selector)
+{
+ struct tipc_link *link = NULL;
+ struct tipc_node *node;
+ int rc = -EHOSTUNREACH;
+
+ node = tipc_node_find(net, dnode);
+ if (node) {
+ tipc_node_lock(node);
+ link = node->active_links[selector & 1];
+ if (link)
+ rc = __tipc_link_xmit(net, link, list);
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+ }
+ if (link)
+ return rc;
+
+ if (likely(in_own_node(net, dnode))) {
+ tipc_sk_rcv(net, list);
+ return 0;
+ }
+
+ __skb_queue_purge(list);
+ return rc;
+}
+
+/*
+ * tipc_link_sync_xmit - synchronize broadcast link endpoints.
+ *
+ * Give a newly added peer node the sequence number where it should
+ * start receiving and acking broadcast packets.
+ *
+ * Called with node locked
+ */
+static void tipc_link_sync_xmit(struct tipc_link *link)
+{
+ struct sk_buff *skb;
+ struct tipc_msg *msg;
+
+ skb = tipc_buf_acquire(INT_H_SIZE);
+ if (!skb)
+ return;
+
+ msg = buf_msg(skb);
+ tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG,
+ INT_H_SIZE, link->addr);
+ msg_set_last_bcast(msg, link->owner->bclink.acked);
+ __tipc_link_xmit_skb(link, skb);
+}
+
+/*
+ * tipc_link_sync_rcv - synchronize broadcast link endpoints.
+ * Receive the sequence number where we should start receiving and
+ * acking broadcast packets from a newly added peer node, and open
+ * up for reception of such packets.
+ *
+ * Called with node locked
+ */
+static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf)
+{
+ struct tipc_msg *msg = buf_msg(buf);
+
+ n->bclink.last_sent = n->bclink.last_in = msg_last_bcast(msg);
+ n->bclink.recv_permitted = true;
+ kfree_skb(buf);
+}
+
+/*
+ * tipc_link_push_packets - push unsent packets to bearer
+ *
+ * Push out the unsent messages of a link where congestion
+ * has abated. Node is locked.
+ *
+ * Called with node locked
+ */
+void tipc_link_push_packets(struct tipc_link *link)
+{
+ struct sk_buff *skb;
+ struct tipc_msg *msg;
+ unsigned int ack = mod(link->next_in_no - 1);
+
+ while (skb_queue_len(&link->transmq) < link->window) {
+ skb = __skb_dequeue(&link->backlogq);
+ if (!skb)
+ break;
+ msg = buf_msg(skb);
+ link->backlog[msg_importance(msg)].len--;
+ msg_set_ack(msg, ack);
+ msg_set_bcast_ack(msg, link->owner->bclink.last_in);
+ link->rcv_unacked = 0;
+ __skb_queue_tail(&link->transmq, skb);
+ tipc_bearer_send(link->owner->net, link->bearer_id,
+ skb, &link->media_addr);
+ }
+}
+
+void tipc_link_reset_all(struct tipc_node *node)
+{
+ char addr_string[16];
+ u32 i;
+
+ tipc_node_lock(node);
+
+ pr_warn("Resetting all links to %s\n",
+ tipc_addr_string_fill(addr_string, node->addr));
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if (node->links[i]) {
+ link_print(node->links[i], "Resetting link\n");
+ tipc_link_reset(node->links[i]);
+ }
+ }
+
+ tipc_node_unlock(node);
+}
+
+static void link_retransmit_failure(struct tipc_link *l_ptr,
+ struct sk_buff *buf)
+{
+ struct tipc_msg *msg = buf_msg(buf);
+ struct net *net = l_ptr->owner->net;
+
+ pr_warn("Retransmission failure on link <%s>\n", l_ptr->name);
+
+ if (l_ptr->addr) {
+ /* Handle failure on standard link */
+ link_print(l_ptr, "Resetting link\n");
+ tipc_link_reset(l_ptr);
+
+ } else {
+ /* Handle failure on broadcast link */
+ struct tipc_node *n_ptr;
+ char addr_string[16];
+
+ pr_info("Msg seq number: %u, ", msg_seqno(msg));
+ pr_cont("Outstanding acks: %lu\n",
+ (unsigned long) TIPC_SKB_CB(buf)->handle);
+
+ n_ptr = tipc_bclink_retransmit_to(net);
+
+ tipc_addr_string_fill(addr_string, n_ptr->addr);
+ pr_info("Broadcast link info for %s\n", addr_string);
+ pr_info("Reception permitted: %d, Acked: %u\n",
+ n_ptr->bclink.recv_permitted,
+ n_ptr->bclink.acked);
+ pr_info("Last in: %u, Oos state: %u, Last sent: %u\n",
+ n_ptr->bclink.last_in,
+ n_ptr->bclink.oos_state,
+ n_ptr->bclink.last_sent);
+
+ n_ptr->action_flags |= TIPC_BCAST_RESET;
+ l_ptr->stale_count = 0;
+ }
+}
+
+void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb,
+ u32 retransmits)
+{
+ struct tipc_msg *msg;
+
+ if (!skb)
+ return;
+
+ msg = buf_msg(skb);
+
+ /* Detect repeated retransmit failures */
+ if (l_ptr->last_retransmitted == msg_seqno(msg)) {
+ if (++l_ptr->stale_count > 100) {
+ link_retransmit_failure(l_ptr, skb);
+ return;
+ }
+ } else {
+ l_ptr->last_retransmitted = msg_seqno(msg);
+ l_ptr->stale_count = 1;
+ }
+
+ skb_queue_walk_from(&l_ptr->transmq, skb) {
+ if (!retransmits)
+ break;
+ msg = buf_msg(skb);
+ msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+ msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+ tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb,
+ &l_ptr->media_addr);
+ retransmits--;
+ l_ptr->stats.retransmitted++;
+ }
+}
+
+/* link_synch(): check if all packets arrived before the synch
+ * point have been consumed
+ * Returns true if the parallel links are synched, otherwise false
+ */
+static bool link_synch(struct tipc_link *l)
+{
+ unsigned int post_synch;
+ struct tipc_link *pl;
+
+ pl = tipc_parallel_link(l);
+ if (pl == l)
+ goto synched;
+
+ /* Was last pre-synch packet added to input queue ? */
+ if (less_eq(pl->next_in_no, l->synch_point))
+ return false;
+
+ /* Is it still in the input queue ? */
+ post_synch = mod(pl->next_in_no - l->synch_point) - 1;
+ if (skb_queue_len(&pl->inputq) > post_synch)
+ return false;
+synched:
+ l->flags &= ~LINK_SYNCHING;
+ return true;
+}
+
+static void link_retrieve_defq(struct tipc_link *link,
+ struct sk_buff_head *list)
+{
+ u32 seq_no;
+
+ if (skb_queue_empty(&link->deferdq))
+ return;
+
+ seq_no = buf_seqno(skb_peek(&link->deferdq));
+ if (seq_no == mod(link->next_in_no))
+ skb_queue_splice_tail_init(&link->deferdq, list);
+}
+
+/**
+ * tipc_rcv - process TIPC packets/messages arriving from off-node
+ * @net: the applicable net namespace
+ * @skb: TIPC packet
+ * @b_ptr: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held. Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sk_buff_head head;
+ struct tipc_node *n_ptr;
+ struct tipc_link *l_ptr;
+ struct sk_buff *skb1, *tmp;
+ struct tipc_msg *msg;
+ u32 seq_no;
+ u32 ackd;
+ u32 released;
+
+ skb2list(skb, &head);
+
+ while ((skb = __skb_dequeue(&head))) {
+ /* Ensure message is well-formed */
+ if (unlikely(!tipc_msg_validate(skb)))
+ goto discard;
+
+ /* Handle arrival of a non-unicast link message */
+ msg = buf_msg(skb);
+ if (unlikely(msg_non_seq(msg))) {
+ if (msg_user(msg) == LINK_CONFIG)
+ tipc_disc_rcv(net, skb, b_ptr);
+ else
+ tipc_bclink_rcv(net, skb);
+ continue;
+ }
+
+ /* Discard unicast link messages destined for another node */
+ if (unlikely(!msg_short(msg) &&
+ (msg_destnode(msg) != tn->own_addr)))
+ goto discard;
+
+ /* Locate neighboring node that sent message */
+ n_ptr = tipc_node_find(net, msg_prevnode(msg));
+ if (unlikely(!n_ptr))
+ goto discard;
+
+ tipc_node_lock(n_ptr);
+ /* Locate unicast link endpoint that should handle message */
+ l_ptr = n_ptr->links[b_ptr->identity];
+ if (unlikely(!l_ptr))
+ goto unlock;
+
+ /* Verify that communication with node is currently allowed */
+ if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) &&
+ msg_user(msg) == LINK_PROTOCOL &&
+ (msg_type(msg) == RESET_MSG ||
+ msg_type(msg) == ACTIVATE_MSG) &&
+ !msg_redundant_link(msg))
+ n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN;
+
+ if (tipc_node_blocked(n_ptr))
+ goto unlock;
+
+ /* Validate message sequence number info */
+ seq_no = msg_seqno(msg);
+ ackd = msg_ack(msg);
+
+ /* Release acked messages */
+ if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg)))
+ tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
+
+ released = 0;
+ skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) {
+ if (more(buf_seqno(skb1), ackd))
+ break;
+ __skb_unlink(skb1, &l_ptr->transmq);
+ kfree_skb(skb1);
+ released = 1;
+ }
+
+ /* Try sending any messages link endpoint has pending */
+ if (unlikely(skb_queue_len(&l_ptr->backlogq)))
+ tipc_link_push_packets(l_ptr);
+
+ if (released && !skb_queue_empty(&l_ptr->wakeupq))
+ link_prepare_wakeup(l_ptr);
+
+ /* Process the incoming packet */
+ if (unlikely(!link_working_working(l_ptr))) {
+ if (msg_user(msg) == LINK_PROTOCOL) {
+ tipc_link_proto_rcv(l_ptr, skb);
+ link_retrieve_defq(l_ptr, &head);
+ skb = NULL;
+ goto unlock;
+ }
+
+ /* Traffic message. Conditionally activate link */
+ link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+
+ if (link_working_working(l_ptr)) {
+ /* Re-insert buffer in front of queue */
+ __skb_queue_head(&head, skb);
+ skb = NULL;
+ goto unlock;
+ }
+ goto unlock;
+ }
+
+ /* Link is now in state WORKING_WORKING */
+ if (unlikely(seq_no != mod(l_ptr->next_in_no))) {
+ link_handle_out_of_seq_msg(l_ptr, skb);
+ link_retrieve_defq(l_ptr, &head);
+ skb = NULL;
+ goto unlock;
+ }
+ /* Synchronize with parallel link if applicable */
+ if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
+ if (!link_synch(l_ptr))
+ goto unlock;
+ }
+ l_ptr->next_in_no++;
+ if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
+ link_retrieve_defq(l_ptr, &head);
+ if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) {
+ l_ptr->stats.sent_acks++;
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
+ }
+ tipc_link_input(l_ptr, skb);
+ skb = NULL;
+unlock:
+ tipc_node_unlock(n_ptr);
+ tipc_node_put(n_ptr);
+discard:
+ if (unlikely(skb))
+ kfree_skb(skb);
+ }
+}
+
+/* tipc_data_input - deliver data and name distr msgs to upper layer
+ *
+ * Consumes buffer if message is of right type
+ * Node lock must be held
+ */
+static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb)
+{
+ struct tipc_node *node = link->owner;
+ struct tipc_msg *msg = buf_msg(skb);
+ u32 dport = msg_destport(msg);
+
+ switch (msg_user(msg)) {
+ case TIPC_LOW_IMPORTANCE:
+ case TIPC_MEDIUM_IMPORTANCE:
+ case TIPC_HIGH_IMPORTANCE:
+ case TIPC_CRITICAL_IMPORTANCE:
+ case CONN_MANAGER:
+ if (tipc_skb_queue_tail(&link->inputq, skb, dport)) {
+ node->inputq = &link->inputq;
+ node->action_flags |= TIPC_MSG_EVT;
+ }
+ return true;
+ case NAME_DISTRIBUTOR:
+ node->bclink.recv_permitted = true;
+ node->namedq = &link->namedq;
+ skb_queue_tail(&link->namedq, skb);
+ if (skb_queue_len(&link->namedq) == 1)
+ node->action_flags |= TIPC_NAMED_MSG_EVT;
+ return true;
+ case MSG_BUNDLER:
+ case TUNNEL_PROTOCOL:
+ case MSG_FRAGMENTER:
+ case BCAST_PROTOCOL:
+ return false;
+ default:
+ pr_warn("Dropping received illegal msg type\n");
+ kfree_skb(skb);
+ return false;
+ };
+}
+
+/* tipc_link_input - process packet that has passed link protocol check
+ *
+ * Consumes buffer
+ * Node lock must be held
+ */
+static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb)
+{
+ struct tipc_node *node = link->owner;
+ struct tipc_msg *msg = buf_msg(skb);
+ struct sk_buff *iskb;
+ int pos = 0;
+
+ if (likely(tipc_data_input(link, skb)))
+ return;
+
+ switch (msg_user(msg)) {
+ case TUNNEL_PROTOCOL:
+ if (msg_dup(msg)) {
+ link->flags |= LINK_SYNCHING;
+ link->synch_point = msg_seqno(msg_get_wrapped(msg));
+ kfree_skb(skb);
+ break;
+ }
+ if (!tipc_link_failover_rcv(link, &skb))
+ break;
+ if (msg_user(buf_msg(skb)) != MSG_BUNDLER) {
+ tipc_data_input(link, skb);
+ break;
+ }
+ case MSG_BUNDLER:
+ link->stats.recv_bundles++;
+ link->stats.recv_bundled += msg_msgcnt(msg);
+
+ while (tipc_msg_extract(skb, &iskb, &pos))
+ tipc_data_input(link, iskb);
+ break;
+ case MSG_FRAGMENTER:
+ link->stats.recv_fragments++;
+ if (tipc_buf_append(&link->reasm_buf, &skb)) {
+ link->stats.recv_fragmented++;
+ tipc_data_input(link, skb);
+ } else if (!link->reasm_buf) {
+ tipc_link_reset(link);
+ }
+ break;
+ case BCAST_PROTOCOL:
+ tipc_link_sync_rcv(node, skb);
+ break;
+ default:
+ break;
+ };
+}
+
+/**
+ * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue
+ *
+ * Returns increase in queue length (i.e. 0 or 1)
+ */
+u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb)
+{
+ struct sk_buff *skb1;
+ u32 seq_no = buf_seqno(skb);
+
+ /* Empty queue ? */
+ if (skb_queue_empty(list)) {
+ __skb_queue_tail(list, skb);
+ return 1;
+ }
+
+ /* Last ? */
+ if (less(buf_seqno(skb_peek_tail(list)), seq_no)) {
+ __skb_queue_tail(list, skb);
+ return 1;
+ }
+
+ /* Locate insertion point in queue, then insert; discard if duplicate */
+ skb_queue_walk(list, skb1) {
+ u32 curr_seqno = buf_seqno(skb1);
+
+ if (seq_no == curr_seqno) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (less(seq_no, curr_seqno))
+ break;
+ }
+
+ __skb_queue_before(list, skb1, skb);
+ return 1;
+}
+
+/*
+ * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+ */
+static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr,
+ struct sk_buff *buf)
+{
+ u32 seq_no = buf_seqno(buf);
+
+ if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
+ tipc_link_proto_rcv(l_ptr, buf);
+ return;
+ }
+
+ /* Record OOS packet arrival (force mismatch on next timeout) */
+ l_ptr->checkpoint--;
+
+ /*
+ * Discard packet if a duplicate; otherwise add it to deferred queue
+ * and notify peer of gap as per protocol specification
+ */
+ if (less(seq_no, mod(l_ptr->next_in_no))) {
+ l_ptr->stats.duplicates++;
+ kfree_skb(buf);
+ return;
+ }
+
+ if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) {
+ l_ptr->stats.deferred_recv++;
+ if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1)
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0);
+ } else {
+ l_ptr->stats.duplicates++;
+ }
+}
+
+/*
+ * Send protocol message to the other endpoint.
+ */
+void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg,
+ u32 gap, u32 tolerance, u32 priority)
+{
+ struct sk_buff *buf = NULL;
+ struct tipc_msg *msg = l_ptr->pmsg;
+ u32 msg_size = sizeof(l_ptr->proto_msg);
+ int r_flag;
+
+ /* Don't send protocol message during link failover */
+ if (l_ptr->flags & LINK_FAILINGOVER)
+ return;
+
+ /* Abort non-RESET send if communication with node is prohibited */
+ if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG))
+ return;
+
+ /* Create protocol message with "out-of-sequence" sequence number */
+ msg_set_type(msg, msg_typ);
+ msg_set_net_plane(msg, l_ptr->net_plane);
+ msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+ msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net));
+
+ if (msg_typ == STATE_MSG) {
+ u32 next_sent = mod(l_ptr->next_out_no);
+
+ if (!tipc_link_is_up(l_ptr))
+ return;
+ if (skb_queue_len(&l_ptr->backlogq))
+ next_sent = buf_seqno(skb_peek(&l_ptr->backlogq));
+ msg_set_next_sent(msg, next_sent);
+ if (!skb_queue_empty(&l_ptr->deferdq)) {
+ u32 rec = buf_seqno(skb_peek(&l_ptr->deferdq));
+ gap = mod(rec - mod(l_ptr->next_in_no));
+ }
+ msg_set_seq_gap(msg, gap);
+ if (gap)
+ l_ptr->stats.sent_nacks++;
+ msg_set_link_tolerance(msg, tolerance);
+ msg_set_linkprio(msg, priority);
+ msg_set_max_pkt(msg, l_ptr->mtu);
+ msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+ msg_set_probe(msg, probe_msg != 0);
+ if (probe_msg)
+ l_ptr->stats.sent_probes++;
+ l_ptr->stats.sent_states++;
+ } else { /* RESET_MSG or ACTIVATE_MSG */
+ msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1));
+ msg_set_seq_gap(msg, 0);
+ msg_set_next_sent(msg, 1);
+ msg_set_probe(msg, 0);
+ msg_set_link_tolerance(msg, l_ptr->tolerance);
+ msg_set_linkprio(msg, l_ptr->priority);
+ msg_set_max_pkt(msg, l_ptr->advertised_mtu);
+ }
+
+ r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
+ msg_set_redundant_link(msg, r_flag);
+ msg_set_linkprio(msg, l_ptr->priority);
+ msg_set_size(msg, msg_size);
+
+ msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2)));
+
+ buf = tipc_buf_acquire(msg_size);
+ if (!buf)
+ return;
+
+ skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
+ buf->priority = TC_PRIO_CONTROL;
+ tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf,
+ &l_ptr->media_addr);
+ l_ptr->rcv_unacked = 0;
+ kfree_skb(buf);
+}
+
+/*
+ * Receive protocol message :
+ * Note that network plane id propagates through the network, and may
+ * change at any time. The node with lowest address rules
+ */
+static void tipc_link_proto_rcv(struct tipc_link *l_ptr,
+ struct sk_buff *buf)
+{
+ u32 rec_gap = 0;
+ u32 msg_tol;
+ struct tipc_msg *msg = buf_msg(buf);
+
+ if (l_ptr->flags & LINK_FAILINGOVER)
+ goto exit;
+
+ if (l_ptr->net_plane != msg_net_plane(msg))
+ if (link_own_addr(l_ptr) > msg_prevnode(msg))
+ l_ptr->net_plane = msg_net_plane(msg);
+
+ switch (msg_type(msg)) {
+
+ case RESET_MSG:
+ if (!link_working_unknown(l_ptr) &&
+ (l_ptr->peer_session != INVALID_SESSION)) {
+ if (less_eq(msg_session(msg), l_ptr->peer_session))
+ break; /* duplicate or old reset: ignore */
+ }
+
+ if (!msg_redundant_link(msg) && (link_working_working(l_ptr) ||
+ link_working_unknown(l_ptr))) {
+ /*
+ * peer has lost contact -- don't allow peer's links
+ * to reactivate before we recognize loss & clean up
+ */
+ l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN;
+ }
+
+ link_state_event(l_ptr, RESET_MSG);
+
+ /* fall thru' */
+ case ACTIVATE_MSG:
+ /* Update link settings according other endpoint's values */
+ strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg));
+
+ msg_tol = msg_link_tolerance(msg);
+ if (msg_tol > l_ptr->tolerance)
+ link_set_supervision_props(l_ptr, msg_tol);
+
+ if (msg_linkprio(msg) > l_ptr->priority)
+ l_ptr->priority = msg_linkprio(msg);
+
+ if (l_ptr->mtu > msg_max_pkt(msg))
+ l_ptr->mtu = msg_max_pkt(msg);
+
+ /* Synchronize broadcast link info, if not done previously */
+ if (!tipc_node_is_up(l_ptr->owner)) {
+ l_ptr->owner->bclink.last_sent =
+ l_ptr->owner->bclink.last_in =
+ msg_last_bcast(msg);
+ l_ptr->owner->bclink.oos_state = 0;
+ }
+
+ l_ptr->peer_session = msg_session(msg);
+ l_ptr->peer_bearer_id = msg_bearer_id(msg);
+
+ if (msg_type(msg) == ACTIVATE_MSG)
+ link_state_event(l_ptr, ACTIVATE_MSG);
+ break;
+ case STATE_MSG:
+
+ msg_tol = msg_link_tolerance(msg);
+ if (msg_tol)
+ link_set_supervision_props(l_ptr, msg_tol);
+
+ if (msg_linkprio(msg) &&
+ (msg_linkprio(msg) != l_ptr->priority)) {
+ pr_debug("%s<%s>, priority change %u->%u\n",
+ link_rst_msg, l_ptr->name,
+ l_ptr->priority, msg_linkprio(msg));
+ l_ptr->priority = msg_linkprio(msg);
+ tipc_link_reset(l_ptr); /* Enforce change to take effect */
+ break;
+ }
+
+ /* Record reception; force mismatch at next timeout: */
+ l_ptr->checkpoint--;
+
+ link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+ l_ptr->stats.recv_states++;
+ if (link_reset_unknown(l_ptr))
+ break;
+
+ if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) {
+ rec_gap = mod(msg_next_sent(msg) -
+ mod(l_ptr->next_in_no));
+ }
+
+ if (msg_probe(msg))
+ l_ptr->stats.recv_probes++;
+
+ /* Protocol message before retransmits, reduce loss risk */
+ if (l_ptr->owner->bclink.recv_permitted)
+ tipc_bclink_update_link_state(l_ptr->owner,
+ msg_last_bcast(msg));
+
+ if (rec_gap || (msg_probe(msg))) {
+ tipc_link_proto_xmit(l_ptr, STATE_MSG, 0,
+ rec_gap, 0, 0);
+ }
+ if (msg_seq_gap(msg)) {
+ l_ptr->stats.recv_nacks++;
+ tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq),
+ msg_seq_gap(msg));
+ }
+ break;
+ }
+exit:
+ kfree_skb(buf);
+}
+
+
+/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to
+ * a different bearer. Owner node is locked.
+ */
+static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr,
+ struct tipc_msg *tunnel_hdr,
+ struct tipc_msg *msg,
+ u32 selector)
+{
+ struct tipc_link *tunnel;
+ struct sk_buff *skb;
+ u32 length = msg_size(msg);
+
+ tunnel = l_ptr->owner->active_links[selector & 1];
+ if (!tipc_link_is_up(tunnel)) {
+ pr_warn("%stunnel link no longer available\n", link_co_err);
+ return;
+ }
+ msg_set_size(tunnel_hdr, length + INT_H_SIZE);
+ skb = tipc_buf_acquire(length + INT_H_SIZE);
+ if (!skb) {
+ pr_warn("%sunable to send tunnel msg\n", link_co_err);
+ return;
+ }
+ skb_copy_to_linear_data(skb, tunnel_hdr, INT_H_SIZE);
+ skb_copy_to_linear_data_offset(skb, INT_H_SIZE, msg, length);
+ __tipc_link_xmit_skb(tunnel, skb);
+}
+
+
+/* tipc_link_failover_send_queue(): A link has gone down, but a second
+ * link is still active. We can do failover. Tunnel the failing link's
+ * whole send queue via the remaining link. This way, we don't lose
+ * any packets, and sequence order is preserved for subsequent traffic
+ * sent over the remaining link. Owner node is locked.
+ */
+void tipc_link_failover_send_queue(struct tipc_link *l_ptr)
+{
+ int msgcount;
+ struct tipc_link *tunnel = l_ptr->owner->active_links[0];
+ struct tipc_msg tunnel_hdr;
+ struct sk_buff *skb;
+ int split_bundles;
+
+ if (!tunnel)
+ return;
+
+ tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL,
+ FAILOVER_MSG, INT_H_SIZE, l_ptr->addr);
+ skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq);
+ tipc_link_purge_backlog(l_ptr);
+ msgcount = skb_queue_len(&l_ptr->transmq);
+ msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
+ msg_set_msgcnt(&tunnel_hdr, msgcount);
+
+ if (skb_queue_empty(&l_ptr->transmq)) {
+ skb = tipc_buf_acquire(INT_H_SIZE);
+ if (skb) {
+ skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE);
+ msg_set_size(&tunnel_hdr, INT_H_SIZE);
+ __tipc_link_xmit_skb(tunnel, skb);
+ } else {
+ pr_warn("%sunable to send changeover msg\n",
+ link_co_err);
+ }
+ return;
+ }
+
+ split_bundles = (l_ptr->owner->active_links[0] !=
+ l_ptr->owner->active_links[1]);
+
+ skb_queue_walk(&l_ptr->transmq, skb) {
+ struct tipc_msg *msg = buf_msg(skb);
+
+ if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) {
+ struct tipc_msg *m = msg_get_wrapped(msg);
+ unchar *pos = (unchar *)m;
+
+ msgcount = msg_msgcnt(msg);
+ while (msgcount--) {
+ msg_set_seqno(m, msg_seqno(msg));
+ tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m,
+ msg_link_selector(m));
+ pos += align(msg_size(m));
+ m = (struct tipc_msg *)pos;
+ }
+ } else {
+ tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg,
+ msg_link_selector(msg));
+ }
+ }
+}
+
+/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a
+ * duplicate of the first link's send queue via the new link. This way, we
+ * are guaranteed that currently queued packets from a socket are delivered
+ * before future traffic from the same socket, even if this is using the
+ * new link. The last arriving copy of each duplicate packet is dropped at
+ * the receiving end by the regular protocol check, so packet cardinality
+ * and sequence order is preserved per sender/receiver socket pair.
+ * Owner node is locked.
+ */
+void tipc_link_dup_queue_xmit(struct tipc_link *link,
+ struct tipc_link *tnl)
+{
+ struct sk_buff *skb;
+ struct tipc_msg tnl_hdr;
+ struct sk_buff_head *queue = &link->transmq;
+ int mcnt;
+
+ tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL,
+ SYNCH_MSG, INT_H_SIZE, link->addr);
+ mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq);
+ msg_set_msgcnt(&tnl_hdr, mcnt);
+ msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id);
+
+tunnel_queue:
+ skb_queue_walk(queue, skb) {
+ struct sk_buff *outskb;
+ struct tipc_msg *msg = buf_msg(skb);
+ u32 len = msg_size(msg);
+
+ msg_set_ack(msg, mod(link->next_in_no - 1));
+ msg_set_bcast_ack(msg, link->owner->bclink.last_in);
+ msg_set_size(&tnl_hdr, len + INT_H_SIZE);
+ outskb = tipc_buf_acquire(len + INT_H_SIZE);
+ if (outskb == NULL) {
+ pr_warn("%sunable to send duplicate msg\n",
+ link_co_err);
+ return;
+ }
+ skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE);
+ skb_copy_to_linear_data_offset(outskb, INT_H_SIZE,
+ skb->data, len);
+ __tipc_link_xmit_skb(tnl, outskb);
+ if (!tipc_link_is_up(link))
+ return;
+ }
+ if (queue == &link->backlogq)
+ return;
+ queue = &link->backlogq;
+ goto tunnel_queue;
+}
+
+/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet
+ * Owner node is locked.
+ */
+static bool tipc_link_failover_rcv(struct tipc_link *link,
+ struct sk_buff **skb)
+{
+ struct tipc_msg *msg = buf_msg(*skb);
+ struct sk_buff *iskb = NULL;
+ struct tipc_link *pl = NULL;
+ int bearer_id = msg_bearer_id(msg);
+ int pos = 0;
+
+ if (msg_type(msg) != FAILOVER_MSG) {
+ pr_warn("%sunknown tunnel pkt received\n", link_co_err);
+ goto exit;
+ }
+ if (bearer_id >= MAX_BEARERS)
+ goto exit;
+
+ if (bearer_id == link->bearer_id)
+ goto exit;
+
+ pl = link->owner->links[bearer_id];
+ if (pl && tipc_link_is_up(pl))
+ tipc_link_reset(pl);
+
+ if (link->failover_pkts == FIRST_FAILOVER)
+ link->failover_pkts = msg_msgcnt(msg);
+
+ /* Should we expect an inner packet? */
+ if (!link->failover_pkts)
+ goto exit;
+
+ if (!tipc_msg_extract(*skb, &iskb, &pos)) {
+ pr_warn("%sno inner failover pkt\n", link_co_err);
+ *skb = NULL;
+ goto exit;
+ }
+ link->failover_pkts--;
+ *skb = NULL;
+
+ /* Was this packet already delivered? */
+ if (less(buf_seqno(iskb), link->failover_checkpt)) {
+ kfree_skb(iskb);
+ iskb = NULL;
+ goto exit;
+ }
+ if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) {
+ link->stats.recv_fragments++;
+ tipc_buf_append(&link->failover_skb, &iskb);
+ }
+exit:
+ if (!link->failover_pkts && pl)
+ pl->flags &= ~LINK_FAILINGOVER;
+ kfree_skb(*skb);
+ *skb = iskb;
+ return *skb;
+}
+
+static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol)
+{
+ unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
+
+ if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
+ return;
+
+ l_ptr->tolerance = tol;
+ l_ptr->cont_intv = msecs_to_jiffies(intv);
+ l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->cont_intv) / 4);
+}
+
+void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
+{
+ int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE);
+
+ l->window = win;
+ l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2;
+ l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win;
+ l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3;
+ l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2;
+ l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
+}
+
+/* tipc_link_find_owner - locate owner node of link by link's name
+ * @net: the applicable net namespace
+ * @name: pointer to link name string
+ * @bearer_id: pointer to index in 'node->links' array where the link was found.
+ *
+ * Returns pointer to node owning the link, or 0 if no matching link is found.
+ */
+static struct tipc_node *tipc_link_find_owner(struct net *net,
+ const char *link_name,
+ unsigned int *bearer_id)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *l_ptr;
+ struct tipc_node *n_ptr;
+ struct tipc_node *found_node = NULL;
+ int i;
+
+ *bearer_id = 0;
+ rcu_read_lock();
+ list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
+ tipc_node_lock(n_ptr);
+ for (i = 0; i < MAX_BEARERS; i++) {
+ l_ptr = n_ptr->links[i];
+ if (l_ptr && !strcmp(l_ptr->name, link_name)) {
+ *bearer_id = i;
+ found_node = n_ptr;
+ break;
+ }
+ }
+ tipc_node_unlock(n_ptr);
+ if (found_node)
+ break;
+ }
+ rcu_read_unlock();
+
+ return found_node;
+}
+
+/**
+ * link_reset_statistics - reset link statistics
+ * @l_ptr: pointer to link
+ */
+static void link_reset_statistics(struct tipc_link *l_ptr)
+{
+ memset(&l_ptr->stats, 0, sizeof(l_ptr->stats));
+ l_ptr->stats.sent_info = l_ptr->next_out_no;
+ l_ptr->stats.recv_info = l_ptr->next_in_no;
+}
+
+static void link_print(struct tipc_link *l_ptr, const char *str)
+{
+ struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id);
+ struct tipc_bearer *b_ptr;
+
+ rcu_read_lock();
+ b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]);
+ if (b_ptr)
+ pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name);
+ rcu_read_unlock();
+
+ if (link_working_unknown(l_ptr))
+ pr_cont(":WU\n");
+ else if (link_reset_reset(l_ptr))
+ pr_cont(":RR\n");
+ else if (link_reset_unknown(l_ptr))
+ pr_cont(":RU\n");
+ else if (link_working_working(l_ptr))
+ pr_cont(":WW\n");
+ else
+ pr_cont("\n");
+}
+
+/* Parse and validate nested (link) properties valid for media, bearer and link
+ */
+int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[])
+{
+ int err;
+
+ err = nla_parse_nested(props, TIPC_NLA_PROP_MAX, prop,
+ tipc_nl_prop_policy);
+ if (err)
+ return err;
+
+ if (props[TIPC_NLA_PROP_PRIO]) {
+ u32 prio;
+
+ prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ if (prio > TIPC_MAX_LINK_PRI)
+ return -EINVAL;
+ }
+
+ if (props[TIPC_NLA_PROP_TOL]) {
+ u32 tol;
+
+ tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+ if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
+ return -EINVAL;
+ }
+
+ if (props[TIPC_NLA_PROP_WIN]) {
+ u32 win;
+
+ win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ int res = 0;
+ int bearer_id;
+ char *name;
+ struct tipc_link *link;
+ struct tipc_node *node;
+ struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+
+ if (!info->attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
+ info->attrs[TIPC_NLA_LINK],
+ tipc_nl_link_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+
+ name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+
+ node = tipc_link_find_owner(net, name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ tipc_node_lock(node);
+
+ link = node->links[bearer_id];
+ if (!link) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[TIPC_NLA_LINK_PROP]) {
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
+
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP],
+ props);
+ if (err) {
+ res = err;
+ goto out;
+ }
+
+ if (props[TIPC_NLA_PROP_TOL]) {
+ u32 tol;
+
+ tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+ link_set_supervision_props(link, tol);
+ tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0);
+ }
+ if (props[TIPC_NLA_PROP_PRIO]) {
+ u32 prio;
+
+ prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ link->priority = prio;
+ tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio);
+ }
+ if (props[TIPC_NLA_PROP_WIN]) {
+ u32 win;
+
+ win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ tipc_link_set_queue_limits(link, win);
+ }
+ }
+
+out:
+ tipc_node_unlock(node);
+
+ return res;
+}
+
+static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s)
+{
+ int i;
+ struct nlattr *stats;
+
+ struct nla_map {
+ u32 key;
+ u32 val;
+ };
+
+ struct nla_map map[] = {
+ {TIPC_NLA_STATS_RX_INFO, s->recv_info},
+ {TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments},
+ {TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented},
+ {TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles},
+ {TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled},
+ {TIPC_NLA_STATS_TX_INFO, s->sent_info},
+ {TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments},
+ {TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented},
+ {TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles},
+ {TIPC_NLA_STATS_TX_BUNDLED, s->sent_bundled},
+ {TIPC_NLA_STATS_MSG_PROF_TOT, (s->msg_length_counts) ?
+ s->msg_length_counts : 1},
+ {TIPC_NLA_STATS_MSG_LEN_CNT, s->msg_length_counts},
+ {TIPC_NLA_STATS_MSG_LEN_TOT, s->msg_lengths_total},
+ {TIPC_NLA_STATS_MSG_LEN_P0, s->msg_length_profile[0]},
+ {TIPC_NLA_STATS_MSG_LEN_P1, s->msg_length_profile[1]},
+ {TIPC_NLA_STATS_MSG_LEN_P2, s->msg_length_profile[2]},
+ {TIPC_NLA_STATS_MSG_LEN_P3, s->msg_length_profile[3]},
+ {TIPC_NLA_STATS_MSG_LEN_P4, s->msg_length_profile[4]},
+ {TIPC_NLA_STATS_MSG_LEN_P5, s->msg_length_profile[5]},
+ {TIPC_NLA_STATS_MSG_LEN_P6, s->msg_length_profile[6]},
+ {TIPC_NLA_STATS_RX_STATES, s->recv_states},
+ {TIPC_NLA_STATS_RX_PROBES, s->recv_probes},
+ {TIPC_NLA_STATS_RX_NACKS, s->recv_nacks},
+ {TIPC_NLA_STATS_RX_DEFERRED, s->deferred_recv},
+ {TIPC_NLA_STATS_TX_STATES, s->sent_states},
+ {TIPC_NLA_STATS_TX_PROBES, s->sent_probes},
+ {TIPC_NLA_STATS_TX_NACKS, s->sent_nacks},
+ {TIPC_NLA_STATS_TX_ACKS, s->sent_acks},
+ {TIPC_NLA_STATS_RETRANSMITTED, s->retransmitted},
+ {TIPC_NLA_STATS_DUPLICATES, s->duplicates},
+ {TIPC_NLA_STATS_LINK_CONGS, s->link_congs},
+ {TIPC_NLA_STATS_MAX_QUEUE, s->max_queue_sz},
+ {TIPC_NLA_STATS_AVG_QUEUE, s->queue_sz_counts ?
+ (s->accu_queue_sz / s->queue_sz_counts) : 0}
+ };
+
+ stats = nla_nest_start(skb, TIPC_NLA_LINK_STATS);
+ if (!stats)
+ return -EMSGSIZE;
+
+ for (i = 0; i < ARRAY_SIZE(map); i++)
+ if (nla_put_u32(skb, map[i].key, map[i].val))
+ goto msg_full;
+
+ nla_nest_end(skb, stats);
+
+ return 0;
+msg_full:
+ nla_nest_cancel(skb, stats);
+
+ return -EMSGSIZE;
+}
+
+/* Caller should hold appropriate locks to protect the link */
+static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
+ struct tipc_link *link, int nlflags)
+{
+ int err;
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *prop;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ nlflags, TIPC_NL_LINK_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST,
+ tipc_cluster_mask(tn->own_addr)))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no))
+ goto attr_msg_full;
+
+ if (tipc_link_is_up(link))
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
+ goto attr_msg_full;
+ if (tipc_link_is_active(link))
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE))
+ goto attr_msg_full;
+
+ prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
+ if (!prop)
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN,
+ link->window))
+ goto prop_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority))
+ goto prop_msg_full;
+ nla_nest_end(msg->skb, prop);
+
+ err = __tipc_nl_add_stats(msg->skb, &link->stats);
+ if (err)
+ goto attr_msg_full;
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+prop_msg_full:
+ nla_nest_cancel(msg->skb, prop);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+/* Caller should hold node lock */
+static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
+ struct tipc_node *node, u32 *prev_link)
+{
+ u32 i;
+ int err;
+
+ for (i = *prev_link; i < MAX_BEARERS; i++) {
+ *prev_link = i;
+
+ if (!node->links[i])
+ continue;
+
+ err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
+ if (err)
+ return err;
+ }
+ *prev_link = 0;
+
+ return 0;
+}
+
+int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *node;
+ struct tipc_nl_msg msg;
+ u32 prev_node = cb->args[0];
+ u32 prev_link = cb->args[1];
+ int done = cb->args[2];
+ int err;
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ if (prev_node) {
+ node = tipc_node_find(net, prev_node);
+ if (!node) {
+ /* We never set seq or call nl_dump_check_consistent()
+ * this means that setting prev_seq here will cause the
+ * consistence check to fail in the netlink callback
+ * handler. Resulting in the last NLMSG_DONE message
+ * having the NLM_F_DUMP_INTR flag set.
+ */
+ cb->prev_seq = 1;
+ goto out;
+ }
+ tipc_node_put(node);
+
+ list_for_each_entry_continue_rcu(node, &tn->node_list,
+ list) {
+ tipc_node_lock(node);
+ err = __tipc_nl_add_node_links(net, &msg, node,
+ &prev_link);
+ tipc_node_unlock(node);
+ if (err)
+ goto out;
+
+ prev_node = node->addr;
+ }
+ } else {
+ err = tipc_nl_add_bc_link(net, &msg);
+ if (err)
+ goto out;
+
+ list_for_each_entry_rcu(node, &tn->node_list, list) {
+ tipc_node_lock(node);
+ err = __tipc_nl_add_node_links(net, &msg, node,
+ &prev_link);
+ tipc_node_unlock(node);
+ if (err)
+ goto out;
+
+ prev_node = node->addr;
+ }
+ }
+ done = 1;
+out:
+ rcu_read_unlock();
+
+ cb->args[0] = prev_node;
+ cb->args[1] = prev_link;
+ cb->args[2] = done;
+
+ return skb->len;
+}
+
+int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct sk_buff *ans_skb;
+ struct tipc_nl_msg msg;
+ struct tipc_link *link;
+ struct tipc_node *node;
+ char *name;
+ int bearer_id;
+ int err;
+
+ if (!info->attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+
+ name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]);
+ node = tipc_link_find_owner(net, name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!ans_skb)
+ return -ENOMEM;
+
+ msg.skb = ans_skb;
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ tipc_node_lock(node);
+ link = node->links[bearer_id];
+ if (!link) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = __tipc_nl_add_link(net, &msg, link, 0);
+ if (err)
+ goto err_out;
+
+ tipc_node_unlock(node);
+
+ return genlmsg_reply(ans_skb, info);
+
+err_out:
+ tipc_node_unlock(node);
+ nlmsg_free(ans_skb);
+
+ return err;
+}
+
+int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *link_name;
+ unsigned int bearer_id;
+ struct tipc_link *link;
+ struct tipc_node *node;
+ struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+
+ if (!info->attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
+ info->attrs[TIPC_NLA_LINK],
+ tipc_nl_link_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+
+ link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+
+ if (strcmp(link_name, tipc_bclink_name) == 0) {
+ err = tipc_bclink_reset_stats(net);
+ if (err)
+ return err;
+ return 0;
+ }
+
+ node = tipc_link_find_owner(net, link_name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ tipc_node_lock(node);
+
+ link = node->links[bearer_id];
+ if (!link) {
+ tipc_node_unlock(node);
+ return -EINVAL;
+ }
+
+ link_reset_statistics(link);
+
+ tipc_node_unlock(node);
+
+ return 0;
+}
diff --git a/net/tipc/link.h b/net/tipc/link.h
new file mode 100644
index 000000000..b5b4e3554
--- /dev/null
+++ b/net/tipc/link.h
@@ -0,0 +1,311 @@
+/*
+ * net/tipc/link.h: Include file for TIPC link code
+ *
+ * Copyright (c) 1995-2006, 2013-2014, Ericsson AB
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_LINK_H
+#define _TIPC_LINK_H
+
+#include <net/genetlink.h>
+#include "msg.h"
+#include "node.h"
+
+/* TIPC-specific error codes
+*/
+#define ELINKCONG EAGAIN /* link congestion <=> resource unavailable */
+
+/* Out-of-range value for link sequence numbers
+ */
+#define INVALID_LINK_SEQ 0x10000
+
+/* Link working states
+ */
+#define WORKING_WORKING 560810u
+#define WORKING_UNKNOWN 560811u
+#define RESET_UNKNOWN 560812u
+#define RESET_RESET 560813u
+
+/* Link endpoint execution states
+ */
+#define LINK_STARTED 0x0001
+#define LINK_STOPPED 0x0002
+#define LINK_SYNCHING 0x0004
+#define LINK_FAILINGOVER 0x0008
+
+/* Starting value for maximum packet size negotiation on unicast links
+ * (unless bearer MTU is less)
+ */
+#define MAX_PKT_DEFAULT 1500
+
+struct tipc_stats {
+ u32 sent_info; /* used in counting # sent packets */
+ u32 recv_info; /* used in counting # recv'd packets */
+ u32 sent_states;
+ u32 recv_states;
+ u32 sent_probes;
+ u32 recv_probes;
+ u32 sent_nacks;
+ u32 recv_nacks;
+ u32 sent_acks;
+ u32 sent_bundled;
+ u32 sent_bundles;
+ u32 recv_bundled;
+ u32 recv_bundles;
+ u32 retransmitted;
+ u32 sent_fragmented;
+ u32 sent_fragments;
+ u32 recv_fragmented;
+ u32 recv_fragments;
+ u32 link_congs; /* # port sends blocked by congestion */
+ u32 deferred_recv;
+ u32 duplicates;
+ u32 max_queue_sz; /* send queue size high water mark */
+ u32 accu_queue_sz; /* used for send queue size profiling */
+ u32 queue_sz_counts; /* used for send queue size profiling */
+ u32 msg_length_counts; /* used for message length profiling */
+ u32 msg_lengths_total; /* used for message length profiling */
+ u32 msg_length_profile[7]; /* used for msg. length profiling */
+};
+
+/**
+ * struct tipc_link - TIPC link data structure
+ * @addr: network address of link's peer node
+ * @name: link name character string
+ * @media_addr: media address to use when sending messages over link
+ * @timer: link timer
+ * @owner: pointer to peer node
+ * @refcnt: reference counter for permanent references (owner node & timer)
+ * @flags: execution state flags for link endpoint instance
+ * @checkpoint: reference point for triggering link continuity checking
+ * @peer_session: link session # being used by peer end of link
+ * @peer_bearer_id: bearer id used by link's peer endpoint
+ * @bearer_id: local bearer id used by link
+ * @tolerance: minimum link continuity loss needed to reset link [in ms]
+ * @cont_intv: link continuity testing interval
+ * @abort_limit: # of unacknowledged continuity probes needed to reset link
+ * @state: current state of link FSM
+ * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state
+ * @proto_msg: template for control messages generated by link
+ * @pmsg: convenience pointer to "proto_msg" field
+ * @priority: current link priority
+ * @net_plane: current link network plane ('A' through 'H')
+ * @backlog_limit: backlog queue congestion thresholds (indexed by importance)
+ * @exp_msg_count: # of tunnelled messages expected during link changeover
+ * @reset_checkpoint: seq # of last acknowledged message at time of link reset
+ * @mtu: current maximum packet size for this link
+ * @advertised_mtu: advertised own mtu when link is being established
+ * @transmitq: queue for sent, non-acked messages
+ * @backlogq: queue for messages waiting to be sent
+ * @next_out_no: next sequence number to use for outbound messages
+ * @last_retransmitted: sequence number of most recently retransmitted message
+ * @stale_count: # of identical retransmit requests made by peer
+ * @next_in_no: next sequence number to expect for inbound messages
+ * @deferred_queue: deferred queue saved OOS b'cast message received from node
+ * @unacked_window: # of inbound messages rx'd without ack'ing back to peer
+ * @inputq: buffer queue for messages to be delivered upwards
+ * @namedq: buffer queue for name table messages to be delivered upwards
+ * @next_out: ptr to first unsent outbound message in queue
+ * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate
+ * @long_msg_seq_no: next identifier to use for outbound fragmented messages
+ * @reasm_buf: head of partially reassembled inbound message fragments
+ * @stats: collects statistics regarding link activity
+ */
+struct tipc_link {
+ u32 addr;
+ char name[TIPC_MAX_LINK_NAME];
+ struct tipc_media_addr media_addr;
+ struct timer_list timer;
+ struct tipc_node *owner;
+ struct kref ref;
+
+ /* Management and link supervision data */
+ unsigned int flags;
+ u32 checkpoint;
+ u32 peer_session;
+ u32 peer_bearer_id;
+ u32 bearer_id;
+ u32 tolerance;
+ unsigned long cont_intv;
+ u32 abort_limit;
+ int state;
+ u32 fsm_msg_cnt;
+ struct {
+ unchar hdr[INT_H_SIZE];
+ unchar body[TIPC_MAX_IF_NAME];
+ } proto_msg;
+ struct tipc_msg *pmsg;
+ u32 priority;
+ char net_plane;
+ u16 synch_point;
+
+ /* Failover */
+ u16 failover_pkts;
+ u16 failover_checkpt;
+ struct sk_buff *failover_skb;
+
+ /* Max packet negotiation */
+ u16 mtu;
+ u16 advertised_mtu;
+
+ /* Sending */
+ struct sk_buff_head transmq;
+ struct sk_buff_head backlogq;
+ struct {
+ u16 len;
+ u16 limit;
+ } backlog[5];
+ u32 next_out_no;
+ u32 window;
+ u32 last_retransmitted;
+ u32 stale_count;
+
+ /* Reception */
+ u32 next_in_no;
+ u32 rcv_unacked;
+ struct sk_buff_head deferdq;
+ struct sk_buff_head inputq;
+ struct sk_buff_head namedq;
+
+ /* Congestion handling */
+ struct sk_buff_head wakeupq;
+
+ /* Fragmentation/reassembly */
+ struct sk_buff *reasm_buf;
+
+ /* Statistics */
+ struct tipc_stats stats;
+};
+
+struct tipc_port;
+
+struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
+ struct tipc_bearer *b_ptr,
+ const struct tipc_media_addr *media_addr);
+void tipc_link_delete(struct tipc_link *link);
+void tipc_link_delete_list(struct net *net, unsigned int bearer_id,
+ bool shutting_down);
+void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
+void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest);
+void tipc_link_reset_fragments(struct tipc_link *l_ptr);
+int tipc_link_is_up(struct tipc_link *l_ptr);
+int tipc_link_is_active(struct tipc_link *l_ptr);
+void tipc_link_purge_queues(struct tipc_link *l_ptr);
+void tipc_link_reset_all(struct tipc_node *node);
+void tipc_link_reset(struct tipc_link *l_ptr);
+void tipc_link_reset_list(struct net *net, unsigned int bearer_id);
+int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
+ u32 selector);
+int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest,
+ u32 selector);
+int __tipc_link_xmit(struct net *net, struct tipc_link *link,
+ struct sk_buff_head *list);
+void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob,
+ u32 gap, u32 tolerance, u32 priority);
+void tipc_link_push_packets(struct tipc_link *l_ptr);
+u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *buf);
+void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window);
+void tipc_link_retransmit(struct tipc_link *l_ptr,
+ struct sk_buff *start, u32 retransmits);
+struct sk_buff *tipc_skb_queue_next(const struct sk_buff_head *list,
+ const struct sk_buff *skb);
+
+int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
+void link_prepare_wakeup(struct tipc_link *l);
+
+/*
+ * Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
+ */
+static inline u32 buf_seqno(struct sk_buff *buf)
+{
+ return msg_seqno(buf_msg(buf));
+}
+
+static inline u32 mod(u32 x)
+{
+ return x & 0xffffu;
+}
+
+static inline int less_eq(u32 left, u32 right)
+{
+ return mod(right - left) < 32768u;
+}
+
+static inline int more(u32 left, u32 right)
+{
+ return !less_eq(left, right);
+}
+
+static inline int less(u32 left, u32 right)
+{
+ return less_eq(left, right) && (mod(right) != mod(left));
+}
+
+static inline u32 lesser(u32 left, u32 right)
+{
+ return less_eq(left, right) ? left : right;
+}
+
+static inline u32 link_own_addr(struct tipc_link *l)
+{
+ return msg_prevnode(l->pmsg);
+}
+
+/*
+ * Link status checking routines
+ */
+static inline int link_working_working(struct tipc_link *l_ptr)
+{
+ return l_ptr->state == WORKING_WORKING;
+}
+
+static inline int link_working_unknown(struct tipc_link *l_ptr)
+{
+ return l_ptr->state == WORKING_UNKNOWN;
+}
+
+static inline int link_reset_unknown(struct tipc_link *l_ptr)
+{
+ return l_ptr->state == RESET_UNKNOWN;
+}
+
+static inline int link_reset_reset(struct tipc_link *l_ptr)
+{
+ return l_ptr->state == RESET_RESET;
+}
+
+#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
new file mode 100644
index 000000000..c3e96e815
--- /dev/null
+++ b/net/tipc/msg.c
@@ -0,0 +1,574 @@
+/*
+ * net/tipc/msg.c: TIPC message header routines
+ *
+ * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/sock.h>
+#include "core.h"
+#include "msg.h"
+#include "addr.h"
+#include "name_table.h"
+
+#define MAX_FORWARD_SIZE 1024
+
+static unsigned int align(unsigned int i)
+{
+ return (i + 3) & ~3u;
+}
+
+/**
+ * tipc_buf_acquire - creates a TIPC message buffer
+ * @size: message size (including TIPC header)
+ *
+ * Returns a new buffer with data pointers set to the specified size.
+ *
+ * NOTE: Headroom is reserved to allow prepending of a data link header.
+ * There may also be unrequested tailroom present at the buffer's end.
+ */
+struct sk_buff *tipc_buf_acquire(u32 size)
+{
+ struct sk_buff *skb;
+ unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
+
+ skb = alloc_skb_fclone(buf_size, GFP_ATOMIC);
+ if (skb) {
+ skb_reserve(skb, BUF_HEADROOM);
+ skb_put(skb, size);
+ skb->next = NULL;
+ }
+ return skb;
+}
+
+void tipc_msg_init(u32 own_node, struct tipc_msg *m, u32 user, u32 type,
+ u32 hsize, u32 dnode)
+{
+ memset(m, 0, hsize);
+ msg_set_version(m);
+ msg_set_user(m, user);
+ msg_set_hdr_sz(m, hsize);
+ msg_set_size(m, hsize);
+ msg_set_prevnode(m, own_node);
+ msg_set_type(m, type);
+ if (hsize > SHORT_H_SIZE) {
+ msg_set_orignode(m, own_node);
+ msg_set_destnode(m, dnode);
+ }
+}
+
+struct sk_buff *tipc_msg_create(uint user, uint type,
+ uint hdr_sz, uint data_sz, u32 dnode,
+ u32 onode, u32 dport, u32 oport, int errcode)
+{
+ struct tipc_msg *msg;
+ struct sk_buff *buf;
+
+ buf = tipc_buf_acquire(hdr_sz + data_sz);
+ if (unlikely(!buf))
+ return NULL;
+
+ msg = buf_msg(buf);
+ tipc_msg_init(onode, msg, user, type, hdr_sz, dnode);
+ msg_set_size(msg, hdr_sz + data_sz);
+ msg_set_origport(msg, oport);
+ msg_set_destport(msg, dport);
+ msg_set_errcode(msg, errcode);
+ if (hdr_sz > SHORT_H_SIZE) {
+ msg_set_orignode(msg, onode);
+ msg_set_destnode(msg, dnode);
+ }
+ return buf;
+}
+
+/* tipc_buf_append(): Append a buffer to the fragment list of another buffer
+ * @*headbuf: in: NULL for first frag, otherwise value returned from prev call
+ * out: set when successful non-complete reassembly, otherwise NULL
+ * @*buf: in: the buffer to append. Always defined
+ * out: head buf after successful complete reassembly, otherwise NULL
+ * Returns 1 when reassembly complete, otherwise 0
+ */
+int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
+{
+ struct sk_buff *head = *headbuf;
+ struct sk_buff *frag = *buf;
+ struct sk_buff *tail;
+ struct tipc_msg *msg;
+ u32 fragid;
+ int delta;
+ bool headstolen;
+
+ if (!frag)
+ goto err;
+
+ msg = buf_msg(frag);
+ fragid = msg_type(msg);
+ frag->next = NULL;
+ skb_pull(frag, msg_hdr_sz(msg));
+
+ if (fragid == FIRST_FRAGMENT) {
+ if (unlikely(head))
+ goto err;
+ if (unlikely(skb_unclone(frag, GFP_ATOMIC)))
+ goto err;
+ head = *headbuf = frag;
+ skb_frag_list_init(head);
+ TIPC_SKB_CB(head)->tail = NULL;
+ *buf = NULL;
+ return 0;
+ }
+
+ if (!head)
+ goto err;
+
+ if (skb_try_coalesce(head, frag, &headstolen, &delta)) {
+ kfree_skb_partial(frag, headstolen);
+ } else {
+ tail = TIPC_SKB_CB(head)->tail;
+ if (!skb_has_frag_list(head))
+ skb_shinfo(head)->frag_list = frag;
+ else
+ tail->next = frag;
+ head->truesize += frag->truesize;
+ head->data_len += frag->len;
+ head->len += frag->len;
+ TIPC_SKB_CB(head)->tail = frag;
+ }
+
+ if (fragid == LAST_FRAGMENT) {
+ TIPC_SKB_CB(head)->validated = false;
+ if (unlikely(!tipc_msg_validate(head)))
+ goto err;
+ *buf = head;
+ TIPC_SKB_CB(head)->tail = NULL;
+ *headbuf = NULL;
+ return 1;
+ }
+ *buf = NULL;
+ return 0;
+err:
+ pr_warn_ratelimited("Unable to build fragment list\n");
+ kfree_skb(*buf);
+ kfree_skb(*headbuf);
+ *buf = *headbuf = NULL;
+ return 0;
+}
+
+/* tipc_msg_validate - validate basic format of received message
+ *
+ * This routine ensures a TIPC message has an acceptable header, and at least
+ * as much data as the header indicates it should. The routine also ensures
+ * that the entire message header is stored in the main fragment of the message
+ * buffer, to simplify future access to message header fields.
+ *
+ * Note: Having extra info present in the message header or data areas is OK.
+ * TIPC will ignore the excess, under the assumption that it is optional info
+ * introduced by a later release of the protocol.
+ */
+bool tipc_msg_validate(struct sk_buff *skb)
+{
+ struct tipc_msg *msg;
+ int msz, hsz;
+
+ if (unlikely(TIPC_SKB_CB(skb)->validated))
+ return true;
+ if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE)))
+ return false;
+
+ hsz = msg_hdr_sz(buf_msg(skb));
+ if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE))
+ return false;
+ if (unlikely(!pskb_may_pull(skb, hsz)))
+ return false;
+
+ msg = buf_msg(skb);
+ if (unlikely(msg_version(msg) != TIPC_VERSION))
+ return false;
+
+ msz = msg_size(msg);
+ if (unlikely(msz < hsz))
+ return false;
+ if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE))
+ return false;
+ if (unlikely(skb->len < msz))
+ return false;
+
+ TIPC_SKB_CB(skb)->validated = true;
+ return true;
+}
+
+/**
+ * tipc_msg_build - create buffer chain containing specified header and data
+ * @mhdr: Message header, to be prepended to data
+ * @m: User message
+ * @dsz: Total length of user data
+ * @pktmax: Max packet size that can be used
+ * @list: Buffer or chain of buffers to be returned to caller
+ *
+ * Returns message data size or errno: -ENOMEM, -EFAULT
+ */
+int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
+ int offset, int dsz, int pktmax, struct sk_buff_head *list)
+{
+ int mhsz = msg_hdr_sz(mhdr);
+ int msz = mhsz + dsz;
+ int pktno = 1;
+ int pktsz;
+ int pktrem = pktmax;
+ int drem = dsz;
+ struct tipc_msg pkthdr;
+ struct sk_buff *skb;
+ char *pktpos;
+ int rc;
+
+ msg_set_size(mhdr, msz);
+
+ /* No fragmentation needed? */
+ if (likely(msz <= pktmax)) {
+ skb = tipc_buf_acquire(msz);
+ if (unlikely(!skb))
+ return -ENOMEM;
+ skb_orphan(skb);
+ __skb_queue_tail(list, skb);
+ skb_copy_to_linear_data(skb, mhdr, mhsz);
+ pktpos = skb->data + mhsz;
+ if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz)
+ return dsz;
+ rc = -EFAULT;
+ goto error;
+ }
+
+ /* Prepare reusable fragment header */
+ tipc_msg_init(msg_prevnode(mhdr), &pkthdr, MSG_FRAGMENTER,
+ FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr));
+ msg_set_size(&pkthdr, pktmax);
+ msg_set_fragm_no(&pkthdr, pktno);
+ msg_set_importance(&pkthdr, msg_importance(mhdr));
+
+ /* Prepare first fragment */
+ skb = tipc_buf_acquire(pktmax);
+ if (!skb)
+ return -ENOMEM;
+ skb_orphan(skb);
+ __skb_queue_tail(list, skb);
+ pktpos = skb->data;
+ skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
+ pktpos += INT_H_SIZE;
+ pktrem -= INT_H_SIZE;
+ skb_copy_to_linear_data_offset(skb, INT_H_SIZE, mhdr, mhsz);
+ pktpos += mhsz;
+ pktrem -= mhsz;
+
+ do {
+ if (drem < pktrem)
+ pktrem = drem;
+
+ if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) {
+ rc = -EFAULT;
+ goto error;
+ }
+ drem -= pktrem;
+
+ if (!drem)
+ break;
+
+ /* Prepare new fragment: */
+ if (drem < (pktmax - INT_H_SIZE))
+ pktsz = drem + INT_H_SIZE;
+ else
+ pktsz = pktmax;
+ skb = tipc_buf_acquire(pktsz);
+ if (!skb) {
+ rc = -ENOMEM;
+ goto error;
+ }
+ skb_orphan(skb);
+ __skb_queue_tail(list, skb);
+ msg_set_type(&pkthdr, FRAGMENT);
+ msg_set_size(&pkthdr, pktsz);
+ msg_set_fragm_no(&pkthdr, ++pktno);
+ skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
+ pktpos = skb->data + INT_H_SIZE;
+ pktrem = pktsz - INT_H_SIZE;
+
+ } while (1);
+ msg_set_type(buf_msg(skb), LAST_FRAGMENT);
+ return dsz;
+error:
+ __skb_queue_purge(list);
+ __skb_queue_head_init(list);
+ return rc;
+}
+
+/**
+ * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one
+ * @bskb: the buffer to append to ("bundle")
+ * @skb: buffer to be appended
+ * @mtu: max allowable size for the bundle buffer
+ * Consumes buffer if successful
+ * Returns true if bundling could be performed, otherwise false
+ */
+bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu)
+{
+ struct tipc_msg *bmsg;
+ struct tipc_msg *msg = buf_msg(skb);
+ unsigned int bsz;
+ unsigned int msz = msg_size(msg);
+ u32 start, pad;
+ u32 max = mtu - INT_H_SIZE;
+
+ if (likely(msg_user(msg) == MSG_FRAGMENTER))
+ return false;
+ if (!bskb)
+ return false;
+ bmsg = buf_msg(bskb);
+ bsz = msg_size(bmsg);
+ start = align(bsz);
+ pad = start - bsz;
+
+ if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL))
+ return false;
+ if (unlikely(msg_user(msg) == BCAST_PROTOCOL))
+ return false;
+ if (likely(msg_user(bmsg) != MSG_BUNDLER))
+ return false;
+ if (unlikely(skb_tailroom(bskb) < (pad + msz)))
+ return false;
+ if (unlikely(max < (start + msz)))
+ return false;
+
+ skb_put(bskb, pad + msz);
+ skb_copy_to_linear_data_offset(bskb, start, skb->data, msz);
+ msg_set_size(bmsg, start + msz);
+ msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1);
+ kfree_skb(skb);
+ return true;
+}
+
+/**
+ * tipc_msg_extract(): extract bundled inner packet from buffer
+ * @skb: buffer to be extracted from.
+ * @iskb: extracted inner buffer, to be returned
+ * @pos: position in outer message of msg to be extracted.
+ * Returns position of next msg
+ * Consumes outer buffer when last packet extracted
+ * Returns true when when there is an extracted buffer, otherwise false
+ */
+bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
+{
+ struct tipc_msg *msg;
+ int imsz, offset;
+
+ *iskb = NULL;
+ if (unlikely(skb_linearize(skb)))
+ goto none;
+
+ msg = buf_msg(skb);
+ offset = msg_hdr_sz(msg) + *pos;
+ if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE)))
+ goto none;
+
+ *iskb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!*iskb))
+ goto none;
+ skb_pull(*iskb, offset);
+ imsz = msg_size(buf_msg(*iskb));
+ skb_trim(*iskb, imsz);
+ if (unlikely(!tipc_msg_validate(*iskb)))
+ goto none;
+ *pos += align(imsz);
+ return true;
+none:
+ kfree_skb(skb);
+ kfree_skb(*iskb);
+ *iskb = NULL;
+ return false;
+}
+
+/**
+ * tipc_msg_make_bundle(): Create bundle buf and append message to its tail
+ * @list: the buffer chain
+ * @skb: buffer to be appended and replaced
+ * @mtu: max allowable size for the bundle buffer, inclusive header
+ * @dnode: destination node for message. (Not always present in header)
+ * Replaces buffer if successful
+ * Returns true if success, otherwise false
+ */
+bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode)
+{
+ struct sk_buff *bskb;
+ struct tipc_msg *bmsg;
+ struct tipc_msg *msg = buf_msg(*skb);
+ u32 msz = msg_size(msg);
+ u32 max = mtu - INT_H_SIZE;
+
+ if (msg_user(msg) == MSG_FRAGMENTER)
+ return false;
+ if (msg_user(msg) == TUNNEL_PROTOCOL)
+ return false;
+ if (msg_user(msg) == BCAST_PROTOCOL)
+ return false;
+ if (msz > (max / 2))
+ return false;
+
+ bskb = tipc_buf_acquire(max);
+ if (!bskb)
+ return false;
+
+ skb_trim(bskb, INT_H_SIZE);
+ bmsg = buf_msg(bskb);
+ tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0,
+ INT_H_SIZE, dnode);
+ msg_set_seqno(bmsg, msg_seqno(msg));
+ msg_set_ack(bmsg, msg_ack(msg));
+ msg_set_bcast_ack(bmsg, msg_bcast_ack(msg));
+ tipc_msg_bundle(bskb, *skb, mtu);
+ *skb = bskb;
+ return true;
+}
+
+/**
+ * tipc_msg_reverse(): swap source and destination addresses and add error code
+ * @buf: buffer containing message to be reversed
+ * @dnode: return value: node where to send message after reversal
+ * @err: error code to be set in message
+ * Consumes buffer if failure
+ * Returns true if success, otherwise false
+ */
+bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
+ int err)
+{
+ struct tipc_msg *msg = buf_msg(buf);
+ struct tipc_msg ohdr;
+ uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE);
+
+ if (skb_linearize(buf))
+ goto exit;
+ msg = buf_msg(buf);
+ if (msg_dest_droppable(msg))
+ goto exit;
+ if (msg_errcode(msg))
+ goto exit;
+ memcpy(&ohdr, msg, msg_hdr_sz(msg));
+ msg_set_errcode(msg, err);
+ msg_set_origport(msg, msg_destport(&ohdr));
+ msg_set_destport(msg, msg_origport(&ohdr));
+ msg_set_prevnode(msg, own_addr);
+ if (!msg_short(msg)) {
+ msg_set_orignode(msg, msg_destnode(&ohdr));
+ msg_set_destnode(msg, msg_orignode(&ohdr));
+ }
+ msg_set_size(msg, msg_hdr_sz(msg) + rdsz);
+ skb_trim(buf, msg_size(msg));
+ skb_orphan(buf);
+ *dnode = msg_orignode(&ohdr);
+ return true;
+exit:
+ kfree_skb(buf);
+ *dnode = 0;
+ return false;
+}
+
+/**
+ * tipc_msg_lookup_dest(): try to find new destination for named message
+ * @skb: the buffer containing the message.
+ * @dnode: return value: next-hop node, if destination found
+ * @err: return value: error code to use, if message to be rejected
+ * Does not consume buffer
+ * Returns true if a destination is found, false otherwise
+ */
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb,
+ u32 *dnode, int *err)
+{
+ struct tipc_msg *msg = buf_msg(skb);
+ u32 dport;
+ u32 own_addr = tipc_own_addr(net);
+
+ if (!msg_isdata(msg))
+ return false;
+ if (!msg_named(msg))
+ return false;
+ if (msg_errcode(msg))
+ return false;
+ *err = -TIPC_ERR_NO_NAME;
+ if (skb_linearize(skb))
+ return false;
+ if (msg_reroute_cnt(msg))
+ return false;
+ *dnode = addr_domain(net, msg_lookup_scope(msg));
+ dport = tipc_nametbl_translate(net, msg_nametype(msg),
+ msg_nameinst(msg), dnode);
+ if (!dport)
+ return false;
+ msg_incr_reroute_cnt(msg);
+ if (*dnode != own_addr)
+ msg_set_prevnode(msg, own_addr);
+ msg_set_destnode(msg, *dnode);
+ msg_set_destport(msg, dport);
+ *err = TIPC_OK;
+ return true;
+}
+
+/* tipc_msg_reassemble() - clone a buffer chain of fragments and
+ * reassemble the clones into one message
+ */
+struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+ struct sk_buff *frag = NULL;
+ struct sk_buff *head = NULL;
+ int hdr_sz;
+
+ /* Copy header if single buffer */
+ if (skb_queue_len(list) == 1) {
+ skb = skb_peek(list);
+ hdr_sz = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb));
+ return __pskb_copy(skb, hdr_sz, GFP_ATOMIC);
+ }
+
+ /* Clone all fragments and reassemble */
+ skb_queue_walk(list, skb) {
+ frag = skb_clone(skb, GFP_ATOMIC);
+ if (!frag)
+ goto error;
+ frag->next = NULL;
+ if (tipc_buf_append(&head, &frag))
+ break;
+ if (!head)
+ goto error;
+ }
+ return frag;
+error:
+ pr_warn("Failed do clone local mcast rcv buffer\n");
+ kfree_skb(head);
+ return NULL;
+}
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
new file mode 100644
index 000000000..e1d3595e2
--- /dev/null
+++ b/net/tipc/msg.h
@@ -0,0 +1,873 @@
+/*
+ * net/tipc/msg.h: Include file for TIPC message header routines
+ *
+ * Copyright (c) 2000-2007, 2014-2015 Ericsson AB
+ * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_MSG_H
+#define _TIPC_MSG_H
+
+#include <linux/tipc.h>
+
+/*
+ * Constants and routines used to read and write TIPC payload message headers
+ *
+ * Note: Some items are also used with TIPC internal message headers
+ */
+#define TIPC_VERSION 2
+struct plist;
+
+/*
+ * Payload message users are defined in TIPC's public API:
+ * - TIPC_LOW_IMPORTANCE
+ * - TIPC_MEDIUM_IMPORTANCE
+ * - TIPC_HIGH_IMPORTANCE
+ * - TIPC_CRITICAL_IMPORTANCE
+ */
+#define TIPC_SYSTEM_IMPORTANCE 4
+
+
+/*
+ * Payload message types
+ */
+#define TIPC_CONN_MSG 0
+#define TIPC_MCAST_MSG 1
+#define TIPC_NAMED_MSG 2
+#define TIPC_DIRECT_MSG 3
+
+/*
+ * Internal message users
+ */
+#define BCAST_PROTOCOL 5
+#define MSG_BUNDLER 6
+#define LINK_PROTOCOL 7
+#define CONN_MANAGER 8
+#define TUNNEL_PROTOCOL 10
+#define NAME_DISTRIBUTOR 11
+#define MSG_FRAGMENTER 12
+#define LINK_CONFIG 13
+#define SOCK_WAKEUP 14 /* pseudo user */
+
+/*
+ * Message header sizes
+ */
+#define SHORT_H_SIZE 24 /* In-cluster basic payload message */
+#define BASIC_H_SIZE 32 /* Basic payload message */
+#define NAMED_H_SIZE 40 /* Named payload message */
+#define MCAST_H_SIZE 44 /* Multicast payload message */
+#define INT_H_SIZE 40 /* Internal messages */
+#define MIN_H_SIZE 24 /* Smallest legal TIPC header size */
+#define MAX_H_SIZE 60 /* Largest possible TIPC header size */
+
+#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
+
+#define TIPC_MEDIA_INFO_OFFSET 5
+
+/**
+ * TIPC message buffer code
+ *
+ * TIPC message buffer headroom reserves space for the worst-case
+ * link-level device header (in case the message is sent off-node).
+ *
+ * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields
+ * are word aligned for quicker access
+ */
+#define BUF_HEADROOM (LL_MAX_HEADER + 48)
+
+struct tipc_skb_cb {
+ void *handle;
+ struct sk_buff *tail;
+ bool validated;
+ bool wakeup_pending;
+ bool bundling;
+ u16 chain_sz;
+ u16 chain_imp;
+};
+
+#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
+
+struct tipc_msg {
+ __be32 hdr[15];
+};
+
+static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
+{
+ return (struct tipc_msg *)skb->data;
+}
+
+static inline u32 msg_word(struct tipc_msg *m, u32 pos)
+{
+ return ntohl(m->hdr[pos]);
+}
+
+static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val)
+{
+ m->hdr[w] = htonl(val);
+}
+
+static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask)
+{
+ return (msg_word(m, w) >> pos) & mask;
+}
+
+static inline void msg_set_bits(struct tipc_msg *m, u32 w,
+ u32 pos, u32 mask, u32 val)
+{
+ val = (val & mask) << pos;
+ mask = mask << pos;
+ m->hdr[w] &= ~htonl(mask);
+ m->hdr[w] |= htonl(val);
+}
+
+static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b)
+{
+ u32 temp = msg->hdr[a];
+
+ msg->hdr[a] = msg->hdr[b];
+ msg->hdr[b] = temp;
+}
+
+/*
+ * Word 0
+ */
+static inline u32 msg_version(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 29, 7);
+}
+
+static inline void msg_set_version(struct tipc_msg *m)
+{
+ msg_set_bits(m, 0, 29, 7, TIPC_VERSION);
+}
+
+static inline u32 msg_user(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 25, 0xf);
+}
+
+static inline u32 msg_isdata(struct tipc_msg *m)
+{
+ return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE;
+}
+
+static inline void msg_set_user(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 0, 25, 0xf, n);
+}
+
+static inline u32 msg_hdr_sz(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 21, 0xf) << 2;
+}
+
+static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 0, 21, 0xf, n>>2);
+}
+
+static inline u32 msg_size(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 0, 0x1ffff);
+}
+
+static inline u32 msg_data_sz(struct tipc_msg *m)
+{
+ return msg_size(m) - msg_hdr_sz(m);
+}
+
+static inline int msg_non_seq(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 20, 1);
+}
+
+static inline void msg_set_non_seq(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 0, 20, 1, n);
+}
+
+static inline int msg_dest_droppable(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 19, 1);
+}
+
+static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d)
+{
+ msg_set_bits(m, 0, 19, 1, d);
+}
+
+static inline int msg_src_droppable(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 18, 1);
+}
+
+static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d)
+{
+ msg_set_bits(m, 0, 18, 1, d);
+}
+
+static inline void msg_set_size(struct tipc_msg *m, u32 sz)
+{
+ m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz);
+}
+
+static inline unchar *msg_data(struct tipc_msg *m)
+{
+ return ((unchar *)m) + msg_hdr_sz(m);
+}
+
+static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
+{
+ return (struct tipc_msg *)msg_data(m);
+}
+
+/*
+ * Word 1
+ */
+static inline u32 msg_type(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 29, 0x7);
+}
+
+static inline void msg_set_type(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 29, 0x7, n);
+}
+
+static inline u32 msg_named(struct tipc_msg *m)
+{
+ return msg_type(m) == TIPC_NAMED_MSG;
+}
+
+static inline u32 msg_mcast(struct tipc_msg *m)
+{
+ return msg_type(m) == TIPC_MCAST_MSG;
+}
+
+static inline u32 msg_connected(struct tipc_msg *m)
+{
+ return msg_type(m) == TIPC_CONN_MSG;
+}
+
+static inline u32 msg_errcode(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 25, 0xf);
+}
+
+static inline void msg_set_errcode(struct tipc_msg *m, u32 err)
+{
+ msg_set_bits(m, 1, 25, 0xf, err);
+}
+
+static inline u32 msg_reroute_cnt(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 21, 0xf);
+}
+
+static inline void msg_incr_reroute_cnt(struct tipc_msg *m)
+{
+ msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1);
+}
+
+static inline void msg_reset_reroute_cnt(struct tipc_msg *m)
+{
+ msg_set_bits(m, 1, 21, 0xf, 0);
+}
+
+static inline u32 msg_lookup_scope(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 19, 0x3);
+}
+
+static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 19, 0x3, n);
+}
+
+static inline u32 msg_bcast_ack(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 0, 0xffff);
+}
+
+static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 0, 0xffff, n);
+}
+
+
+/*
+ * Word 2
+ */
+static inline u32 msg_ack(struct tipc_msg *m)
+{
+ return msg_bits(m, 2, 16, 0xffff);
+}
+
+static inline void msg_set_ack(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 2, 16, 0xffff, n);
+}
+
+static inline u32 msg_seqno(struct tipc_msg *m)
+{
+ return msg_bits(m, 2, 0, 0xffff);
+}
+
+static inline void msg_set_seqno(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 2, 0, 0xffff, n);
+}
+
+/*
+ * Words 3-10
+ */
+static inline u32 msg_importance(struct tipc_msg *m)
+{
+ if (unlikely(msg_user(m) == MSG_FRAGMENTER))
+ return msg_bits(m, 5, 13, 0x7);
+ if (likely(msg_isdata(m) && !msg_errcode(m)))
+ return msg_user(m);
+ return TIPC_SYSTEM_IMPORTANCE;
+}
+
+static inline void msg_set_importance(struct tipc_msg *m, u32 i)
+{
+ if (unlikely(msg_user(m) == MSG_FRAGMENTER))
+ msg_set_bits(m, 5, 13, 0x7, i);
+ else if (likely(i < TIPC_SYSTEM_IMPORTANCE))
+ msg_set_user(m, i);
+ else
+ pr_warn("Trying to set illegal importance in message\n");
+}
+
+static inline u32 msg_prevnode(struct tipc_msg *m)
+{
+ return msg_word(m, 3);
+}
+
+static inline void msg_set_prevnode(struct tipc_msg *m, u32 a)
+{
+ msg_set_word(m, 3, a);
+}
+
+static inline u32 msg_origport(struct tipc_msg *m)
+{
+ if (msg_user(m) == MSG_FRAGMENTER)
+ m = msg_get_wrapped(m);
+ return msg_word(m, 4);
+}
+
+static inline void msg_set_origport(struct tipc_msg *m, u32 p)
+{
+ msg_set_word(m, 4, p);
+}
+
+static inline u32 msg_destport(struct tipc_msg *m)
+{
+ return msg_word(m, 5);
+}
+
+static inline void msg_set_destport(struct tipc_msg *m, u32 p)
+{
+ msg_set_word(m, 5, p);
+}
+
+static inline u32 msg_mc_netid(struct tipc_msg *m)
+{
+ return msg_word(m, 5);
+}
+
+static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p)
+{
+ msg_set_word(m, 5, p);
+}
+
+static inline int msg_short(struct tipc_msg *m)
+{
+ return msg_hdr_sz(m) == SHORT_H_SIZE;
+}
+
+static inline u32 msg_orignode(struct tipc_msg *m)
+{
+ if (likely(msg_short(m)))
+ return msg_prevnode(m);
+ return msg_word(m, 6);
+}
+
+static inline void msg_set_orignode(struct tipc_msg *m, u32 a)
+{
+ msg_set_word(m, 6, a);
+}
+
+static inline u32 msg_destnode(struct tipc_msg *m)
+{
+ return msg_word(m, 7);
+}
+
+static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
+{
+ msg_set_word(m, 7, a);
+}
+
+static inline u32 msg_nametype(struct tipc_msg *m)
+{
+ return msg_word(m, 8);
+}
+
+static inline void msg_set_nametype(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 8, n);
+}
+
+static inline u32 msg_nameinst(struct tipc_msg *m)
+{
+ return msg_word(m, 9);
+}
+
+static inline u32 msg_namelower(struct tipc_msg *m)
+{
+ return msg_nameinst(m);
+}
+
+static inline void msg_set_namelower(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 9, n);
+}
+
+static inline void msg_set_nameinst(struct tipc_msg *m, u32 n)
+{
+ msg_set_namelower(m, n);
+}
+
+static inline u32 msg_nameupper(struct tipc_msg *m)
+{
+ return msg_word(m, 10);
+}
+
+static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 10, n);
+}
+
+/*
+ * Constants and routines used to read and write TIPC internal message headers
+ */
+
+/*
+ * Connection management protocol message types
+ */
+#define CONN_PROBE 0
+#define CONN_PROBE_REPLY 1
+#define CONN_ACK 2
+
+/*
+ * Name distributor message types
+ */
+#define PUBLICATION 0
+#define WITHDRAWAL 1
+
+/*
+ * Segmentation message types
+ */
+#define FIRST_FRAGMENT 0
+#define FRAGMENT 1
+#define LAST_FRAGMENT 2
+
+/*
+ * Link management protocol message types
+ */
+#define STATE_MSG 0
+#define RESET_MSG 1
+#define ACTIVATE_MSG 2
+
+/*
+ * Changeover tunnel message types
+ */
+#define SYNCH_MSG 0
+#define FAILOVER_MSG 1
+
+/*
+ * Config protocol message types
+ */
+#define DSC_REQ_MSG 0
+#define DSC_RESP_MSG 1
+
+/*
+ * Word 1
+ */
+static inline u32 msg_seq_gap(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 16, 0x1fff);
+}
+
+static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 16, 0x1fff, n);
+}
+
+static inline u32 msg_node_sig(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 0, 0xffff);
+}
+
+static inline void msg_set_node_sig(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 0, 0xffff, n);
+}
+
+static inline u32 msg_node_capabilities(struct tipc_msg *m)
+{
+ return msg_bits(m, 1, 15, 0x1fff);
+}
+
+static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 1, 15, 0x1fff, n);
+}
+
+static inline bool msg_dup(struct tipc_msg *m)
+{
+ if (likely(msg_user(m) != TUNNEL_PROTOCOL))
+ return false;
+ if (msg_type(m) != SYNCH_MSG)
+ return false;
+ return true;
+}
+
+/*
+ * Word 2
+ */
+static inline u32 msg_dest_domain(struct tipc_msg *m)
+{
+ return msg_word(m, 2);
+}
+
+static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 2, n);
+}
+
+static inline u32 msg_bcgap_after(struct tipc_msg *m)
+{
+ return msg_bits(m, 2, 16, 0xffff);
+}
+
+static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 2, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcgap_to(struct tipc_msg *m)
+{
+ return msg_bits(m, 2, 0, 0xffff);
+}
+
+static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 2, 0, 0xffff, n);
+}
+
+
+/*
+ * Word 4
+ */
+static inline u32 msg_last_bcast(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 16, 0xffff);
+}
+
+static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 16, 0xffff, n);
+}
+
+
+static inline u32 msg_next_sent(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+static inline u32 msg_bc_netid(struct tipc_msg *m)
+{
+ return msg_word(m, 4);
+}
+
+static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
+{
+ msg_set_word(m, 4, id);
+}
+
+static inline u32 msg_link_selector(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 0, 1);
+}
+
+static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 0, 1, n);
+}
+
+/*
+ * Word 5
+ */
+static inline u32 msg_session(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 16, 0xffff);
+}
+
+static inline void msg_set_session(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 5, 16, 0xffff, n);
+}
+
+static inline u32 msg_probe(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 0, 1);
+}
+
+static inline void msg_set_probe(struct tipc_msg *m, u32 val)
+{
+ msg_set_bits(m, 5, 0, 1, val);
+}
+
+static inline char msg_net_plane(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 1, 7) + 'A';
+}
+
+static inline void msg_set_net_plane(struct tipc_msg *m, char n)
+{
+ msg_set_bits(m, 5, 1, 7, (n - 'A'));
+}
+
+static inline u32 msg_linkprio(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 4, 0x1f);
+}
+
+static inline void msg_set_linkprio(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 5, 4, 0x1f, n);
+}
+
+static inline u32 msg_bearer_id(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 9, 0x7);
+}
+
+static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 5, 9, 0x7, n);
+}
+
+static inline u32 msg_redundant_link(struct tipc_msg *m)
+{
+ return msg_bits(m, 5, 12, 0x1);
+}
+
+static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r)
+{
+ msg_set_bits(m, 5, 12, 0x1, r);
+}
+
+static inline char *msg_media_addr(struct tipc_msg *m)
+{
+ return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET];
+}
+
+/*
+ * Word 9
+ */
+static inline u32 msg_msgcnt(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_bcast_tag(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
+static inline u32 msg_max_pkt(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 16, 0xffff) * 4;
+}
+
+static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 9, 16, 0xffff, (n / 4));
+}
+
+static inline u32 msg_link_tolerance(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 0, 0xffff);
+}
+
+static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 9, 0, 0xffff, n);
+}
+
+struct sk_buff *tipc_buf_acquire(u32 size);
+bool tipc_msg_validate(struct sk_buff *skb);
+bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode,
+ int err);
+void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
+ u32 hsize, u32 destnode);
+struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz,
+ uint data_sz, u32 dnode, u32 onode,
+ u32 dport, u32 oport, int errcode);
+int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf);
+bool tipc_msg_bundle(struct sk_buff *bskb, struct sk_buff *skb, u32 mtu);
+
+bool tipc_msg_make_bundle(struct sk_buff **skb, u32 mtu, u32 dnode);
+bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
+int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
+ int offset, int dsz, int mtu, struct sk_buff_head *list);
+bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode,
+ int *err);
+struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list);
+
+/* tipc_skb_peek(): peek and reserve first buffer in list
+ * @list: list to be peeked in
+ * Returns pointer to first buffer in list, if any
+ */
+static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list,
+ spinlock_t *lock)
+{
+ struct sk_buff *skb;
+
+ spin_lock_bh(lock);
+ skb = skb_peek(list);
+ if (skb)
+ skb_get(skb);
+ spin_unlock_bh(lock);
+ return skb;
+}
+
+/* tipc_skb_peek_port(): find a destination port, ignoring all destinations
+ * up to and including 'filter'.
+ * Note: ignoring previously tried destinations minimizes the risk of
+ * contention on the socket lock
+ * @list: list to be peeked in
+ * @filter: last destination to be ignored from search
+ * Returns a destination port number, of applicable.
+ */
+static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter)
+{
+ struct sk_buff *skb;
+ u32 dport = 0;
+ bool ignore = true;
+
+ spin_lock_bh(&list->lock);
+ skb_queue_walk(list, skb) {
+ dport = msg_destport(buf_msg(skb));
+ if (!filter || skb_queue_is_last(list, skb))
+ break;
+ if (dport == filter)
+ ignore = false;
+ else if (!ignore)
+ break;
+ }
+ spin_unlock_bh(&list->lock);
+ return dport;
+}
+
+/* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list
+ * @list: list to be unlinked from
+ * @dport: selection criteria for buffer to unlink
+ */
+static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list,
+ u32 dport)
+{
+ struct sk_buff *_skb, *tmp, *skb = NULL;
+
+ spin_lock_bh(&list->lock);
+ skb_queue_walk_safe(list, _skb, tmp) {
+ if (msg_destport(buf_msg(_skb)) == dport) {
+ __skb_unlink(_skb, list);
+ skb = _skb;
+ break;
+ }
+ }
+ spin_unlock_bh(&list->lock);
+ return skb;
+}
+
+/* tipc_skb_queue_tail(): add buffer to tail of list;
+ * @list: list to be appended to
+ * @skb: buffer to append. Always appended
+ * @dport: the destination port of the buffer
+ * returns true if dport differs from previous destination
+ */
+static inline bool tipc_skb_queue_tail(struct sk_buff_head *list,
+ struct sk_buff *skb, u32 dport)
+{
+ struct sk_buff *_skb = NULL;
+ bool rv = false;
+
+ spin_lock_bh(&list->lock);
+ _skb = skb_peek_tail(list);
+ if (!_skb || (msg_destport(buf_msg(_skb)) != dport) ||
+ (skb_queue_len(list) > 32))
+ rv = true;
+ __skb_queue_tail(list, skb);
+ spin_unlock_bh(&list->lock);
+ return rv;
+}
+
+#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
new file mode 100644
index 000000000..41e7b7e4d
--- /dev/null
+++ b/net/tipc/name_distr.c
@@ -0,0 +1,437 @@
+/*
+ * net/tipc/name_distr.c: TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, 2014, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "name_distr.h"
+
+int sysctl_tipc_named_timeout __read_mostly = 2000;
+
+/**
+ * struct tipc_dist_queue - queue holding deferred name table updates
+ */
+static struct list_head tipc_dist_queue = LIST_HEAD_INIT(tipc_dist_queue);
+
+struct distr_queue_item {
+ struct distr_item i;
+ u32 dtype;
+ u32 node;
+ unsigned long expires;
+ struct list_head next;
+};
+
+/**
+ * publ_to_item - add publication info to a publication message
+ */
+static void publ_to_item(struct distr_item *i, struct publication *p)
+{
+ i->type = htonl(p->type);
+ i->lower = htonl(p->lower);
+ i->upper = htonl(p->upper);
+ i->ref = htonl(p->ref);
+ i->key = htonl(p->key);
+}
+
+/**
+ * named_prepare_buf - allocate & initialize a publication message
+ */
+static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
+ u32 dest)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size);
+ struct tipc_msg *msg;
+
+ if (buf != NULL) {
+ msg = buf_msg(buf);
+ tipc_msg_init(tn->own_addr, msg, NAME_DISTRIBUTOR, type,
+ INT_H_SIZE, dest);
+ msg_set_size(msg, INT_H_SIZE + size);
+ }
+ return buf;
+}
+
+void named_cluster_distribute(struct net *net, struct sk_buff *skb)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sk_buff *oskb;
+ struct tipc_node *node;
+ u32 dnode;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(node, &tn->node_list, list) {
+ dnode = node->addr;
+ if (in_own_node(net, dnode))
+ continue;
+ if (!tipc_node_active_links(node))
+ continue;
+ oskb = pskb_copy(skb, GFP_ATOMIC);
+ if (!oskb)
+ break;
+ msg_set_destnode(buf_msg(oskb), dnode);
+ tipc_link_xmit_skb(net, oskb, dnode, dnode);
+ }
+ rcu_read_unlock();
+
+ kfree_skb(skb);
+}
+
+/**
+ * tipc_named_publish - tell other nodes about a new publication by this node
+ */
+struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sk_buff *buf;
+ struct distr_item *item;
+
+ list_add_tail_rcu(&publ->local_list,
+ &tn->nametbl->publ_list[publ->scope]);
+
+ if (publ->scope == TIPC_NODE_SCOPE)
+ return NULL;
+
+ buf = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
+ if (!buf) {
+ pr_warn("Publication distribution failure\n");
+ return NULL;
+ }
+
+ item = (struct distr_item *)msg_data(buf_msg(buf));
+ publ_to_item(item, publ);
+ return buf;
+}
+
+/**
+ * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node
+ */
+struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
+{
+ struct sk_buff *buf;
+ struct distr_item *item;
+
+ list_del(&publ->local_list);
+
+ if (publ->scope == TIPC_NODE_SCOPE)
+ return NULL;
+
+ buf = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0);
+ if (!buf) {
+ pr_warn("Withdrawal distribution failure\n");
+ return NULL;
+ }
+
+ item = (struct distr_item *)msg_data(buf_msg(buf));
+ publ_to_item(item, publ);
+ return buf;
+}
+
+/**
+ * named_distribute - prepare name info for bulk distribution to another node
+ * @list: list of messages (buffers) to be returned from this function
+ * @dnode: node to be updated
+ * @pls: linked list of publication items to be packed into buffer chain
+ */
+static void named_distribute(struct net *net, struct sk_buff_head *list,
+ u32 dnode, struct list_head *pls)
+{
+ struct publication *publ;
+ struct sk_buff *skb = NULL;
+ struct distr_item *item = NULL;
+ uint msg_dsz = (tipc_node_get_mtu(net, dnode, 0) / ITEM_SIZE) *
+ ITEM_SIZE;
+ uint msg_rem = msg_dsz;
+
+ list_for_each_entry(publ, pls, local_list) {
+ /* Prepare next buffer: */
+ if (!skb) {
+ skb = named_prepare_buf(net, PUBLICATION, msg_rem,
+ dnode);
+ if (!skb) {
+ pr_warn("Bulk publication failure\n");
+ return;
+ }
+ item = (struct distr_item *)msg_data(buf_msg(skb));
+ }
+
+ /* Pack publication into message: */
+ publ_to_item(item, publ);
+ item++;
+ msg_rem -= ITEM_SIZE;
+
+ /* Append full buffer to list: */
+ if (!msg_rem) {
+ __skb_queue_tail(list, skb);
+ skb = NULL;
+ msg_rem = msg_dsz;
+ }
+ }
+ if (skb) {
+ msg_set_size(buf_msg(skb), INT_H_SIZE + (msg_dsz - msg_rem));
+ skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem));
+ __skb_queue_tail(list, skb);
+ }
+}
+
+/**
+ * tipc_named_node_up - tell specified node about all publications by this node
+ */
+void tipc_named_node_up(struct net *net, u32 dnode)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sk_buff_head head;
+
+ __skb_queue_head_init(&head);
+
+ rcu_read_lock();
+ named_distribute(net, &head, dnode,
+ &tn->nametbl->publ_list[TIPC_CLUSTER_SCOPE]);
+ named_distribute(net, &head, dnode,
+ &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]);
+ rcu_read_unlock();
+
+ tipc_link_xmit(net, &head, dnode, dnode);
+}
+
+static void tipc_publ_subscribe(struct net *net, struct publication *publ,
+ u32 addr)
+{
+ struct tipc_node *node;
+
+ if (in_own_node(net, addr))
+ return;
+
+ node = tipc_node_find(net, addr);
+ if (!node) {
+ pr_warn("Node subscription rejected, unknown node 0x%x\n",
+ addr);
+ return;
+ }
+
+ tipc_node_lock(node);
+ list_add_tail(&publ->nodesub_list, &node->publ_list);
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+}
+
+static void tipc_publ_unsubscribe(struct net *net, struct publication *publ,
+ u32 addr)
+{
+ struct tipc_node *node;
+
+ node = tipc_node_find(net, addr);
+ if (!node)
+ return;
+
+ tipc_node_lock(node);
+ list_del_init(&publ->nodesub_list);
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+}
+
+/**
+ * tipc_publ_purge - remove publication associated with a failed node
+ *
+ * Invoked for each publication issued by a newly failed node.
+ * Removes publication structure from name table & deletes it.
+ */
+static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct publication *p;
+
+ spin_lock_bh(&tn->nametbl_lock);
+ p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
+ publ->node, publ->ref, publ->key);
+ if (p)
+ tipc_publ_unsubscribe(net, p, addr);
+ spin_unlock_bh(&tn->nametbl_lock);
+
+ if (p != publ) {
+ pr_err("Unable to remove publication from failed node\n"
+ " (type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n",
+ publ->type, publ->lower, publ->node, publ->ref,
+ publ->key);
+ }
+
+ kfree_rcu(p, rcu);
+}
+
+void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr)
+{
+ struct publication *publ, *tmp;
+
+ list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list)
+ tipc_publ_purge(net, publ, addr);
+}
+
+/**
+ * tipc_update_nametbl - try to process a nametable update and notify
+ * subscribers
+ *
+ * tipc_nametbl_lock must be held.
+ * Returns the publication item if successful, otherwise NULL.
+ */
+static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
+ u32 node, u32 dtype)
+{
+ struct publication *publ = NULL;
+
+ if (dtype == PUBLICATION) {
+ publ = tipc_nametbl_insert_publ(net, ntohl(i->type),
+ ntohl(i->lower),
+ ntohl(i->upper),
+ TIPC_CLUSTER_SCOPE, node,
+ ntohl(i->ref), ntohl(i->key));
+ if (publ) {
+ tipc_publ_subscribe(net, publ, node);
+ return true;
+ }
+ } else if (dtype == WITHDRAWAL) {
+ publ = tipc_nametbl_remove_publ(net, ntohl(i->type),
+ ntohl(i->lower),
+ node, ntohl(i->ref),
+ ntohl(i->key));
+ if (publ) {
+ tipc_publ_unsubscribe(net, publ, node);
+ kfree_rcu(publ, rcu);
+ return true;
+ }
+ } else {
+ pr_warn("Unrecognized name table message received\n");
+ }
+ return false;
+}
+
+/**
+ * tipc_named_add_backlog - add a failed name table update to the backlog
+ *
+ */
+static void tipc_named_add_backlog(struct distr_item *i, u32 type, u32 node)
+{
+ struct distr_queue_item *e;
+ unsigned long now = get_jiffies_64();
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC);
+ if (!e)
+ return;
+ e->dtype = type;
+ e->node = node;
+ e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout);
+ memcpy(e, i, sizeof(*i));
+ list_add_tail(&e->next, &tipc_dist_queue);
+}
+
+/**
+ * tipc_named_process_backlog - try to process any pending name table updates
+ * from the network.
+ */
+void tipc_named_process_backlog(struct net *net)
+{
+ struct distr_queue_item *e, *tmp;
+ char addr[16];
+ unsigned long now = get_jiffies_64();
+
+ list_for_each_entry_safe(e, tmp, &tipc_dist_queue, next) {
+ if (time_after(e->expires, now)) {
+ if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype))
+ continue;
+ } else {
+ tipc_addr_string_fill(addr, e->node);
+ pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n",
+ e->dtype, ntohl(e->i.type),
+ ntohl(e->i.lower),
+ ntohl(e->i.upper),
+ addr, ntohl(e->i.key));
+ }
+ list_del(&e->next);
+ kfree(e);
+ }
+}
+
+/**
+ * tipc_named_rcv - process name table update messages sent by another node
+ */
+void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_msg *msg;
+ struct distr_item *item;
+ uint count;
+ u32 node;
+ struct sk_buff *skb;
+ int mtype;
+
+ spin_lock_bh(&tn->nametbl_lock);
+ for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) {
+ msg = buf_msg(skb);
+ mtype = msg_type(msg);
+ item = (struct distr_item *)msg_data(msg);
+ count = msg_data_sz(msg) / ITEM_SIZE;
+ node = msg_orignode(msg);
+ while (count--) {
+ if (!tipc_update_nametbl(net, item, node, mtype))
+ tipc_named_add_backlog(item, mtype, node);
+ item++;
+ }
+ kfree_skb(skb);
+ tipc_named_process_backlog(net);
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+}
+
+/**
+ * tipc_named_reinit - re-initialize local publications
+ *
+ * This routine is called whenever TIPC networking is enabled.
+ * All name table entries published by this node are updated to reflect
+ * the node's new network address.
+ */
+void tipc_named_reinit(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct publication *publ;
+ int scope;
+
+ spin_lock_bh(&tn->nametbl_lock);
+
+ for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++)
+ list_for_each_entry_rcu(publ, &tn->nametbl->publ_list[scope],
+ local_list)
+ publ->node = tn->own_addr;
+
+ spin_unlock_bh(&tn->nametbl_lock);
+}
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
new file mode 100644
index 000000000..dd2d9fd80
--- /dev/null
+++ b/net/tipc/name_distr.h
@@ -0,0 +1,79 @@
+/*
+ * net/tipc/name_distr.h: Include file for TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_DISTR_H
+#define _TIPC_NAME_DISTR_H
+
+#include "name_table.h"
+
+#define ITEM_SIZE sizeof(struct distr_item)
+
+/**
+ * struct distr_item - publication info distributed to other nodes
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @ref: publishing port reference
+ * @key: publication key
+ *
+ * ===> All fields are stored in network byte order. <===
+ *
+ * First 3 fields identify (name or) name sequence being published.
+ * Reference field uniquely identifies port that published name sequence.
+ * Key field uniquely identifies publication, in the event a port has
+ * multiple publications of the same name sequence.
+ *
+ * Note: There is no field that identifies the publishing node because it is
+ * the same for all items contained within a publication message.
+ */
+struct distr_item {
+ __be32 type;
+ __be32 lower;
+ __be32 upper;
+ __be32 ref;
+ __be32 key;
+};
+
+struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ);
+struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ);
+void named_cluster_distribute(struct net *net, struct sk_buff *buf);
+void tipc_named_node_up(struct net *net, u32 dnode);
+void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue);
+void tipc_named_reinit(struct net *net);
+void tipc_named_process_backlog(struct net *net);
+void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr);
+
+#endif
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
new file mode 100644
index 000000000..ab0ac62a1
--- /dev/null
+++ b/net/tipc/name_table.c
@@ -0,0 +1,1070 @@
+/*
+ * net/tipc/name_table.c: TIPC name table code
+ *
+ * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004-2008, 2010-2014, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/sock.h>
+#include "core.h"
+#include "netlink.h"
+#include "name_table.h"
+#include "name_distr.h"
+#include "subscr.h"
+#include "bcast.h"
+#include "addr.h"
+#include <net/genetlink.h>
+
+#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
+
+static const struct nla_policy
+tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
+ [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
+};
+
+/**
+ * struct name_info - name sequence publication info
+ * @node_list: circular list of publications made by own node
+ * @cluster_list: circular list of publications made by own cluster
+ * @zone_list: circular list of publications made by own zone
+ * @node_list_size: number of entries in "node_list"
+ * @cluster_list_size: number of entries in "cluster_list"
+ * @zone_list_size: number of entries in "zone_list"
+ *
+ * Note: The zone list always contains at least one entry, since all
+ * publications of the associated name sequence belong to it.
+ * (The cluster and node lists may be empty.)
+ */
+struct name_info {
+ struct list_head node_list;
+ struct list_head cluster_list;
+ struct list_head zone_list;
+ u32 node_list_size;
+ u32 cluster_list_size;
+ u32 zone_list_size;
+};
+
+/**
+ * struct sub_seq - container for all published instances of a name sequence
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @info: pointer to name sequence publication info
+ */
+struct sub_seq {
+ u32 lower;
+ u32 upper;
+ struct name_info *info;
+};
+
+/**
+ * struct name_seq - container for all published instances of a name type
+ * @type: 32 bit 'type' value for name sequence
+ * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type';
+ * sub-sequences are sorted in ascending order
+ * @alloc: number of sub-sequences currently in array
+ * @first_free: array index of first unused sub-sequence entry
+ * @ns_list: links to adjacent name sequences in hash chain
+ * @subscriptions: list of subscriptions for this 'type'
+ * @lock: spinlock controlling access to publication lists of all sub-sequences
+ * @rcu: RCU callback head used for deferred freeing
+ */
+struct name_seq {
+ u32 type;
+ struct sub_seq *sseqs;
+ u32 alloc;
+ u32 first_free;
+ struct hlist_node ns_list;
+ struct list_head subscriptions;
+ spinlock_t lock;
+ struct rcu_head rcu;
+};
+
+static int hash(int x)
+{
+ return x & (TIPC_NAMETBL_SIZE - 1);
+}
+
+/**
+ * publ_create - create a publication structure
+ */
+static struct publication *publ_create(u32 type, u32 lower, u32 upper,
+ u32 scope, u32 node, u32 port_ref,
+ u32 key)
+{
+ struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
+ if (publ == NULL) {
+ pr_warn("Publication creation failure, no memory\n");
+ return NULL;
+ }
+
+ publ->type = type;
+ publ->lower = lower;
+ publ->upper = upper;
+ publ->scope = scope;
+ publ->node = node;
+ publ->ref = port_ref;
+ publ->key = key;
+ INIT_LIST_HEAD(&publ->pport_list);
+ return publ;
+}
+
+/**
+ * tipc_subseq_alloc - allocate a specified number of sub-sequence structures
+ */
+static struct sub_seq *tipc_subseq_alloc(u32 cnt)
+{
+ return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC);
+}
+
+/**
+ * tipc_nameseq_create - create a name sequence structure for the specified 'type'
+ *
+ * Allocates a single sub-sequence structure and sets it to all 0's.
+ */
+static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head)
+{
+ struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC);
+ struct sub_seq *sseq = tipc_subseq_alloc(1);
+
+ if (!nseq || !sseq) {
+ pr_warn("Name sequence creation failed, no memory\n");
+ kfree(nseq);
+ kfree(sseq);
+ return NULL;
+ }
+
+ spin_lock_init(&nseq->lock);
+ nseq->type = type;
+ nseq->sseqs = sseq;
+ nseq->alloc = 1;
+ INIT_HLIST_NODE(&nseq->ns_list);
+ INIT_LIST_HEAD(&nseq->subscriptions);
+ hlist_add_head_rcu(&nseq->ns_list, seq_head);
+ return nseq;
+}
+
+/**
+ * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
+ *
+ * Very time-critical, so binary searches through sub-sequence array.
+ */
+static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq,
+ u32 instance)
+{
+ struct sub_seq *sseqs = nseq->sseqs;
+ int low = 0;
+ int high = nseq->first_free - 1;
+ int mid;
+
+ while (low <= high) {
+ mid = (low + high) / 2;
+ if (instance < sseqs[mid].lower)
+ high = mid - 1;
+ else if (instance > sseqs[mid].upper)
+ low = mid + 1;
+ else
+ return &sseqs[mid];
+ }
+ return NULL;
+}
+
+/**
+ * nameseq_locate_subseq - determine position of name instance in sub-sequence
+ *
+ * Returns index in sub-sequence array of the entry that contains the specified
+ * instance value; if no entry contains that value, returns the position
+ * where a new entry for it would be inserted in the array.
+ *
+ * Note: Similar to binary search code for locating a sub-sequence.
+ */
+static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance)
+{
+ struct sub_seq *sseqs = nseq->sseqs;
+ int low = 0;
+ int high = nseq->first_free - 1;
+ int mid;
+
+ while (low <= high) {
+ mid = (low + high) / 2;
+ if (instance < sseqs[mid].lower)
+ high = mid - 1;
+ else if (instance > sseqs[mid].upper)
+ low = mid + 1;
+ else
+ return mid;
+ }
+ return low;
+}
+
+/**
+ * tipc_nameseq_insert_publ
+ */
+static struct publication *tipc_nameseq_insert_publ(struct net *net,
+ struct name_seq *nseq,
+ u32 type, u32 lower,
+ u32 upper, u32 scope,
+ u32 node, u32 port, u32 key)
+{
+ struct tipc_subscription *s;
+ struct tipc_subscription *st;
+ struct publication *publ;
+ struct sub_seq *sseq;
+ struct name_info *info;
+ int created_subseq = 0;
+
+ sseq = nameseq_find_subseq(nseq, lower);
+ if (sseq) {
+
+ /* Lower end overlaps existing entry => need an exact match */
+ if ((sseq->lower != lower) || (sseq->upper != upper)) {
+ return NULL;
+ }
+
+ info = sseq->info;
+
+ /* Check if an identical publication already exists */
+ list_for_each_entry(publ, &info->zone_list, zone_list) {
+ if ((publ->ref == port) && (publ->key == key) &&
+ (!publ->node || (publ->node == node)))
+ return NULL;
+ }
+ } else {
+ u32 inspos;
+ struct sub_seq *freesseq;
+
+ /* Find where lower end should be inserted */
+ inspos = nameseq_locate_subseq(nseq, lower);
+
+ /* Fail if upper end overlaps into an existing entry */
+ if ((inspos < nseq->first_free) &&
+ (upper >= nseq->sseqs[inspos].lower)) {
+ return NULL;
+ }
+
+ /* Ensure there is space for new sub-sequence */
+ if (nseq->first_free == nseq->alloc) {
+ struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2);
+
+ if (!sseqs) {
+ pr_warn("Cannot publish {%u,%u,%u}, no memory\n",
+ type, lower, upper);
+ return NULL;
+ }
+ memcpy(sseqs, nseq->sseqs,
+ nseq->alloc * sizeof(struct sub_seq));
+ kfree(nseq->sseqs);
+ nseq->sseqs = sseqs;
+ nseq->alloc *= 2;
+ }
+
+ info = kzalloc(sizeof(*info), GFP_ATOMIC);
+ if (!info) {
+ pr_warn("Cannot publish {%u,%u,%u}, no memory\n",
+ type, lower, upper);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&info->node_list);
+ INIT_LIST_HEAD(&info->cluster_list);
+ INIT_LIST_HEAD(&info->zone_list);
+
+ /* Insert new sub-sequence */
+ sseq = &nseq->sseqs[inspos];
+ freesseq = &nseq->sseqs[nseq->first_free];
+ memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq));
+ memset(sseq, 0, sizeof(*sseq));
+ nseq->first_free++;
+ sseq->lower = lower;
+ sseq->upper = upper;
+ sseq->info = info;
+ created_subseq = 1;
+ }
+
+ /* Insert a publication */
+ publ = publ_create(type, lower, upper, scope, node, port, key);
+ if (!publ)
+ return NULL;
+
+ list_add(&publ->zone_list, &info->zone_list);
+ info->zone_list_size++;
+
+ if (in_own_cluster(net, node)) {
+ list_add(&publ->cluster_list, &info->cluster_list);
+ info->cluster_list_size++;
+ }
+
+ if (in_own_node(net, node)) {
+ list_add(&publ->node_list, &info->node_list);
+ info->node_list_size++;
+ }
+
+ /* Any subscriptions waiting for notification? */
+ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+ tipc_subscr_report_overlap(s,
+ publ->lower,
+ publ->upper,
+ TIPC_PUBLISHED,
+ publ->ref,
+ publ->node,
+ created_subseq);
+ }
+ return publ;
+}
+
+/**
+ * tipc_nameseq_remove_publ
+ *
+ * NOTE: There may be cases where TIPC is asked to remove a publication
+ * that is not in the name table. For example, if another node issues a
+ * publication for a name sequence that overlaps an existing name sequence
+ * the publication will not be recorded, which means the publication won't
+ * be found when the name sequence is later withdrawn by that node.
+ * A failed withdraw request simply returns a failure indication and lets the
+ * caller issue any error or warning messages associated with such a problem.
+ */
+static struct publication *tipc_nameseq_remove_publ(struct net *net,
+ struct name_seq *nseq,
+ u32 inst, u32 node,
+ u32 ref, u32 key)
+{
+ struct publication *publ;
+ struct sub_seq *sseq = nameseq_find_subseq(nseq, inst);
+ struct name_info *info;
+ struct sub_seq *free;
+ struct tipc_subscription *s, *st;
+ int removed_subseq = 0;
+
+ if (!sseq)
+ return NULL;
+
+ info = sseq->info;
+
+ /* Locate publication, if it exists */
+ list_for_each_entry(publ, &info->zone_list, zone_list) {
+ if ((publ->key == key) && (publ->ref == ref) &&
+ (!publ->node || (publ->node == node)))
+ goto found;
+ }
+ return NULL;
+
+found:
+ /* Remove publication from zone scope list */
+ list_del(&publ->zone_list);
+ info->zone_list_size--;
+
+ /* Remove publication from cluster scope list, if present */
+ if (in_own_cluster(net, node)) {
+ list_del(&publ->cluster_list);
+ info->cluster_list_size--;
+ }
+
+ /* Remove publication from node scope list, if present */
+ if (in_own_node(net, node)) {
+ list_del(&publ->node_list);
+ info->node_list_size--;
+ }
+
+ /* Contract subseq list if no more publications for that subseq */
+ if (list_empty(&info->zone_list)) {
+ kfree(info);
+ free = &nseq->sseqs[nseq->first_free--];
+ memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq));
+ removed_subseq = 1;
+ }
+
+ /* Notify any waiting subscriptions */
+ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+ tipc_subscr_report_overlap(s,
+ publ->lower,
+ publ->upper,
+ TIPC_WITHDRAWN,
+ publ->ref,
+ publ->node,
+ removed_subseq);
+ }
+
+ return publ;
+}
+
+/**
+ * tipc_nameseq_subscribe - attach a subscription, and issue
+ * the prescribed number of events if there is any sub-
+ * sequence overlapping with the requested sequence
+ */
+static void tipc_nameseq_subscribe(struct name_seq *nseq,
+ struct tipc_subscription *s)
+{
+ struct sub_seq *sseq = nseq->sseqs;
+
+ list_add(&s->nameseq_list, &nseq->subscriptions);
+
+ if (!sseq)
+ return;
+
+ while (sseq != &nseq->sseqs[nseq->first_free]) {
+ if (tipc_subscr_overlap(s, sseq->lower, sseq->upper)) {
+ struct publication *crs;
+ struct name_info *info = sseq->info;
+ int must_report = 1;
+
+ list_for_each_entry(crs, &info->zone_list, zone_list) {
+ tipc_subscr_report_overlap(s,
+ sseq->lower,
+ sseq->upper,
+ TIPC_PUBLISHED,
+ crs->ref,
+ crs->node,
+ must_report);
+ must_report = 0;
+ }
+ }
+ sseq++;
+ }
+}
+
+static struct name_seq *nametbl_find_seq(struct net *net, u32 type)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct hlist_head *seq_head;
+ struct name_seq *ns;
+
+ seq_head = &tn->nametbl->seq_hlist[hash(type)];
+ hlist_for_each_entry_rcu(ns, seq_head, ns_list) {
+ if (ns->type == type)
+ return ns;
+ }
+
+ return NULL;
+};
+
+struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
+ u32 lower, u32 upper, u32 scope,
+ u32 node, u32 port, u32 key)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct publication *publ;
+ struct name_seq *seq = nametbl_find_seq(net, type);
+ int index = hash(type);
+
+ if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) ||
+ (lower > upper)) {
+ pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n",
+ type, lower, upper, scope);
+ return NULL;
+ }
+
+ if (!seq)
+ seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
+ if (!seq)
+ return NULL;
+
+ spin_lock_bh(&seq->lock);
+ publ = tipc_nameseq_insert_publ(net, seq, type, lower, upper,
+ scope, node, port, key);
+ spin_unlock_bh(&seq->lock);
+ return publ;
+}
+
+struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
+ u32 lower, u32 node, u32 ref,
+ u32 key)
+{
+ struct publication *publ;
+ struct name_seq *seq = nametbl_find_seq(net, type);
+
+ if (!seq)
+ return NULL;
+
+ spin_lock_bh(&seq->lock);
+ publ = tipc_nameseq_remove_publ(net, seq, lower, node, ref, key);
+ if (!seq->first_free && list_empty(&seq->subscriptions)) {
+ hlist_del_init_rcu(&seq->ns_list);
+ kfree(seq->sseqs);
+ spin_unlock_bh(&seq->lock);
+ kfree_rcu(seq, rcu);
+ return publ;
+ }
+ spin_unlock_bh(&seq->lock);
+ return publ;
+}
+
+/**
+ * tipc_nametbl_translate - perform name translation
+ *
+ * On entry, 'destnode' is the search domain used during translation.
+ *
+ * On exit:
+ * - if name translation is deferred to another node/cluster/zone,
+ * leaves 'destnode' unchanged (will be non-zero) and returns 0
+ * - if name translation is attempted and succeeds, sets 'destnode'
+ * to publishing node and returns port reference (will be non-zero)
+ * - if name translation is attempted and fails, sets 'destnode' to 0
+ * and returns 0
+ */
+u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
+ u32 *destnode)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct sub_seq *sseq;
+ struct name_info *info;
+ struct publication *publ;
+ struct name_seq *seq;
+ u32 ref = 0;
+ u32 node = 0;
+
+ if (!tipc_in_scope(*destnode, tn->own_addr))
+ return 0;
+
+ rcu_read_lock();
+ seq = nametbl_find_seq(net, type);
+ if (unlikely(!seq))
+ goto not_found;
+ spin_lock_bh(&seq->lock);
+ sseq = nameseq_find_subseq(seq, instance);
+ if (unlikely(!sseq))
+ goto no_match;
+ info = sseq->info;
+
+ /* Closest-First Algorithm */
+ if (likely(!*destnode)) {
+ if (!list_empty(&info->node_list)) {
+ publ = list_first_entry(&info->node_list,
+ struct publication,
+ node_list);
+ list_move_tail(&publ->node_list,
+ &info->node_list);
+ } else if (!list_empty(&info->cluster_list)) {
+ publ = list_first_entry(&info->cluster_list,
+ struct publication,
+ cluster_list);
+ list_move_tail(&publ->cluster_list,
+ &info->cluster_list);
+ } else {
+ publ = list_first_entry(&info->zone_list,
+ struct publication,
+ zone_list);
+ list_move_tail(&publ->zone_list,
+ &info->zone_list);
+ }
+ }
+
+ /* Round-Robin Algorithm */
+ else if (*destnode == tn->own_addr) {
+ if (list_empty(&info->node_list))
+ goto no_match;
+ publ = list_first_entry(&info->node_list, struct publication,
+ node_list);
+ list_move_tail(&publ->node_list, &info->node_list);
+ } else if (in_own_cluster_exact(net, *destnode)) {
+ if (list_empty(&info->cluster_list))
+ goto no_match;
+ publ = list_first_entry(&info->cluster_list, struct publication,
+ cluster_list);
+ list_move_tail(&publ->cluster_list, &info->cluster_list);
+ } else {
+ publ = list_first_entry(&info->zone_list, struct publication,
+ zone_list);
+ list_move_tail(&publ->zone_list, &info->zone_list);
+ }
+
+ ref = publ->ref;
+ node = publ->node;
+no_match:
+ spin_unlock_bh(&seq->lock);
+not_found:
+ rcu_read_unlock();
+ *destnode = node;
+ return ref;
+}
+
+/**
+ * tipc_nametbl_mc_translate - find multicast destinations
+ *
+ * Creates list of all local ports that overlap the given multicast address;
+ * also determines if any off-node ports overlap.
+ *
+ * Note: Publications with a scope narrower than 'limit' are ignored.
+ * (i.e. local node-scope publications mustn't receive messages arriving
+ * from another node, even if the multcast link brought it here)
+ *
+ * Returns non-zero if any off-node ports overlap
+ */
+int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
+ u32 limit, struct tipc_plist *dports)
+{
+ struct name_seq *seq;
+ struct sub_seq *sseq;
+ struct sub_seq *sseq_stop;
+ struct name_info *info;
+ int res = 0;
+
+ rcu_read_lock();
+ seq = nametbl_find_seq(net, type);
+ if (!seq)
+ goto exit;
+
+ spin_lock_bh(&seq->lock);
+ sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+ sseq_stop = seq->sseqs + seq->first_free;
+ for (; sseq != sseq_stop; sseq++) {
+ struct publication *publ;
+
+ if (sseq->lower > upper)
+ break;
+
+ info = sseq->info;
+ list_for_each_entry(publ, &info->node_list, node_list) {
+ if (publ->scope <= limit)
+ tipc_plist_push(dports, publ->ref);
+ }
+
+ if (info->cluster_list_size != info->node_list_size)
+ res = 1;
+ }
+ spin_unlock_bh(&seq->lock);
+exit:
+ rcu_read_unlock();
+ return res;
+}
+
+/*
+ * tipc_nametbl_publish - add name publication to network name tables
+ */
+struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
+ u32 upper, u32 scope, u32 port_ref,
+ u32 key)
+{
+ struct publication *publ;
+ struct sk_buff *buf = NULL;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ spin_lock_bh(&tn->nametbl_lock);
+ if (tn->nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) {
+ pr_warn("Publication failed, local publication limit reached (%u)\n",
+ TIPC_MAX_PUBLICATIONS);
+ spin_unlock_bh(&tn->nametbl_lock);
+ return NULL;
+ }
+
+ publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope,
+ tn->own_addr, port_ref, key);
+ if (likely(publ)) {
+ tn->nametbl->local_publ_count++;
+ buf = tipc_named_publish(net, publ);
+ /* Any pending external events? */
+ tipc_named_process_backlog(net);
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+
+ if (buf)
+ named_cluster_distribute(net, buf);
+ return publ;
+}
+
+/**
+ * tipc_nametbl_withdraw - withdraw name publication from network name tables
+ */
+int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
+ u32 key)
+{
+ struct publication *publ;
+ struct sk_buff *skb = NULL;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ spin_lock_bh(&tn->nametbl_lock);
+ publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr,
+ ref, key);
+ if (likely(publ)) {
+ tn->nametbl->local_publ_count--;
+ skb = tipc_named_withdraw(net, publ);
+ /* Any pending external events? */
+ tipc_named_process_backlog(net);
+ list_del_init(&publ->pport_list);
+ kfree_rcu(publ, rcu);
+ } else {
+ pr_err("Unable to remove local publication\n"
+ "(type=%u, lower=%u, ref=%u, key=%u)\n",
+ type, lower, ref, key);
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+
+ if (skb) {
+ named_cluster_distribute(net, skb);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * tipc_nametbl_subscribe - add a subscription object to the name table
+ */
+void tipc_nametbl_subscribe(struct tipc_subscription *s)
+{
+ struct tipc_net *tn = net_generic(s->net, tipc_net_id);
+ u32 type = s->seq.type;
+ int index = hash(type);
+ struct name_seq *seq;
+
+ spin_lock_bh(&tn->nametbl_lock);
+ seq = nametbl_find_seq(s->net, type);
+ if (!seq)
+ seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
+ if (seq) {
+ spin_lock_bh(&seq->lock);
+ tipc_nameseq_subscribe(seq, s);
+ spin_unlock_bh(&seq->lock);
+ } else {
+ pr_warn("Failed to create subscription for {%u,%u,%u}\n",
+ s->seq.type, s->seq.lower, s->seq.upper);
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+}
+
+/**
+ * tipc_nametbl_unsubscribe - remove a subscription object from name table
+ */
+void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
+{
+ struct tipc_net *tn = net_generic(s->net, tipc_net_id);
+ struct name_seq *seq;
+
+ spin_lock_bh(&tn->nametbl_lock);
+ seq = nametbl_find_seq(s->net, s->seq.type);
+ if (seq != NULL) {
+ spin_lock_bh(&seq->lock);
+ list_del_init(&s->nameseq_list);
+ if (!seq->first_free && list_empty(&seq->subscriptions)) {
+ hlist_del_init_rcu(&seq->ns_list);
+ kfree(seq->sseqs);
+ spin_unlock_bh(&seq->lock);
+ kfree_rcu(seq, rcu);
+ } else {
+ spin_unlock_bh(&seq->lock);
+ }
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+}
+
+int tipc_nametbl_init(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct name_table *tipc_nametbl;
+ int i;
+
+ tipc_nametbl = kzalloc(sizeof(*tipc_nametbl), GFP_ATOMIC);
+ if (!tipc_nametbl)
+ return -ENOMEM;
+
+ for (i = 0; i < TIPC_NAMETBL_SIZE; i++)
+ INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]);
+
+ INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]);
+ INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]);
+ INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_NODE_SCOPE]);
+ tn->nametbl = tipc_nametbl;
+ spin_lock_init(&tn->nametbl_lock);
+ return 0;
+}
+
+/**
+ * tipc_purge_publications - remove all publications for a given type
+ *
+ * tipc_nametbl_lock must be held when calling this function
+ */
+static void tipc_purge_publications(struct net *net, struct name_seq *seq)
+{
+ struct publication *publ, *safe;
+ struct sub_seq *sseq;
+ struct name_info *info;
+
+ spin_lock_bh(&seq->lock);
+ sseq = seq->sseqs;
+ info = sseq->info;
+ list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) {
+ tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node,
+ publ->ref, publ->key);
+ kfree_rcu(publ, rcu);
+ }
+ hlist_del_init_rcu(&seq->ns_list);
+ kfree(seq->sseqs);
+ spin_unlock_bh(&seq->lock);
+
+ kfree_rcu(seq, rcu);
+}
+
+void tipc_nametbl_stop(struct net *net)
+{
+ u32 i;
+ struct name_seq *seq;
+ struct hlist_head *seq_head;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct name_table *tipc_nametbl = tn->nametbl;
+
+ /* Verify name table is empty and purge any lingering
+ * publications, then release the name table
+ */
+ spin_lock_bh(&tn->nametbl_lock);
+ for (i = 0; i < TIPC_NAMETBL_SIZE; i++) {
+ if (hlist_empty(&tipc_nametbl->seq_hlist[i]))
+ continue;
+ seq_head = &tipc_nametbl->seq_hlist[i];
+ hlist_for_each_entry_rcu(seq, seq_head, ns_list) {
+ tipc_purge_publications(net, seq);
+ }
+ }
+ spin_unlock_bh(&tn->nametbl_lock);
+
+ synchronize_net();
+ kfree(tipc_nametbl);
+
+}
+
+static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
+ struct name_seq *seq,
+ struct sub_seq *sseq, u32 *last_publ)
+{
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *publ;
+ struct publication *p;
+
+ if (*last_publ) {
+ list_for_each_entry(p, &sseq->info->zone_list, zone_list)
+ if (p->key == *last_publ)
+ break;
+ if (p->key != *last_publ)
+ return -EPIPE;
+ } else {
+ p = list_first_entry(&sseq->info->zone_list, struct publication,
+ zone_list);
+ }
+
+ list_for_each_entry_from(p, &sseq->info->zone_list, zone_list) {
+ *last_publ = p->key;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
+ &tipc_genl_family, NLM_F_MULTI,
+ TIPC_NL_NAME_TABLE_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE);
+ if (!attrs)
+ goto msg_full;
+
+ publ = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL);
+ if (!publ)
+ goto attr_msg_full;
+
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, seq->type))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sseq->lower))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sseq->upper))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->ref))
+ goto publ_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key))
+ goto publ_msg_full;
+
+ nla_nest_end(msg->skb, publ);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+ }
+ *last_publ = 0;
+
+ return 0;
+
+publ_msg_full:
+ nla_nest_cancel(msg->skb, publ);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+static int __tipc_nl_subseq_list(struct tipc_nl_msg *msg, struct name_seq *seq,
+ u32 *last_lower, u32 *last_publ)
+{
+ struct sub_seq *sseq;
+ struct sub_seq *sseq_start;
+ int err;
+
+ if (*last_lower) {
+ sseq_start = nameseq_find_subseq(seq, *last_lower);
+ if (!sseq_start)
+ return -EPIPE;
+ } else {
+ sseq_start = seq->sseqs;
+ }
+
+ for (sseq = sseq_start; sseq != &seq->sseqs[seq->first_free]; sseq++) {
+ err = __tipc_nl_add_nametable_publ(msg, seq, sseq, last_publ);
+ if (err) {
+ *last_lower = sseq->lower;
+ return err;
+ }
+ }
+ *last_lower = 0;
+
+ return 0;
+}
+
+static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg,
+ u32 *last_type, u32 *last_lower, u32 *last_publ)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct hlist_head *seq_head;
+ struct name_seq *seq = NULL;
+ int err;
+ int i;
+
+ if (*last_type)
+ i = hash(*last_type);
+ else
+ i = 0;
+
+ for (; i < TIPC_NAMETBL_SIZE; i++) {
+ seq_head = &tn->nametbl->seq_hlist[i];
+
+ if (*last_type) {
+ seq = nametbl_find_seq(net, *last_type);
+ if (!seq)
+ return -EPIPE;
+ } else {
+ hlist_for_each_entry_rcu(seq, seq_head, ns_list)
+ break;
+ if (!seq)
+ continue;
+ }
+
+ hlist_for_each_entry_from_rcu(seq, ns_list) {
+ spin_lock_bh(&seq->lock);
+ err = __tipc_nl_subseq_list(msg, seq, last_lower,
+ last_publ);
+
+ if (err) {
+ *last_type = seq->type;
+ spin_unlock_bh(&seq->lock);
+ return err;
+ }
+ spin_unlock_bh(&seq->lock);
+ }
+ *last_type = 0;
+ }
+ return 0;
+}
+
+int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ int done = cb->args[3];
+ u32 last_type = cb->args[0];
+ u32 last_lower = cb->args[1];
+ u32 last_publ = cb->args[2];
+ struct net *net = sock_net(skb->sk);
+ struct tipc_nl_msg msg;
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ err = tipc_nl_seq_list(net, &msg, &last_type, &last_lower, &last_publ);
+ if (!err) {
+ done = 1;
+ } else if (err != -EMSGSIZE) {
+ /* We never set seq or call nl_dump_check_consistent() this
+ * means that setting prev_seq here will cause the consistence
+ * check to fail in the netlink callback handler. Resulting in
+ * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if
+ * we got an error.
+ */
+ cb->prev_seq = 1;
+ }
+ rcu_read_unlock();
+
+ cb->args[0] = last_type;
+ cb->args[1] = last_lower;
+ cb->args[2] = last_publ;
+ cb->args[3] = done;
+
+ return skb->len;
+}
+
+void tipc_plist_push(struct tipc_plist *pl, u32 port)
+{
+ struct tipc_plist *nl;
+
+ if (likely(!pl->port)) {
+ pl->port = port;
+ return;
+ }
+ if (pl->port == port)
+ return;
+ list_for_each_entry(nl, &pl->list, list) {
+ if (nl->port == port)
+ return;
+ }
+ nl = kmalloc(sizeof(*nl), GFP_ATOMIC);
+ if (nl) {
+ nl->port = port;
+ list_add(&nl->list, &pl->list);
+ }
+}
+
+u32 tipc_plist_pop(struct tipc_plist *pl)
+{
+ struct tipc_plist *nl;
+ u32 port = 0;
+
+ if (likely(list_empty(&pl->list))) {
+ port = pl->port;
+ pl->port = 0;
+ return port;
+ }
+ nl = list_first_entry(&pl->list, typeof(*nl), list);
+ port = nl->port;
+ list_del(&nl->list);
+ kfree(nl);
+ return port;
+}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
new file mode 100644
index 000000000..1524a7383
--- /dev/null
+++ b/net/tipc/name_table.h
@@ -0,0 +1,133 @@
+/*
+ * net/tipc/name_table.h: Include file for TIPC name table code
+ *
+ * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_TABLE_H
+#define _TIPC_NAME_TABLE_H
+
+struct tipc_subscription;
+struct tipc_plist;
+
+/*
+ * TIPC name types reserved for internal TIPC use (both current and planned)
+ */
+#define TIPC_ZM_SRV 3 /* zone master service name type */
+#define TIPC_PUBL_SCOPE_NUM (TIPC_NODE_SCOPE + 1)
+#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
+
+/**
+ * struct publication - info about a published (name or) name sequence
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @scope: scope of publication
+ * @node: network address of publishing port's node
+ * @ref: publishing port
+ * @key: publication key
+ * @nodesub_list: subscription to "node down" event (off-node publication only)
+ * @local_list: adjacent entries in list of publications made by this node
+ * @pport_list: adjacent entries in list of publications made by this port
+ * @node_list: adjacent matching name seq publications with >= node scope
+ * @cluster_list: adjacent matching name seq publications with >= cluster scope
+ * @zone_list: adjacent matching name seq publications with >= zone scope
+ * @rcu: RCU callback head used for deferred freeing
+ *
+ * Note that the node list, cluster list, and zone list are circular lists.
+ */
+struct publication {
+ u32 type;
+ u32 lower;
+ u32 upper;
+ u32 scope;
+ u32 node;
+ u32 ref;
+ u32 key;
+ struct list_head nodesub_list;
+ struct list_head local_list;
+ struct list_head pport_list;
+ struct list_head node_list;
+ struct list_head cluster_list;
+ struct list_head zone_list;
+ struct rcu_head rcu;
+};
+
+/**
+ * struct name_table - table containing all existing port name publications
+ * @seq_hlist: name sequence hash lists
+ * @publ_list: pulication lists
+ * @local_publ_count: number of publications issued by this node
+ */
+struct name_table {
+ struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE];
+ struct list_head publ_list[TIPC_PUBL_SCOPE_NUM];
+ u32 local_publ_count;
+};
+
+int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
+int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
+ u32 limit, struct tipc_plist *dports);
+struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
+ u32 upper, u32 scope, u32 port_ref,
+ u32 key);
+int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
+ u32 key);
+struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
+ u32 lower, u32 upper, u32 scope,
+ u32 node, u32 ref, u32 key);
+struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
+ u32 lower, u32 node, u32 ref,
+ u32 key);
+void tipc_nametbl_subscribe(struct tipc_subscription *s);
+void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
+int tipc_nametbl_init(struct net *net);
+void tipc_nametbl_stop(struct net *net);
+
+struct tipc_plist {
+ struct list_head list;
+ u32 port;
+};
+
+static inline void tipc_plist_init(struct tipc_plist *pl)
+{
+ INIT_LIST_HEAD(&pl->list);
+ pl->port = 0;
+}
+
+void tipc_plist_push(struct tipc_plist *pl, u32 port);
+u32 tipc_plist_pop(struct tipc_plist *pl);
+
+#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
new file mode 100644
index 000000000..a54f3cbe2
--- /dev/null
+++ b/net/tipc/net.c
@@ -0,0 +1,254 @@
+/*
+ * net/tipc/net.c: TIPC network routing code
+ *
+ * Copyright (c) 1995-2006, 2014, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "net.h"
+#include "name_distr.h"
+#include "subscr.h"
+#include "socket.h"
+#include "node.h"
+
+static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
+ [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
+};
+
+/*
+ * The TIPC locking policy is designed to ensure a very fine locking
+ * granularity, permitting complete parallel access to individual
+ * port and node/link instances. The code consists of four major
+ * locking domains, each protected with their own disjunct set of locks.
+ *
+ * 1: The bearer level.
+ * RTNL lock is used to serialize the process of configuring bearer
+ * on update side, and RCU lock is applied on read side to make
+ * bearer instance valid on both paths of message transmission and
+ * reception.
+ *
+ * 2: The node and link level.
+ * All node instances are saved into two tipc_node_list and node_htable
+ * lists. The two lists are protected by node_list_lock on write side,
+ * and they are guarded with RCU lock on read side. Especially node
+ * instance is destroyed only when TIPC module is removed, and we can
+ * confirm that there has no any user who is accessing the node at the
+ * moment. Therefore, Except for iterating the two lists within RCU
+ * protection, it's no needed to hold RCU that we access node instance
+ * in other places.
+ *
+ * In addition, all members in node structure including link instances
+ * are protected by node spin lock.
+ *
+ * 3: The transport level of the protocol.
+ * This consists of the structures port, (and its user level
+ * representations, such as user_port and tipc_sock), reference and
+ * tipc_user (port.c, reg.c, socket.c).
+ *
+ * This layer has four different locks:
+ * - The tipc_port spin_lock. This is protecting each port instance
+ * from parallel data access and removal. Since we can not place
+ * this lock in the port itself, it has been placed in the
+ * corresponding reference table entry, which has the same life
+ * cycle as the module. This entry is difficult to access from
+ * outside the TIPC core, however, so a pointer to the lock has
+ * been added in the port instance, -to be used for unlocking
+ * only.
+ * - A read/write lock to protect the reference table itself (teg.c).
+ * (Nobody is using read-only access to this, so it can just as
+ * well be changed to a spin_lock)
+ * - A spin lock to protect the registry of kernel/driver users (reg.c)
+ * - A global spin_lock (tipc_port_lock), which only task is to ensure
+ * consistency where more than one port is involved in an operation,
+ * i.e., whe a port is part of a linked list of ports.
+ * There are two such lists; 'port_list', which is used for management,
+ * and 'wait_list', which is used to queue ports during congestion.
+ *
+ * 4: The name table (name_table.c, name_distr.c, subscription.c)
+ * - There is one big read/write-lock (tipc_nametbl_lock) protecting the
+ * overall name table structure. Nothing must be added/removed to
+ * this structure without holding write access to it.
+ * - There is one local spin_lock per sub_sequence, which can be seen
+ * as a sub-domain to the tipc_nametbl_lock domain. It is used only
+ * for translation operations, and is needed because a translation
+ * steps the root of the 'publication' linked list between each lookup.
+ * This is always used within the scope of a tipc_nametbl_lock(read).
+ * - A local spin_lock protecting the queue of subscriber events.
+*/
+
+int tipc_net_start(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ char addr_string[16];
+ int res;
+
+ tn->own_addr = addr;
+ tipc_named_reinit(net);
+ tipc_sk_reinit(net);
+ res = tipc_bclink_init(net);
+ if (res)
+ return res;
+
+ tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
+ TIPC_ZONE_SCOPE, 0, tn->own_addr);
+
+ pr_info("Started in network mode\n");
+ pr_info("Own node address %s, network identity %u\n",
+ tipc_addr_string_fill(addr_string, tn->own_addr),
+ tn->net_id);
+ return 0;
+}
+
+void tipc_net_stop(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ if (!tn->own_addr)
+ return;
+
+ tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0,
+ tn->own_addr);
+ rtnl_lock();
+ tipc_bearer_stop(net);
+ tipc_bclink_stop(net);
+ tipc_node_stop(net);
+ rtnl_unlock();
+
+ pr_info("Left network mode\n");
+}
+
+static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ void *hdr;
+ struct nlattr *attrs;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_NET_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_NET);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id))
+ goto attr_msg_full;
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int err;
+ int done = cb->args[0];
+ struct tipc_nl_msg msg;
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ err = __tipc_nl_add_net(net, &msg);
+ if (err)
+ goto out;
+
+ done = 1;
+out:
+ cb->args[0] = done;
+
+ return skb->len;
+}
+
+int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
+ int err;
+
+ if (!info->attrs[TIPC_NLA_NET])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX,
+ info->attrs[TIPC_NLA_NET],
+ tipc_nl_net_policy);
+ if (err)
+ return err;
+
+ if (attrs[TIPC_NLA_NET_ID]) {
+ u32 val;
+
+ /* Can't change net id once TIPC has joined a network */
+ if (tn->own_addr)
+ return -EPERM;
+
+ val = nla_get_u32(attrs[TIPC_NLA_NET_ID]);
+ if (val < 1 || val > 9999)
+ return -EINVAL;
+
+ tn->net_id = val;
+ }
+
+ if (attrs[TIPC_NLA_NET_ADDR]) {
+ u32 addr;
+
+ /* Can't change net addr once TIPC has joined a network */
+ if (tn->own_addr)
+ return -EPERM;
+
+ addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
+ if (!tipc_addr_node_valid(addr))
+ return -EINVAL;
+
+ rtnl_lock();
+ tipc_net_start(net, addr);
+ rtnl_unlock();
+ }
+
+ return 0;
+}
diff --git a/net/tipc/net.h b/net/tipc/net.h
new file mode 100644
index 000000000..77a7a1189
--- /dev/null
+++ b/net/tipc/net.h
@@ -0,0 +1,49 @@
+/*
+ * net/tipc/net.h: Include file for TIPC network routing code
+ *
+ * Copyright (c) 1995-2006, 2014, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NET_H
+#define _TIPC_NET_H
+
+#include <net/genetlink.h>
+
+int tipc_net_start(struct net *net, u32 addr);
+
+void tipc_net_stop(struct net *net);
+
+int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info);
+
+#endif
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
new file mode 100644
index 000000000..7f6475efc
--- /dev/null
+++ b/net/tipc/netlink.c
@@ -0,0 +1,178 @@
+/*
+ * net/tipc/netlink.c: TIPC configuration handling
+ *
+ * Copyright (c) 2005-2006, 2014, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "socket.h"
+#include "name_table.h"
+#include "bearer.h"
+#include "link.h"
+#include "node.h"
+#include "net.h"
+#include <net/genetlink.h>
+
+static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
+ [TIPC_NLA_UNSPEC] = { .type = NLA_UNSPEC, },
+ [TIPC_NLA_BEARER] = { .type = NLA_NESTED, },
+ [TIPC_NLA_SOCK] = { .type = NLA_NESTED, },
+ [TIPC_NLA_PUBL] = { .type = NLA_NESTED, },
+ [TIPC_NLA_LINK] = { .type = NLA_NESTED, },
+ [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, },
+ [TIPC_NLA_NODE] = { .type = NLA_NESTED, },
+ [TIPC_NLA_NET] = { .type = NLA_NESTED, },
+ [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }
+};
+
+/* Users of the legacy API (tipc-config) can't handle that we add operations,
+ * so we have a separate genl handling for the new API.
+ */
+struct genl_family tipc_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = TIPC_GENL_V2_NAME,
+ .version = TIPC_GENL_V2_VERSION,
+ .hdrsize = 0,
+ .maxattr = TIPC_NLA_MAX,
+ .netnsok = true,
+};
+
+static const struct genl_ops tipc_genl_v2_ops[] = {
+ {
+ .cmd = TIPC_NL_BEARER_DISABLE,
+ .doit = tipc_nl_bearer_disable,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_BEARER_ENABLE,
+ .doit = tipc_nl_bearer_enable,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_BEARER_GET,
+ .doit = tipc_nl_bearer_get,
+ .dumpit = tipc_nl_bearer_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_BEARER_SET,
+ .doit = tipc_nl_bearer_set,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_SOCK_GET,
+ .dumpit = tipc_nl_sk_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_PUBL_GET,
+ .dumpit = tipc_nl_publ_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_LINK_GET,
+ .doit = tipc_nl_link_get,
+ .dumpit = tipc_nl_link_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_LINK_SET,
+ .doit = tipc_nl_link_set,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_LINK_RESET_STATS,
+ .doit = tipc_nl_link_reset_stats,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_MEDIA_GET,
+ .doit = tipc_nl_media_get,
+ .dumpit = tipc_nl_media_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_MEDIA_SET,
+ .doit = tipc_nl_media_set,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_NODE_GET,
+ .dumpit = tipc_nl_node_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_NET_GET,
+ .dumpit = tipc_nl_net_dump,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_NET_SET,
+ .doit = tipc_nl_net_set,
+ .policy = tipc_nl_policy,
+ },
+ {
+ .cmd = TIPC_NL_NAME_TABLE_GET,
+ .dumpit = tipc_nl_name_table_dump,
+ .policy = tipc_nl_policy,
+ }
+};
+
+int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
+{
+ u32 maxattr = tipc_genl_family.maxattr;
+
+ *attr = tipc_genl_family.attrbuf;
+ if (!*attr)
+ return -EOPNOTSUPP;
+
+ return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy);
+}
+
+int tipc_netlink_start(void)
+{
+ int res;
+
+ res = genl_register_family_with_ops(&tipc_genl_family,
+ tipc_genl_v2_ops);
+ if (res) {
+ pr_err("Failed to register netlink interface\n");
+ return res;
+ }
+ return 0;
+}
+
+void tipc_netlink_stop(void)
+{
+ genl_unregister_family(&tipc_genl_family);
+}
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
new file mode 100644
index 000000000..08a1db67b
--- /dev/null
+++ b/net/tipc/netlink.h
@@ -0,0 +1,53 @@
+/*
+ * net/tipc/netlink.h: Include file for TIPC netlink code
+ *
+ * Copyright (c) 2014, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NETLINK_H
+#define _TIPC_NETLINK_H
+
+extern struct genl_family tipc_genl_family;
+int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf);
+
+struct tipc_nl_msg {
+ struct sk_buff *skb;
+ u32 portid;
+ u32 seq;
+};
+
+int tipc_netlink_start(void);
+int tipc_netlink_compat_start(void);
+void tipc_netlink_stop(void);
+void tipc_netlink_compat_stop(void);
+
+#endif
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
new file mode 100644
index 000000000..ce9121e8e
--- /dev/null
+++ b/net/tipc/netlink_compat.c
@@ -0,0 +1,1084 @@
+/*
+ * Copyright (c) 2014, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "bearer.h"
+#include "link.h"
+#include "name_table.h"
+#include "socket.h"
+#include "node.h"
+#include "net.h"
+#include <net/genetlink.h>
+#include <linux/tipc_config.h>
+
+/* The legacy API had an artificial message length limit called
+ * ULTRA_STRING_MAX_LEN.
+ */
+#define ULTRA_STRING_MAX_LEN 32768
+
+#define TIPC_SKB_MAX TLV_SPACE(ULTRA_STRING_MAX_LEN)
+
+#define REPLY_TRUNCATED "<truncated>\n"
+
+struct tipc_nl_compat_msg {
+ u16 cmd;
+ int rep_type;
+ int rep_size;
+ int req_type;
+ struct sk_buff *rep;
+ struct tlv_desc *req;
+ struct sock *dst_sk;
+};
+
+struct tipc_nl_compat_cmd_dump {
+ int (*header)(struct tipc_nl_compat_msg *);
+ int (*dumpit)(struct sk_buff *, struct netlink_callback *);
+ int (*format)(struct tipc_nl_compat_msg *msg, struct nlattr **attrs);
+};
+
+struct tipc_nl_compat_cmd_doit {
+ int (*doit)(struct sk_buff *skb, struct genl_info *info);
+ int (*transcode)(struct sk_buff *skb, struct tipc_nl_compat_msg *msg);
+};
+
+static int tipc_skb_tailroom(struct sk_buff *skb)
+{
+ int tailroom;
+ int limit;
+
+ tailroom = skb_tailroom(skb);
+ limit = TIPC_SKB_MAX - skb->len;
+
+ if (tailroom < limit)
+ return tailroom;
+
+ return limit;
+}
+
+static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len)
+{
+ struct tlv_desc *tlv = (struct tlv_desc *)skb_tail_pointer(skb);
+
+ if (tipc_skb_tailroom(skb) < TLV_SPACE(len))
+ return -EMSGSIZE;
+
+ skb_put(skb, TLV_SPACE(len));
+ tlv->tlv_type = htons(type);
+ tlv->tlv_len = htons(TLV_LENGTH(len));
+ if (len && data)
+ memcpy(TLV_DATA(tlv), data, len);
+
+ return 0;
+}
+
+static void tipc_tlv_init(struct sk_buff *skb, u16 type)
+{
+ struct tlv_desc *tlv = (struct tlv_desc *)skb->data;
+
+ TLV_SET_LEN(tlv, 0);
+ TLV_SET_TYPE(tlv, type);
+ skb_put(skb, sizeof(struct tlv_desc));
+}
+
+static int tipc_tlv_sprintf(struct sk_buff *skb, const char *fmt, ...)
+{
+ int n;
+ u16 len;
+ u32 rem;
+ char *buf;
+ struct tlv_desc *tlv;
+ va_list args;
+
+ rem = tipc_skb_tailroom(skb);
+
+ tlv = (struct tlv_desc *)skb->data;
+ len = TLV_GET_LEN(tlv);
+ buf = TLV_DATA(tlv) + len;
+
+ va_start(args, fmt);
+ n = vscnprintf(buf, rem, fmt, args);
+ va_end(args);
+
+ TLV_SET_LEN(tlv, n + len);
+ skb_put(skb, n);
+
+ return n;
+}
+
+static struct sk_buff *tipc_tlv_alloc(int size)
+{
+ int hdr_len;
+ struct sk_buff *buf;
+
+ size = TLV_SPACE(size);
+ hdr_len = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN);
+
+ buf = alloc_skb(hdr_len + size, GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ skb_reserve(buf, hdr_len);
+
+ return buf;
+}
+
+static struct sk_buff *tipc_get_err_tlv(char *str)
+{
+ int str_len = strlen(str) + 1;
+ struct sk_buff *buf;
+
+ buf = tipc_tlv_alloc(TLV_SPACE(str_len));
+ if (buf)
+ tipc_add_tlv(buf, TIPC_TLV_ERROR_STRING, str, str_len);
+
+ return buf;
+}
+
+static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
+ struct tipc_nl_compat_msg *msg,
+ struct sk_buff *arg)
+{
+ int len = 0;
+ int err;
+ struct sk_buff *buf;
+ struct nlmsghdr *nlmsg;
+ struct netlink_callback cb;
+
+ memset(&cb, 0, sizeof(cb));
+ cb.nlh = (struct nlmsghdr *)arg->data;
+ cb.skb = arg;
+
+ buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ buf->sk = msg->dst_sk;
+
+ do {
+ int rem;
+
+ len = (*cmd->dumpit)(buf, &cb);
+
+ nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) {
+ struct nlattr **attrs;
+
+ err = tipc_nlmsg_parse(nlmsg, &attrs);
+ if (err)
+ goto err_out;
+
+ err = (*cmd->format)(msg, attrs);
+ if (err)
+ goto err_out;
+
+ if (tipc_skb_tailroom(msg->rep) <= 1) {
+ err = -EMSGSIZE;
+ goto err_out;
+ }
+ }
+
+ skb_reset_tail_pointer(buf);
+ buf->len = 0;
+
+ } while (len);
+
+ err = 0;
+
+err_out:
+ kfree_skb(buf);
+
+ if (err == -EMSGSIZE) {
+ /* The legacy API only considered messages filling
+ * "ULTRA_STRING_MAX_LEN" to be truncated.
+ */
+ if ((TIPC_SKB_MAX - msg->rep->len) <= 1) {
+ char *tail = skb_tail_pointer(msg->rep);
+
+ if (*tail != '\0')
+ sprintf(tail - sizeof(REPLY_TRUNCATED) - 1,
+ REPLY_TRUNCATED);
+ }
+
+ return 0;
+ }
+
+ return err;
+}
+
+static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
+ struct tipc_nl_compat_msg *msg)
+{
+ int err;
+ struct sk_buff *arg;
+
+ if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
+ return -EINVAL;
+
+ msg->rep = tipc_tlv_alloc(msg->rep_size);
+ if (!msg->rep)
+ return -ENOMEM;
+
+ if (msg->rep_type)
+ tipc_tlv_init(msg->rep, msg->rep_type);
+
+ if (cmd->header)
+ (*cmd->header)(msg);
+
+ arg = nlmsg_new(0, GFP_KERNEL);
+ if (!arg) {
+ kfree_skb(msg->rep);
+ return -ENOMEM;
+ }
+
+ err = __tipc_nl_compat_dumpit(cmd, msg, arg);
+ if (err)
+ kfree_skb(msg->rep);
+
+ kfree_skb(arg);
+
+ return err;
+}
+
+static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd,
+ struct tipc_nl_compat_msg *msg)
+{
+ int err;
+ struct sk_buff *doit_buf;
+ struct sk_buff *trans_buf;
+ struct nlattr **attrbuf;
+ struct genl_info info;
+
+ trans_buf = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!trans_buf)
+ return -ENOMEM;
+
+ err = (*cmd->transcode)(trans_buf, msg);
+ if (err)
+ goto trans_out;
+
+ attrbuf = kmalloc((tipc_genl_family.maxattr + 1) *
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (!attrbuf) {
+ err = -ENOMEM;
+ goto trans_out;
+ }
+
+ err = nla_parse(attrbuf, tipc_genl_family.maxattr,
+ (const struct nlattr *)trans_buf->data,
+ trans_buf->len, NULL);
+ if (err)
+ goto parse_out;
+
+ doit_buf = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!doit_buf) {
+ err = -ENOMEM;
+ goto parse_out;
+ }
+
+ doit_buf->sk = msg->dst_sk;
+
+ memset(&info, 0, sizeof(info));
+ info.attrs = attrbuf;
+
+ err = (*cmd->doit)(doit_buf, &info);
+
+ kfree_skb(doit_buf);
+parse_out:
+ kfree(attrbuf);
+trans_out:
+ kfree_skb(trans_buf);
+
+ return err;
+}
+
+static int tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd,
+ struct tipc_nl_compat_msg *msg)
+{
+ int err;
+
+ if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
+ return -EINVAL;
+
+ err = __tipc_nl_compat_doit(cmd, msg);
+ if (err)
+ return err;
+
+ /* The legacy API considered an empty message a success message */
+ msg->rep = tipc_tlv_alloc(0);
+ if (!msg->rep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1];
+
+ nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER],
+ NULL);
+
+ return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME,
+ nla_data(bearer[TIPC_NLA_BEARER_NAME]),
+ nla_len(bearer[TIPC_NLA_BEARER_NAME]));
+}
+
+static int tipc_nl_compat_bearer_enable(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ struct nlattr *prop;
+ struct nlattr *bearer;
+ struct tipc_bearer_config *b;
+
+ b = (struct tipc_bearer_config *)TLV_DATA(msg->req);
+
+ bearer = nla_nest_start(skb, TIPC_NLA_BEARER);
+ if (!bearer)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TIPC_NLA_BEARER_DOMAIN, ntohl(b->disc_domain)))
+ return -EMSGSIZE;
+
+ if (ntohl(b->priority) <= TIPC_MAX_LINK_PRI) {
+ prop = nla_nest_start(skb, TIPC_NLA_BEARER_PROP);
+ if (!prop)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(b->priority)))
+ return -EMSGSIZE;
+ nla_nest_end(skb, prop);
+ }
+ nla_nest_end(skb, bearer);
+
+ return 0;
+}
+
+static int tipc_nl_compat_bearer_disable(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ char *name;
+ struct nlattr *bearer;
+
+ name = (char *)TLV_DATA(msg->req);
+
+ bearer = nla_nest_start(skb, TIPC_NLA_BEARER);
+ if (!bearer)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, bearer);
+
+ return 0;
+}
+
+static inline u32 perc(u32 count, u32 total)
+{
+ return (count * 100 + (total / 2)) / total;
+}
+
+static void __fill_bc_link_stat(struct tipc_nl_compat_msg *msg,
+ struct nlattr *prop[], struct nlattr *stats[])
+{
+ tipc_tlv_sprintf(msg->rep, " Window:%u packets\n",
+ nla_get_u32(prop[TIPC_NLA_PROP_WIN]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " RX packets:%u fragments:%u/%u bundles:%u/%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_INFO]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLED]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " TX packets:%u fragments:%u/%u bundles:%u/%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_INFO]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLED]));
+
+ tipc_tlv_sprintf(msg->rep, " RX naks:%u defs:%u dups:%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_NACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_DEFERRED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_DUPLICATES]));
+
+ tipc_tlv_sprintf(msg->rep, " TX naks:%u acks:%u dups:%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_NACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_ACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RETRANSMITTED]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " Congestion link:%u Send queue max:%u avg:%u",
+ nla_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE]),
+ nla_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE]));
+}
+
+static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ char *name;
+ struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
+ struct nlattr *prop[TIPC_NLA_PROP_MAX + 1];
+ struct nlattr *stats[TIPC_NLA_STATS_MAX + 1];
+
+ nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+
+ nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP],
+ NULL);
+
+ nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS],
+ NULL);
+
+ name = (char *)TLV_DATA(msg->req);
+ if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
+ return 0;
+
+ tipc_tlv_sprintf(msg->rep, "\nLink <%s>\n",
+ nla_data(link[TIPC_NLA_LINK_NAME]));
+
+ if (link[TIPC_NLA_LINK_BROADCAST]) {
+ __fill_bc_link_stat(msg, prop, stats);
+ return 0;
+ }
+
+ if (link[TIPC_NLA_LINK_ACTIVE])
+ tipc_tlv_sprintf(msg->rep, " ACTIVE");
+ else if (link[TIPC_NLA_LINK_UP])
+ tipc_tlv_sprintf(msg->rep, " STANDBY");
+ else
+ tipc_tlv_sprintf(msg->rep, " DEFUNCT");
+
+ tipc_tlv_sprintf(msg->rep, " MTU:%u Priority:%u",
+ nla_get_u32(link[TIPC_NLA_LINK_MTU]),
+ nla_get_u32(prop[TIPC_NLA_PROP_PRIO]));
+
+ tipc_tlv_sprintf(msg->rep, " Tolerance:%u ms Window:%u packets\n",
+ nla_get_u32(prop[TIPC_NLA_PROP_TOL]),
+ nla_get_u32(prop[TIPC_NLA_PROP_WIN]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " RX packets:%u fragments:%u/%u bundles:%u/%u\n",
+ nla_get_u32(link[TIPC_NLA_LINK_RX]) -
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_INFO]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_FRAGMENTED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_BUNDLED]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " TX packets:%u fragments:%u/%u bundles:%u/%u\n",
+ nla_get_u32(link[TIPC_NLA_LINK_TX]) -
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_INFO]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_FRAGMENTED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_BUNDLED]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " TX profile sample:%u packets average:%u octets\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_CNT]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_TOT]) /
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " 0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% ",
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P0]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])),
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P1]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])),
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P2]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])),
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P3]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])));
+
+ tipc_tlv_sprintf(msg->rep, "-16384:%u%% -32768:%u%% -66000:%u%%\n",
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P4]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])),
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P5]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])),
+ perc(nla_get_u32(stats[TIPC_NLA_STATS_MSG_LEN_P6]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MSG_PROF_TOT])));
+
+ tipc_tlv_sprintf(msg->rep,
+ " RX states:%u probes:%u naks:%u defs:%u dups:%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_STATES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_PROBES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_NACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RX_DEFERRED]),
+ nla_get_u32(stats[TIPC_NLA_STATS_DUPLICATES]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " TX states:%u probes:%u naks:%u acks:%u dups:%u\n",
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_STATES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_PROBES]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_NACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_TX_ACKS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_RETRANSMITTED]));
+
+ tipc_tlv_sprintf(msg->rep,
+ " Congestion link:%u Send queue max:%u avg:%u",
+ nla_get_u32(stats[TIPC_NLA_STATS_LINK_CONGS]),
+ nla_get_u32(stats[TIPC_NLA_STATS_MAX_QUEUE]),
+ nla_get_u32(stats[TIPC_NLA_STATS_AVG_QUEUE]));
+
+ return 0;
+}
+
+static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
+ struct tipc_link_info link_info;
+
+ nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL);
+
+ link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
+ link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
+ strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]));
+
+ return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,
+ &link_info, sizeof(link_info));
+}
+
+static int tipc_nl_compat_link_set(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ struct nlattr *link;
+ struct nlattr *prop;
+ struct tipc_link_config *lc;
+
+ lc = (struct tipc_link_config *)TLV_DATA(msg->req);
+
+ link = nla_nest_start(skb, TIPC_NLA_LINK);
+ if (!link)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_LINK_NAME, lc->name))
+ return -EMSGSIZE;
+
+ prop = nla_nest_start(skb, TIPC_NLA_LINK_PROP);
+ if (!prop)
+ return -EMSGSIZE;
+
+ if (msg->cmd == TIPC_CMD_SET_LINK_PRI) {
+ if (nla_put_u32(skb, TIPC_NLA_PROP_PRIO, ntohl(lc->value)))
+ return -EMSGSIZE;
+ } else if (msg->cmd == TIPC_CMD_SET_LINK_TOL) {
+ if (nla_put_u32(skb, TIPC_NLA_PROP_TOL, ntohl(lc->value)))
+ return -EMSGSIZE;
+ } else if (msg->cmd == TIPC_CMD_SET_LINK_WINDOW) {
+ if (nla_put_u32(skb, TIPC_NLA_PROP_WIN, ntohl(lc->value)))
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, prop);
+ nla_nest_end(skb, link);
+
+ return 0;
+}
+
+static int tipc_nl_compat_link_reset_stats(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ char *name;
+ struct nlattr *link;
+
+ name = (char *)TLV_DATA(msg->req);
+
+ link = nla_nest_start(skb, TIPC_NLA_LINK);
+ if (!link)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, link);
+
+ return 0;
+}
+
+static int tipc_nl_compat_name_table_dump_header(struct tipc_nl_compat_msg *msg)
+{
+ int i;
+ u32 depth;
+ struct tipc_name_table_query *ntq;
+ static const char * const header[] = {
+ "Type ",
+ "Lower Upper ",
+ "Port Identity ",
+ "Publication Scope"
+ };
+
+ ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
+
+ depth = ntohl(ntq->depth);
+
+ if (depth > 4)
+ depth = 4;
+ for (i = 0; i < depth; i++)
+ tipc_tlv_sprintf(msg->rep, header[i]);
+ tipc_tlv_sprintf(msg->rep, "\n");
+
+ return 0;
+}
+
+static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ char port_str[27];
+ struct tipc_name_table_query *ntq;
+ struct nlattr *nt[TIPC_NLA_NAME_TABLE_MAX + 1];
+ struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+ u32 node, depth, type, lowbound, upbound;
+ static const char * const scope_str[] = {"", " zone", " cluster",
+ " node"};
+
+ nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX,
+ attrs[TIPC_NLA_NAME_TABLE], NULL);
+
+ nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL],
+ NULL);
+
+ ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req);
+
+ depth = ntohl(ntq->depth);
+ type = ntohl(ntq->type);
+ lowbound = ntohl(ntq->lowbound);
+ upbound = ntohl(ntq->upbound);
+
+ if (!(depth & TIPC_NTQ_ALLTYPES) &&
+ (type != nla_get_u32(publ[TIPC_NLA_PUBL_TYPE])))
+ return 0;
+ if (lowbound && (lowbound > nla_get_u32(publ[TIPC_NLA_PUBL_UPPER])))
+ return 0;
+ if (upbound && (upbound < nla_get_u32(publ[TIPC_NLA_PUBL_LOWER])))
+ return 0;
+
+ tipc_tlv_sprintf(msg->rep, "%-10u ",
+ nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]));
+
+ if (depth == 1)
+ goto out;
+
+ tipc_tlv_sprintf(msg->rep, "%-10u %-10u ",
+ nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]),
+ nla_get_u32(publ[TIPC_NLA_PUBL_UPPER]));
+
+ if (depth == 2)
+ goto out;
+
+ node = nla_get_u32(publ[TIPC_NLA_PUBL_NODE]);
+ sprintf(port_str, "<%u.%u.%u:%u>", tipc_zone(node), tipc_cluster(node),
+ tipc_node(node), nla_get_u32(publ[TIPC_NLA_PUBL_REF]));
+ tipc_tlv_sprintf(msg->rep, "%-26s ", port_str);
+
+ if (depth == 3)
+ goto out;
+
+ tipc_tlv_sprintf(msg->rep, "%-10u %s",
+ nla_get_u32(publ[TIPC_NLA_PUBL_REF]),
+ scope_str[nla_get_u32(publ[TIPC_NLA_PUBL_SCOPE])]);
+out:
+ tipc_tlv_sprintf(msg->rep, "\n");
+
+ return 0;
+}
+
+static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ u32 type, lower, upper;
+ struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1];
+
+ nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL);
+
+ type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]);
+ lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]);
+ upper = nla_get_u32(publ[TIPC_NLA_PUBL_UPPER]);
+
+ if (lower == upper)
+ tipc_tlv_sprintf(msg->rep, " {%u,%u}", type, lower);
+ else
+ tipc_tlv_sprintf(msg->rep, " {%u,%u,%u}", type, lower, upper);
+
+ return 0;
+}
+
+static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock)
+{
+ int err;
+ void *hdr;
+ struct nlattr *nest;
+ struct sk_buff *args;
+ struct tipc_nl_compat_cmd_dump dump;
+
+ args = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!args)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI,
+ TIPC_NL_PUBL_GET);
+
+ nest = nla_nest_start(args, TIPC_NLA_SOCK);
+ if (!nest) {
+ kfree_skb(args);
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_u32(args, TIPC_NLA_SOCK_REF, sock)) {
+ kfree_skb(args);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(args, nest);
+ genlmsg_end(args, hdr);
+
+ dump.dumpit = tipc_nl_publ_dump;
+ dump.format = __tipc_nl_compat_publ_dump;
+
+ err = __tipc_nl_compat_dumpit(&dump, msg, args);
+
+ kfree_skb(args);
+
+ return err;
+}
+
+static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ int err;
+ u32 sock_ref;
+ struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
+
+ nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL);
+
+ sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
+ tipc_tlv_sprintf(msg->rep, "%u:", sock_ref);
+
+ if (sock[TIPC_NLA_SOCK_CON]) {
+ u32 node;
+ struct nlattr *con[TIPC_NLA_CON_MAX + 1];
+
+ nla_parse_nested(con, TIPC_NLA_CON_MAX, sock[TIPC_NLA_SOCK_CON],
+ NULL);
+
+ node = nla_get_u32(con[TIPC_NLA_CON_NODE]);
+ tipc_tlv_sprintf(msg->rep, " connected to <%u.%u.%u:%u>",
+ tipc_zone(node),
+ tipc_cluster(node),
+ tipc_node(node),
+ nla_get_u32(con[TIPC_NLA_CON_SOCK]));
+
+ if (con[TIPC_NLA_CON_FLAG])
+ tipc_tlv_sprintf(msg->rep, " via {%u,%u}\n",
+ nla_get_u32(con[TIPC_NLA_CON_TYPE]),
+ nla_get_u32(con[TIPC_NLA_CON_INST]));
+ else
+ tipc_tlv_sprintf(msg->rep, "\n");
+ } else if (sock[TIPC_NLA_SOCK_HAS_PUBL]) {
+ tipc_tlv_sprintf(msg->rep, " bound to");
+
+ err = tipc_nl_compat_publ_dump(msg, sock_ref);
+ if (err)
+ return err;
+ }
+ tipc_tlv_sprintf(msg->rep, "\n");
+
+ return 0;
+}
+
+static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1];
+
+ nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA],
+ NULL);
+
+ return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME,
+ nla_data(media[TIPC_NLA_MEDIA_NAME]),
+ nla_len(media[TIPC_NLA_MEDIA_NAME]));
+}
+
+static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ struct tipc_node_info node_info;
+ struct nlattr *node[TIPC_NLA_NODE_MAX + 1];
+
+ nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL);
+
+ node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR]));
+ node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP]));
+
+ return tipc_add_tlv(msg->rep, TIPC_TLV_NODE_INFO, &node_info,
+ sizeof(node_info));
+}
+
+static int tipc_nl_compat_net_set(struct sk_buff *skb,
+ struct tipc_nl_compat_msg *msg)
+{
+ u32 val;
+ struct nlattr *net;
+
+ val = ntohl(*(__be32 *)TLV_DATA(msg->req));
+
+ net = nla_nest_start(skb, TIPC_NLA_NET);
+ if (!net)
+ return -EMSGSIZE;
+
+ if (msg->cmd == TIPC_CMD_SET_NODE_ADDR) {
+ if (nla_put_u32(skb, TIPC_NLA_NET_ADDR, val))
+ return -EMSGSIZE;
+ } else if (msg->cmd == TIPC_CMD_SET_NETID) {
+ if (nla_put_u32(skb, TIPC_NLA_NET_ID, val))
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, net);
+
+ return 0;
+}
+
+static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg,
+ struct nlattr **attrs)
+{
+ __be32 id;
+ struct nlattr *net[TIPC_NLA_NET_MAX + 1];
+
+ nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL);
+ id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID]));
+
+ return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id));
+}
+
+static int tipc_cmd_show_stats_compat(struct tipc_nl_compat_msg *msg)
+{
+ msg->rep = tipc_tlv_alloc(ULTRA_STRING_MAX_LEN);
+ if (!msg->rep)
+ return -ENOMEM;
+
+ tipc_tlv_init(msg->rep, TIPC_TLV_ULTRA_STRING);
+ tipc_tlv_sprintf(msg->rep, "TIPC version " TIPC_MOD_VER "\n");
+
+ return 0;
+}
+
+static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg)
+{
+ struct tipc_nl_compat_cmd_dump dump;
+ struct tipc_nl_compat_cmd_doit doit;
+
+ memset(&dump, 0, sizeof(dump));
+ memset(&doit, 0, sizeof(doit));
+
+ switch (msg->cmd) {
+ case TIPC_CMD_NOOP:
+ msg->rep = tipc_tlv_alloc(0);
+ if (!msg->rep)
+ return -ENOMEM;
+ return 0;
+ case TIPC_CMD_GET_BEARER_NAMES:
+ msg->rep_size = MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME);
+ dump.dumpit = tipc_nl_bearer_dump;
+ dump.format = tipc_nl_compat_bearer_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_ENABLE_BEARER:
+ msg->req_type = TIPC_TLV_BEARER_CONFIG;
+ doit.doit = tipc_nl_bearer_enable;
+ doit.transcode = tipc_nl_compat_bearer_enable;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_DISABLE_BEARER:
+ msg->req_type = TIPC_TLV_BEARER_NAME;
+ doit.doit = tipc_nl_bearer_disable;
+ doit.transcode = tipc_nl_compat_bearer_disable;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_SHOW_LINK_STATS:
+ msg->req_type = TIPC_TLV_LINK_NAME;
+ msg->rep_size = ULTRA_STRING_MAX_LEN;
+ msg->rep_type = TIPC_TLV_ULTRA_STRING;
+ dump.dumpit = tipc_nl_link_dump;
+ dump.format = tipc_nl_compat_link_stat_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_GET_LINKS:
+ msg->req_type = TIPC_TLV_NET_ADDR;
+ msg->rep_size = ULTRA_STRING_MAX_LEN;
+ dump.dumpit = tipc_nl_link_dump;
+ dump.format = tipc_nl_compat_link_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_SET_LINK_TOL:
+ case TIPC_CMD_SET_LINK_PRI:
+ case TIPC_CMD_SET_LINK_WINDOW:
+ msg->req_type = TIPC_TLV_LINK_CONFIG;
+ doit.doit = tipc_nl_link_set;
+ doit.transcode = tipc_nl_compat_link_set;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_RESET_LINK_STATS:
+ msg->req_type = TIPC_TLV_LINK_NAME;
+ doit.doit = tipc_nl_link_reset_stats;
+ doit.transcode = tipc_nl_compat_link_reset_stats;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_SHOW_NAME_TABLE:
+ msg->req_type = TIPC_TLV_NAME_TBL_QUERY;
+ msg->rep_size = ULTRA_STRING_MAX_LEN;
+ msg->rep_type = TIPC_TLV_ULTRA_STRING;
+ dump.header = tipc_nl_compat_name_table_dump_header;
+ dump.dumpit = tipc_nl_name_table_dump;
+ dump.format = tipc_nl_compat_name_table_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_SHOW_PORTS:
+ msg->rep_size = ULTRA_STRING_MAX_LEN;
+ msg->rep_type = TIPC_TLV_ULTRA_STRING;
+ dump.dumpit = tipc_nl_sk_dump;
+ dump.format = tipc_nl_compat_sk_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_GET_MEDIA_NAMES:
+ msg->rep_size = MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME);
+ dump.dumpit = tipc_nl_media_dump;
+ dump.format = tipc_nl_compat_media_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_GET_NODES:
+ msg->rep_size = ULTRA_STRING_MAX_LEN;
+ dump.dumpit = tipc_nl_node_dump;
+ dump.format = tipc_nl_compat_node_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_SET_NODE_ADDR:
+ msg->req_type = TIPC_TLV_NET_ADDR;
+ doit.doit = tipc_nl_net_set;
+ doit.transcode = tipc_nl_compat_net_set;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_SET_NETID:
+ msg->req_type = TIPC_TLV_UNSIGNED;
+ doit.doit = tipc_nl_net_set;
+ doit.transcode = tipc_nl_compat_net_set;
+ return tipc_nl_compat_doit(&doit, msg);
+ case TIPC_CMD_GET_NETID:
+ msg->rep_size = sizeof(u32);
+ dump.dumpit = tipc_nl_net_dump;
+ dump.format = tipc_nl_compat_net_dump;
+ return tipc_nl_compat_dumpit(&dump, msg);
+ case TIPC_CMD_SHOW_STATS:
+ return tipc_cmd_show_stats_compat(msg);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ int len;
+ struct tipc_nl_compat_msg msg;
+ struct nlmsghdr *req_nlh;
+ struct nlmsghdr *rep_nlh;
+ struct tipc_genlmsghdr *req_userhdr = info->userhdr;
+ struct net *net = genl_info_net(info);
+
+ memset(&msg, 0, sizeof(msg));
+
+ req_nlh = (struct nlmsghdr *)skb->data;
+ msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
+ msg.cmd = req_userhdr->cmd;
+ msg.dst_sk = info->dst_sk;
+
+ if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
+ msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN);
+ err = -EACCES;
+ goto send;
+ }
+
+ len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
+ if (TLV_GET_LEN(msg.req) && !TLV_OK(msg.req, len)) {
+ msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
+ err = -EOPNOTSUPP;
+ goto send;
+ }
+
+ err = tipc_nl_compat_handle(&msg);
+ if (err == -EOPNOTSUPP)
+ msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
+ else if (err == -EINVAL)
+ msg.rep = tipc_get_err_tlv(TIPC_CFG_TLV_ERROR);
+send:
+ if (!msg.rep)
+ return err;
+
+ len = nlmsg_total_size(GENL_HDRLEN + TIPC_GENL_HDRLEN);
+ skb_push(msg.rep, len);
+ rep_nlh = nlmsg_hdr(msg.rep);
+ memcpy(rep_nlh, info->nlhdr, len);
+ rep_nlh->nlmsg_len = msg.rep->len;
+ genlmsg_unicast(net, msg.rep, NETLINK_CB(skb).portid);
+
+ return err;
+}
+
+static struct genl_family tipc_genl_compat_family = {
+ .id = GENL_ID_GENERATE,
+ .name = TIPC_GENL_NAME,
+ .version = TIPC_GENL_VERSION,
+ .hdrsize = TIPC_GENL_HDRLEN,
+ .maxattr = 0,
+ .netnsok = true,
+};
+
+static struct genl_ops tipc_genl_compat_ops[] = {
+ {
+ .cmd = TIPC_GENL_CMD,
+ .doit = tipc_nl_compat_recv,
+ },
+};
+
+int tipc_netlink_compat_start(void)
+{
+ int res;
+
+ res = genl_register_family_with_ops(&tipc_genl_compat_family,
+ tipc_genl_compat_ops);
+ if (res) {
+ pr_err("Failed to register legacy compat interface\n");
+ return res;
+ }
+
+ return 0;
+}
+
+void tipc_netlink_compat_stop(void)
+{
+ genl_unregister_family(&tipc_genl_compat_family);
+}
diff --git a/net/tipc/node.c b/net/tipc/node.c
new file mode 100644
index 000000000..22c059ad2
--- /dev/null
+++ b/net/tipc/node.c
@@ -0,0 +1,621 @@
+/*
+ * net/tipc/node.c: TIPC node management routines
+ *
+ * Copyright (c) 2000-2006, 2012-2014, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2014, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "node.h"
+#include "name_distr.h"
+#include "socket.h"
+
+static void node_lost_contact(struct tipc_node *n_ptr);
+static void node_established_contact(struct tipc_node *n_ptr);
+static void tipc_node_delete(struct tipc_node *node);
+
+struct tipc_sock_conn {
+ u32 port;
+ u32 peer_port;
+ u32 peer_node;
+ struct list_head list;
+};
+
+static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
+ [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
+};
+
+/*
+ * A trivial power-of-two bitmask technique is used for speed, since this
+ * operation is done for every incoming TIPC packet. The number of hash table
+ * entries has been chosen so that no hash chain exceeds 8 nodes and will
+ * usually be much smaller (typically only a single node).
+ */
+static unsigned int tipc_hashfn(u32 addr)
+{
+ return addr & (NODE_HTABLE_SIZE - 1);
+}
+
+static void tipc_node_kref_release(struct kref *kref)
+{
+ struct tipc_node *node = container_of(kref, struct tipc_node, kref);
+
+ tipc_node_delete(node);
+}
+
+void tipc_node_put(struct tipc_node *node)
+{
+ kref_put(&node->kref, tipc_node_kref_release);
+}
+
+static void tipc_node_get(struct tipc_node *node)
+{
+ kref_get(&node->kref);
+}
+
+/*
+ * tipc_node_find - locate specified node object, if it exists
+ */
+struct tipc_node *tipc_node_find(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *node;
+
+ if (unlikely(!in_own_cluster_exact(net, addr)))
+ return NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)],
+ hash) {
+ if (node->addr == addr) {
+ tipc_node_get(node);
+ rcu_read_unlock();
+ return node;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+struct tipc_node *tipc_node_create(struct net *net, u32 addr)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *n_ptr, *temp_node;
+
+ spin_lock_bh(&tn->node_list_lock);
+ n_ptr = tipc_node_find(net, addr);
+ if (n_ptr)
+ goto exit;
+ n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC);
+ if (!n_ptr) {
+ pr_warn("Node creation failed, no memory\n");
+ goto exit;
+ }
+ n_ptr->addr = addr;
+ n_ptr->net = net;
+ kref_init(&n_ptr->kref);
+ spin_lock_init(&n_ptr->lock);
+ INIT_HLIST_NODE(&n_ptr->hash);
+ INIT_LIST_HEAD(&n_ptr->list);
+ INIT_LIST_HEAD(&n_ptr->publ_list);
+ INIT_LIST_HEAD(&n_ptr->conn_sks);
+ __skb_queue_head_init(&n_ptr->bclink.deferdq);
+ hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
+ list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
+ if (n_ptr->addr < temp_node->addr)
+ break;
+ }
+ list_add_tail_rcu(&n_ptr->list, &temp_node->list);
+ n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN;
+ n_ptr->signature = INVALID_NODE_SIG;
+ tipc_node_get(n_ptr);
+exit:
+ spin_unlock_bh(&tn->node_list_lock);
+ return n_ptr;
+}
+
+static void tipc_node_delete(struct tipc_node *node)
+{
+ list_del_rcu(&node->list);
+ hlist_del_rcu(&node->hash);
+ kfree_rcu(node, rcu);
+}
+
+void tipc_node_stop(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *node, *t_node;
+
+ spin_lock_bh(&tn->node_list_lock);
+ list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+ tipc_node_put(node);
+ spin_unlock_bh(&tn->node_list_lock);
+}
+
+int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
+{
+ struct tipc_node *node;
+ struct tipc_sock_conn *conn;
+ int err = 0;
+
+ if (in_own_node(net, dnode))
+ return 0;
+
+ node = tipc_node_find(net, dnode);
+ if (!node) {
+ pr_warn("Connecting sock to node 0x%x failed\n", dnode);
+ return -EHOSTUNREACH;
+ }
+ conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
+ if (!conn) {
+ err = -EHOSTUNREACH;
+ goto exit;
+ }
+ conn->peer_node = dnode;
+ conn->port = port;
+ conn->peer_port = peer_port;
+
+ tipc_node_lock(node);
+ list_add_tail(&conn->list, &node->conn_sks);
+ tipc_node_unlock(node);
+exit:
+ tipc_node_put(node);
+ return err;
+}
+
+void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
+{
+ struct tipc_node *node;
+ struct tipc_sock_conn *conn, *safe;
+
+ if (in_own_node(net, dnode))
+ return;
+
+ node = tipc_node_find(net, dnode);
+ if (!node)
+ return;
+
+ tipc_node_lock(node);
+ list_for_each_entry_safe(conn, safe, &node->conn_sks, list) {
+ if (port != conn->port)
+ continue;
+ list_del(&conn->list);
+ kfree(conn);
+ }
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+}
+
+/**
+ * tipc_node_link_up - handle addition of link
+ *
+ * Link becomes active (alone or shared) or standby, depending on its priority.
+ */
+void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+{
+ struct tipc_link **active = &n_ptr->active_links[0];
+
+ n_ptr->working_links++;
+ n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP;
+ n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+
+ pr_debug("Established link <%s> on network plane %c\n",
+ l_ptr->name, l_ptr->net_plane);
+
+ if (!active[0]) {
+ active[0] = active[1] = l_ptr;
+ node_established_contact(n_ptr);
+ goto exit;
+ }
+ if (l_ptr->priority < active[0]->priority) {
+ pr_debug("New link <%s> becomes standby\n", l_ptr->name);
+ goto exit;
+ }
+ tipc_link_dup_queue_xmit(active[0], l_ptr);
+ if (l_ptr->priority == active[0]->priority) {
+ active[0] = l_ptr;
+ goto exit;
+ }
+ pr_debug("Old link <%s> becomes standby\n", active[0]->name);
+ if (active[1] != active[0])
+ pr_debug("Old link <%s> becomes standby\n", active[1]->name);
+ active[0] = active[1] = l_ptr;
+exit:
+ /* Leave room for changeover header when returning 'mtu' to users: */
+ n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
+ n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+}
+
+/**
+ * node_select_active_links - select active link
+ */
+static void node_select_active_links(struct tipc_node *n_ptr)
+{
+ struct tipc_link **active = &n_ptr->active_links[0];
+ u32 i;
+ u32 highest_prio = 0;
+
+ active[0] = active[1] = NULL;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ struct tipc_link *l_ptr = n_ptr->links[i];
+
+ if (!l_ptr || !tipc_link_is_up(l_ptr) ||
+ (l_ptr->priority < highest_prio))
+ continue;
+
+ if (l_ptr->priority > highest_prio) {
+ highest_prio = l_ptr->priority;
+ active[0] = active[1] = l_ptr;
+ } else {
+ active[1] = l_ptr;
+ }
+ }
+}
+
+/**
+ * tipc_node_link_down - handle loss of link
+ */
+void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+{
+ struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
+ struct tipc_link **active;
+
+ n_ptr->working_links--;
+ n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN;
+ n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id;
+
+ if (!tipc_link_is_active(l_ptr)) {
+ pr_debug("Lost standby link <%s> on network plane %c\n",
+ l_ptr->name, l_ptr->net_plane);
+ return;
+ }
+ pr_debug("Lost link <%s> on network plane %c\n",
+ l_ptr->name, l_ptr->net_plane);
+
+ active = &n_ptr->active_links[0];
+ if (active[0] == l_ptr)
+ active[0] = active[1];
+ if (active[1] == l_ptr)
+ active[1] = active[0];
+ if (active[0] == l_ptr)
+ node_select_active_links(n_ptr);
+ if (tipc_node_is_up(n_ptr))
+ tipc_link_failover_send_queue(l_ptr);
+ else
+ node_lost_contact(n_ptr);
+
+ /* Leave room for changeover header when returning 'mtu' to users: */
+ if (active[0]) {
+ n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE;
+ n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE;
+ return;
+ }
+ /* Loopback link went down? No fragmentation needed from now on. */
+ if (n_ptr->addr == tn->own_addr) {
+ n_ptr->act_mtus[0] = MAX_MSG_SIZE;
+ n_ptr->act_mtus[1] = MAX_MSG_SIZE;
+ }
+}
+
+int tipc_node_active_links(struct tipc_node *n_ptr)
+{
+ return n_ptr->active_links[0] != NULL;
+}
+
+int tipc_node_is_up(struct tipc_node *n_ptr)
+{
+ return tipc_node_active_links(n_ptr);
+}
+
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+{
+ n_ptr->links[l_ptr->bearer_id] = l_ptr;
+ n_ptr->link_cnt++;
+}
+
+void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr)
+{
+ int i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if (l_ptr != n_ptr->links[i])
+ continue;
+ n_ptr->links[i] = NULL;
+ n_ptr->link_cnt--;
+ }
+}
+
+static void node_established_contact(struct tipc_node *n_ptr)
+{
+ n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP;
+ n_ptr->bclink.oos_state = 0;
+ n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net);
+ tipc_bclink_add_node(n_ptr->net, n_ptr->addr);
+}
+
+static void node_lost_contact(struct tipc_node *n_ptr)
+{
+ char addr_string[16];
+ struct tipc_sock_conn *conn, *safe;
+ struct list_head *conns = &n_ptr->conn_sks;
+ struct sk_buff *skb;
+ struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id);
+ uint i;
+
+ pr_debug("Lost contact with %s\n",
+ tipc_addr_string_fill(addr_string, n_ptr->addr));
+
+ /* Flush broadcast link info associated with lost node */
+ if (n_ptr->bclink.recv_permitted) {
+ __skb_queue_purge(&n_ptr->bclink.deferdq);
+
+ if (n_ptr->bclink.reasm_buf) {
+ kfree_skb(n_ptr->bclink.reasm_buf);
+ n_ptr->bclink.reasm_buf = NULL;
+ }
+
+ tipc_bclink_remove_node(n_ptr->net, n_ptr->addr);
+ tipc_bclink_acknowledge(n_ptr, INVALID_LINK_SEQ);
+
+ n_ptr->bclink.recv_permitted = false;
+ }
+
+ /* Abort any ongoing link failover */
+ for (i = 0; i < MAX_BEARERS; i++) {
+ struct tipc_link *l_ptr = n_ptr->links[i];
+ if (!l_ptr)
+ continue;
+ l_ptr->flags &= ~LINK_FAILINGOVER;
+ l_ptr->failover_checkpt = 0;
+ l_ptr->failover_pkts = 0;
+ kfree_skb(l_ptr->failover_skb);
+ l_ptr->failover_skb = NULL;
+ tipc_link_reset_fragments(l_ptr);
+ }
+
+ n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN;
+
+ /* Prevent re-contact with node until cleanup is done */
+ n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN;
+
+ /* Notify publications from this node */
+ n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN;
+
+ /* Notify sockets connected to node */
+ list_for_each_entry_safe(conn, safe, conns, list) {
+ skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG,
+ SHORT_H_SIZE, 0, tn->own_addr,
+ conn->peer_node, conn->port,
+ conn->peer_port, TIPC_ERR_NO_NODE);
+ if (likely(skb)) {
+ skb_queue_tail(n_ptr->inputq, skb);
+ n_ptr->action_flags |= TIPC_MSG_EVT;
+ }
+ list_del(&conn->list);
+ kfree(conn);
+ }
+}
+
+/**
+ * tipc_node_get_linkname - get the name of a link
+ *
+ * @bearer_id: id of the bearer
+ * @node: peer node address
+ * @linkname: link name output buffer
+ *
+ * Returns 0 on success
+ */
+int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
+ char *linkname, size_t len)
+{
+ struct tipc_link *link;
+ int err = -EINVAL;
+ struct tipc_node *node = tipc_node_find(net, addr);
+
+ if (!node)
+ return err;
+
+ if (bearer_id >= MAX_BEARERS)
+ goto exit;
+
+ tipc_node_lock(node);
+ link = node->links[bearer_id];
+ if (link) {
+ strncpy(linkname, link->name, len);
+ err = 0;
+ }
+exit:
+ tipc_node_unlock(node);
+ tipc_node_put(node);
+ return err;
+}
+
+void tipc_node_unlock(struct tipc_node *node)
+{
+ struct net *net = node->net;
+ u32 addr = 0;
+ u32 flags = node->action_flags;
+ u32 link_id = 0;
+ struct list_head *publ_list;
+ struct sk_buff_head *inputq = node->inputq;
+ struct sk_buff_head *namedq;
+
+ if (likely(!flags || (flags == TIPC_MSG_EVT))) {
+ node->action_flags = 0;
+ spin_unlock_bh(&node->lock);
+ if (flags == TIPC_MSG_EVT)
+ tipc_sk_rcv(net, inputq);
+ return;
+ }
+
+ addr = node->addr;
+ link_id = node->link_id;
+ namedq = node->namedq;
+ publ_list = &node->publ_list;
+
+ node->action_flags &= ~(TIPC_MSG_EVT |
+ TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
+ TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP |
+ TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT |
+ TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET);
+
+ spin_unlock_bh(&node->lock);
+
+ if (flags & TIPC_NOTIFY_NODE_DOWN)
+ tipc_publ_notify(net, publ_list, addr);
+
+ if (flags & TIPC_WAKEUP_BCAST_USERS)
+ tipc_bclink_wakeup_users(net);
+
+ if (flags & TIPC_NOTIFY_NODE_UP)
+ tipc_named_node_up(net, addr);
+
+ if (flags & TIPC_NOTIFY_LINK_UP)
+ tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
+ TIPC_NODE_SCOPE, link_id, addr);
+
+ if (flags & TIPC_NOTIFY_LINK_DOWN)
+ tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
+ link_id, addr);
+
+ if (flags & TIPC_MSG_EVT)
+ tipc_sk_rcv(net, inputq);
+
+ if (flags & TIPC_NAMED_MSG_EVT)
+ tipc_named_rcv(net, namedq);
+
+ if (flags & TIPC_BCAST_MSG_EVT)
+ tipc_bclink_input(net);
+
+ if (flags & TIPC_BCAST_RESET)
+ tipc_link_reset_all(node);
+}
+
+/* Caller should hold node lock for the passed node */
+static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
+{
+ void *hdr;
+ struct nlattr *attrs;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_NODE_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_NODE);
+ if (!attrs)
+ goto msg_full;
+
+ if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr))
+ goto attr_msg_full;
+ if (tipc_node_is_up(node))
+ if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP))
+ goto attr_msg_full;
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ int done = cb->args[0];
+ int last_addr = cb->args[1];
+ struct tipc_node *node;
+ struct tipc_nl_msg msg;
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ if (last_addr) {
+ node = tipc_node_find(net, last_addr);
+ if (!node) {
+ rcu_read_unlock();
+ /* We never set seq or call nl_dump_check_consistent()
+ * this means that setting prev_seq here will cause the
+ * consistence check to fail in the netlink callback
+ * handler. Resulting in the NLMSG_DONE message having
+ * the NLM_F_DUMP_INTR flag set if the node state
+ * changed while we released the lock.
+ */
+ cb->prev_seq = 1;
+ return -EPIPE;
+ }
+ tipc_node_put(node);
+ }
+
+ list_for_each_entry_rcu(node, &tn->node_list, list) {
+ if (last_addr) {
+ if (node->addr == last_addr)
+ last_addr = 0;
+ else
+ continue;
+ }
+
+ tipc_node_lock(node);
+ err = __tipc_nl_add_node(&msg, node);
+ if (err) {
+ last_addr = node->addr;
+ tipc_node_unlock(node);
+ goto out;
+ }
+
+ tipc_node_unlock(node);
+ }
+ done = 1;
+out:
+ cb->args[0] = done;
+ cb->args[1] = last_addr;
+ rcu_read_unlock();
+
+ return skb->len;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
new file mode 100644
index 000000000..02d5c20dc
--- /dev/null
+++ b/net/tipc/node.h
@@ -0,0 +1,187 @@
+/*
+ * net/tipc/node.h: Include file for TIPC node management routines
+ *
+ * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2005, 2010-2014, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NODE_H
+#define _TIPC_NODE_H
+
+#include "addr.h"
+#include "net.h"
+#include "bearer.h"
+#include "msg.h"
+
+/* Out-of-range value for node signature */
+#define INVALID_NODE_SIG 0x10000
+
+#define NODE_HTABLE_SIZE 512
+
+/* Flags used to take different actions according to flag type
+ * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down
+ * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down
+ * TIPC_NOTIFY_NODE_DOWN: notify node is down
+ * TIPC_NOTIFY_NODE_UP: notify node is up
+ * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
+ */
+enum {
+ TIPC_MSG_EVT = 1,
+ TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1),
+ TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2),
+ TIPC_NOTIFY_NODE_DOWN = (1 << 3),
+ TIPC_NOTIFY_NODE_UP = (1 << 4),
+ TIPC_WAKEUP_BCAST_USERS = (1 << 5),
+ TIPC_NOTIFY_LINK_UP = (1 << 6),
+ TIPC_NOTIFY_LINK_DOWN = (1 << 7),
+ TIPC_NAMED_MSG_EVT = (1 << 8),
+ TIPC_BCAST_MSG_EVT = (1 << 9),
+ TIPC_BCAST_RESET = (1 << 10)
+};
+
+/**
+ * struct tipc_node_bclink - TIPC node bclink structure
+ * @acked: sequence # of last outbound b'cast message acknowledged by node
+ * @last_in: sequence # of last in-sequence b'cast message received from node
+ * @last_sent: sequence # of last b'cast message sent by node
+ * @oos_state: state tracker for handling OOS b'cast messages
+ * @deferred_queue: deferred queue saved OOS b'cast message received from node
+ * @reasm_buf: broadcast reassembly queue head from node
+ * @inputq_map: bitmap indicating which inqueues should be kicked
+ * @recv_permitted: true if node is allowed to receive b'cast messages
+ */
+struct tipc_node_bclink {
+ u32 acked;
+ u32 last_in;
+ u32 last_sent;
+ u32 oos_state;
+ u32 deferred_size;
+ struct sk_buff_head deferdq;
+ struct sk_buff *reasm_buf;
+ int inputq_map;
+ bool recv_permitted;
+};
+
+/**
+ * struct tipc_node - TIPC node structure
+ * @addr: network address of node
+ * @ref: reference counter to node object
+ * @lock: spinlock governing access to structure
+ * @net: the applicable net namespace
+ * @hash: links to adjacent nodes in unsorted hash chain
+ * @inputq: pointer to input queue containing messages for msg event
+ * @namedq: pointer to name table input queue with name table messages
+ * @curr_link: the link holding the node lock, if any
+ * @active_links: pointers to active links to node
+ * @links: pointers to all links to node
+ * @action_flags: bit mask of different types of node actions
+ * @bclink: broadcast-related info
+ * @list: links to adjacent nodes in sorted list of cluster's nodes
+ * @working_links: number of working links to node (both active and standby)
+ * @link_cnt: number of links to node
+ * @capabilities: bitmap, indicating peer node's functional capabilities
+ * @signature: node instance identifier
+ * @link_id: local and remote bearer ids of changing link, if any
+ * @publ_list: list of publications
+ * @rcu: rcu struct for tipc_node
+ */
+struct tipc_node {
+ u32 addr;
+ struct kref kref;
+ spinlock_t lock;
+ struct net *net;
+ struct hlist_node hash;
+ struct sk_buff_head *inputq;
+ struct sk_buff_head *namedq;
+ struct tipc_link *active_links[2];
+ u32 act_mtus[2];
+ struct tipc_link *links[MAX_BEARERS];
+ int action_flags;
+ struct tipc_node_bclink bclink;
+ struct list_head list;
+ int link_cnt;
+ u16 working_links;
+ u16 capabilities;
+ u32 signature;
+ u32 link_id;
+ struct list_head publ_list;
+ struct list_head conn_sks;
+ struct rcu_head rcu;
+};
+
+struct tipc_node *tipc_node_find(struct net *net, u32 addr);
+void tipc_node_put(struct tipc_node *node);
+struct tipc_node *tipc_node_create(struct net *net, u32 addr);
+void tipc_node_stop(struct net *net);
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
+void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
+void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
+void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
+int tipc_node_active_links(struct tipc_node *n_ptr);
+int tipc_node_is_up(struct tipc_node *n_ptr);
+int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
+ char *linkname, size_t len);
+void tipc_node_unlock(struct tipc_node *node);
+int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
+void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
+
+int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+static inline void tipc_node_lock(struct tipc_node *node)
+{
+ spin_lock_bh(&node->lock);
+}
+
+static inline bool tipc_node_blocked(struct tipc_node *node)
+{
+ return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN |
+ TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN));
+}
+
+static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector)
+{
+ struct tipc_node *node;
+ u32 mtu;
+
+ node = tipc_node_find(net, addr);
+
+ if (likely(node)) {
+ mtu = node->act_mtus[selector & 1];
+ tipc_node_put(node);
+ } else {
+ mtu = MAX_MSG_SIZE;
+ }
+
+ return mtu;
+}
+
+#endif
diff --git a/net/tipc/server.c b/net/tipc/server.c
new file mode 100644
index 000000000..77ff03ed1
--- /dev/null
+++ b/net/tipc/server.c
@@ -0,0 +1,634 @@
+/*
+ * net/tipc/server.c: TIPC server infrastructure
+ *
+ * Copyright (c) 2012-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "server.h"
+#include "core.h"
+#include "socket.h"
+#include <net/sock.h>
+#include <linux/module.h>
+
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT 25
+#define MAX_RECV_MSG_COUNT 25
+#define CF_CONNECTED 1
+#define CF_SERVER 2
+
+#define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data)
+
+/**
+ * struct tipc_conn - TIPC connection structure
+ * @kref: reference counter to connection object
+ * @conid: connection identifier
+ * @sock: socket handler associated with connection
+ * @flags: indicates connection state
+ * @server: pointer to connected server
+ * @rwork: receive work item
+ * @usr_data: user-specified field
+ * @rx_action: what to do when connection socket is active
+ * @outqueue: pointer to first outbound message in queue
+ * @outqueue_lock: control access to the outqueue
+ * @outqueue: list of connection objects for its server
+ * @swork: send work item
+ */
+struct tipc_conn {
+ struct kref kref;
+ int conid;
+ struct socket *sock;
+ unsigned long flags;
+ struct tipc_server *server;
+ struct work_struct rwork;
+ int (*rx_action) (struct tipc_conn *con);
+ void *usr_data;
+ struct list_head outqueue;
+ spinlock_t outqueue_lock;
+ struct work_struct swork;
+};
+
+/* An entry waiting to be sent */
+struct outqueue_entry {
+ struct list_head list;
+ struct kvec iov;
+ struct sockaddr_tipc dest;
+};
+
+static void tipc_recv_work(struct work_struct *work);
+static void tipc_send_work(struct work_struct *work);
+static void tipc_clean_outqueues(struct tipc_conn *con);
+
+static void tipc_conn_kref_release(struct kref *kref)
+{
+ struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
+ struct sockaddr_tipc *saddr = con->server->saddr;
+ struct socket *sock = con->sock;
+ struct sock *sk;
+
+ if (sock) {
+ sk = sock->sk;
+ if (test_bit(CF_SERVER, &con->flags)) {
+ __module_get(sock->ops->owner);
+ __module_get(sk->sk_prot_creator->owner);
+ }
+ saddr->scope = -TIPC_NODE_SCOPE;
+ kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
+ sock_release(sock);
+ con->sock = NULL;
+ }
+
+ tipc_clean_outqueues(con);
+ kfree(con);
+}
+
+static void conn_put(struct tipc_conn *con)
+{
+ kref_put(&con->kref, tipc_conn_kref_release);
+}
+
+static void conn_get(struct tipc_conn *con)
+{
+ kref_get(&con->kref);
+}
+
+static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
+{
+ struct tipc_conn *con;
+
+ spin_lock_bh(&s->idr_lock);
+ con = idr_find(&s->conn_idr, conid);
+ if (con)
+ conn_get(con);
+ spin_unlock_bh(&s->idr_lock);
+ return con;
+}
+
+static void sock_data_ready(struct sock *sk)
+{
+ struct tipc_conn *con;
+
+ read_lock(&sk->sk_callback_lock);
+ con = sock2con(sk);
+ if (con && test_bit(CF_CONNECTED, &con->flags)) {
+ conn_get(con);
+ if (!queue_work(con->server->rcv_wq, &con->rwork))
+ conn_put(con);
+ }
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static void sock_write_space(struct sock *sk)
+{
+ struct tipc_conn *con;
+
+ read_lock(&sk->sk_callback_lock);
+ con = sock2con(sk);
+ if (con && test_bit(CF_CONNECTED, &con->flags)) {
+ conn_get(con);
+ if (!queue_work(con->server->send_wq, &con->swork))
+ conn_put(con);
+ }
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
+{
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ sk->sk_data_ready = sock_data_ready;
+ sk->sk_write_space = sock_write_space;
+ sk->sk_user_data = con;
+
+ con->sock = sock;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void tipc_unregister_callbacks(struct tipc_conn *con)
+{
+ struct sock *sk = con->sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void tipc_close_conn(struct tipc_conn *con)
+{
+ struct tipc_server *s = con->server;
+
+ if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
+ if (con->conid)
+ s->tipc_conn_shutdown(con->conid, con->usr_data);
+
+ spin_lock_bh(&s->idr_lock);
+ idr_remove(&s->conn_idr, con->conid);
+ s->idr_in_use--;
+ spin_unlock_bh(&s->idr_lock);
+
+ tipc_unregister_callbacks(con);
+
+ /* We shouldn't flush pending works as we may be in the
+ * thread. In fact the races with pending rx/tx work structs
+ * are harmless for us here as we have already deleted this
+ * connection from server connection list and set
+ * sk->sk_user_data to 0 before releasing connection object.
+ */
+ kernel_sock_shutdown(con->sock, SHUT_RDWR);
+
+ conn_put(con);
+ }
+}
+
+static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
+{
+ struct tipc_conn *con;
+ int ret;
+
+ con = kzalloc(sizeof(struct tipc_conn), GFP_ATOMIC);
+ if (!con)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&con->kref);
+ INIT_LIST_HEAD(&con->outqueue);
+ spin_lock_init(&con->outqueue_lock);
+ INIT_WORK(&con->swork, tipc_send_work);
+ INIT_WORK(&con->rwork, tipc_recv_work);
+
+ spin_lock_bh(&s->idr_lock);
+ ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ kfree(con);
+ spin_unlock_bh(&s->idr_lock);
+ return ERR_PTR(-ENOMEM);
+ }
+ con->conid = ret;
+ s->idr_in_use++;
+ spin_unlock_bh(&s->idr_lock);
+
+ set_bit(CF_CONNECTED, &con->flags);
+ con->server = s;
+
+ return con;
+}
+
+static int tipc_receive_from_sock(struct tipc_conn *con)
+{
+ struct msghdr msg = {};
+ struct tipc_server *s = con->server;
+ struct sockaddr_tipc addr;
+ struct kvec iov;
+ void *buf;
+ int ret;
+
+ buf = kmem_cache_alloc(s->rcvbuf_cache, GFP_ATOMIC);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out_close;
+ }
+
+ iov.iov_base = buf;
+ iov.iov_len = s->max_rcvbuf_size;
+ msg.msg_name = &addr;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ MSG_DONTWAIT);
+ if (ret <= 0) {
+ kmem_cache_free(s->rcvbuf_cache, buf);
+ goto out_close;
+ }
+
+ s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, &addr,
+ con->usr_data, buf, ret);
+
+ kmem_cache_free(s->rcvbuf_cache, buf);
+
+ return 0;
+
+out_close:
+ if (ret != -EWOULDBLOCK)
+ tipc_close_conn(con);
+ else if (ret == 0)
+ /* Don't return success if we really got EOF */
+ ret = -EAGAIN;
+
+ return ret;
+}
+
+static int tipc_accept_from_sock(struct tipc_conn *con)
+{
+ struct tipc_server *s = con->server;
+ struct socket *sock = con->sock;
+ struct socket *newsock;
+ struct tipc_conn *newcon;
+ int ret;
+
+ ret = kernel_accept(sock, &newsock, O_NONBLOCK);
+ if (ret < 0)
+ return ret;
+
+ newcon = tipc_alloc_conn(con->server);
+ if (IS_ERR(newcon)) {
+ ret = PTR_ERR(newcon);
+ sock_release(newsock);
+ return ret;
+ }
+
+ newcon->rx_action = tipc_receive_from_sock;
+ tipc_register_callbacks(newsock, newcon);
+
+ /* Notify that new connection is incoming */
+ newcon->usr_data = s->tipc_conn_new(newcon->conid);
+
+ /* Wake up receive process in case of 'SYN+' message */
+ newsock->sk->sk_data_ready(newsock->sk);
+ return ret;
+}
+
+static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
+{
+ struct tipc_server *s = con->server;
+ struct socket *sock = NULL;
+ int ret;
+
+ ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1);
+ if (ret < 0)
+ return NULL;
+ ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
+ (char *)&s->imp, sizeof(s->imp));
+ if (ret < 0)
+ goto create_err;
+ ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr));
+ if (ret < 0)
+ goto create_err;
+
+ switch (s->type) {
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ con->rx_action = tipc_accept_from_sock;
+
+ ret = kernel_listen(sock, 0);
+ if (ret < 0)
+ goto create_err;
+ break;
+ case SOCK_DGRAM:
+ case SOCK_RDM:
+ con->rx_action = tipc_receive_from_sock;
+ break;
+ default:
+ pr_err("Unknown socket type %d\n", s->type);
+ goto create_err;
+ }
+
+ /* As server's listening socket owner and creator is the same module,
+ * we have to decrease TIPC module reference count to guarantee that
+ * it remains zero after the server socket is created, otherwise,
+ * executing "rmmod" command is unable to make TIPC module deleted
+ * after TIPC module is inserted successfully.
+ *
+ * However, the reference count is ever increased twice in
+ * sock_create_kern(): one is to increase the reference count of owner
+ * of TIPC socket's proto_ops struct; another is to increment the
+ * reference count of owner of TIPC proto struct. Therefore, we must
+ * decrement the module reference count twice to ensure that it keeps
+ * zero after server's listening socket is created. Of course, we
+ * must bump the module reference count twice as well before the socket
+ * is closed.
+ */
+ module_put(sock->ops->owner);
+ module_put(sock->sk->sk_prot_creator->owner);
+ set_bit(CF_SERVER, &con->flags);
+
+ return sock;
+
+create_err:
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sock_release(sock);
+ return NULL;
+}
+
+static int tipc_open_listening_sock(struct tipc_server *s)
+{
+ struct socket *sock;
+ struct tipc_conn *con;
+
+ con = tipc_alloc_conn(s);
+ if (IS_ERR(con))
+ return PTR_ERR(con);
+
+ sock = tipc_create_listen_sock(con);
+ if (!sock) {
+ idr_remove(&s->conn_idr, con->conid);
+ s->idr_in_use--;
+ kfree(con);
+ return -EINVAL;
+ }
+
+ tipc_register_callbacks(sock, con);
+ return 0;
+}
+
+static struct outqueue_entry *tipc_alloc_entry(void *data, int len)
+{
+ struct outqueue_entry *entry;
+ void *buf;
+
+ entry = kmalloc(sizeof(struct outqueue_entry), GFP_ATOMIC);
+ if (!entry)
+ return NULL;
+
+ buf = kmalloc(len, GFP_ATOMIC);
+ if (!buf) {
+ kfree(entry);
+ return NULL;
+ }
+
+ memcpy(buf, data, len);
+ entry->iov.iov_base = buf;
+ entry->iov.iov_len = len;
+
+ return entry;
+}
+
+static void tipc_free_entry(struct outqueue_entry *e)
+{
+ kfree(e->iov.iov_base);
+ kfree(e);
+}
+
+static void tipc_clean_outqueues(struct tipc_conn *con)
+{
+ struct outqueue_entry *e, *safe;
+
+ spin_lock_bh(&con->outqueue_lock);
+ list_for_each_entry_safe(e, safe, &con->outqueue, list) {
+ list_del(&e->list);
+ tipc_free_entry(e);
+ }
+ spin_unlock_bh(&con->outqueue_lock);
+}
+
+int tipc_conn_sendmsg(struct tipc_server *s, int conid,
+ struct sockaddr_tipc *addr, void *data, size_t len)
+{
+ struct outqueue_entry *e;
+ struct tipc_conn *con;
+
+ con = tipc_conn_lookup(s, conid);
+ if (!con)
+ return -EINVAL;
+
+ e = tipc_alloc_entry(data, len);
+ if (!e) {
+ conn_put(con);
+ return -ENOMEM;
+ }
+
+ if (addr)
+ memcpy(&e->dest, addr, sizeof(struct sockaddr_tipc));
+
+ spin_lock_bh(&con->outqueue_lock);
+ list_add_tail(&e->list, &con->outqueue);
+ spin_unlock_bh(&con->outqueue_lock);
+
+ if (test_bit(CF_CONNECTED, &con->flags)) {
+ if (!queue_work(s->send_wq, &con->swork))
+ conn_put(con);
+ } else {
+ conn_put(con);
+ }
+ return 0;
+}
+
+void tipc_conn_terminate(struct tipc_server *s, int conid)
+{
+ struct tipc_conn *con;
+
+ con = tipc_conn_lookup(s, conid);
+ if (con) {
+ tipc_close_conn(con);
+ conn_put(con);
+ }
+}
+
+static void tipc_send_to_sock(struct tipc_conn *con)
+{
+ int count = 0;
+ struct tipc_server *s = con->server;
+ struct outqueue_entry *e;
+ struct msghdr msg;
+ int ret;
+
+ spin_lock_bh(&con->outqueue_lock);
+ while (1) {
+ e = list_entry(con->outqueue.next, struct outqueue_entry,
+ list);
+ if ((struct list_head *) e == &con->outqueue)
+ break;
+ spin_unlock_bh(&con->outqueue_lock);
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT;
+
+ if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) {
+ msg.msg_name = &e->dest;
+ msg.msg_namelen = sizeof(struct sockaddr_tipc);
+ }
+ ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1,
+ e->iov.iov_len);
+ if (ret == -EWOULDBLOCK || ret == 0) {
+ cond_resched();
+ goto out;
+ } else if (ret < 0) {
+ goto send_err;
+ }
+
+ /* Don't starve users filling buffers */
+ if (++count >= MAX_SEND_MSG_COUNT) {
+ cond_resched();
+ count = 0;
+ }
+
+ spin_lock_bh(&con->outqueue_lock);
+ list_del(&e->list);
+ tipc_free_entry(e);
+ }
+ spin_unlock_bh(&con->outqueue_lock);
+out:
+ return;
+
+send_err:
+ tipc_close_conn(con);
+}
+
+static void tipc_recv_work(struct work_struct *work)
+{
+ struct tipc_conn *con = container_of(work, struct tipc_conn, rwork);
+ int count = 0;
+
+ while (test_bit(CF_CONNECTED, &con->flags)) {
+ if (con->rx_action(con))
+ break;
+
+ /* Don't flood Rx machine */
+ if (++count >= MAX_RECV_MSG_COUNT) {
+ cond_resched();
+ count = 0;
+ }
+ }
+ conn_put(con);
+}
+
+static void tipc_send_work(struct work_struct *work)
+{
+ struct tipc_conn *con = container_of(work, struct tipc_conn, swork);
+
+ if (test_bit(CF_CONNECTED, &con->flags))
+ tipc_send_to_sock(con);
+
+ conn_put(con);
+}
+
+static void tipc_work_stop(struct tipc_server *s)
+{
+ destroy_workqueue(s->rcv_wq);
+ destroy_workqueue(s->send_wq);
+}
+
+static int tipc_work_start(struct tipc_server *s)
+{
+ s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1);
+ if (!s->rcv_wq) {
+ pr_err("can't start tipc receive workqueue\n");
+ return -ENOMEM;
+ }
+
+ s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1);
+ if (!s->send_wq) {
+ pr_err("can't start tipc send workqueue\n");
+ destroy_workqueue(s->rcv_wq);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int tipc_server_start(struct tipc_server *s)
+{
+ int ret;
+
+ spin_lock_init(&s->idr_lock);
+ idr_init(&s->conn_idr);
+ s->idr_in_use = 0;
+
+ s->rcvbuf_cache = kmem_cache_create(s->name, s->max_rcvbuf_size,
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!s->rcvbuf_cache)
+ return -ENOMEM;
+
+ ret = tipc_work_start(s);
+ if (ret < 0) {
+ kmem_cache_destroy(s->rcvbuf_cache);
+ return ret;
+ }
+ ret = tipc_open_listening_sock(s);
+ if (ret < 0) {
+ tipc_work_stop(s);
+ kmem_cache_destroy(s->rcvbuf_cache);
+ return ret;
+ }
+ return ret;
+}
+
+void tipc_server_stop(struct tipc_server *s)
+{
+ struct tipc_conn *con;
+ int total = 0;
+ int id;
+
+ spin_lock_bh(&s->idr_lock);
+ for (id = 0; total < s->idr_in_use; id++) {
+ con = idr_find(&s->conn_idr, id);
+ if (con) {
+ total++;
+ spin_unlock_bh(&s->idr_lock);
+ tipc_close_conn(con);
+ spin_lock_bh(&s->idr_lock);
+ }
+ }
+ spin_unlock_bh(&s->idr_lock);
+
+ tipc_work_stop(s);
+ kmem_cache_destroy(s->rcvbuf_cache);
+ idr_destroy(&s->conn_idr);
+}
diff --git a/net/tipc/server.h b/net/tipc/server.h
new file mode 100644
index 000000000..9015faedb
--- /dev/null
+++ b/net/tipc/server.h
@@ -0,0 +1,97 @@
+/*
+ * net/tipc/server.h: Include file for TIPC server code
+ *
+ * Copyright (c) 2012-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SERVER_H
+#define _TIPC_SERVER_H
+
+#include <linux/idr.h>
+#include <linux/tipc.h>
+#include <net/net_namespace.h>
+
+#define TIPC_SERVER_NAME_LEN 32
+
+/**
+ * struct tipc_server - TIPC server structure
+ * @conn_idr: identifier set of connection
+ * @idr_lock: protect the connection identifier set
+ * @idr_in_use: amount of allocated identifier entry
+ * @net: network namspace instance
+ * @rcvbuf_cache: memory cache of server receive buffer
+ * @rcv_wq: receive workqueue
+ * @send_wq: send workqueue
+ * @max_rcvbuf_size: maximum permitted receive message length
+ * @tipc_conn_new: callback will be called when new connection is incoming
+ * @tipc_conn_shutdown: callback will be called when connection is shut down
+ * @tipc_conn_recvmsg: callback will be called when message arrives
+ * @saddr: TIPC server address
+ * @name: server name
+ * @imp: message importance
+ * @type: socket type
+ */
+struct tipc_server {
+ struct idr conn_idr;
+ spinlock_t idr_lock;
+ int idr_in_use;
+ struct net *net;
+ struct kmem_cache *rcvbuf_cache;
+ struct workqueue_struct *rcv_wq;
+ struct workqueue_struct *send_wq;
+ int max_rcvbuf_size;
+ void *(*tipc_conn_new)(int conid);
+ void (*tipc_conn_shutdown)(int conid, void *usr_data);
+ void (*tipc_conn_recvmsg)(struct net *net, int conid,
+ struct sockaddr_tipc *addr, void *usr_data,
+ void *buf, size_t len);
+ struct sockaddr_tipc *saddr;
+ char name[TIPC_SERVER_NAME_LEN];
+ int imp;
+ int type;
+};
+
+int tipc_conn_sendmsg(struct tipc_server *s, int conid,
+ struct sockaddr_tipc *addr, void *data, size_t len);
+
+/**
+ * tipc_conn_terminate - terminate connection with server
+ *
+ * Note: Must call it in process context since it might sleep
+ */
+void tipc_conn_terminate(struct tipc_server *s, int conid);
+
+int tipc_server_start(struct tipc_server *s);
+
+void tipc_server_stop(struct tipc_server *s);
+
+#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
new file mode 100644
index 000000000..f485600c4
--- /dev/null
+++ b/net/tipc/socket.c
@@ -0,0 +1,2837 @@
+/*
+ * net/tipc/socket.c: TIPC socket API
+ *
+ * Copyright (c) 2001-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 2004-2008, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/rhashtable.h>
+#include "core.h"
+#include "name_table.h"
+#include "node.h"
+#include "link.h"
+#include "name_distr.h"
+#include "socket.h"
+
+#define SS_LISTENING -1 /* socket is listening */
+#define SS_READY -2 /* socket is connectionless */
+
+#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
+#define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */
+#define TIPC_FWD_MSG 1
+#define TIPC_CONN_OK 0
+#define TIPC_CONN_PROBING 1
+#define TIPC_MAX_PORT 0xffffffff
+#define TIPC_MIN_PORT 1
+
+/**
+ * struct tipc_sock - TIPC socket structure
+ * @sk: socket - interacts with 'port' and with user via the socket API
+ * @connected: non-zero if port is currently connected to a peer port
+ * @conn_type: TIPC type used when connection was established
+ * @conn_instance: TIPC instance used when connection was established
+ * @published: non-zero if port has one or more associated names
+ * @max_pkt: maximum packet size "hint" used when building messages sent by port
+ * @portid: unique port identity in TIPC socket hash table
+ * @phdr: preformatted message header used when sending messages
+ * @port_list: adjacent ports in TIPC's global list of ports
+ * @publications: list of publications for port
+ * @pub_count: total # of publications port has made during its lifetime
+ * @probing_state:
+ * @probing_intv:
+ * @conn_timeout: the time we can wait for an unresponded setup request
+ * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
+ * @link_cong: non-zero if owner must sleep because of link congestion
+ * @sent_unacked: # messages sent by socket, and not yet acked by peer
+ * @rcv_unacked: # messages read by user, but not yet acked back to peer
+ * @remote: 'connected' peer for dgram/rdm
+ * @node: hash table node
+ * @rcu: rcu struct for tipc_sock
+ */
+struct tipc_sock {
+ struct sock sk;
+ int connected;
+ u32 conn_type;
+ u32 conn_instance;
+ int published;
+ u32 max_pkt;
+ u32 portid;
+ struct tipc_msg phdr;
+ struct list_head sock_list;
+ struct list_head publications;
+ u32 pub_count;
+ u32 probing_state;
+ unsigned long probing_intv;
+ uint conn_timeout;
+ atomic_t dupl_rcvcnt;
+ bool link_cong;
+ uint sent_unacked;
+ uint rcv_unacked;
+ struct sockaddr_tipc remote;
+ struct rhash_head node;
+ struct rcu_head rcu;
+};
+
+static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+static void tipc_data_ready(struct sock *sk);
+static void tipc_write_space(struct sock *sk);
+static int tipc_release(struct socket *sock);
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
+static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
+static void tipc_sk_timeout(unsigned long data);
+static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
+ struct tipc_name_seq const *seq);
+static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
+ struct tipc_name_seq const *seq);
+static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
+static int tipc_sk_insert(struct tipc_sock *tsk);
+static void tipc_sk_remove(struct tipc_sock *tsk);
+static int __tipc_send_stream(struct socket *sock, struct msghdr *m,
+ size_t dsz);
+static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
+
+static const struct proto_ops packet_ops;
+static const struct proto_ops stream_ops;
+static const struct proto_ops msg_ops;
+static struct proto tipc_proto;
+
+static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
+ [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED },
+ [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG }
+};
+
+static const struct rhashtable_params tsk_rht_params;
+
+/*
+ * Revised TIPC socket locking policy:
+ *
+ * Most socket operations take the standard socket lock when they start
+ * and hold it until they finish (or until they need to sleep). Acquiring
+ * this lock grants the owner exclusive access to the fields of the socket
+ * data structures, with the exception of the backlog queue. A few socket
+ * operations can be done without taking the socket lock because they only
+ * read socket information that never changes during the life of the socket.
+ *
+ * Socket operations may acquire the lock for the associated TIPC port if they
+ * need to perform an operation on the port. If any routine needs to acquire
+ * both the socket lock and the port lock it must take the socket lock first
+ * to avoid the risk of deadlock.
+ *
+ * The dispatcher handling incoming messages cannot grab the socket lock in
+ * the standard fashion, since invoked it runs at the BH level and cannot block.
+ * Instead, it checks to see if the socket lock is currently owned by someone,
+ * and either handles the message itself or adds it to the socket's backlog
+ * queue; in the latter case the queued message is processed once the process
+ * owning the socket lock releases it.
+ *
+ * NOTE: Releasing the socket lock while an operation is sleeping overcomes
+ * the problem of a blocked socket operation preventing any other operations
+ * from occurring. However, applications must be careful if they have
+ * multiple threads trying to send (or receive) on the same socket, as these
+ * operations might interfere with each other. For example, doing a connect
+ * and a receive at the same time might allow the receive to consume the
+ * ACK message meant for the connect. While additional work could be done
+ * to try and overcome this, it doesn't seem to be worthwhile at the present.
+ *
+ * NOTE: Releasing the socket lock while an operation is sleeping also ensures
+ * that another operation that must be performed in a non-blocking manner is
+ * not delayed for very long because the lock has already been taken.
+ *
+ * NOTE: This code assumes that certain fields of a port/socket pair are
+ * constant over its lifetime; such fields can be examined without taking
+ * the socket lock and/or port lock, and do not need to be re-read even
+ * after resuming processing after waiting. These fields include:
+ * - socket type
+ * - pointer to socket sk structure (aka tipc_sock structure)
+ * - pointer to port structure
+ * - port reference
+ */
+
+static u32 tsk_own_node(struct tipc_sock *tsk)
+{
+ return msg_prevnode(&tsk->phdr);
+}
+
+static u32 tsk_peer_node(struct tipc_sock *tsk)
+{
+ return msg_destnode(&tsk->phdr);
+}
+
+static u32 tsk_peer_port(struct tipc_sock *tsk)
+{
+ return msg_destport(&tsk->phdr);
+}
+
+static bool tsk_unreliable(struct tipc_sock *tsk)
+{
+ return msg_src_droppable(&tsk->phdr) != 0;
+}
+
+static void tsk_set_unreliable(struct tipc_sock *tsk, bool unreliable)
+{
+ msg_set_src_droppable(&tsk->phdr, unreliable ? 1 : 0);
+}
+
+static bool tsk_unreturnable(struct tipc_sock *tsk)
+{
+ return msg_dest_droppable(&tsk->phdr) != 0;
+}
+
+static void tsk_set_unreturnable(struct tipc_sock *tsk, bool unreturnable)
+{
+ msg_set_dest_droppable(&tsk->phdr, unreturnable ? 1 : 0);
+}
+
+static int tsk_importance(struct tipc_sock *tsk)
+{
+ return msg_importance(&tsk->phdr);
+}
+
+static int tsk_set_importance(struct tipc_sock *tsk, int imp)
+{
+ if (imp > TIPC_CRITICAL_IMPORTANCE)
+ return -EINVAL;
+ msg_set_importance(&tsk->phdr, (u32)imp);
+ return 0;
+}
+
+static struct tipc_sock *tipc_sk(const struct sock *sk)
+{
+ return container_of(sk, struct tipc_sock, sk);
+}
+
+static int tsk_conn_cong(struct tipc_sock *tsk)
+{
+ return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN;
+}
+
+/**
+ * tsk_advance_rx_queue - discard first buffer in socket receive queue
+ *
+ * Caller must hold socket lock
+ */
+static void tsk_advance_rx_queue(struct sock *sk)
+{
+ kfree_skb(__skb_dequeue(&sk->sk_receive_queue));
+}
+
+/**
+ * tsk_rej_rx_queue - reject all buffers in socket receive queue
+ *
+ * Caller must hold socket lock
+ */
+static void tsk_rej_rx_queue(struct sock *sk)
+{
+ struct sk_buff *skb;
+ u32 dnode;
+ u32 own_node = tsk_own_node(tipc_sk(sk));
+
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue))) {
+ if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT))
+ tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0);
+ }
+}
+
+/* tsk_peer_msg - verify if message was sent by connected port's peer
+ *
+ * Handles cases where the node's network address has changed from
+ * the default of <0.0.0> to its configured setting.
+ */
+static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
+{
+ struct tipc_net *tn = net_generic(sock_net(&tsk->sk), tipc_net_id);
+ u32 peer_port = tsk_peer_port(tsk);
+ u32 orig_node;
+ u32 peer_node;
+
+ if (unlikely(!tsk->connected))
+ return false;
+
+ if (unlikely(msg_origport(msg) != peer_port))
+ return false;
+
+ orig_node = msg_orignode(msg);
+ peer_node = tsk_peer_node(tsk);
+
+ if (likely(orig_node == peer_node))
+ return true;
+
+ if (!orig_node && (peer_node == tn->own_addr))
+ return true;
+
+ if (!peer_node && (orig_node == tn->own_addr))
+ return true;
+
+ return false;
+}
+
+/**
+ * tipc_sk_create - create a TIPC socket
+ * @net: network namespace (must be default network)
+ * @sock: pre-allocated socket structure
+ * @protocol: protocol indicator (must be 0)
+ * @kern: caused by kernel or by userspace?
+ *
+ * This routine creates additional data structures used by the TIPC socket,
+ * initializes them, and links them together.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_sk_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct tipc_net *tn;
+ const struct proto_ops *ops;
+ socket_state state;
+ struct sock *sk;
+ struct tipc_sock *tsk;
+ struct tipc_msg *msg;
+
+ /* Validate arguments */
+ if (unlikely(protocol != 0))
+ return -EPROTONOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ ops = &stream_ops;
+ state = SS_UNCONNECTED;
+ break;
+ case SOCK_SEQPACKET:
+ ops = &packet_ops;
+ state = SS_UNCONNECTED;
+ break;
+ case SOCK_DGRAM:
+ case SOCK_RDM:
+ ops = &msg_ops;
+ state = SS_READY;
+ break;
+ default:
+ return -EPROTOTYPE;
+ }
+
+ /* Allocate socket's protocol area */
+ sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto);
+ if (sk == NULL)
+ return -ENOMEM;
+
+ tsk = tipc_sk(sk);
+ tsk->max_pkt = MAX_PKT_DEFAULT;
+ INIT_LIST_HEAD(&tsk->publications);
+ msg = &tsk->phdr;
+ tn = net_generic(sock_net(sk), tipc_net_id);
+ tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
+ NAMED_H_SIZE, 0);
+
+ /* Finish initializing socket data structures */
+ sock->ops = ops;
+ sock->state = state;
+ sock_init_data(sock, sk);
+ if (tipc_sk_insert(tsk)) {
+ pr_warn("Socket create failed; port numbrer exhausted\n");
+ return -EINVAL;
+ }
+ msg_set_origport(msg, tsk->portid);
+ setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk);
+ sk->sk_backlog_rcv = tipc_backlog_rcv;
+ sk->sk_rcvbuf = sysctl_tipc_rmem[1];
+ sk->sk_data_ready = tipc_data_ready;
+ sk->sk_write_space = tipc_write_space;
+ tsk->conn_timeout = CONN_TIMEOUT_DEFAULT;
+ tsk->sent_unacked = 0;
+ atomic_set(&tsk->dupl_rcvcnt, 0);
+
+ if (sock->state == SS_READY) {
+ tsk_set_unreturnable(tsk, true);
+ if (sock->type == SOCK_DGRAM)
+ tsk_set_unreliable(tsk, true);
+ }
+ return 0;
+}
+
+static void tipc_sk_callback(struct rcu_head *head)
+{
+ struct tipc_sock *tsk = container_of(head, struct tipc_sock, rcu);
+
+ sock_put(&tsk->sk);
+}
+
+/**
+ * tipc_release - destroy a TIPC socket
+ * @sock: socket to destroy
+ *
+ * This routine cleans up any messages that are still queued on the socket.
+ * For DGRAM and RDM socket types, all queued messages are rejected.
+ * For SEQPACKET and STREAM socket types, the first message is rejected
+ * and any others are discarded. (If the first message on a STREAM socket
+ * is partially-read, it is discarded and the next one is rejected instead.)
+ *
+ * NOTE: Rejected messages are not necessarily returned to the sender! They
+ * are returned or discarded according to the "destination droppable" setting
+ * specified for the message by the sender.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net;
+ struct tipc_sock *tsk;
+ struct sk_buff *skb;
+ u32 dnode, probing_state;
+
+ /*
+ * Exit if socket isn't fully initialized (occurs when a failed accept()
+ * releases a pre-allocated child socket that was never used)
+ */
+ if (sk == NULL)
+ return 0;
+
+ net = sock_net(sk);
+ tsk = tipc_sk(sk);
+ lock_sock(sk);
+
+ /*
+ * Reject all unreceived messages, except on an active connection
+ * (which disconnects locally & sends a 'FIN+' to peer)
+ */
+ dnode = tsk_peer_node(tsk);
+ while (sock->state != SS_DISCONNECTING) {
+ skb = __skb_dequeue(&sk->sk_receive_queue);
+ if (skb == NULL)
+ break;
+ if (TIPC_SKB_CB(skb)->handle != NULL)
+ kfree_skb(skb);
+ else {
+ if ((sock->state == SS_CONNECTING) ||
+ (sock->state == SS_CONNECTED)) {
+ sock->state = SS_DISCONNECTING;
+ tsk->connected = 0;
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ }
+ if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
+ TIPC_ERR_NO_PORT))
+ tipc_link_xmit_skb(net, skb, dnode, 0);
+ }
+ }
+
+ tipc_sk_withdraw(tsk, 0, NULL);
+ probing_state = tsk->probing_state;
+ if (del_timer_sync(&sk->sk_timer) &&
+ probing_state != TIPC_CONN_PROBING)
+ sock_put(sk);
+ tipc_sk_remove(tsk);
+ if (tsk->connected) {
+ skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
+ TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
+ tsk_own_node(tsk), tsk_peer_port(tsk),
+ tsk->portid, TIPC_ERR_NO_PORT);
+ if (skb)
+ tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ }
+
+ /* Discard any remaining (connection-based) messages in receive queue */
+ __skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Reject any messages that accumulated in backlog queue */
+ sock->state = SS_DISCONNECTING;
+ release_sock(sk);
+
+ call_rcu(&tsk->rcu, tipc_sk_callback);
+ sock->sk = NULL;
+
+ return 0;
+}
+
+/**
+ * tipc_bind - associate or disassocate TIPC name(s) with a socket
+ * @sock: socket structure
+ * @uaddr: socket address describing name(s) and desired operation
+ * @uaddr_len: size of socket address data structure
+ *
+ * Name and name sequence binding is indicated using a positive scope value;
+ * a negative scope value unbinds the specified name. Specifying no name
+ * (i.e. a socket address length of 0) unbinds all names from the socket.
+ *
+ * Returns 0 on success, errno otherwise
+ *
+ * NOTE: This routine doesn't need to take the socket lock since it doesn't
+ * access any non-constant socket information.
+ */
+static int tipc_bind(struct socket *sock, struct sockaddr *uaddr,
+ int uaddr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ int res = -EINVAL;
+
+ lock_sock(sk);
+ if (unlikely(!uaddr_len)) {
+ res = tipc_sk_withdraw(tsk, 0, NULL);
+ goto exit;
+ }
+
+ if (uaddr_len < sizeof(struct sockaddr_tipc)) {
+ res = -EINVAL;
+ goto exit;
+ }
+ if (addr->family != AF_TIPC) {
+ res = -EAFNOSUPPORT;
+ goto exit;
+ }
+
+ if (addr->addrtype == TIPC_ADDR_NAME)
+ addr->addr.nameseq.upper = addr->addr.nameseq.lower;
+ else if (addr->addrtype != TIPC_ADDR_NAMESEQ) {
+ res = -EAFNOSUPPORT;
+ goto exit;
+ }
+
+ if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) &&
+ (addr->addr.nameseq.type != TIPC_TOP_SRV) &&
+ (addr->addr.nameseq.type != TIPC_CFG_SRV)) {
+ res = -EACCES;
+ goto exit;
+ }
+
+ res = (addr->scope > 0) ?
+ tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) :
+ tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq);
+exit:
+ release_sock(sk);
+ return res;
+}
+
+/**
+ * tipc_getname - get port ID of socket or peer socket
+ * @sock: socket structure
+ * @uaddr: area for returned socket address
+ * @uaddr_len: area for returned length of socket address
+ * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID
+ *
+ * Returns 0 on success, errno otherwise
+ *
+ * NOTE: This routine doesn't need to take the socket lock since it only
+ * accesses socket information that is unchanging (or which changes in
+ * a completely predictable manner).
+ */
+static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
+ struct tipc_sock *tsk = tipc_sk(sock->sk);
+ struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id);
+
+ memset(addr, 0, sizeof(*addr));
+ if (peer) {
+ if ((sock->state != SS_CONNECTED) &&
+ ((peer != 2) || (sock->state != SS_DISCONNECTING)))
+ return -ENOTCONN;
+ addr->addr.id.ref = tsk_peer_port(tsk);
+ addr->addr.id.node = tsk_peer_node(tsk);
+ } else {
+ addr->addr.id.ref = tsk->portid;
+ addr->addr.id.node = tn->own_addr;
+ }
+
+ *uaddr_len = sizeof(*addr);
+ addr->addrtype = TIPC_ADDR_ID;
+ addr->family = AF_TIPC;
+ addr->scope = 0;
+ addr->addr.name.domain = 0;
+
+ return 0;
+}
+
+/**
+ * tipc_poll - read and possibly block on pollmask
+ * @file: file structure associated with the socket
+ * @sock: socket for which to calculate the poll bits
+ * @wait: ???
+ *
+ * Returns pollmask value
+ *
+ * COMMENTARY:
+ * It appears that the usual socket locking mechanisms are not useful here
+ * since the pollmask info is potentially out-of-date the moment this routine
+ * exits. TCP and other protocols seem to rely on higher level poll routines
+ * to handle any preventable race conditions, so TIPC will do the same ...
+ *
+ * TIPC sets the returned events as follows:
+ *
+ * socket state flags set
+ * ------------ ---------
+ * unconnected no read flags
+ * POLLOUT if port is not congested
+ *
+ * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
+ * no write flags
+ *
+ * connected POLLIN/POLLRDNORM if data in rx queue
+ * POLLOUT if port is not congested
+ *
+ * disconnecting POLLIN/POLLRDNORM/POLLHUP
+ * no write flags
+ *
+ * listening POLLIN if SYN in rx queue
+ * no write flags
+ *
+ * ready POLLIN/POLLRDNORM if data in rx queue
+ * [connectionless] POLLOUT (since port cannot be congested)
+ *
+ * IMPORTANT: The fact that a read or write operation is indicated does NOT
+ * imply that the operation will succeed, merely that it should be performed
+ * and will not block.
+ */
+static unsigned int tipc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ u32 mask = 0;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+
+ switch ((int)sock->state) {
+ case SS_UNCONNECTED:
+ if (!tsk->link_cong)
+ mask |= POLLOUT;
+ break;
+ case SS_READY:
+ case SS_CONNECTED:
+ if (!tsk->link_cong && !tsk_conn_cong(tsk))
+ mask |= POLLOUT;
+ /* fall thru' */
+ case SS_CONNECTING:
+ case SS_LISTENING:
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ break;
+ case SS_DISCONNECTING:
+ mask = (POLLIN | POLLRDNORM | POLLHUP);
+ break;
+ }
+
+ return mask;
+}
+
+/**
+ * tipc_sendmcast - send multicast message
+ * @sock: socket structure
+ * @seq: destination address
+ * @msg: message to send
+ * @dsz: total length of message data
+ * @timeo: timeout to wait for wakeup
+ *
+ * Called from function tipc_sendmsg(), which has done all sanity checks
+ * Returns the number of bytes sent on success, or errno
+ */
+static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
+ struct msghdr *msg, size_t dsz, long timeo)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct net *net = sock_net(sk);
+ struct tipc_msg *mhdr = &tsk->phdr;
+ struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct iov_iter save = msg->msg_iter;
+ uint mtu;
+ int rc;
+
+ msg_set_type(mhdr, TIPC_MCAST_MSG);
+ msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE);
+ msg_set_destport(mhdr, 0);
+ msg_set_destnode(mhdr, 0);
+ msg_set_nametype(mhdr, seq->type);
+ msg_set_namelower(mhdr, seq->lower);
+ msg_set_nameupper(mhdr, seq->upper);
+ msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
+
+new_mtu:
+ mtu = tipc_bclink_get_mtu();
+ rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
+ if (unlikely(rc < 0))
+ return rc;
+
+ do {
+ rc = tipc_bclink_xmit(net, pktchain);
+ if (likely(rc >= 0)) {
+ rc = dsz;
+ break;
+ }
+ if (rc == -EMSGSIZE) {
+ msg->msg_iter = save;
+ goto new_mtu;
+ }
+ if (rc != -ELINKCONG)
+ break;
+ tipc_sk(sk)->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (rc)
+ __skb_queue_purge(pktchain);
+ } while (!rc);
+ return rc;
+}
+
+/**
+ * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets
+ * @arrvq: queue with arriving messages, to be cloned after destination lookup
+ * @inputq: queue with cloned messages, delivered to socket after dest lookup
+ *
+ * Multi-threaded: parallel calls with reference to same queues may occur
+ */
+void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
+ struct sk_buff_head *inputq)
+{
+ struct tipc_msg *msg;
+ struct tipc_plist dports;
+ u32 portid;
+ u32 scope = TIPC_CLUSTER_SCOPE;
+ struct sk_buff_head tmpq;
+ uint hsz;
+ struct sk_buff *skb, *_skb;
+
+ __skb_queue_head_init(&tmpq);
+ tipc_plist_init(&dports);
+
+ skb = tipc_skb_peek(arrvq, &inputq->lock);
+ for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
+ msg = buf_msg(skb);
+ hsz = skb_headroom(skb) + msg_hdr_sz(msg);
+
+ if (in_own_node(net, msg_orignode(msg)))
+ scope = TIPC_NODE_SCOPE;
+
+ /* Create destination port list and message clones: */
+ tipc_nametbl_mc_translate(net,
+ msg_nametype(msg), msg_namelower(msg),
+ msg_nameupper(msg), scope, &dports);
+ portid = tipc_plist_pop(&dports);
+ for (; portid; portid = tipc_plist_pop(&dports)) {
+ _skb = __pskb_copy(skb, hsz, GFP_ATOMIC);
+ if (_skb) {
+ msg_set_destport(buf_msg(_skb), portid);
+ __skb_queue_tail(&tmpq, _skb);
+ continue;
+ }
+ pr_warn("Failed to clone mcast rcv buffer\n");
+ }
+ /* Append to inputq if not already done by other thread */
+ spin_lock_bh(&inputq->lock);
+ if (skb_peek(arrvq) == skb) {
+ skb_queue_splice_tail_init(&tmpq, inputq);
+ kfree_skb(__skb_dequeue(arrvq));
+ }
+ spin_unlock_bh(&inputq->lock);
+ __skb_queue_purge(&tmpq);
+ kfree_skb(skb);
+ }
+ tipc_sk_rcv(net, inputq);
+}
+
+/**
+ * tipc_sk_proto_rcv - receive a connection mng protocol message
+ * @tsk: receiving socket
+ * @skb: pointer to message buffer. Set to NULL if buffer is consumed.
+ */
+static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb)
+{
+ struct tipc_msg *msg = buf_msg(*skb);
+ int conn_cong;
+ u32 dnode;
+ u32 own_node = tsk_own_node(tsk);
+ /* Ignore if connection cannot be validated: */
+ if (!tsk_peer_msg(tsk, msg))
+ goto exit;
+
+ tsk->probing_state = TIPC_CONN_OK;
+
+ if (msg_type(msg) == CONN_ACK) {
+ conn_cong = tsk_conn_cong(tsk);
+ tsk->sent_unacked -= msg_msgcnt(msg);
+ if (conn_cong)
+ tsk->sk.sk_write_space(&tsk->sk);
+ } else if (msg_type(msg) == CONN_PROBE) {
+ if (tipc_msg_reverse(own_node, *skb, &dnode, TIPC_OK)) {
+ msg_set_type(msg, CONN_PROBE_REPLY);
+ return;
+ }
+ }
+ /* Do nothing if msg_type() == CONN_PROBE_REPLY */
+exit:
+ kfree_skb(*skb);
+ *skb = NULL;
+}
+
+static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ DEFINE_WAIT(wait);
+ int done;
+
+ do {
+ int err = sock_error(sk);
+ if (err)
+ return err;
+ if (sock->state == SS_DISCONNECTING)
+ return -EPIPE;
+ if (!*timeo_p)
+ return -EAGAIN;
+ if (signal_pending(current))
+ return sock_intr_errno(*timeo_p);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ done = sk_wait_event(sk, timeo_p, !tsk->link_cong);
+ finish_wait(sk_sleep(sk), &wait);
+ } while (!done);
+ return 0;
+}
+
+/**
+ * tipc_sendmsg - send message in connectionless manner
+ * @sock: socket structure
+ * @m: message to send
+ * @dsz: amount of user data to be sent
+ *
+ * Message must have an destination specified explicitly.
+ * Used for SOCK_RDM and SOCK_DGRAM messages,
+ * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections.
+ * (Note: 'SYN+' is prohibited on SOCK_STREAM.)
+ *
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+static int tipc_sendmsg(struct socket *sock,
+ struct msghdr *m, size_t dsz)
+{
+ struct sock *sk = sock->sk;
+ int ret;
+
+ lock_sock(sk);
+ ret = __tipc_sendmsg(sock, m, dsz);
+ release_sock(sk);
+
+ return ret;
+}
+
+static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
+{
+ DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct net *net = sock_net(sk);
+ struct tipc_msg *mhdr = &tsk->phdr;
+ u32 dnode, dport;
+ struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff *skb;
+ struct tipc_name_seq *seq;
+ struct iov_iter save;
+ u32 mtu;
+ long timeo;
+ int rc;
+
+ if (dsz > TIPC_MAX_USER_MSG_SIZE)
+ return -EMSGSIZE;
+ if (unlikely(!dest)) {
+ if (tsk->connected && sock->state == SS_READY)
+ dest = &tsk->remote;
+ else
+ return -EDESTADDRREQ;
+ } else if (unlikely(m->msg_namelen < sizeof(*dest)) ||
+ dest->family != AF_TIPC) {
+ return -EINVAL;
+ }
+ if (unlikely(sock->state != SS_READY)) {
+ if (sock->state == SS_LISTENING)
+ return -EPIPE;
+ if (sock->state != SS_UNCONNECTED)
+ return -EISCONN;
+ if (tsk->published)
+ return -EOPNOTSUPP;
+ if (dest->addrtype == TIPC_ADDR_NAME) {
+ tsk->conn_type = dest->addr.name.name.type;
+ tsk->conn_instance = dest->addr.name.name.instance;
+ }
+ }
+ seq = &dest->addr.nameseq;
+ timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+
+ if (dest->addrtype == TIPC_ADDR_MCAST) {
+ return tipc_sendmcast(sock, seq, m, dsz, timeo);
+ } else if (dest->addrtype == TIPC_ADDR_NAME) {
+ u32 type = dest->addr.name.name.type;
+ u32 inst = dest->addr.name.name.instance;
+ u32 domain = dest->addr.name.domain;
+
+ dnode = domain;
+ msg_set_type(mhdr, TIPC_NAMED_MSG);
+ msg_set_hdr_sz(mhdr, NAMED_H_SIZE);
+ msg_set_nametype(mhdr, type);
+ msg_set_nameinst(mhdr, inst);
+ msg_set_lookup_scope(mhdr, tipc_addr_scope(domain));
+ dport = tipc_nametbl_translate(net, type, inst, &dnode);
+ msg_set_destnode(mhdr, dnode);
+ msg_set_destport(mhdr, dport);
+ if (unlikely(!dport && !dnode))
+ return -EHOSTUNREACH;
+ } else if (dest->addrtype == TIPC_ADDR_ID) {
+ dnode = dest->addr.id.node;
+ msg_set_type(mhdr, TIPC_DIRECT_MSG);
+ msg_set_lookup_scope(mhdr, 0);
+ msg_set_destnode(mhdr, dnode);
+ msg_set_destport(mhdr, dest->addr.id.ref);
+ msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
+ }
+
+ save = m->msg_iter;
+new_mtu:
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+ rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain);
+ if (rc < 0)
+ return rc;
+
+ do {
+ skb = skb_peek(pktchain);
+ TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
+ rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid);
+ if (likely(rc >= 0)) {
+ if (sock->state != SS_READY)
+ sock->state = SS_CONNECTING;
+ rc = dsz;
+ break;
+ }
+ if (rc == -EMSGSIZE) {
+ m->msg_iter = save;
+ goto new_mtu;
+ }
+ if (rc != -ELINKCONG)
+ break;
+ tsk->link_cong = 1;
+ rc = tipc_wait_for_sndmsg(sock, &timeo);
+ if (rc)
+ __skb_queue_purge(pktchain);
+ } while (!rc);
+
+ return rc;
+}
+
+static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ DEFINE_WAIT(wait);
+ int done;
+
+ do {
+ int err = sock_error(sk);
+ if (err)
+ return err;
+ if (sock->state == SS_DISCONNECTING)
+ return -EPIPE;
+ else if (sock->state != SS_CONNECTED)
+ return -ENOTCONN;
+ if (!*timeo_p)
+ return -EAGAIN;
+ if (signal_pending(current))
+ return sock_intr_errno(*timeo_p);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ done = sk_wait_event(sk, timeo_p,
+ (!tsk->link_cong &&
+ !tsk_conn_cong(tsk)) ||
+ !tsk->connected);
+ finish_wait(sk_sleep(sk), &wait);
+ } while (!done);
+ return 0;
+}
+
+/**
+ * tipc_send_stream - send stream-oriented data
+ * @sock: socket structure
+ * @m: data to send
+ * @dsz: total length of data to be transmitted
+ *
+ * Used for SOCK_STREAM data.
+ *
+ * Returns the number of bytes sent on success (or partial success),
+ * or errno if no data sent
+ */
+static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+{
+ struct sock *sk = sock->sk;
+ int ret;
+
+ lock_sock(sk);
+ ret = __tipc_send_stream(sock, m, dsz);
+ release_sock(sk);
+
+ return ret;
+}
+
+static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct tipc_msg *mhdr = &tsk->phdr;
+ struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
+ u32 portid = tsk->portid;
+ int rc = -EINVAL;
+ long timeo;
+ u32 dnode;
+ uint mtu, send, sent = 0;
+ struct iov_iter save;
+
+ /* Handle implied connection establishment */
+ if (unlikely(dest)) {
+ rc = __tipc_sendmsg(sock, m, dsz);
+ if (dsz && (dsz == rc))
+ tsk->sent_unacked = 1;
+ return rc;
+ }
+ if (dsz > (uint)INT_MAX)
+ return -EMSGSIZE;
+
+ if (unlikely(sock->state != SS_CONNECTED)) {
+ if (sock->state == SS_DISCONNECTING)
+ return -EPIPE;
+ else
+ return -ENOTCONN;
+ }
+
+ timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+ dnode = tsk_peer_node(tsk);
+
+next:
+ save = m->msg_iter;
+ mtu = tsk->max_pkt;
+ send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
+ rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain);
+ if (unlikely(rc < 0))
+ return rc;
+ do {
+ if (likely(!tsk_conn_cong(tsk))) {
+ rc = tipc_link_xmit(net, pktchain, dnode, portid);
+ if (likely(!rc)) {
+ tsk->sent_unacked++;
+ sent += send;
+ if (sent == dsz)
+ break;
+ goto next;
+ }
+ if (rc == -EMSGSIZE) {
+ tsk->max_pkt = tipc_node_get_mtu(net, dnode,
+ portid);
+ m->msg_iter = save;
+ goto next;
+ }
+ if (rc != -ELINKCONG)
+ break;
+ tsk->link_cong = 1;
+ }
+ rc = tipc_wait_for_sndpkt(sock, &timeo);
+ if (rc)
+ __skb_queue_purge(pktchain);
+ } while (!rc);
+
+ return sent ? sent : rc;
+}
+
+/**
+ * tipc_send_packet - send a connection-oriented message
+ * @sock: socket structure
+ * @m: message to send
+ * @dsz: length of data to be transmitted
+ *
+ * Used for SOCK_SEQPACKET messages.
+ *
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
+{
+ if (dsz > TIPC_MAX_USER_MSG_SIZE)
+ return -EMSGSIZE;
+
+ return tipc_send_stream(sock, m, dsz);
+}
+
+/* tipc_sk_finish_conn - complete the setup of a connection
+ */
+static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
+ u32 peer_node)
+{
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
+ struct tipc_msg *msg = &tsk->phdr;
+
+ msg_set_destnode(msg, peer_node);
+ msg_set_destport(msg, peer_port);
+ msg_set_type(msg, TIPC_CONN_MSG);
+ msg_set_lookup_scope(msg, 0);
+ msg_set_hdr_sz(msg, SHORT_H_SIZE);
+
+ tsk->probing_intv = CONN_PROBING_INTERVAL;
+ tsk->probing_state = TIPC_CONN_OK;
+ tsk->connected = 1;
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
+ tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
+}
+
+/**
+ * set_orig_addr - capture sender's address for received message
+ * @m: descriptor for message info
+ * @msg: received message header
+ *
+ * Note: Address is not captured if not requested by receiver.
+ */
+static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg)
+{
+ DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name);
+
+ if (addr) {
+ addr->family = AF_TIPC;
+ addr->addrtype = TIPC_ADDR_ID;
+ memset(&addr->addr, 0, sizeof(addr->addr));
+ addr->addr.id.ref = msg_origport(msg);
+ addr->addr.id.node = msg_orignode(msg);
+ addr->addr.name.domain = 0; /* could leave uninitialized */
+ addr->scope = 0; /* could leave uninitialized */
+ m->msg_namelen = sizeof(struct sockaddr_tipc);
+ }
+}
+
+/**
+ * tipc_sk_anc_data_recv - optionally capture ancillary data for received message
+ * @m: descriptor for message info
+ * @msg: received message header
+ * @tsk: TIPC port associated with message
+ *
+ * Note: Ancillary data is not captured if not requested by receiver.
+ *
+ * Returns 0 if successful, otherwise errno
+ */
+static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
+ struct tipc_sock *tsk)
+{
+ u32 anc_data[3];
+ u32 err;
+ u32 dest_type;
+ int has_name;
+ int res;
+
+ if (likely(m->msg_controllen == 0))
+ return 0;
+
+ /* Optionally capture errored message object(s) */
+ err = msg ? msg_errcode(msg) : 0;
+ if (unlikely(err)) {
+ anc_data[0] = err;
+ anc_data[1] = msg_data_sz(msg);
+ res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data);
+ if (res)
+ return res;
+ if (anc_data[1]) {
+ res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1],
+ msg_data(msg));
+ if (res)
+ return res;
+ }
+ }
+
+ /* Optionally capture message destination object */
+ dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG;
+ switch (dest_type) {
+ case TIPC_NAMED_MSG:
+ has_name = 1;
+ anc_data[0] = msg_nametype(msg);
+ anc_data[1] = msg_namelower(msg);
+ anc_data[2] = msg_namelower(msg);
+ break;
+ case TIPC_MCAST_MSG:
+ has_name = 1;
+ anc_data[0] = msg_nametype(msg);
+ anc_data[1] = msg_namelower(msg);
+ anc_data[2] = msg_nameupper(msg);
+ break;
+ case TIPC_CONN_MSG:
+ has_name = (tsk->conn_type != 0);
+ anc_data[0] = tsk->conn_type;
+ anc_data[1] = tsk->conn_instance;
+ anc_data[2] = tsk->conn_instance;
+ break;
+ default:
+ has_name = 0;
+ }
+ if (has_name) {
+ res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data);
+ if (res)
+ return res;
+ }
+
+ return 0;
+}
+
+static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack)
+{
+ struct net *net = sock_net(&tsk->sk);
+ struct sk_buff *skb = NULL;
+ struct tipc_msg *msg;
+ u32 peer_port = tsk_peer_port(tsk);
+ u32 dnode = tsk_peer_node(tsk);
+
+ if (!tsk->connected)
+ return;
+ skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,
+ dnode, tsk_own_node(tsk), peer_port,
+ tsk->portid, TIPC_OK);
+ if (!skb)
+ return;
+ msg = buf_msg(skb);
+ msg_set_msgcnt(msg, ack);
+ tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg));
+}
+
+static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
+{
+ struct sock *sk = sock->sk;
+ DEFINE_WAIT(wait);
+ long timeo = *timeop;
+ int err;
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
+ if (sock->state == SS_DISCONNECTING) {
+ err = -ENOTCONN;
+ break;
+ }
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ }
+ err = 0;
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ *timeop = timeo;
+ return err;
+}
+
+/**
+ * tipc_recvmsg - receive packet-oriented message
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ *
+ * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
+ * If the complete message doesn't fit in user area, truncate it.
+ *
+ * Returns size of returned message data, errno otherwise
+ */
+static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct sk_buff *buf;
+ struct tipc_msg *msg;
+ long timeo;
+ unsigned int sz;
+ u32 err;
+ int res;
+
+ /* Catch invalid receive requests */
+ if (unlikely(!buf_len))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (unlikely(sock->state == SS_UNCONNECTED)) {
+ res = -ENOTCONN;
+ goto exit;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+restart:
+
+ /* Look for a message in receive queue; wait if necessary */
+ res = tipc_wait_for_rcvmsg(sock, &timeo);
+ if (res)
+ goto exit;
+
+ /* Look at first message in receive queue */
+ buf = skb_peek(&sk->sk_receive_queue);
+ msg = buf_msg(buf);
+ sz = msg_data_sz(msg);
+ err = msg_errcode(msg);
+
+ /* Discard an empty non-errored message & try again */
+ if ((!sz) && (!err)) {
+ tsk_advance_rx_queue(sk);
+ goto restart;
+ }
+
+ /* Capture sender's address (optional) */
+ set_orig_addr(m, msg);
+
+ /* Capture ancillary data (optional) */
+ res = tipc_sk_anc_data_recv(m, msg, tsk);
+ if (res)
+ goto exit;
+
+ /* Capture message data (if valid) & compute return value (always) */
+ if (!err) {
+ if (unlikely(buf_len < sz)) {
+ sz = buf_len;
+ m->msg_flags |= MSG_TRUNC;
+ }
+ res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz);
+ if (res)
+ goto exit;
+ res = sz;
+ } else {
+ if ((sock->state == SS_READY) ||
+ ((err == TIPC_CONN_SHUTDOWN) || m->msg_control))
+ res = 0;
+ else
+ res = -ECONNRESET;
+ }
+
+ /* Consume received message (optional) */
+ if (likely(!(flags & MSG_PEEK))) {
+ if ((sock->state != SS_READY) &&
+ (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) {
+ tipc_sk_send_ack(tsk, tsk->rcv_unacked);
+ tsk->rcv_unacked = 0;
+ }
+ tsk_advance_rx_queue(sk);
+ }
+exit:
+ release_sock(sk);
+ return res;
+}
+
+/**
+ * tipc_recv_stream - receive stream-oriented data
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ *
+ * Used for SOCK_STREAM messages only. If not enough data is available
+ * will optionally wait for more; never truncates data.
+ *
+ * Returns size of returned message data, errno otherwise
+ */
+static int tipc_recv_stream(struct socket *sock, struct msghdr *m,
+ size_t buf_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct sk_buff *buf;
+ struct tipc_msg *msg;
+ long timeo;
+ unsigned int sz;
+ int sz_to_copy, target, needed;
+ int sz_copied = 0;
+ u32 err;
+ int res = 0;
+
+ /* Catch invalid receive attempts */
+ if (unlikely(!buf_len))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (unlikely(sock->state == SS_UNCONNECTED)) {
+ res = -ENOTCONN;
+ goto exit;
+ }
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+restart:
+ /* Look for a message in receive queue; wait if necessary */
+ res = tipc_wait_for_rcvmsg(sock, &timeo);
+ if (res)
+ goto exit;
+
+ /* Look at first message in receive queue */
+ buf = skb_peek(&sk->sk_receive_queue);
+ msg = buf_msg(buf);
+ sz = msg_data_sz(msg);
+ err = msg_errcode(msg);
+
+ /* Discard an empty non-errored message & try again */
+ if ((!sz) && (!err)) {
+ tsk_advance_rx_queue(sk);
+ goto restart;
+ }
+
+ /* Optionally capture sender's address & ancillary data of first msg */
+ if (sz_copied == 0) {
+ set_orig_addr(m, msg);
+ res = tipc_sk_anc_data_recv(m, msg, tsk);
+ if (res)
+ goto exit;
+ }
+
+ /* Capture message data (if valid) & compute return value (always) */
+ if (!err) {
+ u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
+
+ sz -= offset;
+ needed = (buf_len - sz_copied);
+ sz_to_copy = (sz <= needed) ? sz : needed;
+
+ res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset,
+ m, sz_to_copy);
+ if (res)
+ goto exit;
+
+ sz_copied += sz_to_copy;
+
+ if (sz_to_copy < sz) {
+ if (!(flags & MSG_PEEK))
+ TIPC_SKB_CB(buf)->handle =
+ (void *)(unsigned long)(offset + sz_to_copy);
+ goto exit;
+ }
+ } else {
+ if (sz_copied != 0)
+ goto exit; /* can't add error msg to valid data */
+
+ if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)
+ res = 0;
+ else
+ res = -ECONNRESET;
+ }
+
+ /* Consume received message (optional) */
+ if (likely(!(flags & MSG_PEEK))) {
+ if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) {
+ tipc_sk_send_ack(tsk, tsk->rcv_unacked);
+ tsk->rcv_unacked = 0;
+ }
+ tsk_advance_rx_queue(sk);
+ }
+
+ /* Loop around if more data is required */
+ if ((sz_copied < buf_len) && /* didn't get all requested data */
+ (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sz_copied < target)) && /* and more is ready or required */
+ (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */
+ (!err)) /* and haven't reached a FIN */
+ goto restart;
+
+exit:
+ release_sock(sk);
+ return sz_copied ? sz_copied : res;
+}
+
+/**
+ * tipc_write_space - wake up thread if port congestion is released
+ * @sk: socket
+ */
+static void tipc_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
+ rcu_read_unlock();
+}
+
+/**
+ * tipc_data_ready - wake up threads to indicate messages have been received
+ * @sk: socket
+ * @len: the length of messages
+ */
+static void tipc_data_ready(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+ POLLRDNORM | POLLRDBAND);
+ rcu_read_unlock();
+}
+
+/**
+ * filter_connect - Handle all incoming messages for a connection-based socket
+ * @tsk: TIPC socket
+ * @skb: pointer to message buffer. Set to NULL if buffer is consumed
+ *
+ * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise
+ */
+static int filter_connect(struct tipc_sock *tsk, struct sk_buff **skb)
+{
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
+ struct socket *sock = sk->sk_socket;
+ struct tipc_msg *msg = buf_msg(*skb);
+ int retval = -TIPC_ERR_NO_PORT;
+
+ if (msg_mcast(msg))
+ return retval;
+
+ switch ((int)sock->state) {
+ case SS_CONNECTED:
+ /* Accept only connection-based messages sent by peer */
+ if (tsk_peer_msg(tsk, msg)) {
+ if (unlikely(msg_errcode(msg))) {
+ sock->state = SS_DISCONNECTING;
+ tsk->connected = 0;
+ /* let timer expire on it's own */
+ tipc_node_remove_conn(net, tsk_peer_node(tsk),
+ tsk->portid);
+ }
+ retval = TIPC_OK;
+ }
+ break;
+ case SS_CONNECTING:
+ /* Accept only ACK or NACK message */
+
+ if (unlikely(!msg_connected(msg)))
+ break;
+
+ if (unlikely(msg_errcode(msg))) {
+ sock->state = SS_DISCONNECTING;
+ sk->sk_err = ECONNREFUSED;
+ retval = TIPC_OK;
+ break;
+ }
+
+ if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) {
+ sock->state = SS_DISCONNECTING;
+ sk->sk_err = EINVAL;
+ retval = TIPC_OK;
+ break;
+ }
+
+ tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg));
+ msg_set_importance(&tsk->phdr, msg_importance(msg));
+ sock->state = SS_CONNECTED;
+
+ /* If an incoming message is an 'ACK-', it should be
+ * discarded here because it doesn't contain useful
+ * data. In addition, we should try to wake up
+ * connect() routine if sleeping.
+ */
+ if (msg_data_sz(msg) == 0) {
+ kfree_skb(*skb);
+ *skb = NULL;
+ if (waitqueue_active(sk_sleep(sk)))
+ wake_up_interruptible(sk_sleep(sk));
+ }
+ retval = TIPC_OK;
+ break;
+ case SS_LISTENING:
+ case SS_UNCONNECTED:
+ /* Accept only SYN message */
+ if (!msg_connected(msg) && !(msg_errcode(msg)))
+ retval = TIPC_OK;
+ break;
+ case SS_DISCONNECTING:
+ break;
+ default:
+ pr_err("Unknown socket state %u\n", sock->state);
+ }
+ return retval;
+}
+
+/**
+ * rcvbuf_limit - get proper overload limit of socket receive queue
+ * @sk: socket
+ * @buf: message
+ *
+ * For all connection oriented messages, irrespective of importance,
+ * the default overload value (i.e. 67MB) is set as limit.
+ *
+ * For all connectionless messages, by default new queue limits are
+ * as belows:
+ *
+ * TIPC_LOW_IMPORTANCE (4 MB)
+ * TIPC_MEDIUM_IMPORTANCE (8 MB)
+ * TIPC_HIGH_IMPORTANCE (16 MB)
+ * TIPC_CRITICAL_IMPORTANCE (32 MB)
+ *
+ * Returns overload limit according to corresponding message importance
+ */
+static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
+{
+ struct tipc_msg *msg = buf_msg(buf);
+
+ if (msg_connected(msg))
+ return sysctl_tipc_rmem[2];
+
+ return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE <<
+ msg_importance(msg);
+}
+
+/**
+ * filter_rcv - validate incoming message
+ * @sk: socket
+ * @skb: pointer to message. Set to NULL if buffer is consumed.
+ *
+ * Enqueues message on receive queue if acceptable; optionally handles
+ * disconnect indication for a connected socket.
+ *
+ * Called with socket lock already taken
+ *
+ * Returns 0 (TIPC_OK) if message was ok, -TIPC error code if rejected
+ */
+static int filter_rcv(struct sock *sk, struct sk_buff **skb)
+{
+ struct socket *sock = sk->sk_socket;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct tipc_msg *msg = buf_msg(*skb);
+ unsigned int limit = rcvbuf_limit(sk, *skb);
+ int rc = TIPC_OK;
+
+ if (unlikely(msg_user(msg) == CONN_MANAGER)) {
+ tipc_sk_proto_rcv(tsk, skb);
+ return TIPC_OK;
+ }
+
+ if (unlikely(msg_user(msg) == SOCK_WAKEUP)) {
+ kfree_skb(*skb);
+ tsk->link_cong = 0;
+ sk->sk_write_space(sk);
+ *skb = NULL;
+ return TIPC_OK;
+ }
+
+ /* Reject message if it is wrong sort of message for socket */
+ if (msg_type(msg) > TIPC_DIRECT_MSG)
+ return -TIPC_ERR_NO_PORT;
+
+ if (sock->state == SS_READY) {
+ if (msg_connected(msg))
+ return -TIPC_ERR_NO_PORT;
+ } else {
+ rc = filter_connect(tsk, skb);
+ if (rc != TIPC_OK || !*skb)
+ return rc;
+ }
+
+ /* Reject message if there isn't room to queue it */
+ if (sk_rmem_alloc_get(sk) + (*skb)->truesize >= limit)
+ return -TIPC_ERR_OVERLOAD;
+
+ /* Enqueue message */
+ TIPC_SKB_CB(*skb)->handle = NULL;
+ __skb_queue_tail(&sk->sk_receive_queue, *skb);
+ skb_set_owner_r(*skb, sk);
+
+ sk->sk_data_ready(sk);
+ *skb = NULL;
+ return TIPC_OK;
+}
+
+/**
+ * tipc_backlog_rcv - handle incoming message from backlog queue
+ * @sk: socket
+ * @skb: message
+ *
+ * Caller must hold socket lock
+ *
+ * Returns 0
+ */
+static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ atomic_t *dcnt;
+ u32 dnode;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct net *net = sock_net(sk);
+ uint truesize = skb->truesize;
+
+ err = filter_rcv(sk, &skb);
+ if (likely(!skb)) {
+ dcnt = &tsk->dupl_rcvcnt;
+ if (atomic_read(dcnt) < TIPC_CONN_OVERLOAD_LIMIT)
+ atomic_add(truesize, dcnt);
+ return 0;
+ }
+ if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err))
+ tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ return 0;
+}
+
+/**
+ * tipc_sk_enqueue - extract all buffers with destination 'dport' from
+ * inputq and try adding them to socket or backlog queue
+ * @inputq: list of incoming buffers with potentially different destinations
+ * @sk: socket where the buffers should be enqueued
+ * @dport: port number for the socket
+ * @_skb: returned buffer to be forwarded or rejected, if applicable
+ *
+ * Caller must hold socket lock
+ *
+ * Returns TIPC_OK if all buffers enqueued, otherwise -TIPC_ERR_OVERLOAD
+ * or -TIPC_ERR_NO_PORT
+ */
+static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
+ u32 dport, struct sk_buff **_skb)
+{
+ unsigned int lim;
+ atomic_t *dcnt;
+ int err;
+ struct sk_buff *skb;
+ unsigned long time_limit = jiffies + 2;
+
+ while (skb_queue_len(inputq)) {
+ if (unlikely(time_after_eq(jiffies, time_limit)))
+ return TIPC_OK;
+ skb = tipc_skb_dequeue(inputq, dport);
+ if (unlikely(!skb))
+ return TIPC_OK;
+ if (!sock_owned_by_user(sk)) {
+ err = filter_rcv(sk, &skb);
+ if (likely(!skb))
+ continue;
+ *_skb = skb;
+ return err;
+ }
+ dcnt = &tipc_sk(sk)->dupl_rcvcnt;
+ if (sk->sk_backlog.len)
+ atomic_set(dcnt, 0);
+ lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
+ if (likely(!sk_add_backlog(sk, skb, lim)))
+ continue;
+ *_skb = skb;
+ return -TIPC_ERR_OVERLOAD;
+ }
+ return TIPC_OK;
+}
+
+/**
+ * tipc_sk_rcv - handle a chain of incoming buffers
+ * @inputq: buffer list containing the buffers
+ * Consumes all buffers in list until inputq is empty
+ * Note: may be called in multiple threads referring to the same queue
+ * Returns 0 if last buffer was accepted, otherwise -EHOSTUNREACH
+ * Only node local calls check the return value, sending single-buffer queues
+ */
+int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
+{
+ u32 dnode, dport = 0;
+ int err;
+ struct sk_buff *skb;
+ struct tipc_sock *tsk;
+ struct tipc_net *tn;
+ struct sock *sk;
+
+ while (skb_queue_len(inputq)) {
+ err = -TIPC_ERR_NO_PORT;
+ skb = NULL;
+ dport = tipc_skb_peek_port(inputq, dport);
+ tsk = tipc_sk_lookup(net, dport);
+ if (likely(tsk)) {
+ sk = &tsk->sk;
+ if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
+ err = tipc_sk_enqueue(inputq, sk, dport, &skb);
+ spin_unlock_bh(&sk->sk_lock.slock);
+ dport = 0;
+ }
+ sock_put(sk);
+ } else {
+ skb = tipc_skb_dequeue(inputq, dport);
+ }
+ if (likely(!skb))
+ continue;
+ if (tipc_msg_lookup_dest(net, skb, &dnode, &err))
+ goto xmit;
+ if (!err) {
+ dnode = msg_destnode(buf_msg(skb));
+ goto xmit;
+ }
+ tn = net_generic(net, tipc_net_id);
+ if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err))
+ continue;
+xmit:
+ tipc_link_xmit_skb(net, skb, dnode, dport);
+ }
+ return err ? -EHOSTUNREACH : 0;
+}
+
+static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
+{
+ struct sock *sk = sock->sk;
+ DEFINE_WAIT(wait);
+ int done;
+
+ do {
+ int err = sock_error(sk);
+ if (err)
+ return err;
+ if (!*timeo_p)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return sock_intr_errno(*timeo_p);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ done = sk_wait_event(sk, timeo_p, sock->state != SS_CONNECTING);
+ finish_wait(sk_sleep(sk), &wait);
+ } while (!done);
+ return 0;
+}
+
+/**
+ * tipc_connect - establish a connection to another TIPC port
+ * @sock: socket structure
+ * @dest: socket address for destination port
+ * @destlen: size of socket address data structure
+ * @flags: file-related flags associated with socket
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_connect(struct socket *sock, struct sockaddr *dest,
+ int destlen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
+ struct msghdr m = {NULL,};
+ long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout;
+ socket_state previous;
+ int res = 0;
+
+ lock_sock(sk);
+
+ /* DGRAM/RDM connect(), just save the destaddr */
+ if (sock->state == SS_READY) {
+ if (dst->family == AF_UNSPEC) {
+ memset(&tsk->remote, 0, sizeof(struct sockaddr_tipc));
+ tsk->connected = 0;
+ } else if (destlen != sizeof(struct sockaddr_tipc)) {
+ res = -EINVAL;
+ } else {
+ memcpy(&tsk->remote, dest, destlen);
+ tsk->connected = 1;
+ }
+ goto exit;
+ }
+
+ /*
+ * Reject connection attempt using multicast address
+ *
+ * Note: send_msg() validates the rest of the address fields,
+ * so there's no need to do it here
+ */
+ if (dst->addrtype == TIPC_ADDR_MCAST) {
+ res = -EINVAL;
+ goto exit;
+ }
+
+ previous = sock->state;
+ switch (sock->state) {
+ case SS_UNCONNECTED:
+ /* Send a 'SYN-' to destination */
+ m.msg_name = dest;
+ m.msg_namelen = destlen;
+
+ /* If connect is in non-blocking case, set MSG_DONTWAIT to
+ * indicate send_msg() is never blocked.
+ */
+ if (!timeout)
+ m.msg_flags = MSG_DONTWAIT;
+
+ res = __tipc_sendmsg(sock, &m, 0);
+ if ((res < 0) && (res != -EWOULDBLOCK))
+ goto exit;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ res = -EINPROGRESS;
+ case SS_CONNECTING:
+ if (previous == SS_CONNECTING)
+ res = -EALREADY;
+ if (!timeout)
+ goto exit;
+ timeout = msecs_to_jiffies(timeout);
+ /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
+ res = tipc_wait_for_connect(sock, &timeout);
+ break;
+ case SS_CONNECTED:
+ res = -EISCONN;
+ break;
+ default:
+ res = -EINVAL;
+ break;
+ }
+exit:
+ release_sock(sk);
+ return res;
+}
+
+/**
+ * tipc_listen - allow socket to listen for incoming connections
+ * @sock: socket structure
+ * @len: (unused)
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_listen(struct socket *sock, int len)
+{
+ struct sock *sk = sock->sk;
+ int res;
+
+ lock_sock(sk);
+
+ if (sock->state != SS_UNCONNECTED)
+ res = -EINVAL;
+ else {
+ sock->state = SS_LISTENING;
+ res = 0;
+ }
+
+ release_sock(sk);
+ return res;
+}
+
+static int tipc_wait_for_accept(struct socket *sock, long timeo)
+{
+ struct sock *sk = sock->sk;
+ DEFINE_WAIT(wait);
+ int err;
+
+ /* True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ }
+ err = 0;
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ err = -EINVAL;
+ if (sock->state != SS_LISTENING)
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return err;
+}
+
+/**
+ * tipc_accept - wait for connection request
+ * @sock: listening socket
+ * @newsock: new socket that is to be connected
+ * @flags: file-related flags associated with socket
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags)
+{
+ struct sock *new_sk, *sk = sock->sk;
+ struct sk_buff *buf;
+ struct tipc_sock *new_tsock;
+ struct tipc_msg *msg;
+ long timeo;
+ int res;
+
+ lock_sock(sk);
+
+ if (sock->state != SS_LISTENING) {
+ res = -EINVAL;
+ goto exit;
+ }
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ res = tipc_wait_for_accept(sock, timeo);
+ if (res)
+ goto exit;
+
+ buf = skb_peek(&sk->sk_receive_queue);
+
+ res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, 1);
+ if (res)
+ goto exit;
+
+ new_sk = new_sock->sk;
+ new_tsock = tipc_sk(new_sk);
+ msg = buf_msg(buf);
+
+ /* we lock on new_sk; but lockdep sees the lock on sk */
+ lock_sock_nested(new_sk, SINGLE_DEPTH_NESTING);
+
+ /*
+ * Reject any stray messages received by new socket
+ * before the socket lock was taken (very, very unlikely)
+ */
+ tsk_rej_rx_queue(new_sk);
+
+ /* Connect new socket to it's peer */
+ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
+ new_sock->state = SS_CONNECTED;
+
+ tsk_set_importance(new_tsock, msg_importance(msg));
+ if (msg_named(msg)) {
+ new_tsock->conn_type = msg_nametype(msg);
+ new_tsock->conn_instance = msg_nameinst(msg);
+ }
+
+ /*
+ * Respond to 'SYN-' by discarding it & returning 'ACK'-.
+ * Respond to 'SYN+' by queuing it on new socket.
+ */
+ if (!msg_data_sz(msg)) {
+ struct msghdr m = {NULL,};
+
+ tsk_advance_rx_queue(sk);
+ __tipc_send_stream(new_sock, &m, 0);
+ } else {
+ __skb_dequeue(&sk->sk_receive_queue);
+ __skb_queue_head(&new_sk->sk_receive_queue, buf);
+ skb_set_owner_r(buf, new_sk);
+ }
+ release_sock(new_sk);
+exit:
+ release_sock(sk);
+ return res;
+}
+
+/**
+ * tipc_shutdown - shutdown socket connection
+ * @sock: socket structure
+ * @how: direction to close (must be SHUT_RDWR)
+ *
+ * Terminates connection (if necessary), then purges socket's receive queue.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct tipc_sock *tsk = tipc_sk(sk);
+ struct sk_buff *skb;
+ u32 dnode;
+ int res;
+
+ if (how != SHUT_RDWR)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ switch (sock->state) {
+ case SS_CONNECTING:
+ case SS_CONNECTED:
+
+restart:
+ /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
+ skb = __skb_dequeue(&sk->sk_receive_queue);
+ if (skb) {
+ if (TIPC_SKB_CB(skb)->handle != NULL) {
+ kfree_skb(skb);
+ goto restart;
+ }
+ if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode,
+ TIPC_CONN_SHUTDOWN))
+ tipc_link_xmit_skb(net, skb, dnode,
+ tsk->portid);
+ } else {
+ dnode = tsk_peer_node(tsk);
+
+ skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
+ TIPC_CONN_MSG, SHORT_H_SIZE,
+ 0, dnode, tsk_own_node(tsk),
+ tsk_peer_port(tsk),
+ tsk->portid, TIPC_CONN_SHUTDOWN);
+ tipc_link_xmit_skb(net, skb, dnode, tsk->portid);
+ }
+ tsk->connected = 0;
+ sock->state = SS_DISCONNECTING;
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ /* fall through */
+
+ case SS_DISCONNECTING:
+
+ /* Discard any unreceived messages */
+ __skb_queue_purge(&sk->sk_receive_queue);
+
+ /* Wake up anyone sleeping in poll */
+ sk->sk_state_change(sk);
+ res = 0;
+ break;
+
+ default:
+ res = -ENOTCONN;
+ }
+
+ release_sock(sk);
+ return res;
+}
+
+static void tipc_sk_timeout(unsigned long data)
+{
+ struct tipc_sock *tsk = (struct tipc_sock *)data;
+ struct sock *sk = &tsk->sk;
+ struct sk_buff *skb = NULL;
+ u32 peer_port, peer_node;
+ u32 own_node = tsk_own_node(tsk);
+
+ bh_lock_sock(sk);
+ if (!tsk->connected) {
+ bh_unlock_sock(sk);
+ goto exit;
+ }
+ peer_port = tsk_peer_port(tsk);
+ peer_node = tsk_peer_node(tsk);
+
+ if (tsk->probing_state == TIPC_CONN_PROBING) {
+ if (!sock_owned_by_user(sk)) {
+ sk->sk_socket->state = SS_DISCONNECTING;
+ tsk->connected = 0;
+ tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
+ tsk_peer_port(tsk));
+ sk->sk_state_change(sk);
+ } else {
+ /* Try again later */
+ sk_reset_timer(sk, &sk->sk_timer, (HZ / 20));
+ }
+
+ } else {
+ skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE,
+ INT_H_SIZE, 0, peer_node, own_node,
+ peer_port, tsk->portid, TIPC_OK);
+ tsk->probing_state = TIPC_CONN_PROBING;
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv);
+ }
+ bh_unlock_sock(sk);
+ if (skb)
+ tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid);
+exit:
+ sock_put(sk);
+}
+
+static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
+ struct tipc_name_seq const *seq)
+{
+ struct net *net = sock_net(&tsk->sk);
+ struct publication *publ;
+ u32 key;
+
+ if (tsk->connected)
+ return -EINVAL;
+ key = tsk->portid + tsk->pub_count + 1;
+ if (key == tsk->portid)
+ return -EADDRINUSE;
+
+ publ = tipc_nametbl_publish(net, seq->type, seq->lower, seq->upper,
+ scope, tsk->portid, key);
+ if (unlikely(!publ))
+ return -EINVAL;
+
+ list_add(&publ->pport_list, &tsk->publications);
+ tsk->pub_count++;
+ tsk->published = 1;
+ return 0;
+}
+
+static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
+ struct tipc_name_seq const *seq)
+{
+ struct net *net = sock_net(&tsk->sk);
+ struct publication *publ;
+ struct publication *safe;
+ int rc = -EINVAL;
+
+ list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) {
+ if (seq) {
+ if (publ->scope != scope)
+ continue;
+ if (publ->type != seq->type)
+ continue;
+ if (publ->lower != seq->lower)
+ continue;
+ if (publ->upper != seq->upper)
+ break;
+ tipc_nametbl_withdraw(net, publ->type, publ->lower,
+ publ->ref, publ->key);
+ rc = 0;
+ break;
+ }
+ tipc_nametbl_withdraw(net, publ->type, publ->lower,
+ publ->ref, publ->key);
+ rc = 0;
+ }
+ if (list_empty(&tsk->publications))
+ tsk->published = 0;
+ return rc;
+}
+
+/* tipc_sk_reinit: set non-zero address in all existing sockets
+ * when we go from standalone to network mode.
+ */
+void tipc_sk_reinit(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ const struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct tipc_sock *tsk;
+ struct tipc_msg *msg;
+ int i;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
+ for (i = 0; i < tbl->size; i++) {
+ rht_for_each_entry_rcu(tsk, pos, tbl, i, node) {
+ spin_lock_bh(&tsk->sk.sk_lock.slock);
+ msg = &tsk->phdr;
+ msg_set_prevnode(msg, tn->own_addr);
+ msg_set_orignode(msg, tn->own_addr);
+ spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_sock *tsk;
+
+ rcu_read_lock();
+ tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params);
+ if (tsk)
+ sock_hold(&tsk->sk);
+ rcu_read_unlock();
+
+ return tsk;
+}
+
+static int tipc_sk_insert(struct tipc_sock *tsk)
+{
+ struct sock *sk = &tsk->sk;
+ struct net *net = sock_net(sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1;
+ u32 portid = prandom_u32() % remaining + TIPC_MIN_PORT;
+
+ while (remaining--) {
+ portid++;
+ if ((portid < TIPC_MIN_PORT) || (portid > TIPC_MAX_PORT))
+ portid = TIPC_MIN_PORT;
+ tsk->portid = portid;
+ sock_hold(&tsk->sk);
+ if (!rhashtable_lookup_insert_fast(&tn->sk_rht, &tsk->node,
+ tsk_rht_params))
+ return 0;
+ sock_put(&tsk->sk);
+ }
+
+ return -1;
+}
+
+static void tipc_sk_remove(struct tipc_sock *tsk)
+{
+ struct sock *sk = &tsk->sk;
+ struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
+
+ if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) {
+ WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+ __sock_put(sk);
+ }
+}
+
+static const struct rhashtable_params tsk_rht_params = {
+ .nelem_hint = 192,
+ .head_offset = offsetof(struct tipc_sock, node),
+ .key_offset = offsetof(struct tipc_sock, portid),
+ .key_len = sizeof(u32), /* portid */
+ .max_size = 1048576,
+ .min_size = 256,
+ .automatic_shrinking = true,
+};
+
+int tipc_sk_rht_init(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ return rhashtable_init(&tn->sk_rht, &tsk_rht_params);
+}
+
+void tipc_sk_rht_destroy(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ /* Wait for socket readers to complete */
+ synchronize_net();
+
+ rhashtable_destroy(&tn->sk_rht);
+}
+
+/**
+ * tipc_setsockopt - set socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: pointer to new option value
+ * @ol: length of option value
+ *
+ * For stream sockets only, accepts and ignores all IPPROTO_TCP options
+ * (to ease compatibility).
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
+ char __user *ov, unsigned int ol)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ u32 value;
+ int res;
+
+ if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
+ return 0;
+ if (lvl != SOL_TIPC)
+ return -ENOPROTOOPT;
+ if (ol < sizeof(value))
+ return -EINVAL;
+ res = get_user(value, (u32 __user *)ov);
+ if (res)
+ return res;
+
+ lock_sock(sk);
+
+ switch (opt) {
+ case TIPC_IMPORTANCE:
+ res = tsk_set_importance(tsk, value);
+ break;
+ case TIPC_SRC_DROPPABLE:
+ if (sock->type != SOCK_STREAM)
+ tsk_set_unreliable(tsk, value);
+ else
+ res = -ENOPROTOOPT;
+ break;
+ case TIPC_DEST_DROPPABLE:
+ tsk_set_unreturnable(tsk, value);
+ break;
+ case TIPC_CONN_TIMEOUT:
+ tipc_sk(sk)->conn_timeout = value;
+ /* no need to set "res", since already 0 at this point */
+ break;
+ default:
+ res = -EINVAL;
+ }
+
+ release_sock(sk);
+
+ return res;
+}
+
+/**
+ * tipc_getsockopt - get socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: receptacle for option value
+ * @ol: receptacle for length of option value
+ *
+ * For stream sockets only, returns 0 length result for all IPPROTO_TCP options
+ * (to ease compatibility).
+ *
+ * Returns 0 on success, errno otherwise
+ */
+static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
+ char __user *ov, int __user *ol)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sock *tsk = tipc_sk(sk);
+ int len;
+ u32 value;
+ int res;
+
+ if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
+ return put_user(0, ol);
+ if (lvl != SOL_TIPC)
+ return -ENOPROTOOPT;
+ res = get_user(len, ol);
+ if (res)
+ return res;
+
+ lock_sock(sk);
+
+ switch (opt) {
+ case TIPC_IMPORTANCE:
+ value = tsk_importance(tsk);
+ break;
+ case TIPC_SRC_DROPPABLE:
+ value = tsk_unreliable(tsk);
+ break;
+ case TIPC_DEST_DROPPABLE:
+ value = tsk_unreturnable(tsk);
+ break;
+ case TIPC_CONN_TIMEOUT:
+ value = tsk->conn_timeout;
+ /* no need to set "res", since already 0 at this point */
+ break;
+ case TIPC_NODE_RECVQ_DEPTH:
+ value = 0; /* was tipc_queue_size, now obsolete */
+ break;
+ case TIPC_SOCK_RECVQ_DEPTH:
+ value = skb_queue_len(&sk->sk_receive_queue);
+ break;
+ default:
+ res = -EINVAL;
+ }
+
+ release_sock(sk);
+
+ if (res)
+ return res; /* "get" failed */
+
+ if (len < sizeof(value))
+ return -EINVAL;
+
+ if (copy_to_user(ov, &value, sizeof(value)))
+ return -EFAULT;
+
+ return put_user(sizeof(value), ol);
+}
+
+static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct tipc_sioc_ln_req lnr;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case SIOCGETLINKNAME:
+ if (copy_from_user(&lnr, argp, sizeof(lnr)))
+ return -EFAULT;
+ if (!tipc_node_get_linkname(sock_net(sk),
+ lnr.bearer_id & 0xffff, lnr.peer,
+ lnr.linkname, TIPC_MAX_LINK_NAME)) {
+ if (copy_to_user(argp, &lnr, sizeof(lnr)))
+ return -EFAULT;
+ return 0;
+ }
+ return -EADDRNOTAVAIL;
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+/* Protocol switches for the various types of TIPC sockets */
+
+static const struct proto_ops msg_ops = {
+ .owner = THIS_MODULE,
+ .family = AF_TIPC,
+ .release = tipc_release,
+ .bind = tipc_bind,
+ .connect = tipc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = tipc_getname,
+ .poll = tipc_poll,
+ .ioctl = tipc_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = tipc_shutdown,
+ .setsockopt = tipc_setsockopt,
+ .getsockopt = tipc_getsockopt,
+ .sendmsg = tipc_sendmsg,
+ .recvmsg = tipc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage
+};
+
+static const struct proto_ops packet_ops = {
+ .owner = THIS_MODULE,
+ .family = AF_TIPC,
+ .release = tipc_release,
+ .bind = tipc_bind,
+ .connect = tipc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = tipc_accept,
+ .getname = tipc_getname,
+ .poll = tipc_poll,
+ .ioctl = tipc_ioctl,
+ .listen = tipc_listen,
+ .shutdown = tipc_shutdown,
+ .setsockopt = tipc_setsockopt,
+ .getsockopt = tipc_getsockopt,
+ .sendmsg = tipc_send_packet,
+ .recvmsg = tipc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage
+};
+
+static const struct proto_ops stream_ops = {
+ .owner = THIS_MODULE,
+ .family = AF_TIPC,
+ .release = tipc_release,
+ .bind = tipc_bind,
+ .connect = tipc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = tipc_accept,
+ .getname = tipc_getname,
+ .poll = tipc_poll,
+ .ioctl = tipc_ioctl,
+ .listen = tipc_listen,
+ .shutdown = tipc_shutdown,
+ .setsockopt = tipc_setsockopt,
+ .getsockopt = tipc_getsockopt,
+ .sendmsg = tipc_send_stream,
+ .recvmsg = tipc_recv_stream,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage
+};
+
+static const struct net_proto_family tipc_family_ops = {
+ .owner = THIS_MODULE,
+ .family = AF_TIPC,
+ .create = tipc_sk_create
+};
+
+static struct proto tipc_proto = {
+ .name = "TIPC",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct tipc_sock),
+ .sysctl_rmem = sysctl_tipc_rmem
+};
+
+/**
+ * tipc_socket_init - initialize TIPC socket interface
+ *
+ * Returns 0 on success, errno otherwise
+ */
+int tipc_socket_init(void)
+{
+ int res;
+
+ res = proto_register(&tipc_proto, 1);
+ if (res) {
+ pr_err("Failed to register TIPC protocol type\n");
+ goto out;
+ }
+
+ res = sock_register(&tipc_family_ops);
+ if (res) {
+ pr_err("Failed to register TIPC socket type\n");
+ proto_unregister(&tipc_proto);
+ goto out;
+ }
+ out:
+ return res;
+}
+
+/**
+ * tipc_socket_stop - stop TIPC socket interface
+ */
+void tipc_socket_stop(void)
+{
+ sock_unregister(tipc_family_ops.family);
+ proto_unregister(&tipc_proto);
+}
+
+/* Caller should hold socket lock for the passed tipc socket. */
+static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk)
+{
+ u32 peer_node;
+ u32 peer_port;
+ struct nlattr *nest;
+
+ peer_node = tsk_peer_node(tsk);
+ peer_port = tsk_peer_port(tsk);
+
+ nest = nla_nest_start(skb, TIPC_NLA_SOCK_CON);
+
+ if (nla_put_u32(skb, TIPC_NLA_CON_NODE, peer_node))
+ goto msg_full;
+ if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port))
+ goto msg_full;
+
+ if (tsk->conn_type != 0) {
+ if (nla_put_flag(skb, TIPC_NLA_CON_FLAG))
+ goto msg_full;
+ if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type))
+ goto msg_full;
+ if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance))
+ goto msg_full;
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+msg_full:
+ nla_nest_cancel(skb, nest);
+
+ return -EMSGSIZE;
+}
+
+/* Caller should hold socket lock for the passed tipc socket. */
+static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
+ struct tipc_sock *tsk)
+{
+ int err;
+ void *hdr;
+ struct nlattr *attrs;
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET);
+ if (!hdr)
+ goto msg_cancel;
+
+ attrs = nla_nest_start(skb, TIPC_NLA_SOCK);
+ if (!attrs)
+ goto genlmsg_cancel;
+ if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid))
+ goto attr_msg_cancel;
+ if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
+ goto attr_msg_cancel;
+
+ if (tsk->connected) {
+ err = __tipc_nl_add_sk_con(skb, tsk);
+ if (err)
+ goto attr_msg_cancel;
+ } else if (!list_empty(&tsk->publications)) {
+ if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL))
+ goto attr_msg_cancel;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, hdr);
+
+ return 0;
+
+attr_msg_cancel:
+ nla_nest_cancel(skb, attrs);
+genlmsg_cancel:
+ genlmsg_cancel(skb, hdr);
+msg_cancel:
+ return -EMSGSIZE;
+}
+
+int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ struct tipc_sock *tsk;
+ const struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ u32 tbl_id = cb->args[0];
+ u32 prev_portid = cb->args[1];
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
+ for (; tbl_id < tbl->size; tbl_id++) {
+ rht_for_each_entry_rcu(tsk, pos, tbl, tbl_id, node) {
+ spin_lock_bh(&tsk->sk.sk_lock.slock);
+ if (prev_portid && prev_portid != tsk->portid) {
+ spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ continue;
+ }
+
+ err = __tipc_nl_add_sk(skb, cb, tsk);
+ if (err) {
+ prev_portid = tsk->portid;
+ spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ goto out;
+ }
+ prev_portid = 0;
+ spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ }
+ }
+out:
+ rcu_read_unlock();
+ cb->args[0] = tbl_id;
+ cb->args[1] = prev_portid;
+
+ return skb->len;
+}
+
+/* Caller should hold socket lock for the passed tipc socket. */
+static int __tipc_nl_add_sk_publ(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct publication *publ)
+{
+ void *hdr;
+ struct nlattr *attrs;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET);
+ if (!hdr)
+ goto msg_cancel;
+
+ attrs = nla_nest_start(skb, TIPC_NLA_PUBL);
+ if (!attrs)
+ goto genlmsg_cancel;
+
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key))
+ goto attr_msg_cancel;
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->type))
+ goto attr_msg_cancel;
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->lower))
+ goto attr_msg_cancel;
+ if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->upper))
+ goto attr_msg_cancel;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, hdr);
+
+ return 0;
+
+attr_msg_cancel:
+ nla_nest_cancel(skb, attrs);
+genlmsg_cancel:
+ genlmsg_cancel(skb, hdr);
+msg_cancel:
+ return -EMSGSIZE;
+}
+
+/* Caller should hold socket lock for the passed tipc socket. */
+static int __tipc_nl_list_sk_publ(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct tipc_sock *tsk, u32 *last_publ)
+{
+ int err;
+ struct publication *p;
+
+ if (*last_publ) {
+ list_for_each_entry(p, &tsk->publications, pport_list) {
+ if (p->key == *last_publ)
+ break;
+ }
+ if (p->key != *last_publ) {
+ /* We never set seq or call nl_dump_check_consistent()
+ * this means that setting prev_seq here will cause the
+ * consistence check to fail in the netlink callback
+ * handler. Resulting in the last NLMSG_DONE message
+ * having the NLM_F_DUMP_INTR flag set.
+ */
+ cb->prev_seq = 1;
+ *last_publ = 0;
+ return -EPIPE;
+ }
+ } else {
+ p = list_first_entry(&tsk->publications, struct publication,
+ pport_list);
+ }
+
+ list_for_each_entry_from(p, &tsk->publications, pport_list) {
+ err = __tipc_nl_add_sk_publ(skb, cb, p);
+ if (err) {
+ *last_publ = p->key;
+ return err;
+ }
+ }
+ *last_publ = 0;
+
+ return 0;
+}
+
+int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int err;
+ u32 tsk_portid = cb->args[0];
+ u32 last_publ = cb->args[1];
+ u32 done = cb->args[2];
+ struct net *net = sock_net(skb->sk);
+ struct tipc_sock *tsk;
+
+ if (!tsk_portid) {
+ struct nlattr **attrs;
+ struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
+
+ err = tipc_nlmsg_parse(cb->nlh, &attrs);
+ if (err)
+ return err;
+
+ err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX,
+ attrs[TIPC_NLA_SOCK],
+ tipc_nl_sock_policy);
+ if (err)
+ return err;
+
+ if (!sock[TIPC_NLA_SOCK_REF])
+ return -EINVAL;
+
+ tsk_portid = nla_get_u32(sock[TIPC_NLA_SOCK_REF]);
+ }
+
+ if (done)
+ return 0;
+
+ tsk = tipc_sk_lookup(net, tsk_portid);
+ if (!tsk)
+ return -EINVAL;
+
+ lock_sock(&tsk->sk);
+ err = __tipc_nl_list_sk_publ(skb, cb, tsk, &last_publ);
+ if (!err)
+ done = 1;
+ release_sock(&tsk->sk);
+ sock_put(&tsk->sk);
+
+ cb->args[0] = tsk_portid;
+ cb->args[1] = last_publ;
+ cb->args[2] = done;
+
+ return skb->len;
+}
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
new file mode 100644
index 000000000..bf6551389
--- /dev/null
+++ b/net/tipc/socket.h
@@ -0,0 +1,56 @@
+/* net/tipc/socket.h: Include file for TIPC socket code
+ *
+ * Copyright (c) 2014-2015, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SOCK_H
+#define _TIPC_SOCK_H
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#define TIPC_CONNACK_INTV 256
+#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2)
+#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \
+ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
+int tipc_socket_init(void);
+void tipc_socket_stop(void);
+int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
+void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
+ struct sk_buff_head *inputq);
+void tipc_sk_reinit(struct net *net);
+int tipc_sk_rht_init(struct net *net);
+void tipc_sk_rht_destroy(struct net *net);
+int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+#endif
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
new file mode 100644
index 000000000..1c147c869
--- /dev/null
+++ b/net/tipc/subscr.c
@@ -0,0 +1,377 @@
+/*
+ * net/tipc/subscr.c: TIPC network topology service
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005-2007, 2010-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "name_table.h"
+#include "subscr.h"
+
+/**
+ * struct tipc_subscriber - TIPC network topology subscriber
+ * @conid: connection identifier to server connecting to subscriber
+ * @lock: control access to subscriber
+ * @subscription_list: list of subscription objects for this subscriber
+ */
+struct tipc_subscriber {
+ int conid;
+ spinlock_t lock;
+ struct list_head subscription_list;
+};
+
+/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * Returns converted value
+ */
+static u32 htohl(u32 in, int swap)
+{
+ return swap ? swab32(in) : in;
+}
+
+static void subscr_send_event(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper, u32 event, u32 port_ref,
+ u32 node)
+{
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+ struct tipc_subscriber *subscriber = sub->subscriber;
+ struct kvec msg_sect;
+
+ msg_sect.iov_base = (void *)&sub->evt;
+ msg_sect.iov_len = sizeof(struct tipc_event);
+ sub->evt.event = htohl(event, sub->swap);
+ sub->evt.found_lower = htohl(found_lower, sub->swap);
+ sub->evt.found_upper = htohl(found_upper, sub->swap);
+ sub->evt.port.ref = htohl(port_ref, sub->swap);
+ sub->evt.port.node = htohl(node, sub->swap);
+ tipc_conn_sendmsg(tn->topsrv, subscriber->conid, NULL,
+ msg_sect.iov_base, msg_sect.iov_len);
+}
+
+/**
+ * tipc_subscr_overlap - test for subscription overlap with the given values
+ *
+ * Returns 1 if there is overlap, otherwise 0.
+ */
+int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper)
+{
+ if (found_lower < sub->seq.lower)
+ found_lower = sub->seq.lower;
+ if (found_upper > sub->seq.upper)
+ found_upper = sub->seq.upper;
+ if (found_lower > found_upper)
+ return 0;
+ return 1;
+}
+
+/**
+ * tipc_subscr_report_overlap - issue event if there is subscription overlap
+ *
+ * Protected by nameseq.lock in name_table.c
+ */
+void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper, u32 event, u32 port_ref,
+ u32 node, int must)
+{
+ if (!tipc_subscr_overlap(sub, found_lower, found_upper))
+ return;
+ if (!must && !(sub->filter & TIPC_SUB_PORTS))
+ return;
+
+ subscr_send_event(sub, found_lower, found_upper, event, port_ref, node);
+}
+
+static void subscr_timeout(unsigned long data)
+{
+ struct tipc_subscription *sub = (struct tipc_subscription *)data;
+ struct tipc_subscriber *subscriber = sub->subscriber;
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+
+ /* The spin lock per subscriber is used to protect its members */
+ spin_lock_bh(&subscriber->lock);
+
+ /* Validate timeout (in case subscription is being cancelled) */
+ if (sub->timeout == TIPC_WAIT_FOREVER) {
+ spin_unlock_bh(&subscriber->lock);
+ return;
+ }
+
+ /* Unlink subscription from name table */
+ tipc_nametbl_unsubscribe(sub);
+
+ /* Unlink subscription from subscriber */
+ list_del(&sub->subscription_list);
+
+ spin_unlock_bh(&subscriber->lock);
+
+ /* Notify subscriber of timeout */
+ subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
+ TIPC_SUBSCR_TIMEOUT, 0, 0);
+
+ /* Now destroy subscription */
+ kfree(sub);
+ atomic_dec(&tn->subscription_count);
+}
+
+/**
+ * subscr_del - delete a subscription within a subscription list
+ *
+ * Called with subscriber lock held.
+ */
+static void subscr_del(struct tipc_subscription *sub)
+{
+ struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+
+ tipc_nametbl_unsubscribe(sub);
+ list_del(&sub->subscription_list);
+ kfree(sub);
+ atomic_dec(&tn->subscription_count);
+}
+
+static void subscr_release(struct tipc_subscriber *subscriber)
+{
+ struct tipc_subscription *sub;
+ struct tipc_subscription *sub_temp;
+
+ spin_lock_bh(&subscriber->lock);
+
+ /* Destroy any existing subscriptions for subscriber */
+ list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+ subscription_list) {
+ if (sub->timeout != TIPC_WAIT_FOREVER) {
+ spin_unlock_bh(&subscriber->lock);
+ del_timer_sync(&sub->timer);
+ spin_lock_bh(&subscriber->lock);
+ }
+ subscr_del(sub);
+ }
+ spin_unlock_bh(&subscriber->lock);
+
+ /* Now destroy subscriber */
+ kfree(subscriber);
+}
+
+/**
+ * subscr_cancel - handle subscription cancellation request
+ *
+ * Called with subscriber lock held. Routine must temporarily release lock
+ * to enable the subscription timeout routine to finish without deadlocking;
+ * the lock is then reclaimed to allow caller to release it upon return.
+ *
+ * Note that fields of 's' use subscriber's endianness!
+ */
+static void subscr_cancel(struct tipc_subscr *s,
+ struct tipc_subscriber *subscriber)
+{
+ struct tipc_subscription *sub;
+ struct tipc_subscription *sub_temp;
+ int found = 0;
+
+ /* Find first matching subscription, exit if not found */
+ list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+ subscription_list) {
+ if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return;
+
+ /* Cancel subscription timer (if used), then delete subscription */
+ if (sub->timeout != TIPC_WAIT_FOREVER) {
+ sub->timeout = TIPC_WAIT_FOREVER;
+ spin_unlock_bh(&subscriber->lock);
+ del_timer_sync(&sub->timer);
+ spin_lock_bh(&subscriber->lock);
+ }
+ subscr_del(sub);
+}
+
+/**
+ * subscr_subscribe - create subscription for subscriber
+ *
+ * Called with subscriber lock held.
+ */
+static int subscr_subscribe(struct net *net, struct tipc_subscr *s,
+ struct tipc_subscriber *subscriber,
+ struct tipc_subscription **sub_p)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_subscription *sub;
+ int swap;
+
+ /* Determine subscriber's endianness */
+ swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
+
+ /* Detect & process a subscription cancellation request */
+ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+ s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+ subscr_cancel(s, subscriber);
+ return 0;
+ }
+
+ /* Refuse subscription if global limit exceeded */
+ if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
+ pr_warn("Subscription rejected, limit reached (%u)\n",
+ TIPC_MAX_SUBSCRIPTIONS);
+ return -EINVAL;
+ }
+
+ /* Allocate subscription object */
+ sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
+ if (!sub) {
+ pr_warn("Subscription rejected, no memory\n");
+ return -ENOMEM;
+ }
+
+ /* Initialize subscription object */
+ sub->net = net;
+ sub->seq.type = htohl(s->seq.type, swap);
+ sub->seq.lower = htohl(s->seq.lower, swap);
+ sub->seq.upper = htohl(s->seq.upper, swap);
+ sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap));
+ sub->filter = htohl(s->filter, swap);
+ if ((!(sub->filter & TIPC_SUB_PORTS) ==
+ !(sub->filter & TIPC_SUB_SERVICE)) ||
+ (sub->seq.lower > sub->seq.upper)) {
+ pr_warn("Subscription rejected, illegal request\n");
+ kfree(sub);
+ return -EINVAL;
+ }
+ list_add(&sub->subscription_list, &subscriber->subscription_list);
+ sub->subscriber = subscriber;
+ sub->swap = swap;
+ memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
+ atomic_inc(&tn->subscription_count);
+ if (sub->timeout != TIPC_WAIT_FOREVER) {
+ setup_timer(&sub->timer, subscr_timeout, (unsigned long)sub);
+ mod_timer(&sub->timer, jiffies + sub->timeout);
+ }
+ *sub_p = sub;
+ return 0;
+}
+
+/* Handle one termination request for the subscriber */
+static void subscr_conn_shutdown_event(int conid, void *usr_data)
+{
+ subscr_release((struct tipc_subscriber *)usr_data);
+}
+
+/* Handle one request to create a new subscription for the subscriber */
+static void subscr_conn_msg_event(struct net *net, int conid,
+ struct sockaddr_tipc *addr, void *usr_data,
+ void *buf, size_t len)
+{
+ struct tipc_subscriber *subscriber = usr_data;
+ struct tipc_subscription *sub = NULL;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+
+ spin_lock_bh(&subscriber->lock);
+ subscr_subscribe(net, (struct tipc_subscr *)buf, subscriber, &sub);
+ if (sub)
+ tipc_nametbl_subscribe(sub);
+ else
+ tipc_conn_terminate(tn->topsrv, subscriber->conid);
+ spin_unlock_bh(&subscriber->lock);
+}
+
+/* Handle one request to establish a new subscriber */
+static void *subscr_named_msg_event(int conid)
+{
+ struct tipc_subscriber *subscriber;
+
+ /* Create subscriber object */
+ subscriber = kzalloc(sizeof(struct tipc_subscriber), GFP_ATOMIC);
+ if (subscriber == NULL) {
+ pr_warn("Subscriber rejected, no memory\n");
+ return NULL;
+ }
+ INIT_LIST_HEAD(&subscriber->subscription_list);
+ subscriber->conid = conid;
+ spin_lock_init(&subscriber->lock);
+
+ return (void *)subscriber;
+}
+
+int tipc_subscr_start(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ const char name[] = "topology_server";
+ struct tipc_server *topsrv;
+ struct sockaddr_tipc *saddr;
+
+ saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC);
+ if (!saddr)
+ return -ENOMEM;
+ saddr->family = AF_TIPC;
+ saddr->addrtype = TIPC_ADDR_NAMESEQ;
+ saddr->addr.nameseq.type = TIPC_TOP_SRV;
+ saddr->addr.nameseq.lower = TIPC_TOP_SRV;
+ saddr->addr.nameseq.upper = TIPC_TOP_SRV;
+ saddr->scope = TIPC_NODE_SCOPE;
+
+ topsrv = kzalloc(sizeof(*topsrv), GFP_ATOMIC);
+ if (!topsrv) {
+ kfree(saddr);
+ return -ENOMEM;
+ }
+ topsrv->net = net;
+ topsrv->saddr = saddr;
+ topsrv->imp = TIPC_CRITICAL_IMPORTANCE;
+ topsrv->type = SOCK_SEQPACKET;
+ topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr);
+ topsrv->tipc_conn_recvmsg = subscr_conn_msg_event;
+ topsrv->tipc_conn_new = subscr_named_msg_event;
+ topsrv->tipc_conn_shutdown = subscr_conn_shutdown_event;
+
+ strncpy(topsrv->name, name, strlen(name) + 1);
+ tn->topsrv = topsrv;
+ atomic_set(&tn->subscription_count, 0);
+
+ return tipc_server_start(topsrv);
+}
+
+void tipc_subscr_stop(struct net *net)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_server *topsrv = tn->topsrv;
+
+ tipc_server_stop(topsrv);
+ kfree(topsrv->saddr);
+ kfree(topsrv);
+}
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
new file mode 100644
index 000000000..33488bd9f
--- /dev/null
+++ b/net/tipc/subscr.h
@@ -0,0 +1,83 @@
+/*
+ * net/tipc/subscr.h: Include file for TIPC network topology service
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005-2007, 2012-2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SUBSCR_H
+#define _TIPC_SUBSCR_H
+
+#include "server.h"
+
+#define TIPC_MAX_SUBSCRIPTIONS 65535
+#define TIPC_MAX_PUBLICATIONS 65535
+
+struct tipc_subscription;
+struct tipc_subscriber;
+
+/**
+ * struct tipc_subscription - TIPC network topology subscription object
+ * @subscriber: pointer to its subscriber
+ * @seq: name sequence associated with subscription
+ * @net: point to network namespace
+ * @timeout: duration of subscription (in ms)
+ * @filter: event filtering to be done for subscription
+ * @timer: timer governing subscription duration (optional)
+ * @nameseq_list: adjacent subscriptions in name sequence's subscription list
+ * @subscription_list: adjacent subscriptions in subscriber's subscription list
+ * @server_ref: object reference of server port associated with subscription
+ * @swap: indicates if subscriber uses opposite endianness in its messages
+ * @evt: template for events generated by subscription
+ */
+struct tipc_subscription {
+ struct tipc_subscriber *subscriber;
+ struct tipc_name_seq seq;
+ struct net *net;
+ unsigned long timeout;
+ u32 filter;
+ struct timer_list timer;
+ struct list_head nameseq_list;
+ struct list_head subscription_list;
+ int swap;
+ struct tipc_event evt;
+};
+
+int tipc_subscr_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper);
+void tipc_subscr_report_overlap(struct tipc_subscription *sub, u32 found_lower,
+ u32 found_upper, u32 event, u32 port_ref,
+ u32 node, int must);
+int tipc_subscr_start(struct net *net);
+void tipc_subscr_stop(struct net *net);
+
+#endif
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
new file mode 100644
index 000000000..1a779b1e8
--- /dev/null
+++ b/net/tipc/sysctl.c
@@ -0,0 +1,71 @@
+/*
+ * net/tipc/sysctl.c: sysctl interface to TIPC subsystem
+ *
+ * Copyright (c) 2013, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+
+#include <linux/sysctl.h>
+
+static struct ctl_table_header *tipc_ctl_hdr;
+
+static struct ctl_table tipc_table[] = {
+ {
+ .procname = "tipc_rmem",
+ .data = &sysctl_tipc_rmem,
+ .maxlen = sizeof(sysctl_tipc_rmem),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "named_timeout",
+ .data = &sysctl_tipc_named_timeout,
+ .maxlen = sizeof(sysctl_tipc_named_timeout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+int tipc_register_sysctl(void)
+{
+ tipc_ctl_hdr = register_net_sysctl(&init_net, "net/tipc", tipc_table);
+ if (tipc_ctl_hdr == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void tipc_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(tipc_ctl_hdr);
+}
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
new file mode 100644
index 000000000..66deebc66
--- /dev/null
+++ b/net/tipc/udp_media.c
@@ -0,0 +1,448 @@
+/* net/tipc/udp_media.c: IP bearer support for TIPC
+ *
+ * Copyright (c) 2015, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/udp_tunnel.h>
+#include <net/addrconf.h>
+#include <linux/tipc_netlink.h>
+#include "core.h"
+#include "bearer.h"
+
+/* IANA assigned UDP port */
+#define UDP_PORT_DEFAULT 6118
+
+static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
+ [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
+ [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+ [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+};
+
+/**
+ * struct udp_media_addr - IP/UDP addressing information
+ *
+ * This is the bearer level originating address used in neighbor discovery
+ * messages, and all fields should be in network byte order
+ */
+struct udp_media_addr {
+ __be16 proto;
+ __be16 udp_port;
+ union {
+ struct in_addr ipv4;
+ struct in6_addr ipv6;
+ };
+};
+
+/**
+ * struct udp_bearer - ip/udp bearer data structure
+ * @bearer: associated generic tipc bearer
+ * @ubsock: bearer associated socket
+ * @ifindex: local address scope
+ * @work: used to schedule deferred work on a bearer
+ */
+struct udp_bearer {
+ struct tipc_bearer __rcu *bearer;
+ struct socket *ubsock;
+ u32 ifindex;
+ struct work_struct work;
+};
+
+/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */
+static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
+ struct udp_media_addr *ua)
+{
+ memset(addr, 0, sizeof(struct tipc_media_addr));
+ addr->media_id = TIPC_MEDIA_TYPE_UDP;
+ memcpy(addr->value, ua, sizeof(struct udp_media_addr));
+ if (ntohs(ua->proto) == ETH_P_IP) {
+ if (ipv4_is_multicast(ua->ipv4.s_addr))
+ addr->broadcast = 1;
+ } else if (ntohs(ua->proto) == ETH_P_IPV6) {
+ if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST)
+ addr->broadcast = 1;
+ } else {
+ pr_err("Invalid UDP media address\n");
+ }
+}
+
+/* tipc_udp_addr2str - convert ip/udp address to string */
+static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size)
+{
+ struct udp_media_addr *ua = (struct udp_media_addr *)&a->value;
+
+ if (ntohs(ua->proto) == ETH_P_IP)
+ snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->udp_port));
+ else if (ntohs(ua->proto) == ETH_P_IPV6)
+ snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->udp_port));
+ else
+ pr_err("Invalid UDP media address\n");
+ return 0;
+}
+
+/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */
+static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a,
+ char *msg)
+{
+ struct udp_media_addr *ua;
+
+ ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET);
+ if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP)
+ return -EINVAL;
+ tipc_udp_media_addr_set(a, ua);
+ return 0;
+}
+
+/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */
+static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
+{
+ memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
+ msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP;
+ memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value,
+ sizeof(struct udp_media_addr));
+ return 0;
+}
+
+/* tipc_send_msg - enqueue a send request */
+static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dest)
+{
+ int ttl, err = 0;
+ struct udp_bearer *ub;
+ struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value;
+ struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
+ struct sk_buff *clone;
+ struct rtable *rt;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ skb_set_inner_protocol(clone, htons(ETH_P_TIPC));
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub) {
+ err = -ENODEV;
+ goto tx_error;
+ }
+ if (dst->proto == htons(ETH_P_IP)) {
+ struct flowi4 fl = {
+ .daddr = dst->ipv4.s_addr,
+ .saddr = src->ipv4.s_addr,
+ .flowi4_mark = clone->mark,
+ .flowi4_proto = IPPROTO_UDP
+ };
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto tx_error;
+ }
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, clone,
+ src->ipv4.s_addr,
+ dst->ipv4.s_addr, 0, ttl, 0,
+ src->udp_port, dst->udp_port,
+ false, true);
+ if (err < 0) {
+ ip_rt_put(rt);
+ goto tx_error;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ struct dst_entry *ndst;
+ struct flowi6 fl6 = {
+ .flowi6_oif = ub->ifindex,
+ .daddr = dst->ipv6,
+ .saddr = src->ipv6,
+ .flowi6_proto = IPPROTO_UDP
+ };
+ err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6);
+ if (err)
+ goto tx_error;
+ ttl = ip6_dst_hoplimit(ndst);
+ err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, clone,
+ ndst->dev, &src->ipv6,
+ &dst->ipv6, 0, ttl, src->udp_port,
+ dst->udp_port, false);
+#endif
+ }
+ return err;
+
+tx_error:
+ kfree_skb(clone);
+ return err;
+}
+
+/* tipc_udp_recv - read data from bearer socket */
+static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udp_bearer *ub;
+ struct tipc_bearer *b;
+
+ ub = rcu_dereference_sk_user_data(sk);
+ if (!ub) {
+ pr_err_ratelimited("Failed to get UDP bearer reference");
+ kfree_skb(skb);
+ return 0;
+ }
+
+ skb_pull(skb, sizeof(struct udphdr));
+ rcu_read_lock();
+ b = rcu_dereference_rtnl(ub->bearer);
+
+ if (b) {
+ tipc_rcv(sock_net(sk), skb, b);
+ rcu_read_unlock();
+ return 0;
+ }
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return 0;
+}
+
+static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
+{
+ int err = 0;
+ struct ip_mreqn mreqn;
+ struct sock *sk = ub->ubsock->sk;
+
+ if (ntohs(remote->proto) == ETH_P_IP) {
+ if (!ipv4_is_multicast(remote->ipv4.s_addr))
+ return 0;
+ mreqn.imr_multiaddr = remote->ipv4;
+ mreqn.imr_ifindex = ub->ifindex;
+ err = ip_mc_join_group(sk, &mreqn);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (!ipv6_addr_is_multicast(&remote->ipv6))
+ return 0;
+ err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex,
+ &remote->ipv6);
+#endif
+ }
+ return err;
+}
+
+/**
+ * parse_options - build local/remote addresses from configuration
+ * @attrs: netlink config data
+ * @ub: UDP bearer instance
+ * @local: local bearer IP address/port
+ * @remote: peer or multicast IP/port
+ */
+static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
+ struct udp_media_addr *local,
+ struct udp_media_addr *remote)
+{
+ struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
+ struct sockaddr_storage *sa_local, *sa_remote;
+
+ if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
+ goto err;
+ if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX,
+ attrs[TIPC_NLA_BEARER_UDP_OPTS],
+ tipc_nl_udp_policy))
+ goto err;
+ if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
+ sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]);
+ sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]);
+ } else {
+err:
+ pr_err("Invalid UDP bearer configuration");
+ return -EINVAL;
+ }
+ if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) {
+ struct sockaddr_in *ip4;
+
+ ip4 = (struct sockaddr_in *)sa_local;
+ local->proto = htons(ETH_P_IP);
+ local->udp_port = ip4->sin_port;
+ local->ipv4.s_addr = ip4->sin_addr.s_addr;
+
+ ip4 = (struct sockaddr_in *)sa_remote;
+ remote->proto = htons(ETH_P_IP);
+ remote->udp_port = ip4->sin_port;
+ remote->ipv4.s_addr = ip4->sin_addr.s_addr;
+ return 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) {
+ struct sockaddr_in6 *ip6;
+
+ ip6 = (struct sockaddr_in6 *)sa_local;
+ local->proto = htons(ETH_P_IPV6);
+ local->udp_port = ip6->sin6_port;
+ local->ipv6 = ip6->sin6_addr;
+ ub->ifindex = ip6->sin6_scope_id;
+
+ ip6 = (struct sockaddr_in6 *)sa_remote;
+ remote->proto = htons(ETH_P_IPV6);
+ remote->udp_port = ip6->sin6_port;
+ remote->ipv6 = ip6->sin6_addr;
+ return 0;
+#endif
+ }
+ return -EADDRNOTAVAIL;
+}
+
+/**
+ * tipc_udp_enable - callback to create a new udp bearer instance
+ * @net: network namespace
+ * @b: pointer to generic tipc_bearer
+ * @attrs: netlink bearer configuration
+ *
+ * validate the bearer parameters and initialize the udp bearer
+ * rtnl_lock should be held
+ */
+static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
+ struct nlattr *attrs[])
+{
+ int err = -EINVAL;
+ struct udp_bearer *ub;
+ struct udp_media_addr *remote;
+ struct udp_media_addr local = {0};
+ struct udp_port_cfg udp_conf = {0};
+ struct udp_tunnel_sock_cfg tuncfg = {NULL};
+
+ ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
+ if (!ub)
+ return -ENOMEM;
+
+ remote = (struct udp_media_addr *)&b->bcast_addr.value;
+ memset(remote, 0, sizeof(struct udp_media_addr));
+ err = parse_options(attrs, ub, &local, remote);
+ if (err)
+ goto err;
+
+ b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
+ b->bcast_addr.broadcast = 1;
+ rcu_assign_pointer(b->media_ptr, ub);
+ rcu_assign_pointer(ub->bearer, b);
+ tipc_udp_media_addr_set(&b->addr, &local);
+ if (local.proto == htons(ETH_P_IP)) {
+ struct net_device *dev;
+
+ dev = __ip_dev_find(net, local.ipv4.s_addr, false);
+ if (!dev) {
+ err = -ENODEV;
+ goto err;
+ }
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ udp_conf.use_udp_checksums = false;
+ ub->ifindex = dev->ifindex;
+ b->mtu = dev->mtu - sizeof(struct iphdr)
+ - sizeof(struct udphdr);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (local.proto == htons(ETH_P_IPV6)) {
+ udp_conf.family = AF_INET6;
+ udp_conf.use_udp6_tx_checksums = true;
+ udp_conf.use_udp6_rx_checksums = true;
+ udp_conf.local_ip6 = in6addr_any;
+ b->mtu = 1280;
+#endif
+ } else {
+ err = -EAFNOSUPPORT;
+ goto err;
+ }
+ udp_conf.local_udp_port = local.udp_port;
+ err = udp_sock_create(net, &udp_conf, &ub->ubsock);
+ if (err)
+ goto err;
+ tuncfg.sk_user_data = ub;
+ tuncfg.encap_type = 1;
+ tuncfg.encap_rcv = tipc_udp_recv;
+ tuncfg.encap_destroy = NULL;
+ setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
+
+ if (enable_mcast(ub, remote))
+ goto err;
+ return 0;
+err:
+ kfree(ub);
+ return err;
+}
+
+/* cleanup_bearer - break the socket/bearer association */
+static void cleanup_bearer(struct work_struct *work)
+{
+ struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
+
+ if (ub->ubsock)
+ udp_tunnel_sock_release(ub->ubsock);
+ synchronize_net();
+ kfree(ub);
+}
+
+/* tipc_udp_disable - detach bearer from socket */
+static void tipc_udp_disable(struct tipc_bearer *b)
+{
+ struct udp_bearer *ub;
+
+ ub = rcu_dereference_rtnl(b->media_ptr);
+ if (!ub) {
+ pr_err("UDP bearer instance not found\n");
+ return;
+ }
+ if (ub->ubsock)
+ sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
+ RCU_INIT_POINTER(b->media_ptr, NULL);
+ RCU_INIT_POINTER(ub->bearer, NULL);
+
+ /* sock_release need to be done outside of rtnl lock */
+ INIT_WORK(&ub->work, cleanup_bearer);
+ schedule_work(&ub->work);
+}
+
+struct tipc_media udp_media_info = {
+ .send_msg = tipc_udp_send_msg,
+ .enable_media = tipc_udp_enable,
+ .disable_media = tipc_udp_disable,
+ .addr2str = tipc_udp_addr2str,
+ .addr2msg = tipc_udp_addr2msg,
+ .msg2addr = tipc_udp_msg2addr,
+ .priority = TIPC_DEF_LINK_PRI,
+ .tolerance = TIPC_DEF_LINK_TOL,
+ .window = TIPC_DEF_LINK_WIN,
+ .type_id = TIPC_MEDIA_TYPE_UDP,
+ .hwaddr_len = 0,
+ .name = "udp"
+};
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 000000000..8b31ab85d
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,28 @@
+#
+# Unix Domain Sockets
+#
+
+config UNIX
+ tristate "Unix domain sockets"
+ ---help---
+ If you say Y here, you will include support for Unix domain sockets;
+ sockets are the standard Unix mechanism for establishing and
+ accessing network connections. Many commonly used programs such as
+ the X Window system and syslog use these sockets even if your
+ machine is not connected to any network. Unless you are working on
+ an embedded system or something similar, you therefore definitely
+ want to say Y here.
+
+ To compile this driver as a module, choose M here: the module will be
+ called unix. Note that several important services won't work
+ correctly if you say M here and then neglect to load the module.
+
+ Say Y unless you know what you are doing.
+
+config UNIX_DIAG
+ tristate "UNIX: socket monitoring interface"
+ depends on UNIX
+ default n
+ ---help---
+ Support for UNIX socket monitoring interface used by the ss tool.
+ If unsure, say Y.
diff --git a/net/unix/Makefile b/net/unix/Makefile
new file mode 100644
index 000000000..b663c607b
--- /dev/null
+++ b/net/unix/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Linux unix domain socket layer.
+#
+
+obj-$(CONFIG_UNIX) += unix.o
+
+unix-y := af_unix.o garbage.o
+unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o
+
+obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
+unix_diag-y := diag.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
new file mode 100644
index 000000000..06430598c
--- /dev/null
+++ b/net/unix/af_unix.c
@@ -0,0 +1,2467 @@
+/*
+ * NET4: Implementation of BSD Unix domain sockets.
+ *
+ * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Linus Torvalds : Assorted bug cures.
+ * Niibe Yutaka : async I/O support.
+ * Carsten Paeth : PF_UNIX check, address fixes.
+ * Alan Cox : Limit size of allocated blocks.
+ * Alan Cox : Fixed the stupid socketpair bug.
+ * Alan Cox : BSD compatibility fine tuning.
+ * Alan Cox : Fixed a bug in connect when interrupted.
+ * Alan Cox : Sorted out a proper draft version of
+ * file descriptor passing hacked up from
+ * Mike Shaver's work.
+ * Marty Leisner : Fixes to fd passing
+ * Nick Nevin : recvmsg bugfix.
+ * Alan Cox : Started proper garbage collector
+ * Heiko EiBfeldt : Missing verify_area check
+ * Alan Cox : Started POSIXisms
+ * Andreas Schwab : Replace inode by dentry for proper
+ * reference counting
+ * Kirk Petersen : Made this a module
+ * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
+ * Lots of bug fixes.
+ * Alexey Kuznetosv : Repaired (I hope) bugs introduces
+ * by above two patches.
+ * Andrea Arcangeli : If possible we block in connect(2)
+ * if the max backlog of the listen socket
+ * is been reached. This won't break
+ * old apps and it will avoid huge amount
+ * of socks hashed (this for unix_gc()
+ * performances reasons).
+ * Security fix that limits the max
+ * number of socks to 2*max_files and
+ * the number of skb queueable in the
+ * dgram receiver.
+ * Artur Skawina : Hash function optimizations
+ * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
+ * Malcolm Beattie : Set peercred for socketpair
+ * Michal Ostrowski : Module initialization cleanup.
+ * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
+ * the core infrastructure is doing that
+ * for all net proto families now (2.5.69+)
+ *
+ *
+ * Known differences from reference BSD that was tested:
+ *
+ * [TO FIX]
+ * ECONNREFUSED is not returned from one end of a connected() socket to the
+ * other the moment one end closes.
+ * fstat() doesn't return st_dev=0, and give the blksize as high water mark
+ * and a fake inode identifier (nor the BSD first socket fstat twice bug).
+ * [NOT TO FIX]
+ * accept() returns a path name even if the connecting socket has closed
+ * in the meantime (BSD loses the path and gives up).
+ * accept() returns 0 length path for an unbound connector. BSD returns 16
+ * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
+ * socketpair(...SOCK_RAW..) doesn't panic the kernel.
+ * BSD af_unix apparently has connect forgetting to block properly.
+ * (need to check this with the POSIX spec in detail)
+ *
+ * Differences from 2.0.0-11-... (ANK)
+ * Bug fixes and improvements.
+ * - client shutdown killed server socket.
+ * - removed all useless cli/sti pairs.
+ *
+ * Semantic changes/extensions.
+ * - generic control message passing.
+ * - SCM_CREDENTIALS control message.
+ * - "Abstract" (not FS based) socket bindings.
+ * Abstract names are sequences of bytes (not zero terminated)
+ * started by 0, so that this name space does not intersect
+ * with BSD names.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/af_unix.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/scm.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/rtnetlink.h>
+#include <linux/mount.h>
+#include <net/checksum.h>
+#include <linux/security.h>
+#include <linux/freezer.h>
+
+struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
+EXPORT_SYMBOL_GPL(unix_socket_table);
+DEFINE_SPINLOCK(unix_table_lock);
+EXPORT_SYMBOL_GPL(unix_table_lock);
+static atomic_long_t unix_nr_socks;
+
+
+static struct hlist_head *unix_sockets_unbound(void *addr)
+{
+ unsigned long hash = (unsigned long)addr;
+
+ hash ^= hash >> 16;
+ hash ^= hash >> 8;
+ hash %= UNIX_HASH_SIZE;
+ return &unix_socket_table[UNIX_HASH_SIZE + hash];
+}
+
+#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
+
+#ifdef CONFIG_SECURITY_NETWORK
+static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
+}
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ scm->secid = *UNIXSID(skb);
+}
+#else
+static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+#endif /* CONFIG_SECURITY_NETWORK */
+
+/*
+ * SMP locking strategy:
+ * hash table is protected with spinlock unix_table_lock
+ * each socket state is protected by separate spin lock.
+ */
+
+static inline unsigned int unix_hash_fold(__wsum n)
+{
+ unsigned int hash = (__force unsigned int)csum_fold(n);
+
+ hash ^= hash>>8;
+ return hash&(UNIX_HASH_SIZE-1);
+}
+
+#define unix_peer(sk) (unix_sk(sk)->peer)
+
+static inline int unix_our_peer(struct sock *sk, struct sock *osk)
+{
+ return unix_peer(osk) == sk;
+}
+
+static inline int unix_may_send(struct sock *sk, struct sock *osk)
+{
+ return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
+}
+
+static inline int unix_recvq_full(struct sock const *sk)
+{
+ return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
+}
+
+struct sock *unix_peer_get(struct sock *s)
+{
+ struct sock *peer;
+
+ unix_state_lock(s);
+ peer = unix_peer(s);
+ if (peer)
+ sock_hold(peer);
+ unix_state_unlock(s);
+ return peer;
+}
+EXPORT_SYMBOL_GPL(unix_peer_get);
+
+static inline void unix_release_addr(struct unix_address *addr)
+{
+ if (atomic_dec_and_test(&addr->refcnt))
+ kfree(addr);
+}
+
+/*
+ * Check unix socket name:
+ * - should be not zero length.
+ * - if started by not zero, should be NULL terminated (FS object)
+ * - if started by zero, it is abstract name.
+ */
+
+static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
+{
+ if (len <= sizeof(short) || len > sizeof(*sunaddr))
+ return -EINVAL;
+ if (!sunaddr || sunaddr->sun_family != AF_UNIX)
+ return -EINVAL;
+ if (sunaddr->sun_path[0]) {
+ /*
+ * This may look like an off by one error but it is a bit more
+ * subtle. 108 is the longest valid AF_UNIX path for a binding.
+ * sun_path[108] doesn't as such exist. However in kernel space
+ * we are guaranteed that it is a valid memory location in our
+ * kernel address buffer.
+ */
+ ((char *)sunaddr)[len] = 0;
+ len = strlen(sunaddr->sun_path)+1+sizeof(short);
+ return len;
+ }
+
+ *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
+ return len;
+}
+
+static void __unix_remove_socket(struct sock *sk)
+{
+ sk_del_node_init(sk);
+}
+
+static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
+{
+ WARN_ON(!sk_unhashed(sk));
+ sk_add_node(sk, list);
+}
+
+static inline void unix_remove_socket(struct sock *sk)
+{
+ spin_lock(&unix_table_lock);
+ __unix_remove_socket(sk);
+ spin_unlock(&unix_table_lock);
+}
+
+static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
+{
+ spin_lock(&unix_table_lock);
+ __unix_insert_socket(list, sk);
+ spin_unlock(&unix_table_lock);
+}
+
+static struct sock *__unix_find_socket_byname(struct net *net,
+ struct sockaddr_un *sunname,
+ int len, int type, unsigned int hash)
+{
+ struct sock *s;
+
+ sk_for_each(s, &unix_socket_table[hash ^ type]) {
+ struct unix_sock *u = unix_sk(s);
+
+ if (!net_eq(sock_net(s), net))
+ continue;
+
+ if (u->addr->len == len &&
+ !memcmp(u->addr->name, sunname, len))
+ goto found;
+ }
+ s = NULL;
+found:
+ return s;
+}
+
+static inline struct sock *unix_find_socket_byname(struct net *net,
+ struct sockaddr_un *sunname,
+ int len, int type,
+ unsigned int hash)
+{
+ struct sock *s;
+
+ spin_lock(&unix_table_lock);
+ s = __unix_find_socket_byname(net, sunname, len, type, hash);
+ if (s)
+ sock_hold(s);
+ spin_unlock(&unix_table_lock);
+ return s;
+}
+
+static struct sock *unix_find_socket_byinode(struct inode *i)
+{
+ struct sock *s;
+
+ spin_lock(&unix_table_lock);
+ sk_for_each(s,
+ &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
+ struct dentry *dentry = unix_sk(s)->path.dentry;
+
+ if (dentry && d_backing_inode(dentry) == i) {
+ sock_hold(s);
+ goto found;
+ }
+ }
+ s = NULL;
+found:
+ spin_unlock(&unix_table_lock);
+ return s;
+}
+
+static inline int unix_writable(struct sock *sk)
+{
+ return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
+}
+
+static void unix_write_space(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ if (unix_writable(sk)) {
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait,
+ POLLOUT | POLLWRNORM | POLLWRBAND);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+ rcu_read_unlock();
+}
+
+/* When dgram socket disconnects (or changes its peer), we clear its receive
+ * queue of packets arrived from previous peer. First, it allows to do
+ * flow control based only on wmem_alloc; second, sk connected to peer
+ * may receive messages only from that peer. */
+static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
+{
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ skb_queue_purge(&sk->sk_receive_queue);
+ wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
+
+ /* If one link of bidirectional dgram pipe is disconnected,
+ * we signal error. Messages are lost. Do not make this,
+ * when peer was not connected to us.
+ */
+ if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
+ other->sk_err = ECONNRESET;
+ other->sk_error_report(other);
+ }
+ }
+}
+
+static void unix_sock_destructor(struct sock *sk)
+{
+ struct unix_sock *u = unix_sk(sk);
+
+ skb_queue_purge(&sk->sk_receive_queue);
+
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(!sk_unhashed(sk));
+ WARN_ON(sk->sk_socket);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_info("Attempt to release alive unix socket: %p\n", sk);
+ return;
+ }
+
+ if (u->addr)
+ unix_release_addr(u->addr);
+
+ atomic_long_dec(&unix_nr_socks);
+ local_bh_disable();
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ local_bh_enable();
+#ifdef UNIX_REFCNT_DEBUG
+ pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
+ atomic_long_read(&unix_nr_socks));
+#endif
+}
+
+static void unix_release_sock(struct sock *sk, int embrion)
+{
+ struct unix_sock *u = unix_sk(sk);
+ struct path path;
+ struct sock *skpair;
+ struct sk_buff *skb;
+ int state;
+
+ unix_remove_socket(sk);
+
+ /* Clear state */
+ unix_state_lock(sk);
+ sock_orphan(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ path = u->path;
+ u->path.dentry = NULL;
+ u->path.mnt = NULL;
+ state = sk->sk_state;
+ sk->sk_state = TCP_CLOSE;
+ unix_state_unlock(sk);
+
+ wake_up_interruptible_all(&u->peer_wait);
+
+ skpair = unix_peer(sk);
+
+ if (skpair != NULL) {
+ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
+ unix_state_lock(skpair);
+ /* No more writes */
+ skpair->sk_shutdown = SHUTDOWN_MASK;
+ if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
+ skpair->sk_err = ECONNRESET;
+ unix_state_unlock(skpair);
+ skpair->sk_state_change(skpair);
+ sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
+ }
+ sock_put(skpair); /* It may now die */
+ unix_peer(sk) = NULL;
+ }
+
+ /* Try to flush out this socket. Throw out buffers at least */
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (state == TCP_LISTEN)
+ unix_release_sock(skb->sk, 1);
+ /* passed fds are erased in the kfree_skb hook */
+ kfree_skb(skb);
+ }
+
+ if (path.dentry)
+ path_put(&path);
+
+ sock_put(sk);
+
+ /* ---- Socket is dead now and most probably destroyed ---- */
+
+ /*
+ * Fixme: BSD difference: In BSD all sockets connected to us get
+ * ECONNRESET and we die on the spot. In Linux we behave
+ * like files and pipes do and wait for the last
+ * dereference.
+ *
+ * Can't we simply set sock->err?
+ *
+ * What the above comment does talk about? --ANK(980817)
+ */
+
+ if (unix_tot_inflight)
+ unix_gc(); /* Garbage collect fds */
+}
+
+static void init_peercred(struct sock *sk)
+{
+ put_pid(sk->sk_peer_pid);
+ if (sk->sk_peer_cred)
+ put_cred(sk->sk_peer_cred);
+ sk->sk_peer_pid = get_pid(task_tgid(current));
+ sk->sk_peer_cred = get_current_cred();
+}
+
+static void copy_peercred(struct sock *sk, struct sock *peersk)
+{
+ put_pid(sk->sk_peer_pid);
+ if (sk->sk_peer_cred)
+ put_cred(sk->sk_peer_cred);
+ sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
+ sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
+}
+
+static int unix_listen(struct socket *sock, int backlog)
+{
+ int err;
+ struct sock *sk = sock->sk;
+ struct unix_sock *u = unix_sk(sk);
+ struct pid *old_pid = NULL;
+
+ err = -EOPNOTSUPP;
+ if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+ goto out; /* Only stream/seqpacket sockets accept */
+ err = -EINVAL;
+ if (!u->addr)
+ goto out; /* No listens on an unbound socket */
+ unix_state_lock(sk);
+ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
+ goto out_unlock;
+ if (backlog > sk->sk_max_ack_backlog)
+ wake_up_interruptible_all(&u->peer_wait);
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ /* set credentials so connect can copy them */
+ init_peercred(sk);
+ err = 0;
+
+out_unlock:
+ unix_state_unlock(sk);
+ put_pid(old_pid);
+out:
+ return err;
+}
+
+static int unix_release(struct socket *);
+static int unix_bind(struct socket *, struct sockaddr *, int);
+static int unix_stream_connect(struct socket *, struct sockaddr *,
+ int addr_len, int flags);
+static int unix_socketpair(struct socket *, struct socket *);
+static int unix_accept(struct socket *, struct socket *, int);
+static int unix_getname(struct socket *, struct sockaddr *, int *, int);
+static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
+static unsigned int unix_dgram_poll(struct file *, struct socket *,
+ poll_table *);
+static int unix_ioctl(struct socket *, unsigned int, unsigned long);
+static int unix_shutdown(struct socket *, int);
+static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
+static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
+static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
+static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
+static int unix_dgram_connect(struct socket *, struct sockaddr *,
+ int, int);
+static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
+static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
+ int);
+
+static int unix_set_peek_off(struct sock *sk, int val)
+{
+ struct unix_sock *u = unix_sk(sk);
+
+ if (mutex_lock_interruptible(&u->readlock))
+ return -EINTR;
+
+ sk->sk_peek_off = val;
+ mutex_unlock(&u->readlock);
+
+ return 0;
+}
+
+
+static const struct proto_ops unix_stream_ops = {
+ .family = PF_UNIX,
+ .owner = THIS_MODULE,
+ .release = unix_release,
+ .bind = unix_bind,
+ .connect = unix_stream_connect,
+ .socketpair = unix_socketpair,
+ .accept = unix_accept,
+ .getname = unix_getname,
+ .poll = unix_poll,
+ .ioctl = unix_ioctl,
+ .listen = unix_listen,
+ .shutdown = unix_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = unix_stream_sendmsg,
+ .recvmsg = unix_stream_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+ .set_peek_off = unix_set_peek_off,
+};
+
+static const struct proto_ops unix_dgram_ops = {
+ .family = PF_UNIX,
+ .owner = THIS_MODULE,
+ .release = unix_release,
+ .bind = unix_bind,
+ .connect = unix_dgram_connect,
+ .socketpair = unix_socketpair,
+ .accept = sock_no_accept,
+ .getname = unix_getname,
+ .poll = unix_dgram_poll,
+ .ioctl = unix_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = unix_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = unix_dgram_sendmsg,
+ .recvmsg = unix_dgram_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+ .set_peek_off = unix_set_peek_off,
+};
+
+static const struct proto_ops unix_seqpacket_ops = {
+ .family = PF_UNIX,
+ .owner = THIS_MODULE,
+ .release = unix_release,
+ .bind = unix_bind,
+ .connect = unix_stream_connect,
+ .socketpair = unix_socketpair,
+ .accept = unix_accept,
+ .getname = unix_getname,
+ .poll = unix_dgram_poll,
+ .ioctl = unix_ioctl,
+ .listen = unix_listen,
+ .shutdown = unix_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = unix_seqpacket_sendmsg,
+ .recvmsg = unix_seqpacket_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+ .set_peek_off = unix_set_peek_off,
+};
+
+static struct proto unix_proto = {
+ .name = "UNIX",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct unix_sock),
+};
+
+/*
+ * AF_UNIX sockets do not interact with hardware, hence they
+ * dont trigger interrupts - so it's safe for them to have
+ * bh-unsafe locking for their sk_receive_queue.lock. Split off
+ * this special lock-class by reinitializing the spinlock key:
+ */
+static struct lock_class_key af_unix_sk_receive_queue_lock_key;
+
+static struct sock *unix_create1(struct net *net, struct socket *sock)
+{
+ struct sock *sk = NULL;
+ struct unix_sock *u;
+
+ atomic_long_inc(&unix_nr_socks);
+ if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
+ goto out;
+
+ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+ if (!sk)
+ goto out;
+
+ sock_init_data(sock, sk);
+ lockdep_set_class(&sk->sk_receive_queue.lock,
+ &af_unix_sk_receive_queue_lock_key);
+
+ sk->sk_write_space = unix_write_space;
+ sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
+ sk->sk_destruct = unix_sock_destructor;
+ u = unix_sk(sk);
+ u->path.dentry = NULL;
+ u->path.mnt = NULL;
+ spin_lock_init(&u->lock);
+ atomic_long_set(&u->inflight, 0);
+ INIT_LIST_HEAD(&u->link);
+ mutex_init(&u->readlock); /* single task reading lock */
+ init_waitqueue_head(&u->peer_wait);
+ unix_insert_socket(unix_sockets_unbound(sk), sk);
+out:
+ if (sk == NULL)
+ atomic_long_dec(&unix_nr_socks);
+ else {
+ local_bh_disable();
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ local_bh_enable();
+ }
+ return sk;
+}
+
+static int unix_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ if (protocol && protocol != PF_UNIX)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ sock->ops = &unix_stream_ops;
+ break;
+ /*
+ * Believe it or not BSD has AF_UNIX, SOCK_RAW though
+ * nothing uses it.
+ */
+ case SOCK_RAW:
+ sock->type = SOCK_DGRAM;
+ case SOCK_DGRAM:
+ sock->ops = &unix_dgram_ops;
+ break;
+ case SOCK_SEQPACKET:
+ sock->ops = &unix_seqpacket_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ return unix_create1(net, sock) ? 0 : -ENOMEM;
+}
+
+static int unix_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (!sk)
+ return 0;
+
+ unix_release_sock(sk, 0);
+ sock->sk = NULL;
+
+ return 0;
+}
+
+static int unix_autobind(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct unix_sock *u = unix_sk(sk);
+ static u32 ordernum = 1;
+ struct unix_address *addr;
+ int err;
+ unsigned int retries = 0;
+
+ err = mutex_lock_interruptible(&u->readlock);
+ if (err)
+ return err;
+
+ err = 0;
+ if (u->addr)
+ goto out;
+
+ err = -ENOMEM;
+ addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
+ if (!addr)
+ goto out;
+
+ addr->name->sun_family = AF_UNIX;
+ atomic_set(&addr->refcnt, 1);
+
+retry:
+ addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
+ addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
+
+ spin_lock(&unix_table_lock);
+ ordernum = (ordernum+1)&0xFFFFF;
+
+ if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
+ addr->hash)) {
+ spin_unlock(&unix_table_lock);
+ /*
+ * __unix_find_socket_byname() may take long time if many names
+ * are already in use.
+ */
+ cond_resched();
+ /* Give up if all names seems to be in use. */
+ if (retries++ == 0xFFFFF) {
+ err = -ENOSPC;
+ kfree(addr);
+ goto out;
+ }
+ goto retry;
+ }
+ addr->hash ^= sk->sk_type;
+
+ __unix_remove_socket(sk);
+ u->addr = addr;
+ __unix_insert_socket(&unix_socket_table[addr->hash], sk);
+ spin_unlock(&unix_table_lock);
+ err = 0;
+
+out: mutex_unlock(&u->readlock);
+ return err;
+}
+
+static struct sock *unix_find_other(struct net *net,
+ struct sockaddr_un *sunname, int len,
+ int type, unsigned int hash, int *error)
+{
+ struct sock *u;
+ struct path path;
+ int err = 0;
+
+ if (sunname->sun_path[0]) {
+ struct inode *inode;
+ err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
+ if (err)
+ goto fail;
+ inode = d_backing_inode(path.dentry);
+ err = inode_permission(inode, MAY_WRITE);
+ if (err)
+ goto put_fail;
+
+ err = -ECONNREFUSED;
+ if (!S_ISSOCK(inode->i_mode))
+ goto put_fail;
+ u = unix_find_socket_byinode(inode);
+ if (!u)
+ goto put_fail;
+
+ if (u->sk_type == type)
+ touch_atime(&path);
+
+ path_put(&path);
+
+ err = -EPROTOTYPE;
+ if (u->sk_type != type) {
+ sock_put(u);
+ goto fail;
+ }
+ } else {
+ err = -ECONNREFUSED;
+ u = unix_find_socket_byname(net, sunname, len, type, hash);
+ if (u) {
+ struct dentry *dentry;
+ dentry = unix_sk(u)->path.dentry;
+ if (dentry)
+ touch_atime(&unix_sk(u)->path);
+ } else
+ goto fail;
+ }
+ return u;
+
+put_fail:
+ path_put(&path);
+fail:
+ *error = err;
+ return NULL;
+}
+
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+{
+ struct dentry *dentry;
+ struct path path;
+ int err = 0;
+ /*
+ * Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ return err;
+
+ /*
+ * All right, let's create it.
+ */
+ err = security_path_mknod(&path, dentry, mode, 0);
+ if (!err) {
+ err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+ if (!err) {
+ res->mnt = mntget(path.mnt);
+ res->dentry = dget(dentry);
+ }
+ }
+ done_path_create(&path, dentry);
+ return err;
+}
+
+static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct unix_sock *u = unix_sk(sk);
+ struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+ char *sun_path = sunaddr->sun_path;
+ int err;
+ unsigned int hash;
+ struct unix_address *addr;
+ struct hlist_head *list;
+
+ err = -EINVAL;
+ if (sunaddr->sun_family != AF_UNIX)
+ goto out;
+
+ if (addr_len == sizeof(short)) {
+ err = unix_autobind(sock);
+ goto out;
+ }
+
+ err = unix_mkname(sunaddr, addr_len, &hash);
+ if (err < 0)
+ goto out;
+ addr_len = err;
+
+ err = mutex_lock_interruptible(&u->readlock);
+ if (err)
+ goto out;
+
+ err = -EINVAL;
+ if (u->addr)
+ goto out_up;
+
+ err = -ENOMEM;
+ addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
+ if (!addr)
+ goto out_up;
+
+ memcpy(addr->name, sunaddr, addr_len);
+ addr->len = addr_len;
+ addr->hash = hash ^ sk->sk_type;
+ atomic_set(&addr->refcnt, 1);
+
+ if (sun_path[0]) {
+ struct path path;
+ umode_t mode = S_IFSOCK |
+ (SOCK_INODE(sock)->i_mode & ~current_umask());
+ err = unix_mknod(sun_path, mode, &path);
+ if (err) {
+ if (err == -EEXIST)
+ err = -EADDRINUSE;
+ unix_release_addr(addr);
+ goto out_up;
+ }
+ addr->hash = UNIX_HASH_SIZE;
+ hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
+ spin_lock(&unix_table_lock);
+ u->path = path;
+ list = &unix_socket_table[hash];
+ } else {
+ spin_lock(&unix_table_lock);
+ err = -EADDRINUSE;
+ if (__unix_find_socket_byname(net, sunaddr, addr_len,
+ sk->sk_type, hash)) {
+ unix_release_addr(addr);
+ goto out_unlock;
+ }
+
+ list = &unix_socket_table[addr->hash];
+ }
+
+ err = 0;
+ __unix_remove_socket(sk);
+ u->addr = addr;
+ __unix_insert_socket(list, sk);
+
+out_unlock:
+ spin_unlock(&unix_table_lock);
+out_up:
+ mutex_unlock(&u->readlock);
+out:
+ return err;
+}
+
+static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
+{
+ if (unlikely(sk1 == sk2) || !sk2) {
+ unix_state_lock(sk1);
+ return;
+ }
+ if (sk1 < sk2) {
+ unix_state_lock(sk1);
+ unix_state_lock_nested(sk2);
+ } else {
+ unix_state_lock(sk2);
+ unix_state_lock_nested(sk1);
+ }
+}
+
+static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
+{
+ if (unlikely(sk1 == sk2) || !sk2) {
+ unix_state_unlock(sk1);
+ return;
+ }
+ unix_state_unlock(sk1);
+ unix_state_unlock(sk2);
+}
+
+static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
+ struct sock *other;
+ unsigned int hash;
+ int err;
+
+ if (addr->sa_family != AF_UNSPEC) {
+ err = unix_mkname(sunaddr, alen, &hash);
+ if (err < 0)
+ goto out;
+ alen = err;
+
+ if (test_bit(SOCK_PASSCRED, &sock->flags) &&
+ !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
+ goto out;
+
+restart:
+ other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
+ if (!other)
+ goto out;
+
+ unix_state_double_lock(sk, other);
+
+ /* Apparently VFS overslept socket death. Retry. */
+ if (sock_flag(other, SOCK_DEAD)) {
+ unix_state_double_unlock(sk, other);
+ sock_put(other);
+ goto restart;
+ }
+
+ err = -EPERM;
+ if (!unix_may_send(sk, other))
+ goto out_unlock;
+
+ err = security_unix_may_send(sk->sk_socket, other->sk_socket);
+ if (err)
+ goto out_unlock;
+
+ } else {
+ /*
+ * 1003.1g breaking connected state with AF_UNSPEC
+ */
+ other = NULL;
+ unix_state_double_lock(sk, other);
+ }
+
+ /*
+ * If it was connected, reconnect.
+ */
+ if (unix_peer(sk)) {
+ struct sock *old_peer = unix_peer(sk);
+ unix_peer(sk) = other;
+ unix_state_double_unlock(sk, other);
+
+ if (other != old_peer)
+ unix_dgram_disconnected(sk, old_peer);
+ sock_put(old_peer);
+ } else {
+ unix_peer(sk) = other;
+ unix_state_double_unlock(sk, other);
+ }
+ return 0;
+
+out_unlock:
+ unix_state_double_unlock(sk, other);
+ sock_put(other);
+out:
+ return err;
+}
+
+static long unix_wait_for_peer(struct sock *other, long timeo)
+{
+ struct unix_sock *u = unix_sk(other);
+ int sched;
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
+
+ sched = !sock_flag(other, SOCK_DEAD) &&
+ !(other->sk_shutdown & RCV_SHUTDOWN) &&
+ unix_recvq_full(other);
+
+ unix_state_unlock(other);
+
+ if (sched)
+ timeo = schedule_timeout(timeo);
+
+ finish_wait(&u->peer_wait, &wait);
+ return timeo;
+}
+
+static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+ struct sock *newsk = NULL;
+ struct sock *other = NULL;
+ struct sk_buff *skb = NULL;
+ unsigned int hash;
+ int st;
+ int err;
+ long timeo;
+
+ err = unix_mkname(sunaddr, addr_len, &hash);
+ if (err < 0)
+ goto out;
+ addr_len = err;
+
+ if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
+ (err = unix_autobind(sock)) != 0)
+ goto out;
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ /* First of all allocate resources.
+ If we will make it after state is locked,
+ we will have to recheck all again in any case.
+ */
+
+ err = -ENOMEM;
+
+ /* create new sock for complete connection */
+ newsk = unix_create1(sock_net(sk), NULL);
+ if (newsk == NULL)
+ goto out;
+
+ /* Allocate skb for sending to listening sock */
+ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+ if (skb == NULL)
+ goto out;
+
+restart:
+ /* Find listening sock. */
+ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
+ if (!other)
+ goto out;
+
+ /* Latch state of peer */
+ unix_state_lock(other);
+
+ /* Apparently VFS overslept socket death. Retry. */
+ if (sock_flag(other, SOCK_DEAD)) {
+ unix_state_unlock(other);
+ sock_put(other);
+ goto restart;
+ }
+
+ err = -ECONNREFUSED;
+ if (other->sk_state != TCP_LISTEN)
+ goto out_unlock;
+ if (other->sk_shutdown & RCV_SHUTDOWN)
+ goto out_unlock;
+
+ if (unix_recvq_full(other)) {
+ err = -EAGAIN;
+ if (!timeo)
+ goto out_unlock;
+
+ timeo = unix_wait_for_peer(other, timeo);
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ sock_put(other);
+ goto restart;
+ }
+
+ /* Latch our state.
+
+ It is tricky place. We need to grab our state lock and cannot
+ drop lock on peer. It is dangerous because deadlock is
+ possible. Connect to self case and simultaneous
+ attempt to connect are eliminated by checking socket
+ state. other is TCP_LISTEN, if sk is TCP_LISTEN we
+ check this before attempt to grab lock.
+
+ Well, and we have to recheck the state after socket locked.
+ */
+ st = sk->sk_state;
+
+ switch (st) {
+ case TCP_CLOSE:
+ /* This is ok... continue with connect */
+ break;
+ case TCP_ESTABLISHED:
+ /* Socket is already connected */
+ err = -EISCONN;
+ goto out_unlock;
+ default:
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ unix_state_lock_nested(sk);
+
+ if (sk->sk_state != st) {
+ unix_state_unlock(sk);
+ unix_state_unlock(other);
+ sock_put(other);
+ goto restart;
+ }
+
+ err = security_unix_stream_connect(sk, other, newsk);
+ if (err) {
+ unix_state_unlock(sk);
+ goto out_unlock;
+ }
+
+ /* The way is open! Fastly set all the necessary fields... */
+
+ sock_hold(sk);
+ unix_peer(newsk) = sk;
+ newsk->sk_state = TCP_ESTABLISHED;
+ newsk->sk_type = sk->sk_type;
+ init_peercred(newsk);
+ newu = unix_sk(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
+ otheru = unix_sk(other);
+
+ /* copy address information from listening to new sock*/
+ if (otheru->addr) {
+ atomic_inc(&otheru->addr->refcnt);
+ newu->addr = otheru->addr;
+ }
+ if (otheru->path.dentry) {
+ path_get(&otheru->path);
+ newu->path = otheru->path;
+ }
+
+ /* Set credentials */
+ copy_peercred(sk, other);
+
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
+ sock_hold(newsk);
+
+ smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
+ unix_peer(sk) = newsk;
+
+ unix_state_unlock(sk);
+
+ /* take ten and and send info to listening sock */
+ spin_lock(&other->sk_receive_queue.lock);
+ __skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_unlock(&other->sk_receive_queue.lock);
+ unix_state_unlock(other);
+ other->sk_data_ready(other);
+ sock_put(other);
+ return 0;
+
+out_unlock:
+ if (other)
+ unix_state_unlock(other);
+
+out:
+ kfree_skb(skb);
+ if (newsk)
+ unix_release_sock(newsk, 0);
+ if (other)
+ sock_put(other);
+ return err;
+}
+
+static int unix_socketpair(struct socket *socka, struct socket *sockb)
+{
+ struct sock *ska = socka->sk, *skb = sockb->sk;
+
+ /* Join our sockets back to back */
+ sock_hold(ska);
+ sock_hold(skb);
+ unix_peer(ska) = skb;
+ unix_peer(skb) = ska;
+ init_peercred(ska);
+ init_peercred(skb);
+
+ if (ska->sk_type != SOCK_DGRAM) {
+ ska->sk_state = TCP_ESTABLISHED;
+ skb->sk_state = TCP_ESTABLISHED;
+ socka->state = SS_CONNECTED;
+ sockb->state = SS_CONNECTED;
+ }
+ return 0;
+}
+
+static void unix_sock_inherit_flags(const struct socket *old,
+ struct socket *new)
+{
+ if (test_bit(SOCK_PASSCRED, &old->flags))
+ set_bit(SOCK_PASSCRED, &new->flags);
+ if (test_bit(SOCK_PASSSEC, &old->flags))
+ set_bit(SOCK_PASSSEC, &new->flags);
+}
+
+static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sock *tsk;
+ struct sk_buff *skb;
+ int err;
+
+ err = -EOPNOTSUPP;
+ if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+ goto out;
+
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out;
+
+ /* If socket state is TCP_LISTEN it cannot change (for now...),
+ * so that no locks are necessary.
+ */
+
+ skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
+ if (!skb) {
+ /* This means receive shutdown. */
+ if (err == 0)
+ err = -EINVAL;
+ goto out;
+ }
+
+ tsk = skb->sk;
+ skb_free_datagram(sk, skb);
+ wake_up_interruptible(&unix_sk(sk)->peer_wait);
+
+ /* attach accepted sock to socket */
+ unix_state_lock(tsk);
+ newsock->state = SS_CONNECTED;
+ unix_sock_inherit_flags(sock, newsock);
+ sock_graft(tsk, newsock);
+ unix_state_unlock(tsk);
+ return 0;
+
+out:
+ return err;
+}
+
+
+static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct unix_sock *u;
+ DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
+ int err = 0;
+
+ if (peer) {
+ sk = unix_peer_get(sk);
+
+ err = -ENOTCONN;
+ if (!sk)
+ goto out;
+ err = 0;
+ } else {
+ sock_hold(sk);
+ }
+
+ u = unix_sk(sk);
+ unix_state_lock(sk);
+ if (!u->addr) {
+ sunaddr->sun_family = AF_UNIX;
+ sunaddr->sun_path[0] = 0;
+ *uaddr_len = sizeof(short);
+ } else {
+ struct unix_address *addr = u->addr;
+
+ *uaddr_len = addr->len;
+ memcpy(sunaddr, addr->name, *uaddr_len);
+ }
+ unix_state_unlock(sk);
+ sock_put(sk);
+out:
+ return err;
+}
+
+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ int i;
+
+ scm->fp = UNIXCB(skb).fp;
+ UNIXCB(skb).fp = NULL;
+
+ for (i = scm->fp->count-1; i >= 0; i--)
+ unix_notinflight(scm->fp->fp[i]);
+}
+
+static void unix_destruct_scm(struct sk_buff *skb)
+{
+ struct scm_cookie scm;
+ memset(&scm, 0, sizeof(scm));
+ scm.pid = UNIXCB(skb).pid;
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(&scm, skb);
+
+ /* Alas, it calls VFS */
+ /* So fscking what? fput() had been SMP-safe since the last Summer */
+ scm_destroy(&scm);
+ sock_wfree(skb);
+}
+
+#define MAX_RECURSION_LEVEL 4
+
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+ int i;
+ unsigned char max_level = 0;
+ int unix_sock_count = 0;
+
+ for (i = scm->fp->count - 1; i >= 0; i--) {
+ struct sock *sk = unix_get_socket(scm->fp->fp[i]);
+
+ if (sk) {
+ unix_sock_count++;
+ max_level = max(max_level,
+ unix_sk(sk)->recursion_level);
+ }
+ }
+ if (unlikely(max_level > MAX_RECURSION_LEVEL))
+ return -ETOOMANYREFS;
+
+ /*
+ * Need to duplicate file references for the sake of garbage
+ * collection. Otherwise a socket in the fps might become a
+ * candidate for GC while the skb is not yet queued.
+ */
+ UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+ if (!UNIXCB(skb).fp)
+ return -ENOMEM;
+
+ if (unix_sock_count) {
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_inflight(scm->fp->fp[i]);
+ }
+ return max_level;
+}
+
+static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+{
+ int err = 0;
+
+ UNIXCB(skb).pid = get_pid(scm->pid);
+ UNIXCB(skb).uid = scm->creds.uid;
+ UNIXCB(skb).gid = scm->creds.gid;
+ UNIXCB(skb).fp = NULL;
+ if (scm->fp && send_fds)
+ err = unix_attach_fds(scm, skb);
+
+ skb->destructor = unix_destruct_scm;
+ return err;
+}
+
+/*
+ * Some apps rely on write() giving SCM_CREDENTIALS
+ * We include credentials if source or destination socket
+ * asserted SOCK_PASSCRED.
+ */
+static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
+ const struct sock *other)
+{
+ if (UNIXCB(skb).pid)
+ return;
+ if (test_bit(SOCK_PASSCRED, &sock->flags) ||
+ !other->sk_socket ||
+ test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
+ UNIXCB(skb).pid = get_pid(task_tgid(current));
+ current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
+ }
+}
+
+/*
+ * Send AF_UNIX data.
+ */
+
+static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct unix_sock *u = unix_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
+ struct sock *other = NULL;
+ int namelen = 0; /* fake GCC */
+ int err;
+ unsigned int hash;
+ struct sk_buff *skb;
+ long timeo;
+ struct scm_cookie scm;
+ int max_level;
+ int data_len = 0;
+
+ wait_for_unix_gc();
+ err = scm_send(sock, msg, &scm, false);
+ if (err < 0)
+ return err;
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto out;
+
+ if (msg->msg_namelen) {
+ err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
+ if (err < 0)
+ goto out;
+ namelen = err;
+ } else {
+ sunaddr = NULL;
+ err = -ENOTCONN;
+ other = unix_peer_get(sk);
+ if (!other)
+ goto out;
+ }
+
+ if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
+ && (err = unix_autobind(sock)) != 0)
+ goto out;
+
+ err = -EMSGSIZE;
+ if (len > sk->sk_sndbuf - 32)
+ goto out;
+
+ if (len > SKB_MAX_ALLOC) {
+ data_len = min_t(size_t,
+ len - SKB_MAX_ALLOC,
+ MAX_SKB_FRAGS * PAGE_SIZE);
+ data_len = PAGE_ALIGN(data_len);
+
+ BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
+ }
+
+ skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
+ msg->msg_flags & MSG_DONTWAIT, &err,
+ PAGE_ALLOC_COSTLY_ORDER);
+ if (skb == NULL)
+ goto out;
+
+ err = unix_scm_to_skb(&scm, skb, true);
+ if (err < 0)
+ goto out_free;
+ max_level = err + 1;
+ unix_get_secdata(&scm, skb);
+
+ skb_put(skb, len - data_len);
+ skb->data_len = data_len;
+ skb->len = len;
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
+ if (err)
+ goto out_free;
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+restart:
+ if (!other) {
+ err = -ECONNRESET;
+ if (sunaddr == NULL)
+ goto out_free;
+
+ other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
+ hash, &err);
+ if (other == NULL)
+ goto out_free;
+ }
+
+ if (sk_filter(other, skb) < 0) {
+ /* Toss the packet but do not return any error to the sender */
+ err = len;
+ goto out_free;
+ }
+
+ unix_state_lock(other);
+ err = -EPERM;
+ if (!unix_may_send(sk, other))
+ goto out_unlock;
+
+ if (sock_flag(other, SOCK_DEAD)) {
+ /*
+ * Check with 1003.1g - what should
+ * datagram error
+ */
+ unix_state_unlock(other);
+ sock_put(other);
+
+ err = 0;
+ unix_state_lock(sk);
+ if (unix_peer(sk) == other) {
+ unix_peer(sk) = NULL;
+ unix_state_unlock(sk);
+
+ unix_dgram_disconnected(sk, other);
+ sock_put(other);
+ err = -ECONNREFUSED;
+ } else {
+ unix_state_unlock(sk);
+ }
+
+ other = NULL;
+ if (err)
+ goto out_free;
+ goto restart;
+ }
+
+ err = -EPIPE;
+ if (other->sk_shutdown & RCV_SHUTDOWN)
+ goto out_unlock;
+
+ if (sk->sk_type != SOCK_SEQPACKET) {
+ err = security_unix_may_send(sk->sk_socket, other->sk_socket);
+ if (err)
+ goto out_unlock;
+ }
+
+ if (unix_peer(other) != sk && unix_recvq_full(other)) {
+ if (!timeo) {
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ timeo = unix_wait_for_peer(other, timeo);
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out_free;
+
+ goto restart;
+ }
+
+ if (sock_flag(other, SOCK_RCVTSTAMP))
+ __net_timestamp(skb);
+ maybe_add_creds(skb, sock, other);
+ skb_queue_tail(&other->sk_receive_queue, skb);
+ if (max_level > unix_sk(other)->recursion_level)
+ unix_sk(other)->recursion_level = max_level;
+ unix_state_unlock(other);
+ other->sk_data_ready(other);
+ sock_put(other);
+ scm_destroy(&scm);
+ return len;
+
+out_unlock:
+ unix_state_unlock(other);
+out_free:
+ kfree_skb(skb);
+out:
+ if (other)
+ sock_put(other);
+ scm_destroy(&scm);
+ return err;
+}
+
+/* We use paged skbs for stream sockets, and limit occupancy to 32768
+ * bytes, and a minimun of a full page.
+ */
+#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
+
+static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct sock *other = NULL;
+ int err, size;
+ struct sk_buff *skb;
+ int sent = 0;
+ struct scm_cookie scm;
+ bool fds_sent = false;
+ int max_level;
+ int data_len;
+
+ wait_for_unix_gc();
+ err = scm_send(sock, msg, &scm, false);
+ if (err < 0)
+ return err;
+
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto out_err;
+
+ if (msg->msg_namelen) {
+ err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
+ goto out_err;
+ } else {
+ err = -ENOTCONN;
+ other = unix_peer(sk);
+ if (!other)
+ goto out_err;
+ }
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto pipe_err;
+
+ while (sent < len) {
+ size = len - sent;
+
+ /* Keep two messages in the pipe so it schedules better */
+ size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
+
+ /* allow fallback to order-0 allocations */
+ size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
+
+ data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
+
+ data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
+
+ skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
+ msg->msg_flags & MSG_DONTWAIT, &err,
+ get_order(UNIX_SKB_FRAGS_SZ));
+ if (!skb)
+ goto out_err;
+
+ /* Only send the fds in the first buffer */
+ err = unix_scm_to_skb(&scm, skb, !fds_sent);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+ max_level = err + 1;
+ fds_sent = true;
+
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err) {
+ kfree_skb(skb);
+ goto out_err;
+ }
+
+ unix_state_lock(other);
+
+ if (sock_flag(other, SOCK_DEAD) ||
+ (other->sk_shutdown & RCV_SHUTDOWN))
+ goto pipe_err_free;
+
+ maybe_add_creds(skb, sock, other);
+ skb_queue_tail(&other->sk_receive_queue, skb);
+ if (max_level > unix_sk(other)->recursion_level)
+ unix_sk(other)->recursion_level = max_level;
+ unix_state_unlock(other);
+ other->sk_data_ready(other);
+ sent += size;
+ }
+
+ scm_destroy(&scm);
+
+ return sent;
+
+pipe_err_free:
+ unix_state_unlock(other);
+ kfree_skb(skb);
+pipe_err:
+ if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
+ err = -EPIPE;
+out_err:
+ scm_destroy(&scm);
+ return sent ? : err;
+}
+
+static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ int err;
+ struct sock *sk = sock->sk;
+
+ err = sock_error(sk);
+ if (err)
+ return err;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ if (msg->msg_namelen)
+ msg->msg_namelen = 0;
+
+ return unix_dgram_sendmsg(sock, msg, len);
+}
+
+static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ return unix_dgram_recvmsg(sock, msg, size, flags);
+}
+
+static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
+{
+ struct unix_sock *u = unix_sk(sk);
+
+ if (u->addr) {
+ msg->msg_namelen = u->addr->len;
+ memcpy(msg->msg_name, u->addr->name, u->addr->len);
+ }
+}
+
+static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct scm_cookie scm;
+ struct sock *sk = sock->sk;
+ struct unix_sock *u = unix_sk(sk);
+ int noblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb;
+ int err;
+ int peeked, skip;
+
+ err = -EOPNOTSUPP;
+ if (flags&MSG_OOB)
+ goto out;
+
+ err = mutex_lock_interruptible(&u->readlock);
+ if (unlikely(err)) {
+ /* recvmsg() in non blocking mode is supposed to return -EAGAIN
+ * sk_rcvtimeo is not honored by mutex_lock_interruptible()
+ */
+ err = noblock ? -EAGAIN : -ERESTARTSYS;
+ goto out;
+ }
+
+ skip = sk_peek_offset(sk, flags);
+
+ skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
+ if (!skb) {
+ unix_state_lock(sk);
+ /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
+ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+ err = 0;
+ unix_state_unlock(sk);
+ goto out_unlock;
+ }
+
+ wake_up_interruptible_sync_poll(&u->peer_wait,
+ POLLOUT | POLLWRNORM | POLLWRBAND);
+
+ if (msg->msg_name)
+ unix_copy_addr(msg, skb->sk);
+
+ if (size > skb->len - skip)
+ size = skb->len - skip;
+ else if (size < skb->len - skip)
+ msg->msg_flags |= MSG_TRUNC;
+
+ err = skb_copy_datagram_msg(skb, skip, msg, size);
+ if (err)
+ goto out_free;
+
+ if (sock_flag(sk, SOCK_RCVTSTAMP))
+ __sock_recv_timestamp(msg, sk, skb);
+
+ memset(&scm, 0, sizeof(scm));
+
+ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
+ unix_set_secdata(&scm, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(&scm, skb);
+
+ sk_peek_offset_bwd(sk, skb->len);
+ } else {
+ /* It is questionable: on PEEK we could:
+ - do not return fds - good, but too simple 8)
+ - return fds, and do not return them on read (old strategy,
+ apparently wrong)
+ - clone fds (I chose it for now, it is the most universal
+ solution)
+
+ POSIX 1003.1g does not actually define this clearly
+ at all. POSIX 1003.1g doesn't define a lot of things
+ clearly however!
+
+ */
+
+ sk_peek_offset_fwd(sk, size);
+
+ if (UNIXCB(skb).fp)
+ scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+ }
+ err = (flags & MSG_TRUNC) ? skb->len - skip : size;
+
+ scm_recv(sock, msg, &scm, flags);
+
+out_free:
+ skb_free_datagram(sk, skb);
+out_unlock:
+ mutex_unlock(&u->readlock);
+out:
+ return err;
+}
+
+/*
+ * Sleep until more data has arrived. But check for races..
+ */
+static long unix_stream_data_wait(struct sock *sk, long timeo,
+ struct sk_buff *last)
+{
+ DEFINE_WAIT(wait);
+
+ unix_state_lock(sk);
+
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ if (skb_peek_tail(&sk->sk_receive_queue) != last ||
+ sk->sk_err ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
+ break;
+
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ unix_state_unlock(sk);
+ timeo = freezable_schedule_timeout(timeo);
+ unix_state_lock(sk);
+
+ if (sock_flag(sk, SOCK_DEAD))
+ break;
+
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ }
+
+ finish_wait(sk_sleep(sk), &wait);
+ unix_state_unlock(sk);
+ return timeo;
+}
+
+static unsigned int unix_skb_len(const struct sk_buff *skb)
+{
+ return skb->len - UNIXCB(skb).consumed;
+}
+
+static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct scm_cookie scm;
+ struct sock *sk = sock->sk;
+ struct unix_sock *u = unix_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
+ int copied = 0;
+ int noblock = flags & MSG_DONTWAIT;
+ int check_creds = 0;
+ int target;
+ int err = 0;
+ long timeo;
+ int skip;
+
+ err = -EINVAL;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ err = -EOPNOTSUPP;
+ if (flags&MSG_OOB)
+ goto out;
+
+ target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, noblock);
+
+ /* Lock the socket to prevent queue disordering
+ * while sleeps in memcpy_tomsg
+ */
+
+ memset(&scm, 0, sizeof(scm));
+
+ err = mutex_lock_interruptible(&u->readlock);
+ if (unlikely(err)) {
+ /* recvmsg() in non blocking mode is supposed to return -EAGAIN
+ * sk_rcvtimeo is not honored by mutex_lock_interruptible()
+ */
+ err = noblock ? -EAGAIN : -ERESTARTSYS;
+ goto out;
+ }
+
+ do {
+ int chunk;
+ struct sk_buff *skb, *last;
+
+ unix_state_lock(sk);
+ if (sock_flag(sk, SOCK_DEAD)) {
+ err = -ECONNRESET;
+ goto unlock;
+ }
+ last = skb = skb_peek(&sk->sk_receive_queue);
+again:
+ if (skb == NULL) {
+ unix_sk(sk)->recursion_level = 0;
+ if (copied >= target)
+ goto unlock;
+
+ /*
+ * POSIX 1003.1g mandates this order.
+ */
+
+ err = sock_error(sk);
+ if (err)
+ goto unlock;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto unlock;
+
+ unix_state_unlock(sk);
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ mutex_unlock(&u->readlock);
+
+ timeo = unix_stream_data_wait(sk, timeo, last);
+
+ if (signal_pending(current)
+ || mutex_lock_interruptible(&u->readlock)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ continue;
+ unlock:
+ unix_state_unlock(sk);
+ break;
+ }
+
+ skip = sk_peek_offset(sk, flags);
+ while (skip >= unix_skb_len(skb)) {
+ skip -= unix_skb_len(skb);
+ last = skb;
+ skb = skb_peek_next(skb, &sk->sk_receive_queue);
+ if (!skb)
+ goto again;
+ }
+
+ unix_state_unlock(sk);
+
+ if (check_creds) {
+ /* Never glue messages from different writers */
+ if ((UNIXCB(skb).pid != scm.pid) ||
+ !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
+ !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
+ break;
+ } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
+ /* Copy credentials */
+ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
+ check_creds = 1;
+ }
+
+ /* Copy address just once */
+ if (sunaddr) {
+ unix_copy_addr(msg, skb->sk);
+ sunaddr = NULL;
+ }
+
+ chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
+ if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
+ msg, chunk)) {
+ if (copied == 0)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ /* Mark read part of skb as used */
+ if (!(flags & MSG_PEEK)) {
+ UNIXCB(skb).consumed += chunk;
+
+ sk_peek_offset_bwd(sk, chunk);
+
+ if (UNIXCB(skb).fp)
+ unix_detach_fds(&scm, skb);
+
+ if (unix_skb_len(skb))
+ break;
+
+ skb_unlink(skb, &sk->sk_receive_queue);
+ consume_skb(skb);
+
+ if (scm.fp)
+ break;
+ } else {
+ /* It is questionable, see note in unix_dgram_recvmsg.
+ */
+ if (UNIXCB(skb).fp)
+ scm.fp = scm_fp_dup(UNIXCB(skb).fp);
+
+ sk_peek_offset_fwd(sk, chunk);
+
+ break;
+ }
+ } while (size);
+
+ mutex_unlock(&u->readlock);
+ scm_recv(sock, msg, &scm, flags);
+out:
+ return copied ? : err;
+}
+
+static int unix_shutdown(struct socket *sock, int mode)
+{
+ struct sock *sk = sock->sk;
+ struct sock *other;
+
+ if (mode < SHUT_RD || mode > SHUT_RDWR)
+ return -EINVAL;
+ /* This maps:
+ * SHUT_RD (0) -> RCV_SHUTDOWN (1)
+ * SHUT_WR (1) -> SEND_SHUTDOWN (2)
+ * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
+ */
+ ++mode;
+
+ unix_state_lock(sk);
+ sk->sk_shutdown |= mode;
+ other = unix_peer(sk);
+ if (other)
+ sock_hold(other);
+ unix_state_unlock(sk);
+ sk->sk_state_change(sk);
+
+ if (other &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
+
+ int peer_mode = 0;
+
+ if (mode&RCV_SHUTDOWN)
+ peer_mode |= SEND_SHUTDOWN;
+ if (mode&SEND_SHUTDOWN)
+ peer_mode |= RCV_SHUTDOWN;
+ unix_state_lock(other);
+ other->sk_shutdown |= peer_mode;
+ unix_state_unlock(other);
+ other->sk_state_change(other);
+ if (peer_mode == SHUTDOWN_MASK)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
+ else if (peer_mode & RCV_SHUTDOWN)
+ sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
+ }
+ if (other)
+ sock_put(other);
+
+ return 0;
+}
+
+long unix_inq_len(struct sock *sk)
+{
+ struct sk_buff *skb;
+ long amount = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -EINVAL;
+
+ spin_lock(&sk->sk_receive_queue.lock);
+ if (sk->sk_type == SOCK_STREAM ||
+ sk->sk_type == SOCK_SEQPACKET) {
+ skb_queue_walk(&sk->sk_receive_queue, skb)
+ amount += unix_skb_len(skb);
+ } else {
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb)
+ amount = skb->len;
+ }
+ spin_unlock(&sk->sk_receive_queue.lock);
+
+ return amount;
+}
+EXPORT_SYMBOL_GPL(unix_inq_len);
+
+long unix_outq_len(struct sock *sk)
+{
+ return sk_wmem_alloc_get(sk);
+}
+EXPORT_SYMBOL_GPL(unix_outq_len);
+
+static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ long amount = 0;
+ int err;
+
+ switch (cmd) {
+ case SIOCOUTQ:
+ amount = unix_outq_len(sk);
+ err = put_user(amount, (int __user *)arg);
+ break;
+ case SIOCINQ:
+ amount = unix_inq_len(sk);
+ if (amount < 0)
+ err = amount;
+ else
+ err = put_user(amount, (int __user *)arg);
+ break;
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ return err;
+}
+
+static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
+ sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+
+ /*
+ * we set writable also when the other side has shut down the
+ * connection. This prevents stuck sockets.
+ */
+ if (unix_writable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ return mask;
+}
+
+static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk, *other;
+ unsigned int mask, writable;
+
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR |
+ (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connection-based need to check for termination and startup */
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ if (sk->sk_state == TCP_CLOSE)
+ mask |= POLLHUP;
+ /* connection hasn't started yet? */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return mask;
+ }
+
+ /* No write status requested, avoid expensive OUT tests. */
+ if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
+ return mask;
+
+ writable = unix_writable(sk);
+ other = unix_peer_get(sk);
+ if (other) {
+ if (unix_peer(other) != sk) {
+ sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
+ if (unix_recvq_full(other))
+ writable = 0;
+ }
+ sock_put(other);
+ }
+
+ if (writable)
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+ return mask;
+}
+
+#ifdef CONFIG_PROC_FS
+
+#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
+
+static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
+{
+ unsigned long offset = get_offset(*pos);
+ unsigned long bucket = get_bucket(*pos);
+ struct sock *sk;
+ unsigned long count = 0;
+
+ for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
+ if (sock_net(sk) != seq_file_net(seq))
+ continue;
+ if (++count == offset)
+ break;
+ }
+
+ return sk;
+}
+
+static struct sock *unix_next_socket(struct seq_file *seq,
+ struct sock *sk,
+ loff_t *pos)
+{
+ unsigned long bucket;
+
+ while (sk > (struct sock *)SEQ_START_TOKEN) {
+ sk = sk_next(sk);
+ if (!sk)
+ goto next_bucket;
+ if (sock_net(sk) == seq_file_net(seq))
+ return sk;
+ }
+
+ do {
+ sk = unix_from_bucket(seq, pos);
+ if (sk)
+ return sk;
+
+next_bucket:
+ bucket = get_bucket(*pos) + 1;
+ *pos = set_bucket_offset(bucket, 1);
+ } while (bucket < ARRAY_SIZE(unix_socket_table));
+
+ return NULL;
+}
+
+static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(unix_table_lock)
+{
+ spin_lock(&unix_table_lock);
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
+ return NULL;
+
+ return unix_next_socket(seq, NULL, pos);
+}
+
+static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return unix_next_socket(seq, v, pos);
+}
+
+static void unix_seq_stop(struct seq_file *seq, void *v)
+ __releases(unix_table_lock)
+{
+ spin_unlock(&unix_table_lock);
+}
+
+static int unix_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "Num RefCount Protocol Flags Type St "
+ "Inode Path\n");
+ else {
+ struct sock *s = v;
+ struct unix_sock *u = unix_sk(s);
+ unix_state_lock(s);
+
+ seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
+ s,
+ atomic_read(&s->sk_refcnt),
+ 0,
+ s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
+ s->sk_type,
+ s->sk_socket ?
+ (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
+ (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
+ sock_i_ino(s));
+
+ if (u->addr) {
+ int i, len;
+ seq_putc(seq, ' ');
+
+ i = 0;
+ len = u->addr->len - sizeof(short);
+ if (!UNIX_ABSTRACT(s))
+ len--;
+ else {
+ seq_putc(seq, '@');
+ i++;
+ }
+ for ( ; i < len; i++)
+ seq_putc(seq, u->addr->name->sun_path[i]);
+ }
+ unix_state_unlock(s);
+ seq_putc(seq, '\n');
+ }
+
+ return 0;
+}
+
+static const struct seq_operations unix_seq_ops = {
+ .start = unix_seq_start,
+ .next = unix_seq_next,
+ .stop = unix_seq_stop,
+ .show = unix_seq_show,
+};
+
+static int unix_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &unix_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations unix_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = unix_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+static const struct net_proto_family unix_family_ops = {
+ .family = PF_UNIX,
+ .create = unix_create,
+ .owner = THIS_MODULE,
+};
+
+
+static int __net_init unix_net_init(struct net *net)
+{
+ int error = -ENOMEM;
+
+ net->unx.sysctl_max_dgram_qlen = 10;
+ if (unix_sysctl_register(net))
+ goto out;
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
+ unix_sysctl_unregister(net);
+ goto out;
+ }
+#endif
+ error = 0;
+out:
+ return error;
+}
+
+static void __net_exit unix_net_exit(struct net *net)
+{
+ unix_sysctl_unregister(net);
+ remove_proc_entry("unix", net->proc_net);
+}
+
+static struct pernet_operations unix_net_ops = {
+ .init = unix_net_init,
+ .exit = unix_net_exit,
+};
+
+static int __init af_unix_init(void)
+{
+ int rc = -1;
+
+ BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
+
+ rc = proto_register(&unix_proto, 1);
+ if (rc != 0) {
+ pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
+ goto out;
+ }
+
+ sock_register(&unix_family_ops);
+ register_pernet_subsys(&unix_net_ops);
+out:
+ return rc;
+}
+
+static void __exit af_unix_exit(void)
+{
+ sock_unregister(PF_UNIX);
+ proto_unregister(&unix_proto);
+ unregister_pernet_subsys(&unix_net_ops);
+}
+
+/* Earlier than device_initcall() so that other drivers invoking
+ request_module() don't end up in a loop when modprobe tries
+ to use a UNIX socket. But later than subsys_initcall() because
+ we depend on stuff initialised there */
+fs_initcall(af_unix_init);
+module_exit(af_unix_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_UNIX);
diff --git a/net/unix/diag.c b/net/unix/diag.c
new file mode 100644
index 000000000..c512f64d5
--- /dev/null
+++ b/net/unix/diag.c
@@ -0,0 +1,328 @@
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/sock_diag.h>
+#include <linux/unix_diag.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/netlink.h>
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct unix_address *addr = unix_sk(sk)->addr;
+
+ if (!addr)
+ return 0;
+
+ return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short),
+ addr->name->sun_path);
+}
+
+static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct dentry *dentry = unix_sk(sk)->path.dentry;
+
+ if (dentry) {
+ struct unix_diag_vfs uv = {
+ .udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
+ .udiag_vfs_dev = dentry->d_sb->s_dev,
+ };
+
+ return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv);
+ }
+
+ return 0;
+}
+
+static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct sock *peer;
+ int ino;
+
+ peer = unix_peer_get(sk);
+ if (peer) {
+ unix_state_lock(peer);
+ ino = sock_i_ino(peer);
+ unix_state_unlock(peer);
+ sock_put(peer);
+
+ return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino);
+ }
+
+ return 0;
+}
+
+static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct sk_buff *skb;
+ struct nlattr *attr;
+ u32 *buf;
+ int i;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ spin_lock(&sk->sk_receive_queue.lock);
+
+ attr = nla_reserve(nlskb, UNIX_DIAG_ICONS,
+ sk->sk_receive_queue.qlen * sizeof(u32));
+ if (!attr)
+ goto errout;
+
+ buf = nla_data(attr);
+ i = 0;
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ struct sock *req, *peer;
+
+ req = skb->sk;
+ /*
+ * The state lock is outer for the same sk's
+ * queue lock. With the other's queue locked it's
+ * OK to lock the state.
+ */
+ unix_state_lock_nested(req);
+ peer = unix_sk(req)->peer;
+ buf[i++] = (peer ? sock_i_ino(peer) : 0);
+ unix_state_unlock(req);
+ }
+ spin_unlock(&sk->sk_receive_queue.lock);
+ }
+
+ return 0;
+
+errout:
+ spin_unlock(&sk->sk_receive_queue.lock);
+ return -EMSGSIZE;
+}
+
+static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
+{
+ struct unix_diag_rqlen rql;
+
+ if (sk->sk_state == TCP_LISTEN) {
+ rql.udiag_rqueue = sk->sk_receive_queue.qlen;
+ rql.udiag_wqueue = sk->sk_max_ack_backlog;
+ } else {
+ rql.udiag_rqueue = (u32) unix_inq_len(sk);
+ rql.udiag_wqueue = (u32) unix_outq_len(sk);
+ }
+
+ return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
+ u32 portid, u32 seq, u32 flags, int sk_ino)
+{
+ struct nlmsghdr *nlh;
+ struct unix_diag_msg *rep;
+
+ nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
+ flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ rep = nlmsg_data(nlh);
+ rep->udiag_family = AF_UNIX;
+ rep->udiag_type = sk->sk_type;
+ rep->udiag_state = sk->sk_state;
+ rep->pad = 0;
+ rep->udiag_ino = sk_ino;
+ sock_diag_save_cookie(sk, rep->udiag_cookie);
+
+ if ((req->udiag_show & UDIAG_SHOW_NAME) &&
+ sk_diag_dump_name(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->udiag_show & UDIAG_SHOW_VFS) &&
+ sk_diag_dump_vfs(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->udiag_show & UDIAG_SHOW_PEER) &&
+ sk_diag_dump_peer(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->udiag_show & UDIAG_SHOW_ICONS) &&
+ sk_diag_dump_icons(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->udiag_show & UDIAG_SHOW_RQLEN) &&
+ sk_diag_show_rqlen(sk, skb))
+ goto out_nlmsg_trim;
+
+ if ((req->udiag_show & UDIAG_SHOW_MEMINFO) &&
+ sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
+ goto out_nlmsg_trim;
+
+ if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto out_nlmsg_trim;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+out_nlmsg_trim:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
+ u32 portid, u32 seq, u32 flags)
+{
+ int sk_ino;
+
+ unix_state_lock(sk);
+ sk_ino = sock_i_ino(sk);
+ unix_state_unlock(sk);
+
+ if (!sk_ino)
+ return 0;
+
+ return sk_diag_fill(sk, skb, req, portid, seq, flags, sk_ino);
+}
+
+static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct unix_diag_req *req;
+ int num, s_num, slot, s_slot;
+ struct net *net = sock_net(skb->sk);
+
+ req = nlmsg_data(cb->nlh);
+
+ s_slot = cb->args[0];
+ num = s_num = cb->args[1];
+
+ spin_lock(&unix_table_lock);
+ for (slot = s_slot;
+ slot < ARRAY_SIZE(unix_socket_table);
+ s_num = 0, slot++) {
+ struct sock *sk;
+
+ num = 0;
+ sk_for_each(sk, &unix_socket_table[slot]) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next;
+ if (!(req->udiag_states & (1 << sk->sk_state)))
+ goto next;
+ if (sk_diag_dump(sk, skb, req,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI) < 0)
+ goto done;
+next:
+ num++;
+ }
+ }
+done:
+ spin_unlock(&unix_table_lock);
+ cb->args[0] = slot;
+ cb->args[1] = num;
+
+ return skb->len;
+}
+
+static struct sock *unix_lookup_by_ino(int ino)
+{
+ int i;
+ struct sock *sk;
+
+ spin_lock(&unix_table_lock);
+ for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
+ sk_for_each(sk, &unix_socket_table[i])
+ if (ino == sock_i_ino(sk)) {
+ sock_hold(sk);
+ spin_unlock(&unix_table_lock);
+
+ return sk;
+ }
+ }
+
+ spin_unlock(&unix_table_lock);
+ return NULL;
+}
+
+static int unix_diag_get_exact(struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ struct unix_diag_req *req)
+{
+ int err = -EINVAL;
+ struct sock *sk;
+ struct sk_buff *rep;
+ unsigned int extra_len;
+ struct net *net = sock_net(in_skb->sk);
+
+ if (req->udiag_ino == 0)
+ goto out_nosk;
+
+ sk = unix_lookup_by_ino(req->udiag_ino);
+ err = -ENOENT;
+ if (sk == NULL)
+ goto out_nosk;
+
+ err = sock_diag_check_cookie(sk, req->udiag_cookie);
+ if (err)
+ goto out;
+
+ extra_len = 256;
+again:
+ err = -ENOMEM;
+ rep = nlmsg_new(sizeof(struct unix_diag_msg) + extra_len, GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0, req->udiag_ino);
+ if (err < 0) {
+ nlmsg_free(rep);
+ extra_len += 256;
+ if (extra_len >= PAGE_SIZE)
+ goto out;
+
+ goto again;
+ }
+ err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+out:
+ if (sk)
+ sock_put(sk);
+out_nosk:
+ return err;
+}
+
+static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ int hdrlen = sizeof(struct unix_diag_req);
+ struct net *net = sock_net(skb->sk);
+
+ if (nlmsg_len(h) < hdrlen)
+ return -EINVAL;
+
+ if (h->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .dump = unix_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ } else
+ return unix_diag_get_exact(skb, h, nlmsg_data(h));
+}
+
+static const struct sock_diag_handler unix_diag_handler = {
+ .family = AF_UNIX,
+ .dump = unix_diag_handler_dump,
+};
+
+static int __init unix_diag_init(void)
+{
+ return sock_diag_register(&unix_diag_handler);
+}
+
+static void __exit unix_diag_exit(void)
+{
+ sock_diag_unregister(&unix_diag_handler);
+}
+
+module_init(unix_diag_init);
+module_exit(unix_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
new file mode 100644
index 000000000..a73a226f2
--- /dev/null
+++ b/net/unix/garbage.c
@@ -0,0 +1,372 @@
+/*
+ * NET3: Garbage Collector For AF_UNIX sockets
+ *
+ * Garbage Collector:
+ * Copyright (C) Barak A. Pearlmutter.
+ * Released under the GPL version 2 or later.
+ *
+ * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
+ * If it doesn't work blame me, it worked when Barak sent it.
+ *
+ * Assumptions:
+ *
+ * - object w/ a bit
+ * - free list
+ *
+ * Current optimizations:
+ *
+ * - explicit stack instead of recursion
+ * - tail recurse on first born instead of immediate push/pop
+ * - we gather the stuff that should not be killed into tree
+ * and stack is just a path from root to the current pointer.
+ *
+ * Future optimizations:
+ *
+ * - don't just push entire root set; process in place
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed.
+ * Cope with changing max_files.
+ * Al Viro 11 Oct 1998
+ * Graph may have cycles. That is, we can send the descriptor
+ * of foo to bar and vice versa. Current code chokes on that.
+ * Fix: move SCM_RIGHTS ones into the separate list and then
+ * skb_free() them all instead of doing explicit fput's.
+ * Another problem: since fput() may block somebody may
+ * create a new unix_socket when we are in the middle of sweep
+ * phase. Fix: revert the logic wrt MARKED. Mark everything
+ * upon the beginning and unmark non-junk ones.
+ *
+ * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
+ * sent to connect()'ed but still not accept()'ed sockets.
+ * Fixed. Old code had slightly different problem here:
+ * extra fput() in situation when we passed the descriptor via
+ * such socket and closed it (descriptor). That would happen on
+ * each unix_gc() until the accept(). Since the struct file in
+ * question would go to the free list and might be reused...
+ * That might be the reason of random oopses on filp_close()
+ * in unrelated processes.
+ *
+ * AV 28 Feb 1999
+ * Kill the explicit allocation of stack. Now we keep the tree
+ * with root in dummy + pointer (gc_current) to one of the nodes.
+ * Stack is represented as path from gc_current to dummy. Unmark
+ * now means "add to tree". Push == "make it a son of gc_current".
+ * Pop == "move gc_current to parent". We keep only pointers to
+ * parents (->gc_tree).
+ * AV 1 Mar 1999
+ * Damn. Added missing check for ->dead in listen queues scanning.
+ *
+ * Miklos Szeredi 25 Jun 2007
+ * Reimplement with a cycle collecting algorithm. This should
+ * solve several problems with the previous code, like being racy
+ * wrt receive and holding up unrelated socket operations.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <net/tcp_states.h>
+
+/* Internal data structures and random procedures: */
+
+static LIST_HEAD(gc_inflight_list);
+static LIST_HEAD(gc_candidates);
+static DEFINE_SPINLOCK(unix_gc_lock);
+static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+
+unsigned int unix_tot_inflight;
+
+struct sock *unix_get_socket(struct file *filp)
+{
+ struct sock *u_sock = NULL;
+ struct inode *inode = file_inode(filp);
+
+ /* Socket ? */
+ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+ struct socket *sock = SOCKET_I(inode);
+ struct sock *s = sock->sk;
+
+ /* PF_UNIX ? */
+ if (s && sock->ops && sock->ops->family == PF_UNIX)
+ u_sock = s;
+ }
+ return u_sock;
+}
+
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
+ */
+
+void unix_inflight(struct file *fp)
+{
+ struct sock *s = unix_get_socket(fp);
+
+ if (s) {
+ struct unix_sock *u = unix_sk(s);
+
+ spin_lock(&unix_gc_lock);
+
+ if (atomic_long_inc_return(&u->inflight) == 1) {
+ BUG_ON(!list_empty(&u->link));
+ list_add_tail(&u->link, &gc_inflight_list);
+ } else {
+ BUG_ON(list_empty(&u->link));
+ }
+ unix_tot_inflight++;
+ spin_unlock(&unix_gc_lock);
+ }
+}
+
+void unix_notinflight(struct file *fp)
+{
+ struct sock *s = unix_get_socket(fp);
+
+ if (s) {
+ struct unix_sock *u = unix_sk(s);
+
+ spin_lock(&unix_gc_lock);
+ BUG_ON(list_empty(&u->link));
+
+ if (atomic_long_dec_and_test(&u->inflight))
+ list_del_init(&u->link);
+ unix_tot_inflight--;
+ spin_unlock(&unix_gc_lock);
+ }
+}
+
+static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+ struct sk_buff_head *hitlist)
+{
+ struct sk_buff *skb;
+ struct sk_buff *next;
+
+ spin_lock(&x->sk_receive_queue.lock);
+ skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+ /* Do we have file descriptors ? */
+ if (UNIXCB(skb).fp) {
+ bool hit = false;
+ /* Process the descriptors of this socket */
+ int nfd = UNIXCB(skb).fp->count;
+ struct file **fp = UNIXCB(skb).fp->fp;
+
+ while (nfd--) {
+ /* Get the socket the fd matches if it indeed does so */
+ struct sock *sk = unix_get_socket(*fp++);
+
+ if (sk) {
+ struct unix_sock *u = unix_sk(sk);
+
+ /* Ignore non-candidates, they could
+ * have been added to the queues after
+ * starting the garbage collection
+ */
+ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+ hit = true;
+
+ func(u);
+ }
+ }
+ }
+ if (hit && hitlist != NULL) {
+ __skb_unlink(skb, &x->sk_receive_queue);
+ __skb_queue_tail(hitlist, skb);
+ }
+ }
+ }
+ spin_unlock(&x->sk_receive_queue.lock);
+}
+
+static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
+ struct sk_buff_head *hitlist)
+{
+ if (x->sk_state != TCP_LISTEN) {
+ scan_inflight(x, func, hitlist);
+ } else {
+ struct sk_buff *skb;
+ struct sk_buff *next;
+ struct unix_sock *u;
+ LIST_HEAD(embryos);
+
+ /* For a listening socket collect the queued embryos
+ * and perform a scan on them as well.
+ */
+ spin_lock(&x->sk_receive_queue.lock);
+ skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+ u = unix_sk(skb->sk);
+
+ /* An embryo cannot be in-flight, so it's safe
+ * to use the list link.
+ */
+ BUG_ON(!list_empty(&u->link));
+ list_add_tail(&u->link, &embryos);
+ }
+ spin_unlock(&x->sk_receive_queue.lock);
+
+ while (!list_empty(&embryos)) {
+ u = list_entry(embryos.next, struct unix_sock, link);
+ scan_inflight(&u->sk, func, hitlist);
+ list_del_init(&u->link);
+ }
+ }
+}
+
+static void dec_inflight(struct unix_sock *usk)
+{
+ atomic_long_dec(&usk->inflight);
+}
+
+static void inc_inflight(struct unix_sock *usk)
+{
+ atomic_long_inc(&usk->inflight);
+}
+
+static void inc_inflight_move_tail(struct unix_sock *u)
+{
+ atomic_long_inc(&u->inflight);
+ /* If this still might be part of a cycle, move it to the end
+ * of the list, so that it's checked even if it was already
+ * passed over
+ */
+ if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
+ list_move_tail(&u->link, &gc_candidates);
+}
+
+static bool gc_in_progress;
+#define UNIX_INFLIGHT_TRIGGER_GC 16000
+
+void wait_for_unix_gc(void)
+{
+ /* If number of inflight sockets is insane,
+ * force a garbage collect right now.
+ */
+ if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
+ unix_gc();
+ wait_event(unix_gc_wait, gc_in_progress == false);
+}
+
+/* The external entry point: unix_gc() */
+void unix_gc(void)
+{
+ struct unix_sock *u;
+ struct unix_sock *next;
+ struct sk_buff_head hitlist;
+ struct list_head cursor;
+ LIST_HEAD(not_cycle_list);
+
+ spin_lock(&unix_gc_lock);
+
+ /* Avoid a recursive GC. */
+ if (gc_in_progress)
+ goto out;
+
+ gc_in_progress = true;
+ /* First, select candidates for garbage collection. Only
+ * in-flight sockets are considered, and from those only ones
+ * which don't have any external reference.
+ *
+ * Holding unix_gc_lock will protect these candidates from
+ * being detached, and hence from gaining an external
+ * reference. Since there are no possible receivers, all
+ * buffers currently on the candidates' queues stay there
+ * during the garbage collection.
+ *
+ * We also know that no new candidate can be added onto the
+ * receive queues. Other, non candidate sockets _can_ be
+ * added to queue, so we must make sure only to touch
+ * candidates.
+ */
+ list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+ long total_refs;
+ long inflight_refs;
+
+ total_refs = file_count(u->sk.sk_socket->file);
+ inflight_refs = atomic_long_read(&u->inflight);
+
+ BUG_ON(inflight_refs < 1);
+ BUG_ON(total_refs < inflight_refs);
+ if (total_refs == inflight_refs) {
+ list_move_tail(&u->link, &gc_candidates);
+ __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+ __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+ }
+ }
+
+ /* Now remove all internal in-flight reference to children of
+ * the candidates.
+ */
+ list_for_each_entry(u, &gc_candidates, link)
+ scan_children(&u->sk, dec_inflight, NULL);
+
+ /* Restore the references for children of all candidates,
+ * which have remaining references. Do this recursively, so
+ * only those remain, which form cyclic references.
+ *
+ * Use a "cursor" link, to make the list traversal safe, even
+ * though elements might be moved about.
+ */
+ list_add(&cursor, &gc_candidates);
+ while (cursor.next != &gc_candidates) {
+ u = list_entry(cursor.next, struct unix_sock, link);
+
+ /* Move cursor to after the current position. */
+ list_move(&cursor, &u->link);
+
+ if (atomic_long_read(&u->inflight) > 0) {
+ list_move_tail(&u->link, &not_cycle_list);
+ __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+ scan_children(&u->sk, inc_inflight_move_tail, NULL);
+ }
+ }
+ list_del(&cursor);
+
+ /* not_cycle_list contains those sockets which do not make up a
+ * cycle. Restore these to the inflight list.
+ */
+ while (!list_empty(&not_cycle_list)) {
+ u = list_entry(not_cycle_list.next, struct unix_sock, link);
+ __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+ list_move_tail(&u->link, &gc_inflight_list);
+ }
+
+ /* Now gc_candidates contains only garbage. Restore original
+ * inflight counters for these as well, and remove the skbuffs
+ * which are creating the cycle(s).
+ */
+ skb_queue_head_init(&hitlist);
+ list_for_each_entry(u, &gc_candidates, link)
+ scan_children(&u->sk, inc_inflight, &hitlist);
+
+ spin_unlock(&unix_gc_lock);
+
+ /* Here we are. Hitlist is filled. Die. */
+ __skb_queue_purge(&hitlist);
+
+ spin_lock(&unix_gc_lock);
+
+ /* All candidates should have been detached by now. */
+ BUG_ON(!list_empty(&gc_candidates));
+ gc_in_progress = false;
+ wake_up(&unix_gc_wait);
+
+ out:
+ spin_unlock(&unix_gc_lock);
+}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
new file mode 100644
index 000000000..b3d515021
--- /dev/null
+++ b/net/unix/sysctl_net_unix.c
@@ -0,0 +1,61 @@
+/*
+ * NET4: Sysctl interface to net af_unix subsystem.
+ *
+ * Authors: Mike Shaver.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+
+#include <net/af_unix.h>
+
+static struct ctl_table unix_table[] = {
+ {
+ .procname = "max_dgram_qlen",
+ .data = &init_net.unx.sysctl_max_dgram_qlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ { }
+};
+
+int __net_init unix_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL);
+ if (table == NULL)
+ goto err_alloc;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ table[0].data = &net->unx.sysctl_max_dgram_qlen;
+ net->unx.ctl = register_net_sysctl(net, "net/unix", table);
+ if (net->unx.ctl == NULL)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+void unix_sysctl_unregister(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->unx.ctl->ctl_table_arg;
+ unregister_net_sysctl_table(net->unx.ctl);
+ kfree(table);
+}
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
new file mode 100644
index 000000000..14810abed
--- /dev/null
+++ b/net/vmw_vsock/Kconfig
@@ -0,0 +1,28 @@
+#
+# Vsock protocol
+#
+
+config VSOCKETS
+ tristate "Virtual Socket protocol"
+ help
+ Virtual Socket Protocol is a socket protocol similar to TCP/IP
+ allowing communication between Virtual Machines and hypervisor
+ or host.
+
+ You should also select one or more hypervisor-specific transports
+ below.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vsock. If unsure, say N.
+
+config VMWARE_VMCI_VSOCKETS
+ tristate "VMware VMCI transport for Virtual Sockets"
+ depends on VSOCKETS && VMWARE_VMCI
+ help
+ This module implements a VMCI transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine runs on a VMware
+ hypervisor.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vmw_vsock_vmci_transport. If unsure, say N.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
new file mode 100644
index 000000000..2ce52d70f
--- /dev/null
+++ b/net/vmw_vsock/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_VSOCKETS) += vsock.o
+obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+
+vsock-y += af_vsock.o vsock_addr.o
+
+vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
+ vmci_transport_notify_qstate.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
new file mode 100644
index 000000000..2ec86e652
--- /dev/null
+++ b/net/vmw_vsock/af_vsock.c
@@ -0,0 +1,1999 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+/* Implementation notes:
+ *
+ * - There are two kinds of sockets: those created by user action (such as
+ * calling socket(2)) and those created by incoming connection request packets.
+ *
+ * - There are two "global" tables, one for bound sockets (sockets that have
+ * specified an address that they are responsible for) and one for connected
+ * sockets (sockets that have established a connection with another socket).
+ * These tables are "global" in that all sockets on the system are placed
+ * within them. - Note, though, that the bound table contains an extra entry
+ * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
+ * that list. The bound table is used solely for lookup of sockets when packets
+ * are received and that's not necessary for SOCK_DGRAM sockets since we create
+ * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
+ * sockets out of the bound hash buckets will reduce the chance of collisions
+ * when looking for SOCK_STREAM sockets and prevents us from having to check the
+ * socket type in the hash table lookups.
+ *
+ * - Sockets created by user action will either be "client" sockets that
+ * initiate a connection or "server" sockets that listen for connections; we do
+ * not support simultaneous connects (two "client" sockets connecting).
+ *
+ * - "Server" sockets are referred to as listener sockets throughout this
+ * implementation because they are in the SS_LISTEN state. When a connection
+ * request is received (the second kind of socket mentioned above), we create a
+ * new socket and refer to it as a pending socket. These pending sockets are
+ * placed on the pending connection list of the listener socket. When future
+ * packets are received for the address the listener socket is bound to, we
+ * check if the source of the packet is from one that has an existing pending
+ * connection. If it does, we process the packet for the pending socket. When
+ * that socket reaches the connected state, it is removed from the listener
+ * socket's pending list and enqueued in the listener socket's accept queue.
+ * Callers of accept(2) will accept connected sockets from the listener socket's
+ * accept queue. If the socket cannot be accepted for some reason then it is
+ * marked rejected. Once the connection is accepted, it is owned by the user
+ * process and the responsibility for cleanup falls with that user process.
+ *
+ * - It is possible that these pending sockets will never reach the connected
+ * state; in fact, we may never receive another packet after the connection
+ * request. Because of this, we must schedule a cleanup function to run in the
+ * future, after some amount of time passes where a connection should have been
+ * established. This function ensures that the socket is off all lists so it
+ * cannot be retrieved, then drops all references to the socket so it is cleaned
+ * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
+ * function will also cleanup rejected sockets, those that reach the connected
+ * state but leave it before they have been accepted.
+ *
+ * - Sockets created by user action will be cleaned up when the user process
+ * calls close(2), causing our release implementation to be called. Our release
+ * implementation will perform some cleanup then drop the last reference so our
+ * sk_destruct implementation is invoked. Our sk_destruct implementation will
+ * perform additional cleanup that's common for both types of sockets.
+ *
+ * - A socket's reference count is what ensures that the structure won't be
+ * freed. Each entry in a list (such as the "global" bound and connected tables
+ * and the listener socket's pending list and connected queue) ensures a
+ * reference. When we defer work until process context and pass a socket as our
+ * argument, we must ensure the reference count is increased to ensure the
+ * socket isn't freed before the function is run; the deferred function will
+ * then drop the reference.
+ */
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/smp.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
+static void vsock_sk_destruct(struct sock *sk);
+static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* Protocol family. */
+static struct proto vsock_proto = {
+ .name = "AF_VSOCK",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct vsock_sock),
+};
+
+/* The default peer timeout indicates how long we will wait for a peer response
+ * to a control message.
+ */
+#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
+
+#define SS_LISTEN 255
+
+static const struct vsock_transport *transport;
+static DEFINE_MUTEX(vsock_register_mutex);
+
+/**** EXPORTS ****/
+
+/* Get the ID of the local context. This is transport dependent. */
+
+int vm_sockets_get_local_cid(void)
+{
+ return transport->get_local_cid();
+}
+EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
+
+/**** UTILS ****/
+
+/* Each bound VSocket is stored in the bind hash table and each connected
+ * VSocket is stored in the connected hash table.
+ *
+ * Unbound sockets are all put on the same list attached to the end of the hash
+ * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
+ * the bucket that their local address hashes to (vsock_bound_sockets(addr)
+ * represents the list that addr hashes to).
+ *
+ * Specifically, we initialize the vsock_bind_table array to a size of
+ * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
+ * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
+ * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
+ * mods with VSOCK_HASH_SIZE to ensure this.
+ */
+#define VSOCK_HASH_SIZE 251
+#define MAX_PORT_RETRIES 24
+
+#define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE)
+#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
+#define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])
+
+/* XXX This can probably be implemented in a better way. */
+#define VSOCK_CONN_HASH(src, dst) \
+ (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
+#define vsock_connected_sockets(src, dst) \
+ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
+#define vsock_connected_sockets_vsk(vsk) \
+ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)
+
+static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
+static struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
+static DEFINE_SPINLOCK(vsock_table_lock);
+
+/* Autobind this socket to the local address if necessary. */
+static int vsock_auto_bind(struct vsock_sock *vsk)
+{
+ struct sock *sk = sk_vsock(vsk);
+ struct sockaddr_vm local_addr;
+
+ if (vsock_addr_bound(&vsk->local_addr))
+ return 0;
+ vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ return __vsock_bind(sk, &local_addr);
+}
+
+static void vsock_init_tables(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
+ INIT_LIST_HEAD(&vsock_bind_table[i]);
+
+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
+ INIT_LIST_HEAD(&vsock_connected_table[i]);
+}
+
+static void __vsock_insert_bound(struct list_head *list,
+ struct vsock_sock *vsk)
+{
+ sock_hold(&vsk->sk);
+ list_add(&vsk->bound_table, list);
+}
+
+static void __vsock_insert_connected(struct list_head *list,
+ struct vsock_sock *vsk)
+{
+ sock_hold(&vsk->sk);
+ list_add(&vsk->connected_table, list);
+}
+
+static void __vsock_remove_bound(struct vsock_sock *vsk)
+{
+ list_del_init(&vsk->bound_table);
+ sock_put(&vsk->sk);
+}
+
+static void __vsock_remove_connected(struct vsock_sock *vsk)
+{
+ list_del_init(&vsk->connected_table);
+ sock_put(&vsk->sk);
+}
+
+static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+ struct vsock_sock *vsk;
+
+ list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
+ if (addr->svm_port == vsk->local_addr.svm_port)
+ return sk_vsock(vsk);
+
+ return NULL;
+}
+
+static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst)
+{
+ struct vsock_sock *vsk;
+
+ list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
+ connected_table) {
+ if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
+ dst->svm_port == vsk->local_addr.svm_port) {
+ return sk_vsock(vsk);
+ }
+ }
+
+ return NULL;
+}
+
+static bool __vsock_in_bound_table(struct vsock_sock *vsk)
+{
+ return !list_empty(&vsk->bound_table);
+}
+
+static bool __vsock_in_connected_table(struct vsock_sock *vsk)
+{
+ return !list_empty(&vsk->connected_table);
+}
+
+static void vsock_insert_unbound(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_insert_bound(vsock_unbound_sockets, vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+
+void vsock_insert_connected(struct vsock_sock *vsk)
+{
+ struct list_head *list = vsock_connected_sockets(
+ &vsk->remote_addr, &vsk->local_addr);
+
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_insert_connected(list, vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_insert_connected);
+
+void vsock_remove_bound(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_remove_bound(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_bound);
+
+void vsock_remove_connected(struct vsock_sock *vsk)
+{
+ spin_lock_bh(&vsock_table_lock);
+ __vsock_remove_connected(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_connected);
+
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+ struct sock *sk;
+
+ spin_lock_bh(&vsock_table_lock);
+ sk = __vsock_find_bound_socket(addr);
+ if (sk)
+ sock_hold(sk);
+
+ spin_unlock_bh(&vsock_table_lock);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
+
+struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst)
+{
+ struct sock *sk;
+
+ spin_lock_bh(&vsock_table_lock);
+ sk = __vsock_find_connected_socket(src, dst);
+ if (sk)
+ sock_hold(sk);
+
+ spin_unlock_bh(&vsock_table_lock);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
+
+static bool vsock_in_bound_table(struct vsock_sock *vsk)
+{
+ bool ret;
+
+ spin_lock_bh(&vsock_table_lock);
+ ret = __vsock_in_bound_table(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+
+ return ret;
+}
+
+static bool vsock_in_connected_table(struct vsock_sock *vsk)
+{
+ bool ret;
+
+ spin_lock_bh(&vsock_table_lock);
+ ret = __vsock_in_connected_table(vsk);
+ spin_unlock_bh(&vsock_table_lock);
+
+ return ret;
+}
+
+void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
+{
+ int i;
+
+ spin_lock_bh(&vsock_table_lock);
+
+ for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
+ struct vsock_sock *vsk;
+ list_for_each_entry(vsk, &vsock_connected_table[i],
+ connected_table)
+ fn(sk_vsock(vsk));
+ }
+
+ spin_unlock_bh(&vsock_table_lock);
+}
+EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
+
+void vsock_add_pending(struct sock *listener, struct sock *pending)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vpending;
+
+ vlistener = vsock_sk(listener);
+ vpending = vsock_sk(pending);
+
+ sock_hold(pending);
+ sock_hold(listener);
+ list_add_tail(&vpending->pending_links, &vlistener->pending_links);
+}
+EXPORT_SYMBOL_GPL(vsock_add_pending);
+
+void vsock_remove_pending(struct sock *listener, struct sock *pending)
+{
+ struct vsock_sock *vpending = vsock_sk(pending);
+
+ list_del_init(&vpending->pending_links);
+ sock_put(listener);
+ sock_put(pending);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_pending);
+
+void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vconnected;
+
+ vlistener = vsock_sk(listener);
+ vconnected = vsock_sk(connected);
+
+ sock_hold(connected);
+ sock_hold(listener);
+ list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
+}
+EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
+
+static struct sock *vsock_dequeue_accept(struct sock *listener)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vconnected;
+
+ vlistener = vsock_sk(listener);
+
+ if (list_empty(&vlistener->accept_queue))
+ return NULL;
+
+ vconnected = list_entry(vlistener->accept_queue.next,
+ struct vsock_sock, accept_queue);
+
+ list_del_init(&vconnected->accept_queue);
+ sock_put(listener);
+ /* The caller will need a reference on the connected socket so we let
+ * it call sock_put().
+ */
+
+ return sk_vsock(vconnected);
+}
+
+static bool vsock_is_accept_queue_empty(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ return list_empty(&vsk->accept_queue);
+}
+
+static bool vsock_is_pending(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ return !list_empty(&vsk->pending_links);
+}
+
+static int vsock_send_shutdown(struct sock *sk, int mode)
+{
+ return transport->shutdown(vsock_sk(sk), mode);
+}
+
+void vsock_pending_work(struct work_struct *work)
+{
+ struct sock *sk;
+ struct sock *listener;
+ struct vsock_sock *vsk;
+ bool cleanup;
+
+ vsk = container_of(work, struct vsock_sock, dwork.work);
+ sk = sk_vsock(vsk);
+ listener = vsk->listener;
+ cleanup = true;
+
+ lock_sock(listener);
+ lock_sock(sk);
+
+ if (vsock_is_pending(sk)) {
+ vsock_remove_pending(listener, sk);
+ } else if (!vsk->rejected) {
+ /* We are not on the pending list and accept() did not reject
+ * us, so we must have been accepted by our user process. We
+ * just need to drop our references to the sockets and be on
+ * our way.
+ */
+ cleanup = false;
+ goto out;
+ }
+
+ listener->sk_ack_backlog--;
+
+ /* We need to remove ourself from the global connected sockets list so
+ * incoming packets can't find this socket, and to reduce the reference
+ * count.
+ */
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+
+ sk->sk_state = SS_FREE;
+
+out:
+ release_sock(sk);
+ release_sock(listener);
+ if (cleanup)
+ sock_put(sk);
+
+ sock_put(sk);
+ sock_put(listener);
+}
+EXPORT_SYMBOL_GPL(vsock_pending_work);
+
+/**** SOCKET OPERATIONS ****/
+
+static int __vsock_bind_stream(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ static u32 port = LAST_RESERVED_PORT + 1;
+ struct sockaddr_vm new_addr;
+
+ vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
+
+ if (addr->svm_port == VMADDR_PORT_ANY) {
+ bool found = false;
+ unsigned int i;
+
+ for (i = 0; i < MAX_PORT_RETRIES; i++) {
+ if (port <= LAST_RESERVED_PORT)
+ port = LAST_RESERVED_PORT + 1;
+
+ new_addr.svm_port = port++;
+
+ if (!__vsock_find_bound_socket(&new_addr)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return -EADDRNOTAVAIL;
+ } else {
+ /* If port is in reserved range, ensure caller
+ * has necessary privileges.
+ */
+ if (addr->svm_port <= LAST_RESERVED_PORT &&
+ !capable(CAP_NET_BIND_SERVICE)) {
+ return -EACCES;
+ }
+
+ if (__vsock_find_bound_socket(&new_addr))
+ return -EADDRINUSE;
+ }
+
+ vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
+
+ /* Remove stream sockets from the unbound list and add them to the hash
+ * table for easy lookup by its address. The unbound list is simply an
+ * extra entry at the end of the hash table, a trick used by AF_UNIX.
+ */
+ __vsock_remove_bound(vsk);
+ __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
+
+ return 0;
+}
+
+static int __vsock_bind_dgram(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ return transport->dgram_bind(vsk, addr);
+}
+
+static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ u32 cid;
+ int retval;
+
+ /* First ensure this socket isn't already bound. */
+ if (vsock_addr_bound(&vsk->local_addr))
+ return -EINVAL;
+
+ /* Now bind to the provided address or select appropriate values if
+ * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
+ * like AF_INET prevents binding to a non-local IP address (in most
+ * cases), we only allow binding to the local CID.
+ */
+ cid = transport->get_local_cid();
+ if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
+ return -EADDRNOTAVAIL;
+
+ switch (sk->sk_socket->type) {
+ case SOCK_STREAM:
+ spin_lock_bh(&vsock_table_lock);
+ retval = __vsock_bind_stream(vsk, addr);
+ spin_unlock_bh(&vsock_table_lock);
+ break;
+
+ case SOCK_DGRAM:
+ retval = __vsock_bind_dgram(vsk, addr);
+ break;
+
+ default:
+ retval = -EINVAL;
+ break;
+ }
+
+ return retval;
+}
+
+struct sock *__vsock_create(struct net *net,
+ struct socket *sock,
+ struct sock *parent,
+ gfp_t priority,
+ unsigned short type)
+{
+ struct sock *sk;
+ struct vsock_sock *psk;
+ struct vsock_sock *vsk;
+
+ sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+
+ /* sk->sk_type is normally set in sock_init_data, but only if sock is
+ * non-NULL. We make sure that our sockets always have a type by
+ * setting it here if needed.
+ */
+ if (!sock)
+ sk->sk_type = type;
+
+ vsk = vsock_sk(sk);
+ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+
+ sk->sk_destruct = vsock_sk_destruct;
+ sk->sk_backlog_rcv = vsock_queue_rcv_skb;
+ sk->sk_state = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ INIT_LIST_HEAD(&vsk->bound_table);
+ INIT_LIST_HEAD(&vsk->connected_table);
+ vsk->listener = NULL;
+ INIT_LIST_HEAD(&vsk->pending_links);
+ INIT_LIST_HEAD(&vsk->accept_queue);
+ vsk->rejected = false;
+ vsk->sent_request = false;
+ vsk->ignore_connecting_rst = false;
+ vsk->peer_shutdown = 0;
+
+ psk = parent ? vsock_sk(parent) : NULL;
+ if (parent) {
+ vsk->trusted = psk->trusted;
+ vsk->owner = get_cred(psk->owner);
+ vsk->connect_timeout = psk->connect_timeout;
+ } else {
+ vsk->trusted = capable(CAP_NET_ADMIN);
+ vsk->owner = get_current_cred();
+ vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
+ }
+
+ if (transport->init(vsk, psk) < 0) {
+ sk_free(sk);
+ return NULL;
+ }
+
+ if (sock)
+ vsock_insert_unbound(vsk);
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(__vsock_create);
+
+static void __vsock_release(struct sock *sk)
+{
+ if (sk) {
+ struct sk_buff *skb;
+ struct sock *pending;
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+ pending = NULL; /* Compiler warning. */
+
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+
+ transport->release(vsk);
+
+ lock_sock(sk);
+ sock_orphan(sk);
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)))
+ kfree_skb(skb);
+
+ /* Clean up any sockets that never were accepted. */
+ while ((pending = vsock_dequeue_accept(sk)) != NULL) {
+ __vsock_release(pending);
+ sock_put(pending);
+ }
+
+ release_sock(sk);
+ sock_put(sk);
+ }
+}
+
+static void vsock_sk_destruct(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ transport->destruct(vsk);
+
+ /* When clearing these addresses, there's no need to set the family and
+ * possibly register the address family with the kernel.
+ */
+ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+ vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+
+ put_cred(vsk->owner);
+}
+
+static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+
+ err = sock_queue_rcv_skb(sk, skb);
+ if (err)
+ kfree_skb(skb);
+
+ return err;
+}
+
+s64 vsock_stream_has_data(struct vsock_sock *vsk)
+{
+ return transport->stream_has_data(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_stream_has_data);
+
+s64 vsock_stream_has_space(struct vsock_sock *vsk)
+{
+ return transport->stream_has_space(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_stream_has_space);
+
+static int vsock_release(struct socket *sock)
+{
+ __vsock_release(sock->sk);
+ sock->sk = NULL;
+ sock->state = SS_FREE;
+
+ return 0;
+}
+
+static int
+vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+ int err;
+ struct sock *sk;
+ struct sockaddr_vm *vm_addr;
+
+ sk = sock->sk;
+
+ if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+ err = __vsock_bind(sk, vm_addr);
+ release_sock(sk);
+
+ return err;
+}
+
+static int vsock_getname(struct socket *sock,
+ struct sockaddr *addr, int *addr_len, int peer)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ struct sockaddr_vm *vm_addr;
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+ err = 0;
+
+ lock_sock(sk);
+
+ if (peer) {
+ if (sock->state != SS_CONNECTED) {
+ err = -ENOTCONN;
+ goto out;
+ }
+ vm_addr = &vsk->remote_addr;
+ } else {
+ vm_addr = &vsk->local_addr;
+ }
+
+ if (!vm_addr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* sys_getsockname() and sys_getpeername() pass us a
+ * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
+ * that macro is defined in socket.c instead of .h, so we hardcode its
+ * value here.
+ */
+ BUILD_BUG_ON(sizeof(*vm_addr) > 128);
+ memcpy(addr, vm_addr, sizeof(*vm_addr));
+ *addr_len = sizeof(*vm_addr);
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int vsock_shutdown(struct socket *sock, int mode)
+{
+ int err;
+ struct sock *sk;
+
+ /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
+ * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
+ * here like the other address families do. Note also that the
+ * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
+ * which is what we want.
+ */
+ mode++;
+
+ if ((mode & ~SHUTDOWN_MASK) || !mode)
+ return -EINVAL;
+
+ /* If this is a STREAM socket and it is not connected then bail out
+ * immediately. If it is a DGRAM socket then we must first kick the
+ * socket so that it wakes up from any sleeping calls, for example
+ * recv(), and then afterwards return the error.
+ */
+
+ sk = sock->sk;
+ if (sock->state == SS_UNCONNECTED) {
+ err = -ENOTCONN;
+ if (sk->sk_type == SOCK_STREAM)
+ return err;
+ } else {
+ sock->state = SS_DISCONNECTING;
+ err = 0;
+ }
+
+ /* Receive and send shutdowns are treated alike. */
+ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
+ if (mode) {
+ lock_sock(sk);
+ sk->sk_shutdown |= mode;
+ sk->sk_state_change(sk);
+ release_sock(sk);
+
+ if (sk->sk_type == SOCK_STREAM) {
+ sock_reset_flag(sk, SOCK_DONE);
+ vsock_send_shutdown(sk, mode);
+ }
+ }
+
+ return err;
+}
+
+static unsigned int vsock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk;
+ unsigned int mask;
+ struct vsock_sock *vsk;
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ poll_wait(file, sk_sleep(sk), wait);
+ mask = 0;
+
+ if (sk->sk_err)
+ /* Signify that there has been an error on this socket. */
+ mask |= POLLERR;
+
+ /* INET sockets treat local write shutdown and peer write shutdown as a
+ * case of POLLHUP set.
+ */
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ ((sk->sk_shutdown & SEND_SHUTDOWN) &&
+ (vsk->peer_shutdown & SEND_SHUTDOWN))) {
+ mask |= POLLHUP;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ vsk->peer_shutdown & SEND_SHUTDOWN) {
+ mask |= POLLRDHUP;
+ }
+
+ if (sock->type == SOCK_DGRAM) {
+ /* For datagram sockets we can read if there is something in
+ * the queue and write as long as the socket isn't shutdown for
+ * sending.
+ */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN)) {
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ } else if (sock->type == SOCK_STREAM) {
+ lock_sock(sk);
+
+ /* Listening sockets that have connections in their accept
+ * queue can be read.
+ */
+ if (sk->sk_state == SS_LISTEN
+ && !vsock_is_accept_queue_empty(sk))
+ mask |= POLLIN | POLLRDNORM;
+
+ /* If there is something in the queue then we can read. */
+ if (transport->stream_is_active(vsk) &&
+ !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ bool data_ready_now = false;
+ int ret = transport->notify_poll_in(
+ vsk, 1, &data_ready_now);
+ if (ret < 0) {
+ mask |= POLLERR;
+ } else {
+ if (data_ready_now)
+ mask |= POLLIN | POLLRDNORM;
+
+ }
+ }
+
+ /* Sockets whose connections have been closed, reset, or
+ * terminated should also be considered read, and we check the
+ * shutdown flag for that.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ vsk->peer_shutdown & SEND_SHUTDOWN) {
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ /* Connected sockets that can produce data can be written. */
+ if (sk->sk_state == SS_CONNECTED) {
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ bool space_avail_now = false;
+ int ret = transport->notify_poll_out(
+ vsk, 1, &space_avail_now);
+ if (ret < 0) {
+ mask |= POLLERR;
+ } else {
+ if (space_avail_now)
+ /* Remove POLLWRBAND since INET
+ * sockets are not setting it.
+ */
+ mask |= POLLOUT | POLLWRNORM;
+
+ }
+ }
+ }
+
+ /* Simulate INET socket poll behaviors, which sets
+ * POLLOUT|POLLWRNORM when peer is closed and nothing to read,
+ * but local send is not shutdown.
+ */
+ if (sk->sk_state == SS_UNCONNECTED) {
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN))
+ mask |= POLLOUT | POLLWRNORM;
+
+ }
+
+ release_sock(sk);
+ }
+
+ return mask;
+}
+
+static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ struct sockaddr_vm *remote_addr;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ /* For now, MSG_DONTWAIT is always assumed... */
+ err = 0;
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ lock_sock(sk);
+
+ err = vsock_auto_bind(vsk);
+ if (err)
+ goto out;
+
+
+ /* If the provided message contains an address, use that. Otherwise
+ * fall back on the socket's remote handle (if it has been connected).
+ */
+ if (msg->msg_name &&
+ vsock_addr_cast(msg->msg_name, msg->msg_namelen,
+ &remote_addr) == 0) {
+ /* Ensure this address is of the right type and is a valid
+ * destination.
+ */
+
+ if (remote_addr->svm_cid == VMADDR_CID_ANY)
+ remote_addr->svm_cid = transport->get_local_cid();
+
+ if (!vsock_addr_bound(remote_addr)) {
+ err = -EINVAL;
+ goto out;
+ }
+ } else if (sock->state == SS_CONNECTED) {
+ remote_addr = &vsk->remote_addr;
+
+ if (remote_addr->svm_cid == VMADDR_CID_ANY)
+ remote_addr->svm_cid = transport->get_local_cid();
+
+ /* XXX Should connect() or this function ensure remote_addr is
+ * bound?
+ */
+ if (!vsock_addr_bound(&vsk->remote_addr)) {
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!transport->dgram_allow(remote_addr->svm_cid,
+ remote_addr->svm_port)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = transport->dgram_enqueue(vsk, remote_addr, msg, len);
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int vsock_dgram_connect(struct socket *sock,
+ struct sockaddr *addr, int addr_len, int flags)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ struct sockaddr_vm *remote_addr;
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ err = vsock_addr_cast(addr, addr_len, &remote_addr);
+ if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
+ lock_sock(sk);
+ vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
+ VMADDR_PORT_ANY);
+ sock->state = SS_UNCONNECTED;
+ release_sock(sk);
+ return 0;
+ } else if (err != 0)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ err = vsock_auto_bind(vsk);
+ if (err)
+ goto out;
+
+ if (!transport->dgram_allow(remote_addr->svm_cid,
+ remote_addr->svm_port)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
+ sock->state = SS_CONNECTED;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
+}
+
+static const struct proto_ops vsock_dgram_ops = {
+ .family = PF_VSOCK,
+ .owner = THIS_MODULE,
+ .release = vsock_release,
+ .bind = vsock_bind,
+ .connect = vsock_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = vsock_getname,
+ .poll = vsock_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = vsock_shutdown,
+ .setsockopt = sock_no_setsockopt,
+ .getsockopt = sock_no_getsockopt,
+ .sendmsg = vsock_dgram_sendmsg,
+ .recvmsg = vsock_dgram_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static void vsock_connect_timeout(struct work_struct *work)
+{
+ struct sock *sk;
+ struct vsock_sock *vsk;
+
+ vsk = container_of(work, struct vsock_sock, dwork.work);
+ sk = sk_vsock(vsk);
+
+ lock_sock(sk);
+ if (sk->sk_state == SS_CONNECTING &&
+ (sk->sk_shutdown != SHUTDOWN_MASK)) {
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ETIMEDOUT;
+ sk->sk_error_report(sk);
+ }
+ release_sock(sk);
+
+ sock_put(sk);
+}
+
+static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
+ int addr_len, int flags)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ struct sockaddr_vm *remote_addr;
+ long timeout;
+ DEFINE_WAIT(wait);
+
+ err = 0;
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ lock_sock(sk);
+
+ /* XXX AF_UNSPEC should make us disconnect like AF_INET. */
+ switch (sock->state) {
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_DISCONNECTING:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTING:
+ /* This continues on so we can move sock into the SS_CONNECTED
+ * state once the connection has completed (at which point err
+ * will be set to zero also). Otherwise, we will either wait
+ * for the connection or return -EALREADY should this be a
+ * non-blocking call.
+ */
+ err = -EALREADY;
+ break;
+ default:
+ if ((sk->sk_state == SS_LISTEN) ||
+ vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The hypervisor and well-known contexts do not have socket
+ * endpoints.
+ */
+ if (!transport->stream_allow(remote_addr->svm_cid,
+ remote_addr->svm_port)) {
+ err = -ENETUNREACH;
+ goto out;
+ }
+
+ /* Set the remote address that we are connecting to. */
+ memcpy(&vsk->remote_addr, remote_addr,
+ sizeof(vsk->remote_addr));
+
+ err = vsock_auto_bind(vsk);
+ if (err)
+ goto out;
+
+ sk->sk_state = SS_CONNECTING;
+
+ err = transport->connect(vsk);
+ if (err < 0)
+ goto out;
+
+ /* Mark sock as connecting and set the error code to in
+ * progress in case this is a non-blocking connect.
+ */
+ sock->state = SS_CONNECTING;
+ err = -EINPROGRESS;
+ }
+
+ /* The receive path will handle all communication until we are able to
+ * enter the connected state. Here we wait for the connection to be
+ * completed or a notification of an error.
+ */
+ timeout = vsk->connect_timeout;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) {
+ if (flags & O_NONBLOCK) {
+ /* If we're not going to block, we schedule a timeout
+ * function to generate a timeout on the connection
+ * attempt, in case the peer doesn't respond in a
+ * timely manner. We hold on to the socket until the
+ * timeout fires.
+ */
+ sock_hold(sk);
+ INIT_DELAYED_WORK(&vsk->dwork,
+ vsock_connect_timeout);
+ schedule_delayed_work(&vsk->dwork, timeout);
+
+ /* Skip ahead to preserve error code set above. */
+ goto out_wait;
+ }
+
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ goto out_wait_error;
+ } else if (timeout == 0) {
+ err = -ETIMEDOUT;
+ goto out_wait_error;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_wait_error;
+ } else
+ err = 0;
+
+out_wait:
+ finish_wait(sk_sleep(sk), &wait);
+out:
+ release_sock(sk);
+ return err;
+
+out_wait_error:
+ sk->sk_state = SS_UNCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ goto out_wait;
+}
+
+static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *listener;
+ int err;
+ struct sock *connected;
+ struct vsock_sock *vconnected;
+ long timeout;
+ DEFINE_WAIT(wait);
+
+ err = 0;
+ listener = sock->sk;
+
+ lock_sock(listener);
+
+ if (sock->type != SOCK_STREAM) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (listener->sk_state != SS_LISTEN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Wait for children sockets to appear; these are the new sockets
+ * created upon connection establishment.
+ */
+ timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
+ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
+
+ while ((connected = vsock_dequeue_accept(listener)) == NULL &&
+ listener->sk_err == 0) {
+ release_sock(listener);
+ timeout = schedule_timeout(timeout);
+ lock_sock(listener);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ goto out_wait;
+ } else if (timeout == 0) {
+ err = -EAGAIN;
+ goto out_wait;
+ }
+
+ prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
+ }
+
+ if (listener->sk_err)
+ err = -listener->sk_err;
+
+ if (connected) {
+ listener->sk_ack_backlog--;
+
+ lock_sock(connected);
+ vconnected = vsock_sk(connected);
+
+ /* If the listener socket has received an error, then we should
+ * reject this socket and return. Note that we simply mark the
+ * socket rejected, drop our reference, and let the cleanup
+ * function handle the cleanup; the fact that we found it in
+ * the listener's accept queue guarantees that the cleanup
+ * function hasn't run yet.
+ */
+ if (err) {
+ vconnected->rejected = true;
+ release_sock(connected);
+ sock_put(connected);
+ goto out_wait;
+ }
+
+ newsock->state = SS_CONNECTED;
+ sock_graft(connected, newsock);
+ release_sock(connected);
+ sock_put(connected);
+ }
+
+out_wait:
+ finish_wait(sk_sleep(listener), &wait);
+out:
+ release_sock(listener);
+ return err;
+}
+
+static int vsock_listen(struct socket *sock, int backlog)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+
+ sk = sock->sk;
+
+ lock_sock(sk);
+
+ if (sock->type != SOCK_STREAM) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (sock->state != SS_UNCONNECTED) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ vsk = vsock_sk(sk);
+
+ if (!vsock_addr_bound(&vsk->local_addr)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = SS_LISTEN;
+
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static int vsock_stream_setsockopt(struct socket *sock,
+ int level,
+ int optname,
+ char __user *optval,
+ unsigned int optlen)
+{
+ int err;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ u64 val;
+
+ if (level != AF_VSOCK)
+ return -ENOPROTOOPT;
+
+#define COPY_IN(_v) \
+ do { \
+ if (optlen < sizeof(_v)) { \
+ err = -EINVAL; \
+ goto exit; \
+ } \
+ if (copy_from_user(&_v, optval, sizeof(_v)) != 0) { \
+ err = -EFAULT; \
+ goto exit; \
+ } \
+ } while (0)
+
+ err = 0;
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case SO_VM_SOCKETS_BUFFER_SIZE:
+ COPY_IN(val);
+ transport->set_buffer_size(vsk, val);
+ break;
+
+ case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
+ COPY_IN(val);
+ transport->set_max_buffer_size(vsk, val);
+ break;
+
+ case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
+ COPY_IN(val);
+ transport->set_min_buffer_size(vsk, val);
+ break;
+
+ case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
+ struct timeval tv;
+ COPY_IN(tv);
+ if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
+ tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
+ vsk->connect_timeout = tv.tv_sec * HZ +
+ DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ));
+ if (vsk->connect_timeout == 0)
+ vsk->connect_timeout =
+ VSOCK_DEFAULT_CONNECT_TIMEOUT;
+
+ } else {
+ err = -ERANGE;
+ }
+ break;
+ }
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+#undef COPY_IN
+
+exit:
+ release_sock(sk);
+ return err;
+}
+
+static int vsock_stream_getsockopt(struct socket *sock,
+ int level, int optname,
+ char __user *optval,
+ int __user *optlen)
+{
+ int err;
+ int len;
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ u64 val;
+
+ if (level != AF_VSOCK)
+ return -ENOPROTOOPT;
+
+ err = get_user(len, optlen);
+ if (err != 0)
+ return err;
+
+#define COPY_OUT(_v) \
+ do { \
+ if (len < sizeof(_v)) \
+ return -EINVAL; \
+ \
+ len = sizeof(_v); \
+ if (copy_to_user(optval, &_v, len) != 0) \
+ return -EFAULT; \
+ \
+ } while (0)
+
+ err = 0;
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+
+ switch (optname) {
+ case SO_VM_SOCKETS_BUFFER_SIZE:
+ val = transport->get_buffer_size(vsk);
+ COPY_OUT(val);
+ break;
+
+ case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
+ val = transport->get_max_buffer_size(vsk);
+ COPY_OUT(val);
+ break;
+
+ case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
+ val = transport->get_min_buffer_size(vsk);
+ COPY_OUT(val);
+ break;
+
+ case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
+ struct timeval tv;
+ tv.tv_sec = vsk->connect_timeout / HZ;
+ tv.tv_usec =
+ (vsk->connect_timeout -
+ tv.tv_sec * HZ) * (1000000 / HZ);
+ COPY_OUT(tv);
+ break;
+ }
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ err = put_user(len, optlen);
+ if (err != 0)
+ return -EFAULT;
+
+#undef COPY_OUT
+
+ return 0;
+}
+
+static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ ssize_t total_written;
+ long timeout;
+ int err;
+ struct vsock_transport_send_notify_data send_data;
+
+ DEFINE_WAIT(wait);
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+ total_written = 0;
+ err = 0;
+
+ if (msg->msg_flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ /* Callers should not provide a destination with stream sockets. */
+ if (msg->msg_namelen) {
+ err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Send data only if both sides are not shutdown in the direction. */
+ if (sk->sk_shutdown & SEND_SHUTDOWN ||
+ vsk->peer_shutdown & RCV_SHUTDOWN) {
+ err = -EPIPE;
+ goto out;
+ }
+
+ if (sk->sk_state != SS_CONNECTED ||
+ !vsock_addr_bound(&vsk->local_addr)) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ if (!vsock_addr_bound(&vsk->remote_addr)) {
+ err = -EDESTADDRREQ;
+ goto out;
+ }
+
+ /* Wait for room in the produce queue to enqueue our user's data. */
+ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ err = transport->notify_send_init(vsk, &send_data);
+ if (err < 0)
+ goto out;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ while (total_written < len) {
+ ssize_t written;
+
+ while (vsock_stream_has_space(vsk) == 0 &&
+ sk->sk_err == 0 &&
+ !(sk->sk_shutdown & SEND_SHUTDOWN) &&
+ !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
+
+ /* Don't wait for non-blocking sockets. */
+ if (timeout == 0) {
+ err = -EAGAIN;
+ goto out_wait;
+ }
+
+ err = transport->notify_send_pre_block(vsk, &send_data);
+ if (err < 0)
+ goto out_wait;
+
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ goto out_wait;
+ } else if (timeout == 0) {
+ err = -EAGAIN;
+ goto out_wait;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ }
+
+ /* These checks occur both as part of and after the loop
+ * conditional since we need to check before and after
+ * sleeping.
+ */
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_wait;
+ } else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
+ (vsk->peer_shutdown & RCV_SHUTDOWN)) {
+ err = -EPIPE;
+ goto out_wait;
+ }
+
+ err = transport->notify_send_pre_enqueue(vsk, &send_data);
+ if (err < 0)
+ goto out_wait;
+
+ /* Note that enqueue will only write as many bytes as are free
+ * in the produce queue, so we don't need to ensure len is
+ * smaller than the queue size. It is the caller's
+ * responsibility to check how many bytes we were able to send.
+ */
+
+ written = transport->stream_enqueue(
+ vsk, msg,
+ len - total_written);
+ if (written < 0) {
+ err = -ENOMEM;
+ goto out_wait;
+ }
+
+ total_written += written;
+
+ err = transport->notify_send_post_enqueue(
+ vsk, written, &send_data);
+ if (err < 0)
+ goto out_wait;
+
+ }
+
+out_wait:
+ if (total_written > 0)
+ err = total_written;
+ finish_wait(sk_sleep(sk), &wait);
+out:
+ release_sock(sk);
+ return err;
+}
+
+
+static int
+vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk;
+ struct vsock_sock *vsk;
+ int err;
+ size_t target;
+ ssize_t copied;
+ long timeout;
+ struct vsock_transport_recv_notify_data recv_data;
+
+ DEFINE_WAIT(wait);
+
+ sk = sock->sk;
+ vsk = vsock_sk(sk);
+ err = 0;
+
+ lock_sock(sk);
+
+ if (sk->sk_state != SS_CONNECTED) {
+ /* Recvmsg is supposed to return 0 if a peer performs an
+ * orderly shutdown. Differentiate between that case and when a
+ * peer has not connected or a local shutdown occured with the
+ * SOCK_DONE flag.
+ */
+ if (sock_flag(sk, SOCK_DONE))
+ err = 0;
+ else
+ err = -ENOTCONN;
+
+ goto out;
+ }
+
+ if (flags & MSG_OOB) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* We don't check peer_shutdown flag here since peer may actually shut
+ * down, but there can be data in the queue that a local socket can
+ * receive.
+ */
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ err = 0;
+ goto out;
+ }
+
+ /* It is valid on Linux to pass in a zero-length receive buffer. This
+ * is not an error. We may as well bail out now.
+ */
+ if (!len) {
+ err = 0;
+ goto out;
+ }
+
+ /* We must not copy less than target bytes into the user's buffer
+ * before returning successfully, so we wait for the consume queue to
+ * have that much data to consume before dequeueing. Note that this
+ * makes it impossible to handle cases where target is greater than the
+ * queue size.
+ */
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ if (target >= transport->stream_rcvhiwat(vsk)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ copied = 0;
+
+ err = transport->notify_recv_init(vsk, target, &recv_data);
+ if (err < 0)
+ goto out;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ while (1) {
+ s64 ready = vsock_stream_has_data(vsk);
+
+ if (ready < 0) {
+ /* Invalid queue pair content. XXX This should be
+ * changed to a connection reset in a later change.
+ */
+
+ err = -ENOMEM;
+ goto out_wait;
+ } else if (ready > 0) {
+ ssize_t read;
+
+ err = transport->notify_recv_pre_dequeue(
+ vsk, target, &recv_data);
+ if (err < 0)
+ break;
+
+ read = transport->stream_dequeue(
+ vsk, msg,
+ len - copied, flags);
+ if (read < 0) {
+ err = -ENOMEM;
+ break;
+ }
+
+ copied += read;
+
+ err = transport->notify_recv_post_dequeue(
+ vsk, target, read,
+ !(flags & MSG_PEEK), &recv_data);
+ if (err < 0)
+ goto out_wait;
+
+ if (read >= target || flags & MSG_PEEK)
+ break;
+
+ target -= read;
+ } else {
+ if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN)
+ || (vsk->peer_shutdown & SEND_SHUTDOWN)) {
+ break;
+ }
+ /* Don't wait for non-blocking sockets. */
+ if (timeout == 0) {
+ err = -EAGAIN;
+ break;
+ }
+
+ err = transport->notify_recv_pre_block(
+ vsk, target, &recv_data);
+ if (err < 0)
+ break;
+
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ break;
+ } else if (timeout == 0) {
+ err = -EAGAIN;
+ break;
+ }
+
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ }
+ }
+
+ if (sk->sk_err)
+ err = -sk->sk_err;
+ else if (sk->sk_shutdown & RCV_SHUTDOWN)
+ err = 0;
+
+ if (copied > 0) {
+ /* We only do these additional bookkeeping/notification steps
+ * if we actually copied something out of the queue pair
+ * instead of just peeking ahead.
+ */
+
+ if (!(flags & MSG_PEEK)) {
+ /* If the other side has shutdown for sending and there
+ * is nothing more to read, then modify the socket
+ * state.
+ */
+ if (vsk->peer_shutdown & SEND_SHUTDOWN) {
+ if (vsock_stream_has_data(vsk) <= 0) {
+ sk->sk_state = SS_UNCONNECTED;
+ sock_set_flag(sk, SOCK_DONE);
+ sk->sk_state_change(sk);
+ }
+ }
+ }
+ err = copied;
+ }
+
+out_wait:
+ finish_wait(sk_sleep(sk), &wait);
+out:
+ release_sock(sk);
+ return err;
+}
+
+static const struct proto_ops vsock_stream_ops = {
+ .family = PF_VSOCK,
+ .owner = THIS_MODULE,
+ .release = vsock_release,
+ .bind = vsock_bind,
+ .connect = vsock_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = vsock_accept,
+ .getname = vsock_getname,
+ .poll = vsock_poll,
+ .ioctl = sock_no_ioctl,
+ .listen = vsock_listen,
+ .shutdown = vsock_shutdown,
+ .setsockopt = vsock_stream_setsockopt,
+ .getsockopt = vsock_stream_getsockopt,
+ .sendmsg = vsock_stream_sendmsg,
+ .recvmsg = vsock_stream_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static int vsock_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ if (!sock)
+ return -EINVAL;
+
+ if (protocol && protocol != PF_VSOCK)
+ return -EPROTONOSUPPORT;
+
+ switch (sock->type) {
+ case SOCK_DGRAM:
+ sock->ops = &vsock_dgram_ops;
+ break;
+ case SOCK_STREAM:
+ sock->ops = &vsock_stream_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ sock->state = SS_UNCONNECTED;
+
+ return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
+}
+
+static const struct net_proto_family vsock_family_ops = {
+ .family = AF_VSOCK,
+ .create = vsock_create,
+ .owner = THIS_MODULE,
+};
+
+static long vsock_dev_do_ioctl(struct file *filp,
+ unsigned int cmd, void __user *ptr)
+{
+ u32 __user *p = ptr;
+ int retval = 0;
+
+ switch (cmd) {
+ case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
+ if (put_user(transport->get_local_cid(), p) != 0)
+ retval = -EFAULT;
+ break;
+
+ default:
+ pr_err("Unknown ioctl %d\n", cmd);
+ retval = -EINVAL;
+ }
+
+ return retval;
+}
+
+static long vsock_dev_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
+}
+
+#ifdef CONFIG_COMPAT
+static long vsock_dev_compat_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations vsock_device_ops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = vsock_dev_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = vsock_dev_compat_ioctl,
+#endif
+ .open = nonseekable_open,
+};
+
+static struct miscdevice vsock_device = {
+ .name = "vsock",
+ .fops = &vsock_device_ops,
+};
+
+int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
+{
+ int err = mutex_lock_interruptible(&vsock_register_mutex);
+
+ if (err)
+ return err;
+
+ if (transport) {
+ err = -EBUSY;
+ goto err_busy;
+ }
+
+ /* Transport must be the owner of the protocol so that it can't
+ * unload while there are open sockets.
+ */
+ vsock_proto.owner = owner;
+ transport = t;
+
+ vsock_init_tables();
+
+ vsock_device.minor = MISC_DYNAMIC_MINOR;
+ err = misc_register(&vsock_device);
+ if (err) {
+ pr_err("Failed to register misc device\n");
+ return -ENOENT;
+ }
+
+ err = proto_register(&vsock_proto, 1); /* we want our slab */
+ if (err) {
+ pr_err("Cannot register vsock protocol\n");
+ goto err_misc_deregister;
+ }
+
+ err = sock_register(&vsock_family_ops);
+ if (err) {
+ pr_err("could not register af_vsock (%d) address family: %d\n",
+ AF_VSOCK, err);
+ goto err_unregister_proto;
+ }
+
+ mutex_unlock(&vsock_register_mutex);
+ return 0;
+
+err_unregister_proto:
+ proto_unregister(&vsock_proto);
+err_misc_deregister:
+ misc_deregister(&vsock_device);
+ transport = NULL;
+err_busy:
+ mutex_unlock(&vsock_register_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__vsock_core_init);
+
+void vsock_core_exit(void)
+{
+ mutex_lock(&vsock_register_mutex);
+
+ misc_deregister(&vsock_device);
+ sock_unregister(AF_VSOCK);
+ proto_unregister(&vsock_proto);
+
+ /* We do not want the assignment below re-ordered. */
+ mb();
+ transport = NULL;
+
+ mutex_unlock(&vsock_register_mutex);
+}
+EXPORT_SYMBOL_GPL(vsock_core_exit);
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION("VMware Virtual Socket Family");
+MODULE_VERSION("1.0.1.0-k");
+MODULE_LICENSE("GPL v2");
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
new file mode 100644
index 000000000..c294da095
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport.c
@@ -0,0 +1,2169 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/smp.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#include "vmci_transport_notify.h"
+
+static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg);
+static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg);
+static void vmci_transport_peer_attach_cb(u32 sub_id,
+ const struct vmci_event_data *ed,
+ void *client_data);
+static void vmci_transport_peer_detach_cb(u32 sub_id,
+ const struct vmci_event_data *ed,
+ void *client_data);
+static void vmci_transport_recv_pkt_work(struct work_struct *work);
+static int vmci_transport_recv_listen(struct sock *sk,
+ struct vmci_transport_packet *pkt);
+static int vmci_transport_recv_connecting_server(
+ struct sock *sk,
+ struct sock *pending,
+ struct vmci_transport_packet *pkt);
+static int vmci_transport_recv_connecting_client(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt);
+static int vmci_transport_recv_connecting_client_negotiate(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt);
+static int vmci_transport_recv_connecting_client_invalid(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt);
+static int vmci_transport_recv_connected(struct sock *sk,
+ struct vmci_transport_packet *pkt);
+static bool vmci_transport_old_proto_override(bool *old_pkt_proto);
+static u16 vmci_transport_new_proto_supported_versions(void);
+static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto,
+ bool old_pkt_proto);
+
+struct vmci_transport_recv_pkt_info {
+ struct work_struct work;
+ struct sock *sk;
+ struct vmci_transport_packet pkt;
+};
+
+static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID,
+ VMCI_INVALID_ID };
+static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
+
+static int PROTOCOL_OVERRIDE = -1;
+
+#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128
+#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144
+#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144
+
+/* The default peer timeout indicates how long we will wait for a peer response
+ * to a control message.
+ */
+#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
+
+#define SS_LISTEN 255
+
+/* Helper function to convert from a VMCI error code to a VSock error code. */
+
+static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
+{
+ int err;
+
+ switch (vmci_error) {
+ case VMCI_ERROR_NO_MEM:
+ err = ENOMEM;
+ break;
+ case VMCI_ERROR_DUPLICATE_ENTRY:
+ case VMCI_ERROR_ALREADY_EXISTS:
+ err = EADDRINUSE;
+ break;
+ case VMCI_ERROR_NO_ACCESS:
+ err = EPERM;
+ break;
+ case VMCI_ERROR_NO_RESOURCES:
+ err = ENOBUFS;
+ break;
+ case VMCI_ERROR_INVALID_RESOURCE:
+ err = EHOSTUNREACH;
+ break;
+ case VMCI_ERROR_INVALID_ARGS:
+ default:
+ err = EINVAL;
+ }
+
+ return err > 0 ? -err : err;
+}
+
+static u32 vmci_transport_peer_rid(u32 peer_cid)
+{
+ if (VMADDR_CID_HYPERVISOR == peer_cid)
+ return VMCI_TRANSPORT_HYPERVISOR_PACKET_RID;
+
+ return VMCI_TRANSPORT_PACKET_RID;
+}
+
+static inline void
+vmci_transport_packet_init(struct vmci_transport_packet *pkt,
+ struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ u8 type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ u16 proto,
+ struct vmci_handle handle)
+{
+ /* We register the stream control handler as an any cid handle so we
+ * must always send from a source address of VMADDR_CID_ANY
+ */
+ pkt->dg.src = vmci_make_handle(VMADDR_CID_ANY,
+ VMCI_TRANSPORT_PACKET_RID);
+ pkt->dg.dst = vmci_make_handle(dst->svm_cid,
+ vmci_transport_peer_rid(dst->svm_cid));
+ pkt->dg.payload_size = sizeof(*pkt) - sizeof(pkt->dg);
+ pkt->version = VMCI_TRANSPORT_PACKET_VERSION;
+ pkt->type = type;
+ pkt->src_port = src->svm_port;
+ pkt->dst_port = dst->svm_port;
+ memset(&pkt->proto, 0, sizeof(pkt->proto));
+ memset(&pkt->_reserved2, 0, sizeof(pkt->_reserved2));
+
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_INVALID:
+ pkt->u.size = 0;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_REQUEST:
+ case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE:
+ pkt->u.size = size;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_OFFER:
+ case VMCI_TRANSPORT_PACKET_TYPE_ATTACH:
+ pkt->u.handle = handle;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
+ case VMCI_TRANSPORT_PACKET_TYPE_READ:
+ case VMCI_TRANSPORT_PACKET_TYPE_RST:
+ pkt->u.size = 0;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN:
+ pkt->u.mode = mode;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
+ case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
+ memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait));
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2:
+ case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2:
+ pkt->u.size = size;
+ pkt->proto = proto;
+ break;
+ }
+}
+
+static inline void
+vmci_transport_packet_get_addresses(struct vmci_transport_packet *pkt,
+ struct sockaddr_vm *local,
+ struct sockaddr_vm *remote)
+{
+ vsock_addr_init(local, pkt->dg.dst.context, pkt->dst_port);
+ vsock_addr_init(remote, pkt->dg.src.context, pkt->src_port);
+}
+
+static int
+__vmci_transport_send_control_pkt(struct vmci_transport_packet *pkt,
+ struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ enum vmci_transport_packet_type type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ u16 proto,
+ struct vmci_handle handle,
+ bool convert_error)
+{
+ int err;
+
+ vmci_transport_packet_init(pkt, src, dst, type, size, mode, wait,
+ proto, handle);
+ err = vmci_datagram_send(&pkt->dg);
+ if (convert_error && (err < 0))
+ return vmci_transport_error_to_vsock_error(err);
+
+ return err;
+}
+
+static int
+vmci_transport_reply_control_pkt_fast(struct vmci_transport_packet *pkt,
+ enum vmci_transport_packet_type type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ struct vmci_handle handle)
+{
+ struct vmci_transport_packet reply;
+ struct sockaddr_vm src, dst;
+
+ if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST) {
+ return 0;
+ } else {
+ vmci_transport_packet_get_addresses(pkt, &src, &dst);
+ return __vmci_transport_send_control_pkt(&reply, &src, &dst,
+ type,
+ size, mode, wait,
+ VSOCK_PROTO_INVALID,
+ handle, true);
+ }
+}
+
+static int
+vmci_transport_send_control_pkt_bh(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ enum vmci_transport_packet_type type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ struct vmci_handle handle)
+{
+ /* Note that it is safe to use a single packet across all CPUs since
+ * two tasklets of the same type are guaranteed to not ever run
+ * simultaneously. If that ever changes, or VMCI stops using tasklets,
+ * we can use per-cpu packets.
+ */
+ static struct vmci_transport_packet pkt;
+
+ return __vmci_transport_send_control_pkt(&pkt, src, dst, type,
+ size, mode, wait,
+ VSOCK_PROTO_INVALID, handle,
+ false);
+}
+
+static int
+vmci_transport_send_control_pkt(struct sock *sk,
+ enum vmci_transport_packet_type type,
+ u64 size,
+ u64 mode,
+ struct vmci_transport_waiting_info *wait,
+ u16 proto,
+ struct vmci_handle handle)
+{
+ struct vmci_transport_packet *pkt;
+ struct vsock_sock *vsk;
+ int err;
+
+ vsk = vsock_sk(sk);
+
+ if (!vsock_addr_bound(&vsk->local_addr))
+ return -EINVAL;
+
+ if (!vsock_addr_bound(&vsk->remote_addr))
+ return -EINVAL;
+
+ pkt = kmalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return -ENOMEM;
+
+ err = __vmci_transport_send_control_pkt(pkt, &vsk->local_addr,
+ &vsk->remote_addr, type, size,
+ mode, wait, proto, handle,
+ true);
+ kfree(pkt);
+
+ return err;
+}
+
+static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src,
+ struct vmci_transport_packet *pkt)
+{
+ if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST)
+ return 0;
+ return vmci_transport_send_control_pkt_bh(
+ dst, src,
+ VMCI_TRANSPORT_PACKET_TYPE_RST, 0,
+ 0, NULL, VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_reset(struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST)
+ return 0;
+ return vmci_transport_send_control_pkt(sk,
+ VMCI_TRANSPORT_PACKET_TYPE_RST,
+ 0, 0, NULL, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_negotiate(struct sock *sk, size_t size)
+{
+ return vmci_transport_send_control_pkt(
+ sk,
+ VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE,
+ size, 0, NULL,
+ VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_negotiate2(struct sock *sk, size_t size,
+ u16 version)
+{
+ return vmci_transport_send_control_pkt(
+ sk,
+ VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2,
+ size, 0, NULL, version,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_qp_offer(struct sock *sk,
+ struct vmci_handle handle)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_OFFER, 0,
+ 0, NULL,
+ VSOCK_PROTO_INVALID, handle);
+}
+
+static int vmci_transport_send_attach(struct sock *sk,
+ struct vmci_handle handle)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_ATTACH,
+ 0, 0, NULL, VSOCK_PROTO_INVALID,
+ handle);
+}
+
+static int vmci_transport_reply_reset(struct vmci_transport_packet *pkt)
+{
+ return vmci_transport_reply_control_pkt_fast(
+ pkt,
+ VMCI_TRANSPORT_PACKET_TYPE_RST,
+ 0, 0, NULL,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_invalid_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src)
+{
+ return vmci_transport_send_control_pkt_bh(
+ dst, src,
+ VMCI_TRANSPORT_PACKET_TYPE_INVALID,
+ 0, 0, NULL, VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src)
+{
+ return vmci_transport_send_control_pkt_bh(
+ dst, src,
+ VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0,
+ 0, NULL, VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src)
+{
+ return vmci_transport_send_control_pkt_bh(
+ dst, src,
+ VMCI_TRANSPORT_PACKET_TYPE_READ, 0,
+ 0, NULL, VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_wrote(struct sock *sk)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_WROTE, 0,
+ 0, NULL, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_read(struct sock *sk)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_READ, 0,
+ 0, NULL, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_waiting_write(struct sock *sk,
+ struct vmci_transport_waiting_info *wait)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE,
+ 0, 0, wait, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+int vmci_transport_send_waiting_read(struct sock *sk,
+ struct vmci_transport_waiting_info *wait)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ,
+ 0, 0, wait, VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+ return vmci_transport_send_control_pkt(
+ &vsk->sk,
+ VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN,
+ 0, mode, NULL,
+ VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_conn_request(struct sock *sk, size_t size)
+{
+ return vmci_transport_send_control_pkt(sk,
+ VMCI_TRANSPORT_PACKET_TYPE_REQUEST,
+ size, 0, NULL,
+ VSOCK_PROTO_INVALID,
+ VMCI_INVALID_HANDLE);
+}
+
+static int vmci_transport_send_conn_request2(struct sock *sk, size_t size,
+ u16 version)
+{
+ return vmci_transport_send_control_pkt(
+ sk, VMCI_TRANSPORT_PACKET_TYPE_REQUEST2,
+ size, 0, NULL, version,
+ VMCI_INVALID_HANDLE);
+}
+
+static struct sock *vmci_transport_get_pending(
+ struct sock *listener,
+ struct vmci_transport_packet *pkt)
+{
+ struct vsock_sock *vlistener;
+ struct vsock_sock *vpending;
+ struct sock *pending;
+ struct sockaddr_vm src;
+
+ vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
+
+ vlistener = vsock_sk(listener);
+
+ list_for_each_entry(vpending, &vlistener->pending_links,
+ pending_links) {
+ if (vsock_addr_equals_addr(&src, &vpending->remote_addr) &&
+ pkt->dst_port == vpending->local_addr.svm_port) {
+ pending = sk_vsock(vpending);
+ sock_hold(pending);
+ goto found;
+ }
+ }
+
+ pending = NULL;
+found:
+ return pending;
+
+}
+
+static void vmci_transport_release_pending(struct sock *pending)
+{
+ sock_put(pending);
+}
+
+/* We allow two kinds of sockets to communicate with a restricted VM: 1)
+ * trusted sockets 2) sockets from applications running as the same user as the
+ * VM (this is only true for the host side and only when using hosted products)
+ */
+
+static bool vmci_transport_is_trusted(struct vsock_sock *vsock, u32 peer_cid)
+{
+ return vsock->trusted ||
+ vmci_is_context_owner(peer_cid, vsock->owner->uid);
+}
+
+/* We allow sending datagrams to and receiving datagrams from a restricted VM
+ * only if it is trusted as described in vmci_transport_is_trusted.
+ */
+
+static bool vmci_transport_allow_dgram(struct vsock_sock *vsock, u32 peer_cid)
+{
+ if (VMADDR_CID_HYPERVISOR == peer_cid)
+ return true;
+
+ if (vsock->cached_peer != peer_cid) {
+ vsock->cached_peer = peer_cid;
+ if (!vmci_transport_is_trusted(vsock, peer_cid) &&
+ (vmci_context_get_priv_flags(peer_cid) &
+ VMCI_PRIVILEGE_FLAG_RESTRICTED)) {
+ vsock->cached_peer_allow_dgram = false;
+ } else {
+ vsock->cached_peer_allow_dgram = true;
+ }
+ }
+
+ return vsock->cached_peer_allow_dgram;
+}
+
+static int
+vmci_transport_queue_pair_alloc(struct vmci_qp **qpair,
+ struct vmci_handle *handle,
+ u64 produce_size,
+ u64 consume_size,
+ u32 peer, u32 flags, bool trusted)
+{
+ int err = 0;
+
+ if (trusted) {
+ /* Try to allocate our queue pair as trusted. This will only
+ * work if vsock is running in the host.
+ */
+
+ err = vmci_qpair_alloc(qpair, handle, produce_size,
+ consume_size,
+ peer, flags,
+ VMCI_PRIVILEGE_FLAG_TRUSTED);
+ if (err != VMCI_ERROR_NO_ACCESS)
+ goto out;
+
+ }
+
+ err = vmci_qpair_alloc(qpair, handle, produce_size, consume_size,
+ peer, flags, VMCI_NO_PRIVILEGE_FLAGS);
+out:
+ if (err < 0) {
+ pr_err("Could not attach to queue pair with %d\n",
+ err);
+ err = vmci_transport_error_to_vsock_error(err);
+ }
+
+ return err;
+}
+
+static int
+vmci_transport_datagram_create_hnd(u32 resource_id,
+ u32 flags,
+ vmci_datagram_recv_cb recv_cb,
+ void *client_data,
+ struct vmci_handle *out_handle)
+{
+ int err = 0;
+
+ /* Try to allocate our datagram handler as trusted. This will only work
+ * if vsock is running in the host.
+ */
+
+ err = vmci_datagram_create_handle_priv(resource_id, flags,
+ VMCI_PRIVILEGE_FLAG_TRUSTED,
+ recv_cb,
+ client_data, out_handle);
+
+ if (err == VMCI_ERROR_NO_ACCESS)
+ err = vmci_datagram_create_handle(resource_id, flags,
+ recv_cb, client_data,
+ out_handle);
+
+ return err;
+}
+
+/* This is invoked as part of a tasklet that's scheduled when the VMCI
+ * interrupt fires. This is run in bottom-half context and if it ever needs to
+ * sleep it should defer that work to a work queue.
+ */
+
+static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
+{
+ struct sock *sk;
+ size_t size;
+ struct sk_buff *skb;
+ struct vsock_sock *vsk;
+
+ sk = (struct sock *)data;
+
+ /* This handler is privileged when this module is running on the host.
+ * We will get datagrams from all endpoints (even VMs that are in a
+ * restricted context). If we get one from a restricted context then
+ * the destination socket must be trusted.
+ *
+ * NOTE: We access the socket struct without holding the lock here.
+ * This is ok because the field we are interested is never modified
+ * outside of the create and destruct socket functions.
+ */
+ vsk = vsock_sk(sk);
+ if (!vmci_transport_allow_dgram(vsk, dg->src.context))
+ return VMCI_ERROR_NO_ACCESS;
+
+ size = VMCI_DG_SIZE(dg);
+
+ /* Attach the packet to the socket's receive queue as an sk_buff. */
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ return VMCI_ERROR_NO_MEM;
+
+ /* sk_receive_skb() will do a sock_put(), so hold here. */
+ sock_hold(sk);
+ skb_put(skb, size);
+ memcpy(skb->data, dg, size);
+ sk_receive_skb(sk, skb, 0);
+
+ return VMCI_SUCCESS;
+}
+
+static bool vmci_transport_stream_allow(u32 cid, u32 port)
+{
+ static const u32 non_socket_contexts[] = {
+ VMADDR_CID_RESERVED,
+ };
+ int i;
+
+ BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts));
+
+ for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) {
+ if (cid == non_socket_contexts[i])
+ return false;
+ }
+
+ return true;
+}
+
+/* This is invoked as part of a tasklet that's scheduled when the VMCI
+ * interrupt fires. This is run in bottom-half context but it defers most of
+ * its work to the packet handling work queue.
+ */
+
+static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
+{
+ struct sock *sk;
+ struct sockaddr_vm dst;
+ struct sockaddr_vm src;
+ struct vmci_transport_packet *pkt;
+ struct vsock_sock *vsk;
+ bool bh_process_pkt;
+ int err;
+
+ sk = NULL;
+ err = VMCI_SUCCESS;
+ bh_process_pkt = false;
+
+ /* Ignore incoming packets from contexts without sockets, or resources
+ * that aren't vsock implementations.
+ */
+
+ if (!vmci_transport_stream_allow(dg->src.context, -1)
+ || vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
+ return VMCI_ERROR_NO_ACCESS;
+
+ if (VMCI_DG_SIZE(dg) < sizeof(*pkt))
+ /* Drop datagrams that do not contain full VSock packets. */
+ return VMCI_ERROR_INVALID_ARGS;
+
+ pkt = (struct vmci_transport_packet *)dg;
+
+ /* Find the socket that should handle this packet. First we look for a
+ * connected socket and if there is none we look for a socket bound to
+ * the destintation address.
+ */
+ vsock_addr_init(&src, pkt->dg.src.context, pkt->src_port);
+ vsock_addr_init(&dst, pkt->dg.dst.context, pkt->dst_port);
+
+ sk = vsock_find_connected_socket(&src, &dst);
+ if (!sk) {
+ sk = vsock_find_bound_socket(&dst);
+ if (!sk) {
+ /* We could not find a socket for this specified
+ * address. If this packet is a RST, we just drop it.
+ * If it is another packet, we send a RST. Note that
+ * we do not send a RST reply to RSTs so that we do not
+ * continually send RSTs between two endpoints.
+ *
+ * Note that since this is a reply, dst is src and src
+ * is dst.
+ */
+ if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0)
+ pr_err("unable to send reset\n");
+
+ err = VMCI_ERROR_NOT_FOUND;
+ goto out;
+ }
+ }
+
+ /* If the received packet type is beyond all types known to this
+ * implementation, reply with an invalid message. Hopefully this will
+ * help when implementing backwards compatibility in the future.
+ */
+ if (pkt->type >= VMCI_TRANSPORT_PACKET_TYPE_MAX) {
+ vmci_transport_send_invalid_bh(&dst, &src);
+ err = VMCI_ERROR_INVALID_ARGS;
+ goto out;
+ }
+
+ /* This handler is privileged when this module is running on the host.
+ * We will get datagram connect requests from all endpoints (even VMs
+ * that are in a restricted context). If we get one from a restricted
+ * context then the destination socket must be trusted.
+ *
+ * NOTE: We access the socket struct without holding the lock here.
+ * This is ok because the field we are interested is never modified
+ * outside of the create and destruct socket functions.
+ */
+ vsk = vsock_sk(sk);
+ if (!vmci_transport_allow_dgram(vsk, pkt->dg.src.context)) {
+ err = VMCI_ERROR_NO_ACCESS;
+ goto out;
+ }
+
+ /* We do most everything in a work queue, but let's fast path the
+ * notification of reads and writes to help data transfer performance.
+ * We can only do this if there is no process context code executing
+ * for this socket since that may change the state.
+ */
+ bh_lock_sock(sk);
+
+ if (!sock_owned_by_user(sk)) {
+ /* The local context ID may be out of date, update it. */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (sk->sk_state == SS_CONNECTED)
+ vmci_trans(vsk)->notify_ops->handle_notify_pkt(
+ sk, pkt, true, &dst, &src,
+ &bh_process_pkt);
+ }
+
+ bh_unlock_sock(sk);
+
+ if (!bh_process_pkt) {
+ struct vmci_transport_recv_pkt_info *recv_pkt_info;
+
+ recv_pkt_info = kmalloc(sizeof(*recv_pkt_info), GFP_ATOMIC);
+ if (!recv_pkt_info) {
+ if (vmci_transport_send_reset_bh(&dst, &src, pkt) < 0)
+ pr_err("unable to send reset\n");
+
+ err = VMCI_ERROR_NO_MEM;
+ goto out;
+ }
+
+ recv_pkt_info->sk = sk;
+ memcpy(&recv_pkt_info->pkt, pkt, sizeof(recv_pkt_info->pkt));
+ INIT_WORK(&recv_pkt_info->work, vmci_transport_recv_pkt_work);
+
+ schedule_work(&recv_pkt_info->work);
+ /* Clear sk so that the reference count incremented by one of
+ * the Find functions above is not decremented below. We need
+ * that reference count for the packet handler we've scheduled
+ * to run.
+ */
+ sk = NULL;
+ }
+
+out:
+ if (sk)
+ sock_put(sk);
+
+ return err;
+}
+
+static void vmci_transport_peer_attach_cb(u32 sub_id,
+ const struct vmci_event_data *e_data,
+ void *client_data)
+{
+ struct sock *sk = client_data;
+ const struct vmci_event_payload_qp *e_payload;
+ struct vsock_sock *vsk;
+
+ e_payload = vmci_event_data_const_payload(e_data);
+
+ vsk = vsock_sk(sk);
+
+ /* We don't ask for delayed CBs when we subscribe to this event (we
+ * pass 0 as flags to vmci_event_subscribe()). VMCI makes no
+ * guarantees in that case about what context we might be running in,
+ * so it could be BH or process, blockable or non-blockable. So we
+ * need to account for all possible contexts here.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+
+ /* XXX This is lame, we should provide a way to lookup sockets by
+ * qp_handle.
+ */
+ if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
+ e_payload->handle)) {
+ /* XXX This doesn't do anything, but in the future we may want
+ * to set a flag here to verify the attach really did occur and
+ * we weren't just sent a datagram claiming it was.
+ */
+ goto out;
+ }
+
+out:
+ bh_unlock_sock(sk);
+ local_bh_enable();
+}
+
+static void vmci_transport_handle_detach(struct sock *sk)
+{
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+ if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) {
+ sock_set_flag(sk, SOCK_DONE);
+
+ /* On a detach the peer will not be sending or receiving
+ * anymore.
+ */
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+
+ /* We should not be sending anymore since the peer won't be
+ * there to receive, but we can still receive if there is data
+ * left in our consume queue.
+ */
+ if (vsock_stream_has_data(vsk) <= 0) {
+ if (sk->sk_state == SS_CONNECTING) {
+ /* The peer may detach from a queue pair while
+ * we are still in the connecting state, i.e.,
+ * if the peer VM is killed after attaching to
+ * a queue pair, but before we complete the
+ * handshake. In that case, we treat the detach
+ * event like a reset.
+ */
+
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ return;
+ }
+ sk->sk_state = SS_UNCONNECTED;
+ }
+ sk->sk_state_change(sk);
+ }
+}
+
+static void vmci_transport_peer_detach_cb(u32 sub_id,
+ const struct vmci_event_data *e_data,
+ void *client_data)
+{
+ struct sock *sk = client_data;
+ const struct vmci_event_payload_qp *e_payload;
+ struct vsock_sock *vsk;
+
+ e_payload = vmci_event_data_const_payload(e_data);
+ vsk = vsock_sk(sk);
+ if (vmci_handle_is_invalid(e_payload->handle))
+ return;
+
+ /* Same rules for locking as for peer_attach_cb(). */
+ local_bh_disable();
+ bh_lock_sock(sk);
+
+ /* XXX This is lame, we should provide a way to lookup sockets by
+ * qp_handle.
+ */
+ if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle,
+ e_payload->handle))
+ vmci_transport_handle_detach(sk);
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+}
+
+static void vmci_transport_qp_resumed_cb(u32 sub_id,
+ const struct vmci_event_data *e_data,
+ void *client_data)
+{
+ vsock_for_each_connected_socket(vmci_transport_handle_detach);
+}
+
+static void vmci_transport_recv_pkt_work(struct work_struct *work)
+{
+ struct vmci_transport_recv_pkt_info *recv_pkt_info;
+ struct vmci_transport_packet *pkt;
+ struct sock *sk;
+
+ recv_pkt_info =
+ container_of(work, struct vmci_transport_recv_pkt_info, work);
+ sk = recv_pkt_info->sk;
+ pkt = &recv_pkt_info->pkt;
+
+ lock_sock(sk);
+
+ /* The local context ID may be out of date. */
+ vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context;
+
+ switch (sk->sk_state) {
+ case SS_LISTEN:
+ vmci_transport_recv_listen(sk, pkt);
+ break;
+ case SS_CONNECTING:
+ /* Processing of pending connections for servers goes through
+ * the listening socket, so see vmci_transport_recv_listen()
+ * for that path.
+ */
+ vmci_transport_recv_connecting_client(sk, pkt);
+ break;
+ case SS_CONNECTED:
+ vmci_transport_recv_connected(sk, pkt);
+ break;
+ default:
+ /* Because this function does not run in the same context as
+ * vmci_transport_recv_stream_cb it is possible that the
+ * socket has closed. We need to let the other side know or it
+ * could be sitting in a connect and hang forever. Send a
+ * reset to prevent that.
+ */
+ vmci_transport_send_reset(sk, pkt);
+ break;
+ }
+
+ release_sock(sk);
+ kfree(recv_pkt_info);
+ /* Release reference obtained in the stream callback when we fetched
+ * this socket out of the bound or connected list.
+ */
+ sock_put(sk);
+}
+
+static int vmci_transport_recv_listen(struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ struct sock *pending;
+ struct vsock_sock *vpending;
+ int err;
+ u64 qp_size;
+ bool old_request = false;
+ bool old_pkt_proto = false;
+
+ err = 0;
+
+ /* Because we are in the listen state, we could be receiving a packet
+ * for ourself or any previous connection requests that we received.
+ * If it's the latter, we try to find a socket in our list of pending
+ * connections and, if we do, call the appropriate handler for the
+ * state that that socket is in. Otherwise we try to service the
+ * connection request.
+ */
+ pending = vmci_transport_get_pending(sk, pkt);
+ if (pending) {
+ lock_sock(pending);
+
+ /* The local context ID may be out of date. */
+ vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context;
+
+ switch (pending->sk_state) {
+ case SS_CONNECTING:
+ err = vmci_transport_recv_connecting_server(sk,
+ pending,
+ pkt);
+ break;
+ default:
+ vmci_transport_send_reset(pending, pkt);
+ err = -EINVAL;
+ }
+
+ if (err < 0)
+ vsock_remove_pending(sk, pending);
+
+ release_sock(pending);
+ vmci_transport_release_pending(pending);
+
+ return err;
+ }
+
+ /* The listen state only accepts connection requests. Reply with a
+ * reset unless we received a reset.
+ */
+
+ if (!(pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST ||
+ pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)) {
+ vmci_transport_reply_reset(pkt);
+ return -EINVAL;
+ }
+
+ if (pkt->u.size == 0) {
+ vmci_transport_reply_reset(pkt);
+ return -EINVAL;
+ }
+
+ /* If this socket can't accommodate this connection request, we send a
+ * reset. Otherwise we create and initialize a child socket and reply
+ * with a connection negotiation.
+ */
+ if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) {
+ vmci_transport_reply_reset(pkt);
+ return -ECONNREFUSED;
+ }
+
+ pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+ sk->sk_type);
+ if (!pending) {
+ vmci_transport_send_reset(sk, pkt);
+ return -ENOMEM;
+ }
+
+ vpending = vsock_sk(pending);
+
+ vsock_addr_init(&vpending->local_addr, pkt->dg.dst.context,
+ pkt->dst_port);
+ vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context,
+ pkt->src_port);
+
+ /* If the proposed size fits within our min/max, accept it. Otherwise
+ * propose our own size.
+ */
+ if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size &&
+ pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) {
+ qp_size = pkt->u.size;
+ } else {
+ qp_size = vmci_trans(vpending)->queue_pair_size;
+ }
+
+ /* Figure out if we are using old or new requests based on the
+ * overrides pkt types sent by our peer.
+ */
+ if (vmci_transport_old_proto_override(&old_pkt_proto)) {
+ old_request = old_pkt_proto;
+ } else {
+ if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST)
+ old_request = true;
+ else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_REQUEST2)
+ old_request = false;
+
+ }
+
+ if (old_request) {
+ /* Handle a REQUEST (or override) */
+ u16 version = VSOCK_PROTO_INVALID;
+ if (vmci_transport_proto_to_notify_struct(
+ pending, &version, true))
+ err = vmci_transport_send_negotiate(pending, qp_size);
+ else
+ err = -EINVAL;
+
+ } else {
+ /* Handle a REQUEST2 (or override) */
+ int proto_int = pkt->proto;
+ int pos;
+ u16 active_proto_version = 0;
+
+ /* The list of possible protocols is the intersection of all
+ * protocols the client supports ... plus all the protocols we
+ * support.
+ */
+ proto_int &= vmci_transport_new_proto_supported_versions();
+
+ /* We choose the highest possible protocol version and use that
+ * one.
+ */
+ pos = fls(proto_int);
+ if (pos) {
+ active_proto_version = (1 << (pos - 1));
+ if (vmci_transport_proto_to_notify_struct(
+ pending, &active_proto_version, false))
+ err = vmci_transport_send_negotiate2(pending,
+ qp_size,
+ active_proto_version);
+ else
+ err = -EINVAL;
+
+ } else {
+ err = -EINVAL;
+ }
+ }
+
+ if (err < 0) {
+ vmci_transport_send_reset(sk, pkt);
+ sock_put(pending);
+ err = vmci_transport_error_to_vsock_error(err);
+ goto out;
+ }
+
+ vsock_add_pending(sk, pending);
+ sk->sk_ack_backlog++;
+
+ pending->sk_state = SS_CONNECTING;
+ vmci_trans(vpending)->produce_size =
+ vmci_trans(vpending)->consume_size = qp_size;
+ vmci_trans(vpending)->queue_pair_size = qp_size;
+
+ vmci_trans(vpending)->notify_ops->process_request(pending);
+
+ /* We might never receive another message for this socket and it's not
+ * connected to any process, so we have to ensure it gets cleaned up
+ * ourself. Our delayed work function will take care of that. Note
+ * that we do not ever cancel this function since we have few
+ * guarantees about its state when calling cancel_delayed_work().
+ * Instead we hold a reference on the socket for that function and make
+ * it capable of handling cases where it needs to do nothing but
+ * release that reference.
+ */
+ vpending->listener = sk;
+ sock_hold(sk);
+ sock_hold(pending);
+ INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work);
+ schedule_delayed_work(&vpending->dwork, HZ);
+
+out:
+ return err;
+}
+
+static int
+vmci_transport_recv_connecting_server(struct sock *listener,
+ struct sock *pending,
+ struct vmci_transport_packet *pkt)
+{
+ struct vsock_sock *vpending;
+ struct vmci_handle handle;
+ struct vmci_qp *qpair;
+ bool is_local;
+ u32 flags;
+ u32 detach_sub_id;
+ int err;
+ int skerr;
+
+ vpending = vsock_sk(pending);
+ detach_sub_id = VMCI_INVALID_ID;
+
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_OFFER:
+ if (vmci_handle_is_invalid(pkt->u.handle)) {
+ vmci_transport_send_reset(pending, pkt);
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+ break;
+ default:
+ /* Close and cleanup the connection. */
+ vmci_transport_send_reset(pending, pkt);
+ skerr = EPROTO;
+ err = pkt->type == VMCI_TRANSPORT_PACKET_TYPE_RST ? 0 : -EINVAL;
+ goto destroy;
+ }
+
+ /* In order to complete the connection we need to attach to the offered
+ * queue pair and send an attach notification. We also subscribe to the
+ * detach event so we know when our peer goes away, and we do that
+ * before attaching so we don't miss an event. If all this succeeds,
+ * we update our state and wakeup anything waiting in accept() for a
+ * connection.
+ */
+
+ /* We don't care about attach since we ensure the other side has
+ * attached by specifying the ATTACH_ONLY flag below.
+ */
+ err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
+ vmci_transport_peer_detach_cb,
+ pending, &detach_sub_id);
+ if (err < VMCI_SUCCESS) {
+ vmci_transport_send_reset(pending, pkt);
+ err = vmci_transport_error_to_vsock_error(err);
+ skerr = -err;
+ goto destroy;
+ }
+
+ vmci_trans(vpending)->detach_sub_id = detach_sub_id;
+
+ /* Now attach to the queue pair the client created. */
+ handle = pkt->u.handle;
+
+ /* vpending->local_addr always has a context id so we do not need to
+ * worry about VMADDR_CID_ANY in this case.
+ */
+ is_local =
+ vpending->remote_addr.svm_cid == vpending->local_addr.svm_cid;
+ flags = VMCI_QPFLAG_ATTACH_ONLY;
+ flags |= is_local ? VMCI_QPFLAG_LOCAL : 0;
+
+ err = vmci_transport_queue_pair_alloc(
+ &qpair,
+ &handle,
+ vmci_trans(vpending)->produce_size,
+ vmci_trans(vpending)->consume_size,
+ pkt->dg.src.context,
+ flags,
+ vmci_transport_is_trusted(
+ vpending,
+ vpending->remote_addr.svm_cid));
+ if (err < 0) {
+ vmci_transport_send_reset(pending, pkt);
+ skerr = -err;
+ goto destroy;
+ }
+
+ vmci_trans(vpending)->qp_handle = handle;
+ vmci_trans(vpending)->qpair = qpair;
+
+ /* When we send the attach message, we must be ready to handle incoming
+ * control messages on the newly connected socket. So we move the
+ * pending socket to the connected state before sending the attach
+ * message. Otherwise, an incoming packet triggered by the attach being
+ * received by the peer may be processed concurrently with what happens
+ * below after sending the attach message, and that incoming packet
+ * will find the listening socket instead of the (currently) pending
+ * socket. Note that enqueueing the socket increments the reference
+ * count, so even if a reset comes before the connection is accepted,
+ * the socket will be valid until it is removed from the queue.
+ *
+ * If we fail sending the attach below, we remove the socket from the
+ * connected list and move the socket to SS_UNCONNECTED before
+ * releasing the lock, so a pending slow path processing of an incoming
+ * packet will not see the socket in the connected state in that case.
+ */
+ pending->sk_state = SS_CONNECTED;
+
+ vsock_insert_connected(vpending);
+
+ /* Notify our peer of our attach. */
+ err = vmci_transport_send_attach(pending, handle);
+ if (err < 0) {
+ vsock_remove_connected(vpending);
+ pr_err("Could not send attach\n");
+ vmci_transport_send_reset(pending, pkt);
+ err = vmci_transport_error_to_vsock_error(err);
+ skerr = -err;
+ goto destroy;
+ }
+
+ /* We have a connection. Move the now connected socket from the
+ * listener's pending list to the accept queue so callers of accept()
+ * can find it.
+ */
+ vsock_remove_pending(listener, pending);
+ vsock_enqueue_accept(listener, pending);
+
+ /* Callers of accept() will be be waiting on the listening socket, not
+ * the pending socket.
+ */
+ listener->sk_state_change(listener);
+
+ return 0;
+
+destroy:
+ pending->sk_err = skerr;
+ pending->sk_state = SS_UNCONNECTED;
+ /* As long as we drop our reference, all necessary cleanup will handle
+ * when the cleanup function drops its reference and our destruct
+ * implementation is called. Note that since the listen handler will
+ * remove pending from the pending list upon our failure, the cleanup
+ * function won't drop the additional reference, which is why we do it
+ * here.
+ */
+ sock_put(pending);
+
+ return err;
+}
+
+static int
+vmci_transport_recv_connecting_client(struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ struct vsock_sock *vsk;
+ int err;
+ int skerr;
+
+ vsk = vsock_sk(sk);
+
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_ATTACH:
+ if (vmci_handle_is_invalid(pkt->u.handle) ||
+ !vmci_handle_is_equal(pkt->u.handle,
+ vmci_trans(vsk)->qp_handle)) {
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+
+ /* Signify the socket is connected and wakeup the waiter in
+ * connect(). Also place the socket in the connected table for
+ * accounting (it can already be found since it's in the bound
+ * table).
+ */
+ sk->sk_state = SS_CONNECTED;
+ sk->sk_socket->state = SS_CONNECTED;
+ vsock_insert_connected(vsk);
+ sk->sk_state_change(sk);
+
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE:
+ case VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2:
+ if (pkt->u.size == 0
+ || pkt->dg.src.context != vsk->remote_addr.svm_cid
+ || pkt->src_port != vsk->remote_addr.svm_port
+ || !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)
+ || vmci_trans(vsk)->qpair
+ || vmci_trans(vsk)->produce_size != 0
+ || vmci_trans(vsk)->consume_size != 0
+ || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID
+ || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
+ skerr = EPROTO;
+ err = -EINVAL;
+
+ goto destroy;
+ }
+
+ err = vmci_transport_recv_connecting_client_negotiate(sk, pkt);
+ if (err) {
+ skerr = -err;
+ goto destroy;
+ }
+
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_INVALID:
+ err = vmci_transport_recv_connecting_client_invalid(sk, pkt);
+ if (err) {
+ skerr = -err;
+ goto destroy;
+ }
+
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_RST:
+ /* Older versions of the linux code (WS 6.5 / ESX 4.0) used to
+ * continue processing here after they sent an INVALID packet.
+ * This meant that we got a RST after the INVALID. We ignore a
+ * RST after an INVALID. The common code doesn't send the RST
+ * ... so we can hang if an old version of the common code
+ * fails between getting a REQUEST and sending an OFFER back.
+ * Not much we can do about it... except hope that it doesn't
+ * happen.
+ */
+ if (vsk->ignore_connecting_rst) {
+ vsk->ignore_connecting_rst = false;
+ } else {
+ skerr = ECONNRESET;
+ err = 0;
+ goto destroy;
+ }
+
+ break;
+ default:
+ /* Close and cleanup the connection. */
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+
+ return 0;
+
+destroy:
+ vmci_transport_send_reset(sk, pkt);
+
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = skerr;
+ sk->sk_error_report(sk);
+ return err;
+}
+
+static int vmci_transport_recv_connecting_client_negotiate(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ int err;
+ struct vsock_sock *vsk;
+ struct vmci_handle handle;
+ struct vmci_qp *qpair;
+ u32 attach_sub_id;
+ u32 detach_sub_id;
+ bool is_local;
+ u32 flags;
+ bool old_proto = true;
+ bool old_pkt_proto;
+ u16 version;
+
+ vsk = vsock_sk(sk);
+ handle = VMCI_INVALID_HANDLE;
+ attach_sub_id = VMCI_INVALID_ID;
+ detach_sub_id = VMCI_INVALID_ID;
+
+ /* If we have gotten here then we should be past the point where old
+ * linux vsock could have sent the bogus rst.
+ */
+ vsk->sent_request = false;
+ vsk->ignore_connecting_rst = false;
+
+ /* Verify that we're OK with the proposed queue pair size */
+ if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size ||
+ pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) {
+ err = -EINVAL;
+ goto destroy;
+ }
+
+ /* At this point we know the CID the peer is using to talk to us. */
+
+ if (vsk->local_addr.svm_cid == VMADDR_CID_ANY)
+ vsk->local_addr.svm_cid = pkt->dg.dst.context;
+
+ /* Setup the notify ops to be the highest supported version that both
+ * the server and the client support.
+ */
+
+ if (vmci_transport_old_proto_override(&old_pkt_proto)) {
+ old_proto = old_pkt_proto;
+ } else {
+ if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE)
+ old_proto = true;
+ else if (pkt->type == VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2)
+ old_proto = false;
+
+ }
+
+ if (old_proto)
+ version = VSOCK_PROTO_INVALID;
+ else
+ version = pkt->proto;
+
+ if (!vmci_transport_proto_to_notify_struct(sk, &version, old_proto)) {
+ err = -EINVAL;
+ goto destroy;
+ }
+
+ /* Subscribe to attach and detach events first.
+ *
+ * XXX We attach once for each queue pair created for now so it is easy
+ * to find the socket (it's provided), but later we should only
+ * subscribe once and add a way to lookup sockets by queue pair handle.
+ */
+ err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH,
+ vmci_transport_peer_attach_cb,
+ sk, &attach_sub_id);
+ if (err < VMCI_SUCCESS) {
+ err = vmci_transport_error_to_vsock_error(err);
+ goto destroy;
+ }
+
+ err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,
+ vmci_transport_peer_detach_cb,
+ sk, &detach_sub_id);
+ if (err < VMCI_SUCCESS) {
+ err = vmci_transport_error_to_vsock_error(err);
+ goto destroy;
+ }
+
+ /* Make VMCI select the handle for us. */
+ handle = VMCI_INVALID_HANDLE;
+ is_local = vsk->remote_addr.svm_cid == vsk->local_addr.svm_cid;
+ flags = is_local ? VMCI_QPFLAG_LOCAL : 0;
+
+ err = vmci_transport_queue_pair_alloc(&qpair,
+ &handle,
+ pkt->u.size,
+ pkt->u.size,
+ vsk->remote_addr.svm_cid,
+ flags,
+ vmci_transport_is_trusted(
+ vsk,
+ vsk->
+ remote_addr.svm_cid));
+ if (err < 0)
+ goto destroy;
+
+ err = vmci_transport_send_qp_offer(sk, handle);
+ if (err < 0) {
+ err = vmci_transport_error_to_vsock_error(err);
+ goto destroy;
+ }
+
+ vmci_trans(vsk)->qp_handle = handle;
+ vmci_trans(vsk)->qpair = qpair;
+
+ vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size =
+ pkt->u.size;
+
+ vmci_trans(vsk)->attach_sub_id = attach_sub_id;
+ vmci_trans(vsk)->detach_sub_id = detach_sub_id;
+
+ vmci_trans(vsk)->notify_ops->process_negotiate(sk);
+
+ return 0;
+
+destroy:
+ if (attach_sub_id != VMCI_INVALID_ID)
+ vmci_event_unsubscribe(attach_sub_id);
+
+ if (detach_sub_id != VMCI_INVALID_ID)
+ vmci_event_unsubscribe(detach_sub_id);
+
+ if (!vmci_handle_is_invalid(handle))
+ vmci_qpair_detach(&qpair);
+
+ return err;
+}
+
+static int
+vmci_transport_recv_connecting_client_invalid(struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ int err = 0;
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (vsk->sent_request) {
+ vsk->sent_request = false;
+ vsk->ignore_connecting_rst = true;
+
+ err = vmci_transport_send_conn_request(
+ sk, vmci_trans(vsk)->queue_pair_size);
+ if (err < 0)
+ err = vmci_transport_error_to_vsock_error(err);
+ else
+ err = 0;
+
+ }
+
+ return err;
+}
+
+static int vmci_transport_recv_connected(struct sock *sk,
+ struct vmci_transport_packet *pkt)
+{
+ struct vsock_sock *vsk;
+ bool pkt_processed = false;
+
+ /* In cases where we are closing the connection, it's sufficient to
+ * mark the state change (and maybe error) and wake up any waiting
+ * threads. Since this is a connected socket, it's owned by a user
+ * process and will be cleaned up when the failure is passed back on
+ * the current or next system call. Our system call implementations
+ * must therefore check for error and state changes on entry and when
+ * being awoken.
+ */
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN:
+ if (pkt->u.mode) {
+ vsk = vsock_sk(sk);
+
+ vsk->peer_shutdown |= pkt->u.mode;
+ sk->sk_state_change(sk);
+ }
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_RST:
+ vsk = vsock_sk(sk);
+ /* It is possible that we sent our peer a message (e.g a
+ * WAITING_READ) right before we got notified that the peer had
+ * detached. If that happens then we can get a RST pkt back
+ * from our peer even though there is data available for us to
+ * read. In that case, don't shutdown the socket completely but
+ * instead allow the local client to finish reading data off
+ * the queuepair. Always treat a RST pkt in connected mode like
+ * a clean shutdown.
+ */
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ if (vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+
+ sk->sk_state_change(sk);
+ break;
+
+ default:
+ vsk = vsock_sk(sk);
+ vmci_trans(vsk)->notify_ops->handle_notify_pkt(
+ sk, pkt, false, NULL, NULL,
+ &pkt_processed);
+ if (!pkt_processed)
+ return -EINVAL;
+
+ break;
+ }
+
+ return 0;
+}
+
+static int vmci_transport_socket_init(struct vsock_sock *vsk,
+ struct vsock_sock *psk)
+{
+ vsk->trans = kmalloc(sizeof(struct vmci_transport), GFP_KERNEL);
+ if (!vsk->trans)
+ return -ENOMEM;
+
+ vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
+ vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
+ vmci_trans(vsk)->qpair = NULL;
+ vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0;
+ vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id =
+ VMCI_INVALID_ID;
+ vmci_trans(vsk)->notify_ops = NULL;
+ if (psk) {
+ vmci_trans(vsk)->queue_pair_size =
+ vmci_trans(psk)->queue_pair_size;
+ vmci_trans(vsk)->queue_pair_min_size =
+ vmci_trans(psk)->queue_pair_min_size;
+ vmci_trans(vsk)->queue_pair_max_size =
+ vmci_trans(psk)->queue_pair_max_size;
+ } else {
+ vmci_trans(vsk)->queue_pair_size =
+ VMCI_TRANSPORT_DEFAULT_QP_SIZE;
+ vmci_trans(vsk)->queue_pair_min_size =
+ VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN;
+ vmci_trans(vsk)->queue_pair_max_size =
+ VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX;
+ }
+
+ return 0;
+}
+
+static void vmci_transport_destruct(struct vsock_sock *vsk)
+{
+ if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) {
+ vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id);
+ vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID;
+ }
+
+ if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {
+ vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id);
+ vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID;
+ }
+
+ if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) {
+ vmci_qpair_detach(&vmci_trans(vsk)->qpair);
+ vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;
+ vmci_trans(vsk)->produce_size = 0;
+ vmci_trans(vsk)->consume_size = 0;
+ }
+
+ if (vmci_trans(vsk)->notify_ops)
+ vmci_trans(vsk)->notify_ops->socket_destruct(vsk);
+
+ kfree(vsk->trans);
+ vsk->trans = NULL;
+}
+
+static void vmci_transport_release(struct vsock_sock *vsk)
+{
+ if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
+ vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
+ vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
+ }
+}
+
+static int vmci_transport_dgram_bind(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ u32 port;
+ u32 flags;
+ int err;
+
+ /* VMCI will select a resource ID for us if we provide
+ * VMCI_INVALID_ID.
+ */
+ port = addr->svm_port == VMADDR_PORT_ANY ?
+ VMCI_INVALID_ID : addr->svm_port;
+
+ if (port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ flags = addr->svm_cid == VMADDR_CID_ANY ?
+ VMCI_FLAG_ANYCID_DG_HND : 0;
+
+ err = vmci_transport_datagram_create_hnd(port, flags,
+ vmci_transport_recv_dgram_cb,
+ &vsk->sk,
+ &vmci_trans(vsk)->dg_handle);
+ if (err < VMCI_SUCCESS)
+ return vmci_transport_error_to_vsock_error(err);
+ vsock_addr_init(&vsk->local_addr, addr->svm_cid,
+ vmci_trans(vsk)->dg_handle.resource);
+
+ return 0;
+}
+
+static int vmci_transport_dgram_enqueue(
+ struct vsock_sock *vsk,
+ struct sockaddr_vm *remote_addr,
+ struct msghdr *msg,
+ size_t len)
+{
+ int err;
+ struct vmci_datagram *dg;
+
+ if (len > VMCI_MAX_DG_PAYLOAD_SIZE)
+ return -EMSGSIZE;
+
+ if (!vmci_transport_allow_dgram(vsk, remote_addr->svm_cid))
+ return -EPERM;
+
+ /* Allocate a buffer for the user's message and our packet header. */
+ dg = kmalloc(len + sizeof(*dg), GFP_KERNEL);
+ if (!dg)
+ return -ENOMEM;
+
+ memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len);
+
+ dg->dst = vmci_make_handle(remote_addr->svm_cid,
+ remote_addr->svm_port);
+ dg->src = vmci_make_handle(vsk->local_addr.svm_cid,
+ vsk->local_addr.svm_port);
+ dg->payload_size = len;
+
+ err = vmci_datagram_send(dg);
+ kfree(dg);
+ if (err < 0)
+ return vmci_transport_error_to_vsock_error(err);
+
+ return err - sizeof(*dg);
+}
+
+static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg, size_t len,
+ int flags)
+{
+ int err;
+ int noblock;
+ struct vmci_datagram *dg;
+ size_t payload_len;
+ struct sk_buff *skb;
+
+ noblock = flags & MSG_DONTWAIT;
+
+ if (flags & MSG_OOB || flags & MSG_ERRQUEUE)
+ return -EOPNOTSUPP;
+
+ /* Retrieve the head sk_buff from the socket's receive queue. */
+ err = 0;
+ skb = skb_recv_datagram(&vsk->sk, flags, noblock, &err);
+ if (err)
+ return err;
+
+ if (!skb)
+ return -EAGAIN;
+
+ dg = (struct vmci_datagram *)skb->data;
+ if (!dg)
+ /* err is 0, meaning we read zero bytes. */
+ goto out;
+
+ payload_len = dg->payload_size;
+ /* Ensure the sk_buff matches the payload size claimed in the packet. */
+ if (payload_len != skb->len - sizeof(*dg)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (payload_len > len) {
+ payload_len = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ /* Place the datagram payload in the user's iovec. */
+ err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len);
+ if (err)
+ goto out;
+
+ if (msg->msg_name) {
+ /* Provide the address of the sender. */
+ DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name);
+ vsock_addr_init(vm_addr, dg->src.context, dg->src.resource);
+ msg->msg_namelen = sizeof(*vm_addr);
+ }
+ err = payload_len;
+
+out:
+ skb_free_datagram(&vsk->sk, skb);
+ return err;
+}
+
+static bool vmci_transport_dgram_allow(u32 cid, u32 port)
+{
+ if (cid == VMADDR_CID_HYPERVISOR) {
+ /* Registrations of PBRPC Servers do not modify VMX/Hypervisor
+ * state and are allowed.
+ */
+ return port == VMCI_UNITY_PBRPC_REGISTER;
+ }
+
+ return true;
+}
+
+static int vmci_transport_connect(struct vsock_sock *vsk)
+{
+ int err;
+ bool old_pkt_proto = false;
+ struct sock *sk = &vsk->sk;
+
+ if (vmci_transport_old_proto_override(&old_pkt_proto) &&
+ old_pkt_proto) {
+ err = vmci_transport_send_conn_request(
+ sk, vmci_trans(vsk)->queue_pair_size);
+ if (err < 0) {
+ sk->sk_state = SS_UNCONNECTED;
+ return err;
+ }
+ } else {
+ int supported_proto_versions =
+ vmci_transport_new_proto_supported_versions();
+ err = vmci_transport_send_conn_request2(
+ sk, vmci_trans(vsk)->queue_pair_size,
+ supported_proto_versions);
+ if (err < 0) {
+ sk->sk_state = SS_UNCONNECTED;
+ return err;
+ }
+
+ vsk->sent_request = true;
+ }
+
+ return err;
+}
+
+static ssize_t vmci_transport_stream_dequeue(
+ struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len,
+ int flags)
+{
+ if (flags & MSG_PEEK)
+ return vmci_qpair_peekv(vmci_trans(vsk)->qpair, msg, len, 0);
+ else
+ return vmci_qpair_dequev(vmci_trans(vsk)->qpair, msg, len, 0);
+}
+
+static ssize_t vmci_transport_stream_enqueue(
+ struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0);
+}
+
+static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk)
+{
+ return vmci_qpair_consume_buf_ready(vmci_trans(vsk)->qpair);
+}
+
+static s64 vmci_transport_stream_has_space(struct vsock_sock *vsk)
+{
+ return vmci_qpair_produce_free_space(vmci_trans(vsk)->qpair);
+}
+
+static u64 vmci_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+ return vmci_trans(vsk)->consume_size;
+}
+
+static bool vmci_transport_stream_is_active(struct vsock_sock *vsk)
+{
+ return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle);
+}
+
+static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+ return vmci_trans(vsk)->queue_pair_size;
+}
+
+static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+ return vmci_trans(vsk)->queue_pair_min_size;
+}
+
+static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+ return vmci_trans(vsk)->queue_pair_max_size;
+}
+
+static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ if (val < vmci_trans(vsk)->queue_pair_min_size)
+ vmci_trans(vsk)->queue_pair_min_size = val;
+ if (val > vmci_trans(vsk)->queue_pair_max_size)
+ vmci_trans(vsk)->queue_pair_max_size = val;
+ vmci_trans(vsk)->queue_pair_size = val;
+}
+
+static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk,
+ u64 val)
+{
+ if (val > vmci_trans(vsk)->queue_pair_size)
+ vmci_trans(vsk)->queue_pair_size = val;
+ vmci_trans(vsk)->queue_pair_min_size = val;
+}
+
+static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk,
+ u64 val)
+{
+ if (val < vmci_trans(vsk)->queue_pair_size)
+ vmci_trans(vsk)->queue_pair_size = val;
+ vmci_trans(vsk)->queue_pair_max_size = val;
+}
+
+static int vmci_transport_notify_poll_in(
+ struct vsock_sock *vsk,
+ size_t target,
+ bool *data_ready_now)
+{
+ return vmci_trans(vsk)->notify_ops->poll_in(
+ &vsk->sk, target, data_ready_now);
+}
+
+static int vmci_transport_notify_poll_out(
+ struct vsock_sock *vsk,
+ size_t target,
+ bool *space_available_now)
+{
+ return vmci_trans(vsk)->notify_ops->poll_out(
+ &vsk->sk, target, space_available_now);
+}
+
+static int vmci_transport_notify_recv_init(
+ struct vsock_sock *vsk,
+ size_t target,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->recv_init(
+ &vsk->sk, target,
+ (struct vmci_transport_recv_notify_data *)data);
+}
+
+static int vmci_transport_notify_recv_pre_block(
+ struct vsock_sock *vsk,
+ size_t target,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->recv_pre_block(
+ &vsk->sk, target,
+ (struct vmci_transport_recv_notify_data *)data);
+}
+
+static int vmci_transport_notify_recv_pre_dequeue(
+ struct vsock_sock *vsk,
+ size_t target,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->recv_pre_dequeue(
+ &vsk->sk, target,
+ (struct vmci_transport_recv_notify_data *)data);
+}
+
+static int vmci_transport_notify_recv_post_dequeue(
+ struct vsock_sock *vsk,
+ size_t target,
+ ssize_t copied,
+ bool data_read,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->recv_post_dequeue(
+ &vsk->sk, target, copied, data_read,
+ (struct vmci_transport_recv_notify_data *)data);
+}
+
+static int vmci_transport_notify_send_init(
+ struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->send_init(
+ &vsk->sk,
+ (struct vmci_transport_send_notify_data *)data);
+}
+
+static int vmci_transport_notify_send_pre_block(
+ struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->send_pre_block(
+ &vsk->sk,
+ (struct vmci_transport_send_notify_data *)data);
+}
+
+static int vmci_transport_notify_send_pre_enqueue(
+ struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->send_pre_enqueue(
+ &vsk->sk,
+ (struct vmci_transport_send_notify_data *)data);
+}
+
+static int vmci_transport_notify_send_post_enqueue(
+ struct vsock_sock *vsk,
+ ssize_t written,
+ struct vsock_transport_send_notify_data *data)
+{
+ return vmci_trans(vsk)->notify_ops->send_post_enqueue(
+ &vsk->sk, written,
+ (struct vmci_transport_send_notify_data *)data);
+}
+
+static bool vmci_transport_old_proto_override(bool *old_pkt_proto)
+{
+ if (PROTOCOL_OVERRIDE != -1) {
+ if (PROTOCOL_OVERRIDE == 0)
+ *old_pkt_proto = true;
+ else
+ *old_pkt_proto = false;
+
+ pr_info("Proto override in use\n");
+ return true;
+ }
+
+ return false;
+}
+
+static bool vmci_transport_proto_to_notify_struct(struct sock *sk,
+ u16 *proto,
+ bool old_pkt_proto)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (old_pkt_proto) {
+ if (*proto != VSOCK_PROTO_INVALID) {
+ pr_err("Can't set both an old and new protocol\n");
+ return false;
+ }
+ vmci_trans(vsk)->notify_ops = &vmci_transport_notify_pkt_ops;
+ goto exit;
+ }
+
+ switch (*proto) {
+ case VSOCK_PROTO_PKT_ON_NOTIFY:
+ vmci_trans(vsk)->notify_ops =
+ &vmci_transport_notify_pkt_q_state_ops;
+ break;
+ default:
+ pr_err("Unknown notify protocol version\n");
+ return false;
+ }
+
+exit:
+ vmci_trans(vsk)->notify_ops->socket_init(sk);
+ return true;
+}
+
+static u16 vmci_transport_new_proto_supported_versions(void)
+{
+ if (PROTOCOL_OVERRIDE != -1)
+ return PROTOCOL_OVERRIDE;
+
+ return VSOCK_PROTO_ALL_SUPPORTED;
+}
+
+static u32 vmci_transport_get_local_cid(void)
+{
+ return vmci_get_context_id();
+}
+
+static struct vsock_transport vmci_transport = {
+ .init = vmci_transport_socket_init,
+ .destruct = vmci_transport_destruct,
+ .release = vmci_transport_release,
+ .connect = vmci_transport_connect,
+ .dgram_bind = vmci_transport_dgram_bind,
+ .dgram_dequeue = vmci_transport_dgram_dequeue,
+ .dgram_enqueue = vmci_transport_dgram_enqueue,
+ .dgram_allow = vmci_transport_dgram_allow,
+ .stream_dequeue = vmci_transport_stream_dequeue,
+ .stream_enqueue = vmci_transport_stream_enqueue,
+ .stream_has_data = vmci_transport_stream_has_data,
+ .stream_has_space = vmci_transport_stream_has_space,
+ .stream_rcvhiwat = vmci_transport_stream_rcvhiwat,
+ .stream_is_active = vmci_transport_stream_is_active,
+ .stream_allow = vmci_transport_stream_allow,
+ .notify_poll_in = vmci_transport_notify_poll_in,
+ .notify_poll_out = vmci_transport_notify_poll_out,
+ .notify_recv_init = vmci_transport_notify_recv_init,
+ .notify_recv_pre_block = vmci_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = vmci_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = vmci_transport_notify_recv_post_dequeue,
+ .notify_send_init = vmci_transport_notify_send_init,
+ .notify_send_pre_block = vmci_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue,
+ .shutdown = vmci_transport_shutdown,
+ .set_buffer_size = vmci_transport_set_buffer_size,
+ .set_min_buffer_size = vmci_transport_set_min_buffer_size,
+ .set_max_buffer_size = vmci_transport_set_max_buffer_size,
+ .get_buffer_size = vmci_transport_get_buffer_size,
+ .get_min_buffer_size = vmci_transport_get_min_buffer_size,
+ .get_max_buffer_size = vmci_transport_get_max_buffer_size,
+ .get_local_cid = vmci_transport_get_local_cid,
+};
+
+static int __init vmci_transport_init(void)
+{
+ int err;
+
+ /* Create the datagram handle that we will use to send and receive all
+ * VSocket control messages for this context.
+ */
+ err = vmci_transport_datagram_create_hnd(VMCI_TRANSPORT_PACKET_RID,
+ VMCI_FLAG_ANYCID_DG_HND,
+ vmci_transport_recv_stream_cb,
+ NULL,
+ &vmci_transport_stream_handle);
+ if (err < VMCI_SUCCESS) {
+ pr_err("Unable to create datagram handle. (%d)\n", err);
+ return vmci_transport_error_to_vsock_error(err);
+ }
+
+ err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED,
+ vmci_transport_qp_resumed_cb,
+ NULL, &vmci_transport_qp_resumed_sub_id);
+ if (err < VMCI_SUCCESS) {
+ pr_err("Unable to subscribe to resumed event. (%d)\n", err);
+ err = vmci_transport_error_to_vsock_error(err);
+ vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
+ goto err_destroy_stream_handle;
+ }
+
+ err = vsock_core_init(&vmci_transport);
+ if (err < 0)
+ goto err_unsubscribe;
+
+ return 0;
+
+err_unsubscribe:
+ vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
+err_destroy_stream_handle:
+ vmci_datagram_destroy_handle(vmci_transport_stream_handle);
+ return err;
+}
+module_init(vmci_transport_init);
+
+static void __exit vmci_transport_exit(void)
+{
+ if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) {
+ if (vmci_datagram_destroy_handle(
+ vmci_transport_stream_handle) != VMCI_SUCCESS)
+ pr_err("Couldn't destroy datagram handle\n");
+ vmci_transport_stream_handle = VMCI_INVALID_HANDLE;
+ }
+
+ if (vmci_transport_qp_resumed_sub_id != VMCI_INVALID_ID) {
+ vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
+ vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
+ }
+
+ vsock_core_exit();
+}
+module_exit(vmci_transport_exit);
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("vmware_vsock");
+MODULE_ALIAS_NETPROTO(PF_VSOCK);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
new file mode 100644
index 000000000..ce6c9623d
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport.h
@@ -0,0 +1,142 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _VMCI_TRANSPORT_H_
+#define _VMCI_TRANSPORT_H_
+
+#include <linux/vmw_vmci_defs.h>
+#include <linux/vmw_vmci_api.h>
+
+#include <net/vsock_addr.h>
+#include <net/af_vsock.h>
+
+/* If the packet format changes in a release then this should change too. */
+#define VMCI_TRANSPORT_PACKET_VERSION 1
+
+/* The resource ID on which control packets are sent. */
+#define VMCI_TRANSPORT_PACKET_RID 1
+
+/* The resource ID on which control packets are sent to the hypervisor. */
+#define VMCI_TRANSPORT_HYPERVISOR_PACKET_RID 15
+
+#define VSOCK_PROTO_INVALID 0
+#define VSOCK_PROTO_PKT_ON_NOTIFY (1 << 0)
+#define VSOCK_PROTO_ALL_SUPPORTED (VSOCK_PROTO_PKT_ON_NOTIFY)
+
+#define vmci_trans(_vsk) ((struct vmci_transport *)((_vsk)->trans))
+
+enum vmci_transport_packet_type {
+ VMCI_TRANSPORT_PACKET_TYPE_INVALID = 0,
+ VMCI_TRANSPORT_PACKET_TYPE_REQUEST,
+ VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE,
+ VMCI_TRANSPORT_PACKET_TYPE_OFFER,
+ VMCI_TRANSPORT_PACKET_TYPE_ATTACH,
+ VMCI_TRANSPORT_PACKET_TYPE_WROTE,
+ VMCI_TRANSPORT_PACKET_TYPE_READ,
+ VMCI_TRANSPORT_PACKET_TYPE_RST,
+ VMCI_TRANSPORT_PACKET_TYPE_SHUTDOWN,
+ VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE,
+ VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ,
+ VMCI_TRANSPORT_PACKET_TYPE_REQUEST2,
+ VMCI_TRANSPORT_PACKET_TYPE_NEGOTIATE2,
+ VMCI_TRANSPORT_PACKET_TYPE_MAX
+};
+
+struct vmci_transport_waiting_info {
+ u64 generation;
+ u64 offset;
+};
+
+/* Control packet type for STREAM sockets. DGRAMs have no control packets nor
+ * special packet header for data packets, they are just raw VMCI DGRAM
+ * messages. For STREAMs, control packets are sent over the control channel
+ * while data is written and read directly from queue pairs with no packet
+ * format.
+ */
+struct vmci_transport_packet {
+ struct vmci_datagram dg;
+ u8 version;
+ u8 type;
+ u16 proto;
+ u32 src_port;
+ u32 dst_port;
+ u32 _reserved2;
+ union {
+ u64 size;
+ u64 mode;
+ struct vmci_handle handle;
+ struct vmci_transport_waiting_info wait;
+ } u;
+};
+
+struct vmci_transport_notify_pkt {
+ u64 write_notify_window;
+ u64 write_notify_min_window;
+ bool peer_waiting_read;
+ bool peer_waiting_write;
+ bool peer_waiting_write_detected;
+ bool sent_waiting_read;
+ bool sent_waiting_write;
+ struct vmci_transport_waiting_info peer_waiting_read_info;
+ struct vmci_transport_waiting_info peer_waiting_write_info;
+ u64 produce_q_generation;
+ u64 consume_q_generation;
+};
+
+struct vmci_transport_notify_pkt_q_state {
+ u64 write_notify_window;
+ u64 write_notify_min_window;
+ bool peer_waiting_write;
+ bool peer_waiting_write_detected;
+};
+
+union vmci_transport_notify {
+ struct vmci_transport_notify_pkt pkt;
+ struct vmci_transport_notify_pkt_q_state pkt_q_state;
+};
+
+/* Our transport-specific data. */
+struct vmci_transport {
+ /* For DGRAMs. */
+ struct vmci_handle dg_handle;
+ /* For STREAMs. */
+ struct vmci_handle qp_handle;
+ struct vmci_qp *qpair;
+ u64 produce_size;
+ u64 consume_size;
+ u64 queue_pair_size;
+ u64 queue_pair_min_size;
+ u64 queue_pair_max_size;
+ u32 attach_sub_id;
+ u32 detach_sub_id;
+ union vmci_transport_notify notify;
+ struct vmci_transport_notify_ops *notify_ops;
+};
+
+int vmci_transport_register(void);
+void vmci_transport_unregister(void);
+
+int vmci_transport_send_wrote_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src);
+int vmci_transport_send_read_bh(struct sockaddr_vm *dst,
+ struct sockaddr_vm *src);
+int vmci_transport_send_wrote(struct sock *sk);
+int vmci_transport_send_read(struct sock *sk);
+int vmci_transport_send_waiting_write(struct sock *sk,
+ struct vmci_transport_waiting_info *wait);
+int vmci_transport_send_waiting_read(struct sock *sk,
+ struct vmci_transport_waiting_info *wait);
+
+#endif
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
new file mode 100644
index 000000000..9b7f207f2
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -0,0 +1,680 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <net/sock.h>
+
+#include "vmci_transport_notify.h"
+
+#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
+
+static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ bool retval;
+ u64 notify_limit;
+
+ if (!PKT_FIELD(vsk, peer_waiting_write))
+ return false;
+
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+ /* When the sender blocks, we take that as a sign that the sender is
+ * faster than the receiver. To reduce the transmit rate of the sender,
+ * we delay the sending of the read notification by decreasing the
+ * write_notify_window. The notification is delayed until the number of
+ * bytes used in the queue drops below the write_notify_window.
+ */
+
+ if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
+ PKT_FIELD(vsk, peer_waiting_write_detected) = true;
+ if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+ } else {
+ PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
+ if (PKT_FIELD(vsk, write_notify_window) <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+
+ }
+ }
+ notify_limit = vmci_trans(vsk)->consume_size -
+ PKT_FIELD(vsk, write_notify_window);
+#else
+ notify_limit = 0;
+#endif
+
+ /* For now we ignore the wait information and just see if the free
+ * space exceeds the notify limit. Note that improving this function
+ * to be more intelligent will not require a protocol change and will
+ * retain compatibility between endpoints with mixed versions of this
+ * function.
+ *
+ * The notify_limit is used to delay notifications in the case where
+ * flow control is enabled. Below the test is expressed in terms of
+ * free space in the queue: if free_space > ConsumeSize -
+ * write_notify_window then notify An alternate way of expressing this
+ * is to rewrite the expression to use the data ready in the receive
+ * queue: if write_notify_window > bufferReady then notify as
+ * free_space == ConsumeSize - bufferReady.
+ */
+ retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
+ notify_limit;
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+ if (retval) {
+ /*
+ * Once we notify the peer, we reset the detected flag so the
+ * next wait will again cause a decrease in the window size.
+ */
+
+ PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+ }
+#endif
+ return retval;
+#else
+ return true;
+#endif
+}
+
+static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ if (!PKT_FIELD(vsk, peer_waiting_read))
+ return false;
+
+ /* For now we ignore the wait information and just see if there is any
+ * data for our peer to read. Note that improving this function to be
+ * more intelligent will not require a protocol change and will retain
+ * compatibility between endpoints with mixed versions of this
+ * function.
+ */
+ return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
+#else
+ return true;
+#endif
+}
+
+static void
+vmci_transport_handle_waiting_read(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst,
+ struct sockaddr_vm *src)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, peer_waiting_read) = true;
+ memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
+ sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
+
+ if (vmci_transport_notify_waiting_read(vsk)) {
+ bool sent;
+
+ if (bottom_half)
+ sent = vmci_transport_send_wrote_bh(dst, src) > 0;
+ else
+ sent = vmci_transport_send_wrote(sk) > 0;
+
+ if (sent)
+ PKT_FIELD(vsk, peer_waiting_read) = false;
+ }
+#endif
+}
+
+static void
+vmci_transport_handle_waiting_write(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst,
+ struct sockaddr_vm *src)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, peer_waiting_write) = true;
+ memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
+ sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
+
+ if (vmci_transport_notify_waiting_write(vsk)) {
+ bool sent;
+
+ if (bottom_half)
+ sent = vmci_transport_send_read_bh(dst, src) > 0;
+ else
+ sent = vmci_transport_send_read(sk) > 0;
+
+ if (sent)
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+ }
+#endif
+}
+
+static void
+vmci_transport_handle_read(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst, struct sockaddr_vm *src)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk;
+
+ vsk = vsock_sk(sk);
+ PKT_FIELD(vsk, sent_waiting_write) = false;
+#endif
+
+ sk->sk_write_space(sk);
+}
+
+static bool send_waiting_read(struct sock *sk, u64 room_needed)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk;
+ struct vmci_transport_waiting_info waiting_info;
+ u64 tail;
+ u64 head;
+ u64 room_left;
+ bool ret;
+
+ vsk = vsock_sk(sk);
+
+ if (PKT_FIELD(vsk, sent_waiting_read))
+ return true;
+
+ if (PKT_FIELD(vsk, write_notify_window) <
+ vmci_trans(vsk)->consume_size)
+ PKT_FIELD(vsk, write_notify_window) =
+ min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
+ vmci_trans(vsk)->consume_size);
+
+ vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
+ room_left = vmci_trans(vsk)->consume_size - head;
+ if (room_needed >= room_left) {
+ waiting_info.offset = room_needed - room_left;
+ waiting_info.generation =
+ PKT_FIELD(vsk, consume_q_generation) + 1;
+ } else {
+ waiting_info.offset = head + room_needed;
+ waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
+ }
+
+ ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
+ if (ret)
+ PKT_FIELD(vsk, sent_waiting_read) = true;
+
+ return ret;
+#else
+ return true;
+#endif
+}
+
+static bool send_waiting_write(struct sock *sk, u64 room_needed)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk;
+ struct vmci_transport_waiting_info waiting_info;
+ u64 tail;
+ u64 head;
+ u64 room_left;
+ bool ret;
+
+ vsk = vsock_sk(sk);
+
+ if (PKT_FIELD(vsk, sent_waiting_write))
+ return true;
+
+ vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
+ room_left = vmci_trans(vsk)->produce_size - tail;
+ if (room_needed + 1 >= room_left) {
+ /* Wraps around to current generation. */
+ waiting_info.offset = room_needed + 1 - room_left;
+ waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
+ } else {
+ waiting_info.offset = tail + room_needed + 1;
+ waiting_info.generation =
+ PKT_FIELD(vsk, produce_q_generation) - 1;
+ }
+
+ ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
+ if (ret)
+ PKT_FIELD(vsk, sent_waiting_write) = true;
+
+ return ret;
+#else
+ return true;
+#endif
+}
+
+static int vmci_transport_send_read_notification(struct sock *sk)
+{
+ struct vsock_sock *vsk;
+ bool sent_read;
+ unsigned int retries;
+ int err;
+
+ vsk = vsock_sk(sk);
+ sent_read = false;
+ retries = 0;
+ err = 0;
+
+ if (vmci_transport_notify_waiting_write(vsk)) {
+ /* Notify the peer that we have read, retrying the send on
+ * failure up to our maximum value. XXX For now we just log
+ * the failure, but later we should schedule a work item to
+ * handle the resend until it succeeds. That would require
+ * keeping track of work items in the vsk and cleaning them up
+ * upon socket close.
+ */
+ while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
+ !sent_read &&
+ retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
+ err = vmci_transport_send_read(sk);
+ if (err >= 0)
+ sent_read = true;
+
+ retries++;
+ }
+
+ if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
+ pr_err("%p unable to send read notify to peer\n", sk);
+ else
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+#endif
+
+ }
+ return err;
+}
+
+static void
+vmci_transport_handle_wrote(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst, struct sockaddr_vm *src)
+{
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ struct vsock_sock *vsk = vsock_sk(sk);
+ PKT_FIELD(vsk, sent_waiting_read) = false;
+#endif
+ sk->sk_data_ready(sk);
+}
+
+static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, peer_waiting_read) = false;
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+ PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+ PKT_FIELD(vsk, sent_waiting_read) = false;
+ PKT_FIELD(vsk, sent_waiting_write) = false;
+ PKT_FIELD(vsk, produce_q_generation) = 0;
+ PKT_FIELD(vsk, consume_q_generation) = 0;
+
+ memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
+ sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
+ memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
+ sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
+}
+
+static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
+{
+}
+
+static int
+vmci_transport_notify_pkt_poll_in(struct sock *sk,
+ size_t target, bool *data_ready_now)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (vsock_stream_has_data(vsk)) {
+ *data_ready_now = true;
+ } else {
+ /* We can't read right now because there is nothing in the
+ * queue. Ask for notifications when there is something to
+ * read.
+ */
+ if (sk->sk_state == SS_CONNECTED) {
+ if (!send_waiting_read(sk, 1))
+ return -1;
+
+ }
+ *data_ready_now = false;
+ }
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_poll_out(struct sock *sk,
+ size_t target, bool *space_avail_now)
+{
+ s64 produce_q_free_space;
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ produce_q_free_space = vsock_stream_has_space(vsk);
+ if (produce_q_free_space > 0) {
+ *space_avail_now = true;
+ return 0;
+ } else if (produce_q_free_space == 0) {
+ /* This is a connected socket but we can't currently send data.
+ * Notify the peer that we are waiting if the queue is full. We
+ * only send a waiting write if the queue is full because
+ * otherwise we end up in an infinite WAITING_WRITE, READ,
+ * WAITING_WRITE, READ, etc. loop. Treat failing to send the
+ * notification as a socket error, passing that back through
+ * the mask.
+ */
+ if (!send_waiting_write(sk, 1))
+ return -1;
+
+ *space_avail_now = false;
+ }
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_recv_init(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
+ data->consume_head = 0;
+ data->produce_tail = 0;
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+ data->notify_on_block = false;
+
+ if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
+ PKT_FIELD(vsk, write_notify_min_window) = target + 1;
+ if (PKT_FIELD(vsk, write_notify_window) <
+ PKT_FIELD(vsk, write_notify_min_window)) {
+ /* If the current window is smaller than the new
+ * minimal window size, we need to reevaluate whether
+ * we need to notify the sender. If the number of ready
+ * bytes are smaller than the new window, we need to
+ * send a notification to the sender before we block.
+ */
+
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+ data->notify_on_block = true;
+ }
+ }
+#endif
+#endif
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_recv_pre_block(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ int err = 0;
+
+ /* Notify our peer that we are waiting for data to read. */
+ if (!send_waiting_read(sk, target)) {
+ err = -EHOSTUNREACH;
+ return err;
+ }
+#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
+ if (data->notify_on_block) {
+ err = vmci_transport_send_read_notification(sk);
+ if (err < 0)
+ return err;
+
+ data->notify_on_block = false;
+ }
+#endif
+
+ return err;
+}
+
+static int
+vmci_transport_notify_pkt_recv_pre_dequeue(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ /* Now consume up to len bytes from the queue. Note that since we have
+ * the socket locked we should copy at least ready bytes.
+ */
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
+ &data->produce_tail,
+ &data->consume_head);
+#endif
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_recv_post_dequeue(
+ struct sock *sk,
+ size_t target,
+ ssize_t copied,
+ bool data_read,
+ struct vmci_transport_recv_notify_data *data)
+{
+ struct vsock_sock *vsk;
+ int err;
+
+ vsk = vsock_sk(sk);
+ err = 0;
+
+ if (data_read) {
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ /* Detect a wrap-around to maintain queue generation. Note
+ * that this is safe since we hold the socket lock across the
+ * two queue pair operations.
+ */
+ if (copied >=
+ vmci_trans(vsk)->consume_size - data->consume_head)
+ PKT_FIELD(vsk, consume_q_generation)++;
+#endif
+
+ err = vmci_transport_send_read_notification(sk);
+ if (err < 0)
+ return err;
+
+ }
+ return err;
+}
+
+static int
+vmci_transport_notify_pkt_send_init(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
+ data->consume_head = 0;
+ data->produce_tail = 0;
+#endif
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_send_pre_block(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+ /* Notify our peer that we are waiting for room to write. */
+ if (!send_waiting_write(sk, 1))
+ return -EHOSTUNREACH;
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_send_pre_enqueue(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
+ &data->produce_tail,
+ &data->consume_head);
+#endif
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_send_post_enqueue(
+ struct sock *sk,
+ ssize_t written,
+ struct vmci_transport_send_notify_data *data)
+{
+ int err = 0;
+ struct vsock_sock *vsk;
+ bool sent_wrote = false;
+ int retries = 0;
+
+ vsk = vsock_sk(sk);
+
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ /* Detect a wrap-around to maintain queue generation. Note that this
+ * is safe since we hold the socket lock across the two queue pair
+ * operations.
+ */
+ if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
+ PKT_FIELD(vsk, produce_q_generation)++;
+
+#endif
+
+ if (vmci_transport_notify_waiting_read(vsk)) {
+ /* Notify the peer that we have written, retrying the send on
+ * failure up to our maximum value. See the XXX comment for the
+ * corresponding piece of code in StreamRecvmsg() for potential
+ * improvements.
+ */
+ while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
+ !sent_wrote &&
+ retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
+ err = vmci_transport_send_wrote(sk);
+ if (err >= 0)
+ sent_wrote = true;
+
+ retries++;
+ }
+
+ if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
+ pr_err("%p unable to send wrote notify to peer\n", sk);
+ return err;
+ } else {
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+ PKT_FIELD(vsk, peer_waiting_read) = false;
+#endif
+ }
+ }
+ return err;
+}
+
+static void
+vmci_transport_notify_pkt_handle_pkt(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst,
+ struct sockaddr_vm *src, bool *pkt_processed)
+{
+ bool processed = false;
+
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
+ vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
+ processed = true;
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_READ:
+ vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
+ processed = true;
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
+ vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
+ dst, src);
+ processed = true;
+ break;
+
+ case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
+ vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
+ dst, src);
+ processed = true;
+ break;
+ }
+
+ if (pkt_processed)
+ *pkt_processed = processed;
+}
+
+static void vmci_transport_notify_pkt_process_request(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
+ if (vmci_trans(vsk)->consume_size <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_min_window) =
+ vmci_trans(vsk)->consume_size;
+}
+
+static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
+ if (vmci_trans(vsk)->consume_size <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_min_window) =
+ vmci_trans(vsk)->consume_size;
+}
+
+/* Socket control packet based operations. */
+struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
+ vmci_transport_notify_pkt_socket_init,
+ vmci_transport_notify_pkt_socket_destruct,
+ vmci_transport_notify_pkt_poll_in,
+ vmci_transport_notify_pkt_poll_out,
+ vmci_transport_notify_pkt_handle_pkt,
+ vmci_transport_notify_pkt_recv_init,
+ vmci_transport_notify_pkt_recv_pre_block,
+ vmci_transport_notify_pkt_recv_pre_dequeue,
+ vmci_transport_notify_pkt_recv_post_dequeue,
+ vmci_transport_notify_pkt_send_init,
+ vmci_transport_notify_pkt_send_pre_block,
+ vmci_transport_notify_pkt_send_pre_enqueue,
+ vmci_transport_notify_pkt_send_post_enqueue,
+ vmci_transport_notify_pkt_process_request,
+ vmci_transport_notify_pkt_process_negotiate,
+};
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
new file mode 100644
index 000000000..7df793249
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -0,0 +1,83 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __VMCI_TRANSPORT_NOTIFY_H__
+#define __VMCI_TRANSPORT_NOTIFY_H__
+
+#include <linux/types.h>
+#include <linux/vmw_vmci_defs.h>
+#include <linux/vmw_vmci_api.h>
+#include <linux/vm_sockets.h>
+
+#include "vmci_transport.h"
+
+/* Comment this out to compare with old protocol. */
+#define VSOCK_OPTIMIZATION_WAITING_NOTIFY 1
+#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
+/* Comment this out to remove flow control for "new" protocol */
+#define VSOCK_OPTIMIZATION_FLOW_CONTROL 1
+#endif
+
+#define VMCI_TRANSPORT_MAX_DGRAM_RESENDS 10
+
+struct vmci_transport_recv_notify_data {
+ u64 consume_head;
+ u64 produce_tail;
+ bool notify_on_block;
+};
+
+struct vmci_transport_send_notify_data {
+ u64 consume_head;
+ u64 produce_tail;
+};
+
+/* Socket notification callbacks. */
+struct vmci_transport_notify_ops {
+ void (*socket_init) (struct sock *sk);
+ void (*socket_destruct) (struct vsock_sock *vsk);
+ int (*poll_in) (struct sock *sk, size_t target,
+ bool *data_ready_now);
+ int (*poll_out) (struct sock *sk, size_t target,
+ bool *space_avail_now);
+ void (*handle_notify_pkt) (struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half, struct sockaddr_vm *dst,
+ struct sockaddr_vm *src,
+ bool *pkt_processed);
+ int (*recv_init) (struct sock *sk, size_t target,
+ struct vmci_transport_recv_notify_data *data);
+ int (*recv_pre_block) (struct sock *sk, size_t target,
+ struct vmci_transport_recv_notify_data *data);
+ int (*recv_pre_dequeue) (struct sock *sk, size_t target,
+ struct vmci_transport_recv_notify_data *data);
+ int (*recv_post_dequeue) (struct sock *sk, size_t target,
+ ssize_t copied, bool data_read,
+ struct vmci_transport_recv_notify_data *data);
+ int (*send_init) (struct sock *sk,
+ struct vmci_transport_send_notify_data *data);
+ int (*send_pre_block) (struct sock *sk,
+ struct vmci_transport_send_notify_data *data);
+ int (*send_pre_enqueue) (struct sock *sk,
+ struct vmci_transport_send_notify_data *data);
+ int (*send_post_enqueue) (struct sock *sk, ssize_t written,
+ struct vmci_transport_send_notify_data *data);
+ void (*process_request) (struct sock *sk);
+ void (*process_negotiate) (struct sock *sk);
+};
+
+extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops;
+extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops;
+
+#endif /* __VMCI_TRANSPORT_NOTIFY_H__ */
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
new file mode 100644
index 000000000..dc9c7929a
--- /dev/null
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -0,0 +1,438 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <net/sock.h>
+
+#include "vmci_transport_notify.h"
+
+#define PKT_FIELD(vsk, field_name) \
+ (vmci_trans(vsk)->notify.pkt_q_state.field_name)
+
+static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
+{
+ bool retval;
+ u64 notify_limit;
+
+ if (!PKT_FIELD(vsk, peer_waiting_write))
+ return false;
+
+ /* When the sender blocks, we take that as a sign that the sender is
+ * faster than the receiver. To reduce the transmit rate of the sender,
+ * we delay the sending of the read notification by decreasing the
+ * write_notify_window. The notification is delayed until the number of
+ * bytes used in the queue drops below the write_notify_window.
+ */
+
+ if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
+ PKT_FIELD(vsk, peer_waiting_write_detected) = true;
+ if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+ } else {
+ PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
+ if (PKT_FIELD(vsk, write_notify_window) <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+
+ }
+ }
+ notify_limit = vmci_trans(vsk)->consume_size -
+ PKT_FIELD(vsk, write_notify_window);
+
+ /* The notify_limit is used to delay notifications in the case where
+ * flow control is enabled. Below the test is expressed in terms of
+ * free space in the queue: if free_space > ConsumeSize -
+ * write_notify_window then notify An alternate way of expressing this
+ * is to rewrite the expression to use the data ready in the receive
+ * queue: if write_notify_window > bufferReady then notify as
+ * free_space == ConsumeSize - bufferReady.
+ */
+
+ retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
+ notify_limit;
+
+ if (retval) {
+ /* Once we notify the peer, we reset the detected flag so the
+ * next wait will again cause a decrease in the window size.
+ */
+
+ PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+ }
+ return retval;
+}
+
+static void
+vmci_transport_handle_read(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst, struct sockaddr_vm *src)
+{
+ sk->sk_write_space(sk);
+}
+
+static void
+vmci_transport_handle_wrote(struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst, struct sockaddr_vm *src)
+{
+ sk->sk_data_ready(sk);
+}
+
+static void vsock_block_update_write_window(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (PKT_FIELD(vsk, write_notify_window) < vmci_trans(vsk)->consume_size)
+ PKT_FIELD(vsk, write_notify_window) =
+ min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
+ vmci_trans(vsk)->consume_size);
+}
+
+static int vmci_transport_send_read_notification(struct sock *sk)
+{
+ struct vsock_sock *vsk;
+ bool sent_read;
+ unsigned int retries;
+ int err;
+
+ vsk = vsock_sk(sk);
+ sent_read = false;
+ retries = 0;
+ err = 0;
+
+ if (vmci_transport_notify_waiting_write(vsk)) {
+ /* Notify the peer that we have read, retrying the send on
+ * failure up to our maximum value. XXX For now we just log
+ * the failure, but later we should schedule a work item to
+ * handle the resend until it succeeds. That would require
+ * keeping track of work items in the vsk and cleaning them up
+ * upon socket close.
+ */
+ while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
+ !sent_read &&
+ retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
+ err = vmci_transport_send_read(sk);
+ if (err >= 0)
+ sent_read = true;
+
+ retries++;
+ }
+
+ if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_read)
+ pr_err("%p unable to send read notification to peer\n",
+ sk);
+ else
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+
+ }
+ return err;
+}
+
+static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+ PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+}
+
+static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
+{
+ PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
+ PKT_FIELD(vsk, peer_waiting_write) = false;
+ PKT_FIELD(vsk, peer_waiting_write_detected) = false;
+}
+
+static int
+vmci_transport_notify_pkt_poll_in(struct sock *sk,
+ size_t target, bool *data_ready_now)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (vsock_stream_has_data(vsk)) {
+ *data_ready_now = true;
+ } else {
+ /* We can't read right now because there is nothing in the
+ * queue. Ask for notifications when there is something to
+ * read.
+ */
+ if (sk->sk_state == SS_CONNECTED)
+ vsock_block_update_write_window(sk);
+ *data_ready_now = false;
+ }
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_poll_out(struct sock *sk,
+ size_t target, bool *space_avail_now)
+{
+ s64 produce_q_free_space;
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ produce_q_free_space = vsock_stream_has_space(vsk);
+ if (produce_q_free_space > 0) {
+ *space_avail_now = true;
+ return 0;
+ } else if (produce_q_free_space == 0) {
+ /* This is a connected socket but we can't currently send data.
+ * Nothing else to do.
+ */
+ *space_avail_now = false;
+ }
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_recv_init(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ data->consume_head = 0;
+ data->produce_tail = 0;
+ data->notify_on_block = false;
+
+ if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
+ PKT_FIELD(vsk, write_notify_min_window) = target + 1;
+ if (PKT_FIELD(vsk, write_notify_window) <
+ PKT_FIELD(vsk, write_notify_min_window)) {
+ /* If the current window is smaller than the new
+ * minimal window size, we need to reevaluate whether
+ * we need to notify the sender. If the number of ready
+ * bytes are smaller than the new window, we need to
+ * send a notification to the sender before we block.
+ */
+
+ PKT_FIELD(vsk, write_notify_window) =
+ PKT_FIELD(vsk, write_notify_min_window);
+ data->notify_on_block = true;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_recv_pre_block(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ int err = 0;
+
+ vsock_block_update_write_window(sk);
+
+ if (data->notify_on_block) {
+ err = vmci_transport_send_read_notification(sk);
+ if (err < 0)
+ return err;
+ data->notify_on_block = false;
+ }
+
+ return err;
+}
+
+static int
+vmci_transport_notify_pkt_recv_post_dequeue(
+ struct sock *sk,
+ size_t target,
+ ssize_t copied,
+ bool data_read,
+ struct vmci_transport_recv_notify_data *data)
+{
+ struct vsock_sock *vsk;
+ int err;
+ bool was_full = false;
+ u64 free_space;
+
+ vsk = vsock_sk(sk);
+ err = 0;
+
+ if (data_read) {
+ smp_mb();
+
+ free_space =
+ vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair);
+ was_full = free_space == copied;
+
+ if (was_full)
+ PKT_FIELD(vsk, peer_waiting_write) = true;
+
+ err = vmci_transport_send_read_notification(sk);
+ if (err < 0)
+ return err;
+
+ /* See the comment in
+ * vmci_transport_notify_pkt_send_post_enqueue().
+ */
+ sk->sk_data_ready(sk);
+ }
+
+ return err;
+}
+
+static int
+vmci_transport_notify_pkt_send_init(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+ data->consume_head = 0;
+ data->produce_tail = 0;
+
+ return 0;
+}
+
+static int
+vmci_transport_notify_pkt_send_post_enqueue(
+ struct sock *sk,
+ ssize_t written,
+ struct vmci_transport_send_notify_data *data)
+{
+ int err = 0;
+ struct vsock_sock *vsk;
+ bool sent_wrote = false;
+ bool was_empty;
+ int retries = 0;
+
+ vsk = vsock_sk(sk);
+
+ smp_mb();
+
+ was_empty =
+ vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) == written;
+ if (was_empty) {
+ while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
+ !sent_wrote &&
+ retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
+ err = vmci_transport_send_wrote(sk);
+ if (err >= 0)
+ sent_wrote = true;
+
+ retries++;
+ }
+ }
+
+ if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS && !sent_wrote) {
+ pr_err("%p unable to send wrote notification to peer\n",
+ sk);
+ return err;
+ }
+
+ return err;
+}
+
+static void
+vmci_transport_notify_pkt_handle_pkt(
+ struct sock *sk,
+ struct vmci_transport_packet *pkt,
+ bool bottom_half,
+ struct sockaddr_vm *dst,
+ struct sockaddr_vm *src, bool *pkt_processed)
+{
+ bool processed = false;
+
+ switch (pkt->type) {
+ case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
+ vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
+ processed = true;
+ break;
+ case VMCI_TRANSPORT_PACKET_TYPE_READ:
+ vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
+ processed = true;
+ break;
+ }
+
+ if (pkt_processed)
+ *pkt_processed = processed;
+}
+
+static void vmci_transport_notify_pkt_process_request(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
+ if (vmci_trans(vsk)->consume_size <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_min_window) =
+ vmci_trans(vsk)->consume_size;
+}
+
+static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
+ if (vmci_trans(vsk)->consume_size <
+ PKT_FIELD(vsk, write_notify_min_window))
+ PKT_FIELD(vsk, write_notify_min_window) =
+ vmci_trans(vsk)->consume_size;
+}
+
+static int
+vmci_transport_notify_pkt_recv_pre_dequeue(
+ struct sock *sk,
+ size_t target,
+ struct vmci_transport_recv_notify_data *data)
+{
+ return 0; /* NOP for QState. */
+}
+
+static int
+vmci_transport_notify_pkt_send_pre_block(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+ return 0; /* NOP for QState. */
+}
+
+static int
+vmci_transport_notify_pkt_send_pre_enqueue(
+ struct sock *sk,
+ struct vmci_transport_send_notify_data *data)
+{
+ return 0; /* NOP for QState. */
+}
+
+/* Socket always on control packet based operations. */
+struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
+ vmci_transport_notify_pkt_socket_init,
+ vmci_transport_notify_pkt_socket_destruct,
+ vmci_transport_notify_pkt_poll_in,
+ vmci_transport_notify_pkt_poll_out,
+ vmci_transport_notify_pkt_handle_pkt,
+ vmci_transport_notify_pkt_recv_init,
+ vmci_transport_notify_pkt_recv_pre_block,
+ vmci_transport_notify_pkt_recv_pre_dequeue,
+ vmci_transport_notify_pkt_recv_post_dequeue,
+ vmci_transport_notify_pkt_send_init,
+ vmci_transport_notify_pkt_send_pre_block,
+ vmci_transport_notify_pkt_send_pre_enqueue,
+ vmci_transport_notify_pkt_send_post_enqueue,
+ vmci_transport_notify_pkt_process_request,
+ vmci_transport_notify_pkt_process_negotiate,
+};
diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c
new file mode 100644
index 000000000..82486ee55
--- /dev/null
+++ b/net/vmw_vsock/vsock_addr.c
@@ -0,0 +1,75 @@
+/*
+ * VMware vSockets Driver
+ *
+ * Copyright (C) 2007-2012 VMware, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation version 2 and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/stddef.h>
+#include <net/sock.h>
+#include <net/vsock_addr.h>
+
+void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->svm_family = AF_VSOCK;
+ addr->svm_cid = cid;
+ addr->svm_port = port;
+}
+EXPORT_SYMBOL_GPL(vsock_addr_init);
+
+int vsock_addr_validate(const struct sockaddr_vm *addr)
+{
+ if (!addr)
+ return -EFAULT;
+
+ if (addr->svm_family != AF_VSOCK)
+ return -EAFNOSUPPORT;
+
+ if (addr->svm_zero[0] != 0)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vsock_addr_validate);
+
+bool vsock_addr_bound(const struct sockaddr_vm *addr)
+{
+ return addr->svm_port != VMADDR_PORT_ANY;
+}
+EXPORT_SYMBOL_GPL(vsock_addr_bound);
+
+void vsock_addr_unbind(struct sockaddr_vm *addr)
+{
+ vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
+}
+EXPORT_SYMBOL_GPL(vsock_addr_unbind);
+
+bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
+ const struct sockaddr_vm *other)
+{
+ return addr->svm_cid == other->svm_cid &&
+ addr->svm_port == other->svm_port;
+}
+EXPORT_SYMBOL_GPL(vsock_addr_equals_addr);
+
+int vsock_addr_cast(const struct sockaddr *addr,
+ size_t len, struct sockaddr_vm **out_addr)
+{
+ if (len < sizeof(**out_addr))
+ return -EFAULT;
+
+ *out_addr = (struct sockaddr_vm *)addr;
+ return vsock_addr_validate(*out_addr);
+}
+EXPORT_SYMBOL_GPL(vsock_addr_cast);
diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig
new file mode 100644
index 000000000..e4d97ab47
--- /dev/null
+++ b/net/wimax/Kconfig
@@ -0,0 +1,39 @@
+#
+# WiMAX LAN device configuration
+#
+
+menuconfig WIMAX
+ tristate "WiMAX Wireless Broadband support"
+ depends on RFKILL || !RFKILL
+ help
+
+ Select to configure support for devices that provide
+ wireless broadband connectivity using the WiMAX protocol
+ (IEEE 802.16).
+
+ Please note that most of these devices require signing up
+ for a service plan with a provider.
+
+ The different WiMAX drivers can be enabled in the menu entry
+
+ Device Drivers > Network device support > WiMAX Wireless
+ Broadband devices
+
+ If unsure, it is safe to select M (module).
+
+config WIMAX_DEBUG_LEVEL
+ int "WiMAX debug level"
+ depends on WIMAX
+ default 8
+ help
+
+ Select the maximum debug verbosity level to be compiled into
+ the WiMAX stack code.
+
+ By default, debug messages are disabled at runtime and can
+ be selectively enabled for different parts of the code using
+ the sysfs debug-levels file.
+
+ If set at zero, this will compile out all the debug code.
+
+ It is recommended that it is left at 8.
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
new file mode 100644
index 000000000..8f1510d0c
--- /dev/null
+++ b/net/wimax/Makefile
@@ -0,0 +1,14 @@
+
+obj-$(CONFIG_WIMAX) += wimax.o
+
+wimax-y := \
+ id-table.o \
+ op-msg.o \
+ op-reset.o \
+ op-rfkill.o \
+ op-state-get.o \
+ stack.o
+
+wimax-$(CONFIG_DEBUG_FS) += debugfs.o
+
+
diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h
new file mode 100644
index 000000000..0975adba6
--- /dev/null
+++ b/net/wimax/debug-levels.h
@@ -0,0 +1,43 @@
+/*
+ * Linux WiMAX Stack
+ * Debug levels control file for the wimax module
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#ifndef __debug_levels__h__
+#define __debug_levels__h__
+
+/* Maximum compile and run time debug level for all submodules */
+#define D_MODULENAME wimax
+#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL
+
+#include <linux/wimax/debug.h>
+
+/* List of all the enabled modules */
+enum d_module {
+ D_SUBMODULE_DECLARE(debugfs),
+ D_SUBMODULE_DECLARE(id_table),
+ D_SUBMODULE_DECLARE(op_msg),
+ D_SUBMODULE_DECLARE(op_reset),
+ D_SUBMODULE_DECLARE(op_rfkill),
+ D_SUBMODULE_DECLARE(op_state_get),
+ D_SUBMODULE_DECLARE(stack),
+};
+
+#endif /* #ifndef __debug_levels__h__ */
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
new file mode 100644
index 000000000..6c9bedb74
--- /dev/null
+++ b/net/wimax/debugfs.c
@@ -0,0 +1,80 @@
+/*
+ * Linux WiMAX
+ * Debugfs support
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#include <linux/debugfs.h>
+#include <linux/wimax.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE debugfs
+#include "debug-levels.h"
+
+
+#define __debugfs_register(prefix, name, parent) \
+do { \
+ result = d_level_register_debugfs(prefix, name, parent); \
+ if (result < 0) \
+ goto error; \
+} while (0)
+
+
+int wimax_debugfs_add(struct wimax_dev *wimax_dev)
+{
+ int result;
+ struct net_device *net_dev = wimax_dev->net_dev;
+ struct device *dev = net_dev->dev.parent;
+ struct dentry *dentry;
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
+ dentry = debugfs_create_dir(buf, NULL);
+ result = PTR_ERR(dentry);
+ if (IS_ERR(dentry)) {
+ if (result == -ENODEV)
+ result = 0; /* No debugfs support */
+ else
+ dev_err(dev, "Can't create debugfs dentry: %d\n",
+ result);
+ goto out;
+ }
+ wimax_dev->debugfs_dentry = dentry;
+ __debugfs_register("wimax_dl_", debugfs, dentry);
+ __debugfs_register("wimax_dl_", id_table, dentry);
+ __debugfs_register("wimax_dl_", op_msg, dentry);
+ __debugfs_register("wimax_dl_", op_reset, dentry);
+ __debugfs_register("wimax_dl_", op_rfkill, dentry);
+ __debugfs_register("wimax_dl_", op_state_get, dentry);
+ __debugfs_register("wimax_dl_", stack, dentry);
+ result = 0;
+out:
+ return result;
+
+error:
+ debugfs_remove_recursive(wimax_dev->debugfs_dentry);
+ return result;
+}
+
+void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
+{
+ debugfs_remove_recursive(wimax_dev->debugfs_dentry);
+}
+
+
diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c
new file mode 100644
index 000000000..a21508d11
--- /dev/null
+++ b/net/wimax/id-table.c
@@ -0,0 +1,145 @@
+/*
+ * Linux WiMAX
+ * Mappping of generic netlink family IDs to net devices
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * We assign a single generic netlink family ID to each device (to
+ * simplify lookup).
+ *
+ * We need a way to map family ID to a wimax_dev pointer.
+ *
+ * The idea is to use a very simple lookup. Using a netlink attribute
+ * with (for example) the interface name implies a heavier search over
+ * all the network devices; seemed kind of a waste given that we know
+ * we are looking for a WiMAX device and that most systems will have
+ * just a single WiMAX adapter.
+ *
+ * We put all the WiMAX devices in the system in a linked list and
+ * match the generic link family ID against the list.
+ *
+ * By using a linked list, the case of a single adapter in the system
+ * becomes (almost) no overhead, while still working for many more. If
+ * it ever goes beyond two, I'll be surprised.
+ */
+#include <linux/device.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/wimax.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE id_table
+#include "debug-levels.h"
+
+
+static DEFINE_SPINLOCK(wimax_id_table_lock);
+static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table);
+
+
+/*
+ * wimax_id_table_add - add a gennetlink familiy ID / wimax_dev mapping
+ *
+ * @wimax_dev: WiMAX device descriptor to associate to the Generic
+ * Netlink family ID.
+ *
+ * Look for an empty spot in the ID table; if none found, double the
+ * table's size and get the first spot.
+ */
+void wimax_id_table_add(struct wimax_dev *wimax_dev)
+{
+ d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+ spin_lock(&wimax_id_table_lock);
+ list_add(&wimax_dev->id_table_node, &wimax_id_table);
+ spin_unlock(&wimax_id_table_lock);
+ d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+}
+
+
+/*
+ * wimax_get_netdev_by_info - lookup a wimax_dev from the gennetlink info
+ *
+ * The generic netlink family ID has been filled out in the
+ * nlmsghdr->nlmsg_type field, so we pull it from there, look it up in
+ * the mapping table and reference the wimax_dev.
+ *
+ * When done, the reference should be dropped with
+ * 'dev_put(wimax_dev->net_dev)'.
+ */
+struct wimax_dev *wimax_dev_get_by_genl_info(
+ struct genl_info *info, int ifindex)
+{
+ struct wimax_dev *wimax_dev = NULL;
+
+ d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex);
+ spin_lock(&wimax_id_table_lock);
+ list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
+ if (wimax_dev->net_dev->ifindex == ifindex) {
+ dev_hold(wimax_dev->net_dev);
+ goto found;
+ }
+ }
+ wimax_dev = NULL;
+ d_printf(1, NULL, "wimax: no devices found with ifindex %d\n",
+ ifindex);
+found:
+ spin_unlock(&wimax_id_table_lock);
+ d_fnend(3, NULL, "(info %p ifindex %d) = %p\n",
+ info, ifindex, wimax_dev);
+ return wimax_dev;
+}
+
+
+/*
+ * wimax_id_table_rm - Remove a gennetlink familiy ID / wimax_dev mapping
+ *
+ * @id: family ID to remove from the table
+ */
+void wimax_id_table_rm(struct wimax_dev *wimax_dev)
+{
+ spin_lock(&wimax_id_table_lock);
+ list_del_init(&wimax_dev->id_table_node);
+ spin_unlock(&wimax_id_table_lock);
+}
+
+
+/*
+ * Release the gennetlink family id / mapping table
+ *
+ * On debug, verify that the table is empty upon removal. We want the
+ * code always compiled, to ensure it doesn't bit rot. It will be
+ * compiled out if CONFIG_BUG is disabled.
+ */
+void wimax_id_table_release(void)
+{
+ struct wimax_dev *wimax_dev;
+
+#ifndef CONFIG_BUG
+ return;
+#endif
+ spin_lock(&wimax_id_table_lock);
+ list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
+ pr_err("BUG: %s wimax_dev %p ifindex %d not cleared\n",
+ __func__, wimax_dev, wimax_dev->net_dev->ifindex);
+ WARN_ON(1);
+ }
+ spin_unlock(&wimax_id_table_lock);
+}
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
new file mode 100644
index 000000000..54aa14693
--- /dev/null
+++ b/net/wimax/op-msg.c
@@ -0,0 +1,407 @@
+/*
+ * Linux WiMAX
+ * Generic messaging interface between userspace and driver/device
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements a direct communication channel between user space and
+ * the driver/device, by which free form messages can be sent back and
+ * forth.
+ *
+ * This is intended for device-specific features, vendor quirks, etc.
+ *
+ * See include/net/wimax.h
+ *
+ * GENERIC NETLINK ENCODING AND CAPACITY
+ *
+ * A destination "pipe name" is added to each message; it is up to the
+ * drivers to assign or use those names (if using them at all).
+ *
+ * Messages are encoded as a binary netlink attribute using nla_put()
+ * using type NLA_UNSPEC (as some versions of libnl still in
+ * deployment don't yet understand NLA_BINARY).
+ *
+ * The maximum capacity of this transport is PAGESIZE per message (so
+ * the actual payload will be bit smaller depending on the
+ * netlink/generic netlink attributes and headers).
+ *
+ * RECEPTION OF MESSAGES
+ *
+ * When a message is received from user space, it is passed verbatim
+ * to the driver calling wimax_dev->op_msg_from_user(). The return
+ * value from this function is passed back to user space as an ack
+ * over the generic netlink protocol.
+ *
+ * The stack doesn't do any processing or interpretation of these
+ * messages.
+ *
+ * SENDING MESSAGES
+ *
+ * Messages can be sent with wimax_msg().
+ *
+ * If the message delivery needs to happen on a different context to
+ * that of its creation, wimax_msg_alloc() can be used to get a
+ * pointer to the message that can be delivered later on with
+ * wimax_msg_send().
+ *
+ * ROADMAP
+ *
+ * wimax_gnl_doit_msg_from_user() Process a message from user space
+ * wimax_dev_get_by_genl_info()
+ * wimax_dev->op_msg_from_user() Delivery of message to the driver
+ *
+ * wimax_msg() Send a message to user space
+ * wimax_msg_alloc()
+ * wimax_msg_send()
+ */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE op_msg
+#include "debug-levels.h"
+
+
+/**
+ * wimax_msg_alloc - Create a new skb for sending a message to userspace
+ *
+ * @wimax_dev: WiMAX device descriptor
+ * @pipe_name: "named pipe" the message will be sent to
+ * @msg: pointer to the message data to send
+ * @size: size of the message to send (in bytes), including the header.
+ * @gfp_flags: flags for memory allocation.
+ *
+ * Returns: %0 if ok, negative errno code on error
+ *
+ * Description:
+ *
+ * Allocates an skb that will contain the message to send to user
+ * space over the messaging pipe and initializes it, copying the
+ * payload.
+ *
+ * Once this call is done, you can deliver it with
+ * wimax_msg_send().
+ *
+ * IMPORTANT:
+ *
+ * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
+ * wimax_msg_send() depends on skb->data being placed at the
+ * beginning of the user message.
+ *
+ * Unlike other WiMAX stack calls, this call can be used way early,
+ * even before wimax_dev_add() is called, as long as the
+ * wimax_dev->net_dev pointer is set to point to a proper
+ * net_dev. This is so that drivers can use it early in case they need
+ * to send stuff around or communicate with user space.
+ */
+struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev,
+ const char *pipe_name,
+ const void *msg, size_t size,
+ gfp_t gfp_flags)
+{
+ int result;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ size_t msg_size;
+ void *genl_msg;
+ struct sk_buff *skb;
+
+ msg_size = nla_total_size(size)
+ + nla_total_size(sizeof(u32))
+ + (pipe_name ? nla_total_size(strlen(pipe_name)) : 0);
+ result = -ENOMEM;
+ skb = genlmsg_new(msg_size, gfp_flags);
+ if (skb == NULL)
+ goto error_new;
+ genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family,
+ 0, WIMAX_GNL_OP_MSG_TO_USER);
+ if (genl_msg == NULL) {
+ dev_err(dev, "no memory to create generic netlink message\n");
+ goto error_genlmsg_put;
+ }
+ result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX,
+ wimax_dev->net_dev->ifindex);
+ if (result < 0) {
+ dev_err(dev, "no memory to add ifindex attribute\n");
+ goto error_nla_put;
+ }
+ if (pipe_name) {
+ result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME,
+ pipe_name);
+ if (result < 0) {
+ dev_err(dev, "no memory to add pipe_name attribute\n");
+ goto error_nla_put;
+ }
+ }
+ result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg);
+ if (result < 0) {
+ dev_err(dev, "no memory to add payload (msg %p size %zu) in "
+ "attribute: %d\n", msg, size, result);
+ goto error_nla_put;
+ }
+ genlmsg_end(skb, genl_msg);
+ return skb;
+
+error_nla_put:
+error_genlmsg_put:
+error_new:
+ nlmsg_free(skb);
+ return ERR_PTR(result);
+}
+EXPORT_SYMBOL_GPL(wimax_msg_alloc);
+
+
+/**
+ * wimax_msg_data_len - Return a pointer and size of a message's payload
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ * @size: Pointer to where to store the message's size
+ *
+ * Returns the pointer to the message data.
+ */
+const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size)
+{
+ struct nlmsghdr *nlh = (void *) msg->head;
+ struct nlattr *nla;
+
+ nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
+ WIMAX_GNL_MSG_DATA);
+ if (nla == NULL) {
+ pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+ return NULL;
+ }
+ *size = nla_len(nla);
+ return nla_data(nla);
+}
+EXPORT_SYMBOL_GPL(wimax_msg_data_len);
+
+
+/**
+ * wimax_msg_data - Return a pointer to a message's payload
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ */
+const void *wimax_msg_data(struct sk_buff *msg)
+{
+ struct nlmsghdr *nlh = (void *) msg->head;
+ struct nlattr *nla;
+
+ nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
+ WIMAX_GNL_MSG_DATA);
+ if (nla == NULL) {
+ pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+ return NULL;
+ }
+ return nla_data(nla);
+}
+EXPORT_SYMBOL_GPL(wimax_msg_data);
+
+
+/**
+ * wimax_msg_len - Return a message's payload length
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ */
+ssize_t wimax_msg_len(struct sk_buff *msg)
+{
+ struct nlmsghdr *nlh = (void *) msg->head;
+ struct nlattr *nla;
+
+ nla = nlmsg_find_attr(nlh, sizeof(struct genlmsghdr),
+ WIMAX_GNL_MSG_DATA);
+ if (nla == NULL) {
+ pr_err("Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+ return -EINVAL;
+ }
+ return nla_len(nla);
+}
+EXPORT_SYMBOL_GPL(wimax_msg_len);
+
+
+/**
+ * wimax_msg_send - Send a pre-allocated message to user space
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the
+ * ownership of @skb is transferred to this function.
+ *
+ * Returns: 0 if ok, < 0 errno code on error
+ *
+ * Description:
+ *
+ * Sends a free-form message that was preallocated with
+ * wimax_msg_alloc() and filled up.
+ *
+ * Assumes that once you pass an skb to this function for sending, it
+ * owns it and will release it when done (on success).
+ *
+ * IMPORTANT:
+ *
+ * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
+ * wimax_msg_send() depends on skb->data being placed at the
+ * beginning of the user message.
+ *
+ * Unlike other WiMAX stack calls, this call can be used way early,
+ * even before wimax_dev_add() is called, as long as the
+ * wimax_dev->net_dev pointer is set to point to a proper
+ * net_dev. This is so that drivers can use it early in case they need
+ * to send stuff around or communicate with user space.
+ */
+int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb)
+{
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ void *msg = skb->data;
+ size_t size = skb->len;
+ might_sleep();
+
+ d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size);
+ d_dump(2, dev, msg, size);
+ genlmsg_multicast(&wimax_gnl_family, skb, 0, 0, GFP_KERNEL);
+ d_printf(1, dev, "CTX: genl multicast done\n");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(wimax_msg_send);
+
+
+/**
+ * wimax_msg - Send a message to user space
+ *
+ * @wimax_dev: WiMAX device descriptor (properly referenced)
+ * @pipe_name: "named pipe" the message will be sent to
+ * @buf: pointer to the message to send.
+ * @size: size of the buffer pointed to by @buf (in bytes).
+ * @gfp_flags: flags for memory allocation.
+ *
+ * Returns: %0 if ok, negative errno code on error.
+ *
+ * Description:
+ *
+ * Sends a free-form message to user space on the device @wimax_dev.
+ *
+ * NOTES:
+ *
+ * Once the @skb is given to this function, who will own it and will
+ * release it when done (unless it returns error).
+ */
+int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name,
+ const void *buf, size_t size, gfp_t gfp_flags)
+{
+ int result = -ENOMEM;
+ struct sk_buff *skb;
+
+ skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags);
+ if (IS_ERR(skb))
+ result = PTR_ERR(skb);
+ else
+ result = wimax_msg_send(wimax_dev, skb);
+ return result;
+}
+EXPORT_SYMBOL_GPL(wimax_msg);
+
+/*
+ * Relays a message from user space to the driver
+ *
+ * The skb is passed to the driver-specific function with the netlink
+ * and generic netlink headers already stripped.
+ *
+ * This call will block while handling/relaying the message.
+ */
+int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info)
+{
+ int result, ifindex;
+ struct wimax_dev *wimax_dev;
+ struct device *dev;
+ struct nlmsghdr *nlh = info->nlhdr;
+ char *pipe_name;
+ void *msg_buf;
+ size_t msg_len;
+
+ might_sleep();
+ d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+ result = -ENODEV;
+ if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) {
+ pr_err("WIMAX_GNL_MSG_FROM_USER: can't find IFIDX attribute\n");
+ goto error_no_wimax_dev;
+ }
+ ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]);
+ wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+ if (wimax_dev == NULL)
+ goto error_no_wimax_dev;
+ dev = wimax_dev_to_dev(wimax_dev);
+
+ /* Unpack arguments */
+ result = -EINVAL;
+ if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) {
+ dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA "
+ "attribute\n");
+ goto error_no_data;
+ }
+ msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]);
+ msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]);
+
+ if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL)
+ pipe_name = NULL;
+ else {
+ struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME];
+ size_t attr_len = nla_len(attr);
+ /* libnl-1.1 does not yet support NLA_NUL_STRING */
+ result = -ENOMEM;
+ pipe_name = kstrndup(nla_data(attr), attr_len + 1, GFP_KERNEL);
+ if (pipe_name == NULL)
+ goto error_alloc;
+ pipe_name[attr_len] = 0;
+ }
+ mutex_lock(&wimax_dev->mutex);
+ result = wimax_dev_is_ready(wimax_dev);
+ if (result == -ENOMEDIUM)
+ result = 0;
+ if (result < 0)
+ goto error_not_ready;
+ result = -ENOSYS;
+ if (wimax_dev->op_msg_from_user == NULL)
+ goto error_noop;
+
+ d_printf(1, dev,
+ "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n",
+ nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags,
+ nlh->nlmsg_seq, nlh->nlmsg_pid);
+ d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len);
+ d_dump(2, dev, msg_buf, msg_len);
+
+ result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name,
+ msg_buf, msg_len, info);
+error_noop:
+error_not_ready:
+ mutex_unlock(&wimax_dev->mutex);
+error_alloc:
+ kfree(pipe_name);
+error_no_data:
+ dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+ d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+ return result;
+}
+
diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c
new file mode 100644
index 000000000..a42079165
--- /dev/null
+++ b/net/wimax/op-reset.c
@@ -0,0 +1,123 @@
+/*
+ * Linux WiMAX
+ * Implement and export a method for resetting a WiMAX device
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements a simple synchronous call to reset a WiMAX device.
+ *
+ * Resets aim at being warm, keeping the device handles active;
+ * however, when that fails, it falls back to a cold reset (that will
+ * disconnect and reconnect the device).
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include <linux/export.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_reset
+#include "debug-levels.h"
+
+
+/**
+ * wimax_reset - Reset a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Returns:
+ *
+ * %0 if ok and a warm reset was done (the device still exists in
+ * the system).
+ *
+ * -%ENODEV if a cold/bus reset had to be done (device has
+ * disconnected and reconnected, so current handle is not valid
+ * any more).
+ *
+ * -%EINVAL if the device is not even registered.
+ *
+ * Any other negative error code shall be considered as
+ * non-recoverable.
+ *
+ * Description:
+ *
+ * Called when wanting to reset the device for any reason. Device is
+ * taken back to power on status.
+ *
+ * This call blocks; on successful return, the device has completed the
+ * reset process and is ready to operate.
+ */
+int wimax_reset(struct wimax_dev *wimax_dev)
+{
+ int result = -EINVAL;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_st state;
+
+ might_sleep();
+ d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+ mutex_lock(&wimax_dev->mutex);
+ dev_hold(wimax_dev->net_dev);
+ state = wimax_dev->state;
+ mutex_unlock(&wimax_dev->mutex);
+
+ if (state >= WIMAX_ST_DOWN) {
+ mutex_lock(&wimax_dev->mutex_reset);
+ result = wimax_dev->op_reset(wimax_dev);
+ mutex_unlock(&wimax_dev->mutex_reset);
+ }
+ dev_put(wimax_dev->net_dev);
+
+ d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
+ return result;
+}
+EXPORT_SYMBOL(wimax_reset);
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the reset command from user space, return error code.
+ *
+ * No attributes.
+ */
+int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info)
+{
+ int result, ifindex;
+ struct wimax_dev *wimax_dev;
+
+ d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+ result = -ENODEV;
+ if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) {
+ pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n");
+ goto error_no_wimax_dev;
+ }
+ ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]);
+ wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+ if (wimax_dev == NULL)
+ goto error_no_wimax_dev;
+ /* Execute the operation and send the result back to user space */
+ result = wimax_reset(wimax_dev);
+ dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+ d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+ return result;
+}
diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c
new file mode 100644
index 000000000..7d730543f
--- /dev/null
+++ b/net/wimax/op-rfkill.c
@@ -0,0 +1,447 @@
+/*
+ * Linux WiMAX
+ * RF-kill framework integration
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This integrates into the Linux Kernel rfkill susbystem so that the
+ * drivers just have to do the bare minimal work, which is providing a
+ * method to set the software RF-Kill switch and to report changes in
+ * the software and hardware switch status.
+ *
+ * A non-polled generic rfkill device is embedded into the WiMAX
+ * subsystem's representation of a device.
+ *
+ * FIXME: Need polled support? Let drivers provide a poll routine
+ * and hand it to rfkill ops then?
+ *
+ * All device drivers have to do is after wimax_dev_init(), call
+ * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update
+ * initial state and then every time it changes. See wimax.h:struct
+ * wimax_dev for more information.
+ *
+ * ROADMAP
+ *
+ * wimax_gnl_doit_rfkill() User space calling wimax_rfkill()
+ * wimax_rfkill() Kernel calling wimax_rfkill()
+ * __wimax_rf_toggle_radio()
+ *
+ * wimax_rfkill_set_radio_block() RF-Kill subsystem calling
+ * __wimax_rf_toggle_radio()
+ *
+ * __wimax_rf_toggle_radio()
+ * wimax_dev->op_rfkill_sw_toggle() Driver backend
+ * __wimax_state_change()
+ *
+ * wimax_report_rfkill_sw() Driver reports state change
+ * __wimax_state_change()
+ *
+ * wimax_report_rfkill_hw() Driver reports state change
+ * __wimax_state_change()
+ *
+ * wimax_rfkill_add() Initialize/shutdown rfkill support
+ * wimax_rfkill_rm() [called by wimax_dev_add/rm()]
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include <linux/rfkill.h>
+#include <linux/export.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_rfkill
+#include "debug-levels.h"
+
+/**
+ * wimax_report_rfkill_hw - Reports changes in the hardware RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on,
+ * %WIMAX_RF_OFF radio off.
+ *
+ * When the device detects a change in the state of thehardware RF
+ * switch, it must call this function to let the WiMAX kernel stack
+ * know that the state has changed so it can be properly propagated.
+ *
+ * The WiMAX stack caches the state (the driver doesn't need to). As
+ * well, as the change is propagated it will come back as a request to
+ * change the software state to mirror the hardware state.
+ *
+ * If the device doesn't have a hardware kill switch, just report
+ * it on initialization as always on (%WIMAX_RF_ON, radio on).
+ */
+void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev,
+ enum wimax_rf_state state)
+{
+ int result;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_st wimax_state;
+
+ d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+ BUG_ON(state == WIMAX_RF_QUERY);
+ BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
+
+ mutex_lock(&wimax_dev->mutex);
+ result = wimax_dev_is_ready(wimax_dev);
+ if (result < 0)
+ goto error_not_ready;
+
+ if (state != wimax_dev->rf_hw) {
+ wimax_dev->rf_hw = state;
+ if (wimax_dev->rf_hw == WIMAX_RF_ON &&
+ wimax_dev->rf_sw == WIMAX_RF_ON)
+ wimax_state = WIMAX_ST_READY;
+ else
+ wimax_state = WIMAX_ST_RADIO_OFF;
+
+ result = rfkill_set_hw_state(wimax_dev->rfkill,
+ state == WIMAX_RF_OFF);
+
+ __wimax_state_change(wimax_dev, wimax_state);
+ }
+error_not_ready:
+ mutex_unlock(&wimax_dev->mutex);
+ d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
+ wimax_dev, state, result);
+}
+EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
+
+
+/**
+ * wimax_report_rfkill_sw - Reports changes in the software RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
+ * %WIMAX_RF_OFF radio off.
+ *
+ * Reports changes in the software RF switch state to the the WiMAX
+ * stack.
+ *
+ * The main use is during initialization, so the driver can query the
+ * device for its current software radio kill switch state and feed it
+ * to the system.
+ *
+ * On the side, the device does not change the software state by
+ * itself. In practice, this can happen, as the device might decide to
+ * switch (in software) the radio off for different reasons.
+ */
+void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev,
+ enum wimax_rf_state state)
+{
+ int result;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_st wimax_state;
+
+ d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+ BUG_ON(state == WIMAX_RF_QUERY);
+ BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
+
+ mutex_lock(&wimax_dev->mutex);
+ result = wimax_dev_is_ready(wimax_dev);
+ if (result < 0)
+ goto error_not_ready;
+
+ if (state != wimax_dev->rf_sw) {
+ wimax_dev->rf_sw = state;
+ if (wimax_dev->rf_hw == WIMAX_RF_ON &&
+ wimax_dev->rf_sw == WIMAX_RF_ON)
+ wimax_state = WIMAX_ST_READY;
+ else
+ wimax_state = WIMAX_ST_RADIO_OFF;
+ __wimax_state_change(wimax_dev, wimax_state);
+ rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
+ }
+error_not_ready:
+ mutex_unlock(&wimax_dev->mutex);
+ d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
+ wimax_dev, state, result);
+}
+EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw);
+
+
+/*
+ * Callback for the RF Kill toggle operation
+ *
+ * This function is called by:
+ *
+ * - The rfkill subsystem when the RF-Kill key is pressed in the
+ * hardware and the driver notifies through
+ * wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back
+ * here so the software RF Kill switch state is changed to reflect
+ * the hardware switch state.
+ *
+ * - When the user sets the state through sysfs' rfkill/state file
+ *
+ * - When the user calls wimax_rfkill().
+ *
+ * This call blocks!
+ *
+ * WARNING! When we call rfkill_unregister(), this will be called with
+ * state 0!
+ *
+ * WARNING: wimax_dev must be locked
+ */
+static
+int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev,
+ enum wimax_rf_state state)
+{
+ int result = 0;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_st wimax_state;
+
+ might_sleep();
+ d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+ if (wimax_dev->rf_sw == state)
+ goto out_no_change;
+ if (wimax_dev->op_rfkill_sw_toggle != NULL)
+ result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state);
+ else if (state == WIMAX_RF_OFF) /* No op? can't turn off */
+ result = -ENXIO;
+ else /* No op? can turn on */
+ result = 0; /* should never happen tho */
+ if (result >= 0) {
+ result = 0;
+ wimax_dev->rf_sw = state;
+ wimax_state = state == WIMAX_RF_ON ?
+ WIMAX_ST_READY : WIMAX_ST_RADIO_OFF;
+ __wimax_state_change(wimax_dev, wimax_state);
+ }
+out_no_change:
+ d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
+ wimax_dev, state, result);
+ return result;
+}
+
+
+/*
+ * Translate from rfkill state to wimax state
+ *
+ * NOTE: Special state handling rules here
+ *
+ * Just pretend the call didn't happen if we are in a state where
+ * we know for sure it cannot be handled (WIMAX_ST_DOWN or
+ * __WIMAX_ST_QUIESCING). rfkill() needs it to register and
+ * unregister, as it will run this path.
+ *
+ * NOTE: This call will block until the operation is completed.
+ */
+static int wimax_rfkill_set_radio_block(void *data, bool blocked)
+{
+ int result;
+ struct wimax_dev *wimax_dev = data;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_rf_state rf_state;
+
+ d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked);
+ rf_state = WIMAX_RF_ON;
+ if (blocked)
+ rf_state = WIMAX_RF_OFF;
+ mutex_lock(&wimax_dev->mutex);
+ if (wimax_dev->state <= __WIMAX_ST_QUIESCING)
+ result = 0;
+ else
+ result = __wimax_rf_toggle_radio(wimax_dev, rf_state);
+ mutex_unlock(&wimax_dev->mutex);
+ d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n",
+ wimax_dev, blocked, result);
+ return result;
+}
+
+static const struct rfkill_ops wimax_rfkill_ops = {
+ .set_block = wimax_rfkill_set_radio_block,
+};
+
+/**
+ * wimax_rfkill - Set the software RF switch state for a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New RF state.
+ *
+ * Returns:
+ *
+ * >= 0 toggle state if ok, < 0 errno code on error. The toggle state
+ * is returned as a bitmap, bit 0 being the hardware RF state, bit 1
+ * the software RF state.
+ *
+ * 0 means disabled (%WIMAX_RF_ON, radio on), 1 means enabled radio
+ * off (%WIMAX_RF_OFF).
+ *
+ * Description:
+ *
+ * Called by the user when he wants to request the WiMAX radio to be
+ * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With
+ * %WIMAX_RF_QUERY, just the current state is returned.
+ *
+ * NOTE:
+ *
+ * This call will block until the operation is complete.
+ */
+int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state)
+{
+ int result;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+
+ d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+ mutex_lock(&wimax_dev->mutex);
+ result = wimax_dev_is_ready(wimax_dev);
+ if (result < 0) {
+ /* While initializing, < 1.4.3 wimax-tools versions use
+ * this call to check if the device is a valid WiMAX
+ * device; so we allow it to proceed always,
+ * considering the radios are all off. */
+ if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY)
+ result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF;
+ goto error_not_ready;
+ }
+ switch (state) {
+ case WIMAX_RF_ON:
+ case WIMAX_RF_OFF:
+ result = __wimax_rf_toggle_radio(wimax_dev, state);
+ if (result < 0)
+ goto error;
+ rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
+ break;
+ case WIMAX_RF_QUERY:
+ break;
+ default:
+ result = -EINVAL;
+ goto error;
+ }
+ result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw;
+error:
+error_not_ready:
+ mutex_unlock(&wimax_dev->mutex);
+ d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
+ wimax_dev, state, result);
+ return result;
+}
+EXPORT_SYMBOL(wimax_rfkill);
+
+
+/*
+ * Register a new WiMAX device's RF Kill support
+ *
+ * WARNING: wimax_dev->mutex must be unlocked
+ */
+int wimax_rfkill_add(struct wimax_dev *wimax_dev)
+{
+ int result;
+ struct rfkill *rfkill;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+
+ d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+ /* Initialize RF Kill */
+ result = -ENOMEM;
+ rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX,
+ &wimax_rfkill_ops, wimax_dev);
+ if (rfkill == NULL)
+ goto error_rfkill_allocate;
+
+ d_printf(1, dev, "rfkill %p\n", rfkill);
+
+ wimax_dev->rfkill = rfkill;
+
+ rfkill_init_sw_state(rfkill, 1);
+ result = rfkill_register(wimax_dev->rfkill);
+ if (result < 0)
+ goto error_rfkill_register;
+
+ /* If there is no SW toggle op, SW RFKill is always on */
+ if (wimax_dev->op_rfkill_sw_toggle == NULL)
+ wimax_dev->rf_sw = WIMAX_RF_ON;
+
+ d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev);
+ return 0;
+
+error_rfkill_register:
+ rfkill_destroy(wimax_dev->rfkill);
+error_rfkill_allocate:
+ d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
+ return result;
+}
+
+
+/*
+ * Deregister a WiMAX device's RF Kill support
+ *
+ * Ick, we can't call rfkill_free() after rfkill_unregister()...oh
+ * well.
+ *
+ * WARNING: wimax_dev->mutex must be unlocked
+ */
+void wimax_rfkill_rm(struct wimax_dev *wimax_dev)
+{
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+ rfkill_unregister(wimax_dev->rfkill);
+ rfkill_destroy(wimax_dev->rfkill);
+ d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev);
+}
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the rfkill command from user space, return a combination
+ * value that describe the states of the different toggles.
+ *
+ * Only one attribute: the new state requested (on, off or no change,
+ * just query).
+ */
+
+int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info)
+{
+ int result, ifindex;
+ struct wimax_dev *wimax_dev;
+ struct device *dev;
+ enum wimax_rf_state new_state;
+
+ d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+ result = -ENODEV;
+ if (info->attrs[WIMAX_GNL_RFKILL_IFIDX] == NULL) {
+ pr_err("WIMAX_GNL_OP_RFKILL: can't find IFIDX attribute\n");
+ goto error_no_wimax_dev;
+ }
+ ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_IFIDX]);
+ wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+ if (wimax_dev == NULL)
+ goto error_no_wimax_dev;
+ dev = wimax_dev_to_dev(wimax_dev);
+ result = -EINVAL;
+ if (info->attrs[WIMAX_GNL_RFKILL_STATE] == NULL) {
+ dev_err(dev, "WIMAX_GNL_RFKILL: can't find RFKILL_STATE "
+ "attribute\n");
+ goto error_no_pid;
+ }
+ new_state = nla_get_u32(info->attrs[WIMAX_GNL_RFKILL_STATE]);
+
+ /* Execute the operation and send the result back to user space */
+ result = wimax_rfkill(wimax_dev, new_state);
+error_no_pid:
+ dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+ d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+ return result;
+}
diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c
new file mode 100644
index 000000000..e6788d281
--- /dev/null
+++ b/net/wimax/op-state-get.c
@@ -0,0 +1,65 @@
+/*
+ * Linux WiMAX
+ * Implement and export a method for getting a WiMAX device current state
+ *
+ * Copyright (C) 2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on previous WiMAX core work by:
+ * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_state_get
+#include "debug-levels.h"
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the state get command from user space, return a combination
+ * value that describe the current state.
+ *
+ * No attributes.
+ */
+int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info)
+{
+ int result, ifindex;
+ struct wimax_dev *wimax_dev;
+
+ d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+ result = -ENODEV;
+ if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) {
+ pr_err("WIMAX_GNL_OP_STATE_GET: can't find IFIDX attribute\n");
+ goto error_no_wimax_dev;
+ }
+ ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]);
+ wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+ if (wimax_dev == NULL)
+ goto error_no_wimax_dev;
+ /* Execute the operation and send the result back to user space */
+ result = wimax_state_get(wimax_dev);
+ dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+ d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+ return result;
+}
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
new file mode 100644
index 000000000..3f816e297
--- /dev/null
+++ b/net/wimax/stack.c
@@ -0,0 +1,632 @@
+/*
+ * Linux WiMAX
+ * Initialization, addition and removal of wimax devices
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements:
+ *
+ * - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on
+ * addition/registration initialize all subfields and allocate
+ * generic netlink resources for user space communication. On
+ * removal/unregistration, undo all that.
+ *
+ * - device state machine [wimax_state_change()] and support to send
+ * reports to user space when the state changes
+ * [wimax_gnl_re_state_change*()].
+ *
+ * See include/net/wimax.h for rationales and design.
+ *
+ * ROADMAP
+ *
+ * [__]wimax_state_change() Called by drivers to update device's state
+ * wimax_gnl_re_state_change_alloc()
+ * wimax_gnl_re_state_change_send()
+ *
+ * wimax_dev_init() Init a device
+ * wimax_dev_add() Register
+ * wimax_rfkill_add()
+ * wimax_gnl_add() Register all the generic netlink resources.
+ * wimax_id_table_add()
+ * wimax_dev_rm() Unregister
+ * wimax_id_table_rm()
+ * wimax_gnl_rm()
+ * wimax_rfkill_rm()
+ */
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/wimax.h>
+#include <linux/module.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE stack
+#include "debug-levels.h"
+
+static char wimax_debug_params[128];
+module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params),
+ 0644);
+MODULE_PARM_DESC(debug,
+ "String of space-separated NAME:VALUE pairs, where NAMEs "
+ "are the different debug submodules and VALUE are the "
+ "initial debug value to set.");
+
+/*
+ * Authoritative source for the RE_STATE_CHANGE attribute policy
+ *
+ * We don't really use it here, but /me likes to keep the definition
+ * close to where the data is generated.
+ */
+/*
+static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = {
+ [WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 },
+ [WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 },
+};
+*/
+
+
+/*
+ * Allocate a Report State Change message
+ *
+ * @header: save it, you need it for _send()
+ *
+ * Creates and fills a basic state change message; different code
+ * paths can then add more attributes to the message as needed.
+ *
+ * Use wimax_gnl_re_state_change_send() to send the returned skb.
+ *
+ * Returns: skb with the genl message if ok, IS_ERR() ptr on error
+ * with an errno code.
+ */
+static
+struct sk_buff *wimax_gnl_re_state_change_alloc(
+ struct wimax_dev *wimax_dev,
+ enum wimax_st new_state, enum wimax_st old_state,
+ void **header)
+{
+ int result;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ void *data;
+ struct sk_buff *report_skb;
+
+ d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n",
+ wimax_dev, new_state, old_state);
+ result = -ENOMEM;
+ report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (report_skb == NULL) {
+ dev_err(dev, "RE_STCH: can't create message\n");
+ goto error_new;
+ }
+ /* FIXME: sending a group ID as the seq is wrong */
+ data = genlmsg_put(report_skb, 0, wimax_gnl_family.mcgrp_offset,
+ &wimax_gnl_family, 0, WIMAX_GNL_RE_STATE_CHANGE);
+ if (data == NULL) {
+ dev_err(dev, "RE_STCH: can't put data into message\n");
+ goto error_put;
+ }
+ *header = data;
+
+ result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state);
+ if (result < 0) {
+ dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result);
+ goto error_put;
+ }
+ result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state);
+ if (result < 0) {
+ dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result);
+ goto error_put;
+ }
+ result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX,
+ wimax_dev->net_dev->ifindex);
+ if (result < 0) {
+ dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n");
+ goto error_put;
+ }
+ d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n",
+ wimax_dev, new_state, old_state, report_skb);
+ return report_skb;
+
+error_put:
+ nlmsg_free(report_skb);
+error_new:
+ d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n",
+ wimax_dev, new_state, old_state, result);
+ return ERR_PTR(result);
+}
+
+
+/*
+ * Send a Report State Change message (as created with _alloc).
+ *
+ * @report_skb: as returned by wimax_gnl_re_state_change_alloc()
+ * @header: as returned by wimax_gnl_re_state_change_alloc()
+ *
+ * Returns: 0 if ok, < 0 errno code on error.
+ *
+ * If the message is NULL, pretend it didn't happen.
+ */
+static
+int wimax_gnl_re_state_change_send(
+ struct wimax_dev *wimax_dev, struct sk_buff *report_skb,
+ void *header)
+{
+ int result = 0;
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n",
+ wimax_dev, report_skb);
+ if (report_skb == NULL) {
+ result = -ENOMEM;
+ goto out;
+ }
+ genlmsg_end(report_skb, header);
+ genlmsg_multicast(&wimax_gnl_family, report_skb, 0, 0, GFP_KERNEL);
+out:
+ d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n",
+ wimax_dev, report_skb, result);
+ return result;
+}
+
+
+static
+void __check_new_state(enum wimax_st old_state, enum wimax_st new_state,
+ unsigned int allowed_states_bm)
+{
+ if (WARN_ON(((1 << new_state) & allowed_states_bm) == 0)) {
+ pr_err("SW BUG! Forbidden state change %u -> %u\n",
+ old_state, new_state);
+ }
+}
+
+
+/*
+ * Set the current state of a WiMAX device [unlocking version of
+ * wimax_state_change().
+ */
+void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
+{
+ struct device *dev = wimax_dev_to_dev(wimax_dev);
+ enum wimax_st old_state = wimax_dev->state;
+ struct sk_buff *stch_skb;
+ void *header;
+
+ d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n",
+ wimax_dev, new_state, old_state);
+
+ if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) {
+ dev_err(dev, "SW BUG: requesting invalid state %u\n",
+ new_state);
+ goto out;
+ }
+ if (old_state == new_state)
+ goto out;
+ header = NULL; /* gcc complains? can't grok why */
+ stch_skb = wimax_gnl_re_state_change_alloc(
+ wimax_dev, new_state, old_state, &header);
+
+ /* Verify the state transition and do exit-from-state actions */
+ switch (old_state) {
+ case __WIMAX_ST_NULL:
+ __check_new_state(old_state, new_state,
+ 1 << WIMAX_ST_DOWN);
+ break;
+ case WIMAX_ST_DOWN:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_UNINITIALIZED
+ | 1 << WIMAX_ST_RADIO_OFF);
+ break;
+ case __WIMAX_ST_QUIESCING:
+ __check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN);
+ break;
+ case WIMAX_ST_UNINITIALIZED:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_RADIO_OFF);
+ break;
+ case WIMAX_ST_RADIO_OFF:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_READY);
+ break;
+ case WIMAX_ST_READY:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_RADIO_OFF
+ | 1 << WIMAX_ST_SCANNING
+ | 1 << WIMAX_ST_CONNECTING
+ | 1 << WIMAX_ST_CONNECTED);
+ break;
+ case WIMAX_ST_SCANNING:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_RADIO_OFF
+ | 1 << WIMAX_ST_READY
+ | 1 << WIMAX_ST_CONNECTING
+ | 1 << WIMAX_ST_CONNECTED);
+ break;
+ case WIMAX_ST_CONNECTING:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_RADIO_OFF
+ | 1 << WIMAX_ST_READY
+ | 1 << WIMAX_ST_SCANNING
+ | 1 << WIMAX_ST_CONNECTED);
+ break;
+ case WIMAX_ST_CONNECTED:
+ __check_new_state(old_state, new_state,
+ 1 << __WIMAX_ST_QUIESCING
+ | 1 << WIMAX_ST_RADIO_OFF
+ | 1 << WIMAX_ST_READY);
+ netif_tx_disable(wimax_dev->net_dev);
+ netif_carrier_off(wimax_dev->net_dev);
+ break;
+ case __WIMAX_ST_INVALID:
+ default:
+ dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n",
+ wimax_dev, wimax_dev->state);
+ WARN_ON(1);
+ goto out;
+ }
+
+ /* Execute the actions of entry to the new state */
+ switch (new_state) {
+ case __WIMAX_ST_NULL:
+ dev_err(dev, "SW BUG: wimax_dev %p entering NULL state "
+ "from %u\n", wimax_dev, wimax_dev->state);
+ WARN_ON(1); /* Nobody can enter this state */
+ break;
+ case WIMAX_ST_DOWN:
+ break;
+ case __WIMAX_ST_QUIESCING:
+ break;
+ case WIMAX_ST_UNINITIALIZED:
+ break;
+ case WIMAX_ST_RADIO_OFF:
+ break;
+ case WIMAX_ST_READY:
+ break;
+ case WIMAX_ST_SCANNING:
+ break;
+ case WIMAX_ST_CONNECTING:
+ break;
+ case WIMAX_ST_CONNECTED:
+ netif_carrier_on(wimax_dev->net_dev);
+ netif_wake_queue(wimax_dev->net_dev);
+ break;
+ case __WIMAX_ST_INVALID:
+ default:
+ BUG();
+ }
+ __wimax_state_set(wimax_dev, new_state);
+ if (!IS_ERR(stch_skb))
+ wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header);
+out:
+ d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n",
+ wimax_dev, new_state, old_state);
+}
+
+
+/**
+ * wimax_state_change - Set the current state of a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor (properly referenced)
+ * @new_state: New state to switch to
+ *
+ * This implements the state changes for the wimax devices. It will
+ *
+ * - verify that the state transition is legal (for now it'll just
+ * print a warning if not) according to the table in
+ * linux/wimax.h's documentation for 'enum wimax_st'.
+ *
+ * - perform the actions needed for leaving the current state and
+ * whichever are needed for entering the new state.
+ *
+ * - issue a report to user space indicating the new state (and an
+ * optional payload with information about the new state).
+ *
+ * NOTE: @wimax_dev must be locked
+ */
+void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
+{
+ /*
+ * A driver cannot take the wimax_dev out of the
+ * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If
+ * the wimax_dev's state is still NULL, we ignore any request
+ * to change its state because it means it hasn't been yet
+ * registered.
+ *
+ * There is no need to complain about it, as routines that
+ * call this might be shared from different code paths that
+ * are called before or after wimax_dev_add() has done its
+ * job.
+ */
+ mutex_lock(&wimax_dev->mutex);
+ if (wimax_dev->state > __WIMAX_ST_NULL)
+ __wimax_state_change(wimax_dev, new_state);
+ mutex_unlock(&wimax_dev->mutex);
+}
+EXPORT_SYMBOL_GPL(wimax_state_change);
+
+
+/**
+ * wimax_state_get() - Return the current state of a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Returns: Current state of the device according to its driver.
+ */
+enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev)
+{
+ enum wimax_st state;
+ mutex_lock(&wimax_dev->mutex);
+ state = wimax_dev->state;
+ mutex_unlock(&wimax_dev->mutex);
+ return state;
+}
+EXPORT_SYMBOL_GPL(wimax_state_get);
+
+
+/**
+ * wimax_dev_init - initialize a newly allocated instance
+ *
+ * @wimax_dev: WiMAX device descriptor to initialize.
+ *
+ * Initializes fields of a freshly allocated @wimax_dev instance. This
+ * function assumes that after allocation, the memory occupied by
+ * @wimax_dev was zeroed.
+ */
+void wimax_dev_init(struct wimax_dev *wimax_dev)
+{
+ INIT_LIST_HEAD(&wimax_dev->id_table_node);
+ __wimax_state_set(wimax_dev, __WIMAX_ST_NULL);
+ mutex_init(&wimax_dev->mutex);
+ mutex_init(&wimax_dev->mutex_reset);
+}
+EXPORT_SYMBOL_GPL(wimax_dev_init);
+
+static const struct nla_policy wimax_gnl_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+ [WIMAX_GNL_RESET_IFIDX] = { .type = NLA_U32, },
+ [WIMAX_GNL_RFKILL_IFIDX] = { .type = NLA_U32, },
+ [WIMAX_GNL_RFKILL_STATE] = {
+ .type = NLA_U32 /* enum wimax_rf_state */
+ },
+ [WIMAX_GNL_STGET_IFIDX] = { .type = NLA_U32, },
+ [WIMAX_GNL_MSG_IFIDX] = { .type = NLA_U32, },
+ [WIMAX_GNL_MSG_DATA] = {
+ .type = NLA_UNSPEC, /* libnl doesn't grok BINARY yet */
+ },
+};
+
+static const struct genl_ops wimax_gnl_ops[] = {
+ {
+ .cmd = WIMAX_GNL_OP_MSG_FROM_USER,
+ .flags = GENL_ADMIN_PERM,
+ .policy = wimax_gnl_policy,
+ .doit = wimax_gnl_doit_msg_from_user,
+ },
+ {
+ .cmd = WIMAX_GNL_OP_RESET,
+ .flags = GENL_ADMIN_PERM,
+ .policy = wimax_gnl_policy,
+ .doit = wimax_gnl_doit_reset,
+ },
+ {
+ .cmd = WIMAX_GNL_OP_RFKILL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = wimax_gnl_policy,
+ .doit = wimax_gnl_doit_rfkill,
+ },
+ {
+ .cmd = WIMAX_GNL_OP_STATE_GET,
+ .flags = GENL_ADMIN_PERM,
+ .policy = wimax_gnl_policy,
+ .doit = wimax_gnl_doit_state_get,
+ },
+};
+
+
+static
+size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size,
+ unsigned char *addr, size_t addr_len)
+{
+ unsigned int cnt, total;
+
+ for (total = cnt = 0; cnt < addr_len; cnt++)
+ total += scnprintf(addr_str + total, addr_str_size - total,
+ "%02x%c", addr[cnt],
+ cnt == addr_len - 1 ? '\0' : ':');
+ return total;
+}
+
+
+/**
+ * wimax_dev_add - Register a new WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's
+ * priv data). You must have called wimax_dev_init() on it before.
+ *
+ * @net_dev: net device the @wimax_dev is associated with. The
+ * function expects SET_NETDEV_DEV() and register_netdev() were
+ * already called on it.
+ *
+ * Registers the new WiMAX device, sets up the user-kernel control
+ * interface (generic netlink) and common WiMAX infrastructure.
+ *
+ * Note that the parts that will allow interaction with user space are
+ * setup at the very end, when the rest is in place, as once that
+ * happens, the driver might get user space control requests via
+ * netlink or from debugfs that might translate into calls into
+ * wimax_dev->op_*().
+ */
+int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
+{
+ int result;
+ struct device *dev = net_dev->dev.parent;
+ char addr_str[32];
+
+ d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
+
+ /* Do the RFKILL setup before locking, as RFKILL will call
+ * into our functions. */
+ wimax_dev->net_dev = net_dev;
+ result = wimax_rfkill_add(wimax_dev);
+ if (result < 0)
+ goto error_rfkill_add;
+
+ /* Set up user-space interaction */
+ mutex_lock(&wimax_dev->mutex);
+ wimax_id_table_add(wimax_dev);
+ result = wimax_debugfs_add(wimax_dev);
+ if (result < 0) {
+ dev_err(dev, "cannot initialize debugfs: %d\n",
+ result);
+ goto error_debugfs_add;
+ }
+
+ __wimax_state_set(wimax_dev, WIMAX_ST_DOWN);
+ mutex_unlock(&wimax_dev->mutex);
+
+ wimax_addr_scnprint(addr_str, sizeof(addr_str),
+ net_dev->dev_addr, net_dev->addr_len);
+ dev_err(dev, "WiMAX interface %s (%s) ready\n",
+ net_dev->name, addr_str);
+ d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev);
+ return 0;
+
+error_debugfs_add:
+ wimax_id_table_rm(wimax_dev);
+ mutex_unlock(&wimax_dev->mutex);
+ wimax_rfkill_rm(wimax_dev);
+error_rfkill_add:
+ d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n",
+ wimax_dev, net_dev, result);
+ return result;
+}
+EXPORT_SYMBOL_GPL(wimax_dev_add);
+
+
+/**
+ * wimax_dev_rm - Unregister an existing WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Unregisters a WiMAX device previously registered for use with
+ * wimax_add_rm().
+ *
+ * IMPORTANT! Must call before calling unregister_netdev().
+ *
+ * After this function returns, you will not get any more user space
+ * control requests (via netlink or debugfs) and thus to wimax_dev->ops.
+ *
+ * Reentrancy control is ensured by setting the state to
+ * %__WIMAX_ST_QUIESCING. rfkill operations coming through
+ * wimax_*rfkill*() will be stopped by the quiescing state; ops coming
+ * from the rfkill subsystem will be stopped by the support being
+ * removed by wimax_rfkill_rm().
+ */
+void wimax_dev_rm(struct wimax_dev *wimax_dev)
+{
+ d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+
+ mutex_lock(&wimax_dev->mutex);
+ __wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
+ wimax_debugfs_rm(wimax_dev);
+ wimax_id_table_rm(wimax_dev);
+ __wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
+ mutex_unlock(&wimax_dev->mutex);
+ wimax_rfkill_rm(wimax_dev);
+ d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev);
+}
+EXPORT_SYMBOL_GPL(wimax_dev_rm);
+
+
+/* Debug framework control of debug levels */
+struct d_level D_LEVEL[] = {
+ D_SUBMODULE_DEFINE(debugfs),
+ D_SUBMODULE_DEFINE(id_table),
+ D_SUBMODULE_DEFINE(op_msg),
+ D_SUBMODULE_DEFINE(op_reset),
+ D_SUBMODULE_DEFINE(op_rfkill),
+ D_SUBMODULE_DEFINE(op_state_get),
+ D_SUBMODULE_DEFINE(stack),
+};
+size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
+
+
+struct genl_family wimax_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = "WiMAX",
+ .version = WIMAX_GNL_VERSION,
+ .hdrsize = 0,
+ .maxattr = WIMAX_GNL_ATTR_MAX,
+};
+
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+ { .name = "msg", },
+};
+
+
+
+/* Shutdown the wimax stack */
+static
+int __init wimax_subsys_init(void)
+{
+ int result;
+
+ d_fnstart(4, NULL, "()\n");
+ d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
+ "wimax.debug");
+
+ snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
+ "WiMAX");
+ result = genl_register_family_with_ops_groups(&wimax_gnl_family,
+ wimax_gnl_ops,
+ wimax_gnl_mcgrps);
+ if (unlikely(result < 0)) {
+ pr_err("cannot register generic netlink family: %d\n", result);
+ goto error_register_family;
+ }
+
+ d_fnend(4, NULL, "() = 0\n");
+ return 0;
+
+error_register_family:
+ d_fnend(4, NULL, "() = %d\n", result);
+ return result;
+
+}
+module_init(wimax_subsys_init);
+
+
+/* Shutdown the wimax stack */
+static
+void __exit wimax_subsys_exit(void)
+{
+ wimax_id_table_release();
+ genl_unregister_family(&wimax_gnl_family);
+}
+module_exit(wimax_subsys_exit);
+
+MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
+MODULE_DESCRIPTION("Linux WiMAX stack");
+MODULE_LICENSE("GPL");
+
diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h
new file mode 100644
index 000000000..733c4bf8d
--- /dev/null
+++ b/net/wimax/wimax-internal.h
@@ -0,0 +1,103 @@
+/*
+ * Linux WiMAX
+ * Internal API for kernel space WiMAX stack
+ *
+ *
+ * Copyright (C) 2007 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This header file is for declarations and definitions internal to
+ * the WiMAX stack. For public APIs and documentation, see
+ * include/net/wimax.h and include/linux/wimax.h.
+ */
+
+#ifndef __WIMAX_INTERNAL_H__
+#define __WIMAX_INTERNAL_H__
+#ifdef __KERNEL__
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <net/wimax.h>
+
+
+/*
+ * Decide if a (locked) device is ready for use
+ *
+ * Before using the device structure, it must be locked
+ * (wimax_dev->mutex). As well, most operations need to call this
+ * function to check if the state is the right one.
+ *
+ * An error value will be returned if the state is not the right
+ * one. In that case, the caller should not attempt to use the device
+ * and just unlock it.
+ */
+static inline __must_check
+int wimax_dev_is_ready(struct wimax_dev *wimax_dev)
+{
+ if (wimax_dev->state == __WIMAX_ST_NULL)
+ return -EINVAL; /* Device is not even registered! */
+ if (wimax_dev->state == WIMAX_ST_DOWN)
+ return -ENOMEDIUM;
+ if (wimax_dev->state == __WIMAX_ST_QUIESCING)
+ return -ESHUTDOWN;
+ return 0;
+}
+
+
+static inline
+void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state)
+{
+ wimax_dev->state = state;
+}
+void __wimax_state_change(struct wimax_dev *, enum wimax_st);
+
+#ifdef CONFIG_DEBUG_FS
+int wimax_debugfs_add(struct wimax_dev *);
+void wimax_debugfs_rm(struct wimax_dev *);
+#else
+static inline int wimax_debugfs_add(struct wimax_dev *wimax_dev)
+{
+ return 0;
+}
+static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {}
+#endif
+
+void wimax_id_table_add(struct wimax_dev *);
+struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int);
+void wimax_id_table_rm(struct wimax_dev *);
+void wimax_id_table_release(void);
+
+int wimax_rfkill_add(struct wimax_dev *);
+void wimax_rfkill_rm(struct wimax_dev *);
+
+/* generic netlink */
+extern struct genl_family wimax_gnl_family;
+
+/* ops */
+int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info);
+int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info);
+int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info);
+int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info);
+
+#endif /* #ifdef __KERNEL__ */
+#endif /* #ifndef __WIMAX_INTERNAL_H__ */
diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore
new file mode 100644
index 000000000..c33451b89
--- /dev/null
+++ b/net/wireless/.gitignore
@@ -0,0 +1 @@
+regdb.c
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
new file mode 100644
index 000000000..4f5543dd2
--- /dev/null
+++ b/net/wireless/Kconfig
@@ -0,0 +1,219 @@
+config WIRELESS_EXT
+ bool
+
+config WEXT_CORE
+ def_bool y
+ depends on CFG80211_WEXT || WIRELESS_EXT
+
+config WEXT_PROC
+ def_bool y
+ depends on PROC_FS
+ depends on WEXT_CORE
+
+config WEXT_SPY
+ bool
+
+config WEXT_PRIV
+ bool
+
+config CFG80211
+ tristate "cfg80211 - wireless configuration API"
+ depends on RFKILL || !RFKILL
+ ---help---
+ cfg80211 is the Linux wireless LAN (802.11) configuration API.
+ Enable this if you have a wireless device.
+
+ For more information refer to documentation on the wireless wiki:
+
+ http://wireless.kernel.org/en/developers/Documentation/cfg80211
+
+ When built as a module it will be called cfg80211.
+
+config NL80211_TESTMODE
+ bool "nl80211 testmode command"
+ depends on CFG80211
+ help
+ The nl80211 testmode command helps implementing things like
+ factory calibration or validation tools for wireless chips.
+
+ Select this option ONLY for kernels that are specifically
+ built for such purposes.
+
+ Debugging tools that are supposed to end up in the hands of
+ users should better be implemented with debugfs.
+
+ Say N.
+
+config CFG80211_DEVELOPER_WARNINGS
+ bool "enable developer warnings"
+ depends on CFG80211
+ default n
+ help
+ This option enables some additional warnings that help
+ cfg80211 developers and driver developers, but that can
+ trigger due to races with userspace.
+
+ For example, when a driver reports that it was disconnected
+ from the AP, but the user disconnects manually at the same
+ time, the warning might trigger spuriously due to races.
+
+ Say Y only if you are developing cfg80211 or a driver based
+ on it (or mac80211).
+
+
+config CFG80211_REG_DEBUG
+ bool "cfg80211 regulatory debugging"
+ depends on CFG80211
+ default n
+ ---help---
+ You can enable this if you want to debug regulatory changes.
+ For more information on cfg80211 regulatory refer to the wireless
+ wiki:
+
+ http://wireless.kernel.org/en/developers/Regulatory
+
+ If unsure, say N.
+
+config CFG80211_CERTIFICATION_ONUS
+ bool "cfg80211 certification onus"
+ depends on CFG80211 && EXPERT
+ default n
+ ---help---
+ You should disable this option unless you are both capable
+ and willing to ensure your system will remain regulatory
+ compliant with the features available under this option.
+ Some options may still be under heavy development and
+ for whatever reason regulatory compliance has not or
+ cannot yet be verified. Regulatory verification may at
+ times only be possible until you have the final system
+ in place.
+
+ This option should only be enabled by system integrators
+ or distributions that have done work necessary to ensure
+ regulatory certification on the system with the enabled
+ features. Alternatively you can enable this option if
+ you are a wireless researcher and are working in a controlled
+ and approved environment by your local regulatory agency.
+
+config CFG80211_REG_CELLULAR_HINTS
+ bool "cfg80211 regulatory support for cellular base station hints"
+ depends on CFG80211_CERTIFICATION_ONUS
+ ---help---
+ This option enables support for parsing regulatory hints
+ from cellular base stations. If enabled and at least one driver
+ claims support for parsing cellular base station hints the
+ regulatory core will allow and parse these regulatory hints.
+ The regulatory core will only apply these regulatory hints on
+ drivers that support this feature. You should only enable this
+ feature if you have tested and validated this feature on your
+ systems.
+
+config CFG80211_REG_RELAX_NO_IR
+ bool "cfg80211 support for NO_IR relaxation"
+ depends on CFG80211_CERTIFICATION_ONUS
+ ---help---
+ This option enables support for relaxation of the NO_IR flag for
+ situations that certain regulatory bodies have provided clarifications
+ on how relaxation can occur. This feature has an inherent dependency on
+ userspace features which must have been properly tested and as such is
+ not enabled by default.
+
+ A relaxation feature example is allowing the operation of a P2P group
+ owner (GO) on channels marked with NO_IR if there is an additional BSS
+ interface which associated to an AP which userspace assumes or confirms
+ to be an authorized master, i.e., with radar detection support and DFS
+ capabilities. However, note that in order to not create daisy chain
+ scenarios, this relaxation is not allowed in cases that the BSS client
+ is associated to P2P GO and in addition the P2P GO instantiated on
+ a channel due to this relaxation should not allow connection from
+ non P2P clients.
+
+ The regulatory core will apply these relaxations only for drivers that
+ support this feature by declaring the appropriate channel flags and
+ capabilities in their registration flow.
+
+config CFG80211_DEFAULT_PS
+ bool "enable powersave by default"
+ depends on CFG80211
+ default y
+ help
+ This option enables powersave mode by default.
+
+ If this causes your applications to misbehave you should fix your
+ applications instead -- they need to register their network
+ latency requirement, see Documentation/power/pm_qos_interface.txt.
+
+config CFG80211_DEBUGFS
+ bool "cfg80211 DebugFS entries"
+ depends on CFG80211
+ depends on DEBUG_FS
+ ---help---
+ You can enable this if you want to debugfs entries for cfg80211.
+
+ If unsure, say N.
+
+config CFG80211_INTERNAL_REGDB
+ bool "use statically compiled regulatory rules database" if EXPERT
+ default n
+ depends on CFG80211
+ ---help---
+ This option generates an internal data structure representing
+ the wireless regulatory rules described in net/wireless/db.txt
+ and includes code to query that database. This is an alternative
+ to using CRDA for defining regulatory rules for the kernel.
+
+ Using this option requires some parsing of the db.txt at build time,
+ the parser will be upkept with the latest wireless-regdb updates but
+ older wireless-regdb formats will be ignored. The parser may later
+ be replaced to avoid issues with conflicts on versions of
+ wireless-regdb.
+
+ For details see:
+
+ http://wireless.kernel.org/en/developers/Regulatory
+
+ Most distributions have a CRDA package. So if unsure, say N.
+
+config CFG80211_WEXT
+ bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT
+ depends on CFG80211
+ select WEXT_CORE
+ default y if CFG80211_WEXT_EXPORT
+ help
+ Enable this option if you need old userspace for wireless
+ extensions with cfg80211-based drivers.
+
+config CFG80211_WEXT_EXPORT
+ bool
+ depends on CFG80211
+ help
+ Drivers should select this option if they require cfg80211's
+ wext compatibility symbols to be exported.
+
+config LIB80211
+ tristate
+ default n
+ help
+ This options enables a library of common routines used
+ by IEEE802.11 wireless LAN drivers.
+
+ Drivers should select this themselves if needed.
+
+config LIB80211_CRYPT_WEP
+ tristate
+
+config LIB80211_CRYPT_CCMP
+ tristate
+
+config LIB80211_CRYPT_TKIP
+ tristate
+
+config LIB80211_DEBUG
+ bool "lib80211 debugging messages"
+ depends on LIB80211
+ default n
+ ---help---
+ You can enable this if you want verbose debugging messages
+ from lib80211.
+
+ If unsure, say N.
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
new file mode 100644
index 000000000..4c9e39f04
--- /dev/null
+++ b/net/wireless/Makefile
@@ -0,0 +1,25 @@
+obj-$(CONFIG_CFG80211) += cfg80211.o
+obj-$(CONFIG_LIB80211) += lib80211.o
+obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o
+obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o
+obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o
+
+obj-$(CONFIG_WEXT_CORE) += wext-core.o
+obj-$(CONFIG_WEXT_PROC) += wext-proc.o
+obj-$(CONFIG_WEXT_SPY) += wext-spy.o
+obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
+
+cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
+cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o
+cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
+cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
+cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
+
+CFLAGS_trace.o := -I$(src)
+
+ccflags-y += -D__CHECK_ENDIAN__
+
+$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk
+ @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@
+
+clean-files := regdb.c
diff --git a/net/wireless/ap.c b/net/wireless/ap.c
new file mode 100644
index 000000000..bdad1f951
--- /dev/null
+++ b/net/wireless/ap.c
@@ -0,0 +1,51 @@
+#include <linux/ieee80211.h>
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+#include "rdev-ops.h"
+
+
+int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool notify)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!rdev->ops->stop_ap)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ if (!wdev->beacon_interval)
+ return -ENOENT;
+
+ err = rdev_stop_ap(rdev, dev);
+ if (!err) {
+ wdev->beacon_interval = 0;
+ memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+ wdev->ssid_len = 0;
+ rdev_set_qos_map(rdev, dev, NULL);
+ if (notify)
+ nl80211_send_ap_stopped(wdev);
+ }
+
+ return err;
+}
+
+int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool notify)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_stop_ap(rdev, dev, notify);
+ wdev_unlock(wdev);
+
+ return err;
+}
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
new file mode 100644
index 000000000..7aaf7415d
--- /dev/null
+++ b/net/wireless/chan.c
@@ -0,0 +1,916 @@
+/*
+ * This file contains helper code to handle channel
+ * settings and keeping track of what is possible at
+ * any point in time.
+ *
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ */
+
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "rdev-ops.h"
+
+void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
+ struct ieee80211_channel *chan,
+ enum nl80211_channel_type chan_type)
+{
+ if (WARN_ON(!chan))
+ return;
+
+ chandef->chan = chan;
+ chandef->center_freq2 = 0;
+
+ switch (chan_type) {
+ case NL80211_CHAN_NO_HT:
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ chandef->center_freq1 = chan->center_freq;
+ break;
+ case NL80211_CHAN_HT20:
+ chandef->width = NL80211_CHAN_WIDTH_20;
+ chandef->center_freq1 = chan->center_freq;
+ break;
+ case NL80211_CHAN_HT40PLUS:
+ chandef->width = NL80211_CHAN_WIDTH_40;
+ chandef->center_freq1 = chan->center_freq + 10;
+ break;
+ case NL80211_CHAN_HT40MINUS:
+ chandef->width = NL80211_CHAN_WIDTH_40;
+ chandef->center_freq1 = chan->center_freq - 10;
+ break;
+ default:
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL(cfg80211_chandef_create);
+
+bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
+{
+ u32 control_freq;
+
+ if (!chandef->chan)
+ return false;
+
+ control_freq = chandef->chan->center_freq;
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ if (chandef->center_freq1 != control_freq)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ if (chandef->center_freq1 != control_freq + 10 &&
+ chandef->center_freq1 != control_freq - 10)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ if (chandef->center_freq1 != control_freq + 30 &&
+ chandef->center_freq1 != control_freq + 10 &&
+ chandef->center_freq1 != control_freq - 10 &&
+ chandef->center_freq1 != control_freq - 30)
+ return false;
+ if (!chandef->center_freq2)
+ return false;
+ /* adjacent is not allowed -- that's a 160 MHz channel */
+ if (chandef->center_freq1 - chandef->center_freq2 == 80 ||
+ chandef->center_freq2 - chandef->center_freq1 == 80)
+ return false;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ if (chandef->center_freq1 != control_freq + 30 &&
+ chandef->center_freq1 != control_freq + 10 &&
+ chandef->center_freq1 != control_freq - 10 &&
+ chandef->center_freq1 != control_freq - 30)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ if (chandef->center_freq1 != control_freq + 70 &&
+ chandef->center_freq1 != control_freq + 50 &&
+ chandef->center_freq1 != control_freq + 30 &&
+ chandef->center_freq1 != control_freq + 10 &&
+ chandef->center_freq1 != control_freq - 10 &&
+ chandef->center_freq1 != control_freq - 30 &&
+ chandef->center_freq1 != control_freq - 50 &&
+ chandef->center_freq1 != control_freq - 70)
+ return false;
+ if (chandef->center_freq2)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(cfg80211_chandef_valid);
+
+static void chandef_primary_freqs(const struct cfg80211_chan_def *c,
+ u32 *pri40, u32 *pri80)
+{
+ int tmp;
+
+ switch (c->width) {
+ case NL80211_CHAN_WIDTH_40:
+ *pri40 = c->center_freq1;
+ *pri80 = 0;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
+ *pri80 = c->center_freq1;
+ /* n_P20 */
+ tmp = (30 + c->chan->center_freq - c->center_freq1)/20;
+ /* n_P40 */
+ tmp /= 2;
+ /* freq_P40 */
+ *pri40 = c->center_freq1 - 20 + 40 * tmp;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ /* n_P20 */
+ tmp = (70 + c->chan->center_freq - c->center_freq1)/20;
+ /* n_P40 */
+ tmp /= 2;
+ /* freq_P40 */
+ *pri40 = c->center_freq1 - 60 + 40 * tmp;
+ /* n_P80 */
+ tmp /= 2;
+ *pri80 = c->center_freq1 - 40 + 80 * tmp;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+
+static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
+{
+ int width;
+
+ switch (c->width) {
+ case NL80211_CHAN_WIDTH_5:
+ width = 5;
+ break;
+ case NL80211_CHAN_WIDTH_10:
+ width = 10;
+ break;
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ width = 20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ width = 40;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_80:
+ width = 80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ width = 160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ return width;
+}
+
+const struct cfg80211_chan_def *
+cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
+ const struct cfg80211_chan_def *c2)
+{
+ u32 c1_pri40, c1_pri80, c2_pri40, c2_pri80;
+
+ /* If they are identical, return */
+ if (cfg80211_chandef_identical(c1, c2))
+ return c1;
+
+ /* otherwise, must have same control channel */
+ if (c1->chan != c2->chan)
+ return NULL;
+
+ /*
+ * If they have the same width, but aren't identical,
+ * then they can't be compatible.
+ */
+ if (c1->width == c2->width)
+ return NULL;
+
+ /*
+ * can't be compatible if one of them is 5 or 10 MHz,
+ * but they don't have the same width.
+ */
+ if (c1->width == NL80211_CHAN_WIDTH_5 ||
+ c1->width == NL80211_CHAN_WIDTH_10 ||
+ c2->width == NL80211_CHAN_WIDTH_5 ||
+ c2->width == NL80211_CHAN_WIDTH_10)
+ return NULL;
+
+ if (c1->width == NL80211_CHAN_WIDTH_20_NOHT ||
+ c1->width == NL80211_CHAN_WIDTH_20)
+ return c2;
+
+ if (c2->width == NL80211_CHAN_WIDTH_20_NOHT ||
+ c2->width == NL80211_CHAN_WIDTH_20)
+ return c1;
+
+ chandef_primary_freqs(c1, &c1_pri40, &c1_pri80);
+ chandef_primary_freqs(c2, &c2_pri40, &c2_pri80);
+
+ if (c1_pri40 != c2_pri40)
+ return NULL;
+
+ WARN_ON(!c1_pri80 && !c2_pri80);
+ if (c1_pri80 && c2_pri80 && c1_pri80 != c2_pri80)
+ return NULL;
+
+ if (c1->width > c2->width)
+ return c1;
+ return c2;
+}
+EXPORT_SYMBOL(cfg80211_chandef_compatible);
+
+static void cfg80211_set_chans_dfs_state(struct wiphy *wiphy, u32 center_freq,
+ u32 bandwidth,
+ enum nl80211_dfs_state dfs_state)
+{
+ struct ieee80211_channel *c;
+ u32 freq;
+
+ for (freq = center_freq - bandwidth/2 + 10;
+ freq <= center_freq + bandwidth/2 - 10;
+ freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c || !(c->flags & IEEE80211_CHAN_RADAR))
+ continue;
+
+ c->dfs_state = dfs_state;
+ c->dfs_state_entered = jiffies;
+ }
+}
+
+void cfg80211_set_dfs_state(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_dfs_state dfs_state)
+{
+ int width;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return;
+
+ cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq1,
+ width, dfs_state);
+
+ if (!chandef->center_freq2)
+ return;
+ cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq2,
+ width, dfs_state);
+}
+
+static u32 cfg80211_get_start_freq(u32 center_freq,
+ u32 bandwidth)
+{
+ u32 start_freq;
+
+ if (bandwidth <= 20)
+ start_freq = center_freq;
+ else
+ start_freq = center_freq - bandwidth/2 + 10;
+
+ return start_freq;
+}
+
+static u32 cfg80211_get_end_freq(u32 center_freq,
+ u32 bandwidth)
+{
+ u32 end_freq;
+
+ if (bandwidth <= 20)
+ end_freq = center_freq;
+ else
+ end_freq = center_freq + bandwidth/2 - 10;
+
+ return end_freq;
+}
+
+static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
+ u32 center_freq,
+ u32 bandwidth)
+{
+ struct ieee80211_channel *c;
+ u32 freq, start_freq, end_freq;
+
+ start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
+ end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c)
+ return -EINVAL;
+
+ if (c->flags & IEEE80211_CHAN_RADAR)
+ return 1;
+ }
+ return 0;
+}
+
+
+int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype)
+{
+ int width;
+ int ret;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return -EINVAL;
+
+ switch (iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_MESH_POINT:
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return -EINVAL;
+
+ ret = cfg80211_get_chans_dfs_required(wiphy,
+ chandef->center_freq1,
+ width);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return BIT(chandef->width);
+
+ if (!chandef->center_freq2)
+ return 0;
+
+ ret = cfg80211_get_chans_dfs_required(wiphy,
+ chandef->center_freq2,
+ width);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ return BIT(chandef->width);
+
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ WARN_ON(1);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_chandef_dfs_required);
+
+static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
+ u32 center_freq,
+ u32 bandwidth)
+{
+ struct ieee80211_channel *c;
+ u32 freq, start_freq, end_freq;
+ int count = 0;
+
+ start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
+ end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+
+ /*
+ * Check entire range of channels for the bandwidth.
+ * Check all channels are DFS channels (DFS_USABLE or
+ * DFS_AVAILABLE). Return number of usable channels
+ * (require CAC). Allow DFS and non-DFS channel mix.
+ */
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c)
+ return -EINVAL;
+
+ if (c->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
+
+ if (c->flags & IEEE80211_CHAN_RADAR) {
+ if (c->dfs_state == NL80211_DFS_UNAVAILABLE)
+ return -EINVAL;
+
+ if (c->dfs_state == NL80211_DFS_USABLE)
+ count++;
+ }
+ }
+
+ return count;
+}
+
+bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
+{
+ int width;
+ int r1, r2 = 0;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return false;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return false;
+
+ r1 = cfg80211_get_chans_dfs_usable(wiphy, chandef->center_freq1,
+ width);
+
+ if (r1 < 0)
+ return false;
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_80P80:
+ WARN_ON(!chandef->center_freq2);
+ r2 = cfg80211_get_chans_dfs_usable(wiphy,
+ chandef->center_freq2,
+ width);
+ if (r2 < 0)
+ return false;
+ break;
+ default:
+ WARN_ON(chandef->center_freq2);
+ break;
+ }
+
+ return (r1 + r2 > 0);
+}
+
+
+static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
+ u32 center_freq,
+ u32 bandwidth)
+{
+ struct ieee80211_channel *c;
+ u32 freq, start_freq, end_freq;
+
+ start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
+ end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+
+ /*
+ * Check entire range of channels for the bandwidth.
+ * If any channel in between is disabled or has not
+ * had gone through CAC return false
+ */
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c)
+ return false;
+
+ if (c->flags & IEEE80211_CHAN_DISABLED)
+ return false;
+
+ if ((c->flags & IEEE80211_CHAN_RADAR) &&
+ (c->dfs_state != NL80211_DFS_AVAILABLE))
+ return false;
+ }
+
+ return true;
+}
+
+static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
+{
+ int width;
+ int r;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return false;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return false;
+
+ r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq1,
+ width);
+
+ /* If any of channels unavailable for cf1 just return */
+ if (!r)
+ return r;
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_80P80:
+ WARN_ON(!chandef->center_freq2);
+ r = cfg80211_get_chans_dfs_available(wiphy,
+ chandef->center_freq2,
+ width);
+ default:
+ WARN_ON(chandef->center_freq2);
+ break;
+ }
+
+ return r;
+}
+
+static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy,
+ u32 center_freq,
+ u32 bandwidth)
+{
+ struct ieee80211_channel *c;
+ u32 start_freq, end_freq, freq;
+ unsigned int dfs_cac_ms = 0;
+
+ start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
+ end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c)
+ return 0;
+
+ if (c->flags & IEEE80211_CHAN_DISABLED)
+ return 0;
+
+ if (!(c->flags & IEEE80211_CHAN_RADAR))
+ continue;
+
+ if (c->dfs_cac_ms > dfs_cac_ms)
+ dfs_cac_ms = c->dfs_cac_ms;
+ }
+
+ return dfs_cac_ms;
+}
+
+unsigned int
+cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
+{
+ int width;
+ unsigned int t1 = 0, t2 = 0;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return 0;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return 0;
+
+ t1 = cfg80211_get_chans_dfs_cac_time(wiphy,
+ chandef->center_freq1,
+ width);
+
+ if (!chandef->center_freq2)
+ return t1;
+
+ t2 = cfg80211_get_chans_dfs_cac_time(wiphy,
+ chandef->center_freq2,
+ width);
+
+ return max(t1, t2);
+}
+
+static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
+ u32 center_freq, u32 bandwidth,
+ u32 prohibited_flags)
+{
+ struct ieee80211_channel *c;
+ u32 freq, start_freq, end_freq;
+
+ start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
+ end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+
+ for (freq = start_freq; freq <= end_freq; freq += 20) {
+ c = ieee80211_get_channel(wiphy, freq);
+ if (!c || c->flags & prohibited_flags)
+ return false;
+ }
+
+ return true;
+}
+
+bool cfg80211_chandef_usable(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ u32 prohibited_flags)
+{
+ struct ieee80211_sta_ht_cap *ht_cap;
+ struct ieee80211_sta_vht_cap *vht_cap;
+ u32 width, control_freq, cap;
+
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return false;
+
+ ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap;
+ vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap;
+
+ control_freq = chandef->chan->center_freq;
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_5:
+ width = 5;
+ break;
+ case NL80211_CHAN_WIDTH_10:
+ prohibited_flags |= IEEE80211_CHAN_NO_10MHZ;
+ width = 10;
+ break;
+ case NL80211_CHAN_WIDTH_20:
+ if (!ht_cap->ht_supported)
+ return false;
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ prohibited_flags |= IEEE80211_CHAN_NO_20MHZ;
+ width = 20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ width = 40;
+ if (!ht_cap->ht_supported)
+ return false;
+ if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ||
+ ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT)
+ return false;
+ if (chandef->center_freq1 < control_freq &&
+ chandef->chan->flags & IEEE80211_CHAN_NO_HT40MINUS)
+ return false;
+ if (chandef->center_freq1 > control_freq &&
+ chandef->chan->flags & IEEE80211_CHAN_NO_HT40PLUS)
+ return false;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+ if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+ return false;
+ case NL80211_CHAN_WIDTH_80:
+ if (!vht_cap->vht_supported)
+ return false;
+ prohibited_flags |= IEEE80211_CHAN_NO_80MHZ;
+ width = 80;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ if (!vht_cap->vht_supported)
+ return false;
+ cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+ if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
+ cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+ return false;
+ prohibited_flags |= IEEE80211_CHAN_NO_160MHZ;
+ width = 160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * TODO: What if there are only certain 80/160/80+80 MHz channels
+ * allowed by the driver, or only certain combinations?
+ * For 40 MHz the driver can set the NO_HT40 flags, but for
+ * 80/160 MHz and in particular 80+80 MHz this isn't really
+ * feasible and we only have NO_80MHZ/NO_160MHZ so far but
+ * no way to cover 80+80 MHz or more complex restrictions.
+ * Note that such restrictions also need to be advertised to
+ * userspace, for example for P2P channel selection.
+ */
+
+ if (width > 20)
+ prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
+
+ /* 5 and 10 MHz are only defined for the OFDM PHY */
+ if (width < 20)
+ prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
+
+
+ if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1,
+ width, prohibited_flags))
+ return false;
+
+ if (!chandef->center_freq2)
+ return true;
+ return cfg80211_secondary_chans_ok(wiphy, chandef->center_freq2,
+ width, prohibited_flags);
+}
+EXPORT_SYMBOL(cfg80211_chandef_usable);
+
+/*
+ * For GO only, check if the channel can be used under permissive conditions
+ * mandated by the some regulatory bodies, i.e., the channel is marked with
+ * IEEE80211_CHAN_GO_CONCURRENT and there is an additional station interface
+ * associated to an AP on the same channel or on the same UNII band
+ * (assuming that the AP is an authorized master).
+ * In addition allow the GO to operate on a channel on which indoor operation is
+ * allowed, iff we are currently operating in an indoor environment.
+ */
+static bool cfg80211_go_permissive_chan(struct cfg80211_registered_device *rdev,
+ struct ieee80211_channel *chan)
+{
+ struct wireless_dev *wdev_iter;
+ struct wiphy *wiphy = wiphy_idx_to_wiphy(rdev->wiphy_idx);
+
+ ASSERT_RTNL();
+
+ if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
+ !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
+ return false;
+
+ if (regulatory_indoor_allowed() &&
+ (chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
+ return true;
+
+ if (!(chan->flags & IEEE80211_CHAN_GO_CONCURRENT))
+ return false;
+
+ /*
+ * Generally, it is possible to rely on another device/driver to allow
+ * the GO concurrent relaxation, however, since the device can further
+ * enforce the relaxation (by doing a similar verifications as this),
+ * and thus fail the GO instantiation, consider only the interfaces of
+ * the current registered device.
+ */
+ list_for_each_entry(wdev_iter, &rdev->wdev_list, list) {
+ struct ieee80211_channel *other_chan = NULL;
+ int r1, r2;
+
+ if (wdev_iter->iftype != NL80211_IFTYPE_STATION ||
+ !netif_running(wdev_iter->netdev))
+ continue;
+
+ wdev_lock(wdev_iter);
+ if (wdev_iter->current_bss)
+ other_chan = wdev_iter->current_bss->pub.channel;
+ wdev_unlock(wdev_iter);
+
+ if (!other_chan)
+ continue;
+
+ if (chan == other_chan)
+ return true;
+
+ if (chan->band != IEEE80211_BAND_5GHZ)
+ continue;
+
+ r1 = cfg80211_get_unii(chan->center_freq);
+ r2 = cfg80211_get_unii(other_chan->center_freq);
+
+ if (r1 != -EINVAL && r1 == r2) {
+ /*
+ * At some locations channels 149-165 are considered a
+ * bundle, but at other locations, e.g., Indonesia,
+ * channels 149-161 are considered a bundle while
+ * channel 165 is left out and considered to be in a
+ * different bundle. Thus, in case that there is a
+ * station interface connected to an AP on channel 165,
+ * it is assumed that channels 149-161 are allowed for
+ * GO operations. However, having a station interface
+ * connected to an AP on channels 149-161, does not
+ * allow GO operation on channel 165.
+ */
+ if (chan->center_freq == 5825 &&
+ other_chan->center_freq != 5825)
+ continue;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ bool res;
+ u32 prohibited_flags = IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_RADAR;
+
+ trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype);
+
+ /*
+ * Under certain conditions suggested by the some regulatory bodies
+ * a GO can operate on channels marked with IEEE80211_NO_IR
+ * so set this flag only if such relaxations are not enabled and
+ * the conditions are not met.
+ */
+ if (iftype != NL80211_IFTYPE_P2P_GO ||
+ !cfg80211_go_permissive_chan(rdev, chandef->chan))
+ prohibited_flags |= IEEE80211_CHAN_NO_IR;
+
+ if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 &&
+ cfg80211_chandef_dfs_available(wiphy, chandef)) {
+ /* We can skip IEEE80211_CHAN_NO_IR if chandef dfs available */
+ prohibited_flags = IEEE80211_CHAN_DISABLED;
+ }
+
+ res = cfg80211_chandef_usable(wiphy, chandef, prohibited_flags);
+
+ trace_cfg80211_return_bool(res);
+ return res;
+}
+EXPORT_SYMBOL(cfg80211_reg_can_beacon);
+
+int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
+ struct cfg80211_chan_def *chandef)
+{
+ if (!rdev->ops->set_monitor_channel)
+ return -EOPNOTSUPP;
+ if (!cfg80211_has_monitors_only(rdev))
+ return -EBUSY;
+
+ return rdev_set_monitor_channel(rdev, chandef);
+}
+
+void
+cfg80211_get_chan_state(struct wireless_dev *wdev,
+ struct ieee80211_channel **chan,
+ enum cfg80211_chan_mode *chanmode,
+ u8 *radar_detect)
+{
+ int ret;
+
+ *chan = NULL;
+ *chanmode = CHAN_MODE_UNDEFINED;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (wdev->netdev && !netif_running(wdev->netdev))
+ return;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ if (wdev->current_bss) {
+ *chan = wdev->current_bss->pub.channel;
+ *chanmode = (wdev->ibss_fixed &&
+ !wdev->ibss_dfs_possible)
+ ? CHAN_MODE_SHARED
+ : CHAN_MODE_EXCLUSIVE;
+
+ /* consider worst-case - IBSS can try to return to the
+ * original user-specified channel as creator */
+ if (wdev->ibss_dfs_possible)
+ *radar_detect |= BIT(wdev->chandef.width);
+ return;
+ }
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (wdev->current_bss) {
+ *chan = wdev->current_bss->pub.channel;
+ *chanmode = CHAN_MODE_SHARED;
+ return;
+ }
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ if (wdev->cac_started) {
+ *chan = wdev->chandef.chan;
+ *chanmode = CHAN_MODE_SHARED;
+ *radar_detect |= BIT(wdev->chandef.width);
+ } else if (wdev->beacon_interval) {
+ *chan = wdev->chandef.chan;
+ *chanmode = CHAN_MODE_SHARED;
+
+ ret = cfg80211_chandef_dfs_required(wdev->wiphy,
+ &wdev->chandef,
+ wdev->iftype);
+ WARN_ON(ret < 0);
+ if (ret > 0)
+ *radar_detect |= BIT(wdev->chandef.width);
+ }
+ return;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (wdev->mesh_id_len) {
+ *chan = wdev->chandef.chan;
+ *chanmode = CHAN_MODE_SHARED;
+
+ ret = cfg80211_chandef_dfs_required(wdev->wiphy,
+ &wdev->chandef,
+ wdev->iftype);
+ WARN_ON(ret < 0);
+ if (ret > 0)
+ *radar_detect |= BIT(wdev->chandef.width);
+ }
+ return;
+ case NL80211_IFTYPE_OCB:
+ if (wdev->chandef.chan) {
+ *chan = wdev->chandef.chan;
+ *chanmode = CHAN_MODE_SHARED;
+ return;
+ }
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /* these interface types don't really have a channel */
+ return;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ WARN_ON(1);
+ }
+}
diff --git a/net/wireless/core.c b/net/wireless/core.c
new file mode 100644
index 000000000..2a0bbd228
--- /dev/null
+++ b/net/wireless/core.c
@@ -0,0 +1,1224 @@
+/*
+ * This is the linux wireless configuration interface.
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/nl80211.h>
+#include <linux/debugfs.h>
+#include <linux/notifier.h>
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+#include "sysfs.h"
+#include "debugfs.h"
+#include "wext-compat.h"
+#include "rdev-ops.h"
+
+/* name for sysfs, %d is appended */
+#define PHY_NAME "phy"
+
+MODULE_AUTHOR("Johannes Berg");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("wireless configuration support");
+MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME);
+
+/* RCU-protected (and RTNL for writers) */
+LIST_HEAD(cfg80211_rdev_list);
+int cfg80211_rdev_list_generation;
+
+/* for debugfs */
+static struct dentry *ieee80211_debugfs_dir;
+
+/* for the cleanup, scan and event works */
+struct workqueue_struct *cfg80211_wq;
+
+static bool cfg80211_disable_40mhz_24ghz;
+module_param(cfg80211_disable_40mhz_24ghz, bool, 0644);
+MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz,
+ "Disable 40MHz support in the 2.4GHz band");
+
+struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx)
+{
+ struct cfg80211_registered_device *result = NULL, *rdev;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ if (rdev->wiphy_idx == wiphy_idx) {
+ result = rdev;
+ break;
+ }
+ }
+
+ return result;
+}
+
+int get_wiphy_idx(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ return rdev->wiphy_idx;
+}
+
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
+{
+ struct cfg80211_registered_device *rdev;
+
+ ASSERT_RTNL();
+
+ rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx);
+ if (!rdev)
+ return NULL;
+ return &rdev->wiphy;
+}
+
+static int cfg80211_dev_check_name(struct cfg80211_registered_device *rdev,
+ const char *newname)
+{
+ struct cfg80211_registered_device *rdev2;
+ int wiphy_idx, taken = -1, digits;
+
+ ASSERT_RTNL();
+
+ /* prohibit calling the thing phy%d when %d is not its number */
+ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
+ if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
+ /* count number of places needed to print wiphy_idx */
+ digits = 1;
+ while (wiphy_idx /= 10)
+ digits++;
+ /*
+ * deny the name if it is phy<idx> where <idx> is printed
+ * without leading zeroes. taken == strlen(newname) here
+ */
+ if (taken == strlen(PHY_NAME) + digits)
+ return -EINVAL;
+ }
+
+ /* Ensure another device does not already have this name. */
+ list_for_each_entry(rdev2, &cfg80211_rdev_list, list)
+ if (strcmp(newname, wiphy_name(&rdev2->wiphy)) == 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
+ char *newname)
+{
+ int result;
+
+ ASSERT_RTNL();
+
+ /* Ignore nop renames */
+ if (strcmp(newname, wiphy_name(&rdev->wiphy)) == 0)
+ return 0;
+
+ result = cfg80211_dev_check_name(rdev, newname);
+ if (result < 0)
+ return result;
+
+ result = device_rename(&rdev->wiphy.dev, newname);
+ if (result)
+ return result;
+
+ if (rdev->wiphy.debugfsdir &&
+ !debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
+ rdev->wiphy.debugfsdir,
+ rdev->wiphy.debugfsdir->d_parent,
+ newname))
+ pr_err("failed to rename debugfs dir to %s!\n", newname);
+
+ nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
+
+ return 0;
+}
+
+int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
+ struct net *net)
+{
+ struct wireless_dev *wdev;
+ int err = 0;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (!wdev->netdev)
+ continue;
+ wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
+ if (err)
+ break;
+ wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ }
+
+ if (err) {
+ /* failed -- clean up to old netns */
+ net = wiphy_net(&rdev->wiphy);
+
+ list_for_each_entry_continue_reverse(wdev, &rdev->wdev_list,
+ list) {
+ if (!wdev->netdev)
+ continue;
+ wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ err = dev_change_net_namespace(wdev->netdev, net,
+ "wlan%d");
+ WARN_ON(err);
+ wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ }
+
+ return err;
+ }
+
+ wiphy_net_set(&rdev->wiphy, net);
+
+ err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
+ WARN_ON(err);
+
+ return 0;
+}
+
+static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
+{
+ struct cfg80211_registered_device *rdev = data;
+
+ rdev_rfkill_poll(rdev);
+}
+
+void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ ASSERT_RTNL();
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE))
+ return;
+
+ if (!wdev->p2p_started)
+ return;
+
+ rdev_stop_p2p_device(rdev, wdev);
+ wdev->p2p_started = false;
+
+ rdev->opencount--;
+
+ if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
+ if (WARN_ON(!rdev->scan_req->notified))
+ rdev->scan_req->aborted = true;
+ ___cfg80211_scan_done(rdev, false);
+ }
+}
+
+void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct wireless_dev *wdev;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (wdev->netdev) {
+ dev_close(wdev->netdev);
+ continue;
+ }
+ /* otherwise, check iftype */
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ cfg80211_stop_p2p_device(rdev, wdev);
+ break;
+ default:
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces);
+
+static int cfg80211_rfkill_set_block(void *data, bool blocked)
+{
+ struct cfg80211_registered_device *rdev = data;
+
+ if (!blocked)
+ return 0;
+
+ rtnl_lock();
+ cfg80211_shutdown_all_interfaces(&rdev->wiphy);
+ rtnl_unlock();
+
+ return 0;
+}
+
+static void cfg80211_rfkill_sync_work(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(work, struct cfg80211_registered_device, rfkill_sync);
+ cfg80211_rfkill_set_block(rdev, rfkill_blocked(rdev->rfkill));
+}
+
+static void cfg80211_event_work(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(work, struct cfg80211_registered_device,
+ event_work);
+
+ rtnl_lock();
+ cfg80211_process_rdev_events(rdev);
+ rtnl_unlock();
+}
+
+void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_iface_destroy *item;
+
+ ASSERT_RTNL();
+
+ spin_lock_irq(&rdev->destroy_list_lock);
+ while ((item = list_first_entry_or_null(&rdev->destroy_list,
+ struct cfg80211_iface_destroy,
+ list))) {
+ struct wireless_dev *wdev, *tmp;
+ u32 nlportid = item->nlportid;
+
+ list_del(&item->list);
+ kfree(item);
+ spin_unlock_irq(&rdev->destroy_list_lock);
+
+ list_for_each_entry_safe(wdev, tmp, &rdev->wdev_list, list) {
+ if (nlportid == wdev->owner_nlportid)
+ rdev_del_virtual_intf(rdev, wdev);
+ }
+
+ spin_lock_irq(&rdev->destroy_list_lock);
+ }
+ spin_unlock_irq(&rdev->destroy_list_lock);
+}
+
+static void cfg80211_destroy_iface_wk(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(work, struct cfg80211_registered_device,
+ destroy_work);
+
+ rtnl_lock();
+ cfg80211_destroy_ifaces(rdev);
+ rtnl_unlock();
+}
+
+static void cfg80211_sched_scan_stop_wk(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(work, struct cfg80211_registered_device,
+ sched_scan_stop_wk);
+
+ rtnl_lock();
+
+ __cfg80211_stop_sched_scan(rdev, false);
+
+ rtnl_unlock();
+}
+
+/* exported functions */
+
+struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
+ const char *requested_name)
+{
+ static atomic_t wiphy_counter = ATOMIC_INIT(0);
+
+ struct cfg80211_registered_device *rdev;
+ int alloc_size;
+
+ WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key));
+ WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc));
+ WARN_ON(ops->connect && !ops->disconnect);
+ WARN_ON(ops->join_ibss && !ops->leave_ibss);
+ WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf);
+ WARN_ON(ops->add_station && !ops->del_station);
+ WARN_ON(ops->add_mpath && !ops->del_mpath);
+ WARN_ON(ops->join_mesh && !ops->leave_mesh);
+
+ alloc_size = sizeof(*rdev) + sizeof_priv;
+
+ rdev = kzalloc(alloc_size, GFP_KERNEL);
+ if (!rdev)
+ return NULL;
+
+ rdev->ops = ops;
+
+ rdev->wiphy_idx = atomic_inc_return(&wiphy_counter);
+
+ if (unlikely(rdev->wiphy_idx < 0)) {
+ /* ugh, wrapped! */
+ atomic_dec(&wiphy_counter);
+ kfree(rdev);
+ return NULL;
+ }
+
+ /* atomic_inc_return makes it start at 1, make it start at 0 */
+ rdev->wiphy_idx--;
+
+ /* give it a proper name */
+ if (requested_name && requested_name[0]) {
+ int rv;
+
+ rtnl_lock();
+ rv = cfg80211_dev_check_name(rdev, requested_name);
+
+ if (rv < 0) {
+ rtnl_unlock();
+ goto use_default_name;
+ }
+
+ rv = dev_set_name(&rdev->wiphy.dev, "%s", requested_name);
+ rtnl_unlock();
+ if (rv)
+ goto use_default_name;
+ } else {
+use_default_name:
+ /* NOTE: This is *probably* safe w/out holding rtnl because of
+ * the restrictions on phy names. Probably this call could
+ * fail if some other part of the kernel (re)named a device
+ * phyX. But, might should add some locking and check return
+ * value, and use a different name if this one exists?
+ */
+ dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx);
+ }
+
+ INIT_LIST_HEAD(&rdev->wdev_list);
+ INIT_LIST_HEAD(&rdev->beacon_registrations);
+ spin_lock_init(&rdev->beacon_registrations_lock);
+ spin_lock_init(&rdev->bss_lock);
+ INIT_LIST_HEAD(&rdev->bss_list);
+ INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done);
+ INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results);
+ INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk,
+ cfg80211_dfs_channels_update_work);
+#ifdef CONFIG_CFG80211_WEXT
+ rdev->wiphy.wext = &cfg80211_wext_handler;
+#endif
+
+ device_initialize(&rdev->wiphy.dev);
+ rdev->wiphy.dev.class = &ieee80211_class;
+ rdev->wiphy.dev.platform_data = rdev;
+
+ INIT_LIST_HEAD(&rdev->destroy_list);
+ spin_lock_init(&rdev->destroy_list_lock);
+ INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk);
+ INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk);
+
+#ifdef CONFIG_CFG80211_DEFAULT_PS
+ rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
+#endif
+
+ wiphy_net_set(&rdev->wiphy, &init_net);
+
+ rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block;
+ rdev->rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev),
+ &rdev->wiphy.dev, RFKILL_TYPE_WLAN,
+ &rdev->rfkill_ops, rdev);
+
+ if (!rdev->rfkill) {
+ kfree(rdev);
+ return NULL;
+ }
+
+ INIT_WORK(&rdev->rfkill_sync, cfg80211_rfkill_sync_work);
+ INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
+ INIT_WORK(&rdev->event_work, cfg80211_event_work);
+
+ init_waitqueue_head(&rdev->dev_wait);
+
+ /*
+ * Initialize wiphy parameters to IEEE 802.11 MIB default values.
+ * Fragmentation and RTS threshold are disabled by default with the
+ * special -1 value.
+ */
+ rdev->wiphy.retry_short = 7;
+ rdev->wiphy.retry_long = 4;
+ rdev->wiphy.frag_threshold = (u32) -1;
+ rdev->wiphy.rts_threshold = (u32) -1;
+ rdev->wiphy.coverage_class = 0;
+
+ rdev->wiphy.max_num_csa_counters = 1;
+
+ return &rdev->wiphy;
+}
+EXPORT_SYMBOL(wiphy_new_nm);
+
+static int wiphy_verify_combinations(struct wiphy *wiphy)
+{
+ const struct ieee80211_iface_combination *c;
+ int i, j;
+
+ for (i = 0; i < wiphy->n_iface_combinations; i++) {
+ u32 cnt = 0;
+ u16 all_iftypes = 0;
+
+ c = &wiphy->iface_combinations[i];
+
+ /*
+ * Combinations with just one interface aren't real,
+ * however we make an exception for DFS.
+ */
+ if (WARN_ON((c->max_interfaces < 2) && !c->radar_detect_widths))
+ return -EINVAL;
+
+ /* Need at least one channel */
+ if (WARN_ON(!c->num_different_channels))
+ return -EINVAL;
+
+ /*
+ * Put a sane limit on maximum number of different
+ * channels to simplify channel accounting code.
+ */
+ if (WARN_ON(c->num_different_channels >
+ CFG80211_MAX_NUM_DIFFERENT_CHANNELS))
+ return -EINVAL;
+
+ /* DFS only works on one channel. */
+ if (WARN_ON(c->radar_detect_widths &&
+ (c->num_different_channels > 1)))
+ return -EINVAL;
+
+ if (WARN_ON(!c->n_limits))
+ return -EINVAL;
+
+ for (j = 0; j < c->n_limits; j++) {
+ u16 types = c->limits[j].types;
+
+ /* interface types shouldn't overlap */
+ if (WARN_ON(types & all_iftypes))
+ return -EINVAL;
+ all_iftypes |= types;
+
+ if (WARN_ON(!c->limits[j].max))
+ return -EINVAL;
+
+ /* Shouldn't list software iftypes in combinations! */
+ if (WARN_ON(wiphy->software_iftypes & types))
+ return -EINVAL;
+
+ /* Only a single P2P_DEVICE can be allowed */
+ if (WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) &&
+ c->limits[j].max > 1))
+ return -EINVAL;
+
+ cnt += c->limits[j].max;
+ /*
+ * Don't advertise an unsupported type
+ * in a combination.
+ */
+ if (WARN_ON((wiphy->interface_modes & types) != types))
+ return -EINVAL;
+ }
+
+ /* You can't even choose that many! */
+ if (WARN_ON(cnt < c->max_interfaces))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int wiphy_register(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ int res;
+ enum ieee80211_band band;
+ struct ieee80211_supported_band *sband;
+ bool have_band = false;
+ int i;
+ u16 ifmodes = wiphy->interface_modes;
+
+#ifdef CONFIG_PM
+ if (WARN_ON(wiphy->wowlan &&
+ (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) &&
+ !(wiphy->wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY)))
+ return -EINVAL;
+ if (WARN_ON(wiphy->wowlan &&
+ !wiphy->wowlan->flags && !wiphy->wowlan->n_patterns &&
+ !wiphy->wowlan->tcp))
+ return -EINVAL;
+#endif
+ if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) &&
+ (!rdev->ops->tdls_channel_switch ||
+ !rdev->ops->tdls_cancel_channel_switch)))
+ return -EINVAL;
+
+ /*
+ * if a wiphy has unsupported modes for regulatory channel enforcement,
+ * opt-out of enforcement checking
+ */
+ if (wiphy->interface_modes & ~(BIT(NL80211_IFTYPE_STATION) |
+ BIT(NL80211_IFTYPE_P2P_CLIENT) |
+ BIT(NL80211_IFTYPE_AP) |
+ BIT(NL80211_IFTYPE_P2P_GO) |
+ BIT(NL80211_IFTYPE_ADHOC) |
+ BIT(NL80211_IFTYPE_P2P_DEVICE) |
+ BIT(NL80211_IFTYPE_AP_VLAN) |
+ BIT(NL80211_IFTYPE_MONITOR)))
+ wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF;
+
+ if (WARN_ON((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) &&
+ (wiphy->regulatory_flags &
+ (REGULATORY_CUSTOM_REG |
+ REGULATORY_STRICT_REG |
+ REGULATORY_COUNTRY_IE_FOLLOW_POWER |
+ REGULATORY_COUNTRY_IE_IGNORE))))
+ return -EINVAL;
+
+ if (WARN_ON(wiphy->coalesce &&
+ (!wiphy->coalesce->n_rules ||
+ !wiphy->coalesce->n_patterns) &&
+ (!wiphy->coalesce->pattern_min_len ||
+ wiphy->coalesce->pattern_min_len >
+ wiphy->coalesce->pattern_max_len)))
+ return -EINVAL;
+
+ if (WARN_ON(wiphy->ap_sme_capa &&
+ !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME)))
+ return -EINVAL;
+
+ if (WARN_ON(wiphy->addresses && !wiphy->n_addresses))
+ return -EINVAL;
+
+ if (WARN_ON(wiphy->addresses &&
+ !is_zero_ether_addr(wiphy->perm_addr) &&
+ memcmp(wiphy->perm_addr, wiphy->addresses[0].addr,
+ ETH_ALEN)))
+ return -EINVAL;
+
+ if (WARN_ON(wiphy->max_acl_mac_addrs &&
+ (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) ||
+ !rdev->ops->set_mac_acl)))
+ return -EINVAL;
+
+ if (wiphy->addresses)
+ memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
+
+ /* sanity check ifmodes */
+ WARN_ON(!ifmodes);
+ ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
+ if (WARN_ON(ifmodes != wiphy->interface_modes))
+ wiphy->interface_modes = ifmodes;
+
+ res = wiphy_verify_combinations(wiphy);
+ if (res)
+ return res;
+
+ /* sanity check supported bands/channels */
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ sband = wiphy->bands[band];
+ if (!sband)
+ continue;
+
+ sband->band = band;
+ if (WARN_ON(!sband->n_channels))
+ return -EINVAL;
+ /*
+ * on 60gHz band, there are no legacy rates, so
+ * n_bitrates is 0
+ */
+ if (WARN_ON(band != IEEE80211_BAND_60GHZ &&
+ !sband->n_bitrates))
+ return -EINVAL;
+
+ /*
+ * Since cfg80211_disable_40mhz_24ghz is global, we can
+ * modify the sband's ht data even if the driver uses a
+ * global structure for that.
+ */
+ if (cfg80211_disable_40mhz_24ghz &&
+ band == IEEE80211_BAND_2GHZ &&
+ sband->ht_cap.ht_supported) {
+ sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+ sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40;
+ }
+
+ /*
+ * Since we use a u32 for rate bitmaps in
+ * ieee80211_get_response_rate, we cannot
+ * have more than 32 legacy rates.
+ */
+ if (WARN_ON(sband->n_bitrates > 32))
+ return -EINVAL;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ sband->channels[i].orig_flags =
+ sband->channels[i].flags;
+ sband->channels[i].orig_mag = INT_MAX;
+ sband->channels[i].orig_mpwr =
+ sband->channels[i].max_power;
+ sband->channels[i].band = band;
+ }
+
+ have_band = true;
+ }
+
+ if (!have_band) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_PM
+ if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns &&
+ (!rdev->wiphy.wowlan->pattern_min_len ||
+ rdev->wiphy.wowlan->pattern_min_len >
+ rdev->wiphy.wowlan->pattern_max_len)))
+ return -EINVAL;
+#endif
+
+ /* check and set up bitrates */
+ ieee80211_set_bitrate_flags(wiphy);
+
+ rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH;
+
+ rtnl_lock();
+ res = device_add(&rdev->wiphy.dev);
+ if (res) {
+ rtnl_unlock();
+ return res;
+ }
+
+ /* set up regulatory info */
+ wiphy_regulatory_register(wiphy);
+
+ list_add_rcu(&rdev->list, &cfg80211_rdev_list);
+ cfg80211_rdev_list_generation++;
+
+ /* add to debugfs */
+ rdev->wiphy.debugfsdir =
+ debugfs_create_dir(wiphy_name(&rdev->wiphy),
+ ieee80211_debugfs_dir);
+ if (IS_ERR(rdev->wiphy.debugfsdir))
+ rdev->wiphy.debugfsdir = NULL;
+
+ cfg80211_debugfs_rdev_add(rdev);
+ nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
+
+ if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) {
+ struct regulatory_request request;
+
+ request.wiphy_idx = get_wiphy_idx(wiphy);
+ request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
+ request.alpha2[0] = '9';
+ request.alpha2[1] = '9';
+
+ nl80211_send_reg_change_event(&request);
+ }
+
+ rdev->wiphy.registered = true;
+ rtnl_unlock();
+
+ res = rfkill_register(rdev->rfkill);
+ if (res) {
+ rfkill_destroy(rdev->rfkill);
+ rdev->rfkill = NULL;
+ wiphy_unregister(&rdev->wiphy);
+ return res;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(wiphy_register);
+
+void wiphy_rfkill_start_polling(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ if (!rdev->ops->rfkill_poll)
+ return;
+ rdev->rfkill_ops.poll = cfg80211_rfkill_poll;
+ rfkill_resume_polling(rdev->rfkill);
+}
+EXPORT_SYMBOL(wiphy_rfkill_start_polling);
+
+void wiphy_rfkill_stop_polling(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ rfkill_pause_polling(rdev->rfkill);
+}
+EXPORT_SYMBOL(wiphy_rfkill_stop_polling);
+
+void wiphy_unregister(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ wait_event(rdev->dev_wait, ({
+ int __count;
+ rtnl_lock();
+ __count = rdev->opencount;
+ rtnl_unlock();
+ __count == 0; }));
+
+ if (rdev->rfkill)
+ rfkill_unregister(rdev->rfkill);
+
+ rtnl_lock();
+ nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY);
+ rdev->wiphy.registered = false;
+
+ WARN_ON(!list_empty(&rdev->wdev_list));
+
+ /*
+ * First remove the hardware from everywhere, this makes
+ * it impossible to find from userspace.
+ */
+ debugfs_remove_recursive(rdev->wiphy.debugfsdir);
+ list_del_rcu(&rdev->list);
+ synchronize_rcu();
+
+ /*
+ * If this device got a regulatory hint tell core its
+ * free to listen now to a new shiny device regulatory hint
+ */
+ wiphy_regulatory_deregister(wiphy);
+
+ cfg80211_rdev_list_generation++;
+ device_del(&rdev->wiphy.dev);
+
+ rtnl_unlock();
+
+ flush_work(&rdev->scan_done_wk);
+ cancel_work_sync(&rdev->conn_work);
+ flush_work(&rdev->event_work);
+ cancel_delayed_work_sync(&rdev->dfs_update_channels_wk);
+ flush_work(&rdev->destroy_work);
+ flush_work(&rdev->sched_scan_stop_wk);
+
+#ifdef CONFIG_PM
+ if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
+ rdev_set_wakeup(rdev, false);
+#endif
+ cfg80211_rdev_free_wowlan(rdev);
+ cfg80211_rdev_free_coalesce(rdev);
+}
+EXPORT_SYMBOL(wiphy_unregister);
+
+void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_internal_bss *scan, *tmp;
+ struct cfg80211_beacon_registration *reg, *treg;
+ rfkill_destroy(rdev->rfkill);
+ list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) {
+ list_del(&reg->list);
+ kfree(reg);
+ }
+ list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list)
+ cfg80211_put_bss(&rdev->wiphy, &scan->pub);
+ kfree(rdev);
+}
+
+void wiphy_free(struct wiphy *wiphy)
+{
+ put_device(&wiphy->dev);
+}
+EXPORT_SYMBOL(wiphy_free);
+
+void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ if (rfkill_set_hw_state(rdev->rfkill, blocked))
+ schedule_work(&rdev->rfkill_sync);
+}
+EXPORT_SYMBOL(wiphy_rfkill_set_hw_state);
+
+void cfg80211_unregister_wdev(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ ASSERT_RTNL();
+
+ if (WARN_ON(wdev->netdev))
+ return;
+
+ list_del_rcu(&wdev->list);
+ rdev->devlist_generation++;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ cfg80211_stop_p2p_device(rdev, wdev);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+EXPORT_SYMBOL(cfg80211_unregister_wdev);
+
+static const struct device_type wiphy_type = {
+ .name = "wlan",
+};
+
+void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
+ enum nl80211_iftype iftype, int num)
+{
+ ASSERT_RTNL();
+
+ rdev->num_running_ifaces += num;
+ if (iftype == NL80211_IFTYPE_MONITOR)
+ rdev->num_running_monitor_ifaces += num;
+}
+
+void __cfg80211_leave(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ struct net_device *dev = wdev->netdev;
+ struct cfg80211_sched_scan_request *sched_scan_req;
+
+ ASSERT_RTNL();
+ ASSERT_WDEV_LOCK(wdev);
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ __cfg80211_leave_ibss(rdev, dev, true);
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_STATION:
+ sched_scan_req = rtnl_dereference(rdev->sched_scan_req);
+ if (sched_scan_req && dev == sched_scan_req->dev)
+ __cfg80211_stop_sched_scan(rdev, false);
+
+#ifdef CONFIG_CFG80211_WEXT
+ kfree(wdev->wext.ie);
+ wdev->wext.ie = NULL;
+ wdev->wext.ie_len = 0;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+#endif
+ cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, true);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ __cfg80211_leave_mesh(rdev, dev);
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ __cfg80211_stop_ap(rdev, dev, true);
+ break;
+ case NL80211_IFTYPE_OCB:
+ __cfg80211_leave_ocb(rdev, dev);
+ break;
+ case NL80211_IFTYPE_WDS:
+ /* must be handled by mac80211/driver, has no APIs */
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /* cannot happen, has no netdev */
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ /* nothing to do */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ /* invalid */
+ break;
+ }
+}
+
+void cfg80211_leave(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ wdev_lock(wdev);
+ __cfg80211_leave(rdev, wdev);
+ wdev_unlock(wdev);
+}
+
+void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
+ gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_event *ev;
+ unsigned long flags;
+
+ trace_cfg80211_stop_iface(wiphy, wdev);
+
+ ev = kzalloc(sizeof(*ev), gfp);
+ if (!ev)
+ return;
+
+ ev->type = EVENT_STOPPED;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ list_add_tail(&ev->list, &wdev->event_list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+ queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_stop_iface);
+
+static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev;
+ struct cfg80211_sched_scan_request *sched_scan_req;
+
+ if (!wdev)
+ return NOTIFY_DONE;
+
+ rdev = wiphy_to_rdev(wdev->wiphy);
+
+ WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED);
+
+ switch (state) {
+ case NETDEV_POST_INIT:
+ SET_NETDEV_DEVTYPE(dev, &wiphy_type);
+ break;
+ case NETDEV_REGISTER:
+ /*
+ * NB: cannot take rdev->mtx here because this may be
+ * called within code protected by it when interfaces
+ * are added with nl80211.
+ */
+ mutex_init(&wdev->mtx);
+ INIT_LIST_HEAD(&wdev->event_list);
+ spin_lock_init(&wdev->event_lock);
+ INIT_LIST_HEAD(&wdev->mgmt_registrations);
+ spin_lock_init(&wdev->mgmt_registrations_lock);
+
+ wdev->identifier = ++rdev->wdev_id;
+ list_add_rcu(&wdev->list, &rdev->wdev_list);
+ rdev->devlist_generation++;
+ /* can only change netns with wiphy */
+ dev->features |= NETIF_F_NETNS_LOCAL;
+
+ if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj,
+ "phy80211")) {
+ pr_err("failed to add phy80211 symlink to netdev!\n");
+ }
+ wdev->netdev = dev;
+#ifdef CONFIG_CFG80211_WEXT
+ wdev->wext.default_key = -1;
+ wdev->wext.default_mgmt_key = -1;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+#endif
+
+ if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT)
+ wdev->ps = true;
+ else
+ wdev->ps = false;
+ /* allow mac80211 to determine the timeout */
+ wdev->ps_timeout = -1;
+
+ if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+ wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
+ wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
+ dev->priv_flags |= IFF_DONT_BRIDGE;
+ break;
+ case NETDEV_GOING_DOWN:
+ cfg80211_leave(rdev, wdev);
+ break;
+ case NETDEV_DOWN:
+ cfg80211_update_iface_num(rdev, wdev->iftype, -1);
+ if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
+ if (WARN_ON(!rdev->scan_req->notified))
+ rdev->scan_req->aborted = true;
+ ___cfg80211_scan_done(rdev, false);
+ }
+
+ sched_scan_req = rtnl_dereference(rdev->sched_scan_req);
+ if (WARN_ON(sched_scan_req &&
+ sched_scan_req->dev == wdev->netdev)) {
+ __cfg80211_stop_sched_scan(rdev, false);
+ }
+
+ rdev->opencount--;
+ wake_up(&rdev->dev_wait);
+ break;
+ case NETDEV_UP:
+ cfg80211_update_iface_num(rdev, wdev->iftype, 1);
+ wdev_lock(wdev);
+ switch (wdev->iftype) {
+#ifdef CONFIG_CFG80211_WEXT
+ case NL80211_IFTYPE_ADHOC:
+ cfg80211_ibss_wext_join(rdev, wdev);
+ break;
+ case NL80211_IFTYPE_STATION:
+ cfg80211_mgd_wext_connect(rdev, wdev);
+ break;
+#endif
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ {
+ /* backward compat code... */
+ struct mesh_setup setup;
+ memcpy(&setup, &default_mesh_setup,
+ sizeof(setup));
+ /* back compat only needed for mesh_id */
+ setup.mesh_id = wdev->ssid;
+ setup.mesh_id_len = wdev->mesh_id_up_len;
+ if (wdev->mesh_id_up_len)
+ __cfg80211_join_mesh(rdev, dev,
+ &setup,
+ &default_mesh_config);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ wdev_unlock(wdev);
+ rdev->opencount++;
+
+ /*
+ * Configure power management to the driver here so that its
+ * correctly set also after interface type changes etc.
+ */
+ if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+ wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) &&
+ rdev->ops->set_power_mgmt)
+ if (rdev_set_power_mgmt(rdev, dev, wdev->ps,
+ wdev->ps_timeout)) {
+ /* assume this means it's off */
+ wdev->ps = false;
+ }
+ break;
+ case NETDEV_UNREGISTER:
+ /*
+ * It is possible to get NETDEV_UNREGISTER
+ * multiple times. To detect that, check
+ * that the interface is still on the list
+ * of registered interfaces, and only then
+ * remove and clean it up.
+ */
+ if (!list_empty(&wdev->list)) {
+ sysfs_remove_link(&dev->dev.kobj, "phy80211");
+ list_del_rcu(&wdev->list);
+ rdev->devlist_generation++;
+ cfg80211_mlme_purge_registrations(wdev);
+#ifdef CONFIG_CFG80211_WEXT
+ kzfree(wdev->wext.keys);
+#endif
+ }
+ /*
+ * synchronise (so that we won't find this netdev
+ * from other code any more) and then clear the list
+ * head so that the above code can safely check for
+ * !list_empty() to avoid double-cleanup.
+ */
+ synchronize_rcu();
+ INIT_LIST_HEAD(&wdev->list);
+ /*
+ * Ensure that all events have been processed and
+ * freed.
+ */
+ cfg80211_process_wdev_events(wdev);
+
+ if (WARN_ON(wdev->current_bss)) {
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ wdev->current_bss = NULL;
+ }
+ break;
+ case NETDEV_PRE_UP:
+ if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype)))
+ return notifier_from_errno(-EOPNOTSUPP);
+ if (rfkill_blocked(rdev->rfkill))
+ return notifier_from_errno(-ERFKILL);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cfg80211_netdev_notifier = {
+ .notifier_call = cfg80211_netdev_notifier_call,
+};
+
+static void __net_exit cfg80211_pernet_exit(struct net *net)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ if (net_eq(wiphy_net(&rdev->wiphy), net))
+ WARN_ON(cfg80211_switch_netns(rdev, &init_net));
+ }
+ rtnl_unlock();
+}
+
+static struct pernet_operations cfg80211_pernet_ops = {
+ .exit = cfg80211_pernet_exit,
+};
+
+static int __init cfg80211_init(void)
+{
+ int err;
+
+ err = register_pernet_device(&cfg80211_pernet_ops);
+ if (err)
+ goto out_fail_pernet;
+
+ err = wiphy_sysfs_init();
+ if (err)
+ goto out_fail_sysfs;
+
+ err = register_netdevice_notifier(&cfg80211_netdev_notifier);
+ if (err)
+ goto out_fail_notifier;
+
+ err = nl80211_init();
+ if (err)
+ goto out_fail_nl80211;
+
+ ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL);
+
+ err = regulatory_init();
+ if (err)
+ goto out_fail_reg;
+
+ cfg80211_wq = create_singlethread_workqueue("cfg80211");
+ if (!cfg80211_wq) {
+ err = -ENOMEM;
+ goto out_fail_wq;
+ }
+
+ return 0;
+
+out_fail_wq:
+ regulatory_exit();
+out_fail_reg:
+ debugfs_remove(ieee80211_debugfs_dir);
+ nl80211_exit();
+out_fail_nl80211:
+ unregister_netdevice_notifier(&cfg80211_netdev_notifier);
+out_fail_notifier:
+ wiphy_sysfs_exit();
+out_fail_sysfs:
+ unregister_pernet_device(&cfg80211_pernet_ops);
+out_fail_pernet:
+ return err;
+}
+subsys_initcall(cfg80211_init);
+
+static void __exit cfg80211_exit(void)
+{
+ debugfs_remove(ieee80211_debugfs_dir);
+ nl80211_exit();
+ unregister_netdevice_notifier(&cfg80211_netdev_notifier);
+ wiphy_sysfs_exit();
+ regulatory_exit();
+ unregister_pernet_device(&cfg80211_pernet_ops);
+ destroy_workqueue(cfg80211_wq);
+}
+module_exit(cfg80211_exit);
diff --git a/net/wireless/core.h b/net/wireless/core.h
new file mode 100644
index 000000000..801cd49c5
--- /dev/null
+++ b/net/wireless/core.h
@@ -0,0 +1,492 @@
+/*
+ * Wireless configuration interface internals.
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ */
+#ifndef __NET_WIRELESS_CORE_H
+#define __NET_WIRELESS_CORE_H
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/rbtree.h>
+#include <linux/debugfs.h>
+#include <linux/rfkill.h>
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include "reg.h"
+
+
+#define WIPHY_IDX_INVALID -1
+
+struct cfg80211_registered_device {
+ const struct cfg80211_ops *ops;
+ struct list_head list;
+
+ /* rfkill support */
+ struct rfkill_ops rfkill_ops;
+ struct rfkill *rfkill;
+ struct work_struct rfkill_sync;
+
+ /* ISO / IEC 3166 alpha2 for which this device is receiving
+ * country IEs on, this can help disregard country IEs from APs
+ * on the same alpha2 quickly. The alpha2 may differ from
+ * cfg80211_regdomain's alpha2 when an intersection has occurred.
+ * If the AP is reconfigured this can also be used to tell us if
+ * the country on the country IE changed. */
+ char country_ie_alpha2[2];
+
+ /*
+ * the driver requests the regulatory core to set this regulatory
+ * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED
+ * devices using the regulatory_set_wiphy_regd() API
+ */
+ const struct ieee80211_regdomain *requested_regd;
+
+ /* If a Country IE has been received this tells us the environment
+ * which its telling us its in. This defaults to ENVIRON_ANY */
+ enum environment_cap env;
+
+ /* wiphy index, internal only */
+ int wiphy_idx;
+
+ /* associated wireless interfaces, protected by rtnl or RCU */
+ struct list_head wdev_list;
+ int devlist_generation, wdev_id;
+ int opencount; /* also protected by devlist_mtx */
+ wait_queue_head_t dev_wait;
+
+ struct list_head beacon_registrations;
+ spinlock_t beacon_registrations_lock;
+
+ /* protected by RTNL only */
+ int num_running_ifaces;
+ int num_running_monitor_ifaces;
+
+ /* BSSes/scanning */
+ spinlock_t bss_lock;
+ struct list_head bss_list;
+ struct rb_root bss_tree;
+ u32 bss_generation;
+ struct cfg80211_scan_request *scan_req; /* protected by RTNL */
+ struct sk_buff *scan_msg;
+ struct cfg80211_sched_scan_request __rcu *sched_scan_req;
+ unsigned long suspend_at;
+ struct work_struct scan_done_wk;
+ struct work_struct sched_scan_results_wk;
+
+ struct genl_info *cur_cmd_info;
+
+ struct work_struct conn_work;
+ struct work_struct event_work;
+
+ struct delayed_work dfs_update_channels_wk;
+
+ /* netlink port which started critical protocol (0 means not started) */
+ u32 crit_proto_nlportid;
+
+ struct cfg80211_coalesce *coalesce;
+
+ spinlock_t destroy_list_lock;
+ struct list_head destroy_list;
+ struct work_struct destroy_work;
+
+ struct work_struct sched_scan_stop_wk;
+
+ /* must be last because of the way we do wiphy_priv(),
+ * and it should at least be aligned to NETDEV_ALIGN */
+ struct wiphy wiphy __aligned(NETDEV_ALIGN);
+};
+
+static inline
+struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy)
+{
+ BUG_ON(!wiphy);
+ return container_of(wiphy, struct cfg80211_registered_device, wiphy);
+}
+
+static inline void
+cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
+{
+#ifdef CONFIG_PM
+ int i;
+
+ if (!rdev->wiphy.wowlan_config)
+ return;
+ for (i = 0; i < rdev->wiphy.wowlan_config->n_patterns; i++)
+ kfree(rdev->wiphy.wowlan_config->patterns[i].mask);
+ kfree(rdev->wiphy.wowlan_config->patterns);
+ if (rdev->wiphy.wowlan_config->tcp &&
+ rdev->wiphy.wowlan_config->tcp->sock)
+ sock_release(rdev->wiphy.wowlan_config->tcp->sock);
+ kfree(rdev->wiphy.wowlan_config->tcp);
+ kfree(rdev->wiphy.wowlan_config->nd_config);
+ kfree(rdev->wiphy.wowlan_config);
+#endif
+}
+
+extern struct workqueue_struct *cfg80211_wq;
+extern struct list_head cfg80211_rdev_list;
+extern int cfg80211_rdev_list_generation;
+
+struct cfg80211_internal_bss {
+ struct list_head list;
+ struct list_head hidden_list;
+ struct rb_node rbn;
+ unsigned long ts;
+ unsigned long refcount;
+ atomic_t hold;
+
+ /* must be last because of priv member */
+ struct cfg80211_bss pub;
+};
+
+static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub)
+{
+ return container_of(pub, struct cfg80211_internal_bss, pub);
+}
+
+static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss)
+{
+ atomic_inc(&bss->hold);
+}
+
+static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss)
+{
+ int r = atomic_dec_return(&bss->hold);
+ WARN_ON(r < 0);
+}
+
+
+struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx);
+int get_wiphy_idx(struct wiphy *wiphy);
+
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
+
+int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
+ struct net *net);
+
+static inline void wdev_lock(struct wireless_dev *wdev)
+ __acquires(wdev)
+{
+ mutex_lock(&wdev->mtx);
+ __acquire(wdev->mtx);
+}
+
+static inline void wdev_unlock(struct wireless_dev *wdev)
+ __releases(wdev)
+{
+ __release(wdev->mtx);
+ mutex_unlock(&wdev->mtx);
+}
+
+#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)
+
+static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev)
+{
+ ASSERT_RTNL();
+
+ return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces &&
+ rdev->num_running_ifaces > 0;
+}
+
+enum cfg80211_event_type {
+ EVENT_CONNECT_RESULT,
+ EVENT_ROAMED,
+ EVENT_DISCONNECTED,
+ EVENT_IBSS_JOINED,
+ EVENT_STOPPED,
+};
+
+struct cfg80211_event {
+ struct list_head list;
+ enum cfg80211_event_type type;
+
+ union {
+ struct {
+ u8 bssid[ETH_ALEN];
+ const u8 *req_ie;
+ const u8 *resp_ie;
+ size_t req_ie_len;
+ size_t resp_ie_len;
+ u16 status;
+ } cr;
+ struct {
+ const u8 *req_ie;
+ const u8 *resp_ie;
+ size_t req_ie_len;
+ size_t resp_ie_len;
+ struct cfg80211_bss *bss;
+ } rm;
+ struct {
+ const u8 *ie;
+ size_t ie_len;
+ u16 reason;
+ } dc;
+ struct {
+ u8 bssid[ETH_ALEN];
+ struct ieee80211_channel *channel;
+ } ij;
+ };
+};
+
+struct cfg80211_cached_keys {
+ struct key_params params[6];
+ u8 data[6][WLAN_MAX_KEY_LEN];
+ int def, defmgmt;
+};
+
+enum cfg80211_chan_mode {
+ CHAN_MODE_UNDEFINED,
+ CHAN_MODE_SHARED,
+ CHAN_MODE_EXCLUSIVE,
+};
+
+struct cfg80211_beacon_registration {
+ struct list_head list;
+ u32 nlportid;
+};
+
+struct cfg80211_iface_destroy {
+ struct list_head list;
+ u32 nlportid;
+};
+
+void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev);
+
+/* free object */
+void cfg80211_dev_free(struct cfg80211_registered_device *rdev);
+
+int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
+ char *newname);
+
+void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
+
+void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
+void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
+ unsigned long age_secs);
+
+/* IBSS */
+int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ibss_params *params,
+ struct cfg80211_cached_keys *connkeys);
+void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
+int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool nowext);
+int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool nowext);
+void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
+ struct ieee80211_channel *channel);
+int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
+/* mesh */
+extern const struct mesh_config default_mesh_config;
+extern const struct mesh_setup default_mesh_setup;
+int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct mesh_setup *setup,
+ const struct mesh_config *conf);
+int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct mesh_setup *setup,
+ const struct mesh_config *conf);
+int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev);
+int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev);
+int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_chan_def *chandef);
+
+/* OCB */
+int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ocb_setup *setup);
+int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ocb_setup *setup);
+int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev);
+int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev);
+
+/* AP */
+int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool notify);
+int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool notify);
+
+/* MLME */
+int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ enum nl80211_auth_type auth_type,
+ const u8 *bssid,
+ const u8 *ssid, int ssid_len,
+ const u8 *ie, int ie_len,
+ const u8 *key, int key_len, int key_idx,
+ const u8 *sae_data, int sae_data_len);
+int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ const u8 *bssid,
+ const u8 *ssid, int ssid_len,
+ struct cfg80211_assoc_request *req);
+int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *bssid,
+ const u8 *ie, int ie_len, u16 reason,
+ bool local_state_change);
+int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *bssid,
+ const u8 *ie, int ie_len, u16 reason,
+ bool local_state_change);
+void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
+ struct net_device *dev);
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+ u16 frame_type, const u8 *match_data,
+ int match_len);
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params,
+ u64 *cookie);
+void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
+ const struct ieee80211_ht_cap *ht_capa_mask);
+void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
+ const struct ieee80211_vht_cap *vht_capa_mask);
+
+/* SME events */
+int cfg80211_connect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_connect_params *connect,
+ struct cfg80211_cached_keys *connkeys,
+ const u8 *prev_bssid);
+void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len,
+ u16 status, bool wextev,
+ struct cfg80211_bss *bss);
+void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
+ size_t ie_len, u16 reason, bool from_ap);
+int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u16 reason,
+ bool wextev);
+void __cfg80211_roamed(struct wireless_dev *wdev,
+ struct cfg80211_bss *bss,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len);
+int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
+/* SME implementation */
+void cfg80211_conn_work(struct work_struct *work);
+void cfg80211_sme_scan_done(struct net_device *dev);
+bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status);
+void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len);
+void cfg80211_sme_disassoc(struct wireless_dev *wdev);
+void cfg80211_sme_deauth(struct wireless_dev *wdev);
+void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
+void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
+
+/* internal helpers */
+bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
+int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
+ struct key_params *params, int key_idx,
+ bool pairwise, const u8 *mac_addr);
+void __cfg80211_scan_done(struct work_struct *wk);
+void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
+ bool send_message);
+void __cfg80211_sched_scan_results(struct work_struct *wk);
+int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
+ bool driver_initiated);
+void cfg80211_upload_connect_keys(struct wireless_dev *wdev);
+int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, enum nl80211_iftype ntype,
+ u32 *flags, struct vif_params *params);
+void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
+void cfg80211_process_wdev_events(struct wireless_dev *wdev);
+
+int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_iftype iftype,
+ struct ieee80211_channel *chan,
+ enum cfg80211_chan_mode chanmode,
+ u8 radar_detect);
+
+/**
+ * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
+ * @wiphy: the wiphy to validate against
+ * @chandef: the channel definition to check
+ *
+ * Checks if chandef is usable and we can/need start CAC on such channel.
+ *
+ * Return: Return true if all channels available and at least
+ * one channel require CAC (NL80211_DFS_USABLE)
+ */
+bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef);
+
+void cfg80211_set_dfs_state(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_dfs_state dfs_state);
+
+void cfg80211_dfs_channels_update_work(struct work_struct *work);
+
+unsigned int
+cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef);
+
+static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
+{
+ unsigned long end = jiffies;
+
+ if (end >= start)
+ return jiffies_to_msecs(end - start);
+
+ return jiffies_to_msecs(end + (ULONG_MAX - start) + 1);
+}
+
+void
+cfg80211_get_chan_state(struct wireless_dev *wdev,
+ struct ieee80211_channel **chan,
+ enum cfg80211_chan_mode *chanmode,
+ u8 *radar_detect);
+
+int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
+ struct cfg80211_chan_def *chandef);
+
+int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
+ const u8 *rates, unsigned int n_rates,
+ u32 *mask);
+
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+ u32 beacon_int);
+
+void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
+ enum nl80211_iftype iftype, int num);
+
+void __cfg80211_leave(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+void cfg80211_leave(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
+void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
+#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10
+
+#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
+#define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond)
+#else
+/*
+ * Trick to enable using it as a condition,
+ * and also not give a warning when it's
+ * not used that way.
+ */
+#define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; })
+#endif
+
+#endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/db.txt b/net/wireless/db.txt
new file mode 100644
index 000000000..a2fc3a09c
--- /dev/null
+++ b/net/wireless/db.txt
@@ -0,0 +1,17 @@
+#
+# This file is a placeholder to prevent accidental build breakage if someone
+# enables CONFIG_CFG80211_INTERNAL_REGDB. Almost no one actually needs to
+# enable that build option.
+#
+# You should be using CRDA instead. It is even better if you use the CRDA
+# package provided by your distribution, since they will probably keep it
+# up-to-date on your behalf.
+#
+# If you _really_ intend to use CONFIG_CFG80211_INTERNAL_REGDB then you will
+# need to replace this file with one containing appropriately formatted
+# regulatory rules that cover the regulatory domains you will be using. Your
+# best option is to extract the db.txt file from the wireless-regdb git
+# repository:
+#
+# git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git
+#
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
new file mode 100644
index 000000000..454157717
--- /dev/null
+++ b/net/wireless/debugfs.c
@@ -0,0 +1,117 @@
+/*
+ * cfg80211 debugfs
+ *
+ * Copyright 2009 Luis R. Rodriguez <lrodriguez@atheros.com>
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include "core.h"
+#include "debugfs.h"
+
+#define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...) \
+static ssize_t name## _read(struct file *file, char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ struct wiphy *wiphy= file->private_data; \
+ char buf[buflen]; \
+ int res; \
+ \
+ res = scnprintf(buf, buflen, fmt "\n", ##value); \
+ return simple_read_from_buffer(userbuf, count, ppos, buf, res); \
+} \
+ \
+static const struct file_operations name## _ops = { \
+ .read = name## _read, \
+ .open = simple_open, \
+ .llseek = generic_file_llseek, \
+};
+
+DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
+ wiphy->rts_threshold)
+DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
+ wiphy->frag_threshold);
+DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
+ wiphy->retry_short)
+DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
+ wiphy->retry_long);
+
+static int ht_print_chan(struct ieee80211_channel *chan,
+ char *buf, int buf_size, int offset)
+{
+ if (WARN_ON(offset > buf_size))
+ return 0;
+
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ return scnprintf(buf + offset,
+ buf_size - offset,
+ "%d Disabled\n",
+ chan->center_freq);
+
+ return scnprintf(buf + offset,
+ buf_size - offset,
+ "%d HT40 %c%c\n",
+ chan->center_freq,
+ (chan->flags & IEEE80211_CHAN_NO_HT40MINUS) ?
+ ' ' : '-',
+ (chan->flags & IEEE80211_CHAN_NO_HT40PLUS) ?
+ ' ' : '+');
+}
+
+static ssize_t ht40allow_map_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct wiphy *wiphy = file->private_data;
+ char *buf;
+ unsigned int offset = 0, buf_size = PAGE_SIZE, i, r;
+ enum ieee80211_band band;
+ struct ieee80211_supported_band *sband;
+
+ buf = kzalloc(buf_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ rtnl_lock();
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ sband = wiphy->bands[band];
+ if (!sband)
+ continue;
+ for (i = 0; i < sband->n_channels; i++)
+ offset += ht_print_chan(&sband->channels[i],
+ buf, buf_size, offset);
+ }
+
+ rtnl_unlock();
+
+ r = simple_read_from_buffer(user_buf, count, ppos, buf, offset);
+
+ kfree(buf);
+
+ return r;
+}
+
+static const struct file_operations ht40allow_map_ops = {
+ .read = ht40allow_map_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+#define DEBUGFS_ADD(name) \
+ debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops);
+
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
+{
+ struct dentry *phyd = rdev->wiphy.debugfsdir;
+
+ DEBUGFS_ADD(rts_threshold);
+ DEBUGFS_ADD(fragmentation_threshold);
+ DEBUGFS_ADD(short_retry_limit);
+ DEBUGFS_ADD(long_retry_limit);
+ DEBUGFS_ADD(ht40allow_map);
+}
diff --git a/net/wireless/debugfs.h b/net/wireless/debugfs.h
new file mode 100644
index 000000000..74fdd3811
--- /dev/null
+++ b/net/wireless/debugfs.h
@@ -0,0 +1,11 @@
+#ifndef __CFG80211_DEBUGFS_H
+#define __CFG80211_DEBUGFS_H
+
+#ifdef CONFIG_CFG80211_DEBUGFS
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev);
+#else
+static inline
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) {}
+#endif
+
+#endif /* __CFG80211_DEBUGFS_H */
diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
new file mode 100644
index 000000000..e9e91298c
--- /dev/null
+++ b/net/wireless/ethtool.c
@@ -0,0 +1,24 @@
+#include <linux/utsname.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "rdev-ops.h"
+
+void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name,
+ sizeof(info->driver));
+
+ strlcpy(info->version, init_utsname()->release, sizeof(info->version));
+
+ if (wdev->wiphy->fw_version[0])
+ strlcpy(info->fw_version, wdev->wiphy->fw_version,
+ sizeof(info->fw_version));
+ else
+ strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+
+ strlcpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)),
+ sizeof(info->bus_info));
+}
+EXPORT_SYMBOL(cfg80211_get_drvinfo);
diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk
new file mode 100644
index 000000000..baf2426b5
--- /dev/null
+++ b/net/wireless/genregdb.awk
@@ -0,0 +1,158 @@
+#!/usr/bin/awk -f
+#
+# genregdb.awk -- generate regdb.c from db.txt
+#
+# Actually, it reads from stdin (presumed to be db.txt) and writes
+# to stdout (presumed to be regdb.c), but close enough...
+#
+# Copyright 2009 John W. Linville <linville@tuxdriver.com>
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+BEGIN {
+ active = 0
+ rules = 0;
+ print "/*"
+ print " * DO NOT EDIT -- file generated from data in db.txt"
+ print " */"
+ print ""
+ print "#include <linux/nl80211.h>"
+ print "#include <net/cfg80211.h>"
+ print "#include \"regdb.h\""
+ print ""
+ regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n"
+}
+
+function parse_country_head() {
+ country=$2
+ sub(/:/, "", country)
+ printf "static const struct ieee80211_regdomain regdom_%s = {\n", country
+ printf "\t.alpha2 = \"%s\",\n", country
+ if ($NF ~ /DFS-ETSI/)
+ printf "\t.dfs_region = NL80211_DFS_ETSI,\n"
+ else if ($NF ~ /DFS-FCC/)
+ printf "\t.dfs_region = NL80211_DFS_FCC,\n"
+ else if ($NF ~ /DFS-JP/)
+ printf "\t.dfs_region = NL80211_DFS_JP,\n"
+ printf "\t.reg_rules = {\n"
+ active = 1
+ regdb = regdb "\t&regdom_" country ",\n"
+}
+
+function parse_reg_rule()
+{
+ flag_starts_at = 7
+
+ start = $1
+ sub(/\(/, "", start)
+ end = $3
+ bw = $5
+ sub(/\),/, "", bw)
+ gain = 0
+ power = $6
+ # power might be in mW...
+ units = $7
+ dfs_cac = 0
+
+ sub(/\(/, "", power)
+ sub(/\),/, "", power)
+ sub(/\),/, "", units)
+ sub(/\)/, "", units)
+
+ if (units == "mW") {
+ flag_starts_at = 8
+ power = 10 * log(power)/log(10)
+ if ($8 ~ /[[:digit:]]/) {
+ flag_starts_at = 9
+ dfs_cac = $8
+ }
+ } else {
+ if ($7 ~ /[[:digit:]]/) {
+ flag_starts_at = 8
+ dfs_cac = $7
+ }
+ }
+ sub(/\(/, "", dfs_cac)
+ sub(/\),/, "", dfs_cac)
+ flagstr = ""
+ for (i=flag_starts_at; i<=NF; i++)
+ flagstr = flagstr $i
+ split(flagstr, flagarray, ",")
+ flags = ""
+ for (arg in flagarray) {
+ if (flagarray[arg] == "NO-OFDM") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_OFDM | "
+ } else if (flagarray[arg] == "NO-CCK") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_CCK | "
+ } else if (flagarray[arg] == "NO-INDOOR") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_INDOOR | "
+ } else if (flagarray[arg] == "NO-OUTDOOR") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_OUTDOOR | "
+ } else if (flagarray[arg] == "DFS") {
+ flags = flags "\n\t\t\tNL80211_RRF_DFS | "
+ } else if (flagarray[arg] == "PTP-ONLY") {
+ flags = flags "\n\t\t\tNL80211_RRF_PTP_ONLY | "
+ } else if (flagarray[arg] == "PTMP-ONLY") {
+ flags = flags "\n\t\t\tNL80211_RRF_PTMP_ONLY | "
+ } else if (flagarray[arg] == "PASSIVE-SCAN") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_IR | "
+ } else if (flagarray[arg] == "NO-IBSS") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_IR | "
+ } else if (flagarray[arg] == "NO-IR") {
+ flags = flags "\n\t\t\tNL80211_RRF_NO_IR | "
+ } else if (flagarray[arg] == "AUTO-BW") {
+ flags = flags "\n\t\t\tNL80211_RRF_AUTO_BW | "
+ }
+
+ }
+ flags = flags "0"
+ printf "\t\tREG_RULE_EXT(%d, %d, %d, %d, %.0f, %d, %s),\n", start, end, bw, gain, power, dfs_cac, flags
+ rules++
+}
+
+function print_tail_country()
+{
+ active = 0
+ printf "\t},\n"
+ printf "\t.n_reg_rules = %d\n", rules
+ printf "};\n\n"
+ rules = 0;
+}
+
+/^[ \t]*#/ {
+ # Ignore
+}
+
+!active && /^[ \t]*$/ {
+ # Ignore
+}
+
+!active && /country/ {
+ parse_country_head()
+}
+
+active && /^[ \t]*\(/ {
+ parse_reg_rule()
+}
+
+active && /^[ \t]*$/ {
+ print_tail_country()
+}
+
+END {
+ if (active)
+ print_tail_country()
+ print regdb "};"
+ print ""
+ print "int reg_regdb_size = ARRAY_SIZE(reg_regdb);"
+}
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
new file mode 100644
index 000000000..4c55fab9b
--- /dev/null
+++ b/net/wireless/ibss.c
@@ -0,0 +1,542 @@
+/*
+ * Some IBSS support code for cfg80211.
+ *
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include "wext-compat.h"
+#include "nl80211.h"
+#include "rdev-ops.h"
+
+
+void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
+ struct ieee80211_channel *channel)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_bss *bss;
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+#endif
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return;
+
+ if (!wdev->ssid_len)
+ return;
+
+ bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0,
+ IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY_ANY);
+
+ if (WARN_ON(!bss))
+ return;
+
+ if (wdev->current_bss) {
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ }
+
+ cfg80211_hold_bss(bss_from_pub(bss));
+ wdev->current_bss = bss_from_pub(bss);
+
+ cfg80211_upload_connect_keys(wdev);
+
+ nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid,
+ GFP_KERNEL);
+#ifdef CONFIG_CFG80211_WEXT
+ memset(&wrqu, 0, sizeof(wrqu));
+ memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+ wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+#endif
+}
+
+void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
+ struct ieee80211_channel *channel, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_event *ev;
+ unsigned long flags;
+
+ trace_cfg80211_ibss_joined(dev, bssid, channel);
+
+ if (WARN_ON(!channel))
+ return;
+
+ ev = kzalloc(sizeof(*ev), gfp);
+ if (!ev)
+ return;
+
+ ev->type = EVENT_IBSS_JOINED;
+ memcpy(ev->ij.bssid, bssid, ETH_ALEN);
+ ev->ij.channel = channel;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ list_add_tail(&ev->list, &wdev->event_list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+ queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_ibss_joined);
+
+static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ibss_params *params,
+ struct cfg80211_cached_keys *connkeys)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (wdev->ssid_len)
+ return -EALREADY;
+
+ if (!params->basic_rates) {
+ /*
+ * If no rates were explicitly configured,
+ * use the mandatory rate set for 11b or
+ * 11a for maximum compatibility.
+ */
+ struct ieee80211_supported_band *sband =
+ rdev->wiphy.bands[params->chandef.chan->band];
+ int j;
+ u32 flag = params->chandef.chan->band == IEEE80211_BAND_5GHZ ?
+ IEEE80211_RATE_MANDATORY_A :
+ IEEE80211_RATE_MANDATORY_B;
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ if (sband->bitrates[j].flags & flag)
+ params->basic_rates |= BIT(j);
+ }
+ }
+
+ if (WARN_ON(wdev->connect_keys))
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = connkeys;
+
+ wdev->ibss_fixed = params->channel_fixed;
+ wdev->ibss_dfs_possible = params->userspace_handles_dfs;
+ wdev->chandef = params->chandef;
+#ifdef CONFIG_CFG80211_WEXT
+ wdev->wext.ibss.chandef = params->chandef;
+#endif
+ err = rdev_join_ibss(rdev, dev, params);
+ if (err) {
+ wdev->connect_keys = NULL;
+ return err;
+ }
+
+ memcpy(wdev->ssid, params->ssid, params->ssid_len);
+ wdev->ssid_len = params->ssid_len;
+
+ return 0;
+}
+
+int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ibss_params *params,
+ struct cfg80211_cached_keys *connkeys)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_RTNL();
+
+ wdev_lock(wdev);
+ err = __cfg80211_join_ibss(rdev, dev, params, connkeys);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int i;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = NULL;
+
+ rdev_set_qos_map(rdev, dev, NULL);
+
+ /*
+ * Delete all the keys ... pairwise keys can't really
+ * exist any more anyway, but default keys might.
+ */
+ if (rdev->ops->del_key)
+ for (i = 0; i < 6; i++)
+ rdev_del_key(rdev, dev, i, false, NULL);
+
+ if (wdev->current_bss) {
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ }
+
+ wdev->current_bss = NULL;
+ wdev->ssid_len = 0;
+ memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+#ifdef CONFIG_CFG80211_WEXT
+ if (!nowext)
+ wdev->wext.ibss.ssid_len = 0;
+#endif
+}
+
+void cfg80211_clear_ibss(struct net_device *dev, bool nowext)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ wdev_lock(wdev);
+ __cfg80211_clear_ibss(dev, nowext);
+ wdev_unlock(wdev);
+}
+
+int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool nowext)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->ssid_len)
+ return -ENOLINK;
+
+ err = rdev_leave_ibss(rdev, dev);
+
+ if (err)
+ return err;
+
+ __cfg80211_clear_ibss(dev, nowext);
+
+ return 0;
+}
+
+int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool nowext)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_leave_ibss(rdev, dev, nowext);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+#ifdef CONFIG_CFG80211_WEXT
+int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ struct cfg80211_cached_keys *ck = NULL;
+ enum ieee80211_band band;
+ int i, err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->wext.ibss.beacon_interval)
+ wdev->wext.ibss.beacon_interval = 100;
+
+ /* try to find an IBSS channel if none requested ... */
+ if (!wdev->wext.ibss.chandef.chan) {
+ struct ieee80211_channel *new_chan = NULL;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+
+ sband = rdev->wiphy.bands[band];
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ chan = &sband->channels[i];
+ if (chan->flags & IEEE80211_CHAN_NO_IR)
+ continue;
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+ new_chan = chan;
+ break;
+ }
+
+ if (new_chan)
+ break;
+ }
+
+ if (!new_chan)
+ return -EINVAL;
+
+ cfg80211_chandef_create(&wdev->wext.ibss.chandef, new_chan,
+ NL80211_CHAN_NO_HT);
+ }
+
+ /* don't join -- SSID is not there */
+ if (!wdev->wext.ibss.ssid_len)
+ return 0;
+
+ if (!netif_running(wdev->netdev))
+ return 0;
+
+ if (wdev->wext.keys) {
+ wdev->wext.keys->def = wdev->wext.default_key;
+ wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+ }
+
+ wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
+
+ if (wdev->wext.keys) {
+ ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return -ENOMEM;
+ for (i = 0; i < 6; i++)
+ ck->params[i].key = ck->data[i];
+ }
+ err = __cfg80211_join_ibss(rdev, wdev->netdev,
+ &wdev->wext.ibss, ck);
+ if (err)
+ kfree(ck);
+
+ return err;
+}
+
+int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *wextfreq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct ieee80211_channel *chan = NULL;
+ int err, freq;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ if (!rdev->ops->join_ibss)
+ return -EOPNOTSUPP;
+
+ freq = cfg80211_wext_freq(wextfreq);
+ if (freq < 0)
+ return freq;
+
+ if (freq) {
+ chan = ieee80211_get_channel(wdev->wiphy, freq);
+ if (!chan)
+ return -EINVAL;
+ if (chan->flags & IEEE80211_CHAN_NO_IR ||
+ chan->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
+ }
+
+ if (wdev->wext.ibss.chandef.chan == chan)
+ return 0;
+
+ wdev_lock(wdev);
+ err = 0;
+ if (wdev->ssid_len)
+ err = __cfg80211_leave_ibss(rdev, dev, true);
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
+
+ if (chan) {
+ cfg80211_chandef_create(&wdev->wext.ibss.chandef, chan,
+ NL80211_CHAN_NO_HT);
+ wdev->wext.ibss.channel_fixed = true;
+ } else {
+ /* cfg80211_ibss_wext_join will pick one if needed */
+ wdev->wext.ibss.channel_fixed = false;
+ }
+
+ wdev_lock(wdev);
+ err = cfg80211_ibss_wext_join(rdev, wdev);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct ieee80211_channel *chan = NULL;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ chan = wdev->current_bss->pub.channel;
+ else if (wdev->wext.ibss.chandef.chan)
+ chan = wdev->wext.ibss.chandef.chan;
+ wdev_unlock(wdev);
+
+ if (chan) {
+ freq->m = chan->center_freq;
+ freq->e = 6;
+ return 0;
+ }
+
+ /* no channel if not joining */
+ return -EINVAL;
+}
+
+int cfg80211_ibss_wext_siwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ size_t len = data->length;
+ int err;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ if (!rdev->ops->join_ibss)
+ return -EOPNOTSUPP;
+
+ wdev_lock(wdev);
+ err = 0;
+ if (wdev->ssid_len)
+ err = __cfg80211_leave_ibss(rdev, dev, true);
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
+
+ /* iwconfig uses nul termination in SSID.. */
+ if (len > 0 && ssid[len - 1] == '\0')
+ len--;
+
+ memcpy(wdev->ssid, ssid, len);
+ wdev->wext.ibss.ssid = wdev->ssid;
+ wdev->wext.ibss.ssid_len = len;
+
+ wdev_lock(wdev);
+ err = cfg80211_ibss_wext_join(rdev, wdev);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+int cfg80211_ibss_wext_giwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ data->flags = 0;
+
+ wdev_lock(wdev);
+ if (wdev->ssid_len) {
+ data->flags = 1;
+ data->length = wdev->ssid_len;
+ memcpy(ssid, wdev->ssid, data->length);
+ } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) {
+ data->flags = 1;
+ data->length = wdev->wext.ibss.ssid_len;
+ memcpy(ssid, wdev->wext.ibss.ssid, data->length);
+ }
+ wdev_unlock(wdev);
+
+ return 0;
+}
+
+int cfg80211_ibss_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u8 *bssid = ap_addr->sa_data;
+ int err;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ if (!rdev->ops->join_ibss)
+ return -EOPNOTSUPP;
+
+ if (ap_addr->sa_family != ARPHRD_ETHER)
+ return -EINVAL;
+
+ /* automatic mode */
+ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid))
+ bssid = NULL;
+
+ if (bssid && !is_valid_ether_addr(bssid))
+ return -EINVAL;
+
+ /* both automatic */
+ if (!bssid && !wdev->wext.ibss.bssid)
+ return 0;
+
+ /* fixed already - and no change */
+ if (wdev->wext.ibss.bssid && bssid &&
+ ether_addr_equal(bssid, wdev->wext.ibss.bssid))
+ return 0;
+
+ wdev_lock(wdev);
+ err = 0;
+ if (wdev->ssid_len)
+ err = __cfg80211_leave_ibss(rdev, dev, true);
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
+
+ if (bssid) {
+ memcpy(wdev->wext.bssid, bssid, ETH_ALEN);
+ wdev->wext.ibss.bssid = wdev->wext.bssid;
+ } else
+ wdev->wext.ibss.bssid = NULL;
+
+ wdev_lock(wdev);
+ err = cfg80211_ibss_wext_join(rdev, wdev);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+int cfg80211_ibss_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ /* call only for ibss! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ ap_addr->sa_family = ARPHRD_ETHER;
+
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN);
+ else if (wdev->wext.ibss.bssid)
+ memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN);
+ else
+ eth_zero_addr(ap_addr->sa_data);
+
+ wdev_unlock(wdev);
+
+ return 0;
+}
+#endif
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
new file mode 100644
index 000000000..459611577
--- /dev/null
+++ b/net/wireless/lib80211.c
@@ -0,0 +1,257 @@
+/*
+ * lib80211 -- common bits for IEEE802.11 drivers
+ *
+ * Copyright(c) 2008 John W. Linville <linville@tuxdriver.com>
+ *
+ * Portions copied from old ieee80211 component, w/ original copyright
+ * notices below:
+ *
+ * Host AP crypto routines
+ *
+ * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi>
+ * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/ieee80211.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <net/lib80211.h>
+
+#define DRV_NAME "lib80211"
+
+#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers"
+
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
+MODULE_LICENSE("GPL");
+
+struct lib80211_crypto_alg {
+ struct list_head list;
+ struct lib80211_crypto_ops *ops;
+};
+
+static LIST_HEAD(lib80211_crypto_algs);
+static DEFINE_SPINLOCK(lib80211_crypto_lock);
+
+static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info,
+ int force);
+static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info);
+static void lib80211_crypt_deinit_handler(unsigned long data);
+
+int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
+ spinlock_t *lock)
+{
+ memset(info, 0, sizeof(*info));
+
+ info->name = name;
+ info->lock = lock;
+
+ INIT_LIST_HEAD(&info->crypt_deinit_list);
+ setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
+ (unsigned long)info);
+
+ return 0;
+}
+EXPORT_SYMBOL(lib80211_crypt_info_init);
+
+void lib80211_crypt_info_free(struct lib80211_crypt_info *info)
+{
+ int i;
+
+ lib80211_crypt_quiescing(info);
+ del_timer_sync(&info->crypt_deinit_timer);
+ lib80211_crypt_deinit_entries(info, 1);
+
+ for (i = 0; i < NUM_WEP_KEYS; i++) {
+ struct lib80211_crypt_data *crypt = info->crypt[i];
+ if (crypt) {
+ if (crypt->ops) {
+ crypt->ops->deinit(crypt->priv);
+ module_put(crypt->ops->owner);
+ }
+ kfree(crypt);
+ info->crypt[i] = NULL;
+ }
+ }
+}
+EXPORT_SYMBOL(lib80211_crypt_info_free);
+
+static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info,
+ int force)
+{
+ struct lib80211_crypt_data *entry, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(info->lock, flags);
+ list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) {
+ if (atomic_read(&entry->refcnt) != 0 && !force)
+ continue;
+
+ list_del(&entry->list);
+
+ if (entry->ops) {
+ entry->ops->deinit(entry->priv);
+ module_put(entry->ops->owner);
+ }
+ kfree(entry);
+ }
+ spin_unlock_irqrestore(info->lock, flags);
+}
+
+/* After this, crypt_deinit_list won't accept new members */
+static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(info->lock, flags);
+ info->crypt_quiesced = 1;
+ spin_unlock_irqrestore(info->lock, flags);
+}
+
+static void lib80211_crypt_deinit_handler(unsigned long data)
+{
+ struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data;
+ unsigned long flags;
+
+ lib80211_crypt_deinit_entries(info, 0);
+
+ spin_lock_irqsave(info->lock, flags);
+ if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) {
+ printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
+ "deletion list\n", info->name);
+ info->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&info->crypt_deinit_timer);
+ }
+ spin_unlock_irqrestore(info->lock, flags);
+}
+
+void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info,
+ struct lib80211_crypt_data **crypt)
+{
+ struct lib80211_crypt_data *tmp;
+ unsigned long flags;
+
+ if (*crypt == NULL)
+ return;
+
+ tmp = *crypt;
+ *crypt = NULL;
+
+ /* must not run ops->deinit() while there may be pending encrypt or
+ * decrypt operations. Use a list of delayed deinits to avoid needing
+ * locking. */
+
+ spin_lock_irqsave(info->lock, flags);
+ if (!info->crypt_quiesced) {
+ list_add(&tmp->list, &info->crypt_deinit_list);
+ if (!timer_pending(&info->crypt_deinit_timer)) {
+ info->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&info->crypt_deinit_timer);
+ }
+ }
+ spin_unlock_irqrestore(info->lock, flags);
+}
+EXPORT_SYMBOL(lib80211_crypt_delayed_deinit);
+
+int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops)
+{
+ unsigned long flags;
+ struct lib80211_crypto_alg *alg;
+
+ alg = kzalloc(sizeof(*alg), GFP_KERNEL);
+ if (alg == NULL)
+ return -ENOMEM;
+
+ alg->ops = ops;
+
+ spin_lock_irqsave(&lib80211_crypto_lock, flags);
+ list_add(&alg->list, &lib80211_crypto_algs);
+ spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+
+ printk(KERN_DEBUG "lib80211_crypt: registered algorithm '%s'\n",
+ ops->name);
+
+ return 0;
+}
+EXPORT_SYMBOL(lib80211_register_crypto_ops);
+
+int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops)
+{
+ struct lib80211_crypto_alg *alg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lib80211_crypto_lock, flags);
+ list_for_each_entry(alg, &lib80211_crypto_algs, list) {
+ if (alg->ops == ops)
+ goto found;
+ }
+ spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+ return -EINVAL;
+
+ found:
+ printk(KERN_DEBUG "lib80211_crypt: unregistered algorithm '%s'\n",
+ ops->name);
+ list_del(&alg->list);
+ spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+ kfree(alg);
+ return 0;
+}
+EXPORT_SYMBOL(lib80211_unregister_crypto_ops);
+
+struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name)
+{
+ struct lib80211_crypto_alg *alg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lib80211_crypto_lock, flags);
+ list_for_each_entry(alg, &lib80211_crypto_algs, list) {
+ if (strcmp(alg->ops->name, name) == 0)
+ goto found;
+ }
+ spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+ return NULL;
+
+ found:
+ spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+ return alg->ops;
+}
+EXPORT_SYMBOL(lib80211_get_crypto_ops);
+
+static void *lib80211_crypt_null_init(int keyidx)
+{
+ return (void *)1;
+}
+
+static void lib80211_crypt_null_deinit(void *priv)
+{
+}
+
+static struct lib80211_crypto_ops lib80211_crypt_null = {
+ .name = "NULL",
+ .init = lib80211_crypt_null_init,
+ .deinit = lib80211_crypt_null_deinit,
+ .owner = THIS_MODULE,
+};
+
+static int __init lib80211_init(void)
+{
+ pr_info(DRV_DESCRIPTION "\n");
+ return lib80211_register_crypto_ops(&lib80211_crypt_null);
+}
+
+static void __exit lib80211_exit(void)
+{
+ lib80211_unregister_crypto_ops(&lib80211_crypt_null);
+ BUG_ON(!list_empty(&lib80211_crypto_algs));
+}
+
+module_init(lib80211_init);
+module_exit(lib80211_exit);
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
new file mode 100644
index 000000000..dc0e59e53
--- /dev/null
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -0,0 +1,479 @@
+/*
+ * lib80211 crypt: host-based CCMP encryption implementation for lib80211
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+#include <linux/wireless.h>
+
+#include <linux/ieee80211.h>
+
+#include <linux/crypto.h>
+
+#include <net/lib80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: CCMP");
+MODULE_LICENSE("GPL");
+
+#define AES_BLOCK_LEN 16
+#define CCMP_HDR_LEN 8
+#define CCMP_MIC_LEN 8
+#define CCMP_TK_LEN 16
+#define CCMP_PN_LEN 6
+
+struct lib80211_ccmp_data {
+ u8 key[CCMP_TK_LEN];
+ int key_set;
+
+ u8 tx_pn[CCMP_PN_LEN];
+ u8 rx_pn[CCMP_PN_LEN];
+
+ u32 dot11RSNAStatsCCMPFormatErrors;
+ u32 dot11RSNAStatsCCMPReplays;
+ u32 dot11RSNAStatsCCMPDecryptErrors;
+
+ int key_idx;
+
+ struct crypto_cipher *tfm;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
+ tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
+ u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
+};
+
+static inline void lib80211_ccmp_aes_encrypt(struct crypto_cipher *tfm,
+ const u8 pt[16], u8 ct[16])
+{
+ crypto_cipher_encrypt_one(tfm, ct, pt);
+}
+
+static void *lib80211_ccmp_init(int key_idx)
+{
+ struct lib80211_ccmp_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ priv->key_idx = key_idx;
+
+ priv->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->tfm)) {
+ priv->tfm = NULL;
+ goto fail;
+ }
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tfm)
+ crypto_free_cipher(priv->tfm);
+ kfree(priv);
+ }
+
+ return NULL;
+}
+
+static void lib80211_ccmp_deinit(void *priv)
+{
+ struct lib80211_ccmp_data *_priv = priv;
+ if (_priv && _priv->tfm)
+ crypto_free_cipher(_priv->tfm);
+ kfree(priv);
+}
+
+static inline void xor_block(u8 * b, u8 * a, size_t len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ b[i] ^= a[i];
+}
+
+static void ccmp_init_blocks(struct crypto_cipher *tfm,
+ struct ieee80211_hdr *hdr,
+ u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
+{
+ u8 *pos, qc = 0;
+ size_t aad_len;
+ int a4_included, qc_included;
+ u8 aad[2 * AES_BLOCK_LEN];
+
+ a4_included = ieee80211_has_a4(hdr->frame_control);
+ qc_included = ieee80211_is_data_qos(hdr->frame_control);
+
+ aad_len = 22;
+ if (a4_included)
+ aad_len += 6;
+ if (qc_included) {
+ pos = (u8 *) & hdr->addr4;
+ if (a4_included)
+ pos += 6;
+ qc = *pos & 0x0f;
+ aad_len += 2;
+ }
+
+ /* CCM Initial Block:
+ * Flag (Include authentication header, M=3 (8-octet MIC),
+ * L=1 (2-octet Dlen))
+ * Nonce: 0x00 | A2 | PN
+ * Dlen */
+ b0[0] = 0x59;
+ b0[1] = qc;
+ memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
+ memcpy(b0 + 8, pn, CCMP_PN_LEN);
+ b0[14] = (dlen >> 8) & 0xff;
+ b0[15] = dlen & 0xff;
+
+ /* AAD:
+ * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
+ * A1 | A2 | A3
+ * SC with bits 4..15 (seq#) masked to zero
+ * A4 (if present)
+ * QC (if present)
+ */
+ pos = (u8 *) hdr;
+ aad[0] = 0; /* aad_len >> 8 */
+ aad[1] = aad_len & 0xff;
+ aad[2] = pos[0] & 0x8f;
+ aad[3] = pos[1] & 0xc7;
+ memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
+ pos = (u8 *) & hdr->seq_ctrl;
+ aad[22] = pos[0] & 0x0f;
+ aad[23] = 0; /* all bits masked */
+ memset(aad + 24, 0, 8);
+ if (a4_included)
+ memcpy(aad + 24, hdr->addr4, ETH_ALEN);
+ if (qc_included) {
+ aad[a4_included ? 30 : 24] = qc;
+ /* rest of QC masked */
+ }
+
+ /* Start with the first block and AAD */
+ lib80211_ccmp_aes_encrypt(tfm, b0, auth);
+ xor_block(auth, aad, AES_BLOCK_LEN);
+ lib80211_ccmp_aes_encrypt(tfm, auth, auth);
+ xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
+ lib80211_ccmp_aes_encrypt(tfm, auth, auth);
+ b0[0] &= 0x07;
+ b0[14] = b0[15] = 0;
+ lib80211_ccmp_aes_encrypt(tfm, b0, s0);
+}
+
+static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len,
+ u8 *aeskey, int keylen, void *priv)
+{
+ struct lib80211_ccmp_data *key = priv;
+ int i;
+ u8 *pos;
+
+ if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len)
+ return -1;
+
+ if (aeskey != NULL && keylen >= CCMP_TK_LEN)
+ memcpy(aeskey, key->key, CCMP_TK_LEN);
+
+ pos = skb_push(skb, CCMP_HDR_LEN);
+ memmove(pos, pos + CCMP_HDR_LEN, hdr_len);
+ pos += hdr_len;
+
+ i = CCMP_PN_LEN - 1;
+ while (i >= 0) {
+ key->tx_pn[i]++;
+ if (key->tx_pn[i] != 0)
+ break;
+ i--;
+ }
+
+ *pos++ = key->tx_pn[5];
+ *pos++ = key->tx_pn[4];
+ *pos++ = 0;
+ *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+ *pos++ = key->tx_pn[3];
+ *pos++ = key->tx_pn[2];
+ *pos++ = key->tx_pn[1];
+ *pos++ = key->tx_pn[0];
+
+ return CCMP_HDR_LEN;
+}
+
+static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_ccmp_data *key = priv;
+ int data_len, i, blocks, last, len;
+ u8 *pos, *mic;
+ struct ieee80211_hdr *hdr;
+ u8 *b0 = key->tx_b0;
+ u8 *b = key->tx_b;
+ u8 *e = key->tx_e;
+ u8 *s0 = key->tx_s0;
+
+ if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
+ return -1;
+
+ data_len = skb->len - hdr_len;
+ len = lib80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv);
+ if (len < 0)
+ return -1;
+
+ pos = skb->data + hdr_len + CCMP_HDR_LEN;
+ hdr = (struct ieee80211_hdr *)skb->data;
+ ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
+
+ blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Authentication */
+ xor_block(b, pos, len);
+ lib80211_ccmp_aes_encrypt(key->tfm, b, b);
+ /* Encryption, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ lib80211_ccmp_aes_encrypt(key->tfm, b0, e);
+ xor_block(pos, e, len);
+ pos += len;
+ }
+
+ mic = skb_put(skb, CCMP_MIC_LEN);
+ for (i = 0; i < CCMP_MIC_LEN; i++)
+ mic[i] = b[i] ^ s0[i];
+
+ return 0;
+}
+
+/*
+ * deal with seq counter wrapping correctly.
+ * refer to timer_after() for jiffies wrapping handling
+ */
+static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o)
+{
+ u32 iv32_n, iv16_n;
+ u32 iv32_o, iv16_o;
+
+ iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3];
+ iv16_n = (pn_n[4] << 8) | pn_n[5];
+
+ iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3];
+ iv16_o = (pn_o[4] << 8) | pn_o[5];
+
+ if ((s32)iv32_n - (s32)iv32_o < 0 ||
+ (iv32_n == iv32_o && iv16_n <= iv16_o))
+ return 1;
+ return 0;
+}
+
+static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_ccmp_data *key = priv;
+ u8 keyidx, *pos;
+ struct ieee80211_hdr *hdr;
+ u8 *b0 = key->rx_b0;
+ u8 *b = key->rx_b;
+ u8 *a = key->rx_a;
+ u8 pn[6];
+ int i, blocks, last, len;
+ size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
+ u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
+
+ if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -1;
+ }
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ pos = skb->data + hdr_len;
+ keyidx = pos[3];
+ if (!(keyidx & (1 << 5))) {
+ net_dbg_ratelimited("CCMP: received packet without ExtIV flag from %pM\n",
+ hdr->addr2);
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -2;
+ }
+ keyidx >>= 6;
+ if (key->key_idx != keyidx) {
+ printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
+ "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
+ return -6;
+ }
+ if (!key->key_set) {
+ net_dbg_ratelimited("CCMP: received packet from %pM with keyid=%d that does not have a configured key\n",
+ hdr->addr2, keyidx);
+ return -3;
+ }
+
+ pn[0] = pos[7];
+ pn[1] = pos[6];
+ pn[2] = pos[5];
+ pn[3] = pos[4];
+ pn[4] = pos[1];
+ pn[5] = pos[0];
+ pos += 8;
+
+ if (ccmp_replay_check(pn, key->rx_pn)) {
+#ifdef CONFIG_LIB80211_DEBUG
+ net_dbg_ratelimited("CCMP: replay detected: STA=%pM previous PN %02x%02x%02x%02x%02x%02x received PN %02x%02x%02x%02x%02x%02x\n",
+ hdr->addr2,
+ key->rx_pn[0], key->rx_pn[1], key->rx_pn[2],
+ key->rx_pn[3], key->rx_pn[4], key->rx_pn[5],
+ pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
+#endif
+ key->dot11RSNAStatsCCMPReplays++;
+ return -4;
+ }
+
+ ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
+ xor_block(mic, b, CCMP_MIC_LEN);
+
+ blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Decrypt, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ lib80211_ccmp_aes_encrypt(key->tfm, b0, b);
+ xor_block(pos, b, len);
+ /* Authentication */
+ xor_block(a, pos, len);
+ lib80211_ccmp_aes_encrypt(key->tfm, a, a);
+ pos += len;
+ }
+
+ if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
+ net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM\n",
+ hdr->addr2);
+ key->dot11RSNAStatsCCMPDecryptErrors++;
+ return -5;
+ }
+
+ memcpy(key->rx_pn, pn, CCMP_PN_LEN);
+
+ /* Remove hdr and MIC */
+ memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
+ skb_pull(skb, CCMP_HDR_LEN);
+ skb_trim(skb, skb->len - CCMP_MIC_LEN);
+
+ return keyidx;
+}
+
+static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_ccmp_data *data = priv;
+ int keyidx;
+ struct crypto_cipher *tfm = data->tfm;
+
+ keyidx = data->key_idx;
+ memset(data, 0, sizeof(*data));
+ data->key_idx = keyidx;
+ data->tfm = tfm;
+ if (len == CCMP_TK_LEN) {
+ memcpy(data->key, key, CCMP_TK_LEN);
+ data->key_set = 1;
+ if (seq) {
+ data->rx_pn[0] = seq[5];
+ data->rx_pn[1] = seq[4];
+ data->rx_pn[2] = seq[3];
+ data->rx_pn[3] = seq[2];
+ data->rx_pn[4] = seq[1];
+ data->rx_pn[5] = seq[0];
+ }
+ crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
+ } else if (len == 0)
+ data->key_set = 0;
+ else
+ return -1;
+
+ return 0;
+}
+
+static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_ccmp_data *data = priv;
+
+ if (len < CCMP_TK_LEN)
+ return -1;
+
+ if (!data->key_set)
+ return 0;
+ memcpy(key, data->key, CCMP_TK_LEN);
+
+ if (seq) {
+ seq[0] = data->tx_pn[5];
+ seq[1] = data->tx_pn[4];
+ seq[2] = data->tx_pn[3];
+ seq[3] = data->tx_pn[2];
+ seq[4] = data->tx_pn[1];
+ seq[5] = data->tx_pn[0];
+ }
+
+ return CCMP_TK_LEN;
+}
+
+static void lib80211_ccmp_print_stats(struct seq_file *m, void *priv)
+{
+ struct lib80211_ccmp_data *ccmp = priv;
+
+ seq_printf(m,
+ "key[%d] alg=CCMP key_set=%d "
+ "tx_pn=%02x%02x%02x%02x%02x%02x "
+ "rx_pn=%02x%02x%02x%02x%02x%02x "
+ "format_errors=%d replays=%d decrypt_errors=%d\n",
+ ccmp->key_idx, ccmp->key_set,
+ ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2],
+ ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5],
+ ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2],
+ ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5],
+ ccmp->dot11RSNAStatsCCMPFormatErrors,
+ ccmp->dot11RSNAStatsCCMPReplays,
+ ccmp->dot11RSNAStatsCCMPDecryptErrors);
+}
+
+static struct lib80211_crypto_ops lib80211_crypt_ccmp = {
+ .name = "CCMP",
+ .init = lib80211_ccmp_init,
+ .deinit = lib80211_ccmp_deinit,
+ .encrypt_mpdu = lib80211_ccmp_encrypt,
+ .decrypt_mpdu = lib80211_ccmp_decrypt,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = lib80211_ccmp_set_key,
+ .get_key = lib80211_ccmp_get_key,
+ .print_stats = lib80211_ccmp_print_stats,
+ .extra_mpdu_prefix_len = CCMP_HDR_LEN,
+ .extra_mpdu_postfix_len = CCMP_MIC_LEN,
+ .owner = THIS_MODULE,
+};
+
+static int __init lib80211_crypto_ccmp_init(void)
+{
+ return lib80211_register_crypto_ops(&lib80211_crypt_ccmp);
+}
+
+static void __exit lib80211_crypto_ccmp_exit(void)
+{
+ lib80211_unregister_crypto_ops(&lib80211_crypt_ccmp);
+}
+
+module_init(lib80211_crypto_ccmp_init);
+module_exit(lib80211_crypto_ccmp_exit);
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
new file mode 100644
index 000000000..8c90ba79e
--- /dev/null
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -0,0 +1,762 @@
+/*
+ * lib80211 crypt: host-based TKIP encryption implementation for lib80211
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+
+#include <linux/wireless.h>
+#include <linux/ieee80211.h>
+#include <net/iw_handler.h>
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+#include <net/lib80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("lib80211 crypt: TKIP");
+MODULE_LICENSE("GPL");
+
+#define TKIP_HDR_LEN 8
+
+struct lib80211_tkip_data {
+#define TKIP_KEY_LEN 32
+ u8 key[TKIP_KEY_LEN];
+ int key_set;
+
+ u32 tx_iv32;
+ u16 tx_iv16;
+ u16 tx_ttak[5];
+ int tx_phase1_done;
+
+ u32 rx_iv32;
+ u16 rx_iv16;
+ u16 rx_ttak[5];
+ int rx_phase1_done;
+ u32 rx_iv32_new;
+ u16 rx_iv16_new;
+
+ u32 dot11RSNAStatsTKIPReplays;
+ u32 dot11RSNAStatsTKIPICVErrors;
+ u32 dot11RSNAStatsTKIPLocalMICFailures;
+
+ int key_idx;
+
+ struct crypto_blkcipher *rx_tfm_arc4;
+ struct crypto_hash *rx_tfm_michael;
+ struct crypto_blkcipher *tx_tfm_arc4;
+ struct crypto_hash *tx_tfm_michael;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 rx_hdr[16], tx_hdr[16];
+
+ unsigned long flags;
+};
+
+static unsigned long lib80211_tkip_set_flags(unsigned long flags, void *priv)
+{
+ struct lib80211_tkip_data *_priv = priv;
+ unsigned long old_flags = _priv->flags;
+ _priv->flags = flags;
+ return old_flags;
+}
+
+static unsigned long lib80211_tkip_get_flags(void *priv)
+{
+ struct lib80211_tkip_data *_priv = priv;
+ return _priv->flags;
+}
+
+static void *lib80211_tkip_init(int key_idx)
+{
+ struct lib80211_tkip_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+
+ priv->key_idx = key_idx;
+
+ priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->tx_tfm_arc4)) {
+ priv->tx_tfm_arc4 = NULL;
+ goto fail;
+ }
+
+ priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->tx_tfm_michael)) {
+ priv->tx_tfm_michael = NULL;
+ goto fail;
+ }
+
+ priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->rx_tfm_arc4)) {
+ priv->rx_tfm_arc4 = NULL;
+ goto fail;
+ }
+
+ priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->rx_tfm_michael)) {
+ priv->rx_tfm_michael = NULL;
+ goto fail;
+ }
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tx_tfm_michael)
+ crypto_free_hash(priv->tx_tfm_michael);
+ if (priv->tx_tfm_arc4)
+ crypto_free_blkcipher(priv->tx_tfm_arc4);
+ if (priv->rx_tfm_michael)
+ crypto_free_hash(priv->rx_tfm_michael);
+ if (priv->rx_tfm_arc4)
+ crypto_free_blkcipher(priv->rx_tfm_arc4);
+ kfree(priv);
+ }
+
+ return NULL;
+}
+
+static void lib80211_tkip_deinit(void *priv)
+{
+ struct lib80211_tkip_data *_priv = priv;
+ if (_priv) {
+ if (_priv->tx_tfm_michael)
+ crypto_free_hash(_priv->tx_tfm_michael);
+ if (_priv->tx_tfm_arc4)
+ crypto_free_blkcipher(_priv->tx_tfm_arc4);
+ if (_priv->rx_tfm_michael)
+ crypto_free_hash(_priv->rx_tfm_michael);
+ if (_priv->rx_tfm_arc4)
+ crypto_free_blkcipher(_priv->rx_tfm_arc4);
+ }
+ kfree(priv);
+}
+
+static inline u16 RotR1(u16 val)
+{
+ return (val >> 1) | (val << 15);
+}
+
+static inline u8 Lo8(u16 val)
+{
+ return val & 0xff;
+}
+
+static inline u8 Hi8(u16 val)
+{
+ return val >> 8;
+}
+
+static inline u16 Lo16(u32 val)
+{
+ return val & 0xffff;
+}
+
+static inline u16 Hi16(u32 val)
+{
+ return val >> 16;
+}
+
+static inline u16 Mk16(u8 hi, u8 lo)
+{
+ return lo | (((u16) hi) << 8);
+}
+
+static inline u16 Mk16_le(__le16 * v)
+{
+ return le16_to_cpu(*v);
+}
+
+static const u16 Sbox[256] = {
+ 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+ 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+ 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+ 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+ 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+ 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+ 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+ 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+ 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+ 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+ 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+ 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+ 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+ 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+ 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+ 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+ 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+ 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+ 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+ 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+ 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+ 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+ 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+ 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+ 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+ 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+ 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+ 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+ 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+ 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+ 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+ 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+static inline u16 _S_(u16 v)
+{
+ u16 t = Sbox[Hi8(v)];
+ return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8));
+}
+
+#define PHASE1_LOOP_COUNT 8
+
+static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA,
+ u32 IV32)
+{
+ int i, j;
+
+ /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
+ TTAK[0] = Lo16(IV32);
+ TTAK[1] = Hi16(IV32);
+ TTAK[2] = Mk16(TA[1], TA[0]);
+ TTAK[3] = Mk16(TA[3], TA[2]);
+ TTAK[4] = Mk16(TA[5], TA[4]);
+
+ for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+ j = 2 * (i & 1);
+ TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
+ TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
+ TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
+ TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
+ TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
+ }
+}
+
+static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK,
+ u16 IV16)
+{
+ /* Make temporary area overlap WEP seed so that the final copy can be
+ * avoided on little endian hosts. */
+ u16 *PPK = (u16 *) & WEPSeed[4];
+
+ /* Step 1 - make copy of TTAK and bring in TSC */
+ PPK[0] = TTAK[0];
+ PPK[1] = TTAK[1];
+ PPK[2] = TTAK[2];
+ PPK[3] = TTAK[3];
+ PPK[4] = TTAK[4];
+ PPK[5] = TTAK[4] + IV16;
+
+ /* Step 2 - 96-bit bijective mixing using S-box */
+ PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0]));
+ PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2]));
+ PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4]));
+ PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6]));
+ PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8]));
+ PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10]));
+
+ PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12]));
+ PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14]));
+ PPK[2] += RotR1(PPK[1]);
+ PPK[3] += RotR1(PPK[2]);
+ PPK[4] += RotR1(PPK[3]);
+ PPK[5] += RotR1(PPK[4]);
+
+ /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
+ * WEPSeed[0..2] is transmitted as WEP IV */
+ WEPSeed[0] = Hi8(IV16);
+ WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
+ WEPSeed[2] = Lo8(IV16);
+ WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1);
+
+#ifdef __BIG_ENDIAN
+ {
+ int i;
+ for (i = 0; i < 6; i++)
+ PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
+ }
+#endif
+}
+
+static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
+ u8 * rc4key, int keylen, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ u8 *pos;
+ struct ieee80211_hdr *hdr;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+
+ if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len)
+ return -1;
+
+ if (rc4key == NULL || keylen < 16)
+ return -1;
+
+ if (!tkey->tx_phase1_done) {
+ tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
+ tkey->tx_iv32);
+ tkey->tx_phase1_done = 1;
+ }
+ tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16);
+
+ pos = skb_push(skb, TKIP_HDR_LEN);
+ memmove(pos, pos + TKIP_HDR_LEN, hdr_len);
+ pos += hdr_len;
+
+ *pos++ = *rc4key;
+ *pos++ = *(rc4key + 1);
+ *pos++ = *(rc4key + 2);
+ *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+ *pos++ = tkey->tx_iv32 & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 8) & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 16) & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 24) & 0xff;
+
+ tkey->tx_iv16++;
+ if (tkey->tx_iv16 == 0) {
+ tkey->tx_phase1_done = 0;
+ tkey->tx_iv32++;
+ }
+
+ return TKIP_HDR_LEN;
+}
+
+static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 };
+ int len;
+ u8 rc4key[16], *pos, *icv;
+ u32 crc;
+ struct scatterlist sg;
+
+ if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ net_dbg_ratelimited("TKIP countermeasures: dropped TX packet to %pM\n",
+ hdr->addr1);
+ return -1;
+ }
+
+ if (skb_tailroom(skb) < 4 || skb->len < hdr_len)
+ return -1;
+
+ len = skb->len - hdr_len;
+ pos = skb->data + hdr_len;
+
+ if ((lib80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0)
+ return -1;
+
+ crc = ~crc32_le(~0, pos, len);
+ icv = skb_put(skb, 4);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+
+ crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
+ sg_init_one(&sg, pos, len + 4);
+ return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+}
+
+/*
+ * deal with seq counter wrapping correctly.
+ * refer to timer_after() for jiffies wrapping handling
+ */
+static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
+ u32 iv32_o, u16 iv16_o)
+{
+ if ((s32)iv32_n - (s32)iv32_o < 0 ||
+ (iv32_n == iv32_o && iv16_n <= iv16_o))
+ return 1;
+ return 0;
+}
+
+static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 };
+ u8 rc4key[16];
+ u8 keyidx, *pos;
+ u32 iv32;
+ u16 iv16;
+ struct ieee80211_hdr *hdr;
+ u8 icv[4];
+ u32 crc;
+ struct scatterlist sg;
+ int plen;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+
+ if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+ net_dbg_ratelimited("TKIP countermeasures: dropped received packet from %pM\n",
+ hdr->addr2);
+ return -1;
+ }
+
+ if (skb->len < hdr_len + TKIP_HDR_LEN + 4)
+ return -1;
+
+ pos = skb->data + hdr_len;
+ keyidx = pos[3];
+ if (!(keyidx & (1 << 5))) {
+ net_dbg_ratelimited("TKIP: received packet without ExtIV flag from %pM\n",
+ hdr->addr2);
+ return -2;
+ }
+ keyidx >>= 6;
+ if (tkey->key_idx != keyidx) {
+ printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
+ "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv);
+ return -6;
+ }
+ if (!tkey->key_set) {
+ net_dbg_ratelimited("TKIP: received packet from %pM with keyid=%d that does not have a configured key\n",
+ hdr->addr2, keyidx);
+ return -3;
+ }
+ iv16 = (pos[0] << 8) | pos[2];
+ iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24);
+ pos += TKIP_HDR_LEN;
+
+ if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
+#ifdef CONFIG_LIB80211_DEBUG
+ net_dbg_ratelimited("TKIP: replay detected: STA=%pM previous TSC %08x%04x received TSC %08x%04x\n",
+ hdr->addr2, tkey->rx_iv32, tkey->rx_iv16,
+ iv32, iv16);
+#endif
+ tkey->dot11RSNAStatsTKIPReplays++;
+ return -4;
+ }
+
+ if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
+ tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
+ tkey->rx_phase1_done = 1;
+ }
+ tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
+
+ plen = skb->len - hdr_len - 12;
+
+ crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
+ sg_init_one(&sg, pos, plen + 4);
+ if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) {
+ net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n",
+ hdr->addr2);
+ return -7;
+ }
+
+ crc = ~crc32_le(~0, pos, plen);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+ if (memcmp(icv, pos + plen, 4) != 0) {
+ if (iv32 != tkey->rx_iv32) {
+ /* Previously cached Phase1 result was already lost, so
+ * it needs to be recalculated for the next packet. */
+ tkey->rx_phase1_done = 0;
+ }
+#ifdef CONFIG_LIB80211_DEBUG
+ net_dbg_ratelimited("TKIP: ICV error detected: STA=%pM\n",
+ hdr->addr2);
+#endif
+ tkey->dot11RSNAStatsTKIPICVErrors++;
+ return -5;
+ }
+
+ /* Update real counters only after Michael MIC verification has
+ * completed */
+ tkey->rx_iv32_new = iv32;
+ tkey->rx_iv16_new = iv16;
+
+ /* Remove IV and ICV */
+ memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len);
+ skb_pull(skb, TKIP_HDR_LEN);
+ skb_trim(skb, skb->len - 4);
+
+ return keyidx;
+}
+
+static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
+ u8 * data, size_t data_len, u8 * mic)
+{
+ struct hash_desc desc;
+ struct scatterlist sg[2];
+
+ if (tfm_michael == NULL) {
+ pr_warn("%s(): tfm_michael == NULL\n", __func__);
+ return -1;
+ }
+ sg_init_table(sg, 2);
+ sg_set_buf(&sg[0], hdr, 16);
+ sg_set_buf(&sg[1], data, data_len);
+
+ if (crypto_hash_setkey(tfm_michael, key, 8))
+ return -1;
+
+ desc.tfm = tfm_michael;
+ desc.flags = 0;
+ return crypto_hash_digest(&desc, sg, data_len + 16, mic);
+}
+
+static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
+{
+ struct ieee80211_hdr *hdr11;
+
+ hdr11 = (struct ieee80211_hdr *)skb->data;
+
+ switch (le16_to_cpu(hdr11->frame_control) &
+ (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+ case IEEE80211_FCTL_TODS:
+ memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
+ break;
+ case IEEE80211_FCTL_FROMDS:
+ memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */
+ break;
+ case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+ memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
+ break;
+ case 0:
+ memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
+ break;
+ }
+
+ if (ieee80211_is_data_qos(hdr11->frame_control)) {
+ hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11)))
+ & IEEE80211_QOS_CTL_TID_MASK;
+ } else
+ hdr[12] = 0; /* priority */
+
+ hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */
+}
+
+static int lib80211_michael_mic_add(struct sk_buff *skb, int hdr_len,
+ void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ u8 *pos;
+
+ if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
+ printk(KERN_DEBUG "Invalid packet for Michael MIC add "
+ "(tailroom=%d hdr_len=%d skb->len=%d)\n",
+ skb_tailroom(skb), hdr_len, skb->len);
+ return -1;
+ }
+
+ michael_mic_hdr(skb, tkey->tx_hdr);
+ pos = skb_put(skb, 8);
+ if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr,
+ skb->data + hdr_len, skb->len - 8 - hdr_len, pos))
+ return -1;
+
+ return 0;
+}
+
+static void lib80211_michael_mic_failure(struct net_device *dev,
+ struct ieee80211_hdr *hdr,
+ int keyidx)
+{
+ union iwreq_data wrqu;
+ struct iw_michaelmicfailure ev;
+
+ /* TODO: needed parameters: count, keyid, key type, TSC */
+ memset(&ev, 0, sizeof(ev));
+ ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
+ if (hdr->addr1[0] & 0x01)
+ ev.flags |= IW_MICFAILURE_GROUP;
+ else
+ ev.flags |= IW_MICFAILURE_PAIRWISE;
+ ev.src_addr.sa_family = ARPHRD_ETHER;
+ memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = sizeof(ev);
+ wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev);
+}
+
+static int lib80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
+ int hdr_len, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ u8 mic[8];
+
+ if (!tkey->key_set)
+ return -1;
+
+ michael_mic_hdr(skb, tkey->rx_hdr);
+ if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr,
+ skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
+ return -1;
+ if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
+ struct ieee80211_hdr *hdr;
+ hdr = (struct ieee80211_hdr *)skb->data;
+ printk(KERN_DEBUG "%s: Michael MIC verification failed for "
+ "MSDU from %pM keyidx=%d\n",
+ skb->dev ? skb->dev->name : "N/A", hdr->addr2,
+ keyidx);
+ if (skb->dev)
+ lib80211_michael_mic_failure(skb->dev, hdr, keyidx);
+ tkey->dot11RSNAStatsTKIPLocalMICFailures++;
+ return -1;
+ }
+
+ /* Update TSC counters for RX now that the packet verification has
+ * completed. */
+ tkey->rx_iv32 = tkey->rx_iv32_new;
+ tkey->rx_iv16 = tkey->rx_iv16_new;
+
+ skb_trim(skb, skb->len - 8);
+
+ return 0;
+}
+
+static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+ int keyidx;
+ struct crypto_hash *tfm = tkey->tx_tfm_michael;
+ struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4;
+ struct crypto_hash *tfm3 = tkey->rx_tfm_michael;
+ struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4;
+
+ keyidx = tkey->key_idx;
+ memset(tkey, 0, sizeof(*tkey));
+ tkey->key_idx = keyidx;
+ tkey->tx_tfm_michael = tfm;
+ tkey->tx_tfm_arc4 = tfm2;
+ tkey->rx_tfm_michael = tfm3;
+ tkey->rx_tfm_arc4 = tfm4;
+ if (len == TKIP_KEY_LEN) {
+ memcpy(tkey->key, key, TKIP_KEY_LEN);
+ tkey->key_set = 1;
+ tkey->tx_iv16 = 1; /* TSC is initialized to 1 */
+ if (seq) {
+ tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
+ (seq[3] << 8) | seq[2];
+ tkey->rx_iv16 = (seq[1] << 8) | seq[0];
+ }
+ } else if (len == 0)
+ tkey->key_set = 0;
+ else
+ return -1;
+
+ return 0;
+}
+
+static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_tkip_data *tkey = priv;
+
+ if (len < TKIP_KEY_LEN)
+ return -1;
+
+ if (!tkey->key_set)
+ return 0;
+ memcpy(key, tkey->key, TKIP_KEY_LEN);
+
+ if (seq) {
+ /* Return the sequence number of the last transmitted frame. */
+ u16 iv16 = tkey->tx_iv16;
+ u32 iv32 = tkey->tx_iv32;
+ if (iv16 == 0)
+ iv32--;
+ iv16--;
+ seq[0] = tkey->tx_iv16;
+ seq[1] = tkey->tx_iv16 >> 8;
+ seq[2] = tkey->tx_iv32;
+ seq[3] = tkey->tx_iv32 >> 8;
+ seq[4] = tkey->tx_iv32 >> 16;
+ seq[5] = tkey->tx_iv32 >> 24;
+ }
+
+ return TKIP_KEY_LEN;
+}
+
+static void lib80211_tkip_print_stats(struct seq_file *m, void *priv)
+{
+ struct lib80211_tkip_data *tkip = priv;
+ seq_printf(m,
+ "key[%d] alg=TKIP key_set=%d "
+ "tx_pn=%02x%02x%02x%02x%02x%02x "
+ "rx_pn=%02x%02x%02x%02x%02x%02x "
+ "replays=%d icv_errors=%d local_mic_failures=%d\n",
+ tkip->key_idx, tkip->key_set,
+ (tkip->tx_iv32 >> 24) & 0xff,
+ (tkip->tx_iv32 >> 16) & 0xff,
+ (tkip->tx_iv32 >> 8) & 0xff,
+ tkip->tx_iv32 & 0xff,
+ (tkip->tx_iv16 >> 8) & 0xff,
+ tkip->tx_iv16 & 0xff,
+ (tkip->rx_iv32 >> 24) & 0xff,
+ (tkip->rx_iv32 >> 16) & 0xff,
+ (tkip->rx_iv32 >> 8) & 0xff,
+ tkip->rx_iv32 & 0xff,
+ (tkip->rx_iv16 >> 8) & 0xff,
+ tkip->rx_iv16 & 0xff,
+ tkip->dot11RSNAStatsTKIPReplays,
+ tkip->dot11RSNAStatsTKIPICVErrors,
+ tkip->dot11RSNAStatsTKIPLocalMICFailures);
+}
+
+static struct lib80211_crypto_ops lib80211_crypt_tkip = {
+ .name = "TKIP",
+ .init = lib80211_tkip_init,
+ .deinit = lib80211_tkip_deinit,
+ .encrypt_mpdu = lib80211_tkip_encrypt,
+ .decrypt_mpdu = lib80211_tkip_decrypt,
+ .encrypt_msdu = lib80211_michael_mic_add,
+ .decrypt_msdu = lib80211_michael_mic_verify,
+ .set_key = lib80211_tkip_set_key,
+ .get_key = lib80211_tkip_get_key,
+ .print_stats = lib80211_tkip_print_stats,
+ .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */
+ .extra_mpdu_postfix_len = 4, /* ICV */
+ .extra_msdu_postfix_len = 8, /* MIC */
+ .get_flags = lib80211_tkip_get_flags,
+ .set_flags = lib80211_tkip_set_flags,
+ .owner = THIS_MODULE,
+};
+
+static int __init lib80211_crypto_tkip_init(void)
+{
+ return lib80211_register_crypto_ops(&lib80211_crypt_tkip);
+}
+
+static void __exit lib80211_crypto_tkip_exit(void)
+{
+ lib80211_unregister_crypto_ops(&lib80211_crypt_tkip);
+}
+
+module_init(lib80211_crypto_tkip_init);
+module_exit(lib80211_crypto_tkip_exit);
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
new file mode 100644
index 000000000..1c292e4ea
--- /dev/null
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -0,0 +1,289 @@
+/*
+ * lib80211 crypt: host-based WEP encryption implementation for lib80211
+ *
+ * Copyright (c) 2002-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <asm/string.h>
+
+#include <net/lib80211.h>
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("lib80211 crypt: WEP");
+MODULE_LICENSE("GPL");
+
+struct lib80211_wep_data {
+ u32 iv;
+#define WEP_KEY_LEN 13
+ u8 key[WEP_KEY_LEN + 1];
+ u8 key_len;
+ u8 key_idx;
+ struct crypto_blkcipher *tx_tfm;
+ struct crypto_blkcipher *rx_tfm;
+};
+
+static void *lib80211_wep_init(int keyidx)
+{
+ struct lib80211_wep_data *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ priv->key_idx = keyidx;
+
+ priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->tx_tfm)) {
+ priv->tx_tfm = NULL;
+ goto fail;
+ }
+
+ priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(priv->rx_tfm)) {
+ priv->rx_tfm = NULL;
+ goto fail;
+ }
+ /* start WEP IV from a random value */
+ get_random_bytes(&priv->iv, 4);
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tx_tfm)
+ crypto_free_blkcipher(priv->tx_tfm);
+ if (priv->rx_tfm)
+ crypto_free_blkcipher(priv->rx_tfm);
+ kfree(priv);
+ }
+ return NULL;
+}
+
+static void lib80211_wep_deinit(void *priv)
+{
+ struct lib80211_wep_data *_priv = priv;
+ if (_priv) {
+ if (_priv->tx_tfm)
+ crypto_free_blkcipher(_priv->tx_tfm);
+ if (_priv->rx_tfm)
+ crypto_free_blkcipher(_priv->rx_tfm);
+ }
+ kfree(priv);
+}
+
+/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */
+static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
+ u8 *key, int keylen, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+ u32 klen;
+ u8 *pos;
+
+ if (skb_headroom(skb) < 4 || skb->len < hdr_len)
+ return -1;
+
+ pos = skb_push(skb, 4);
+ memmove(pos, pos + 4, hdr_len);
+ pos += hdr_len;
+
+ klen = 3 + wep->key_len;
+
+ wep->iv++;
+
+ /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
+ * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
+ * can be used to speedup attacks, so avoid using them. */
+ if ((wep->iv & 0xff00) == 0xff00) {
+ u8 B = (wep->iv >> 16) & 0xff;
+ if (B >= 3 && B < klen)
+ wep->iv += 0x0100;
+ }
+
+ /* Prepend 24-bit IV to RC4 key and TX frame */
+ *pos++ = (wep->iv >> 16) & 0xff;
+ *pos++ = (wep->iv >> 8) & 0xff;
+ *pos++ = wep->iv & 0xff;
+ *pos++ = wep->key_idx << 6;
+
+ return 0;
+}
+
+/* Perform WEP encryption on given skb that has at least 4 bytes of headroom
+ * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted,
+ * so the payload length increases with 8 bytes.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ */
+static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+ struct blkcipher_desc desc = { .tfm = wep->tx_tfm };
+ u32 crc, klen, len;
+ u8 *pos, *icv;
+ struct scatterlist sg;
+ u8 key[WEP_KEY_LEN + 3];
+
+ /* other checks are in lib80211_wep_build_iv */
+ if (skb_tailroom(skb) < 4)
+ return -1;
+
+ /* add the IV to the frame */
+ if (lib80211_wep_build_iv(skb, hdr_len, NULL, 0, priv))
+ return -1;
+
+ /* Copy the IV into the first 3 bytes of the key */
+ skb_copy_from_linear_data_offset(skb, hdr_len, key, 3);
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(key + 3, wep->key, wep->key_len);
+
+ len = skb->len - hdr_len - 4;
+ pos = skb->data + hdr_len + 4;
+ klen = 3 + wep->key_len;
+
+ /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */
+ crc = ~crc32_le(~0, pos, len);
+ icv = skb_put(skb, 4);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+
+ crypto_blkcipher_setkey(wep->tx_tfm, key, klen);
+ sg_init_one(&sg, pos, len + 4);
+ return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+}
+
+/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
+ * the frame: IV (4 bytes), encrypted payload (including SNAP header),
+ * ICV (4 bytes). len includes both IV and ICV.
+ *
+ * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
+ * failure. If frame is OK, IV and ICV will be removed.
+ */
+static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+ struct blkcipher_desc desc = { .tfm = wep->rx_tfm };
+ u32 crc, klen, plen;
+ u8 key[WEP_KEY_LEN + 3];
+ u8 keyidx, *pos, icv[4];
+ struct scatterlist sg;
+
+ if (skb->len < hdr_len + 8)
+ return -1;
+
+ pos = skb->data + hdr_len;
+ key[0] = *pos++;
+ key[1] = *pos++;
+ key[2] = *pos++;
+ keyidx = *pos++ >> 6;
+ if (keyidx != wep->key_idx)
+ return -1;
+
+ klen = 3 + wep->key_len;
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(key + 3, wep->key, wep->key_len);
+
+ /* Apply RC4 to data and compute CRC32 over decrypted data */
+ plen = skb->len - hdr_len - 8;
+
+ crypto_blkcipher_setkey(wep->rx_tfm, key, klen);
+ sg_init_one(&sg, pos, plen + 4);
+ if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4))
+ return -7;
+
+ crc = ~crc32_le(~0, pos, plen);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+ if (memcmp(icv, pos + plen, 4) != 0) {
+ /* ICV mismatch - drop frame */
+ return -2;
+ }
+
+ /* Remove IV and ICV */
+ memmove(skb->data + 4, skb->data, hdr_len);
+ skb_pull(skb, 4);
+ skb_trim(skb, skb->len - 4);
+
+ return 0;
+}
+
+static int lib80211_wep_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+
+ if (len < 0 || len > WEP_KEY_LEN)
+ return -1;
+
+ memcpy(wep->key, key, len);
+ wep->key_len = len;
+
+ return 0;
+}
+
+static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+
+ if (len < wep->key_len)
+ return -1;
+
+ memcpy(key, wep->key, wep->key_len);
+
+ return wep->key_len;
+}
+
+static void lib80211_wep_print_stats(struct seq_file *m, void *priv)
+{
+ struct lib80211_wep_data *wep = priv;
+ seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len);
+}
+
+static struct lib80211_crypto_ops lib80211_crypt_wep = {
+ .name = "WEP",
+ .init = lib80211_wep_init,
+ .deinit = lib80211_wep_deinit,
+ .encrypt_mpdu = lib80211_wep_encrypt,
+ .decrypt_mpdu = lib80211_wep_decrypt,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = lib80211_wep_set_key,
+ .get_key = lib80211_wep_get_key,
+ .print_stats = lib80211_wep_print_stats,
+ .extra_mpdu_prefix_len = 4, /* IV */
+ .extra_mpdu_postfix_len = 4, /* ICV */
+ .owner = THIS_MODULE,
+};
+
+static int __init lib80211_crypto_wep_init(void)
+{
+ return lib80211_register_crypto_ops(&lib80211_crypt_wep);
+}
+
+static void __exit lib80211_crypto_wep_exit(void)
+{
+ lib80211_unregister_crypto_ops(&lib80211_crypt_wep);
+}
+
+module_init(lib80211_crypto_wep_init);
+module_exit(lib80211_crypto_wep_exit);
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
new file mode 100644
index 000000000..092300b30
--- /dev/null
+++ b/net/wireless/mesh.c
@@ -0,0 +1,279 @@
+#include <linux/ieee80211.h>
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+#include "rdev-ops.h"
+
+/* Default values, timeouts in ms */
+#define MESH_TTL 31
+#define MESH_DEFAULT_ELEMENT_TTL 31
+#define MESH_MAX_RETR 3
+#define MESH_RET_T 100
+#define MESH_CONF_T 100
+#define MESH_HOLD_T 100
+
+#define MESH_PATH_TIMEOUT 5000
+#define MESH_RANN_INTERVAL 5000
+#define MESH_PATH_TO_ROOT_TIMEOUT 6000
+#define MESH_ROOT_INTERVAL 5000
+#define MESH_ROOT_CONFIRMATION_INTERVAL 2000
+#define MESH_DEFAULT_PLINK_TIMEOUT 1800 /* timeout in seconds */
+
+/*
+ * Minimum interval between two consecutive PREQs originated by the same
+ * interface
+ */
+#define MESH_PREQ_MIN_INT 10
+#define MESH_PERR_MIN_INT 100
+#define MESH_DIAM_TRAVERSAL_TIME 50
+
+#define MESH_RSSI_THRESHOLD 0
+
+/*
+ * A path will be refreshed if it is used PATH_REFRESH_TIME milliseconds
+ * before timing out. This way it will remain ACTIVE and no data frames
+ * will be unnecessarily held in the pending queue.
+ */
+#define MESH_PATH_REFRESH_TIME 1000
+#define MESH_MIN_DISCOVERY_TIMEOUT (2 * MESH_DIAM_TRAVERSAL_TIME)
+
+/* Default maximum number of established plinks per interface */
+#define MESH_MAX_ESTAB_PLINKS 32
+
+#define MESH_MAX_PREQ_RETRIES 4
+
+#define MESH_SYNC_NEIGHBOR_OFFSET_MAX 50
+
+#define MESH_DEFAULT_BEACON_INTERVAL 1000 /* in 1024 us units (=TUs) */
+#define MESH_DEFAULT_DTIM_PERIOD 2
+#define MESH_DEFAULT_AWAKE_WINDOW 10 /* in 1024 us units (=TUs) */
+
+const struct mesh_config default_mesh_config = {
+ .dot11MeshRetryTimeout = MESH_RET_T,
+ .dot11MeshConfirmTimeout = MESH_CONF_T,
+ .dot11MeshHoldingTimeout = MESH_HOLD_T,
+ .dot11MeshMaxRetries = MESH_MAX_RETR,
+ .dot11MeshTTL = MESH_TTL,
+ .element_ttl = MESH_DEFAULT_ELEMENT_TTL,
+ .auto_open_plinks = true,
+ .dot11MeshMaxPeerLinks = MESH_MAX_ESTAB_PLINKS,
+ .dot11MeshNbrOffsetMaxNeighbor = MESH_SYNC_NEIGHBOR_OFFSET_MAX,
+ .dot11MeshHWMPactivePathTimeout = MESH_PATH_TIMEOUT,
+ .dot11MeshHWMPpreqMinInterval = MESH_PREQ_MIN_INT,
+ .dot11MeshHWMPperrMinInterval = MESH_PERR_MIN_INT,
+ .dot11MeshHWMPnetDiameterTraversalTime = MESH_DIAM_TRAVERSAL_TIME,
+ .dot11MeshHWMPmaxPREQretries = MESH_MAX_PREQ_RETRIES,
+ .path_refresh_time = MESH_PATH_REFRESH_TIME,
+ .min_discovery_timeout = MESH_MIN_DISCOVERY_TIMEOUT,
+ .dot11MeshHWMPRannInterval = MESH_RANN_INTERVAL,
+ .dot11MeshGateAnnouncementProtocol = false,
+ .dot11MeshForwarding = true,
+ .rssi_threshold = MESH_RSSI_THRESHOLD,
+ .ht_opmode = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED,
+ .dot11MeshHWMPactivePathToRootTimeout = MESH_PATH_TO_ROOT_TIMEOUT,
+ .dot11MeshHWMProotInterval = MESH_ROOT_INTERVAL,
+ .dot11MeshHWMPconfirmationInterval = MESH_ROOT_CONFIRMATION_INTERVAL,
+ .power_mode = NL80211_MESH_POWER_ACTIVE,
+ .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW,
+ .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT,
+};
+
+const struct mesh_setup default_mesh_setup = {
+ /* cfg80211_join_mesh() will pick a channel if needed */
+ .sync_method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET,
+ .path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP,
+ .path_metric = IEEE80211_PATH_METRIC_AIRTIME,
+ .auth_id = 0, /* open */
+ .ie = NULL,
+ .ie_len = 0,
+ .is_secure = false,
+ .user_mpm = false,
+ .beacon_interval = MESH_DEFAULT_BEACON_INTERVAL,
+ .dtim_period = MESH_DEFAULT_DTIM_PERIOD,
+};
+
+int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct mesh_setup *setup,
+ const struct mesh_config *conf)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN);
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) &&
+ setup->is_secure)
+ return -EOPNOTSUPP;
+
+ if (wdev->mesh_id_len)
+ return -EALREADY;
+
+ if (!setup->mesh_id_len)
+ return -EINVAL;
+
+ if (!rdev->ops->join_mesh)
+ return -EOPNOTSUPP;
+
+ if (!setup->chandef.chan) {
+ /* if no channel explicitly given, use preset channel */
+ setup->chandef = wdev->preset_chandef;
+ }
+
+ if (!setup->chandef.chan) {
+ /* if we don't have that either, use the first usable channel */
+ enum ieee80211_band band;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+ int i;
+
+ sband = rdev->wiphy.bands[band];
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ chan = &sband->channels[i];
+ if (chan->flags & (IEEE80211_CHAN_NO_IR |
+ IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_RADAR))
+ continue;
+ setup->chandef.chan = chan;
+ break;
+ }
+
+ if (setup->chandef.chan)
+ break;
+ }
+
+ /* no usable channel ... */
+ if (!setup->chandef.chan)
+ return -EINVAL;
+
+ setup->chandef.width = NL80211_CHAN_WIDTH_20_NOHT;
+ setup->chandef.center_freq1 = setup->chandef.chan->center_freq;
+ }
+
+ /*
+ * check if basic rates are available otherwise use mandatory rates as
+ * basic rates
+ */
+ if (!setup->basic_rates) {
+ enum nl80211_bss_scan_width scan_width;
+ struct ieee80211_supported_band *sband =
+ rdev->wiphy.bands[setup->chandef.chan->band];
+ scan_width = cfg80211_chandef_to_scan_width(&setup->chandef);
+ setup->basic_rates = ieee80211_mandatory_rates(sband,
+ scan_width);
+ }
+
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &setup->chandef,
+ NL80211_IFTYPE_MESH_POINT))
+ return -EINVAL;
+
+ err = rdev_join_mesh(rdev, dev, conf, setup);
+ if (!err) {
+ memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
+ wdev->mesh_id_len = setup->mesh_id_len;
+ wdev->chandef = setup->chandef;
+ }
+
+ return err;
+}
+
+int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct mesh_setup *setup,
+ const struct mesh_config *conf)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_join_mesh(rdev, dev, setup, conf);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_chan_def *chandef)
+{
+ int err;
+
+ /*
+ * Workaround for libertas (only!), it puts the interface
+ * into mesh mode but doesn't implement join_mesh. Instead,
+ * it is configured via sysfs and then joins the mesh when
+ * you set the channel. Note that the libertas mesh isn't
+ * compatible with 802.11 mesh.
+ */
+ if (rdev->ops->libertas_set_mesh_channel) {
+ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT)
+ return -EINVAL;
+
+ if (!netif_running(wdev->netdev))
+ return -ENETDOWN;
+
+ err = rdev_libertas_set_mesh_channel(rdev, wdev->netdev,
+ chandef->chan);
+ if (!err)
+ wdev->chandef = *chandef;
+
+ return err;
+ }
+
+ if (wdev->mesh_id_len)
+ return -EBUSY;
+
+ wdev->preset_chandef = *chandef;
+ return 0;
+}
+
+int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->leave_mesh)
+ return -EOPNOTSUPP;
+
+ if (!wdev->mesh_id_len)
+ return -ENOTCONN;
+
+ err = rdev_leave_mesh(rdev, dev);
+ if (!err) {
+ wdev->mesh_id_len = 0;
+ memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+ rdev_set_qos_map(rdev, dev, NULL);
+ }
+
+ return err;
+}
+
+int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_leave_mesh(rdev, dev);
+ wdev_unlock(wdev);
+
+ return err;
+}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
new file mode 100644
index 000000000..7aae329e2
--- /dev/null
+++ b/net/wireless/mlme.c
@@ -0,0 +1,785 @@
+/*
+ * cfg80211 MLME SAP interface
+ *
+ * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/nl80211.h>
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <net/cfg80211.h>
+#include <net/iw_handler.h>
+#include "core.h"
+#include "nl80211.h"
+#include "rdev-ops.h"
+
+
+void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss,
+ const u8 *buf, size_t len, int uapsd_queues)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+ u8 *ie = mgmt->u.assoc_resp.variable;
+ int ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable);
+ u16 status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
+
+ trace_cfg80211_send_rx_assoc(dev, bss);
+
+ /*
+ * This is a bit of a hack, we don't notify userspace of
+ * a (re-)association reply if we tried to send a reassoc
+ * and got a reject -- we only try again with an assoc
+ * frame instead of reassoc.
+ */
+ if (cfg80211_sme_rx_assoc_resp(wdev, status_code)) {
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wiphy, bss);
+ return;
+ }
+
+ nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues);
+ /* update current_bss etc., consumes the bss reference */
+ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
+ status_code,
+ status_code == WLAN_STATUS_SUCCESS, bss);
+}
+EXPORT_SYMBOL(cfg80211_rx_assoc_resp);
+
+static void cfg80211_process_auth(struct wireless_dev *wdev,
+ const u8 *buf, size_t len)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL);
+ cfg80211_sme_rx_auth(wdev, buf, len);
+}
+
+static void cfg80211_process_deauth(struct wireless_dev *wdev,
+ const u8 *buf, size_t len)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+ const u8 *bssid = mgmt->bssid;
+ u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
+ bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr);
+
+ nl80211_send_deauth(rdev, wdev->netdev, buf, len, GFP_KERNEL);
+
+ if (!wdev->current_bss ||
+ !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))
+ return;
+
+ __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap);
+ cfg80211_sme_deauth(wdev);
+}
+
+static void cfg80211_process_disassoc(struct wireless_dev *wdev,
+ const u8 *buf, size_t len)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+ const u8 *bssid = mgmt->bssid;
+ u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
+ bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr);
+
+ nl80211_send_disassoc(rdev, wdev->netdev, buf, len, GFP_KERNEL);
+
+ if (WARN_ON(!wdev->current_bss ||
+ !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+ return;
+
+ __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap);
+ cfg80211_sme_disassoc(wdev);
+}
+
+void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct ieee80211_mgmt *mgmt = (void *)buf;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ trace_cfg80211_rx_mlme_mgmt(dev, buf, len);
+
+ if (WARN_ON(len < 2))
+ return;
+
+ if (ieee80211_is_auth(mgmt->frame_control))
+ cfg80211_process_auth(wdev, buf, len);
+ else if (ieee80211_is_deauth(mgmt->frame_control))
+ cfg80211_process_deauth(wdev, buf, len);
+ else if (ieee80211_is_disassoc(mgmt->frame_control))
+ cfg80211_process_disassoc(wdev, buf, len);
+}
+EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt);
+
+void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_send_auth_timeout(dev, addr);
+
+ nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL);
+ cfg80211_sme_auth_timeout(wdev);
+}
+EXPORT_SYMBOL(cfg80211_auth_timeout);
+
+void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_send_assoc_timeout(dev, bss->bssid);
+
+ nl80211_send_assoc_timeout(rdev, dev, bss->bssid, GFP_KERNEL);
+ cfg80211_sme_assoc_timeout(wdev);
+
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wiphy, bss);
+}
+EXPORT_SYMBOL(cfg80211_assoc_timeout);
+
+void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct ieee80211_mgmt *mgmt = (void *)buf;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ trace_cfg80211_tx_mlme_mgmt(dev, buf, len);
+
+ if (WARN_ON(len < 2))
+ return;
+
+ if (ieee80211_is_deauth(mgmt->frame_control))
+ cfg80211_process_deauth(wdev, buf, len);
+ else
+ cfg80211_process_disassoc(wdev, buf, len);
+}
+EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt);
+
+void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
+ enum nl80211_key_type key_type, int key_id,
+ const u8 *tsc, gfp_t gfp)
+{
+ struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+ char *buf = kmalloc(128, gfp);
+
+ if (buf) {
+ sprintf(buf, "MLME-MICHAELMICFAILURE.indication("
+ "keyid=%d %scast addr=%pM)", key_id,
+ key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni",
+ addr);
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = strlen(buf);
+ wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+ kfree(buf);
+ }
+#endif
+
+ trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc);
+ nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp);
+}
+EXPORT_SYMBOL(cfg80211_michael_mic_failure);
+
+/* some MLME handling for userspace SME */
+int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ enum nl80211_auth_type auth_type,
+ const u8 *bssid,
+ const u8 *ssid, int ssid_len,
+ const u8 *ie, int ie_len,
+ const u8 *key, int key_len, int key_idx,
+ const u8 *sae_data, int sae_data_len)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_auth_request req = {
+ .ie = ie,
+ .ie_len = ie_len,
+ .sae_data = sae_data,
+ .sae_data_len = sae_data_len,
+ .auth_type = auth_type,
+ .key = key,
+ .key_len = key_len,
+ .key_idx = key_idx,
+ };
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (auth_type == NL80211_AUTHTYPE_SHARED_KEY)
+ if (!key || !key_len || key_idx < 0 || key_idx > 4)
+ return -EINVAL;
+
+ if (wdev->current_bss &&
+ ether_addr_equal(bssid, wdev->current_bss->pub.bssid))
+ return -EALREADY;
+
+ req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
+ IEEE80211_BSS_TYPE_ESS,
+ IEEE80211_PRIVACY_ANY);
+ if (!req.bss)
+ return -ENOENT;
+
+ err = rdev_auth(rdev, dev, &req);
+
+ cfg80211_put_bss(&rdev->wiphy, req.bss);
+ return err;
+}
+
+/* Do a logical ht_capa &= ht_capa_mask. */
+void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
+ const struct ieee80211_ht_cap *ht_capa_mask)
+{
+ int i;
+ u8 *p1, *p2;
+ if (!ht_capa_mask) {
+ memset(ht_capa, 0, sizeof(*ht_capa));
+ return;
+ }
+
+ p1 = (u8*)(ht_capa);
+ p2 = (u8*)(ht_capa_mask);
+ for (i = 0; i<sizeof(*ht_capa); i++)
+ p1[i] &= p2[i];
+}
+
+/* Do a logical ht_capa &= ht_capa_mask. */
+void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
+ const struct ieee80211_vht_cap *vht_capa_mask)
+{
+ int i;
+ u8 *p1, *p2;
+ if (!vht_capa_mask) {
+ memset(vht_capa, 0, sizeof(*vht_capa));
+ return;
+ }
+
+ p1 = (u8*)(vht_capa);
+ p2 = (u8*)(vht_capa_mask);
+ for (i = 0; i < sizeof(*vht_capa); i++)
+ p1[i] &= p2[i];
+}
+
+int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ const u8 *bssid,
+ const u8 *ssid, int ssid_len,
+ struct cfg80211_assoc_request *req)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (wdev->current_bss &&
+ (!req->prev_bssid || !ether_addr_equal(wdev->current_bss->pub.bssid,
+ req->prev_bssid)))
+ return -EALREADY;
+
+ cfg80211_oper_and_ht_capa(&req->ht_capa_mask,
+ rdev->wiphy.ht_capa_mod_mask);
+ cfg80211_oper_and_vht_capa(&req->vht_capa_mask,
+ rdev->wiphy.vht_capa_mod_mask);
+
+ req->bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
+ IEEE80211_BSS_TYPE_ESS,
+ IEEE80211_PRIVACY_ANY);
+ if (!req->bss)
+ return -ENOENT;
+
+ err = rdev_assoc(rdev, dev, req);
+ if (!err)
+ cfg80211_hold_bss(bss_from_pub(req->bss));
+ else
+ cfg80211_put_bss(&rdev->wiphy, req->bss);
+
+ return err;
+}
+
+int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *bssid,
+ const u8 *ie, int ie_len, u16 reason,
+ bool local_state_change)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_deauth_request req = {
+ .bssid = bssid,
+ .reason_code = reason,
+ .ie = ie,
+ .ie_len = ie_len,
+ .local_state_change = local_state_change,
+ };
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (local_state_change &&
+ (!wdev->current_bss ||
+ !ether_addr_equal(wdev->current_bss->pub.bssid, bssid)))
+ return 0;
+
+ return rdev_deauth(rdev, dev, &req);
+}
+
+int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *bssid,
+ const u8 *ie, int ie_len, u16 reason,
+ bool local_state_change)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_disassoc_request req = {
+ .reason_code = reason,
+ .local_state_change = local_state_change,
+ .ie = ie,
+ .ie_len = ie_len,
+ };
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->current_bss)
+ return -ENOTCONN;
+
+ if (ether_addr_equal(wdev->current_bss->pub.bssid, bssid))
+ req.bss = &wdev->current_bss->pub;
+ else
+ return -ENOTCONN;
+
+ err = rdev_disassoc(rdev, dev, &req);
+ if (err)
+ return err;
+
+ /* driver should have reported the disassoc */
+ WARN_ON(wdev->current_bss);
+ return 0;
+}
+
+void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ u8 bssid[ETH_ALEN];
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!rdev->ops->deauth)
+ return;
+
+ if (!wdev->current_bss)
+ return;
+
+ memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
+ cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+}
+
+struct cfg80211_mgmt_registration {
+ struct list_head list;
+
+ u32 nlportid;
+
+ int match_len;
+
+ __le16 frame_type;
+
+ u8 match[];
+};
+
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
+ u16 frame_type, const u8 *match_data,
+ int match_len)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_mgmt_registration *reg, *nreg;
+ int err = 0;
+ u16 mgmt_type;
+
+ if (!wdev->wiphy->mgmt_stypes)
+ return -EOPNOTSUPP;
+
+ if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
+ return -EINVAL;
+
+ if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
+ return -EINVAL;
+
+ mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
+ if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type)))
+ return -EINVAL;
+
+ nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
+ if (!nreg)
+ return -ENOMEM;
+
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+ list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+ int mlen = min(match_len, reg->match_len);
+
+ if (frame_type != le16_to_cpu(reg->frame_type))
+ continue;
+
+ if (memcmp(reg->match, match_data, mlen) == 0) {
+ err = -EALREADY;
+ break;
+ }
+ }
+
+ if (err) {
+ kfree(nreg);
+ goto out;
+ }
+
+ memcpy(nreg->match, match_data, match_len);
+ nreg->match_len = match_len;
+ nreg->nlportid = snd_portid;
+ nreg->frame_type = cpu_to_le16(frame_type);
+ list_add(&nreg->list, &wdev->mgmt_registrations);
+
+ if (rdev->ops->mgmt_frame_register)
+ rdev_mgmt_frame_register(rdev, wdev, frame_type, true);
+
+ out:
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+ return err;
+}
+
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_mgmt_registration *reg, *tmp;
+
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+ list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
+ if (reg->nlportid != nlportid)
+ continue;
+
+ if (rdev->ops->mgmt_frame_register) {
+ u16 frame_type = le16_to_cpu(reg->frame_type);
+
+ rdev_mgmt_frame_register(rdev, wdev,
+ frame_type, false);
+ }
+
+ list_del(&reg->list);
+ kfree(reg);
+ }
+
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+ if (nlportid && rdev->crit_proto_nlportid == nlportid) {
+ rdev->crit_proto_nlportid = 0;
+ rdev_crit_proto_stop(rdev, wdev);
+ }
+
+ if (nlportid == wdev->ap_unexpected_nlportid)
+ wdev->ap_unexpected_nlportid = 0;
+}
+
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
+{
+ struct cfg80211_mgmt_registration *reg, *tmp;
+
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+ list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
+ list_del(&reg->list);
+ kfree(reg);
+ }
+
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+}
+
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params, u64 *cookie)
+{
+ const struct ieee80211_mgmt *mgmt;
+ u16 stype;
+
+ if (!wdev->wiphy->mgmt_stypes)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
+
+ if (params->len < 24 + 1)
+ return -EINVAL;
+
+ mgmt = (const struct ieee80211_mgmt *)params->buf;
+
+ if (!ieee80211_is_mgmt(mgmt->frame_control))
+ return -EINVAL;
+
+ stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+ if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
+ return -EINVAL;
+
+ if (ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
+ int err = 0;
+
+ wdev_lock(wdev);
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (!wdev->current_bss) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (!ether_addr_equal(wdev->current_bss->pub.bssid,
+ mgmt->bssid)) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ /*
+ * check for IBSS DA must be done by driver as
+ * cfg80211 doesn't track the stations
+ */
+ if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+ break;
+
+ /* for station, check that DA is the AP */
+ if (!ether_addr_equal(wdev->current_bss->pub.bssid,
+ mgmt->da)) {
+ err = -ENOTCONN;
+ break;
+ }
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_AP_VLAN:
+ if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)))
+ err = -EINVAL;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) {
+ err = -EINVAL;
+ break;
+ }
+ /*
+ * check for mesh DA must be done by driver as
+ * cfg80211 doesn't track the stations
+ */
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /*
+ * fall through, P2P device only supports
+ * public action frames
+ */
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
+ }
+
+ if (!ether_addr_equal(mgmt->sa, wdev_address(wdev)))
+ return -EINVAL;
+
+ /* Transmit the Action frame as requested by user space */
+ return rdev_mgmt_tx(rdev, wdev, params, cookie);
+}
+
+bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
+ const u8 *buf, size_t len, u32 flags)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_mgmt_registration *reg;
+ const struct ieee80211_txrx_stypes *stypes =
+ &wiphy->mgmt_stypes[wdev->iftype];
+ struct ieee80211_mgmt *mgmt = (void *)buf;
+ const u8 *data;
+ int data_len;
+ bool result = false;
+ __le16 ftype = mgmt->frame_control &
+ cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
+ u16 stype;
+
+ trace_cfg80211_rx_mgmt(wdev, freq, sig_mbm);
+ stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
+
+ if (!(stypes->rx & BIT(stype))) {
+ trace_cfg80211_return_bool(false);
+ return false;
+ }
+
+ data = buf + ieee80211_hdrlen(mgmt->frame_control);
+ data_len = len - ieee80211_hdrlen(mgmt->frame_control);
+
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+ list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+ if (reg->frame_type != ftype)
+ continue;
+
+ if (reg->match_len > data_len)
+ continue;
+
+ if (memcmp(reg->match, data, reg->match_len))
+ continue;
+
+ /* found match! */
+
+ /* Indicate the received Action frame to user space */
+ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid,
+ freq, sig_mbm,
+ buf, len, flags, GFP_ATOMIC))
+ continue;
+
+ result = true;
+ break;
+ }
+
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
+ trace_cfg80211_return_bool(result);
+ return result;
+}
+EXPORT_SYMBOL(cfg80211_rx_mgmt);
+
+void cfg80211_dfs_channels_update_work(struct work_struct *work)
+{
+ struct delayed_work *delayed_work;
+ struct cfg80211_registered_device *rdev;
+ struct cfg80211_chan_def chandef;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *c;
+ struct wiphy *wiphy;
+ bool check_again = false;
+ unsigned long timeout, next_time = 0;
+ int bandid, i;
+
+ delayed_work = container_of(work, struct delayed_work, work);
+ rdev = container_of(delayed_work, struct cfg80211_registered_device,
+ dfs_update_channels_wk);
+ wiphy = &rdev->wiphy;
+
+ rtnl_lock();
+ for (bandid = 0; bandid < IEEE80211_NUM_BANDS; bandid++) {
+ sband = wiphy->bands[bandid];
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ c = &sband->channels[i];
+
+ if (c->dfs_state != NL80211_DFS_UNAVAILABLE)
+ continue;
+
+ timeout = c->dfs_state_entered + msecs_to_jiffies(
+ IEEE80211_DFS_MIN_NOP_TIME_MS);
+
+ if (time_after_eq(jiffies, timeout)) {
+ c->dfs_state = NL80211_DFS_USABLE;
+ c->dfs_state_entered = jiffies;
+
+ cfg80211_chandef_create(&chandef, c,
+ NL80211_CHAN_NO_HT);
+
+ nl80211_radar_notify(rdev, &chandef,
+ NL80211_RADAR_NOP_FINISHED,
+ NULL, GFP_ATOMIC);
+ continue;
+ }
+
+ if (!check_again)
+ next_time = timeout - jiffies;
+ else
+ next_time = min(next_time, timeout - jiffies);
+ check_again = true;
+ }
+ }
+ rtnl_unlock();
+
+ /* reschedule if there are other channels waiting to be cleared again */
+ if (check_again)
+ queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk,
+ next_time);
+}
+
+
+void cfg80211_radar_event(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef,
+ gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ unsigned long timeout;
+
+ trace_cfg80211_radar_event(wiphy, chandef);
+
+ /* only set the chandef supplied channel to unavailable, in
+ * case the radar is detected on only one of multiple channels
+ * spanned by the chandef.
+ */
+ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE);
+
+ timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_NOP_TIME_MS);
+ queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk,
+ timeout);
+
+ nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp);
+}
+EXPORT_SYMBOL(cfg80211_radar_event);
+
+void cfg80211_cac_event(struct net_device *netdev,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_radar_event event, gfp_t gfp)
+{
+ struct wireless_dev *wdev = netdev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ unsigned long timeout;
+
+ trace_cfg80211_cac_event(netdev, event);
+
+ if (WARN_ON(!wdev->cac_started))
+ return;
+
+ if (WARN_ON(!wdev->chandef.chan))
+ return;
+
+ switch (event) {
+ case NL80211_RADAR_CAC_FINISHED:
+ timeout = wdev->cac_start_time +
+ msecs_to_jiffies(wdev->cac_time_ms);
+ WARN_ON(!time_after_eq(jiffies, timeout));
+ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE);
+ break;
+ case NL80211_RADAR_CAC_ABORTED:
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+ wdev->cac_started = false;
+
+ nl80211_radar_notify(rdev, chandef, event, netdev, gfp);
+}
+EXPORT_SYMBOL(cfg80211_cac_event);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
new file mode 100644
index 000000000..dd78445c7
--- /dev/null
+++ b/net/wireless/nl80211.c
@@ -0,0 +1,12984 @@
+/*
+ * This is the new netlink-based wireless configuration interface.
+ *
+ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ */
+
+#include <linux/if.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/if_ether.h>
+#include <linux/ieee80211.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <linux/etherdevice.h>
+#include <net/net_namespace.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include <net/sock.h>
+#include <net/inet_connection_sock.h>
+#include "core.h"
+#include "nl80211.h"
+#include "reg.h"
+#include "rdev-ops.h"
+
+static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
+ struct cfg80211_crypto_settings *settings,
+ int cipher_limit);
+
+static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+
+/* the netlink family */
+static struct genl_family nl80211_fam = {
+ .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
+ .name = NL80211_GENL_NAME, /* have users key off the name instead */
+ .hdrsize = 0, /* no private header */
+ .version = 1, /* no particular meaning now */
+ .maxattr = NL80211_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = nl80211_pre_doit,
+ .post_doit = nl80211_post_doit,
+};
+
+/* multicast groups */
+enum nl80211_multicast_groups {
+ NL80211_MCGRP_CONFIG,
+ NL80211_MCGRP_SCAN,
+ NL80211_MCGRP_REGULATORY,
+ NL80211_MCGRP_MLME,
+ NL80211_MCGRP_VENDOR,
+ NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
+};
+
+static const struct genl_multicast_group nl80211_mcgrps[] = {
+ [NL80211_MCGRP_CONFIG] = { .name = NL80211_MULTICAST_GROUP_CONFIG },
+ [NL80211_MCGRP_SCAN] = { .name = NL80211_MULTICAST_GROUP_SCAN },
+ [NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG },
+ [NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME },
+ [NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR },
+#ifdef CONFIG_NL80211_TESTMODE
+ [NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE }
+#endif
+};
+
+/* returns ERR_PTR values */
+static struct wireless_dev *
+__cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *result = NULL;
+ bool have_ifidx = attrs[NL80211_ATTR_IFINDEX];
+ bool have_wdev_id = attrs[NL80211_ATTR_WDEV];
+ u64 wdev_id;
+ int wiphy_idx = -1;
+ int ifidx = -1;
+
+ ASSERT_RTNL();
+
+ if (!have_ifidx && !have_wdev_id)
+ return ERR_PTR(-EINVAL);
+
+ if (have_ifidx)
+ ifidx = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]);
+ if (have_wdev_id) {
+ wdev_id = nla_get_u64(attrs[NL80211_ATTR_WDEV]);
+ wiphy_idx = wdev_id >> 32;
+ }
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ struct wireless_dev *wdev;
+
+ if (wiphy_net(&rdev->wiphy) != netns)
+ continue;
+
+ if (have_wdev_id && rdev->wiphy_idx != wiphy_idx)
+ continue;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (have_ifidx && wdev->netdev &&
+ wdev->netdev->ifindex == ifidx) {
+ result = wdev;
+ break;
+ }
+ if (have_wdev_id && wdev->identifier == (u32)wdev_id) {
+ result = wdev;
+ break;
+ }
+ }
+
+ if (result)
+ break;
+ }
+
+ if (result)
+ return result;
+ return ERR_PTR(-ENODEV);
+}
+
+static struct cfg80211_registered_device *
+__cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
+{
+ struct cfg80211_registered_device *rdev = NULL, *tmp;
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ if (!attrs[NL80211_ATTR_WIPHY] &&
+ !attrs[NL80211_ATTR_IFINDEX] &&
+ !attrs[NL80211_ATTR_WDEV])
+ return ERR_PTR(-EINVAL);
+
+ if (attrs[NL80211_ATTR_WIPHY])
+ rdev = cfg80211_rdev_by_wiphy_idx(
+ nla_get_u32(attrs[NL80211_ATTR_WIPHY]));
+
+ if (attrs[NL80211_ATTR_WDEV]) {
+ u64 wdev_id = nla_get_u64(attrs[NL80211_ATTR_WDEV]);
+ struct wireless_dev *wdev;
+ bool found = false;
+
+ tmp = cfg80211_rdev_by_wiphy_idx(wdev_id >> 32);
+ if (tmp) {
+ /* make sure wdev exists */
+ list_for_each_entry(wdev, &tmp->wdev_list, list) {
+ if (wdev->identifier != (u32)wdev_id)
+ continue;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ tmp = NULL;
+
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+ rdev = tmp;
+ }
+ }
+
+ if (attrs[NL80211_ATTR_IFINDEX]) {
+ int ifindex = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]);
+ netdev = __dev_get_by_index(netns, ifindex);
+ if (netdev) {
+ if (netdev->ieee80211_ptr)
+ tmp = wiphy_to_rdev(
+ netdev->ieee80211_ptr->wiphy);
+ else
+ tmp = NULL;
+
+ /* not wireless device -- return error */
+ if (!tmp)
+ return ERR_PTR(-EINVAL);
+
+ /* mismatch -- return error */
+ if (rdev && tmp != rdev)
+ return ERR_PTR(-EINVAL);
+
+ rdev = tmp;
+ }
+ }
+
+ if (!rdev)
+ return ERR_PTR(-ENODEV);
+
+ if (netns != wiphy_net(&rdev->wiphy))
+ return ERR_PTR(-ENODEV);
+
+ return rdev;
+}
+
+/*
+ * This function returns a pointer to the driver
+ * that the genl_info item that is passed refers to.
+ *
+ * The result of this can be a PTR_ERR and hence must
+ * be checked with IS_ERR() for errors.
+ */
+static struct cfg80211_registered_device *
+cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
+{
+ return __cfg80211_rdev_from_attrs(netns, info->attrs);
+}
+
+/* policy for the attributes */
+static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
+ [NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
+ .len = 20-1 },
+ [NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },
+
+ [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 },
+ [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 },
+ [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 },
+
+ [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 },
+ [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 },
+ [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 },
+ [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG },
+
+ [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
+ [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
+
+ [NL80211_ATTR_MAC] = { .len = ETH_ALEN },
+ [NL80211_ATTR_PREV_BSSID] = { .len = ETH_ALEN },
+
+ [NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
+ [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
+ .len = WLAN_MAX_KEY_LEN },
+ [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 },
+ [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
+ [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
+ [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
+
+ [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
+ [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
+ [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_STA_AID] = { .type = NLA_U16 },
+ [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED },
+ [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_RATES },
+ [NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 },
+ [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 },
+ [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
+ [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_MESH_ID_LEN },
+ [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 },
+
+ [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 },
+ [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },
+
+ [NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
+ [NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
+ [NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
+ [NL80211_ATTR_BSS_BASIC_RATES] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_RATES },
+ [NL80211_ATTR_BSS_HT_OPMODE] = { .type = NLA_U16 },
+
+ [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED },
+ [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG },
+
+ [NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN },
+
+ [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 },
+ [NL80211_ATTR_IE] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED },
+ [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED },
+
+ [NL80211_ATTR_SSID] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_SSID_LEN },
+ [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 },
+ [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG },
+ [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 },
+ [NL80211_ATTR_STA_FLAGS2] = {
+ .len = sizeof(struct nl80211_sta_flag_update),
+ },
+ [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
+ [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
+ [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
+ [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
+ [NL80211_ATTR_PID] = { .type = NLA_U32 },
+ [NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
+ [NL80211_ATTR_PMKID] = { .type = NLA_BINARY,
+ .len = WLAN_PMKID_LEN },
+ [NL80211_ATTR_DURATION] = { .type = NLA_U32 },
+ [NL80211_ATTR_COOKIE] = { .type = NLA_U64 },
+ [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED },
+ [NL80211_ATTR_FRAME] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, },
+ [NL80211_ATTR_PS_STATE] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
+ [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
+ [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
+ [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
+ [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
+ [NL80211_ATTR_WIPHY_ANTENNA_TX] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_ANTENNA_RX] = { .type = NLA_U32 },
+ [NL80211_ATTR_MCAST_RATE] = { .type = NLA_U32 },
+ [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG },
+ [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
+ [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
+ [NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 },
+ [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
+ [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
+ [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED },
+ [NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 },
+ [NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED },
+ [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG },
+ [NL80211_ATTR_TDLS_ACTION] = { .type = NLA_U8 },
+ [NL80211_ATTR_TDLS_DIALOG_TOKEN] = { .type = NLA_U8 },
+ [NL80211_ATTR_TDLS_OPERATION] = { .type = NLA_U8 },
+ [NL80211_ATTR_TDLS_SUPPORT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_TDLS_EXTERNAL_SETUP] = { .type = NLA_FLAG },
+ [NL80211_ATTR_TDLS_INITIATOR] = { .type = NLA_FLAG },
+ [NL80211_ATTR_DONT_WAIT_FOR_ACK] = { .type = NLA_FLAG },
+ [NL80211_ATTR_PROBE_RESP] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_DFS_REGION] = { .type = NLA_U8 },
+ [NL80211_ATTR_DISABLE_HT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_HT_CAPABILITY_MASK] = {
+ .len = NL80211_HT_CAPABILITY_LEN
+ },
+ [NL80211_ATTR_NOACK_MAP] = { .type = NLA_U16 },
+ [NL80211_ATTR_INACTIVITY_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
+ [NL80211_ATTR_WDEV] = { .type = NLA_U64 },
+ [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_SAE_DATA] = { .type = NLA_BINARY, },
+ [NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
+ [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
+ [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
+ [NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 },
+ [NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 },
+ [NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
+ [NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
+ [NL80211_ATTR_STA_EXT_CAPABILITY] = { .type = NLA_BINARY, },
+ [NL80211_ATTR_SPLIT_WIPHY_DUMP] = { .type = NLA_FLAG, },
+ [NL80211_ATTR_DISABLE_VHT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_VHT_CAPABILITY_MASK] = {
+ .len = NL80211_VHT_CAPABILITY_LEN,
+ },
+ [NL80211_ATTR_MDID] = { .type = NLA_U16 },
+ [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 },
+ [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
+ [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
+ [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
+ [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY },
+ [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY },
+ [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = { .type = NLA_BINARY },
+ [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] = { .type = NLA_BINARY },
+ [NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG },
+ [NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 },
+ [NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 },
+ [NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
+ [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
+ [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY,
+ .len = IEEE80211_QOS_MAP_LEN_MAX },
+ [NL80211_ATTR_MAC_HINT] = { .len = ETH_ALEN },
+ [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 },
+ [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 },
+ [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG },
+ [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY },
+ [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG },
+ [NL80211_ATTR_TSID] = { .type = NLA_U8 },
+ [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 },
+ [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 },
+ [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 },
+ [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN },
+ [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG },
+ [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
+ [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
+ [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+};
+
+/* policy for the key attributes */
+static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
+ [NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN },
+ [NL80211_KEY_IDX] = { .type = NLA_U8 },
+ [NL80211_KEY_CIPHER] = { .type = NLA_U32 },
+ [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
+ [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
+ [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
+ [NL80211_KEY_TYPE] = { .type = NLA_U32 },
+ [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
+};
+
+/* policy for the key default flags */
+static const struct nla_policy
+nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
+ [NL80211_KEY_DEFAULT_TYPE_UNICAST] = { .type = NLA_FLAG },
+ [NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
+};
+
+/* policy for WoWLAN attributes */
+static const struct nla_policy
+nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
+ [NL80211_WOWLAN_TRIG_ANY] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_DISCONNECT] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_MAGIC_PKT] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_PKT_PATTERN] = { .type = NLA_NESTED },
+ [NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_RFKILL_RELEASE] = { .type = NLA_FLAG },
+ [NL80211_WOWLAN_TRIG_TCP_CONNECTION] = { .type = NLA_NESTED },
+ [NL80211_WOWLAN_TRIG_NET_DETECT] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
+ [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 },
+ [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 },
+ [NL80211_WOWLAN_TCP_DST_MAC] = { .len = ETH_ALEN },
+ [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 },
+ [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 },
+ [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .len = 1 },
+ [NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = {
+ .len = sizeof(struct nl80211_wowlan_tcp_data_seq)
+ },
+ [NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN] = {
+ .len = sizeof(struct nl80211_wowlan_tcp_data_token)
+ },
+ [NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 },
+ [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = { .len = 1 },
+ [NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
+};
+
+/* policy for coalesce rule attributes */
+static const struct nla_policy
+nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
+ [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
+ [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 },
+ [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
+};
+
+/* policy for GTK rekey offload attributes */
+static const struct nla_policy
+nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
+ [NL80211_REKEY_DATA_KEK] = { .len = NL80211_KEK_LEN },
+ [NL80211_REKEY_DATA_KCK] = { .len = NL80211_KCK_LEN },
+ [NL80211_REKEY_DATA_REPLAY_CTR] = { .len = NL80211_REPLAY_CTR_LEN },
+};
+
+static const struct nla_policy
+nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
+ [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_SSID_LEN },
+ [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 },
+};
+
+static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct cfg80211_registered_device **rdev,
+ struct wireless_dev **wdev)
+{
+ int err;
+
+ rtnl_lock();
+
+ if (!cb->args[0]) {
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+ nl80211_fam.attrbuf, nl80211_fam.maxattr,
+ nl80211_policy);
+ if (err)
+ goto out_unlock;
+
+ *wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk),
+ nl80211_fam.attrbuf);
+ if (IS_ERR(*wdev)) {
+ err = PTR_ERR(*wdev);
+ goto out_unlock;
+ }
+ *rdev = wiphy_to_rdev((*wdev)->wiphy);
+ /* 0 is the first index - add 1 to parse only once */
+ cb->args[0] = (*rdev)->wiphy_idx + 1;
+ cb->args[1] = (*wdev)->identifier;
+ } else {
+ /* subtract the 1 again here */
+ struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
+ struct wireless_dev *tmp;
+
+ if (!wiphy) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ *rdev = wiphy_to_rdev(wiphy);
+ *wdev = NULL;
+
+ list_for_each_entry(tmp, &(*rdev)->wdev_list, list) {
+ if (tmp->identifier == cb->args[1]) {
+ *wdev = tmp;
+ break;
+ }
+ }
+
+ if (!*wdev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ return 0;
+ out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void nl80211_finish_wdev_dump(struct cfg80211_registered_device *rdev)
+{
+ rtnl_unlock();
+}
+
+/* IE validation */
+static bool is_valid_ie_attr(const struct nlattr *attr)
+{
+ const u8 *pos;
+ int len;
+
+ if (!attr)
+ return true;
+
+ pos = nla_data(attr);
+ len = nla_len(attr);
+
+ while (len) {
+ u8 elemlen;
+
+ if (len < 2)
+ return false;
+ len -= 2;
+
+ elemlen = pos[1];
+ if (elemlen > len)
+ return false;
+
+ len -= elemlen;
+ pos += 2 + elemlen;
+ }
+
+ return true;
+}
+
+/* message building helper */
+static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
+ int flags, u8 cmd)
+{
+ /* since there is no private header just add the generic one */
+ return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
+}
+
+static int nl80211_msg_put_channel(struct sk_buff *msg,
+ struct ieee80211_channel *chan,
+ bool large)
+{
+ /* Some channels must be completely excluded from the
+ * list to protect old user-space tools from breaking
+ */
+ if (!large && chan->flags &
+ (IEEE80211_CHAN_NO_10MHZ | IEEE80211_CHAN_NO_20MHZ))
+ return 0;
+
+ if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_FREQ,
+ chan->center_freq))
+ goto nla_put_failure;
+
+ if ((chan->flags & IEEE80211_CHAN_DISABLED) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED))
+ goto nla_put_failure;
+ if (chan->flags & IEEE80211_CHAN_NO_IR) {
+ if (nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_IR))
+ goto nla_put_failure;
+ if (nla_put_flag(msg, __NL80211_FREQUENCY_ATTR_NO_IBSS))
+ goto nla_put_failure;
+ }
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ if (nla_put_flag(msg, NL80211_FREQUENCY_ATTR_RADAR))
+ goto nla_put_failure;
+ if (large) {
+ u32 time;
+
+ time = elapsed_jiffies_msecs(chan->dfs_state_entered);
+
+ if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_STATE,
+ chan->dfs_state))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_TIME,
+ time))
+ goto nla_put_failure;
+ if (nla_put_u32(msg,
+ NL80211_FREQUENCY_ATTR_DFS_CAC_TIME,
+ chan->dfs_cac_ms))
+ goto nla_put_failure;
+ }
+ }
+
+ if (large) {
+ if ((chan->flags & IEEE80211_CHAN_NO_HT40MINUS) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_MINUS))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_HT40PLUS) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_PLUS))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_80MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_80MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_160MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_160MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_GO_CONCURRENT) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_GO_CONCURRENT))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_20MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_10MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_10MHZ))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
+ DBM_TO_MBM(chan->max_power)))
+ goto nla_put_failure;
+
+ return 0;
+
+ nla_put_failure:
+ return -ENOBUFS;
+}
+
+/* netlink command implementations */
+
+struct key_parse {
+ struct key_params p;
+ int idx;
+ int type;
+ bool def, defmgmt;
+ bool def_uni, def_multi;
+};
+
+static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
+{
+ struct nlattr *tb[NL80211_KEY_MAX + 1];
+ int err = nla_parse_nested(tb, NL80211_KEY_MAX, key,
+ nl80211_key_policy);
+ if (err)
+ return err;
+
+ k->def = !!tb[NL80211_KEY_DEFAULT];
+ k->defmgmt = !!tb[NL80211_KEY_DEFAULT_MGMT];
+
+ if (k->def) {
+ k->def_uni = true;
+ k->def_multi = true;
+ }
+ if (k->defmgmt)
+ k->def_multi = true;
+
+ if (tb[NL80211_KEY_IDX])
+ k->idx = nla_get_u8(tb[NL80211_KEY_IDX]);
+
+ if (tb[NL80211_KEY_DATA]) {
+ k->p.key = nla_data(tb[NL80211_KEY_DATA]);
+ k->p.key_len = nla_len(tb[NL80211_KEY_DATA]);
+ }
+
+ if (tb[NL80211_KEY_SEQ]) {
+ k->p.seq = nla_data(tb[NL80211_KEY_SEQ]);
+ k->p.seq_len = nla_len(tb[NL80211_KEY_SEQ]);
+ }
+
+ if (tb[NL80211_KEY_CIPHER])
+ k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
+
+ if (tb[NL80211_KEY_TYPE]) {
+ k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
+ if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ }
+
+ if (tb[NL80211_KEY_DEFAULT_TYPES]) {
+ struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
+ err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
+ tb[NL80211_KEY_DEFAULT_TYPES],
+ nl80211_key_default_policy);
+ if (err)
+ return err;
+
+ k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
+ k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
+ }
+
+ return 0;
+}
+
+static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
+{
+ if (info->attrs[NL80211_ATTR_KEY_DATA]) {
+ k->p.key = nla_data(info->attrs[NL80211_ATTR_KEY_DATA]);
+ k->p.key_len = nla_len(info->attrs[NL80211_ATTR_KEY_DATA]);
+ }
+
+ if (info->attrs[NL80211_ATTR_KEY_SEQ]) {
+ k->p.seq = nla_data(info->attrs[NL80211_ATTR_KEY_SEQ]);
+ k->p.seq_len = nla_len(info->attrs[NL80211_ATTR_KEY_SEQ]);
+ }
+
+ if (info->attrs[NL80211_ATTR_KEY_IDX])
+ k->idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
+
+ if (info->attrs[NL80211_ATTR_KEY_CIPHER])
+ k->p.cipher = nla_get_u32(info->attrs[NL80211_ATTR_KEY_CIPHER]);
+
+ k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT];
+ k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT];
+
+ if (k->def) {
+ k->def_uni = true;
+ k->def_multi = true;
+ }
+ if (k->defmgmt)
+ k->def_multi = true;
+
+ if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+ k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+ if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) {
+ struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
+ int err = nla_parse_nested(
+ kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
+ info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES],
+ nl80211_key_default_policy);
+ if (err)
+ return err;
+
+ k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
+ k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
+ }
+
+ return 0;
+}
+
+static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
+{
+ int err;
+
+ memset(k, 0, sizeof(*k));
+ k->idx = -1;
+ k->type = -1;
+
+ if (info->attrs[NL80211_ATTR_KEY])
+ err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k);
+ else
+ err = nl80211_parse_key_old(info, k);
+
+ if (err)
+ return err;
+
+ if (k->def && k->defmgmt)
+ return -EINVAL;
+
+ if (k->defmgmt) {
+ if (k->def_uni || !k->def_multi)
+ return -EINVAL;
+ }
+
+ if (k->idx != -1) {
+ if (k->defmgmt) {
+ if (k->idx < 4 || k->idx > 5)
+ return -EINVAL;
+ } else if (k->def) {
+ if (k->idx < 0 || k->idx > 3)
+ return -EINVAL;
+ } else {
+ if (k->idx < 0 || k->idx > 5)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct cfg80211_cached_keys *
+nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
+ struct nlattr *keys, bool *no_ht)
+{
+ struct key_parse parse;
+ struct nlattr *key;
+ struct cfg80211_cached_keys *result;
+ int rem, err, def = 0;
+
+ result = kzalloc(sizeof(*result), GFP_KERNEL);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+
+ result->def = -1;
+ result->defmgmt = -1;
+
+ nla_for_each_nested(key, keys, rem) {
+ memset(&parse, 0, sizeof(parse));
+ parse.idx = -1;
+
+ err = nl80211_parse_key_new(key, &parse);
+ if (err)
+ goto error;
+ err = -EINVAL;
+ if (!parse.p.key)
+ goto error;
+ if (parse.idx < 0 || parse.idx > 4)
+ goto error;
+ if (parse.def) {
+ if (def)
+ goto error;
+ def = 1;
+ result->def = parse.idx;
+ if (!parse.def_uni || !parse.def_multi)
+ goto error;
+ } else if (parse.defmgmt)
+ goto error;
+ err = cfg80211_validate_key_settings(rdev, &parse.p,
+ parse.idx, false, NULL);
+ if (err)
+ goto error;
+ result->params[parse.idx].cipher = parse.p.cipher;
+ result->params[parse.idx].key_len = parse.p.key_len;
+ result->params[parse.idx].key = result->data[parse.idx];
+ memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);
+
+ if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 ||
+ parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) {
+ if (no_ht)
+ *no_ht = true;
+ }
+ }
+
+ return result;
+ error:
+ kfree(result);
+ return ERR_PTR(err);
+}
+
+static int nl80211_key_allowed(struct wireless_dev *wdev)
+{
+ ASSERT_WDEV_LOCK(wdev);
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_MESH_POINT:
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (!wdev->current_bss)
+ return -ENOLINK;
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ case NL80211_IFTYPE_WDS:
+ case NUM_NL80211_IFTYPES:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy,
+ struct nlattr *tb)
+{
+ struct ieee80211_channel *chan;
+
+ if (tb == NULL)
+ return NULL;
+ chan = ieee80211_get_channel(wiphy, nla_get_u32(tb));
+ if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
+ return NULL;
+ return chan;
+}
+
+static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes)
+{
+ struct nlattr *nl_modes = nla_nest_start(msg, attr);
+ int i;
+
+ if (!nl_modes)
+ goto nla_put_failure;
+
+ i = 0;
+ while (ifmodes) {
+ if ((ifmodes & 1) && nla_put_flag(msg, i))
+ goto nla_put_failure;
+ ifmodes >>= 1;
+ i++;
+ }
+
+ nla_nest_end(msg, nl_modes);
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+static int nl80211_put_iface_combinations(struct wiphy *wiphy,
+ struct sk_buff *msg,
+ bool large)
+{
+ struct nlattr *nl_combis;
+ int i, j;
+
+ nl_combis = nla_nest_start(msg,
+ NL80211_ATTR_INTERFACE_COMBINATIONS);
+ if (!nl_combis)
+ goto nla_put_failure;
+
+ for (i = 0; i < wiphy->n_iface_combinations; i++) {
+ const struct ieee80211_iface_combination *c;
+ struct nlattr *nl_combi, *nl_limits;
+
+ c = &wiphy->iface_combinations[i];
+
+ nl_combi = nla_nest_start(msg, i + 1);
+ if (!nl_combi)
+ goto nla_put_failure;
+
+ nl_limits = nla_nest_start(msg, NL80211_IFACE_COMB_LIMITS);
+ if (!nl_limits)
+ goto nla_put_failure;
+
+ for (j = 0; j < c->n_limits; j++) {
+ struct nlattr *nl_limit;
+
+ nl_limit = nla_nest_start(msg, j + 1);
+ if (!nl_limit)
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_IFACE_LIMIT_MAX,
+ c->limits[j].max))
+ goto nla_put_failure;
+ if (nl80211_put_iftypes(msg, NL80211_IFACE_LIMIT_TYPES,
+ c->limits[j].types))
+ goto nla_put_failure;
+ nla_nest_end(msg, nl_limit);
+ }
+
+ nla_nest_end(msg, nl_limits);
+
+ if (c->beacon_int_infra_match &&
+ nla_put_flag(msg, NL80211_IFACE_COMB_STA_AP_BI_MATCH))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_IFACE_COMB_NUM_CHANNELS,
+ c->num_different_channels) ||
+ nla_put_u32(msg, NL80211_IFACE_COMB_MAXNUM,
+ c->max_interfaces))
+ goto nla_put_failure;
+ if (large &&
+ (nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS,
+ c->radar_detect_widths) ||
+ nla_put_u32(msg, NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
+ c->radar_detect_regions)))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_combi);
+ }
+
+ nla_nest_end(msg, nl_combis);
+
+ return 0;
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+#ifdef CONFIG_PM
+static int nl80211_send_wowlan_tcp_caps(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ const struct wiphy_wowlan_tcp_support *tcp = rdev->wiphy.wowlan->tcp;
+ struct nlattr *nl_tcp;
+
+ if (!tcp)
+ return 0;
+
+ nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION);
+ if (!nl_tcp)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD,
+ tcp->data_payload_max))
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD,
+ tcp->data_payload_max))
+ return -ENOBUFS;
+
+ if (tcp->seq && nla_put_flag(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ))
+ return -ENOBUFS;
+
+ if (tcp->tok && nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN,
+ sizeof(*tcp->tok), tcp->tok))
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL,
+ tcp->data_interval_max))
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD,
+ tcp->wake_payload_max))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_tcp);
+ return 0;
+}
+
+static int nl80211_send_wowlan(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev,
+ bool large)
+{
+ struct nlattr *nl_wowlan;
+
+ if (!rdev->wiphy.wowlan)
+ return 0;
+
+ nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED);
+ if (!nl_wowlan)
+ return -ENOBUFS;
+
+ if (((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_ANY) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_DISCONNECT) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) ||
+ ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE) &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE)))
+ return -ENOBUFS;
+
+ if (rdev->wiphy.wowlan->n_patterns) {
+ struct nl80211_pattern_support pat = {
+ .max_patterns = rdev->wiphy.wowlan->n_patterns,
+ .min_pattern_len = rdev->wiphy.wowlan->pattern_min_len,
+ .max_pattern_len = rdev->wiphy.wowlan->pattern_max_len,
+ .max_pkt_offset = rdev->wiphy.wowlan->max_pkt_offset,
+ };
+
+ if (nla_put(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN,
+ sizeof(pat), &pat))
+ return -ENOBUFS;
+ }
+
+ if ((rdev->wiphy.wowlan->flags & WIPHY_WOWLAN_NET_DETECT) &&
+ nla_put_u32(msg, NL80211_WOWLAN_TRIG_NET_DETECT,
+ rdev->wiphy.wowlan->max_nd_match_sets))
+ return -ENOBUFS;
+
+ if (large && nl80211_send_wowlan_tcp_caps(rdev, msg))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_wowlan);
+
+ return 0;
+}
+#endif
+
+static int nl80211_send_coalesce(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev)
+{
+ struct nl80211_coalesce_rule_support rule;
+
+ if (!rdev->wiphy.coalesce)
+ return 0;
+
+ rule.max_rules = rdev->wiphy.coalesce->n_rules;
+ rule.max_delay = rdev->wiphy.coalesce->max_delay;
+ rule.pat.max_patterns = rdev->wiphy.coalesce->n_patterns;
+ rule.pat.min_pattern_len = rdev->wiphy.coalesce->pattern_min_len;
+ rule.pat.max_pattern_len = rdev->wiphy.coalesce->pattern_max_len;
+ rule.pat.max_pkt_offset = rdev->wiphy.coalesce->max_pkt_offset;
+
+ if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule))
+ return -ENOBUFS;
+
+ return 0;
+}
+
+static int nl80211_send_band_rateinfo(struct sk_buff *msg,
+ struct ieee80211_supported_band *sband)
+{
+ struct nlattr *nl_rates, *nl_rate;
+ struct ieee80211_rate *rate;
+ int i;
+
+ /* add HT info */
+ if (sband->ht_cap.ht_supported &&
+ (nla_put(msg, NL80211_BAND_ATTR_HT_MCS_SET,
+ sizeof(sband->ht_cap.mcs),
+ &sband->ht_cap.mcs) ||
+ nla_put_u16(msg, NL80211_BAND_ATTR_HT_CAPA,
+ sband->ht_cap.cap) ||
+ nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR,
+ sband->ht_cap.ampdu_factor) ||
+ nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY,
+ sband->ht_cap.ampdu_density)))
+ return -ENOBUFS;
+
+ /* add VHT info */
+ if (sband->vht_cap.vht_supported &&
+ (nla_put(msg, NL80211_BAND_ATTR_VHT_MCS_SET,
+ sizeof(sband->vht_cap.vht_mcs),
+ &sband->vht_cap.vht_mcs) ||
+ nla_put_u32(msg, NL80211_BAND_ATTR_VHT_CAPA,
+ sband->vht_cap.cap)))
+ return -ENOBUFS;
+
+ /* add bitrates */
+ nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
+ if (!nl_rates)
+ return -ENOBUFS;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ nl_rate = nla_nest_start(msg, i);
+ if (!nl_rate)
+ return -ENOBUFS;
+
+ rate = &sband->bitrates[i];
+ if (nla_put_u32(msg, NL80211_BITRATE_ATTR_RATE,
+ rate->bitrate))
+ return -ENOBUFS;
+ if ((rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) &&
+ nla_put_flag(msg,
+ NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_rate);
+ }
+
+ nla_nest_end(msg, nl_rates);
+
+ return 0;
+}
+
+static int
+nl80211_send_mgmt_stypes(struct sk_buff *msg,
+ const struct ieee80211_txrx_stypes *mgmt_stypes)
+{
+ u16 stypes;
+ struct nlattr *nl_ftypes, *nl_ifs;
+ enum nl80211_iftype ift;
+ int i;
+
+ if (!mgmt_stypes)
+ return 0;
+
+ nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES);
+ if (!nl_ifs)
+ return -ENOBUFS;
+
+ for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+ nl_ftypes = nla_nest_start(msg, ift);
+ if (!nl_ftypes)
+ return -ENOBUFS;
+ i = 0;
+ stypes = mgmt_stypes[ift].tx;
+ while (stypes) {
+ if ((stypes & 1) &&
+ nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE,
+ (i << 4) | IEEE80211_FTYPE_MGMT))
+ return -ENOBUFS;
+ stypes >>= 1;
+ i++;
+ }
+ nla_nest_end(msg, nl_ftypes);
+ }
+
+ nla_nest_end(msg, nl_ifs);
+
+ nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES);
+ if (!nl_ifs)
+ return -ENOBUFS;
+
+ for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+ nl_ftypes = nla_nest_start(msg, ift);
+ if (!nl_ftypes)
+ return -ENOBUFS;
+ i = 0;
+ stypes = mgmt_stypes[ift].rx;
+ while (stypes) {
+ if ((stypes & 1) &&
+ nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE,
+ (i << 4) | IEEE80211_FTYPE_MGMT))
+ return -ENOBUFS;
+ stypes >>= 1;
+ i++;
+ }
+ nla_nest_end(msg, nl_ftypes);
+ }
+ nla_nest_end(msg, nl_ifs);
+
+ return 0;
+}
+
+struct nl80211_dump_wiphy_state {
+ s64 filter_wiphy;
+ long start;
+ long split_start, band_start, chan_start;
+ bool split;
+};
+
+static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
+ enum nl80211_commands cmd,
+ struct sk_buff *msg, u32 portid, u32 seq,
+ int flags, struct nl80211_dump_wiphy_state *state)
+{
+ void *hdr;
+ struct nlattr *nl_bands, *nl_band;
+ struct nlattr *nl_freqs, *nl_freq;
+ struct nlattr *nl_cmds;
+ enum ieee80211_band band;
+ struct ieee80211_channel *chan;
+ int i;
+ const struct ieee80211_txrx_stypes *mgmt_stypes =
+ rdev->wiphy.mgmt_stypes;
+ u32 features;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (WARN_ON(!state))
+ return -EINVAL;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_string(msg, NL80211_ATTR_WIPHY_NAME,
+ wiphy_name(&rdev->wiphy)) ||
+ nla_put_u32(msg, NL80211_ATTR_GENERATION,
+ cfg80211_rdev_list_generation))
+ goto nla_put_failure;
+
+ if (cmd != NL80211_CMD_NEW_WIPHY)
+ goto finish;
+
+ switch (state->split_start) {
+ case 0:
+ if (nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT,
+ rdev->wiphy.retry_short) ||
+ nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_LONG,
+ rdev->wiphy.retry_long) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD,
+ rdev->wiphy.frag_threshold) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD,
+ rdev->wiphy.rts_threshold) ||
+ nla_put_u8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS,
+ rdev->wiphy.coverage_class) ||
+ nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
+ rdev->wiphy.max_scan_ssids) ||
+ nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS,
+ rdev->wiphy.max_sched_scan_ssids) ||
+ nla_put_u16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
+ rdev->wiphy.max_scan_ie_len) ||
+ nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN,
+ rdev->wiphy.max_sched_scan_ie_len) ||
+ nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS,
+ rdev->wiphy.max_match_sets))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) &&
+ nla_put_flag(msg, NL80211_ATTR_SUPPORT_IBSS_RSN))
+ goto nla_put_failure;
+ if ((rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) &&
+ nla_put_flag(msg, NL80211_ATTR_SUPPORT_MESH_AUTH))
+ goto nla_put_failure;
+ if ((rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) &&
+ nla_put_flag(msg, NL80211_ATTR_SUPPORT_AP_UAPSD))
+ goto nla_put_failure;
+ if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
+ nla_put_flag(msg, NL80211_ATTR_ROAM_SUPPORT))
+ goto nla_put_failure;
+ if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) &&
+ nla_put_flag(msg, NL80211_ATTR_TDLS_SUPPORT))
+ goto nla_put_failure;
+ if ((rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) &&
+ nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP))
+ goto nla_put_failure;
+ state->split_start++;
+ if (state->split)
+ break;
+ case 1:
+ if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES,
+ sizeof(u32) * rdev->wiphy.n_cipher_suites,
+ rdev->wiphy.cipher_suites))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
+ rdev->wiphy.max_num_pmkids))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
+ nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX,
+ rdev->wiphy.available_antennas_tx) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX,
+ rdev->wiphy.available_antennas_rx))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) &&
+ nla_put_u32(msg, NL80211_ATTR_PROBE_RESP_OFFLOAD,
+ rdev->wiphy.probe_resp_offload))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.available_antennas_tx ||
+ rdev->wiphy.available_antennas_rx) &&
+ rdev->ops->get_antenna) {
+ u32 tx_ant = 0, rx_ant = 0;
+ int res;
+ res = rdev_get_antenna(rdev, &tx_ant, &rx_ant);
+ if (!res) {
+ if (nla_put_u32(msg,
+ NL80211_ATTR_WIPHY_ANTENNA_TX,
+ tx_ant) ||
+ nla_put_u32(msg,
+ NL80211_ATTR_WIPHY_ANTENNA_RX,
+ rx_ant))
+ goto nla_put_failure;
+ }
+ }
+
+ state->split_start++;
+ if (state->split)
+ break;
+ case 2:
+ if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES,
+ rdev->wiphy.interface_modes))
+ goto nla_put_failure;
+ state->split_start++;
+ if (state->split)
+ break;
+ case 3:
+ nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS);
+ if (!nl_bands)
+ goto nla_put_failure;
+
+ for (band = state->band_start;
+ band < IEEE80211_NUM_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = rdev->wiphy.bands[band];
+
+ if (!sband)
+ continue;
+
+ nl_band = nla_nest_start(msg, band);
+ if (!nl_band)
+ goto nla_put_failure;
+
+ switch (state->chan_start) {
+ case 0:
+ if (nl80211_send_band_rateinfo(msg, sband))
+ goto nla_put_failure;
+ state->chan_start++;
+ if (state->split)
+ break;
+ default:
+ /* add frequencies */
+ nl_freqs = nla_nest_start(
+ msg, NL80211_BAND_ATTR_FREQS);
+ if (!nl_freqs)
+ goto nla_put_failure;
+
+ for (i = state->chan_start - 1;
+ i < sband->n_channels;
+ i++) {
+ nl_freq = nla_nest_start(msg, i);
+ if (!nl_freq)
+ goto nla_put_failure;
+
+ chan = &sband->channels[i];
+
+ if (nl80211_msg_put_channel(
+ msg, chan,
+ state->split))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_freq);
+ if (state->split)
+ break;
+ }
+ if (i < sband->n_channels)
+ state->chan_start = i + 2;
+ else
+ state->chan_start = 0;
+ nla_nest_end(msg, nl_freqs);
+ }
+
+ nla_nest_end(msg, nl_band);
+
+ if (state->split) {
+ /* start again here */
+ if (state->chan_start)
+ band--;
+ break;
+ }
+ }
+ nla_nest_end(msg, nl_bands);
+
+ if (band < IEEE80211_NUM_BANDS)
+ state->band_start = band + 1;
+ else
+ state->band_start = 0;
+
+ /* if bands & channels are done, continue outside */
+ if (state->band_start == 0 && state->chan_start == 0)
+ state->split_start++;
+ if (state->split)
+ break;
+ case 4:
+ nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS);
+ if (!nl_cmds)
+ goto nla_put_failure;
+
+ i = 0;
+#define CMD(op, n) \
+ do { \
+ if (rdev->ops->op) { \
+ i++; \
+ if (nla_put_u32(msg, i, NL80211_CMD_ ## n)) \
+ goto nla_put_failure; \
+ } \
+ } while (0)
+
+ CMD(add_virtual_intf, NEW_INTERFACE);
+ CMD(change_virtual_intf, SET_INTERFACE);
+ CMD(add_key, NEW_KEY);
+ CMD(start_ap, START_AP);
+ CMD(add_station, NEW_STATION);
+ CMD(add_mpath, NEW_MPATH);
+ CMD(update_mesh_config, SET_MESH_CONFIG);
+ CMD(change_bss, SET_BSS);
+ CMD(auth, AUTHENTICATE);
+ CMD(assoc, ASSOCIATE);
+ CMD(deauth, DEAUTHENTICATE);
+ CMD(disassoc, DISASSOCIATE);
+ CMD(join_ibss, JOIN_IBSS);
+ CMD(join_mesh, JOIN_MESH);
+ CMD(set_pmksa, SET_PMKSA);
+ CMD(del_pmksa, DEL_PMKSA);
+ CMD(flush_pmksa, FLUSH_PMKSA);
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
+ CMD(remain_on_channel, REMAIN_ON_CHANNEL);
+ CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
+ CMD(mgmt_tx, FRAME);
+ CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
+ if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
+ goto nla_put_failure;
+ }
+ if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
+ rdev->ops->join_mesh) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
+ goto nla_put_failure;
+ }
+ CMD(set_wds_peer, SET_WDS_PEER);
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
+ CMD(tdls_mgmt, TDLS_MGMT);
+ CMD(tdls_oper, TDLS_OPER);
+ }
+ if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ CMD(sched_scan_start, START_SCHED_SCAN);
+ CMD(probe_client, PROBE_CLIENT);
+ CMD(set_noack_map, SET_NOACK_MAP);
+ if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
+ goto nla_put_failure;
+ }
+ CMD(start_p2p_device, START_P2P_DEVICE);
+ CMD(set_mcast_rate, SET_MCAST_RATE);
+#ifdef CONFIG_NL80211_TESTMODE
+ CMD(testmode_cmd, TESTMODE);
+#endif
+ if (state->split) {
+ CMD(crit_proto_start, CRIT_PROTOCOL_START);
+ CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)
+ CMD(channel_switch, CHANNEL_SWITCH);
+ CMD(set_qos_map, SET_QOS_MAP);
+ if (rdev->wiphy.features &
+ NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)
+ CMD(add_tx_ts, ADD_TX_TS);
+ }
+ /* add into the if now */
+#undef CMD
+
+ if (rdev->ops->connect || rdev->ops->auth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
+ goto nla_put_failure;
+ }
+
+ if (rdev->ops->disconnect || rdev->ops->deauth) {
+ i++;
+ if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, nl_cmds);
+ state->split_start++;
+ if (state->split)
+ break;
+ case 5:
+ if (rdev->ops->remain_on_channel &&
+ (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) &&
+ nla_put_u32(msg,
+ NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION,
+ rdev->wiphy.max_remain_on_channel_duration))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) &&
+ nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK))
+ goto nla_put_failure;
+
+ if (nl80211_send_mgmt_stypes(msg, mgmt_stypes))
+ goto nla_put_failure;
+ state->split_start++;
+ if (state->split)
+ break;
+ case 6:
+#ifdef CONFIG_PM
+ if (nl80211_send_wowlan(msg, rdev, state->split))
+ goto nla_put_failure;
+ state->split_start++;
+ if (state->split)
+ break;
+#else
+ state->split_start++;
+#endif
+ case 7:
+ if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES,
+ rdev->wiphy.software_iftypes))
+ goto nla_put_failure;
+
+ if (nl80211_put_iface_combinations(&rdev->wiphy, msg,
+ state->split))
+ goto nla_put_failure;
+
+ state->split_start++;
+ if (state->split)
+ break;
+ case 8:
+ if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) &&
+ nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME,
+ rdev->wiphy.ap_sme_capa))
+ goto nla_put_failure;
+
+ features = rdev->wiphy.features;
+ /*
+ * We can only add the per-channel limit information if the
+ * dump is split, otherwise it makes it too big. Therefore
+ * only advertise it in that case.
+ */
+ if (state->split)
+ features |= NL80211_FEATURE_ADVERTISE_CHAN_LIMITS;
+ if (nla_put_u32(msg, NL80211_ATTR_FEATURE_FLAGS, features))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.ht_capa_mod_mask &&
+ nla_put(msg, NL80211_ATTR_HT_CAPABILITY_MASK,
+ sizeof(*rdev->wiphy.ht_capa_mod_mask),
+ rdev->wiphy.ht_capa_mod_mask))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME &&
+ rdev->wiphy.max_acl_mac_addrs &&
+ nla_put_u32(msg, NL80211_ATTR_MAC_ACL_MAX,
+ rdev->wiphy.max_acl_mac_addrs))
+ goto nla_put_failure;
+
+ /*
+ * Any information below this point is only available to
+ * applications that can deal with it being split. This
+ * helps ensure that newly added capabilities don't break
+ * older tools by overrunning their buffers.
+ *
+ * We still increment split_start so that in the split
+ * case we'll continue with more data in the next round,
+ * but break unconditionally so unsplit data stops here.
+ */
+ state->split_start++;
+ break;
+ case 9:
+ if (rdev->wiphy.extended_capabilities &&
+ (nla_put(msg, NL80211_ATTR_EXT_CAPA,
+ rdev->wiphy.extended_capabilities_len,
+ rdev->wiphy.extended_capabilities) ||
+ nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK,
+ rdev->wiphy.extended_capabilities_len,
+ rdev->wiphy.extended_capabilities_mask)))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.vht_capa_mod_mask &&
+ nla_put(msg, NL80211_ATTR_VHT_CAPABILITY_MASK,
+ sizeof(*rdev->wiphy.vht_capa_mod_mask),
+ rdev->wiphy.vht_capa_mod_mask))
+ goto nla_put_failure;
+
+ state->split_start++;
+ break;
+ case 10:
+ if (nl80211_send_coalesce(msg, rdev))
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) &&
+ (nla_put_flag(msg, NL80211_ATTR_SUPPORT_5_MHZ) ||
+ nla_put_flag(msg, NL80211_ATTR_SUPPORT_10_MHZ)))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.max_ap_assoc_sta &&
+ nla_put_u32(msg, NL80211_ATTR_MAX_AP_ASSOC_STA,
+ rdev->wiphy.max_ap_assoc_sta))
+ goto nla_put_failure;
+
+ state->split_start++;
+ break;
+ case 11:
+ if (rdev->wiphy.n_vendor_commands) {
+ const struct nl80211_vendor_cmd_info *info;
+ struct nlattr *nested;
+
+ nested = nla_nest_start(msg, NL80211_ATTR_VENDOR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+
+ for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
+ info = &rdev->wiphy.vendor_commands[i].info;
+ if (nla_put(msg, i + 1, sizeof(*info), info))
+ goto nla_put_failure;
+ }
+ nla_nest_end(msg, nested);
+ }
+
+ if (rdev->wiphy.n_vendor_events) {
+ const struct nl80211_vendor_cmd_info *info;
+ struct nlattr *nested;
+
+ nested = nla_nest_start(msg,
+ NL80211_ATTR_VENDOR_EVENTS);
+ if (!nested)
+ goto nla_put_failure;
+
+ for (i = 0; i < rdev->wiphy.n_vendor_events; i++) {
+ info = &rdev->wiphy.vendor_events[i];
+ if (nla_put(msg, i + 1, sizeof(*info), info))
+ goto nla_put_failure;
+ }
+ nla_nest_end(msg, nested);
+ }
+ state->split_start++;
+ break;
+ case 12:
+ if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH &&
+ nla_put_u8(msg, NL80211_ATTR_MAX_CSA_COUNTERS,
+ rdev->wiphy.max_num_csa_counters))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
+ nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL80211_ATTR_EXT_FEATURES,
+ sizeof(rdev->wiphy.ext_features),
+ rdev->wiphy.ext_features))
+ goto nla_put_failure;
+
+ /* done */
+ state->split_start = 0;
+ break;
+ }
+ finish:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nl80211_dump_wiphy_state *state)
+{
+ struct nlattr **tb = nl80211_fam.attrbuf;
+ int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+ tb, nl80211_fam.maxattr, nl80211_policy);
+ /* ignore parse errors for backward compatibility */
+ if (ret)
+ return 0;
+
+ state->split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP];
+ if (tb[NL80211_ATTR_WIPHY])
+ state->filter_wiphy = nla_get_u32(tb[NL80211_ATTR_WIPHY]);
+ if (tb[NL80211_ATTR_WDEV])
+ state->filter_wiphy = nla_get_u64(tb[NL80211_ATTR_WDEV]) >> 32;
+ if (tb[NL80211_ATTR_IFINDEX]) {
+ struct net_device *netdev;
+ struct cfg80211_registered_device *rdev;
+ int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]);
+
+ netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
+ if (!netdev)
+ return -ENODEV;
+ if (netdev->ieee80211_ptr) {
+ rdev = wiphy_to_rdev(
+ netdev->ieee80211_ptr->wiphy);
+ state->filter_wiphy = rdev->wiphy_idx;
+ }
+ }
+
+ return 0;
+}
+
+static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0, ret;
+ struct nl80211_dump_wiphy_state *state = (void *)cb->args[0];
+ struct cfg80211_registered_device *rdev;
+
+ rtnl_lock();
+ if (!state) {
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state) {
+ rtnl_unlock();
+ return -ENOMEM;
+ }
+ state->filter_wiphy = -1;
+ ret = nl80211_dump_wiphy_parse(skb, cb, state);
+ if (ret) {
+ kfree(state);
+ rtnl_unlock();
+ return ret;
+ }
+ cb->args[0] = (long)state;
+ }
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
+ continue;
+ if (++idx <= state->start)
+ continue;
+ if (state->filter_wiphy != -1 &&
+ state->filter_wiphy != rdev->wiphy_idx)
+ continue;
+ /* attempt to fit multiple wiphy data chunks into the skb */
+ do {
+ ret = nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY,
+ skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, state);
+ if (ret < 0) {
+ /*
+ * If sending the wiphy data didn't fit (ENOBUFS
+ * or EMSGSIZE returned), this SKB is still
+ * empty (so it's not too big because another
+ * wiphy dataset is already in the skb) and
+ * we've not tried to adjust the dump allocation
+ * yet ... then adjust the alloc size to be
+ * bigger, and return 1 but with the empty skb.
+ * This results in an empty message being RX'ed
+ * in userspace, but that is ignored.
+ *
+ * We can then retry with the larger buffer.
+ */
+ if ((ret == -ENOBUFS || ret == -EMSGSIZE) &&
+ !skb->len && !state->split &&
+ cb->min_dump_alloc < 4096) {
+ cb->min_dump_alloc = 4096;
+ state->split_start = 0;
+ rtnl_unlock();
+ return 1;
+ }
+ idx--;
+ break;
+ }
+ } while (state->split_start > 0);
+ break;
+ }
+ rtnl_unlock();
+
+ state->start = idx;
+
+ return skb->len;
+}
+
+static int nl80211_dump_wiphy_done(struct netlink_callback *cb)
+{
+ kfree((void *)cb->args[0]);
+ return 0;
+}
+
+static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct nl80211_dump_wiphy_state state = {};
+
+ msg = nlmsg_new(4096, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY, msg,
+ info->snd_portid, info->snd_seq, 0,
+ &state) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
+ [NL80211_TXQ_ATTR_QUEUE] = { .type = NLA_U8 },
+ [NL80211_TXQ_ATTR_TXOP] = { .type = NLA_U16 },
+ [NL80211_TXQ_ATTR_CWMIN] = { .type = NLA_U16 },
+ [NL80211_TXQ_ATTR_CWMAX] = { .type = NLA_U16 },
+ [NL80211_TXQ_ATTR_AIFS] = { .type = NLA_U8 },
+};
+
+static int parse_txq_params(struct nlattr *tb[],
+ struct ieee80211_txq_params *txq_params)
+{
+ if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
+ !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
+ !tb[NL80211_TXQ_ATTR_AIFS])
+ return -EINVAL;
+
+ txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
+ txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
+ txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
+ txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
+ txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
+
+ if (txq_params->ac >= NL80211_NUM_ACS)
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
+{
+ /*
+ * You can only set the channel explicitly for WDS interfaces,
+ * all others have their channel managed via their respective
+ * "establish a connection" command (connect, join, ...)
+ *
+ * For AP/GO and mesh mode, the channel can be set with the
+ * channel userspace API, but is only stored and passed to the
+ * low-level driver when the AP starts or the mesh is joined.
+ * This is for backward compatibility, userspace can also give
+ * the channel in the start-ap or join-mesh commands instead.
+ *
+ * Monitors are special as they are normally slaved to
+ * whatever else is going on, so they have their own special
+ * operation to set the monitor channel if possible.
+ */
+ return !wdev ||
+ wdev->iftype == NL80211_IFTYPE_AP ||
+ wdev->iftype == NL80211_IFTYPE_MESH_POINT ||
+ wdev->iftype == NL80211_IFTYPE_MONITOR ||
+ wdev->iftype == NL80211_IFTYPE_P2P_GO;
+}
+
+static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
+ struct cfg80211_chan_def *chandef)
+{
+ u32 control_freq;
+
+ if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
+ return -EINVAL;
+
+ control_freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+
+ chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq);
+ chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
+ chandef->center_freq1 = control_freq;
+ chandef->center_freq2 = 0;
+
+ /* Primary channel not allowed */
+ if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
+ enum nl80211_channel_type chantype;
+
+ chantype = nla_get_u32(
+ info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
+
+ switch (chantype) {
+ case NL80211_CHAN_NO_HT:
+ case NL80211_CHAN_HT20:
+ case NL80211_CHAN_HT40PLUS:
+ case NL80211_CHAN_HT40MINUS:
+ cfg80211_chandef_create(chandef, chandef->chan,
+ chantype);
+ break;
+ default:
+ return -EINVAL;
+ }
+ } else if (info->attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
+ chandef->width =
+ nla_get_u32(info->attrs[NL80211_ATTR_CHANNEL_WIDTH]);
+ if (info->attrs[NL80211_ATTR_CENTER_FREQ1])
+ chandef->center_freq1 =
+ nla_get_u32(
+ info->attrs[NL80211_ATTR_CENTER_FREQ1]);
+ if (info->attrs[NL80211_ATTR_CENTER_FREQ2])
+ chandef->center_freq2 =
+ nla_get_u32(
+ info->attrs[NL80211_ATTR_CENTER_FREQ2]);
+ }
+
+ if (!cfg80211_chandef_valid(chandef))
+ return -EINVAL;
+
+ if (!cfg80211_chandef_usable(&rdev->wiphy, chandef,
+ IEEE80211_CHAN_DISABLED))
+ return -EINVAL;
+
+ if ((chandef->width == NL80211_CHAN_WIDTH_5 ||
+ chandef->width == NL80211_CHAN_WIDTH_10) &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct genl_info *info)
+{
+ struct cfg80211_chan_def chandef;
+ int result;
+ enum nl80211_iftype iftype = NL80211_IFTYPE_MONITOR;
+ struct wireless_dev *wdev = NULL;
+
+ if (dev)
+ wdev = dev->ieee80211_ptr;
+ if (!nl80211_can_set_dev_channel(wdev))
+ return -EOPNOTSUPP;
+ if (wdev)
+ iftype = wdev->iftype;
+
+ result = nl80211_parse_chandef(rdev, info, &chandef);
+ if (result)
+ return result;
+
+ switch (iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) {
+ result = -EINVAL;
+ break;
+ }
+ if (wdev->beacon_interval) {
+ if (!dev || !rdev->ops->set_ap_chanwidth ||
+ !(rdev->wiphy.features &
+ NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE)) {
+ result = -EBUSY;
+ break;
+ }
+
+ /* Only allow dynamic channel width changes */
+ if (chandef.chan != wdev->preset_chandef.chan) {
+ result = -EBUSY;
+ break;
+ }
+ result = rdev_set_ap_chanwidth(rdev, dev, &chandef);
+ if (result)
+ break;
+ }
+ wdev->preset_chandef = chandef;
+ result = 0;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ result = cfg80211_set_mesh_channel(rdev, wdev, &chandef);
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ result = cfg80211_set_monitor_channel(rdev, &chandef);
+ break;
+ default:
+ result = -EINVAL;
+ }
+
+ return result;
+}
+
+static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *netdev = info->user_ptr[1];
+
+ return __nl80211_set_channel(rdev, netdev, info);
+}
+
+static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *bssid;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!rdev->ops->set_wds_peer)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_WDS)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ return rdev_set_wds_peer(rdev, dev, bssid);
+}
+
+
+static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev;
+ struct net_device *netdev = NULL;
+ struct wireless_dev *wdev;
+ int result = 0, rem_txq_params = 0;
+ struct nlattr *nl_txq_params;
+ u32 changed;
+ u8 retry_short = 0, retry_long = 0;
+ u32 frag_threshold = 0, rts_threshold = 0;
+ u8 coverage_class = 0;
+
+ ASSERT_RTNL();
+
+ /*
+ * Try to find the wiphy and netdev. Normally this
+ * function shouldn't need the netdev, but this is
+ * done for backward compatibility -- previously
+ * setting the channel was done per wiphy, but now
+ * it is per netdev. Previous userland like hostapd
+ * also passed a netdev to set_wiphy, so that it is
+ * possible to let that go to the right netdev!
+ */
+
+ if (info->attrs[NL80211_ATTR_IFINDEX]) {
+ int ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]);
+
+ netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+ if (netdev && netdev->ieee80211_ptr)
+ rdev = wiphy_to_rdev(netdev->ieee80211_ptr->wiphy);
+ else
+ netdev = NULL;
+ }
+
+ if (!netdev) {
+ rdev = __cfg80211_rdev_from_attrs(genl_info_net(info),
+ info->attrs);
+ if (IS_ERR(rdev))
+ return PTR_ERR(rdev);
+ wdev = NULL;
+ netdev = NULL;
+ result = 0;
+ } else
+ wdev = netdev->ieee80211_ptr;
+
+ /*
+ * end workaround code, by now the rdev is available
+ * and locked, and wdev may or may not be NULL.
+ */
+
+ if (info->attrs[NL80211_ATTR_WIPHY_NAME])
+ result = cfg80211_dev_rename(
+ rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME]));
+
+ if (result)
+ return result;
+
+ if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
+ struct ieee80211_txq_params txq_params;
+ struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
+
+ if (!rdev->ops->set_txq_params)
+ return -EOPNOTSUPP;
+
+ if (!netdev)
+ return -EINVAL;
+
+ if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+
+ if (!netif_running(netdev))
+ return -ENETDOWN;
+
+ nla_for_each_nested(nl_txq_params,
+ info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
+ rem_txq_params) {
+ result = nla_parse(tb, NL80211_TXQ_ATTR_MAX,
+ nla_data(nl_txq_params),
+ nla_len(nl_txq_params),
+ txq_params_policy);
+ if (result)
+ return result;
+ result = parse_txq_params(tb, &txq_params);
+ if (result)
+ return result;
+
+ result = rdev_set_txq_params(rdev, netdev,
+ &txq_params);
+ if (result)
+ return result;
+ }
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+ result = __nl80211_set_channel(
+ rdev,
+ nl80211_can_set_dev_channel(wdev) ? netdev : NULL,
+ info);
+ if (result)
+ return result;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
+ struct wireless_dev *txp_wdev = wdev;
+ enum nl80211_tx_power_setting type;
+ int idx, mbm = 0;
+
+ if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER))
+ txp_wdev = NULL;
+
+ if (!rdev->ops->set_tx_power)
+ return -EOPNOTSUPP;
+
+ idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
+ type = nla_get_u32(info->attrs[idx]);
+
+ if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
+ (type != NL80211_TX_POWER_AUTOMATIC))
+ return -EINVAL;
+
+ if (type != NL80211_TX_POWER_AUTOMATIC) {
+ idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
+ mbm = nla_get_u32(info->attrs[idx]);
+ }
+
+ result = rdev_set_tx_power(rdev, txp_wdev, type, mbm);
+ if (result)
+ return result;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
+ info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) {
+ u32 tx_ant, rx_ant;
+ if ((!rdev->wiphy.available_antennas_tx &&
+ !rdev->wiphy.available_antennas_rx) ||
+ !rdev->ops->set_antenna)
+ return -EOPNOTSUPP;
+
+ tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]);
+ rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]);
+
+ /* reject antenna configurations which don't match the
+ * available antenna masks, except for the "all" mask */
+ if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) ||
+ (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx)))
+ return -EINVAL;
+
+ tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
+ rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;
+
+ result = rdev_set_antenna(rdev, tx_ant, rx_ant);
+ if (result)
+ return result;
+ }
+
+ changed = 0;
+
+ if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
+ retry_short = nla_get_u8(
+ info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]);
+ if (retry_short == 0)
+ return -EINVAL;
+
+ changed |= WIPHY_PARAM_RETRY_SHORT;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) {
+ retry_long = nla_get_u8(
+ info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]);
+ if (retry_long == 0)
+ return -EINVAL;
+
+ changed |= WIPHY_PARAM_RETRY_LONG;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) {
+ frag_threshold = nla_get_u32(
+ info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]);
+ if (frag_threshold < 256)
+ return -EINVAL;
+
+ if (frag_threshold != (u32) -1) {
+ /*
+ * Fragments (apart from the last one) are required to
+ * have even length. Make the fragmentation code
+ * simpler by stripping LSB should someone try to use
+ * odd threshold value.
+ */
+ frag_threshold &= ~0x1;
+ }
+ changed |= WIPHY_PARAM_FRAG_THRESHOLD;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) {
+ rts_threshold = nla_get_u32(
+ info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]);
+ changed |= WIPHY_PARAM_RTS_THRESHOLD;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) {
+ if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK])
+ return -EINVAL;
+
+ coverage_class = nla_get_u8(
+ info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]);
+ changed |= WIPHY_PARAM_COVERAGE_CLASS;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
+ if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION))
+ return -EOPNOTSUPP;
+
+ changed |= WIPHY_PARAM_DYN_ACK;
+ }
+
+ if (changed) {
+ u8 old_retry_short, old_retry_long;
+ u32 old_frag_threshold, old_rts_threshold;
+ u8 old_coverage_class;
+
+ if (!rdev->ops->set_wiphy_params)
+ return -EOPNOTSUPP;
+
+ old_retry_short = rdev->wiphy.retry_short;
+ old_retry_long = rdev->wiphy.retry_long;
+ old_frag_threshold = rdev->wiphy.frag_threshold;
+ old_rts_threshold = rdev->wiphy.rts_threshold;
+ old_coverage_class = rdev->wiphy.coverage_class;
+
+ if (changed & WIPHY_PARAM_RETRY_SHORT)
+ rdev->wiphy.retry_short = retry_short;
+ if (changed & WIPHY_PARAM_RETRY_LONG)
+ rdev->wiphy.retry_long = retry_long;
+ if (changed & WIPHY_PARAM_FRAG_THRESHOLD)
+ rdev->wiphy.frag_threshold = frag_threshold;
+ if (changed & WIPHY_PARAM_RTS_THRESHOLD)
+ rdev->wiphy.rts_threshold = rts_threshold;
+ if (changed & WIPHY_PARAM_COVERAGE_CLASS)
+ rdev->wiphy.coverage_class = coverage_class;
+
+ result = rdev_set_wiphy_params(rdev, changed);
+ if (result) {
+ rdev->wiphy.retry_short = old_retry_short;
+ rdev->wiphy.retry_long = old_retry_long;
+ rdev->wiphy.frag_threshold = old_frag_threshold;
+ rdev->wiphy.rts_threshold = old_rts_threshold;
+ rdev->wiphy.coverage_class = old_coverage_class;
+ }
+ }
+ return 0;
+}
+
+static inline u64 wdev_id(struct wireless_dev *wdev)
+{
+ return (u64)wdev->identifier |
+ ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32);
+}
+
+static int nl80211_send_chandef(struct sk_buff *msg,
+ const struct cfg80211_chan_def *chandef)
+{
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return -EINVAL;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ,
+ chandef->chan->center_freq))
+ return -ENOBUFS;
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE,
+ cfg80211_get_chandef_type(chandef)))
+ return -ENOBUFS;
+ break;
+ default:
+ break;
+ }
+ if (nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, chandef->width))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ1, chandef->center_freq1))
+ return -ENOBUFS;
+ if (chandef->center_freq2 &&
+ nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ2, chandef->center_freq2))
+ return -ENOBUFS;
+ return 0;
+}
+
+static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
+ struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, bool removal)
+{
+ struct net_device *dev = wdev->netdev;
+ u8 cmd = NL80211_CMD_NEW_INTERFACE;
+ void *hdr;
+
+ if (removal)
+ cmd = NL80211_CMD_DEL_INTERFACE;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (dev &&
+ (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_string(msg, NL80211_ATTR_IFNAME, dev->name)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFTYPE, wdev->iftype) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
+ nla_put_u32(msg, NL80211_ATTR_GENERATION,
+ rdev->devlist_generation ^
+ (cfg80211_rdev_list_generation << 2)))
+ goto nla_put_failure;
+
+ if (rdev->ops->get_channel) {
+ int ret;
+ struct cfg80211_chan_def chandef;
+
+ ret = rdev_get_channel(rdev, wdev, &chandef);
+ if (ret == 0) {
+ if (nl80211_send_chandef(msg, &chandef))
+ goto nla_put_failure;
+ }
+ }
+
+ if (wdev->ssid_len) {
+ if (nla_put(msg, NL80211_ATTR_SSID, wdev->ssid_len, wdev->ssid))
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int wp_idx = 0;
+ int if_idx = 0;
+ int wp_start = cb->args[0];
+ int if_start = cb->args[1];
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+
+ rtnl_lock();
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
+ continue;
+ if (wp_idx < wp_start) {
+ wp_idx++;
+ continue;
+ }
+ if_idx = 0;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (if_idx < if_start) {
+ if_idx++;
+ continue;
+ }
+ if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wdev, false) < 0) {
+ goto out;
+ }
+ if_idx++;
+ }
+
+ wp_idx++;
+ }
+ out:
+ rtnl_unlock();
+
+ cb->args[0] = wp_idx;
+ cb->args[1] = if_idx;
+
+ return skb->len;
+}
+
+static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
+ rdev, wdev, false) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
+ [NL80211_MNTR_FLAG_FCSFAIL] = { .type = NLA_FLAG },
+ [NL80211_MNTR_FLAG_PLCPFAIL] = { .type = NLA_FLAG },
+ [NL80211_MNTR_FLAG_CONTROL] = { .type = NLA_FLAG },
+ [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG },
+ [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG },
+ [NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG },
+};
+
+static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
+{
+ struct nlattr *flags[NL80211_MNTR_FLAG_MAX + 1];
+ int flag;
+
+ *mntrflags = 0;
+
+ if (!nla)
+ return -EINVAL;
+
+ if (nla_parse_nested(flags, NL80211_MNTR_FLAG_MAX,
+ nla, mntr_flags_policy))
+ return -EINVAL;
+
+ for (flag = 1; flag <= NL80211_MNTR_FLAG_MAX; flag++)
+ if (flags[flag])
+ *mntrflags |= (1<<flag);
+
+ return 0;
+}
+
+static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 use_4addr,
+ enum nl80211_iftype iftype)
+{
+ if (!use_4addr) {
+ if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT))
+ return -EBUSY;
+ return 0;
+ }
+
+ switch (iftype) {
+ case NL80211_IFTYPE_AP_VLAN:
+ if (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_AP)
+ return 0;
+ break;
+ case NL80211_IFTYPE_STATION:
+ if (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_STATION)
+ return 0;
+ break;
+ default:
+ break;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct vif_params params;
+ int err;
+ enum nl80211_iftype otype, ntype;
+ struct net_device *dev = info->user_ptr[1];
+ u32 _flags, *flags = NULL;
+ bool change = false;
+
+ memset(&params, 0, sizeof(params));
+
+ otype = ntype = dev->ieee80211_ptr->iftype;
+
+ if (info->attrs[NL80211_ATTR_IFTYPE]) {
+ ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
+ if (otype != ntype)
+ change = true;
+ if (ntype > NL80211_IFTYPE_MAX)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_MESH_ID]) {
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (ntype != NL80211_IFTYPE_MESH_POINT)
+ return -EINVAL;
+ if (netif_running(dev))
+ return -EBUSY;
+
+ wdev_lock(wdev);
+ BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
+ IEEE80211_MAX_MESH_ID_LEN);
+ wdev->mesh_id_up_len =
+ nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
+ memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
+ wdev->mesh_id_up_len);
+ wdev_unlock(wdev);
+ }
+
+ if (info->attrs[NL80211_ATTR_4ADDR]) {
+ params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+ change = true;
+ err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
+ if (err)
+ return err;
+ } else {
+ params.use_4addr = -1;
+ }
+
+ if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
+ if (ntype != NL80211_IFTYPE_MONITOR)
+ return -EINVAL;
+ err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
+ &_flags);
+ if (err)
+ return err;
+
+ flags = &_flags;
+ change = true;
+ }
+
+ if (flags && (*flags & MONITOR_FLAG_ACTIVE) &&
+ !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
+ return -EOPNOTSUPP;
+
+ if (change)
+ err = cfg80211_change_iface(rdev, dev, ntype, flags, &params);
+ else
+ err = 0;
+
+ if (!err && params.use_4addr != -1)
+ dev->ieee80211_ptr->use_4addr = params.use_4addr;
+
+ return err;
+}
+
+static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct vif_params params;
+ struct wireless_dev *wdev;
+ struct sk_buff *msg, *event;
+ int err;
+ enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
+ u32 flags;
+
+ /* to avoid failing a new interface creation due to pending removal */
+ cfg80211_destroy_ifaces(rdev);
+
+ memset(&params, 0, sizeof(params));
+
+ if (!info->attrs[NL80211_ATTR_IFNAME])
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_IFTYPE]) {
+ type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
+ if (type > NL80211_IFTYPE_MAX)
+ return -EINVAL;
+ }
+
+ if (!rdev->ops->add_virtual_intf ||
+ !(rdev->wiphy.interface_modes & (1 << type)))
+ return -EOPNOTSUPP;
+
+ if ((type == NL80211_IFTYPE_P2P_DEVICE ||
+ rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) &&
+ info->attrs[NL80211_ATTR_MAC]) {
+ nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC],
+ ETH_ALEN);
+ if (!is_valid_ether_addr(params.macaddr))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (info->attrs[NL80211_ATTR_4ADDR]) {
+ params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+ err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
+ if (err)
+ return err;
+ }
+
+ err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
+ info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL,
+ &flags);
+
+ if (!err && (flags & MONITOR_FLAG_ACTIVE) &&
+ !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ wdev = rdev_add_virtual_intf(rdev,
+ nla_data(info->attrs[NL80211_ATTR_IFNAME]),
+ NET_NAME_USER, type, err ? NULL : &flags,
+ &params);
+ if (WARN_ON(!wdev)) {
+ nlmsg_free(msg);
+ return -EPROTO;
+ } else if (IS_ERR(wdev)) {
+ nlmsg_free(msg);
+ return PTR_ERR(wdev);
+ }
+
+ if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
+ wdev->owner_nlportid = info->snd_portid;
+
+ switch (type) {
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!info->attrs[NL80211_ATTR_MESH_ID])
+ break;
+ wdev_lock(wdev);
+ BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
+ IEEE80211_MAX_MESH_ID_LEN);
+ wdev->mesh_id_up_len =
+ nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
+ memcpy(wdev->ssid, nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
+ wdev->mesh_id_up_len);
+ wdev_unlock(wdev);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /*
+ * P2P Device doesn't have a netdev, so doesn't go
+ * through the netdev notifier and must be added here
+ */
+ mutex_init(&wdev->mtx);
+ INIT_LIST_HEAD(&wdev->event_list);
+ spin_lock_init(&wdev->event_lock);
+ INIT_LIST_HEAD(&wdev->mgmt_registrations);
+ spin_lock_init(&wdev->mgmt_registrations_lock);
+
+ wdev->identifier = ++rdev->wdev_id;
+ list_add_rcu(&wdev->list, &rdev->wdev_list);
+ rdev->devlist_generation++;
+ break;
+ default:
+ break;
+ }
+
+ if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
+ rdev, wdev, false) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ event = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (event) {
+ if (nl80211_send_iface(event, 0, 0, 0,
+ rdev, wdev, false) < 0) {
+ nlmsg_free(event);
+ goto out;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+ event, 0, NL80211_MCGRP_CONFIG,
+ GFP_KERNEL);
+ }
+
+out:
+ return genlmsg_reply(msg, info);
+}
+
+static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct sk_buff *msg;
+ int status;
+
+ if (!rdev->ops->del_virtual_intf)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (msg && nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, true) < 0) {
+ nlmsg_free(msg);
+ msg = NULL;
+ }
+
+ /*
+ * If we remove a wireless device without a netdev then clear
+ * user_ptr[1] so that nl80211_post_doit won't dereference it
+ * to check if it needs to do dev_put(). Otherwise it crashes
+ * since the wdev has been freed, unlike with a netdev where
+ * we need the dev_put() for the netdev to really be freed.
+ */
+ if (!wdev->netdev)
+ info->user_ptr[1] = NULL;
+
+ status = rdev_del_virtual_intf(rdev, wdev);
+ if (status >= 0 && msg)
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
+ msg, 0, NL80211_MCGRP_CONFIG,
+ GFP_KERNEL);
+ else
+ nlmsg_free(msg);
+
+ return status;
+}
+
+static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u16 noack_map;
+
+ if (!info->attrs[NL80211_ATTR_NOACK_MAP])
+ return -EINVAL;
+
+ if (!rdev->ops->set_noack_map)
+ return -EOPNOTSUPP;
+
+ noack_map = nla_get_u16(info->attrs[NL80211_ATTR_NOACK_MAP]);
+
+ return rdev_set_noack_map(rdev, dev, noack_map);
+}
+
+struct get_key_cookie {
+ struct sk_buff *msg;
+ int error;
+ int idx;
+};
+
+static void get_key_callback(void *c, struct key_params *params)
+{
+ struct nlattr *key;
+ struct get_key_cookie *cookie = c;
+
+ if ((params->key &&
+ nla_put(cookie->msg, NL80211_ATTR_KEY_DATA,
+ params->key_len, params->key)) ||
+ (params->seq &&
+ nla_put(cookie->msg, NL80211_ATTR_KEY_SEQ,
+ params->seq_len, params->seq)) ||
+ (params->cipher &&
+ nla_put_u32(cookie->msg, NL80211_ATTR_KEY_CIPHER,
+ params->cipher)))
+ goto nla_put_failure;
+
+ key = nla_nest_start(cookie->msg, NL80211_ATTR_KEY);
+ if (!key)
+ goto nla_put_failure;
+
+ if ((params->key &&
+ nla_put(cookie->msg, NL80211_KEY_DATA,
+ params->key_len, params->key)) ||
+ (params->seq &&
+ nla_put(cookie->msg, NL80211_KEY_SEQ,
+ params->seq_len, params->seq)) ||
+ (params->cipher &&
+ nla_put_u32(cookie->msg, NL80211_KEY_CIPHER,
+ params->cipher)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(cookie->msg, NL80211_ATTR_KEY_IDX, cookie->idx))
+ goto nla_put_failure;
+
+ nla_nest_end(cookie->msg, key);
+
+ return;
+ nla_put_failure:
+ cookie->error = 1;
+}
+
+static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ u8 key_idx = 0;
+ const u8 *mac_addr = NULL;
+ bool pairwise;
+ struct get_key_cookie cookie = {
+ .error = 0,
+ };
+ void *hdr;
+ struct sk_buff *msg;
+
+ if (info->attrs[NL80211_ATTR_KEY_IDX])
+ key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
+
+ if (key_idx > 5)
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ pairwise = !!mac_addr;
+ if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+ u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+ if (kt >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ if (kt != NL80211_KEYTYPE_GROUP &&
+ kt != NL80211_KEYTYPE_PAIRWISE)
+ return -EINVAL;
+ pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
+ }
+
+ if (!rdev->ops->get_key)
+ return -EOPNOTSUPP;
+
+ if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ return -ENOENT;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_NEW_KEY);
+ if (!hdr)
+ goto nla_put_failure;
+
+ cookie.msg = msg;
+ cookie.idx = key_idx;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_u8(msg, NL80211_ATTR_KEY_IDX, key_idx))
+ goto nla_put_failure;
+ if (mac_addr &&
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr))
+ goto nla_put_failure;
+
+ err = rdev_get_key(rdev, dev, key_idx, pairwise, mac_addr, &cookie,
+ get_key_callback);
+
+ if (err)
+ goto free_msg;
+
+ if (cookie.error)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ err = -ENOBUFS;
+ free_msg:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct key_parse key;
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+
+ err = nl80211_parse_key(info, &key);
+ if (err)
+ return err;
+
+ if (key.idx < 0)
+ return -EINVAL;
+
+ /* only support setting default key */
+ if (!key.def && !key.defmgmt)
+ return -EINVAL;
+
+ wdev_lock(dev->ieee80211_ptr);
+
+ if (key.def) {
+ if (!rdev->ops->set_default_key) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = nl80211_key_allowed(dev->ieee80211_ptr);
+ if (err)
+ goto out;
+
+ err = rdev_set_default_key(rdev, dev, key.idx,
+ key.def_uni, key.def_multi);
+
+ if (err)
+ goto out;
+
+#ifdef CONFIG_CFG80211_WEXT
+ dev->ieee80211_ptr->wext.default_key = key.idx;
+#endif
+ } else {
+ if (key.def_uni || !key.def_multi) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!rdev->ops->set_default_mgmt_key) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = nl80211_key_allowed(dev->ieee80211_ptr);
+ if (err)
+ goto out;
+
+ err = rdev_set_default_mgmt_key(rdev, dev, key.idx);
+ if (err)
+ goto out;
+
+#ifdef CONFIG_CFG80211_WEXT
+ dev->ieee80211_ptr->wext.default_mgmt_key = key.idx;
+#endif
+ }
+
+ out:
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return err;
+}
+
+static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ struct key_parse key;
+ const u8 *mac_addr = NULL;
+
+ err = nl80211_parse_key(info, &key);
+ if (err)
+ return err;
+
+ if (!key.p.key)
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (key.type == -1) {
+ if (mac_addr)
+ key.type = NL80211_KEYTYPE_PAIRWISE;
+ else
+ key.type = NL80211_KEYTYPE_GROUP;
+ }
+
+ /* for now */
+ if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+ key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
+
+ if (!rdev->ops->add_key)
+ return -EOPNOTSUPP;
+
+ if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
+ mac_addr))
+ return -EINVAL;
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = nl80211_key_allowed(dev->ieee80211_ptr);
+ if (!err)
+ err = rdev_add_key(rdev, dev, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
+ mac_addr, &key.p);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return err;
+}
+
+static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ u8 *mac_addr = NULL;
+ struct key_parse key;
+
+ err = nl80211_parse_key(info, &key);
+ if (err)
+ return err;
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (key.type == -1) {
+ if (mac_addr)
+ key.type = NL80211_KEYTYPE_PAIRWISE;
+ else
+ key.type = NL80211_KEYTYPE_GROUP;
+ }
+
+ /* for now */
+ if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+ key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
+
+ if (!rdev->ops->del_key)
+ return -EOPNOTSUPP;
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = nl80211_key_allowed(dev->ieee80211_ptr);
+
+ if (key.type == NL80211_KEYTYPE_GROUP && mac_addr &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ err = -ENOENT;
+
+ if (!err)
+ err = rdev_del_key(rdev, dev, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
+ mac_addr);
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (!err) {
+ if (key.idx == dev->ieee80211_ptr->wext.default_key)
+ dev->ieee80211_ptr->wext.default_key = -1;
+ else if (key.idx == dev->ieee80211_ptr->wext.default_mgmt_key)
+ dev->ieee80211_ptr->wext.default_mgmt_key = -1;
+ }
+#endif
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return err;
+}
+
+/* This function returns an error or the number of nested attributes */
+static int validate_acl_mac_addrs(struct nlattr *nl_attr)
+{
+ struct nlattr *attr;
+ int n_entries = 0, tmp;
+
+ nla_for_each_nested(attr, nl_attr, tmp) {
+ if (nla_len(attr) != ETH_ALEN)
+ return -EINVAL;
+
+ n_entries++;
+ }
+
+ return n_entries;
+}
+
+/*
+ * This function parses ACL information and allocates memory for ACL data.
+ * On successful return, the calling function is responsible to free the
+ * ACL buffer returned by this function.
+ */
+static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy,
+ struct genl_info *info)
+{
+ enum nl80211_acl_policy acl_policy;
+ struct nlattr *attr;
+ struct cfg80211_acl_data *acl;
+ int i = 0, n_entries, tmp;
+
+ if (!wiphy->max_acl_mac_addrs)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!info->attrs[NL80211_ATTR_ACL_POLICY])
+ return ERR_PTR(-EINVAL);
+
+ acl_policy = nla_get_u32(info->attrs[NL80211_ATTR_ACL_POLICY]);
+ if (acl_policy != NL80211_ACL_POLICY_ACCEPT_UNLESS_LISTED &&
+ acl_policy != NL80211_ACL_POLICY_DENY_UNLESS_LISTED)
+ return ERR_PTR(-EINVAL);
+
+ if (!info->attrs[NL80211_ATTR_MAC_ADDRS])
+ return ERR_PTR(-EINVAL);
+
+ n_entries = validate_acl_mac_addrs(info->attrs[NL80211_ATTR_MAC_ADDRS]);
+ if (n_entries < 0)
+ return ERR_PTR(n_entries);
+
+ if (n_entries > wiphy->max_acl_mac_addrs)
+ return ERR_PTR(-ENOTSUPP);
+
+ acl = kzalloc(sizeof(*acl) + (sizeof(struct mac_address) * n_entries),
+ GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+
+ nla_for_each_nested(attr, info->attrs[NL80211_ATTR_MAC_ADDRS], tmp) {
+ memcpy(acl->mac_addrs[i].addr, nla_data(attr), ETH_ALEN);
+ i++;
+ }
+
+ acl->n_acl_entries = n_entries;
+ acl->acl_policy = acl_policy;
+
+ return acl;
+}
+
+static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct cfg80211_acl_data *acl;
+ int err;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ if (!dev->ieee80211_ptr->beacon_interval)
+ return -EINVAL;
+
+ acl = parse_acl_data(&rdev->wiphy, info);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ err = rdev_set_mac_acl(rdev, dev, acl);
+
+ kfree(acl);
+
+ return err;
+}
+
+static int nl80211_parse_beacon(struct nlattr *attrs[],
+ struct cfg80211_beacon_data *bcn)
+{
+ bool haveinfo = false;
+
+ if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) ||
+ !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) ||
+ !is_valid_ie_attr(attrs[NL80211_ATTR_IE_PROBE_RESP]) ||
+ !is_valid_ie_attr(attrs[NL80211_ATTR_IE_ASSOC_RESP]))
+ return -EINVAL;
+
+ memset(bcn, 0, sizeof(*bcn));
+
+ if (attrs[NL80211_ATTR_BEACON_HEAD]) {
+ bcn->head = nla_data(attrs[NL80211_ATTR_BEACON_HEAD]);
+ bcn->head_len = nla_len(attrs[NL80211_ATTR_BEACON_HEAD]);
+ if (!bcn->head_len)
+ return -EINVAL;
+ haveinfo = true;
+ }
+
+ if (attrs[NL80211_ATTR_BEACON_TAIL]) {
+ bcn->tail = nla_data(attrs[NL80211_ATTR_BEACON_TAIL]);
+ bcn->tail_len = nla_len(attrs[NL80211_ATTR_BEACON_TAIL]);
+ haveinfo = true;
+ }
+
+ if (!haveinfo)
+ return -EINVAL;
+
+ if (attrs[NL80211_ATTR_IE]) {
+ bcn->beacon_ies = nla_data(attrs[NL80211_ATTR_IE]);
+ bcn->beacon_ies_len = nla_len(attrs[NL80211_ATTR_IE]);
+ }
+
+ if (attrs[NL80211_ATTR_IE_PROBE_RESP]) {
+ bcn->proberesp_ies =
+ nla_data(attrs[NL80211_ATTR_IE_PROBE_RESP]);
+ bcn->proberesp_ies_len =
+ nla_len(attrs[NL80211_ATTR_IE_PROBE_RESP]);
+ }
+
+ if (attrs[NL80211_ATTR_IE_ASSOC_RESP]) {
+ bcn->assocresp_ies =
+ nla_data(attrs[NL80211_ATTR_IE_ASSOC_RESP]);
+ bcn->assocresp_ies_len =
+ nla_len(attrs[NL80211_ATTR_IE_ASSOC_RESP]);
+ }
+
+ if (attrs[NL80211_ATTR_PROBE_RESP]) {
+ bcn->probe_resp = nla_data(attrs[NL80211_ATTR_PROBE_RESP]);
+ bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]);
+ }
+
+ return 0;
+}
+
+static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
+ struct cfg80211_ap_settings *params)
+{
+ struct wireless_dev *wdev;
+ bool ret = false;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)
+ continue;
+
+ if (!wdev->preset_chandef.chan)
+ continue;
+
+ params->chandef = wdev->preset_chandef;
+ ret = true;
+ break;
+ }
+
+ return ret;
+}
+
+static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
+ enum nl80211_auth_type auth_type,
+ enum nl80211_commands cmd)
+{
+ if (auth_type > NL80211_AUTHTYPE_MAX)
+ return false;
+
+ switch (cmd) {
+ case NL80211_CMD_AUTHENTICATE:
+ if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+ auth_type == NL80211_AUTHTYPE_SAE)
+ return false;
+ return true;
+ case NL80211_CMD_CONNECT:
+ case NL80211_CMD_START_AP:
+ /* SAE not supported yet */
+ if (auth_type == NL80211_AUTHTYPE_SAE)
+ return false;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_ap_settings params;
+ int err;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->start_ap)
+ return -EOPNOTSUPP;
+
+ if (wdev->beacon_interval)
+ return -EALREADY;
+
+ memset(&params, 0, sizeof(params));
+
+ /* these are required for START_AP */
+ if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
+ !info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
+ !info->attrs[NL80211_ATTR_BEACON_HEAD])
+ return -EINVAL;
+
+ err = nl80211_parse_beacon(info->attrs, &params.beacon);
+ if (err)
+ return err;
+
+ params.beacon_interval =
+ nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
+ params.dtim_period =
+ nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
+
+ err = cfg80211_validate_beacon_int(rdev, params.beacon_interval);
+ if (err)
+ return err;
+
+ /*
+ * In theory, some of these attributes should be required here
+ * but since they were not used when the command was originally
+ * added, keep them optional for old user space programs to let
+ * them continue to work with drivers that do not need the
+ * additional information -- drivers must check!
+ */
+ if (info->attrs[NL80211_ATTR_SSID]) {
+ params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+ params.ssid_len =
+ nla_len(info->attrs[NL80211_ATTR_SSID]);
+ if (params.ssid_len == 0 ||
+ params.ssid_len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) {
+ params.hidden_ssid = nla_get_u32(
+ info->attrs[NL80211_ATTR_HIDDEN_SSID]);
+ if (params.hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE &&
+ params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_LEN &&
+ params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_CONTENTS)
+ return -EINVAL;
+ }
+
+ params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
+
+ if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
+ params.auth_type = nla_get_u32(
+ info->attrs[NL80211_ATTR_AUTH_TYPE]);
+ if (!nl80211_valid_auth_type(rdev, params.auth_type,
+ NL80211_CMD_START_AP))
+ return -EINVAL;
+ } else
+ params.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+
+ err = nl80211_crypto_settings(rdev, info, &params.crypto,
+ NL80211_MAX_NR_CIPHER_SUITES);
+ if (err)
+ return err;
+
+ if (info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]) {
+ if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER))
+ return -EOPNOTSUPP;
+ params.inactivity_timeout = nla_get_u16(
+ info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]);
+ }
+
+ if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) {
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+ params.p2p_ctwindow =
+ nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
+ if (params.p2p_ctwindow > 127)
+ return -EINVAL;
+ if (params.p2p_ctwindow != 0 &&
+ !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_P2P_OPPPS]) {
+ u8 tmp;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+ tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
+ if (tmp > 1)
+ return -EINVAL;
+ params.p2p_opp_ps = tmp;
+ if (params.p2p_opp_ps != 0 &&
+ !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+ err = nl80211_parse_chandef(rdev, info, &params.chandef);
+ if (err)
+ return err;
+ } else if (wdev->preset_chandef.chan) {
+ params.chandef = wdev->preset_chandef;
+ } else if (!nl80211_get_ap_channel(rdev, &params))
+ return -EINVAL;
+
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
+ wdev->iftype))
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
+ params.acl = parse_acl_data(&rdev->wiphy, info);
+ if (IS_ERR(params.acl))
+ return PTR_ERR(params.acl);
+ }
+
+ if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
+ params.smps_mode =
+ nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]);
+ switch (params.smps_mode) {
+ case NL80211_SMPS_OFF:
+ break;
+ case NL80211_SMPS_STATIC:
+ if (!(rdev->wiphy.features &
+ NL80211_FEATURE_STATIC_SMPS))
+ return -EINVAL;
+ break;
+ case NL80211_SMPS_DYNAMIC:
+ if (!(rdev->wiphy.features &
+ NL80211_FEATURE_DYNAMIC_SMPS))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ } else {
+ params.smps_mode = NL80211_SMPS_OFF;
+ }
+
+ wdev_lock(wdev);
+ err = rdev_start_ap(rdev, dev, &params);
+ if (!err) {
+ wdev->preset_chandef = params.chandef;
+ wdev->beacon_interval = params.beacon_interval;
+ wdev->chandef = params.chandef;
+ wdev->ssid_len = params.ssid_len;
+ memcpy(wdev->ssid, params.ssid, wdev->ssid_len);
+ }
+ wdev_unlock(wdev);
+
+ kfree(params.acl);
+
+ return err;
+}
+
+static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_beacon_data params;
+ int err;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->change_beacon)
+ return -EOPNOTSUPP;
+
+ if (!wdev->beacon_interval)
+ return -EINVAL;
+
+ err = nl80211_parse_beacon(info->attrs, &params);
+ if (err)
+ return err;
+
+ wdev_lock(wdev);
+ err = rdev_change_beacon(rdev, dev, &params);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+
+ return cfg80211_stop_ap(rdev, dev, false);
+}
+
+static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
+ [NL80211_STA_FLAG_AUTHORIZED] = { .type = NLA_FLAG },
+ [NL80211_STA_FLAG_SHORT_PREAMBLE] = { .type = NLA_FLAG },
+ [NL80211_STA_FLAG_WME] = { .type = NLA_FLAG },
+ [NL80211_STA_FLAG_MFP] = { .type = NLA_FLAG },
+ [NL80211_STA_FLAG_AUTHENTICATED] = { .type = NLA_FLAG },
+ [NL80211_STA_FLAG_TDLS_PEER] = { .type = NLA_FLAG },
+};
+
+static int parse_station_flags(struct genl_info *info,
+ enum nl80211_iftype iftype,
+ struct station_parameters *params)
+{
+ struct nlattr *flags[NL80211_STA_FLAG_MAX + 1];
+ struct nlattr *nla;
+ int flag;
+
+ /*
+ * Try parsing the new attribute first so userspace
+ * can specify both for older kernels.
+ */
+ nla = info->attrs[NL80211_ATTR_STA_FLAGS2];
+ if (nla) {
+ struct nl80211_sta_flag_update *sta_flags;
+
+ sta_flags = nla_data(nla);
+ params->sta_flags_mask = sta_flags->mask;
+ params->sta_flags_set = sta_flags->set;
+ params->sta_flags_set &= params->sta_flags_mask;
+ if ((params->sta_flags_mask |
+ params->sta_flags_set) & BIT(__NL80211_STA_FLAG_INVALID))
+ return -EINVAL;
+ return 0;
+ }
+
+ /* if present, parse the old attribute */
+
+ nla = info->attrs[NL80211_ATTR_STA_FLAGS];
+ if (!nla)
+ return 0;
+
+ if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX,
+ nla, sta_flags_policy))
+ return -EINVAL;
+
+ /*
+ * Only allow certain flags for interface types so that
+ * other attributes are silently ignored. Remember that
+ * this is backward compatibility code with old userspace
+ * and shouldn't be hit in other cases anyway.
+ */
+ switch (iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
+ BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
+ BIT(NL80211_STA_FLAG_WME) |
+ BIT(NL80211_STA_FLAG_MFP);
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_STATION:
+ params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
+ BIT(NL80211_STA_FLAG_TDLS_PEER);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_MFP) |
+ BIT(NL80211_STA_FLAG_AUTHORIZED);
+ default:
+ return -EINVAL;
+ }
+
+ for (flag = 1; flag <= NL80211_STA_FLAG_MAX; flag++) {
+ if (flags[flag]) {
+ params->sta_flags_set |= (1<<flag);
+
+ /* no longer support new API additions in old API */
+ if (flag > NL80211_STA_FLAG_MAX_OLD_API)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
+ int attr)
+{
+ struct nlattr *rate;
+ u32 bitrate;
+ u16 bitrate_compat;
+ enum nl80211_attrs rate_flg;
+
+ rate = nla_nest_start(msg, attr);
+ if (!rate)
+ return false;
+
+ /* cfg80211_calculate_bitrate will return 0 for mcs >= 32 */
+ bitrate = cfg80211_calculate_bitrate(info);
+ /* report 16-bit bitrate only if we can */
+ bitrate_compat = bitrate < (1UL << 16) ? bitrate : 0;
+ if (bitrate > 0 &&
+ nla_put_u32(msg, NL80211_RATE_INFO_BITRATE32, bitrate))
+ return false;
+ if (bitrate_compat > 0 &&
+ nla_put_u16(msg, NL80211_RATE_INFO_BITRATE, bitrate_compat))
+ return false;
+
+ switch (info->bw) {
+ case RATE_INFO_BW_5:
+ rate_flg = NL80211_RATE_INFO_5_MHZ_WIDTH;
+ break;
+ case RATE_INFO_BW_10:
+ rate_flg = NL80211_RATE_INFO_10_MHZ_WIDTH;
+ break;
+ default:
+ WARN_ON(1);
+ /* fall through */
+ case RATE_INFO_BW_20:
+ rate_flg = 0;
+ break;
+ case RATE_INFO_BW_40:
+ rate_flg = NL80211_RATE_INFO_40_MHZ_WIDTH;
+ break;
+ case RATE_INFO_BW_80:
+ rate_flg = NL80211_RATE_INFO_80_MHZ_WIDTH;
+ break;
+ case RATE_INFO_BW_160:
+ rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
+ break;
+ }
+
+ if (rate_flg && nla_put_flag(msg, rate_flg))
+ return false;
+
+ if (info->flags & RATE_INFO_FLAGS_MCS) {
+ if (nla_put_u8(msg, NL80211_RATE_INFO_MCS, info->mcs))
+ return false;
+ if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
+ nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
+ return false;
+ } else if (info->flags & RATE_INFO_FLAGS_VHT_MCS) {
+ if (nla_put_u8(msg, NL80211_RATE_INFO_VHT_MCS, info->mcs))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_VHT_NSS, info->nss))
+ return false;
+ if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
+ nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
+ return false;
+ }
+
+ nla_nest_end(msg, rate);
+ return true;
+}
+
+static bool nl80211_put_signal(struct sk_buff *msg, u8 mask, s8 *signal,
+ int id)
+{
+ void *attr;
+ int i = 0;
+
+ if (!mask)
+ return true;
+
+ attr = nla_nest_start(msg, id);
+ if (!attr)
+ return false;
+
+ for (i = 0; i < IEEE80211_MAX_CHAINS; i++) {
+ if (!(mask & BIT(i)))
+ continue;
+
+ if (nla_put_u8(msg, i, signal[i]))
+ return false;
+ }
+
+ nla_nest_end(msg, attr);
+
+ return true;
+}
+
+static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
+ u32 seq, int flags,
+ struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ const u8 *mac_addr, struct station_info *sinfo)
+{
+ void *hdr;
+ struct nlattr *sinfoattr, *bss_param;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr) ||
+ nla_put_u32(msg, NL80211_ATTR_GENERATION, sinfo->generation))
+ goto nla_put_failure;
+
+ sinfoattr = nla_nest_start(msg, NL80211_ATTR_STA_INFO);
+ if (!sinfoattr)
+ goto nla_put_failure;
+
+#define PUT_SINFO(attr, memb, type) do { \
+ if (sinfo->filled & BIT(NL80211_STA_INFO_ ## attr) && \
+ nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \
+ sinfo->memb)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_SINFO(CONNECTED_TIME, connected_time, u32);
+ PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
+
+ if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) |
+ BIT(NL80211_STA_INFO_RX_BYTES64)) &&
+ nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES,
+ (u32)sinfo->rx_bytes))
+ goto nla_put_failure;
+
+ if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) |
+ BIT(NL80211_STA_INFO_TX_BYTES64)) &&
+ nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES,
+ (u32)sinfo->tx_bytes))
+ goto nla_put_failure;
+
+ PUT_SINFO(RX_BYTES64, rx_bytes, u64);
+ PUT_SINFO(TX_BYTES64, tx_bytes, u64);
+ PUT_SINFO(LLID, llid, u16);
+ PUT_SINFO(PLID, plid, u16);
+ PUT_SINFO(PLINK_STATE, plink_state, u8);
+
+ switch (rdev->wiphy.signal_type) {
+ case CFG80211_SIGNAL_TYPE_MBM:
+ PUT_SINFO(SIGNAL, signal, u8);
+ PUT_SINFO(SIGNAL_AVG, signal_avg, u8);
+ break;
+ default:
+ break;
+ }
+ if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) {
+ if (!nl80211_put_signal(msg, sinfo->chains,
+ sinfo->chain_signal,
+ NL80211_STA_INFO_CHAIN_SIGNAL))
+ goto nla_put_failure;
+ }
+ if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
+ if (!nl80211_put_signal(msg, sinfo->chains,
+ sinfo->chain_signal_avg,
+ NL80211_STA_INFO_CHAIN_SIGNAL_AVG))
+ goto nla_put_failure;
+ }
+ if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
+ NL80211_STA_INFO_TX_BITRATE))
+ goto nla_put_failure;
+ }
+ if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) {
+ if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
+ NL80211_STA_INFO_RX_BITRATE))
+ goto nla_put_failure;
+ }
+
+ PUT_SINFO(RX_PACKETS, rx_packets, u32);
+ PUT_SINFO(TX_PACKETS, tx_packets, u32);
+ PUT_SINFO(TX_RETRIES, tx_retries, u32);
+ PUT_SINFO(TX_FAILED, tx_failed, u32);
+ PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32);
+ PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32);
+ PUT_SINFO(LOCAL_PM, local_pm, u32);
+ PUT_SINFO(PEER_PM, peer_pm, u32);
+ PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
+
+ if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) {
+ bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
+ if (!bss_param)
+ goto nla_put_failure;
+
+ if (((sinfo->bss_param.flags & BSS_PARAM_FLAGS_CTS_PROT) &&
+ nla_put_flag(msg, NL80211_STA_BSS_PARAM_CTS_PROT)) ||
+ ((sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_PREAMBLE) &&
+ nla_put_flag(msg, NL80211_STA_BSS_PARAM_SHORT_PREAMBLE)) ||
+ ((sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_SLOT_TIME) &&
+ nla_put_flag(msg, NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME)) ||
+ nla_put_u8(msg, NL80211_STA_BSS_PARAM_DTIM_PERIOD,
+ sinfo->bss_param.dtim_period) ||
+ nla_put_u16(msg, NL80211_STA_BSS_PARAM_BEACON_INTERVAL,
+ sinfo->bss_param.beacon_interval))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, bss_param);
+ }
+ if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) &&
+ nla_put(msg, NL80211_STA_INFO_STA_FLAGS,
+ sizeof(struct nl80211_sta_flag_update),
+ &sinfo->sta_flags))
+ goto nla_put_failure;
+
+ PUT_SINFO(T_OFFSET, t_offset, u64);
+ PUT_SINFO(RX_DROP_MISC, rx_dropped_misc, u64);
+ PUT_SINFO(BEACON_RX, rx_beacon, u64);
+ PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
+
+#undef PUT_SINFO
+
+ if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) {
+ struct nlattr *tidsattr;
+ int tid;
+
+ tidsattr = nla_nest_start(msg, NL80211_STA_INFO_TID_STATS);
+ if (!tidsattr)
+ goto nla_put_failure;
+
+ for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) {
+ struct cfg80211_tid_stats *tidstats;
+ struct nlattr *tidattr;
+
+ tidstats = &sinfo->pertid[tid];
+
+ if (!tidstats->filled)
+ continue;
+
+ tidattr = nla_nest_start(msg, tid + 1);
+ if (!tidattr)
+ goto nla_put_failure;
+
+#define PUT_TIDVAL(attr, memb, type) do { \
+ if (tidstats->filled & BIT(NL80211_TID_STATS_ ## attr) && \
+ nla_put_ ## type(msg, NL80211_TID_STATS_ ## attr, \
+ tidstats->memb)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_TIDVAL(RX_MSDU, rx_msdu, u64);
+ PUT_TIDVAL(TX_MSDU, tx_msdu, u64);
+ PUT_TIDVAL(TX_MSDU_RETRIES, tx_msdu_retries, u64);
+ PUT_TIDVAL(TX_MSDU_FAILED, tx_msdu_failed, u64);
+
+#undef PUT_TIDVAL
+ nla_nest_end(msg, tidattr);
+ }
+
+ nla_nest_end(msg, tidsattr);
+ }
+
+ nla_nest_end(msg, sinfoattr);
+
+ if (sinfo->assoc_req_ies_len &&
+ nla_put(msg, NL80211_ATTR_IE, sinfo->assoc_req_ies_len,
+ sinfo->assoc_req_ies))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_station(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct station_info sinfo;
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ u8 mac_addr[ETH_ALEN];
+ int sta_idx = cb->args[2];
+ int err;
+
+ err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ if (err)
+ return err;
+
+ if (!wdev->netdev) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ if (!rdev->ops->dump_station) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ while (1) {
+ memset(&sinfo, 0, sizeof(sinfo));
+ err = rdev_dump_station(rdev, wdev->netdev, sta_idx,
+ mac_addr, &sinfo);
+ if (err == -ENOENT)
+ break;
+ if (err)
+ goto out_err;
+
+ if (nl80211_send_station(skb, NL80211_CMD_NEW_STATION,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wdev->netdev, mac_addr,
+ &sinfo) < 0)
+ goto out;
+
+ sta_idx++;
+ }
+
+
+ out:
+ cb->args[2] = sta_idx;
+ err = skb->len;
+ out_err:
+ nl80211_finish_wdev_dump(rdev);
+
+ return err;
+}
+
+static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct station_info sinfo;
+ struct sk_buff *msg;
+ u8 *mac_addr = NULL;
+ int err;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (!rdev->ops->get_station)
+ return -EOPNOTSUPP;
+
+ err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
+ info->snd_portid, info->snd_seq, 0,
+ rdev, dev, mac_addr, &sinfo) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+int cfg80211_check_station_change(struct wiphy *wiphy,
+ struct station_parameters *params,
+ enum cfg80211_station_type statype)
+{
+ if (params->listen_interval != -1)
+ return -EINVAL;
+ if (params->aid &&
+ !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
+ return -EINVAL;
+
+ /* When you run into this, adjust the code below for the new flag */
+ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
+
+ switch (statype) {
+ case CFG80211_STA_MESH_PEER_KERNEL:
+ case CFG80211_STA_MESH_PEER_USER:
+ /*
+ * No ignoring the TDLS flag here -- the userspace mesh
+ * code doesn't have the bug of including TDLS in the
+ * mask everywhere.
+ */
+ if (params->sta_flags_mask &
+ ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_MFP) |
+ BIT(NL80211_STA_FLAG_AUTHORIZED)))
+ return -EINVAL;
+ break;
+ case CFG80211_STA_TDLS_PEER_SETUP:
+ case CFG80211_STA_TDLS_PEER_ACTIVE:
+ if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
+ return -EINVAL;
+ /* ignore since it can't change */
+ params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
+ break;
+ default:
+ /* disallow mesh-specific things */
+ if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION)
+ return -EINVAL;
+ if (params->local_pm)
+ return -EINVAL;
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE)
+ return -EINVAL;
+ }
+
+ if (statype != CFG80211_STA_TDLS_PEER_SETUP &&
+ statype != CFG80211_STA_TDLS_PEER_ACTIVE) {
+ /* TDLS can't be set, ... */
+ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
+ return -EINVAL;
+ /*
+ * ... but don't bother the driver with it. This works around
+ * a hostapd/wpa_supplicant issue -- it always includes the
+ * TLDS_PEER flag in the mask even for AP mode.
+ */
+ params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
+ }
+
+ if (statype != CFG80211_STA_TDLS_PEER_SETUP) {
+ /* reject other things that can't change */
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD)
+ return -EINVAL;
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_CAPABILITY)
+ return -EINVAL;
+ if (params->supported_rates)
+ return -EINVAL;
+ if (params->ext_capab || params->ht_capa || params->vht_capa)
+ return -EINVAL;
+ }
+
+ if (statype != CFG80211_STA_AP_CLIENT) {
+ if (params->vlan)
+ return -EINVAL;
+ }
+
+ switch (statype) {
+ case CFG80211_STA_AP_MLME_CLIENT:
+ /* Use this only for authorizing/unauthorizing a station */
+ if (!(params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)))
+ return -EOPNOTSUPP;
+ break;
+ case CFG80211_STA_AP_CLIENT:
+ /* accept only the listed bits */
+ if (params->sta_flags_mask &
+ ~(BIT(NL80211_STA_FLAG_AUTHORIZED) |
+ BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED) |
+ BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
+ BIT(NL80211_STA_FLAG_WME) |
+ BIT(NL80211_STA_FLAG_MFP)))
+ return -EINVAL;
+
+ /* but authenticated/associated only if driver handles it */
+ if (!(wiphy->features & NL80211_FEATURE_FULL_AP_CLIENT_STATE) &&
+ params->sta_flags_mask &
+ (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED)))
+ return -EINVAL;
+ break;
+ case CFG80211_STA_IBSS:
+ case CFG80211_STA_AP_STA:
+ /* reject any changes other than AUTHORIZED */
+ if (params->sta_flags_mask & ~BIT(NL80211_STA_FLAG_AUTHORIZED))
+ return -EINVAL;
+ break;
+ case CFG80211_STA_TDLS_PEER_SETUP:
+ /* reject any changes other than AUTHORIZED or WME */
+ if (params->sta_flags_mask & ~(BIT(NL80211_STA_FLAG_AUTHORIZED) |
+ BIT(NL80211_STA_FLAG_WME)))
+ return -EINVAL;
+ /* force (at least) rates when authorizing */
+ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_AUTHORIZED) &&
+ !params->supported_rates)
+ return -EINVAL;
+ break;
+ case CFG80211_STA_TDLS_PEER_ACTIVE:
+ /* reject any changes */
+ return -EINVAL;
+ case CFG80211_STA_MESH_PEER_KERNEL:
+ if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE)
+ return -EINVAL;
+ break;
+ case CFG80211_STA_MESH_PEER_USER:
+ if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION)
+ return -EINVAL;
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_check_station_change);
+
+/*
+ * Get vlan interface making sure it is running and on the right wiphy.
+ */
+static struct net_device *get_vlan(struct genl_info *info,
+ struct cfg80211_registered_device *rdev)
+{
+ struct nlattr *vlanattr = info->attrs[NL80211_ATTR_STA_VLAN];
+ struct net_device *v;
+ int ret;
+
+ if (!vlanattr)
+ return NULL;
+
+ v = dev_get_by_index(genl_info_net(info), nla_get_u32(vlanattr));
+ if (!v)
+ return ERR_PTR(-ENODEV);
+
+ if (!v->ieee80211_ptr || v->ieee80211_ptr->wiphy != &rdev->wiphy) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+ v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ v->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!netif_running(v)) {
+ ret = -ENETDOWN;
+ goto error;
+ }
+
+ return v;
+ error:
+ dev_put(v);
+ return ERR_PTR(ret);
+}
+
+static const struct nla_policy
+nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = {
+ [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 },
+ [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 },
+};
+
+static int nl80211_parse_sta_wme(struct genl_info *info,
+ struct station_parameters *params)
+{
+ struct nlattr *tb[NL80211_STA_WME_MAX + 1];
+ struct nlattr *nla;
+ int err;
+
+ /* parse WME attributes if present */
+ if (!info->attrs[NL80211_ATTR_STA_WME])
+ return 0;
+
+ nla = info->attrs[NL80211_ATTR_STA_WME];
+ err = nla_parse_nested(tb, NL80211_STA_WME_MAX, nla,
+ nl80211_sta_wme_policy);
+ if (err)
+ return err;
+
+ if (tb[NL80211_STA_WME_UAPSD_QUEUES])
+ params->uapsd_queues = nla_get_u8(
+ tb[NL80211_STA_WME_UAPSD_QUEUES]);
+ if (params->uapsd_queues & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK)
+ return -EINVAL;
+
+ if (tb[NL80211_STA_WME_MAX_SP])
+ params->max_sp = nla_get_u8(tb[NL80211_STA_WME_MAX_SP]);
+
+ if (params->max_sp & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK)
+ return -EINVAL;
+
+ params->sta_modify_mask |= STATION_PARAM_APPLY_UAPSD;
+
+ return 0;
+}
+
+static int nl80211_parse_sta_channel_info(struct genl_info *info,
+ struct station_parameters *params)
+{
+ if (info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]) {
+ params->supported_channels =
+ nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
+ params->supported_channels_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS]);
+ /*
+ * Need to include at least one (first channel, number of
+ * channels) tuple for each subband, and must have proper
+ * tuples for the rest of the data as well.
+ */
+ if (params->supported_channels_len < 2)
+ return -EINVAL;
+ if (params->supported_channels_len % 2)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]) {
+ params->supported_oper_classes =
+ nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
+ params->supported_oper_classes_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES]);
+ /*
+ * The value of the Length field of the Supported Operating
+ * Classes element is between 2 and 253.
+ */
+ if (params->supported_oper_classes_len < 2 ||
+ params->supported_oper_classes_len > 253)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nl80211_set_station_tdls(struct genl_info *info,
+ struct station_parameters *params)
+{
+ int err;
+ /* Dummy STA entry gets updated once the peer capabilities are known */
+ if (info->attrs[NL80211_ATTR_PEER_AID])
+ params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
+ params->ht_capa =
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
+ params->vht_capa =
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+
+ err = nl80211_parse_sta_channel_info(info, params);
+ if (err)
+ return err;
+
+ return nl80211_parse_sta_wme(info, params);
+}
+
+static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct station_parameters params;
+ u8 *mac_addr;
+ int err;
+
+ memset(&params, 0, sizeof(params));
+
+ params.listen_interval = -1;
+
+ if (!rdev->ops->change_station)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL80211_ATTR_STA_AID])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
+ params.supported_rates =
+ nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+ params.supported_rates_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
+ params.capability =
+ nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]);
+ params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY;
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) {
+ params.ext_capab =
+ nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
+ params.ext_capab_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
+ return -EINVAL;
+
+ if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) {
+ params.plink_action =
+ nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
+ if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) {
+ params.plink_state =
+ nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
+ if (params.plink_state >= NUM_NL80211_PLINK_STATES)
+ return -EINVAL;
+ params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
+ }
+
+ if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) {
+ enum nl80211_mesh_power_mode pm = nla_get_u32(
+ info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]);
+
+ if (pm <= NL80211_MESH_POWER_UNKNOWN ||
+ pm > NL80211_MESH_POWER_MAX)
+ return -EINVAL;
+
+ params.local_pm = pm;
+ }
+
+ /* Include parameters for TDLS peer (will check later) */
+ err = nl80211_set_station_tdls(info, &params);
+ if (err)
+ return err;
+
+ params.vlan = get_vlan(info, rdev);
+ if (IS_ERR(params.vlan))
+ return PTR_ERR(params.vlan);
+
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_MESH_POINT:
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ goto out_put_vlan;
+ }
+
+ /* driver will call cfg80211_check_station_change() */
+ err = rdev_change_station(rdev, dev, mac_addr, &params);
+
+ out_put_vlan:
+ if (params.vlan)
+ dev_put(params.vlan);
+
+ return err;
+}
+
+static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ struct station_parameters params;
+ u8 *mac_addr = NULL;
+
+ memset(&params, 0, sizeof(params));
+
+ if (!rdev->ops->add_station)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_STA_AID] &&
+ !info->attrs[NL80211_ATTR_PEER_AID])
+ return -EINVAL;
+
+ mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ params.supported_rates =
+ nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+ params.supported_rates_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+ params.listen_interval =
+ nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+
+ if (info->attrs[NL80211_ATTR_PEER_AID])
+ params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
+ else
+ params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+ if (!params.aid || params.aid > IEEE80211_MAX_AID)
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
+ params.capability =
+ nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]);
+ params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY;
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) {
+ params.ext_capab =
+ nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
+ params.ext_capab_len =
+ nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
+ }
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
+ params.ht_capa =
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
+
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
+ params.vht_capa =
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+
+ if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
+ params.opmode_notif_used = true;
+ params.opmode_notif =
+ nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
+ }
+
+ if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) {
+ params.plink_action =
+ nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
+ if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
+ return -EINVAL;
+ }
+
+ err = nl80211_parse_sta_channel_info(info, &params);
+ if (err)
+ return err;
+
+ err = nl80211_parse_sta_wme(info, &params);
+ if (err)
+ return err;
+
+ if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
+ return -EINVAL;
+
+ /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT
+ * as userspace might just pass through the capabilities from the IEs
+ * directly, rather than enforcing this restriction and returning an
+ * error in this case.
+ */
+ if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
+ params.ht_capa = NULL;
+ params.vht_capa = NULL;
+ }
+
+ /* When you run into this, adjust the code below for the new flag */
+ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7);
+
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ /* ignore WME attributes if iface/sta is not capable */
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) ||
+ !(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME)))
+ params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;
+
+ /* TDLS peers cannot be added */
+ if ((params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) ||
+ info->attrs[NL80211_ATTR_PEER_AID])
+ return -EINVAL;
+ /* but don't bother the driver with it */
+ params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
+
+ /* allow authenticated/associated only if driver handles it */
+ if (!(rdev->wiphy.features &
+ NL80211_FEATURE_FULL_AP_CLIENT_STATE) &&
+ params.sta_flags_mask &
+ (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED)))
+ return -EINVAL;
+
+ /* must be last in here for error handling */
+ params.vlan = get_vlan(info, rdev);
+ if (IS_ERR(params.vlan))
+ return PTR_ERR(params.vlan);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ /* ignore uAPSD data */
+ params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;
+
+ /* associated is disallowed */
+ if (params.sta_flags_mask & BIT(NL80211_STA_FLAG_ASSOCIATED))
+ return -EINVAL;
+ /* TDLS peers cannot be added */
+ if ((params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) ||
+ info->attrs[NL80211_ATTR_PEER_AID])
+ return -EINVAL;
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ /* ignore uAPSD data */
+ params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;
+
+ /* these are disallowed */
+ if (params.sta_flags_mask &
+ (BIT(NL80211_STA_FLAG_ASSOCIATED) |
+ BIT(NL80211_STA_FLAG_AUTHENTICATED)))
+ return -EINVAL;
+ /* Only TDLS peers can be added */
+ if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
+ return -EINVAL;
+ /* Can only add if TDLS ... */
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS))
+ return -EOPNOTSUPP;
+ /* ... with external setup is supported */
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP))
+ return -EOPNOTSUPP;
+ /*
+ * Older wpa_supplicant versions always mark the TDLS peer
+ * as authorized, but it shouldn't yet be.
+ */
+ params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_AUTHORIZED);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* be aware of params.vlan when changing code here */
+
+ err = rdev_add_station(rdev, dev, mac_addr, &params);
+
+ if (params.vlan)
+ dev_put(params.vlan);
+ return err;
+}
+
+static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct station_del_parameters params;
+
+ memset(&params, 0, sizeof(params));
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+
+ if (!rdev->ops->del_station)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) {
+ params.subtype =
+ nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]);
+ if (params.subtype != IEEE80211_STYPE_DISASSOC >> 4 &&
+ params.subtype != IEEE80211_STYPE_DEAUTH >> 4)
+ return -EINVAL;
+ } else {
+ /* Default to Deauthentication frame */
+ params.subtype = IEEE80211_STYPE_DEAUTH >> 4;
+ }
+
+ if (info->attrs[NL80211_ATTR_REASON_CODE]) {
+ params.reason_code =
+ nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+ if (params.reason_code == 0)
+ return -EINVAL; /* 0 is reserved */
+ } else {
+ /* Default to reason code 2 */
+ params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID;
+ }
+
+ return rdev_del_station(rdev, dev, &params);
+}
+
+static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
+ int flags, struct net_device *dev,
+ u8 *dst, u8 *next_hop,
+ struct mpath_info *pinfo)
+{
+ void *hdr;
+ struct nlattr *pinfoattr;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, NL80211_CMD_NEW_MPATH);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, dst) ||
+ nla_put(msg, NL80211_ATTR_MPATH_NEXT_HOP, ETH_ALEN, next_hop) ||
+ nla_put_u32(msg, NL80211_ATTR_GENERATION, pinfo->generation))
+ goto nla_put_failure;
+
+ pinfoattr = nla_nest_start(msg, NL80211_ATTR_MPATH_INFO);
+ if (!pinfoattr)
+ goto nla_put_failure;
+ if ((pinfo->filled & MPATH_INFO_FRAME_QLEN) &&
+ nla_put_u32(msg, NL80211_MPATH_INFO_FRAME_QLEN,
+ pinfo->frame_qlen))
+ goto nla_put_failure;
+ if (((pinfo->filled & MPATH_INFO_SN) &&
+ nla_put_u32(msg, NL80211_MPATH_INFO_SN, pinfo->sn)) ||
+ ((pinfo->filled & MPATH_INFO_METRIC) &&
+ nla_put_u32(msg, NL80211_MPATH_INFO_METRIC,
+ pinfo->metric)) ||
+ ((pinfo->filled & MPATH_INFO_EXPTIME) &&
+ nla_put_u32(msg, NL80211_MPATH_INFO_EXPTIME,
+ pinfo->exptime)) ||
+ ((pinfo->filled & MPATH_INFO_FLAGS) &&
+ nla_put_u8(msg, NL80211_MPATH_INFO_FLAGS,
+ pinfo->flags)) ||
+ ((pinfo->filled & MPATH_INFO_DISCOVERY_TIMEOUT) &&
+ nla_put_u32(msg, NL80211_MPATH_INFO_DISCOVERY_TIMEOUT,
+ pinfo->discovery_timeout)) ||
+ ((pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES) &&
+ nla_put_u8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES,
+ pinfo->discovery_retries)))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, pinfoattr);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_mpath(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct mpath_info pinfo;
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ u8 dst[ETH_ALEN];
+ u8 next_hop[ETH_ALEN];
+ int path_idx = cb->args[2];
+ int err;
+
+ err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ if (err)
+ return err;
+
+ if (!rdev->ops->dump_mpath) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ while (1) {
+ err = rdev_dump_mpath(rdev, wdev->netdev, path_idx, dst,
+ next_hop, &pinfo);
+ if (err == -ENOENT)
+ break;
+ if (err)
+ goto out_err;
+
+ if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ wdev->netdev, dst, next_hop,
+ &pinfo) < 0)
+ goto out;
+
+ path_idx++;
+ }
+
+
+ out:
+ cb->args[2] = path_idx;
+ err = skb->len;
+ out_err:
+ nl80211_finish_wdev_dump(rdev);
+ return err;
+}
+
+static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ struct mpath_info pinfo;
+ struct sk_buff *msg;
+ u8 *dst = NULL;
+ u8 next_hop[ETH_ALEN];
+
+ memset(&pinfo, 0, sizeof(pinfo));
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (!rdev->ops->get_mpath)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ err = rdev_get_mpath(rdev, dev, dst, next_hop, &pinfo);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl80211_send_mpath(msg, info->snd_portid, info->snd_seq, 0,
+ dev, dst, next_hop, &pinfo) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u8 *dst = NULL;
+ u8 *next_hop = NULL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MPATH_NEXT_HOP])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
+
+ if (!rdev->ops->change_mpath)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ return rdev_change_mpath(rdev, dev, dst, next_hop);
+}
+
+static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u8 *dst = NULL;
+ u8 *next_hop = NULL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MPATH_NEXT_HOP])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
+
+ if (!rdev->ops->add_mpath)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ return rdev_add_mpath(rdev, dev, dst, next_hop);
+}
+
+static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u8 *dst = NULL;
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (!rdev->ops->del_mpath)
+ return -EOPNOTSUPP;
+
+ return rdev_del_mpath(rdev, dev, dst);
+}
+
+static int nl80211_get_mpp(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int err;
+ struct net_device *dev = info->user_ptr[1];
+ struct mpath_info pinfo;
+ struct sk_buff *msg;
+ u8 *dst = NULL;
+ u8 mpp[ETH_ALEN];
+
+ memset(&pinfo, 0, sizeof(pinfo));
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (!rdev->ops->get_mpp)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ err = rdev_get_mpp(rdev, dev, dst, mpp, &pinfo);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ if (nl80211_send_mpath(msg, info->snd_portid, info->snd_seq, 0,
+ dev, dst, mpp, &pinfo) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int nl80211_dump_mpp(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct mpath_info pinfo;
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ u8 dst[ETH_ALEN];
+ u8 mpp[ETH_ALEN];
+ int path_idx = cb->args[2];
+ int err;
+
+ err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ if (err)
+ return err;
+
+ if (!rdev->ops->dump_mpp) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ while (1) {
+ err = rdev_dump_mpp(rdev, wdev->netdev, path_idx, dst,
+ mpp, &pinfo);
+ if (err == -ENOENT)
+ break;
+ if (err)
+ goto out_err;
+
+ if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ wdev->netdev, dst, mpp,
+ &pinfo) < 0)
+ goto out;
+
+ path_idx++;
+ }
+
+ out:
+ cb->args[2] = path_idx;
+ err = skb->len;
+ out_err:
+ nl80211_finish_wdev_dump(rdev);
+ return err;
+}
+
+static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct bss_parameters params;
+ int err;
+
+ memset(&params, 0, sizeof(params));
+ /* default to not changing parameters */
+ params.use_cts_prot = -1;
+ params.use_short_preamble = -1;
+ params.use_short_slot_time = -1;
+ params.ap_isolate = -1;
+ params.ht_opmode = -1;
+ params.p2p_ctwindow = -1;
+ params.p2p_opp_ps = -1;
+
+ if (info->attrs[NL80211_ATTR_BSS_CTS_PROT])
+ params.use_cts_prot =
+ nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]);
+ if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE])
+ params.use_short_preamble =
+ nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]);
+ if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
+ params.use_short_slot_time =
+ nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);
+ if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+ params.basic_rates =
+ nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ params.basic_rates_len =
+ nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ }
+ if (info->attrs[NL80211_ATTR_AP_ISOLATE])
+ params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
+ if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE])
+ params.ht_opmode =
+ nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]);
+
+ if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) {
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+ params.p2p_ctwindow =
+ nla_get_s8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
+ if (params.p2p_ctwindow < 0)
+ return -EINVAL;
+ if (params.p2p_ctwindow != 0 &&
+ !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_P2P_OPPPS]) {
+ u8 tmp;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+ tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
+ if (tmp > 1)
+ return -EINVAL;
+ params.p2p_opp_ps = tmp;
+ if (params.p2p_opp_ps &&
+ !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
+ return -EINVAL;
+ }
+
+ if (!rdev->ops->change_bss)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ wdev_lock(wdev);
+ err = rdev_change_bss(rdev, dev, &params);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
+ [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 },
+ [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 },
+ [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 },
+ [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 },
+ [NL80211_ATTR_DFS_CAC_TIME] = { .type = NLA_U32 },
+};
+
+static int parse_reg_rule(struct nlattr *tb[],
+ struct ieee80211_reg_rule *reg_rule)
+{
+ struct ieee80211_freq_range *freq_range = &reg_rule->freq_range;
+ struct ieee80211_power_rule *power_rule = &reg_rule->power_rule;
+
+ if (!tb[NL80211_ATTR_REG_RULE_FLAGS])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_START])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_END])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW])
+ return -EINVAL;
+ if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
+ return -EINVAL;
+
+ reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);
+
+ freq_range->start_freq_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
+ freq_range->end_freq_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
+ freq_range->max_bandwidth_khz =
+ nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);
+
+ power_rule->max_eirp =
+ nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);
+
+ if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN])
+ power_rule->max_antenna_gain =
+ nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
+
+ if (tb[NL80211_ATTR_DFS_CAC_TIME])
+ reg_rule->dfs_cac_ms =
+ nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]);
+
+ return 0;
+}
+
+static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+ char *data = NULL;
+ bool is_indoor;
+ enum nl80211_user_reg_hint_type user_reg_hint_type;
+ u32 owner_nlportid;
+
+
+ /*
+ * You should only get this when cfg80211 hasn't yet initialized
+ * completely when built-in to the kernel right between the time
+ * window between nl80211_init() and regulatory_init(), if that is
+ * even possible.
+ */
+ if (unlikely(!rcu_access_pointer(cfg80211_regdomain)))
+ return -EINPROGRESS;
+
+ if (info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE])
+ user_reg_hint_type =
+ nla_get_u32(info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]);
+ else
+ user_reg_hint_type = NL80211_USER_REG_HINT_USER;
+
+ switch (user_reg_hint_type) {
+ case NL80211_USER_REG_HINT_USER:
+ case NL80211_USER_REG_HINT_CELL_BASE:
+ if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
+ return -EINVAL;
+
+ data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
+ return regulatory_hint_user(data, user_reg_hint_type);
+ case NL80211_USER_REG_HINT_INDOOR:
+ if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+ owner_nlportid = info->snd_portid;
+ is_indoor = !!info->attrs[NL80211_ATTR_REG_INDOOR];
+ } else {
+ owner_nlportid = 0;
+ is_indoor = true;
+ }
+
+ return regulatory_hint_indoor(is_indoor, owner_nlportid);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int nl80211_get_mesh_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct mesh_config cur_params;
+ int err = 0;
+ void *hdr;
+ struct nlattr *pinfoattr;
+ struct sk_buff *msg;
+
+ if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->get_mesh_config)
+ return -EOPNOTSUPP;
+
+ wdev_lock(wdev);
+ /* If not connected, get default parameters */
+ if (!wdev->mesh_id_len)
+ memcpy(&cur_params, &default_mesh_config, sizeof(cur_params));
+ else
+ err = rdev_get_mesh_config(rdev, dev, &cur_params);
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
+
+ /* Draw up a netlink message to send back */
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_MESH_CONFIG);
+ if (!hdr)
+ goto out;
+ pinfoattr = nla_nest_start(msg, NL80211_ATTR_MESH_CONFIG);
+ if (!pinfoattr)
+ goto nla_put_failure;
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_u16(msg, NL80211_MESHCONF_RETRY_TIMEOUT,
+ cur_params.dot11MeshRetryTimeout) ||
+ nla_put_u16(msg, NL80211_MESHCONF_CONFIRM_TIMEOUT,
+ cur_params.dot11MeshConfirmTimeout) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HOLDING_TIMEOUT,
+ cur_params.dot11MeshHoldingTimeout) ||
+ nla_put_u16(msg, NL80211_MESHCONF_MAX_PEER_LINKS,
+ cur_params.dot11MeshMaxPeerLinks) ||
+ nla_put_u8(msg, NL80211_MESHCONF_MAX_RETRIES,
+ cur_params.dot11MeshMaxRetries) ||
+ nla_put_u8(msg, NL80211_MESHCONF_TTL,
+ cur_params.dot11MeshTTL) ||
+ nla_put_u8(msg, NL80211_MESHCONF_ELEMENT_TTL,
+ cur_params.element_ttl) ||
+ nla_put_u8(msg, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+ cur_params.auto_open_plinks) ||
+ nla_put_u32(msg, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
+ cur_params.dot11MeshNbrOffsetMaxNeighbor) ||
+ nla_put_u8(msg, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+ cur_params.dot11MeshHWMPmaxPREQretries) ||
+ nla_put_u32(msg, NL80211_MESHCONF_PATH_REFRESH_TIME,
+ cur_params.path_refresh_time) ||
+ nla_put_u16(msg, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+ cur_params.min_discovery_timeout) ||
+ nla_put_u32(msg, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+ cur_params.dot11MeshHWMPactivePathTimeout) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+ cur_params.dot11MeshHWMPpreqMinInterval) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
+ cur_params.dot11MeshHWMPperrMinInterval) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+ cur_params.dot11MeshHWMPnetDiameterTraversalTime) ||
+ nla_put_u8(msg, NL80211_MESHCONF_HWMP_ROOTMODE,
+ cur_params.dot11MeshHWMPRootMode) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_RANN_INTERVAL,
+ cur_params.dot11MeshHWMPRannInterval) ||
+ nla_put_u8(msg, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
+ cur_params.dot11MeshGateAnnouncementProtocol) ||
+ nla_put_u8(msg, NL80211_MESHCONF_FORWARDING,
+ cur_params.dot11MeshForwarding) ||
+ nla_put_u32(msg, NL80211_MESHCONF_RSSI_THRESHOLD,
+ cur_params.rssi_threshold) ||
+ nla_put_u32(msg, NL80211_MESHCONF_HT_OPMODE,
+ cur_params.ht_opmode) ||
+ nla_put_u32(msg, NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
+ cur_params.dot11MeshHWMPactivePathToRootTimeout) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
+ cur_params.dot11MeshHWMProotInterval) ||
+ nla_put_u16(msg, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
+ cur_params.dot11MeshHWMPconfirmationInterval) ||
+ nla_put_u32(msg, NL80211_MESHCONF_POWER_MODE,
+ cur_params.power_mode) ||
+ nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW,
+ cur_params.dot11MeshAwakeWindowDuration) ||
+ nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT,
+ cur_params.plink_timeout))
+ goto nla_put_failure;
+ nla_nest_end(msg, pinfoattr);
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ out:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
+ [NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_TTL] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_ROOTMODE] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_HWMP_RANN_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_FORWARDING] = { .type = NLA_U8 },
+ [NL80211_MESHCONF_RSSI_THRESHOLD] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+ nl80211_mesh_setup_params_policy[NL80211_MESH_SETUP_ATTR_MAX+1] = {
+ [NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC] = { .type = NLA_U8 },
+ [NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 },
+ [NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 },
+ [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
+ [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 },
+ [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG },
+ [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
+};
+
+static int nl80211_parse_mesh_config(struct genl_info *info,
+ struct mesh_config *cfg,
+ u32 *mask_out)
+{
+ struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
+ u32 mask = 0;
+
+#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \
+do { \
+ if (tb[attr]) { \
+ if (fn(tb[attr]) < min || fn(tb[attr]) > max) \
+ return -EINVAL; \
+ cfg->param = fn(tb[attr]); \
+ mask |= (1 << (attr - 1)); \
+ } \
+} while (0)
+
+
+ if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
+ return -EINVAL;
+ if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX,
+ info->attrs[NL80211_ATTR_MESH_CONFIG],
+ nl80211_meshconf_params_policy))
+ return -EINVAL;
+
+ /* This makes sure that there aren't more than 32 mesh config
+ * parameters (otherwise our bitfield scheme would not work.) */
+ BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
+
+ /* Fill in the params struct */
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255,
+ mask, NL80211_MESHCONF_RETRY_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255,
+ mask, NL80211_MESHCONF_CONFIRM_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255,
+ mask, NL80211_MESHCONF_HOLDING_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255,
+ mask, NL80211_MESHCONF_MAX_PEER_LINKS,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16,
+ mask, NL80211_MESHCONF_MAX_RETRIES,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255,
+ mask, NL80211_MESHCONF_TTL, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255,
+ mask, NL80211_MESHCONF_ELEMENT_TTL,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1,
+ mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor,
+ 1, 255, mask,
+ NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255,
+ mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535,
+ mask, NL80211_MESHCONF_PATH_REFRESH_TIME,
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535,
+ mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+ dot11MeshHWMPnetDiameterTraversalTime,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4,
+ mask, NL80211_MESHCONF_HWMP_ROOTMODE,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535,
+ mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+ dot11MeshGateAnnouncementProtocol, 0, 1,
+ mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1,
+ mask, NL80211_MESHCONF_FORWARDING,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0,
+ mask, NL80211_MESHCONF_RSSI_THRESHOLD,
+ nla_get_s32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, ht_opmode, 0, 16,
+ mask, NL80211_MESHCONF_HT_OPMODE,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535,
+ mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+ dot11MeshHWMPconfirmationInterval,
+ 1, 65535, mask,
+ NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode,
+ NL80211_MESH_POWER_ACTIVE,
+ NL80211_MESH_POWER_MAX,
+ mask, NL80211_MESHCONF_POWER_MODE,
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
+ 0, 65535, mask,
+ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff,
+ mask, NL80211_MESHCONF_PLINK_TIMEOUT,
+ nla_get_u32);
+ if (mask_out)
+ *mask_out = mask;
+
+ return 0;
+
+#undef FILL_IN_MESH_PARAM_IF_SET
+}
+
+static int nl80211_parse_mesh_setup(struct genl_info *info,
+ struct mesh_setup *setup)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct nlattr *tb[NL80211_MESH_SETUP_ATTR_MAX + 1];
+
+ if (!info->attrs[NL80211_ATTR_MESH_SETUP])
+ return -EINVAL;
+ if (nla_parse_nested(tb, NL80211_MESH_SETUP_ATTR_MAX,
+ info->attrs[NL80211_ATTR_MESH_SETUP],
+ nl80211_mesh_setup_params_policy))
+ return -EINVAL;
+
+ if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC])
+ setup->sync_method =
+ (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC])) ?
+ IEEE80211_SYNC_METHOD_VENDOR :
+ IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET;
+
+ if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL])
+ setup->path_sel_proto =
+ (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL])) ?
+ IEEE80211_PATH_PROTOCOL_VENDOR :
+ IEEE80211_PATH_PROTOCOL_HWMP;
+
+ if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC])
+ setup->path_metric =
+ (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC])) ?
+ IEEE80211_PATH_METRIC_VENDOR :
+ IEEE80211_PATH_METRIC_AIRTIME;
+
+
+ if (tb[NL80211_MESH_SETUP_IE]) {
+ struct nlattr *ieattr =
+ tb[NL80211_MESH_SETUP_IE];
+ if (!is_valid_ie_attr(ieattr))
+ return -EINVAL;
+ setup->ie = nla_data(ieattr);
+ setup->ie_len = nla_len(ieattr);
+ }
+ if (tb[NL80211_MESH_SETUP_USERSPACE_MPM] &&
+ !(rdev->wiphy.features & NL80211_FEATURE_USERSPACE_MPM))
+ return -EINVAL;
+ setup->user_mpm = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_MPM]);
+ setup->is_authenticated = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AUTH]);
+ setup->is_secure = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AMPE]);
+ if (setup->is_secure)
+ setup->user_mpm = true;
+
+ if (tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]) {
+ if (!setup->user_mpm)
+ return -EINVAL;
+ setup->auth_id =
+ nla_get_u8(tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]);
+ }
+
+ return 0;
+}
+
+static int nl80211_update_mesh_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct mesh_config cfg;
+ u32 mask;
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->update_mesh_config)
+ return -EOPNOTSUPP;
+
+ err = nl80211_parse_mesh_config(info, &cfg, &mask);
+ if (err)
+ return err;
+
+ wdev_lock(wdev);
+ if (!wdev->mesh_id_len)
+ err = -ENOLINK;
+
+ if (!err)
+ err = rdev_update_mesh_config(rdev, dev, mask, &cfg);
+
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom,
+ struct sk_buff *msg)
+{
+ struct nlattr *nl_reg_rules;
+ unsigned int i;
+
+ if (nla_put_string(msg, NL80211_ATTR_REG_ALPHA2, regdom->alpha2) ||
+ (regdom->dfs_region &&
+ nla_put_u8(msg, NL80211_ATTR_DFS_REGION, regdom->dfs_region)))
+ goto nla_put_failure;
+
+ nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES);
+ if (!nl_reg_rules)
+ goto nla_put_failure;
+
+ for (i = 0; i < regdom->n_reg_rules; i++) {
+ struct nlattr *nl_reg_rule;
+ const struct ieee80211_reg_rule *reg_rule;
+ const struct ieee80211_freq_range *freq_range;
+ const struct ieee80211_power_rule *power_rule;
+ unsigned int max_bandwidth_khz;
+
+ reg_rule = &regdom->reg_rules[i];
+ freq_range = &reg_rule->freq_range;
+ power_rule = &reg_rule->power_rule;
+
+ nl_reg_rule = nla_nest_start(msg, i);
+ if (!nl_reg_rule)
+ goto nla_put_failure;
+
+ max_bandwidth_khz = freq_range->max_bandwidth_khz;
+ if (!max_bandwidth_khz)
+ max_bandwidth_khz = reg_get_max_bandwidth(regdom,
+ reg_rule);
+
+ if (nla_put_u32(msg, NL80211_ATTR_REG_RULE_FLAGS,
+ reg_rule->flags) ||
+ nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_START,
+ freq_range->start_freq_khz) ||
+ nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_END,
+ freq_range->end_freq_khz) ||
+ nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW,
+ max_bandwidth_khz) ||
+ nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN,
+ power_rule->max_antenna_gain) ||
+ nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP,
+ power_rule->max_eirp) ||
+ nla_put_u32(msg, NL80211_ATTR_DFS_CAC_TIME,
+ reg_rule->dfs_cac_ms))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_reg_rule);
+ }
+
+ nla_nest_end(msg, nl_reg_rules);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info)
+{
+ const struct ieee80211_regdomain *regdom = NULL;
+ struct cfg80211_registered_device *rdev;
+ struct wiphy *wiphy = NULL;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOBUFS;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_REG);
+ if (!hdr)
+ goto put_failure;
+
+ if (info->attrs[NL80211_ATTR_WIPHY]) {
+ bool self_managed;
+
+ rdev = cfg80211_get_dev_from_info(genl_info_net(info), info);
+ if (IS_ERR(rdev)) {
+ nlmsg_free(msg);
+ return PTR_ERR(rdev);
+ }
+
+ wiphy = &rdev->wiphy;
+ self_managed = wiphy->regulatory_flags &
+ REGULATORY_WIPHY_SELF_MANAGED;
+ regdom = get_wiphy_regdom(wiphy);
+
+ /* a self-managed-reg device must have a private regdom */
+ if (WARN_ON(!regdom && self_managed)) {
+ nlmsg_free(msg);
+ return -EINVAL;
+ }
+
+ if (regdom &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
+ goto nla_put_failure;
+ }
+
+ if (!wiphy && reg_last_request_cell_base() &&
+ nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE,
+ NL80211_USER_REG_HINT_CELL_BASE))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+
+ if (!regdom)
+ regdom = rcu_dereference(cfg80211_regdomain);
+
+ if (nl80211_put_regdom(regdom, msg))
+ goto nla_put_failure_rcu;
+
+ rcu_read_unlock();
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure_rcu:
+ rcu_read_unlock();
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+put_failure:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb,
+ u32 seq, int flags, struct wiphy *wiphy,
+ const struct ieee80211_regdomain *regdom)
+{
+ void *hdr = nl80211hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
+ NL80211_CMD_GET_REG);
+
+ if (!hdr)
+ return -1;
+
+ genl_dump_check_consistent(cb, hdr, &nl80211_fam);
+
+ if (nl80211_put_regdom(regdom, msg))
+ goto nla_put_failure;
+
+ if (!wiphy && reg_last_request_cell_base() &&
+ nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE,
+ NL80211_USER_REG_HINT_CELL_BASE))
+ goto nla_put_failure;
+
+ if (wiphy &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
+ goto nla_put_failure;
+
+ if (wiphy && wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
+ nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_get_reg_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct ieee80211_regdomain *regdom = NULL;
+ struct cfg80211_registered_device *rdev;
+ int err, reg_idx, start = cb->args[2];
+
+ rtnl_lock();
+
+ if (cfg80211_regdomain && start == 0) {
+ err = nl80211_send_regdom(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, NULL,
+ rtnl_dereference(cfg80211_regdomain));
+ if (err < 0)
+ goto out_err;
+ }
+
+ /* the global regdom is idx 0 */
+ reg_idx = 1;
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ regdom = get_wiphy_regdom(&rdev->wiphy);
+ if (!regdom)
+ continue;
+
+ if (++reg_idx <= start)
+ continue;
+
+ err = nl80211_send_regdom(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, &rdev->wiphy, regdom);
+ if (err < 0) {
+ reg_idx--;
+ break;
+ }
+ }
+
+ cb->args[2] = reg_idx;
+ err = skb->len;
+out_err:
+ rtnl_unlock();
+ return err;
+}
+
+static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
+ struct nlattr *nl_reg_rule;
+ char *alpha2;
+ int rem_reg_rules, r;
+ u32 num_rules = 0, rule_idx = 0, size_of_regd;
+ enum nl80211_dfs_regions dfs_region = NL80211_DFS_UNSET;
+ struct ieee80211_regdomain *rd;
+
+ if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_REG_RULES])
+ return -EINVAL;
+
+ alpha2 = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
+
+ if (info->attrs[NL80211_ATTR_DFS_REGION])
+ dfs_region = nla_get_u8(info->attrs[NL80211_ATTR_DFS_REGION]);
+
+ nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+ rem_reg_rules) {
+ num_rules++;
+ if (num_rules > NL80211_MAX_SUPP_REG_RULES)
+ return -EINVAL;
+ }
+
+ if (!reg_is_valid_request(alpha2))
+ return -EINVAL;
+
+ size_of_regd = sizeof(struct ieee80211_regdomain) +
+ num_rules * sizeof(struct ieee80211_reg_rule);
+
+ rd = kzalloc(size_of_regd, GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ rd->n_reg_rules = num_rules;
+ rd->alpha2[0] = alpha2[0];
+ rd->alpha2[1] = alpha2[1];
+
+ /*
+ * Disable DFS master mode if the DFS region was
+ * not supported or known on this kernel.
+ */
+ if (reg_supported_dfs_region(dfs_region))
+ rd->dfs_region = dfs_region;
+
+ nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+ rem_reg_rules) {
+ r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
+ nla_data(nl_reg_rule), nla_len(nl_reg_rule),
+ reg_rule_policy);
+ if (r)
+ goto bad_reg;
+ r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
+ if (r)
+ goto bad_reg;
+
+ rule_idx++;
+
+ if (rule_idx > NL80211_MAX_SUPP_REG_RULES) {
+ r = -EINVAL;
+ goto bad_reg;
+ }
+ }
+
+ r = set_regdom(rd, REGD_SOURCE_CRDA);
+ /* set_regdom took ownership */
+ rd = NULL;
+
+ bad_reg:
+ kfree(rd);
+ return r;
+}
+
+static int validate_scan_freqs(struct nlattr *freqs)
+{
+ struct nlattr *attr1, *attr2;
+ int n_channels = 0, tmp1, tmp2;
+
+ nla_for_each_nested(attr1, freqs, tmp1) {
+ n_channels++;
+ /*
+ * Some hardware has a limited channel list for
+ * scanning, and it is pretty much nonsensical
+ * to scan for a channel twice, so disallow that
+ * and don't require drivers to check that the
+ * channel list they get isn't longer than what
+ * they can scan, as long as they can scan all
+ * the channels they registered at once.
+ */
+ nla_for_each_nested(attr2, freqs, tmp2)
+ if (attr1 != attr2 &&
+ nla_get_u32(attr1) == nla_get_u32(attr2))
+ return 0;
+ }
+
+ return n_channels;
+}
+
+static int nl80211_parse_random_mac(struct nlattr **attrs,
+ u8 *mac_addr, u8 *mac_addr_mask)
+{
+ int i;
+
+ if (!attrs[NL80211_ATTR_MAC] && !attrs[NL80211_ATTR_MAC_MASK]) {
+ eth_zero_addr(mac_addr);
+ eth_zero_addr(mac_addr_mask);
+ mac_addr[0] = 0x2;
+ mac_addr_mask[0] = 0x3;
+
+ return 0;
+ }
+
+ /* need both or none */
+ if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_MAC_MASK])
+ return -EINVAL;
+
+ memcpy(mac_addr, nla_data(attrs[NL80211_ATTR_MAC]), ETH_ALEN);
+ memcpy(mac_addr_mask, nla_data(attrs[NL80211_ATTR_MAC_MASK]), ETH_ALEN);
+
+ /* don't allow or configure an mcast address */
+ if (!is_multicast_ether_addr(mac_addr_mask) ||
+ is_multicast_ether_addr(mac_addr))
+ return -EINVAL;
+
+ /*
+ * allow users to pass a MAC address that has bits set outside
+ * of the mask, but don't bother drivers with having to deal
+ * with such bits
+ */
+ for (i = 0; i < ETH_ALEN; i++)
+ mac_addr[i] &= mac_addr_mask[i];
+
+ return 0;
+}
+
+static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_scan_request *request;
+ struct nlattr *attr;
+ struct wiphy *wiphy;
+ int err, tmp, n_ssids = 0, n_channels, i;
+ size_t ie_len;
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ wiphy = &rdev->wiphy;
+
+ if (!rdev->ops->scan)
+ return -EOPNOTSUPP;
+
+ if (rdev->scan_req || rdev->scan_msg) {
+ err = -EBUSY;
+ goto unlock;
+ }
+
+ if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+ n_channels = validate_scan_freqs(
+ info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
+ if (!n_channels) {
+ err = -EINVAL;
+ goto unlock;
+ }
+ } else {
+ n_channels = ieee80211_get_num_supported_channels(wiphy);
+ }
+
+ if (info->attrs[NL80211_ATTR_SCAN_SSIDS])
+ nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
+ n_ssids++;
+
+ if (n_ssids > wiphy->max_scan_ssids) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ if (info->attrs[NL80211_ATTR_IE])
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ else
+ ie_len = 0;
+
+ if (ie_len > wiphy->max_scan_ie_len) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ request = kzalloc(sizeof(*request)
+ + sizeof(*request->ssids) * n_ssids
+ + sizeof(*request->channels) * n_channels
+ + ie_len, GFP_KERNEL);
+ if (!request) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ if (n_ssids)
+ request->ssids = (void *)&request->channels[n_channels];
+ request->n_ssids = n_ssids;
+ if (ie_len) {
+ if (n_ssids)
+ request->ie = (void *)(request->ssids + n_ssids);
+ else
+ request->ie = (void *)(request->channels + n_channels);
+ }
+
+ i = 0;
+ if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+ /* user specified, bail out if channel not found */
+ nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) {
+ struct ieee80211_channel *chan;
+
+ chan = ieee80211_get_channel(wiphy, nla_get_u32(attr));
+
+ if (!chan) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ /* ignore disabled channels */
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ request->channels[i] = chan;
+ i++;
+ }
+ } else {
+ enum ieee80211_band band;
+
+ /* all channels */
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ int j;
+ if (!wiphy->bands[band])
+ continue;
+ for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+ struct ieee80211_channel *chan;
+
+ chan = &wiphy->bands[band]->channels[j];
+
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ request->channels[i] = chan;
+ i++;
+ }
+ }
+ }
+
+ if (!i) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ request->n_channels = i;
+
+ i = 0;
+ if (n_ssids) {
+ nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {
+ if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ request->ssids[i].ssid_len = nla_len(attr);
+ memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));
+ i++;
+ }
+ }
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ memcpy((void *)request->ie,
+ nla_data(info->attrs[NL80211_ATTR_IE]),
+ request->ie_len);
+ }
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++)
+ if (wiphy->bands[i])
+ request->rates[i] =
+ (1 << wiphy->bands[i]->n_bitrates) - 1;
+
+ if (info->attrs[NL80211_ATTR_SCAN_SUPP_RATES]) {
+ nla_for_each_nested(attr,
+ info->attrs[NL80211_ATTR_SCAN_SUPP_RATES],
+ tmp) {
+ enum ieee80211_band band = nla_type(attr);
+
+ if (band < 0 || band >= IEEE80211_NUM_BANDS) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!wiphy->bands[band])
+ continue;
+
+ err = ieee80211_get_ratemask(wiphy->bands[band],
+ nla_data(attr),
+ nla_len(attr),
+ &request->rates[band]);
+ if (err)
+ goto out_free;
+ }
+ }
+
+ if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) {
+ request->flags = nla_get_u32(
+ info->attrs[NL80211_ATTR_SCAN_FLAGS]);
+ if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
+ !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
+ if (!(wiphy->features &
+ NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR)) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ if (wdev->current_bss) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ err = nl80211_parse_random_mac(info->attrs,
+ request->mac_addr,
+ request->mac_addr_mask);
+ if (err)
+ goto out_free;
+ }
+ }
+
+ request->no_cck =
+ nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
+
+ request->wdev = wdev;
+ request->wiphy = &rdev->wiphy;
+ request->scan_start = jiffies;
+
+ rdev->scan_req = request;
+ err = rdev_scan(rdev, request);
+
+ if (!err) {
+ nl80211_send_scan_start(rdev, wdev);
+ if (wdev->netdev)
+ dev_hold(wdev->netdev);
+ } else {
+ out_free:
+ rdev->scan_req = NULL;
+ kfree(request);
+ }
+
+ unlock:
+ return err;
+}
+
+static struct cfg80211_sched_scan_request *
+nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct nlattr **attrs)
+{
+ struct cfg80211_sched_scan_request *request;
+ struct nlattr *attr;
+ int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i;
+ u32 interval;
+ enum ieee80211_band band;
+ size_t ie_len;
+ struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
+ s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF;
+
+ if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
+ return ERR_PTR(-EINVAL);
+
+ if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
+ return ERR_PTR(-EINVAL);
+
+ interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
+ if (interval == 0)
+ return ERR_PTR(-EINVAL);
+
+ if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+ n_channels = validate_scan_freqs(
+ attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
+ if (!n_channels)
+ return ERR_PTR(-EINVAL);
+ } else {
+ n_channels = ieee80211_get_num_supported_channels(wiphy);
+ }
+
+ if (attrs[NL80211_ATTR_SCAN_SSIDS])
+ nla_for_each_nested(attr, attrs[NL80211_ATTR_SCAN_SSIDS],
+ tmp)
+ n_ssids++;
+
+ if (n_ssids > wiphy->max_sched_scan_ssids)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * First, count the number of 'real' matchsets. Due to an issue with
+ * the old implementation, matchsets containing only the RSSI attribute
+ * (NL80211_SCHED_SCAN_MATCH_ATTR_RSSI) are considered as the 'default'
+ * RSSI for all matchsets, rather than their own matchset for reporting
+ * all APs with a strong RSSI. This is needed to be compatible with
+ * older userspace that treated a matchset with only the RSSI as the
+ * global RSSI for all other matchsets - if there are other matchsets.
+ */
+ if (attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) {
+ nla_for_each_nested(attr,
+ attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
+ tmp) {
+ struct nlattr *rssi;
+
+ err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ nla_data(attr), nla_len(attr),
+ nl80211_match_policy);
+ if (err)
+ return ERR_PTR(err);
+ /* add other standalone attributes here */
+ if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]) {
+ n_match_sets++;
+ continue;
+ }
+ rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI];
+ if (rssi)
+ default_match_rssi = nla_get_s32(rssi);
+ }
+ }
+
+ /* However, if there's no other matchset, add the RSSI one */
+ if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
+ n_match_sets = 1;
+
+ if (n_match_sets > wiphy->max_match_sets)
+ return ERR_PTR(-EINVAL);
+
+ if (attrs[NL80211_ATTR_IE])
+ ie_len = nla_len(attrs[NL80211_ATTR_IE]);
+ else
+ ie_len = 0;
+
+ if (ie_len > wiphy->max_sched_scan_ie_len)
+ return ERR_PTR(-EINVAL);
+
+ request = kzalloc(sizeof(*request)
+ + sizeof(*request->ssids) * n_ssids
+ + sizeof(*request->match_sets) * n_match_sets
+ + sizeof(*request->channels) * n_channels
+ + ie_len, GFP_KERNEL);
+ if (!request)
+ return ERR_PTR(-ENOMEM);
+
+ if (n_ssids)
+ request->ssids = (void *)&request->channels[n_channels];
+ request->n_ssids = n_ssids;
+ if (ie_len) {
+ if (n_ssids)
+ request->ie = (void *)(request->ssids + n_ssids);
+ else
+ request->ie = (void *)(request->channels + n_channels);
+ }
+
+ if (n_match_sets) {
+ if (request->ie)
+ request->match_sets = (void *)(request->ie + ie_len);
+ else if (n_ssids)
+ request->match_sets =
+ (void *)(request->ssids + n_ssids);
+ else
+ request->match_sets =
+ (void *)(request->channels + n_channels);
+ }
+ request->n_match_sets = n_match_sets;
+
+ i = 0;
+ if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+ /* user specified, bail out if channel not found */
+ nla_for_each_nested(attr,
+ attrs[NL80211_ATTR_SCAN_FREQUENCIES],
+ tmp) {
+ struct ieee80211_channel *chan;
+
+ chan = ieee80211_get_channel(wiphy, nla_get_u32(attr));
+
+ if (!chan) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ /* ignore disabled channels */
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ request->channels[i] = chan;
+ i++;
+ }
+ } else {
+ /* all channels */
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ int j;
+ if (!wiphy->bands[band])
+ continue;
+ for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+ struct ieee80211_channel *chan;
+
+ chan = &wiphy->bands[band]->channels[j];
+
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+
+ request->channels[i] = chan;
+ i++;
+ }
+ }
+ }
+
+ if (!i) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ request->n_channels = i;
+
+ i = 0;
+ if (n_ssids) {
+ nla_for_each_nested(attr, attrs[NL80211_ATTR_SCAN_SSIDS],
+ tmp) {
+ if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ request->ssids[i].ssid_len = nla_len(attr);
+ memcpy(request->ssids[i].ssid, nla_data(attr),
+ nla_len(attr));
+ i++;
+ }
+ }
+
+ i = 0;
+ if (attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) {
+ nla_for_each_nested(attr,
+ attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
+ tmp) {
+ struct nlattr *ssid, *rssi;
+
+ err = nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+ nla_data(attr), nla_len(attr),
+ nl80211_match_policy);
+ if (err)
+ goto out_free;
+ ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID];
+ if (ssid) {
+ if (WARN_ON(i >= n_match_sets)) {
+ /* this indicates a programming error,
+ * the loop above should have verified
+ * things properly
+ */
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) {
+ err = -EINVAL;
+ goto out_free;
+ }
+ memcpy(request->match_sets[i].ssid.ssid,
+ nla_data(ssid), nla_len(ssid));
+ request->match_sets[i].ssid.ssid_len =
+ nla_len(ssid);
+ /* special attribute - old implemenation w/a */
+ request->match_sets[i].rssi_thold =
+ default_match_rssi;
+ rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI];
+ if (rssi)
+ request->match_sets[i].rssi_thold =
+ nla_get_s32(rssi);
+ }
+ i++;
+ }
+
+ /* there was no other matchset, so the RSSI one is alone */
+ if (i == 0 && n_match_sets)
+ request->match_sets[0].rssi_thold = default_match_rssi;
+
+ request->min_rssi_thold = INT_MAX;
+ for (i = 0; i < n_match_sets; i++)
+ request->min_rssi_thold =
+ min(request->match_sets[i].rssi_thold,
+ request->min_rssi_thold);
+ } else {
+ request->min_rssi_thold = NL80211_SCAN_RSSI_THOLD_OFF;
+ }
+
+ if (ie_len) {
+ request->ie_len = ie_len;
+ memcpy((void *)request->ie,
+ nla_data(attrs[NL80211_ATTR_IE]),
+ request->ie_len);
+ }
+
+ if (attrs[NL80211_ATTR_SCAN_FLAGS]) {
+ request->flags = nla_get_u32(
+ attrs[NL80211_ATTR_SCAN_FLAGS]);
+ if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
+ !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
+ u32 flg = NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR;
+
+ if (!wdev) /* must be net-detect */
+ flg = NL80211_FEATURE_ND_RANDOM_MAC_ADDR;
+
+ if (!(wiphy->features & flg)) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ if (wdev && wdev->current_bss) {
+ err = -EOPNOTSUPP;
+ goto out_free;
+ }
+
+ err = nl80211_parse_random_mac(attrs, request->mac_addr,
+ request->mac_addr_mask);
+ if (err)
+ goto out_free;
+ }
+ }
+
+ if (attrs[NL80211_ATTR_SCHED_SCAN_DELAY])
+ request->delay =
+ nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);
+
+ request->interval = interval;
+ request->scan_start = jiffies;
+
+ return request;
+
+out_free:
+ kfree(request);
+ return ERR_PTR(err);
+}
+
+static int nl80211_start_sched_scan(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_sched_scan_request *sched_scan_req;
+ int err;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) ||
+ !rdev->ops->sched_scan_start)
+ return -EOPNOTSUPP;
+
+ if (rdev->sched_scan_req)
+ return -EINPROGRESS;
+
+ sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev,
+ info->attrs);
+
+ err = PTR_ERR_OR_ZERO(sched_scan_req);
+ if (err)
+ goto out_err;
+
+ err = rdev_sched_scan_start(rdev, dev, sched_scan_req);
+ if (err)
+ goto out_free;
+
+ sched_scan_req->dev = dev;
+ sched_scan_req->wiphy = &rdev->wiphy;
+
+ if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
+ sched_scan_req->owner_nlportid = info->snd_portid;
+
+ rcu_assign_pointer(rdev->sched_scan_req, sched_scan_req);
+
+ nl80211_send_sched_scan(rdev, dev,
+ NL80211_CMD_START_SCHED_SCAN);
+ return 0;
+
+out_free:
+ kfree(sched_scan_req);
+out_err:
+ return err;
+}
+
+static int nl80211_stop_sched_scan(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) ||
+ !rdev->ops->sched_scan_stop)
+ return -EOPNOTSUPP;
+
+ return __cfg80211_stop_sched_scan(rdev, false);
+}
+
+static int nl80211_start_radar_detection(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_chan_def chandef;
+ enum nl80211_dfs_regions dfs_region;
+ unsigned int cac_time_ms;
+ int err;
+
+ dfs_region = reg_get_dfs_region(wdev->wiphy);
+ if (dfs_region == NL80211_DFS_UNSET)
+ return -EINVAL;
+
+ err = nl80211_parse_chandef(rdev, info, &chandef);
+ if (err)
+ return err;
+
+ if (netif_carrier_ok(dev))
+ return -EBUSY;
+
+ if (wdev->cac_started)
+ return -EBUSY;
+
+ err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef,
+ wdev->iftype);
+ if (err < 0)
+ return err;
+
+ if (err == 0)
+ return -EINVAL;
+
+ if (!cfg80211_chandef_dfs_usable(wdev->wiphy, &chandef))
+ return -EINVAL;
+
+ if (!rdev->ops->start_radar_detection)
+ return -EOPNOTSUPP;
+
+ cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef);
+ if (WARN_ON(!cac_time_ms))
+ cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+
+ err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef,
+ cac_time_ms);
+ if (!err) {
+ wdev->chandef = chandef;
+ wdev->cac_started = true;
+ wdev->cac_start_time = jiffies;
+ wdev->cac_time_ms = cac_time_ms;
+ }
+ return err;
+}
+
+static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_csa_settings params;
+ /* csa_attrs is defined static to avoid waste of stack size - this
+ * function is called under RTNL lock, so this should not be a problem.
+ */
+ static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1];
+ int err;
+ bool need_new_beacon = false;
+ int len, i;
+ u32 cs_count;
+
+ if (!rdev->ops->channel_switch ||
+ !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
+ return -EOPNOTSUPP;
+
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ need_new_beacon = true;
+
+ /* useless if AP is not running */
+ if (!wdev->beacon_interval)
+ return -ENOTCONN;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ if (!wdev->ssid_len)
+ return -ENOTCONN;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ if (!wdev->mesh_id_len)
+ return -ENOTCONN;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ memset(&params, 0, sizeof(params));
+
+ if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+ !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT])
+ return -EINVAL;
+
+ /* only important for AP, IBSS and mesh create IEs internally */
+ if (need_new_beacon && !info->attrs[NL80211_ATTR_CSA_IES])
+ return -EINVAL;
+
+ /* Even though the attribute is u32, the specification says
+ * u8, so let's make sure we don't overflow.
+ */
+ cs_count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]);
+ if (cs_count > 255)
+ return -EINVAL;
+
+ params.count = cs_count;
+
+ if (!need_new_beacon)
+ goto skip_beacons;
+
+ err = nl80211_parse_beacon(info->attrs, &params.beacon_after);
+ if (err)
+ return err;
+
+ err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX,
+ info->attrs[NL80211_ATTR_CSA_IES],
+ nl80211_policy);
+ if (err)
+ return err;
+
+ err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa);
+ if (err)
+ return err;
+
+ if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON])
+ return -EINVAL;
+
+ len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+ if (!len || (len % sizeof(u16)))
+ return -EINVAL;
+
+ params.n_counter_offsets_beacon = len / sizeof(u16);
+ if (rdev->wiphy.max_num_csa_counters &&
+ (params.n_counter_offsets_beacon >
+ rdev->wiphy.max_num_csa_counters))
+ return -EINVAL;
+
+ params.counter_offsets_beacon =
+ nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+
+ /* sanity checks - counters should fit and be the same */
+ for (i = 0; i < params.n_counter_offsets_beacon; i++) {
+ u16 offset = params.counter_offsets_beacon[i];
+
+ if (offset >= params.beacon_csa.tail_len)
+ return -EINVAL;
+
+ if (params.beacon_csa.tail[offset] != params.count)
+ return -EINVAL;
+ }
+
+ if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) {
+ len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+ if (!len || (len % sizeof(u16)))
+ return -EINVAL;
+
+ params.n_counter_offsets_presp = len / sizeof(u16);
+ if (rdev->wiphy.max_num_csa_counters &&
+ (params.n_counter_offsets_beacon >
+ rdev->wiphy.max_num_csa_counters))
+ return -EINVAL;
+
+ params.counter_offsets_presp =
+ nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+
+ /* sanity checks - counters should fit and be the same */
+ for (i = 0; i < params.n_counter_offsets_presp; i++) {
+ u16 offset = params.counter_offsets_presp[i];
+
+ if (offset >= params.beacon_csa.probe_resp_len)
+ return -EINVAL;
+
+ if (params.beacon_csa.probe_resp[offset] !=
+ params.count)
+ return -EINVAL;
+ }
+ }
+
+skip_beacons:
+ err = nl80211_parse_chandef(rdev, info, &params.chandef);
+ if (err)
+ return err;
+
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
+ wdev->iftype))
+ return -EINVAL;
+
+ err = cfg80211_chandef_dfs_required(wdev->wiphy,
+ &params.chandef,
+ wdev->iftype);
+ if (err < 0)
+ return err;
+
+ if (err > 0)
+ params.radar_required = true;
+
+ if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
+ params.block_tx = true;
+
+ wdev_lock(wdev);
+ err = rdev_channel_switch(rdev, dev, &params);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
+ u32 seq, int flags,
+ struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_internal_bss *intbss)
+{
+ struct cfg80211_bss *res = &intbss->pub;
+ const struct cfg80211_bss_ies *ies;
+ void *hdr;
+ struct nlattr *bss;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ hdr = nl80211hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
+ NL80211_CMD_NEW_SCAN_RESULTS);
+ if (!hdr)
+ return -1;
+
+ genl_dump_check_consistent(cb, hdr, &nl80211_fam);
+
+ if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation))
+ goto nla_put_failure;
+ if (wdev->netdev &&
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
+ goto nla_put_failure;
+ if (nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto nla_put_failure;
+
+ bss = nla_nest_start(msg, NL80211_ATTR_BSS);
+ if (!bss)
+ goto nla_put_failure;
+ if ((!is_zero_ether_addr(res->bssid) &&
+ nla_put(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid)))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ /* indicate whether we have probe response data or not */
+ if (rcu_access_pointer(res->proberesp_ies) &&
+ nla_put_flag(msg, NL80211_BSS_PRESP_DATA))
+ goto fail_unlock_rcu;
+
+ /* this pointer prefers to be pointed to probe response data
+ * but is always valid
+ */
+ ies = rcu_dereference(res->ies);
+ if (ies) {
+ if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf))
+ goto fail_unlock_rcu;
+ if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS,
+ ies->len, ies->data))
+ goto fail_unlock_rcu;
+ }
+
+ /* and this pointer is always (unless driver didn't know) beacon data */
+ ies = rcu_dereference(res->beacon_ies);
+ if (ies && ies->from_beacon) {
+ if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf))
+ goto fail_unlock_rcu;
+ if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES,
+ ies->len, ies->data))
+ goto fail_unlock_rcu;
+ }
+ rcu_read_unlock();
+
+ if (res->beacon_interval &&
+ nla_put_u16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) ||
+ nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) ||
+ nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) ||
+ nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO,
+ jiffies_to_msecs(jiffies - intbss->ts)))
+ goto nla_put_failure;
+
+ switch (rdev->wiphy.signal_type) {
+ case CFG80211_SIGNAL_TYPE_MBM:
+ if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
+ goto nla_put_failure;
+ break;
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ if (nla_put_u8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_STATION:
+ if (intbss == wdev->current_bss &&
+ nla_put_u32(msg, NL80211_BSS_STATUS,
+ NL80211_BSS_STATUS_ASSOCIATED))
+ goto nla_put_failure;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ if (intbss == wdev->current_bss &&
+ nla_put_u32(msg, NL80211_BSS_STATUS,
+ NL80211_BSS_STATUS_IBSS_JOINED))
+ goto nla_put_failure;
+ break;
+ default:
+ break;
+ }
+
+ nla_nest_end(msg, bss);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ fail_unlock_rcu:
+ rcu_read_unlock();
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct cfg80211_registered_device *rdev;
+ struct cfg80211_internal_bss *scan;
+ struct wireless_dev *wdev;
+ int start = cb->args[2], idx = 0;
+ int err;
+
+ err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ if (err)
+ return err;
+
+ wdev_lock(wdev);
+ spin_lock_bh(&rdev->bss_lock);
+ cfg80211_bss_expire(rdev);
+
+ cb->seq = rdev->bss_generation;
+
+ list_for_each_entry(scan, &rdev->bss_list, list) {
+ if (++idx <= start)
+ continue;
+ if (nl80211_send_bss(skb, cb,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ rdev, wdev, scan) < 0) {
+ idx--;
+ break;
+ }
+ }
+
+ spin_unlock_bh(&rdev->bss_lock);
+ wdev_unlock(wdev);
+
+ cb->args[2] = idx;
+ nl80211_finish_wdev_dump(rdev);
+
+ return skb->len;
+}
+
+static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
+ int flags, struct net_device *dev,
+ bool allow_radio_stats,
+ struct survey_info *survey)
+{
+ void *hdr;
+ struct nlattr *infoattr;
+
+ /* skip radio stats if userspace didn't request them */
+ if (!survey->channel && !allow_radio_stats)
+ return 0;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags,
+ NL80211_CMD_NEW_SURVEY_RESULTS);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ infoattr = nla_nest_start(msg, NL80211_ATTR_SURVEY_INFO);
+ if (!infoattr)
+ goto nla_put_failure;
+
+ if (survey->channel &&
+ nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY,
+ survey->channel->center_freq))
+ goto nla_put_failure;
+
+ if ((survey->filled & SURVEY_INFO_NOISE_DBM) &&
+ nla_put_u8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_IN_USE) &&
+ nla_put_flag(msg, NL80211_SURVEY_INFO_IN_USE))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME,
+ survey->time))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_BUSY) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_BUSY,
+ survey->time_busy))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_EXT_BUSY) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY,
+ survey->time_ext_busy))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_RX) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_RX,
+ survey->time_rx))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_TX) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_TX,
+ survey->time_tx))
+ goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_SCAN) &&
+ nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_SCAN,
+ survey->time_scan))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, infoattr);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct survey_info survey;
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ int survey_idx = cb->args[2];
+ int res;
+ bool radio_stats;
+
+ res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ if (res)
+ return res;
+
+ /* prepare_wdev_dump parsed the attributes */
+ radio_stats = nl80211_fam.attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];
+
+ if (!wdev->netdev) {
+ res = -EINVAL;
+ goto out_err;
+ }
+
+ if (!rdev->ops->dump_survey) {
+ res = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ while (1) {
+ res = rdev_dump_survey(rdev, wdev->netdev, survey_idx, &survey);
+ if (res == -ENOENT)
+ break;
+ if (res)
+ goto out_err;
+
+ /* don't send disabled channels, but do send non-channel data */
+ if (survey.channel &&
+ survey.channel->flags & IEEE80211_CHAN_DISABLED) {
+ survey_idx++;
+ continue;
+ }
+
+ if (nl80211_send_survey(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ wdev->netdev, radio_stats, &survey) < 0)
+ goto out;
+ survey_idx++;
+ }
+
+ out:
+ cb->args[2] = survey_idx;
+ res = skb->len;
+ out_err:
+ nl80211_finish_wdev_dump(rdev);
+ return res;
+}
+
+static bool nl80211_valid_wpa_versions(u32 wpa_versions)
+{
+ return !(wpa_versions & ~(NL80211_WPA_VERSION_1 |
+ NL80211_WPA_VERSION_2));
+}
+
+static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct ieee80211_channel *chan;
+ const u8 *bssid, *ssid, *ie = NULL, *sae_data = NULL;
+ int err, ssid_len, ie_len = 0, sae_data_len = 0;
+ enum nl80211_auth_type auth_type;
+ struct key_parse key;
+ bool local_state_change;
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_AUTH_TYPE])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_SSID])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
+ return -EINVAL;
+
+ err = nl80211_parse_key(info, &key);
+ if (err)
+ return err;
+
+ if (key.idx >= 0) {
+ if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
+ if (!key.p.key || !key.p.key_len)
+ return -EINVAL;
+ if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 ||
+ key.p.key_len != WLAN_KEY_LEN_WEP40) &&
+ (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 ||
+ key.p.key_len != WLAN_KEY_LEN_WEP104))
+ return -EINVAL;
+ if (key.idx > 4)
+ return -EINVAL;
+ } else {
+ key.p.key_len = 0;
+ key.p.key = NULL;
+ }
+
+ if (key.idx >= 0) {
+ int i;
+ bool ok = false;
+ for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
+ if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
+ ok = true;
+ break;
+ }
+ }
+ if (!ok)
+ return -EINVAL;
+ }
+
+ if (!rdev->ops->auth)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ chan = nl80211_get_valid_chan(&rdev->wiphy,
+ info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+ if (!chan)
+ return -EINVAL;
+
+ ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+ ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+ if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
+ return -EINVAL;
+
+ if (auth_type == NL80211_AUTHTYPE_SAE &&
+ !info->attrs[NL80211_ATTR_SAE_DATA])
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_SAE_DATA]) {
+ if (auth_type != NL80211_AUTHTYPE_SAE)
+ return -EINVAL;
+ sae_data = nla_data(info->attrs[NL80211_ATTR_SAE_DATA]);
+ sae_data_len = nla_len(info->attrs[NL80211_ATTR_SAE_DATA]);
+ /* need to include at least Auth Transaction and Status Code */
+ if (sae_data_len < 4)
+ return -EINVAL;
+ }
+
+ local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+ /*
+ * Since we no longer track auth state, ignore
+ * requests to only change local state.
+ */
+ if (local_state_change)
+ return 0;
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
+ ssid, ssid_len, ie, ie_len,
+ key.p.key, key.p.key_len, key.idx,
+ sae_data, sae_data_len);
+ wdev_unlock(dev->ieee80211_ptr);
+ return err;
+}
+
+static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
+ struct cfg80211_crypto_settings *settings,
+ int cipher_limit)
+{
+ memset(settings, 0, sizeof(*settings));
+
+ settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];
+
+ if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
+ u16 proto;
+ proto = nla_get_u16(
+ info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
+ settings->control_port_ethertype = cpu_to_be16(proto);
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
+ proto != ETH_P_PAE)
+ return -EINVAL;
+ if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
+ settings->control_port_no_encrypt = true;
+ } else
+ settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
+
+ if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
+ void *data;
+ int len, i;
+
+ data = nla_data(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
+ len = nla_len(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
+ settings->n_ciphers_pairwise = len / sizeof(u32);
+
+ if (len % sizeof(u32))
+ return -EINVAL;
+
+ if (settings->n_ciphers_pairwise > cipher_limit)
+ return -EINVAL;
+
+ memcpy(settings->ciphers_pairwise, data, len);
+
+ for (i = 0; i < settings->n_ciphers_pairwise; i++)
+ if (!cfg80211_supported_cipher_suite(
+ &rdev->wiphy,
+ settings->ciphers_pairwise[i]))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]) {
+ settings->cipher_group =
+ nla_get_u32(info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]);
+ if (!cfg80211_supported_cipher_suite(&rdev->wiphy,
+ settings->cipher_group))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_WPA_VERSIONS]) {
+ settings->wpa_versions =
+ nla_get_u32(info->attrs[NL80211_ATTR_WPA_VERSIONS]);
+ if (!nl80211_valid_wpa_versions(settings->wpa_versions))
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_AKM_SUITES]) {
+ void *data;
+ int len;
+
+ data = nla_data(info->attrs[NL80211_ATTR_AKM_SUITES]);
+ len = nla_len(info->attrs[NL80211_ATTR_AKM_SUITES]);
+ settings->n_akm_suites = len / sizeof(u32);
+
+ if (len % sizeof(u32))
+ return -EINVAL;
+
+ if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES)
+ return -EINVAL;
+
+ memcpy(settings->akm_suites, data, len);
+ }
+
+ return 0;
+}
+
+static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct ieee80211_channel *chan;
+ struct cfg80211_assoc_request req = {};
+ const u8 *bssid, *ssid;
+ int err, ssid_len = 0;
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MAC] ||
+ !info->attrs[NL80211_ATTR_SSID] ||
+ !info->attrs[NL80211_ATTR_WIPHY_FREQ])
+ return -EINVAL;
+
+ if (!rdev->ops->assoc)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ chan = nl80211_get_valid_chan(&rdev->wiphy,
+ info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+ if (!chan)
+ return -EINVAL;
+
+ ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+ ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ req.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ if (info->attrs[NL80211_ATTR_USE_MFP]) {
+ enum nl80211_mfp mfp =
+ nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
+ if (mfp == NL80211_MFP_REQUIRED)
+ req.use_mfp = true;
+ else if (mfp != NL80211_MFP_NO)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_PREV_BSSID])
+ req.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT]))
+ req.flags |= ASSOC_REQ_DISABLE_HT;
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ memcpy(&req.ht_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
+ sizeof(req.ht_capa_mask));
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ return -EINVAL;
+ memcpy(&req.ht_capa,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
+ sizeof(req.ht_capa));
+ }
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT]))
+ req.flags |= ASSOC_REQ_DISABLE_VHT;
+
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
+ memcpy(&req.vht_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
+ sizeof(req.vht_capa_mask));
+
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
+ return -EINVAL;
+ memcpy(&req.vht_capa,
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]),
+ sizeof(req.vht_capa));
+ }
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
+ if (!(rdev->wiphy.features &
+ NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
+ !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ return -EINVAL;
+ req.flags |= ASSOC_REQ_USE_RRM;
+ }
+
+ err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
+ if (!err) {
+ wdev_lock(dev->ieee80211_ptr);
+ err = cfg80211_mlme_assoc(rdev, dev, chan, bssid,
+ ssid, ssid_len, &req);
+ wdev_unlock(dev->ieee80211_ptr);
+ }
+
+ return err;
+}
+
+static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ const u8 *ie = NULL, *bssid;
+ int ie_len = 0, err;
+ u16 reason_code;
+ bool local_state_change;
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_REASON_CODE])
+ return -EINVAL;
+
+ if (!rdev->ops->deauth)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+ if (reason_code == 0) {
+ /* Reason Code 0 is reserved */
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
+ local_state_change);
+ wdev_unlock(dev->ieee80211_ptr);
+ return err;
+}
+
+static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ const u8 *ie = NULL, *bssid;
+ int ie_len = 0, err;
+ u16 reason_code;
+ bool local_state_change;
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_REASON_CODE])
+ return -EINVAL;
+
+ if (!rdev->ops->disassoc)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+ if (reason_code == 0) {
+ /* Reason Code 0 is reserved */
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
+ local_state_change);
+ wdev_unlock(dev->ieee80211_ptr);
+ return err;
+}
+
+static bool
+nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev,
+ int mcast_rate[IEEE80211_NUM_BANDS],
+ int rateval)
+{
+ struct wiphy *wiphy = &rdev->wiphy;
+ bool found = false;
+ int band, i;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = wiphy->bands[band];
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (sband->bitrates[i].bitrate == rateval) {
+ mcast_rate[band] = i + 1;
+ found = true;
+ break;
+ }
+ }
+ }
+
+ return found;
+}
+
+static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct cfg80211_ibss_params ibss;
+ struct wiphy *wiphy;
+ struct cfg80211_cached_keys *connkeys = NULL;
+ int err;
+
+ memset(&ibss, 0, sizeof(ibss));
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_SSID] ||
+ !nla_len(info->attrs[NL80211_ATTR_SSID]))
+ return -EINVAL;
+
+ ibss.beacon_interval = 100;
+
+ if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
+ ibss.beacon_interval =
+ nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
+ if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000)
+ return -EINVAL;
+ }
+
+ if (!rdev->ops->join_ibss)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
+
+ wiphy = &rdev->wiphy;
+
+ if (info->attrs[NL80211_ATTR_MAC]) {
+ ibss.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (!is_valid_ether_addr(ibss.bssid))
+ return -EINVAL;
+ }
+ ibss.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+ ibss.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ ibss.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ ibss.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ err = nl80211_parse_chandef(rdev, info, &ibss.chandef);
+ if (err)
+ return err;
+
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef,
+ NL80211_IFTYPE_ADHOC))
+ return -EINVAL;
+
+ switch (ibss.chandef.width) {
+ case NL80211_CHAN_WIDTH_5:
+ case NL80211_CHAN_WIDTH_10:
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ break;
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS))
+ return -EINVAL;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS))
+ return -EINVAL;
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_VHT_IBSS))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
+ ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
+
+ if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+ u8 *rates =
+ nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ int n_rates =
+ nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ struct ieee80211_supported_band *sband =
+ wiphy->bands[ibss.chandef.chan->band];
+
+ err = ieee80211_get_ratemask(sband, rates, n_rates,
+ &ibss.basic_rates);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ memcpy(&ibss.ht_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
+ sizeof(ibss.ht_capa_mask));
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ return -EINVAL;
+ memcpy(&ibss.ht_capa,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
+ sizeof(ibss.ht_capa));
+ }
+
+ if (info->attrs[NL80211_ATTR_MCAST_RATE] &&
+ !nl80211_parse_mcast_rate(rdev, ibss.mcast_rate,
+ nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE])))
+ return -EINVAL;
+
+ if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
+ bool no_ht = false;
+
+ connkeys = nl80211_parse_connkeys(rdev,
+ info->attrs[NL80211_ATTR_KEYS],
+ &no_ht);
+ if (IS_ERR(connkeys))
+ return PTR_ERR(connkeys);
+
+ if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
+ no_ht) {
+ kfree(connkeys);
+ return -EINVAL;
+ }
+ }
+
+ ibss.control_port =
+ nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]);
+
+ ibss.userspace_handles_dfs =
+ nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
+
+ err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
+ if (err)
+ kzfree(connkeys);
+ return err;
+}
+
+static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+
+ if (!rdev->ops->leave_ibss)
+ return -EOPNOTSUPP;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
+
+ return cfg80211_leave_ibss(rdev, dev, false);
+}
+
+static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ int mcast_rate[IEEE80211_NUM_BANDS];
+ u32 nla_rate;
+ int err;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->set_mcast_rate)
+ return -EOPNOTSUPP;
+
+ memset(mcast_rate, 0, sizeof(mcast_rate));
+
+ if (!info->attrs[NL80211_ATTR_MCAST_RATE])
+ return -EINVAL;
+
+ nla_rate = nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE]);
+ if (!nl80211_parse_mcast_rate(rdev, mcast_rate, nla_rate))
+ return -EINVAL;
+
+ err = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate);
+
+ return err;
+}
+
+static struct sk_buff *
+__cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, int approxlen,
+ u32 portid, u32 seq, enum nl80211_commands cmd,
+ enum nl80211_attrs attr,
+ const struct nl80211_vendor_cmd_info *info,
+ gfp_t gfp)
+{
+ struct sk_buff *skb;
+ void *hdr;
+ struct nlattr *data;
+
+ skb = nlmsg_new(approxlen + 100, gfp);
+ if (!skb)
+ return NULL;
+
+ hdr = nl80211hdr_put(skb, portid, seq, 0, cmd);
+ if (!hdr) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
+ goto nla_put_failure;
+
+ if (info) {
+ if (nla_put_u32(skb, NL80211_ATTR_VENDOR_ID,
+ info->vendor_id))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, NL80211_ATTR_VENDOR_SUBCMD,
+ info->subcmd))
+ goto nla_put_failure;
+ }
+
+ if (wdev) {
+ if (nla_put_u64(skb, NL80211_ATTR_WDEV,
+ wdev_id(wdev)))
+ goto nla_put_failure;
+ if (wdev->netdev &&
+ nla_put_u32(skb, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex))
+ goto nla_put_failure;
+ }
+
+ data = nla_nest_start(skb, attr);
+
+ ((void **)skb->cb)[0] = rdev;
+ ((void **)skb->cb)[1] = hdr;
+ ((void **)skb->cb)[2] = data;
+
+ return skb;
+
+ nla_put_failure:
+ kfree_skb(skb);
+ return NULL;
+}
+
+struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ enum nl80211_commands cmd,
+ enum nl80211_attrs attr,
+ int vendor_event_idx,
+ int approxlen, gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ const struct nl80211_vendor_cmd_info *info;
+
+ switch (cmd) {
+ case NL80211_CMD_TESTMODE:
+ if (WARN_ON(vendor_event_idx != -1))
+ return NULL;
+ info = NULL;
+ break;
+ case NL80211_CMD_VENDOR:
+ if (WARN_ON(vendor_event_idx < 0 ||
+ vendor_event_idx >= wiphy->n_vendor_events))
+ return NULL;
+ info = &wiphy->vendor_events[vendor_event_idx];
+ break;
+ default:
+ WARN_ON(1);
+ return NULL;
+ }
+
+ return __cfg80211_alloc_vendor_skb(rdev, wdev, approxlen, 0, 0,
+ cmd, attr, info, gfp);
+}
+EXPORT_SYMBOL(__cfg80211_alloc_event_skb);
+
+void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0];
+ void *hdr = ((void **)skb->cb)[1];
+ struct nlattr *data = ((void **)skb->cb)[2];
+ enum nl80211_multicast_groups mcgrp = NL80211_MCGRP_TESTMODE;
+
+ /* clear CB data for netlink core to own from now on */
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ nla_nest_end(skb, data);
+ genlmsg_end(skb, hdr);
+
+ if (data->nla_type == NL80211_ATTR_VENDOR_DATA)
+ mcgrp = NL80211_MCGRP_VENDOR;
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
+ mcgrp, gfp);
+}
+EXPORT_SYMBOL(__cfg80211_send_event_skb);
+
+#ifdef CONFIG_NL80211_TESTMODE
+static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev =
+ __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs);
+ int err;
+
+ if (!rdev->ops->testmode_cmd)
+ return -EOPNOTSUPP;
+
+ if (IS_ERR(wdev)) {
+ err = PTR_ERR(wdev);
+ if (err != -EINVAL)
+ return err;
+ wdev = NULL;
+ } else if (wdev->wiphy != &rdev->wiphy) {
+ return -EINVAL;
+ }
+
+ if (!info->attrs[NL80211_ATTR_TESTDATA])
+ return -EINVAL;
+
+ rdev->cur_cmd_info = info;
+ err = rdev_testmode_cmd(rdev, wdev,
+ nla_data(info->attrs[NL80211_ATTR_TESTDATA]),
+ nla_len(info->attrs[NL80211_ATTR_TESTDATA]));
+ rdev->cur_cmd_info = NULL;
+
+ return err;
+}
+
+static int nl80211_testmode_dump(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct cfg80211_registered_device *rdev;
+ int err;
+ long phy_idx;
+ void *data = NULL;
+ int data_len = 0;
+
+ rtnl_lock();
+
+ if (cb->args[0]) {
+ /*
+ * 0 is a valid index, but not valid for args[0],
+ * so we need to offset by 1.
+ */
+ phy_idx = cb->args[0] - 1;
+ } else {
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+ nl80211_fam.attrbuf, nl80211_fam.maxattr,
+ nl80211_policy);
+ if (err)
+ goto out_err;
+
+ rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk),
+ nl80211_fam.attrbuf);
+ if (IS_ERR(rdev)) {
+ err = PTR_ERR(rdev);
+ goto out_err;
+ }
+ phy_idx = rdev->wiphy_idx;
+ rdev = NULL;
+
+ if (nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA])
+ cb->args[1] =
+ (long)nl80211_fam.attrbuf[NL80211_ATTR_TESTDATA];
+ }
+
+ if (cb->args[1]) {
+ data = nla_data((void *)cb->args[1]);
+ data_len = nla_len((void *)cb->args[1]);
+ }
+
+ rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
+ if (!rdev) {
+ err = -ENOENT;
+ goto out_err;
+ }
+
+ if (!rdev->ops->testmode_dump) {
+ err = -EOPNOTSUPP;
+ goto out_err;
+ }
+
+ while (1) {
+ void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ NL80211_CMD_TESTMODE);
+ struct nlattr *tmdata;
+
+ if (!hdr)
+ break;
+
+ if (nla_put_u32(skb, NL80211_ATTR_WIPHY, phy_idx)) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ }
+
+ tmdata = nla_nest_start(skb, NL80211_ATTR_TESTDATA);
+ if (!tmdata) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ }
+ err = rdev_testmode_dump(rdev, skb, cb, data, data_len);
+ nla_nest_end(skb, tmdata);
+
+ if (err == -ENOBUFS || err == -ENOENT) {
+ genlmsg_cancel(skb, hdr);
+ break;
+ } else if (err) {
+ genlmsg_cancel(skb, hdr);
+ goto out_err;
+ }
+
+ genlmsg_end(skb, hdr);
+ }
+
+ err = skb->len;
+ /* see above */
+ cb->args[0] = phy_idx + 1;
+ out_err:
+ rtnl_unlock();
+ return err;
+}
+#endif
+
+static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct cfg80211_connect_params connect;
+ struct wiphy *wiphy;
+ struct cfg80211_cached_keys *connkeys = NULL;
+ int err;
+
+ memset(&connect, 0, sizeof(connect));
+
+ if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_SSID] ||
+ !nla_len(info->attrs[NL80211_ATTR_SSID]))
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
+ connect.auth_type =
+ nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+ if (!nl80211_valid_auth_type(rdev, connect.auth_type,
+ NL80211_CMD_CONNECT))
+ return -EINVAL;
+ } else
+ connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+
+ connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];
+
+ err = nl80211_crypto_settings(rdev, info, &connect.crypto,
+ NL80211_MAX_NR_CIPHER_SUITES);
+ if (err)
+ return err;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ wiphy = &rdev->wiphy;
+
+ connect.bg_scan_period = -1;
+ if (info->attrs[NL80211_ATTR_BG_SCAN_PERIOD] &&
+ (wiphy->flags & WIPHY_FLAG_SUPPORTS_FW_ROAM)) {
+ connect.bg_scan_period =
+ nla_get_u16(info->attrs[NL80211_ATTR_BG_SCAN_PERIOD]);
+ }
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ connect.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ else if (info->attrs[NL80211_ATTR_MAC_HINT])
+ connect.bssid_hint =
+ nla_data(info->attrs[NL80211_ATTR_MAC_HINT]);
+ connect.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+ connect.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+ if (info->attrs[NL80211_ATTR_IE]) {
+ connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ }
+
+ if (info->attrs[NL80211_ATTR_USE_MFP]) {
+ connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
+ if (connect.mfp != NL80211_MFP_REQUIRED &&
+ connect.mfp != NL80211_MFP_NO)
+ return -EINVAL;
+ } else {
+ connect.mfp = NL80211_MFP_NO;
+ }
+
+ if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+ connect.channel = nl80211_get_valid_chan(
+ wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+ if (!connect.channel)
+ return -EINVAL;
+ } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) {
+ connect.channel_hint = nl80211_get_valid_chan(
+ wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]);
+ if (!connect.channel_hint)
+ return -EINVAL;
+ }
+
+ if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
+ connkeys = nl80211_parse_connkeys(rdev,
+ info->attrs[NL80211_ATTR_KEYS], NULL);
+ if (IS_ERR(connkeys))
+ return PTR_ERR(connkeys);
+ }
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT]))
+ connect.flags |= ASSOC_REQ_DISABLE_HT;
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
+ memcpy(&connect.ht_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
+ sizeof(connect.ht_capa_mask));
+
+ if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) {
+ kzfree(connkeys);
+ return -EINVAL;
+ }
+ memcpy(&connect.ht_capa,
+ nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
+ sizeof(connect.ht_capa));
+ }
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT]))
+ connect.flags |= ASSOC_REQ_DISABLE_VHT;
+
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
+ memcpy(&connect.vht_capa_mask,
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
+ sizeof(connect.vht_capa_mask));
+
+ if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
+ if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) {
+ kzfree(connkeys);
+ return -EINVAL;
+ }
+ memcpy(&connect.vht_capa,
+ nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]),
+ sizeof(connect.vht_capa));
+ }
+
+ if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
+ if (!(rdev->wiphy.features &
+ NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
+ !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ return -EINVAL;
+ connect.flags |= ASSOC_REQ_USE_RRM;
+ }
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
+ wdev_unlock(dev->ieee80211_ptr);
+ if (err)
+ kzfree(connkeys);
+ return err;
+}
+
+static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u16 reason;
+ int ret;
+
+ if (!info->attrs[NL80211_ATTR_REASON_CODE])
+ reason = WLAN_REASON_DEAUTH_LEAVING;
+ else
+ reason = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+
+ if (reason == 0)
+ return -EINVAL;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ wdev_lock(dev->ieee80211_ptr);
+ ret = cfg80211_disconnect(rdev, dev, reason, true);
+ wdev_unlock(dev->ieee80211_ptr);
+ return ret;
+}
+
+static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net *net;
+ int err;
+
+ if (info->attrs[NL80211_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]);
+
+ net = get_net_ns_by_pid(pid);
+ } else if (info->attrs[NL80211_ATTR_NETNS_FD]) {
+ u32 fd = nla_get_u32(info->attrs[NL80211_ATTR_NETNS_FD]);
+
+ net = get_net_ns_by_fd(fd);
+ } else {
+ return -EINVAL;
+ }
+
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ err = 0;
+
+ /* check if anything to do */
+ if (!net_eq(wiphy_net(&rdev->wiphy), net))
+ err = cfg80211_switch_netns(rdev, net);
+
+ put_net(net);
+ return err;
+}
+
+static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_pmksa *pmksa) = NULL;
+ struct net_device *dev = info->user_ptr[1];
+ struct cfg80211_pmksa pmksa;
+
+ memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_PMKID])
+ return -EINVAL;
+
+ pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
+ pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ switch (info->genlhdr->cmd) {
+ case NL80211_CMD_SET_PMKSA:
+ rdev_ops = rdev->ops->set_pmksa;
+ break;
+ case NL80211_CMD_DEL_PMKSA:
+ rdev_ops = rdev->ops->del_pmksa;
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+
+ if (!rdev_ops)
+ return -EOPNOTSUPP;
+
+ return rdev_ops(&rdev->wiphy, dev, &pmksa);
+}
+
+static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->flush_pmksa)
+ return -EOPNOTSUPP;
+
+ return rdev_flush_pmksa(rdev, dev);
+}
+
+static int nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u8 action_code, dialog_token;
+ u32 peer_capability = 0;
+ u16 status_code;
+ u8 *peer;
+ bool initiator;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) ||
+ !rdev->ops->tdls_mgmt)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_TDLS_ACTION] ||
+ !info->attrs[NL80211_ATTR_STATUS_CODE] ||
+ !info->attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN] ||
+ !info->attrs[NL80211_ATTR_IE] ||
+ !info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ action_code = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_ACTION]);
+ status_code = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]);
+ dialog_token = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN]);
+ initiator = nla_get_flag(info->attrs[NL80211_ATTR_TDLS_INITIATOR]);
+ if (info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY])
+ peer_capability =
+ nla_get_u32(info->attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]);
+
+ return rdev_tdls_mgmt(rdev, dev, peer, action_code,
+ dialog_token, status_code, peer_capability,
+ initiator,
+ nla_data(info->attrs[NL80211_ATTR_IE]),
+ nla_len(info->attrs[NL80211_ATTR_IE]));
+}
+
+static int nl80211_tdls_oper(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ enum nl80211_tdls_operation operation;
+ u8 *peer;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) ||
+ !rdev->ops->tdls_oper)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_TDLS_OPERATION] ||
+ !info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ operation = nla_get_u8(info->attrs[NL80211_ATTR_TDLS_OPERATION]);
+ peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ return rdev_tdls_oper(rdev, dev, peer, operation);
+}
+
+static int nl80211_remain_on_channel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_chan_def chandef;
+ struct sk_buff *msg;
+ void *hdr;
+ u64 cookie;
+ u32 duration;
+ int err;
+
+ if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+ !info->attrs[NL80211_ATTR_DURATION])
+ return -EINVAL;
+
+ duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);
+
+ if (!rdev->ops->remain_on_channel ||
+ !(rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL))
+ return -EOPNOTSUPP;
+
+ /*
+ * We should be on that channel for at least a minimum amount of
+ * time (10ms) but no longer than the driver supports.
+ */
+ if (duration < NL80211_MIN_REMAIN_ON_CHANNEL_TIME ||
+ duration > rdev->wiphy.max_remain_on_channel_duration)
+ return -EINVAL;
+
+ err = nl80211_parse_chandef(rdev, info, &chandef);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_REMAIN_ON_CHANNEL);
+ if (!hdr) {
+ err = -ENOBUFS;
+ goto free_msg;
+ }
+
+ err = rdev_remain_on_channel(rdev, wdev, chandef.chan,
+ duration, &cookie);
+
+ if (err)
+ goto free_msg;
+
+ if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ err = -ENOBUFS;
+ free_msg:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ u64 cookie;
+
+ if (!info->attrs[NL80211_ATTR_COOKIE])
+ return -EINVAL;
+
+ if (!rdev->ops->cancel_remain_on_channel)
+ return -EOPNOTSUPP;
+
+ cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
+
+ return rdev_cancel_remain_on_channel(rdev, wdev, cookie);
+}
+
+static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
+ u8 *rates, u8 rates_len)
+{
+ u8 i;
+ u32 mask = 0;
+
+ for (i = 0; i < rates_len; i++) {
+ int rate = (rates[i] & 0x7f) * 5;
+ int ridx;
+ for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
+ struct ieee80211_rate *srate =
+ &sband->bitrates[ridx];
+ if (rate == srate->bitrate) {
+ mask |= 1 << ridx;
+ break;
+ }
+ }
+ if (ridx == sband->n_bitrates)
+ return 0; /* rate not found */
+ }
+
+ return mask;
+}
+
+static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
+ u8 *rates, u8 rates_len,
+ u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
+{
+ u8 i;
+
+ memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);
+
+ for (i = 0; i < rates_len; i++) {
+ int ridx, rbit;
+
+ ridx = rates[i] / 8;
+ rbit = BIT(rates[i] % 8);
+
+ /* check validity */
+ if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN))
+ return false;
+
+ /* check availability */
+ if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
+ mcs[ridx] |= rbit;
+ else
+ return false;
+ }
+
+ return true;
+}
+
+static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
+{
+ u16 mcs_mask = 0;
+
+ switch (vht_mcs_map) {
+ case IEEE80211_VHT_MCS_NOT_SUPPORTED:
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_7:
+ mcs_mask = 0x00FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_8:
+ mcs_mask = 0x01FF;
+ break;
+ case IEEE80211_VHT_MCS_SUPPORT_0_9:
+ mcs_mask = 0x03FF;
+ break;
+ default:
+ break;
+ }
+
+ return mcs_mask;
+}
+
+static void vht_build_mcs_mask(u16 vht_mcs_map,
+ u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
+{
+ u8 nss;
+
+ for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) {
+ vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03);
+ vht_mcs_map >>= 2;
+ }
+}
+
+static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
+ struct nl80211_txrate_vht *txrate,
+ u16 mcs[NL80211_VHT_NSS_MAX])
+{
+ u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
+ u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {};
+ u8 i;
+
+ if (!sband->vht_cap.vht_supported)
+ return false;
+
+ memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);
+
+ /* Build vht_mcs_mask from VHT capabilities */
+ vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask);
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
+ if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
+ mcs[i] = txrate->mcs[i];
+ else
+ return false;
+ }
+
+ return true;
+}
+
+static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
+ [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_RATES },
+ [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
+ .len = NL80211_MAX_SUPP_HT_RATES },
+ [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)},
+ [NL80211_TXRATE_GI] = { .type = NLA_U8 },
+};
+
+static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *tb[NL80211_TXRATE_MAX + 1];
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct cfg80211_bitrate_mask mask;
+ int rem, i;
+ struct net_device *dev = info->user_ptr[1];
+ struct nlattr *tx_rates;
+ struct ieee80211_supported_band *sband;
+ u16 vht_tx_mcs_map;
+
+ if (!rdev->ops->set_bitrate_mask)
+ return -EOPNOTSUPP;
+
+ memset(&mask, 0, sizeof(mask));
+ /* Default to all rates enabled */
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+ sband = rdev->wiphy.bands[i];
+
+ if (!sband)
+ continue;
+
+ mask.control[i].legacy = (1 << sband->n_bitrates) - 1;
+ memcpy(mask.control[i].ht_mcs,
+ sband->ht_cap.mcs.rx_mask,
+ sizeof(mask.control[i].ht_mcs));
+
+ if (!sband->vht_cap.vht_supported)
+ continue;
+
+ vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
+ vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs);
+ }
+
+ /* if no rates are given set it back to the defaults */
+ if (!info->attrs[NL80211_ATTR_TX_RATES])
+ goto out;
+
+ /*
+ * The nested attribute uses enum nl80211_band as the index. This maps
+ * directly to the enum ieee80211_band values used in cfg80211.
+ */
+ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
+ nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) {
+ enum ieee80211_band band = nla_type(tx_rates);
+ int err;
+
+ if (band < 0 || band >= IEEE80211_NUM_BANDS)
+ return -EINVAL;
+ sband = rdev->wiphy.bands[band];
+ if (sband == NULL)
+ return -EINVAL;
+ err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
+ nla_len(tx_rates), nl80211_txattr_policy);
+ if (err)
+ return err;
+ if (tb[NL80211_TXRATE_LEGACY]) {
+ mask.control[band].legacy = rateset_to_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_LEGACY]),
+ nla_len(tb[NL80211_TXRATE_LEGACY]));
+ if ((mask.control[band].legacy == 0) &&
+ nla_len(tb[NL80211_TXRATE_LEGACY]))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_HT]) {
+ if (!ht_rateset_to_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_HT]),
+ nla_len(tb[NL80211_TXRATE_HT]),
+ mask.control[band].ht_mcs))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_VHT]) {
+ if (!vht_set_mcs_mask(
+ sband,
+ nla_data(tb[NL80211_TXRATE_VHT]),
+ mask.control[band].vht_mcs))
+ return -EINVAL;
+ }
+ if (tb[NL80211_TXRATE_GI]) {
+ mask.control[band].gi =
+ nla_get_u8(tb[NL80211_TXRATE_GI]);
+ if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI)
+ return -EINVAL;
+ }
+
+ if (mask.control[band].legacy == 0) {
+ /* don't allow empty legacy rates if HT or VHT
+ * are not even supported.
+ */
+ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
+ rdev->wiphy.bands[band]->vht_cap.vht_supported))
+ return -EINVAL;
+
+ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
+ if (mask.control[band].ht_mcs[i])
+ goto out;
+
+ for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
+ if (mask.control[band].vht_mcs[i])
+ goto out;
+
+ /* legacy and mcs rates may not be both empty */
+ return -EINVAL;
+ }
+ }
+
+out:
+ return rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
+}
+
+static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;
+
+ if (!info->attrs[NL80211_ATTR_FRAME_MATCH])
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_FRAME_TYPE])
+ frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]);
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* not much point in registering if we can't reply */
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
+
+ return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type,
+ nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
+ nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));
+}
+
+static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ struct cfg80211_chan_def chandef;
+ int err;
+ void *hdr = NULL;
+ u64 cookie;
+ struct sk_buff *msg = NULL;
+ struct cfg80211_mgmt_tx_params params = {
+ .dont_wait_for_ack =
+ info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK],
+ };
+
+ if (!info->attrs[NL80211_ATTR_FRAME])
+ return -EINVAL;
+
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
+ return -EINVAL;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_P2P_GO:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (info->attrs[NL80211_ATTR_DURATION]) {
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX))
+ return -EINVAL;
+ params.wait = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);
+
+ /*
+ * We should wait on the channel for at least a minimum amount
+ * of time (10ms) but no longer than the driver supports.
+ */
+ if (params.wait < NL80211_MIN_REMAIN_ON_CHANNEL_TIME ||
+ params.wait > rdev->wiphy.max_remain_on_channel_duration)
+ return -EINVAL;
+
+ }
+
+ params.offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK];
+
+ if (params.offchan && !(rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX))
+ return -EINVAL;
+
+ params.no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
+
+ /* get the channel if any has been specified, otherwise pass NULL to
+ * the driver. The latter will use the current one
+ */
+ chandef.chan = NULL;
+ if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+ err = nl80211_parse_chandef(rdev, info, &chandef);
+ if (err)
+ return err;
+ }
+
+ if (!chandef.chan && params.offchan)
+ return -EINVAL;
+
+ params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
+ params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]);
+
+ if (info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]) {
+ int len = nla_len(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]);
+ int i;
+
+ if (len % sizeof(u16))
+ return -EINVAL;
+
+ params.n_csa_offsets = len / sizeof(u16);
+ params.csa_offsets =
+ nla_data(info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX]);
+
+ /* check that all the offsets fit the frame */
+ for (i = 0; i < params.n_csa_offsets; i++) {
+ if (params.csa_offsets[i] >= params.len)
+ return -EINVAL;
+ }
+ }
+
+ if (!params.dont_wait_for_ack) {
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_FRAME);
+ if (!hdr) {
+ err = -ENOBUFS;
+ goto free_msg;
+ }
+ }
+
+ params.chan = chandef.chan;
+ err = cfg80211_mlme_mgmt_tx(rdev, wdev, &params, &cookie);
+ if (err)
+ goto free_msg;
+
+ if (msg) {
+ if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+ }
+
+ return 0;
+
+ nla_put_failure:
+ err = -ENOBUFS;
+ free_msg:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ u64 cookie;
+
+ if (!info->attrs[NL80211_ATTR_COOKIE])
+ return -EINVAL;
+
+ if (!rdev->ops->mgmt_tx_cancel_wait)
+ return -EOPNOTSUPP;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
+
+ return rdev_mgmt_tx_cancel_wait(rdev, wdev, cookie);
+}
+
+static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev;
+ struct net_device *dev = info->user_ptr[1];
+ u8 ps_state;
+ bool state;
+ int err;
+
+ if (!info->attrs[NL80211_ATTR_PS_STATE])
+ return -EINVAL;
+
+ ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
+
+ if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
+ return -EINVAL;
+
+ wdev = dev->ieee80211_ptr;
+
+ if (!rdev->ops->set_power_mgmt)
+ return -EOPNOTSUPP;
+
+ state = (ps_state == NL80211_PS_ENABLED) ? true : false;
+
+ if (state == wdev->ps)
+ return 0;
+
+ err = rdev_set_power_mgmt(rdev, dev, state, wdev->ps_timeout);
+ if (!err)
+ wdev->ps = state;
+ return err;
+}
+
+static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ enum nl80211_ps_state ps_state;
+ struct wireless_dev *wdev;
+ struct net_device *dev = info->user_ptr[1];
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ wdev = dev->ieee80211_ptr;
+
+ if (!rdev->ops->set_power_mgmt)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_POWER_SAVE);
+ if (!hdr) {
+ err = -ENOBUFS;
+ goto free_msg;
+ }
+
+ if (wdev->ps)
+ ps_state = NL80211_PS_ENABLED;
+ else
+ ps_state = NL80211_PS_DISABLED;
+
+ if (nla_put_u32(msg, NL80211_ATTR_PS_STATE, ps_state))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ err = -ENOBUFS;
+ free_msg:
+ nlmsg_free(msg);
+ return err;
+}
+
+static const struct nla_policy
+nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
+ [NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
+ [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
+};
+
+static int nl80211_set_cqm_txe(struct genl_info *info,
+ u32 rate, u32 pkts, u32 intvl)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (rate > 100 || intvl > NL80211_CQM_TXE_MAX_INTVL)
+ return -EINVAL;
+
+ if (!rdev->ops->set_cqm_txe_config)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ return rdev_set_cqm_txe_config(rdev, dev, rate, pkts, intvl);
+}
+
+static int nl80211_set_cqm_rssi(struct genl_info *info,
+ s32 threshold, u32 hysteresis)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (threshold > 0)
+ return -EINVAL;
+
+ /* disabling - hysteresis should also be zero then */
+ if (threshold == 0)
+ hysteresis = 0;
+
+ if (!rdev->ops->set_cqm_rssi_config)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ return rdev_set_cqm_rssi_config(rdev, dev, threshold, hysteresis);
+}
+
+static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attrs[NL80211_ATTR_CQM_MAX + 1];
+ struct nlattr *cqm;
+ int err;
+
+ cqm = info->attrs[NL80211_ATTR_CQM];
+ if (!cqm)
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, NL80211_ATTR_CQM_MAX, cqm,
+ nl80211_attr_cqm_policy);
+ if (err)
+ return err;
+
+ if (attrs[NL80211_ATTR_CQM_RSSI_THOLD] &&
+ attrs[NL80211_ATTR_CQM_RSSI_HYST]) {
+ s32 threshold = nla_get_s32(attrs[NL80211_ATTR_CQM_RSSI_THOLD]);
+ u32 hysteresis = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_HYST]);
+
+ return nl80211_set_cqm_rssi(info, threshold, hysteresis);
+ }
+
+ if (attrs[NL80211_ATTR_CQM_TXE_RATE] &&
+ attrs[NL80211_ATTR_CQM_TXE_PKTS] &&
+ attrs[NL80211_ATTR_CQM_TXE_INTVL]) {
+ u32 rate = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_RATE]);
+ u32 pkts = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_PKTS]);
+ u32 intvl = nla_get_u32(attrs[NL80211_ATTR_CQM_TXE_INTVL]);
+
+ return nl80211_set_cqm_txe(info, rate, pkts, intvl);
+ }
+
+ return -EINVAL;
+}
+
+static int nl80211_join_ocb(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct ocb_setup setup = {};
+ int err;
+
+ err = nl80211_parse_chandef(rdev, info, &setup.chandef);
+ if (err)
+ return err;
+
+ return cfg80211_join_ocb(rdev, dev, &setup);
+}
+
+static int nl80211_leave_ocb(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+
+ return cfg80211_leave_ocb(rdev, dev);
+}
+
+static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct mesh_config cfg;
+ struct mesh_setup setup;
+ int err;
+
+ /* start with default */
+ memcpy(&cfg, &default_mesh_config, sizeof(cfg));
+ memcpy(&setup, &default_mesh_setup, sizeof(setup));
+
+ if (info->attrs[NL80211_ATTR_MESH_CONFIG]) {
+ /* and parse parameters if given */
+ err = nl80211_parse_mesh_config(info, &cfg, NULL);
+ if (err)
+ return err;
+ }
+
+ if (!info->attrs[NL80211_ATTR_MESH_ID] ||
+ !nla_len(info->attrs[NL80211_ATTR_MESH_ID]))
+ return -EINVAL;
+
+ setup.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
+ setup.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
+
+ if (info->attrs[NL80211_ATTR_MCAST_RATE] &&
+ !nl80211_parse_mcast_rate(rdev, setup.mcast_rate,
+ nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE])))
+ return -EINVAL;
+
+ if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
+ setup.beacon_interval =
+ nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
+ if (setup.beacon_interval < 10 ||
+ setup.beacon_interval > 10000)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) {
+ setup.dtim_period =
+ nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
+ if (setup.dtim_period < 1 || setup.dtim_period > 100)
+ return -EINVAL;
+ }
+
+ if (info->attrs[NL80211_ATTR_MESH_SETUP]) {
+ /* parse additional setup parameters if given */
+ err = nl80211_parse_mesh_setup(info, &setup);
+ if (err)
+ return err;
+ }
+
+ if (setup.user_mpm)
+ cfg.auto_open_plinks = false;
+
+ if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+ err = nl80211_parse_chandef(rdev, info, &setup.chandef);
+ if (err)
+ return err;
+ } else {
+ /* cfg80211_join_mesh() will sort it out */
+ setup.chandef.chan = NULL;
+ }
+
+ if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+ u8 *rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ int n_rates =
+ nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ struct ieee80211_supported_band *sband;
+
+ if (!setup.chandef.chan)
+ return -EINVAL;
+
+ sband = rdev->wiphy.bands[setup.chandef.chan->band];
+
+ err = ieee80211_get_ratemask(sband, rates, n_rates,
+ &setup.basic_rates);
+ if (err)
+ return err;
+ }
+
+ return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
+}
+
+static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+
+ return cfg80211_leave_mesh(rdev, dev);
+}
+
+#ifdef CONFIG_PM
+static int nl80211_send_wowlan_patterns(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_wowlan *wowlan = rdev->wiphy.wowlan_config;
+ struct nlattr *nl_pats, *nl_pat;
+ int i, pat_len;
+
+ if (!wowlan->n_patterns)
+ return 0;
+
+ nl_pats = nla_nest_start(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN);
+ if (!nl_pats)
+ return -ENOBUFS;
+
+ for (i = 0; i < wowlan->n_patterns; i++) {
+ nl_pat = nla_nest_start(msg, i + 1);
+ if (!nl_pat)
+ return -ENOBUFS;
+ pat_len = wowlan->patterns[i].pattern_len;
+ if (nla_put(msg, NL80211_PKTPAT_MASK, DIV_ROUND_UP(pat_len, 8),
+ wowlan->patterns[i].mask) ||
+ nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len,
+ wowlan->patterns[i].pattern) ||
+ nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
+ wowlan->patterns[i].pkt_offset))
+ return -ENOBUFS;
+ nla_nest_end(msg, nl_pat);
+ }
+ nla_nest_end(msg, nl_pats);
+
+ return 0;
+}
+
+static int nl80211_send_wowlan_tcp(struct sk_buff *msg,
+ struct cfg80211_wowlan_tcp *tcp)
+{
+ struct nlattr *nl_tcp;
+
+ if (!tcp)
+ return 0;
+
+ nl_tcp = nla_nest_start(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION);
+ if (!nl_tcp)
+ return -ENOBUFS;
+
+ if (nla_put_in_addr(msg, NL80211_WOWLAN_TCP_SRC_IPV4, tcp->src) ||
+ nla_put_in_addr(msg, NL80211_WOWLAN_TCP_DST_IPV4, tcp->dst) ||
+ nla_put(msg, NL80211_WOWLAN_TCP_DST_MAC, ETH_ALEN, tcp->dst_mac) ||
+ nla_put_u16(msg, NL80211_WOWLAN_TCP_SRC_PORT, tcp->src_port) ||
+ nla_put_u16(msg, NL80211_WOWLAN_TCP_DST_PORT, tcp->dst_port) ||
+ nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD,
+ tcp->payload_len, tcp->payload) ||
+ nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL,
+ tcp->data_interval) ||
+ nla_put(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD,
+ tcp->wake_len, tcp->wake_data) ||
+ nla_put(msg, NL80211_WOWLAN_TCP_WAKE_MASK,
+ DIV_ROUND_UP(tcp->wake_len, 8), tcp->wake_mask))
+ return -ENOBUFS;
+
+ if (tcp->payload_seq.len &&
+ nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ,
+ sizeof(tcp->payload_seq), &tcp->payload_seq))
+ return -ENOBUFS;
+
+ if (tcp->payload_tok.len &&
+ nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN,
+ sizeof(tcp->payload_tok) + tcp->tokens_size,
+ &tcp->payload_tok))
+ return -ENOBUFS;
+
+ nla_nest_end(msg, nl_tcp);
+
+ return 0;
+}
+
+static int nl80211_send_wowlan_nd(struct sk_buff *msg,
+ struct cfg80211_sched_scan_request *req)
+{
+ struct nlattr *nd, *freqs, *matches, *match;
+ int i;
+
+ if (!req)
+ return 0;
+
+ nd = nla_nest_start(msg, NL80211_WOWLAN_TRIG_NET_DETECT);
+ if (!nd)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL, req->interval))
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
+ return -ENOBUFS;
+
+ freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
+ if (!freqs)
+ return -ENOBUFS;
+
+ for (i = 0; i < req->n_channels; i++)
+ nla_put_u32(msg, i, req->channels[i]->center_freq);
+
+ nla_nest_end(msg, freqs);
+
+ if (req->n_match_sets) {
+ matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH);
+ for (i = 0; i < req->n_match_sets; i++) {
+ match = nla_nest_start(msg, i);
+ nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
+ req->match_sets[i].ssid.ssid_len,
+ req->match_sets[i].ssid.ssid);
+ nla_nest_end(msg, match);
+ }
+ nla_nest_end(msg, matches);
+ }
+
+ nla_nest_end(msg, nd);
+
+ return 0;
+}
+
+static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct sk_buff *msg;
+ void *hdr;
+ u32 size = NLMSG_DEFAULT_SIZE;
+
+ if (!rdev->wiphy.wowlan)
+ return -EOPNOTSUPP;
+
+ if (rdev->wiphy.wowlan_config && rdev->wiphy.wowlan_config->tcp) {
+ /* adjust size to have room for all the data */
+ size += rdev->wiphy.wowlan_config->tcp->tokens_size +
+ rdev->wiphy.wowlan_config->tcp->payload_len +
+ rdev->wiphy.wowlan_config->tcp->wake_len +
+ rdev->wiphy.wowlan_config->tcp->wake_len / 8;
+ }
+
+ msg = nlmsg_new(size, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_WOWLAN);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (rdev->wiphy.wowlan_config) {
+ struct nlattr *nl_wowlan;
+
+ nl_wowlan = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS);
+ if (!nl_wowlan)
+ goto nla_put_failure;
+
+ if ((rdev->wiphy.wowlan_config->any &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) ||
+ (rdev->wiphy.wowlan_config->disconnect &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) ||
+ (rdev->wiphy.wowlan_config->magic_pkt &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) ||
+ (rdev->wiphy.wowlan_config->gtk_rekey_failure &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) ||
+ (rdev->wiphy.wowlan_config->eap_identity_req &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) ||
+ (rdev->wiphy.wowlan_config->four_way_handshake &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) ||
+ (rdev->wiphy.wowlan_config->rfkill_release &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE)))
+ goto nla_put_failure;
+
+ if (nl80211_send_wowlan_patterns(msg, rdev))
+ goto nla_put_failure;
+
+ if (nl80211_send_wowlan_tcp(msg,
+ rdev->wiphy.wowlan_config->tcp))
+ goto nla_put_failure;
+
+ if (nl80211_send_wowlan_nd(
+ msg,
+ rdev->wiphy.wowlan_config->nd_config))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nl_wowlan);
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
+ struct nlattr *attr,
+ struct cfg80211_wowlan *trig)
+{
+ struct nlattr *tb[NUM_NL80211_WOWLAN_TCP];
+ struct cfg80211_wowlan_tcp *cfg;
+ struct nl80211_wowlan_tcp_data_token *tok = NULL;
+ struct nl80211_wowlan_tcp_data_seq *seq = NULL;
+ u32 size;
+ u32 data_size, wake_size, tokens_size = 0, wake_mask_size;
+ int err, port;
+
+ if (!rdev->wiphy.wowlan->tcp)
+ return -EINVAL;
+
+ err = nla_parse(tb, MAX_NL80211_WOWLAN_TCP,
+ nla_data(attr), nla_len(attr),
+ nl80211_wowlan_tcp_policy);
+ if (err)
+ return err;
+
+ if (!tb[NL80211_WOWLAN_TCP_SRC_IPV4] ||
+ !tb[NL80211_WOWLAN_TCP_DST_IPV4] ||
+ !tb[NL80211_WOWLAN_TCP_DST_MAC] ||
+ !tb[NL80211_WOWLAN_TCP_DST_PORT] ||
+ !tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD] ||
+ !tb[NL80211_WOWLAN_TCP_DATA_INTERVAL] ||
+ !tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] ||
+ !tb[NL80211_WOWLAN_TCP_WAKE_MASK])
+ return -EINVAL;
+
+ data_size = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]);
+ if (data_size > rdev->wiphy.wowlan->tcp->data_payload_max)
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) >
+ rdev->wiphy.wowlan->tcp->data_interval_max ||
+ nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) == 0)
+ return -EINVAL;
+
+ wake_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]);
+ if (wake_size > rdev->wiphy.wowlan->tcp->wake_payload_max)
+ return -EINVAL;
+
+ wake_mask_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_MASK]);
+ if (wake_mask_size != DIV_ROUND_UP(wake_size, 8))
+ return -EINVAL;
+
+ if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]) {
+ u32 tokln = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]);
+
+ tok = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]);
+ tokens_size = tokln - sizeof(*tok);
+
+ if (!tok->len || tokens_size % tok->len)
+ return -EINVAL;
+ if (!rdev->wiphy.wowlan->tcp->tok)
+ return -EINVAL;
+ if (tok->len > rdev->wiphy.wowlan->tcp->tok->max_len)
+ return -EINVAL;
+ if (tok->len < rdev->wiphy.wowlan->tcp->tok->min_len)
+ return -EINVAL;
+ if (tokens_size > rdev->wiphy.wowlan->tcp->tok->bufsize)
+ return -EINVAL;
+ if (tok->offset + tok->len > data_size)
+ return -EINVAL;
+ }
+
+ if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]) {
+ seq = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]);
+ if (!rdev->wiphy.wowlan->tcp->seq)
+ return -EINVAL;
+ if (seq->len == 0 || seq->len > 4)
+ return -EINVAL;
+ if (seq->len + seq->offset > data_size)
+ return -EINVAL;
+ }
+
+ size = sizeof(*cfg);
+ size += data_size;
+ size += wake_size + wake_mask_size;
+ size += tokens_size;
+
+ cfg = kzalloc(size, GFP_KERNEL);
+ if (!cfg)
+ return -ENOMEM;
+ cfg->src = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_SRC_IPV4]);
+ cfg->dst = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_DST_IPV4]);
+ memcpy(cfg->dst_mac, nla_data(tb[NL80211_WOWLAN_TCP_DST_MAC]),
+ ETH_ALEN);
+ if (tb[NL80211_WOWLAN_TCP_SRC_PORT])
+ port = nla_get_u16(tb[NL80211_WOWLAN_TCP_SRC_PORT]);
+ else
+ port = 0;
+#ifdef CONFIG_INET
+ /* allocate a socket and port for it and use it */
+ err = __sock_create(wiphy_net(&rdev->wiphy), PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &cfg->sock, 1);
+ if (err) {
+ kfree(cfg);
+ return err;
+ }
+ if (inet_csk_get_port(cfg->sock->sk, port)) {
+ sock_release(cfg->sock);
+ kfree(cfg);
+ return -EADDRINUSE;
+ }
+ cfg->src_port = inet_sk(cfg->sock->sk)->inet_num;
+#else
+ if (!port) {
+ kfree(cfg);
+ return -EINVAL;
+ }
+ cfg->src_port = port;
+#endif
+
+ cfg->dst_port = nla_get_u16(tb[NL80211_WOWLAN_TCP_DST_PORT]);
+ cfg->payload_len = data_size;
+ cfg->payload = (u8 *)cfg + sizeof(*cfg) + tokens_size;
+ memcpy((void *)cfg->payload,
+ nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]),
+ data_size);
+ if (seq)
+ cfg->payload_seq = *seq;
+ cfg->data_interval = nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]);
+ cfg->wake_len = wake_size;
+ cfg->wake_data = (u8 *)cfg + sizeof(*cfg) + tokens_size + data_size;
+ memcpy((void *)cfg->wake_data,
+ nla_data(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]),
+ wake_size);
+ cfg->wake_mask = (u8 *)cfg + sizeof(*cfg) + tokens_size +
+ data_size + wake_size;
+ memcpy((void *)cfg->wake_mask,
+ nla_data(tb[NL80211_WOWLAN_TCP_WAKE_MASK]),
+ wake_mask_size);
+ if (tok) {
+ cfg->tokens_size = tokens_size;
+ memcpy(&cfg->payload_tok, tok, sizeof(*tok) + tokens_size);
+ }
+
+ trig->tcp = cfg;
+
+ return 0;
+}
+
+static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
+ const struct wiphy_wowlan_support *wowlan,
+ struct nlattr *attr,
+ struct cfg80211_wowlan *trig)
+{
+ struct nlattr **tb;
+ int err;
+
+ tb = kzalloc(NUM_NL80211_ATTR * sizeof(*tb), GFP_KERNEL);
+ if (!tb)
+ return -ENOMEM;
+
+ if (!(wowlan->flags & WIPHY_WOWLAN_NET_DETECT)) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = nla_parse(tb, NL80211_ATTR_MAX,
+ nla_data(attr), nla_len(attr),
+ nl80211_policy);
+ if (err)
+ goto out;
+
+ trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb);
+ err = PTR_ERR_OR_ZERO(trig->nd_config);
+ if (err)
+ trig->nd_config = NULL;
+
+out:
+ kfree(tb);
+ return err;
+}
+
+static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct nlattr *tb[NUM_NL80211_WOWLAN_TRIG];
+ struct cfg80211_wowlan new_triggers = {};
+ struct cfg80211_wowlan *ntrig;
+ const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan;
+ int err, i;
+ bool prev_enabled = rdev->wiphy.wowlan_config;
+ bool regular = false;
+
+ if (!wowlan)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]) {
+ cfg80211_rdev_free_wowlan(rdev);
+ rdev->wiphy.wowlan_config = NULL;
+ goto set_wakeup;
+ }
+
+ err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG,
+ nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
+ nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
+ nl80211_wowlan_policy);
+ if (err)
+ return err;
+
+ if (tb[NL80211_WOWLAN_TRIG_ANY]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_ANY))
+ return -EINVAL;
+ new_triggers.any = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_DISCONNECT]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT))
+ return -EINVAL;
+ new_triggers.disconnect = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT))
+ return -EINVAL;
+ new_triggers.magic_pkt = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED])
+ return -EINVAL;
+
+ if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE))
+ return -EINVAL;
+ new_triggers.gtk_rekey_failure = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ))
+ return -EINVAL;
+ new_triggers.eap_identity_req = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE))
+ return -EINVAL;
+ new_triggers.four_way_handshake = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_RFKILL_RELEASE]) {
+ if (!(wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE))
+ return -EINVAL;
+ new_triggers.rfkill_release = true;
+ regular = true;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) {
+ struct nlattr *pat;
+ int n_patterns = 0;
+ int rem, pat_len, mask_len, pkt_offset;
+ struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
+
+ regular = true;
+
+ nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
+ rem)
+ n_patterns++;
+ if (n_patterns > wowlan->n_patterns)
+ return -EINVAL;
+
+ new_triggers.patterns = kcalloc(n_patterns,
+ sizeof(new_triggers.patterns[0]),
+ GFP_KERNEL);
+ if (!new_triggers.patterns)
+ return -ENOMEM;
+
+ new_triggers.n_patterns = n_patterns;
+ i = 0;
+
+ nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
+ rem) {
+ u8 *mask_pat;
+
+ nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
+ nla_len(pat), NULL);
+ err = -EINVAL;
+ if (!pat_tb[NL80211_PKTPAT_MASK] ||
+ !pat_tb[NL80211_PKTPAT_PATTERN])
+ goto error;
+ pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
+ mask_len = DIV_ROUND_UP(pat_len, 8);
+ if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
+ goto error;
+ if (pat_len > wowlan->pattern_max_len ||
+ pat_len < wowlan->pattern_min_len)
+ goto error;
+
+ if (!pat_tb[NL80211_PKTPAT_OFFSET])
+ pkt_offset = 0;
+ else
+ pkt_offset = nla_get_u32(
+ pat_tb[NL80211_PKTPAT_OFFSET]);
+ if (pkt_offset > wowlan->max_pkt_offset)
+ goto error;
+ new_triggers.patterns[i].pkt_offset = pkt_offset;
+
+ mask_pat = kmalloc(mask_len + pat_len, GFP_KERNEL);
+ if (!mask_pat) {
+ err = -ENOMEM;
+ goto error;
+ }
+ new_triggers.patterns[i].mask = mask_pat;
+ memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_MASK]),
+ mask_len);
+ mask_pat += mask_len;
+ new_triggers.patterns[i].pattern = mask_pat;
+ new_triggers.patterns[i].pattern_len = pat_len;
+ memcpy(mask_pat,
+ nla_data(pat_tb[NL80211_PKTPAT_PATTERN]),
+ pat_len);
+ i++;
+ }
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) {
+ regular = true;
+ err = nl80211_parse_wowlan_tcp(
+ rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION],
+ &new_triggers);
+ if (err)
+ goto error;
+ }
+
+ if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) {
+ regular = true;
+ err = nl80211_parse_wowlan_nd(
+ rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT],
+ &new_triggers);
+ if (err)
+ goto error;
+ }
+
+ /* The 'any' trigger means the device continues operating more or less
+ * as in its normal operation mode and wakes up the host on most of the
+ * normal interrupts (like packet RX, ...)
+ * It therefore makes little sense to combine with the more constrained
+ * wakeup trigger modes.
+ */
+ if (new_triggers.any && regular) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL);
+ if (!ntrig) {
+ err = -ENOMEM;
+ goto error;
+ }
+ cfg80211_rdev_free_wowlan(rdev);
+ rdev->wiphy.wowlan_config = ntrig;
+
+ set_wakeup:
+ if (rdev->ops->set_wakeup &&
+ prev_enabled != !!rdev->wiphy.wowlan_config)
+ rdev_set_wakeup(rdev, rdev->wiphy.wowlan_config);
+
+ return 0;
+ error:
+ for (i = 0; i < new_triggers.n_patterns; i++)
+ kfree(new_triggers.patterns[i].mask);
+ kfree(new_triggers.patterns);
+ if (new_triggers.tcp && new_triggers.tcp->sock)
+ sock_release(new_triggers.tcp->sock);
+ kfree(new_triggers.tcp);
+ return err;
+}
+#endif
+
+static int nl80211_send_coalesce_rules(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev)
+{
+ struct nlattr *nl_pats, *nl_pat, *nl_rule, *nl_rules;
+ int i, j, pat_len;
+ struct cfg80211_coalesce_rules *rule;
+
+ if (!rdev->coalesce->n_rules)
+ return 0;
+
+ nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE);
+ if (!nl_rules)
+ return -ENOBUFS;
+
+ for (i = 0; i < rdev->coalesce->n_rules; i++) {
+ nl_rule = nla_nest_start(msg, i + 1);
+ if (!nl_rule)
+ return -ENOBUFS;
+
+ rule = &rdev->coalesce->rules[i];
+ if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY,
+ rule->delay))
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION,
+ rule->condition))
+ return -ENOBUFS;
+
+ nl_pats = nla_nest_start(msg,
+ NL80211_ATTR_COALESCE_RULE_PKT_PATTERN);
+ if (!nl_pats)
+ return -ENOBUFS;
+
+ for (j = 0; j < rule->n_patterns; j++) {
+ nl_pat = nla_nest_start(msg, j + 1);
+ if (!nl_pat)
+ return -ENOBUFS;
+ pat_len = rule->patterns[j].pattern_len;
+ if (nla_put(msg, NL80211_PKTPAT_MASK,
+ DIV_ROUND_UP(pat_len, 8),
+ rule->patterns[j].mask) ||
+ nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len,
+ rule->patterns[j].pattern) ||
+ nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
+ rule->patterns[j].pkt_offset))
+ return -ENOBUFS;
+ nla_nest_end(msg, nl_pat);
+ }
+ nla_nest_end(msg, nl_pats);
+ nla_nest_end(msg, nl_rule);
+ }
+ nla_nest_end(msg, nl_rules);
+
+ return 0;
+}
+
+static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct sk_buff *msg;
+ void *hdr;
+
+ if (!rdev->wiphy.coalesce)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_COALESCE);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (rdev->coalesce && nl80211_send_coalesce_rules(msg, rdev))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_coalesce *coalesce = rdev->coalesce;
+ int i, j;
+ struct cfg80211_coalesce_rules *rule;
+
+ if (!coalesce)
+ return;
+
+ for (i = 0; i < coalesce->n_rules; i++) {
+ rule = &coalesce->rules[i];
+ for (j = 0; j < rule->n_patterns; j++)
+ kfree(rule->patterns[j].mask);
+ kfree(rule->patterns);
+ }
+ kfree(coalesce->rules);
+ kfree(coalesce);
+ rdev->coalesce = NULL;
+}
+
+static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
+ struct nlattr *rule,
+ struct cfg80211_coalesce_rules *new_rule)
+{
+ int err, i;
+ const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+ struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat;
+ int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
+ struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
+
+ err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
+ nla_len(rule), nl80211_coalesce_policy);
+ if (err)
+ return err;
+
+ if (tb[NL80211_ATTR_COALESCE_RULE_DELAY])
+ new_rule->delay =
+ nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]);
+ if (new_rule->delay > coalesce->max_delay)
+ return -EINVAL;
+
+ if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
+ new_rule->condition =
+ nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);
+ if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH &&
+ new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH)
+ return -EINVAL;
+
+ if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
+ return -EINVAL;
+
+ nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+ rem)
+ n_patterns++;
+ if (n_patterns > coalesce->n_patterns)
+ return -EINVAL;
+
+ new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]),
+ GFP_KERNEL);
+ if (!new_rule->patterns)
+ return -ENOMEM;
+
+ new_rule->n_patterns = n_patterns;
+ i = 0;
+
+ nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+ rem) {
+ u8 *mask_pat;
+
+ nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
+ nla_len(pat), NULL);
+ if (!pat_tb[NL80211_PKTPAT_MASK] ||
+ !pat_tb[NL80211_PKTPAT_PATTERN])
+ return -EINVAL;
+ pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
+ mask_len = DIV_ROUND_UP(pat_len, 8);
+ if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
+ return -EINVAL;
+ if (pat_len > coalesce->pattern_max_len ||
+ pat_len < coalesce->pattern_min_len)
+ return -EINVAL;
+
+ if (!pat_tb[NL80211_PKTPAT_OFFSET])
+ pkt_offset = 0;
+ else
+ pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]);
+ if (pkt_offset > coalesce->max_pkt_offset)
+ return -EINVAL;
+ new_rule->patterns[i].pkt_offset = pkt_offset;
+
+ mask_pat = kmalloc(mask_len + pat_len, GFP_KERNEL);
+ if (!mask_pat)
+ return -ENOMEM;
+
+ new_rule->patterns[i].mask = mask_pat;
+ memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_MASK]),
+ mask_len);
+
+ mask_pat += mask_len;
+ new_rule->patterns[i].pattern = mask_pat;
+ new_rule->patterns[i].pattern_len = pat_len;
+ memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_PATTERN]),
+ pat_len);
+ i++;
+ }
+
+ return 0;
+}
+
+static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+ struct cfg80211_coalesce new_coalesce = {};
+ struct cfg80211_coalesce *n_coalesce;
+ int err, rem_rule, n_rules = 0, i, j;
+ struct nlattr *rule;
+ struct cfg80211_coalesce_rules *tmp_rule;
+
+ if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) {
+ cfg80211_rdev_free_coalesce(rdev);
+ rdev->ops->set_coalesce(&rdev->wiphy, NULL);
+ return 0;
+ }
+
+ nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+ rem_rule)
+ n_rules++;
+ if (n_rules > coalesce->n_rules)
+ return -EINVAL;
+
+ new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]),
+ GFP_KERNEL);
+ if (!new_coalesce.rules)
+ return -ENOMEM;
+
+ new_coalesce.n_rules = n_rules;
+ i = 0;
+
+ nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+ rem_rule) {
+ err = nl80211_parse_coalesce_rule(rdev, rule,
+ &new_coalesce.rules[i]);
+ if (err)
+ goto error;
+
+ i++;
+ }
+
+ err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce);
+ if (err)
+ goto error;
+
+ n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL);
+ if (!n_coalesce) {
+ err = -ENOMEM;
+ goto error;
+ }
+ cfg80211_rdev_free_coalesce(rdev);
+ rdev->coalesce = n_coalesce;
+
+ return 0;
+error:
+ for (i = 0; i < new_coalesce.n_rules; i++) {
+ tmp_rule = &new_coalesce.rules[i];
+ for (j = 0; j < tmp_rule->n_patterns; j++)
+ kfree(tmp_rule->patterns[j].mask);
+ kfree(tmp_rule->patterns);
+ }
+ kfree(new_coalesce.rules);
+
+ return err;
+}
+
+static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct nlattr *tb[NUM_NL80211_REKEY_DATA];
+ struct cfg80211_gtk_rekey_data rekey_data;
+ int err;
+
+ if (!info->attrs[NL80211_ATTR_REKEY_DATA])
+ return -EINVAL;
+
+ err = nla_parse(tb, MAX_NL80211_REKEY_DATA,
+ nla_data(info->attrs[NL80211_ATTR_REKEY_DATA]),
+ nla_len(info->attrs[NL80211_ATTR_REKEY_DATA]),
+ nl80211_rekey_policy);
+ if (err)
+ return err;
+
+ if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN)
+ return -ERANGE;
+ if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN)
+ return -ERANGE;
+ if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN)
+ return -ERANGE;
+
+ rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]);
+ rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]);
+ rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]);
+
+ wdev_lock(wdev);
+ if (!wdev->current_bss) {
+ err = -ENOTCONN;
+ goto out;
+ }
+
+ if (!rdev->ops->set_rekey_data) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = rdev_set_rekey_data(rdev, dev, &rekey_data);
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+static int nl80211_register_unexpected_frame(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
+
+ if (wdev->ap_unexpected_nlportid)
+ return -EBUSY;
+
+ wdev->ap_unexpected_nlportid = info->snd_portid;
+ return 0;
+}
+
+static int nl80211_probe_client(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct sk_buff *msg;
+ void *hdr;
+ const u8 *addr;
+ u64 cookie;
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ if (!rdev->ops->probe_client)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_PROBE_CLIENT);
+ if (!hdr) {
+ err = -ENOBUFS;
+ goto free_msg;
+ }
+
+ addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ err = rdev_probe_client(rdev, dev, addr, &cookie);
+ if (err)
+ goto free_msg;
+
+ if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ err = -ENOBUFS;
+ free_msg:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int nl80211_register_beacons(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct cfg80211_beacon_registration *reg, *nreg;
+ int rv;
+
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS))
+ return -EOPNOTSUPP;
+
+ nreg = kzalloc(sizeof(*nreg), GFP_KERNEL);
+ if (!nreg)
+ return -ENOMEM;
+
+ /* First, check if already registered. */
+ spin_lock_bh(&rdev->beacon_registrations_lock);
+ list_for_each_entry(reg, &rdev->beacon_registrations, list) {
+ if (reg->nlportid == info->snd_portid) {
+ rv = -EALREADY;
+ goto out_err;
+ }
+ }
+ /* Add it to the list */
+ nreg->nlportid = info->snd_portid;
+ list_add(&nreg->list, &rdev->beacon_registrations);
+
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+
+ return 0;
+out_err:
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+ kfree(nreg);
+ return rv;
+}
+
+static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ int err;
+
+ if (!rdev->ops->start_p2p_device)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
+ return -EOPNOTSUPP;
+
+ if (wdev->p2p_started)
+ return 0;
+
+ if (rfkill_blocked(rdev->rfkill))
+ return -ERFKILL;
+
+ err = rdev_start_p2p_device(rdev, wdev);
+ if (err)
+ return err;
+
+ wdev->p2p_started = true;
+ rdev->opencount++;
+
+ return 0;
+}
+
+static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->stop_p2p_device)
+ return -EOPNOTSUPP;
+
+ cfg80211_stop_p2p_device(rdev, wdev);
+
+ return 0;
+}
+
+static int nl80211_get_protocol_features(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ void *hdr;
+ struct sk_buff *msg;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_PROTOCOL_FEATURES);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_PROTOCOL_FEATURES,
+ NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+ kfree_skb(msg);
+ return -ENOBUFS;
+}
+
+static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct cfg80211_update_ft_ies_params ft_params;
+ struct net_device *dev = info->user_ptr[1];
+
+ if (!rdev->ops->update_ft_ies)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_MDID] ||
+ !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ return -EINVAL;
+
+ memset(&ft_params, 0, sizeof(ft_params));
+ ft_params.md = nla_get_u16(info->attrs[NL80211_ATTR_MDID]);
+ ft_params.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+ ft_params.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+
+ return rdev_update_ft_ies(rdev, dev, &ft_params);
+}
+
+static int nl80211_crit_protocol_start(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+ enum nl80211_crit_proto_id proto = NL80211_CRIT_PROTO_UNSPEC;
+ u16 duration;
+ int ret;
+
+ if (!rdev->ops->crit_proto_start)
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!rdev->ops->crit_proto_stop))
+ return -EINVAL;
+
+ if (rdev->crit_proto_nlportid)
+ return -EBUSY;
+
+ /* determine protocol if provided */
+ if (info->attrs[NL80211_ATTR_CRIT_PROT_ID])
+ proto = nla_get_u16(info->attrs[NL80211_ATTR_CRIT_PROT_ID]);
+
+ if (proto >= NUM_NL80211_CRIT_PROTO)
+ return -EINVAL;
+
+ /* timeout must be provided */
+ if (!info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION])
+ return -EINVAL;
+
+ duration =
+ nla_get_u16(info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION]);
+
+ if (duration > NL80211_CRIT_PROTO_MAX_DURATION)
+ return -ERANGE;
+
+ ret = rdev_crit_proto_start(rdev, wdev, proto, duration);
+ if (!ret)
+ rdev->crit_proto_nlportid = info->snd_portid;
+
+ return ret;
+}
+
+static int nl80211_crit_protocol_stop(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ if (!rdev->ops->crit_proto_stop)
+ return -EOPNOTSUPP;
+
+ if (rdev->crit_proto_nlportid) {
+ rdev->crit_proto_nlportid = 0;
+ rdev_crit_proto_stop(rdev, wdev);
+ }
+ return 0;
+}
+
+static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev =
+ __cfg80211_wdev_from_attrs(genl_info_net(info), info->attrs);
+ int i, err;
+ u32 vid, subcmd;
+
+ if (!rdev->wiphy.vendor_commands)
+ return -EOPNOTSUPP;
+
+ if (IS_ERR(wdev)) {
+ err = PTR_ERR(wdev);
+ if (err != -EINVAL)
+ return err;
+ wdev = NULL;
+ } else if (wdev->wiphy != &rdev->wiphy) {
+ return -EINVAL;
+ }
+
+ if (!info->attrs[NL80211_ATTR_VENDOR_ID] ||
+ !info->attrs[NL80211_ATTR_VENDOR_SUBCMD])
+ return -EINVAL;
+
+ vid = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_ID]);
+ subcmd = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_SUBCMD]);
+ for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
+ const struct wiphy_vendor_command *vcmd;
+ void *data = NULL;
+ int len = 0;
+
+ vcmd = &rdev->wiphy.vendor_commands[i];
+
+ if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
+ continue;
+
+ if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
+ WIPHY_VENDOR_CMD_NEED_NETDEV)) {
+ if (!wdev)
+ return -EINVAL;
+ if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
+ !wdev->netdev)
+ return -EINVAL;
+
+ if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
+ if (wdev->netdev &&
+ !netif_running(wdev->netdev))
+ return -ENETDOWN;
+ if (!wdev->netdev && !wdev->p2p_started)
+ return -ENETDOWN;
+ }
+ } else {
+ wdev = NULL;
+ }
+
+ if (info->attrs[NL80211_ATTR_VENDOR_DATA]) {
+ data = nla_data(info->attrs[NL80211_ATTR_VENDOR_DATA]);
+ len = nla_len(info->attrs[NL80211_ATTR_VENDOR_DATA]);
+ }
+
+ rdev->cur_cmd_info = info;
+ err = rdev->wiphy.vendor_commands[i].doit(&rdev->wiphy, wdev,
+ data, len);
+ rdev->cur_cmd_info = NULL;
+ return err;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
+ enum nl80211_commands cmd,
+ enum nl80211_attrs attr,
+ int approxlen)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ if (WARN_ON(!rdev->cur_cmd_info))
+ return NULL;
+
+ return __cfg80211_alloc_vendor_skb(rdev, NULL, approxlen,
+ rdev->cur_cmd_info->snd_portid,
+ rdev->cur_cmd_info->snd_seq,
+ cmd, attr, NULL, GFP_KERNEL);
+}
+EXPORT_SYMBOL(__cfg80211_alloc_reply_skb);
+
+int cfg80211_vendor_cmd_reply(struct sk_buff *skb)
+{
+ struct cfg80211_registered_device *rdev = ((void **)skb->cb)[0];
+ void *hdr = ((void **)skb->cb)[1];
+ struct nlattr *data = ((void **)skb->cb)[2];
+
+ /* clear CB data for netlink core to own from now on */
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ if (WARN_ON(!rdev->cur_cmd_info)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ nla_nest_end(skb, data);
+ genlmsg_end(skb, hdr);
+ return genlmsg_reply(skb, rdev->cur_cmd_info);
+}
+EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply);
+
+
+static int nl80211_set_qos_map(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct cfg80211_qos_map *qos_map = NULL;
+ struct net_device *dev = info->user_ptr[1];
+ u8 *pos, len, num_des, des_len, des;
+ int ret;
+
+ if (!rdev->ops->set_qos_map)
+ return -EOPNOTSUPP;
+
+ if (info->attrs[NL80211_ATTR_QOS_MAP]) {
+ pos = nla_data(info->attrs[NL80211_ATTR_QOS_MAP]);
+ len = nla_len(info->attrs[NL80211_ATTR_QOS_MAP]);
+
+ if (len % 2 || len < IEEE80211_QOS_MAP_LEN_MIN ||
+ len > IEEE80211_QOS_MAP_LEN_MAX)
+ return -EINVAL;
+
+ qos_map = kzalloc(sizeof(struct cfg80211_qos_map), GFP_KERNEL);
+ if (!qos_map)
+ return -ENOMEM;
+
+ num_des = (len - IEEE80211_QOS_MAP_LEN_MIN) >> 1;
+ if (num_des) {
+ des_len = num_des *
+ sizeof(struct cfg80211_dscp_exception);
+ memcpy(qos_map->dscp_exception, pos, des_len);
+ qos_map->num_des = num_des;
+ for (des = 0; des < num_des; des++) {
+ if (qos_map->dscp_exception[des].up > 7) {
+ kfree(qos_map);
+ return -EINVAL;
+ }
+ }
+ pos += des_len;
+ }
+ memcpy(qos_map->up, pos, IEEE80211_QOS_MAP_LEN_MIN);
+ }
+
+ wdev_lock(dev->ieee80211_ptr);
+ ret = nl80211_key_allowed(dev->ieee80211_ptr);
+ if (!ret)
+ ret = rdev_set_qos_map(rdev, dev, qos_map);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ kfree(qos_map);
+ return ret;
+}
+
+static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *peer;
+ u8 tsid, up;
+ u16 admitted_time = 0;
+ int err;
+
+ if (!(rdev->wiphy.features & NL80211_FEATURE_SUPPORTS_WMM_ADMISSION))
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] ||
+ !info->attrs[NL80211_ATTR_USER_PRIO])
+ return -EINVAL;
+
+ tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]);
+ if (tsid >= IEEE80211_NUM_TIDS)
+ return -EINVAL;
+
+ up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]);
+ if (up >= IEEE80211_NUM_UPS)
+ return -EINVAL;
+
+ /* WMM uses TIDs 0-7 even for TSPEC */
+ if (tsid >= IEEE80211_FIRST_TSPEC_TSID) {
+ /* TODO: handle 802.11 TSPEC/admission control
+ * need more attributes for that (e.g. BA session requirement);
+ * change the WMM adminssion test above to allow both then
+ */
+ return -EINVAL;
+ }
+
+ peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) {
+ admitted_time =
+ nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]);
+ if (!admitted_time)
+ return -EINVAL;
+ }
+
+ wdev_lock(wdev);
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (wdev->current_bss)
+ break;
+ err = -ENOTCONN;
+ goto out;
+ default:
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time);
+
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *peer;
+ u8 tsid;
+ int err;
+
+ if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]);
+ peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ wdev_lock(wdev);
+ err = rdev_del_tx_ts(rdev, dev, tsid, peer);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static int nl80211_tdls_channel_switch(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_chan_def chandef = {};
+ const u8 *addr;
+ u8 oper_class;
+ int err;
+
+ if (!rdev->ops->tdls_channel_switch ||
+ !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
+ return -EOPNOTSUPP;
+
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL80211_ATTR_MAC] ||
+ !info->attrs[NL80211_ATTR_OPER_CLASS])
+ return -EINVAL;
+
+ err = nl80211_parse_chandef(rdev, info, &chandef);
+ if (err)
+ return err;
+
+ /*
+ * Don't allow wide channels on the 2.4Ghz band, as per IEEE802.11-2012
+ * section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the
+ * specification is not defined for them.
+ */
+ if (chandef.chan->band == IEEE80211_BAND_2GHZ &&
+ chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
+ chandef.width != NL80211_CHAN_WIDTH_20)
+ return -EINVAL;
+
+ /* we will be active on the TDLS link */
+ if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype))
+ return -EINVAL;
+
+ /* don't allow switching to DFS channels */
+ if (cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, wdev->iftype))
+ return -EINVAL;
+
+ addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ oper_class = nla_get_u8(info->attrs[NL80211_ATTR_OPER_CLASS]);
+
+ wdev_lock(wdev);
+ err = rdev_tdls_channel_switch(rdev, dev, addr, oper_class, &chandef);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *addr;
+
+ if (!rdev->ops->tdls_channel_switch ||
+ !rdev->ops->tdls_cancel_channel_switch ||
+ !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
+ return -EOPNOTSUPP;
+
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
+
+ addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ wdev_lock(wdev);
+ rdev_tdls_cancel_channel_switch(rdev, dev, addr);
+ wdev_unlock(wdev);
+
+ return 0;
+}
+
+#define NL80211_FLAG_NEED_WIPHY 0x01
+#define NL80211_FLAG_NEED_NETDEV 0x02
+#define NL80211_FLAG_NEED_RTNL 0x04
+#define NL80211_FLAG_CHECK_NETDEV_UP 0x08
+#define NL80211_FLAG_NEED_NETDEV_UP (NL80211_FLAG_NEED_NETDEV |\
+ NL80211_FLAG_CHECK_NETDEV_UP)
+#define NL80211_FLAG_NEED_WDEV 0x10
+/* If a netdev is associated, it must be UP, P2P must be started */
+#define NL80211_FLAG_NEED_WDEV_UP (NL80211_FLAG_NEED_WDEV |\
+ NL80211_FLAG_CHECK_NETDEV_UP)
+#define NL80211_FLAG_CLEAR_SKB 0x20
+
+static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ struct net_device *dev;
+ bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL;
+
+ if (rtnl)
+ rtnl_lock();
+
+ if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) {
+ rdev = cfg80211_get_dev_from_info(genl_info_net(info), info);
+ if (IS_ERR(rdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(rdev);
+ }
+ info->user_ptr[0] = rdev;
+ } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV ||
+ ops->internal_flags & NL80211_FLAG_NEED_WDEV) {
+ ASSERT_RTNL();
+
+ wdev = __cfg80211_wdev_from_attrs(genl_info_net(info),
+ info->attrs);
+ if (IS_ERR(wdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(wdev);
+ }
+
+ dev = wdev->netdev;
+ rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) {
+ if (!dev) {
+ if (rtnl)
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ info->user_ptr[1] = dev;
+ } else {
+ info->user_ptr[1] = wdev;
+ }
+
+ if (dev) {
+ if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
+ !netif_running(dev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+
+ dev_hold(dev);
+ } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) {
+ if (!wdev->p2p_started) {
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+ }
+
+ info->user_ptr[0] = rdev;
+ }
+
+ return 0;
+}
+
+static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ if (info->user_ptr[1]) {
+ if (ops->internal_flags & NL80211_FLAG_NEED_WDEV) {
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ if (wdev->netdev)
+ dev_put(wdev->netdev);
+ } else {
+ dev_put(info->user_ptr[1]);
+ }
+ }
+
+ if (ops->internal_flags & NL80211_FLAG_NEED_RTNL)
+ rtnl_unlock();
+
+ /* If needed, clear the netlink message payload from the SKB
+ * as it might contain key data that shouldn't stick around on
+ * the heap after the SKB is freed. The netlink message header
+ * is still needed for further processing, so leave it intact.
+ */
+ if (ops->internal_flags & NL80211_FLAG_CLEAR_SKB) {
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+
+ memset(nlmsg_data(nlh), 0, nlmsg_len(nlh));
+ }
+}
+
+static const struct genl_ops nl80211_ops[] = {
+ {
+ .cmd = NL80211_CMD_GET_WIPHY,
+ .doit = nl80211_get_wiphy,
+ .dumpit = nl80211_dump_wiphy,
+ .done = nl80211_dump_wiphy_done,
+ .policy = nl80211_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_WIPHY,
+ .doit = nl80211_set_wiphy,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_INTERFACE,
+ .doit = nl80211_get_interface,
+ .dumpit = nl80211_dump_interface,
+ .policy = nl80211_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_WDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_INTERFACE,
+ .doit = nl80211_set_interface,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_NEW_INTERFACE,
+ .doit = nl80211_new_interface,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_INTERFACE,
+ .doit = nl80211_del_interface,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_KEY,
+ .doit = nl80211_get_key,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_KEY,
+ .doit = nl80211_set_key,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
+ },
+ {
+ .cmd = NL80211_CMD_NEW_KEY,
+ .doit = nl80211_new_key,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_KEY,
+ .doit = nl80211_del_key,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_BEACON,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .doit = nl80211_set_beacon,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_START_AP,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .doit = nl80211_start_ap,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_STOP_AP,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .doit = nl80211_stop_ap,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_STATION,
+ .doit = nl80211_get_station,
+ .dumpit = nl80211_dump_station,
+ .policy = nl80211_policy,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_STATION,
+ .doit = nl80211_set_station,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_NEW_STATION,
+ .doit = nl80211_new_station,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_STATION,
+ .doit = nl80211_del_station,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_MPATH,
+ .doit = nl80211_get_mpath,
+ .dumpit = nl80211_dump_mpath,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_MPP,
+ .doit = nl80211_get_mpp,
+ .dumpit = nl80211_dump_mpp,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_MPATH,
+ .doit = nl80211_set_mpath,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_NEW_MPATH,
+ .doit = nl80211_new_mpath,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_MPATH,
+ .doit = nl80211_del_mpath,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_BSS,
+ .doit = nl80211_set_bss,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_REG,
+ .doit = nl80211_get_reg_do,
+ .dumpit = nl80211_get_reg_dump,
+ .policy = nl80211_policy,
+ .internal_flags = NL80211_FLAG_NEED_RTNL,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = NL80211_CMD_SET_REG,
+ .doit = nl80211_set_reg,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_REQ_SET_REG,
+ .doit = nl80211_req_set_reg,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NL80211_CMD_GET_MESH_CONFIG,
+ .doit = nl80211_get_mesh_config,
+ .policy = nl80211_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_MESH_CONFIG,
+ .doit = nl80211_update_mesh_config,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_TRIGGER_SCAN,
+ .doit = nl80211_trigger_scan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_SCAN,
+ .policy = nl80211_policy,
+ .dumpit = nl80211_dump_scan,
+ },
+ {
+ .cmd = NL80211_CMD_START_SCHED_SCAN,
+ .doit = nl80211_start_sched_scan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_STOP_SCHED_SCAN,
+ .doit = nl80211_stop_sched_scan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_AUTHENTICATE,
+ .doit = nl80211_authenticate,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
+ },
+ {
+ .cmd = NL80211_CMD_ASSOCIATE,
+ .doit = nl80211_associate,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEAUTHENTICATE,
+ .doit = nl80211_deauthenticate,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DISASSOCIATE,
+ .doit = nl80211_disassociate,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_JOIN_IBSS,
+ .doit = nl80211_join_ibss,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_LEAVE_IBSS,
+ .doit = nl80211_leave_ibss,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+#ifdef CONFIG_NL80211_TESTMODE
+ {
+ .cmd = NL80211_CMD_TESTMODE,
+ .doit = nl80211_testmode_do,
+ .dumpit = nl80211_testmode_dump,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+#endif
+ {
+ .cmd = NL80211_CMD_CONNECT,
+ .doit = nl80211_connect,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DISCONNECT,
+ .doit = nl80211_disconnect,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_WIPHY_NETNS,
+ .doit = nl80211_wiphy_netns,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_SURVEY,
+ .policy = nl80211_policy,
+ .dumpit = nl80211_dump_survey,
+ },
+ {
+ .cmd = NL80211_CMD_SET_PMKSA,
+ .doit = nl80211_setdel_pmksa,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_PMKSA,
+ .doit = nl80211_setdel_pmksa,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_FLUSH_PMKSA,
+ .doit = nl80211_flush_pmksa,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
+ .doit = nl80211_remain_on_channel,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
+ .doit = nl80211_cancel_remain_on_channel,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
+ .doit = nl80211_set_tx_bitrate_mask,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_REGISTER_FRAME,
+ .doit = nl80211_register_mgmt,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_FRAME,
+ .doit = nl80211_tx_mgmt,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_FRAME_WAIT_CANCEL,
+ .doit = nl80211_tx_mgmt_cancel_wait,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_POWER_SAVE,
+ .doit = nl80211_set_power_save,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_POWER_SAVE,
+ .doit = nl80211_get_power_save,
+ .policy = nl80211_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_CQM,
+ .doit = nl80211_set_cqm,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_CHANNEL,
+ .doit = nl80211_set_channel,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_WDS_PEER,
+ .doit = nl80211_set_wds_peer,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_JOIN_MESH,
+ .doit = nl80211_join_mesh,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_LEAVE_MESH,
+ .doit = nl80211_leave_mesh,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_JOIN_OCB,
+ .doit = nl80211_join_ocb,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_LEAVE_OCB,
+ .doit = nl80211_leave_ocb,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+#ifdef CONFIG_PM
+ {
+ .cmd = NL80211_CMD_GET_WOWLAN,
+ .doit = nl80211_get_wowlan,
+ .policy = nl80211_policy,
+ /* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_WOWLAN,
+ .doit = nl80211_set_wowlan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+#endif
+ {
+ .cmd = NL80211_CMD_SET_REKEY_OFFLOAD,
+ .doit = nl80211_set_rekey_data,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL |
+ NL80211_FLAG_CLEAR_SKB,
+ },
+ {
+ .cmd = NL80211_CMD_TDLS_MGMT,
+ .doit = nl80211_tdls_mgmt,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_TDLS_OPER,
+ .doit = nl80211_tdls_oper,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_UNEXPECTED_FRAME,
+ .doit = nl80211_register_unexpected_frame,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_PROBE_CLIENT,
+ .doit = nl80211_probe_client,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_REGISTER_BEACONS,
+ .doit = nl80211_register_beacons,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_NOACK_MAP,
+ .doit = nl80211_set_noack_map,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_START_P2P_DEVICE,
+ .doit = nl80211_start_p2p_device,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_STOP_P2P_DEVICE,
+ .doit = nl80211_stop_p2p_device,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_MCAST_RATE,
+ .doit = nl80211_set_mcast_rate,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_MAC_ACL,
+ .doit = nl80211_set_mac_acl,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_RADAR_DETECT,
+ .doit = nl80211_start_radar_detection,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES,
+ .doit = nl80211_get_protocol_features,
+ .policy = nl80211_policy,
+ },
+ {
+ .cmd = NL80211_CMD_UPDATE_FT_IES,
+ .doit = nl80211_update_ft_ies,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_CRIT_PROTOCOL_START,
+ .doit = nl80211_crit_protocol_start,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP,
+ .doit = nl80211_crit_protocol_stop,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_GET_COALESCE,
+ .doit = nl80211_get_coalesce,
+ .policy = nl80211_policy,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_COALESCE,
+ .doit = nl80211_set_coalesce,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_CHANNEL_SWITCH,
+ .doit = nl80211_channel_switch,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_VENDOR,
+ .doit = nl80211_vendor_cmd,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_QOS_MAP,
+ .doit = nl80211_set_qos_map,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_ADD_TX_TS,
+ .doit = nl80211_add_tx_ts,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_DEL_TX_TS,
+ .doit = nl80211_del_tx_ts,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH,
+ .doit = nl80211_tdls_channel_switch,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH,
+ .doit = nl80211_tdls_cancel_channel_switch,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+};
+
+/* notification functions */
+
+void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
+ enum nl80211_commands cmd)
+{
+ struct sk_buff *msg;
+ struct nl80211_dump_wiphy_state state = {};
+
+ WARN_ON(cmd != NL80211_CMD_NEW_WIPHY &&
+ cmd != NL80211_CMD_DEL_WIPHY);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ if (nl80211_send_wiphy(rdev, cmd, msg, 0, 0, 0, &state) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int nl80211_add_scan_req(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev)
+{
+ struct cfg80211_scan_request *req = rdev->scan_req;
+ struct nlattr *nest;
+ int i;
+
+ if (WARN_ON(!req))
+ return 0;
+
+ nest = nla_nest_start(msg, NL80211_ATTR_SCAN_SSIDS);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < req->n_ssids; i++) {
+ if (nla_put(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid))
+ goto nla_put_failure;
+ }
+ nla_nest_end(msg, nest);
+
+ nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
+ if (!nest)
+ goto nla_put_failure;
+ for (i = 0; i < req->n_channels; i++) {
+ if (nla_put_u32(msg, i, req->channels[i]->center_freq))
+ goto nla_put_failure;
+ }
+ nla_nest_end(msg, nest);
+
+ if (req->ie &&
+ nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie))
+ goto nla_put_failure;
+
+ if (req->flags &&
+ nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
+ goto nla_put_failure;
+
+ return 0;
+ nla_put_failure:
+ return -ENOBUFS;
+}
+
+static int nl80211_send_scan_msg(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ u32 portid, u32 seq, int flags,
+ u32 cmd)
+{
+ void *hdr;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex)) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto nla_put_failure;
+
+ /* ignore errors and send incomplete event anyway */
+ nl80211_add_scan_req(msg, rdev);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int
+nl80211_send_sched_scan_msg(struct sk_buff *msg,
+ struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ u32 portid, u32 seq, int flags, u32 cmd)
+{
+ void *hdr;
+
+ hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -1;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ struct sk_buff *msg;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+ NL80211_CMD_TRIGGER_SCAN) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_SCAN, GFP_KERNEL);
+}
+
+struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, bool aborted)
+{
+ struct sk_buff *msg;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return NULL;
+
+ if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0,
+ aborted ? NL80211_CMD_SCAN_ABORTED :
+ NL80211_CMD_NEW_SCAN_RESULTS) < 0) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ return msg;
+}
+
+void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ if (!msg)
+ return;
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_SCAN, GFP_KERNEL);
+}
+
+void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev)
+{
+ struct sk_buff *msg;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0,
+ NL80211_CMD_SCHED_SCAN_RESULTS) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_SCAN, GFP_KERNEL);
+}
+
+void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u32 cmd)
+{
+ struct sk_buff *msg;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_SCAN, GFP_KERNEL);
+}
+
+static bool nl80211_reg_change_event_fill(struct sk_buff *msg,
+ struct regulatory_request *request)
+{
+ /* Userspace can always count this one always being set */
+ if (nla_put_u8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator))
+ goto nla_put_failure;
+
+ if (request->alpha2[0] == '0' && request->alpha2[1] == '0') {
+ if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_WORLD))
+ goto nla_put_failure;
+ } else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') {
+ if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_CUSTOM_WORLD))
+ goto nla_put_failure;
+ } else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
+ request->intersect) {
+ if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_INTERSECTION))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_COUNTRY) ||
+ nla_put_string(msg, NL80211_ATTR_REG_ALPHA2,
+ request->alpha2))
+ goto nla_put_failure;
+ }
+
+ if (request->wiphy_idx != WIPHY_IDX_INVALID) {
+ struct wiphy *wiphy = wiphy_idx_to_wiphy(request->wiphy_idx);
+
+ if (wiphy &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx))
+ goto nla_put_failure;
+
+ if (wiphy &&
+ wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
+ nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
+ goto nla_put_failure;
+ }
+
+ return true;
+
+nla_put_failure:
+ return false;
+}
+
+/*
+ * This can happen on global regulatory changes or device specific settings
+ * based on custom regulatory domains.
+ */
+void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
+ struct regulatory_request *request)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, cmd_id);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nl80211_reg_change_event_fill(msg, request) == false)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ rcu_read_lock();
+ genlmsg_multicast_allns(&nl80211_fam, msg, 0,
+ NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *buf, size_t len,
+ enum nl80211_commands cmd, gfp_t gfp,
+ int uapsd_queues)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_FRAME, len, buf))
+ goto nla_put_failure;
+
+ if (uapsd_queues >= 0) {
+ struct nlattr *nla_wmm =
+ nla_nest_start(msg, NL80211_ATTR_STA_WME);
+ if (!nla_wmm)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL80211_STA_WME_UAPSD_QUEUES,
+ uapsd_queues))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla_wmm);
+ }
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *buf,
+ size_t len, gfp_t gfp)
+{
+ nl80211_send_mlme_event(rdev, netdev, buf, len,
+ NL80211_CMD_AUTHENTICATE, gfp, -1);
+}
+
+void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *buf,
+ size_t len, gfp_t gfp, int uapsd_queues)
+{
+ nl80211_send_mlme_event(rdev, netdev, buf, len,
+ NL80211_CMD_ASSOCIATE, gfp, uapsd_queues);
+}
+
+void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *buf,
+ size_t len, gfp_t gfp)
+{
+ nl80211_send_mlme_event(rdev, netdev, buf, len,
+ NL80211_CMD_DEAUTHENTICATE, gfp, -1);
+}
+
+void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *buf,
+ size_t len, gfp_t gfp)
+{
+ nl80211_send_mlme_event(rdev, netdev, buf, len,
+ NL80211_CMD_DISASSOCIATE, gfp, -1);
+}
+
+void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf,
+ size_t len)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ const struct ieee80211_mgmt *mgmt = (void *)buf;
+ u32 cmd;
+
+ if (WARN_ON(len < 2))
+ return;
+
+ if (ieee80211_is_deauth(mgmt->frame_control))
+ cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE;
+ else
+ cmd = NL80211_CMD_UNPROT_DISASSOCIATE;
+
+ trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len);
+ nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1);
+}
+EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt);
+
+static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, int cmd,
+ const u8 *addr, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *addr,
+ gfp_t gfp)
+{
+ nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_AUTHENTICATE,
+ addr, gfp);
+}
+
+void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *addr,
+ gfp_t gfp)
+{
+ nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_ASSOCIATE,
+ addr, gfp);
+}
+
+void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len,
+ u16 status, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONNECT);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ (bssid && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) ||
+ nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status) ||
+ (req_ie &&
+ nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
+ (resp_ie &&
+ nla_put(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+
+}
+
+void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ROAM);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid) ||
+ (req_ie &&
+ nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) ||
+ (resp_ie &&
+ nla_put(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+
+}
+
+void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u16 reason,
+ const u8 *ie, size_t ie_len, bool from_ap)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DISCONNECT);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ (from_ap && reason &&
+ nla_put_u16(msg, NL80211_ATTR_REASON_CODE, reason)) ||
+ (from_ap &&
+ nla_put_flag(msg, NL80211_ATTR_DISCONNECTED_BY_AP)) ||
+ (ie && nla_put(msg, NL80211_ATTR_IE, ie_len, ie)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, GFP_KERNEL);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+
+}
+
+void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_JOIN_IBSS);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
+ const u8* ie, u8 ie_len, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT))
+ return;
+
+ trace_cfg80211_notify_new_peer_candidate(dev, addr);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NEW_PEER_CANDIDATE);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
+ (ie_len && ie &&
+ nla_put(msg, NL80211_ATTR_IE, ie_len , ie)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate);
+
+void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *addr,
+ enum nl80211_key_type key_type, int key_id,
+ const u8 *tsc, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_MICHAEL_MIC_FAILURE);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ (addr && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr)) ||
+ nla_put_u32(msg, NL80211_ATTR_KEY_TYPE, key_type) ||
+ (key_id != -1 &&
+ nla_put_u8(msg, NL80211_ATTR_KEY_IDX, key_id)) ||
+ (tsc && nla_put(msg, NL80211_ATTR_KEY_SEQ, 6, tsc)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
+ struct ieee80211_channel *channel_before,
+ struct ieee80211_channel *channel_after)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ struct nlattr *nl_freq;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_BEACON_HINT);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ /*
+ * Since we are applying the beacon hint to a wiphy we know its
+ * wiphy_idx is valid
+ */
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
+ goto nla_put_failure;
+
+ /* Before */
+ nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE);
+ if (!nl_freq)
+ goto nla_put_failure;
+ if (nl80211_msg_put_channel(msg, channel_before, false))
+ goto nla_put_failure;
+ nla_nest_end(msg, nl_freq);
+
+ /* After */
+ nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER);
+ if (!nl_freq)
+ goto nla_put_failure;
+ if (nl80211_msg_put_channel(msg, channel_after, false))
+ goto nla_put_failure;
+ nla_nest_end(msg, nl_freq);
+
+ genlmsg_end(msg, hdr);
+
+ rcu_read_lock();
+ genlmsg_multicast_allns(&nl80211_fam, msg, 0,
+ NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+static void nl80211_send_remain_on_chan_event(
+ int cmd, struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ unsigned int duration, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex)) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE,
+ NL80211_CHAN_NO_HT) ||
+ nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie))
+ goto nla_put_failure;
+
+ if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL &&
+ nla_put_u32(msg, NL80211_ATTR_DURATION, duration))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ unsigned int duration, gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_ready_on_channel(wdev, cookie, chan, duration);
+ nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL,
+ rdev, wdev, cookie, chan,
+ duration, gfp);
+}
+EXPORT_SYMBOL(cfg80211_ready_on_channel);
+
+void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_ready_on_channel_expired(wdev, cookie, chan);
+ nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
+ rdev, wdev, cookie, chan, 0, gfp);
+}
+EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);
+
+void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
+ struct station_info *sinfo, gfp_t gfp)
+{
+ struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+
+ trace_cfg80211_new_sta(dev, mac_addr, sinfo);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0,
+ rdev, dev, mac_addr, sinfo) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+}
+EXPORT_SYMBOL(cfg80211_new_sta);
+
+void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
+ struct station_info *sinfo, gfp_t gfp)
+{
+ struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ struct station_info empty_sinfo = {};
+
+ if (!sinfo)
+ sinfo = &empty_sinfo;
+
+ trace_cfg80211_del_sta(dev, mac_addr);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
+ rdev, dev, mac_addr, sinfo) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+}
+EXPORT_SYMBOL(cfg80211_del_sta_sinfo);
+
+void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
+ enum nl80211_connect_failed_reason reason,
+ gfp_t gfp)
+{
+ struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONN_FAILED);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr) ||
+ nla_put_u32(msg, NL80211_ATTR_CONN_FAILED_REASON, reason))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_conn_failed);
+
+static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
+ const u8 *addr, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+ u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid);
+
+ if (!nlportid)
+ return false;
+
+ msg = nlmsg_new(100, gfp);
+ if (!msg)
+ return true;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return true;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
+ return true;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+ return true;
+}
+
+bool cfg80211_rx_spurious_frame(struct net_device *dev,
+ const u8 *addr, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ bool ret;
+
+ trace_cfg80211_rx_spurious_frame(dev, addr);
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO)) {
+ trace_cfg80211_return_bool(false);
+ return false;
+ }
+ ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME,
+ addr, gfp);
+ trace_cfg80211_return_bool(ret);
+ return ret;
+}
+EXPORT_SYMBOL(cfg80211_rx_spurious_frame);
+
+bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
+ const u8 *addr, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ bool ret;
+
+ trace_cfg80211_rx_unexpected_4addr_frame(dev, addr);
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO &&
+ wdev->iftype != NL80211_IFTYPE_AP_VLAN)) {
+ trace_cfg80211_return_bool(false);
+ return false;
+ }
+ ret = __nl80211_unexpected_frame(dev,
+ NL80211_CMD_UNEXPECTED_4ADDR_FRAME,
+ addr, gfp);
+ trace_cfg80211_return_bool(ret);
+ return ret;
+}
+EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);
+
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u32 nlportid,
+ int freq, int sig_dbm,
+ const u8 *buf, size_t len, u32 flags, gfp_t gfp)
+{
+ struct net_device *netdev = wdev->netdev;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return -ENOMEM;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ netdev->ifindex)) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) ||
+ (sig_dbm &&
+ nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
+ nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
+ (flags &&
+ nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
+void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
+ const u8 *buf, size_t len, bool ack, gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct net_device *netdev = wdev->netdev;
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_mgmt_tx_status(wdev, cookie, ack);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ netdev->ifindex)) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
+ nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
+ nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) ||
+ (ack && nla_put_flag(msg, NL80211_ATTR_ACK)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
+
+static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev,
+ const char *mac, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ void **cb;
+
+ if (!msg)
+ return NULL;
+
+ cb = (void **)msg->cb;
+
+ cb[0] = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NOTIFY_CQM);
+ if (!cb[0]) {
+ nlmsg_free(msg);
+ return NULL;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ if (mac && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac))
+ goto nla_put_failure;
+
+ cb[1] = nla_nest_start(msg, NL80211_ATTR_CQM);
+ if (!cb[1])
+ goto nla_put_failure;
+
+ cb[2] = rdev;
+
+ return msg;
+ nla_put_failure:
+ nlmsg_free(msg);
+ return NULL;
+}
+
+static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
+{
+ void **cb = (void **)msg->cb;
+ struct cfg80211_registered_device *rdev = cb[2];
+
+ nla_nest_end(msg, cb[1]);
+ genlmsg_end(msg, cb[0]);
+
+ memset(msg->cb, 0, sizeof(msg->cb));
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+}
+
+void cfg80211_cqm_rssi_notify(struct net_device *dev,
+ enum nl80211_cqm_rssi_threshold_event rssi_event,
+ gfp_t gfp)
+{
+ struct sk_buff *msg;
+
+ trace_cfg80211_cqm_rssi_notify(dev, rssi_event);
+
+ if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW &&
+ rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH))
+ return;
+
+ msg = cfg80211_prepare_cqm(dev, NULL, gfp);
+ if (!msg)
+ return;
+
+ if (nla_put_u32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT,
+ rssi_event))
+ goto nla_put_failure;
+
+ cfg80211_send_cqm(msg, gfp);
+
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_cqm_rssi_notify);
+
+void cfg80211_cqm_txe_notify(struct net_device *dev,
+ const u8 *peer, u32 num_packets,
+ u32 rate, u32 intvl, gfp_t gfp)
+{
+ struct sk_buff *msg;
+
+ msg = cfg80211_prepare_cqm(dev, peer, gfp);
+ if (!msg)
+ return;
+
+ if (nla_put_u32(msg, NL80211_ATTR_CQM_TXE_PKTS, num_packets))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_CQM_TXE_RATE, rate))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_CQM_TXE_INTVL, intvl))
+ goto nla_put_failure;
+
+ cfg80211_send_cqm(msg, gfp);
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_cqm_txe_notify);
+
+void cfg80211_cqm_pktloss_notify(struct net_device *dev,
+ const u8 *peer, u32 num_packets, gfp_t gfp)
+{
+ struct sk_buff *msg;
+
+ trace_cfg80211_cqm_pktloss_notify(dev, peer, num_packets);
+
+ msg = cfg80211_prepare_cqm(dev, peer, gfp);
+ if (!msg)
+ return;
+
+ if (nla_put_u32(msg, NL80211_ATTR_CQM_PKT_LOSS_EVENT, num_packets))
+ goto nla_put_failure;
+
+ cfg80211_send_cqm(msg, gfp);
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify);
+
+void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp)
+{
+ struct sk_buff *msg;
+
+ msg = cfg80211_prepare_cqm(dev, NULL, gfp);
+ if (!msg)
+ return;
+
+ if (nla_put_flag(msg, NL80211_ATTR_CQM_BEACON_LOSS_EVENT))
+ goto nla_put_failure;
+
+ cfg80211_send_cqm(msg, gfp);
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_cqm_beacon_loss_notify);
+
+static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *replay_ctr, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ struct nlattr *rekey_attr;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_SET_REKEY_OFFLOAD);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid))
+ goto nla_put_failure;
+
+ rekey_attr = nla_nest_start(msg, NL80211_ATTR_REKEY_DATA);
+ if (!rekey_attr)
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL80211_REKEY_DATA_REPLAY_CTR,
+ NL80211_REPLAY_CTR_LEN, replay_ctr))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, rekey_attr);
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
+ const u8 *replay_ctr, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_gtk_rekey_notify(dev, bssid);
+ nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp);
+}
+EXPORT_SYMBOL(cfg80211_gtk_rekey_notify);
+
+static void
+nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, int index,
+ const u8 *bssid, bool preauth, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ struct nlattr *attr;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PMKSA_CANDIDATE);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ attr = nla_nest_start(msg, NL80211_ATTR_PMKSA_CANDIDATE);
+ if (!attr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_PMKSA_CANDIDATE_INDEX, index) ||
+ nla_put(msg, NL80211_PMKSA_CANDIDATE_BSSID, ETH_ALEN, bssid) ||
+ (preauth &&
+ nla_put_flag(msg, NL80211_PMKSA_CANDIDATE_PREAUTH)))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
+ const u8 *bssid, bool preauth, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_pmksa_candidate_notify(dev, index, bssid, preauth);
+ nl80211_pmksa_candidate_notify(rdev, dev, index, bssid, preauth, gfp);
+}
+EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify);
+
+static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ struct cfg80211_chan_def *chandef,
+ gfp_t gfp,
+ enum nl80211_commands notif,
+ u8 count)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, notif);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ if (nl80211_send_chandef(msg, chandef))
+ goto nla_put_failure;
+
+ if ((notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) &&
+ (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_ch_switch_notify(struct net_device *dev,
+ struct cfg80211_chan_def *chandef)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ trace_cfg80211_ch_switch_notify(dev, chandef);
+
+ wdev->chandef = *chandef;
+ wdev->preset_chandef = *chandef;
+ nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL,
+ NL80211_CMD_CH_SWITCH_NOTIFY, 0);
+}
+EXPORT_SYMBOL(cfg80211_ch_switch_notify);
+
+void cfg80211_ch_switch_started_notify(struct net_device *dev,
+ struct cfg80211_chan_def *chandef,
+ u8 count)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ trace_cfg80211_ch_switch_started_notify(dev, chandef);
+
+ nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL,
+ NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count);
+}
+EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);
+
+void
+nl80211_radar_notify(struct cfg80211_registered_device *rdev,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_radar_event event,
+ struct net_device *netdev, gfp_t gfp)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_RADAR_DETECT);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
+ goto nla_put_failure;
+
+ /* NOP and radar events don't need a netdev parameter */
+ if (netdev) {
+ struct wireless_dev *wdev = netdev->ieee80211_ptr;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_RADAR_EVENT, event))
+ goto nla_put_failure;
+
+ if (nl80211_send_chandef(msg, chandef))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
+void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
+ u64 cookie, bool acked, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_probe_status(dev, addr, cookie, acked);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PROBE_CLIENT);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
+ nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) ||
+ (acked && nla_put_flag(msg, NL80211_ATTR_ACK)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_probe_status);
+
+void cfg80211_report_obss_beacon(struct wiphy *wiphy,
+ const u8 *frame, size_t len,
+ int freq, int sig_dbm)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+ struct cfg80211_beacon_registration *reg;
+
+ trace_cfg80211_report_obss_beacon(wiphy, frame, len, freq, sig_dbm);
+
+ spin_lock_bh(&rdev->beacon_registrations_lock);
+ list_for_each_entry(reg, &rdev->beacon_registrations, list) {
+ msg = nlmsg_new(len + 100, GFP_ATOMIC);
+ if (!msg) {
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+ return;
+ }
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ (freq &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) ||
+ (sig_dbm &&
+ nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
+ nla_put(msg, NL80211_ATTR_FRAME, len, frame))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, reg->nlportid);
+ }
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+ return;
+
+ nla_put_failure:
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+ if (hdr)
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_report_obss_beacon);
+
+#ifdef CONFIG_PM
+static int cfg80211_net_detect_results(struct sk_buff *msg,
+ struct cfg80211_wowlan_wakeup *wakeup)
+{
+ struct cfg80211_wowlan_nd_info *nd = wakeup->net_detect;
+ struct nlattr *nl_results, *nl_match, *nl_freqs;
+ int i, j;
+
+ nl_results = nla_nest_start(
+ msg, NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS);
+ if (!nl_results)
+ return -EMSGSIZE;
+
+ for (i = 0; i < nd->n_matches; i++) {
+ struct cfg80211_wowlan_nd_match *match = nd->matches[i];
+
+ nl_match = nla_nest_start(msg, i);
+ if (!nl_match)
+ break;
+
+ /* The SSID attribute is optional in nl80211, but for
+ * simplicity reasons it's always present in the
+ * cfg80211 structure. If a driver can't pass the
+ * SSID, that needs to be changed. A zero length SSID
+ * is still a valid SSID (wildcard), so it cannot be
+ * used for this purpose.
+ */
+ if (nla_put(msg, NL80211_ATTR_SSID, match->ssid.ssid_len,
+ match->ssid.ssid)) {
+ nla_nest_cancel(msg, nl_match);
+ goto out;
+ }
+
+ if (match->n_channels) {
+ nl_freqs = nla_nest_start(
+ msg, NL80211_ATTR_SCAN_FREQUENCIES);
+ if (!nl_freqs) {
+ nla_nest_cancel(msg, nl_match);
+ goto out;
+ }
+
+ for (j = 0; j < match->n_channels; j++) {
+ if (nla_put_u32(msg, j, match->channels[j])) {
+ nla_nest_cancel(msg, nl_freqs);
+ nla_nest_cancel(msg, nl_match);
+ goto out;
+ }
+ }
+
+ nla_nest_end(msg, nl_freqs);
+ }
+
+ nla_nest_end(msg, nl_match);
+ }
+
+out:
+ nla_nest_end(msg, nl_results);
+ return 0;
+}
+
+void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
+ struct cfg80211_wowlan_wakeup *wakeup,
+ gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+ int size = 200;
+
+ trace_cfg80211_report_wowlan_wakeup(wdev->wiphy, wdev, wakeup);
+
+ if (wakeup)
+ size += wakeup->packet_present_len;
+
+ msg = nlmsg_new(size, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_SET_WOWLAN);
+ if (!hdr)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto free_msg;
+
+ if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
+ wdev->netdev->ifindex))
+ goto free_msg;
+
+ if (wakeup) {
+ struct nlattr *reasons;
+
+ reasons = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS);
+ if (!reasons)
+ goto free_msg;
+
+ if (wakeup->disconnect &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT))
+ goto free_msg;
+ if (wakeup->magic_pkt &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT))
+ goto free_msg;
+ if (wakeup->gtk_rekey_failure &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE))
+ goto free_msg;
+ if (wakeup->eap_identity_req &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST))
+ goto free_msg;
+ if (wakeup->four_way_handshake &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE))
+ goto free_msg;
+ if (wakeup->rfkill_release &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))
+ goto free_msg;
+
+ if (wakeup->pattern_idx >= 0 &&
+ nla_put_u32(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN,
+ wakeup->pattern_idx))
+ goto free_msg;
+
+ if (wakeup->tcp_match &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH))
+ goto free_msg;
+
+ if (wakeup->tcp_connlost &&
+ nla_put_flag(msg, NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST))
+ goto free_msg;
+
+ if (wakeup->tcp_nomoretokens &&
+ nla_put_flag(msg,
+ NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS))
+ goto free_msg;
+
+ if (wakeup->packet) {
+ u32 pkt_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211;
+ u32 len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN;
+
+ if (!wakeup->packet_80211) {
+ pkt_attr =
+ NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023;
+ len_attr =
+ NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN;
+ }
+
+ if (wakeup->packet_len &&
+ nla_put_u32(msg, len_attr, wakeup->packet_len))
+ goto free_msg;
+
+ if (nla_put(msg, pkt_attr, wakeup->packet_present_len,
+ wakeup->packet))
+ goto free_msg;
+ }
+
+ if (wakeup->net_detect &&
+ cfg80211_net_detect_results(msg, wakeup))
+ goto free_msg;
+
+ nla_nest_end(msg, reasons);
+ }
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ free_msg:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_report_wowlan_wakeup);
+#endif
+
+void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
+ enum nl80211_tdls_operation oper,
+ u16 reason_code, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_tdls_oper_request(wdev->wiphy, dev, peer, oper,
+ reason_code);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_TDLS_OPER);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+ nla_put_u8(msg, NL80211_ATTR_TDLS_OPERATION, oper) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer) ||
+ (reason_code > 0 &&
+ nla_put_u16(msg, NL80211_ATTR_REASON_CODE, reason_code)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, gfp);
+ return;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_tdls_oper_request);
+
+static int nl80211_netlink_notify(struct notifier_block * nb,
+ unsigned long state,
+ void *_notify)
+{
+ struct netlink_notify *notify = _notify;
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ struct cfg80211_beacon_registration *reg, *tmp;
+
+ if (state != NETLINK_URELEASE)
+ return NOTIFY_DONE;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
+ bool schedule_destroy_work = false;
+ bool schedule_scan_stop = false;
+ struct cfg80211_sched_scan_request *sched_scan_req =
+ rcu_dereference(rdev->sched_scan_req);
+
+ if (sched_scan_req && notify->portid &&
+ sched_scan_req->owner_nlportid == notify->portid)
+ schedule_scan_stop = true;
+
+ list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) {
+ cfg80211_mlme_unregister_socket(wdev, notify->portid);
+
+ if (wdev->owner_nlportid == notify->portid)
+ schedule_destroy_work = true;
+ }
+
+ spin_lock_bh(&rdev->beacon_registrations_lock);
+ list_for_each_entry_safe(reg, tmp, &rdev->beacon_registrations,
+ list) {
+ if (reg->nlportid == notify->portid) {
+ list_del(&reg->list);
+ kfree(reg);
+ break;
+ }
+ }
+ spin_unlock_bh(&rdev->beacon_registrations_lock);
+
+ if (schedule_destroy_work) {
+ struct cfg80211_iface_destroy *destroy;
+
+ destroy = kzalloc(sizeof(*destroy), GFP_ATOMIC);
+ if (destroy) {
+ destroy->nlportid = notify->portid;
+ spin_lock(&rdev->destroy_list_lock);
+ list_add(&destroy->list, &rdev->destroy_list);
+ spin_unlock(&rdev->destroy_list_lock);
+ schedule_work(&rdev->destroy_work);
+ }
+ } else if (schedule_scan_stop) {
+ sched_scan_req->owner_nlportid = 0;
+
+ if (rdev->ops->sched_scan_stop &&
+ rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+ schedule_work(&rdev->sched_scan_stop_wk);
+ }
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It is possible that the user space process that is controlling the
+ * indoor setting disappeared, so notify the regulatory core.
+ */
+ regulatory_netlink_notify(notify->portid);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block nl80211_netlink_notifier = {
+ .notifier_call = nl80211_netlink_notify,
+};
+
+void cfg80211_ft_event(struct net_device *netdev,
+ struct cfg80211_ft_event_params *ft_event)
+{
+ struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_ft_event(wiphy, netdev, ft_event);
+
+ if (!ft_event->target_ap)
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FT_EVENT);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, ft_event->target_ap))
+ goto out;
+
+ if (ft_event->ies &&
+ nla_put(msg, NL80211_ATTR_IE, ft_event->ies_len, ft_event->ies))
+ goto out;
+ if (ft_event->ric_ies &&
+ nla_put(msg, NL80211_ATTR_IE_RIC, ft_event->ric_ies_len,
+ ft_event->ric_ies))
+ goto out;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, GFP_KERNEL);
+ return;
+ out:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_ft_event);
+
+void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp)
+{
+ struct cfg80211_registered_device *rdev;
+ struct sk_buff *msg;
+ void *hdr;
+ u32 nlportid;
+
+ rdev = wiphy_to_rdev(wdev->wiphy);
+ if (!rdev->crit_proto_nlportid)
+ return;
+
+ nlportid = rdev->crit_proto_nlportid;
+ rdev->crit_proto_nlportid = 0;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CRIT_PROTOCOL_STOP);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
+ return;
+
+ nla_put_failure:
+ if (hdr)
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+
+}
+EXPORT_SYMBOL(cfg80211_crit_proto_stopped);
+
+void nl80211_send_ap_stopped(struct wireless_dev *wdev)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STOP_AP);
+ if (!hdr)
+ goto out;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) ||
+ nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)))
+ goto out;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), msg, 0,
+ NL80211_MCGRP_MLME, GFP_KERNEL);
+ return;
+ out:
+ nlmsg_free(msg);
+}
+
+/* initialisation/exit functions */
+
+int nl80211_init(void)
+{
+ int err;
+
+ err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops,
+ nl80211_mcgrps);
+ if (err)
+ return err;
+
+ err = netlink_register_notifier(&nl80211_netlink_notifier);
+ if (err)
+ goto err_out;
+
+ return 0;
+ err_out:
+ genl_unregister_family(&nl80211_fam);
+ return err;
+}
+
+void nl80211_exit(void)
+{
+ netlink_unregister_notifier(&nl80211_netlink_notifier);
+ genl_unregister_family(&nl80211_fam);
+}
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
new file mode 100644
index 000000000..84d4edf1d
--- /dev/null
+++ b/net/wireless/nl80211.h
@@ -0,0 +1,97 @@
+#ifndef __NET_WIRELESS_NL80211_H
+#define __NET_WIRELESS_NL80211_H
+
+#include "core.h"
+
+int nl80211_init(void);
+void nl80211_exit(void);
+void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
+ enum nl80211_commands cmd);
+void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, bool aborted);
+void nl80211_send_scan_result(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg);
+void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u32 cmd);
+void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev);
+void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
+ struct regulatory_request *request);
+
+static inline void
+nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+ nl80211_common_reg_change_event(NL80211_CMD_REG_CHANGE, request);
+}
+
+static inline void
+nl80211_send_wiphy_reg_change_event(struct regulatory_request *request)
+{
+ nl80211_common_reg_change_event(NL80211_CMD_WIPHY_REG_CHANGE, request);
+}
+
+void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *buf, size_t len, gfp_t gfp,
+ int uapsd_queues);
+void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *addr, gfp_t gfp);
+void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ const u8 *addr, gfp_t gfp);
+void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len,
+ u16 status, gfp_t gfp);
+void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp);
+void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u16 reason,
+ const u8 *ie, size_t ie_len, bool from_ap);
+
+void
+nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *addr,
+ enum nl80211_key_type key_type,
+ int key_id, const u8 *tsc, gfp_t gfp);
+
+void
+nl80211_send_beacon_hint_event(struct wiphy *wiphy,
+ struct ieee80211_channel *channel_before,
+ struct ieee80211_channel *channel_after);
+
+void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, const u8 *bssid,
+ gfp_t gfp);
+
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u32 nlpid,
+ int freq, int sig_dbm,
+ const u8 *buf, size_t len, u32 flags, gfp_t gfp);
+
+void
+nl80211_radar_notify(struct cfg80211_registered_device *rdev,
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_radar_event event,
+ struct net_device *netdev, gfp_t gfp);
+
+void nl80211_send_ap_stopped(struct wireless_dev *wdev);
+
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);
+
+#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c
new file mode 100644
index 000000000..c00d4a792
--- /dev/null
+++ b/net/wireless/ocb.c
@@ -0,0 +1,88 @@
+/*
+ * OCB mode implementation
+ *
+ * Copyright: (c) 2014 Czech Technical University in Prague
+ * (c) 2014 Volkswagen Group Research
+ * Author: Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
+ * Funded by: Volkswagen Group Research
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+#include "rdev-ops.h"
+
+int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ocb_setup *setup)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!setup->chandef.chan))
+ return -EINVAL;
+
+ err = rdev_join_ocb(rdev, dev, setup);
+ if (!err)
+ wdev->chandef = setup->chandef;
+
+ return err;
+}
+
+int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ocb_setup *setup)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_join_ocb(rdev, dev, setup);
+ wdev_unlock(wdev);
+
+ return err;
+}
+
+int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->leave_ocb)
+ return -EOPNOTSUPP;
+
+ err = rdev_leave_ocb(rdev, dev);
+ if (!err)
+ memset(&wdev->chandef, 0, sizeof(wdev->chandef));
+
+ return err;
+}
+
+int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ wdev_lock(wdev);
+ err = __cfg80211_leave_ocb(rdev, dev);
+ wdev_unlock(wdev);
+
+ return err;
+}
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
new file mode 100644
index 000000000..722da6164
--- /dev/null
+++ b/net/wireless/radiotap.c
@@ -0,0 +1,369 @@
+/*
+ * Radiotap parser
+ *
+ * Copyright 2007 Andy Green <andy@warmcat.com>
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Alternatively, this software may be distributed under the terms of BSD
+ * license.
+ *
+ * See COPYING for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include <net/ieee80211_radiotap.h>
+#include <asm/unaligned.h>
+
+/* function prototypes and related defs are in include/net/cfg80211.h */
+
+static const struct radiotap_align_size rtap_namespace_sizes[] = {
+ [IEEE80211_RADIOTAP_TSFT] = { .align = 8, .size = 8, },
+ [IEEE80211_RADIOTAP_FLAGS] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_RATE] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_CHANNEL] = { .align = 2, .size = 4, },
+ [IEEE80211_RADIOTAP_FHSS] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_DBM_ANTSIGNAL] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_DBM_ANTNOISE] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_LOCK_QUALITY] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_TX_ATTENUATION] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_DB_TX_ATTENUATION] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_DBM_TX_POWER] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_ANTENNA] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_DB_ANTSIGNAL] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_DB_ANTNOISE] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_RX_FLAGS] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_TX_FLAGS] = { .align = 2, .size = 2, },
+ [IEEE80211_RADIOTAP_RTS_RETRIES] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, },
+ [IEEE80211_RADIOTAP_MCS] = { .align = 1, .size = 3, },
+ [IEEE80211_RADIOTAP_AMPDU_STATUS] = { .align = 4, .size = 8, },
+ /*
+ * add more here as they are defined in radiotap.h
+ */
+};
+
+static const struct ieee80211_radiotap_namespace radiotap_ns = {
+ .n_bits = ARRAY_SIZE(rtap_namespace_sizes),
+ .align_size = rtap_namespace_sizes,
+};
+
+/**
+ * ieee80211_radiotap_iterator_init - radiotap parser iterator initialization
+ * @iterator: radiotap_iterator to initialize
+ * @radiotap_header: radiotap header to parse
+ * @max_length: total length we can parse into (eg, whole packet length)
+ *
+ * Returns: 0 or a negative error code if there is a problem.
+ *
+ * This function initializes an opaque iterator struct which can then
+ * be passed to ieee80211_radiotap_iterator_next() to visit every radiotap
+ * argument which is present in the header. It knows about extended
+ * present headers and handles them.
+ *
+ * How to use:
+ * call __ieee80211_radiotap_iterator_init() to init a semi-opaque iterator
+ * struct ieee80211_radiotap_iterator (no need to init the struct beforehand)
+ * checking for a good 0 return code. Then loop calling
+ * __ieee80211_radiotap_iterator_next()... it returns either 0,
+ * -ENOENT if there are no more args to parse, or -EINVAL if there is a problem.
+ * The iterator's @this_arg member points to the start of the argument
+ * associated with the current argument index that is present, which can be
+ * found in the iterator's @this_arg_index member. This arg index corresponds
+ * to the IEEE80211_RADIOTAP_... defines.
+ *
+ * Radiotap header length:
+ * You can find the CPU-endian total radiotap header length in
+ * iterator->max_length after executing ieee80211_radiotap_iterator_init()
+ * successfully.
+ *
+ * Alignment Gotcha:
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned. Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ *
+ * Example code:
+ * See Documentation/networking/radiotap-headers.txt
+ */
+
+int ieee80211_radiotap_iterator_init(
+ struct ieee80211_radiotap_iterator *iterator,
+ struct ieee80211_radiotap_header *radiotap_header,
+ int max_length, const struct ieee80211_radiotap_vendor_namespaces *vns)
+{
+ /* check the radiotap header can actually be present */
+ if (max_length < sizeof(struct ieee80211_radiotap_header))
+ return -EINVAL;
+
+ /* Linux only supports version 0 radiotap format */
+ if (radiotap_header->it_version)
+ return -EINVAL;
+
+ /* sanity check for allowed length and radiotap length field */
+ if (max_length < get_unaligned_le16(&radiotap_header->it_len))
+ return -EINVAL;
+
+ iterator->_rtheader = radiotap_header;
+ iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len);
+ iterator->_arg_index = 0;
+ iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present);
+ iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header);
+ iterator->_reset_on_ext = 0;
+ iterator->_next_bitmap = &radiotap_header->it_present;
+ iterator->_next_bitmap++;
+ iterator->_vns = vns;
+ iterator->current_namespace = &radiotap_ns;
+ iterator->is_radiotap_ns = 1;
+
+ /* find payload start allowing for extended bitmap(s) */
+
+ if (iterator->_bitmap_shifter & (1<<IEEE80211_RADIOTAP_EXT)) {
+ if ((unsigned long)iterator->_arg -
+ (unsigned long)iterator->_rtheader + sizeof(uint32_t) >
+ (unsigned long)iterator->_max_length)
+ return -EINVAL;
+ while (get_unaligned_le32(iterator->_arg) &
+ (1 << IEEE80211_RADIOTAP_EXT)) {
+ iterator->_arg += sizeof(uint32_t);
+
+ /*
+ * check for insanity where the present bitmaps
+ * keep claiming to extend up to or even beyond the
+ * stated radiotap header length
+ */
+
+ if ((unsigned long)iterator->_arg -
+ (unsigned long)iterator->_rtheader +
+ sizeof(uint32_t) >
+ (unsigned long)iterator->_max_length)
+ return -EINVAL;
+ }
+
+ iterator->_arg += sizeof(uint32_t);
+
+ /*
+ * no need to check again for blowing past stated radiotap
+ * header length, because ieee80211_radiotap_iterator_next
+ * checks it before it is dereferenced
+ */
+ }
+
+ iterator->this_arg = iterator->_arg;
+
+ /* we are all initialized happily */
+
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_radiotap_iterator_init);
+
+static void find_ns(struct ieee80211_radiotap_iterator *iterator,
+ uint32_t oui, uint8_t subns)
+{
+ int i;
+
+ iterator->current_namespace = NULL;
+
+ if (!iterator->_vns)
+ return;
+
+ for (i = 0; i < iterator->_vns->n_ns; i++) {
+ if (iterator->_vns->ns[i].oui != oui)
+ continue;
+ if (iterator->_vns->ns[i].subns != subns)
+ continue;
+
+ iterator->current_namespace = &iterator->_vns->ns[i];
+ break;
+ }
+}
+
+
+
+/**
+ * ieee80211_radiotap_iterator_next - return next radiotap parser iterator arg
+ * @iterator: radiotap_iterator to move to next arg (if any)
+ *
+ * Returns: 0 if there is an argument to handle,
+ * -ENOENT if there are no more args or -EINVAL
+ * if there is something else wrong.
+ *
+ * This function provides the next radiotap arg index (IEEE80211_RADIOTAP_*)
+ * in @this_arg_index and sets @this_arg to point to the
+ * payload for the field. It takes care of alignment handling and extended
+ * present fields. @this_arg can be changed by the caller (eg,
+ * incremented to move inside a compound argument like
+ * IEEE80211_RADIOTAP_CHANNEL). The args pointed to are in
+ * little-endian format whatever the endianess of your CPU.
+ *
+ * Alignment Gotcha:
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned. Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ */
+
+int ieee80211_radiotap_iterator_next(
+ struct ieee80211_radiotap_iterator *iterator)
+{
+ while (1) {
+ int hit = 0;
+ int pad, align, size, subns;
+ uint32_t oui;
+
+ /* if no more EXT bits, that's it */
+ if ((iterator->_arg_index % 32) == IEEE80211_RADIOTAP_EXT &&
+ !(iterator->_bitmap_shifter & 1))
+ return -ENOENT;
+
+ if (!(iterator->_bitmap_shifter & 1))
+ goto next_entry; /* arg not present */
+
+ /* get alignment/size of data */
+ switch (iterator->_arg_index % 32) {
+ case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
+ case IEEE80211_RADIOTAP_EXT:
+ align = 1;
+ size = 0;
+ break;
+ case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
+ align = 2;
+ size = 6;
+ break;
+ default:
+ if (!iterator->current_namespace ||
+ iterator->_arg_index >= iterator->current_namespace->n_bits) {
+ if (iterator->current_namespace == &radiotap_ns)
+ return -ENOENT;
+ align = 0;
+ } else {
+ align = iterator->current_namespace->align_size[iterator->_arg_index].align;
+ size = iterator->current_namespace->align_size[iterator->_arg_index].size;
+ }
+ if (!align) {
+ /* skip all subsequent data */
+ iterator->_arg = iterator->_next_ns_data;
+ /* give up on this namespace */
+ iterator->current_namespace = NULL;
+ goto next_entry;
+ }
+ break;
+ }
+
+ /*
+ * arg is present, account for alignment padding
+ *
+ * Note that these alignments are relative to the start
+ * of the radiotap header. There is no guarantee
+ * that the radiotap header itself is aligned on any
+ * kind of boundary.
+ *
+ * The above is why get_unaligned() is used to dereference
+ * multibyte elements from the radiotap area.
+ */
+
+ pad = ((unsigned long)iterator->_arg -
+ (unsigned long)iterator->_rtheader) & (align - 1);
+
+ if (pad)
+ iterator->_arg += align - pad;
+
+ if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) {
+ int vnslen;
+
+ if ((unsigned long)iterator->_arg + size -
+ (unsigned long)iterator->_rtheader >
+ (unsigned long)iterator->_max_length)
+ return -EINVAL;
+
+ oui = (*iterator->_arg << 16) |
+ (*(iterator->_arg + 1) << 8) |
+ *(iterator->_arg + 2);
+ subns = *(iterator->_arg + 3);
+
+ find_ns(iterator, oui, subns);
+
+ vnslen = get_unaligned_le16(iterator->_arg + 4);
+ iterator->_next_ns_data = iterator->_arg + size + vnslen;
+ if (!iterator->current_namespace)
+ size += vnslen;
+ }
+
+ /*
+ * this is what we will return to user, but we need to
+ * move on first so next call has something fresh to test
+ */
+ iterator->this_arg_index = iterator->_arg_index;
+ iterator->this_arg = iterator->_arg;
+ iterator->this_arg_size = size;
+
+ /* internally move on the size of this arg */
+ iterator->_arg += size;
+
+ /*
+ * check for insanity where we are given a bitmap that
+ * claims to have more arg content than the length of the
+ * radiotap section. We will normally end up equalling this
+ * max_length on the last arg, never exceeding it.
+ */
+
+ if ((unsigned long)iterator->_arg -
+ (unsigned long)iterator->_rtheader >
+ (unsigned long)iterator->_max_length)
+ return -EINVAL;
+
+ /* these special ones are valid in each bitmap word */
+ switch (iterator->_arg_index % 32) {
+ case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
+ iterator->_reset_on_ext = 1;
+
+ iterator->is_radiotap_ns = 0;
+ /*
+ * If parser didn't register this vendor
+ * namespace with us, allow it to show it
+ * as 'raw. Do do that, set argument index
+ * to vendor namespace.
+ */
+ iterator->this_arg_index =
+ IEEE80211_RADIOTAP_VENDOR_NAMESPACE;
+ if (!iterator->current_namespace)
+ hit = 1;
+ goto next_entry;
+ case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
+ iterator->_reset_on_ext = 1;
+ iterator->current_namespace = &radiotap_ns;
+ iterator->is_radiotap_ns = 1;
+ goto next_entry;
+ case IEEE80211_RADIOTAP_EXT:
+ /*
+ * bit 31 was set, there is more
+ * -- move to next u32 bitmap
+ */
+ iterator->_bitmap_shifter =
+ get_unaligned_le32(iterator->_next_bitmap);
+ iterator->_next_bitmap++;
+ if (iterator->_reset_on_ext)
+ iterator->_arg_index = 0;
+ else
+ iterator->_arg_index++;
+ iterator->_reset_on_ext = 0;
+ break;
+ default:
+ /* we've got a hit! */
+ hit = 1;
+ next_entry:
+ iterator->_bitmap_shifter >>= 1;
+ iterator->_arg_index++;
+ }
+
+ /* if we found a valid arg earlier, return it now */
+ if (hit)
+ return 0;
+ }
+}
+EXPORT_SYMBOL(ieee80211_radiotap_iterator_next);
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
new file mode 100644
index 000000000..c6e83a746
--- /dev/null
+++ b/net/wireless/rdev-ops.h
@@ -0,0 +1,1021 @@
+#ifndef __CFG80211_RDEV_OPS
+#define __CFG80211_RDEV_OPS
+
+#include <linux/rtnetlink.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "trace.h"
+
+static inline int rdev_suspend(struct cfg80211_registered_device *rdev,
+ struct cfg80211_wowlan *wowlan)
+{
+ int ret;
+ trace_rdev_suspend(&rdev->wiphy, wowlan);
+ ret = rdev->ops->suspend(&rdev->wiphy, wowlan);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_resume(struct cfg80211_registered_device *rdev)
+{
+ int ret;
+ trace_rdev_resume(&rdev->wiphy);
+ ret = rdev->ops->resume(&rdev->wiphy);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev,
+ bool enabled)
+{
+ trace_rdev_set_wakeup(&rdev->wiphy, enabled);
+ rdev->ops->set_wakeup(&rdev->wiphy, enabled);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline struct wireless_dev
+*rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name,
+ unsigned char name_assign_type,
+ enum nl80211_iftype type, u32 *flags,
+ struct vif_params *params)
+{
+ struct wireless_dev *ret;
+ trace_rdev_add_virtual_intf(&rdev->wiphy, name, type);
+ ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type,
+ type, flags, params);
+ trace_rdev_return_wdev(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_del_virtual_intf(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ int ret;
+ trace_rdev_del_virtual_intf(&rdev->wiphy, wdev);
+ ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_change_virtual_intf(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, enum nl80211_iftype type,
+ u32 *flags, struct vif_params *params)
+{
+ int ret;
+ trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type);
+ ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, flags,
+ params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_add_key(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr,
+ struct key_params *params)
+{
+ int ret;
+ trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr);
+ ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise,
+ mac_addr, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev,
+ u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie,
+ void (*callback)(void *cookie, struct key_params*))
+{
+ int ret;
+ trace_rdev_get_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr);
+ ret = rdev->ops->get_key(&rdev->wiphy, netdev, key_index, pairwise,
+ mac_addr, cookie, callback);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_del_key(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr)
+{
+ int ret;
+ trace_rdev_del_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr);
+ ret = rdev->ops->del_key(&rdev->wiphy, netdev, key_index, pairwise,
+ mac_addr);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_default_key(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 key_index, bool unicast,
+ bool multicast)
+{
+ int ret;
+ trace_rdev_set_default_key(&rdev->wiphy, netdev, key_index,
+ unicast, multicast);
+ ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, key_index,
+ unicast, multicast);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 key_index)
+{
+ int ret;
+ trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, key_index);
+ ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev,
+ key_index);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ap_settings *settings)
+{
+ int ret;
+ trace_rdev_start_ap(&rdev->wiphy, dev, settings);
+ ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_beacon_data *info)
+{
+ int ret;
+ trace_rdev_change_beacon(&rdev->wiphy, dev, info);
+ ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ int ret;
+ trace_rdev_stop_ap(&rdev->wiphy, dev);
+ ret = rdev->ops->stop_ap(&rdev->wiphy, dev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_add_station(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *mac,
+ struct station_parameters *params)
+{
+ int ret;
+ trace_rdev_add_station(&rdev->wiphy, dev, mac, params);
+ ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_del_station(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct station_del_parameters *params)
+{
+ int ret;
+ trace_rdev_del_station(&rdev->wiphy, dev, params);
+ ret = rdev->ops->del_station(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_change_station(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *mac,
+ struct station_parameters *params)
+{
+ int ret;
+ trace_rdev_change_station(&rdev->wiphy, dev, mac, params);
+ ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_get_station(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *mac,
+ struct station_info *sinfo)
+{
+ int ret;
+ trace_rdev_get_station(&rdev->wiphy, dev, mac);
+ ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo);
+ trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo);
+ return ret;
+}
+
+static inline int rdev_dump_station(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, int idx, u8 *mac,
+ struct station_info *sinfo)
+{
+ int ret;
+ trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac);
+ ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo);
+ trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo);
+ return ret;
+}
+
+static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *dst, u8 *next_hop)
+{
+ int ret;
+ trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop);
+ ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *dst)
+{
+ int ret;
+ trace_rdev_del_mpath(&rdev->wiphy, dev, dst);
+ ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *dst,
+ u8 *next_hop)
+{
+ int ret;
+ trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop);
+ ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *dst, u8 *next_hop,
+ struct mpath_info *pinfo)
+{
+ int ret;
+ trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop);
+ ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo);
+ trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo);
+ return ret;
+
+}
+
+static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *dst, u8 *mpp,
+ struct mpath_info *pinfo)
+{
+ int ret;
+
+ trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp);
+ ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo);
+ trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo);
+ return ret;
+}
+
+static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, int idx, u8 *dst,
+ u8 *next_hop, struct mpath_info *pinfo)
+
+{
+ int ret;
+ trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop);
+ ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop,
+ pinfo);
+ trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo);
+ return ret;
+}
+
+static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, int idx, u8 *dst,
+ u8 *mpp, struct mpath_info *pinfo)
+
+{
+ int ret;
+
+ trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp);
+ ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo);
+ trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo);
+ return ret;
+}
+
+static inline int
+rdev_get_mesh_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, struct mesh_config *conf)
+{
+ int ret;
+ trace_rdev_get_mesh_config(&rdev->wiphy, dev);
+ ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf);
+ trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf);
+ return ret;
+}
+
+static inline int
+rdev_update_mesh_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u32 mask,
+ const struct mesh_config *nconf)
+{
+ int ret;
+ trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf);
+ ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ const struct mesh_config *conf,
+ const struct mesh_setup *setup)
+{
+ int ret;
+ trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup);
+ ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+
+static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ int ret;
+ trace_rdev_leave_mesh(&rdev->wiphy, dev);
+ ret = rdev->ops->leave_mesh(&rdev->wiphy, dev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ocb_setup *setup)
+{
+ int ret;
+ trace_rdev_join_ocb(&rdev->wiphy, dev, setup);
+ ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ int ret;
+ trace_rdev_leave_ocb(&rdev->wiphy, dev);
+ ret = rdev->ops->leave_ocb(&rdev->wiphy, dev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_change_bss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct bss_parameters *params)
+
+{
+ int ret;
+ trace_rdev_change_bss(&rdev->wiphy, dev, params);
+ ret = rdev->ops->change_bss(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_txq_params *params)
+
+{
+ int ret;
+ trace_rdev_set_txq_params(&rdev->wiphy, dev, params);
+ ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan)
+{
+ int ret;
+ trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan);
+ ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_monitor_channel(struct cfg80211_registered_device *rdev,
+ struct cfg80211_chan_def *chandef)
+{
+ int ret;
+ trace_rdev_set_monitor_channel(&rdev->wiphy, chandef);
+ ret = rdev->ops->set_monitor_channel(&rdev->wiphy, chandef);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_scan(struct cfg80211_registered_device *rdev,
+ struct cfg80211_scan_request *request)
+{
+ int ret;
+ trace_rdev_scan(&rdev->wiphy, request);
+ ret = rdev->ops->scan(&rdev->wiphy, request);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_auth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_auth_request *req)
+{
+ int ret;
+ trace_rdev_auth(&rdev->wiphy, dev, req);
+ ret = rdev->ops->auth(&rdev->wiphy, dev, req);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_assoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_assoc_request *req)
+{
+ int ret;
+ trace_rdev_assoc(&rdev->wiphy, dev, req);
+ ret = rdev->ops->assoc(&rdev->wiphy, dev, req);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_deauth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_deauth_request *req)
+{
+ int ret;
+ trace_rdev_deauth(&rdev->wiphy, dev, req);
+ ret = rdev->ops->deauth(&rdev->wiphy, dev, req);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_disassoc(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_disassoc_request *req)
+{
+ int ret;
+ trace_rdev_disassoc(&rdev->wiphy, dev, req);
+ ret = rdev->ops->disassoc(&rdev->wiphy, dev, req);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_connect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_connect_params *sme)
+{
+ int ret;
+ trace_rdev_connect(&rdev->wiphy, dev, sme);
+ ret = rdev->ops->connect(&rdev->wiphy, dev, sme);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u16 reason_code)
+{
+ int ret;
+ trace_rdev_disconnect(&rdev->wiphy, dev, reason_code);
+ ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ibss_params *params)
+{
+ int ret;
+ trace_rdev_join_ibss(&rdev->wiphy, dev, params);
+ ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ int ret;
+ trace_rdev_leave_ibss(&rdev->wiphy, dev);
+ ret = rdev->ops->leave_ibss(&rdev->wiphy, dev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed)
+{
+ int ret;
+ trace_rdev_set_wiphy_params(&rdev->wiphy, changed);
+ ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_tx_power_setting type, int mbm)
+{
+ int ret;
+ trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm);
+ ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, int *dbm)
+{
+ int ret;
+ trace_rdev_get_tx_power(&rdev->wiphy, wdev);
+ ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm);
+ trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm);
+ return ret;
+}
+
+static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *addr)
+{
+ int ret;
+ trace_rdev_set_wds_peer(&rdev->wiphy, dev, addr);
+ ret = rdev->ops->set_wds_peer(&rdev->wiphy, dev, addr);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
+{
+ trace_rdev_rfkill_poll(&rdev->wiphy);
+ rdev->ops->rfkill_poll(&rdev->wiphy);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+
+#ifdef CONFIG_NL80211_TESTMODE
+static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ void *data, int len)
+{
+ int ret;
+ trace_rdev_testmode_cmd(&rdev->wiphy, wdev);
+ ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb, void *data,
+ int len)
+{
+ int ret;
+ trace_rdev_testmode_dump(&rdev->wiphy);
+ ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+#endif
+
+static inline int
+rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *peer,
+ const struct cfg80211_bitrate_mask *mask)
+{
+ int ret;
+ trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, peer, mask);
+ ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, peer, mask);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, int idx,
+ struct survey_info *info)
+{
+ int ret;
+ trace_rdev_dump_survey(&rdev->wiphy, netdev, idx);
+ ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info);
+ if (ret < 0)
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ else
+ trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info);
+ return ret;
+}
+
+static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ struct cfg80211_pmksa *pmksa)
+{
+ int ret;
+ trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa);
+ ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev,
+ struct cfg80211_pmksa *pmksa)
+{
+ int ret;
+ trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa);
+ ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev)
+{
+ int ret;
+ trace_rdev_flush_pmksa(&rdev->wiphy, netdev);
+ ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_remain_on_channel(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct ieee80211_channel *chan,
+ unsigned int duration, u64 *cookie)
+{
+ int ret;
+ trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration);
+ ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan,
+ duration, cookie);
+ trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie);
+ return ret;
+}
+
+static inline int
+rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ int ret;
+ trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie);
+ ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params,
+ u64 *cookie)
+{
+ int ret;
+ trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params);
+ ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie);
+ trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie);
+ return ret;
+}
+
+static inline int
+rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ int ret;
+ trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie);
+ ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool enabled,
+ int timeout)
+{
+ int ret;
+ trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout);
+ ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, s32 rssi_thold, u32 rssi_hyst)
+{
+ int ret;
+ trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold,
+ rssi_hyst);
+ ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold,
+ rssi_hyst);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u32 rate, u32 pkts, u32 intvl)
+{
+ int ret;
+ trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl);
+ ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts,
+ intvl);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void
+rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev, u16 frame_type, bool reg)
+{
+ trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
+ rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev,
+ u32 tx_ant, u32 rx_ant)
+{
+ int ret;
+ trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant);
+ ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev,
+ u32 *tx_ant, u32 *rx_ant)
+{
+ int ret;
+ trace_rdev_get_antenna(&rdev->wiphy);
+ ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant);
+ if (ret)
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ else
+ trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant,
+ *rx_ant);
+ return ret;
+}
+
+static inline int
+rdev_sched_scan_start(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_sched_scan_request *request)
+{
+ int ret;
+ trace_rdev_sched_scan_start(&rdev->wiphy, dev, request);
+ ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ int ret;
+ trace_rdev_sched_scan_stop(&rdev->wiphy, dev);
+ ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_gtk_rekey_data *data)
+{
+ int ret;
+ trace_rdev_set_rekey_data(&rdev->wiphy, dev);
+ ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *peer,
+ u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *buf, size_t len)
+{
+ int ret;
+ trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, action_code,
+ dialog_token, status_code, peer_capability,
+ initiator, buf, len);
+ ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, action_code,
+ dialog_token, status_code, peer_capability,
+ initiator, buf, len);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 *peer,
+ enum nl80211_tdls_operation oper)
+{
+ int ret;
+ trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper);
+ ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_probe_client(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *peer,
+ u64 *cookie)
+{
+ int ret;
+ trace_rdev_probe_client(&rdev->wiphy, dev, peer);
+ ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie);
+ trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie);
+ return ret;
+}
+
+static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u16 noack_map)
+{
+ int ret;
+ trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map);
+ ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_get_channel(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ struct cfg80211_chan_def *chandef)
+{
+ int ret;
+
+ trace_rdev_get_channel(&rdev->wiphy, wdev);
+ ret = rdev->ops->get_channel(&rdev->wiphy, wdev, chandef);
+ trace_rdev_return_chandef(&rdev->wiphy, ret, chandef);
+
+ return ret;
+}
+
+static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ int ret;
+
+ trace_rdev_start_p2p_device(&rdev->wiphy, wdev);
+ ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ trace_rdev_stop_p2p_device(&rdev->wiphy, wdev);
+ rdev->ops->stop_p2p_device(&rdev->wiphy, wdev);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_acl_data *params)
+{
+ int ret;
+
+ trace_rdev_set_mac_acl(&rdev->wiphy, dev, params);
+ ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_update_ft_ies_params *ftie)
+{
+ int ret;
+
+ trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie);
+ ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_crit_proto_id protocol,
+ u16 duration)
+{
+ int ret;
+
+ trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration);
+ ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev,
+ protocol, duration);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ trace_rdev_crit_proto_stop(&rdev->wiphy, wdev);
+ rdev->ops->crit_proto_stop(&rdev->wiphy, wdev);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_csa_settings *params)
+{
+ int ret;
+
+ trace_rdev_channel_switch(&rdev->wiphy, dev, params);
+ ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_qos_map *qos_map)
+{
+ int ret = -EOPNOTSUPP;
+
+ if (rdev->ops->set_qos_map) {
+ trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map);
+ ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ }
+
+ return ret;
+}
+
+static inline int
+rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, struct cfg80211_chan_def *chandef)
+{
+ int ret;
+
+ trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef);
+ ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+
+ return ret;
+}
+
+static inline int
+rdev_add_tx_ts(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 tsid, const u8 *peer,
+ u8 user_prio, u16 admitted_time)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer,
+ user_prio, admitted_time);
+ if (rdev->ops->add_tx_ts)
+ ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer,
+ user_prio, admitted_time);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+
+ return ret;
+}
+
+static inline int
+rdev_del_tx_ts(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u8 tsid, const u8 *peer)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer);
+ if (rdev->ops->del_tx_ts)
+ ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+
+ return ret;
+}
+
+static inline int
+rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *addr,
+ u8 oper_class, struct cfg80211_chan_def *chandef)
+{
+ int ret;
+
+ trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class,
+ chandef);
+ ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr,
+ oper_class, chandef);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline void
+rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *addr)
+{
+ trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr);
+ rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
+#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
new file mode 100644
index 000000000..0e347f888
--- /dev/null
+++ b/net/wireless/reg.c
@@ -0,0 +1,3205 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+/**
+ * DOC: Wireless regulatory infrastructure
+ *
+ * The usual implementation is for a driver to read a device EEPROM to
+ * determine which regulatory domain it should be operating under, then
+ * looking up the allowable channels in a driver-local table and finally
+ * registering those channels in the wiphy structure.
+ *
+ * Another set of compliance enforcement is for drivers to use their
+ * own compliance limits which can be stored on the EEPROM. The host
+ * driver or firmware may ensure these are used.
+ *
+ * In addition to all this we provide an extra layer of regulatory
+ * conformance. For drivers which do not have any regulatory
+ * information CRDA provides the complete regulatory solution.
+ * For others it provides a community effort on further restrictions
+ * to enhance compliance.
+ *
+ * Note: When number of rules --> infinity we will not be able to
+ * index on alpha2 any more, instead we'll probably have to
+ * rely on some SHA1 checksum of the regdomain for example.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/nl80211.h>
+#include <linux/platform_device.h>
+#include <linux/moduleparam.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "reg.h"
+#include "rdev-ops.h"
+#include "regdb.h"
+#include "nl80211.h"
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+#define REG_DBG_PRINT(format, args...) \
+ printk(KERN_DEBUG pr_fmt(format), ##args)
+#else
+#define REG_DBG_PRINT(args...)
+#endif
+
+/*
+ * Grace period we give before making sure all current interfaces reside on
+ * channels allowed by the current regulatory domain.
+ */
+#define REG_ENFORCE_GRACE_MS 60000
+
+/**
+ * enum reg_request_treatment - regulatory request treatment
+ *
+ * @REG_REQ_OK: continue processing the regulatory request
+ * @REG_REQ_IGNORE: ignore the regulatory request
+ * @REG_REQ_INTERSECT: the regulatory domain resulting from this request should
+ * be intersected with the current one.
+ * @REG_REQ_ALREADY_SET: the regulatory request will not change the current
+ * regulatory settings, and no further processing is required.
+ */
+enum reg_request_treatment {
+ REG_REQ_OK,
+ REG_REQ_IGNORE,
+ REG_REQ_INTERSECT,
+ REG_REQ_ALREADY_SET,
+};
+
+static struct regulatory_request core_request_world = {
+ .initiator = NL80211_REGDOM_SET_BY_CORE,
+ .alpha2[0] = '0',
+ .alpha2[1] = '0',
+ .intersect = false,
+ .processed = true,
+ .country_ie_env = ENVIRON_ANY,
+};
+
+/*
+ * Receipt of information from last regulatory request,
+ * protected by RTNL (and can be accessed with RCU protection)
+ */
+static struct regulatory_request __rcu *last_request =
+ (void __force __rcu *)&core_request_world;
+
+/* To trigger userspace events */
+static struct platform_device *reg_pdev;
+
+/*
+ * Central wireless core regulatory domains, we only need two,
+ * the current one and a world regulatory domain in case we have no
+ * information to give us an alpha2.
+ * (protected by RTNL, can be read under RCU)
+ */
+const struct ieee80211_regdomain __rcu *cfg80211_regdomain;
+
+/*
+ * Number of devices that registered to the core
+ * that support cellular base station regulatory hints
+ * (protected by RTNL)
+ */
+static int reg_num_devs_support_basehint;
+
+/*
+ * State variable indicating if the platform on which the devices
+ * are attached is operating in an indoor environment. The state variable
+ * is relevant for all registered devices.
+ */
+static bool reg_is_indoor;
+static spinlock_t reg_indoor_lock;
+
+/* Used to track the userspace process controlling the indoor setting */
+static u32 reg_is_indoor_portid;
+
+/* Max number of consecutive attempts to communicate with CRDA */
+#define REG_MAX_CRDA_TIMEOUTS 10
+
+static u32 reg_crda_timeouts;
+
+static const struct ieee80211_regdomain *get_cfg80211_regdom(void)
+{
+ return rtnl_dereference(cfg80211_regdomain);
+}
+
+const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy)
+{
+ return rtnl_dereference(wiphy->regd);
+}
+
+static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region)
+{
+ switch (dfs_region) {
+ case NL80211_DFS_UNSET:
+ return "unset";
+ case NL80211_DFS_FCC:
+ return "FCC";
+ case NL80211_DFS_ETSI:
+ return "ETSI";
+ case NL80211_DFS_JP:
+ return "JP";
+ }
+ return "Unknown";
+}
+
+enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy)
+{
+ const struct ieee80211_regdomain *regd = NULL;
+ const struct ieee80211_regdomain *wiphy_regd = NULL;
+
+ regd = get_cfg80211_regdom();
+ if (!wiphy)
+ goto out;
+
+ wiphy_regd = get_wiphy_regdom(wiphy);
+ if (!wiphy_regd)
+ goto out;
+
+ if (wiphy_regd->dfs_region == regd->dfs_region)
+ goto out;
+
+ REG_DBG_PRINT("%s: device specific dfs_region "
+ "(%s) disagrees with cfg80211's "
+ "central dfs_region (%s)\n",
+ dev_name(&wiphy->dev),
+ reg_dfs_region_str(wiphy_regd->dfs_region),
+ reg_dfs_region_str(regd->dfs_region));
+
+out:
+ return regd->dfs_region;
+}
+
+static void rcu_free_regdom(const struct ieee80211_regdomain *r)
+{
+ if (!r)
+ return;
+ kfree_rcu((struct ieee80211_regdomain *)r, rcu_head);
+}
+
+static struct regulatory_request *get_last_request(void)
+{
+ return rcu_dereference_rtnl(last_request);
+}
+
+/* Used to queue up regulatory hints */
+static LIST_HEAD(reg_requests_list);
+static spinlock_t reg_requests_lock;
+
+/* Used to queue up beacon hints for review */
+static LIST_HEAD(reg_pending_beacons);
+static spinlock_t reg_pending_beacons_lock;
+
+/* Used to keep track of processed beacon hints */
+static LIST_HEAD(reg_beacon_list);
+
+struct reg_beacon {
+ struct list_head list;
+ struct ieee80211_channel chan;
+};
+
+static void reg_check_chans_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work);
+
+static void reg_todo(struct work_struct *work);
+static DECLARE_WORK(reg_work, reg_todo);
+
+static void reg_timeout_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work);
+
+/* We keep a static world regulatory domain in case of the absence of CRDA */
+static const struct ieee80211_regdomain world_regdom = {
+ .n_reg_rules = 8,
+ .alpha2 = "00",
+ .reg_rules = {
+ /* IEEE 802.11b/g, channels 1..11 */
+ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
+ /* IEEE 802.11b/g, channels 12..13. */
+ REG_RULE(2467-10, 2472+10, 40, 6, 20,
+ NL80211_RRF_NO_IR),
+ /* IEEE 802.11 channel 14 - Only JP enables
+ * this and for 802.11b only */
+ REG_RULE(2484-10, 2484+10, 20, 6, 20,
+ NL80211_RRF_NO_IR |
+ NL80211_RRF_NO_OFDM),
+ /* IEEE 802.11a, channel 36..48 */
+ REG_RULE(5180-10, 5240+10, 160, 6, 20,
+ NL80211_RRF_NO_IR),
+
+ /* IEEE 802.11a, channel 52..64 - DFS required */
+ REG_RULE(5260-10, 5320+10, 160, 6, 20,
+ NL80211_RRF_NO_IR |
+ NL80211_RRF_DFS),
+
+ /* IEEE 802.11a, channel 100..144 - DFS required */
+ REG_RULE(5500-10, 5720+10, 160, 6, 20,
+ NL80211_RRF_NO_IR |
+ NL80211_RRF_DFS),
+
+ /* IEEE 802.11a, channel 149..165 */
+ REG_RULE(5745-10, 5825+10, 80, 6, 20,
+ NL80211_RRF_NO_IR),
+
+ /* IEEE 802.11ad (60gHz), channels 1..3 */
+ REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0),
+ }
+};
+
+/* protected by RTNL */
+static const struct ieee80211_regdomain *cfg80211_world_regdom =
+ &world_regdom;
+
+static char *ieee80211_regdom = "00";
+static char user_alpha2[2];
+
+module_param(ieee80211_regdom, charp, 0444);
+MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
+
+static void reg_free_request(struct regulatory_request *request)
+{
+ if (request != get_last_request())
+ kfree(request);
+}
+
+static void reg_free_last_request(void)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (lr != &core_request_world && lr)
+ kfree_rcu(lr, rcu_head);
+}
+
+static void reg_update_last_request(struct regulatory_request *request)
+{
+ struct regulatory_request *lr;
+
+ lr = get_last_request();
+ if (lr == request)
+ return;
+
+ reg_free_last_request();
+ rcu_assign_pointer(last_request, request);
+}
+
+static void reset_regdomains(bool full_reset,
+ const struct ieee80211_regdomain *new_regdom)
+{
+ const struct ieee80211_regdomain *r;
+
+ ASSERT_RTNL();
+
+ r = get_cfg80211_regdom();
+
+ /* avoid freeing static information or freeing something twice */
+ if (r == cfg80211_world_regdom)
+ r = NULL;
+ if (cfg80211_world_regdom == &world_regdom)
+ cfg80211_world_regdom = NULL;
+ if (r == &world_regdom)
+ r = NULL;
+
+ rcu_free_regdom(r);
+ rcu_free_regdom(cfg80211_world_regdom);
+
+ cfg80211_world_regdom = &world_regdom;
+ rcu_assign_pointer(cfg80211_regdomain, new_regdom);
+
+ if (!full_reset)
+ return;
+
+ reg_update_last_request(&core_request_world);
+}
+
+/*
+ * Dynamic world regulatory domain requested by the wireless
+ * core upon initialization
+ */
+static void update_world_regdomain(const struct ieee80211_regdomain *rd)
+{
+ struct regulatory_request *lr;
+
+ lr = get_last_request();
+
+ WARN_ON(!lr);
+
+ reset_regdomains(false, rd);
+
+ cfg80211_world_regdom = rd;
+}
+
+bool is_world_regdom(const char *alpha2)
+{
+ if (!alpha2)
+ return false;
+ return alpha2[0] == '0' && alpha2[1] == '0';
+}
+
+static bool is_alpha2_set(const char *alpha2)
+{
+ if (!alpha2)
+ return false;
+ return alpha2[0] && alpha2[1];
+}
+
+static bool is_unknown_alpha2(const char *alpha2)
+{
+ if (!alpha2)
+ return false;
+ /*
+ * Special case where regulatory domain was built by driver
+ * but a specific alpha2 cannot be determined
+ */
+ return alpha2[0] == '9' && alpha2[1] == '9';
+}
+
+static bool is_intersected_alpha2(const char *alpha2)
+{
+ if (!alpha2)
+ return false;
+ /*
+ * Special case where regulatory domain is the
+ * result of an intersection between two regulatory domain
+ * structures
+ */
+ return alpha2[0] == '9' && alpha2[1] == '8';
+}
+
+static bool is_an_alpha2(const char *alpha2)
+{
+ if (!alpha2)
+ return false;
+ return isalpha(alpha2[0]) && isalpha(alpha2[1]);
+}
+
+static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
+{
+ if (!alpha2_x || !alpha2_y)
+ return false;
+ return alpha2_x[0] == alpha2_y[0] && alpha2_x[1] == alpha2_y[1];
+}
+
+static bool regdom_changes(const char *alpha2)
+{
+ const struct ieee80211_regdomain *r = get_cfg80211_regdom();
+
+ if (!r)
+ return true;
+ return !alpha2_equal(r->alpha2, alpha2);
+}
+
+/*
+ * The NL80211_REGDOM_SET_BY_USER regdom alpha2 is cached, this lets
+ * you know if a valid regulatory hint with NL80211_REGDOM_SET_BY_USER
+ * has ever been issued.
+ */
+static bool is_user_regdom_saved(void)
+{
+ if (user_alpha2[0] == '9' && user_alpha2[1] == '7')
+ return false;
+
+ /* This would indicate a mistake on the design */
+ if (WARN(!is_world_regdom(user_alpha2) && !is_an_alpha2(user_alpha2),
+ "Unexpected user alpha2: %c%c\n",
+ user_alpha2[0], user_alpha2[1]))
+ return false;
+
+ return true;
+}
+
+static const struct ieee80211_regdomain *
+reg_copy_regd(const struct ieee80211_regdomain *src_regd)
+{
+ struct ieee80211_regdomain *regd;
+ int size_of_regd;
+ unsigned int i;
+
+ size_of_regd =
+ sizeof(struct ieee80211_regdomain) +
+ src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule);
+
+ regd = kzalloc(size_of_regd, GFP_KERNEL);
+ if (!regd)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain));
+
+ for (i = 0; i < src_regd->n_reg_rules; i++)
+ memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i],
+ sizeof(struct ieee80211_reg_rule));
+
+ return regd;
+}
+
+#ifdef CONFIG_CFG80211_INTERNAL_REGDB
+struct reg_regdb_search_request {
+ char alpha2[2];
+ struct list_head list;
+};
+
+static LIST_HEAD(reg_regdb_search_list);
+static DEFINE_MUTEX(reg_regdb_search_mutex);
+
+static void reg_regdb_search(struct work_struct *work)
+{
+ struct reg_regdb_search_request *request;
+ const struct ieee80211_regdomain *curdom, *regdom = NULL;
+ int i;
+
+ rtnl_lock();
+
+ mutex_lock(&reg_regdb_search_mutex);
+ while (!list_empty(&reg_regdb_search_list)) {
+ request = list_first_entry(&reg_regdb_search_list,
+ struct reg_regdb_search_request,
+ list);
+ list_del(&request->list);
+
+ for (i = 0; i < reg_regdb_size; i++) {
+ curdom = reg_regdb[i];
+
+ if (alpha2_equal(request->alpha2, curdom->alpha2)) {
+ regdom = reg_copy_regd(curdom);
+ break;
+ }
+ }
+
+ kfree(request);
+ }
+ mutex_unlock(&reg_regdb_search_mutex);
+
+ if (!IS_ERR_OR_NULL(regdom))
+ set_regdom(regdom, REGD_SOURCE_INTERNAL_DB);
+
+ rtnl_unlock();
+}
+
+static DECLARE_WORK(reg_regdb_work, reg_regdb_search);
+
+static void reg_regdb_query(const char *alpha2)
+{
+ struct reg_regdb_search_request *request;
+
+ if (!alpha2)
+ return;
+
+ request = kzalloc(sizeof(struct reg_regdb_search_request), GFP_KERNEL);
+ if (!request)
+ return;
+
+ memcpy(request->alpha2, alpha2, 2);
+
+ mutex_lock(&reg_regdb_search_mutex);
+ list_add_tail(&request->list, &reg_regdb_search_list);
+ mutex_unlock(&reg_regdb_search_mutex);
+
+ schedule_work(&reg_regdb_work);
+}
+
+/* Feel free to add any other sanity checks here */
+static void reg_regdb_size_check(void)
+{
+ /* We should ideally BUILD_BUG_ON() but then random builds would fail */
+ WARN_ONCE(!reg_regdb_size, "db.txt is empty, you should update it...");
+}
+#else
+static inline void reg_regdb_size_check(void) {}
+static inline void reg_regdb_query(const char *alpha2) {}
+#endif /* CONFIG_CFG80211_INTERNAL_REGDB */
+
+/*
+ * This lets us keep regulatory code which is updated on a regulatory
+ * basis in userspace.
+ */
+static int call_crda(const char *alpha2)
+{
+ char country[12];
+ char *env[] = { country, NULL };
+
+ snprintf(country, sizeof(country), "COUNTRY=%c%c",
+ alpha2[0], alpha2[1]);
+
+ /* query internal regulatory database (if it exists) */
+ reg_regdb_query(alpha2);
+
+ if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) {
+ pr_info("Exceeded CRDA call max attempts. Not calling CRDA\n");
+ return -EINVAL;
+ }
+
+ if (!is_world_regdom((char *) alpha2))
+ pr_info("Calling CRDA for country: %c%c\n",
+ alpha2[0], alpha2[1]);
+ else
+ pr_info("Calling CRDA to update world regulatory domain\n");
+
+ return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env);
+}
+
+static enum reg_request_treatment
+reg_call_crda(struct regulatory_request *request)
+{
+ if (call_crda(request->alpha2))
+ return REG_REQ_IGNORE;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &reg_timeout, msecs_to_jiffies(3142));
+ return REG_REQ_OK;
+}
+
+bool reg_is_valid_request(const char *alpha2)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (!lr || lr->processed)
+ return false;
+
+ return alpha2_equal(lr->alpha2, alpha2);
+}
+
+static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ /*
+ * Follow the driver's regulatory domain, if present, unless a country
+ * IE has been processed or a user wants to help complaince further
+ */
+ if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ lr->initiator != NL80211_REGDOM_SET_BY_USER &&
+ wiphy->regd)
+ return get_wiphy_regdom(wiphy);
+
+ return get_cfg80211_regdom();
+}
+
+static unsigned int
+reg_get_max_bandwidth_from_range(const struct ieee80211_regdomain *rd,
+ const struct ieee80211_reg_rule *rule)
+{
+ const struct ieee80211_freq_range *freq_range = &rule->freq_range;
+ const struct ieee80211_freq_range *freq_range_tmp;
+ const struct ieee80211_reg_rule *tmp;
+ u32 start_freq, end_freq, idx, no;
+
+ for (idx = 0; idx < rd->n_reg_rules; idx++)
+ if (rule == &rd->reg_rules[idx])
+ break;
+
+ if (idx == rd->n_reg_rules)
+ return 0;
+
+ /* get start_freq */
+ no = idx;
+
+ while (no) {
+ tmp = &rd->reg_rules[--no];
+ freq_range_tmp = &tmp->freq_range;
+
+ if (freq_range_tmp->end_freq_khz < freq_range->start_freq_khz)
+ break;
+
+ freq_range = freq_range_tmp;
+ }
+
+ start_freq = freq_range->start_freq_khz;
+
+ /* get end_freq */
+ freq_range = &rule->freq_range;
+ no = idx;
+
+ while (no < rd->n_reg_rules - 1) {
+ tmp = &rd->reg_rules[++no];
+ freq_range_tmp = &tmp->freq_range;
+
+ if (freq_range_tmp->start_freq_khz > freq_range->end_freq_khz)
+ break;
+
+ freq_range = freq_range_tmp;
+ }
+
+ end_freq = freq_range->end_freq_khz;
+
+ return end_freq - start_freq;
+}
+
+unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd,
+ const struct ieee80211_reg_rule *rule)
+{
+ unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule);
+
+ if (rule->flags & NL80211_RRF_NO_160MHZ)
+ bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80));
+ if (rule->flags & NL80211_RRF_NO_80MHZ)
+ bw = min_t(unsigned int, bw, MHZ_TO_KHZ(40));
+
+ /*
+ * HT40+/HT40- limits are handled per-channel. Only limit BW if both
+ * are not allowed.
+ */
+ if (rule->flags & NL80211_RRF_NO_HT40MINUS &&
+ rule->flags & NL80211_RRF_NO_HT40PLUS)
+ bw = min_t(unsigned int, bw, MHZ_TO_KHZ(20));
+
+ return bw;
+}
+
+/* Sanity check on a regulatory rule */
+static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
+{
+ const struct ieee80211_freq_range *freq_range = &rule->freq_range;
+ u32 freq_diff;
+
+ if (freq_range->start_freq_khz <= 0 || freq_range->end_freq_khz <= 0)
+ return false;
+
+ if (freq_range->start_freq_khz > freq_range->end_freq_khz)
+ return false;
+
+ freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
+
+ if (freq_range->end_freq_khz <= freq_range->start_freq_khz ||
+ freq_range->max_bandwidth_khz > freq_diff)
+ return false;
+
+ return true;
+}
+
+static bool is_valid_rd(const struct ieee80211_regdomain *rd)
+{
+ const struct ieee80211_reg_rule *reg_rule = NULL;
+ unsigned int i;
+
+ if (!rd->n_reg_rules)
+ return false;
+
+ if (WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES))
+ return false;
+
+ for (i = 0; i < rd->n_reg_rules; i++) {
+ reg_rule = &rd->reg_rules[i];
+ if (!is_valid_reg_rule(reg_rule))
+ return false;
+ }
+
+ return true;
+}
+
+static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
+ u32 center_freq_khz, u32 bw_khz)
+{
+ u32 start_freq_khz, end_freq_khz;
+
+ start_freq_khz = center_freq_khz - (bw_khz/2);
+ end_freq_khz = center_freq_khz + (bw_khz/2);
+
+ if (start_freq_khz >= freq_range->start_freq_khz &&
+ end_freq_khz <= freq_range->end_freq_khz)
+ return true;
+
+ return false;
+}
+
+/**
+ * freq_in_rule_band - tells us if a frequency is in a frequency band
+ * @freq_range: frequency rule we want to query
+ * @freq_khz: frequency we are inquiring about
+ *
+ * This lets us know if a specific frequency rule is or is not relevant to
+ * a specific frequency's band. Bands are device specific and artificial
+ * definitions (the "2.4 GHz band", the "5 GHz band" and the "60GHz band"),
+ * however it is safe for now to assume that a frequency rule should not be
+ * part of a frequency's band if the start freq or end freq are off by more
+ * than 2 GHz for the 2.4 and 5 GHz bands, and by more than 10 GHz for the
+ * 60 GHz band.
+ * This resolution can be lowered and should be considered as we add
+ * regulatory rule support for other "bands".
+ **/
+static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
+ u32 freq_khz)
+{
+#define ONE_GHZ_IN_KHZ 1000000
+ /*
+ * From 802.11ad: directional multi-gigabit (DMG):
+ * Pertaining to operation in a frequency band containing a channel
+ * with the Channel starting frequency above 45 GHz.
+ */
+ u32 limit = freq_khz > 45 * ONE_GHZ_IN_KHZ ?
+ 10 * ONE_GHZ_IN_KHZ : 2 * ONE_GHZ_IN_KHZ;
+ if (abs(freq_khz - freq_range->start_freq_khz) <= limit)
+ return true;
+ if (abs(freq_khz - freq_range->end_freq_khz) <= limit)
+ return true;
+ return false;
+#undef ONE_GHZ_IN_KHZ
+}
+
+/*
+ * Later on we can perhaps use the more restrictive DFS
+ * region but we don't have information for that yet so
+ * for now simply disallow conflicts.
+ */
+static enum nl80211_dfs_regions
+reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1,
+ const enum nl80211_dfs_regions dfs_region2)
+{
+ if (dfs_region1 != dfs_region2)
+ return NL80211_DFS_UNSET;
+ return dfs_region1;
+}
+
+/*
+ * Helper for regdom_intersect(), this does the real
+ * mathematical intersection fun
+ */
+static int reg_rules_intersect(const struct ieee80211_regdomain *rd1,
+ const struct ieee80211_regdomain *rd2,
+ const struct ieee80211_reg_rule *rule1,
+ const struct ieee80211_reg_rule *rule2,
+ struct ieee80211_reg_rule *intersected_rule)
+{
+ const struct ieee80211_freq_range *freq_range1, *freq_range2;
+ struct ieee80211_freq_range *freq_range;
+ const struct ieee80211_power_rule *power_rule1, *power_rule2;
+ struct ieee80211_power_rule *power_rule;
+ u32 freq_diff, max_bandwidth1, max_bandwidth2;
+
+ freq_range1 = &rule1->freq_range;
+ freq_range2 = &rule2->freq_range;
+ freq_range = &intersected_rule->freq_range;
+
+ power_rule1 = &rule1->power_rule;
+ power_rule2 = &rule2->power_rule;
+ power_rule = &intersected_rule->power_rule;
+
+ freq_range->start_freq_khz = max(freq_range1->start_freq_khz,
+ freq_range2->start_freq_khz);
+ freq_range->end_freq_khz = min(freq_range1->end_freq_khz,
+ freq_range2->end_freq_khz);
+
+ max_bandwidth1 = freq_range1->max_bandwidth_khz;
+ max_bandwidth2 = freq_range2->max_bandwidth_khz;
+
+ if (rule1->flags & NL80211_RRF_AUTO_BW)
+ max_bandwidth1 = reg_get_max_bandwidth(rd1, rule1);
+ if (rule2->flags & NL80211_RRF_AUTO_BW)
+ max_bandwidth2 = reg_get_max_bandwidth(rd2, rule2);
+
+ freq_range->max_bandwidth_khz = min(max_bandwidth1, max_bandwidth2);
+
+ intersected_rule->flags = rule1->flags | rule2->flags;
+
+ /*
+ * In case NL80211_RRF_AUTO_BW requested for both rules
+ * set AUTO_BW in intersected rule also. Next we will
+ * calculate BW correctly in handle_channel function.
+ * In other case remove AUTO_BW flag while we calculate
+ * maximum bandwidth correctly and auto calculation is
+ * not required.
+ */
+ if ((rule1->flags & NL80211_RRF_AUTO_BW) &&
+ (rule2->flags & NL80211_RRF_AUTO_BW))
+ intersected_rule->flags |= NL80211_RRF_AUTO_BW;
+ else
+ intersected_rule->flags &= ~NL80211_RRF_AUTO_BW;
+
+ freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
+ if (freq_range->max_bandwidth_khz > freq_diff)
+ freq_range->max_bandwidth_khz = freq_diff;
+
+ power_rule->max_eirp = min(power_rule1->max_eirp,
+ power_rule2->max_eirp);
+ power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain,
+ power_rule2->max_antenna_gain);
+
+ intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms,
+ rule2->dfs_cac_ms);
+
+ if (!is_valid_reg_rule(intersected_rule))
+ return -EINVAL;
+
+ return 0;
+}
+
+/* check whether old rule contains new rule */
+static bool rule_contains(struct ieee80211_reg_rule *r1,
+ struct ieee80211_reg_rule *r2)
+{
+ /* for simplicity, currently consider only same flags */
+ if (r1->flags != r2->flags)
+ return false;
+
+ /* verify r1 is more restrictive */
+ if ((r1->power_rule.max_antenna_gain >
+ r2->power_rule.max_antenna_gain) ||
+ r1->power_rule.max_eirp > r2->power_rule.max_eirp)
+ return false;
+
+ /* make sure r2's range is contained within r1 */
+ if (r1->freq_range.start_freq_khz > r2->freq_range.start_freq_khz ||
+ r1->freq_range.end_freq_khz < r2->freq_range.end_freq_khz)
+ return false;
+
+ /* and finally verify that r1.max_bw >= r2.max_bw */
+ if (r1->freq_range.max_bandwidth_khz <
+ r2->freq_range.max_bandwidth_khz)
+ return false;
+
+ return true;
+}
+
+/* add or extend current rules. do nothing if rule is already contained */
+static void add_rule(struct ieee80211_reg_rule *rule,
+ struct ieee80211_reg_rule *reg_rules, u32 *n_rules)
+{
+ struct ieee80211_reg_rule *tmp_rule;
+ int i;
+
+ for (i = 0; i < *n_rules; i++) {
+ tmp_rule = &reg_rules[i];
+ /* rule is already contained - do nothing */
+ if (rule_contains(tmp_rule, rule))
+ return;
+
+ /* extend rule if possible */
+ if (rule_contains(rule, tmp_rule)) {
+ memcpy(tmp_rule, rule, sizeof(*rule));
+ return;
+ }
+ }
+
+ memcpy(&reg_rules[*n_rules], rule, sizeof(*rule));
+ (*n_rules)++;
+}
+
+/**
+ * regdom_intersect - do the intersection between two regulatory domains
+ * @rd1: first regulatory domain
+ * @rd2: second regulatory domain
+ *
+ * Use this function to get the intersection between two regulatory domains.
+ * Once completed we will mark the alpha2 for the rd as intersected, "98",
+ * as no one single alpha2 can represent this regulatory domain.
+ *
+ * Returns a pointer to the regulatory domain structure which will hold the
+ * resulting intersection of rules between rd1 and rd2. We will
+ * kzalloc() this structure for you.
+ */
+static struct ieee80211_regdomain *
+regdom_intersect(const struct ieee80211_regdomain *rd1,
+ const struct ieee80211_regdomain *rd2)
+{
+ int r, size_of_regd;
+ unsigned int x, y;
+ unsigned int num_rules = 0;
+ const struct ieee80211_reg_rule *rule1, *rule2;
+ struct ieee80211_reg_rule intersected_rule;
+ struct ieee80211_regdomain *rd;
+
+ if (!rd1 || !rd2)
+ return NULL;
+
+ /*
+ * First we get a count of the rules we'll need, then we actually
+ * build them. This is to so we can malloc() and free() a
+ * regdomain once. The reason we use reg_rules_intersect() here
+ * is it will return -EINVAL if the rule computed makes no sense.
+ * All rules that do check out OK are valid.
+ */
+
+ for (x = 0; x < rd1->n_reg_rules; x++) {
+ rule1 = &rd1->reg_rules[x];
+ for (y = 0; y < rd2->n_reg_rules; y++) {
+ rule2 = &rd2->reg_rules[y];
+ if (!reg_rules_intersect(rd1, rd2, rule1, rule2,
+ &intersected_rule))
+ num_rules++;
+ }
+ }
+
+ if (!num_rules)
+ return NULL;
+
+ size_of_regd = sizeof(struct ieee80211_regdomain) +
+ num_rules * sizeof(struct ieee80211_reg_rule);
+
+ rd = kzalloc(size_of_regd, GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ for (x = 0; x < rd1->n_reg_rules; x++) {
+ rule1 = &rd1->reg_rules[x];
+ for (y = 0; y < rd2->n_reg_rules; y++) {
+ rule2 = &rd2->reg_rules[y];
+ r = reg_rules_intersect(rd1, rd2, rule1, rule2,
+ &intersected_rule);
+ /*
+ * No need to memset here the intersected rule here as
+ * we're not using the stack anymore
+ */
+ if (r)
+ continue;
+
+ add_rule(&intersected_rule, rd->reg_rules,
+ &rd->n_reg_rules);
+ }
+ }
+
+ rd->alpha2[0] = '9';
+ rd->alpha2[1] = '8';
+ rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region,
+ rd2->dfs_region);
+
+ return rd;
+}
+
+/*
+ * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
+ * want to just have the channel structure use these
+ */
+static u32 map_regdom_flags(u32 rd_flags)
+{
+ u32 channel_flags = 0;
+ if (rd_flags & NL80211_RRF_NO_IR_ALL)
+ channel_flags |= IEEE80211_CHAN_NO_IR;
+ if (rd_flags & NL80211_RRF_DFS)
+ channel_flags |= IEEE80211_CHAN_RADAR;
+ if (rd_flags & NL80211_RRF_NO_OFDM)
+ channel_flags |= IEEE80211_CHAN_NO_OFDM;
+ if (rd_flags & NL80211_RRF_NO_OUTDOOR)
+ channel_flags |= IEEE80211_CHAN_INDOOR_ONLY;
+ if (rd_flags & NL80211_RRF_GO_CONCURRENT)
+ channel_flags |= IEEE80211_CHAN_GO_CONCURRENT;
+ if (rd_flags & NL80211_RRF_NO_HT40MINUS)
+ channel_flags |= IEEE80211_CHAN_NO_HT40MINUS;
+ if (rd_flags & NL80211_RRF_NO_HT40PLUS)
+ channel_flags |= IEEE80211_CHAN_NO_HT40PLUS;
+ if (rd_flags & NL80211_RRF_NO_80MHZ)
+ channel_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (rd_flags & NL80211_RRF_NO_160MHZ)
+ channel_flags |= IEEE80211_CHAN_NO_160MHZ;
+ return channel_flags;
+}
+
+static const struct ieee80211_reg_rule *
+freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
+ const struct ieee80211_regdomain *regd)
+{
+ int i;
+ bool band_rule_found = false;
+ bool bw_fits = false;
+
+ if (!regd)
+ return ERR_PTR(-EINVAL);
+
+ for (i = 0; i < regd->n_reg_rules; i++) {
+ const struct ieee80211_reg_rule *rr;
+ const struct ieee80211_freq_range *fr = NULL;
+
+ rr = &regd->reg_rules[i];
+ fr = &rr->freq_range;
+
+ /*
+ * We only need to know if one frequency rule was
+ * was in center_freq's band, that's enough, so lets
+ * not overwrite it once found
+ */
+ if (!band_rule_found)
+ band_rule_found = freq_in_rule_band(fr, center_freq);
+
+ bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20));
+
+ if (band_rule_found && bw_fits)
+ return rr;
+ }
+
+ if (!band_rule_found)
+ return ERR_PTR(-ERANGE);
+
+ return ERR_PTR(-EINVAL);
+}
+
+const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
+ u32 center_freq)
+{
+ const struct ieee80211_regdomain *regd;
+
+ regd = reg_get_regdomain(wiphy);
+
+ return freq_reg_info_regd(wiphy, center_freq, regd);
+}
+EXPORT_SYMBOL(freq_reg_info);
+
+const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
+{
+ switch (initiator) {
+ case NL80211_REGDOM_SET_BY_CORE:
+ return "core";
+ case NL80211_REGDOM_SET_BY_USER:
+ return "user";
+ case NL80211_REGDOM_SET_BY_DRIVER:
+ return "driver";
+ case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+ return "country IE";
+ default:
+ WARN_ON(1);
+ return "bug";
+ }
+}
+EXPORT_SYMBOL(reg_initiator_name);
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
+ struct ieee80211_channel *chan,
+ const struct ieee80211_reg_rule *reg_rule)
+{
+ const struct ieee80211_power_rule *power_rule;
+ const struct ieee80211_freq_range *freq_range;
+ char max_antenna_gain[32], bw[32];
+
+ power_rule = &reg_rule->power_rule;
+ freq_range = &reg_rule->freq_range;
+
+ if (!power_rule->max_antenna_gain)
+ snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A");
+ else
+ snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d",
+ power_rule->max_antenna_gain);
+
+ if (reg_rule->flags & NL80211_RRF_AUTO_BW)
+ snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO",
+ freq_range->max_bandwidth_khz,
+ reg_get_max_bandwidth(regd, reg_rule));
+ else
+ snprintf(bw, sizeof(bw), "%d KHz",
+ freq_range->max_bandwidth_khz);
+
+ REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n",
+ chan->center_freq);
+
+ REG_DBG_PRINT("%d KHz - %d KHz @ %s), (%s mBi, %d mBm)\n",
+ freq_range->start_freq_khz, freq_range->end_freq_khz,
+ bw, max_antenna_gain,
+ power_rule->max_eirp);
+}
+#else
+static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
+ struct ieee80211_channel *chan,
+ const struct ieee80211_reg_rule *reg_rule)
+{
+ return;
+}
+#endif
+
+/*
+ * Note that right now we assume the desired channel bandwidth
+ * is always 20 MHz for each individual channel (HT40 uses 20 MHz
+ * per channel, the primary and the extension channel).
+ */
+static void handle_channel(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_channel *chan)
+{
+ u32 flags, bw_flags = 0;
+ const struct ieee80211_reg_rule *reg_rule = NULL;
+ const struct ieee80211_power_rule *power_rule = NULL;
+ const struct ieee80211_freq_range *freq_range = NULL;
+ struct wiphy *request_wiphy = NULL;
+ struct regulatory_request *lr = get_last_request();
+ const struct ieee80211_regdomain *regd;
+ u32 max_bandwidth_khz;
+
+ request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
+
+ flags = chan->orig_flags;
+
+ reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq));
+ if (IS_ERR(reg_rule)) {
+ /*
+ * We will disable all channels that do not match our
+ * received regulatory rule unless the hint is coming
+ * from a Country IE and the Country IE had no information
+ * about a band. The IEEE 802.11 spec allows for an AP
+ * to send only a subset of the regulatory rules allowed,
+ * so an AP in the US that only supports 2.4 GHz may only send
+ * a country IE with information for the 2.4 GHz band
+ * while 5 GHz is still supported.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ PTR_ERR(reg_rule) == -ERANGE)
+ return;
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ REG_DBG_PRINT("Disabling freq %d MHz for good\n",
+ chan->center_freq);
+ chan->orig_flags |= IEEE80211_CHAN_DISABLED;
+ chan->flags = chan->orig_flags;
+ } else {
+ REG_DBG_PRINT("Disabling freq %d MHz\n",
+ chan->center_freq);
+ chan->flags |= IEEE80211_CHAN_DISABLED;
+ }
+ return;
+ }
+
+ regd = reg_get_regdomain(wiphy);
+ chan_reg_rule_print_dbg(regd, chan, reg_rule);
+
+ power_rule = &reg_rule->power_rule;
+ freq_range = &reg_rule->freq_range;
+
+ max_bandwidth_khz = freq_range->max_bandwidth_khz;
+ /* Check if auto calculation requested */
+ if (reg_rule->flags & NL80211_RRF_AUTO_BW)
+ max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
+
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags = IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ /*
+ * This guarantees the driver's requested regulatory domain
+ * will always be used as a base for further regulatory
+ * settings
+ */
+ chan->flags = chan->orig_flags =
+ map_regdom_flags(reg_rule->flags) | bw_flags;
+ chan->max_antenna_gain = chan->orig_mag =
+ (int) MBI_TO_DBI(power_rule->max_antenna_gain);
+ chan->max_reg_power = chan->max_power = chan->orig_mpwr =
+ (int) MBM_TO_DBM(power_rule->max_eirp);
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ if (reg_rule->dfs_cac_ms)
+ chan->dfs_cac_ms = reg_rule->dfs_cac_ms;
+ }
+
+ return;
+ }
+
+ chan->dfs_state = NL80211_DFS_USABLE;
+ chan->dfs_state_entered = jiffies;
+
+ chan->beacon_found = false;
+ chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags);
+ chan->max_antenna_gain =
+ min_t(int, chan->orig_mag,
+ MBI_TO_DBI(power_rule->max_antenna_gain));
+ chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp);
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ if (reg_rule->dfs_cac_ms)
+ chan->dfs_cac_ms = reg_rule->dfs_cac_ms;
+ else
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ }
+
+ if (chan->orig_mpwr) {
+ /*
+ * Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
+ * will always follow the passed country IE power settings.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER)
+ chan->max_power = chan->max_reg_power;
+ else
+ chan->max_power = min(chan->orig_mpwr,
+ chan->max_reg_power);
+ } else
+ chan->max_power = chan->max_reg_power;
+}
+
+static void handle_band(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator,
+ struct ieee80211_supported_band *sband)
+{
+ unsigned int i;
+
+ if (!sband)
+ return;
+
+ for (i = 0; i < sband->n_channels; i++)
+ handle_channel(wiphy, initiator, &sband->channels[i]);
+}
+
+static bool reg_request_cell_base(struct regulatory_request *request)
+{
+ if (request->initiator != NL80211_REGDOM_SET_BY_USER)
+ return false;
+ return request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE;
+}
+
+bool reg_last_request_cell_base(void)
+{
+ return reg_request_cell_base(get_last_request());
+}
+
+#ifdef CONFIG_CFG80211_REG_CELLULAR_HINTS
+/* Core specific check */
+static enum reg_request_treatment
+reg_ignore_cell_hint(struct regulatory_request *pending_request)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (!reg_num_devs_support_basehint)
+ return REG_REQ_IGNORE;
+
+ if (reg_request_cell_base(lr) &&
+ !regdom_changes(pending_request->alpha2))
+ return REG_REQ_ALREADY_SET;
+
+ return REG_REQ_OK;
+}
+
+/* Device specific check */
+static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy)
+{
+ return !(wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS);
+}
+#else
+static int reg_ignore_cell_hint(struct regulatory_request *pending_request)
+{
+ return REG_REQ_IGNORE;
+}
+
+static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy)
+{
+ return true;
+}
+#endif
+
+static bool wiphy_strict_alpha2_regd(struct wiphy *wiphy)
+{
+ if (wiphy->regulatory_flags & REGULATORY_STRICT_REG &&
+ !(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG))
+ return true;
+ return false;
+}
+
+static bool ignore_reg_update(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ return true;
+
+ if (!lr) {
+ REG_DBG_PRINT("Ignoring regulatory request set by %s "
+ "since last_request is not set\n",
+ reg_initiator_name(initiator));
+ return true;
+ }
+
+ if (initiator == NL80211_REGDOM_SET_BY_CORE &&
+ wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) {
+ REG_DBG_PRINT("Ignoring regulatory request set by %s "
+ "since the driver uses its own custom "
+ "regulatory domain\n",
+ reg_initiator_name(initiator));
+ return true;
+ }
+
+ /*
+ * wiphy->regd will be set once the device has its own
+ * desired regulatory domain set
+ */
+ if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd &&
+ initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ !is_world_regdom(lr->alpha2)) {
+ REG_DBG_PRINT("Ignoring regulatory request set by %s "
+ "since the driver requires its own regulatory "
+ "domain to be set first\n",
+ reg_initiator_name(initiator));
+ return true;
+ }
+
+ if (reg_request_cell_base(lr))
+ return reg_dev_ignore_cell_hint(wiphy);
+
+ return false;
+}
+
+static bool reg_is_world_roaming(struct wiphy *wiphy)
+{
+ const struct ieee80211_regdomain *cr = get_cfg80211_regdom();
+ const struct ieee80211_regdomain *wr = get_wiphy_regdom(wiphy);
+ struct regulatory_request *lr = get_last_request();
+
+ if (is_world_regdom(cr->alpha2) || (wr && is_world_regdom(wr->alpha2)))
+ return true;
+
+ if (lr && lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)
+ return true;
+
+ return false;
+}
+
+static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx,
+ struct reg_beacon *reg_beacon)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+ bool channel_changed = false;
+ struct ieee80211_channel chan_before;
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+ chan = &sband->channels[chan_idx];
+
+ if (likely(chan->center_freq != reg_beacon->chan.center_freq))
+ return;
+
+ if (chan->beacon_found)
+ return;
+
+ chan->beacon_found = true;
+
+ if (!reg_is_world_roaming(wiphy))
+ return;
+
+ if (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS)
+ return;
+
+ chan_before.center_freq = chan->center_freq;
+ chan_before.flags = chan->flags;
+
+ if (chan->flags & IEEE80211_CHAN_NO_IR) {
+ chan->flags &= ~IEEE80211_CHAN_NO_IR;
+ channel_changed = true;
+ }
+
+ if (channel_changed)
+ nl80211_send_beacon_hint_event(wiphy, &chan_before, chan);
+}
+
+/*
+ * Called when a scan on a wiphy finds a beacon on
+ * new channel
+ */
+static void wiphy_update_new_beacon(struct wiphy *wiphy,
+ struct reg_beacon *reg_beacon)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+
+ if (!wiphy->bands[reg_beacon->chan.band])
+ return;
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+}
+
+/*
+ * Called upon reg changes or a new wiphy is added
+ */
+static void wiphy_update_beacon_reg(struct wiphy *wiphy)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+ struct reg_beacon *reg_beacon;
+
+ list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
+ if (!wiphy->bands[reg_beacon->chan.band])
+ continue;
+ sband = wiphy->bands[reg_beacon->chan.band];
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+ }
+}
+
+/* Reap the advantages of previously found beacons */
+static void reg_process_beacons(struct wiphy *wiphy)
+{
+ /*
+ * Means we are just firing up cfg80211, so no beacons would
+ * have been processed yet.
+ */
+ if (!last_request)
+ return;
+ wiphy_update_beacon_reg(wiphy);
+}
+
+static bool is_ht40_allowed(struct ieee80211_channel *chan)
+{
+ if (!chan)
+ return false;
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ return false;
+ /* This would happen when regulatory rules disallow HT40 completely */
+ if ((chan->flags & IEEE80211_CHAN_NO_HT40) == IEEE80211_CHAN_NO_HT40)
+ return false;
+ return true;
+}
+
+static void reg_process_ht_flags_channel(struct wiphy *wiphy,
+ struct ieee80211_channel *channel)
+{
+ struct ieee80211_supported_band *sband = wiphy->bands[channel->band];
+ struct ieee80211_channel *channel_before = NULL, *channel_after = NULL;
+ unsigned int i;
+
+ if (!is_ht40_allowed(channel)) {
+ channel->flags |= IEEE80211_CHAN_NO_HT40;
+ return;
+ }
+
+ /*
+ * We need to ensure the extension channels exist to
+ * be able to use HT40- or HT40+, this finds them (or not)
+ */
+ for (i = 0; i < sband->n_channels; i++) {
+ struct ieee80211_channel *c = &sband->channels[i];
+
+ if (c->center_freq == (channel->center_freq - 20))
+ channel_before = c;
+ if (c->center_freq == (channel->center_freq + 20))
+ channel_after = c;
+ }
+
+ /*
+ * Please note that this assumes target bandwidth is 20 MHz,
+ * if that ever changes we also need to change the below logic
+ * to include that as well.
+ */
+ if (!is_ht40_allowed(channel_before))
+ channel->flags |= IEEE80211_CHAN_NO_HT40MINUS;
+ else
+ channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS;
+
+ if (!is_ht40_allowed(channel_after))
+ channel->flags |= IEEE80211_CHAN_NO_HT40PLUS;
+ else
+ channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS;
+}
+
+static void reg_process_ht_flags_band(struct wiphy *wiphy,
+ struct ieee80211_supported_band *sband)
+{
+ unsigned int i;
+
+ if (!sband)
+ return;
+
+ for (i = 0; i < sband->n_channels; i++)
+ reg_process_ht_flags_channel(wiphy, &sband->channels[i]);
+}
+
+static void reg_process_ht_flags(struct wiphy *wiphy)
+{
+ enum ieee80211_band band;
+
+ if (!wiphy)
+ return;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ reg_process_ht_flags_band(wiphy, wiphy->bands[band]);
+}
+
+static void reg_call_notifier(struct wiphy *wiphy,
+ struct regulatory_request *request)
+{
+ if (wiphy->reg_notifier)
+ wiphy->reg_notifier(wiphy, request);
+}
+
+static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
+{
+ struct cfg80211_chan_def chandef;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ enum nl80211_iftype iftype;
+
+ wdev_lock(wdev);
+ iftype = wdev->iftype;
+
+ /* make sure the interface is active */
+ if (!wdev->netdev || !netif_running(wdev->netdev))
+ goto wdev_inactive_unlock;
+
+ switch (iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ if (!wdev->beacon_interval)
+ goto wdev_inactive_unlock;
+ chandef = wdev->chandef;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ if (!wdev->ssid_len)
+ goto wdev_inactive_unlock;
+ chandef = wdev->chandef;
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (!wdev->current_bss ||
+ !wdev->current_bss->pub.channel)
+ goto wdev_inactive_unlock;
+
+ if (!rdev->ops->get_channel ||
+ rdev_get_channel(rdev, wdev, &chandef))
+ cfg80211_chandef_create(&chandef,
+ wdev->current_bss->pub.channel,
+ NL80211_CHAN_NO_HT);
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_DEVICE:
+ /* no enforcement required */
+ break;
+ default:
+ /* others not implemented for now */
+ WARN_ON(1);
+ break;
+ }
+
+ wdev_unlock(wdev);
+
+ switch (iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_reg_can_beacon(wiphy, &chandef, iftype);
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ return cfg80211_chandef_usable(wiphy, &chandef,
+ IEEE80211_CHAN_DISABLED);
+ default:
+ break;
+ }
+
+ return true;
+
+wdev_inactive_unlock:
+ wdev_unlock(wdev);
+ return true;
+}
+
+static void reg_leave_invalid_chans(struct wiphy *wiphy)
+{
+ struct wireless_dev *wdev;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list)
+ if (!reg_wdev_chan_valid(wiphy, wdev))
+ cfg80211_leave(rdev, wdev);
+}
+
+static void reg_check_chans_work(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev;
+
+ REG_DBG_PRINT("Verifying active interfaces after reg change\n");
+ rtnl_lock();
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list)
+ if (!(rdev->wiphy.regulatory_flags &
+ REGULATORY_IGNORE_STALE_KICKOFF))
+ reg_leave_invalid_chans(&rdev->wiphy);
+
+ rtnl_unlock();
+}
+
+static void reg_check_channels(void)
+{
+ /*
+ * Give usermode a chance to do something nicer (move to another
+ * channel, orderly disconnection), before forcing a disconnection.
+ */
+ mod_delayed_work(system_power_efficient_wq,
+ &reg_check_chans,
+ msecs_to_jiffies(REG_ENFORCE_GRACE_MS));
+}
+
+static void wiphy_update_regulatory(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
+{
+ enum ieee80211_band band;
+ struct regulatory_request *lr = get_last_request();
+
+ if (ignore_reg_update(wiphy, initiator)) {
+ /*
+ * Regulatory updates set by CORE are ignored for custom
+ * regulatory cards. Let us notify the changes to the driver,
+ * as some drivers used this to restore its orig_* reg domain.
+ */
+ if (initiator == NL80211_REGDOM_SET_BY_CORE &&
+ wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)
+ reg_call_notifier(wiphy, lr);
+ return;
+ }
+
+ lr->dfs_region = get_cfg80211_regdom()->dfs_region;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ handle_band(wiphy, initiator, wiphy->bands[band]);
+
+ reg_process_beacons(wiphy);
+ reg_process_ht_flags(wiphy);
+ reg_call_notifier(wiphy, lr);
+}
+
+static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wiphy *wiphy;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ wiphy = &rdev->wiphy;
+ wiphy_update_regulatory(wiphy, initiator);
+ }
+
+ reg_check_channels();
+}
+
+static void handle_channel_custom(struct wiphy *wiphy,
+ struct ieee80211_channel *chan,
+ const struct ieee80211_regdomain *regd)
+{
+ u32 bw_flags = 0;
+ const struct ieee80211_reg_rule *reg_rule = NULL;
+ const struct ieee80211_power_rule *power_rule = NULL;
+ const struct ieee80211_freq_range *freq_range = NULL;
+ u32 max_bandwidth_khz;
+
+ reg_rule = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq),
+ regd);
+
+ if (IS_ERR(reg_rule)) {
+ REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n",
+ chan->center_freq);
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
+ chan->flags |= IEEE80211_CHAN_DISABLED;
+ } else {
+ chan->orig_flags |= IEEE80211_CHAN_DISABLED;
+ chan->flags = chan->orig_flags;
+ }
+ return;
+ }
+
+ chan_reg_rule_print_dbg(regd, chan, reg_rule);
+
+ power_rule = &reg_rule->power_rule;
+ freq_range = &reg_rule->freq_range;
+
+ max_bandwidth_khz = freq_range->max_bandwidth_khz;
+ /* Check if auto calculation requested */
+ if (reg_rule->flags & NL80211_RRF_AUTO_BW)
+ max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
+
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags = IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+
+ chan->dfs_state_entered = jiffies;
+ chan->dfs_state = NL80211_DFS_USABLE;
+
+ chan->beacon_found = false;
+
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ chan->flags = chan->orig_flags | bw_flags |
+ map_regdom_flags(reg_rule->flags);
+ else
+ chan->flags |= map_regdom_flags(reg_rule->flags) | bw_flags;
+
+ chan->max_antenna_gain = (int) MBI_TO_DBI(power_rule->max_antenna_gain);
+ chan->max_reg_power = chan->max_power =
+ (int) MBM_TO_DBM(power_rule->max_eirp);
+
+ if (chan->flags & IEEE80211_CHAN_RADAR) {
+ if (reg_rule->dfs_cac_ms)
+ chan->dfs_cac_ms = reg_rule->dfs_cac_ms;
+ else
+ chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
+ }
+
+ chan->max_power = chan->max_reg_power;
+}
+
+static void handle_band_custom(struct wiphy *wiphy,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_regdomain *regd)
+{
+ unsigned int i;
+
+ if (!sband)
+ return;
+
+ for (i = 0; i < sband->n_channels; i++)
+ handle_channel_custom(wiphy, &sband->channels[i], regd);
+}
+
+/* Used by drivers prior to wiphy registration */
+void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
+ const struct ieee80211_regdomain *regd)
+{
+ enum ieee80211_band band;
+ unsigned int bands_set = 0;
+
+ WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG),
+ "wiphy should have REGULATORY_CUSTOM_REG\n");
+ wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ if (!wiphy->bands[band])
+ continue;
+ handle_band_custom(wiphy, wiphy->bands[band], regd);
+ bands_set++;
+ }
+
+ /*
+ * no point in calling this if it won't have any effect
+ * on your device's supported bands.
+ */
+ WARN_ON(!bands_set);
+}
+EXPORT_SYMBOL(wiphy_apply_custom_regulatory);
+
+static void reg_set_request_processed(void)
+{
+ bool need_more_processing = false;
+ struct regulatory_request *lr = get_last_request();
+
+ lr->processed = true;
+
+ spin_lock(&reg_requests_lock);
+ if (!list_empty(&reg_requests_list))
+ need_more_processing = true;
+ spin_unlock(&reg_requests_lock);
+
+ cancel_delayed_work(&reg_timeout);
+
+ if (need_more_processing)
+ schedule_work(&reg_work);
+}
+
+/**
+ * reg_process_hint_core - process core regulatory requests
+ * @pending_request: a pending core regulatory request
+ *
+ * The wireless subsystem can use this function to process
+ * a regulatory request issued by the regulatory core.
+ *
+ * Returns one of the different reg request treatment values.
+ */
+static enum reg_request_treatment
+reg_process_hint_core(struct regulatory_request *core_request)
+{
+
+ core_request->intersect = false;
+ core_request->processed = false;
+
+ reg_update_last_request(core_request);
+
+ return reg_call_crda(core_request);
+}
+
+static enum reg_request_treatment
+__reg_process_hint_user(struct regulatory_request *user_request)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (reg_request_cell_base(user_request))
+ return reg_ignore_cell_hint(user_request);
+
+ if (reg_request_cell_base(lr))
+ return REG_REQ_IGNORE;
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)
+ return REG_REQ_INTERSECT;
+ /*
+ * If the user knows better the user should set the regdom
+ * to their country before the IE is picked up
+ */
+ if (lr->initiator == NL80211_REGDOM_SET_BY_USER &&
+ lr->intersect)
+ return REG_REQ_IGNORE;
+ /*
+ * Process user requests only after previous user/driver/core
+ * requests have been processed
+ */
+ if ((lr->initiator == NL80211_REGDOM_SET_BY_CORE ||
+ lr->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
+ lr->initiator == NL80211_REGDOM_SET_BY_USER) &&
+ regdom_changes(lr->alpha2))
+ return REG_REQ_IGNORE;
+
+ if (!regdom_changes(user_request->alpha2))
+ return REG_REQ_ALREADY_SET;
+
+ return REG_REQ_OK;
+}
+
+/**
+ * reg_process_hint_user - process user regulatory requests
+ * @user_request: a pending user regulatory request
+ *
+ * The wireless subsystem can use this function to process
+ * a regulatory request initiated by userspace.
+ *
+ * Returns one of the different reg request treatment values.
+ */
+static enum reg_request_treatment
+reg_process_hint_user(struct regulatory_request *user_request)
+{
+ enum reg_request_treatment treatment;
+
+ treatment = __reg_process_hint_user(user_request);
+ if (treatment == REG_REQ_IGNORE ||
+ treatment == REG_REQ_ALREADY_SET) {
+ reg_free_request(user_request);
+ return treatment;
+ }
+
+ user_request->intersect = treatment == REG_REQ_INTERSECT;
+ user_request->processed = false;
+
+ reg_update_last_request(user_request);
+
+ user_alpha2[0] = user_request->alpha2[0];
+ user_alpha2[1] = user_request->alpha2[1];
+
+ return reg_call_crda(user_request);
+}
+
+static enum reg_request_treatment
+__reg_process_hint_driver(struct regulatory_request *driver_request)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (lr->initiator == NL80211_REGDOM_SET_BY_CORE) {
+ if (regdom_changes(driver_request->alpha2))
+ return REG_REQ_OK;
+ return REG_REQ_ALREADY_SET;
+ }
+
+ /*
+ * This would happen if you unplug and plug your card
+ * back in or if you add a new device for which the previously
+ * loaded card also agrees on the regulatory domain.
+ */
+ if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ !regdom_changes(driver_request->alpha2))
+ return REG_REQ_ALREADY_SET;
+
+ return REG_REQ_INTERSECT;
+}
+
+/**
+ * reg_process_hint_driver - process driver regulatory requests
+ * @driver_request: a pending driver regulatory request
+ *
+ * The wireless subsystem can use this function to process
+ * a regulatory request issued by an 802.11 driver.
+ *
+ * Returns one of the different reg request treatment values.
+ */
+static enum reg_request_treatment
+reg_process_hint_driver(struct wiphy *wiphy,
+ struct regulatory_request *driver_request)
+{
+ const struct ieee80211_regdomain *regd, *tmp;
+ enum reg_request_treatment treatment;
+
+ treatment = __reg_process_hint_driver(driver_request);
+
+ switch (treatment) {
+ case REG_REQ_OK:
+ break;
+ case REG_REQ_IGNORE:
+ reg_free_request(driver_request);
+ return treatment;
+ case REG_REQ_INTERSECT:
+ /* fall through */
+ case REG_REQ_ALREADY_SET:
+ regd = reg_copy_regd(get_cfg80211_regdom());
+ if (IS_ERR(regd)) {
+ reg_free_request(driver_request);
+ return REG_REQ_IGNORE;
+ }
+
+ tmp = get_wiphy_regdom(wiphy);
+ rcu_assign_pointer(wiphy->regd, regd);
+ rcu_free_regdom(tmp);
+ }
+
+
+ driver_request->intersect = treatment == REG_REQ_INTERSECT;
+ driver_request->processed = false;
+
+ reg_update_last_request(driver_request);
+
+ /*
+ * Since CRDA will not be called in this case as we already
+ * have applied the requested regulatory domain before we just
+ * inform userspace we have processed the request
+ */
+ if (treatment == REG_REQ_ALREADY_SET) {
+ nl80211_send_reg_change_event(driver_request);
+ reg_set_request_processed();
+ return treatment;
+ }
+
+ return reg_call_crda(driver_request);
+}
+
+static enum reg_request_treatment
+__reg_process_hint_country_ie(struct wiphy *wiphy,
+ struct regulatory_request *country_ie_request)
+{
+ struct wiphy *last_wiphy = NULL;
+ struct regulatory_request *lr = get_last_request();
+
+ if (reg_request_cell_base(lr)) {
+ /* Trust a Cell base station over the AP's country IE */
+ if (regdom_changes(country_ie_request->alpha2))
+ return REG_REQ_IGNORE;
+ return REG_REQ_ALREADY_SET;
+ } else {
+ if (wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_IGNORE)
+ return REG_REQ_IGNORE;
+ }
+
+ if (unlikely(!is_an_alpha2(country_ie_request->alpha2)))
+ return -EINVAL;
+
+ if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE)
+ return REG_REQ_OK;
+
+ last_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
+
+ if (last_wiphy != wiphy) {
+ /*
+ * Two cards with two APs claiming different
+ * Country IE alpha2s. We could
+ * intersect them, but that seems unlikely
+ * to be correct. Reject second one for now.
+ */
+ if (regdom_changes(country_ie_request->alpha2))
+ return REG_REQ_IGNORE;
+ return REG_REQ_ALREADY_SET;
+ }
+
+ if (regdom_changes(country_ie_request->alpha2))
+ return REG_REQ_OK;
+ return REG_REQ_ALREADY_SET;
+}
+
+/**
+ * reg_process_hint_country_ie - process regulatory requests from country IEs
+ * @country_ie_request: a regulatory request from a country IE
+ *
+ * The wireless subsystem can use this function to process
+ * a regulatory request issued by a country Information Element.
+ *
+ * Returns one of the different reg request treatment values.
+ */
+static enum reg_request_treatment
+reg_process_hint_country_ie(struct wiphy *wiphy,
+ struct regulatory_request *country_ie_request)
+{
+ enum reg_request_treatment treatment;
+
+ treatment = __reg_process_hint_country_ie(wiphy, country_ie_request);
+
+ switch (treatment) {
+ case REG_REQ_OK:
+ break;
+ case REG_REQ_IGNORE:
+ /* fall through */
+ case REG_REQ_ALREADY_SET:
+ reg_free_request(country_ie_request);
+ return treatment;
+ case REG_REQ_INTERSECT:
+ reg_free_request(country_ie_request);
+ /*
+ * This doesn't happen yet, not sure we
+ * ever want to support it for this case.
+ */
+ WARN_ONCE(1, "Unexpected intersection for country IEs");
+ return REG_REQ_IGNORE;
+ }
+
+ country_ie_request->intersect = false;
+ country_ie_request->processed = false;
+
+ reg_update_last_request(country_ie_request);
+
+ return reg_call_crda(country_ie_request);
+}
+
+/* This processes *all* regulatory hints */
+static void reg_process_hint(struct regulatory_request *reg_request)
+{
+ struct wiphy *wiphy = NULL;
+ enum reg_request_treatment treatment;
+
+ if (reg_request->wiphy_idx != WIPHY_IDX_INVALID)
+ wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
+
+ switch (reg_request->initiator) {
+ case NL80211_REGDOM_SET_BY_CORE:
+ reg_process_hint_core(reg_request);
+ return;
+ case NL80211_REGDOM_SET_BY_USER:
+ treatment = reg_process_hint_user(reg_request);
+ if (treatment == REG_REQ_IGNORE ||
+ treatment == REG_REQ_ALREADY_SET)
+ return;
+ return;
+ case NL80211_REGDOM_SET_BY_DRIVER:
+ if (!wiphy)
+ goto out_free;
+ treatment = reg_process_hint_driver(wiphy, reg_request);
+ break;
+ case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+ if (!wiphy)
+ goto out_free;
+ treatment = reg_process_hint_country_ie(wiphy, reg_request);
+ break;
+ default:
+ WARN(1, "invalid initiator %d\n", reg_request->initiator);
+ goto out_free;
+ }
+
+ /* This is required so that the orig_* parameters are saved */
+ if (treatment == REG_REQ_ALREADY_SET && wiphy &&
+ wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
+ wiphy_update_regulatory(wiphy, reg_request->initiator);
+ reg_check_channels();
+ }
+
+ return;
+
+out_free:
+ reg_free_request(reg_request);
+}
+
+static bool reg_only_self_managed_wiphys(void)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wiphy *wiphy;
+ bool self_managed_found = false;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ wiphy = &rdev->wiphy;
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ self_managed_found = true;
+ else
+ return false;
+ }
+
+ /* make sure at least one self-managed wiphy exists */
+ return self_managed_found;
+}
+
+/*
+ * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_*
+ * Regulatory hints come on a first come first serve basis and we
+ * must process each one atomically.
+ */
+static void reg_process_pending_hints(void)
+{
+ struct regulatory_request *reg_request, *lr;
+
+ lr = get_last_request();
+
+ /* When last_request->processed becomes true this will be rescheduled */
+ if (lr && !lr->processed) {
+ reg_process_hint(lr);
+ return;
+ }
+
+ spin_lock(&reg_requests_lock);
+
+ if (list_empty(&reg_requests_list)) {
+ spin_unlock(&reg_requests_lock);
+ return;
+ }
+
+ reg_request = list_first_entry(&reg_requests_list,
+ struct regulatory_request,
+ list);
+ list_del_init(&reg_request->list);
+
+ spin_unlock(&reg_requests_lock);
+
+ if (reg_only_self_managed_wiphys()) {
+ reg_free_request(reg_request);
+ return;
+ }
+
+ reg_process_hint(reg_request);
+
+ lr = get_last_request();
+
+ spin_lock(&reg_requests_lock);
+ if (!list_empty(&reg_requests_list) && lr && lr->processed)
+ schedule_work(&reg_work);
+ spin_unlock(&reg_requests_lock);
+}
+
+/* Processes beacon hints -- this has nothing to do with country IEs */
+static void reg_process_pending_beacon_hints(void)
+{
+ struct cfg80211_registered_device *rdev;
+ struct reg_beacon *pending_beacon, *tmp;
+
+ /* This goes through the _pending_ beacon list */
+ spin_lock_bh(&reg_pending_beacons_lock);
+
+ list_for_each_entry_safe(pending_beacon, tmp,
+ &reg_pending_beacons, list) {
+ list_del_init(&pending_beacon->list);
+
+ /* Applies the beacon hint to current wiphys */
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list)
+ wiphy_update_new_beacon(&rdev->wiphy, pending_beacon);
+
+ /* Remembers the beacon hint for new wiphys or reg changes */
+ list_add_tail(&pending_beacon->list, &reg_beacon_list);
+ }
+
+ spin_unlock_bh(&reg_pending_beacons_lock);
+}
+
+static void reg_process_self_managed_hints(void)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wiphy *wiphy;
+ const struct ieee80211_regdomain *tmp;
+ const struct ieee80211_regdomain *regd;
+ enum ieee80211_band band;
+ struct regulatory_request request = {};
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ wiphy = &rdev->wiphy;
+
+ spin_lock(&reg_requests_lock);
+ regd = rdev->requested_regd;
+ rdev->requested_regd = NULL;
+ spin_unlock(&reg_requests_lock);
+
+ if (regd == NULL)
+ continue;
+
+ tmp = get_wiphy_regdom(wiphy);
+ rcu_assign_pointer(wiphy->regd, regd);
+ rcu_free_regdom(tmp);
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ handle_band_custom(wiphy, wiphy->bands[band], regd);
+
+ reg_process_ht_flags(wiphy);
+
+ request.wiphy_idx = get_wiphy_idx(wiphy);
+ request.alpha2[0] = regd->alpha2[0];
+ request.alpha2[1] = regd->alpha2[1];
+ request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
+
+ nl80211_send_wiphy_reg_change_event(&request);
+ }
+
+ reg_check_channels();
+}
+
+static void reg_todo(struct work_struct *work)
+{
+ rtnl_lock();
+ reg_process_pending_hints();
+ reg_process_pending_beacon_hints();
+ reg_process_self_managed_hints();
+ rtnl_unlock();
+}
+
+static void queue_regulatory_request(struct regulatory_request *request)
+{
+ request->alpha2[0] = toupper(request->alpha2[0]);
+ request->alpha2[1] = toupper(request->alpha2[1]);
+
+ spin_lock(&reg_requests_lock);
+ list_add_tail(&request->list, &reg_requests_list);
+ spin_unlock(&reg_requests_lock);
+
+ schedule_work(&reg_work);
+}
+
+/*
+ * Core regulatory hint -- happens during cfg80211_init()
+ * and when we restore regulatory settings.
+ */
+static int regulatory_hint_core(const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_CORE;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+/* User hints */
+int regulatory_hint_user(const char *alpha2,
+ enum nl80211_user_reg_hint_type user_reg_hint_type)
+{
+ struct regulatory_request *request;
+
+ if (WARN_ON(!alpha2))
+ return -EINVAL;
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wiphy_idx = WIPHY_IDX_INVALID;
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_USER;
+ request->user_reg_hint_type = user_reg_hint_type;
+
+ /* Allow calling CRDA again */
+ reg_crda_timeouts = 0;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+int regulatory_hint_indoor(bool is_indoor, u32 portid)
+{
+ spin_lock(&reg_indoor_lock);
+
+ /* It is possible that more than one user space process is trying to
+ * configure the indoor setting. To handle such cases, clear the indoor
+ * setting in case that some process does not think that the device
+ * is operating in an indoor environment. In addition, if a user space
+ * process indicates that it is controlling the indoor setting, save its
+ * portid, i.e., make it the owner.
+ */
+ reg_is_indoor = is_indoor;
+ if (reg_is_indoor) {
+ if (!reg_is_indoor_portid)
+ reg_is_indoor_portid = portid;
+ } else {
+ reg_is_indoor_portid = 0;
+ }
+
+ spin_unlock(&reg_indoor_lock);
+
+ if (!is_indoor)
+ reg_check_channels();
+
+ return 0;
+}
+
+void regulatory_netlink_notify(u32 portid)
+{
+ spin_lock(&reg_indoor_lock);
+
+ if (reg_is_indoor_portid != portid) {
+ spin_unlock(&reg_indoor_lock);
+ return;
+ }
+
+ reg_is_indoor = false;
+ reg_is_indoor_portid = 0;
+
+ spin_unlock(&reg_indoor_lock);
+
+ reg_check_channels();
+}
+
+/* Driver hints */
+int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ if (WARN_ON(!alpha2 || !wiphy))
+ return -EINVAL;
+
+ wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG;
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
+
+ /* Allow calling CRDA again */
+ reg_crda_timeouts = 0;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+EXPORT_SYMBOL(regulatory_hint);
+
+void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band,
+ const u8 *country_ie, u8 country_ie_len)
+{
+ char alpha2[2];
+ enum environment_cap env = ENVIRON_ANY;
+ struct regulatory_request *request = NULL, *lr;
+
+ /* IE len must be evenly divisible by 2 */
+ if (country_ie_len & 0x01)
+ return;
+
+ if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
+ return;
+
+ request = kzalloc(sizeof(*request), GFP_KERNEL);
+ if (!request)
+ return;
+
+ alpha2[0] = country_ie[0];
+ alpha2[1] = country_ie[1];
+
+ if (country_ie[2] == 'I')
+ env = ENVIRON_INDOOR;
+ else if (country_ie[2] == 'O')
+ env = ENVIRON_OUTDOOR;
+
+ rcu_read_lock();
+ lr = get_last_request();
+
+ if (unlikely(!lr))
+ goto out;
+
+ /*
+ * We will run this only upon a successful connection on cfg80211.
+ * We leave conflict resolution to the workqueue, where can hold
+ * the RTNL.
+ */
+ if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ lr->wiphy_idx != WIPHY_IDX_INVALID)
+ goto out;
+
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
+ request->country_ie_env = env;
+
+ /* Allow calling CRDA again */
+ reg_crda_timeouts = 0;
+
+ queue_regulatory_request(request);
+ request = NULL;
+out:
+ kfree(request);
+ rcu_read_unlock();
+}
+
+static void restore_alpha2(char *alpha2, bool reset_user)
+{
+ /* indicates there is no alpha2 to consider for restoration */
+ alpha2[0] = '9';
+ alpha2[1] = '7';
+
+ /* The user setting has precedence over the module parameter */
+ if (is_user_regdom_saved()) {
+ /* Unless we're asked to ignore it and reset it */
+ if (reset_user) {
+ REG_DBG_PRINT("Restoring regulatory settings including user preference\n");
+ user_alpha2[0] = '9';
+ user_alpha2[1] = '7';
+
+ /*
+ * If we're ignoring user settings, we still need to
+ * check the module parameter to ensure we put things
+ * back as they were for a full restore.
+ */
+ if (!is_world_regdom(ieee80211_regdom)) {
+ REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+ ieee80211_regdom[0], ieee80211_regdom[1]);
+ alpha2[0] = ieee80211_regdom[0];
+ alpha2[1] = ieee80211_regdom[1];
+ }
+ } else {
+ REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n",
+ user_alpha2[0], user_alpha2[1]);
+ alpha2[0] = user_alpha2[0];
+ alpha2[1] = user_alpha2[1];
+ }
+ } else if (!is_world_regdom(ieee80211_regdom)) {
+ REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+ ieee80211_regdom[0], ieee80211_regdom[1]);
+ alpha2[0] = ieee80211_regdom[0];
+ alpha2[1] = ieee80211_regdom[1];
+ } else
+ REG_DBG_PRINT("Restoring regulatory settings\n");
+}
+
+static void restore_custom_reg_settings(struct wiphy *wiphy)
+{
+ struct ieee80211_supported_band *sband;
+ enum ieee80211_band band;
+ struct ieee80211_channel *chan;
+ int i;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ sband = wiphy->bands[band];
+ if (!sband)
+ continue;
+ for (i = 0; i < sband->n_channels; i++) {
+ chan = &sband->channels[i];
+ chan->flags = chan->orig_flags;
+ chan->max_antenna_gain = chan->orig_mag;
+ chan->max_power = chan->orig_mpwr;
+ chan->beacon_found = false;
+ }
+ }
+}
+
+/*
+ * Restoring regulatory settings involves ingoring any
+ * possibly stale country IE information and user regulatory
+ * settings if so desired, this includes any beacon hints
+ * learned as we could have traveled outside to another country
+ * after disconnection. To restore regulatory settings we do
+ * exactly what we did at bootup:
+ *
+ * - send a core regulatory hint
+ * - send a user regulatory hint if applicable
+ *
+ * Device drivers that send a regulatory hint for a specific country
+ * keep their own regulatory domain on wiphy->regd so that does does
+ * not need to be remembered.
+ */
+static void restore_regulatory_settings(bool reset_user)
+{
+ char alpha2[2];
+ char world_alpha2[2];
+ struct reg_beacon *reg_beacon, *btmp;
+ LIST_HEAD(tmp_reg_req_list);
+ struct cfg80211_registered_device *rdev;
+
+ ASSERT_RTNL();
+
+ /*
+ * Clear the indoor setting in case that it is not controlled by user
+ * space, as otherwise there is no guarantee that the device is still
+ * operating in an indoor environment.
+ */
+ spin_lock(&reg_indoor_lock);
+ if (reg_is_indoor && !reg_is_indoor_portid) {
+ reg_is_indoor = false;
+ reg_check_channels();
+ }
+ spin_unlock(&reg_indoor_lock);
+
+ reset_regdomains(true, &world_regdom);
+ restore_alpha2(alpha2, reset_user);
+
+ /*
+ * If there's any pending requests we simply
+ * stash them to a temporary pending queue and
+ * add then after we've restored regulatory
+ * settings.
+ */
+ spin_lock(&reg_requests_lock);
+ list_splice_tail_init(&reg_requests_list, &tmp_reg_req_list);
+ spin_unlock(&reg_requests_lock);
+
+ /* Clear beacon hints */
+ spin_lock_bh(&reg_pending_beacons_lock);
+ list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+
+ /* First restore to the basic regulatory settings */
+ world_alpha2[0] = cfg80211_world_regdom->alpha2[0];
+ world_alpha2[1] = cfg80211_world_regdom->alpha2[1];
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ continue;
+ if (rdev->wiphy.regulatory_flags & REGULATORY_CUSTOM_REG)
+ restore_custom_reg_settings(&rdev->wiphy);
+ }
+
+ regulatory_hint_core(world_alpha2);
+
+ /*
+ * This restores the ieee80211_regdom module parameter
+ * preference or the last user requested regulatory
+ * settings, user regulatory settings takes precedence.
+ */
+ if (is_an_alpha2(alpha2))
+ regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER);
+
+ spin_lock(&reg_requests_lock);
+ list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
+ spin_unlock(&reg_requests_lock);
+
+ REG_DBG_PRINT("Kicking the queue\n");
+
+ schedule_work(&reg_work);
+}
+
+void regulatory_hint_disconnect(void)
+{
+ REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n");
+ restore_regulatory_settings(false);
+}
+
+static bool freq_is_chan_12_13_14(u16 freq)
+{
+ if (freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) ||
+ freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) ||
+ freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ))
+ return true;
+ return false;
+}
+
+static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan)
+{
+ struct reg_beacon *pending_beacon;
+
+ list_for_each_entry(pending_beacon, &reg_pending_beacons, list)
+ if (beacon_chan->center_freq ==
+ pending_beacon->chan.center_freq)
+ return true;
+ return false;
+}
+
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp)
+{
+ struct reg_beacon *reg_beacon;
+ bool processing;
+
+ if (beacon_chan->beacon_found ||
+ beacon_chan->flags & IEEE80211_CHAN_RADAR ||
+ (beacon_chan->band == IEEE80211_BAND_2GHZ &&
+ !freq_is_chan_12_13_14(beacon_chan->center_freq)))
+ return 0;
+
+ spin_lock_bh(&reg_pending_beacons_lock);
+ processing = pending_reg_beacon(beacon_chan);
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ if (processing)
+ return 0;
+
+ reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp);
+ if (!reg_beacon)
+ return -ENOMEM;
+
+ REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n",
+ beacon_chan->center_freq,
+ ieee80211_frequency_to_channel(beacon_chan->center_freq),
+ wiphy_name(wiphy));
+
+ memcpy(&reg_beacon->chan, beacon_chan,
+ sizeof(struct ieee80211_channel));
+
+ /*
+ * Since we can be called from BH or and non-BH context
+ * we must use spin_lock_bh()
+ */
+ spin_lock_bh(&reg_pending_beacons_lock);
+ list_add_tail(&reg_beacon->list, &reg_pending_beacons);
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ schedule_work(&reg_work);
+
+ return 0;
+}
+
+static void print_rd_rules(const struct ieee80211_regdomain *rd)
+{
+ unsigned int i;
+ const struct ieee80211_reg_rule *reg_rule = NULL;
+ const struct ieee80211_freq_range *freq_range = NULL;
+ const struct ieee80211_power_rule *power_rule = NULL;
+ char bw[32], cac_time[32];
+
+ pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");
+
+ for (i = 0; i < rd->n_reg_rules; i++) {
+ reg_rule = &rd->reg_rules[i];
+ freq_range = &reg_rule->freq_range;
+ power_rule = &reg_rule->power_rule;
+
+ if (reg_rule->flags & NL80211_RRF_AUTO_BW)
+ snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO",
+ freq_range->max_bandwidth_khz,
+ reg_get_max_bandwidth(rd, reg_rule));
+ else
+ snprintf(bw, sizeof(bw), "%d KHz",
+ freq_range->max_bandwidth_khz);
+
+ if (reg_rule->flags & NL80211_RRF_DFS)
+ scnprintf(cac_time, sizeof(cac_time), "%u s",
+ reg_rule->dfs_cac_ms/1000);
+ else
+ scnprintf(cac_time, sizeof(cac_time), "N/A");
+
+
+ /*
+ * There may not be documentation for max antenna gain
+ * in certain regions
+ */
+ if (power_rule->max_antenna_gain)
+ pr_info(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
+ freq_range->start_freq_khz,
+ freq_range->end_freq_khz,
+ bw,
+ power_rule->max_antenna_gain,
+ power_rule->max_eirp,
+ cac_time);
+ else
+ pr_info(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
+ freq_range->start_freq_khz,
+ freq_range->end_freq_khz,
+ bw,
+ power_rule->max_eirp,
+ cac_time);
+ }
+}
+
+bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
+{
+ switch (dfs_region) {
+ case NL80211_DFS_UNSET:
+ case NL80211_DFS_FCC:
+ case NL80211_DFS_ETSI:
+ case NL80211_DFS_JP:
+ return true;
+ default:
+ REG_DBG_PRINT("Ignoring uknown DFS master region: %d\n",
+ dfs_region);
+ return false;
+ }
+}
+
+static void print_regdomain(const struct ieee80211_regdomain *rd)
+{
+ struct regulatory_request *lr = get_last_request();
+
+ if (is_intersected_alpha2(rd->alpha2)) {
+ if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ struct cfg80211_registered_device *rdev;
+ rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx);
+ if (rdev) {
+ pr_info("Current regulatory domain updated by AP to: %c%c\n",
+ rdev->country_ie_alpha2[0],
+ rdev->country_ie_alpha2[1]);
+ } else
+ pr_info("Current regulatory domain intersected:\n");
+ } else
+ pr_info("Current regulatory domain intersected:\n");
+ } else if (is_world_regdom(rd->alpha2)) {
+ pr_info("World regulatory domain updated:\n");
+ } else {
+ if (is_unknown_alpha2(rd->alpha2))
+ pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n");
+ else {
+ if (reg_request_cell_base(lr))
+ pr_info("Regulatory domain changed to country: %c%c by Cell Station\n",
+ rd->alpha2[0], rd->alpha2[1]);
+ else
+ pr_info("Regulatory domain changed to country: %c%c\n",
+ rd->alpha2[0], rd->alpha2[1]);
+ }
+ }
+
+ pr_info(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
+ print_rd_rules(rd);
+}
+
+static void print_regdomain_info(const struct ieee80211_regdomain *rd)
+{
+ pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
+ print_rd_rules(rd);
+}
+
+static int reg_set_rd_core(const struct ieee80211_regdomain *rd)
+{
+ if (!is_world_regdom(rd->alpha2))
+ return -EINVAL;
+ update_world_regdomain(rd);
+ return 0;
+}
+
+static int reg_set_rd_user(const struct ieee80211_regdomain *rd,
+ struct regulatory_request *user_request)
+{
+ const struct ieee80211_regdomain *intersected_rd = NULL;
+
+ if (!regdom_changes(rd->alpha2))
+ return -EALREADY;
+
+ if (!is_valid_rd(rd)) {
+ pr_err("Invalid regulatory domain detected:\n");
+ print_regdomain_info(rd);
+ return -EINVAL;
+ }
+
+ if (!user_request->intersect) {
+ reset_regdomains(false, rd);
+ return 0;
+ }
+
+ intersected_rd = regdom_intersect(rd, get_cfg80211_regdom());
+ if (!intersected_rd)
+ return -EINVAL;
+
+ kfree(rd);
+ rd = NULL;
+ reset_regdomains(false, intersected_rd);
+
+ return 0;
+}
+
+static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
+ struct regulatory_request *driver_request)
+{
+ const struct ieee80211_regdomain *regd;
+ const struct ieee80211_regdomain *intersected_rd = NULL;
+ const struct ieee80211_regdomain *tmp;
+ struct wiphy *request_wiphy;
+
+ if (is_world_regdom(rd->alpha2))
+ return -EINVAL;
+
+ if (!regdom_changes(rd->alpha2))
+ return -EALREADY;
+
+ if (!is_valid_rd(rd)) {
+ pr_err("Invalid regulatory domain detected:\n");
+ print_regdomain_info(rd);
+ return -EINVAL;
+ }
+
+ request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx);
+ if (!request_wiphy) {
+ queue_delayed_work(system_power_efficient_wq,
+ &reg_timeout, 0);
+ return -ENODEV;
+ }
+
+ if (!driver_request->intersect) {
+ if (request_wiphy->regd)
+ return -EALREADY;
+
+ regd = reg_copy_regd(rd);
+ if (IS_ERR(regd))
+ return PTR_ERR(regd);
+
+ rcu_assign_pointer(request_wiphy->regd, regd);
+ reset_regdomains(false, rd);
+ return 0;
+ }
+
+ intersected_rd = regdom_intersect(rd, get_cfg80211_regdom());
+ if (!intersected_rd)
+ return -EINVAL;
+
+ /*
+ * We can trash what CRDA provided now.
+ * However if a driver requested this specific regulatory
+ * domain we keep it for its private use
+ */
+ tmp = get_wiphy_regdom(request_wiphy);
+ rcu_assign_pointer(request_wiphy->regd, rd);
+ rcu_free_regdom(tmp);
+
+ rd = NULL;
+
+ reset_regdomains(false, intersected_rd);
+
+ return 0;
+}
+
+static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd,
+ struct regulatory_request *country_ie_request)
+{
+ struct wiphy *request_wiphy;
+
+ if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) &&
+ !is_unknown_alpha2(rd->alpha2))
+ return -EINVAL;
+
+ /*
+ * Lets only bother proceeding on the same alpha2 if the current
+ * rd is non static (it means CRDA was present and was used last)
+ * and the pending request came in from a country IE
+ */
+
+ if (!is_valid_rd(rd)) {
+ pr_err("Invalid regulatory domain detected:\n");
+ print_regdomain_info(rd);
+ return -EINVAL;
+ }
+
+ request_wiphy = wiphy_idx_to_wiphy(country_ie_request->wiphy_idx);
+ if (!request_wiphy) {
+ queue_delayed_work(system_power_efficient_wq,
+ &reg_timeout, 0);
+ return -ENODEV;
+ }
+
+ if (country_ie_request->intersect)
+ return -EINVAL;
+
+ reset_regdomains(false, rd);
+ return 0;
+}
+
+/*
+ * Use this call to set the current regulatory domain. Conflicts with
+ * multiple drivers can be ironed out later. Caller must've already
+ * kmalloc'd the rd structure.
+ */
+int set_regdom(const struct ieee80211_regdomain *rd,
+ enum ieee80211_regd_source regd_src)
+{
+ struct regulatory_request *lr;
+ bool user_reset = false;
+ int r;
+
+ if (!reg_is_valid_request(rd->alpha2)) {
+ kfree(rd);
+ return -EINVAL;
+ }
+
+ if (regd_src == REGD_SOURCE_CRDA)
+ reg_crda_timeouts = 0;
+
+ lr = get_last_request();
+
+ /* Note that this doesn't update the wiphys, this is done below */
+ switch (lr->initiator) {
+ case NL80211_REGDOM_SET_BY_CORE:
+ r = reg_set_rd_core(rd);
+ break;
+ case NL80211_REGDOM_SET_BY_USER:
+ r = reg_set_rd_user(rd, lr);
+ user_reset = true;
+ break;
+ case NL80211_REGDOM_SET_BY_DRIVER:
+ r = reg_set_rd_driver(rd, lr);
+ break;
+ case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+ r = reg_set_rd_country_ie(rd, lr);
+ break;
+ default:
+ WARN(1, "invalid initiator %d\n", lr->initiator);
+ return -EINVAL;
+ }
+
+ if (r) {
+ switch (r) {
+ case -EALREADY:
+ reg_set_request_processed();
+ break;
+ default:
+ /* Back to world regulatory in case of errors */
+ restore_regulatory_settings(user_reset);
+ }
+
+ kfree(rd);
+ return r;
+ }
+
+ /* This would make this whole thing pointless */
+ if (WARN_ON(!lr->intersect && rd != get_cfg80211_regdom()))
+ return -EINVAL;
+
+ /* update all wiphys now with the new established regulatory domain */
+ update_all_wiphy_regulatory(lr->initiator);
+
+ print_regdomain(get_cfg80211_regdom());
+
+ nl80211_send_reg_change_event(lr);
+
+ reg_set_request_processed();
+
+ return 0;
+}
+
+static int __regulatory_set_wiphy_regd(struct wiphy *wiphy,
+ struct ieee80211_regdomain *rd)
+{
+ const struct ieee80211_regdomain *regd;
+ const struct ieee80211_regdomain *prev_regd;
+ struct cfg80211_registered_device *rdev;
+
+ if (WARN_ON(!wiphy || !rd))
+ return -EINVAL;
+
+ if (WARN(!(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED),
+ "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n"))
+ return -EPERM;
+
+ if (WARN(!is_valid_rd(rd), "Invalid regulatory domain detected\n")) {
+ print_regdomain_info(rd);
+ return -EINVAL;
+ }
+
+ regd = reg_copy_regd(rd);
+ if (IS_ERR(regd))
+ return PTR_ERR(regd);
+
+ rdev = wiphy_to_rdev(wiphy);
+
+ spin_lock(&reg_requests_lock);
+ prev_regd = rdev->requested_regd;
+ rdev->requested_regd = regd;
+ spin_unlock(&reg_requests_lock);
+
+ kfree(prev_regd);
+ return 0;
+}
+
+int regulatory_set_wiphy_regd(struct wiphy *wiphy,
+ struct ieee80211_regdomain *rd)
+{
+ int ret = __regulatory_set_wiphy_regd(wiphy, rd);
+
+ if (ret)
+ return ret;
+
+ schedule_work(&reg_work);
+ return 0;
+}
+EXPORT_SYMBOL(regulatory_set_wiphy_regd);
+
+int regulatory_set_wiphy_regd_sync_rtnl(struct wiphy *wiphy,
+ struct ieee80211_regdomain *rd)
+{
+ int ret;
+
+ ASSERT_RTNL();
+
+ ret = __regulatory_set_wiphy_regd(wiphy, rd);
+ if (ret)
+ return ret;
+
+ /* process the request immediately */
+ reg_process_self_managed_hints();
+ return 0;
+}
+EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl);
+
+void wiphy_regulatory_register(struct wiphy *wiphy)
+{
+ struct regulatory_request *lr;
+
+ /* self-managed devices ignore external hints */
+ if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+ wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS |
+ REGULATORY_COUNTRY_IE_IGNORE;
+
+ if (!reg_dev_ignore_cell_hint(wiphy))
+ reg_num_devs_support_basehint++;
+
+ lr = get_last_request();
+ wiphy_update_regulatory(wiphy, lr->initiator);
+}
+
+void wiphy_regulatory_deregister(struct wiphy *wiphy)
+{
+ struct wiphy *request_wiphy = NULL;
+ struct regulatory_request *lr;
+
+ lr = get_last_request();
+
+ if (!reg_dev_ignore_cell_hint(wiphy))
+ reg_num_devs_support_basehint--;
+
+ rcu_free_regdom(get_wiphy_regdom(wiphy));
+ RCU_INIT_POINTER(wiphy->regd, NULL);
+
+ if (lr)
+ request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
+
+ if (!request_wiphy || request_wiphy != wiphy)
+ return;
+
+ lr->wiphy_idx = WIPHY_IDX_INVALID;
+ lr->country_ie_env = ENVIRON_ANY;
+}
+
+static void reg_timeout_work(struct work_struct *work)
+{
+ REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
+ rtnl_lock();
+ reg_crda_timeouts++;
+ restore_regulatory_settings(true);
+ rtnl_unlock();
+}
+
+/*
+ * See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for
+ * UNII band definitions
+ */
+int cfg80211_get_unii(int freq)
+{
+ /* UNII-1 */
+ if (freq >= 5150 && freq <= 5250)
+ return 0;
+
+ /* UNII-2A */
+ if (freq > 5250 && freq <= 5350)
+ return 1;
+
+ /* UNII-2B */
+ if (freq > 5350 && freq <= 5470)
+ return 2;
+
+ /* UNII-2C */
+ if (freq > 5470 && freq <= 5725)
+ return 3;
+
+ /* UNII-3 */
+ if (freq > 5725 && freq <= 5825)
+ return 4;
+
+ return -EINVAL;
+}
+
+bool regulatory_indoor_allowed(void)
+{
+ return reg_is_indoor;
+}
+
+int __init regulatory_init(void)
+{
+ int err = 0;
+
+ reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
+ if (IS_ERR(reg_pdev))
+ return PTR_ERR(reg_pdev);
+
+ spin_lock_init(&reg_requests_lock);
+ spin_lock_init(&reg_pending_beacons_lock);
+ spin_lock_init(&reg_indoor_lock);
+
+ reg_regdb_size_check();
+
+ rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom);
+
+ user_alpha2[0] = '9';
+ user_alpha2[1] = '7';
+
+ /* We always try to get an update for the static regdomain */
+ err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
+ if (err) {
+ if (err == -ENOMEM)
+ return err;
+ /*
+ * N.B. kobject_uevent_env() can fail mainly for when we're out
+ * memory which is handled and propagated appropriately above
+ * but it can also fail during a netlink_broadcast() or during
+ * early boot for call_usermodehelper(). For now treat these
+ * errors as non-fatal.
+ */
+ pr_err("kobject_uevent_env() was unable to call CRDA during init\n");
+ }
+
+ /*
+ * Finally, if the user set the module parameter treat it
+ * as a user hint.
+ */
+ if (!is_world_regdom(ieee80211_regdom))
+ regulatory_hint_user(ieee80211_regdom,
+ NL80211_USER_REG_HINT_USER);
+
+ return 0;
+}
+
+void regulatory_exit(void)
+{
+ struct regulatory_request *reg_request, *tmp;
+ struct reg_beacon *reg_beacon, *btmp;
+
+ cancel_work_sync(&reg_work);
+ cancel_delayed_work_sync(&reg_timeout);
+ cancel_delayed_work_sync(&reg_check_chans);
+
+ /* Lock to suppress warnings */
+ rtnl_lock();
+ reset_regdomains(true, NULL);
+ rtnl_unlock();
+
+ dev_set_uevent_suppress(&reg_pdev->dev, true);
+
+ platform_device_unregister(reg_pdev);
+
+ list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+
+ list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+
+ list_for_each_entry_safe(reg_request, tmp, &reg_requests_list, list) {
+ list_del(&reg_request->list);
+ kfree(reg_request);
+ }
+}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
new file mode 100644
index 000000000..9f495d76e
--- /dev/null
+++ b/net/wireless/reg.h
@@ -0,0 +1,146 @@
+#ifndef __NET_WIRELESS_REG_H
+#define __NET_WIRELESS_REG_H
+/*
+ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+enum ieee80211_regd_source {
+ REGD_SOURCE_INTERNAL_DB,
+ REGD_SOURCE_CRDA,
+};
+
+extern const struct ieee80211_regdomain __rcu *cfg80211_regdomain;
+
+bool reg_is_valid_request(const char *alpha2);
+bool is_world_regdom(const char *alpha2);
+bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region);
+enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy);
+
+int regulatory_hint_user(const char *alpha2,
+ enum nl80211_user_reg_hint_type user_reg_hint_type);
+
+/**
+ * regulatory_hint_indoor - hint operation in indoor env. or not
+ * @is_indoor: if true indicates that user space thinks that the
+ * device is operating in an indoor environment.
+ * @portid: the netlink port ID on which the hint was given.
+ */
+int regulatory_hint_indoor(bool is_indoor, u32 portid);
+
+/**
+ * regulatory_netlink_notify - notify on released netlink socket
+ * @portid: the netlink socket port ID
+ */
+void regulatory_netlink_notify(u32 portid);
+
+void wiphy_regulatory_register(struct wiphy *wiphy);
+void wiphy_regulatory_deregister(struct wiphy *wiphy);
+
+int __init regulatory_init(void);
+void regulatory_exit(void);
+
+int set_regdom(const struct ieee80211_regdomain *rd,
+ enum ieee80211_regd_source regd_src);
+
+unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd,
+ const struct ieee80211_reg_rule *rule);
+
+bool reg_last_request_cell_base(void);
+const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy);
+
+/**
+ * regulatory_hint_found_beacon - hints a beacon was found on a channel
+ * @wiphy: the wireless device where the beacon was found on
+ * @beacon_chan: the channel on which the beacon was found on
+ * @gfp: context flags
+ *
+ * This informs the wireless core that a beacon from an AP was found on
+ * the channel provided. This allows the wireless core to make educated
+ * guesses on regulatory to help with world roaming. This is only used for
+ * world roaming -- when we do not know our current location. This is
+ * only useful on channels 12, 13 and 14 on the 2 GHz band as channels
+ * 1-11 are already enabled by the world regulatory domain; and on
+ * non-radar 5 GHz channels.
+ *
+ * Drivers do not need to call this, cfg80211 will do it for after a scan
+ * on a newly found BSS. If you cannot make use of this feature you can
+ * set the wiphy->disable_beacon_hints to true.
+ */
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp);
+
+/**
+ * regulatory_hint_country_ie - hints a country IE as a regulatory domain
+ * @wiphy: the wireless device giving the hint (used only for reporting
+ * conflicts)
+ * @band: the band on which the country IE was received on. This determines
+ * the band we'll process the country IE channel triplets for.
+ * @country_ie: pointer to the country IE
+ * @country_ie_len: length of the country IE
+ *
+ * We will intersect the rd with the what CRDA tells us should apply
+ * for the alpha2 this country IE belongs to, this prevents APs from
+ * sending us incorrect or outdated information against a country.
+ *
+ * The AP is expected to provide Country IE channel triplets for the
+ * band it is on. It is technically possible for APs to send channel
+ * country IE triplets even for channels outside of the band they are
+ * in but for that they would have to use the regulatory extension
+ * in combination with a triplet but this behaviour is currently
+ * not observed. For this reason if a triplet is seen with channel
+ * information for a band the BSS is not present in it will be ignored.
+ */
+void regulatory_hint_country_ie(struct wiphy *wiphy,
+ enum ieee80211_band band,
+ const u8 *country_ie,
+ u8 country_ie_len);
+
+/**
+ * regulatory_hint_disconnect - informs all devices have been disconneted
+ *
+ * Regulotory rules can be enhanced further upon scanning and upon
+ * connection to an AP. These rules become stale if we disconnect
+ * and go to another country, whether or not we suspend and resume.
+ * If we suspend, go to another country and resume we'll automatically
+ * get disconnected shortly after resuming and things will be reset as well.
+ * This routine is a helper to restore regulatory settings to how they were
+ * prior to our first connect attempt. This includes ignoring country IE and
+ * beacon regulatory hints. The ieee80211_regdom module parameter will always
+ * be respected but if a user had set the regulatory domain that will take
+ * precedence.
+ *
+ * Must be called from process context.
+ */
+void regulatory_hint_disconnect(void);
+
+/**
+ * cfg80211_get_unii - get the U-NII band for the frequency
+ * @freq: the frequency for which we want to get the UNII band.
+
+ * Get a value specifying the U-NII band frequency belongs to.
+ * U-NII bands are defined by the FCC in C.F.R 47 part 15.
+ *
+ * Returns -EINVAL if freq is invalid, 0 for UNII-1, 1 for UNII-2A,
+ * 2 for UNII-2B, 3 for UNII-2C and 4 for UNII-3.
+ */
+int cfg80211_get_unii(int freq);
+
+/**
+ * regulatory_indoor_allowed - is indoor operation allowed
+ */
+bool regulatory_indoor_allowed(void);
+
+#endif /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/regdb.h b/net/wireless/regdb.h
new file mode 100644
index 000000000..3279cfcef
--- /dev/null
+++ b/net/wireless/regdb.h
@@ -0,0 +1,23 @@
+#ifndef __REGDB_H__
+#define __REGDB_H__
+
+/*
+ * Copyright 2009 John W. Linville <linville@tuxdriver.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+extern const struct ieee80211_regdomain *reg_regdb[];
+extern int reg_regdb_size;
+
+#endif /* __REGDB_H__ */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
new file mode 100644
index 000000000..3a50aa255
--- /dev/null
+++ b/net/wireless/scan.c
@@ -0,0 +1,1679 @@
+/*
+ * cfg80211 scan result handling
+ *
+ * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/wireless.h>
+#include <linux/nl80211.h>
+#include <linux/etherdevice.h>
+#include <net/arp.h>
+#include <net/cfg80211.h>
+#include <net/cfg80211-wext.h>
+#include <net/iw_handler.h>
+#include "core.h"
+#include "nl80211.h"
+#include "wext-compat.h"
+#include "rdev-ops.h"
+
+/**
+ * DOC: BSS tree/list structure
+ *
+ * At the top level, the BSS list is kept in both a list in each
+ * registered device (@bss_list) as well as an RB-tree for faster
+ * lookup. In the RB-tree, entries can be looked up using their
+ * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID
+ * for other BSSes.
+ *
+ * Due to the possibility of hidden SSIDs, there's a second level
+ * structure, the "hidden_list" and "hidden_beacon_bss" pointer.
+ * The hidden_list connects all BSSes belonging to a single AP
+ * that has a hidden SSID, and connects beacon and probe response
+ * entries. For a probe response entry for a hidden SSID, the
+ * hidden_beacon_bss pointer points to the BSS struct holding the
+ * beacon's information.
+ *
+ * Reference counting is done for all these references except for
+ * the hidden_list, so that a beacon BSS struct that is otherwise
+ * not referenced has one reference for being on the bss_list and
+ * one for each probe response entry that points to it using the
+ * hidden_beacon_bss pointer. When a BSS struct that has such a
+ * pointer is get/put, the refcount update is also propagated to
+ * the referenced struct, this ensure that it cannot get removed
+ * while somebody is using the probe response version.
+ *
+ * Note that the hidden_beacon_bss pointer never changes, due to
+ * the reference counting. Therefore, no locking is needed for
+ * it.
+ *
+ * Also note that the hidden_beacon_bss pointer is only relevant
+ * if the driver uses something other than the IEs, e.g. private
+ * data stored stored in the BSS struct, since the beacon IEs are
+ * also linked into the probe response struct.
+ */
+
+#define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ)
+
+static void bss_free(struct cfg80211_internal_bss *bss)
+{
+ struct cfg80211_bss_ies *ies;
+
+ if (WARN_ON(atomic_read(&bss->hold)))
+ return;
+
+ ies = (void *)rcu_access_pointer(bss->pub.beacon_ies);
+ if (ies && !bss->pub.hidden_beacon_bss)
+ kfree_rcu(ies, rcu_head);
+ ies = (void *)rcu_access_pointer(bss->pub.proberesp_ies);
+ if (ies)
+ kfree_rcu(ies, rcu_head);
+
+ /*
+ * This happens when the module is removed, it doesn't
+ * really matter any more save for completeness
+ */
+ if (!list_empty(&bss->hidden_list))
+ list_del(&bss->hidden_list);
+
+ kfree(bss);
+}
+
+static inline void bss_ref_get(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *bss)
+{
+ lockdep_assert_held(&rdev->bss_lock);
+
+ bss->refcount++;
+ if (bss->pub.hidden_beacon_bss) {
+ bss = container_of(bss->pub.hidden_beacon_bss,
+ struct cfg80211_internal_bss,
+ pub);
+ bss->refcount++;
+ }
+}
+
+static inline void bss_ref_put(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *bss)
+{
+ lockdep_assert_held(&rdev->bss_lock);
+
+ if (bss->pub.hidden_beacon_bss) {
+ struct cfg80211_internal_bss *hbss;
+ hbss = container_of(bss->pub.hidden_beacon_bss,
+ struct cfg80211_internal_bss,
+ pub);
+ hbss->refcount--;
+ if (hbss->refcount == 0)
+ bss_free(hbss);
+ }
+ bss->refcount--;
+ if (bss->refcount == 0)
+ bss_free(bss);
+}
+
+static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *bss)
+{
+ lockdep_assert_held(&rdev->bss_lock);
+
+ if (!list_empty(&bss->hidden_list)) {
+ /*
+ * don't remove the beacon entry if it has
+ * probe responses associated with it
+ */
+ if (!bss->pub.hidden_beacon_bss)
+ return false;
+ /*
+ * if it's a probe response entry break its
+ * link to the other entries in the group
+ */
+ list_del_init(&bss->hidden_list);
+ }
+
+ list_del_init(&bss->list);
+ rb_erase(&bss->rbn, &rdev->bss_tree);
+ bss_ref_put(rdev, bss);
+ return true;
+}
+
+static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev,
+ unsigned long expire_time)
+{
+ struct cfg80211_internal_bss *bss, *tmp;
+ bool expired = false;
+
+ lockdep_assert_held(&rdev->bss_lock);
+
+ list_for_each_entry_safe(bss, tmp, &rdev->bss_list, list) {
+ if (atomic_read(&bss->hold))
+ continue;
+ if (!time_after(expire_time, bss->ts))
+ continue;
+
+ if (__cfg80211_unlink_bss(rdev, bss))
+ expired = true;
+ }
+
+ if (expired)
+ rdev->bss_generation++;
+}
+
+void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
+ bool send_message)
+{
+ struct cfg80211_scan_request *request;
+ struct wireless_dev *wdev;
+ struct sk_buff *msg;
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+#endif
+
+ ASSERT_RTNL();
+
+ if (rdev->scan_msg) {
+ nl80211_send_scan_result(rdev, rdev->scan_msg);
+ rdev->scan_msg = NULL;
+ return;
+ }
+
+ request = rdev->scan_req;
+ if (!request)
+ return;
+
+ wdev = request->wdev;
+
+ /*
+ * This must be before sending the other events!
+ * Otherwise, wpa_supplicant gets completely confused with
+ * wext events.
+ */
+ if (wdev->netdev)
+ cfg80211_sme_scan_done(wdev->netdev);
+
+ if (!request->aborted &&
+ request->flags & NL80211_SCAN_FLAG_FLUSH) {
+ /* flush entries from previous scans */
+ spin_lock_bh(&rdev->bss_lock);
+ __cfg80211_bss_expire(rdev, request->scan_start);
+ spin_unlock_bh(&rdev->bss_lock);
+ }
+
+ msg = nl80211_build_scan_msg(rdev, wdev, request->aborted);
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (wdev->netdev && !request->aborted) {
+ memset(&wrqu, 0, sizeof(wrqu));
+
+ wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL);
+ }
+#endif
+
+ if (wdev->netdev)
+ dev_put(wdev->netdev);
+
+ rdev->scan_req = NULL;
+ kfree(request);
+
+ if (!send_message)
+ rdev->scan_msg = msg;
+ else
+ nl80211_send_scan_result(rdev, msg);
+}
+
+void __cfg80211_scan_done(struct work_struct *wk)
+{
+ struct cfg80211_registered_device *rdev;
+
+ rdev = container_of(wk, struct cfg80211_registered_device,
+ scan_done_wk);
+
+ rtnl_lock();
+ ___cfg80211_scan_done(rdev, true);
+ rtnl_unlock();
+}
+
+void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted)
+{
+ trace_cfg80211_scan_done(request, aborted);
+ WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req);
+
+ request->aborted = aborted;
+ request->notified = true;
+ queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk);
+}
+EXPORT_SYMBOL(cfg80211_scan_done);
+
+void __cfg80211_sched_scan_results(struct work_struct *wk)
+{
+ struct cfg80211_registered_device *rdev;
+ struct cfg80211_sched_scan_request *request;
+
+ rdev = container_of(wk, struct cfg80211_registered_device,
+ sched_scan_results_wk);
+
+ rtnl_lock();
+
+ request = rtnl_dereference(rdev->sched_scan_req);
+
+ /* we don't have sched_scan_req anymore if the scan is stopping */
+ if (request) {
+ if (request->flags & NL80211_SCAN_FLAG_FLUSH) {
+ /* flush entries from previous scans */
+ spin_lock_bh(&rdev->bss_lock);
+ __cfg80211_bss_expire(rdev, request->scan_start);
+ spin_unlock_bh(&rdev->bss_lock);
+ request->scan_start =
+ jiffies + msecs_to_jiffies(request->interval);
+ }
+ nl80211_send_sched_scan_results(rdev, request->dev);
+ }
+
+ rtnl_unlock();
+}
+
+void cfg80211_sched_scan_results(struct wiphy *wiphy)
+{
+ trace_cfg80211_sched_scan_results(wiphy);
+ /* ignore if we're not scanning */
+
+ if (rcu_access_pointer(wiphy_to_rdev(wiphy)->sched_scan_req))
+ queue_work(cfg80211_wq,
+ &wiphy_to_rdev(wiphy)->sched_scan_results_wk);
+}
+EXPORT_SYMBOL(cfg80211_sched_scan_results);
+
+void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+
+ ASSERT_RTNL();
+
+ trace_cfg80211_sched_scan_stopped(wiphy);
+
+ __cfg80211_stop_sched_scan(rdev, true);
+}
+EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl);
+
+void cfg80211_sched_scan_stopped(struct wiphy *wiphy)
+{
+ rtnl_lock();
+ cfg80211_sched_scan_stopped_rtnl(wiphy);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(cfg80211_sched_scan_stopped);
+
+int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
+ bool driver_initiated)
+{
+ struct cfg80211_sched_scan_request *sched_scan_req;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ if (!rdev->sched_scan_req)
+ return -ENOENT;
+
+ sched_scan_req = rtnl_dereference(rdev->sched_scan_req);
+ dev = sched_scan_req->dev;
+
+ if (!driver_initiated) {
+ int err = rdev_sched_scan_stop(rdev, dev);
+ if (err)
+ return err;
+ }
+
+ nl80211_send_sched_scan(rdev, dev, NL80211_CMD_SCHED_SCAN_STOPPED);
+
+ RCU_INIT_POINTER(rdev->sched_scan_req, NULL);
+ kfree_rcu(sched_scan_req, rcu_head);
+
+ return 0;
+}
+
+void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
+ unsigned long age_secs)
+{
+ struct cfg80211_internal_bss *bss;
+ unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC);
+
+ spin_lock_bh(&rdev->bss_lock);
+ list_for_each_entry(bss, &rdev->bss_list, list)
+ bss->ts -= age_jiffies;
+ spin_unlock_bh(&rdev->bss_lock);
+}
+
+void cfg80211_bss_expire(struct cfg80211_registered_device *rdev)
+{
+ __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE);
+}
+
+const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
+{
+ while (len > 2 && ies[0] != eid) {
+ len -= ies[1] + 2;
+ ies += ies[1] + 2;
+ }
+ if (len < 2)
+ return NULL;
+ if (len < 2 + ies[1])
+ return NULL;
+ return ies;
+}
+EXPORT_SYMBOL(cfg80211_find_ie);
+
+const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type,
+ const u8 *ies, int len)
+{
+ struct ieee80211_vendor_ie *ie;
+ const u8 *pos = ies, *end = ies + len;
+ int ie_oui;
+
+ while (pos < end) {
+ pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos,
+ end - pos);
+ if (!pos)
+ return NULL;
+
+ ie = (struct ieee80211_vendor_ie *)pos;
+
+ /* make sure we can access ie->len */
+ BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1);
+
+ if (ie->len < sizeof(*ie))
+ goto cont;
+
+ ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2];
+ if (ie_oui == oui && ie->oui_type == oui_type)
+ return pos;
+cont:
+ pos += 2 + ie->len;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(cfg80211_find_vendor_ie);
+
+static bool is_bss(struct cfg80211_bss *a, const u8 *bssid,
+ const u8 *ssid, size_t ssid_len)
+{
+ const struct cfg80211_bss_ies *ies;
+ const u8 *ssidie;
+
+ if (bssid && !ether_addr_equal(a->bssid, bssid))
+ return false;
+
+ if (!ssid)
+ return true;
+
+ ies = rcu_access_pointer(a->ies);
+ if (!ies)
+ return false;
+ ssidie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len);
+ if (!ssidie)
+ return false;
+ if (ssidie[1] != ssid_len)
+ return false;
+ return memcmp(ssidie + 2, ssid, ssid_len) == 0;
+}
+
+/**
+ * enum bss_compare_mode - BSS compare mode
+ * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find)
+ * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode
+ * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode
+ */
+enum bss_compare_mode {
+ BSS_CMP_REGULAR,
+ BSS_CMP_HIDE_ZLEN,
+ BSS_CMP_HIDE_NUL,
+};
+
+static int cmp_bss(struct cfg80211_bss *a,
+ struct cfg80211_bss *b,
+ enum bss_compare_mode mode)
+{
+ const struct cfg80211_bss_ies *a_ies, *b_ies;
+ const u8 *ie1 = NULL;
+ const u8 *ie2 = NULL;
+ int i, r;
+
+ if (a->channel != b->channel)
+ return b->channel->center_freq - a->channel->center_freq;
+
+ a_ies = rcu_access_pointer(a->ies);
+ if (!a_ies)
+ return -1;
+ b_ies = rcu_access_pointer(b->ies);
+ if (!b_ies)
+ return 1;
+
+ if (WLAN_CAPABILITY_IS_STA_BSS(a->capability))
+ ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID,
+ a_ies->data, a_ies->len);
+ if (WLAN_CAPABILITY_IS_STA_BSS(b->capability))
+ ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID,
+ b_ies->data, b_ies->len);
+ if (ie1 && ie2) {
+ int mesh_id_cmp;
+
+ if (ie1[1] == ie2[1])
+ mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]);
+ else
+ mesh_id_cmp = ie2[1] - ie1[1];
+
+ ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
+ a_ies->data, a_ies->len);
+ ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
+ b_ies->data, b_ies->len);
+ if (ie1 && ie2) {
+ if (mesh_id_cmp)
+ return mesh_id_cmp;
+ if (ie1[1] != ie2[1])
+ return ie2[1] - ie1[1];
+ return memcmp(ie1 + 2, ie2 + 2, ie1[1]);
+ }
+ }
+
+ r = memcmp(a->bssid, b->bssid, sizeof(a->bssid));
+ if (r)
+ return r;
+
+ ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len);
+ ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len);
+
+ if (!ie1 && !ie2)
+ return 0;
+
+ /*
+ * Note that with "hide_ssid", the function returns a match if
+ * the already-present BSS ("b") is a hidden SSID beacon for
+ * the new BSS ("a").
+ */
+
+ /* sort missing IE before (left of) present IE */
+ if (!ie1)
+ return -1;
+ if (!ie2)
+ return 1;
+
+ switch (mode) {
+ case BSS_CMP_HIDE_ZLEN:
+ /*
+ * In ZLEN mode we assume the BSS entry we're
+ * looking for has a zero-length SSID. So if
+ * the one we're looking at right now has that,
+ * return 0. Otherwise, return the difference
+ * in length, but since we're looking for the
+ * 0-length it's really equivalent to returning
+ * the length of the one we're looking at.
+ *
+ * No content comparison is needed as we assume
+ * the content length is zero.
+ */
+ return ie2[1];
+ case BSS_CMP_REGULAR:
+ default:
+ /* sort by length first, then by contents */
+ if (ie1[1] != ie2[1])
+ return ie2[1] - ie1[1];
+ return memcmp(ie1 + 2, ie2 + 2, ie1[1]);
+ case BSS_CMP_HIDE_NUL:
+ if (ie1[1] != ie2[1])
+ return ie2[1] - ie1[1];
+ /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */
+ for (i = 0; i < ie2[1]; i++)
+ if (ie2[i + 2])
+ return -1;
+ return 0;
+ }
+}
+
+static bool cfg80211_bss_type_match(u16 capability,
+ enum ieee80211_band band,
+ enum ieee80211_bss_type bss_type)
+{
+ bool ret = true;
+ u16 mask, val;
+
+ if (bss_type == IEEE80211_BSS_TYPE_ANY)
+ return ret;
+
+ if (band == IEEE80211_BAND_60GHZ) {
+ mask = WLAN_CAPABILITY_DMG_TYPE_MASK;
+ switch (bss_type) {
+ case IEEE80211_BSS_TYPE_ESS:
+ val = WLAN_CAPABILITY_DMG_TYPE_AP;
+ break;
+ case IEEE80211_BSS_TYPE_PBSS:
+ val = WLAN_CAPABILITY_DMG_TYPE_PBSS;
+ break;
+ case IEEE80211_BSS_TYPE_IBSS:
+ val = WLAN_CAPABILITY_DMG_TYPE_IBSS;
+ break;
+ default:
+ return false;
+ }
+ } else {
+ mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS;
+ switch (bss_type) {
+ case IEEE80211_BSS_TYPE_ESS:
+ val = WLAN_CAPABILITY_ESS;
+ break;
+ case IEEE80211_BSS_TYPE_IBSS:
+ val = WLAN_CAPABILITY_IBSS;
+ break;
+ case IEEE80211_BSS_TYPE_MBSS:
+ val = 0;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ ret = ((capability & mask) == val);
+ return ret;
+}
+
+/* Returned bss is reference counted and must be cleaned up appropriately. */
+struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
+ struct ieee80211_channel *channel,
+ const u8 *bssid,
+ const u8 *ssid, size_t ssid_len,
+ enum ieee80211_bss_type bss_type,
+ enum ieee80211_privacy privacy)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss, *res = NULL;
+ unsigned long now = jiffies;
+ int bss_privacy;
+
+ trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type,
+ privacy);
+
+ spin_lock_bh(&rdev->bss_lock);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (!cfg80211_bss_type_match(bss->pub.capability,
+ bss->pub.channel->band, bss_type))
+ continue;
+
+ bss_privacy = (bss->pub.capability & WLAN_CAPABILITY_PRIVACY);
+ if ((privacy == IEEE80211_PRIVACY_ON && !bss_privacy) ||
+ (privacy == IEEE80211_PRIVACY_OFF && bss_privacy))
+ continue;
+ if (channel && bss->pub.channel != channel)
+ continue;
+ if (!is_valid_ether_addr(bss->pub.bssid))
+ continue;
+ /* Don't get expired BSS structs */
+ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) &&
+ !atomic_read(&bss->hold))
+ continue;
+ if (is_bss(&bss->pub, bssid, ssid, ssid_len)) {
+ res = bss;
+ bss_ref_get(rdev, res);
+ break;
+ }
+ }
+
+ spin_unlock_bh(&rdev->bss_lock);
+ if (!res)
+ return NULL;
+ trace_cfg80211_return_bss(&res->pub);
+ return &res->pub;
+}
+EXPORT_SYMBOL(cfg80211_get_bss);
+
+static void rb_insert_bss(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *bss)
+{
+ struct rb_node **p = &rdev->bss_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct cfg80211_internal_bss *tbss;
+ int cmp;
+
+ while (*p) {
+ parent = *p;
+ tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn);
+
+ cmp = cmp_bss(&bss->pub, &tbss->pub, BSS_CMP_REGULAR);
+
+ if (WARN_ON(!cmp)) {
+ /* will sort of leak this BSS */
+ return;
+ }
+
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&bss->rbn, parent, p);
+ rb_insert_color(&bss->rbn, &rdev->bss_tree);
+}
+
+static struct cfg80211_internal_bss *
+rb_find_bss(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *res,
+ enum bss_compare_mode mode)
+{
+ struct rb_node *n = rdev->bss_tree.rb_node;
+ struct cfg80211_internal_bss *bss;
+ int r;
+
+ while (n) {
+ bss = rb_entry(n, struct cfg80211_internal_bss, rbn);
+ r = cmp_bss(&res->pub, &bss->pub, mode);
+
+ if (r == 0)
+ return bss;
+ else if (r < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+
+ return NULL;
+}
+
+static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *new)
+{
+ const struct cfg80211_bss_ies *ies;
+ struct cfg80211_internal_bss *bss;
+ const u8 *ie;
+ int i, ssidlen;
+ u8 fold = 0;
+
+ ies = rcu_access_pointer(new->pub.beacon_ies);
+ if (WARN_ON(!ies))
+ return false;
+
+ ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len);
+ if (!ie) {
+ /* nothing to do */
+ return true;
+ }
+
+ ssidlen = ie[1];
+ for (i = 0; i < ssidlen; i++)
+ fold |= ie[2 + i];
+
+ if (fold) {
+ /* not a hidden SSID */
+ return true;
+ }
+
+ /* This is the bad part ... */
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid))
+ continue;
+ if (bss->pub.channel != new->pub.channel)
+ continue;
+ if (bss->pub.scan_width != new->pub.scan_width)
+ continue;
+ if (rcu_access_pointer(bss->pub.beacon_ies))
+ continue;
+ ies = rcu_access_pointer(bss->pub.ies);
+ if (!ies)
+ continue;
+ ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len);
+ if (!ie)
+ continue;
+ if (ssidlen && ie[1] != ssidlen)
+ continue;
+ if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss))
+ continue;
+ if (WARN_ON_ONCE(!list_empty(&bss->hidden_list)))
+ list_del(&bss->hidden_list);
+ /* combine them */
+ list_add(&bss->hidden_list, &new->hidden_list);
+ bss->pub.hidden_beacon_bss = &new->pub;
+ new->refcount += bss->refcount;
+ rcu_assign_pointer(bss->pub.beacon_ies,
+ new->pub.beacon_ies);
+ }
+
+ return true;
+}
+
+/* Returned bss is reference counted and must be cleaned up appropriately. */
+static struct cfg80211_internal_bss *
+cfg80211_bss_update(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *tmp,
+ bool signal_valid)
+{
+ struct cfg80211_internal_bss *found = NULL;
+
+ if (WARN_ON(!tmp->pub.channel))
+ return NULL;
+
+ tmp->ts = jiffies;
+
+ spin_lock_bh(&rdev->bss_lock);
+
+ if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) {
+ spin_unlock_bh(&rdev->bss_lock);
+ return NULL;
+ }
+
+ found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR);
+
+ if (found) {
+ /* Update IEs */
+ if (rcu_access_pointer(tmp->pub.proberesp_ies)) {
+ const struct cfg80211_bss_ies *old;
+
+ old = rcu_access_pointer(found->pub.proberesp_ies);
+
+ rcu_assign_pointer(found->pub.proberesp_ies,
+ tmp->pub.proberesp_ies);
+ /* Override possible earlier Beacon frame IEs */
+ rcu_assign_pointer(found->pub.ies,
+ tmp->pub.proberesp_ies);
+ if (old)
+ kfree_rcu((struct cfg80211_bss_ies *)old,
+ rcu_head);
+ } else if (rcu_access_pointer(tmp->pub.beacon_ies)) {
+ const struct cfg80211_bss_ies *old;
+ struct cfg80211_internal_bss *bss;
+
+ if (found->pub.hidden_beacon_bss &&
+ !list_empty(&found->hidden_list)) {
+ const struct cfg80211_bss_ies *f;
+
+ /*
+ * The found BSS struct is one of the probe
+ * response members of a group, but we're
+ * receiving a beacon (beacon_ies in the tmp
+ * bss is used). This can only mean that the
+ * AP changed its beacon from not having an
+ * SSID to showing it, which is confusing so
+ * drop this information.
+ */
+
+ f = rcu_access_pointer(tmp->pub.beacon_ies);
+ kfree_rcu((struct cfg80211_bss_ies *)f,
+ rcu_head);
+ goto drop;
+ }
+
+ old = rcu_access_pointer(found->pub.beacon_ies);
+
+ rcu_assign_pointer(found->pub.beacon_ies,
+ tmp->pub.beacon_ies);
+
+ /* Override IEs if they were from a beacon before */
+ if (old == rcu_access_pointer(found->pub.ies))
+ rcu_assign_pointer(found->pub.ies,
+ tmp->pub.beacon_ies);
+
+ /* Assign beacon IEs to all sub entries */
+ list_for_each_entry(bss, &found->hidden_list,
+ hidden_list) {
+ const struct cfg80211_bss_ies *ies;
+
+ ies = rcu_access_pointer(bss->pub.beacon_ies);
+ WARN_ON(ies != old);
+
+ rcu_assign_pointer(bss->pub.beacon_ies,
+ tmp->pub.beacon_ies);
+ }
+
+ if (old)
+ kfree_rcu((struct cfg80211_bss_ies *)old,
+ rcu_head);
+ }
+
+ found->pub.beacon_interval = tmp->pub.beacon_interval;
+ /*
+ * don't update the signal if beacon was heard on
+ * adjacent channel.
+ */
+ if (signal_valid)
+ found->pub.signal = tmp->pub.signal;
+ found->pub.capability = tmp->pub.capability;
+ found->ts = tmp->ts;
+ } else {
+ struct cfg80211_internal_bss *new;
+ struct cfg80211_internal_bss *hidden;
+ struct cfg80211_bss_ies *ies;
+
+ /*
+ * create a copy -- the "res" variable that is passed in
+ * is allocated on the stack since it's not needed in the
+ * more common case of an update
+ */
+ new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size,
+ GFP_ATOMIC);
+ if (!new) {
+ ies = (void *)rcu_dereference(tmp->pub.beacon_ies);
+ if (ies)
+ kfree_rcu(ies, rcu_head);
+ ies = (void *)rcu_dereference(tmp->pub.proberesp_ies);
+ if (ies)
+ kfree_rcu(ies, rcu_head);
+ goto drop;
+ }
+ memcpy(new, tmp, sizeof(*new));
+ new->refcount = 1;
+ INIT_LIST_HEAD(&new->hidden_list);
+
+ if (rcu_access_pointer(tmp->pub.proberesp_ies)) {
+ hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN);
+ if (!hidden)
+ hidden = rb_find_bss(rdev, tmp,
+ BSS_CMP_HIDE_NUL);
+ if (hidden) {
+ new->pub.hidden_beacon_bss = &hidden->pub;
+ list_add(&new->hidden_list,
+ &hidden->hidden_list);
+ hidden->refcount++;
+ rcu_assign_pointer(new->pub.beacon_ies,
+ hidden->pub.beacon_ies);
+ }
+ } else {
+ /*
+ * Ok so we found a beacon, and don't have an entry. If
+ * it's a beacon with hidden SSID, we might be in for an
+ * expensive search for any probe responses that should
+ * be grouped with this beacon for updates ...
+ */
+ if (!cfg80211_combine_bsses(rdev, new)) {
+ kfree(new);
+ goto drop;
+ }
+ }
+
+ list_add_tail(&new->list, &rdev->bss_list);
+ rb_insert_bss(rdev, new);
+ found = new;
+ }
+
+ rdev->bss_generation++;
+ bss_ref_get(rdev, found);
+ spin_unlock_bh(&rdev->bss_lock);
+
+ return found;
+ drop:
+ spin_unlock_bh(&rdev->bss_lock);
+ return NULL;
+}
+
+static struct ieee80211_channel *
+cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
+ struct ieee80211_channel *channel)
+{
+ const u8 *tmp;
+ u32 freq;
+ int channel_number = -1;
+
+ tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen);
+ if (tmp && tmp[1] == 1) {
+ channel_number = tmp[2];
+ } else {
+ tmp = cfg80211_find_ie(WLAN_EID_HT_OPERATION, ie, ielen);
+ if (tmp && tmp[1] >= sizeof(struct ieee80211_ht_operation)) {
+ struct ieee80211_ht_operation *htop = (void *)(tmp + 2);
+
+ channel_number = htop->primary_chan;
+ }
+ }
+
+ if (channel_number < 0)
+ return channel;
+
+ freq = ieee80211_channel_to_frequency(channel_number, channel->band);
+ channel = ieee80211_get_channel(wiphy, freq);
+ if (!channel)
+ return NULL;
+ if (channel->flags & IEEE80211_CHAN_DISABLED)
+ return NULL;
+ return channel;
+}
+
+/* Returned bss is reference counted and must be cleaned up appropriately. */
+struct cfg80211_bss*
+cfg80211_inform_bss_width(struct wiphy *wiphy,
+ struct ieee80211_channel *rx_channel,
+ enum nl80211_bss_scan_width scan_width,
+ enum cfg80211_bss_frame_type ftype,
+ const u8 *bssid, u64 tsf, u16 capability,
+ u16 beacon_interval, const u8 *ie, size_t ielen,
+ s32 signal, gfp_t gfp)
+{
+ struct cfg80211_bss_ies *ies;
+ struct ieee80211_channel *channel;
+ struct cfg80211_internal_bss tmp = {}, *res;
+ int bss_type;
+ bool signal_valid;
+
+ if (WARN_ON(!wiphy))
+ return NULL;
+
+ if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
+ (signal < 0 || signal > 100)))
+ return NULL;
+
+ channel = cfg80211_get_bss_channel(wiphy, ie, ielen, rx_channel);
+ if (!channel)
+ return NULL;
+
+ memcpy(tmp.pub.bssid, bssid, ETH_ALEN);
+ tmp.pub.channel = channel;
+ tmp.pub.scan_width = scan_width;
+ tmp.pub.signal = signal;
+ tmp.pub.beacon_interval = beacon_interval;
+ tmp.pub.capability = capability;
+ /*
+ * If we do not know here whether the IEs are from a Beacon or Probe
+ * Response frame, we need to pick one of the options and only use it
+ * with the driver that does not provide the full Beacon/Probe Response
+ * frame. Use Beacon frame pointer to avoid indicating that this should
+ * override the IEs pointer should we have received an earlier
+ * indication of Probe Response data.
+ */
+ ies = kzalloc(sizeof(*ies) + ielen, gfp);
+ if (!ies)
+ return NULL;
+ ies->len = ielen;
+ ies->tsf = tsf;
+ ies->from_beacon = false;
+ memcpy(ies->data, ie, ielen);
+
+ switch (ftype) {
+ case CFG80211_BSS_FTYPE_BEACON:
+ ies->from_beacon = true;
+ /* fall through to assign */
+ case CFG80211_BSS_FTYPE_UNKNOWN:
+ rcu_assign_pointer(tmp.pub.beacon_ies, ies);
+ break;
+ case CFG80211_BSS_FTYPE_PRESP:
+ rcu_assign_pointer(tmp.pub.proberesp_ies, ies);
+ break;
+ }
+ rcu_assign_pointer(tmp.pub.ies, ies);
+
+ signal_valid = abs(rx_channel->center_freq - channel->center_freq) <=
+ wiphy->max_adj_channel_rssi_comp;
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ if (!res)
+ return NULL;
+
+ if (channel->band == IEEE80211_BAND_60GHZ) {
+ bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK;
+ if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP ||
+ bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+ } else {
+ if (res->pub.capability & WLAN_CAPABILITY_ESS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+ }
+
+ trace_cfg80211_return_bss(&res->pub);
+ /* cfg80211_bss_update gives us a referenced result */
+ return &res->pub;
+}
+EXPORT_SYMBOL(cfg80211_inform_bss_width);
+
+/* Returned bss is reference counted and must be cleaned up appropriately. */
+struct cfg80211_bss *
+cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
+ struct ieee80211_channel *rx_channel,
+ enum nl80211_bss_scan_width scan_width,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ s32 signal, gfp_t gfp)
+{
+ struct cfg80211_internal_bss tmp = {}, *res;
+ struct cfg80211_bss_ies *ies;
+ struct ieee80211_channel *channel;
+ bool signal_valid;
+ size_t ielen = len - offsetof(struct ieee80211_mgmt,
+ u.probe_resp.variable);
+ int bss_type;
+
+ BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
+ offsetof(struct ieee80211_mgmt, u.beacon.variable));
+
+ trace_cfg80211_inform_bss_width_frame(wiphy, rx_channel, scan_width, mgmt,
+ len, signal);
+
+ if (WARN_ON(!mgmt))
+ return NULL;
+
+ if (WARN_ON(!wiphy))
+ return NULL;
+
+ if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
+ (signal < 0 || signal > 100)))
+ return NULL;
+
+ if (WARN_ON(len < offsetof(struct ieee80211_mgmt, u.probe_resp.variable)))
+ return NULL;
+
+ channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable,
+ ielen, rx_channel);
+ if (!channel)
+ return NULL;
+
+ ies = kzalloc(sizeof(*ies) + ielen, gfp);
+ if (!ies)
+ return NULL;
+ ies->len = ielen;
+ ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
+ ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control);
+ memcpy(ies->data, mgmt->u.probe_resp.variable, ielen);
+
+ if (ieee80211_is_probe_resp(mgmt->frame_control))
+ rcu_assign_pointer(tmp.pub.proberesp_ies, ies);
+ else
+ rcu_assign_pointer(tmp.pub.beacon_ies, ies);
+ rcu_assign_pointer(tmp.pub.ies, ies);
+
+ memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
+ tmp.pub.channel = channel;
+ tmp.pub.scan_width = scan_width;
+ tmp.pub.signal = signal;
+ tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
+ tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
+
+ signal_valid = abs(rx_channel->center_freq - channel->center_freq) <=
+ wiphy->max_adj_channel_rssi_comp;
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid);
+ if (!res)
+ return NULL;
+
+ if (channel->band == IEEE80211_BAND_60GHZ) {
+ bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK;
+ if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP ||
+ bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+ } else {
+ if (res->pub.capability & WLAN_CAPABILITY_ESS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+ }
+
+ trace_cfg80211_return_bss(&res->pub);
+ /* cfg80211_bss_update gives us a referenced result */
+ return &res->pub;
+}
+EXPORT_SYMBOL(cfg80211_inform_bss_width_frame);
+
+void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss;
+
+ if (!pub)
+ return;
+
+ bss = container_of(pub, struct cfg80211_internal_bss, pub);
+
+ spin_lock_bh(&rdev->bss_lock);
+ bss_ref_get(rdev, bss);
+ spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_ref_bss);
+
+void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss;
+
+ if (!pub)
+ return;
+
+ bss = container_of(pub, struct cfg80211_internal_bss, pub);
+
+ spin_lock_bh(&rdev->bss_lock);
+ bss_ref_put(rdev, bss);
+ spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_put_bss);
+
+void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *bss;
+
+ if (WARN_ON(!pub))
+ return;
+
+ bss = container_of(pub, struct cfg80211_internal_bss, pub);
+
+ spin_lock_bh(&rdev->bss_lock);
+ if (!list_empty(&bss->list)) {
+ if (__cfg80211_unlink_bss(rdev, bss))
+ rdev->bss_generation++;
+ }
+ spin_unlock_bh(&rdev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_unlink_bss);
+
+#ifdef CONFIG_CFG80211_WEXT
+static struct cfg80211_registered_device *
+cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
+{
+ struct cfg80211_registered_device *rdev;
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+ if (dev->ieee80211_ptr)
+ rdev = wiphy_to_rdev(dev->ieee80211_ptr->wiphy);
+ else
+ rdev = ERR_PTR(-ENODEV);
+ dev_put(dev);
+ return rdev;
+}
+
+int cfg80211_wext_siwscan(struct net_device *dev,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *extra)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wiphy *wiphy;
+ struct iw_scan_req *wreq = NULL;
+ struct cfg80211_scan_request *creq = NULL;
+ int i, err, n_channels = 0;
+ enum ieee80211_band band;
+
+ if (!netif_running(dev))
+ return -ENETDOWN;
+
+ if (wrqu->data.length == sizeof(struct iw_scan_req))
+ wreq = (struct iw_scan_req *)extra;
+
+ rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);
+
+ if (IS_ERR(rdev))
+ return PTR_ERR(rdev);
+
+ if (rdev->scan_req || rdev->scan_msg) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ wiphy = &rdev->wiphy;
+
+ /* Determine number of channels, needed to allocate creq */
+ if (wreq && wreq->num_channels)
+ n_channels = wreq->num_channels;
+ else
+ n_channels = ieee80211_get_num_supported_channels(wiphy);
+
+ creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) +
+ n_channels * sizeof(void *),
+ GFP_ATOMIC);
+ if (!creq) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ creq->wiphy = wiphy;
+ creq->wdev = dev->ieee80211_ptr;
+ /* SSIDs come after channels */
+ creq->ssids = (void *)&creq->channels[n_channels];
+ creq->n_channels = n_channels;
+ creq->n_ssids = 1;
+ creq->scan_start = jiffies;
+
+ /* translate "Scan on frequencies" request */
+ i = 0;
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ int j;
+
+ if (!wiphy->bands[band])
+ continue;
+
+ for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+ /* ignore disabled channels */
+ if (wiphy->bands[band]->channels[j].flags &
+ IEEE80211_CHAN_DISABLED)
+ continue;
+
+ /* If we have a wireless request structure and the
+ * wireless request specifies frequencies, then search
+ * for the matching hardware channel.
+ */
+ if (wreq && wreq->num_channels) {
+ int k;
+ int wiphy_freq = wiphy->bands[band]->channels[j].center_freq;
+ for (k = 0; k < wreq->num_channels; k++) {
+ struct iw_freq *freq =
+ &wreq->channel_list[k];
+ int wext_freq =
+ cfg80211_wext_freq(freq);
+
+ if (wext_freq == wiphy_freq)
+ goto wext_freq_found;
+ }
+ goto wext_freq_not_found;
+ }
+
+ wext_freq_found:
+ creq->channels[i] = &wiphy->bands[band]->channels[j];
+ i++;
+ wext_freq_not_found: ;
+ }
+ }
+ /* No channels found? */
+ if (!i) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Set real number of channels specified in creq->channels[] */
+ creq->n_channels = i;
+
+ /* translate "Scan for SSID" request */
+ if (wreq) {
+ if (wrqu->data.flags & IW_SCAN_THIS_ESSID) {
+ if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) {
+ err = -EINVAL;
+ goto out;
+ }
+ memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len);
+ creq->ssids[0].ssid_len = wreq->essid_len;
+ }
+ if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE)
+ creq->n_ssids = 0;
+ }
+
+ for (i = 0; i < IEEE80211_NUM_BANDS; i++)
+ if (wiphy->bands[i])
+ creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1;
+
+ rdev->scan_req = creq;
+ err = rdev_scan(rdev, creq);
+ if (err) {
+ rdev->scan_req = NULL;
+ /* creq will be freed below */
+ } else {
+ nl80211_send_scan_start(rdev, dev->ieee80211_ptr);
+ /* creq now owned by driver */
+ creq = NULL;
+ dev_hold(dev);
+ }
+ out:
+ kfree(creq);
+ return err;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_siwscan);
+
+static char *ieee80211_scan_add_ies(struct iw_request_info *info,
+ const struct cfg80211_bss_ies *ies,
+ char *current_ev, char *end_buf)
+{
+ const u8 *pos, *end, *next;
+ struct iw_event iwe;
+
+ if (!ies)
+ return current_ev;
+
+ /*
+ * If needed, fragment the IEs buffer (at IE boundaries) into short
+ * enough fragments to fit into IW_GENERIC_IE_MAX octet messages.
+ */
+ pos = ies->data;
+ end = pos + ies->len;
+
+ while (end - pos > IW_GENERIC_IE_MAX) {
+ next = pos + 2 + pos[1];
+ while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX)
+ next = next + 2 + next[1];
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVGENIE;
+ iwe.u.data.length = next - pos;
+ current_ev = iwe_stream_add_point_check(info, current_ev,
+ end_buf, &iwe,
+ (void *)pos);
+ if (IS_ERR(current_ev))
+ return current_ev;
+ pos = next;
+ }
+
+ if (end > pos) {
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVGENIE;
+ iwe.u.data.length = end - pos;
+ current_ev = iwe_stream_add_point_check(info, current_ev,
+ end_buf, &iwe,
+ (void *)pos);
+ if (IS_ERR(current_ev))
+ return current_ev;
+ }
+
+ return current_ev;
+}
+
+static char *
+ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
+ struct cfg80211_internal_bss *bss, char *current_ev,
+ char *end_buf)
+{
+ const struct cfg80211_bss_ies *ies;
+ struct iw_event iwe;
+ const u8 *ie;
+ u8 buf[50];
+ u8 *cfg, *p, *tmp;
+ int rem, i, sig;
+ bool ismesh = false;
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWAP;
+ iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
+ memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN);
+ current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
+ IW_EV_ADDR_LEN);
+ if (IS_ERR(current_ev))
+ return current_ev;
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWFREQ;
+ iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq);
+ iwe.u.freq.e = 0;
+ current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
+ IW_EV_FREQ_LEN);
+ if (IS_ERR(current_ev))
+ return current_ev;
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWFREQ;
+ iwe.u.freq.m = bss->pub.channel->center_freq;
+ iwe.u.freq.e = 6;
+ current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
+ IW_EV_FREQ_LEN);
+ if (IS_ERR(current_ev))
+ return current_ev;
+
+ if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVQUAL;
+ iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
+ IW_QUAL_NOISE_INVALID |
+ IW_QUAL_QUAL_UPDATED;
+ switch (wiphy->signal_type) {
+ case CFG80211_SIGNAL_TYPE_MBM:
+ sig = bss->pub.signal / 100;
+ iwe.u.qual.level = sig;
+ iwe.u.qual.updated |= IW_QUAL_DBM;
+ if (sig < -110) /* rather bad */
+ sig = -110;
+ else if (sig > -40) /* perfect */
+ sig = -40;
+ /* will give a range of 0 .. 70 */
+ iwe.u.qual.qual = sig + 110;
+ break;
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ iwe.u.qual.level = bss->pub.signal;
+ /* will give range 0 .. 100 */
+ iwe.u.qual.qual = bss->pub.signal;
+ break;
+ default:
+ /* not reached */
+ break;
+ }
+ current_ev = iwe_stream_add_event_check(info, current_ev,
+ end_buf, &iwe,
+ IW_EV_QUAL_LEN);
+ if (IS_ERR(current_ev))
+ return current_ev;
+ }
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWENCODE;
+ if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY)
+ iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
+ else
+ iwe.u.data.flags = IW_ENCODE_DISABLED;
+ iwe.u.data.length = 0;
+ current_ev = iwe_stream_add_point_check(info, current_ev, end_buf,
+ &iwe, "");
+ if (IS_ERR(current_ev))
+ return current_ev;
+
+ rcu_read_lock();
+ ies = rcu_dereference(bss->pub.ies);
+ rem = ies->len;
+ ie = ies->data;
+
+ while (rem >= 2) {
+ /* invalid data */
+ if (ie[1] > rem - 2)
+ break;
+
+ switch (ie[0]) {
+ case WLAN_EID_SSID:
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWESSID;
+ iwe.u.data.length = ie[1];
+ iwe.u.data.flags = 1;
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf, &iwe,
+ (u8 *)ie + 2);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ break;
+ case WLAN_EID_MESH_ID:
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWESSID;
+ iwe.u.data.length = ie[1];
+ iwe.u.data.flags = 1;
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf, &iwe,
+ (u8 *)ie + 2);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ break;
+ case WLAN_EID_MESH_CONFIG:
+ ismesh = true;
+ if (ie[1] != sizeof(struct ieee80211_meshconf_ie))
+ break;
+ cfg = (u8 *)ie + 2;
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVCUSTOM;
+ sprintf(buf, "Mesh Network Path Selection Protocol ID: "
+ "0x%02X", cfg[0]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Path Selection Metric ID: 0x%02X",
+ cfg[1]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Congestion Control Mode ID: 0x%02X",
+ cfg[2]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Authentication ID: 0x%02X", cfg[4]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Formation Info: 0x%02X", cfg[5]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ sprintf(buf, "Capabilities: 0x%02X", cfg[6]);
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info,
+ current_ev,
+ end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ break;
+ case WLAN_EID_SUPP_RATES:
+ case WLAN_EID_EXT_SUPP_RATES:
+ /* display all supported rates in readable format */
+ p = current_ev + iwe_stream_lcp_len(info);
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWRATE;
+ /* Those two flags are ignored... */
+ iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
+
+ for (i = 0; i < ie[1]; i++) {
+ iwe.u.bitrate.value =
+ ((ie[i + 2] & 0x7f) * 500000);
+ tmp = p;
+ p = iwe_stream_add_value(info, current_ev, p,
+ end_buf, &iwe,
+ IW_EV_PARAM_LEN);
+ if (p == tmp) {
+ current_ev = ERR_PTR(-E2BIG);
+ goto unlock;
+ }
+ }
+ current_ev = p;
+ break;
+ }
+ rem -= ie[1] + 2;
+ ie += ie[1] + 2;
+ }
+
+ if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) ||
+ ismesh) {
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = SIOCGIWMODE;
+ if (ismesh)
+ iwe.u.mode = IW_MODE_MESH;
+ else if (bss->pub.capability & WLAN_CAPABILITY_ESS)
+ iwe.u.mode = IW_MODE_MASTER;
+ else
+ iwe.u.mode = IW_MODE_ADHOC;
+ current_ev = iwe_stream_add_event_check(info, current_ev,
+ end_buf, &iwe,
+ IW_EV_UINT_LEN);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ }
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVCUSTOM;
+ sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf));
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info, current_ev, end_buf,
+ &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVCUSTOM;
+ sprintf(buf, " Last beacon: %ums ago",
+ elapsed_jiffies_msecs(bss->ts));
+ iwe.u.data.length = strlen(buf);
+ current_ev = iwe_stream_add_point_check(info, current_ev,
+ end_buf, &iwe, buf);
+ if (IS_ERR(current_ev))
+ goto unlock;
+
+ current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf);
+
+ unlock:
+ rcu_read_unlock();
+ return current_ev;
+}
+
+
+static int ieee80211_scan_results(struct cfg80211_registered_device *rdev,
+ struct iw_request_info *info,
+ char *buf, size_t len)
+{
+ char *current_ev = buf;
+ char *end_buf = buf + len;
+ struct cfg80211_internal_bss *bss;
+ int err = 0;
+
+ spin_lock_bh(&rdev->bss_lock);
+ cfg80211_bss_expire(rdev);
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (buf + len - current_ev <= IW_EV_ADDR_LEN) {
+ err = -E2BIG;
+ break;
+ }
+ current_ev = ieee80211_bss(&rdev->wiphy, info, bss,
+ current_ev, end_buf);
+ if (IS_ERR(current_ev)) {
+ err = PTR_ERR(current_ev);
+ break;
+ }
+ }
+ spin_unlock_bh(&rdev->bss_lock);
+
+ if (err)
+ return err;
+ return current_ev - buf;
+}
+
+
+int cfg80211_wext_giwscan(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct cfg80211_registered_device *rdev;
+ int res;
+
+ if (!netif_running(dev))
+ return -ENETDOWN;
+
+ rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);
+
+ if (IS_ERR(rdev))
+ return PTR_ERR(rdev);
+
+ if (rdev->scan_req || rdev->scan_msg)
+ return -EAGAIN;
+
+ res = ieee80211_scan_results(rdev, info, extra, data->length);
+ data->length = 0;
+ if (res >= 0) {
+ data->length = res;
+ res = 0;
+ }
+
+ return res;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwscan);
+#endif
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
new file mode 100644
index 000000000..d11454f87
--- /dev/null
+++ b/net/wireless/sme.c
@@ -0,0 +1,1051 @@
+/*
+ * SME code for cfg80211
+ * both driver SME event handling and the SME implementation
+ * (for nl80211's connect() and wext)
+ *
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2009 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/wireless.h>
+#include <linux/export.h>
+#include <net/iw_handler.h>
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+#include "nl80211.h"
+#include "reg.h"
+#include "rdev-ops.h"
+
+/*
+ * Software SME in cfg80211, using auth/assoc/deauth calls to the
+ * driver. This is is for implementing nl80211's connect/disconnect
+ * and wireless extensions (if configured.)
+ */
+
+struct cfg80211_conn {
+ struct cfg80211_connect_params params;
+ /* these are sub-states of the _CONNECTING sme_state */
+ enum {
+ CFG80211_CONN_SCANNING,
+ CFG80211_CONN_SCAN_AGAIN,
+ CFG80211_CONN_AUTHENTICATE_NEXT,
+ CFG80211_CONN_AUTHENTICATING,
+ CFG80211_CONN_AUTH_FAILED,
+ CFG80211_CONN_ASSOCIATE_NEXT,
+ CFG80211_CONN_ASSOCIATING,
+ CFG80211_CONN_ASSOC_FAILED,
+ CFG80211_CONN_DEAUTH,
+ CFG80211_CONN_CONNECTED,
+ } state;
+ u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
+ const u8 *ie;
+ size_t ie_len;
+ bool auto_auth, prev_bssid_valid;
+};
+
+static void cfg80211_sme_free(struct wireless_dev *wdev)
+{
+ if (!wdev->conn)
+ return;
+
+ kfree(wdev->conn->ie);
+ kfree(wdev->conn);
+ wdev->conn = NULL;
+}
+
+static int cfg80211_conn_scan(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_scan_request *request;
+ int n_channels, err;
+
+ ASSERT_RTNL();
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (rdev->scan_req || rdev->scan_msg)
+ return -EBUSY;
+
+ if (wdev->conn->params.channel)
+ n_channels = 1;
+ else
+ n_channels = ieee80211_get_num_supported_channels(wdev->wiphy);
+
+ request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) +
+ sizeof(request->channels[0]) * n_channels,
+ GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ if (wdev->conn->params.channel) {
+ enum ieee80211_band band = wdev->conn->params.channel->band;
+ struct ieee80211_supported_band *sband =
+ wdev->wiphy->bands[band];
+
+ if (!sband) {
+ kfree(request);
+ return -EINVAL;
+ }
+ request->channels[0] = wdev->conn->params.channel;
+ request->rates[band] = (1 << sband->n_bitrates) - 1;
+ } else {
+ int i = 0, j;
+ enum ieee80211_band band;
+ struct ieee80211_supported_band *bands;
+ struct ieee80211_channel *channel;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ bands = wdev->wiphy->bands[band];
+ if (!bands)
+ continue;
+ for (j = 0; j < bands->n_channels; j++) {
+ channel = &bands->channels[j];
+ if (channel->flags & IEEE80211_CHAN_DISABLED)
+ continue;
+ request->channels[i++] = channel;
+ }
+ request->rates[band] = (1 << bands->n_bitrates) - 1;
+ }
+ n_channels = i;
+ }
+ request->n_channels = n_channels;
+ request->ssids = (void *)&request->channels[n_channels];
+ request->n_ssids = 1;
+
+ memcpy(request->ssids[0].ssid, wdev->conn->params.ssid,
+ wdev->conn->params.ssid_len);
+ request->ssids[0].ssid_len = wdev->conn->params.ssid_len;
+
+ request->wdev = wdev;
+ request->wiphy = &rdev->wiphy;
+ request->scan_start = jiffies;
+
+ rdev->scan_req = request;
+
+ err = rdev_scan(rdev, request);
+ if (!err) {
+ wdev->conn->state = CFG80211_CONN_SCANNING;
+ nl80211_send_scan_start(rdev, wdev);
+ dev_hold(wdev->netdev);
+ } else {
+ rdev->scan_req = NULL;
+ kfree(request);
+ }
+ return err;
+}
+
+static int cfg80211_conn_do_work(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_connect_params *params;
+ struct cfg80211_assoc_request req = {};
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->conn)
+ return 0;
+
+ params = &wdev->conn->params;
+
+ switch (wdev->conn->state) {
+ case CFG80211_CONN_SCANNING:
+ /* didn't find it during scan ... */
+ return -ENOENT;
+ case CFG80211_CONN_SCAN_AGAIN:
+ return cfg80211_conn_scan(wdev);
+ case CFG80211_CONN_AUTHENTICATE_NEXT:
+ if (WARN_ON(!rdev->ops->auth))
+ return -EOPNOTSUPP;
+ wdev->conn->state = CFG80211_CONN_AUTHENTICATING;
+ return cfg80211_mlme_auth(rdev, wdev->netdev,
+ params->channel, params->auth_type,
+ params->bssid,
+ params->ssid, params->ssid_len,
+ NULL, 0,
+ params->key, params->key_len,
+ params->key_idx, NULL, 0);
+ case CFG80211_CONN_AUTH_FAILED:
+ return -ENOTCONN;
+ case CFG80211_CONN_ASSOCIATE_NEXT:
+ if (WARN_ON(!rdev->ops->assoc))
+ return -EOPNOTSUPP;
+ wdev->conn->state = CFG80211_CONN_ASSOCIATING;
+ if (wdev->conn->prev_bssid_valid)
+ req.prev_bssid = wdev->conn->prev_bssid;
+ req.ie = params->ie;
+ req.ie_len = params->ie_len;
+ req.use_mfp = params->mfp != NL80211_MFP_NO;
+ req.crypto = params->crypto;
+ req.flags = params->flags;
+ req.ht_capa = params->ht_capa;
+ req.ht_capa_mask = params->ht_capa_mask;
+ req.vht_capa = params->vht_capa;
+ req.vht_capa_mask = params->vht_capa_mask;
+
+ err = cfg80211_mlme_assoc(rdev, wdev->netdev, params->channel,
+ params->bssid, params->ssid,
+ params->ssid_len, &req);
+ if (err)
+ cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
+ NULL, 0,
+ WLAN_REASON_DEAUTH_LEAVING,
+ false);
+ return err;
+ case CFG80211_CONN_ASSOC_FAILED:
+ cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
+ NULL, 0,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+ return -ENOTCONN;
+ case CFG80211_CONN_DEAUTH:
+ cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid,
+ NULL, 0,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+ /* free directly, disconnected event already sent */
+ cfg80211_sme_free(wdev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+void cfg80211_conn_work(struct work_struct *work)
+{
+ struct cfg80211_registered_device *rdev =
+ container_of(work, struct cfg80211_registered_device, conn_work);
+ struct wireless_dev *wdev;
+ u8 bssid_buf[ETH_ALEN], *bssid = NULL;
+
+ rtnl_lock();
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (!wdev->netdev)
+ continue;
+
+ wdev_lock(wdev);
+ if (!netif_running(wdev->netdev)) {
+ wdev_unlock(wdev);
+ continue;
+ }
+ if (!wdev->conn ||
+ wdev->conn->state == CFG80211_CONN_CONNECTED) {
+ wdev_unlock(wdev);
+ continue;
+ }
+ if (wdev->conn->params.bssid) {
+ memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
+ bssid = bssid_buf;
+ }
+ if (cfg80211_conn_do_work(wdev)) {
+ __cfg80211_connect_result(
+ wdev->netdev, bssid,
+ NULL, 0, NULL, 0,
+ WLAN_STATUS_UNSPECIFIED_FAILURE,
+ false, NULL);
+ }
+ wdev_unlock(wdev);
+ }
+
+ rtnl_unlock();
+}
+
+/* Returned bss is reference counted and must be cleaned up appropriately. */
+static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_bss *bss;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel,
+ wdev->conn->params.bssid,
+ wdev->conn->params.ssid,
+ wdev->conn->params.ssid_len,
+ IEEE80211_BSS_TYPE_ESS,
+ IEEE80211_PRIVACY(wdev->conn->params.privacy));
+ if (!bss)
+ return NULL;
+
+ memcpy(wdev->conn->bssid, bss->bssid, ETH_ALEN);
+ wdev->conn->params.bssid = wdev->conn->bssid;
+ wdev->conn->params.channel = bss->channel;
+ wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT;
+ schedule_work(&rdev->conn_work);
+
+ return bss;
+}
+
+static void __cfg80211_sme_scan_done(struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_bss *bss;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->conn)
+ return;
+
+ if (wdev->conn->state != CFG80211_CONN_SCANNING &&
+ wdev->conn->state != CFG80211_CONN_SCAN_AGAIN)
+ return;
+
+ bss = cfg80211_get_conn_bss(wdev);
+ if (bss)
+ cfg80211_put_bss(&rdev->wiphy, bss);
+ else
+ schedule_work(&rdev->conn_work);
+}
+
+void cfg80211_sme_scan_done(struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ wdev_lock(wdev);
+ __cfg80211_sme_scan_done(dev);
+ wdev_unlock(wdev);
+}
+
+void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+ u16 status_code = le16_to_cpu(mgmt->u.auth.status_code);
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED)
+ return;
+
+ if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG &&
+ wdev->conn->auto_auth &&
+ wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) {
+ /* select automatically between only open, shared, leap */
+ switch (wdev->conn->params.auth_type) {
+ case NL80211_AUTHTYPE_OPEN_SYSTEM:
+ if (wdev->connect_keys)
+ wdev->conn->params.auth_type =
+ NL80211_AUTHTYPE_SHARED_KEY;
+ else
+ wdev->conn->params.auth_type =
+ NL80211_AUTHTYPE_NETWORK_EAP;
+ break;
+ case NL80211_AUTHTYPE_SHARED_KEY:
+ wdev->conn->params.auth_type =
+ NL80211_AUTHTYPE_NETWORK_EAP;
+ break;
+ default:
+ /* huh? */
+ wdev->conn->params.auth_type =
+ NL80211_AUTHTYPE_OPEN_SYSTEM;
+ break;
+ }
+ wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT;
+ schedule_work(&rdev->conn_work);
+ } else if (status_code != WLAN_STATUS_SUCCESS) {
+ __cfg80211_connect_result(wdev->netdev, mgmt->bssid,
+ NULL, 0, NULL, 0,
+ status_code, false, NULL);
+ } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
+ wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
+ schedule_work(&rdev->conn_work);
+ }
+}
+
+bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return false;
+
+ if (status == WLAN_STATUS_SUCCESS) {
+ wdev->conn->state = CFG80211_CONN_CONNECTED;
+ return false;
+ }
+
+ if (wdev->conn->prev_bssid_valid) {
+ /*
+ * Some stupid APs don't accept reassoc, so we
+ * need to fall back to trying regular assoc;
+ * return true so no event is sent to userspace.
+ */
+ wdev->conn->prev_bssid_valid = false;
+ wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
+ schedule_work(&rdev->conn_work);
+ return true;
+ }
+
+ wdev->conn->state = CFG80211_CONN_ASSOC_FAILED;
+ schedule_work(&rdev->conn_work);
+ return false;
+}
+
+void cfg80211_sme_deauth(struct wireless_dev *wdev)
+{
+ cfg80211_sme_free(wdev);
+}
+
+void cfg80211_sme_auth_timeout(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return;
+
+ wdev->conn->state = CFG80211_CONN_AUTH_FAILED;
+ schedule_work(&rdev->conn_work);
+}
+
+void cfg80211_sme_disassoc(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return;
+
+ wdev->conn->state = CFG80211_CONN_DEAUTH;
+ schedule_work(&rdev->conn_work);
+}
+
+void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (!wdev->conn)
+ return;
+
+ wdev->conn->state = CFG80211_CONN_ASSOC_FAILED;
+ schedule_work(&rdev->conn_work);
+}
+
+static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev,
+ const u8 *ies, size_t ies_len,
+ const u8 **out_ies, size_t *out_ies_len)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u8 *buf;
+ size_t offs;
+
+ if (!rdev->wiphy.extended_capabilities_len ||
+ (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) {
+ *out_ies = kmemdup(ies, ies_len, GFP_KERNEL);
+ if (!*out_ies)
+ return -ENOMEM;
+ *out_ies_len = ies_len;
+ return 0;
+ }
+
+ buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2,
+ GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (ies_len) {
+ static const u8 before_extcapa[] = {
+ /* not listing IEs expected to be created by driver */
+ WLAN_EID_RSN,
+ WLAN_EID_QOS_CAPA,
+ WLAN_EID_RRM_ENABLED_CAPABILITIES,
+ WLAN_EID_MOBILITY_DOMAIN,
+ WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+ WLAN_EID_BSS_COEX_2040,
+ };
+
+ offs = ieee80211_ie_split(ies, ies_len, before_extcapa,
+ ARRAY_SIZE(before_extcapa), 0);
+ memcpy(buf, ies, offs);
+ /* leave a whole for extended capabilities IE */
+ memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2,
+ ies + offs, ies_len - offs);
+ } else {
+ offs = 0;
+ }
+
+ /* place extended capabilities IE (with only driver capabilities) */
+ buf[offs] = WLAN_EID_EXT_CAPABILITY;
+ buf[offs + 1] = rdev->wiphy.extended_capabilities_len;
+ memcpy(buf + offs + 2,
+ rdev->wiphy.extended_capabilities,
+ rdev->wiphy.extended_capabilities_len);
+
+ *out_ies = buf;
+ *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2;
+
+ return 0;
+}
+
+static int cfg80211_sme_connect(struct wireless_dev *wdev,
+ struct cfg80211_connect_params *connect,
+ const u8 *prev_bssid)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_bss *bss;
+ int err;
+
+ if (!rdev->ops->auth || !rdev->ops->assoc)
+ return -EOPNOTSUPP;
+
+ if (wdev->current_bss)
+ return -EALREADY;
+
+ if (WARN_ON(wdev->conn))
+ return -EINPROGRESS;
+
+ wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL);
+ if (!wdev->conn)
+ return -ENOMEM;
+
+ /*
+ * Copy all parameters, and treat explicitly IEs, BSSID, SSID.
+ */
+ memcpy(&wdev->conn->params, connect, sizeof(*connect));
+ if (connect->bssid) {
+ wdev->conn->params.bssid = wdev->conn->bssid;
+ memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN);
+ }
+
+ if (cfg80211_sme_get_conn_ies(wdev, connect->ie, connect->ie_len,
+ &wdev->conn->ie,
+ &wdev->conn->params.ie_len)) {
+ kfree(wdev->conn);
+ wdev->conn = NULL;
+ return -ENOMEM;
+ }
+ wdev->conn->params.ie = wdev->conn->ie;
+
+ if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) {
+ wdev->conn->auto_auth = true;
+ /* start with open system ... should mostly work */
+ wdev->conn->params.auth_type =
+ NL80211_AUTHTYPE_OPEN_SYSTEM;
+ } else {
+ wdev->conn->auto_auth = false;
+ }
+
+ wdev->conn->params.ssid = wdev->ssid;
+ wdev->conn->params.ssid_len = wdev->ssid_len;
+
+ /* see if we have the bss already */
+ bss = cfg80211_get_conn_bss(wdev);
+
+ if (prev_bssid) {
+ memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN);
+ wdev->conn->prev_bssid_valid = true;
+ }
+
+ /* we're good if we have a matching bss struct */
+ if (bss) {
+ err = cfg80211_conn_do_work(wdev);
+ cfg80211_put_bss(wdev->wiphy, bss);
+ } else {
+ /* otherwise we'll need to scan for the AP first */
+ err = cfg80211_conn_scan(wdev);
+
+ /*
+ * If we can't scan right now, then we need to scan again
+ * after the current scan finished, since the parameters
+ * changed (unless we find a good AP anyway).
+ */
+ if (err == -EBUSY) {
+ err = 0;
+ wdev->conn->state = CFG80211_CONN_SCAN_AGAIN;
+ }
+ }
+
+ if (err)
+ cfg80211_sme_free(wdev);
+
+ return err;
+}
+
+static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int err;
+
+ if (!wdev->conn)
+ return 0;
+
+ if (!rdev->ops->deauth)
+ return -EOPNOTSUPP;
+
+ if (wdev->conn->state == CFG80211_CONN_SCANNING ||
+ wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) {
+ err = 0;
+ goto out;
+ }
+
+ /* wdev->conn->params.bssid must be set if > SCANNING */
+ err = cfg80211_mlme_deauth(rdev, wdev->netdev,
+ wdev->conn->params.bssid,
+ NULL, 0, reason, false);
+ out:
+ cfg80211_sme_free(wdev);
+ return err;
+}
+
+/*
+ * code shared for in-device and software SME
+ */
+
+static bool cfg80211_is_all_idle(void)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+ bool is_all_idle = true;
+
+ /*
+ * All devices must be idle as otherwise if you are actively
+ * scanning some new beacon hints could be learned and would
+ * count as new regulatory hints.
+ */
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ wdev_lock(wdev);
+ if (wdev->conn || wdev->current_bss)
+ is_all_idle = false;
+ wdev_unlock(wdev);
+ }
+ }
+
+ return is_all_idle;
+}
+
+static void disconnect_work(struct work_struct *work)
+{
+ rtnl_lock();
+ if (cfg80211_is_all_idle())
+ regulatory_hint_disconnect();
+ rtnl_unlock();
+}
+
+static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
+
+
+/*
+ * API calls for drivers implementing connect/disconnect and
+ * SME event handling
+ */
+
+/* This method must consume bss one way or another */
+void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len,
+ u16 status, bool wextev,
+ struct cfg80211_bss *bss)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *country_ie;
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+#endif
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) {
+ cfg80211_put_bss(wdev->wiphy, bss);
+ return;
+ }
+
+ nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev,
+ bssid, req_ie, req_ie_len,
+ resp_ie, resp_ie_len,
+ status, GFP_KERNEL);
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (wextev) {
+ if (req_ie && status == WLAN_STATUS_SUCCESS) {
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = req_ie_len;
+ wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, req_ie);
+ }
+
+ if (resp_ie && status == WLAN_STATUS_SUCCESS) {
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = resp_ie_len;
+ wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, resp_ie);
+ }
+
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+ if (bssid && status == WLAN_STATUS_SUCCESS) {
+ memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+ memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN);
+ wdev->wext.prev_bssid_valid = true;
+ }
+ wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+ }
+#endif
+
+ if (!bss && (status == WLAN_STATUS_SUCCESS)) {
+ WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
+ bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
+ wdev->ssid, wdev->ssid_len,
+ IEEE80211_BSS_TYPE_ESS,
+ IEEE80211_PRIVACY_ANY);
+ if (bss)
+ cfg80211_hold_bss(bss_from_pub(bss));
+ }
+
+ if (wdev->current_bss) {
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ wdev->current_bss = NULL;
+ }
+
+ if (status != WLAN_STATUS_SUCCESS) {
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = NULL;
+ wdev->ssid_len = 0;
+ if (bss) {
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wdev->wiphy, bss);
+ }
+ cfg80211_sme_free(wdev);
+ return;
+ }
+
+ if (WARN_ON(!bss))
+ return;
+
+ wdev->current_bss = bss_from_pub(bss);
+
+ cfg80211_upload_connect_keys(wdev);
+
+ rcu_read_lock();
+ country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY);
+ if (!country_ie) {
+ rcu_read_unlock();
+ return;
+ }
+
+ country_ie = kmemdup(country_ie, 2 + country_ie[1], GFP_ATOMIC);
+ rcu_read_unlock();
+
+ if (!country_ie)
+ return;
+
+ /*
+ * ieee80211_bss_get_ie() ensures we can access:
+ * - country_ie + 2, the start of the country ie data, and
+ * - and country_ie[1] which is the IE length
+ */
+ regulatory_hint_country_ie(wdev->wiphy, bss->channel->band,
+ country_ie + 2, country_ie[1]);
+ kfree(country_ie);
+}
+
+void cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len,
+ u16 status, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_event *ev;
+ unsigned long flags;
+
+ ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp);
+ if (!ev)
+ return;
+
+ ev->type = EVENT_CONNECT_RESULT;
+ if (bssid)
+ memcpy(ev->cr.bssid, bssid, ETH_ALEN);
+ if (req_ie_len) {
+ ev->cr.req_ie = ((u8 *)ev) + sizeof(*ev);
+ ev->cr.req_ie_len = req_ie_len;
+ memcpy((void *)ev->cr.req_ie, req_ie, req_ie_len);
+ }
+ if (resp_ie_len) {
+ ev->cr.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len;
+ ev->cr.resp_ie_len = resp_ie_len;
+ memcpy((void *)ev->cr.resp_ie, resp_ie, resp_ie_len);
+ }
+ ev->cr.status = status;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ list_add_tail(&ev->list, &wdev->event_list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+ queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_connect_result);
+
+/* Consumes bss object one way or another */
+void __cfg80211_roamed(struct wireless_dev *wdev,
+ struct cfg80211_bss *bss,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len)
+{
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+#endif
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+ goto out;
+
+ if (WARN_ON(!wdev->current_bss))
+ goto out;
+
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ wdev->current_bss = NULL;
+
+ cfg80211_hold_bss(bss_from_pub(bss));
+ wdev->current_bss = bss_from_pub(bss);
+
+ nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy),
+ wdev->netdev, bss->bssid,
+ req_ie, req_ie_len, resp_ie, resp_ie_len,
+ GFP_KERNEL);
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (req_ie) {
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = req_ie_len;
+ wireless_send_event(wdev->netdev, IWEVASSOCREQIE,
+ &wrqu, req_ie);
+ }
+
+ if (resp_ie) {
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = resp_ie_len;
+ wireless_send_event(wdev->netdev, IWEVASSOCRESPIE,
+ &wrqu, resp_ie);
+ }
+
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+ memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN);
+ memcpy(wdev->wext.prev_bssid, bss->bssid, ETH_ALEN);
+ wdev->wext.prev_bssid_valid = true;
+ wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL);
+#endif
+
+ return;
+out:
+ cfg80211_put_bss(wdev->wiphy, bss);
+}
+
+void cfg80211_roamed(struct net_device *dev,
+ struct ieee80211_channel *channel,
+ const u8 *bssid,
+ const u8 *req_ie, size_t req_ie_len,
+ const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_bss *bss;
+
+ bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
+ wdev->ssid_len,
+ IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+ if (WARN_ON(!bss))
+ return;
+
+ cfg80211_roamed_bss(dev, bss, req_ie, req_ie_len, resp_ie,
+ resp_ie_len, gfp);
+}
+EXPORT_SYMBOL(cfg80211_roamed);
+
+/* Consumes bss object one way or another */
+void cfg80211_roamed_bss(struct net_device *dev,
+ struct cfg80211_bss *bss, const u8 *req_ie,
+ size_t req_ie_len, const u8 *resp_ie,
+ size_t resp_ie_len, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_event *ev;
+ unsigned long flags;
+
+ if (WARN_ON(!bss))
+ return;
+
+ ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp);
+ if (!ev) {
+ cfg80211_put_bss(wdev->wiphy, bss);
+ return;
+ }
+
+ ev->type = EVENT_ROAMED;
+ ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev);
+ ev->rm.req_ie_len = req_ie_len;
+ memcpy((void *)ev->rm.req_ie, req_ie, req_ie_len);
+ ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len;
+ ev->rm.resp_ie_len = resp_ie_len;
+ memcpy((void *)ev->rm.resp_ie, resp_ie, resp_ie_len);
+ ev->rm.bss = bss;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ list_add_tail(&ev->list, &wdev->event_list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+ queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_roamed_bss);
+
+void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
+ size_t ie_len, u16 reason, bool from_ap)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int i;
+#ifdef CONFIG_CFG80211_WEXT
+ union iwreq_data wrqu;
+#endif
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+ return;
+
+ if (wdev->current_bss) {
+ cfg80211_unhold_bss(wdev->current_bss);
+ cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub);
+ }
+
+ wdev->current_bss = NULL;
+ wdev->ssid_len = 0;
+
+ nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
+
+ /*
+ * Delete all the keys ... pairwise keys can't really
+ * exist any more anyway, but default keys might.
+ */
+ if (rdev->ops->del_key)
+ for (i = 0; i < 6; i++)
+ rdev_del_key(rdev, dev, i, false, NULL);
+
+ rdev_set_qos_map(rdev, dev, NULL);
+
+#ifdef CONFIG_CFG80211_WEXT
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+ wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+ wdev->wext.connect.ssid_len = 0;
+#endif
+
+ schedule_work(&cfg80211_disconnect_work);
+}
+
+void cfg80211_disconnected(struct net_device *dev, u16 reason,
+ const u8 *ie, size_t ie_len, gfp_t gfp)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_event *ev;
+ unsigned long flags;
+
+ ev = kzalloc(sizeof(*ev) + ie_len, gfp);
+ if (!ev)
+ return;
+
+ ev->type = EVENT_DISCONNECTED;
+ ev->dc.ie = ((u8 *)ev) + sizeof(*ev);
+ ev->dc.ie_len = ie_len;
+ memcpy((void *)ev->dc.ie, ie, ie_len);
+ ev->dc.reason = reason;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ list_add_tail(&ev->list, &wdev->event_list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+ queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_disconnected);
+
+/*
+ * API calls for nl80211/wext compatibility code
+ */
+int cfg80211_connect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_connect_params *connect,
+ struct cfg80211_cached_keys *connkeys,
+ const u8 *prev_bssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (WARN_ON(wdev->connect_keys)) {
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = NULL;
+ }
+
+ cfg80211_oper_and_ht_capa(&connect->ht_capa_mask,
+ rdev->wiphy.ht_capa_mod_mask);
+
+ if (connkeys && connkeys->def >= 0) {
+ int idx;
+ u32 cipher;
+
+ idx = connkeys->def;
+ cipher = connkeys->params[idx].cipher;
+ /* If given a WEP key we may need it for shared key auth */
+ if (cipher == WLAN_CIPHER_SUITE_WEP40 ||
+ cipher == WLAN_CIPHER_SUITE_WEP104) {
+ connect->key_idx = idx;
+ connect->key = connkeys->params[idx].key;
+ connect->key_len = connkeys->params[idx].key_len;
+
+ /*
+ * If ciphers are not set (e.g. when going through
+ * iwconfig), we have to set them appropriately here.
+ */
+ if (connect->crypto.cipher_group == 0)
+ connect->crypto.cipher_group = cipher;
+
+ if (connect->crypto.n_ciphers_pairwise == 0) {
+ connect->crypto.n_ciphers_pairwise = 1;
+ connect->crypto.ciphers_pairwise[0] = cipher;
+ }
+ }
+ }
+
+ wdev->connect_keys = connkeys;
+ memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
+ wdev->ssid_len = connect->ssid_len;
+
+ if (!rdev->ops->connect)
+ err = cfg80211_sme_connect(wdev, connect, prev_bssid);
+ else
+ err = rdev_connect(rdev, dev, connect);
+
+ if (err) {
+ wdev->connect_keys = NULL;
+ wdev->ssid_len = 0;
+ return err;
+ }
+
+ return 0;
+}
+
+int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, u16 reason, bool wextev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err = 0;
+
+ ASSERT_WDEV_LOCK(wdev);
+
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = NULL;
+
+ if (wdev->conn)
+ err = cfg80211_sme_disconnect(wdev, reason);
+ else if (!rdev->ops->disconnect)
+ cfg80211_mlme_down(rdev, dev);
+ else if (wdev->current_bss)
+ err = rdev_disconnect(rdev, dev, reason);
+
+ return err;
+}
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
new file mode 100644
index 000000000..9ee6bc1a7
--- /dev/null
+++ b/net/wireless/sysfs.c
@@ -0,0 +1,170 @@
+/*
+ * This file provides /sys/class/ieee80211/<wiphy name>/
+ * and some default attributes.
+ *
+ * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <net/cfg80211.h>
+#include "sysfs.h"
+#include "core.h"
+#include "rdev-ops.h"
+
+static inline struct cfg80211_registered_device *dev_to_rdev(
+ struct device *dev)
+{
+ return container_of(dev, struct cfg80211_registered_device, wiphy.dev);
+}
+
+#define SHOW_FMT(name, fmt, member) \
+static ssize_t name ## _show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
+} \
+static DEVICE_ATTR_RO(name)
+
+SHOW_FMT(index, "%d", wiphy_idx);
+SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
+SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf) {
+ struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
+ return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+}
+static DEVICE_ATTR_RO(name);
+
+static ssize_t addresses_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
+ char *start = buf;
+ int i;
+
+ if (!wiphy->addresses)
+ return sprintf(buf, "%pM\n", wiphy->perm_addr);
+
+ for (i = 0; i < wiphy->n_addresses; i++)
+ buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr);
+
+ return buf - start;
+}
+static DEVICE_ATTR_RO(addresses);
+
+static struct attribute *ieee80211_attrs[] = {
+ &dev_attr_index.attr,
+ &dev_attr_macaddress.attr,
+ &dev_attr_address_mask.attr,
+ &dev_attr_addresses.attr,
+ &dev_attr_name.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(ieee80211);
+
+static void wiphy_dev_release(struct device *dev)
+{
+ struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
+
+ cfg80211_dev_free(rdev);
+}
+
+static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ /* TODO, we probably need stuff here */
+ return 0;
+}
+
+#ifdef CONFIG_PM
+static void cfg80211_leave_all(struct cfg80211_registered_device *rdev)
+{
+ struct wireless_dev *wdev;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list)
+ cfg80211_leave(rdev, wdev);
+}
+
+static int wiphy_suspend(struct device *dev, pm_message_t state)
+{
+ struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ rdev->suspend_at = get_seconds();
+
+ rtnl_lock();
+ if (rdev->wiphy.registered) {
+ if (!rdev->wiphy.wowlan_config)
+ cfg80211_leave_all(rdev);
+ if (rdev->ops->suspend)
+ ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config);
+ if (ret == 1) {
+ /* Driver refuse to configure wowlan */
+ cfg80211_leave_all(rdev);
+ ret = rdev_suspend(rdev, NULL);
+ }
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+
+static int wiphy_resume(struct device *dev)
+{
+ struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
+ int ret = 0;
+
+ /* Age scan results with time spent in suspend */
+ cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+
+ if (rdev->ops->resume) {
+ rtnl_lock();
+ if (rdev->wiphy.registered)
+ ret = rdev_resume(rdev);
+ rtnl_unlock();
+ }
+
+ return ret;
+}
+#endif
+
+static const void *wiphy_namespace(struct device *d)
+{
+ struct wiphy *wiphy = container_of(d, struct wiphy, dev);
+
+ return wiphy_net(wiphy);
+}
+
+struct class ieee80211_class = {
+ .name = "ieee80211",
+ .owner = THIS_MODULE,
+ .dev_release = wiphy_dev_release,
+ .dev_groups = ieee80211_groups,
+ .dev_uevent = wiphy_uevent,
+#ifdef CONFIG_PM
+ .suspend = wiphy_suspend,
+ .resume = wiphy_resume,
+#endif
+ .ns_type = &net_ns_type_operations,
+ .namespace = wiphy_namespace,
+};
+
+int wiphy_sysfs_init(void)
+{
+ return class_register(&ieee80211_class);
+}
+
+void wiphy_sysfs_exit(void)
+{
+ class_unregister(&ieee80211_class);
+}
diff --git a/net/wireless/sysfs.h b/net/wireless/sysfs.h
new file mode 100644
index 000000000..b533ed71d
--- /dev/null
+++ b/net/wireless/sysfs.h
@@ -0,0 +1,9 @@
+#ifndef __WIRELESS_SYSFS_H
+#define __WIRELESS_SYSFS_H
+
+int wiphy_sysfs_init(void);
+void wiphy_sysfs_exit(void);
+
+extern struct class ieee80211_class;
+
+#endif /* __WIRELESS_SYSFS_H */
diff --git a/net/wireless/trace.c b/net/wireless/trace.c
new file mode 100644
index 000000000..95f997fad
--- /dev/null
+++ b/net/wireless/trace.c
@@ -0,0 +1,7 @@
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
new file mode 100644
index 000000000..af3617c98
--- /dev/null
+++ b/net/wireless/trace.h
@@ -0,0 +1,2824 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cfg80211
+
+#if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __RDEV_OPS_TRACE
+
+#include <linux/tracepoint.h>
+
+#include <linux/rtnetlink.h>
+#include <linux/etherdevice.h>
+#include <net/cfg80211.h>
+#include "core.h"
+
+#define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN)
+#define MAC_ASSIGN(entry_mac, given_mac) do { \
+ if (given_mac) \
+ memcpy(__entry->entry_mac, given_mac, ETH_ALEN); \
+ else \
+ eth_zero_addr(__entry->entry_mac); \
+ } while (0)
+#define MAC_PR_FMT "%pM"
+#define MAC_PR_ARG(entry_mac) (__entry->entry_mac)
+
+#define MAXNAME 32
+#define WIPHY_ENTRY __array(char, wiphy_name, 32)
+#define WIPHY_ASSIGN strlcpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME)
+#define WIPHY_PR_FMT "%s"
+#define WIPHY_PR_ARG __entry->wiphy_name
+
+#define WDEV_ENTRY __field(u32, id)
+#define WDEV_ASSIGN (__entry->id) = (!IS_ERR_OR_NULL(wdev) \
+ ? wdev->identifier : 0)
+#define WDEV_PR_FMT "wdev(%u)"
+#define WDEV_PR_ARG (__entry->id)
+
+#define NETDEV_ENTRY __array(char, name, IFNAMSIZ) \
+ __field(int, ifindex)
+#define NETDEV_ASSIGN \
+ do { \
+ memcpy(__entry->name, netdev->name, IFNAMSIZ); \
+ (__entry->ifindex) = (netdev->ifindex); \
+ } while (0)
+#define NETDEV_PR_FMT "netdev:%s(%d)"
+#define NETDEV_PR_ARG __entry->name, __entry->ifindex
+
+#define MESH_CFG_ENTRY __field(u16, dot11MeshRetryTimeout) \
+ __field(u16, dot11MeshConfirmTimeout) \
+ __field(u16, dot11MeshHoldingTimeout) \
+ __field(u16, dot11MeshMaxPeerLinks) \
+ __field(u8, dot11MeshMaxRetries) \
+ __field(u8, dot11MeshTTL) \
+ __field(u8, element_ttl) \
+ __field(bool, auto_open_plinks) \
+ __field(u32, dot11MeshNbrOffsetMaxNeighbor) \
+ __field(u8, dot11MeshHWMPmaxPREQretries) \
+ __field(u32, path_refresh_time) \
+ __field(u32, dot11MeshHWMPactivePathTimeout) \
+ __field(u16, min_discovery_timeout) \
+ __field(u16, dot11MeshHWMPpreqMinInterval) \
+ __field(u16, dot11MeshHWMPperrMinInterval) \
+ __field(u16, dot11MeshHWMPnetDiameterTraversalTime) \
+ __field(u8, dot11MeshHWMPRootMode) \
+ __field(u16, dot11MeshHWMPRannInterval) \
+ __field(bool, dot11MeshGateAnnouncementProtocol) \
+ __field(bool, dot11MeshForwarding) \
+ __field(s32, rssi_threshold) \
+ __field(u16, ht_opmode) \
+ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \
+ __field(u16, dot11MeshHWMProotInterval) \
+ __field(u16, dot11MeshHWMPconfirmationInterval)
+#define MESH_CFG_ASSIGN \
+ do { \
+ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \
+ __entry->dot11MeshConfirmTimeout = \
+ conf->dot11MeshConfirmTimeout; \
+ __entry->dot11MeshHoldingTimeout = \
+ conf->dot11MeshHoldingTimeout; \
+ __entry->dot11MeshMaxPeerLinks = conf->dot11MeshMaxPeerLinks; \
+ __entry->dot11MeshMaxRetries = conf->dot11MeshMaxRetries; \
+ __entry->dot11MeshTTL = conf->dot11MeshTTL; \
+ __entry->element_ttl = conf->element_ttl; \
+ __entry->auto_open_plinks = conf->auto_open_plinks; \
+ __entry->dot11MeshNbrOffsetMaxNeighbor = \
+ conf->dot11MeshNbrOffsetMaxNeighbor; \
+ __entry->dot11MeshHWMPmaxPREQretries = \
+ conf->dot11MeshHWMPmaxPREQretries; \
+ __entry->path_refresh_time = conf->path_refresh_time; \
+ __entry->dot11MeshHWMPactivePathTimeout = \
+ conf->dot11MeshHWMPactivePathTimeout; \
+ __entry->min_discovery_timeout = conf->min_discovery_timeout; \
+ __entry->dot11MeshHWMPpreqMinInterval = \
+ conf->dot11MeshHWMPpreqMinInterval; \
+ __entry->dot11MeshHWMPperrMinInterval = \
+ conf->dot11MeshHWMPperrMinInterval; \
+ __entry->dot11MeshHWMPnetDiameterTraversalTime = \
+ conf->dot11MeshHWMPnetDiameterTraversalTime; \
+ __entry->dot11MeshHWMPRootMode = conf->dot11MeshHWMPRootMode; \
+ __entry->dot11MeshHWMPRannInterval = \
+ conf->dot11MeshHWMPRannInterval; \
+ __entry->dot11MeshGateAnnouncementProtocol = \
+ conf->dot11MeshGateAnnouncementProtocol; \
+ __entry->dot11MeshForwarding = conf->dot11MeshForwarding; \
+ __entry->rssi_threshold = conf->rssi_threshold; \
+ __entry->ht_opmode = conf->ht_opmode; \
+ __entry->dot11MeshHWMPactivePathToRootTimeout = \
+ conf->dot11MeshHWMPactivePathToRootTimeout; \
+ __entry->dot11MeshHWMProotInterval = \
+ conf->dot11MeshHWMProotInterval; \
+ __entry->dot11MeshHWMPconfirmationInterval = \
+ conf->dot11MeshHWMPconfirmationInterval; \
+ } while (0)
+
+#define CHAN_ENTRY __field(enum ieee80211_band, band) \
+ __field(u16, center_freq)
+#define CHAN_ASSIGN(chan) \
+ do { \
+ if (chan) { \
+ __entry->band = chan->band; \
+ __entry->center_freq = chan->center_freq; \
+ } else { \
+ __entry->band = 0; \
+ __entry->center_freq = 0; \
+ } \
+ } while (0)
+#define CHAN_PR_FMT "band: %d, freq: %u"
+#define CHAN_PR_ARG __entry->band, __entry->center_freq
+
+#define CHAN_DEF_ENTRY __field(enum ieee80211_band, band) \
+ __field(u32, control_freq) \
+ __field(u32, width) \
+ __field(u32, center_freq1) \
+ __field(u32, center_freq2)
+#define CHAN_DEF_ASSIGN(chandef) \
+ do { \
+ if ((chandef) && (chandef)->chan) { \
+ __entry->band = (chandef)->chan->band; \
+ __entry->control_freq = \
+ (chandef)->chan->center_freq; \
+ __entry->width = (chandef)->width; \
+ __entry->center_freq1 = (chandef)->center_freq1;\
+ __entry->center_freq2 = (chandef)->center_freq2;\
+ } else { \
+ __entry->band = 0; \
+ __entry->control_freq = 0; \
+ __entry->width = 0; \
+ __entry->center_freq1 = 0; \
+ __entry->center_freq2 = 0; \
+ } \
+ } while (0)
+#define CHAN_DEF_PR_FMT \
+ "band: %d, control freq: %u, width: %d, cf1: %u, cf2: %u"
+#define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \
+ __entry->width, __entry->center_freq1, \
+ __entry->center_freq2
+
+#define SINFO_ENTRY __field(int, generation) \
+ __field(u32, connected_time) \
+ __field(u32, inactive_time) \
+ __field(u32, rx_bytes) \
+ __field(u32, tx_bytes) \
+ __field(u32, rx_packets) \
+ __field(u32, tx_packets) \
+ __field(u32, tx_retries) \
+ __field(u32, tx_failed) \
+ __field(u32, rx_dropped_misc) \
+ __field(u32, beacon_loss_count) \
+ __field(u16, llid) \
+ __field(u16, plid) \
+ __field(u8, plink_state)
+#define SINFO_ASSIGN \
+ do { \
+ __entry->generation = sinfo->generation; \
+ __entry->connected_time = sinfo->connected_time; \
+ __entry->inactive_time = sinfo->inactive_time; \
+ __entry->rx_bytes = sinfo->rx_bytes; \
+ __entry->tx_bytes = sinfo->tx_bytes; \
+ __entry->rx_packets = sinfo->rx_packets; \
+ __entry->tx_packets = sinfo->tx_packets; \
+ __entry->tx_retries = sinfo->tx_retries; \
+ __entry->tx_failed = sinfo->tx_failed; \
+ __entry->rx_dropped_misc = sinfo->rx_dropped_misc; \
+ __entry->beacon_loss_count = sinfo->beacon_loss_count; \
+ __entry->llid = sinfo->llid; \
+ __entry->plid = sinfo->plid; \
+ __entry->plink_state = sinfo->plink_state; \
+ } while (0)
+
+#define BOOL_TO_STR(bo) (bo) ? "true" : "false"
+
+#define QOS_MAP_ENTRY __field(u8, num_des) \
+ __array(u8, dscp_exception, \
+ 2 * IEEE80211_QOS_MAP_MAX_EX) \
+ __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN)
+#define QOS_MAP_ASSIGN(qos_map) \
+ do { \
+ if ((qos_map)) { \
+ __entry->num_des = (qos_map)->num_des; \
+ memcpy(__entry->dscp_exception, \
+ &(qos_map)->dscp_exception, \
+ 2 * IEEE80211_QOS_MAP_MAX_EX); \
+ memcpy(__entry->up, &(qos_map)->up, \
+ IEEE80211_QOS_MAP_LEN_MIN); \
+ } else { \
+ __entry->num_des = 0; \
+ memset(__entry->dscp_exception, 0, \
+ 2 * IEEE80211_QOS_MAP_MAX_EX); \
+ memset(__entry->up, 0, \
+ IEEE80211_QOS_MAP_LEN_MIN); \
+ } \
+ } while (0)
+
+/*************************************************************
+ * rdev->ops traces *
+ *************************************************************/
+
+TRACE_EVENT(rdev_suspend,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_wowlan *wow),
+ TP_ARGS(wiphy, wow),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(bool, any)
+ __field(bool, disconnect)
+ __field(bool, magic_pkt)
+ __field(bool, gtk_rekey_failure)
+ __field(bool, eap_identity_req)
+ __field(bool, four_way_handshake)
+ __field(bool, rfkill_release)
+ __field(bool, valid_wow)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ if (wow) {
+ __entry->any = wow->any;
+ __entry->disconnect = wow->disconnect;
+ __entry->magic_pkt = wow->magic_pkt;
+ __entry->gtk_rekey_failure = wow->gtk_rekey_failure;
+ __entry->eap_identity_req = wow->eap_identity_req;
+ __entry->four_way_handshake = wow->four_way_handshake;
+ __entry->rfkill_release = wow->rfkill_release;
+ __entry->valid_wow = true;
+ } else {
+ __entry->valid_wow = false;
+ }
+ ),
+ TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, "
+ "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, "
+ "four way handshake: %d, rfkill release: %d.",
+ WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)",
+ __entry->any, __entry->disconnect, __entry->magic_pkt,
+ __entry->gtk_rekey_failure, __entry->eap_identity_req,
+ __entry->four_way_handshake, __entry->rfkill_release)
+);
+
+TRACE_EVENT(rdev_return_int,
+ TP_PROTO(struct wiphy *wiphy, int ret),
+ TP_ARGS(wiphy, ret),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(rdev_scan,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request),
+ TP_ARGS(wiphy, request),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(wiphy_only_evt,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
+);
+
+DEFINE_EVENT(wiphy_only_evt, rdev_resume,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+DEFINE_EVENT(wiphy_only_evt, rdev_return_void,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+DECLARE_EVENT_CLASS(wiphy_enabled_evt,
+ TP_PROTO(struct wiphy *wiphy, bool enabled),
+ TP_ARGS(wiphy, enabled),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(bool, enabled)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(WIPHY_PR_FMT ", %senabled ",
+ WIPHY_PR_ARG, __entry->enabled ? "" : "not ")
+);
+
+DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup,
+ TP_PROTO(struct wiphy *wiphy, bool enabled),
+ TP_ARGS(wiphy, enabled)
+);
+
+TRACE_EVENT(rdev_add_virtual_intf,
+ TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type),
+ TP_ARGS(wiphy, name, type),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __string(vir_intf_name, name ? name : "<noname>")
+ __field(enum nl80211_iftype, type)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __assign_str(vir_intf_name, name ? name : "<noname>");
+ __entry->type = type;
+ ),
+ TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d",
+ WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type)
+);
+
+DECLARE_EVENT_CLASS(wiphy_wdev_evt,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_change_virtual_intf,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ enum nl80211_iftype type),
+ TP_ARGS(wiphy, netdev, type),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(enum nl80211_iftype, type)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->type = type;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type)
+);
+
+DECLARE_EVENT_CLASS(key_handle,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr),
+ TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(mac_addr)
+ __field(u8, key_index)
+ __field(bool, pairwise)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(mac_addr, mac_addr);
+ __entry->key_index = key_index;
+ __entry->pairwise = pairwise;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index,
+ BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr))
+);
+
+DEFINE_EVENT(key_handle, rdev_add_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr),
+ TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr)
+);
+
+DEFINE_EVENT(key_handle, rdev_get_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr),
+ TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr)
+);
+
+DEFINE_EVENT(key_handle, rdev_del_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
+ bool pairwise, const u8 *mac_addr),
+ TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr)
+);
+
+TRACE_EVENT(rdev_set_default_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index,
+ bool unicast, bool multicast),
+ TP_ARGS(wiphy, netdev, key_index, unicast, multicast),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u8, key_index)
+ __field(bool, unicast)
+ __field(bool, multicast)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->key_index = key_index;
+ __entry->unicast = unicast;
+ __entry->multicast = multicast;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u, unicast: %s, multicast: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index,
+ BOOL_TO_STR(__entry->unicast),
+ BOOL_TO_STR(__entry->multicast))
+);
+
+TRACE_EVENT(rdev_set_default_mgmt_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index),
+ TP_ARGS(wiphy, netdev, key_index),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u8, key_index)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->key_index = key_index;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index)
+);
+
+TRACE_EVENT(rdev_start_ap,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_ap_settings *settings),
+ TP_ARGS(wiphy, netdev, settings),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ __field(int, beacon_interval)
+ __field(int, dtim_period)
+ __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
+ __field(enum nl80211_hidden_ssid, hidden_ssid)
+ __field(u32, wpa_ver)
+ __field(bool, privacy)
+ __field(enum nl80211_auth_type, auth_type)
+ __field(int, inactivity_timeout)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(&settings->chandef);
+ __entry->beacon_interval = settings->beacon_interval;
+ __entry->dtim_period = settings->dtim_period;
+ __entry->hidden_ssid = settings->hidden_ssid;
+ __entry->wpa_ver = settings->crypto.wpa_versions;
+ __entry->privacy = settings->privacy;
+ __entry->auth_type = settings->auth_type;
+ __entry->inactivity_timeout = settings->inactivity_timeout;
+ memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
+ memcpy(__entry->ssid, settings->ssid, settings->ssid_len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, "
+ CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, "
+ "hidden ssid: %d, wpa versions: %u, privacy: %s, "
+ "auth type: %d, inactivity timeout: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG,
+ __entry->beacon_interval, __entry->dtim_period,
+ __entry->hidden_ssid, __entry->wpa_ver,
+ BOOL_TO_STR(__entry->privacy), __entry->auth_type,
+ __entry->inactivity_timeout)
+);
+
+TRACE_EVENT(rdev_change_beacon,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_beacon_data *info),
+ TP_ARGS(wiphy, netdev, info),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __dynamic_array(u8, head, info ? info->head_len : 0)
+ __dynamic_array(u8, tail, info ? info->tail_len : 0)
+ __dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0)
+ __dynamic_array(u8, proberesp_ies,
+ info ? info->proberesp_ies_len : 0)
+ __dynamic_array(u8, assocresp_ies,
+ info ? info->assocresp_ies_len : 0)
+ __dynamic_array(u8, probe_resp, info ? info->probe_resp_len : 0)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ if (info) {
+ if (info->head)
+ memcpy(__get_dynamic_array(head), info->head,
+ info->head_len);
+ if (info->tail)
+ memcpy(__get_dynamic_array(tail), info->tail,
+ info->tail_len);
+ if (info->beacon_ies)
+ memcpy(__get_dynamic_array(beacon_ies),
+ info->beacon_ies, info->beacon_ies_len);
+ if (info->proberesp_ies)
+ memcpy(__get_dynamic_array(proberesp_ies),
+ info->proberesp_ies,
+ info->proberesp_ies_len);
+ if (info->assocresp_ies)
+ memcpy(__get_dynamic_array(assocresp_ies),
+ info->assocresp_ies,
+ info->assocresp_ies_len);
+ if (info->probe_resp)
+ memcpy(__get_dynamic_array(probe_resp),
+ info->probe_resp, info->probe_resp_len);
+ }
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(wiphy_netdev_evt,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_sched_scan_stop,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
+DECLARE_EVENT_CLASS(station_add_change,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
+ struct station_parameters *params),
+ TP_ARGS(wiphy, netdev, mac, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(sta_mac)
+ __field(u32, sta_flags_mask)
+ __field(u32, sta_flags_set)
+ __field(u32, sta_modify_mask)
+ __field(int, listen_interval)
+ __field(u16, aid)
+ __field(u8, plink_action)
+ __field(u8, plink_state)
+ __field(u8, uapsd_queues)
+ __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
+ __array(char, vlan, IFNAMSIZ)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(sta_mac, mac);
+ __entry->sta_flags_mask = params->sta_flags_mask;
+ __entry->sta_flags_set = params->sta_flags_set;
+ __entry->sta_modify_mask = params->sta_modify_mask;
+ __entry->listen_interval = params->listen_interval;
+ __entry->aid = params->aid;
+ __entry->plink_action = params->plink_action;
+ __entry->plink_state = params->plink_state;
+ __entry->uapsd_queues = params->uapsd_queues;
+ memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
+ if (params->ht_capa)
+ memcpy(__entry->ht_capa, params->ht_capa,
+ sizeof(struct ieee80211_ht_cap));
+ memset(__entry->vlan, 0, sizeof(__entry->vlan));
+ if (params->vlan)
+ memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
+ ", station flags mask: %u, station flags set: %u, "
+ "station modify mask: %u, listen interval: %d, aid: %u, "
+ "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
+ __entry->sta_flags_mask, __entry->sta_flags_set,
+ __entry->sta_modify_mask, __entry->listen_interval,
+ __entry->aid, __entry->plink_action, __entry->plink_state,
+ __entry->uapsd_queues, __entry->vlan)
+);
+
+DEFINE_EVENT(station_add_change, rdev_add_station,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
+ struct station_parameters *params),
+ TP_ARGS(wiphy, netdev, mac, params)
+);
+
+DEFINE_EVENT(station_add_change, rdev_change_station,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
+ struct station_parameters *params),
+ TP_ARGS(wiphy, netdev, mac, params)
+);
+
+DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
+ TP_ARGS(wiphy, netdev, mac),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(sta_mac)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(sta_mac, mac);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac))
+);
+
+DECLARE_EVENT_CLASS(station_del,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct station_del_parameters *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(sta_mac)
+ __field(u8, subtype)
+ __field(u16, reason_code)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(sta_mac, params->mac);
+ __entry->subtype = params->subtype;
+ __entry->reason_code = params->reason_code;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
+ ", subtype: %u, reason_code: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
+ __entry->subtype, __entry->reason_code)
+);
+
+DEFINE_EVENT(station_del, rdev_del_station,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct station_del_parameters *params),
+ TP_ARGS(wiphy, netdev, params)
+);
+
+DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
+ TP_ARGS(wiphy, netdev, mac)
+);
+
+DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
+ TP_ARGS(wiphy, netdev, mac)
+);
+
+DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_set_wds_peer,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
+ TP_ARGS(wiphy, netdev, mac)
+);
+
+TRACE_EVENT(rdev_dump_station,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ u8 *mac),
+ TP_ARGS(wiphy, netdev, idx, mac),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(sta_mac)
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(sta_mac, mac);
+ __entry->idx = idx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", idx: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac),
+ __entry->idx)
+);
+
+TRACE_EVENT(rdev_return_int_station_info,
+ TP_PROTO(struct wiphy *wiphy, int ret, struct station_info *sinfo),
+ TP_ARGS(wiphy, ret, sinfo),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ SINFO_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->ret = ret;
+ SINFO_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned %d" ,
+ WIPHY_PR_ARG, __entry->ret)
+);
+
+DECLARE_EVENT_CLASS(mpath_evt,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
+ u8 *next_hop),
+ TP_ARGS(wiphy, netdev, dst, next_hop),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(dst)
+ MAC_ENTRY(next_hop)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(dst, dst);
+ MAC_ASSIGN(next_hop, next_hop);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT ", next hop: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dst),
+ MAC_PR_ARG(next_hop))
+);
+
+DEFINE_EVENT(mpath_evt, rdev_add_mpath,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
+ u8 *next_hop),
+ TP_ARGS(wiphy, netdev, dst, next_hop)
+);
+
+DEFINE_EVENT(mpath_evt, rdev_change_mpath,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
+ u8 *next_hop),
+ TP_ARGS(wiphy, netdev, dst, next_hop)
+);
+
+DEFINE_EVENT(mpath_evt, rdev_get_mpath,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
+ u8 *next_hop),
+ TP_ARGS(wiphy, netdev, dst, next_hop)
+);
+
+TRACE_EVENT(rdev_dump_mpath,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ u8 *dst, u8 *next_hop),
+ TP_ARGS(wiphy, netdev, idx, dst, next_hop),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(dst)
+ MAC_ENTRY(next_hop)
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(dst, dst);
+ MAC_ASSIGN(next_hop, next_hop);
+ __entry->idx = idx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
+ MAC_PR_FMT ", next hop: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst),
+ MAC_PR_ARG(next_hop))
+);
+
+TRACE_EVENT(rdev_get_mpp,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u8 *dst, u8 *mpp),
+ TP_ARGS(wiphy, netdev, dst, mpp),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(dst)
+ MAC_ENTRY(mpp)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(dst, dst);
+ MAC_ASSIGN(mpp, mpp);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT
+ ", mpp: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG,
+ MAC_PR_ARG(dst), MAC_PR_ARG(mpp))
+);
+
+TRACE_EVENT(rdev_dump_mpp,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx,
+ u8 *dst, u8 *mpp),
+ TP_ARGS(wiphy, netdev, idx, mpp, dst),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(dst)
+ MAC_ENTRY(mpp)
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(dst, dst);
+ MAC_ASSIGN(mpp, mpp);
+ __entry->idx = idx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: "
+ MAC_PR_FMT ", mpp: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst),
+ MAC_PR_ARG(mpp))
+);
+
+TRACE_EVENT(rdev_return_int_mpath_info,
+ TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo),
+ TP_ARGS(wiphy, ret, pinfo),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ __field(int, generation)
+ __field(u32, filled)
+ __field(u32, frame_qlen)
+ __field(u32, sn)
+ __field(u32, metric)
+ __field(u32, exptime)
+ __field(u32, discovery_timeout)
+ __field(u8, discovery_retries)
+ __field(u8, flags)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->ret = ret;
+ __entry->generation = pinfo->generation;
+ __entry->filled = pinfo->filled;
+ __entry->frame_qlen = pinfo->frame_qlen;
+ __entry->sn = pinfo->sn;
+ __entry->metric = pinfo->metric;
+ __entry->exptime = pinfo->exptime;
+ __entry->discovery_timeout = pinfo->discovery_timeout;
+ __entry->discovery_retries = pinfo->discovery_retries;
+ __entry->flags = pinfo->flags;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned %d. mpath info - generation: %d, "
+ "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u,"
+ " discovery timeout: %u, discovery retries: %u, flags: %u",
+ WIPHY_PR_ARG, __entry->ret, __entry->generation,
+ __entry->filled, __entry->frame_qlen, __entry->sn,
+ __entry->metric, __entry->exptime, __entry->discovery_timeout,
+ __entry->discovery_retries, __entry->flags)
+);
+
+TRACE_EVENT(rdev_return_int_mesh_config,
+ TP_PROTO(struct wiphy *wiphy, int ret, struct mesh_config *conf),
+ TP_ARGS(wiphy, ret, conf),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ MESH_CFG_ENTRY
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ MESH_CFG_ASSIGN;
+ __entry->ret = ret;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned: %d",
+ WIPHY_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(rdev_update_mesh_config,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 mask,
+ const struct mesh_config *conf),
+ TP_ARGS(wiphy, netdev, mask, conf),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MESH_CFG_ENTRY
+ __field(u32, mask)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MESH_CFG_ASSIGN;
+ __entry->mask = mask;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mask: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mask)
+);
+
+TRACE_EVENT(rdev_join_mesh,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const struct mesh_config *conf,
+ const struct mesh_setup *setup),
+ TP_ARGS(wiphy, netdev, conf, setup),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MESH_CFG_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MESH_CFG_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG)
+);
+
+TRACE_EVENT(rdev_change_bss,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct bss_parameters *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(int, use_cts_prot)
+ __field(int, use_short_preamble)
+ __field(int, use_short_slot_time)
+ __field(int, ap_isolate)
+ __field(int, ht_opmode)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->use_cts_prot = params->use_cts_prot;
+ __entry->use_short_preamble = params->use_short_preamble;
+ __entry->use_short_slot_time = params->use_short_slot_time;
+ __entry->ap_isolate = params->ap_isolate;
+ __entry->ht_opmode = params->ht_opmode;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", use cts prot: %d, "
+ "use short preamble: %d, use short slot time: %d, "
+ "ap isolate: %d, ht opmode: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->use_cts_prot,
+ __entry->use_short_preamble, __entry->use_short_slot_time,
+ __entry->ap_isolate, __entry->ht_opmode)
+);
+
+TRACE_EVENT(rdev_set_txq_params,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct ieee80211_txq_params *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(enum nl80211_ac, ac)
+ __field(u16, txop)
+ __field(u16, cwmin)
+ __field(u16, cwmax)
+ __field(u8, aifs)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->ac = params->ac;
+ __entry->txop = params->txop;
+ __entry->cwmin = params->cwmin;
+ __entry->cwmax = params->cwmax;
+ __entry->aifs = params->aifs;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", ac: %d, txop: %u, cwmin: %u, cwmax: %u, aifs: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ac, __entry->txop,
+ __entry->cwmin, __entry->cwmax, __entry->aifs)
+);
+
+TRACE_EVENT(rdev_libertas_set_mesh_channel,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct ieee80211_channel *chan),
+ TP_ARGS(wiphy, netdev, chan),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_ASSIGN(chan);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_PR_FMT, WIPHY_PR_ARG,
+ NETDEV_PR_ARG, CHAN_PR_ARG)
+);
+
+TRACE_EVENT(rdev_set_monitor_channel,
+ TP_PROTO(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(rdev_auth,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_auth_request *req),
+ TP_ARGS(wiphy, netdev, req),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ __field(enum nl80211_auth_type, auth_type)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ if (req->bss)
+ MAC_ASSIGN(bssid, req->bss->bssid);
+ else
+ eth_zero_addr(__entry->bssid);
+ __entry->auth_type = req->auth_type;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type,
+ MAC_PR_ARG(bssid))
+);
+
+TRACE_EVENT(rdev_assoc,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_assoc_request *req),
+ TP_ARGS(wiphy, netdev, req),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ MAC_ENTRY(prev_bssid)
+ __field(bool, use_mfp)
+ __field(u32, flags)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ if (req->bss)
+ MAC_ASSIGN(bssid, req->bss->bssid);
+ else
+ eth_zero_addr(__entry->bssid);
+ MAC_ASSIGN(prev_bssid, req->prev_bssid);
+ __entry->use_mfp = req->use_mfp;
+ __entry->flags = req->flags;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
+ ", previous bssid: " MAC_PR_FMT ", use mfp: %s, flags: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
+ MAC_PR_ARG(prev_bssid), BOOL_TO_STR(__entry->use_mfp),
+ __entry->flags)
+);
+
+TRACE_EVENT(rdev_deauth,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_deauth_request *req),
+ TP_ARGS(wiphy, netdev, req),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ __field(u16, reason_code)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, req->bssid);
+ __entry->reason_code = req->reason_code;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", reason: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
+ __entry->reason_code)
+);
+
+TRACE_EVENT(rdev_disassoc,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_disassoc_request *req),
+ TP_ARGS(wiphy, netdev, req),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ __field(u16, reason_code)
+ __field(bool, local_state_change)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ if (req->bss)
+ MAC_ASSIGN(bssid, req->bss->bssid);
+ else
+ eth_zero_addr(__entry->bssid);
+ __entry->reason_code = req->reason_code;
+ __entry->local_state_change = req->local_state_change;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
+ ", reason: %u, local state change: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid),
+ __entry->reason_code,
+ BOOL_TO_STR(__entry->local_state_change))
+);
+
+TRACE_EVENT(rdev_mgmt_tx_cancel_wait,
+ TP_PROTO(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu ",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
+);
+
+TRACE_EVENT(rdev_set_power_mgmt,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ bool enabled, int timeout),
+ TP_ARGS(wiphy, netdev, enabled, timeout),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(bool, enabled)
+ __field(int, timeout)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->enabled = enabled;
+ __entry->timeout = timeout;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %senabled, timeout: %d ",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ __entry->enabled ? "" : "not ", __entry->timeout)
+);
+
+TRACE_EVENT(rdev_connect,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_connect_params *sme),
+ TP_ARGS(wiphy, netdev, sme),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
+ __field(enum nl80211_auth_type, auth_type)
+ __field(bool, privacy)
+ __field(u32, wpa_versions)
+ __field(u32, flags)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, sme->bssid);
+ memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
+ memcpy(__entry->ssid, sme->ssid, sme->ssid_len);
+ __entry->auth_type = sme->auth_type;
+ __entry->privacy = sme->privacy;
+ __entry->wpa_versions = sme->crypto.wpa_versions;
+ __entry->flags = sme->flags;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
+ ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
+ "flags: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid,
+ __entry->auth_type, BOOL_TO_STR(__entry->privacy),
+ __entry->wpa_versions, __entry->flags)
+);
+
+TRACE_EVENT(rdev_set_cqm_rssi_config,
+ TP_PROTO(struct wiphy *wiphy,
+ struct net_device *netdev, s32 rssi_thold,
+ u32 rssi_hyst),
+ TP_ARGS(wiphy, netdev, rssi_thold, rssi_hyst),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(s32, rssi_thold)
+ __field(u32, rssi_hyst)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->rssi_thold = rssi_thold;
+ __entry->rssi_hyst = rssi_hyst;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
+ ", rssi_thold: %d, rssi_hyst: %u ",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ __entry->rssi_thold, __entry->rssi_hyst)
+);
+
+TRACE_EVENT(rdev_set_cqm_txe_config,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate,
+ u32 pkts, u32 intvl),
+ TP_ARGS(wiphy, netdev, rate, pkts, intvl),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u32, rate)
+ __field(u32, pkts)
+ __field(u32, intvl)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->rate = rate;
+ __entry->pkts = pkts;
+ __entry->intvl = intvl;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", rate: %u, packets: %u, interval: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->rate, __entry->pkts,
+ __entry->intvl)
+);
+
+TRACE_EVENT(rdev_disconnect,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u16 reason_code),
+ TP_ARGS(wiphy, netdev, reason_code),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u16, reason_code)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->reason_code = reason_code;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", reason code: %u", WIPHY_PR_ARG,
+ NETDEV_PR_ARG, __entry->reason_code)
+);
+
+TRACE_EVENT(rdev_join_ibss,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_ibss_params *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, params->bssid);
+ memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
+ memcpy(__entry->ssid, params->ssid, params->ssid_len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", ssid: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid)
+);
+
+TRACE_EVENT(rdev_join_ocb,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const struct ocb_setup *setup),
+ TP_ARGS(wiphy, netdev, setup),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG)
+);
+
+TRACE_EVENT(rdev_set_wiphy_params,
+ TP_PROTO(struct wiphy *wiphy, u32 changed),
+ TP_ARGS(wiphy, changed),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(u32, changed)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->changed = changed;
+ ),
+ TP_printk(WIPHY_PR_FMT ", changed: %u",
+ WIPHY_PR_ARG, __entry->changed)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_set_tx_power,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ enum nl80211_tx_power_setting type, int mbm),
+ TP_ARGS(wiphy, wdev, type, mbm),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(enum nl80211_tx_power_setting, type)
+ __field(int, mbm)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->type = type;
+ __entry->mbm = mbm;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d",
+ WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm)
+);
+
+TRACE_EVENT(rdev_return_int_int,
+ TP_PROTO(struct wiphy *wiphy, int func_ret, int func_fill),
+ TP_ARGS(wiphy, func_ret, func_fill),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, func_ret)
+ __field(int, func_fill)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->func_ret = func_ret;
+ __entry->func_fill = func_fill;
+ ),
+ TP_printk(WIPHY_PR_FMT ", function returns: %d, function filled: %d",
+ WIPHY_PR_ARG, __entry->func_ret, __entry->func_fill)
+);
+
+#ifdef CONFIG_NL80211_TESTMODE
+TRACE_EVENT(rdev_testmode_cmd,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
+
+TRACE_EVENT(rdev_testmode_dump,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
+);
+#endif /* CONFIG_NL80211_TESTMODE */
+
+TRACE_EVENT(rdev_set_bitrate_mask,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const u8 *peer, const struct cfg80211_bitrate_mask *mask),
+ TP_ARGS(wiphy, netdev, peer, mask),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
+);
+
+TRACE_EVENT(rdev_mgmt_frame_register,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ u16 frame_type, bool reg),
+ TP_ARGS(wiphy, wdev, frame_type, reg),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u16, frame_type)
+ __field(bool, reg)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->frame_type = frame_type;
+ __entry->reg = reg;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", frame_type: 0x%.2x, reg: %s ",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->frame_type,
+ __entry->reg ? "true" : "false")
+);
+
+TRACE_EVENT(rdev_return_int_tx_rx,
+ TP_PROTO(struct wiphy *wiphy, int ret, u32 tx, u32 rx),
+ TP_ARGS(wiphy, ret, tx, rx),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ __field(u32, tx)
+ __field(u32, rx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->ret = ret;
+ __entry->tx = tx;
+ __entry->rx = rx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned %d, tx: %u, rx: %u",
+ WIPHY_PR_ARG, __entry->ret, __entry->tx, __entry->rx)
+);
+
+TRACE_EVENT(rdev_return_void_tx_rx,
+ TP_PROTO(struct wiphy *wiphy, u32 tx, u32 tx_max,
+ u32 rx, u32 rx_max),
+ TP_ARGS(wiphy, tx, tx_max, rx, rx_max),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(u32, tx)
+ __field(u32, tx_max)
+ __field(u32, rx)
+ __field(u32, rx_max)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->tx = tx;
+ __entry->tx_max = tx_max;
+ __entry->rx = rx;
+ __entry->rx_max = rx_max;
+ ),
+ TP_printk(WIPHY_PR_FMT ", tx: %u, tx_max: %u, rx: %u, rx_max: %u ",
+ WIPHY_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx,
+ __entry->rx_max)
+);
+
+DECLARE_EVENT_CLASS(tx_rx_evt,
+ TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
+ TP_ARGS(wiphy, rx, tx),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(u32, tx)
+ __field(u32, rx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->tx = tx;
+ __entry->rx = rx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ",
+ WIPHY_PR_ARG, __entry->tx, __entry->rx)
+);
+
+DEFINE_EVENT(tx_rx_evt, rdev_set_antenna,
+ TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
+ TP_ARGS(wiphy, rx, tx)
+);
+
+TRACE_EVENT(rdev_sched_scan_start,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_sched_scan_request *request),
+ TP_ARGS(wiphy, netdev, request),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG)
+);
+
+TRACE_EVENT(rdev_tdls_mgmt,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u8 *peer, u8 action_code, u8 dialog_token,
+ u16 status_code, u32 peer_capability,
+ bool initiator, const u8 *buf, size_t len),
+ TP_ARGS(wiphy, netdev, peer, action_code, dialog_token, status_code,
+ peer_capability, initiator, buf, len),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(u8, action_code)
+ __field(u8, dialog_token)
+ __field(u16, status_code)
+ __field(u32, peer_capability)
+ __field(bool, initiator)
+ __dynamic_array(u8, buf, len)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->action_code = action_code;
+ __entry->dialog_token = dialog_token;
+ __entry->status_code = status_code;
+ __entry->peer_capability = peer_capability;
+ __entry->initiator = initiator;
+ memcpy(__get_dynamic_array(buf), buf, len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", action_code: %u, "
+ "dialog_token: %u, status_code: %u, peer_capability: %u "
+ "initiator: %s buf: %#.2x ",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer),
+ __entry->action_code, __entry->dialog_token,
+ __entry->status_code, __entry->peer_capability,
+ BOOL_TO_STR(__entry->initiator),
+ ((u8 *)__get_dynamic_array(buf))[0])
+);
+
+TRACE_EVENT(rdev_dump_survey,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx),
+ TP_ARGS(wiphy, netdev, idx),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->idx = idx;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
+);
+
+TRACE_EVENT(rdev_return_int_survey_info,
+ TP_PROTO(struct wiphy *wiphy, int ret, struct survey_info *info),
+ TP_ARGS(wiphy, ret, info),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_ENTRY
+ __field(int, ret)
+ __field(u64, time)
+ __field(u64, time_busy)
+ __field(u64, time_ext_busy)
+ __field(u64, time_rx)
+ __field(u64, time_tx)
+ __field(u64, time_scan)
+ __field(u32, filled)
+ __field(s8, noise)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_ASSIGN(info->channel);
+ __entry->ret = ret;
+ __entry->time = info->time;
+ __entry->time_busy = info->time_busy;
+ __entry->time_ext_busy = info->time_ext_busy;
+ __entry->time_rx = info->time_rx;
+ __entry->time_tx = info->time_tx;
+ __entry->time_scan = info->time_scan;
+ __entry->filled = info->filled;
+ __entry->noise = info->noise;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned: %d, " CHAN_PR_FMT
+ ", channel time: %llu, channel time busy: %llu, "
+ "channel time extension busy: %llu, channel time rx: %llu, "
+ "channel time tx: %llu, scan time: %llu, filled: %u, noise: %d",
+ WIPHY_PR_ARG, __entry->ret, CHAN_PR_ARG,
+ __entry->time, __entry->time_busy,
+ __entry->time_ext_busy, __entry->time_rx,
+ __entry->time_tx, __entry->time_scan,
+ __entry->filled, __entry->noise)
+);
+
+TRACE_EVENT(rdev_tdls_oper,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u8 *peer, enum nl80211_tdls_operation oper),
+ TP_ARGS(wiphy, netdev, peer, oper),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(enum nl80211_tdls_operation, oper)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->oper = oper;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", oper: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper)
+);
+
+DECLARE_EVENT_CLASS(rdev_pmksa,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_pmksa *pmksa),
+ TP_ARGS(wiphy, netdev, pmksa),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, pmksa->bssid);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid))
+);
+
+TRACE_EVENT(rdev_probe_client,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const u8 *peer),
+ TP_ARGS(wiphy, netdev, peer),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
+);
+
+DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_pmksa *pmksa),
+ TP_ARGS(wiphy, netdev, pmksa)
+);
+
+DEFINE_EVENT(rdev_pmksa, rdev_del_pmksa,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_pmksa *pmksa),
+ TP_ARGS(wiphy, netdev, pmksa)
+);
+
+TRACE_EVENT(rdev_remain_on_channel,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct ieee80211_channel *chan,
+ unsigned int duration),
+ TP_ARGS(wiphy, wdev, chan, duration),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ CHAN_ENTRY
+ __field(unsigned int, duration)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ CHAN_ASSIGN(chan);
+ __entry->duration = duration;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", duration: %u",
+ WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG, __entry->duration)
+);
+
+TRACE_EVENT(rdev_return_int_cookie,
+ TP_PROTO(struct wiphy *wiphy, int ret, u64 cookie),
+ TP_ARGS(wiphy, ret, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->ret = ret;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", returned %d, cookie: %llu",
+ WIPHY_PR_ARG, __entry->ret, __entry->cookie)
+);
+
+TRACE_EVENT(rdev_cancel_remain_on_channel,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
+ TP_ARGS(wiphy, wdev, cookie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u64, cookie)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
+);
+
+TRACE_EVENT(rdev_mgmt_tx,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params),
+ TP_ARGS(wiphy, wdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ CHAN_ENTRY
+ __field(bool, offchan)
+ __field(unsigned int, wait)
+ __field(bool, no_cck)
+ __field(bool, dont_wait_for_ack)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ CHAN_ASSIGN(params->chan);
+ __entry->offchan = params->offchan;
+ __entry->wait = params->wait;
+ __entry->no_cck = params->no_cck;
+ __entry->dont_wait_for_ack = params->dont_wait_for_ack;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", offchan: %s,"
+ " wait: %u, no cck: %s, dont wait for ack: %s",
+ WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG,
+ BOOL_TO_STR(__entry->offchan), __entry->wait,
+ BOOL_TO_STR(__entry->no_cck),
+ BOOL_TO_STR(__entry->dont_wait_for_ack))
+);
+
+TRACE_EVENT(rdev_set_noack_map,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u16 noack_map),
+ TP_ARGS(wiphy, netdev, noack_map),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u16, noack_map)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->noack_map = noack_map;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_get_channel,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_return_chandef,
+ TP_PROTO(struct wiphy *wiphy, int ret,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, ret, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, ret)
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ if (ret == 0)
+ CHAN_DEF_ASSIGN(chandef);
+ else
+ CHAN_DEF_ASSIGN((struct cfg80211_chan_def *)NULL);
+ __entry->ret = ret;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", ret: %d",
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->ret)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_start_p2p_device,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_set_mac_acl,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_acl_data *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u32, acl_policy)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->acl_policy = params->acl_policy;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy)
+);
+
+TRACE_EVENT(rdev_update_ft_ies,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_update_ft_ies_params *ftie),
+ TP_ARGS(wiphy, netdev, ftie),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u16, md)
+ __dynamic_array(u8, ie, ftie->ie_len)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->md = ftie->md;
+ memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md)
+);
+
+TRACE_EVENT(rdev_crit_proto_start,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ enum nl80211_crit_proto_id protocol, u16 duration),
+ TP_ARGS(wiphy, wdev, protocol, duration),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(u16, proto)
+ __field(u16, duration)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->proto = protocol;
+ __entry->duration = duration;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration)
+);
+
+TRACE_EVENT(rdev_crit_proto_stop,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
+ WIPHY_PR_ARG, WDEV_PR_ARG)
+);
+
+TRACE_EVENT(rdev_channel_switch,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_csa_settings *params),
+ TP_ARGS(wiphy, netdev, params),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ __field(bool, radar_required)
+ __field(bool, block_tx)
+ __field(u8, count)
+ __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
+ __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(&params->chandef);
+ __entry->radar_required = params->radar_required;
+ __entry->block_tx = params->block_tx;
+ __entry->count = params->count;
+ memcpy(__get_dynamic_array(bcn_ofs),
+ params->counter_offsets_beacon,
+ params->n_counter_offsets_beacon * sizeof(u16));
+
+ /* probe response offsets are optional */
+ if (params->n_counter_offsets_presp)
+ memcpy(__get_dynamic_array(pres_ofs),
+ params->counter_offsets_presp,
+ params->n_counter_offsets_presp * sizeof(u16));
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
+ ", block_tx: %d, count: %u, radar_required: %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+ __entry->block_tx, __entry->count, __entry->radar_required)
+);
+
+TRACE_EVENT(rdev_set_qos_map,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_qos_map *qos_map),
+ TP_ARGS(wiphy, netdev, qos_map),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ QOS_MAP_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ QOS_MAP_ASSIGN(qos_map);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des)
+);
+
+TRACE_EVENT(rdev_set_ap_chanwidth,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, netdev, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(rdev_add_tx_ts,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time),
+ TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(u8, tsid)
+ __field(u8, user_prio)
+ __field(u16, admitted_time)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->tsid = tsid;
+ __entry->user_prio = user_prio;
+ __entry->admitted_time = admitted_time;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer),
+ __entry->tsid, __entry->user_prio, __entry->admitted_time)
+);
+
+TRACE_EVENT(rdev_del_tx_ts,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ u8 tsid, const u8 *peer),
+ TP_ARGS(wiphy, netdev, tsid, peer),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(u8, tsid)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->tsid = tsid;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid)
+);
+
+TRACE_EVENT(rdev_tdls_channel_switch,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const u8 *addr, u8 oper_class,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, netdev, addr, oper_class, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(addr)
+ __field(u8, oper_class)
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(addr, addr);
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT
+ " oper class %d, " CHAN_DEF_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr),
+ __entry->oper_class, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(rdev_tdls_cancel_channel_switch,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const u8 *addr),
+ TP_ARGS(wiphy, netdev, addr),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(addr)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(addr, addr);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr))
+);
+
+/*************************************************************
+ * cfg80211 exported functions traces *
+ *************************************************************/
+
+TRACE_EVENT(cfg80211_return_bool,
+ TP_PROTO(bool ret),
+ TP_ARGS(ret),
+ TP_STRUCT__entry(
+ __field(bool, ret)
+ ),
+ TP_fast_assign(
+ __entry->ret = ret;
+ ),
+ TP_printk("returned %s", BOOL_TO_STR(__entry->ret))
+);
+
+DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt,
+ TP_PROTO(struct net_device *netdev, const u8 *macaddr),
+ TP_ARGS(netdev, macaddr),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(macaddr)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(macaddr, macaddr);
+ ),
+ TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
+ NETDEV_PR_ARG, MAC_PR_ARG(macaddr))
+);
+
+DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
+ TP_PROTO(struct net_device *netdev, const u8 *macaddr),
+ TP_ARGS(netdev, macaddr)
+);
+
+DECLARE_EVENT_CLASS(netdev_evt_only,
+ TP_PROTO(struct net_device *netdev),
+ TP_ARGS(netdev),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ ),
+ TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG)
+);
+
+DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
+ TP_PROTO(struct net_device *netdev),
+ TP_ARGS(netdev)
+);
+
+TRACE_EVENT(cfg80211_send_rx_assoc,
+ TP_PROTO(struct net_device *netdev, struct cfg80211_bss *bss),
+ TP_ARGS(netdev, bss),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, bss->bssid);
+ CHAN_ASSIGN(bss->channel);
+ ),
+ TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", " CHAN_PR_FMT,
+ NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(netdev_frame_event,
+ TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
+ TP_ARGS(netdev, buf, len),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __dynamic_array(u8, frame, len)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ memcpy(__get_dynamic_array(frame), buf, len);
+ ),
+ TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
+ NETDEV_PR_ARG,
+ le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
+);
+
+DEFINE_EVENT(netdev_frame_event, cfg80211_rx_unprot_mlme_mgmt,
+ TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
+ TP_ARGS(netdev, buf, len)
+);
+
+DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
+ TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
+ TP_ARGS(netdev, buf, len)
+);
+
+TRACE_EVENT(cfg80211_tx_mlme_mgmt,
+ TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
+ TP_ARGS(netdev, buf, len),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __dynamic_array(u8, frame, len)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ memcpy(__get_dynamic_array(frame), buf, len);
+ ),
+ TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
+ NETDEV_PR_ARG,
+ le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
+);
+
+DECLARE_EVENT_CLASS(netdev_mac_evt,
+ TP_PROTO(struct net_device *netdev, const u8 *mac),
+ TP_ARGS(netdev, mac),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(mac)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(mac, mac)
+ ),
+ TP_printk(NETDEV_PR_FMT ", mac: " MAC_PR_FMT,
+ NETDEV_PR_ARG, MAC_PR_ARG(mac))
+);
+
+DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
+ TP_PROTO(struct net_device *netdev, const u8 *mac),
+ TP_ARGS(netdev, mac)
+);
+
+DEFINE_EVENT(netdev_mac_evt, cfg80211_send_assoc_timeout,
+ TP_PROTO(struct net_device *netdev, const u8 *mac),
+ TP_ARGS(netdev, mac)
+);
+
+TRACE_EVENT(cfg80211_michael_mic_failure,
+ TP_PROTO(struct net_device *netdev, const u8 *addr,
+ enum nl80211_key_type key_type, int key_id, const u8 *tsc),
+ TP_ARGS(netdev, addr, key_type, key_id, tsc),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(addr)
+ __field(enum nl80211_key_type, key_type)
+ __field(int, key_id)
+ __array(u8, tsc, 6)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(addr, addr);
+ __entry->key_type = key_type;
+ __entry->key_id = key_id;
+ if (tsc)
+ memcpy(__entry->tsc, tsc, 6);
+ ),
+ TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT ", key type: %d, key id: %d, tsc: %pm",
+ NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->key_type,
+ __entry->key_id, __entry->tsc)
+);
+
+TRACE_EVENT(cfg80211_ready_on_channel,
+ TP_PROTO(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan,
+ unsigned int duration),
+ TP_ARGS(wdev, cookie, chan, duration),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u64, cookie)
+ CHAN_ENTRY
+ __field(unsigned int, duration)
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ CHAN_ASSIGN(chan);
+ __entry->duration = duration;
+ ),
+ TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT ", duration: %u",
+ WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG,
+ __entry->duration)
+);
+
+TRACE_EVENT(cfg80211_ready_on_channel_expired,
+ TP_PROTO(struct wireless_dev *wdev, u64 cookie,
+ struct ieee80211_channel *chan),
+ TP_ARGS(wdev, cookie, chan),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u64, cookie)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ CHAN_ASSIGN(chan);
+ ),
+ TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
+ WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_new_sta,
+ TP_PROTO(struct net_device *netdev, const u8 *mac_addr,
+ struct station_info *sinfo),
+ TP_ARGS(netdev, mac_addr, sinfo),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(mac_addr)
+ SINFO_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(mac_addr, mac_addr);
+ SINFO_ASSIGN;
+ ),
+ TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT,
+ NETDEV_PR_ARG, MAC_PR_ARG(mac_addr))
+);
+
+DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
+ TP_PROTO(struct net_device *netdev, const u8 *macaddr),
+ TP_ARGS(netdev, macaddr)
+);
+
+TRACE_EVENT(cfg80211_rx_mgmt,
+ TP_PROTO(struct wireless_dev *wdev, int freq, int sig_mbm),
+ TP_ARGS(wdev, freq, sig_mbm),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(int, freq)
+ __field(int, sig_mbm)
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->freq = freq;
+ __entry->sig_mbm = sig_mbm;
+ ),
+ TP_printk(WDEV_PR_FMT ", freq: %d, sig mbm: %d",
+ WDEV_PR_ARG, __entry->freq, __entry->sig_mbm)
+);
+
+TRACE_EVENT(cfg80211_mgmt_tx_status,
+ TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack),
+ TP_ARGS(wdev, cookie, ack),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u64, cookie)
+ __field(bool, ack)
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->cookie = cookie;
+ __entry->ack = ack;
+ ),
+ TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
+ WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
+);
+
+TRACE_EVENT(cfg80211_cqm_rssi_notify,
+ TP_PROTO(struct net_device *netdev,
+ enum nl80211_cqm_rssi_threshold_event rssi_event),
+ TP_ARGS(netdev, rssi_event),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ __entry->rssi_event = rssi_event;
+ ),
+ TP_printk(NETDEV_PR_FMT ", rssi event: %d",
+ NETDEV_PR_ARG, __entry->rssi_event)
+);
+
+TRACE_EVENT(cfg80211_reg_can_beacon,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype),
+ TP_ARGS(wiphy, chandef, iftype),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_DEF_ENTRY
+ __field(enum nl80211_iftype, iftype)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ __entry->iftype = iftype;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d",
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype)
+);
+
+TRACE_EVENT(cfg80211_chandef_dfs_required,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_ch_switch_notify,
+ TP_PROTO(struct net_device *netdev,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(netdev, chandef),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
+ NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_ch_switch_started_notify,
+ TP_PROTO(struct net_device *netdev,
+ struct cfg80211_chan_def *chandef),
+ TP_ARGS(netdev, chandef),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT,
+ NETDEV_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_radar_event,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),
+ TP_ARGS(wiphy, chandef),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_DEF_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
+ WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_cac_event,
+ TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt),
+ TP_ARGS(netdev, evt),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __field(enum nl80211_radar_event, evt)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ __entry->evt = evt;
+ ),
+ TP_printk(NETDEV_PR_FMT ", event: %d",
+ NETDEV_PR_ARG, __entry->evt)
+);
+
+DECLARE_EVENT_CLASS(cfg80211_rx_evt,
+ TP_PROTO(struct net_device *netdev, const u8 *addr),
+ TP_ARGS(netdev, addr),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(addr)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(addr, addr);
+ ),
+ TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT, NETDEV_PR_ARG, MAC_PR_ARG(addr))
+);
+
+DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame,
+ TP_PROTO(struct net_device *netdev, const u8 *addr),
+ TP_ARGS(netdev, addr)
+);
+
+DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame,
+ TP_PROTO(struct net_device *netdev, const u8 *addr),
+ TP_ARGS(netdev, addr)
+);
+
+TRACE_EVENT(cfg80211_ibss_joined,
+ TP_PROTO(struct net_device *netdev, const u8 *bssid,
+ struct ieee80211_channel *channel),
+ TP_ARGS(netdev, bssid, channel),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(bssid)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(bssid, bssid);
+ CHAN_ASSIGN(channel);
+ ),
+ TP_printk(NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", " CHAN_PR_FMT,
+ NETDEV_PR_ARG, MAC_PR_ARG(bssid), CHAN_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_probe_status,
+ TP_PROTO(struct net_device *netdev, const u8 *addr, u64 cookie,
+ bool acked),
+ TP_ARGS(netdev, addr, cookie, acked),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(addr)
+ __field(u64, cookie)
+ __field(bool, acked)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(addr, addr);
+ __entry->cookie = cookie;
+ __entry->acked = acked;
+ ),
+ TP_printk(NETDEV_PR_FMT " addr:" MAC_PR_FMT ", cookie: %llu, acked: %s",
+ NETDEV_PR_ARG, MAC_PR_ARG(addr), __entry->cookie,
+ BOOL_TO_STR(__entry->acked))
+);
+
+TRACE_EVENT(cfg80211_cqm_pktloss_notify,
+ TP_PROTO(struct net_device *netdev, const u8 *peer, u32 num_packets),
+ TP_ARGS(netdev, peer, num_packets),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(u32, num_packets)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->num_packets = num_packets;
+ ),
+ TP_printk(NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", num of lost packets: %u",
+ NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->num_packets)
+);
+
+DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify,
+ TP_PROTO(struct net_device *netdev, const u8 *macaddr),
+ TP_ARGS(netdev, macaddr)
+);
+
+TRACE_EVENT(cfg80211_pmksa_candidate_notify,
+ TP_PROTO(struct net_device *netdev, int index, const u8 *bssid,
+ bool preauth),
+ TP_ARGS(netdev, index, bssid, preauth),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __field(int, index)
+ MAC_ENTRY(bssid)
+ __field(bool, preauth)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ __entry->index = index;
+ MAC_ASSIGN(bssid, bssid);
+ __entry->preauth = preauth;
+ ),
+ TP_printk(NETDEV_PR_FMT ", index:%d, bssid: " MAC_PR_FMT ", pre auth: %s",
+ NETDEV_PR_ARG, __entry->index, MAC_PR_ARG(bssid),
+ BOOL_TO_STR(__entry->preauth))
+);
+
+TRACE_EVENT(cfg80211_report_obss_beacon,
+ TP_PROTO(struct wiphy *wiphy, const u8 *frame, size_t len,
+ int freq, int sig_dbm),
+ TP_ARGS(wiphy, frame, len, freq, sig_dbm),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, freq)
+ __field(int, sig_dbm)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->freq = freq;
+ __entry->sig_dbm = sig_dbm;
+ ),
+ TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d",
+ WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm)
+);
+
+TRACE_EVENT(cfg80211_tdls_oper_request,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *peer,
+ enum nl80211_tdls_operation oper, u16 reason_code),
+ TP_ARGS(wiphy, netdev, peer, oper, reason_code),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(enum nl80211_tdls_operation, oper)
+ __field(u16, reason_code)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->oper = oper;
+ __entry->reason_code = reason_code;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", oper: %d, reason_code %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->oper,
+ __entry->reason_code)
+ );
+
+TRACE_EVENT(cfg80211_scan_done,
+ TP_PROTO(struct cfg80211_scan_request *request, bool aborted),
+ TP_ARGS(request, aborted),
+ TP_STRUCT__entry(
+ __field(u32, n_channels)
+ __dynamic_array(u8, ie, request ? request->ie_len : 0)
+ __array(u32, rates, IEEE80211_NUM_BANDS)
+ __field(u32, wdev_id)
+ MAC_ENTRY(wiphy_mac)
+ __field(bool, no_cck)
+ __field(bool, aborted)
+ ),
+ TP_fast_assign(
+ if (request) {
+ memcpy(__get_dynamic_array(ie), request->ie,
+ request->ie_len);
+ memcpy(__entry->rates, request->rates,
+ IEEE80211_NUM_BANDS);
+ __entry->wdev_id = request->wdev ?
+ request->wdev->identifier : 0;
+ if (request->wiphy)
+ MAC_ASSIGN(wiphy_mac,
+ request->wiphy->perm_addr);
+ __entry->no_cck = request->no_cck;
+ }
+ __entry->aborted = aborted;
+ ),
+ TP_printk("aborted: %s", BOOL_TO_STR(__entry->aborted))
+);
+
+DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_stopped,
+ TP_PROTO(struct wiphy *wiphy),
+ TP_ARGS(wiphy)
+);
+
+TRACE_EVENT(cfg80211_get_bss,
+ TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
+ const u8 *bssid, const u8 *ssid, size_t ssid_len,
+ enum ieee80211_bss_type bss_type,
+ enum ieee80211_privacy privacy),
+ TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_ENTRY
+ MAC_ENTRY(bssid)
+ __dynamic_array(u8, ssid, ssid_len)
+ __field(enum ieee80211_bss_type, bss_type)
+ __field(enum ieee80211_privacy, privacy)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_ASSIGN(channel);
+ MAC_ASSIGN(bssid, bssid);
+ memcpy(__get_dynamic_array(ssid), ssid, ssid_len);
+ __entry->bss_type = bss_type;
+ __entry->privacy = privacy;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", " MAC_PR_FMT
+ ", buf: %#.2x, bss_type: %d, privacy: %d",
+ WIPHY_PR_ARG, CHAN_PR_ARG, MAC_PR_ARG(bssid),
+ ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type,
+ __entry->privacy)
+);
+
+TRACE_EVENT(cfg80211_inform_bss_width_frame,
+ TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
+ enum nl80211_bss_scan_width scan_width,
+ struct ieee80211_mgmt *mgmt, size_t len,
+ s32 signal),
+ TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ CHAN_ENTRY
+ __field(enum nl80211_bss_scan_width, scan_width)
+ __dynamic_array(u8, mgmt, len)
+ __field(s32, signal)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ CHAN_ASSIGN(channel);
+ __entry->scan_width = scan_width;
+ if (mgmt)
+ memcpy(__get_dynamic_array(mgmt), mgmt, len);
+ __entry->signal = signal;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d",
+ WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+ __entry->signal)
+);
+
+DECLARE_EVENT_CLASS(cfg80211_bss_evt,
+ TP_PROTO(struct cfg80211_bss *pub),
+ TP_ARGS(pub),
+ TP_STRUCT__entry(
+ MAC_ENTRY(bssid)
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ MAC_ASSIGN(bssid, pub->bssid);
+ CHAN_ASSIGN(pub->channel);
+ ),
+ TP_printk(MAC_PR_FMT ", " CHAN_PR_FMT, MAC_PR_ARG(bssid), CHAN_PR_ARG)
+);
+
+DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss,
+ TP_PROTO(struct cfg80211_bss *pub),
+ TP_ARGS(pub)
+);
+
+TRACE_EVENT(cfg80211_return_uint,
+ TP_PROTO(unsigned int ret),
+ TP_ARGS(ret),
+ TP_STRUCT__entry(
+ __field(unsigned int, ret)
+ ),
+ TP_fast_assign(
+ __entry->ret = ret;
+ ),
+ TP_printk("ret: %d", __entry->ret)
+);
+
+TRACE_EVENT(cfg80211_return_u32,
+ TP_PROTO(u32 ret),
+ TP_ARGS(ret),
+ TP_STRUCT__entry(
+ __field(u32, ret)
+ ),
+ TP_fast_assign(
+ __entry->ret = ret;
+ ),
+ TP_printk("ret: %u", __entry->ret)
+);
+
+TRACE_EVENT(cfg80211_report_wowlan_wakeup,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_wowlan_wakeup *wakeup),
+ TP_ARGS(wiphy, wdev, wakeup),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(bool, non_wireless)
+ __field(bool, disconnect)
+ __field(bool, magic_pkt)
+ __field(bool, gtk_rekey_failure)
+ __field(bool, eap_identity_req)
+ __field(bool, four_way_handshake)
+ __field(bool, rfkill_release)
+ __field(s32, pattern_idx)
+ __field(u32, packet_len)
+ __dynamic_array(u8, packet,
+ wakeup ? wakeup->packet_present_len : 0)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->non_wireless = !wakeup;
+ __entry->disconnect = wakeup ? wakeup->disconnect : false;
+ __entry->magic_pkt = wakeup ? wakeup->magic_pkt : false;
+ __entry->gtk_rekey_failure = wakeup ? wakeup->gtk_rekey_failure : false;
+ __entry->eap_identity_req = wakeup ? wakeup->eap_identity_req : false;
+ __entry->four_way_handshake = wakeup ? wakeup->four_way_handshake : false;
+ __entry->rfkill_release = wakeup ? wakeup->rfkill_release : false;
+ __entry->pattern_idx = wakeup ? wakeup->pattern_idx : false;
+ __entry->packet_len = wakeup ? wakeup->packet_len : false;
+ if (wakeup && wakeup->packet && wakeup->packet_present_len)
+ memcpy(__get_dynamic_array(packet), wakeup->packet,
+ wakeup->packet_present_len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_ft_event,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_ft_event_params *ft_event),
+ TP_ARGS(wiphy, netdev, ft_event),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __dynamic_array(u8, ies, ft_event->ies_len)
+ MAC_ENTRY(target_ap)
+ __dynamic_array(u8, ric_ies, ft_event->ric_ies_len)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ if (ft_event->ies)
+ memcpy(__get_dynamic_array(ies), ft_event->ies,
+ ft_event->ies_len);
+ MAC_ASSIGN(target_ap, ft_event->target_ap);
+ if (ft_event->ric_ies)
+ memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies,
+ ft_event->ric_ies_len);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(target_ap))
+);
+
+TRACE_EVENT(cfg80211_stop_iface,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
+ WIPHY_PR_ARG, WDEV_PR_ARG)
+);
+
+#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/wireless/util.c b/net/wireless/util.c
new file mode 100644
index 000000000..7e4e3fffe
--- /dev/null
+++ b/net/wireless/util.c
@@ -0,0 +1,1814 @@
+/*
+ * Wireless utility functions
+ *
+ * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2013-2014 Intel Mobile Communications GmbH
+ */
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <net/cfg80211.h>
+#include <net/ip.h>
+#include <net/dsfield.h>
+#include <linux/if_vlan.h>
+#include <linux/mpls.h>
+#include "core.h"
+#include "rdev-ops.h"
+
+
+struct ieee80211_rate *
+ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
+ u32 basic_rates, int bitrate)
+{
+ struct ieee80211_rate *result = &sband->bitrates[0];
+ int i;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (!(basic_rates & BIT(i)))
+ continue;
+ if (sband->bitrates[i].bitrate > bitrate)
+ continue;
+ result = &sband->bitrates[i];
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(ieee80211_get_response_rate);
+
+u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband,
+ enum nl80211_bss_scan_width scan_width)
+{
+ struct ieee80211_rate *bitrates;
+ u32 mandatory_rates = 0;
+ enum ieee80211_rate_flags mandatory_flag;
+ int i;
+
+ if (WARN_ON(!sband))
+ return 1;
+
+ if (sband->band == IEEE80211_BAND_2GHZ) {
+ if (scan_width == NL80211_BSS_CHAN_WIDTH_5 ||
+ scan_width == NL80211_BSS_CHAN_WIDTH_10)
+ mandatory_flag = IEEE80211_RATE_MANDATORY_G;
+ else
+ mandatory_flag = IEEE80211_RATE_MANDATORY_B;
+ } else {
+ mandatory_flag = IEEE80211_RATE_MANDATORY_A;
+ }
+
+ bitrates = sband->bitrates;
+ for (i = 0; i < sband->n_bitrates; i++)
+ if (bitrates[i].flags & mandatory_flag)
+ mandatory_rates |= BIT(i);
+ return mandatory_rates;
+}
+EXPORT_SYMBOL(ieee80211_mandatory_rates);
+
+int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band)
+{
+ /* see 802.11 17.3.8.3.2 and Annex J
+ * there are overlapping channel numbers in 5GHz and 2GHz bands */
+ if (chan <= 0)
+ return 0; /* not supported */
+ switch (band) {
+ case IEEE80211_BAND_2GHZ:
+ if (chan == 14)
+ return 2484;
+ else if (chan < 14)
+ return 2407 + chan * 5;
+ break;
+ case IEEE80211_BAND_5GHZ:
+ if (chan >= 182 && chan <= 196)
+ return 4000 + chan * 5;
+ else
+ return 5000 + chan * 5;
+ break;
+ case IEEE80211_BAND_60GHZ:
+ if (chan < 5)
+ return 56160 + chan * 2160;
+ break;
+ default:
+ ;
+ }
+ return 0; /* not supported */
+}
+EXPORT_SYMBOL(ieee80211_channel_to_frequency);
+
+int ieee80211_frequency_to_channel(int freq)
+{
+ /* see 802.11 17.3.8.3.2 and Annex J */
+ if (freq == 2484)
+ return 14;
+ else if (freq < 2484)
+ return (freq - 2407) / 5;
+ else if (freq >= 4910 && freq <= 4980)
+ return (freq - 4000) / 5;
+ else if (freq <= 45000) /* DMG band lower limit */
+ return (freq - 5000) / 5;
+ else if (freq >= 58320 && freq <= 64800)
+ return (freq - 56160) / 2160;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_frequency_to_channel);
+
+struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
+ int freq)
+{
+ enum ieee80211_band band;
+ struct ieee80211_supported_band *sband;
+ int i;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ sband = wiphy->bands[band];
+
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels; i++) {
+ if (sband->channels[i].center_freq == freq)
+ return &sband->channels[i];
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(__ieee80211_get_channel);
+
+static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
+ enum ieee80211_band band)
+{
+ int i, want;
+
+ switch (band) {
+ case IEEE80211_BAND_5GHZ:
+ want = 3;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (sband->bitrates[i].bitrate == 60 ||
+ sband->bitrates[i].bitrate == 120 ||
+ sband->bitrates[i].bitrate == 240) {
+ sband->bitrates[i].flags |=
+ IEEE80211_RATE_MANDATORY_A;
+ want--;
+ }
+ }
+ WARN_ON(want);
+ break;
+ case IEEE80211_BAND_2GHZ:
+ want = 7;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (sband->bitrates[i].bitrate == 10) {
+ sband->bitrates[i].flags |=
+ IEEE80211_RATE_MANDATORY_B |
+ IEEE80211_RATE_MANDATORY_G;
+ want--;
+ }
+
+ if (sband->bitrates[i].bitrate == 20 ||
+ sband->bitrates[i].bitrate == 55 ||
+ sband->bitrates[i].bitrate == 110 ||
+ sband->bitrates[i].bitrate == 60 ||
+ sband->bitrates[i].bitrate == 120 ||
+ sband->bitrates[i].bitrate == 240) {
+ sband->bitrates[i].flags |=
+ IEEE80211_RATE_MANDATORY_G;
+ want--;
+ }
+
+ if (sband->bitrates[i].bitrate != 10 &&
+ sband->bitrates[i].bitrate != 20 &&
+ sband->bitrates[i].bitrate != 55 &&
+ sband->bitrates[i].bitrate != 110)
+ sband->bitrates[i].flags |=
+ IEEE80211_RATE_ERP_G;
+ }
+ WARN_ON(want != 0 && want != 3 && want != 6);
+ break;
+ case IEEE80211_BAND_60GHZ:
+ /* check for mandatory HT MCS 1..4 */
+ WARN_ON(!sband->ht_cap.ht_supported);
+ WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
+ break;
+ case IEEE80211_NUM_BANDS:
+ WARN_ON(1);
+ break;
+ }
+}
+
+void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
+{
+ enum ieee80211_band band;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ if (wiphy->bands[band])
+ set_mandatory_flags_band(wiphy->bands[band], band);
+}
+
+bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
+{
+ int i;
+ for (i = 0; i < wiphy->n_cipher_suites; i++)
+ if (cipher == wiphy->cipher_suites[i])
+ return true;
+ return false;
+}
+
+int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
+ struct key_params *params, int key_idx,
+ bool pairwise, const u8 *mac_addr)
+{
+ if (key_idx > 5)
+ return -EINVAL;
+
+ if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ return -EINVAL;
+
+ if (pairwise && !mac_addr)
+ return -EINVAL;
+
+ switch (params->cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ /* Disallow pairwise keys with non-zero index unless it's WEP
+ * or a vendor specific cipher (because current deployments use
+ * pairwise WEP keys with non-zero indices and for vendor
+ * specific ciphers this should be validated in the driver or
+ * hardware level - but 802.11i clearly specifies to use zero)
+ */
+ if (pairwise && key_idx)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ /* Disallow BIP (group-only) cipher as pairwise cipher */
+ if (pairwise)
+ return -EINVAL;
+ break;
+ default:
+ break;
+ }
+
+ switch (params->cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ if (params->key_len != WLAN_KEY_LEN_WEP40)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_TKIP:
+ if (params->key_len != WLAN_KEY_LEN_TKIP)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP:
+ if (params->key_len != WLAN_KEY_LEN_CCMP)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ if (params->key_len != WLAN_KEY_LEN_CCMP_256)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP:
+ if (params->key_len != WLAN_KEY_LEN_GCMP)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ if (params->key_len != WLAN_KEY_LEN_GCMP_256)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_WEP104:
+ if (params->key_len != WLAN_KEY_LEN_WEP104)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ if (params->key_len != WLAN_KEY_LEN_AES_CMAC)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128)
+ return -EINVAL;
+ break;
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256)
+ return -EINVAL;
+ break;
+ default:
+ /*
+ * We don't know anything about this algorithm,
+ * allow using it -- but the driver must check
+ * all parameters! We still check below whether
+ * or not the driver supports this algorithm,
+ * of course.
+ */
+ break;
+ }
+
+ if (params->seq) {
+ switch (params->cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ /* These ciphers do not use key sequence */
+ return -EINVAL;
+ case WLAN_CIPHER_SUITE_TKIP:
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ case WLAN_CIPHER_SUITE_BIP_CMAC_256:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_128:
+ case WLAN_CIPHER_SUITE_BIP_GMAC_256:
+ if (params->seq_len != 6)
+ return -EINVAL;
+ break;
+ }
+ }
+
+ if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher))
+ return -EINVAL;
+
+ return 0;
+}
+
+unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
+{
+ unsigned int hdrlen = 24;
+
+ if (ieee80211_is_data(fc)) {
+ if (ieee80211_has_a4(fc))
+ hdrlen = 30;
+ if (ieee80211_is_data_qos(fc)) {
+ hdrlen += IEEE80211_QOS_CTL_LEN;
+ if (ieee80211_has_order(fc))
+ hdrlen += IEEE80211_HT_CTL_LEN;
+ }
+ goto out;
+ }
+
+ if (ieee80211_is_mgmt(fc)) {
+ if (ieee80211_has_order(fc))
+ hdrlen += IEEE80211_HT_CTL_LEN;
+ goto out;
+ }
+
+ if (ieee80211_is_ctl(fc)) {
+ /*
+ * ACK and CTS are 10 bytes, all others 16. To see how
+ * to get this condition consider
+ * subtype mask: 0b0000000011110000 (0x00F0)
+ * ACK subtype: 0b0000000011010000 (0x00D0)
+ * CTS subtype: 0b0000000011000000 (0x00C0)
+ * bits that matter: ^^^ (0x00E0)
+ * value of those: 0b0000000011000000 (0x00C0)
+ */
+ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0))
+ hdrlen = 10;
+ else
+ hdrlen = 16;
+ }
+out:
+ return hdrlen;
+}
+EXPORT_SYMBOL(ieee80211_hdrlen);
+
+unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb)
+{
+ const struct ieee80211_hdr *hdr =
+ (const struct ieee80211_hdr *)skb->data;
+ unsigned int hdrlen;
+
+ if (unlikely(skb->len < 10))
+ return 0;
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ if (unlikely(hdrlen > skb->len))
+ return 0;
+ return hdrlen;
+}
+EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb);
+
+unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+{
+ int ae = meshhdr->flags & MESH_FLAGS_AE;
+ /* 802.11-2012, 8.2.4.7.3 */
+ switch (ae) {
+ default:
+ case 0:
+ return 6;
+ case MESH_FLAGS_AE_A4:
+ return 12;
+ case MESH_FLAGS_AE_A5_A6:
+ return 18;
+ }
+}
+EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);
+
+int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
+ enum nl80211_iftype iftype)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ u16 hdrlen, ethertype;
+ u8 *payload;
+ u8 dst[ETH_ALEN];
+ u8 src[ETH_ALEN] __aligned(2);
+
+ if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+ return -1;
+
+ hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+ /* convert IEEE 802.11 header + possible LLC headers into Ethernet
+ * header
+ * IEEE 802.11 address fields:
+ * ToDS FromDS Addr1 Addr2 Addr3 Addr4
+ * 0 0 DA SA BSSID n/a
+ * 0 1 DA BSSID SA n/a
+ * 1 0 BSSID SA DA n/a
+ * 1 1 RA TA DA SA
+ */
+ memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN);
+ memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN);
+
+ switch (hdr->frame_control &
+ cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
+ case cpu_to_le16(IEEE80211_FCTL_TODS):
+ if (unlikely(iftype != NL80211_IFTYPE_AP &&
+ iftype != NL80211_IFTYPE_AP_VLAN &&
+ iftype != NL80211_IFTYPE_P2P_GO))
+ return -1;
+ break;
+ case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
+ if (unlikely(iftype != NL80211_IFTYPE_WDS &&
+ iftype != NL80211_IFTYPE_MESH_POINT &&
+ iftype != NL80211_IFTYPE_AP_VLAN &&
+ iftype != NL80211_IFTYPE_STATION))
+ return -1;
+ if (iftype == NL80211_IFTYPE_MESH_POINT) {
+ struct ieee80211s_hdr *meshdr =
+ (struct ieee80211s_hdr *) (skb->data + hdrlen);
+ /* make sure meshdr->flags is on the linear part */
+ if (!pskb_may_pull(skb, hdrlen + 1))
+ return -1;
+ if (meshdr->flags & MESH_FLAGS_AE_A4)
+ return -1;
+ if (meshdr->flags & MESH_FLAGS_AE_A5_A6) {
+ skb_copy_bits(skb, hdrlen +
+ offsetof(struct ieee80211s_hdr, eaddr1),
+ dst, ETH_ALEN);
+ skb_copy_bits(skb, hdrlen +
+ offsetof(struct ieee80211s_hdr, eaddr2),
+ src, ETH_ALEN);
+ }
+ hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+ }
+ break;
+ case cpu_to_le16(IEEE80211_FCTL_FROMDS):
+ if ((iftype != NL80211_IFTYPE_STATION &&
+ iftype != NL80211_IFTYPE_P2P_CLIENT &&
+ iftype != NL80211_IFTYPE_MESH_POINT) ||
+ (is_multicast_ether_addr(dst) &&
+ ether_addr_equal(src, addr)))
+ return -1;
+ if (iftype == NL80211_IFTYPE_MESH_POINT) {
+ struct ieee80211s_hdr *meshdr =
+ (struct ieee80211s_hdr *) (skb->data + hdrlen);
+ /* make sure meshdr->flags is on the linear part */
+ if (!pskb_may_pull(skb, hdrlen + 1))
+ return -1;
+ if (meshdr->flags & MESH_FLAGS_AE_A5_A6)
+ return -1;
+ if (meshdr->flags & MESH_FLAGS_AE_A4)
+ skb_copy_bits(skb, hdrlen +
+ offsetof(struct ieee80211s_hdr, eaddr1),
+ src, ETH_ALEN);
+ hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+ }
+ break;
+ case cpu_to_le16(0):
+ if (iftype != NL80211_IFTYPE_ADHOC &&
+ iftype != NL80211_IFTYPE_STATION &&
+ iftype != NL80211_IFTYPE_OCB)
+ return -1;
+ break;
+ }
+
+ if (!pskb_may_pull(skb, hdrlen + 8))
+ return -1;
+
+ payload = skb->data + hdrlen;
+ ethertype = (payload[6] << 8) | payload[7];
+
+ if (likely((ether_addr_equal(payload, rfc1042_header) &&
+ ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+ ether_addr_equal(payload, bridge_tunnel_header))) {
+ /* remove RFC1042 or Bridge-Tunnel encapsulation and
+ * replace EtherType */
+ skb_pull(skb, hdrlen + 6);
+ memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+ } else {
+ struct ethhdr *ehdr;
+ __be16 len;
+
+ skb_pull(skb, hdrlen);
+ len = htons(skb->len);
+ ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr));
+ memcpy(ehdr->h_dest, dst, ETH_ALEN);
+ memcpy(ehdr->h_source, src, ETH_ALEN);
+ ehdr->h_proto = len;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_data_to_8023);
+
+int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
+ enum nl80211_iftype iftype,
+ const u8 *bssid, bool qos)
+{
+ struct ieee80211_hdr hdr;
+ u16 hdrlen, ethertype;
+ __le16 fc;
+ const u8 *encaps_data;
+ int encaps_len, skip_header_bytes;
+ int nh_pos, h_pos;
+ int head_need;
+
+ if (unlikely(skb->len < ETH_HLEN))
+ return -EINVAL;
+
+ nh_pos = skb_network_header(skb) - skb->data;
+ h_pos = skb_transport_header(skb) - skb->data;
+
+ /* convert Ethernet header to proper 802.11 header (based on
+ * operation mode) */
+ ethertype = (skb->data[12] << 8) | skb->data[13];
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+ switch (iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+ /* DA BSSID SA */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, addr, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
+ hdrlen = 24;
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+ /* BSSID SA DA */
+ memcpy(hdr.addr1, bssid, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ memcpy(hdr.addr3, skb->data, ETH_ALEN);
+ hdrlen = 24;
+ break;
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_ADHOC:
+ /* DA SA BSSID */
+ memcpy(hdr.addr1, skb->data, ETH_ALEN);
+ memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+ memcpy(hdr.addr3, bssid, ETH_ALEN);
+ hdrlen = 24;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (qos) {
+ fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+ hdrlen += 2;
+ }
+
+ hdr.frame_control = fc;
+ hdr.duration_id = 0;
+ hdr.seq_ctrl = 0;
+
+ skip_header_bytes = ETH_HLEN;
+ if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) {
+ encaps_data = bridge_tunnel_header;
+ encaps_len = sizeof(bridge_tunnel_header);
+ skip_header_bytes -= 2;
+ } else if (ethertype >= ETH_P_802_3_MIN) {
+ encaps_data = rfc1042_header;
+ encaps_len = sizeof(rfc1042_header);
+ skip_header_bytes -= 2;
+ } else {
+ encaps_data = NULL;
+ encaps_len = 0;
+ }
+
+ skb_pull(skb, skip_header_bytes);
+ nh_pos -= skip_header_bytes;
+ h_pos -= skip_header_bytes;
+
+ head_need = hdrlen + encaps_len - skb_headroom(skb);
+
+ if (head_need > 0 || skb_cloned(skb)) {
+ head_need = max(head_need, 0);
+ if (head_need)
+ skb_orphan(skb);
+
+ if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ skb->truesize += head_need;
+ }
+
+ if (encaps_data) {
+ memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
+ nh_pos += encaps_len;
+ h_pos += encaps_len;
+ }
+
+ memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
+
+ nh_pos += hdrlen;
+ h_pos += hdrlen;
+
+ /* Update skb pointers to various headers since this modified frame
+ * is going to go through Linux networking code that may potentially
+ * need things like pointer to IP header. */
+ skb_set_mac_header(skb, 0);
+ skb_set_network_header(skb, nh_pos);
+ skb_set_transport_header(skb, h_pos);
+
+ return 0;
+}
+EXPORT_SYMBOL(ieee80211_data_from_8023);
+
+
+void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
+ const u8 *addr, enum nl80211_iftype iftype,
+ const unsigned int extra_headroom,
+ bool has_80211_header)
+{
+ struct sk_buff *frame = NULL;
+ u16 ethertype;
+ u8 *payload;
+ const struct ethhdr *eth;
+ int remaining, err;
+ u8 dst[ETH_ALEN], src[ETH_ALEN];
+
+ if (has_80211_header) {
+ err = ieee80211_data_to_8023(skb, addr, iftype);
+ if (err)
+ goto out;
+
+ /* skip the wrapping header */
+ eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr));
+ if (!eth)
+ goto out;
+ } else {
+ eth = (struct ethhdr *) skb->data;
+ }
+
+ while (skb != frame) {
+ u8 padding;
+ __be16 len = eth->h_proto;
+ unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len);
+
+ remaining = skb->len;
+ memcpy(dst, eth->h_dest, ETH_ALEN);
+ memcpy(src, eth->h_source, ETH_ALEN);
+
+ padding = (4 - subframe_len) & 0x3;
+ /* the last MSDU has no padding */
+ if (subframe_len > remaining)
+ goto purge;
+
+ skb_pull(skb, sizeof(struct ethhdr));
+ /* reuse skb for the last subframe */
+ if (remaining <= subframe_len + padding)
+ frame = skb;
+ else {
+ unsigned int hlen = ALIGN(extra_headroom, 4);
+ /*
+ * Allocate and reserve two bytes more for payload
+ * alignment since sizeof(struct ethhdr) is 14.
+ */
+ frame = dev_alloc_skb(hlen + subframe_len + 2);
+ if (!frame)
+ goto purge;
+
+ skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
+ memcpy(skb_put(frame, ntohs(len)), skb->data,
+ ntohs(len));
+
+ eth = (struct ethhdr *)skb_pull(skb, ntohs(len) +
+ padding);
+ if (!eth) {
+ dev_kfree_skb(frame);
+ goto purge;
+ }
+ }
+
+ skb_reset_network_header(frame);
+ frame->dev = skb->dev;
+ frame->priority = skb->priority;
+
+ payload = frame->data;
+ ethertype = (payload[6] << 8) | payload[7];
+
+ if (likely((ether_addr_equal(payload, rfc1042_header) &&
+ ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+ ether_addr_equal(payload, bridge_tunnel_header))) {
+ /* remove RFC1042 or Bridge-Tunnel
+ * encapsulation and replace EtherType */
+ skb_pull(frame, 6);
+ memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+ } else {
+ memcpy(skb_push(frame, sizeof(__be16)), &len,
+ sizeof(__be16));
+ memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+ }
+ __skb_queue_tail(list, frame);
+ }
+
+ return;
+
+ purge:
+ __skb_queue_purge(list);
+ out:
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_amsdu_to_8023s);
+
+/* Given a data frame determine the 802.1p/1d tag to use. */
+unsigned int cfg80211_classify8021d(struct sk_buff *skb,
+ struct cfg80211_qos_map *qos_map)
+{
+ unsigned int dscp;
+ unsigned char vlan_priority;
+
+ /* skb->priority values from 256->263 are magic values to
+ * directly indicate a specific 802.1d priority. This is used
+ * to allow 802.1d priority to be passed directly in from VLAN
+ * tags, etc.
+ */
+ if (skb->priority >= 256 && skb->priority <= 263)
+ return skb->priority - 256;
+
+ if (skb_vlan_tag_present(skb)) {
+ vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK)
+ >> VLAN_PRIO_SHIFT;
+ if (vlan_priority > 0)
+ return vlan_priority;
+ }
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc;
+ break;
+ case htons(ETH_P_IPV6):
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc;
+ break;
+ case htons(ETH_P_MPLS_UC):
+ case htons(ETH_P_MPLS_MC): {
+ struct mpls_label mpls_tmp, *mpls;
+
+ mpls = skb_header_pointer(skb, sizeof(struct ethhdr),
+ sizeof(*mpls), &mpls_tmp);
+ if (!mpls)
+ return 0;
+
+ return (ntohl(mpls->entry) & MPLS_LS_TC_MASK)
+ >> MPLS_LS_TC_SHIFT;
+ }
+ case htons(ETH_P_80221):
+ /* 802.21 is always network control traffic */
+ return 7;
+ default:
+ return 0;
+ }
+
+ if (qos_map) {
+ unsigned int i, tmp_dscp = dscp >> 2;
+
+ for (i = 0; i < qos_map->num_des; i++) {
+ if (tmp_dscp == qos_map->dscp_exception[i].dscp)
+ return qos_map->dscp_exception[i].up;
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (tmp_dscp >= qos_map->up[i].low &&
+ tmp_dscp <= qos_map->up[i].high)
+ return i;
+ }
+ }
+
+ return dscp >> 5;
+}
+EXPORT_SYMBOL(cfg80211_classify8021d);
+
+const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie)
+{
+ const struct cfg80211_bss_ies *ies;
+
+ ies = rcu_dereference(bss->ies);
+ if (!ies)
+ return NULL;
+
+ return cfg80211_find_ie(ie, ies->data, ies->len);
+}
+EXPORT_SYMBOL(ieee80211_bss_get_ie);
+
+void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
+{
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct net_device *dev = wdev->netdev;
+ int i;
+
+ if (!wdev->connect_keys)
+ return;
+
+ for (i = 0; i < 6; i++) {
+ if (!wdev->connect_keys->params[i].cipher)
+ continue;
+ if (rdev_add_key(rdev, dev, i, false, NULL,
+ &wdev->connect_keys->params[i])) {
+ netdev_err(dev, "failed to set key %d\n", i);
+ continue;
+ }
+ if (wdev->connect_keys->def == i)
+ if (rdev_set_default_key(rdev, dev, i, true, true)) {
+ netdev_err(dev, "failed to set defkey %d\n", i);
+ continue;
+ }
+ if (wdev->connect_keys->defmgmt == i)
+ if (rdev_set_default_mgmt_key(rdev, dev, i))
+ netdev_err(dev, "failed to set mgtdef %d\n", i);
+ }
+
+ kzfree(wdev->connect_keys);
+ wdev->connect_keys = NULL;
+}
+
+void cfg80211_process_wdev_events(struct wireless_dev *wdev)
+{
+ struct cfg80211_event *ev;
+ unsigned long flags;
+ const u8 *bssid = NULL;
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ while (!list_empty(&wdev->event_list)) {
+ ev = list_first_entry(&wdev->event_list,
+ struct cfg80211_event, list);
+ list_del(&ev->list);
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+
+ wdev_lock(wdev);
+ switch (ev->type) {
+ case EVENT_CONNECT_RESULT:
+ if (!is_zero_ether_addr(ev->cr.bssid))
+ bssid = ev->cr.bssid;
+ __cfg80211_connect_result(
+ wdev->netdev, bssid,
+ ev->cr.req_ie, ev->cr.req_ie_len,
+ ev->cr.resp_ie, ev->cr.resp_ie_len,
+ ev->cr.status,
+ ev->cr.status == WLAN_STATUS_SUCCESS,
+ NULL);
+ break;
+ case EVENT_ROAMED:
+ __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie,
+ ev->rm.req_ie_len, ev->rm.resp_ie,
+ ev->rm.resp_ie_len);
+ break;
+ case EVENT_DISCONNECTED:
+ __cfg80211_disconnected(wdev->netdev,
+ ev->dc.ie, ev->dc.ie_len,
+ ev->dc.reason, true);
+ break;
+ case EVENT_IBSS_JOINED:
+ __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid,
+ ev->ij.channel);
+ break;
+ case EVENT_STOPPED:
+ __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
+ break;
+ }
+ wdev_unlock(wdev);
+
+ kfree(ev);
+
+ spin_lock_irqsave(&wdev->event_lock, flags);
+ }
+ spin_unlock_irqrestore(&wdev->event_lock, flags);
+}
+
+void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev)
+{
+ struct wireless_dev *wdev;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list)
+ cfg80211_process_wdev_events(wdev);
+}
+
+int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, enum nl80211_iftype ntype,
+ u32 *flags, struct vif_params *params)
+{
+ int err;
+ enum nl80211_iftype otype = dev->ieee80211_ptr->iftype;
+
+ ASSERT_RTNL();
+
+ /* don't support changing VLANs, you just re-create them */
+ if (otype == NL80211_IFTYPE_AP_VLAN)
+ return -EOPNOTSUPP;
+
+ /* cannot change into P2P device type */
+ if (ntype == NL80211_IFTYPE_P2P_DEVICE)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->change_virtual_intf ||
+ !(rdev->wiphy.interface_modes & (1 << ntype)))
+ return -EOPNOTSUPP;
+
+ /* if it's part of a bridge, reject changing type to station/ibss */
+ if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
+ (ntype == NL80211_IFTYPE_ADHOC ||
+ ntype == NL80211_IFTYPE_STATION ||
+ ntype == NL80211_IFTYPE_P2P_CLIENT))
+ return -EBUSY;
+
+ if (ntype != otype) {
+ dev->ieee80211_ptr->use_4addr = false;
+ dev->ieee80211_ptr->mesh_id_up_len = 0;
+ wdev_lock(dev->ieee80211_ptr);
+ rdev_set_qos_map(rdev, dev, NULL);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ switch (otype) {
+ case NL80211_IFTYPE_AP:
+ cfg80211_stop_ap(rdev, dev, true);
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ cfg80211_leave_ibss(rdev, dev, false);
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ wdev_lock(dev->ieee80211_ptr);
+ cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, true);
+ wdev_unlock(dev->ieee80211_ptr);
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ /* mesh should be handled? */
+ break;
+ default:
+ break;
+ }
+
+ cfg80211_process_rdev_events(rdev);
+ }
+
+ err = rdev_change_virtual_intf(rdev, dev, ntype, flags, params);
+
+ WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype);
+
+ if (!err && params && params->use_4addr != -1)
+ dev->ieee80211_ptr->use_4addr = params->use_4addr;
+
+ if (!err) {
+ dev->priv_flags &= ~IFF_DONT_BRIDGE;
+ switch (ntype) {
+ case NL80211_IFTYPE_STATION:
+ if (dev->ieee80211_ptr->use_4addr)
+ break;
+ /* fall through */
+ case NL80211_IFTYPE_OCB:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_ADHOC:
+ dev->priv_flags |= IFF_DONT_BRIDGE;
+ break;
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_WDS:
+ case NL80211_IFTYPE_MESH_POINT:
+ /* bridging OK */
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ /* monitor can't bridge anyway */
+ break;
+ case NL80211_IFTYPE_UNSPECIFIED:
+ case NUM_NL80211_IFTYPES:
+ /* not happening */
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ if (!err && ntype != otype && netif_running(dev)) {
+ cfg80211_update_iface_num(rdev, ntype, 1);
+ cfg80211_update_iface_num(rdev, otype, -1);
+ }
+
+ return err;
+}
+
+static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate)
+{
+ static const u32 __mcs2bitrate[] = {
+ /* control PHY */
+ [0] = 275,
+ /* SC PHY */
+ [1] = 3850,
+ [2] = 7700,
+ [3] = 9625,
+ [4] = 11550,
+ [5] = 12512, /* 1251.25 mbps */
+ [6] = 15400,
+ [7] = 19250,
+ [8] = 23100,
+ [9] = 25025,
+ [10] = 30800,
+ [11] = 38500,
+ [12] = 46200,
+ /* OFDM PHY */
+ [13] = 6930,
+ [14] = 8662, /* 866.25 mbps */
+ [15] = 13860,
+ [16] = 17325,
+ [17] = 20790,
+ [18] = 27720,
+ [19] = 34650,
+ [20] = 41580,
+ [21] = 45045,
+ [22] = 51975,
+ [23] = 62370,
+ [24] = 67568, /* 6756.75 mbps */
+ /* LP-SC PHY */
+ [25] = 6260,
+ [26] = 8340,
+ [27] = 11120,
+ [28] = 12510,
+ [29] = 16680,
+ [30] = 22240,
+ [31] = 25030,
+ };
+
+ if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate)))
+ return 0;
+
+ return __mcs2bitrate[rate->mcs];
+}
+
+static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
+{
+ static const u32 base[4][10] = {
+ { 6500000,
+ 13000000,
+ 19500000,
+ 26000000,
+ 39000000,
+ 52000000,
+ 58500000,
+ 65000000,
+ 78000000,
+ 0,
+ },
+ { 13500000,
+ 27000000,
+ 40500000,
+ 54000000,
+ 81000000,
+ 108000000,
+ 121500000,
+ 135000000,
+ 162000000,
+ 180000000,
+ },
+ { 29300000,
+ 58500000,
+ 87800000,
+ 117000000,
+ 175500000,
+ 234000000,
+ 263300000,
+ 292500000,
+ 351000000,
+ 390000000,
+ },
+ { 58500000,
+ 117000000,
+ 175500000,
+ 234000000,
+ 351000000,
+ 468000000,
+ 526500000,
+ 585000000,
+ 702000000,
+ 780000000,
+ },
+ };
+ u32 bitrate;
+ int idx;
+
+ if (WARN_ON_ONCE(rate->mcs > 9))
+ return 0;
+
+ switch (rate->bw) {
+ case RATE_INFO_BW_160:
+ idx = 3;
+ break;
+ case RATE_INFO_BW_80:
+ idx = 2;
+ break;
+ case RATE_INFO_BW_40:
+ idx = 1;
+ break;
+ case RATE_INFO_BW_5:
+ case RATE_INFO_BW_10:
+ default:
+ WARN_ON(1);
+ /* fall through */
+ case RATE_INFO_BW_20:
+ idx = 0;
+ }
+
+ bitrate = base[idx][rate->mcs];
+ bitrate *= rate->nss;
+
+ if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+ bitrate = (bitrate / 9) * 10;
+
+ /* do NOT round down here */
+ return (bitrate + 50000) / 100000;
+}
+
+u32 cfg80211_calculate_bitrate(struct rate_info *rate)
+{
+ int modulation, streams, bitrate;
+
+ if (!(rate->flags & RATE_INFO_FLAGS_MCS) &&
+ !(rate->flags & RATE_INFO_FLAGS_VHT_MCS))
+ return rate->legacy;
+ if (rate->flags & RATE_INFO_FLAGS_60G)
+ return cfg80211_calculate_bitrate_60g(rate);
+ if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
+ return cfg80211_calculate_bitrate_vht(rate);
+
+ /* the formula below does only work for MCS values smaller than 32 */
+ if (WARN_ON_ONCE(rate->mcs >= 32))
+ return 0;
+
+ modulation = rate->mcs & 7;
+ streams = (rate->mcs >> 3) + 1;
+
+ bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000;
+
+ if (modulation < 4)
+ bitrate *= (modulation + 1);
+ else if (modulation == 4)
+ bitrate *= (modulation + 2);
+ else
+ bitrate *= (modulation + 3);
+
+ bitrate *= streams;
+
+ if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+ bitrate = (bitrate / 9) * 10;
+
+ /* do NOT round down here */
+ return (bitrate + 50000) / 100000;
+}
+EXPORT_SYMBOL(cfg80211_calculate_bitrate);
+
+int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
+ enum ieee80211_p2p_attr_id attr,
+ u8 *buf, unsigned int bufsize)
+{
+ u8 *out = buf;
+ u16 attr_remaining = 0;
+ bool desired_attr = false;
+ u16 desired_len = 0;
+
+ while (len > 0) {
+ unsigned int iedatalen;
+ unsigned int copy;
+ const u8 *iedata;
+
+ if (len < 2)
+ return -EILSEQ;
+ iedatalen = ies[1];
+ if (iedatalen + 2 > len)
+ return -EILSEQ;
+
+ if (ies[0] != WLAN_EID_VENDOR_SPECIFIC)
+ goto cont;
+
+ if (iedatalen < 4)
+ goto cont;
+
+ iedata = ies + 2;
+
+ /* check WFA OUI, P2P subtype */
+ if (iedata[0] != 0x50 || iedata[1] != 0x6f ||
+ iedata[2] != 0x9a || iedata[3] != 0x09)
+ goto cont;
+
+ iedatalen -= 4;
+ iedata += 4;
+
+ /* check attribute continuation into this IE */
+ copy = min_t(unsigned int, attr_remaining, iedatalen);
+ if (copy && desired_attr) {
+ desired_len += copy;
+ if (out) {
+ memcpy(out, iedata, min(bufsize, copy));
+ out += min(bufsize, copy);
+ bufsize -= min(bufsize, copy);
+ }
+
+
+ if (copy == attr_remaining)
+ return desired_len;
+ }
+
+ attr_remaining -= copy;
+ if (attr_remaining)
+ goto cont;
+
+ iedatalen -= copy;
+ iedata += copy;
+
+ while (iedatalen > 0) {
+ u16 attr_len;
+
+ /* P2P attribute ID & size must fit */
+ if (iedatalen < 3)
+ return -EILSEQ;
+ desired_attr = iedata[0] == attr;
+ attr_len = get_unaligned_le16(iedata + 1);
+ iedatalen -= 3;
+ iedata += 3;
+
+ copy = min_t(unsigned int, attr_len, iedatalen);
+
+ if (desired_attr) {
+ desired_len += copy;
+ if (out) {
+ memcpy(out, iedata, min(bufsize, copy));
+ out += min(bufsize, copy);
+ bufsize -= min(bufsize, copy);
+ }
+
+ if (copy == attr_len)
+ return desired_len;
+ }
+
+ iedata += copy;
+ iedatalen -= copy;
+ attr_remaining = attr_len - copy;
+ }
+
+ cont:
+ len -= ies[1] + 2;
+ ies += ies[1] + 2;
+ }
+
+ if (attr_remaining && desired_attr)
+ return -EILSEQ;
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(cfg80211_get_p2p_attr);
+
+static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id)
+{
+ int i;
+
+ for (i = 0; i < n_ids; i++)
+ if (ids[i] == id)
+ return true;
+ return false;
+}
+
+size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
+ const u8 *ids, int n_ids,
+ const u8 *after_ric, int n_after_ric,
+ size_t offset)
+{
+ size_t pos = offset;
+
+ while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) {
+ if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) {
+ pos += 2 + ies[pos + 1];
+
+ while (pos < ielen &&
+ !ieee80211_id_in_list(after_ric, n_after_ric,
+ ies[pos]))
+ pos += 2 + ies[pos + 1];
+ } else {
+ pos += 2 + ies[pos + 1];
+ }
+ }
+
+ return pos;
+}
+EXPORT_SYMBOL(ieee80211_ie_split_ric);
+
+size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
+ const u8 *ids, int n_ids, size_t offset)
+{
+ return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset);
+}
+EXPORT_SYMBOL(ieee80211_ie_split);
+
+bool ieee80211_operating_class_to_band(u8 operating_class,
+ enum ieee80211_band *band)
+{
+ switch (operating_class) {
+ case 112:
+ case 115 ... 127:
+ case 128 ... 130:
+ *band = IEEE80211_BAND_5GHZ;
+ return true;
+ case 81:
+ case 82:
+ case 83:
+ case 84:
+ *band = IEEE80211_BAND_2GHZ;
+ return true;
+ case 180:
+ *band = IEEE80211_BAND_60GHZ;
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(ieee80211_operating_class_to_band);
+
+bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
+ u8 *op_class)
+{
+ u8 vht_opclass;
+ u16 freq = chandef->center_freq1;
+
+ if (freq >= 2412 && freq <= 2472) {
+ if (chandef->width > NL80211_CHAN_WIDTH_40)
+ return false;
+
+ /* 2.407 GHz, channels 1..13 */
+ if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ if (freq > chandef->chan->center_freq)
+ *op_class = 83; /* HT40+ */
+ else
+ *op_class = 84; /* HT40- */
+ } else {
+ *op_class = 81;
+ }
+
+ return true;
+ }
+
+ if (freq == 2484) {
+ if (chandef->width > NL80211_CHAN_WIDTH_40)
+ return false;
+
+ *op_class = 82; /* channel 14 */
+ return true;
+ }
+
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_80:
+ vht_opclass = 128;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ vht_opclass = 129;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ vht_opclass = 130;
+ break;
+ case NL80211_CHAN_WIDTH_10:
+ case NL80211_CHAN_WIDTH_5:
+ return false; /* unsupported for now */
+ default:
+ vht_opclass = 0;
+ break;
+ }
+
+ /* 5 GHz, channels 36..48 */
+ if (freq >= 5180 && freq <= 5240) {
+ if (vht_opclass) {
+ *op_class = vht_opclass;
+ } else if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ if (freq > chandef->chan->center_freq)
+ *op_class = 116;
+ else
+ *op_class = 117;
+ } else {
+ *op_class = 115;
+ }
+
+ return true;
+ }
+
+ /* 5 GHz, channels 52..64 */
+ if (freq >= 5260 && freq <= 5320) {
+ if (vht_opclass) {
+ *op_class = vht_opclass;
+ } else if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ if (freq > chandef->chan->center_freq)
+ *op_class = 119;
+ else
+ *op_class = 120;
+ } else {
+ *op_class = 118;
+ }
+
+ return true;
+ }
+
+ /* 5 GHz, channels 100..144 */
+ if (freq >= 5500 && freq <= 5720) {
+ if (vht_opclass) {
+ *op_class = vht_opclass;
+ } else if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ if (freq > chandef->chan->center_freq)
+ *op_class = 122;
+ else
+ *op_class = 123;
+ } else {
+ *op_class = 121;
+ }
+
+ return true;
+ }
+
+ /* 5 GHz, channels 149..169 */
+ if (freq >= 5745 && freq <= 5845) {
+ if (vht_opclass) {
+ *op_class = vht_opclass;
+ } else if (chandef->width == NL80211_CHAN_WIDTH_40) {
+ if (freq > chandef->chan->center_freq)
+ *op_class = 126;
+ else
+ *op_class = 127;
+ } else if (freq <= 5805) {
+ *op_class = 124;
+ } else {
+ *op_class = 125;
+ }
+
+ return true;
+ }
+
+ /* 56.16 GHz, channel 1..4 */
+ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) {
+ if (chandef->width >= NL80211_CHAN_WIDTH_40)
+ return false;
+
+ *op_class = 180;
+ return true;
+ }
+
+ /* not supported yet */
+ return false;
+}
+EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);
+
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+ u32 beacon_int)
+{
+ struct wireless_dev *wdev;
+ int res = 0;
+
+ if (!beacon_int)
+ return -EINVAL;
+
+ list_for_each_entry(wdev, &rdev->wdev_list, list) {
+ if (!wdev->beacon_interval)
+ continue;
+ if (wdev->beacon_interval != beacon_int) {
+ res = -EINVAL;
+ break;
+ }
+ }
+
+ return res;
+}
+
+int cfg80211_iter_combinations(struct wiphy *wiphy,
+ const int num_different_channels,
+ const u8 radar_detect,
+ const int iftype_num[NUM_NL80211_IFTYPES],
+ void (*iter)(const struct ieee80211_iface_combination *c,
+ void *data),
+ void *data)
+{
+ const struct ieee80211_regdomain *regdom;
+ enum nl80211_dfs_regions region = 0;
+ int i, j, iftype;
+ int num_interfaces = 0;
+ u32 used_iftypes = 0;
+
+ if (radar_detect) {
+ rcu_read_lock();
+ regdom = rcu_dereference(cfg80211_regdomain);
+ if (regdom)
+ region = regdom->dfs_region;
+ rcu_read_unlock();
+ }
+
+ for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
+ num_interfaces += iftype_num[iftype];
+ if (iftype_num[iftype] > 0 &&
+ !(wiphy->software_iftypes & BIT(iftype)))
+ used_iftypes |= BIT(iftype);
+ }
+
+ for (i = 0; i < wiphy->n_iface_combinations; i++) {
+ const struct ieee80211_iface_combination *c;
+ struct ieee80211_iface_limit *limits;
+ u32 all_iftypes = 0;
+
+ c = &wiphy->iface_combinations[i];
+
+ if (num_interfaces > c->max_interfaces)
+ continue;
+ if (num_different_channels > c->num_different_channels)
+ continue;
+
+ limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
+ GFP_KERNEL);
+ if (!limits)
+ return -ENOMEM;
+
+ for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
+ if (wiphy->software_iftypes & BIT(iftype))
+ continue;
+ for (j = 0; j < c->n_limits; j++) {
+ all_iftypes |= limits[j].types;
+ if (!(limits[j].types & BIT(iftype)))
+ continue;
+ if (limits[j].max < iftype_num[iftype])
+ goto cont;
+ limits[j].max -= iftype_num[iftype];
+ }
+ }
+
+ if (radar_detect != (c->radar_detect_widths & radar_detect))
+ goto cont;
+
+ if (radar_detect && c->radar_detect_regions &&
+ !(c->radar_detect_regions & BIT(region)))
+ goto cont;
+
+ /* Finally check that all iftypes that we're currently
+ * using are actually part of this combination. If they
+ * aren't then we can't use this combination and have
+ * to continue to the next.
+ */
+ if ((all_iftypes & used_iftypes) != used_iftypes)
+ goto cont;
+
+ /* This combination covered all interface types and
+ * supported the requested numbers, so we're good.
+ */
+
+ (*iter)(c, data);
+ cont:
+ kfree(limits);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_iter_combinations);
+
+static void
+cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c,
+ void *data)
+{
+ int *num = data;
+ (*num)++;
+}
+
+int cfg80211_check_combinations(struct wiphy *wiphy,
+ const int num_different_channels,
+ const u8 radar_detect,
+ const int iftype_num[NUM_NL80211_IFTYPES])
+{
+ int err, num = 0;
+
+ err = cfg80211_iter_combinations(wiphy, num_different_channels,
+ radar_detect, iftype_num,
+ cfg80211_iter_sum_ifcombs, &num);
+ if (err)
+ return err;
+ if (num == 0)
+ return -EBUSY;
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_check_combinations);
+
+int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev,
+ enum nl80211_iftype iftype,
+ struct ieee80211_channel *chan,
+ enum cfg80211_chan_mode chanmode,
+ u8 radar_detect)
+{
+ struct wireless_dev *wdev_iter;
+ int num[NUM_NL80211_IFTYPES];
+ struct ieee80211_channel
+ *used_channels[CFG80211_MAX_NUM_DIFFERENT_CHANNELS];
+ struct ieee80211_channel *ch;
+ enum cfg80211_chan_mode chmode;
+ int num_different_channels = 0;
+ int total = 1;
+ int i;
+
+ ASSERT_RTNL();
+
+ if (WARN_ON(hweight32(radar_detect) > 1))
+ return -EINVAL;
+
+ if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
+ return -EINVAL;
+
+ /* Always allow software iftypes */
+ if (rdev->wiphy.software_iftypes & BIT(iftype)) {
+ if (radar_detect)
+ return -EINVAL;
+ return 0;
+ }
+
+ memset(num, 0, sizeof(num));
+ memset(used_channels, 0, sizeof(used_channels));
+
+ num[iftype] = 1;
+
+ /* TODO: We'll probably not need this anymore, since this
+ * should only be called with CHAN_MODE_UNDEFINED. There are
+ * still a couple of pending calls where other chanmodes are
+ * used, but we should get rid of them.
+ */
+ switch (chanmode) {
+ case CHAN_MODE_UNDEFINED:
+ break;
+ case CHAN_MODE_SHARED:
+ WARN_ON(!chan);
+ used_channels[0] = chan;
+ num_different_channels++;
+ break;
+ case CHAN_MODE_EXCLUSIVE:
+ num_different_channels++;
+ break;
+ }
+
+ list_for_each_entry(wdev_iter, &rdev->wdev_list, list) {
+ if (wdev_iter == wdev)
+ continue;
+ if (wdev_iter->iftype == NL80211_IFTYPE_P2P_DEVICE) {
+ if (!wdev_iter->p2p_started)
+ continue;
+ } else if (wdev_iter->netdev) {
+ if (!netif_running(wdev_iter->netdev))
+ continue;
+ } else {
+ WARN_ON(1);
+ }
+
+ if (rdev->wiphy.software_iftypes & BIT(wdev_iter->iftype))
+ continue;
+
+ /*
+ * We may be holding the "wdev" mutex, but now need to lock
+ * wdev_iter. This is OK because once we get here wdev_iter
+ * is not wdev (tested above), but we need to use the nested
+ * locking for lockdep.
+ */
+ mutex_lock_nested(&wdev_iter->mtx, 1);
+ __acquire(wdev_iter->mtx);
+ cfg80211_get_chan_state(wdev_iter, &ch, &chmode, &radar_detect);
+ wdev_unlock(wdev_iter);
+
+ switch (chmode) {
+ case CHAN_MODE_UNDEFINED:
+ break;
+ case CHAN_MODE_SHARED:
+ for (i = 0; i < CFG80211_MAX_NUM_DIFFERENT_CHANNELS; i++)
+ if (!used_channels[i] || used_channels[i] == ch)
+ break;
+
+ if (i == CFG80211_MAX_NUM_DIFFERENT_CHANNELS)
+ return -EBUSY;
+
+ if (used_channels[i] == NULL) {
+ used_channels[i] = ch;
+ num_different_channels++;
+ }
+ break;
+ case CHAN_MODE_EXCLUSIVE:
+ num_different_channels++;
+ break;
+ }
+
+ num[wdev_iter->iftype]++;
+ total++;
+ }
+
+ if (total == 1 && !radar_detect)
+ return 0;
+
+ return cfg80211_check_combinations(&rdev->wiphy, num_different_channels,
+ radar_detect, num);
+}
+
+int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
+ const u8 *rates, unsigned int n_rates,
+ u32 *mask)
+{
+ int i, j;
+
+ if (!sband)
+ return -EINVAL;
+
+ if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES)
+ return -EINVAL;
+
+ *mask = 0;
+
+ for (i = 0; i < n_rates; i++) {
+ int rate = (rates[i] & 0x7f) * 5;
+ bool found = false;
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ if (sband->bitrates[j].bitrate == rate) {
+ found = true;
+ *mask |= BIT(j);
+ break;
+ }
+ }
+ if (!found)
+ return -EINVAL;
+ }
+
+ /*
+ * mask must have at least one bit set here since we
+ * didn't accept a 0-length rates array nor allowed
+ * entries in the array that didn't exist
+ */
+
+ return 0;
+}
+
+unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy)
+{
+ enum ieee80211_band band;
+ unsigned int n_channels = 0;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+ if (wiphy->bands[band])
+ n_channels += wiphy->bands[band]->n_channels;
+
+ return n_channels;
+}
+EXPORT_SYMBOL(ieee80211_get_num_supported_channels);
+
+int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
+ struct station_info *sinfo)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+
+ wdev = dev->ieee80211_ptr;
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ rdev = wiphy_to_rdev(wdev->wiphy);
+ if (!rdev->ops->get_station)
+ return -EOPNOTSUPP;
+
+ return rdev_get_station(rdev, dev, mac_addr, sinfo);
+}
+EXPORT_SYMBOL(cfg80211_get_station);
+
+/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
+/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
+const unsigned char rfc1042_header[] __aligned(2) =
+ { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
+EXPORT_SYMBOL(rfc1042_header);
+
+/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
+const unsigned char bridge_tunnel_header[] __aligned(2) =
+ { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
+EXPORT_SYMBOL(bridge_tunnel_header);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
new file mode 100644
index 000000000..fd682832a
--- /dev/null
+++ b/net/wireless/wext-compat.c
@@ -0,0 +1,1531 @@
+/*
+ * cfg80211 - wext compat code
+ *
+ * This is temporary code until all wireless functionality is migrated
+ * into cfg80211, when that happens all the exports here go away and
+ * we directly assign the wireless handlers of wireless interfaces.
+ *
+ * Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/export.h>
+#include <linux/wireless.h>
+#include <linux/nl80211.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <net/iw_handler.h>
+#include <net/cfg80211.h>
+#include <net/cfg80211-wext.h>
+#include "wext-compat.h"
+#include "core.h"
+#include "rdev-ops.h"
+
+int cfg80211_wext_giwname(struct net_device *dev,
+ struct iw_request_info *info,
+ char *name, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct ieee80211_supported_band *sband;
+ bool is_ht = false, is_a = false, is_b = false, is_g = false;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ];
+ if (sband) {
+ is_a = true;
+ is_ht |= sband->ht_cap.ht_supported;
+ }
+
+ sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ];
+ if (sband) {
+ int i;
+ /* Check for mandatory rates */
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (sband->bitrates[i].bitrate == 10)
+ is_b = true;
+ if (sband->bitrates[i].bitrate == 60)
+ is_g = true;
+ }
+ is_ht |= sband->ht_cap.ht_supported;
+ }
+
+ strcpy(name, "IEEE 802.11");
+ if (is_a)
+ strcat(name, "a");
+ if (is_b)
+ strcat(name, "b");
+ if (is_g)
+ strcat(name, "g");
+ if (is_ht)
+ strcat(name, "n");
+
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwname);
+
+int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
+ u32 *mode, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev;
+ struct vif_params vifparams;
+ enum nl80211_iftype type;
+
+ rdev = wiphy_to_rdev(wdev->wiphy);
+
+ switch (*mode) {
+ case IW_MODE_INFRA:
+ type = NL80211_IFTYPE_STATION;
+ break;
+ case IW_MODE_ADHOC:
+ type = NL80211_IFTYPE_ADHOC;
+ break;
+ case IW_MODE_REPEAT:
+ type = NL80211_IFTYPE_WDS;
+ break;
+ case IW_MODE_MONITOR:
+ type = NL80211_IFTYPE_MONITOR;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (type == wdev->iftype)
+ return 0;
+
+ memset(&vifparams, 0, sizeof(vifparams));
+
+ return cfg80211_change_iface(rdev, dev, type, NULL, &vifparams);
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode);
+
+int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
+ u32 *mode, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_AP:
+ *mode = IW_MODE_MASTER;
+ break;
+ case NL80211_IFTYPE_STATION:
+ *mode = IW_MODE_INFRA;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ *mode = IW_MODE_ADHOC;
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ *mode = IW_MODE_MONITOR;
+ break;
+ case NL80211_IFTYPE_WDS:
+ *mode = IW_MODE_REPEAT;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ *mode = IW_MODE_SECOND; /* FIXME */
+ break;
+ default:
+ *mode = IW_MODE_AUTO;
+ break;
+ }
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode);
+
+
+int cfg80211_wext_giwrange(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct iw_range *range = (struct iw_range *) extra;
+ enum ieee80211_band band;
+ int i, c = 0;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ data->length = sizeof(struct iw_range);
+ memset(range, 0, sizeof(struct iw_range));
+
+ range->we_version_compiled = WIRELESS_EXT;
+ range->we_version_source = 21;
+ range->retry_capa = IW_RETRY_LIMIT;
+ range->retry_flags = IW_RETRY_LIMIT;
+ range->min_retry = 0;
+ range->max_retry = 255;
+ range->min_rts = 0;
+ range->max_rts = 2347;
+ range->min_frag = 256;
+ range->max_frag = 2346;
+
+ range->max_encoding_tokens = 4;
+
+ range->max_qual.updated = IW_QUAL_NOISE_INVALID;
+
+ switch (wdev->wiphy->signal_type) {
+ case CFG80211_SIGNAL_TYPE_NONE:
+ break;
+ case CFG80211_SIGNAL_TYPE_MBM:
+ range->max_qual.level = (u8)-110;
+ range->max_qual.qual = 70;
+ range->avg_qual.qual = 35;
+ range->max_qual.updated |= IW_QUAL_DBM;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ range->max_qual.level = 100;
+ range->max_qual.qual = 100;
+ range->avg_qual.qual = 50;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ }
+
+ range->avg_qual.level = range->max_qual.level / 2;
+ range->avg_qual.noise = range->max_qual.noise / 2;
+ range->avg_qual.updated = range->max_qual.updated;
+
+ for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) {
+ switch (wdev->wiphy->cipher_suites[i]) {
+ case WLAN_CIPHER_SUITE_TKIP:
+ range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP |
+ IW_ENC_CAPA_WPA);
+ break;
+
+ case WLAN_CIPHER_SUITE_CCMP:
+ range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP |
+ IW_ENC_CAPA_WPA2);
+ break;
+
+ case WLAN_CIPHER_SUITE_WEP40:
+ range->encoding_size[range->num_encoding_sizes++] =
+ WLAN_KEY_LEN_WEP40;
+ break;
+
+ case WLAN_CIPHER_SUITE_WEP104:
+ range->encoding_size[range->num_encoding_sizes++] =
+ WLAN_KEY_LEN_WEP104;
+ break;
+ }
+ }
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = wdev->wiphy->bands[band];
+
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
+ struct ieee80211_channel *chan = &sband->channels[i];
+
+ if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
+ range->freq[c].i =
+ ieee80211_frequency_to_channel(
+ chan->center_freq);
+ range->freq[c].m = chan->center_freq;
+ range->freq[c].e = 6;
+ c++;
+ }
+ }
+ }
+ range->num_channels = c;
+ range->num_frequency = c;
+
+ IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
+
+ if (wdev->wiphy->max_scan_ssids > 0)
+ range->scan_capa |= IW_SCAN_CAPA_ESSID;
+
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange);
+
+
+/**
+ * cfg80211_wext_freq - get wext frequency for non-"auto"
+ * @dev: the net device
+ * @freq: the wext freq encoding
+ *
+ * Returns a frequency, or a negative error code, or 0 for auto.
+ */
+int cfg80211_wext_freq(struct iw_freq *freq)
+{
+ /*
+ * Parse frequency - return 0 for auto and
+ * -EINVAL for impossible things.
+ */
+ if (freq->e == 0) {
+ enum ieee80211_band band = IEEE80211_BAND_2GHZ;
+ if (freq->m < 0)
+ return 0;
+ if (freq->m > 14)
+ band = IEEE80211_BAND_5GHZ;
+ return ieee80211_channel_to_frequency(freq->m, band);
+ } else {
+ int i, div = 1000000;
+ for (i = 0; i < freq->e; i++)
+ div /= 10;
+ if (div <= 0)
+ return -EINVAL;
+ return freq->m / div;
+ }
+}
+
+int cfg80211_wext_siwrts(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *rts, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u32 orts = wdev->wiphy->rts_threshold;
+ int err;
+
+ if (rts->disabled || !rts->fixed)
+ wdev->wiphy->rts_threshold = (u32) -1;
+ else if (rts->value < 0)
+ return -EINVAL;
+ else
+ wdev->wiphy->rts_threshold = rts->value;
+
+ err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD);
+ if (err)
+ wdev->wiphy->rts_threshold = orts;
+
+ return err;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts);
+
+int cfg80211_wext_giwrts(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *rts, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ rts->value = wdev->wiphy->rts_threshold;
+ rts->disabled = rts->value == (u32) -1;
+ rts->fixed = 1;
+
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts);
+
+int cfg80211_wext_siwfrag(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *frag, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u32 ofrag = wdev->wiphy->frag_threshold;
+ int err;
+
+ if (frag->disabled || !frag->fixed)
+ wdev->wiphy->frag_threshold = (u32) -1;
+ else if (frag->value < 256)
+ return -EINVAL;
+ else {
+ /* Fragment length must be even, so strip LSB. */
+ wdev->wiphy->frag_threshold = frag->value & ~0x1;
+ }
+
+ err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD);
+ if (err)
+ wdev->wiphy->frag_threshold = ofrag;
+
+ return err;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag);
+
+int cfg80211_wext_giwfrag(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *frag, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ frag->value = wdev->wiphy->frag_threshold;
+ frag->disabled = frag->value == (u32) -1;
+ frag->fixed = 1;
+
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag);
+
+static int cfg80211_wext_siwretry(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *retry, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u32 changed = 0;
+ u8 olong = wdev->wiphy->retry_long;
+ u8 oshort = wdev->wiphy->retry_short;
+ int err;
+
+ if (retry->disabled || retry->value < 1 || retry->value > 255 ||
+ (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT)
+ return -EINVAL;
+
+ if (retry->flags & IW_RETRY_LONG) {
+ wdev->wiphy->retry_long = retry->value;
+ changed |= WIPHY_PARAM_RETRY_LONG;
+ } else if (retry->flags & IW_RETRY_SHORT) {
+ wdev->wiphy->retry_short = retry->value;
+ changed |= WIPHY_PARAM_RETRY_SHORT;
+ } else {
+ wdev->wiphy->retry_short = retry->value;
+ wdev->wiphy->retry_long = retry->value;
+ changed |= WIPHY_PARAM_RETRY_LONG;
+ changed |= WIPHY_PARAM_RETRY_SHORT;
+ }
+
+ if (!changed)
+ return 0;
+
+ err = rdev_set_wiphy_params(rdev, changed);
+ if (err) {
+ wdev->wiphy->retry_short = oshort;
+ wdev->wiphy->retry_long = olong;
+ }
+
+ return err;
+}
+
+int cfg80211_wext_giwretry(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *retry, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ retry->disabled = 0;
+
+ if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) {
+ /*
+ * First return short value, iwconfig will ask long value
+ * later if needed
+ */
+ retry->flags |= IW_RETRY_LIMIT | IW_RETRY_SHORT;
+ retry->value = wdev->wiphy->retry_short;
+ if (wdev->wiphy->retry_long == wdev->wiphy->retry_short)
+ retry->flags |= IW_RETRY_LONG;
+
+ return 0;
+ }
+
+ if (retry->flags & IW_RETRY_LONG) {
+ retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG;
+ retry->value = wdev->wiphy->retry_long;
+ }
+
+ return 0;
+}
+EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry);
+
+static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool pairwise,
+ const u8 *addr, bool remove, bool tx_key,
+ int idx, struct key_params *params)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err, i;
+ bool rejoin = false;
+
+ if (pairwise && !addr)
+ return -EINVAL;
+
+ if (!wdev->wext.keys) {
+ wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
+ GFP_KERNEL);
+ if (!wdev->wext.keys)
+ return -ENOMEM;
+ for (i = 0; i < 6; i++)
+ wdev->wext.keys->params[i].key =
+ wdev->wext.keys->data[i];
+ }
+
+ if (wdev->iftype != NL80211_IFTYPE_ADHOC &&
+ wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) {
+ if (!wdev->current_bss)
+ return -ENOLINK;
+
+ if (!rdev->ops->set_default_mgmt_key)
+ return -EOPNOTSUPP;
+
+ if (idx < 4 || idx > 5)
+ return -EINVAL;
+ } else if (idx < 0 || idx > 3)
+ return -EINVAL;
+
+ if (remove) {
+ err = 0;
+ if (wdev->current_bss) {
+ /*
+ * If removing the current TX key, we will need to
+ * join a new IBSS without the privacy bit clear.
+ */
+ if (idx == wdev->wext.default_key &&
+ wdev->iftype == NL80211_IFTYPE_ADHOC) {
+ __cfg80211_leave_ibss(rdev, wdev->netdev, true);
+ rejoin = true;
+ }
+
+ if (!pairwise && addr &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ err = -ENOENT;
+ else
+ err = rdev_del_key(rdev, dev, idx, pairwise,
+ addr);
+ }
+ wdev->wext.connect.privacy = false;
+ /*
+ * Applications using wireless extensions expect to be
+ * able to delete keys that don't exist, so allow that.
+ */
+ if (err == -ENOENT)
+ err = 0;
+ if (!err) {
+ if (!addr) {
+ memset(wdev->wext.keys->data[idx], 0,
+ sizeof(wdev->wext.keys->data[idx]));
+ wdev->wext.keys->params[idx].key_len = 0;
+ wdev->wext.keys->params[idx].cipher = 0;
+ }
+ if (idx == wdev->wext.default_key)
+ wdev->wext.default_key = -1;
+ else if (idx == wdev->wext.default_mgmt_key)
+ wdev->wext.default_mgmt_key = -1;
+ }
+
+ if (!err && rejoin)
+ err = cfg80211_ibss_wext_join(rdev, wdev);
+
+ return err;
+ }
+
+ if (addr)
+ tx_key = false;
+
+ if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr))
+ return -EINVAL;
+
+ err = 0;
+ if (wdev->current_bss)
+ err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
+ if (err)
+ return err;
+
+ if (!addr) {
+ wdev->wext.keys->params[idx] = *params;
+ memcpy(wdev->wext.keys->data[idx],
+ params->key, params->key_len);
+ wdev->wext.keys->params[idx].key =
+ wdev->wext.keys->data[idx];
+ }
+
+ if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
+ params->cipher == WLAN_CIPHER_SUITE_WEP104) &&
+ (tx_key || (!addr && wdev->wext.default_key == -1))) {
+ if (wdev->current_bss) {
+ /*
+ * If we are getting a new TX key from not having
+ * had one before we need to join a new IBSS with
+ * the privacy bit set.
+ */
+ if (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+ wdev->wext.default_key == -1) {
+ __cfg80211_leave_ibss(rdev, wdev->netdev, true);
+ rejoin = true;
+ }
+ err = rdev_set_default_key(rdev, dev, idx, true, true);
+ }
+ if (!err) {
+ wdev->wext.default_key = idx;
+ if (rejoin)
+ err = cfg80211_ibss_wext_join(rdev, wdev);
+ }
+ return err;
+ }
+
+ if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC &&
+ (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) {
+ if (wdev->current_bss)
+ err = rdev_set_default_mgmt_key(rdev, dev, idx);
+ if (!err)
+ wdev->wext.default_mgmt_key = idx;
+ return err;
+ }
+
+ return 0;
+}
+
+static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool pairwise,
+ const u8 *addr, bool remove, bool tx_key,
+ int idx, struct key_params *params)
+{
+ int err;
+
+ wdev_lock(dev->ieee80211_ptr);
+ err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
+ remove, tx_key, idx, params);
+ wdev_unlock(dev->ieee80211_ptr);
+
+ return err;
+}
+
+static int cfg80211_wext_siwencode(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *erq, char *keybuf)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int idx, err;
+ bool remove = false;
+ struct key_params params;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
+
+ /* no use -- only MFP (set_default_mgmt_key) is optional */
+ if (!rdev->ops->del_key ||
+ !rdev->ops->add_key ||
+ !rdev->ops->set_default_key)
+ return -EOPNOTSUPP;
+
+ idx = erq->flags & IW_ENCODE_INDEX;
+ if (idx == 0) {
+ idx = wdev->wext.default_key;
+ if (idx < 0)
+ idx = 0;
+ } else if (idx < 1 || idx > 4)
+ return -EINVAL;
+ else
+ idx--;
+
+ if (erq->flags & IW_ENCODE_DISABLED)
+ remove = true;
+ else if (erq->length == 0) {
+ /* No key data - just set the default TX key index */
+ err = 0;
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ err = rdev_set_default_key(rdev, dev, idx, true,
+ true);
+ if (!err)
+ wdev->wext.default_key = idx;
+ wdev_unlock(wdev);
+ return err;
+ }
+
+ memset(&params, 0, sizeof(params));
+ params.key = keybuf;
+ params.key_len = erq->length;
+ if (erq->length == 5)
+ params.cipher = WLAN_CIPHER_SUITE_WEP40;
+ else if (erq->length == 13)
+ params.cipher = WLAN_CIPHER_SUITE_WEP104;
+ else if (!remove)
+ return -EINVAL;
+
+ return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
+ wdev->wext.default_key == -1,
+ idx, &params);
+}
+
+static int cfg80211_wext_siwencodeext(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *erq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct iw_encode_ext *ext = (struct iw_encode_ext *) extra;
+ const u8 *addr;
+ int idx;
+ bool remove = false;
+ struct key_params params;
+ u32 cipher;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
+
+ /* no use -- only MFP (set_default_mgmt_key) is optional */
+ if (!rdev->ops->del_key ||
+ !rdev->ops->add_key ||
+ !rdev->ops->set_default_key)
+ return -EOPNOTSUPP;
+
+ switch (ext->alg) {
+ case IW_ENCODE_ALG_NONE:
+ remove = true;
+ cipher = 0;
+ break;
+ case IW_ENCODE_ALG_WEP:
+ if (ext->key_len == 5)
+ cipher = WLAN_CIPHER_SUITE_WEP40;
+ else if (ext->key_len == 13)
+ cipher = WLAN_CIPHER_SUITE_WEP104;
+ else
+ return -EINVAL;
+ break;
+ case IW_ENCODE_ALG_TKIP:
+ cipher = WLAN_CIPHER_SUITE_TKIP;
+ break;
+ case IW_ENCODE_ALG_CCMP:
+ cipher = WLAN_CIPHER_SUITE_CCMP;
+ break;
+ case IW_ENCODE_ALG_AES_CMAC:
+ cipher = WLAN_CIPHER_SUITE_AES_CMAC;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (erq->flags & IW_ENCODE_DISABLED)
+ remove = true;
+
+ idx = erq->flags & IW_ENCODE_INDEX;
+ if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) {
+ if (idx < 4 || idx > 5) {
+ idx = wdev->wext.default_mgmt_key;
+ if (idx < 0)
+ return -EINVAL;
+ } else
+ idx--;
+ } else {
+ if (idx < 1 || idx > 4) {
+ idx = wdev->wext.default_key;
+ if (idx < 0)
+ return -EINVAL;
+ } else
+ idx--;
+ }
+
+ addr = ext->addr.sa_data;
+ if (is_broadcast_ether_addr(addr))
+ addr = NULL;
+
+ memset(&params, 0, sizeof(params));
+ params.key = ext->key;
+ params.key_len = ext->key_len;
+ params.cipher = cipher;
+
+ if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) {
+ params.seq = ext->rx_seq;
+ params.seq_len = 6;
+ }
+
+ return cfg80211_set_encryption(
+ rdev, dev,
+ !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
+ addr, remove,
+ ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
+ idx, &params);
+}
+
+static int cfg80211_wext_giwencode(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *erq, char *keybuf)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int idx;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
+
+ idx = erq->flags & IW_ENCODE_INDEX;
+ if (idx == 0) {
+ idx = wdev->wext.default_key;
+ if (idx < 0)
+ idx = 0;
+ } else if (idx < 1 || idx > 4)
+ return -EINVAL;
+ else
+ idx--;
+
+ erq->flags = idx + 1;
+
+ if (!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) {
+ erq->flags |= IW_ENCODE_DISABLED;
+ erq->length = 0;
+ return 0;
+ }
+
+ erq->length = min_t(size_t, erq->length,
+ wdev->wext.keys->params[idx].key_len);
+ memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length);
+ erq->flags |= IW_ENCODE_ENABLED;
+
+ return 0;
+}
+
+static int cfg80211_wext_siwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *wextfreq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_chan_def chandef = {
+ .width = NL80211_CHAN_WIDTH_20_NOHT,
+ };
+ int freq;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra);
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra);
+ case NL80211_IFTYPE_MONITOR:
+ freq = cfg80211_wext_freq(wextfreq);
+ if (freq < 0)
+ return freq;
+ if (freq == 0)
+ return -EINVAL;
+ chandef.center_freq1 = freq;
+ chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
+ if (!chandef.chan)
+ return -EINVAL;
+ return cfg80211_set_monitor_channel(rdev, &chandef);
+ case NL80211_IFTYPE_MESH_POINT:
+ freq = cfg80211_wext_freq(wextfreq);
+ if (freq < 0)
+ return freq;
+ if (freq == 0)
+ return -EINVAL;
+ chandef.center_freq1 = freq;
+ chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
+ if (!chandef.chan)
+ return -EINVAL;
+ return cfg80211_set_mesh_channel(rdev, wdev, &chandef);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_giwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_chan_def chandef;
+ int ret;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra);
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra);
+ case NL80211_IFTYPE_MONITOR:
+ if (!rdev->ops->get_channel)
+ return -EINVAL;
+
+ ret = rdev_get_channel(rdev, wdev, &chandef);
+ if (ret)
+ return ret;
+ freq->m = chandef.chan->center_freq;
+ freq->e = 6;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int cfg80211_wext_siwtxpower(struct net_device *dev,
+ struct iw_request_info *info,
+ union iwreq_data *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ enum nl80211_tx_power_setting type;
+ int dbm = 0;
+
+ if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
+ return -EINVAL;
+ if (data->txpower.flags & IW_TXPOW_RANGE)
+ return -EINVAL;
+
+ if (!rdev->ops->set_tx_power)
+ return -EOPNOTSUPP;
+
+ /* only change when not disabling */
+ if (!data->txpower.disabled) {
+ rfkill_set_sw_state(rdev->rfkill, false);
+
+ if (data->txpower.fixed) {
+ /*
+ * wext doesn't support negative values, see
+ * below where it's for automatic
+ */
+ if (data->txpower.value < 0)
+ return -EINVAL;
+ dbm = data->txpower.value;
+ type = NL80211_TX_POWER_FIXED;
+ /* TODO: do regulatory check! */
+ } else {
+ /*
+ * Automatic power level setting, max being the value
+ * passed in from userland.
+ */
+ if (data->txpower.value < 0) {
+ type = NL80211_TX_POWER_AUTOMATIC;
+ } else {
+ dbm = data->txpower.value;
+ type = NL80211_TX_POWER_LIMITED;
+ }
+ }
+ } else {
+ rfkill_set_sw_state(rdev->rfkill, true);
+ schedule_work(&rdev->rfkill_sync);
+ return 0;
+ }
+
+ return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm));
+}
+
+static int cfg80211_wext_giwtxpower(struct net_device *dev,
+ struct iw_request_info *info,
+ union iwreq_data *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int err, val;
+
+ if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
+ return -EINVAL;
+ if (data->txpower.flags & IW_TXPOW_RANGE)
+ return -EINVAL;
+
+ if (!rdev->ops->get_tx_power)
+ return -EOPNOTSUPP;
+
+ err = rdev_get_tx_power(rdev, wdev, &val);
+ if (err)
+ return err;
+
+ /* well... oh well */
+ data->txpower.fixed = 1;
+ data->txpower.disabled = rfkill_blocked(rdev->rfkill);
+ data->txpower.value = val;
+ data->txpower.flags = IW_TXPOW_DBM;
+
+ return 0;
+}
+
+static int cfg80211_set_auth_alg(struct wireless_dev *wdev,
+ s32 auth_alg)
+{
+ int nr_alg = 0;
+
+ if (!auth_alg)
+ return -EINVAL;
+
+ if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM |
+ IW_AUTH_ALG_SHARED_KEY |
+ IW_AUTH_ALG_LEAP))
+ return -EINVAL;
+
+ if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) {
+ nr_alg++;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM;
+ }
+
+ if (auth_alg & IW_AUTH_ALG_SHARED_KEY) {
+ nr_alg++;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY;
+ }
+
+ if (auth_alg & IW_AUTH_ALG_LEAP) {
+ nr_alg++;
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP;
+ }
+
+ if (nr_alg > 1)
+ wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+
+ return 0;
+}
+
+static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions)
+{
+ if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA |
+ IW_AUTH_WPA_VERSION_WPA2|
+ IW_AUTH_WPA_VERSION_DISABLED))
+ return -EINVAL;
+
+ if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) &&
+ (wpa_versions & (IW_AUTH_WPA_VERSION_WPA|
+ IW_AUTH_WPA_VERSION_WPA2)))
+ return -EINVAL;
+
+ if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED)
+ wdev->wext.connect.crypto.wpa_versions &=
+ ~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2);
+
+ if (wpa_versions & IW_AUTH_WPA_VERSION_WPA)
+ wdev->wext.connect.crypto.wpa_versions |=
+ NL80211_WPA_VERSION_1;
+
+ if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2)
+ wdev->wext.connect.crypto.wpa_versions |=
+ NL80211_WPA_VERSION_2;
+
+ return 0;
+}
+
+static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher)
+{
+ if (cipher & IW_AUTH_CIPHER_WEP40)
+ wdev->wext.connect.crypto.cipher_group =
+ WLAN_CIPHER_SUITE_WEP40;
+ else if (cipher & IW_AUTH_CIPHER_WEP104)
+ wdev->wext.connect.crypto.cipher_group =
+ WLAN_CIPHER_SUITE_WEP104;
+ else if (cipher & IW_AUTH_CIPHER_TKIP)
+ wdev->wext.connect.crypto.cipher_group =
+ WLAN_CIPHER_SUITE_TKIP;
+ else if (cipher & IW_AUTH_CIPHER_CCMP)
+ wdev->wext.connect.crypto.cipher_group =
+ WLAN_CIPHER_SUITE_CCMP;
+ else if (cipher & IW_AUTH_CIPHER_AES_CMAC)
+ wdev->wext.connect.crypto.cipher_group =
+ WLAN_CIPHER_SUITE_AES_CMAC;
+ else if (cipher & IW_AUTH_CIPHER_NONE)
+ wdev->wext.connect.crypto.cipher_group = 0;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher)
+{
+ int nr_ciphers = 0;
+ u32 *ciphers_pairwise = wdev->wext.connect.crypto.ciphers_pairwise;
+
+ if (cipher & IW_AUTH_CIPHER_WEP40) {
+ ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40;
+ nr_ciphers++;
+ }
+
+ if (cipher & IW_AUTH_CIPHER_WEP104) {
+ ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104;
+ nr_ciphers++;
+ }
+
+ if (cipher & IW_AUTH_CIPHER_TKIP) {
+ ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP;
+ nr_ciphers++;
+ }
+
+ if (cipher & IW_AUTH_CIPHER_CCMP) {
+ ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP;
+ nr_ciphers++;
+ }
+
+ if (cipher & IW_AUTH_CIPHER_AES_CMAC) {
+ ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC;
+ nr_ciphers++;
+ }
+
+ BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5);
+
+ wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers;
+
+ return 0;
+}
+
+
+static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt)
+{
+ int nr_akm_suites = 0;
+
+ if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X |
+ IW_AUTH_KEY_MGMT_PSK))
+ return -EINVAL;
+
+ if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) {
+ wdev->wext.connect.crypto.akm_suites[nr_akm_suites] =
+ WLAN_AKM_SUITE_8021X;
+ nr_akm_suites++;
+ }
+
+ if (key_mgt & IW_AUTH_KEY_MGMT_PSK) {
+ wdev->wext.connect.crypto.akm_suites[nr_akm_suites] =
+ WLAN_AKM_SUITE_PSK;
+ nr_akm_suites++;
+ }
+
+ wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites;
+
+ return 0;
+}
+
+static int cfg80211_wext_siwauth(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ switch (data->flags & IW_AUTH_INDEX) {
+ case IW_AUTH_PRIVACY_INVOKED:
+ wdev->wext.connect.privacy = data->value;
+ return 0;
+ case IW_AUTH_WPA_VERSION:
+ return cfg80211_set_wpa_version(wdev, data->value);
+ case IW_AUTH_CIPHER_GROUP:
+ return cfg80211_set_cipher_group(wdev, data->value);
+ case IW_AUTH_KEY_MGMT:
+ return cfg80211_set_key_mgt(wdev, data->value);
+ case IW_AUTH_CIPHER_PAIRWISE:
+ return cfg80211_set_cipher_pairwise(wdev, data->value);
+ case IW_AUTH_80211_AUTH_ALG:
+ return cfg80211_set_auth_alg(wdev, data->value);
+ case IW_AUTH_WPA_ENABLED:
+ case IW_AUTH_RX_UNENCRYPTED_EAPOL:
+ case IW_AUTH_DROP_UNENCRYPTED:
+ case IW_AUTH_MFP:
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_giwauth(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *data, char *extra)
+{
+ /* XXX: what do we need? */
+
+ return -EOPNOTSUPP;
+}
+
+static int cfg80211_wext_siwpower(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *wrq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ bool ps = wdev->ps;
+ int timeout = wdev->ps_timeout;
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EINVAL;
+
+ if (!rdev->ops->set_power_mgmt)
+ return -EOPNOTSUPP;
+
+ if (wrq->disabled) {
+ ps = false;
+ } else {
+ switch (wrq->flags & IW_POWER_MODE) {
+ case IW_POWER_ON: /* If not specified */
+ case IW_POWER_MODE: /* If set all mask */
+ case IW_POWER_ALL_R: /* If explicitely state all */
+ ps = true;
+ break;
+ default: /* Otherwise we ignore */
+ return -EINVAL;
+ }
+
+ if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT))
+ return -EINVAL;
+
+ if (wrq->flags & IW_POWER_TIMEOUT)
+ timeout = wrq->value / 1000;
+ }
+
+ err = rdev_set_power_mgmt(rdev, dev, ps, timeout);
+ if (err)
+ return err;
+
+ wdev->ps = ps;
+ wdev->ps_timeout = timeout;
+
+ return 0;
+
+}
+
+static int cfg80211_wext_giwpower(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *wrq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ wrq->disabled = !wdev->ps;
+
+ return 0;
+}
+
+static int cfg80211_wds_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int err;
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
+ return -EINVAL;
+
+ if (addr->sa_family != ARPHRD_ETHER)
+ return -EINVAL;
+
+ if (netif_running(dev))
+ return -EBUSY;
+
+ if (!rdev->ops->set_wds_peer)
+ return -EOPNOTSUPP;
+
+ err = rdev_set_wds_peer(rdev, dev, (u8 *)&addr->sa_data);
+ if (err)
+ return err;
+
+ memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN);
+
+ return 0;
+}
+
+static int cfg80211_wds_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
+ return -EINVAL;
+
+ addr->sa_family = ARPHRD_ETHER;
+ memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN);
+
+ return 0;
+}
+
+static int cfg80211_wext_siwrate(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *rate, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_bitrate_mask mask;
+ u32 fixed, maxrate;
+ struct ieee80211_supported_band *sband;
+ int band, ridx;
+ bool match = false;
+
+ if (!rdev->ops->set_bitrate_mask)
+ return -EOPNOTSUPP;
+
+ memset(&mask, 0, sizeof(mask));
+ fixed = 0;
+ maxrate = (u32)-1;
+
+ if (rate->value < 0) {
+ /* nothing */
+ } else if (rate->fixed) {
+ fixed = rate->value / 100000;
+ } else {
+ maxrate = rate->value / 100000;
+ }
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ sband = wdev->wiphy->bands[band];
+ if (sband == NULL)
+ continue;
+ for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
+ struct ieee80211_rate *srate = &sband->bitrates[ridx];
+ if (fixed == srate->bitrate) {
+ mask.control[band].legacy = 1 << ridx;
+ match = true;
+ break;
+ }
+ if (srate->bitrate <= maxrate) {
+ mask.control[band].legacy |= 1 << ridx;
+ match = true;
+ }
+ }
+ }
+
+ if (!match)
+ return -EINVAL;
+
+ return rdev_set_bitrate_mask(rdev, dev, NULL, &mask);
+}
+
+static int cfg80211_wext_giwrate(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_param *rate, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ /* we are under RTNL - globally locked - so can use a static struct */
+ static struct station_info sinfo;
+ u8 addr[ETH_ALEN];
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ if (!rdev->ops->get_station)
+ return -EOPNOTSUPP;
+
+ err = 0;
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN);
+ else
+ err = -EOPNOTSUPP;
+ wdev_unlock(wdev);
+ if (err)
+ return err;
+
+ err = rdev_get_station(rdev, dev, addr, &sinfo);
+ if (err)
+ return err;
+
+ if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
+ return -EOPNOTSUPP;
+
+ rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
+
+ return 0;
+}
+
+/* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */
+static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ /* we are under RTNL - globally locked - so can use static structs */
+ static struct iw_statistics wstats;
+ static struct station_info sinfo;
+ u8 bssid[ETH_ALEN];
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION)
+ return NULL;
+
+ if (!rdev->ops->get_station)
+ return NULL;
+
+ /* Grab BSSID of current BSS, if any */
+ wdev_lock(wdev);
+ if (!wdev->current_bss) {
+ wdev_unlock(wdev);
+ return NULL;
+ }
+ memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
+ wdev_unlock(wdev);
+
+ memset(&sinfo, 0, sizeof(sinfo));
+
+ if (rdev_get_station(rdev, dev, bssid, &sinfo))
+ return NULL;
+
+ memset(&wstats, 0, sizeof(wstats));
+
+ switch (rdev->wiphy.signal_type) {
+ case CFG80211_SIGNAL_TYPE_MBM:
+ if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ int sig = sinfo.signal;
+ wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
+ wstats.qual.updated |= IW_QUAL_DBM;
+ wstats.qual.level = sig;
+ if (sig < -110)
+ sig = -110;
+ else if (sig > -40)
+ sig = -40;
+ wstats.qual.qual = sig + 110;
+ break;
+ }
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
+ wstats.qual.level = sinfo.signal;
+ wstats.qual.qual = sinfo.signal;
+ break;
+ }
+ default:
+ wstats.qual.updated |= IW_QUAL_LEVEL_INVALID;
+ wstats.qual.updated |= IW_QUAL_QUAL_INVALID;
+ }
+
+ wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC))
+ wstats.discard.misc = sinfo.rx_dropped_misc;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED))
+ wstats.discard.retries = sinfo.tx_failed;
+
+ return &wstats;
+}
+
+static int cfg80211_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra);
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra);
+ case NL80211_IFTYPE_WDS:
+ return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra);
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra);
+ case NL80211_IFTYPE_WDS:
+ return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_siwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_siwessid(dev, info, data, ssid);
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_siwessid(dev, info, data, ssid);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_giwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ data->flags = 0;
+ data->length = 0;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
+ case NL80211_IFTYPE_STATION:
+ return cfg80211_mgd_wext_giwessid(dev, info, data, ssid);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int cfg80211_wext_siwpmksa(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct cfg80211_pmksa cfg_pmksa;
+ struct iw_pmksa *pmksa = (struct iw_pmksa *)extra;
+
+ memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa));
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EINVAL;
+
+ cfg_pmksa.bssid = pmksa->bssid.sa_data;
+ cfg_pmksa.pmkid = pmksa->pmkid;
+
+ switch (pmksa->cmd) {
+ case IW_PMKSA_ADD:
+ if (!rdev->ops->set_pmksa)
+ return -EOPNOTSUPP;
+
+ return rdev_set_pmksa(rdev, dev, &cfg_pmksa);
+
+ case IW_PMKSA_REMOVE:
+ if (!rdev->ops->del_pmksa)
+ return -EOPNOTSUPP;
+
+ return rdev_del_pmksa(rdev, dev, &cfg_pmksa);
+
+ case IW_PMKSA_FLUSH:
+ if (!rdev->ops->flush_pmksa)
+ return -EOPNOTSUPP;
+
+ return rdev_flush_pmksa(rdev, dev);
+
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static const iw_handler cfg80211_handlers[] = {
+ [IW_IOCTL_IDX(SIOCGIWNAME)] = (iw_handler) cfg80211_wext_giwname,
+ [IW_IOCTL_IDX(SIOCSIWFREQ)] = (iw_handler) cfg80211_wext_siwfreq,
+ [IW_IOCTL_IDX(SIOCGIWFREQ)] = (iw_handler) cfg80211_wext_giwfreq,
+ [IW_IOCTL_IDX(SIOCSIWMODE)] = (iw_handler) cfg80211_wext_siwmode,
+ [IW_IOCTL_IDX(SIOCGIWMODE)] = (iw_handler) cfg80211_wext_giwmode,
+ [IW_IOCTL_IDX(SIOCGIWRANGE)] = (iw_handler) cfg80211_wext_giwrange,
+ [IW_IOCTL_IDX(SIOCSIWAP)] = (iw_handler) cfg80211_wext_siwap,
+ [IW_IOCTL_IDX(SIOCGIWAP)] = (iw_handler) cfg80211_wext_giwap,
+ [IW_IOCTL_IDX(SIOCSIWMLME)] = (iw_handler) cfg80211_wext_siwmlme,
+ [IW_IOCTL_IDX(SIOCSIWSCAN)] = (iw_handler) cfg80211_wext_siwscan,
+ [IW_IOCTL_IDX(SIOCGIWSCAN)] = (iw_handler) cfg80211_wext_giwscan,
+ [IW_IOCTL_IDX(SIOCSIWESSID)] = (iw_handler) cfg80211_wext_siwessid,
+ [IW_IOCTL_IDX(SIOCGIWESSID)] = (iw_handler) cfg80211_wext_giwessid,
+ [IW_IOCTL_IDX(SIOCSIWRATE)] = (iw_handler) cfg80211_wext_siwrate,
+ [IW_IOCTL_IDX(SIOCGIWRATE)] = (iw_handler) cfg80211_wext_giwrate,
+ [IW_IOCTL_IDX(SIOCSIWRTS)] = (iw_handler) cfg80211_wext_siwrts,
+ [IW_IOCTL_IDX(SIOCGIWRTS)] = (iw_handler) cfg80211_wext_giwrts,
+ [IW_IOCTL_IDX(SIOCSIWFRAG)] = (iw_handler) cfg80211_wext_siwfrag,
+ [IW_IOCTL_IDX(SIOCGIWFRAG)] = (iw_handler) cfg80211_wext_giwfrag,
+ [IW_IOCTL_IDX(SIOCSIWTXPOW)] = (iw_handler) cfg80211_wext_siwtxpower,
+ [IW_IOCTL_IDX(SIOCGIWTXPOW)] = (iw_handler) cfg80211_wext_giwtxpower,
+ [IW_IOCTL_IDX(SIOCSIWRETRY)] = (iw_handler) cfg80211_wext_siwretry,
+ [IW_IOCTL_IDX(SIOCGIWRETRY)] = (iw_handler) cfg80211_wext_giwretry,
+ [IW_IOCTL_IDX(SIOCSIWENCODE)] = (iw_handler) cfg80211_wext_siwencode,
+ [IW_IOCTL_IDX(SIOCGIWENCODE)] = (iw_handler) cfg80211_wext_giwencode,
+ [IW_IOCTL_IDX(SIOCSIWPOWER)] = (iw_handler) cfg80211_wext_siwpower,
+ [IW_IOCTL_IDX(SIOCGIWPOWER)] = (iw_handler) cfg80211_wext_giwpower,
+ [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie,
+ [IW_IOCTL_IDX(SIOCSIWAUTH)] = (iw_handler) cfg80211_wext_siwauth,
+ [IW_IOCTL_IDX(SIOCGIWAUTH)] = (iw_handler) cfg80211_wext_giwauth,
+ [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= (iw_handler) cfg80211_wext_siwencodeext,
+ [IW_IOCTL_IDX(SIOCSIWPMKSA)] = (iw_handler) cfg80211_wext_siwpmksa,
+};
+
+const struct iw_handler_def cfg80211_wext_handler = {
+ .num_standard = ARRAY_SIZE(cfg80211_handlers),
+ .standard = cfg80211_handlers,
+ .get_wireless_stats = cfg80211_wireless_stats,
+};
diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h
new file mode 100644
index 000000000..94c7405a5
--- /dev/null
+++ b/net/wireless/wext-compat.h
@@ -0,0 +1,63 @@
+#ifndef __WEXT_COMPAT
+#define __WEXT_COMPAT
+
+#include <net/iw_handler.h>
+#include <linux/wireless.h>
+
+#ifdef CONFIG_CFG80211_WEXT_EXPORT
+#define EXPORT_WEXT_HANDLER(h) EXPORT_SYMBOL_GPL(h)
+#else
+#define EXPORT_WEXT_HANDLER(h)
+#endif /* CONFIG_CFG80211_WEXT_EXPORT */
+
+int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra);
+int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra);
+int cfg80211_ibss_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra);
+int cfg80211_ibss_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra);
+int cfg80211_ibss_wext_siwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid);
+int cfg80211_ibss_wext_giwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid);
+
+int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra);
+int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra);
+int cfg80211_mgd_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra);
+int cfg80211_mgd_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra);
+int cfg80211_mgd_wext_siwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid);
+int cfg80211_mgd_wext_giwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid);
+
+int cfg80211_wext_siwmlme(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra);
+int cfg80211_wext_siwgenie(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra);
+
+
+int cfg80211_wext_freq(struct iw_freq *freq);
+
+
+extern const struct iw_handler_def cfg80211_wext_handler;
+#endif /* __WEXT_COMPAT */
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
new file mode 100644
index 000000000..c8717c1d0
--- /dev/null
+++ b/net/wireless/wext-core.c
@@ -0,0 +1,1088 @@
+/*
+ * This file implement the Wireless Extensions core API.
+ *
+ * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <net/cfg80211.h>
+#include <net/iw_handler.h>
+#include <net/netlink.h>
+#include <net/wext.h>
+#include <net/net_namespace.h>
+
+typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *,
+ unsigned int, struct iw_request_info *,
+ iw_handler);
+
+
+/*
+ * Meta-data about all the standard Wireless Extension request we
+ * know about.
+ */
+static const struct iw_ioctl_description standard_ioctl[] = {
+ [IW_IOCTL_IDX(SIOCSIWCOMMIT)] = {
+ .header_type = IW_HEADER_TYPE_NULL,
+ },
+ [IW_IOCTL_IDX(SIOCGIWNAME)] = {
+ .header_type = IW_HEADER_TYPE_CHAR,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWNWID)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ .flags = IW_DESCR_FLAG_EVENT,
+ },
+ [IW_IOCTL_IDX(SIOCGIWNWID)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWFREQ)] = {
+ .header_type = IW_HEADER_TYPE_FREQ,
+ .flags = IW_DESCR_FLAG_EVENT,
+ },
+ [IW_IOCTL_IDX(SIOCGIWFREQ)] = {
+ .header_type = IW_HEADER_TYPE_FREQ,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWMODE)] = {
+ .header_type = IW_HEADER_TYPE_UINT,
+ .flags = IW_DESCR_FLAG_EVENT,
+ },
+ [IW_IOCTL_IDX(SIOCGIWMODE)] = {
+ .header_type = IW_HEADER_TYPE_UINT,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWSENS)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWSENS)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWRANGE)] = {
+ .header_type = IW_HEADER_TYPE_NULL,
+ },
+ [IW_IOCTL_IDX(SIOCGIWRANGE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_range),
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWPRIV)] = {
+ .header_type = IW_HEADER_TYPE_NULL,
+ },
+ [IW_IOCTL_IDX(SIOCGIWPRIV)] = { /* (handled directly by us) */
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct iw_priv_args),
+ .max_tokens = 16,
+ .flags = IW_DESCR_FLAG_NOMAX,
+ },
+ [IW_IOCTL_IDX(SIOCSIWSTATS)] = {
+ .header_type = IW_HEADER_TYPE_NULL,
+ },
+ [IW_IOCTL_IDX(SIOCGIWSTATS)] = { /* (handled directly by us) */
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_statistics),
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWSPY)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct sockaddr),
+ .max_tokens = IW_MAX_SPY,
+ },
+ [IW_IOCTL_IDX(SIOCGIWSPY)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct sockaddr) +
+ sizeof(struct iw_quality),
+ .max_tokens = IW_MAX_SPY,
+ },
+ [IW_IOCTL_IDX(SIOCSIWTHRSPY)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct iw_thrspy),
+ .min_tokens = 1,
+ .max_tokens = 1,
+ },
+ [IW_IOCTL_IDX(SIOCGIWTHRSPY)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct iw_thrspy),
+ .min_tokens = 1,
+ .max_tokens = 1,
+ },
+ [IW_IOCTL_IDX(SIOCSIWAP)] = {
+ .header_type = IW_HEADER_TYPE_ADDR,
+ },
+ [IW_IOCTL_IDX(SIOCGIWAP)] = {
+ .header_type = IW_HEADER_TYPE_ADDR,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWMLME)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_mlme),
+ .max_tokens = sizeof(struct iw_mlme),
+ },
+ [IW_IOCTL_IDX(SIOCGIWAPLIST)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = sizeof(struct sockaddr) +
+ sizeof(struct iw_quality),
+ .max_tokens = IW_MAX_AP,
+ .flags = IW_DESCR_FLAG_NOMAX,
+ },
+ [IW_IOCTL_IDX(SIOCSIWSCAN)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = 0,
+ .max_tokens = sizeof(struct iw_scan_req),
+ },
+ [IW_IOCTL_IDX(SIOCGIWSCAN)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_SCAN_MAX_DATA,
+ .flags = IW_DESCR_FLAG_NOMAX,
+ },
+ [IW_IOCTL_IDX(SIOCSIWESSID)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ESSID_MAX_SIZE,
+ .flags = IW_DESCR_FLAG_EVENT,
+ },
+ [IW_IOCTL_IDX(SIOCGIWESSID)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ESSID_MAX_SIZE,
+ .flags = IW_DESCR_FLAG_DUMP,
+ },
+ [IW_IOCTL_IDX(SIOCSIWNICKN)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ESSID_MAX_SIZE,
+ },
+ [IW_IOCTL_IDX(SIOCGIWNICKN)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ESSID_MAX_SIZE,
+ },
+ [IW_IOCTL_IDX(SIOCSIWRATE)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWRATE)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWRTS)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWRTS)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWFRAG)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWFRAG)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWTXPOW)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWTXPOW)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWRETRY)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWRETRY)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWENCODE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ENCODING_TOKEN_MAX,
+ .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT,
+ },
+ [IW_IOCTL_IDX(SIOCGIWENCODE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_ENCODING_TOKEN_MAX,
+ .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT,
+ },
+ [IW_IOCTL_IDX(SIOCSIWPOWER)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWPOWER)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWGENIE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IW_IOCTL_IDX(SIOCGIWGENIE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IW_IOCTL_IDX(SIOCSIWAUTH)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCGIWAUTH)] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [IW_IOCTL_IDX(SIOCSIWENCODEEXT)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_encode_ext),
+ .max_tokens = sizeof(struct iw_encode_ext) +
+ IW_ENCODING_TOKEN_MAX,
+ },
+ [IW_IOCTL_IDX(SIOCGIWENCODEEXT)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_encode_ext),
+ .max_tokens = sizeof(struct iw_encode_ext) +
+ IW_ENCODING_TOKEN_MAX,
+ },
+ [IW_IOCTL_IDX(SIOCSIWPMKSA)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_pmksa),
+ .max_tokens = sizeof(struct iw_pmksa),
+ },
+};
+static const unsigned int standard_ioctl_num = ARRAY_SIZE(standard_ioctl);
+
+/*
+ * Meta-data about all the additional standard Wireless Extension events
+ * we know about.
+ */
+static const struct iw_ioctl_description standard_event[] = {
+ [IW_EVENT_IDX(IWEVTXDROP)] = {
+ .header_type = IW_HEADER_TYPE_ADDR,
+ },
+ [IW_EVENT_IDX(IWEVQUAL)] = {
+ .header_type = IW_HEADER_TYPE_QUAL,
+ },
+ [IW_EVENT_IDX(IWEVCUSTOM)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_CUSTOM_MAX,
+ },
+ [IW_EVENT_IDX(IWEVREGISTERED)] = {
+ .header_type = IW_HEADER_TYPE_ADDR,
+ },
+ [IW_EVENT_IDX(IWEVEXPIRED)] = {
+ .header_type = IW_HEADER_TYPE_ADDR,
+ },
+ [IW_EVENT_IDX(IWEVGENIE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IW_EVENT_IDX(IWEVMICHAELMICFAILURE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_michaelmicfailure),
+ },
+ [IW_EVENT_IDX(IWEVASSOCREQIE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IW_EVENT_IDX(IWEVASSOCRESPIE)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IW_EVENT_IDX(IWEVPMKIDCAND)] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_pmkid_cand),
+ },
+};
+static const unsigned int standard_event_num = ARRAY_SIZE(standard_event);
+
+/* Size (in bytes) of various events */
+static const int event_type_size[] = {
+ IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */
+ 0,
+ IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */
+ 0,
+ IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */
+ IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */
+ IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */
+ 0,
+ IW_EV_POINT_LEN, /* Without variable payload */
+ IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */
+ IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */
+};
+
+#ifdef CONFIG_COMPAT
+static const int compat_event_type_size[] = {
+ IW_EV_COMPAT_LCP_LEN, /* IW_HEADER_TYPE_NULL */
+ 0,
+ IW_EV_COMPAT_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */
+ 0,
+ IW_EV_COMPAT_UINT_LEN, /* IW_HEADER_TYPE_UINT */
+ IW_EV_COMPAT_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */
+ IW_EV_COMPAT_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */
+ 0,
+ IW_EV_COMPAT_POINT_LEN, /* Without variable payload */
+ IW_EV_COMPAT_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */
+ IW_EV_COMPAT_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */
+};
+#endif
+
+
+/* IW event code */
+
+static int __net_init wext_pernet_init(struct net *net)
+{
+ skb_queue_head_init(&net->wext_nlevents);
+ return 0;
+}
+
+static void __net_exit wext_pernet_exit(struct net *net)
+{
+ skb_queue_purge(&net->wext_nlevents);
+}
+
+static struct pernet_operations wext_pernet_ops = {
+ .init = wext_pernet_init,
+ .exit = wext_pernet_exit,
+};
+
+static int __init wireless_nlevent_init(void)
+{
+ return register_pernet_subsys(&wext_pernet_ops);
+}
+
+subsys_initcall(wireless_nlevent_init);
+
+/* Process events generated by the wireless layer or the driver. */
+static void wireless_nlevent_process(struct work_struct *work)
+{
+ struct sk_buff *skb;
+ struct net *net;
+
+ rtnl_lock();
+
+ for_each_net(net) {
+ while ((skb = skb_dequeue(&net->wext_nlevents)))
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+ GFP_KERNEL);
+ }
+
+ rtnl_unlock();
+}
+
+static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process);
+
+static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ struct ifinfomsg *r;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWLINK, sizeof(*r), 0);
+ if (!nlh)
+ return NULL;
+
+ r = nlmsg_data(nlh);
+ r->ifi_family = AF_UNSPEC;
+ r->__ifi_pad = 0;
+ r->ifi_type = dev->type;
+ r->ifi_index = dev->ifindex;
+ r->ifi_flags = dev_get_flags(dev);
+ r->ifi_change = 0; /* Wireless changes don't affect those flags */
+
+ if (nla_put_string(skb, IFLA_IFNAME, dev->name))
+ goto nla_put_failure;
+
+ return nlh;
+ nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return NULL;
+}
+
+
+/*
+ * Main event dispatcher. Called from other parts and drivers.
+ * Send the event on the appropriate channels.
+ * May be called from interrupt context.
+ */
+void wireless_send_event(struct net_device * dev,
+ unsigned int cmd,
+ union iwreq_data * wrqu,
+ const char * extra)
+{
+ const struct iw_ioctl_description * descr = NULL;
+ int extra_len = 0;
+ struct iw_event *event; /* Mallocated whole event */
+ int event_len; /* Its size */
+ int hdr_len; /* Size of the event header */
+ int wrqu_off = 0; /* Offset in wrqu */
+ /* Don't "optimise" the following variable, it will crash */
+ unsigned int cmd_index; /* *MUST* be unsigned */
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ struct nlattr *nla;
+#ifdef CONFIG_COMPAT
+ struct __compat_iw_event *compat_event;
+ struct compat_iw_point compat_wrqu;
+ struct sk_buff *compskb;
+#endif
+
+ /*
+ * Nothing in the kernel sends scan events with data, be safe.
+ * This is necessary because we cannot fix up scan event data
+ * for compat, due to being contained in 'extra', but normally
+ * applications are required to retrieve the scan data anyway
+ * and no data is included in the event, this codifies that
+ * practice.
+ */
+ if (WARN_ON(cmd == SIOCGIWSCAN && extra))
+ extra = NULL;
+
+ /* Get the description of the Event */
+ if (cmd <= SIOCIWLAST) {
+ cmd_index = IW_IOCTL_IDX(cmd);
+ if (cmd_index < standard_ioctl_num)
+ descr = &(standard_ioctl[cmd_index]);
+ } else {
+ cmd_index = IW_EVENT_IDX(cmd);
+ if (cmd_index < standard_event_num)
+ descr = &(standard_event[cmd_index]);
+ }
+ /* Don't accept unknown events */
+ if (descr == NULL) {
+ /* Note : we don't return an error to the driver, because
+ * the driver would not know what to do about it. It can't
+ * return an error to the user, because the event is not
+ * initiated by a user request.
+ * The best the driver could do is to log an error message.
+ * We will do it ourselves instead...
+ */
+ netdev_err(dev, "(WE) : Invalid/Unknown Wireless Event (0x%04X)\n",
+ cmd);
+ return;
+ }
+
+ /* Check extra parameters and set extra_len */
+ if (descr->header_type == IW_HEADER_TYPE_POINT) {
+ /* Check if number of token fits within bounds */
+ if (wrqu->data.length > descr->max_tokens) {
+ netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too big (%d)\n",
+ cmd, wrqu->data.length);
+ return;
+ }
+ if (wrqu->data.length < descr->min_tokens) {
+ netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too small (%d)\n",
+ cmd, wrqu->data.length);
+ return;
+ }
+ /* Calculate extra_len - extra is NULL for restricted events */
+ if (extra != NULL)
+ extra_len = wrqu->data.length * descr->token_size;
+ /* Always at an offset in wrqu */
+ wrqu_off = IW_EV_POINT_OFF;
+ }
+
+ /* Total length of the event */
+ hdr_len = event_type_size[descr->header_type];
+ event_len = hdr_len + extra_len;
+
+ /*
+ * The problem for 64/32 bit.
+ *
+ * On 64-bit, a regular event is laid out as follows:
+ * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * | event.len | event.cmd | p a d d i n g |
+ * | wrqu data ... (with the correct size) |
+ *
+ * This padding exists because we manipulate event->u,
+ * and 'event' is not packed.
+ *
+ * An iw_point event is laid out like this instead:
+ * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ * | event.len | event.cmd | p a d d i n g |
+ * | iwpnt.len | iwpnt.flg | p a d d i n g |
+ * | extra data ...
+ *
+ * The second padding exists because struct iw_point is extended,
+ * but this depends on the platform...
+ *
+ * On 32-bit, all the padding shouldn't be there.
+ */
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ /* Send via the RtNetlink event channel */
+ nlh = rtnetlink_ifinfo_prep(dev, skb);
+ if (WARN_ON(!nlh)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Add the wireless events in the netlink packet */
+ nla = nla_reserve(skb, IFLA_WIRELESS, event_len);
+ if (!nla) {
+ kfree_skb(skb);
+ return;
+ }
+ event = nla_data(nla);
+
+ /* Fill event - first clear to avoid data leaking */
+ memset(event, 0, hdr_len);
+ event->len = event_len;
+ event->cmd = cmd;
+ memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
+ if (extra_len)
+ memcpy(((char *) event) + hdr_len, extra, extra_len);
+
+ nlmsg_end(skb, nlh);
+#ifdef CONFIG_COMPAT
+ hdr_len = compat_event_type_size[descr->header_type];
+ event_len = hdr_len + extra_len;
+
+ compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!compskb) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* Send via the RtNetlink event channel */
+ nlh = rtnetlink_ifinfo_prep(dev, compskb);
+ if (WARN_ON(!nlh)) {
+ kfree_skb(skb);
+ kfree_skb(compskb);
+ return;
+ }
+
+ /* Add the wireless events in the netlink packet */
+ nla = nla_reserve(compskb, IFLA_WIRELESS, event_len);
+ if (!nla) {
+ kfree_skb(skb);
+ kfree_skb(compskb);
+ return;
+ }
+ compat_event = nla_data(nla);
+
+ compat_event->len = event_len;
+ compat_event->cmd = cmd;
+ if (descr->header_type == IW_HEADER_TYPE_POINT) {
+ compat_wrqu.length = wrqu->data.length;
+ compat_wrqu.flags = wrqu->data.flags;
+ memcpy(&compat_event->pointer,
+ ((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF,
+ hdr_len - IW_EV_COMPAT_LCP_LEN);
+ if (extra_len)
+ memcpy(((char *) compat_event) + hdr_len,
+ extra, extra_len);
+ } else {
+ /* extra_len must be zero, so no if (extra) needed */
+ memcpy(&compat_event->pointer, wrqu,
+ hdr_len - IW_EV_COMPAT_LCP_LEN);
+ }
+
+ nlmsg_end(compskb, nlh);
+
+ skb_shinfo(skb)->frag_list = compskb;
+#endif
+ skb_queue_tail(&dev_net(dev)->wext_nlevents, skb);
+ schedule_work(&wireless_nlevent_work);
+}
+EXPORT_SYMBOL(wireless_send_event);
+
+
+
+/* IW handlers */
+
+struct iw_statistics *get_wireless_stats(struct net_device *dev)
+{
+#ifdef CONFIG_WIRELESS_EXT
+ if ((dev->wireless_handlers != NULL) &&
+ (dev->wireless_handlers->get_wireless_stats != NULL))
+ return dev->wireless_handlers->get_wireless_stats(dev);
+#endif
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (dev->ieee80211_ptr &&
+ dev->ieee80211_ptr->wiphy &&
+ dev->ieee80211_ptr->wiphy->wext &&
+ dev->ieee80211_ptr->wiphy->wext->get_wireless_stats)
+ return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev);
+#endif
+
+ /* not found */
+ return NULL;
+}
+
+static int iw_handler_get_iwstats(struct net_device * dev,
+ struct iw_request_info * info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ /* Get stats from the driver */
+ struct iw_statistics *stats;
+
+ stats = get_wireless_stats(dev);
+ if (stats) {
+ /* Copy statistics to extra */
+ memcpy(extra, stats, sizeof(struct iw_statistics));
+ wrqu->data.length = sizeof(struct iw_statistics);
+
+ /* Check if we need to clear the updated flag */
+ if (wrqu->data.flags != 0)
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
+ return 0;
+ } else
+ return -EOPNOTSUPP;
+}
+
+static iw_handler get_handler(struct net_device *dev, unsigned int cmd)
+{
+ /* Don't "optimise" the following variable, it will crash */
+ unsigned int index; /* *MUST* be unsigned */
+ const struct iw_handler_def *handlers = NULL;
+
+#ifdef CONFIG_CFG80211_WEXT
+ if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy)
+ handlers = dev->ieee80211_ptr->wiphy->wext;
+#endif
+#ifdef CONFIG_WIRELESS_EXT
+ if (dev->wireless_handlers)
+ handlers = dev->wireless_handlers;
+#endif
+
+ if (!handlers)
+ return NULL;
+
+ /* Try as a standard command */
+ index = IW_IOCTL_IDX(cmd);
+ if (index < handlers->num_standard)
+ return handlers->standard[index];
+
+#ifdef CONFIG_WEXT_PRIV
+ /* Try as a private command */
+ index = cmd - SIOCIWFIRSTPRIV;
+ if (index < handlers->num_private)
+ return handlers->private[index];
+#endif
+
+ /* Not found */
+ return NULL;
+}
+
+static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
+ const struct iw_ioctl_description *descr,
+ iw_handler handler, struct net_device *dev,
+ struct iw_request_info *info)
+{
+ int err, extra_size, user_length = 0, essid_compat = 0;
+ char *extra;
+
+ /* Calculate space needed by arguments. Always allocate
+ * for max space.
+ */
+ extra_size = descr->max_tokens * descr->token_size;
+
+ /* Check need for ESSID compatibility for WE < 21 */
+ switch (cmd) {
+ case SIOCSIWESSID:
+ case SIOCGIWESSID:
+ case SIOCSIWNICKN:
+ case SIOCGIWNICKN:
+ if (iwp->length == descr->max_tokens + 1)
+ essid_compat = 1;
+ else if (IW_IS_SET(cmd) && (iwp->length != 0)) {
+ char essid[IW_ESSID_MAX_SIZE + 1];
+ unsigned int len;
+ len = iwp->length * descr->token_size;
+
+ if (len > IW_ESSID_MAX_SIZE)
+ return -EFAULT;
+
+ err = copy_from_user(essid, iwp->pointer, len);
+ if (err)
+ return -EFAULT;
+
+ if (essid[iwp->length - 1] == '\0')
+ essid_compat = 1;
+ }
+ break;
+ default:
+ break;
+ }
+
+ iwp->length -= essid_compat;
+
+ /* Check what user space is giving us */
+ if (IW_IS_SET(cmd)) {
+ /* Check NULL pointer */
+ if (!iwp->pointer && iwp->length != 0)
+ return -EFAULT;
+ /* Check if number of token fits within bounds */
+ if (iwp->length > descr->max_tokens)
+ return -E2BIG;
+ if (iwp->length < descr->min_tokens)
+ return -EINVAL;
+ } else {
+ /* Check NULL pointer */
+ if (!iwp->pointer)
+ return -EFAULT;
+ /* Save user space buffer size for checking */
+ user_length = iwp->length;
+
+ /* Don't check if user_length > max to allow forward
+ * compatibility. The test user_length < min is
+ * implied by the test at the end.
+ */
+
+ /* Support for very large requests */
+ if ((descr->flags & IW_DESCR_FLAG_NOMAX) &&
+ (user_length > descr->max_tokens)) {
+ /* Allow userspace to GET more than max so
+ * we can support any size GET requests.
+ * There is still a limit : -ENOMEM.
+ */
+ extra_size = user_length * descr->token_size;
+
+ /* Note : user_length is originally a __u16,
+ * and token_size is controlled by us,
+ * so extra_size won't get negative and
+ * won't overflow...
+ */
+ }
+ }
+
+ /* kzalloc() ensures NULL-termination for essid_compat. */
+ extra = kzalloc(extra_size, GFP_KERNEL);
+ if (!extra)
+ return -ENOMEM;
+
+ /* If it is a SET, get all the extra data in here */
+ if (IW_IS_SET(cmd) && (iwp->length != 0)) {
+ if (copy_from_user(extra, iwp->pointer,
+ iwp->length *
+ descr->token_size)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (cmd == SIOCSIWENCODEEXT) {
+ struct iw_encode_ext *ee = (void *) extra;
+
+ if (iwp->length < sizeof(*ee) + ee->key_len) {
+ err = -EFAULT;
+ goto out;
+ }
+ }
+ }
+
+ if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
+ /*
+ * If this is a GET, but not NOMAX, it means that the extra
+ * data is not bounded by userspace, but by max_tokens. Thus
+ * set the length to max_tokens. This matches the extra data
+ * allocation.
+ * The driver should fill it with the number of tokens it
+ * provided, and it may check iwp->length rather than having
+ * knowledge of max_tokens. If the driver doesn't change the
+ * iwp->length, this ioctl just copies back max_token tokens
+ * filled with zeroes. Hopefully the driver isn't claiming
+ * them to be valid data.
+ */
+ iwp->length = descr->max_tokens;
+ }
+
+ err = handler(dev, info, (union iwreq_data *) iwp, extra);
+
+ iwp->length += essid_compat;
+
+ /* If we have something to return to the user */
+ if (!err && IW_IS_GET(cmd)) {
+ /* Check if there is enough buffer up there */
+ if (user_length < iwp->length) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ if (copy_to_user(iwp->pointer, extra,
+ iwp->length *
+ descr->token_size)) {
+ err = -EFAULT;
+ goto out;
+ }
+ }
+
+ /* Generate an event to notify listeners of the change */
+ if ((descr->flags & IW_DESCR_FLAG_EVENT) &&
+ ((err == 0) || (err == -EIWCOMMIT))) {
+ union iwreq_data *data = (union iwreq_data *) iwp;
+
+ if (descr->flags & IW_DESCR_FLAG_RESTRICT)
+ /* If the event is restricted, don't
+ * export the payload.
+ */
+ wireless_send_event(dev, cmd, data, NULL);
+ else
+ wireless_send_event(dev, cmd, data, extra);
+ }
+
+out:
+ kfree(extra);
+ return err;
+}
+
+/*
+ * Call the commit handler in the driver
+ * (if exist and if conditions are right)
+ *
+ * Note : our current commit strategy is currently pretty dumb,
+ * but we will be able to improve on that...
+ * The goal is to try to agreagate as many changes as possible
+ * before doing the commit. Drivers that will define a commit handler
+ * are usually those that need a reset after changing parameters, so
+ * we want to minimise the number of reset.
+ * A cool idea is to use a timer : at each "set" command, we re-set the
+ * timer, when the timer eventually fires, we call the driver.
+ * Hopefully, more on that later.
+ *
+ * Also, I'm waiting to see how many people will complain about the
+ * netif_running(dev) test. I'm open on that one...
+ * Hopefully, the driver will remember to do a commit in "open()" ;-)
+ */
+int call_commit_handler(struct net_device *dev)
+{
+#ifdef CONFIG_WIRELESS_EXT
+ if ((netif_running(dev)) &&
+ (dev->wireless_handlers->standard[0] != NULL))
+ /* Call the commit handler on the driver */
+ return dev->wireless_handlers->standard[0](dev, NULL,
+ NULL, NULL);
+ else
+ return 0; /* Command completed successfully */
+#else
+ /* cfg80211 has no commit */
+ return 0;
+#endif
+}
+
+/*
+ * Main IOCTl dispatcher.
+ * Check the type of IOCTL and call the appropriate wrapper...
+ */
+static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
+ unsigned int cmd,
+ struct iw_request_info *info,
+ wext_ioctl_func standard,
+ wext_ioctl_func private)
+{
+ struct iwreq *iwr = (struct iwreq *) ifr;
+ struct net_device *dev;
+ iw_handler handler;
+
+ /* Permissions are already checked in dev_ioctl() before calling us.
+ * The copy_to/from_user() of ifr is also dealt with in there */
+
+ /* Make sure the device exist */
+ if ((dev = __dev_get_by_name(net, ifr->ifr_name)) == NULL)
+ return -ENODEV;
+
+ /* A bunch of special cases, then the generic case...
+ * Note that 'cmd' is already filtered in dev_ioctl() with
+ * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */
+ if (cmd == SIOCGIWSTATS)
+ return standard(dev, iwr, cmd, info,
+ &iw_handler_get_iwstats);
+
+#ifdef CONFIG_WEXT_PRIV
+ if (cmd == SIOCGIWPRIV && dev->wireless_handlers)
+ return standard(dev, iwr, cmd, info,
+ iw_handler_get_private);
+#endif
+
+ /* Basic check */
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ /* New driver API : try to find the handler */
+ handler = get_handler(dev, cmd);
+ if (handler) {
+ /* Standard and private are not the same */
+ if (cmd < SIOCIWFIRSTPRIV)
+ return standard(dev, iwr, cmd, info, handler);
+ else if (private)
+ return private(dev, iwr, cmd, info, handler);
+ }
+ /* Old driver API : call driver ioctl handler */
+ if (dev->netdev_ops->ndo_do_ioctl)
+ return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+ return -EOPNOTSUPP;
+}
+
+/* If command is `set a parameter', or `get the encoding parameters',
+ * check if the user has the right to do it.
+ */
+static int wext_permission_check(unsigned int cmd)
+{
+ if ((IW_IS_SET(cmd) || cmd == SIOCGIWENCODE ||
+ cmd == SIOCGIWENCODEEXT) &&
+ !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
+/* entry point from dev ioctl */
+static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
+ unsigned int cmd, struct iw_request_info *info,
+ wext_ioctl_func standard,
+ wext_ioctl_func private)
+{
+ int ret = wext_permission_check(cmd);
+
+ if (ret)
+ return ret;
+
+ dev_load(net, ifr->ifr_name);
+ rtnl_lock();
+ ret = wireless_process_ioctl(net, ifr, cmd, info, standard, private);
+ rtnl_unlock();
+
+ return ret;
+}
+
+/*
+ * Wrapper to call a standard Wireless Extension handler.
+ * We do various checks and also take care of moving data between
+ * user space and kernel space.
+ */
+static int ioctl_standard_call(struct net_device * dev,
+ struct iwreq *iwr,
+ unsigned int cmd,
+ struct iw_request_info *info,
+ iw_handler handler)
+{
+ const struct iw_ioctl_description * descr;
+ int ret = -EINVAL;
+
+ /* Get the description of the IOCTL */
+ if (IW_IOCTL_IDX(cmd) >= standard_ioctl_num)
+ return -EOPNOTSUPP;
+ descr = &(standard_ioctl[IW_IOCTL_IDX(cmd)]);
+
+ /* Check if we have a pointer to user space data or not */
+ if (descr->header_type != IW_HEADER_TYPE_POINT) {
+
+ /* No extra arguments. Trivial to handle */
+ ret = handler(dev, info, &(iwr->u), NULL);
+
+ /* Generate an event to notify listeners of the change */
+ if ((descr->flags & IW_DESCR_FLAG_EVENT) &&
+ ((ret == 0) || (ret == -EIWCOMMIT)))
+ wireless_send_event(dev, cmd, &(iwr->u), NULL);
+ } else {
+ ret = ioctl_standard_iw_point(&iwr->u.data, cmd, descr,
+ handler, dev, info);
+ }
+
+ /* Call commit handler if needed and defined */
+ if (ret == -EIWCOMMIT)
+ ret = call_commit_handler(dev);
+
+ /* Here, we will generate the appropriate event if needed */
+
+ return ret;
+}
+
+
+int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+ void __user *arg)
+{
+ struct iw_request_info info = { .cmd = cmd, .flags = 0 };
+ int ret;
+
+ ret = wext_ioctl_dispatch(net, ifr, cmd, &info,
+ ioctl_standard_call,
+ ioctl_private_call);
+ if (ret >= 0 &&
+ IW_IS_GET(cmd) &&
+ copy_to_user(arg, ifr, sizeof(struct iwreq)))
+ return -EFAULT;
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_standard_call(struct net_device *dev,
+ struct iwreq *iwr,
+ unsigned int cmd,
+ struct iw_request_info *info,
+ iw_handler handler)
+{
+ const struct iw_ioctl_description *descr;
+ struct compat_iw_point *iwp_compat;
+ struct iw_point iwp;
+ int err;
+
+ descr = standard_ioctl + IW_IOCTL_IDX(cmd);
+
+ if (descr->header_type != IW_HEADER_TYPE_POINT)
+ return ioctl_standard_call(dev, iwr, cmd, info, handler);
+
+ iwp_compat = (struct compat_iw_point *) &iwr->u.data;
+ iwp.pointer = compat_ptr(iwp_compat->pointer);
+ iwp.length = iwp_compat->length;
+ iwp.flags = iwp_compat->flags;
+
+ err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, info);
+
+ iwp_compat->pointer = ptr_to_compat(iwp.pointer);
+ iwp_compat->length = iwp.length;
+ iwp_compat->flags = iwp.flags;
+
+ return err;
+}
+
+int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct iw_request_info info;
+ struct iwreq iwr;
+ char *colon;
+ int ret;
+
+ if (copy_from_user(&iwr, argp, sizeof(struct iwreq)))
+ return -EFAULT;
+
+ iwr.ifr_name[IFNAMSIZ-1] = 0;
+ colon = strchr(iwr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ info.cmd = cmd;
+ info.flags = IW_REQUEST_FLAG_COMPAT;
+
+ ret = wext_ioctl_dispatch(net, (struct ifreq *) &iwr, cmd, &info,
+ compat_standard_call,
+ compat_private_call);
+
+ if (ret >= 0 &&
+ IW_IS_GET(cmd) &&
+ copy_to_user(argp, &iwr, sizeof(struct iwreq)))
+ return -EFAULT;
+
+ return ret;
+}
+#endif
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
new file mode 100644
index 000000000..674d426a9
--- /dev/null
+++ b/net/wireless/wext-priv.c
@@ -0,0 +1,249 @@
+/*
+ * This file implement the Wireless Extensions priv API.
+ *
+ * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <net/iw_handler.h>
+#include <net/wext.h>
+
+int iw_handler_get_private(struct net_device * dev,
+ struct iw_request_info * info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ /* Check if the driver has something to export */
+ if ((dev->wireless_handlers->num_private_args == 0) ||
+ (dev->wireless_handlers->private_args == NULL))
+ return -EOPNOTSUPP;
+
+ /* Check if there is enough buffer up there */
+ if (wrqu->data.length < dev->wireless_handlers->num_private_args) {
+ /* User space can't know in advance how large the buffer
+ * needs to be. Give it a hint, so that we can support
+ * any size buffer we want somewhat efficiently... */
+ wrqu->data.length = dev->wireless_handlers->num_private_args;
+ return -E2BIG;
+ }
+
+ /* Set the number of available ioctls. */
+ wrqu->data.length = dev->wireless_handlers->num_private_args;
+
+ /* Copy structure to the user buffer. */
+ memcpy(extra, dev->wireless_handlers->private_args,
+ sizeof(struct iw_priv_args) * wrqu->data.length);
+
+ return 0;
+}
+
+/* Size (in bytes) of the various private data types */
+static const char iw_priv_type_size[] = {
+ 0, /* IW_PRIV_TYPE_NONE */
+ 1, /* IW_PRIV_TYPE_BYTE */
+ 1, /* IW_PRIV_TYPE_CHAR */
+ 0, /* Not defined */
+ sizeof(__u32), /* IW_PRIV_TYPE_INT */
+ sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */
+ sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */
+ 0, /* Not defined */
+};
+
+static int get_priv_size(__u16 args)
+{
+ int num = args & IW_PRIV_SIZE_MASK;
+ int type = (args & IW_PRIV_TYPE_MASK) >> 12;
+
+ return num * iw_priv_type_size[type];
+}
+
+static int adjust_priv_size(__u16 args, struct iw_point *iwp)
+{
+ int num = iwp->length;
+ int max = args & IW_PRIV_SIZE_MASK;
+ int type = (args & IW_PRIV_TYPE_MASK) >> 12;
+
+ /* Make sure the driver doesn't goof up */
+ if (max < num)
+ num = max;
+
+ return num * iw_priv_type_size[type];
+}
+
+/*
+ * Wrapper to call a private Wireless Extension handler.
+ * We do various checks and also take care of moving data between
+ * user space and kernel space.
+ * It's not as nice and slimline as the standard wrapper. The cause
+ * is struct iw_priv_args, which was not really designed for the
+ * job we are going here.
+ *
+ * IMPORTANT : This function prevent to set and get data on the same
+ * IOCTL and enforce the SET/GET convention. Not doing it would be
+ * far too hairy...
+ * If you need to set and get data at the same time, please don't use
+ * a iw_handler but process it in your ioctl handler (i.e. use the
+ * old driver API).
+ */
+static int get_priv_descr_and_size(struct net_device *dev, unsigned int cmd,
+ const struct iw_priv_args **descrp)
+{
+ const struct iw_priv_args *descr;
+ int i, extra_size;
+
+ descr = NULL;
+ for (i = 0; i < dev->wireless_handlers->num_private_args; i++) {
+ if (cmd == dev->wireless_handlers->private_args[i].cmd) {
+ descr = &dev->wireless_handlers->private_args[i];
+ break;
+ }
+ }
+
+ extra_size = 0;
+ if (descr) {
+ if (IW_IS_SET(cmd)) {
+ int offset = 0; /* For sub-ioctls */
+ /* Check for sub-ioctl handler */
+ if (descr->name[0] == '\0')
+ /* Reserve one int for sub-ioctl index */
+ offset = sizeof(__u32);
+
+ /* Size of set arguments */
+ extra_size = get_priv_size(descr->set_args);
+
+ /* Does it fits in iwr ? */
+ if ((descr->set_args & IW_PRIV_SIZE_FIXED) &&
+ ((extra_size + offset) <= IFNAMSIZ))
+ extra_size = 0;
+ } else {
+ /* Size of get arguments */
+ extra_size = get_priv_size(descr->get_args);
+
+ /* Does it fits in iwr ? */
+ if ((descr->get_args & IW_PRIV_SIZE_FIXED) &&
+ (extra_size <= IFNAMSIZ))
+ extra_size = 0;
+ }
+ }
+ *descrp = descr;
+ return extra_size;
+}
+
+static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
+ const struct iw_priv_args *descr,
+ iw_handler handler, struct net_device *dev,
+ struct iw_request_info *info, int extra_size)
+{
+ char *extra;
+ int err;
+
+ /* Check what user space is giving us */
+ if (IW_IS_SET(cmd)) {
+ if (!iwp->pointer && iwp->length != 0)
+ return -EFAULT;
+
+ if (iwp->length > (descr->set_args & IW_PRIV_SIZE_MASK))
+ return -E2BIG;
+ } else if (!iwp->pointer)
+ return -EFAULT;
+
+ extra = kzalloc(extra_size, GFP_KERNEL);
+ if (!extra)
+ return -ENOMEM;
+
+ /* If it is a SET, get all the extra data in here */
+ if (IW_IS_SET(cmd) && (iwp->length != 0)) {
+ if (copy_from_user(extra, iwp->pointer, extra_size)) {
+ err = -EFAULT;
+ goto out;
+ }
+ }
+
+ /* Call the handler */
+ err = handler(dev, info, (union iwreq_data *) iwp, extra);
+
+ /* If we have something to return to the user */
+ if (!err && IW_IS_GET(cmd)) {
+ /* Adjust for the actual length if it's variable,
+ * avoid leaking kernel bits outside.
+ */
+ if (!(descr->get_args & IW_PRIV_SIZE_FIXED))
+ extra_size = adjust_priv_size(descr->get_args, iwp);
+
+ if (copy_to_user(iwp->pointer, extra, extra_size))
+ err = -EFAULT;
+ }
+
+out:
+ kfree(extra);
+ return err;
+}
+
+int ioctl_private_call(struct net_device *dev, struct iwreq *iwr,
+ unsigned int cmd, struct iw_request_info *info,
+ iw_handler handler)
+{
+ int extra_size = 0, ret = -EINVAL;
+ const struct iw_priv_args *descr;
+
+ extra_size = get_priv_descr_and_size(dev, cmd, &descr);
+
+ /* Check if we have a pointer to user space data or not. */
+ if (extra_size == 0) {
+ /* No extra arguments. Trivial to handle */
+ ret = handler(dev, info, &(iwr->u), (char *) &(iwr->u));
+ } else {
+ ret = ioctl_private_iw_point(&iwr->u.data, cmd, descr,
+ handler, dev, info, extra_size);
+ }
+
+ /* Call commit handler if needed and defined */
+ if (ret == -EIWCOMMIT)
+ ret = call_commit_handler(dev);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_private_call(struct net_device *dev, struct iwreq *iwr,
+ unsigned int cmd, struct iw_request_info *info,
+ iw_handler handler)
+{
+ const struct iw_priv_args *descr;
+ int ret, extra_size;
+
+ extra_size = get_priv_descr_and_size(dev, cmd, &descr);
+
+ /* Check if we have a pointer to user space data or not. */
+ if (extra_size == 0) {
+ /* No extra arguments. Trivial to handle */
+ ret = handler(dev, info, &(iwr->u), (char *) &(iwr->u));
+ } else {
+ struct compat_iw_point *iwp_compat;
+ struct iw_point iwp;
+
+ iwp_compat = (struct compat_iw_point *) &iwr->u.data;
+ iwp.pointer = compat_ptr(iwp_compat->pointer);
+ iwp.length = iwp_compat->length;
+ iwp.flags = iwp_compat->flags;
+
+ ret = ioctl_private_iw_point(&iwp, cmd, descr,
+ handler, dev, info, extra_size);
+
+ iwp_compat->pointer = ptr_to_compat(iwp.pointer);
+ iwp_compat->length = iwp.length;
+ iwp_compat->flags = iwp.flags;
+ }
+
+ /* Call commit handler if needed and defined */
+ if (ret == -EIWCOMMIT)
+ ret = call_commit_handler(dev);
+
+ return ret;
+}
+#endif
diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c
new file mode 100644
index 000000000..e98a01c10
--- /dev/null
+++ b/net/wireless/wext-proc.c
@@ -0,0 +1,156 @@
+/*
+ * This file implement the Wireless Extensions proc API.
+ *
+ * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+
+/*
+ * The /proc/net/wireless file is a human readable user-space interface
+ * exporting various wireless specific statistics from the wireless devices.
+ * This is the most popular part of the Wireless Extensions ;-)
+ *
+ * This interface is a pure clone of /proc/net/dev (in net/core/dev.c).
+ * The content of the file is basically the content of "struct iw_statistics".
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/iw_handler.h>
+#include <net/wext.h>
+
+
+static void wireless_seq_printf_stats(struct seq_file *seq,
+ struct net_device *dev)
+{
+ /* Get stats from the driver */
+ struct iw_statistics *stats = get_wireless_stats(dev);
+ static struct iw_statistics nullstats = {};
+
+ /* show device if it's wireless regardless of current stats */
+ if (!stats) {
+#ifdef CONFIG_WIRELESS_EXT
+ if (dev->wireless_handlers)
+ stats = &nullstats;
+#endif
+#ifdef CONFIG_CFG80211
+ if (dev->ieee80211_ptr)
+ stats = &nullstats;
+#endif
+ }
+
+ if (stats) {
+ seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d "
+ "%6d %6d %6d\n",
+ dev->name, stats->status, stats->qual.qual,
+ stats->qual.updated & IW_QUAL_QUAL_UPDATED
+ ? '.' : ' ',
+ ((__s32) stats->qual.level) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
+ stats->qual.updated & IW_QUAL_LEVEL_UPDATED
+ ? '.' : ' ',
+ ((__s32) stats->qual.noise) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
+ stats->qual.updated & IW_QUAL_NOISE_UPDATED
+ ? '.' : ' ',
+ stats->discard.nwid, stats->discard.code,
+ stats->discard.fragment, stats->discard.retries,
+ stats->discard.misc, stats->miss.beacon);
+
+ if (stats != &nullstats)
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
+ }
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Print info for /proc/net/wireless (print all entries)
+ */
+static int wireless_dev_seq_show(struct seq_file *seq, void *v)
+{
+ might_sleep();
+
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "Inter-| sta-| Quality | Discarded "
+ "packets | Missed | WE\n"
+ " face | tus | link level noise | nwid "
+ "crypt frag retry misc | beacon | %d\n",
+ WIRELESS_EXT);
+ else
+ wireless_seq_printf_stats(seq, v);
+ return 0;
+}
+
+static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ loff_t off;
+ struct net_device *dev;
+
+ rtnl_lock();
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ off = 1;
+ for_each_netdev(net, dev)
+ if (off++ == *pos)
+ return dev;
+ return NULL;
+}
+
+static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+
+ ++*pos;
+
+ return v == SEQ_START_TOKEN ?
+ first_net_device(net) : next_net_device(v);
+}
+
+static void wireless_dev_seq_stop(struct seq_file *seq, void *v)
+{
+ rtnl_unlock();
+}
+
+static const struct seq_operations wireless_seq_ops = {
+ .start = wireless_dev_seq_start,
+ .next = wireless_dev_seq_next,
+ .stop = wireless_dev_seq_stop,
+ .show = wireless_dev_seq_show,
+};
+
+static int seq_open_wireless(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &wireless_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations wireless_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = seq_open_wireless,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+int __net_init wext_proc_init(struct net *net)
+{
+ /* Create /proc/net/wireless entry */
+ if (!proc_create("wireless", S_IRUGO, net->proc_net,
+ &wireless_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __net_exit wext_proc_exit(struct net *net)
+{
+ remove_proc_entry("wireless", net->proc_net);
+}
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
new file mode 100644
index 000000000..a4e8af332
--- /dev/null
+++ b/net/wireless/wext-sme.c
@@ -0,0 +1,414 @@
+/*
+ * cfg80211 wext compat for managed mode.
+ *
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2009 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <net/cfg80211.h>
+#include <net/cfg80211-wext.h>
+#include "wext-compat.h"
+#include "nl80211.h"
+
+int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ struct cfg80211_cached_keys *ck = NULL;
+ const u8 *prev_bssid = NULL;
+ int err, i;
+
+ ASSERT_RTNL();
+ ASSERT_WDEV_LOCK(wdev);
+
+ if (!netif_running(wdev->netdev))
+ return 0;
+
+ wdev->wext.connect.ie = wdev->wext.ie;
+ wdev->wext.connect.ie_len = wdev->wext.ie_len;
+
+ /* Use default background scan period */
+ wdev->wext.connect.bg_scan_period = -1;
+
+ if (wdev->wext.keys) {
+ wdev->wext.keys->def = wdev->wext.default_key;
+ wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+ if (wdev->wext.default_key != -1)
+ wdev->wext.connect.privacy = true;
+ }
+
+ if (!wdev->wext.connect.ssid_len)
+ return 0;
+
+ if (wdev->wext.keys) {
+ ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return -ENOMEM;
+ for (i = 0; i < 6; i++)
+ ck->params[i].key = ck->data[i];
+ }
+
+ if (wdev->wext.prev_bssid_valid)
+ prev_bssid = wdev->wext.prev_bssid;
+
+ err = cfg80211_connect(rdev, wdev->netdev,
+ &wdev->wext.connect, ck, prev_bssid);
+ if (err)
+ kzfree(ck);
+
+ return err;
+}
+
+int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *wextfreq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct ieee80211_channel *chan = NULL;
+ int err, freq;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ freq = cfg80211_wext_freq(wextfreq);
+ if (freq < 0)
+ return freq;
+
+ if (freq) {
+ chan = ieee80211_get_channel(wdev->wiphy, freq);
+ if (!chan)
+ return -EINVAL;
+ if (chan->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
+ }
+
+ wdev_lock(wdev);
+
+ if (wdev->conn) {
+ bool event = true;
+
+ if (wdev->wext.connect.channel == chan) {
+ err = 0;
+ goto out;
+ }
+
+ /* if SSID set, we'll try right again, avoid event */
+ if (wdev->wext.connect.ssid_len)
+ event = false;
+ err = cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, event);
+ if (err)
+ goto out;
+ }
+
+
+ wdev->wext.connect.channel = chan;
+
+ /*
+ * SSID is not set, we just want to switch monitor channel,
+ * this is really just backward compatibility, if the SSID
+ * is set then we use the channel to select the BSS to use
+ * to connect to instead. If we were connected on another
+ * channel we disconnected above and reconnect below.
+ */
+ if (chan && !wdev->wext.connect.ssid_len) {
+ struct cfg80211_chan_def chandef = {
+ .width = NL80211_CHAN_WIDTH_20_NOHT,
+ .center_freq1 = freq,
+ };
+
+ chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
+ if (chandef.chan)
+ err = cfg80211_set_monitor_channel(rdev, &chandef);
+ else
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = cfg80211_mgd_wext_connect(rdev, wdev);
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_freq *freq, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct ieee80211_channel *chan = NULL;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ chan = wdev->current_bss->pub.channel;
+ else if (wdev->wext.connect.channel)
+ chan = wdev->wext.connect.channel;
+ wdev_unlock(wdev);
+
+ if (chan) {
+ freq->m = chan->center_freq;
+ freq->e = 6;
+ return 0;
+ }
+
+ /* no channel if not joining */
+ return -EINVAL;
+}
+
+int cfg80211_mgd_wext_siwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ size_t len = data->length;
+ int err;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ if (!data->flags)
+ len = 0;
+
+ /* iwconfig uses nul termination in SSID.. */
+ if (len > 0 && ssid[len - 1] == '\0')
+ len--;
+
+ wdev_lock(wdev);
+
+ err = 0;
+
+ if (wdev->conn) {
+ bool event = true;
+
+ if (wdev->wext.connect.ssid && len &&
+ len == wdev->wext.connect.ssid_len &&
+ memcmp(wdev->wext.connect.ssid, ssid, len) == 0)
+ goto out;
+
+ /* if SSID set now, we'll try to connect, avoid event */
+ if (len)
+ event = false;
+ err = cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, event);
+ if (err)
+ goto out;
+ }
+
+ wdev->wext.prev_bssid_valid = false;
+ wdev->wext.connect.ssid = wdev->wext.ssid;
+ memcpy(wdev->wext.ssid, ssid, len);
+ wdev->wext.connect.ssid_len = len;
+
+ wdev->wext.connect.crypto.control_port = false;
+ wdev->wext.connect.crypto.control_port_ethertype =
+ cpu_to_be16(ETH_P_PAE);
+
+ err = cfg80211_mgd_wext_connect(rdev, wdev);
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+int cfg80211_mgd_wext_giwessid(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *ssid)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ data->flags = 0;
+
+ wdev_lock(wdev);
+ if (wdev->current_bss) {
+ const u8 *ie;
+
+ rcu_read_lock();
+ ie = ieee80211_bss_get_ie(&wdev->current_bss->pub,
+ WLAN_EID_SSID);
+ if (ie) {
+ data->flags = 1;
+ data->length = ie[1];
+ memcpy(ssid, ie + 2, data->length);
+ }
+ rcu_read_unlock();
+ } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) {
+ data->flags = 1;
+ data->length = wdev->wext.connect.ssid_len;
+ memcpy(ssid, wdev->wext.connect.ssid, data->length);
+ }
+ wdev_unlock(wdev);
+
+ return 0;
+}
+
+int cfg80211_mgd_wext_siwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u8 *bssid = ap_addr->sa_data;
+ int err;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ if (ap_addr->sa_family != ARPHRD_ETHER)
+ return -EINVAL;
+
+ /* automatic mode */
+ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid))
+ bssid = NULL;
+
+ wdev_lock(wdev);
+
+ if (wdev->conn) {
+ err = 0;
+ /* both automatic */
+ if (!bssid && !wdev->wext.connect.bssid)
+ goto out;
+
+ /* fixed already - and no change */
+ if (wdev->wext.connect.bssid && bssid &&
+ ether_addr_equal(bssid, wdev->wext.connect.bssid))
+ goto out;
+
+ err = cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+ if (err)
+ goto out;
+ }
+
+ if (bssid) {
+ memcpy(wdev->wext.bssid, bssid, ETH_ALEN);
+ wdev->wext.connect.bssid = wdev->wext.bssid;
+ } else
+ wdev->wext.connect.bssid = NULL;
+
+ err = cfg80211_mgd_wext_connect(rdev, wdev);
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+int cfg80211_mgd_wext_giwap(struct net_device *dev,
+ struct iw_request_info *info,
+ struct sockaddr *ap_addr, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+ /* call only for station! */
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ return -EINVAL;
+
+ ap_addr->sa_family = ARPHRD_ETHER;
+
+ wdev_lock(wdev);
+ if (wdev->current_bss)
+ memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN);
+ else
+ eth_zero_addr(ap_addr->sa_data);
+ wdev_unlock(wdev);
+
+ return 0;
+}
+
+int cfg80211_wext_siwgenie(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ u8 *ie = extra;
+ int ie_len = data->length, err;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EOPNOTSUPP;
+
+ if (!ie_len)
+ ie = NULL;
+
+ wdev_lock(wdev);
+
+ /* no change */
+ err = 0;
+ if (wdev->wext.ie_len == ie_len &&
+ memcmp(wdev->wext.ie, ie, ie_len) == 0)
+ goto out;
+
+ if (ie_len) {
+ ie = kmemdup(extra, ie_len, GFP_KERNEL);
+ if (!ie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ } else
+ ie = NULL;
+
+ kfree(wdev->wext.ie);
+ wdev->wext.ie = ie;
+ wdev->wext.ie_len = ie_len;
+
+ if (wdev->conn) {
+ err = cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, false);
+ if (err)
+ goto out;
+ }
+
+ /* userspace better not think we'll reconnect */
+ err = 0;
+ out:
+ wdev_unlock(wdev);
+ return err;
+}
+
+int cfg80211_wext_siwmlme(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct iw_mlme *mlme = (struct iw_mlme *)extra;
+ struct cfg80211_registered_device *rdev;
+ int err;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ rdev = wiphy_to_rdev(wdev->wiphy);
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION)
+ return -EINVAL;
+
+ if (mlme->addr.sa_family != ARPHRD_ETHER)
+ return -EINVAL;
+
+ wdev_lock(wdev);
+ switch (mlme->cmd) {
+ case IW_MLME_DEAUTH:
+ case IW_MLME_DISASSOC:
+ err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ wdev_unlock(wdev);
+
+ return err;
+}
diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c
new file mode 100644
index 000000000..33bef22e4
--- /dev/null
+++ b/net/wireless/wext-spy.c
@@ -0,0 +1,232 @@
+/*
+ * This file implement the Wireless Extensions spy API.
+ *
+ * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/export.h>
+#include <net/iw_handler.h>
+#include <net/arp.h>
+#include <net/wext.h>
+
+static inline struct iw_spy_data *get_spydata(struct net_device *dev)
+{
+ /* This is the new way */
+ if (dev->wireless_data)
+ return dev->wireless_data->spy_data;
+ return NULL;
+}
+
+int iw_handler_set_spy(struct net_device * dev,
+ struct iw_request_info * info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ struct iw_spy_data * spydata = get_spydata(dev);
+ struct sockaddr * address = (struct sockaddr *) extra;
+
+ /* Make sure driver is not buggy or using the old API */
+ if (!spydata)
+ return -EOPNOTSUPP;
+
+ /* Disable spy collection while we copy the addresses.
+ * While we copy addresses, any call to wireless_spy_update()
+ * will NOP. This is OK, as anyway the addresses are changing. */
+ spydata->spy_number = 0;
+
+ /* We want to operate without locking, because wireless_spy_update()
+ * most likely will happen in the interrupt handler, and therefore
+ * have its own locking constraints and needs performance.
+ * The rtnl_lock() make sure we don't race with the other iw_handlers.
+ * This make sure wireless_spy_update() "see" that the spy list
+ * is temporarily disabled. */
+ smp_wmb();
+
+ /* Are there are addresses to copy? */
+ if (wrqu->data.length > 0) {
+ int i;
+
+ /* Copy addresses */
+ for (i = 0; i < wrqu->data.length; i++)
+ memcpy(spydata->spy_address[i], address[i].sa_data,
+ ETH_ALEN);
+ /* Reset stats */
+ memset(spydata->spy_stat, 0,
+ sizeof(struct iw_quality) * IW_MAX_SPY);
+ }
+
+ /* Make sure above is updated before re-enabling */
+ smp_wmb();
+
+ /* Enable addresses */
+ spydata->spy_number = wrqu->data.length;
+
+ return 0;
+}
+EXPORT_SYMBOL(iw_handler_set_spy);
+
+int iw_handler_get_spy(struct net_device * dev,
+ struct iw_request_info * info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ struct iw_spy_data * spydata = get_spydata(dev);
+ struct sockaddr * address = (struct sockaddr *) extra;
+ int i;
+
+ /* Make sure driver is not buggy or using the old API */
+ if (!spydata)
+ return -EOPNOTSUPP;
+
+ wrqu->data.length = spydata->spy_number;
+
+ /* Copy addresses. */
+ for (i = 0; i < spydata->spy_number; i++) {
+ memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN);
+ address[i].sa_family = AF_UNIX;
+ }
+ /* Copy stats to the user buffer (just after). */
+ if (spydata->spy_number > 0)
+ memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number),
+ spydata->spy_stat,
+ sizeof(struct iw_quality) * spydata->spy_number);
+ /* Reset updated flags. */
+ for (i = 0; i < spydata->spy_number; i++)
+ spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
+ return 0;
+}
+EXPORT_SYMBOL(iw_handler_get_spy);
+
+/*------------------------------------------------------------------*/
+/*
+ * Standard Wireless Handler : set spy threshold
+ */
+int iw_handler_set_thrspy(struct net_device * dev,
+ struct iw_request_info *info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ struct iw_spy_data * spydata = get_spydata(dev);
+ struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
+
+ /* Make sure driver is not buggy or using the old API */
+ if (!spydata)
+ return -EOPNOTSUPP;
+
+ /* Just do it */
+ memcpy(&(spydata->spy_thr_low), &(threshold->low),
+ 2 * sizeof(struct iw_quality));
+
+ /* Clear flag */
+ memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under));
+
+ return 0;
+}
+EXPORT_SYMBOL(iw_handler_set_thrspy);
+
+/*------------------------------------------------------------------*/
+/*
+ * Standard Wireless Handler : get spy threshold
+ */
+int iw_handler_get_thrspy(struct net_device * dev,
+ struct iw_request_info *info,
+ union iwreq_data * wrqu,
+ char * extra)
+{
+ struct iw_spy_data * spydata = get_spydata(dev);
+ struct iw_thrspy * threshold = (struct iw_thrspy *) extra;
+
+ /* Make sure driver is not buggy or using the old API */
+ if (!spydata)
+ return -EOPNOTSUPP;
+
+ /* Just do it */
+ memcpy(&(threshold->low), &(spydata->spy_thr_low),
+ 2 * sizeof(struct iw_quality));
+
+ return 0;
+}
+EXPORT_SYMBOL(iw_handler_get_thrspy);
+
+/*------------------------------------------------------------------*/
+/*
+ * Prepare and send a Spy Threshold event
+ */
+static void iw_send_thrspy_event(struct net_device * dev,
+ struct iw_spy_data * spydata,
+ unsigned char * address,
+ struct iw_quality * wstats)
+{
+ union iwreq_data wrqu;
+ struct iw_thrspy threshold;
+
+ /* Init */
+ wrqu.data.length = 1;
+ wrqu.data.flags = 0;
+ /* Copy address */
+ memcpy(threshold.addr.sa_data, address, ETH_ALEN);
+ threshold.addr.sa_family = ARPHRD_ETHER;
+ /* Copy stats */
+ memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality));
+ /* Copy also thresholds */
+ memcpy(&(threshold.low), &(spydata->spy_thr_low),
+ 2 * sizeof(struct iw_quality));
+
+ /* Send event to user space */
+ wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold);
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Call for the driver to update the spy data.
+ * For now, the spy data is a simple array. As the size of the array is
+ * small, this is good enough. If we wanted to support larger number of
+ * spy addresses, we should use something more efficient...
+ */
+void wireless_spy_update(struct net_device * dev,
+ unsigned char * address,
+ struct iw_quality * wstats)
+{
+ struct iw_spy_data * spydata = get_spydata(dev);
+ int i;
+ int match = -1;
+
+ /* Make sure driver is not buggy or using the old API */
+ if (!spydata)
+ return;
+
+ /* Update all records that match */
+ for (i = 0; i < spydata->spy_number; i++)
+ if (ether_addr_equal(address, spydata->spy_address[i])) {
+ memcpy(&(spydata->spy_stat[i]), wstats,
+ sizeof(struct iw_quality));
+ match = i;
+ }
+
+ /* Generate an event if we cross the spy threshold.
+ * To avoid event storms, we have a simple hysteresis : we generate
+ * event only when we go under the low threshold or above the
+ * high threshold. */
+ if (match >= 0) {
+ if (spydata->spy_thr_under[match]) {
+ if (wstats->level > spydata->spy_thr_high.level) {
+ spydata->spy_thr_under[match] = 0;
+ iw_send_thrspy_event(dev, spydata,
+ address, wstats);
+ }
+ } else {
+ if (wstats->level < spydata->spy_thr_low.level) {
+ spydata->spy_thr_under[match] = 1;
+ iw_send_thrspy_event(dev, spydata,
+ address, wstats);
+ }
+ }
+ }
+}
+EXPORT_SYMBOL(wireless_spy_update);
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 000000000..e2fa133f9
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,35 @@
+#
+# CCITT X.25 Packet Layer
+#
+
+config X25
+ tristate "CCITT X.25 Packet Layer"
+ ---help---
+ X.25 is a set of standardized network protocols, similar in scope to
+ frame relay; the one physical line from your box to the X.25 network
+ entry point can carry several logical point-to-point connections
+ (called "virtual circuits") to other computers connected to the X.25
+ network. Governments, banks, and other organizations tend to use it
+ to connect to each other or to form Wide Area Networks (WANs). Many
+ countries have public X.25 networks. X.25 consists of two
+ protocols: the higher level Packet Layer Protocol (PLP) (say Y here
+ if you want that) and the lower level data link layer protocol LAPB
+ (say Y to "LAPB Data Link Driver" below if you want that).
+
+ You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and
+ <http://docwiki.cisco.com/wiki/X.25>.
+ Information about X.25 for Linux is contained in the files
+ <file:Documentation/networking/x25.txt> and
+ <file:Documentation/networking/x25-iface.txt>.
+
+ One connects to an X.25 network either with a dedicated network card
+ using the X.21 protocol (not yet supported by Linux) or one can do
+ X.25 over a standard telephone line using an ordinary modem (say Y
+ to "X.25 async driver" below) or over Ethernet using an ordinary
+ Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
+ Driver" and "LAPB over Ethernet driver" below).
+
+ To compile this driver as a module, choose M here: the module
+ will be called x25. If unsure, say N.
+
+
diff --git a/net/x25/Makefile b/net/x25/Makefile
new file mode 100644
index 000000000..a2c34ab6f
--- /dev/null
+++ b/net/x25/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux X.25 Packet layer.
+#
+
+obj-$(CONFIG_X25) += x25.o
+
+x25-y := af_x25.o x25_dev.o x25_facilities.o x25_in.o \
+ x25_link.o x25_out.o x25_route.o x25_subr.o \
+ x25_timer.o x25_proc.o x25_forward.o
+x25-$(CONFIG_SYSCTL) += sysctl_net_x25.o
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
new file mode 100644
index 000000000..c3ab230e4
--- /dev/null
+++ b/net/x25/af_x25.c
@@ -0,0 +1,1850 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor Centralised disconnect handling.
+ * New timer architecture.
+ * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant.
+ * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of
+ * facilities negotiation and increased
+ * the throughput upper limit.
+ * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups
+ * 2000-09-04 Henner Eisen Set sock->state in x25_accept().
+ * Fixed x25_output() related skb leakage.
+ * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket.
+ * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation.
+ * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN
+ * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
+ * x25_proc.c, using seq_file
+ * 2005-04-02 Shaun Pereira Selective sub address matching
+ * with call user data
+ * 2005-04-15 Shaun Pereira Fast select with no restriction on
+ * response
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h> /* For TIOCINQ/OUTQ */
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
+
+#include <net/x25.h>
+#include <net/compat.h>
+
+int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20;
+int sysctl_x25_call_request_timeout = X25_DEFAULT_T21;
+int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22;
+int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23;
+int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2;
+int sysctl_x25_forward = 0;
+
+HLIST_HEAD(x25_list);
+DEFINE_RWLOCK(x25_list_lock);
+
+static const struct proto_ops x25_proto_ops;
+
+static struct x25_address null_x25_address = {" "};
+
+#ifdef CONFIG_COMPAT
+struct compat_x25_subscrip_struct {
+ char device[200-sizeof(compat_ulong_t)];
+ compat_ulong_t global_facil_mask;
+ compat_uint_t extended;
+};
+#endif
+
+
+int x25_parse_address_block(struct sk_buff *skb,
+ struct x25_address *called_addr,
+ struct x25_address *calling_addr)
+{
+ unsigned char len;
+ int needed;
+ int rc;
+
+ if (!pskb_may_pull(skb, 1)) {
+ /* packet has no address block */
+ rc = 0;
+ goto empty;
+ }
+
+ len = *skb->data;
+ needed = 1 + (len >> 4) + (len & 0x0f);
+
+ if (!pskb_may_pull(skb, needed)) {
+ /* packet is too short to hold the addresses it claims
+ to hold */
+ rc = -1;
+ goto empty;
+ }
+
+ return x25_addr_ntoa(skb->data, called_addr, calling_addr);
+
+empty:
+ *called_addr->x25_addr = 0;
+ *calling_addr->x25_addr = 0;
+
+ return rc;
+}
+
+
+int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr,
+ struct x25_address *calling_addr)
+{
+ unsigned int called_len, calling_len;
+ char *called, *calling;
+ unsigned int i;
+
+ called_len = (*p >> 0) & 0x0F;
+ calling_len = (*p >> 4) & 0x0F;
+
+ called = called_addr->x25_addr;
+ calling = calling_addr->x25_addr;
+ p++;
+
+ for (i = 0; i < (called_len + calling_len); i++) {
+ if (i < called_len) {
+ if (i % 2 != 0) {
+ *called++ = ((*p >> 0) & 0x0F) + '0';
+ p++;
+ } else {
+ *called++ = ((*p >> 4) & 0x0F) + '0';
+ }
+ } else {
+ if (i % 2 != 0) {
+ *calling++ = ((*p >> 0) & 0x0F) + '0';
+ p++;
+ } else {
+ *calling++ = ((*p >> 4) & 0x0F) + '0';
+ }
+ }
+ }
+
+ *called = *calling = '\0';
+
+ return 1 + (called_len + calling_len + 1) / 2;
+}
+
+int x25_addr_aton(unsigned char *p, struct x25_address *called_addr,
+ struct x25_address *calling_addr)
+{
+ unsigned int called_len, calling_len;
+ char *called, *calling;
+ int i;
+
+ called = called_addr->x25_addr;
+ calling = calling_addr->x25_addr;
+
+ called_len = strlen(called);
+ calling_len = strlen(calling);
+
+ *p++ = (calling_len << 4) | (called_len << 0);
+
+ for (i = 0; i < (called_len + calling_len); i++) {
+ if (i < called_len) {
+ if (i % 2 != 0) {
+ *p |= (*called++ - '0') << 0;
+ p++;
+ } else {
+ *p = 0x00;
+ *p |= (*called++ - '0') << 4;
+ }
+ } else {
+ if (i % 2 != 0) {
+ *p |= (*calling++ - '0') << 0;
+ p++;
+ } else {
+ *p = 0x00;
+ *p |= (*calling++ - '0') << 4;
+ }
+ }
+ }
+
+ return 1 + (called_len + calling_len + 1) / 2;
+}
+
+/*
+ * Socket removal during an interrupt is now safe.
+ */
+static void x25_remove_socket(struct sock *sk)
+{
+ write_lock_bh(&x25_list_lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ * Kill all bound sockets on a dropped device.
+ */
+static void x25_kill_by_device(struct net_device *dev)
+{
+ struct sock *s;
+
+ write_lock_bh(&x25_list_lock);
+
+ sk_for_each(s, &x25_list)
+ if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev)
+ x25_disconnect(s, ENETUNREACH, 0, 0);
+
+ write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ * Handle device status changes.
+ */
+static int x25_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct x25_neigh *nb;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type == ARPHRD_X25
+#if IS_ENABLED(CONFIG_LLC)
+ || dev->type == ARPHRD_ETHER
+#endif
+ ) {
+ switch (event) {
+ case NETDEV_UP:
+ x25_link_device_up(dev);
+ break;
+ case NETDEV_GOING_DOWN:
+ nb = x25_get_neigh(dev);
+ if (nb) {
+ x25_terminate_link(nb);
+ x25_neigh_put(nb);
+ }
+ break;
+ case NETDEV_DOWN:
+ x25_kill_by_device(dev);
+ x25_route_device_down(dev);
+ x25_link_device_down(dev);
+ break;
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+/*
+ * Add a socket to the bound sockets list.
+ */
+static void x25_insert_socket(struct sock *sk)
+{
+ write_lock_bh(&x25_list_lock);
+ sk_add_node(sk, &x25_list);
+ write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ * Find a socket that wants to accept the Call Request we just
+ * received. Check the full list for an address/cud match.
+ * If no cuds match return the next_best thing, an address match.
+ * Note: if a listening socket has cud set it must only get calls
+ * with matching cud.
+ */
+static struct sock *x25_find_listener(struct x25_address *addr,
+ struct sk_buff *skb)
+{
+ struct sock *s;
+ struct sock *next_best;
+
+ read_lock_bh(&x25_list_lock);
+ next_best = NULL;
+
+ sk_for_each(s, &x25_list)
+ if ((!strcmp(addr->x25_addr,
+ x25_sk(s)->source_addr.x25_addr) ||
+ !strcmp(addr->x25_addr,
+ null_x25_address.x25_addr)) &&
+ s->sk_state == TCP_LISTEN) {
+ /*
+ * Found a listening socket, now check the incoming
+ * call user data vs this sockets call user data
+ */
+ if (x25_sk(s)->cudmatchlength > 0 &&
+ skb->len >= x25_sk(s)->cudmatchlength) {
+ if((memcmp(x25_sk(s)->calluserdata.cuddata,
+ skb->data,
+ x25_sk(s)->cudmatchlength)) == 0) {
+ sock_hold(s);
+ goto found;
+ }
+ } else
+ next_best = s;
+ }
+ if (next_best) {
+ s = next_best;
+ sock_hold(s);
+ goto found;
+ }
+ s = NULL;
+found:
+ read_unlock_bh(&x25_list_lock);
+ return s;
+}
+
+/*
+ * Find a connected X.25 socket given my LCI and neighbour.
+ */
+static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb)
+{
+ struct sock *s;
+
+ sk_for_each(s, &x25_list)
+ if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) {
+ sock_hold(s);
+ goto found;
+ }
+ s = NULL;
+found:
+ return s;
+}
+
+struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb)
+{
+ struct sock *s;
+
+ read_lock_bh(&x25_list_lock);
+ s = __x25_find_socket(lci, nb);
+ read_unlock_bh(&x25_list_lock);
+ return s;
+}
+
+/*
+ * Find a unique LCI for a given device.
+ */
+static unsigned int x25_new_lci(struct x25_neigh *nb)
+{
+ unsigned int lci = 1;
+ struct sock *sk;
+
+ read_lock_bh(&x25_list_lock);
+
+ while ((sk = __x25_find_socket(lci, nb)) != NULL) {
+ sock_put(sk);
+ if (++lci == 4096) {
+ lci = 0;
+ break;
+ }
+ }
+
+ read_unlock_bh(&x25_list_lock);
+ return lci;
+}
+
+/*
+ * Deferred destroy.
+ */
+static void __x25_destroy_socket(struct sock *);
+
+/*
+ * handler for deferred kills.
+ */
+static void x25_destroy_timer(unsigned long data)
+{
+ x25_destroy_socket_from_timer((struct sock *)data);
+}
+
+/*
+ * This is called from user mode and the timers. Thus it protects itself
+ * against interrupt users but doesn't worry about being called during
+ * work. Once it is removed from the queue no interrupt or bottom half
+ * will touch it and we are (fairly 8-) ) safe.
+ * Not static as it's used by the timer
+ */
+static void __x25_destroy_socket(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ x25_stop_heartbeat(sk);
+ x25_stop_timer(sk);
+
+ x25_remove_socket(sk);
+ x25_clear_queues(sk); /* Flush the queues */
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (skb->sk != sk) { /* A pending connection */
+ /*
+ * Queue the unaccepted socket for death
+ */
+ skb->sk->sk_state = TCP_LISTEN;
+ sock_set_flag(skb->sk, SOCK_DEAD);
+ x25_start_heartbeat(skb->sk);
+ x25_sk(skb->sk)->state = X25_STATE_0;
+ }
+
+ kfree_skb(skb);
+ }
+
+ if (sk_has_allocations(sk)) {
+ /* Defer: outstanding buffers */
+ sk->sk_timer.expires = jiffies + 10 * HZ;
+ sk->sk_timer.function = x25_destroy_timer;
+ sk->sk_timer.data = (unsigned long)sk;
+ add_timer(&sk->sk_timer);
+ } else {
+ /* drop last reference so sock_put will free */
+ __sock_put(sk);
+ }
+}
+
+void x25_destroy_socket_from_timer(struct sock *sk)
+{
+ sock_hold(sk);
+ bh_lock_sock(sk);
+ __x25_destroy_socket(sk);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * Handling for system calls applied via the various interfaces to a
+ * X.25 socket object.
+ */
+
+static int x25_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ int opt;
+ struct sock *sk = sock->sk;
+ int rc = -ENOPROTOOPT;
+
+ if (level != SOL_X25 || optname != X25_QBITINCL)
+ goto out;
+
+ rc = -EINVAL;
+ if (optlen < sizeof(int))
+ goto out;
+
+ rc = -EFAULT;
+ if (get_user(opt, (int __user *)optval))
+ goto out;
+
+ if (opt)
+ set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
+ else
+ clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
+ rc = 0;
+out:
+ return rc;
+}
+
+static int x25_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int val, len, rc = -ENOPROTOOPT;
+
+ if (level != SOL_X25 || optname != X25_QBITINCL)
+ goto out;
+
+ rc = -EFAULT;
+ if (get_user(len, optlen))
+ goto out;
+
+ len = min_t(unsigned int, len, sizeof(int));
+
+ rc = -EINVAL;
+ if (len < 0)
+ goto out;
+
+ rc = -EFAULT;
+ if (put_user(len, optlen))
+ goto out;
+
+ val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
+ rc = copy_to_user(optval, &val, len) ? -EFAULT : 0;
+out:
+ return rc;
+}
+
+static int x25_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int rc = -EOPNOTSUPP;
+
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN) {
+ memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ rc = 0;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
+static struct proto x25_proto = {
+ .name = "X25",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct x25_sock),
+};
+
+static struct sock *x25_alloc_socket(struct net *net)
+{
+ struct x25_sock *x25;
+ struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto);
+
+ if (!sk)
+ goto out;
+
+ sock_init_data(NULL, sk);
+
+ x25 = x25_sk(sk);
+ skb_queue_head_init(&x25->ack_queue);
+ skb_queue_head_init(&x25->fragment_queue);
+ skb_queue_head_init(&x25->interrupt_in_queue);
+ skb_queue_head_init(&x25->interrupt_out_queue);
+out:
+ return sk;
+}
+
+static int x25_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct x25_sock *x25;
+ int rc = -EAFNOSUPPORT;
+
+ if (!net_eq(net, &init_net))
+ goto out;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_SEQPACKET)
+ goto out;
+
+ rc = -EINVAL;
+ if (protocol)
+ goto out;
+
+ rc = -ENOBUFS;
+ if ((sk = x25_alloc_socket(net)) == NULL)
+ goto out;
+
+ x25 = x25_sk(sk);
+
+ sock_init_data(sock, sk);
+
+ x25_init_timers(sk);
+
+ sock->ops = &x25_proto_ops;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = x25_backlog_rcv;
+
+ x25->t21 = sysctl_x25_call_request_timeout;
+ x25->t22 = sysctl_x25_reset_request_timeout;
+ x25->t23 = sysctl_x25_clear_request_timeout;
+ x25->t2 = sysctl_x25_ack_holdback_timeout;
+ x25->state = X25_STATE_0;
+ x25->cudmatchlength = 0;
+ set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */
+ /* on call accept */
+
+ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE;
+ x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
+ x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE;
+ x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE;
+ x25->facilities.throughput = 0; /* by default don't negotiate
+ throughput */
+ x25->facilities.reverse = X25_DEFAULT_REVERSE;
+ x25->dte_facilities.calling_len = 0;
+ x25->dte_facilities.called_len = 0;
+ memset(x25->dte_facilities.called_ae, '\0',
+ sizeof(x25->dte_facilities.called_ae));
+ memset(x25->dte_facilities.calling_ae, '\0',
+ sizeof(x25->dte_facilities.calling_ae));
+
+ rc = 0;
+out:
+ return rc;
+}
+
+static struct sock *x25_make_new(struct sock *osk)
+{
+ struct sock *sk = NULL;
+ struct x25_sock *x25, *ox25;
+
+ if (osk->sk_type != SOCK_SEQPACKET)
+ goto out;
+
+ if ((sk = x25_alloc_socket(sock_net(osk))) == NULL)
+ goto out;
+
+ x25 = x25_sk(sk);
+
+ sk->sk_type = osk->sk_type;
+ sk->sk_priority = osk->sk_priority;
+ sk->sk_protocol = osk->sk_protocol;
+ sk->sk_rcvbuf = osk->sk_rcvbuf;
+ sk->sk_sndbuf = osk->sk_sndbuf;
+ sk->sk_state = TCP_ESTABLISHED;
+ sk->sk_backlog_rcv = osk->sk_backlog_rcv;
+ sock_copy_flags(sk, osk);
+
+ ox25 = x25_sk(osk);
+ x25->t21 = ox25->t21;
+ x25->t22 = ox25->t22;
+ x25->t23 = ox25->t23;
+ x25->t2 = ox25->t2;
+ x25->flags = ox25->flags;
+ x25->facilities = ox25->facilities;
+ x25->dte_facilities = ox25->dte_facilities;
+ x25->cudmatchlength = ox25->cudmatchlength;
+
+ clear_bit(X25_INTERRUPT_FLAG, &x25->flags);
+ x25_init_timers(sk);
+out:
+ return sk;
+}
+
+static int x25_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25;
+
+ if (!sk)
+ return 0;
+
+ x25 = x25_sk(sk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+ switch (x25->state) {
+
+ case X25_STATE_0:
+ case X25_STATE_2:
+ x25_disconnect(sk, 0, 0, 0);
+ __x25_destroy_socket(sk);
+ goto out;
+
+ case X25_STATE_1:
+ case X25_STATE_3:
+ case X25_STATE_4:
+ x25_clear_queues(sk);
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25_start_t23timer(sk);
+ x25->state = X25_STATE_2;
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_set_flag(sk, SOCK_DESTROY);
+ break;
+ }
+
+ sock_orphan(sk);
+out:
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+
+static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
+ int len, i, rc = 0;
+
+ if (!sock_flag(sk, SOCK_ZAPPED) ||
+ addr_len != sizeof(struct sockaddr_x25) ||
+ addr->sx25_family != AF_X25) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ len = strlen(addr->sx25_addr.x25_addr);
+ for (i = 0; i < len; i++) {
+ if (!isdigit(addr->sx25_addr.x25_addr[i])) {
+ rc = -EINVAL;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ x25_sk(sk)->source_addr = addr->sx25_addr;
+ x25_insert_socket(sk);
+ sock_reset_flag(sk, SOCK_ZAPPED);
+ release_sock(sk);
+ SOCK_DEBUG(sk, "x25_bind: socket is bound\n");
+out:
+ return rc;
+}
+
+static int x25_wait_for_connection_establishment(struct sock *sk)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int rc;
+
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ for (;;) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ rc = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ rc = sock_error(sk);
+ if (rc) {
+ sk->sk_socket->state = SS_UNCONNECTED;
+ break;
+ }
+ rc = 0;
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
+ } else
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25 = x25_sk(sk);
+ struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
+ struct x25_route *rt;
+ int rc = 0;
+
+ lock_sock(sk);
+ if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+ sock->state = SS_CONNECTED;
+ goto out; /* Connect completed during a ERESTARTSYS event */
+ }
+
+ rc = -ECONNREFUSED;
+ if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+ sock->state = SS_UNCONNECTED;
+ goto out;
+ }
+
+ rc = -EISCONN; /* No reconnect on a seqpacket socket */
+ if (sk->sk_state == TCP_ESTABLISHED)
+ goto out;
+
+ sk->sk_state = TCP_CLOSE;
+ sock->state = SS_UNCONNECTED;
+
+ rc = -EINVAL;
+ if (addr_len != sizeof(struct sockaddr_x25) ||
+ addr->sx25_family != AF_X25)
+ goto out;
+
+ rc = -ENETUNREACH;
+ rt = x25_get_route(&addr->sx25_addr);
+ if (!rt)
+ goto out;
+
+ x25->neighbour = x25_get_neigh(rt->dev);
+ if (!x25->neighbour)
+ goto out_put_route;
+
+ x25_limit_facilities(&x25->facilities, x25->neighbour);
+
+ x25->lci = x25_new_lci(x25->neighbour);
+ if (!x25->lci)
+ goto out_put_neigh;
+
+ rc = -EINVAL;
+ if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+ goto out_put_neigh;
+
+ if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr))
+ memset(&x25->source_addr, '\0', X25_ADDR_LEN);
+
+ x25->dest_addr = addr->sx25_addr;
+
+ /* Move to connecting socket, start sending Connect Requests */
+ sock->state = SS_CONNECTING;
+ sk->sk_state = TCP_SYN_SENT;
+
+ x25->state = X25_STATE_1;
+
+ x25_write_internal(sk, X25_CALL_REQUEST);
+
+ x25_start_heartbeat(sk);
+ x25_start_t21timer(sk);
+
+ /* Now the loop */
+ rc = -EINPROGRESS;
+ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
+ goto out_put_neigh;
+
+ rc = x25_wait_for_connection_establishment(sk);
+ if (rc)
+ goto out_put_neigh;
+
+ sock->state = SS_CONNECTED;
+ rc = 0;
+out_put_neigh:
+ if (rc)
+ x25_neigh_put(x25->neighbour);
+out_put_route:
+ x25_route_put(rt);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int x25_wait_for_data(struct sock *sk, long timeout)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ int rc = 0;
+
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ for (;;) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ rc = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+ rc = -EAGAIN;
+ if (!timeout)
+ break;
+ rc = 0;
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+ } else
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sock *newsk;
+ struct sk_buff *skb;
+ int rc = -EINVAL;
+
+ if (!sk)
+ goto out;
+
+ rc = -EOPNOTSUPP;
+ if (sk->sk_type != SOCK_SEQPACKET)
+ goto out;
+
+ lock_sock(sk);
+ rc = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out2;
+
+ rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
+ if (rc)
+ goto out2;
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ rc = -EINVAL;
+ if (!skb->sk)
+ goto out2;
+ newsk = skb->sk;
+ sock_graft(newsk, newsock);
+
+ /* Now attach up the new socket */
+ skb->sk = NULL;
+ kfree_skb(skb);
+ sk->sk_ack_backlog--;
+ newsock->state = SS_CONNECTED;
+ rc = 0;
+out2:
+ release_sock(sk);
+out:
+ return rc;
+}
+
+static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr;
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25 = x25_sk(sk);
+ int rc = 0;
+
+ if (peer) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ sx25->sx25_addr = x25->dest_addr;
+ } else
+ sx25->sx25_addr = x25->source_addr;
+
+ sx25->sx25_family = AF_X25;
+ *uaddr_len = sizeof(*sx25);
+
+out:
+ return rc;
+}
+
+int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
+ unsigned int lci)
+{
+ struct sock *sk;
+ struct sock *make;
+ struct x25_sock *makex25;
+ struct x25_address source_addr, dest_addr;
+ struct x25_facilities facilities;
+ struct x25_dte_facilities dte_facilities;
+ int len, addr_len, rc;
+
+ /*
+ * Remove the LCI and frame type.
+ */
+ skb_pull(skb, X25_STD_MIN_LEN);
+
+ /*
+ * Extract the X.25 addresses and convert them to ASCII strings,
+ * and remove them.
+ *
+ * Address block is mandatory in call request packets
+ */
+ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr);
+ if (addr_len <= 0)
+ goto out_clear_request;
+ skb_pull(skb, addr_len);
+
+ /*
+ * Get the length of the facilities, skip past them for the moment
+ * get the call user data because this is needed to determine
+ * the correct listener
+ *
+ * Facilities length is mandatory in call request packets
+ */
+ if (!pskb_may_pull(skb, 1))
+ goto out_clear_request;
+ len = skb->data[0] + 1;
+ if (!pskb_may_pull(skb, len))
+ goto out_clear_request;
+ skb_pull(skb,len);
+
+ /*
+ * Ensure that the amount of call user data is valid.
+ */
+ if (skb->len > X25_MAX_CUD_LEN)
+ goto out_clear_request;
+
+ /*
+ * Get all the call user data so it can be used in
+ * x25_find_listener and skb_copy_from_linear_data up ahead.
+ */
+ if (!pskb_may_pull(skb, skb->len))
+ goto out_clear_request;
+
+ /*
+ * Find a listener for the particular address/cud pair.
+ */
+ sk = x25_find_listener(&source_addr,skb);
+ skb_push(skb,len);
+
+ if (sk != NULL && sk_acceptq_is_full(sk)) {
+ goto out_sock_put;
+ }
+
+ /*
+ * We dont have any listeners for this incoming call.
+ * Try forwarding it.
+ */
+ if (sk == NULL) {
+ skb_push(skb, addr_len + X25_STD_MIN_LEN);
+ if (sysctl_x25_forward &&
+ x25_forward_call(&dest_addr, nb, skb, lci) > 0)
+ {
+ /* Call was forwarded, dont process it any more */
+ kfree_skb(skb);
+ rc = 1;
+ goto out;
+ } else {
+ /* No listeners, can't forward, clear the call */
+ goto out_clear_request;
+ }
+ }
+
+ /*
+ * Try to reach a compromise on the requested facilities.
+ */
+ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities);
+ if (len == -1)
+ goto out_sock_put;
+
+ /*
+ * current neighbour/link might impose additional limits
+ * on certain facilties
+ */
+
+ x25_limit_facilities(&facilities, nb);
+
+ /*
+ * Try to create a new socket.
+ */
+ make = x25_make_new(sk);
+ if (!make)
+ goto out_sock_put;
+
+ /*
+ * Remove the facilities
+ */
+ skb_pull(skb, len);
+
+ skb->sk = make;
+ make->sk_state = TCP_ESTABLISHED;
+
+ makex25 = x25_sk(make);
+ makex25->lci = lci;
+ makex25->dest_addr = dest_addr;
+ makex25->source_addr = source_addr;
+ makex25->neighbour = nb;
+ makex25->facilities = facilities;
+ makex25->dte_facilities= dte_facilities;
+ makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
+ /* ensure no reverse facil on accept */
+ makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+ /* ensure no calling address extension on accept */
+ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE;
+ makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
+
+ /* Normally all calls are accepted immediately */
+ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) {
+ x25_write_internal(make, X25_CALL_ACCEPTED);
+ makex25->state = X25_STATE_3;
+ }
+
+ /*
+ * Incoming Call User Data.
+ */
+ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
+ makex25->calluserdata.cudlength = skb->len;
+
+ sk->sk_ack_backlog++;
+
+ x25_insert_socket(make);
+
+ skb_queue_head(&sk->sk_receive_queue, skb);
+
+ x25_start_heartbeat(make);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+ rc = 1;
+ sock_put(sk);
+out:
+ return rc;
+out_sock_put:
+ sock_put(sk);
+out_clear_request:
+ rc = 0;
+ x25_transmit_clear_request(nb, lci, 0x01);
+ goto out;
+}
+
+static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25 = x25_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name);
+ struct sockaddr_x25 sx25;
+ struct sk_buff *skb;
+ unsigned char *asmptr;
+ int noblock = msg->msg_flags & MSG_DONTWAIT;
+ size_t size;
+ int qbit = 0, rc = -EINVAL;
+
+ lock_sock(sk);
+ if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT))
+ goto out;
+
+ /* we currently don't support segmented records at the user interface */
+ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB)))
+ goto out;
+
+ rc = -EADDRNOTAVAIL;
+ if (sock_flag(sk, SOCK_ZAPPED))
+ goto out;
+
+ rc = -EPIPE;
+ if (sk->sk_shutdown & SEND_SHUTDOWN) {
+ send_sig(SIGPIPE, current, 0);
+ goto out;
+ }
+
+ rc = -ENETUNREACH;
+ if (!x25->neighbour)
+ goto out;
+
+ if (usx25) {
+ rc = -EINVAL;
+ if (msg->msg_namelen < sizeof(sx25))
+ goto out;
+ memcpy(&sx25, usx25, sizeof(sx25));
+ rc = -EISCONN;
+ if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr))
+ goto out;
+ rc = -EINVAL;
+ if (sx25.sx25_family != AF_X25)
+ goto out;
+ } else {
+ /*
+ * FIXME 1003.1g - if the socket is like this because
+ * it has become closed (not started closed) we ought
+ * to SIGPIPE, EPIPE;
+ */
+ rc = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ sx25.sx25_family = AF_X25;
+ sx25.sx25_addr = x25->dest_addr;
+ }
+
+ /* Sanity check the packet size */
+ if (len > 65535) {
+ rc = -EMSGSIZE;
+ goto out;
+ }
+
+ SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n");
+
+ /* Build a packet */
+ SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n");
+
+ if ((msg->msg_flags & MSG_OOB) && len > 32)
+ len = 32;
+
+ size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN;
+
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ lock_sock(sk);
+ if (!skb)
+ goto out;
+ X25_SKB_CB(skb)->flags = msg->msg_flags;
+
+ skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);
+
+ /*
+ * Put the data on the end
+ */
+ SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n");
+
+ skb_reset_transport_header(skb);
+ skb_put(skb, len);
+
+ rc = memcpy_from_msg(skb_transport_header(skb), msg, len);
+ if (rc)
+ goto out_kfree_skb;
+
+ /*
+ * If the Q BIT Include socket option is in force, the first
+ * byte of the user data is the logical value of the Q Bit.
+ */
+ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+ if (!pskb_may_pull(skb, 1))
+ goto out_kfree_skb;
+
+ qbit = skb->data[0];
+ skb_pull(skb, 1);
+ }
+
+ /*
+ * Push down the X.25 header
+ */
+ SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n");
+
+ if (msg->msg_flags & MSG_OOB) {
+ if (x25->neighbour->extended) {
+ asmptr = skb_push(skb, X25_STD_MIN_LEN);
+ *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
+ *asmptr++ = (x25->lci >> 0) & 0xFF;
+ *asmptr++ = X25_INTERRUPT;
+ } else {
+ asmptr = skb_push(skb, X25_STD_MIN_LEN);
+ *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
+ *asmptr++ = (x25->lci >> 0) & 0xFF;
+ *asmptr++ = X25_INTERRUPT;
+ }
+ } else {
+ if (x25->neighbour->extended) {
+ /* Build an Extended X.25 header */
+ asmptr = skb_push(skb, X25_EXT_MIN_LEN);
+ *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
+ *asmptr++ = (x25->lci >> 0) & 0xFF;
+ *asmptr++ = X25_DATA;
+ *asmptr++ = X25_DATA;
+ } else {
+ /* Build an Standard X.25 header */
+ asmptr = skb_push(skb, X25_STD_MIN_LEN);
+ *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
+ *asmptr++ = (x25->lci >> 0) & 0xFF;
+ *asmptr++ = X25_DATA;
+ }
+
+ if (qbit)
+ skb->data[0] |= X25_Q_BIT;
+ }
+
+ SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n");
+ SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n");
+
+ rc = -ENOTCONN;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out_kfree_skb;
+
+ if (msg->msg_flags & MSG_OOB)
+ skb_queue_tail(&x25->interrupt_out_queue, skb);
+ else {
+ rc = x25_output(sk, skb);
+ len = rc;
+ if (rc < 0)
+ kfree_skb(skb);
+ else if (test_bit(X25_Q_BIT_FLAG, &x25->flags))
+ len++;
+ }
+
+ x25_kick(sk);
+ rc = len;
+out:
+ release_sock(sk);
+ return rc;
+out_kfree_skb:
+ kfree_skb(skb);
+ goto out;
+}
+
+
+static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25 = x25_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name);
+ size_t copied;
+ int qbit, header_len;
+ struct sk_buff *skb;
+ unsigned char *asmptr;
+ int rc = -ENOTCONN;
+
+ lock_sock(sk);
+
+ if (x25->neighbour == NULL)
+ goto out;
+
+ header_len = x25->neighbour->extended ?
+ X25_EXT_MIN_LEN : X25_STD_MIN_LEN;
+
+ /*
+ * This works for seqpacket too. The receiver has ordered the queue for
+ * us! We do one quick check first though
+ */
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ if (flags & MSG_OOB) {
+ rc = -EINVAL;
+ if (sock_flag(sk, SOCK_URGINLINE) ||
+ !skb_peek(&x25->interrupt_in_queue))
+ goto out;
+
+ skb = skb_dequeue(&x25->interrupt_in_queue);
+
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+ goto out_free_dgram;
+
+ skb_pull(skb, X25_STD_MIN_LEN);
+
+ /*
+ * No Q bit information on Interrupt data.
+ */
+ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+ asmptr = skb_push(skb, 1);
+ *asmptr = 0x00;
+ }
+
+ msg->msg_flags |= MSG_OOB;
+ } else {
+ /* Now we can treat all alike */
+ release_sock(sk);
+ skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+ flags & MSG_DONTWAIT, &rc);
+ lock_sock(sk);
+ if (!skb)
+ goto out;
+
+ if (!pskb_may_pull(skb, header_len))
+ goto out_free_dgram;
+
+ qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT;
+
+ skb_pull(skb, header_len);
+
+ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+ asmptr = skb_push(skb, 1);
+ *asmptr = qbit;
+ }
+ }
+
+ skb_reset_transport_header(skb);
+ copied = skb->len;
+
+ if (copied > size) {
+ copied = size;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ /* Currently, each datagram always contains a complete record */
+ msg->msg_flags |= MSG_EOR;
+
+ rc = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (rc)
+ goto out_free_dgram;
+
+ if (sx25) {
+ sx25->sx25_family = AF_X25;
+ sx25->sx25_addr = x25->dest_addr;
+ msg->msg_namelen = sizeof(*sx25);
+ }
+
+ x25_check_rbuf(sk);
+ rc = copied;
+out_free_dgram:
+ skb_free_datagram(sk, skb);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+
+static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ struct x25_sock *x25 = x25_sk(sk);
+ void __user *argp = (void __user *)arg;
+ int rc;
+
+ switch (cmd) {
+ case TIOCOUTQ: {
+ int amount;
+
+ amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+ if (amount < 0)
+ amount = 0;
+ rc = put_user(amount, (unsigned int __user *)argp);
+ break;
+ }
+
+ case TIOCINQ: {
+ struct sk_buff *skb;
+ int amount = 0;
+ /*
+ * These two are safe on a single CPU system as
+ * only user tasks fiddle here
+ */
+ lock_sock(sk);
+ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+ amount = skb->len;
+ release_sock(sk);
+ rc = put_user(amount, (unsigned int __user *)argp);
+ break;
+ }
+
+ case SIOCGSTAMP:
+ rc = -EINVAL;
+ if (sk)
+ rc = sock_get_timestamp(sk,
+ (struct timeval __user *)argp);
+ break;
+ case SIOCGSTAMPNS:
+ rc = -EINVAL;
+ if (sk)
+ rc = sock_get_timestampns(sk,
+ (struct timespec __user *)argp);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ rc = -EINVAL;
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ rc = x25_route_ioctl(cmd, argp);
+ break;
+ case SIOCX25GSUBSCRIP:
+ rc = x25_subscr_ioctl(cmd, argp);
+ break;
+ case SIOCX25SSUBSCRIP:
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ rc = x25_subscr_ioctl(cmd, argp);
+ break;
+ case SIOCX25GFACILITIES: {
+ lock_sock(sk);
+ rc = copy_to_user(argp, &x25->facilities,
+ sizeof(x25->facilities))
+ ? -EFAULT : 0;
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25SFACILITIES: {
+ struct x25_facilities facilities;
+ rc = -EFAULT;
+ if (copy_from_user(&facilities, argp, sizeof(facilities)))
+ break;
+ rc = -EINVAL;
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN &&
+ sk->sk_state != TCP_CLOSE)
+ goto out_fac_release;
+ if (facilities.pacsize_in < X25_PS16 ||
+ facilities.pacsize_in > X25_PS4096)
+ goto out_fac_release;
+ if (facilities.pacsize_out < X25_PS16 ||
+ facilities.pacsize_out > X25_PS4096)
+ goto out_fac_release;
+ if (facilities.winsize_in < 1 ||
+ facilities.winsize_in > 127)
+ goto out_fac_release;
+ if (facilities.throughput) {
+ int out = facilities.throughput & 0xf0;
+ int in = facilities.throughput & 0x0f;
+ if (!out)
+ facilities.throughput |=
+ X25_DEFAULT_THROUGHPUT << 4;
+ else if (out < 0x30 || out > 0xD0)
+ goto out_fac_release;
+ if (!in)
+ facilities.throughput |=
+ X25_DEFAULT_THROUGHPUT;
+ else if (in < 0x03 || in > 0x0D)
+ goto out_fac_release;
+ }
+ if (facilities.reverse &&
+ (facilities.reverse & 0x81) != 0x81)
+ goto out_fac_release;
+ x25->facilities = facilities;
+ rc = 0;
+out_fac_release:
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25GDTEFACILITIES: {
+ lock_sock(sk);
+ rc = copy_to_user(argp, &x25->dte_facilities,
+ sizeof(x25->dte_facilities));
+ release_sock(sk);
+ if (rc)
+ rc = -EFAULT;
+ break;
+ }
+
+ case SIOCX25SDTEFACILITIES: {
+ struct x25_dte_facilities dtefacs;
+ rc = -EFAULT;
+ if (copy_from_user(&dtefacs, argp, sizeof(dtefacs)))
+ break;
+ rc = -EINVAL;
+ lock_sock(sk);
+ if (sk->sk_state != TCP_LISTEN &&
+ sk->sk_state != TCP_CLOSE)
+ goto out_dtefac_release;
+ if (dtefacs.calling_len > X25_MAX_AE_LEN)
+ goto out_dtefac_release;
+ if (dtefacs.calling_ae == NULL)
+ goto out_dtefac_release;
+ if (dtefacs.called_len > X25_MAX_AE_LEN)
+ goto out_dtefac_release;
+ if (dtefacs.called_ae == NULL)
+ goto out_dtefac_release;
+ x25->dte_facilities = dtefacs;
+ rc = 0;
+out_dtefac_release:
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25GCALLUSERDATA: {
+ lock_sock(sk);
+ rc = copy_to_user(argp, &x25->calluserdata,
+ sizeof(x25->calluserdata))
+ ? -EFAULT : 0;
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25SCALLUSERDATA: {
+ struct x25_calluserdata calluserdata;
+
+ rc = -EFAULT;
+ if (copy_from_user(&calluserdata, argp, sizeof(calluserdata)))
+ break;
+ rc = -EINVAL;
+ if (calluserdata.cudlength > X25_MAX_CUD_LEN)
+ break;
+ lock_sock(sk);
+ x25->calluserdata = calluserdata;
+ release_sock(sk);
+ rc = 0;
+ break;
+ }
+
+ case SIOCX25GCAUSEDIAG: {
+ lock_sock(sk);
+ rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag))
+ ? -EFAULT : 0;
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25SCAUSEDIAG: {
+ struct x25_causediag causediag;
+ rc = -EFAULT;
+ if (copy_from_user(&causediag, argp, sizeof(causediag)))
+ break;
+ lock_sock(sk);
+ x25->causediag = causediag;
+ release_sock(sk);
+ rc = 0;
+ break;
+
+ }
+
+ case SIOCX25SCUDMATCHLEN: {
+ struct x25_subaddr sub_addr;
+ rc = -EINVAL;
+ lock_sock(sk);
+ if(sk->sk_state != TCP_CLOSE)
+ goto out_cud_release;
+ rc = -EFAULT;
+ if (copy_from_user(&sub_addr, argp,
+ sizeof(sub_addr)))
+ goto out_cud_release;
+ rc = -EINVAL;
+ if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+ goto out_cud_release;
+ x25->cudmatchlength = sub_addr.cudmatchlength;
+ rc = 0;
+out_cud_release:
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25CALLACCPTAPPRV: {
+ rc = -EINVAL;
+ lock_sock(sk);
+ if (sk->sk_state == TCP_CLOSE) {
+ clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);
+ rc = 0;
+ }
+ release_sock(sk);
+ break;
+ }
+
+ case SIOCX25SENDCALLACCPT: {
+ rc = -EINVAL;
+ lock_sock(sk);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out_sendcallaccpt_release;
+ /* must call accptapprv above */
+ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags))
+ goto out_sendcallaccpt_release;
+ x25_write_internal(sk, X25_CALL_ACCEPTED);
+ x25->state = X25_STATE_3;
+ rc = 0;
+out_sendcallaccpt_release:
+ release_sock(sk);
+ break;
+ }
+
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+
+ return rc;
+}
+
+static const struct net_proto_family x25_family_ops = {
+ .family = AF_X25,
+ .create = x25_create,
+ .owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_COMPAT
+static int compat_x25_subscr_ioctl(unsigned int cmd,
+ struct compat_x25_subscrip_struct __user *x25_subscr32)
+{
+ struct compat_x25_subscrip_struct x25_subscr;
+ struct x25_neigh *nb;
+ struct net_device *dev;
+ int rc = -EINVAL;
+
+ rc = -EFAULT;
+ if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32)))
+ goto out;
+
+ rc = -EINVAL;
+ dev = x25_dev_get(x25_subscr.device);
+ if (dev == NULL)
+ goto out;
+
+ nb = x25_get_neigh(dev);
+ if (nb == NULL)
+ goto out_dev_put;
+
+ dev_put(dev);
+
+ if (cmd == SIOCX25GSUBSCRIP) {
+ read_lock_bh(&x25_neigh_list_lock);
+ x25_subscr.extended = nb->extended;
+ x25_subscr.global_facil_mask = nb->global_facil_mask;
+ read_unlock_bh(&x25_neigh_list_lock);
+ rc = copy_to_user(x25_subscr32, &x25_subscr,
+ sizeof(*x25_subscr32)) ? -EFAULT : 0;
+ } else {
+ rc = -EINVAL;
+ if (x25_subscr.extended == 0 || x25_subscr.extended == 1) {
+ rc = 0;
+ write_lock_bh(&x25_neigh_list_lock);
+ nb->extended = x25_subscr.extended;
+ nb->global_facil_mask = x25_subscr.global_facil_mask;
+ write_unlock_bh(&x25_neigh_list_lock);
+ }
+ }
+ x25_neigh_put(nb);
+out:
+ return rc;
+out_dev_put:
+ dev_put(dev);
+ goto out;
+}
+
+static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ int rc = -ENOIOCTLCMD;
+
+ switch(cmd) {
+ case TIOCOUTQ:
+ case TIOCINQ:
+ rc = x25_ioctl(sock, cmd, (unsigned long)argp);
+ break;
+ case SIOCGSTAMP:
+ rc = -EINVAL;
+ if (sk)
+ rc = compat_sock_get_timestamp(sk,
+ (struct timeval __user*)argp);
+ break;
+ case SIOCGSTAMPNS:
+ rc = -EINVAL;
+ if (sk)
+ rc = compat_sock_get_timestampns(sk,
+ (struct timespec __user*)argp);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFMETRIC:
+ case SIOCSIFMETRIC:
+ rc = -EINVAL;
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ rc = x25_route_ioctl(cmd, argp);
+ break;
+ case SIOCX25GSUBSCRIP:
+ rc = compat_x25_subscr_ioctl(cmd, argp);
+ break;
+ case SIOCX25SSUBSCRIP:
+ rc = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
+ rc = compat_x25_subscr_ioctl(cmd, argp);
+ break;
+ case SIOCX25GFACILITIES:
+ case SIOCX25SFACILITIES:
+ case SIOCX25GDTEFACILITIES:
+ case SIOCX25SDTEFACILITIES:
+ case SIOCX25GCALLUSERDATA:
+ case SIOCX25SCALLUSERDATA:
+ case SIOCX25GCAUSEDIAG:
+ case SIOCX25SCAUSEDIAG:
+ case SIOCX25SCUDMATCHLEN:
+ case SIOCX25CALLACCPTAPPRV:
+ case SIOCX25SENDCALLACCPT:
+ rc = x25_ioctl(sock, cmd, (unsigned long)argp);
+ break;
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+ return rc;
+}
+#endif
+
+static const struct proto_ops x25_proto_ops = {
+ .family = AF_X25,
+ .owner = THIS_MODULE,
+ .release = x25_release,
+ .bind = x25_bind,
+ .connect = x25_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = x25_accept,
+ .getname = x25_getname,
+ .poll = datagram_poll,
+ .ioctl = x25_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_x25_ioctl,
+#endif
+ .listen = x25_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = x25_setsockopt,
+ .getsockopt = x25_getsockopt,
+ .sendmsg = x25_sendmsg,
+ .recvmsg = x25_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct packet_type x25_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_X25),
+ .func = x25_lapb_receive_frame,
+};
+
+static struct notifier_block x25_dev_notifier = {
+ .notifier_call = x25_device_event,
+};
+
+void x25_kill_by_neigh(struct x25_neigh *nb)
+{
+ struct sock *s;
+
+ write_lock_bh(&x25_list_lock);
+
+ sk_for_each(s, &x25_list)
+ if (x25_sk(s)->neighbour == nb)
+ x25_disconnect(s, ENETUNREACH, 0, 0);
+
+ write_unlock_bh(&x25_list_lock);
+
+ /* Remove any related forwards */
+ x25_clear_forward_by_dev(nb->dev);
+}
+
+static int __init x25_init(void)
+{
+ int rc = proto_register(&x25_proto, 0);
+
+ if (rc != 0)
+ goto out;
+
+ rc = sock_register(&x25_family_ops);
+ if (rc != 0)
+ goto out_proto;
+
+ dev_add_pack(&x25_packet_type);
+
+ rc = register_netdevice_notifier(&x25_dev_notifier);
+ if (rc != 0)
+ goto out_sock;
+
+ pr_info("Linux Version 0.2\n");
+
+ x25_register_sysctl();
+ rc = x25_proc_init();
+ if (rc != 0)
+ goto out_dev;
+out:
+ return rc;
+out_dev:
+ unregister_netdevice_notifier(&x25_dev_notifier);
+out_sock:
+ sock_unregister(AF_X25);
+out_proto:
+ proto_unregister(&x25_proto);
+ goto out;
+}
+module_init(x25_init);
+
+static void __exit x25_exit(void)
+{
+ x25_proc_exit();
+ x25_link_free();
+ x25_route_free();
+
+ x25_unregister_sysctl();
+
+ unregister_netdevice_notifier(&x25_dev_notifier);
+
+ dev_remove_pack(&x25_packet_type);
+
+ sock_unregister(AF_X25);
+ proto_unregister(&x25_proto);
+}
+module_exit(x25_exit);
+
+MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_X25);
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
new file mode 100644
index 000000000..43239527a
--- /dev/null
+++ b/net/x25/sysctl_net_x25.c
@@ -0,0 +1,84 @@
+/* -*- linux-c -*-
+ * sysctl_net_x25.c: sysctl interface to net X.25 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/x25 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/sysctl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <net/x25.h>
+
+static int min_timer[] = { 1 * HZ };
+static int max_timer[] = { 300 * HZ };
+
+static struct ctl_table_header *x25_table_header;
+
+static struct ctl_table x25_table[] = {
+ {
+ .procname = "restart_request_timeout",
+ .data = &sysctl_x25_restart_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer,
+ },
+ {
+ .procname = "call_request_timeout",
+ .data = &sysctl_x25_call_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer,
+ },
+ {
+ .procname = "reset_request_timeout",
+ .data = &sysctl_x25_reset_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer,
+ },
+ {
+ .procname = "clear_request_timeout",
+ .data = &sysctl_x25_clear_request_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer,
+ },
+ {
+ .procname = "acknowledgement_hold_back_timeout",
+ .data = &sysctl_x25_ack_holdback_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_timer,
+ .extra2 = &max_timer,
+ },
+ {
+ .procname = "x25_forward",
+ .data = &sysctl_x25_forward,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { 0, },
+};
+
+void __init x25_register_sysctl(void)
+{
+ x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table);
+}
+
+void x25_unregister_sysctl(void)
+{
+ unregister_net_sysctl_table(x25_table_header);
+}
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
new file mode 100644
index 000000000..39231237e
--- /dev/null
+++ b/net/x25/x25_dev.c
@@ -0,0 +1,232 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine, randomly fail to work with new
+ * releases, misbehave and/or generally screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * 2000-09-04 Henner Eisen Prevent freeing a dangling skb.
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/if_arp.h>
+#include <net/x25.h>
+#include <net/x25device.h>
+
+static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
+{
+ struct sock *sk;
+ unsigned short frametype;
+ unsigned int lci;
+
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+ return 0;
+
+ frametype = skb->data[2];
+ lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+
+ /*
+ * LCI of zero is always for us, and its always a link control
+ * frame.
+ */
+ if (lci == 0) {
+ x25_link_control(skb, nb, frametype);
+ return 0;
+ }
+
+ /*
+ * Find an existing socket.
+ */
+ if ((sk = x25_find_socket(lci, nb)) != NULL) {
+ int queued = 1;
+
+ skb_reset_transport_header(skb);
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ queued = x25_process_rx_frame(sk, skb);
+ } else {
+ queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ return queued;
+ }
+
+ /*
+ * Is is a Call Request ? if so process it.
+ */
+ if (frametype == X25_CALL_REQUEST)
+ return x25_rx_call_request(skb, nb, lci);
+
+ /*
+ * Its not a Call Request, nor is it a control frame.
+ * Can we forward it?
+ */
+
+ if (x25_forward_data(lci, nb, skb)) {
+ if (frametype == X25_CLEAR_CONFIRMATION) {
+ x25_clear_forward_by_lci(lci);
+ }
+ kfree_skb(skb);
+ return 1;
+ }
+
+/*
+ x25_transmit_clear_request(nb, lci, 0x0D);
+*/
+
+ if (frametype != X25_CLEAR_CONFIRMATION)
+ pr_debug("x25_receive_data(): unknown frame type %2x\n",frametype);
+
+ return 0;
+}
+
+int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *ptype, struct net_device *orig_dev)
+{
+ struct sk_buff *nskb;
+ struct x25_neigh *nb;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ goto drop;
+
+ nskb = skb_copy(skb, GFP_ATOMIC);
+ if (!nskb)
+ goto drop;
+ kfree_skb(skb);
+ skb = nskb;
+
+ /*
+ * Packet received from unrecognised device, throw it away.
+ */
+ nb = x25_get_neigh(dev);
+ if (!nb) {
+ pr_debug("unknown neighbour - %s\n", dev->name);
+ goto drop;
+ }
+
+ if (!pskb_may_pull(skb, 1))
+ return 0;
+
+ switch (skb->data[0]) {
+
+ case X25_IFACE_DATA:
+ skb_pull(skb, 1);
+ if (x25_receive_data(skb, nb)) {
+ x25_neigh_put(nb);
+ goto out;
+ }
+ break;
+
+ case X25_IFACE_CONNECT:
+ x25_link_established(nb);
+ break;
+
+ case X25_IFACE_DISCONNECT:
+ x25_link_terminated(nb);
+ break;
+ }
+ x25_neigh_put(nb);
+drop:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+void x25_establish_link(struct x25_neigh *nb)
+{
+ struct sk_buff *skb;
+ unsigned char *ptr;
+
+ switch (nb->dev->type) {
+ case ARPHRD_X25:
+ if ((skb = alloc_skb(1, GFP_ATOMIC)) == NULL) {
+ pr_err("x25_dev: out of memory\n");
+ return;
+ }
+ ptr = skb_put(skb, 1);
+ *ptr = X25_IFACE_CONNECT;
+ break;
+
+#if IS_ENABLED(CONFIG_LLC)
+ case ARPHRD_ETHER:
+ return;
+#endif
+ default:
+ return;
+ }
+
+ skb->protocol = htons(ETH_P_X25);
+ skb->dev = nb->dev;
+
+ dev_queue_xmit(skb);
+}
+
+void x25_terminate_link(struct x25_neigh *nb)
+{
+ struct sk_buff *skb;
+ unsigned char *ptr;
+
+#if IS_ENABLED(CONFIG_LLC)
+ if (nb->dev->type == ARPHRD_ETHER)
+ return;
+#endif
+ if (nb->dev->type != ARPHRD_X25)
+ return;
+
+ skb = alloc_skb(1, GFP_ATOMIC);
+ if (!skb) {
+ pr_err("x25_dev: out of memory\n");
+ return;
+ }
+
+ ptr = skb_put(skb, 1);
+ *ptr = X25_IFACE_DISCONNECT;
+
+ skb->protocol = htons(ETH_P_X25);
+ skb->dev = nb->dev;
+ dev_queue_xmit(skb);
+}
+
+void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb)
+{
+ unsigned char *dptr;
+
+ skb_reset_network_header(skb);
+
+ switch (nb->dev->type) {
+ case ARPHRD_X25:
+ dptr = skb_push(skb, 1);
+ *dptr = X25_IFACE_DATA;
+ break;
+
+#if IS_ENABLED(CONFIG_LLC)
+ case ARPHRD_ETHER:
+ kfree_skb(skb);
+ return;
+#endif
+ default:
+ kfree_skb(skb);
+ return;
+ }
+
+ skb->protocol = htons(ETH_P_X25);
+ skb->dev = nb->dev;
+
+ dev_queue_xmit(skb);
+}
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
new file mode 100644
index 000000000..7ecd04c21
--- /dev/null
+++ b/net/x25/x25_facilities.c
@@ -0,0 +1,354 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Split from x25_subr.c
+ * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
+ * negotiation.
+ * apr/14/05 Shaun Pereira - Allow fast select with no restriction
+ * on response.
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+/**
+ * x25_parse_facilities - Parse facilities from skb into the facilities structs
+ *
+ * @skb: sk_buff to parse
+ * @facilities: Regular facilities, updated as facilities are found
+ * @dte_facs: ITU DTE facilities, updated as DTE facilities are found
+ * @vc_fac_mask: mask is updated with all facilities found
+ *
+ * Return codes:
+ * -1 - Parsing error, caller should drop call and clean up
+ * 0 - Parse OK, this skb has no facilities
+ * >0 - Parse OK, returns the length of the facilities header
+ *
+ */
+int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
+ struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
+{
+ unsigned char *p;
+ unsigned int len;
+
+ *vc_fac_mask = 0;
+
+ /*
+ * The kernel knows which facilities were set on an incoming call but
+ * currently this information is not available to userspace. Here we
+ * give userspace who read incoming call facilities 0 length to indicate
+ * it wasn't set.
+ */
+ dte_facs->calling_len = 0;
+ dte_facs->called_len = 0;
+ memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae));
+ memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae));
+
+ if (!pskb_may_pull(skb, 1))
+ return 0;
+
+ len = skb->data[0];
+
+ if (!pskb_may_pull(skb, 1 + len))
+ return -1;
+
+ p = skb->data + 1;
+
+ while (len > 0) {
+ switch (*p & X25_FAC_CLASS_MASK) {
+ case X25_FAC_CLASS_A:
+ if (len < 2)
+ return -1;
+ switch (*p) {
+ case X25_FAC_REVERSE:
+ if((p[1] & 0x81) == 0x81) {
+ facilities->reverse = p[1] & 0x81;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if((p[1] & 0x01) == 0x01) {
+ facilities->reverse = p[1] & 0x01;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if((p[1] & 0x80) == 0x80) {
+ facilities->reverse = p[1] & 0x80;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if(p[1] == 0x00) {
+ facilities->reverse
+ = X25_DEFAULT_REVERSE;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ case X25_FAC_THROUGHPUT:
+ facilities->throughput = p[1];
+ *vc_fac_mask |= X25_MASK_THROUGHPUT;
+ break;
+ case X25_MARKER:
+ break;
+ default:
+ pr_debug("unknown facility "
+ "%02X, value %02X\n",
+ p[0], p[1]);
+ break;
+ }
+ p += 2;
+ len -= 2;
+ break;
+ case X25_FAC_CLASS_B:
+ if (len < 3)
+ return -1;
+ switch (*p) {
+ case X25_FAC_PACKET_SIZE:
+ facilities->pacsize_in = p[1];
+ facilities->pacsize_out = p[2];
+ *vc_fac_mask |= X25_MASK_PACKET_SIZE;
+ break;
+ case X25_FAC_WINDOW_SIZE:
+ facilities->winsize_in = p[1];
+ facilities->winsize_out = p[2];
+ *vc_fac_mask |= X25_MASK_WINDOW_SIZE;
+ break;
+ default:
+ pr_debug("unknown facility "
+ "%02X, values %02X, %02X\n",
+ p[0], p[1], p[2]);
+ break;
+ }
+ p += 3;
+ len -= 3;
+ break;
+ case X25_FAC_CLASS_C:
+ if (len < 4)
+ return -1;
+ pr_debug("unknown facility %02X, "
+ "values %02X, %02X, %02X\n",
+ p[0], p[1], p[2], p[3]);
+ p += 4;
+ len -= 4;
+ break;
+ case X25_FAC_CLASS_D:
+ if (len < p[1] + 2)
+ return -1;
+ switch (*p) {
+ case X25_FAC_CALLING_AE:
+ if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
+ return -1;
+ if (p[2] > X25_MAX_AE_LEN)
+ return -1;
+ dte_facs->calling_len = p[2];
+ memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
+ *vc_fac_mask |= X25_MASK_CALLING_AE;
+ break;
+ case X25_FAC_CALLED_AE:
+ if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
+ return -1;
+ if (p[2] > X25_MAX_AE_LEN)
+ return -1;
+ dte_facs->called_len = p[2];
+ memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
+ *vc_fac_mask |= X25_MASK_CALLED_AE;
+ break;
+ default:
+ pr_debug("unknown facility %02X,"
+ "length %d\n", p[0], p[1]);
+ break;
+ }
+ len -= p[1] + 2;
+ p += p[1] + 2;
+ break;
+ }
+ }
+
+ return p - skb->data;
+}
+
+/*
+ * Create a set of facilities.
+ */
+int x25_create_facilities(unsigned char *buffer,
+ struct x25_facilities *facilities,
+ struct x25_dte_facilities *dte_facs, unsigned long facil_mask)
+{
+ unsigned char *p = buffer + 1;
+ int len;
+
+ if (!facil_mask) {
+ /*
+ * Length of the facilities field in call_req or
+ * call_accept packets
+ */
+ buffer[0] = 0;
+ len = 1; /* 1 byte for the length field */
+ return len;
+ }
+
+ if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
+ *p++ = X25_FAC_REVERSE;
+ *p++ = facilities->reverse;
+ }
+
+ if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
+ *p++ = X25_FAC_THROUGHPUT;
+ *p++ = facilities->throughput;
+ }
+
+ if ((facilities->pacsize_in || facilities->pacsize_out) &&
+ (facil_mask & X25_MASK_PACKET_SIZE)) {
+ *p++ = X25_FAC_PACKET_SIZE;
+ *p++ = facilities->pacsize_in ? : facilities->pacsize_out;
+ *p++ = facilities->pacsize_out ? : facilities->pacsize_in;
+ }
+
+ if ((facilities->winsize_in || facilities->winsize_out) &&
+ (facil_mask & X25_MASK_WINDOW_SIZE)) {
+ *p++ = X25_FAC_WINDOW_SIZE;
+ *p++ = facilities->winsize_in ? : facilities->winsize_out;
+ *p++ = facilities->winsize_out ? : facilities->winsize_in;
+ }
+
+ if (facil_mask & (X25_MASK_CALLING_AE|X25_MASK_CALLED_AE)) {
+ *p++ = X25_MARKER;
+ *p++ = X25_DTE_SERVICES;
+ }
+
+ if (dte_facs->calling_len && (facil_mask & X25_MASK_CALLING_AE)) {
+ unsigned int bytecount = (dte_facs->calling_len + 1) >> 1;
+ *p++ = X25_FAC_CALLING_AE;
+ *p++ = 1 + bytecount;
+ *p++ = dte_facs->calling_len;
+ memcpy(p, dte_facs->calling_ae, bytecount);
+ p += bytecount;
+ }
+
+ if (dte_facs->called_len && (facil_mask & X25_MASK_CALLED_AE)) {
+ unsigned int bytecount = (dte_facs->called_len % 2) ?
+ dte_facs->called_len / 2 + 1 :
+ dte_facs->called_len / 2;
+ *p++ = X25_FAC_CALLED_AE;
+ *p++ = 1 + bytecount;
+ *p++ = dte_facs->called_len;
+ memcpy(p, dte_facs->called_ae, bytecount);
+ p+=bytecount;
+ }
+
+ len = p - buffer;
+ buffer[0] = len - 1;
+
+ return len;
+}
+
+/*
+ * Try to reach a compromise on a set of facilities.
+ *
+ * The only real problem is with reverse charging.
+ */
+int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
+ struct x25_facilities *new, struct x25_dte_facilities *dte)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+ struct x25_facilities *ours = &x25->facilities;
+ struct x25_facilities theirs;
+ int len;
+
+ memset(&theirs, 0, sizeof(theirs));
+ memcpy(new, ours, sizeof(*new));
+
+ len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask);
+ if (len < 0)
+ return len;
+
+ /*
+ * They want reverse charging, we won't accept it.
+ */
+ if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
+ SOCK_DEBUG(sk, "X.25: rejecting reverse charging request\n");
+ return -1;
+ }
+
+ new->reverse = theirs.reverse;
+
+ if (theirs.throughput) {
+ int theirs_in = theirs.throughput & 0x0f;
+ int theirs_out = theirs.throughput & 0xf0;
+ int ours_in = ours->throughput & 0x0f;
+ int ours_out = ours->throughput & 0xf0;
+ if (!ours_in || theirs_in < ours_in) {
+ SOCK_DEBUG(sk, "X.25: inbound throughput negotiated\n");
+ new->throughput = (new->throughput & 0xf0) | theirs_in;
+ }
+ if (!ours_out || theirs_out < ours_out) {
+ SOCK_DEBUG(sk,
+ "X.25: outbound throughput negotiated\n");
+ new->throughput = (new->throughput & 0x0f) | theirs_out;
+ }
+ }
+
+ if (theirs.pacsize_in && theirs.pacsize_out) {
+ if (theirs.pacsize_in < ours->pacsize_in) {
+ SOCK_DEBUG(sk, "X.25: packet size inwards negotiated down\n");
+ new->pacsize_in = theirs.pacsize_in;
+ }
+ if (theirs.pacsize_out < ours->pacsize_out) {
+ SOCK_DEBUG(sk, "X.25: packet size outwards negotiated down\n");
+ new->pacsize_out = theirs.pacsize_out;
+ }
+ }
+
+ if (theirs.winsize_in && theirs.winsize_out) {
+ if (theirs.winsize_in < ours->winsize_in) {
+ SOCK_DEBUG(sk, "X.25: window size inwards negotiated down\n");
+ new->winsize_in = theirs.winsize_in;
+ }
+ if (theirs.winsize_out < ours->winsize_out) {
+ SOCK_DEBUG(sk, "X.25: window size outwards negotiated down\n");
+ new->winsize_out = theirs.winsize_out;
+ }
+ }
+
+ return len;
+}
+
+/*
+ * Limit values of certain facilities according to the capability of the
+ * currently attached x25 link.
+ */
+void x25_limit_facilities(struct x25_facilities *facilities,
+ struct x25_neigh *nb)
+{
+
+ if (!nb->extended) {
+ if (facilities->winsize_in > 7) {
+ pr_debug("incoming winsize limited to 7\n");
+ facilities->winsize_in = 7;
+ }
+ if (facilities->winsize_out > 7) {
+ facilities->winsize_out = 7;
+ pr_debug("outgoing winsize limited to 7\n");
+ }
+ }
+}
diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c
new file mode 100644
index 000000000..cf561f161
--- /dev/null
+++ b/net/x25/x25_forward.c
@@ -0,0 +1,170 @@
+/*
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * 03-01-2007 Added forwarding for x.25 Andrew Hendry
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_forward_list);
+DEFINE_RWLOCK(x25_forward_list_lock);
+
+int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from,
+ struct sk_buff *skb, int lci)
+{
+ struct x25_route *rt;
+ struct x25_neigh *neigh_new = NULL;
+ struct list_head *entry;
+ struct x25_forward *x25_frwd, *new_frwd;
+ struct sk_buff *skbn;
+ short same_lci = 0;
+ int rc = 0;
+
+ if ((rt = x25_get_route(dest_addr)) == NULL)
+ goto out_no_route;
+
+ if ((neigh_new = x25_get_neigh(rt->dev)) == NULL) {
+ /* This shouldn't happen, if it occurs somehow
+ * do something sensible
+ */
+ goto out_put_route;
+ }
+
+ /* Avoid a loop. This is the normal exit path for a
+ * system with only one x.25 iface and default route
+ */
+ if (rt->dev == from->dev) {
+ goto out_put_nb;
+ }
+
+ /* Remote end sending a call request on an already
+ * established LCI? It shouldn't happen, just in case..
+ */
+ read_lock_bh(&x25_forward_list_lock);
+ list_for_each(entry, &x25_forward_list) {
+ x25_frwd = list_entry(entry, struct x25_forward, node);
+ if (x25_frwd->lci == lci) {
+ pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n");
+ same_lci = 1;
+ }
+ }
+ read_unlock_bh(&x25_forward_list_lock);
+
+ /* Save the forwarding details for future traffic */
+ if (!same_lci){
+ if ((new_frwd = kmalloc(sizeof(struct x25_forward),
+ GFP_ATOMIC)) == NULL){
+ rc = -ENOMEM;
+ goto out_put_nb;
+ }
+ new_frwd->lci = lci;
+ new_frwd->dev1 = rt->dev;
+ new_frwd->dev2 = from->dev;
+ write_lock_bh(&x25_forward_list_lock);
+ list_add(&new_frwd->node, &x25_forward_list);
+ write_unlock_bh(&x25_forward_list_lock);
+ }
+
+ /* Forward the call request */
+ if ( (skbn = skb_clone(skb, GFP_ATOMIC)) == NULL){
+ goto out_put_nb;
+ }
+ x25_transmit_link(skbn, neigh_new);
+ rc = 1;
+
+
+out_put_nb:
+ x25_neigh_put(neigh_new);
+
+out_put_route:
+ x25_route_put(rt);
+
+out_no_route:
+ return rc;
+}
+
+
+int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) {
+
+ struct x25_forward *frwd;
+ struct list_head *entry;
+ struct net_device *peer = NULL;
+ struct x25_neigh *nb;
+ struct sk_buff *skbn;
+ int rc = 0;
+
+ read_lock_bh(&x25_forward_list_lock);
+ list_for_each(entry, &x25_forward_list) {
+ frwd = list_entry(entry, struct x25_forward, node);
+ if (frwd->lci == lci) {
+ /* The call is established, either side can send */
+ if (from->dev == frwd->dev1) {
+ peer = frwd->dev2;
+ } else {
+ peer = frwd->dev1;
+ }
+ break;
+ }
+ }
+ read_unlock_bh(&x25_forward_list_lock);
+
+ if ( (nb = x25_get_neigh(peer)) == NULL)
+ goto out;
+
+ if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){
+ goto output;
+
+ }
+ x25_transmit_link(skbn, nb);
+
+ rc = 1;
+output:
+ x25_neigh_put(nb);
+out:
+ return rc;
+}
+
+void x25_clear_forward_by_lci(unsigned int lci)
+{
+ struct x25_forward *fwd;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_forward_list_lock);
+
+ list_for_each_safe(entry, tmp, &x25_forward_list) {
+ fwd = list_entry(entry, struct x25_forward, node);
+ if (fwd->lci == lci) {
+ list_del(&fwd->node);
+ kfree(fwd);
+ }
+ }
+ write_unlock_bh(&x25_forward_list_lock);
+}
+
+
+void x25_clear_forward_by_dev(struct net_device *dev)
+{
+ struct x25_forward *fwd;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_forward_list_lock);
+
+ list_for_each_safe(entry, tmp, &x25_forward_list) {
+ fwd = list_entry(entry, struct x25_forward, node);
+ if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){
+ list_del(&fwd->node);
+ kfree(fwd);
+ }
+ }
+ write_unlock_bh(&x25_forward_list_lock);
+}
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
new file mode 100644
index 000000000..7ac50098a
--- /dev/null
+++ b/net/x25/x25_in.c
@@ -0,0 +1,419 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor Centralised disconnection code.
+ * New timer architecture.
+ * 2000-03-20 Daniela Squassoni Disabling/enabling of facilities
+ * negotiation.
+ * 2000-11-10 Henner Eisen Check and reset for out-of-sequence
+ * i-frames.
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
+{
+ struct sk_buff *skbo, *skbn = skb;
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (more) {
+ x25->fraglen += skb->len;
+ skb_queue_tail(&x25->fragment_queue, skb);
+ skb_set_owner_r(skb, sk);
+ return 0;
+ }
+
+ if (!more && x25->fraglen > 0) { /* End of fragment */
+ int len = x25->fraglen + skb->len;
+
+ if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL){
+ kfree_skb(skb);
+ return 1;
+ }
+
+ skb_queue_tail(&x25->fragment_queue, skb);
+
+ skb_reset_transport_header(skbn);
+
+ skbo = skb_dequeue(&x25->fragment_queue);
+ skb_copy_from_linear_data(skbo, skb_put(skbn, skbo->len),
+ skbo->len);
+ kfree_skb(skbo);
+
+ while ((skbo =
+ skb_dequeue(&x25->fragment_queue)) != NULL) {
+ skb_pull(skbo, (x25->neighbour->extended) ?
+ X25_EXT_MIN_LEN : X25_STD_MIN_LEN);
+ skb_copy_from_linear_data(skbo,
+ skb_put(skbn, skbo->len),
+ skbo->len);
+ kfree_skb(skbo);
+ }
+
+ x25->fraglen = 0;
+ }
+
+ skb_set_owner_r(skbn, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skbn);
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ return 0;
+}
+
+/*
+ * State machine for state 1, Awaiting Call Accepted State.
+ * The handling of the timer(s) is in file x25_timer.c.
+ * Handling of state 0 and connection release is in af_x25.c.
+ */
+static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct x25_address source_addr, dest_addr;
+ int len;
+ struct x25_sock *x25 = x25_sk(sk);
+
+ switch (frametype) {
+ case X25_CALL_ACCEPTED: {
+
+ x25_stop_timer(sk);
+ x25->condition = 0x00;
+ x25->vs = 0;
+ x25->va = 0;
+ x25->vr = 0;
+ x25->vl = 0;
+ x25->state = X25_STATE_3;
+ sk->sk_state = TCP_ESTABLISHED;
+ /*
+ * Parse the data in the frame.
+ */
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+ goto out_clear;
+ skb_pull(skb, X25_STD_MIN_LEN);
+
+ len = x25_parse_address_block(skb, &source_addr,
+ &dest_addr);
+ if (len > 0)
+ skb_pull(skb, len);
+ else if (len < 0)
+ goto out_clear;
+
+ len = x25_parse_facilities(skb, &x25->facilities,
+ &x25->dte_facilities,
+ &x25->vc_facil_mask);
+ if (len > 0)
+ skb_pull(skb, len);
+ else if (len < 0)
+ goto out_clear;
+ /*
+ * Copy any Call User Data.
+ */
+ if (skb->len > 0) {
+ if (skb->len > X25_MAX_CUD_LEN)
+ goto out_clear;
+
+ skb_copy_bits(skb, 0, x25->calluserdata.cuddata,
+ skb->len);
+ x25->calluserdata.cudlength = skb->len;
+ }
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ break;
+ }
+ case X25_CLEAR_REQUEST:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+ goto out_clear;
+
+ x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+ x25_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Clear Confirmation State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ */
+static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ switch (frametype) {
+
+ case X25_CLEAR_REQUEST:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+ goto out_clear;
+
+ x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+ x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ break;
+
+ case X25_CLEAR_CONFIRMATION:
+ x25_disconnect(sk, 0, 0, 0);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25_start_t23timer(sk);
+ return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ */
+static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
+{
+ int queued = 0;
+ int modulus;
+ struct x25_sock *x25 = x25_sk(sk);
+
+ modulus = (x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS;
+
+ switch (frametype) {
+
+ case X25_RESET_REQUEST:
+ x25_write_internal(sk, X25_RESET_CONFIRMATION);
+ x25_stop_timer(sk);
+ x25->condition = 0x00;
+ x25->vs = 0;
+ x25->vr = 0;
+ x25->va = 0;
+ x25->vl = 0;
+ x25_requeue_frames(sk);
+ break;
+
+ case X25_CLEAR_REQUEST:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+ goto out_clear;
+
+ x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+ x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ break;
+
+ case X25_RR:
+ case X25_RNR:
+ if (!x25_validate_nr(sk, nr)) {
+ x25_clear_queues(sk);
+ x25_write_internal(sk, X25_RESET_REQUEST);
+ x25_start_t22timer(sk);
+ x25->condition = 0x00;
+ x25->vs = 0;
+ x25->vr = 0;
+ x25->va = 0;
+ x25->vl = 0;
+ x25->state = X25_STATE_4;
+ } else {
+ x25_frames_acked(sk, nr);
+ if (frametype == X25_RNR) {
+ x25->condition |= X25_COND_PEER_RX_BUSY;
+ } else {
+ x25->condition &= ~X25_COND_PEER_RX_BUSY;
+ }
+ }
+ break;
+
+ case X25_DATA: /* XXX */
+ x25->condition &= ~X25_COND_PEER_RX_BUSY;
+ if ((ns != x25->vr) || !x25_validate_nr(sk, nr)) {
+ x25_clear_queues(sk);
+ x25_write_internal(sk, X25_RESET_REQUEST);
+ x25_start_t22timer(sk);
+ x25->condition = 0x00;
+ x25->vs = 0;
+ x25->vr = 0;
+ x25->va = 0;
+ x25->vl = 0;
+ x25->state = X25_STATE_4;
+ break;
+ }
+ x25_frames_acked(sk, nr);
+ if (ns == x25->vr) {
+ if (x25_queue_rx_frame(sk, skb, m) == 0) {
+ x25->vr = (x25->vr + 1) % modulus;
+ queued = 1;
+ } else {
+ /* Should never happen */
+ x25_clear_queues(sk);
+ x25_write_internal(sk, X25_RESET_REQUEST);
+ x25_start_t22timer(sk);
+ x25->condition = 0x00;
+ x25->vs = 0;
+ x25->vr = 0;
+ x25->va = 0;
+ x25->vl = 0;
+ x25->state = X25_STATE_4;
+ break;
+ }
+ if (atomic_read(&sk->sk_rmem_alloc) >
+ (sk->sk_rcvbuf >> 1))
+ x25->condition |= X25_COND_OWN_RX_BUSY;
+ }
+ /*
+ * If the window is full Ack it immediately, else
+ * start the holdback timer.
+ */
+ if (((x25->vl + x25->facilities.winsize_in) % modulus) == x25->vr) {
+ x25->condition &= ~X25_COND_ACK_PENDING;
+ x25_stop_timer(sk);
+ x25_enquiry_response(sk);
+ } else {
+ x25->condition |= X25_COND_ACK_PENDING;
+ x25_start_t2timer(sk);
+ }
+ break;
+
+ case X25_INTERRUPT_CONFIRMATION:
+ clear_bit(X25_INTERRUPT_FLAG, &x25->flags);
+ break;
+
+ case X25_INTERRUPT:
+ if (sock_flag(sk, SOCK_URGINLINE))
+ queued = !sock_queue_rcv_skb(sk, skb);
+ else {
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&x25->interrupt_in_queue, skb);
+ queued = 1;
+ }
+ sk_send_sigurg(sk);
+ x25_write_internal(sk, X25_INTERRUPT_CONFIRMATION);
+ break;
+
+ default:
+ pr_warn("unknown %02X in state 3\n", frametype);
+ break;
+ }
+
+ return queued;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
+}
+
+/*
+ * State machine for state 4, Awaiting Reset Confirmation State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ */
+static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ switch (frametype) {
+
+ case X25_RESET_REQUEST:
+ x25_write_internal(sk, X25_RESET_CONFIRMATION);
+ case X25_RESET_CONFIRMATION: {
+ x25_stop_timer(sk);
+ x25->condition = 0x00;
+ x25->va = 0;
+ x25->vr = 0;
+ x25->vs = 0;
+ x25->vl = 0;
+ x25->state = X25_STATE_3;
+ x25_requeue_frames(sk);
+ break;
+ }
+ case X25_CLEAR_REQUEST:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2))
+ goto out_clear;
+
+ x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+ x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+
+out_clear:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
+}
+
+/* Higher level upcall for a LAPB frame */
+int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+ int queued = 0, frametype, ns, nr, q, d, m;
+
+ if (x25->state == X25_STATE_0)
+ return 0;
+
+ frametype = x25_decode(sk, skb, &ns, &nr, &q, &d, &m);
+
+ switch (x25->state) {
+ case X25_STATE_1:
+ queued = x25_state1_machine(sk, skb, frametype);
+ break;
+ case X25_STATE_2:
+ queued = x25_state2_machine(sk, skb, frametype);
+ break;
+ case X25_STATE_3:
+ queued = x25_state3_machine(sk, skb, frametype, ns, nr, q, d, m);
+ break;
+ case X25_STATE_4:
+ queued = x25_state4_machine(sk, skb, frametype);
+ break;
+ }
+
+ x25_kick(sk);
+
+ return queued;
+}
+
+int x25_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int queued = x25_process_rx_frame(sk, skb);
+
+ if (!queued)
+ kfree_skb(skb);
+
+ return 0;
+}
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
new file mode 100644
index 000000000..fd5ffb258
--- /dev/null
+++ b/net/x25/x25_link.c
@@ -0,0 +1,411 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor New timer architecture.
+ * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
+ * negotiation.
+ * 2000-09-04 Henner Eisen dev_hold() / dev_put() for x25_neigh.
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_neigh_list);
+DEFINE_RWLOCK(x25_neigh_list_lock);
+
+static void x25_t20timer_expiry(unsigned long);
+
+static void x25_transmit_restart_confirmation(struct x25_neigh *nb);
+static void x25_transmit_restart_request(struct x25_neigh *nb);
+
+/*
+ * Linux set/reset timer routines
+ */
+static inline void x25_start_t20timer(struct x25_neigh *nb)
+{
+ mod_timer(&nb->t20timer, jiffies + nb->t20);
+}
+
+static void x25_t20timer_expiry(unsigned long param)
+{
+ struct x25_neigh *nb = (struct x25_neigh *)param;
+
+ x25_transmit_restart_request(nb);
+
+ x25_start_t20timer(nb);
+}
+
+static inline void x25_stop_t20timer(struct x25_neigh *nb)
+{
+ del_timer(&nb->t20timer);
+}
+
+static inline int x25_t20timer_pending(struct x25_neigh *nb)
+{
+ return timer_pending(&nb->t20timer);
+}
+
+/*
+ * This handles all restart and diagnostic frames.
+ */
+void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb,
+ unsigned short frametype)
+{
+ struct sk_buff *skbn;
+ int confirm;
+
+ switch (frametype) {
+ case X25_RESTART_REQUEST:
+ confirm = !x25_t20timer_pending(nb);
+ x25_stop_t20timer(nb);
+ nb->state = X25_LINK_STATE_3;
+ if (confirm)
+ x25_transmit_restart_confirmation(nb);
+ break;
+
+ case X25_RESTART_CONFIRMATION:
+ x25_stop_t20timer(nb);
+ nb->state = X25_LINK_STATE_3;
+ break;
+
+ case X25_DIAGNOSTIC:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 4))
+ break;
+
+ pr_warn("diagnostic #%d - %02X %02X %02X\n",
+ skb->data[3], skb->data[4],
+ skb->data[5], skb->data[6]);
+ break;
+
+ default:
+ pr_warn("received unknown %02X with LCI 000\n",
+ frametype);
+ break;
+ }
+
+ if (nb->state == X25_LINK_STATE_3)
+ while ((skbn = skb_dequeue(&nb->queue)) != NULL)
+ x25_send_frame(skbn, nb);
+}
+
+/*
+ * This routine is called when a Restart Request is needed
+ */
+static void x25_transmit_restart_request(struct x25_neigh *nb)
+{
+ unsigned char *dptr;
+ int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ skb_reserve(skb, X25_MAX_L2_LEN);
+
+ dptr = skb_put(skb, X25_STD_MIN_LEN + 2);
+
+ *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ;
+ *dptr++ = 0x00;
+ *dptr++ = X25_RESTART_REQUEST;
+ *dptr++ = 0x00;
+ *dptr++ = 0;
+
+ skb->sk = NULL;
+
+ x25_send_frame(skb, nb);
+}
+
+/*
+ * This routine is called when a Restart Confirmation is needed
+ */
+static void x25_transmit_restart_confirmation(struct x25_neigh *nb)
+{
+ unsigned char *dptr;
+ int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ skb_reserve(skb, X25_MAX_L2_LEN);
+
+ dptr = skb_put(skb, X25_STD_MIN_LEN);
+
+ *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ;
+ *dptr++ = 0x00;
+ *dptr++ = X25_RESTART_CONFIRMATION;
+
+ skb->sk = NULL;
+
+ x25_send_frame(skb, nb);
+}
+
+/*
+ * This routine is called when a Clear Request is needed outside of the context
+ * of a connected socket.
+ */
+void x25_transmit_clear_request(struct x25_neigh *nb, unsigned int lci,
+ unsigned char cause)
+{
+ unsigned char *dptr;
+ int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2;
+ struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+
+ if (!skb)
+ return;
+
+ skb_reserve(skb, X25_MAX_L2_LEN);
+
+ dptr = skb_put(skb, X25_STD_MIN_LEN + 2);
+
+ *dptr++ = ((lci >> 8) & 0x0F) | (nb->extended ?
+ X25_GFI_EXTSEQ :
+ X25_GFI_STDSEQ);
+ *dptr++ = (lci >> 0) & 0xFF;
+ *dptr++ = X25_CLEAR_REQUEST;
+ *dptr++ = cause;
+ *dptr++ = 0x00;
+
+ skb->sk = NULL;
+
+ x25_send_frame(skb, nb);
+}
+
+void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *nb)
+{
+ switch (nb->state) {
+ case X25_LINK_STATE_0:
+ skb_queue_tail(&nb->queue, skb);
+ nb->state = X25_LINK_STATE_1;
+ x25_establish_link(nb);
+ break;
+ case X25_LINK_STATE_1:
+ case X25_LINK_STATE_2:
+ skb_queue_tail(&nb->queue, skb);
+ break;
+ case X25_LINK_STATE_3:
+ x25_send_frame(skb, nb);
+ break;
+ }
+}
+
+/*
+ * Called when the link layer has become established.
+ */
+void x25_link_established(struct x25_neigh *nb)
+{
+ switch (nb->state) {
+ case X25_LINK_STATE_0:
+ nb->state = X25_LINK_STATE_2;
+ break;
+ case X25_LINK_STATE_1:
+ x25_transmit_restart_request(nb);
+ nb->state = X25_LINK_STATE_2;
+ x25_start_t20timer(nb);
+ break;
+ }
+}
+
+/*
+ * Called when the link layer has terminated, or an establishment
+ * request has failed.
+ */
+
+void x25_link_terminated(struct x25_neigh *nb)
+{
+ nb->state = X25_LINK_STATE_0;
+ /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */
+ x25_kill_by_neigh(nb);
+}
+
+/*
+ * Add a new device.
+ */
+void x25_link_device_up(struct net_device *dev)
+{
+ struct x25_neigh *nb = kmalloc(sizeof(*nb), GFP_ATOMIC);
+
+ if (!nb)
+ return;
+
+ skb_queue_head_init(&nb->queue);
+ setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb);
+
+ dev_hold(dev);
+ nb->dev = dev;
+ nb->state = X25_LINK_STATE_0;
+ nb->extended = 0;
+ /*
+ * Enables negotiation
+ */
+ nb->global_facil_mask = X25_MASK_REVERSE |
+ X25_MASK_THROUGHPUT |
+ X25_MASK_PACKET_SIZE |
+ X25_MASK_WINDOW_SIZE;
+ nb->t20 = sysctl_x25_restart_request_timeout;
+ atomic_set(&nb->refcnt, 1);
+
+ write_lock_bh(&x25_neigh_list_lock);
+ list_add(&nb->node, &x25_neigh_list);
+ write_unlock_bh(&x25_neigh_list_lock);
+}
+
+/**
+ * __x25_remove_neigh - remove neighbour from x25_neigh_list
+ * @nb - neigh to remove
+ *
+ * Remove neighbour from x25_neigh_list. If it was there.
+ * Caller must hold x25_neigh_list_lock.
+ */
+static void __x25_remove_neigh(struct x25_neigh *nb)
+{
+ skb_queue_purge(&nb->queue);
+ x25_stop_t20timer(nb);
+
+ if (nb->node.next) {
+ list_del(&nb->node);
+ x25_neigh_put(nb);
+ }
+}
+
+/*
+ * A device has been removed, remove its links.
+ */
+void x25_link_device_down(struct net_device *dev)
+{
+ struct x25_neigh *nb;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_neigh_list_lock);
+
+ list_for_each_safe(entry, tmp, &x25_neigh_list) {
+ nb = list_entry(entry, struct x25_neigh, node);
+
+ if (nb->dev == dev) {
+ __x25_remove_neigh(nb);
+ dev_put(dev);
+ }
+ }
+
+ write_unlock_bh(&x25_neigh_list_lock);
+}
+
+/*
+ * Given a device, return the neighbour address.
+ */
+struct x25_neigh *x25_get_neigh(struct net_device *dev)
+{
+ struct x25_neigh *nb, *use = NULL;
+ struct list_head *entry;
+
+ read_lock_bh(&x25_neigh_list_lock);
+ list_for_each(entry, &x25_neigh_list) {
+ nb = list_entry(entry, struct x25_neigh, node);
+
+ if (nb->dev == dev) {
+ use = nb;
+ break;
+ }
+ }
+
+ if (use)
+ x25_neigh_hold(use);
+ read_unlock_bh(&x25_neigh_list_lock);
+ return use;
+}
+
+/*
+ * Handle the ioctls that control the subscription functions.
+ */
+int x25_subscr_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct x25_subscrip_struct x25_subscr;
+ struct x25_neigh *nb;
+ struct net_device *dev;
+ int rc = -EINVAL;
+
+ if (cmd != SIOCX25GSUBSCRIP && cmd != SIOCX25SSUBSCRIP)
+ goto out;
+
+ rc = -EFAULT;
+ if (copy_from_user(&x25_subscr, arg, sizeof(x25_subscr)))
+ goto out;
+
+ rc = -EINVAL;
+ if ((dev = x25_dev_get(x25_subscr.device)) == NULL)
+ goto out;
+
+ if ((nb = x25_get_neigh(dev)) == NULL)
+ goto out_dev_put;
+
+ dev_put(dev);
+
+ if (cmd == SIOCX25GSUBSCRIP) {
+ read_lock_bh(&x25_neigh_list_lock);
+ x25_subscr.extended = nb->extended;
+ x25_subscr.global_facil_mask = nb->global_facil_mask;
+ read_unlock_bh(&x25_neigh_list_lock);
+ rc = copy_to_user(arg, &x25_subscr,
+ sizeof(x25_subscr)) ? -EFAULT : 0;
+ } else {
+ rc = -EINVAL;
+ if (!(x25_subscr.extended && x25_subscr.extended != 1)) {
+ rc = 0;
+ write_lock_bh(&x25_neigh_list_lock);
+ nb->extended = x25_subscr.extended;
+ nb->global_facil_mask = x25_subscr.global_facil_mask;
+ write_unlock_bh(&x25_neigh_list_lock);
+ }
+ }
+ x25_neigh_put(nb);
+out:
+ return rc;
+out_dev_put:
+ dev_put(dev);
+ goto out;
+}
+
+
+/*
+ * Release all memory associated with X.25 neighbour structures.
+ */
+void __exit x25_link_free(void)
+{
+ struct x25_neigh *nb;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_neigh_list_lock);
+
+ list_for_each_safe(entry, tmp, &x25_neigh_list) {
+ struct net_device *dev;
+
+ nb = list_entry(entry, struct x25_neigh, node);
+ dev = nb->dev;
+ __x25_remove_neigh(nb);
+ dev_put(dev);
+ }
+ write_unlock_bh(&x25_neigh_list_lock);
+}
diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c
new file mode 100644
index 000000000..0144271d2
--- /dev/null
+++ b/net/x25/x25_out.c
@@ -0,0 +1,231 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor New timer architecture.
+ * 2000-09-04 Henner Eisen Prevented x25_output() skb leakage.
+ * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation.
+ * 2000-11-10 Henner Eisen x25_send_iframe(): re-queued frames
+ * needed cleaned seq-number fields.
+ */
+
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+static int x25_pacsize_to_bytes(unsigned int pacsize)
+{
+ int bytes = 1;
+
+ if (!pacsize)
+ return 128;
+
+ while (pacsize-- > 0)
+ bytes *= 2;
+
+ return bytes;
+}
+
+/*
+ * This is where all X.25 information frames pass.
+ *
+ * Returns the amount of user data bytes sent on success
+ * or a negative error code on failure.
+ */
+int x25_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *skbn;
+ unsigned char header[X25_EXT_MIN_LEN];
+ int err, frontlen, len;
+ int sent=0, noblock = X25_SKB_CB(skb)->flags & MSG_DONTWAIT;
+ struct x25_sock *x25 = x25_sk(sk);
+ int header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN :
+ X25_STD_MIN_LEN;
+ int max_len = x25_pacsize_to_bytes(x25->facilities.pacsize_out);
+
+ if (skb->len - header_len > max_len) {
+ /* Save a copy of the Header */
+ skb_copy_from_linear_data(skb, header, header_len);
+ skb_pull(skb, header_len);
+
+ frontlen = skb_headroom(skb);
+
+ while (skb->len > 0) {
+ release_sock(sk);
+ skbn = sock_alloc_send_skb(sk, frontlen + max_len,
+ noblock, &err);
+ lock_sock(sk);
+ if (!skbn) {
+ if (err == -EWOULDBLOCK && noblock){
+ kfree_skb(skb);
+ return sent;
+ }
+ SOCK_DEBUG(sk, "x25_output: fragment alloc"
+ " failed, err=%d, %d bytes "
+ "sent\n", err, sent);
+ return err;
+ }
+
+ skb_reserve(skbn, frontlen);
+
+ len = max_len > skb->len ? skb->len : max_len;
+
+ /* Copy the user data */
+ skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+ skb_pull(skb, len);
+
+ /* Duplicate the Header */
+ skb_push(skbn, header_len);
+ skb_copy_to_linear_data(skbn, header, header_len);
+
+ if (skb->len > 0) {
+ if (x25->neighbour->extended)
+ skbn->data[3] |= X25_EXT_M_BIT;
+ else
+ skbn->data[2] |= X25_STD_M_BIT;
+ }
+
+ skb_queue_tail(&sk->sk_write_queue, skbn);
+ sent += len;
+ }
+
+ kfree_skb(skb);
+ } else {
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ sent = skb->len - header_len;
+ }
+ return sent;
+}
+
+/*
+ * This procedure is passed a buffer descriptor for an iframe. It builds
+ * the rest of the control part of the frame and then writes it out.
+ */
+static void x25_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (!skb)
+ return;
+
+ if (x25->neighbour->extended) {
+ skb->data[2] = (x25->vs << 1) & 0xFE;
+ skb->data[3] &= X25_EXT_M_BIT;
+ skb->data[3] |= (x25->vr << 1) & 0xFE;
+ } else {
+ skb->data[2] &= X25_STD_M_BIT;
+ skb->data[2] |= (x25->vs << 1) & 0x0E;
+ skb->data[2] |= (x25->vr << 5) & 0xE0;
+ }
+
+ x25_transmit_link(skb, x25->neighbour);
+}
+
+void x25_kick(struct sock *sk)
+{
+ struct sk_buff *skb, *skbn;
+ unsigned short start, end;
+ int modulus;
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (x25->state != X25_STATE_3)
+ return;
+
+ /*
+ * Transmit interrupt data.
+ */
+ if (skb_peek(&x25->interrupt_out_queue) != NULL &&
+ !test_and_set_bit(X25_INTERRUPT_FLAG, &x25->flags)) {
+
+ skb = skb_dequeue(&x25->interrupt_out_queue);
+ x25_transmit_link(skb, x25->neighbour);
+ }
+
+ if (x25->condition & X25_COND_PEER_RX_BUSY)
+ return;
+
+ if (!skb_peek(&sk->sk_write_queue))
+ return;
+
+ modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+
+ start = skb_peek(&x25->ack_queue) ? x25->vs : x25->va;
+ end = (x25->va + x25->facilities.winsize_out) % modulus;
+
+ if (start == end)
+ return;
+
+ x25->vs = start;
+
+ /*
+ * Transmit data until either we're out of data to send or
+ * the window is full.
+ */
+
+ skb = skb_dequeue(&sk->sk_write_queue);
+
+ do {
+ if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skb_queue_head(&sk->sk_write_queue, skb);
+ break;
+ }
+
+ skb_set_owner_w(skbn, sk);
+
+ /*
+ * Transmit the frame copy.
+ */
+ x25_send_iframe(sk, skbn);
+
+ x25->vs = (x25->vs + 1) % modulus;
+
+ /*
+ * Requeue the original data frame.
+ */
+ skb_queue_tail(&x25->ack_queue, skb);
+
+ } while (x25->vs != end &&
+ (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+
+ x25->vl = x25->vr;
+ x25->condition &= ~X25_COND_ACK_PENDING;
+
+ x25_stop_timer(sk);
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void x25_enquiry_response(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (x25->condition & X25_COND_OWN_RX_BUSY)
+ x25_write_internal(sk, X25_RNR);
+ else
+ x25_write_internal(sk, X25_RR);
+
+ x25->vl = x25->vr;
+ x25->condition &= ~X25_COND_ACK_PENDING;
+
+ x25_stop_timer(sk);
+}
diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c
new file mode 100644
index 000000000..0917f047f
--- /dev/null
+++ b/net/x25/x25_proc.c
@@ -0,0 +1,248 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.4 with seq_file support
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * 2002/10/06 Arnaldo Carvalho de Melo seq_file support
+ */
+
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+#ifdef CONFIG_PROC_FS
+
+static void *x25_seq_route_start(struct seq_file *seq, loff_t *pos)
+ __acquires(x25_route_list_lock)
+{
+ read_lock_bh(&x25_route_list_lock);
+ return seq_list_start_head(&x25_route_list, *pos);
+}
+
+static void *x25_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &x25_route_list, pos);
+}
+
+static void x25_seq_route_stop(struct seq_file *seq, void *v)
+ __releases(x25_route_list_lock)
+{
+ read_unlock_bh(&x25_route_list_lock);
+}
+
+static int x25_seq_route_show(struct seq_file *seq, void *v)
+{
+ struct x25_route *rt = list_entry(v, struct x25_route, node);
+
+ if (v == &x25_route_list) {
+ seq_puts(seq, "Address Digits Device\n");
+ goto out;
+ }
+
+ rt = v;
+ seq_printf(seq, "%-15s %-6d %-5s\n",
+ rt->address.x25_addr, rt->sigdigits,
+ rt->dev ? rt->dev->name : "???");
+out:
+ return 0;
+}
+
+static void *x25_seq_socket_start(struct seq_file *seq, loff_t *pos)
+ __acquires(x25_list_lock)
+{
+ read_lock_bh(&x25_list_lock);
+ return seq_hlist_start_head(&x25_list, *pos);
+}
+
+static void *x25_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_hlist_next(v, &x25_list, pos);
+}
+
+static void x25_seq_socket_stop(struct seq_file *seq, void *v)
+ __releases(x25_list_lock)
+{
+ read_unlock_bh(&x25_list_lock);
+}
+
+static int x25_seq_socket_show(struct seq_file *seq, void *v)
+{
+ struct sock *s;
+ struct x25_sock *x25;
+ struct net_device *dev;
+ const char *devname;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "dest_addr src_addr dev lci st vs vr "
+ "va t t2 t21 t22 t23 Snd-Q Rcv-Q inode\n");
+ goto out;
+ }
+
+ s = sk_entry(v);
+ x25 = x25_sk(s);
+
+ if (!x25->neighbour || (dev = x25->neighbour->dev) == NULL)
+ devname = "???";
+ else
+ devname = x25->neighbour->dev->name;
+
+ seq_printf(seq, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu "
+ "%3lu %3lu %3lu %5d %5d %ld\n",
+ !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr,
+ !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr,
+ devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr,
+ x25->va, x25_display_timer(s) / HZ, x25->t2 / HZ,
+ x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ,
+ sk_wmem_alloc_get(s),
+ sk_rmem_alloc_get(s),
+ s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);
+out:
+ return 0;
+}
+
+static void *x25_seq_forward_start(struct seq_file *seq, loff_t *pos)
+ __acquires(x25_forward_list_lock)
+{
+ read_lock_bh(&x25_forward_list_lock);
+ return seq_list_start_head(&x25_forward_list, *pos);
+}
+
+static void *x25_seq_forward_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &x25_forward_list, pos);
+}
+
+static void x25_seq_forward_stop(struct seq_file *seq, void *v)
+ __releases(x25_forward_list_lock)
+{
+ read_unlock_bh(&x25_forward_list_lock);
+}
+
+static int x25_seq_forward_show(struct seq_file *seq, void *v)
+{
+ struct x25_forward *f = list_entry(v, struct x25_forward, node);
+
+ if (v == &x25_forward_list) {
+ seq_printf(seq, "lci dev1 dev2\n");
+ goto out;
+ }
+
+ f = v;
+
+ seq_printf(seq, "%d %-10s %-10s\n",
+ f->lci, f->dev1->name, f->dev2->name);
+out:
+ return 0;
+}
+
+static const struct seq_operations x25_seq_route_ops = {
+ .start = x25_seq_route_start,
+ .next = x25_seq_route_next,
+ .stop = x25_seq_route_stop,
+ .show = x25_seq_route_show,
+};
+
+static const struct seq_operations x25_seq_socket_ops = {
+ .start = x25_seq_socket_start,
+ .next = x25_seq_socket_next,
+ .stop = x25_seq_socket_stop,
+ .show = x25_seq_socket_show,
+};
+
+static const struct seq_operations x25_seq_forward_ops = {
+ .start = x25_seq_forward_start,
+ .next = x25_seq_forward_next,
+ .stop = x25_seq_forward_stop,
+ .show = x25_seq_forward_show,
+};
+
+static int x25_seq_socket_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &x25_seq_socket_ops);
+}
+
+static int x25_seq_route_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &x25_seq_route_ops);
+}
+
+static int x25_seq_forward_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &x25_seq_forward_ops);
+}
+
+static const struct file_operations x25_seq_socket_fops = {
+ .open = x25_seq_socket_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations x25_seq_route_fops = {
+ .open = x25_seq_route_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations x25_seq_forward_fops = {
+ .open = x25_seq_forward_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int __init x25_proc_init(void)
+{
+ if (!proc_mkdir("x25", init_net.proc_net))
+ return -ENOMEM;
+
+ if (!proc_create("x25/route", S_IRUGO, init_net.proc_net,
+ &x25_seq_route_fops))
+ goto out;
+
+ if (!proc_create("x25/socket", S_IRUGO, init_net.proc_net,
+ &x25_seq_socket_fops))
+ goto out;
+
+ if (!proc_create("x25/forward", S_IRUGO, init_net.proc_net,
+ &x25_seq_forward_fops))
+ goto out;
+ return 0;
+
+out:
+ remove_proc_subtree("x25", init_net.proc_net);
+ return -ENOMEM;
+}
+
+void __exit x25_proc_exit(void)
+{
+ remove_proc_subtree("x25", init_net.proc_net);
+}
+
+#else /* CONFIG_PROC_FS */
+
+int __init x25_proc_init(void)
+{
+ return 0;
+}
+
+void __exit x25_proc_exit(void)
+{
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c
new file mode 100644
index 000000000..277c8d244
--- /dev/null
+++ b/net/x25/x25_route.c
@@ -0,0 +1,226 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_route_list);
+DEFINE_RWLOCK(x25_route_list_lock);
+
+/*
+ * Add a new route.
+ */
+static int x25_add_route(struct x25_address *address, unsigned int sigdigits,
+ struct net_device *dev)
+{
+ struct x25_route *rt;
+ struct list_head *entry;
+ int rc = -EINVAL;
+
+ write_lock_bh(&x25_route_list_lock);
+
+ list_for_each(entry, &x25_route_list) {
+ rt = list_entry(entry, struct x25_route, node);
+
+ if (!memcmp(&rt->address, address, sigdigits) &&
+ rt->sigdigits == sigdigits)
+ goto out;
+ }
+
+ rt = kmalloc(sizeof(*rt), GFP_ATOMIC);
+ rc = -ENOMEM;
+ if (!rt)
+ goto out;
+
+ strcpy(rt->address.x25_addr, "000000000000000");
+ memcpy(rt->address.x25_addr, address->x25_addr, sigdigits);
+
+ rt->sigdigits = sigdigits;
+ rt->dev = dev;
+ atomic_set(&rt->refcnt, 1);
+
+ list_add(&rt->node, &x25_route_list);
+ rc = 0;
+out:
+ write_unlock_bh(&x25_route_list_lock);
+ return rc;
+}
+
+/**
+ * __x25_remove_route - remove route from x25_route_list
+ * @rt: route to remove
+ *
+ * Remove route from x25_route_list. If it was there.
+ * Caller must hold x25_route_list_lock.
+ */
+static void __x25_remove_route(struct x25_route *rt)
+{
+ if (rt->node.next) {
+ list_del(&rt->node);
+ x25_route_put(rt);
+ }
+}
+
+static int x25_del_route(struct x25_address *address, unsigned int sigdigits,
+ struct net_device *dev)
+{
+ struct x25_route *rt;
+ struct list_head *entry;
+ int rc = -EINVAL;
+
+ write_lock_bh(&x25_route_list_lock);
+
+ list_for_each(entry, &x25_route_list) {
+ rt = list_entry(entry, struct x25_route, node);
+
+ if (!memcmp(&rt->address, address, sigdigits) &&
+ rt->sigdigits == sigdigits && rt->dev == dev) {
+ __x25_remove_route(rt);
+ rc = 0;
+ break;
+ }
+ }
+
+ write_unlock_bh(&x25_route_list_lock);
+ return rc;
+}
+
+/*
+ * A device has been removed, remove its routes.
+ */
+void x25_route_device_down(struct net_device *dev)
+{
+ struct x25_route *rt;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_route_list_lock);
+
+ list_for_each_safe(entry, tmp, &x25_route_list) {
+ rt = list_entry(entry, struct x25_route, node);
+
+ if (rt->dev == dev)
+ __x25_remove_route(rt);
+ }
+ write_unlock_bh(&x25_route_list_lock);
+
+ /* Remove any related forwarding */
+ x25_clear_forward_by_dev(dev);
+}
+
+/*
+ * Check that the device given is a valid X.25 interface that is "up".
+ */
+struct net_device *x25_dev_get(char *devname)
+{
+ struct net_device *dev = dev_get_by_name(&init_net, devname);
+
+ if (dev &&
+ (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25
+#if IS_ENABLED(CONFIG_LLC)
+ && dev->type != ARPHRD_ETHER
+#endif
+ ))){
+ dev_put(dev);
+ dev = NULL;
+ }
+
+ return dev;
+}
+
+/**
+ * x25_get_route - Find a route given an X.25 address.
+ * @addr - address to find a route for
+ *
+ * Find a route given an X.25 address.
+ */
+struct x25_route *x25_get_route(struct x25_address *addr)
+{
+ struct x25_route *rt, *use = NULL;
+ struct list_head *entry;
+
+ read_lock_bh(&x25_route_list_lock);
+
+ list_for_each(entry, &x25_route_list) {
+ rt = list_entry(entry, struct x25_route, node);
+
+ if (!memcmp(&rt->address, addr, rt->sigdigits)) {
+ if (!use)
+ use = rt;
+ else if (rt->sigdigits > use->sigdigits)
+ use = rt;
+ }
+ }
+
+ if (use)
+ x25_route_hold(use);
+
+ read_unlock_bh(&x25_route_list_lock);
+ return use;
+}
+
+/*
+ * Handle the ioctls that control the routing functions.
+ */
+int x25_route_ioctl(unsigned int cmd, void __user *arg)
+{
+ struct x25_route_struct rt;
+ struct net_device *dev;
+ int rc = -EINVAL;
+
+ if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+ goto out;
+
+ rc = -EFAULT;
+ if (copy_from_user(&rt, arg, sizeof(rt)))
+ goto out;
+
+ rc = -EINVAL;
+ if (rt.sigdigits > 15)
+ goto out;
+
+ dev = x25_dev_get(rt.device);
+ if (!dev)
+ goto out;
+
+ if (cmd == SIOCADDRT)
+ rc = x25_add_route(&rt.address, rt.sigdigits, dev);
+ else
+ rc = x25_del_route(&rt.address, rt.sigdigits, dev);
+ dev_put(dev);
+out:
+ return rc;
+}
+
+/*
+ * Release all memory associated with X.25 routing structures.
+ */
+void __exit x25_route_free(void)
+{
+ struct x25_route *rt;
+ struct list_head *entry, *tmp;
+
+ write_lock_bh(&x25_route_list_lock);
+ list_for_each_safe(entry, tmp, &x25_route_list) {
+ rt = list_entry(entry, struct x25_route, node);
+ __x25_remove_route(rt);
+ }
+ write_unlock_bh(&x25_route_list_lock);
+}
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
new file mode 100644
index 000000000..6b5af65f4
--- /dev/null
+++ b/net/x25/x25_subr.c
@@ -0,0 +1,390 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor Centralised disconnection processing.
+ * mar/20/00 Daniela Squassoni Disabling/enabling of facilities
+ * negotiation.
+ * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups
+ * apr/04/15 Shaun Pereira Fast select with no
+ * restriction on response.
+ */
+
+#define pr_fmt(fmt) "X25: " fmt
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+/*
+ * This routine purges all of the queues of frames.
+ */
+void x25_clear_queues(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ skb_queue_purge(&sk->sk_write_queue);
+ skb_queue_purge(&x25->ack_queue);
+ skb_queue_purge(&x25->interrupt_in_queue);
+ skb_queue_purge(&x25->interrupt_out_queue);
+ skb_queue_purge(&x25->fragment_queue);
+}
+
+
+/*
+ * This routine purges the input queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+*/
+void x25_frames_acked(struct sock *sk, unsigned short nr)
+{
+ struct sk_buff *skb;
+ struct x25_sock *x25 = x25_sk(sk);
+ int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+
+ /*
+ * Remove all the ack-ed frames from the ack queue.
+ */
+ if (x25->va != nr)
+ while (skb_peek(&x25->ack_queue) && x25->va != nr) {
+ skb = skb_dequeue(&x25->ack_queue);
+ kfree_skb(skb);
+ x25->va = (x25->va + 1) % modulus;
+ }
+}
+
+void x25_requeue_frames(struct sock *sk)
+{
+ struct sk_buff *skb, *skb_prev = NULL;
+
+ /*
+ * Requeue all the un-ack-ed frames on the output queue to be picked
+ * up by x25_kick. This arrangement handles the possibility of an empty
+ * output queue.
+ */
+ while ((skb = skb_dequeue(&x25_sk(sk)->ack_queue)) != NULL) {
+ if (!skb_prev)
+ skb_queue_head(&sk->sk_write_queue, skb);
+ else
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
+ skb_prev = skb;
+ }
+}
+
+/*
+ * Validate that the value of nr is between va and vs. Return true or
+ * false for testing.
+ */
+int x25_validate_nr(struct sock *sk, unsigned short nr)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+ unsigned short vc = x25->va;
+ int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+
+ while (vc != x25->vs) {
+ if (nr == vc)
+ return 1;
+ vc = (vc + 1) % modulus;
+ }
+
+ return nr == x25->vs ? 1 : 0;
+}
+
+/*
+ * This routine is called when the packet layer internally generates a
+ * control frame.
+ */
+void x25_write_internal(struct sock *sk, int frametype)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+ struct sk_buff *skb;
+ unsigned char *dptr;
+ unsigned char facilities[X25_MAX_FAC_LEN];
+ unsigned char addresses[1 + X25_ADDR_LEN];
+ unsigned char lci1, lci2;
+ /*
+ * Default safe frame size.
+ */
+ int len = X25_MAX_L2_LEN + X25_EXT_MIN_LEN;
+
+ /*
+ * Adjust frame size.
+ */
+ switch (frametype) {
+ case X25_CALL_REQUEST:
+ len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+ break;
+ case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+ if (x25->facilities.reverse & 0x80) {
+ len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+ } else {
+ len += 1 + X25_MAX_FAC_LEN;
+ }
+ break;
+ case X25_CLEAR_REQUEST:
+ case X25_RESET_REQUEST:
+ len += 2;
+ break;
+ case X25_RR:
+ case X25_RNR:
+ case X25_REJ:
+ case X25_CLEAR_CONFIRMATION:
+ case X25_INTERRUPT_CONFIRMATION:
+ case X25_RESET_CONFIRMATION:
+ break;
+ default:
+ pr_err("invalid frame type %02X\n", frametype);
+ return;
+ }
+
+ if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+ return;
+
+ /*
+ * Space for Ethernet and 802.2 LLC headers.
+ */
+ skb_reserve(skb, X25_MAX_L2_LEN);
+
+ /*
+ * Make space for the GFI and LCI, and fill them in.
+ */
+ dptr = skb_put(skb, 2);
+
+ lci1 = (x25->lci >> 8) & 0x0F;
+ lci2 = (x25->lci >> 0) & 0xFF;
+
+ if (x25->neighbour->extended) {
+ *dptr++ = lci1 | X25_GFI_EXTSEQ;
+ *dptr++ = lci2;
+ } else {
+ *dptr++ = lci1 | X25_GFI_STDSEQ;
+ *dptr++ = lci2;
+ }
+
+ /*
+ * Now fill in the frame type specific information.
+ */
+ switch (frametype) {
+
+ case X25_CALL_REQUEST:
+ dptr = skb_put(skb, 1);
+ *dptr++ = X25_CALL_REQUEST;
+ len = x25_addr_aton(addresses, &x25->dest_addr,
+ &x25->source_addr);
+ dptr = skb_put(skb, len);
+ memcpy(dptr, addresses, len);
+ len = x25_create_facilities(facilities,
+ &x25->facilities,
+ &x25->dte_facilities,
+ x25->neighbour->global_facil_mask);
+ dptr = skb_put(skb, len);
+ memcpy(dptr, facilities, len);
+ dptr = skb_put(skb, x25->calluserdata.cudlength);
+ memcpy(dptr, x25->calluserdata.cuddata,
+ x25->calluserdata.cudlength);
+ x25->calluserdata.cudlength = 0;
+ break;
+
+ case X25_CALL_ACCEPTED:
+ dptr = skb_put(skb, 2);
+ *dptr++ = X25_CALL_ACCEPTED;
+ *dptr++ = 0x00; /* Address lengths */
+ len = x25_create_facilities(facilities,
+ &x25->facilities,
+ &x25->dte_facilities,
+ x25->vc_facil_mask);
+ dptr = skb_put(skb, len);
+ memcpy(dptr, facilities, len);
+
+ /* fast select with no restriction on response
+ allows call user data. Userland must
+ ensure it is ours and not theirs */
+ if(x25->facilities.reverse & 0x80) {
+ dptr = skb_put(skb,
+ x25->calluserdata.cudlength);
+ memcpy(dptr, x25->calluserdata.cuddata,
+ x25->calluserdata.cudlength);
+ }
+ x25->calluserdata.cudlength = 0;
+ break;
+
+ case X25_CLEAR_REQUEST:
+ dptr = skb_put(skb, 3);
+ *dptr++ = frametype;
+ *dptr++ = x25->causediag.cause;
+ *dptr++ = x25->causediag.diagnostic;
+ break;
+
+ case X25_RESET_REQUEST:
+ dptr = skb_put(skb, 3);
+ *dptr++ = frametype;
+ *dptr++ = 0x00; /* XXX */
+ *dptr++ = 0x00; /* XXX */
+ break;
+
+ case X25_RR:
+ case X25_RNR:
+ case X25_REJ:
+ if (x25->neighbour->extended) {
+ dptr = skb_put(skb, 2);
+ *dptr++ = frametype;
+ *dptr++ = (x25->vr << 1) & 0xFE;
+ } else {
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ *dptr++ |= (x25->vr << 5) & 0xE0;
+ }
+ break;
+
+ case X25_CLEAR_CONFIRMATION:
+ case X25_INTERRUPT_CONFIRMATION:
+ case X25_RESET_CONFIRMATION:
+ dptr = skb_put(skb, 1);
+ *dptr = frametype;
+ break;
+ }
+
+ x25_transmit_link(skb, x25->neighbour);
+}
+
+/*
+ * Unpick the contents of the passed X.25 Packet Layer frame.
+ */
+int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
+ int *d, int *m)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+ unsigned char *frame;
+
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+ return X25_ILLEGAL;
+ frame = skb->data;
+
+ *ns = *nr = *q = *d = *m = 0;
+
+ switch (frame[2]) {
+ case X25_CALL_REQUEST:
+ case X25_CALL_ACCEPTED:
+ case X25_CLEAR_REQUEST:
+ case X25_CLEAR_CONFIRMATION:
+ case X25_INTERRUPT:
+ case X25_INTERRUPT_CONFIRMATION:
+ case X25_RESET_REQUEST:
+ case X25_RESET_CONFIRMATION:
+ case X25_RESTART_REQUEST:
+ case X25_RESTART_CONFIRMATION:
+ case X25_REGISTRATION_REQUEST:
+ case X25_REGISTRATION_CONFIRMATION:
+ case X25_DIAGNOSTIC:
+ return frame[2];
+ }
+
+ if (x25->neighbour->extended) {
+ if (frame[2] == X25_RR ||
+ frame[2] == X25_RNR ||
+ frame[2] == X25_REJ) {
+ if (!pskb_may_pull(skb, X25_EXT_MIN_LEN))
+ return X25_ILLEGAL;
+ frame = skb->data;
+
+ *nr = (frame[3] >> 1) & 0x7F;
+ return frame[2];
+ }
+ } else {
+ if ((frame[2] & 0x1F) == X25_RR ||
+ (frame[2] & 0x1F) == X25_RNR ||
+ (frame[2] & 0x1F) == X25_REJ) {
+ *nr = (frame[2] >> 5) & 0x07;
+ return frame[2] & 0x1F;
+ }
+ }
+
+ if (x25->neighbour->extended) {
+ if ((frame[2] & 0x01) == X25_DATA) {
+ if (!pskb_may_pull(skb, X25_EXT_MIN_LEN))
+ return X25_ILLEGAL;
+ frame = skb->data;
+
+ *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT;
+ *d = (frame[0] & X25_D_BIT) == X25_D_BIT;
+ *m = (frame[3] & X25_EXT_M_BIT) == X25_EXT_M_BIT;
+ *nr = (frame[3] >> 1) & 0x7F;
+ *ns = (frame[2] >> 1) & 0x7F;
+ return X25_DATA;
+ }
+ } else {
+ if ((frame[2] & 0x01) == X25_DATA) {
+ *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT;
+ *d = (frame[0] & X25_D_BIT) == X25_D_BIT;
+ *m = (frame[2] & X25_STD_M_BIT) == X25_STD_M_BIT;
+ *nr = (frame[2] >> 5) & 0x07;
+ *ns = (frame[2] >> 1) & 0x07;
+ return X25_DATA;
+ }
+ }
+
+ pr_debug("invalid PLP frame %02X %02X %02X\n",
+ frame[0], frame[1], frame[2]);
+
+ return X25_ILLEGAL;
+}
+
+void x25_disconnect(struct sock *sk, int reason, unsigned char cause,
+ unsigned char diagnostic)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ x25_clear_queues(sk);
+ x25_stop_timer(sk);
+
+ x25->lci = 0;
+ x25->state = X25_STATE_0;
+
+ x25->causediag.cause = cause;
+ x25->causediag.diagnostic = diagnostic;
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = reason;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+}
+
+/*
+ * Clear an own-rx-busy condition and tell the peer about this, provided
+ * that there is a significant amount of free receive buffer space available.
+ */
+void x25_check_rbuf(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) &&
+ (x25->condition & X25_COND_OWN_RX_BUSY)) {
+ x25->condition &= ~X25_COND_OWN_RX_BUSY;
+ x25->condition &= ~X25_COND_ACK_PENDING;
+ x25->vl = x25->vr;
+ x25_write_internal(sk, X25_RR);
+ x25_stop_timer(sk);
+ }
+}
+
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
new file mode 100644
index 000000000..5c5db1a36
--- /dev/null
+++ b/net/x25/x25_timer.c
@@ -0,0 +1,174 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor New timer architecture.
+ * Centralised disconnection processing.
+ */
+
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+static void x25_heartbeat_expiry(unsigned long);
+static void x25_timer_expiry(unsigned long);
+
+void x25_init_timers(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk);
+
+ /* initialized by sock_init_data */
+ sk->sk_timer.data = (unsigned long)sk;
+ sk->sk_timer.function = &x25_heartbeat_expiry;
+}
+
+void x25_start_heartbeat(struct sock *sk)
+{
+ mod_timer(&sk->sk_timer, jiffies + 5 * HZ);
+}
+
+void x25_stop_heartbeat(struct sock *sk)
+{
+ del_timer(&sk->sk_timer);
+}
+
+void x25_start_t2timer(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ mod_timer(&x25->timer, jiffies + x25->t2);
+}
+
+void x25_start_t21timer(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ mod_timer(&x25->timer, jiffies + x25->t21);
+}
+
+void x25_start_t22timer(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ mod_timer(&x25->timer, jiffies + x25->t22);
+}
+
+void x25_start_t23timer(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ mod_timer(&x25->timer, jiffies + x25->t23);
+}
+
+void x25_stop_timer(struct sock *sk)
+{
+ del_timer(&x25_sk(sk)->timer);
+}
+
+unsigned long x25_display_timer(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (!timer_pending(&x25->timer))
+ return 0;
+
+ return x25->timer.expires - jiffies;
+}
+
+static void x25_heartbeat_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */
+ goto restart_heartbeat;
+
+ switch (x25_sk(sk)->state) {
+
+ case X25_STATE_0:
+ /*
+ * Magic here: If we listen() and a new link dies
+ * before it is accepted() it isn't 'dead' so doesn't
+ * get removed.
+ */
+ if (sock_flag(sk, SOCK_DESTROY) ||
+ (sk->sk_state == TCP_LISTEN &&
+ sock_flag(sk, SOCK_DEAD))) {
+ bh_unlock_sock(sk);
+ x25_destroy_socket_from_timer(sk);
+ return;
+ }
+ break;
+
+ case X25_STATE_3:
+ /*
+ * Check for the state of the receive buffer.
+ */
+ x25_check_rbuf(sk);
+ break;
+ }
+restart_heartbeat:
+ x25_start_heartbeat(sk);
+ bh_unlock_sock(sk);
+}
+
+/*
+ * Timer has expired, it may have been T2, T21, T22, or T23. We can tell
+ * by the state machine state.
+ */
+static inline void x25_do_timer_expiry(struct sock * sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ switch (x25->state) {
+
+ case X25_STATE_3: /* T2 */
+ if (x25->condition & X25_COND_ACK_PENDING) {
+ x25->condition &= ~X25_COND_ACK_PENDING;
+ x25_enquiry_response(sk);
+ }
+ break;
+
+ case X25_STATE_1: /* T21 */
+ case X25_STATE_4: /* T22 */
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ break;
+
+ case X25_STATE_2: /* T23 */
+ x25_disconnect(sk, ETIMEDOUT, 0, 0);
+ break;
+ }
+}
+
+static void x25_timer_expiry(unsigned long param)
+{
+ struct sock *sk = (struct sock *)param;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */
+ if (x25_sk(sk)->state == X25_STATE_3)
+ x25_start_t2timer(sk);
+ } else
+ x25_do_timer_expiry(sk);
+ bh_unlock_sock(sk);
+}
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
new file mode 100644
index 000000000..bda1a1362
--- /dev/null
+++ b/net/xfrm/Kconfig
@@ -0,0 +1,85 @@
+#
+# XFRM configuration
+#
+config XFRM
+ bool
+ depends on NET
+
+config XFRM_ALGO
+ tristate
+ select XFRM
+ select CRYPTO
+
+config XFRM_USER
+ tristate "Transformation user configuration interface"
+ depends on INET
+ select XFRM_ALGO
+ ---help---
+ Support for Transformation(XFRM) user configuration interface
+ like IPsec used by native Linux tools.
+
+ If unsure, say Y.
+
+config XFRM_SUB_POLICY
+ bool "Transformation sub policy support"
+ depends on XFRM
+ ---help---
+ Support sub policy for developers. By using sub policy with main
+ one, two policies can be applied to the same packet at once.
+ Policy which lives shorter time in kernel should be a sub.
+
+ If unsure, say N.
+
+config XFRM_MIGRATE
+ bool "Transformation migrate database"
+ depends on XFRM
+ ---help---
+ A feature to update locator(s) of a given IPsec security
+ association dynamically. This feature is required, for
+ instance, in a Mobile IPv6 environment with IPsec configuration
+ where mobile nodes change their attachment point to the Internet.
+
+ If unsure, say N.
+
+config XFRM_STATISTICS
+ bool "Transformation statistics"
+ depends on INET && XFRM && PROC_FS
+ ---help---
+ This statistics is not a SNMP/MIB specification but shows
+ statistics about transformation error (or almost error) factor
+ at packet processing for developer.
+
+ If unsure, say N.
+
+config XFRM_IPCOMP
+ tristate
+ select XFRM_ALGO
+ select CRYPTO
+ select CRYPTO_DEFLATE
+
+config NET_KEY
+ tristate "PF_KEY sockets"
+ select XFRM_ALGO
+ ---help---
+ PF_KEYv2 socket family, compatible to KAME ones.
+ They are required if you are going to use IPsec tools ported
+ from KAME.
+
+ Say Y unless you know what you are doing.
+
+config NET_KEY_MIGRATE
+ bool "PF_KEY MIGRATE"
+ depends on NET_KEY
+ select XFRM_MIGRATE
+ ---help---
+ Add a PF_KEY MIGRATE message to PF_KEYv2 socket family.
+ The PF_KEY MIGRATE message is used to dynamically update
+ locator(s) of a given IPsec security association.
+ This feature is required, for instance, in a Mobile IPv6
+ environment with IPsec configuration where mobile nodes
+ change their attachment point to the Internet. Detail
+ information can be found in the internet-draft
+ <draft-sugimoto-mip6-pfkey-migrate>.
+
+ If unsure, say N.
+
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
new file mode 100644
index 000000000..c0e961983
--- /dev/null
+++ b/net/xfrm/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the XFRM subsystem.
+#
+
+obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
+ xfrm_input.o xfrm_output.o \
+ xfrm_sysctl.o xfrm_replay.o
+obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
+obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
+obj-$(CONFIG_XFRM_USER) += xfrm_user.o
+obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
new file mode 100644
index 000000000..12e82a5e4
--- /dev/null
+++ b/net/xfrm/xfrm_algo.c
@@ -0,0 +1,800 @@
+/*
+ * xfrm algorithm interface
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <net/xfrm.h>
+#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+#include <net/esp.h>
+#endif
+
+/*
+ * Algorithms supported by IPsec. These entries contain properties which
+ * are used in key negotiation and xfrm processing, and are used to verify
+ * that instantiated crypto transforms have correct parameters for IPsec
+ * purposes.
+ */
+static struct xfrm_algo_desc aead_list[] = {
+{
+ .name = "rfc4106(gcm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 64,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4106(gcm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 96,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4106(gcm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4309(ccm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 64,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4309(ccm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 96,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4309(ccm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc4543(gcm(aes))",
+
+ .uinfo = {
+ .aead = {
+ .icv_truncbits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+};
+
+static struct xfrm_algo_desc aalg_list[] = {
+{
+ .name = "digest_null",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 0,
+ .icv_fullbits = 0,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_NULL,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 0,
+ .sadb_alg_maxbits = 0
+ }
+},
+{
+ .name = "hmac(md5)",
+ .compat = "md5",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_AALG_MD5HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 128
+ }
+},
+{
+ .name = "hmac(sha1)",
+ .compat = "sha1",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 160,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_AALG_SHA1HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 160,
+ .sadb_alg_maxbits = 160
+ }
+},
+{
+ .name = "hmac(sha256)",
+ .compat = "sha256",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 256,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 256,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "hmac(sha384)",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 192,
+ .icv_fullbits = 384,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 384,
+ .sadb_alg_maxbits = 384
+ }
+},
+{
+ .name = "hmac(sha512)",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 256,
+ .icv_fullbits = 512,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 512,
+ .sadb_alg_maxbits = 512
+ }
+},
+{
+ .name = "hmac(rmd160)",
+ .compat = "rmd160",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 160,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 160,
+ .sadb_alg_maxbits = 160
+ }
+},
+{
+ .name = "xcbc(aes)",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 128
+ }
+},
+{
+ /* rfc4494 */
+ .name = "cmac(aes)",
+
+ .uinfo = {
+ .auth = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 128,
+ }
+ },
+
+ .pfkey_supported = 0,
+},
+};
+
+static struct xfrm_algo_desc ealg_list[] = {
+{
+ .name = "ecb(cipher_null)",
+ .compat = "cipher_null",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 8,
+ .defkeybits = 0,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_EALG_NULL,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 0,
+ .sadb_alg_maxbits = 0
+ }
+},
+{
+ .name = "cbc(des)",
+ .compat = "des",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 64,
+ .defkeybits = 64,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_EALG_DESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 64,
+ .sadb_alg_maxbits = 64
+ }
+},
+{
+ .name = "cbc(des3_ede)",
+ .compat = "des3_ede",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 64,
+ .defkeybits = 192,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_EALG_3DESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 192,
+ .sadb_alg_maxbits = 192
+ }
+},
+{
+ .name = "cbc(cast5)",
+ .compat = "cast5",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 64,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_CASTCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 40,
+ .sadb_alg_maxbits = 128
+ }
+},
+{
+ .name = "cbc(blowfish)",
+ .compat = "blowfish",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 64,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 40,
+ .sadb_alg_maxbits = 448
+ }
+},
+{
+ .name = "cbc(aes)",
+ .compat = "aes",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 128,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "cbc(serpent)",
+ .compat = "serpent",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 128,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_SERPENTCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256,
+ }
+},
+{
+ .name = "cbc(camellia)",
+ .compat = "camellia",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 128,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_CAMELLIACBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "cbc(twofish)",
+ .compat = "twofish",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 128,
+ .defkeybits = 128,
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_TWOFISHCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+},
+{
+ .name = "rfc3686(ctr(aes))",
+
+ .uinfo = {
+ .encr = {
+ .blockbits = 128,
+ .defkeybits = 160, /* 128-bit key + 32-bit nonce */
+ }
+ },
+
+ .pfkey_supported = 1,
+
+ .desc = {
+ .sadb_alg_id = SADB_X_EALG_AESCTR,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 160,
+ .sadb_alg_maxbits = 288
+ }
+},
+};
+
+static struct xfrm_algo_desc calg_list[] = {
+{
+ .name = "deflate",
+ .uinfo = {
+ .comp = {
+ .threshold = 90,
+ }
+ },
+ .pfkey_supported = 1,
+ .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE }
+},
+{
+ .name = "lzs",
+ .uinfo = {
+ .comp = {
+ .threshold = 90,
+ }
+ },
+ .pfkey_supported = 1,
+ .desc = { .sadb_alg_id = SADB_X_CALG_LZS }
+},
+{
+ .name = "lzjh",
+ .uinfo = {
+ .comp = {
+ .threshold = 50,
+ }
+ },
+ .pfkey_supported = 1,
+ .desc = { .sadb_alg_id = SADB_X_CALG_LZJH }
+},
+};
+
+static inline int aalg_entries(void)
+{
+ return ARRAY_SIZE(aalg_list);
+}
+
+static inline int ealg_entries(void)
+{
+ return ARRAY_SIZE(ealg_list);
+}
+
+static inline int calg_entries(void)
+{
+ return ARRAY_SIZE(calg_list);
+}
+
+struct xfrm_algo_list {
+ struct xfrm_algo_desc *algs;
+ int entries;
+ u32 type;
+ u32 mask;
+};
+
+static const struct xfrm_algo_list xfrm_aead_list = {
+ .algs = aead_list,
+ .entries = ARRAY_SIZE(aead_list),
+ .type = CRYPTO_ALG_TYPE_AEAD,
+ .mask = CRYPTO_ALG_TYPE_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_aalg_list = {
+ .algs = aalg_list,
+ .entries = ARRAY_SIZE(aalg_list),
+ .type = CRYPTO_ALG_TYPE_HASH,
+ .mask = CRYPTO_ALG_TYPE_HASH_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_ealg_list = {
+ .algs = ealg_list,
+ .entries = ARRAY_SIZE(ealg_list),
+ .type = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .mask = CRYPTO_ALG_TYPE_BLKCIPHER_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_calg_list = {
+ .algs = calg_list,
+ .entries = ARRAY_SIZE(calg_list),
+ .type = CRYPTO_ALG_TYPE_COMPRESS,
+ .mask = CRYPTO_ALG_TYPE_MASK,
+};
+
+static struct xfrm_algo_desc *xfrm_find_algo(
+ const struct xfrm_algo_list *algo_list,
+ int match(const struct xfrm_algo_desc *entry, const void *data),
+ const void *data, int probe)
+{
+ struct xfrm_algo_desc *list = algo_list->algs;
+ int i, status;
+
+ for (i = 0; i < algo_list->entries; i++) {
+ if (!match(list + i, data))
+ continue;
+
+ if (list[i].available)
+ return &list[i];
+
+ if (!probe)
+ break;
+
+ status = crypto_has_alg(list[i].name, algo_list->type,
+ algo_list->mask);
+ if (!status)
+ break;
+
+ list[i].available = status;
+ return &list[i];
+ }
+ return NULL;
+}
+
+static int xfrm_alg_id_match(const struct xfrm_algo_desc *entry,
+ const void *data)
+{
+ return entry->desc.sadb_alg_id == (unsigned long)data;
+}
+
+struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id)
+{
+ return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_id_match,
+ (void *)(unsigned long)alg_id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id)
+{
+ return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_id_match,
+ (void *)(unsigned long)alg_id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid);
+
+struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id)
+{
+ return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_id_match,
+ (void *)(unsigned long)alg_id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_calg_get_byid);
+
+static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry,
+ const void *data)
+{
+ const char *name = data;
+
+ return name && (!strcmp(name, entry->name) ||
+ (entry->compat && !strcmp(name, entry->compat)));
+}
+
+struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe)
+{
+ return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_name_match, name,
+ probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe)
+{
+ return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_name_match, name,
+ probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname);
+
+struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe)
+{
+ return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_name_match, name,
+ probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_calg_get_byname);
+
+struct xfrm_aead_name {
+ const char *name;
+ int icvbits;
+};
+
+static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry,
+ const void *data)
+{
+ const struct xfrm_aead_name *aead = data;
+ const char *name = aead->name;
+
+ return aead->icvbits == entry->uinfo.aead.icv_truncbits && name &&
+ !strcmp(name, entry->name);
+}
+
+struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe)
+{
+ struct xfrm_aead_name data = {
+ .name = name,
+ .icvbits = icv_len,
+ };
+
+ return xfrm_find_algo(&xfrm_aead_list, xfrm_aead_name_match, &data,
+ probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_aead_get_byname);
+
+struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx)
+{
+ if (idx >= aalg_entries())
+ return NULL;
+
+ return &aalg_list[idx];
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx)
+{
+ if (idx >= ealg_entries())
+ return NULL;
+
+ return &ealg_list[idx];
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx);
+
+/*
+ * Probe for the availability of crypto algorithms, and set the available
+ * flag for any algorithms found on the system. This is typically called by
+ * pfkey during userspace SA add, update or register.
+ */
+void xfrm_probe_algs(void)
+{
+ int i, status;
+
+ BUG_ON(in_softirq());
+
+ for (i = 0; i < aalg_entries(); i++) {
+ status = crypto_has_hash(aalg_list[i].name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (aalg_list[i].available != status)
+ aalg_list[i].available = status;
+ }
+
+ for (i = 0; i < ealg_entries(); i++) {
+ status = crypto_has_ablkcipher(ealg_list[i].name, 0, 0);
+ if (ealg_list[i].available != status)
+ ealg_list[i].available = status;
+ }
+
+ for (i = 0; i < calg_entries(); i++) {
+ status = crypto_has_comp(calg_list[i].name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (calg_list[i].available != status)
+ calg_list[i].available = status;
+ }
+}
+EXPORT_SYMBOL_GPL(xfrm_probe_algs);
+
+int xfrm_count_pfkey_auth_supported(void)
+{
+ int i, n;
+
+ for (i = 0, n = 0; i < aalg_entries(); i++)
+ if (aalg_list[i].available && aalg_list[i].pfkey_supported)
+ n++;
+ return n;
+}
+EXPORT_SYMBOL_GPL(xfrm_count_pfkey_auth_supported);
+
+int xfrm_count_pfkey_enc_supported(void)
+{
+ int i, n;
+
+ for (i = 0, n = 0; i < ealg_entries(); i++)
+ if (ealg_list[i].available && ealg_list[i].pfkey_supported)
+ n++;
+ return n;
+}
+EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported);
+
+MODULE_LICENSE("GPL");
diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c
new file mode 100644
index 000000000..1e98bc0fe
--- /dev/null
+++ b/net/xfrm/xfrm_hash.c
@@ -0,0 +1,39 @@
+/* xfrm_hash.c: Common hash table code.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/xfrm.h>
+
+#include "xfrm_hash.h"
+
+struct hlist_head *xfrm_hash_alloc(unsigned int sz)
+{
+ struct hlist_head *n;
+
+ if (sz <= PAGE_SIZE)
+ n = kzalloc(sz, GFP_KERNEL);
+ else if (hashdist)
+ n = vzalloc(sz);
+ else
+ n = (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+ get_order(sz));
+
+ return n;
+}
+
+void xfrm_hash_free(struct hlist_head *n, unsigned int sz)
+{
+ if (sz <= PAGE_SIZE)
+ kfree(n);
+ else if (hashdist)
+ vfree(n);
+ else
+ free_pages((unsigned long)n, get_order(sz));
+}
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
new file mode 100644
index 000000000..666c5ffe9
--- /dev/null
+++ b/net/xfrm/xfrm_hash.h
@@ -0,0 +1,192 @@
+#ifndef _XFRM_HASH_H
+#define _XFRM_HASH_H
+
+#include <linux/xfrm.h>
+#include <linux/socket.h>
+#include <linux/jhash.h>
+
+static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
+{
+ return ntohl(addr->a4);
+}
+
+static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
+{
+ return ntohl(addr->a6[2] ^ addr->a6[3]);
+}
+
+static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr)
+{
+ u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
+ return ntohl((__force __be32)sum);
+}
+
+static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr)
+{
+ return ntohl(daddr->a6[2] ^ daddr->a6[3] ^
+ saddr->a6[2] ^ saddr->a6[3]);
+}
+
+static inline u32 __bits2mask32(__u8 bits)
+{
+ u32 mask32 = 0xffffffff;
+
+ if (bits == 0)
+ mask32 = 0;
+ else if (bits < 32)
+ mask32 <<= (32 - bits);
+
+ return mask32;
+}
+
+static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ __u8 dbits,
+ __u8 sbits)
+{
+ return jhash_2words(ntohl(daddr->a4) & __bits2mask32(dbits),
+ ntohl(saddr->a4) & __bits2mask32(sbits),
+ 0);
+}
+
+static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr,
+ __u8 prefixlen)
+{
+ int pdw;
+ int pbi;
+ u32 initval = 0;
+
+ pdw = prefixlen >> 5; /* num of whole u32 in prefix */
+ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */
+
+ if (pbi) {
+ __be32 mask;
+
+ mask = htonl((0xffffffff) << (32 - pbi));
+
+ initval = (__force u32)(addr->a6[pdw] & mask);
+ }
+
+ return jhash2((__force u32 *)addr->a6, pdw, initval);
+}
+
+static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ __u8 dbits,
+ __u8 sbits)
+{
+ return __xfrm6_pref_hash(daddr, dbits) ^
+ __xfrm6_pref_hash(saddr, sbits);
+}
+
+static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ u32 reqid, unsigned short family,
+ unsigned int hmask)
+{
+ unsigned int h = family ^ reqid;
+ switch (family) {
+ case AF_INET:
+ h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
+ break;
+ case AF_INET6:
+ h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);
+ break;
+ }
+ return (h ^ (h >> 16)) & hmask;
+}
+
+static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family,
+ unsigned int hmask)
+{
+ unsigned int h = family;
+ switch (family) {
+ case AF_INET:
+ h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
+ break;
+ case AF_INET6:
+ h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);
+ break;
+ }
+ return (h ^ (h >> 16)) & hmask;
+}
+
+static inline unsigned int
+__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
+ unsigned short family, unsigned int hmask)
+{
+ unsigned int h = (__force u32)spi ^ proto;
+ switch (family) {
+ case AF_INET:
+ h ^= __xfrm4_addr_hash(daddr);
+ break;
+ case AF_INET6:
+ h ^= __xfrm6_addr_hash(daddr);
+ break;
+ }
+ return (h ^ (h >> 10) ^ (h >> 20)) & hmask;
+}
+
+static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
+{
+ return (index ^ (index >> 8)) & hmask;
+}
+
+static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
+ unsigned short family, unsigned int hmask,
+ u8 dbits, u8 sbits)
+{
+ const xfrm_address_t *daddr = &sel->daddr;
+ const xfrm_address_t *saddr = &sel->saddr;
+ unsigned int h = 0;
+
+ switch (family) {
+ case AF_INET:
+ if (sel->prefixlen_d < dbits ||
+ sel->prefixlen_s < sbits)
+ return hmask + 1;
+
+ h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
+ break;
+
+ case AF_INET6:
+ if (sel->prefixlen_d < dbits ||
+ sel->prefixlen_s < sbits)
+ return hmask + 1;
+
+ h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);
+ break;
+ }
+ h ^= (h >> 16);
+ return h & hmask;
+}
+
+static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family,
+ unsigned int hmask,
+ u8 dbits, u8 sbits)
+{
+ unsigned int h = 0;
+
+ switch (family) {
+ case AF_INET:
+ h = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
+ break;
+
+ case AF_INET6:
+ h = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);
+ break;
+ }
+ h ^= (h >> 16);
+ return h & hmask;
+}
+
+struct hlist_head *xfrm_hash_alloc(unsigned int sz);
+void xfrm_hash_free(struct hlist_head *n, unsigned int sz);
+
+#endif /* _XFRM_HASH_H */
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
new file mode 100644
index 000000000..b58286ecd
--- /dev/null
+++ b/net/xfrm/xfrm_input.c
@@ -0,0 +1,396 @@
+/*
+ * xfrm_input.c
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific portion
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
+
+static struct kmem_cache *secpath_cachep __read_mostly;
+
+static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
+static struct xfrm_input_afinfo __rcu *xfrm_input_afinfo[NPROTO];
+
+int xfrm_input_register_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+ int err = 0;
+
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock_bh(&xfrm_input_afinfo_lock);
+ if (unlikely(xfrm_input_afinfo[afinfo->family] != NULL))
+ err = -ENOBUFS;
+ else
+ rcu_assign_pointer(xfrm_input_afinfo[afinfo->family], afinfo);
+ spin_unlock_bh(&xfrm_input_afinfo_lock);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_input_register_afinfo);
+
+int xfrm_input_unregister_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+ int err = 0;
+
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock_bh(&xfrm_input_afinfo_lock);
+ if (likely(xfrm_input_afinfo[afinfo->family] != NULL)) {
+ if (unlikely(xfrm_input_afinfo[afinfo->family] != afinfo))
+ err = -EINVAL;
+ else
+ RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->family], NULL);
+ }
+ spin_unlock_bh(&xfrm_input_afinfo_lock);
+ synchronize_rcu();
+ return err;
+}
+EXPORT_SYMBOL(xfrm_input_unregister_afinfo);
+
+static struct xfrm_input_afinfo *xfrm_input_get_afinfo(unsigned int family)
+{
+ struct xfrm_input_afinfo *afinfo;
+
+ if (unlikely(family >= NPROTO))
+ return NULL;
+ rcu_read_lock();
+ afinfo = rcu_dereference(xfrm_input_afinfo[family]);
+ if (unlikely(!afinfo))
+ rcu_read_unlock();
+ return afinfo;
+}
+
+static void xfrm_input_put_afinfo(struct xfrm_input_afinfo *afinfo)
+{
+ rcu_read_unlock();
+}
+
+static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
+ int err)
+{
+ int ret;
+ struct xfrm_input_afinfo *afinfo = xfrm_input_get_afinfo(family);
+
+ if (!afinfo)
+ return -EAFNOSUPPORT;
+
+ ret = afinfo->callback(skb, protocol, err);
+ xfrm_input_put_afinfo(afinfo);
+
+ return ret;
+}
+
+void __secpath_destroy(struct sec_path *sp)
+{
+ int i;
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_put(sp->xvec[i]);
+ kmem_cache_free(secpath_cachep, sp);
+}
+EXPORT_SYMBOL(__secpath_destroy);
+
+struct sec_path *secpath_dup(struct sec_path *src)
+{
+ struct sec_path *sp;
+
+ sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC);
+ if (!sp)
+ return NULL;
+
+ sp->len = 0;
+ if (src) {
+ int i;
+
+ memcpy(sp, src, sizeof(*sp));
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_hold(sp->xvec[i]);
+ }
+ atomic_set(&sp->refcnt, 1);
+ return sp;
+}
+EXPORT_SYMBOL(secpath_dup);
+
+/* Fetch spi and seq from ipsec header */
+
+int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
+{
+ int offset, offset_seq;
+ int hlen;
+
+ switch (nexthdr) {
+ case IPPROTO_AH:
+ hlen = sizeof(struct ip_auth_hdr);
+ offset = offsetof(struct ip_auth_hdr, spi);
+ offset_seq = offsetof(struct ip_auth_hdr, seq_no);
+ break;
+ case IPPROTO_ESP:
+ hlen = sizeof(struct ip_esp_hdr);
+ offset = offsetof(struct ip_esp_hdr, spi);
+ offset_seq = offsetof(struct ip_esp_hdr, seq_no);
+ break;
+ case IPPROTO_COMP:
+ if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr)))
+ return -EINVAL;
+ *spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2)));
+ *seq = 0;
+ return 0;
+ default:
+ return 1;
+ }
+
+ if (!pskb_may_pull(skb, hlen))
+ return -EINVAL;
+
+ *spi = *(__be32 *)(skb_transport_header(skb) + offset);
+ *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq);
+ return 0;
+}
+
+int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct xfrm_mode *inner_mode = x->inner_mode;
+ int err;
+
+ err = x->outer_mode->afinfo->extract_input(x, skb);
+ if (err)
+ return err;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL)
+ return -EAFNOSUPPORT;
+ }
+
+ skb->protocol = inner_mode->afinfo->eth_proto;
+ return inner_mode->input2(x, skb);
+}
+EXPORT_SYMBOL(xfrm_prepare_input);
+
+int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+ struct net *net = dev_net(skb->dev);
+ int err;
+ __be32 seq;
+ __be32 seq_hi;
+ struct xfrm_state *x = NULL;
+ xfrm_address_t *daddr;
+ struct xfrm_mode *inner_mode;
+ u32 mark = skb->mark;
+ unsigned int family;
+ int decaps = 0;
+ int async = 0;
+
+ /* A negative encap_type indicates async resumption. */
+ if (encap_type < 0) {
+ async = 1;
+ x = xfrm_input_state(skb);
+ seq = XFRM_SKB_CB(skb)->seq.input.low;
+ family = x->outer_mode->afinfo->family;
+ goto resume;
+ }
+
+ daddr = (xfrm_address_t *)(skb_network_header(skb) +
+ XFRM_SPI_SKB_CB(skb)->daddroff);
+ family = XFRM_SPI_SKB_CB(skb)->family;
+
+ /* if tunnel is present override skb->mark value with tunnel i_key */
+ if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) {
+ switch (family) {
+ case AF_INET:
+ mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key);
+ break;
+ case AF_INET6:
+ mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key);
+ break;
+ }
+ }
+
+ /* Allocate new secpath or COW existing one. */
+ if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+ struct sec_path *sp;
+
+ sp = secpath_dup(skb->sp);
+ if (!sp) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+ goto drop;
+ }
+ if (skb->sp)
+ secpath_put(skb->sp);
+ skb->sp = sp;
+ }
+
+ seq = 0;
+ if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+ goto drop;
+ }
+
+ do {
+ if (skb->sp->len == XFRM_MAX_DEPTH) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto drop;
+ }
+
+ x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
+ if (x == NULL) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
+ xfrm_audit_state_notfound(skb, family, spi, seq);
+ goto drop;
+ }
+
+ skb->sp->xvec[skb->sp->len++] = x;
+
+ spin_lock(&x->lock);
+ if (unlikely(x->km.state == XFRM_STATE_ACQ)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
+ goto drop_unlock;
+ }
+
+ if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID);
+ goto drop_unlock;
+ }
+
+ if ((x->encap ? x->encap->encap_type : 0) != encap_type) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
+ goto drop_unlock;
+ }
+
+ if (x->repl->check(x, skb, seq)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
+ goto drop_unlock;
+ }
+
+ if (xfrm_state_check_expire(x)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED);
+ goto drop_unlock;
+ }
+
+ spin_unlock(&x->lock);
+
+ if (xfrm_tunnel_check(skb, x, family)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
+ goto drop;
+ }
+
+ seq_hi = htonl(xfrm_replay_seqhi(x, seq));
+
+ XFRM_SKB_CB(skb)->seq.input.low = seq;
+ XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
+
+ skb_dst_force(skb);
+
+ nexthdr = x->type->input(x, skb);
+
+ if (nexthdr == -EINPROGRESS)
+ return 0;
+resume:
+ spin_lock(&x->lock);
+ if (nexthdr <= 0) {
+ if (nexthdr == -EBADMSG) {
+ xfrm_audit_state_icvfail(x, skb,
+ x->type->proto);
+ x->stats.integrity_failed++;
+ }
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR);
+ goto drop_unlock;
+ }
+
+ /* only the first xfrm gets the encap type */
+ encap_type = 0;
+
+ if (async && x->repl->recheck(x, skb, seq)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
+ goto drop_unlock;
+ }
+
+ x->repl->advance(x, seq);
+
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+
+ spin_unlock(&x->lock);
+
+ XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
+
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL)
+ goto drop;
+ }
+
+ if (inner_mode->input(x, skb)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
+ goto drop;
+ }
+
+ if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) {
+ decaps = 1;
+ break;
+ }
+
+ /*
+ * We need the inner address. However, we only get here for
+ * transport mode so the outer address is identical.
+ */
+ daddr = &x->id.daddr;
+ family = x->outer_mode->afinfo->family;
+
+ err = xfrm_parse_spi(skb, nexthdr, &spi, &seq);
+ if (err < 0) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+ goto drop;
+ }
+ } while (!err);
+
+ err = xfrm_rcv_cb(skb, family, x->type->proto, 0);
+ if (err)
+ goto drop;
+
+ nf_reset(skb);
+
+ if (decaps) {
+ skb_dst_drop(skb);
+ netif_rx(skb);
+ return 0;
+ } else {
+ return x->inner_mode->afinfo->transport_finish(skb, async);
+ }
+
+drop_unlock:
+ spin_unlock(&x->lock);
+drop:
+ xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1);
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_input);
+
+int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
+{
+ return xfrm_input(skb, nexthdr, 0, -1);
+}
+EXPORT_SYMBOL(xfrm_input_resume);
+
+void __init xfrm_input_init(void)
+{
+ secpath_cachep = kmem_cache_create("secpath_cache",
+ sizeof(struct sec_path),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
new file mode 100644
index 000000000..ccfdc7115
--- /dev/null
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -0,0 +1,386 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2003-2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ * - Tunable compression parameters.
+ * - Compression stats.
+ * - Adaptive compression.
+ */
+
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/ipcomp.h>
+#include <net/xfrm.h>
+
+struct ipcomp_tfms {
+ struct list_head list;
+ struct crypto_comp * __percpu *tfms;
+ int users;
+};
+
+static DEFINE_MUTEX(ipcomp_resource_mutex);
+static void * __percpu *ipcomp_scratches;
+static int ipcomp_scratch_users;
+static LIST_HEAD(ipcomp_tfms_list);
+
+static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipcomp_data *ipcd = x->data;
+ const int plen = skb->len;
+ int dlen = IPCOMP_SCRATCH_SIZE;
+ const u8 *start = skb->data;
+ const int cpu = get_cpu();
+ u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+ struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+ int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+ int len;
+
+ if (err)
+ goto out;
+
+ if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ len = dlen - plen;
+ if (len > skb_tailroom(skb))
+ len = skb_tailroom(skb);
+
+ __skb_put(skb, len);
+
+ len += plen;
+ skb_copy_to_linear_data(skb, scratch, len);
+
+ while ((scratch += len, dlen -= len) > 0) {
+ skb_frag_t *frag;
+ struct page *page;
+
+ err = -EMSGSIZE;
+ if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS))
+ goto out;
+
+ frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags;
+ page = alloc_page(GFP_ATOMIC);
+
+ err = -ENOMEM;
+ if (!page)
+ goto out;
+
+ __skb_frag_set_page(frag, page);
+
+ len = PAGE_SIZE;
+ if (dlen < len)
+ len = dlen;
+
+ frag->page_offset = 0;
+ skb_frag_size_set(frag, len);
+ memcpy(skb_frag_address(frag), scratch, len);
+
+ skb->truesize += len;
+ skb->data_len += len;
+ skb->len += len;
+
+ skb_shinfo(skb)->nr_frags++;
+ }
+
+ err = 0;
+
+out:
+ put_cpu();
+ return err;
+}
+
+int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int nexthdr;
+ int err = -ENOMEM;
+ struct ip_comp_hdr *ipch;
+
+ if (skb_linearize_cow(skb))
+ goto out;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ /* Remove ipcomp header and decompress original payload */
+ ipch = (void *)skb->data;
+ nexthdr = ipch->nexthdr;
+
+ skb->transport_header = skb->network_header + sizeof(*ipch);
+ __skb_pull(skb, sizeof(*ipch));
+ err = ipcomp_decompress(x, skb);
+ if (err)
+ goto out;
+
+ err = nexthdr;
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(ipcomp_input);
+
+static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ipcomp_data *ipcd = x->data;
+ const int plen = skb->len;
+ int dlen = IPCOMP_SCRATCH_SIZE;
+ u8 *start = skb->data;
+ struct crypto_comp *tfm;
+ u8 *scratch;
+ int err;
+
+ local_bh_disable();
+ scratch = *this_cpu_ptr(ipcomp_scratches);
+ tfm = *this_cpu_ptr(ipcd->tfms);
+ err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
+ if (err)
+ goto out;
+
+ if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+ local_bh_enable();
+
+ pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
+ return 0;
+
+out:
+ local_bh_enable();
+ return err;
+}
+
+int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+ struct ip_comp_hdr *ipch;
+ struct ipcomp_data *ipcd = x->data;
+
+ if (skb->len < ipcd->threshold) {
+ /* Don't bother compressing */
+ goto out_ok;
+ }
+
+ if (skb_linearize_cow(skb))
+ goto out_ok;
+
+ err = ipcomp_compress(x, skb);
+
+ if (err) {
+ goto out_ok;
+ }
+
+ /* Install ipcomp header, convert into ipcomp datagram. */
+ ipch = ip_comp_hdr(skb);
+ ipch->nexthdr = *skb_mac_header(skb);
+ ipch->flags = 0;
+ ipch->cpi = htons((u16 )ntohl(x->id.spi));
+ *skb_mac_header(skb) = IPPROTO_COMP;
+out_ok:
+ skb_push(skb, -skb_network_offset(skb));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ipcomp_output);
+
+static void ipcomp_free_scratches(void)
+{
+ int i;
+ void * __percpu *scratches;
+
+ if (--ipcomp_scratch_users)
+ return;
+
+ scratches = ipcomp_scratches;
+ if (!scratches)
+ return;
+
+ for_each_possible_cpu(i)
+ vfree(*per_cpu_ptr(scratches, i));
+
+ free_percpu(scratches);
+}
+
+static void * __percpu *ipcomp_alloc_scratches(void)
+{
+ void * __percpu *scratches;
+ int i;
+
+ if (ipcomp_scratch_users++)
+ return ipcomp_scratches;
+
+ scratches = alloc_percpu(void *);
+ if (!scratches)
+ return NULL;
+
+ ipcomp_scratches = scratches;
+
+ for_each_possible_cpu(i) {
+ void *scratch;
+
+ scratch = vmalloc_node(IPCOMP_SCRATCH_SIZE, cpu_to_node(i));
+ if (!scratch)
+ return NULL;
+ *per_cpu_ptr(scratches, i) = scratch;
+ }
+
+ return scratches;
+}
+
+static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms)
+{
+ struct ipcomp_tfms *pos;
+ int cpu;
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ if (pos->tfms == tfms)
+ break;
+ }
+
+ WARN_ON(!pos);
+
+ if (--pos->users)
+ return;
+
+ list_del(&pos->list);
+ kfree(pos);
+
+ if (!tfms)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu);
+ crypto_free_comp(tfm);
+ }
+ free_percpu(tfms);
+}
+
+static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name)
+{
+ struct ipcomp_tfms *pos;
+ struct crypto_comp * __percpu *tfms;
+ int cpu;
+
+
+ list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+ struct crypto_comp *tfm;
+
+ /* This can be any valid CPU ID so we don't need locking. */
+ tfm = __this_cpu_read(*pos->tfms);
+
+ if (!strcmp(crypto_comp_name(tfm), alg_name)) {
+ pos->users++;
+ return pos->tfms;
+ }
+ }
+
+ pos = kmalloc(sizeof(*pos), GFP_KERNEL);
+ if (!pos)
+ return NULL;
+
+ pos->users = 1;
+ INIT_LIST_HEAD(&pos->list);
+ list_add(&pos->list, &ipcomp_tfms_list);
+
+ pos->tfms = tfms = alloc_percpu(struct crypto_comp *);
+ if (!tfms)
+ goto error;
+
+ for_each_possible_cpu(cpu) {
+ struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
+ CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ goto error;
+ *per_cpu_ptr(tfms, cpu) = tfm;
+ }
+
+ return tfms;
+
+error:
+ ipcomp_free_tfms(tfms);
+ return NULL;
+}
+
+static void ipcomp_free_data(struct ipcomp_data *ipcd)
+{
+ if (ipcd->tfms)
+ ipcomp_free_tfms(ipcd->tfms);
+ ipcomp_free_scratches();
+}
+
+void ipcomp_destroy(struct xfrm_state *x)
+{
+ struct ipcomp_data *ipcd = x->data;
+ if (!ipcd)
+ return;
+ xfrm_state_delete_tunnel(x);
+ mutex_lock(&ipcomp_resource_mutex);
+ ipcomp_free_data(ipcd);
+ mutex_unlock(&ipcomp_resource_mutex);
+ kfree(ipcd);
+}
+EXPORT_SYMBOL_GPL(ipcomp_destroy);
+
+int ipcomp_init_state(struct xfrm_state *x)
+{
+ int err;
+ struct ipcomp_data *ipcd;
+ struct xfrm_algo_desc *calg_desc;
+
+ err = -EINVAL;
+ if (!x->calg)
+ goto out;
+
+ if (x->encap)
+ goto out;
+
+ err = -ENOMEM;
+ ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
+ if (!ipcd)
+ goto out;
+
+ mutex_lock(&ipcomp_resource_mutex);
+ if (!ipcomp_alloc_scratches())
+ goto error;
+
+ ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
+ if (!ipcd->tfms)
+ goto error;
+ mutex_unlock(&ipcomp_resource_mutex);
+
+ calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
+ BUG_ON(!calg_desc);
+ ipcd->threshold = calg_desc->uinfo.comp.threshold;
+ x->data = ipcd;
+ err = 0;
+out:
+ return err;
+
+error:
+ ipcomp_free_data(ipcd);
+ mutex_unlock(&ipcomp_resource_mutex);
+ kfree(ipcd);
+ goto out;
+}
+EXPORT_SYMBOL_GPL(ipcomp_init_state);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
new file mode 100644
index 000000000..fbcedbe33
--- /dev/null
+++ b/net/xfrm/xfrm_output.c
@@ -0,0 +1,236 @@
+/*
+ * xfrm_output.c - Common IPsec encapsulation code.
+ *
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+
+static int xfrm_output2(struct sock *sk, struct sk_buff *skb);
+
+static int xfrm_skb_check_space(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ int nhead = dst->header_len + LL_RESERVED_SPACE(dst->dev)
+ - skb_headroom(skb);
+ int ntail = dst->dev->needed_tailroom - skb_tailroom(skb);
+
+ if (nhead <= 0) {
+ if (ntail <= 0)
+ return 0;
+ nhead = 0;
+ } else if (ntail < 0)
+ ntail = 0;
+
+ return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC);
+}
+
+static int xfrm_output_one(struct sk_buff *skb, int err)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+ struct net *net = xs_net(x);
+
+ if (err <= 0)
+ goto resume;
+
+ do {
+ err = xfrm_skb_check_space(skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ goto error_nolock;
+ }
+
+ err = x->outer_mode->output(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+ goto error_nolock;
+ }
+
+ spin_lock_bh(&x->lock);
+
+ if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID);
+ err = -EINVAL;
+ goto error;
+ }
+
+ err = xfrm_state_check_expire(x);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED);
+ goto error;
+ }
+
+ err = x->repl->overflow(x, skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR);
+ goto error;
+ }
+
+ x->curlft.bytes += skb->len;
+ x->curlft.packets++;
+
+ spin_unlock_bh(&x->lock);
+
+ skb_dst_force(skb);
+
+ err = x->type->output(x, skb);
+ if (err == -EINPROGRESS)
+ goto out;
+
+resume:
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+ goto error_nolock;
+ }
+
+ dst = skb_dst_pop(skb);
+ if (!dst) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ err = -EHOSTUNREACH;
+ goto error_nolock;
+ }
+ skb_dst_set(skb, dst);
+ x = dst->xfrm;
+ } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
+
+ return 0;
+
+error:
+ spin_unlock_bh(&x->lock);
+error_nolock:
+ kfree_skb(skb);
+out:
+ return err;
+}
+
+int xfrm_output_resume(struct sk_buff *skb, int err)
+{
+ while (likely((err = xfrm_output_one(skb, err)) == 0)) {
+ nf_reset(skb);
+
+ err = skb_dst(skb)->ops->local_out(skb);
+ if (unlikely(err != 1))
+ goto out;
+
+ if (!skb_dst(skb)->xfrm)
+ return dst_output(skb);
+
+ err = nf_hook(skb_dst(skb)->ops->family,
+ NF_INET_POST_ROUTING, skb->sk, skb,
+ NULL, skb_dst(skb)->dev, xfrm_output2);
+ if (unlikely(err != 1))
+ goto out;
+ }
+
+ if (err == -EINPROGRESS)
+ err = 0;
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(xfrm_output_resume);
+
+static int xfrm_output2(struct sock *sk, struct sk_buff *skb)
+{
+ return xfrm_output_resume(skb, 1);
+}
+
+static int xfrm_output_gso(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *segs;
+
+ segs = skb_gso_segment(skb, 0);
+ kfree_skb(skb);
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+ if (segs == NULL)
+ return -EINVAL;
+
+ do {
+ struct sk_buff *nskb = segs->next;
+ int err;
+
+ segs->next = NULL;
+ err = xfrm_output2(sk, segs);
+
+ if (unlikely(err)) {
+ kfree_skb_list(nskb);
+ return err;
+ }
+
+ segs = nskb;
+ } while (segs);
+
+ return 0;
+}
+
+int xfrm_output(struct sock *sk, struct sk_buff *skb)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ int err;
+
+ if (skb_is_gso(skb))
+ return xfrm_output_gso(sk, skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ err = skb_checksum_help(skb);
+ if (err) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ kfree_skb(skb);
+ return err;
+ }
+ }
+
+ return xfrm_output2(sk, skb);
+}
+EXPORT_SYMBOL_GPL(xfrm_output);
+
+int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct xfrm_mode *inner_mode;
+ if (x->sel.family == AF_UNSPEC)
+ inner_mode = xfrm_ip2inner_mode(x,
+ xfrm_af2proto(skb_dst(skb)->ops->family));
+ else
+ inner_mode = x->inner_mode;
+
+ if (inner_mode == NULL)
+ return -EAFNOSUPPORT;
+ return inner_mode->afinfo->extract_output(x, skb);
+}
+EXPORT_SYMBOL_GPL(xfrm_inner_extract_output);
+
+void xfrm_local_error(struct sk_buff *skb, int mtu)
+{
+ unsigned int proto;
+ struct xfrm_state_afinfo *afinfo;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ proto = AF_INET;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ proto = AF_INET6;
+ else
+ return;
+
+ afinfo = xfrm_state_get_afinfo(proto);
+ if (!afinfo)
+ return;
+
+ afinfo->local_error(skb, mtu);
+ xfrm_state_put_afinfo(afinfo);
+}
+EXPORT_SYMBOL_GPL(xfrm_local_error);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
new file mode 100644
index 000000000..638af0655
--- /dev/null
+++ b/net/xfrm/xfrm_policy.c
@@ -0,0 +1,3397 @@
+/*
+ * xfrm_policy.c
+ *
+ * Changes:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * IPv6 support
+ * Kazunori MIYAZAWA @USAGI
+ * YOSHIFUJI Hideaki
+ * Split up af-specific portion
+ * Derek Atkins <derek@ihtfp.com> Add the post_input processor
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/audit.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#ifdef CONFIG_XFRM_STATISTICS
+#include <net/snmp.h>
+#endif
+
+#include "xfrm_hash.h"
+
+#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
+#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
+#define XFRM_MAX_QUEUE_LEN 100
+
+struct xfrm_flo {
+ struct dst_entry *dst_orig;
+ u8 flags;
+};
+
+static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
+static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
+ __read_mostly;
+
+static struct kmem_cache *xfrm_dst_cache __read_mostly;
+
+static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
+static int xfrm_bundle_ok(struct xfrm_dst *xdst);
+static void xfrm_policy_queue_process(unsigned long arg);
+
+static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
+static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
+ int dir);
+
+static inline bool
+__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi4 *fl4 = &fl->u.ip4;
+
+ return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
+ addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
+ !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
+ !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
+ (fl4->flowi4_proto == sel->proto || !sel->proto) &&
+ (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
+}
+
+static inline bool
+__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
+{
+ const struct flowi6 *fl6 = &fl->u.ip6;
+
+ return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
+ addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
+ !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
+ !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
+ (fl6->flowi6_proto == sel->proto || !sel->proto) &&
+ (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
+}
+
+bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
+ unsigned short family)
+{
+ switch (family) {
+ case AF_INET:
+ return __xfrm4_selector_match(sel, fl);
+ case AF_INET6:
+ return __xfrm6_selector_match(sel, fl);
+ }
+ return false;
+}
+
+static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+{
+ struct xfrm_policy_afinfo *afinfo;
+
+ if (unlikely(family >= NPROTO))
+ return NULL;
+ rcu_read_lock();
+ afinfo = rcu_dereference(xfrm_policy_afinfo[family]);
+ if (unlikely(!afinfo))
+ rcu_read_unlock();
+ return afinfo;
+}
+
+static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+ rcu_read_unlock();
+}
+
+static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+ const xfrm_address_t *saddr,
+ const xfrm_address_t *daddr,
+ int family)
+{
+ struct xfrm_policy_afinfo *afinfo;
+ struct dst_entry *dst;
+
+ afinfo = xfrm_policy_get_afinfo(family);
+ if (unlikely(afinfo == NULL))
+ return ERR_PTR(-EAFNOSUPPORT);
+
+ dst = afinfo->dst_lookup(net, tos, saddr, daddr);
+
+ xfrm_policy_put_afinfo(afinfo);
+
+ return dst;
+}
+
+static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+ xfrm_address_t *prev_saddr,
+ xfrm_address_t *prev_daddr,
+ int family)
+{
+ struct net *net = xs_net(x);
+ xfrm_address_t *saddr = &x->props.saddr;
+ xfrm_address_t *daddr = &x->id.daddr;
+ struct dst_entry *dst;
+
+ if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
+ saddr = x->coaddr;
+ daddr = prev_daddr;
+ }
+ if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
+ saddr = prev_saddr;
+ daddr = x->coaddr;
+ }
+
+ dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);
+
+ if (!IS_ERR(dst)) {
+ if (prev_saddr != saddr)
+ memcpy(prev_saddr, saddr, sizeof(*prev_saddr));
+ if (prev_daddr != daddr)
+ memcpy(prev_daddr, daddr, sizeof(*prev_daddr));
+ }
+
+ return dst;
+}
+
+static inline unsigned long make_jiffies(long secs)
+{
+ if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
+ return MAX_SCHEDULE_TIMEOUT-1;
+ else
+ return secs*HZ;
+}
+
+static void xfrm_policy_timer(unsigned long data)
+{
+ struct xfrm_policy *xp = (struct xfrm_policy *)data;
+ unsigned long now = get_seconds();
+ long next = LONG_MAX;
+ int warn = 0;
+ int dir;
+
+ read_lock(&xp->lock);
+
+ if (unlikely(xp->walk.dead))
+ goto out;
+
+ dir = xfrm_policy_id2dir(xp->index);
+
+ if (xp->lft.hard_add_expires_seconds) {
+ long tmo = xp->lft.hard_add_expires_seconds +
+ xp->curlft.add_time - now;
+ if (tmo <= 0)
+ goto expired;
+ if (tmo < next)
+ next = tmo;
+ }
+ if (xp->lft.hard_use_expires_seconds) {
+ long tmo = xp->lft.hard_use_expires_seconds +
+ (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ if (tmo <= 0)
+ goto expired;
+ if (tmo < next)
+ next = tmo;
+ }
+ if (xp->lft.soft_add_expires_seconds) {
+ long tmo = xp->lft.soft_add_expires_seconds +
+ xp->curlft.add_time - now;
+ if (tmo <= 0) {
+ warn = 1;
+ tmo = XFRM_KM_TIMEOUT;
+ }
+ if (tmo < next)
+ next = tmo;
+ }
+ if (xp->lft.soft_use_expires_seconds) {
+ long tmo = xp->lft.soft_use_expires_seconds +
+ (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ if (tmo <= 0) {
+ warn = 1;
+ tmo = XFRM_KM_TIMEOUT;
+ }
+ if (tmo < next)
+ next = tmo;
+ }
+
+ if (warn)
+ km_policy_expired(xp, dir, 0, 0);
+ if (next != LONG_MAX &&
+ !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
+ xfrm_pol_hold(xp);
+
+out:
+ read_unlock(&xp->lock);
+ xfrm_pol_put(xp);
+ return;
+
+expired:
+ read_unlock(&xp->lock);
+ if (!xfrm_policy_delete(xp, dir))
+ km_policy_expired(xp, dir, 1, 0);
+ xfrm_pol_put(xp);
+}
+
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+ struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+ if (unlikely(pol->walk.dead))
+ flo = NULL;
+ else
+ xfrm_pol_hold(pol);
+
+ return flo;
+}
+
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+ struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo);
+
+ return !pol->walk.dead;
+}
+
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+ xfrm_pol_put(container_of(flo, struct xfrm_policy, flo));
+}
+
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+ .get = xfrm_policy_flo_get,
+ .check = xfrm_policy_flo_check,
+ .delete = xfrm_policy_flo_delete,
+};
+
+/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
+ * SPD calls.
+ */
+
+struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
+{
+ struct xfrm_policy *policy;
+
+ policy = kzalloc(sizeof(struct xfrm_policy), gfp);
+
+ if (policy) {
+ write_pnet(&policy->xp_net, net);
+ INIT_LIST_HEAD(&policy->walk.all);
+ INIT_HLIST_NODE(&policy->bydst);
+ INIT_HLIST_NODE(&policy->byidx);
+ rwlock_init(&policy->lock);
+ atomic_set(&policy->refcnt, 1);
+ skb_queue_head_init(&policy->polq.hold_queue);
+ setup_timer(&policy->timer, xfrm_policy_timer,
+ (unsigned long)policy);
+ setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process,
+ (unsigned long)policy);
+ policy->flo.ops = &xfrm_policy_fc_ops;
+ }
+ return policy;
+}
+EXPORT_SYMBOL(xfrm_policy_alloc);
+
+/* Destroy xfrm_policy: descendant resources must be released to this moment. */
+
+void xfrm_policy_destroy(struct xfrm_policy *policy)
+{
+ BUG_ON(!policy->walk.dead);
+
+ if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
+ BUG();
+
+ security_xfrm_policy_free(policy->security);
+ kfree(policy);
+}
+EXPORT_SYMBOL(xfrm_policy_destroy);
+
+static void xfrm_queue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(list)) != NULL)
+ kfree_skb(skb);
+}
+
+/* Rule must be locked. Release descentant resources, announce
+ * entry dead. The rule must be unlinked from lists to the moment.
+ */
+
+static void xfrm_policy_kill(struct xfrm_policy *policy)
+{
+ policy->walk.dead = 1;
+
+ atomic_inc(&policy->genid);
+
+ if (del_timer(&policy->polq.hold_timer))
+ xfrm_pol_put(policy);
+ xfrm_queue_purge(&policy->polq.hold_queue);
+
+ if (del_timer(&policy->timer))
+ xfrm_pol_put(policy);
+
+ xfrm_pol_put(policy);
+}
+
+static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
+
+static inline unsigned int idx_hash(struct net *net, u32 index)
+{
+ return __idx_hash(index, net->xfrm.policy_idx_hmask);
+}
+
+/* calculate policy hash thresholds */
+static void __get_hash_thresh(struct net *net,
+ unsigned short family, int dir,
+ u8 *dbits, u8 *sbits)
+{
+ switch (family) {
+ case AF_INET:
+ *dbits = net->xfrm.policy_bydst[dir].dbits4;
+ *sbits = net->xfrm.policy_bydst[dir].sbits4;
+ break;
+
+ case AF_INET6:
+ *dbits = net->xfrm.policy_bydst[dir].dbits6;
+ *sbits = net->xfrm.policy_bydst[dir].sbits6;
+ break;
+
+ default:
+ *dbits = 0;
+ *sbits = 0;
+ }
+}
+
+static struct hlist_head *policy_hash_bysel(struct net *net,
+ const struct xfrm_selector *sel,
+ unsigned short family, int dir)
+{
+ unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
+ unsigned int hash;
+ u8 dbits;
+ u8 sbits;
+
+ __get_hash_thresh(net, family, dir, &dbits, &sbits);
+ hash = __sel_hash(sel, family, hmask, dbits, sbits);
+
+ return (hash == hmask + 1 ?
+ &net->xfrm.policy_inexact[dir] :
+ net->xfrm.policy_bydst[dir].table + hash);
+}
+
+static struct hlist_head *policy_hash_direct(struct net *net,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family, int dir)
+{
+ unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
+ unsigned int hash;
+ u8 dbits;
+ u8 sbits;
+
+ __get_hash_thresh(net, family, dir, &dbits, &sbits);
+ hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
+
+ return net->xfrm.policy_bydst[dir].table + hash;
+}
+
+static void xfrm_dst_hash_transfer(struct net *net,
+ struct hlist_head *list,
+ struct hlist_head *ndsttable,
+ unsigned int nhashmask,
+ int dir)
+{
+ struct hlist_node *tmp, *entry0 = NULL;
+ struct xfrm_policy *pol;
+ unsigned int h0 = 0;
+ u8 dbits;
+ u8 sbits;
+
+redo:
+ hlist_for_each_entry_safe(pol, tmp, list, bydst) {
+ unsigned int h;
+
+ __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
+ h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
+ pol->family, nhashmask, dbits, sbits);
+ if (!entry0) {
+ hlist_del(&pol->bydst);
+ hlist_add_head(&pol->bydst, ndsttable+h);
+ h0 = h;
+ } else {
+ if (h != h0)
+ continue;
+ hlist_del(&pol->bydst);
+ hlist_add_behind(&pol->bydst, entry0);
+ }
+ entry0 = &pol->bydst;
+ }
+ if (!hlist_empty(list)) {
+ entry0 = NULL;
+ goto redo;
+ }
+}
+
+static void xfrm_idx_hash_transfer(struct hlist_head *list,
+ struct hlist_head *nidxtable,
+ unsigned int nhashmask)
+{
+ struct hlist_node *tmp;
+ struct xfrm_policy *pol;
+
+ hlist_for_each_entry_safe(pol, tmp, list, byidx) {
+ unsigned int h;
+
+ h = __idx_hash(pol->index, nhashmask);
+ hlist_add_head(&pol->byidx, nidxtable+h);
+ }
+}
+
+static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
+{
+ return ((old_hmask + 1) << 1) - 1;
+}
+
+static void xfrm_bydst_resize(struct net *net, int dir)
+{
+ unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
+ unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+ unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+ struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
+ struct hlist_head *ndst = xfrm_hash_alloc(nsize);
+ int i;
+
+ if (!ndst)
+ return;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+ for (i = hmask; i >= 0; i--)
+ xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
+
+ net->xfrm.policy_bydst[dir].table = ndst;
+ net->xfrm.policy_bydst[dir].hmask = nhashmask;
+
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+static void xfrm_byidx_resize(struct net *net, int total)
+{
+ unsigned int hmask = net->xfrm.policy_idx_hmask;
+ unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+ unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+ struct hlist_head *oidx = net->xfrm.policy_byidx;
+ struct hlist_head *nidx = xfrm_hash_alloc(nsize);
+ int i;
+
+ if (!nidx)
+ return;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+ for (i = hmask; i >= 0; i--)
+ xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
+
+ net->xfrm.policy_byidx = nidx;
+ net->xfrm.policy_idx_hmask = nhashmask;
+
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
+{
+ unsigned int cnt = net->xfrm.policy_count[dir];
+ unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
+
+ if (total)
+ *total += cnt;
+
+ if ((hmask + 1) < xfrm_policy_hashmax &&
+ cnt > hmask)
+ return 1;
+
+ return 0;
+}
+
+static inline int xfrm_byidx_should_resize(struct net *net, int total)
+{
+ unsigned int hmask = net->xfrm.policy_idx_hmask;
+
+ if ((hmask + 1) < xfrm_policy_hashmax &&
+ total > hmask)
+ return 1;
+
+ return 0;
+}
+
+void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
+{
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
+ si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
+ si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
+ si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
+ si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
+ si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
+ si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
+ si->spdhcnt = net->xfrm.policy_idx_hmask;
+ si->spdhmcnt = xfrm_policy_hashmax;
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+EXPORT_SYMBOL(xfrm_spd_getinfo);
+
+static DEFINE_MUTEX(hash_resize_mutex);
+static void xfrm_hash_resize(struct work_struct *work)
+{
+ struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
+ int dir, total;
+
+ mutex_lock(&hash_resize_mutex);
+
+ total = 0;
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ if (xfrm_bydst_should_resize(net, dir, &total))
+ xfrm_bydst_resize(net, dir);
+ }
+ if (xfrm_byidx_should_resize(net, total))
+ xfrm_byidx_resize(net, total);
+
+ mutex_unlock(&hash_resize_mutex);
+}
+
+static void xfrm_hash_rebuild(struct work_struct *work)
+{
+ struct net *net = container_of(work, struct net,
+ xfrm.policy_hthresh.work);
+ unsigned int hmask;
+ struct xfrm_policy *pol;
+ struct xfrm_policy *policy;
+ struct hlist_head *chain;
+ struct hlist_head *odst;
+ struct hlist_node *newpos;
+ int i;
+ int dir;
+ unsigned seq;
+ u8 lbits4, rbits4, lbits6, rbits6;
+
+ mutex_lock(&hash_resize_mutex);
+
+ /* read selector prefixlen thresholds */
+ do {
+ seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);
+
+ lbits4 = net->xfrm.policy_hthresh.lbits4;
+ rbits4 = net->xfrm.policy_hthresh.rbits4;
+ lbits6 = net->xfrm.policy_hthresh.lbits6;
+ rbits6 = net->xfrm.policy_hthresh.rbits6;
+ } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+ /* reset the bydst and inexact table in all directions */
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+ hmask = net->xfrm.policy_bydst[dir].hmask;
+ odst = net->xfrm.policy_bydst[dir].table;
+ for (i = hmask; i >= 0; i--)
+ INIT_HLIST_HEAD(odst + i);
+ if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
+ /* dir out => dst = remote, src = local */
+ net->xfrm.policy_bydst[dir].dbits4 = rbits4;
+ net->xfrm.policy_bydst[dir].sbits4 = lbits4;
+ net->xfrm.policy_bydst[dir].dbits6 = rbits6;
+ net->xfrm.policy_bydst[dir].sbits6 = lbits6;
+ } else {
+ /* dir in/fwd => dst = local, src = remote */
+ net->xfrm.policy_bydst[dir].dbits4 = lbits4;
+ net->xfrm.policy_bydst[dir].sbits4 = rbits4;
+ net->xfrm.policy_bydst[dir].dbits6 = lbits6;
+ net->xfrm.policy_bydst[dir].sbits6 = rbits6;
+ }
+ }
+
+ /* re-insert all policies by order of creation */
+ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+ newpos = NULL;
+ chain = policy_hash_bysel(net, &policy->selector,
+ policy->family,
+ xfrm_policy_id2dir(policy->index));
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (policy->priority >= pol->priority)
+ newpos = &pol->bydst;
+ else
+ break;
+ }
+ if (newpos)
+ hlist_add_behind(&policy->bydst, newpos);
+ else
+ hlist_add_head(&policy->bydst, chain);
+ }
+
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ mutex_unlock(&hash_resize_mutex);
+}
+
+void xfrm_policy_hash_rebuild(struct net *net)
+{
+ schedule_work(&net->xfrm.policy_hthresh.work);
+}
+EXPORT_SYMBOL(xfrm_policy_hash_rebuild);
+
+/* Generate new index... KAME seems to generate them ordered by cost
+ * of an absolute inpredictability of ordering of rules. This will not pass. */
+static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
+{
+ static u32 idx_generator;
+
+ for (;;) {
+ struct hlist_head *list;
+ struct xfrm_policy *p;
+ u32 idx;
+ int found;
+
+ if (!index) {
+ idx = (idx_generator | dir);
+ idx_generator += 8;
+ } else {
+ idx = index;
+ index = 0;
+ }
+
+ if (idx == 0)
+ idx = 8;
+ list = net->xfrm.policy_byidx + idx_hash(net, idx);
+ found = 0;
+ hlist_for_each_entry(p, list, byidx) {
+ if (p->index == idx) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return idx;
+ }
+}
+
+static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
+{
+ u32 *p1 = (u32 *) s1;
+ u32 *p2 = (u32 *) s2;
+ int len = sizeof(struct xfrm_selector) / sizeof(u32);
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (p1[i] != p2[i])
+ return 1;
+ }
+
+ return 0;
+}
+
+static void xfrm_policy_requeue(struct xfrm_policy *old,
+ struct xfrm_policy *new)
+{
+ struct xfrm_policy_queue *pq = &old->polq;
+ struct sk_buff_head list;
+
+ __skb_queue_head_init(&list);
+
+ spin_lock_bh(&pq->hold_queue.lock);
+ skb_queue_splice_init(&pq->hold_queue, &list);
+ if (del_timer(&pq->hold_timer))
+ xfrm_pol_put(old);
+ spin_unlock_bh(&pq->hold_queue.lock);
+
+ if (skb_queue_empty(&list))
+ return;
+
+ pq = &new->polq;
+
+ spin_lock_bh(&pq->hold_queue.lock);
+ skb_queue_splice(&list, &pq->hold_queue);
+ pq->timeout = XFRM_QUEUE_TMO_MIN;
+ if (!mod_timer(&pq->hold_timer, jiffies))
+ xfrm_pol_hold(new);
+ spin_unlock_bh(&pq->hold_queue.lock);
+}
+
+static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
+ struct xfrm_policy *pol)
+{
+ u32 mark = policy->mark.v & policy->mark.m;
+
+ if (policy->mark.v == pol->mark.v && policy->mark.m == pol->mark.m)
+ return true;
+
+ if ((mark & pol->mark.m) == pol->mark.v &&
+ policy->priority == pol->priority)
+ return true;
+
+ return false;
+}
+
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+ struct net *net = xp_net(policy);
+ struct xfrm_policy *pol;
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+ struct hlist_node *newpos;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+ delpol = NULL;
+ newpos = NULL;
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (pol->type == policy->type &&
+ !selector_cmp(&pol->selector, &policy->selector) &&
+ xfrm_policy_mark_match(policy, pol) &&
+ xfrm_sec_ctx_match(pol->security, policy->security) &&
+ !WARN_ON(delpol)) {
+ if (excl) {
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return -EEXIST;
+ }
+ delpol = pol;
+ if (policy->priority > pol->priority)
+ continue;
+ } else if (policy->priority >= pol->priority) {
+ newpos = &pol->bydst;
+ continue;
+ }
+ if (delpol)
+ break;
+ }
+ if (newpos)
+ hlist_add_behind(&policy->bydst, newpos);
+ else
+ hlist_add_head(&policy->bydst, chain);
+ __xfrm_policy_link(policy, dir);
+ atomic_inc(&net->xfrm.flow_cache_genid);
+
+ /* After previous checking, family can either be AF_INET or AF_INET6 */
+ if (policy->family == AF_INET)
+ rt_genid_bump_ipv4(net);
+ else
+ rt_genid_bump_ipv6(net);
+
+ if (delpol) {
+ xfrm_policy_requeue(delpol, policy);
+ __xfrm_policy_unlink(delpol, dir);
+ }
+ policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
+ hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
+ policy->curlft.add_time = get_seconds();
+ policy->curlft.use_time = 0;
+ if (!mod_timer(&policy->timer, jiffies + HZ))
+ xfrm_pol_hold(policy);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ if (delpol)
+ xfrm_policy_kill(delpol);
+ else if (xfrm_bydst_should_resize(net, dir, NULL))
+ schedule_work(&net->xfrm.policy_hash_work);
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_policy_insert);
+
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
+ int dir, struct xfrm_selector *sel,
+ struct xfrm_sec_ctx *ctx, int delete,
+ int *err)
+{
+ struct xfrm_policy *pol, *ret;
+ struct hlist_head *chain;
+
+ *err = 0;
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_bysel(net, sel, sel->family, dir);
+ ret = NULL;
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (pol->type == type &&
+ (mark & pol->mark.m) == pol->mark.v &&
+ !selector_cmp(sel, &pol->selector) &&
+ xfrm_sec_ctx_match(ctx, pol->security)) {
+ xfrm_pol_hold(pol);
+ if (delete) {
+ *err = security_xfrm_policy_delete(
+ pol->security);
+ if (*err) {
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
+ }
+ __xfrm_policy_unlink(pol, dir);
+ }
+ ret = pol;
+ break;
+ }
+ }
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ if (ret && delete)
+ xfrm_policy_kill(ret);
+ return ret;
+}
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
+
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
+ int dir, u32 id, int delete, int *err)
+{
+ struct xfrm_policy *pol, *ret;
+ struct hlist_head *chain;
+
+ *err = -ENOENT;
+ if (xfrm_policy_id2dir(id) != dir)
+ return NULL;
+
+ *err = 0;
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = net->xfrm.policy_byidx + idx_hash(net, id);
+ ret = NULL;
+ hlist_for_each_entry(pol, chain, byidx) {
+ if (pol->type == type && pol->index == id &&
+ (mark & pol->mark.m) == pol->mark.v) {
+ xfrm_pol_hold(pol);
+ if (delete) {
+ *err = security_xfrm_policy_delete(
+ pol->security);
+ if (*err) {
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
+ }
+ __xfrm_policy_unlink(pol, dir);
+ }
+ ret = pol;
+ break;
+ }
+ }
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ if (ret && delete)
+ xfrm_policy_kill(ret);
+ return ret;
+}
+EXPORT_SYMBOL(xfrm_policy_byid);
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int
+xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
+{
+ int dir, err = 0;
+
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ struct xfrm_policy *pol;
+ int i;
+
+ hlist_for_each_entry(pol,
+ &net->xfrm.policy_inexact[dir], bydst) {
+ if (pol->type != type)
+ continue;
+ err = security_xfrm_policy_delete(pol->security);
+ if (err) {
+ xfrm_audit_policy_delete(pol, 0, task_valid);
+ return err;
+ }
+ }
+ for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
+ hlist_for_each_entry(pol,
+ net->xfrm.policy_bydst[dir].table + i,
+ bydst) {
+ if (pol->type != type)
+ continue;
+ err = security_xfrm_policy_delete(
+ pol->security);
+ if (err) {
+ xfrm_audit_policy_delete(pol, 0,
+ task_valid);
+ return err;
+ }
+ }
+ }
+ }
+ return err;
+}
+#else
+static inline int
+xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
+{
+ return 0;
+}
+#endif
+
+int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
+{
+ int dir, err = 0, cnt = 0;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+
+ err = xfrm_policy_flush_secctx_check(net, type, task_valid);
+ if (err)
+ goto out;
+
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ struct xfrm_policy *pol;
+ int i;
+
+ again1:
+ hlist_for_each_entry(pol,
+ &net->xfrm.policy_inexact[dir], bydst) {
+ if (pol->type != type)
+ continue;
+ __xfrm_policy_unlink(pol, dir);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ cnt++;
+
+ xfrm_audit_policy_delete(pol, 1, task_valid);
+
+ xfrm_policy_kill(pol);
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ goto again1;
+ }
+
+ for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
+ again2:
+ hlist_for_each_entry(pol,
+ net->xfrm.policy_bydst[dir].table + i,
+ bydst) {
+ if (pol->type != type)
+ continue;
+ __xfrm_policy_unlink(pol, dir);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ cnt++;
+
+ xfrm_audit_policy_delete(pol, 1, task_valid);
+ xfrm_policy_kill(pol);
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ goto again2;
+ }
+ }
+
+ }
+ if (!cnt)
+ err = -ESRCH;
+out:
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_policy_flush);
+
+int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
+ int (*func)(struct xfrm_policy *, int, int, void*),
+ void *data)
+{
+ struct xfrm_policy *pol;
+ struct xfrm_policy_walk_entry *x;
+ int error = 0;
+
+ if (walk->type >= XFRM_POLICY_TYPE_MAX &&
+ walk->type != XFRM_POLICY_TYPE_ANY)
+ return -EINVAL;
+
+ if (list_empty(&walk->walk.all) && walk->seq != 0)
+ return 0;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ if (list_empty(&walk->walk.all))
+ x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
+ else
+ x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
+ list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
+ if (x->dead)
+ continue;
+ pol = container_of(x, struct xfrm_policy, walk);
+ if (walk->type != XFRM_POLICY_TYPE_ANY &&
+ walk->type != pol->type)
+ continue;
+ error = func(pol, xfrm_policy_id2dir(pol->index),
+ walk->seq, data);
+ if (error) {
+ list_move_tail(&walk->walk.all, &x->all);
+ goto out;
+ }
+ walk->seq++;
+ }
+ if (walk->seq == 0) {
+ error = -ENOENT;
+ goto out;
+ }
+ list_del_init(&walk->walk.all);
+out:
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return error;
+}
+EXPORT_SYMBOL(xfrm_policy_walk);
+
+void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
+{
+ INIT_LIST_HEAD(&walk->walk.all);
+ walk->walk.dead = 1;
+ walk->type = type;
+ walk->seq = 0;
+}
+EXPORT_SYMBOL(xfrm_policy_walk_init);
+
+void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
+{
+ if (list_empty(&walk->walk.all))
+ return;
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
+ list_del(&walk->walk.all);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+}
+EXPORT_SYMBOL(xfrm_policy_walk_done);
+
+/*
+ * Find policy to apply to this flow.
+ *
+ * Returns 0 if policy found, else an -errno.
+ */
+static int xfrm_policy_match(const struct xfrm_policy *pol,
+ const struct flowi *fl,
+ u8 type, u16 family, int dir)
+{
+ const struct xfrm_selector *sel = &pol->selector;
+ int ret = -ESRCH;
+ bool match;
+
+ if (pol->family != family ||
+ (fl->flowi_mark & pol->mark.m) != pol->mark.v ||
+ pol->type != type)
+ return ret;
+
+ match = xfrm_selector_match(sel, fl, family);
+ if (match)
+ ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
+ dir);
+
+ return ret;
+}
+
+static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
+ const struct flowi *fl,
+ u16 family, u8 dir)
+{
+ int err;
+ struct xfrm_policy *pol, *ret;
+ const xfrm_address_t *daddr, *saddr;
+ struct hlist_head *chain;
+ u32 priority = ~0U;
+
+ daddr = xfrm_flowi_daddr(fl, family);
+ saddr = xfrm_flowi_saddr(fl, family);
+ if (unlikely(!daddr || !saddr))
+ return NULL;
+
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
+ chain = policy_hash_direct(net, daddr, saddr, family, dir);
+ ret = NULL;
+ hlist_for_each_entry(pol, chain, bydst) {
+ err = xfrm_policy_match(pol, fl, type, family, dir);
+ if (err) {
+ if (err == -ESRCH)
+ continue;
+ else {
+ ret = ERR_PTR(err);
+ goto fail;
+ }
+ } else {
+ ret = pol;
+ priority = ret->priority;
+ break;
+ }
+ }
+ chain = &net->xfrm.policy_inexact[dir];
+ hlist_for_each_entry(pol, chain, bydst) {
+ err = xfrm_policy_match(pol, fl, type, family, dir);
+ if (err) {
+ if (err == -ESRCH)
+ continue;
+ else {
+ ret = ERR_PTR(err);
+ goto fail;
+ }
+ } else if (pol->priority < priority) {
+ ret = pol;
+ break;
+ }
+ }
+ if (ret)
+ xfrm_pol_hold(ret);
+fail:
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ return ret;
+}
+
+static struct xfrm_policy *
+__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+ struct xfrm_policy *pol;
+
+ pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+ if (pol != NULL)
+ return pol;
+#endif
+ return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
+static int flow_to_policy_dir(int dir)
+{
+ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ return dir;
+
+ switch (dir) {
+ default:
+ case FLOW_DIR_IN:
+ return XFRM_POLICY_IN;
+ case FLOW_DIR_OUT:
+ return XFRM_POLICY_OUT;
+ case FLOW_DIR_FWD:
+ return XFRM_POLICY_FWD;
+ }
+}
+
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
+ u8 dir, struct flow_cache_object *old_obj, void *ctx)
+{
+ struct xfrm_policy *pol;
+
+ if (old_obj)
+ xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
+
+ pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
+ if (IS_ERR_OR_NULL(pol))
+ return ERR_CAST(pol);
+
+ /* Resolver returns two references:
+ * one for cache and one for caller of flow_cache_lookup() */
+ xfrm_pol_hold(pol);
+
+ return &pol->flo;
+}
+
+static inline int policy_to_flow_dir(int dir)
+{
+ if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ return dir;
+ switch (dir) {
+ default:
+ case XFRM_POLICY_IN:
+ return FLOW_DIR_IN;
+ case XFRM_POLICY_OUT:
+ return FLOW_DIR_OUT;
+ case XFRM_POLICY_FWD:
+ return FLOW_DIR_FWD;
+ }
+}
+
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
+ const struct flowi *fl)
+{
+ struct xfrm_policy *pol;
+ struct net *net = sock_net(sk);
+
+ read_lock_bh(&net->xfrm.xfrm_policy_lock);
+ if ((pol = sk->sk_policy[dir]) != NULL) {
+ bool match = xfrm_selector_match(&pol->selector, fl,
+ sk->sk_family);
+ int err = 0;
+
+ if (match) {
+ if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+ pol = NULL;
+ goto out;
+ }
+ err = security_xfrm_policy_lookup(pol->security,
+ fl->flowi_secid,
+ policy_to_flow_dir(dir));
+ if (!err)
+ xfrm_pol_hold(pol);
+ else if (err == -ESRCH)
+ pol = NULL;
+ else
+ pol = ERR_PTR(err);
+ } else
+ pol = NULL;
+ }
+out:
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ return pol;
+}
+
+static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
+{
+ struct net *net = xp_net(pol);
+
+ list_add(&pol->walk.all, &net->xfrm.policy_all);
+ net->xfrm.policy_count[dir]++;
+ xfrm_pol_hold(pol);
+}
+
+static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
+ int dir)
+{
+ struct net *net = xp_net(pol);
+
+ if (list_empty(&pol->walk.all))
+ return NULL;
+
+ /* Socket policies are not hashed. */
+ if (!hlist_unhashed(&pol->bydst)) {
+ hlist_del(&pol->bydst);
+ hlist_del(&pol->byidx);
+ }
+
+ list_del_init(&pol->walk.all);
+ net->xfrm.policy_count[dir]--;
+
+ return pol;
+}
+
+static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
+{
+ __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir);
+}
+
+static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
+{
+ __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir);
+}
+
+int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
+{
+ struct net *net = xp_net(pol);
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ pol = __xfrm_policy_unlink(pol, dir);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ if (pol) {
+ xfrm_policy_kill(pol);
+ return 0;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(xfrm_policy_delete);
+
+int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
+{
+ struct net *net = xp_net(pol);
+ struct xfrm_policy *old_pol;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+ if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
+ return -EINVAL;
+#endif
+
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ old_pol = sk->sk_policy[dir];
+ sk->sk_policy[dir] = pol;
+ if (pol) {
+ pol->curlft.add_time = get_seconds();
+ pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
+ xfrm_sk_policy_link(pol, dir);
+ }
+ if (old_pol) {
+ if (pol)
+ xfrm_policy_requeue(old_pol, pol);
+
+ /* Unlinking succeeds always. This is the only function
+ * allowed to delete or replace socket policy.
+ */
+ xfrm_sk_policy_unlink(old_pol, dir);
+ }
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ if (old_pol) {
+ xfrm_policy_kill(old_pol);
+ }
+ return 0;
+}
+
+static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
+{
+ struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
+ struct net *net = xp_net(old);
+
+ if (newp) {
+ newp->selector = old->selector;
+ if (security_xfrm_policy_clone(old->security,
+ &newp->security)) {
+ kfree(newp);
+ return NULL; /* ENOMEM */
+ }
+ newp->lft = old->lft;
+ newp->curlft = old->curlft;
+ newp->mark = old->mark;
+ newp->action = old->action;
+ newp->flags = old->flags;
+ newp->xfrm_nr = old->xfrm_nr;
+ newp->index = old->index;
+ newp->type = old->type;
+ memcpy(newp->xfrm_vec, old->xfrm_vec,
+ newp->xfrm_nr*sizeof(struct xfrm_tmpl));
+ write_lock_bh(&net->xfrm.xfrm_policy_lock);
+ xfrm_sk_policy_link(newp, dir);
+ write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ xfrm_pol_put(newp);
+ }
+ return newp;
+}
+
+int __xfrm_sk_clone_policy(struct sock *sk)
+{
+ struct xfrm_policy *p0 = sk->sk_policy[0],
+ *p1 = sk->sk_policy[1];
+
+ sk->sk_policy[0] = sk->sk_policy[1] = NULL;
+ if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
+ return -ENOMEM;
+ if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+static int
+xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
+ unsigned short family)
+{
+ int err;
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ err = afinfo->get_saddr(net, local, remote);
+ xfrm_policy_put_afinfo(afinfo);
+ return err;
+}
+
+/* Resolve list of templates for the flow, given policy. */
+
+static int
+xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
+ struct xfrm_state **xfrm, unsigned short family)
+{
+ struct net *net = xp_net(policy);
+ int nx;
+ int i, error;
+ xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
+ xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
+ xfrm_address_t tmp;
+
+ for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
+ struct xfrm_state *x;
+ xfrm_address_t *remote = daddr;
+ xfrm_address_t *local = saddr;
+ struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
+
+ if (tmpl->mode == XFRM_MODE_TUNNEL ||
+ tmpl->mode == XFRM_MODE_BEET) {
+ remote = &tmpl->id.daddr;
+ local = &tmpl->saddr;
+ if (xfrm_addr_any(local, tmpl->encap_family)) {
+ error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
+ if (error)
+ goto fail;
+ local = &tmp;
+ }
+ }
+
+ x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
+
+ if (x && x->km.state == XFRM_STATE_VALID) {
+ xfrm[nx++] = x;
+ daddr = remote;
+ saddr = local;
+ continue;
+ }
+ if (x) {
+ error = (x->km.state == XFRM_STATE_ERROR ?
+ -EINVAL : -EAGAIN);
+ xfrm_state_put(x);
+ } else if (error == -ESRCH) {
+ error = -EAGAIN;
+ }
+
+ if (!tmpl->optional)
+ goto fail;
+ }
+ return nx;
+
+fail:
+ for (nx--; nx >= 0; nx--)
+ xfrm_state_put(xfrm[nx]);
+ return error;
+}
+
+static int
+xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
+ struct xfrm_state **xfrm, unsigned short family)
+{
+ struct xfrm_state *tp[XFRM_MAX_DEPTH];
+ struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
+ int cnx = 0;
+ int error;
+ int ret;
+ int i;
+
+ for (i = 0; i < npols; i++) {
+ if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
+ error = -ENOBUFS;
+ goto fail;
+ }
+
+ ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
+ if (ret < 0) {
+ error = ret;
+ goto fail;
+ } else
+ cnx += ret;
+ }
+
+ /* found states are sorted for outbound processing */
+ if (npols > 1)
+ xfrm_state_sort(xfrm, tpp, cnx, family);
+
+ return cnx;
+
+ fail:
+ for (cnx--; cnx >= 0; cnx--)
+ xfrm_state_put(tpp[cnx]);
+ return error;
+
+}
+
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ */
+
+static inline int xfrm_get_tos(const struct flowi *fl, int family)
+{
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ int tos;
+
+ if (!afinfo)
+ return -EINVAL;
+
+ tos = afinfo->get_tos(fl);
+
+ xfrm_policy_put_afinfo(afinfo);
+
+ return tos;
+}
+
+static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
+{
+ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+ struct dst_entry *dst = &xdst->u.dst;
+
+ if (xdst->route == NULL) {
+ /* Dummy bundle - if it has xfrms we were not
+ * able to build bundle as template resolution failed.
+ * It means we need to try again resolving. */
+ if (xdst->num_xfrms > 0)
+ return NULL;
+ } else if (dst->flags & DST_XFRM_QUEUE) {
+ return NULL;
+ } else {
+ /* Real bundle */
+ if (stale_bundle(dst))
+ return NULL;
+ }
+
+ dst_hold(dst);
+ return flo;
+}
+
+static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
+{
+ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+ struct dst_entry *dst = &xdst->u.dst;
+
+ if (!xdst->route)
+ return 0;
+ if (stale_bundle(dst))
+ return 0;
+
+ return 1;
+}
+
+static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
+{
+ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+ struct dst_entry *dst = &xdst->u.dst;
+
+ dst_free(dst);
+}
+
+static const struct flow_cache_ops xfrm_bundle_fc_ops = {
+ .get = xfrm_bundle_flo_get,
+ .check = xfrm_bundle_flo_check,
+ .delete = xfrm_bundle_flo_delete,
+};
+
+static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
+{
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ struct dst_ops *dst_ops;
+ struct xfrm_dst *xdst;
+
+ if (!afinfo)
+ return ERR_PTR(-EINVAL);
+
+ switch (family) {
+ case AF_INET:
+ dst_ops = &net->xfrm.xfrm4_dst_ops;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ dst_ops = &net->xfrm.xfrm6_dst_ops;
+ break;
+#endif
+ default:
+ BUG();
+ }
+ xdst = dst_alloc(dst_ops, NULL, 0, DST_OBSOLETE_NONE, 0);
+
+ if (likely(xdst)) {
+ struct dst_entry *dst = &xdst->u.dst;
+
+ memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
+ xdst->flo.ops = &xfrm_bundle_fc_ops;
+ if (afinfo->init_dst)
+ afinfo->init_dst(net, xdst);
+ } else
+ xdst = ERR_PTR(-ENOBUFS);
+
+ xfrm_policy_put_afinfo(afinfo);
+
+ return xdst;
+}
+
+static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+ int nfheader_len)
+{
+ struct xfrm_policy_afinfo *afinfo =
+ xfrm_policy_get_afinfo(dst->ops->family);
+ int err;
+
+ if (!afinfo)
+ return -EINVAL;
+
+ err = afinfo->init_path(path, dst, nfheader_len);
+
+ xfrm_policy_put_afinfo(afinfo);
+
+ return err;
+}
+
+static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ const struct flowi *fl)
+{
+ struct xfrm_policy_afinfo *afinfo =
+ xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
+ int err;
+
+ if (!afinfo)
+ return -EINVAL;
+
+ err = afinfo->fill_dst(xdst, dev, fl);
+
+ xfrm_policy_put_afinfo(afinfo);
+
+ return err;
+}
+
+
+/* Allocate chain of dst_entry's, attach known xfrm's, calculate
+ * all the metrics... Shortly, bundle a bundle.
+ */
+
+static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
+ struct xfrm_state **xfrm, int nx,
+ const struct flowi *fl,
+ struct dst_entry *dst)
+{
+ struct net *net = xp_net(policy);
+ unsigned long now = jiffies;
+ struct net_device *dev;
+ struct xfrm_mode *inner_mode;
+ struct dst_entry *dst_prev = NULL;
+ struct dst_entry *dst0 = NULL;
+ int i = 0;
+ int err;
+ int header_len = 0;
+ int nfheader_len = 0;
+ int trailer_len = 0;
+ int tos;
+ int family = policy->selector.family;
+ xfrm_address_t saddr, daddr;
+
+ xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
+
+ tos = xfrm_get_tos(fl, family);
+ err = tos;
+ if (tos < 0)
+ goto put_states;
+
+ dst_hold(dst);
+
+ for (; i < nx; i++) {
+ struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
+ struct dst_entry *dst1 = &xdst->u.dst;
+
+ err = PTR_ERR(xdst);
+ if (IS_ERR(xdst)) {
+ dst_release(dst);
+ goto put_states;
+ }
+
+ if (xfrm[i]->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(xfrm[i],
+ xfrm_af2proto(family));
+ if (!inner_mode) {
+ err = -EAFNOSUPPORT;
+ dst_release(dst);
+ goto put_states;
+ }
+ } else
+ inner_mode = xfrm[i]->inner_mode;
+
+ if (!dst_prev)
+ dst0 = dst1;
+ else {
+ dst_prev->child = dst_clone(dst1);
+ dst1->flags |= DST_NOHASH;
+ }
+
+ xdst->route = dst;
+ dst_copy_metrics(dst1, dst);
+
+ if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+ family = xfrm[i]->props.family;
+ dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
+ family);
+ err = PTR_ERR(dst);
+ if (IS_ERR(dst))
+ goto put_states;
+ } else
+ dst_hold(dst);
+
+ dst1->xfrm = xfrm[i];
+ xdst->xfrm_genid = xfrm[i]->genid;
+
+ dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
+ dst1->flags |= DST_HOST;
+ dst1->lastuse = now;
+
+ dst1->input = dst_discard;
+ dst1->output = inner_mode->afinfo->output;
+
+ dst1->next = dst_prev;
+ dst_prev = dst1;
+
+ header_len += xfrm[i]->props.header_len;
+ if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
+ nfheader_len += xfrm[i]->props.header_len;
+ trailer_len += xfrm[i]->props.trailer_len;
+ }
+
+ dst_prev->child = dst;
+ dst0->path = dst;
+
+ err = -ENODEV;
+ dev = dst->dev;
+ if (!dev)
+ goto free_dst;
+
+ xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
+ xfrm_init_pmtu(dst_prev);
+
+ for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
+
+ err = xfrm_fill_dst(xdst, dev, fl);
+ if (err)
+ goto free_dst;
+
+ dst_prev->header_len = header_len;
+ dst_prev->trailer_len = trailer_len;
+ header_len -= xdst->u.dst.xfrm->props.header_len;
+ trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
+ }
+
+out:
+ return dst0;
+
+put_states:
+ for (; i < nx; i++)
+ xfrm_state_put(xfrm[i]);
+free_dst:
+ if (dst0)
+ dst_free(dst0);
+ dst0 = ERR_PTR(err);
+ goto out;
+}
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+static int xfrm_dst_alloc_copy(void **target, const void *src, int size)
+{
+ if (!*target) {
+ *target = kmalloc(size, GFP_ATOMIC);
+ if (!*target)
+ return -ENOMEM;
+ }
+
+ memcpy(*target, src, size);
+ return 0;
+}
+#endif
+
+static int xfrm_dst_update_parent(struct dst_entry *dst,
+ const struct xfrm_selector *sel)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ return xfrm_dst_alloc_copy((void **)&(xdst->partner),
+ sel, sizeof(*sel));
+#else
+ return 0;
+#endif
+}
+
+static int xfrm_dst_update_origin(struct dst_entry *dst,
+ const struct flowi *fl)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
+#else
+ return 0;
+#endif
+}
+
+static int xfrm_expand_policies(const struct flowi *fl, u16 family,
+ struct xfrm_policy **pols,
+ int *num_pols, int *num_xfrms)
+{
+ int i;
+
+ if (*num_pols == 0 || !pols[0]) {
+ *num_pols = 0;
+ *num_xfrms = 0;
+ return 0;
+ }
+ if (IS_ERR(pols[0]))
+ return PTR_ERR(pols[0]);
+
+ *num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+ if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+ pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+ pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+ XFRM_POLICY_TYPE_MAIN,
+ fl, family,
+ XFRM_POLICY_OUT);
+ if (pols[1]) {
+ if (IS_ERR(pols[1])) {
+ xfrm_pols_put(pols, *num_pols);
+ return PTR_ERR(pols[1]);
+ }
+ (*num_pols)++;
+ (*num_xfrms) += pols[1]->xfrm_nr;
+ }
+ }
+#endif
+ for (i = 0; i < *num_pols; i++) {
+ if (pols[i]->action != XFRM_POLICY_ALLOW) {
+ *num_xfrms = -1;
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static struct xfrm_dst *
+xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
+ const struct flowi *fl, u16 family,
+ struct dst_entry *dst_orig)
+{
+ struct net *net = xp_net(pols[0]);
+ struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+ struct dst_entry *dst;
+ struct xfrm_dst *xdst;
+ int err;
+
+ /* Try to instantiate a bundle */
+ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+ if (err <= 0) {
+ if (err != 0 && err != -EAGAIN)
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+ return ERR_PTR(err);
+ }
+
+ dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+ if (IS_ERR(dst)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+ return ERR_CAST(dst);
+ }
+
+ xdst = (struct xfrm_dst *)dst;
+ xdst->num_xfrms = err;
+ if (num_pols > 1)
+ err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+ else
+ err = xfrm_dst_update_origin(dst, fl);
+ if (unlikely(err)) {
+ dst_free(dst);
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+ return ERR_PTR(err);
+ }
+
+ xdst->num_pols = num_pols;
+ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
+ xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+ return xdst;
+}
+
+static void xfrm_policy_queue_process(unsigned long arg)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+ struct dst_entry *dst;
+ struct xfrm_policy *pol = (struct xfrm_policy *)arg;
+ struct xfrm_policy_queue *pq = &pol->polq;
+ struct flowi fl;
+ struct sk_buff_head list;
+
+ spin_lock(&pq->hold_queue.lock);
+ skb = skb_peek(&pq->hold_queue);
+ if (!skb) {
+ spin_unlock(&pq->hold_queue.lock);
+ goto out;
+ }
+ dst = skb_dst(skb);
+ sk = skb->sk;
+ xfrm_decode_session(skb, &fl, dst->ops->family);
+ spin_unlock(&pq->hold_queue.lock);
+
+ dst_hold(dst->path);
+ dst = xfrm_lookup(xp_net(pol), dst->path, &fl,
+ sk, 0);
+ if (IS_ERR(dst))
+ goto purge_queue;
+
+ if (dst->flags & DST_XFRM_QUEUE) {
+ dst_release(dst);
+
+ if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
+ goto purge_queue;
+
+ pq->timeout = pq->timeout << 1;
+ if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
+ xfrm_pol_hold(pol);
+ goto out;
+ }
+
+ dst_release(dst);
+
+ __skb_queue_head_init(&list);
+
+ spin_lock(&pq->hold_queue.lock);
+ pq->timeout = 0;
+ skb_queue_splice_init(&pq->hold_queue, &list);
+ spin_unlock(&pq->hold_queue.lock);
+
+ while (!skb_queue_empty(&list)) {
+ skb = __skb_dequeue(&list);
+
+ xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
+ dst_hold(skb_dst(skb)->path);
+ dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path,
+ &fl, skb->sk, 0);
+ if (IS_ERR(dst)) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ nf_reset(skb);
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ dst_output(skb);
+ }
+
+out:
+ xfrm_pol_put(pol);
+ return;
+
+purge_queue:
+ pq->timeout = 0;
+ xfrm_queue_purge(&pq->hold_queue);
+ xfrm_pol_put(pol);
+}
+
+static int xdst_queue_output(struct sock *sk, struct sk_buff *skb)
+{
+ unsigned long sched_next;
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_dst *xdst = (struct xfrm_dst *) dst;
+ struct xfrm_policy *pol = xdst->pols[0];
+ struct xfrm_policy_queue *pq = &pol->polq;
+
+ if (unlikely(skb_fclone_busy(sk, skb))) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
+ kfree_skb(skb);
+ return -EAGAIN;
+ }
+
+ skb_dst_force(skb);
+
+ spin_lock_bh(&pq->hold_queue.lock);
+
+ if (!pq->timeout)
+ pq->timeout = XFRM_QUEUE_TMO_MIN;
+
+ sched_next = jiffies + pq->timeout;
+
+ if (del_timer(&pq->hold_timer)) {
+ if (time_before(pq->hold_timer.expires, sched_next))
+ sched_next = pq->hold_timer.expires;
+ xfrm_pol_put(pol);
+ }
+
+ __skb_queue_tail(&pq->hold_queue, skb);
+ if (!mod_timer(&pq->hold_timer, sched_next))
+ xfrm_pol_hold(pol);
+
+ spin_unlock_bh(&pq->hold_queue.lock);
+
+ return 0;
+}
+
+static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
+ struct xfrm_flo *xflo,
+ const struct flowi *fl,
+ int num_xfrms,
+ u16 family)
+{
+ int err;
+ struct net_device *dev;
+ struct dst_entry *dst;
+ struct dst_entry *dst1;
+ struct xfrm_dst *xdst;
+
+ xdst = xfrm_alloc_dst(net, family);
+ if (IS_ERR(xdst))
+ return xdst;
+
+ if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
+ net->xfrm.sysctl_larval_drop ||
+ num_xfrms <= 0)
+ return xdst;
+
+ dst = xflo->dst_orig;
+ dst1 = &xdst->u.dst;
+ dst_hold(dst);
+ xdst->route = dst;
+
+ dst_copy_metrics(dst1, dst);
+
+ dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
+ dst1->flags |= DST_HOST | DST_XFRM_QUEUE;
+ dst1->lastuse = jiffies;
+
+ dst1->input = dst_discard;
+ dst1->output = xdst_queue_output;
+
+ dst_hold(dst);
+ dst1->child = dst;
+ dst1->path = dst;
+
+ xfrm_init_path((struct xfrm_dst *)dst1, dst, 0);
+
+ err = -ENODEV;
+ dev = dst->dev;
+ if (!dev)
+ goto free_dst;
+
+ err = xfrm_fill_dst(xdst, dev, fl);
+ if (err)
+ goto free_dst;
+
+out:
+ return xdst;
+
+free_dst:
+ dst_release(dst1);
+ xdst = ERR_PTR(err);
+ goto out;
+}
+
+static struct flow_cache_object *
+xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
+ struct flow_cache_object *oldflo, void *ctx)
+{
+ struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
+ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+ struct xfrm_dst *xdst, *new_xdst;
+ int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+ /* Check if the policies from old bundle are usable */
+ xdst = NULL;
+ if (oldflo) {
+ xdst = container_of(oldflo, struct xfrm_dst, flo);
+ num_pols = xdst->num_pols;
+ num_xfrms = xdst->num_xfrms;
+ pol_dead = 0;
+ for (i = 0; i < num_pols; i++) {
+ pols[i] = xdst->pols[i];
+ pol_dead |= pols[i]->walk.dead;
+ }
+ if (pol_dead) {
+ dst_free(&xdst->u.dst);
+ xdst = NULL;
+ num_pols = 0;
+ num_xfrms = 0;
+ oldflo = NULL;
+ }
+ }
+
+ /* Resolve policies to use if we couldn't get them from
+ * previous cache entry */
+ if (xdst == NULL) {
+ num_pols = 1;
+ pols[0] = __xfrm_policy_lookup(net, fl, family,
+ flow_to_policy_dir(dir));
+ err = xfrm_expand_policies(fl, family, pols,
+ &num_pols, &num_xfrms);
+ if (err < 0)
+ goto inc_error;
+ if (num_pols == 0)
+ return NULL;
+ if (num_xfrms <= 0)
+ goto make_dummy_bundle;
+ }
+
+ new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
+ xflo->dst_orig);
+ if (IS_ERR(new_xdst)) {
+ err = PTR_ERR(new_xdst);
+ if (err != -EAGAIN)
+ goto error;
+ if (oldflo == NULL)
+ goto make_dummy_bundle;
+ dst_hold(&xdst->u.dst);
+ return oldflo;
+ } else if (new_xdst == NULL) {
+ num_xfrms = 0;
+ if (oldflo == NULL)
+ goto make_dummy_bundle;
+ xdst->num_xfrms = 0;
+ dst_hold(&xdst->u.dst);
+ return oldflo;
+ }
+
+ /* Kill the previous bundle */
+ if (xdst) {
+ /* The policies were stolen for newly generated bundle */
+ xdst->num_pols = 0;
+ dst_free(&xdst->u.dst);
+ }
+
+ /* Flow cache does not have reference, it dst_free()'s,
+ * but we do need to return one reference for original caller */
+ dst_hold(&new_xdst->u.dst);
+ return &new_xdst->flo;
+
+make_dummy_bundle:
+ /* We found policies, but there's no bundles to instantiate:
+ * either because the policy blocks, has no transformations or
+ * we could not build template (no xfrm_states).*/
+ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
+ if (IS_ERR(xdst)) {
+ xfrm_pols_put(pols, num_pols);
+ return ERR_CAST(xdst);
+ }
+ xdst->num_pols = num_pols;
+ xdst->num_xfrms = num_xfrms;
+ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
+
+ dst_hold(&xdst->u.dst);
+ return &xdst->flo;
+
+inc_error:
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+ if (xdst != NULL)
+ dst_free(&xdst->u.dst);
+ else
+ xfrm_pols_put(pols, num_pols);
+ return ERR_PTR(err);
+}
+
+static struct dst_entry *make_blackhole(struct net *net, u16 family,
+ struct dst_entry *dst_orig)
+{
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ struct dst_entry *ret;
+
+ if (!afinfo) {
+ dst_release(dst_orig);
+ return ERR_PTR(-EINVAL);
+ } else {
+ ret = afinfo->blackhole_route(net, dst_orig);
+ }
+ xfrm_policy_put_afinfo(afinfo);
+
+ return ret;
+}
+
+/* Main function: finds/creates a bundle for given flow.
+ *
+ * At the moment we eat a raw IP route. Mostly to speed up lookups
+ * on interfaces with disabled IPsec.
+ */
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+ const struct flowi *fl,
+ struct sock *sk, int flags)
+{
+ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+ struct flow_cache_object *flo;
+ struct xfrm_dst *xdst;
+ struct dst_entry *dst, *route;
+ u16 family = dst_orig->ops->family;
+ u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+ int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
+
+ dst = NULL;
+ xdst = NULL;
+ route = NULL;
+
+ if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
+ num_pols = 1;
+ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+ err = xfrm_expand_policies(fl, family, pols,
+ &num_pols, &num_xfrms);
+ if (err < 0)
+ goto dropdst;
+
+ if (num_pols) {
+ if (num_xfrms <= 0) {
+ drop_pols = num_pols;
+ goto no_transform;
+ }
+
+ xdst = xfrm_resolve_and_create_bundle(
+ pols, num_pols, fl,
+ family, dst_orig);
+ if (IS_ERR(xdst)) {
+ xfrm_pols_put(pols, num_pols);
+ err = PTR_ERR(xdst);
+ goto dropdst;
+ } else if (xdst == NULL) {
+ num_xfrms = 0;
+ drop_pols = num_pols;
+ goto no_transform;
+ }
+
+ dst_hold(&xdst->u.dst);
+ xdst->u.dst.flags |= DST_NOCACHE;
+ route = xdst->route;
+ }
+ }
+
+ if (xdst == NULL) {
+ struct xfrm_flo xflo;
+
+ xflo.dst_orig = dst_orig;
+ xflo.flags = flags;
+
+ /* To accelerate a bit... */
+ if ((dst_orig->flags & DST_NOXFRM) ||
+ !net->xfrm.policy_count[XFRM_POLICY_OUT])
+ goto nopol;
+
+ flo = flow_cache_lookup(net, fl, family, dir,
+ xfrm_bundle_lookup, &xflo);
+ if (flo == NULL)
+ goto nopol;
+ if (IS_ERR(flo)) {
+ err = PTR_ERR(flo);
+ goto dropdst;
+ }
+ xdst = container_of(flo, struct xfrm_dst, flo);
+
+ num_pols = xdst->num_pols;
+ num_xfrms = xdst->num_xfrms;
+ memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
+ route = xdst->route;
+ }
+
+ dst = &xdst->u.dst;
+ if (route == NULL && num_xfrms > 0) {
+ /* The only case when xfrm_bundle_lookup() returns a
+ * bundle with null route, is when the template could
+ * not be resolved. It means policies are there, but
+ * bundle could not be created, since we don't yet
+ * have the xfrm_state's. We need to wait for KM to
+ * negotiate new SA's or bail out with error.*/
+ if (net->xfrm.sysctl_larval_drop) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+ err = -EREMOTE;
+ goto error;
+ }
+
+ err = -EAGAIN;
+
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+ goto error;
+ }
+
+no_transform:
+ if (num_pols == 0)
+ goto nopol;
+
+ if ((flags & XFRM_LOOKUP_ICMP) &&
+ !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+ err = -ENOENT;
+ goto error;
+ }
+
+ for (i = 0; i < num_pols; i++)
+ pols[i]->curlft.use_time = get_seconds();
+
+ if (num_xfrms < 0) {
+ /* Prohibit the flow */
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
+ err = -EPERM;
+ goto error;
+ } else if (num_xfrms > 0) {
+ /* Flow transformed */
+ dst_release(dst_orig);
+ } else {
+ /* Flow passes untransformed */
+ dst_release(dst);
+ dst = dst_orig;
+ }
+ok:
+ xfrm_pols_put(pols, drop_pols);
+ if (dst && dst->xfrm &&
+ dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
+ dst->flags |= DST_XFRM_TUNNEL;
+ return dst;
+
+nopol:
+ if (!(flags & XFRM_LOOKUP_ICMP)) {
+ dst = dst_orig;
+ goto ok;
+ }
+ err = -ENOENT;
+error:
+ dst_release(dst);
+dropdst:
+ if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
+ dst_release(dst_orig);
+ xfrm_pols_put(pols, drop_pols);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xfrm_lookup);
+
+/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
+ * Otherwise we may send out blackholed packets.
+ */
+struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
+ const struct flowi *fl,
+ struct sock *sk, int flags)
+{
+ struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk,
+ flags | XFRM_LOOKUP_QUEUE |
+ XFRM_LOOKUP_KEEP_DST_REF);
+
+ if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
+ return make_blackhole(net, dst_orig->ops->family, dst_orig);
+
+ return dst;
+}
+EXPORT_SYMBOL(xfrm_lookup_route);
+
+static inline int
+xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
+{
+ struct xfrm_state *x;
+
+ if (!skb->sp || idx < 0 || idx >= skb->sp->len)
+ return 0;
+ x = skb->sp->xvec[idx];
+ if (!x->type->reject)
+ return 0;
+ return x->type->reject(x, skb, fl);
+}
+
+/* When skb is transformed back to its "native" form, we have to
+ * check policy restrictions. At the moment we make this in maximally
+ * stupid way. Shame on me. :-) Of course, connected sockets must
+ * have policy cached at them.
+ */
+
+static inline int
+xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
+ unsigned short family)
+{
+ if (xfrm_state_kern(x))
+ return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
+ return x->id.proto == tmpl->id.proto &&
+ (x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
+ (x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
+ x->props.mode == tmpl->mode &&
+ (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
+ !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
+ !(x->props.mode != XFRM_MODE_TRANSPORT &&
+ xfrm_state_addr_cmp(tmpl, x, family));
+}
+
+/*
+ * 0 or more than 0 is returned when validation is succeeded (either bypass
+ * because of optional transport mode, or next index of the mathced secpath
+ * state with the template.
+ * -1 is returned when no matching template is found.
+ * Otherwise "-2 - errored_index" is returned.
+ */
+static inline int
+xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
+ unsigned short family)
+{
+ int idx = start;
+
+ if (tmpl->optional) {
+ if (tmpl->mode == XFRM_MODE_TRANSPORT)
+ return start;
+ } else
+ start = -1;
+ for (; idx < sp->len; idx++) {
+ if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
+ return ++idx;
+ if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
+ if (start == -1)
+ start = -2-idx;
+ break;
+ }
+ }
+ return start;
+}
+
+int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
+ unsigned int family, int reverse)
+{
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+ int err;
+
+ if (unlikely(afinfo == NULL))
+ return -EAFNOSUPPORT;
+
+ afinfo->decode_session(skb, fl, reverse);
+ err = security_xfrm_decode_session(skb, &fl->flowi_secid);
+ xfrm_policy_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(__xfrm_decode_session);
+
+static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
+{
+ for (; k < sp->len; k++) {
+ if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
+ *idxp = k;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
+ unsigned short family)
+{
+ struct net *net = dev_net(skb->dev);
+ struct xfrm_policy *pol;
+ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+ int npols = 0;
+ int xfrm_nr;
+ int pi;
+ int reverse;
+ struct flowi fl;
+ u8 fl_dir;
+ int xerr_idx = -1;
+
+ reverse = dir & ~XFRM_POLICY_MASK;
+ dir &= XFRM_POLICY_MASK;
+ fl_dir = policy_to_flow_dir(dir);
+
+ if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+ return 0;
+ }
+
+ nf_nat_decode_session(skb, &fl, family);
+
+ /* First, check used SA against their selectors. */
+ if (skb->sp) {
+ int i;
+
+ for (i = skb->sp->len-1; i >= 0; i--) {
+ struct xfrm_state *x = skb->sp->xvec[i];
+ if (!xfrm_selector_match(&x->sel, &fl, family)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
+ return 0;
+ }
+ }
+ }
+
+ pol = NULL;
+ if (sk && sk->sk_policy[dir]) {
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+ if (IS_ERR(pol)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+ return 0;
+ }
+ }
+
+ if (!pol) {
+ struct flow_cache_object *flo;
+
+ flo = flow_cache_lookup(net, &fl, family, fl_dir,
+ xfrm_policy_lookup, NULL);
+ if (IS_ERR_OR_NULL(flo))
+ pol = ERR_CAST(flo);
+ else
+ pol = container_of(flo, struct xfrm_policy, flo);
+ }
+
+ if (IS_ERR(pol)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+ return 0;
+ }
+
+ if (!pol) {
+ if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
+ xfrm_secpath_reject(xerr_idx, skb, &fl);
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
+ return 0;
+ }
+ return 1;
+ }
+
+ pol->curlft.use_time = get_seconds();
+
+ pols[0] = pol;
+ npols++;
+#ifdef CONFIG_XFRM_SUB_POLICY
+ if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+ pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
+ &fl, family,
+ XFRM_POLICY_IN);
+ if (pols[1]) {
+ if (IS_ERR(pols[1])) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+ return 0;
+ }
+ pols[1]->curlft.use_time = get_seconds();
+ npols++;
+ }
+ }
+#endif
+
+ if (pol->action == XFRM_POLICY_ALLOW) {
+ struct sec_path *sp;
+ static struct sec_path dummy;
+ struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
+ struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
+ struct xfrm_tmpl **tpp = tp;
+ int ti = 0;
+ int i, k;
+
+ if ((sp = skb->sp) == NULL)
+ sp = &dummy;
+
+ for (pi = 0; pi < npols; pi++) {
+ if (pols[pi] != pol &&
+ pols[pi]->action != XFRM_POLICY_ALLOW) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
+ goto reject;
+ }
+ if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto reject_error;
+ }
+ for (i = 0; i < pols[pi]->xfrm_nr; i++)
+ tpp[ti++] = &pols[pi]->xfrm_vec[i];
+ }
+ xfrm_nr = ti;
+ if (npols > 1) {
+ xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net);
+ tpp = stp;
+ }
+
+ /* For each tunnel xfrm, find the first matching tmpl.
+ * For each tmpl before that, find corresponding xfrm.
+ * Order is _important_. Later we will implement
+ * some barriers, but at the moment barriers
+ * are implied between each two transformations.
+ */
+ for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
+ k = xfrm_policy_ok(tpp[i], sp, k, family);
+ if (k < 0) {
+ if (k < -1)
+ /* "-2 - errored_index" returned */
+ xerr_idx = -(2+k);
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
+ goto reject;
+ }
+ }
+
+ if (secpath_has_nontransport(sp, k, &xerr_idx)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
+ goto reject;
+ }
+
+ xfrm_pols_put(pols, npols);
+ return 1;
+ }
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
+
+reject:
+ xfrm_secpath_reject(xerr_idx, skb, &fl);
+reject_error:
+ xfrm_pols_put(pols, npols);
+ return 0;
+}
+EXPORT_SYMBOL(__xfrm_policy_check);
+
+int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
+{
+ struct net *net = dev_net(skb->dev);
+ struct flowi fl;
+ struct dst_entry *dst;
+ int res = 1;
+
+ if (xfrm_decode_session(skb, &fl, family) < 0) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
+ return 0;
+ }
+
+ skb_dst_force(skb);
+
+ dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
+ if (IS_ERR(dst)) {
+ res = 0;
+ dst = NULL;
+ }
+ skb_dst_set(skb, dst);
+ return res;
+}
+EXPORT_SYMBOL(__xfrm_route_forward);
+
+/* Optimize later using cookies and generation ids. */
+
+static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
+ * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to
+ * get validated by dst_ops->check on every use. We do this
+ * because when a normal route referenced by an XFRM dst is
+ * obsoleted we do not go looking around for all parent
+ * referencing XFRM dsts so that we can invalidate them. It
+ * is just too much work. Instead we make the checks here on
+ * every use. For example:
+ *
+ * XFRM dst A --> IPv4 dst X
+ *
+ * X is the "xdst->route" of A (X is also the "dst->path" of A
+ * in this example). If X is marked obsolete, "A" will not
+ * notice. That's what we are validating here via the
+ * stale_bundle() check.
+ *
+ * When a policy's bundle is pruned, we dst_free() the XFRM
+ * dst which causes it's ->obsolete field to be set to
+ * DST_OBSOLETE_DEAD. If an XFRM dst has been pruned like
+ * this, we want to force a new route lookup.
+ */
+ if (dst->obsolete < 0 && !stale_bundle(dst))
+ return dst;
+
+ return NULL;
+}
+
+static int stale_bundle(struct dst_entry *dst)
+{
+ return !xfrm_bundle_ok((struct xfrm_dst *)dst);
+}
+
+void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
+{
+ while ((dst = dst->child) && dst->xfrm && dst->dev == dev) {
+ dst->dev = dev_net(dev)->loopback_dev;
+ dev_hold(dst->dev);
+ dev_put(dev);
+ }
+}
+EXPORT_SYMBOL(xfrm_dst_ifdown);
+
+static void xfrm_link_failure(struct sk_buff *skb)
+{
+ /* Impossible. Such dst must be popped before reaches point of failure. */
+}
+
+static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
+{
+ if (dst) {
+ if (dst->obsolete) {
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ return dst;
+}
+
+void xfrm_garbage_collect(struct net *net)
+{
+ flow_cache_flush(net);
+}
+EXPORT_SYMBOL(xfrm_garbage_collect);
+
+static void xfrm_garbage_collect_deferred(struct net *net)
+{
+ flow_cache_flush_deferred(net);
+}
+
+static void xfrm_init_pmtu(struct dst_entry *dst)
+{
+ do {
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ u32 pmtu, route_mtu_cached;
+
+ pmtu = dst_mtu(dst->child);
+ xdst->child_mtu_cached = pmtu;
+
+ pmtu = xfrm_state_mtu(dst->xfrm, pmtu);
+
+ route_mtu_cached = dst_mtu(xdst->route);
+ xdst->route_mtu_cached = route_mtu_cached;
+
+ if (pmtu > route_mtu_cached)
+ pmtu = route_mtu_cached;
+
+ dst_metric_set(dst, RTAX_MTU, pmtu);
+ } while ((dst = dst->next));
+}
+
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ */
+
+static int xfrm_bundle_ok(struct xfrm_dst *first)
+{
+ struct dst_entry *dst = &first->u.dst;
+ struct xfrm_dst *last;
+ u32 mtu;
+
+ if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
+ (dst->dev && !netif_running(dst->dev)))
+ return 0;
+
+ if (dst->flags & DST_XFRM_QUEUE)
+ return 1;
+
+ last = NULL;
+
+ do {
+ struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+ if (dst->xfrm->km.state != XFRM_STATE_VALID)
+ return 0;
+ if (xdst->xfrm_genid != dst->xfrm->genid)
+ return 0;
+ if (xdst->num_pols > 0 &&
+ xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
+ return 0;
+
+ mtu = dst_mtu(dst->child);
+ if (xdst->child_mtu_cached != mtu) {
+ last = xdst;
+ xdst->child_mtu_cached = mtu;
+ }
+
+ if (!dst_check(xdst->route, xdst->route_cookie))
+ return 0;
+ mtu = dst_mtu(xdst->route);
+ if (xdst->route_mtu_cached != mtu) {
+ last = xdst;
+ xdst->route_mtu_cached = mtu;
+ }
+
+ dst = dst->child;
+ } while (dst->xfrm);
+
+ if (likely(!last))
+ return 1;
+
+ mtu = last->child_mtu_cached;
+ for (;;) {
+ dst = &last->u.dst;
+
+ mtu = xfrm_state_mtu(dst->xfrm, mtu);
+ if (mtu > last->route_mtu_cached)
+ mtu = last->route_mtu_cached;
+ dst_metric_set(dst, RTAX_MTU, mtu);
+
+ if (last == first)
+ break;
+
+ last = (struct xfrm_dst *)last->u.dst.next;
+ last->child_mtu_cached = mtu;
+ }
+
+ return 1;
+}
+
+static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
+{
+ return dst_metric_advmss(dst->path);
+}
+
+static unsigned int xfrm_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst_mtu(dst->path);
+}
+
+static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return dst->path->ops->neigh_lookup(dst, skb, daddr);
+}
+
+int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+ struct net *net;
+ int err = 0;
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock(&xfrm_policy_afinfo_lock);
+ if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
+ err = -ENOBUFS;
+ else {
+ struct dst_ops *dst_ops = afinfo->dst_ops;
+ if (likely(dst_ops->kmem_cachep == NULL))
+ dst_ops->kmem_cachep = xfrm_dst_cache;
+ if (likely(dst_ops->check == NULL))
+ dst_ops->check = xfrm_dst_check;
+ if (likely(dst_ops->default_advmss == NULL))
+ dst_ops->default_advmss = xfrm_default_advmss;
+ if (likely(dst_ops->mtu == NULL))
+ dst_ops->mtu = xfrm_mtu;
+ if (likely(dst_ops->negative_advice == NULL))
+ dst_ops->negative_advice = xfrm_negative_advice;
+ if (likely(dst_ops->link_failure == NULL))
+ dst_ops->link_failure = xfrm_link_failure;
+ if (likely(dst_ops->neigh_lookup == NULL))
+ dst_ops->neigh_lookup = xfrm_neigh_lookup;
+ if (likely(afinfo->garbage_collect == NULL))
+ afinfo->garbage_collect = xfrm_garbage_collect_deferred;
+ rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
+ }
+ spin_unlock(&xfrm_policy_afinfo_lock);
+
+ rtnl_lock();
+ for_each_net(net) {
+ struct dst_ops *xfrm_dst_ops;
+
+ switch (afinfo->family) {
+ case AF_INET:
+ xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
+ break;
+#endif
+ default:
+ BUG();
+ }
+ *xfrm_dst_ops = *afinfo->dst_ops;
+ }
+ rtnl_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_policy_register_afinfo);
+
+int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+ int err = 0;
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock(&xfrm_policy_afinfo_lock);
+ if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) {
+ if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo))
+ err = -EINVAL;
+ else
+ RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family],
+ NULL);
+ }
+ spin_unlock(&xfrm_policy_afinfo_lock);
+ if (!err) {
+ struct dst_ops *dst_ops = afinfo->dst_ops;
+
+ synchronize_rcu();
+
+ dst_ops->kmem_cachep = NULL;
+ dst_ops->check = NULL;
+ dst_ops->negative_advice = NULL;
+ dst_ops->link_failure = NULL;
+ afinfo->garbage_collect = NULL;
+ }
+ return err;
+}
+EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
+
+static void __net_init xfrm_dst_ops_init(struct net *net)
+{
+ struct xfrm_policy_afinfo *afinfo;
+
+ rcu_read_lock();
+ afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]);
+ if (afinfo)
+ net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops;
+#if IS_ENABLED(CONFIG_IPV6)
+ afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]);
+ if (afinfo)
+ net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops;
+#endif
+ rcu_read_unlock();
+}
+
+static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_DOWN:
+ xfrm_garbage_collect(dev_net(dev));
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block xfrm_dev_notifier = {
+ .notifier_call = xfrm_dev_event,
+};
+
+#ifdef CONFIG_XFRM_STATISTICS
+static int __net_init xfrm_statistics_init(struct net *net)
+{
+ int rv;
+ net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
+ if (!net->mib.xfrm_statistics)
+ return -ENOMEM;
+ rv = xfrm_proc_init(net);
+ if (rv < 0)
+ free_percpu(net->mib.xfrm_statistics);
+ return rv;
+}
+
+static void xfrm_statistics_fini(struct net *net)
+{
+ xfrm_proc_fini(net);
+ free_percpu(net->mib.xfrm_statistics);
+}
+#else
+static int __net_init xfrm_statistics_init(struct net *net)
+{
+ return 0;
+}
+
+static void xfrm_statistics_fini(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm_policy_init(struct net *net)
+{
+ unsigned int hmask, sz;
+ int dir;
+
+ if (net_eq(net, &init_net))
+ xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
+ sizeof(struct xfrm_dst),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+
+ hmask = 8 - 1;
+ sz = (hmask+1) * sizeof(struct hlist_head);
+
+ net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
+ if (!net->xfrm.policy_byidx)
+ goto out_byidx;
+ net->xfrm.policy_idx_hmask = hmask;
+
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ struct xfrm_policy_hash *htab;
+
+ net->xfrm.policy_count[dir] = 0;
+ net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
+ INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+
+ htab = &net->xfrm.policy_bydst[dir];
+ htab->table = xfrm_hash_alloc(sz);
+ if (!htab->table)
+ goto out_bydst;
+ htab->hmask = hmask;
+ htab->dbits4 = 32;
+ htab->sbits4 = 32;
+ htab->dbits6 = 128;
+ htab->sbits6 = 128;
+ }
+ net->xfrm.policy_hthresh.lbits4 = 32;
+ net->xfrm.policy_hthresh.rbits4 = 32;
+ net->xfrm.policy_hthresh.lbits6 = 128;
+ net->xfrm.policy_hthresh.rbits6 = 128;
+
+ seqlock_init(&net->xfrm.policy_hthresh.lock);
+
+ INIT_LIST_HEAD(&net->xfrm.policy_all);
+ INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
+ INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
+ if (net_eq(net, &init_net))
+ register_netdevice_notifier(&xfrm_dev_notifier);
+ return 0;
+
+out_bydst:
+ for (dir--; dir >= 0; dir--) {
+ struct xfrm_policy_hash *htab;
+
+ htab = &net->xfrm.policy_bydst[dir];
+ xfrm_hash_free(htab->table, sz);
+ }
+ xfrm_hash_free(net->xfrm.policy_byidx, sz);
+out_byidx:
+ return -ENOMEM;
+}
+
+static void xfrm_policy_fini(struct net *net)
+{
+ unsigned int sz;
+ int dir;
+
+ flush_work(&net->xfrm.policy_hash_work);
+#ifdef CONFIG_XFRM_SUB_POLICY
+ xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
+#endif
+ xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);
+
+ WARN_ON(!list_empty(&net->xfrm.policy_all));
+
+ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+ struct xfrm_policy_hash *htab;
+
+ WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));
+
+ htab = &net->xfrm.policy_bydst[dir];
+ sz = (htab->hmask + 1) * sizeof(struct hlist_head);
+ WARN_ON(!hlist_empty(htab->table));
+ xfrm_hash_free(htab->table, sz);
+ }
+
+ sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
+ WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
+ xfrm_hash_free(net->xfrm.policy_byidx, sz);
+}
+
+static int __net_init xfrm_net_init(struct net *net)
+{
+ int rv;
+
+ rv = xfrm_statistics_init(net);
+ if (rv < 0)
+ goto out_statistics;
+ rv = xfrm_state_init(net);
+ if (rv < 0)
+ goto out_state;
+ rv = xfrm_policy_init(net);
+ if (rv < 0)
+ goto out_policy;
+ xfrm_dst_ops_init(net);
+ rv = xfrm_sysctl_init(net);
+ if (rv < 0)
+ goto out_sysctl;
+ rv = flow_cache_init(net);
+ if (rv < 0)
+ goto out;
+
+ /* Initialize the per-net locks here */
+ spin_lock_init(&net->xfrm.xfrm_state_lock);
+ rwlock_init(&net->xfrm.xfrm_policy_lock);
+ mutex_init(&net->xfrm.xfrm_cfg_mutex);
+
+ return 0;
+
+out:
+ xfrm_sysctl_fini(net);
+out_sysctl:
+ xfrm_policy_fini(net);
+out_policy:
+ xfrm_state_fini(net);
+out_state:
+ xfrm_statistics_fini(net);
+out_statistics:
+ return rv;
+}
+
+static void __net_exit xfrm_net_exit(struct net *net)
+{
+ flow_cache_fini(net);
+ xfrm_sysctl_fini(net);
+ xfrm_policy_fini(net);
+ xfrm_state_fini(net);
+ xfrm_statistics_fini(net);
+}
+
+static struct pernet_operations __net_initdata xfrm_net_ops = {
+ .init = xfrm_net_init,
+ .exit = xfrm_net_exit,
+};
+
+void __init xfrm_init(void)
+{
+ register_pernet_subsys(&xfrm_net_ops);
+ xfrm_input_init();
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
+ struct audit_buffer *audit_buf)
+{
+ struct xfrm_sec_ctx *ctx = xp->security;
+ struct xfrm_selector *sel = &xp->selector;
+
+ if (ctx)
+ audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
+ ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
+
+ switch (sel->family) {
+ case AF_INET:
+ audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
+ if (sel->prefixlen_s != 32)
+ audit_log_format(audit_buf, " src_prefixlen=%d",
+ sel->prefixlen_s);
+ audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
+ if (sel->prefixlen_d != 32)
+ audit_log_format(audit_buf, " dst_prefixlen=%d",
+ sel->prefixlen_d);
+ break;
+ case AF_INET6:
+ audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
+ if (sel->prefixlen_s != 128)
+ audit_log_format(audit_buf, " src_prefixlen=%d",
+ sel->prefixlen_s);
+ audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
+ if (sel->prefixlen_d != 128)
+ audit_log_format(audit_buf, " dst_prefixlen=%d",
+ sel->prefixlen_d);
+ break;
+ }
+}
+
+void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
+{
+ struct audit_buffer *audit_buf;
+
+ audit_buf = xfrm_audit_start("SPD-add");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_usrinfo(task_valid, audit_buf);
+ audit_log_format(audit_buf, " res=%u", result);
+ xfrm_audit_common_policyinfo(xp, audit_buf);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
+
+void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
+ bool task_valid)
+{
+ struct audit_buffer *audit_buf;
+
+ audit_buf = xfrm_audit_start("SPD-delete");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_usrinfo(task_valid, audit_buf);
+ audit_log_format(audit_buf, " res=%u", result);
+ xfrm_audit_common_policyinfo(xp, audit_buf);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
+#endif
+
+#ifdef CONFIG_XFRM_MIGRATE
+static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
+ const struct xfrm_selector *sel_tgt)
+{
+ if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
+ if (sel_tgt->family == sel_cmp->family &&
+ xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
+ sel_cmp->family) &&
+ xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
+ sel_cmp->family) &&
+ sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
+ sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
+ return true;
+ }
+ } else {
+ if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
+ u8 dir, u8 type, struct net *net)
+{
+ struct xfrm_policy *pol, *ret = NULL;
+ struct hlist_head *chain;
+ u32 priority = ~0U;
+
+ read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
+ chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (xfrm_migrate_selector_match(sel, &pol->selector) &&
+ pol->type == type) {
+ ret = pol;
+ priority = ret->priority;
+ break;
+ }
+ }
+ chain = &net->xfrm.policy_inexact[dir];
+ hlist_for_each_entry(pol, chain, bydst) {
+ if (xfrm_migrate_selector_match(sel, &pol->selector) &&
+ pol->type == type &&
+ pol->priority < priority) {
+ ret = pol;
+ break;
+ }
+ }
+
+ if (ret)
+ xfrm_pol_hold(ret);
+
+ read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+ return ret;
+}
+
+static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
+{
+ int match = 0;
+
+ if (t->mode == m->mode && t->id.proto == m->proto &&
+ (m->reqid == 0 || t->reqid == m->reqid)) {
+ switch (t->mode) {
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_BEET:
+ if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
+ m->old_family) &&
+ xfrm_addr_equal(&t->saddr, &m->old_saddr,
+ m->old_family)) {
+ match = 1;
+ }
+ break;
+ case XFRM_MODE_TRANSPORT:
+ /* in case of transport mode, template does not store
+ any IP addresses, hence we just compare mode and
+ protocol */
+ match = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ return match;
+}
+
+/* update endpoint address(es) of template(s) */
+static int xfrm_policy_migrate(struct xfrm_policy *pol,
+ struct xfrm_migrate *m, int num_migrate)
+{
+ struct xfrm_migrate *mp;
+ int i, j, n = 0;
+
+ write_lock_bh(&pol->lock);
+ if (unlikely(pol->walk.dead)) {
+ /* target policy has been deleted */
+ write_unlock_bh(&pol->lock);
+ return -ENOENT;
+ }
+
+ for (i = 0; i < pol->xfrm_nr; i++) {
+ for (j = 0, mp = m; j < num_migrate; j++, mp++) {
+ if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
+ continue;
+ n++;
+ if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
+ pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
+ continue;
+ /* update endpoints */
+ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
+ sizeof(pol->xfrm_vec[i].id.daddr));
+ memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
+ sizeof(pol->xfrm_vec[i].saddr));
+ pol->xfrm_vec[i].encap_family = mp->new_family;
+ /* flush bundles */
+ atomic_inc(&pol->genid);
+ }
+ }
+
+ write_unlock_bh(&pol->lock);
+
+ if (!n)
+ return -ENODATA;
+
+ return 0;
+}
+
+static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
+{
+ int i, j;
+
+ if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
+ return -EINVAL;
+
+ for (i = 0; i < num_migrate; i++) {
+ if (xfrm_addr_equal(&m[i].old_daddr, &m[i].new_daddr,
+ m[i].old_family) &&
+ xfrm_addr_equal(&m[i].old_saddr, &m[i].new_saddr,
+ m[i].old_family))
+ return -EINVAL;
+ if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
+ xfrm_addr_any(&m[i].new_saddr, m[i].new_family))
+ return -EINVAL;
+
+ /* check if there is any duplicated entry */
+ for (j = i + 1; j < num_migrate; j++) {
+ if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
+ sizeof(m[i].old_daddr)) &&
+ !memcmp(&m[i].old_saddr, &m[j].old_saddr,
+ sizeof(m[i].old_saddr)) &&
+ m[i].proto == m[j].proto &&
+ m[i].mode == m[j].mode &&
+ m[i].reqid == m[j].reqid &&
+ m[i].old_family == m[j].old_family)
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ struct xfrm_migrate *m, int num_migrate,
+ struct xfrm_kmaddress *k, struct net *net)
+{
+ int i, err, nx_cur = 0, nx_new = 0;
+ struct xfrm_policy *pol = NULL;
+ struct xfrm_state *x, *xc;
+ struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
+ struct xfrm_state *x_new[XFRM_MAX_DEPTH];
+ struct xfrm_migrate *mp;
+
+ if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
+ goto out;
+
+ /* Stage 1 - find policy */
+ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /* Stage 2 - find and update state(s) */
+ for (i = 0, mp = m; i < num_migrate; i++, mp++) {
+ if ((x = xfrm_migrate_state_find(mp, net))) {
+ x_cur[nx_cur] = x;
+ nx_cur++;
+ if ((xc = xfrm_state_migrate(x, mp))) {
+ x_new[nx_new] = xc;
+ nx_new++;
+ } else {
+ err = -ENODATA;
+ goto restore_state;
+ }
+ }
+ }
+
+ /* Stage 3 - update policy */
+ if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
+ goto restore_state;
+
+ /* Stage 4 - delete old state(s) */
+ if (nx_cur) {
+ xfrm_states_put(x_cur, nx_cur);
+ xfrm_states_delete(x_cur, nx_cur);
+ }
+
+ /* Stage 5 - announce */
+ km_migrate(sel, dir, type, m, num_migrate, k);
+
+ xfrm_pol_put(pol);
+
+ return 0;
+out:
+ return err;
+
+restore_state:
+ if (pol)
+ xfrm_pol_put(pol);
+ if (nx_cur)
+ xfrm_states_put(x_cur, nx_cur);
+ if (nx_new)
+ xfrm_states_delete(x_new, nx_new);
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_migrate);
+#endif
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
new file mode 100644
index 000000000..9c4fbd893
--- /dev/null
+++ b/net/xfrm/xfrm_proc.c
@@ -0,0 +1,86 @@
+/*
+ * xfrm_proc.c
+ *
+ * Copyright (C)2006-2007 USAGI/WIDE Project
+ *
+ * Authors: Masahide NAKAMURA <nakam@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+#include <net/snmp.h>
+#include <net/xfrm.h>
+
+static const struct snmp_mib xfrm_mib_list[] = {
+ SNMP_MIB_ITEM("XfrmInError", LINUX_MIB_XFRMINERROR),
+ SNMP_MIB_ITEM("XfrmInBufferError", LINUX_MIB_XFRMINBUFFERERROR),
+ SNMP_MIB_ITEM("XfrmInHdrError", LINUX_MIB_XFRMINHDRERROR),
+ SNMP_MIB_ITEM("XfrmInNoStates", LINUX_MIB_XFRMINNOSTATES),
+ SNMP_MIB_ITEM("XfrmInStateProtoError", LINUX_MIB_XFRMINSTATEPROTOERROR),
+ SNMP_MIB_ITEM("XfrmInStateModeError", LINUX_MIB_XFRMINSTATEMODEERROR),
+ SNMP_MIB_ITEM("XfrmInStateSeqError", LINUX_MIB_XFRMINSTATESEQERROR),
+ SNMP_MIB_ITEM("XfrmInStateExpired", LINUX_MIB_XFRMINSTATEEXPIRED),
+ SNMP_MIB_ITEM("XfrmInStateMismatch", LINUX_MIB_XFRMINSTATEMISMATCH),
+ SNMP_MIB_ITEM("XfrmInStateInvalid", LINUX_MIB_XFRMINSTATEINVALID),
+ SNMP_MIB_ITEM("XfrmInTmplMismatch", LINUX_MIB_XFRMINTMPLMISMATCH),
+ SNMP_MIB_ITEM("XfrmInNoPols", LINUX_MIB_XFRMINNOPOLS),
+ SNMP_MIB_ITEM("XfrmInPolBlock", LINUX_MIB_XFRMINPOLBLOCK),
+ SNMP_MIB_ITEM("XfrmInPolError", LINUX_MIB_XFRMINPOLERROR),
+ SNMP_MIB_ITEM("XfrmOutError", LINUX_MIB_XFRMOUTERROR),
+ SNMP_MIB_ITEM("XfrmOutBundleGenError", LINUX_MIB_XFRMOUTBUNDLEGENERROR),
+ SNMP_MIB_ITEM("XfrmOutBundleCheckError", LINUX_MIB_XFRMOUTBUNDLECHECKERROR),
+ SNMP_MIB_ITEM("XfrmOutNoStates", LINUX_MIB_XFRMOUTNOSTATES),
+ SNMP_MIB_ITEM("XfrmOutStateProtoError", LINUX_MIB_XFRMOUTSTATEPROTOERROR),
+ SNMP_MIB_ITEM("XfrmOutStateModeError", LINUX_MIB_XFRMOUTSTATEMODEERROR),
+ SNMP_MIB_ITEM("XfrmOutStateSeqError", LINUX_MIB_XFRMOUTSTATESEQERROR),
+ SNMP_MIB_ITEM("XfrmOutStateExpired", LINUX_MIB_XFRMOUTSTATEEXPIRED),
+ SNMP_MIB_ITEM("XfrmOutPolBlock", LINUX_MIB_XFRMOUTPOLBLOCK),
+ SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD),
+ SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR),
+ SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR),
+ SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID),
+ SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
+ SNMP_MIB_SENTINEL
+};
+
+static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq->private;
+ int i;
+ for (i = 0; xfrm_mib_list[i].name; i++)
+ seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
+ snmp_fold_field(net->mib.xfrm_statistics,
+ xfrm_mib_list[i].entry));
+ return 0;
+}
+
+static int xfrm_statistics_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, xfrm_statistics_seq_show);
+}
+
+static const struct file_operations xfrm_statistics_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = xfrm_statistics_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+int __net_init xfrm_proc_init(struct net *net)
+{
+ if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net,
+ &xfrm_statistics_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void xfrm_proc_fini(struct net *net)
+{
+ remove_proc_entry("xfrm_stat", net->proc_net);
+}
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
new file mode 100644
index 000000000..4fd725a0c
--- /dev/null
+++ b/net/xfrm/xfrm_replay.c
@@ -0,0 +1,605 @@
+/*
+ * xfrm_replay.c - xfrm replay detection, derived from xfrm_state.c.
+ *
+ * Copyright (C) 2010 secunet Security Networks AG
+ * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/export.h>
+#include <net/xfrm.h>
+
+u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq)
+{
+ u32 seq, seq_hi, bottom;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+
+ if (!(x->props.flags & XFRM_STATE_ESN))
+ return 0;
+
+ seq = ntohl(net_seq);
+ seq_hi = replay_esn->seq_hi;
+ bottom = replay_esn->seq - replay_esn->replay_window + 1;
+
+ if (likely(replay_esn->seq >= replay_esn->replay_window - 1)) {
+ /* A. same subspace */
+ if (unlikely(seq < bottom))
+ seq_hi++;
+ } else {
+ /* B. window spans two subspaces */
+ if (unlikely(seq >= bottom))
+ seq_hi--;
+ }
+
+ return seq_hi;
+}
+
+static void xfrm_replay_notify(struct xfrm_state *x, int event)
+{
+ struct km_event c;
+ /* we send notify messages in case
+ * 1. we updated on of the sequence numbers, and the seqno difference
+ * is at least x->replay_maxdiff, in this case we also update the
+ * timeout of our timer function
+ * 2. if x->replay_maxage has elapsed since last update,
+ * and there were changes
+ *
+ * The state structure must be locked!
+ */
+
+ switch (event) {
+ case XFRM_REPLAY_UPDATE:
+ if (!x->replay_maxdiff ||
+ ((x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
+ (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff))) {
+ if (x->xflags & XFRM_TIME_DEFER)
+ event = XFRM_REPLAY_TIMEOUT;
+ else
+ return;
+ }
+
+ break;
+
+ case XFRM_REPLAY_TIMEOUT:
+ if (memcmp(&x->replay, &x->preplay,
+ sizeof(struct xfrm_replay_state)) == 0) {
+ x->xflags |= XFRM_TIME_DEFER;
+ return;
+ }
+
+ break;
+ }
+
+ memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
+ c.event = XFRM_MSG_NEWAE;
+ c.data.aevent = event;
+ km_state_notify(x, &c);
+
+ if (x->replay_maxage &&
+ !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+ x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = 0;
+ struct net *net = xs_net(x);
+
+ if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
+ XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq;
+ XFRM_SKB_CB(skb)->seq.output.hi = 0;
+ if (unlikely(x->replay.oseq == 0)) {
+ x->replay.oseq--;
+ xfrm_audit_state_replay_overflow(x, skb);
+ err = -EOVERFLOW;
+
+ return err;
+ }
+ if (xfrm_aevent_is_on(net))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+ }
+
+ return err;
+}
+
+static int xfrm_replay_check(struct xfrm_state *x,
+ struct sk_buff *skb, __be32 net_seq)
+{
+ u32 diff;
+ u32 seq = ntohl(net_seq);
+
+ if (!x->props.replay_window)
+ return 0;
+
+ if (unlikely(seq == 0))
+ goto err;
+
+ if (likely(seq > x->replay.seq))
+ return 0;
+
+ diff = x->replay.seq - seq;
+ if (diff >= x->props.replay_window) {
+ x->stats.replay_window++;
+ goto err;
+ }
+
+ if (x->replay.bitmap & (1U << diff)) {
+ x->stats.replay++;
+ goto err;
+ }
+ return 0;
+
+err:
+ xfrm_audit_state_replay(x, skb, net_seq);
+ return -EINVAL;
+}
+
+static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
+{
+ u32 diff;
+ u32 seq = ntohl(net_seq);
+
+ if (!x->props.replay_window)
+ return;
+
+ if (seq > x->replay.seq) {
+ diff = seq - x->replay.seq;
+ if (diff < x->props.replay_window)
+ x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
+ else
+ x->replay.bitmap = 1;
+ x->replay.seq = seq;
+ } else {
+ diff = x->replay.seq - seq;
+ x->replay.bitmap |= (1U << diff);
+ }
+
+ if (xfrm_aevent_is_on(xs_net(x)))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = 0;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ struct net *net = xs_net(x);
+
+ if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
+ XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq;
+ XFRM_SKB_CB(skb)->seq.output.hi = 0;
+ if (unlikely(replay_esn->oseq == 0)) {
+ replay_esn->oseq--;
+ xfrm_audit_state_replay_overflow(x, skb);
+ err = -EOVERFLOW;
+
+ return err;
+ }
+ if (xfrm_aevent_is_on(net))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+ }
+
+ return err;
+}
+
+static int xfrm_replay_check_bmp(struct xfrm_state *x,
+ struct sk_buff *skb, __be32 net_seq)
+{
+ unsigned int bitnr, nr;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ u32 pos;
+ u32 seq = ntohl(net_seq);
+ u32 diff = replay_esn->seq - seq;
+
+ if (!replay_esn->replay_window)
+ return 0;
+
+ if (unlikely(seq == 0))
+ goto err;
+
+ if (likely(seq > replay_esn->seq))
+ return 0;
+
+ if (diff >= replay_esn->replay_window) {
+ x->stats.replay_window++;
+ goto err;
+ }
+
+ pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+ if (pos >= diff)
+ bitnr = (pos - diff) % replay_esn->replay_window;
+ else
+ bitnr = replay_esn->replay_window - (diff - pos);
+
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ if (replay_esn->bmp[nr] & (1U << bitnr))
+ goto err_replay;
+
+ return 0;
+
+err_replay:
+ x->stats.replay++;
+err:
+ xfrm_audit_state_replay(x, skb, net_seq);
+ return -EINVAL;
+}
+
+static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq)
+{
+ unsigned int bitnr, nr, i;
+ u32 diff;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ u32 seq = ntohl(net_seq);
+ u32 pos;
+
+ if (!replay_esn->replay_window)
+ return;
+
+ pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+ if (seq > replay_esn->seq) {
+ diff = seq - replay_esn->seq;
+
+ if (diff < replay_esn->replay_window) {
+ for (i = 1; i < diff; i++) {
+ bitnr = (pos + i) % replay_esn->replay_window;
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ replay_esn->bmp[nr] &= ~(1U << bitnr);
+ }
+ } else {
+ nr = (replay_esn->replay_window - 1) >> 5;
+ for (i = 0; i <= nr; i++)
+ replay_esn->bmp[i] = 0;
+ }
+
+ bitnr = (pos + diff) % replay_esn->replay_window;
+ replay_esn->seq = seq;
+ } else {
+ diff = replay_esn->seq - seq;
+
+ if (pos >= diff)
+ bitnr = (pos - diff) % replay_esn->replay_window;
+ else
+ bitnr = replay_esn->replay_window - (diff - pos);
+ }
+
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ replay_esn->bmp[nr] |= (1U << bitnr);
+
+ if (xfrm_aevent_is_on(xs_net(x)))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event)
+{
+ struct km_event c;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn;
+
+ /* we send notify messages in case
+ * 1. we updated on of the sequence numbers, and the seqno difference
+ * is at least x->replay_maxdiff, in this case we also update the
+ * timeout of our timer function
+ * 2. if x->replay_maxage has elapsed since last update,
+ * and there were changes
+ *
+ * The state structure must be locked!
+ */
+
+ switch (event) {
+ case XFRM_REPLAY_UPDATE:
+ if (!x->replay_maxdiff ||
+ ((replay_esn->seq - preplay_esn->seq < x->replay_maxdiff) &&
+ (replay_esn->oseq - preplay_esn->oseq
+ < x->replay_maxdiff))) {
+ if (x->xflags & XFRM_TIME_DEFER)
+ event = XFRM_REPLAY_TIMEOUT;
+ else
+ return;
+ }
+
+ break;
+
+ case XFRM_REPLAY_TIMEOUT:
+ if (memcmp(x->replay_esn, x->preplay_esn,
+ xfrm_replay_state_esn_len(replay_esn)) == 0) {
+ x->xflags |= XFRM_TIME_DEFER;
+ return;
+ }
+
+ break;
+ }
+
+ memcpy(x->preplay_esn, x->replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ c.event = XFRM_MSG_NEWAE;
+ c.data.aevent = event;
+ km_state_notify(x, &c);
+
+ if (x->replay_maxage &&
+ !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+ x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+static void xfrm_replay_notify_esn(struct xfrm_state *x, int event)
+{
+ u32 seq_diff, oseq_diff;
+ struct km_event c;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn;
+
+ /* we send notify messages in case
+ * 1. we updated on of the sequence numbers, and the seqno difference
+ * is at least x->replay_maxdiff, in this case we also update the
+ * timeout of our timer function
+ * 2. if x->replay_maxage has elapsed since last update,
+ * and there were changes
+ *
+ * The state structure must be locked!
+ */
+
+ switch (event) {
+ case XFRM_REPLAY_UPDATE:
+ if (x->replay_maxdiff) {
+ if (replay_esn->seq_hi == preplay_esn->seq_hi)
+ seq_diff = replay_esn->seq - preplay_esn->seq;
+ else
+ seq_diff = ~preplay_esn->seq + replay_esn->seq
+ + 1;
+
+ if (replay_esn->oseq_hi == preplay_esn->oseq_hi)
+ oseq_diff = replay_esn->oseq
+ - preplay_esn->oseq;
+ else
+ oseq_diff = ~preplay_esn->oseq
+ + replay_esn->oseq + 1;
+
+ if (seq_diff >= x->replay_maxdiff ||
+ oseq_diff >= x->replay_maxdiff)
+ break;
+ }
+
+ if (x->xflags & XFRM_TIME_DEFER)
+ event = XFRM_REPLAY_TIMEOUT;
+ else
+ return;
+
+ break;
+
+ case XFRM_REPLAY_TIMEOUT:
+ if (memcmp(x->replay_esn, x->preplay_esn,
+ xfrm_replay_state_esn_len(replay_esn)) == 0) {
+ x->xflags |= XFRM_TIME_DEFER;
+ return;
+ }
+
+ break;
+ }
+
+ memcpy(x->preplay_esn, x->replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ c.event = XFRM_MSG_NEWAE;
+ c.data.aevent = event;
+ km_state_notify(x, &c);
+
+ if (x->replay_maxage &&
+ !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+ x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err = 0;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ struct net *net = xs_net(x);
+
+ if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
+ XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq;
+ XFRM_SKB_CB(skb)->seq.output.hi = replay_esn->oseq_hi;
+
+ if (unlikely(replay_esn->oseq == 0)) {
+ XFRM_SKB_CB(skb)->seq.output.hi = ++replay_esn->oseq_hi;
+
+ if (replay_esn->oseq_hi == 0) {
+ replay_esn->oseq--;
+ replay_esn->oseq_hi--;
+ xfrm_audit_state_replay_overflow(x, skb);
+ err = -EOVERFLOW;
+
+ return err;
+ }
+ }
+ if (xfrm_aevent_is_on(net))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+ }
+
+ return err;
+}
+
+static int xfrm_replay_check_esn(struct xfrm_state *x,
+ struct sk_buff *skb, __be32 net_seq)
+{
+ unsigned int bitnr, nr;
+ u32 diff;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+ u32 pos;
+ u32 seq = ntohl(net_seq);
+ u32 wsize = replay_esn->replay_window;
+ u32 top = replay_esn->seq;
+ u32 bottom = top - wsize + 1;
+
+ if (!wsize)
+ return 0;
+
+ if (unlikely(seq == 0 && replay_esn->seq_hi == 0 &&
+ (replay_esn->seq < replay_esn->replay_window - 1)))
+ goto err;
+
+ diff = top - seq;
+
+ if (likely(top >= wsize - 1)) {
+ /* A. same subspace */
+ if (likely(seq > top) || seq < bottom)
+ return 0;
+ } else {
+ /* B. window spans two subspaces */
+ if (likely(seq > top && seq < bottom))
+ return 0;
+ if (seq >= bottom)
+ diff = ~seq + top + 1;
+ }
+
+ if (diff >= replay_esn->replay_window) {
+ x->stats.replay_window++;
+ goto err;
+ }
+
+ pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+ if (pos >= diff)
+ bitnr = (pos - diff) % replay_esn->replay_window;
+ else
+ bitnr = replay_esn->replay_window - (diff - pos);
+
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ if (replay_esn->bmp[nr] & (1U << bitnr))
+ goto err_replay;
+
+ return 0;
+
+err_replay:
+ x->stats.replay++;
+err:
+ xfrm_audit_state_replay(x, skb, net_seq);
+ return -EINVAL;
+}
+
+static int xfrm_replay_recheck_esn(struct xfrm_state *x,
+ struct sk_buff *skb, __be32 net_seq)
+{
+ if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi !=
+ htonl(xfrm_replay_seqhi(x, net_seq)))) {
+ x->stats.replay_window++;
+ return -EINVAL;
+ }
+
+ return xfrm_replay_check_esn(x, skb, net_seq);
+}
+
+static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
+{
+ unsigned int bitnr, nr, i;
+ int wrap;
+ u32 diff, pos, seq, seq_hi;
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+
+ if (!replay_esn->replay_window)
+ return;
+
+ seq = ntohl(net_seq);
+ pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+ seq_hi = xfrm_replay_seqhi(x, net_seq);
+ wrap = seq_hi - replay_esn->seq_hi;
+
+ if ((!wrap && seq > replay_esn->seq) || wrap > 0) {
+ if (likely(!wrap))
+ diff = seq - replay_esn->seq;
+ else
+ diff = ~replay_esn->seq + seq + 1;
+
+ if (diff < replay_esn->replay_window) {
+ for (i = 1; i < diff; i++) {
+ bitnr = (pos + i) % replay_esn->replay_window;
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ replay_esn->bmp[nr] &= ~(1U << bitnr);
+ }
+ } else {
+ nr = (replay_esn->replay_window - 1) >> 5;
+ for (i = 0; i <= nr; i++)
+ replay_esn->bmp[i] = 0;
+ }
+
+ bitnr = (pos + diff) % replay_esn->replay_window;
+ replay_esn->seq = seq;
+
+ if (unlikely(wrap > 0))
+ replay_esn->seq_hi++;
+ } else {
+ diff = replay_esn->seq - seq;
+
+ if (pos >= diff)
+ bitnr = (pos - diff) % replay_esn->replay_window;
+ else
+ bitnr = replay_esn->replay_window - (diff - pos);
+ }
+
+ nr = bitnr >> 5;
+ bitnr = bitnr & 0x1F;
+ replay_esn->bmp[nr] |= (1U << bitnr);
+
+ if (xfrm_aevent_is_on(xs_net(x)))
+ x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+static struct xfrm_replay xfrm_replay_legacy = {
+ .advance = xfrm_replay_advance,
+ .check = xfrm_replay_check,
+ .recheck = xfrm_replay_check,
+ .notify = xfrm_replay_notify,
+ .overflow = xfrm_replay_overflow,
+};
+
+static struct xfrm_replay xfrm_replay_bmp = {
+ .advance = xfrm_replay_advance_bmp,
+ .check = xfrm_replay_check_bmp,
+ .recheck = xfrm_replay_check_bmp,
+ .notify = xfrm_replay_notify_bmp,
+ .overflow = xfrm_replay_overflow_bmp,
+};
+
+static struct xfrm_replay xfrm_replay_esn = {
+ .advance = xfrm_replay_advance_esn,
+ .check = xfrm_replay_check_esn,
+ .recheck = xfrm_replay_recheck_esn,
+ .notify = xfrm_replay_notify_esn,
+ .overflow = xfrm_replay_overflow_esn,
+};
+
+int xfrm_init_replay(struct xfrm_state *x)
+{
+ struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+
+ if (replay_esn) {
+ if (replay_esn->replay_window >
+ replay_esn->bmp_len * sizeof(__u32) * 8)
+ return -EINVAL;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ if (replay_esn->replay_window == 0)
+ return -EINVAL;
+ x->repl = &xfrm_replay_esn;
+ } else
+ x->repl = &xfrm_replay_bmp;
+ } else
+ x->repl = &xfrm_replay_legacy;
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_init_replay);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
new file mode 100644
index 000000000..96688cd0f
--- /dev/null
+++ b/net/xfrm/xfrm_state.c
@@ -0,0 +1,2294 @@
+/*
+ * xfrm_state.c
+ *
+ * Changes:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * IPv6 support
+ * YOSHIFUJI Hideaki @USAGI
+ * Split up af-specific functions
+ * Derek Atkins <derek@ihtfp.com>
+ * Add UDP Encapsulation
+ *
+ */
+
+#include <linux/workqueue.h>
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/audit.h>
+#include <asm/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include "xfrm_hash.h"
+
+/* Each xfrm_state may be linked to two tables:
+
+ 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
+ 2. Hash table by (daddr,family,reqid) to find what SAs exist for given
+ destination/tunnel endpoint. (output)
+ */
+
+static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+
+static inline unsigned int xfrm_dst_hash(struct net *net,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ u32 reqid,
+ unsigned short family)
+{
+ return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask);
+}
+
+static inline unsigned int xfrm_src_hash(struct net *net,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ unsigned short family)
+{
+ return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask);
+}
+
+static inline unsigned int
+xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr,
+ __be32 spi, u8 proto, unsigned short family)
+{
+ return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask);
+}
+
+static void xfrm_hash_transfer(struct hlist_head *list,
+ struct hlist_head *ndsttable,
+ struct hlist_head *nsrctable,
+ struct hlist_head *nspitable,
+ unsigned int nhashmask)
+{
+ struct hlist_node *tmp;
+ struct xfrm_state *x;
+
+ hlist_for_each_entry_safe(x, tmp, list, bydst) {
+ unsigned int h;
+
+ h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
+ x->props.reqid, x->props.family,
+ nhashmask);
+ hlist_add_head(&x->bydst, ndsttable+h);
+
+ h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
+ x->props.family,
+ nhashmask);
+ hlist_add_head(&x->bysrc, nsrctable+h);
+
+ if (x->id.spi) {
+ h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
+ x->id.proto, x->props.family,
+ nhashmask);
+ hlist_add_head(&x->byspi, nspitable+h);
+ }
+ }
+}
+
+static unsigned long xfrm_hash_new_size(unsigned int state_hmask)
+{
+ return ((state_hmask + 1) << 1) * sizeof(struct hlist_head);
+}
+
+static void xfrm_hash_resize(struct work_struct *work)
+{
+ struct net *net = container_of(work, struct net, xfrm.state_hash_work);
+ struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi;
+ unsigned long nsize, osize;
+ unsigned int nhashmask, ohashmask;
+ int i;
+
+ nsize = xfrm_hash_new_size(net->xfrm.state_hmask);
+ ndst = xfrm_hash_alloc(nsize);
+ if (!ndst)
+ return;
+ nsrc = xfrm_hash_alloc(nsize);
+ if (!nsrc) {
+ xfrm_hash_free(ndst, nsize);
+ return;
+ }
+ nspi = xfrm_hash_alloc(nsize);
+ if (!nspi) {
+ xfrm_hash_free(ndst, nsize);
+ xfrm_hash_free(nsrc, nsize);
+ return;
+ }
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+ for (i = net->xfrm.state_hmask; i >= 0; i--)
+ xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
+ nhashmask);
+
+ odst = net->xfrm.state_bydst;
+ osrc = net->xfrm.state_bysrc;
+ ospi = net->xfrm.state_byspi;
+ ohashmask = net->xfrm.state_hmask;
+
+ net->xfrm.state_bydst = ndst;
+ net->xfrm.state_bysrc = nsrc;
+ net->xfrm.state_byspi = nspi;
+ net->xfrm.state_hmask = nhashmask;
+
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ osize = (ohashmask + 1) * sizeof(struct hlist_head);
+ xfrm_hash_free(odst, osize);
+ xfrm_hash_free(osrc, osize);
+ xfrm_hash_free(ospi, osize);
+}
+
+static DEFINE_SPINLOCK(xfrm_state_afinfo_lock);
+static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO];
+
+static DEFINE_SPINLOCK(xfrm_state_gc_lock);
+
+int __xfrm_state_delete(struct xfrm_state *x);
+
+int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
+bool km_is_alive(const struct km_event *c);
+void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
+
+static DEFINE_SPINLOCK(xfrm_type_lock);
+int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
+{
+ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ const struct xfrm_type **typemap;
+ int err = 0;
+
+ if (unlikely(afinfo == NULL))
+ return -EAFNOSUPPORT;
+ typemap = afinfo->type_map;
+ spin_lock_bh(&xfrm_type_lock);
+
+ if (likely(typemap[type->proto] == NULL))
+ typemap[type->proto] = type;
+ else
+ err = -EEXIST;
+ spin_unlock_bh(&xfrm_type_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_register_type);
+
+int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
+{
+ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ const struct xfrm_type **typemap;
+ int err = 0;
+
+ if (unlikely(afinfo == NULL))
+ return -EAFNOSUPPORT;
+ typemap = afinfo->type_map;
+ spin_lock_bh(&xfrm_type_lock);
+
+ if (unlikely(typemap[type->proto] != type))
+ err = -ENOENT;
+ else
+ typemap[type->proto] = NULL;
+ spin_unlock_bh(&xfrm_type_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_type);
+
+static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
+{
+ struct xfrm_state_afinfo *afinfo;
+ const struct xfrm_type **typemap;
+ const struct xfrm_type *type;
+ int modload_attempted = 0;
+
+retry:
+ afinfo = xfrm_state_get_afinfo(family);
+ if (unlikely(afinfo == NULL))
+ return NULL;
+ typemap = afinfo->type_map;
+
+ type = typemap[proto];
+ if (unlikely(type && !try_module_get(type->owner)))
+ type = NULL;
+ if (!type && !modload_attempted) {
+ xfrm_state_put_afinfo(afinfo);
+ request_module("xfrm-type-%d-%d", family, proto);
+ modload_attempted = 1;
+ goto retry;
+ }
+
+ xfrm_state_put_afinfo(afinfo);
+ return type;
+}
+
+static void xfrm_put_type(const struct xfrm_type *type)
+{
+ module_put(type->owner);
+}
+
+static DEFINE_SPINLOCK(xfrm_mode_lock);
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{
+ struct xfrm_state_afinfo *afinfo;
+ struct xfrm_mode **modemap;
+ int err;
+
+ if (unlikely(mode->encap >= XFRM_MODE_MAX))
+ return -EINVAL;
+
+ afinfo = xfrm_state_get_afinfo(family);
+ if (unlikely(afinfo == NULL))
+ return -EAFNOSUPPORT;
+
+ err = -EEXIST;
+ modemap = afinfo->mode_map;
+ spin_lock_bh(&xfrm_mode_lock);
+ if (modemap[mode->encap])
+ goto out;
+
+ err = -ENOENT;
+ if (!try_module_get(afinfo->owner))
+ goto out;
+
+ mode->afinfo = afinfo;
+ modemap[mode->encap] = mode;
+ err = 0;
+
+out:
+ spin_unlock_bh(&xfrm_mode_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{
+ struct xfrm_state_afinfo *afinfo;
+ struct xfrm_mode **modemap;
+ int err;
+
+ if (unlikely(mode->encap >= XFRM_MODE_MAX))
+ return -EINVAL;
+
+ afinfo = xfrm_state_get_afinfo(family);
+ if (unlikely(afinfo == NULL))
+ return -EAFNOSUPPORT;
+
+ err = -ENOENT;
+ modemap = afinfo->mode_map;
+ spin_lock_bh(&xfrm_mode_lock);
+ if (likely(modemap[mode->encap] == mode)) {
+ modemap[mode->encap] = NULL;
+ module_put(mode->afinfo->owner);
+ err = 0;
+ }
+
+ spin_unlock_bh(&xfrm_mode_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+
+static struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+ struct xfrm_state_afinfo *afinfo;
+ struct xfrm_mode *mode;
+ int modload_attempted = 0;
+
+ if (unlikely(encap >= XFRM_MODE_MAX))
+ return NULL;
+
+retry:
+ afinfo = xfrm_state_get_afinfo(family);
+ if (unlikely(afinfo == NULL))
+ return NULL;
+
+ mode = afinfo->mode_map[encap];
+ if (unlikely(mode && !try_module_get(mode->owner)))
+ mode = NULL;
+ if (!mode && !modload_attempted) {
+ xfrm_state_put_afinfo(afinfo);
+ request_module("xfrm-mode-%d-%d", family, encap);
+ modload_attempted = 1;
+ goto retry;
+ }
+
+ xfrm_state_put_afinfo(afinfo);
+ return mode;
+}
+
+static void xfrm_put_mode(struct xfrm_mode *mode)
+{
+ module_put(mode->owner);
+}
+
+static void xfrm_state_gc_destroy(struct xfrm_state *x)
+{
+ tasklet_hrtimer_cancel(&x->mtimer);
+ del_timer_sync(&x->rtimer);
+ kfree(x->aalg);
+ kfree(x->ealg);
+ kfree(x->calg);
+ kfree(x->encap);
+ kfree(x->coaddr);
+ kfree(x->replay_esn);
+ kfree(x->preplay_esn);
+ if (x->inner_mode)
+ xfrm_put_mode(x->inner_mode);
+ if (x->inner_mode_iaf)
+ xfrm_put_mode(x->inner_mode_iaf);
+ if (x->outer_mode)
+ xfrm_put_mode(x->outer_mode);
+ if (x->type) {
+ x->type->destructor(x);
+ xfrm_put_type(x->type);
+ }
+ security_xfrm_state_free(x);
+ kfree(x);
+}
+
+static void xfrm_state_gc_task(struct work_struct *work)
+{
+ struct net *net = container_of(work, struct net, xfrm.state_gc_work);
+ struct xfrm_state *x;
+ struct hlist_node *tmp;
+ struct hlist_head gc_list;
+
+ spin_lock_bh(&xfrm_state_gc_lock);
+ hlist_move_list(&net->xfrm.state_gc_list, &gc_list);
+ spin_unlock_bh(&xfrm_state_gc_lock);
+
+ hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
+ xfrm_state_gc_destroy(x);
+}
+
+static inline unsigned long make_jiffies(long secs)
+{
+ if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
+ return MAX_SCHEDULE_TIMEOUT-1;
+ else
+ return secs*HZ;
+}
+
+static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
+{
+ struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
+ struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
+ unsigned long now = get_seconds();
+ long next = LONG_MAX;
+ int warn = 0;
+ int err = 0;
+
+ spin_lock(&x->lock);
+ if (x->km.state == XFRM_STATE_DEAD)
+ goto out;
+ if (x->km.state == XFRM_STATE_EXPIRED)
+ goto expired;
+ if (x->lft.hard_add_expires_seconds) {
+ long tmo = x->lft.hard_add_expires_seconds +
+ x->curlft.add_time - now;
+ if (tmo <= 0) {
+ if (x->xflags & XFRM_SOFT_EXPIRE) {
+ /* enter hard expire without soft expire first?!
+ * setting a new date could trigger this.
+ * workarbound: fix x->curflt.add_time by below:
+ */
+ x->curlft.add_time = now - x->saved_tmo - 1;
+ tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
+ } else
+ goto expired;
+ }
+ if (tmo < next)
+ next = tmo;
+ }
+ if (x->lft.hard_use_expires_seconds) {
+ long tmo = x->lft.hard_use_expires_seconds +
+ (x->curlft.use_time ? : now) - now;
+ if (tmo <= 0)
+ goto expired;
+ if (tmo < next)
+ next = tmo;
+ }
+ if (x->km.dying)
+ goto resched;
+ if (x->lft.soft_add_expires_seconds) {
+ long tmo = x->lft.soft_add_expires_seconds +
+ x->curlft.add_time - now;
+ if (tmo <= 0) {
+ warn = 1;
+ x->xflags &= ~XFRM_SOFT_EXPIRE;
+ } else if (tmo < next) {
+ next = tmo;
+ x->xflags |= XFRM_SOFT_EXPIRE;
+ x->saved_tmo = tmo;
+ }
+ }
+ if (x->lft.soft_use_expires_seconds) {
+ long tmo = x->lft.soft_use_expires_seconds +
+ (x->curlft.use_time ? : now) - now;
+ if (tmo <= 0)
+ warn = 1;
+ else if (tmo < next)
+ next = tmo;
+ }
+
+ x->km.dying = warn;
+ if (warn)
+ km_state_expired(x, 0, 0);
+resched:
+ if (next != LONG_MAX) {
+ tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
+ }
+
+ goto out;
+
+expired:
+ if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0)
+ x->km.state = XFRM_STATE_EXPIRED;
+
+ err = __xfrm_state_delete(x);
+ if (!err)
+ km_state_expired(x, 1, 0);
+
+ xfrm_audit_state_delete(x, err ? 0 : 1, true);
+
+out:
+ spin_unlock(&x->lock);
+ return HRTIMER_NORESTART;
+}
+
+static void xfrm_replay_timer_handler(unsigned long data);
+
+struct xfrm_state *xfrm_state_alloc(struct net *net)
+{
+ struct xfrm_state *x;
+
+ x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
+
+ if (x) {
+ write_pnet(&x->xs_net, net);
+ atomic_set(&x->refcnt, 1);
+ atomic_set(&x->tunnel_users, 0);
+ INIT_LIST_HEAD(&x->km.all);
+ INIT_HLIST_NODE(&x->bydst);
+ INIT_HLIST_NODE(&x->bysrc);
+ INIT_HLIST_NODE(&x->byspi);
+ tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
+ CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
+ setup_timer(&x->rtimer, xfrm_replay_timer_handler,
+ (unsigned long)x);
+ x->curlft.add_time = get_seconds();
+ x->lft.soft_byte_limit = XFRM_INF;
+ x->lft.soft_packet_limit = XFRM_INF;
+ x->lft.hard_byte_limit = XFRM_INF;
+ x->lft.hard_packet_limit = XFRM_INF;
+ x->replay_maxage = 0;
+ x->replay_maxdiff = 0;
+ x->inner_mode = NULL;
+ x->inner_mode_iaf = NULL;
+ spin_lock_init(&x->lock);
+ }
+ return x;
+}
+EXPORT_SYMBOL(xfrm_state_alloc);
+
+void __xfrm_state_destroy(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+
+ WARN_ON(x->km.state != XFRM_STATE_DEAD);
+
+ spin_lock_bh(&xfrm_state_gc_lock);
+ hlist_add_head(&x->gclist, &net->xfrm.state_gc_list);
+ spin_unlock_bh(&xfrm_state_gc_lock);
+ schedule_work(&net->xfrm.state_gc_work);
+}
+EXPORT_SYMBOL(__xfrm_state_destroy);
+
+int __xfrm_state_delete(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ int err = -ESRCH;
+
+ if (x->km.state != XFRM_STATE_DEAD) {
+ x->km.state = XFRM_STATE_DEAD;
+ spin_lock(&net->xfrm.xfrm_state_lock);
+ list_del(&x->km.all);
+ hlist_del(&x->bydst);
+ hlist_del(&x->bysrc);
+ if (x->id.spi)
+ hlist_del(&x->byspi);
+ net->xfrm.state_num--;
+ spin_unlock(&net->xfrm.xfrm_state_lock);
+
+ /* All xfrm_state objects are created by xfrm_state_alloc.
+ * The xfrm_state_alloc call gives a reference, and that
+ * is what we are dropping here.
+ */
+ xfrm_state_put(x);
+ err = 0;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(__xfrm_state_delete);
+
+int xfrm_state_delete(struct xfrm_state *x)
+{
+ int err;
+
+ spin_lock_bh(&x->lock);
+ err = __xfrm_state_delete(x);
+ spin_unlock_bh(&x->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_delete);
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int
+xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
+{
+ int i, err = 0;
+
+ for (i = 0; i <= net->xfrm.state_hmask; i++) {
+ struct xfrm_state *x;
+
+ hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
+ if (xfrm_id_proto_match(x->id.proto, proto) &&
+ (err = security_xfrm_state_delete(x)) != 0) {
+ xfrm_audit_state_delete(x, 0, task_valid);
+ return err;
+ }
+ }
+ }
+
+ return err;
+}
+#else
+static inline int
+xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
+{
+ return 0;
+}
+#endif
+
+int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
+{
+ int i, err = 0, cnt = 0;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ err = xfrm_state_flush_secctx_check(net, proto, task_valid);
+ if (err)
+ goto out;
+
+ err = -ESRCH;
+ for (i = 0; i <= net->xfrm.state_hmask; i++) {
+ struct xfrm_state *x;
+restart:
+ hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
+ if (!xfrm_state_kern(x) &&
+ xfrm_id_proto_match(x->id.proto, proto)) {
+ xfrm_state_hold(x);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ err = xfrm_state_delete(x);
+ xfrm_audit_state_delete(x, err ? 0 : 1,
+ task_valid);
+ xfrm_state_put(x);
+ if (!err)
+ cnt++;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ goto restart;
+ }
+ }
+ }
+ if (cnt)
+ err = 0;
+
+out:
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_flush);
+
+void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
+{
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ si->sadcnt = net->xfrm.state_num;
+ si->sadhcnt = net->xfrm.state_hmask;
+ si->sadhmcnt = xfrm_state_hashmax;
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_sad_getinfo);
+
+static int
+xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
+ const struct xfrm_tmpl *tmpl,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+ unsigned short family)
+{
+ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ if (!afinfo)
+ return -1;
+ afinfo->init_tempsel(&x->sel, fl);
+
+ if (family != tmpl->encap_family) {
+ xfrm_state_put_afinfo(afinfo);
+ afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+ if (!afinfo)
+ return -1;
+ }
+ afinfo->init_temprop(x, tmpl, daddr, saddr);
+ xfrm_state_put_afinfo(afinfo);
+ return 0;
+}
+
+static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
+ const xfrm_address_t *daddr,
+ __be32 spi, u8 proto,
+ unsigned short family)
+{
+ unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
+ struct xfrm_state *x;
+
+ hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) {
+ if (x->props.family != family ||
+ x->id.spi != spi ||
+ x->id.proto != proto ||
+ !xfrm_addr_equal(&x->id.daddr, daddr, family))
+ continue;
+
+ if ((mark & x->mark.m) != x->mark.v)
+ continue;
+ xfrm_state_hold(x);
+ return x;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ u8 proto, unsigned short family)
+{
+ unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
+ struct xfrm_state *x;
+
+ hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) {
+ if (x->props.family != family ||
+ x->id.proto != proto ||
+ !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
+ !xfrm_addr_equal(&x->props.saddr, saddr, family))
+ continue;
+
+ if ((mark & x->mark.m) != x->mark.v)
+ continue;
+ xfrm_state_hold(x);
+ return x;
+ }
+
+ return NULL;
+}
+
+static inline struct xfrm_state *
+__xfrm_state_locate(struct xfrm_state *x, int use_spi, int family)
+{
+ struct net *net = xs_net(x);
+ u32 mark = x->mark.v & x->mark.m;
+
+ if (use_spi)
+ return __xfrm_state_lookup(net, mark, &x->id.daddr,
+ x->id.spi, x->id.proto, family);
+ else
+ return __xfrm_state_lookup_byaddr(net, mark,
+ &x->id.daddr,
+ &x->props.saddr,
+ x->id.proto, family);
+}
+
+static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
+{
+ if (have_hash_collision &&
+ (net->xfrm.state_hmask + 1) < xfrm_state_hashmax &&
+ net->xfrm.state_num > net->xfrm.state_hmask)
+ schedule_work(&net->xfrm.state_hash_work);
+}
+
+static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
+ const struct flowi *fl, unsigned short family,
+ struct xfrm_state **best, int *acq_in_progress,
+ int *error)
+{
+ /* Resolution logic:
+ * 1. There is a valid state with matching selector. Done.
+ * 2. Valid state with inappropriate selector. Skip.
+ *
+ * Entering area of "sysdeps".
+ *
+ * 3. If state is not valid, selector is temporary, it selects
+ * only session which triggered previous resolution. Key
+ * manager will do something to install a state with proper
+ * selector.
+ */
+ if (x->km.state == XFRM_STATE_VALID) {
+ if ((x->sel.family &&
+ !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
+ !security_xfrm_state_pol_flow_match(x, pol, fl))
+ return;
+
+ if (!*best ||
+ (*best)->km.dying > x->km.dying ||
+ ((*best)->km.dying == x->km.dying &&
+ (*best)->curlft.add_time < x->curlft.add_time))
+ *best = x;
+ } else if (x->km.state == XFRM_STATE_ACQ) {
+ *acq_in_progress = 1;
+ } else if (x->km.state == XFRM_STATE_ERROR ||
+ x->km.state == XFRM_STATE_EXPIRED) {
+ if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
+ security_xfrm_state_pol_flow_match(x, pol, fl))
+ *error = -ESRCH;
+ }
+}
+
+struct xfrm_state *
+xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+ const struct flowi *fl, struct xfrm_tmpl *tmpl,
+ struct xfrm_policy *pol, int *err,
+ unsigned short family)
+{
+ static xfrm_address_t saddr_wildcard = { };
+ struct net *net = xp_net(pol);
+ unsigned int h, h_wildcard;
+ struct xfrm_state *x, *x0, *to_put;
+ int acquire_in_progress = 0;
+ int error = 0;
+ struct xfrm_state *best = NULL;
+ u32 mark = pol->mark.v & pol->mark.m;
+ unsigned short encap_family = tmpl->encap_family;
+ struct km_event c;
+
+ to_put = NULL;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.family == encap_family &&
+ x->props.reqid == tmpl->reqid &&
+ (mark & x->mark.m) == x->mark.v &&
+ !(x->props.flags & XFRM_STATE_WILDRECV) &&
+ xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
+ tmpl->mode == x->props.mode &&
+ tmpl->id.proto == x->id.proto &&
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, encap_family,
+ &best, &acquire_in_progress, &error);
+ }
+ if (best || acquire_in_progress)
+ goto found;
+
+ h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) {
+ if (x->props.family == encap_family &&
+ x->props.reqid == tmpl->reqid &&
+ (mark & x->mark.m) == x->mark.v &&
+ !(x->props.flags & XFRM_STATE_WILDRECV) &&
+ xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
+ tmpl->mode == x->props.mode &&
+ tmpl->id.proto == x->id.proto &&
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, encap_family,
+ &best, &acquire_in_progress, &error);
+ }
+
+found:
+ x = best;
+ if (!x && !error && !acquire_in_progress) {
+ if (tmpl->id.spi &&
+ (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
+ tmpl->id.proto, encap_family)) != NULL) {
+ to_put = x0;
+ error = -EEXIST;
+ goto out;
+ }
+
+ c.net = net;
+ /* If the KMs have no listeners (yet...), avoid allocating an SA
+ * for each and every packet - garbage collection might not
+ * handle the flood.
+ */
+ if (!km_is_alive(&c)) {
+ error = -ESRCH;
+ goto out;
+ }
+
+ x = xfrm_state_alloc(net);
+ if (x == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
+ /* Initialize temporary state matching only
+ * to current session. */
+ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
+ memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+
+ error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
+ if (error) {
+ x->km.state = XFRM_STATE_DEAD;
+ to_put = x;
+ x = NULL;
+ goto out;
+ }
+
+ if (km_query(x, tmpl, pol) == 0) {
+ x->km.state = XFRM_STATE_ACQ;
+ list_add(&x->km.all, &net->xfrm.state_all);
+ hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ h = xfrm_src_hash(net, daddr, saddr, encap_family);
+ hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+ if (x->id.spi) {
+ h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
+ hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ }
+ x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
+ tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+ net->xfrm.state_num++;
+ xfrm_hash_grow_check(net, x->bydst.next != NULL);
+ } else {
+ x->km.state = XFRM_STATE_DEAD;
+ to_put = x;
+ x = NULL;
+ error = -ESRCH;
+ }
+ }
+out:
+ if (x)
+ xfrm_state_hold(x);
+ else
+ *err = acquire_in_progress ? -EAGAIN : error;
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ if (to_put)
+ xfrm_state_put(to_put);
+ return x;
+}
+
+struct xfrm_state *
+xfrm_stateonly_find(struct net *net, u32 mark,
+ xfrm_address_t *daddr, xfrm_address_t *saddr,
+ unsigned short family, u8 mode, u8 proto, u32 reqid)
+{
+ unsigned int h;
+ struct xfrm_state *rx = NULL, *x = NULL;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.family == family &&
+ x->props.reqid == reqid &&
+ (mark & x->mark.m) == x->mark.v &&
+ !(x->props.flags & XFRM_STATE_WILDRECV) &&
+ xfrm_state_addr_check(x, daddr, saddr, family) &&
+ mode == x->props.mode &&
+ proto == x->id.proto &&
+ x->km.state == XFRM_STATE_VALID) {
+ rx = x;
+ break;
+ }
+ }
+
+ if (rx)
+ xfrm_state_hold(rx);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+
+ return rx;
+}
+EXPORT_SYMBOL(xfrm_stateonly_find);
+
+struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
+ unsigned short family)
+{
+ struct xfrm_state *x;
+ struct xfrm_state_walk *w;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ list_for_each_entry(w, &net->xfrm.state_all, all) {
+ x = container_of(w, struct xfrm_state, km);
+ if (x->props.family != family ||
+ x->id.spi != spi)
+ continue;
+
+ xfrm_state_hold(x);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return x;
+ }
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(xfrm_state_lookup_byspi);
+
+static void __xfrm_state_insert(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ unsigned int h;
+
+ list_add(&x->km.all, &net->xfrm.state_all);
+
+ h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
+ x->props.reqid, x->props.family);
+ hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+
+ h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
+ hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+
+ if (x->id.spi) {
+ h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
+ x->props.family);
+
+ hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ }
+
+ tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+ if (x->replay_maxage)
+ mod_timer(&x->rtimer, jiffies + x->replay_maxage);
+
+ net->xfrm.state_num++;
+
+ xfrm_hash_grow_check(net, x->bydst.next != NULL);
+}
+
+/* net->xfrm.xfrm_state_lock is held */
+static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
+{
+ struct net *net = xs_net(xnew);
+ unsigned short family = xnew->props.family;
+ u32 reqid = xnew->props.reqid;
+ struct xfrm_state *x;
+ unsigned int h;
+ u32 mark = xnew->mark.v & xnew->mark.m;
+
+ h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.family == family &&
+ x->props.reqid == reqid &&
+ (mark & x->mark.m) == x->mark.v &&
+ xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
+ xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
+ x->genid++;
+ }
+}
+
+void xfrm_state_insert(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ __xfrm_state_bump_genids(x);
+ __xfrm_state_insert(x);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_state_insert);
+
+/* net->xfrm.xfrm_state_lock is held */
+static struct xfrm_state *__find_acq_core(struct net *net,
+ const struct xfrm_mark *m,
+ unsigned short family, u8 mode,
+ u32 reqid, u8 proto,
+ const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr,
+ int create)
+{
+ unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
+ struct xfrm_state *x;
+ u32 mark = m->v & m->m;
+
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.reqid != reqid ||
+ x->props.mode != mode ||
+ x->props.family != family ||
+ x->km.state != XFRM_STATE_ACQ ||
+ x->id.spi != 0 ||
+ x->id.proto != proto ||
+ (mark & x->mark.m) != x->mark.v ||
+ !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
+ !xfrm_addr_equal(&x->props.saddr, saddr, family))
+ continue;
+
+ xfrm_state_hold(x);
+ return x;
+ }
+
+ if (!create)
+ return NULL;
+
+ x = xfrm_state_alloc(net);
+ if (likely(x)) {
+ switch (family) {
+ case AF_INET:
+ x->sel.daddr.a4 = daddr->a4;
+ x->sel.saddr.a4 = saddr->a4;
+ x->sel.prefixlen_d = 32;
+ x->sel.prefixlen_s = 32;
+ x->props.saddr.a4 = saddr->a4;
+ x->id.daddr.a4 = daddr->a4;
+ break;
+
+ case AF_INET6:
+ x->sel.daddr.in6 = daddr->in6;
+ x->sel.saddr.in6 = saddr->in6;
+ x->sel.prefixlen_d = 128;
+ x->sel.prefixlen_s = 128;
+ x->props.saddr.in6 = saddr->in6;
+ x->id.daddr.in6 = daddr->in6;
+ break;
+ }
+
+ x->km.state = XFRM_STATE_ACQ;
+ x->id.proto = proto;
+ x->props.family = family;
+ x->props.mode = mode;
+ x->props.reqid = reqid;
+ x->mark.v = m->v;
+ x->mark.m = m->m;
+ x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
+ xfrm_state_hold(x);
+ tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+ list_add(&x->km.all, &net->xfrm.state_all);
+ hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+ h = xfrm_src_hash(net, daddr, saddr, family);
+ hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+
+ net->xfrm.state_num++;
+
+ xfrm_hash_grow_check(net, x->bydst.next != NULL);
+ }
+
+ return x;
+}
+
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
+
+int xfrm_state_add(struct xfrm_state *x)
+{
+ struct net *net = xs_net(x);
+ struct xfrm_state *x1, *to_put;
+ int family;
+ int err;
+ u32 mark = x->mark.v & x->mark.m;
+ int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
+
+ family = x->props.family;
+
+ to_put = NULL;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ x1 = __xfrm_state_locate(x, use_spi, family);
+ if (x1) {
+ to_put = x1;
+ x1 = NULL;
+ err = -EEXIST;
+ goto out;
+ }
+
+ if (use_spi && x->km.seq) {
+ x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq);
+ if (x1 && ((x1->id.proto != x->id.proto) ||
+ !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) {
+ to_put = x1;
+ x1 = NULL;
+ }
+ }
+
+ if (use_spi && !x1)
+ x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
+ x->props.reqid, x->id.proto,
+ &x->id.daddr, &x->props.saddr, 0);
+
+ __xfrm_state_bump_genids(x);
+ __xfrm_state_insert(x);
+ err = 0;
+
+out:
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (x1) {
+ xfrm_state_delete(x1);
+ xfrm_state_put(x1);
+ }
+
+ if (to_put)
+ xfrm_state_put(to_put);
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_add);
+
+#ifdef CONFIG_XFRM_MIGRATE
+static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig)
+{
+ struct net *net = xs_net(orig);
+ struct xfrm_state *x = xfrm_state_alloc(net);
+ if (!x)
+ goto out;
+
+ memcpy(&x->id, &orig->id, sizeof(x->id));
+ memcpy(&x->sel, &orig->sel, sizeof(x->sel));
+ memcpy(&x->lft, &orig->lft, sizeof(x->lft));
+ x->props.mode = orig->props.mode;
+ x->props.replay_window = orig->props.replay_window;
+ x->props.reqid = orig->props.reqid;
+ x->props.family = orig->props.family;
+ x->props.saddr = orig->props.saddr;
+
+ if (orig->aalg) {
+ x->aalg = xfrm_algo_auth_clone(orig->aalg);
+ if (!x->aalg)
+ goto error;
+ }
+ x->props.aalgo = orig->props.aalgo;
+
+ if (orig->aead) {
+ x->aead = xfrm_algo_aead_clone(orig->aead);
+ if (!x->aead)
+ goto error;
+ }
+ if (orig->ealg) {
+ x->ealg = xfrm_algo_clone(orig->ealg);
+ if (!x->ealg)
+ goto error;
+ }
+ x->props.ealgo = orig->props.ealgo;
+
+ if (orig->calg) {
+ x->calg = xfrm_algo_clone(orig->calg);
+ if (!x->calg)
+ goto error;
+ }
+ x->props.calgo = orig->props.calgo;
+
+ if (orig->encap) {
+ x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL);
+ if (!x->encap)
+ goto error;
+ }
+
+ if (orig->coaddr) {
+ x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr),
+ GFP_KERNEL);
+ if (!x->coaddr)
+ goto error;
+ }
+
+ if (orig->replay_esn) {
+ if (xfrm_replay_clone(x, orig))
+ goto error;
+ }
+
+ memcpy(&x->mark, &orig->mark, sizeof(x->mark));
+
+ if (xfrm_init_state(x) < 0)
+ goto error;
+
+ x->props.flags = orig->props.flags;
+ x->props.extra_flags = orig->props.extra_flags;
+
+ x->tfcpad = orig->tfcpad;
+ x->replay_maxdiff = orig->replay_maxdiff;
+ x->replay_maxage = orig->replay_maxage;
+ x->curlft.add_time = orig->curlft.add_time;
+ x->km.state = orig->km.state;
+ x->km.seq = orig->km.seq;
+
+ return x;
+
+ error:
+ xfrm_state_put(x);
+out:
+ return NULL;
+}
+
+struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net)
+{
+ unsigned int h;
+ struct xfrm_state *x = NULL;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (m->reqid) {
+ h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr,
+ m->reqid, m->old_family);
+ hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.mode != m->mode ||
+ x->id.proto != m->proto)
+ continue;
+ if (m->reqid && x->props.reqid != m->reqid)
+ continue;
+ if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr,
+ m->old_family) ||
+ !xfrm_addr_equal(&x->props.saddr, &m->old_saddr,
+ m->old_family))
+ continue;
+ xfrm_state_hold(x);
+ break;
+ }
+ } else {
+ h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr,
+ m->old_family);
+ hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) {
+ if (x->props.mode != m->mode ||
+ x->id.proto != m->proto)
+ continue;
+ if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr,
+ m->old_family) ||
+ !xfrm_addr_equal(&x->props.saddr, &m->old_saddr,
+ m->old_family))
+ continue;
+ xfrm_state_hold(x);
+ break;
+ }
+ }
+
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ return x;
+}
+EXPORT_SYMBOL(xfrm_migrate_state_find);
+
+struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
+ struct xfrm_migrate *m)
+{
+ struct xfrm_state *xc;
+
+ xc = xfrm_state_clone(x);
+ if (!xc)
+ return NULL;
+
+ memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr));
+ memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr));
+
+ /* add state */
+ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) {
+ /* a care is needed when the destination address of the
+ state is to be updated as it is a part of triplet */
+ xfrm_state_insert(xc);
+ } else {
+ if (xfrm_state_add(xc) < 0)
+ goto error;
+ }
+
+ return xc;
+error:
+ xfrm_state_put(xc);
+ return NULL;
+}
+EXPORT_SYMBOL(xfrm_state_migrate);
+#endif
+
+int xfrm_state_update(struct xfrm_state *x)
+{
+ struct xfrm_state *x1, *to_put;
+ int err;
+ int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
+ struct net *net = xs_net(x);
+
+ to_put = NULL;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x1 = __xfrm_state_locate(x, use_spi, x->props.family);
+
+ err = -ESRCH;
+ if (!x1)
+ goto out;
+
+ if (xfrm_state_kern(x1)) {
+ to_put = x1;
+ err = -EEXIST;
+ goto out;
+ }
+
+ if (x1->km.state == XFRM_STATE_ACQ) {
+ __xfrm_state_insert(x);
+ x = NULL;
+ }
+ err = 0;
+
+out:
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (to_put)
+ xfrm_state_put(to_put);
+
+ if (err)
+ return err;
+
+ if (!x) {
+ xfrm_state_delete(x1);
+ xfrm_state_put(x1);
+ return 0;
+ }
+
+ err = -EINVAL;
+ spin_lock_bh(&x1->lock);
+ if (likely(x1->km.state == XFRM_STATE_VALID)) {
+ if (x->encap && x1->encap)
+ memcpy(x1->encap, x->encap, sizeof(*x1->encap));
+ if (x->coaddr && x1->coaddr) {
+ memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr));
+ }
+ if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel)))
+ memcpy(&x1->sel, &x->sel, sizeof(x1->sel));
+ memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
+ x1->km.dying = 0;
+
+ tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+ if (x1->curlft.use_time)
+ xfrm_state_check_expire(x1);
+
+ err = 0;
+ x->km.state = XFRM_STATE_DEAD;
+ __xfrm_state_put(x);
+ }
+ spin_unlock_bh(&x1->lock);
+
+ xfrm_state_put(x1);
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_update);
+
+int xfrm_state_check_expire(struct xfrm_state *x)
+{
+ if (!x->curlft.use_time)
+ x->curlft.use_time = get_seconds();
+
+ if (x->curlft.bytes >= x->lft.hard_byte_limit ||
+ x->curlft.packets >= x->lft.hard_packet_limit) {
+ x->km.state = XFRM_STATE_EXPIRED;
+ tasklet_hrtimer_start(&x->mtimer, ktime_set(0, 0), HRTIMER_MODE_REL);
+ return -EINVAL;
+ }
+
+ if (!x->km.dying &&
+ (x->curlft.bytes >= x->lft.soft_byte_limit ||
+ x->curlft.packets >= x->lft.soft_packet_limit)) {
+ x->km.dying = 1;
+ km_state_expired(x, 0, 0);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_state_check_expire);
+
+struct xfrm_state *
+xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi,
+ u8 proto, unsigned short family)
+{
+ struct xfrm_state *x;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return x;
+}
+EXPORT_SYMBOL(xfrm_state_lookup);
+
+struct xfrm_state *
+xfrm_state_lookup_byaddr(struct net *net, u32 mark,
+ const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+ u8 proto, unsigned short family)
+{
+ struct xfrm_state *x;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return x;
+}
+EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
+
+struct xfrm_state *
+xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
+ u8 proto, const xfrm_address_t *daddr,
+ const xfrm_address_t *saddr, int create, unsigned short family)
+{
+ struct xfrm_state *x;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ return x;
+}
+EXPORT_SYMBOL(xfrm_find_acq);
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+int
+xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
+ unsigned short family, struct net *net)
+{
+ int err = 0;
+ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ if (!afinfo)
+ return -EAFNOSUPPORT;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
+ if (afinfo->tmpl_sort)
+ err = afinfo->tmpl_sort(dst, src, n);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_tmpl_sort);
+
+int
+xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+ unsigned short family)
+{
+ int err = 0;
+ struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+ struct net *net = xs_net(*src);
+
+ if (!afinfo)
+ return -EAFNOSUPPORT;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ if (afinfo->state_sort)
+ err = afinfo->state_sort(dst, src, n);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ xfrm_state_put_afinfo(afinfo);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_sort);
+#endif
+
+/* Silly enough, but I'm lazy to build resolution list */
+
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+{
+ int i;
+
+ for (i = 0; i <= net->xfrm.state_hmask; i++) {
+ struct xfrm_state *x;
+
+ hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
+ if (x->km.seq == seq &&
+ (mark & x->mark.m) == x->mark.v &&
+ x->km.state == XFRM_STATE_ACQ) {
+ xfrm_state_hold(x);
+ return x;
+ }
+ }
+ }
+ return NULL;
+}
+
+struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+{
+ struct xfrm_state *x;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ x = __xfrm_find_acq_byseq(net, mark, seq);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return x;
+}
+EXPORT_SYMBOL(xfrm_find_acq_byseq);
+
+u32 xfrm_get_acqseq(void)
+{
+ u32 res;
+ static atomic_t acqseq;
+
+ do {
+ res = atomic_inc_return(&acqseq);
+ } while (!res);
+
+ return res;
+}
+EXPORT_SYMBOL(xfrm_get_acqseq);
+
+int verify_spi_info(u8 proto, u32 min, u32 max)
+{
+ switch (proto) {
+ case IPPROTO_AH:
+ case IPPROTO_ESP:
+ break;
+
+ case IPPROTO_COMP:
+ /* IPCOMP spi is 16-bits. */
+ if (max >= 0x10000)
+ return -EINVAL;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (min > max)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL(verify_spi_info);
+
+int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
+{
+ struct net *net = xs_net(x);
+ unsigned int h;
+ struct xfrm_state *x0;
+ int err = -ENOENT;
+ __be32 minspi = htonl(low);
+ __be32 maxspi = htonl(high);
+ u32 mark = x->mark.v & x->mark.m;
+
+ spin_lock_bh(&x->lock);
+ if (x->km.state == XFRM_STATE_DEAD)
+ goto unlock;
+
+ err = 0;
+ if (x->id.spi)
+ goto unlock;
+
+ err = -ENOENT;
+
+ if (minspi == maxspi) {
+ x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family);
+ if (x0) {
+ xfrm_state_put(x0);
+ goto unlock;
+ }
+ x->id.spi = minspi;
+ } else {
+ u32 spi = 0;
+ for (h = 0; h < high-low+1; h++) {
+ spi = low + prandom_u32()%(high-low+1);
+ x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family);
+ if (x0 == NULL) {
+ x->id.spi = htonl(spi);
+ break;
+ }
+ xfrm_state_put(x0);
+ }
+ }
+ if (x->id.spi) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
+ hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+ err = 0;
+ }
+
+unlock:
+ spin_unlock_bh(&x->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(xfrm_alloc_spi);
+
+static bool __xfrm_state_filter_match(struct xfrm_state *x,
+ struct xfrm_address_filter *filter)
+{
+ if (filter) {
+ if ((filter->family == AF_INET ||
+ filter->family == AF_INET6) &&
+ x->props.family != filter->family)
+ return false;
+
+ return addr_match(&x->props.saddr, &filter->saddr,
+ filter->splen) &&
+ addr_match(&x->id.daddr, &filter->daddr,
+ filter->dplen);
+ }
+ return true;
+}
+
+int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
+ int (*func)(struct xfrm_state *, int, void*),
+ void *data)
+{
+ struct xfrm_state *state;
+ struct xfrm_state_walk *x;
+ int err = 0;
+
+ if (walk->seq != 0 && list_empty(&walk->all))
+ return 0;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ if (list_empty(&walk->all))
+ x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all);
+ else
+ x = list_entry(&walk->all, struct xfrm_state_walk, all);
+ list_for_each_entry_from(x, &net->xfrm.state_all, all) {
+ if (x->state == XFRM_STATE_DEAD)
+ continue;
+ state = container_of(x, struct xfrm_state, km);
+ if (!xfrm_id_proto_match(state->id.proto, walk->proto))
+ continue;
+ if (!__xfrm_state_filter_match(state, walk->filter))
+ continue;
+ err = func(state, walk->seq, data);
+ if (err) {
+ list_move_tail(&walk->all, &x->all);
+ goto out;
+ }
+ walk->seq++;
+ }
+ if (walk->seq == 0) {
+ err = -ENOENT;
+ goto out;
+ }
+ list_del_init(&walk->all);
+out:
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_walk);
+
+void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
+ struct xfrm_address_filter *filter)
+{
+ INIT_LIST_HEAD(&walk->all);
+ walk->proto = proto;
+ walk->state = XFRM_STATE_DEAD;
+ walk->seq = 0;
+ walk->filter = filter;
+}
+EXPORT_SYMBOL(xfrm_state_walk_init);
+
+void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net)
+{
+ kfree(walk->filter);
+
+ if (list_empty(&walk->all))
+ return;
+
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ list_del(&walk->all);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_state_walk_done);
+
+static void xfrm_replay_timer_handler(unsigned long data)
+{
+ struct xfrm_state *x = (struct xfrm_state *)data;
+
+ spin_lock(&x->lock);
+
+ if (x->km.state == XFRM_STATE_VALID) {
+ if (xfrm_aevent_is_on(xs_net(x)))
+ x->repl->notify(x, XFRM_REPLAY_TIMEOUT);
+ else
+ x->xflags |= XFRM_TIME_DEFER;
+ }
+
+ spin_unlock(&x->lock);
+}
+
+static LIST_HEAD(xfrm_km_list);
+
+void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+ struct xfrm_mgr *km;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list)
+ if (km->notify_policy)
+ km->notify_policy(xp, dir, c);
+ rcu_read_unlock();
+}
+
+void km_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+ struct xfrm_mgr *km;
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list)
+ if (km->notify)
+ km->notify(x, c);
+ rcu_read_unlock();
+}
+
+EXPORT_SYMBOL(km_policy_notify);
+EXPORT_SYMBOL(km_state_notify);
+
+void km_state_expired(struct xfrm_state *x, int hard, u32 portid)
+{
+ struct km_event c;
+
+ c.data.hard = hard;
+ c.portid = portid;
+ c.event = XFRM_MSG_EXPIRE;
+ km_state_notify(x, &c);
+}
+
+EXPORT_SYMBOL(km_state_expired);
+/*
+ * We send to all registered managers regardless of failure
+ * We are happy with one success
+*/
+int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
+{
+ int err = -EINVAL, acqret;
+ struct xfrm_mgr *km;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ acqret = km->acquire(x, t, pol);
+ if (!acqret)
+ err = acqret;
+ }
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(km_query);
+
+int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
+{
+ int err = -EINVAL;
+ struct xfrm_mgr *km;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ if (km->new_mapping)
+ err = km->new_mapping(x, ipaddr, sport);
+ if (!err)
+ break;
+ }
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(km_new_mapping);
+
+void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid)
+{
+ struct km_event c;
+
+ c.data.hard = hard;
+ c.portid = portid;
+ c.event = XFRM_MSG_POLEXPIRE;
+ km_policy_notify(pol, dir, &c);
+}
+EXPORT_SYMBOL(km_policy_expired);
+
+#ifdef CONFIG_XFRM_MIGRATE
+int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
+{
+ int err = -EINVAL;
+ int ret;
+ struct xfrm_mgr *km;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ if (km->migrate) {
+ ret = km->migrate(sel, dir, type, m, num_migrate, k);
+ if (!ret)
+ err = ret;
+ }
+ }
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(km_migrate);
+#endif
+
+int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+ int err = -EINVAL;
+ int ret;
+ struct xfrm_mgr *km;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ if (km->report) {
+ ret = km->report(net, proto, sel, addr);
+ if (!ret)
+ err = ret;
+ }
+ }
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(km_report);
+
+bool km_is_alive(const struct km_event *c)
+{
+ struct xfrm_mgr *km;
+ bool is_alive = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ if (km->is_alive && km->is_alive(c)) {
+ is_alive = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_alive;
+}
+EXPORT_SYMBOL(km_is_alive);
+
+int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
+{
+ int err;
+ u8 *data;
+ struct xfrm_mgr *km;
+ struct xfrm_policy *pol = NULL;
+
+ if (optlen <= 0 || optlen > PAGE_SIZE)
+ return -EMSGSIZE;
+
+ data = kmalloc(optlen, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ err = -EFAULT;
+ if (copy_from_user(data, optval, optlen))
+ goto out;
+
+ err = -EINVAL;
+ rcu_read_lock();
+ list_for_each_entry_rcu(km, &xfrm_km_list, list) {
+ pol = km->compile_policy(sk, optname, data,
+ optlen, &err);
+ if (err >= 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err >= 0) {
+ xfrm_sk_policy_insert(sk, err, pol);
+ xfrm_pol_put(pol);
+ err = 0;
+ }
+
+out:
+ kfree(data);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_user_policy);
+
+static DEFINE_SPINLOCK(xfrm_km_lock);
+
+int xfrm_register_km(struct xfrm_mgr *km)
+{
+ spin_lock_bh(&xfrm_km_lock);
+ list_add_tail_rcu(&km->list, &xfrm_km_list);
+ spin_unlock_bh(&xfrm_km_lock);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_register_km);
+
+int xfrm_unregister_km(struct xfrm_mgr *km)
+{
+ spin_lock_bh(&xfrm_km_lock);
+ list_del_rcu(&km->list);
+ spin_unlock_bh(&xfrm_km_lock);
+ synchronize_rcu();
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_unregister_km);
+
+int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
+{
+ int err = 0;
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock_bh(&xfrm_state_afinfo_lock);
+ if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
+ err = -ENOBUFS;
+ else
+ rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo);
+ spin_unlock_bh(&xfrm_state_afinfo_lock);
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_register_afinfo);
+
+int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
+{
+ int err = 0;
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+ if (unlikely(afinfo->family >= NPROTO))
+ return -EAFNOSUPPORT;
+ spin_lock_bh(&xfrm_state_afinfo_lock);
+ if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
+ if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
+ err = -EINVAL;
+ else
+ RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL);
+ }
+ spin_unlock_bh(&xfrm_state_afinfo_lock);
+ synchronize_rcu();
+ return err;
+}
+EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
+
+struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
+{
+ struct xfrm_state_afinfo *afinfo;
+ if (unlikely(family >= NPROTO))
+ return NULL;
+ rcu_read_lock();
+ afinfo = rcu_dereference(xfrm_state_afinfo[family]);
+ if (unlikely(!afinfo))
+ rcu_read_unlock();
+ return afinfo;
+}
+
+void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
+{
+ rcu_read_unlock();
+}
+
+/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
+void xfrm_state_delete_tunnel(struct xfrm_state *x)
+{
+ if (x->tunnel) {
+ struct xfrm_state *t = x->tunnel;
+
+ if (atomic_read(&t->tunnel_users) == 2)
+ xfrm_state_delete(t);
+ atomic_dec(&t->tunnel_users);
+ xfrm_state_put(t);
+ x->tunnel = NULL;
+ }
+}
+EXPORT_SYMBOL(xfrm_state_delete_tunnel);
+
+int xfrm_state_mtu(struct xfrm_state *x, int mtu)
+{
+ int res;
+
+ spin_lock_bh(&x->lock);
+ if (x->km.state == XFRM_STATE_VALID &&
+ x->type && x->type->get_mtu)
+ res = x->type->get_mtu(x, mtu);
+ else
+ res = mtu - x->props.header_len;
+ spin_unlock_bh(&x->lock);
+ return res;
+}
+
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
+{
+ struct xfrm_state_afinfo *afinfo;
+ struct xfrm_mode *inner_mode;
+ int family = x->props.family;
+ int err;
+
+ err = -EAFNOSUPPORT;
+ afinfo = xfrm_state_get_afinfo(family);
+ if (!afinfo)
+ goto error;
+
+ err = 0;
+ if (afinfo->init_flags)
+ err = afinfo->init_flags(x);
+
+ xfrm_state_put_afinfo(afinfo);
+
+ if (err)
+ goto error;
+
+ err = -EPROTONOSUPPORT;
+
+ if (x->sel.family != AF_UNSPEC) {
+ inner_mode = xfrm_get_mode(x->props.mode, x->sel.family);
+ if (inner_mode == NULL)
+ goto error;
+
+ if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
+ family != x->sel.family) {
+ xfrm_put_mode(inner_mode);
+ goto error;
+ }
+
+ x->inner_mode = inner_mode;
+ } else {
+ struct xfrm_mode *inner_mode_iaf;
+ int iafamily = AF_INET;
+
+ inner_mode = xfrm_get_mode(x->props.mode, x->props.family);
+ if (inner_mode == NULL)
+ goto error;
+
+ if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) {
+ xfrm_put_mode(inner_mode);
+ goto error;
+ }
+ x->inner_mode = inner_mode;
+
+ if (x->props.family == AF_INET)
+ iafamily = AF_INET6;
+
+ inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily);
+ if (inner_mode_iaf) {
+ if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL)
+ x->inner_mode_iaf = inner_mode_iaf;
+ else
+ xfrm_put_mode(inner_mode_iaf);
+ }
+ }
+
+ x->type = xfrm_get_type(x->id.proto, family);
+ if (x->type == NULL)
+ goto error;
+
+ err = x->type->init_state(x);
+ if (err)
+ goto error;
+
+ x->outer_mode = xfrm_get_mode(x->props.mode, family);
+ if (x->outer_mode == NULL) {
+ err = -EPROTONOSUPPORT;
+ goto error;
+ }
+
+ if (init_replay) {
+ err = xfrm_init_replay(x);
+ if (err)
+ goto error;
+ }
+
+ x->km.state = XFRM_STATE_VALID;
+
+error:
+ return err;
+}
+
+EXPORT_SYMBOL(__xfrm_init_state);
+
+int xfrm_init_state(struct xfrm_state *x)
+{
+ return __xfrm_init_state(x, true);
+}
+
+EXPORT_SYMBOL(xfrm_init_state);
+
+int __net_init xfrm_state_init(struct net *net)
+{
+ unsigned int sz;
+
+ INIT_LIST_HEAD(&net->xfrm.state_all);
+
+ sz = sizeof(struct hlist_head) * 8;
+
+ net->xfrm.state_bydst = xfrm_hash_alloc(sz);
+ if (!net->xfrm.state_bydst)
+ goto out_bydst;
+ net->xfrm.state_bysrc = xfrm_hash_alloc(sz);
+ if (!net->xfrm.state_bysrc)
+ goto out_bysrc;
+ net->xfrm.state_byspi = xfrm_hash_alloc(sz);
+ if (!net->xfrm.state_byspi)
+ goto out_byspi;
+ net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1);
+
+ net->xfrm.state_num = 0;
+ INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
+ INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
+ INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
+ spin_lock_init(&net->xfrm.xfrm_state_lock);
+ return 0;
+
+out_byspi:
+ xfrm_hash_free(net->xfrm.state_bysrc, sz);
+out_bysrc:
+ xfrm_hash_free(net->xfrm.state_bydst, sz);
+out_bydst:
+ return -ENOMEM;
+}
+
+void xfrm_state_fini(struct net *net)
+{
+ unsigned int sz;
+
+ flush_work(&net->xfrm.state_hash_work);
+ xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
+ flush_work(&net->xfrm.state_gc_work);
+
+ WARN_ON(!list_empty(&net->xfrm.state_all));
+
+ sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head);
+ WARN_ON(!hlist_empty(net->xfrm.state_byspi));
+ xfrm_hash_free(net->xfrm.state_byspi, sz);
+ WARN_ON(!hlist_empty(net->xfrm.state_bysrc));
+ xfrm_hash_free(net->xfrm.state_bysrc, sz);
+ WARN_ON(!hlist_empty(net->xfrm.state_bydst));
+ xfrm_hash_free(net->xfrm.state_bydst, sz);
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+static void xfrm_audit_helper_sainfo(struct xfrm_state *x,
+ struct audit_buffer *audit_buf)
+{
+ struct xfrm_sec_ctx *ctx = x->security;
+ u32 spi = ntohl(x->id.spi);
+
+ if (ctx)
+ audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
+ ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
+
+ switch (x->props.family) {
+ case AF_INET:
+ audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
+ &x->props.saddr.a4, &x->id.daddr.a4);
+ break;
+ case AF_INET6:
+ audit_log_format(audit_buf, " src=%pI6 dst=%pI6",
+ x->props.saddr.a6, x->id.daddr.a6);
+ break;
+ }
+
+ audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi);
+}
+
+static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
+ struct audit_buffer *audit_buf)
+{
+ const struct iphdr *iph4;
+ const struct ipv6hdr *iph6;
+
+ switch (family) {
+ case AF_INET:
+ iph4 = ip_hdr(skb);
+ audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
+ &iph4->saddr, &iph4->daddr);
+ break;
+ case AF_INET6:
+ iph6 = ipv6_hdr(skb);
+ audit_log_format(audit_buf,
+ " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x",
+ &iph6->saddr, &iph6->daddr,
+ iph6->flow_lbl[0] & 0x0f,
+ iph6->flow_lbl[1],
+ iph6->flow_lbl[2]);
+ break;
+ }
+}
+
+void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid)
+{
+ struct audit_buffer *audit_buf;
+
+ audit_buf = xfrm_audit_start("SAD-add");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_usrinfo(task_valid, audit_buf);
+ xfrm_audit_helper_sainfo(x, audit_buf);
+ audit_log_format(audit_buf, " res=%u", result);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_add);
+
+void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid)
+{
+ struct audit_buffer *audit_buf;
+
+ audit_buf = xfrm_audit_start("SAD-delete");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_usrinfo(task_valid, audit_buf);
+ xfrm_audit_helper_sainfo(x, audit_buf);
+ audit_log_format(audit_buf, " res=%u", result);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_delete);
+
+void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
+ struct sk_buff *skb)
+{
+ struct audit_buffer *audit_buf;
+ u32 spi;
+
+ audit_buf = xfrm_audit_start("SA-replay-overflow");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);
+ /* don't record the sequence number because it's inherent in this kind
+ * of audit message */
+ spi = ntohl(x->id.spi);
+ audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow);
+
+void xfrm_audit_state_replay(struct xfrm_state *x,
+ struct sk_buff *skb, __be32 net_seq)
+{
+ struct audit_buffer *audit_buf;
+ u32 spi;
+
+ audit_buf = xfrm_audit_start("SA-replayed-pkt");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);
+ spi = ntohl(x->id.spi);
+ audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
+ spi, spi, ntohl(net_seq));
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_replay);
+
+void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family)
+{
+ struct audit_buffer *audit_buf;
+
+ audit_buf = xfrm_audit_start("SA-notfound");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_pktinfo(skb, family, audit_buf);
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple);
+
+void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
+ __be32 net_spi, __be32 net_seq)
+{
+ struct audit_buffer *audit_buf;
+ u32 spi;
+
+ audit_buf = xfrm_audit_start("SA-notfound");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_pktinfo(skb, family, audit_buf);
+ spi = ntohl(net_spi);
+ audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
+ spi, spi, ntohl(net_seq));
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound);
+
+void xfrm_audit_state_icvfail(struct xfrm_state *x,
+ struct sk_buff *skb, u8 proto)
+{
+ struct audit_buffer *audit_buf;
+ __be32 net_spi;
+ __be32 net_seq;
+
+ audit_buf = xfrm_audit_start("SA-icv-failure");
+ if (audit_buf == NULL)
+ return;
+ xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf);
+ if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) {
+ u32 spi = ntohl(net_spi);
+ audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u",
+ spi, spi, ntohl(net_seq));
+ }
+ audit_log_end(audit_buf);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail);
+#endif /* CONFIG_AUDITSYSCALL */
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
new file mode 100644
index 000000000..05a6e3d9c
--- /dev/null
+++ b/net/xfrm/xfrm_sysctl.c
@@ -0,0 +1,86 @@
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/xfrm.h>
+
+static void __net_init __xfrm_sysctl_init(struct net *net)
+{
+ net->xfrm.sysctl_aevent_etime = XFRM_AE_ETIME;
+ net->xfrm.sysctl_aevent_rseqth = XFRM_AE_SEQT_SIZE;
+ net->xfrm.sysctl_larval_drop = 1;
+ net->xfrm.sysctl_acq_expires = 30;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm_table[] = {
+ {
+ .procname = "xfrm_aevent_etime",
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "xfrm_aevent_rseqth",
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "xfrm_larval_drop",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "xfrm_acq_expires",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {}
+};
+
+int __net_init xfrm_sysctl_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ __xfrm_sysctl_init(net);
+
+ table = kmemdup(xfrm_table, sizeof(xfrm_table), GFP_KERNEL);
+ if (!table)
+ goto out_kmemdup;
+ table[0].data = &net->xfrm.sysctl_aevent_etime;
+ table[1].data = &net->xfrm.sysctl_aevent_rseqth;
+ table[2].data = &net->xfrm.sysctl_larval_drop;
+ table[3].data = &net->xfrm.sysctl_acq_expires;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ table[0].procname = NULL;
+
+ net->xfrm.sysctl_hdr = register_net_sysctl(net, "net/core", table);
+ if (!net->xfrm.sysctl_hdr)
+ goto out_register;
+ return 0;
+
+out_register:
+ kfree(table);
+out_kmemdup:
+ return -ENOMEM;
+}
+
+void __net_exit xfrm_sysctl_fini(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->xfrm.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->xfrm.sysctl_hdr);
+ kfree(table);
+}
+#else
+int __net_init xfrm_sysctl_init(struct net *net)
+{
+ __xfrm_sysctl_init(net);
+ return 0;
+}
+#endif
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
new file mode 100644
index 000000000..209166429
--- /dev/null
+++ b/net/xfrm/xfrm_user.c
@@ -0,0 +1,3146 @@
+/* xfrm_user.c: User interface to configure xfrm engine.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ * Changes:
+ * Mitsuru KANDA @USAGI
+ * Kazunori MIYAZAWA @USAGI
+ * Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * IPv6 support
+ *
+ */
+
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/netlink.h>
+#include <net/ah.h>
+#include <asm/uaccess.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#endif
+
+static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
+{
+ struct nlattr *rt = attrs[type];
+ struct xfrm_algo *algp;
+
+ if (!rt)
+ return 0;
+
+ algp = nla_data(rt);
+ if (nla_len(rt) < xfrm_alg_len(algp))
+ return -EINVAL;
+
+ switch (type) {
+ case XFRMA_ALG_AUTH:
+ case XFRMA_ALG_CRYPT:
+ case XFRMA_ALG_COMP:
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+ return 0;
+}
+
+static int verify_auth_trunc(struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC];
+ struct xfrm_algo_auth *algp;
+
+ if (!rt)
+ return 0;
+
+ algp = nla_data(rt);
+ if (nla_len(rt) < xfrm_alg_auth_len(algp))
+ return -EINVAL;
+
+ algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+ return 0;
+}
+
+static int verify_aead(struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_ALG_AEAD];
+ struct xfrm_algo_aead *algp;
+
+ if (!rt)
+ return 0;
+
+ algp = nla_data(rt);
+ if (nla_len(rt) < aead_len(algp))
+ return -EINVAL;
+
+ algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+ return 0;
+}
+
+static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type,
+ xfrm_address_t **addrp)
+{
+ struct nlattr *rt = attrs[type];
+
+ if (rt && addrp)
+ *addrp = nla_data(rt);
+}
+
+static inline int verify_sec_ctx_len(struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!rt)
+ return 0;
+
+ uctx = nla_data(rt);
+ if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline int verify_replay(struct xfrm_usersa_info *p,
+ struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL];
+ struct xfrm_replay_state_esn *rs;
+
+ if (p->flags & XFRM_STATE_ESN) {
+ if (!rt)
+ return -EINVAL;
+
+ rs = nla_data(rt);
+
+ if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8)
+ return -EINVAL;
+
+ if (nla_len(rt) < xfrm_replay_state_esn_len(rs) &&
+ nla_len(rt) != sizeof(*rs))
+ return -EINVAL;
+ }
+
+ if (!rt)
+ return 0;
+
+ /* As only ESP and AH support ESN feature. */
+ if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH))
+ return -EINVAL;
+
+ if (p->replay_window != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int verify_newsa_info(struct xfrm_usersa_info *p,
+ struct nlattr **attrs)
+{
+ int err;
+
+ err = -EINVAL;
+ switch (p->family) {
+ case AF_INET:
+ break;
+
+ case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+ break;
+#else
+ err = -EAFNOSUPPORT;
+ goto out;
+#endif
+
+ default:
+ goto out;
+ }
+
+ err = -EINVAL;
+ switch (p->id.proto) {
+ case IPPROTO_AH:
+ if ((!attrs[XFRMA_ALG_AUTH] &&
+ !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
+ attrs[XFRMA_ALG_AEAD] ||
+ attrs[XFRMA_ALG_CRYPT] ||
+ attrs[XFRMA_ALG_COMP] ||
+ attrs[XFRMA_TFCPAD])
+ goto out;
+ break;
+
+ case IPPROTO_ESP:
+ if (attrs[XFRMA_ALG_COMP])
+ goto out;
+ if (!attrs[XFRMA_ALG_AUTH] &&
+ !attrs[XFRMA_ALG_AUTH_TRUNC] &&
+ !attrs[XFRMA_ALG_CRYPT] &&
+ !attrs[XFRMA_ALG_AEAD])
+ goto out;
+ if ((attrs[XFRMA_ALG_AUTH] ||
+ attrs[XFRMA_ALG_AUTH_TRUNC] ||
+ attrs[XFRMA_ALG_CRYPT]) &&
+ attrs[XFRMA_ALG_AEAD])
+ goto out;
+ if (attrs[XFRMA_TFCPAD] &&
+ p->mode != XFRM_MODE_TUNNEL)
+ goto out;
+ break;
+
+ case IPPROTO_COMP:
+ if (!attrs[XFRMA_ALG_COMP] ||
+ attrs[XFRMA_ALG_AEAD] ||
+ attrs[XFRMA_ALG_AUTH] ||
+ attrs[XFRMA_ALG_AUTH_TRUNC] ||
+ attrs[XFRMA_ALG_CRYPT] ||
+ attrs[XFRMA_TFCPAD] ||
+ (ntohl(p->id.spi) >= 0x10000))
+ goto out;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ if (attrs[XFRMA_ALG_COMP] ||
+ attrs[XFRMA_ALG_AUTH] ||
+ attrs[XFRMA_ALG_AUTH_TRUNC] ||
+ attrs[XFRMA_ALG_AEAD] ||
+ attrs[XFRMA_ALG_CRYPT] ||
+ attrs[XFRMA_ENCAP] ||
+ attrs[XFRMA_SEC_CTX] ||
+ attrs[XFRMA_TFCPAD] ||
+ !attrs[XFRMA_COADDR])
+ goto out;
+ break;
+#endif
+
+ default:
+ goto out;
+ }
+
+ if ((err = verify_aead(attrs)))
+ goto out;
+ if ((err = verify_auth_trunc(attrs)))
+ goto out;
+ if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH)))
+ goto out;
+ if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT)))
+ goto out;
+ if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
+ goto out;
+ if ((err = verify_sec_ctx_len(attrs)))
+ goto out;
+ if ((err = verify_replay(p, attrs)))
+ goto out;
+
+ err = -EINVAL;
+ switch (p->mode) {
+ case XFRM_MODE_TRANSPORT:
+ case XFRM_MODE_TUNNEL:
+ case XFRM_MODE_ROUTEOPTIMIZATION:
+ case XFRM_MODE_BEET:
+ break;
+
+ default:
+ goto out;
+ }
+
+ err = 0;
+
+out:
+ return err;
+}
+
+static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
+ struct xfrm_algo_desc *(*get_byname)(const char *, int),
+ struct nlattr *rta)
+{
+ struct xfrm_algo *p, *ualg;
+ struct xfrm_algo_desc *algo;
+
+ if (!rta)
+ return 0;
+
+ ualg = nla_data(rta);
+
+ algo = get_byname(ualg->alg_name, 1);
+ if (!algo)
+ return -ENOSYS;
+ *props = algo->desc.sadb_alg_id;
+
+ p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ strcpy(p->alg_name, algo->name);
+ *algpp = p;
+ return 0;
+}
+
+static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
+ struct nlattr *rta)
+{
+ struct xfrm_algo *ualg;
+ struct xfrm_algo_auth *p;
+ struct xfrm_algo_desc *algo;
+
+ if (!rta)
+ return 0;
+
+ ualg = nla_data(rta);
+
+ algo = xfrm_aalg_get_byname(ualg->alg_name, 1);
+ if (!algo)
+ return -ENOSYS;
+ *props = algo->desc.sadb_alg_id;
+
+ p = kmalloc(sizeof(*p) + (ualg->alg_key_len + 7) / 8, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ strcpy(p->alg_name, algo->name);
+ p->alg_key_len = ualg->alg_key_len;
+ p->alg_trunc_len = algo->uinfo.auth.icv_truncbits;
+ memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8);
+
+ *algpp = p;
+ return 0;
+}
+
+static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
+ struct nlattr *rta)
+{
+ struct xfrm_algo_auth *p, *ualg;
+ struct xfrm_algo_desc *algo;
+
+ if (!rta)
+ return 0;
+
+ ualg = nla_data(rta);
+
+ algo = xfrm_aalg_get_byname(ualg->alg_name, 1);
+ if (!algo)
+ return -ENOSYS;
+ if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits)
+ return -EINVAL;
+ *props = algo->desc.sadb_alg_id;
+
+ p = kmemdup(ualg, xfrm_alg_auth_len(ualg), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ strcpy(p->alg_name, algo->name);
+ if (!p->alg_trunc_len)
+ p->alg_trunc_len = algo->uinfo.auth.icv_truncbits;
+
+ *algpp = p;
+ return 0;
+}
+
+static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props,
+ struct nlattr *rta)
+{
+ struct xfrm_algo_aead *p, *ualg;
+ struct xfrm_algo_desc *algo;
+
+ if (!rta)
+ return 0;
+
+ ualg = nla_data(rta);
+
+ algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1);
+ if (!algo)
+ return -ENOSYS;
+ *props = algo->desc.sadb_alg_id;
+
+ p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ strcpy(p->alg_name, algo->name);
+ *algpp = p;
+ return 0;
+}
+
+static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn,
+ struct nlattr *rp)
+{
+ struct xfrm_replay_state_esn *up;
+ int ulen;
+
+ if (!replay_esn || !rp)
+ return 0;
+
+ up = nla_data(rp);
+ ulen = xfrm_replay_state_esn_len(up);
+
+ if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn,
+ struct xfrm_replay_state_esn **preplay_esn,
+ struct nlattr *rta)
+{
+ struct xfrm_replay_state_esn *p, *pp, *up;
+ int klen, ulen;
+
+ if (!rta)
+ return 0;
+
+ up = nla_data(rta);
+ klen = xfrm_replay_state_esn_len(up);
+ ulen = nla_len(rta) >= klen ? klen : sizeof(*up);
+
+ p = kzalloc(klen, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ pp = kzalloc(klen, GFP_KERNEL);
+ if (!pp) {
+ kfree(p);
+ return -ENOMEM;
+ }
+
+ memcpy(p, up, ulen);
+ memcpy(pp, up, ulen);
+
+ *replay_esn = p;
+ *preplay_esn = pp;
+
+ return 0;
+}
+
+static inline int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx)
+{
+ int len = 0;
+
+ if (xfrm_ctx) {
+ len += sizeof(struct xfrm_user_sec_ctx);
+ len += xfrm_ctx->ctx_len;
+ }
+ return len;
+}
+
+static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
+{
+ memcpy(&x->id, &p->id, sizeof(x->id));
+ memcpy(&x->sel, &p->sel, sizeof(x->sel));
+ memcpy(&x->lft, &p->lft, sizeof(x->lft));
+ x->props.mode = p->mode;
+ x->props.replay_window = min_t(unsigned int, p->replay_window,
+ sizeof(x->replay.bitmap) * 8);
+ x->props.reqid = p->reqid;
+ x->props.family = p->family;
+ memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));
+ x->props.flags = p->flags;
+
+ if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC))
+ x->sel.family = p->family;
+}
+
+/*
+ * someday when pfkey also has support, we could have the code
+ * somehow made shareable and move it to xfrm_state.c - JHS
+ *
+*/
+static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
+ int update_esn)
+{
+ struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+ struct nlattr *re = update_esn ? attrs[XFRMA_REPLAY_ESN_VAL] : NULL;
+ struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
+ struct nlattr *et = attrs[XFRMA_ETIMER_THRESH];
+ struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH];
+
+ if (re) {
+ struct xfrm_replay_state_esn *replay_esn;
+ replay_esn = nla_data(re);
+ memcpy(x->replay_esn, replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ memcpy(x->preplay_esn, replay_esn,
+ xfrm_replay_state_esn_len(replay_esn));
+ }
+
+ if (rp) {
+ struct xfrm_replay_state *replay;
+ replay = nla_data(rp);
+ memcpy(&x->replay, replay, sizeof(*replay));
+ memcpy(&x->preplay, replay, sizeof(*replay));
+ }
+
+ if (lt) {
+ struct xfrm_lifetime_cur *ltime;
+ ltime = nla_data(lt);
+ x->curlft.bytes = ltime->bytes;
+ x->curlft.packets = ltime->packets;
+ x->curlft.add_time = ltime->add_time;
+ x->curlft.use_time = ltime->use_time;
+ }
+
+ if (et)
+ x->replay_maxage = nla_get_u32(et);
+
+ if (rt)
+ x->replay_maxdiff = nla_get_u32(rt);
+}
+
+static struct xfrm_state *xfrm_state_construct(struct net *net,
+ struct xfrm_usersa_info *p,
+ struct nlattr **attrs,
+ int *errp)
+{
+ struct xfrm_state *x = xfrm_state_alloc(net);
+ int err = -ENOMEM;
+
+ if (!x)
+ goto error_no_put;
+
+ copy_from_user_state(x, p);
+
+ if (attrs[XFRMA_SA_EXTRA_FLAGS])
+ x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);
+
+ if ((err = attach_aead(&x->aead, &x->props.ealgo,
+ attrs[XFRMA_ALG_AEAD])))
+ goto error;
+ if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
+ attrs[XFRMA_ALG_AUTH_TRUNC])))
+ goto error;
+ if (!x->props.aalgo) {
+ if ((err = attach_auth(&x->aalg, &x->props.aalgo,
+ attrs[XFRMA_ALG_AUTH])))
+ goto error;
+ }
+ if ((err = attach_one_algo(&x->ealg, &x->props.ealgo,
+ xfrm_ealg_get_byname,
+ attrs[XFRMA_ALG_CRYPT])))
+ goto error;
+ if ((err = attach_one_algo(&x->calg, &x->props.calgo,
+ xfrm_calg_get_byname,
+ attrs[XFRMA_ALG_COMP])))
+ goto error;
+
+ if (attrs[XFRMA_ENCAP]) {
+ x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
+ sizeof(*x->encap), GFP_KERNEL);
+ if (x->encap == NULL)
+ goto error;
+ }
+
+ if (attrs[XFRMA_TFCPAD])
+ x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]);
+
+ if (attrs[XFRMA_COADDR]) {
+ x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
+ sizeof(*x->coaddr), GFP_KERNEL);
+ if (x->coaddr == NULL)
+ goto error;
+ }
+
+ xfrm_mark_get(attrs, &x->mark);
+
+ err = __xfrm_init_state(x, false);
+ if (err)
+ goto error;
+
+ if (attrs[XFRMA_SEC_CTX] &&
+ security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
+ goto error;
+
+ if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
+ attrs[XFRMA_REPLAY_ESN_VAL])))
+ goto error;
+
+ x->km.seq = p->seq;
+ x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth;
+ /* sysctl_xfrm_aevent_etime is in 100ms units */
+ x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;
+
+ if ((err = xfrm_init_replay(x)))
+ goto error;
+
+ /* override default values from above */
+ xfrm_update_ae_params(x, attrs, 0);
+
+ return x;
+
+error:
+ x->km.state = XFRM_STATE_DEAD;
+ xfrm_state_put(x);
+error_no_put:
+ *errp = err;
+ return NULL;
+}
+
+static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_usersa_info *p = nlmsg_data(nlh);
+ struct xfrm_state *x;
+ int err;
+ struct km_event c;
+
+ err = verify_newsa_info(p, attrs);
+ if (err)
+ return err;
+
+ x = xfrm_state_construct(net, p, attrs, &err);
+ if (!x)
+ return err;
+
+ xfrm_state_hold(x);
+ if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
+ err = xfrm_state_add(x);
+ else
+ err = xfrm_state_update(x);
+
+ xfrm_audit_state_add(x, err ? 0 : 1, true);
+
+ if (err < 0) {
+ x->km.state = XFRM_STATE_DEAD;
+ __xfrm_state_put(x);
+ goto out;
+ }
+
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ c.event = nlh->nlmsg_type;
+
+ km_state_notify(x, &c);
+out:
+ xfrm_state_put(x);
+ return err;
+}
+
+static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
+ struct xfrm_usersa_id *p,
+ struct nlattr **attrs,
+ int *errp)
+{
+ struct xfrm_state *x = NULL;
+ struct xfrm_mark m;
+ int err;
+ u32 mark = xfrm_mark_get(attrs, &m);
+
+ if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
+ err = -ESRCH;
+ x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family);
+ } else {
+ xfrm_address_t *saddr = NULL;
+
+ verify_one_addr(attrs, XFRMA_SRCADDR, &saddr);
+ if (!saddr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ESRCH;
+ x = xfrm_state_lookup_byaddr(net, mark,
+ &p->daddr, saddr,
+ p->proto, p->family);
+ }
+
+ out:
+ if (!x && errp)
+ *errp = err;
+ return x;
+}
+
+static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state *x;
+ int err = -ESRCH;
+ struct km_event c;
+ struct xfrm_usersa_id *p = nlmsg_data(nlh);
+
+ x = xfrm_user_state_lookup(net, p, attrs, &err);
+ if (x == NULL)
+ return err;
+
+ if ((err = security_xfrm_state_delete(x)) != 0)
+ goto out;
+
+ if (xfrm_state_kern(x)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ err = xfrm_state_delete(x);
+
+ if (err < 0)
+ goto out;
+
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ c.event = nlh->nlmsg_type;
+ km_state_notify(x, &c);
+
+out:
+ xfrm_audit_state_delete(x, err ? 0 : 1, true);
+ xfrm_state_put(x);
+ return err;
+}
+
+static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
+{
+ memset(p, 0, sizeof(*p));
+ memcpy(&p->id, &x->id, sizeof(p->id));
+ memcpy(&p->sel, &x->sel, sizeof(p->sel));
+ memcpy(&p->lft, &x->lft, sizeof(p->lft));
+ memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
+ memcpy(&p->stats, &x->stats, sizeof(p->stats));
+ memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr));
+ p->mode = x->props.mode;
+ p->replay_window = x->props.replay_window;
+ p->reqid = x->props.reqid;
+ p->family = x->props.family;
+ p->flags = x->props.flags;
+ p->seq = x->km.seq;
+}
+
+struct xfrm_dump_info {
+ struct sk_buff *in_skb;
+ struct sk_buff *out_skb;
+ u32 nlmsg_seq;
+ u16 nlmsg_flags;
+};
+
+static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb)
+{
+ struct xfrm_user_sec_ctx *uctx;
+ struct nlattr *attr;
+ int ctx_size = sizeof(*uctx) + s->ctx_len;
+
+ attr = nla_reserve(skb, XFRMA_SEC_CTX, ctx_size);
+ if (attr == NULL)
+ return -EMSGSIZE;
+
+ uctx = nla_data(attr);
+ uctx->exttype = XFRMA_SEC_CTX;
+ uctx->len = ctx_size;
+ uctx->ctx_doi = s->ctx_doi;
+ uctx->ctx_alg = s->ctx_alg;
+ uctx->ctx_len = s->ctx_len;
+ memcpy(uctx + 1, s->ctx_str, s->ctx_len);
+
+ return 0;
+}
+
+static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
+{
+ struct xfrm_algo *algo;
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, XFRMA_ALG_AUTH,
+ sizeof(*algo) + (auth->alg_key_len + 7) / 8);
+ if (!nla)
+ return -EMSGSIZE;
+
+ algo = nla_data(nla);
+ strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name));
+ memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8);
+ algo->alg_key_len = auth->alg_key_len;
+
+ return 0;
+}
+
+/* Don't change this without updating xfrm_sa_len! */
+static int copy_to_user_state_extra(struct xfrm_state *x,
+ struct xfrm_usersa_info *p,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+
+ copy_to_user_state(x, p);
+
+ if (x->props.extra_flags) {
+ ret = nla_put_u32(skb, XFRMA_SA_EXTRA_FLAGS,
+ x->props.extra_flags);
+ if (ret)
+ goto out;
+ }
+
+ if (x->coaddr) {
+ ret = nla_put(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
+ if (ret)
+ goto out;
+ }
+ if (x->lastused) {
+ ret = nla_put_u64(skb, XFRMA_LASTUSED, x->lastused);
+ if (ret)
+ goto out;
+ }
+ if (x->aead) {
+ ret = nla_put(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead);
+ if (ret)
+ goto out;
+ }
+ if (x->aalg) {
+ ret = copy_to_user_auth(x->aalg, skb);
+ if (!ret)
+ ret = nla_put(skb, XFRMA_ALG_AUTH_TRUNC,
+ xfrm_alg_auth_len(x->aalg), x->aalg);
+ if (ret)
+ goto out;
+ }
+ if (x->ealg) {
+ ret = nla_put(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg);
+ if (ret)
+ goto out;
+ }
+ if (x->calg) {
+ ret = nla_put(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+ if (ret)
+ goto out;
+ }
+ if (x->encap) {
+ ret = nla_put(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+ if (ret)
+ goto out;
+ }
+ if (x->tfcpad) {
+ ret = nla_put_u32(skb, XFRMA_TFCPAD, x->tfcpad);
+ if (ret)
+ goto out;
+ }
+ ret = xfrm_mark_put(skb, &x->mark);
+ if (ret)
+ goto out;
+ if (x->replay_esn)
+ ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
+ xfrm_replay_state_esn_len(x->replay_esn),
+ x->replay_esn);
+ else
+ ret = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
+ &x->replay);
+ if (ret)
+ goto out;
+ if (x->security)
+ ret = copy_sec_ctx(x->security, skb);
+out:
+ return ret;
+}
+
+static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
+{
+ struct xfrm_dump_info *sp = ptr;
+ struct sk_buff *in_skb = sp->in_skb;
+ struct sk_buff *skb = sp->out_skb;
+ struct xfrm_usersa_info *p;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, sp->nlmsg_seq,
+ XFRM_MSG_NEWSA, sizeof(*p), sp->nlmsg_flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ p = nlmsg_data(nlh);
+
+ err = copy_to_user_state_extra(x, p, skb);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_dump_sa_done(struct netlink_callback *cb)
+{
+ struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
+ struct sock *sk = cb->skb->sk;
+ struct net *net = sock_net(sk);
+
+ xfrm_state_walk_done(walk, net);
+ return 0;
+}
+
+static const struct nla_policy xfrma_policy[XFRMA_MAX+1];
+static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
+ struct xfrm_dump_info info;
+
+ BUILD_BUG_ON(sizeof(struct xfrm_state_walk) >
+ sizeof(cb->args) - sizeof(cb->args[0]));
+
+ info.in_skb = cb->skb;
+ info.out_skb = skb;
+ info.nlmsg_seq = cb->nlh->nlmsg_seq;
+ info.nlmsg_flags = NLM_F_MULTI;
+
+ if (!cb->args[0]) {
+ struct nlattr *attrs[XFRMA_MAX+1];
+ struct xfrm_address_filter *filter = NULL;
+ u8 proto = 0;
+ int err;
+
+ cb->args[0] = 1;
+
+ err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX,
+ xfrma_policy);
+ if (err < 0)
+ return err;
+
+ if (attrs[XFRMA_ADDRESS_FILTER]) {
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (filter == NULL)
+ return -ENOMEM;
+
+ memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]),
+ sizeof(*filter));
+ }
+
+ if (attrs[XFRMA_PROTO])
+ proto = nla_get_u8(attrs[XFRMA_PROTO]);
+
+ xfrm_state_walk_init(walk, proto, filter);
+ }
+
+ (void) xfrm_state_walk(net, walk, dump_one_state, &info);
+
+ return skb->len;
+}
+
+static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
+ struct xfrm_state *x, u32 seq)
+{
+ struct xfrm_dump_info info;
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ info.in_skb = in_skb;
+ info.out_skb = skb;
+ info.nlmsg_seq = seq;
+ info.nlmsg_flags = 0;
+
+ err = dump_one_state(x, 0, &info);
+ if (err) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+/* A wrapper for nlmsg_multicast() checking that nlsk is still available.
+ * Must be called with RCU read lock.
+ */
+static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
+ u32 pid, unsigned int group)
+{
+ struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
+
+ if (nlsk)
+ return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
+ else
+ return -1;
+}
+
+static inline size_t xfrm_spdinfo_msgsize(void)
+{
+ return NLMSG_ALIGN(4)
+ + nla_total_size(sizeof(struct xfrmu_spdinfo))
+ + nla_total_size(sizeof(struct xfrmu_spdhinfo))
+ + nla_total_size(sizeof(struct xfrmu_spdhthresh))
+ + nla_total_size(sizeof(struct xfrmu_spdhthresh));
+}
+
+static int build_spdinfo(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, u32 flags)
+{
+ struct xfrmk_spdinfo si;
+ struct xfrmu_spdinfo spc;
+ struct xfrmu_spdhinfo sph;
+ struct xfrmu_spdhthresh spt4, spt6;
+ struct nlmsghdr *nlh;
+ int err;
+ u32 *f;
+ unsigned lseq;
+
+ nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
+ if (nlh == NULL) /* shouldn't really happen ... */
+ return -EMSGSIZE;
+
+ f = nlmsg_data(nlh);
+ *f = flags;
+ xfrm_spd_getinfo(net, &si);
+ spc.incnt = si.incnt;
+ spc.outcnt = si.outcnt;
+ spc.fwdcnt = si.fwdcnt;
+ spc.inscnt = si.inscnt;
+ spc.outscnt = si.outscnt;
+ spc.fwdscnt = si.fwdscnt;
+ sph.spdhcnt = si.spdhcnt;
+ sph.spdhmcnt = si.spdhmcnt;
+
+ do {
+ lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock);
+
+ spt4.lbits = net->xfrm.policy_hthresh.lbits4;
+ spt4.rbits = net->xfrm.policy_hthresh.rbits4;
+ spt6.lbits = net->xfrm.policy_hthresh.lbits6;
+ spt6.rbits = net->xfrm.policy_hthresh.rbits6;
+ } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq));
+
+ err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc);
+ if (!err)
+ err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph);
+ if (!err)
+ err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4);
+ if (!err)
+ err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrmu_spdhthresh *thresh4 = NULL;
+ struct xfrmu_spdhthresh *thresh6 = NULL;
+
+ /* selector prefixlen thresholds to hash policies */
+ if (attrs[XFRMA_SPD_IPV4_HTHRESH]) {
+ struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH];
+
+ if (nla_len(rta) < sizeof(*thresh4))
+ return -EINVAL;
+ thresh4 = nla_data(rta);
+ if (thresh4->lbits > 32 || thresh4->rbits > 32)
+ return -EINVAL;
+ }
+ if (attrs[XFRMA_SPD_IPV6_HTHRESH]) {
+ struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH];
+
+ if (nla_len(rta) < sizeof(*thresh6))
+ return -EINVAL;
+ thresh6 = nla_data(rta);
+ if (thresh6->lbits > 128 || thresh6->rbits > 128)
+ return -EINVAL;
+ }
+
+ if (thresh4 || thresh6) {
+ write_seqlock(&net->xfrm.policy_hthresh.lock);
+ if (thresh4) {
+ net->xfrm.policy_hthresh.lbits4 = thresh4->lbits;
+ net->xfrm.policy_hthresh.rbits4 = thresh4->rbits;
+ }
+ if (thresh6) {
+ net->xfrm.policy_hthresh.lbits6 = thresh6->lbits;
+ net->xfrm.policy_hthresh.rbits6 = thresh6->rbits;
+ }
+ write_sequnlock(&net->xfrm.policy_hthresh.lock);
+
+ xfrm_policy_hash_rebuild(net);
+ }
+
+ return 0;
+}
+
+static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct sk_buff *r_skb;
+ u32 *flags = nlmsg_data(nlh);
+ u32 sportid = NETLINK_CB(skb).portid;
+ u32 seq = nlh->nlmsg_seq;
+
+ r_skb = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
+ if (r_skb == NULL)
+ return -ENOMEM;
+
+ if (build_spdinfo(r_skb, net, sportid, seq, *flags) < 0)
+ BUG();
+
+ return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
+}
+
+static inline size_t xfrm_sadinfo_msgsize(void)
+{
+ return NLMSG_ALIGN(4)
+ + nla_total_size(sizeof(struct xfrmu_sadhinfo))
+ + nla_total_size(4); /* XFRMA_SAD_CNT */
+}
+
+static int build_sadinfo(struct sk_buff *skb, struct net *net,
+ u32 portid, u32 seq, u32 flags)
+{
+ struct xfrmk_sadinfo si;
+ struct xfrmu_sadhinfo sh;
+ struct nlmsghdr *nlh;
+ int err;
+ u32 *f;
+
+ nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0);
+ if (nlh == NULL) /* shouldn't really happen ... */
+ return -EMSGSIZE;
+
+ f = nlmsg_data(nlh);
+ *f = flags;
+ xfrm_sad_getinfo(net, &si);
+
+ sh.sadhmcnt = si.sadhmcnt;
+ sh.sadhcnt = si.sadhcnt;
+
+ err = nla_put_u32(skb, XFRMA_SAD_CNT, si.sadcnt);
+ if (!err)
+ err = nla_put(skb, XFRMA_SAD_HINFO, sizeof(sh), &sh);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct sk_buff *r_skb;
+ u32 *flags = nlmsg_data(nlh);
+ u32 sportid = NETLINK_CB(skb).portid;
+ u32 seq = nlh->nlmsg_seq;
+
+ r_skb = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC);
+ if (r_skb == NULL)
+ return -ENOMEM;
+
+ if (build_sadinfo(r_skb, net, sportid, seq, *flags) < 0)
+ BUG();
+
+ return nlmsg_unicast(net->xfrm.nlsk, r_skb, sportid);
+}
+
+static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_usersa_id *p = nlmsg_data(nlh);
+ struct xfrm_state *x;
+ struct sk_buff *resp_skb;
+ int err = -ESRCH;
+
+ x = xfrm_user_state_lookup(net, p, attrs, &err);
+ if (x == NULL)
+ goto out_noput;
+
+ resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
+ if (IS_ERR(resp_skb)) {
+ err = PTR_ERR(resp_skb);
+ } else {
+ err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid);
+ }
+ xfrm_state_put(x);
+out_noput:
+ return err;
+}
+
+static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state *x;
+ struct xfrm_userspi_info *p;
+ struct sk_buff *resp_skb;
+ xfrm_address_t *daddr;
+ int family;
+ int err;
+ u32 mark;
+ struct xfrm_mark m;
+
+ p = nlmsg_data(nlh);
+ err = verify_spi_info(p->info.id.proto, p->min, p->max);
+ if (err)
+ goto out_noput;
+
+ family = p->info.family;
+ daddr = &p->info.id.daddr;
+
+ x = NULL;
+
+ mark = xfrm_mark_get(attrs, &m);
+ if (p->info.seq) {
+ x = xfrm_find_acq_byseq(net, mark, p->info.seq);
+ if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
+ xfrm_state_put(x);
+ x = NULL;
+ }
+ }
+
+ if (!x)
+ x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
+ p->info.id.proto, daddr,
+ &p->info.saddr, 1,
+ family);
+ err = -ENOENT;
+ if (x == NULL)
+ goto out_noput;
+
+ err = xfrm_alloc_spi(x, p->min, p->max);
+ if (err)
+ goto out;
+
+ resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
+ if (IS_ERR(resp_skb)) {
+ err = PTR_ERR(resp_skb);
+ goto out;
+ }
+
+ err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid);
+
+out:
+ xfrm_state_put(x);
+out_noput:
+ return err;
+}
+
+static int verify_policy_dir(u8 dir)
+{
+ switch (dir) {
+ case XFRM_POLICY_IN:
+ case XFRM_POLICY_OUT:
+ case XFRM_POLICY_FWD:
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int verify_policy_type(u8 type)
+{
+ switch (type) {
+ case XFRM_POLICY_TYPE_MAIN:
+#ifdef CONFIG_XFRM_SUB_POLICY
+ case XFRM_POLICY_TYPE_SUB:
+#endif
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
+{
+ int ret;
+
+ switch (p->share) {
+ case XFRM_SHARE_ANY:
+ case XFRM_SHARE_SESSION:
+ case XFRM_SHARE_USER:
+ case XFRM_SHARE_UNIQUE:
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ switch (p->action) {
+ case XFRM_POLICY_ALLOW:
+ case XFRM_POLICY_BLOCK:
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ switch (p->sel.family) {
+ case AF_INET:
+ break;
+
+ case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+ break;
+#else
+ return -EAFNOSUPPORT;
+#endif
+
+ default:
+ return -EINVAL;
+ }
+
+ ret = verify_policy_dir(p->dir);
+ if (ret)
+ return ret;
+ if (p->index && ((p->index & XFRM_POLICY_MAX) != p->dir))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+ struct xfrm_user_sec_ctx *uctx;
+
+ if (!rt)
+ return 0;
+
+ uctx = nla_data(rt);
+ return security_xfrm_policy_alloc(&pol->security, uctx, GFP_KERNEL);
+}
+
+static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
+ int nr)
+{
+ int i;
+
+ xp->xfrm_nr = nr;
+ for (i = 0; i < nr; i++, ut++) {
+ struct xfrm_tmpl *t = &xp->xfrm_vec[i];
+
+ memcpy(&t->id, &ut->id, sizeof(struct xfrm_id));
+ memcpy(&t->saddr, &ut->saddr,
+ sizeof(xfrm_address_t));
+ t->reqid = ut->reqid;
+ t->mode = ut->mode;
+ t->share = ut->share;
+ t->optional = ut->optional;
+ t->aalgos = ut->aalgos;
+ t->ealgos = ut->ealgos;
+ t->calgos = ut->calgos;
+ /* If all masks are ~0, then we allow all algorithms. */
+ t->allalgs = !~(t->aalgos & t->ealgos & t->calgos);
+ t->encap_family = ut->family;
+ }
+}
+
+static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
+{
+ int i;
+
+ if (nr > XFRM_MAX_DEPTH)
+ return -EINVAL;
+
+ for (i = 0; i < nr; i++) {
+ /* We never validated the ut->family value, so many
+ * applications simply leave it at zero. The check was
+ * never made and ut->family was ignored because all
+ * templates could be assumed to have the same family as
+ * the policy itself. Now that we will have ipv4-in-ipv6
+ * and ipv6-in-ipv4 tunnels, this is no longer true.
+ */
+ if (!ut[i].family)
+ ut[i].family = family;
+
+ switch (ut[i].family) {
+ case AF_INET:
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_TMPL];
+
+ if (!rt) {
+ pol->xfrm_nr = 0;
+ } else {
+ struct xfrm_user_tmpl *utmpl = nla_data(rt);
+ int nr = nla_len(rt) / sizeof(*utmpl);
+ int err;
+
+ err = validate_tmpl(nr, utmpl, pol->family);
+ if (err)
+ return err;
+
+ copy_templates(pol, utmpl, nr);
+ }
+ return 0;
+}
+
+static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs)
+{
+ struct nlattr *rt = attrs[XFRMA_POLICY_TYPE];
+ struct xfrm_userpolicy_type *upt;
+ u8 type = XFRM_POLICY_TYPE_MAIN;
+ int err;
+
+ if (rt) {
+ upt = nla_data(rt);
+ type = upt->type;
+ }
+
+ err = verify_policy_type(type);
+ if (err)
+ return err;
+
+ *tp = type;
+ return 0;
+}
+
+static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
+{
+ xp->priority = p->priority;
+ xp->index = p->index;
+ memcpy(&xp->selector, &p->sel, sizeof(xp->selector));
+ memcpy(&xp->lft, &p->lft, sizeof(xp->lft));
+ xp->action = p->action;
+ xp->flags = p->flags;
+ xp->family = p->sel.family;
+ /* XXX xp->share = p->share; */
+}
+
+static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir)
+{
+ memset(p, 0, sizeof(*p));
+ memcpy(&p->sel, &xp->selector, sizeof(p->sel));
+ memcpy(&p->lft, &xp->lft, sizeof(p->lft));
+ memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));
+ p->priority = xp->priority;
+ p->index = xp->index;
+ p->sel.family = xp->family;
+ p->dir = dir;
+ p->action = xp->action;
+ p->flags = xp->flags;
+ p->share = XFRM_SHARE_ANY; /* XXX xp->share */
+}
+
+static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp)
+{
+ struct xfrm_policy *xp = xfrm_policy_alloc(net, GFP_KERNEL);
+ int err;
+
+ if (!xp) {
+ *errp = -ENOMEM;
+ return NULL;
+ }
+
+ copy_from_user_policy(xp, p);
+
+ err = copy_from_user_policy_type(&xp->type, attrs);
+ if (err)
+ goto error;
+
+ if (!(err = copy_from_user_tmpl(xp, attrs)))
+ err = copy_from_user_sec_ctx(xp, attrs);
+ if (err)
+ goto error;
+
+ xfrm_mark_get(attrs, &xp->mark);
+
+ return xp;
+ error:
+ *errp = err;
+ xp->walk.dead = 1;
+ xfrm_policy_destroy(xp);
+ return NULL;
+}
+
+static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_userpolicy_info *p = nlmsg_data(nlh);
+ struct xfrm_policy *xp;
+ struct km_event c;
+ int err;
+ int excl;
+
+ err = verify_newpolicy_info(p);
+ if (err)
+ return err;
+ err = verify_sec_ctx_len(attrs);
+ if (err)
+ return err;
+
+ xp = xfrm_policy_construct(net, p, attrs, &err);
+ if (!xp)
+ return err;
+
+ /* shouldn't excl be based on nlh flags??
+ * Aha! this is anti-netlink really i.e more pfkey derived
+ * in netlink excl is a flag and you wouldnt need
+ * a type XFRM_MSG_UPDPOLICY - JHS */
+ excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
+ err = xfrm_policy_insert(p->dir, xp, excl);
+ xfrm_audit_policy_add(xp, err ? 0 : 1, true);
+
+ if (err) {
+ security_xfrm_policy_free(xp->security);
+ kfree(xp);
+ return err;
+ }
+
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ km_policy_notify(xp, p->dir, &c);
+
+ xfrm_pol_put(xp);
+
+ return 0;
+}
+
+static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+ struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
+ int i;
+
+ if (xp->xfrm_nr == 0)
+ return 0;
+
+ for (i = 0; i < xp->xfrm_nr; i++) {
+ struct xfrm_user_tmpl *up = &vec[i];
+ struct xfrm_tmpl *kp = &xp->xfrm_vec[i];
+
+ memset(up, 0, sizeof(*up));
+ memcpy(&up->id, &kp->id, sizeof(up->id));
+ up->family = kp->encap_family;
+ memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr));
+ up->reqid = kp->reqid;
+ up->mode = kp->mode;
+ up->share = kp->share;
+ up->optional = kp->optional;
+ up->aalgos = kp->aalgos;
+ up->ealgos = kp->ealgos;
+ up->calgos = kp->calgos;
+ }
+
+ return nla_put(skb, XFRMA_TMPL,
+ sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr, vec);
+}
+
+static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb)
+{
+ if (x->security) {
+ return copy_sec_ctx(x->security, skb);
+ }
+ return 0;
+}
+
+static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+ if (xp->security)
+ return copy_sec_ctx(xp->security, skb);
+ return 0;
+}
+static inline size_t userpolicy_type_attrsize(void)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+ return nla_total_size(sizeof(struct xfrm_userpolicy_type));
+#else
+ return 0;
+#endif
+}
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
+{
+ struct xfrm_userpolicy_type upt = {
+ .type = type,
+ };
+
+ return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
+}
+
+#else
+static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
+{
+ return 0;
+}
+#endif
+
+static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+ struct xfrm_dump_info *sp = ptr;
+ struct xfrm_userpolicy_info *p;
+ struct sk_buff *in_skb = sp->in_skb;
+ struct sk_buff *skb = sp->out_skb;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, sp->nlmsg_seq,
+ XFRM_MSG_NEWPOLICY, sizeof(*p), sp->nlmsg_flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ p = nlmsg_data(nlh);
+ copy_to_user_policy(xp, p, dir);
+ err = copy_to_user_tmpl(xp, skb);
+ if (!err)
+ err = copy_to_user_sec_ctx(xp, skb);
+ if (!err)
+ err = copy_to_user_policy_type(xp->type, skb);
+ if (!err)
+ err = xfrm_mark_put(skb, &xp->mark);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_dump_policy_done(struct netlink_callback *cb)
+{
+ struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+ struct net *net = sock_net(cb->skb->sk);
+
+ xfrm_policy_walk_done(walk, net);
+ return 0;
+}
+
+static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+ struct xfrm_dump_info info;
+
+ BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) >
+ sizeof(cb->args) - sizeof(cb->args[0]));
+
+ info.in_skb = cb->skb;
+ info.out_skb = skb;
+ info.nlmsg_seq = cb->nlh->nlmsg_seq;
+ info.nlmsg_flags = NLM_F_MULTI;
+
+ if (!cb->args[0]) {
+ cb->args[0] = 1;
+ xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
+ }
+
+ (void) xfrm_policy_walk(net, walk, dump_one_policy, &info);
+
+ return skb->len;
+}
+
+static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
+ struct xfrm_policy *xp,
+ int dir, u32 seq)
+{
+ struct xfrm_dump_info info;
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ info.in_skb = in_skb;
+ info.out_skb = skb;
+ info.nlmsg_seq = seq;
+ info.nlmsg_flags = 0;
+
+ err = dump_one_policy(xp, dir, 0, &info);
+ if (err) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_policy *xp;
+ struct xfrm_userpolicy_id *p;
+ u8 type = XFRM_POLICY_TYPE_MAIN;
+ int err;
+ struct km_event c;
+ int delete;
+ struct xfrm_mark m;
+ u32 mark = xfrm_mark_get(attrs, &m);
+
+ p = nlmsg_data(nlh);
+ delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
+
+ err = copy_from_user_policy_type(&type, attrs);
+ if (err)
+ return err;
+
+ err = verify_policy_dir(p->dir);
+ if (err)
+ return err;
+
+ if (p->index)
+ xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+ else {
+ struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+ struct xfrm_sec_ctx *ctx;
+
+ err = verify_sec_ctx_len(attrs);
+ if (err)
+ return err;
+
+ ctx = NULL;
+ if (rt) {
+ struct xfrm_user_sec_ctx *uctx = nla_data(rt);
+
+ err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL);
+ if (err)
+ return err;
+ }
+ xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+ ctx, delete, &err);
+ security_xfrm_policy_free(ctx);
+ }
+ if (xp == NULL)
+ return -ENOENT;
+
+ if (!delete) {
+ struct sk_buff *resp_skb;
+
+ resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
+ if (IS_ERR(resp_skb)) {
+ err = PTR_ERR(resp_skb);
+ } else {
+ err = nlmsg_unicast(net->xfrm.nlsk, resp_skb,
+ NETLINK_CB(skb).portid);
+ }
+ } else {
+ xfrm_audit_policy_delete(xp, err ? 0 : 1, true);
+
+ if (err != 0)
+ goto out;
+
+ c.data.byid = p->index;
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ km_policy_notify(xp, p->dir, &c);
+ }
+
+out:
+ xfrm_pol_put(xp);
+ if (delete && err == 0)
+ xfrm_garbage_collect(net);
+ return err;
+}
+
+static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct km_event c;
+ struct xfrm_usersa_flush *p = nlmsg_data(nlh);
+ int err;
+
+ err = xfrm_state_flush(net, p->proto, true);
+ if (err) {
+ if (err == -ESRCH) /* empty table */
+ return 0;
+ return err;
+ }
+ c.data.proto = p->proto;
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ c.net = net;
+ km_state_notify(NULL, &c);
+
+ return 0;
+}
+
+static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x)
+{
+ size_t replay_size = x->replay_esn ?
+ xfrm_replay_state_esn_len(x->replay_esn) :
+ sizeof(struct xfrm_replay_state);
+
+ return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
+ + nla_total_size(replay_size)
+ + nla_total_size(sizeof(struct xfrm_lifetime_cur))
+ + nla_total_size(sizeof(struct xfrm_mark))
+ + nla_total_size(4) /* XFRM_AE_RTHR */
+ + nla_total_size(4); /* XFRM_AE_ETHR */
+}
+
+static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
+{
+ struct xfrm_aevent_id *id;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ id = nlmsg_data(nlh);
+ memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
+ id->sa_id.spi = x->id.spi;
+ id->sa_id.family = x->props.family;
+ id->sa_id.proto = x->id.proto;
+ memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr));
+ id->reqid = x->props.reqid;
+ id->flags = c->data.aevent;
+
+ if (x->replay_esn) {
+ err = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
+ xfrm_replay_state_esn_len(x->replay_esn),
+ x->replay_esn);
+ } else {
+ err = nla_put(skb, XFRMA_REPLAY_VAL, sizeof(x->replay),
+ &x->replay);
+ }
+ if (err)
+ goto out_cancel;
+ err = nla_put(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft);
+ if (err)
+ goto out_cancel;
+
+ if (id->flags & XFRM_AE_RTHR) {
+ err = nla_put_u32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff);
+ if (err)
+ goto out_cancel;
+ }
+ if (id->flags & XFRM_AE_ETHR) {
+ err = nla_put_u32(skb, XFRMA_ETIMER_THRESH,
+ x->replay_maxage * 10 / HZ);
+ if (err)
+ goto out_cancel;
+ }
+ err = xfrm_mark_put(skb, &x->mark);
+ if (err)
+ goto out_cancel;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+out_cancel:
+ nlmsg_cancel(skb, nlh);
+ return err;
+}
+
+static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state *x;
+ struct sk_buff *r_skb;
+ int err;
+ struct km_event c;
+ u32 mark;
+ struct xfrm_mark m;
+ struct xfrm_aevent_id *p = nlmsg_data(nlh);
+ struct xfrm_usersa_id *id = &p->sa_id;
+
+ mark = xfrm_mark_get(attrs, &m);
+
+ x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family);
+ if (x == NULL)
+ return -ESRCH;
+
+ r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
+ if (r_skb == NULL) {
+ xfrm_state_put(x);
+ return -ENOMEM;
+ }
+
+ /*
+ * XXX: is this lock really needed - none of the other
+ * gets lock (the concern is things getting updated
+ * while we are still reading) - jhs
+ */
+ spin_lock_bh(&x->lock);
+ c.data.aevent = p->flags;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+
+ if (build_aevent(r_skb, x, &c) < 0)
+ BUG();
+ err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).portid);
+ spin_unlock_bh(&x->lock);
+ xfrm_state_put(x);
+ return err;
+}
+
+static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state *x;
+ struct km_event c;
+ int err = -EINVAL;
+ u32 mark = 0;
+ struct xfrm_mark m;
+ struct xfrm_aevent_id *p = nlmsg_data(nlh);
+ struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+ struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
+ struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
+
+ if (!lt && !rp && !re)
+ return err;
+
+ /* pedantic mode - thou shalt sayeth replaceth */
+ if (!(nlh->nlmsg_flags&NLM_F_REPLACE))
+ return err;
+
+ mark = xfrm_mark_get(attrs, &m);
+
+ x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi, p->sa_id.proto, p->sa_id.family);
+ if (x == NULL)
+ return -ESRCH;
+
+ if (x->km.state != XFRM_STATE_VALID)
+ goto out;
+
+ err = xfrm_replay_verify_len(x->replay_esn, re);
+ if (err)
+ goto out;
+
+ spin_lock_bh(&x->lock);
+ xfrm_update_ae_params(x, attrs, 1);
+ spin_unlock_bh(&x->lock);
+
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ c.data.aevent = XFRM_AE_CU;
+ km_state_notify(x, &c);
+ err = 0;
+out:
+ xfrm_state_put(x);
+ return err;
+}
+
+static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct km_event c;
+ u8 type = XFRM_POLICY_TYPE_MAIN;
+ int err;
+
+ err = copy_from_user_policy_type(&type, attrs);
+ if (err)
+ return err;
+
+ err = xfrm_policy_flush(net, type, true);
+ if (err) {
+ if (err == -ESRCH) /* empty table */
+ return 0;
+ return err;
+ }
+
+ c.data.type = type;
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.portid = nlh->nlmsg_pid;
+ c.net = net;
+ km_policy_notify(NULL, 0, &c);
+ return 0;
+}
+
+static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_policy *xp;
+ struct xfrm_user_polexpire *up = nlmsg_data(nlh);
+ struct xfrm_userpolicy_info *p = &up->pol;
+ u8 type = XFRM_POLICY_TYPE_MAIN;
+ int err = -ENOENT;
+ struct xfrm_mark m;
+ u32 mark = xfrm_mark_get(attrs, &m);
+
+ err = copy_from_user_policy_type(&type, attrs);
+ if (err)
+ return err;
+
+ err = verify_policy_dir(p->dir);
+ if (err)
+ return err;
+
+ if (p->index)
+ xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+ else {
+ struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+ struct xfrm_sec_ctx *ctx;
+
+ err = verify_sec_ctx_len(attrs);
+ if (err)
+ return err;
+
+ ctx = NULL;
+ if (rt) {
+ struct xfrm_user_sec_ctx *uctx = nla_data(rt);
+
+ err = security_xfrm_policy_alloc(&ctx, uctx, GFP_KERNEL);
+ if (err)
+ return err;
+ }
+ xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+ &p->sel, ctx, 0, &err);
+ security_xfrm_policy_free(ctx);
+ }
+ if (xp == NULL)
+ return -ENOENT;
+
+ if (unlikely(xp->walk.dead))
+ goto out;
+
+ err = 0;
+ if (up->hard) {
+ xfrm_policy_delete(xp, p->dir);
+ xfrm_audit_policy_delete(xp, 1, true);
+ } else {
+ // reset the timers here?
+ WARN(1, "Dont know what to do with soft policy expire\n");
+ }
+ km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid);
+
+out:
+ xfrm_pol_put(xp);
+ return err;
+}
+
+static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_state *x;
+ int err;
+ struct xfrm_user_expire *ue = nlmsg_data(nlh);
+ struct xfrm_usersa_info *p = &ue->state;
+ struct xfrm_mark m;
+ u32 mark = xfrm_mark_get(attrs, &m);
+
+ x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
+
+ err = -ENOENT;
+ if (x == NULL)
+ return err;
+
+ spin_lock_bh(&x->lock);
+ err = -EINVAL;
+ if (x->km.state != XFRM_STATE_VALID)
+ goto out;
+ km_state_expired(x, ue->hard, nlh->nlmsg_pid);
+
+ if (ue->hard) {
+ __xfrm_state_delete(x);
+ xfrm_audit_state_delete(x, 1, true);
+ }
+ err = 0;
+out:
+ spin_unlock_bh(&x->lock);
+ xfrm_state_put(x);
+ return err;
+}
+
+static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct net *net = sock_net(skb->sk);
+ struct xfrm_policy *xp;
+ struct xfrm_user_tmpl *ut;
+ int i;
+ struct nlattr *rt = attrs[XFRMA_TMPL];
+ struct xfrm_mark mark;
+
+ struct xfrm_user_acquire *ua = nlmsg_data(nlh);
+ struct xfrm_state *x = xfrm_state_alloc(net);
+ int err = -ENOMEM;
+
+ if (!x)
+ goto nomem;
+
+ xfrm_mark_get(attrs, &mark);
+
+ err = verify_newpolicy_info(&ua->policy);
+ if (err)
+ goto bad_policy;
+
+ /* build an XP */
+ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
+ if (!xp)
+ goto free_state;
+
+ memcpy(&x->id, &ua->id, sizeof(ua->id));
+ memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr));
+ memcpy(&x->sel, &ua->sel, sizeof(ua->sel));
+ xp->mark.m = x->mark.m = mark.m;
+ xp->mark.v = x->mark.v = mark.v;
+ ut = nla_data(rt);
+ /* extract the templates and for each call km_key */
+ for (i = 0; i < xp->xfrm_nr; i++, ut++) {
+ struct xfrm_tmpl *t = &xp->xfrm_vec[i];
+ memcpy(&x->id, &t->id, sizeof(x->id));
+ x->props.mode = t->mode;
+ x->props.reqid = t->reqid;
+ x->props.family = ut->family;
+ t->aalgos = ua->aalgos;
+ t->ealgos = ua->ealgos;
+ t->calgos = ua->calgos;
+ err = km_query(x, t, xp);
+
+ }
+
+ kfree(x);
+ kfree(xp);
+
+ return 0;
+
+bad_policy:
+ WARN(1, "BAD policy passed\n");
+free_state:
+ kfree(x);
+nomem:
+ return err;
+}
+
+#ifdef CONFIG_XFRM_MIGRATE
+static int copy_from_user_migrate(struct xfrm_migrate *ma,
+ struct xfrm_kmaddress *k,
+ struct nlattr **attrs, int *num)
+{
+ struct nlattr *rt = attrs[XFRMA_MIGRATE];
+ struct xfrm_user_migrate *um;
+ int i, num_migrate;
+
+ if (k != NULL) {
+ struct xfrm_user_kmaddress *uk;
+
+ uk = nla_data(attrs[XFRMA_KMADDRESS]);
+ memcpy(&k->local, &uk->local, sizeof(k->local));
+ memcpy(&k->remote, &uk->remote, sizeof(k->remote));
+ k->family = uk->family;
+ k->reserved = uk->reserved;
+ }
+
+ um = nla_data(rt);
+ num_migrate = nla_len(rt) / sizeof(*um);
+
+ if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH)
+ return -EINVAL;
+
+ for (i = 0; i < num_migrate; i++, um++, ma++) {
+ memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr));
+ memcpy(&ma->old_saddr, &um->old_saddr, sizeof(ma->old_saddr));
+ memcpy(&ma->new_daddr, &um->new_daddr, sizeof(ma->new_daddr));
+ memcpy(&ma->new_saddr, &um->new_saddr, sizeof(ma->new_saddr));
+
+ ma->proto = um->proto;
+ ma->mode = um->mode;
+ ma->reqid = um->reqid;
+
+ ma->old_family = um->old_family;
+ ma->new_family = um->new_family;
+ }
+
+ *num = i;
+ return 0;
+}
+
+static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ struct xfrm_userpolicy_id *pi = nlmsg_data(nlh);
+ struct xfrm_migrate m[XFRM_MAX_DEPTH];
+ struct xfrm_kmaddress km, *kmp;
+ u8 type;
+ int err;
+ int n = 0;
+ struct net *net = sock_net(skb->sk);
+
+ if (attrs[XFRMA_MIGRATE] == NULL)
+ return -EINVAL;
+
+ kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL;
+
+ err = copy_from_user_policy_type(&type, attrs);
+ if (err)
+ return err;
+
+ err = copy_from_user_migrate((struct xfrm_migrate *)m, kmp, attrs, &n);
+ if (err)
+ return err;
+
+ if (!n)
+ return 0;
+
+ xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net);
+
+ return 0;
+}
+#else
+static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct nlattr **attrs)
+{
+ return -ENOPROTOOPT;
+}
+#endif
+
+#ifdef CONFIG_XFRM_MIGRATE
+static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb)
+{
+ struct xfrm_user_migrate um;
+
+ memset(&um, 0, sizeof(um));
+ um.proto = m->proto;
+ um.mode = m->mode;
+ um.reqid = m->reqid;
+ um.old_family = m->old_family;
+ memcpy(&um.old_daddr, &m->old_daddr, sizeof(um.old_daddr));
+ memcpy(&um.old_saddr, &m->old_saddr, sizeof(um.old_saddr));
+ um.new_family = m->new_family;
+ memcpy(&um.new_daddr, &m->new_daddr, sizeof(um.new_daddr));
+ memcpy(&um.new_saddr, &m->new_saddr, sizeof(um.new_saddr));
+
+ return nla_put(skb, XFRMA_MIGRATE, sizeof(um), &um);
+}
+
+static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb)
+{
+ struct xfrm_user_kmaddress uk;
+
+ memset(&uk, 0, sizeof(uk));
+ uk.family = k->family;
+ uk.reserved = k->reserved;
+ memcpy(&uk.local, &k->local, sizeof(uk.local));
+ memcpy(&uk.remote, &k->remote, sizeof(uk.remote));
+
+ return nla_put(skb, XFRMA_KMADDRESS, sizeof(uk), &uk);
+}
+
+static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id))
+ + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0)
+ + nla_total_size(sizeof(struct xfrm_user_migrate) * num_migrate)
+ + userpolicy_type_attrsize();
+}
+
+static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
+ int num_migrate, const struct xfrm_kmaddress *k,
+ const struct xfrm_selector *sel, u8 dir, u8 type)
+{
+ const struct xfrm_migrate *mp;
+ struct xfrm_userpolicy_id *pol_id;
+ struct nlmsghdr *nlh;
+ int i, err;
+
+ nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*pol_id), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ pol_id = nlmsg_data(nlh);
+ /* copy data from selector, dir, and type to the pol_id */
+ memset(pol_id, 0, sizeof(*pol_id));
+ memcpy(&pol_id->sel, sel, sizeof(pol_id->sel));
+ pol_id->dir = dir;
+
+ if (k != NULL) {
+ err = copy_to_user_kmaddress(k, skb);
+ if (err)
+ goto out_cancel;
+ }
+ err = copy_to_user_policy_type(type, skb);
+ if (err)
+ goto out_cancel;
+ for (i = 0, mp = m ; i < num_migrate; i++, mp++) {
+ err = copy_to_user_migrate(mp, skb);
+ if (err)
+ goto out_cancel;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+out_cancel:
+ nlmsg_cancel(skb, nlh);
+ return err;
+}
+
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
+{
+ struct net *net = &init_net;
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ /* build migrate */
+ if (build_migrate(skb, m, num_migrate, k, sel, dir, type) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MIGRATE);
+}
+#else
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+ const struct xfrm_migrate *m, int num_migrate,
+ const struct xfrm_kmaddress *k)
+{
+ return -ENOPROTOOPT;
+}
+#endif
+
+#define XMSGSIZE(type) sizeof(struct type)
+
+static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
+ [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
+ [XFRM_MSG_DELSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+ [XFRM_MSG_GETSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+ [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
+ [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info),
+ [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire),
+ [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire),
+ [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
+ [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
+ [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire),
+ [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
+ [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
+ [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+ [XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+ [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
+ [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+ [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32),
+};
+
+#undef XMSGSIZE
+
+static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
+ [XFRMA_SA] = { .len = sizeof(struct xfrm_usersa_info)},
+ [XFRMA_POLICY] = { .len = sizeof(struct xfrm_userpolicy_info)},
+ [XFRMA_LASTUSED] = { .type = NLA_U64},
+ [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)},
+ [XFRMA_ALG_AEAD] = { .len = sizeof(struct xfrm_algo_aead) },
+ [XFRMA_ALG_AUTH] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ALG_COMP] = { .len = sizeof(struct xfrm_algo) },
+ [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) },
+ [XFRMA_TMPL] = { .len = sizeof(struct xfrm_user_tmpl) },
+ [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) },
+ [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) },
+ [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) },
+ [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 },
+ [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 },
+ [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) },
+ [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) },
+ [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)},
+ [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) },
+ [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) },
+ [XFRMA_MARK] = { .len = sizeof(struct xfrm_mark) },
+ [XFRMA_TFCPAD] = { .type = NLA_U32 },
+ [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) },
+ [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 },
+ [XFRMA_PROTO] = { .type = NLA_U8 },
+ [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
+};
+
+static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
+ [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
+ [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) },
+};
+
+static const struct xfrm_link {
+ int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+ int (*dump)(struct sk_buff *, struct netlink_callback *);
+ int (*done)(struct netlink_callback *);
+ const struct nla_policy *nla_pol;
+ int nla_max;
+} xfrm_dispatch[XFRM_NR_MSGTYPES] = {
+ [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa },
+ [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa },
+ [XFRM_MSG_GETSA - XFRM_MSG_BASE] = { .doit = xfrm_get_sa,
+ .dump = xfrm_dump_sa,
+ .done = xfrm_dump_sa_done },
+ [XFRM_MSG_NEWPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy },
+ [XFRM_MSG_DELPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy },
+ [XFRM_MSG_GETPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
+ .dump = xfrm_dump_policy,
+ .done = xfrm_dump_policy_done },
+ [XFRM_MSG_ALLOCSPI - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
+ [XFRM_MSG_ACQUIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire },
+ [XFRM_MSG_EXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire },
+ [XFRM_MSG_UPDPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_add_policy },
+ [XFRM_MSG_UPDSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa },
+ [XFRM_MSG_POLEXPIRE - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire},
+ [XFRM_MSG_FLUSHSA - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa },
+ [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy },
+ [XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae },
+ [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae },
+ [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate },
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo },
+ [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo,
+ .nla_pol = xfrma_spd_policy,
+ .nla_max = XFRMA_SPD_MAX },
+ [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo },
+};
+
+static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *attrs[XFRMA_MAX+1];
+ const struct xfrm_link *link;
+ int type, err;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ return -ENOTSUPP;
+#endif
+
+ type = nlh->nlmsg_type;
+ if (type > XFRM_MSG_MAX)
+ return -EINVAL;
+
+ type -= XFRM_MSG_BASE;
+ link = &xfrm_dispatch[type];
+
+ /* All operations require privileges, even GET */
+ if (!netlink_net_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
+ type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
+ (nlh->nlmsg_flags & NLM_F_DUMP)) {
+ if (link->dump == NULL)
+ return -EINVAL;
+
+ {
+ struct netlink_dump_control c = {
+ .dump = link->dump,
+ .done = link->done,
+ };
+ return netlink_dump_start(net->xfrm.nlsk, skb, nlh, &c);
+ }
+ }
+
+ err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs,
+ link->nla_max ? : XFRMA_MAX,
+ link->nla_pol ? : xfrma_policy);
+ if (err < 0)
+ return err;
+
+ if (link->doit == NULL)
+ return -EINVAL;
+
+ return link->doit(skb, nlh, attrs);
+}
+
+static void xfrm_netlink_rcv(struct sk_buff *skb)
+{
+ struct net *net = sock_net(skb->sk);
+
+ mutex_lock(&net->xfrm.xfrm_cfg_mutex);
+ netlink_rcv_skb(skb, &xfrm_user_rcv_msg);
+ mutex_unlock(&net->xfrm.xfrm_cfg_mutex);
+}
+
+static inline size_t xfrm_expire_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_user_expire))
+ + nla_total_size(sizeof(struct xfrm_mark));
+}
+
+static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
+{
+ struct xfrm_user_expire *ue;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_EXPIRE, sizeof(*ue), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ue = nlmsg_data(nlh);
+ copy_to_user_state(x, &ue->state);
+ ue->hard = (c->data.hard != 0) ? 1 : 0;
+
+ err = xfrm_mark_put(skb, &x->mark);
+ if (err)
+ return err;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+ struct net *net = xs_net(x);
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_expire(skb, x, c) < 0) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE);
+}
+
+static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+ struct net *net = xs_net(x);
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_aevent(skb, x, c) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_AEVENTS);
+}
+
+static int xfrm_notify_sa_flush(const struct km_event *c)
+{
+ struct net *net = c->net;
+ struct xfrm_usersa_flush *p;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int len = NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush));
+
+ skb = nlmsg_new(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_FLUSHSA, sizeof(*p), 0);
+ if (nlh == NULL) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ p = nlmsg_data(nlh);
+ p->proto = c->data.proto;
+
+ nlmsg_end(skb, nlh);
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA);
+}
+
+static inline size_t xfrm_sa_len(struct xfrm_state *x)
+{
+ size_t l = 0;
+ if (x->aead)
+ l += nla_total_size(aead_len(x->aead));
+ if (x->aalg) {
+ l += nla_total_size(sizeof(struct xfrm_algo) +
+ (x->aalg->alg_key_len + 7) / 8);
+ l += nla_total_size(xfrm_alg_auth_len(x->aalg));
+ }
+ if (x->ealg)
+ l += nla_total_size(xfrm_alg_len(x->ealg));
+ if (x->calg)
+ l += nla_total_size(sizeof(*x->calg));
+ if (x->encap)
+ l += nla_total_size(sizeof(*x->encap));
+ if (x->tfcpad)
+ l += nla_total_size(sizeof(x->tfcpad));
+ if (x->replay_esn)
+ l += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn));
+ else
+ l += nla_total_size(sizeof(struct xfrm_replay_state));
+ if (x->security)
+ l += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
+ x->security->ctx_len);
+ if (x->coaddr)
+ l += nla_total_size(sizeof(*x->coaddr));
+ if (x->props.extra_flags)
+ l += nla_total_size(sizeof(x->props.extra_flags));
+
+ /* Must count x->lastused as it may become non-zero behind our back. */
+ l += nla_total_size(sizeof(u64));
+
+ return l;
+}
+
+static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
+{
+ struct net *net = xs_net(x);
+ struct xfrm_usersa_info *p;
+ struct xfrm_usersa_id *id;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int len = xfrm_sa_len(x);
+ int headlen, err;
+
+ headlen = sizeof(*p);
+ if (c->event == XFRM_MSG_DELSA) {
+ len += nla_total_size(headlen);
+ headlen = sizeof(*id);
+ len += nla_total_size(sizeof(struct xfrm_mark));
+ }
+ len += NLMSG_ALIGN(headlen);
+
+ skb = nlmsg_new(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
+ err = -EMSGSIZE;
+ if (nlh == NULL)
+ goto out_free_skb;
+
+ p = nlmsg_data(nlh);
+ if (c->event == XFRM_MSG_DELSA) {
+ struct nlattr *attr;
+
+ id = nlmsg_data(nlh);
+ memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+ id->spi = x->id.spi;
+ id->family = x->props.family;
+ id->proto = x->id.proto;
+
+ attr = nla_reserve(skb, XFRMA_SA, sizeof(*p));
+ err = -EMSGSIZE;
+ if (attr == NULL)
+ goto out_free_skb;
+
+ p = nla_data(attr);
+ }
+ err = copy_to_user_state_extra(x, p, skb);
+ if (err)
+ goto out_free_skb;
+
+ nlmsg_end(skb, nlh);
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_SA);
+
+out_free_skb:
+ kfree_skb(skb);
+ return err;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+
+ switch (c->event) {
+ case XFRM_MSG_EXPIRE:
+ return xfrm_exp_state_notify(x, c);
+ case XFRM_MSG_NEWAE:
+ return xfrm_aevent_state_notify(x, c);
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_UPDSA:
+ case XFRM_MSG_NEWSA:
+ return xfrm_notify_sa(x, c);
+ case XFRM_MSG_FLUSHSA:
+ return xfrm_notify_sa_flush(c);
+ default:
+ printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n",
+ c->event);
+ break;
+ }
+
+ return 0;
+
+}
+
+static inline size_t xfrm_acquire_msgsize(struct xfrm_state *x,
+ struct xfrm_policy *xp)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_user_acquire))
+ + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
+ + nla_total_size(sizeof(struct xfrm_mark))
+ + nla_total_size(xfrm_user_sec_ctx_size(x->security))
+ + userpolicy_type_attrsize();
+}
+
+static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
+ struct xfrm_tmpl *xt, struct xfrm_policy *xp)
+{
+ __u32 seq = xfrm_get_acqseq();
+ struct xfrm_user_acquire *ua;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*ua), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ua = nlmsg_data(nlh);
+ memcpy(&ua->id, &x->id, sizeof(ua->id));
+ memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr));
+ memcpy(&ua->sel, &x->sel, sizeof(ua->sel));
+ copy_to_user_policy(xp, &ua->policy, XFRM_POLICY_OUT);
+ ua->aalgos = xt->aalgos;
+ ua->ealgos = xt->ealgos;
+ ua->calgos = xt->calgos;
+ ua->seq = x->km.seq = seq;
+
+ err = copy_to_user_tmpl(xp, skb);
+ if (!err)
+ err = copy_to_user_state_sec_ctx(x, skb);
+ if (!err)
+ err = copy_to_user_policy_type(xp->type, skb);
+ if (!err)
+ err = xfrm_mark_put(skb, &xp->mark);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
+ struct xfrm_policy *xp)
+{
+ struct net *net = xs_net(x);
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_acquire(skb, x, xt, xp) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_ACQUIRE);
+}
+
+/* User gives us xfrm_user_policy_info followed by an array of 0
+ * or more templates.
+ */
+static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
+ u8 *data, int len, int *dir)
+{
+ struct net *net = sock_net(sk);
+ struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data;
+ struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1);
+ struct xfrm_policy *xp;
+ int nr;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ if (opt != IP_XFRM_POLICY) {
+ *dir = -EOPNOTSUPP;
+ return NULL;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ if (opt != IPV6_XFRM_POLICY) {
+ *dir = -EOPNOTSUPP;
+ return NULL;
+ }
+ break;
+#endif
+ default:
+ *dir = -EINVAL;
+ return NULL;
+ }
+
+ *dir = -EINVAL;
+
+ if (len < sizeof(*p) ||
+ verify_newpolicy_info(p))
+ return NULL;
+
+ nr = ((len - sizeof(*p)) / sizeof(*ut));
+ if (validate_tmpl(nr, ut, p->sel.family))
+ return NULL;
+
+ if (p->dir > XFRM_POLICY_OUT)
+ return NULL;
+
+ xp = xfrm_policy_alloc(net, GFP_ATOMIC);
+ if (xp == NULL) {
+ *dir = -ENOBUFS;
+ return NULL;
+ }
+
+ copy_from_user_policy(xp, p);
+ xp->type = XFRM_POLICY_TYPE_MAIN;
+ copy_templates(xp, ut, nr);
+
+ *dir = p->dir;
+
+ return xp;
+}
+
+static inline size_t xfrm_polexpire_msgsize(struct xfrm_policy *xp)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire))
+ + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr)
+ + nla_total_size(xfrm_user_sec_ctx_size(xp->security))
+ + nla_total_size(sizeof(struct xfrm_mark))
+ + userpolicy_type_attrsize();
+}
+
+static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
+ int dir, const struct km_event *c)
+{
+ struct xfrm_user_polexpire *upe;
+ int hard = c->data.hard;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, c->portid, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ upe = nlmsg_data(nlh);
+ copy_to_user_policy(xp, &upe->pol, dir);
+ err = copy_to_user_tmpl(xp, skb);
+ if (!err)
+ err = copy_to_user_sec_ctx(xp, skb);
+ if (!err)
+ err = copy_to_user_policy_type(xp->type, skb);
+ if (!err)
+ err = xfrm_mark_put(skb, &xp->mark);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ upe->hard = !!hard;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+ struct net *net = xp_net(xp);
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_polexpire(skb, xp, dir, c) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_EXPIRE);
+}
+
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+ int len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+ struct net *net = xp_net(xp);
+ struct xfrm_userpolicy_info *p;
+ struct xfrm_userpolicy_id *id;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int headlen, err;
+
+ headlen = sizeof(*p);
+ if (c->event == XFRM_MSG_DELPOLICY) {
+ len += nla_total_size(headlen);
+ headlen = sizeof(*id);
+ }
+ len += userpolicy_type_attrsize();
+ len += nla_total_size(sizeof(struct xfrm_mark));
+ len += NLMSG_ALIGN(headlen);
+
+ skb = nlmsg_new(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ nlh = nlmsg_put(skb, c->portid, c->seq, c->event, headlen, 0);
+ err = -EMSGSIZE;
+ if (nlh == NULL)
+ goto out_free_skb;
+
+ p = nlmsg_data(nlh);
+ if (c->event == XFRM_MSG_DELPOLICY) {
+ struct nlattr *attr;
+
+ id = nlmsg_data(nlh);
+ memset(id, 0, sizeof(*id));
+ id->dir = dir;
+ if (c->data.byid)
+ id->index = xp->index;
+ else
+ memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+ attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*p));
+ err = -EMSGSIZE;
+ if (attr == NULL)
+ goto out_free_skb;
+
+ p = nla_data(attr);
+ }
+
+ copy_to_user_policy(xp, p, dir);
+ err = copy_to_user_tmpl(xp, skb);
+ if (!err)
+ err = copy_to_user_policy_type(xp->type, skb);
+ if (!err)
+ err = xfrm_mark_put(skb, &xp->mark);
+ if (err)
+ goto out_free_skb;
+
+ nlmsg_end(skb, nlh);
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);
+
+out_free_skb:
+ kfree_skb(skb);
+ return err;
+}
+
+static int xfrm_notify_policy_flush(const struct km_event *c)
+{
+ struct net *net = c->net;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ nlh = nlmsg_put(skb, c->portid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0);
+ err = -EMSGSIZE;
+ if (nlh == NULL)
+ goto out_free_skb;
+ err = copy_to_user_policy_type(c->data.type, skb);
+ if (err)
+ goto out_free_skb;
+
+ nlmsg_end(skb, nlh);
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY);
+
+out_free_skb:
+ kfree_skb(skb);
+ return err;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+
+ switch (c->event) {
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDPOLICY:
+ case XFRM_MSG_DELPOLICY:
+ return xfrm_notify_policy(xp, dir, c);
+ case XFRM_MSG_FLUSHPOLICY:
+ return xfrm_notify_policy_flush(c);
+ case XFRM_MSG_POLEXPIRE:
+ return xfrm_exp_policy_notify(xp, dir, c);
+ default:
+ printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n",
+ c->event);
+ }
+
+ return 0;
+
+}
+
+static inline size_t xfrm_report_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_user_report));
+}
+
+static int build_report(struct sk_buff *skb, u8 proto,
+ struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+ struct xfrm_user_report *ur;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ ur = nlmsg_data(nlh);
+ ur->proto = proto;
+ memcpy(&ur->sel, sel, sizeof(ur->sel));
+
+ if (addr) {
+ int err = nla_put(skb, XFRMA_COADDR, sizeof(*addr), addr);
+ if (err) {
+ nlmsg_cancel(skb, nlh);
+ return err;
+ }
+ }
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_send_report(struct net *net, u8 proto,
+ struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+ struct sk_buff *skb;
+
+ skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_report(skb, proto, sel, addr) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_REPORT);
+}
+
+static inline size_t xfrm_mapping_msgsize(void)
+{
+ return NLMSG_ALIGN(sizeof(struct xfrm_user_mapping));
+}
+
+static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
+ xfrm_address_t *new_saddr, __be16 new_sport)
+{
+ struct xfrm_user_mapping *um;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ um = nlmsg_data(nlh);
+
+ memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr));
+ um->id.spi = x->id.spi;
+ um->id.family = x->props.family;
+ um->id.proto = x->id.proto;
+ memcpy(&um->new_saddr, new_saddr, sizeof(um->new_saddr));
+ memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr));
+ um->new_sport = new_sport;
+ um->old_sport = x->encap->encap_sport;
+ um->reqid = x->props.reqid;
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
+ __be16 sport)
+{
+ struct net *net = xs_net(x);
+ struct sk_buff *skb;
+
+ if (x->id.proto != IPPROTO_ESP)
+ return -EINVAL;
+
+ if (!x->encap)
+ return -EINVAL;
+
+ skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+
+ if (build_mapping(skb, x, ipaddr, sport) < 0)
+ BUG();
+
+ return xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_MAPPING);
+}
+
+static bool xfrm_is_alive(const struct km_event *c)
+{
+ return (bool)xfrm_acquire_is_on(c->net);
+}
+
+static struct xfrm_mgr netlink_mgr = {
+ .id = "netlink",
+ .notify = xfrm_send_state_notify,
+ .acquire = xfrm_send_acquire,
+ .compile_policy = xfrm_compile_policy,
+ .notify_policy = xfrm_send_policy_notify,
+ .report = xfrm_send_report,
+ .migrate = xfrm_send_migrate,
+ .new_mapping = xfrm_send_mapping,
+ .is_alive = xfrm_is_alive,
+};
+
+static int __net_init xfrm_user_net_init(struct net *net)
+{
+ struct sock *nlsk;
+ struct netlink_kernel_cfg cfg = {
+ .groups = XFRMNLGRP_MAX,
+ .input = xfrm_netlink_rcv,
+ };
+
+ nlsk = netlink_kernel_create(net, NETLINK_XFRM, &cfg);
+ if (nlsk == NULL)
+ return -ENOMEM;
+ net->xfrm.nlsk_stash = nlsk; /* Don't set to NULL */
+ rcu_assign_pointer(net->xfrm.nlsk, nlsk);
+ return 0;
+}
+
+static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+ list_for_each_entry(net, net_exit_list, exit_list)
+ RCU_INIT_POINTER(net->xfrm.nlsk, NULL);
+ synchronize_net();
+ list_for_each_entry(net, net_exit_list, exit_list)
+ netlink_kernel_release(net->xfrm.nlsk_stash);
+}
+
+static struct pernet_operations xfrm_user_net_ops = {
+ .init = xfrm_user_net_init,
+ .exit_batch = xfrm_user_net_exit,
+};
+
+static int __init xfrm_user_init(void)
+{
+ int rv;
+
+ printk(KERN_INFO "Initializing XFRM netlink socket\n");
+
+ rv = register_pernet_subsys(&xfrm_user_net_ops);
+ if (rv < 0)
+ return rv;
+ rv = xfrm_register_km(&netlink_mgr);
+ if (rv < 0)
+ unregister_pernet_subsys(&xfrm_user_net_ops);
+ return rv;
+}
+
+static void __exit xfrm_user_exit(void)
+{
+ xfrm_unregister_km(&netlink_mgr);
+ unregister_pernet_subsys(&xfrm_user_net_ops);
+}
+
+module_init(xfrm_user_init);
+module_exit(xfrm_user_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
+